From fe95502b073095f88282497ac9acd1e548772414 Mon Sep 17 00:00:00 2001
From: eastb233
Date: Tue, 20 Oct 2020 09:45:39 +0800
Subject: [PATCH 1/2] Add URL and Source0 url

---
 gcc.spec | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/gcc.spec b/gcc.spec
index e203c61..5e4eb3b 100644
--- a/gcc.spec
+++ b/gcc.spec
@@ -61,7 +61,9 @@
 Name: gcc
 Version: %{gcc_version}
 Release: %{DATE}.12
 License: GPLv3+ and GPLv3+ with exceptions and GPLv2+ with exceptions and LGPLv2+ and BSD
-Source0: gcc-9.3.0.tar.xz
+URL: https://gcc.gnu.org
+
+Source0: https://ftp.gnu.org/gnu/gcc/gcc-9.3.0/gcc-9.3.0.tar.xz
 %global isl_version 0.16.1
 BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n)
--
Gitee

From 01e0ec8ea6306ca9fc59f90ebe04e80cb2e1691d Mon Sep 17 00:00:00 2001
From: eastb233
Date: Wed, 30 Dec 2020 09:54:10 +0800
Subject: [PATCH 2/2] Upload GCC feature and bugfix patches.

- avoid-cycling-on-vertain-subreg-reloads.patch: Add patch source comment
- change-gcc-BASE-VER.patch: Likewise
- dont-generate-IF_THEN_ELSE.patch: Likewise
- fix-ICE-in-compute_live_loop_exits.patch: Likewise
- fix-ICE-in-eliminate_stmt.patch: Likewise
- fix-ICE-in-vect_create_epilog_for_reduction.patch: Likewise
- fix-ICE-in-vect_stmt_to_vectorize.patch: Likewise
- fix-ICE-in-verify_ssa.patch: Likewise
- fix-ICE-when-vectorizing-nested-cycles.patch: Likewise
- fix-cost-of-plus.patch: Likewise
- ipa-const-prop-self-recursion-bugfix.patch: Likewise
- simplify-removing-subregs.patch: Likewise
- medium-code-mode.patch: Bugfix
- fix-when-peeling-for-alignment.patch: Move to ...
- fix-PR-92351-When-peeling-for-alignment.patch: ... this
- AArch64-Fix-constraints-for-CPY-M.patch: New file
- Apply-maximum-nunits-for-BB-SLP.patch: New file
- Fix-EXTRACT_LAST_REDUCTION-segfault.patch: New file
- Fix-up-push_partial_def-little-endian-bitfield.patch: New file
- Fix-zero-masking-for-vcvtps2ph.patch: New file
- IRA-Handle-fully-tied-destinations.patch: New file
- SLP-VECT-Add-check-to-fix-96837.patch: New file
- aarch64-Fix-ash-lr-lshr-mode-3-expanders.patch: New file
- aarch64-Fix-bf16-and-matrix-g++-gfortran.patch: New file
- aarch64-Fix-mismatched-SVE-predicate-modes.patch: New file
- aarch64-fix-sve-acle-error.patch: New file
- adjust-vector-cost-and-move-EXTRACT_LAST_REDUCTION-costing.patch: New file
- bf16-and-matrix-characteristic.patch: New file
- fix-ICE-IPA-compare-VRP-types.patch: New file
- fix-ICE-in-affine-combination.patch: New file
- fix-ICE-in-pass-vect.patch: New file
- fix-ICE-in-vect_update_misalignment_for_peel.patch: New file
- fix-addlosymdi-ICE-in-pass-reload.patch: New file
- fix-an-ICE-in-vect_recog_mask_conversion_pattern.patch: New file
- fix-avx512vl-vcvttpd2dq-2-fail.patch: New file
- fix-issue499-add-nop-convert.patch: New file
- fix-issue604-ldist-dependency-fixup.patch: New file
- modulo-sched-Carefully-process-loop-counter-initiali.patch: New file
- re-PR-target-91124-gcc.target-i386-avx512vl-vpshldvd.patch: New file
- reduction-paths-with-unhandled-live-stmt.patch: New file
- redundant-loop-elimination.patch: New file
- sccvn-Improve-handling-of-load-masked-with-integer.patch: New file
- speed-up-DDG-analysis-and-fix-bootstrap-compare-debug.patch: New file
- store-merging-Consider-also-overlapping-stores-earlier.patch: New file
- tree-optimization-96920-another-ICE-when-vectorizing.patch: New file
- tree-optimization-97812-fix-range-query-in-VRP-asser.patch: New file
- vectorizable-comparison-Swap-operands-only-once.patch: New file
- x86-Fix-bf16-and-matrix.patch: New file
---
 AArch64-Fix-constraints-for-CPY-M.patch | 67 +
 Apply-maximum-nunits-for-BB-SLP.patch | 694 +
 Fix-EXTRACT_LAST_REDUCTION-segfault.patch | 82 +
 ...h_partial_def-little-endian-bitfield.patch | 51 +
 Fix-zero-masking-for-vcvtps2ph.patch | 139 +
 IRA-Handle-fully-tied-destinations.patch | 155 +
 SLP-VECT-Add-check-to-fix-96837.patch | 99 +
 ...h64-Fix-ash-lr-lshr-mode-3-expanders.patch | 165 +
 ...h64-Fix-bf16-and-matrix-g++-gfortran.patch | 1613 +
 ...4-Fix-mismatched-SVE-predicate-modes.patch | 34 +
 aarch64-fix-sve-acle-error.patch | 2128 +
 ...-move-EXTRACT_LAST_REDUCTION-costing.patch | 88 +
 avoid-cycling-on-vertain-subreg-reloads.patch | 6 +
 bf16-and-matrix-characteristic.patch | 466067 +++
 change-gcc-BASE-VER.patch | 6 +
 dont-generate-IF_THEN_ELSE.patch | 6 +
 fix-ICE-IPA-compare-VRP-types.patch | 51 +
 fix-ICE-in-affine-combination.patch | 396 +
 fix-ICE-in-compute_live_loop_exits.patch | 6 +
 fix-ICE-in-eliminate_stmt.patch | 18 +-
 fix-ICE-in-pass-vect.patch | 37 +
 ...-in-vect_create_epilog_for_reduction.patch | 6 +
 fix-ICE-in-vect_stmt_to_vectorize.patch | 6 +
 ...in-vect_update_misalignment_for_peel.patch | 784 +
 fix-ICE-in-verify_ssa.patch | 6 +
 fix-ICE-when-vectorizing-nested-cycles.patch | 6 +
 fix-PR-92351-When-peeling-for-alignment.patch | 152 +
 fix-addlosymdi-ICE-in-pass-reload.patch | 30 +
 ...n-vect_recog_mask_conversion_pattern.patch | 115 +
 fix-avx512vl-vcvttpd2dq-2-fail.patch | 301 +
 fix-cost-of-plus.patch | 3 +
 fix-issue499-add-nop-convert.patch | 928 +
 fix-issue604-ldist-dependency-fixup.patch | 108 +
 fix-when-peeling-for-alignment.patch | 23 -
 gcc.spec | 181 +-
 ipa-const-prop-self-recursion-bugfix.patch | 15 +-
 medium-code-mode.patch | 6 +-
 ...efully-process-loop-counter-initiali.patch | 251 +
 ...24-gcc.target-i386-avx512vl-vpshldvd.patch | 215 +
 ...ction-paths-with-unhandled-live-stmt.patch | 64 +
 redundant-loop-elimination.patch | 486 +
 ...handling-of-load-masked-with-integer.patch | 2397 +
 simplify-removing-subregs.patch | 6 +
 ...ysis-and-fix-bootstrap-compare-debug.patch | 718 +
 ...ider-also-overlapping-stores-earlier.patch | 359 +
 ...n-96920-another-ICE-when-vectorizing.patch | 316 +
 ...n-97812-fix-range-query-in-VRP-asser.patch | 48 +
 ...e-comparison-Swap-operands-only-once.patch | 19 +
 x86-Fix-bf16-and-matrix.patch | 321 +
 49 files changed, 479702 insertions(+), 76 deletions(-)
 create mode 100644 AArch64-Fix-constraints-for-CPY-M.patch
 create mode 100644 Apply-maximum-nunits-for-BB-SLP.patch
 create mode 100644 Fix-EXTRACT_LAST_REDUCTION-segfault.patch
 create mode 100644 Fix-up-push_partial_def-little-endian-bitfield.patch
 create mode 100644 Fix-zero-masking-for-vcvtps2ph.patch
 create mode 100644 IRA-Handle-fully-tied-destinations.patch
 create mode 100644 SLP-VECT-Add-check-to-fix-96837.patch
 create mode 100644 aarch64-Fix-ash-lr-lshr-mode-3-expanders.patch
 create mode 100644 aarch64-Fix-bf16-and-matrix-g++-gfortran.patch
 create mode 100644 aarch64-Fix-mismatched-SVE-predicate-modes.patch
 create mode 100644 aarch64-fix-sve-acle-error.patch
 create mode 100644 adjust-vector-cost-and-move-EXTRACT_LAST_REDUCTION-costing.patch
 create mode 100644 bf16-and-matrix-characteristic.patch
 create mode 100644 fix-ICE-IPA-compare-VRP-types.patch
 create mode 100644 fix-ICE-in-affine-combination.patch
 create mode 100644 fix-ICE-in-pass-vect.patch
 create mode 100644 fix-ICE-in-vect_update_misalignment_for_peel.patch
 create mode 100644 fix-PR-92351-When-peeling-for-alignment.patch
 create mode 100644 fix-addlosymdi-ICE-in-pass-reload.patch
 create mode 100644
fix-an-ICE-in-vect_recog_mask_conversion_pattern.patch create mode 100644 fix-avx512vl-vcvttpd2dq-2-fail.patch create mode 100644 fix-issue499-add-nop-convert.patch create mode 100644 fix-issue604-ldist-dependency-fixup.patch delete mode 100644 fix-when-peeling-for-alignment.patch create mode 100644 modulo-sched-Carefully-process-loop-counter-initiali.patch create mode 100644 re-PR-target-91124-gcc.target-i386-avx512vl-vpshldvd.patch create mode 100644 reduction-paths-with-unhandled-live-stmt.patch create mode 100644 redundant-loop-elimination.patch create mode 100644 sccvn-Improve-handling-of-load-masked-with-integer.patch create mode 100644 speed-up-DDG-analysis-and-fix-bootstrap-compare-debug.patch create mode 100644 store-merging-Consider-also-overlapping-stores-earlier.patch create mode 100644 tree-optimization-96920-another-ICE-when-vectorizing.patch create mode 100644 tree-optimization-97812-fix-range-query-in-VRP-asser.patch create mode 100644 vectorizable-comparison-Swap-operands-only-once.patch create mode 100644 x86-Fix-bf16-and-matrix.patch diff --git a/AArch64-Fix-constraints-for-CPY-M.patch b/AArch64-Fix-constraints-for-CPY-M.patch new file mode 100644 index 0000000..5fcb38e --- /dev/null +++ b/AArch64-Fix-constraints-for-CPY-M.patch @@ -0,0 +1,67 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + +0001-AArch64-Fix-constraints-for-CPY-M.patch +3c2707f33af46ac145769872b65e25fd0b870903 + +diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md +index cbf29a82e28..59bf4a69507 100644 +--- a/gcc/config/aarch64/aarch64-sve.md ++++ b/gcc/config/aarch64/aarch64-sve.md +@@ -6523,7 +6523,7 @@ + (define_insn "@aarch64_sel_dup" + [(set (match_operand:SVE_FULL 0 "register_operand" "=?w, w, ??w, ?&w, ??&w, ?&w") + (unspec:SVE_FULL +- [(match_operand: 3 "register_operand" "Upa, Upa, Upl, Upl, Upl, Upl") ++ [(match_operand: 3 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl") + (vec_duplicate:SVE_FULL + (match_operand: 1 "register_operand" "r, w, r, w, r, w")) + (match_operand:SVE_FULL 2 "aarch64_simd_reg_or_zero" "0, 0, Dz, Dz, w, w")] +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cpy_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cpy_1.c +new file mode 100644 +index 00000000000..1d8f429caeb +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cpy_1.c +@@ -0,0 +1,42 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/* ++** dup_x0_m: ++** add (x[0-9]+), x0, #?1 ++** mov (p[0-7])\.b, p15\.b ++** mov z0\.d, \2/m, \1 ++** ret ++*/ ++svuint64_t ++dup_x0_m (svuint64_t z0, uint64_t x0) ++{ ++ register svbool_t pg asm ("p15"); ++ asm volatile ("" : "=Upa" (pg)); ++ return svdup_u64_m (z0, pg, x0 + 1); ++} ++ ++/* ++** dup_d1_z: ++** mov (p[0-7])\.b, p15\.b ++** mov z0\.d, \1/m, d1 ++** ret ++*/ ++svfloat64_t ++dup_d1_z (svfloat64_t z0, float64_t d1) ++{ ++ register svbool_t pg asm ("p15"); ++ asm volatile ("" : "=Upa" (pg)); ++ return svdup_f64_m (z0, pg, d1); ++} ++ ++#ifdef __cplusplus ++} ++#endif diff --git a/Apply-maximum-nunits-for-BB-SLP.patch b/Apply-maximum-nunits-for-BB-SLP.patch new file mode 100644 index 0000000..43fc0e0 --- /dev/null +++ b/Apply-maximum-nunits-for-BB-SLP.patch @@ -0,0 +1,694 @@ +This backport contains 1 patch from gcc main stream tree. 
+The commit id of these patchs list as following in the order of time. + +0001-Apply-maximum-nunits-for-BB-SLP.patch +9b75f56d4b7951c60a656396dddd4a65787b95bc + +diff -Nurp a/gcc/testsuite/gcc.dg/vect/bb-slp-4.c b/gcc/testsuite/gcc.dg/vect/bb-slp-4.c +--- a/gcc/testsuite/gcc.dg/vect/bb-slp-4.c 2020-12-20 18:46:19.539633230 +0800 ++++ b/gcc/testsuite/gcc.dg/vect/bb-slp-4.c 2020-12-20 18:48:12.799633230 +0800 +@@ -38,5 +38,4 @@ int main (void) + return 0; + } + +-/* { dg-final { scan-tree-dump-times "basic block vectorized" 0 "slp2" } } */ +- ++/* { dg-final { scan-tree-dump-times "basic block vectorized" 1 "slp2" } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/vect/bb-slp-bool-1.c b/gcc/testsuite/gcc.dg/vect/bb-slp-bool-1.c +--- a/gcc/testsuite/gcc.dg/vect/bb-slp-bool-1.c 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/gcc.dg/vect/bb-slp-bool-1.c 2020-12-20 18:48:12.799633230 +0800 +@@ -0,0 +1,44 @@ ++#include "tree-vect.h" ++ ++void __attribute__ ((noipa)) ++f1 (_Bool *x, unsigned short *y) ++{ ++ x[0] = (y[0] == 1); ++ x[1] = (y[1] == 1); ++} ++ ++void __attribute__ ((noipa)) ++f2 (_Bool *x, unsigned short *y) ++{ ++ x[0] = (y[0] == 1); ++ x[1] = (y[1] == 1); ++ x[2] = (y[2] == 1); ++ x[3] = (y[3] == 1); ++ x[4] = (y[4] == 1); ++ x[5] = (y[5] == 1); ++ x[6] = (y[6] == 1); ++ x[7] = (y[7] == 1); ++} ++ ++_Bool x[8]; ++unsigned short y[8] = { 11, 1, 9, 5, 1, 44, 1, 1 }; ++ ++int ++main (void) ++{ ++ check_vect (); ++ ++ f1 (x, y); ++ ++ if (x[0] || !x[1]) ++ __builtin_abort (); ++ ++ x[1] = 0; ++ ++ f2 (x, y); ++ ++ if (x[0] || !x[1] || x[2] | x[3] || !x[4] || x[5] || !x[6] || !x[7]) ++ __builtin_abort (); ++ ++ return 0; ++} +diff -Nurp a/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_14.c b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_14.c +--- a/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_14.c 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/gcc.target/aarch64/vect_mixed_sizes_14.c 2020-12-20 18:48:11.811633230 +0800 +@@ -0,0 +1,26 @@ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++/* ++** foo: ++** ( ++** ldr d([0-9]+), \[x1\] ++** ldr q([0-9]+), \[x0\] ++** saddw v([0-9]+)\.4s, v\2\.4s, v\1\.4h ++** str q\3, \[x0\] ++** | ++** ldr q([0-9]+), \[x0\] ++** ldr d([0-9]+), \[x1\] ++** saddw v([0-9]+)\.4s, v\4\.4s, v\5\.4h ++** str q\6, \[x0\] ++** ) ++** ret ++*/ ++void ++foo (int *x, short *y) ++{ ++ x[0] += y[0]; ++ x[1] += y[1]; ++ x[2] += y[2]; ++ x[3] += y[3]; ++} +diff -Nurp a/gcc/testsuite/gcc.target/i386/pr84101.c b/gcc/testsuite/gcc.target/i386/pr84101.c +--- a/gcc/testsuite/gcc.target/i386/pr84101.c 2020-12-20 18:46:18.383633230 +0800 ++++ b/gcc/testsuite/gcc.target/i386/pr84101.c 2020-12-20 18:48:11.611633230 +0800 +@@ -18,4 +18,5 @@ uint64_pair_t pair(int num) + return p ; + } + +-/* { dg-final { scan-tree-dump-not "basic block vectorized" "slp2" } } */ ++/* See PR92266 for the XFAIL. */ ++/* { dg-final { scan-tree-dump-not "basic block vectorized" "slp2" { xfail ilp32 } } } */ +diff -Nurp a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c +--- a/gcc/tree-vect-data-refs.c 2020-12-20 18:46:19.911633230 +0800 ++++ b/gcc/tree-vect-data-refs.c 2020-12-20 18:48:11.047633230 +0800 +@@ -4312,9 +4312,8 @@ vect_analyze_data_refs (vec_info *vinfo, + + /* Set vectype for STMT. 
*/ + scalar_type = TREE_TYPE (DR_REF (dr)); +- STMT_VINFO_VECTYPE (stmt_info) +- = get_vectype_for_scalar_type (vinfo, scalar_type); +- if (!STMT_VINFO_VECTYPE (stmt_info)) ++ tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type); ++ if (!vectype) + { + if (dump_enabled_p ()) + { +@@ -4345,14 +4344,19 @@ vect_analyze_data_refs (vec_info *vinfo, + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "got vectype for stmt: %G%T\n", +- stmt_info->stmt, STMT_VINFO_VECTYPE (stmt_info)); ++ stmt_info->stmt, vectype); + } + + /* Adjust the minimal vectorization factor according to the + vector type. */ +- vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)); ++ vf = TYPE_VECTOR_SUBPARTS (vectype); + *min_vf = upper_bound (*min_vf, vf); + ++ /* Leave the BB vectorizer to pick the vector type later, based on ++ the final dataref group size and SLP node size. */ ++ if (is_a (vinfo)) ++ STMT_VINFO_VECTYPE (stmt_info) = vectype; ++ + if (gatherscatter != SG_NONE) + { + gather_scatter_info gs_info; +diff -Nurp a/gcc/tree-vect-patterns.c b/gcc/tree-vect-patterns.c +--- a/gcc/tree-vect-patterns.c 2020-12-20 18:46:19.979633230 +0800 ++++ b/gcc/tree-vect-patterns.c 2020-12-20 18:48:11.227633230 +0800 +@@ -4142,9 +4142,10 @@ vect_recog_bool_pattern (stmt_vec_info s + && STMT_VINFO_DATA_REF (stmt_vinfo)) + { + stmt_vec_info pattern_stmt_info; +- vectype = STMT_VINFO_VECTYPE (stmt_vinfo); +- gcc_assert (vectype != NULL_TREE); +- if (!VECTOR_MODE_P (TYPE_MODE (vectype))) ++ tree nunits_vectype; ++ if (!vect_get_vector_types_for_stmt (stmt_vinfo, &vectype, ++ &nunits_vectype) ++ || !VECTOR_MODE_P (TYPE_MODE (vectype))) + return NULL; + + if (check_bool_pattern (var, vinfo, bool_stmts)) +diff -Nurp a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c +--- a/gcc/tree-vect-slp.c 2020-12-20 18:46:17.763633230 +0800 ++++ b/gcc/tree-vect-slp.c 2020-12-20 18:48:11.227633230 +0800 +@@ -606,6 +606,77 @@ again: + return 0; + } + ++/* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization. ++ Return true if we can, meaning that this choice doesn't conflict with ++ existing SLP nodes that use STMT_INFO. */ ++ ++static bool ++vect_update_shared_vectype (stmt_vec_info stmt_info, tree vectype) ++{ ++ tree old_vectype = STMT_VINFO_VECTYPE (stmt_info); ++ if (old_vectype && useless_type_conversion_p (vectype, old_vectype)) ++ return true; ++ ++ if (STMT_VINFO_GROUPED_ACCESS (stmt_info) ++ && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))) ++ { ++ /* We maintain the invariant that if any statement in the group is ++ used, all other members of the group have the same vector type. 
*/ ++ stmt_vec_info first_info = DR_GROUP_FIRST_ELEMENT (stmt_info); ++ stmt_vec_info member_info = first_info; ++ for (; member_info; member_info = DR_GROUP_NEXT_ELEMENT (member_info)) ++ if (STMT_VINFO_NUM_SLP_USES (member_info) > 0 ++ || is_pattern_stmt_p (member_info)) ++ break; ++ ++ if (!member_info) ++ { ++ for (member_info = first_info; member_info; ++ member_info = DR_GROUP_NEXT_ELEMENT (member_info)) ++ STMT_VINFO_VECTYPE (member_info) = vectype; ++ return true; ++ } ++ } ++ else if (STMT_VINFO_NUM_SLP_USES (stmt_info) == 0 ++ && !is_pattern_stmt_p (stmt_info)) ++ { ++ STMT_VINFO_VECTYPE (stmt_info) = vectype; ++ return true; ++ } ++ ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, ++ "Build SLP failed: incompatible vector" ++ " types for: %G", stmt_info->stmt); ++ dump_printf_loc (MSG_NOTE, vect_location, ++ " old vector type: %T\n", old_vectype); ++ dump_printf_loc (MSG_NOTE, vect_location, ++ " new vector type: %T\n", vectype); ++ } ++ return false; ++} ++ ++/* Try to infer and assign a vector type to all the statements in STMTS. ++ Used only for BB vectorization. */ ++ ++static bool ++vect_update_all_shared_vectypes (vec stmts) ++{ ++ tree vectype, nunits_vectype; ++ if (!vect_get_vector_types_for_stmt (stmts[0], &vectype, ++ &nunits_vectype, stmts.length ())) ++ return false; ++ ++ stmt_vec_info stmt_info; ++ unsigned int i; ++ FOR_EACH_VEC_ELT (stmts, i, stmt_info) ++ if (!vect_update_shared_vectype (stmt_info, vectype)) ++ return false; ++ ++ return true; ++} ++ + /* Return true if call statements CALL1 and CALL2 are similar enough + to be combined into the same SLP group. */ + +@@ -751,6 +822,7 @@ vect_build_slp_tree_1 (unsigned char *sw + stmt_vec_info stmt_info; + FOR_EACH_VEC_ELT (stmts, i, stmt_info) + { ++ vec_info *vinfo = stmt_info->vinfo; + gimple *stmt = stmt_info->stmt; + swap[i] = 0; + matches[i] = false; +@@ -784,7 +856,7 @@ vect_build_slp_tree_1 (unsigned char *sw + + tree nunits_vectype; + if (!vect_get_vector_types_for_stmt (stmt_info, &vectype, +- &nunits_vectype) ++ &nunits_vectype, group_size) + || (nunits_vectype + && !vect_record_max_nunits (stmt_info, group_size, + nunits_vectype, max_nunits))) +@@ -796,6 +868,10 @@ vect_build_slp_tree_1 (unsigned char *sw + + gcc_assert (vectype); + ++ if (is_a (vinfo) ++ && !vect_update_shared_vectype (stmt_info, vectype)) ++ continue; ++ + if (gcall *call_stmt = dyn_cast (stmt)) + { + rhs_code = CALL_EXPR; +@@ -1328,7 +1404,8 @@ vect_build_slp_tree_2 (vec_info *vinfo, + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild) + if (SLP_TREE_DEF_TYPE (grandchild) != vect_external_def) + break; +- if (!grandchild) ++ if (!grandchild ++ && vect_update_all_shared_vectypes (oprnd_info->def_stmts)) + { + /* Roll back. */ + this_tree_size = old_tree_size; +@@ -1369,7 +1446,8 @@ vect_build_slp_tree_2 (vec_info *vinfo, + do extra work to cancel the pattern so the uses see the + scalar version. */ + && !is_pattern_stmt_p (stmt_info) +- && !oprnd_info->any_pattern) ++ && !oprnd_info->any_pattern ++ && vect_update_all_shared_vectypes (oprnd_info->def_stmts)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, +@@ -1488,7 +1566,9 @@ vect_build_slp_tree_2 (vec_info *vinfo, + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild) + if (SLP_TREE_DEF_TYPE (grandchild) != vect_external_def) + break; +- if (!grandchild) ++ if (!grandchild ++ && (vect_update_all_shared_vectypes ++ (oprnd_info->def_stmts))) + { + /* Roll back. 
*/ + this_tree_size = old_tree_size; +@@ -2026,8 +2106,8 @@ vect_analyze_slp_instance (vec_info *vin + if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) + { + scalar_type = TREE_TYPE (DR_REF (dr)); +- vectype = get_vectype_for_scalar_type (vinfo, scalar_type); + group_size = DR_GROUP_SIZE (stmt_info); ++ vectype = get_vectype_for_scalar_type (vinfo, scalar_type, group_size); + } + else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info)) + { +@@ -2669,22 +2749,13 @@ vect_slp_analyze_node_operations_1 (vec_ + Memory accesses already got their vector type assigned + in vect_analyze_data_refs. */ + bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info); +- if (bb_vinfo +- && ! STMT_VINFO_DATA_REF (stmt_info)) ++ if (bb_vinfo && STMT_VINFO_VECTYPE (stmt_info) == boolean_type_node) + { +- tree vectype, nunits_vectype; +- if (!vect_get_vector_types_for_stmt (stmt_info, &vectype, +- &nunits_vectype)) +- /* We checked this when building the node. */ +- gcc_unreachable (); +- if (vectype == boolean_type_node) +- { +- vectype = vect_get_mask_type_for_stmt (stmt_info); +- if (!vectype) +- /* vect_get_mask_type_for_stmt has already explained the +- failure. */ +- return false; +- } ++ tree vectype = vect_get_mask_type_for_stmt (stmt_info, node); ++ if (!vectype) ++ /* vect_get_mask_type_for_stmt has already explained the ++ failure. */ ++ return false; + + stmt_vec_info sstmt_info; + unsigned int i; +@@ -3585,7 +3656,7 @@ vect_get_constant_vectors (slp_tree op_n + && vect_mask_constant_operand_p (stmt_vinfo)) + vector_type = truth_type_for (stmt_vectype); + else +- vector_type = get_vectype_for_scalar_type (vinfo, TREE_TYPE (op)); ++ vector_type = get_vectype_for_scalar_type (vinfo, TREE_TYPE (op), op_node); + + unsigned int number_of_vectors + = vect_get_num_vectors (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) +diff -Nurp a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c +--- a/gcc/tree-vect-stmts.c 2020-12-20 18:46:17.707633230 +0800 ++++ b/gcc/tree-vect-stmts.c 2020-12-20 18:48:11.227633230 +0800 +@@ -798,7 +798,7 @@ vect_prologue_cost_for_slp_op (slp_tree + /* Without looking at the actual initializer a vector of + constants can be implemented as load from the constant pool. + When all elements are the same we can use a splat. */ +- tree vectype = get_vectype_for_scalar_type (vinfo, TREE_TYPE (op)); ++ tree vectype = get_vectype_for_scalar_type (vinfo, TREE_TYPE (op), node); + unsigned group_size = SLP_TREE_SCALAR_STMTS (node).length (); + unsigned num_vects_to_check; + unsigned HOST_WIDE_INT const_nunits; +@@ -3308,7 +3308,7 @@ vectorizable_call (stmt_vec_info stmt_in + /* If all arguments are external or constant defs, infer the vector type + from the scalar type. */ + if (!vectype_in) +- vectype_in = get_vectype_for_scalar_type (vinfo, rhs_type); ++ vectype_in = get_vectype_for_scalar_type (vinfo, rhs_type, slp_node); + if (vec_stmt) + gcc_assert (vectype_in); + if (!vectype_in) +@@ -4106,7 +4106,8 @@ vectorizable_simd_clone_call (stmt_vec_i + && bestn->simdclone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR) + { + tree arg_type = TREE_TYPE (gimple_call_arg (stmt, i)); +- arginfo[i].vectype = get_vectype_for_scalar_type (vinfo, arg_type); ++ arginfo[i].vectype = get_vectype_for_scalar_type (vinfo, arg_type, ++ slp_node); + if (arginfo[i].vectype == NULL + || (simd_clone_subparts (arginfo[i].vectype) + > bestn->simdclone->simdlen)) +@@ -4805,7 +4806,7 @@ vectorizable_conversion (stmt_vec_info s + /* If op0 is an external or constant def, infer the vector type + from the scalar type. 
*/ + if (!vectype_in) +- vectype_in = get_vectype_for_scalar_type (vinfo, rhs_type); ++ vectype_in = get_vectype_for_scalar_type (vinfo, rhs_type, slp_node); + if (vec_stmt) + gcc_assert (vectype_in); + if (!vectype_in) +@@ -5558,7 +5559,7 @@ vectorizable_shift (stmt_vec_info stmt_i + /* If op0 is an external or constant def, infer the vector type + from the scalar type. */ + if (!vectype) +- vectype = get_vectype_for_scalar_type (vinfo, TREE_TYPE (op0)); ++ vectype = get_vectype_for_scalar_type (vinfo, TREE_TYPE (op0), slp_node); + if (vec_stmt) + gcc_assert (vectype); + if (!vectype) +@@ -5656,7 +5657,8 @@ vectorizable_shift (stmt_vec_info stmt_i + "vector/vector shift/rotate found.\n"); + + if (!op1_vectype) +- op1_vectype = get_vectype_for_scalar_type (vinfo, TREE_TYPE (op1)); ++ op1_vectype = get_vectype_for_scalar_type (vinfo, TREE_TYPE (op1), ++ slp_node); + incompatible_op1_vectype_p + = (op1_vectype == NULL_TREE + || maybe_ne (TYPE_VECTOR_SUBPARTS (op1_vectype), +@@ -6000,7 +6002,8 @@ vectorizable_operation (stmt_vec_info st + vectype = vectype_out; + } + else +- vectype = get_vectype_for_scalar_type (vinfo, TREE_TYPE (op0)); ++ vectype = get_vectype_for_scalar_type (vinfo, TREE_TYPE (op0), ++ slp_node); + } + if (vec_stmt) + gcc_assert (vectype); +@@ -8903,7 +8906,7 @@ vectorizable_load (stmt_vec_info stmt_in + condition operands are supportable using vec_is_simple_use. */ + + static bool +-vect_is_simple_cond (tree cond, vec_info *vinfo, ++vect_is_simple_cond (tree cond, vec_info *vinfo, slp_tree slp_node, + tree *comp_vectype, enum vect_def_type *dts, + tree vectype) + { +@@ -8966,7 +8969,8 @@ vect_is_simple_cond (tree cond, vec_info + scalar_type = build_nonstandard_integer_type + (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype))), + TYPE_UNSIGNED (scalar_type)); +- *comp_vectype = get_vectype_for_scalar_type (vinfo, scalar_type); ++ *comp_vectype = get_vectype_for_scalar_type (vinfo, scalar_type, ++ slp_node); + } + + return true; +@@ -9073,7 +9077,7 @@ vectorizable_condition (stmt_vec_info st + then_clause = gimple_assign_rhs2 (stmt); + else_clause = gimple_assign_rhs3 (stmt); + +- if (!vect_is_simple_cond (cond_expr, stmt_info->vinfo, ++ if (!vect_is_simple_cond (cond_expr, stmt_info->vinfo, slp_node, + &comp_vectype, &dts[0], slp_node ? NULL : vectype) + || !comp_vectype) + return false; +@@ -9564,7 +9568,8 @@ vectorizable_comparison (stmt_vec_info s + /* Invariant comparison. */ + if (!vectype) + { +- vectype = get_vectype_for_scalar_type (vinfo, TREE_TYPE (rhs1)); ++ vectype = get_vectype_for_scalar_type (vinfo, TREE_TYPE (rhs1), ++ slp_node); + if (maybe_ne (TYPE_VECTOR_SUBPARTS (vectype), nunits)) + return false; + } +@@ -10322,31 +10327,93 @@ get_related_vectype_for_scalar_type (mac + /* Function get_vectype_for_scalar_type. + + Returns the vector type corresponding to SCALAR_TYPE as supported +- by the target. */ ++ by the target. If GROUP_SIZE is nonzero and we're performing BB ++ vectorization, make sure that the number of elements in the vector ++ is no bigger than GROUP_SIZE. */ + + tree +-get_vectype_for_scalar_type (vec_info *vinfo, tree scalar_type) ++get_vectype_for_scalar_type (vec_info *vinfo, tree scalar_type, ++ unsigned int group_size) + { ++ /* For BB vectorization, we should always have a group size once we've ++ constructed the SLP tree; the only valid uses of zero GROUP_SIZEs ++ are tentative requests during things like early data reference ++ analysis and pattern recognition. 
*/ ++ if (is_a (vinfo)) ++ gcc_assert (vinfo->slp_instances.is_empty () || group_size != 0); ++ else ++ group_size = 0; ++ + tree vectype = get_related_vectype_for_scalar_type (vinfo->vector_mode, + scalar_type); + if (vectype && vinfo->vector_mode == VOIDmode) + vinfo->vector_mode = TYPE_MODE (vectype); + ++ /* Register the natural choice of vector type, before the group size ++ has been applied. */ + if (vectype) + vinfo->used_vector_modes.add (TYPE_MODE (vectype)); + ++ /* If the natural choice of vector type doesn't satisfy GROUP_SIZE, ++ try again with an explicit number of elements. */ ++ if (vectype ++ && group_size ++ && maybe_ge (TYPE_VECTOR_SUBPARTS (vectype), group_size)) ++ { ++ /* Start with the biggest number of units that fits within ++ GROUP_SIZE and halve it until we find a valid vector type. ++ Usually either the first attempt will succeed or all will ++ fail (in the latter case because GROUP_SIZE is too small ++ for the target), but it's possible that a target could have ++ a hole between supported vector types. ++ ++ If GROUP_SIZE is not a power of 2, this has the effect of ++ trying the largest power of 2 that fits within the group, ++ even though the group is not a multiple of that vector size. ++ The BB vectorizer will then try to carve up the group into ++ smaller pieces. */ ++ unsigned int nunits = 1 << floor_log2 (group_size); ++ do ++ { ++ vectype = get_related_vectype_for_scalar_type (vinfo->vector_mode, ++ scalar_type, nunits); ++ nunits /= 2; ++ } ++ while (nunits > 1 && !vectype); ++ } ++ + return vectype; + } + ++/* Return the vector type corresponding to SCALAR_TYPE as supported ++ by the target. NODE, if nonnull, is the SLP tree node that will ++ use the returned vector type. */ ++ ++tree ++get_vectype_for_scalar_type (vec_info *vinfo, tree scalar_type, slp_tree node) ++{ ++ unsigned int group_size = 0; ++ if (node) ++ { ++ group_size = SLP_TREE_SCALAR_OPS (node).length (); ++ if (group_size == 0) ++ group_size = SLP_TREE_SCALAR_STMTS (node).length (); ++ } ++ return get_vectype_for_scalar_type (vinfo, scalar_type, group_size); ++} ++ + /* Function get_mask_type_for_scalar_type. + + Returns the mask type corresponding to a result of comparison +- of vectors of specified SCALAR_TYPE as supported by target. */ ++ of vectors of specified SCALAR_TYPE as supported by target. ++ NODE, if nonnull, is the SLP tree node that will use the returned ++ vector type. */ + + tree +-get_mask_type_for_scalar_type (vec_info *vinfo, tree scalar_type) ++get_mask_type_for_scalar_type (vec_info *vinfo, tree scalar_type, ++ slp_tree node) + { +- tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type); ++ tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type, node); + + if (!vectype) + return NULL; +@@ -11033,6 +11100,9 @@ vect_gen_while_not (gimple_seq *seq, tre + + /* Try to compute the vector types required to vectorize STMT_INFO, + returning true on success and false if vectorization isn't possible. ++ If GROUP_SIZE is nonzero and we're performing BB vectorization, ++ take sure that the number of elements in the vectors is no bigger ++ than GROUP_SIZE. 
+ + On success: + +@@ -11050,11 +11120,21 @@ vect_gen_while_not (gimple_seq *seq, tre + opt_result + vect_get_vector_types_for_stmt (stmt_vec_info stmt_info, + tree *stmt_vectype_out, +- tree *nunits_vectype_out) ++ tree *nunits_vectype_out, ++ unsigned int group_size) + { + vec_info *vinfo = stmt_info->vinfo; + gimple *stmt = stmt_info->stmt; + ++ /* For BB vectorization, we should always have a group size once we've ++ constructed the SLP tree; the only valid uses of zero GROUP_SIZEs ++ are tentative requests during things like early data reference ++ analysis and pattern recognition. */ ++ if (is_a (vinfo)) ++ gcc_assert (vinfo->slp_instances.is_empty () || group_size != 0); ++ else ++ group_size = 0; ++ + *stmt_vectype_out = NULL_TREE; + *nunits_vectype_out = NULL_TREE; + +@@ -11085,7 +11165,7 @@ vect_get_vector_types_for_stmt (stmt_vec + + tree vectype; + tree scalar_type = NULL_TREE; +- if (STMT_VINFO_VECTYPE (stmt_info)) ++ if (group_size == 0 && STMT_VINFO_VECTYPE (stmt_info)) + { + *stmt_vectype_out = vectype = STMT_VINFO_VECTYPE (stmt_info); + if (dump_enabled_p ()) +@@ -11094,15 +11174,17 @@ vect_get_vector_types_for_stmt (stmt_vec + } + else + { +- gcc_assert (!STMT_VINFO_DATA_REF (stmt_info)); +- if (gimple_call_internal_p (stmt, IFN_MASK_STORE)) ++ if (data_reference *dr = STMT_VINFO_DATA_REF (stmt_info)) ++ scalar_type = TREE_TYPE (DR_REF (dr)); ++ else if (gimple_call_internal_p (stmt, IFN_MASK_STORE)) + scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3)); + else + scalar_type = TREE_TYPE (gimple_get_lhs (stmt)); + + /* Pure bool ops don't participate in number-of-units computation. + For comparisons use the types being compared. */ +- if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type) ++ if (!STMT_VINFO_DATA_REF (stmt_info) ++ && VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type) + && is_gimple_assign (stmt) + && gimple_assign_rhs_code (stmt) != COND_EXPR) + { +@@ -11122,9 +11204,16 @@ vect_get_vector_types_for_stmt (stmt_vec + } + + if (dump_enabled_p ()) +- dump_printf_loc (MSG_NOTE, vect_location, +- "get vectype for scalar type: %T\n", scalar_type); +- vectype = get_vectype_for_scalar_type (vinfo, scalar_type); ++ { ++ if (group_size) ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "get vectype for scalar type (group size %d):" ++ " %T\n", group_size, scalar_type); ++ else ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "get vectype for scalar type: %T\n", scalar_type); ++ } ++ vectype = get_vectype_for_scalar_type (vinfo, scalar_type, group_size); + if (!vectype) + return opt_result::failure_at (stmt, + "not vectorized:" +@@ -11155,7 +11244,8 @@ vect_get_vector_types_for_stmt (stmt_vec + dump_printf_loc (MSG_NOTE, vect_location, + "get vectype for smallest scalar type: %T\n", + scalar_type); +- nunits_vectype = get_vectype_for_scalar_type (vinfo, scalar_type); ++ nunits_vectype = get_vectype_for_scalar_type (vinfo, scalar_type, ++ group_size); + if (!nunits_vectype) + return opt_result::failure_at + (stmt, "not vectorized: unsupported data-type %T\n", +@@ -11183,10 +11273,11 @@ vect_get_vector_types_for_stmt (stmt_vec + + /* Try to determine the correct vector type for STMT_INFO, which is a + statement that produces a scalar boolean result. Return the vector +- type on success, otherwise return NULL_TREE. */ ++ type on success, otherwise return NULL_TREE. NODE, if nonnull, ++ is the SLP tree node that will use the returned vector type. 
*/ + + opt_tree +-vect_get_mask_type_for_stmt (stmt_vec_info stmt_info) ++vect_get_mask_type_for_stmt (stmt_vec_info stmt_info, slp_tree node) + { + vec_info *vinfo = stmt_info->vinfo; + gimple *stmt = stmt_info->stmt; +@@ -11198,7 +11289,7 @@ vect_get_mask_type_for_stmt (stmt_vec_in + && !VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (gimple_assign_rhs1 (stmt)))) + { + scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt)); +- mask_type = get_mask_type_for_scalar_type (vinfo, scalar_type); ++ mask_type = get_mask_type_for_scalar_type (vinfo, scalar_type, node); + + if (!mask_type) + return opt_tree::failure_at (stmt, +diff -Nurp a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h +--- a/gcc/tree-vectorizer.h 2020-12-20 18:46:17.851633230 +0800 ++++ b/gcc/tree-vectorizer.h 2020-12-20 18:48:11.227633230 +0800 +@@ -1618,8 +1618,9 @@ extern void vect_update_inits_of_drs (lo + /* In tree-vect-stmts.c. */ + extern tree get_related_vectype_for_scalar_type (machine_mode, tree, + poly_uint64 = 0); +-extern tree get_vectype_for_scalar_type (vec_info *, tree); +-extern tree get_mask_type_for_scalar_type (vec_info *, tree); ++extern tree get_vectype_for_scalar_type (vec_info *, tree, unsigned int = 0); ++extern tree get_vectype_for_scalar_type (vec_info *, tree, slp_tree); ++extern tree get_mask_type_for_scalar_type (vec_info *, tree, slp_tree = 0); + extern tree get_same_sized_vectype (tree, tree); + extern bool vect_chooses_same_modes_p (vec_info *, machine_mode); + extern bool vect_get_loop_mask_type (loop_vec_info); +@@ -1671,8 +1672,8 @@ extern void optimize_mask_stores (struct + extern gcall *vect_gen_while (tree, tree, tree); + extern tree vect_gen_while_not (gimple_seq *, tree, tree, tree); + extern opt_result vect_get_vector_types_for_stmt (stmt_vec_info, tree *, +- tree *); +-extern opt_tree vect_get_mask_type_for_stmt (stmt_vec_info); ++ tree *, unsigned int = 0); ++extern opt_tree vect_get_mask_type_for_stmt (stmt_vec_info, slp_tree = 0); + + /* In tree-vect-data-refs.c. */ + extern bool vect_can_force_dr_alignment_p (const_tree, poly_uint64); diff --git a/Fix-EXTRACT_LAST_REDUCTION-segfault.patch b/Fix-EXTRACT_LAST_REDUCTION-segfault.patch new file mode 100644 index 0000000..aa3b320 --- /dev/null +++ b/Fix-EXTRACT_LAST_REDUCTION-segfault.patch @@ -0,0 +1,82 @@ +This backport contains 2 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + +0001-Fix-EXTRACT_LAST_REDUCTION-handling-of-pattern-stmts.patch +9ec35478ccf0f3539988a054b7996278706a7710 + +0001-Fix-EXTRACT_LAST_REDUCTION-segfault.patch +dc176c3ccd6a8cd3f809f3c1549ad00674061eb5 + +diff -Nurp a/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-6.c b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-6.c +--- a/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-6.c 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-6.c 2020-12-14 21:16:26.492000000 -0500 +@@ -0,0 +1,10 @@ ++/* { dg-do compile } */ ++ ++int ++f (int *y) ++{ ++ int res = 0; ++ for (int i = 0; i < 100; ++i) ++ res = (y[i] & 1) == 0 && (y[i] < 10) ? 
res : 1; ++ return res; ++} +diff -Nurp a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c +--- a/gcc/tree-vect-stmts.c 2020-12-14 21:15:27.004000000 -0500 ++++ b/gcc/tree-vect-stmts.c 2020-12-14 21:16:26.492000000 -0500 +@@ -1777,9 +1777,10 @@ vect_finish_stmt_generation_1 (stmt_vec_ + stmt_vec_info + vect_finish_replace_stmt (stmt_vec_info stmt_info, gimple *vec_stmt) + { +- gcc_assert (gimple_get_lhs (stmt_info->stmt) == gimple_get_lhs (vec_stmt)); ++ gimple *scalar_stmt = vect_orig_stmt (stmt_info)->stmt; ++ gcc_assert (gimple_get_lhs (scalar_stmt) == gimple_get_lhs (vec_stmt)); + +- gimple_stmt_iterator gsi = gsi_for_stmt (stmt_info->stmt); ++ gimple_stmt_iterator gsi = gsi_for_stmt (scalar_stmt); + gsi_replace (&gsi, vec_stmt, true); + + return vect_finish_stmt_generation_1 (stmt_info, vec_stmt); +@@ -9118,10 +9119,12 @@ vectorizable_condition (stmt_vec_info st + if (new_code == ERROR_MARK) + must_invert_cmp_result = true; + else +- cond_code = new_code; ++ { ++ cond_code = new_code; ++ /* Make sure we don't accidentally use the old condition. */ ++ cond_expr = NULL_TREE; ++ } + } +- /* Make sure we don't accidentally use the old condition. */ +- cond_expr = NULL_TREE; + std::swap (then_clause, else_clause); + } + +@@ -9426,20 +9429,21 @@ vectorizable_condition (stmt_vec_info st + vect_finish_stmt_generation (stmt_info, new_stmt, gsi); + vec_compare = vec_compare_name; + } ++ gimple *old_stmt = vect_orig_stmt (stmt_info)->stmt; ++ tree lhs = gimple_get_lhs (old_stmt); + gcall *new_stmt = gimple_build_call_internal + (IFN_FOLD_EXTRACT_LAST, 3, else_clause, vec_compare, + vec_then_clause); +- gimple_call_set_lhs (new_stmt, scalar_dest); +- SSA_NAME_DEF_STMT (scalar_dest) = new_stmt; +- if (stmt_info->stmt == gsi_stmt (*gsi)) ++ gimple_call_set_lhs (new_stmt, lhs); ++ SSA_NAME_DEF_STMT (lhs) = new_stmt; ++ if (old_stmt == gsi_stmt (*gsi)) + new_stmt_info = vect_finish_replace_stmt (stmt_info, new_stmt); + else + { + /* In this case we're moving the definition to later in the + block. That doesn't matter because the only uses of the + lhs are in phi statements. */ +- gimple_stmt_iterator old_gsi +- = gsi_for_stmt (stmt_info->stmt); ++ gimple_stmt_iterator old_gsi = gsi_for_stmt (old_stmt); + gsi_remove (&old_gsi, true); + new_stmt_info + = vect_finish_stmt_generation (stmt_info, new_stmt, gsi); diff --git a/Fix-up-push_partial_def-little-endian-bitfield.patch b/Fix-up-push_partial_def-little-endian-bitfield.patch new file mode 100644 index 0000000..b707a36 --- /dev/null +++ b/Fix-up-push_partial_def-little-endian-bitfield.patch @@ -0,0 +1,51 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. 
+ +c69325a5db450dbac198f76f1162734af05a1061 +0001-sccvn-Fix-up-push_partial_def-little-endian-bitfield.patch + +diff -urpN a/gcc/testsuite/gcc.c-torture/execute/pr97764.c b/gcc/testsuite/gcc.c-torture/execute/pr97764.c +--- a/gcc/testsuite/gcc.c-torture/execute/pr97764.c 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/testsuite/gcc.c-torture/execute/pr97764.c 2020-12-07 03:42:13.404000000 -0500 +@@ -0,0 +1,14 @@ ++/* PR tree-optimization/97764 */ ++/* { dg-require-effective-target int32plus } */ ++ ++struct S { int b : 3; int c : 28; int d : 1; }; ++ ++int ++main () ++{ ++ struct S e = {}; ++ e.c = -1; ++ if (e.d) ++ __builtin_abort (); ++ return 0; ++} +diff -urpN a/gcc/tree-ssa-sccvn.c b/gcc/tree-ssa-sccvn.c +--- a/gcc/tree-ssa-sccvn.c 2020-12-07 03:43:37.792000000 -0500 ++++ b/gcc/tree-ssa-sccvn.c 2020-12-07 03:42:13.404000000 -0500 +@@ -2013,12 +2013,12 @@ vn_walk_cb_data::push_partial_def (const + } + else + { +- size = MIN (size, (HOST_WIDE_INT) needed_len * BITS_PER_UNIT); + if (pd.offset >= 0) + { + /* LSB of this_buffer[0] byte should be at pd.offset bits + in buffer. */ + unsigned int msk; ++ size = MIN (size, (HOST_WIDE_INT) needed_len * BITS_PER_UNIT); + amnt = pd.offset % BITS_PER_UNIT; + if (amnt) + shift_bytes_in_array_left (this_buffer, len + 1, amnt); +@@ -2046,6 +2046,9 @@ vn_walk_cb_data::push_partial_def (const + { + amnt = (unsigned HOST_WIDE_INT) pd.offset % BITS_PER_UNIT; + if (amnt) ++ size -= BITS_PER_UNIT - amnt; ++ size = MIN (size, (HOST_WIDE_INT) needed_len * BITS_PER_UNIT); ++ if (amnt) + shift_bytes_in_array_left (this_buffer, len + 1, amnt); + } + memcpy (p, this_buffer + (amnt != 0), size / BITS_PER_UNIT); diff --git a/Fix-zero-masking-for-vcvtps2ph.patch b/Fix-zero-masking-for-vcvtps2ph.patch new file mode 100644 index 0000000..df8c5a8 --- /dev/null +++ b/Fix-zero-masking-for-vcvtps2ph.patch @@ -0,0 +1,139 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. 
+ +0001-Fix-zero-masking-for-vcvtps2ph-when-dest-operand-is-.patch +43088bb4dadd3d14b6b594c5f9363fe879f3d7f7 + +diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md +index 87354451c58..7815d77bcbf 100644 +--- a/gcc/config/i386/sse.md ++++ b/gcc/config/i386/sse.md +@@ -21775,19 +21775,19 @@ + (set_attr "prefix" "maybe_evex") + (set_attr "mode" "V4SF")]) + +-(define_insn "*vcvtps2ph_store" ++(define_insn "*vcvtps2ph_store" + [(set (match_operand:V4HI 0 "memory_operand" "=m") + (unspec:V4HI [(match_operand:V4SF 1 "register_operand" "v") + (match_operand:SI 2 "const_0_to_255_operand" "N")] + UNSPEC_VCVTPS2PH))] + "TARGET_F16C || TARGET_AVX512VL" +- "vcvtps2ph\t{%2, %1, %0|%0, %1, %2}" ++ "vcvtps2ph\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "maybe_evex") + (set_attr "mode" "V4SF")]) + + (define_insn "vcvtps2ph256" +- [(set (match_operand:V8HI 0 "nonimmediate_operand" "=vm") ++ [(set (match_operand:V8HI 0 "register_operand" "=v") + (unspec:V8HI [(match_operand:V8SF 1 "register_operand" "v") + (match_operand:SI 2 "const_0_to_255_operand" "N")] + UNSPEC_VCVTPS2PH))] +@@ -21798,8 +21798,20 @@ + (set_attr "btver2_decode" "vector") + (set_attr "mode" "V8SF")]) + ++(define_insn "*vcvtps2ph256" ++ [(set (match_operand:V8HI 0 "memory_operand" "=m") ++ (unspec:V8HI [(match_operand:V8SF 1 "register_operand" "v") ++ (match_operand:SI 2 "const_0_to_255_operand" "N")] ++ UNSPEC_VCVTPS2PH))] ++ "TARGET_F16C || TARGET_AVX512VL" ++ "vcvtps2ph\t{%2, %1, %0|%0, %1, %2}" ++ [(set_attr "type" "ssecvt") ++ (set_attr "prefix" "maybe_evex") ++ (set_attr "btver2_decode" "vector") ++ (set_attr "mode" "V8SF")]) ++ + (define_insn "avx512f_vcvtps2ph512" +- [(set (match_operand:V16HI 0 "nonimmediate_operand" "=vm") ++ [(set (match_operand:V16HI 0 "register_operand" "=v") + (unspec:V16HI + [(match_operand:V16SF 1 "register_operand" "v") + (match_operand:SI 2 "const_0_to_255_operand" "N")] +@@ -21810,6 +21822,18 @@ + (set_attr "prefix" "evex") + (set_attr "mode" "V16SF")]) + ++(define_insn "*avx512f_vcvtps2ph512" ++ [(set (match_operand:V16HI 0 "memory_operand" "=m") ++ (unspec:V16HI ++ [(match_operand:V16SF 1 "register_operand" "v") ++ (match_operand:SI 2 "const_0_to_255_operand" "N")] ++ UNSPEC_VCVTPS2PH))] ++ "TARGET_AVX512F" ++ "vcvtps2ph\t{%2, %1, %0|%0, %1, %2}" ++ [(set_attr "type" "ssecvt") ++ (set_attr "prefix" "evex") ++ (set_attr "mode" "V16SF")]) ++ + ;; For gather* insn patterns + (define_mode_iterator VEC_GATHER_MODE + [V2DI V2DF V4DI V4DF V4SI V4SF V8SI V8SF]) +diff --git a/gcc/config/i386/subst.md b/gcc/config/i386/subst.md +index a5ca144c7f7..58ea9dc83e2 100644 +--- a/gcc/config/i386/subst.md ++++ b/gcc/config/i386/subst.md +@@ -73,6 +73,18 @@ + (match_operand:SUBST_V 2 "nonimm_or_0_operand" "0C") + (match_operand: 3 "register_operand" "Yk")))]) + ++(define_subst_attr "merge_mask_name" "merge_mask" "" "_merge_mask") ++(define_subst_attr "merge_mask_operand3" "merge_mask" "" "%{%3%}") ++(define_subst "merge_mask" ++ [(set (match_operand:SUBST_V 0) ++ (match_operand:SUBST_V 1))] ++ "TARGET_AVX512F" ++ [(set (match_dup 0) ++ (vec_merge:SUBST_V ++ (match_dup 1) ++ (match_dup 0) ++ (match_operand: 2 "register_operand" "Yk")))]) ++ + (define_subst_attr "mask_scalar_merge_name" "mask_scalar_merge" "" "_mask") + (define_subst_attr "mask_scalar_merge_operand3" "mask_scalar_merge" "" "%{%3%}") + (define_subst_attr "mask_scalar_merge_operand4" "mask_scalar_merge" "" "%{%4%}") +diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vcvtps2ph-pr95254.c 
b/gcc/testsuite/gcc.target/i386/avx512f-vcvtps2ph-pr95254.c +new file mode 100644 +index 00000000000..9e0da947368 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/avx512f-vcvtps2ph-pr95254.c +@@ -0,0 +1,12 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -mavx512f" } */ ++ ++#include ++extern __m256i res; ++void ++foo (__m512 a, __mmask16 m) ++{ ++ res = _mm512_maskz_cvtps_ph (m, a, 10); ++} ++ ++/* { dg-final { scan-assembler-not "vcvtps2ph\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]\[^\n\]*res\[^\n\]*\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)"} } */ +diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vcvtps2ph-pr95254.c b/gcc/testsuite/gcc.target/i386/avx512vl-vcvtps2ph-pr95254.c +new file mode 100644 +index 00000000000..0c685ea66fd +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/avx512vl-vcvtps2ph-pr95254.c +@@ -0,0 +1,18 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -mavx512vl -mavx512f" } */ ++ ++#include ++extern __m128i res; ++void ++foo (__m256 a, __mmask8 m) ++{ ++ res = _mm256_maskz_cvtps_ph (m, a, 10); ++} ++ ++void ++foo1 (__m128 a, __mmask8 m) ++{ ++ res = _mm_maskz_cvtps_ph (m, a, 10); ++} ++ ++/* { dg-final { scan-assembler-not "vcvtps2ph\[ \\t\]+\[^\{\n\]*%\[xy\]mm\[0-9\]\[^\n\]*res\[^\n\]*\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)"} } */ diff --git a/IRA-Handle-fully-tied-destinations.patch b/IRA-Handle-fully-tied-destinations.patch new file mode 100644 index 0000000..ad181cd --- /dev/null +++ b/IRA-Handle-fully-tied-destinations.patch @@ -0,0 +1,155 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + +0001-IRA-Handle-fully-tied-destinations-in-a-similar-way-.patch +9b0365879b3c4917f5a2485a1fca8bb678484bfe + +diff --git a/gcc/ira-lives.c b/gcc/ira-lives.c +index cce73a1c3d4..098b0e73953 100644 +--- a/gcc/ira-lives.c ++++ b/gcc/ira-lives.c +@@ -633,9 +633,28 @@ check_and_make_def_use_conflict (rtx dreg, rtx orig_dreg, + + /* Check and make if necessary conflicts for definition DEF of class + DEF_CL of the current insn with input operands. Process only +- constraints of alternative ALT. */ ++ constraints of alternative ALT. ++ ++ One of three things is true when this function is called: ++ ++ (1) DEF is an earlyclobber for alternative ALT. Input operands then ++ conflict with DEF in ALT unless they explicitly match DEF via 0-9 ++ constraints. ++ ++ (2) DEF matches (via 0-9 constraints) an operand that is an ++ earlyclobber for alternative ALT. Other input operands then ++ conflict with DEF in ALT. ++ ++ (3) [FOR_TIE_P] Some input operand X matches DEF for alternative ALT. ++ Input operands with a different value from X then conflict with ++ DEF in ALT. ++ ++ However, there's still a judgement call to make when deciding ++ whether a conflict in ALT is important enough to be reflected ++ in the pan-alternative allocno conflict set. */ + static void +-check_and_make_def_conflict (int alt, int def, enum reg_class def_cl) ++check_and_make_def_conflict (int alt, int def, enum reg_class def_cl, ++ bool for_tie_p) + { + int use, use_match; + ira_allocno_t a; +@@ -669,14 +688,40 @@ check_and_make_def_conflict (int alt, int def, enum reg_class def_cl) + if (use == def || recog_data.operand_type[use] == OP_OUT) + continue; + ++ /* An earlyclobber on DEF doesn't apply to an input operand X if X ++ explicitly matches DEF, but it applies to other input operands ++ even if they happen to be the same value as X. 
++ ++ In contrast, if an input operand X is tied to a non-earlyclobber ++ DEF, there's no conflict with other input operands that have the ++ same value as X. */ ++ if (op_alt[use].matches == def ++ || (for_tie_p ++ && rtx_equal_p (recog_data.operand[use], ++ recog_data.operand[op_alt[def].matched]))) ++ continue; ++ + if (op_alt[use].anything_ok) + use_cl = ALL_REGS; + else + use_cl = op_alt[use].cl; ++ if (use_cl == NO_REGS) ++ continue; ++ ++ /* If DEF is simply a tied operand, ignore cases in which this ++ alternative requires USE to have a likely-spilled class. ++ Adding a conflict would just constrain USE further if DEF ++ happens to be allocated first. */ ++ if (for_tie_p && targetm.class_likely_spilled_p (use_cl)) ++ continue; + + /* If there's any alternative that allows USE to match DEF, do not + record a conflict. If that causes us to create an invalid +- instruction due to the earlyclobber, reload must fix it up. */ ++ instruction due to the earlyclobber, reload must fix it up. ++ ++ Likewise, if we're treating a tied DEF like a partial earlyclobber, ++ do not record a conflict if there's another alternative in which ++ DEF is neither tied nor earlyclobber. */ + for (alt1 = 0; alt1 < recog_data.n_alternatives; alt1++) + { + if (!TEST_BIT (preferred_alternatives, alt1)) +@@ -691,6 +736,12 @@ check_and_make_def_conflict (int alt, int def, enum reg_class def_cl) + && recog_data.constraints[use - 1][0] == '%' + && op_alt1[use - 1].matches == def)) + break; ++ if (for_tie_p ++ && !op_alt1[def].earlyclobber ++ && op_alt1[def].matched < 0 ++ && alternative_class (op_alt1, def) != NO_REGS ++ && alternative_class (op_alt1, use) != NO_REGS) ++ break; + } + + if (alt1 < recog_data.n_alternatives) +@@ -701,8 +752,7 @@ check_and_make_def_conflict (int alt, int def, enum reg_class def_cl) + + if ((use_match = op_alt[use].matches) >= 0) + { +- if (use_match == def) +- continue; ++ gcc_checking_assert (use_match != def); + + if (op_alt[use_match].anything_ok) + use_cl = ALL_REGS; +@@ -717,7 +767,11 @@ check_and_make_def_conflict (int alt, int def, enum reg_class def_cl) + /* Make conflicts of early clobber pseudo registers of the current + insn with its inputs. Avoid introducing unnecessary conflicts by + checking classes of the constraints and pseudos because otherwise +- significant code degradation is possible for some targets. */ ++ significant code degradation is possible for some targets. ++ ++ For these purposes, tying an input to an output makes that output act ++ like an earlyclobber for inputs with a different value, since the output ++ register then has a predetermined purpose on input to the instruction. 
*/ + static void + make_early_clobber_and_input_conflicts (void) + { +@@ -732,15 +786,19 @@ make_early_clobber_and_input_conflicts (void) + if (TEST_BIT (preferred_alternatives, alt)) + for (def = 0; def < n_operands; def++) + { +- def_cl = NO_REGS; +- if (op_alt[def].earlyclobber) ++ if (op_alt[def].anything_ok) ++ def_cl = ALL_REGS; ++ else ++ def_cl = op_alt[def].cl; ++ if (def_cl != NO_REGS) + { +- if (op_alt[def].anything_ok) +- def_cl = ALL_REGS; +- else +- def_cl = op_alt[def].cl; +- check_and_make_def_conflict (alt, def, def_cl); ++ if (op_alt[def].earlyclobber) ++ check_and_make_def_conflict (alt, def, def_cl, false); ++ else if (op_alt[def].matched >= 0 ++ && !targetm.class_likely_spilled_p (def_cl)) ++ check_and_make_def_conflict (alt, def, def_cl, true); + } ++ + if ((def_match = op_alt[def].matches) >= 0 + && (op_alt[def_match].earlyclobber + || op_alt[def].earlyclobber)) +@@ -749,7 +807,7 @@ make_early_clobber_and_input_conflicts (void) + def_cl = ALL_REGS; + else + def_cl = op_alt[def_match].cl; +- check_and_make_def_conflict (alt, def, def_cl); ++ check_and_make_def_conflict (alt, def, def_cl, false); + } + } + } diff --git a/SLP-VECT-Add-check-to-fix-96837.patch b/SLP-VECT-Add-check-to-fix-96837.patch new file mode 100644 index 0000000..bfc60bc --- /dev/null +++ b/SLP-VECT-Add-check-to-fix-96837.patch @@ -0,0 +1,99 @@ +This backport contains 2 patchs from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + +97b798d80baf945ea28236eef3fa69f36626b579 +0001-SLP-VECT-Add-check-to-fix-96837.patch + +373b99dc40949efa697326f378e5022a02e0328b +0002-Add-a-testcase-for-PR-target-96827.patch + +diff -uprN a/gcc/testsuite/gcc.dg/vect/bb-slp-49.c b/gcc/testsuite/gcc.dg/vect/bb-slp-49.c +--- a/gcc/testsuite/gcc.dg/vect/bb-slp-49.c 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/gcc.dg/vect/bb-slp-49.c 2020-11-17 15:58:12.118126065 +0800 +@@ -0,0 +1,28 @@ ++/* This checks that vectorized constructors have the correct ordering. */ ++/* { dg-require-effective-target vect_int } */ ++ ++typedef int V __attribute__((__vector_size__(16))); ++ ++__attribute__((__noipa__)) void ++foo (unsigned int x, V *y) ++{ ++ unsigned int a[4] = { x + 0, x + 2, x + 4, x + 6 }; ++ for (unsigned int i = 0; i < 3; ++i) ++ if (a[i] == 1234) ++ a[i]--; ++ *y = (V) { a[3], a[2], a[1], a[0] }; ++} ++ ++int ++main () ++{ ++ V b; ++ foo (0, &b); ++ if (b[0] != 6 || b[1] != 4 || b[2] != 2 || b[3] != 0) ++ __builtin_abort (); ++ return 0; ++} ++ ++/* See that we vectorize an SLP instance. 
*/ ++/* { dg-final { scan-tree-dump "Analyzing vectorizable constructor" "slp1" } } */ ++/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "slp1" } } */ +diff -uprN a/gcc/testsuite/gcc.target/i386/pr96827.c b/gcc/testsuite/gcc.target/i386/pr96827.c +--- a/gcc/testsuite/gcc.target/i386/pr96827.c 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/gcc.target/i386/pr96827.c 2020-11-17 15:58:15.182126065 +0800 +@@ -0,0 +1,41 @@ ++/* { dg-do run { target sse2_runtime } } */ ++/* { dg-options "-O3 -msse2 -mfpmath=sse" } */ ++ ++typedef unsigned short int __uint16_t; ++typedef unsigned int __uint32_t; ++typedef __uint16_t uint16_t; ++typedef __uint32_t uint32_t; ++typedef int __v4si __attribute__ ((__vector_size__ (16))); ++typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__)); ++extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) ++_mm_store_si128 (__m128i *__P, __m128i __B) ++{ ++ *__P = __B; ++} ++extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) ++_mm_set_epi32 (int __q3, int __q2, int __q1, int __q0) ++{ ++ return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 }; ++} ++typedef uint16_t u16; ++typedef uint32_t u32; ++extern int printf (const char *__restrict __format, ...); ++void do_the_thing(u32 idx, __m128i *dude) ++{ ++ u32 dude_[4] = { idx+0, idx+2, idx+4, idx+6 }; ++ for (u32 i = 0; i < 3; ++i) ++ if (dude_[i] == 1234) ++ dude_[i]--; ++ *dude = _mm_set_epi32(dude_[0], dude_[1], dude_[2], dude_[3]); ++} ++int main() ++{ ++ __m128i dude; ++ u32 idx = 0; ++ do_the_thing(idx, &dude); ++ __attribute__((aligned(16))) u32 dude_[4]; ++ _mm_store_si128((__m128i*)dude_, dude); ++ if (!(6 == dude_[0] && 4 == dude_[1] && 2 == dude_[2] && 0 == dude_[3])) ++ __builtin_abort (); ++ return 0; ++} +diff -uprN a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c +--- a/gcc/tree-vect-slp.c 2020-11-17 15:55:57.098126065 +0800 ++++ b/gcc/tree-vect-slp.c 2020-11-17 15:59:25.862126065 +0800 +@@ -1842,7 +1842,8 @@ vect_supported_load_permutation_p (slp_i + /* Reduction (there are no data-refs in the root). + In reduction chain the order of the loads is not important. */ + if (!STMT_VINFO_DATA_REF (stmt_info) +- && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)) ++ && !REDUC_GROUP_FIRST_ELEMENT (stmt_info) ++ && !SLP_INSTANCE_ROOT_STMT (slp_instn)) + vect_attempt_slp_rearrange_stmts (slp_instn); + + /* In basic block vectorization we allow any subchain of an interleaving diff --git a/aarch64-Fix-ash-lr-lshr-mode-3-expanders.patch b/aarch64-Fix-ash-lr-lshr-mode-3-expanders.patch new file mode 100644 index 0000000..e28c8a6 --- /dev/null +++ b/aarch64-Fix-ash-lr-lshr-mode-3-expanders.patch @@ -0,0 +1,165 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. 
+ +7a6588fe65432c0f1a8b5fdefba81700ebf88711 +0001-aarch64-Fix-ash-lr-lshr-mode-3-expanders-PR94488.patch + +diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md +index 24a11fb5040..9f0e2bd1e6f 100644 +--- a/gcc/config/aarch64/aarch64-simd.md ++++ b/gcc/config/aarch64/aarch64-simd.md +@@ -1105,31 +1105,17 @@ + tmp)); + DONE; + } +- else +- { +- operands[2] = force_reg (SImode, operands[2]); +- } +- } +- else if (MEM_P (operands[2])) +- { +- operands[2] = force_reg (SImode, operands[2]); + } + +- if (REG_P (operands[2])) +- { +- rtx tmp = gen_reg_rtx (mode); +- emit_insn (gen_aarch64_simd_dup (tmp, +- convert_to_mode (mode, +- operands[2], +- 0))); +- emit_insn (gen_aarch64_simd_reg_sshl (operands[0], operands[1], +- tmp)); +- DONE; +- } +- else +- FAIL; +-} +-) ++ operands[2] = force_reg (SImode, operands[2]); ++ ++ rtx tmp = gen_reg_rtx (mode); ++ emit_insn (gen_aarch64_simd_dup (tmp, convert_to_mode (mode, ++ operands[2], ++ 0))); ++ emit_insn (gen_aarch64_simd_reg_sshl (operands[0], operands[1], tmp)); ++ DONE; ++}) + + (define_expand "lshr3" + [(match_operand:VDQ_I 0 "register_operand") +@@ -1152,31 +1138,19 @@ + tmp)); + DONE; + } +- else +- operands[2] = force_reg (SImode, operands[2]); +- } +- else if (MEM_P (operands[2])) +- { +- operands[2] = force_reg (SImode, operands[2]); + } + +- if (REG_P (operands[2])) +- { +- rtx tmp = gen_reg_rtx (SImode); +- rtx tmp1 = gen_reg_rtx (mode); +- emit_insn (gen_negsi2 (tmp, operands[2])); +- emit_insn (gen_aarch64_simd_dup (tmp1, +- convert_to_mode (mode, +- tmp, 0))); +- emit_insn (gen_aarch64_simd_reg_shl_unsigned (operands[0], +- operands[1], +- tmp1)); +- DONE; +- } +- else +- FAIL; +-} +-) ++ operands[2] = force_reg (SImode, operands[2]); ++ ++ rtx tmp = gen_reg_rtx (SImode); ++ rtx tmp1 = gen_reg_rtx (mode); ++ emit_insn (gen_negsi2 (tmp, operands[2])); ++ emit_insn (gen_aarch64_simd_dup (tmp1, ++ convert_to_mode (mode, tmp, 0))); ++ emit_insn (gen_aarch64_simd_reg_shl_unsigned (operands[0], operands[1], ++ tmp1)); ++ DONE; ++}) + + (define_expand "ashr3" + [(match_operand:VDQ_I 0 "register_operand") +@@ -1199,31 +1173,19 @@ + tmp)); + DONE; + } +- else +- operands[2] = force_reg (SImode, operands[2]); +- } +- else if (MEM_P (operands[2])) +- { +- operands[2] = force_reg (SImode, operands[2]); + } + +- if (REG_P (operands[2])) +- { +- rtx tmp = gen_reg_rtx (SImode); +- rtx tmp1 = gen_reg_rtx (mode); +- emit_insn (gen_negsi2 (tmp, operands[2])); +- emit_insn (gen_aarch64_simd_dup (tmp1, +- convert_to_mode (mode, +- tmp, 0))); +- emit_insn (gen_aarch64_simd_reg_shl_signed (operands[0], +- operands[1], +- tmp1)); +- DONE; +- } +- else +- FAIL; +-} +-) ++ operands[2] = force_reg (SImode, operands[2]); ++ ++ rtx tmp = gen_reg_rtx (SImode); ++ rtx tmp1 = gen_reg_rtx (mode); ++ emit_insn (gen_negsi2 (tmp, operands[2])); ++ emit_insn (gen_aarch64_simd_dup (tmp1, convert_to_mode (mode, ++ tmp, 0))); ++ emit_insn (gen_aarch64_simd_reg_shl_signed (operands[0], operands[1], ++ tmp1)); ++ DONE; ++}) + + (define_expand "vashl3" + [(match_operand:VDQ_I 0 "register_operand") +diff --git a/gcc/testsuite/gcc.c-torture/compile/pr94488.c b/gcc/testsuite/gcc.c-torture/compile/pr94488.c +new file mode 100644 +index 00000000000..6e20a4168de +--- /dev/null ++++ b/gcc/testsuite/gcc.c-torture/compile/pr94488.c +@@ -0,0 +1,22 @@ ++/* PR target/94488 */ ++ ++typedef unsigned long V __attribute__((__vector_size__(16))); ++typedef long W __attribute__((__vector_size__(16))); ++ ++void ++foo (V *x, unsigned long y) ++{ ++ *x = *x >> 
(unsigned int) y; ++} ++ ++void ++bar (V *x, unsigned long y) ++{ ++ *x = *x << (unsigned int) y; ++} ++ ++void ++baz (W *x, unsigned long y) ++{ ++ *x = *x >> (unsigned int) y; ++} diff --git a/aarch64-Fix-bf16-and-matrix-g++-gfortran.patch b/aarch64-Fix-bf16-and-matrix-g++-gfortran.patch new file mode 100644 index 0000000..6bc36da --- /dev/null +++ b/aarch64-Fix-bf16-and-matrix-g++-gfortran.patch @@ -0,0 +1,1613 @@ +This backport contains 5 patchs from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + +0001-re-PR-target-88838-SVE-Use-32-bit-WHILELO-in-LP64-mo.patch +9b884225bfc609606f9b169b021c4da93feba48e + +0002-C-Avoid-aka-types-that-just-add-tags.patch +558798156b41fcbe5ba68b75171708cad135b041 + +0003-g-.dg-tree-ssa-pr61034.C-Add-param-max-inline-insns-.patch +cfcf3551c432da3a8154ef11a26a4d75655deb3d + +0004-C-Avoid-exposing-internal-details-in-aka-types.patch +56898e437a538c7edc0724a3650f5cb81c9d5721 + +0005-C-Avoid-exposing-internal-details-in-aka-types.patch +10bce48f104de56503b17954ed79f019df3252e3 + +diff --git a/gcc/c-family/c-common.c b/gcc/c-family/c-common.c +index bf3db074a..2c10743b9 100644 +--- a/gcc/c-family/c-common.c ++++ b/gcc/c-family/c-common.c +@@ -48,6 +48,7 @@ along with GCC; see the file COPYING3. If not see + #include "gimplify.h" + #include "substring-locations.h" + #include "spellcheck.h" ++#include "c-spellcheck.h" + #include "selftest.h" + + cpp_reader *parse_in; /* Declared in c-pragma.h. */ +@@ -7685,6 +7686,52 @@ set_underlying_type (tree x) + } + } + ++/* Return true if it is worth exposing the DECL_ORIGINAL_TYPE of TYPE to ++ the user in diagnostics, false if it would be better to use TYPE itself. ++ TYPE is known to satisfy typedef_variant_p. */ ++ ++bool ++user_facing_original_type_p (const_tree type) ++{ ++ gcc_assert (typedef_variant_p (type)); ++ tree decl = TYPE_NAME (type); ++ ++ /* Look through any typedef in "user" code. */ ++ if (!DECL_IN_SYSTEM_HEADER (decl) && !DECL_IS_BUILTIN (decl)) ++ return true; ++ ++ /* If the original type is also named and is in the user namespace, ++ assume it too is a user-facing type. */ ++ tree orig_type = DECL_ORIGINAL_TYPE (decl); ++ if (tree orig_id = TYPE_IDENTIFIER (orig_type)) ++ if (!name_reserved_for_implementation_p (IDENTIFIER_POINTER (orig_id))) ++ return true; ++ ++ switch (TREE_CODE (orig_type)) ++ { ++ /* Don't look through to an anonymous vector type, since the syntax ++ we use for them in diagnostics isn't real C or C++ syntax. ++ And if ORIG_TYPE is named but in the implementation namespace, ++ TYPE is likely to be more meaningful to the user. */ ++ case VECTOR_TYPE: ++ return false; ++ ++ /* Don't expose anonymous tag types that are presumably meant to be ++ known by their typedef name. Also don't expose tags that are in ++ the implementation namespace, such as: ++ ++ typedef struct __foo foo; */ ++ case RECORD_TYPE: ++ case UNION_TYPE: ++ case ENUMERAL_TYPE: ++ return false; ++ ++ /* Look through to anything else. */ ++ default: ++ return true; ++ } ++} ++ + /* Record the types used by the current global variable declaration + being parsed, so that we can decide later to emit their debug info. 
+ Those types are in types_used_by_cur_var_decl, and we are going to +diff --git a/gcc/c-family/c-common.h b/gcc/c-family/c-common.h +index 46b8d265a..73ce7c5df 100644 +--- a/gcc/c-family/c-common.h ++++ b/gcc/c-family/c-common.h +@@ -1063,6 +1063,7 @@ extern tree builtin_type_for_size (int, bool); + extern void c_common_mark_addressable_vec (tree); + + extern void set_underlying_type (tree); ++extern bool user_facing_original_type_p (const_tree); + extern void record_types_used_by_current_var_decl (tree); + extern vec *make_tree_vector (void); + extern void release_tree_vector (vec *); +diff --git a/gcc/c/c-objc-common.c b/gcc/c/c-objc-common.c +index 2b76737a7..10d72c57d 100644 +--- a/gcc/c/c-objc-common.c ++++ b/gcc/c/c-objc-common.c +@@ -28,6 +28,8 @@ along with GCC; see the file COPYING3. If not see + #include "langhooks.h" + #include "c-objc-common.h" + #include "gcc-rich-location.h" ++#include "stringpool.h" ++#include "attribs.h" + + static bool c_tree_printer (pretty_printer *, text_info *, const char *, + int, bool, bool, bool, bool *, const char **); +@@ -62,6 +64,122 @@ c_objc_common_init (void) + return c_common_init (); + } + ++/* Decide whether it's worth saying that TYPE is also known as some other ++ type. Return the other type if so, otherwise return TYPE. */ ++ ++static tree ++get_aka_type (tree type) ++{ ++ if (type == error_mark_node) ++ return type; ++ ++ tree result; ++ if (typedef_variant_p (type)) ++ { ++ /* Saying that "foo" is also known as "struct foo" or ++ "struct " is unlikely to be useful, since users of ++ structure-like types would already know that they're structures. ++ The same applies to unions and enums; in general, printing the ++ tag is only useful if it has a different name. */ ++ tree orig_type = DECL_ORIGINAL_TYPE (TYPE_NAME (type)); ++ tree_code code = TREE_CODE (orig_type); ++ tree orig_id = TYPE_IDENTIFIER (orig_type); ++ if ((code == RECORD_TYPE || code == UNION_TYPE || code == ENUMERAL_TYPE) ++ && (!orig_id || TYPE_IDENTIFIER (type) == orig_id)) ++ return type; ++ ++ if (!user_facing_original_type_p (type)) ++ return type; ++ ++ result = get_aka_type (orig_type); ++ } ++ else ++ { ++ tree canonical = TYPE_CANONICAL (type); ++ if (canonical && TREE_CODE (type) != TREE_CODE (canonical)) ++ return canonical; ++ ++ /* Recursive calls might choose a middle ground between TYPE ++ (which has no typedefs stripped) and CANONICAL (which has ++ all typedefs stripped). So try to reuse TYPE or CANONICAL if ++ convenient, but be prepared to create a new type if necessary. */ ++ switch (TREE_CODE (type)) ++ { ++ case POINTER_TYPE: ++ case REFERENCE_TYPE: ++ { ++ tree target_type = get_aka_type (TREE_TYPE (type)); ++ ++ if (target_type == TREE_TYPE (type)) ++ return type; ++ ++ if (canonical && target_type == TREE_TYPE (canonical)) ++ return canonical; ++ ++ result = (TREE_CODE (type) == POINTER_TYPE ++ ? build_pointer_type (target_type) ++ : build_reference_type (target_type)); ++ break; ++ } ++ ++ case ARRAY_TYPE: ++ { ++ tree element_type = get_aka_type (TREE_TYPE (type)); ++ tree index_type = (TYPE_DOMAIN (type) ++ ? 
get_aka_type (TYPE_DOMAIN (type)) ++ : NULL_TREE); ++ ++ if (element_type == TREE_TYPE (type) ++ && index_type == TYPE_DOMAIN (type)) ++ return type; ++ ++ if (canonical ++ && element_type == TREE_TYPE (canonical) ++ && index_type == TYPE_DOMAIN (canonical)) ++ return canonical; ++ ++ result = build_array_type (element_type, index_type, ++ TYPE_TYPELESS_STORAGE (type)); ++ break; ++ } ++ ++ case FUNCTION_TYPE: ++ { ++ tree return_type = get_aka_type (TREE_TYPE (type)); ++ ++ tree args = TYPE_ARG_TYPES (type); ++ if (args == error_mark_node) ++ return type; ++ ++ auto_vec arg_types; ++ bool type_ok_p = true; ++ while (args && args != void_list_node) ++ { ++ tree arg_type = get_aka_type (TREE_VALUE (args)); ++ arg_types.safe_push (arg_type); ++ type_ok_p &= (arg_type == TREE_VALUE (args)); ++ args = TREE_CHAIN (args); ++ } ++ ++ if (type_ok_p && return_type == TREE_TYPE (type)) ++ return type; ++ ++ unsigned int i; ++ tree arg_type; ++ FOR_EACH_VEC_ELT_REVERSE (arg_types, i, arg_type) ++ args = tree_cons (NULL_TREE, arg_type, args); ++ result = build_function_type (return_type, args); ++ break; ++ } ++ ++ default: ++ return canonical ? canonical : type; ++ } ++ } ++ return build_type_attribute_qual_variant (result, TYPE_ATTRIBUTES (type), ++ TYPE_QUALS (type)); ++} ++ + /* Print T to CPP. */ + + static void +@@ -83,11 +201,12 @@ print_type (c_pretty_printer *cpp, tree t, bool *quoted) + stripped version. But sometimes the stripped version looks + exactly the same, so we don't want it after all. To avoid + printing it in that case, we play ugly obstack games. */ +- if (TYPE_CANONICAL (t) && t != TYPE_CANONICAL (t)) ++ tree aka_type = get_aka_type (t); ++ if (aka_type != t) + { + c_pretty_printer cpp2; + /* Print the stripped version into a temporary printer. */ +- cpp2.type_id (TYPE_CANONICAL (t)); ++ cpp2.type_id (aka_type); + struct obstack *ob2 = cpp2.buffer->obstack; + /* Get the stripped version from the temporary printer. */ + const char *aka = (char *) obstack_base (ob2); +@@ -107,7 +226,7 @@ print_type (c_pretty_printer *cpp, tree t, bool *quoted) + pp_c_whitespace (cpp); + if (*quoted) + pp_begin_quote (cpp, pp_show_color (cpp)); +- cpp->type_id (TYPE_CANONICAL (t)); ++ cpp->type_id (aka_type); + if (*quoted) + pp_end_quote (cpp, pp_show_color (cpp)); + pp_right_brace (cpp); +diff --git a/gcc/cp/cp-tree.h b/gcc/cp/cp-tree.h +index 4bba1887f..e802dcbeb 100644 +--- a/gcc/cp/cp-tree.h ++++ b/gcc/cp/cp-tree.h +@@ -5662,6 +5662,13 @@ enum auto_deduction_context + #define TFF_NO_TEMPLATE_BINDINGS (1 << 13) + #define TFF_POINTER (1 << 14) + ++/* These constants can be used as bit flags to control strip_typedefs. ++ ++ STF_USER_VISIBLE: use heuristics to try to avoid stripping user-facing ++ aliases of internal details. This is intended for diagnostics, ++ where it should (for example) give more useful "aka" types. */ ++const unsigned int STF_USER_VISIBLE = 1U; ++ + /* Returns the TEMPLATE_DECL associated to a TEMPLATE_TEMPLATE_PARM + node. 
*/ + #define TEMPLATE_TEMPLATE_PARM_TEMPLATE_DECL(NODE) \ +@@ -7221,8 +7228,10 @@ extern int zero_init_p (const_tree); + extern bool check_abi_tag_redeclaration (const_tree, const_tree, + const_tree); + extern bool check_abi_tag_args (tree, tree); +-extern tree strip_typedefs (tree, bool * = NULL); +-extern tree strip_typedefs_expr (tree, bool * = NULL); ++extern tree strip_typedefs (tree, bool * = NULL, ++ unsigned int = 0); ++extern tree strip_typedefs_expr (tree, bool * = NULL, ++ unsigned int = 0); + extern tree copy_binfo (tree, tree, tree, + tree *, int); + extern int member_p (const_tree); +diff --git a/gcc/cp/error.c b/gcc/cp/error.c +index 4a0aed2b7..5beaf2dc1 100644 +--- a/gcc/cp/error.c ++++ b/gcc/cp/error.c +@@ -408,7 +408,7 @@ dump_template_bindings (cxx_pretty_printer *pp, tree parms, tree args, + pop_deferring_access_checks (); + /* Strip typedefs. We can't just use TFF_CHASE_TYPEDEF because + pp_simple_type_specifier doesn't know about it. */ +- t = strip_typedefs (t); ++ t = strip_typedefs (t, NULL, STF_USER_VISIBLE); + dump_type (pp, t, TFF_PLAIN_IDENTIFIER); + } + } +@@ -447,7 +447,11 @@ dump_type (cxx_pretty_printer *pp, tree t, int flags) + || DECL_SELF_REFERENCE_P (decl) + || (!flag_pretty_templates + && DECL_LANG_SPECIFIC (decl) && DECL_TEMPLATE_INFO (decl))) +- t = strip_typedefs (t); ++ { ++ unsigned int stf_flags = (!(pp->flags & pp_c_flag_gnu_v3) ++ ? STF_USER_VISIBLE : 0); ++ t = strip_typedefs (t, NULL, stf_flags); ++ } + else if (alias_template_specialization_p (t)) + { + dump_alias_template_specialization (pp, t, flags); +@@ -3193,7 +3197,7 @@ type_to_string (tree typ, int verbose, bool postprocessed, bool *quote, + && !uses_template_parms (typ)) + { + int aka_start, aka_len; char *p; +- tree aka = strip_typedefs (typ); ++ tree aka = strip_typedefs (typ, NULL, STF_USER_VISIBLE); + if (quote && *quote) + pp_end_quote (cxx_pp, show_color); + pp_string (cxx_pp, " {aka"); +diff --git a/gcc/cp/tree.c b/gcc/cp/tree.c +index 3f3583c82..6a1f760ba 100644 +--- a/gcc/cp/tree.c ++++ b/gcc/cp/tree.c +@@ -1421,7 +1421,10 @@ apply_identity_attributes (tree result, tree attribs, bool *remove_attributes) + return cp_build_type_attribute_variant (result, new_attribs); + } + +-/* Builds a qualified variant of T that is not a typedef variant. ++/* Builds a qualified variant of T that is either not a typedef variant ++ (the default behavior) or not a typedef variant of a user-facing type ++ (if FLAGS contains STF_USER_FACING). ++ + E.g. consider the following declarations: + typedef const int ConstInt; + typedef ConstInt* PtrConstInt; +@@ -1446,7 +1449,7 @@ apply_identity_attributes (tree result, tree attribs, bool *remove_attributes) + stripped. 
*/ + + tree +-strip_typedefs (tree t, bool *remove_attributes) ++strip_typedefs (tree t, bool *remove_attributes, unsigned int flags) + { + tree result = NULL, type = NULL, t0 = NULL; + +@@ -1461,7 +1464,7 @@ strip_typedefs (tree t, bool *remove_attributes) + for (; t; t = TREE_CHAIN (t)) + { + gcc_assert (!TREE_PURPOSE (t)); +- tree elt = strip_typedefs (TREE_VALUE (t), remove_attributes); ++ tree elt = strip_typedefs (TREE_VALUE (t), remove_attributes, flags); + if (elt != TREE_VALUE (t)) + changed = true; + vec_safe_push (vec, elt); +@@ -1485,28 +1488,29 @@ strip_typedefs (tree t, bool *remove_attributes) + switch (TREE_CODE (t)) + { + case POINTER_TYPE: +- type = strip_typedefs (TREE_TYPE (t), remove_attributes); ++ type = strip_typedefs (TREE_TYPE (t), remove_attributes, flags); + result = build_pointer_type (type); + break; + case REFERENCE_TYPE: +- type = strip_typedefs (TREE_TYPE (t), remove_attributes); ++ type = strip_typedefs (TREE_TYPE (t), remove_attributes, flags); + result = cp_build_reference_type (type, TYPE_REF_IS_RVALUE (t)); + break; + case OFFSET_TYPE: +- t0 = strip_typedefs (TYPE_OFFSET_BASETYPE (t), remove_attributes); +- type = strip_typedefs (TREE_TYPE (t), remove_attributes); ++ t0 = strip_typedefs (TYPE_OFFSET_BASETYPE (t), remove_attributes, flags); ++ type = strip_typedefs (TREE_TYPE (t), remove_attributes, flags); + result = build_offset_type (t0, type); + break; + case RECORD_TYPE: + if (TYPE_PTRMEMFUNC_P (t)) + { +- t0 = strip_typedefs (TYPE_PTRMEMFUNC_FN_TYPE (t), remove_attributes); ++ t0 = strip_typedefs (TYPE_PTRMEMFUNC_FN_TYPE (t), ++ remove_attributes, flags); + result = build_ptrmemfunc_type (t0); + } + break; + case ARRAY_TYPE: +- type = strip_typedefs (TREE_TYPE (t), remove_attributes); +- t0 = strip_typedefs (TYPE_DOMAIN (t), remove_attributes); ++ type = strip_typedefs (TREE_TYPE (t), remove_attributes, flags); ++ t0 = strip_typedefs (TYPE_DOMAIN (t), remove_attributes, flags); + result = build_cplus_array_type (type, t0); + break; + case FUNCTION_TYPE: +@@ -1525,7 +1529,7 @@ strip_typedefs (tree t, bool *remove_attributes) + && (TYPE_ATTRIBUTES (t) || TYPE_USER_ALIGN (t))) + is_variant = true; + +- type = strip_typedefs (TREE_TYPE (t), remove_attributes); ++ type = strip_typedefs (TREE_TYPE (t), remove_attributes, flags); + tree canon_spec = (flag_noexcept_type + ? 
canonical_eh_spec (TYPE_RAISES_EXCEPTIONS (t)) + : NULL_TREE); +@@ -1539,7 +1543,7 @@ strip_typedefs (tree t, bool *remove_attributes) + if (arg_node == void_list_node) + break; + arg_type = strip_typedefs (TREE_VALUE (arg_node), +- remove_attributes); ++ remove_attributes, flags); + gcc_assert (arg_type); + if (arg_type == TREE_VALUE (arg_node) && !changed) + continue; +@@ -1603,9 +1607,10 @@ strip_typedefs (tree t, bool *remove_attributes) + tree arg = TREE_VEC_ELT (args, i); + tree strip_arg; + if (TYPE_P (arg)) +- strip_arg = strip_typedefs (arg, remove_attributes); ++ strip_arg = strip_typedefs (arg, remove_attributes, flags); + else +- strip_arg = strip_typedefs_expr (arg, remove_attributes); ++ strip_arg = strip_typedefs_expr (arg, remove_attributes, ++ flags); + TREE_VEC_ELT (new_args, i) = strip_arg; + if (strip_arg != arg) + changed = true; +@@ -1621,7 +1626,7 @@ strip_typedefs (tree t, bool *remove_attributes) + else + ggc_free (new_args); + } +- tree ctx = strip_typedefs (TYPE_CONTEXT (t), remove_attributes); ++ tree ctx = strip_typedefs (TYPE_CONTEXT (t), remove_attributes, flags); + if (!changed && ctx == TYPE_CONTEXT (t) && !typedef_variant_p (t)) + return t; + tree name = fullname; +@@ -1634,7 +1639,7 @@ strip_typedefs (tree t, bool *remove_attributes) + break; + case DECLTYPE_TYPE: + result = strip_typedefs_expr (DECLTYPE_TYPE_EXPR (t), +- remove_attributes); ++ remove_attributes, flags); + if (result == DECLTYPE_TYPE_EXPR (t)) + result = NULL_TREE; + else +@@ -1644,7 +1649,8 @@ strip_typedefs (tree t, bool *remove_attributes) + tf_none)); + break; + case UNDERLYING_TYPE: +- type = strip_typedefs (UNDERLYING_TYPE_TYPE (t), remove_attributes); ++ type = strip_typedefs (UNDERLYING_TYPE_TYPE (t), ++ remove_attributes, flags); + result = finish_underlying_type (type); + break; + default: +@@ -1655,15 +1661,18 @@ strip_typedefs (tree t, bool *remove_attributes) + { + if (typedef_variant_p (t)) + { +- /* Explicitly get the underlying type, as TYPE_MAIN_VARIANT doesn't +- strip typedefs with attributes. */ +- result = TYPE_MAIN_VARIANT (DECL_ORIGINAL_TYPE (TYPE_NAME (t))); +- result = strip_typedefs (result); ++ if ((flags & STF_USER_VISIBLE) ++ && !user_facing_original_type_p (t)) ++ return t; ++ result = strip_typedefs (DECL_ORIGINAL_TYPE (TYPE_NAME (t)), ++ remove_attributes, flags); + } + else + result = TYPE_MAIN_VARIANT (t); + } +- gcc_assert (!typedef_variant_p (result)); ++ gcc_assert (!typedef_variant_p (result) ++ || ((flags & STF_USER_VISIBLE) ++ && !user_facing_original_type_p (result))); + + if (COMPLETE_TYPE_P (result) && !COMPLETE_TYPE_P (t)) + /* If RESULT is complete and T isn't, it's likely the case that T +@@ -1712,7 +1721,7 @@ strip_typedefs (tree t, bool *remove_attributes) + sizeof(TT) is replaced by sizeof(T). */ + + tree +-strip_typedefs_expr (tree t, bool *remove_attributes) ++strip_typedefs_expr (tree t, bool *remove_attributes, unsigned int flags) + { + unsigned i,n; + tree r, type, *ops; +@@ -1729,7 +1738,7 @@ strip_typedefs_expr (tree t, bool *remove_attributes) + /* Some expressions have type operands, so let's handle types here rather + than check TYPE_P in multiple places below. 
*/ + if (TYPE_P (t)) +- return strip_typedefs (t, remove_attributes); ++ return strip_typedefs (t, remove_attributes, flags); + + code = TREE_CODE (t); + switch (code) +@@ -1743,8 +1752,10 @@ strip_typedefs_expr (tree t, bool *remove_attributes) + + case TRAIT_EXPR: + { +- tree type1 = strip_typedefs (TRAIT_EXPR_TYPE1 (t), remove_attributes); +- tree type2 = strip_typedefs (TRAIT_EXPR_TYPE2 (t), remove_attributes); ++ tree type1 = strip_typedefs (TRAIT_EXPR_TYPE1 (t), ++ remove_attributes, flags); ++ tree type2 = strip_typedefs (TRAIT_EXPR_TYPE2 (t), ++ remove_attributes, flags); + if (type1 == TRAIT_EXPR_TYPE1 (t) + && type2 == TRAIT_EXPR_TYPE2 (t)) + return t; +@@ -1761,7 +1772,8 @@ strip_typedefs_expr (tree t, bool *remove_attributes) + tree it; + for (it = t; it; it = TREE_CHAIN (it)) + { +- tree val = strip_typedefs_expr (TREE_VALUE (it), remove_attributes); ++ tree val = strip_typedefs_expr (TREE_VALUE (it), ++ remove_attributes, flags); + vec_safe_push (vec, val); + if (val != TREE_VALUE (it)) + changed = true; +@@ -1788,7 +1800,7 @@ strip_typedefs_expr (tree t, bool *remove_attributes) + for (i = 0; i < n; ++i) + { + tree op = strip_typedefs_expr (TREE_VEC_ELT (t, i), +- remove_attributes); ++ remove_attributes, flags); + vec->quick_push (op); + if (op != TREE_VEC_ELT (t, i)) + changed = true; +@@ -1813,18 +1825,19 @@ strip_typedefs_expr (tree t, bool *remove_attributes) + vec *vec + = vec_safe_copy (CONSTRUCTOR_ELTS (t)); + n = CONSTRUCTOR_NELTS (t); +- type = strip_typedefs (TREE_TYPE (t), remove_attributes); ++ type = strip_typedefs (TREE_TYPE (t), remove_attributes, flags); + for (i = 0; i < n; ++i) + { + constructor_elt *e = &(*vec)[i]; +- tree op = strip_typedefs_expr (e->value, remove_attributes); ++ tree op = strip_typedefs_expr (e->value, remove_attributes, flags); + if (op != e->value) + { + changed = true; + e->value = op; + } + gcc_checking_assert +- (e->index == strip_typedefs_expr (e->index, remove_attributes)); ++ (e->index == strip_typedefs_expr (e->index, remove_attributes, ++ flags)); + } + + if (!changed && type == TREE_TYPE (t)) +@@ -1868,12 +1881,13 @@ strip_typedefs_expr (tree t, bool *remove_attributes) + case REINTERPRET_CAST_EXPR: + case CAST_EXPR: + case NEW_EXPR: +- type = strip_typedefs (type, remove_attributes); ++ type = strip_typedefs (type, remove_attributes, flags); + /* fallthrough */ + + default: + for (i = 0; i < n; ++i) +- ops[i] = strip_typedefs_expr (TREE_OPERAND (t, i), remove_attributes); ++ ops[i] = strip_typedefs_expr (TREE_OPERAND (t, i), ++ remove_attributes, flags); + break; + } + +diff --git a/gcc/testsuite/g++.dg/diagnostic/aka5.h b/gcc/testsuite/g++.dg/diagnostic/aka5.h +new file mode 100644 +index 000000000..0c7404d76 +--- /dev/null ++++ b/gcc/testsuite/g++.dg/diagnostic/aka5.h +@@ -0,0 +1,22 @@ ++#ifdef IS_SYSTEM_HEADER ++#pragma GCC system_header ++#endif ++ ++typedef enum __internal_enum { A, B } user_enum; ++typedef user_enum *user_enum_ptr; ++ ++typedef struct __internal_struct { int i; } user_struct; ++typedef user_struct user_struct_copy; ++typedef user_struct *user_struct_ptr; ++ ++typedef union __internal_union { int i; } user_union; ++typedef user_union user_union_copy; ++typedef user_union *user_union_ptr; ++ ++typedef unsigned int user_vector __attribute__((__vector_size__(16))); ++typedef user_vector user_vector_copy; ++typedef user_vector *user_vector_ptr; ++ ++typedef int user_int; ++typedef user_int user_int_copy; ++typedef user_int *user_int_ptr; +diff --git a/gcc/testsuite/g++.dg/diagnostic/aka5a.C 
b/gcc/testsuite/g++.dg/diagnostic/aka5a.C +new file mode 100644 +index 000000000..e9d4c02f6 +--- /dev/null ++++ b/gcc/testsuite/g++.dg/diagnostic/aka5a.C +@@ -0,0 +1,127 @@ ++#define IS_SYSTEM_HEADER ++#include "aka5.h" ++ ++typedef user_enum user_enum_copy; ++ ++struct s { int i; }; ++ ++user_enum ue1; ++user_enum_copy ue2; ++user_enum_ptr ue_ptr1; ++user_enum *ue_ptr2; ++const user_enum *const_ue_ptr1; ++const user_enum_copy *const_ue_ptr2; ++volatile user_enum *volatile_ue_ptr1; ++volatile user_enum_copy *volatile_ue_ptr2; ++user_enum (*ue_array_ptr1)[10]; ++user_enum_copy (*ue_array_ptr2)[10]; ++user_enum (*ue_fn_ptr1) (void); ++void (*ue_fn_ptr2) (user_enum); ++void (*ue_fn_ptr3) (user_enum, ...); ++user_enum_copy (*ue_fn_ptr4) (void); ++void (*ue_fn_ptr5) (user_enum_copy); ++void (*ue_fn_ptr6) (user_enum_copy, ...); ++user_enum (*__attribute__((__transaction_unsafe__)) unsafe_ue_fn_ptr1) (void); ++user_enum_copy (*__attribute__((__transaction_unsafe__)) unsafe_ue_fn_ptr2) (void); ++ ++user_struct us1; ++user_struct_copy us2; ++user_struct_ptr us_ptr1; ++user_struct *us_ptr2; ++const user_struct *const_us_ptr1; ++const user_struct_copy *const_us_ptr2; ++ ++user_union uu1; ++user_union_copy uu2; ++user_union_ptr uu_ptr1; ++user_union *uu_ptr2; ++const user_union *const_uu_ptr1; ++const user_union_copy *const_uu_ptr2; ++ ++user_vector uv1; ++user_vector_copy uv2; ++user_vector_ptr uv_ptr1; ++user_vector *uv_ptr2; ++const user_vector *const_uv_ptr1; ++const user_vector_copy *const_uv_ptr2; ++ ++user_int ui1; ++user_int_copy ui2; ++user_int_ptr ui_ptr1; ++user_int *ui_ptr2; ++const user_int *const_ui_ptr1; ++const user_int_copy *const_ui_ptr2; ++volatile user_int *volatile_ui_ptr1; ++volatile user_int_copy *volatile_ui_ptr2; ++user_int (*ui_array_ptr1)[10]; ++user_int_copy (*ui_array_ptr2)[10]; ++user_int (*ui_fn_ptr1) (void); ++void (*ui_fn_ptr2) (user_int); ++void (*ui_fn_ptr3) (user_int, ...); ++user_int_copy (*ui_fn_ptr4) (void); ++void (*ui_fn_ptr5) (user_int_copy); ++void (*ui_fn_ptr6) (user_int_copy, ...); ++user_int (*__attribute__((__transaction_unsafe__)) unsafe_ui_fn_ptr1) (void); ++user_int_copy (*__attribute__((__transaction_unsafe__)) unsafe_ui_fn_ptr2) (void); ++ ++void f (s s1) ++{ ++ ue1 = s1; // { dg-error {cannot convert 's' to 'user_enum' in assignment} } ++ ue2 = s1; // { dg-error {cannot convert 's' to 'user_enum_copy' {aka 'user_enum'} in assignment} } ++ ue_ptr1 = &s1; // { dg-error {cannot convert 's\*' to 'user_enum_ptr' {aka 'user_enum\*'} in assignment} } ++ ue_ptr2 = &s1; // { dg-error {cannot convert 's\*' to 'user_enum\*' in assignment} } ++ const_ue_ptr1 = &s1; // { dg-error {cannot convert 's\*' to 'const user_enum\*' in assignment} } ++ const_ue_ptr2 = &s1; // { dg-error {cannot convert 's\*' to 'const user_enum_copy\*' {aka 'const user_enum\*'} in assignment} } ++ volatile_ue_ptr1 = &s1; // { dg-error {cannot convert 's\*' to 'volatile user_enum\*' in assignment} } ++ volatile_ue_ptr2 = &s1; // { dg-error {cannot convert 's\*' to 'volatile user_enum_copy\*' {aka 'volatile user_enum\*'} in assignment} } ++ ue_array_ptr1 = &s1; // { dg-error {cannot convert 's\*' to 'user_enum \(\*\)\[10\]' in assignment} } ++ ue_array_ptr2 = &s1; // { dg-error {cannot convert 's\*' to 'user_enum_copy \(\*\)\[10\]' {aka 'user_enum \(\*\)\[10\]'} in assignment} } ++ ue_fn_ptr1 = &s1; // { dg-error {cannot convert 's\*' to 'user_enum \(\*\)\(\)' in assignment} } ++ ue_fn_ptr2 = &s1; // { dg-error {cannot convert 's\*' to 'void \(\*\)\(user_enum\)' in assignment} } ++ 
ue_fn_ptr3 = &s1; // { dg-error {cannot convert 's\*' to 'void \(\*\)\(user_enum, \.\.\.\)' in assignment} } ++ ue_fn_ptr4 = &s1; // { dg-error {cannot convert 's\*' to 'user_enum_copy \(\*\)\(\)' {aka 'user_enum \(\*\)\(\)'} in assignment} } ++ ue_fn_ptr5 = &s1; // { dg-error {cannot convert 's\*' to 'void \(\*\)\(user_enum_copy\)' {aka 'void \(\*\)\(user_enum\)'} in assignment} } ++ ue_fn_ptr6 = &s1; // { dg-error {cannot convert 's\*' to 'void \(\*\)\(user_enum_copy, \.\.\.\)' {aka 'void \(\*\)\(user_enum, \.\.\.\)'} in assignment} } ++ unsafe_ue_fn_ptr1 = &s1; // { dg-error {cannot convert 's\*' to 'user_enum \(__attribute__\(\(transaction_unsafe\)\) \*\)\(\)' in assignment} } ++ unsafe_ue_fn_ptr2 = &s1; // { dg-error {cannot convert 's\*' to 'user_enum_copy \(__attribute__\(\(transaction_unsafe\)\) \*\)\(\)' {aka 'user_enum \(__attribute__\(\(transaction_unsafe\)\) \*\)\(\)'} in assignment} } ++ ++ us1 = s1; // { dg-error {no match for 'operator=' in 'us1 = s1' \(operand types are 'user_struct' and 's'\)} } ++ us2 = s1; // { dg-error {no match for 'operator=' in 'us2 = s1' \(operand types are 'user_struct_copy' {aka 'user_struct'} and 's'\)} } ++ us_ptr1 = &s1; // { dg-error {cannot convert 's\*' to 'user_struct_ptr' {aka 'user_struct\*'} in assignment} } ++ us_ptr2 = &s1; // { dg-error {cannot convert 's\*' to 'user_struct\*' in assignment} } ++ const_us_ptr1 = &s1; // { dg-error {cannot convert 's\*' to 'const user_struct\*' in assignment} } ++ const_us_ptr2 = &s1; // { dg-error {cannot convert 's\*' to 'const user_struct_copy\*' {aka 'const user_struct\*'} in assignment} } ++ ++ uu1 = s1; // { dg-error {no match for 'operator=' in 'uu1 = s1' \(operand types are 'user_union' and 's'\)} } ++ uu2 = s1; // { dg-error {no match for 'operator=' in 'uu2 = s1' \(operand types are 'user_union_copy' {aka 'user_union'} and 's'\)} } ++ uu_ptr1 = &s1; // { dg-error {cannot convert 's\*' to 'user_union_ptr' {aka 'user_union\*'} in assignment} } ++ uu_ptr2 = &s1; // { dg-error {cannot convert 's\*' to 'user_union\*' in assignment} } ++ const_uu_ptr1 = &s1; // { dg-error {cannot convert 's\*' to 'const user_union\*' in assignment} } ++ const_uu_ptr2 = &s1; // { dg-error {cannot convert 's\*' to 'const user_union_copy\*' {aka 'const user_union\*'} in assignment} } ++ ++ uv1 = s1; // { dg-error {cannot convert 's' to 'user_vector' in assignment} } ++ uv2 = s1; // { dg-error {cannot convert 's' to 'user_vector_copy' {aka 'user_vector'} in assignment} } ++ uv_ptr1 = &s1; // { dg-error {cannot convert 's\*' to 'user_vector_ptr' {aka 'user_vector\*'} in assignment} } ++ uv_ptr2 = &s1; // { dg-error {cannot convert 's\*' to 'user_vector\*' in assignment} } ++ const_uv_ptr1 = &s1; // { dg-error {cannot convert 's\*' to 'const user_vector\*' in assignment} } ++ const_uv_ptr2 = &s1; // { dg-error {cannot convert 's\*' to 'const user_vector_copy\*' {aka 'const user_vector\*'} in assignment} } ++ ++ ui1 = s1; // { dg-error {cannot convert 's' to 'user_int' {aka 'int'} in assignment} } ++ ui2 = s1; // { dg-error {cannot convert 's' to 'user_int_copy' {aka 'int'} in assignment} } ++ ui_ptr1 = &s1; // { dg-error {cannot convert 's\*' to 'user_int_ptr' {aka 'int\*'} in assignment} } ++ ui_ptr2 = &s1; // { dg-error {cannot convert 's\*' to 'user_int\*' {aka 'int\*'} in assignment} } ++ const_ui_ptr1 = &s1; // { dg-error {cannot convert 's\*' to 'const user_int\*' {aka 'const int\*'} in assignment} } ++ const_ui_ptr2 = &s1; // { dg-error {cannot convert 's\*' to 'const user_int_copy\*' {aka 'const int\*'} in 
assignment} } ++ volatile_ui_ptr1 = &s1; // { dg-error {cannot convert 's\*' to 'volatile user_int\*' {aka 'volatile int\*'} in assignment} } ++ volatile_ui_ptr2 = &s1; // { dg-error {cannot convert 's\*' to 'volatile user_int_copy\*' {aka 'volatile int\*'} in assignment} } ++ ui_array_ptr1 = &s1; // { dg-error {cannot convert 's\*' to 'user_int \(\*\)\[10\]' {aka 'int \(\*\)\[10\]'} in assignment} } ++ ui_array_ptr2 = &s1; // { dg-error {cannot convert 's\*' to 'user_int_copy \(\*\)\[10\]' {aka 'int \(\*\)\[10\]'} in assignment} } ++ ui_fn_ptr1 = &s1; // { dg-error {cannot convert 's\*' to 'user_int \(\*\)\(\)' {aka 'int \(\*\)\(\)'} in assignment} } ++ ui_fn_ptr2 = &s1; // { dg-error {cannot convert 's\*' to 'void \(\*\)\(user_int\)' {aka 'void \(\*\)\(int\)'} in assignment} } ++ ui_fn_ptr3 = &s1; // { dg-error {cannot convert 's\*' to 'void \(\*\)\(user_int, \.\.\.\)' {aka 'void \(\*\)\(int, \.\.\.\)'} in assignment} } ++ ui_fn_ptr4 = &s1; // { dg-error {cannot convert 's\*' to 'user_int_copy \(\*\)\(\)' {aka 'int \(\*\)\(\)'} in assignment} } ++ ui_fn_ptr5 = &s1; // { dg-error {cannot convert 's\*' to 'void \(\*\)\(user_int_copy\)' {aka 'void \(\*\)\(int\)'} in assignment} } ++ ui_fn_ptr6 = &s1; // { dg-error {cannot convert 's\*' to 'void \(\*\)\(user_int_copy, \.\.\.\)' {aka 'void \(\*\)\(int, \.\.\.\)'} in assignment} } ++ unsafe_ui_fn_ptr1 = &s1; // { dg-error {cannot convert 's\*' to 'user_int \(__attribute__\(\(transaction_unsafe\)\) \*\)\(\)' {aka 'int \(__attribute__\(\(transaction_unsafe\)\) \*\)\(\)'} in assignment} } ++ unsafe_ui_fn_ptr2 = &s1; // { dg-error {cannot convert 's\*' to 'user_int_copy \(__attribute__\(\(transaction_unsafe\)\) \*\)\(\)' {aka 'int \(__attribute__\(\(transaction_unsafe\)\) \*\)\(\)'} in assignment} } ++} +diff --git a/gcc/testsuite/g++.dg/diagnostic/aka5b.C b/gcc/testsuite/g++.dg/diagnostic/aka5b.C +new file mode 100644 +index 000000000..6942be3ee +--- /dev/null ++++ b/gcc/testsuite/g++.dg/diagnostic/aka5b.C +@@ -0,0 +1,127 @@ ++#include "aka5.h" ++ ++typedef user_enum user_enum_copy; ++ ++struct s { int i; }; ++ ++user_enum ue1; ++user_enum_copy ue2; ++user_enum_ptr ue_ptr1; ++user_enum *ue_ptr2; ++const user_enum *const_ue_ptr1; ++const user_enum_copy *const_ue_ptr2; ++volatile user_enum *volatile_ue_ptr1; ++volatile user_enum_copy *volatile_ue_ptr2; ++user_enum (*ue_array_ptr1)[10]; ++user_enum_copy (*ue_array_ptr2)[10]; ++user_enum (*ue_fn_ptr1) (void); ++void (*ue_fn_ptr2) (user_enum); ++void (*ue_fn_ptr3) (user_enum, ...); ++user_enum_copy (*ue_fn_ptr4) (void); ++void (*ue_fn_ptr5) (user_enum_copy); ++void (*ue_fn_ptr6) (user_enum_copy, ...); ++user_enum (*__attribute__((__transaction_unsafe__)) unsafe_ue_fn_ptr1) (void); ++user_enum_copy (*__attribute__((__transaction_unsafe__)) unsafe_ue_fn_ptr2) (void); ++ ++user_struct us1; ++user_struct_copy us2; ++user_struct_ptr us_ptr1; ++user_struct *us_ptr2; ++const user_struct *const_us_ptr1; ++const user_struct_copy *const_us_ptr2; ++ ++user_union uu1; ++user_union_copy uu2; ++user_union_ptr uu_ptr1; ++user_union *uu_ptr2; ++const user_union *const_uu_ptr1; ++const user_union_copy *const_uu_ptr2; ++ ++user_vector uv1; ++user_vector_copy uv2; ++user_vector_ptr uv_ptr1; ++user_vector *uv_ptr2; ++const user_vector *const_uv_ptr1; ++const user_vector_copy *const_uv_ptr2; ++ ++user_int ui1; ++user_int_copy ui2; ++user_int_ptr ui_ptr1; ++user_int *ui_ptr2; ++const user_int *const_ui_ptr1; ++const user_int_copy *const_ui_ptr2; ++volatile user_int *volatile_ui_ptr1; ++volatile user_int_copy 
*volatile_ui_ptr2; ++user_int (*ui_array_ptr1)[10]; ++user_int_copy (*ui_array_ptr2)[10]; ++user_int (*ui_fn_ptr1) (void); ++void (*ui_fn_ptr2) (user_int); ++void (*ui_fn_ptr3) (user_int, ...); ++user_int_copy (*ui_fn_ptr4) (void); ++void (*ui_fn_ptr5) (user_int_copy); ++void (*ui_fn_ptr6) (user_int_copy, ...); ++user_int (*__attribute__((__transaction_unsafe__)) unsafe_ui_fn_ptr1) (void); ++user_int_copy (*__attribute__((__transaction_unsafe__)) unsafe_ui_fn_ptr2) (void); ++ ++void f (s s1) ++{ ++ ue1 = s1; // { dg-error {cannot convert 's' to 'user_enum' {aka '__internal_enum'} in assignment} } ++ ue2 = s1; // { dg-error {cannot convert 's' to 'user_enum_copy' {aka '__internal_enum'} in assignment} } ++ ue_ptr1 = &s1; // { dg-error {cannot convert 's\*' to 'user_enum_ptr' {aka '__internal_enum\*'} in assignment} } ++ ue_ptr2 = &s1; // { dg-error {cannot convert 's\*' to 'user_enum\*' {aka '__internal_enum\*'} in assignment} } ++ const_ue_ptr1 = &s1; // { dg-error {cannot convert 's\*' to 'const user_enum\*' {aka 'const __internal_enum\*'} in assignment} } ++ const_ue_ptr2 = &s1; // { dg-error {cannot convert 's\*' to 'const user_enum_copy\*' {aka 'const __internal_enum\*'} in assignment} } ++ volatile_ue_ptr1 = &s1; // { dg-error {cannot convert 's\*' to 'volatile user_enum\*' {aka 'volatile __internal_enum\*'} in assignment} } ++ volatile_ue_ptr2 = &s1; // { dg-error {cannot convert 's\*' to 'volatile user_enum_copy\*' {aka 'volatile __internal_enum\*'} in assignment} } ++ ue_array_ptr1 = &s1; // { dg-error {cannot convert 's\*' to 'user_enum \(\*\)\[10\]' {aka '__internal_enum \(\*\)\[10\]'} in assignment} } ++ ue_array_ptr2 = &s1; // { dg-error {cannot convert 's\*' to 'user_enum_copy \(\*\)\[10\]' {aka '__internal_enum \(\*\)\[10\]'} in assignment} } ++ ue_fn_ptr1 = &s1; // { dg-error {cannot convert 's\*' to 'user_enum \(\*\)\(\)' {aka '__internal_enum \(\*\)\(\)'} in assignment} } ++ ue_fn_ptr2 = &s1; // { dg-error {cannot convert 's\*' to 'void \(\*\)\(user_enum\)' {aka 'void \(\*\)\(__internal_enum\)'} in assignment} } ++ ue_fn_ptr3 = &s1; // { dg-error {cannot convert 's\*' to 'void \(\*\)\(user_enum, \.\.\.\)' {aka 'void \(\*\)\(__internal_enum, \.\.\.\)'} in assignment} } ++ ue_fn_ptr4 = &s1; // { dg-error {cannot convert 's\*' to 'user_enum_copy \(\*\)\(\)' {aka '__internal_enum \(\*\)\(\)'} in assignment} } ++ ue_fn_ptr5 = &s1; // { dg-error {cannot convert 's\*' to 'void \(\*\)\(user_enum_copy\)' {aka 'void \(\*\)\(__internal_enum\)'} in assignment} } ++ ue_fn_ptr6 = &s1; // { dg-error {cannot convert 's\*' to 'void \(\*\)\(user_enum_copy, \.\.\.\)' {aka 'void \(\*\)\(__internal_enum, \.\.\.\)'} in assignment} } ++ unsafe_ue_fn_ptr1 = &s1; // { dg-error {cannot convert 's\*' to 'user_enum \(__attribute__\(\(transaction_unsafe\)\) \*\)\(\)' {aka '__internal_enum \(__attribute__\(\(transaction_unsafe\)\) \*\)\(\)'} in assignment} } ++ unsafe_ue_fn_ptr2 = &s1; // { dg-error {cannot convert 's\*' to 'user_enum_copy \(__attribute__\(\(transaction_unsafe\)\) \*\)\(\)' {aka '__internal_enum \(__attribute__\(\(transaction_unsafe\)\) \*\)\(\)'} in assignment} } ++ ++ us1 = s1; // { dg-error {no match for 'operator=' in 'us1 = s1' \(operand types are 'user_struct' {aka '__internal_struct'} and 's'\)} } ++ us2 = s1; // { dg-error {no match for 'operator=' in 'us2 = s1' \(operand types are 'user_struct_copy' {aka '__internal_struct'} and 's'\)} } ++ us_ptr1 = &s1; // { dg-error {cannot convert 's\*' to 'user_struct_ptr' {aka '__internal_struct\*'} in assignment} } ++ us_ptr2 = &s1; // 
{ dg-error {cannot convert 's\*' to 'user_struct\*' {aka '__internal_struct\*'} in assignment} } ++ const_us_ptr1 = &s1; // { dg-error {cannot convert 's\*' to 'const user_struct\*' {aka 'const __internal_struct\*'} in assignment} } ++ const_us_ptr2 = &s1; // { dg-error {cannot convert 's\*' to 'const user_struct_copy\*' {aka 'const __internal_struct\*'} in assignment} } ++ ++ uu1 = s1; // { dg-error {no match for 'operator=' in 'uu1 = s1' \(operand types are 'user_union' {aka '__internal_union'} and 's'\)} } ++ uu2 = s1; // { dg-error {no match for 'operator=' in 'uu2 = s1' \(operand types are 'user_union_copy' {aka '__internal_union'} and 's'\)} } ++ uu_ptr1 = &s1; // { dg-error {cannot convert 's\*' to 'user_union_ptr' {aka '__internal_union\*'} in assignment} } ++ uu_ptr2 = &s1; // { dg-error {cannot convert 's\*' to 'user_union\*' {aka '__internal_union\*'} in assignment} } ++ const_uu_ptr1 = &s1; // { dg-error {cannot convert 's\*' to 'const user_union\*' {aka 'const __internal_union\*'} in assignment} } ++ const_uu_ptr2 = &s1; // { dg-error {cannot convert 's\*' to 'const user_union_copy\*' {aka 'const __internal_union\*'} in assignment} } ++ ++ uv1 = s1; // { dg-error {cannot convert 's' to 'user_vector' {aka '__vector\([48]\) unsigned int'} in assignment} } ++ uv2 = s1; // { dg-error {cannot convert 's' to 'user_vector_copy' {aka '__vector\([48]\) unsigned int'} in assignment} } ++ uv_ptr1 = &s1; // { dg-error {cannot convert 's\*' to 'user_vector_ptr' {aka '__vector\([48]\) unsigned int\*'} in assignment} } ++ uv_ptr2 = &s1; // { dg-error {cannot convert 's\*' to 'user_vector\*' {aka '__vector\([48]\) unsigned int\*'} in assignment} } ++ const_uv_ptr1 = &s1; // { dg-error {cannot convert 's\*' to 'const user_vector\*' {aka 'const __vector\([48]\) unsigned int\*'} in assignment} } ++ const_uv_ptr2 = &s1; // { dg-error {cannot convert 's\*' to 'const user_vector_copy\*' {aka 'const __vector\([48]\) unsigned int\*'} in assignment} } ++ ++ ui1 = s1; // { dg-error {cannot convert 's' to 'user_int' {aka 'int'} in assignment} } ++ ui2 = s1; // { dg-error {cannot convert 's' to 'user_int_copy' {aka 'int'} in assignment} } ++ ui_ptr1 = &s1; // { dg-error {cannot convert 's\*' to 'user_int_ptr' {aka 'int\*'} in assignment} } ++ ui_ptr2 = &s1; // { dg-error {cannot convert 's\*' to 'user_int\*' {aka 'int\*'} in assignment} } ++ const_ui_ptr1 = &s1; // { dg-error {cannot convert 's\*' to 'const user_int\*' {aka 'const int\*'} in assignment} } ++ const_ui_ptr2 = &s1; // { dg-error {cannot convert 's\*' to 'const user_int_copy\*' {aka 'const int\*'} in assignment} } ++ volatile_ui_ptr1 = &s1; // { dg-error {cannot convert 's\*' to 'volatile user_int\*' {aka 'volatile int\*'} in assignment} } ++ volatile_ui_ptr2 = &s1; // { dg-error {cannot convert 's\*' to 'volatile user_int_copy\*' {aka 'volatile int\*'} in assignment} } ++ ui_array_ptr1 = &s1; // { dg-error {cannot convert 's\*' to 'user_int \(\*\)\[10\]' {aka 'int \(\*\)\[10\]'} in assignment} } ++ ui_array_ptr2 = &s1; // { dg-error {cannot convert 's\*' to 'user_int_copy \(\*\)\[10\]' {aka 'int \(\*\)\[10\]'} in assignment} } ++ ui_fn_ptr1 = &s1; // { dg-error {cannot convert 's\*' to 'user_int \(\*\)\(\)' {aka 'int \(\*\)\(\)'} in assignment} } ++ ui_fn_ptr2 = &s1; // { dg-error {cannot convert 's\*' to 'void \(\*\)\(user_int\)' {aka 'void \(\*\)\(int\)'} in assignment} } ++ ui_fn_ptr3 = &s1; // { dg-error {cannot convert 's\*' to 'void \(\*\)\(user_int, \.\.\.\)' {aka 'void \(\*\)\(int, \.\.\.\)'} in assignment} } ++ ui_fn_ptr4 = &s1; // 
{ dg-error {cannot convert 's\*' to 'user_int_copy \(\*\)\(\)' {aka 'int \(\*\)\(\)'} in assignment} } ++ ui_fn_ptr5 = &s1; // { dg-error {cannot convert 's\*' to 'void \(\*\)\(user_int_copy\)' {aka 'void \(\*\)\(int\)'} in assignment} } ++ ui_fn_ptr6 = &s1; // { dg-error {cannot convert 's\*' to 'void \(\*\)\(user_int_copy, \.\.\.\)' {aka 'void \(\*\)\(int, \.\.\.\)'} in assignment} } ++ unsafe_ui_fn_ptr1 = &s1; // { dg-error {cannot convert 's\*' to 'user_int \(__attribute__\(\(transaction_unsafe\)\) \*\)\(\)' {aka 'int \(__attribute__\(\(transaction_unsafe\)\) \*\)\(\)'} in assignment} } ++ unsafe_ui_fn_ptr2 = &s1; // { dg-error {cannot convert 's\*' to 'user_int_copy \(__attribute__\(\(transaction_unsafe\)\) \*\)\(\)' {aka 'int \(__attribute__\(\(transaction_unsafe\)\) \*\)\(\)'} in assignment} } ++} ++ +diff --git a/gcc/testsuite/g++.dg/tree-ssa/pr61034.C b/gcc/testsuite/g++.dg/tree-ssa/pr61034.C +index 2e3dfecac..6a76adb5b 100644 +--- a/gcc/testsuite/g++.dg/tree-ssa/pr61034.C ++++ b/gcc/testsuite/g++.dg/tree-ssa/pr61034.C +@@ -1,5 +1,5 @@ + // { dg-do compile } +-// { dg-options "-O2 -fdump-tree-fre3 -fdump-tree-optimized -fdelete-null-pointer-checks --param early-inlining-insns-O2=14" } ++// { dg-options "-O2 -fdump-tree-fre3 -fdump-tree-optimized -fdelete-null-pointer-checks --param early-inlining-insns-O2=14 --param max-inline-insns-single-O2=200" } + + #define assume(x) if(!(x))__builtin_unreachable() + +diff --git a/gcc/testsuite/g++.target/aarch64/diag_aka_1.C b/gcc/testsuite/g++.target/aarch64/diag_aka_1.C +new file mode 100644 +index 000000000..6b489981f +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/diag_aka_1.C +@@ -0,0 +1,13 @@ ++#include ++ ++typedef int16x4_t myvec; ++ ++void f (float x) ++{ ++ __Int8x8_t y1 = x; // { dg-error {cannot convert 'float' to '__Int8x8_t' in initialization} } ++ __Int8x8_t *ptr1 = &x; // { dg-error {cannot convert 'float\*' to '__Int8x8_t\*' in initialization} } ++ int8x8_t y2 = x; // { dg-error {cannot convert 'float' to 'int8x8_t' in initialization} } ++ int8x8_t *ptr2 = &x; // { dg-error {cannot convert 'float\*' to 'int8x8_t\*' in initialization} } ++ myvec y3 = x; // { dg-error {cannot convert 'float' to 'myvec' {aka 'int16x4_t'} in initialization} } ++ myvec *ptr3 = &x; // { dg-error {cannot convert 'float\*' to 'myvec\*' {aka 'int16x4_t\*'} in initialization} } ++} +diff --git a/gcc/testsuite/gcc.dg/diag-aka-1.c b/gcc/testsuite/gcc.dg/diag-aka-1.c +index fde4ca7c7..3383c1c26 100644 +--- a/gcc/testsuite/gcc.dg/diag-aka-1.c ++++ b/gcc/testsuite/gcc.dg/diag-aka-1.c +@@ -2,7 +2,7 @@ + /* { dg-options "-Wc++-compat" } */ + + typedef struct A { int i; } B; +-typedef struct T { int i; } T; ++typedef struct T { int i; } *T; /* { dg-warning "using 'T' as both a typedef and a tag is invalid" } */ + typedef const float TFA; + typedef TFA TFB; + typedef TFB TFC; +@@ -24,6 +24,6 @@ bar (B *b, int *i) + int + foo (void *a) + { +- T *t = a; /* { dg-warning "request for implicit conversion from 'void \\*' to 'T \\*' {aka 'struct T \\*'} not" } */ ++ T t = a; /* { dg-warning "request for implicit conversion from 'void \\*' to 'T' {aka 'struct T \\*'} not" } */ + return t->i; + } +diff --git a/gcc/testsuite/gcc.dg/diag-aka-4.c b/gcc/testsuite/gcc.dg/diag-aka-4.c +new file mode 100644 +index 000000000..cf98dd96a +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/diag-aka-4.c +@@ -0,0 +1,72 @@ ++typedef struct struct_wrapper { int i; } struct_wrapper; ++typedef struct { int i; } anon_struct_wrapper; ++ ++typedef union union_wrapper { int i; } 
union_wrapper; ++typedef union { int i; } anon_union_wrapper; ++ ++typedef enum enum_wrapper { A, B } enum_wrapper; ++typedef enum { C, D } anon_enum_wrapper; ++ ++void test_struct_wrapper (struct_wrapper y, int x) ++{ ++ struct_wrapper *ptr = &x; /* { dg-error {initialization of 'struct_wrapper \*' from incompatible pointer type 'int \*'} } */ ++ const struct_wrapper *const_ptr = &x; /* { dg-error {initialization of 'const struct_wrapper \*' from incompatible pointer type 'int \*'} } */ ++ volatile struct_wrapper *volatile_ptr = &x; /* { dg-error {initialization of 'volatile struct_wrapper \*' from incompatible pointer type 'int \*'} } */ ++ struct_wrapper (*aptr)[10] = &x; /* { dg-error {initialization of 'struct_wrapper \(\*\)\[10\]' from incompatible pointer type 'int \*'} } */ ++ struct_wrapper (*f1)(int) = &x; /* { dg-error {initialization of 'struct_wrapper \(\*\)\(int\)' from incompatible pointer type 'int \*'} } */ ++ int (*f2)(struct_wrapper) = &x; /* { dg-error {initialization of 'int \(\*\)\(struct_wrapper\)' from incompatible pointer type 'int \*'} } */ ++ y = x; /* { dg-error {incompatible types when assigning to type 'struct_wrapper' from type 'int'} } */ ++} ++ ++void test_anon_struct_wrapper (anon_struct_wrapper y, int x) ++{ ++ anon_struct_wrapper *ptr = &x; /* { dg-error {initialization of 'anon_struct_wrapper \*' from incompatible pointer type 'int \*'} } */ ++ const anon_struct_wrapper *const_ptr = &x; /* { dg-error {initialization of 'const anon_struct_wrapper \*' from incompatible pointer type 'int \*'} } */ ++ volatile anon_struct_wrapper *volatile_ptr = &x; /* { dg-error {initialization of 'volatile anon_struct_wrapper \*' from incompatible pointer type 'int \*'} } */ ++ anon_struct_wrapper (*aptr)[10] = &x; /* { dg-error {initialization of 'anon_struct_wrapper \(\*\)\[10\]' from incompatible pointer type 'int \*'} } */ ++ anon_struct_wrapper (*f1)(int) = &x; /* { dg-error {initialization of 'anon_struct_wrapper \(\*\)\(int\)' from incompatible pointer type 'int \*'} } */ ++ int (*f2)(anon_struct_wrapper) = &x; /* { dg-error {initialization of 'int \(\*\)\(anon_struct_wrapper\)' from incompatible pointer type 'int \*'} } */ ++ y = x; /* { dg-error {incompatible types when assigning to type 'anon_struct_wrapper' from type 'int'} } */ ++} ++ ++void test_union_wrapper (union_wrapper y, int x) ++{ ++ union_wrapper *ptr = &x; /* { dg-error {initialization of 'union_wrapper \*' from incompatible pointer type 'int \*'} } */ ++ const union_wrapper *const_ptr = &x; /* { dg-error {initialization of 'const union_wrapper \*' from incompatible pointer type 'int \*'} } */ ++ volatile union_wrapper *volatile_ptr = &x; /* { dg-error {initialization of 'volatile union_wrapper \*' from incompatible pointer type 'int \*'} } */ ++ union_wrapper (*aptr)[10] = &x; /* { dg-error {initialization of 'union_wrapper \(\*\)\[10\]' from incompatible pointer type 'int \*'} } */ ++ union_wrapper (*f1)(int) = &x; /* { dg-error {initialization of 'union_wrapper \(\*\)\(int\)' from incompatible pointer type 'int \*'} } */ ++ int (*f2)(union_wrapper) = &x; /* { dg-error {initialization of 'int \(\*\)\(union_wrapper\)' from incompatible pointer type 'int \*'} } */ ++ y = x; /* { dg-error {incompatible types when assigning to type 'union_wrapper' from type 'int'} } */ ++} ++ ++void test_anon_union_wrapper (anon_union_wrapper y, int x) ++{ ++ anon_union_wrapper *ptr = &x; /* { dg-error {initialization of 'anon_union_wrapper \*' from incompatible pointer type 'int \*'} } */ ++ const anon_union_wrapper 
*const_ptr = &x; /* { dg-error {initialization of 'const anon_union_wrapper \*' from incompatible pointer type 'int \*'} } */ ++ volatile anon_union_wrapper *volatile_ptr = &x; /* { dg-error {initialization of 'volatile anon_union_wrapper \*' from incompatible pointer type 'int \*'} } */ ++ anon_union_wrapper (*aptr)[10] = &x; /* { dg-error {initialization of 'anon_union_wrapper \(\*\)\[10\]' from incompatible pointer type 'int \*'} } */ ++ anon_union_wrapper (*f1)(int) = &x; /* { dg-error {initialization of 'anon_union_wrapper \(\*\)\(int\)' from incompatible pointer type 'int \*'} } */ ++ int (*f2)(anon_union_wrapper) = &x; /* { dg-error {initialization of 'int \(\*\)\(anon_union_wrapper\)' from incompatible pointer type 'int \*'} } */ ++ y = x; /* { dg-error {incompatible types when assigning to type 'anon_union_wrapper' from type 'int'} } */ ++} ++ ++void test_enum_wrapper (enum_wrapper y, int x) ++{ ++ enum_wrapper *ptr = &x; /* { dg-error {initialization of 'enum_wrapper \*' from incompatible pointer type 'int \*'} } */ ++ const enum_wrapper *const_ptr = &x; /* { dg-error {initialization of 'const enum_wrapper \*' from incompatible pointer type 'int \*'} } */ ++ volatile enum_wrapper *volatile_ptr = &x; /* { dg-error {initialization of 'volatile enum_wrapper \*' from incompatible pointer type 'int \*'} } */ ++ enum_wrapper (*aptr)[10] = &x; /* { dg-error {initialization of 'enum_wrapper \(\*\)\[10\]' from incompatible pointer type 'int \*'} } */ ++ enum_wrapper (*f1)(int) = &x; /* { dg-error {initialization of 'enum_wrapper \(\*\)\(int\)' from incompatible pointer type 'int \*'} } */ ++ int (*f2)(enum_wrapper) = &x; /* { dg-error {initialization of 'int \(\*\)\(enum_wrapper\)' from incompatible pointer type 'int \*'} } */ ++} ++ ++void test_anon_enum_wrapper (anon_enum_wrapper y, int x) ++{ ++ anon_enum_wrapper *ptr = &x; /* { dg-error {initialization of 'anon_enum_wrapper \*' from incompatible pointer type 'int \*'} } */ ++ const anon_enum_wrapper *const_ptr = &x; /* { dg-error {initialization of 'const anon_enum_wrapper \*' from incompatible pointer type 'int \*'} } */ ++ volatile anon_enum_wrapper *volatile_ptr = &x; /* { dg-error {initialization of 'volatile anon_enum_wrapper \*' from incompatible pointer type 'int \*'} } */ ++ anon_enum_wrapper (*aptr)[10] = &x; /* { dg-error {initialization of 'anon_enum_wrapper \(\*\)\[10\]' from incompatible pointer type 'int \*'} } */ ++ anon_enum_wrapper (*f1)(int) = &x; /* { dg-error {initialization of 'anon_enum_wrapper \(\*\)\(int\)' from incompatible pointer type 'int \*'} } */ ++ int (*f2)(anon_enum_wrapper) = &x; /* { dg-error {initialization of 'int \(\*\)\(anon_enum_wrapper\)' from incompatible pointer type 'int \*'} } */ ++} +diff --git a/gcc/testsuite/gcc.dg/diag-aka-5.h b/gcc/testsuite/gcc.dg/diag-aka-5.h +new file mode 100644 +index 000000000..0c7404d76 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/diag-aka-5.h +@@ -0,0 +1,22 @@ ++#ifdef IS_SYSTEM_HEADER ++#pragma GCC system_header ++#endif ++ ++typedef enum __internal_enum { A, B } user_enum; ++typedef user_enum *user_enum_ptr; ++ ++typedef struct __internal_struct { int i; } user_struct; ++typedef user_struct user_struct_copy; ++typedef user_struct *user_struct_ptr; ++ ++typedef union __internal_union { int i; } user_union; ++typedef user_union user_union_copy; ++typedef user_union *user_union_ptr; ++ ++typedef unsigned int user_vector __attribute__((__vector_size__(16))); ++typedef user_vector user_vector_copy; ++typedef user_vector *user_vector_ptr; ++ ++typedef int user_int; 
++typedef user_int user_int_copy; ++typedef user_int *user_int_ptr; +diff --git a/gcc/testsuite/gcc.dg/diag-aka-5a.c b/gcc/testsuite/gcc.dg/diag-aka-5a.c +new file mode 100644 +index 000000000..573020659 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/diag-aka-5a.c +@@ -0,0 +1,135 @@ ++#define IS_SYSTEM_HEADER ++#include "diag-aka-5.h" ++ ++typedef user_enum user_enum_copy; ++ ++struct s { int i; }; ++ ++user_enum ue1; ++user_enum_copy ue2; ++user_enum_ptr ue_ptr1; ++user_enum *ue_ptr2; ++const user_enum *const_ue_ptr1; ++const user_enum_copy *const_ue_ptr2; ++volatile user_enum *volatile_ue_ptr1; ++volatile user_enum_copy *volatile_ue_ptr2; ++__extension__ _Atomic user_enum *atomic_ue_ptr1; ++__extension__ _Atomic user_enum_copy *atomic_ue_ptr2; ++user_enum (*ue_array_ptr1)[10]; ++user_enum_copy (*ue_array_ptr2)[10]; ++user_enum (*ue_fn_ptr1) (void); ++void (*ue_fn_ptr2) (user_enum); ++void (*ue_fn_ptr3) (user_enum, ...); ++user_enum_copy (*ue_fn_ptr4) (void); ++void (*ue_fn_ptr5) (user_enum_copy); ++void (*ue_fn_ptr6) (user_enum_copy, ...); ++user_enum (*__attribute__((__transaction_unsafe__)) unsafe_ue_fn_ptr1) (void); ++user_enum_copy (*__attribute__((__transaction_unsafe__)) unsafe_ue_fn_ptr2) (void); ++ ++user_struct us1; ++user_struct_copy us2; ++user_struct_ptr us_ptr1; ++user_struct *us_ptr2; ++const user_struct *const_us_ptr1; ++const user_struct_copy *const_us_ptr2; ++ ++user_union uu1; ++user_union_copy uu2; ++user_union_ptr uu_ptr1; ++user_union *uu_ptr2; ++const user_union *const_uu_ptr1; ++const user_union_copy *const_uu_ptr2; ++ ++user_vector uv1; ++user_vector_copy uv2; ++user_vector_ptr uv_ptr1; ++user_vector *uv_ptr2; ++const user_vector *const_uv_ptr1; ++const user_vector_copy *const_uv_ptr2; ++ ++user_int ui1; ++user_int_copy ui2; ++user_int_ptr ui_ptr1; ++user_int *ui_ptr2; ++const user_int *const_ui_ptr1; ++const user_int_copy *const_ui_ptr2; ++volatile user_int *volatile_ui_ptr1; ++volatile user_int_copy *volatile_ui_ptr2; ++__extension__ _Atomic user_int *atomic_ui_ptr1; ++__extension__ _Atomic user_int_copy *atomic_ui_ptr2; ++user_int (*ui_array_ptr1)[10]; ++user_int_copy (*ui_array_ptr2)[10]; ++user_int (*ui_fn_ptr1) (void); ++void (*ui_fn_ptr2) (user_int); ++void (*ui_fn_ptr3) (user_int, ...); ++user_int_copy (*ui_fn_ptr4) (void); ++void (*ui_fn_ptr5) (user_int_copy); ++void (*ui_fn_ptr6) (user_int_copy, ...); ++user_int (*__attribute__((__transaction_unsafe__)) unsafe_ui_fn_ptr1) (void); ++user_int_copy (*__attribute__((__transaction_unsafe__)) unsafe_ui_fn_ptr2) (void); ++ ++void f (struct s s) ++{ ++ ue1 = s; /* { dg-error {assigning to type 'user_enum' from type 'struct s'} } */ ++ ue2 = s; /* { dg-error {assigning to type 'user_enum_copy' {aka 'user_enum'} from type 'struct s'} } */ ++ ue_ptr1 = &s; /* { dg-error {assignment to 'user_enum_ptr' {aka 'user_enum \*'} from incompatible pointer type 'struct s \*'} } */ ++ ue_ptr2 = &s; /* { dg-error {assignment to 'user_enum \*' from incompatible pointer type 'struct s \*'} } */ ++ const_ue_ptr1 = &s; /* { dg-error {assignment to 'const user_enum \*' from incompatible pointer type 'struct s \*'} } */ ++ const_ue_ptr2 = &s; /* { dg-error {assignment to 'const user_enum_copy \*' {aka 'const user_enum \*'} from incompatible pointer type 'struct s \*'} } */ ++ volatile_ue_ptr1 = &s; /* { dg-error {assignment to 'volatile user_enum \*' from incompatible pointer type 'struct s \*'} } */ ++ volatile_ue_ptr2 = &s; /* { dg-error {assignment to 'volatile user_enum_copy \*' {aka 'volatile user_enum \*'} from incompatible 
pointer type 'struct s \*'} } */ ++ atomic_ue_ptr1 = &s; /* { dg-error {assignment to '_Atomic user_enum \*' from incompatible pointer type 'struct s \*'} } */ ++ atomic_ue_ptr2 = &s; /* { dg-error {assignment to '_Atomic user_enum_copy \*' {aka '_Atomic user_enum \*'} from incompatible pointer type 'struct s \*'} } */ ++ ue_array_ptr1 = &s; /* { dg-error {assignment to 'user_enum \(\*\)\[10\]' from incompatible pointer type 'struct s \*'} } */ ++ ue_array_ptr2 = &s; /* { dg-error {assignment to 'user_enum_copy \(\*\)\[10\]' {aka 'user_enum \(\*\)\[10\]'} from incompatible pointer type 'struct s \*'} } */ ++ ue_fn_ptr1 = &s; /* { dg-error {assignment to 'user_enum \(\*\)\(void\)' from incompatible pointer type 'struct s \*'} } */ ++ ue_fn_ptr2 = &s; /* { dg-error {assignment to 'void \(\*\)\(user_enum\)' from incompatible pointer type 'struct s \*'} } */ ++ ue_fn_ptr3 = &s; /* { dg-error {assignment to 'void \(\*\)\(user_enum, \.\.\.\)' from incompatible pointer type 'struct s \*'} } */ ++ ue_fn_ptr4 = &s; /* { dg-error {assignment to 'user_enum_copy \(\*\)\(void\)' {aka 'user_enum \(\*\)\(void\)'} from incompatible pointer type 'struct s \*'} } */ ++ ue_fn_ptr5 = &s; /* { dg-error {assignment to 'void \(\*\)\(user_enum_copy\)' {aka 'void \(\*\)\(user_enum\)'} from incompatible pointer type 'struct s \*'} } */ ++ ue_fn_ptr6 = &s; /* { dg-error {assignment to 'void \(\*\)\(user_enum_copy, \.\.\.\)' {aka 'void \(\*\)\(user_enum, \.\.\.\)'} from incompatible pointer type 'struct s \*'} } */ ++ unsafe_ue_fn_ptr1 = &s; /* { dg-error {assignment to 'user_enum \(__attribute__\(\(transaction_unsafe\)\) \*\)\(void\)' from incompatible pointer type 'struct s \*'} } */ ++ unsafe_ue_fn_ptr2 = &s; /* { dg-error {assignment to 'user_enum_copy \(__attribute__\(\(transaction_unsafe\)\) \*\)\(void\)' {aka 'user_enum \(__attribute__\(\(transaction_unsafe\)\) \*\)\(void\)'} from incompatible pointer type 'struct s \*'} } */ ++ ++ us1 = s; /* { dg-error {assigning to type 'user_struct' from type 'struct s'} } */ ++ us2 = s; /* { dg-error {assigning to type 'user_struct_copy' {aka 'user_struct'} from type 'struct s'} } */ ++ us_ptr1 = &s; /* { dg-error {assignment to 'user_struct_ptr' {aka 'user_struct \*'} from incompatible pointer type 'struct s \*'} } */ ++ us_ptr2 = &s; /* { dg-error {assignment to 'user_struct \*' from incompatible pointer type 'struct s \*'} } */ ++ const_us_ptr1 = &s; /* { dg-error {assignment to 'const user_struct \*' from incompatible pointer type 'struct s \*'} } */ ++ const_us_ptr2 = &s; /* { dg-error {assignment to 'const user_struct_copy \*' {aka 'const user_struct \*'} from incompatible pointer type 'struct s \*'} } */ ++ ++ uu1 = s; /* { dg-error {assigning to type 'user_union' from type 'struct s'} } */ ++ uu2 = s; /* { dg-error {assigning to type 'user_union_copy' {aka 'user_union'} from type 'struct s'} } */ ++ uu_ptr1 = &s; /* { dg-error {assignment to 'user_union_ptr' {aka 'user_union \*'} from incompatible pointer type 'struct s \*'} } */ ++ uu_ptr2 = &s; /* { dg-error {assignment to 'user_union \*' from incompatible pointer type 'struct s \*'} } */ ++ const_uu_ptr1 = &s; /* { dg-error {assignment to 'const user_union \*' from incompatible pointer type 'struct s \*'} } */ ++ const_uu_ptr2 = &s; /* { dg-error {assignment to 'const user_union_copy \*' {aka 'const user_union \*'} from incompatible pointer type 'struct s \*'} } */ ++ ++ uv1 = s; /* { dg-error {assigning to type 'user_vector' from type 'struct s'} } */ ++ uv2 = s; /* { dg-error {assigning to type 
'user_vector_copy' {aka 'user_vector'} from type 'struct s'} } */ ++ uv_ptr1 = &s; /* { dg-error {assignment to 'user_vector_ptr' {aka 'user_vector \*'} from incompatible pointer type 'struct s \*'} } */ ++ uv_ptr2 = &s; /* { dg-error {assignment to 'user_vector \*' from incompatible pointer type 'struct s \*'} } */ ++ const_uv_ptr1 = &s; /* { dg-error {assignment to 'const user_vector \*' from incompatible pointer type 'struct s \*'} } */ ++ const_uv_ptr2 = &s; /* { dg-error {assignment to 'const user_vector_copy \*' {aka 'const user_vector \*'} from incompatible pointer type 'struct s \*'} } */ ++ ++ ui1 = s; /* { dg-error {assigning to type 'user_int' {aka 'int'} from type 'struct s'} } */ ++ ui2 = s; /* { dg-error {assigning to type 'user_int_copy' {aka 'int'} from type 'struct s'} } */ ++ ui_ptr1 = &s; /* { dg-error {assignment to 'user_int_ptr' {aka 'int \*'} from incompatible pointer type 'struct s \*'} } */ ++ ui_ptr2 = &s; /* { dg-error {assignment to 'user_int \*' {aka 'int \*'} from incompatible pointer type 'struct s \*'} } */ ++ const_ui_ptr1 = &s; /* { dg-error {assignment to 'const user_int \*' {aka 'const int \*'} from incompatible pointer type 'struct s \*'} } */ ++ const_ui_ptr2 = &s; /* { dg-error {assignment to 'const user_int_copy \*' {aka 'const int \*'} from incompatible pointer type 'struct s \*'} } */ ++ volatile_ui_ptr1 = &s; /* { dg-error {assignment to 'volatile user_int \*' {aka 'volatile int \*'} from incompatible pointer type 'struct s \*'} } */ ++ volatile_ui_ptr2 = &s; /* { dg-error {assignment to 'volatile user_int_copy \*' {aka 'volatile int \*'} from incompatible pointer type 'struct s \*'} } */ ++ atomic_ui_ptr1 = &s; /* { dg-error {assignment to '_Atomic user_int \*' {aka '_Atomic int \*'} from incompatible pointer type 'struct s \*'} } */ ++ atomic_ui_ptr2 = &s; /* { dg-error {assignment to '_Atomic user_int_copy \*' {aka '_Atomic int \*'} from incompatible pointer type 'struct s \*'} } */ ++ ui_array_ptr1 = &s; /* { dg-error {assignment to 'user_int \(\*\)\[10\]' {aka 'int \(\*\)\[10\]'} from incompatible pointer type 'struct s \*'} } */ ++ ui_array_ptr2 = &s; /* { dg-error {assignment to 'user_int_copy \(\*\)\[10\]' {aka 'int \(\*\)\[10\]'} from incompatible pointer type 'struct s \*'} } */ ++ ui_fn_ptr1 = &s; /* { dg-error {assignment to 'user_int \(\*\)\(void\)' {aka 'int \(\*\)\(void\)'} from incompatible pointer type 'struct s \*'} } */ ++ ui_fn_ptr2 = &s; /* { dg-error {assignment to 'void \(\*\)\(user_int\)' {aka 'void \(\*\)\(int\)'} from incompatible pointer type 'struct s \*'} } */ ++ ui_fn_ptr3 = &s; /* { dg-error {assignment to 'void \(\*\)\(user_int, \.\.\.\)' {aka 'void \(\*\)\(int, \.\.\.\)'} from incompatible pointer type 'struct s \*'} } */ ++ ui_fn_ptr4 = &s; /* { dg-error {assignment to 'user_int_copy \(\*\)\(void\)' {aka 'int \(\*\)\(void\)'} from incompatible pointer type 'struct s \*'} } */ ++ ui_fn_ptr5 = &s; /* { dg-error {assignment to 'void \(\*\)\(user_int_copy\)' {aka 'void \(\*\)\(int\)'} from incompatible pointer type 'struct s \*'} } */ ++ ui_fn_ptr6 = &s; /* { dg-error {assignment to 'void \(\*\)\(user_int_copy, \.\.\.\)' {aka 'void \(\*\)\(int, \.\.\.\)'} from incompatible pointer type 'struct s \*'} } */ ++ unsafe_ui_fn_ptr1 = &s; /* { dg-error {assignment to 'user_int \(__attribute__\(\(transaction_unsafe\)\) \*\)\(void\)' {aka 'int \(__attribute__\(\(transaction_unsafe\)\) \*\)\(void\)'} from incompatible pointer type 'struct s \*'} } */ ++ unsafe_ui_fn_ptr2 = &s; /* { dg-error {assignment to 'user_int_copy 
\(__attribute__\(\(transaction_unsafe\)\) \*\)\(void\)' {aka 'int \(__attribute__\(\(transaction_unsafe\)\) \*\)\(void\)'} from incompatible pointer type 'struct s \*'} } */ ++} +diff --git a/gcc/testsuite/gcc.dg/diag-aka-5b.c b/gcc/testsuite/gcc.dg/diag-aka-5b.c +new file mode 100644 +index 000000000..f510d0d40 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/diag-aka-5b.c +@@ -0,0 +1,134 @@ ++#include "diag-aka-5.h" ++ ++typedef user_enum user_enum_copy; ++ ++struct s { int i; }; ++ ++user_enum ue1; ++user_enum_copy ue2; ++user_enum_ptr ue_ptr1; ++user_enum *ue_ptr2; ++const user_enum *const_ue_ptr1; ++const user_enum_copy *const_ue_ptr2; ++volatile user_enum *volatile_ue_ptr1; ++volatile user_enum_copy *volatile_ue_ptr2; ++__extension__ _Atomic user_enum *atomic_ue_ptr1; ++__extension__ _Atomic user_enum_copy *atomic_ue_ptr2; ++user_enum (*ue_array_ptr1)[10]; ++user_enum_copy (*ue_array_ptr2)[10]; ++user_enum (*ue_fn_ptr1) (void); ++void (*ue_fn_ptr2) (user_enum); ++void (*ue_fn_ptr3) (user_enum, ...); ++user_enum_copy (*ue_fn_ptr4) (void); ++void (*ue_fn_ptr5) (user_enum_copy); ++void (*ue_fn_ptr6) (user_enum_copy, ...); ++user_enum (*__attribute__((__transaction_unsafe__)) unsafe_ue_fn_ptr1) (void); ++user_enum_copy (*__attribute__((__transaction_unsafe__)) unsafe_ue_fn_ptr2) (void); ++ ++user_struct us1; ++user_struct_copy us2; ++user_struct_ptr us_ptr1; ++user_struct *us_ptr2; ++const user_struct *const_us_ptr1; ++const user_struct_copy *const_us_ptr2; ++ ++user_union uu1; ++user_union_copy uu2; ++user_union_ptr uu_ptr1; ++user_union *uu_ptr2; ++const user_union *const_uu_ptr1; ++const user_union_copy *const_uu_ptr2; ++ ++user_vector uv1; ++user_vector_copy uv2; ++user_vector_ptr uv_ptr1; ++user_vector *uv_ptr2; ++const user_vector *const_uv_ptr1; ++const user_vector_copy *const_uv_ptr2; ++ ++user_int ui1; ++user_int_copy ui2; ++user_int_ptr ui_ptr1; ++user_int *ui_ptr2; ++const user_int *const_ui_ptr1; ++const user_int_copy *const_ui_ptr2; ++volatile user_int *volatile_ui_ptr1; ++volatile user_int_copy *volatile_ui_ptr2; ++__extension__ _Atomic user_int *atomic_ui_ptr1; ++__extension__ _Atomic user_int_copy *atomic_ui_ptr2; ++user_int (*ui_array_ptr1)[10]; ++user_int_copy (*ui_array_ptr2)[10]; ++user_int (*ui_fn_ptr1) (void); ++void (*ui_fn_ptr2) (user_int); ++void (*ui_fn_ptr3) (user_int, ...); ++user_int_copy (*ui_fn_ptr4) (void); ++void (*ui_fn_ptr5) (user_int_copy); ++void (*ui_fn_ptr6) (user_int_copy, ...); ++user_int (*__attribute__((__transaction_unsafe__)) unsafe_ui_fn_ptr1) (void); ++user_int_copy (*__attribute__((__transaction_unsafe__)) unsafe_ui_fn_ptr2) (void); ++ ++void f (struct s s) ++{ ++ ue1 = s; /* { dg-error {assigning to type 'user_enum' {aka 'enum __internal_enum'} from type 'struct s'} } */ ++ ue2 = s; /* { dg-error {assigning to type 'user_enum_copy' {aka 'enum __internal_enum'} from type 'struct s'} } */ ++ ue_ptr1 = &s; /* { dg-error {assignment to 'user_enum_ptr' {aka 'enum __internal_enum \*'} from incompatible pointer type 'struct s \*'} } */ ++ ue_ptr2 = &s; /* { dg-error {assignment to 'user_enum \*' {aka 'enum __internal_enum \*'} from incompatible pointer type 'struct s \*'} } */ ++ const_ue_ptr1 = &s; /* { dg-error {assignment to 'const user_enum \*' {aka 'const enum __internal_enum \*'} from incompatible pointer type 'struct s \*'} } */ ++ const_ue_ptr2 = &s; /* { dg-error {assignment to 'const user_enum_copy \*' {aka 'const enum __internal_enum \*'} from incompatible pointer type 'struct s \*'} } */ ++ volatile_ue_ptr1 = &s; /* { dg-error {assignment 
to 'volatile user_enum \*' {aka 'volatile enum __internal_enum \*'} from incompatible pointer type 'struct s \*'} } */ ++ volatile_ue_ptr2 = &s; /* { dg-error {assignment to 'volatile user_enum_copy \*' {aka 'volatile enum __internal_enum \*'} from incompatible pointer type 'struct s \*'} } */ ++ atomic_ue_ptr1 = &s; /* { dg-error {assignment to '_Atomic user_enum \*' {aka '_Atomic enum __internal_enum \*'} from incompatible pointer type 'struct s \*'} } */ ++ atomic_ue_ptr2 = &s; /* { dg-error {assignment to '_Atomic user_enum_copy \*' {aka '_Atomic enum __internal_enum \*'} from incompatible pointer type 'struct s \*'} } */ ++ ue_array_ptr1 = &s; /* { dg-error {assignment to 'user_enum \(\*\)\[10\]' {aka 'enum __internal_enum \(\*\)\[10\]'} from incompatible pointer type 'struct s \*'} } */ ++ ue_array_ptr2 = &s; /* { dg-error {assignment to 'user_enum_copy \(\*\)\[10\]' {aka 'enum __internal_enum \(\*\)\[10\]'} from incompatible pointer type 'struct s \*'} } */ ++ ue_fn_ptr1 = &s; /* { dg-error {assignment to 'user_enum \(\*\)\(void\)' {aka 'enum __internal_enum \(\*\)\(void\)'} from incompatible pointer type 'struct s \*'} } */ ++ ue_fn_ptr2 = &s; /* { dg-error {assignment to 'void \(\*\)\(user_enum\)' {aka 'void \(\*\)\(enum __internal_enum\)'} from incompatible pointer type 'struct s \*'} } */ ++ ue_fn_ptr3 = &s; /* { dg-error {assignment to 'void \(\*\)\(user_enum, \.\.\.\)' {aka 'void \(\*\)\(enum __internal_enum, \.\.\.\)'} from incompatible pointer type 'struct s \*'} } */ ++ ue_fn_ptr4 = &s; /* { dg-error {assignment to 'user_enum_copy \(\*\)\(void\)' {aka 'enum __internal_enum \(\*\)\(void\)'} from incompatible pointer type 'struct s \*'} } */ ++ ue_fn_ptr5 = &s; /* { dg-error {assignment to 'void \(\*\)\(user_enum_copy\)' {aka 'void \(\*\)\(enum __internal_enum\)'} from incompatible pointer type 'struct s \*'} } */ ++ ue_fn_ptr6 = &s; /* { dg-error {assignment to 'void \(\*\)\(user_enum_copy, \.\.\.\)' {aka 'void \(\*\)\(enum __internal_enum, \.\.\.\)'} from incompatible pointer type 'struct s \*'} } */ ++ unsafe_ue_fn_ptr1 = &s; /* { dg-error {assignment to 'user_enum \(__attribute__\(\(transaction_unsafe\)\) \*\)\(void\)' {aka 'enum __internal_enum \(__attribute__\(\(transaction_unsafe\)\) \*\)\(void\)'} from incompatible pointer type 'struct s \*'} } */ ++ unsafe_ue_fn_ptr2 = &s; /* { dg-error {assignment to 'user_enum_copy \(__attribute__\(\(transaction_unsafe\)\) \*\)\(void\)' {aka 'enum __internal_enum \(__attribute__\(\(transaction_unsafe\)\) \*\)\(void\)'} from incompatible pointer type 'struct s \*'} } */ ++ ++ us1 = s; /* { dg-error {assigning to type 'user_struct' {aka 'struct __internal_struct'} from type 'struct s'} } */ ++ us2 = s; /* { dg-error {assigning to type 'user_struct_copy' {aka 'struct __internal_struct'} from type 'struct s'} } */ ++ us_ptr1 = &s; /* { dg-error {assignment to 'user_struct_ptr' {aka 'struct __internal_struct \*'} from incompatible pointer type 'struct s \*'} } */ ++ us_ptr2 = &s; /* { dg-error {assignment to 'user_struct \*' {aka 'struct __internal_struct \*'} from incompatible pointer type 'struct s \*'} } */ ++ const_us_ptr1 = &s; /* { dg-error {assignment to 'const user_struct \*' {aka 'const struct __internal_struct \*'} from incompatible pointer type 'struct s \*'} } */ ++ const_us_ptr2 = &s; /* { dg-error {assignment to 'const user_struct_copy \*' {aka 'const struct __internal_struct \*'} from incompatible pointer type 'struct s \*'} } */ ++ ++ uu1 = s; /* { dg-error {assigning to type 'user_union' {aka 'union __internal_union'} 
from type 'struct s'} } */ ++ uu2 = s; /* { dg-error {assigning to type 'user_union_copy' {aka 'union __internal_union'} from type 'struct s'} } */ ++ uu_ptr1 = &s; /* { dg-error {assignment to 'user_union_ptr' {aka 'union __internal_union \*'} from incompatible pointer type 'struct s \*'} } */ ++ uu_ptr2 = &s; /* { dg-error {assignment to 'user_union \*' {aka 'union __internal_union \*'} from incompatible pointer type 'struct s \*'} } */ ++ const_uu_ptr1 = &s; /* { dg-error {assignment to 'const user_union \*' {aka 'const union __internal_union \*'} from incompatible pointer type 'struct s \*'} } */ ++ const_uu_ptr2 = &s; /* { dg-error {assignment to 'const user_union_copy \*' {aka 'const union __internal_union \*'} from incompatible pointer type 'struct s \*'} } */ ++ ++ uv1 = s; /* { dg-error {assigning to type 'user_vector' {aka '__vector\([48]\) unsigned int'} from type 'struct s'} } */ ++ uv2 = s; /* { dg-error {assigning to type 'user_vector_copy' {aka '__vector\([48]\) unsigned int'} from type 'struct s'} } */ ++ uv_ptr1 = &s; /* { dg-error {assignment to 'user_vector_ptr' {aka '__vector\([48]\) unsigned int \*'} from incompatible pointer type 'struct s \*'} } */ ++ uv_ptr2 = &s; /* { dg-error {assignment to 'user_vector \*' {aka '__vector\([48]\) unsigned int \*'} from incompatible pointer type 'struct s \*'} } */ ++ const_uv_ptr1 = &s; /* { dg-error {assignment to 'const user_vector \*' {aka 'const __vector\([48]\) unsigned int \*'} from incompatible pointer type 'struct s \*'} } */ ++ const_uv_ptr2 = &s; /* { dg-error {assignment to 'const user_vector_copy \*' {aka 'const __vector\([48]\) unsigned int \*'} from incompatible pointer type 'struct s \*'} } */ ++ ++ ui1 = s; /* { dg-error {assigning to type 'user_int' {aka 'int'} from type 'struct s'} } */ ++ ui2 = s; /* { dg-error {assigning to type 'user_int_copy' {aka 'int'} from type 'struct s'} } */ ++ ui_ptr1 = &s; /* { dg-error {assignment to 'user_int_ptr' {aka 'int \*'} from incompatible pointer type 'struct s \*'} } */ ++ ui_ptr2 = &s; /* { dg-error {assignment to 'user_int \*' {aka 'int \*'} from incompatible pointer type 'struct s \*'} } */ ++ const_ui_ptr1 = &s; /* { dg-error {assignment to 'const user_int \*' {aka 'const int \*'} from incompatible pointer type 'struct s \*'} } */ ++ const_ui_ptr2 = &s; /* { dg-error {assignment to 'const user_int_copy \*' {aka 'const int \*'} from incompatible pointer type 'struct s \*'} } */ ++ volatile_ui_ptr1 = &s; /* { dg-error {assignment to 'volatile user_int \*' {aka 'volatile int \*'} from incompatible pointer type 'struct s \*'} } */ ++ volatile_ui_ptr2 = &s; /* { dg-error {assignment to 'volatile user_int_copy \*' {aka 'volatile int \*'} from incompatible pointer type 'struct s \*'} } */ ++ atomic_ui_ptr1 = &s; /* { dg-error {assignment to '_Atomic user_int \*' {aka '_Atomic int \*'} from incompatible pointer type 'struct s \*'} } */ ++ atomic_ui_ptr2 = &s; /* { dg-error {assignment to '_Atomic user_int_copy \*' {aka '_Atomic int \*'} from incompatible pointer type 'struct s \*'} } */ ++ ui_array_ptr1 = &s; /* { dg-error {assignment to 'user_int \(\*\)\[10\]' {aka 'int \(\*\)\[10\]'} from incompatible pointer type 'struct s \*'} } */ ++ ui_array_ptr2 = &s; /* { dg-error {assignment to 'user_int_copy \(\*\)\[10\]' {aka 'int \(\*\)\[10\]'} from incompatible pointer type 'struct s \*'} } */ ++ ui_fn_ptr1 = &s; /* { dg-error {assignment to 'user_int \(\*\)\(void\)' {aka 'int \(\*\)\(void\)'} from incompatible pointer type 'struct s \*'} } */ ++ ui_fn_ptr2 = &s; /* { dg-error 
{assignment to 'void \(\*\)\(user_int\)' {aka 'void \(\*\)\(int\)'} from incompatible pointer type 'struct s \*'} } */ ++ ui_fn_ptr3 = &s; /* { dg-error {assignment to 'void \(\*\)\(user_int, \.\.\.\)' {aka 'void \(\*\)\(int, \.\.\.\)'} from incompatible pointer type 'struct s \*'} } */ ++ ui_fn_ptr4 = &s; /* { dg-error {assignment to 'user_int_copy \(\*\)\(void\)' {aka 'int \(\*\)\(void\)'} from incompatible pointer type 'struct s \*'} } */ ++ ui_fn_ptr5 = &s; /* { dg-error {assignment to 'void \(\*\)\(user_int_copy\)' {aka 'void \(\*\)\(int\)'} from incompatible pointer type 'struct s \*'} } */ ++ ui_fn_ptr6 = &s; /* { dg-error {assignment to 'void \(\*\)\(user_int_copy, \.\.\.\)' {aka 'void \(\*\)\(int, \.\.\.\)'} from incompatible pointer type 'struct s \*'} } */ ++ unsafe_ui_fn_ptr1 = &s; /* { dg-error {assignment to 'user_int \(__attribute__\(\(transaction_unsafe\)\) \*\)\(void\)' {aka 'int \(__attribute__\(\(transaction_unsafe\)\) \*\)\(void\)'} from incompatible pointer type 'struct s \*'} } */ ++ unsafe_ui_fn_ptr2 = &s; /* { dg-error {assignment to 'user_int_copy \(__attribute__\(\(transaction_unsafe\)\) \*\)\(void\)' {aka 'int \(__attribute__\(\(transaction_unsafe\)\) \*\)\(void\)'} from incompatible pointer type 'struct s \*'} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/diag_aka_1.c b/gcc/testsuite/gcc.target/aarch64/diag_aka_1.c +index 59e24f48b..98dffead6 100644 +--- a/gcc/testsuite/gcc.target/aarch64/diag_aka_1.c ++++ b/gcc/testsuite/gcc.target/aarch64/diag_aka_1.c +@@ -8,7 +8,6 @@ void f (float x) + __Int8x8_t *ptr1 = &x; /* { dg-error {initialization of '__Int8x8_t \*' from incompatible pointer type 'float \*'} } */ + int8x8_t y2 = x; /* { dg-error {incompatible types when initializing type 'int8x8_t' using type 'float'} } */ + int8x8_t *ptr2 = &x; /* { dg-error {initialization of 'int8x8_t \*' from incompatible pointer type 'float \*'} } */ +- /* ??? For these it would be better to print an aka for 'int16x4_t'. 
*/ +- myvec y3 = x; /* { dg-error {incompatible types when initializing type 'myvec' using type 'float'} } */ +- myvec *ptr3 = &x; /* { dg-error {initialization of 'myvec \*' from incompatible pointer type 'float \*'} } */ ++ myvec y3 = x; /* { dg-error {incompatible types when initializing type 'myvec' {aka 'int16x4_t'} using type 'float'} } */ ++ myvec *ptr3 = &x; /* { dg-error {initialization of 'myvec \*' {aka 'int16x4_t \*'} from incompatible pointer type 'float \*'} } */ + } +diff --git a/gcc/testsuite/gcc.target/aarch64/pr88838.c b/gcc/testsuite/gcc.target/aarch64/pr88838.c +new file mode 100644 +index 000000000..d7db84758 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/pr88838.c +@@ -0,0 +1,11 @@ ++/* { dg-do compile } */ ++/* { dg-options "-S -O3 -march=armv8.2-a+sve" } */ ++ ++void ++f (int *restrict x, int *restrict y, int *restrict z, int n) ++{ ++ for (int i = 0; i < n; i += 1) ++ x[i] = y[i] + z[i]; ++} ++ ++/* { dg-final { scan-assembler-not "sxtw" } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/while_1.c b/gcc/testsuite/gcc.target/aarch64/sve/while_1.c +index 2655c4242..2cfb3f697 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/while_1.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/while_1.c +@@ -26,14 +26,14 @@ + TEST_ALL (ADD_LOOP) + + /* { dg-final { scan-assembler-not {\tuqdec} } } */ +-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b, xzr,} 2 } } */ +-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b, x[0-9]+,} 2 } } */ +-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h, xzr,} 2 } } */ +-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h, x[0-9]+,} 2 } } */ +-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s, xzr,} 3 } } */ +-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s, x[0-9]+,} 3 } } */ +-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d, xzr,} 3 } } */ +-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d, x[0-9]+,} 3 } } */ ++/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b, wzr,} 2 } } */ ++/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b, w[0-9]+,} 2 } } */ ++/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h, wzr,} 2 } } */ ++/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h, w[0-9]+,} 2 } } */ ++/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s, wzr,} 3 } } */ ++/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s, w[0-9]+,} 3 } } */ ++/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d, wzr,} 3 } } */ ++/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d, w[0-9]+,} 3 } } */ + /* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.b, p[0-7]/z, \[x0, x[0-9]+\]\n} 2 } } */ + /* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.b, p[0-7], \[x0, x[0-9]+\]\n} 2 } } */ + /* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.h, p[0-7]/z, \[x0, x[0-9]+, lsl 1\]\n} 2 } } */ +diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c +index d38b298aa..e51b95593 100644 +--- a/gcc/tree-vect-loop-manip.c ++++ b/gcc/tree-vect-loop-manip.c +@@ -423,6 +423,7 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo, + bool might_wrap_p) + { + tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo); ++ tree iv_type = LOOP_VINFO_MASK_IV_TYPE (loop_vinfo); + tree mask_type = rgm->mask_type; + unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter; + poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type); +@@ -453,11 +454,16 @@ vect_set_loop_masks_directly (struct loop 
*loop, loop_vec_info loop_vinfo, + tree index_before_incr, index_after_incr; + gimple_stmt_iterator incr_gsi; + bool insert_after; +- tree zero_index = build_int_cst (compare_type, 0); + standard_iv_increment_position (loop, &incr_gsi, &insert_after); +- create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi, ++ ++ tree zero_index = build_int_cst (iv_type, 0); ++ tree step = build_int_cst (iv_type, ++ LOOP_VINFO_VECT_FACTOR (loop_vinfo)); ++ /* Create IV of iv_type. */ ++ create_iv (zero_index, step, NULL_TREE, loop, &incr_gsi, + insert_after, &index_before_incr, &index_after_incr); + ++ zero_index = build_int_cst (compare_type, 0); + tree test_index, test_limit, first_limit; + gimple_stmt_iterator *test_gsi; + if (might_wrap_p) +@@ -537,6 +543,10 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo, + tree next_mask = NULL_TREE; + tree mask; + unsigned int i; ++ gimple_seq test_seq = NULL; ++ test_index = gimple_convert (&test_seq, compare_type, test_index); ++ gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT); ++ + FOR_EACH_VEC_ELT_REVERSE (rgm->masks, i, mask) + { + /* Previous masks will cover BIAS scalars. This mask covers the +@@ -645,12 +655,12 @@ vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo, + + tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo); + unsigned int compare_precision = TYPE_PRECISION (compare_type); +- unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo); + tree orig_niters = niters; + + /* Type of the initial value of NITERS. */ + tree ni_actual_type = TREE_TYPE (niters); + unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type); ++ tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo); + + /* Convert NITERS to the same size as the compare. */ + if (compare_precision > ni_actual_precision +@@ -669,33 +679,7 @@ vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo, + else + niters = gimple_convert (&preheader_seq, compare_type, niters); + +- /* Convert skip_niters to the right type. */ +- tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo); +- +- /* Now calculate the value that the induction variable must be able +- to hit in order to ensure that we end the loop with an all-false mask. +- This involves adding the maximum number of inactive trailing scalar +- iterations. */ +- widest_int iv_limit; +- bool known_max_iters = max_loop_iterations (loop, &iv_limit); +- if (known_max_iters) +- { +- if (niters_skip) +- { +- /* Add the maximum number of skipped iterations to the +- maximum iteration count. */ +- if (TREE_CODE (niters_skip) == INTEGER_CST) +- iv_limit += wi::to_widest (niters_skip); +- else +- iv_limit += max_vf - 1; +- } +- /* IV_LIMIT is the maximum number of latch iterations, which is also +- the maximum in-range IV value. Round this value down to the previous +- vector alignment boundary and then add an extra full iteration. */ +- poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); +- iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf; +- } +- ++ widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo); + /* Get the vectorization factor in tree form. */ + tree vf = build_int_cst (compare_type, + LOOP_VINFO_VECT_FACTOR (loop_vinfo)); +@@ -725,7 +709,7 @@ vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo, + /* See whether zero-based IV would ever generate all-false masks + before wrapping around. 
*/ + bool might_wrap_p +- = (!known_max_iters ++ = (iv_limit == -1 + || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter, + UNSIGNED) + > compare_precision)); +diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c +index 16d7d7788..e98bf2c15 100644 +--- a/gcc/tree-vect-loop.c ++++ b/gcc/tree-vect-loop.c +@@ -1038,6 +1038,8 @@ vect_verify_full_masking (loop_vec_info loop_vinfo) + { + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + unsigned int min_ni_width; ++ unsigned int max_nscalars_per_iter ++ = vect_get_max_nscalars_per_iter (loop_vinfo); + + /* Use a normal loop if there are no statements that need masking. + This only happens in rare degenerate cases: it means that the loop +@@ -1056,7 +1058,7 @@ vect_verify_full_masking (loop_vec_info loop_vinfo) + max_ni = wi::smin (max_ni, max_back_edges + 1); + + /* Account for rgroup masks, in which each bit is replicated N times. */ +- max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo); ++ max_ni *= max_nscalars_per_iter; + + /* Work out how many bits we need to represent the limit. */ + min_ni_width = wi::min_precision (max_ni, UNSIGNED); +@@ -1064,6 +1066,14 @@ vect_verify_full_masking (loop_vec_info loop_vinfo) + /* Find a scalar mode for which WHILE_ULT is supported. */ + opt_scalar_int_mode cmp_mode_iter; + tree cmp_type = NULL_TREE; ++ tree iv_type = NULL_TREE; ++ widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo); ++ widest_int iv_precision = UINT_MAX; ++ ++ if (iv_limit != -1) ++ iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter, ++ UNSIGNED); ++ + FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT) + { + unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ()); +@@ -1075,10 +1085,32 @@ vect_verify_full_masking (loop_vec_info loop_vinfo) + && can_produce_all_loop_masks_p (loop_vinfo, this_type)) + { + /* Although we could stop as soon as we find a valid mode, +- it's often better to continue until we hit Pmode, since the +- operands to the WHILE are more likely to be reusable in +- address calculations. */ +- cmp_type = this_type; ++ there are at least two reasons why that's not always the ++ best choice: ++ ++ - An IV that's Pmode or wider is more likely to be reusable ++ in address calculations than an IV that's narrower than ++ Pmode. ++ ++ - Doing the comparison in IV_PRECISION or wider allows ++ a natural 0-based IV, whereas using a narrower comparison ++ type requires mitigations against wrap-around. ++ ++ Conversely, if the IV limit is variable, doing the comparison ++ in a wider type than the original type can introduce ++ unnecessary extensions, so picking the widest valid mode ++ is not always a good choice either. ++ ++ Here we prefer the first IV type that's Pmode or wider, ++ and the first comparison type that's IV_PRECISION or wider. ++ (The comparison type must be no wider than the IV type, ++ to avoid extensions in the vector loop.) ++ ++ ??? We might want to try continuing beyond Pmode for ILP32 ++ targets if CMP_BITS < IV_PRECISION. 
*/ ++ iv_type = this_type; ++ if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type)) ++ cmp_type = this_type; + if (cmp_bits >= GET_MODE_BITSIZE (Pmode)) + break; + } +@@ -1089,6 +1121,7 @@ vect_verify_full_masking (loop_vec_info loop_vinfo) + return false; + + LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type; ++ LOOP_VINFO_MASK_IV_TYPE (loop_vinfo) = iv_type; + return true; + } + +@@ -9080,3 +9113,45 @@ optimize_mask_stores (struct loop *loop) + add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION); + } + } ++ ++/* Decide whether it is possible to use a zero-based induction variable ++ when vectorizing LOOP_VINFO with a fully-masked loop. If it is, ++ return the value that the induction variable must be able to hold ++ in order to ensure that the loop ends with an all-false mask. ++ Return -1 otherwise. */ ++widest_int ++vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo) ++{ ++ tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo); ++ struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); ++ unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo); ++ ++ /* Calculate the value that the induction variable must be able ++ to hit in order to ensure that we end the loop with an all-false mask. ++ This involves adding the maximum number of inactive trailing scalar ++ iterations. */ ++ widest_int iv_limit = -1; ++ if (max_loop_iterations (loop, &iv_limit)) ++ { ++ if (niters_skip) ++ { ++ /* Add the maximum number of skipped iterations to the ++ maximum iteration count. */ ++ if (TREE_CODE (niters_skip) == INTEGER_CST) ++ iv_limit += wi::to_widest (niters_skip); ++ else ++ iv_limit += max_vf - 1; ++ } ++ else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)) ++ /* Make a conservatively-correct assumption. */ ++ iv_limit += max_vf - 1; ++ ++ /* IV_LIMIT is the maximum number of latch iterations, which is also ++ the maximum in-range IV value. Round this value down to the previous ++ vector alignment boundary and then add an extra full iteration. */ ++ poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); ++ iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf; ++ } ++ return iv_limit; ++} ++ +diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h +index 34ba49f4d..fae4df52d 100644 +--- a/gcc/tree-vectorizer.h ++++ b/gcc/tree-vectorizer.h +@@ -529,6 +529,10 @@ typedef struct _loop_vec_info : public vec_info { + is false and vectorized loop otherwise. */ + tree simd_if_cond; + ++ /* Type of the IV to use in the WHILE_ULT call for fully-masked ++ loops. */ ++ tree iv_type; ++ + /* Unknown DRs according to which loop was peeled. */ + struct dr_vec_info *unaligned_dr; + +@@ -675,6 +679,7 @@ typedef struct _loop_vec_info : public vec_info { + #define LOOP_VINFO_MASKS(L) (L)->masks + #define LOOP_VINFO_MASK_SKIP_NITERS(L) (L)->mask_skip_niters + #define LOOP_VINFO_MASK_COMPARE_TYPE(L) (L)->mask_compare_type ++#define LOOP_VINFO_MASK_IV_TYPE(L) (L)->iv_type + #define LOOP_VINFO_PTR_MASK(L) (L)->ptr_mask + #define LOOP_VINFO_LOOP_NEST(L) (L)->shared->loop_nest + #define LOOP_VINFO_DATAREFS(L) (L)->shared->datarefs +@@ -1720,6 +1725,7 @@ extern tree vect_create_addr_base_for_vector_ref (stmt_vec_info, gimple_seq *, + /* In tree-vect-loop.c. */ + /* Used in tree-vect-loop-manip.c */ + extern void determine_peel_for_niter (loop_vec_info); ++extern widest_int vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo); + /* Used in gimple-loop-interchange.c and tree-parloops.c. 
*/ + extern bool check_reduction_path (dump_user_location_t, loop_p, gphi *, tree, + enum tree_code); diff --git a/aarch64-Fix-mismatched-SVE-predicate-modes.patch b/aarch64-Fix-mismatched-SVE-predicate-modes.patch new file mode 100644 index 0000000..8bb66d9 --- /dev/null +++ b/aarch64-Fix-mismatched-SVE-predicate-modes.patch @@ -0,0 +1,34 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + +0001-aarch64-Fix-mismatched-SVE-predicate-modes.patch +26bebf576ddcdcfb596f07e8c2896f17c48516e7 + +diff -urpN a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c +--- a/gcc/config/aarch64/aarch64.c 2020-12-14 00:57:20.128000000 -0500 ++++ b/gcc/config/aarch64/aarch64.c 2020-12-14 01:00:15.080000000 -0500 +@@ -4328,6 +4328,7 @@ aarch64_expand_sve_const_pred_eor (rtx t + /* EOR the result with an ELT_SIZE PTRUE. */ + rtx mask = aarch64_ptrue_all (elt_size); + mask = force_reg (VNx16BImode, mask); ++ inv = gen_lowpart (VNx16BImode, inv); + target = aarch64_target_reg (target, VNx16BImode); + emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask)); + return target; +diff -urpN a/gcc/testsuite/gcc.dg/vect/pr94606.c b/gcc/testsuite/gcc.dg/vect/pr94606.c +--- a/gcc/testsuite/gcc.dg/vect/pr94606.c 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/testsuite/gcc.dg/vect/pr94606.c 2020-12-14 01:00:15.080000000 -0500 +@@ -0,0 +1,13 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-march=armv8.2-a+sve -msve-vector-bits=256" { target aarch64*-*-* } } */ ++ ++const short mask[] = { 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 1, 1, 1, 1, 1 }; ++ ++int ++foo (short *restrict x, short *restrict y) ++{ ++ for (int i = 0; i < 16; ++i) ++ if (mask[i]) ++ x[i] += y[i]; ++} diff --git a/aarch64-fix-sve-acle-error.patch b/aarch64-fix-sve-acle-error.patch new file mode 100644 index 0000000..237093a --- /dev/null +++ b/aarch64-fix-sve-acle-error.patch @@ -0,0 +1,2128 @@ +This backport contains 4 patchs from gcc main stream tree. +The commit id of these patchs list as following in the order of time. 
+ +0001-AArch64-Fix-build-for-non-default-languages.patch +6ff0cdebb1bc281ba2374f3ecdbe358c4fa74093 + +0002-C-Opt-out-of-GNU-vector-extensions-for-built-in-SVE-.patch +f486280c53be53136f0bb9b578f43dc6c9c5acea + +0003-C-Add-a-target-hook-that-allows-targets-to-verify-ty.patch +65ef05d0b7fb429c5760189e638c441dc3da33f4 + +0004-AArch64-Run-general-SVE-ACLE-tests-for-C.patch +6da4c454acee4dac53c4c549fa1caeb73fe1f82b + +diff --git a/gcc/c-family/c-common.c b/gcc/c-family/c-common.c +index 2c10743b9..50423ec0f 100644 +--- a/gcc/c-family/c-common.c ++++ b/gcc/c-family/c-common.c +@@ -1012,7 +1012,8 @@ c_build_vec_perm_expr (location_t loc, tree v0, tree v1, tree mask, + || mask == error_mark_node) + return error_mark_node; + +- if (!VECTOR_INTEGER_TYPE_P (TREE_TYPE (mask))) ++ if (!gnu_vector_type_p (TREE_TYPE (mask)) ++ || !VECTOR_INTEGER_TYPE_P (TREE_TYPE (mask))) + { + if (complain) + error_at (loc, "%<__builtin_shuffle%> last argument must " +@@ -1020,8 +1021,8 @@ c_build_vec_perm_expr (location_t loc, tree v0, tree v1, tree mask, + return error_mark_node; + } + +- if (!VECTOR_TYPE_P (TREE_TYPE (v0)) +- || !VECTOR_TYPE_P (TREE_TYPE (v1))) ++ if (!gnu_vector_type_p (TREE_TYPE (v0)) ++ || !gnu_vector_type_p (TREE_TYPE (v1))) + { + if (complain) + error_at (loc, "%<__builtin_shuffle%> arguments must be vectors"); +@@ -1096,8 +1097,9 @@ c_build_vec_convert (location_t loc1, tree expr, location_t loc2, tree type, + if (error_operand_p (expr)) + return error_mark_node; + +- if (!VECTOR_INTEGER_TYPE_P (TREE_TYPE (expr)) +- && !VECTOR_FLOAT_TYPE_P (TREE_TYPE (expr))) ++ if (!gnu_vector_type_p (TREE_TYPE (expr)) ++ || (!VECTOR_INTEGER_TYPE_P (TREE_TYPE (expr)) ++ && !VECTOR_FLOAT_TYPE_P (TREE_TYPE (expr)))) + { + if (complain) + error_at (loc1, "%<__builtin_convertvector%> first argument must " +@@ -1105,7 +1107,8 @@ c_build_vec_convert (location_t loc1, tree expr, location_t loc2, tree type, + return error_mark_node; + } + +- if (!VECTOR_INTEGER_TYPE_P (type) && !VECTOR_FLOAT_TYPE_P (type)) ++ if (!gnu_vector_type_p (type) ++ || (!VECTOR_INTEGER_TYPE_P (type) && !VECTOR_FLOAT_TYPE_P (type))) + { + if (complain) + error_at (loc2, "%<__builtin_convertvector%> second argument must " +@@ -3128,6 +3131,9 @@ pointer_int_sum (location_t loc, enum tree_code resultcode, + return error_mark_node; + size_exp = integer_one_node; + } ++ else if (!verify_type_context (loc, TCTX_POINTER_ARITH, ++ TREE_TYPE (result_type))) ++ size_exp = integer_one_node; + else + size_exp = size_in_bytes_loc (loc, TREE_TYPE (result_type)); + +@@ -3673,6 +3679,13 @@ c_sizeof_or_alignof_type (location_t loc, + "incomplete element type", op_name, type); + return error_mark_node; + } ++ else if (!verify_type_context (loc, is_sizeof ? 
TCTX_SIZEOF : TCTX_ALIGNOF, ++ type, !complain)) ++ { ++ if (!complain) ++ return error_mark_node; ++ value = size_one_node; ++ } + else + { + if (is_sizeof) +@@ -3705,7 +3718,10 @@ c_alignof_expr (location_t loc, tree expr) + { + tree t; + +- if (VAR_OR_FUNCTION_DECL_P (expr)) ++ if (!verify_type_context (loc, TCTX_ALIGNOF, TREE_TYPE (expr))) ++ t = size_one_node; ++ ++ else if (VAR_OR_FUNCTION_DECL_P (expr)) + t = size_int (DECL_ALIGN_UNIT (expr)); + + else if (TREE_CODE (expr) == COMPONENT_REF +@@ -7994,7 +8010,7 @@ convert_vector_to_array_for_subscript (location_t loc, + tree *vecp, tree index) + { + bool ret = false; +- if (VECTOR_TYPE_P (TREE_TYPE (*vecp))) ++ if (gnu_vector_type_p (TREE_TYPE (*vecp))) + { + tree type = TREE_TYPE (*vecp); + +@@ -8030,7 +8046,7 @@ scalar_to_vector (location_t loc, enum tree_code code, tree op0, tree op1, + bool integer_only_op = false; + enum stv_conv ret = stv_firstarg; + +- gcc_assert (VECTOR_TYPE_P (type0) || VECTOR_TYPE_P (type1)); ++ gcc_assert (gnu_vector_type_p (type0) || gnu_vector_type_p (type1)); + switch (code) + { + /* Most GENERIC binary expressions require homogeneous arguments. +@@ -8081,7 +8097,7 @@ scalar_to_vector (location_t loc, enum tree_code code, tree op0, tree op1, + case LT_EXPR: + case GT_EXPR: + /* What about UNLT_EXPR? */ +- if (VECTOR_TYPE_P (type0)) ++ if (gnu_vector_type_p (type0)) + { + ret = stv_secondarg; + std::swap (type0, type1); +diff --git a/gcc/c-family/c-common.h b/gcc/c-family/c-common.h +index 73ce7c5df..2a9008af4 100644 +--- a/gcc/c-family/c-common.h ++++ b/gcc/c-family/c-common.h +@@ -756,6 +756,16 @@ extern bool done_lexing; + #define C_TYPE_OBJECT_OR_INCOMPLETE_P(type) \ + (!C_TYPE_FUNCTION_P (type)) + ++/* Return true if TYPE is a vector type that should be subject to the GNU ++ vector extensions (as opposed to a vector type that is used only for ++ the purposes of defining target-specific built-in functions). */ ++ ++inline bool ++gnu_vector_type_p (const_tree type) ++{ ++ return TREE_CODE (type) == VECTOR_TYPE && !TYPE_INDIVISIBLE_P (type); ++} ++ + struct visibility_flags + { + unsigned inpragma : 1; /* True when in #pragma GCC visibility. */ +diff --git a/gcc/c/c-convert.c b/gcc/c/c-convert.c +index f0f846013..21b127d0d 100644 +--- a/gcc/c/c-convert.c ++++ b/gcc/c/c-convert.c +@@ -147,8 +147,20 @@ convert (tree type, tree expr) + goto maybe_fold; + + case VECTOR_TYPE: +- ret = convert_to_vector (type, e); +- goto maybe_fold; ++ if (gnu_vector_type_p (type) ++ || gnu_vector_type_p (TREE_TYPE (e)) ++ /* Allow conversions between compatible non-GNU vector types ++ when -flax-vector-conversions is passed. The whole purpose ++ of the option is to bend the normal type rules and accept ++ nonconforming code. */ ++ || (flag_lax_vector_conversions ++ && VECTOR_TYPE_P (TREE_TYPE (e)) ++ && vector_types_convertible_p (type, TREE_TYPE (e), false))) ++ { ++ ret = convert_to_vector (type, e); ++ goto maybe_fold; ++ } ++ break; + + case RECORD_TYPE: + case UNION_TYPE: +diff --git a/gcc/c/c-decl.c b/gcc/c/c-decl.c +index 288dbe9d9..bf88d3c7d 100644 +--- a/gcc/c/c-decl.c ++++ b/gcc/c/c-decl.c +@@ -4927,7 +4927,7 @@ start_decl (struct c_declarator *declarator, struct c_declspecs *declspecs, + { + /* A complete type is ok if size is fixed. 
*/ + +- if (TREE_CODE (TYPE_SIZE (TREE_TYPE (decl))) != INTEGER_CST ++ if (!poly_int_tree_p (TYPE_SIZE (TREE_TYPE (decl))) + || C_DECL_VARIABLE_SIZE (decl)) + { + error ("variable-sized object may not be initialized"); +@@ -5210,6 +5210,15 @@ finish_decl (tree decl, location_t init_loc, tree init, + + complete_flexible_array_elts (DECL_INITIAL (decl)); + ++ if (is_global_var (decl)) ++ { ++ type_context_kind context = (DECL_THREAD_LOCAL_P (decl) ++ ? TCTX_THREAD_STORAGE ++ : TCTX_STATIC_STORAGE); ++ if (!verify_type_context (input_location, context, TREE_TYPE (decl))) ++ TREE_TYPE (decl) = error_mark_node; ++ } ++ + if (DECL_SIZE (decl) == NULL_TREE && TREE_TYPE (decl) != error_mark_node + && COMPLETE_TYPE_P (TREE_TYPE (decl))) + layout_decl (decl, 0); +@@ -5239,7 +5248,9 @@ finish_decl (tree decl, location_t init_loc, tree init, + && TREE_STATIC (decl)) + incomplete_record_decls.safe_push (decl); + +- if (is_global_var (decl) && DECL_SIZE (decl) != NULL_TREE) ++ if (is_global_var (decl) ++ && DECL_SIZE (decl) != NULL_TREE ++ && TREE_TYPE (decl) != error_mark_node) + { + if (TREE_CODE (DECL_SIZE (decl)) == INTEGER_CST) + constant_expression_warning (DECL_SIZE (decl)); +@@ -5559,6 +5570,10 @@ build_compound_literal (location_t loc, tree type, tree init, bool non_const, + return error_mark_node; + } + ++ if (TREE_STATIC (decl) ++ && !verify_type_context (loc, TCTX_STATIC_STORAGE, type)) ++ return error_mark_node; ++ + stmt = build_stmt (DECL_SOURCE_LOCATION (decl), DECL_EXPR, decl); + complit = build1 (COMPOUND_LITERAL_EXPR, type, stmt); + TREE_SIDE_EFFECTS (complit) = 1; +@@ -6227,6 +6242,12 @@ grokdeclarator (const struct c_declarator *declarator, + if (type == error_mark_node) + continue; + ++ if (!verify_type_context (loc, TCTX_ARRAY_ELEMENT, type)) ++ { ++ type = error_mark_node; ++ continue; ++ } ++ + /* If size was specified, set ITYPE to a range-type for + that size. Otherwise, ITYPE remains null. finish_decl + may figure it out from an initial value. */ +@@ -7076,6 +7097,10 @@ grokdeclarator (const struct c_declarator *declarator, + if (orig_qual_indirect == 0) + orig_qual_type = NULL_TREE; + } ++ if (type != error_mark_node ++ && !verify_type_context (loc, TCTX_FIELD, type)) ++ type = error_mark_node; ++ + type = c_build_qualified_type (type, type_quals, orig_qual_type, + orig_qual_indirect); + decl = build_decl (declarator->id_loc, +diff --git a/gcc/c/c-typeck.c b/gcc/c/c-typeck.c +index 87f4178ec..f456a66fb 100644 +--- a/gcc/c/c-typeck.c ++++ b/gcc/c/c-typeck.c +@@ -2609,7 +2609,7 @@ build_array_ref (location_t loc, tree array, tree index) + if (TREE_CODE (TREE_TYPE (array)) != ARRAY_TYPE + && TREE_CODE (TREE_TYPE (array)) != POINTER_TYPE + /* Allow vector[index] but not index[vector]. */ +- && !VECTOR_TYPE_P (TREE_TYPE (array))) ++ && !gnu_vector_type_p (TREE_TYPE (array))) + { + if (TREE_CODE (TREE_TYPE (index)) != ARRAY_TYPE + && TREE_CODE (TREE_TYPE (index)) != POINTER_TYPE) +@@ -3891,6 +3891,7 @@ pointer_diff (location_t loc, tree op0, tree op1, tree *instrument_expr) + addr_space_t as0 = TYPE_ADDR_SPACE (TREE_TYPE (TREE_TYPE (op0))); + addr_space_t as1 = TYPE_ADDR_SPACE (TREE_TYPE (TREE_TYPE (op1))); + tree target_type = TREE_TYPE (TREE_TYPE (op0)); ++ tree orig_op0 = op0; + tree orig_op1 = op1; + + /* If the operands point into different address spaces, we need to +@@ -3961,6 +3962,10 @@ pointer_diff (location_t loc, tree op0, tree op1, tree *instrument_expr) + /* This generates an error if op1 is pointer to incomplete type. 
*/ + if (!COMPLETE_OR_VOID_TYPE_P (TREE_TYPE (TREE_TYPE (orig_op1)))) + error_at (loc, "arithmetic on pointer to an incomplete type"); ++ else if (verify_type_context (loc, TCTX_POINTER_ARITH, ++ TREE_TYPE (TREE_TYPE (orig_op0)))) ++ verify_type_context (loc, TCTX_POINTER_ARITH, ++ TREE_TYPE (TREE_TYPE (orig_op1))); + + op1 = c_size_in_bytes (target_type); + +@@ -4359,7 +4364,7 @@ build_unary_op (location_t location, enum tree_code code, tree xarg, + associativity, but won't generate any code. */ + if (!(typecode == INTEGER_TYPE || typecode == REAL_TYPE + || typecode == FIXED_POINT_TYPE || typecode == COMPLEX_TYPE +- || typecode == VECTOR_TYPE)) ++ || gnu_vector_type_p (TREE_TYPE (arg)))) + { + error_at (location, "wrong type argument to unary plus"); + return error_mark_node; +@@ -4372,7 +4377,7 @@ build_unary_op (location_t location, enum tree_code code, tree xarg, + case NEGATE_EXPR: + if (!(typecode == INTEGER_TYPE || typecode == REAL_TYPE + || typecode == FIXED_POINT_TYPE || typecode == COMPLEX_TYPE +- || typecode == VECTOR_TYPE)) ++ || gnu_vector_type_p (TREE_TYPE (arg)))) + { + error_at (location, "wrong type argument to unary minus"); + return error_mark_node; +@@ -4384,7 +4389,7 @@ build_unary_op (location_t location, enum tree_code code, tree xarg, + case BIT_NOT_EXPR: + /* ~ works on integer types and non float vectors. */ + if (typecode == INTEGER_TYPE +- || (typecode == VECTOR_TYPE ++ || (gnu_vector_type_p (TREE_TYPE (arg)) + && !VECTOR_FLOAT_TYPE_P (TREE_TYPE (arg)))) + { + tree e = arg; +@@ -4570,7 +4575,8 @@ build_unary_op (location_t location, enum tree_code code, tree xarg, + + if (typecode != POINTER_TYPE && typecode != FIXED_POINT_TYPE + && typecode != INTEGER_TYPE && typecode != REAL_TYPE +- && typecode != COMPLEX_TYPE && typecode != VECTOR_TYPE) ++ && typecode != COMPLEX_TYPE ++ && !gnu_vector_type_p (TREE_TYPE (arg))) + { + if (code == PREINCREMENT_EXPR || code == POSTINCREMENT_EXPR) + error_at (location, "wrong type argument to increment"); +@@ -4612,6 +4618,9 @@ build_unary_op (location_t location, enum tree_code code, tree xarg, + pedwarn (location, OPT_Wpointer_arith, + "wrong type argument to decrement"); + } ++ else ++ verify_type_context (location, TCTX_POINTER_ARITH, ++ TREE_TYPE (argtype)); + + inc = c_size_in_bytes (TREE_TYPE (argtype)); + inc = convert_to_ptrofftype_loc (location, inc); +@@ -7854,7 +7863,7 @@ digest_init (location_t init_loc, tree type, tree init, tree origtype, + TYPE_MAIN_VARIANT (type)) + || (code == ARRAY_TYPE + && comptypes (TREE_TYPE (inside_init), type)) +- || (code == VECTOR_TYPE ++ || (gnu_vector_type_p (type) + && comptypes (TREE_TYPE (inside_init), type)) + || (code == POINTER_TYPE + && TREE_CODE (TREE_TYPE (inside_init)) == ARRAY_TYPE +@@ -8352,7 +8361,7 @@ really_start_incremental_init (tree type) + + constructor_unfilled_index = constructor_index; + } +- else if (VECTOR_TYPE_P (constructor_type)) ++ else if (gnu_vector_type_p (constructor_type)) + { + /* Vectors are like simple fixed-size arrays. */ + constructor_max_index = +@@ -8526,7 +8535,7 @@ push_init_level (location_t loc, int implicit, + constructor_unfilled_fields = constructor_fields; + constructor_bit_index = bitsize_zero_node; + } +- else if (VECTOR_TYPE_P (constructor_type)) ++ else if (gnu_vector_type_p (constructor_type)) + { + /* Vectors are like simple fixed-size arrays. 
*/ + constructor_max_index = +@@ -8715,7 +8724,7 @@ pop_init_level (location_t loc, int implicit, + ; + else if (!RECORD_OR_UNION_TYPE_P (constructor_type) + && TREE_CODE (constructor_type) != ARRAY_TYPE +- && !VECTOR_TYPE_P (constructor_type)) ++ && !gnu_vector_type_p (constructor_type)) + { + /* A nonincremental scalar initializer--just return + the element, after verifying there is just one. */ +@@ -9941,7 +9950,7 @@ process_init_element (location_t loc, struct c_expr value, bool implicit, + last_init_list_comma), + true, braced_init_obstack); + else if ((TREE_CODE (constructor_type) == ARRAY_TYPE +- || VECTOR_TYPE_P (constructor_type)) ++ || gnu_vector_type_p (constructor_type)) + && constructor_max_index + && tree_int_cst_lt (constructor_max_index, + constructor_index)) +@@ -10042,7 +10051,8 @@ process_init_element (location_t loc, struct c_expr value, bool implicit, + && value.value != error_mark_node + && TYPE_MAIN_VARIANT (TREE_TYPE (value.value)) != fieldtype + && (fieldcode == RECORD_TYPE || fieldcode == ARRAY_TYPE +- || fieldcode == UNION_TYPE || fieldcode == VECTOR_TYPE)) ++ || fieldcode == UNION_TYPE ++ || gnu_vector_type_p (fieldtype))) + { + push_init_level (loc, 1, braced_init_obstack); + continue; +@@ -10133,7 +10143,8 @@ process_init_element (location_t loc, struct c_expr value, bool implicit, + && value.value != error_mark_node + && TYPE_MAIN_VARIANT (TREE_TYPE (value.value)) != fieldtype + && (fieldcode == RECORD_TYPE || fieldcode == ARRAY_TYPE +- || fieldcode == UNION_TYPE || fieldcode == VECTOR_TYPE)) ++ || fieldcode == UNION_TYPE ++ || gnu_vector_type_p (fieldtype))) + { + push_init_level (loc, 1, braced_init_obstack); + continue; +@@ -10175,7 +10186,8 @@ process_init_element (location_t loc, struct c_expr value, bool implicit, + && value.value != error_mark_node + && TYPE_MAIN_VARIANT (TREE_TYPE (value.value)) != elttype + && (eltcode == RECORD_TYPE || eltcode == ARRAY_TYPE +- || eltcode == UNION_TYPE || eltcode == VECTOR_TYPE)) ++ || eltcode == UNION_TYPE ++ || gnu_vector_type_p (elttype))) + { + push_init_level (loc, 1, braced_init_obstack); + continue; +@@ -10211,7 +10223,7 @@ process_init_element (location_t loc, struct c_expr value, bool implicit, + constructor_unfilled_index. */ + constructor_unfilled_index = constructor_index; + } +- else if (VECTOR_TYPE_P (constructor_type)) ++ else if (gnu_vector_type_p (constructor_type)) + { + tree elttype = TYPE_MAIN_VARIANT (TREE_TYPE (constructor_type)); + +@@ -11555,7 +11567,8 @@ build_binary_op (location_t location, enum tree_code code, + + /* In case when one of the operands of the binary operation is + a vector and another is a scalar -- convert scalar to vector. 
*/ +- if ((code0 == VECTOR_TYPE) != (code1 == VECTOR_TYPE)) ++ if ((gnu_vector_type_p (type0) && code1 != VECTOR_TYPE) ++ || (gnu_vector_type_p (type1) && code0 != VECTOR_TYPE)) + { + enum stv_conv convert_flag = scalar_to_vector (location, code, op0, op1, + true); +@@ -11650,10 +11663,12 @@ build_binary_op (location_t location, enum tree_code code, + + if ((code0 == INTEGER_TYPE || code0 == REAL_TYPE + || code0 == FIXED_POINT_TYPE +- || code0 == COMPLEX_TYPE || code0 == VECTOR_TYPE) ++ || code0 == COMPLEX_TYPE ++ || gnu_vector_type_p (type0)) + && (code1 == INTEGER_TYPE || code1 == REAL_TYPE + || code1 == FIXED_POINT_TYPE +- || code1 == COMPLEX_TYPE || code1 == VECTOR_TYPE)) ++ || code1 == COMPLEX_TYPE ++ || gnu_vector_type_p (type1))) + { + enum tree_code tcode0 = code0, tcode1 = code1; + +@@ -11684,8 +11699,8 @@ build_binary_op (location_t location, enum tree_code code, + if (code0 == INTEGER_TYPE && code1 == INTEGER_TYPE) + shorten = -1; + /* Allow vector types which are not floating point types. */ +- else if (code0 == VECTOR_TYPE +- && code1 == VECTOR_TYPE ++ else if (gnu_vector_type_p (type0) ++ && gnu_vector_type_p (type1) + && !VECTOR_FLOAT_TYPE_P (type0) + && !VECTOR_FLOAT_TYPE_P (type1)) + common = 1; +@@ -11696,7 +11711,8 @@ build_binary_op (location_t location, enum tree_code code, + doing_div_or_mod = true; + warn_for_div_by_zero (location, op1); + +- if (code0 == VECTOR_TYPE && code1 == VECTOR_TYPE ++ if (gnu_vector_type_p (type0) ++ && gnu_vector_type_p (type1) + && TREE_CODE (TREE_TYPE (type0)) == INTEGER_TYPE + && TREE_CODE (TREE_TYPE (type1)) == INTEGER_TYPE) + common = 1; +@@ -11775,7 +11791,8 @@ build_binary_op (location_t location, enum tree_code code, + Also set SHORT_SHIFT if shifting rightward. */ + + case RSHIFT_EXPR: +- if (code0 == VECTOR_TYPE && code1 == VECTOR_TYPE ++ if (gnu_vector_type_p (type0) ++ && gnu_vector_type_p (type1) + && TREE_CODE (TREE_TYPE (type0)) == INTEGER_TYPE + && TREE_CODE (TREE_TYPE (type1)) == INTEGER_TYPE + && known_eq (TYPE_VECTOR_SUBPARTS (type0), +@@ -11785,7 +11802,7 @@ build_binary_op (location_t location, enum tree_code code, + converted = 1; + } + else if ((code0 == INTEGER_TYPE || code0 == FIXED_POINT_TYPE +- || (code0 == VECTOR_TYPE ++ || (gnu_vector_type_p (type0) + && TREE_CODE (TREE_TYPE (type0)) == INTEGER_TYPE)) + && code1 == INTEGER_TYPE) + { +@@ -11834,7 +11851,8 @@ build_binary_op (location_t location, enum tree_code code, + break; + + case LSHIFT_EXPR: +- if (code0 == VECTOR_TYPE && code1 == VECTOR_TYPE ++ if (gnu_vector_type_p (type0) ++ && gnu_vector_type_p (type1) + && TREE_CODE (TREE_TYPE (type0)) == INTEGER_TYPE + && TREE_CODE (TREE_TYPE (type1)) == INTEGER_TYPE + && known_eq (TYPE_VECTOR_SUBPARTS (type0), +@@ -11844,7 +11862,7 @@ build_binary_op (location_t location, enum tree_code code, + converted = 1; + } + else if ((code0 == INTEGER_TYPE || code0 == FIXED_POINT_TYPE +- || (code0 == VECTOR_TYPE ++ || (gnu_vector_type_p (type0) + && TREE_CODE (TREE_TYPE (type0)) == INTEGER_TYPE)) + && code1 == INTEGER_TYPE) + { +@@ -11903,7 +11921,7 @@ build_binary_op (location_t location, enum tree_code code, + + case EQ_EXPR: + case NE_EXPR: +- if (code0 == VECTOR_TYPE && code1 == VECTOR_TYPE) ++ if (gnu_vector_type_p (type0) && gnu_vector_type_p (type1)) + { + tree intt; + if (!vector_types_compatible_elements_p (type0, type1)) +@@ -12071,7 +12089,7 @@ build_binary_op (location_t location, enum tree_code code, + case GE_EXPR: + case LT_EXPR: + case GT_EXPR: +- if (code0 == VECTOR_TYPE && code1 == VECTOR_TYPE) ++ if 
(gnu_vector_type_p (type0) && gnu_vector_type_p (type1)) + { + tree intt; + if (!vector_types_compatible_elements_p (type0, type1)) +@@ -12218,7 +12236,8 @@ build_binary_op (location_t location, enum tree_code code, + if (code0 == ERROR_MARK || code1 == ERROR_MARK) + return error_mark_node; + +- if (code0 == VECTOR_TYPE && code1 == VECTOR_TYPE ++ if (gnu_vector_type_p (type0) ++ && gnu_vector_type_p (type1) + && (!tree_int_cst_equal (TYPE_SIZE (type0), TYPE_SIZE (type1)) + || !vector_types_compatible_elements_p (type0, type1))) + { +@@ -12233,10 +12252,12 @@ build_binary_op (location_t location, enum tree_code code, + } + + if ((code0 == INTEGER_TYPE || code0 == REAL_TYPE || code0 == COMPLEX_TYPE +- || code0 == FIXED_POINT_TYPE || code0 == VECTOR_TYPE) ++ || code0 == FIXED_POINT_TYPE ++ || gnu_vector_type_p (type0)) + && + (code1 == INTEGER_TYPE || code1 == REAL_TYPE || code1 == COMPLEX_TYPE +- || code1 == FIXED_POINT_TYPE || code1 == VECTOR_TYPE)) ++ || code1 == FIXED_POINT_TYPE ++ || gnu_vector_type_p (type1))) + { + bool first_complex = (code0 == COMPLEX_TYPE); + bool second_complex = (code1 == COMPLEX_TYPE); +diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h +index a9123c858..51356da37 100644 +--- a/gcc/config/aarch64/aarch64-protos.h ++++ b/gcc/config/aarch64/aarch64-protos.h +@@ -717,6 +717,9 @@ namespace aarch64_sve { + tree, unsigned int, tree *); + gimple *gimple_fold_builtin (unsigned int, gimple_stmt_iterator *, gcall *); + rtx expand_builtin (unsigned int, tree, rtx); ++#ifdef GCC_TARGET_H ++ bool verify_type_context (location_t, type_context_kind, const_tree, bool); ++#endif + } + + extern void aarch64_split_combinev16qi (rtx operands[3]); +diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc b/gcc/config/aarch64/aarch64-sve-builtins.cc +index f830d9294..10595a5ab 100644 +--- a/gcc/config/aarch64/aarch64-sve-builtins.cc ++++ b/gcc/config/aarch64/aarch64-sve-builtins.cc +@@ -3248,8 +3248,10 @@ register_builtin_types () + BITS_PER_SVE_VECTOR)); + } + vectype = build_distinct_type_copy (vectype); ++ gcc_assert (vectype == TYPE_MAIN_VARIANT (vectype)); + SET_TYPE_STRUCTURAL_EQUALITY (vectype); + TYPE_ARTIFICIAL (vectype) = 1; ++ TYPE_INDIVISIBLE_P (vectype) = 1; + abi_vector_types[i] = vectype; + lang_hooks.types.register_builtin_type (vectype, + vector_types[i].abi_name); +@@ -3490,8 +3492,7 @@ bool + svbool_type_p (const_tree type) + { + tree abi_type = abi_vector_types[VECTOR_TYPE_svbool_t]; +- return (type != error_mark_node +- && TYPE_MAIN_VARIANT (type) == TYPE_MAIN_VARIANT (abi_type)); ++ return type != error_mark_node && TYPE_MAIN_VARIANT (type) == abi_type; + } + + /* If TYPE is a built-in type defined by the SVE ABI, return the mangled name, +@@ -3546,6 +3547,55 @@ builtin_type_p (const_tree type) + return svbool_type_p (type) || nvectors_if_data_type (type) > 0; + } + ++/* Implement TARGET_VERIFY_TYPE_CONTEXT for SVE types. 
*/ ++bool ++verify_type_context (location_t loc, type_context_kind context, ++ const_tree type, bool silent_p) ++{ ++ if (!builtin_type_p (type)) ++ return true; ++ ++ switch (context) ++ { ++ case TCTX_SIZEOF: ++ case TCTX_STATIC_STORAGE: ++ if (!silent_p) ++ error_at (loc, "SVE type %qT does not have a fixed size", type); ++ return false; ++ ++ case TCTX_ALIGNOF: ++ if (!silent_p) ++ error_at (loc, "SVE type %qT does not have a defined alignment", type); ++ return false; ++ ++ case TCTX_THREAD_STORAGE: ++ if (!silent_p) ++ error_at (loc, "variables of type %qT cannot have thread-local" ++ " storage duration", type); ++ return false; ++ ++ case TCTX_POINTER_ARITH: ++ if (!silent_p) ++ error_at (loc, "arithmetic on pointer to SVE type %qT", type); ++ return false; ++ ++ case TCTX_FIELD: ++ if (silent_p) ++ ; ++ else if (lang_GNU_CXX ()) ++ error_at (loc, "member variables cannot have SVE type %qT", type); ++ else ++ error_at (loc, "fields cannot have SVE type %qT", type); ++ return false; ++ ++ case TCTX_ARRAY_ELEMENT: ++ if (!silent_p) ++ error_at (loc, "array elements cannot have SVE type %qT", type); ++ return false; ++ } ++ gcc_unreachable (); ++} ++ + } + + using namespace aarch64_sve; +diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c +index 3486cca89..c2ab7af56 100644 +--- a/gcc/config/aarch64/aarch64.c ++++ b/gcc/config/aarch64/aarch64.c +@@ -16201,6 +16201,15 @@ aarch64_mangle_type (const_tree type) + return NULL; + } + ++/* Implement TARGET_VERIFY_TYPE_CONTEXT. */ ++ ++static bool ++aarch64_verify_type_context (location_t loc, type_context_kind context, ++ const_tree type, bool silent_p) ++{ ++ return aarch64_sve::verify_type_context (loc, context, type, silent_p); ++} ++ + /* Find the first rtx_insn before insn that will generate an assembly + instruction. */ + +@@ -21967,6 +21976,9 @@ aarch64_libgcc_floating_mode_supported_p + #undef TARGET_MANGLE_TYPE + #define TARGET_MANGLE_TYPE aarch64_mangle_type + ++#undef TARGET_VERIFY_TYPE_CONTEXT ++#define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context ++ + #undef TARGET_INVALID_CONVERSION + #define TARGET_INVALID_CONVERSION aarch64_invalid_conversion + +diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi +index 3f22bb1f6..220bbe7dd 100644 +--- a/gcc/doc/tm.texi ++++ b/gcc/doc/tm.texi +@@ -11963,6 +11963,19 @@ conversion rules. + This is currently used only by the C and C++ front ends. + @end deftypefn + ++@deftypefn {Target Hook} bool TARGET_VERIFY_TYPE_CONTEXT (location_t @var{loc}, type_context_kind @var{context}, const_tree @var{type}, bool @var{silent_p}) ++If defined, this hook returns false if there is a target-specific reason ++why type @var{type} cannot be used in the source language context described ++by @var{context}. When @var{silent_p} is false, the hook also reports an ++error against @var{loc} for invalid uses of @var{type}. ++ ++Calls to this hook should be made through the global function ++@code{verify_type_context}, which makes the @var{silent_p} parameter ++default to false and also handles @code{error_mark_node}. ++ ++The default implementation always returns true. ++@end deftypefn ++ + @defmac OBJC_JBLEN + This macro determines the size of the objective C jump buffer for the + NeXT runtime. By default, OBJC_JBLEN is defined to an innocuous value. +diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in +index 89cfb5253..a8cb42a6b 100644 +--- a/gcc/doc/tm.texi.in ++++ b/gcc/doc/tm.texi.in +@@ -8095,6 +8095,8 @@ and scanf formatter settings. 
+ + @hook TARGET_CONVERT_TO_TYPE + ++@hook TARGET_VERIFY_TYPE_CONTEXT ++ + @defmac OBJC_JBLEN + This macro determines the size of the objective C jump buffer for the + NeXT runtime. By default, OBJC_JBLEN is defined to an innocuous value. +diff --git a/gcc/target.def b/gcc/target.def +index 05389cdd1..4e3dc341c 100644 +--- a/gcc/target.def ++++ b/gcc/target.def +@@ -5234,6 +5234,22 @@ This is currently used only by the C and C++ front ends.", + tree, (tree type, tree expr), + hook_tree_tree_tree_null) + ++DEFHOOK ++(verify_type_context, ++ "If defined, this hook returns false if there is a target-specific reason\n\ ++why type @var{type} cannot be used in the source language context described\n\ ++by @var{context}. When @var{silent_p} is false, the hook also reports an\n\ ++error against @var{loc} for invalid uses of @var{type}.\n\ ++\n\ ++Calls to this hook should be made through the global function\n\ ++@code{verify_type_context}, which makes the @var{silent_p} parameter\n\ ++default to false and also handles @code{error_mark_node}.\n\ ++\n\ ++The default implementation always returns true.", ++ bool, (location_t loc, type_context_kind context, const_tree type, ++ bool silent_p), ++ NULL) ++ + DEFHOOK + (can_change_mode_class, + "This hook returns true if it is possible to bitcast values held in\n\ +diff --git a/gcc/target.h b/gcc/target.h +index 964629669..3e6d34d34 100644 +--- a/gcc/target.h ++++ b/gcc/target.h +@@ -219,6 +219,35 @@ typedef auto_vec auto_vector_modes; + will choose the first mode that works. */ + const unsigned int VECT_COMPARE_COSTS = 1U << 0; + ++/* The contexts in which the use of a type T can be checked by ++ TARGET_VERIFY_TYPE_CONTEXT. */ ++enum type_context_kind { ++ /* Directly measuring the size of T. */ ++ TCTX_SIZEOF, ++ ++ /* Directly measuring the alignment of T. */ ++ TCTX_ALIGNOF, ++ ++ /* Creating objects of type T with static storage duration. */ ++ TCTX_STATIC_STORAGE, ++ ++ /* Creating objects of type T with thread-local storage duration. */ ++ TCTX_THREAD_STORAGE, ++ ++ /* Creating a field of type T. */ ++ TCTX_FIELD, ++ ++ /* Creating an array with elements of type T. */ ++ TCTX_ARRAY_ELEMENT, ++ ++ /* Adding to or subtracting from a pointer to T, or computing the ++ difference between two pointers when one of them is a pointer to T. */ ++ TCTX_POINTER_ARITH ++}; ++ ++extern bool verify_type_context (location_t, type_context_kind, const_tree, ++ bool = false); ++ + /* The target structure. This holds all the backend hooks. */ + #define DEFHOOKPOD(NAME, DOC, TYPE, INIT) TYPE NAME; + #define DEFHOOK(NAME, DOC, TYPE, PARAMS, INIT) TYPE (* NAME) PARAMS; +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/aarch64-sve-acle.exp b/gcc/testsuite/gcc.target/aarch64/sve/acle/aarch64-sve-acle.exp +index 34d9dfd43..1672ddfef 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/aarch64-sve-acle.exp ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/aarch64-sve-acle.exp +@@ -45,9 +45,9 @@ if { [check_effective_target_aarch64_sve] } { + } + + # Main loop. +-# FIXME: This should include general/*.c too, but leave that until the +-# C frontend allows initialization of SVE vectors. +-set files [glob -nocomplain $srcdir/$subdir/general-c/*.c] ++set files [glob -nocomplain \ ++ "$srcdir/$subdir/general/*.c" \ ++ "$srcdir/$subdir/general-c/*.c"] + dg-runtest [lsort $files] "$sve_flags" $DEFAULT_CFLAGS + + # All done. 
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/gnu_vectors_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/gnu_vectors_1.c +new file mode 100644 +index 000000000..c4596f7e9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/gnu_vectors_1.c +@@ -0,0 +1,415 @@ ++/* { dg-options "-msve-vector-bits=256" } */ ++ ++#include ++ ++typedef uint8_t gnu_uint8_t __attribute__ ((vector_size (32))); ++typedef int8_t gnu_int8_t __attribute__ ((vector_size (32))); ++ ++void ++f (svuint8_t sve_u1, svint8_t sve_s1, ++ gnu_uint8_t gnu_u1, gnu_int8_t gnu_s1, int n, unsigned char uc) ++{ ++ /* Initialization. */ ++ ++ svuint8_t init_sve_u1 = 0; /* { dg-error {incompatible types when initializing type 'svuint8_t' using type 'int'} } */ ++ svuint8_t init_sve_u2 = {}; /* { dg-error {empty scalar initializer} } */ ++ svuint8_t init_sve_u3 = { sve_u1 }; ++ svuint8_t init_sve_u4 = { gnu_u1 }; ++ svuint8_t init_sve_u5 = { sve_s1 }; /* { dg-error {incompatible types when initializing type 'svuint8_t' using type 'svint8_t'} } */ ++ svuint8_t init_sve_u6 = { gnu_s1 }; /* { dg-error {incompatible types when initializing type 'svuint8_t'} } */ ++ svuint8_t init_sve_u7 = { 0 }; /* { dg-error {incompatible types when initializing type 'svuint8_t' using type 'int'} } */ ++ svuint8_t init_sve_u8 = { sve_u1, sve_u1 }; /* { dg-warning {excess elements in scalar initializer} } */ ++ svuint8_t init_sve_u9 = { gnu_u1, gnu_u1 }; /* { dg-warning {excess elements in scalar initializer} } */ ++ ++ gnu_uint8_t init_gnu_u1 = 0; /* { dg-error {incompatible types when initializing type 'gnu_uint8_t'[^\n]* using type 'int'} } */ ++ gnu_uint8_t init_gnu_u2 = {}; ++ gnu_uint8_t init_gnu_u3 = { sve_u1 }; /* { dg-error {incompatible types when initializing type 'unsigned char'} } */ ++ gnu_uint8_t init_gnu_u4 = { gnu_u1 }; /* { dg-error {incompatible types when initializing type 'unsigned char'} } */ ++ gnu_uint8_t init_gnu_u5 = { sve_s1 }; /* { dg-error {incompatible types when initializing type 'unsigned char'} } */ ++ gnu_uint8_t init_gnu_u6 = { gnu_s1 }; /* { dg-error {incompatible types when initializing type 'unsigned char'} } */ ++ gnu_uint8_t init_gnu_u7 = { 0 }; ++ ++ /* Compound literals. */ ++ ++ (svuint8_t) {}; /* { dg-error {empty scalar initializer} } */ ++ (svuint8_t) { 0 }; /* { dg-error {incompatible types when initializing type 'svuint8_t' using type 'int'} } */ ++ (svuint8_t) { sve_u1 }; ++ (svuint8_t) { gnu_u1 }; ++ (svuint8_t) { sve_s1 }; /* { dg-error {incompatible types when initializing type 'svuint8_t' using type 'svint8_t'} } */ ++ (svuint8_t) { gnu_s1 }; /* { dg-error {incompatible types when initializing type 'svuint8_t'} } */ ++ ++ (gnu_uint8_t) {}; ++ (gnu_uint8_t) { 0 }; ++ (gnu_uint8_t) { sve_u1 }; /* { dg-error {incompatible types when initializing type 'unsigned char'} } */ ++ (gnu_uint8_t) { gnu_u1 }; /* { dg-error {incompatible types when initializing type 'unsigned char'} } */ ++ ++ /* Assignment. 
*/ ++ ++ sve_u1 = 0; /* { dg-error {incompatible types when assigning to type 'svuint8_t' from type 'int'} } */ ++ sve_u1 = sve_u1; ++ sve_u1 = gnu_u1; ++ sve_u1 = sve_s1; /* { dg-error {incompatible types when assigning to type 'svuint8_t' from type 'svint8_t'} } */ ++ sve_u1 = gnu_s1; /* { dg-error {incompatible types when assigning to type 'svuint8_t' from type 'gnu_int8_t'} } */ ++ ++ gnu_u1 = 0; /* { dg-error {incompatible types when assigning to type 'gnu_uint8_t'[^\n]* from type 'int'} } */ ++ gnu_u1 = sve_u1; ++ gnu_u1 = gnu_u1; ++ gnu_u1 = sve_s1; /* { dg-error {incompatible types when assigning to type 'gnu_uint8_t'[^\n]* from type 'svint8_t'} } */ ++ gnu_u1 = gnu_s1; /* { dg-error {incompatible types when assigning to type 'gnu_uint8_t'[^\n]* from type 'gnu_int8_t'} } */ ++ ++ /* Casts. */ ++ ++ (void) sve_u1; ++ (svuint8_t) sve_u1; ++ (svuint8_t) gnu_u1; ++ (svuint8_t) 0; /* { dg-error {conversion to non-scalar type requested} } */ ++ (svuint8_t) n; /* { dg-error {conversion to non-scalar type requested} } */ ++ (svint8_t) sve_u1; /* { dg-error {conversion to non-scalar type requested} } */ ++ (svint8_t) gnu_u1; ++ ++ (void) gnu_u1; ++ (gnu_uint8_t) sve_u1; ++ (gnu_uint8_t) gnu_u1; ++ (gnu_uint8_t) 0; /* { dg-error {can't convert a value of type 'int' to vector type '[^']*' which has different size} } */ ++ (gnu_uint8_t) n; /* { dg-error {can't convert a value of type 'int' to vector type '[^']*' which has different size} } */ ++ (gnu_int8_t) sve_u1; ++ (gnu_int8_t) gnu_u1; ++ ++ /* Vector indexing. */ ++ ++ sve_u1[0]; /* { dg-error {subscripted value is neither array nor pointer} } */ ++ &sve_u1[0]; /* { dg-error {subscripted value is neither array nor pointer} } */ ++ ++ gnu_u1[0]; ++ &gnu_u1[0]; ++ ++ /* Unary operators. */ ++ ++ +sve_u1; /* { dg-error {wrong type argument to unary plus} } */ ++ -sve_u1; /* { dg-error {wrong type argument to unary minus} } */ ++ ~sve_u1; /* { dg-error {wrong type argument to bit-complement} } */ ++ !sve_u1; /* { dg-error {wrong type argument to unary exclamation mark} } */ ++ *sve_u1; /* { dg-error {invalid type argument of unary '\*'} } */ ++ __real sve_u1; /* { dg-error {wrong type argument to __real} } */ ++ __imag sve_u1; /* { dg-error {wrong type argument to __imag} } */ ++ ++sve_u1; /* { dg-error {wrong type argument to increment} } */ ++ --sve_u1; /* { dg-error {wrong type argument to decrement} } */ ++ sve_u1++; /* { dg-error {wrong type argument to increment} } */ ++ sve_u1--; /* { dg-error {wrong type argument to decrement} } */ ++ ++ +gnu_u1; ++ -gnu_u1; ++ ~gnu_u1; ++ !gnu_u1; /* { dg-error {wrong type argument to unary exclamation mark} } */ ++ *gnu_u1; /* { dg-error {invalid type argument of unary '\*'} } */ ++ __real gnu_u1; /* { dg-error {wrong type argument to __real} } */ ++ __imag gnu_u1; /* { dg-error {wrong type argument to __imag} } */ ++ ++gnu_u1; ++ --gnu_u1; ++ gnu_u1++; ++ gnu_u1--; ++ ++ /* Vector-vector binary arithmetic. 
*/ ++ ++ sve_u1 + sve_u1; /* { dg-error {invalid operands to binary \+} } */ ++ sve_u1 - sve_u1; /* { dg-error {invalid operands to binary -} } */ ++ sve_u1 * sve_u1; /* { dg-error {invalid operands to binary \*} } */ ++ sve_u1 / sve_u1; /* { dg-error {invalid operands to binary /} } */ ++ sve_u1 % sve_u1; /* { dg-error {invalid operands to binary %} } */ ++ sve_u1 & sve_u1; /* { dg-error {invalid operands to binary \&} } */ ++ sve_u1 | sve_u1; /* { dg-error {invalid operands to binary \|} } */ ++ sve_u1 ^ sve_u1; /* { dg-error {invalid operands to binary \^} } */ ++ sve_u1 == sve_u1; /* { dg-error {invalid operands to binary ==} } */ ++ sve_u1 != sve_u1; /* { dg-error {invalid operands to binary !=} } */ ++ sve_u1 <= sve_u1; /* { dg-error {invalid operands to binary <=} } */ ++ sve_u1 < sve_u1; /* { dg-error {invalid operands to binary <} } */ ++ sve_u1 > sve_u1; /* { dg-error {invalid operands to binary >} } */ ++ sve_u1 >= sve_u1; /* { dg-error {invalid operands to binary >=} } */ ++ sve_u1 << sve_u1; /* { dg-error {invalid operands to binary <<} } */ ++ sve_u1 >> sve_u1; /* { dg-error {invalid operands to binary >>} } */ ++ sve_u1 && sve_u1; /* { dg-error {used vector type where scalar is required} } */ ++ sve_u1 || sve_u1; /* { dg-error {used vector type where scalar is required} } */ ++ ++ sve_u1 + gnu_u1; /* { dg-error {invalid operands to binary \+} } */ ++ sve_u1 - gnu_u1; /* { dg-error {invalid operands to binary -} } */ ++ sve_u1 * gnu_u1; /* { dg-error {invalid operands to binary \*} } */ ++ sve_u1 / gnu_u1; /* { dg-error {invalid operands to binary /} } */ ++ sve_u1 % gnu_u1; /* { dg-error {invalid operands to binary %} } */ ++ sve_u1 & gnu_u1; /* { dg-error {invalid operands to binary \&} } */ ++ sve_u1 | gnu_u1; /* { dg-error {invalid operands to binary \|} } */ ++ sve_u1 ^ gnu_u1; /* { dg-error {invalid operands to binary \^} } */ ++ sve_u1 == gnu_u1; /* { dg-error {invalid operands to binary ==} } */ ++ sve_u1 != gnu_u1; /* { dg-error {invalid operands to binary !=} } */ ++ sve_u1 <= gnu_u1; /* { dg-error {invalid operands to binary <=} } */ ++ sve_u1 < gnu_u1; /* { dg-error {invalid operands to binary <} } */ ++ sve_u1 > gnu_u1; /* { dg-error {invalid operands to binary >} } */ ++ sve_u1 >= gnu_u1; /* { dg-error {invalid operands to binary >=} } */ ++ sve_u1 << gnu_u1; /* { dg-error {invalid operands to binary <<} } */ ++ sve_u1 >> gnu_u1; /* { dg-error {invalid operands to binary >>} } */ ++ sve_u1 && gnu_u1; /* { dg-error {used vector type where scalar is required} } */ ++ sve_u1 || gnu_u1; /* { dg-error {used vector type where scalar is required} } */ ++ ++ gnu_u1 + sve_u1; /* { dg-error {invalid operands to binary \+} } */ ++ gnu_u1 - sve_u1; /* { dg-error {invalid operands to binary -} } */ ++ gnu_u1 * sve_u1; /* { dg-error {invalid operands to binary \*} } */ ++ gnu_u1 / sve_u1; /* { dg-error {invalid operands to binary /} } */ ++ gnu_u1 % sve_u1; /* { dg-error {invalid operands to binary %} } */ ++ gnu_u1 & sve_u1; /* { dg-error {invalid operands to binary \&} } */ ++ gnu_u1 | sve_u1; /* { dg-error {invalid operands to binary \|} } */ ++ gnu_u1 ^ sve_u1; /* { dg-error {invalid operands to binary \^} } */ ++ gnu_u1 == sve_u1; /* { dg-error {invalid operands to binary ==} } */ ++ gnu_u1 != sve_u1; /* { dg-error {invalid operands to binary !=} } */ ++ gnu_u1 <= sve_u1; /* { dg-error {invalid operands to binary <=} } */ ++ gnu_u1 < sve_u1; /* { dg-error {invalid operands to binary <} } */ ++ gnu_u1 > sve_u1; /* { dg-error {invalid operands to binary >} } */ ++ gnu_u1 
>= sve_u1; /* { dg-error {invalid operands to binary >=} } */ ++ gnu_u1 << sve_u1; /* { dg-error {invalid operands to binary <<} } */ ++ gnu_u1 >> sve_u1; /* { dg-error {invalid operands to binary >>} } */ ++ gnu_u1 && sve_u1; /* { dg-error {used vector type where scalar is required} } */ ++ gnu_u1 || sve_u1; /* { dg-error {used vector type where scalar is required} } */ ++ ++ gnu_u1 + gnu_u1; ++ gnu_u1 - gnu_u1; ++ gnu_u1 * gnu_u1; ++ gnu_u1 / gnu_u1; ++ gnu_u1 % gnu_u1; ++ gnu_u1 & gnu_u1; ++ gnu_u1 | gnu_u1; ++ gnu_u1 ^ gnu_u1; ++ gnu_u1 == gnu_u1; ++ gnu_u1 != gnu_u1; ++ gnu_u1 <= gnu_u1; ++ gnu_u1 < gnu_u1; ++ gnu_u1 > gnu_u1; ++ gnu_u1 >= gnu_u1; ++ gnu_u1 << gnu_u1; ++ gnu_u1 >> gnu_u1; ++ gnu_u1 && gnu_u1; /* { dg-error {used vector type where scalar is required} } */ ++ gnu_u1 || gnu_u1; /* { dg-error {used vector type where scalar is required} } */ ++ ++ /* Vector-scalar binary arithmetic. */ ++ ++ sve_u1 + 2; /* { dg-error {invalid operands to binary \+} } */ ++ sve_u1 - 2; /* { dg-error {invalid operands to binary -} } */ ++ sve_u1 * 2; /* { dg-error {invalid operands to binary \*} } */ ++ sve_u1 / 2; /* { dg-error {invalid operands to binary /} } */ ++ sve_u1 % 2; /* { dg-error {invalid operands to binary %} } */ ++ sve_u1 & 2; /* { dg-error {invalid operands to binary \&} } */ ++ sve_u1 | 2; /* { dg-error {invalid operands to binary \|} } */ ++ sve_u1 ^ 2; /* { dg-error {invalid operands to binary \^} } */ ++ sve_u1 == 2; /* { dg-error {invalid operands to binary ==} } */ ++ sve_u1 != 2; /* { dg-error {invalid operands to binary !=} } */ ++ sve_u1 <= 2; /* { dg-error {invalid operands to binary <=} } */ ++ sve_u1 < 2; /* { dg-error {invalid operands to binary <} } */ ++ sve_u1 > 2; /* { dg-error {invalid operands to binary >} } */ ++ sve_u1 >= 2; /* { dg-error {invalid operands to binary >=} } */ ++ sve_u1 << 2; /* { dg-error {invalid operands to binary <<} } */ ++ sve_u1 >> 2; /* { dg-error {invalid operands to binary >>} } */ ++ sve_u1 && 2; /* { dg-error {used vector type where scalar is required} } */ ++ sve_u1 || 2; /* { dg-error {used vector type where scalar is required} } */ ++ ++ sve_u1 + uc; /* { dg-error {invalid operands to binary \+} } */ ++ sve_u1 - uc; /* { dg-error {invalid operands to binary -} } */ ++ sve_u1 * uc; /* { dg-error {invalid operands to binary \*} } */ ++ sve_u1 / uc; /* { dg-error {invalid operands to binary /} } */ ++ sve_u1 % uc; /* { dg-error {invalid operands to binary %} } */ ++ sve_u1 & uc; /* { dg-error {invalid operands to binary \&} } */ ++ sve_u1 | uc; /* { dg-error {invalid operands to binary \|} } */ ++ sve_u1 ^ uc; /* { dg-error {invalid operands to binary \^} } */ ++ sve_u1 == uc; /* { dg-error {invalid operands to binary ==} } */ ++ sve_u1 != uc; /* { dg-error {invalid operands to binary !=} } */ ++ sve_u1 <= uc; /* { dg-error {invalid operands to binary <=} } */ ++ sve_u1 < uc; /* { dg-error {invalid operands to binary <} } */ ++ sve_u1 > uc; /* { dg-error {invalid operands to binary >} } */ ++ sve_u1 >= uc; /* { dg-error {invalid operands to binary >=} } */ ++ sve_u1 << uc; /* { dg-error {invalid operands to binary <<} } */ ++ sve_u1 >> uc; /* { dg-error {invalid operands to binary >>} } */ ++ sve_u1 && uc; /* { dg-error {used vector type where scalar is required} } */ ++ sve_u1 || uc; /* { dg-error {used vector type where scalar is required} } */ ++ ++ gnu_u1 + 2; ++ gnu_u1 - 2; ++ gnu_u1 * 2; ++ gnu_u1 / 2; ++ gnu_u1 % 2; ++ gnu_u1 & 2; ++ gnu_u1 | 2; ++ gnu_u1 ^ 2; ++ gnu_u1 == 2; ++ gnu_u1 != 2; ++ gnu_u1 <= 2; ++ gnu_u1 < 
2; ++ gnu_u1 > 2; ++ gnu_u1 >= 2; ++ gnu_u1 << 2; ++ gnu_u1 >> 2; ++ gnu_u1 && 2; /* { dg-error {used vector type where scalar is required} } */ ++ gnu_u1 || 2; /* { dg-error {used vector type where scalar is required} } */ ++ ++ gnu_u1 + uc; ++ gnu_u1 - uc; ++ gnu_u1 * uc; ++ gnu_u1 / uc; ++ gnu_u1 % uc; ++ gnu_u1 & uc; ++ gnu_u1 | uc; ++ gnu_u1 ^ uc; ++ gnu_u1 == uc; ++ gnu_u1 != uc; ++ gnu_u1 <= uc; ++ gnu_u1 < uc; ++ gnu_u1 > uc; ++ gnu_u1 >= uc; ++ gnu_u1 << uc; ++ gnu_u1 >> uc; ++ gnu_u1 && uc; /* { dg-error {used vector type where scalar is required} } */ ++ gnu_u1 || uc; /* { dg-error {used vector type where scalar is required} } */ ++ ++ /* Scalar-vector binary arithmetic. */ ++ ++ 3 + sve_u1; /* { dg-error {invalid operands to binary \+} } */ ++ 3 - sve_u1; /* { dg-error {invalid operands to binary -} } */ ++ 3 * sve_u1; /* { dg-error {invalid operands to binary \*} } */ ++ 3 / sve_u1; /* { dg-error {invalid operands to binary /} } */ ++ 3 % sve_u1; /* { dg-error {invalid operands to binary %} } */ ++ 3 & sve_u1; /* { dg-error {invalid operands to binary \&} } */ ++ 3 | sve_u1; /* { dg-error {invalid operands to binary \|} } */ ++ 3 ^ sve_u1; /* { dg-error {invalid operands to binary \^} } */ ++ 3 == sve_u1; /* { dg-error {invalid operands to binary ==} } */ ++ 3 != sve_u1; /* { dg-error {invalid operands to binary !=} } */ ++ 3 <= sve_u1; /* { dg-error {invalid operands to binary <=} } */ ++ 3 < sve_u1; /* { dg-error {invalid operands to binary <} } */ ++ 3 > sve_u1; /* { dg-error {invalid operands to binary >} } */ ++ 3 >= sve_u1; /* { dg-error {invalid operands to binary >=} } */ ++ 3 << sve_u1; /* { dg-error {invalid operands to binary <<} } */ ++ 3 >> sve_u1; /* { dg-error {invalid operands to binary >>} } */ ++ 3 && sve_u1; /* { dg-error {invalid operands to binary \&\&} } */ ++ 3 || sve_u1; /* { dg-error {invalid operands to binary \|\|} } */ ++ ++ 3 + gnu_u1; ++ 3 - gnu_u1; ++ 3 * gnu_u1; ++ 3 / gnu_u1; ++ 3 % gnu_u1; ++ 3 & gnu_u1; ++ 3 | gnu_u1; ++ 3 ^ gnu_u1; ++ 3 == gnu_u1; ++ 3 != gnu_u1; ++ 3 <= gnu_u1; ++ 3 < gnu_u1; ++ 3 > gnu_u1; ++ 3 >= gnu_u1; ++ 3 << gnu_u1; ++ 3 >> gnu_u1; ++ 3 && gnu_u1; /* { dg-error {invalid operands to binary \&\&} } */ ++ 3 || gnu_u1; /* { dg-error {invalid operands to binary \|\|} } */ ++ ++ /* Mismatched types. 
*/ ++ ++ sve_u1 + sve_s1; /* { dg-error {invalid operands to binary \+} } */ ++ sve_u1 - sve_s1; /* { dg-error {invalid operands to binary -} } */ ++ sve_u1 * sve_s1; /* { dg-error {invalid operands to binary \*} } */ ++ sve_u1 / sve_s1; /* { dg-error {invalid operands to binary /} } */ ++ sve_u1 % sve_s1; /* { dg-error {invalid operands to binary %} } */ ++ sve_u1 & sve_s1; /* { dg-error {invalid operands to binary \&} } */ ++ sve_u1 | sve_s1; /* { dg-error {invalid operands to binary \|} } */ ++ sve_u1 ^ sve_s1; /* { dg-error {invalid operands to binary \^} } */ ++ sve_u1 == sve_s1; /* { dg-error {invalid operands to binary ==} } */ ++ sve_u1 != sve_s1; /* { dg-error {invalid operands to binary !=} } */ ++ sve_u1 <= sve_s1; /* { dg-error {invalid operands to binary <=} } */ ++ sve_u1 < sve_s1; /* { dg-error {invalid operands to binary <} } */ ++ sve_u1 > sve_s1; /* { dg-error {invalid operands to binary >} } */ ++ sve_u1 >= sve_s1; /* { dg-error {invalid operands to binary >=} } */ ++ sve_u1 << sve_s1; /* { dg-error {invalid operands to binary <<} } */ ++ sve_u1 >> sve_s1; /* { dg-error {invalid operands to binary >>} } */ ++ sve_u1 && sve_s1; /* { dg-error {used vector type where scalar is required} } */ ++ sve_u1 || sve_s1; /* { dg-error {used vector type where scalar is required} } */ ++ ++ sve_u1 + gnu_s1; /* { dg-error {invalid operands to binary \+} } */ ++ sve_u1 - gnu_s1; /* { dg-error {invalid operands to binary -} } */ ++ sve_u1 * gnu_s1; /* { dg-error {invalid operands to binary \*} } */ ++ sve_u1 / gnu_s1; /* { dg-error {invalid operands to binary /} } */ ++ sve_u1 % gnu_s1; /* { dg-error {invalid operands to binary %} } */ ++ sve_u1 & gnu_s1; /* { dg-error {invalid operands to binary \&} } */ ++ sve_u1 | gnu_s1; /* { dg-error {invalid operands to binary \|} } */ ++ sve_u1 ^ gnu_s1; /* { dg-error {invalid operands to binary \^} } */ ++ sve_u1 == gnu_s1; /* { dg-error {invalid operands to binary ==} } */ ++ sve_u1 != gnu_s1; /* { dg-error {invalid operands to binary !=} } */ ++ sve_u1 <= gnu_s1; /* { dg-error {invalid operands to binary <=} } */ ++ sve_u1 < gnu_s1; /* { dg-error {invalid operands to binary <} } */ ++ sve_u1 > gnu_s1; /* { dg-error {invalid operands to binary >} } */ ++ sve_u1 >= gnu_s1; /* { dg-error {invalid operands to binary >=} } */ ++ sve_u1 << gnu_s1; /* { dg-error {invalid operands to binary <<} } */ ++ sve_u1 >> gnu_s1; /* { dg-error {invalid operands to binary >>} } */ ++ sve_u1 && gnu_s1; /* { dg-error {used vector type where scalar is required} } */ ++ sve_u1 || gnu_s1; /* { dg-error {used vector type where scalar is required} } */ ++ ++ gnu_u1 + sve_s1; /* { dg-error {invalid operands to binary \+} } */ ++ gnu_u1 - sve_s1; /* { dg-error {invalid operands to binary -} } */ ++ gnu_u1 * sve_s1; /* { dg-error {invalid operands to binary \*} } */ ++ gnu_u1 / sve_s1; /* { dg-error {invalid operands to binary /} } */ ++ gnu_u1 % sve_s1; /* { dg-error {invalid operands to binary %} } */ ++ gnu_u1 & sve_s1; /* { dg-error {invalid operands to binary \&} } */ ++ gnu_u1 | sve_s1; /* { dg-error {invalid operands to binary \|} } */ ++ gnu_u1 ^ sve_s1; /* { dg-error {invalid operands to binary \^} } */ ++ gnu_u1 == sve_s1; /* { dg-error {invalid operands to binary ==} } */ ++ gnu_u1 != sve_s1; /* { dg-error {invalid operands to binary !=} } */ ++ gnu_u1 <= sve_s1; /* { dg-error {invalid operands to binary <=} } */ ++ gnu_u1 < sve_s1; /* { dg-error {invalid operands to binary <} } */ ++ gnu_u1 > sve_s1; /* { dg-error {invalid operands to binary >} } */ ++ gnu_u1 
>= sve_s1; /* { dg-error {invalid operands to binary >=} } */ ++ gnu_u1 << sve_s1; /* { dg-error {invalid operands to binary <<} } */ ++ gnu_u1 >> sve_s1; /* { dg-error {invalid operands to binary >>} } */ ++ gnu_u1 && sve_s1; /* { dg-error {used vector type where scalar is required} } */ ++ gnu_u1 || sve_s1; /* { dg-error {used vector type where scalar is required} } */ ++ ++ gnu_u1 + gnu_s1; ++ gnu_u1 - gnu_s1; ++ gnu_u1 * gnu_s1; ++ gnu_u1 / gnu_s1; ++ gnu_u1 % gnu_s1; ++ gnu_u1 & gnu_s1; ++ gnu_u1 | gnu_s1; ++ gnu_u1 ^ gnu_s1; ++ gnu_u1 == gnu_s1; ++ gnu_u1 != gnu_s1; ++ gnu_u1 <= gnu_s1; ++ gnu_u1 < gnu_s1; ++ gnu_u1 > gnu_s1; ++ gnu_u1 >= gnu_s1; ++ gnu_u1 << gnu_s1; ++ gnu_u1 >> gnu_s1; ++ gnu_u1 && gnu_s1; /* { dg-error {used vector type where scalar is required} } */ ++ gnu_u1 || gnu_s1; /* { dg-error {used vector type where scalar is required} } */ ++ ++ /* Conditional expressions. */ ++ ++ uc ? sve_u1 : sve_u1; ++ uc ? gnu_u1 : sve_u1; /* { dg-error {type mismatch in conditional expression} } */ ++ uc ? sve_u1 : gnu_u1; /* { dg-error {type mismatch in conditional expression} } */ ++ uc ? gnu_u1 : gnu_u1; ++ ++ sve_u1 ? sve_u1 : sve_u1; /* { dg-error {used vector type where scalar is required} } */ ++ sve_u1 ? gnu_u1 : sve_u1; /* { dg-error {used vector type where scalar is required} } */ ++ sve_u1 ? sve_u1 : gnu_u1; /* { dg-error {used vector type where scalar is required} } */ ++ sve_u1 ? gnu_u1 : gnu_u1; /* { dg-error {used vector type where scalar is required} } */ ++ ++ gnu_u1 ? sve_u1 : sve_u1; /* { dg-error {used vector type where scalar is required} } */ ++ gnu_u1 ? gnu_u1 : sve_u1; /* { dg-error {used vector type where scalar is required} } */ ++ gnu_u1 ? sve_u1 : gnu_u1; /* { dg-error {used vector type where scalar is required} } */ ++ gnu_u1 ? gnu_u1 : gnu_u1; /* { dg-error {used vector type where scalar is required} } */ ++ ++ /* Vector built-ins. 
*/ ++ ++ __builtin_shuffle (sve_u1, sve_u1, sve_u1); /* { dg-error {'__builtin_shuffle' last argument must be an integer vector} } */ ++ __builtin_shuffle (sve_u1, gnu_u1, gnu_u1); /* { dg-error {'__builtin_shuffle' arguments must be vectors} } */ ++ __builtin_shuffle (gnu_u1, sve_u1, gnu_u1); /* { dg-error {'__builtin_shuffle' arguments must be vectors} } */ ++ __builtin_shuffle (gnu_u1, gnu_u1, sve_u1); /* { dg-error {'__builtin_shuffle' last argument must be an integer vector} } */ ++ __builtin_shuffle (gnu_u1, gnu_u1, gnu_u1); ++ ++ __builtin_convertvector (sve_u1, svuint8_t); /* { dg-error {'__builtin_convertvector' first argument must be an integer or floating vector} } */ ++ __builtin_convertvector (gnu_u1, svuint8_t); /* { dg-error {'__builtin_convertvector' second argument must be an integer or floating vector type} } */ ++ __builtin_convertvector (sve_u1, gnu_uint8_t); /* { dg-error {'__builtin_convertvector' first argument must be an integer or floating vector} } */ ++ __builtin_convertvector (gnu_u1, gnu_uint8_t); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/gnu_vectors_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/gnu_vectors_2.c +new file mode 100644 +index 000000000..61e6d2163 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/gnu_vectors_2.c +@@ -0,0 +1,415 @@ ++/* { dg-options "-msve-vector-bits=256 -flax-vector-conversions" } */ ++ ++#include ++ ++typedef uint8_t gnu_uint8_t __attribute__ ((vector_size (32))); ++typedef int8_t gnu_int8_t __attribute__ ((vector_size (32))); ++ ++void ++f (svuint8_t sve_u1, svint8_t sve_s1, ++ gnu_uint8_t gnu_u1, gnu_int8_t gnu_s1, int n, unsigned char uc) ++{ ++ /* Initialization. */ ++ ++ svuint8_t init_sve_u1 = 0; /* { dg-error {incompatible types when initializing type 'svuint8_t' using type 'int'} } */ ++ svuint8_t init_sve_u2 = {}; /* { dg-error {empty scalar initializer} } */ ++ svuint8_t init_sve_u3 = { sve_u1 }; ++ svuint8_t init_sve_u4 = { gnu_u1 }; ++ svuint8_t init_sve_u5 = { sve_s1 }; ++ svuint8_t init_sve_u6 = { gnu_s1 }; ++ svuint8_t init_sve_u7 = { 0 }; /* { dg-error {incompatible types when initializing type 'svuint8_t' using type 'int'} } */ ++ svuint8_t init_sve_u8 = { sve_u1, sve_u1 }; /* { dg-warning {excess elements in scalar initializer} } */ ++ svuint8_t init_sve_u9 = { gnu_u1, gnu_u1 }; /* { dg-warning {excess elements in scalar initializer} } */ ++ ++ gnu_uint8_t init_gnu_u1 = 0; /* { dg-error {incompatible types when initializing type 'gnu_uint8_t'[^\n]* using type 'int'} } */ ++ gnu_uint8_t init_gnu_u2 = {}; ++ gnu_uint8_t init_gnu_u3 = { sve_u1 }; /* { dg-error {incompatible types when initializing type 'unsigned char'} } */ ++ gnu_uint8_t init_gnu_u4 = { gnu_u1 }; /* { dg-error {incompatible types when initializing type 'unsigned char'} } */ ++ gnu_uint8_t init_gnu_u5 = { sve_s1 }; /* { dg-error {incompatible types when initializing type 'unsigned char'} } */ ++ gnu_uint8_t init_gnu_u6 = { gnu_s1 }; /* { dg-error {incompatible types when initializing type 'unsigned char'} } */ ++ gnu_uint8_t init_gnu_u7 = { 0 }; ++ ++ /* Compound literals. 
*/ ++ ++ (svuint8_t) {}; /* { dg-error {empty scalar initializer} } */ ++ (svuint8_t) { 0 }; /* { dg-error {incompatible types when initializing type 'svuint8_t' using type 'int'} } */ ++ (svuint8_t) { sve_u1 }; ++ (svuint8_t) { gnu_u1 }; ++ (svuint8_t) { sve_s1 }; ++ (svuint8_t) { gnu_s1 }; ++ ++ (gnu_uint8_t) {}; ++ (gnu_uint8_t) { 0 }; ++ (gnu_uint8_t) { sve_u1 }; /* { dg-error {incompatible types when initializing type 'unsigned char'} } */ ++ (gnu_uint8_t) { gnu_u1 }; /* { dg-error {incompatible types when initializing type 'unsigned char'} } */ ++ ++ /* Assignment. */ ++ ++ sve_u1 = 0; /* { dg-error {incompatible types when assigning to type 'svuint8_t' from type 'int'} } */ ++ sve_u1 = sve_u1; ++ sve_u1 = gnu_u1; ++ sve_u1 = sve_s1; ++ sve_u1 = gnu_s1; ++ ++ gnu_u1 = 0; /* { dg-error {incompatible types when assigning to type 'gnu_uint8_t'[^\n]* from type 'int'} } */ ++ gnu_u1 = sve_u1; ++ gnu_u1 = gnu_u1; ++ gnu_u1 = sve_s1; ++ gnu_u1 = gnu_s1; ++ ++ /* Casts. */ ++ ++ (void) sve_u1; ++ (svuint8_t) sve_u1; ++ (svuint8_t) gnu_u1; ++ (svuint8_t) 0; /* { dg-error {conversion to non-scalar type requested} } */ ++ (svuint8_t) n; /* { dg-error {conversion to non-scalar type requested} } */ ++ (svint8_t) sve_u1; ++ (svint8_t) gnu_u1; ++ ++ (void) gnu_u1; ++ (gnu_uint8_t) sve_u1; ++ (gnu_uint8_t) gnu_u1; ++ (gnu_uint8_t) 0; /* { dg-error {can't convert a value of type 'int' to vector type '[^']*' which has different size} } */ ++ (gnu_uint8_t) n; /* { dg-error {can't convert a value of type 'int' to vector type '[^']*' which has different size} } */ ++ (gnu_int8_t) sve_u1; ++ (gnu_int8_t) gnu_u1; ++ ++ /* Vector indexing. */ ++ ++ sve_u1[0]; /* { dg-error {subscripted value is neither array nor pointer} } */ ++ &sve_u1[0]; /* { dg-error {subscripted value is neither array nor pointer} } */ ++ ++ gnu_u1[0]; ++ &gnu_u1[0]; ++ ++ /* Unary operators. */ ++ ++ +sve_u1; /* { dg-error {wrong type argument to unary plus} } */ ++ -sve_u1; /* { dg-error {wrong type argument to unary minus} } */ ++ ~sve_u1; /* { dg-error {wrong type argument to bit-complement} } */ ++ !sve_u1; /* { dg-error {wrong type argument to unary exclamation mark} } */ ++ *sve_u1; /* { dg-error {invalid type argument of unary '\*'} } */ ++ __real sve_u1; /* { dg-error {wrong type argument to __real} } */ ++ __imag sve_u1; /* { dg-error {wrong type argument to __imag} } */ ++ ++sve_u1; /* { dg-error {wrong type argument to increment} } */ ++ --sve_u1; /* { dg-error {wrong type argument to decrement} } */ ++ sve_u1++; /* { dg-error {wrong type argument to increment} } */ ++ sve_u1--; /* { dg-error {wrong type argument to decrement} } */ ++ ++ +gnu_u1; ++ -gnu_u1; ++ ~gnu_u1; ++ !gnu_u1; /* { dg-error {wrong type argument to unary exclamation mark} } */ ++ *gnu_u1; /* { dg-error {invalid type argument of unary '\*'} } */ ++ __real gnu_u1; /* { dg-error {wrong type argument to __real} } */ ++ __imag gnu_u1; /* { dg-error {wrong type argument to __imag} } */ ++ ++gnu_u1; ++ --gnu_u1; ++ gnu_u1++; ++ gnu_u1--; ++ ++ /* Vector-vector binary arithmetic. 
*/ ++ ++ sve_u1 + sve_u1; /* { dg-error {invalid operands to binary \+} } */ ++ sve_u1 - sve_u1; /* { dg-error {invalid operands to binary -} } */ ++ sve_u1 * sve_u1; /* { dg-error {invalid operands to binary \*} } */ ++ sve_u1 / sve_u1; /* { dg-error {invalid operands to binary /} } */ ++ sve_u1 % sve_u1; /* { dg-error {invalid operands to binary %} } */ ++ sve_u1 & sve_u1; /* { dg-error {invalid operands to binary \&} } */ ++ sve_u1 | sve_u1; /* { dg-error {invalid operands to binary \|} } */ ++ sve_u1 ^ sve_u1; /* { dg-error {invalid operands to binary \^} } */ ++ sve_u1 == sve_u1; /* { dg-error {invalid operands to binary ==} } */ ++ sve_u1 != sve_u1; /* { dg-error {invalid operands to binary !=} } */ ++ sve_u1 <= sve_u1; /* { dg-error {invalid operands to binary <=} } */ ++ sve_u1 < sve_u1; /* { dg-error {invalid operands to binary <} } */ ++ sve_u1 > sve_u1; /* { dg-error {invalid operands to binary >} } */ ++ sve_u1 >= sve_u1; /* { dg-error {invalid operands to binary >=} } */ ++ sve_u1 << sve_u1; /* { dg-error {invalid operands to binary <<} } */ ++ sve_u1 >> sve_u1; /* { dg-error {invalid operands to binary >>} } */ ++ sve_u1 && sve_u1; /* { dg-error {used vector type where scalar is required} } */ ++ sve_u1 || sve_u1; /* { dg-error {used vector type where scalar is required} } */ ++ ++ sve_u1 + gnu_u1; /* { dg-error {invalid operands to binary \+} } */ ++ sve_u1 - gnu_u1; /* { dg-error {invalid operands to binary -} } */ ++ sve_u1 * gnu_u1; /* { dg-error {invalid operands to binary \*} } */ ++ sve_u1 / gnu_u1; /* { dg-error {invalid operands to binary /} } */ ++ sve_u1 % gnu_u1; /* { dg-error {invalid operands to binary %} } */ ++ sve_u1 & gnu_u1; /* { dg-error {invalid operands to binary \&} } */ ++ sve_u1 | gnu_u1; /* { dg-error {invalid operands to binary \|} } */ ++ sve_u1 ^ gnu_u1; /* { dg-error {invalid operands to binary \^} } */ ++ sve_u1 == gnu_u1; /* { dg-error {invalid operands to binary ==} } */ ++ sve_u1 != gnu_u1; /* { dg-error {invalid operands to binary !=} } */ ++ sve_u1 <= gnu_u1; /* { dg-error {invalid operands to binary <=} } */ ++ sve_u1 < gnu_u1; /* { dg-error {invalid operands to binary <} } */ ++ sve_u1 > gnu_u1; /* { dg-error {invalid operands to binary >} } */ ++ sve_u1 >= gnu_u1; /* { dg-error {invalid operands to binary >=} } */ ++ sve_u1 << gnu_u1; /* { dg-error {invalid operands to binary <<} } */ ++ sve_u1 >> gnu_u1; /* { dg-error {invalid operands to binary >>} } */ ++ sve_u1 && gnu_u1; /* { dg-error {used vector type where scalar is required} } */ ++ sve_u1 || gnu_u1; /* { dg-error {used vector type where scalar is required} } */ ++ ++ gnu_u1 + sve_u1; /* { dg-error {invalid operands to binary \+} } */ ++ gnu_u1 - sve_u1; /* { dg-error {invalid operands to binary -} } */ ++ gnu_u1 * sve_u1; /* { dg-error {invalid operands to binary \*} } */ ++ gnu_u1 / sve_u1; /* { dg-error {invalid operands to binary /} } */ ++ gnu_u1 % sve_u1; /* { dg-error {invalid operands to binary %} } */ ++ gnu_u1 & sve_u1; /* { dg-error {invalid operands to binary \&} } */ ++ gnu_u1 | sve_u1; /* { dg-error {invalid operands to binary \|} } */ ++ gnu_u1 ^ sve_u1; /* { dg-error {invalid operands to binary \^} } */ ++ gnu_u1 == sve_u1; /* { dg-error {invalid operands to binary ==} } */ ++ gnu_u1 != sve_u1; /* { dg-error {invalid operands to binary !=} } */ ++ gnu_u1 <= sve_u1; /* { dg-error {invalid operands to binary <=} } */ ++ gnu_u1 < sve_u1; /* { dg-error {invalid operands to binary <} } */ ++ gnu_u1 > sve_u1; /* { dg-error {invalid operands to binary >} } */ ++ gnu_u1 
>= sve_u1; /* { dg-error {invalid operands to binary >=} } */ ++ gnu_u1 << sve_u1; /* { dg-error {invalid operands to binary <<} } */ ++ gnu_u1 >> sve_u1; /* { dg-error {invalid operands to binary >>} } */ ++ gnu_u1 && sve_u1; /* { dg-error {used vector type where scalar is required} } */ ++ gnu_u1 || sve_u1; /* { dg-error {used vector type where scalar is required} } */ ++ ++ gnu_u1 + gnu_u1; ++ gnu_u1 - gnu_u1; ++ gnu_u1 * gnu_u1; ++ gnu_u1 / gnu_u1; ++ gnu_u1 % gnu_u1; ++ gnu_u1 & gnu_u1; ++ gnu_u1 | gnu_u1; ++ gnu_u1 ^ gnu_u1; ++ gnu_u1 == gnu_u1; ++ gnu_u1 != gnu_u1; ++ gnu_u1 <= gnu_u1; ++ gnu_u1 < gnu_u1; ++ gnu_u1 > gnu_u1; ++ gnu_u1 >= gnu_u1; ++ gnu_u1 << gnu_u1; ++ gnu_u1 >> gnu_u1; ++ gnu_u1 && gnu_u1; /* { dg-error {used vector type where scalar is required} } */ ++ gnu_u1 || gnu_u1; /* { dg-error {used vector type where scalar is required} } */ ++ ++ /* Vector-scalar binary arithmetic. */ ++ ++ sve_u1 + 2; /* { dg-error {invalid operands to binary \+} } */ ++ sve_u1 - 2; /* { dg-error {invalid operands to binary -} } */ ++ sve_u1 * 2; /* { dg-error {invalid operands to binary \*} } */ ++ sve_u1 / 2; /* { dg-error {invalid operands to binary /} } */ ++ sve_u1 % 2; /* { dg-error {invalid operands to binary %} } */ ++ sve_u1 & 2; /* { dg-error {invalid operands to binary \&} } */ ++ sve_u1 | 2; /* { dg-error {invalid operands to binary \|} } */ ++ sve_u1 ^ 2; /* { dg-error {invalid operands to binary \^} } */ ++ sve_u1 == 2; /* { dg-error {invalid operands to binary ==} } */ ++ sve_u1 != 2; /* { dg-error {invalid operands to binary !=} } */ ++ sve_u1 <= 2; /* { dg-error {invalid operands to binary <=} } */ ++ sve_u1 < 2; /* { dg-error {invalid operands to binary <} } */ ++ sve_u1 > 2; /* { dg-error {invalid operands to binary >} } */ ++ sve_u1 >= 2; /* { dg-error {invalid operands to binary >=} } */ ++ sve_u1 << 2; /* { dg-error {invalid operands to binary <<} } */ ++ sve_u1 >> 2; /* { dg-error {invalid operands to binary >>} } */ ++ sve_u1 && 2; /* { dg-error {used vector type where scalar is required} } */ ++ sve_u1 || 2; /* { dg-error {used vector type where scalar is required} } */ ++ ++ sve_u1 + uc; /* { dg-error {invalid operands to binary \+} } */ ++ sve_u1 - uc; /* { dg-error {invalid operands to binary -} } */ ++ sve_u1 * uc; /* { dg-error {invalid operands to binary \*} } */ ++ sve_u1 / uc; /* { dg-error {invalid operands to binary /} } */ ++ sve_u1 % uc; /* { dg-error {invalid operands to binary %} } */ ++ sve_u1 & uc; /* { dg-error {invalid operands to binary \&} } */ ++ sve_u1 | uc; /* { dg-error {invalid operands to binary \|} } */ ++ sve_u1 ^ uc; /* { dg-error {invalid operands to binary \^} } */ ++ sve_u1 == uc; /* { dg-error {invalid operands to binary ==} } */ ++ sve_u1 != uc; /* { dg-error {invalid operands to binary !=} } */ ++ sve_u1 <= uc; /* { dg-error {invalid operands to binary <=} } */ ++ sve_u1 < uc; /* { dg-error {invalid operands to binary <} } */ ++ sve_u1 > uc; /* { dg-error {invalid operands to binary >} } */ ++ sve_u1 >= uc; /* { dg-error {invalid operands to binary >=} } */ ++ sve_u1 << uc; /* { dg-error {invalid operands to binary <<} } */ ++ sve_u1 >> uc; /* { dg-error {invalid operands to binary >>} } */ ++ sve_u1 && uc; /* { dg-error {used vector type where scalar is required} } */ ++ sve_u1 || uc; /* { dg-error {used vector type where scalar is required} } */ ++ ++ gnu_u1 + 2; ++ gnu_u1 - 2; ++ gnu_u1 * 2; ++ gnu_u1 / 2; ++ gnu_u1 % 2; ++ gnu_u1 & 2; ++ gnu_u1 | 2; ++ gnu_u1 ^ 2; ++ gnu_u1 == 2; ++ gnu_u1 != 2; ++ gnu_u1 <= 2; ++ gnu_u1 < 
2; ++ gnu_u1 > 2; ++ gnu_u1 >= 2; ++ gnu_u1 << 2; ++ gnu_u1 >> 2; ++ gnu_u1 && 2; /* { dg-error {used vector type where scalar is required} } */ ++ gnu_u1 || 2; /* { dg-error {used vector type where scalar is required} } */ ++ ++ gnu_u1 + uc; ++ gnu_u1 - uc; ++ gnu_u1 * uc; ++ gnu_u1 / uc; ++ gnu_u1 % uc; ++ gnu_u1 & uc; ++ gnu_u1 | uc; ++ gnu_u1 ^ uc; ++ gnu_u1 == uc; ++ gnu_u1 != uc; ++ gnu_u1 <= uc; ++ gnu_u1 < uc; ++ gnu_u1 > uc; ++ gnu_u1 >= uc; ++ gnu_u1 << uc; ++ gnu_u1 >> uc; ++ gnu_u1 && uc; /* { dg-error {used vector type where scalar is required} } */ ++ gnu_u1 || uc; /* { dg-error {used vector type where scalar is required} } */ ++ ++ /* Scalar-vector binary arithmetic. */ ++ ++ 3 + sve_u1; /* { dg-error {invalid operands to binary \+} } */ ++ 3 - sve_u1; /* { dg-error {invalid operands to binary -} } */ ++ 3 * sve_u1; /* { dg-error {invalid operands to binary \*} } */ ++ 3 / sve_u1; /* { dg-error {invalid operands to binary /} } */ ++ 3 % sve_u1; /* { dg-error {invalid operands to binary %} } */ ++ 3 & sve_u1; /* { dg-error {invalid operands to binary \&} } */ ++ 3 | sve_u1; /* { dg-error {invalid operands to binary \|} } */ ++ 3 ^ sve_u1; /* { dg-error {invalid operands to binary \^} } */ ++ 3 == sve_u1; /* { dg-error {invalid operands to binary ==} } */ ++ 3 != sve_u1; /* { dg-error {invalid operands to binary !=} } */ ++ 3 <= sve_u1; /* { dg-error {invalid operands to binary <=} } */ ++ 3 < sve_u1; /* { dg-error {invalid operands to binary <} } */ ++ 3 > sve_u1; /* { dg-error {invalid operands to binary >} } */ ++ 3 >= sve_u1; /* { dg-error {invalid operands to binary >=} } */ ++ 3 << sve_u1; /* { dg-error {invalid operands to binary <<} } */ ++ 3 >> sve_u1; /* { dg-error {invalid operands to binary >>} } */ ++ 3 && sve_u1; /* { dg-error {invalid operands to binary \&\&} } */ ++ 3 || sve_u1; /* { dg-error {invalid operands to binary \|\|} } */ ++ ++ 3 + gnu_u1; ++ 3 - gnu_u1; ++ 3 * gnu_u1; ++ 3 / gnu_u1; ++ 3 % gnu_u1; ++ 3 & gnu_u1; ++ 3 | gnu_u1; ++ 3 ^ gnu_u1; ++ 3 == gnu_u1; ++ 3 != gnu_u1; ++ 3 <= gnu_u1; ++ 3 < gnu_u1; ++ 3 > gnu_u1; ++ 3 >= gnu_u1; ++ 3 << gnu_u1; ++ 3 >> gnu_u1; ++ 3 && gnu_u1; /* { dg-error {invalid operands to binary \&\&} } */ ++ 3 || gnu_u1; /* { dg-error {invalid operands to binary \|\|} } */ ++ ++ /* Mismatched types. 
*/ ++ ++ sve_u1 + sve_s1; /* { dg-error {invalid operands to binary \+} } */ ++ sve_u1 - sve_s1; /* { dg-error {invalid operands to binary -} } */ ++ sve_u1 * sve_s1; /* { dg-error {invalid operands to binary \*} } */ ++ sve_u1 / sve_s1; /* { dg-error {invalid operands to binary /} } */ ++ sve_u1 % sve_s1; /* { dg-error {invalid operands to binary %} } */ ++ sve_u1 & sve_s1; /* { dg-error {invalid operands to binary \&} } */ ++ sve_u1 | sve_s1; /* { dg-error {invalid operands to binary \|} } */ ++ sve_u1 ^ sve_s1; /* { dg-error {invalid operands to binary \^} } */ ++ sve_u1 == sve_s1; /* { dg-error {invalid operands to binary ==} } */ ++ sve_u1 != sve_s1; /* { dg-error {invalid operands to binary !=} } */ ++ sve_u1 <= sve_s1; /* { dg-error {invalid operands to binary <=} } */ ++ sve_u1 < sve_s1; /* { dg-error {invalid operands to binary <} } */ ++ sve_u1 > sve_s1; /* { dg-error {invalid operands to binary >} } */ ++ sve_u1 >= sve_s1; /* { dg-error {invalid operands to binary >=} } */ ++ sve_u1 << sve_s1; /* { dg-error {invalid operands to binary <<} } */ ++ sve_u1 >> sve_s1; /* { dg-error {invalid operands to binary >>} } */ ++ sve_u1 && sve_s1; /* { dg-error {used vector type where scalar is required} } */ ++ sve_u1 || sve_s1; /* { dg-error {used vector type where scalar is required} } */ ++ ++ sve_u1 + gnu_s1; /* { dg-error {invalid operands to binary \+} } */ ++ sve_u1 - gnu_s1; /* { dg-error {invalid operands to binary -} } */ ++ sve_u1 * gnu_s1; /* { dg-error {invalid operands to binary \*} } */ ++ sve_u1 / gnu_s1; /* { dg-error {invalid operands to binary /} } */ ++ sve_u1 % gnu_s1; /* { dg-error {invalid operands to binary %} } */ ++ sve_u1 & gnu_s1; /* { dg-error {invalid operands to binary \&} } */ ++ sve_u1 | gnu_s1; /* { dg-error {invalid operands to binary \|} } */ ++ sve_u1 ^ gnu_s1; /* { dg-error {invalid operands to binary \^} } */ ++ sve_u1 == gnu_s1; /* { dg-error {invalid operands to binary ==} } */ ++ sve_u1 != gnu_s1; /* { dg-error {invalid operands to binary !=} } */ ++ sve_u1 <= gnu_s1; /* { dg-error {invalid operands to binary <=} } */ ++ sve_u1 < gnu_s1; /* { dg-error {invalid operands to binary <} } */ ++ sve_u1 > gnu_s1; /* { dg-error {invalid operands to binary >} } */ ++ sve_u1 >= gnu_s1; /* { dg-error {invalid operands to binary >=} } */ ++ sve_u1 << gnu_s1; /* { dg-error {invalid operands to binary <<} } */ ++ sve_u1 >> gnu_s1; /* { dg-error {invalid operands to binary >>} } */ ++ sve_u1 && gnu_s1; /* { dg-error {used vector type where scalar is required} } */ ++ sve_u1 || gnu_s1; /* { dg-error {used vector type where scalar is required} } */ ++ ++ gnu_u1 + sve_s1; /* { dg-error {invalid operands to binary \+} } */ ++ gnu_u1 - sve_s1; /* { dg-error {invalid operands to binary -} } */ ++ gnu_u1 * sve_s1; /* { dg-error {invalid operands to binary \*} } */ ++ gnu_u1 / sve_s1; /* { dg-error {invalid operands to binary /} } */ ++ gnu_u1 % sve_s1; /* { dg-error {invalid operands to binary %} } */ ++ gnu_u1 & sve_s1; /* { dg-error {invalid operands to binary \&} } */ ++ gnu_u1 | sve_s1; /* { dg-error {invalid operands to binary \|} } */ ++ gnu_u1 ^ sve_s1; /* { dg-error {invalid operands to binary \^} } */ ++ gnu_u1 == sve_s1; /* { dg-error {invalid operands to binary ==} } */ ++ gnu_u1 != sve_s1; /* { dg-error {invalid operands to binary !=} } */ ++ gnu_u1 <= sve_s1; /* { dg-error {invalid operands to binary <=} } */ ++ gnu_u1 < sve_s1; /* { dg-error {invalid operands to binary <} } */ ++ gnu_u1 > sve_s1; /* { dg-error {invalid operands to binary >} } */ ++ gnu_u1 
>= sve_s1; /* { dg-error {invalid operands to binary >=} } */ ++ gnu_u1 << sve_s1; /* { dg-error {invalid operands to binary <<} } */ ++ gnu_u1 >> sve_s1; /* { dg-error {invalid operands to binary >>} } */ ++ gnu_u1 && sve_s1; /* { dg-error {used vector type where scalar is required} } */ ++ gnu_u1 || sve_s1; /* { dg-error {used vector type where scalar is required} } */ ++ ++ gnu_u1 + gnu_s1; ++ gnu_u1 - gnu_s1; ++ gnu_u1 * gnu_s1; ++ gnu_u1 / gnu_s1; ++ gnu_u1 % gnu_s1; ++ gnu_u1 & gnu_s1; ++ gnu_u1 | gnu_s1; ++ gnu_u1 ^ gnu_s1; ++ gnu_u1 == gnu_s1; ++ gnu_u1 != gnu_s1; ++ gnu_u1 <= gnu_s1; ++ gnu_u1 < gnu_s1; ++ gnu_u1 > gnu_s1; ++ gnu_u1 >= gnu_s1; ++ gnu_u1 << gnu_s1; ++ gnu_u1 >> gnu_s1; ++ gnu_u1 && gnu_s1; /* { dg-error {used vector type where scalar is required} } */ ++ gnu_u1 || gnu_s1; /* { dg-error {used vector type where scalar is required} } */ ++ ++ /* Conditional expressions. */ ++ ++ uc ? sve_u1 : sve_u1; ++ uc ? gnu_u1 : sve_u1; /* { dg-error {type mismatch in conditional expression} } */ ++ uc ? sve_u1 : gnu_u1; /* { dg-error {type mismatch in conditional expression} } */ ++ uc ? gnu_u1 : gnu_u1; ++ ++ sve_u1 ? sve_u1 : sve_u1; /* { dg-error {used vector type where scalar is required} } */ ++ sve_u1 ? gnu_u1 : sve_u1; /* { dg-error {used vector type where scalar is required} } */ ++ sve_u1 ? sve_u1 : gnu_u1; /* { dg-error {used vector type where scalar is required} } */ ++ sve_u1 ? gnu_u1 : gnu_u1; /* { dg-error {used vector type where scalar is required} } */ ++ ++ gnu_u1 ? sve_u1 : sve_u1; /* { dg-error {used vector type where scalar is required} } */ ++ gnu_u1 ? gnu_u1 : sve_u1; /* { dg-error {used vector type where scalar is required} } */ ++ gnu_u1 ? sve_u1 : gnu_u1; /* { dg-error {used vector type where scalar is required} } */ ++ gnu_u1 ? gnu_u1 : gnu_u1; /* { dg-error {used vector type where scalar is required} } */ ++ ++ /* Vector built-ins. */ ++ ++ __builtin_shuffle (sve_u1, sve_u1, sve_u1); /* { dg-error {'__builtin_shuffle' last argument must be an integer vector} } */ ++ __builtin_shuffle (sve_u1, gnu_u1, gnu_u1); /* { dg-error {'__builtin_shuffle' arguments must be vectors} } */ ++ __builtin_shuffle (gnu_u1, sve_u1, gnu_u1); /* { dg-error {'__builtin_shuffle' arguments must be vectors} } */ ++ __builtin_shuffle (gnu_u1, gnu_u1, sve_u1); /* { dg-error {'__builtin_shuffle' last argument must be an integer vector} } */ ++ __builtin_shuffle (gnu_u1, gnu_u1, gnu_u1); ++ ++ __builtin_convertvector (sve_u1, svuint8_t); /* { dg-error {'__builtin_convertvector' first argument must be an integer or floating vector} } */ ++ __builtin_convertvector (gnu_u1, svuint8_t); /* { dg-error {'__builtin_convertvector' second argument must be an integer or floating vector type} } */ ++ __builtin_convertvector (sve_u1, gnu_uint8_t); /* { dg-error {'__builtin_convertvector' first argument must be an integer or floating vector} } */ ++ __builtin_convertvector (gnu_u1, gnu_uint8_t); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/sizeless-1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/sizeless-1.c +new file mode 100644 +index 000000000..ec892a3fc +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/sizeless-1.c +@@ -0,0 +1,217 @@ ++/* { dg-options "-std=gnu99" } */ ++ ++#include ++ ++typedef signed char int8x32_t __attribute__((__vector_size__ (32))); ++ ++/* Sizeless objects with global scope. 
*/ ++ ++svint8_t global_sve_sc; /* { dg-error {SVE type 'svint8_t' does not have a fixed size} } */ ++static svint8_t local_sve_sc; /* { dg-error {SVE type 'svint8_t' does not have a fixed size} } */ ++extern svint8_t extern_sve_sc; /* { dg-error {SVE type 'svint8_t' does not have a fixed size} } */ ++__thread svint8_t tls_sve_sc; /* { dg-error {variables of type 'svint8_t' cannot have thread-local storage duration} } */ ++_Atomic svint8_t atomic_sve_sc; /* { dg-error {SVE type 'svint8_t' does not have a fixed size} } */ ++ ++/* Sizeless arrays. */ ++ ++typedef svint8_t array_type[2]; /* { dg-error {array elements cannot have SVE type 'svint8_t'} } */ ++extern svint8_t extern_array[]; /* { dg-error {array elements cannot have SVE type 'svint8_t'} } */ ++ ++/* Sizeless fields. */ ++ ++struct struct1 { ++ svint8_t a; /* { dg-error {fields cannot have SVE type 'svint8_t'} } */ ++}; ++ ++union union1 { ++ svint8_t a; /* { dg-error {fields cannot have SVE type 'svint8_t'} } */ ++}; ++ ++/* Pointers to sizeless types. */ ++ ++svint8_t *global_sve_sc_ptr; ++svint8_t *invalid_sve_sc_ptr = &(svint8_t) { *global_sve_sc_ptr }; /* { dg-error {initializer element is not constant} } */ ++ /* { dg-error {SVE type 'svint8_t' does not have a fixed size} "" { target *-*-* } .-1 } */ ++ ++/* Sizeless arguments and return values. */ ++ ++void ext_consume_sve_sc (svint8_t); ++void ext_consume_varargs (int, ...); ++svint8_t ext_produce_sve_sc (); ++ ++/* Main tests for statements and expressions. */ ++ ++void ++statements (int n) ++{ ++ /* Local declarations. */ ++ ++ unsigned char va __attribute__((__vector_size__(2))); ++ svint8_t sve_sc1, sve_sc2; ++ _Atomic svint8_t atomic_sve_sc; ++ int8x32_t gnu_sc1; ++ svint16_t sve_sh1; ++ static svint8_t local_static_sve_sc; /* { dg-error {SVE type 'svint8_t' does not have a fixed size} } */ ++ ++ /* Layout queries. */ ++ ++ sizeof (svint8_t); /* { dg-error {SVE type 'svint8_t' does not have a fixed size} } */ ++ sizeof (sve_sc1); /* { dg-error {SVE type 'svint8_t' does not have a fixed size} } */ ++ sizeof (ext_produce_sve_sc ()); /* { dg-error {SVE type 'svint8_t' does not have a fixed size} } */ ++ _Alignof (svint8_t); /* { dg-error {SVE type 'svint8_t' does not have a defined alignment} } */ ++ _Alignof (sve_sc1); /* { dg-error {SVE type 'svint8_t' does not have a defined alignment} } */ ++ _Alignof (ext_produce_sve_sc ()); /* { dg-error {SVE type 'svint8_t' does not have a defined alignment} } */ ++ ++ /* Initialization. */ ++ ++ svint8_t init_sve_sc1 = sve_sc1; ++ svint8_t init_sve_sc2 = sve_sh1; /* { dg-error {incompatible types when initializing type 'svint8_t' using type 'svint16_t'} } */ ++ svint8_t init_sve_sc3 = {}; /* { dg-error {empty scalar initializer} } */ ++ ++ int initi_a = sve_sc1; /* { dg-error {incompatible types when initializing type 'int' using type 'svint8_t'} } */ ++ int initi_b = { sve_sc1 }; /* { dg-error {incompatible types when initializing type 'int' using type 'svint8_t'} } */ ++ ++ /* Compound literals. */ ++ ++ (svint8_t) {}; /* { dg-error {empty scalar initializer} } */ ++ (svint8_t) { sve_sc1 }; ++ ++ (int) { sve_sc1 }; /* { dg-error {incompatible types when initializing type 'int' using type 'svint8_t'} } */ ++ ++ /* Arrays. 
*/ ++ ++ svint8_t array[2]; /* { dg-error {array elements cannot have SVE type 'svint8_t'} } */ ++ svint8_t zero_length_array[0]; /* { dg-error {array elements cannot have SVE type 'svint8_t'} } */ ++ svint8_t empty_init_array[] = {}; /* { dg-error {array elements cannot have SVE type 'svint8_t'} } */ ++ /* { dg-error {empty scalar initializer} "" { target *-*-* } .-1 } */ ++ typedef svint8_t vla_type[n]; /* { dg-error {array elements cannot have SVE type 'svint8_t'} } */ ++ ++ /* Assignment. */ ++ ++ n = sve_sc1; /* { dg-error {incompatible types when assigning to type 'int' from type 'svint8_t'} } */ ++ ++ sve_sc1 = 0; /* { dg-error {incompatible types when assigning to type 'svint8_t' from type 'int'} } */ ++ sve_sc1 = sve_sc2; ++ sve_sc1 = sve_sh1; /* { dg-error {incompatible types when assigning to type 'svint8_t' from type 'svint16_t'} } */ ++ ++ /* Casting. */ ++ ++ (void) sve_sc1; ++ (svint8_t) sve_sc1; ++ ++ /* Addressing and dereferencing. */ ++ ++ svint8_t *sve_sc_ptr = &sve_sc1; ++ int8x32_t *gnu_sc_ptr = &gnu_sc1; ++ sve_sc1 = *sve_sc_ptr; ++ ++ /* Pointer assignment. */ ++ ++ gnu_sc_ptr = sve_sc_ptr; /* { dg-warning {assignment to [^\n]* from incompatible pointer type} } */ ++ sve_sc_ptr = gnu_sc_ptr; /* { dg-warning {assignment to [^\n]* from incompatible pointer type} } */ ++ ++ /* Pointer arithmetic. */ ++ ++ ++sve_sc_ptr; /* { dg-error {arithmetic on pointer to SVE type 'svint8_t'} } */ ++ --sve_sc_ptr; /* { dg-error {arithmetic on pointer to SVE type 'svint8_t'} } */ ++ sve_sc_ptr++; /* { dg-error {arithmetic on pointer to SVE type 'svint8_t'} } */ ++ sve_sc_ptr--; /* { dg-error {arithmetic on pointer to SVE type 'svint8_t'} } */ ++ sve_sc_ptr += 0; /* { dg-error {arithmetic on pointer to SVE type 'svint8_t'} } */ ++ sve_sc_ptr += 1; /* { dg-error {arithmetic on pointer to SVE type 'svint8_t'} } */ ++ sve_sc_ptr -= 0; /* { dg-error {arithmetic on pointer to SVE type 'svint8_t'} } */ ++ sve_sc_ptr -= 1; /* { dg-error {arithmetic on pointer to SVE type 'svint8_t'} } */ ++ sve_sc_ptr - sve_sc_ptr; /* { dg-error {arithmetic on pointer to SVE type 'svint8_t'} } */ ++ gnu_sc_ptr - sve_sc_ptr; /* { dg-error {invalid operands to binary -} } */ ++ sve_sc_ptr - gnu_sc_ptr; /* { dg-error {invalid operands to binary -} } */ ++ sve_sc1 = sve_sc_ptr[0]; /* { dg-error {arithmetic on pointer to SVE type 'svint8_t'} } */ ++ sve_sc1 = sve_sc_ptr[1]; /* { dg-error {arithmetic on pointer to SVE type 'svint8_t'} } */ ++ ++ /* Pointer comparison. 
*/ ++ ++ sve_sc_ptr == &sve_sc1; ++ sve_sc_ptr != &sve_sc1; ++ sve_sc_ptr < &sve_sc1; ++ sve_sc_ptr <= &sve_sc1; ++ sve_sc_ptr > &sve_sc1; ++ sve_sc_ptr >= &sve_sc1; ++ gnu_sc_ptr == sve_sc_ptr; /* { dg-warning {comparison of distinct pointer types lacks a cast} } */ ++ gnu_sc_ptr != sve_sc_ptr; /* { dg-warning {comparison of distinct pointer types lacks a cast} } */ ++ gnu_sc_ptr < sve_sc_ptr; /* { dg-warning {comparison of distinct pointer types lacks a cast} } */ ++ gnu_sc_ptr <= sve_sc_ptr; /* { dg-warning {comparison of distinct pointer types lacks a cast} } */ ++ gnu_sc_ptr > sve_sc_ptr; /* { dg-warning {comparison of distinct pointer types lacks a cast} } */ ++ gnu_sc_ptr >= sve_sc_ptr; /* { dg-warning {comparison of distinct pointer types lacks a cast} } */ ++ sve_sc_ptr == gnu_sc_ptr; /* { dg-warning {comparison of distinct pointer types lacks a cast} } */ ++ sve_sc_ptr != gnu_sc_ptr; /* { dg-warning {comparison of distinct pointer types lacks a cast} } */ ++ sve_sc_ptr < gnu_sc_ptr; /* { dg-warning {comparison of distinct pointer types lacks a cast} } */ ++ sve_sc_ptr <= gnu_sc_ptr; /* { dg-warning {comparison of distinct pointer types lacks a cast} } */ ++ sve_sc_ptr > gnu_sc_ptr; /* { dg-warning {comparison of distinct pointer types lacks a cast} } */ ++ sve_sc_ptr >= gnu_sc_ptr; /* { dg-warning {comparison of distinct pointer types lacks a cast} } */ ++ ++ /* Conditional expressions. */ ++ ++ 0 ? sve_sc1 : sve_sc1; ++ 0 ? sve_sc1 : sve_sh1; /* { dg-error {type mismatch in conditional expression} } */ ++ 0 ? sve_sc1 : 0; /* { dg-error {type mismatch in conditional expression} } */ ++ 0 ? 0 : sve_sc1; /* { dg-error {type mismatch in conditional expression} } */ ++ 0 ?: sve_sc1; /* { dg-error {type mismatch in conditional expression} } */ ++ 0 ? sve_sc_ptr : sve_sc_ptr; ++ 0 ? sve_sc_ptr : gnu_sc_ptr; /* { dg-warning {pointer type mismatch in conditional expression} } */ ++ 0 ? gnu_sc_ptr : sve_sc_ptr; /* { dg-warning {pointer type mismatch in conditional expression} } */ ++ ++ /* Generic associations. */ ++ ++ _Generic (sve_sc1, default: 100); ++ _Generic (1, svint8_t: 10, default: 20); ++ ++ /* Function arguments. */ ++ ++ ext_consume_sve_sc (sve_sc1); ++ ext_consume_sve_sc (sve_sh1); /* { dg-error {incompatible type for argument 1 of 'ext_consume_sve_sc'} } */ ++ ext_consume_varargs (sve_sc1); /* { dg-error {incompatible type for argument 1 of 'ext_consume_varargs'} } */ ++ ext_consume_varargs (1, sve_sc1); ++ ++ /* Function returns. */ ++ ++ ext_produce_sve_sc (); ++ sve_sc1 = ext_produce_sve_sc (); ++ sve_sh1 = ext_produce_sve_sc (); /* { dg-error {incompatible types when assigning to type 'svint16_t' from type 'svint8_t'} } */ ++ ++ /* Varargs processing. */ ++ ++ __builtin_va_list valist; ++ __builtin_va_arg (valist, svint8_t); ++ ++ /* Statement expressions. */ ++ ++ ({ sve_sc1; }); ++ ({ svint8_t another_sve_sc = *sve_sc_ptr; another_sve_sc; }); ++} ++ ++/* Function parameters in definitions. */ ++ ++void ++old_style (input_sve_sc) /* { dg-error {SVE type 'svint8_t' cannot be passed to an unprototyped function} } */ ++ svint8_t input_sve_sc; ++{ ++ svint8_t sve_sc1 = input_sve_sc; ++} ++ ++void ++new_style_param (svint8_t input_sve_sc) ++{ ++ svint8_t sve_sc1 = input_sve_sc; ++} ++ ++/* Function return values in definitions. 
*/ ++ ++svint8_t ++good_return_sve_sc (svint8_t param) ++{ ++ return param; ++} ++ ++svint8_t ++bad_return_sve_sc (svint16_t param) ++{ ++ return param; /* { dg-error {incompatible types when returning type 'svint16_t' but 'svint8_t' was expected} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/sizeless-2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/sizeless-2.c +new file mode 100644 +index 000000000..717439300 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/sizeless-2.c +@@ -0,0 +1,217 @@ ++/* { dg-options "-std=gnu99 -msve-vector-bits=256" } */ ++ ++#include ++ ++typedef signed char int8x32_t __attribute__((__vector_size__ (32))); ++ ++/* Sizeless objects with global scope. */ ++ ++svint8_t global_sve_sc; /* { dg-error {SVE type 'svint8_t' does not have a fixed size} } */ ++static svint8_t local_sve_sc; /* { dg-error {SVE type 'svint8_t' does not have a fixed size} } */ ++extern svint8_t extern_sve_sc; /* { dg-error {SVE type 'svint8_t' does not have a fixed size} } */ ++__thread svint8_t tls_sve_sc; /* { dg-error {variables of type 'svint8_t' cannot have thread-local storage duration} } */ ++_Atomic svint8_t atomic_sve_sc; /* { dg-error {SVE type 'svint8_t' does not have a fixed size} } */ ++ ++/* Sizeless arrays. */ ++ ++typedef svint8_t array_type[2]; /* { dg-error {array elements cannot have SVE type 'svint8_t'} } */ ++extern svint8_t extern_array[]; /* { dg-error {array elements cannot have SVE type 'svint8_t'} } */ ++ ++/* Sizeless fields. */ ++ ++struct struct1 { ++ svint8_t a; /* { dg-error {fields cannot have SVE type 'svint8_t'} } */ ++}; ++ ++union union1 { ++ svint8_t a; /* { dg-error {fields cannot have SVE type 'svint8_t'} } */ ++}; ++ ++/* Pointers to sizeless types. */ ++ ++svint8_t *global_sve_sc_ptr; ++svint8_t *invalid_sve_sc_ptr = &(svint8_t) { *global_sve_sc_ptr }; /* { dg-error {initializer element is not constant} } */ ++ /* { dg-error {SVE type 'svint8_t' does not have a fixed size} "" { target *-*-* } .-1 } */ ++ ++/* Sizeless arguments and return values. */ ++ ++void ext_consume_sve_sc (svint8_t); ++void ext_consume_varargs (int, ...); ++svint8_t ext_produce_sve_sc (); ++ ++/* Main tests for statements and expressions. */ ++ ++void ++statements (int n) ++{ ++ /* Local declarations. */ ++ ++ unsigned char va __attribute__((__vector_size__(2))); ++ svint8_t sve_sc1, sve_sc2; ++ _Atomic svint8_t atomic_sve_sc; ++ int8x32_t gnu_sc1; ++ svint16_t sve_sh1; ++ static svint8_t local_static_sve_sc; /* { dg-error {SVE type 'svint8_t' does not have a fixed size} } */ ++ ++ /* Layout queries. */ ++ ++ sizeof (svint8_t); /* { dg-error {SVE type 'svint8_t' does not have a fixed size} } */ ++ sizeof (sve_sc1); /* { dg-error {SVE type 'svint8_t' does not have a fixed size} } */ ++ sizeof (ext_produce_sve_sc ()); /* { dg-error {SVE type 'svint8_t' does not have a fixed size} } */ ++ _Alignof (svint8_t); /* { dg-error {SVE type 'svint8_t' does not have a defined alignment} } */ ++ _Alignof (sve_sc1); /* { dg-error {SVE type 'svint8_t' does not have a defined alignment} } */ ++ _Alignof (ext_produce_sve_sc ()); /* { dg-error {SVE type 'svint8_t' does not have a defined alignment} } */ ++ ++ /* Initialization. 
*/ ++ ++ svint8_t init_sve_sc1 = sve_sc1; ++ svint8_t init_sve_sc2 = sve_sh1; /* { dg-error {incompatible types when initializing type 'svint8_t' using type 'svint16_t'} } */ ++ svint8_t init_sve_sc3 = {}; /* { dg-error {empty scalar initializer} } */ ++ ++ int initi_a = sve_sc1; /* { dg-error {incompatible types when initializing type 'int' using type 'svint8_t'} } */ ++ int initi_b = { sve_sc1 }; /* { dg-error {incompatible types when initializing type 'int' using type 'svint8_t'} } */ ++ ++ /* Compound literals. */ ++ ++ (svint8_t) {}; /* { dg-error {empty scalar initializer} } */ ++ (svint8_t) { sve_sc1 }; ++ ++ (int) { sve_sc1 }; /* { dg-error {incompatible types when initializing type 'int' using type 'svint8_t'} } */ ++ ++ /* Arrays. */ ++ ++ svint8_t array[2]; /* { dg-error {array elements cannot have SVE type 'svint8_t'} } */ ++ svint8_t zero_length_array[0]; /* { dg-error {array elements cannot have SVE type 'svint8_t'} } */ ++ svint8_t empty_init_array[] = {}; /* { dg-error {array elements cannot have SVE type 'svint8_t'} } */ ++ /* { dg-error {empty scalar initializer} "" { target *-*-* } .-1 } */ ++ typedef svint8_t vla_type[n]; /* { dg-error {array elements cannot have SVE type 'svint8_t'} } */ ++ ++ /* Assignment. */ ++ ++ n = sve_sc1; /* { dg-error {incompatible types when assigning to type 'int' from type 'svint8_t'} } */ ++ ++ sve_sc1 = 0; /* { dg-error {incompatible types when assigning to type 'svint8_t' from type 'int'} } */ ++ sve_sc1 = sve_sc2; ++ sve_sc1 = sve_sh1; /* { dg-error {incompatible types when assigning to type 'svint8_t' from type 'svint16_t'} } */ ++ ++ /* Casting. */ ++ ++ (void) sve_sc1; ++ (svint8_t) sve_sc1; ++ ++ /* Addressing and dereferencing. */ ++ ++ svint8_t *sve_sc_ptr = &sve_sc1; ++ int8x32_t *gnu_sc_ptr = &gnu_sc1; ++ sve_sc1 = *sve_sc_ptr; ++ ++ /* Pointer assignment. */ ++ ++ gnu_sc_ptr = sve_sc_ptr; ++ sve_sc_ptr = gnu_sc_ptr; ++ ++ /* Pointer arithmetic. */ ++ ++ ++sve_sc_ptr; /* { dg-error {arithmetic on pointer to SVE type 'svint8_t'} } */ ++ --sve_sc_ptr; /* { dg-error {arithmetic on pointer to SVE type 'svint8_t'} } */ ++ sve_sc_ptr++; /* { dg-error {arithmetic on pointer to SVE type 'svint8_t'} } */ ++ sve_sc_ptr--; /* { dg-error {arithmetic on pointer to SVE type 'svint8_t'} } */ ++ sve_sc_ptr += 0; /* { dg-error {arithmetic on pointer to SVE type 'svint8_t'} } */ ++ sve_sc_ptr += 1; /* { dg-error {arithmetic on pointer to SVE type 'svint8_t'} } */ ++ sve_sc_ptr -= 0; /* { dg-error {arithmetic on pointer to SVE type 'svint8_t'} } */ ++ sve_sc_ptr -= 1; /* { dg-error {arithmetic on pointer to SVE type 'svint8_t'} } */ ++ sve_sc_ptr - sve_sc_ptr; /* { dg-error {arithmetic on pointer to SVE type 'svint8_t'} } */ ++ gnu_sc_ptr - sve_sc_ptr; /* { dg-error {arithmetic on pointer to SVE type 'svint8_t'} } */ ++ sve_sc_ptr - gnu_sc_ptr; /* { dg-error {arithmetic on pointer to SVE type 'svint8_t'} } */ ++ sve_sc1 = sve_sc_ptr[0]; /* { dg-error {arithmetic on pointer to SVE type 'svint8_t'} } */ ++ sve_sc1 = sve_sc_ptr[1]; /* { dg-error {arithmetic on pointer to SVE type 'svint8_t'} } */ ++ ++ /* Pointer comparison. 
*/ ++ ++ sve_sc_ptr == &sve_sc1; ++ sve_sc_ptr != &sve_sc1; ++ sve_sc_ptr < &sve_sc1; ++ sve_sc_ptr <= &sve_sc1; ++ sve_sc_ptr > &sve_sc1; ++ sve_sc_ptr >= &sve_sc1; ++ gnu_sc_ptr == sve_sc_ptr; ++ gnu_sc_ptr != sve_sc_ptr; ++ gnu_sc_ptr < sve_sc_ptr; ++ gnu_sc_ptr <= sve_sc_ptr; ++ gnu_sc_ptr > sve_sc_ptr; ++ gnu_sc_ptr >= sve_sc_ptr; ++ sve_sc_ptr == gnu_sc_ptr; ++ sve_sc_ptr != gnu_sc_ptr; ++ sve_sc_ptr < gnu_sc_ptr; ++ sve_sc_ptr <= gnu_sc_ptr; ++ sve_sc_ptr > gnu_sc_ptr; ++ sve_sc_ptr >= gnu_sc_ptr; ++ ++ /* Conditional expressions. */ ++ ++ 0 ? sve_sc1 : sve_sc1; ++ 0 ? sve_sc1 : sve_sh1; /* { dg-error {type mismatch in conditional expression} } */ ++ 0 ? sve_sc1 : 0; /* { dg-error {type mismatch in conditional expression} } */ ++ 0 ? 0 : sve_sc1; /* { dg-error {type mismatch in conditional expression} } */ ++ 0 ?: sve_sc1; /* { dg-error {type mismatch in conditional expression} } */ ++ 0 ? sve_sc_ptr : sve_sc_ptr; ++ 0 ? sve_sc_ptr : gnu_sc_ptr; ++ 0 ? gnu_sc_ptr : sve_sc_ptr; ++ ++ /* Generic associations. */ ++ ++ _Generic (sve_sc1, default: 100); ++ _Generic (1, svint8_t: 10, default: 20); ++ ++ /* Function arguments. */ ++ ++ ext_consume_sve_sc (sve_sc1); ++ ext_consume_sve_sc (sve_sh1); /* { dg-error {incompatible type for argument 1 of 'ext_consume_sve_sc'} } */ ++ ext_consume_varargs (sve_sc1); /* { dg-error {incompatible type for argument 1 of 'ext_consume_varargs'} } */ ++ ext_consume_varargs (1, sve_sc1); ++ ++ /* Function returns. */ ++ ++ ext_produce_sve_sc (); ++ sve_sc1 = ext_produce_sve_sc (); ++ sve_sh1 = ext_produce_sve_sc (); /* { dg-error {incompatible types when assigning to type 'svint16_t' from type 'svint8_t'} } */ ++ ++ /* Varargs processing. */ ++ ++ __builtin_va_list valist; ++ __builtin_va_arg (valist, svint8_t); ++ ++ /* Statement expressions. */ ++ ++ ({ sve_sc1; }); ++ ({ svint8_t another_sve_sc = *sve_sc_ptr; another_sve_sc; }); ++} ++ ++/* Function parameters in definitions. */ ++ ++void ++old_style (input_sve_sc) /* { dg-error {SVE type 'svint8_t' cannot be passed to an unprototyped function} } */ ++ svint8_t input_sve_sc; ++{ ++ svint8_t sve_sc1 = input_sve_sc; ++} ++ ++void ++new_style_param (svint8_t input_sve_sc) ++{ ++ svint8_t sve_sc1 = input_sve_sc; ++} ++ ++/* Function return values in definitions. */ ++ ++svint8_t ++good_return_sve_sc (svint8_t param) ++{ ++ return param; ++} ++ ++svint8_t ++bad_return_sve_sc (svint16_t param) ++{ ++ return param; /* { dg-error {incompatible types when returning type 'svint16_t' but 'svint8_t' was expected} } */ ++} +diff --git a/gcc/tree-core.h b/gcc/tree-core.h +index 26b6f46ad..fca4abf2a 100644 +--- a/gcc/tree-core.h ++++ b/gcc/tree-core.h +@@ -1585,7 +1585,8 @@ struct GTY(()) tree_type_common { + unsigned warn_if_not_align : 6; + unsigned typeless_storage : 1; + unsigned empty_flag : 1; +- unsigned spare : 17; ++ unsigned indivisible_p : 1; ++ unsigned spare : 16; + + alias_set_type alias_set; + tree pointer_to; +diff --git a/gcc/tree.c b/gcc/tree.c +index 62607c63a..33e8dca2a 100644 +--- a/gcc/tree.c ++++ b/gcc/tree.c +@@ -15146,6 +15146,21 @@ max_object_size (void) + return TYPE_MAX_VALUE (ptrdiff_type_node); + } + ++/* A wrapper around TARGET_VERIFY_TYPE_CONTEXT that makes the silent_p ++ parameter default to false and that weeds out error_mark_node. 
*/ ++ ++bool ++verify_type_context (location_t loc, type_context_kind context, ++ const_tree type, bool silent_p) ++{ ++ if (type == error_mark_node) ++ return true; ++ ++ gcc_assert (TYPE_P (type)); ++ return (!targetm.verify_type_context ++ || targetm.verify_type_context (loc, context, type, silent_p)); ++} ++ + #if CHECKING_P + + namespace selftest { +diff --git a/gcc/tree.h b/gcc/tree.h +index 356a9f544..97d18fc2b 100644 +--- a/gcc/tree.h ++++ b/gcc/tree.h +@@ -704,6 +704,11 @@ extern void omp_clause_range_check_failed (const_tree, const char *, int, + /* Used to indicate that this TYPE represents a compiler-generated entity. */ + #define TYPE_ARTIFICIAL(NODE) (TYPE_CHECK (NODE)->base.nowarning_flag) + ++/* True if the type is indivisible at the source level, i.e. if its ++ component parts cannot be accessed directly. This is used to suppress ++ normal GNU extensions for target-specific vector types. */ ++#define TYPE_INDIVISIBLE_P(NODE) (TYPE_CHECK (NODE)->type_common.indivisible_p) ++ + /* In an IDENTIFIER_NODE, this means that assemble_name was called with + this string as an argument. */ + #define TREE_SYMBOL_REFERENCED(NODE) \ diff --git a/adjust-vector-cost-and-move-EXTRACT_LAST_REDUCTION-costing.patch b/adjust-vector-cost-and-move-EXTRACT_LAST_REDUCTION-costing.patch new file mode 100644 index 0000000..6ee3d33 --- /dev/null +++ b/adjust-vector-cost-and-move-EXTRACT_LAST_REDUCTION-costing.patch @@ -0,0 +1,88 @@ +This backport contains 2 patchs from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + +4bf29d15f2e01348a45a1f4e1a135962f123fdd6 +0001-AArch64-PR79262-Adjust-vector-cost.patch + +27071013521b015d17a2666448f27a6ff0c55aca +0001-Move-EXTRACT_LAST_REDUCTION-costing-to-vectorizable_.patch + +diff -Nurp a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c +--- a/gcc/config/aarch64/aarch64.c 2020-11-20 04:36:33.988000000 +0800 ++++ b/gcc/config/aarch64/aarch64.c 2020-11-20 04:32:20.984000000 +0800 +@@ -448,7 +448,7 @@ static const struct cpu_vector_cost gene + 1, /* vec_int_stmt_cost */ + 1, /* vec_fp_stmt_cost */ + 2, /* vec_permute_cost */ +- 1, /* vec_to_scalar_cost */ ++ 2, /* vec_to_scalar_cost */ + 1, /* scalar_to_vec_cost */ + 1, /* vec_align_load_cost */ + 1, /* vec_unalign_load_cost */ +diff -Nurp a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c +--- a/gcc/tree-vect-loop.c 2020-11-20 04:36:34.016000000 +0800 ++++ b/gcc/tree-vect-loop.c 2020-11-20 04:32:20.984000000 +0800 +@@ -3926,8 +3926,11 @@ vect_model_reduction_cost (stmt_vec_info + + code = gimple_assign_rhs_code (orig_stmt_info->stmt); + +- if (reduction_type == EXTRACT_LAST_REDUCTION +- || reduction_type == FOLD_LEFT_REDUCTION) ++ if (reduction_type == EXTRACT_LAST_REDUCTION) ++ /* No extra instructions are needed in the prologue. The loop body ++ operations are costed in vectorizable_condition. */ ++ inside_cost = 0; ++ else if (reduction_type == FOLD_LEFT_REDUCTION) + { + /* No extra instructions needed in the prologue. 
*/ + prologue_cost = 0; +diff -Nurp a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c +--- a/gcc/tree-vect-stmts.c 2020-11-20 04:36:33.996000000 +0800 ++++ b/gcc/tree-vect-stmts.c 2020-11-20 04:32:20.984000000 +0800 +@@ -859,7 +859,8 @@ vect_model_simple_cost (stmt_vec_info st + enum vect_def_type *dt, + int ndts, + slp_tree node, +- stmt_vector_for_cost *cost_vec) ++ stmt_vector_for_cost *cost_vec, ++ vect_cost_for_stmt kind = vector_stmt) + { + int inside_cost = 0, prologue_cost = 0; + +@@ -906,7 +907,7 @@ vect_model_simple_cost (stmt_vec_info st + } + + /* Pass the inside-of-loop statements to the target-specific cost model. */ +- inside_cost += record_stmt_cost (cost_vec, ncopies, vector_stmt, ++ inside_cost += record_stmt_cost (cost_vec, ncopies, kind, + stmt_info, 0, vect_body); + + if (dump_enabled_p ()) +@@ -9194,15 +9195,18 @@ vectorizable_condition (stmt_vec_info st + " EXTRACT_LAST_REDUCTION.\n"); + LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false; + } +- if (expand_vec_cond_expr_p (vectype, comp_vectype, +- cond_code)) +- { +- STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type; +- vect_model_simple_cost (stmt_info, ncopies, dts, ndts, slp_node, +- cost_vec); +- return true; +- } +- return false; ++ ++ vect_cost_for_stmt kind = vector_stmt; ++ if (reduction_type == EXTRACT_LAST_REDUCTION) ++ /* Count one reduction-like operation per vector. */ ++ kind = vec_to_scalar; ++ else if (!expand_vec_cond_expr_p (vectype, comp_vectype, cond_code)) ++ return false; ++ ++ STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type; ++ vect_model_simple_cost (stmt_info, ncopies, dts, ndts, slp_node, ++ cost_vec, kind); ++ return true; + } + + /* Transform. */ diff --git a/avoid-cycling-on-vertain-subreg-reloads.patch b/avoid-cycling-on-vertain-subreg-reloads.patch index 709d68d..f4139ac 100644 --- a/avoid-cycling-on-vertain-subreg-reloads.patch +++ b/avoid-cycling-on-vertain-subreg-reloads.patch @@ -1,3 +1,9 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + +0001-lra-Avoid-cycling-on-certain-subreg-reloads-PR96796.patch +6001db79c477b03eacc7e7049560921fb54b7845 + diff -uprN a/gcc/lra-constraints.c b/gcc/lra-constraints.c --- a/gcc/lra-constraints.c 2020-03-12 19:07:21.000000000 +0800 +++ b/gcc/lra-constraints.c 2020-09-08 10:02:52.308147305 +0800 diff --git a/bf16-and-matrix-characteristic.patch b/bf16-and-matrix-characteristic.patch new file mode 100644 index 0000000..8f9e252 --- /dev/null +++ b/bf16-and-matrix-characteristic.patch @@ -0,0 +1,466067 @@ +This backport contains 309 patchs from gcc main stream tree. +The commit id of these patchs list as following in the order of time. 
+ +0001-re-PR-target-89261-ix86_data_alignment-has-wrong-arg.patch +f8b906a2de3044f1dea753b182c244a1a560d40e + +0002-Fix-Wenum-compare-switch-warning-in-i386.c.patch +791536baadc9f469ec8eef2d7213c6f6091c5fa9 + +0003-Prefer-to-use-strlen-call-instead-of-inline-expansio.patch +786e0e5239529de9a4254fe8411a0e8f843e721a + +0004-Enhance-target-and-target_clone-error-messages.patch +cc2a672a60ff7476b3e4751ba41cb77c7fc85b09 + +0005-re-PR-middle-end-88963-gcc-generates-terrible-code-f.patch +a7eb97ad269b6509bd7b31ca373daea98e4d7e85 + +0006-Split-i386.c.patch +2bf6d93547e516b6b2b2051c0fb1b47ea4acc8a4 + +0007-Split-part-of-functionality-from-lto.c-to-lto-common.patch +a79420f995764129dc40d1abcbf8ce75a0b0f906 + +0008-Error-only-when-a-non-default-mabi-is-used-with-sani.patch +080629d32eca5ea202479022f0bd429a813be7c4 + +0009-This-patch-adds-support-to-vectorize-sum-of-abslolut.patch +a9fad8fe6c84de272f2a56d462e67d53c9f4a73d + +0010-cfgexpand.c-asm_clobber_reg_is_valid-Reject-clobbers.patch +0a59215131c02dee4c8829f93d1ee678647614da + +0011-re-PR-tree-optimization-90395-ICE-verify_flow_info-f.patch +362e280d10c61bec13c1d02c11a1c4ac0846db7e + +0012-re-PR-c-59813-tail-call-elimination-didn-t-fire-for-.patch +b5b9147d35ee509714c34d813c7723bf18bb7b7a + +0013-Accept-code-attributes-as-rtx-codes-in-.md-files.patch +75df257b38bd4cdcb750fc893c5023363230cfe8 + +0014-x86-fix-pr82920.patch +0f8768f73440b040707deafd254d189c2887d00d + +0015-2019-05-14-Przemyslaw-Wirkus-przemyslaw.wirkus-arm.c.patch +a52cf5cf278e4a9e58bfa2bb67a93244766a122f + +0016-re-PR-tree-optimization-88828-Inefficient-update-of-.patch +962372f9f853c582c879f11c0db14973cc8687e0 + +0017-re-PR-tree-optimization-88828-Inefficient-update-of-.patch +595ffc073bf5b1753e3a18dfa704391ad5fad626 + +0018-gcc-move-assemble_start_function-assemble_end_functi.patch +f7430263c07b4a1bcf3deb708c8c691f233fcb40 + +0019-trans.c-check_inlining_for_nested_subprog-Quote-rese.patch +a9c697b88395a0f2b175ac30c59bd8c0c22d0db1 + +0020-gcc-aarch64-move-assemble_start_function-assemble_en.patch +6b5777c6c7059b6b8e372e567a74bdccb59a02c3 + +0021-gimple-match-head.c-Include-vec-perm-indices.h.patch +ebd733a78ccf5792067e94852c6c81a5f9aa0020 + +0022-i386-Fold-__builtin_ia32_shufpd-to-VEC_PERM_EXPR.patch +4d508751f421491052bc1d83150344e6cba30b3b + +0023-aarch64-Introduce-flags-for-SVE2.patch +28108a5341653568e9ebc49ea755ff93cc1e1711 + +0024-aarch64-Change-two-function-declaration-types.patch +1ec77eedd529f81b1dc99cda9818f1ef9e952b96 + +0025-PATCH-3-3-GCC-AARCH64-Add-support-for-pointer-authen.patch +8fc16d725206f2c40bae423d7d0d93bd1baf6da2 + +0026-This-patch-implements-the-u-avgM3_floor-and-u-avgM3_.patch +0617e23c9531373d3b232152c0d81a2c707858d9 + +0027-tree-ssa-alias-access-spath-1.c-new-testcase.patch +987c9fc581ffb04d5ab7a782bb7aee6205c45663 + +0028-PATCH-GCC-AARCH64-Fix-libstdc-build-failure-after-r2.patch +0e2e15abd0765c1866f36f0312f77c9595e7fdec + +0029-aarch64-add-support-for-fabd-in-sve.patch +3db85990dbde7f9c8212fe0fb8a241c5d2993198 + +0030-New-.md-construct-define_insn_and_rewrite.patch +f4fde1b378ad68fb2dec6719ed26c1b901488e03 + +0031-re-PR-target-88837-SVE-Poor-vector-construction-code.patch +3a0afad0d212b3ff213b393728e018caf2daa526 + +0032-AArch64-Emit-TARGET_DOTPROD-specific-sequence-for-us.patch +72215009a9f9827397a4eb74e9341b2b7dc658df + +0033-AARCH64-ILP32-Fix-aarch64_asan_shadow_offset.patch +10078f3e1d0cbebc5e6f7f4821d3ad41421ef1e0 + +0034-Make-SRA-re-construct-orginal-memory-accesses-when-e.patch +3b47da42de621c6c3bf7d2f9245df989aa7eb5a1 + 
+0035-Fix-fwprop-call-to-call-to-paradoxical_subreg_p.patch +6c202d9dc65833e04e35f566c645fde8278c1a24 + +0036-init_1.c-Remove-options-O2-fno-schedule-insns-and-in.patch +3a9debbd7660bafbd7658c9e843eddbac8980188 + +0037-iterators.md-ADDSUB-Fix-typo-in-comment.patch +dd550c996578ea7e94f3a59e57f24636186fbb95 + +0038-re-PR-target-88834-SVE-Poor-addressing-mode-choices-.patch +fa9863e7d34ecd011ae75083be2ae124e5831b64 + +0039-Darwin-The-need-for-picsym-stubs-is-dependent-on-lin.patch +ce3a201593d0ed5b606360c064778de34b5b04ef + +0040-netbsd-aarch64-add-netbsd-aarch64-target.patch +f32f75858a14e7b304df7a71dae15d75081b0deb + +0041-Vectorizer-Support-masking-fold-left-reductions.patch +bce29d65ebe1316d15ec7582a1d257ef1be163f7 + +0042-Darwin-The-need-for-FDE-symbols-is-dependent-on-link.patch +dbe89f49da468fbd42a27bdb7b8f06de76a871b4 + +0043-AArch64-Simplify-SVE-IFN_COND-patterns.patch +32cf949cec180799d3fb14d405772ea35b5aafd3 + +0044-AArch64-Factor-out-ptrue-predicate-creation.patch +16de3637c4df37e0203b3ad52b238887e6ca38fc + +0045-AArch64-Factor-out-pfalse-predicate-creation.patch +e7053b0c7cf3f1cd8a23cc71e7e36ec29c46b217 + +0046-AArch64-Tabify-aarch64-sve.md.patch +ea403d8bb5129632aac4d2f270566d2d0073a8ae + +0047-AArch64-Add-a-new-CC-mode-for-SVE-conditions.patch +57d6f4d04d438522dc03488ca31f71b4b7b904c8 + +0048-aarch64-Refactor-common-errata-work-around-specs.patch +91bed1a15a6dfb891b9658532b49f9488b5537f4 + +0049-objective-c-c-testsuite-Fix-stubify-tests-for-fnext-.patch +b7a0332ccd21c04a37535c97f04abc4bc28fb321 + +0050-builtins.c-get_memory_rtx-Fix-comment.patch +76715c3216cf6ccd071fc852920af55d6b0054ae + +0051-Use-alternative_mask-for-add_insn_allocno_copies.patch +73bb8fe9e915cf3219f16afdc61c308c08aa7659 + +0052-Simplify-ira_setup_alts.patch +06a65e803ed06f3ad1fd8e5f90db03aa0a7e5414 + +0053-Make-ira_get_dup_out_num-handle-more-cases.patch +ed680e2cc18c73f90e6bfbd3f346a8820476371b + +0054-Allow-earlyclobbers-in-ira_get_dup_out_num.patch +ae5569fa33c9f3286e0b747f8b6607d21a4b9827 + +0055-Use-ira_setup_alts-for-conflict-detection.patch +6de20b9d7a1af863fb51b4a783c153ea0092810a + +0056-aarch64-force-frame-pointer-setup-before-tlsdesc-cal.patch +0e510d1824241953c67b38f7a894de7238c23c61 + +0057-AArch64-Remove-constraint-strings-from-define_expand.patch +1bbffb87a9ecc3e27a4074145e55e3315df57b7d + +0058-re-PR-target-88833-SVE-Redundant-moves-for-WHILELO-b.patch +75da268e1a563a1a52389cd2ecee12d07c45a655 + +0059-PATCH-GCC-AARCH64-PR-target-90712-Fix-gcc.dg-rtl-aar.patch +2bdc7dcbbd2eee4f114c09443933cc37a546dbff + +0060-aarch64-redefine-aes-patterns.patch +5169fa77322e36dd4783bc5126185159c35a3584 + +0061-simplify-rtx.c-simplify_unary_operation_1-Use-GET_MO.patch +4faba5c3bc37c0bfceec6b254d76c5d0b3e2fe8b + +0062-Support-multiple-operand-counts-for-.md-patterns.patch +d281492de84960b5885f88fffeeb226650f5141d + +0063-arch64-Fix-ambiguous-.md-attribute-uses.patch +e7ba492a04d0bfef9752cbb16fcce3ffc31bf99f + +0064-Relax-vector_builder-elt-sanity-check.patch +72ab1c51b607dd5446ee24ff9fce9178d6b811cb + +0065-re-PR-target-90723-pr88598-2.c-segfaults-with-msve-v.patch +f2b29269c407f10718bc935b3dd5c7e8641b6847 + +0066-AArch64-Rename-bitperm-to-sve2-bitperm.patch +c10abf530e52972ef708f6e72cf20dd920cd22a2 + +0067-aarch64-add-usra-and-ssra-combine-patterns.patch +462e6f9a932a44ca73715dc5c2960e5b332f63f7 + +0068-config-i386-x86-tune.def-X86_TUNE_AVOID_256FMA_CHAIN.patch +ef893a2a769b18c61953d80670b1db8c27bc44e0 + +0069-i386-options.c-ix86_option_override_internal-Default.patch +105c2795b0d63b2cc5cb224ba066fa8b9a0ad0ff 
+ +0070-Come-up-with-function_decl_type-and-use-it-in-tree_f.patch +cb50701ec2c7abdc48db278802022f7e94675d07 + +0071-cif-code.def-NEVER_CALL-New-code.patch +5ab2422adf894bdf84deed8c7c0557c16d6dca2b + +0072-AArch64-Make-processing-less-fragile-in-config.gcc.patch +3644cadf6a9d5a5cd8e83b0123316cf184fa4e3e + +0073-Implement-more-rtx-vector-folds-on-variable-length-v.patch +4ce6ab6889446984fd7017e2150962eb4550a7ee + +0074-Generalise-VEC_DUPLICATE-folding-for-variable-length.patch +708cc6132bb374e2c5bd1c4f43f9fe7306d20970 + +0075-Add-dg-test-for-matching-function-bodies.patch +4d706ff86ea86868615558e92407674a4f4b4af9 + +0076-Prevent-Og-from-deleting-stores-to-write-only-variab.patch +ec8ac265ff21fb379ac072848561a91e4990c47f + +0077-Don-t-run-DSE-at-Og.patch +c0fe6bce2a8c35e997f45b0a674ab2058ba50ae0 + +0078-Prevent-tree-ssa-dce.c-from-deleting-stores-at-Og.patch +f33b9c40b97f6f8a72ee370068ad81e33d71434e + +0079-re-PR-target-91150-wrong-code-with-O-mavx512vbmi-due.patch +fa2987ed8db073b9d59688363e2dfb6c60f47d70 + +0080-Handle-IFN_COND_MUL-in-tree-ssa-math-opts.c.patch +c1b3d827832f883e0634b18c88eb2bbde335aa42 + +0081-Make-lra-use-per-alternative-earlyclobber-info.patch +a25f3e8efbbc7182fa58c445574848a73856e9b4 + +0082-GCC-AArch64-Enable-Transactional-Memory-Extension.patch +89626179b6fe42cbd58c715808f7c6401879757f + +0083-Add-a-gimple_move_vops-helper-function.patch +779724a5913b4e6a7ccccc0b8b415a772144a067 + +0084-Make-function_code-a-32-bit-field.patch +55f863c4d694deafb968dbf44d08ba49bb7c0766 + +0085-AArch64-Remove-unused-commutative-attribute.patch +871b49afafe043d57f717e70532d66c5a56ca173 + +0086-AArch64-Reorganise-aarch64-sve.md.patch +915d28fe74dbb30352702ab07ea5bf30747043bb + +0087-AArch64-Make-SVE-UNSPEC_COND_-s-match-the-insn-mnemo.patch +cb18e86dd005fe009c536a8bb0aec7aa88ca66df + +0088-AArch64-Remove-redundant-SVE-FADDA-pattern.patch +8ad84de26e1032d80225905c611a47b64a385e8a + +0089-AArch64-Merge-SVE-FP-unary-patterns.patch +d45b20a5539b6f306a559470c3a7e9f84a058bfb + +0090-AArch64-Merge-SVE-FMAXNM-FMINNM-patterns.patch +214c42faa06a9eb1aa7f0296399f28df4fb068ec + +0091-AArch64-Merge-SVE-ternary-FP-operations.patch +0d80d083a2e1d368fcb11eb7ea5490c274f0ea15 + +0092-AArch64-Merge-SVE-reduction-patterns.patch +b0760a40bef3ca690691bf5d214da95b5dc25266 + +0093-AArch64-Prefer-FPRs-over-GPRs-for-CLASTB.patch +801790b37ca817089ecbae214340162e6d94ea6a + +0094-AArch64-Prefer-FPRs-over-GPRs-for-INSR.patch +61ee25b9e7d84fbb18218887d1fecfb10f72993a + +0095-AArch64-Fix-INSR-for-zero-floats.patch +9b6fb97c99abe64147f82a3ea6e6ed598e387482 + +0096-C-Fix-bogus-nested-enum-error-message.patch +99769e7fb6ed153a53174b7f08415eee347655f0 + +0097-AArch64-Make-perm_insn-the-complete-mnemonic.patch +3e2751ce5591dc8f3b5f4ffd3dacf0fb8f789395 + +0098-AArch64-Add-a-y-constraint-for-V0-V7.patch +163b1f6ab2950553e1cc1b39a6b49293b3390e46 + +0099-AArch64-Make-aarch64_classify_vector_mode-use-a-swit.patch +806f69cd68c18399e8e54b1a0913ae57beabbe69 + +0100-AArch64-Make-simd_immediate_info-INDEX-explicit.patch +1da83ccee8e7b61e7777abb63eb0e5a0ff1f1e93 + +0101-AArch64-Use-simd_immediate_info-for-SVE-predicate-co.patch +1044fa32e2b456b59b3cdc31b4f261145f1589cc + +0102-AArch64-Increase-default-function-alignment.patch +4e55aefa3ee19167a41892e4920a3e8c520aee42 + +0103-AArch64-Improve-SVE-constant-moves.patch +4aeb1ba7f62c1d680c819ae3e137c3bad6f520ca + +0104-Darwin-There-is-no-need-to-distinguish-PIC-non-PIC-s.patch +d308419c64c52c2d48bdf53a65e1790a2c897e83 + +0105-Optimise-constant-IFN_WHILE_ULTs.patch 
+0b1fe8cf6f1dde656c505dde6d27279dff388962 + +0106-Protect-some-checks-of-DECL_FUNCTION_CODE.patch +cb1180d547e3b28547134a06ee020163afa59cc3 + +0107-Use-checking-forms-of-DECL_FUNCTION_CODE-PR-91421.patch +4d732405bd91b54c196fdc38191f838bb01f23a6 + +0108-AArch64-Rework-SVE-PTEST-patterns.patch +34467289631e29545e14148515ab5f5d0d9e4fa7 + +0109-AArch64-Canonicalise-SVE-predicate-constants.patch +678faefcab01f9e9eeb222852675b5a042aaf900 + +0110-AArch64-Don-t-rely-on-REG_EQUAL-notes-to-combine-SVE.patch +35d6c5913d2209eb50f48b589b29f0dce13cb9b7 + +0111-AArch64-Use-unspecs-for-remaining-SVE-FP-binary-ops.patch +6fe679cc6be7a55832f9b88a8cf0751e8d5eff6e + +0112-AArch64-Add-a-GP-strictness-operand-to-SVE-FP-unspec.patch +c9c5a8090c58b84c1eb45e39e77eee223f992009 + +0113-AArch64-Commonise-some-SVE-FP-patterns.patch +0254ed7970e64abd82f21aedf9373720a73671c7 + +0114-AArch64-Add-support-for-SVE-HF-vconds.patch +a70965b114281553fa46cac9b8abab543f36793f + +0115-AArch64-Rework-SVE-FP-comparisons.patch +4a942af61c16f38f7fe51ed72a7ac23f73f62f2a + +0116-AArch64-Use-unspecs-for-SVE-conversions-involving-fl.patch +99361551624427aebe7a856a4327e083aa33733a + +0117-AArch64-Rearrange-SVE-conversion-patterns.patch +95eb5537d8bb23b952105b46250ed4fba8766b84 + +0118-AArch64-Use-x-predication-for-SVE-integer-arithmetic.patch +063082768aab23d26e42954eb115b76318f0176d + +0119-AArch64-Rework-SVE-integer-comparisons.patch +00fa90d975bfacfd91a615fbee24e3e6a100100f + +0120-AArch64-Handle-more-SVE-predicate-constants.patch +2803bc3bbca332f53801770715a5b592b2467492 + +0121-AArch64-Use-SVE-ADR-to-optimise-shift-add-sequences.patch +a229966c9c76afe0cf18c566a3c13ddde3878288 + +0122-AArch64-Add-support-for-SVE-CLS-and-CLZ.patch +bca5a9971f47cf5fe79e6595beb762539f200f46 + +0123-AArch64-Add-support-for-SVE-CNOT.patch +e0a0be93d7c2b760779c3085c5abfd0496e3458b + +0124-AArch64-Add-support-for-SVE-SU-MAX-MIN-immediate.patch +f8c22a8bbaf3ef4260f7d8beea22ed151ca4b726 + +0125-AArch64-Add-support-for-SVE-F-MAX-MIN-NM-immediate.patch +75079ddf9cb867576bbef66f3e8370d9fdeea3b8 + +0126-AArch64-Make-more-use-of-SVE-conditional-constant-mo.patch +d29f7dd50de9e8e46f7e247c53f3b0405a3dadd9 + +0127-AArch64-Use-SVE-MOV-M-of-scalars.patch +88a37c4d72899c5a3f5a7b2bca0ae0096f3270a3 + +0128-AArch64-Add-support-for-SVE-absolute-comparisons.patch +42b4e87d317377d6dcbb25ee2523da4a0c42478a + +0129-AArch64-Add-SVE-conditional-integer-unary-patterns.patch +3c9f496337f754f7c22afb46b017871db5844a97 + +0130-AArch64-Add-SVE-conditional-floating-point-unary-pat.patch +b21f7d53095b253753c5622f99809e9c82fd3009 + +0131-AArch64-Add-SVE-conditional-conversion-patterns.patch +c5e16983cd1bd6dd6eca1b939c3c8859f0c6c866 + +0132-AArch64-Use-SVE-UXT-BHW-as-a-form-of-predicated-AND.patch +d113ece60450b2efb07e9057b6d2732b08fee2c4 + +0133-AArch64-Use-SVE-BIC-for-conditional-arithmetic.patch +1b187f36ec16d43d0227805955d8fae51af26970 + +0134-Add-support-for-conditional-shifts.patch +20103c0ea9336d2b5286eb7f2605ace3fd49a431 + +0135-AArch64-Use-SVE-SU-ABD-in-conditional-arithmetic.patch +9730c5ccd522cd955bcb6e65295023621cade8b6 + +0136-AArch64-Use-SVE-FABD-in-conditional-arithmetic.patch +bf30864e4c241e50585745af504b09db55f7f08b + +0137-AArch64-Use-SVE-binary-immediate-instructions-for-co.patch +a19ba9e1b15d248e5a13ee773f4acd4ae29fdeaa + +0138-AArch64-Use-SVE-MLA-MLS-MAD-and-MSB-for-conditional-.patch +b6c3aea1892c148c21f8b87668f344b2397f4aa5 + +0139-AArch64-Add-a-commutativity-marker-to-the-SVE-SU-ABD.patch +9a8d9b3f2422d4885e5c846dee66acf6336e6ccf + 
+0140-aarch64-Use-neoversen1-tuning-struct-for-mcpu-cortex.patch +42418c1f7f5cb3b2f466f88053acc818ddc5cd4d + +0141-AArch64-Use-SVE-reversed-shifts-in-preference-to-MOV.patch +7d1f24018b04c13134bc47619fb8aaa390b01754 + +0142-AArch64-Add-more-unpredicated-MOVPRFX-alternatives.patch +5e176a613ef2eda92aa65736763a562dc42a50fe + +0143-AArch64-Remove-unneeded-FSUB-alternatives-and-add-a-.patch +2ae21bd133c357fcd7b6e06dc7d7d9e0660abe2c + +0144-AArch64-Add-MOVPRFX-alternatives-for-SVE-EXT-pattern.patch +06b3ba23eb6ff965a92cd99d2835d4c29316a447 + +0145-AArch64-Add-more-SVE-FMLA-and-FMAD-z-alternatives.patch +432b29c189a6d26ed701c7518402708b2fcb794f + +0146-AArch64-Rework-SVE-REV-BHW-patterns.patch +d7a09c445a475a95559e8b9f29eb06ad92effa91 + +0147-AArch64-Rework-SVE-INC-DEC-handling.patch +0fdc30bcf56d7b46122d7e67d61b56c0a198f3b3 + +0148-AArch64-Optimise-aarch64_add_offset-for-SVE-VL-const.patch +7d8bdfa7e409821c50f6d8a7b557bd7dc760c4ce + +0149-AArch64-Pass-a-pattern-to-aarch64_output_sve_cnt_imm.patch +139df05a29eb71075e42f502978dea4d00a99708 + +0150-AArch64-Tweak-operand-choice-for-SVE-predicate-AND.patch +2d2388f82f2e7f2fd1da063192ba98be45f099d2 + +0151-AArch64-Fix-predicate-alignment-for-fixed-length-SVE.patch +07108a9ebe4776610bb23f684b3a346d28511bed + +0152-AArch64-Add-a-aarch64_sve_mode_p-query.patch +5c38705dbde776f68bf1f99a71657d0e21b772a5 + +0153-Remove-TARGET_SETUP_INCOMING_VARARG_BOUNDS.patch +06b5889c434b941804d5592cd4fc8946b25c1c4b + +0154-As-discussed-below.patch +1f2a3ac34620ab4669f9f32417a7a4496c8f603a + +0155-AArch64-Use-scvtf-fbits-option-where-appropriate.patch +188d00796f5bd338b9b8ab1cc8ba4b43af8ab8fd + +0156-Add-pass_va_arg_by_reference.patch +fde65a89fad742c2dca8ad50452e482d22f3c1b2 + +0157-Add-must_pass_va_arg_in_stack.patch +4f53599cb5b822cd7f95997861c2e064977ecb6a + +0158-Use-function_arg_info-for-TARGET_ARG_PARTIAL_BYTES.patch +a7c81bc1fb43366ca1b4332d8a6042b648a84cdc + +0159-Use-function_arg_info-for-TARGET_PASS_BY_REFERENCE.patch +52090e4dbd064f486af606e3f8a283dbddc7c18a + +0160-Use-function_arg_info-for-TARGET_SETUP_INCOMING_ARGS.patch +e7056ca417326a70eca05defb6a8b20b737d3417 + +0161-Use-function_arg_info-for-TARGET_FUNCTION_-INCOMING_.patch +6783fdb7057d559aa1da8afa2c15a702c532a03e + +0162-Use-function_arg_info-for-TARGET_FUNCTION_ARG_ADVANC.patch +6930c98c69ad695469ee7daa74b3b6d578afdd0d + +0163-Use-function_arg_info-for-TARGET_CALLEE_COPIES.patch +7256c7194e186fce6ff866a124a77b08196c2a5f + +0164-Use-function_arg_info-for-TARGET_MUST_PASS_IN_STACK.patch +0ffef2005fd7536efbc9c3a572701998c8a8080c + +0165-Add-a-apply_pass_by_reference_rules-helper.patch +b12cdd6e8e8dd1f39a941b731ba1056d656a094f + +0166-re-PR-target-88839-SVE-Poor-implementation-of-blend-.patch +9556ef20164e69d094f5a3e1af262dbb45ed8e3a + +0167-aarch64-sve.md-vcond_mask-Add.patch +b1c9ec725da365165ce4c2fdf63daa33b7d86649 + +0168-aarch64-add-intrinsics-for-vld1-q-_x4-and-vst1-q-_x4.patch +391625888d4d97f9016ab9ac04acc55d81f0c26f + +0169-arm-aarch64-Add-comments-warning-that-stack-protecto.patch +a7e73b4158f528600ef97aca29201ddc92b3439f + +0170-AArch64-Add-Linux-hwcap-strings-for-some-extensions.patch +75f935365dba3eb5e9cbd11bc0d75009cad3d019 + +0171-AArch64-Add-support-for-missing-CPUs.patch +e0664b7a63ed8305e9f8539309df7fb3eb13babe + +0172-AArch64-Implement-ACLE-intrinsics-for-FRINT-32-64-Z-.patch +10bd1d964ef12daa9f92ff0b8d1e5f600aa63f7b + +0173-AArch64-Add-support-for-__jcvt-intrinsic.patch +e1d5d19ec4f84b67ac693fef5b2add7dc9cf056d + +0174-Remove-bt-load.c.patch +f78f73cbd284abe4f1718fd7803f5f98800de225 
+ +0175-Simplify-the-implementation-of-HARD_REG_SET.patch +504279ae0a0ce28ad37f820dcdb7f6557aabef7c + +0176-Make-note_stores-take-an-rtx_insn.patch +e8448ba5300e32917fb12f877ae40711c2b452a3 + +0177-Remove-COPY_HARD_REG_SET.patch +6576d245386e2ce52df274ef8f2ffed81cfaa1c3 + +0178-Remove-COMPL_HARD_REG_SET.patch +50b3f54d551787e0a066451ef60ef3b055a893e6 + +0179-Remove-AND_HARD_REG_SET.patch +dc333d8ff60909dbed89126443e3024f1592f8a4 + +0180-Remove-IOR_HARD_REG_SET.patch +44942965f4eae141bd1f8300e7f77d0c9a3936e4 + +0181-Remove-AND_COMPL_HARD_REG_SET.patch +d15e5131845e2a68513230a624839ef5abcda690 + +0182-Remove-IOR_COMPL_HARD_REG_SET.patch +4897c5aaa7a5db4c1ece28ef66acb3d5e41787b3 + +0183-Remove-hard_reg_set_equal_p.patch +a85796511b2b7985f79331c996761f7a87cb8116 + +0184-Tweak-interface-to-ira-build.c-ior_hard_reg_conflict.patch +75f4e3a1b322e16a1aca28bd0ced9af57cb0a683 + +0185-Add-fast-conversions-from-arrays-to-bitmaps.patch +148909bc700e4f52aa582346a29abc5bc51a9bda + +0186-Remove-global-REG_SETs.patch +0b0310e9a0e0d553bbe9f961c52e0851328aa8b0 + +0187-Remove-call_fixed_reg_set.patch +df1f0eef67939274e9ddd3df426e8dfc5184086b + +0188-Remove-no_caller_save_reg_set.patch +026116ce2a4dedad81518b0ca89dd8243b545778 + +0189-Replace-call_used_reg_set-with-call_used_or_fixed_re.patch +a5647ae846f6765f12a359acba6a71fc12254fa8 + +0190-Add-call_used_or_fixed_reg_p.patch +a365fa0636886aeda83e57b84d837cfba13597fe + +0191-Hide-call_used_regs-in-target-independent-code.patch +53bee79caba4fb88acbcd9bad7891ea45b5511e3 + +0192-Remove-call_really_used_regs.patch +d7fb4c3162307590c0babddcea4fb60c07a7c033 + +0193-Vectorise-multiply-high-with-scaling-operations-PR-8.patch +58cc98767aa1d8136d36467b892dc4adaf427acc + +0194-arm-aarch64-Make-no_insn-issue-to-nothing.patch +f62281dc1b3d751977266d8c30b4488833fcb9dd + +0195-Two-more-POLY_INT-cases-for-dwarf2out.c.patch +ef20d2215067b1bfa8b3f9549ca0baed636a94a0 + +0196-Handle-variable-length-vectors-in-compute_record_mod.patch +defc6f266c1dd625cc64ad1ecfbd1eacbcd66e4f + +0197-Don-t-treat-variable-length-vectors-as-VLAs-during-g.patch +22b6299199da4efd3944cdaabca1d095d19ff901 + +0198-Make-get_value_for_expr-check-for-INTEGER_CSTs.patch +01b57ebf58b8cc0d16db827d1d9aa5f10da23cce + +0199-aarch64-Extend-R-for-integer-registers.patch +e3f15286d1129de2cceee6acd5d5584cb5422db6 + +0200-aarch64-Implement-TImode-compare-and-swap.patch +4a2095ebace8534038ce2adf4ae94bfc854066c4 + +0201-aarch64-Tidy-aarch64_split_compare_and_swap.patch +b7e560deb37e38fb224a0cf108e15df4a717167a + +0202-aarch64-Implement-moutline-atomics.patch +3950b229a5ed6710f30241c2ddc3c74909bf4740 + +0203-Rework-constant-subreg-folds-and-handle-more-variabl.patch +f24f4c15884bf1ee65a10e2f959842eec4198876 + +0204-Extend-neg_const_int-simplifications-to-other-const-.patch +681fc0fa40cc4f018cb691d796aa819a24257774 + +0205-Avoid-adding-impossible-copies-in-ira-conflicts.c-pr.patch +9f635bd13fe9e85872e441b6f3618947f989909a + +0206-AArch64-Fix-memmodel-index-in-aarch64_store_exclusiv.patch +3a30d2558b3a199fe346479e6140cddae7fba5ed + +0207-AArch64-Use-implementation-namespace-consistently-in.patch +9a3afc3564b36fb34826899a345a9c35b1c53e39 + +0208-C-C-Allow-targets-to-check-calls-to-BUILT_IN_MD-func.patch +c6447c2014b76b5c077a07712a7f0b0aaa2e14d4 + +0209-AArch64-Split-built-in-function-codes-into-major-and.patch +6d4d616a782d5be693ea9575f69d5ebf450be090 + +0210-AArch64-Strengthen-aarch64_hard_regno_call_part_clob.patch +51051f474a768d285714d713f1b7535d6a139350 + +0211-Add-function_abi.-h-cc.patch 
+bd785b44932274f7067105de417938597289962c + +0212-Add-a-target-hook-for-getting-an-ABI-from-a-function.patch +002ffd3caa684c3eb30f8f53206439b7aa34b370 + +0213-Add-a-function-for-getting-the-ABI-of-a-call-insn-ta.patch +5a5a3bc5fa14664be26748c11325021b6b6f8e74 + +0214-Pass-an-ABI-identifier-to-hard_regno_call_part_clobb.patch +6ee2cc70024253d2670a4a317158b2a65251a1d1 + +0215-Remove-global-call-sets-DF-entry-exit-defs.patch +559c1ae100489da76a0283750361ace146fdeb77 + +0216-Remove-global-call-sets-IRA.patch +6c47622219d6386807b26890dcdc84f192499d33 + +0217-Remove-global-call-sets-LRA.patch +a1e6ee38e708ef2bdef4dfbb99473344bd56fa2f + +0218-Remove-global-call-sets-regrename.c.patch +0ce77f463d1d150e70a91807502d628492ca7ae5 + +0219-Make-ira-call-df_set_regs_ever_live-for-extra-call-c.patch +6d1e98dfd2bfce30640d71df355bedf114229744 + +0220-AArch64-Allow-shrink-wrapping-of-non-leaf-vector-PCS.patch +ce9d2a37f2db20328286f5d3d5a13a4e765c59f7 + +0221-AArch64-Make-more-use-of-function_abi.patch +dcdd0f055731a8c960a15e5de8715d041d9a7876 + +0222-AArch64-SVE-Utilize-ASRD-instruction-for-division-an.patch +c0c2f013906a695b8a02226f119649a370d9e083 + +0223-AArch64-Make-call-insns-record-the-callee-s-arm_pcs.patch +08cc4d925f640c3cd0336bae4dc6004244a5c80a + +0224-AArch64-Use-calls-for-SVE-TLSDESC.patch +bb6ce448fc194cca8e51aea274a1b2408c7746c3 + +0225-Remove-clobber_high.patch +17d184e5c4896264c27c27d125a6c1f8462d9d37 + +0226-C-Improve-diagnostics-for-vector-types.patch +8209db250f305cc79fd751c3ed056fb9ff551a83 + +0227-invoke.texi-early-inlining-insns-O2-Document.patch +0b92cf305dcf34387a8e2564e55ca8948df3b47a + +0228-cif-code.def-MAX_INLINE_INSNS_SINGLE_O2_LIMIT-.-New.patch +562d1e9556777988ae46c5d1357af2636bc272ea + +0229-Fix-EXECUTE_IF_SET_IN_HARD_REG_SET-use.patch +1c8264003ab1d6932d874bd1a9af4ac498d4b4a4 + +0230-Use-CONSTEXPR-in-machmode.h.patch +ad00d6c1746fdcbfd86b2d50f2500d7ccb0d1691 + +0231-pretty-print-support-URL-escape-sequences-PR-87488.patch +d26082357676a3c3843595dfe88a6c682b56e334 + +0232-Relax-store_bit_field-call-in-store_expr.patch +8b27c9052b8d191c98686e77d2fa610390c78f32 + +0233-Darwin-machopic-8-n-Back-out-part-of-PR71767-fix.patch +f922d945244558904be6868dc036c31fd05750dd + +0234-Add-expr_callee_abi.patch +63d25773e166e2e3babe626a5800e70939844754 + +0235-AArch64-Use-frame-reference-in-aarch64_layout_frame.patch +ab43763e519ed8efbbfdac801d008c338fbcb187 + +0236-AArch64-Add-an-assert-to-aarch64_layout_frame.patch +8e66b377a93e3fc371d0836768740d68ef8fffc5 + +0237-AArch64-Improve-poly_int-handling-in-aarch64_layout_.patch +9b17a646d90ad0cc30daf8432aa60ad0d751d914 + +0238-AArch64-Add-partial-SVE-vector-modes.patch +550a338052c374cb1f6c07ffd883c4046565fdd4 + +0239-AArch64-Fix-symbol-offset-limit.patch +7d3b27ff12610fde9d6c4b56abc70c6ee9b6b3db + +0240-AArch64-SVE2-Support-for-EOR3-and-variants-of-BSL.patch +2d57b12e2acd52b843adbcd6d5909cb0b9f7196b + +0241-re-PR-target-86753-gcc.target-aarch64-sve-vcond_-45-.patch +cc1facefe3b4e3b067d95291a7dba834b830ff18 + +0242-Pass-a-vec_info-to-get_vectype_for_scalar_type.patch +7ed54790da87bbb4a134020a9fb8bd1b72fd0acb + +0243-AArch64-Implement-__rndr-__rndrrs-intrinsics.patch +c5dc215df17071281c21450fa2d584e1161e4bc2 + +0244-re-PR-debug-90231-ivopts-causes-optimized-away-itera.patch +d9eabacb0483ac1f730112d551551c258365f02e + +0245-Add-a-simulate_builin_function_decl-langhook.patch +740785381ec9944c861dcc29b420c96aa933f040 + +0246-Add-a-simulate_enum_decl-langhook.patch +ac2cfa6cc35175311f92c25acbdd244f0f3bbb87 + 
+0247-AArch64-Handle-scalars-in-cmp-and-shift-immediate-qu.patch +6bc67182b6500b942674d6031c1bf0f02c779cbd + +0248-AArch64-Add-FFR-and-FFRT-registers.patch +183bfdafc6f1f98711c5400498a7268cc1441096 + +0249-AArch64-Extend-SVE-reverse-permutes-to-predicates.patch +28350fd1bee1e238e9c57b04c0796e1e17b659e4 + +0250-AArch64-Add-support-for-arm_sve.h.patch +624d0f07d51b7fa8bc99142bd0e8380fb9e7badc + +0251-AArch64-Add-support-for-the-SVE-PCS.patch +c600df9a4060da3c6121ff4d0b93f179eafd69d1 + +0252-AArch64-Add-main-SVE-ACLE-tests.patch +bc73c4c24daec96ad3e7ff904645c3095a4febe9 + +0253-Remove-cgraph_global_info.patch +a62bfab5d2a332925fcf10c45b4c5d8ca499439d + +0254-AArch64-Remove-unused-mode-iterators.patch +ffc111637291037e5546428275e39d8ca16d1fac + +0255-AArch64-Use-aarch64_sve_int_mode-in-SVE-ACLE-code.patch +86194087ce338c8d0073d905eb60dca654d6bba3 + +0256-Add-build_truth_vector_type_for_mode.patch +0a0ef2387cc1561d537d8d949aef9479ef17ba35 + +0257-AArch64-Add-FULL-to-SVE-mode-iterator-names.patch +f75cdd2c4e5282985a6fbdb2e72e17cb77782044 + +0258-LRA-handle-memory-constraints-that-accept-more-than-.patch +1aeffdce2dfe718e1337d75eb4f22c3c300df9bb + +0259-Handle-VIEW_CONVERT_EXPR-for-variable-length-vectors.patch +13c247d6f2a75b7e7a11546e897489716bc31506 + +0260-re-PR-target-90867-Multiplication-or-typecast-of-int.patch +94cdd3b7ceff688d039a9f134013ac9069df2e8c + +0261-re-PR-inline-asm-92615-ICE-in-extract_insn.patch +8d0d7a63019a7d67943d1867348673e3ca3dc824 + +0262-re-PR-tree-optimization-92645-Hand-written-vector-co.patch +1fa715db5490fb44668e0a37f9a5927d9030a50e + +0263-re-PR-tree-optimization-92690-vector-CTOR-optimizati.patch +88feafba3cb5b186d53080c4958474065c4bd5d2 + +0264-target.def-TARGET_VECTORIZE_BUILTIN_CONVERSION-Remov.patch +477daf831aea18923733772d686eb1ed448d96e7 + +0265-re-PR-tree-optimization-92645-Hand-written-vector-co.patch +78307657cf9675bc4aa2e77561c823834714b4c8 + +0266-re-PR-tree-optimization-92715-error-position-plus-si.patch +438d9c4afa635c7a1475feebbc220fe8d335c664 + +0267-re-PR-target-92758-r278833-breaks-gcc.target-powerpc.patch +577f4a0e5e7f7ef9b5729a3eed79e523cba9dfa9 + +0268-re-PR-tree-optimization-92803-error-type-mismatch-in.patch +a3408fa3fbf20455eb3b17b5c78397f9d66065c7 + +0269-Add-ARM-specific-Bfloat-format-support-to-middle-end.patch +d5ffd47e9a739770aa7ef5ad06c07fe9f16a3260 + +0270-re-PR-target-92904-varargs-for-__int128-is-placed-at.patch +46f3e52e834ab0c06902e7424e57513ee6a8aacd + +0271-AArch64-Enable-CLI-for-Armv8.6-a-armv8.6-a-i8mm-and-.patch +a93e1d5c70abe9fba3522318131a352fad0a4f48 + +0272-gcc-testsuite-ChangeLog.patch +9260fb066b7ed0b237a3300e05fca9bffe018c6b + +0273-Add-a-compatible_vector_types_p-target-hook.patch +482b2b43e5101921ad94e51e052a18b353f8a3f5 + +0274-AArch64-Specify-some-SVE-ACLE-functions-in-a-more-ge.patch +99a3b91535cb41807d62478cd769bc1bed0db5df + +0275-AArch64-Rename-SVE-shape-unary_count-to-unary_to_uin.patch +5b052959dcd2e9c390c7de34f806c4b22a66d8f7 + +0276-AArch64-Rename-UNSPEC_WHILE-to-match-instruction-mne.patch +6ad9571b172cd98099b477cba4efdd92c85bd222 + +0277-AArch64-Add-support-for-the-SVE2-ACLE.patch +0a09a9483825233f16e5b26bb0ffee76752339fc + +0278-config.gcc-Add-arm_bf16.h.patch +abbe1ed27355178223cd099fb73227f392416ea6 + +0279-aarch64.c-aarch64_invalid_conversion-New-function-fo.patch +9869896730f3055850034c05c596828d517fa9a2 + +0280-GCC-PATCH-AArch64-Add-ACLE-intrinsics-for-dot-produc.patch +8c197c851e7528baba7cb837f34c05ba2242f705 + +0281-GCC-PATCH-AArch64-Add-ACLE-intrinsics-for-bfdot-for-.patch 
+f275d73a57f1e5a07fbd4978f4b4457a5eaa1e39 + +0282-AArch64-Fix-shrinkwrapping-interactions-with-atomics.patch +e5e07b68187b9aa334519746c45b8cffc5eb7e5c + +0283-AArch64-Enable-CLI-for-Armv8.6-A-f64mm.patch +336e1b950db8b91027cdf0ab33bd905930d7f363 + +0284-AArch64-SVE-Implement-svld1ro-intrinsic.patch +9ceec73fc0e5033049704becef5d79001e31a245 + +0285-AArch64-Obvious-Correct-pattern-target-requirement.patch +568f0f355f259f58688dd73f749f4d80adc10e40 + +0286-AArch64-effective_target-for-aarch64-f64mm-asm.patch +3c9e580511e713068c0ea0d7b34f6e50ebf85447 + +0287-testsuite-Add-target-xfail-argument-to-check-functio.patch +4c33b2daeb5a87aedef77993971db1a1a1c291e6 + +0288-aarch64-Skip-some-SVE-ACLE-function-body-tests-for-I.patch +b02fbed15a36a86dda6a09a8dc237a8d288f6c09 + +0289-i386-Fix-ix86_fold_builtin-shift-folding-PR93418.patch +bff948aa337807260344c83ac9079d6386410094 + +0290-forwprop-Tweak-choice-of-VEC_PERM_EXPR-filler-PR9282.patch +1ee3b380dfb479b335f3b50039ce26abcbffe59a + +0291-SRA-Add-verification-of-accesses.patch +5b9e89c922dc2e7e8b8da644bd3a8917c16b22ac + +0292-SRA-Total-scalarization-after-access-propagation-PR9.patch +636e80eea24b780f1d5f4c14c58fc00001df8508 + +0293-aarch64-Fix-SVE-PCS-failures-for-BE-ILP32.patch +2171a9207f51bc486ed9c502cb4da706f594615e + +0294-aarch64-Add-Armv8.6-SVE-matrix-multiply-support.patch +3669677425f249c163201c4760d05abb3cf4e6bc + +0295-aarch64-Add-svbfloat16_t-support-to-arm_sve.h.patch +02fcd8ac408be56d2a6e67e2e09b26532862f233 + +0296-aarch64-Add-Armv8.6-SVE-bfloat16-support.patch +896dff99e18d67afdbe4d1effec20a3da474b22b + +0297-aarch64-ACLE-intrinsics-bfmmla-and-bfmlal-b-t.patch +f78335df69993a900512f92324cab6a20b1bde0c + +0298-aarch64-Add-an-extra-sbfiz-pattern-PR87763.patch +b65a1eb3fae53f2e1ea1ef8c1164f490d55855a1 + +0299-x86-64-Pass-aggregates-with-only-float-double-in-GPR.patch +ea5ca698dca15dc86b823661ac357a30b49dd0f6 + +0300-aarch64-ACLE-I8MM-multiply-accumulate-intrinsics.patch +40f648378061c170cf6a9ab680af01b3a3a83569 + +0301-i386-Skip-ENDBR32-at-the-target-function-entry.patch +1d69147af203d4dcd2270429f90c93f1a37ddfff + +0302-testsuite-Fix-recently-added-ipa-testcases-PR93763.patch +103bc4db7665a03bf2390ccc8ceca0dc5a7a81b7 + +0303-aarch64-Add-bfloat16-vdup-and-vreinterpret-ACLE-intr.patch +8ea6c1b89a20ef7c675535ba1994355361dac977 + +0304-aarch64-Add-bfloat16-vldn-vstn-intrinsics.patch +e603cd43b145c426468c95cf85b3c12c94daedaa + +0305-aarch64-ACLE-intrinsics-for-BFCVTN-BFCVTN2-and-BFCVT.patch +1f520d3412962e22b0338461d82f41abba8a4f12 + +0306-testsuite-Fix-misquoted-string-in-bfcvt-nosimd.c.patch +db3fa3476e9e922ca3e283df03ebd14be7220b6e + +0307-aarch64-Fix-bf16_v-ld-st-n.c-failures-for-big-endian.patch +cf9c3bff39cf973c5c8621ff44199dcb831193a7 + +0308-testsuite-Fix-gcc.target-aarch64-advsimd-intrinsics-.patch +58a703f0726b3bb6c5ac8b600369106985906590 + +0309-cleanup-graphite-results.patch +1acde74cf611f560172c74324610c29ca81edf94 + +diff --git a/gcc/Makefile.in b/gcc/Makefile.in +index bc188bbed..46ba89598 100644 +--- a/gcc/Makefile.in ++++ b/gcc/Makefile.in +@@ -1239,7 +1239,6 @@ OBJS = \ + auto-profile.o \ + bb-reorder.o \ + bitmap.o \ +- bt-load.o \ + builtins.o \ + caller-save.o \ + calls.o \ +@@ -1305,6 +1304,7 @@ OBJS = \ + fold-const.o \ + fold-const-call.o \ + function.o \ ++ function-abi.o \ + function-tests.o \ + fwprop.o \ + gcc-rich-location.o \ +@@ -2522,6 +2522,7 @@ GTFILES = $(CPPLIB_H) $(srcdir)/input.h $(srcdir)/coretypes.h \ + $(srcdir)/libfuncs.h $(SYMTAB_H) \ + $(srcdir)/real.h $(srcdir)/function.h $(srcdir)/insn-addr.h 
$(srcdir)/hwint.h \ + $(srcdir)/fixed-value.h \ ++ $(srcdir)/function-abi.h \ + $(srcdir)/output.h $(srcdir)/cfgloop.h $(srcdir)/cfg.h $(srcdir)/profile-count.h \ + $(srcdir)/cselib.h $(srcdir)/basic-block.h $(srcdir)/ipa-ref.h $(srcdir)/cgraph.h \ + $(srcdir)/reload.h $(srcdir)/caller-save.c $(srcdir)/symtab.c \ +diff --git a/gcc/alias.c b/gcc/alias.c +index 053c3494e..1a60f905a 100644 +--- a/gcc/alias.c ++++ b/gcc/alias.c +@@ -1572,16 +1572,6 @@ record_set (rtx dest, const_rtx set, void *data ATTRIBUTE_UNUSED) + new_reg_base_value[regno] = 0; + return; + } +- /* A CLOBBER_HIGH only wipes out the old value if the mode of the old +- value is greater than that of the clobber. */ +- else if (GET_CODE (set) == CLOBBER_HIGH) +- { +- if (new_reg_base_value[regno] != 0 +- && reg_is_clobbered_by_clobber_high ( +- regno, GET_MODE (new_reg_base_value[regno]), XEXP (set, 0))) +- new_reg_base_value[regno] = 0; +- return; +- } + + src = SET_SRC (set); + } +@@ -3284,7 +3274,8 @@ memory_modified_in_insn_p (const_rtx mem, const_rtx insn) + if (CALL_P (insn)) + return true; + memory_modified = false; +- note_stores (PATTERN (insn), memory_modified_1, CONST_CAST_RTX(mem)); ++ note_stores (as_a (insn), memory_modified_1, ++ CONST_CAST_RTX(mem)); + return memory_modified; + } + +@@ -3412,7 +3403,7 @@ init_alias_analysis (void) + && find_reg_note (insn, REG_NOALIAS, NULL_RTX)) + record_set (SET_DEST (PATTERN (insn)), NULL_RTX, NULL); + else +- note_stores (PATTERN (insn), record_set, NULL); ++ note_stores (insn, record_set, NULL); + + set = single_set (insn); + +diff --git a/gcc/array-traits.h b/gcc/array-traits.h +new file mode 100644 +index 000000000..eb65ede94 +--- /dev/null ++++ b/gcc/array-traits.h +@@ -0,0 +1,48 @@ ++/* Descriptions of array-like objects. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ ++This file is part of GCC. ++ ++GCC is free software; you can redistribute it and/or modify it under ++the terms of the GNU General Public License as published by the Free ++Software Foundation; either version 3, or (at your option) any later ++version. ++ ++GCC is distributed in the hope that it will be useful, but WITHOUT ANY ++WARRANTY; without even the implied warranty of MERCHANTABILITY or ++FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received a copy of the GNU General Public License ++along with GCC; see the file COPYING3. If not see ++. */ ++ ++#ifndef GCC_ARRAY_TRAITS_H ++#define GCC_ARRAY_TRAITS_H ++ ++/* Implementation for single integers (and similar types). */ ++template ++struct scalar_array_traits ++{ ++ typedef T element_type; ++ static const bool has_constant_size = true; ++ static const size_t constant_size = 1; ++ static const T *base (const T &x) { return &x; } ++ static size_t size (const T &) { return 1; } ++}; ++ ++template ++struct array_traits : scalar_array_traits {}; ++ ++/* Implementation for arrays with a static size. 
*/ ++template ++struct array_traits ++{ ++ typedef T element_type; ++ static const bool has_constant_size = true; ++ static const size_t constant_size = N; ++ static const T *base (const T (&x)[N]) { return x; } ++ static size_t size (const T (&x)[N]) { return N; } ++}; ++ ++#endif +diff --git a/gcc/attribs.c b/gcc/attribs.c +index 444192254..d447ea0e4 100644 +--- a/gcc/attribs.c ++++ b/gcc/attribs.c +@@ -691,6 +691,7 @@ decl_attributes (tree *node, tree attributes, int flags, + + if (!built_in + || !DECL_P (*anode) ++ || DECL_BUILT_IN_CLASS (*anode) != BUILT_IN_NORMAL + || (DECL_FUNCTION_CODE (*anode) != BUILT_IN_UNREACHABLE + && (DECL_FUNCTION_CODE (*anode) + != BUILT_IN_UBSAN_HANDLE_BUILTIN_UNREACHABLE))) +diff --git a/gcc/bitmap.c b/gcc/bitmap.c +index 5a8236de7..911d506f3 100644 +--- a/gcc/bitmap.c ++++ b/gcc/bitmap.c +@@ -958,17 +958,17 @@ bitmap_set_bit (bitmap head, int bit) + /* Return whether a bit is set within a bitmap. */ + + int +-bitmap_bit_p (bitmap head, int bit) ++bitmap_bit_p (const_bitmap head, int bit) + { + unsigned int indx = bit / BITMAP_ELEMENT_ALL_BITS; +- bitmap_element *ptr; ++ const bitmap_element *ptr; + unsigned bit_num; + unsigned word_num; + + if (!head->tree_form) +- ptr = bitmap_list_find_element (head, indx); ++ ptr = bitmap_list_find_element (const_cast (head), indx); + else +- ptr = bitmap_tree_find_element (head, indx); ++ ptr = bitmap_tree_find_element (const_cast (head), indx); + if (ptr == 0) + return 0; + +diff --git a/gcc/bitmap.h b/gcc/bitmap.h +index ed25c1ee5..7217f9e0a 100644 +--- a/gcc/bitmap.h ++++ b/gcc/bitmap.h +@@ -210,6 +210,7 @@ along with GCC; see the file COPYING3. If not see + on which many random-access membership tests will happen. */ + + #include "obstack.h" ++#include "array-traits.h" + + /* Bitmap memory usage. */ + struct bitmap_usage: public mem_usage +@@ -418,7 +419,7 @@ extern bool bitmap_clear_bit (bitmap, int); + extern bool bitmap_set_bit (bitmap, int); + + /* Return true if a bit is set in a bitmap. */ +-extern int bitmap_bit_p (bitmap, int); ++extern int bitmap_bit_p (const_bitmap, int); + + /* Debug functions to print a bitmap. */ + extern void debug_bitmap (const_bitmap); +@@ -937,4 +938,123 @@ class auto_bitmap + bitmap_head m_bits; + }; + ++/* Base class for bitmap_view; see there for details. */ ++template > ++class base_bitmap_view ++{ ++public: ++ typedef typename Traits::element_type array_element_type; ++ ++ base_bitmap_view (const T &, bitmap_element *); ++ operator const_bitmap () const { return &m_head; } ++ ++private: ++ base_bitmap_view (const base_bitmap_view &); ++ ++ bitmap_head m_head; ++}; ++ ++/* Provides a read-only bitmap view of a single integer bitmask or a ++ constant-sized array of integer bitmasks, or of a wrapper around such ++ bitmasks. */ ++template ++class bitmap_view : public base_bitmap_view ++{ ++public: ++ bitmap_view (const T &array) ++ : base_bitmap_view (array, m_bitmap_elements) {} ++ ++private: ++ /* How many bitmap_elements we need to hold a full T. */ ++ static const size_t num_bitmap_elements ++ = CEIL (CHAR_BIT ++ * sizeof (typename Traits::element_type) ++ * Traits::constant_size, ++ BITMAP_ELEMENT_ALL_BITS); ++ bitmap_element m_bitmap_elements[num_bitmap_elements]; ++}; ++ ++/* Initialize the view for array ARRAY, using the array of bitmap ++ elements in BITMAP_ELEMENTS (which is known to contain enough ++ entries). 
*/ ++template ++base_bitmap_view::base_bitmap_view (const T &array, ++ bitmap_element *bitmap_elements) ++{ ++ m_head.obstack = NULL; ++ ++ /* The code currently assumes that each element of ARRAY corresponds ++ to exactly one bitmap_element. */ ++ const size_t array_element_bits = CHAR_BIT * sizeof (array_element_type); ++ STATIC_ASSERT (BITMAP_ELEMENT_ALL_BITS % array_element_bits == 0); ++ size_t array_step = BITMAP_ELEMENT_ALL_BITS / array_element_bits; ++ size_t array_size = Traits::size (array); ++ ++ /* Process each potential bitmap_element in turn. The loop is written ++ this way rather than per array element because usually there are ++ only a small number of array elements per bitmap element (typically ++ two or four). The inner loops should therefore unroll completely. */ ++ const array_element_type *array_elements = Traits::base (array); ++ unsigned int indx = 0; ++ for (size_t array_base = 0; ++ array_base < array_size; ++ array_base += array_step, indx += 1) ++ { ++ /* How many array elements are in this particular bitmap_element. */ ++ unsigned int array_count ++ = (STATIC_CONSTANT_P (array_size % array_step == 0) ++ ? array_step : MIN (array_step, array_size - array_base)); ++ ++ /* See whether we need this bitmap element. */ ++ array_element_type ior = array_elements[array_base]; ++ for (size_t i = 1; i < array_count; ++i) ++ ior |= array_elements[array_base + i]; ++ if (ior == 0) ++ continue; ++ ++ /* Grab the next bitmap element and chain it. */ ++ bitmap_element *bitmap_element = bitmap_elements++; ++ if (m_head.current) ++ m_head.current->next = bitmap_element; ++ else ++ m_head.first = bitmap_element; ++ bitmap_element->prev = m_head.current; ++ bitmap_element->next = NULL; ++ bitmap_element->indx = indx; ++ m_head.current = bitmap_element; ++ m_head.indx = indx; ++ ++ /* Fill in the bits of the bitmap element. */ ++ if (array_element_bits < BITMAP_WORD_BITS) ++ { ++ /* Multiple array elements fit in one element of ++ bitmap_element->bits. */ ++ size_t array_i = array_base; ++ for (unsigned int word_i = 0; word_i < BITMAP_ELEMENT_WORDS; ++ ++word_i) ++ { ++ BITMAP_WORD word = 0; ++ for (unsigned int shift = 0; ++ shift < BITMAP_WORD_BITS && array_i < array_size; ++ shift += array_element_bits) ++ word |= array_elements[array_i++] << shift; ++ bitmap_element->bits[word_i] = word; ++ } ++ } ++ else ++ { ++ /* Array elements are the same size as elements of ++ bitmap_element->bits, or are an exact multiple of that size. */ ++ unsigned int word_i = 0; ++ for (unsigned int i = 0; i < array_count; ++i) ++ for (unsigned int shift = 0; shift < array_element_bits; ++ shift += BITMAP_WORD_BITS) ++ bitmap_element->bits[word_i++] ++ = array_elements[array_base + i] >> shift; ++ while (word_i < BITMAP_ELEMENT_WORDS) ++ bitmap_element->bits[word_i++] = 0; ++ } ++ } ++} ++ + #endif /* GCC_BITMAP_H */ +diff --git a/gcc/bt-load.c b/gcc/bt-load.c +deleted file mode 100644 +index f68879ca4..000000000 +--- a/gcc/bt-load.c ++++ /dev/null +@@ -1,1577 +0,0 @@ +-/* Perform branch target register load optimizations. +- Copyright (C) 2001-2019 Free Software Foundation, Inc. +- +-This file is part of GCC. +- +-GCC is free software; you can redistribute it and/or modify it under +-the terms of the GNU General Public License as published by the Free +-Software Foundation; either version 3, or (at your option) any later +-version. 
+- +-GCC is distributed in the hope that it will be useful, but WITHOUT ANY +-WARRANTY; without even the implied warranty of MERCHANTABILITY or +-FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +-for more details. +- +-You should have received a copy of the GNU General Public License +-along with GCC; see the file COPYING3. If not see +-. */ +- +-#include "config.h" +-#include "system.h" +-#include "coretypes.h" +-#include "backend.h" +-#include "target.h" +-#include "rtl.h" +-#include "tree.h" +-#include "df.h" +-#include "insn-config.h" +-#include "regs.h" +-#include "memmodel.h" +-#include "emit-rtl.h" +-#include "recog.h" +-#include "diagnostic-core.h" +-#include "expr.h" +-#include "insn-attr.h" +-#include "tree-pass.h" +-#include "cfgrtl.h" +-#include "cfganal.h" +-#include "cfgcleanup.h" +-#include "cfgloop.h" +-#include "rtl-iter.h" +-#include "fibonacci_heap.h" +- +-struct btr_def; +- +-/* Target register optimizations - these are performed after reload. */ +- +-struct btr_def_group +-{ +- btr_def_group *next; +- rtx src; +- btr_def *members; +-}; +- +-struct btr_user +-{ +- btr_user *next; +- basic_block bb; +- int luid; +- rtx_insn *insn; +- /* If INSN has a single use of a single branch register, then +- USE points to it within INSN. If there is more than +- one branch register use, or the use is in some way ambiguous, +- then USE is NULL. */ +- rtx use; +- int n_reaching_defs; +- int first_reaching_def; +- char other_use_this_block; +-}; +- +-/* btr_def structs appear on three lists: +- 1. A list of all btr_def structures (head is +- ALL_BTR_DEFS, linked by the NEXT field). +- 2. A list of branch reg definitions per basic block (head is +- BB_BTR_DEFS[i], linked by the NEXT_THIS_BB field). +- 3. A list of all branch reg definitions belonging to the same +- group (head is in a BTR_DEF_GROUP struct, linked by +- NEXT_THIS_GROUP field). */ +- +-struct btr_def +-{ +- btr_def *next_this_bb; +- btr_def *next_this_group; +- basic_block bb; +- int luid; +- rtx_insn *insn; +- int btr; +- int cost; +- /* For a branch register setting insn that has a constant +- source (i.e. a label), group links together all the +- insns with the same source. For other branch register +- setting insns, group is NULL. */ +- btr_def_group *group; +- btr_user *uses; +- /* If this def has a reaching use which is not a simple use +- in a branch instruction, then has_ambiguous_use will be true, +- and we will not attempt to migrate this definition. */ +- char has_ambiguous_use; +- /* live_range is an approximation to the true live range for this +- def/use web, because it records the set of blocks that contain +- the live range. There could be other live ranges for the same +- branch register in that set of blocks, either in the block +- containing the def (before the def), or in a block containing +- a use (after the use). If there are such other live ranges, then +- other_btr_uses_before_def or other_btr_uses_after_use must be set true +- as appropriate. */ +- char other_btr_uses_before_def; +- char other_btr_uses_after_use; +- /* We set own_end when we have moved a definition into a dominator. +- Thus, when a later combination removes this definition again, we know +- to clear out trs_live_at_end again. 
*/ +- char own_end; +- bitmap live_range; +-}; +- +-typedef fibonacci_heap btr_heap_t; +-typedef fibonacci_node btr_heap_node_t; +- +-static int issue_rate; +- +-static int basic_block_freq (const_basic_block); +-static int insn_sets_btr_p (const rtx_insn *, int, int *); +-static void find_btr_def_group (btr_def_group **, btr_def *); +-static btr_def *add_btr_def (btr_heap_t *, basic_block, int, rtx_insn *, +- unsigned int, int, btr_def_group **); +-static btr_user *new_btr_user (basic_block, int, rtx_insn *); +-static void dump_hard_reg_set (HARD_REG_SET); +-static void dump_btrs_live (int); +-static void note_other_use_this_block (unsigned int, btr_user *); +-static void compute_defs_uses_and_gen (btr_heap_t *, btr_def **, btr_user **, +- sbitmap *, sbitmap *, HARD_REG_SET *); +-static void compute_kill (sbitmap *, sbitmap *, HARD_REG_SET *); +-static void compute_out (sbitmap *bb_out, sbitmap *, sbitmap *, int); +-static void link_btr_uses (btr_def **, btr_user **, sbitmap *, sbitmap *, int); +-static void build_btr_def_use_webs (btr_heap_t *); +-static int block_at_edge_of_live_range_p (int, btr_def *); +-static void clear_btr_from_live_range (btr_def *def); +-static void add_btr_to_live_range (btr_def *, int); +-static void augment_live_range (bitmap, HARD_REG_SET *, basic_block, +- basic_block, int); +-static int choose_btr (HARD_REG_SET); +-static void combine_btr_defs (btr_def *, HARD_REG_SET *); +-static void btr_def_live_range (btr_def *, HARD_REG_SET *); +-static void move_btr_def (basic_block, int, btr_def *, bitmap, HARD_REG_SET *); +-static int migrate_btr_def (btr_def *, int); +-static void migrate_btr_defs (enum reg_class, int); +-static int can_move_up (const_basic_block, const rtx_insn *, int); +-static void note_btr_set (rtx, const_rtx, void *); +- +-/* The following code performs code motion of target load instructions +- (instructions that set branch target registers), to move them +- forward away from the branch instructions and out of loops (or, +- more generally, from a more frequently executed place to a less +- frequently executed place). +- Moving target load instructions further in front of the branch +- instruction that uses the target register value means that the hardware +- has a better chance of preloading the instructions at the branch +- target by the time the branch is reached. This avoids bubbles +- when a taken branch needs to flush out the pipeline. +- Moving target load instructions out of loops means they are executed +- less frequently. */ +- +-/* An obstack to hold the def-use web data structures built up for +- migrating branch target load instructions. */ +-static struct obstack migrate_btrl_obstack; +- +-/* Array indexed by basic block number, giving the set of registers +- live in that block. */ +-static HARD_REG_SET *btrs_live; +- +-/* Array indexed by basic block number, giving the set of registers live at +- the end of that block, including any uses by a final jump insn, if any. */ +-static HARD_REG_SET *btrs_live_at_end; +- +-/* Set of all target registers that we are willing to allocate. */ +-static HARD_REG_SET all_btrs; +- +-/* Provide lower and upper bounds for target register numbers, so that +- we don't need to search through all the hard registers all the time. */ +-static int first_btr, last_btr; +- +- +- +-/* Return an estimate of the frequency of execution of block bb. 
*/ +-static int +-basic_block_freq (const_basic_block bb) +-{ +- return bb->count.to_frequency (cfun); +-} +- +-/* If the rtx at *XP references (sets or reads) any branch target +- register, return one such register. If EXCLUDEP is set, disregard +- any references within that location. */ +-static rtx * +-find_btr_use (rtx *xp, rtx *excludep = 0) +-{ +- subrtx_ptr_iterator::array_type array; +- FOR_EACH_SUBRTX_PTR (iter, array, xp, NONCONST) +- { +- rtx *loc = *iter; +- if (loc == excludep) +- iter.skip_subrtxes (); +- else +- { +- const_rtx x = *loc; +- if (REG_P (x) +- && overlaps_hard_reg_set_p (all_btrs, GET_MODE (x), REGNO (x))) +- return loc; +- } +- } +- return 0; +-} +- +-/* Return true if insn is an instruction that sets a target register. +- if CHECK_CONST is true, only return true if the source is constant. +- If such a set is found and REGNO is nonzero, assign the register number +- of the destination register to *REGNO. */ +-static int +-insn_sets_btr_p (const rtx_insn *insn, int check_const, int *regno) +-{ +- rtx set; +- +- if (NONJUMP_INSN_P (insn) +- && (set = single_set (insn))) +- { +- rtx dest = SET_DEST (set); +- rtx src = SET_SRC (set); +- +- if (GET_CODE (dest) == SUBREG) +- dest = XEXP (dest, 0); +- +- if (REG_P (dest) +- && TEST_HARD_REG_BIT (all_btrs, REGNO (dest))) +- { +- gcc_assert (!find_btr_use (&src)); +- +- if (!check_const || CONSTANT_P (src)) +- { +- if (regno) +- *regno = REGNO (dest); +- return 1; +- } +- } +- } +- return 0; +-} +- +-/* Find the group that the target register definition DEF belongs +- to in the list starting with *ALL_BTR_DEF_GROUPS. If no such +- group exists, create one. Add def to the group. */ +-static void +-find_btr_def_group (btr_def_group **all_btr_def_groups, btr_def *def) +-{ +- if (insn_sets_btr_p (def->insn, 1, NULL)) +- { +- btr_def_group *this_group; +- rtx def_src = SET_SRC (single_set (def->insn)); +- +- /* ?? This linear search is an efficiency concern, particularly +- as the search will almost always fail to find a match. */ +- for (this_group = *all_btr_def_groups; +- this_group != NULL; +- this_group = this_group->next) +- if (rtx_equal_p (def_src, this_group->src)) +- break; +- +- if (!this_group) +- { +- this_group = XOBNEW (&migrate_btrl_obstack, btr_def_group); +- this_group->src = def_src; +- this_group->members = NULL; +- this_group->next = *all_btr_def_groups; +- *all_btr_def_groups = this_group; +- } +- def->group = this_group; +- def->next_this_group = this_group->members; +- this_group->members = def; +- } +- else +- def->group = NULL; +-} +- +-/* Create a new target register definition structure, for a definition in +- block BB, instruction INSN, and insert it into ALL_BTR_DEFS. Return +- the new definition. 
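As a side note on the pass being removed here: find_btr_use above walks every sub-expression of a pattern and, when EXCLUDEP is given, skips the subtree rooted there; new_btr_user further down relies on that to decide whether an insn references exactly one branch register (find the first use, then search again with that use excluded). The stand-alone sketch below shows the same walk-with-exclusion idea on a plain expression tree; Expr, find_reg_use and single_reg_use_p are illustrative stand-ins, not GCC's rtl API.

    #include <cstdio>
    #include <vector>

    /* Minimal expression node: a register leaf or an operator with
       sub-expressions.  Purely illustrative, not GCC rtl.  */
    struct Expr
    {
      bool is_reg;
      int regno;
      std::vector<Expr *> ops;
    };

    /* Return some sub-expression of X that references REGNO, ignoring
       everything underneath EXCLUDE (the role EXCLUDEP plays above).
       Return null if there is no such reference.  */
    static Expr *
    find_reg_use (Expr *x, int regno, Expr *exclude = nullptr)
    {
      if (x == exclude)
        return nullptr;                 /* Skip the excluded subtree.  */
      if (x->is_reg)
        return x->regno == regno ? x : nullptr;
      for (Expr *op : x->ops)
        if (Expr *hit = find_reg_use (op, regno, exclude))
          return hit;
      return nullptr;
    }

    /* "Exactly one use" falls out the same way new_btr_user checks it:
       find the first use, then search again excluding it.  */
    static bool
    single_reg_use_p (Expr *pattern, int regno)
    {
      Expr *first = find_reg_use (pattern, regno);
      return first && !find_reg_use (pattern, regno, first);
    }

    int
    main ()
    {
      Expr r5a = { true, 5, {} }, r5b = { true, 5, {} }, r7 = { true, 7, {} };
      Expr plus = { false, -1, { &r5a, &r7 } };
      Expr top  = { false, -1, { &plus, &r5b } };
      std::printf ("%d %d\n",
                   single_reg_use_p (&plus, 5),   /* 1: exactly one use  */
                   single_reg_use_p (&top, 5));   /* 0: two uses of reg 5  */
      return 0;
    }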
*/ +-static btr_def * +-add_btr_def (btr_heap_t *all_btr_defs, basic_block bb, int insn_luid, +- rtx_insn *insn, +- unsigned int dest_reg, int other_btr_uses_before_def, +- btr_def_group **all_btr_def_groups) +-{ +- btr_def *this_def = XOBNEW (&migrate_btrl_obstack, btr_def); +- this_def->bb = bb; +- this_def->luid = insn_luid; +- this_def->insn = insn; +- this_def->btr = dest_reg; +- this_def->cost = basic_block_freq (bb); +- this_def->has_ambiguous_use = 0; +- this_def->other_btr_uses_before_def = other_btr_uses_before_def; +- this_def->other_btr_uses_after_use = 0; +- this_def->next_this_bb = NULL; +- this_def->next_this_group = NULL; +- this_def->uses = NULL; +- this_def->live_range = NULL; +- find_btr_def_group (all_btr_def_groups, this_def); +- +- all_btr_defs->insert (-this_def->cost, this_def); +- +- if (dump_file) +- fprintf (dump_file, +- "Found target reg definition: sets %u { bb %d, insn %d }%s priority %d\n", +- dest_reg, bb->index, INSN_UID (insn), +- (this_def->group ? "" : ":not const"), this_def->cost); +- +- return this_def; +-} +- +-/* Create a new target register user structure, for a use in block BB, +- instruction INSN. Return the new user. */ +-static btr_user * +-new_btr_user (basic_block bb, int insn_luid, rtx_insn *insn) +-{ +- /* This instruction reads target registers. We need +- to decide whether we can replace all target register +- uses easily. +- */ +- rtx *usep = find_btr_use (&PATTERN (insn)); +- rtx use; +- btr_user *user = NULL; +- +- if (usep) +- { +- int unambiguous_single_use; +- +- /* We want to ensure that USE is the only use of a target +- register in INSN, so that we know that to rewrite INSN to use +- a different target register, all we have to do is replace USE. */ +- unambiguous_single_use = !find_btr_use (&PATTERN (insn), usep); +- if (!unambiguous_single_use) +- usep = NULL; +- } +- use = usep ? *usep : NULL_RTX; +- user = XOBNEW (&migrate_btrl_obstack, btr_user); +- user->bb = bb; +- user->luid = insn_luid; +- user->insn = insn; +- user->use = use; +- user->other_use_this_block = 0; +- user->next = NULL; +- user->n_reaching_defs = 0; +- user->first_reaching_def = -1; +- +- if (dump_file) +- { +- fprintf (dump_file, "Uses target reg: { bb %d, insn %d }", +- bb->index, INSN_UID (insn)); +- +- if (user->use) +- fprintf (dump_file, ": unambiguous use of reg %d\n", +- REGNO (user->use)); +- } +- +- return user; +-} +- +-/* Write the contents of S to the dump file. */ +-static void +-dump_hard_reg_set (HARD_REG_SET s) +-{ +- int reg; +- for (reg = 0; reg < FIRST_PSEUDO_REGISTER; reg++) +- if (TEST_HARD_REG_BIT (s, reg)) +- fprintf (dump_file, " %d", reg); +-} +- +-/* Write the set of target regs live in block BB to the dump file. */ +-static void +-dump_btrs_live (int bb) +-{ +- fprintf (dump_file, "BB%d live:", bb); +- dump_hard_reg_set (btrs_live[bb]); +- fprintf (dump_file, "\n"); +-} +- +-/* REGNO is the number of a branch target register that is being used or +- set. USERS_THIS_BB is a list of preceding branch target register users; +- If any of them use the same register, set their other_use_this_block +- flag. 
*/ +-static void +-note_other_use_this_block (unsigned int regno, btr_user *users_this_bb) +-{ +- btr_user *user; +- +- for (user = users_this_bb; user != NULL; user = user->next) +- if (user->use && REGNO (user->use) == regno) +- user->other_use_this_block = 1; +-} +- +-struct defs_uses_info { +- btr_user *users_this_bb; +- HARD_REG_SET btrs_written_in_block; +- HARD_REG_SET btrs_live_in_block; +- sbitmap bb_gen; +- sbitmap *btr_defset; +-}; +- +-/* Called via note_stores or directly to register stores into / +- clobbers of a branch target register DEST that are not recognized as +- straightforward definitions. DATA points to information about the +- current basic block that needs updating. */ +-static void +-note_btr_set (rtx dest, const_rtx set ATTRIBUTE_UNUSED, void *data) +-{ +- defs_uses_info *info = (defs_uses_info *) data; +- int regno, end_regno; +- +- if (!REG_P (dest)) +- return; +- regno = REGNO (dest); +- end_regno = END_REGNO (dest); +- for (; regno < end_regno; regno++) +- if (TEST_HARD_REG_BIT (all_btrs, regno)) +- { +- note_other_use_this_block (regno, info->users_this_bb); +- SET_HARD_REG_BIT (info->btrs_written_in_block, regno); +- SET_HARD_REG_BIT (info->btrs_live_in_block, regno); +- bitmap_and_compl (info->bb_gen, info->bb_gen, +- info->btr_defset[regno - first_btr]); +- } +-} +- +-static void +-compute_defs_uses_and_gen (btr_heap_t *all_btr_defs, btr_def **def_array, +- btr_user **use_array, sbitmap *btr_defset, +- sbitmap *bb_gen, HARD_REG_SET *btrs_written) +-{ +- /* Scan the code building up the set of all defs and all uses. +- For each target register, build the set of defs of that register. +- For each block, calculate the set of target registers +- written in that block. +- Also calculate the set of btrs ever live in that block. +- */ +- int i; +- int insn_luid = 0; +- btr_def_group *all_btr_def_groups = NULL; +- defs_uses_info info; +- +- bitmap_vector_clear (bb_gen, last_basic_block_for_fn (cfun)); +- for (i = NUM_FIXED_BLOCKS; i < last_basic_block_for_fn (cfun); i++) +- { +- basic_block bb = BASIC_BLOCK_FOR_FN (cfun, i); +- int reg; +- btr_def *defs_this_bb = NULL; +- rtx_insn *insn; +- rtx_insn *last; +- int can_throw = 0; +- +- info.users_this_bb = NULL; +- info.bb_gen = bb_gen[i]; +- info.btr_defset = btr_defset; +- +- CLEAR_HARD_REG_SET (info.btrs_live_in_block); +- CLEAR_HARD_REG_SET (info.btrs_written_in_block); +- for (reg = first_btr; reg <= last_btr; reg++) +- if (TEST_HARD_REG_BIT (all_btrs, reg) +- && REGNO_REG_SET_P (df_get_live_in (bb), reg)) +- SET_HARD_REG_BIT (info.btrs_live_in_block, reg); +- +- for (insn = BB_HEAD (bb), last = NEXT_INSN (BB_END (bb)); +- insn != last; +- insn = NEXT_INSN (insn), insn_luid++) +- { +- if (INSN_P (insn)) +- { +- int regno; +- int insn_uid = INSN_UID (insn); +- +- if (insn_sets_btr_p (insn, 0, ®no)) +- { +- btr_def *def = add_btr_def ( +- all_btr_defs, bb, insn_luid, insn, regno, +- TEST_HARD_REG_BIT (info.btrs_live_in_block, regno), +- &all_btr_def_groups); +- +- def_array[insn_uid] = def; +- SET_HARD_REG_BIT (info.btrs_written_in_block, regno); +- SET_HARD_REG_BIT (info.btrs_live_in_block, regno); +- bitmap_and_compl (bb_gen[i], bb_gen[i], +- btr_defset[regno - first_btr]); +- bitmap_set_bit (bb_gen[i], insn_uid); +- def->next_this_bb = defs_this_bb; +- defs_this_bb = def; +- bitmap_set_bit (btr_defset[regno - first_btr], insn_uid); +- note_other_use_this_block (regno, info.users_this_bb); +- } +- /* Check for the blockage emitted by expand_nl_goto_receiver. 
*/ +- else if (cfun->has_nonlocal_label +- && GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE) +- { +- btr_user *user; +- +- /* Do the equivalent of calling note_other_use_this_block +- for every target register. */ +- for (user = info.users_this_bb; user != NULL; +- user = user->next) +- if (user->use) +- user->other_use_this_block = 1; +- IOR_HARD_REG_SET (info.btrs_written_in_block, all_btrs); +- IOR_HARD_REG_SET (info.btrs_live_in_block, all_btrs); +- bitmap_clear (info.bb_gen); +- } +- else +- { +- if (find_btr_use (&PATTERN (insn))) +- { +- btr_user *user = new_btr_user (bb, insn_luid, insn); +- +- use_array[insn_uid] = user; +- if (user->use) +- SET_HARD_REG_BIT (info.btrs_live_in_block, +- REGNO (user->use)); +- else +- { +- int reg; +- for (reg = first_btr; reg <= last_btr; reg++) +- if (TEST_HARD_REG_BIT (all_btrs, reg) +- && refers_to_regno_p (reg, user->insn)) +- { +- note_other_use_this_block (reg, +- info.users_this_bb); +- SET_HARD_REG_BIT (info.btrs_live_in_block, reg); +- } +- note_stores (PATTERN (insn), note_btr_set, &info); +- } +- user->next = info.users_this_bb; +- info.users_this_bb = user; +- } +- if (CALL_P (insn)) +- { +- HARD_REG_SET *clobbered = &call_used_reg_set; +- HARD_REG_SET call_saved; +- rtx pat = PATTERN (insn); +- int i; +- +- /* Check for sibcall. */ +- if (GET_CODE (pat) == PARALLEL) +- for (i = XVECLEN (pat, 0) - 1; i >= 0; i--) +- if (ANY_RETURN_P (XVECEXP (pat, 0, i))) +- { +- COMPL_HARD_REG_SET (call_saved, +- call_used_reg_set); +- clobbered = &call_saved; +- } +- +- for (regno = first_btr; regno <= last_btr; regno++) +- if (TEST_HARD_REG_BIT (*clobbered, regno)) +- note_btr_set (regno_reg_rtx[regno], NULL_RTX, &info); +- } +- } +- } +- } +- +- COPY_HARD_REG_SET (btrs_live[i], info.btrs_live_in_block); +- COPY_HARD_REG_SET (btrs_written[i], info.btrs_written_in_block); +- +- REG_SET_TO_HARD_REG_SET (btrs_live_at_end[i], df_get_live_out (bb)); +- /* If this block ends in a jump insn, add any uses or even clobbers +- of branch target registers that it might have. */ +- for (insn = BB_END (bb); insn != BB_HEAD (bb) && ! INSN_P (insn); ) +- insn = PREV_INSN (insn); +- /* ??? for the fall-through edge, it would make sense to insert the +- btr set on the edge, but that would require to split the block +- early on so that we can distinguish between dominance from the fall +- through edge - which can use the call-clobbered registers - from +- dominance by the throw edge. */ +- if (can_throw_internal (insn)) +- { +- HARD_REG_SET tmp; +- +- COPY_HARD_REG_SET (tmp, call_used_reg_set); +- AND_HARD_REG_SET (tmp, all_btrs); +- IOR_HARD_REG_SET (btrs_live_at_end[i], tmp); +- can_throw = 1; +- } +- if (can_throw || JUMP_P (insn)) +- { +- int regno; +- +- for (regno = first_btr; regno <= last_btr; regno++) +- if (refers_to_regno_p (regno, insn)) +- SET_HARD_REG_BIT (btrs_live_at_end[i], regno); +- } +- +- if (dump_file) +- dump_btrs_live (i); +- } +-} +- +-static void +-compute_kill (sbitmap *bb_kill, sbitmap *btr_defset, +- HARD_REG_SET *btrs_written) +-{ +- int i; +- int regno; +- +- /* For each basic block, form the set BB_KILL - the set +- of definitions that the block kills. 
*/ +- bitmap_vector_clear (bb_kill, last_basic_block_for_fn (cfun)); +- for (i = NUM_FIXED_BLOCKS; i < last_basic_block_for_fn (cfun); i++) +- { +- for (regno = first_btr; regno <= last_btr; regno++) +- if (TEST_HARD_REG_BIT (all_btrs, regno) +- && TEST_HARD_REG_BIT (btrs_written[i], regno)) +- bitmap_ior (bb_kill[i], bb_kill[i], +- btr_defset[regno - first_btr]); +- } +-} +- +-static void +-compute_out (sbitmap *bb_out, sbitmap *bb_gen, sbitmap *bb_kill, int max_uid) +-{ +- /* Perform iterative dataflow: +- Initially, for all blocks, BB_OUT = BB_GEN. +- For each block, +- BB_IN = union over predecessors of BB_OUT(pred) +- BB_OUT = (BB_IN - BB_KILL) + BB_GEN +- Iterate until the bb_out sets stop growing. */ +- int i; +- int changed; +- auto_sbitmap bb_in (max_uid); +- +- for (i = NUM_FIXED_BLOCKS; i < last_basic_block_for_fn (cfun); i++) +- bitmap_copy (bb_out[i], bb_gen[i]); +- +- changed = 1; +- while (changed) +- { +- changed = 0; +- for (i = NUM_FIXED_BLOCKS; i < last_basic_block_for_fn (cfun); i++) +- { +- bitmap_union_of_preds (bb_in, bb_out, BASIC_BLOCK_FOR_FN (cfun, i)); +- changed |= bitmap_ior_and_compl (bb_out[i], bb_gen[i], +- bb_in, bb_kill[i]); +- } +- } +-} +- +-static void +-link_btr_uses (btr_def **def_array, btr_user **use_array, sbitmap *bb_out, +- sbitmap *btr_defset, int max_uid) +-{ +- int i; +- auto_sbitmap reaching_defs (max_uid); +- +- /* Link uses to the uses lists of all of their reaching defs. +- Count up the number of reaching defs of each use. */ +- for (i = NUM_FIXED_BLOCKS; i < last_basic_block_for_fn (cfun); i++) +- { +- basic_block bb = BASIC_BLOCK_FOR_FN (cfun, i); +- rtx_insn *insn; +- rtx_insn *last; +- +- bitmap_union_of_preds (reaching_defs, bb_out, BASIC_BLOCK_FOR_FN (cfun, i)); +- for (insn = BB_HEAD (bb), last = NEXT_INSN (BB_END (bb)); +- insn != last; +- insn = NEXT_INSN (insn)) +- { +- if (INSN_P (insn)) +- { +- int insn_uid = INSN_UID (insn); +- +- btr_def *def = def_array[insn_uid]; +- btr_user *user = use_array[insn_uid]; +- if (def != NULL) +- { +- /* Remove all reaching defs of regno except +- for this one. */ +- bitmap_and_compl (reaching_defs, reaching_defs, +- btr_defset[def->btr - first_btr]); +- bitmap_set_bit (reaching_defs, insn_uid); +- } +- +- if (user != NULL) +- { +- /* Find all the reaching defs for this use. */ +- auto_sbitmap reaching_defs_of_reg (max_uid); +- unsigned int uid = 0; +- sbitmap_iterator sbi; +- +- if (user->use) +- bitmap_and ( +- reaching_defs_of_reg, +- reaching_defs, +- btr_defset[REGNO (user->use) - first_btr]); +- else +- { +- int reg; +- +- bitmap_clear (reaching_defs_of_reg); +- for (reg = first_btr; reg <= last_btr; reg++) +- if (TEST_HARD_REG_BIT (all_btrs, reg) +- && refers_to_regno_p (reg, user->insn)) +- bitmap_or_and (reaching_defs_of_reg, +- reaching_defs_of_reg, +- reaching_defs, +- btr_defset[reg - first_btr]); +- } +- EXECUTE_IF_SET_IN_BITMAP (reaching_defs_of_reg, 0, uid, sbi) +- { +- btr_def *def = def_array[uid]; +- +- /* We now know that def reaches user. */ +- +- if (dump_file) +- fprintf (dump_file, +- "Def in insn %d reaches use in insn %d\n", +- uid, insn_uid); +- +- user->n_reaching_defs++; +- if (!user->use) +- def->has_ambiguous_use = 1; +- if (user->first_reaching_def != -1) +- { /* There is more than one reaching def. This is +- a rare case, so just give up on this def/use +- web when it occurs. 
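compute_out above is the textbook iterative reaching-definitions solver: OUT starts as GEN, IN is the union of the predecessors' OUT sets, and OUT = GEN | (IN & ~KILL) is recomputed until nothing changes (GEN having been built earlier, where each new definition of a register clears that register's previous definitions from the block's GEN set). The sketch below runs that fixpoint on a toy diamond CFG, with 64-bit masks standing in for the sbitmaps; Block and solve are illustrative names only.

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct Block
    {
      std::vector<int> preds;   /* indices of predecessor blocks            */
      std::uint64_t gen = 0;    /* definitions generated in this block      */
      std::uint64_t kill = 0;   /* definitions killed by this block         */
      std::uint64_t out = 0;    /* solution: definitions reaching the end   */
    };

    /* Iterate OUT = GEN | (IN & ~KILL), IN = union of predecessor OUTs,
       until no OUT set changes -- the same loop as compute_out above.  */
    static void
    solve (std::vector<Block> &bbs)
    {
      for (Block &b : bbs)
        b.out = b.gen;

      bool changed = true;
      while (changed)
        {
          changed = false;
          for (Block &b : bbs)
            {
              std::uint64_t in = 0;
              for (int p : b.preds)
                in |= bbs[p].out;
              std::uint64_t out = b.gen | (in & ~b.kill);
              if (out != b.out)
                {
                  b.out = out;
                  changed = true;
                }
            }
        }
    }

    int
    main ()
    {
      /* Diamond CFG: 0 -> 1, 0 -> 2, 1 -> 3, 2 -> 3.  */
      std::vector<Block> bbs (4);
      bbs[1].preds = { 0 };
      bbs[2].preds = { 0 };
      bbs[3].preds = { 1, 2 };
      bbs[0].gen = 0x1;                     /* def bit 0 in block 0          */
      bbs[2].gen = 0x2; bbs[2].kill = 0x1;  /* block 2 redefines, kills it   */
      solve (bbs);
      /* Prints 0x3: bit 0 reaches block 3 via block 1, bit 1 via block 2.  */
      std::printf ("out(3) = %#llx\n", (unsigned long long) bbs[3].out);
      return 0;
    }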
*/ +- def->has_ambiguous_use = 1; +- def_array[user->first_reaching_def] +- ->has_ambiguous_use = 1; +- if (dump_file) +- fprintf (dump_file, +- "(use %d has multiple reaching defs)\n", +- insn_uid); +- } +- else +- user->first_reaching_def = uid; +- if (user->other_use_this_block) +- def->other_btr_uses_after_use = 1; +- user->next = def->uses; +- def->uses = user; +- } +- } +- +- if (CALL_P (insn)) +- { +- int regno; +- +- for (regno = first_btr; regno <= last_btr; regno++) +- if (TEST_HARD_REG_BIT (all_btrs, regno) +- && TEST_HARD_REG_BIT (call_used_reg_set, regno)) +- bitmap_and_compl (reaching_defs, reaching_defs, +- btr_defset[regno - first_btr]); +- } +- } +- } +- } +-} +- +-static void +-build_btr_def_use_webs (btr_heap_t *all_btr_defs) +-{ +- const int max_uid = get_max_uid (); +- btr_def **def_array = XCNEWVEC (btr_def *, max_uid); +- btr_user **use_array = XCNEWVEC (btr_user *, max_uid); +- sbitmap *btr_defset = sbitmap_vector_alloc ( +- (last_btr - first_btr) + 1, max_uid); +- sbitmap *bb_gen = sbitmap_vector_alloc (last_basic_block_for_fn (cfun), +- max_uid); +- HARD_REG_SET *btrs_written = XCNEWVEC (HARD_REG_SET, +- last_basic_block_for_fn (cfun)); +- sbitmap *bb_kill; +- sbitmap *bb_out; +- +- bitmap_vector_clear (btr_defset, (last_btr - first_btr) + 1); +- +- compute_defs_uses_and_gen (all_btr_defs, def_array, use_array, btr_defset, +- bb_gen, btrs_written); +- +- bb_kill = sbitmap_vector_alloc (last_basic_block_for_fn (cfun), max_uid); +- compute_kill (bb_kill, btr_defset, btrs_written); +- free (btrs_written); +- +- bb_out = sbitmap_vector_alloc (last_basic_block_for_fn (cfun), max_uid); +- compute_out (bb_out, bb_gen, bb_kill, max_uid); +- +- sbitmap_vector_free (bb_gen); +- sbitmap_vector_free (bb_kill); +- +- link_btr_uses (def_array, use_array, bb_out, btr_defset, max_uid); +- +- sbitmap_vector_free (bb_out); +- sbitmap_vector_free (btr_defset); +- free (use_array); +- free (def_array); +-} +- +-/* Return true if basic block BB contains the start or end of the +- live range of the definition DEF, AND there are other live +- ranges of the same target register that include BB. */ +-static int +-block_at_edge_of_live_range_p (int bb, btr_def *def) +-{ +- if (def->other_btr_uses_before_def +- && BASIC_BLOCK_FOR_FN (cfun, bb) == def->bb) +- return 1; +- else if (def->other_btr_uses_after_use) +- { +- btr_user *user; +- for (user = def->uses; user != NULL; user = user->next) +- if (BASIC_BLOCK_FOR_FN (cfun, bb) == user->bb) +- return 1; +- } +- return 0; +-} +- +-/* We are removing the def/use web DEF. The target register +- used in this web is therefore no longer live in the live range +- of this web, so remove it from the live set of all basic blocks +- in the live range of the web. +- Blocks at the boundary of the live range may contain other live +- ranges for the same target register, so we have to be careful +- to remove the target register from the live set of these blocks +- only if they do not contain other live ranges for the same register. 
*/ +-static void +-clear_btr_from_live_range (btr_def *def) +-{ +- unsigned bb; +- bitmap_iterator bi; +- +- EXECUTE_IF_SET_IN_BITMAP (def->live_range, 0, bb, bi) +- { +- if ((!def->other_btr_uses_before_def +- && !def->other_btr_uses_after_use) +- || !block_at_edge_of_live_range_p (bb, def)) +- { +- CLEAR_HARD_REG_BIT (btrs_live[bb], def->btr); +- CLEAR_HARD_REG_BIT (btrs_live_at_end[bb], def->btr); +- if (dump_file) +- dump_btrs_live (bb); +- } +- } +- if (def->own_end) +- CLEAR_HARD_REG_BIT (btrs_live_at_end[def->bb->index], def->btr); +-} +- +- +-/* We are adding the def/use web DEF. Add the target register used +- in this web to the live set of all of the basic blocks that contain +- the live range of the web. +- If OWN_END is set, also show that the register is live from our +- definitions at the end of the basic block where it is defined. */ +-static void +-add_btr_to_live_range (btr_def *def, int own_end) +-{ +- unsigned bb; +- bitmap_iterator bi; +- +- EXECUTE_IF_SET_IN_BITMAP (def->live_range, 0, bb, bi) +- { +- SET_HARD_REG_BIT (btrs_live[bb], def->btr); +- SET_HARD_REG_BIT (btrs_live_at_end[bb], def->btr); +- if (dump_file) +- dump_btrs_live (bb); +- } +- if (own_end) +- { +- SET_HARD_REG_BIT (btrs_live_at_end[def->bb->index], def->btr); +- def->own_end = 1; +- } +-} +- +-/* Update a live range to contain the basic block NEW_BLOCK, and all +- blocks on paths between the existing live range and NEW_BLOCK. +- HEAD is a block contained in the existing live range that dominates +- all other blocks in the existing live range. +- Also add to the set BTRS_LIVE_IN_RANGE all target registers that +- are live in the blocks that we add to the live range. +- If FULL_RANGE is set, include the full live range of NEW_BB; +- otherwise, if NEW_BB dominates HEAD_BB, only add registers that +- are life at the end of NEW_BB for NEW_BB itself. +- It is a precondition that either NEW_BLOCK dominates HEAD,or +- HEAD dom NEW_BLOCK. This is used to speed up the +- implementation of this function. */ +-static void +-augment_live_range (bitmap live_range, HARD_REG_SET *btrs_live_in_range, +- basic_block head_bb, basic_block new_bb, int full_range) +-{ +- basic_block *worklist, *tos; +- +- tos = worklist = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun) + 1); +- +- if (dominated_by_p (CDI_DOMINATORS, new_bb, head_bb)) +- { +- if (new_bb == head_bb) +- { +- if (full_range) +- IOR_HARD_REG_SET (*btrs_live_in_range, btrs_live[new_bb->index]); +- free (tos); +- return; +- } +- *tos++ = new_bb; +- } +- else +- { +- edge e; +- edge_iterator ei; +- int new_block = new_bb->index; +- +- gcc_assert (dominated_by_p (CDI_DOMINATORS, head_bb, new_bb)); +- +- IOR_HARD_REG_SET (*btrs_live_in_range, btrs_live[head_bb->index]); +- bitmap_set_bit (live_range, new_block); +- /* A previous btr migration could have caused a register to be +- live just at the end of new_block which we need in full, so +- use trs_live_at_end even if full_range is set. 
*/ +- IOR_HARD_REG_SET (*btrs_live_in_range, btrs_live_at_end[new_block]); +- if (full_range) +- IOR_HARD_REG_SET (*btrs_live_in_range, btrs_live[new_block]); +- if (dump_file) +- { +- fprintf (dump_file, +- "Adding end of block %d and rest of %d to live range\n", +- new_block, head_bb->index); +- fprintf (dump_file,"Now live btrs are "); +- dump_hard_reg_set (*btrs_live_in_range); +- fprintf (dump_file, "\n"); +- } +- FOR_EACH_EDGE (e, ei, head_bb->preds) +- *tos++ = e->src; +- } +- +- while (tos != worklist) +- { +- basic_block bb = *--tos; +- if (!bitmap_bit_p (live_range, bb->index)) +- { +- edge e; +- edge_iterator ei; +- +- bitmap_set_bit (live_range, bb->index); +- IOR_HARD_REG_SET (*btrs_live_in_range, +- btrs_live[bb->index]); +- /* A previous btr migration could have caused a register to be +- live just at the end of a block which we need in full. */ +- IOR_HARD_REG_SET (*btrs_live_in_range, +- btrs_live_at_end[bb->index]); +- if (dump_file) +- { +- fprintf (dump_file, +- "Adding block %d to live range\n", bb->index); +- fprintf (dump_file,"Now live btrs are "); +- dump_hard_reg_set (*btrs_live_in_range); +- fprintf (dump_file, "\n"); +- } +- +- FOR_EACH_EDGE (e, ei, bb->preds) +- { +- basic_block pred = e->src; +- if (!bitmap_bit_p (live_range, pred->index)) +- *tos++ = pred; +- } +- } +- } +- +- free (worklist); +-} +- +-/* Return the most desirable target register that is not in +- the set USED_BTRS. */ +-static int +-choose_btr (HARD_REG_SET used_btrs) +-{ +- int i; +- +- if (!hard_reg_set_subset_p (all_btrs, used_btrs)) +- for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) +- { +-#ifdef REG_ALLOC_ORDER +- int regno = reg_alloc_order[i]; +-#else +- int regno = i; +-#endif +- if (TEST_HARD_REG_BIT (all_btrs, regno) +- && !TEST_HARD_REG_BIT (used_btrs, regno)) +- return regno; +- } +- return -1; +-} +- +-/* Calculate the set of basic blocks that contain the live range of +- the def/use web DEF. +- Also calculate the set of target registers that are live at time +- in this live range, but ignore the live range represented by DEF +- when calculating this set. */ +-static void +-btr_def_live_range (btr_def *def, HARD_REG_SET *btrs_live_in_range) +-{ +- if (!def->live_range) +- { +- btr_user *user; +- +- def->live_range = BITMAP_ALLOC (NULL); +- +- bitmap_set_bit (def->live_range, def->bb->index); +- COPY_HARD_REG_SET (*btrs_live_in_range, +- (flag_btr_bb_exclusive +- ? btrs_live : btrs_live_at_end)[def->bb->index]); +- +- for (user = def->uses; user != NULL; user = user->next) +- augment_live_range (def->live_range, btrs_live_in_range, +- def->bb, user->bb, +- (flag_btr_bb_exclusive +- || user->insn != BB_END (def->bb) +- || !JUMP_P (user->insn))); +- } +- else +- { +- /* def->live_range is accurate, but we need to recompute +- the set of target registers live over it, because migration +- of other PT instructions may have affected it. +- */ +- unsigned bb; +- unsigned def_bb = flag_btr_bb_exclusive ? -1 : def->bb->index; +- bitmap_iterator bi; +- +- CLEAR_HARD_REG_SET (*btrs_live_in_range); +- EXECUTE_IF_SET_IN_BITMAP (def->live_range, 0, bb, bi) +- { +- IOR_HARD_REG_SET (*btrs_live_in_range, +- (def_bb == bb +- ? btrs_live_at_end : btrs_live) [bb]); +- } +- } +- if (!def->other_btr_uses_before_def && +- !def->other_btr_uses_after_use) +- CLEAR_HARD_REG_BIT (*btrs_live_in_range, def->btr); +-} +- +-/* Merge into the def/use web DEF any other def/use webs in the same +- group that are dominated by DEF, provided that there is a target +- register available to allocate to the merged web. 
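augment_live_range above grows a web's live range by walking predecessor edges from a use block back toward the dominating HEAD block, OR-ing in the target registers live in every block it visits, and choose_btr then returns the first allocatable register not in that accumulated set (honouring REG_ALLOC_ORDER when the target defines one). A reduced sketch of both steps, using adjacency lists and a 64-bit register mask; it assumes, as the real code does, that head dominates the start block, and Node, live_over_range and choose_free_reg are illustrative names.

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct Node
    {
      std::vector<int> preds;
      std::uint64_t live_regs = 0;   /* registers live in this block  */
    };

    /* Walk predecessors from START up to (and including) HEAD, OR-ing
       together the registers live anywhere on the way -- a scalar
       version of the worklist loop in augment_live_range.  */
    static std::uint64_t
    live_over_range (const std::vector<Node> &cfg, int start, int head)
    {
      std::uint64_t live = 0;
      std::vector<bool> in_range (cfg.size (), false);
      std::vector<int> worklist = { start };
      while (!worklist.empty ())
        {
          int b = worklist.back ();
          worklist.pop_back ();
          if (in_range[b])
            continue;
          in_range[b] = true;
          live |= cfg[b].live_regs;
          if (b == head)             /* head dominates start: stop here  */
            continue;
          for (int p : cfg[b].preds)
            worklist.push_back (p);
        }
      return live;
    }

    /* choose_btr in miniature: first allocatable register not yet used.  */
    static int
    choose_free_reg (std::uint64_t allocatable, std::uint64_t used)
    {
      for (int r = 0; r < 64; r++)
        if (((allocatable >> r) & 1) && !((used >> r) & 1))
          return r;
      return -1;
    }

    int
    main ()
    {
      /* 0 -> 1 -> 3 and 0 -> 2 -> 3; block 0 is the dominating head.  */
      std::vector<Node> cfg (4);
      cfg[1].preds = { 0 };
      cfg[2].preds = { 0 };
      cfg[3].preds = { 1, 2 };
      cfg[0].live_regs = 0x1;
      cfg[2].live_regs = 0x4;
      std::uint64_t used = live_over_range (cfg, 3, 0);      /* 0x5   */
      std::printf ("free reg = %d\n", choose_free_reg (0x7, used));  /* 1 */
      return 0;
    }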
*/ +-static void +-combine_btr_defs (btr_def *def, HARD_REG_SET *btrs_live_in_range) +-{ +- btr_def *other_def; +- +- for (other_def = def->group->members; +- other_def != NULL; +- other_def = other_def->next_this_group) +- { +- if (other_def != def +- && other_def->uses != NULL +- && ! other_def->has_ambiguous_use +- && dominated_by_p (CDI_DOMINATORS, other_def->bb, def->bb)) +- { +- /* def->bb dominates the other def, so def and other_def could +- be combined. */ +- /* Merge their live ranges, and get the set of +- target registers live over the merged range. */ +- int btr; +- HARD_REG_SET combined_btrs_live; +- auto_bitmap combined_live_range; +- btr_user *user; +- +- if (other_def->live_range == NULL) +- { +- HARD_REG_SET dummy_btrs_live_in_range; +- btr_def_live_range (other_def, &dummy_btrs_live_in_range); +- } +- COPY_HARD_REG_SET (combined_btrs_live, *btrs_live_in_range); +- bitmap_copy (combined_live_range, def->live_range); +- +- for (user = other_def->uses; user != NULL; user = user->next) +- augment_live_range (combined_live_range, &combined_btrs_live, +- def->bb, user->bb, +- (flag_btr_bb_exclusive +- || user->insn != BB_END (def->bb) +- || !JUMP_P (user->insn))); +- +- btr = choose_btr (combined_btrs_live); +- if (btr != -1) +- { +- /* We can combine them. */ +- if (dump_file) +- fprintf (dump_file, +- "Combining def in insn %d with def in insn %d\n", +- INSN_UID (other_def->insn), INSN_UID (def->insn)); +- +- def->btr = btr; +- user = other_def->uses; +- while (user != NULL) +- { +- btr_user *next = user->next; +- +- user->next = def->uses; +- def->uses = user; +- user = next; +- } +- /* Combining def/use webs can make target registers live +- after uses where they previously were not. This means +- some REG_DEAD notes may no longer be correct. We could +- be more precise about this if we looked at the combined +- live range, but here I just delete any REG_DEAD notes +- in case they are no longer correct. */ +- for (user = def->uses; user != NULL; user = user->next) +- remove_note (user->insn, +- find_regno_note (user->insn, REG_DEAD, +- REGNO (user->use))); +- clear_btr_from_live_range (other_def); +- other_def->uses = NULL; +- bitmap_copy (def->live_range, combined_live_range); +- if (other_def->btr == btr && other_def->other_btr_uses_after_use) +- def->other_btr_uses_after_use = 1; +- COPY_HARD_REG_SET (*btrs_live_in_range, combined_btrs_live); +- +- /* Delete the old target register initialization. */ +- delete_insn (other_def->insn); +- +- } +- } +- } +-} +- +-/* Move the definition DEF from its current position to basic +- block NEW_DEF_BB, and modify it to use branch target register BTR. +- Delete the old defining insn, and insert a new one in NEW_DEF_BB. +- Update all reaching uses of DEF in the RTL to use BTR. +- If this new position means that other defs in the +- same group can be combined with DEF then combine them. */ +-static void +-move_btr_def (basic_block new_def_bb, int btr, btr_def *def, bitmap live_range, +- HARD_REG_SET *btrs_live_in_range) +-{ +- /* We can move the instruction. +- Set a target register in block NEW_DEF_BB to the value +- needed for this target register definition. +- Replace all uses of the old target register definition by +- uses of the new definition. Delete the old definition. 
*/ +- basic_block b = new_def_bb; +- rtx_insn *insp = BB_HEAD (b); +- rtx_insn *old_insn = def->insn; +- rtx src; +- rtx btr_rtx; +- rtx_insn *new_insn; +- machine_mode btr_mode; +- btr_user *user; +- rtx set; +- +- if (dump_file) +- fprintf(dump_file, "migrating to basic block %d, using reg %d\n", +- new_def_bb->index, btr); +- +- clear_btr_from_live_range (def); +- def->btr = btr; +- def->bb = new_def_bb; +- def->luid = 0; +- def->cost = basic_block_freq (new_def_bb); +- bitmap_copy (def->live_range, live_range); +- combine_btr_defs (def, btrs_live_in_range); +- btr = def->btr; +- def->other_btr_uses_before_def +- = TEST_HARD_REG_BIT (btrs_live[b->index], btr) ? 1 : 0; +- add_btr_to_live_range (def, 1); +- if (LABEL_P (insp)) +- insp = NEXT_INSN (insp); +- /* N.B.: insp is expected to be NOTE_INSN_BASIC_BLOCK now. Some +- optimizations can result in insp being both first and last insn of +- its basic block. */ +- /* ?? some assertions to check that insp is sensible? */ +- +- if (def->other_btr_uses_before_def) +- { +- for (insp = BB_END (b); ! INSN_P (insp); insp = PREV_INSN (insp)) +- gcc_assert (insp != BB_HEAD (b)); +- +- if (JUMP_P (insp) || can_throw_internal (insp)) +- insp = PREV_INSN (insp); +- } +- +- set = single_set (old_insn); +- src = SET_SRC (set); +- btr_mode = GET_MODE (SET_DEST (set)); +- btr_rtx = gen_rtx_REG (btr_mode, btr); +- +- new_insn = gen_move_insn (btr_rtx, src); +- +- /* Insert target register initialization at head of basic block. */ +- def->insn = emit_insn_after (new_insn, insp); +- +- df_set_regs_ever_live (btr, true); +- +- if (dump_file) +- fprintf (dump_file, "New pt is insn %d, inserted after insn %d\n", +- INSN_UID (def->insn), INSN_UID (insp)); +- +- /* Delete the old target register initialization. */ +- delete_insn (old_insn); +- +- /* Replace each use of the old target register by a use of the new target +- register. */ +- for (user = def->uses; user != NULL; user = user->next) +- { +- /* Some extra work here to ensure consistent modes, because +- it seems that a target register REG rtx can be given a different +- mode depending on the context (surely that should not be +- the case?). */ +- rtx replacement_rtx; +- if (GET_MODE (user->use) == GET_MODE (btr_rtx) +- || GET_MODE (user->use) == VOIDmode) +- replacement_rtx = btr_rtx; +- else +- replacement_rtx = gen_rtx_REG (GET_MODE (user->use), btr); +- validate_replace_rtx (user->use, replacement_rtx, user->insn); +- user->use = replacement_rtx; +- } +-} +- +-/* We anticipate intra-block scheduling to be done. See if INSN could move +- up within BB by N_INSNS. */ +-static int +-can_move_up (const_basic_block bb, const rtx_insn *insn, int n_insns) +-{ +- while (insn != BB_HEAD (bb) && n_insns > 0) +- { +- insn = PREV_INSN (insn); +- /* ??? What if we have an anti-dependency that actually prevents the +- scheduler from doing the move? We'd like to re-allocate the register, +- but not necessarily put the load into another basic block. */ +- if (INSN_P (insn)) +- n_insns--; +- } +- return n_insns <= 0; +-} +- +-/* Attempt to migrate the target register definition DEF to an +- earlier point in the flowgraph. +- +- It is a precondition of this function that DEF is migratable: +- i.e. it has a constant source, and all uses are unambiguous. +- +- Only migrations that reduce the cost of DEF will be made. +- MIN_COST is the lower bound on the cost of the DEF after migration. +- If we migrate DEF so that its cost falls below MIN_COST, +- then we do not attempt to migrate further. 
The idea is that +- we migrate definitions in a priority order based on their cost, +- when the cost of this definition falls below MIN_COST, then +- there is another definition with cost == MIN_COST which now +- has a higher priority than this definition. +- +- Return nonzero if there may be benefit from attempting to +- migrate this DEF further (i.e. we have reduced the cost below +- MIN_COST, but we may be able to reduce it further). +- Return zero if no further migration is possible. */ +-static int +-migrate_btr_def (btr_def *def, int min_cost) +-{ +- HARD_REG_SET btrs_live_in_range; +- int btr_used_near_def = 0; +- int def_basic_block_freq; +- basic_block attempt; +- int give_up = 0; +- int def_moved = 0; +- btr_user *user; +- int def_latency; +- +- if (dump_file) +- fprintf (dump_file, +- "Attempting to migrate pt from insn %d (cost = %d, min_cost = %d) ... ", +- INSN_UID (def->insn), def->cost, min_cost); +- +- if (!def->group || def->has_ambiguous_use) +- /* These defs are not migratable. */ +- { +- if (dump_file) +- fprintf (dump_file, "it's not migratable\n"); +- return 0; +- } +- +- if (!def->uses) +- /* We have combined this def with another in the same group, so +- no need to consider it further. +- */ +- { +- if (dump_file) +- fprintf (dump_file, "it's already combined with another pt\n"); +- return 0; +- } +- +- btr_def_live_range (def, &btrs_live_in_range); +- auto_bitmap live_range; +- bitmap_copy (live_range, def->live_range); +- +-#ifdef INSN_SCHEDULING +- def_latency = insn_default_latency (def->insn) * issue_rate; +-#else +- def_latency = issue_rate; +-#endif +- +- for (user = def->uses; user != NULL; user = user->next) +- { +- if (user->bb == def->bb +- && user->luid > def->luid +- && (def->luid + def_latency) > user->luid +- && ! can_move_up (def->bb, def->insn, +- (def->luid + def_latency) - user->luid)) +- { +- btr_used_near_def = 1; +- break; +- } +- } +- +- def_basic_block_freq = basic_block_freq (def->bb); +- +- for (attempt = get_immediate_dominator (CDI_DOMINATORS, def->bb); +- !give_up && attempt && attempt != ENTRY_BLOCK_PTR_FOR_FN (cfun) +- && def->cost >= min_cost; +- attempt = get_immediate_dominator (CDI_DOMINATORS, attempt)) +- { +- /* Try to move the instruction that sets the target register into +- basic block ATTEMPT. */ +- int try_freq = basic_block_freq (attempt); +- edge_iterator ei; +- edge e; +- +- /* If ATTEMPT has abnormal edges, skip it. 
*/ +- FOR_EACH_EDGE (e, ei, attempt->succs) +- if (e->flags & EDGE_COMPLEX) +- break; +- if (e) +- continue; +- +- if (dump_file) +- fprintf (dump_file, "trying block %d ...", attempt->index); +- +- if (try_freq < def_basic_block_freq +- || (try_freq == def_basic_block_freq && btr_used_near_def)) +- { +- int btr; +- augment_live_range (live_range, &btrs_live_in_range, def->bb, attempt, +- flag_btr_bb_exclusive); +- if (dump_file) +- { +- fprintf (dump_file, "Now btrs live in range are: "); +- dump_hard_reg_set (btrs_live_in_range); +- fprintf (dump_file, "\n"); +- } +- btr = choose_btr (btrs_live_in_range); +- if (btr != -1) +- { +- move_btr_def (attempt, btr, def, live_range, &btrs_live_in_range); +- bitmap_copy (live_range, def->live_range); +- btr_used_near_def = 0; +- def_moved = 1; +- def_basic_block_freq = basic_block_freq (def->bb); +- } +- else +- { +- /* There are no free target registers available to move +- this far forward, so give up */ +- give_up = 1; +- if (dump_file) +- fprintf (dump_file, +- "giving up because there are no free target registers\n"); +- } +- +- } +- } +- if (!def_moved) +- { +- give_up = 1; +- if (dump_file) +- fprintf (dump_file, "failed to move\n"); +- } +- +- return !give_up; +-} +- +-/* Attempt to move instructions that set target registers earlier +- in the flowgraph, away from their corresponding uses. */ +-static void +-migrate_btr_defs (enum reg_class btr_class, int allow_callee_save) +-{ +- btr_heap_t all_btr_defs (LONG_MIN); +- int reg; +- +- gcc_obstack_init (&migrate_btrl_obstack); +- if (dump_file) +- { +- int i; +- +- for (i = NUM_FIXED_BLOCKS; i < last_basic_block_for_fn (cfun); i++) +- { +- basic_block bb = BASIC_BLOCK_FOR_FN (cfun, i); +- fprintf (dump_file, "Basic block %d: count = ", i); +- bb->count.dump (dump_file); +- fprintf (dump_file, " loop-depth = %d idom = %d\n", +- bb_loop_depth (bb), +- get_immediate_dominator (CDI_DOMINATORS, bb)->index); +- } +- } +- +- CLEAR_HARD_REG_SET (all_btrs); +- for (first_btr = -1, reg = 0; reg < FIRST_PSEUDO_REGISTER; reg++) +- if (TEST_HARD_REG_BIT (reg_class_contents[(int) btr_class], reg) +- && (allow_callee_save || call_used_regs[reg] +- || df_regs_ever_live_p (reg))) +- { +- SET_HARD_REG_BIT (all_btrs, reg); +- last_btr = reg; +- if (first_btr < 0) +- first_btr = reg; +- } +- +- btrs_live = XCNEWVEC (HARD_REG_SET, last_basic_block_for_fn (cfun)); +- btrs_live_at_end = XCNEWVEC (HARD_REG_SET, last_basic_block_for_fn (cfun)); +- +- build_btr_def_use_webs (&all_btr_defs); +- +- while (!all_btr_defs.empty ()) +- { +- int min_cost = -all_btr_defs.min_key (); +- btr_def *def = all_btr_defs.extract_min (); +- if (migrate_btr_def (def, min_cost)) +- { +- all_btr_defs.insert (-def->cost, def); +- if (dump_file) +- { +- fprintf (dump_file, +- "Putting insn %d back on queue with priority %d\n", +- INSN_UID (def->insn), def->cost); +- } +- } +- else +- BITMAP_FREE (def->live_range); +- } +- +- free (btrs_live); +- free (btrs_live_at_end); +- obstack_free (&migrate_btrl_obstack, NULL); +-} +- +-static void +-branch_target_load_optimize (bool after_prologue_epilogue_gen) +-{ +- enum reg_class klass +- = (enum reg_class) targetm.branch_target_register_class (); +- if (klass != NO_REGS) +- { +- /* Initialize issue_rate. */ +- if (targetm.sched.issue_rate) +- issue_rate = targetm.sched.issue_rate (); +- else +- issue_rate = 1; +- +- if (!after_prologue_epilogue_gen) +- { +- /* Build the CFG for migrate_btr_defs. */ +-#if 1 +- /* This may or may not be needed, depending on where we +- run this phase. 
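migrate_btr_defs above drains the heap best-first (definitions are inserted with key -cost, so the min-heap hands back the most frequently executed one first) and, whenever migrate_btr_def reports that a definition's cost dropped below the priority it was extracted at, pushes it back to be retried after the now-higher-priority definitions. Below is a stripped-down sketch of that requeue idiom with std::priority_queue; Work, try_migrate and drain are illustrative stand-ins, not the GCC routines, and the halving of work here is only a model of "found a cheaper block".

    #include <functional>
    #include <queue>
    #include <utility>
    #include <vector>

    /* Illustrative work item: COST is its current priority, CHEAPER holds
       the successively cheaper placements a migration could reach.  */
    struct Work
    {
      int cost;
      std::vector<int> cheaper;   /* next candidate is cheaper.back ()  */
    };

    /* Stand-in for migrate_btr_def: move to the next cheaper placement if
       one exists; return true when the new cost fell below the priority
       we were extracted at, so a later attempt might improve it more.  */
    static bool
    try_migrate (Work &w, int min_cost)
    {
      if (w.cheaper.empty ())
        return false;
      w.cost = w.cheaper.back ();
      w.cheaper.pop_back ();
      return w.cost < min_cost;
    }

    static void
    drain (std::vector<Work> &items)
    {
      /* Min-heap keyed on -cost, mirroring the all_btr_defs heap above.  */
      typedef std::pair<long, Work *> entry;
      std::priority_queue<entry, std::vector<entry>,
                          std::greater<entry> > heap;
      for (Work &w : items)
        heap.push (entry (-(long) w.cost, &w));

      while (!heap.empty ())
        {
          int min_cost = (int) -heap.top ().first;  /* the item's own cost */
          Work *w = heap.top ().second;
          heap.pop ();
          if (try_migrate (*w, min_cost))
            heap.push (entry (-(long) w->cost, w)); /* requeue, new priority */
        }
    }

    int
    main ()
    {
      std::vector<Work> items (2);
      items[0].cost = 100;
      items[0].cheaper = { 10, 40 };   /* will be tried as 40, then 10  */
      items[1].cost = 50;
      drain (items);
      return items[0].cost - 10;       /* 0: item 0 ended up at cost 10 */
    }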
*/ +- cleanup_cfg (optimize ? CLEANUP_EXPENSIVE : 0); +-#endif +- } +- df_analyze (); +- +- +- /* Dominator info is also needed for migrate_btr_def. */ +- calculate_dominance_info (CDI_DOMINATORS); +- migrate_btr_defs (klass, +- (targetm.branch_target_register_callee_saved +- (after_prologue_epilogue_gen))); +- +- free_dominance_info (CDI_DOMINATORS); +- } +-} +- +-namespace { +- +-const pass_data pass_data_branch_target_load_optimize1 = +-{ +- RTL_PASS, /* type */ +- "btl1", /* name */ +- OPTGROUP_NONE, /* optinfo_flags */ +- TV_NONE, /* tv_id */ +- 0, /* properties_required */ +- 0, /* properties_provided */ +- 0, /* properties_destroyed */ +- 0, /* todo_flags_start */ +- 0, /* todo_flags_finish */ +-}; +- +-class pass_branch_target_load_optimize1 : public rtl_opt_pass +-{ +-public: +- pass_branch_target_load_optimize1 (gcc::context *ctxt) +- : rtl_opt_pass (pass_data_branch_target_load_optimize1, ctxt) +- {} +- +- /* opt_pass methods: */ +- virtual bool gate (function *) { return flag_branch_target_load_optimize; } +- virtual unsigned int execute (function *) +- { +- branch_target_load_optimize (epilogue_completed); +- return 0; +- } +- +-}; // class pass_branch_target_load_optimize1 +- +-} // anon namespace +- +-rtl_opt_pass * +-make_pass_branch_target_load_optimize1 (gcc::context *ctxt) +-{ +- return new pass_branch_target_load_optimize1 (ctxt); +-} +- +- +-namespace { +- +-const pass_data pass_data_branch_target_load_optimize2 = +-{ +- RTL_PASS, /* type */ +- "btl2", /* name */ +- OPTGROUP_NONE, /* optinfo_flags */ +- TV_NONE, /* tv_id */ +- 0, /* properties_required */ +- 0, /* properties_provided */ +- 0, /* properties_destroyed */ +- 0, /* todo_flags_start */ +- 0, /* todo_flags_finish */ +-}; +- +-class pass_branch_target_load_optimize2 : public rtl_opt_pass +-{ +-public: +- pass_branch_target_load_optimize2 (gcc::context *ctxt) +- : rtl_opt_pass (pass_data_branch_target_load_optimize2, ctxt) +- {} +- +- /* opt_pass methods: */ +- virtual bool gate (function *) +- { +- return (optimize > 0 && flag_branch_target_load_optimize2); +- } +- +- virtual unsigned int execute (function *); +- +-}; // class pass_branch_target_load_optimize2 +- +-unsigned int +-pass_branch_target_load_optimize2::execute (function *) +-{ +- static int warned = 0; +- +- /* Leave this a warning for now so that it is possible to experiment +- with running this pass twice. In 3.6, we should either make this +- an error, or use separate dump files. */ +- if (flag_branch_target_load_optimize +- && flag_branch_target_load_optimize2 +- && !warned) +- { +- warning (0, "branch target register load optimization is not intended " +- "to be run twice"); +- +- warned = 1; +- } +- +- branch_target_load_optimize (epilogue_completed); +- return 0; +-} +- +-} // anon namespace +- +-rtl_opt_pass * +-make_pass_branch_target_load_optimize2 (gcc::context *ctxt) +-{ +- return new pass_branch_target_load_optimize2 (ctxt); +-} +diff --git a/gcc/builtins.c b/gcc/builtins.c +index 910e614a4..945205c1d 100644 +--- a/gcc/builtins.c ++++ b/gcc/builtins.c +@@ -1431,7 +1431,7 @@ expand_builtin_prefetch (tree exp) + } + + /* Get a MEM rtx for expression EXP which is the address of an operand +- to be used in a string instruction (cmpstrsi, movmemsi, ..). LEN is ++ to be used in a string instruction (cmpstrsi, cpymemsi, ..). LEN is + the maximum length of the block of memory that might be accessed or + NULL if unknown. 
*/ + +@@ -7224,7 +7224,6 @@ expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode, + int ignore) + { + tree fndecl = get_callee_fndecl (exp); +- enum built_in_function fcode = DECL_FUNCTION_CODE (fndecl); + machine_mode target_mode = TYPE_MODE (TREE_TYPE (exp)); + int flags; + +@@ -7236,6 +7235,7 @@ expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode, + redundant checks and be sure, that possible overflow will be detected + by ASan. */ + ++ enum built_in_function fcode = DECL_FUNCTION_CODE (fndecl); + if ((flag_sanitize & SANITIZE_ADDRESS) && asan_intercepted_p (fcode)) + return expand_call (exp, target, ignore); + +diff --git a/gcc/c-family/c-common.c b/gcc/c-family/c-common.c +index d220e8135..bf3db074a 100644 +--- a/gcc/c-family/c-common.c ++++ b/gcc/c-family/c-common.c +@@ -5835,15 +5835,27 @@ builtin_function_validate_nargs (location_t loc, tree fndecl, int nargs, + /* Verifies the NARGS arguments ARGS to the builtin function FNDECL. + Returns false if there was an error, otherwise true. LOC is the + location of the function; ARG_LOC is a vector of locations of the +- arguments. */ ++ arguments. If FNDECL is the result of resolving an overloaded ++ target built-in, ORIG_FNDECL is the original function decl, ++ otherwise it is null. */ + + bool + check_builtin_function_arguments (location_t loc, vec arg_loc, +- tree fndecl, int nargs, tree *args) ++ tree fndecl, tree orig_fndecl, ++ int nargs, tree *args) + { +- if (!fndecl_built_in_p (fndecl, BUILT_IN_NORMAL)) ++ if (!fndecl_built_in_p (fndecl)) + return true; + ++ if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD) ++ return (!targetm.check_builtin_call ++ || targetm.check_builtin_call (loc, arg_loc, fndecl, ++ orig_fndecl, nargs, args)); ++ ++ if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_FRONTEND) ++ return true; ++ ++ gcc_assert (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL); + switch (DECL_FUNCTION_CODE (fndecl)) + { + case BUILT_IN_ALLOCA_WITH_ALIGN_AND_MAX: +@@ -7317,8 +7329,6 @@ tree + resolve_overloaded_builtin (location_t loc, tree function, + vec *params) + { +- enum built_in_function orig_code = DECL_FUNCTION_CODE (function); +- + /* Is function one of the _FETCH_OP_ or _OP_FETCH_ built-ins? + Those are not valid to call with a pointer to _Bool (or C++ bool) + and so must be rejected. */ +@@ -7340,6 +7350,7 @@ resolve_overloaded_builtin (location_t loc, tree function, + } + + /* Handle BUILT_IN_NORMAL here. 
*/ ++ enum built_in_function orig_code = DECL_FUNCTION_CODE (function); + switch (orig_code) + { + case BUILT_IN_SPECULATION_SAFE_VALUE_N: +diff --git a/gcc/c-family/c-common.h b/gcc/c-family/c-common.h +index 683764267..46b8d265a 100644 +--- a/gcc/c-family/c-common.h ++++ b/gcc/c-family/c-common.h +@@ -818,7 +818,7 @@ extern void check_function_arguments_recurse (void (*) + void *, tree, + unsigned HOST_WIDE_INT); + extern bool check_builtin_function_arguments (location_t, vec, +- tree, int, tree *); ++ tree, tree, int, tree *); + extern void check_function_format (const_tree, tree, int, tree *, + vec *); + extern bool attribute_fallthrough_p (tree); +@@ -995,7 +995,8 @@ extern bool c_switch_covers_all_cases_p (splay_tree, tree); + extern tree build_function_call (location_t, tree, tree); + + extern tree build_function_call_vec (location_t, vec, tree, +- vec *, vec *); ++ vec *, vec *, ++ tree = NULL_TREE); + + extern tree resolve_overloaded_builtin (location_t, tree, vec *); + +diff --git a/gcc/c-family/c-pretty-print.c b/gcc/c-family/c-pretty-print.c +index 3e25624d3..1e14658c0 100644 +--- a/gcc/c-family/c-pretty-print.c ++++ b/gcc/c-family/c-pretty-print.c +@@ -470,6 +470,16 @@ pp_c_specifier_qualifier_list (c_pretty_printer *pp, tree t) + ? "_Complex" : "__complex__")); + else if (code == VECTOR_TYPE) + { ++ /* The syntax we print for vector types isn't real C or C++ syntax, ++ so it's better to print the type name if we have one. */ ++ tree name = TYPE_NAME (t); ++ if (!(pp->flags & pp_c_flag_gnu_v3) ++ && name ++ && TREE_CODE (name) == TYPE_DECL) ++ { ++ pp->id_expression (name); ++ break; ++ } + pp_c_ws_string (pp, "__vector"); + pp_c_left_paren (pp); + pp_wide_integer (pp, TYPE_VECTOR_SUBPARTS (t)); +diff --git a/gcc/c/c-decl.c b/gcc/c/c-decl.c +index 859a62412..288dbe9d9 100644 +--- a/gcc/c/c-decl.c ++++ b/gcc/c/c-decl.c +@@ -604,7 +604,7 @@ static tree grokparms (struct c_arg_info *, bool); + static void layout_array_type (tree); + static void warn_defaults_to (location_t, int, const char *, ...) + ATTRIBUTE_GCC_DIAG(3,4); +-static const char *header_for_builtin_fn (enum built_in_function); ++static const char *header_for_builtin_fn (tree); + + /* T is a statement. Add it to the statement-tree. This is the + C/ObjC version--C++ has a slightly different version of this +@@ -1951,7 +1951,8 @@ diagnose_mismatched_decls (tree newdecl, tree olddecl, + if (!comptypes (oldtype, newtype)) + { + if (TREE_CODE (olddecl) == FUNCTION_DECL +- && fndecl_built_in_p (olddecl) && !C_DECL_DECLARED_BUILTIN (olddecl)) ++ && fndecl_built_in_p (olddecl, BUILT_IN_NORMAL) ++ && !C_DECL_DECLARED_BUILTIN (olddecl)) + { + /* Accept "harmless" mismatches in function types such + as missing qualifiers or pointer vs same size integer +@@ -1973,8 +1974,7 @@ diagnose_mismatched_decls (tree newdecl, tree olddecl, + /* If types don't match for a built-in, throw away the + built-in. No point in calling locate_old_decl here, it + won't print anything. 
*/ +- const char *header +- = header_for_builtin_fn (DECL_FUNCTION_CODE (olddecl)); ++ const char *header = header_for_builtin_fn (olddecl); + location_t loc = DECL_SOURCE_LOCATION (newdecl); + if (warning_at (loc, OPT_Wbuiltin_declaration_mismatch, + "conflicting types for built-in function %q+D; " +@@ -2637,7 +2637,8 @@ merge_decls (tree newdecl, tree olddecl, tree newtype, tree oldtype) + |= DECL_NO_INSTRUMENT_FUNCTION_ENTRY_EXIT (olddecl); + TREE_THIS_VOLATILE (newdecl) |= TREE_THIS_VOLATILE (olddecl); + DECL_IS_MALLOC (newdecl) |= DECL_IS_MALLOC (olddecl); +- DECL_IS_OPERATOR_NEW (newdecl) |= DECL_IS_OPERATOR_NEW (olddecl); ++ if (DECL_IS_OPERATOR_NEW_P (olddecl)) ++ DECL_SET_IS_OPERATOR_NEW (newdecl, true); + TREE_READONLY (newdecl) |= TREE_READONLY (olddecl); + DECL_PURE_P (newdecl) |= DECL_PURE_P (olddecl); + DECL_IS_NOVOPS (newdecl) |= DECL_IS_NOVOPS (olddecl); +@@ -2731,8 +2732,7 @@ merge_decls (tree newdecl, tree olddecl, tree newtype, tree oldtype) + { + /* If redeclaring a builtin function, it stays built in. + But it gets tagged as having been declared. */ +- DECL_BUILT_IN_CLASS (newdecl) = DECL_BUILT_IN_CLASS (olddecl); +- DECL_FUNCTION_CODE (newdecl) = DECL_FUNCTION_CODE (olddecl); ++ copy_decl_built_in_function (newdecl, olddecl); + C_DECL_DECLARED_BUILTIN (newdecl) = 1; + if (new_is_prototype) + { +@@ -3334,13 +3334,17 @@ implicit_decl_warning (location_t loc, tree id, tree olddecl) + hint.suppress (); + } + +-/* This function represents mapping of a function code FCODE +- to its respective header. */ ++/* Return the name of the header file that declares built-in function ++ FNDECL, or null if either we don't know or don't expect to see an ++ explicit declaration. */ + + static const char * +-header_for_builtin_fn (enum built_in_function fcode) ++header_for_builtin_fn (tree fndecl) + { +- switch (fcode) ++ if (DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL) ++ return NULL; ++ ++ switch (DECL_FUNCTION_CODE (fndecl)) + { + CASE_FLT_FN (BUILT_IN_ACOS): + CASE_FLT_FN (BUILT_IN_ACOSH): +@@ -3590,8 +3594,7 @@ implicitly_declare (location_t loc, tree functionid) + "declaration of built-in " + "function %qD", decl); + /* See if we can hint which header to include. */ +- const char *header +- = header_for_builtin_fn (DECL_FUNCTION_CODE (decl)); ++ const char *header = header_for_builtin_fn (decl); + if (header != NULL && warned) + { + rich_location richloc (line_table, loc); +@@ -4471,6 +4474,16 @@ c_builtin_function_ext_scope (tree decl) + + return decl; + } ++ ++/* Implement LANG_HOOKS_SIMULATE_BUILTIN_FUNCTION_DECL. */ ++ ++tree ++c_simulate_builtin_function_decl (tree decl) ++{ ++ tree type = TREE_TYPE (decl); ++ C_DECL_BUILTIN_PROTOTYPE (decl) = prototype_p (type); ++ return pushdecl (decl); ++} + + /* Called when a declaration is seen that contains no names to declare. + If its type is a reference to a structure, union or enum inherited +@@ -8746,6 +8759,8 @@ finish_enum (tree enumtype, tree values, tree attributes) + && !in_sizeof && !in_typeof && !in_alignof) + struct_parse_info->struct_types.safe_push (enumtype); + ++ C_TYPE_BEING_DEFINED (enumtype) = 0; ++ + return enumtype; + } + +@@ -8851,6 +8866,36 @@ build_enumerator (location_t decl_loc, location_t loc, + return tree_cons (decl, value, NULL_TREE); + } + ++/* Implement LANG_HOOKS_SIMULATE_ENUM_DECL. 
*/ ++ ++tree ++c_simulate_enum_decl (location_t loc, const char *name, ++ vec values) ++{ ++ location_t saved_loc = input_location; ++ input_location = loc; ++ ++ struct c_enum_contents the_enum; ++ tree enumtype = start_enum (loc, &the_enum, get_identifier (name)); ++ ++ tree value_chain = NULL_TREE; ++ string_int_pair *value; ++ unsigned int i; ++ FOR_EACH_VEC_ELT (values, i, value) ++ { ++ tree decl = build_enumerator (loc, loc, &the_enum, ++ get_identifier (value->first), ++ build_int_cst (integer_type_node, ++ value->second)); ++ TREE_CHAIN (decl) = value_chain; ++ value_chain = decl; ++ } ++ ++ finish_enum (enumtype, nreverse (value_chain), NULL_TREE); ++ ++ input_location = saved_loc; ++ return enumtype; ++} + + /* Create the FUNCTION_DECL for a function definition. + DECLSPECS, DECLARATOR and ATTRIBUTES are the parts of +diff --git a/gcc/c/c-objc-common.h b/gcc/c/c-objc-common.h +index f5e820420..c8739e0b8 100644 +--- a/gcc/c/c-objc-common.h ++++ b/gcc/c/c-objc-common.h +@@ -60,6 +60,9 @@ along with GCC; see the file COPYING3. If not see + #define LANG_HOOKS_BUILTIN_FUNCTION c_builtin_function + #undef LANG_HOOKS_BUILTIN_FUNCTION_EXT_SCOPE + #define LANG_HOOKS_BUILTIN_FUNCTION_EXT_SCOPE c_builtin_function_ext_scope ++#undef LANG_HOOKS_SIMULATE_BUILTIN_FUNCTION_DECL ++#define LANG_HOOKS_SIMULATE_BUILTIN_FUNCTION_DECL \ ++ c_simulate_builtin_function_decl + #undef LANG_HOOKS_EMITS_BEGIN_STMT + #define LANG_HOOKS_EMITS_BEGIN_STMT true + +@@ -72,6 +75,8 @@ along with GCC; see the file COPYING3. If not see + #undef LANG_HOOKS_TREE_DUMP_DUMP_TREE_FN + #define LANG_HOOKS_TREE_DUMP_DUMP_TREE_FN c_dump_tree + ++#undef LANG_HOOKS_SIMULATE_ENUM_DECL ++#define LANG_HOOKS_SIMULATE_ENUM_DECL c_simulate_enum_decl + #undef LANG_HOOKS_TYPE_FOR_MODE + #define LANG_HOOKS_TYPE_FOR_MODE c_common_type_for_mode + #undef LANG_HOOKS_TYPE_FOR_SIZE +diff --git a/gcc/c/c-tree.h b/gcc/c/c-tree.h +index 7e35ab1f0..19925e793 100644 +--- a/gcc/c/c-tree.h ++++ b/gcc/c/c-tree.h +@@ -561,6 +561,8 @@ extern tree finish_enum (tree, tree, tree); + extern void finish_function (void); + extern tree finish_struct (location_t, tree, tree, tree, + struct c_struct_parse_info *); ++extern tree c_simulate_enum_decl (location_t, const char *, ++ vec); + extern struct c_arg_info *build_arg_info (void); + extern struct c_arg_info *get_parm_info (bool, tree); + extern tree grokfield (location_t, struct c_declarator *, +@@ -577,6 +579,7 @@ extern struct c_declarator *set_array_declarator_inner (struct c_declarator *, + struct c_declarator *); + extern tree c_builtin_function (tree); + extern tree c_builtin_function_ext_scope (tree); ++extern tree c_simulate_builtin_function_decl (tree); + extern void shadow_tag (const struct c_declspecs *); + extern void shadow_tag_warned (const struct c_declspecs *, int); + extern tree start_enum (location_t, struct c_enum_contents *, tree); +diff --git a/gcc/c/c-typeck.c b/gcc/c/c-typeck.c +index cb999cbf8..87f4178ec 100644 +--- a/gcc/c/c-typeck.c ++++ b/gcc/c/c-typeck.c +@@ -3002,6 +3002,8 @@ inform_declaration (tree decl) + } + + /* Build a function call to function FUNCTION with parameters PARAMS. ++ If FUNCTION is the result of resolving an overloaded target built-in, ++ ORIG_FUNDECL is the original function decl, otherwise it is null. + ORIGTYPES, if not NULL, is a vector of types; each element is + either NULL or the original type of the corresponding element in + PARAMS. 
The original type may differ from TREE_TYPE of the +@@ -3012,7 +3014,7 @@ inform_declaration (tree decl) + tree + build_function_call_vec (location_t loc, vec arg_loc, + tree function, vec *params, +- vec *origtypes) ++ vec *origtypes, tree orig_fundecl) + { + tree fntype, fundecl = NULL_TREE; + tree name = NULL_TREE, result; +@@ -3032,6 +3034,8 @@ build_function_call_vec (location_t loc, vec arg_loc, + if (flag_tm) + tm_malloc_replacement (function); + fundecl = function; ++ if (!orig_fundecl) ++ orig_fundecl = fundecl; + /* Atomic functions have type checking/casting already done. They are + often rewritten and don't match the original parameter list. */ + if (name && !strncmp (IDENTIFIER_POINTER (name), "__atomic_", 9)) +@@ -3109,9 +3113,10 @@ build_function_call_vec (location_t loc, vec arg_loc, + argarray = vec_safe_address (params); + + /* Check that arguments to builtin functions match the expectations. */ +- if (fundecl && fndecl_built_in_p (fundecl, BUILT_IN_NORMAL) +- && !check_builtin_function_arguments (loc, arg_loc, fundecl, nargs, +- argarray)) ++ if (fundecl ++ && fndecl_built_in_p (fundecl) ++ && !check_builtin_function_arguments (loc, arg_loc, fundecl, ++ orig_fundecl, nargs, argarray)) + return error_mark_node; + + /* Check that the arguments to the function are valid. */ +diff --git a/gcc/caller-save.c b/gcc/caller-save.c +index 9ff470c33..0d66e0ce5 100644 +--- a/gcc/caller-save.c ++++ b/gcc/caller-save.c +@@ -37,6 +37,7 @@ along with GCC; see the file COPYING3. If not see + #include "dumpfile.h" + #include "rtl-iter.h" + #include "target.h" ++#include "function-abi.h" + + #define MOVE_MAX_WORDS (MOVE_MAX / UNITS_PER_WORD) + +@@ -192,29 +193,17 @@ init_caller_save (void) + + caller_save_initialized_p = true; + +- CLEAR_HARD_REG_SET (no_caller_save_reg_set); + /* First find all the registers that we need to deal with and all + the modes that they can have. If we can't find a mode to use, + we can't have the register live over calls. */ + + for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) +- { +- if (call_used_regs[i] +- && !TEST_HARD_REG_BIT (call_fixed_reg_set, i)) +- { +- for (j = 1; j <= MOVE_MAX_WORDS; j++) +- { +- regno_save_mode[i][j] = HARD_REGNO_CALLER_SAVE_MODE (i, j, +- VOIDmode); +- if (regno_save_mode[i][j] == VOIDmode && j == 1) +- { +- SET_HARD_REG_BIT (call_fixed_reg_set, i); +- } +- } +- } +- else +- regno_save_mode[i][1] = VOIDmode; +- } ++ for (j = 1; j <= MOVE_MAX_WORDS; j++) ++ { ++ regno_save_mode[i][j] = HARD_REGNO_CALLER_SAVE_MODE (i, j, VOIDmode); ++ if (regno_save_mode[i][j] == VOIDmode && j == 1) ++ CLEAR_HARD_REG_BIT (savable_regs, i); ++ } + + /* The following code tries to approximate the conditions under which + we can easily save and restore a register without scratch registers or +@@ -275,11 +264,7 @@ init_caller_save (void) + { + regno_save_mode[i][j] = VOIDmode; + if (j == 1) +- { +- SET_HARD_REG_BIT (call_fixed_reg_set, i); +- if (call_used_regs[i]) +- SET_HARD_REG_BIT (no_caller_save_reg_set, i); +- } ++ CLEAR_HARD_REG_BIT (savable_regs, i); + } + } + +@@ -442,7 +427,9 @@ setup_save_areas (void) + freq = REG_FREQ_FROM_BB (BLOCK_FOR_INSN (insn)); + REG_SET_TO_HARD_REG_SET (hard_regs_to_save, + &chain->live_throughout); +- get_call_reg_set_usage (insn, &used_regs, call_used_reg_set); ++ used_regs = insn_callee_abi (insn).full_reg_clobbers (); ++ /* ??? This preserves traditional behavior; it might not be needed. */ ++ used_regs |= fixed_reg_set; + + /* Record all registers set in this call insn. These don't + need to be saved. N.B. 
the call insn might set a subreg +@@ -450,14 +437,13 @@ setup_save_areas (void) + live during the call, but the subreg that is set + isn't. */ + CLEAR_HARD_REG_SET (this_insn_sets); +- note_stores (PATTERN (insn), mark_set_regs, &this_insn_sets); ++ note_stores (insn, mark_set_regs, &this_insn_sets); + /* Sibcalls are considered to set the return value. */ + if (SIBLING_CALL_P (insn) && crtl->return_rtx) + mark_set_regs (crtl->return_rtx, NULL_RTX, &this_insn_sets); + +- AND_COMPL_HARD_REG_SET (used_regs, call_fixed_reg_set); +- AND_COMPL_HARD_REG_SET (used_regs, this_insn_sets); +- AND_HARD_REG_SET (hard_regs_to_save, used_regs); ++ used_regs &= ~(fixed_reg_set | this_insn_sets); ++ hard_regs_to_save &= used_regs & savable_regs; + for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) + if (TEST_HARD_REG_BIT (hard_regs_to_save, regno)) + { +@@ -526,7 +512,10 @@ setup_save_areas (void) + + REG_SET_TO_HARD_REG_SET (hard_regs_to_save, + &chain->live_throughout); +- get_call_reg_set_usage (insn, &used_regs, call_used_reg_set); ++ used_regs = insn_callee_abi (insn).full_reg_clobbers (); ++ /* ??? This preserves traditional behavior; it might not ++ be needed. */ ++ used_regs |= fixed_reg_set; + + /* Record all registers set in this call insn. These don't + need to be saved. N.B. the call insn might set a subreg +@@ -534,15 +523,14 @@ setup_save_areas (void) + live during the call, but the subreg that is set + isn't. */ + CLEAR_HARD_REG_SET (this_insn_sets); +- note_stores (PATTERN (insn), mark_set_regs, &this_insn_sets); ++ note_stores (insn, mark_set_regs, &this_insn_sets); + /* Sibcalls are considered to set the return value, + compare df-scan.c:df_get_call_refs. */ + if (SIBLING_CALL_P (insn) && crtl->return_rtx) + mark_set_regs (crtl->return_rtx, NULL_RTX, &this_insn_sets); + +- AND_COMPL_HARD_REG_SET (used_regs, call_fixed_reg_set); +- AND_COMPL_HARD_REG_SET (used_regs, this_insn_sets); +- AND_HARD_REG_SET (hard_regs_to_save, used_regs); ++ used_regs &= ~(fixed_reg_set | this_insn_sets); ++ hard_regs_to_save &= used_regs & savable_regs; + for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) + if (TEST_HARD_REG_BIT (hard_regs_to_save, regno)) + { +@@ -775,13 +763,13 @@ save_call_clobbered_regs (void) + + if (code == JUMP_INSN) + /* Restore all registers if this is a JUMP_INSN. */ +- COPY_HARD_REG_SET (referenced_regs, hard_regs_saved); ++ referenced_regs = hard_regs_saved; + else + { + CLEAR_HARD_REG_SET (referenced_regs); + mark_referenced_regs (&PATTERN (insn), + mark_reg_as_referenced, NULL); +- AND_HARD_REG_SET (referenced_regs, hard_regs_saved); ++ referenced_regs &= hard_regs_saved; + } + + for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) +@@ -795,8 +783,8 @@ save_call_clobbered_regs (void) + be live across the call, while the other is set + afterwards. */ + CLEAR_HARD_REG_SET (this_insn_sets); +- note_stores (PATTERN (insn), mark_set_regs, &this_insn_sets); +- AND_COMPL_HARD_REG_SET (hard_regs_saved, this_insn_sets); ++ note_stores (insn, mark_set_regs, &this_insn_sets); ++ hard_regs_saved &= ~this_insn_sets; + } + + if (code == CALL_INSN +@@ -849,15 +837,18 @@ save_call_clobbered_regs (void) + multi-hard-reg pseudo; then the pseudo is considered live + during the call, but the subreg that is set isn't. */ + CLEAR_HARD_REG_SET (this_insn_sets); +- note_stores (PATTERN (insn), mark_set_regs, &this_insn_sets); ++ note_stores (insn, mark_set_regs, &this_insn_sets); + + /* Compute which hard regs must be saved before this call. 
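The caller-save.c hunks above show two mechanical conversions that recur throughout this patch: note_stores now takes the insn rather than its PATTERN, and the HARD_REG_SET macro calls (AND_COMPL_HARD_REG_SET, AND_HARD_REG_SET, COPY_HARD_REG_SET) become overloaded C++ operators, with the call-clobbered set coming from insn_callee_abi ().full_reg_clobbers () instead of get_call_reg_set_usage. A minimal sketch of the resulting idiom, kept to names that appear in these hunks (the helper itself is hypothetical and assumes GCC's internal headers; it is not a standalone program):

/* Compute the registers that must be caller-saved around INSN:
   live registers that the callee's ABI may clobber, restricted to
   the savable, non-fixed ones.  */
static HARD_REG_SET
sketch_regs_to_save_around_call (rtx_insn *insn, const HARD_REG_SET &live)
{
  HARD_REG_SET clobbered = insn_callee_abi (insn).full_reg_clobbers ();
  HARD_REG_SET to_save = live & clobbered & savable_regs;
  to_save &= ~fixed_reg_set;
  return to_save;
}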
*/ +- AND_COMPL_HARD_REG_SET (hard_regs_to_save, call_fixed_reg_set); +- AND_COMPL_HARD_REG_SET (hard_regs_to_save, this_insn_sets); +- AND_COMPL_HARD_REG_SET (hard_regs_to_save, hard_regs_saved); +- get_call_reg_set_usage (insn, &call_def_reg_set, +- call_used_reg_set); +- AND_HARD_REG_SET (hard_regs_to_save, call_def_reg_set); ++ hard_regs_to_save &= ~(fixed_reg_set ++ | this_insn_sets ++ | hard_regs_saved); ++ hard_regs_to_save &= savable_regs; ++ call_def_reg_set = insn_callee_abi (insn).full_reg_clobbers (); ++ /* ??? This preserves traditional behavior; it might not ++ be needed. */ ++ call_def_reg_set |= fixed_reg_set; ++ hard_regs_to_save &= call_def_reg_set; + + for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) + if (TEST_HARD_REG_BIT (hard_regs_to_save, regno)) +@@ -872,7 +863,8 @@ save_call_clobbered_regs (void) + + if (cheap + && HARD_REGISTER_P (cheap) +- && TEST_HARD_REG_BIT (call_used_reg_set, REGNO (cheap))) ++ && TEST_HARD_REG_BIT (call_used_or_fixed_regs, ++ REGNO (cheap))) + { + rtx dest, newpat; + rtx pat = PATTERN (insn); +@@ -1414,8 +1406,7 @@ insert_one_insn (struct insn_chain *chain, int before_p, int code, rtx pat) + /* Registers that are set in CHAIN->INSN live in the new insn. + (Unless there is a REG_UNUSED note for them, but we don't + look for them here.) */ +- note_stores (PATTERN (chain->insn), add_stored_regs, +- &new_chain->live_throughout); ++ note_stores (chain->insn, add_stored_regs, &new_chain->live_throughout); + CLEAR_REG_SET (&new_chain->dead_or_set); + if (chain->insn == BB_END (BASIC_BLOCK_FOR_FN (cfun, chain->block))) + BB_END (BASIC_BLOCK_FOR_FN (cfun, chain->block)) = new_chain->insn; +diff --git a/gcc/calls.c b/gcc/calls.c +index 567959956..2638752ad 100644 +--- a/gcc/calls.c ++++ b/gcc/calls.c +@@ -346,7 +346,8 @@ prepare_call_address (tree fndecl_or_type, rtx funexp, rtx static_chain_value, + It is zero if this call doesn't want a structure value. + + NEXT_ARG_REG is the rtx that results from executing +- targetm.calls.function_arg (&args_so_far, VOIDmode, void_type_node, true) ++ targetm.calls.function_arg (&args_so_far, ++ function_arg_info::end_marker ()); + just after all the args have had their registers assigned. + This could be whatever you like, but normally it is the first + arg-register beyond those used for args in this call, +@@ -897,13 +898,12 @@ call_expr_flags (const_tree t) + return flags; + } + +-/* Return true if TYPE should be passed by invisible reference. */ ++/* Return true if ARG should be passed by invisible reference. */ + + bool +-pass_by_reference (CUMULATIVE_ARGS *ca, machine_mode mode, +- tree type, bool named_arg) ++pass_by_reference (CUMULATIVE_ARGS *ca, function_arg_info arg) + { +- if (type) ++ if (tree type = arg.type) + { + /* If this type contains non-trivial constructors, then it is + forbidden for the middle-end to create any new copies. */ +@@ -911,33 +911,55 @@ pass_by_reference (CUMULATIVE_ARGS *ca, machine_mode mode, + return true; + + /* GCC post 3.4 passes *all* variable sized types by reference. */ +- if (!TYPE_SIZE (type) || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST) ++ if (!TYPE_SIZE (type) || !poly_int_tree_p (TYPE_SIZE (type))) + return true; + + /* If a record type should be passed the same as its first (and only) + member, use the type and mode of that member. 
*/ + if (TREE_CODE (type) == RECORD_TYPE && TYPE_TRANSPARENT_AGGR (type)) + { +- type = TREE_TYPE (first_field (type)); +- mode = TYPE_MODE (type); ++ arg.type = TREE_TYPE (first_field (type)); ++ arg.mode = TYPE_MODE (arg.type); + } + } + +- return targetm.calls.pass_by_reference (pack_cumulative_args (ca), mode, +- type, named_arg); ++ return targetm.calls.pass_by_reference (pack_cumulative_args (ca), arg); + } + +-/* Return true if TYPE, which is passed by reference, should be callee ++/* Return true if TYPE should be passed by reference when passed to ++ the "..." arguments of a function. */ ++ ++bool ++pass_va_arg_by_reference (tree type) ++{ ++ return pass_by_reference (NULL, function_arg_info (type, /*named=*/false)); ++} ++ ++/* Decide whether ARG, which occurs in the state described by CA, ++ should be passed by reference. Return true if so and update ++ ARG accordingly. */ ++ ++bool ++apply_pass_by_reference_rules (CUMULATIVE_ARGS *ca, function_arg_info &arg) ++{ ++ if (pass_by_reference (ca, arg)) ++ { ++ arg.type = build_pointer_type (arg.type); ++ arg.mode = TYPE_MODE (arg.type); ++ return true; ++ } ++ return false; ++} ++ ++/* Return true if ARG, which is passed by reference, should be callee + copied instead of caller copied. */ + + bool +-reference_callee_copied (CUMULATIVE_ARGS *ca, machine_mode mode, +- tree type, bool named_arg) ++reference_callee_copied (CUMULATIVE_ARGS *ca, const function_arg_info &arg) + { +- if (type && TREE_ADDRESSABLE (type)) ++ if (arg.type && TREE_ADDRESSABLE (arg.type)) + return false; +- return targetm.calls.callee_copies (pack_cumulative_args (ca), mode, type, +- named_arg); ++ return targetm.calls.callee_copies (pack_cumulative_args (ca), arg); + } + + +@@ -1350,7 +1372,6 @@ maybe_warn_alloc_args_overflow (tree fn, tree exp, tree args[2], int idx[2]) + location_t loc = EXPR_LOCATION (exp); + + tree fntype = fn ? TREE_TYPE (fn) : TREE_TYPE (TREE_TYPE (exp)); +- built_in_function fncode = fn ? DECL_FUNCTION_CODE (fn) : BUILT_IN_NONE; + bool warned = false; + + /* Validate each argument individually. */ +@@ -1376,11 +1397,10 @@ maybe_warn_alloc_args_overflow (tree fn, tree exp, tree args[2], int idx[2]) + friends. + Also avoid issuing the warning for calls to function named + "alloca". */ +- if ((fncode == BUILT_IN_ALLOCA +- && IDENTIFIER_LENGTH (DECL_NAME (fn)) != 6) +- || (fncode != BUILT_IN_ALLOCA +- && !lookup_attribute ("returns_nonnull", +- TYPE_ATTRIBUTES (fntype)))) ++ if (fn && fndecl_built_in_p (fn, BUILT_IN_ALLOCA) ++ ? IDENTIFIER_LENGTH (DECL_NAME (fn)) != 6 ++ : !lookup_attribute ("returns_nonnull", ++ TYPE_ATTRIBUTES (fntype))) + warned = warning_at (loc, OPT_Walloc_zero, + "%Kargument %i value is zero", + exp, idx[i] + 1); +@@ -1395,7 +1415,7 @@ maybe_warn_alloc_args_overflow (tree fn, tree exp, tree args[2], int idx[2]) + && fn + && !args[1] + && lang_GNU_CXX () +- && DECL_IS_OPERATOR_NEW (fn) ++ && DECL_IS_OPERATOR_NEW_P (fn) + && integer_all_onesp (args[i])) + continue; + +@@ -1989,15 +2009,13 @@ initialize_argument_information (int num_actuals ATTRIBUTE_UNUSED, + with those made by function.c. */ + + /* See if this argument should be passed by invisible reference. 
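From here on, calls.c passes argument properties around as one function_arg_info descriptor instead of separate mode/type/named parameters; apply_pass_by_reference_rules, added above, additionally rewrites the descriptor to the pointer type when the target wants the argument by invisible reference. A minimal sketch of a caller of the new interface (the wrapper name and its parameters are illustrative; the constructor and the query function are the ones introduced in these hunks):

/* Return true if an argument of type ARG_TYPE, named according to
   NAMED, is passed by invisible reference in the state CA.  */
static bool
sketch_arg_passed_by_reference (CUMULATIVE_ARGS *ca, tree arg_type, bool named)
{
  /* The two-argument constructor takes the mode from the type.  */
  function_arg_info arg (arg_type, named);
  /* On a true return this also rewrites ARG to describe the pointer
     that is actually passed; a real caller would keep using ARG.  */
  return apply_pass_by_reference_rules (ca, arg);
}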
*/ +- if (pass_by_reference (args_so_far_pnt, TYPE_MODE (type), +- type, argpos < n_named_args)) ++ function_arg_info orig_arg (type, argpos < n_named_args); ++ if (pass_by_reference (args_so_far_pnt, orig_arg)) + { + bool callee_copies; + tree base = NULL_TREE; + +- callee_copies +- = reference_callee_copied (args_so_far_pnt, TYPE_MODE (type), +- type, argpos < n_named_args); ++ callee_copies = reference_callee_copied (args_so_far_pnt, orig_arg); + + /* If we're compiling a thunk, pass through invisible references + instead of making a copy. */ +@@ -2118,8 +2136,8 @@ initialize_argument_information (int num_actuals ATTRIBUTE_UNUSED, + + targetm.calls.warn_parameter_passing_abi (args_so_far, type); + +- args[i].reg = targetm.calls.function_arg (args_so_far, mode, type, +- argpos < n_named_args); ++ function_arg_info arg (type, mode, argpos < n_named_args); ++ args[i].reg = targetm.calls.function_arg (args_so_far, arg); + + if (args[i].reg && CONST_INT_P (args[i].reg)) + args[i].reg = NULL; +@@ -2129,17 +2147,14 @@ initialize_argument_information (int num_actuals ATTRIBUTE_UNUSED, + arguments have to go into the incoming registers. */ + if (targetm.calls.function_incoming_arg != targetm.calls.function_arg) + args[i].tail_call_reg +- = targetm.calls.function_incoming_arg (args_so_far, mode, type, +- argpos < n_named_args); ++ = targetm.calls.function_incoming_arg (args_so_far, arg); + else + args[i].tail_call_reg = args[i].reg; + + if (args[i].reg) +- args[i].partial +- = targetm.calls.arg_partial_bytes (args_so_far, mode, type, +- argpos < n_named_args); ++ args[i].partial = targetm.calls.arg_partial_bytes (args_so_far, arg); + +- args[i].pass_on_stack = targetm.calls.must_pass_in_stack (mode, type); ++ args[i].pass_on_stack = targetm.calls.must_pass_in_stack (arg); + + /* If FUNCTION_ARG returned a (parallel [(expr_list (nil) ...) ...]), + it means that we are to pass this arg in the register(s) designated +@@ -2188,8 +2203,13 @@ initialize_argument_information (int num_actuals ATTRIBUTE_UNUSED, + /* Increment ARGS_SO_FAR, which has info about which arg-registers + have been used, etc. */ + +- targetm.calls.function_arg_advance (args_so_far, TYPE_MODE (type), +- type, argpos < n_named_args); ++ /* ??? Traditionally we've passed TYPE_MODE here, instead of the ++ promoted_mode used for function_arg above. However, the ++ corresponding handling of incoming arguments in function.c ++ does pass the promoted mode. */ ++ function_arg_info arg_to_skip (type, TYPE_MODE (type), ++ argpos < n_named_args); ++ targetm.calls.function_arg_advance (args_so_far, arg_to_skip); + + /* Store argument values for functions decorated with attribute + alloc_size. */ +@@ -4222,14 +4242,11 @@ expand_call (tree exp, rtx target, int ignore) + /* Set up next argument register. For sibling calls on machines + with register windows this should be the incoming register. 
*/ + if (pass == 0) +- next_arg_reg = targetm.calls.function_incoming_arg (args_so_far, +- VOIDmode, +- void_type_node, +- true); ++ next_arg_reg = targetm.calls.function_incoming_arg ++ (args_so_far, function_arg_info::end_marker ()); + else +- next_arg_reg = targetm.calls.function_arg (args_so_far, +- VOIDmode, void_type_node, +- true); ++ next_arg_reg = targetm.calls.function_arg ++ (args_so_far, function_arg_info::end_marker ()); + + if (pass == 1 && (return_flags & ERF_RETURNS_ARG)) + { +@@ -4846,10 +4863,9 @@ emit_library_call_value_1 (int retval, rtx orgfun, rtx value, + argvec[count].mode = Pmode; + argvec[count].partial = 0; + +- argvec[count].reg = targetm.calls.function_arg (args_so_far, +- Pmode, NULL_TREE, true); +- gcc_assert (targetm.calls.arg_partial_bytes (args_so_far, Pmode, +- NULL_TREE, 1) == 0); ++ function_arg_info ptr_arg (Pmode, /*named=*/true); ++ argvec[count].reg = targetm.calls.function_arg (args_so_far, ptr_arg); ++ gcc_assert (targetm.calls.arg_partial_bytes (args_so_far, ptr_arg) == 0); + + locate_and_pad_parm (Pmode, NULL_TREE, + #ifdef STACK_PARMS_IN_REG_PARM_AREA +@@ -4864,7 +4880,7 @@ emit_library_call_value_1 (int retval, rtx orgfun, rtx value, + || reg_parm_stack_space > 0) + args_size.constant += argvec[count].locate.size.constant; + +- targetm.calls.function_arg_advance (args_so_far, Pmode, (tree) 0, true); ++ targetm.calls.function_arg_advance (args_so_far, ptr_arg); + + count++; + } +@@ -4885,11 +4901,11 @@ emit_library_call_value_1 (int retval, rtx orgfun, rtx value, + && !(CONSTANT_P (val) && targetm.legitimate_constant_p (mode, val))) + val = force_operand (val, NULL_RTX); + +- if (pass_by_reference (&args_so_far_v, mode, NULL_TREE, 1)) ++ function_arg_info orig_arg (mode, /*named=*/true); ++ if (pass_by_reference (&args_so_far_v, orig_arg)) + { + rtx slot; +- int must_copy +- = !reference_callee_copied (&args_so_far_v, mode, NULL_TREE, 1); ++ int must_copy = !reference_callee_copied (&args_so_far_v, orig_arg); + + /* If this was a CONST function, it is now PURE since it now + reads memory. 
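In the expand_call and emit_library_call_value_1 hunks above, the historical end-of-arguments convention of passing (VOIDmode, void_type_node, true) to the argument hooks is replaced by function_arg_info::end_marker (), which targets can recognise with end_marker_p () (both defined in the calls.h hunk further down). A minimal sketch of the target-side check (the hook body is hypothetical; only the two helpers and the hook's argument list come from this patch):

/* Fragment of a TARGET_FUNCTION_ARG implementation under the new
   interface.  */
static rtx
sketch_target_function_arg (cumulative_args_t cum ATTRIBUTE_UNUSED,
			    const function_arg_info &arg)
{
  /* The end marker is built from void_type_node, so its mode is
     VOIDmode, which is exactly what end_marker_p tests.  */
  if (arg.end_marker_p ())
    return NULL_RTX;

  /* ... normal handling of ARG goes here ...  */
  return NULL_RTX;
}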
*/ +@@ -4927,13 +4943,13 @@ emit_library_call_value_1 (int retval, rtx orgfun, rtx value, + } + + mode = promote_function_mode (NULL_TREE, mode, &unsigned_p, NULL_TREE, 0); ++ function_arg_info arg (mode, /*named=*/true); + argvec[count].mode = mode; + argvec[count].value = convert_modes (mode, GET_MODE (val), val, unsigned_p); +- argvec[count].reg = targetm.calls.function_arg (args_so_far, mode, +- NULL_TREE, true); ++ argvec[count].reg = targetm.calls.function_arg (args_so_far, arg); + + argvec[count].partial +- = targetm.calls.arg_partial_bytes (args_so_far, mode, NULL_TREE, 1); ++ = targetm.calls.arg_partial_bytes (args_so_far, arg); + + if (argvec[count].reg == 0 + || argvec[count].partial != 0 +@@ -4959,7 +4975,7 @@ emit_library_call_value_1 (int retval, rtx orgfun, rtx value, + known_le (GET_MODE_SIZE (mode), UNITS_PER_WORD)); + #endif + +- targetm.calls.function_arg_advance (args_so_far, mode, (tree) 0, true); ++ targetm.calls.function_arg_advance (args_so_far, arg); + } + + /* If this machine requires an external definition for library +@@ -5302,7 +5318,7 @@ emit_library_call_value_1 (int retval, rtx orgfun, rtx value, + original_args_size.constant, args_size.constant, + struct_value_size, + targetm.calls.function_arg (args_so_far, +- VOIDmode, void_type_node, true), ++ function_arg_info::end_marker ()), + valreg, + old_inhibit_defer_pop + 1, call_fusage, flags, args_so_far); + +@@ -5815,22 +5831,21 @@ store_one_arg (struct arg_data *arg, rtx argblock, int flags, + return sibcall_failure; + } + +-/* Nonzero if we do not know how to pass TYPE solely in registers. */ ++/* Nonzero if we do not know how to pass ARG solely in registers. */ + + bool +-must_pass_in_stack_var_size (machine_mode mode ATTRIBUTE_UNUSED, +- const_tree type) ++must_pass_in_stack_var_size (const function_arg_info &arg) + { +- if (!type) ++ if (!arg.type) + return false; + + /* If the type has variable size... */ +- if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST) ++ if (!poly_int_tree_p (TYPE_SIZE (arg.type))) + return true; + + /* If the type is marked as addressable (it is required + to be constructed into the stack)... */ +- if (TREE_ADDRESSABLE (type)) ++ if (TREE_ADDRESSABLE (arg.type)) + return true; + + return false; +@@ -5841,33 +5856,43 @@ must_pass_in_stack_var_size (machine_mode mode ATTRIBUTE_UNUSED, + /* ??? Should be able to merge these two by examining BLOCK_REG_PADDING. */ + + bool +-must_pass_in_stack_var_size_or_pad (machine_mode mode, const_tree type) ++must_pass_in_stack_var_size_or_pad (const function_arg_info &arg) + { +- if (!type) ++ if (!arg.type) + return false; + + /* If the type has variable size... */ +- if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST) ++ if (TREE_CODE (TYPE_SIZE (arg.type)) != INTEGER_CST) + return true; + + /* If the type is marked as addressable (it is required + to be constructed into the stack)... */ +- if (TREE_ADDRESSABLE (type)) ++ if (TREE_ADDRESSABLE (arg.type)) + return true; + +- if (TYPE_EMPTY_P (type)) ++ if (TYPE_EMPTY_P (arg.type)) + return false; + + /* If the padding and mode of the type is such that a copy into + a register would put it into the wrong part of the register. */ +- if (mode == BLKmode +- && int_size_in_bytes (type) % (PARM_BOUNDARY / BITS_PER_UNIT) +- && (targetm.calls.function_arg_padding (mode, type) ++ if (arg.mode == BLKmode ++ && int_size_in_bytes (arg.type) % (PARM_BOUNDARY / BITS_PER_UNIT) ++ && (targetm.calls.function_arg_padding (arg.mode, arg.type) + == (BYTES_BIG_ENDIAN ? 
PAD_UPWARD : PAD_DOWNWARD))) + return true; + + return false; + } + ++/* Return true if TYPE must be passed on the stack when passed to ++ the "..." arguments of a function. */ ++ ++bool ++must_pass_va_arg_in_stack (tree type) ++{ ++ function_arg_info arg (type, /*named=*/false); ++ return targetm.calls.must_pass_in_stack (arg); ++} ++ + /* Tell the garbage collector about GTY markers in this source file. */ + #include "gt-calls.h" +diff --git a/gcc/calls.h b/gcc/calls.h +index 128bb5130..01ab3905a 100644 +--- a/gcc/calls.h ++++ b/gcc/calls.h +@@ -20,23 +20,108 @@ along with GCC; see the file COPYING3. If not see + #ifndef GCC_CALLS_H + #define GCC_CALLS_H + ++/* Describes a function argument. ++ ++ Each argument conceptually has a gimple-level type. Usually this type ++ is available directly as a tree via the TYPE field, but when calling ++ libgcc support functions it might instead be inferred from a mode, ++ in which case the type isn't available directly. ++ ++ This gimple-level type might go through promotion before being passed to ++ the target function. Depending on the context, the MODE field is either ++ the mode of the gimple-level type (whether explicitly given or not) ++ or the mode after promotion has been performed. */ ++class function_arg_info ++{ ++public: ++ function_arg_info () : type (NULL_TREE), mode (VOIDmode), named (false) {} ++ ++ /* Initialize an argument of mode MODE, either before or after promotion. */ ++ function_arg_info (machine_mode mode, bool named) ++ : type (NULL_TREE), mode (mode), named (named) ++ {} ++ ++ /* Initialize an unpromoted argument of type TYPE. */ ++ function_arg_info (tree type, bool named) ++ : type (type), mode (TYPE_MODE (type)), named (named) ++ {} ++ ++ /* Initialize an argument with explicit properties. */ ++ function_arg_info (tree type, machine_mode mode, bool named) ++ : type (type), mode (mode), named (named) ++ {} ++ ++ /* Return true if the gimple-level type is an aggregate. */ ++ bool aggregate_type_p () const { return type && AGGREGATE_TYPE_P (type); } ++ ++ /* Return the size of the gimple-level type, or -1 if the size is ++ variable or otherwise not representable as a poly_int64. ++ ++ Use this function when MODE is the mode of the type before promotion, ++ or in any context if the target never promotes function arguments. */ ++ poly_int64 type_size_in_bytes () const ++ { ++ if (type) ++ return int_size_in_bytes (type); ++ return GET_MODE_SIZE (mode); ++ } ++ ++ /* Return the size of the argument after promotion, or -1 if the size ++ is variable or otherwise not representable as a poly_int64. ++ ++ Use this function when MODE is the mode of the type after promotion. */ ++ poly_int64 promoted_size_in_bytes () const ++ { ++ if (mode == BLKmode) ++ return int_size_in_bytes (type); ++ return GET_MODE_SIZE (mode); ++ } ++ ++ /* True if the argument represents the end of the argument list, ++ as returned by end_marker (). */ ++ bool end_marker_p () const { return mode == VOIDmode; } ++ ++ /* Return a function_arg_info that represents the end of the ++ argument list. */ ++ static function_arg_info end_marker () ++ { ++ return function_arg_info (void_type_node, /*named=*/true); ++ } ++ ++ /* The type of the argument, or null if not known (which is true for ++ libgcc support functions). */ ++ tree type; ++ ++ /* The mode of the argument. Depending on context, this might be ++ the mode of the argument type or the mode after promotion. 
*/ ++ machine_mode mode; ++ ++ /* True if the argument is treated as a named argument, false if it is ++ treated as an unnamed variadic argument (i.e. one passed through ++ "..."). See also TARGET_STRICT_ARGUMENT_NAMING. */ ++ unsigned int named : 1; ++}; ++ + extern int flags_from_decl_or_type (const_tree); + extern int call_expr_flags (const_tree); + extern int setjmp_call_p (const_tree); + extern bool gimple_maybe_alloca_call_p (const gimple *); + extern bool gimple_alloca_call_p (const gimple *); + extern bool alloca_call_p (const_tree); +-extern bool must_pass_in_stack_var_size (machine_mode, const_tree); +-extern bool must_pass_in_stack_var_size_or_pad (machine_mode, const_tree); ++extern bool must_pass_in_stack_var_size (const function_arg_info &); ++extern bool must_pass_in_stack_var_size_or_pad (const function_arg_info &); ++extern bool must_pass_va_arg_in_stack (tree); + extern rtx prepare_call_address (tree, rtx, rtx, rtx *, int, int); + extern bool shift_return_value (machine_mode, bool, rtx); + extern rtx expand_call (tree, rtx, int); + extern void fixup_tail_calls (void); + +-extern bool pass_by_reference (CUMULATIVE_ARGS *, machine_mode, +- tree, bool); +-extern bool reference_callee_copied (CUMULATIVE_ARGS *, machine_mode, +- tree, bool); ++extern bool pass_by_reference (CUMULATIVE_ARGS *, function_arg_info); ++extern bool pass_va_arg_by_reference (tree); ++extern bool apply_pass_by_reference_rules (CUMULATIVE_ARGS *, ++ function_arg_info &); ++extern bool reference_callee_copied (CUMULATIVE_ARGS *, ++ const function_arg_info &); + extern void maybe_warn_alloc_args_overflow (tree, tree, tree[2], int[2]); + extern tree get_attr_nonstring_decl (tree, tree * = NULL); + extern void maybe_warn_nonstring_arg (tree, tree); +diff --git a/gcc/cfgcleanup.c b/gcc/cfgcleanup.c +index 8c464ec79..ff7f014da 100644 +--- a/gcc/cfgcleanup.c ++++ b/gcc/cfgcleanup.c +@@ -54,6 +54,7 @@ along with GCC; see the file COPYING3. If not see + #include "dbgcnt.h" + #include "rtl-iter.h" + #include "regs.h" ++#include "function-abi.h" + + #define FORWARDER_BLOCK_P(BB) ((BB)->flags & BB_FORWARDER_BLOCK) + +@@ -1230,12 +1231,13 @@ old_insns_match_p (int mode ATTRIBUTE_UNUSED, rtx_insn *i1, rtx_insn *i2) + } + } + +- HARD_REG_SET i1_used, i2_used; ++ HARD_REG_SET i1_used = insn_callee_abi (i1).full_reg_clobbers (); ++ HARD_REG_SET i2_used = insn_callee_abi (i2).full_reg_clobbers (); ++ /* ??? This preserves traditional behavior; it might not be needed. */ ++ i1_used |= fixed_reg_set; ++ i2_used |= fixed_reg_set; + +- get_call_reg_set_usage (i1, &i1_used, call_used_reg_set); +- get_call_reg_set_usage (i2, &i2_used, call_used_reg_set); +- +- if (!hard_reg_set_equal_p (i1_used, i2_used)) ++ if (i1_used != i2_used) + return dir_none; + } + +@@ -1269,7 +1271,7 @@ old_insns_match_p (int mode ATTRIBUTE_UNUSED, rtx_insn *i1, rtx_insn *i2) + if (REG_NOTE_KIND (note) == REG_DEAD && STACK_REG_P (XEXP (note, 0))) + SET_HARD_REG_BIT (i2_regset, REGNO (XEXP (note, 0))); + +- if (!hard_reg_set_equal_p (i1_regset, i2_regset)) ++ if (i1_regset != i2_regset) + return dir_none; + } + #endif +diff --git a/gcc/cfgexpand.c b/gcc/cfgexpand.c +index 4ae8e3b32..218414b39 100644 +--- a/gcc/cfgexpand.c ++++ b/gcc/cfgexpand.c +@@ -2874,6 +2874,15 @@ asm_clobber_reg_is_valid (int regno, int nregs, const char *regname) + error ("PIC register clobbered by %qs in %", regname); + is_valid = false; + } ++ else if (!in_hard_reg_set_p ++ (accessible_reg_set, reg_raw_mode[regno], regno)) ++ { ++ /* ??? Diagnose during gimplification? 
*/ ++ error ("the register %qs cannot be clobbered in %" ++ " for the current target", regname); ++ is_valid = false; ++ } ++ + /* Clobbering the stack pointer register is deprecated. GCC expects + the value of the stack pointer after an asm statement to be the same + as it was before, so no asm can validly clobber the stack pointer in +@@ -3865,7 +3874,6 @@ expand_gimple_stmt (gimple *stmt) + /* If we want exceptions for non-call insns, any + may_trap_p instruction may throw. */ + && GET_CODE (PATTERN (insn)) != CLOBBER +- && GET_CODE (PATTERN (insn)) != CLOBBER_HIGH + && GET_CODE (PATTERN (insn)) != USE + && insn_could_throw_p (insn)) + make_reg_eh_region_note (insn, 0, lp_nr); +diff --git a/gcc/cfgloopanal.c b/gcc/cfgloopanal.c +index 6dbe96f9d..3388da7dd 100644 +--- a/gcc/cfgloopanal.c ++++ b/gcc/cfgloopanal.c +@@ -353,7 +353,7 @@ init_set_costs (void) + && !fixed_regs[i]) + { + target_avail_regs++; +- if (call_used_regs[i]) ++ if (call_used_or_fixed_reg_p (i)) + target_clobbered_regs++; + } + +diff --git a/gcc/cgraph.c b/gcc/cgraph.c +index 62f1afa2a..9dca43031 100644 +--- a/gcc/cgraph.c ++++ b/gcc/cgraph.c +@@ -1883,7 +1883,7 @@ cgraph_node::local_info (tree decl) + /* Return local info for the compiled function. */ + + cgraph_rtl_info * +-cgraph_node::rtl_info (tree decl) ++cgraph_node::rtl_info (const_tree decl) + { + gcc_assert (TREE_CODE (decl) == FUNCTION_DECL); + cgraph_node *node = get (decl); +@@ -1898,7 +1898,10 @@ cgraph_node::rtl_info (tree decl) + return NULL; + /* Allocate if it doesn't exist. */ + if (node->rtl == NULL) +- node->rtl = ggc_cleared_alloc (); ++ { ++ node->rtl = ggc_cleared_alloc (); ++ node->rtl->function_used_regs = reg_class_contents[ALL_REGS]; ++ } + return node->rtl; + } + +diff --git a/gcc/cgraph.h b/gcc/cgraph.h +index 10d1a2c6f..ad6720a4b 100644 +--- a/gcc/cgraph.h ++++ b/gcc/cgraph.h +@@ -1347,7 +1347,7 @@ public: + static cgraph_local_info *local_info (tree decl); + + /* Return local info for the compiled function. */ +- static struct cgraph_rtl_info *rtl_info (tree); ++ static struct cgraph_rtl_info *rtl_info (const_tree); + + /* Return the cgraph node that has ASMNAME for its DECL_ASSEMBLER_NAME. + Return NULL if there's no such node. */ +diff --git a/gcc/cgraphclones.c b/gcc/cgraphclones.c +index cd3f585bd..43423234b 100644 +--- a/gcc/cgraphclones.c ++++ b/gcc/cgraphclones.c +@@ -225,10 +225,7 @@ build_function_decl_skip_args (tree orig_decl, bitmap args_to_skip, + if (fndecl_built_in_p (new_decl) + && args_to_skip + && !bitmap_empty_p (args_to_skip)) +- { +- DECL_BUILT_IN_CLASS (new_decl) = NOT_BUILT_IN; +- DECL_FUNCTION_CODE (new_decl) = (enum built_in_function) 0; +- } ++ set_decl_built_in_function (new_decl, NOT_BUILT_IN, 0); + /* The FE might have information and assumptions about the other + arguments. */ + DECL_LANG_SPECIFIC (new_decl) = NULL; +@@ -415,7 +412,7 @@ dump_callgraph_transformation (const cgraph_node *original, + + If the new node is being inlined into another one, NEW_INLINED_TO should be + the outline function the new one is (even indirectly) inlined to. All hooks +- will see this in node's global.inlined_to, when invoked. Can be NULL if the ++ will see this in node's inlined_to, when invoked. Can be NULL if the + node is not inlined. 
*/ + + cgraph_node * +@@ -1056,7 +1053,7 @@ cgraph_node::create_version_clone_with_body + location_t saved_loc = input_location; + tree v = TREE_VALUE (target_attributes); + input_location = DECL_SOURCE_LOCATION (new_decl); +- bool r = targetm.target_option.valid_attribute_p (new_decl, NULL, v, 0); ++ bool r = targetm.target_option.valid_attribute_p (new_decl, NULL, v, 1); + input_location = saved_loc; + if (!r) + return NULL; +diff --git a/gcc/cgraphunit.c b/gcc/cgraphunit.c +index dee6becc7..ddf298583 100644 +--- a/gcc/cgraphunit.c ++++ b/gcc/cgraphunit.c +@@ -1793,7 +1793,6 @@ cgraph_node::expand_thunk (bool output_asm_thunks, bool force_gimple_thunk) + && targetm.asm_out.can_output_mi_thunk (thunk_fndecl, fixed_offset, + virtual_value, alias)) + { +- const char *fnname; + tree fn_block; + tree restype = TREE_TYPE (TREE_TYPE (thunk_fndecl)); + +@@ -1817,7 +1816,6 @@ cgraph_node::expand_thunk (bool output_asm_thunks, bool force_gimple_thunk) + = build_decl (DECL_SOURCE_LOCATION (thunk_fndecl), + RESULT_DECL, 0, restype); + DECL_CONTEXT (DECL_RESULT (thunk_fndecl)) = thunk_fndecl; +- fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk_fndecl)); + + /* The back end expects DECL_INITIAL to contain a BLOCK, so we + create one. */ +@@ -1831,12 +1829,10 @@ cgraph_node::expand_thunk (bool output_asm_thunks, bool force_gimple_thunk) + insn_locations_init (); + set_curr_insn_location (DECL_SOURCE_LOCATION (thunk_fndecl)); + prologue_location = curr_insn_location (); +- assemble_start_function (thunk_fndecl, fnname); + + targetm.asm_out.output_mi_thunk (asm_out_file, thunk_fndecl, + fixed_offset, virtual_value, alias); + +- assemble_end_function (thunk_fndecl, fnname); + insn_locations_finalize (); + init_insn_lengths (); + free_after_compilation (cfun); +diff --git a/gcc/cif-code.def b/gcc/cif-code.def +index 3356377a1..a154f24f1 100644 +--- a/gcc/cif-code.def ++++ b/gcc/cif-code.def +@@ -70,8 +70,12 @@ DEFCIFCODE(LARGE_STACK_FRAME_GROWTH_LIMIT, CIF_FINAL_NORMAL, + N_("--param large-stack-frame-growth limit reached")) + DEFCIFCODE(MAX_INLINE_INSNS_SINGLE_LIMIT, CIF_FINAL_NORMAL, + N_("--param max-inline-insns-single limit reached")) ++DEFCIFCODE(MAX_INLINE_INSNS_SINGLE_O2_LIMIT, CIF_FINAL_NORMAL, ++ N_("--param max-inline-insns-single-O2 limit reached")) + DEFCIFCODE(MAX_INLINE_INSNS_AUTO_LIMIT, CIF_FINAL_NORMAL, + N_("--param max-inline-insns-auto limit reached")) ++DEFCIFCODE(MAX_INLINE_INSNS_AUTO_O2_LIMIT, CIF_FINAL_NORMAL, ++ N_("--param max-inline-insns-auto-O2 limit reached")) + DEFCIFCODE(INLINE_UNIT_GROWTH_LIMIT, CIF_FINAL_NORMAL, + N_("--param inline-unit-growth limit reached")) + +@@ -83,6 +87,10 @@ DEFCIFCODE(RECURSIVE_INLINING, CIF_FINAL_NORMAL, + DEFCIFCODE(UNLIKELY_CALL, CIF_FINAL_NORMAL, + N_("call is unlikely and code size would grow")) + ++/* Call is considered never executed. */ ++DEFCIFCODE(NEVER_CALL, CIF_FINAL_NORMAL, ++ N_("call is considered never executed and code size would grow")) ++ + /* Function is not declared as inline. 
*/ + DEFCIFCODE(NOT_DECLARED_INLINED, CIF_FINAL_NORMAL, + N_("function not declared inline and code size would grow")) +diff --git a/gcc/combine-stack-adj.c b/gcc/combine-stack-adj.c +index 3638a1b10..d14d59abc 100644 +--- a/gcc/combine-stack-adj.c ++++ b/gcc/combine-stack-adj.c +@@ -133,7 +133,6 @@ single_set_for_csa (rtx_insn *insn) + && SET_SRC (this_rtx) == SET_DEST (this_rtx)) + ; + else if (GET_CODE (this_rtx) != CLOBBER +- && GET_CODE (this_rtx) != CLOBBER_HIGH + && GET_CODE (this_rtx) != USE) + return NULL_RTX; + } +diff --git a/gcc/combine.c b/gcc/combine.c +index b9d674c96..a425f0ca6 100644 +--- a/gcc/combine.c ++++ b/gcc/combine.c +@@ -571,7 +571,6 @@ find_single_use_1 (rtx dest, rtx *loc) + case SYMBOL_REF: + CASE_CONST_ANY: + case CLOBBER: +- case CLOBBER_HIGH: + return 0; + + case SET: +@@ -1224,8 +1223,7 @@ combine_instructions (rtx_insn *f, unsigned int nregs) + subst_low_luid = DF_INSN_LUID (insn); + subst_insn = insn; + +- note_stores (PATTERN (insn), set_nonzero_bits_and_sign_copies, +- insn); ++ note_stores (insn, set_nonzero_bits_and_sign_copies, insn); + record_dead_and_set_regs (insn); + + if (AUTO_INC_DEC) +@@ -1763,9 +1761,6 @@ set_nonzero_bits_and_sign_copies (rtx x, const_rtx set, void *data) + return; + } + +- /* Should not happen as we only using pseduo registers. */ +- gcc_assert (GET_CODE (set) != CLOBBER_HIGH); +- + /* If this register is being initialized using itself, and the + register is uninitialized in this basic block, and there are + no LOG_LINKS which set the register, then part of the +@@ -1924,7 +1919,6 @@ can_combine_p (rtx_insn *insn, rtx_insn *i3, rtx_insn *pred ATTRIBUTE_UNUSED, + + /* We can ignore CLOBBERs. */ + case CLOBBER: +- case CLOBBER_HIGH: + break; + + case SET: +@@ -2439,7 +2433,7 @@ likely_spilled_retval_p (rtx_insn *insn) + info.mask = mask; + for (p = PREV_INSN (use); info.mask && p != insn; p = PREV_INSN (p)) + if (INSN_P (p)) +- note_stores (PATTERN (p), likely_spilled_retval_1, &info); ++ note_stores (p, likely_spilled_retval_1, &info); + mask = info.mask; + + /* Check if any of the (probably) live return value registers is +@@ -2595,8 +2589,6 @@ is_parallel_of_n_reg_sets (rtx pat, int n) + if (XEXP (XVECEXP (pat, 0, i), 0) == const0_rtx) + return false; + break; +- case CLOBBER_HIGH: +- break; + default: + return false; + } +@@ -2897,8 +2889,7 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, rtx_insn *i0, + for (i = 0; ok && i < XVECLEN (p2, 0); i++) + { + if ((GET_CODE (XVECEXP (p2, 0, i)) == SET +- || GET_CODE (XVECEXP (p2, 0, i)) == CLOBBER +- || GET_CODE (XVECEXP (p2, 0, i)) == CLOBBER_HIGH) ++ || GET_CODE (XVECEXP (p2, 0, i)) == CLOBBER) + && reg_overlap_mentioned_p (SET_DEST (PATTERN (i3)), + SET_DEST (XVECEXP (p2, 0, i)))) + ok = false; +@@ -4741,8 +4732,8 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, rtx_insn *i0, + been made to this insn. The order is important, because newi2pat + can affect nonzero_bits of newpat. */ + if (newi2pat) +- note_stores (newi2pat, set_nonzero_bits_and_sign_copies, NULL); +- note_stores (newpat, set_nonzero_bits_and_sign_copies, NULL); ++ note_pattern_stores (newi2pat, set_nonzero_bits_and_sign_copies, NULL); ++ note_pattern_stores (newpat, set_nonzero_bits_and_sign_copies, NULL); + } + + if (undobuf.other_insn != NULL_RTX) +@@ -13409,15 +13400,6 @@ record_dead_and_set_regs_1 (rtx dest, const_rtx setter, void *data) + ? 
SET_SRC (setter) + : gen_lowpart (GET_MODE (dest), + SET_SRC (setter))); +- else if (GET_CODE (setter) == CLOBBER_HIGH) +- { +- reg_stat_type *rsp = ®_stat[REGNO (dest)]; +- if (rsp->last_set_value +- && reg_is_clobbered_by_clobber_high +- (REGNO (dest), GET_MODE (rsp->last_set_value), +- XEXP (setter, 0))) +- record_value_for_reg (dest, NULL, NULL_RTX); +- } + else + record_value_for_reg (dest, record_dead_insn, NULL_RTX); + } +@@ -13487,10 +13469,10 @@ record_dead_and_set_regs (rtx_insn *insn) + the return value register is set at this LUID. We could + still replace a register with the return value from the + wrong subroutine call! */ +- note_stores (PATTERN (insn), record_dead_and_set_regs_1, NULL_RTX); ++ note_stores (insn, record_dead_and_set_regs_1, NULL_RTX); + } + else +- note_stores (PATTERN (insn), record_dead_and_set_regs_1, insn); ++ note_stores (insn, record_dead_and_set_regs_1, insn); + } + + /* If a SUBREG has the promoted bit set, it is in fact a property of the +@@ -13853,10 +13835,6 @@ reg_dead_at_p_1 (rtx dest, const_rtx x, void *data ATTRIBUTE_UNUSED) + if (!REG_P (dest)) + return; + +- if (GET_CODE (x) == CLOBBER_HIGH +- && !reg_is_clobbered_by_clobber_high (reg_dead_reg, XEXP (x, 0))) +- return; +- + regno = REGNO (dest); + endregno = END_REGNO (dest); + if (reg_dead_endregno > regno && reg_dead_regno < endregno) +@@ -13904,7 +13882,7 @@ reg_dead_at_p (rtx reg, rtx_insn *insn) + if (find_regno_note (insn, REG_UNUSED, reg_dead_regno)) + return 1; + +- note_stores (PATTERN (insn), reg_dead_at_p_1, NULL); ++ note_stores (insn, reg_dead_at_p_1, NULL); + if (reg_dead_flag) + return reg_dead_flag == 1 ? 1 : 0; + +diff --git a/gcc/common.opt b/gcc/common.opt +index 0bdf51dd8..7dee534b8 100644 +--- a/gcc/common.opt ++++ b/gcc/common.opt +@@ -1080,16 +1080,16 @@ Common Report Var(flag_branch_probabilities) Optimization + Use profiling information for branch probabilities. + + fbranch-target-load-optimize +-Common Report Var(flag_branch_target_load_optimize) Optimization +-Perform branch target load optimization before prologue / epilogue threading. ++Common Ignore ++Does nothing. Preserved for backward compatibility. + + fbranch-target-load-optimize2 +-Common Report Var(flag_branch_target_load_optimize2) Optimization +-Perform branch target load optimization after prologue / epilogue threading. ++Common Ignore ++Does nothing. Preserved for backward compatibility. + + fbtr-bb-exclusive +-Common Report Var(flag_btr_bb_exclusive) Optimization +-Restrict target load migration not to re-use registers in any basic block. ++Common Ignore ++Does nothing. Preserved for backward compatibility. + + fcall-saved- + Common Joined RejectNegative Var(common_deferred_options) Defer +@@ -1289,6 +1289,26 @@ Enum(diagnostic_color_rule) String(always) Value(DIAGNOSTICS_COLOR_YES) + EnumValue + Enum(diagnostic_color_rule) String(auto) Value(DIAGNOSTICS_COLOR_AUTO) + ++fdiagnostics-urls= ++Driver Common Joined RejectNegative Var(flag_diagnostics_show_urls) Enum(diagnostic_url_rule) Init(DIAGNOSTICS_URL_AUTO) ++-fdiagnostics-urls=[never|always|auto] Embed URLs in diagnostics. ++ ++; Required for these enum values. 
++SourceInclude ++diagnostic-url.h ++ ++Enum ++Name(diagnostic_url_rule) Type(int) ++ ++EnumValue ++Enum(diagnostic_url_rule) String(never) Value(DIAGNOSTICS_URL_NO) ++ ++EnumValue ++Enum(diagnostic_url_rule) String(always) Value(DIAGNOSTICS_URL_YES) ++ ++EnumValue ++Enum(diagnostic_url_rule) String(auto) Value(DIAGNOSTICS_URL_AUTO) ++ + fdiagnostics-format= + Common Joined RejectNegative Enum(diagnostics_output_format) + -fdiagnostics-format=[text|json] Select output format. +@@ -1963,7 +1983,7 @@ Common Var(flag_dce) Init(1) Optimization + Use the RTL dead code elimination pass. + + fdse +-Common Var(flag_dse) Init(1) Optimization ++Common Var(flag_dse) Init(0) Optimization + Use the RTL dead store elimination pass. + + freschedule-modulo-scheduled-loops +diff --git a/gcc/common/config/aarch64/aarch64-common.c b/gcc/common/config/aarch64/aarch64-common.c +index bab3ab3fa..07c032539 100644 +--- a/gcc/common/config/aarch64/aarch64-common.c ++++ b/gcc/common/config/aarch64/aarch64-common.c +@@ -170,9 +170,9 @@ aarch64_handle_option (struct gcc_options *opts, + struct aarch64_option_extension + { + const char *const name; +- const unsigned long flag_canonical; +- const unsigned long flags_on; +- const unsigned long flags_off; ++ const uint64_t flag_canonical; ++ const uint64_t flags_on; ++ const uint64_t flags_off; + const bool is_synthetic; + }; + +@@ -201,14 +201,14 @@ struct processor_name_to_arch + { + const std::string processor_name; + const enum aarch64_arch arch; +- const unsigned long flags; ++ const uint64_t flags; + }; + + struct arch_to_arch_name + { + const enum aarch64_arch arch; + const std::string arch_name; +- const unsigned long flags; ++ const uint64_t flags; + }; + + /* Map processor names to the architecture revision they implement and +@@ -238,7 +238,7 @@ static const struct arch_to_arch_name all_architectures[] = + a copy of the string is created and stored to INVALID_EXTENSION. */ + + enum aarch64_parse_opt_result +-aarch64_parse_extension (const char *str, unsigned long *isa_flags, ++aarch64_parse_extension (const char *str, uint64_t *isa_flags, + std::string *invalid_extension) + { + /* The extension string is parsed left to right. */ +@@ -326,18 +326,21 @@ int opt_ext_cmp (const void* a, const void* b) + turns on as a dependency. As an example +dotprod turns on FL_DOTPROD and + FL_SIMD. As such the set of bits represented by this option is + {FL_DOTPROD, FL_SIMD}. */ +- unsigned long total_flags_a = opt_a->flag_canonical & opt_a->flags_on; +- unsigned long total_flags_b = opt_b->flag_canonical & opt_b->flags_on; ++ uint64_t total_flags_a = opt_a->flag_canonical & opt_a->flags_on; ++ uint64_t total_flags_b = opt_b->flag_canonical & opt_b->flags_on; + int popcnt_a = popcount_hwi ((HOST_WIDE_INT)total_flags_a); + int popcnt_b = popcount_hwi ((HOST_WIDE_INT)total_flags_b); + int order = popcnt_b - popcnt_a; + + /* If they have the same amount of bits set, give it a more + deterministic ordering by using the value of the bits themselves. */ +- if (order == 0) +- return total_flags_b - total_flags_a; ++ if (order != 0) ++ return order; + +- return order; ++ if (total_flags_a != total_flags_b) ++ return total_flags_a < total_flags_b ? 1 : -1; ++ ++ return 0; + } + + /* Implement TARGET_OPTION_INIT_STRUCT. 
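The opt_ext_cmp change above is a correctness fix for the qsort comparator: the old tie-break returned total_flags_b - total_flags_a, and narrowing that 64-bit difference to the comparator's int result can misorder entries once extension flags occupy the upper bits, so the new code compares explicitly and returns -1/0/1. A small self-contained sketch of the same pattern on plain uint64_t keys (the names here are illustrative, not GCC's):

#include <stdint.h>

/* qsort-style three-way compare: more set bits first, then larger
   key last, with no narrowing of a 64-bit difference to int.  */
static int
compare_flag_sets (uint64_t a, uint64_t b)
{
  int popcnt_a = __builtin_popcountll (a);
  int popcnt_b = __builtin_popcountll (b);
  if (popcnt_a != popcnt_b)
    return popcnt_b - popcnt_a;	/* small values: subtraction is safe */
  if (a != b)
    return a < b ? 1 : -1;	/* explicit compare avoids truncation */
  return 0;
}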
*/ +@@ -373,9 +376,9 @@ aarch64_option_init_struct (struct gcc_options *opts ATTRIBUTE_UNUSED) + */ + + static bool +-aarch64_contains_opt (unsigned long isa_flag_bits, opt_ext *opt) ++aarch64_contains_opt (uint64_t isa_flag_bits, opt_ext *opt) + { +- unsigned long flags_check ++ uint64_t flags_check + = opt->is_synthetic ? opt->flags_on : opt->flag_canonical; + + return (isa_flag_bits & flags_check) == flags_check; +@@ -388,13 +391,13 @@ aarch64_contains_opt (unsigned long isa_flag_bits, opt_ext *opt) + that all the "+" flags come before the "+no" flags. */ + + std::string +-aarch64_get_extension_string_for_isa_flags (unsigned long isa_flags, +- unsigned long default_arch_flags) ++aarch64_get_extension_string_for_isa_flags (uint64_t isa_flags, ++ uint64_t default_arch_flags) + { + const struct aarch64_option_extension *opt = NULL; + std::string outstr = ""; + +- unsigned long isa_flag_bits = isa_flags; ++ uint64_t isa_flag_bits = isa_flags; + + /* Pass one: Minimize the search space by reducing the set of options + to the smallest set that still turns on the same features as before in +@@ -538,7 +541,7 @@ aarch64_rewrite_selected_cpu (const char *name) + || a_to_an->arch == aarch64_no_arch) + fatal_error (input_location, "unknown value %qs for %<-mcpu%>", name); + +- unsigned long extensions = p_to_a->flags; ++ uint64_t extensions = p_to_a->flags; + aarch64_parse_extension (extension_str.c_str (), &extensions, NULL); + + std::string outstr = a_to_an->arch_name +diff --git a/gcc/config.gcc b/gcc/config.gcc +index b2282ecdf..506a918ed 100644 +--- a/gcc/config.gcc ++++ b/gcc/config.gcc +@@ -315,12 +315,12 @@ m32c*-*-*) + ;; + aarch64*-*-*) + cpu_type=aarch64 +- extra_headers="arm_fp16.h arm_neon.h arm_acle.h" ++ extra_headers="arm_fp16.h arm_neon.h arm_bf16.h arm_acle.h arm_sve.h" + c_target_objs="aarch64-c.o" + cxx_target_objs="aarch64-c.o" + d_target_objs="aarch64-d.o" +- extra_objs="aarch64-builtins.o aarch-common.o cortex-a57-fma-steering.o aarch64-speculation.o falkor-tag-collision-avoidance.o aarch64-bti-insert.o" +- target_gtfiles="\$(srcdir)/config/aarch64/aarch64-builtins.c" ++ extra_objs="aarch64-builtins.o aarch-common.o aarch64-sve-builtins.o aarch64-sve-builtins-shapes.o aarch64-sve-builtins-base.o cortex-a57-fma-steering.o aarch64-speculation.o falkor-tag-collision-avoidance.o aarch64-bti-insert.o" ++ target_gtfiles="\$(srcdir)/config/aarch64/aarch64-builtins.c \$(srcdir)/config/aarch64/aarch64-sve-builtins.h \$(srcdir)/config/aarch64/aarch64-sve-builtins.cc" + target_has_targetm_common=yes + ;; + alpha*-*-*) +@@ -382,7 +382,8 @@ i[34567]86-*-*) + c_target_objs="i386-c.o" + cxx_target_objs="i386-c.o" + d_target_objs="i386-d.o" +- extra_objs="x86-tune-sched.o x86-tune-sched-bd.o x86-tune-sched-atom.o x86-tune-sched-core.o" ++ extra_objs="x86-tune-sched.o x86-tune-sched-bd.o x86-tune-sched-atom.o x86-tune-sched-core.o i386-options.o i386-builtins.o i386-expand.o i386-features.o" ++ target_gtfiles="\$(srcdir)/config/i386/i386-builtins.c \$(srcdir)/config/i386/i386-expand.c \$(srcdir)/config/i386/i386-options.c" + extra_options="${extra_options} fused-madd.opt" + extra_headers="cpuid.h mmintrin.h mm3dnow.h xmmintrin.h emmintrin.h + pmmintrin.h tmmintrin.h ammintrin.h smmintrin.h +@@ -414,7 +415,8 @@ x86_64-*-*) + cxx_target_objs="i386-c.o" + d_target_objs="i386-d.o" + extra_options="${extra_options} fused-madd.opt" +- extra_objs="x86-tune-sched.o x86-tune-sched-bd.o x86-tune-sched-atom.o x86-tune-sched-core.o" ++ extra_objs="x86-tune-sched.o x86-tune-sched-bd.o 
x86-tune-sched-atom.o x86-tune-sched-core.o i386-options.o i386-builtins.o i386-expand.o i386-features.o" ++ target_gtfiles="\$(srcdir)/config/i386/i386-builtins.c \$(srcdir)/config/i386/i386-expand.c \$(srcdir)/config/i386/i386-options.c" + extra_headers="cpuid.h mmintrin.h mm3dnow.h xmmintrin.h emmintrin.h + pmmintrin.h tmmintrin.h ammintrin.h smmintrin.h + nmmintrin.h bmmintrin.h fma4intrin.h wmmintrin.h +@@ -980,7 +982,7 @@ esac + case ${target} in + aarch64*-*-elf | aarch64*-*-fuchsia* | aarch64*-*-rtems*) + tm_file="${tm_file} dbxelf.h elfos.h newlib-stdint.h" +- tm_file="${tm_file} aarch64/aarch64-elf.h aarch64/aarch64-elf-raw.h" ++ tm_file="${tm_file} aarch64/aarch64-elf.h aarch64/aarch64-errata.h aarch64/aarch64-elf-raw.h" + tmake_file="${tmake_file} aarch64/t-aarch64" + case $target in + aarch64-*-elf*) +@@ -1017,13 +1019,19 @@ aarch64*-*-elf | aarch64*-*-fuchsia* | aarch64*-*-rtems*) + ;; + aarch64*-*-freebsd*) + tm_file="${tm_file} dbxelf.h elfos.h ${fbsd_tm_file}" +- tm_file="${tm_file} aarch64/aarch64-elf.h aarch64/aarch64-freebsd.h" ++ tm_file="${tm_file} aarch64/aarch64-elf.h aarch64/aarch64-errata.h aarch64/aarch64-freebsd.h" + tmake_file="${tmake_file} aarch64/t-aarch64 aarch64/t-aarch64-freebsd" + tm_defines="${tm_defines} TARGET_DEFAULT_ASYNC_UNWIND_TABLES=1" + ;; ++aarch64*-*-netbsd*) ++ tm_file="${tm_file} dbxelf.h elfos.h ${nbsd_tm_file}" ++ tm_file="${tm_file} aarch64/aarch64-elf.h aarch64/aarch64-errata.h aarch64/aarch64-netbsd.h" ++ tmake_file="${tmake_file} aarch64/t-aarch64 aarch64/t-aarch64-netbsd" ++ extra_options="${extra_options} netbsd.opt netbsd-elf.opt" ++ ;; + aarch64*-*-linux*) + tm_file="${tm_file} dbxelf.h elfos.h gnu-user.h linux.h glibc-stdint.h" +- tm_file="${tm_file} aarch64/aarch64-elf.h aarch64/aarch64-linux.h" ++ tm_file="${tm_file} aarch64/aarch64-elf.h aarch64/aarch64-errata.h aarch64/aarch64-linux.h" + tmake_file="${tmake_file} aarch64/t-aarch64 aarch64/t-aarch64-linux" + tm_defines="${tm_defines} TARGET_DEFAULT_ASYNC_UNWIND_TABLES=1" + case $target in +@@ -3847,32 +3855,40 @@ case "${target}" in + sed -e 's/,.*$//'` + fi + ++ # Use the pre-processor to strip flatten the options. ++ # This makes the format less rigid than if we use ++ # grep and sed directly here. ++ opt_macro="AARCH64_OPT_EXTENSION(A, B, C, D, E, F)=A, B, C, D, E, F" ++ options_parsed="`$ac_cv_prog_CPP -D"$opt_macro" -x c \ ++ ${srcdir}/config/aarch64/aarch64-option-extensions.def`" ++ ++ # Match one element inside AARCH64_OPT_EXTENSION, we ++ # consume anything that's not a ,. ++ elem="[ ]*\([^,]\+\)[ ]*" ++ ++ # Repeat the pattern for the number of entries in the ++ # AARCH64_OPT_EXTENSION, currently 6 times. 
++ sed_patt="^$elem,$elem,$elem,$elem,$elem,$elem" ++ + while [ x"$ext_val" != x ] + do + ext_val=`echo $ext_val | sed -e 's/\+//'` + ext=`echo $ext_val | sed -e 's/\+.*//'` + base_ext=`echo $ext | sed -e 's/^no//'` ++ opt_line=`echo -e "$options_parsed" | \ ++ grep "^\"$base_ext\""` + + if [ x"$base_ext" = x ] \ +- || grep "^AARCH64_OPT_EXTENSION(\"$base_ext\"," \ +- ${srcdir}/config/aarch64/aarch64-option-extensions.def \ +- > /dev/null; then +- +- ext_canon=`grep "^AARCH64_OPT_EXTENSION(\"$base_ext\"," \ +- ${srcdir}/config/aarch64/aarch64-option-extensions.def | \ +- sed -e 's/^[^,]*,[ ]*//' | \ +- sed -e 's/,.*$//'` +- ext_on=`grep "^AARCH64_OPT_EXTENSION(\"$base_ext\"," \ +- ${srcdir}/config/aarch64/aarch64-option-extensions.def | \ +- sed -e 's/^[^,]*,[ ]*[^,]*,[ ]*//' | \ +- sed -e 's/,.*$//' | \ +- sed -e 's/).*$//'` +- ext_off=`grep "^AARCH64_OPT_EXTENSION(\"$base_ext\"," \ +- ${srcdir}/config/aarch64/aarch64-option-extensions.def | \ +- sed -e 's/^[^,]*,[ ]*[^,]*,[ ]*[^,]*,[ ]*//' | \ +- sed -e 's/,.*$//' | \ +- sed -e 's/).*$//'` +- ++ || [[ -n $opt_line ]]; then ++ ++ # These regexp extract the elements based on ++ # their group match index in the regexp. ++ ext_canon=`echo -e "$opt_line" | \ ++ sed -e "s/$sed_patt/\2/"` ++ ext_on=`echo -e "$opt_line" | \ ++ sed -e "s/$sed_patt/\3/"` ++ ext_off=`echo -e "$opt_line" | \ ++ sed -e "s/$sed_patt/\4/"` + + if [ $ext = $base_ext ]; then + # Adding extension +diff --git a/gcc/config/aarch64/aarch64-arches.def b/gcc/config/aarch64/aarch64-arches.def +index d258bd492..e464d329c 100644 +--- a/gcc/config/aarch64/aarch64-arches.def ++++ b/gcc/config/aarch64/aarch64-arches.def +@@ -36,5 +36,6 @@ AARCH64_ARCH("armv8.2-a", generic, 8_2A, 8, AARCH64_FL_FOR_ARCH8_2) + AARCH64_ARCH("armv8.3-a", generic, 8_3A, 8, AARCH64_FL_FOR_ARCH8_3) + AARCH64_ARCH("armv8.4-a", generic, 8_4A, 8, AARCH64_FL_FOR_ARCH8_4) + AARCH64_ARCH("armv8.5-a", generic, 8_5A, 8, AARCH64_FL_FOR_ARCH8_5) ++AARCH64_ARCH("armv8.6-a", generic, 8_6A, 8, AARCH64_FL_FOR_ARCH8_6) + + #undef AARCH64_ARCH +diff --git a/gcc/config/aarch64/aarch64-bti-insert.c b/gcc/config/aarch64/aarch64-bti-insert.c +index e519a0f0a..db8ebb1ba 100644 +--- a/gcc/config/aarch64/aarch64-bti-insert.c ++++ b/gcc/config/aarch64/aarch64-bti-insert.c +@@ -106,7 +106,9 @@ aarch64_pac_insn_p (rtx x) + int unspec_val = XINT (sub, 1); + switch (unspec_val) + { +- case UNSPEC_PACISP: ++ case UNSPEC_PACIASP: ++ /* fall-through. */ ++ case UNSPEC_PACIBSP: + return true; + + default: +diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c +index d7b1b7bd6..c890fcc37 100644 +--- a/gcc/config/aarch64/aarch64-builtins.c ++++ b/gcc/config/aarch64/aarch64-builtins.c +@@ -68,6 +68,9 @@ + #define hi_UP E_HImode + #define hf_UP E_HFmode + #define qi_UP E_QImode ++#define bf_UP E_BFmode ++#define v4bf_UP E_V4BFmode ++#define v8bf_UP E_V8BFmode + #define UP(X) X##_UP + + #define SIMD_MAX_BUILTIN_ARGS 5 +@@ -107,6 +110,9 @@ enum aarch64_type_qualifiers + /* Lane indices selected in pairs. - must be in range, and flipped for + bigendian. */ + qualifier_lane_pair_index = 0x800, ++ /* Lane indices selected in quadtuplets. - must be in range, and flipped for ++ bigendian. 
*/ ++ qualifier_lane_quadtup_index = 0x1000, + }; + + typedef struct +@@ -173,6 +179,10 @@ aarch64_types_ternopu_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_unsigned, qualifier_unsigned, + qualifier_unsigned, qualifier_immediate }; + #define TYPES_TERNOPUI (aarch64_types_ternopu_imm_qualifiers) ++static enum aarch64_type_qualifiers ++aarch64_types_ternop_ssus_qualifiers[SIMD_MAX_BUILTIN_ARGS] ++ = { qualifier_none, qualifier_none, qualifier_unsigned, qualifier_none }; ++#define TYPES_TERNOP_SSUS (aarch64_types_ternop_ssus_qualifiers) + + + static enum aarch64_type_qualifiers +@@ -191,6 +201,19 @@ aarch64_types_quadopu_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS] + qualifier_unsigned, qualifier_lane_index }; + #define TYPES_QUADOPU_LANE (aarch64_types_quadopu_lane_qualifiers) + ++static enum aarch64_type_qualifiers ++aarch64_types_quadopssus_lane_quadtup_qualifiers[SIMD_MAX_BUILTIN_ARGS] ++ = { qualifier_none, qualifier_none, qualifier_unsigned, ++ qualifier_none, qualifier_lane_quadtup_index }; ++#define TYPES_QUADOPSSUS_LANE_QUADTUP \ ++ (aarch64_types_quadopssus_lane_quadtup_qualifiers) ++static enum aarch64_type_qualifiers ++aarch64_types_quadopsssu_lane_quadtup_qualifiers[SIMD_MAX_BUILTIN_ARGS] ++ = { qualifier_none, qualifier_none, qualifier_none, ++ qualifier_unsigned, qualifier_lane_quadtup_index }; ++#define TYPES_QUADOPSSSU_LANE_QUADTUP \ ++ (aarch64_types_quadopsssu_lane_quadtup_qualifiers) ++ + static enum aarch64_type_qualifiers + aarch64_types_quadopu_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_unsigned, qualifier_unsigned, qualifier_unsigned, +@@ -347,6 +370,12 @@ aarch64_types_storestruct_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS] + #define VAR14(T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N) \ + VAR13 (T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M) \ + VAR1 (T, X, MAP, N) ++#define VAR15(T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O) \ ++ VAR14 (T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N) \ ++ VAR1 (T, X, MAP, O) ++#define VAR16(T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \ ++ VAR15 (T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O) \ ++ VAR1 (T, X, MAP, P) + + #include "aarch64-builtin-iterators.h" + +@@ -432,10 +461,22 @@ enum aarch64_builtins + /* ARMv8.3-A Pointer Authentication Builtins. */ + AARCH64_PAUTH_BUILTIN_AUTIA1716, + AARCH64_PAUTH_BUILTIN_PACIA1716, ++ AARCH64_PAUTH_BUILTIN_AUTIB1716, ++ AARCH64_PAUTH_BUILTIN_PACIB1716, + AARCH64_PAUTH_BUILTIN_XPACLRI, + /* Special cased Armv8.3-A Complex FMA by Lane quad Builtins. */ + AARCH64_SIMD_FCMLA_LANEQ_BUILTIN_BASE, + AARCH64_SIMD_FCMLA_LANEQ_BUILTINS ++ /* Builtin for Arm8.3-a Javascript conversion instruction. */ ++ AARCH64_JSCVT, ++ /* TME builtins. */ ++ AARCH64_TME_BUILTIN_TSTART, ++ AARCH64_TME_BUILTIN_TCOMMIT, ++ AARCH64_TME_BUILTIN_TTEST, ++ AARCH64_TME_BUILTIN_TCANCEL, ++ /* Armv8.5-a RNG instruction builtins. */ ++ AARCH64_BUILTIN_RNG_RNDR, ++ AARCH64_BUILTIN_RNG_RNDRRS, + AARCH64_BUILTIN_MAX + }; + +@@ -490,6 +531,7 @@ const char *aarch64_scalar_builtin_types[] = { + "__builtin_aarch64_simd_oi", + "__builtin_aarch64_simd_ci", + "__builtin_aarch64_simd_xi", ++ "__builtin_aarch64_simd_bf", + NULL + }; + +@@ -547,6 +589,21 @@ static tree aarch64_simd_intXI_type_node = NULL_TREE; + tree aarch64_fp16_type_node = NULL_TREE; + tree aarch64_fp16_ptr_type_node = NULL_TREE; + ++/* Back-end node type for brain float (bfloat) types. 
*/ ++tree aarch64_bf16_type_node = NULL_TREE; ++tree aarch64_bf16_ptr_type_node = NULL_TREE; ++ ++/* Wrapper around add_builtin_function. NAME is the name of the built-in ++ function, TYPE is the function type, and CODE is the function subcode ++ (relative to AARCH64_BUILTIN_GENERAL). */ ++static tree ++aarch64_general_add_builtin (const char *name, tree type, unsigned int code) ++{ ++ code = (code << AARCH64_BUILTIN_SHIFT) | AARCH64_BUILTIN_GENERAL; ++ return add_builtin_function (name, type, code, BUILT_IN_MD, ++ NULL, NULL_TREE); ++} ++ + static const char * + aarch64_mangle_builtin_scalar_type (const_tree type) + { +@@ -585,7 +642,7 @@ aarch64_mangle_builtin_vector_type (const_tree type) + } + + const char * +-aarch64_mangle_builtin_type (const_tree type) ++aarch64_general_mangle_builtin_type (const_tree type) + { + const char *mangle; + /* Walk through all the AArch64 builtins types tables to filter out the +@@ -627,6 +684,8 @@ aarch64_simd_builtin_std_type (machine_mode mode, + return float_type_node; + case E_DFmode: + return double_type_node; ++ case E_BFmode: ++ return aarch64_bf16_type_node; + default: + gcc_unreachable (); + } +@@ -718,6 +777,10 @@ aarch64_init_simd_builtin_types (void) + aarch64_simd_types[Float64x1_t].eltype = double_type_node; + aarch64_simd_types[Float64x2_t].eltype = double_type_node; + ++ /* Init Bfloat vector types with underlying __bf16 type. */ ++ aarch64_simd_types[Bfloat16x4_t].eltype = aarch64_bf16_type_node; ++ aarch64_simd_types[Bfloat16x8_t].eltype = aarch64_bf16_type_node; ++ + for (i = 0; i < nelts; i++) + { + tree eltype = aarch64_simd_types[i].eltype; +@@ -782,6 +845,8 @@ aarch64_init_simd_builtin_scalar_types (void) + "__builtin_aarch64_simd_poly128"); + (*lang_hooks.types.register_builtin_type) (intTI_type_node, + "__builtin_aarch64_simd_ti"); ++ (*lang_hooks.types.register_builtin_type) (aarch64_bf16_type_node, ++ "__builtin_aarch64_simd_bf"); + /* Unsigned integer types for various mode sizes. 
*/ + (*lang_hooks.types.register_builtin_type) (unsigned_intQI_type_node, + "__builtin_aarch64_simd_uqi"); +@@ -816,8 +881,7 @@ aarch64_init_fcmla_laneq_builtins (void) + = aarch64_simd_builtin_std_type (SImode, qualifier_lane_pair_index); + tree ftype = build_function_type_list (argtype, argtype, argtype, + quadtype, lanetype, NULL_TREE); +- tree fndecl = add_builtin_function (d->name, ftype, d->fcode, +- BUILT_IN_MD, NULL, NULL_TREE); ++ tree fndecl = aarch64_general_add_builtin (d->name, ftype, d->fcode); + + aarch64_builtin_decls[d->fcode] = fndecl; + } +@@ -846,10 +910,10 @@ aarch64_init_simd_builtins (void) + size_type_node, + intSI_type_node, + NULL); +- aarch64_builtin_decls[AARCH64_SIMD_BUILTIN_LANE_CHECK] = +- add_builtin_function ("__builtin_aarch64_im_lane_boundsi", lane_check_fpr, +- AARCH64_SIMD_BUILTIN_LANE_CHECK, BUILT_IN_MD, +- NULL, NULL_TREE); ++ aarch64_builtin_decls[AARCH64_SIMD_BUILTIN_LANE_CHECK] ++ = aarch64_general_add_builtin ("__builtin_aarch64_im_lane_boundsi", ++ lane_check_fpr, ++ AARCH64_SIMD_BUILTIN_LANE_CHECK); + + for (i = 0; i < ARRAY_SIZE (aarch64_simd_builtin_data); i++, fcode++) + { +@@ -947,8 +1011,7 @@ aarch64_init_simd_builtins (void) + snprintf (namebuf, sizeof (namebuf), "__builtin_aarch64_%s", + d->name); + +- fndecl = add_builtin_function (namebuf, ftype, fcode, BUILT_IN_MD, +- NULL, NULL_TREE); ++ fndecl = aarch64_general_add_builtin (namebuf, ftype, fcode); + aarch64_builtin_decls[fcode] = fndecl; + } + +@@ -968,8 +1031,7 @@ aarch64_init_crc32_builtins () + tree argtype = aarch64_simd_builtin_std_type (d->mode, + qualifier_unsigned); + tree ftype = build_function_type_list (usi_type, usi_type, argtype, NULL_TREE); +- tree fndecl = add_builtin_function (d->name, ftype, d->fcode, +- BUILT_IN_MD, NULL, NULL_TREE); ++ tree fndecl = aarch64_general_add_builtin (d->name, ftype, d->fcode); + + aarch64_builtin_decls[d->fcode] = fndecl; + } +@@ -1009,8 +1071,8 @@ aarch64_init_builtin_rsqrt (void) + for (; bdd < bdd_end; bdd++) + { + ftype = build_function_type_list (bdd->type_node, bdd->type_node, NULL_TREE); +- fndecl = add_builtin_function (bdd->builtin_name, +- ftype, bdd->function_code, BUILT_IN_MD, NULL, NULL_TREE); ++ fndecl = aarch64_general_add_builtin (bdd->builtin_name, ++ ftype, bdd->function_code); + aarch64_builtin_decls[bdd->function_code] = fndecl; + } + } +@@ -1030,6 +1092,19 @@ aarch64_init_fp16_types (void) + aarch64_fp16_ptr_type_node = build_pointer_type (aarch64_fp16_type_node); + } + ++/* Initialize the backend REAL_TYPE type supporting bfloat types. */ ++static void ++aarch64_init_bf16_types (void) ++{ ++ aarch64_bf16_type_node = make_node (REAL_TYPE); ++ TYPE_PRECISION (aarch64_bf16_type_node) = 16; ++ SET_TYPE_MODE (aarch64_bf16_type_node, BFmode); ++ layout_type (aarch64_bf16_type_node); ++ ++ lang_hooks.types.register_builtin_type (aarch64_bf16_type_node, "__bf16"); ++ aarch64_bf16_ptr_type_node = build_pointer_type (aarch64_bf16_type_node); ++} ++ + /* Pointer authentication builtins that will become NOP on legacy platform. + Currently, these builtins are for internal use only (libgcc EH unwinder). 
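Earlier in this file's hunks, aarch64_general_add_builtin encodes the builtin group in the low bits of every function code: code = (code << AARCH64_BUILTIN_SHIFT) | AARCH64_BUILTIN_GENERAL. A minimal sketch of the matching decode step a dispatcher would perform (the helper name and out-parameters are illustrative; only AARCH64_BUILTIN_SHIFT and AARCH64_BUILTIN_GENERAL come from the patch):

/* Split a combined builtin function code into the group held in the
   low AARCH64_BUILTIN_SHIFT bits and the group-relative subcode.  */
static inline void
sketch_split_builtin_code (unsigned int code,
			   unsigned int *group, unsigned int *subcode)
{
  *group = code & ((1u << AARCH64_BUILTIN_SHIFT) - 1);
  *subcode = code >> AARCH64_BUILTIN_SHIFT;
}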
*/ + +@@ -1044,21 +1119,77 @@ aarch64_init_pauth_hint_builtins (void) + = build_function_type_list (ptr_type_node, ptr_type_node, NULL_TREE); + + aarch64_builtin_decls[AARCH64_PAUTH_BUILTIN_AUTIA1716] +- = add_builtin_function ("__builtin_aarch64_autia1716", ftype_pointer_auth, +- AARCH64_PAUTH_BUILTIN_AUTIA1716, BUILT_IN_MD, NULL, +- NULL_TREE); ++ = aarch64_general_add_builtin ("__builtin_aarch64_autia1716", ++ ftype_pointer_auth, ++ AARCH64_PAUTH_BUILTIN_AUTIA1716); + aarch64_builtin_decls[AARCH64_PAUTH_BUILTIN_PACIA1716] +- = add_builtin_function ("__builtin_aarch64_pacia1716", ftype_pointer_auth, +- AARCH64_PAUTH_BUILTIN_PACIA1716, BUILT_IN_MD, NULL, +- NULL_TREE); ++ = aarch64_general_add_builtin ("__builtin_aarch64_pacia1716", ++ ftype_pointer_auth, ++ AARCH64_PAUTH_BUILTIN_PACIA1716); ++ aarch64_builtin_decls[AARCH64_PAUTH_BUILTIN_AUTIB1716] ++ = aarch64_general_add_builtin ("__builtin_aarch64_autib1716", ++ ftype_pointer_auth, ++ AARCH64_PAUTH_BUILTIN_AUTIB1716); ++ aarch64_builtin_decls[AARCH64_PAUTH_BUILTIN_PACIB1716] ++ = aarch64_general_add_builtin ("__builtin_aarch64_pacib1716", ++ ftype_pointer_auth, ++ AARCH64_PAUTH_BUILTIN_PACIB1716); + aarch64_builtin_decls[AARCH64_PAUTH_BUILTIN_XPACLRI] +- = add_builtin_function ("__builtin_aarch64_xpaclri", ftype_pointer_strip, +- AARCH64_PAUTH_BUILTIN_XPACLRI, BUILT_IN_MD, NULL, +- NULL_TREE); ++ = aarch64_general_add_builtin ("__builtin_aarch64_xpaclri", ++ ftype_pointer_strip, ++ AARCH64_PAUTH_BUILTIN_XPACLRI); ++} ++ ++/* Initialize the transactional memory extension (TME) builtins. */ ++static void ++aarch64_init_tme_builtins (void) ++{ ++ tree ftype_uint64_void ++ = build_function_type_list (uint64_type_node, NULL); ++ tree ftype_void_void ++ = build_function_type_list (void_type_node, NULL); ++ tree ftype_void_uint64 ++ = build_function_type_list (void_type_node, uint64_type_node, NULL); ++ ++ aarch64_builtin_decls[AARCH64_TME_BUILTIN_TSTART] ++ = aarch64_general_add_builtin ("__builtin_aarch64_tstart", ++ ftype_uint64_void, ++ AARCH64_TME_BUILTIN_TSTART); ++ aarch64_builtin_decls[AARCH64_TME_BUILTIN_TTEST] ++ = aarch64_general_add_builtin ("__builtin_aarch64_ttest", ++ ftype_uint64_void, ++ AARCH64_TME_BUILTIN_TTEST); ++ aarch64_builtin_decls[AARCH64_TME_BUILTIN_TCOMMIT] ++ = aarch64_general_add_builtin ("__builtin_aarch64_tcommit", ++ ftype_void_void, ++ AARCH64_TME_BUILTIN_TCOMMIT); ++ aarch64_builtin_decls[AARCH64_TME_BUILTIN_TCANCEL] ++ = aarch64_general_add_builtin ("__builtin_aarch64_tcancel", ++ ftype_void_uint64, ++ AARCH64_TME_BUILTIN_TCANCEL); ++} ++ ++/* Add builtins for Random Number instructions. */ ++ ++static void ++aarch64_init_rng_builtins (void) ++{ ++ tree unsigned_ptr_type = build_pointer_type (unsigned_intDI_type_node); ++ tree ftype ++ = build_function_type_list (integer_type_node, unsigned_ptr_type, NULL); ++ aarch64_builtin_decls[AARCH64_BUILTIN_RNG_RNDR] ++ = aarch64_general_add_builtin ("__builtin_aarch64_rndr", ftype, ++ AARCH64_BUILTIN_RNG_RNDR); ++ aarch64_builtin_decls[AARCH64_BUILTIN_RNG_RNDRRS] ++ = aarch64_general_add_builtin ("__builtin_aarch64_rndrrs", ftype, ++ AARCH64_BUILTIN_RNG_RNDRRS); + } + ++ ++/* Initialize all builtins in the AARCH64_BUILTIN_GENERAL group. 
*/ ++ + void +-aarch64_init_builtins (void) ++aarch64_general_init_builtins (void) + { + tree ftype_set_fpr + = build_function_type_list (void_type_node, unsigned_type_node, NULL); +@@ -1066,25 +1197,38 @@ aarch64_init_builtins (void) + = build_function_type_list (unsigned_type_node, NULL); + + aarch64_builtin_decls[AARCH64_BUILTIN_GET_FPCR] +- = add_builtin_function ("__builtin_aarch64_get_fpcr", ftype_get_fpr, +- AARCH64_BUILTIN_GET_FPCR, BUILT_IN_MD, NULL, NULL_TREE); ++ = aarch64_general_add_builtin ("__builtin_aarch64_get_fpcr", ++ ftype_get_fpr, ++ AARCH64_BUILTIN_GET_FPCR); + aarch64_builtin_decls[AARCH64_BUILTIN_SET_FPCR] +- = add_builtin_function ("__builtin_aarch64_set_fpcr", ftype_set_fpr, +- AARCH64_BUILTIN_SET_FPCR, BUILT_IN_MD, NULL, NULL_TREE); ++ = aarch64_general_add_builtin ("__builtin_aarch64_set_fpcr", ++ ftype_set_fpr, ++ AARCH64_BUILTIN_SET_FPCR); + aarch64_builtin_decls[AARCH64_BUILTIN_GET_FPSR] +- = add_builtin_function ("__builtin_aarch64_get_fpsr", ftype_get_fpr, +- AARCH64_BUILTIN_GET_FPSR, BUILT_IN_MD, NULL, NULL_TREE); ++ = aarch64_general_add_builtin ("__builtin_aarch64_get_fpsr", ++ ftype_get_fpr, ++ AARCH64_BUILTIN_GET_FPSR); + aarch64_builtin_decls[AARCH64_BUILTIN_SET_FPSR] +- = add_builtin_function ("__builtin_aarch64_set_fpsr", ftype_set_fpr, +- AARCH64_BUILTIN_SET_FPSR, BUILT_IN_MD, NULL, NULL_TREE); ++ = aarch64_general_add_builtin ("__builtin_aarch64_set_fpsr", ++ ftype_set_fpr, ++ AARCH64_BUILTIN_SET_FPSR); + + aarch64_init_fp16_types (); + ++ aarch64_init_bf16_types (); ++ + if (TARGET_SIMD) + aarch64_init_simd_builtins (); + + aarch64_init_crc32_builtins (); + aarch64_init_builtin_rsqrt (); ++ aarch64_init_rng_builtins (); ++ ++ tree ftype_jcvt ++ = build_function_type_list (intSI_type_node, double_type_node, NULL); ++ aarch64_builtin_decls[AARCH64_JSCVT] ++ = aarch64_general_add_builtin ("__builtin_aarch64_jcvtzs", ftype_jcvt, ++ AARCH64_JSCVT); + + /* Initialize pointer authentication builtins which are backed by instructions + in NOP encoding space. +@@ -1094,10 +1238,14 @@ aarch64_init_builtins (void) + register them. */ + if (!TARGET_ILP32) + aarch64_init_pauth_hint_builtins (); ++ ++ if (TARGET_TME) ++ aarch64_init_tme_builtins (); + } + ++/* Implement TARGET_BUILTIN_DECL for the AARCH64_BUILTIN_GENERAL group. */ + tree +-aarch64_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED) ++aarch64_general_builtin_decl (unsigned code, bool) + { + if (code >= AARCH64_BUILTIN_MAX) + return error_mark_node; +@@ -1112,6 +1260,7 @@ typedef enum + SIMD_ARG_LANE_INDEX, + SIMD_ARG_STRUCT_LOAD_STORE_LANE_INDEX, + SIMD_ARG_LANE_PAIR_INDEX, ++ SIMD_ARG_LANE_QUADTUP_INDEX, + SIMD_ARG_STOP + } builtin_simd_arg; + +@@ -1201,9 +1350,25 @@ aarch64_simd_expand_args (rtx target, int icode, int have_retval, + op[opc] = gen_int_mode (ENDIAN_LANE_N (nunits / 2, lane), + SImode); + } +- /* Fall through - if the lane index isn't a constant then +- the next case will error. */ +- /* FALLTHRU */ ++ /* If the lane index isn't a constant then error out. */ ++ goto constant_arg; ++ case SIMD_ARG_LANE_QUADTUP_INDEX: ++ /* Must be a previous operand into which this is an index and ++ index is restricted to nunits / 4. */ ++ gcc_assert (opc > 0); ++ if (CONST_INT_P (op[opc])) ++ { ++ machine_mode vmode = insn_data[icode].operand[opc - 1].mode; ++ unsigned int nunits ++ = GET_MODE_NUNITS (vmode).to_constant (); ++ aarch64_simd_lane_bounds (op[opc], 0, nunits / 4, exp); ++ /* Keep to GCC-vector-extension lane indices in the RTL. 
*/ ++ int lane = INTVAL (op[opc]); ++ op[opc] = gen_int_mode (ENDIAN_LANE_N (nunits / 4, lane), ++ SImode); ++ } ++ /* If the lane index isn't a constant then error out. */ ++ goto constant_arg; + case SIMD_ARG_CONSTANT: + constant_arg: + if (!(*insn_data[icode].operand[opc].predicate) +@@ -1316,6 +1481,8 @@ aarch64_simd_expand_builtin (int fcode, tree exp, rtx target) + args[k] = SIMD_ARG_LANE_INDEX; + else if (d->qualifiers[qualifiers_k] & qualifier_lane_pair_index) + args[k] = SIMD_ARG_LANE_PAIR_INDEX; ++ else if (d->qualifiers[qualifiers_k] & qualifier_lane_quadtup_index) ++ args[k] = SIMD_ARG_LANE_QUADTUP_INDEX; + else if (d->qualifiers[qualifiers_k] & qualifier_struct_load_store_lane_index) + args[k] = SIMD_ARG_STRUCT_LOAD_STORE_LANE_INDEX; + else if (d->qualifiers[qualifiers_k] & qualifier_immediate) +@@ -1497,17 +1664,90 @@ aarch64_expand_fcmla_builtin (tree exp, rtx target, int fcode) + return target; + } + +-/* Expand an expression EXP that calls a built-in function, +- with result going to TARGET if that's convenient. */ ++/* Function to expand an expression EXP which calls one of the Transactional ++ Memory Extension (TME) builtins FCODE with the result going to TARGET. */ ++static rtx ++aarch64_expand_builtin_tme (int fcode, tree exp, rtx target) ++{ ++ switch (fcode) ++ { ++ case AARCH64_TME_BUILTIN_TSTART: ++ target = gen_reg_rtx (DImode); ++ emit_insn (GEN_FCN (CODE_FOR_tstart) (target)); ++ break; ++ ++ case AARCH64_TME_BUILTIN_TTEST: ++ target = gen_reg_rtx (DImode); ++ emit_insn (GEN_FCN (CODE_FOR_ttest) (target)); ++ break; ++ ++ case AARCH64_TME_BUILTIN_TCOMMIT: ++ emit_insn (GEN_FCN (CODE_FOR_tcommit) ()); ++ break; ++ ++ case AARCH64_TME_BUILTIN_TCANCEL: ++ { ++ tree arg0 = CALL_EXPR_ARG (exp, 0); ++ rtx op0 = expand_normal (arg0); ++ if (CONST_INT_P (op0) && UINTVAL (op0) <= 65536) ++ emit_insn (GEN_FCN (CODE_FOR_tcancel) (op0)); ++ else ++ { ++ error ("%Kargument must be a 16-bit constant immediate", exp); ++ return const0_rtx; ++ } ++ } ++ break; ++ ++ default : ++ gcc_unreachable (); ++ } ++ return target; ++} ++ ++/* Expand a random number builtin EXP with code FCODE, putting the result ++ int TARGET. If IGNORE is true the return value is ignored. */ ++ + rtx +-aarch64_expand_builtin (tree exp, +- rtx target, +- rtx subtarget ATTRIBUTE_UNUSED, +- machine_mode mode ATTRIBUTE_UNUSED, +- int ignore ATTRIBUTE_UNUSED) ++aarch64_expand_rng_builtin (tree exp, rtx target, int fcode, int ignore) ++{ ++ rtx pat; ++ enum insn_code icode; ++ if (fcode == AARCH64_BUILTIN_RNG_RNDR) ++ icode = CODE_FOR_aarch64_rndr; ++ else if (fcode == AARCH64_BUILTIN_RNG_RNDRRS) ++ icode = CODE_FOR_aarch64_rndrrs; ++ else ++ gcc_unreachable (); ++ ++ rtx rand = gen_reg_rtx (DImode); ++ pat = GEN_FCN (icode) (rand); ++ if (!pat) ++ return NULL_RTX; ++ ++ tree arg0 = CALL_EXPR_ARG (exp, 0); ++ rtx res_addr = expand_normal (arg0); ++ res_addr = convert_memory_address (Pmode, res_addr); ++ rtx res_mem = gen_rtx_MEM (DImode, res_addr); ++ emit_insn (pat); ++ emit_move_insn (res_mem, rand); ++ /* If the status result is unused don't generate the CSET code. */ ++ if (ignore) ++ return target; ++ ++ rtx cc_reg = gen_rtx_REG (CC_Zmode, CC_REGNUM); ++ rtx cmp_rtx = gen_rtx_fmt_ee (NE, SImode, cc_reg, const0_rtx); ++ emit_insn (gen_aarch64_cstoresi (target, cmp_rtx, cc_reg)); ++ return target; ++} ++ ++/* Expand an expression EXP that calls built-in function FCODE, ++ with result going to TARGET if that's convenient. IGNORE is true ++ if the result of the builtin is ignored. 
*/ ++rtx ++aarch64_general_expand_builtin (unsigned int fcode, tree exp, rtx target, ++ int ignore) + { +- tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0); +- int fcode = DECL_FUNCTION_CODE (fndecl); + int icode; + rtx pat, op0; + tree arg0; +@@ -1540,6 +1780,8 @@ aarch64_expand_builtin (tree exp, + + case AARCH64_PAUTH_BUILTIN_AUTIA1716: + case AARCH64_PAUTH_BUILTIN_PACIA1716: ++ case AARCH64_PAUTH_BUILTIN_AUTIB1716: ++ case AARCH64_PAUTH_BUILTIN_PACIB1716: + case AARCH64_PAUTH_BUILTIN_XPACLRI: + arg0 = CALL_EXPR_ARG (exp, 0); + op0 = force_reg (Pmode, expand_normal (arg0)); +@@ -1563,8 +1805,24 @@ aarch64_expand_builtin (tree exp, + { + tree arg1 = CALL_EXPR_ARG (exp, 1); + rtx op1 = force_reg (Pmode, expand_normal (arg1)); +- icode = (fcode == AARCH64_PAUTH_BUILTIN_PACIA1716 +- ? CODE_FOR_paci1716 : CODE_FOR_auti1716); ++ switch (fcode) ++ { ++ case AARCH64_PAUTH_BUILTIN_AUTIA1716: ++ icode = CODE_FOR_autia1716; ++ break; ++ case AARCH64_PAUTH_BUILTIN_AUTIB1716: ++ icode = CODE_FOR_autib1716; ++ break; ++ case AARCH64_PAUTH_BUILTIN_PACIA1716: ++ icode = CODE_FOR_pacia1716; ++ break; ++ case AARCH64_PAUTH_BUILTIN_PACIB1716: ++ icode = CODE_FOR_pacib1716; ++ break; ++ default: ++ icode = 0; ++ gcc_unreachable (); ++ } + + rtx x16_reg = gen_rtx_REG (Pmode, R16_REGNUM); + rtx x17_reg = gen_rtx_REG (Pmode, R17_REGNUM); +@@ -1576,6 +1834,16 @@ aarch64_expand_builtin (tree exp, + + return target; + ++ case AARCH64_JSCVT: ++ arg0 = CALL_EXPR_ARG (exp, 0); ++ op0 = force_reg (DFmode, expand_normal (arg0)); ++ if (!target) ++ target = gen_reg_rtx (SImode); ++ else ++ target = force_reg (SImode, target); ++ emit_insn (GEN_FCN (CODE_FOR_aarch64_fjcvtzs) (target, op0)); ++ return target; ++ + case AARCH64_SIMD_BUILTIN_FCMLA_LANEQ0_V2SF: + case AARCH64_SIMD_BUILTIN_FCMLA_LANEQ90_V2SF: + case AARCH64_SIMD_BUILTIN_FCMLA_LANEQ180_V2SF: +@@ -1585,6 +1853,9 @@ aarch64_expand_builtin (tree exp, + case AARCH64_SIMD_BUILTIN_FCMLA_LANEQ180_V4HF: + case AARCH64_SIMD_BUILTIN_FCMLA_LANEQ270_V4HF: + return aarch64_expand_fcmla_builtin (exp, target, fcode); ++ case AARCH64_BUILTIN_RNG_RNDR: ++ case AARCH64_BUILTIN_RNG_RNDRRS: ++ return aarch64_expand_rng_builtin (exp, target, fcode, ignore); + } + + if (fcode >= AARCH64_SIMD_BUILTIN_BASE && fcode <= AARCH64_SIMD_BUILTIN_MAX) +@@ -1599,6 +1870,12 @@ aarch64_expand_builtin (tree exp, + || fcode == AARCH64_BUILTIN_RSQRT_V4SF) + return aarch64_expand_builtin_rsqrt (fcode, exp, target); + ++ if (fcode == AARCH64_TME_BUILTIN_TSTART ++ || fcode == AARCH64_TME_BUILTIN_TCOMMIT ++ || fcode == AARCH64_TME_BUILTIN_TTEST ++ || fcode == AARCH64_TME_BUILTIN_TCANCEL) ++ return aarch64_expand_builtin_tme (fcode, exp, target); ++ + gcc_unreachable (); + } + +@@ -1750,7 +2027,7 @@ aarch64_builtin_vectorized_function (unsigned int fn, tree type_out, + /* Return builtin for reciprocal square root. */ + + tree +-aarch64_builtin_rsqrt (unsigned int fn) ++aarch64_general_builtin_rsqrt (unsigned int fn) + { + if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv2df) + return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2DF]; +@@ -1765,13 +2042,14 @@ aarch64_builtin_rsqrt (unsigned int fn) + #define VAR1(T, N, MAP, A) \ + case AARCH64_SIMD_BUILTIN_##T##_##N##A: + ++/* Try to fold a call to the built-in function with subcode FCODE. The ++ function is passed the N_ARGS arguments in ARGS and it returns a value ++ of type TYPE. Return the new expression on success and NULL_TREE on ++ failure. 
*/ + tree +-aarch64_fold_builtin (tree fndecl, int n_args ATTRIBUTE_UNUSED, tree *args, +- bool ignore ATTRIBUTE_UNUSED) ++aarch64_general_fold_builtin (unsigned int fcode, tree type, ++ unsigned int n_args ATTRIBUTE_UNUSED, tree *args) + { +- int fcode = DECL_FUNCTION_CODE (fndecl); +- tree type = TREE_TYPE (TREE_TYPE (fndecl)); +- + switch (fcode) + { + BUILTIN_VDQF (UNOP, abs, 2) +@@ -1787,109 +2065,90 @@ aarch64_fold_builtin (tree fndecl, int n_args ATTRIBUTE_UNUSED, tree *args, + return NULL_TREE; + } + +-bool +-aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi) ++/* Try to fold STMT, given that it's a call to the built-in function with ++ subcode FCODE. Return the new statement on success and null on ++ failure. */ ++gimple * ++aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt) + { +- bool changed = false; +- gimple *stmt = gsi_stmt (*gsi); +- tree call = gimple_call_fn (stmt); +- tree fndecl; + gimple *new_stmt = NULL; +- +- if (call) ++ unsigned nargs = gimple_call_num_args (stmt); ++ tree *args = (nargs > 0 ++ ? gimple_call_arg_ptr (stmt, 0) ++ : &error_mark_node); ++ ++ /* We use gimple's IFN_REDUC_(PLUS|MIN|MAX)s for float, signed int ++ and unsigned int; it will distinguish according to the types of ++ the arguments to the __builtin. */ ++ switch (fcode) + { +- fndecl = gimple_call_fndecl (stmt); +- if (fndecl) ++ BUILTIN_VALL (UNOP, reduc_plus_scal_, 10) ++ new_stmt = gimple_build_call_internal (IFN_REDUC_PLUS, ++ 1, args[0]); ++ gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt)); ++ break; ++ BUILTIN_VDQIF (UNOP, reduc_smax_scal_, 10) ++ BUILTIN_VDQ_BHSI (UNOPU, reduc_umax_scal_, 10) ++ new_stmt = gimple_build_call_internal (IFN_REDUC_MAX, ++ 1, args[0]); ++ gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt)); ++ break; ++ BUILTIN_VDQIF (UNOP, reduc_smin_scal_, 10) ++ BUILTIN_VDQ_BHSI (UNOPU, reduc_umin_scal_, 10) ++ new_stmt = gimple_build_call_internal (IFN_REDUC_MIN, ++ 1, args[0]); ++ gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt)); ++ break; ++ BUILTIN_GPF (BINOP, fmulx, 0) + { +- int fcode = DECL_FUNCTION_CODE (fndecl); +- unsigned nargs = gimple_call_num_args (stmt); +- tree *args = (nargs > 0 +- ? gimple_call_arg_ptr (stmt, 0) +- : &error_mark_node); +- +- /* We use gimple's IFN_REDUC_(PLUS|MIN|MAX)s for float, signed int +- and unsigned int; it will distinguish according to the types of +- the arguments to the __builtin. 
*/ +- switch (fcode) ++ gcc_assert (nargs == 2); ++ bool a0_cst_p = TREE_CODE (args[0]) == REAL_CST; ++ bool a1_cst_p = TREE_CODE (args[1]) == REAL_CST; ++ if (a0_cst_p || a1_cst_p) + { +- BUILTIN_VALL (UNOP, reduc_plus_scal_, 10) +- new_stmt = gimple_build_call_internal (IFN_REDUC_PLUS, +- 1, args[0]); +- gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt)); +- break; +- BUILTIN_VDQIF (UNOP, reduc_smax_scal_, 10) +- BUILTIN_VDQ_BHSI (UNOPU, reduc_umax_scal_, 10) +- new_stmt = gimple_build_call_internal (IFN_REDUC_MAX, +- 1, args[0]); +- gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt)); +- break; +- BUILTIN_VDQIF (UNOP, reduc_smin_scal_, 10) +- BUILTIN_VDQ_BHSI (UNOPU, reduc_umin_scal_, 10) +- new_stmt = gimple_build_call_internal (IFN_REDUC_MIN, +- 1, args[0]); +- gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt)); +- break; +- BUILTIN_GPF (BINOP, fmulx, 0) ++ if (a0_cst_p && a1_cst_p) + { +- gcc_assert (nargs == 2); +- bool a0_cst_p = TREE_CODE (args[0]) == REAL_CST; +- bool a1_cst_p = TREE_CODE (args[1]) == REAL_CST; +- if (a0_cst_p || a1_cst_p) ++ tree t0 = TREE_TYPE (args[0]); ++ real_value a0 = (TREE_REAL_CST (args[0])); ++ real_value a1 = (TREE_REAL_CST (args[1])); ++ if (real_equal (&a1, &dconst0)) ++ std::swap (a0, a1); ++ /* According to real_equal (), +0 equals -0. */ ++ if (real_equal (&a0, &dconst0) && real_isinf (&a1)) + { +- if (a0_cst_p && a1_cst_p) +- { +- tree t0 = TREE_TYPE (args[0]); +- real_value a0 = (TREE_REAL_CST (args[0])); +- real_value a1 = (TREE_REAL_CST (args[1])); +- if (real_equal (&a1, &dconst0)) +- std::swap (a0, a1); +- /* According to real_equal (), +0 equals -0. */ +- if (real_equal (&a0, &dconst0) && real_isinf (&a1)) +- { +- real_value res = dconst2; +- res.sign = a0.sign ^ a1.sign; +- new_stmt = +- gimple_build_assign (gimple_call_lhs (stmt), +- REAL_CST, +- build_real (t0, res)); +- } +- else +- new_stmt = +- gimple_build_assign (gimple_call_lhs (stmt), +- MULT_EXPR, +- args[0], args[1]); +- } +- else /* a0_cst_p ^ a1_cst_p. */ +- { +- real_value const_part = a0_cst_p +- ? TREE_REAL_CST (args[0]) : TREE_REAL_CST (args[1]); +- if (!real_equal (&const_part, &dconst0) +- && !real_isinf (&const_part)) +- new_stmt = +- gimple_build_assign (gimple_call_lhs (stmt), +- MULT_EXPR, args[0], args[1]); +- } ++ real_value res = dconst2; ++ res.sign = a0.sign ^ a1.sign; ++ new_stmt = gimple_build_assign (gimple_call_lhs (stmt), ++ REAL_CST, ++ build_real (t0, res)); + } +- if (new_stmt) +- { +- gimple_set_vuse (new_stmt, gimple_vuse (stmt)); +- gimple_set_vdef (new_stmt, gimple_vdef (stmt)); +- } +- break; ++ else ++ new_stmt = gimple_build_assign (gimple_call_lhs (stmt), ++ MULT_EXPR, ++ args[0], args[1]); + } +- default: +- break; ++ else /* a0_cst_p ^ a1_cst_p. */ ++ { ++ real_value const_part = a0_cst_p ++ ? 
TREE_REAL_CST (args[0]) : TREE_REAL_CST (args[1]); ++ if (!real_equal (&const_part, &dconst0) ++ && !real_isinf (&const_part)) ++ new_stmt = gimple_build_assign (gimple_call_lhs (stmt), ++ MULT_EXPR, args[0], ++ args[1]); ++ } ++ } ++ if (new_stmt) ++ { ++ gimple_set_vuse (new_stmt, gimple_vuse (stmt)); ++ gimple_set_vdef (new_stmt, gimple_vdef (stmt)); + } ++ break; + } ++ default: ++ break; + } +- +- if (new_stmt) +- { +- gsi_replace (gsi, new_stmt, true); +- changed = true; +- } +- +- return changed; ++ return new_stmt; + } + + void +diff --git a/gcc/config/aarch64/aarch64-c.c b/gcc/config/aarch64/aarch64-c.c +index 6d5acb02f..da78f6fe3 100644 +--- a/gcc/config/aarch64/aarch64-c.c ++++ b/gcc/config/aarch64/aarch64-c.c +@@ -110,6 +110,7 @@ aarch64_update_cpp_builtins (cpp_reader *pfile) + aarch64_def_or_undef (TARGET_CRC32, "__ARM_FEATURE_CRC32", pfile); + aarch64_def_or_undef (TARGET_DOTPROD, "__ARM_FEATURE_DOTPROD", pfile); + aarch64_def_or_undef (TARGET_COMPLEX, "__ARM_FEATURE_COMPLEX", pfile); ++ aarch64_def_or_undef (TARGET_JSCVT, "__ARM_FEATURE_JCVT", pfile); + + cpp_undef (pfile, "__AARCH64_CMODEL_TINY__"); + cpp_undef (pfile, "__AARCH64_CMODEL_SMALL__"); +@@ -146,6 +147,13 @@ aarch64_update_cpp_builtins (cpp_reader *pfile) + bits = 0; + builtin_define_with_int_value ("__ARM_FEATURE_SVE_BITS", bits); + } ++ aarch64_def_or_undef (TARGET_SVE_I8MM, ++ "__ARM_FEATURE_SVE_MATMUL_INT8", pfile); ++ aarch64_def_or_undef (TARGET_SVE_F32MM, ++ "__ARM_FEATURE_SVE_MATMUL_FP32", pfile); ++ aarch64_def_or_undef (TARGET_SVE_F64MM, ++ "__ARM_FEATURE_SVE_MATMUL_FP64", pfile); ++ aarch64_def_or_undef (TARGET_SVE2, "__ARM_FEATURE_SVE2", pfile); + + aarch64_def_or_undef (TARGET_LSE, "__ARM_FEATURE_ATOMICS", pfile); + aarch64_def_or_undef (TARGET_AES, "__ARM_FEATURE_AES", pfile); +@@ -156,6 +164,16 @@ aarch64_update_cpp_builtins (cpp_reader *pfile) + aarch64_def_or_undef (TARGET_SM4, "__ARM_FEATURE_SM4", pfile); + aarch64_def_or_undef (TARGET_F16FML, "__ARM_FEATURE_FP16_FML", pfile); + ++ aarch64_def_or_undef (TARGET_FRINT, "__ARM_FEATURE_FRINT", pfile); ++ aarch64_def_or_undef (TARGET_TME, "__ARM_FEATURE_TME", pfile); ++ aarch64_def_or_undef (TARGET_RNG, "__ARM_FEATURE_RNG", pfile); ++ ++ aarch64_def_or_undef (TARGET_I8MM, "__ARM_FEATURE_MATMUL_INT8", pfile); ++ aarch64_def_or_undef (TARGET_BF16_SIMD, ++ "__ARM_FEATURE_BF16_VECTOR_ARITHMETIC", pfile); ++ aarch64_def_or_undef (TARGET_BF16_FP, ++ "__ARM_FEATURE_BF16_SCALAR_ARITHMETIC", pfile); ++ + /* Not for ACLE, but required to keep "float.h" correct if we switch + target between implementations that do or do not support ARMv8.2-A + 16-bit floating-point extensions. */ +@@ -237,6 +255,73 @@ aarch64_pragma_target_parse (tree args, tree pop_target) + return true; + } + ++/* Implement "#pragma GCC aarch64". */ ++static void ++aarch64_pragma_aarch64 (cpp_reader *) ++{ ++ tree x; ++ if (pragma_lex (&x) != CPP_STRING) ++ { ++ error ("%<#pragma GCC aarch64%> requires a string parameter"); ++ return; ++ } ++ ++ const char *name = TREE_STRING_POINTER (x); ++ if (strcmp (name, "arm_sve.h") == 0) ++ aarch64_sve::handle_arm_sve_h (); ++ else ++ error ("unknown %<#pragma GCC aarch64%> option %qs", name); ++} ++ ++/* Implement TARGET_RESOLVE_OVERLOADED_BUILTIN. */ ++static tree ++aarch64_resolve_overloaded_builtin (unsigned int uncast_location, ++ tree fndecl, void *uncast_arglist) ++{ ++ vec empty = {}; ++ location_t location = (location_t) uncast_location; ++ vec *arglist = (uncast_arglist ++ ? 
(vec *) uncast_arglist ++ : &empty); ++ unsigned int code = DECL_MD_FUNCTION_CODE (fndecl); ++ unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT; ++ tree new_fndecl; ++ switch (code & AARCH64_BUILTIN_CLASS) ++ { ++ case AARCH64_BUILTIN_GENERAL: ++ return NULL_TREE; ++ ++ case AARCH64_BUILTIN_SVE: ++ new_fndecl = aarch64_sve::resolve_overloaded_builtin (location, subcode, ++ arglist); ++ break; ++ } ++ if (new_fndecl == NULL_TREE || new_fndecl == error_mark_node) ++ return new_fndecl; ++ return build_function_call_vec (location, vNULL, new_fndecl, arglist, ++ NULL, fndecl); ++} ++ ++/* Implement TARGET_CHECK_BUILTIN_CALL. */ ++static bool ++aarch64_check_builtin_call (location_t loc, vec arg_loc, ++ tree fndecl, tree orig_fndecl, ++ unsigned int nargs, tree *args) ++{ ++ unsigned int code = DECL_MD_FUNCTION_CODE (fndecl); ++ unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT; ++ switch (code & AARCH64_BUILTIN_CLASS) ++ { ++ case AARCH64_BUILTIN_GENERAL: ++ return true; ++ ++ case AARCH64_BUILTIN_SVE: ++ return aarch64_sve::check_builtin_call (loc, arg_loc, subcode, ++ orig_fndecl, nargs, args); ++ } ++ gcc_unreachable (); ++} ++ + /* Implement REGISTER_TARGET_PRAGMAS. */ + + void +@@ -244,4 +329,9 @@ aarch64_register_pragmas (void) + { + /* Update pragma hook to allow parsing #pragma GCC target. */ + targetm.target_option.pragma_parse = aarch64_pragma_target_parse; ++ ++ targetm.resolve_overloaded_builtin = aarch64_resolve_overloaded_builtin; ++ targetm.check_builtin_call = aarch64_check_builtin_call; ++ ++ c_register_pragma ("GCC", "aarch64", aarch64_pragma_aarch64); + } +diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def +index 82d91d625..053c6390e 100644 +--- a/gcc/config/aarch64/aarch64-cores.def ++++ b/gcc/config/aarch64/aarch64-cores.def +@@ -46,6 +46,7 @@ + /* ARMv8-A Architecture Processors. */ + + /* ARM ('A') cores. */ ++AARCH64_CORE("cortex-a34", cortexa34, cortexa53, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa35, 0x41, 0xd02, -1) + AARCH64_CORE("cortex-a35", cortexa35, cortexa53, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa35, 0x41, 0xd04, -1) + AARCH64_CORE("cortex-a53", cortexa53, cortexa53, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa53, 0x41, 0xd03, -1) + AARCH64_CORE("cortex-a57", cortexa57, cortexa57, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa57, 0x41, 0xd07, -1) +@@ -99,7 +100,11 @@ AARCH64_CORE("thunderx2t99", thunderx2t99, thunderx2t99, 8_1A, AARCH64_FL_FOR + /* ARM ('A') cores. 
*/ + AARCH64_CORE("cortex-a55", cortexa55, cortexa53, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | AARCH64_FL_DOTPROD, cortexa53, 0x41, 0xd05, -1) + AARCH64_CORE("cortex-a75", cortexa75, cortexa57, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | AARCH64_FL_DOTPROD, cortexa73, 0x41, 0xd0a, -1) +-AARCH64_CORE("cortex-a76", cortexa76, cortexa57, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | AARCH64_FL_DOTPROD, cortexa72, 0x41, 0xd0b, -1) ++AARCH64_CORE("cortex-a76", cortexa76, cortexa57, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | AARCH64_FL_DOTPROD, neoversen1, 0x41, 0xd0b, -1) ++AARCH64_CORE("cortex-a76ae", cortexa76ae, cortexa57, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | AARCH64_FL_DOTPROD | AARCH64_FL_SSBS, cortexa72, 0x41, 0xd0e, -1) ++AARCH64_CORE("cortex-a77", cortexa77, cortexa57, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | AARCH64_FL_DOTPROD | AARCH64_FL_SSBS, cortexa72, 0x41, 0xd0d, -1) ++AARCH64_CORE("cortex-a65", cortexa65, cortexa57, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | AARCH64_FL_DOTPROD | AARCH64_FL_SSBS, cortexa73, 0x41, 0xd06, -1) ++AARCH64_CORE("cortex-a65ae", cortexa65ae, cortexa57, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | AARCH64_FL_DOTPROD | AARCH64_FL_SSBS, cortexa73, 0x41, 0xd43, -1) + AARCH64_CORE("ares", ares, cortexa57, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | AARCH64_FL_DOTPROD | AARCH64_FL_PROFILE, neoversen1, 0x41, 0xd0c, -1) + AARCH64_CORE("neoverse-n1", neoversen1, cortexa57, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | AARCH64_FL_DOTPROD | AARCH64_FL_PROFILE, neoversen1, 0x41, 0xd0c, -1) + AARCH64_CORE("neoverse-e1", neoversee1, cortexa53, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | AARCH64_FL_DOTPROD | AARCH64_FL_SSBS, cortexa53, 0x41, 0xd4a, -1) +diff --git a/gcc/config/aarch64/aarch64-elf-raw.h b/gcc/config/aarch64/aarch64-elf-raw.h +index bbebd0ef0..8fe7b3783 100644 +--- a/gcc/config/aarch64/aarch64-elf-raw.h ++++ b/gcc/config/aarch64/aarch64-elf-raw.h +@@ -27,22 +27,6 @@ + " crtend%O%s crtn%O%s " \ + "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s}" + +-#if TARGET_FIX_ERR_A53_835769_DEFAULT +-#define CA53_ERR_835769_SPEC \ +- " %{!mno-fix-cortex-a53-835769:--fix-cortex-a53-835769}" +-#else +-#define CA53_ERR_835769_SPEC \ +- " %{mfix-cortex-a53-835769:--fix-cortex-a53-835769}" +-#endif +- +-#if TARGET_FIX_ERR_A53_843419_DEFAULT +-#define CA53_ERR_843419_SPEC \ +- " %{!mno-fix-cortex-a53-843419:--fix-cortex-a53-843419}" +-#else +-#define CA53_ERR_843419_SPEC \ +- " %{mfix-cortex-a53-843419:--fix-cortex-a53-843419}" +-#endif +- + #ifndef LINK_SPEC + #define LINK_SPEC "%{h*} \ + %{static:-Bstatic} \ +@@ -51,8 +35,7 @@ + %{!static:%{rdynamic:-export-dynamic}} \ + %{mbig-endian:-EB} %{mlittle-endian:-EL} -X \ + -maarch64elf%{mabi=ilp32*:32}%{mbig-endian:b}" \ +- CA53_ERR_835769_SPEC \ +- CA53_ERR_843419_SPEC ++ AARCH64_ERRATA_LINK_SPEC + #endif + + #endif /* GCC_AARCH64_ELF_RAW_H */ +diff --git a/gcc/config/aarch64/aarch64-errata.h b/gcc/config/aarch64/aarch64-errata.h +new file mode 100644 +index 000000000..8f062536e +--- /dev/null ++++ b/gcc/config/aarch64/aarch64-errata.h +@@ -0,0 +1,44 @@ ++/* Machine description for AArch64 architecture. ++ Copyright (C) 2009-2019 Free Software Foundation, Inc. ++ Contributed by ARM Ltd. ++ ++ This file is part of GCC. 
++ ++ GCC is free software; you can redistribute it and/or modify it ++ under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 3, or (at your option) ++ any later version. ++ ++ GCC is distributed in the hope that it will be useful, but ++ WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ General Public License for more details. ++ ++ You should have received a copy of the GNU General Public License ++ along with GCC; see the file COPYING3. If not see ++ . */ ++ ++#ifndef GCC_AARCH64_ERRATA_H ++#define GCC_AARCH64_ERRATA_H ++ ++#if TARGET_FIX_ERR_A53_835769_DEFAULT ++#define CA53_ERR_835769_SPEC \ ++ " %{!mno-fix-cortex-a53-835769:--fix-cortex-a53-835769}" ++#else ++#define CA53_ERR_835769_SPEC \ ++ " %{mfix-cortex-a53-835769:--fix-cortex-a53-835769}" ++#endif ++ ++#if TARGET_FIX_ERR_A53_843419_DEFAULT ++#define CA53_ERR_843419_SPEC \ ++ " %{!mno-fix-cortex-a53-843419:--fix-cortex-a53-843419}" ++#else ++#define CA53_ERR_843419_SPEC \ ++ " %{mfix-cortex-a53-843419:--fix-cortex-a53-843419}" ++#endif ++ ++#define AARCH64_ERRATA_LINK_SPEC \ ++ CA53_ERR_835769_SPEC \ ++ CA53_ERR_843419_SPEC ++ ++#endif /* GCC_AARCH64_ERRATA_H */ +diff --git a/gcc/config/aarch64/aarch64-freebsd.h b/gcc/config/aarch64/aarch64-freebsd.h +index 899e6f95e..7a3e89b1b 100644 +--- a/gcc/config/aarch64/aarch64-freebsd.h ++++ b/gcc/config/aarch64/aarch64-freebsd.h +@@ -46,26 +46,8 @@ + -X" SUBTARGET_EXTRA_LINK_SPEC " \ + %{mbig-endian:-EB} %{mlittle-endian:-EL}" + +-#if TARGET_FIX_ERR_A53_835769_DEFAULT +-#define CA53_ERR_835769_SPEC \ +- " %{!mno-fix-cortex-a53-835769:--fix-cortex-a53-835769}" +-#else +-#define CA53_ERR_835769_SPEC \ +- " %{mfix-cortex-a53-835769:--fix-cortex-a53-835769}" +-#endif +- +-#ifdef TARGET_FIX_ERR_A53_843419_DEFAULT +-#define CA53_ERR_843419_SPEC \ +- " %{!mno-fix-cortex-a53-843419:--fix-cortex-a53-843419}" +-#else +-#define CA53_ERR_843419_SPEC \ +- " %{mfix-cortex-a53-843419:--fix-cortex-a53-843419}" +-#endif +- + #undef LINK_SPEC +-#define LINK_SPEC FBSD_TARGET_LINK_SPEC \ +- CA53_ERR_835769_SPEC \ +- CA53_ERR_843419_SPEC ++#define LINK_SPEC FBSD_TARGET_LINK_SPEC AARCH64_ERRATA_LINK_SPEC + + #define GNU_USER_TARGET_MATHFILE_SPEC \ + "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s}" +diff --git a/gcc/config/aarch64/aarch64-linux.h b/gcc/config/aarch64/aarch64-linux.h +index 5e8b34ded..6ff2163b6 100644 +--- a/gcc/config/aarch64/aarch64-linux.h ++++ b/gcc/config/aarch64/aarch64-linux.h +@@ -46,25 +46,8 @@ + %{mbig-endian:-EB} %{mlittle-endian:-EL} \ + -maarch64linux%{mabi=ilp32:32}%{mbig-endian:b}" + +-#if TARGET_FIX_ERR_A53_835769_DEFAULT +-#define CA53_ERR_835769_SPEC \ +- " %{!mno-fix-cortex-a53-835769:--fix-cortex-a53-835769}" +-#else +-#define CA53_ERR_835769_SPEC \ +- " %{mfix-cortex-a53-835769:--fix-cortex-a53-835769}" +-#endif +- +-#if TARGET_FIX_ERR_A53_843419_DEFAULT +-#define CA53_ERR_843419_SPEC \ +- " %{!mno-fix-cortex-a53-843419:--fix-cortex-a53-843419}" +-#else +-#define CA53_ERR_843419_SPEC \ +- " %{mfix-cortex-a53-843419:--fix-cortex-a53-843419}" +-#endif +- +-#define LINK_SPEC LINUX_TARGET_LINK_SPEC \ +- CA53_ERR_835769_SPEC \ +- CA53_ERR_843419_SPEC ++ ++#define LINK_SPEC LINUX_TARGET_LINK_SPEC AARCH64_ERRATA_LINK_SPEC + + #define GNU_USER_TARGET_MATHFILE_SPEC \ + "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s}" +diff --git a/gcc/config/aarch64/aarch64-modes.def b/gcc/config/aarch64/aarch64-modes.def +index 
14c1a43fe..3640540b3 100644 +--- a/gcc/config/aarch64/aarch64-modes.def ++++ b/gcc/config/aarch64/aarch64-modes.def +@@ -33,6 +33,8 @@ + CC_MODE (CCFP); + CC_MODE (CCFPE); + CC_MODE (CC_SWP); ++CC_MODE (CC_NZC); /* Only N, Z and C bits of condition flags are valid. ++ (Used with SVE predicate tests.) */ + CC_MODE (CC_NZ); /* Only N and Z bits of condition flags are valid. */ + CC_MODE (CC_Z); /* Only Z bit of condition flags is valid. */ + CC_MODE (CC_C); /* C represents unsigned overflow of a simple addition. */ +@@ -60,6 +62,10 @@ ADJUST_ALIGNMENT (VNx8BI, 2); + ADJUST_ALIGNMENT (VNx4BI, 2); + ADJUST_ALIGNMENT (VNx2BI, 2); + ++/* Bfloat16 modes. */ ++FLOAT_MODE (BF, 2, 0); ++ADJUST_FLOAT_FORMAT (BF, &arm_bfloat_half_format); ++ + VECTOR_MODES (INT, 8); /* V8QI V4HI V2SI. */ + VECTOR_MODES (INT, 16); /* V16QI V8HI V4SI V2DI. */ + VECTOR_MODES (FLOAT, 8); /* V2SF. */ +@@ -80,13 +86,14 @@ INT_MODE (XI, 64); + strictly necessary to set the alignment here, since the default would + be clamped to BIGGEST_ALIGNMENT anyhow, but it seems clearer. */ + #define SVE_MODES(NVECS, VB, VH, VS, VD) \ +- VECTOR_MODES_WITH_PREFIX (VNx, INT, 16 * NVECS); \ +- VECTOR_MODES_WITH_PREFIX (VNx, FLOAT, 16 * NVECS); \ ++ VECTOR_MODES_WITH_PREFIX (VNx, INT, 16 * NVECS, 0); \ ++ VECTOR_MODES_WITH_PREFIX (VNx, FLOAT, 16 * NVECS, 0); \ + \ + ADJUST_NUNITS (VB##QI, aarch64_sve_vg * NVECS * 8); \ + ADJUST_NUNITS (VH##HI, aarch64_sve_vg * NVECS * 4); \ + ADJUST_NUNITS (VS##SI, aarch64_sve_vg * NVECS * 2); \ + ADJUST_NUNITS (VD##DI, aarch64_sve_vg * NVECS); \ ++ ADJUST_NUNITS (VH##BF, aarch64_sve_vg * NVECS * 4); \ + ADJUST_NUNITS (VH##HF, aarch64_sve_vg * NVECS * 4); \ + ADJUST_NUNITS (VS##SF, aarch64_sve_vg * NVECS * 2); \ + ADJUST_NUNITS (VD##DF, aarch64_sve_vg * NVECS); \ +@@ -95,6 +102,7 @@ INT_MODE (XI, 64); + ADJUST_ALIGNMENT (VH##HI, 16); \ + ADJUST_ALIGNMENT (VS##SI, 16); \ + ADJUST_ALIGNMENT (VD##DI, 16); \ ++ ADJUST_ALIGNMENT (VH##BF, 16); \ + ADJUST_ALIGNMENT (VH##HF, 16); \ + ADJUST_ALIGNMENT (VS##SF, 16); \ + ADJUST_ALIGNMENT (VD##DF, 16); +@@ -106,6 +114,40 @@ SVE_MODES (2, VNx32, VNx16, VNx8, VNx4) + SVE_MODES (3, VNx48, VNx24, VNx12, VNx6) + SVE_MODES (4, VNx64, VNx32, VNx16, VNx8) + ++/* Partial SVE vectors: ++ ++ VNx2QI VNx4QI VNx8QI ++ VNx2HI VNx4HI ++ VNx2SI ++ ++ In memory they occupy contiguous locations, in the same way as fixed-length ++ vectors. E.g. VNx8QImode is half the size of VNx16QImode. ++ ++ Passing 1 as the final argument ensures that the modes come after all ++ other modes in the GET_MODE_WIDER chain, so that we never pick them ++ in preference to a full vector mode. */ ++VECTOR_MODES_WITH_PREFIX (VNx, INT, 2, 1); ++VECTOR_MODES_WITH_PREFIX (VNx, INT, 4, 1); ++VECTOR_MODES_WITH_PREFIX (VNx, INT, 8, 1); ++ ++ADJUST_NUNITS (VNx2QI, aarch64_sve_vg); ++ADJUST_NUNITS (VNx2HI, aarch64_sve_vg); ++ADJUST_NUNITS (VNx2SI, aarch64_sve_vg); ++ ++ADJUST_NUNITS (VNx4QI, aarch64_sve_vg * 2); ++ADJUST_NUNITS (VNx4HI, aarch64_sve_vg * 2); ++ ++ADJUST_NUNITS (VNx8QI, aarch64_sve_vg * 4); ++ ++ADJUST_ALIGNMENT (VNx2QI, 1); ++ADJUST_ALIGNMENT (VNx4QI, 1); ++ADJUST_ALIGNMENT (VNx8QI, 1); ++ ++ADJUST_ALIGNMENT (VNx2HI, 2); ++ADJUST_ALIGNMENT (VNx4HI, 2); ++ ++ADJUST_ALIGNMENT (VNx2SI, 4); ++ + /* Quad float: 128-bit floating mode for long doubles. 
*/ + FLOAT_MODE (TF, 16, ieee_quad_format); + +diff --git a/gcc/config/aarch64/aarch64-netbsd.h b/gcc/config/aarch64/aarch64-netbsd.h +new file mode 100644 +index 000000000..e6c9264bd +--- /dev/null ++++ b/gcc/config/aarch64/aarch64-netbsd.h +@@ -0,0 +1,63 @@ ++/* Definitions for AArch64 running NetBSD ++ Copyright (C) 2016-2019 Free Software Foundation, Inc. ++ ++ This file is part of GCC. ++ ++ GCC is free software; you can redistribute it and/or modify it ++ under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 3, or (at your option) ++ any later version. ++ ++ GCC is distributed in the hope that it will be useful, but ++ WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ General Public License for more details. ++ ++ You should have received a copy of the GNU General Public License ++ along with GCC; see the file COPYING3. If not see ++ . */ ++ ++#ifndef GCC_AARCH64_NETBSD_H ++#define GCC_AARCH64_NETBSD_H ++ ++#define TARGET_LINKER_BIG_EMULATION "aarch64nbsdb" ++#define TARGET_LINKER_LITTLE_EMULATION "aarch64nbsd" ++ ++#if TARGET_BIG_ENDIAN_DEFAULT ++#define TARGET_LINKER_EMULATION TARGET_LINKER_BIG_EMULATION ++#else ++#define TARGET_LINKER_EMULATION TARGET_LINKER_LITTLE_EMULATION ++#endif ++ ++#undef SUBTARGET_EXTRA_LINK_SPEC ++#define SUBTARGET_EXTRA_LINK_SPEC " -m" TARGET_LINKER_EMULATION ++ ++#define NETBSD_ENTRY_POINT "__start" ++ ++#define NETBSD_TARGET_LINK_SPEC "%{h*} " \ ++ "-X %{mbig-endian:-EB -m " TARGET_LINKER_BIG_EMULATION "} " \ ++ "%{mlittle-endian:-EL -m " TARGET_LINKER_LITTLE_EMULATION "} " \ ++ "%(netbsd_link_spec)" ++ ++#undef LINK_SPEC ++#define LINK_SPEC NETBSD_LINK_SPEC_ELF \ ++ NETBSD_TARGET_LINK_SPEC \ ++ AARCH64_ERRATA_LINK_SPEC ++ ++#undef TARGET_OS_CPP_BUILTINS ++#define TARGET_OS_CPP_BUILTINS() \ ++ do \ ++ { \ ++ NETBSD_OS_CPP_BUILTINS_ELF(); \ ++ } \ ++ while (0) ++ ++#undef SUBTARGET_CPP_SPEC ++#define SUBTARGET_CPP_SPEC NETBSD_CPP_SPEC ++ ++#undef EXTRA_SPECS ++#define EXTRA_SPECS \ ++ { "asm_cpu_spec", ASM_CPU_SPEC }, \ ++ NETBSD_SUBTARGET_EXTRA_SPECS ++ ++#endif /* GCC_AARCH64_NETBSD_H */ +diff --git a/gcc/config/aarch64/aarch64-option-extensions.def b/gcc/config/aarch64/aarch64-option-extensions.def +index 010fd3ccf..345cdc4da 100644 +--- a/gcc/config/aarch64/aarch64-option-extensions.def ++++ b/gcc/config/aarch64/aarch64-option-extensions.def +@@ -45,29 +45,46 @@ + entries: aes, pmull, sha1, sha2 being present). In that case this field + should contain a space (" ") separated list of the strings in 'Features' + that are required. Their order is not important. An empty string means +- do not detect this feature during auto detection. */ ++ do not detect this feature during auto detection. + +-/* NOTE: This file is being parsed by config.gcc and so the +- AARCH64_OPT_EXTENSION must adhere to a strict format: +- 1) No space between the AARCH64_OPT_EXTENSION and the opening (. +- 2) No space between the opening ( and the extension name. +- 3) No space after the extension name before the ,. +- 4) Spaces are only allowed after a , and around |. +- 5) Everything must be on one line. */ ++ NOTE: Any changes to the AARCH64_OPT_EXTENSION macro need to be mirrored in ++ config.gcc. */ + + /* Enabling "fp" just enables "fp". + Disabling "fp" also disables "simd", "crypto", "fp16", "aes", "sha2", +- "sha3", sm3/sm4 and "sve". 
*/ +-AARCH64_OPT_EXTENSION("fp", AARCH64_FL_FP, 0, AARCH64_FL_SIMD | AARCH64_FL_CRYPTO | AARCH64_FL_F16 | AARCH64_FL_AES | AARCH64_FL_SHA2 | AARCH64_FL_SHA3 | AARCH64_FL_SM4 | AARCH64_FL_SVE, false, "fp") ++ "sha3", sm3/sm4, "sve", "sve2", "sve2-aes", "sve2-sha3", "sve2-sm4", ++ "sve2-bitperm", "i8mm", "f32mm", "f64mm", and "bf16". */ ++AARCH64_OPT_EXTENSION("fp", AARCH64_FL_FP, 0, AARCH64_FL_SIMD | \ ++ AARCH64_FL_CRYPTO | AARCH64_FL_F16 | AARCH64_FL_AES | \ ++ AARCH64_FL_SHA2 | AARCH64_FL_SHA3 | AARCH64_FL_SM4 | \ ++ AARCH64_FL_SVE | AARCH64_FL_SVE2 | AARCH64_FL_SVE2_AES | \ ++ AARCH64_FL_SVE2_SHA3 | AARCH64_FL_SVE2_SM4 | \ ++ AARCH64_FL_SVE2_BITPERM | AARCH64_FL_I8MM | \ ++ AARCH64_FL_F32MM | AARCH64_FL_F64MM | AARCH64_FL_BF16, ++ false, "fp") + + /* Enabling "simd" also enables "fp". + Disabling "simd" also disables "crypto", "dotprod", "aes", "sha2", "sha3", +- "sm3/sm4" and "sve". */ +-AARCH64_OPT_EXTENSION("simd", AARCH64_FL_SIMD, AARCH64_FL_FP, AARCH64_FL_CRYPTO | AARCH64_FL_DOTPROD | AARCH64_FL_AES | AARCH64_FL_SHA2 | AARCH64_FL_SHA3 | AARCH64_FL_SM4 | AARCH64_FL_SVE, false, "asimd") ++ "sm3/sm4", "sve", "sve2", "sve2-aes", "sve2-sha3", "sve2-sm4", ++ "sve2-bitperm", "i8mm", "f32mm" and "f64mm". */ ++AARCH64_OPT_EXTENSION("simd", AARCH64_FL_SIMD, AARCH64_FL_FP, \ ++ AARCH64_FL_CRYPTO | AARCH64_FL_DOTPROD | \ ++ AARCH64_FL_AES | AARCH64_FL_SHA2 | AARCH64_FL_SHA3 | \ ++ AARCH64_FL_SM4 | AARCH64_FL_SVE | AARCH64_FL_SVE2 | \ ++ AARCH64_FL_SVE2_AES | AARCH64_FL_SVE2_SHA3 | \ ++ AARCH64_FL_SVE2_SM4 | AARCH64_FL_SVE2_BITPERM | \ ++ AARCH64_FL_I8MM | AARCH64_FL_F32MM | AARCH64_FL_F64MM, \ ++ false, "asimd") + + /* Enabling "crypto" also enables "fp", "simd", "aes" and "sha2". +- Disabling "crypto" disables "crypto", "aes", "sha2", "sha3" and "sm3/sm4". */ +-AARCH64_OPT_EXTENSION("crypto", AARCH64_FL_CRYPTO, AARCH64_FL_FP | AARCH64_FL_SIMD | AARCH64_FL_AES | AARCH64_FL_SHA2, AARCH64_FL_AES | AARCH64_FL_SHA2 |AARCH64_FL_SHA3 | AARCH64_FL_SM4, true, "aes pmull sha1 sha2") ++ Disabling "crypto" disables "crypto", "aes", "sha2", "sha3" and "sm3/sm4", ++ "sve2-aes", "sve2-sha3", "sve2-sm4". */ ++AARCH64_OPT_EXTENSION("crypto", AARCH64_FL_CRYPTO, AARCH64_FL_FP | \ ++ AARCH64_FL_SIMD | AARCH64_FL_AES | AARCH64_FL_SHA2, \ ++ AARCH64_FL_AES | AARCH64_FL_SHA2 | AARCH64_FL_SHA3 | \ ++ AARCH64_FL_SM4 | AARCH64_FL_SVE2_AES | \ ++ AARCH64_FL_SVE2_SHA3 | AARCH64_FL_SVE2_SM4, true, \ ++ "aes pmull sha1 sha2") + + /* Enabling or disabling "crc" only changes "crc". */ + AARCH64_OPT_EXTENSION("crc", AARCH64_FL_CRC, 0, 0, false, "crc32") +@@ -76,43 +93,63 @@ AARCH64_OPT_EXTENSION("crc", AARCH64_FL_CRC, 0, 0, false, "crc32") + AARCH64_OPT_EXTENSION("lse", AARCH64_FL_LSE, 0, 0, false, "atomics") + + /* Enabling "fp16" also enables "fp". +- Disabling "fp16" disables "fp16", "fp16fml" and "sve". */ +-AARCH64_OPT_EXTENSION("fp16", AARCH64_FL_F16, AARCH64_FL_FP, AARCH64_FL_F16FML | AARCH64_FL_SVE, false, "fphp asimdhp") ++ Disabling "fp16" disables "fp16", "fp16fml", "sve", "sve2", ++ "sve2-aes", "sve2-sha3", "sve2-sm4", "sve2-bitperm", "f32mm" and ++ "f64mm". */ ++AARCH64_OPT_EXTENSION("fp16", AARCH64_FL_F16, AARCH64_FL_FP, \ ++ AARCH64_FL_F16FML | AARCH64_FL_SVE | AARCH64_FL_F32MM | \ ++ AARCH64_FL_F64MM | AARCH64_FL_SVE2 | \ ++ AARCH64_FL_SVE2_AES | AARCH64_FL_SVE2_SHA3 | \ ++ AARCH64_FL_SVE2_SM4 | AARCH64_FL_SVE2_BITPERM, false, \ ++ "fphp asimdhp") + + /* Enabling or disabling "rcpc" only changes "rcpc". 
*/ + AARCH64_OPT_EXTENSION("rcpc", AARCH64_FL_RCPC, 0, 0, false, "lrcpc") + + /* Enabling "rdma" also enables "fp", "simd". + Disabling "rdma" just disables "rdma". */ +-AARCH64_OPT_EXTENSION("rdma", AARCH64_FL_RDMA, AARCH64_FL_FP | AARCH64_FL_SIMD, 0, false, "asimdrdm") ++AARCH64_OPT_EXTENSION("rdma", AARCH64_FL_RDMA, \ ++ AARCH64_FL_FP | AARCH64_FL_SIMD, 0, false, "asimdrdm") + + /* Enabling "dotprod" also enables "simd". + Disabling "dotprod" only disables "dotprod". */ +-AARCH64_OPT_EXTENSION("dotprod", AARCH64_FL_DOTPROD, AARCH64_FL_SIMD, 0, false, "asimddp") ++AARCH64_OPT_EXTENSION("dotprod", AARCH64_FL_DOTPROD, AARCH64_FL_SIMD, 0, \ ++ false, "asimddp") + + /* Enabling "aes" also enables "simd". +- Disabling "aes" just disables "aes". */ +-AARCH64_OPT_EXTENSION("aes", AARCH64_FL_AES, AARCH64_FL_SIMD, 0, false, "aes") ++ Disabling "aes" disables "aes" and "sve2-aes'. */ ++AARCH64_OPT_EXTENSION("aes", AARCH64_FL_AES, AARCH64_FL_SIMD, \ ++ AARCH64_FL_SVE2_AES, false, "aes") + + /* Enabling "sha2" also enables "simd". + Disabling "sha2" just disables "sha2". */ +-AARCH64_OPT_EXTENSION("sha2", AARCH64_FL_SHA2, AARCH64_FL_SIMD, 0, false, "sha1 sha2") ++AARCH64_OPT_EXTENSION("sha2", AARCH64_FL_SHA2, AARCH64_FL_SIMD, 0, false, \ ++ "sha1 sha2") + + /* Enabling "sha3" enables "simd" and "sha2". +- Disabling "sha3" just disables "sha3". */ +-AARCH64_OPT_EXTENSION("sha3", AARCH64_FL_SHA3, AARCH64_FL_SIMD | AARCH64_FL_SHA2, 0, false, "sha3 sha512") ++ Disabling "sha3" disables "sha3" and "sve2-sha3". */ ++AARCH64_OPT_EXTENSION("sha3", AARCH64_FL_SHA3, AARCH64_FL_SIMD | \ ++ AARCH64_FL_SHA2, AARCH64_FL_SVE2_SHA3, false, \ ++ "sha3 sha512") + + /* Enabling "sm4" also enables "simd". +- Disabling "sm4" just disables "sm4". */ +-AARCH64_OPT_EXTENSION("sm4", AARCH64_FL_SM4, AARCH64_FL_SIMD, 0, false, "sm3 sm4") ++ Disabling "sm4" disables "sm4" and "sve2-sm4". */ ++AARCH64_OPT_EXTENSION("sm4", AARCH64_FL_SM4, AARCH64_FL_SIMD, \ ++ AARCH64_FL_SVE2_SM4, false, "sm3 sm4") + + /* Enabling "fp16fml" also enables "fp" and "fp16". + Disabling "fp16fml" just disables "fp16fml". */ +-AARCH64_OPT_EXTENSION("fp16fml", AARCH64_FL_F16FML, AARCH64_FL_FP | AARCH64_FL_F16, 0, false, "asimdfhm") ++AARCH64_OPT_EXTENSION("fp16fml", AARCH64_FL_F16FML, \ ++ AARCH64_FL_FP | AARCH64_FL_F16, 0, false, "asimdfhm") + + /* Enabling "sve" also enables "fp16", "fp" and "simd". +- Disabling "sve" just disables "sve". */ +-AARCH64_OPT_EXTENSION("sve", AARCH64_FL_SVE, AARCH64_FL_FP | AARCH64_FL_SIMD | AARCH64_FL_F16, 0, false, "sve") ++ Disabling "sve" disables "sve", "f32mm", "f64mm", "sve2", "sve2-aes", ++ "sve2-sha3", "sve2-sm4" and "sve2-bitperm". */ ++AARCH64_OPT_EXTENSION("sve", AARCH64_FL_SVE, AARCH64_FL_FP | AARCH64_FL_SIMD | \ ++ AARCH64_FL_F16, AARCH64_FL_F32MM | AARCH64_FL_F64MM | \ ++ AARCH64_FL_SVE2 | AARCH64_FL_SVE2_AES | \ ++ AARCH64_FL_SVE2_SHA3 | AARCH64_FL_SVE2_SM4 | \ ++ AARCH64_FL_SVE2_BITPERM, false, "sve") + + /* Enabling/Disabling "profile" does not enable/disable any other feature. */ + AARCH64_OPT_EXTENSION("profile", AARCH64_FL_PROFILE, 0, 0, false, "") +@@ -124,12 +161,69 @@ AARCH64_OPT_EXTENSION("rng", AARCH64_FL_RNG, 0, 0, false, "") + AARCH64_OPT_EXTENSION("memtag", AARCH64_FL_MEMTAG, 0, 0, false, "") + + /* Enabling/Disabling "sb" only changes "sb". */ +-AARCH64_OPT_EXTENSION("sb", AARCH64_FL_SB, 0, 0, false, "") ++AARCH64_OPT_EXTENSION("sb", AARCH64_FL_SB, 0, 0, false, "sb") + + /* Enabling/Disabling "ssbs" only changes "ssbs". 
*/ +-AARCH64_OPT_EXTENSION("ssbs", AARCH64_FL_SSBS, 0, 0, false, "") ++AARCH64_OPT_EXTENSION("ssbs", AARCH64_FL_SSBS, 0, 0, false, "ssbs") + + /* Enabling/Disabling "predres" only changes "predres". */ + AARCH64_OPT_EXTENSION("predres", AARCH64_FL_PREDRES, 0, 0, false, "") + ++/* Enabling "sve2" also enables "sve", "fp16", "fp", and "simd". ++ Disabling "sve2" disables "sve2", "sve2-aes", "sve2-sha3", "sve2-sm4", and ++ "sve2-bitperm". */ ++AARCH64_OPT_EXTENSION("sve2", AARCH64_FL_SVE2, AARCH64_FL_SVE | \ ++ AARCH64_FL_FP | AARCH64_FL_SIMD | AARCH64_FL_F16, \ ++ AARCH64_FL_SVE2_AES | AARCH64_FL_SVE2_SHA3 | \ ++ AARCH64_FL_SVE2_SM4 | AARCH64_FL_SVE2_BITPERM, false, "sve2") ++ ++/* Enabling "sve2-sm4" also enables "sm4", "simd", "fp16", "fp", "sve", and ++ "sve2". Disabling "sve2-sm4" just disables "sve2-sm4". */ ++AARCH64_OPT_EXTENSION("sve2-sm4", AARCH64_FL_SVE2_SM4, AARCH64_FL_SM4 | \ ++ AARCH64_FL_SIMD | AARCH64_FL_F16 | AARCH64_FL_FP | \ ++ AARCH64_FL_SVE | AARCH64_FL_SVE2, 0, false, "svesm4") ++ ++/* Enabling "sve2-aes" also enables "aes", "simd", "fp16", "fp", "sve", and ++ "sve2". Disabling "sve2-aes" just disables "sve2-aes". */ ++AARCH64_OPT_EXTENSION("sve2-aes", AARCH64_FL_SVE2_AES, AARCH64_FL_AES | \ ++ AARCH64_FL_SIMD | AARCH64_FL_F16 | AARCH64_FL_FP | \ ++ AARCH64_FL_SVE | AARCH64_FL_SVE2, 0, false, "sveaes") ++ ++/* Enabling "sve2-sha3" also enables "sha3", "simd", "fp16", "fp", "sve", and ++ "sve2". Disabling "sve2-sha3" just disables "sve2-sha3". */ ++AARCH64_OPT_EXTENSION("sve2-sha3", AARCH64_FL_SVE2_SHA3, AARCH64_FL_SHA3 | \ ++ AARCH64_FL_SIMD | AARCH64_FL_F16 | AARCH64_FL_FP | \ ++ AARCH64_FL_SVE | AARCH64_FL_SVE2, 0, false, "svesha3") ++ ++/* Enabling "sve2-bitperm" also enables "simd", "fp16", "fp", "sve", and ++ "sve2". Disabling "sve2-bitperm" just disables "sve2-bitperm". */ ++AARCH64_OPT_EXTENSION("sve2-bitperm", AARCH64_FL_SVE2_BITPERM, AARCH64_FL_SIMD | \ ++ AARCH64_FL_F16 | AARCH64_FL_FP | AARCH64_FL_SVE | \ ++ AARCH64_FL_SVE2, 0, false, "svebitperm") ++ ++/* Enabling or disabling "tme" only changes "tme". */ ++AARCH64_OPT_EXTENSION("tme", AARCH64_FL_TME, 0, 0, false, "") ++ ++/* Enabling "i8mm" also enables "simd" and "fp". ++ Disabling "i8mm" only disables "i8mm". */ ++AARCH64_OPT_EXTENSION("i8mm", AARCH64_FL_I8MM, \ ++ AARCH64_FL_SIMD | AARCH64_FL_FP, 0, false, "i8mm") ++ ++/* Enabling "f32mm" also enables "sve", "fp16", "fp", and "simd". ++ Disabling "f32mm" only disables "f32mm". */ ++AARCH64_OPT_EXTENSION("f32mm", AARCH64_FL_F32MM, \ ++ AARCH64_FL_SVE | AARCH64_FL_F16 | AARCH64_FL_FP | \ ++ AARCH64_FL_SIMD, 0, false, "f32mm") ++ ++/* Enabling "f64mm" also enables "sve", "fp16", "fp", and "simd". ++ Disabling "f64mm" only disables "f64mm". */ ++AARCH64_OPT_EXTENSION("f64mm", AARCH64_FL_F64MM, \ ++ AARCH64_FL_SVE | AARCH64_FL_F16 | AARCH64_FL_FP | \ ++ AARCH64_FL_SIMD, 0, false, "f64mm") ++ ++/* Enabling "bf16" also enables "simd" and "fp". ++ Disabling "bf16" only disables "bf16". */ ++AARCH64_OPT_EXTENSION("bf16", AARCH64_FL_BF16, \ ++ AARCH64_FL_SIMD | AARCH64_FL_FP, 0, false, "bf16") ++ + #undef AARCH64_OPT_EXTENSION +diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h +index 994bcfc7e..5e0a499e8 100644 +--- a/gcc/config/aarch64/aarch64-protos.h ++++ b/gcc/config/aarch64/aarch64-protos.h +@@ -396,8 +396,81 @@ enum simd_immediate_check { + AARCH64_CHECK_MOV = AARCH64_CHECK_ORR | AARCH64_CHECK_BIC + }; + ++/* The key type that -msign-return-address should use. 
*/ ++enum aarch64_key_type { ++ AARCH64_KEY_A, ++ AARCH64_KEY_B ++}; ++ ++extern enum aarch64_key_type aarch64_ra_sign_key; ++ + extern struct tune_params aarch64_tune_params; + ++/* The available SVE predicate patterns, known in the ACLE as "svpattern". */ ++#define AARCH64_FOR_SVPATTERN(T) \ ++ T (POW2, pow2, 0) \ ++ T (VL1, vl1, 1) \ ++ T (VL2, vl2, 2) \ ++ T (VL3, vl3, 3) \ ++ T (VL4, vl4, 4) \ ++ T (VL5, vl5, 5) \ ++ T (VL6, vl6, 6) \ ++ T (VL7, vl7, 7) \ ++ T (VL8, vl8, 8) \ ++ T (VL16, vl16, 9) \ ++ T (VL32, vl32, 10) \ ++ T (VL64, vl64, 11) \ ++ T (VL128, vl128, 12) \ ++ T (VL256, vl256, 13) \ ++ T (MUL4, mul4, 29) \ ++ T (MUL3, mul3, 30) \ ++ T (ALL, all, 31) ++ ++/* The available SVE prefetch operations, known in the ACLE as "svprfop". */ ++#define AARCH64_FOR_SVPRFOP(T) \ ++ T (PLDL1KEEP, pldl1keep, 0) \ ++ T (PLDL1STRM, pldl1strm, 1) \ ++ T (PLDL2KEEP, pldl2keep, 2) \ ++ T (PLDL2STRM, pldl2strm, 3) \ ++ T (PLDL3KEEP, pldl3keep, 4) \ ++ T (PLDL3STRM, pldl3strm, 5) \ ++ T (PSTL1KEEP, pstl1keep, 8) \ ++ T (PSTL1STRM, pstl1strm, 9) \ ++ T (PSTL2KEEP, pstl2keep, 10) \ ++ T (PSTL2STRM, pstl2strm, 11) \ ++ T (PSTL3KEEP, pstl3keep, 12) \ ++ T (PSTL3STRM, pstl3strm, 13) ++ ++#define AARCH64_SVENUM(UPPER, LOWER, VALUE) AARCH64_SV_##UPPER = VALUE, ++enum aarch64_svpattern { ++ AARCH64_FOR_SVPATTERN (AARCH64_SVENUM) ++ AARCH64_NUM_SVPATTERNS ++}; ++ ++enum aarch64_svprfop { ++ AARCH64_FOR_SVPRFOP (AARCH64_SVENUM) ++ AARCH64_NUM_SVPRFOPS ++}; ++#undef AARCH64_SVENUM ++ ++/* It's convenient to divide the built-in function codes into groups, ++ rather than having everything in a single enum. This type enumerates ++ those groups. */ ++enum aarch64_builtin_class ++{ ++ AARCH64_BUILTIN_GENERAL, ++ AARCH64_BUILTIN_SVE ++}; ++ ++/* Built-in function codes are structured so that the low ++ AARCH64_BUILTIN_SHIFT bits contain the aarch64_builtin_class ++ and the upper bits contain a group-specific subcode. */ ++const unsigned int AARCH64_BUILTIN_SHIFT = 1; ++ ++/* Mask that selects the aarch64_builtin_class part of a function code. 
*/ ++const unsigned int AARCH64_BUILTIN_CLASS = (1 << AARCH64_BUILTIN_SHIFT) - 1; ++ ++void aarch64_post_cfi_startproc (void); + poly_int64 aarch64_initial_elimination_offset (unsigned, unsigned); + int aarch64_get_condition_code (rtx); + bool aarch64_address_valid_for_prefetch_p (rtx, bool); +@@ -407,6 +480,8 @@ unsigned HOST_WIDE_INT aarch64_and_split_imm2 (HOST_WIDE_INT val_in); + bool aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode); + int aarch64_branch_cost (bool, bool); + enum aarch64_symbol_type aarch64_classify_symbolic_expression (rtx); ++opt_machine_mode aarch64_vq_mode (scalar_mode); ++opt_machine_mode aarch64_full_sve_mode (scalar_mode); + bool aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode); + bool aarch64_const_vec_all_same_int_p (rtx, HOST_WIDE_INT); + bool aarch64_const_vec_all_same_in_range_p (rtx, HOST_WIDE_INT, +@@ -414,14 +489,13 @@ bool aarch64_const_vec_all_same_in_range_p (rtx, HOST_WIDE_INT, + bool aarch64_constant_address_p (rtx); + bool aarch64_emit_approx_div (rtx, rtx, rtx); + bool aarch64_emit_approx_sqrt (rtx, rtx, bool); +-void aarch64_expand_call (rtx, rtx, bool); +-bool aarch64_expand_movmem (rtx *); ++void aarch64_expand_call (rtx, rtx, rtx, bool); ++bool aarch64_expand_cpymem (rtx *); + bool aarch64_float_const_zero_rtx_p (rtx); + bool aarch64_float_const_rtx_p (rtx); + bool aarch64_function_arg_regno_p (unsigned); + bool aarch64_fusion_enabled_p (enum aarch64_fusion_pairs); +-bool aarch64_gen_movmemqi (rtx *); +-bool aarch64_gimple_fold_builtin (gimple_stmt_iterator *); ++bool aarch64_gen_cpymemqi (rtx *); + bool aarch64_is_extend_from_extract (scalar_int_mode, rtx, rtx); + bool aarch64_is_long_call_p (rtx); + bool aarch64_is_noplt_call_p (rtx); +@@ -436,24 +510,32 @@ bool aarch64_masks_and_shift_for_bfi_p (scalar_int_mode, unsigned HOST_WIDE_INT, + unsigned HOST_WIDE_INT); + bool aarch64_zero_extend_const_eq (machine_mode, rtx, machine_mode, rtx); + bool aarch64_move_imm (HOST_WIDE_INT, machine_mode); ++machine_mode aarch64_sve_int_mode (machine_mode); + opt_machine_mode aarch64_sve_pred_mode (unsigned int); ++opt_machine_mode aarch64_sve_data_mode (scalar_mode, poly_uint64); ++bool aarch64_sve_mode_p (machine_mode); ++HOST_WIDE_INT aarch64_fold_sve_cnt_pat (aarch64_svpattern, unsigned int); + bool aarch64_sve_cnt_immediate_p (rtx); ++bool aarch64_sve_scalar_inc_dec_immediate_p (rtx); + bool aarch64_sve_addvl_addpl_immediate_p (rtx); +-bool aarch64_sve_inc_dec_immediate_p (rtx); ++bool aarch64_sve_vector_inc_dec_immediate_p (rtx); + int aarch64_add_offset_temporaries (rtx); + void aarch64_split_add_offset (scalar_int_mode, rtx, rtx, rtx, rtx, rtx); + bool aarch64_mov_operand_p (rtx, machine_mode); + rtx aarch64_reverse_mask (machine_mode, unsigned int); + bool aarch64_offset_7bit_signed_scaled_p (machine_mode, poly_int64); + bool aarch64_offset_9bit_signed_unscaled_p (machine_mode, poly_int64); ++char *aarch64_output_sve_prefetch (const char *, rtx, const char *); + char *aarch64_output_sve_cnt_immediate (const char *, const char *, rtx); +-char *aarch64_output_sve_addvl_addpl (rtx, rtx, rtx); +-char *aarch64_output_sve_inc_dec_immediate (const char *, rtx); ++char *aarch64_output_sve_cnt_pat_immediate (const char *, const char *, rtx *); ++char *aarch64_output_sve_scalar_inc_dec (rtx); ++char *aarch64_output_sve_addvl_addpl (rtx); ++char *aarch64_output_sve_vector_inc_dec (const char *, rtx); + char *aarch64_output_scalar_simd_mov_immediate (rtx, scalar_int_mode); + char *aarch64_output_simd_mov_immediate (rtx, 
unsigned, + enum simd_immediate_check w = AARCH64_CHECK_MOV); + char *aarch64_output_sve_mov_immediate (rtx); +-char *aarch64_output_ptrue (machine_mode, char); ++char *aarch64_output_sve_ptrues (rtx); + bool aarch64_pad_reg_upward (machine_mode, const_tree, bool); + bool aarch64_regno_ok_for_base_p (int, bool); + bool aarch64_regno_ok_for_index_p (int, bool); +@@ -462,11 +544,13 @@ bool aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode, + bool high); + bool aarch64_simd_scalar_immediate_valid_for_move (rtx, scalar_int_mode); + bool aarch64_simd_shift_imm_p (rtx, machine_mode, bool); ++bool aarch64_sve_ptrue_svpattern_p (rtx, struct simd_immediate_info *); + bool aarch64_simd_valid_immediate (rtx, struct simd_immediate_info *, + enum simd_immediate_check w = AARCH64_CHECK_MOV); + rtx aarch64_check_zero_based_sve_index_immediate (rtx); + bool aarch64_sve_index_immediate_p (rtx); + bool aarch64_sve_arith_immediate_p (rtx, bool); ++bool aarch64_sve_sqadd_sqsub_immediate_p (rtx, bool); + bool aarch64_sve_bitmask_immediate_p (rtx); + bool aarch64_sve_dup_immediate_p (rtx); + bool aarch64_sve_cmp_immediate_p (rtx, bool); +@@ -476,15 +560,15 @@ bool aarch64_split_dimode_const_store (rtx, rtx); + bool aarch64_symbolic_address_p (rtx); + bool aarch64_uimm12_shift (HOST_WIDE_INT); + bool aarch64_use_return_insn_p (void); +-bool aarch64_use_simple_return_insn_p (void); +-const char *aarch64_mangle_builtin_type (const_tree); + const char *aarch64_output_casesi (rtx *); + ++unsigned int aarch64_tlsdesc_abi_id (); + enum aarch64_symbol_type aarch64_classify_symbol (rtx, HOST_WIDE_INT); + enum aarch64_symbol_type aarch64_classify_tls_symbol (rtx); + enum reg_class aarch64_regno_regclass (unsigned); + int aarch64_asm_preferred_eh_data_format (int, int); + int aarch64_fpconst_pow_of_2 (rtx); ++int aarch64_fpconst_pow2_recip (rtx); + machine_mode aarch64_hard_regno_caller_save_mode (unsigned, unsigned, + machine_mode); + int aarch64_uxt_size (int, HOST_WIDE_INT); +@@ -496,13 +580,17 @@ rtx aarch64_return_addr (int, rtx); + rtx aarch64_simd_gen_const_vector_dup (machine_mode, HOST_WIDE_INT); + bool aarch64_simd_mem_operand_p (rtx); + bool aarch64_sve_ld1r_operand_p (rtx); ++bool aarch64_sve_ld1rq_operand_p (rtx); ++bool aarch64_sve_ld1ro_operand_p (rtx, scalar_mode); ++bool aarch64_sve_ldff1_operand_p (rtx); ++bool aarch64_sve_ldnf1_operand_p (rtx); + bool aarch64_sve_ldr_operand_p (rtx); ++bool aarch64_sve_prefetch_operand_p (rtx, machine_mode); + bool aarch64_sve_struct_memory_operand_p (rtx); + rtx aarch64_simd_vect_par_cnst_half (machine_mode, int, bool); + rtx aarch64_gen_stepped_int_parallel (unsigned int, int, int); + bool aarch64_stepped_int_parallel_p (rtx, int); + rtx aarch64_tls_get_addr (void); +-tree aarch64_fold_builtin (tree, int, tree *, bool); + unsigned aarch64_dbx_register_number (unsigned); + unsigned aarch64_trampoline_size (void); + void aarch64_asm_output_labelref (FILE *, const char *); +@@ -512,7 +600,15 @@ const char * aarch64_output_probe_stack_range (rtx, rtx); + const char * aarch64_output_probe_sve_stack_clash (rtx, rtx, rtx, rtx); + void aarch64_err_no_fpadvsimd (machine_mode); + void aarch64_expand_epilogue (bool); +-void aarch64_expand_mov_immediate (rtx, rtx, rtx (*) (rtx, rtx) = 0); ++rtx aarch64_ptrue_all (unsigned int); ++opt_machine_mode aarch64_ptrue_all_mode (rtx); ++rtx aarch64_convert_sve_data_to_pred (rtx, machine_mode, rtx); ++rtx aarch64_expand_sve_dupq (rtx, machine_mode, rtx); ++void aarch64_expand_mov_immediate (rtx, rtx); ++rtx aarch64_ptrue_reg 
(machine_mode); ++rtx aarch64_pfalse_reg (machine_mode); ++bool aarch64_sve_pred_dominates_p (rtx *, rtx); ++bool aarch64_sve_same_pred_for_ptest_p (rtx *, rtx *); + void aarch64_emit_sve_pred_move (rtx, rtx, rtx); + void aarch64_expand_sve_mem_move (rtx, rtx, machine_mode); + bool aarch64_maybe_expand_sve_subreg_move (rtx, rtx); +@@ -520,8 +616,9 @@ rtx aarch64_replace_reg_mode (rtx, machine_mode); + void aarch64_split_sve_subreg_move (rtx, rtx, rtx); + void aarch64_expand_prologue (void); + void aarch64_expand_vector_init (rtx, rtx); ++void aarch64_sve_expand_vector_init (rtx, rtx); + void aarch64_init_cumulative_args (CUMULATIVE_ARGS *, const_tree, rtx, +- const_tree, unsigned); ++ const_tree, unsigned, bool = false); + void aarch64_init_expanders (void); + void aarch64_init_simd_builtins (void); + void aarch64_emit_call_insn (rtx); +@@ -587,22 +684,39 @@ bool aarch64_gen_adjusted_ldpstp (rtx *, bool, scalar_mode, RTX_CODE); + void aarch64_expand_sve_vec_cmp_int (rtx, rtx_code, rtx, rtx); + bool aarch64_expand_sve_vec_cmp_float (rtx, rtx_code, rtx, rtx, bool); + void aarch64_expand_sve_vcond (machine_mode, machine_mode, rtx *); +-#endif /* RTX_CODE */ + +-void aarch64_init_builtins (void); ++bool aarch64_prepare_sve_int_fma (rtx *, rtx_code); ++bool aarch64_prepare_sve_cond_int_fma (rtx *, rtx_code); ++#endif /* RTX_CODE */ + + bool aarch64_process_target_attr (tree); + void aarch64_override_options_internal (struct gcc_options *); + +-rtx aarch64_expand_builtin (tree exp, +- rtx target, +- rtx subtarget ATTRIBUTE_UNUSED, +- machine_mode mode ATTRIBUTE_UNUSED, +- int ignore ATTRIBUTE_UNUSED); +-tree aarch64_builtin_decl (unsigned, bool ATTRIBUTE_UNUSED); +-tree aarch64_builtin_rsqrt (unsigned int); ++const char *aarch64_general_mangle_builtin_type (const_tree); ++void aarch64_general_init_builtins (void); ++tree aarch64_general_fold_builtin (unsigned int, tree, unsigned int, tree *); ++gimple *aarch64_general_gimple_fold_builtin (unsigned int, gcall *); ++rtx aarch64_general_expand_builtin (unsigned int, tree, rtx, int); ++tree aarch64_general_builtin_decl (unsigned, bool); ++tree aarch64_general_builtin_rsqrt (unsigned int); + tree aarch64_builtin_vectorized_function (unsigned int, tree, tree); + ++namespace aarch64_sve { ++ void init_builtins (); ++ void handle_arm_sve_h (); ++ tree builtin_decl (unsigned, bool); ++ bool builtin_type_p (const_tree); ++ bool svbool_type_p (const_tree); ++ unsigned int nvectors_if_data_type (const_tree); ++ const char *mangle_builtin_type (const_tree); ++ tree resolve_overloaded_builtin (location_t, unsigned int, ++ vec *); ++ bool check_builtin_call (location_t, vec, unsigned int, ++ tree, unsigned int, tree *); ++ gimple *gimple_fold_builtin (unsigned int, gimple_stmt_iterator *, gcall *); ++ rtx expand_builtin (unsigned int, tree, rtx); ++} ++ + extern void aarch64_split_combinev16qi (rtx operands[3]); + extern void aarch64_expand_vec_perm (rtx, rtx, rtx, rtx, unsigned int); + extern void aarch64_expand_sve_vec_perm (rtx, rtx, rtx, rtx); +@@ -629,11 +743,10 @@ bool aarch64_handle_option (struct gcc_options *, struct gcc_options *, + const struct cl_decoded_option *, location_t); + const char *aarch64_rewrite_selected_cpu (const char *name); + enum aarch64_parse_opt_result aarch64_parse_extension (const char *, +- unsigned long *, ++ uint64_t *, + std::string *); + void aarch64_get_all_extension_candidates (auto_vec *candidates); +-std::string aarch64_get_extension_string_for_isa_flags (unsigned long, +- unsigned long); ++std::string 
aarch64_get_extension_string_for_isa_flags (uint64_t, uint64_t); + + /* Defined in aarch64-d.c */ + extern void aarch64_d_target_versions (void); +@@ -647,4 +760,17 @@ poly_uint64 aarch64_regmode_natural_size (machine_mode); + + bool aarch64_high_bits_all_ones_p (HOST_WIDE_INT); + ++struct atomic_ool_names ++{ ++ const char *str[5][4]; ++}; ++ ++rtx aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx, ++ const atomic_ool_names *names); ++extern const atomic_ool_names aarch64_ool_swp_names; ++extern const atomic_ool_names aarch64_ool_ldadd_names; ++extern const atomic_ool_names aarch64_ool_ldset_names; ++extern const atomic_ool_names aarch64_ool_ldclr_names; ++extern const atomic_ool_names aarch64_ool_ldeor_names; ++ + #endif /* GCC_AARCH64_PROTOS_H */ +diff --git a/gcc/config/aarch64/aarch64-simd-builtin-types.def b/gcc/config/aarch64/aarch64-simd-builtin-types.def +index b01569429..2be0ce824 100644 +--- a/gcc/config/aarch64/aarch64-simd-builtin-types.def ++++ b/gcc/config/aarch64/aarch64-simd-builtin-types.def +@@ -50,3 +50,5 @@ + ENTRY (Float32x4_t, V4SF, none, 13) + ENTRY (Float64x1_t, V1DF, none, 13) + ENTRY (Float64x2_t, V2DF, none, 13) ++ ENTRY (Bfloat16x4_t, V4BF, none, 14) ++ ENTRY (Bfloat16x8_t, V8BF, none, 14) +diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def +index 17bb0c486..d0fe4e7c8 100644 +--- a/gcc/config/aarch64/aarch64-simd-builtins.def ++++ b/gcc/config/aarch64/aarch64-simd-builtins.def +@@ -212,10 +212,15 @@ + /* Implemented by aarch64_{_lane}{q}. */ + BUILTIN_VB (TERNOP, sdot, 0) + BUILTIN_VB (TERNOPU, udot, 0) ++ BUILTIN_VB (TERNOP_SSUS, usdot, 0) + BUILTIN_VB (QUADOP_LANE, sdot_lane, 0) + BUILTIN_VB (QUADOPU_LANE, udot_lane, 0) + BUILTIN_VB (QUADOP_LANE, sdot_laneq, 0) + BUILTIN_VB (QUADOPU_LANE, udot_laneq, 0) ++ BUILTIN_VB (QUADOPSSUS_LANE_QUADTUP, usdot_lane, 0) ++ BUILTIN_VB (QUADOPSSUS_LANE_QUADTUP, usdot_laneq, 0) ++ BUILTIN_VB (QUADOPSSSU_LANE_QUADTUP, sudot_lane, 0) ++ BUILTIN_VB (QUADOPSSSU_LANE_QUADTUP, sudot_laneq, 0) + + /* Implemented by aarch64_fcadd. */ + BUILTIN_VHSDF (BINOP, fcadd90, 0) +@@ -424,7 +429,7 @@ + BUILTIN_VB (UNOP, rbit, 0) + + /* Implemented by +- aarch64_. */ ++ aarch64_. */ + BUILTIN_VALL (BINOP, zip1, 0) + BUILTIN_VALL (BINOP, zip2, 0) + BUILTIN_VALL (BINOP, uzp1, 0) +@@ -465,12 +470,18 @@ + /* Implemented by aarch64_ld1x3. */ + BUILTIN_VALLDIF (LOADSTRUCT, ld1x3, 0) + ++ /* Implemented by aarch64_ld1x4. */ ++ BUILTIN_VALLDIF (LOADSTRUCT, ld1x4, 0) ++ + /* Implemented by aarch64_st1x2. */ + BUILTIN_VALLDIF (STORESTRUCT, st1x2, 0) + + /* Implemented by aarch64_st1x3. */ + BUILTIN_VALLDIF (STORESTRUCT, st1x3, 0) + ++ /* Implemented by aarch64_st1x4. */ ++ BUILTIN_VALLDIF (STORESTRUCT, st1x4, 0) ++ + /* Implemented by fma4. */ + BUILTIN_VHSDF (TERNOP, fma, 4) + VAR1 (TERNOP, fma, 4, hf) +@@ -670,3 +681,36 @@ + /* Implemented by aarch64_fmllq_laneq_highv4sf. */ + VAR1 (QUADOP_LANE, fmlalq_laneq_high, 0, v4sf) + VAR1 (QUADOP_LANE, fmlslq_laneq_high, 0, v4sf) ++ ++ /* Implemented by aarch64_. */ ++ BUILTIN_VSFDF (UNOP, frint32z, 0) ++ BUILTIN_VSFDF (UNOP, frint32x, 0) ++ BUILTIN_VSFDF (UNOP, frint64z, 0) ++ BUILTIN_VSFDF (UNOP, frint64x, 0) ++ ++ /* Implemented by aarch64_bfdot{_lane}{q}. 
*/ ++ VAR2 (TERNOP, bfdot, 0, v2sf, v4sf) ++ VAR2 (QUADOP_LANE_PAIR, bfdot_lane, 0, v2sf, v4sf) ++ VAR2 (QUADOP_LANE_PAIR, bfdot_laneq, 0, v2sf, v4sf) ++ ++ /* Implemented by aarch64_bfmmlaqv4sf */ ++ VAR1 (TERNOP, bfmmlaq, 0, v4sf) ++ ++ /* Implemented by aarch64_bfmlal{_lane{q}}v4sf */ ++ VAR1 (TERNOP, bfmlalb, 0, v4sf) ++ VAR1 (TERNOP, bfmlalt, 0, v4sf) ++ VAR1 (QUADOP_LANE, bfmlalb_lane, 0, v4sf) ++ VAR1 (QUADOP_LANE, bfmlalt_lane, 0, v4sf) ++ VAR1 (QUADOP_LANE, bfmlalb_lane_q, 0, v4sf) ++ VAR1 (QUADOP_LANE, bfmlalt_lane_q, 0, v4sf) ++ ++ /* Implemented by aarch64_simd_mmlav16qi. */ ++ VAR1 (TERNOP, simd_smmla, 0, v16qi) ++ VAR1 (TERNOPU, simd_ummla, 0, v16qi) ++ VAR1 (TERNOP_SSUS, simd_usmmla, 0, v16qi) ++ ++ /* Implemented by aarch64_bfcvtn{q}{2} */ ++ VAR1 (UNOP, bfcvtn, 0, v4bf) ++ VAR1 (UNOP, bfcvtn_q, 0, v8bf) ++ VAR1 (BINOP, bfcvtn2, 0, v8bf) ++ VAR1 (UNOP, bfcvt, 0, bf) +diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md +index 29ca37c65..137c88da1 100644 +--- a/gcc/config/aarch64/aarch64-simd.md ++++ b/gcc/config/aarch64/aarch64-simd.md +@@ -19,8 +19,8 @@ + ;; . + + (define_expand "mov" +- [(set (match_operand:VALL_F16 0 "nonimmediate_operand" "") +- (match_operand:VALL_F16 1 "general_operand" ""))] ++ [(set (match_operand:VALL_F16MOV 0 "nonimmediate_operand") ++ (match_operand:VALL_F16MOV 1 "general_operand"))] + "TARGET_SIMD" + " + /* Force the operand into a register if it is not an +@@ -39,8 +39,8 @@ + ) + + (define_expand "movmisalign" +- [(set (match_operand:VALL 0 "nonimmediate_operand" "") +- (match_operand:VALL 1 "general_operand" ""))] ++ [(set (match_operand:VALL 0 "nonimmediate_operand") ++ (match_operand:VALL 1 "general_operand"))] + "TARGET_SIMD" + { + /* This pattern is not permitted to fail during expansion: if both arguments +@@ -101,10 +101,10 @@ + [(set_attr "type" "neon_dup")] + ) + +-(define_insn "*aarch64_simd_mov" +- [(set (match_operand:VD 0 "nonimmediate_operand" ++(define_insn "*aarch64_simd_mov" ++ [(set (match_operand:VDMOV 0 "nonimmediate_operand" + "=w, m, m, w, ?r, ?w, ?r, w") +- (match_operand:VD 1 "general_operand" ++ (match_operand:VDMOV 1 "general_operand" + "m, Dz, w, w, w, r, r, Dn"))] + "TARGET_SIMD + && (register_operand (operands[0], mode) +@@ -129,10 +129,10 @@ + mov_reg, neon_move")] + ) + +-(define_insn "*aarch64_simd_mov" +- [(set (match_operand:VQ 0 "nonimmediate_operand" ++(define_insn "*aarch64_simd_mov" ++ [(set (match_operand:VQMOV 0 "nonimmediate_operand" + "=w, Umn, m, w, ?r, ?w, ?r, w") +- (match_operand:VQ 1 "general_operand" ++ (match_operand:VQMOV 1 "general_operand" + "m, Dz, w, w, w, r, r, Dn"))] + "TARGET_SIMD + && (register_operand (operands[0], mode) +@@ -234,8 +234,8 @@ + + + (define_split +- [(set (match_operand:VQ 0 "register_operand" "") +- (match_operand:VQ 1 "register_operand" ""))] ++ [(set (match_operand:VQMOV 0 "register_operand" "") ++ (match_operand:VQMOV 1 "register_operand" ""))] + "TARGET_SIMD && reload_completed + && GP_REGNUM_P (REGNO (operands[0])) + && GP_REGNUM_P (REGNO (operands[1]))" +@@ -246,8 +246,8 @@ + }) + + (define_split +- [(set (match_operand:VQ 0 "register_operand" "") +- (match_operand:VQ 1 "register_operand" ""))] ++ [(set (match_operand:VQMOV 0 "register_operand" "") ++ (match_operand:VQMOV 1 "register_operand" ""))] + "TARGET_SIMD && reload_completed + && ((FP_REGNUM_P (REGNO (operands[0])) && GP_REGNUM_P (REGNO (operands[1]))) + || (GP_REGNUM_P (REGNO (operands[0])) && FP_REGNUM_P (REGNO (operands[1]))))" +@@ -258,8 +258,8 @@ + }) + + (define_expand 
"@aarch64_split_simd_mov" +- [(set (match_operand:VQ 0) +- (match_operand:VQ 1))] ++ [(set (match_operand:VQMOV 0) ++ (match_operand:VQMOV 1))] + "TARGET_SIMD" + { + rtx dst = operands[0]; +@@ -520,6 +520,20 @@ + [(set_attr "type" "neon_dot")] + ) + ++;; These instructions map to the __builtins for the armv8.6a I8MM usdot ++;; (vector) Dot Product operation. ++(define_insn "aarch64_usdot" ++ [(set (match_operand:VS 0 "register_operand" "=w") ++ (plus:VS ++ (unspec:VS [(match_operand: 2 "register_operand" "w") ++ (match_operand: 3 "register_operand" "w")] ++ UNSPEC_USDOT) ++ (match_operand:VS 1 "register_operand" "0")))] ++ "TARGET_I8MM" ++ "usdot\\t%0., %2., %3." ++ [(set_attr "type" "neon_dot")] ++) ++ + ;; These expands map to the Dot Product optab the vectorizer checks for. + ;; The auto-vectorizer expects a dot product builtin that also does an + ;; accumulation into the provided register. +@@ -587,6 +601,26 @@ + [(set_attr "type" "neon_dot")] + ) + ++;; These instructions map to the __builtins for the armv8.6a I8MM usdot, sudot ++;; (by element) Dot Product operations. ++(define_insn "aarch64_dot_lane" ++ [(set (match_operand:VS 0 "register_operand" "=w") ++ (plus:VS ++ (unspec:VS [(match_operand: 2 "register_operand" "w") ++ (match_operand:VB 3 "register_operand" "w") ++ (match_operand:SI 4 "immediate_operand" "i")] ++ DOTPROD_I8MM) ++ (match_operand:VS 1 "register_operand" "0")))] ++ "TARGET_I8MM" ++ { ++ int nunits = GET_MODE_NUNITS (mode).to_constant (); ++ int lane = INTVAL (operands[4]); ++ operands[4] = gen_int_mode (ENDIAN_LANE_N (nunits / 4, lane), SImode); ++ return "dot\\t%0., %2., %3.4b[%4]"; ++ } ++ [(set_attr "type" "neon_dot")] ++) ++ + (define_expand "copysign3" + [(match_operand:VHSDF 0 "register_operand") + (match_operand:VHSDF 1 "register_operand") +@@ -666,8 +700,8 @@ + [(set_attr "type" "neon_fp_rsqrts_")]) + + (define_expand "rsqrt2" +- [(set (match_operand:VALLF 0 "register_operand" "=w") +- (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")] ++ [(set (match_operand:VALLF 0 "register_operand") ++ (unspec:VALLF [(match_operand:VALLF 1 "register_operand")] + UNSPEC_RSQRT))] + "TARGET_SIMD" + { +@@ -724,15 +758,15 @@ + ;; So (ABS:QI (minus:QI 64 -128)) == (ABS:QI (192 or -64 signed)) == 64. + ;; Whereas SABD would return 192 (-64 signed) on the above example. + ;; Use MINUS ([us]max (op1, op2), [us]min (op1, op2)) instead. +-(define_insn "*aarch64_abd_3" ++(define_insn "aarch64_abd_3" + [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w") + (minus:VDQ_BHSI + (USMAX:VDQ_BHSI + (match_operand:VDQ_BHSI 1 "register_operand" "w") + (match_operand:VDQ_BHSI 2 "register_operand" "w")) +- (match_operator 3 "aarch64_" +- [(match_dup 1) +- (match_dup 2)])))] ++ (:VDQ_BHSI ++ (match_dup 1) ++ (match_dup 2))))] + "TARGET_SIMD" + "abd\t%0., %1., %2." + [(set_attr "type" "neon_abd")] +@@ -778,7 +812,16 @@ + ;; UABAL tmp.8h, op1.16b, op2.16b + ;; UADALP op3.4s, tmp.8h + ;; MOV op0, op3 // should be eliminated in later passes. +-;; The signed version just uses the signed variants of the above instructions. ++;; ++;; For TARGET_DOTPROD we do: ++;; MOV tmp1.16b, #1 // Can be CSE'd and hoisted out of loops. ++;; UABD tmp2.16b, op1.16b, op2.16b ++;; UDOT op3.4s, tmp2.16b, tmp1.16b ++;; MOV op0, op3 // RA will tie the operands of UDOT appropriately. ++;; ++;; The signed version just uses the signed variants of the above instructions ++;; but for TARGET_DOTPROD still emits a UDOT as the absolute difference is ++;; unsigned. 
+ + (define_expand "sadv16qi" + [(use (match_operand:V4SI 0 "register_operand")) +@@ -787,6 +830,15 @@ + (use (match_operand:V4SI 3 "register_operand"))] + "TARGET_SIMD" + { ++ if (TARGET_DOTPROD) ++ { ++ rtx ones = force_reg (V16QImode, CONST1_RTX (V16QImode)); ++ rtx abd = gen_reg_rtx (V16QImode); ++ emit_insn (gen_aarch64_abdv16qi_3 (abd, operands[1], operands[2])); ++ emit_insn (gen_aarch64_udotv16qi (operands[0], operands[3], ++ abd, ones)); ++ DONE; ++ } + rtx reduc = gen_reg_rtx (V8HImode); + emit_insn (gen_aarch64_abdl2v16qi_3 (reduc, operands[1], + operands[2])); +@@ -949,6 +1001,21 @@ + [(set_attr "type" "neon_ins")] + ) + ++(define_expand "signbit2" ++ [(use (match_operand: 0 "register_operand")) ++ (use (match_operand:VDQSF 1 "register_operand"))] ++ "TARGET_SIMD" ++{ ++ int shift_amount = GET_MODE_UNIT_BITSIZE (mode) - 1; ++ rtx shift_vector = aarch64_simd_gen_const_vector_dup (mode, ++ shift_amount); ++ operands[1] = lowpart_subreg (mode, operands[1], mode); ++ ++ emit_insn (gen_aarch64_simd_lshr (operands[0], operands[1], ++ shift_vector)); ++ DONE; ++}) ++ + (define_insn "aarch64_simd_lshr" + [(set (match_operand:VDQ_I 0 "register_operand" "=w") + (lshiftrt:VDQ_I (match_operand:VDQ_I 1 "register_operand" "w") +@@ -967,6 +1034,18 @@ + [(set_attr "type" "neon_shift_imm")] + ) + ++(define_insn "*aarch64_simd_sra" ++ [(set (match_operand:VDQ_I 0 "register_operand" "=w") ++ (plus:VDQ_I ++ (SHIFTRT:VDQ_I ++ (match_operand:VDQ_I 1 "register_operand" "w") ++ (match_operand:VDQ_I 2 "aarch64_simd_rshift_imm" "Dr")) ++ (match_operand:VDQ_I 3 "register_operand" "0")))] ++ "TARGET_SIMD" ++ "sra\t%0., %1., %2" ++ [(set_attr "type" "neon_shift_acc")] ++) ++ + (define_insn "aarch64_simd_imm_shl" + [(set (match_operand:VDQ_I 0 "register_operand" "=w") + (ashift:VDQ_I (match_operand:VDQ_I 1 "register_operand" "w") +@@ -1006,9 +1085,9 @@ + ) + + (define_expand "ashl3" +- [(match_operand:VDQ_I 0 "register_operand" "") +- (match_operand:VDQ_I 1 "register_operand" "") +- (match_operand:SI 2 "general_operand" "")] ++ [(match_operand:VDQ_I 0 "register_operand") ++ (match_operand:VDQ_I 1 "register_operand") ++ (match_operand:SI 2 "general_operand")] + "TARGET_SIMD" + { + int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT; +@@ -1053,9 +1132,9 @@ + ) + + (define_expand "lshr3" +- [(match_operand:VDQ_I 0 "register_operand" "") +- (match_operand:VDQ_I 1 "register_operand" "") +- (match_operand:SI 2 "general_operand" "")] ++ [(match_operand:VDQ_I 0 "register_operand") ++ (match_operand:VDQ_I 1 "register_operand") ++ (match_operand:SI 2 "general_operand")] + "TARGET_SIMD" + { + int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT; +@@ -1100,9 +1179,9 @@ + ) + + (define_expand "ashr3" +- [(match_operand:VDQ_I 0 "register_operand" "") +- (match_operand:VDQ_I 1 "register_operand" "") +- (match_operand:SI 2 "general_operand" "")] ++ [(match_operand:VDQ_I 0 "register_operand") ++ (match_operand:VDQ_I 1 "register_operand") ++ (match_operand:SI 2 "general_operand")] + "TARGET_SIMD" + { + int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT; +@@ -1147,9 +1226,9 @@ + ) + + (define_expand "vashl3" +- [(match_operand:VDQ_I 0 "register_operand" "") +- (match_operand:VDQ_I 1 "register_operand" "") +- (match_operand:VDQ_I 2 "register_operand" "")] ++ [(match_operand:VDQ_I 0 "register_operand") ++ (match_operand:VDQ_I 1 "register_operand") ++ (match_operand:VDQ_I 2 "register_operand")] + "TARGET_SIMD" + { + emit_insn (gen_aarch64_simd_reg_sshl (operands[0], operands[1], +@@ -1161,9 +1240,9 @@ + ;; 
Negating individual lanes most certainly offsets the + ;; gain from vectorization. + (define_expand "vashr3" +- [(match_operand:VDQ_BHSI 0 "register_operand" "") +- (match_operand:VDQ_BHSI 1 "register_operand" "") +- (match_operand:VDQ_BHSI 2 "register_operand" "")] ++ [(match_operand:VDQ_BHSI 0 "register_operand") ++ (match_operand:VDQ_BHSI 1 "register_operand") ++ (match_operand:VDQ_BHSI 2 "register_operand")] + "TARGET_SIMD" + { + rtx neg = gen_reg_rtx (mode); +@@ -1175,9 +1254,9 @@ + + ;; DI vector shift + (define_expand "aarch64_ashr_simddi" +- [(match_operand:DI 0 "register_operand" "=w") +- (match_operand:DI 1 "register_operand" "w") +- (match_operand:SI 2 "aarch64_shift_imm64_di" "")] ++ [(match_operand:DI 0 "register_operand") ++ (match_operand:DI 1 "register_operand") ++ (match_operand:SI 2 "aarch64_shift_imm64_di")] + "TARGET_SIMD" + { + /* An arithmetic shift right by 64 fills the result with copies of the sign +@@ -1191,9 +1270,9 @@ + ) + + (define_expand "vlshr3" +- [(match_operand:VDQ_BHSI 0 "register_operand" "") +- (match_operand:VDQ_BHSI 1 "register_operand" "") +- (match_operand:VDQ_BHSI 2 "register_operand" "")] ++ [(match_operand:VDQ_BHSI 0 "register_operand") ++ (match_operand:VDQ_BHSI 1 "register_operand") ++ (match_operand:VDQ_BHSI 2 "register_operand")] + "TARGET_SIMD" + { + rtx neg = gen_reg_rtx (mode); +@@ -1204,9 +1283,9 @@ + }) + + (define_expand "aarch64_lshr_simddi" +- [(match_operand:DI 0 "register_operand" "=w") +- (match_operand:DI 1 "register_operand" "w") +- (match_operand:SI 2 "aarch64_shift_imm64_di" "")] ++ [(match_operand:DI 0 "register_operand") ++ (match_operand:DI 1 "register_operand") ++ (match_operand:SI 2 "aarch64_shift_imm64_di")] + "TARGET_SIMD" + { + if (INTVAL (operands[2]) == 64) +@@ -1234,9 +1313,9 @@ + ) + + (define_expand "vec_set" +- [(match_operand:VALL_F16 0 "register_operand" "+w") +- (match_operand: 1 "register_operand" "w") +- (match_operand:SI 2 "immediate_operand" "")] ++ [(match_operand:VALL_F16 0 "register_operand") ++ (match_operand: 1 "register_operand") ++ (match_operand:SI 2 "immediate_operand")] + "TARGET_SIMD" + { + HOST_WIDE_INT elem = (HOST_WIDE_INT) 1 << INTVAL (operands[2]); +@@ -1375,9 +1454,9 @@ + ) + + (define_expand "v2di3" +- [(set (match_operand:V2DI 0 "register_operand" "") +- (MAXMIN:V2DI (match_operand:V2DI 1 "register_operand" "") +- (match_operand:V2DI 2 "register_operand" "")))] ++ [(set (match_operand:V2DI 0 "register_operand") ++ (MAXMIN:V2DI (match_operand:V2DI 1 "register_operand") ++ (match_operand:V2DI 2 "register_operand")))] + "TARGET_SIMD" + { + enum rtx_code cmp_operator; +@@ -1440,8 +1519,8 @@ + ;; On big-endian this is { zeroes, operand } + + (define_insn "move_lo_quad_internal_" +- [(set (match_operand:VQ_NO2E 0 "register_operand" "=w,w,w") +- (vec_concat:VQ_NO2E ++ [(set (match_operand:VQMOV_NO2E 0 "register_operand" "=w,w,w") ++ (vec_concat:VQMOV_NO2E + (match_operand: 1 "register_operand" "w,r,r") + (vec_duplicate: (const_int 0))))] + "TARGET_SIMD && !BYTES_BIG_ENDIAN" +@@ -1470,8 +1549,8 @@ + ) + + (define_insn "move_lo_quad_internal_be_" +- [(set (match_operand:VQ_NO2E 0 "register_operand" "=w,w,w") +- (vec_concat:VQ_NO2E ++ [(set (match_operand:VQMOV_NO2E 0 "register_operand" "=w,w,w") ++ (vec_concat:VQMOV_NO2E + (vec_duplicate: (const_int 0)) + (match_operand: 1 "register_operand" "w,r,r")))] + "TARGET_SIMD && BYTES_BIG_ENDIAN" +@@ -1500,8 +1579,8 @@ + ) + + (define_expand "move_lo_quad_" +- [(match_operand:VQ 0 "register_operand") +- (match_operand:VQ 1 "register_operand")] ++ 
[(match_operand:VQMOV 0 "register_operand") ++ (match_operand:VQMOV 1 "register_operand")] + "TARGET_SIMD" + { + if (BYTES_BIG_ENDIAN) +@@ -1518,11 +1597,11 @@ + ;; For big-endian this is { operand1, operand2 } + + (define_insn "aarch64_simd_move_hi_quad_" +- [(set (match_operand:VQ 0 "register_operand" "+w,w") +- (vec_concat:VQ ++ [(set (match_operand:VQMOV 0 "register_operand" "+w,w") ++ (vec_concat:VQMOV + (vec_select: + (match_dup 0) +- (match_operand:VQ 2 "vect_par_cnst_lo_half" "")) ++ (match_operand:VQMOV 2 "vect_par_cnst_lo_half" "")) + (match_operand: 1 "register_operand" "w,r")))] + "TARGET_SIMD && !BYTES_BIG_ENDIAN" + "@ +@@ -1532,12 +1611,12 @@ + ) + + (define_insn "aarch64_simd_move_hi_quad_be_" +- [(set (match_operand:VQ 0 "register_operand" "+w,w") +- (vec_concat:VQ ++ [(set (match_operand:VQMOV 0 "register_operand" "+w,w") ++ (vec_concat:VQMOV + (match_operand: 1 "register_operand" "w,r") + (vec_select: + (match_dup 0) +- (match_operand:VQ 2 "vect_par_cnst_lo_half" ""))))] ++ (match_operand:VQMOV 2 "vect_par_cnst_lo_half" ""))))] + "TARGET_SIMD && BYTES_BIG_ENDIAN" + "@ + ins\\t%0.d[1], %1.d[0] +@@ -1546,8 +1625,8 @@ + ) + + (define_expand "move_hi_quad_" +- [(match_operand:VQ 0 "register_operand" "") +- (match_operand: 1 "register_operand" "")] ++ [(match_operand:VQMOV 0 "register_operand") ++ (match_operand: 1 "register_operand")] + "TARGET_SIMD" + { + rtx p = aarch64_simd_vect_par_cnst_half (mode, , false); +@@ -1571,10 +1650,122 @@ + [(set_attr "type" "neon_shift_imm_narrow_q")] + ) + ++(define_insn "aarch64_bfdot" ++ [(set (match_operand:VDQSF 0 "register_operand" "=w") ++ (plus:VDQSF ++ (unspec:VDQSF ++ [(match_operand: 2 "register_operand" "w") ++ (match_operand: 3 "register_operand" "w")] ++ UNSPEC_BFDOT) ++ (match_operand:VDQSF 1 "register_operand" "0")))] ++ "TARGET_BF16_SIMD" ++ "bfdot\t%0., %2., %3." 
++ [(set_attr "type" "neon_dot")] ++) ++ ++(define_insn "aarch64_bfdot_lane" ++ [(set (match_operand:VDQSF 0 "register_operand" "=w") ++ (plus:VDQSF ++ (unspec:VDQSF ++ [(match_operand: 2 "register_operand" "w") ++ (match_operand:VBF 3 "register_operand" "w") ++ (match_operand:SI 4 "const_int_operand" "n")] ++ UNSPEC_BFDOT) ++ (match_operand:VDQSF 1 "register_operand" "0")))] ++ "TARGET_BF16_SIMD" ++{ ++ int nunits = GET_MODE_NUNITS (mode).to_constant (); ++ int lane = INTVAL (operands[4]); ++ operands[4] = gen_int_mode (ENDIAN_LANE_N (nunits / 2, lane), SImode); ++ return "bfdot\t%0., %2., %3.2h[%4]"; ++} ++ [(set_attr "type" "neon_dot")] ++) ++ ++;; bfmmla ++(define_insn "aarch64_bfmmlaqv4sf" ++ [(set (match_operand:V4SF 0 "register_operand" "=w") ++ (plus:V4SF (match_operand:V4SF 1 "register_operand" "0") ++ (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w") ++ (match_operand:V8BF 3 "register_operand" "w")] ++ UNSPEC_BFMMLA)))] ++ "TARGET_BF16_SIMD" ++ "bfmmla\\t%0.4s, %2.8h, %3.8h" ++ [(set_attr "type" "neon_fp_mla_s_q")] ++) ++ ++;; bfmlal ++(define_insn "aarch64_bfmlalv4sf" ++ [(set (match_operand:V4SF 0 "register_operand" "=w") ++ (plus: V4SF (match_operand:V4SF 1 "register_operand" "0") ++ (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w") ++ (match_operand:V8BF 3 "register_operand" "w")] ++ BF_MLA)))] ++ "TARGET_BF16_SIMD" ++ "bfmlal\\t%0.4s, %2.8h, %3.8h" ++ [(set_attr "type" "neon_fp_mla_s_q")] ++) ++ ++(define_insn "aarch64_bfmlal_lanev4sf" ++ [(set (match_operand:V4SF 0 "register_operand" "=w") ++ (plus: V4SF (match_operand:V4SF 1 "register_operand" "0") ++ (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w") ++ (match_operand:VBF 3 "register_operand" "w") ++ (match_operand:SI 4 "const_int_operand" "n")] ++ BF_MLA)))] ++ "TARGET_BF16_SIMD" ++{ ++ operands[4] = aarch64_endian_lane_rtx (mode, INTVAL (operands[4])); ++ return "bfmlal\\t%0.4s, %2.8h, %3.h[%4]"; ++} ++ [(set_attr "type" "neon_fp_mla_s_scalar_q")] ++) ++ ++;; 8-bit integer matrix multiply-accumulate ++(define_insn "aarch64_simd_mmlav16qi" ++ [(set (match_operand:V4SI 0 "register_operand" "=w") ++ (plus:V4SI ++ (unspec:V4SI [(match_operand:V16QI 2 "register_operand" "w") ++ (match_operand:V16QI 3 "register_operand" "w")] MATMUL) ++ (match_operand:V4SI 1 "register_operand" "0")))] ++ "TARGET_I8MM" ++ "mmla\\t%0.4s, %2.16b, %3.16b" ++ [(set_attr "type" "neon_mla_s_q")] ++) ++ ++;; bfcvtn ++(define_insn "aarch64_bfcvtn" ++ [(set (match_operand:V4SF_TO_BF 0 "register_operand" "=w") ++ (unspec:V4SF_TO_BF [(match_operand:V4SF 1 "register_operand" "w")] ++ UNSPEC_BFCVTN))] ++ "TARGET_BF16_SIMD" ++ "bfcvtn\\t%0.4h, %1.4s" ++ [(set_attr "type" "neon_fp_cvt_narrow_s_q")] ++) ++ ++(define_insn "aarch64_bfcvtn2v8bf" ++ [(set (match_operand:V8BF 0 "register_operand" "=w") ++ (unspec:V8BF [(match_operand:V8BF 1 "register_operand" "0") ++ (match_operand:V4SF 2 "register_operand" "w")] ++ UNSPEC_BFCVTN2))] ++ "TARGET_BF16_SIMD" ++ "bfcvtn2\\t%0.8h, %2.4s" ++ [(set_attr "type" "neon_fp_cvt_narrow_s_q")] ++) ++ ++(define_insn "aarch64_bfcvtbf" ++ [(set (match_operand:BF 0 "register_operand" "=w") ++ (unspec:BF [(match_operand:SF 1 "register_operand" "w")] ++ UNSPEC_BFCVT))] ++ "TARGET_BF16_FP" ++ "bfcvt\\t%h0, %s1" ++ [(set_attr "type" "f_cvt")] ++) ++ + (define_expand "vec_pack_trunc_" +- [(match_operand: 0 "register_operand" "") +- (match_operand:VDN 1 "register_operand" "") +- (match_operand:VDN 2 "register_operand" "")] ++ [(match_operand: 0 "register_operand") ++ (match_operand:VDN 1 "register_operand") 
++ (match_operand:VDN 2 "register_operand")] + "TARGET_SIMD" + { + rtx tempreg = gen_reg_rtx (mode); +@@ -1630,7 +1821,7 @@ + ) + + (define_expand "vec_unpack_hi_" +- [(match_operand: 0 "register_operand" "") ++ [(match_operand: 0 "register_operand") + (ANY_EXTEND: (match_operand:VQW 1 "register_operand"))] + "TARGET_SIMD" + { +@@ -1642,8 +1833,8 @@ + ) + + (define_expand "vec_unpack_lo_" +- [(match_operand: 0 "register_operand" "") +- (ANY_EXTEND: (match_operand:VQW 1 "register_operand" ""))] ++ [(match_operand: 0 "register_operand") ++ (ANY_EXTEND: (match_operand:VQW 1 "register_operand"))] + "TARGET_SIMD" + { + rtx p = aarch64_simd_vect_par_cnst_half (mode, , false); +@@ -1761,9 +1952,9 @@ + ) + + (define_expand "vec_widen_mult_lo_" +- [(match_operand: 0 "register_operand" "") +- (ANY_EXTEND: (match_operand:VQW 1 "register_operand" "")) +- (ANY_EXTEND: (match_operand:VQW 2 "register_operand" ""))] ++ [(match_operand: 0 "register_operand") ++ (ANY_EXTEND: (match_operand:VQW 1 "register_operand")) ++ (ANY_EXTEND: (match_operand:VQW 2 "register_operand"))] + "TARGET_SIMD" + { + rtx p = aarch64_simd_vect_par_cnst_half (mode, , false); +@@ -1788,9 +1979,9 @@ + ) + + (define_expand "vec_widen_mult_hi_" +- [(match_operand: 0 "register_operand" "") +- (ANY_EXTEND: (match_operand:VQW 1 "register_operand" "")) +- (ANY_EXTEND: (match_operand:VQW 2 "register_operand" ""))] ++ [(match_operand: 0 "register_operand") ++ (ANY_EXTEND: (match_operand:VQW 1 "register_operand")) ++ (ANY_EXTEND: (match_operand:VQW 2 "register_operand"))] + "TARGET_SIMD" + { + rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); +@@ -1855,9 +2046,9 @@ + ) + + (define_expand "div3" +- [(set (match_operand:VHSDF 0 "register_operand" "=w") +- (div:VHSDF (match_operand:VHSDF 1 "register_operand" "w") +- (match_operand:VHSDF 2 "register_operand" "w")))] ++ [(set (match_operand:VHSDF 0 "register_operand") ++ (div:VHSDF (match_operand:VHSDF 1 "register_operand") ++ (match_operand:VHSDF 2 "register_operand")))] + "TARGET_SIMD" + { + if (aarch64_emit_approx_div (operands[0], operands[1], operands[2])) +@@ -2192,8 +2383,8 @@ + ;; other big-endian patterns their behavior is as required. + + (define_expand "vec_unpacks_lo_" +- [(match_operand: 0 "register_operand" "") +- (match_operand:VQ_HSF 1 "register_operand" "")] ++ [(match_operand: 0 "register_operand") ++ (match_operand:VQ_HSF 1 "register_operand")] + "TARGET_SIMD" + { + rtx p = aarch64_simd_vect_par_cnst_half (mode, , false); +@@ -2215,8 +2406,8 @@ + ) + + (define_expand "vec_unpacks_hi_" +- [(match_operand: 0 "register_operand" "") +- (match_operand:VQ_HSF 1 "register_operand" "")] ++ [(match_operand: 0 "register_operand") ++ (match_operand:VQ_HSF 1 "register_operand")] + "TARGET_SIMD" + { + rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); +@@ -2268,9 +2459,9 @@ + ) + + (define_expand "aarch64_float_truncate_hi_" +- [(match_operand: 0 "register_operand" "=w") +- (match_operand:VDF 1 "register_operand" "0") +- (match_operand: 2 "register_operand" "w")] ++ [(match_operand: 0 "register_operand") ++ (match_operand:VDF 1 "register_operand") ++ (match_operand: 2 "register_operand")] + "TARGET_SIMD" + { + rtx (*gen) (rtx, rtx, rtx) = BYTES_BIG_ENDIAN +@@ -2363,8 +2554,8 @@ + ;; 'across lanes' add. 
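;;
;; A minimal sketch of what the across-lanes add computes, assuming a
;; hypothetical scalar helper used purely for illustration:
;;
;;   int reduc_plus_v4si (const int v[4])
;;   {
;;     return v[0] + v[1] + v[2] + v[3];  /* typically a single ADDV */
;;   }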
+ + (define_expand "reduc_plus_scal_" +- [(match_operand: 0 "register_operand" "=w") +- (unspec:VDQ_I [(match_operand:VDQ_I 1 "register_operand" "w")] ++ [(match_operand: 0 "register_operand") ++ (unspec:VDQ_I [(match_operand:VDQ_I 1 "register_operand")] + UNSPEC_ADDV)] + "TARGET_SIMD" + { +@@ -3116,30 +3307,31 @@ + (define_insn "*aarch64_get_lane_extend" + [(set (match_operand:GPI 0 "register_operand" "=r") + (sign_extend:GPI +- (vec_select: ++ (vec_select: + (match_operand:VDQQH 1 "register_operand" "w") + (parallel [(match_operand:SI 2 "immediate_operand" "i")]))))] + "TARGET_SIMD" + { +- operands[2] = aarch64_endian_lane_rtx (mode, INTVAL (operands[2])); ++ operands[2] = aarch64_endian_lane_rtx (mode, ++ INTVAL (operands[2])); + return "smov\\t%0, %1.[%2]"; + } +- [(set_attr "type" "neon_to_gp")] +-) +- +-(define_insn "*aarch64_get_lane_zero_extend" +- [(set (match_operand:GPI 0 "register_operand" "=r") +- (zero_extend:GPI +- (vec_select: +- (match_operand:VDQQH 1 "register_operand" "w") +- (parallel [(match_operand:SI 2 "immediate_operand" "i")]))))] +- "TARGET_SIMD" +- { +- operands[2] = aarch64_endian_lane_rtx (mode, +- INTVAL (operands[2])); +- return "umov\\t%w0, %1.[%2]"; +- } +- [(set_attr "type" "neon_to_gp")] ++ [(set_attr "type" "neon_to_gp")] ++) ++ ++(define_insn "*aarch64_get_lane_zero_extend" ++ [(set (match_operand:GPI 0 "register_operand" "=r") ++ (zero_extend:GPI ++ (vec_select: ++ (match_operand:VDQQH 1 "register_operand" "w") ++ (parallel [(match_operand:SI 2 "immediate_operand" "i")]))))] ++ "TARGET_SIMD" ++ { ++ operands[2] = aarch64_endian_lane_rtx (mode, ++ INTVAL (operands[2])); ++ return "umov\\t%w0, %1.[%2]"; ++ } ++ [(set_attr "type" "neon_to_gp")] + ) + + ;; Lane extraction of a value, neither sign nor zero extension +@@ -3280,9 +3472,9 @@ + + + (define_expand "aarch64_saddl2" +- [(match_operand: 0 "register_operand" "=w") +- (match_operand:VQW 1 "register_operand" "w") +- (match_operand:VQW 2 "register_operand" "w")] ++ [(match_operand: 0 "register_operand") ++ (match_operand:VQW 1 "register_operand") ++ (match_operand:VQW 2 "register_operand")] + "TARGET_SIMD" + { + rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); +@@ -3292,9 +3484,9 @@ + }) + + (define_expand "aarch64_uaddl2" +- [(match_operand: 0 "register_operand" "=w") +- (match_operand:VQW 1 "register_operand" "w") +- (match_operand:VQW 2 "register_operand" "w")] ++ [(match_operand: 0 "register_operand") ++ (match_operand:VQW 1 "register_operand") ++ (match_operand:VQW 2 "register_operand")] + "TARGET_SIMD" + { + rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); +@@ -3304,9 +3496,9 @@ + }) + + (define_expand "aarch64_ssubl2" +- [(match_operand: 0 "register_operand" "=w") +- (match_operand:VQW 1 "register_operand" "w") +- (match_operand:VQW 2 "register_operand" "w")] ++ [(match_operand: 0 "register_operand") ++ (match_operand:VQW 1 "register_operand") ++ (match_operand:VQW 2 "register_operand")] + "TARGET_SIMD" + { + rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); +@@ -3316,9 +3508,9 @@ + }) + + (define_expand "aarch64_usubl2" +- [(match_operand: 0 "register_operand" "=w") +- (match_operand:VQW 1 "register_operand" "w") +- (match_operand:VQW 2 "register_operand" "w")] ++ [(match_operand: 0 "register_operand") ++ (match_operand:VQW 1 "register_operand") ++ (match_operand:VQW 2 "register_operand")] + "TARGET_SIMD" + { + rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); +@@ -3341,10 +3533,10 @@ + ;; w. 
+ + (define_expand "widen_ssum3" +- [(set (match_operand: 0 "register_operand" "") ++ [(set (match_operand: 0 "register_operand") + (plus: (sign_extend: +- (match_operand:VQW 1 "register_operand" "")) +- (match_operand: 2 "register_operand" "")))] ++ (match_operand:VQW 1 "register_operand")) ++ (match_operand: 2 "register_operand")))] + "TARGET_SIMD" + { + rtx p = aarch64_simd_vect_par_cnst_half (mode, , false); +@@ -3358,10 +3550,10 @@ + ) + + (define_expand "widen_ssum3" +- [(set (match_operand: 0 "register_operand" "") ++ [(set (match_operand: 0 "register_operand") + (plus: (sign_extend: +- (match_operand:VD_BHSI 1 "register_operand" "")) +- (match_operand: 2 "register_operand" "")))] ++ (match_operand:VD_BHSI 1 "register_operand")) ++ (match_operand: 2 "register_operand")))] + "TARGET_SIMD" + { + emit_insn (gen_aarch64_saddw (operands[0], operands[2], operands[1])); +@@ -3369,10 +3561,10 @@ + }) + + (define_expand "widen_usum3" +- [(set (match_operand: 0 "register_operand" "") ++ [(set (match_operand: 0 "register_operand") + (plus: (zero_extend: +- (match_operand:VQW 1 "register_operand" "")) +- (match_operand: 2 "register_operand" "")))] ++ (match_operand:VQW 1 "register_operand")) ++ (match_operand: 2 "register_operand")))] + "TARGET_SIMD" + { + rtx p = aarch64_simd_vect_par_cnst_half (mode, , false); +@@ -3386,10 +3578,10 @@ + ) + + (define_expand "widen_usum3" +- [(set (match_operand: 0 "register_operand" "") ++ [(set (match_operand: 0 "register_operand") + (plus: (zero_extend: +- (match_operand:VD_BHSI 1 "register_operand" "")) +- (match_operand: 2 "register_operand" "")))] ++ (match_operand:VD_BHSI 1 "register_operand")) ++ (match_operand: 2 "register_operand")))] + "TARGET_SIMD" + { + emit_insn (gen_aarch64_uaddw (operands[0], operands[2], operands[1])); +@@ -3467,9 +3659,9 @@ + ) + + (define_expand "aarch64_saddw2" +- [(match_operand: 0 "register_operand" "=w") +- (match_operand: 1 "register_operand" "w") +- (match_operand:VQW 2 "register_operand" "w")] ++ [(match_operand: 0 "register_operand") ++ (match_operand: 1 "register_operand") ++ (match_operand:VQW 2 "register_operand")] + "TARGET_SIMD" + { + rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); +@@ -3479,9 +3671,9 @@ + }) + + (define_expand "aarch64_uaddw2" +- [(match_operand: 0 "register_operand" "=w") +- (match_operand: 1 "register_operand" "w") +- (match_operand:VQW 2 "register_operand" "w")] ++ [(match_operand: 0 "register_operand") ++ (match_operand: 1 "register_operand") ++ (match_operand:VQW 2 "register_operand")] + "TARGET_SIMD" + { + rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); +@@ -3492,9 +3684,9 @@ + + + (define_expand "aarch64_ssubw2" +- [(match_operand: 0 "register_operand" "=w") +- (match_operand: 1 "register_operand" "w") +- (match_operand:VQW 2 "register_operand" "w")] ++ [(match_operand: 0 "register_operand") ++ (match_operand: 1 "register_operand") ++ (match_operand:VQW 2 "register_operand")] + "TARGET_SIMD" + { + rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); +@@ -3504,9 +3696,9 @@ + }) + + (define_expand "aarch64_usubw2" +- [(match_operand: 0 "register_operand" "=w") +- (match_operand: 1 "register_operand" "w") +- (match_operand:VQW 2 "register_operand" "w")] ++ [(match_operand: 0 "register_operand") ++ (match_operand: 1 "register_operand") ++ (match_operand:VQW 2 "register_operand")] + "TARGET_SIMD" + { + rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); +@@ -4039,10 +4231,10 @@ + ) + + (define_expand "aarch64_sqdmlal2" +- [(match_operand: 0 "register_operand" "=w") +- 
(match_operand: 1 "register_operand" "w") +- (match_operand:VQ_HSI 2 "register_operand" "w") +- (match_operand:VQ_HSI 3 "register_operand" "w")] ++ [(match_operand: 0 "register_operand") ++ (match_operand: 1 "register_operand") ++ (match_operand:VQ_HSI 2 "register_operand") ++ (match_operand:VQ_HSI 3 "register_operand")] + "TARGET_SIMD" + { + rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); +@@ -4052,10 +4244,10 @@ + }) + + (define_expand "aarch64_sqdmlsl2" +- [(match_operand: 0 "register_operand" "=w") +- (match_operand: 1 "register_operand" "w") +- (match_operand:VQ_HSI 2 "register_operand" "w") +- (match_operand:VQ_HSI 3 "register_operand" "w")] ++ [(match_operand: 0 "register_operand") ++ (match_operand: 1 "register_operand") ++ (match_operand:VQ_HSI 2 "register_operand") ++ (match_operand:VQ_HSI 3 "register_operand")] + "TARGET_SIMD" + { + rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); +@@ -4119,11 +4311,11 @@ + ) + + (define_expand "aarch64_sqdmlal2_lane" +- [(match_operand: 0 "register_operand" "=w") +- (match_operand: 1 "register_operand" "w") +- (match_operand:VQ_HSI 2 "register_operand" "w") +- (match_operand: 3 "register_operand" "") +- (match_operand:SI 4 "immediate_operand" "i")] ++ [(match_operand: 0 "register_operand") ++ (match_operand: 1 "register_operand") ++ (match_operand:VQ_HSI 2 "register_operand") ++ (match_operand: 3 "register_operand") ++ (match_operand:SI 4 "immediate_operand")] + "TARGET_SIMD" + { + rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); +@@ -4134,11 +4326,11 @@ + }) + + (define_expand "aarch64_sqdmlal2_laneq" +- [(match_operand: 0 "register_operand" "=w") +- (match_operand: 1 "register_operand" "w") +- (match_operand:VQ_HSI 2 "register_operand" "w") +- (match_operand: 3 "register_operand" "") +- (match_operand:SI 4 "immediate_operand" "i")] ++ [(match_operand: 0 "register_operand") ++ (match_operand: 1 "register_operand") ++ (match_operand:VQ_HSI 2 "register_operand") ++ (match_operand: 3 "register_operand") ++ (match_operand:SI 4 "immediate_operand")] + "TARGET_SIMD" + { + rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); +@@ -4149,11 +4341,11 @@ + }) + + (define_expand "aarch64_sqdmlsl2_lane" +- [(match_operand: 0 "register_operand" "=w") +- (match_operand: 1 "register_operand" "w") +- (match_operand:VQ_HSI 2 "register_operand" "w") +- (match_operand: 3 "register_operand" "") +- (match_operand:SI 4 "immediate_operand" "i")] ++ [(match_operand: 0 "register_operand") ++ (match_operand: 1 "register_operand") ++ (match_operand:VQ_HSI 2 "register_operand") ++ (match_operand: 3 "register_operand") ++ (match_operand:SI 4 "immediate_operand")] + "TARGET_SIMD" + { + rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); +@@ -4164,11 +4356,11 @@ + }) + + (define_expand "aarch64_sqdmlsl2_laneq" +- [(match_operand: 0 "register_operand" "=w") +- (match_operand: 1 "register_operand" "w") +- (match_operand:VQ_HSI 2 "register_operand" "w") +- (match_operand: 3 "register_operand" "") +- (match_operand:SI 4 "immediate_operand" "i")] ++ [(match_operand: 0 "register_operand") ++ (match_operand: 1 "register_operand") ++ (match_operand:VQ_HSI 2 "register_operand") ++ (match_operand: 3 "register_operand") ++ (match_operand:SI 4 "immediate_operand")] + "TARGET_SIMD" + { + rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); +@@ -4198,10 +4390,10 @@ + ) + + (define_expand "aarch64_sqdmlal2_n" +- [(match_operand: 0 "register_operand" "=w") +- (match_operand: 1 "register_operand" "w") +- (match_operand:VQ_HSI 2 "register_operand" "w") +- 
(match_operand: 3 "register_operand" "w")] ++ [(match_operand: 0 "register_operand") ++ (match_operand: 1 "register_operand") ++ (match_operand:VQ_HSI 2 "register_operand") ++ (match_operand: 3 "register_operand")] + "TARGET_SIMD" + { + rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); +@@ -4212,10 +4404,10 @@ + }) + + (define_expand "aarch64_sqdmlsl2_n" +- [(match_operand: 0 "register_operand" "=w") +- (match_operand: 1 "register_operand" "w") +- (match_operand:VQ_HSI 2 "register_operand" "w") +- (match_operand: 3 "register_operand" "w")] ++ [(match_operand: 0 "register_operand") ++ (match_operand: 1 "register_operand") ++ (match_operand:VQ_HSI 2 "register_operand") ++ (match_operand: 3 "register_operand")] + "TARGET_SIMD" + { + rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); +@@ -4367,9 +4559,9 @@ + ) + + (define_expand "aarch64_sqdmull2" +- [(match_operand: 0 "register_operand" "=w") +- (match_operand:VQ_HSI 1 "register_operand" "w") +- (match_operand:VQ_HSI 2 "register_operand" "w")] ++ [(match_operand: 0 "register_operand") ++ (match_operand:VQ_HSI 1 "register_operand") ++ (match_operand:VQ_HSI 2 "register_operand")] + "TARGET_SIMD" + { + rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); +@@ -4427,10 +4619,10 @@ + ) + + (define_expand "aarch64_sqdmull2_lane" +- [(match_operand: 0 "register_operand" "=w") +- (match_operand:VQ_HSI 1 "register_operand" "w") +- (match_operand: 2 "register_operand" "") +- (match_operand:SI 3 "immediate_operand" "i")] ++ [(match_operand: 0 "register_operand") ++ (match_operand:VQ_HSI 1 "register_operand") ++ (match_operand: 2 "register_operand") ++ (match_operand:SI 3 "immediate_operand")] + "TARGET_SIMD" + { + rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); +@@ -4441,10 +4633,10 @@ + }) + + (define_expand "aarch64_sqdmull2_laneq" +- [(match_operand: 0 "register_operand" "=w") +- (match_operand:VQ_HSI 1 "register_operand" "w") +- (match_operand: 2 "register_operand" "") +- (match_operand:SI 3 "immediate_operand" "i")] ++ [(match_operand: 0 "register_operand") ++ (match_operand:VQ_HSI 1 "register_operand") ++ (match_operand: 2 "register_operand") ++ (match_operand:SI 3 "immediate_operand")] + "TARGET_SIMD" + { + rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); +@@ -4475,9 +4667,9 @@ + ) + + (define_expand "aarch64_sqdmull2_n" +- [(match_operand: 0 "register_operand" "=w") +- (match_operand:VQ_HSI 1 "register_operand" "w") +- (match_operand: 2 "register_operand" "w")] ++ [(match_operand: 0 "register_operand") ++ (match_operand:VQ_HSI 1 "register_operand") ++ (match_operand: 2 "register_operand")] + "TARGET_SIMD" + { + rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); +@@ -4879,8 +5071,8 @@ + ;; sqrt + + (define_expand "sqrt2" +- [(set (match_operand:VHSDF 0 "register_operand" "=w") +- (sqrt:VHSDF (match_operand:VHSDF 1 "register_operand" "w")))] ++ [(set (match_operand:VHSDF 0 "register_operand") ++ (sqrt:VHSDF (match_operand:VHSDF 1 "register_operand")))] + "TARGET_SIMD" + { + if (aarch64_emit_approx_sqrt (operands[0], operands[1], false)) +@@ -4933,8 +5125,8 @@ + ) + + (define_expand "vec_load_lanesoi" +- [(set (match_operand:OI 0 "register_operand" "=w") +- (unspec:OI [(match_operand:OI 1 "aarch64_simd_struct_operand" "Utv") ++ [(set (match_operand:OI 0 "register_operand") ++ (unspec:OI [(match_operand:OI 1 "aarch64_simd_struct_operand") + (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_LD2))] + "TARGET_SIMD" +@@ -4977,8 +5169,8 @@ + ) + + (define_expand "vec_store_lanesoi" +- [(set (match_operand:OI 0 
"aarch64_simd_struct_operand" "=Utv") +- (unspec:OI [(match_operand:OI 1 "register_operand" "w") ++ [(set (match_operand:OI 0 "aarch64_simd_struct_operand") ++ (unspec:OI [(match_operand:OI 1 "register_operand") + (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_ST2))] + "TARGET_SIMD" +@@ -5031,8 +5223,8 @@ + ) + + (define_expand "vec_load_lanesci" +- [(set (match_operand:CI 0 "register_operand" "=w") +- (unspec:CI [(match_operand:CI 1 "aarch64_simd_struct_operand" "Utv") ++ [(set (match_operand:CI 0 "register_operand") ++ (unspec:CI [(match_operand:CI 1 "aarch64_simd_struct_operand") + (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_LD3))] + "TARGET_SIMD" +@@ -5075,8 +5267,8 @@ + ) + + (define_expand "vec_store_lanesci" +- [(set (match_operand:CI 0 "aarch64_simd_struct_operand" "=Utv") +- (unspec:CI [(match_operand:CI 1 "register_operand" "w") ++ [(set (match_operand:CI 0 "aarch64_simd_struct_operand") ++ (unspec:CI [(match_operand:CI 1 "register_operand") + (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_ST3))] + "TARGET_SIMD" +@@ -5129,8 +5321,8 @@ + ) + + (define_expand "vec_load_lanesxi" +- [(set (match_operand:XI 0 "register_operand" "=w") +- (unspec:XI [(match_operand:XI 1 "aarch64_simd_struct_operand" "Utv") ++ [(set (match_operand:XI 0 "register_operand") ++ (unspec:XI [(match_operand:XI 1 "aarch64_simd_struct_operand") + (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_LD4))] + "TARGET_SIMD" +@@ -5173,8 +5365,8 @@ + ) + + (define_expand "vec_store_lanesxi" +- [(set (match_operand:XI 0 "aarch64_simd_struct_operand" "=Utv") +- (unspec:XI [(match_operand:XI 1 "register_operand" "w") ++ [(set (match_operand:XI 0 "aarch64_simd_struct_operand") ++ (unspec:XI [(match_operand:XI 1 "register_operand") + (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_ST4))] + "TARGET_SIMD" +@@ -5219,8 +5411,8 @@ + ;; Reload patterns for AdvSIMD register list operands. + + (define_expand "mov" +- [(set (match_operand:VSTRUCT 0 "nonimmediate_operand" "") +- (match_operand:VSTRUCT 1 "general_operand" ""))] ++ [(set (match_operand:VSTRUCT 0 "nonimmediate_operand") ++ (match_operand:VSTRUCT 1 "general_operand"))] + "TARGET_SIMD" + { + if (can_create_pseudo_p ()) +@@ -5232,8 +5424,8 @@ + + + (define_expand "aarch64_ld1x3" +- [(match_operand:CI 0 "register_operand" "=w") +- (match_operand:DI 1 "register_operand" "r") ++ [(match_operand:CI 0 "register_operand") ++ (match_operand:DI 1 "register_operand") + (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + "TARGET_SIMD" + { +@@ -5252,9 +5444,31 @@ + [(set_attr "type" "neon_load1_3reg")] + ) + ++(define_expand "aarch64_ld1x4" ++ [(match_operand:XI 0 "register_operand" "=w") ++ (match_operand:DI 1 "register_operand" "r") ++ (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] ++ "TARGET_SIMD" ++{ ++ rtx mem = gen_rtx_MEM (XImode, operands[1]); ++ emit_insn (gen_aarch64_ld1_x4_ (operands[0], mem)); ++ DONE; ++}) ++ ++(define_insn "aarch64_ld1_x4_" ++ [(set (match_operand:XI 0 "register_operand" "=w") ++ (unspec:XI ++ [(match_operand:XI 1 "aarch64_simd_struct_operand" "Utv") ++ (unspec:VALLDIF [(const_int 4)] UNSPEC_VSTRUCTDUMMY)] ++ UNSPEC_LD1))] ++ "TARGET_SIMD" ++ "ld1\\t{%S0. 
- %V0.}, %1" ++ [(set_attr "type" "neon_load1_4reg")] ++) ++ + (define_expand "aarch64_st1x2" +- [(match_operand:DI 0 "register_operand" "") +- (match_operand:OI 1 "register_operand" "") ++ [(match_operand:DI 0 "register_operand") ++ (match_operand:OI 1 "register_operand") + (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + "TARGET_SIMD" + { +@@ -5274,8 +5488,8 @@ + ) + + (define_expand "aarch64_st1x3" +- [(match_operand:DI 0 "register_operand" "") +- (match_operand:CI 1 "register_operand" "") ++ [(match_operand:DI 0 "register_operand") ++ (match_operand:CI 1 "register_operand") + (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + "TARGET_SIMD" + { +@@ -5294,6 +5508,28 @@ + [(set_attr "type" "neon_store1_3reg")] + ) + ++(define_expand "aarch64_st1x4" ++ [(match_operand:DI 0 "register_operand" "") ++ (match_operand:XI 1 "register_operand" "") ++ (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] ++ "TARGET_SIMD" ++{ ++ rtx mem = gen_rtx_MEM (XImode, operands[0]); ++ emit_insn (gen_aarch64_st1_x4_ (mem, operands[1])); ++ DONE; ++}) ++ ++(define_insn "aarch64_st1_x4_" ++ [(set (match_operand:XI 0 "aarch64_simd_struct_operand" "=Utv") ++ (unspec:XI ++ [(match_operand:XI 1 "register_operand" "w") ++ (unspec:VALLDIF [(const_int 4)] UNSPEC_VSTRUCTDUMMY)] ++ UNSPEC_ST1))] ++ "TARGET_SIMD" ++ "st1\\t{%S1. - %V1.}, %0" ++ [(set_attr "type" "neon_store1_4reg")] ++) ++ + (define_insn "*aarch64_mov" + [(set (match_operand:VSTRUCT 0 "aarch64_simd_nonimmediate_operand" "=w,Utv,w") + (match_operand:VSTRUCT 1 "aarch64_simd_general_operand" " w,w,Utv"))] +@@ -5427,8 +5663,8 @@ + }) + + (define_expand "aarch64_ldr" +- [(match_operand:VSTRUCT 0 "register_operand" "=w") +- (match_operand:DI 1 "register_operand" "w") ++ [(match_operand:VSTRUCT 0 "register_operand") ++ (match_operand:DI 1 "register_operand") + (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + "TARGET_SIMD" + { +@@ -5502,8 +5738,8 @@ + ) + + (define_expand "aarch64_ld" +- [(match_operand:VSTRUCT 0 "register_operand" "=w") +- (match_operand:DI 1 "register_operand" "r") ++ [(match_operand:VSTRUCT 0 "register_operand") ++ (match_operand:DI 1 "register_operand") + (unspec:VDC [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + "TARGET_SIMD" + { +@@ -5530,8 +5766,8 @@ + }) + + (define_expand "aarch64_ld" +- [(match_operand:VSTRUCT 0 "register_operand" "=w") +- (match_operand:DI 1 "register_operand" "r") ++ [(match_operand:VSTRUCT 0 "register_operand") ++ (match_operand:DI 1 "register_operand") + (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + "TARGET_SIMD" + { +@@ -5543,8 +5779,8 @@ + }) + + (define_expand "aarch64_ld1x2" +- [(match_operand:OI 0 "register_operand" "=w") +- (match_operand:DI 1 "register_operand" "r") ++ [(match_operand:OI 0 "register_operand") ++ (match_operand:DI 1 "register_operand") + (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + "TARGET_SIMD" + { +@@ -5556,8 +5792,8 @@ + }) + + (define_expand "aarch64_ld1x2" +- [(match_operand:OI 0 "register_operand" "=w") +- (match_operand:DI 1 "register_operand" "r") ++ [(match_operand:OI 0 "register_operand") ++ (match_operand:DI 1 "register_operand") + (unspec:VDC [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + "TARGET_SIMD" + { +@@ -5570,10 +5806,10 @@ + + + (define_expand "aarch64_ld_lane" +- [(match_operand:VSTRUCT 0 "register_operand" "=w") +- (match_operand:DI 1 "register_operand" "w") +- (match_operand:VSTRUCT 2 "register_operand" "0") +- (match_operand:SI 3 "immediate_operand" "i") ++ [(match_operand:VSTRUCT 0 "register_operand") ++ (match_operand:DI 1 "register_operand") ++ 
(match_operand:VSTRUCT 2 "register_operand") ++ (match_operand:SI 3 "immediate_operand") + (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + "TARGET_SIMD" + { +@@ -5593,9 +5829,9 @@ + ;; D-register list. + + (define_expand "aarch64_get_dreg" +- [(match_operand:VDC 0 "register_operand" "=w") +- (match_operand:VSTRUCT 1 "register_operand" "w") +- (match_operand:SI 2 "immediate_operand" "i")] ++ [(match_operand:VDC 0 "register_operand") ++ (match_operand:VSTRUCT 1 "register_operand") ++ (match_operand:SI 2 "immediate_operand")] + "TARGET_SIMD" + { + int part = INTVAL (operands[2]); +@@ -5610,9 +5846,9 @@ + ;; Q-register list. + + (define_expand "aarch64_get_qreg" +- [(match_operand:VQ 0 "register_operand" "=w") +- (match_operand:VSTRUCT 1 "register_operand" "w") +- (match_operand:SI 2 "immediate_operand" "i")] ++ [(match_operand:VQ 0 "register_operand") ++ (match_operand:VSTRUCT 1 "register_operand") ++ (match_operand:SI 2 "immediate_operand")] + "TARGET_SIMD" + { + int part = INTVAL (operands[2]); +@@ -5749,13 +5985,13 @@ + ;; This instruction's pattern is generated directly by + ;; aarch64_expand_vec_perm_const, so any changes to the pattern would + ;; need corresponding changes there. +-(define_insn "aarch64_" ++(define_insn "aarch64_" + [(set (match_operand:VALL_F16 0 "register_operand" "=w") + (unspec:VALL_F16 [(match_operand:VALL_F16 1 "register_operand" "w") + (match_operand:VALL_F16 2 "register_operand" "w")] + PERMUTE))] + "TARGET_SIMD" +- "\\t%0., %1., %2." ++ "\\t%0., %1., %2." + [(set_attr "type" "neon_permute")] + ) + +@@ -5851,8 +6087,8 @@ + ) + + (define_expand "aarch64_st" +- [(match_operand:DI 0 "register_operand" "r") +- (match_operand:VSTRUCT 1 "register_operand" "w") ++ [(match_operand:DI 0 "register_operand") ++ (match_operand:VSTRUCT 1 "register_operand") + (unspec:VDC [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + "TARGET_SIMD" + { +@@ -5864,8 +6100,8 @@ + }) + + (define_expand "aarch64_st" +- [(match_operand:DI 0 "register_operand" "r") +- (match_operand:VSTRUCT 1 "register_operand" "w") ++ [(match_operand:DI 0 "register_operand") ++ (match_operand:VSTRUCT 1 "register_operand") + (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + "TARGET_SIMD" + { +@@ -5877,8 +6113,8 @@ + }) + + (define_expand "aarch64_st_lane" +- [(match_operand:DI 0 "register_operand" "r") +- (match_operand:VSTRUCT 1 "register_operand" "w") ++ [(match_operand:DI 0 "register_operand") ++ (match_operand:VSTRUCT 1 "register_operand") + (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY) + (match_operand:SI 2 "immediate_operand")] + "TARGET_SIMD" +@@ -5914,10 +6150,10 @@ + ;; extend them in arm_neon.h and insert the resulting Q-regs. + + (define_expand "aarch64_set_qreg" +- [(match_operand:VSTRUCT 0 "register_operand" "+w") +- (match_operand:VSTRUCT 1 "register_operand" "0") +- (match_operand:VQ 2 "register_operand" "w") +- (match_operand:SI 3 "immediate_operand" "i")] ++ [(match_operand:VSTRUCT 0 "register_operand") ++ (match_operand:VSTRUCT 1 "register_operand") ++ (match_operand:VQ 2 "register_operand") ++ (match_operand:SI 3 "immediate_operand")] + "TARGET_SIMD" + { + int part = INTVAL (operands[3]); +@@ -5932,7 +6168,7 @@ + ;; Standard pattern name vec_init. 
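;;
;; A rough sketch of the operation this standard pattern provides, written
;; as a hypothetical C helper for illustration only:
;;
;;   void init_v4si (int out[4], int a, int b, int c, int d)
;;   {
;;     out[0] = a; out[1] = b; out[2] = c; out[3] = d;
;;   }
;;
;; aarch64_expand_vector_init then chooses the concrete DUP/INS (or load)
;; sequence used to build the vector.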
+ + (define_expand "vec_init" +- [(match_operand:VALL_F16 0 "register_operand" "") ++ [(match_operand:VALL_F16 0 "register_operand") + (match_operand 1 "" "")] + "TARGET_SIMD" + { +@@ -5941,7 +6177,7 @@ + }) + + (define_expand "vec_init" +- [(match_operand:VQ_NO2E 0 "register_operand" "") ++ [(match_operand:VQ_NO2E 0 "register_operand") + (match_operand 1 "" "")] + "TARGET_SIMD" + { +@@ -6020,9 +6256,9 @@ + ;; Standard pattern name vec_extract. + + (define_expand "vec_extract" +- [(match_operand: 0 "aarch64_simd_nonimmediate_operand" "") +- (match_operand:VALL_F16 1 "register_operand" "") +- (match_operand:SI 2 "immediate_operand" "")] ++ [(match_operand: 0 "aarch64_simd_nonimmediate_operand") ++ (match_operand:VALL_F16 1 "register_operand") ++ (match_operand:SI 2 "immediate_operand")] + "TARGET_SIMD" + { + emit_insn +@@ -6063,56 +6299,23 @@ + + (define_insn "aarch64_crypto_aesv16qi" + [(set (match_operand:V16QI 0 "register_operand" "=w") +- (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "%0") +- (match_operand:V16QI 2 "register_operand" "w")] ++ (unspec:V16QI ++ [(xor:V16QI ++ (match_operand:V16QI 1 "register_operand" "%0") ++ (match_operand:V16QI 2 "register_operand" "w"))] + CRYPTO_AES))] + "TARGET_SIMD && TARGET_AES" + "aes\\t%0.16b, %2.16b" + [(set_attr "type" "crypto_aese")] + ) + +-(define_insn "*aarch64_crypto_aesv16qi_xor_combine" +- [(set (match_operand:V16QI 0 "register_operand" "=w") +- (unspec:V16QI [(xor:V16QI +- (match_operand:V16QI 1 "register_operand" "%0") +- (match_operand:V16QI 2 "register_operand" "w")) +- (match_operand:V16QI 3 "aarch64_simd_imm_zero" "")] +- CRYPTO_AES))] +- "TARGET_SIMD && TARGET_AES" +- "aes\\t%0.16b, %2.16b" +- [(set_attr "type" "crypto_aese")] +-) +- +-(define_insn "*aarch64_crypto_aesv16qi_xor_combine" +- [(set (match_operand:V16QI 0 "register_operand" "=w") +- (unspec:V16QI [(match_operand:V16QI 3 "aarch64_simd_imm_zero" "") +- (xor:V16QI (match_operand:V16QI 1 "register_operand" "%0") +- (match_operand:V16QI 2 "register_operand" "w"))] +- CRYPTO_AES))] +- "TARGET_SIMD && TARGET_AES" +- "aes\\t%0.16b, %2.16b" +- [(set_attr "type" "crypto_aese")] +-) +- +-;; When AES/AESMC fusion is enabled we want the register allocation to +-;; look like: +-;; AESE Vn, _ +-;; AESMC Vn, Vn +-;; So prefer to tie operand 1 to operand 0 when fusing. +- + (define_insn "aarch64_crypto_aesv16qi" +- [(set (match_operand:V16QI 0 "register_operand" "=w,w") +- (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "0,w")] ++ [(set (match_operand:V16QI 0 "register_operand" "=w") ++ (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "w")] + CRYPTO_AESMC))] + "TARGET_SIMD && TARGET_AES" + "aes\\t%0.16b, %1.16b" +- [(set_attr "type" "crypto_aesmc") +- (set_attr_alternative "enabled" +- [(if_then_else (match_test +- "aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)") +- (const_string "yes" ) +- (const_string "no")) +- (const_string "yes")])] ++ [(set_attr "type" "crypto_aesmc")] + ) + + ;; When AESE/AESMC fusion is enabled we really want to keep the two together +@@ -6121,12 +6324,14 @@ + ;; Mash the two together during combine. 
+ + (define_insn "*aarch64_crypto_aese_fused" +- [(set (match_operand:V16QI 0 "register_operand" "=&w") ++ [(set (match_operand:V16QI 0 "register_operand" "=w") + (unspec:V16QI + [(unspec:V16QI +- [(match_operand:V16QI 1 "register_operand" "0") +- (match_operand:V16QI 2 "register_operand" "w")] UNSPEC_AESE) +- ] UNSPEC_AESMC))] ++ [(xor:V16QI ++ (match_operand:V16QI 1 "register_operand" "%0") ++ (match_operand:V16QI 2 "register_operand" "w"))] ++ UNSPEC_AESE)] ++ UNSPEC_AESMC))] + "TARGET_SIMD && TARGET_AES + && aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)" + "aese\\t%0.16b, %2.16b\;aesmc\\t%0.16b, %0.16b" +@@ -6140,12 +6345,14 @@ + ;; Mash the two together during combine. + + (define_insn "*aarch64_crypto_aesd_fused" +- [(set (match_operand:V16QI 0 "register_operand" "=&w") ++ [(set (match_operand:V16QI 0 "register_operand" "=w") + (unspec:V16QI + [(unspec:V16QI +- [(match_operand:V16QI 1 "register_operand" "0") +- (match_operand:V16QI 2 "register_operand" "w")] UNSPEC_AESD) +- ] UNSPEC_AESIMC))] ++ [(xor:V16QI ++ (match_operand:V16QI 1 "register_operand" "%0") ++ (match_operand:V16QI 2 "register_operand" "w"))] ++ UNSPEC_AESD)] ++ UNSPEC_AESIMC))] + "TARGET_SIMD && TARGET_AES + && aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)" + "aesd\\t%0.16b, %2.16b\;aesimc\\t%0.16b, %0.16b" +@@ -6397,11 +6604,11 @@ + ;; fp16fml + + (define_expand "aarch64_fmll_low" +- [(set (match_operand:VDQSF 0 "register_operand" "=w") ++ [(set (match_operand:VDQSF 0 "register_operand") + (unspec:VDQSF +- [(match_operand:VDQSF 1 "register_operand" "0") +- (match_operand: 2 "register_operand" "w") +- (match_operand: 3 "register_operand" "w")] ++ [(match_operand:VDQSF 1 "register_operand") ++ (match_operand: 2 "register_operand") ++ (match_operand: 3 "register_operand")] + VFMLA16_LOW))] + "TARGET_F16FML" + { +@@ -6420,11 +6627,11 @@ + }) + + (define_expand "aarch64_fmll_high" +- [(set (match_operand:VDQSF 0 "register_operand" "=w") ++ [(set (match_operand:VDQSF 0 "register_operand") + (unspec:VDQSF +- [(match_operand:VDQSF 1 "register_operand" "0") +- (match_operand: 2 "register_operand" "w") +- (match_operand: 3 "register_operand" "w")] ++ [(match_operand:VDQSF 1 "register_operand") ++ (match_operand: 2 "register_operand") ++ (match_operand: 3 "register_operand")] + VFMLA16_HIGH))] + "TARGET_F16FML" + { +@@ -6510,11 +6717,11 @@ + ) + + (define_expand "aarch64_fmll_lane_lowv2sf" +- [(set (match_operand:V2SF 0 "register_operand" "") +- (unspec:V2SF [(match_operand:V2SF 1 "register_operand" "") +- (match_operand:V4HF 2 "register_operand" "") +- (match_operand:V4HF 3 "register_operand" "") +- (match_operand:SI 4 "aarch64_imm2" "")] ++ [(set (match_operand:V2SF 0 "register_operand") ++ (unspec:V2SF [(match_operand:V2SF 1 "register_operand") ++ (match_operand:V4HF 2 "register_operand") ++ (match_operand:V4HF 3 "register_operand") ++ (match_operand:SI 4 "aarch64_imm2")] + VFMLA16_LOW))] + "TARGET_F16FML" + { +@@ -6531,11 +6738,11 @@ + ) + + (define_expand "aarch64_fmll_lane_highv2sf" +- [(set (match_operand:V2SF 0 "register_operand" "") +- (unspec:V2SF [(match_operand:V2SF 1 "register_operand" "") +- (match_operand:V4HF 2 "register_operand" "") +- (match_operand:V4HF 3 "register_operand" "") +- (match_operand:SI 4 "aarch64_imm2" "")] ++ [(set (match_operand:V2SF 0 "register_operand") ++ (unspec:V2SF [(match_operand:V2SF 1 "register_operand") ++ (match_operand:V4HF 2 "register_operand") ++ (match_operand:V4HF 3 "register_operand") ++ (match_operand:SI 4 "aarch64_imm2")] + VFMLA16_HIGH))] + "TARGET_F16FML" + { +@@ 
-6625,11 +6832,11 @@ + ) + + (define_expand "aarch64_fmllq_laneq_lowv4sf" +- [(set (match_operand:V4SF 0 "register_operand" "") +- (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "") +- (match_operand:V8HF 2 "register_operand" "") +- (match_operand:V8HF 3 "register_operand" "") +- (match_operand:SI 4 "aarch64_lane_imm3" "")] ++ [(set (match_operand:V4SF 0 "register_operand") ++ (unspec:V4SF [(match_operand:V4SF 1 "register_operand") ++ (match_operand:V8HF 2 "register_operand") ++ (match_operand:V8HF 3 "register_operand") ++ (match_operand:SI 4 "aarch64_lane_imm3")] + VFMLA16_LOW))] + "TARGET_F16FML" + { +@@ -6645,11 +6852,11 @@ + }) + + (define_expand "aarch64_fmllq_laneq_highv4sf" +- [(set (match_operand:V4SF 0 "register_operand" "") +- (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "") +- (match_operand:V8HF 2 "register_operand" "") +- (match_operand:V8HF 3 "register_operand" "") +- (match_operand:SI 4 "aarch64_lane_imm3" "")] ++ [(set (match_operand:V4SF 0 "register_operand") ++ (unspec:V4SF [(match_operand:V4SF 1 "register_operand") ++ (match_operand:V8HF 2 "register_operand") ++ (match_operand:V8HF 3 "register_operand") ++ (match_operand:SI 4 "aarch64_lane_imm3")] + VFMLA16_HIGH))] + "TARGET_F16FML" + { +@@ -6739,11 +6946,11 @@ + ) + + (define_expand "aarch64_fmll_laneq_lowv2sf" +- [(set (match_operand:V2SF 0 "register_operand" "") +- (unspec:V2SF [(match_operand:V2SF 1 "register_operand" "") +- (match_operand:V4HF 2 "register_operand" "") +- (match_operand:V8HF 3 "register_operand" "") +- (match_operand:SI 4 "aarch64_lane_imm3" "")] ++ [(set (match_operand:V2SF 0 "register_operand") ++ (unspec:V2SF [(match_operand:V2SF 1 "register_operand") ++ (match_operand:V4HF 2 "register_operand") ++ (match_operand:V8HF 3 "register_operand") ++ (match_operand:SI 4 "aarch64_lane_imm3")] + VFMLA16_LOW))] + "TARGET_F16FML" + { +@@ -6760,11 +6967,11 @@ + }) + + (define_expand "aarch64_fmll_laneq_highv2sf" +- [(set (match_operand:V2SF 0 "register_operand" "") +- (unspec:V2SF [(match_operand:V2SF 1 "register_operand" "") +- (match_operand:V4HF 2 "register_operand" "") +- (match_operand:V8HF 3 "register_operand" "") +- (match_operand:SI 4 "aarch64_lane_imm3" "")] ++ [(set (match_operand:V2SF 0 "register_operand") ++ (unspec:V2SF [(match_operand:V2SF 1 "register_operand") ++ (match_operand:V4HF 2 "register_operand") ++ (match_operand:V8HF 3 "register_operand") ++ (match_operand:SI 4 "aarch64_lane_imm3")] + VFMLA16_HIGH))] + "TARGET_F16FML" + { +@@ -6855,11 +7062,11 @@ + ) + + (define_expand "aarch64_fmllq_lane_lowv4sf" +- [(set (match_operand:V4SF 0 "register_operand" "") +- (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "") +- (match_operand:V8HF 2 "register_operand" "") +- (match_operand:V4HF 3 "register_operand" "") +- (match_operand:SI 4 "aarch64_imm2" "")] ++ [(set (match_operand:V4SF 0 "register_operand") ++ (unspec:V4SF [(match_operand:V4SF 1 "register_operand") ++ (match_operand:V8HF 2 "register_operand") ++ (match_operand:V4HF 3 "register_operand") ++ (match_operand:SI 4 "aarch64_imm2")] + VFMLA16_LOW))] + "TARGET_F16FML" + { +@@ -6875,11 +7082,11 @@ + }) + + (define_expand "aarch64_fmllq_lane_highv4sf" +- [(set (match_operand:V4SF 0 "register_operand" "") +- (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "") +- (match_operand:V8HF 2 "register_operand" "") +- (match_operand:V4HF 3 "register_operand" "") +- (match_operand:SI 4 "aarch64_imm2" "")] ++ [(set (match_operand:V4SF 0 "register_operand") ++ (unspec:V4SF [(match_operand:V4SF 1 "register_operand") ++ 
(match_operand:V8HF 2 "register_operand") ++ (match_operand:V4HF 3 "register_operand") ++ (match_operand:SI 4 "aarch64_imm2")] + VFMLA16_HIGH))] + "TARGET_F16FML" + { +diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc +new file mode 100644 +index 000000000..b28ded0f5 +--- /dev/null ++++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc +@@ -0,0 +1,2760 @@ ++/* ACLE support for AArch64 SVE (__ARM_FEATURE_SVE intrinsics) ++ Copyright (C) 2018-2019 Free Software Foundation, Inc. ++ ++ This file is part of GCC. ++ ++ GCC is free software; you can redistribute it and/or modify it ++ under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 3, or (at your option) ++ any later version. ++ ++ GCC is distributed in the hope that it will be useful, but ++ WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ General Public License for more details. ++ ++ You should have received a copy of the GNU General Public License ++ along with GCC; see the file COPYING3. If not see ++ . */ ++ ++#include "config.h" ++#include "system.h" ++#include "coretypes.h" ++#include "tm.h" ++#include "tree.h" ++#include "rtl.h" ++#include "tm_p.h" ++#include "memmodel.h" ++#include "insn-codes.h" ++#include "optabs.h" ++#include "recog.h" ++#include "expr.h" ++#include "basic-block.h" ++#include "function.h" ++#include "fold-const.h" ++#include "gimple.h" ++#include "gimple-iterator.h" ++#include "gimplify.h" ++#include "explow.h" ++#include "emit-rtl.h" ++#include "tree-vector-builder.h" ++#include "rtx-vector-builder.h" ++#include "vec-perm-indices.h" ++#include "aarch64-sve-builtins.h" ++#include "aarch64-sve-builtins-shapes.h" ++#include "aarch64-sve-builtins-base.h" ++#include "aarch64-sve-builtins-functions.h" ++ ++using namespace aarch64_sve; ++ ++namespace { ++ ++/* Expand a call to svmad, or svmla after reordering its operands. ++ Make _m forms merge with argument MERGE_ARGNO. */ ++static rtx ++expand_mad (function_expander &e, ++ unsigned int merge_argno = DEFAULT_MERGE_ARGNO) ++{ ++ if (e.pred == PRED_x) ++ { ++ insn_code icode; ++ if (e.type_suffix (0).integer_p) ++ icode = code_for_aarch64_pred_fma (e.vector_mode (0)); ++ else ++ icode = code_for_aarch64_pred (UNSPEC_COND_FMLA, e.vector_mode (0)); ++ return e.use_pred_x_insn (icode); ++ } ++ ++ insn_code icode = e.direct_optab_handler (cond_fma_optab); ++ return e.use_cond_insn (icode, merge_argno); ++} ++ ++/* Expand a call to svmsb, or svmls after reordering its operands. ++ Make _m forms merge with argument MERGE_ARGNO. */ ++static rtx ++expand_msb (function_expander &e, ++ unsigned int merge_argno = DEFAULT_MERGE_ARGNO) ++{ ++ if (e.pred == PRED_x) ++ { ++ insn_code icode; ++ if (e.type_suffix (0).integer_p) ++ icode = code_for_aarch64_pred_fnma (e.vector_mode (0)); ++ else ++ icode = code_for_aarch64_pred (UNSPEC_COND_FMLS, e.vector_mode (0)); ++ return e.use_pred_x_insn (icode); ++ } ++ ++ insn_code icode = e.direct_optab_handler (cond_fnma_optab); ++ return e.use_cond_insn (icode, merge_argno); ++} ++ ++class svabd_impl : public function_base ++{ ++public: ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ /* The integer operations are represented as the subtraction of the ++ minimum from the maximum, with the signedness of the instruction ++ keyed off the signedness of the maximum operation. */ ++ rtx_code max_code = e.type_suffix (0).unsigned_p ? 
UMAX : SMAX; ++ insn_code icode; ++ if (e.pred == PRED_x) ++ { ++ if (e.type_suffix (0).integer_p) ++ icode = code_for_aarch64_pred_abd (max_code, e.vector_mode (0)); ++ else ++ icode = code_for_aarch64_pred_abd (e.vector_mode (0)); ++ return e.use_pred_x_insn (icode); ++ } ++ ++ if (e.type_suffix (0).integer_p) ++ icode = code_for_aarch64_cond_abd (max_code, e.vector_mode (0)); ++ else ++ icode = code_for_aarch64_cond_abd (e.vector_mode (0)); ++ return e.use_cond_insn (icode); ++ } ++}; ++ ++/* Implements svacge, svacgt, svacle and svaclt. */ ++class svac_impl : public function_base ++{ ++public: ++ CONSTEXPR svac_impl (int unspec) : m_unspec (unspec) {} ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ e.add_ptrue_hint (0, e.gp_mode (0)); ++ insn_code icode = code_for_aarch64_pred_fac (m_unspec, e.vector_mode (0)); ++ return e.use_exact_insn (icode); ++ } ++ ++ /* The unspec code for the underlying comparison. */ ++ int m_unspec; ++}; ++ ++class svadda_impl : public function_base ++{ ++public: ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ /* Put the predicate last, as required by mask_fold_left_plus_optab. */ ++ e.rotate_inputs_left (0, 3); ++ machine_mode mode = e.vector_mode (0); ++ insn_code icode = direct_optab_handler (mask_fold_left_plus_optab, mode); ++ return e.use_exact_insn (icode); ++ } ++}; ++ ++/* Implements svadr[bhwd]. */ ++class svadr_bhwd_impl : public function_base ++{ ++public: ++ CONSTEXPR svadr_bhwd_impl (unsigned int shift) : m_shift (shift) {} ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ machine_mode mode = GET_MODE (e.args[0]); ++ if (m_shift == 0) ++ return e.use_exact_insn (code_for_aarch64_adr (mode)); ++ ++ /* Turn the access size into an extra shift argument. */ ++ rtx shift = gen_int_mode (m_shift, GET_MODE_INNER (mode)); ++ e.args.quick_push (expand_vector_broadcast (mode, shift)); ++ return e.use_exact_insn (code_for_aarch64_adr_shift (mode)); ++ } ++ ++ /* How many bits left to shift the vector displacement. */ ++ unsigned int m_shift; ++}; ++ ++class svasrd_impl : public function_base ++{ ++public: ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ return e.use_cond_insn (code_for_cond_asrd (e.vector_mode (0))); ++ } ++}; ++ ++class svbic_impl : public function_base ++{ ++public: ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ /* Convert svbic of a constant into svand of its inverse. */ ++ if (CONST_INT_P (e.args[2])) ++ { ++ machine_mode mode = GET_MODE_INNER (e.vector_mode (0)); ++ e.args[2] = simplify_unary_operation (NOT, mode, e.args[2], mode); ++ return e.map_to_rtx_codes (AND, AND, -1); ++ } ++ ++ if (e.type_suffix_ids[0] == TYPE_SUFFIX_b) ++ { ++ gcc_assert (e.pred == PRED_z); ++ return e.use_exact_insn (CODE_FOR_aarch64_pred_bicvnx16bi_z); ++ } ++ ++ if (e.pred == PRED_x) ++ return e.use_unpred_insn (code_for_aarch64_bic (e.vector_mode (0))); ++ ++ return e.use_cond_insn (code_for_cond_bic (e.vector_mode (0))); ++ } ++}; ++ ++/* Implements svbrkn, svbrkpa and svbrkpb. */ ++class svbrk_binary_impl : public function_base ++{ ++public: ++ CONSTEXPR svbrk_binary_impl (int unspec) : m_unspec (unspec) {} ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ return e.use_exact_insn (code_for_aarch64_brk (m_unspec)); ++ } ++ ++ /* The unspec code associated with the operation. */ ++ int m_unspec; ++}; ++ ++/* Implements svbrka and svbrkb. 
*/ ++class svbrk_unary_impl : public function_base ++{ ++public: ++ CONSTEXPR svbrk_unary_impl (int unspec) : m_unspec (unspec) {} ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ return e.use_cond_insn (code_for_aarch64_brk (m_unspec)); ++ } ++ ++ /* The unspec code associated with the operation. */ ++ int m_unspec; ++}; ++ ++class svcadd_impl : public function_base ++{ ++public: ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ /* Convert the rotation amount into a specific unspec. */ ++ int rot = INTVAL (e.args[3]); ++ e.args.ordered_remove (3); ++ int unspec = (rot == 90 ? UNSPEC_COND_FCADD90 ++ : rot == 270 ? UNSPEC_COND_FCADD270 ++ : (gcc_unreachable (), 0)); ++ return e.map_to_unspecs (-1, -1, unspec); ++ } ++}; ++ ++/* Implements svclasta and svclastb. */ ++class svclast_impl : public quiet ++{ ++public: ++ CONSTEXPR svclast_impl (int unspec) : m_unspec (unspec) {} ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ /* Match the fold_extract_optab order. */ ++ std::swap (e.args[0], e.args[1]); ++ machine_mode mode = e.vector_mode (0); ++ insn_code icode; ++ if (e.mode_suffix_id == MODE_n) ++ icode = code_for_fold_extract (m_unspec, mode); ++ else ++ icode = code_for_aarch64_fold_extract_vector (m_unspec, mode); ++ return e.use_exact_insn (icode); ++ } ++ ++ /* The unspec code associated with the operation. */ ++ int m_unspec; ++}; ++ ++class svcmla_impl : public function_base ++{ ++public: ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ /* Convert the rotation amount into a specific unspec. */ ++ int rot = INTVAL (e.args[4]); ++ e.args.ordered_remove (4); ++ int unspec = (rot == 0 ? UNSPEC_COND_FCMLA ++ : rot == 90 ? UNSPEC_COND_FCMLA90 ++ : rot == 180 ? UNSPEC_COND_FCMLA180 ++ : rot == 270 ? UNSPEC_COND_FCMLA270 ++ : (gcc_unreachable (), 0)); ++ ++ /* Make the operand order the same as the one used by the fma optabs, ++ with the accumulator last. */ ++ e.rotate_inputs_left (1, 4); ++ return e.map_to_unspecs (-1, -1, unspec, 3); ++ } ++}; ++ ++class svcmla_lane_impl : public function_base ++{ ++public: ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ /* Convert the rotation amount into a specific unspec. */ ++ int rot = INTVAL (e.args[4]); ++ e.args.ordered_remove (4); ++ int unspec = (rot == 0 ? UNSPEC_FCMLA ++ : rot == 90 ? UNSPEC_FCMLA90 ++ : rot == 180 ? UNSPEC_FCMLA180 ++ : rot == 270 ? UNSPEC_FCMLA270 ++ : (gcc_unreachable (), 0)); ++ ++ /* Make the operand order the same as the one used by the fma optabs, ++ with the accumulator last. */ ++ e.rotate_inputs_left (0, 4); ++ insn_code icode = code_for_aarch64_lane (unspec, e.vector_mode (0)); ++ return e.use_exact_insn (icode); ++ } ++}; ++ ++/* Implements svcmp (except svcmpuo, which is handled separately). */ ++class svcmp_impl : public function_base ++{ ++public: ++ CONSTEXPR svcmp_impl (tree_code code, int unspec_for_fp) ++ : m_code (code), m_unspec_for_fp (unspec_for_fp) {} ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ machine_mode mode = e.vector_mode (0); ++ ++ /* Comparisons are UNSPEC_PRED_Z operations and so need a hint ++ operand. 
*/ ++ e.add_ptrue_hint (0, e.gp_mode (0)); ++ ++ if (e.type_suffix (0).integer_p) ++ { ++ bool unsigned_p = e.type_suffix (0).unsigned_p; ++ rtx_code code = get_rtx_code (m_code, unsigned_p); ++ return e.use_exact_insn (code_for_aarch64_pred_cmp (code, mode)); ++ } ++ ++ insn_code icode = code_for_aarch64_pred_fcm (m_unspec_for_fp, mode); ++ return e.use_exact_insn (icode); ++ } ++ ++ /* The tree code associated with the comparison. */ ++ tree_code m_code; ++ ++ /* The unspec code to use for floating-point comparisons. */ ++ int m_unspec_for_fp; ++}; ++ ++/* Implements svcmp_wide. */ ++class svcmp_wide_impl : public function_base ++{ ++public: ++ CONSTEXPR svcmp_wide_impl (tree_code code, int unspec_for_sint, ++ int unspec_for_uint) ++ : m_code (code), m_unspec_for_sint (unspec_for_sint), ++ m_unspec_for_uint (unspec_for_uint) {} ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ machine_mode mode = e.vector_mode (0); ++ bool unsigned_p = e.type_suffix (0).unsigned_p; ++ rtx_code code = get_rtx_code (m_code, unsigned_p); ++ ++ /* Comparisons are UNSPEC_PRED_Z operations and so need a hint ++ operand. */ ++ e.add_ptrue_hint (0, e.gp_mode (0)); ++ ++ /* If the argument is a constant that the unwidened comparisons ++ can handle directly, use them instead. */ ++ insn_code icode = code_for_aarch64_pred_cmp (code, mode); ++ rtx op2 = unwrap_const_vec_duplicate (e.args[3]); ++ if (CONSTANT_P (op2) ++ && insn_data[icode].operand[4].predicate (op2, DImode)) ++ { ++ e.args[3] = op2; ++ return e.use_exact_insn (icode); ++ } ++ ++ int unspec = (unsigned_p ? m_unspec_for_uint : m_unspec_for_sint); ++ return e.use_exact_insn (code_for_aarch64_pred_cmp_wide (unspec, mode)); ++ } ++ ++ /* The tree code associated with the comparison. */ ++ tree_code m_code; ++ ++ /* The unspec codes for signed and unsigned wide comparisons ++ respectively. */ ++ int m_unspec_for_sint; ++ int m_unspec_for_uint; ++}; ++ ++class svcmpuo_impl : public quiet ++{ ++public: ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ e.add_ptrue_hint (0, e.gp_mode (0)); ++ return e.use_exact_insn (code_for_aarch64_pred_fcmuo (e.vector_mode (0))); ++ } ++}; ++ ++class svcnot_impl : public function_base ++{ ++public: ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ machine_mode mode = e.vector_mode (0); ++ if (e.pred == PRED_x) ++ { ++ /* The pattern for CNOT includes an UNSPEC_PRED_Z, so needs ++ a ptrue hint. */ ++ e.add_ptrue_hint (0, e.gp_mode (0)); ++ return e.use_pred_x_insn (code_for_aarch64_pred_cnot (mode)); ++ } ++ ++ return e.use_cond_insn (code_for_cond_cnot (mode), 0); ++ } ++}; ++ ++/* Implements svcnt[bhwd], which count the number of elements ++ in a particular vector mode. */ ++class svcnt_bhwd_impl : public function_base ++{ ++public: ++ CONSTEXPR svcnt_bhwd_impl (machine_mode ref_mode) : m_ref_mode (ref_mode) {} ++ ++ gimple * ++ fold (gimple_folder &f) const OVERRIDE ++ { ++ tree count = build_int_cstu (TREE_TYPE (f.lhs), ++ GET_MODE_NUNITS (m_ref_mode)); ++ return gimple_build_assign (f.lhs, count); ++ } ++ ++ rtx ++ expand (function_expander &) const OVERRIDE ++ { ++ return gen_int_mode (GET_MODE_NUNITS (m_ref_mode), DImode); ++ } ++ ++ /* The mode of the vector associated with the [bhwd] suffix. */ ++ machine_mode m_ref_mode; ++}; ++ ++/* Implements svcnt[bhwd]_pat. 
*/ ++class svcnt_bhwd_pat_impl : public svcnt_bhwd_impl ++{ ++public: ++ CONSTEXPR svcnt_bhwd_pat_impl (machine_mode ref_mode) ++ : svcnt_bhwd_impl (ref_mode) {} ++ ++ gimple * ++ fold (gimple_folder &f) const OVERRIDE ++ { ++ tree pattern_arg = gimple_call_arg (f.call, 0); ++ aarch64_svpattern pattern = (aarch64_svpattern) tree_to_shwi (pattern_arg); ++ ++ if (pattern == AARCH64_SV_ALL) ++ /* svcvnt[bwhd]_pat (SV_ALL) == svcnt[bwhd] (). */ ++ return svcnt_bhwd_impl::fold (f); ++ ++ /* See whether we can count the number of elements in the pattern ++ at compile time. */ ++ unsigned int elements_per_vq = 128 / GET_MODE_UNIT_BITSIZE (m_ref_mode); ++ HOST_WIDE_INT value = aarch64_fold_sve_cnt_pat (pattern, elements_per_vq); ++ if (value >= 0) ++ { ++ tree count = build_int_cstu (TREE_TYPE (f.lhs), value); ++ return gimple_build_assign (f.lhs, count); ++ } ++ ++ return NULL; ++ } ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ unsigned int elements_per_vq = 128 / GET_MODE_UNIT_BITSIZE (m_ref_mode); ++ e.args.quick_push (gen_int_mode (elements_per_vq, DImode)); ++ e.args.quick_push (const1_rtx); ++ return e.use_exact_insn (CODE_FOR_aarch64_sve_cnt_pat); ++ } ++}; ++ ++class svcntp_impl : public function_base ++{ ++public: ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ machine_mode mode = e.vector_mode (0); ++ e.add_ptrue_hint (0, mode); ++ return e.use_exact_insn (code_for_aarch64_pred_cntp (mode)); ++ } ++}; ++ ++/* Implements svcreate2, svcreate3 and svcreate4. */ ++class svcreate_impl : public quiet ++{ ++public: ++ CONSTEXPR svcreate_impl (unsigned int vectors_per_tuple) ++ : quiet (vectors_per_tuple) {} ++ ++ gimple * ++ fold (gimple_folder &f) const OVERRIDE ++ { ++ unsigned int nargs = gimple_call_num_args (f.call); ++ tree lhs_type = TREE_TYPE (f.lhs); ++ ++ /* Replace the call with a clobber of the result (to prevent it from ++ becoming upwards exposed) followed by stores into each individual ++ vector of tuple. ++ ++ The fold routines expect the replacement statement to have the ++ same lhs as the original call, so return the clobber statement ++ rather than the final vector store. */ ++ gassign *clobber = gimple_build_assign (f.lhs, build_clobber (lhs_type)); ++ ++ for (unsigned int i = nargs; i-- > 0; ) ++ { ++ tree rhs_vector = gimple_call_arg (f.call, i); ++ tree field = tuple_type_field (TREE_TYPE (f.lhs)); ++ tree lhs_array = build3 (COMPONENT_REF, TREE_TYPE (field), ++ unshare_expr (f.lhs), field, NULL_TREE); ++ tree lhs_vector = build4 (ARRAY_REF, TREE_TYPE (rhs_vector), ++ lhs_array, size_int (i), ++ NULL_TREE, NULL_TREE); ++ gassign *assign = gimple_build_assign (lhs_vector, rhs_vector); ++ gsi_insert_after (f.gsi, assign, GSI_SAME_STMT); ++ } ++ return clobber; ++ } ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ rtx lhs_tuple = e.get_nonoverlapping_reg_target (); ++ ++ /* Record that LHS_TUPLE is dead before the first store. */ ++ emit_clobber (lhs_tuple); ++ for (unsigned int i = 0; i < e.args.length (); ++i) ++ { ++ /* Use an lvalue subreg to refer to vector I in LHS_TUPLE. 
*/ ++ rtx lhs_vector = simplify_gen_subreg (GET_MODE (e.args[i]), ++ lhs_tuple, GET_MODE (lhs_tuple), ++ i * BYTES_PER_SVE_VECTOR); ++ emit_move_insn (lhs_vector, e.args[i]); ++ } ++ return lhs_tuple; ++ } ++}; ++ ++class svcvt_impl : public function_base ++{ ++public: ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ machine_mode mode0 = e.vector_mode (0); ++ machine_mode mode1 = e.vector_mode (1); ++ insn_code icode; ++ /* All this complication comes from the need to select four things ++ simultaneously: ++ ++ (1) the kind of conversion (int<-float, float<-int, float<-float) ++ (2) signed vs. unsigned integers, where relevant ++ (3) the predication mode, which must be the wider of the predication ++ modes for MODE0 and MODE1 ++ (4) the predication type (m, x or z) ++ ++ The only supported int<->float conversions for which the integer is ++ narrower than the float are SI<->DF. It's therefore more convenient ++ to handle (3) by defining two patterns for int<->float conversions: ++ one in which the integer is at least as wide as the float and so ++ determines the predication mode, and another single SI<->DF pattern ++ in which the float's mode determines the predication mode (which is ++ always VNx2BI in that case). ++ ++ The names of the patterns follow the optab convention of giving ++ the source mode before the destination mode. */ ++ if (e.type_suffix (1).integer_p) ++ { ++ int unspec = (e.type_suffix (1).unsigned_p ++ ? UNSPEC_COND_UCVTF ++ : UNSPEC_COND_SCVTF); ++ if (e.type_suffix (0).element_bytes <= e.type_suffix (1).element_bytes) ++ icode = (e.pred == PRED_x ++ ? code_for_aarch64_sve_nonextend (unspec, mode1, mode0) ++ : code_for_cond_nonextend (unspec, mode1, mode0)); ++ else ++ icode = (e.pred == PRED_x ++ ? code_for_aarch64_sve_extend (unspec, mode1, mode0) ++ : code_for_cond_extend (unspec, mode1, mode0)); ++ } ++ else ++ { ++ int unspec = (!e.type_suffix (0).integer_p ? UNSPEC_COND_FCVT ++ : e.type_suffix (0).unsigned_p ? UNSPEC_COND_FCVTZU ++ : UNSPEC_COND_FCVTZS); ++ if (e.type_suffix (0).element_bytes >= e.type_suffix (1).element_bytes) ++ icode = (e.pred == PRED_x ++ ? code_for_aarch64_sve_nontrunc (unspec, mode1, mode0) ++ : code_for_cond_nontrunc (unspec, mode1, mode0)); ++ else ++ icode = (e.pred == PRED_x ++ ? code_for_aarch64_sve_trunc (unspec, mode1, mode0) ++ : code_for_cond_trunc (unspec, mode1, mode0)); ++ } ++ ++ if (e.pred == PRED_x) ++ return e.use_pred_x_insn (icode); ++ return e.use_cond_insn (icode); ++ } ++}; ++ ++class svdot_impl : public function_base ++{ ++public: ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ /* In the optab, the multiplication operands come before the accumulator ++ operand. The optab is keyed off the multiplication mode. */ ++ e.rotate_inputs_left (0, 3); ++ insn_code icode ++ = e.direct_optab_handler_for_sign (sdot_prod_optab, udot_prod_optab, ++ 0, GET_MODE (e.args[0])); ++ return e.use_unpred_insn (icode); ++ } ++}; ++ ++class svdotprod_lane_impl : public unspec_based_function_base ++{ ++public: ++ CONSTEXPR svdotprod_lane_impl (int unspec_for_sint, ++ int unspec_for_uint, ++ int unspec_for_float) ++ : unspec_based_function_base (unspec_for_sint, ++ unspec_for_uint, ++ unspec_for_float) {} ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ /* Use the same ordering as the dot_prod_optab, with the ++ accumulator last. 
*/ ++ e.rotate_inputs_left (0, 4); ++ int unspec = unspec_for (e); ++ machine_mode mode = e.vector_mode (0); ++ return e.use_exact_insn (code_for_aarch64_dot_prod_lane (unspec, mode)); ++ } ++}; ++ ++class svdup_impl : public quiet ++{ ++public: ++ gimple * ++ fold (gimple_folder &f) const OVERRIDE ++ { ++ tree vec_type = TREE_TYPE (f.lhs); ++ tree rhs = gimple_call_arg (f.call, f.pred == PRED_none ? 0 : 1); ++ ++ if (f.pred == PRED_none || f.pred == PRED_x) ++ { ++ if (CONSTANT_CLASS_P (rhs)) ++ { ++ if (f.type_suffix (0).bool_p) ++ return (tree_to_shwi (rhs) ++ ? f.fold_to_ptrue () ++ : f.fold_to_pfalse ()); ++ ++ tree rhs_vector = build_vector_from_val (vec_type, rhs); ++ return gimple_build_assign (f.lhs, rhs_vector); ++ } ++ ++ /* Avoid folding _b to a VEC_DUPLICATE_EXPR, since to do that we ++ would need to introduce an extra and unwanted conversion to ++ the truth vector element type. */ ++ if (!f.type_suffix (0).bool_p) ++ return gimple_build_assign (f.lhs, VEC_DUPLICATE_EXPR, rhs); ++ } ++ ++ return NULL; ++ } ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ if (e.pred == PRED_none || e.pred == PRED_x) ++ /* There's no benefit to using predicated instructions for _x here. */ ++ return e.use_unpred_insn (e.direct_optab_handler (vec_duplicate_optab)); ++ ++ /* Model predicated svdups as a SEL in which the "true" value is ++ the duplicate of the function argument and the "false" value ++ is the value of inactive lanes. */ ++ insn_code icode; ++ machine_mode mode = e.vector_mode (0); ++ if (valid_for_const_vector_p (GET_MODE_INNER (mode), e.args.last ())) ++ /* Duplicate the constant to fill a vector. The pattern optimizes ++ various cases involving constant operands, falling back to SEL ++ if necessary. */ ++ icode = code_for_vcond_mask (mode, mode); ++ else ++ /* Use the pattern for selecting between a duplicated scalar ++ variable and a vector fallback. */ ++ icode = code_for_aarch64_sel_dup (mode); ++ return e.use_vcond_mask_insn (icode); ++ } ++}; ++ ++class svdup_lane_impl : public quiet ++{ ++public: ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ /* The native DUP lane has an index range of 64 bytes. */ ++ machine_mode mode = e.vector_mode (0); ++ if (CONST_INT_P (e.args[1]) ++ && IN_RANGE (INTVAL (e.args[1]) * GET_MODE_UNIT_SIZE (mode), 0, 63)) ++ return e.use_exact_insn (code_for_aarch64_sve_dup_lane (mode)); ++ ++ /* Treat svdup_lane as if it were svtbl_n. */ ++ return e.use_exact_insn (code_for_aarch64_sve_tbl (e.vector_mode (0))); ++ } ++}; ++ ++class svdupq_impl : public quiet ++{ ++public: ++ gimple * ++ fold (gimple_folder &f) const OVERRIDE ++ { ++ tree vec_type = TREE_TYPE (f.lhs); ++ unsigned int nargs = gimple_call_num_args (f.call); ++ /* For predicates, pad out each argument so that we have one element ++ per bit. */ ++ unsigned int factor = (f.type_suffix (0).bool_p ++ ? 
f.type_suffix (0).element_bytes : 1); ++ tree_vector_builder builder (vec_type, nargs * factor, 1); ++ for (unsigned int i = 0; i < nargs; ++i) ++ { ++ tree elt = gimple_call_arg (f.call, i); ++ if (!CONSTANT_CLASS_P (elt)) ++ return NULL; ++ builder.quick_push (elt); ++ for (unsigned int j = 1; j < factor; ++j) ++ builder.quick_push (build_zero_cst (TREE_TYPE (vec_type))); ++ } ++ return gimple_build_assign (f.lhs, builder.build ()); ++ } ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ machine_mode mode = e.vector_mode (0); ++ unsigned int elements_per_vq = e.args.length (); ++ if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL) ++ { ++ /* Construct a vector of integers so that we can compare them against ++ zero below. Zero vs. nonzero is the only distinction that ++ matters. */ ++ mode = aarch64_sve_int_mode (mode); ++ for (unsigned int i = 0; i < elements_per_vq; ++i) ++ e.args[i] = simplify_gen_unary (ZERO_EXTEND, GET_MODE_INNER (mode), ++ e.args[i], QImode); ++ } ++ ++ /* Get the 128-bit Advanced SIMD vector for this data size. */ ++ scalar_mode element_mode = GET_MODE_INNER (mode); ++ machine_mode vq_mode = aarch64_vq_mode (element_mode).require (); ++ gcc_assert (known_eq (elements_per_vq, GET_MODE_NUNITS (vq_mode))); ++ ++ /* Put the arguments into a 128-bit Advanced SIMD vector. We want ++ argument N to go into architectural lane N, whereas Advanced SIMD ++ vectors are loaded memory lsb to register lsb. We therefore need ++ to reverse the elements for big-endian targets. */ ++ rtx vq_reg = gen_reg_rtx (vq_mode); ++ rtvec vec = rtvec_alloc (elements_per_vq); ++ for (unsigned int i = 0; i < elements_per_vq; ++i) ++ { ++ unsigned int argno = BYTES_BIG_ENDIAN ? elements_per_vq - i - 1 : i; ++ RTVEC_ELT (vec, i) = e.args[argno]; ++ } ++ aarch64_expand_vector_init (vq_reg, gen_rtx_PARALLEL (vq_mode, vec)); ++ ++ /* If the result is a boolean, compare the data vector against zero. */ ++ if (mode != e.vector_mode (0)) ++ { ++ rtx data_dupq = aarch64_expand_sve_dupq (NULL, mode, vq_reg); ++ return aarch64_convert_sve_data_to_pred (e.possible_target, ++ e.vector_mode (0), data_dupq); ++ } ++ ++ return aarch64_expand_sve_dupq (e.possible_target, mode, vq_reg); ++ } ++}; ++ ++class svdupq_lane_impl : public quiet ++{ ++public: ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ machine_mode mode = e.vector_mode (0); ++ rtx index = e.args[1]; ++ if (CONST_INT_P (index) && IN_RANGE (INTVAL (index), 0, 3)) ++ { ++ /* Use the .Q form of DUP, which is the native instruction for ++ this function. */ ++ insn_code icode = code_for_aarch64_sve_dupq_lane (mode); ++ unsigned int num_indices = e.elements_per_vq (0); ++ rtx indices = aarch64_gen_stepped_int_parallel ++ (num_indices, INTVAL (index) * num_indices, 1); ++ ++ e.add_output_operand (icode); ++ e.add_input_operand (icode, e.args[0]); ++ e.add_fixed_operand (indices); ++ return e.generate_insn (icode); ++ } ++ ++ /* Build a .D TBL index for the pairs of doublewords that we want to ++ duplicate. */ ++ if (CONST_INT_P (index)) ++ { ++ /* The index vector is a constant. */ ++ rtx_vector_builder builder (VNx2DImode, 2, 1); ++ builder.quick_push (gen_int_mode (INTVAL (index) * 2, DImode)); ++ builder.quick_push (gen_int_mode (INTVAL (index) * 2 + 1, DImode)); ++ index = builder.build (); ++ } ++ else ++ { ++ /* Duplicate INDEX * 2 to fill a DImode vector. The ACLE spec ++ explicitly allows the top of the index to be dropped. 
*/ ++ index = force_reg (DImode, simplify_gen_binary (ASHIFT, DImode, ++ index, const1_rtx)); ++ index = expand_vector_broadcast (VNx2DImode, index); ++ ++ /* Get an alternating 0, 1 predicate. */ ++ rtx_vector_builder builder (VNx2BImode, 2, 1); ++ builder.quick_push (const0_rtx); ++ builder.quick_push (constm1_rtx); ++ rtx pg = force_reg (VNx2BImode, builder.build ()); ++ ++ /* Add one to the odd elements of the index. */ ++ rtx one = force_reg (VNx2DImode, CONST1_RTX (VNx2DImode)); ++ rtx target = gen_reg_rtx (VNx2DImode); ++ emit_insn (gen_cond_addvnx2di (target, pg, index, one, index)); ++ index = target; ++ } ++ ++ e.args[0] = gen_lowpart (VNx2DImode, e.args[0]); ++ e.args[1] = index; ++ return e.use_exact_insn (CODE_FOR_aarch64_sve_tblvnx2di); ++ } ++}; ++ ++/* Implements svextb, svexth and svextw. */ ++class svext_bhw_impl : public function_base ++{ ++public: ++ CONSTEXPR svext_bhw_impl (scalar_int_mode from_mode) ++ : m_from_mode (from_mode) {} ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ if (e.type_suffix (0).unsigned_p) ++ { ++ /* Convert to an AND. The widest we go is 0xffffffff, which fits ++ in a CONST_INT. */ ++ e.args.quick_push (GEN_INT (GET_MODE_MASK (m_from_mode))); ++ if (e.pred == PRED_m) ++ /* We now have arguments "(inactive, pg, op, mask)". Convert this ++ to "(pg, op, mask, inactive)" so that the order matches svand_m ++ with an extra argument on the end. Take the inactive elements ++ from this extra argument. */ ++ e.rotate_inputs_left (0, 4); ++ return e.map_to_rtx_codes (AND, AND, -1, 3); ++ } ++ ++ machine_mode wide_mode = e.vector_mode (0); ++ poly_uint64 nunits = GET_MODE_NUNITS (wide_mode); ++ machine_mode narrow_mode ++ = aarch64_sve_data_mode (m_from_mode, nunits).require (); ++ if (e.pred == PRED_x) ++ { ++ insn_code icode = code_for_aarch64_pred_sxt (wide_mode, narrow_mode); ++ return e.use_pred_x_insn (icode); ++ } ++ ++ insn_code icode = code_for_aarch64_cond_sxt (wide_mode, narrow_mode); ++ return e.use_cond_insn (icode); ++ } ++ ++ /* The element mode that we're extending from. */ ++ scalar_int_mode m_from_mode; ++}; ++ ++/* Implements svget2, svget3 and svget4. */ ++class svget_impl : public quiet ++{ ++public: ++ CONSTEXPR svget_impl (unsigned int vectors_per_tuple) ++ : quiet (vectors_per_tuple) {} ++ ++ gimple * ++ fold (gimple_folder &f) const OVERRIDE ++ { ++ /* Fold into a normal gimple component access. */ ++ tree rhs_tuple = gimple_call_arg (f.call, 0); ++ tree index = gimple_call_arg (f.call, 1); ++ tree field = tuple_type_field (TREE_TYPE (rhs_tuple)); ++ tree rhs_array = build3 (COMPONENT_REF, TREE_TYPE (field), ++ rhs_tuple, field, NULL_TREE); ++ tree rhs_vector = build4 (ARRAY_REF, TREE_TYPE (f.lhs), ++ rhs_array, index, NULL_TREE, NULL_TREE); ++ return gimple_build_assign (f.lhs, rhs_vector); ++ } ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ /* Fold the access into a subreg rvalue. 
*/ ++ return simplify_gen_subreg (e.vector_mode (0), e.args[0], ++ GET_MODE (e.args[0]), ++ INTVAL (e.args[1]) * BYTES_PER_SVE_VECTOR); ++ } ++}; ++ ++class svindex_impl : public function_base ++{ ++public: ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ return e.use_exact_insn (e.direct_optab_handler (vec_series_optab)); ++ } ++}; ++ ++class svinsr_impl : public quiet ++{ ++public: ++ gimple * ++ fold (gimple_folder &f) const OVERRIDE ++ { ++ gcall *new_call = gimple_build_call_internal (IFN_VEC_SHL_INSERT, 2, ++ gimple_call_arg (f.call, 0), ++ gimple_call_arg (f.call, 1)); ++ gimple_call_set_lhs (new_call, f.lhs); ++ return new_call; ++ } ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ insn_code icode = direct_optab_handler (vec_shl_insert_optab, ++ e.vector_mode (0)); ++ return e.use_exact_insn (icode); ++ } ++}; ++ ++/* Implements svlasta and svlastb. */ ++class svlast_impl : public quiet ++{ ++public: ++ CONSTEXPR svlast_impl (int unspec) : m_unspec (unspec) {} ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ return e.use_exact_insn (code_for_extract (m_unspec, e.vector_mode (0))); ++ } ++ ++ /* The unspec code associated with the operation. */ ++ int m_unspec; ++}; ++ ++class svld1_impl : public full_width_access ++{ ++public: ++ unsigned int ++ call_properties (const function_instance &) const OVERRIDE ++ { ++ return CP_READ_MEMORY; ++ } ++ ++ gimple * ++ fold (gimple_folder &f) const OVERRIDE ++ { ++ tree vectype = f.vector_type (0); ++ ++ /* Get the predicate and base pointer. */ ++ gimple_seq stmts = NULL; ++ tree pred = f.convert_pred (stmts, vectype, 0); ++ tree base = f.fold_contiguous_base (stmts, vectype); ++ gsi_insert_seq_before (f.gsi, stmts, GSI_SAME_STMT); ++ ++ tree cookie = f.load_store_cookie (TREE_TYPE (vectype)); ++ gcall *new_call = gimple_build_call_internal (IFN_MASK_LOAD, 3, ++ base, cookie, pred); ++ gimple_call_set_lhs (new_call, f.lhs); ++ return new_call; ++ } ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ insn_code icode = convert_optab_handler (maskload_optab, ++ e.vector_mode (0), e.gp_mode (0)); ++ return e.use_contiguous_load_insn (icode); ++ } ++}; ++ ++/* Implements extending contiguous forms of svld1. */ ++class svld1_extend_impl : public extending_load ++{ ++public: ++ CONSTEXPR svld1_extend_impl (type_suffix_index memory_type) ++ : extending_load (memory_type) {} ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ insn_code icode = code_for_aarch64_load (extend_rtx_code (), ++ e.vector_mode (0), ++ e.memory_vector_mode ()); ++ return e.use_contiguous_load_insn (icode); ++ } ++}; ++ ++class svld1_gather_impl : public full_width_access ++{ ++public: ++ unsigned int ++ call_properties (const function_instance &) const OVERRIDE ++ { ++ return CP_READ_MEMORY; ++ } ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ e.prepare_gather_address_operands (1); ++ /* Put the predicate last, as required by mask_gather_load_optab. */ ++ e.rotate_inputs_left (0, 5); ++ machine_mode mem_mode = e.memory_vector_mode (); ++ insn_code icode = direct_optab_handler (mask_gather_load_optab, mem_mode); ++ return e.use_exact_insn (icode); ++ } ++}; ++ ++/* Implements extending forms of svld1_gather. 
*/ ++class svld1_gather_extend_impl : public extending_load ++{ ++public: ++ CONSTEXPR svld1_gather_extend_impl (type_suffix_index memory_type) ++ : extending_load (memory_type) {} ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ e.prepare_gather_address_operands (1); ++ /* Put the predicate last, since the extending gathers use the same ++ operand order as mask_gather_load_optab. */ ++ e.rotate_inputs_left (0, 5); ++ insn_code icode = code_for_aarch64_gather_load (extend_rtx_code (), ++ e.vector_mode (0), ++ e.memory_vector_mode ()); ++ return e.use_exact_insn (icode); ++ } ++}; ++ ++class load_replicate : public function_base ++{ ++public: ++ unsigned int ++ call_properties (const function_instance &) const OVERRIDE ++ { ++ return CP_READ_MEMORY; ++ } ++ ++ tree ++ memory_scalar_type (const function_instance &fi) const OVERRIDE ++ { ++ return fi.scalar_type (0); ++ } ++}; ++ ++class svld1rq_impl : public load_replicate ++{ ++public: ++ machine_mode ++ memory_vector_mode (const function_instance &fi) const OVERRIDE ++ { ++ return aarch64_vq_mode (GET_MODE_INNER (fi.vector_mode (0))).require (); ++ } ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0)); ++ return e.use_contiguous_load_insn (icode); ++ } ++}; ++ ++class svld1ro_impl : public load_replicate ++{ ++public: ++ machine_mode ++ memory_vector_mode (const function_instance &fi) const OVERRIDE ++ { ++ return OImode; ++ } ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ insn_code icode = code_for_aarch64_sve_ld1ro (e.vector_mode (0)); ++ return e.use_contiguous_load_insn (icode); ++ } ++}; ++ ++/* Implements svld2, svld3 and svld4. */ ++class svld234_impl : public full_width_access ++{ ++public: ++ CONSTEXPR svld234_impl (unsigned int vectors_per_tuple) ++ : full_width_access (vectors_per_tuple) {} ++ ++ unsigned int ++ call_properties (const function_instance &) const OVERRIDE ++ { ++ return CP_READ_MEMORY; ++ } ++ ++ gimple * ++ fold (gimple_folder &f) const OVERRIDE ++ { ++ tree tuple_type = TREE_TYPE (f.lhs); ++ tree vectype = f.vector_type (0); ++ ++ /* Get the predicate and base pointer. */ ++ gimple_seq stmts = NULL; ++ tree pred = f.convert_pred (stmts, vectype, 0); ++ tree base = f.fold_contiguous_base (stmts, vectype); ++ gsi_insert_seq_before (f.gsi, stmts, GSI_SAME_STMT); ++ ++ /* Emit two statements: a clobber of the lhs, so that it isn't ++ upwards exposed, and then the load itself. ++ ++ The fold routines expect the replacement statement to have the ++ same lhs as the original call, so return the clobber statement ++ rather than the load. */ ++ gimple *clobber = gimple_build_assign (f.lhs, build_clobber (tuple_type)); ++ ++ /* View the loaded data as an array of vectors. */ ++ tree field = tuple_type_field (tuple_type); ++ tree lhs_array = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (field), ++ unshare_expr (f.lhs)); ++ ++ /* Emit the load itself. 
*/ ++ tree cookie = f.load_store_cookie (TREE_TYPE (vectype)); ++ gcall *new_call = gimple_build_call_internal (IFN_MASK_LOAD_LANES, 3, ++ base, cookie, pred); ++ gimple_call_set_lhs (new_call, lhs_array); ++ gsi_insert_after (f.gsi, new_call, GSI_SAME_STMT); ++ ++ return clobber; ++ } ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ machine_mode tuple_mode = TYPE_MODE (TREE_TYPE (e.call_expr)); ++ insn_code icode = convert_optab_handler (vec_mask_load_lanes_optab, ++ tuple_mode, e.vector_mode (0)); ++ return e.use_contiguous_load_insn (icode); ++ } ++}; ++ ++class svldff1_gather_impl : public full_width_access ++{ ++public: ++ unsigned int ++ call_properties (const function_instance &) const OVERRIDE ++ { ++ return CP_READ_MEMORY | CP_READ_FFR | CP_WRITE_FFR; ++ } ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ /* See the block comment in aarch64-sve.md for details about the ++ FFR handling. */ ++ emit_insn (gen_aarch64_update_ffr_for_load ()); ++ ++ e.prepare_gather_address_operands (1); ++ /* Put the predicate last, since ldff1_gather uses the same operand ++ order as mask_gather_load_optab. */ ++ e.rotate_inputs_left (0, 5); ++ machine_mode mem_mode = e.memory_vector_mode (); ++ return e.use_exact_insn (code_for_aarch64_ldff1_gather (mem_mode)); ++ } ++}; ++ ++/* Implements extending forms of svldff1_gather. */ ++class svldff1_gather_extend : public extending_load ++{ ++public: ++ CONSTEXPR svldff1_gather_extend (type_suffix_index memory_type) ++ : extending_load (memory_type) {} ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ /* See the block comment in aarch64-sve.md for details about the ++ FFR handling. */ ++ emit_insn (gen_aarch64_update_ffr_for_load ()); ++ ++ e.prepare_gather_address_operands (1); ++ /* Put the predicate last, since ldff1_gather uses the same operand ++ order as mask_gather_load_optab. */ ++ e.rotate_inputs_left (0, 5); ++ insn_code icode = code_for_aarch64_ldff1_gather (extend_rtx_code (), ++ e.vector_mode (0), ++ e.memory_vector_mode ()); ++ return e.use_exact_insn (icode); ++ } ++}; ++ ++class svldnt1_impl : public full_width_access ++{ ++public: ++ unsigned int ++ call_properties (const function_instance &) const OVERRIDE ++ { ++ return CP_READ_MEMORY; ++ } ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ insn_code icode = code_for_aarch64_ldnt1 (e.vector_mode (0)); ++ return e.use_contiguous_load_insn (icode); ++ } ++}; ++ ++/* Implements svldff1 and svldnf1. */ ++class svldxf1_impl : public full_width_access ++{ ++public: ++ CONSTEXPR svldxf1_impl (int unspec) : m_unspec (unspec) {} ++ ++ unsigned int ++ call_properties (const function_instance &) const OVERRIDE ++ { ++ return CP_READ_MEMORY | CP_READ_FFR | CP_WRITE_FFR; ++ } ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ /* See the block comment in aarch64-sve.md for details about the ++ FFR handling. */ ++ emit_insn (gen_aarch64_update_ffr_for_load ()); ++ ++ machine_mode mode = e.vector_mode (0); ++ return e.use_contiguous_load_insn (code_for_aarch64_ldf1 (m_unspec, mode)); ++ } ++ ++ /* The unspec associated with the load. */ ++ int m_unspec; ++}; ++ ++/* Implements extending contiguous forms of svldff1 and svldnf1. 
*/ ++class svldxf1_extend_impl : public extending_load ++{ ++public: ++ CONSTEXPR svldxf1_extend_impl (type_suffix_index memory_type, int unspec) ++ : extending_load (memory_type), m_unspec (unspec) {} ++ ++ unsigned int ++ call_properties (const function_instance &) const OVERRIDE ++ { ++ return CP_READ_MEMORY | CP_READ_FFR | CP_WRITE_FFR; ++ } ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ /* See the block comment in aarch64-sve.md for details about the ++ FFR handling. */ ++ emit_insn (gen_aarch64_update_ffr_for_load ()); ++ ++ insn_code icode = code_for_aarch64_ldf1 (m_unspec, extend_rtx_code (), ++ e.vector_mode (0), ++ e.memory_vector_mode ()); ++ return e.use_contiguous_load_insn (icode); ++ } ++ ++ /* The unspec associated with the load. */ ++ int m_unspec; ++}; ++ ++class svlen_impl : public quiet ++{ ++public: ++ gimple * ++ fold (gimple_folder &f) const OVERRIDE ++ { ++ /* The argument only exists for its type. */ ++ tree rhs_type = TREE_TYPE (gimple_call_arg (f.call, 0)); ++ tree count = build_int_cstu (TREE_TYPE (f.lhs), ++ TYPE_VECTOR_SUBPARTS (rhs_type)); ++ return gimple_build_assign (f.lhs, count); ++ } ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ /* The argument only exists for its type. */ ++ return gen_int_mode (GET_MODE_NUNITS (e.vector_mode (0)), DImode); ++ } ++}; ++ ++class svmad_impl : public function_base ++{ ++public: ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ return expand_mad (e); ++ } ++}; ++ ++class svmla_impl : public function_base ++{ ++public: ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ /* Put the accumulator at the end (argument 3), but keep it as the ++ merge input for _m functions. */ ++ e.rotate_inputs_left (1, 4); ++ return expand_mad (e, 3); ++ } ++}; ++ ++/* Base class for svmla_lane and svmls_lane. */ ++class svmla_svmls_lane_impl : public function_base ++{ ++public: ++ CONSTEXPR svmla_svmls_lane_impl (int unspec) ++ : m_unspec (unspec) {} ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ /* Put the operands in the normal (fma ...) order, with the accumulator ++ last. This fits naturally since that's also the unprinted operand ++ in the asm output. */ ++ e.rotate_inputs_left (0, 4); ++ insn_code icode = code_for_aarch64_lane (m_unspec, e.vector_mode (0)); ++ return e.use_exact_insn (icode); ++ } ++ ++ /* The unspec code associated with the operation. */ ++ int m_unspec; ++}; ++ ++class svmls_impl : public function_base ++{ ++public: ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ /* Put the accumulator at the end (argument 3), but keep it as the ++ merge input for _m functions. */ ++ e.rotate_inputs_left (1, 4); ++ return expand_msb (e, 3); ++ } ++}; ++ ++class svmov_impl : public function_base ++{ ++public: ++ gimple * ++ fold (gimple_folder &f) const OVERRIDE ++ { ++ return gimple_build_assign (f.lhs, BIT_AND_EXPR, ++ gimple_call_arg (f.call, 0), ++ gimple_call_arg (f.call, 1)); ++ } ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ /* The canonical form for the assembler alias "MOV Pa.B, Pb/Z, Pc.B" ++ is "AND Pa.B, Pb/Z, Pc.B, Pc.B". 
*/ ++ gcc_assert (e.pred == PRED_z); ++ e.args.quick_push (e.args[1]); ++ return e.use_exact_insn (CODE_FOR_aarch64_pred_andvnx16bi_z); ++ } ++}; ++ ++class svmmla_impl : public function_base ++{ ++public: ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ insn_code icode; ++ if (e.type_suffix (0).integer_p) ++ { ++ if (e.type_suffix (0).unsigned_p) ++ icode = code_for_aarch64_sve_add (UNSPEC_UMATMUL, e.vector_mode (0)); ++ else ++ icode = code_for_aarch64_sve_add (UNSPEC_SMATMUL, e.vector_mode (0)); ++ } ++ else ++ icode = code_for_aarch64_sve (UNSPEC_FMMLA, e.vector_mode (0)); ++ return e.use_exact_insn (icode); ++ } ++}; ++ ++class svmsb_impl : public function_base ++{ ++public: ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ return expand_msb (e); ++ } ++}; ++ ++class svnand_impl : public function_base ++{ ++public: ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ gcc_assert (e.pred == PRED_z); ++ return e.use_exact_insn (CODE_FOR_aarch64_pred_nandvnx16bi_z); ++ } ++}; ++ ++class svnor_impl : public function_base ++{ ++public: ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ gcc_assert (e.pred == PRED_z); ++ return e.use_exact_insn (CODE_FOR_aarch64_pred_norvnx16bi_z); ++ } ++}; ++ ++class svnot_impl : public rtx_code_function ++{ ++public: ++ CONSTEXPR svnot_impl () : rtx_code_function (NOT, NOT, -1) {} ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ if (e.type_suffix_ids[0] == TYPE_SUFFIX_b) ++ { ++ /* The canonical form for the assembler alias "NOT Pa.B, Pb/Z, Pc.B" ++ is "EOR Pa.B, Pb/Z, Pb.B, Pc.B". */ ++ gcc_assert (e.pred == PRED_z); ++ e.args.quick_insert (1, e.args[0]); ++ return e.use_exact_insn (CODE_FOR_aarch64_pred_xorvnx16bi_z); ++ } ++ return rtx_code_function::expand (e); ++ } ++}; ++ ++class svorn_impl : public function_base ++{ ++public: ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ gcc_assert (e.pred == PRED_z); ++ return e.use_exact_insn (CODE_FOR_aarch64_pred_ornvnx16bi_z); ++ } ++}; ++ ++class svpfalse_impl : public function_base ++{ ++public: ++ gimple * ++ fold (gimple_folder &f) const OVERRIDE ++ { ++ return f.fold_to_pfalse (); ++ } ++ ++ rtx ++ expand (function_expander &) const OVERRIDE ++ { ++ return CONST0_RTX (VNx16BImode); ++ } ++}; ++ ++/* Implements svpfirst and svpnext, which share the same .md patterns. */ ++class svpfirst_svpnext_impl : public function_base ++{ ++public: ++ CONSTEXPR svpfirst_svpnext_impl (int unspec) : m_unspec (unspec) {} ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ machine_mode mode = e.vector_mode (0); ++ e.add_ptrue_hint (0, mode); ++ return e.use_exact_insn (code_for_aarch64_sve (m_unspec, mode)); ++ } ++ ++ /* The unspec associated with the operation. */ ++ int m_unspec; ++}; ++ ++/* Implements contiguous forms of svprf[bhwd]. */ ++class svprf_bhwd_impl : public function_base ++{ ++public: ++ CONSTEXPR svprf_bhwd_impl (machine_mode mode) : m_mode (mode) {} ++ ++ unsigned int ++ call_properties (const function_instance &) const OVERRIDE ++ { ++ return CP_PREFETCH_MEMORY; ++ } ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ e.prepare_prefetch_operands (); ++ insn_code icode = code_for_aarch64_sve_prefetch (m_mode); ++ return e.use_contiguous_prefetch_insn (icode); ++ } ++ ++ /* The mode that we'd use to hold one vector of prefetched data. */ ++ machine_mode m_mode; ++}; ++ ++/* Implements svprf[bhwd]_gather. 
*/ ++class svprf_bhwd_gather_impl : public function_base ++{ ++public: ++ CONSTEXPR svprf_bhwd_gather_impl (machine_mode mode) : m_mode (mode) {} ++ ++ unsigned int ++ call_properties (const function_instance &) const OVERRIDE ++ { ++ return CP_PREFETCH_MEMORY; ++ } ++ ++ machine_mode ++ memory_vector_mode (const function_instance &) const OVERRIDE ++ { ++ return m_mode; ++ } ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ e.prepare_prefetch_operands (); ++ e.prepare_gather_address_operands (1); ++ ++ /* Insert a zero operand to identify the mode of the memory being ++ accessed. This goes between the gather operands and prefetch ++ operands created above. */ ++ e.args.quick_insert (5, CONST0_RTX (m_mode)); ++ ++ machine_mode reg_mode = GET_MODE (e.args[2]); ++ insn_code icode = code_for_aarch64_sve_gather_prefetch (m_mode, reg_mode); ++ return e.use_exact_insn (icode); ++ } ++ ++ /* The mode that we'd use to hold one vector of prefetched data. */ ++ machine_mode m_mode; ++}; ++ ++/* Implements svptest_any, svptest_first and svptest_last. */ ++class svptest_impl : public function_base ++{ ++public: ++ CONSTEXPR svptest_impl (rtx_code compare) : m_compare (compare) {} ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ /* See whether GP is an exact ptrue for some predicate mode; ++ i.e. whether converting the GP to that mode will not drop ++ set bits and will leave all significant bits set. */ ++ machine_mode wide_mode; ++ int hint; ++ if (aarch64_ptrue_all_mode (e.args[0]).exists (&wide_mode)) ++ hint = SVE_KNOWN_PTRUE; ++ else ++ { ++ hint = SVE_MAYBE_NOT_PTRUE; ++ wide_mode = VNx16BImode; ++ } ++ ++ /* Generate the PTEST itself. */ ++ rtx pg = force_reg (VNx16BImode, e.args[0]); ++ rtx wide_pg = gen_lowpart (wide_mode, pg); ++ rtx hint_rtx = gen_int_mode (hint, DImode); ++ rtx op = force_reg (wide_mode, gen_lowpart (wide_mode, e.args[1])); ++ emit_insn (gen_aarch64_ptestvnx16bi (pg, wide_pg, hint_rtx, op)); ++ ++ /* Get the location of the boolean result. We can provide SImode and ++ DImode values directly; rely on generic code to convert others. */ ++ rtx target = e.possible_target; ++ if (!target ++ || !REG_P (target) ++ || (GET_MODE (target) != SImode && GET_MODE (target) != DImode)) ++ target = gen_reg_rtx (DImode); ++ ++ /* Generate a CSET to convert the CC result of the PTEST to a boolean. */ ++ rtx cc_reg = gen_rtx_REG (CC_NZCmode, CC_REGNUM); ++ rtx compare = gen_rtx_fmt_ee (m_compare, GET_MODE (target), ++ cc_reg, const0_rtx); ++ emit_insn (gen_rtx_SET (target, compare)); ++ return target; ++ } ++ ++ /* The comparison code associated with ptest condition. */ ++ rtx_code m_compare; ++}; ++ ++class svptrue_impl : public function_base ++{ ++public: ++ gimple * ++ fold (gimple_folder &f) const OVERRIDE ++ { ++ return f.fold_to_ptrue (); ++ } ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ return aarch64_ptrue_all (e.type_suffix (0).element_bytes); ++ } ++}; ++ ++class svptrue_pat_impl : public function_base ++{ ++public: ++ gimple * ++ fold (gimple_folder &f) const OVERRIDE ++ { ++ tree pattern_arg = gimple_call_arg (f.call, 0); ++ aarch64_svpattern pattern = (aarch64_svpattern) tree_to_shwi (pattern_arg); ++ ++ if (pattern == AARCH64_SV_ALL) ++ /* svptrue_pat_bN (SV_ALL) == svptrue_bN (). */ ++ return f.fold_to_ptrue (); ++ ++ /* See whether we can count the number of elements in the pattern ++ at compile time. If so, construct a predicate with that number ++ of 1s followed by all 0s. 
*/ ++ int nelts_per_vq = f.elements_per_vq (0); ++ HOST_WIDE_INT value = aarch64_fold_sve_cnt_pat (pattern, nelts_per_vq); ++ if (value >= 0) ++ return f.fold_to_vl_pred (value); ++ ++ return NULL; ++ } ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ /* In rtl, the predicate is represented as the constant: ++ ++ (const:V16BI (unspec:V16BI [(const_int PATTERN) ++ (const_vector:VnnBI [zeros])] ++ UNSPEC_PTRUE)) ++ ++ where nn determines the element size. */ ++ rtvec vec = gen_rtvec (2, e.args[0], CONST0_RTX (e.vector_mode (0))); ++ return gen_rtx_CONST (VNx16BImode, ++ gen_rtx_UNSPEC (VNx16BImode, vec, UNSPEC_PTRUE)); ++ } ++}; ++ ++class svqadd_impl : public function_base ++{ ++public: ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ return e.expand_signed_unpred_op (SS_PLUS, US_PLUS); ++ } ++}; ++ ++/* Implements svqdec[bhwd]{,_pat} and svqinc[bhwd]{,_pat}. */ ++class svqdec_svqinc_bhwd_impl : public function_base ++{ ++public: ++ CONSTEXPR svqdec_svqinc_bhwd_impl (rtx_code code_for_sint, ++ rtx_code code_for_uint, ++ scalar_int_mode elem_mode) ++ : m_code_for_sint (code_for_sint), ++ m_code_for_uint (code_for_uint), ++ m_elem_mode (elem_mode) ++ {} ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ /* Treat non-_pat functions in the same way as _pat functions with ++ an SV_ALL argument. */ ++ if (e.args.length () == 2) ++ e.args.quick_insert (1, gen_int_mode (AARCH64_SV_ALL, DImode)); ++ ++ /* Insert the number of elements per 128-bit block as a fake argument, ++ between the pattern and the multiplier. Arguments 1, 2 and 3 then ++ correspond exactly with the 3 UNSPEC_SVE_CNT_PAT operands; see ++ aarch64_sve_cnt_pat for details. */ ++ unsigned int elements_per_vq = 128 / GET_MODE_BITSIZE (m_elem_mode); ++ e.args.quick_insert (2, gen_int_mode (elements_per_vq, DImode)); ++ ++ rtx_code code = (e.type_suffix (0).unsigned_p ++ ? m_code_for_uint ++ : m_code_for_sint); ++ ++ /* Choose between operating on integer scalars or integer vectors. */ ++ machine_mode mode = e.vector_mode (0); ++ if (e.mode_suffix_id == MODE_n) ++ mode = GET_MODE_INNER (mode); ++ return e.use_exact_insn (code_for_aarch64_sve_pat (code, mode)); ++ } ++ ++ /* The saturating addition or subtraction codes to use for signed and ++ unsigned values respectively. */ ++ rtx_code m_code_for_sint; ++ rtx_code m_code_for_uint; ++ ++ /* The integer mode associated with the [bhwd] suffix. */ ++ scalar_int_mode m_elem_mode; ++}; ++ ++/* Implements svqdec[bhwd]{,_pat}. */ ++class svqdec_bhwd_impl : public svqdec_svqinc_bhwd_impl ++{ ++public: ++ CONSTEXPR svqdec_bhwd_impl (scalar_int_mode elem_mode) ++ : svqdec_svqinc_bhwd_impl (SS_MINUS, US_MINUS, elem_mode) {} ++}; ++ ++/* Implements svqinc[bhwd]{,_pat}. */ ++class svqinc_bhwd_impl : public svqdec_svqinc_bhwd_impl ++{ ++public: ++ CONSTEXPR svqinc_bhwd_impl (scalar_int_mode elem_mode) ++ : svqdec_svqinc_bhwd_impl (SS_PLUS, US_PLUS, elem_mode) {} ++}; ++ ++/* Implements svqdecp and svqincp. */ ++class svqdecp_svqincp_impl : public function_base ++{ ++public: ++ CONSTEXPR svqdecp_svqincp_impl (rtx_code code_for_sint, ++ rtx_code code_for_uint) ++ : m_code_for_sint (code_for_sint), ++ m_code_for_uint (code_for_uint) ++ {} ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ rtx_code code = (e.type_suffix (0).unsigned_p ++ ? 
m_code_for_uint ++ : m_code_for_sint); ++ insn_code icode; ++ if (e.mode_suffix_id == MODE_n) ++ { ++ /* Increment or decrement a scalar (whose mode is given by the first ++ type suffix) by the number of active elements in a predicate ++ (whose mode is given by the second type suffix). */ ++ machine_mode mode = GET_MODE_INNER (e.vector_mode (0)); ++ icode = code_for_aarch64_sve_cntp (code, mode, e.vector_mode (1)); ++ } ++ else ++ /* Increment a vector by the number of active elements in a predicate, ++ with the vector mode determining the predicate mode. */ ++ icode = code_for_aarch64_sve_cntp (code, e.vector_mode (0)); ++ return e.use_exact_insn (icode); ++ } ++ ++ /* The saturating addition or subtraction codes to use for signed and ++ unsigned values respectively. */ ++ rtx_code m_code_for_sint; ++ rtx_code m_code_for_uint; ++}; ++ ++class svqsub_impl : public function_base ++{ ++public: ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ return e.expand_signed_unpred_op (SS_MINUS, US_MINUS); ++ } ++}; ++ ++class svrdffr_impl : public function_base ++{ ++public: ++ unsigned int ++ call_properties (const function_instance &) const OVERRIDE ++ { ++ return CP_READ_FFR; ++ } ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ /* See the block comment in aarch64-sve.md for details about the ++ FFR handling. */ ++ emit_insn (gen_aarch64_copy_ffr_to_ffrt ()); ++ rtx result = e.use_exact_insn (e.pred == PRED_z ++ ? CODE_FOR_aarch64_rdffr_z ++ : CODE_FOR_aarch64_rdffr); ++ emit_insn (gen_aarch64_update_ffrt ()); ++ return result; ++ } ++}; ++ ++class svreinterpret_impl : public quiet ++{ ++public: ++ gimple * ++ fold (gimple_folder &f) const OVERRIDE ++ { ++ /* Punt to rtl if the effect of the reinterpret on registers does not ++ conform to GCC's endianness model. */ ++ if (!targetm.can_change_mode_class (f.vector_mode (0), ++ f.vector_mode (1), FP_REGS)) ++ return NULL; ++ ++ /* Otherwise svreinterpret corresponds directly to a VIEW_CONVERT_EXPR ++ reinterpretation. */ ++ tree rhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (f.lhs), ++ gimple_call_arg (f.call, 0)); ++ return gimple_build_assign (f.lhs, VIEW_CONVERT_EXPR, rhs); ++ } ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ machine_mode mode = e.vector_mode (0); ++ return e.use_exact_insn (code_for_aarch64_sve_reinterpret (mode)); ++ } ++}; ++ ++class svrev_impl : public permute ++{ ++public: ++ gimple * ++ fold (gimple_folder &f) const OVERRIDE ++ { ++ /* Punt for now on _b16 and wider; we'd need more complex evpc logic ++ to rerecognize the result. */ ++ if (f.type_suffix (0).bool_p && f.type_suffix (0).element_bits > 8) ++ return NULL; ++ ++ /* Permute as { nelts - 1, nelts - 2, nelts - 3, ... }. */ ++ poly_int64 nelts = TYPE_VECTOR_SUBPARTS (TREE_TYPE (f.lhs)); ++ vec_perm_builder builder (nelts, 1, 3); ++ for (int i = 0; i < 3; ++i) ++ builder.quick_push (nelts - i - 1); ++ return fold_permute (f, builder); ++ } ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ return e.use_exact_insn (code_for_aarch64_sve_rev (e.vector_mode (0))); ++ } ++}; ++ ++class svsel_impl : public quiet ++{ ++public: ++ gimple * ++ fold (gimple_folder &f) const OVERRIDE ++ { ++ /* svsel corresponds exactly to VEC_COND_EXPR. 
*/ ++ gimple_seq stmts = NULL; ++ tree pred = f.convert_pred (stmts, f.vector_type (0), 0); ++ gsi_insert_seq_before (f.gsi, stmts, GSI_SAME_STMT); ++ return gimple_build_assign (f.lhs, VEC_COND_EXPR, pred, ++ gimple_call_arg (f.call, 1), ++ gimple_call_arg (f.call, 2)); ++ } ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ /* svsel (cond, truev, falsev) is vcond_mask (truev, falsev, cond). */ ++ e.rotate_inputs_left (0, 3); ++ insn_code icode = convert_optab_handler (vcond_mask_optab, ++ e.vector_mode (0), ++ e.gp_mode (0)); ++ return e.use_exact_insn (icode); ++ } ++}; ++ ++/* Implements svset2, svset3 and svset4. */ ++class svset_impl : public quiet ++{ ++public: ++ CONSTEXPR svset_impl (unsigned int vectors_per_tuple) ++ : quiet (vectors_per_tuple) {} ++ ++ gimple * ++ fold (gimple_folder &f) const OVERRIDE ++ { ++ tree rhs_tuple = gimple_call_arg (f.call, 0); ++ tree index = gimple_call_arg (f.call, 1); ++ tree rhs_vector = gimple_call_arg (f.call, 2); ++ ++ /* Replace the call with two statements: a copy of the full tuple ++ to the call result, followed by an update of the individual vector. ++ ++ The fold routines expect the replacement statement to have the ++ same lhs as the original call, so return the copy statement ++ rather than the field update. */ ++ gassign *copy = gimple_build_assign (unshare_expr (f.lhs), rhs_tuple); ++ ++ /* Get a reference to the individual vector. */ ++ tree field = tuple_type_field (TREE_TYPE (f.lhs)); ++ tree lhs_array = build3 (COMPONENT_REF, TREE_TYPE (field), ++ f.lhs, field, NULL_TREE); ++ tree lhs_vector = build4 (ARRAY_REF, TREE_TYPE (rhs_vector), ++ lhs_array, index, NULL_TREE, NULL_TREE); ++ gassign *update = gimple_build_assign (lhs_vector, rhs_vector); ++ gsi_insert_after (f.gsi, update, GSI_SAME_STMT); ++ ++ return copy; ++ } ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ rtx rhs_tuple = e.args[0]; ++ unsigned int index = INTVAL (e.args[1]); ++ rtx rhs_vector = e.args[2]; ++ ++ /* First copy the full tuple to the target register. */ ++ rtx lhs_tuple = e.get_nonoverlapping_reg_target (); ++ emit_move_insn (lhs_tuple, rhs_tuple); ++ ++ /* ...then update the individual vector. */ ++ rtx lhs_vector = simplify_gen_subreg (GET_MODE (rhs_vector), ++ lhs_tuple, GET_MODE (lhs_tuple), ++ index * BYTES_PER_SVE_VECTOR); ++ emit_move_insn (lhs_vector, rhs_vector); ++ return lhs_vector; ++ } ++}; ++ ++class svsetffr_impl : public function_base ++{ ++public: ++ unsigned int ++ call_properties (const function_instance &) const OVERRIDE ++ { ++ return CP_WRITE_FFR; ++ } ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ e.args.quick_push (CONSTM1_RTX (VNx16BImode)); ++ return e.use_exact_insn (CODE_FOR_aarch64_wrffr); ++ } ++}; ++ ++class svst1_impl : public full_width_access ++{ ++public: ++ unsigned int ++ call_properties (const function_instance &) const OVERRIDE ++ { ++ return CP_WRITE_MEMORY; ++ } ++ ++ gimple * ++ fold (gimple_folder &f) const OVERRIDE ++ { ++ tree vectype = f.vector_type (0); ++ ++ /* Get the predicate and base pointer. 
*/ ++ gimple_seq stmts = NULL; ++ tree pred = f.convert_pred (stmts, vectype, 0); ++ tree base = f.fold_contiguous_base (stmts, vectype); ++ gsi_insert_seq_before (f.gsi, stmts, GSI_SAME_STMT); ++ ++ tree cookie = f.load_store_cookie (TREE_TYPE (vectype)); ++ tree rhs = gimple_call_arg (f.call, gimple_call_num_args (f.call) - 1); ++ return gimple_build_call_internal (IFN_MASK_STORE, 4, ++ base, cookie, pred, rhs); ++ } ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ insn_code icode = convert_optab_handler (maskstore_optab, ++ e.vector_mode (0), e.gp_mode (0)); ++ return e.use_contiguous_store_insn (icode); ++ } ++}; ++ ++class svst1_scatter_impl : public full_width_access ++{ ++public: ++ unsigned int ++ call_properties (const function_instance &) const OVERRIDE ++ { ++ return CP_WRITE_MEMORY; ++ } ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ e.prepare_gather_address_operands (1); ++ /* Put the predicate last, as required by mask_scatter_store_optab. */ ++ e.rotate_inputs_left (0, 6); ++ insn_code icode = direct_optab_handler (mask_scatter_store_optab, ++ e.memory_vector_mode ()); ++ return e.use_exact_insn (icode); ++ } ++}; ++ ++/* Implements truncating forms of svst1_scatter. */ ++class svst1_scatter_truncate_impl : public truncating_store ++{ ++public: ++ CONSTEXPR svst1_scatter_truncate_impl (scalar_int_mode to_mode) ++ : truncating_store (to_mode) {} ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ e.prepare_gather_address_operands (1); ++ /* Put the predicate last, since the truncating scatters use the same ++ operand order as mask_scatter_store_optab. */ ++ e.rotate_inputs_left (0, 6); ++ insn_code icode = code_for_aarch64_scatter_store_trunc ++ (e.memory_vector_mode (), e.vector_mode (0)); ++ return e.use_exact_insn (icode); ++ } ++}; ++ ++/* Implements truncating contiguous forms of svst1. */ ++class svst1_truncate_impl : public truncating_store ++{ ++public: ++ CONSTEXPR svst1_truncate_impl (scalar_int_mode to_mode) ++ : truncating_store (to_mode) {} ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ insn_code icode = code_for_aarch64_store_trunc (e.memory_vector_mode (), ++ e.vector_mode (0)); ++ return e.use_contiguous_store_insn (icode); ++ } ++}; ++ ++/* Implements svst2, svst3 and svst4. */ ++class svst234_impl : public full_width_access ++{ ++public: ++ CONSTEXPR svst234_impl (unsigned int vectors_per_tuple) ++ : full_width_access (vectors_per_tuple) {} ++ ++ unsigned int ++ call_properties (const function_instance &) const OVERRIDE ++ { ++ return CP_WRITE_MEMORY; ++ } ++ ++ gimple * ++ fold (gimple_folder &f) const OVERRIDE ++ { ++ tree vectype = f.vector_type (0); ++ ++ /* Get the predicate and base pointer. */ ++ gimple_seq stmts = NULL; ++ tree pred = f.convert_pred (stmts, vectype, 0); ++ tree base = f.fold_contiguous_base (stmts, vectype); ++ gsi_insert_seq_before (f.gsi, stmts, GSI_SAME_STMT); ++ ++ /* View the stored data as an array of vectors. 
*/ ++ unsigned int num_args = gimple_call_num_args (f.call); ++ tree rhs_tuple = gimple_call_arg (f.call, num_args - 1); ++ tree field = tuple_type_field (TREE_TYPE (rhs_tuple)); ++ tree rhs_array = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (field), rhs_tuple); ++ ++ tree cookie = f.load_store_cookie (TREE_TYPE (vectype)); ++ return gimple_build_call_internal (IFN_MASK_STORE_LANES, 4, ++ base, cookie, pred, rhs_array); ++ } ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ machine_mode tuple_mode = GET_MODE (e.args.last ()); ++ insn_code icode = convert_optab_handler (vec_mask_store_lanes_optab, ++ tuple_mode, e.vector_mode (0)); ++ return e.use_contiguous_store_insn (icode); ++ } ++}; ++ ++class svstnt1_impl : public full_width_access ++{ ++public: ++ unsigned int ++ call_properties (const function_instance &) const OVERRIDE ++ { ++ return CP_WRITE_MEMORY; ++ } ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ insn_code icode = code_for_aarch64_stnt1 (e.vector_mode (0)); ++ return e.use_contiguous_store_insn (icode); ++ } ++}; ++ ++class svsub_impl : public rtx_code_function ++{ ++public: ++ CONSTEXPR svsub_impl () ++ : rtx_code_function (MINUS, MINUS, UNSPEC_COND_FSUB) {} ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ /* Canonicalize subtractions of constants to additions. */ ++ machine_mode mode = e.vector_mode (0); ++ if (e.try_negating_argument (2, mode)) ++ return e.map_to_rtx_codes (PLUS, PLUS, UNSPEC_COND_FADD); ++ ++ return rtx_code_function::expand (e); ++ } ++}; ++ ++class svtbl_impl : public permute ++{ ++public: ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ return e.use_exact_insn (code_for_aarch64_sve_tbl (e.vector_mode (0))); ++ } ++}; ++ ++/* Implements svtrn1 and svtrn2. */ ++class svtrn_impl : public binary_permute ++{ ++public: ++ CONSTEXPR svtrn_impl (int base) ++ : binary_permute (base ? UNSPEC_TRN2 : UNSPEC_TRN1), m_base (base) {} ++ ++ gimple * ++ fold (gimple_folder &f) const OVERRIDE ++ { ++ /* svtrn1: { 0, nelts, 2, nelts + 2, 4, nelts + 4, ... } ++ svtrn2: as for svtrn1, but with 1 added to each index. */ ++ poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (TREE_TYPE (f.lhs)); ++ vec_perm_builder builder (nelts, 2, 3); ++ for (unsigned int i = 0; i < 3; ++i) ++ { ++ builder.quick_push (m_base + i * 2); ++ builder.quick_push (m_base + i * 2 + nelts); ++ } ++ return fold_permute (f, builder); ++ } ++ ++ /* 0 for svtrn1, 1 for svtrn2. */ ++ unsigned int m_base; ++}; ++ ++/* Base class for svundef{,2,3,4}. */ ++class svundef_impl : public quiet ++{ ++public: ++ CONSTEXPR svundef_impl (unsigned int vectors_per_tuple) ++ : quiet (vectors_per_tuple) {} ++ ++ gimple * ++ fold (gimple_folder &f) const OVERRIDE ++ { ++ /* Don't fold svundef at the gimple level. There's no exact ++ correspondence for SSA_NAMEs, and we explicitly don't want ++ to generate a specific value (like an all-zeros vector). */ ++ if (vectors_per_tuple () == 1) ++ return NULL; ++ return gimple_build_assign (f.lhs, build_clobber (TREE_TYPE (f.lhs))); ++ } ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ rtx target = e.get_reg_target (); ++ emit_clobber (copy_rtx (target)); ++ return target; ++ } ++}; ++ ++/* Implements svunpklo and svunpkhi. */ ++class svunpk_impl : public quiet ++{ ++public: ++ CONSTEXPR svunpk_impl (bool high_p) : m_high_p (high_p) {} ++ ++ gimple * ++ fold (gimple_folder &f) const OVERRIDE ++ { ++ /* Don't fold the predicate ops, since every bit of the svbool_t ++ result is significant. 
*/ ++ if (f.type_suffix_ids[0] == TYPE_SUFFIX_b) ++ return NULL; ++ ++ /* The first half in memory is VEC_UNPACK_LO_EXPR for little-endian ++ and VEC_UNPACK_HI_EXPR for big-endian. */ ++ bool high_p = BYTES_BIG_ENDIAN ? !m_high_p : m_high_p; ++ tree_code code = high_p ? VEC_UNPACK_HI_EXPR : VEC_UNPACK_LO_EXPR; ++ return gimple_build_assign (f.lhs, code, gimple_call_arg (f.call, 0)); ++ } ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ machine_mode mode = GET_MODE (e.args[0]); ++ unsigned int unpacku = m_high_p ? UNSPEC_UNPACKUHI : UNSPEC_UNPACKULO; ++ unsigned int unpacks = m_high_p ? UNSPEC_UNPACKSHI : UNSPEC_UNPACKSLO; ++ insn_code icode; ++ if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL) ++ icode = code_for_aarch64_sve_punpk (unpacku, mode); ++ else ++ { ++ int unspec = e.type_suffix (0).unsigned_p ? unpacku : unpacks; ++ icode = code_for_aarch64_sve_unpk (unspec, unspec, mode); ++ } ++ return e.use_exact_insn (icode); ++ } ++ ++ /* True for svunpkhi, false for svunpklo. */ ++ bool m_high_p; ++}; ++ ++/* Also implements svsudot. */ ++class svusdot_impl : public function_base ++{ ++public: ++ CONSTEXPR svusdot_impl (bool su) : m_su (su) {} ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ /* The implementation of the ACLE function svsudot (for the non-lane ++ version) is through the USDOT instruction but with the second and third ++ inputs swapped. */ ++ if (m_su) ++ e.rotate_inputs_left (1, 2); ++ /* The ACLE function has the same order requirements as for svdot. ++ While there's no requirement for the RTL pattern to have the same sort ++ of order as that for dot_prod, it's easier to read. ++ Hence we do the same rotation on arguments as svdot_impl does. */ ++ e.rotate_inputs_left (0, 3); ++ machine_mode mode = e.vector_mode (0); ++ insn_code icode = code_for_aarch64_dot_prod (UNSPEC_USDOT, mode); ++ return e.use_exact_insn (icode); ++ } ++ ++private: ++ bool m_su; ++}; ++ ++/* Implements svuzp1 and svuzp2. */ ++class svuzp_impl : public binary_permute ++{ ++public: ++ CONSTEXPR svuzp_impl (unsigned int base) ++ : binary_permute (base ? UNSPEC_UZP2 : UNSPEC_UZP1), m_base (base) {} ++ ++ gimple * ++ fold (gimple_folder &f) const OVERRIDE ++ { ++ /* svuzp1: { 0, 2, 4, 6, ... } ++ svuzp2: { 1, 3, 5, 7, ... }. */ ++ poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (TREE_TYPE (f.lhs)); ++ vec_perm_builder builder (nelts, 1, 3); ++ for (unsigned int i = 0; i < 3; ++i) ++ builder.quick_push (m_base + i * 2); ++ return fold_permute (f, builder); ++ } ++ ++ /* 0 for svuzp1, 1 for svuzp2. */ ++ unsigned int m_base; ++}; ++ ++/* A function_base for svwhilele and svwhilelt functions. */ ++class svwhile_impl : public function_base ++{ ++public: ++ CONSTEXPR svwhile_impl (int unspec_for_sint, int unspec_for_uint, bool eq_p) ++ : m_unspec_for_sint (unspec_for_sint), ++ m_unspec_for_uint (unspec_for_uint), m_eq_p (eq_p) ++ {} ++ ++ /* Try to fold a call by treating its arguments as constants of type T. */ ++ template ++ gimple * ++ fold_type (gimple_folder &f) const ++ { ++ /* Only handle cases in which both operands are constant. */ ++ T arg0, arg1; ++ if (!poly_int_tree_p (gimple_call_arg (f.call, 0), &arg0) ++ || !poly_int_tree_p (gimple_call_arg (f.call, 1), &arg1)) ++ return NULL; ++ ++ /* Check whether the result is known to be all-false. */ ++ if (m_eq_p ? known_gt (arg0, arg1) : known_ge (arg0, arg1)) ++ return f.fold_to_pfalse (); ++ ++ /* Punt if we can't tell at compile time whether the result ++ is all-false. */ ++ if (m_eq_p ? 
maybe_gt (arg0, arg1) : maybe_ge (arg0, arg1)) ++ return NULL; ++ ++ /* At this point we know the result has at least one set element. */ ++ poly_uint64 diff = arg1 - arg0; ++ poly_uint64 nelts = GET_MODE_NUNITS (f.vector_mode (0)); ++ ++ /* Canonicalize the svwhilele form to the svwhilelt form. Subtract ++ from NELTS rather than adding to DIFF, to prevent overflow. */ ++ if (m_eq_p) ++ nelts -= 1; ++ ++ /* Check whether the result is known to be all-true. */ ++ if (known_ge (diff, nelts)) ++ return f.fold_to_ptrue (); ++ ++ /* Punt if DIFF might not be the actual number of set elements ++ in the result. Conditional equality is fine. */ ++ if (maybe_gt (diff, nelts)) ++ return NULL; ++ ++ /* At this point we know that the predicate will have DIFF set elements ++ for svwhilelt and DIFF + 1 set elements for svwhilele (which stops ++ after rather than before ARG1 is reached). See if we can create ++ the predicate at compile time. */ ++ unsigned HOST_WIDE_INT vl; ++ if (diff.is_constant (&vl)) ++ /* Overflow is no longer possible after the checks above. */ ++ return f.fold_to_vl_pred (m_eq_p ? vl + 1 : vl); ++ ++ return NULL; ++ } ++ ++ gimple * ++ fold (gimple_folder &f) const OVERRIDE ++ { ++ if (f.type_suffix (1).unsigned_p) ++ return fold_type (f); ++ else ++ return fold_type (f); ++ } ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ /* Suffix 0 determines the predicate mode, suffix 1 determines the ++ scalar mode and signedness. */ ++ int unspec = (e.type_suffix (1).unsigned_p ++ ? m_unspec_for_uint ++ : m_unspec_for_sint); ++ machine_mode pred_mode = e.vector_mode (0); ++ scalar_mode reg_mode = GET_MODE_INNER (e.vector_mode (1)); ++ return e.use_exact_insn (code_for_while (unspec, reg_mode, pred_mode)); ++ } ++ ++ /* The unspec codes associated with signed and unsigned operations ++ respectively. */ ++ int m_unspec_for_sint; ++ int m_unspec_for_uint; ++ ++ /* True svwhilele, false for svwhilelt. */ ++ bool m_eq_p; ++}; ++ ++class svwrffr_impl : public function_base ++{ ++public: ++ unsigned int ++ call_properties (const function_instance &) const OVERRIDE ++ { ++ return CP_WRITE_FFR; ++ } ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ return e.use_exact_insn (CODE_FOR_aarch64_wrffr); ++ } ++}; ++ ++/* Implements svzip1 and svzip2. */ ++class svzip_impl : public binary_permute ++{ ++public: ++ CONSTEXPR svzip_impl (unsigned int base) ++ : binary_permute (base ? UNSPEC_ZIP2 : UNSPEC_ZIP1), m_base (base) {} ++ ++ gimple * ++ fold (gimple_folder &f) const OVERRIDE ++ { ++ /* svzip1: { 0, nelts, 1, nelts + 1, 2, nelts + 2, ... } ++ svzip2: as for svzip1, but with nelts / 2 added to each index. */ ++ poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (TREE_TYPE (f.lhs)); ++ poly_uint64 base = m_base * exact_div (nelts, 2); ++ vec_perm_builder builder (nelts, 2, 3); ++ for (unsigned int i = 0; i < 3; ++i) ++ { ++ builder.quick_push (base + i); ++ builder.quick_push (base + i + nelts); ++ } ++ return fold_permute (f, builder); ++ } ++ ++ /* 0 for svzip1, 1 for svzip2. 
*/ ++ unsigned int m_base; ++}; ++ ++} /* end anonymous namespace */ ++ ++namespace aarch64_sve { ++ ++FUNCTION (svabd, svabd_impl,) ++FUNCTION (svabs, quiet, (ABS, ABS, UNSPEC_COND_FABS)) ++FUNCTION (svacge, svac_impl, (UNSPEC_COND_FCMGE)) ++FUNCTION (svacgt, svac_impl, (UNSPEC_COND_FCMGT)) ++FUNCTION (svacle, svac_impl, (UNSPEC_COND_FCMLE)) ++FUNCTION (svaclt, svac_impl, (UNSPEC_COND_FCMLT)) ++FUNCTION (svadd, rtx_code_function, (PLUS, PLUS, UNSPEC_COND_FADD)) ++FUNCTION (svadda, svadda_impl,) ++FUNCTION (svaddv, reduction, (UNSPEC_SADDV, UNSPEC_UADDV, UNSPEC_FADDV)) ++FUNCTION (svadrb, svadr_bhwd_impl, (0)) ++FUNCTION (svadrd, svadr_bhwd_impl, (3)) ++FUNCTION (svadrh, svadr_bhwd_impl, (1)) ++FUNCTION (svadrw, svadr_bhwd_impl, (2)) ++FUNCTION (svand, rtx_code_function, (AND, AND)) ++FUNCTION (svandv, reduction, (UNSPEC_ANDV)) ++FUNCTION (svasr, rtx_code_function, (ASHIFTRT, ASHIFTRT)) ++FUNCTION (svasr_wide, shift_wide, (ASHIFTRT, UNSPEC_ASHIFTRT_WIDE)) ++FUNCTION (svasrd, svasrd_impl,) ++FUNCTION (svbfdot, fixed_insn_function, (CODE_FOR_aarch64_sve_bfdotvnx4sf)) ++FUNCTION (svbfdot_lane, fixed_insn_function, ++ (CODE_FOR_aarch64_sve_bfdot_lanevnx4sf)) ++FUNCTION (svbfmlalb, fixed_insn_function, (CODE_FOR_aarch64_sve_bfmlalbvnx4sf)) ++FUNCTION (svbfmlalb_lane, fixed_insn_function, ++ (CODE_FOR_aarch64_sve_bfmlalb_lanevnx4sf)) ++FUNCTION (svbfmlalt, fixed_insn_function, (CODE_FOR_aarch64_sve_bfmlaltvnx4sf)) ++FUNCTION (svbfmlalt_lane, fixed_insn_function, ++ (CODE_FOR_aarch64_sve_bfmlalt_lanevnx4sf)) ++FUNCTION (svbfmmla, fixed_insn_function, (CODE_FOR_aarch64_sve_bfmmlavnx4sf)) ++FUNCTION (svbic, svbic_impl,) ++FUNCTION (svbrka, svbrk_unary_impl, (UNSPEC_BRKA)) ++FUNCTION (svbrkb, svbrk_unary_impl, (UNSPEC_BRKB)) ++FUNCTION (svbrkn, svbrk_binary_impl, (UNSPEC_BRKN)) ++FUNCTION (svbrkpa, svbrk_binary_impl, (UNSPEC_BRKPA)) ++FUNCTION (svbrkpb, svbrk_binary_impl, (UNSPEC_BRKPB)) ++FUNCTION (svcadd, svcadd_impl,) ++FUNCTION (svclasta, svclast_impl, (UNSPEC_CLASTA)) ++FUNCTION (svclastb, svclast_impl, (UNSPEC_CLASTB)) ++FUNCTION (svcls, unary_count, (CLRSB)) ++FUNCTION (svclz, unary_count, (CLZ)) ++FUNCTION (svcmla, svcmla_impl,) ++FUNCTION (svcmla_lane, svcmla_lane_impl,) ++FUNCTION (svcmpeq, svcmp_impl, (EQ_EXPR, UNSPEC_COND_FCMEQ)) ++FUNCTION (svcmpeq_wide, svcmp_wide_impl, (EQ_EXPR, UNSPEC_COND_CMPEQ_WIDE, ++ UNSPEC_COND_CMPEQ_WIDE)) ++FUNCTION (svcmpge, svcmp_impl, (GE_EXPR, UNSPEC_COND_FCMGE)) ++FUNCTION (svcmpge_wide, svcmp_wide_impl, (GE_EXPR, UNSPEC_COND_CMPGE_WIDE, ++ UNSPEC_COND_CMPHS_WIDE)) ++FUNCTION (svcmpgt, svcmp_impl, (GT_EXPR, UNSPEC_COND_FCMGT)) ++FUNCTION (svcmpgt_wide, svcmp_wide_impl, (GT_EXPR, UNSPEC_COND_CMPGT_WIDE, ++ UNSPEC_COND_CMPHI_WIDE)) ++FUNCTION (svcmple, svcmp_impl, (LE_EXPR, UNSPEC_COND_FCMLE)) ++FUNCTION (svcmple_wide, svcmp_wide_impl, (LE_EXPR, UNSPEC_COND_CMPLE_WIDE, ++ UNSPEC_COND_CMPLS_WIDE)) ++FUNCTION (svcmplt, svcmp_impl, (LT_EXPR, UNSPEC_COND_FCMLT)) ++FUNCTION (svcmplt_wide, svcmp_wide_impl, (LT_EXPR, UNSPEC_COND_CMPLT_WIDE, ++ UNSPEC_COND_CMPLO_WIDE)) ++FUNCTION (svcmpne, svcmp_impl, (NE_EXPR, UNSPEC_COND_FCMNE)) ++FUNCTION (svcmpne_wide, svcmp_wide_impl, (NE_EXPR, UNSPEC_COND_CMPNE_WIDE, ++ UNSPEC_COND_CMPNE_WIDE)) ++FUNCTION (svcmpuo, svcmpuo_impl,) ++FUNCTION (svcnot, svcnot_impl,) ++FUNCTION (svcnt, unary_count, (POPCOUNT)) ++FUNCTION (svcntb, svcnt_bhwd_impl, (VNx16QImode)) ++FUNCTION (svcntb_pat, svcnt_bhwd_pat_impl, (VNx16QImode)) ++FUNCTION (svcntd, svcnt_bhwd_impl, (VNx2DImode)) ++FUNCTION (svcntd_pat, svcnt_bhwd_pat_impl, (VNx2DImode)) 
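For reference, a minimal ACLE sketch of the svcnt[bhwd]{,_pat} entries registered above (illustrative only, not part of the patch; it assumes a toolchain that provides <arm_sve.h> and accepts -march=armv8.2-a+sve):

#include <arm_sve.h>

/* svcntb () returns the number of 8-bit elements in an SVE vector (the
   vector length in bytes); svcntb_pat () counts only the elements selected
   by the given pattern.  With the folds implemented in this file, such
   calls may become compile-time constants when the vector length is
   known.  */
uint64_t bytes_per_vector (void) { return svcntb (); }
uint64_t bytes_in_vl4_pattern (void) { return svcntb_pat (SV_VL4); }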
++FUNCTION (svcnth, svcnt_bhwd_impl, (VNx8HImode)) ++FUNCTION (svcnth_pat, svcnt_bhwd_pat_impl, (VNx8HImode)) ++FUNCTION (svcntp, svcntp_impl,) ++FUNCTION (svcntw, svcnt_bhwd_impl, (VNx4SImode)) ++FUNCTION (svcntw_pat, svcnt_bhwd_pat_impl, (VNx4SImode)) ++FUNCTION (svcompact, QUIET_CODE_FOR_MODE0 (aarch64_sve_compact),) ++FUNCTION (svcreate2, svcreate_impl, (2)) ++FUNCTION (svcreate3, svcreate_impl, (3)) ++FUNCTION (svcreate4, svcreate_impl, (4)) ++FUNCTION (svcvt, svcvt_impl,) ++FUNCTION (svcvtnt, CODE_FOR_MODE0 (aarch64_sve_cvtnt),) ++FUNCTION (svdiv, rtx_code_function, (DIV, UDIV, UNSPEC_COND_FDIV)) ++FUNCTION (svdivr, rtx_code_function_rotated, (DIV, UDIV, UNSPEC_COND_FDIV)) ++FUNCTION (svdot, svdot_impl,) ++FUNCTION (svdot_lane, svdotprod_lane_impl, (UNSPEC_SDOT, UNSPEC_UDOT, -1)) ++FUNCTION (svdup, svdup_impl,) ++FUNCTION (svdup_lane, svdup_lane_impl,) ++FUNCTION (svdupq, svdupq_impl,) ++FUNCTION (svdupq_lane, svdupq_lane_impl,) ++FUNCTION (sveor, rtx_code_function, (XOR, XOR, -1)) ++FUNCTION (sveorv, reduction, (UNSPEC_XORV)) ++FUNCTION (svexpa, unspec_based_function, (-1, -1, UNSPEC_FEXPA)) ++FUNCTION (svext, QUIET_CODE_FOR_MODE0 (aarch64_sve_ext),) ++FUNCTION (svextb, svext_bhw_impl, (QImode)) ++FUNCTION (svexth, svext_bhw_impl, (HImode)) ++FUNCTION (svextw, svext_bhw_impl, (SImode)) ++FUNCTION (svget2, svget_impl, (2)) ++FUNCTION (svget3, svget_impl, (3)) ++FUNCTION (svget4, svget_impl, (4)) ++FUNCTION (svindex, svindex_impl,) ++FUNCTION (svinsr, svinsr_impl,) ++FUNCTION (svlasta, svlast_impl, (UNSPEC_LASTA)) ++FUNCTION (svlastb, svlast_impl, (UNSPEC_LASTB)) ++FUNCTION (svld1, svld1_impl,) ++FUNCTION (svld1_gather, svld1_gather_impl,) ++FUNCTION (svld1ro, svld1ro_impl,) ++FUNCTION (svld1rq, svld1rq_impl,) ++FUNCTION (svld1sb, svld1_extend_impl, (TYPE_SUFFIX_s8)) ++FUNCTION (svld1sb_gather, svld1_gather_extend_impl, (TYPE_SUFFIX_s8)) ++FUNCTION (svld1sh, svld1_extend_impl, (TYPE_SUFFIX_s16)) ++FUNCTION (svld1sh_gather, svld1_gather_extend_impl, (TYPE_SUFFIX_s16)) ++FUNCTION (svld1sw, svld1_extend_impl, (TYPE_SUFFIX_s32)) ++FUNCTION (svld1sw_gather, svld1_gather_extend_impl, (TYPE_SUFFIX_s32)) ++FUNCTION (svld1ub, svld1_extend_impl, (TYPE_SUFFIX_u8)) ++FUNCTION (svld1ub_gather, svld1_gather_extend_impl, (TYPE_SUFFIX_u8)) ++FUNCTION (svld1uh, svld1_extend_impl, (TYPE_SUFFIX_u16)) ++FUNCTION (svld1uh_gather, svld1_gather_extend_impl, (TYPE_SUFFIX_u16)) ++FUNCTION (svld1uw, svld1_extend_impl, (TYPE_SUFFIX_u32)) ++FUNCTION (svld1uw_gather, svld1_gather_extend_impl, (TYPE_SUFFIX_u32)) ++FUNCTION (svld2, svld234_impl, (2)) ++FUNCTION (svld3, svld234_impl, (3)) ++FUNCTION (svld4, svld234_impl, (4)) ++FUNCTION (svldff1, svldxf1_impl, (UNSPEC_LDFF1)) ++FUNCTION (svldff1_gather, svldff1_gather_impl,) ++FUNCTION (svldff1sb, svldxf1_extend_impl, (TYPE_SUFFIX_s8, UNSPEC_LDFF1)) ++FUNCTION (svldff1sb_gather, svldff1_gather_extend, (TYPE_SUFFIX_s8)) ++FUNCTION (svldff1sh, svldxf1_extend_impl, (TYPE_SUFFIX_s16, UNSPEC_LDFF1)) ++FUNCTION (svldff1sh_gather, svldff1_gather_extend, (TYPE_SUFFIX_s16)) ++FUNCTION (svldff1sw, svldxf1_extend_impl, (TYPE_SUFFIX_s32, UNSPEC_LDFF1)) ++FUNCTION (svldff1sw_gather, svldff1_gather_extend, (TYPE_SUFFIX_s32)) ++FUNCTION (svldff1ub, svldxf1_extend_impl, (TYPE_SUFFIX_u8, UNSPEC_LDFF1)) ++FUNCTION (svldff1ub_gather, svldff1_gather_extend, (TYPE_SUFFIX_u8)) ++FUNCTION (svldff1uh, svldxf1_extend_impl, (TYPE_SUFFIX_u16, UNSPEC_LDFF1)) ++FUNCTION (svldff1uh_gather, svldff1_gather_extend, (TYPE_SUFFIX_u16)) ++FUNCTION (svldff1uw, svldxf1_extend_impl, (TYPE_SUFFIX_u32, 
UNSPEC_LDFF1)) ++FUNCTION (svldff1uw_gather, svldff1_gather_extend, (TYPE_SUFFIX_u32)) ++FUNCTION (svldnf1, svldxf1_impl, (UNSPEC_LDNF1)) ++FUNCTION (svldnf1sb, svldxf1_extend_impl, (TYPE_SUFFIX_s8, UNSPEC_LDNF1)) ++FUNCTION (svldnf1sh, svldxf1_extend_impl, (TYPE_SUFFIX_s16, UNSPEC_LDNF1)) ++FUNCTION (svldnf1sw, svldxf1_extend_impl, (TYPE_SUFFIX_s32, UNSPEC_LDNF1)) ++FUNCTION (svldnf1ub, svldxf1_extend_impl, (TYPE_SUFFIX_u8, UNSPEC_LDNF1)) ++FUNCTION (svldnf1uh, svldxf1_extend_impl, (TYPE_SUFFIX_u16, UNSPEC_LDNF1)) ++FUNCTION (svldnf1uw, svldxf1_extend_impl, (TYPE_SUFFIX_u32, UNSPEC_LDNF1)) ++FUNCTION (svldnt1, svldnt1_impl,) ++FUNCTION (svlen, svlen_impl,) ++FUNCTION (svlsl, rtx_code_function, (ASHIFT, ASHIFT)) ++FUNCTION (svlsl_wide, shift_wide, (ASHIFT, UNSPEC_ASHIFT_WIDE)) ++FUNCTION (svlsr, rtx_code_function, (LSHIFTRT, LSHIFTRT)) ++FUNCTION (svlsr_wide, shift_wide, (LSHIFTRT, UNSPEC_LSHIFTRT_WIDE)) ++FUNCTION (svmad, svmad_impl,) ++FUNCTION (svmax, rtx_code_function, (SMAX, UMAX, UNSPEC_COND_FMAX)) ++FUNCTION (svmaxnm, unspec_based_function, (-1, -1, UNSPEC_COND_FMAXNM)) ++FUNCTION (svmaxnmv, reduction, (UNSPEC_FMAXNMV)) ++FUNCTION (svmaxv, reduction, (UNSPEC_SMAXV, UNSPEC_UMAXV, UNSPEC_FMAXV)) ++FUNCTION (svmin, rtx_code_function, (SMIN, UMIN, UNSPEC_COND_FMIN)) ++FUNCTION (svminnm, unspec_based_function, (-1, -1, UNSPEC_COND_FMINNM)) ++FUNCTION (svminnmv, reduction, (UNSPEC_FMINNMV)) ++FUNCTION (svminv, reduction, (UNSPEC_SMINV, UNSPEC_UMINV, UNSPEC_FMINV)) ++FUNCTION (svmla, svmla_impl,) ++FUNCTION (svmla_lane, svmla_svmls_lane_impl, (UNSPEC_FMLA)) ++FUNCTION (svmls, svmls_impl,) ++FUNCTION (svmls_lane, svmla_svmls_lane_impl, (UNSPEC_FMLS)) ++FUNCTION (svmmla, svmmla_impl,) ++FUNCTION (svmov, svmov_impl,) ++FUNCTION (svmsb, svmsb_impl,) ++FUNCTION (svmul, rtx_code_function, (MULT, MULT, UNSPEC_COND_FMUL)) ++FUNCTION (svmul_lane, CODE_FOR_MODE0 (aarch64_mul_lane),) ++FUNCTION (svmulh, unspec_based_function, (UNSPEC_SMUL_HIGHPART, ++ UNSPEC_UMUL_HIGHPART, -1)) ++FUNCTION (svmulx, unspec_based_function, (-1, -1, UNSPEC_COND_FMULX)) ++FUNCTION (svnand, svnand_impl,) ++FUNCTION (svneg, quiet, (NEG, NEG, UNSPEC_COND_FNEG)) ++FUNCTION (svnmad, unspec_based_function, (-1, -1, UNSPEC_COND_FNMLA)) ++FUNCTION (svnmla, unspec_based_function_rotated, (-1, -1, UNSPEC_COND_FNMLA)) ++FUNCTION (svnmls, unspec_based_function_rotated, (-1, -1, UNSPEC_COND_FNMLS)) ++FUNCTION (svnmsb, unspec_based_function, (-1, -1, UNSPEC_COND_FNMLS)) ++FUNCTION (svnor, svnor_impl,) ++FUNCTION (svnot, svnot_impl,) ++FUNCTION (svorn, svorn_impl,) ++FUNCTION (svorr, rtx_code_function, (IOR, IOR)) ++FUNCTION (svorv, reduction, (UNSPEC_IORV)) ++FUNCTION (svpfalse, svpfalse_impl,) ++FUNCTION (svpfirst, svpfirst_svpnext_impl, (UNSPEC_PFIRST)) ++FUNCTION (svpnext, svpfirst_svpnext_impl, (UNSPEC_PNEXT)) ++FUNCTION (svprfb, svprf_bhwd_impl, (VNx16QImode)) ++FUNCTION (svprfb_gather, svprf_bhwd_gather_impl, (VNx16QImode)) ++FUNCTION (svprfd, svprf_bhwd_impl, (VNx2DImode)) ++FUNCTION (svprfd_gather, svprf_bhwd_gather_impl, (VNx2DImode)) ++FUNCTION (svprfh, svprf_bhwd_impl, (VNx8HImode)) ++FUNCTION (svprfh_gather, svprf_bhwd_gather_impl, (VNx8HImode)) ++FUNCTION (svprfw, svprf_bhwd_impl, (VNx4SImode)) ++FUNCTION (svprfw_gather, svprf_bhwd_gather_impl, (VNx4SImode)) ++FUNCTION (svptest_any, svptest_impl, (NE)) ++FUNCTION (svptest_first, svptest_impl, (LT)) ++FUNCTION (svptest_last, svptest_impl, (LTU)) ++FUNCTION (svptrue, svptrue_impl,) ++FUNCTION (svptrue_pat, svptrue_pat_impl,) ++FUNCTION (svqadd, svqadd_impl,) ++FUNCTION 
(svqdecb, svqdec_bhwd_impl, (QImode)) ++FUNCTION (svqdecb_pat, svqdec_bhwd_impl, (QImode)) ++FUNCTION (svqdecd, svqdec_bhwd_impl, (DImode)) ++FUNCTION (svqdecd_pat, svqdec_bhwd_impl, (DImode)) ++FUNCTION (svqdech, svqdec_bhwd_impl, (HImode)) ++FUNCTION (svqdech_pat, svqdec_bhwd_impl, (HImode)) ++FUNCTION (svqdecp, svqdecp_svqincp_impl, (SS_MINUS, US_MINUS)) ++FUNCTION (svqdecw, svqdec_bhwd_impl, (SImode)) ++FUNCTION (svqdecw_pat, svqdec_bhwd_impl, (SImode)) ++FUNCTION (svqincb, svqinc_bhwd_impl, (QImode)) ++FUNCTION (svqincb_pat, svqinc_bhwd_impl, (QImode)) ++FUNCTION (svqincd, svqinc_bhwd_impl, (DImode)) ++FUNCTION (svqincd_pat, svqinc_bhwd_impl, (DImode)) ++FUNCTION (svqinch, svqinc_bhwd_impl, (HImode)) ++FUNCTION (svqinch_pat, svqinc_bhwd_impl, (HImode)) ++FUNCTION (svqincp, svqdecp_svqincp_impl, (SS_PLUS, US_PLUS)) ++FUNCTION (svqincw, svqinc_bhwd_impl, (SImode)) ++FUNCTION (svqincw_pat, svqinc_bhwd_impl, (SImode)) ++FUNCTION (svqsub, svqsub_impl,) ++FUNCTION (svrbit, unspec_based_function, (UNSPEC_RBIT, UNSPEC_RBIT, -1)) ++FUNCTION (svrdffr, svrdffr_impl,) ++FUNCTION (svrecpe, unspec_based_function, (-1, -1, UNSPEC_FRECPE)) ++FUNCTION (svrecps, unspec_based_function, (-1, -1, UNSPEC_FRECPS)) ++FUNCTION (svrecpx, unspec_based_function, (-1, -1, UNSPEC_COND_FRECPX)) ++FUNCTION (svreinterpret, svreinterpret_impl,) ++FUNCTION (svrev, svrev_impl,) ++FUNCTION (svrevb, unspec_based_function, (UNSPEC_REVB, UNSPEC_REVB, -1)) ++FUNCTION (svrevh, unspec_based_function, (UNSPEC_REVH, UNSPEC_REVH, -1)) ++FUNCTION (svrevw, unspec_based_function, (UNSPEC_REVW, UNSPEC_REVW, -1)) ++FUNCTION (svrinta, unspec_based_function, (-1, -1, UNSPEC_COND_FRINTA)) ++FUNCTION (svrinti, unspec_based_function, (-1, -1, UNSPEC_COND_FRINTI)) ++FUNCTION (svrintm, unspec_based_function, (-1, -1, UNSPEC_COND_FRINTM)) ++FUNCTION (svrintn, unspec_based_function, (-1, -1, UNSPEC_COND_FRINTN)) ++FUNCTION (svrintp, unspec_based_function, (-1, -1, UNSPEC_COND_FRINTP)) ++FUNCTION (svrintx, unspec_based_function, (-1, -1, UNSPEC_COND_FRINTX)) ++FUNCTION (svrintz, unspec_based_function, (-1, -1, UNSPEC_COND_FRINTZ)) ++FUNCTION (svrsqrte, unspec_based_function, (-1, -1, UNSPEC_RSQRTE)) ++FUNCTION (svrsqrts, unspec_based_function, (-1, -1, UNSPEC_RSQRTS)) ++FUNCTION (svscale, unspec_based_function, (-1, -1, UNSPEC_COND_FSCALE)) ++FUNCTION (svsel, svsel_impl,) ++FUNCTION (svset2, svset_impl, (2)) ++FUNCTION (svset3, svset_impl, (3)) ++FUNCTION (svset4, svset_impl, (4)) ++FUNCTION (svsetffr, svsetffr_impl,) ++FUNCTION (svsplice, QUIET_CODE_FOR_MODE0 (aarch64_sve_splice),) ++FUNCTION (svsqrt, rtx_code_function, (SQRT, SQRT, UNSPEC_COND_FSQRT)) ++FUNCTION (svst1, svst1_impl,) ++FUNCTION (svst1_scatter, svst1_scatter_impl,) ++FUNCTION (svst1b, svst1_truncate_impl, (QImode)) ++FUNCTION (svst1b_scatter, svst1_scatter_truncate_impl, (QImode)) ++FUNCTION (svst1h, svst1_truncate_impl, (HImode)) ++FUNCTION (svst1h_scatter, svst1_scatter_truncate_impl, (HImode)) ++FUNCTION (svst1w, svst1_truncate_impl, (SImode)) ++FUNCTION (svst1w_scatter, svst1_scatter_truncate_impl, (SImode)) ++FUNCTION (svst2, svst234_impl, (2)) ++FUNCTION (svst3, svst234_impl, (3)) ++FUNCTION (svst4, svst234_impl, (4)) ++FUNCTION (svstnt1, svstnt1_impl,) ++FUNCTION (svsub, svsub_impl,) ++FUNCTION (svsubr, rtx_code_function_rotated, (MINUS, MINUS, UNSPEC_COND_FSUB)) ++FUNCTION (svsudot, svusdot_impl, (true)) ++FUNCTION (svsudot_lane, svdotprod_lane_impl, (UNSPEC_SUDOT, -1, -1)) ++FUNCTION (svtbl, svtbl_impl,) ++FUNCTION (svtmad, CODE_FOR_MODE0 (aarch64_sve_tmad),) 
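As a usage illustration of the saturating counter intrinsics wired up above (svqincb/svqdecb and friends, handled by svqdec_svqinc_bhwd_impl earlier in this file), here is a hedged sketch, again assuming <arm_sve.h> and an SVE-enabled compiler:

#include <arm_sve.h>

/* svqincb adds imm_factor * svcntb () to a scalar with saturation, so a
   loop counter advanced by whole vectors of bytes cannot wrap around.  */
uint32_t advance_by_one_vector (uint32_t i)
{
  return svqincb_n_u32 (i, 1);  /* saturates at UINT32_MAX instead of wrapping.  */
}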
++FUNCTION (svtrn1, svtrn_impl, (0)) ++FUNCTION (svtrn1q, unspec_based_function, (UNSPEC_TRN1Q, UNSPEC_TRN1Q, ++ UNSPEC_TRN1Q)) ++FUNCTION (svtrn2, svtrn_impl, (1)) ++FUNCTION (svtrn2q, unspec_based_function, (UNSPEC_TRN2Q, UNSPEC_TRN2Q, ++ UNSPEC_TRN2Q)) ++FUNCTION (svtsmul, unspec_based_function, (-1, -1, UNSPEC_FTSMUL)) ++FUNCTION (svtssel, unspec_based_function, (-1, -1, UNSPEC_FTSSEL)) ++FUNCTION (svundef, svundef_impl, (1)) ++FUNCTION (svundef2, svundef_impl, (2)) ++FUNCTION (svundef3, svundef_impl, (3)) ++FUNCTION (svundef4, svundef_impl, (4)) ++FUNCTION (svunpkhi, svunpk_impl, (true)) ++FUNCTION (svunpklo, svunpk_impl, (false)) ++FUNCTION (svusdot, svusdot_impl, (false)) ++FUNCTION (svusdot_lane, svdotprod_lane_impl, (UNSPEC_USDOT, -1, -1)) ++FUNCTION (svusmmla, unspec_based_add_function, (UNSPEC_USMATMUL, -1, -1)) ++FUNCTION (svuzp1, svuzp_impl, (0)) ++FUNCTION (svuzp1q, unspec_based_function, (UNSPEC_UZP1Q, UNSPEC_UZP1Q, ++ UNSPEC_UZP1Q)) ++FUNCTION (svuzp2, svuzp_impl, (1)) ++FUNCTION (svuzp2q, unspec_based_function, (UNSPEC_UZP2Q, UNSPEC_UZP2Q, ++ UNSPEC_UZP2Q)) ++FUNCTION (svwhilele, svwhile_impl, (UNSPEC_WHILELE, UNSPEC_WHILELS, true)) ++FUNCTION (svwhilelt, svwhile_impl, (UNSPEC_WHILELT, UNSPEC_WHILELO, false)) ++FUNCTION (svwrffr, svwrffr_impl,) ++FUNCTION (svzip1, svzip_impl, (0)) ++FUNCTION (svzip1q, unspec_based_function, (UNSPEC_ZIP1Q, UNSPEC_ZIP1Q, ++ UNSPEC_ZIP1Q)) ++FUNCTION (svzip2, svzip_impl, (1)) ++FUNCTION (svzip2q, unspec_based_function, (UNSPEC_ZIP2Q, UNSPEC_ZIP2Q, ++ UNSPEC_ZIP2Q)) ++ ++} /* end namespace aarch64_sve */ +diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.def b/gcc/config/aarch64/aarch64-sve-builtins-base.def +new file mode 100644 +index 000000000..795a5fd90 +--- /dev/null ++++ b/gcc/config/aarch64/aarch64-sve-builtins-base.def +@@ -0,0 +1,355 @@ ++/* ACLE support for AArch64 SVE (__ARM_FEATURE_SVE intrinsics) ++ Copyright (C) 2018-2019 Free Software Foundation, Inc. ++ ++ This file is part of GCC. ++ ++ GCC is free software; you can redistribute it and/or modify it ++ under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 3, or (at your option) ++ any later version. ++ ++ GCC is distributed in the hope that it will be useful, but ++ WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ General Public License for more details. ++ ++ You should have received a copy of the GNU General Public License ++ along with GCC; see the file COPYING3. If not see ++ . 
*/ ++ ++#define REQUIRED_EXTENSIONS 0 ++DEF_SVE_FUNCTION (svabd, binary_opt_n, all_arith, mxz) ++DEF_SVE_FUNCTION (svabs, unary, all_float_and_signed, mxz) ++DEF_SVE_FUNCTION (svacge, compare_opt_n, all_float, implicit) ++DEF_SVE_FUNCTION (svacgt, compare_opt_n, all_float, implicit) ++DEF_SVE_FUNCTION (svacle, compare_opt_n, all_float, implicit) ++DEF_SVE_FUNCTION (svaclt, compare_opt_n, all_float, implicit) ++DEF_SVE_FUNCTION (svadd, binary_opt_n, all_arith, mxz) ++DEF_SVE_FUNCTION (svadda, fold_left, all_float, implicit) ++DEF_SVE_FUNCTION (svaddv, reduction_wide, all_arith, implicit) ++DEF_SVE_FUNCTION (svadrb, adr_offset, none, none) ++DEF_SVE_FUNCTION (svadrd, adr_index, none, none) ++DEF_SVE_FUNCTION (svadrh, adr_index, none, none) ++DEF_SVE_FUNCTION (svadrw, adr_index, none, none) ++DEF_SVE_FUNCTION (svand, binary_opt_n, all_integer, mxz) ++DEF_SVE_FUNCTION (svand, binary_opt_n, b, z) ++DEF_SVE_FUNCTION (svandv, reduction, all_integer, implicit) ++DEF_SVE_FUNCTION (svasr, binary_uint_opt_n, all_signed, mxz) ++DEF_SVE_FUNCTION (svasr_wide, binary_uint64_opt_n, bhs_signed, mxz) ++DEF_SVE_FUNCTION (svasrd, shift_right_imm, all_signed, mxz) ++DEF_SVE_FUNCTION (svbic, binary_opt_n, all_integer, mxz) ++DEF_SVE_FUNCTION (svbic, binary_opt_n, b, z) ++DEF_SVE_FUNCTION (svbrka, unary, b, mz) ++DEF_SVE_FUNCTION (svbrkb, unary, b, mz) ++DEF_SVE_FUNCTION (svbrkn, binary, b, z) ++DEF_SVE_FUNCTION (svbrkpa, binary, b, z) ++DEF_SVE_FUNCTION (svbrkpb, binary, b, z) ++DEF_SVE_FUNCTION (svcadd, binary_rotate, all_float, mxz) ++DEF_SVE_FUNCTION (svclasta, clast, all_data, implicit) ++DEF_SVE_FUNCTION (svclastb, clast, all_data, implicit) ++DEF_SVE_FUNCTION (svcls, unary_to_uint, all_signed, mxz) ++DEF_SVE_FUNCTION (svclz, unary_to_uint, all_integer, mxz) ++DEF_SVE_FUNCTION (svcmla, ternary_rotate, all_float, mxz) ++DEF_SVE_FUNCTION (svcmla_lane, ternary_lane_rotate, hs_float, none) ++DEF_SVE_FUNCTION (svcmpeq, compare_opt_n, all_arith, implicit) ++DEF_SVE_FUNCTION (svcmpeq_wide, compare_wide_opt_n, bhs_signed, implicit) ++DEF_SVE_FUNCTION (svcmpge, compare_opt_n, all_arith, implicit) ++DEF_SVE_FUNCTION (svcmpge_wide, compare_wide_opt_n, bhs_integer, implicit) ++DEF_SVE_FUNCTION (svcmpgt, compare_opt_n, all_arith, implicit) ++DEF_SVE_FUNCTION (svcmpgt_wide, compare_wide_opt_n, bhs_integer, implicit) ++DEF_SVE_FUNCTION (svcmple, compare_opt_n, all_arith, implicit) ++DEF_SVE_FUNCTION (svcmple_wide, compare_wide_opt_n, bhs_integer, implicit) ++DEF_SVE_FUNCTION (svcmplt, compare_opt_n, all_arith, implicit) ++DEF_SVE_FUNCTION (svcmplt_wide, compare_wide_opt_n, bhs_integer, implicit) ++DEF_SVE_FUNCTION (svcmpne, compare_opt_n, all_arith, implicit) ++DEF_SVE_FUNCTION (svcmpne_wide, compare_wide_opt_n, bhs_signed, implicit) ++DEF_SVE_FUNCTION (svcmpuo, compare_opt_n, all_float, implicit) ++DEF_SVE_FUNCTION (svcnot, unary, all_integer, mxz) ++DEF_SVE_FUNCTION (svcnt, unary_to_uint, all_data, mxz) ++DEF_SVE_FUNCTION (svcntb, count_inherent, none, none) ++DEF_SVE_FUNCTION (svcntb_pat, count_pat, none, none) ++DEF_SVE_FUNCTION (svcntd, count_inherent, none, none) ++DEF_SVE_FUNCTION (svcntd_pat, count_pat, none, none) ++DEF_SVE_FUNCTION (svcnth, count_inherent, none, none) ++DEF_SVE_FUNCTION (svcnth_pat, count_pat, none, none) ++DEF_SVE_FUNCTION (svcntp, count_pred, all_pred, implicit) ++DEF_SVE_FUNCTION (svcntw, count_inherent, none, none) ++DEF_SVE_FUNCTION (svcntw_pat, count_pat, none, none) ++DEF_SVE_FUNCTION (svcompact, unary, sd_data, implicit) ++DEF_SVE_FUNCTION (svcreate2, create, all_data, none) 
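To make the table columns concrete: for an entry such as "svadd, binary_opt_n, all_arith, mxz" above, the shape/predication pair expands (per the SVE builtin framework) into _m/_x/_z predicated forms, each optionally taking a scalar second operand (_n). A short sketch of the resulting user-facing calls, assuming <arm_sve.h>:

#include <arm_sve.h>

/* svadd_f32_m: inactive lanes keep the value of the first data input.
   svadd_n_f32_z: scalar second operand; inactive lanes are zeroed.  */
svfloat32_t add_merge (svbool_t pg, svfloat32_t a, svfloat32_t b)
{
  return svadd_f32_m (pg, a, b);
}
svfloat32_t add_one_zeroing (svbool_t pg, svfloat32_t a)
{
  return svadd_n_f32_z (pg, a, 1.0f);
}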
++DEF_SVE_FUNCTION (svcreate3, create, all_data, none) ++DEF_SVE_FUNCTION (svcreate4, create, all_data, none) ++DEF_SVE_FUNCTION (svcvt, unary_convert, cvt, mxz) ++DEF_SVE_FUNCTION (svdiv, binary_opt_n, all_float_and_sd_integer, mxz) ++DEF_SVE_FUNCTION (svdivr, binary_opt_n, all_float_and_sd_integer, mxz) ++DEF_SVE_FUNCTION (svdot, ternary_qq_opt_n, sd_integer, none) ++DEF_SVE_FUNCTION (svdot_lane, ternary_qq_lane, sd_integer, none) ++DEF_SVE_FUNCTION (svdup, unary_n, all_data, mxz_or_none) ++DEF_SVE_FUNCTION (svdup, unary_n, all_pred, none) ++DEF_SVE_FUNCTION (svdup_lane, binary_uint_n, all_data, none) ++DEF_SVE_FUNCTION (svdupq, dupq, all_data, none) ++DEF_SVE_FUNCTION (svdupq, dupq, all_pred, none) ++DEF_SVE_FUNCTION (svdupq_lane, binary_uint64_n, all_data, none) ++DEF_SVE_FUNCTION (sveor, binary_opt_n, all_integer, mxz) ++DEF_SVE_FUNCTION (sveor, binary_opt_n, b, z) ++DEF_SVE_FUNCTION (sveorv, reduction, all_integer, implicit) ++DEF_SVE_FUNCTION (svexpa, unary_uint, all_float, none) ++DEF_SVE_FUNCTION (svext, ext, all_data, none) ++DEF_SVE_FUNCTION (svextb, unary, hsd_integer, mxz) ++DEF_SVE_FUNCTION (svexth, unary, sd_integer, mxz) ++DEF_SVE_FUNCTION (svextw, unary, d_integer, mxz) ++DEF_SVE_FUNCTION (svget2, get, all_data, none) ++DEF_SVE_FUNCTION (svget3, get, all_data, none) ++DEF_SVE_FUNCTION (svget4, get, all_data, none) ++DEF_SVE_FUNCTION (svindex, binary_scalar, all_integer, none) ++DEF_SVE_FUNCTION (svinsr, binary_n, all_data, none) ++DEF_SVE_FUNCTION (svlasta, reduction, all_data, implicit) ++DEF_SVE_FUNCTION (svlastb, reduction, all_data, implicit) ++DEF_SVE_FUNCTION (svld1, load, all_data, implicit) ++DEF_SVE_FUNCTION (svld1_gather, load_gather_sv, sd_data, implicit) ++DEF_SVE_FUNCTION (svld1_gather, load_gather_vs, sd_data, implicit) ++DEF_SVE_FUNCTION (svld1rq, load_replicate, all_data, implicit) ++DEF_SVE_FUNCTION (svld1sb, load_ext, hsd_integer, implicit) ++DEF_SVE_FUNCTION (svld1sb_gather, load_ext_gather_offset, sd_integer, implicit) ++DEF_SVE_FUNCTION (svld1sh, load_ext, sd_integer, implicit) ++DEF_SVE_FUNCTION (svld1sh_gather, load_ext_gather_offset, sd_integer, implicit) ++DEF_SVE_FUNCTION (svld1sh_gather, load_ext_gather_index, sd_integer, implicit) ++DEF_SVE_FUNCTION (svld1sw, load_ext, d_integer, implicit) ++DEF_SVE_FUNCTION (svld1sw_gather, load_ext_gather_offset, d_integer, implicit) ++DEF_SVE_FUNCTION (svld1sw_gather, load_ext_gather_index, d_integer, implicit) ++DEF_SVE_FUNCTION (svld1ub, load_ext, hsd_integer, implicit) ++DEF_SVE_FUNCTION (svld1ub_gather, load_ext_gather_offset, sd_integer, implicit) ++DEF_SVE_FUNCTION (svld1uh, load_ext, sd_integer, implicit) ++DEF_SVE_FUNCTION (svld1uh_gather, load_ext_gather_offset, sd_integer, implicit) ++DEF_SVE_FUNCTION (svld1uh_gather, load_ext_gather_index, sd_integer, implicit) ++DEF_SVE_FUNCTION (svld1uw, load_ext, d_integer, implicit) ++DEF_SVE_FUNCTION (svld1uw_gather, load_ext_gather_offset, d_integer, implicit) ++DEF_SVE_FUNCTION (svld1uw_gather, load_ext_gather_index, d_integer, implicit) ++DEF_SVE_FUNCTION (svldff1, load, all_data, implicit) ++DEF_SVE_FUNCTION (svldff1_gather, load_gather_sv, sd_data, implicit) ++DEF_SVE_FUNCTION (svldff1_gather, load_gather_vs, sd_data, implicit) ++DEF_SVE_FUNCTION (svldff1sb, load_ext, hsd_integer, implicit) ++DEF_SVE_FUNCTION (svldff1sb_gather, load_ext_gather_offset, sd_integer, implicit) ++DEF_SVE_FUNCTION (svldff1sh, load_ext, sd_integer, implicit) ++DEF_SVE_FUNCTION (svldff1sh_gather, load_ext_gather_offset, sd_integer, implicit) ++DEF_SVE_FUNCTION (svldff1sh_gather, 
load_ext_gather_index, sd_integer, implicit) ++DEF_SVE_FUNCTION (svldff1sw, load_ext, d_integer, implicit) ++DEF_SVE_FUNCTION (svldff1sw_gather, load_ext_gather_offset, d_integer, implicit) ++DEF_SVE_FUNCTION (svldff1sw_gather, load_ext_gather_index, d_integer, implicit) ++DEF_SVE_FUNCTION (svldff1ub, load_ext, hsd_integer, implicit) ++DEF_SVE_FUNCTION (svldff1ub_gather, load_ext_gather_offset, sd_integer, implicit) ++DEF_SVE_FUNCTION (svldff1uh, load_ext, sd_integer, implicit) ++DEF_SVE_FUNCTION (svldff1uh_gather, load_ext_gather_offset, sd_integer, implicit) ++DEF_SVE_FUNCTION (svldff1uh_gather, load_ext_gather_index, sd_integer, implicit) ++DEF_SVE_FUNCTION (svldff1uw, load_ext, d_integer, implicit) ++DEF_SVE_FUNCTION (svldff1uw_gather, load_ext_gather_offset, d_integer, implicit) ++DEF_SVE_FUNCTION (svldff1uw_gather, load_ext_gather_index, d_integer, implicit) ++DEF_SVE_FUNCTION (svldnf1, load, all_data, implicit) ++DEF_SVE_FUNCTION (svldnf1sb, load_ext, hsd_integer, implicit) ++DEF_SVE_FUNCTION (svldnf1sh, load_ext, sd_integer, implicit) ++DEF_SVE_FUNCTION (svldnf1sw, load_ext, d_integer, implicit) ++DEF_SVE_FUNCTION (svldnf1ub, load_ext, hsd_integer, implicit) ++DEF_SVE_FUNCTION (svldnf1uh, load_ext, sd_integer, implicit) ++DEF_SVE_FUNCTION (svldnf1uw, load_ext, d_integer, implicit) ++DEF_SVE_FUNCTION (svldnt1, load, all_data, implicit) ++DEF_SVE_FUNCTION (svld2, load, all_data, implicit) ++DEF_SVE_FUNCTION (svld3, load, all_data, implicit) ++DEF_SVE_FUNCTION (svld4, load, all_data, implicit) ++DEF_SVE_FUNCTION (svlen, count_vector, all_data, none) ++DEF_SVE_FUNCTION (svlsl, binary_uint_opt_n, all_integer, mxz) ++DEF_SVE_FUNCTION (svlsl_wide, binary_uint64_opt_n, bhs_integer, mxz) ++DEF_SVE_FUNCTION (svlsr, binary_uint_opt_n, all_unsigned, mxz) ++DEF_SVE_FUNCTION (svlsr_wide, binary_uint64_opt_n, bhs_unsigned, mxz) ++DEF_SVE_FUNCTION (svmad, ternary_opt_n, all_arith, mxz) ++DEF_SVE_FUNCTION (svmax, binary_opt_n, all_arith, mxz) ++DEF_SVE_FUNCTION (svmaxnm, binary_opt_n, all_float, mxz) ++DEF_SVE_FUNCTION (svmaxnmv, reduction, all_float, implicit) ++DEF_SVE_FUNCTION (svmaxv, reduction, all_arith, implicit) ++DEF_SVE_FUNCTION (svmin, binary_opt_n, all_arith, mxz) ++DEF_SVE_FUNCTION (svminnm, binary_opt_n, all_float, mxz) ++DEF_SVE_FUNCTION (svminnmv, reduction, all_float, implicit) ++DEF_SVE_FUNCTION (svminv, reduction, all_arith, implicit) ++DEF_SVE_FUNCTION (svmla, ternary_opt_n, all_arith, mxz) ++DEF_SVE_FUNCTION (svmla_lane, ternary_lane, all_float, none) ++DEF_SVE_FUNCTION (svmls, ternary_opt_n, all_arith, mxz) ++DEF_SVE_FUNCTION (svmls_lane, ternary_lane, all_float, none) ++DEF_SVE_FUNCTION (svmmla, mmla, none, none) ++DEF_SVE_FUNCTION (svmov, unary, b, z) ++DEF_SVE_FUNCTION (svmsb, ternary_opt_n, all_arith, mxz) ++DEF_SVE_FUNCTION (svmul, binary_opt_n, all_arith, mxz) ++DEF_SVE_FUNCTION (svmul_lane, binary_lane, all_float, none) ++DEF_SVE_FUNCTION (svmulh, binary_opt_n, all_integer, mxz) ++DEF_SVE_FUNCTION (svmulx, binary_opt_n, all_float, mxz) ++DEF_SVE_FUNCTION (svnand, binary_opt_n, b, z) ++DEF_SVE_FUNCTION (svneg, unary, all_float_and_signed, mxz) ++DEF_SVE_FUNCTION (svnmad, ternary_opt_n, all_float, mxz) ++DEF_SVE_FUNCTION (svnmla, ternary_opt_n, all_float, mxz) ++DEF_SVE_FUNCTION (svnmls, ternary_opt_n, all_float, mxz) ++DEF_SVE_FUNCTION (svnmsb, ternary_opt_n, all_float, mxz) ++DEF_SVE_FUNCTION (svnor, binary_opt_n, b, z) ++DEF_SVE_FUNCTION (svnot, unary, all_integer, mxz) ++DEF_SVE_FUNCTION (svnot, unary, b, z) ++DEF_SVE_FUNCTION (svorn, binary_opt_n, b, z) 
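The structure-load entries above (svld2/svld3/svld4) pair with the svget/svset/svcreate tuple accessors from this table; a minimal sketch of their intended use, assuming <arm_sve.h> and a buffer of at least two vectors' worth of floats:

#include <arm_sve.h>

/* svld2 loads two interleaved vectors (an LD2 form) into an svfloat32x2_t
   tuple; svget2 extracts one vector of the pair and svaddv reduces it, so
   this sums the even-indexed elements of the loaded region.  */
float sum_even_indexed (const float *ptr)
{
  svbool_t pg = svptrue_b32 ();
  svfloat32x2_t pair = svld2_f32 (pg, ptr);
  return svaddv_f32 (pg, svget2_f32 (pair, 0));
}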
++DEF_SVE_FUNCTION (svorr, binary_opt_n, all_integer, mxz) ++DEF_SVE_FUNCTION (svorr, binary_opt_n, b, z) ++DEF_SVE_FUNCTION (svorv, reduction, all_integer, implicit) ++DEF_SVE_FUNCTION (svpfalse, inherent_b, b, none) ++DEF_SVE_FUNCTION (svpfirst, unary, b, implicit) ++DEF_SVE_FUNCTION (svpnext, unary_pred, all_pred, implicit) ++DEF_SVE_FUNCTION (svprfb, prefetch, none, implicit) ++DEF_SVE_FUNCTION (svprfb_gather, prefetch_gather_offset, none, implicit) ++DEF_SVE_FUNCTION (svprfd, prefetch, none, implicit) ++DEF_SVE_FUNCTION (svprfd_gather, prefetch_gather_index, none, implicit) ++DEF_SVE_FUNCTION (svprfh, prefetch, none, implicit) ++DEF_SVE_FUNCTION (svprfh_gather, prefetch_gather_index, none, implicit) ++DEF_SVE_FUNCTION (svprfw, prefetch, none, implicit) ++DEF_SVE_FUNCTION (svprfw_gather, prefetch_gather_index, none, implicit) ++DEF_SVE_FUNCTION (svptest_any, ptest, none, implicit) ++DEF_SVE_FUNCTION (svptest_first, ptest, none, implicit) ++DEF_SVE_FUNCTION (svptest_last, ptest, none, implicit) ++DEF_SVE_FUNCTION (svptrue, inherent, all_pred, none) ++DEF_SVE_FUNCTION (svptrue_pat, pattern_pred, all_pred, none) ++DEF_SVE_FUNCTION (svqadd, binary_opt_n, all_integer, none) ++DEF_SVE_FUNCTION (svqdecb, inc_dec, sd_integer, none) ++DEF_SVE_FUNCTION (svqdecb_pat, inc_dec_pat, sd_integer, none) ++DEF_SVE_FUNCTION (svqdecd, inc_dec, d_integer, none) ++DEF_SVE_FUNCTION (svqdecd, inc_dec, sd_integer, none) ++DEF_SVE_FUNCTION (svqdecd_pat, inc_dec_pat, d_integer, none) ++DEF_SVE_FUNCTION (svqdecd_pat, inc_dec_pat, sd_integer, none) ++DEF_SVE_FUNCTION (svqdech, inc_dec, h_integer, none) ++DEF_SVE_FUNCTION (svqdech, inc_dec, sd_integer, none) ++DEF_SVE_FUNCTION (svqdech_pat, inc_dec_pat, h_integer, none) ++DEF_SVE_FUNCTION (svqdech_pat, inc_dec_pat, sd_integer, none) ++DEF_SVE_FUNCTION (svqdecp, inc_dec_pred, hsd_integer, none) ++DEF_SVE_FUNCTION (svqdecp, inc_dec_pred_scalar, inc_dec_n, none) ++DEF_SVE_FUNCTION (svqdecw, inc_dec, s_integer, none) ++DEF_SVE_FUNCTION (svqdecw, inc_dec, sd_integer, none) ++DEF_SVE_FUNCTION (svqdecw_pat, inc_dec_pat, s_integer, none) ++DEF_SVE_FUNCTION (svqdecw_pat, inc_dec_pat, sd_integer, none) ++DEF_SVE_FUNCTION (svqincb, inc_dec, sd_integer, none) ++DEF_SVE_FUNCTION (svqincb_pat, inc_dec_pat, sd_integer, none) ++DEF_SVE_FUNCTION (svqincd, inc_dec, d_integer, none) ++DEF_SVE_FUNCTION (svqincd, inc_dec, sd_integer, none) ++DEF_SVE_FUNCTION (svqincd_pat, inc_dec_pat, d_integer, none) ++DEF_SVE_FUNCTION (svqincd_pat, inc_dec_pat, sd_integer, none) ++DEF_SVE_FUNCTION (svqinch, inc_dec, h_integer, none) ++DEF_SVE_FUNCTION (svqinch, inc_dec, sd_integer, none) ++DEF_SVE_FUNCTION (svqinch_pat, inc_dec_pat, h_integer, none) ++DEF_SVE_FUNCTION (svqinch_pat, inc_dec_pat, sd_integer, none) ++DEF_SVE_FUNCTION (svqincp, inc_dec_pred, hsd_integer, none) ++DEF_SVE_FUNCTION (svqincp, inc_dec_pred_scalar, inc_dec_n, none) ++DEF_SVE_FUNCTION (svqincw, inc_dec, s_integer, none) ++DEF_SVE_FUNCTION (svqincw, inc_dec, sd_integer, none) ++DEF_SVE_FUNCTION (svqincw_pat, inc_dec_pat, s_integer, none) ++DEF_SVE_FUNCTION (svqincw_pat, inc_dec_pat, sd_integer, none) ++DEF_SVE_FUNCTION (svqsub, binary_opt_n, all_integer, none) ++DEF_SVE_FUNCTION (svrbit, unary, all_integer, mxz) ++DEF_SVE_FUNCTION (svrdffr, rdffr, none, z_or_none) ++DEF_SVE_FUNCTION (svrecpe, unary, all_float, none) ++DEF_SVE_FUNCTION (svrecps, binary, all_float, none) ++DEF_SVE_FUNCTION (svrecpx, unary, all_float, mxz) ++DEF_SVE_FUNCTION (svreinterpret, unary_convert, reinterpret, none) ++DEF_SVE_FUNCTION (svrev, unary, 
all_data, none) ++DEF_SVE_FUNCTION (svrev, unary_pred, all_pred, none) ++DEF_SVE_FUNCTION (svrevb, unary, hsd_integer, mxz) ++DEF_SVE_FUNCTION (svrevh, unary, sd_integer, mxz) ++DEF_SVE_FUNCTION (svrevw, unary, d_integer, mxz) ++DEF_SVE_FUNCTION (svrinta, unary, all_float, mxz) ++DEF_SVE_FUNCTION (svrinti, unary, all_float, mxz) ++DEF_SVE_FUNCTION (svrintm, unary, all_float, mxz) ++DEF_SVE_FUNCTION (svrintn, unary, all_float, mxz) ++DEF_SVE_FUNCTION (svrintp, unary, all_float, mxz) ++DEF_SVE_FUNCTION (svrintx, unary, all_float, mxz) ++DEF_SVE_FUNCTION (svrintz, unary, all_float, mxz) ++DEF_SVE_FUNCTION (svrsqrte, unary, all_float, none) ++DEF_SVE_FUNCTION (svrsqrts, binary, all_float, none) ++DEF_SVE_FUNCTION (svscale, binary_int_opt_n, all_float, mxz) ++DEF_SVE_FUNCTION (svsel, binary, all_data, implicit) ++DEF_SVE_FUNCTION (svsel, binary, b, implicit) ++DEF_SVE_FUNCTION (svset2, set, all_data, none) ++DEF_SVE_FUNCTION (svset3, set, all_data, none) ++DEF_SVE_FUNCTION (svset4, set, all_data, none) ++DEF_SVE_FUNCTION (svsetffr, setffr, none, none) ++DEF_SVE_FUNCTION (svsplice, binary, all_data, implicit) ++DEF_SVE_FUNCTION (svsqrt, unary, all_float, mxz) ++DEF_SVE_FUNCTION (svst1, store, all_data, implicit) ++DEF_SVE_FUNCTION (svst1_scatter, store_scatter_index, sd_data, implicit) ++DEF_SVE_FUNCTION (svst1_scatter, store_scatter_offset, sd_data, implicit) ++DEF_SVE_FUNCTION (svst1b, store, hsd_integer, implicit) ++DEF_SVE_FUNCTION (svst1b_scatter, store_scatter_offset, sd_integer, implicit) ++DEF_SVE_FUNCTION (svst1h, store, sd_integer, implicit) ++DEF_SVE_FUNCTION (svst1h_scatter, store_scatter_index, sd_integer, implicit) ++DEF_SVE_FUNCTION (svst1h_scatter, store_scatter_offset, sd_integer, implicit) ++DEF_SVE_FUNCTION (svst1w, store, d_integer, implicit) ++DEF_SVE_FUNCTION (svst1w_scatter, store_scatter_index, d_integer, implicit) ++DEF_SVE_FUNCTION (svst1w_scatter, store_scatter_offset, d_integer, implicit) ++DEF_SVE_FUNCTION (svst2, store, all_data, implicit) ++DEF_SVE_FUNCTION (svst3, store, all_data, implicit) ++DEF_SVE_FUNCTION (svst4, store, all_data, implicit) ++DEF_SVE_FUNCTION (svstnt1, store, all_data, implicit) ++DEF_SVE_FUNCTION (svsub, binary_opt_n, all_arith, mxz) ++DEF_SVE_FUNCTION (svsubr, binary_opt_n, all_arith, mxz) ++DEF_SVE_FUNCTION (svtbl, binary_uint, all_data, none) ++DEF_SVE_FUNCTION (svtmad, tmad, all_float, none) ++DEF_SVE_FUNCTION (svtrn1, binary, all_data, none) ++DEF_SVE_FUNCTION (svtrn1, binary_pred, all_pred, none) ++DEF_SVE_FUNCTION (svtrn2, binary, all_data, none) ++DEF_SVE_FUNCTION (svtrn2, binary_pred, all_pred, none) ++DEF_SVE_FUNCTION (svtsmul, binary_uint, all_float, none) ++DEF_SVE_FUNCTION (svtssel, binary_uint, all_float, none) ++DEF_SVE_FUNCTION (svundef, inherent, all_data, none) ++DEF_SVE_FUNCTION (svundef2, inherent, all_data, none) ++DEF_SVE_FUNCTION (svundef3, inherent, all_data, none) ++DEF_SVE_FUNCTION (svundef4, inherent, all_data, none) ++DEF_SVE_FUNCTION (svunpkhi, unary_widen, hsd_integer, none) ++DEF_SVE_FUNCTION (svunpkhi, unary_widen, b, none) ++DEF_SVE_FUNCTION (svunpklo, unary_widen, hsd_integer, none) ++DEF_SVE_FUNCTION (svunpklo, unary_widen, b, none) ++DEF_SVE_FUNCTION (svuzp1, binary, all_data, none) ++DEF_SVE_FUNCTION (svuzp1, binary_pred, all_pred, none) ++DEF_SVE_FUNCTION (svuzp2, binary, all_data, none) ++DEF_SVE_FUNCTION (svuzp2, binary_pred, all_pred, none) ++DEF_SVE_FUNCTION (svwhilele, compare_scalar, while, none) ++DEF_SVE_FUNCTION (svwhilelt, compare_scalar, while, none) ++DEF_SVE_FUNCTION (svwrffr, setffr, none, 
implicit) ++DEF_SVE_FUNCTION (svzip1, binary, all_data, none) ++DEF_SVE_FUNCTION (svzip1, binary_pred, all_pred, none) ++DEF_SVE_FUNCTION (svzip2, binary, all_data, none) ++DEF_SVE_FUNCTION (svzip2, binary_pred, all_pred, none) ++#undef REQUIRED_EXTENSIONS ++ ++#define REQUIRED_EXTENSIONS AARCH64_FL_BF16 ++DEF_SVE_FUNCTION (svbfdot, ternary_bfloat_opt_n, s_float, none) ++DEF_SVE_FUNCTION (svbfdot_lane, ternary_bfloat_lanex2, s_float, none) ++DEF_SVE_FUNCTION (svbfmlalb, ternary_bfloat_opt_n, s_float, none) ++DEF_SVE_FUNCTION (svbfmlalb_lane, ternary_bfloat_lane, s_float, none) ++DEF_SVE_FUNCTION (svbfmlalt, ternary_bfloat_opt_n, s_float, none) ++DEF_SVE_FUNCTION (svbfmlalt_lane, ternary_bfloat_lane, s_float, none) ++DEF_SVE_FUNCTION (svbfmmla, ternary_bfloat, s_float, none) ++DEF_SVE_FUNCTION (svcvt, unary_convert, cvt_bfloat, mxz) ++DEF_SVE_FUNCTION (svcvtnt, unary_convert_narrowt, cvt_bfloat, mx) ++#undef REQUIRED_EXTENSIONS ++ ++#define REQUIRED_EXTENSIONS AARCH64_FL_I8MM ++DEF_SVE_FUNCTION (svmmla, mmla, s_integer, none) ++DEF_SVE_FUNCTION (svusmmla, ternary_uintq_intq, s_signed, none) ++DEF_SVE_FUNCTION (svsudot, ternary_intq_uintq_opt_n, s_signed, none) ++DEF_SVE_FUNCTION (svsudot_lane, ternary_intq_uintq_lane, s_signed, none) ++DEF_SVE_FUNCTION (svusdot, ternary_uintq_intq_opt_n, s_signed, none) ++DEF_SVE_FUNCTION (svusdot_lane, ternary_uintq_intq_lane, s_signed, none) ++#undef REQUIRED_EXTENSIONS ++ ++#define REQUIRED_EXTENSIONS AARCH64_FL_F32MM ++DEF_SVE_FUNCTION (svmmla, mmla, s_float, none) ++#undef REQUIRED_EXTENSIONS ++ ++#define REQUIRED_EXTENSIONS AARCH64_FL_F64MM ++DEF_SVE_FUNCTION (svld1ro, load_replicate, all_data, implicit) ++DEF_SVE_FUNCTION (svmmla, mmla, d_float, none) ++DEF_SVE_FUNCTION (svtrn1q, binary, all_data, none) ++DEF_SVE_FUNCTION (svtrn2q, binary, all_data, none) ++DEF_SVE_FUNCTION (svuzp1q, binary, all_data, none) ++DEF_SVE_FUNCTION (svuzp2q, binary, all_data, none) ++DEF_SVE_FUNCTION (svzip1q, binary, all_data, none) ++DEF_SVE_FUNCTION (svzip2q, binary, all_data, none) ++#undef REQUIRED_EXTENSIONS +diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.h b/gcc/config/aarch64/aarch64-sve-builtins-base.h +new file mode 100644 +index 000000000..2467e729e +--- /dev/null ++++ b/gcc/config/aarch64/aarch64-sve-builtins-base.h +@@ -0,0 +1,304 @@ ++/* ACLE support for AArch64 SVE (__ARM_FEATURE_SVE intrinsics) ++ Copyright (C) 2018-2019 Free Software Foundation, Inc. ++ ++ This file is part of GCC. ++ ++ GCC is free software; you can redistribute it and/or modify it ++ under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 3, or (at your option) ++ any later version. ++ ++ GCC is distributed in the hope that it will be useful, but ++ WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ General Public License for more details. ++ ++ You should have received a copy of the GNU General Public License ++ along with GCC; see the file COPYING3. If not see ++ . 
*/ ++ ++#ifndef GCC_AARCH64_SVE_BUILTINS_BASE_H ++#define GCC_AARCH64_SVE_BUILTINS_BASE_H ++ ++namespace aarch64_sve ++{ ++ namespace functions ++ { ++ extern const function_base *const svabd; ++ extern const function_base *const svabs; ++ extern const function_base *const svacge; ++ extern const function_base *const svacgt; ++ extern const function_base *const svacle; ++ extern const function_base *const svaclt; ++ extern const function_base *const svadd; ++ extern const function_base *const svadda; ++ extern const function_base *const svaddv; ++ extern const function_base *const svadrb; ++ extern const function_base *const svadrd; ++ extern const function_base *const svadrh; ++ extern const function_base *const svadrw; ++ extern const function_base *const svand; ++ extern const function_base *const svandv; ++ extern const function_base *const svasr; ++ extern const function_base *const svasr_wide; ++ extern const function_base *const svasrd; ++ extern const function_base *const svbfdot; ++ extern const function_base *const svbfdot_lane; ++ extern const function_base *const svbfmlalb; ++ extern const function_base *const svbfmlalb_lane; ++ extern const function_base *const svbfmlalt; ++ extern const function_base *const svbfmlalt_lane; ++ extern const function_base *const svbfmmla; ++ extern const function_base *const svbic; ++ extern const function_base *const svbrka; ++ extern const function_base *const svbrkb; ++ extern const function_base *const svbrkn; ++ extern const function_base *const svbrkpa; ++ extern const function_base *const svbrkpb; ++ extern const function_base *const svcadd; ++ extern const function_base *const svclasta; ++ extern const function_base *const svclastb; ++ extern const function_base *const svcls; ++ extern const function_base *const svclz; ++ extern const function_base *const svcmla; ++ extern const function_base *const svcmla_lane; ++ extern const function_base *const svcmpeq; ++ extern const function_base *const svcmpeq_wide; ++ extern const function_base *const svcmpge; ++ extern const function_base *const svcmpge_wide; ++ extern const function_base *const svcmpgt; ++ extern const function_base *const svcmpgt_wide; ++ extern const function_base *const svcmple; ++ extern const function_base *const svcmple_wide; ++ extern const function_base *const svcmplt; ++ extern const function_base *const svcmplt_wide; ++ extern const function_base *const svcmpne; ++ extern const function_base *const svcmpne_wide; ++ extern const function_base *const svcmpuo; ++ extern const function_base *const svcnot; ++ extern const function_base *const svcnt; ++ extern const function_base *const svcntb; ++ extern const function_base *const svcntb_pat; ++ extern const function_base *const svcntd; ++ extern const function_base *const svcntd_pat; ++ extern const function_base *const svcnth; ++ extern const function_base *const svcnth_pat; ++ extern const function_base *const svcntp; ++ extern const function_base *const svcntw; ++ extern const function_base *const svcntw_pat; ++ extern const function_base *const svcompact; ++ extern const function_base *const svcreate2; ++ extern const function_base *const svcreate3; ++ extern const function_base *const svcreate4; ++ extern const function_base *const svcvt; ++ extern const function_base *const svcvtnt; ++ extern const function_base *const svdiv; ++ extern const function_base *const svdivr; ++ extern const function_base *const svdot; ++ extern const function_base *const svdot_lane; ++ extern const function_base *const svdup; ++ extern 
const function_base *const svdup_lane; ++ extern const function_base *const svdupq; ++ extern const function_base *const svdupq_lane; ++ extern const function_base *const sveor; ++ extern const function_base *const sveorv; ++ extern const function_base *const svexpa; ++ extern const function_base *const svext; ++ extern const function_base *const svextb; ++ extern const function_base *const svexth; ++ extern const function_base *const svextw; ++ extern const function_base *const svget2; ++ extern const function_base *const svget3; ++ extern const function_base *const svget4; ++ extern const function_base *const svindex; ++ extern const function_base *const svinsr; ++ extern const function_base *const svlasta; ++ extern const function_base *const svlastb; ++ extern const function_base *const svld1; ++ extern const function_base *const svld1_gather; ++ extern const function_base *const svld1ro; ++ extern const function_base *const svld1rq; ++ extern const function_base *const svld1sb; ++ extern const function_base *const svld1sb_gather; ++ extern const function_base *const svld1sh; ++ extern const function_base *const svld1sh_gather; ++ extern const function_base *const svld1sw; ++ extern const function_base *const svld1sw_gather; ++ extern const function_base *const svld1ub; ++ extern const function_base *const svld1ub_gather; ++ extern const function_base *const svld1uh; ++ extern const function_base *const svld1uh_gather; ++ extern const function_base *const svld1uw; ++ extern const function_base *const svld1uw_gather; ++ extern const function_base *const svld2; ++ extern const function_base *const svld3; ++ extern const function_base *const svld4; ++ extern const function_base *const svldff1; ++ extern const function_base *const svldff1_gather; ++ extern const function_base *const svldff1sb; ++ extern const function_base *const svldff1sb_gather; ++ extern const function_base *const svldff1sh; ++ extern const function_base *const svldff1sh_gather; ++ extern const function_base *const svldff1sw; ++ extern const function_base *const svldff1sw_gather; ++ extern const function_base *const svldff1ub; ++ extern const function_base *const svldff1ub_gather; ++ extern const function_base *const svldff1uh; ++ extern const function_base *const svldff1uh_gather; ++ extern const function_base *const svldff1uw; ++ extern const function_base *const svldff1uw_gather; ++ extern const function_base *const svldnf1; ++ extern const function_base *const svldnf1sb; ++ extern const function_base *const svldnf1sh; ++ extern const function_base *const svldnf1sw; ++ extern const function_base *const svldnf1ub; ++ extern const function_base *const svldnf1uh; ++ extern const function_base *const svldnf1uw; ++ extern const function_base *const svldnt1; ++ extern const function_base *const svlen; ++ extern const function_base *const svlsl; ++ extern const function_base *const svlsl_wide; ++ extern const function_base *const svlsr; ++ extern const function_base *const svlsr_wide; ++ extern const function_base *const svmad; ++ extern const function_base *const svmax; ++ extern const function_base *const svmaxnm; ++ extern const function_base *const svmaxnmv; ++ extern const function_base *const svmaxv; ++ extern const function_base *const svmin; ++ extern const function_base *const svminnm; ++ extern const function_base *const svminnmv; ++ extern const function_base *const svminv; ++ extern const function_base *const svmla; ++ extern const function_base *const svmla_lane; ++ extern const function_base *const svmls; ++ 
extern const function_base *const svmls_lane; ++ extern const function_base *const svmmla; ++ extern const function_base *const svmov; ++ extern const function_base *const svmsb; ++ extern const function_base *const svmul; ++ extern const function_base *const svmul_lane; ++ extern const function_base *const svmulh; ++ extern const function_base *const svmulx; ++ extern const function_base *const svnand; ++ extern const function_base *const svneg; ++ extern const function_base *const svnmad; ++ extern const function_base *const svnmla; ++ extern const function_base *const svnmls; ++ extern const function_base *const svnmsb; ++ extern const function_base *const svnor; ++ extern const function_base *const svnot; ++ extern const function_base *const svorn; ++ extern const function_base *const svorr; ++ extern const function_base *const svorv; ++ extern const function_base *const svpfalse; ++ extern const function_base *const svpfirst; ++ extern const function_base *const svpnext; ++ extern const function_base *const svprfb; ++ extern const function_base *const svprfb_gather; ++ extern const function_base *const svprfd; ++ extern const function_base *const svprfd_gather; ++ extern const function_base *const svprfh; ++ extern const function_base *const svprfh_gather; ++ extern const function_base *const svprfw; ++ extern const function_base *const svprfw_gather; ++ extern const function_base *const svptest_any; ++ extern const function_base *const svptest_first; ++ extern const function_base *const svptest_last; ++ extern const function_base *const svptrue; ++ extern const function_base *const svptrue_pat; ++ extern const function_base *const svqadd; ++ extern const function_base *const svqdecb; ++ extern const function_base *const svqdecb_pat; ++ extern const function_base *const svqdecd; ++ extern const function_base *const svqdecd_pat; ++ extern const function_base *const svqdech; ++ extern const function_base *const svqdech_pat; ++ extern const function_base *const svqdecp; ++ extern const function_base *const svqdecw; ++ extern const function_base *const svqdecw_pat; ++ extern const function_base *const svqincb; ++ extern const function_base *const svqincb_pat; ++ extern const function_base *const svqincd; ++ extern const function_base *const svqincd_pat; ++ extern const function_base *const svqinch; ++ extern const function_base *const svqinch_pat; ++ extern const function_base *const svqincp; ++ extern const function_base *const svqincw; ++ extern const function_base *const svqincw_pat; ++ extern const function_base *const svqsub; ++ extern const function_base *const svrbit; ++ extern const function_base *const svrdffr; ++ extern const function_base *const svrecpe; ++ extern const function_base *const svrecps; ++ extern const function_base *const svrecpx; ++ extern const function_base *const svreinterpret; ++ extern const function_base *const svrev; ++ extern const function_base *const svrevb; ++ extern const function_base *const svrevh; ++ extern const function_base *const svrevw; ++ extern const function_base *const svrinta; ++ extern const function_base *const svrinti; ++ extern const function_base *const svrintm; ++ extern const function_base *const svrintn; ++ extern const function_base *const svrintp; ++ extern const function_base *const svrintx; ++ extern const function_base *const svrintz; ++ extern const function_base *const svrsqrte; ++ extern const function_base *const svrsqrts; ++ extern const function_base *const svscale; ++ extern const function_base *const svsel; ++ extern 
const function_base *const svset2; ++ extern const function_base *const svset3; ++ extern const function_base *const svset4; ++ extern const function_base *const svsetffr; ++ extern const function_base *const svsplice; ++ extern const function_base *const svsqrt; ++ extern const function_base *const svst1; ++ extern const function_base *const svst1_scatter; ++ extern const function_base *const svst1b; ++ extern const function_base *const svst1b_scatter; ++ extern const function_base *const svst1h; ++ extern const function_base *const svst1h_scatter; ++ extern const function_base *const svst1w; ++ extern const function_base *const svst1w_scatter; ++ extern const function_base *const svst2; ++ extern const function_base *const svst3; ++ extern const function_base *const svst4; ++ extern const function_base *const svstnt1; ++ extern const function_base *const svsub; ++ extern const function_base *const svsubr; ++ extern const function_base *const svsudot; ++ extern const function_base *const svsudot_lane; ++ extern const function_base *const svtbl; ++ extern const function_base *const svtmad; ++ extern const function_base *const svtrn1; ++ extern const function_base *const svtrn1q; ++ extern const function_base *const svtrn2; ++ extern const function_base *const svtrn2q; ++ extern const function_base *const svtsmul; ++ extern const function_base *const svtssel; ++ extern const function_base *const svundef; ++ extern const function_base *const svundef2; ++ extern const function_base *const svundef3; ++ extern const function_base *const svundef4; ++ extern const function_base *const svunpkhi; ++ extern const function_base *const svunpklo; ++ extern const function_base *const svusdot; ++ extern const function_base *const svusdot_lane; ++ extern const function_base *const svusmmla; ++ extern const function_base *const svuzp1; ++ extern const function_base *const svuzp1q; ++ extern const function_base *const svuzp2; ++ extern const function_base *const svuzp2q; ++ extern const function_base *const svwhilele; ++ extern const function_base *const svwhilelt; ++ extern const function_base *const svwrffr; ++ extern const function_base *const svzip1; ++ extern const function_base *const svzip1q; ++ extern const function_base *const svzip2; ++ extern const function_base *const svzip2q; ++ } ++} ++ ++#endif +diff --git a/gcc/config/aarch64/aarch64-sve-builtins-functions.h b/gcc/config/aarch64/aarch64-sve-builtins-functions.h +new file mode 100644 +index 000000000..ee1760668 +--- /dev/null ++++ b/gcc/config/aarch64/aarch64-sve-builtins-functions.h +@@ -0,0 +1,630 @@ ++/* ACLE support for AArch64 SVE (function_base classes) ++ Copyright (C) 2018-2019 Free Software Foundation, Inc. ++ ++ This file is part of GCC. ++ ++ GCC is free software; you can redistribute it and/or modify it ++ under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 3, or (at your option) ++ any later version. ++ ++ GCC is distributed in the hope that it will be useful, but ++ WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ General Public License for more details. ++ ++ You should have received a copy of the GNU General Public License ++ along with GCC; see the file COPYING3. If not see ++ . 
*/ ++ ++#ifndef GCC_AARCH64_SVE_BUILTINS_FUNCTIONS_H ++#define GCC_AARCH64_SVE_BUILTINS_FUNCTIONS_H ++ ++namespace aarch64_sve { ++ ++/* Wrap T, which is derived from function_base, and indicate that the ++ function never has side effects. It is only necessary to use this ++ wrapper on functions that might have floating-point suffixes, since ++ otherwise we assume by default that the function has no side effects. */ ++template ++class quiet : public T ++{ ++public: ++ CONSTEXPR quiet () : T () {} ++ ++ /* Unfortunately we can't use parameter packs yet. */ ++ template ++ CONSTEXPR quiet (const T1 &t1) : T (t1) {} ++ ++ template ++ CONSTEXPR quiet (const T1 &t1, const T2 &t2) : T (t1, t2) {} ++ ++ template ++ CONSTEXPR quiet (const T1 &t1, const T2 &t2, const T3 &t3) ++ : T (t1, t2, t3) {} ++ ++ unsigned int ++ call_properties (const function_instance &) const OVERRIDE ++ { ++ return 0; ++ } ++}; ++ ++/* A function_base that sometimes or always operates on tuples of ++ vectors. */ ++class multi_vector_function : public function_base ++{ ++public: ++ CONSTEXPR multi_vector_function (unsigned int vectors_per_tuple) ++ : m_vectors_per_tuple (vectors_per_tuple) {} ++ ++ unsigned int ++ vectors_per_tuple () const OVERRIDE ++ { ++ return m_vectors_per_tuple; ++ } ++ ++ /* The number of vectors in a tuple, or 1 if the function only operates ++ on single vectors. */ ++ unsigned int m_vectors_per_tuple; ++}; ++ ++/* A function_base that loads or stores contiguous memory elements ++ without extending or truncating them. */ ++class full_width_access : public multi_vector_function ++{ ++public: ++ CONSTEXPR full_width_access (unsigned int vectors_per_tuple = 1) ++ : multi_vector_function (vectors_per_tuple) {} ++ ++ tree ++ memory_scalar_type (const function_instance &fi) const OVERRIDE ++ { ++ return fi.scalar_type (0); ++ } ++ ++ machine_mode ++ memory_vector_mode (const function_instance &fi) const OVERRIDE ++ { ++ machine_mode mode = fi.vector_mode (0); ++ if (m_vectors_per_tuple != 1) ++ mode = targetm.array_mode (mode, m_vectors_per_tuple).require (); ++ return mode; ++ } ++}; ++ ++/* A function_base that loads elements from memory and extends them ++ to a wider element. The memory element type is a fixed part of ++ the function base name. */ ++class extending_load : public function_base ++{ ++public: ++ CONSTEXPR extending_load (type_suffix_index memory_type) ++ : m_memory_type (memory_type) {} ++ ++ unsigned int ++ call_properties (const function_instance &) const OVERRIDE ++ { ++ return CP_READ_MEMORY; ++ } ++ ++ tree ++ memory_scalar_type (const function_instance &) const OVERRIDE ++ { ++ return scalar_types[type_suffixes[m_memory_type].vector_type]; ++ } ++ ++ machine_mode ++ memory_vector_mode (const function_instance &fi) const OVERRIDE ++ { ++ machine_mode mem_mode = type_suffixes[m_memory_type].vector_mode; ++ machine_mode reg_mode = fi.vector_mode (0); ++ return aarch64_sve_data_mode (GET_MODE_INNER (mem_mode), ++ GET_MODE_NUNITS (reg_mode)).require (); ++ } ++ ++ /* Return the rtx code associated with the kind of extension that ++ the load performs. */ ++ rtx_code ++ extend_rtx_code () const ++ { ++ return (type_suffixes[m_memory_type].unsigned_p ++ ? ZERO_EXTEND : SIGN_EXTEND); ++ } ++ ++ /* The type of the memory elements. This is part of the function base ++ name rather than a true type suffix. */ ++ type_suffix_index m_memory_type; ++}; ++ ++/* A function_base that truncates vector elements and stores them to memory. 
++ The memory element width is a fixed part of the function base name. */ ++class truncating_store : public function_base ++{ ++public: ++ CONSTEXPR truncating_store (scalar_int_mode to_mode) : m_to_mode (to_mode) {} ++ ++ unsigned int ++ call_properties (const function_instance &) const OVERRIDE ++ { ++ return CP_WRITE_MEMORY; ++ } ++ ++ tree ++ memory_scalar_type (const function_instance &fi) const OVERRIDE ++ { ++ /* In truncating stores, the signedness of the memory element is defined ++ to be the same as the signedness of the vector element. The signedness ++ doesn't make any difference to the behavior of the function. */ ++ type_class_index tclass = fi.type_suffix (0).tclass; ++ unsigned int element_bits = GET_MODE_BITSIZE (m_to_mode); ++ type_suffix_index suffix = find_type_suffix (tclass, element_bits); ++ return scalar_types[type_suffixes[suffix].vector_type]; ++ } ++ ++ machine_mode ++ memory_vector_mode (const function_instance &fi) const OVERRIDE ++ { ++ poly_uint64 nunits = GET_MODE_NUNITS (fi.vector_mode (0)); ++ return aarch64_sve_data_mode (m_to_mode, nunits).require (); ++ } ++ ++ /* The mode of a single memory element. */ ++ scalar_int_mode m_to_mode; ++}; ++ ++/* An incomplete function_base for functions that have an associated rtx code. ++ It simply records information about the mapping for derived classes ++ to use. */ ++class rtx_code_function_base : public function_base ++{ ++public: ++ CONSTEXPR rtx_code_function_base (rtx_code code_for_sint, ++ rtx_code code_for_uint, ++ int unspec_for_fp = -1) ++ : m_code_for_sint (code_for_sint), m_code_for_uint (code_for_uint), ++ m_unspec_for_fp (unspec_for_fp) {} ++ ++ /* The rtx code to use for signed and unsigned integers respectively. ++ Can be UNKNOWN for functions that don't have integer forms. */ ++ rtx_code m_code_for_sint; ++ rtx_code m_code_for_uint; ++ ++ /* The UNSPEC_COND_* to use for floating-point operations. Can be -1 ++ for functions that only operate on integers. */ ++ int m_unspec_for_fp; ++}; ++ ++/* A function_base for functions that have an associated rtx code. ++ It supports all forms of predication except PRED_implicit. */ ++class rtx_code_function : public rtx_code_function_base ++{ ++public: ++ CONSTEXPR rtx_code_function (rtx_code code_for_sint, rtx_code code_for_uint, ++ int unspec_for_fp = -1) ++ : rtx_code_function_base (code_for_sint, code_for_uint, unspec_for_fp) {} ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ return e.map_to_rtx_codes (m_code_for_sint, m_code_for_uint, ++ m_unspec_for_fp); ++ } ++}; ++ ++/* Like rtx_code_function, but for functions that take what is normally ++ the final argument first. One use of this class is to handle binary ++ reversed operations; another is to handle MLA-style operations that ++ are normally expressed in GCC as MAD-style operations. */ ++class rtx_code_function_rotated : public rtx_code_function_base ++{ ++public: ++ CONSTEXPR rtx_code_function_rotated (rtx_code code_for_sint, ++ rtx_code code_for_uint, ++ int unspec_for_fp = -1) ++ : rtx_code_function_base (code_for_sint, code_for_uint, unspec_for_fp) {} ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ /* Rotate the inputs into their normal order, but continue to make _m ++ functions merge with what was originally the first vector argument. */ ++ unsigned int nargs = e.args.length (); ++ e.rotate_inputs_left (e.pred != PRED_none ? 
1 : 0, nargs); ++ return e.map_to_rtx_codes (m_code_for_sint, m_code_for_uint, ++ m_unspec_for_fp, nargs - 1); ++ } ++}; ++ ++/* An incomplete function_base for functions that have an associated ++ unspec code, with separate codes for signed integers, unsigned ++ integers and floating-point values. The class simply records ++ information about the mapping for derived classes to use. */ ++class unspec_based_function_base : public function_base ++{ ++public: ++ CONSTEXPR unspec_based_function_base (int unspec_for_sint, ++ int unspec_for_uint, ++ int unspec_for_fp) ++ : m_unspec_for_sint (unspec_for_sint), ++ m_unspec_for_uint (unspec_for_uint), ++ m_unspec_for_fp (unspec_for_fp) ++ {} ++ ++ /* Return the unspec code to use for INSTANCE, based on type suffix 0. */ ++ int ++ unspec_for (const function_instance &instance) const ++ { ++ return (!instance.type_suffix (0).integer_p ? m_unspec_for_fp ++ : instance.type_suffix (0).unsigned_p ? m_unspec_for_uint ++ : m_unspec_for_sint); ++ } ++ ++ /* The unspec code associated with signed-integer, unsigned-integer ++ and floating-point operations respectively. */ ++ int m_unspec_for_sint; ++ int m_unspec_for_uint; ++ int m_unspec_for_fp; ++}; ++ ++/* A function_base for functions that have an associated unspec code. ++ It supports all forms of predication except PRED_implicit. */ ++class unspec_based_function : public unspec_based_function_base ++{ ++public: ++ CONSTEXPR unspec_based_function (int unspec_for_sint, int unspec_for_uint, ++ int unspec_for_fp) ++ : unspec_based_function_base (unspec_for_sint, unspec_for_uint, ++ unspec_for_fp) ++ {} ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ return e.map_to_unspecs (m_unspec_for_sint, m_unspec_for_uint, ++ m_unspec_for_fp); ++ } ++}; ++ ++/* Like unspec_based_function, but for functions that take what is normally ++ the final argument first. One use of this class is to handle binary ++ reversed operations; another is to handle MLA-style operations that ++ are normally expressed in GCC as MAD-style operations. */ ++class unspec_based_function_rotated : public unspec_based_function_base ++{ ++public: ++ CONSTEXPR unspec_based_function_rotated (int unspec_for_sint, ++ int unspec_for_uint, ++ int unspec_for_fp) ++ : unspec_based_function_base (unspec_for_sint, unspec_for_uint, ++ unspec_for_fp) ++ {} ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ /* Rotate the inputs into their normal order, but continue to make _m ++ functions merge with what was originally the first vector argument. */ ++ unsigned int nargs = e.args.length (); ++ e.rotate_inputs_left (e.pred != PRED_none ? 1 : 0, nargs); ++ return e.map_to_unspecs (m_unspec_for_sint, m_unspec_for_uint, ++ m_unspec_for_fp, nargs - 1); ++ } ++}; ++ ++/* Like unspec_based_function, but map the function directly to ++ CODE (UNSPEC, M) instead of using the generic predication-based ++ expansion. where M is the vector mode associated with type suffix 0. ++ This is useful if the unspec doesn't describe the full operation or ++ if the usual predication rules don't apply for some reason. 
*/ ++template ++class unspec_based_function_exact_insn : public unspec_based_function_base ++{ ++public: ++ CONSTEXPR unspec_based_function_exact_insn (int unspec_for_sint, ++ int unspec_for_uint, ++ int unspec_for_fp) ++ : unspec_based_function_base (unspec_for_sint, unspec_for_uint, ++ unspec_for_fp) ++ {} ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ return e.use_exact_insn (CODE (unspec_for (e), e.vector_mode (0))); ++ } ++}; ++ ++/* A function that performs an unspec and then adds it to another value. */ ++typedef unspec_based_function_exact_insn ++ unspec_based_add_function; ++ ++/* A functon that uses aarch64_pred* patterns regardless of the ++ predication type. */ ++typedef unspec_based_function_exact_insn ++ unspec_based_pred_function; ++ ++/* A function that acts like unspec_based_function_exact_insn ++ when operating on integers, but that expands to an (fma ...)-style ++ aarch64_sve* operation when applied to floats. */ ++template ++class unspec_based_fused_function : public unspec_based_function_base ++{ ++public: ++ CONSTEXPR unspec_based_fused_function (int unspec_for_sint, ++ int unspec_for_uint, ++ int unspec_for_fp) ++ : unspec_based_function_base (unspec_for_sint, unspec_for_uint, ++ unspec_for_fp) ++ {} ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ int unspec = unspec_for (e); ++ insn_code icode; ++ if (e.type_suffix (0).float_p) ++ { ++ /* Put the operands in the normal (fma ...) order, with the accumulator ++ last. This fits naturally since that's also the unprinted operand ++ in the asm output. */ ++ e.rotate_inputs_left (0, e.pred != PRED_none ? 4 : 3); ++ icode = code_for_aarch64_sve (unspec, e.vector_mode (0)); ++ } ++ else ++ icode = INT_CODE (unspec, e.vector_mode (0)); ++ return e.use_exact_insn (icode); ++ } ++}; ++ ++/* Like unspec_based_fused_function, but for _lane functions. */ ++template ++class unspec_based_fused_lane_function : public unspec_based_function_base ++{ ++public: ++ CONSTEXPR unspec_based_fused_lane_function (int unspec_for_sint, ++ int unspec_for_uint, ++ int unspec_for_fp) ++ : unspec_based_function_base (unspec_for_sint, unspec_for_uint, ++ unspec_for_fp) ++ {} ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ int unspec = unspec_for (e); ++ insn_code icode; ++ if (e.type_suffix (0).float_p) ++ { ++ /* Put the operands in the normal (fma ...) order, with the accumulator ++ last. This fits naturally since that's also the unprinted operand ++ in the asm output. */ ++ e.rotate_inputs_left (0, e.pred != PRED_none ? 5 : 4); ++ icode = code_for_aarch64_lane (unspec, e.vector_mode (0)); ++ } ++ else ++ icode = INT_CODE (unspec, e.vector_mode (0)); ++ return e.use_exact_insn (icode); ++ } ++}; ++ ++/* A function_base that uses CODE_FOR_MODE (M) to get the associated ++ instruction code, where M is the vector mode associated with type ++ suffix N. */ ++template ++class code_for_mode_function : public function_base ++{ ++public: ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ return e.use_exact_insn (CODE_FOR_MODE (e.vector_mode (N))); ++ } ++}; ++ ++/* A function that uses code_for_ (M), where M is the vector ++ mode associated with the first type suffix. */ ++#define CODE_FOR_MODE0(PATTERN) code_for_mode_function ++ ++/* Likewise for the second type suffix. */ ++#define CODE_FOR_MODE1(PATTERN) code_for_mode_function ++ ++/* Like CODE_FOR_MODE0, but the function doesn't raise exceptions when ++ operating on floating-point data. 
*/ ++#define QUIET_CODE_FOR_MODE0(PATTERN) \ ++ quiet< code_for_mode_function > ++ ++/* A function_base for functions that always expand to a fixed insn pattern, ++ regardless of what the suffixes are. */ ++class fixed_insn_function : public function_base ++{ ++public: ++ CONSTEXPR fixed_insn_function (insn_code code) : m_code (code) {} ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ return e.use_exact_insn (m_code); ++ } ++ ++ /* The instruction to use. */ ++ insn_code m_code; ++}; ++ ++/* A function_base for functions that permute their arguments. */ ++class permute : public quiet ++{ ++public: ++ /* Fold a unary or binary permute with the permute vector given by ++ BUILDER. */ ++ gimple * ++ fold_permute (const gimple_folder &f, const vec_perm_builder &builder) const ++ { ++ /* Punt for now on _b16 and wider; we'd need more complex evpc logic ++ to rerecognize the result. */ ++ if (f.type_suffix (0).bool_p && f.type_suffix (0).element_bits > 8) ++ return NULL; ++ ++ unsigned int nargs = gimple_call_num_args (f.call); ++ poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (TREE_TYPE (f.lhs)); ++ vec_perm_indices indices (builder, nargs, nelts); ++ tree perm_type = build_vector_type (ssizetype, nelts); ++ return gimple_build_assign (f.lhs, VEC_PERM_EXPR, ++ gimple_call_arg (f.call, 0), ++ gimple_call_arg (f.call, nargs - 1), ++ vec_perm_indices_to_tree (perm_type, indices)); ++ } ++}; ++ ++/* A function_base for functions that permute two vectors using a fixed ++ choice of indices. */ ++class binary_permute : public permute ++{ ++public: ++ CONSTEXPR binary_permute (int unspec) : m_unspec (unspec) {} ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ insn_code icode = code_for_aarch64_sve (m_unspec, e.vector_mode (0)); ++ return e.use_exact_insn (icode); ++ } ++ ++ /* The unspec code associated with the operation. */ ++ int m_unspec; ++}; ++ ++/* A function_base for functions that reduce a vector to a scalar. */ ++class reduction : public function_base ++{ ++public: ++ CONSTEXPR reduction (int unspec) ++ : m_unspec_for_sint (unspec), ++ m_unspec_for_uint (unspec), ++ m_unspec_for_fp (unspec) ++ {} ++ ++ CONSTEXPR reduction (int unspec_for_sint, int unspec_for_uint, ++ int unspec_for_fp) ++ : m_unspec_for_sint (unspec_for_sint), ++ m_unspec_for_uint (unspec_for_uint), ++ m_unspec_for_fp (unspec_for_fp) ++ {} ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ machine_mode mode = e.vector_mode (0); ++ int unspec = (!e.type_suffix (0).integer_p ? m_unspec_for_fp ++ : e.type_suffix (0).unsigned_p ? m_unspec_for_uint ++ : m_unspec_for_sint); ++ /* There's no distinction between SADDV and UADDV for 64-bit elements; ++ the signed versions only exist for narrower elements. */ ++ if (GET_MODE_UNIT_BITSIZE (mode) == 64 && unspec == UNSPEC_SADDV) ++ unspec = UNSPEC_UADDV; ++ return e.use_exact_insn (code_for_aarch64_pred_reduc (unspec, mode)); ++ } ++ ++ /* The unspec code associated with signed-integer, unsigned-integer ++ and floating-point operations respectively. */ ++ int m_unspec_for_sint; ++ int m_unspec_for_uint; ++ int m_unspec_for_fp; ++}; ++ ++/* A function_base for functions that shift narrower-than-64-bit values ++ by 64-bit amounts. 
*/ ++class shift_wide : public function_base ++{ ++public: ++ CONSTEXPR shift_wide (rtx_code code, int wide_unspec) ++ : m_code (code), m_wide_unspec (wide_unspec) {} ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ machine_mode mode = e.vector_mode (0); ++ machine_mode elem_mode = GET_MODE_INNER (mode); ++ ++ /* If the argument is a constant that the normal shifts can handle ++ directly, use them instead. */ ++ rtx shift = unwrap_const_vec_duplicate (e.args.last ()); ++ if (aarch64_simd_shift_imm_p (shift, elem_mode, m_code == ASHIFT)) ++ { ++ e.args.last () = shift; ++ return e.map_to_rtx_codes (m_code, m_code, -1); ++ } ++ ++ if (e.pred == PRED_x) ++ return e.use_unpred_insn (code_for_aarch64_sve (m_wide_unspec, mode)); ++ ++ return e.use_cond_insn (code_for_cond (m_wide_unspec, mode)); ++ } ++ ++ /* The rtx code associated with a "normal" shift. */ ++ rtx_code m_code; ++ ++ /* The unspec code associated with the wide shift. */ ++ int m_wide_unspec; ++}; ++ ++/* A function_base for unary functions that count bits. */ ++class unary_count : public quiet ++{ ++public: ++ CONSTEXPR unary_count (rtx_code code) : m_code (code) {} ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ /* The md patterns treat the operand as an integer. */ ++ machine_mode mode = aarch64_sve_int_mode (e.vector_mode (0)); ++ e.args.last () = gen_lowpart (mode, e.args.last ()); ++ ++ if (e.pred == PRED_x) ++ return e.use_pred_x_insn (code_for_aarch64_pred (m_code, mode)); ++ ++ return e.use_cond_insn (code_for_cond (m_code, mode)); ++ } ++ ++ /* The rtx code associated with the operation. */ ++ rtx_code m_code; ++}; ++ ++/* A function_base for svwhile* functions. */ ++class while_comparison : public function_base ++{ ++public: ++ CONSTEXPR while_comparison (int unspec_for_sint, int unspec_for_uint) ++ : m_unspec_for_sint (unspec_for_sint), ++ m_unspec_for_uint (unspec_for_uint) ++ {} ++ ++ rtx ++ expand (function_expander &e) const OVERRIDE ++ { ++ /* Suffix 0 determines the predicate mode, suffix 1 determines the ++ scalar mode and signedness. */ ++ int unspec = (e.type_suffix (1).unsigned_p ++ ? m_unspec_for_uint ++ : m_unspec_for_sint); ++ machine_mode pred_mode = e.vector_mode (0); ++ scalar_mode reg_mode = GET_MODE_INNER (e.vector_mode (1)); ++ return e.use_exact_insn (code_for_while (unspec, reg_mode, pred_mode)); ++ } ++ ++ /* The unspec codes associated with signed and unsigned operations ++ respectively. */ ++ int m_unspec_for_sint; ++ int m_unspec_for_uint; ++}; ++ ++} ++ ++/* Declare the global function base NAME, creating it from an instance ++ of class CLASS with constructor arguments ARGS. */ ++#define FUNCTION(NAME, CLASS, ARGS) \ ++ namespace { static CONSTEXPR const CLASS NAME##_obj ARGS; } \ ++ namespace functions { const function_base *const NAME = &NAME##_obj; } ++ ++#endif +diff --git a/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc b/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc +new file mode 100644 +index 000000000..c6f6ce170 +--- /dev/null ++++ b/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc +@@ -0,0 +1,3451 @@ ++/* ACLE support for AArch64 SVE (function shapes) ++ Copyright (C) 2018-2019 Free Software Foundation, Inc. ++ ++ This file is part of GCC. ++ ++ GCC is free software; you can redistribute it and/or modify it ++ under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 3, or (at your option) ++ any later version. 
++ ++ GCC is distributed in the hope that it will be useful, but ++ WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ General Public License for more details. ++ ++ You should have received a copy of the GNU General Public License ++ along with GCC; see the file COPYING3. If not see ++ . */ ++ ++#include "config.h" ++#include "system.h" ++#include "coretypes.h" ++#include "tm.h" ++#include "tree.h" ++#include "rtl.h" ++#include "tm_p.h" ++#include "memmodel.h" ++#include "insn-codes.h" ++#include "optabs.h" ++#include "aarch64-sve-builtins.h" ++#include "aarch64-sve-builtins-shapes.h" ++ ++/* In the comments below, _t0 represents the first type suffix and _t1 ++ represents the second. Square brackets enclose characters that are ++ present in only the full name, not the overloaded name. Governing ++ predicate arguments and predicate suffixes are not shown, since they ++ depend on the predication type, which is a separate piece of ++ information from the shape. ++ ++ Non-overloaded functions may have additional suffixes beyond the ++ ones shown, if those suffixes don't affect the types in the type ++ signature. E.g. the predicate form of svtrn1 has a _b suffix, ++ but this does not affect the prototype, which is always ++ "svbool_t(svbool_t, svbool_t)". */ ++ ++namespace aarch64_sve { ++ ++/* Return a representation of "const T *". */ ++static tree ++build_const_pointer (tree t) ++{ ++ return build_pointer_type (build_qualified_type (t, TYPE_QUAL_CONST)); ++} ++ ++/* If INSTANCE has a governing predicate, add it to the list of argument ++ types in ARGUMENT_TYPES. RETURN_TYPE is the type returned by the ++ function. */ ++static void ++apply_predication (const function_instance &instance, tree return_type, ++ vec &argument_types) ++{ ++ if (instance.pred != PRED_none) ++ { ++ argument_types.quick_insert (0, get_svbool_t ()); ++ /* For unary merge operations, the first argument is a vector with ++ the same type as the result. For unary_convert_narrowt it also ++ provides the "bottom" half of active elements, and is present ++ for all types of predication. */ ++ if ((argument_types.length () == 2 && instance.pred == PRED_m) ++ || instance.shape == shapes::unary_convert_narrowt) ++ argument_types.quick_insert (0, return_type); ++ } ++} ++ ++/* Parse and move past an element type in FORMAT and return it as a type ++ suffix. The format is: ++ ++ [01] - the element type in type suffix 0 or 1 of INSTANCE ++ f - a floating-point type with the given number of bits ++ f[01] - a floating-point type with the same width as type suffix 0 or 1 ++ B - bfloat16_t ++ h - a half-sized version of ++ p - a predicate (represented as TYPE_SUFFIX_b) ++ q - a quarter-sized version of ++ s - a signed type with the given number of bits ++ s[01] - a signed type with the same width as type suffix 0 or 1 ++ u - an unsigned type with the given number of bits ++ u[01] - an unsigned type with the same width as type suffix 0 or 1 ++ w - a 64-bit version of if is integral, otherwise ++ ++ where is another element type. */ ++static type_suffix_index ++parse_element_type (const function_instance &instance, const char *&format) ++{ ++ int ch = *format++; ++ ++ if (ch == 'f' || ch == 's' || ch == 'u') ++ { ++ type_class_index tclass = (ch == 'f' ? TYPE_float ++ : ch == 's' ? 
TYPE_signed ++ : TYPE_unsigned); ++ char *end; ++ unsigned int bits = strtol (format, &end, 10); ++ format = end; ++ if (bits == 0 || bits == 1) ++ bits = instance.type_suffix (bits).element_bits; ++ return find_type_suffix (tclass, bits); ++ } ++ ++ if (ch == 'w') ++ { ++ type_suffix_index suffix = parse_element_type (instance, format); ++ if (type_suffixes[suffix].integer_p) ++ return find_type_suffix (type_suffixes[suffix].tclass, 64); ++ return suffix; ++ } ++ ++ if (ch == 'p') ++ return TYPE_SUFFIX_b; ++ ++ if (ch == 'B') ++ return TYPE_SUFFIX_bf16; ++ ++ if (ch == 'q') ++ { ++ type_suffix_index suffix = parse_element_type (instance, format); ++ return find_type_suffix (type_suffixes[suffix].tclass, ++ type_suffixes[suffix].element_bits / 4); ++ } ++ ++ if (ch == 'h') ++ { ++ type_suffix_index suffix = parse_element_type (instance, format); ++ /* Widening and narrowing doesn't change the type for predicates; ++ everything's still an svbool_t. */ ++ if (suffix == TYPE_SUFFIX_b) ++ return suffix; ++ return find_type_suffix (type_suffixes[suffix].tclass, ++ type_suffixes[suffix].element_bits / 2); ++ } ++ ++ if (ch == '0' || ch == '1') ++ return instance.type_suffix_ids[ch - '0']; ++ ++ gcc_unreachable (); ++} ++ ++/* Read and return a type from FORMAT for function INSTANCE. Advance ++ FORMAT beyond the type string. The format is: ++ ++ _ - void ++ al - array pointer for loads ++ ap - array pointer for prefetches ++ as - array pointer for stores ++ b - base vector type (from a _base suffix) ++ d - displacement vector type (from a _index or _offset suffix) ++ e - an enum with the given name ++ s - a scalar type with the given element suffix ++ t - a vector or tuple type with given element suffix [*1] ++ v - a vector with the given element suffix ++ ++ where has the format described above parse_element_type ++ ++ [*1] the vectors_per_tuple function indicates whether the type should ++ be a tuple, and if so, how many vectors it should contain. */ ++static tree ++parse_type (const function_instance &instance, const char *&format) ++{ ++ int ch = *format++; ++ ++ if (ch == '_') ++ return void_type_node; ++ ++ if (ch == 'a') ++ { ++ ch = *format++; ++ if (ch == 'l') ++ return build_const_pointer (instance.memory_scalar_type ()); ++ if (ch == 'p') ++ return const_ptr_type_node; ++ if (ch == 's') ++ return build_pointer_type (instance.memory_scalar_type ()); ++ gcc_unreachable (); ++ } ++ ++ if (ch == 'b') ++ return instance.base_vector_type (); ++ ++ if (ch == 'd') ++ return instance.displacement_vector_type (); ++ ++ if (ch == 'e') ++ { ++ if (strncmp (format, "pattern", 7) == 0) ++ { ++ format += 7; ++ return acle_svpattern; ++ } ++ if (strncmp (format, "prfop", 5) == 0) ++ { ++ format += 5; ++ return acle_svprfop; ++ } ++ gcc_unreachable (); ++ } ++ ++ if (ch == 's') ++ { ++ type_suffix_index suffix = parse_element_type (instance, format); ++ return scalar_types[type_suffixes[suffix].vector_type]; ++ } ++ ++ if (ch == 't') ++ { ++ type_suffix_index suffix = parse_element_type (instance, format); ++ vector_type_index vector_type = type_suffixes[suffix].vector_type; ++ unsigned int num_vectors = instance.vectors_per_tuple (); ++ return acle_vector_types[num_vectors - 1][vector_type]; ++ } ++ ++ if (ch == 'v') ++ { ++ type_suffix_index suffix = parse_element_type (instance, format); ++ return acle_vector_types[0][type_suffixes[suffix].vector_type]; ++ } ++ ++ gcc_unreachable (); ++} ++ ++/* Read and move past any argument count at FORMAT for the function ++ signature of INSTANCE. 
The counts are: ++ ++ *q: one argument per element in a 128-bit quadword (as for svdupq) ++ *t: one argument per vector in a tuple (as for svcreate) ++ ++ Otherwise the count is 1. */ ++static unsigned int ++parse_count (const function_instance &instance, const char *&format) ++{ ++ if (format[0] == '*' && format[1] == 'q') ++ { ++ format += 2; ++ return instance.elements_per_vq (0); ++ } ++ if (format[0] == '*' && format[1] == 't') ++ { ++ format += 2; ++ return instance.vectors_per_tuple (); ++ } ++ return 1; ++} ++ ++/* Read a type signature for INSTANCE from FORMAT. Add the argument types ++ to ARGUMENT_TYPES and return the return type. ++ ++ The format is a comma-separated list of types (as for parse_type), ++ with the first type being the return type and the rest being the ++ argument types. Each argument type can be followed by an optional ++ count (as for parse_count). */ ++static tree ++parse_signature (const function_instance &instance, const char *format, ++ vec &argument_types) ++{ ++ tree return_type = parse_type (instance, format); ++ while (format[0] == ',') ++ { ++ format += 1; ++ tree argument_type = parse_type (instance, format); ++ unsigned int count = parse_count (instance, format); ++ for (unsigned int i = 0; i < count; ++i) ++ argument_types.quick_push (argument_type); ++ } ++ gcc_assert (format[0] == 0); ++ return return_type; ++} ++ ++/* Add one function instance for GROUP, using mode suffix MODE_SUFFIX_ID, ++ the type suffixes at index TI and the predication suffix at index PI. ++ The other arguments are as for build_all. */ ++static void ++build_one (function_builder &b, const char *signature, ++ const function_group_info &group, mode_suffix_index mode_suffix_id, ++ unsigned int ti, unsigned int pi, bool force_direct_overloads) ++{ ++ /* Byte forms of svdupq take 16 arguments. */ ++ auto_vec argument_types; ++ function_instance instance (group.base_name, *group.base, *group.shape, ++ mode_suffix_id, group.types[ti], ++ group.preds[pi]); ++ tree return_type = parse_signature (instance, signature, argument_types); ++ apply_predication (instance, return_type, argument_types); ++ b.add_unique_function (instance, return_type, argument_types, ++ group.required_extensions, force_direct_overloads); ++} ++ ++/* GROUP describes some sort of gather or scatter operation. There are ++ two cases: ++ ++ - If the function has any type suffixes (as for loads and stores), the ++ first function type suffix specifies either a 32-bit or a 64-bit type, ++ which in turn selects either MODE32 or MODE64 as the addressing mode. ++ Add a function instance for every type and predicate combination ++ in GROUP for which the associated addressing mode is not MODE_none. ++ ++ - If the function has no type suffixes (as for prefetches), add one ++ MODE32 form and one MODE64 form for each predication type. ++ ++ The other arguments are as for build_all. 
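
   As an illustrative sketch (assumed from the conventions described above
   parse_type/parse_signature, not spelled out in this hunk): a signature
   string such as "t0,al" for svld1 with type suffix _s32 and implicit
   predication would expand to roughly the user-facing prototype

     svint32_t svld1_s32 (svbool_t pg, const int32_t *base);

   where "t0" gives the vector (or tuple) type of suffix 0, "al" gives a
   const pointer to the memory scalar type, and apply_predication prepends
   the governing predicate argument.
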
*/ ++static void ++build_32_64 (function_builder &b, const char *signature, ++ const function_group_info &group, mode_suffix_index mode32, ++ mode_suffix_index mode64, bool force_direct_overloads = false) ++{ ++ for (unsigned int pi = 0; group.preds[pi] != NUM_PREDS; ++pi) ++ if (group.types[0][0] == NUM_TYPE_SUFFIXES) ++ { ++ gcc_assert (mode32 != MODE_none && mode64 != MODE_none); ++ build_one (b, signature, group, mode32, 0, pi, ++ force_direct_overloads); ++ build_one (b, signature, group, mode64, 0, pi, ++ force_direct_overloads); ++ } ++ else ++ for (unsigned int ti = 0; group.types[ti][0] != NUM_TYPE_SUFFIXES; ++ti) ++ { ++ unsigned int bits = type_suffixes[group.types[ti][0]].element_bits; ++ gcc_assert (bits == 32 || bits == 64); ++ mode_suffix_index mode = bits == 32 ? mode32 : mode64; ++ if (mode != MODE_none) ++ build_one (b, signature, group, mode, ti, pi, ++ force_direct_overloads); ++ } ++} ++ ++/* For every type and predicate combination in GROUP, add one function ++ that takes a scalar (pointer) base and a signed vector array index, ++ and another that instead takes an unsigned vector array index. ++ The vector array index has the same element size as the first ++ function type suffix. SIGNATURE is as for build_all. */ ++static void ++build_sv_index (function_builder &b, const char *signature, ++ const function_group_info &group) ++{ ++ build_32_64 (b, signature, group, MODE_s32index, MODE_s64index); ++ build_32_64 (b, signature, group, MODE_u32index, MODE_u64index); ++} ++ ++/* Like build_sv_index, but only handle 64-bit types. */ ++static void ++build_sv_index64 (function_builder &b, const char *signature, ++ const function_group_info &group) ++{ ++ build_32_64 (b, signature, group, MODE_none, MODE_s64index); ++ build_32_64 (b, signature, group, MODE_none, MODE_u64index); ++} ++ ++/* Like build_sv_index, but taking vector byte offsets instead of vector ++ array indices. */ ++static void ++build_sv_offset (function_builder &b, const char *signature, ++ const function_group_info &group) ++{ ++ build_32_64 (b, signature, group, MODE_s32offset, MODE_s64offset); ++ build_32_64 (b, signature, group, MODE_u32offset, MODE_u64offset); ++} ++ ++/* Like build_sv_offset, but exclude offsets that must be interpreted ++ as signed (i.e. s32offset). */ ++static void ++build_sv_uint_offset (function_builder &b, const char *signature, ++ const function_group_info &group) ++{ ++ build_32_64 (b, signature, group, MODE_none, MODE_s64offset); ++ build_32_64 (b, signature, group, MODE_u32offset, MODE_u64offset); ++} ++ ++/* For every type and predicate combination in GROUP, add a function ++ that takes a vector base address and no displacement. The vector ++ base has the same element size as the first type suffix. ++ ++ The other arguments are as for build_all. */ ++static void ++build_v_base (function_builder &b, const char *signature, ++ const function_group_info &group, ++ bool force_direct_overloads = false) ++{ ++ build_32_64 (b, signature, group, MODE_u32base, MODE_u64base, ++ force_direct_overloads); ++} ++ ++/* Like build_v_base, but for functions that also take a scalar array ++ index. */ ++static void ++build_vs_index (function_builder &b, const char *signature, ++ const function_group_info &group, ++ bool force_direct_overloads = false) ++{ ++ build_32_64 (b, signature, group, MODE_u32base_index, MODE_u64base_index, ++ force_direct_overloads); ++} ++ ++/* Like build_v_base, but for functions that also take a scalar byte ++ offset. 
*/ ++static void ++build_vs_offset (function_builder &b, const char *signature, ++ const function_group_info &group, ++ bool force_direct_overloads = false) ++{ ++ build_32_64 (b, signature, group, MODE_u32base_offset, MODE_u64base_offset, ++ force_direct_overloads); ++} ++ ++/* Add a function instance for every type and predicate combination ++ in GROUP. Take the function base name from GROUP and the mode suffix ++ from MODE_SUFFIX_ID. Use SIGNATURE to construct the function signature ++ without a governing predicate, then use apply_predication to add in the ++ predicate. FORCE_DIRECT_OVERLOADS is true if there is a one-to-one ++ mapping between "short" and "full" names, and if standard overload ++ resolution therefore isn't necessary. */ ++static void ++build_all (function_builder &b, const char *signature, ++ const function_group_info &group, mode_suffix_index mode_suffix_id, ++ bool force_direct_overloads = false) ++{ ++ for (unsigned int pi = 0; group.preds[pi] != NUM_PREDS; ++pi) ++ for (unsigned int ti = 0; ++ ti == 0 || group.types[ti][0] != NUM_TYPE_SUFFIXES; ++ti) ++ build_one (b, signature, group, mode_suffix_id, ti, pi, ++ force_direct_overloads); ++} ++ ++/* TYPE is the largest type suffix associated with the arguments of R, ++ but the result is twice as wide. Return the associated type suffix ++ if it exists, otherwise report an appropriate error and return ++ NUM_TYPE_SUFFIXES. */ ++static type_suffix_index ++long_type_suffix (function_resolver &r, type_suffix_index type) ++{ ++ unsigned int element_bits = type_suffixes[type].element_bits; ++ if (type_suffixes[type].integer_p && element_bits < 64) ++ return find_type_suffix (type_suffixes[type].tclass, element_bits * 2); ++ ++ r.report_no_such_form (type); ++ return NUM_TYPE_SUFFIXES; ++} ++ ++/* Declare the function shape NAME, pointing it to an instance ++ of class _def. */ ++#define SHAPE(NAME) \ ++ static CONSTEXPR const NAME##_def NAME##_obj; \ ++ namespace shapes { const function_shape *const NAME = &NAME##_obj; } ++ ++/* Base class for functions that are not overloaded. */ ++struct nonoverloaded_base : public function_shape ++{ ++ bool ++ explicit_type_suffix_p (unsigned int) const OVERRIDE ++ { ++ return true; ++ } ++ ++ tree ++ resolve (function_resolver &) const OVERRIDE ++ { ++ gcc_unreachable (); ++ } ++}; ++ ++/* Base class for overloaded functions. Bit N of EXPLICIT_MASK is true ++ if type suffix N appears in the overloaded name. */ ++template ++struct overloaded_base : public function_shape ++{ ++ bool ++ explicit_type_suffix_p (unsigned int i) const OVERRIDE ++ { ++ return (EXPLICIT_MASK >> i) & 1; ++ } ++}; ++ ++/* Base class for adr_index and adr_offset. */ ++struct adr_base : public overloaded_base<0> ++{ ++ /* The function takes two arguments: a vector base and a vector displacement ++ (either an index or an offset). Resolve based on them both. */ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ unsigned int i, nargs; ++ mode_suffix_index mode; ++ if (!r.check_gp_argument (2, i, nargs) ++ || (mode = r.resolve_adr_address (0)) == MODE_none) ++ return error_mark_node; ++ ++ return r.resolve_to (mode); ++ }; ++}; ++ ++/* Base class for narrowing bottom binary functions that take an ++ immediate second operand. The result is half the size of input ++ and has class CLASS. 
*/ ++template ++struct binary_imm_narrowb_base : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_n); ++ STATIC_ASSERT (CLASS == function_resolver::SAME_TYPE_CLASS ++ || CLASS == TYPE_unsigned); ++ if (CLASS == TYPE_unsigned) ++ build_all (b, "vhu0,v0,su64", group, MODE_n); ++ else ++ build_all (b, "vh0,v0,su64", group, MODE_n); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ return r.resolve_uniform (1, 1); ++ } ++}; ++ ++/* The top equivalent of binary_imm_narrowb_base. It takes three arguments, ++ with the first being the values of the even elements, which are typically ++ the result of the narrowb operation. */ ++template ++struct binary_imm_narrowt_base : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_n); ++ STATIC_ASSERT (CLASS == function_resolver::SAME_TYPE_CLASS ++ || CLASS == TYPE_unsigned); ++ if (CLASS == TYPE_unsigned) ++ build_all (b, "vhu0,vhu0,v0,su64", group, MODE_n); ++ else ++ build_all (b, "vh0,vh0,v0,su64", group, MODE_n); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ unsigned int i, nargs; ++ type_suffix_index type; ++ if (!r.check_gp_argument (3, i, nargs) ++ || (type = r.infer_vector_type (i + 1)) == NUM_TYPE_SUFFIXES ++ || !r.require_derived_vector_type (i, i + 1, type, CLASS, r.HALF_SIZE) ++ || !r.require_integer_immediate (i + 2)) ++ return error_mark_node; ++ ++ return r.resolve_to (r.mode_suffix_id, type); ++ } ++}; ++ ++/* Base class for long (i.e. narrow op narrow -> wide) binary functions ++ that take an immediate second operand. The type suffix specifies ++ the wider type. */ ++struct binary_imm_long_base : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_n); ++ build_all (b, "v0,vh0,su64", group, MODE_n); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ unsigned int i, nargs; ++ type_suffix_index type, result_type; ++ if (!r.check_gp_argument (2, i, nargs) ++ || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES ++ || !r.require_integer_immediate (i + 1) ++ || (result_type = long_type_suffix (r, type)) == NUM_TYPE_SUFFIXES) ++ return error_mark_node; ++ ++ if (tree res = r.lookup_form (r.mode_suffix_id, result_type)) ++ return res; ++ ++ return r.report_no_such_form (type); ++ } ++}; ++ ++/* Base class for inc_dec and inc_dec_pat. */ ++struct inc_dec_base : public overloaded_base<0> ++{ ++ CONSTEXPR inc_dec_base (bool pat_p) : m_pat_p (pat_p) {} ++ ++ /* Resolve based on the first argument only, which must be either a ++ scalar or a vector. If it's a scalar, it must be a 32-bit or ++ 64-bit integer. */ ++ tree ++ resolve (function_resolver &r) const ++ { ++ unsigned int i, nargs; ++ if (!r.check_gp_argument (m_pat_p ? 
3 : 2, i, nargs) ++ || !r.require_vector_or_scalar_type (i)) ++ return error_mark_node; ++ ++ mode_suffix_index mode; ++ type_suffix_index type; ++ if (r.scalar_argument_p (i)) ++ { ++ mode = MODE_n; ++ type = r.infer_integer_scalar_type (i); ++ } ++ else ++ { ++ mode = MODE_none; ++ type = r.infer_vector_type (i); ++ } ++ if (type == NUM_TYPE_SUFFIXES) ++ return error_mark_node; ++ ++ for (++i; i < nargs; ++i) ++ if (!r.require_integer_immediate (i)) ++ return error_mark_node; ++ ++ return r.resolve_to (mode, type); ++ } ++ ++ bool ++ check (function_checker &c) const OVERRIDE ++ { ++ return c.require_immediate_range (m_pat_p ? 2 : 1, 1, 16); ++ } ++ ++ bool m_pat_p; ++}; ++ ++/* Base class for load and load_replicate. */ ++struct load_contiguous_base : public overloaded_base<0> ++{ ++ /* Resolve a call based purely on a pointer argument. The other arguments ++ are a governing predicate and (for MODE_vnum) a vnum offset. */ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ bool vnum_p = r.mode_suffix_id == MODE_vnum; ++ gcc_assert (r.mode_suffix_id == MODE_none || vnum_p); ++ ++ unsigned int i, nargs; ++ type_suffix_index type; ++ if (!r.check_gp_argument (vnum_p ? 2 : 1, i, nargs) ++ || (type = r.infer_pointer_type (i)) == NUM_TYPE_SUFFIXES ++ || (vnum_p && !r.require_scalar_type (i + 1, "int64_t"))) ++ return error_mark_node; ++ ++ return r.resolve_to (r.mode_suffix_id, type); ++ } ++}; ++ ++/* Base class for gather loads that take a scalar base and a vector ++ displacement (either an offset or an index). */ ++struct load_gather_sv_base : public overloaded_base<0> ++{ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ unsigned int i, nargs; ++ mode_suffix_index mode; ++ type_suffix_index type; ++ if (!r.check_gp_argument (2, i, nargs) ++ || (type = r.infer_pointer_type (i, true)) == NUM_TYPE_SUFFIXES ++ || (mode = r.resolve_sv_displacement (i + 1, type, true), ++ mode == MODE_none)) ++ return error_mark_node; ++ ++ return r.resolve_to (mode, type); ++ } ++}; ++ ++/* Base class for load_ext_gather_index and load_ext_gather_offset, ++ which differ only in the units of the displacement. */ ++struct load_ext_gather_base : public overloaded_base<1> ++{ ++ /* Resolve a gather load that takes one of: ++ ++ - a scalar pointer base and a vector displacement ++ - a vector base with no displacement or ++ - a vector base and a scalar displacement ++ ++ The function has an explicit type suffix that determines the type ++ of the loaded data. */ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ /* No resolution is needed for a vector base with no displacement; ++ there's a one-to-one mapping between short and long names. */ ++ gcc_assert (r.displacement_units () != UNITS_none); ++ ++ type_suffix_index type = r.type_suffix_ids[0]; ++ ++ unsigned int i, nargs; ++ mode_suffix_index mode; ++ if (!r.check_gp_argument (2, i, nargs) ++ || (mode = r.resolve_gather_address (i, type, true)) == MODE_none) ++ return error_mark_node; ++ ++ return r.resolve_to (mode, type); ++ } ++}; ++ ++/* sv_t svfoo[_t0](sv_t, sv_t, ++ sv_t) (for integer t0) ++ sv_t svmmla[_t0](sv_t, sv_t, sv_t) (for floating-point t0) ++ ++ The functions act like the equivalent of "ternary_qq" for integer elements ++ and normal vector-only ternary functions for floating-point elements. 
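
   As a hedged illustration (taken from the public ACLE rather than this
   hunk), the user-facing calls this shape resolves look roughly like:

     svint32_t   svmmla_s32 (svint32_t acc, svint8_t a, svint8_t b);
     svfloat32_t svmmla_f32 (svfloat32_t acc, svfloat32_t a, svfloat32_t b);

   i.e. the integer forms take quarter-width inputs (the "vq0" operands
   built below), while the floating-point forms are plain same-width
   ternary operations (the "v0" operands).
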
*/ ++struct mmla_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ /* svmmla is distributed over several extensions. Allow the common ++ denominator to define the overloaded svmmla function without ++ defining any specific versions. */ ++ if (group.types[0][0] != NUM_TYPE_SUFFIXES) ++ { ++ if (type_suffixes[group.types[0][0]].float_p) ++ build_all (b, "v0,v0,v0,v0", group, MODE_none); ++ else ++ build_all (b, "v0,v0,vq0,vq0", group, MODE_none); ++ } ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ unsigned int i, nargs; ++ type_suffix_index type; ++ if (!r.check_gp_argument (3, i, nargs) ++ || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES) ++ return error_mark_node; ++ ++ /* Make sure that the function exists now, since not all forms ++ follow a set pattern after this point. */ ++ tree res = r.resolve_to (r.mode_suffix_id, type); ++ if (res == error_mark_node) ++ return res; ++ ++ bool float_p = type_suffixes[type].float_p; ++ unsigned int modifier = float_p ? r.SAME_SIZE : r.QUARTER_SIZE; ++ if (!r.require_derived_vector_type (i + 1, i, type, r.SAME_TYPE_CLASS, ++ modifier) ++ || !r.require_derived_vector_type (i + 2, i, type, r.SAME_TYPE_CLASS, ++ modifier)) ++ return error_mark_node; ++ ++ return res; ++ } ++}; ++SHAPE (mmla) ++ ++/* Base class for prefetch_gather_index and prefetch_gather_offset, ++ which differ only in the units of the displacement. */ ++struct prefetch_gather_base : public overloaded_base<0> ++{ ++ /* Resolve a gather prefetch that takes one of: ++ ++ - a scalar pointer base (const void *) and a vector displacement ++ - a vector base with no displacement or ++ - a vector base and a scalar displacement ++ ++ The prefetch operation is the final argument. This is purely a ++ mode-based resolution; there are no type suffixes. */ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ bool has_displacement_p = r.displacement_units () != UNITS_none; ++ ++ unsigned int i, nargs; ++ mode_suffix_index mode; ++ if (!r.check_gp_argument (has_displacement_p ? 3 : 2, i, nargs) ++ || (mode = r.resolve_gather_address (i, NUM_TYPE_SUFFIXES, ++ false)) == MODE_none ++ || !r.require_integer_immediate (nargs - 1)) ++ return error_mark_node; ++ ++ return r.resolve_to (mode); ++ } ++}; ++ ++/* Wraps BASE to provide a narrowing shift right function. Argument N ++ is an immediate shift amount in the range [1, sizeof(_t) * 4]. */ ++template ++struct shift_right_imm_narrow_wrapper : public BASE ++{ ++ bool ++ check (function_checker &c) const OVERRIDE ++ { ++ unsigned int bits = c.type_suffix (0).element_bits / 2; ++ return c.require_immediate_range (N, 1, bits); ++ } ++}; ++ ++/* Base class for store_scatter_index and store_scatter_offset, ++ which differ only in the units of the displacement. */ ++struct store_scatter_base : public overloaded_base<0> ++{ ++ /* Resolve a scatter store that takes one of: ++ ++ - a scalar pointer base and a vector displacement ++ - a vector base with no displacement or ++ - a vector base and a scalar displacement ++ ++ The stored data is the final argument, and it determines the ++ type suffix. */ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ bool has_displacement_p = r.displacement_units () != UNITS_none; ++ ++ unsigned int i, nargs; ++ mode_suffix_index mode; ++ type_suffix_index type; ++ if (!r.check_gp_argument (has_displacement_p ? 
3 : 2, i, nargs) ++ || (type = r.infer_sd_vector_type (nargs - 1)) == NUM_TYPE_SUFFIXES ++ || (mode = r.resolve_gather_address (i, type, false)) == MODE_none) ++ return error_mark_node; ++ ++ return r.resolve_to (mode, type); ++ } ++}; ++ ++/* Base class for ternary operations in which the final argument is an ++ immediate shift amount. The derived class should check the range. */ ++struct ternary_shift_imm_base : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_n); ++ build_all (b, "v0,v0,v0,su64", group, MODE_n); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ return r.resolve_uniform (2, 1); ++ } ++}; ++ ++/* Base class for ternary operations in which the first argument has the ++ same element type as the result, and in which the second and third ++ arguments have an element type that is derived the first. ++ ++ MODIFIER is the number of element bits in the second and third ++ arguments, or a function_resolver modifier that says how this ++ precision is derived from the first argument's elements. ++ ++ TYPE_CLASS2 and TYPE_CLASS3 are the type classes of the second and ++ third arguments, or function_resolver::SAME_TYPE_CLASS if the type ++ class is the same as the first argument. */ ++template ++struct ternary_resize2_opt_n_base : public overloaded_base<0> ++{ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ unsigned int i, nargs; ++ type_suffix_index type; ++ if (!r.check_gp_argument (3, i, nargs) ++ || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES ++ || !r.require_derived_vector_type (i + 1, i, type, TYPE_CLASS2, ++ MODIFIER)) ++ return error_mark_node; ++ ++ return r.finish_opt_n_resolution (i + 2, i, type, TYPE_CLASS3, MODIFIER); ++ } ++}; ++ ++/* Like ternary_resize2_opt_n_base, but for functions that don't take ++ a final scalar argument. */ ++template ++struct ternary_resize2_base : public overloaded_base<0> ++{ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ unsigned int i, nargs; ++ type_suffix_index type; ++ if (!r.check_gp_argument (3, i, nargs) ++ || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES ++ || !r.require_derived_vector_type (i + 1, i, type, TYPE_CLASS2, ++ MODIFIER) ++ || !r.require_derived_vector_type (i + 2, i, type, TYPE_CLASS3, ++ MODIFIER)) ++ return error_mark_node; ++ ++ return r.resolve_to (r.mode_suffix_id, type); ++ } ++}; ++ ++/* Like ternary_resize2_opt_n_base, but for functions that take a final ++ lane argument. */ ++template ++struct ternary_resize2_lane_base : public overloaded_base<0> ++{ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ unsigned int i, nargs; ++ type_suffix_index type; ++ if (!r.check_gp_argument (4, i, nargs) ++ || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES ++ || !r.require_derived_vector_type (i + 1, i, type, TYPE_CLASS2, ++ MODIFIER) ++ || !r.require_derived_vector_type (i + 2, i, type, TYPE_CLASS3, ++ MODIFIER) ++ || !r.require_integer_immediate (i + 3)) ++ return error_mark_node; ++ ++ return r.resolve_to (r.mode_suffix_id, type); ++ } ++}; ++ ++/* A specialization of ternary_resize2_lane_base for bfloat16 elements, ++ indexed in groups of N elements. 
*/ ++template ++struct ternary_bfloat_lane_base ++ : public ternary_resize2_lane_base<16, TYPE_bfloat, TYPE_bfloat> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "v0,v0,vB,vB,su64", group, MODE_none); ++ } ++ ++ bool ++ check (function_checker &c) const OVERRIDE ++ { ++ return c.require_immediate_lane_index (3, N); ++ } ++}; ++ ++/* A specialization of ternary_resize2_lane_base for quarter-sized ++ elements. */ ++template ++struct ternary_qq_lane_base ++ : public ternary_resize2_lane_base ++{ ++ bool ++ check (function_checker &c) const OVERRIDE ++ { ++ return c.require_immediate_lane_index (3, 4); ++ } ++}; ++ ++/* Base class for narrowing bottom unary functions. The result is half ++ the size of input and has class CLASS. */ ++template ++struct unary_narrowb_base : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ STATIC_ASSERT (CLASS == function_resolver::SAME_TYPE_CLASS ++ || CLASS == TYPE_unsigned); ++ if (CLASS == TYPE_unsigned) ++ build_all (b, "vhu0,v0", group, MODE_none); ++ else ++ build_all (b, "vh0,v0", group, MODE_none); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ return r.resolve_unary (CLASS, r.HALF_SIZE); ++ } ++}; ++ ++/* The top equivalent of unary_imm_narrowb_base. All forms take the values ++ of the even elements as an extra argument, before any governing predicate. ++ These even elements are typically the result of the narrowb operation. */ ++template ++struct unary_narrowt_base : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ STATIC_ASSERT (CLASS == function_resolver::SAME_TYPE_CLASS ++ || CLASS == TYPE_unsigned); ++ if (CLASS == TYPE_unsigned) ++ build_all (b, "vhu0,vhu0,v0", group, MODE_none); ++ else ++ build_all (b, "vh0,vh0,v0", group, MODE_none); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ unsigned int i, nargs; ++ type_suffix_index type; ++ if (!r.check_gp_argument (2, i, nargs) ++ || (type = r.infer_vector_type (i + 1)) == NUM_TYPE_SUFFIXES ++ || !r.require_derived_vector_type (i, i + 1, type, CLASS, r.HALF_SIZE)) ++ return error_mark_node; ++ ++ return r.resolve_to (r.mode_suffix_id, type); ++ } ++}; ++ ++/* sv_t svfoo[_m0base]_[m1]index(sv_t, sv_t) ++ ++ for all valid combinations of vector base type and vector ++ displacement type . */ ++struct adr_index_def : public adr_base ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_index); ++ build_all (b, "b,b,d", group, MODE_u32base_s32index); ++ build_all (b, "b,b,d", group, MODE_u32base_u32index); ++ build_all (b, "b,b,d", group, MODE_u64base_s64index); ++ build_all (b, "b,b,d", group, MODE_u64base_u64index); ++ } ++}; ++SHAPE (adr_index) ++ ++/* sv_t svfoo[_m0base]_[m1]offset(sv_t, sv_t). ++ ++ for all valid combinations of vector base type and vector ++ displacement type . 
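++   For example (assuming the usual ACLE spelling of the address calculation
++   functions): svadrb_offset (bases, offsets) with svuint32_t bases and
++   offsets resolves to svadrb_u32base_u32offset.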
*/ ++struct adr_offset_def : public adr_base ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_offset); ++ build_all (b, "b,b,d", group, MODE_u32base_s32offset); ++ build_all (b, "b,b,d", group, MODE_u32base_u32offset); ++ build_all (b, "b,b,d", group, MODE_u64base_s64offset); ++ build_all (b, "b,b,d", group, MODE_u64base_u64offset); ++ } ++}; ++SHAPE (adr_offset) ++ ++/* sv_t svfoo[_t0](sv_t, sv_t) ++ ++ i.e. a binary operation with uniform types, but with no scalar form. */ ++struct binary_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "v0,v0,v0", group, MODE_none); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ return r.resolve_uniform (2); ++ } ++}; ++SHAPE (binary) ++ ++/* sv_t svfoo[_t0](sv_t, sv_t) ++ sv_t svfoo[_n_t0](sv_t, _t). ++ ++ i.e. a version of the standard binary shape binary_opt_n in which ++ the final argument is always a signed integer. */ ++struct binary_int_opt_n_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "v0,v0,vs0", group, MODE_none); ++ build_all (b, "v0,v0,ss0", group, MODE_n); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ unsigned int i, nargs; ++ type_suffix_index type; ++ if (!r.check_gp_argument (2, i, nargs) ++ || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES) ++ return error_mark_node; ++ ++ return r.finish_opt_n_resolution (i + 1, i, type, TYPE_signed); ++ } ++}; ++SHAPE (binary_int_opt_n) ++ ++/* sv_t svfoo_(sv_t, sv_t, uint64_t) ++ ++ where the final argument is an integer constant expression in the ++ range [0, 16 / sizeof (_t) - 1]. */ ++struct binary_lane_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "v0,v0,v0,su64", group, MODE_none); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ return r.resolve_uniform (2, 1); ++ } ++ ++ bool ++ check (function_checker &c) const OVERRIDE ++ { ++ return c.require_immediate_lane_index (2); ++ } ++}; ++SHAPE (binary_lane) ++ ++/* sv_t svfoo[_t0](sv_t, sv_t, uint64_t). ++ ++ where the final argument is an integer constant expression in the ++ range [0, 32 / sizeof (_t) - 1]. 
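++   A hedged example (assuming the SVE2 ACLE spelling of the widening
++   multiplies): svmullb_lane (a, b, 1) with svint32_t a and b resolves to
++   svmullb_lane_s64, returning the widened svint64_t result.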
*/ ++struct binary_long_lane_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "v0,vh0,vh0,su64", group, MODE_none); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ unsigned int i, nargs; ++ type_suffix_index type, result_type; ++ if (!r.check_gp_argument (3, i, nargs) ++ || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES ++ || !r.require_matching_vector_type (i + 1, type) ++ || !r.require_integer_immediate (i + 2) ++ || (result_type = long_type_suffix (r, type)) == NUM_TYPE_SUFFIXES) ++ return error_mark_node; ++ ++ if (tree res = r.lookup_form (r.mode_suffix_id, result_type)) ++ return res; ++ ++ return r.report_no_such_form (type); ++ } ++ ++ bool ++ check (function_checker &c) const OVERRIDE ++ { ++ return c.require_immediate_lane_index (2); ++ } ++}; ++SHAPE (binary_long_lane) ++ ++/* sv_t svfoo[_t0](sv_t, sv_t) ++ sv_t svfoo[_n_t0](sv_t, _t). */ ++struct binary_long_opt_n_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "v0,vh0,vh0", group, MODE_none); ++ build_all (b, "v0,vh0,sh0", group, MODE_n); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ unsigned int i, nargs; ++ type_suffix_index type, result_type; ++ if (!r.check_gp_argument (2, i, nargs) ++ || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES ++ || (result_type = long_type_suffix (r, type)) == NUM_TYPE_SUFFIXES) ++ return error_mark_node; ++ ++ return r.finish_opt_n_resolution (i + 1, i, type, r.SAME_TYPE_CLASS, ++ r.SAME_SIZE, result_type); ++ } ++}; ++SHAPE (binary_long_opt_n) ++ ++/* sv_t svfoo[_n_t0](sv_t, _t). ++ ++ i.e. a binary operation in which the final argument is always a scalar ++ rather than a vector. */ ++struct binary_n_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_n); ++ build_all (b, "v0,v0,s0", group, MODE_n); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ unsigned int i, nargs; ++ type_suffix_index type; ++ if (!r.check_gp_argument (2, i, nargs) ++ || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES ++ || !r.require_derived_scalar_type (i + 1, r.SAME_TYPE_CLASS)) ++ return error_mark_node; ++ ++ return r.resolve_to (r.mode_suffix_id, type); ++ } ++}; ++SHAPE (binary_n) ++ ++/* sv_t svfoo[_t0](sv_t, sv_t) ++ sv_t svfoo[_n_t0](sv_t, _t) ++ ++ i.e. a version of binary_opt_n in which the output elements are half the ++ width of the input elements. */ ++struct binary_narrowb_opt_n_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "vh0,v0,v0", group, MODE_none); ++ build_all (b, "vh0,v0,s0", group, MODE_n); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ return r.resolve_uniform_opt_n (2); ++ } ++}; ++SHAPE (binary_narrowb_opt_n) ++ ++/* sv_t svfoo[_t0](sv_t, sv_t, sv_t) ++ sv_t svfoo[_n_t0](sv_t, sv_t, _t) ++ ++ This is the "top" counterpart to binary_narrowb_opt_n. 
*/ ++struct binary_narrowt_opt_n_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "vh0,vh0,v0,v0", group, MODE_none); ++ build_all (b, "vh0,vh0,v0,s0", group, MODE_n); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ unsigned int i, nargs; ++ type_suffix_index type; ++ if (!r.check_gp_argument (3, i, nargs) ++ || (type = r.infer_vector_type (i + 1)) == NUM_TYPE_SUFFIXES ++ || !r.require_derived_vector_type (i, i + 1, type, r.SAME_TYPE_CLASS, ++ r.HALF_SIZE)) ++ return error_mark_node; ++ ++ return r.finish_opt_n_resolution (i + 2, i + 1, type); ++ } ++}; ++SHAPE (binary_narrowt_opt_n) ++ ++/* sv_t svfoo[_t0](sv_t, sv_t) ++ sv_t svfoo[_n_t0](sv_t, _t) ++ ++ i.e. the standard shape for binary operations that operate on ++ uniform types. */ ++struct binary_opt_n_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "v0,v0,v0", group, MODE_none); ++ /* _b functions do not have an _n form, but are classified as ++ binary_opt_n so that they can be overloaded with vector ++ functions. */ ++ if (group.types[0][0] == TYPE_SUFFIX_b) ++ gcc_assert (group.types[0][1] == NUM_TYPE_SUFFIXES); ++ else ++ build_all (b, "v0,v0,s0", group, MODE_n); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ return r.resolve_uniform_opt_n (2); ++ } ++}; ++SHAPE (binary_opt_n) ++ ++/* svbool_t svfoo(svbool_t, svbool_t). */ ++struct binary_pred_def : public nonoverloaded_base ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ build_all (b, "v0,v0,v0", group, MODE_none); ++ } ++}; ++SHAPE (binary_pred) ++ ++/* sv_t svfoo[_](sv_t, sv_t, uint64_t) ++ ++ where the final argument must be 90 or 270. */ ++struct binary_rotate_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "v0,v0,v0,su64", group, MODE_none); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ return r.resolve_uniform (2, 1); ++ } ++ ++ bool ++ check (function_checker &c) const OVERRIDE ++ { ++ return c.require_immediate_either_or (2, 90, 270); ++ } ++}; ++SHAPE (binary_rotate) ++ ++/* sv_t svfoo_t0(_t, _t) ++ ++ i.e. a binary function that takes two scalars and returns a vector. ++ An explicit type suffix is required. */ ++struct binary_scalar_def : public nonoverloaded_base ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ build_all (b, "v0,s0,s0", group, MODE_none); ++ } ++}; ++SHAPE (binary_scalar) ++ ++/* sv_t svfoo[_t0](sv_t, sv_t). ++ ++ i.e. a version of "binary" that returns unsigned integers. */ ++struct binary_to_uint_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "vu0,v0,v0", group, MODE_none); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ return r.resolve_uniform (2); ++ } ++}; ++SHAPE (binary_to_uint) ++ ++/* sv_t svfoo[_t0](sv_t, sv_t) ++ ++ i.e. a version of "binary" in which the final argument is always an ++ unsigned integer. 
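++   For example (assuming svtbl keeps its usual ACLE form): svtbl (data,
++   indices) with svfloat32_t data and svuint32_t indices resolves to
++   svtbl_f32.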
*/ ++struct binary_uint_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "v0,v0,vu0", group, MODE_none); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ unsigned int i, nargs; ++ type_suffix_index type; ++ if (!r.check_gp_argument (2, i, nargs) ++ || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES ++ || !r.require_derived_vector_type (i + 1, i, type, TYPE_unsigned)) ++ return error_mark_node; ++ ++ return r.resolve_to (r.mode_suffix_id, type); ++ } ++}; ++SHAPE (binary_uint) ++ ++/* sv_t svfoo[_t0](sv_t, _t) ++ ++ i.e. a version of binary_n in which the final argument is always an ++ unsigned integer. */ ++struct binary_uint_n_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "v0,v0,su0", group, MODE_none); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ unsigned int i, nargs; ++ type_suffix_index type; ++ if (!r.check_gp_argument (2, i, nargs) ++ || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES ++ || !r.require_derived_scalar_type (i + 1, TYPE_unsigned)) ++ return error_mark_node; ++ ++ return r.resolve_to (r.mode_suffix_id, type); ++ } ++}; ++SHAPE (binary_uint_n) ++ ++/* sv_t svfoo[_t0](sv_t, sv_t) ++ sv_t svfoo[_n_t0](sv_t, _t) ++ ++ i.e. a version of the standard binary shape binary_opt_n in which ++ the final argument is always an unsigned integer. */ ++struct binary_uint_opt_n_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "v0,v0,vu0", group, MODE_none); ++ build_all (b, "v0,v0,su0", group, MODE_n); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ unsigned int i, nargs; ++ type_suffix_index type; ++ if (!r.check_gp_argument (2, i, nargs) ++ || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES) ++ return error_mark_node; ++ ++ return r.finish_opt_n_resolution (i + 1, i, type, TYPE_unsigned); ++ } ++}; ++SHAPE (binary_uint_opt_n) ++ ++/* sv_t svfoo[_t0](sv_t, uint64_t). ++ ++ i.e. a version of binary_n in which the final argument is always ++ a 64-bit unsigned integer. */ ++struct binary_uint64_n_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "v0,v0,su64", group, MODE_none); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ unsigned int i, nargs; ++ type_suffix_index type; ++ if (!r.check_gp_argument (2, i, nargs) ++ || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES ++ || !r.require_scalar_type (i + 1, "uint64_t")) ++ return error_mark_node; ++ ++ return r.resolve_to (r.mode_suffix_id, type); ++ } ++}; ++SHAPE (binary_uint64_n) ++ ++/* sv_t svfoo[_t0](sv_t, svuint64_t) ++ sv_t svfoo[_n_t0](sv_t, uint64_t) ++ ++ i.e. a version of the standard binary shape binary_opt_n in which ++ the final argument is always a uint64_t. 
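++   A hedged example (assuming the usual ACLE spelling of the wide shifts):
++   svlsl_wide_x (pg, x, shifts) with svint32_t x and svuint64_t shifts
++   resolves to svlsl_wide_s32_x, while svlsl_wide_x (pg, x, 2) selects the
++   scalar _n form svlsl_wide_n_s32_x.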
*/ ++struct binary_uint64_opt_n_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "v0,v0,vu64", group, MODE_none); ++ build_all (b, "v0,v0,su64", group, MODE_n); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ unsigned int i, nargs; ++ type_suffix_index type; ++ if (!r.check_gp_argument (2, i, nargs) ++ || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES) ++ return error_mark_node; ++ ++ return r.finish_opt_n_resolution (i + 1, i, type, TYPE_unsigned, 64); ++ } ++}; ++SHAPE (binary_uint64_opt_n) ++ ++/* sv_t svfoo[_t0](sv_t, sv_t). */ ++struct binary_wide_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "v0,v0,vh0", group, MODE_none); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ unsigned int i, nargs; ++ type_suffix_index type; ++ if (!r.check_gp_argument (2, i, nargs) ++ || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES ++ || !r.require_derived_vector_type (i + 1, i, type, r.SAME_TYPE_CLASS, ++ r.HALF_SIZE)) ++ return error_mark_node; ++ ++ return r.resolve_to (r.mode_suffix_id, type); ++ } ++}; ++SHAPE (binary_wide) ++ ++/* sv_t svfoo[_t0](sv_t, sv_t) ++ sv_t svfoo[_n_t0](sv_t, _t). */ ++struct binary_wide_opt_n_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "v0,v0,vh0", group, MODE_none); ++ build_all (b, "v0,v0,sh0", group, MODE_n); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ unsigned int i, nargs; ++ type_suffix_index type; ++ if (!r.check_gp_argument (2, i, nargs) ++ || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES) ++ return error_mark_node; ++ ++ return r.finish_opt_n_resolution (i + 1, i, type, r.SAME_TYPE_CLASS, ++ r.HALF_SIZE); ++ } ++}; ++SHAPE (binary_wide_opt_n) ++ ++/* sv_t svfoo[_t0](sv_t, sv_t) ++ _t svfoo[_n_t0](_t, sv_t). */ ++struct clast_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "v0,v0,v0", group, MODE_none); ++ build_all (b, "s0,s0,v0", group, MODE_n); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ unsigned int i, nargs; ++ if (!r.check_gp_argument (2, i, nargs) ++ || !r.require_vector_or_scalar_type (i)) ++ return error_mark_node; ++ ++ if (r.scalar_argument_p (i)) ++ { ++ type_suffix_index type; ++ if (!r.require_derived_scalar_type (i, r.SAME_TYPE_CLASS) ++ || (type = r.infer_vector_type (i + 1)) == NUM_TYPE_SUFFIXES) ++ return error_mark_node; ++ return r.resolve_to (MODE_n, type); ++ } ++ else ++ { ++ type_suffix_index type; ++ if ((type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES ++ || !r.require_matching_vector_type (i + 1, type)) ++ return error_mark_node; ++ return r.resolve_to (MODE_none, type); ++ } ++ } ++}; ++SHAPE (clast) ++ ++/* svbool_t svfoo[_t0](sv_t, sv_t). 
*/ ++struct compare_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "vp,v0,v0", group, MODE_none); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ return r.resolve_uniform (2); ++ } ++}; ++SHAPE (compare) ++ ++/* svbool_t svfoo[_t0](sv_t, sv_t) ++ svbool_t svfoo[_n_t0](sv_t, _t) ++ ++ i.e. a comparison between two vectors, or between a vector and a scalar. */ ++struct compare_opt_n_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "vp,v0,v0", group, MODE_none); ++ build_all (b, "vp,v0,s0", group, MODE_n); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ return r.resolve_uniform_opt_n (2); ++ } ++}; ++SHAPE (compare_opt_n) ++ ++/* svbool_t svfoo[_t0](const _t *, const _t *). */ ++struct compare_ptr_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "vp,al,al", group, MODE_none); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ unsigned int i, nargs; ++ type_suffix_index type; ++ if (!r.check_gp_argument (2, i, nargs) ++ || (type = r.infer_pointer_type (i)) == NUM_TYPE_SUFFIXES ++ || !r.require_matching_pointer_type (i + 1, i, type)) ++ return error_mark_node; ++ ++ return r.resolve_to (r.mode_suffix_id, type); ++ } ++}; ++SHAPE (compare_ptr) ++ ++/* svbool_t svfoo_t0[_t1](_t, _t) ++ ++ where _t0 is a _b suffix that describes the predicate result. ++ There is no direct relationship between the element sizes of _t0 ++ and _t1. */ ++struct compare_scalar_def : public overloaded_base<1> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "vp,s1,s1", group, MODE_none); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ unsigned int i, nargs; ++ type_suffix_index type; ++ if (!r.check_gp_argument (2, i, nargs) ++ || (type = r.infer_integer_scalar_type (i)) == NUM_TYPE_SUFFIXES ++ || !r.require_matching_integer_scalar_type (i + 1, i, type)) ++ return error_mark_node; ++ ++ return r.resolve_to (r.mode_suffix_id, r.type_suffix_ids[0], type); ++ } ++}; ++SHAPE (compare_scalar) ++ ++/* svbool_t svfoo[_t0](sv_t, svint64_t) (for signed t0) ++ svbool_t svfoo[_n_t0](sv_t, int64_t) (for signed t0) ++ svbool_t svfoo[_t0](sv_t, svuint64_t) (for unsigned t0) ++ svbool_t svfoo[_n_t0](sv_t, uint64_t) (for unsigned t0) ++ ++ i.e. a comparison in which the second argument is 64 bits. */ ++struct compare_wide_opt_n_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "vp,v0,vw0", group, MODE_none); ++ build_all (b, "vp,v0,sw0", group, MODE_n); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ unsigned int i, nargs; ++ type_suffix_index type; ++ if (!r.check_gp_argument (2, i, nargs) ++ || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES) ++ return error_mark_node; ++ ++ return r.finish_opt_n_resolution (i + 1, i, type, r.SAME_TYPE_CLASS, 64); ++ } ++}; ++SHAPE (compare_wide_opt_n) ++ ++/* uint64_t svfoo(). 
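++   For example, svcntb () and svcntw () follow this shape: they take no
++   arguments and return the current vector length in bytes or in 32-bit
++   elements as a uint64_t.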
*/ ++struct count_inherent_def : public nonoverloaded_base ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ build_all (b, "su64", group, MODE_none); ++ } ++}; ++SHAPE (count_inherent) ++ ++/* uint64_t svfoo(enum svpattern). */ ++struct count_pat_def : public nonoverloaded_base ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ build_all (b, "su64,epattern", group, MODE_none); ++ } ++}; ++SHAPE (count_pat) ++ ++/* uint64_t svfoo(svbool_t). */ ++struct count_pred_def : public nonoverloaded_base ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ build_all (b, "su64,vp", group, MODE_none); ++ } ++}; ++SHAPE (count_pred) ++ ++/* uint64_t svfoo[_t0](sv_t). */ ++struct count_vector_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "su64,v0", group, MODE_none); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ return r.resolve_uniform (1); ++ } ++}; ++SHAPE (count_vector) ++ ++/* svxN_t svfoo[_t0](sv_t, ..., sv_t) ++ ++ where there are N arguments in total. */ ++struct create_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "t0,v0*t", group, MODE_none); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ return r.resolve_uniform (r.vectors_per_tuple ()); ++ } ++}; ++SHAPE (create) ++ ++/* sv_t svfoo[_n]_t0(_t, ..., _t) ++ ++ where there are enough arguments to fill 128 bits of data (or to ++ control 128 bits of data in the case of predicates). */ ++struct dupq_def : public overloaded_base<1> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ /* The "_n" suffix is optional; the full name has it, but the short ++ name doesn't. */ ++ build_all (b, "v0,s0*q", group, MODE_n, true); ++ } ++ ++ tree ++ resolve (function_resolver &) const OVERRIDE ++ { ++ /* The short forms just make "_n" implicit, so no resolution is needed. */ ++ gcc_unreachable (); ++ } ++}; ++SHAPE (dupq) ++ ++/* sv_t svfoo[_t0](sv_t, sv_t, uint64_t) ++ ++ where the final argument is an integer constant expression that when ++ multiplied by the number of bytes in t0 is in the range [0, 255]. */ ++struct ext_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "v0,v0,v0,su64", group, MODE_none); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ return r.resolve_uniform (2, 1); ++ } ++ ++ bool ++ check (function_checker &c) const OVERRIDE ++ { ++ unsigned int bytes = c.type_suffix (0).element_bytes; ++ return c.require_immediate_range (2, 0, 256 / bytes - 1); ++ } ++}; ++SHAPE (ext) ++ ++/* _t svfoo[_t0](_t, sv_t). 
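++   For example (assuming the usual ACLE spelling of svadda): svadda (pg,
++   initial, x) with a float32_t initial value and svfloat32_t x resolves
++   to svadda_f32 and returns a float32_t.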
*/ ++struct fold_left_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "s0,s0,v0", group, MODE_none); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ unsigned int i, nargs; ++ type_suffix_index type; ++ if (!r.check_gp_argument (2, i, nargs) ++ || !r.require_derived_scalar_type (i, r.SAME_TYPE_CLASS) ++ || (type = r.infer_vector_type (i + 1)) == NUM_TYPE_SUFFIXES) ++ return error_mark_node; ++ ++ return r.resolve_to (r.mode_suffix_id, type); ++ } ++}; ++SHAPE (fold_left) ++ ++/* sv_t svfoo[_t0](svxN_t, uint64_t) ++ ++ where the final argument is an integer constant expression in ++ the range [0, N - 1]. */ ++struct get_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "v0,t0,su64", group, MODE_none); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ unsigned int i, nargs; ++ type_suffix_index type; ++ if (!r.check_gp_argument (2, i, nargs) ++ || (type = r.infer_tuple_type (i)) == NUM_TYPE_SUFFIXES ++ || !r.require_integer_immediate (i + 1)) ++ return error_mark_node; ++ ++ return r.resolve_to (r.mode_suffix_id, type); ++ } ++ ++ bool ++ check (function_checker &c) const OVERRIDE ++ { ++ unsigned int nvectors = c.vectors_per_tuple (); ++ return c.require_immediate_range (1, 0, nvectors - 1); ++ } ++}; ++SHAPE (get) ++ ++/* sv_t svfoo[_t0](sv_t, uint64_t) ++ _t svfoo[_n_t0](_t, uint64_t) ++ ++ where the t0 in the vector form is a signed or unsigned integer ++ whose size is tied to the [bhwd] suffix of "svfoo". */ ++struct inc_dec_def : public inc_dec_base ++{ ++ CONSTEXPR inc_dec_def () : inc_dec_base (false) {} ++ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ /* These functions are unusual in that the type suffixes for ++ the scalar and vector forms are not related. The vector ++ form always has exactly two potential suffixes while the ++ scalar form always has four. */ ++ if (group.types[2][0] == NUM_TYPE_SUFFIXES) ++ build_all (b, "v0,v0,su64", group, MODE_none); ++ else ++ build_all (b, "s0,s0,su64", group, MODE_n); ++ } ++}; ++SHAPE (inc_dec) ++ ++/* sv_t svfoo[_t0](sv_t, enum svpattern, uint64_t) ++ _t svfoo[_n_t0](_t, enum svpattern, uint64_t) ++ ++ where the t0 in the vector form is a signed or unsigned integer ++ whose size is tied to the [bhwd] suffix of "svfoo". */ ++struct inc_dec_pat_def : public inc_dec_base ++{ ++ CONSTEXPR inc_dec_pat_def () : inc_dec_base (true) {} ++ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ /* These functions are unusual in that the type suffixes for ++ the scalar and vector forms are not related. The vector ++ form always has exactly two potential suffixes while the ++ scalar form always has four. */ ++ if (group.types[2][0] == NUM_TYPE_SUFFIXES) ++ build_all (b, "v0,v0,epattern,su64", group, MODE_none); ++ else ++ build_all (b, "s0,s0,epattern,su64", group, MODE_n); ++ } ++}; ++SHAPE (inc_dec_pat) ++ ++/* sv_t svfoo[_t0](sv_t, svbool_t). 
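++   For example (assuming the usual ACLE spelling of the saturating counts):
++   svqincp (x, pg) with svint32_t x resolves to svqincp_s32, incrementing x
++   by the number of active elements in pg.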
*/ ++struct inc_dec_pred_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "v0,v0,vp", group, MODE_none); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ unsigned int i, nargs; ++ type_suffix_index type; ++ if (!r.check_gp_argument (2, i, nargs) ++ || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES ++ || !r.require_vector_type (i + 1, VECTOR_TYPE_svbool_t)) ++ return error_mark_node; ++ ++ return r.resolve_to (r.mode_suffix_id, type); ++ } ++}; ++SHAPE (inc_dec_pred) ++ ++/* _t svfoo[_n_t0]_t1(_t, svbool_t) ++ ++ where _t1 is a _b suffix that describes the svbool_t argument. */ ++struct inc_dec_pred_scalar_def : public overloaded_base<2> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_n); ++ build_all (b, "s0,s0,vp", group, MODE_n); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ unsigned int i, nargs; ++ type_suffix_index type; ++ if (!r.check_gp_argument (2, i, nargs) ++ || (type = r.infer_integer_scalar_type (i)) == NUM_TYPE_SUFFIXES ++ || !r.require_vector_type (i + 1, VECTOR_TYPE_svbool_t)) ++ return error_mark_node; ++ ++ return r.resolve_to (r.mode_suffix_id, type, r.type_suffix_ids[1]); ++ } ++}; ++SHAPE (inc_dec_pred_scalar) ++ ++/* sv[xN]_t svfoo_t0(). */ ++struct inherent_def : public nonoverloaded_base ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ build_all (b, "t0", group, MODE_none); ++ } ++}; ++SHAPE (inherent) ++ ++/* svbool_t svfoo[_b](). */ ++struct inherent_b_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ /* The "_b" suffix is optional; the full name has it, but the short ++ name doesn't. */ ++ build_all (b, "v0", group, MODE_none, true); ++ } ++ ++ tree ++ resolve (function_resolver &) const OVERRIDE ++ { ++ /* The short forms just make "_b" implicit, so no resolution is needed. */ ++ gcc_unreachable (); ++ } ++}; ++SHAPE (inherent_b) ++ ++/* sv[xN]_t svfoo[_t0](const _t *) ++ sv[xN]_t svfoo_vnum[_t0](const _t *, int64_t). */ ++struct load_def : public load_contiguous_base ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ b.add_overloaded_functions (group, MODE_vnum); ++ build_all (b, "t0,al", group, MODE_none); ++ build_all (b, "t0,al,ss64", group, MODE_vnum); ++ } ++}; ++SHAPE (load) ++ ++/* sv_t svfoo_t0(const _t *) ++ sv_t svfoo_vnum_t0(const _t *, int64_t) ++ ++ where is determined by the function base name. */ ++struct load_ext_def : public nonoverloaded_base ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ build_all (b, "t0,al", group, MODE_none); ++ build_all (b, "t0,al,ss64", group, MODE_vnum); ++ } ++}; ++SHAPE (load_ext) ++ ++/* sv_t svfoo_[s32]index_t0(const _t *, svint32_t) ++ sv_t svfoo_[s64]index_t0(const _t *, svint64_t) ++ sv_t svfoo_[u32]index_t0(const _t *, svuint32_t) ++ sv_t svfoo_[u64]index_t0(const _t *, svuint64_t) ++ ++ sv_t svfoo[_u32base]_index_t0(svuint32_t, int64_t) ++ sv_t svfoo[_u64base]_index_t0(svuint64_t, int64_t) ++ ++ where is determined by the function base name. 
*/ ++struct load_ext_gather_index_def : public load_ext_gather_base ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_index); ++ build_sv_index (b, "t0,al,d", group); ++ build_vs_index (b, "t0,b,ss64", group); ++ } ++}; ++SHAPE (load_ext_gather_index) ++ ++/* sv_t svfoo_[s64]index_t0(const _t *, svint64_t) ++ sv_t svfoo_[u64]index_t0(const _t *, svuint64_t) ++ ++ sv_t svfoo[_u32base]_index_t0(svuint32_t, int64_t) ++ sv_t svfoo[_u64base]_index_t0(svuint64_t, int64_t) ++ ++ where is determined by the function base name. This is ++ load_ext_gather_index that doesn't support 32-bit vector indices. */ ++struct load_ext_gather_index_restricted_def : public load_ext_gather_base ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_index); ++ build_sv_index64 (b, "t0,al,d", group); ++ build_vs_index (b, "t0,b,ss64", group); ++ } ++}; ++SHAPE (load_ext_gather_index_restricted) ++ ++/* sv_t svfoo_[s32]offset_t0(const _t *, svint32_t) ++ sv_t svfoo_[s64]offset_t0(const _t *, svint64_t) ++ sv_t svfoo_[u32]offset_t0(const _t *, svuint32_t) ++ sv_t svfoo_[u64]offset_t0(const _t *, svuint64_t) ++ ++ sv_t svfoo[_u32base]_t0(svuint32_t) ++ sv_t svfoo[_u64base]_t0(svuint64_t) ++ ++ sv_t svfoo[_u32base]_offset_t0(svuint32_t, int64_t) ++ sv_t svfoo[_u64base]_offset_t0(svuint64_t, int64_t) ++ ++ where is determined by the function base name. */ ++struct load_ext_gather_offset_def : public load_ext_gather_base ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_offset); ++ build_sv_offset (b, "t0,al,d", group); ++ build_v_base (b, "t0,b", group, true); ++ build_vs_offset (b, "t0,b,ss64", group); ++ } ++}; ++SHAPE (load_ext_gather_offset) ++ ++/* sv_t svfoo_[s64]offset_t0(const _t *, svint64_t) ++ sv_t svfoo_[u32]offset_t0(const _t *, svuint32_t) ++ sv_t svfoo_[u64]offset_t0(const _t *, svuint64_t) ++ ++ sv_t svfoo[_u32base]_t0(svuint32_t) ++ sv_t svfoo[_u64base]_t0(svuint64_t) ++ ++ sv_t svfoo[_u32base]_offset_t0(svuint32_t, int64_t) ++ sv_t svfoo[_u64base]_offset_t0(svuint64_t, int64_t) ++ ++ where is determined by the function base name. This is ++ load_ext_gather_offset without the s32 vector offset form. */ ++struct load_ext_gather_offset_restricted_def : public load_ext_gather_base ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_offset); ++ build_sv_uint_offset (b, "t0,al,d", group); ++ build_v_base (b, "t0,b", group, true); ++ build_vs_offset (b, "t0,b,ss64", group); ++ } ++}; ++SHAPE (load_ext_gather_offset_restricted) ++ ++/* sv_t svfoo_[s32]index[_t0](const _t *, svint32_t) ++ sv_t svfoo_[s64]index[_t0](const _t *, svint64_t) ++ sv_t svfoo_[u32]index[_t0](const _t *, svuint32_t) ++ sv_t svfoo_[u64]index[_t0](const _t *, svuint64_t) ++ ++ sv_t svfoo_[s32]offset[_t0](const _t *, svint32_t) ++ sv_t svfoo_[s64]offset[_t0](const _t *, svint64_t) ++ sv_t svfoo_[u32]offset[_t0](const _t *, svuint32_t) ++ sv_t svfoo_[u64]offset[_t0](const _t *, svuint64_t). 
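++   A hedged example (assuming the usual ACLE gather-load spellings):
++   svld1_gather_index (pg, base, indices) with a const float32_t *base and
++   svint32_t indices resolves to svld1_gather_s32index_f32.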
*/ ++struct load_gather_sv_def : public load_gather_sv_base ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_index); ++ b.add_overloaded_functions (group, MODE_offset); ++ build_sv_index (b, "t0,al,d", group); ++ build_sv_offset (b, "t0,al,d", group); ++ } ++}; ++SHAPE (load_gather_sv) ++ ++/* sv_t svfoo_[u32]index[_t0](const _t *, svuint32_t) ++ sv_t svfoo_[u64]index[_t0](const _t *, svuint64_t) ++ ++ sv_t svfoo_[s64]offset[_t0](const _t *, svint64_t) ++ sv_t svfoo_[u32]offset[_t0](const _t *, svuint32_t) ++ sv_t svfoo_[u64]offset[_t0](const _t *, svuint64_t) ++ ++ This is load_gather_sv without the 32-bit vector index forms and ++ without the s32 vector offset form. */ ++struct load_gather_sv_restricted_def : public load_gather_sv_base ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_index); ++ b.add_overloaded_functions (group, MODE_offset); ++ build_sv_index64 (b, "t0,al,d", group); ++ build_sv_uint_offset (b, "t0,al,d", group); ++ } ++}; ++SHAPE (load_gather_sv_restricted) ++ ++/* sv_t svfoo[_u32base]_t0(svuint32_t) ++ sv_t svfoo[_u64base]_t0(svuint64_t) ++ ++ sv_t svfoo[_u32base]_index_t0(svuint32_t, int64_t) ++ sv_t svfoo[_u64base]_index_t0(svuint64_t, int64_t) ++ ++ sv_t svfoo[_u32base]_offset_t0(svuint32_t, int64_t) ++ sv_t svfoo[_u64base]_offset_t0(svuint64_t, int64_t). */ ++struct load_gather_vs_def : public overloaded_base<1> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ /* The base vector mode is optional; the full name has it but the ++ short name doesn't. There is no ambiguity with SHAPE_load_gather_sv ++ because the latter uses an implicit type suffix. */ ++ build_v_base (b, "t0,b", group, true); ++ build_vs_index (b, "t0,b,ss64", group, true); ++ build_vs_offset (b, "t0,b,ss64", group, true); ++ } ++ ++ tree ++ resolve (function_resolver &) const OVERRIDE ++ { ++ /* The short name just makes the base vector mode implicit; ++ no resolution is needed. */ ++ gcc_unreachable (); ++ } ++}; ++SHAPE (load_gather_vs) ++ ++/* sv_t svfoo[_t0](const _t *) ++ ++ The only difference from "load" is that this shape has no vnum form. */ ++struct load_replicate_def : public load_contiguous_base ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "t0,al", group, MODE_none); ++ } ++}; ++SHAPE (load_replicate) ++ ++/* svbool_t svfoo(enum svpattern). */ ++struct pattern_pred_def : public nonoverloaded_base ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ build_all (b, "vp,epattern", group, MODE_none); ++ } ++}; ++SHAPE (pattern_pred) ++ ++/* void svfoo(const void *, svprfop) ++ void svfoo_vnum(const void *, int64_t, svprfop). 
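++   For example, svprfb (pg, addr, SV_PLDL1KEEP) and
++   svprfb_vnum (pg, addr, vnum, SV_PLDL1KEEP) follow this shape, using the
++   svprfop enumerators defined by the ACLE.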
*/ ++struct prefetch_def : public nonoverloaded_base ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ build_all (b, "_,ap,eprfop", group, MODE_none); ++ build_all (b, "_,ap,ss64,eprfop", group, MODE_vnum); ++ } ++}; ++SHAPE (prefetch) ++ ++/* void svfoo_[s32]index(const void *, svint32_t, svprfop) ++ void svfoo_[s64]index(const void *, svint64_t, svprfop) ++ void svfoo_[u32]index(const void *, svuint32_t, svprfop) ++ void svfoo_[u64]index(const void *, svuint64_t, svprfop) ++ ++ void svfoo[_u32base](svuint32_t, svprfop) ++ void svfoo[_u64base](svuint64_t, svprfop) ++ ++ void svfoo[_u32base]_index(svuint32_t, int64_t, svprfop) ++ void svfoo[_u64base]_index(svuint64_t, int64_t, svprfop). */ ++struct prefetch_gather_index_def : public prefetch_gather_base ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ b.add_overloaded_functions (group, MODE_index); ++ build_sv_index (b, "_,ap,d,eprfop", group); ++ build_v_base (b, "_,b,eprfop", group); ++ build_vs_index (b, "_,b,ss64,eprfop", group); ++ } ++}; ++SHAPE (prefetch_gather_index) ++ ++/* void svfoo_[s32]offset(const void *, svint32_t, svprfop) ++ void svfoo_[s64]offset(const void *, svint64_t, svprfop) ++ void svfoo_[u32]offset(const void *, svuint32_t, svprfop) ++ void svfoo_[u64]offset(const void *, svuint64_t, svprfop) ++ ++ void svfoo[_u32base](svuint32_t, svprfop) ++ void svfoo[_u64base](svuint64_t, svprfop) ++ ++ void svfoo[_u32base]_offset(svuint32_t, int64_t, svprfop) ++ void svfoo[_u64base]_offset(svuint64_t, int64_t, svprfop). */ ++struct prefetch_gather_offset_def : public prefetch_gather_base ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ b.add_overloaded_functions (group, MODE_offset); ++ build_sv_offset (b, "_,ap,d,eprfop", group); ++ build_v_base (b, "_,b,eprfop", group); ++ build_vs_offset (b, "_,b,ss64,eprfop", group); ++ } ++}; ++SHAPE (prefetch_gather_offset) ++ ++/* bool svfoo(svbool_t). */ ++struct ptest_def : public nonoverloaded_base ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ build_all (b, "sp,vp", group, MODE_none); ++ } ++}; ++SHAPE (ptest) ++ ++/* svbool_t svfoo(). */ ++struct rdffr_def : public nonoverloaded_base ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ build_all (b, "vp", group, MODE_none); ++ } ++}; ++SHAPE (rdffr) ++ ++/* _t svfoo[_t0](sv_t). */ ++struct reduction_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "s0,v0", group, MODE_none); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ return r.resolve_uniform (1); ++ } ++}; ++SHAPE (reduction) ++ ++/* int64_t svfoo[_t0](sv_t) (for signed t0) ++ uint64_t svfoo[_t0](sv_t) (for unsigned t0) ++ _t svfoo[_t0](sv_t) (for floating-point t0) ++ ++ i.e. a version of "reduction" in which the return type for integers ++ always has 64 bits. 
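++   For example, svaddv (pg, x) with svint8_t x resolves to svaddv_s8 and
++   returns an int64_t, whereas the floating-point forms such as svaddv_f32
++   keep the element type for the scalar result.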
*/ ++struct reduction_wide_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "sw0,v0", group, MODE_none); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ return r.resolve_uniform (1); ++ } ++}; ++SHAPE (reduction_wide) ++ ++/* svxN_t svfoo[_t0](svxN_t, uint64_t, sv_t) ++ ++ where the second argument is an integer constant expression in the ++ range [0, N - 1]. */ ++struct set_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "t0,t0,su64,v0", group, MODE_none); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ unsigned int i, nargs; ++ type_suffix_index type; ++ if (!r.check_gp_argument (3, i, nargs) ++ || (type = r.infer_tuple_type (i)) == NUM_TYPE_SUFFIXES ++ || !r.require_integer_immediate (i + 1) ++ || !r.require_derived_vector_type (i + 2, i, type)) ++ return error_mark_node; ++ ++ return r.resolve_to (r.mode_suffix_id, type); ++ } ++ ++ bool ++ check (function_checker &c) const OVERRIDE ++ { ++ unsigned int nvectors = c.vectors_per_tuple (); ++ return c.require_immediate_range (1, 0, nvectors - 1); ++ } ++}; ++SHAPE (set) ++ ++/* void svfoo(). */ ++struct setffr_def : public nonoverloaded_base ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ build_all (b, "_", group, MODE_none); ++ } ++}; ++SHAPE (setffr) ++ ++/* sv_t svfoo[_n_t0])(sv_t, uint64_t) ++ ++ where the final argument must be an integer constant expression in the ++ range [0, sizeof (_t) * 8 - 1]. */ ++struct shift_left_imm_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_n); ++ build_all (b, "v0,v0,su64", group, MODE_n); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ return r.resolve_uniform (1, 1); ++ } ++ ++ bool ++ check (function_checker &c) const OVERRIDE ++ { ++ unsigned int bits = c.type_suffix (0).element_bits; ++ return c.require_immediate_range (1, 0, bits - 1); ++ } ++}; ++SHAPE (shift_left_imm) ++ ++/* sv_t svfoo[_n_t0])(sv_t, uint64_t) ++ ++ where the final argument must be an integer constant expression in the ++ range [0, sizeof (_t) * 4 - 1]. */ ++struct shift_left_imm_long_def : public binary_imm_long_base ++{ ++ bool ++ check (function_checker &c) const OVERRIDE ++ { ++ unsigned int bits = c.type_suffix (0).element_bits / 2; ++ return c.require_immediate_range (1, 0, bits - 1); ++ } ++}; ++SHAPE (shift_left_imm_long) ++ ++/* sv_t svfoo[_n_t0])(sv_t, uint64_t) ++ ++ where the final argument must be an integer constant expression in the ++ range [0, sizeof (_t) * 8 - 1]. */ ++struct shift_left_imm_to_uint_def : public shift_left_imm_def ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_n); ++ build_all (b, "vu0,v0,su64", group, MODE_n); ++ } ++}; ++SHAPE (shift_left_imm_to_uint) ++ ++/* sv_t svfoo[_n_t0])(sv_t, uint64_t) ++ ++ where the final argument must be an integer constant expression in the ++ range [1, sizeof (_t) * 8]. 
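++   A hedged example (assuming svasrd keeps its usual ACLE form):
++   svasrd_x (pg, x, 2) with svint32_t x resolves to svasrd_n_s32_x, the
++   immediate being limited to [1, 32] for 32-bit elements.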
*/ ++struct shift_right_imm_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_n); ++ build_all (b, "v0,v0,su64", group, MODE_n); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ return r.resolve_uniform (1, 1); ++ } ++ ++ bool ++ check (function_checker &c) const OVERRIDE ++ { ++ unsigned int bits = c.type_suffix (0).element_bits; ++ return c.require_immediate_range (1, 1, bits); ++ } ++}; ++SHAPE (shift_right_imm) ++ ++/* sv_t svfoo[_n_t0])(sv_t, uint64_t) ++ ++ where the final argument must be an integer constant expression in the ++ range [1, sizeof (_t) * 4]. */ ++typedef shift_right_imm_narrow_wrapper, 1> ++ shift_right_imm_narrowb_def; ++SHAPE (shift_right_imm_narrowb) ++ ++/* sv_t svfoo[_n_t0])(sv_t, sv_t, uint64_t) ++ ++ where the final argument must be an integer constant expression in the ++ range [1, sizeof (_t) * 4]. */ ++typedef shift_right_imm_narrow_wrapper, 2> ++ shift_right_imm_narrowt_def; ++SHAPE (shift_right_imm_narrowt) ++ ++/* sv_t svfoo[_n_t0])(sv_t, uint64_t) ++ ++ where the final argument must be an integer constant expression in the ++ range [1, sizeof (_t) * 4]. */ ++typedef binary_imm_narrowb_base ++ binary_imm_narrowb_base_unsigned; ++typedef shift_right_imm_narrow_wrapper ++ shift_right_imm_narrowb_to_uint_def; ++SHAPE (shift_right_imm_narrowb_to_uint) ++ ++/* sv_t svfoo[_n_t0])(sv_t, sv_t, uint64_t) ++ ++ where the final argument must be an integer constant expression in the ++ range [1, sizeof (_t) * 4]. */ ++typedef binary_imm_narrowt_base ++ binary_imm_narrowt_base_unsigned; ++typedef shift_right_imm_narrow_wrapper ++ shift_right_imm_narrowt_to_uint_def; ++SHAPE (shift_right_imm_narrowt_to_uint) ++ ++/* void svfoo[_t0](_t *, sv[xN]_t) ++ void svfoo_vnum[_t0](_t *, int64_t, sv[xN]_t) ++ ++ where might be tied to (for non-truncating stores) or might ++ depend on the function base name (for truncating stores). */ ++struct store_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ b.add_overloaded_functions (group, MODE_vnum); ++ build_all (b, "_,as,t0", group, MODE_none); ++ build_all (b, "_,as,ss64,t0", group, MODE_vnum); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ bool vnum_p = r.mode_suffix_id == MODE_vnum; ++ gcc_assert (r.mode_suffix_id == MODE_none || vnum_p); ++ ++ unsigned int i, nargs; ++ type_suffix_index type; ++ if (!r.check_gp_argument (vnum_p ? 3 : 2, i, nargs) ++ || !r.require_pointer_type (i) ++ || (vnum_p && !r.require_scalar_type (i + 1, "int64_t")) ++ || ((type = r.infer_tuple_type (nargs - 1)) == NUM_TYPE_SUFFIXES)) ++ return error_mark_node; ++ ++ return r.resolve_to (r.mode_suffix_id, type); ++ } ++}; ++SHAPE (store) ++ ++/* void svfoo_[s32]index[_t0](_t *, svint32_t, sv_t) ++ void svfoo_[s64]index[_t0](_t *, svint64_t, sv_t) ++ void svfoo_[u32]index[_t0](_t *, svuint32_t, sv_t) ++ void svfoo_[u64]index[_t0](_t *, svuint64_t, sv_t) ++ ++ void svfoo[_u32base]_index[_t0](svuint32_t, int64_t, sv_t) ++ void svfoo[_u64base]_index[_t0](svuint64_t, int64_t, sv_t) ++ ++ where might be tied to (for non-truncating stores) or might ++ depend on the function base name (for truncating stores). 
*/ ++struct store_scatter_index_def : public store_scatter_base ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_index); ++ build_sv_index (b, "_,as,d,t0", group); ++ build_vs_index (b, "_,b,ss64,t0", group); ++ } ++}; ++SHAPE (store_scatter_index) ++ ++/* void svfoo_[s64]index[_t0](_t *, svint64_t, sv_t) ++ void svfoo_[u64]index[_t0](_t *, svuint64_t, sv_t) ++ ++ void svfoo[_u32base]_index[_t0](svuint32_t, int64_t, sv_t) ++ void svfoo[_u64base]_index[_t0](svuint64_t, int64_t, sv_t) ++ ++ i.e. a version of store_scatter_index that doesn't support 32-bit ++ vector indices. */ ++struct store_scatter_index_restricted_def : public store_scatter_base ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_index); ++ build_sv_index64 (b, "_,as,d,t0", group); ++ build_vs_index (b, "_,b,ss64,t0", group); ++ } ++}; ++SHAPE (store_scatter_index_restricted) ++ ++/* void svfoo_[s32]offset[_t0](_t *, svint32_t, sv_t) ++ void svfoo_[s64]offset[_t0](_t *, svint64_t, sv_t) ++ void svfoo_[u32]offset[_t0](_t *, svuint32_t, sv_t) ++ void svfoo_[u64]offset[_t0](_t *, svuint64_t, sv_t) ++ ++ void svfoo[_u32base_t0](svuint32_t, sv_t) ++ void svfoo[_u64base_t0](svuint64_t, sv_t) ++ ++ void svfoo[_u32base]_offset[_t0](svuint32_t, int64_t, sv_t) ++ void svfoo[_u64base]_offset[_t0](svuint64_t, int64_t, sv_t) ++ ++ where might be tied to (for non-truncating stores) or might ++ depend on the function base name (for truncating stores). */ ++struct store_scatter_offset_def : public store_scatter_base ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ b.add_overloaded_functions (group, MODE_offset); ++ build_sv_offset (b, "_,as,d,t0", group); ++ build_v_base (b, "_,b,t0", group); ++ build_vs_offset (b, "_,b,ss64,t0", group); ++ } ++}; ++SHAPE (store_scatter_offset) ++ ++/* void svfoo_[s64]offset[_t0](_t *, svint64_t, sv_t) ++ void svfoo_[u32]offset[_t0](_t *, svuint32_t, sv_t) ++ void svfoo_[u64]offset[_t0](_t *, svuint64_t, sv_t) ++ ++ void svfoo[_u32base_t0](svuint32_t, sv_t) ++ void svfoo[_u64base_t0](svuint64_t, sv_t) ++ ++ void svfoo[_u32base]_offset[_t0](svuint32_t, int64_t, sv_t) ++ void svfoo[_u64base]_offset[_t0](svuint64_t, int64_t, sv_t) ++ ++ i.e. a version of store_scatter_offset that doesn't support svint32_t ++ offsets. */ ++struct store_scatter_offset_restricted_def : public store_scatter_base ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ b.add_overloaded_functions (group, MODE_offset); ++ build_sv_uint_offset (b, "_,as,d,t0", group); ++ build_v_base (b, "_,b,t0", group); ++ build_vs_offset (b, "_,b,ss64,t0", group); ++ } ++}; ++SHAPE (store_scatter_offset_restricted) ++ ++/* sv_t svfoo[_t0](svxN_t, sv_t). 
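++   For example (assuming the SVE2 ACLE spelling of the two-vector table
++   lookup): svtbl2 (table, indices) with an svfloat32x2_t table and
++   svuint32_t indices resolves to svtbl2_f32.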
*/ ++struct tbl_tuple_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "v0,t0,vu0", group, MODE_none); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ unsigned int i, nargs; ++ type_suffix_index type; ++ if (!r.check_gp_argument (2, i, nargs) ++ || (type = r.infer_tuple_type (i)) == NUM_TYPE_SUFFIXES ++ || !r.require_derived_vector_type (i + 1, i, type, TYPE_unsigned)) ++ return error_mark_node; ++ ++ return r.resolve_to (r.mode_suffix_id, type); ++ } ++}; ++SHAPE (tbl_tuple) ++ ++/* sv_t svfoo[_t0](sv_t, svbfloatt16_t, svbfloat16_t). */ ++struct ternary_bfloat_def ++ : public ternary_resize2_base<16, TYPE_bfloat, TYPE_bfloat> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "v0,v0,vB,vB", group, MODE_none); ++ } ++}; ++SHAPE (ternary_bfloat) ++ ++/* sv_t svfoo[_t0](sv_t, svbfloat16_t, svbfloat16_t, uint64_t) ++ ++ where the final argument is an integer constant expression in the range ++ [0, 7]. */ ++typedef ternary_bfloat_lane_base<1> ternary_bfloat_lane_def; ++SHAPE (ternary_bfloat_lane) ++ ++/* sv_t svfoo[_t0](sv_t, svbfloat16_t, svbfloat16_t, uint64_t) ++ ++ where the final argument is an integer constant expression in the range ++ [0, 3]. */ ++typedef ternary_bfloat_lane_base<2> ternary_bfloat_lanex2_def; ++SHAPE (ternary_bfloat_lanex2) ++ ++/* sv_t svfoo[_t0](sv_t, svbfloatt16_t, svbfloat16_t) ++ sv_t svfoo[_n_t0](sv_t, svbfloat16_t, bfloat16_t). */ ++struct ternary_bfloat_opt_n_def ++ : public ternary_resize2_opt_n_base<16, TYPE_bfloat, TYPE_bfloat> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "v0,v0,vB,vB", group, MODE_none); ++ build_all (b, "v0,v0,vB,sB", group, MODE_n); ++ } ++}; ++SHAPE (ternary_bfloat_opt_n) ++ ++/* sv_t svfoo[_t0](sv_t, sv_t, sv_t, ++ uint64_t) ++ ++ where the final argument is an integer constant expression in the range ++ [0, 16 / sizeof (_t) - 1]. */ ++struct ternary_intq_uintq_lane_def ++ : public ternary_qq_lane_base ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "v0,v0,vqs0,vqu0,su64", group, MODE_none); ++ } ++}; ++SHAPE (ternary_intq_uintq_lane) ++ ++/* sv_t svfoo[_t0](sv_t, sv_t, sv_t) ++ sv_t svfoo[_n_t0](sv_t, sv_t, ++ _t). */ ++struct ternary_intq_uintq_opt_n_def ++ : public ternary_resize2_opt_n_base ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "v0,v0,vqs0,vqu0", group, MODE_none); ++ build_all (b, "v0,v0,vqs0,squ0", group, MODE_n); ++ } ++}; ++SHAPE (ternary_intq_uintq_opt_n) ++ ++/* svbool_t svfoo[_](sv_t, sv_t, sv_t, uint64_t) ++ ++ where the final argument is an integer constant expression in the ++ range [0, 16 / sizeof (_t) - 1]. 
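++   For example, svmla_lane (acc, a, b, 1) with svfloat32_t operands
++   resolves to svmla_lane_f32, the lane index being limited to [0, 3] for
++   32-bit elements.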
*/ ++struct ternary_lane_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "v0,v0,v0,v0,su64", group, MODE_none); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ return r.resolve_uniform (3, 1); ++ } ++ ++ bool ++ check (function_checker &c) const OVERRIDE ++ { ++ return c.require_immediate_lane_index (3); ++ } ++}; ++SHAPE (ternary_lane) ++ ++/* svbool_t svfoo[_](sv_t, sv_t, sv_t, uint64_t, uint64_t) ++ ++ where the penultimate argument is an integer constant expression in ++ the range [0, 8 / sizeof (_t) - 1] and where the final argument ++ is an integer constant expression in {0, 90, 180, 270}. */ ++struct ternary_lane_rotate_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "v0,v0,v0,v0,su64,su64", group, MODE_none); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ return r.resolve_uniform (3, 2); ++ } ++ ++ bool ++ check (function_checker &c) const OVERRIDE ++ { ++ return (c.require_immediate_lane_index (3, 2) ++ && c.require_immediate_one_of (4, 0, 90, 180, 270)); ++ } ++}; ++SHAPE (ternary_lane_rotate) ++ ++/* sv_t svfoo[_t0](sv_t, sv_t, sv_t, uint64_t) ++ ++ where the final argument is an integer constant expression in the range ++ [0, 32 / sizeof (_t) - 1]. */ ++struct ternary_long_lane_def ++ : public ternary_resize2_lane_base ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "v0,v0,vh0,vh0,su64", group, MODE_none); ++ } ++ ++ bool ++ check (function_checker &c) const OVERRIDE ++ { ++ return c.require_immediate_lane_index (3); ++ } ++}; ++SHAPE (ternary_long_lane) ++ ++/* sv_t svfoo[_t0](sv_t, sv_t, sv_t) ++ sv_t svfoo[_n_t0](sv_t, sv_t, _t) ++ ++ i.e. a version of the standard ternary shape ternary_opt_n in which ++ the element type of the last two arguments is the half-sized ++ equivalent of . */ ++struct ternary_long_opt_n_def ++ : public ternary_resize2_opt_n_base ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "v0,v0,vh0,vh0", group, MODE_none); ++ build_all (b, "v0,v0,vh0,sh0", group, MODE_n); ++ } ++}; ++SHAPE (ternary_long_opt_n) ++ ++/* sv_t svfoo[_t0](sv_t, sv_t, sv_t) ++ sv_t svfoo[_n_t0](sv_t, sv_t, _t) ++ ++ i.e. the standard shape for ternary operations that operate on ++ uniform types. */ ++struct ternary_opt_n_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "v0,v0,v0,v0", group, MODE_none); ++ build_all (b, "v0,v0,v0,s0", group, MODE_n); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ return r.resolve_uniform_opt_n (3); ++ } ++}; ++SHAPE (ternary_opt_n) ++ ++/* sv_t svfoo[_t0](sv_t, sv_t, sv_t, uint64_t) ++ ++ where the final argument is an integer constant expression in the range ++ [0, 16 / sizeof (_t) - 1]. 
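The "[0, 16 / sizeof (..._t) - 1]" ranges used by the lane shapes above are one 128-bit segment's worth of lanes for the element type concerned (the "8 /" variant in ternary_lane_rotate appears to index pairs of elements). A minimal standalone sketch, added here for illustration only and not part of the patch:

#include <cstdint>

/* Highest valid lane index for ELEMENT_BYTES-sized elements within a
   SEGMENT_BYTES (by default 128-bit) segment.  */
constexpr unsigned
max_lane_index (unsigned element_bytes, unsigned segment_bytes = 16)
{
  return segment_bytes / element_bytes - 1;
}

static_assert (max_lane_index (sizeof (std::int16_t)) == 7, "16-bit: 0..7");
static_assert (max_lane_index (sizeof (std::int32_t)) == 3, "32-bit: 0..3");
static_assert (max_lane_index (sizeof (std::int64_t)) == 1, "64-bit: 0..1");

int main () {}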
*/ ++struct ternary_qq_lane_def : public ternary_qq_lane_base<> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "v0,v0,vq0,vq0,su64", group, MODE_none); ++ } ++}; ++SHAPE (ternary_qq_lane) ++ ++/* svbool_t svfoo[_](sv_t, sv_t, sv_t, ++ uint64_t) ++ ++ where the final argument is an integer constant expression in ++ {0, 90, 180, 270}. */ ++struct ternary_qq_lane_rotate_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "v0,v0,vq0,vq0,su64,su64", group, MODE_none); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ unsigned int i, nargs; ++ type_suffix_index type; ++ if (!r.check_gp_argument (5, i, nargs) ++ || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES ++ || !r.require_derived_vector_type (i + 1, i, type, r.SAME_TYPE_CLASS, ++ r.QUARTER_SIZE) ++ || !r.require_derived_vector_type (i + 2, i, type, r.SAME_TYPE_CLASS, ++ r.QUARTER_SIZE) ++ || !r.require_integer_immediate (i + 3) ++ || !r.require_integer_immediate (i + 4)) ++ return error_mark_node; ++ ++ return r.resolve_to (r.mode_suffix_id, type); ++ } ++ ++ bool ++ check (function_checker &c) const OVERRIDE ++ { ++ return (c.require_immediate_lane_index (3, 4) ++ && c.require_immediate_one_of (4, 0, 90, 180, 270)); ++ } ++}; ++SHAPE (ternary_qq_lane_rotate) ++ ++/* sv_t svfoo[_t0](sv_t, sv_t, sv_t) ++ sv_t svfoo[_n_t0](sv_t, sv_t, _t) ++ ++ i.e. a version of the standard ternary shape ternary_opt_n in which ++ the element type of the last two arguments is the quarter-sized ++ equivalent of . */ ++struct ternary_qq_opt_n_def ++ : public ternary_resize2_opt_n_base ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "v0,v0,vq0,vq0", group, MODE_none); ++ build_all (b, "v0,v0,vq0,sq0", group, MODE_n); ++ } ++}; ++SHAPE (ternary_qq_opt_n) ++ ++/* svbool_t svfoo[_](sv_t, sv_t, sv_t, ++ uint64_t) ++ ++ where the final argument is an integer constant expression in ++ {0, 90, 180, 270}. */ ++struct ternary_qq_rotate_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "v0,v0,vq0,vq0,su64", group, MODE_none); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ unsigned int i, nargs; ++ type_suffix_index type; ++ if (!r.check_gp_argument (4, i, nargs) ++ || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES ++ || !r.require_derived_vector_type (i + 1, i, type, r.SAME_TYPE_CLASS, ++ r.QUARTER_SIZE) ++ || !r.require_derived_vector_type (i + 2, i, type, r.SAME_TYPE_CLASS, ++ r.QUARTER_SIZE) ++ || !r.require_integer_immediate (i + 3)) ++ return error_mark_node; ++ ++ return r.resolve_to (r.mode_suffix_id, type); ++ } ++ ++ bool ++ check (function_checker &c) const OVERRIDE ++ { ++ return c.require_immediate_one_of (3, 0, 90, 180, 270); ++ } ++}; ++SHAPE (ternary_qq_rotate) ++ ++/* svbool_t svfoo[_](sv_t, sv_t, sv_t, uint64_t) ++ ++ where the final argument is an integer constant expression in ++ {0, 90, 180, 270}. 
*/ ++struct ternary_rotate_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "v0,v0,v0,v0,su64", group, MODE_none); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ return r.resolve_uniform (3, 1); ++ } ++ ++ bool ++ check (function_checker &c) const OVERRIDE ++ { ++ return c.require_immediate_one_of (3, 0, 90, 180, 270); ++ } ++}; ++SHAPE (ternary_rotate) ++ ++/* sv_t svfoo[_n_t0])(sv_t, sv_t, uint64_t) ++ ++ where the final argument must be an integer constant expression in the ++ range [0, sizeof (_t) * 8 - 1]. */ ++struct ternary_shift_left_imm_def : public ternary_shift_imm_base ++{ ++ bool ++ check (function_checker &c) const OVERRIDE ++ { ++ unsigned int bits = c.type_suffix (0).element_bits; ++ return c.require_immediate_range (2, 0, bits - 1); ++ } ++}; ++SHAPE (ternary_shift_left_imm) ++ ++/* sv_t svfoo[_n_t0])(sv_t, sv_t, uint64_t) ++ ++ where the final argument must be an integer constant expression in the ++ range [1, sizeof (_t) * 8]. */ ++struct ternary_shift_right_imm_def : public ternary_shift_imm_base ++{ ++ bool ++ check (function_checker &c) const OVERRIDE ++ { ++ unsigned int bits = c.type_suffix (0).element_bits; ++ return c.require_immediate_range (2, 1, bits); ++ } ++}; ++SHAPE (ternary_shift_right_imm) ++ ++/* sv_t svfoo[_t0](sv_t, sv_t, sv_t). */ ++struct ternary_uint_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "v0,v0,v0,vu0", group, MODE_none); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ unsigned int i, nargs; ++ type_suffix_index type; ++ if (!r.check_gp_argument (3, i, nargs) ++ || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES ++ || !r.require_matching_vector_type (i + 1, type) ++ || !r.require_derived_vector_type (i + 2, i, type, TYPE_unsigned)) ++ return error_mark_node; ++ ++ return r.resolve_to (r.mode_suffix_id, type); ++ } ++}; ++SHAPE (ternary_uint) ++ ++/* sv_t svfoo[_t0](sv_t, svu_t, ++ sv_t). */ ++struct ternary_uintq_intq_def ++ : public ternary_resize2_base ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "v0,v0,vqu0,vqs0", group, MODE_none); ++ } ++}; ++SHAPE (ternary_uintq_intq) ++ ++/* sv_t svfoo[_t0](sv_t, sv_t, sv_t, ++ uint64_t) ++ ++ where the final argument is an integer constant expression in the range ++ [0, 16 / sizeof (_t) - 1]. */ ++struct ternary_uintq_intq_lane_def ++ : public ternary_qq_lane_base ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "v0,v0,vqu0,vqs0,su64", group, MODE_none); ++ } ++}; ++SHAPE (ternary_uintq_intq_lane) ++ ++/* sv_t svfoo[_t0](sv_t, sv_t, sv_t) ++ sv_t svfoo[_n_t0](sv_t, sv_t, ++ _t). 
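The two ternary shift shapes above differ only in the immediate range they accept: left shifts allow [0, element_bits - 1] while right shifts allow [1, element_bits], which matches the underlying immediate encodings. A standalone sketch of those two checks (illustrative only, not part of the patch):

constexpr bool
valid_left_shift (unsigned amount, unsigned element_bits)
{
  return amount <= element_bits - 1;              /* [0, bits - 1] */
}

constexpr bool
valid_right_shift (unsigned amount, unsigned element_bits)
{
  return amount >= 1 && amount <= element_bits;   /* [1, bits] */
}

static_assert (valid_left_shift (0, 8) && !valid_left_shift (8, 8),
               "left shift by the full element width is invalid");
static_assert (valid_right_shift (8, 8) && !valid_right_shift (0, 8),
               "right shift by zero is invalid");

int main () {}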
*/ ++struct ternary_uintq_intq_opt_n_def ++ : public ternary_resize2_opt_n_base ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "v0,v0,vqu0,vqs0", group, MODE_none); ++ build_all (b, "v0,v0,vqu0,sqs0", group, MODE_n); ++ } ++}; ++SHAPE (ternary_uintq_intq_opt_n) ++ ++/* svbool_t svfoo[_](sv_t, sv_t, uint64_t) ++ ++ where the final argument is an integer constant expression in the ++ range [0, 7]. */ ++struct tmad_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "v0,v0,v0,su64", group, MODE_none); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ return r.resolve_uniform (2, 1); ++ } ++ ++ bool ++ check (function_checker &c) const OVERRIDE ++ { ++ return c.require_immediate_range (2, 0, 7); ++ } ++}; ++SHAPE (tmad) ++ ++/* sv_t svfoo[_t0](sv_t) ++ ++ i.e. the standard shape for unary operations that operate on ++ uniform types. */ ++struct unary_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "v0,v0", group, MODE_none); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ return r.resolve_unary (); ++ } ++}; ++SHAPE (unary) ++ ++/* sv_t svfoo_t0[_t1](sv_t) ++ ++ where the target type must be specified explicitly but the source ++ type can be inferred. */ ++struct unary_convert_def : public overloaded_base<1> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "v0,v1", group, MODE_none); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ return r.resolve_unary (r.type_suffix (0).tclass, ++ r.type_suffix (0).element_bits); ++ } ++}; ++SHAPE (unary_convert) ++ ++/* sv_t svfoo_t0[_t1](sv_t, sv_t) ++ ++ This is a version of unary_convert in which the even-indexed ++ elements are passed in as a first parameter, before any governing ++ predicate. */ ++struct unary_convert_narrowt_def : public overloaded_base<1> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "v0,v1", group, MODE_none); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ return r.resolve_unary (r.type_suffix (0).tclass, ++ r.type_suffix (0).element_bits, true); ++ } ++}; ++SHAPE (unary_convert_narrowt) ++ ++/* sv_t svfoo[_t0](sv_t). */ ++struct unary_long_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "v0,vh0", group, MODE_none); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ unsigned int i, nargs; ++ type_suffix_index type, result_type; ++ if (!r.check_gp_argument (1, i, nargs) ++ || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES ++ || (result_type = long_type_suffix (r, type)) == NUM_TYPE_SUFFIXES) ++ return error_mark_node; ++ ++ if (tree res = r.lookup_form (r.mode_suffix_id, result_type)) ++ return res; ++ ++ return r.report_no_such_form (type); ++ } ++}; ++SHAPE (unary_long) ++ ++/* sv_t svfoo[_n]_t0(_t). 
*/ ++struct unary_n_def : public overloaded_base<1> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ /* The "_n" suffix is optional; the full name has it, but the short ++ name doesn't. */ ++ build_all (b, "v0,s0", group, MODE_n, true); ++ } ++ ++ tree ++ resolve (function_resolver &) const OVERRIDE ++ { ++ /* The short forms just make "_n" implicit, so no resolution is needed. */ ++ gcc_unreachable (); ++ } ++}; ++SHAPE (unary_n) ++ ++/* sv_t svfoo[_t0](sv_t). */ ++typedef unary_narrowb_base<> unary_narrowb_def; ++SHAPE (unary_narrowb) ++ ++/* sv_t svfoo[_t0](sv_t, sv_t). */ ++typedef unary_narrowt_base<> unary_narrowt_def; ++SHAPE (unary_narrowt) ++ ++/* sv_t svfoo[_t0](sv_t). */ ++typedef unary_narrowb_base unary_narrowb_to_uint_def; ++SHAPE (unary_narrowb_to_uint) ++ ++/* sv_t svfoo[_t0](sv_t, sv_t). */ ++typedef unary_narrowt_base unary_narrowt_to_uint_def; ++SHAPE (unary_narrowt_to_uint) ++ ++/* svbool_t svfoo(svbool_t). */ ++struct unary_pred_def : public nonoverloaded_base ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ build_all (b, "v0,v0", group, MODE_none); ++ } ++}; ++SHAPE (unary_pred) ++ ++/* sv_t svfoo[_t0](sv_t) ++ ++ i.e. a version of "unary" in which the returned vector contains ++ signed integers. */ ++struct unary_to_int_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "vs0,v0", group, MODE_none); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ return r.resolve_unary (TYPE_signed); ++ } ++}; ++SHAPE (unary_to_int) ++ ++/* sv_t svfoo[_t0](sv_t) ++ ++ i.e. a version of "unary" in which the returned vector contains ++ unsigned integers. */ ++struct unary_to_uint_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "vu0,v0", group, MODE_none); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ return r.resolve_unary (TYPE_unsigned); ++ } ++}; ++SHAPE (unary_to_uint) ++ ++/* sv_t svfoo[_t0](sv_t) ++ ++ where always belongs a certain type class, and where ++ therefore uniquely determines . */ ++struct unary_uint_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "v0,vu0", group, MODE_none); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ unsigned int i, nargs; ++ type_suffix_index type; ++ if (!r.check_gp_argument (1, i, nargs) ++ || (type = r.infer_unsigned_vector_type (i)) == NUM_TYPE_SUFFIXES) ++ return error_mark_node; ++ ++ /* Search for a valid suffix with the same number of bits as TYPE. */ ++ unsigned int element_bits = type_suffixes[type].element_bits; ++ if (type_suffixes[type].unsigned_p) ++ for (unsigned int j = 0; j < NUM_TYPE_SUFFIXES; ++j) ++ if (type_suffixes[j].element_bits == element_bits) ++ if (tree res = r.lookup_form (r.mode_suffix_id, ++ type_suffix_index (j))) ++ return res; ++ ++ return r.report_no_such_form (type); ++ } ++}; ++SHAPE (unary_uint) ++ ++/* sv_t svfoo[_](sv_t) ++ ++ i.e. a version of "unary" in which the source elements are half the ++ size of the destination elements, but have the same type class. 
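unary_uint::resolve above has to map an unsigned argument suffix onto whatever same-width suffix the function actually provides. A standalone sketch of that search, with a made-up suffix table (not part of the patch):

#include <cstdio>

struct suffix_info { const char *name; unsigned element_bits; bool has_form; };

/* Return the first suffix of width BITS for which a form exists,
   mirroring the lookup_form loop in unary_uint::resolve.  */
static const char *
resolve_same_width (const suffix_info *suffixes, unsigned n, unsigned bits)
{
  for (unsigned i = 0; i < n; ++i)
    if (suffixes[i].element_bits == bits && suffixes[i].has_form)
      return suffixes[i].name;
  return nullptr;               /* i.e. report_no_such_form */
}

int main ()
{
  const suffix_info table[] = { { "_s16", 16, false },
                                { "_f16", 16, true },
                                { "_s32", 32, true } };
  std::printf ("%s\n", resolve_same_width (table, 3, 16));  /* prints _f16 */
}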
*/ ++struct unary_widen_def : public overloaded_base<0> ++{ ++ void ++ build (function_builder &b, const function_group_info &group) const OVERRIDE ++ { ++ b.add_overloaded_functions (group, MODE_none); ++ build_all (b, "v0,vh0", group, MODE_none); ++ } ++ ++ tree ++ resolve (function_resolver &r) const OVERRIDE ++ { ++ unsigned int i, nargs; ++ type_suffix_index type; ++ if (!r.check_gp_argument (1, i, nargs) ++ || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES) ++ return error_mark_node; ++ ++ /* There is only a single form for predicates. */ ++ if (type == TYPE_SUFFIX_b) ++ return r.resolve_to (r.mode_suffix_id, type); ++ ++ if (type_suffixes[type].integer_p ++ && type_suffixes[type].element_bits < 64) ++ { ++ type_suffix_index wide_suffix ++ = find_type_suffix (type_suffixes[type].tclass, ++ type_suffixes[type].element_bits * 2); ++ if (tree res = r.lookup_form (r.mode_suffix_id, wide_suffix)) ++ return res; ++ } ++ ++ return r.report_no_such_form (type); ++ } ++}; ++SHAPE (unary_widen) ++ ++} +diff --git a/gcc/config/aarch64/aarch64-sve-builtins-shapes.h b/gcc/config/aarch64/aarch64-sve-builtins-shapes.h +new file mode 100644 +index 000000000..b36f50acd +--- /dev/null ++++ b/gcc/config/aarch64/aarch64-sve-builtins-shapes.h +@@ -0,0 +1,191 @@ ++/* ACLE support for AArch64 SVE (function shapes) ++ Copyright (C) 2018-2019 Free Software Foundation, Inc. ++ ++ This file is part of GCC. ++ ++ GCC is free software; you can redistribute it and/or modify it ++ under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 3, or (at your option) ++ any later version. ++ ++ GCC is distributed in the hope that it will be useful, but ++ WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ General Public License for more details. ++ ++ You should have received a copy of the GNU General Public License ++ along with GCC; see the file COPYING3. If not see ++ . */ ++ ++#ifndef GCC_AARCH64_SVE_BUILTINS_SHAPES_H ++#define GCC_AARCH64_SVE_BUILTINS_SHAPES_H ++ ++namespace aarch64_sve ++{ ++ /* The naming convention is: ++ ++ - to use the name of the function if the rules are very specific to ++ a particular function (e.g. svext, for which the range of the ++ final immediate value is in no way generic). ++ ++ - to use names like "unary" etc. if the rules are somewhat generic, ++ especially if there are no ranges involved. ++ ++ When using generic names, the handling of the final vector argument ++ can be modified as follows: ++ ++ - an "_n" suffix changes the argument from a vector to a scalar. ++ ++ - an "_opt_n" suffix says that there are two forms of each function: ++ one in which the argument is the usual vector, and one in which it ++ is replaced by a scalar. ++ ++ - "_int" and "_uint" replace the argument's element type with a ++ signed or unsigned integer of the same width. The suffixes above ++ then indicate whether this final argument is or might be a scalar. ++ ++ - "_int64" and "_uint64" similarly replace the argument's element type ++ with int64_t or uint64_t. ++ ++ - "_wide" replaces the argument's element type with a 64-bit integer ++ of the same signedness. This only makes sense for integer elements. ++ ++ - "_lane" indicates that the argument is indexed by a constant lane ++ number, provided as an immediately-following argument of type uint64_t. ++ ++ Also: ++ ++ - "inherent" means that the function takes no arguments. 
++ ++ - "_rotate" means that the final argument is a rotation amount ++ (0, 90, 180 or 270). ++ ++ - "_scalar" indicates that all data arguments are scalars rather ++ than vectors. ++ ++ - in gather/scatter addresses, "sv" stands for "scalar base, ++ vector displacement" while "vs" stands for "vector base, ++ scalar displacement". ++ ++ - "_pred" indicates that the function takes an svbool_t argument ++ that does not act as a governing predicate.. */ ++ namespace shapes ++ { ++ extern const function_shape *const adr_index; ++ extern const function_shape *const adr_offset; ++ extern const function_shape *const binary; ++ extern const function_shape *const binary_int_opt_n; ++ extern const function_shape *const binary_lane; ++ extern const function_shape *const binary_long_lane; ++ extern const function_shape *const binary_long_opt_n; ++ extern const function_shape *const binary_n; ++ extern const function_shape *const binary_narrowb_opt_n; ++ extern const function_shape *const binary_narrowt_opt_n; ++ extern const function_shape *const binary_opt_n; ++ extern const function_shape *const binary_pred; ++ extern const function_shape *const binary_rotate; ++ extern const function_shape *const binary_scalar; ++ extern const function_shape *const binary_to_uint; ++ extern const function_shape *const binary_uint; ++ extern const function_shape *const binary_uint_n; ++ extern const function_shape *const binary_uint_opt_n; ++ extern const function_shape *const binary_uint64_n; ++ extern const function_shape *const binary_uint64_opt_n; ++ extern const function_shape *const binary_wide; ++ extern const function_shape *const binary_wide_opt_n; ++ extern const function_shape *const clast; ++ extern const function_shape *const compare; ++ extern const function_shape *const compare_opt_n; ++ extern const function_shape *const compare_ptr; ++ extern const function_shape *const compare_scalar; ++ extern const function_shape *const compare_wide_opt_n; ++ extern const function_shape *const count_inherent; ++ extern const function_shape *const count_pat; ++ extern const function_shape *const count_pred; ++ extern const function_shape *const count_vector; ++ extern const function_shape *const create; ++ extern const function_shape *const dupq; ++ extern const function_shape *const ext; ++ extern const function_shape *const fold_left; ++ extern const function_shape *const get; ++ extern const function_shape *const inc_dec; ++ extern const function_shape *const inc_dec_pat; ++ extern const function_shape *const inc_dec_pred; ++ extern const function_shape *const inc_dec_pred_scalar; ++ extern const function_shape *const inherent; ++ extern const function_shape *const inherent_b; ++ extern const function_shape *const load; ++ extern const function_shape *const load_ext; ++ extern const function_shape *const load_ext_gather_index; ++ extern const function_shape *const load_ext_gather_index_restricted; ++ extern const function_shape *const load_ext_gather_offset; ++ extern const function_shape *const load_ext_gather_offset_restricted; ++ extern const function_shape *const load_gather_sv; ++ extern const function_shape *const load_gather_sv_restricted; ++ extern const function_shape *const load_gather_vs; ++ extern const function_shape *const load_replicate; ++ extern const function_shape *const mmla; ++ extern const function_shape *const pattern_pred; ++ extern const function_shape *const prefetch; ++ extern const function_shape *const prefetch_gather_index; ++ extern const function_shape *const 
prefetch_gather_offset; ++ extern const function_shape *const ptest; ++ extern const function_shape *const rdffr; ++ extern const function_shape *const reduction; ++ extern const function_shape *const reduction_wide; ++ extern const function_shape *const set; ++ extern const function_shape *const setffr; ++ extern const function_shape *const shift_left_imm_long; ++ extern const function_shape *const shift_left_imm_to_uint; ++ extern const function_shape *const shift_right_imm; ++ extern const function_shape *const shift_right_imm_narrowb; ++ extern const function_shape *const shift_right_imm_narrowt; ++ extern const function_shape *const shift_right_imm_narrowb_to_uint; ++ extern const function_shape *const shift_right_imm_narrowt_to_uint; ++ extern const function_shape *const store; ++ extern const function_shape *const store_scatter_index; ++ extern const function_shape *const store_scatter_index_restricted; ++ extern const function_shape *const store_scatter_offset; ++ extern const function_shape *const store_scatter_offset_restricted; ++ extern const function_shape *const tbl_tuple; ++ extern const function_shape *const ternary_bfloat; ++ extern const function_shape *const ternary_bfloat_lane; ++ extern const function_shape *const ternary_bfloat_lanex2; ++ extern const function_shape *const ternary_bfloat_opt_n; ++ extern const function_shape *const ternary_intq_uintq_lane; ++ extern const function_shape *const ternary_intq_uintq_opt_n; ++ extern const function_shape *const ternary_lane; ++ extern const function_shape *const ternary_lane_rotate; ++ extern const function_shape *const ternary_long_lane; ++ extern const function_shape *const ternary_long_opt_n; ++ extern const function_shape *const ternary_opt_n; ++ extern const function_shape *const ternary_qq_lane; ++ extern const function_shape *const ternary_qq_lane_rotate; ++ extern const function_shape *const ternary_qq_opt_n; ++ extern const function_shape *const ternary_qq_rotate; ++ extern const function_shape *const ternary_rotate; ++ extern const function_shape *const ternary_shift_left_imm; ++ extern const function_shape *const ternary_shift_right_imm; ++ extern const function_shape *const ternary_uint; ++ extern const function_shape *const ternary_uintq_intq; ++ extern const function_shape *const ternary_uintq_intq_lane; ++ extern const function_shape *const ternary_uintq_intq_opt_n; ++ extern const function_shape *const tmad; ++ extern const function_shape *const unary; ++ extern const function_shape *const unary_convert; ++ extern const function_shape *const unary_convert_narrowt; ++ extern const function_shape *const unary_long; ++ extern const function_shape *const unary_n; ++ extern const function_shape *const unary_narrowb; ++ extern const function_shape *const unary_narrowt; ++ extern const function_shape *const unary_narrowb_to_uint; ++ extern const function_shape *const unary_narrowt_to_uint; ++ extern const function_shape *const unary_pred; ++ extern const function_shape *const unary_to_int; ++ extern const function_shape *const unary_to_uint; ++ extern const function_shape *const unary_uint; ++ extern const function_shape *const unary_widen; ++ } ++} ++ ++#endif +diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc b/gcc/config/aarch64/aarch64-sve-builtins.cc +new file mode 100644 +index 000000000..f830d9294 +--- /dev/null ++++ b/gcc/config/aarch64/aarch64-sve-builtins.cc +@@ -0,0 +1,3568 @@ ++/* ACLE support for AArch64 SVE ++ Copyright (C) 2018-2019 Free Software Foundation, Inc. 
++ ++ This file is part of GCC. ++ ++ GCC is free software; you can redistribute it and/or modify it ++ under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 3, or (at your option) ++ any later version. ++ ++ GCC is distributed in the hope that it will be useful, but ++ WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ General Public License for more details. ++ ++ You should have received a copy of the GNU General Public License ++ along with GCC; see the file COPYING3. If not see ++ . */ ++ ++#define IN_TARGET_CODE 1 ++ ++#include "config.h" ++#include "system.h" ++#include "coretypes.h" ++#include "tm.h" ++#include "tree.h" ++#include "rtl.h" ++#include "tm_p.h" ++#include "memmodel.h" ++#include "insn-codes.h" ++#include "optabs.h" ++#include "recog.h" ++#include "diagnostic.h" ++#include "expr.h" ++#include "basic-block.h" ++#include "function.h" ++#include "fold-const.h" ++#include "gimple.h" ++#include "gimple-iterator.h" ++#include "gimplify.h" ++#include "explow.h" ++#include "emit-rtl.h" ++#include "tree-vector-builder.h" ++#include "stor-layout.h" ++#include "regs.h" ++#include "alias.h" ++#include "gimple-fold.h" ++#include "langhooks.h" ++#include "stringpool.h" ++#include "aarch64-sve-builtins.h" ++#include "aarch64-sve-builtins-base.h" ++#include "aarch64-sve-builtins-shapes.h" ++ ++namespace aarch64_sve { ++ ++/* Static information about each single-predicate or single-vector ++ ABI and ACLE type. */ ++struct vector_type_info ++{ ++ /* The name of the type as declared by arm_sve.h. */ ++ const char *acle_name; ++ ++ /* The name of the type specified in AAPCS64. The type is always ++ available under this name, even when arm_sve.h isn't included. */ ++ const char *abi_name; ++ ++ /* The C++ mangling of ABI_NAME. */ ++ const char *mangled_name; ++}; ++ ++/* Describes a function decl. */ ++class GTY(()) registered_function ++{ ++public: ++ /* The ACLE function that the decl represents. */ ++ function_instance instance GTY ((skip)); ++ ++ /* The decl itself. */ ++ tree decl; ++ ++ /* The architecture extensions that the function requires, as a set of ++ AARCH64_FL_* flags. */ ++ uint64_t required_extensions; ++ ++ /* True if the decl represents an overloaded function that needs to be ++ resolved by function_resolver. */ ++ bool overloaded_p; ++}; ++ ++/* Hash traits for registered_function. */ ++struct registered_function_hasher : nofree_ptr_hash ++{ ++ typedef function_instance compare_type; ++ ++ static hashval_t hash (value_type); ++ static bool equal (value_type, const compare_type &); ++}; ++ ++/* Information about each single-predicate or single-vector type. */ ++static CONSTEXPR const vector_type_info vector_types[] = { ++#define DEF_SVE_TYPE(ACLE_NAME, NCHARS, ABI_NAME, SCALAR_TYPE) \ ++ { #ACLE_NAME, #ABI_NAME, #NCHARS #ABI_NAME }, ++#include "aarch64-sve-builtins.def" ++}; ++ ++/* The function name suffix associated with each predication type. */ ++static const char *const pred_suffixes[NUM_PREDS + 1] = { ++ "", ++ "", ++ "_m", ++ "_x", ++ "_z", ++ "" ++}; ++ ++/* Static information about each mode_suffix_index. 
*/ ++CONSTEXPR const mode_suffix_info mode_suffixes[] = { ++#define VECTOR_TYPE_none NUM_VECTOR_TYPES ++#define DEF_SVE_MODE(NAME, BASE, DISPLACEMENT, UNITS) \ ++ { "_" #NAME, VECTOR_TYPE_##BASE, VECTOR_TYPE_##DISPLACEMENT, UNITS_##UNITS }, ++#include "aarch64-sve-builtins.def" ++#undef VECTOR_TYPE_none ++ { "", NUM_VECTOR_TYPES, NUM_VECTOR_TYPES, UNITS_none } ++}; ++ ++/* Static information about each type_suffix_index. */ ++CONSTEXPR const type_suffix_info type_suffixes[NUM_TYPE_SUFFIXES + 1] = { ++#define DEF_SVE_TYPE_SUFFIX(NAME, ACLE_TYPE, CLASS, BITS, MODE) \ ++ { "_" #NAME, \ ++ VECTOR_TYPE_##ACLE_TYPE, \ ++ TYPE_##CLASS, \ ++ BITS, \ ++ BITS / BITS_PER_UNIT, \ ++ TYPE_##CLASS == TYPE_signed || TYPE_##CLASS == TYPE_unsigned, \ ++ TYPE_##CLASS == TYPE_unsigned, \ ++ TYPE_##CLASS == TYPE_float, \ ++ TYPE_##CLASS == TYPE_bool, \ ++ 0, \ ++ MODE }, ++#include "aarch64-sve-builtins.def" ++ { "", NUM_VECTOR_TYPES, TYPE_bool, 0, 0, false, false, false, false, ++ 0, VOIDmode } ++}; ++ ++/* Define a TYPES_ macro for each combination of type ++ suffixes that an ACLE function can have, where is the ++ name used in DEF_SVE_FUNCTION entries. ++ ++ Use S (T) for single type suffix T and D (T1, T2) for a pair of type ++ suffixes T1 and T2. Use commas to separate the suffixes. ++ ++ Although the order shouldn't matter, the convention is to sort the ++ suffixes lexicographically after dividing suffixes into a type ++ class ("b", "f", etc.) and a numerical bit count. */ ++ ++/* _b8 _b16 _b32 _b64. */ ++#define TYPES_all_pred(S, D) \ ++ S (b8), S (b16), S (b32), S (b64) ++ ++/* _f16 _f32 _f64. */ ++#define TYPES_all_float(S, D) \ ++ S (f16), S (f32), S (f64) ++ ++/* _s8 _s16 _s32 _s64. */ ++#define TYPES_all_signed(S, D) \ ++ S (s8), S (s16), S (s32), S (s64) ++ ++/* _f16 _f32 _f64 ++ _s8 _s16 _s32 _s64. */ ++#define TYPES_all_float_and_signed(S, D) \ ++ TYPES_all_float (S, D), TYPES_all_signed (S, D) ++ ++/* _u8 _u16 _u32 _u64. */ ++#define TYPES_all_unsigned(S, D) \ ++ S (u8), S (u16), S (u32), S (u64) ++ ++/* _s8 _s16 _s32 _s64 ++ _u8 _u16 _u32 _u64. */ ++#define TYPES_all_integer(S, D) \ ++ TYPES_all_signed (S, D), TYPES_all_unsigned (S, D) ++ ++/* _f16 _f32 _f64 ++ _s8 _s16 _s32 _s64 ++ _u8 _u16 _u32 _u64. */ ++#define TYPES_all_arith(S, D) \ ++ TYPES_all_float (S, D), TYPES_all_integer (S, D) ++ ++/* _bf16 ++ _f16 _f32 _f64 ++ _s8 _s16 _s32 _s64 ++ _u8 _u16 _u32 _u64. */ ++#define TYPES_all_data(S, D) \ ++ S (bf16), TYPES_all_arith (S, D) ++ ++/* _b only. */ ++#define TYPES_b(S, D) \ ++ S (b) ++ ++/* _u8. */ ++#define TYPES_b_unsigned(S, D) \ ++ S (u8) ++ ++/* _s8 ++ _u8. */ ++#define TYPES_b_integer(S, D) \ ++ S (s8), TYPES_b_unsigned (S, D) ++ ++/* _s8 _s16 ++ _u8 _u16. */ ++#define TYPES_bh_integer(S, D) \ ++ S (s8), S (s16), S (u8), S (u16) ++ ++/* _u8 _u32. */ ++#define TYPES_bs_unsigned(S, D) \ ++ S (u8), S (u32) ++ ++/* _s8 _s16 _s32. */ ++#define TYPES_bhs_signed(S, D) \ ++ S (s8), S (s16), S (s32) ++ ++/* _u8 _u16 _u32. */ ++#define TYPES_bhs_unsigned(S, D) \ ++ S (u8), S (u16), S (u32) ++ ++/* _s8 _s16 _s32 ++ _u8 _u16 _u32. */ ++#define TYPES_bhs_integer(S, D) \ ++ TYPES_bhs_signed (S, D), TYPES_bhs_unsigned (S, D) ++ ++/* _s16 ++ _u16. */ ++#define TYPES_h_integer(S, D) \ ++ S (s16), S (u16) ++ ++/* _s16 _s32. */ ++#define TYPES_hs_signed(S, D) \ ++ S (s16), S (s32) ++ ++/* _s16 _s32 ++ _u16 _u32. */ ++#define TYPES_hs_integer(S, D) \ ++ TYPES_hs_signed (S, D), S (u16), S (u32) ++ ++/* _f16 _f32. */ ++#define TYPES_hs_float(S, D) \ ++ S (f16), S (f32) ++ ++/* _u16 _u64. 
*/ ++#define TYPES_hd_unsigned(S, D) \ ++ S (u16), S (u64) ++ ++/* _s16 _s32 _s64. */ ++#define TYPES_hsd_signed(S, D) \ ++ S (s16), S (s32), S (s64) ++ ++/* _s16 _s32 _s64 ++ _u16 _u32 _u64. */ ++#define TYPES_hsd_integer(S, D) \ ++ TYPES_hsd_signed (S, D), S (u16), S (u32), S (u64) ++ ++/* _f32. */ ++#define TYPES_s_float(S, D) \ ++ S (f32) ++ ++/* _f32 ++ _s16 _s32 _s64 ++ _u16 _u32 _u64. */ ++#define TYPES_s_float_hsd_integer(S, D) \ ++ TYPES_s_float (S, D), TYPES_hsd_integer (S, D) ++ ++/* _f32 ++ _s32 _s64 ++ _u32 _u64. */ ++#define TYPES_s_float_sd_integer(S, D) \ ++ TYPES_s_float (S, D), TYPES_sd_integer (S, D) ++ ++/* _s32. */ ++#define TYPES_s_signed(S, D) \ ++ S (s32) ++ ++/* _u32. */ ++#define TYPES_s_unsigned(S, D) \ ++ S (u32) ++ ++/* _s32 _u32. */ ++#define TYPES_s_integer(S, D) \ ++ TYPES_s_signed (S, D), TYPES_s_unsigned (S, D) ++ ++/* _s32 _s64. */ ++#define TYPES_sd_signed(S, D) \ ++ S (s32), S (s64) ++ ++/* _u32 _u64. */ ++#define TYPES_sd_unsigned(S, D) \ ++ S (u32), S (u64) ++ ++/* _s32 _s64 ++ _u32 _u64. */ ++#define TYPES_sd_integer(S, D) \ ++ TYPES_sd_signed (S, D), TYPES_sd_unsigned (S, D) ++ ++/* _f32 _f64 ++ _s32 _s64 ++ _u32 _u64. */ ++#define TYPES_sd_data(S, D) \ ++ S (f32), S (f64), TYPES_sd_integer (S, D) ++ ++/* _f16 _f32 _f64 ++ _s32 _s64 ++ _u32 _u64. */ ++#define TYPES_all_float_and_sd_integer(S, D) \ ++ TYPES_all_float (S, D), TYPES_sd_integer (S, D) ++ ++/* _f64. */ ++#define TYPES_d_float(S, D) \ ++ S (f64) ++ ++/* _u64. */ ++#define TYPES_d_unsigned(S, D) \ ++ S (u64) ++ ++/* _s64 ++ _u64. */ ++#define TYPES_d_integer(S, D) \ ++ S (s64), TYPES_d_unsigned (S, D) ++ ++/* _f64 ++ _s64 ++ _u64. */ ++#define TYPES_d_data(S, D) \ ++ TYPES_d_float (S, D), TYPES_d_integer (S, D) ++ ++/* All the type combinations allowed by svcvt. */ ++#define TYPES_cvt(S, D) \ ++ D (f16, f32), D (f16, f64), \ ++ D (f16, s16), D (f16, s32), D (f16, s64), \ ++ D (f16, u16), D (f16, u32), D (f16, u64), \ ++ \ ++ D (f32, f16), D (f32, f64), \ ++ D (f32, s32), D (f32, s64), \ ++ D (f32, u32), D (f32, u64), \ ++ \ ++ D (f64, f16), D (f64, f32), \ ++ D (f64, s32), D (f64, s64), \ ++ D (f64, u32), D (f64, u64), \ ++ \ ++ D (s16, f16), \ ++ D (s32, f16), D (s32, f32), D (s32, f64), \ ++ D (s64, f16), D (s64, f32), D (s64, f64), \ ++ \ ++ D (u16, f16), \ ++ D (u32, f16), D (u32, f32), D (u32, f64), \ ++ D (u64, f16), D (u64, f32), D (u64, f64) ++ ++/* _bf16_f32. */ ++#define TYPES_cvt_bfloat(S, D) \ ++ D (bf16, f32) ++ ++/* _f32_f16 ++ _f64_f32. */ ++#define TYPES_cvt_long(S, D) \ ++ D (f32, f16), D (f64, f32) ++ ++/* _f16_f32. */ ++#define TYPES_cvt_narrow_s(S, D) \ ++ D (f32, f64) ++ ++/* _f16_f32 ++ _f32_f64. */ ++#define TYPES_cvt_narrow(S, D) \ ++ D (f16, f32), TYPES_cvt_narrow_s (S, D) ++ ++/* { _s32 _s64 } x { _b8 _b16 _b32 _b64 } ++ { _u32 _u64 }. */ ++#define TYPES_inc_dec_n1(D, A) \ ++ D (A, b8), D (A, b16), D (A, b32), D (A, b64) ++#define TYPES_inc_dec_n(S, D) \ ++ TYPES_inc_dec_n1 (D, s32), \ ++ TYPES_inc_dec_n1 (D, s64), \ ++ TYPES_inc_dec_n1 (D, u32), \ ++ TYPES_inc_dec_n1 (D, u64) ++ ++/* { _bf16 } { _bf16 } ++ { _f16 _f32 _f64 } { _f16 _f32 _f64 } ++ { _s8 _s16 _s32 _s64 } x { _s8 _s16 _s32 _s64 } ++ { _u8 _u16 _u32 _u64 } { _u8 _u16 _u32 _u64 }. 
*/ ++#define TYPES_reinterpret1(D, A) \ ++ D (A, bf16), \ ++ D (A, f16), D (A, f32), D (A, f64), \ ++ D (A, s8), D (A, s16), D (A, s32), D (A, s64), \ ++ D (A, u8), D (A, u16), D (A, u32), D (A, u64) ++#define TYPES_reinterpret(S, D) \ ++ TYPES_reinterpret1 (D, bf16), \ ++ TYPES_reinterpret1 (D, f16), \ ++ TYPES_reinterpret1 (D, f32), \ ++ TYPES_reinterpret1 (D, f64), \ ++ TYPES_reinterpret1 (D, s8), \ ++ TYPES_reinterpret1 (D, s16), \ ++ TYPES_reinterpret1 (D, s32), \ ++ TYPES_reinterpret1 (D, s64), \ ++ TYPES_reinterpret1 (D, u8), \ ++ TYPES_reinterpret1 (D, u16), \ ++ TYPES_reinterpret1 (D, u32), \ ++ TYPES_reinterpret1 (D, u64) ++ ++/* { _b8 _b16 _b32 _b64 } x { _s32 _s64 } ++ { _u32 _u64 } */ ++#define TYPES_while1(D, bn) \ ++ D (bn, s32), D (bn, s64), D (bn, u32), D (bn, u64) ++#define TYPES_while(S, D) \ ++ TYPES_while1 (D, b8), \ ++ TYPES_while1 (D, b16), \ ++ TYPES_while1 (D, b32), \ ++ TYPES_while1 (D, b64) ++ ++/* Describe a pair of type suffixes in which only the first is used. */ ++#define DEF_VECTOR_TYPE(X) { TYPE_SUFFIX_ ## X, NUM_TYPE_SUFFIXES } ++ ++/* Describe a pair of type suffixes in which both are used. */ ++#define DEF_DOUBLE_TYPE(X, Y) { TYPE_SUFFIX_ ## X, TYPE_SUFFIX_ ## Y } ++ ++/* Create an array that can be used in aarch64-sve-builtins.def to ++ select the type suffixes in TYPES_. */ ++#define DEF_SVE_TYPES_ARRAY(NAME) \ ++ static const type_suffix_pair types_##NAME[] = { \ ++ TYPES_##NAME (DEF_VECTOR_TYPE, DEF_DOUBLE_TYPE), \ ++ { NUM_TYPE_SUFFIXES, NUM_TYPE_SUFFIXES } \ ++ } ++ ++/* For functions that don't take any type suffixes. */ ++static const type_suffix_pair types_none[] = { ++ { NUM_TYPE_SUFFIXES, NUM_TYPE_SUFFIXES }, ++ { NUM_TYPE_SUFFIXES, NUM_TYPE_SUFFIXES } ++}; ++ ++/* Create an array for each TYPES_ macro above. */ ++DEF_SVE_TYPES_ARRAY (all_pred); ++DEF_SVE_TYPES_ARRAY (all_float); ++DEF_SVE_TYPES_ARRAY (all_signed); ++DEF_SVE_TYPES_ARRAY (all_float_and_signed); ++DEF_SVE_TYPES_ARRAY (all_unsigned); ++DEF_SVE_TYPES_ARRAY (all_integer); ++DEF_SVE_TYPES_ARRAY (all_arith); ++DEF_SVE_TYPES_ARRAY (all_data); ++DEF_SVE_TYPES_ARRAY (b); ++DEF_SVE_TYPES_ARRAY (b_unsigned); ++DEF_SVE_TYPES_ARRAY (b_integer); ++DEF_SVE_TYPES_ARRAY (bh_integer); ++DEF_SVE_TYPES_ARRAY (bs_unsigned); ++DEF_SVE_TYPES_ARRAY (bhs_signed); ++DEF_SVE_TYPES_ARRAY (bhs_unsigned); ++DEF_SVE_TYPES_ARRAY (bhs_integer); ++DEF_SVE_TYPES_ARRAY (h_integer); ++DEF_SVE_TYPES_ARRAY (hs_signed); ++DEF_SVE_TYPES_ARRAY (hs_integer); ++DEF_SVE_TYPES_ARRAY (hs_float); ++DEF_SVE_TYPES_ARRAY (hd_unsigned); ++DEF_SVE_TYPES_ARRAY (hsd_signed); ++DEF_SVE_TYPES_ARRAY (hsd_integer); ++DEF_SVE_TYPES_ARRAY (s_float); ++DEF_SVE_TYPES_ARRAY (s_float_hsd_integer); ++DEF_SVE_TYPES_ARRAY (s_float_sd_integer); ++DEF_SVE_TYPES_ARRAY (s_signed); ++DEF_SVE_TYPES_ARRAY (s_unsigned); ++DEF_SVE_TYPES_ARRAY (s_integer); ++DEF_SVE_TYPES_ARRAY (sd_signed); ++DEF_SVE_TYPES_ARRAY (sd_unsigned); ++DEF_SVE_TYPES_ARRAY (sd_integer); ++DEF_SVE_TYPES_ARRAY (sd_data); ++DEF_SVE_TYPES_ARRAY (all_float_and_sd_integer); ++DEF_SVE_TYPES_ARRAY (d_float); ++DEF_SVE_TYPES_ARRAY (d_unsigned); ++DEF_SVE_TYPES_ARRAY (d_integer); ++DEF_SVE_TYPES_ARRAY (d_data); ++DEF_SVE_TYPES_ARRAY (cvt); ++DEF_SVE_TYPES_ARRAY (cvt_bfloat); ++DEF_SVE_TYPES_ARRAY (cvt_long); ++DEF_SVE_TYPES_ARRAY (cvt_narrow_s); ++DEF_SVE_TYPES_ARRAY (cvt_narrow); ++DEF_SVE_TYPES_ARRAY (inc_dec_n); ++DEF_SVE_TYPES_ARRAY (reinterpret); ++DEF_SVE_TYPES_ARRAY (while); ++ ++/* Used by functions that have no governing predicate. 
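The S()/D() parameters of the TYPES_... macros above are callbacks: S expands a single suffix, D a pair, and DEF_SVE_TYPES_ARRAY instantiates them to build a sentinel-terminated array. A small self-contained sketch of the same pattern with invented suffixes (not part of the patch):

#include <cstdio>

enum suffix { SFX_s32, SFX_u32, SFX_f32, NUM_SUFFIXES };
struct suffix_pair { suffix first, second; };

/* One S() entry per single suffix, one D() entry per suffix pair.  */
#define TYPES_demo(S, D) S (s32), S (u32), D (f32, s32)

#define ONE(X)    { SFX_##X, NUM_SUFFIXES }
#define TWO(X, Y) { SFX_##X, SFX_##Y }
#define DEF_TYPES_ARRAY(NAME) \
  static const suffix_pair types_##NAME[] = { \
    TYPES_##NAME (ONE, TWO), { NUM_SUFFIXES, NUM_SUFFIXES } \
  }

DEF_TYPES_ARRAY (demo);

int main ()
{
  for (unsigned i = 0; types_demo[i].first != NUM_SUFFIXES; ++i)
    std::printf ("entry %u: (%d, %d)\n", i,
                 (int) types_demo[i].first, (int) types_demo[i].second);
}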
*/ ++static const predication_index preds_none[] = { PRED_none, NUM_PREDS }; ++ ++/* Used by functions that have a governing predicate but do not have an ++ explicit suffix. */ ++static const predication_index preds_implicit[] = { PRED_implicit, NUM_PREDS }; ++ ++/* Used by functions that allow merging and "don't care" predication, ++ but are not suitable for predicated MOVPRFX. */ ++static const predication_index preds_mx[] = { ++ PRED_m, PRED_x, NUM_PREDS ++}; ++ ++/* Used by functions that allow merging, zeroing and "don't care" ++ predication. */ ++static const predication_index preds_mxz[] = { ++ PRED_m, PRED_x, PRED_z, NUM_PREDS ++}; ++ ++/* Used by functions that have the mxz predicated forms above, and in addition ++ have an unpredicated form. */ ++static const predication_index preds_mxz_or_none[] = { ++ PRED_m, PRED_x, PRED_z, PRED_none, NUM_PREDS ++}; ++ ++/* Used by functions that allow merging and zeroing predication but have ++ no "_x" form. */ ++static const predication_index preds_mz[] = { PRED_m, PRED_z, NUM_PREDS }; ++ ++/* Used by functions that have an unpredicated form and a _z predicated ++ form. */ ++static const predication_index preds_z_or_none[] = { ++ PRED_z, PRED_none, NUM_PREDS ++}; ++ ++/* Used by (mostly predicate) functions that only support "_z" predication. */ ++static const predication_index preds_z[] = { PRED_z, NUM_PREDS }; ++ ++/* A list of all SVE ACLE functions. */ ++static CONSTEXPR const function_group_info function_groups[] = { ++#define DEF_SVE_FUNCTION(NAME, SHAPE, TYPES, PREDS) \ ++ { #NAME, &functions::NAME, &shapes::SHAPE, types_##TYPES, preds_##PREDS, \ ++ REQUIRED_EXTENSIONS | AARCH64_FL_SVE }, ++#include "aarch64-sve-builtins.def" ++}; ++ ++/* The scalar type associated with each vector type. */ ++GTY(()) tree scalar_types[NUM_VECTOR_TYPES]; ++ ++/* The single-predicate and single-vector types, with their built-in ++ "__SV..._t" name. Allow an index of NUM_VECTOR_TYPES, which always ++ yields a null tree. */ ++static GTY(()) tree abi_vector_types[NUM_VECTOR_TYPES + 1]; ++ ++/* Same, but with the arm_sve.h "sv..._t" name. */ ++GTY(()) tree acle_vector_types[MAX_TUPLE_SIZE][NUM_VECTOR_TYPES + 1]; ++ ++/* The svpattern enum type. */ ++GTY(()) tree acle_svpattern; ++ ++/* The svprfop enum type. */ ++GTY(()) tree acle_svprfop; ++ ++/* The list of all registered function decls, indexed by code. */ ++static GTY(()) vec *registered_functions; ++ ++/* All registered function decls, hashed on the function_instance ++ that they implement. This is used for looking up implementations of ++ overloaded functions. */ ++static hash_table *function_table; ++ ++/* True if we've already complained about attempts to use functions ++ when the required extension is disabled. */ ++static bool reported_missing_extension_p; ++ ++/* If TYPE is an ACLE vector type, return the associated vector_type, ++ otherwise return NUM_VECTOR_TYPES. */ ++static vector_type_index ++find_vector_type (const_tree type) ++{ ++ /* A linear search should be OK here, since the code isn't hot and ++ the number of types is only small. */ ++ type = TYPE_MAIN_VARIANT (type); ++ for (unsigned int i = 0; i < NUM_VECTOR_TYPES; ++i) ++ if (type == abi_vector_types[i]) ++ return vector_type_index (i); ++ return NUM_VECTOR_TYPES; ++} ++ ++/* If TYPE is a valid SVE element type, return the corresponding type ++ suffix, otherwise return NUM_TYPE_SUFFIXES. 
*/ ++static type_suffix_index ++find_type_suffix_for_scalar_type (const_tree type) ++{ ++ /* A linear search should be OK here, since the code isn't hot and ++ the number of types is only small. */ ++ type = TYPE_MAIN_VARIANT (type); ++ for (unsigned int suffix_i = 0; suffix_i < NUM_TYPE_SUFFIXES; ++suffix_i) ++ if (!type_suffixes[suffix_i].bool_p) ++ { ++ vector_type_index vector_i = type_suffixes[suffix_i].vector_type; ++ if (type == TYPE_MAIN_VARIANT (scalar_types[vector_i])) ++ return type_suffix_index (suffix_i); ++ } ++ return NUM_TYPE_SUFFIXES; ++} ++ ++/* Report an error against LOCATION that the user has tried to use ++ function FNDECL when extension EXTENSION is disabled. */ ++static void ++report_missing_extension (location_t location, tree fndecl, ++ const char *extension) ++{ ++ /* Avoid reporting a slew of messages for a single oversight. */ ++ if (reported_missing_extension_p) ++ return; ++ ++ error_at (location, "ACLE function %qD requires ISA extension %qs", ++ fndecl, extension); ++ inform (location, "you can enable %qs using the command-line" ++ " option %<-march%>, or by using the %" ++ " attribute or pragma", extension); ++ reported_missing_extension_p = true; ++} ++ ++/* Check whether all the AARCH64_FL_* values in REQUIRED_EXTENSIONS are ++ enabled, given that those extensions are required for function FNDECL. ++ Report an error against LOCATION if not. */ ++static bool ++check_required_extensions (location_t location, tree fndecl, ++ uint64_t required_extensions) ++{ ++ uint64_t missing_extensions = required_extensions & ~aarch64_isa_flags; ++ if (missing_extensions == 0) ++ return true; ++ ++ static const struct { uint64_t flag; const char *name; } extensions[] = { ++#define AARCH64_OPT_EXTENSION(EXT_NAME, FLAG_CANONICAL, FLAGS_ON, FLAGS_OFF, \ ++ SYNTHETIC, FEATURE_STRING) \ ++ { FLAG_CANONICAL, EXT_NAME }, ++#include "aarch64-option-extensions.def" ++ }; ++ ++ for (unsigned int i = 0; i < ARRAY_SIZE (extensions); ++i) ++ if (missing_extensions & extensions[i].flag) ++ { ++ report_missing_extension (location, fndecl, extensions[i].name); ++ return false; ++ } ++ gcc_unreachable (); ++} ++ ++/* Report that LOCATION has a call to FNDECL in which argument ARGNO ++ was not an integer constant expression. ARGNO counts from zero. */ ++static void ++report_non_ice (location_t location, tree fndecl, unsigned int argno) ++{ ++ error_at (location, "argument %d of %qE must be an integer constant" ++ " expression", argno + 1, fndecl); ++} ++ ++/* Report that LOCATION has a call to FNDECL in which argument ARGNO has ++ the value ACTUAL, whereas the function requires a value in the range ++ [MIN, MAX]. ARGNO counts from zero. */ ++static void ++report_out_of_range (location_t location, tree fndecl, unsigned int argno, ++ HOST_WIDE_INT actual, HOST_WIDE_INT min, ++ HOST_WIDE_INT max) ++{ ++ error_at (location, "passing %wd to argument %d of %qE, which expects" ++ " a value in the range [%wd, %wd]", actual, argno + 1, fndecl, ++ min, max); ++} ++ ++/* Report that LOCATION has a call to FNDECL in which argument ARGNO has ++ the value ACTUAL, whereas the function requires either VALUE0 or ++ VALUE1. ARGNO counts from zero. 
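check_required_extensions above reduces the feature test to a single mask operation: the extensions that are required but not enabled are required & ~enabled. A minimal sketch with invented flag values (the real code uses the AARCH64_FL_* bits):

#include <cstdint>
#include <cstdio>

int main ()
{
  const std::uint64_t FL_SVE = 1, FL_BF16 = 2;   /* invented values */

  std::uint64_t enabled  = FL_SVE;               /* what -march enables */
  std::uint64_t required = FL_SVE | FL_BF16;     /* what the builtin needs */

  std::uint64_t missing = required & ~enabled;
  if (missing & FL_BF16)
    std::printf ("missing extension: bf16\n");
}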
*/ ++static void ++report_neither_nor (location_t location, tree fndecl, unsigned int argno, ++ HOST_WIDE_INT actual, HOST_WIDE_INT value0, ++ HOST_WIDE_INT value1) ++{ ++ error_at (location, "passing %wd to argument %d of %qE, which expects" ++ " either %wd or %wd", actual, argno + 1, fndecl, value0, value1); ++} ++ ++/* Report that LOCATION has a call to FNDECL in which argument ARGNO has ++ the value ACTUAL, whereas the function requires one of VALUE0..3. ++ ARGNO counts from zero. */ ++static void ++report_not_one_of (location_t location, tree fndecl, unsigned int argno, ++ HOST_WIDE_INT actual, HOST_WIDE_INT value0, ++ HOST_WIDE_INT value1, HOST_WIDE_INT value2, ++ HOST_WIDE_INT value3) ++{ ++ error_at (location, "passing %wd to argument %d of %qE, which expects" ++ " %wd, %wd, %wd or %wd", actual, argno + 1, fndecl, value0, value1, ++ value2, value3); ++} ++ ++/* Report that LOCATION has a call to FNDECL in which argument ARGNO has ++ the value ACTUAL, whereas the function requires a valid value of ++ enum type ENUMTYPE. ARGNO counts from zero. */ ++static void ++report_not_enum (location_t location, tree fndecl, unsigned int argno, ++ HOST_WIDE_INT actual, tree enumtype) ++{ ++ error_at (location, "passing %wd to argument %d of %qE, which expects" ++ " a valid %qT value", actual, argno + 1, fndecl, enumtype); ++} ++ ++/* Return a hash code for a function_instance. */ ++hashval_t ++function_instance::hash () const ++{ ++ inchash::hash h; ++ /* BASE uniquely determines BASE_NAME, so we don't need to hash both. */ ++ h.add_ptr (base); ++ h.add_ptr (shape); ++ h.add_int (mode_suffix_id); ++ h.add_int (type_suffix_ids[0]); ++ h.add_int (type_suffix_ids[1]); ++ h.add_int (pred); ++ return h.end (); ++} ++ ++/* Return a set of CP_* flags that describe what the function could do, ++ taking the command-line flags into account. */ ++unsigned int ++function_instance::call_properties () const ++{ ++ unsigned int flags = base->call_properties (*this); ++ ++ /* -fno-trapping-math means that we can assume any FP exceptions ++ are not user-visible. */ ++ if (!flag_trapping_math) ++ flags &= ~CP_RAISE_FP_EXCEPTIONS; ++ ++ return flags; ++} ++ ++/* Return true if calls to the function could read some form of ++ global state. */ ++bool ++function_instance::reads_global_state_p () const ++{ ++ unsigned int flags = call_properties (); ++ ++ /* Preserve any dependence on rounding mode, flush to zero mode, etc. ++ There is currently no way of turning this off; in particular, ++ -fno-rounding-math (which is the default) means that we should make ++ the usual assumptions about rounding mode, which for intrinsics means ++ acting as the instructions do. */ ++ if (flags & CP_READ_FPCR) ++ return true; ++ ++ /* Handle direct reads of global state. */ ++ return flags & (CP_READ_MEMORY | CP_READ_FFR); ++} ++ ++/* Return true if calls to the function could modify some form of ++ global state. */ ++bool ++function_instance::modifies_global_state_p () const ++{ ++ unsigned int flags = call_properties (); ++ ++ /* Preserve any exception state written back to the FPCR, ++ unless -fno-trapping-math says this is unnecessary. */ ++ if (flags & CP_RAISE_FP_EXCEPTIONS) ++ return true; ++ ++ /* Treat prefetches as modifying global state, since that's the ++ only means we have of keeping them in their correct position. */ ++ if (flags & CP_PREFETCH_MEMORY) ++ return true; ++ ++ /* Handle direct modifications of global state. 
*/ ++ return flags & (CP_WRITE_MEMORY | CP_WRITE_FFR); ++} ++ ++/* Return true if calls to the function could raise a signal. */ ++bool ++function_instance::could_trap_p () const ++{ ++ unsigned int flags = call_properties (); ++ ++ /* Handle functions that could raise SIGFPE. */ ++ if (flags & CP_RAISE_FP_EXCEPTIONS) ++ return true; ++ ++ /* Handle functions that could raise SIGBUS or SIGSEGV. */ ++ if (flags & (CP_READ_MEMORY | CP_WRITE_MEMORY)) ++ return true; ++ ++ return false; ++} ++ ++inline hashval_t ++registered_function_hasher::hash (value_type value) ++{ ++ return value->instance.hash (); ++} ++ ++inline bool ++registered_function_hasher::equal (value_type value, const compare_type &key) ++{ ++ return value->instance == key; ++} ++ ++sve_switcher::sve_switcher () ++ : m_old_isa_flags (aarch64_isa_flags) ++{ ++ /* Changing the ISA flags and have_regs_of_mode should be enough here. ++ We shouldn't need to pay the compile-time cost of a full target ++ switch. */ ++ aarch64_isa_flags = (AARCH64_FL_FP | AARCH64_FL_SIMD | AARCH64_FL_F16 ++ | AARCH64_FL_SVE); ++ ++ memcpy (m_old_have_regs_of_mode, have_regs_of_mode, ++ sizeof (have_regs_of_mode)); ++ for (int i = 0; i < NUM_MACHINE_MODES; ++i) ++ if (aarch64_sve_mode_p ((machine_mode) i)) ++ have_regs_of_mode[i] = true; ++} ++ ++sve_switcher::~sve_switcher () ++{ ++ memcpy (have_regs_of_mode, m_old_have_regs_of_mode, ++ sizeof (have_regs_of_mode)); ++ aarch64_isa_flags = m_old_isa_flags; ++} ++ ++function_builder::function_builder () ++{ ++ m_overload_type = build_function_type (void_type_node, void_list_node); ++ m_direct_overloads = lang_GNU_CXX (); ++ gcc_obstack_init (&m_string_obstack); ++} ++ ++function_builder::~function_builder () ++{ ++ obstack_free (&m_string_obstack, NULL); ++} ++ ++/* Add NAME to the end of the function name being built. */ ++void ++function_builder::append_name (const char *name) ++{ ++ obstack_grow (&m_string_obstack, name, strlen (name)); ++} ++ ++/* Zero-terminate and complete the function name being built. */ ++char * ++function_builder::finish_name () ++{ ++ obstack_1grow (&m_string_obstack, 0); ++ return (char *) obstack_finish (&m_string_obstack); ++} ++ ++/* Return the overloaded or full function name for INSTANCE; OVERLOADED_P ++ selects which. Allocate the string on m_string_obstack; the caller ++ must use obstack_free to free it after use. */ ++char * ++function_builder::get_name (const function_instance &instance, ++ bool overloaded_p) ++{ ++ append_name (instance.base_name); ++ if (overloaded_p) ++ switch (instance.displacement_units ()) ++ { ++ case UNITS_none: ++ break; ++ ++ case UNITS_bytes: ++ append_name ("_offset"); ++ break; ++ ++ case UNITS_elements: ++ append_name ("_index"); ++ break; ++ ++ case UNITS_vectors: ++ append_name ("_vnum"); ++ break; ++ } ++ else ++ append_name (instance.mode_suffix ().string); ++ for (unsigned int i = 0; i < 2; ++i) ++ if (!overloaded_p || instance.shape->explicit_type_suffix_p (i)) ++ append_name (instance.type_suffix (i).string); ++ append_name (pred_suffixes[instance.pred]); ++ return finish_name (); ++} ++ ++/* Add attribute NAME to ATTRS. */ ++static tree ++add_attribute (const char *name, tree attrs) ++{ ++ return tree_cons (get_identifier (name), NULL_TREE, attrs); ++} ++ ++/* Return the appropriate function attributes for INSTANCE. 
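get_name above assembles both spellings of a function from the same parts: the full name spells out every suffix, while the overloaded alias keeps only what the resolver cannot infer. A sketch in the svfoo notation used by the comments above (the strings are placeholders, not real intrinsic names):

#include <iostream>
#include <string>

int main ()
{
  std::string base = "svfoo", mode = "_index", type0 = "_u32", pred = "_m";

  /* Full (unique) name: mode suffix, type suffix and predication suffix
     are all spelled out.  */
  std::string full_name = base + mode + type0 + pred;

  /* Overloaded alias: suffixes that overload resolution can infer from
     the argument types (here the type suffix) are dropped.  */
  std::string overloaded_name = base + mode + pred;

  std::cout << full_name << '\n' << overloaded_name << '\n';
  /* Prints:
     svfoo_index_u32_m
     svfoo_index_m  */
}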
*/ ++tree ++function_builder::get_attributes (const function_instance &instance) ++{ ++ tree attrs = NULL_TREE; ++ ++ if (!instance.modifies_global_state_p ()) ++ { ++ if (instance.reads_global_state_p ()) ++ attrs = add_attribute ("pure", attrs); ++ else ++ attrs = add_attribute ("const", attrs); ++ } ++ ++ if (!flag_non_call_exceptions || !instance.could_trap_p ()) ++ attrs = add_attribute ("nothrow", attrs); ++ ++ return add_attribute ("leaf", attrs); ++} ++ ++/* Add a function called NAME with type FNTYPE and attributes ATTRS. ++ INSTANCE describes what the function does and OVERLOADED_P indicates ++ whether it is overloaded. REQUIRED_EXTENSIONS are the set of ++ architecture extensions that the function requires. */ ++registered_function & ++function_builder::add_function (const function_instance &instance, ++ const char *name, tree fntype, tree attrs, ++ uint64_t required_extensions, ++ bool overloaded_p) ++{ ++ unsigned int code = vec_safe_length (registered_functions); ++ code = (code << AARCH64_BUILTIN_SHIFT) | AARCH64_BUILTIN_SVE; ++ tree decl = simulate_builtin_function_decl (input_location, name, fntype, ++ code, NULL, attrs); ++ ++ registered_function &rfn = *ggc_alloc (); ++ rfn.instance = instance; ++ rfn.decl = decl; ++ rfn.required_extensions = required_extensions; ++ rfn.overloaded_p = overloaded_p; ++ vec_safe_push (registered_functions, &rfn); ++ ++ return rfn; ++} ++ ++/* Add a built-in function for INSTANCE, with the argument types given ++ by ARGUMENT_TYPES and the return type given by RETURN_TYPE. ++ REQUIRED_EXTENSIONS are the set of architecture extensions that the ++ function requires. FORCE_DIRECT_OVERLOADS is true if there is a ++ one-to-one mapping between "short" and "full" names, and if standard ++ overload resolution therefore isn't necessary. */ ++void ++function_builder::add_unique_function (const function_instance &instance, ++ tree return_type, ++ vec &argument_types, ++ uint64_t required_extensions, ++ bool force_direct_overloads) ++{ ++ /* Add the function under its full (unique) name. */ ++ char *name = get_name (instance, false); ++ tree fntype = build_function_type_array (return_type, ++ argument_types.length (), ++ argument_types.address ()); ++ tree attrs = get_attributes (instance); ++ registered_function &rfn = add_function (instance, name, fntype, attrs, ++ required_extensions, false); ++ ++ /* Enter the function into the hash table. */ ++ hashval_t hash = instance.hash (); ++ registered_function **rfn_slot ++ = function_table->find_slot_with_hash (instance, hash, INSERT); ++ gcc_assert (!*rfn_slot); ++ *rfn_slot = &rfn; ++ ++ /* Also add the function under its overloaded alias, if we want ++ a separate decl for each instance of an overloaded function. */ ++ if (m_direct_overloads || force_direct_overloads) ++ { ++ char *overload_name = get_name (instance, true); ++ if (strcmp (name, overload_name) != 0) ++ { ++ /* Attribute lists shouldn't be shared. */ ++ tree attrs = get_attributes (instance); ++ add_function (instance, overload_name, fntype, attrs, ++ required_extensions, false); ++ } ++ } ++ ++ obstack_free (&m_string_obstack, name); ++} ++ ++/* Add one function decl for INSTANCE, to be used with manual overload ++ resolution. REQUIRED_EXTENSIONS are the set of architecture extensions ++ that the function requires. ++ ++ For simplicity, deal with duplicate attempts to add the same function, ++ including cases in which the new function requires more features than ++ the original one did. 
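get_attributes above turns the call-property bits into standard function attributes: "const" when the call touches no global state, "pure" when it only reads it, plus "nothrow" when the call cannot trap, and always "leaf". A standalone restatement of that decision (illustrative only, not part of the patch):

#include <cstdio>
#include <string>
#include <vector>

static std::vector<std::string>
pick_attributes (bool reads_global, bool writes_global,
                 bool can_trap, bool non_call_exceptions)
{
  std::vector<std::string> attrs;
  if (!writes_global)
    attrs.push_back (reads_global ? "pure" : "const");
  if (!non_call_exceptions || !can_trap)
    attrs.push_back ("nothrow");
  attrs.push_back ("leaf");
  return attrs;
}

int main ()
{
  for (const std::string &a : pick_attributes (false, false, false, false))
    std::printf ("%s\n", a.c_str ());   /* const, nothrow, leaf */
}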
++   In that case we'll check whether the required
++   features are available as part of resolving the function to the
++   relevant unique function.  */
++void
++function_builder::add_overloaded_function (const function_instance &instance,
++                                           uint64_t required_extensions)
++{
++  char *name = get_name (instance, true);
++  if (registered_function **map_value = m_overload_names.get (name))
++    gcc_assert ((*map_value)->instance == instance
++                && ((*map_value)->required_extensions
++                    & ~required_extensions) == 0);
++  else
++    {
++      registered_function &rfn
++        = add_function (instance, name, m_overload_type, NULL_TREE,
++                        required_extensions, true);
++      const char *permanent_name = IDENTIFIER_POINTER (DECL_NAME (rfn.decl));
++      m_overload_names.put (permanent_name, &rfn);
++    }
++  obstack_free (&m_string_obstack, name);
++}
++
++/* If we are using manual overload resolution, add one function decl
++   for each overloaded function in GROUP.  Take the function base name
++   from GROUP and the mode from MODE.  */
++void
++function_builder::add_overloaded_functions (const function_group_info &group,
++                                            mode_suffix_index mode)
++{
++  if (m_direct_overloads)
++    return;
++
++  unsigned int explicit_type0 = (*group.shape)->explicit_type_suffix_p (0);
++  unsigned int explicit_type1 = (*group.shape)->explicit_type_suffix_p (1);
++  for (unsigned int pi = 0; group.preds[pi] != NUM_PREDS; ++pi)
++    {
++      if (!explicit_type0 && !explicit_type1)
++        {
++          /* Deal with the common case in which there is one overloaded
++             function for all type combinations.  */
++          function_instance instance (group.base_name, *group.base,
++                                      *group.shape, mode, types_none[0],
++                                      group.preds[pi]);
++          add_overloaded_function (instance, group.required_extensions);
++        }
++      else
++        for (unsigned int ti = 0; group.types[ti][0] != NUM_TYPE_SUFFIXES;
++             ++ti)
++          {
++            /* Stub out the types that are determined by overload
++               resolution.  */
++            type_suffix_pair types = {
++              explicit_type0 ? group.types[ti][0] : NUM_TYPE_SUFFIXES,
++              explicit_type1 ? group.types[ti][1] : NUM_TYPE_SUFFIXES
++            };
++            function_instance instance (group.base_name, *group.base,
++                                        *group.shape, mode, types,
++                                        group.preds[pi]);
++            add_overloaded_function (instance, group.required_extensions);
++          }
++    }
++}
++
++/* Register all the functions in GROUP.  */
++void
++function_builder::register_function_group (const function_group_info &group)
++{
++  (*group.shape)->build (*this, group);
++}
++
++function_call_info::function_call_info (location_t location_in,
++                                        const function_instance &instance_in,
++                                        tree fndecl_in)
++  : function_instance (instance_in), location (location_in), fndecl (fndecl_in)
++{
++}
++
++function_resolver::function_resolver (location_t location,
++                                      const function_instance &instance,
++                                      tree fndecl, vec<tree, va_gc> &arglist)
++  : function_call_info (location, instance, fndecl), m_arglist (arglist)
++{
++}
++
++/* Return the vector type associated with type suffix TYPE.  */
++tree
++function_resolver::get_vector_type (type_suffix_index type)
++{
++  return acle_vector_types[0][type_suffixes[type].vector_type];
++}
++
++/* Return the <stdint.h> name associated with TYPE.  Using the <stdint.h>
++   name should be more user-friendly than the underlying canonical type,
++   since it makes the signedness and bitwidth explicit.  */
++const char *
++function_resolver::get_scalar_type_name (type_suffix_index type)
++{
++  return vector_types[type_suffixes[type].vector_type].acle_name + 2;
++}
++
++/* Return the type of argument I, or error_mark_node if it isn't
++   well-formed.
*/ ++tree ++function_resolver::get_argument_type (unsigned int i) ++{ ++ tree arg = m_arglist[i]; ++ return arg == error_mark_node ? arg : TREE_TYPE (arg); ++} ++ ++/* Return true if argument I is some form of scalar value. */ ++bool ++function_resolver::scalar_argument_p (unsigned int i) ++{ ++ tree type = get_argument_type (i); ++ return (INTEGRAL_TYPE_P (type) ++ /* Allow pointer types, leaving the frontend to warn where ++ necessary. */ ++ || POINTER_TYPE_P (type) ++ || SCALAR_FLOAT_TYPE_P (type)); ++} ++ ++/* Report that the function has no form that takes type suffix TYPE. ++ Return error_mark_node. */ ++tree ++function_resolver::report_no_such_form (type_suffix_index type) ++{ ++ error_at (location, "%qE has no form that takes %qT arguments", ++ fndecl, get_vector_type (type)); ++ return error_mark_node; ++} ++ ++/* Silently check whether there is an instance of the function with the ++ mode suffix given by MODE and the type suffixes given by TYPE0 and TYPE1. ++ Return its function decl if so, otherwise return null. */ ++tree ++function_resolver::lookup_form (mode_suffix_index mode, ++ type_suffix_index type0, ++ type_suffix_index type1) ++{ ++ type_suffix_pair types = { type0, type1 }; ++ function_instance instance (base_name, base, shape, mode, types, pred); ++ registered_function *rfn ++ = function_table->find_with_hash (instance, instance.hash ()); ++ return rfn ? rfn->decl : NULL_TREE; ++} ++ ++/* Resolve the function to one with the mode suffix given by MODE and the ++ type suffixes given by TYPE0 and TYPE1. Return its function decl on ++ success, otherwise report an error and return error_mark_node. */ ++tree ++function_resolver::resolve_to (mode_suffix_index mode, ++ type_suffix_index type0, ++ type_suffix_index type1) ++{ ++ tree res = lookup_form (mode, type0, type1); ++ if (!res) ++ { ++ if (type1 == NUM_TYPE_SUFFIXES) ++ return report_no_such_form (type0); ++ if (type0 == type_suffix_ids[0]) ++ return report_no_such_form (type1); ++ /* To be filled in when we have other cases. */ ++ gcc_unreachable (); ++ } ++ return res; ++} ++ ++/* Require argument ARGNO to be a 32-bit or 64-bit scalar integer type. ++ Return the associated type suffix on success, otherwise report an ++ error and return NUM_TYPE_SUFFIXES. */ ++type_suffix_index ++function_resolver::infer_integer_scalar_type (unsigned int argno) ++{ ++ tree actual = get_argument_type (argno); ++ if (actual == error_mark_node) ++ return NUM_TYPE_SUFFIXES; ++ ++ /* Allow enums and booleans to decay to integers, for compatibility ++ with C++ overloading rules. */ ++ if (INTEGRAL_TYPE_P (actual)) ++ { ++ bool uns_p = TYPE_UNSIGNED (actual); ++ /* Honor the usual integer promotions, so that resolution works ++ in the same way as for C++. */ ++ if (TYPE_PRECISION (actual) < 32) ++ return TYPE_SUFFIX_s32; ++ if (TYPE_PRECISION (actual) == 32) ++ return uns_p ? TYPE_SUFFIX_u32 : TYPE_SUFFIX_s32; ++ if (TYPE_PRECISION (actual) == 64) ++ return uns_p ? TYPE_SUFFIX_u64 : TYPE_SUFFIX_s64; ++ } ++ ++ error_at (location, "passing %qT to argument %d of %qE, which expects" ++ " a 32-bit or 64-bit integer type", actual, argno + 1, fndecl); ++ return NUM_TYPE_SUFFIXES; ++} ++ ++/* Require argument ARGNO to be a pointer to a scalar type that has a ++ corresponding type suffix. Return that type suffix on success, ++ otherwise report an error and return NUM_TYPE_SUFFIXES. ++ GATHER_SCATTER_P is true if the function is a gather/scatter ++ operation, and so requires a pointer to 32-bit or 64-bit data. 
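++   For example (using <arm_sve.h> names purely as an illustration),
++   in a call such as
++
++     svint32_t load_it (svbool_t pg, const int32_t *ptr)
++     {
++       return svld1 (pg, ptr);
++     }
++
++   the int32_t target type selects the _s32 suffix, whereas a pointer
++   to a type with no corresponding SVE element type is rejected.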
*/ ++type_suffix_index ++function_resolver::infer_pointer_type (unsigned int argno, ++ bool gather_scatter_p) ++{ ++ tree actual = get_argument_type (argno); ++ if (actual == error_mark_node) ++ return NUM_TYPE_SUFFIXES; ++ ++ if (TREE_CODE (actual) != POINTER_TYPE) ++ { ++ error_at (location, "passing %qT to argument %d of %qE, which" ++ " expects a pointer type", actual, argno + 1, fndecl); ++ if (VECTOR_TYPE_P (actual) && gather_scatter_p) ++ inform (location, "an explicit type suffix is needed" ++ " when using a vector of base addresses"); ++ return NUM_TYPE_SUFFIXES; ++ } ++ ++ tree target = TREE_TYPE (actual); ++ type_suffix_index type = find_type_suffix_for_scalar_type (target); ++ if (type == NUM_TYPE_SUFFIXES) ++ { ++ error_at (location, "passing %qT to argument %d of %qE, but %qT is not" ++ " a valid SVE element type", actual, argno + 1, fndecl, ++ build_qualified_type (target, 0)); ++ return NUM_TYPE_SUFFIXES; ++ } ++ unsigned int bits = type_suffixes[type].element_bits; ++ if (gather_scatter_p && bits != 32 && bits != 64) ++ { ++ error_at (location, "passing %qT to argument %d of %qE, which" ++ " expects a pointer to 32-bit or 64-bit elements", ++ actual, argno + 1, fndecl); ++ return NUM_TYPE_SUFFIXES; ++ } ++ ++ return type; ++} ++ ++/* Require argument ARGNO to be a single vector or a tuple of NUM_VECTORS ++ vectors; NUM_VECTORS is 1 for the former. Return the associated type ++ suffix on success, using TYPE_SUFFIX_b for predicates. Report an error ++ and return NUM_TYPE_SUFFIXES on failure. */ ++type_suffix_index ++function_resolver::infer_vector_or_tuple_type (unsigned int argno, ++ unsigned int num_vectors) ++{ ++ tree actual = get_argument_type (argno); ++ if (actual == error_mark_node) ++ return NUM_TYPE_SUFFIXES; ++ ++ /* A linear search should be OK here, since the code isn't hot and ++ the number of types is only small. */ ++ for (unsigned int size_i = 0; size_i < MAX_TUPLE_SIZE; ++size_i) ++ for (unsigned int suffix_i = 0; suffix_i < NUM_TYPE_SUFFIXES; ++suffix_i) ++ { ++ vector_type_index type_i = type_suffixes[suffix_i].vector_type; ++ tree type = acle_vector_types[size_i][type_i]; ++ if (type && TYPE_MAIN_VARIANT (actual) == TYPE_MAIN_VARIANT (type)) ++ { ++ if (size_i + 1 == num_vectors) ++ return type_suffix_index (suffix_i); ++ ++ if (num_vectors == 1) ++ error_at (location, "passing %qT to argument %d of %qE, which" ++ " expects a single SVE vector rather than a tuple", ++ actual, argno + 1, fndecl); ++ else if (size_i == 0 && type_i != VECTOR_TYPE_svbool_t) ++ error_at (location, "passing single vector %qT to argument %d" ++ " of %qE, which expects a tuple of %d vectors", ++ actual, argno + 1, fndecl, num_vectors); ++ else ++ error_at (location, "passing %qT to argument %d of %qE, which" ++ " expects a tuple of %d vectors", actual, argno + 1, ++ fndecl, num_vectors); ++ return NUM_TYPE_SUFFIXES; ++ } ++ } ++ ++ if (num_vectors == 1) ++ error_at (location, "passing %qT to argument %d of %qE, which" ++ " expects an SVE vector type", actual, argno + 1, fndecl); ++ else ++ error_at (location, "passing %qT to argument %d of %qE, which" ++ " expects an SVE tuple type", actual, argno + 1, fndecl); ++ return NUM_TYPE_SUFFIXES; ++} ++ ++/* Require argument ARGNO to have some form of vector type. Return the ++ associated type suffix on success, using TYPE_SUFFIX_b for predicates. ++ Report an error and return NUM_TYPE_SUFFIXES on failure. 
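++   For example (illustration only), in an overloaded call such as
++   svabs_x (pg, x), an svint32_t X yields TYPE_SUFFIX_s32 and the
++   call resolves to svabs_s32_x.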
*/ ++type_suffix_index ++function_resolver::infer_vector_type (unsigned int argno) ++{ ++ return infer_vector_or_tuple_type (argno, 1); ++} ++ ++/* Like infer_vector_type, but also require the type to be integral. */ ++type_suffix_index ++function_resolver::infer_integer_vector_type (unsigned int argno) ++{ ++ type_suffix_index type = infer_vector_type (argno); ++ if (type == NUM_TYPE_SUFFIXES) ++ return type; ++ ++ if (!type_suffixes[type].integer_p) ++ { ++ error_at (location, "passing %qT to argument %d of %qE, which" ++ " expects a vector of integers", get_argument_type (argno), ++ argno + 1, fndecl); ++ return NUM_TYPE_SUFFIXES; ++ } ++ ++ return type; ++} ++ ++/* Like infer_vector_type, but also require the type to be an unsigned ++ integer. */ ++type_suffix_index ++function_resolver::infer_unsigned_vector_type (unsigned int argno) ++{ ++ type_suffix_index type = infer_vector_type (argno); ++ if (type == NUM_TYPE_SUFFIXES) ++ return type; ++ ++ if (!type_suffixes[type].unsigned_p) ++ { ++ error_at (location, "passing %qT to argument %d of %qE, which" ++ " expects a vector of unsigned integers", ++ get_argument_type (argno), argno + 1, fndecl); ++ return NUM_TYPE_SUFFIXES; ++ } ++ ++ return type; ++} ++ ++/* Like infer_vector_type, but also require the element size to be ++ 32 or 64 bits. */ ++type_suffix_index ++function_resolver::infer_sd_vector_type (unsigned int argno) ++{ ++ type_suffix_index type = infer_vector_type (argno); ++ if (type == NUM_TYPE_SUFFIXES) ++ return type; ++ ++ unsigned int bits = type_suffixes[type].element_bits; ++ if (bits != 32 && bits != 64) ++ { ++ error_at (location, "passing %qT to argument %d of %qE, which" ++ " expects a vector of 32-bit or 64-bit elements", ++ get_argument_type (argno), argno + 1, fndecl); ++ return NUM_TYPE_SUFFIXES; ++ } ++ ++ return type; ++} ++ ++/* If the function operates on tuples of vectors, require argument ARGNO to be ++ a tuple with the appropriate number of vectors, otherwise require it to be ++ a single vector. Return the associated type suffix on success, using ++ TYPE_SUFFIX_b for predicates. Report an error and return NUM_TYPE_SUFFIXES ++ on failure. */ ++type_suffix_index ++function_resolver::infer_tuple_type (unsigned int argno) ++{ ++ return infer_vector_or_tuple_type (argno, vectors_per_tuple ()); ++} ++ ++/* Require argument ARGNO to be a vector or scalar argument. Return true ++ if it is, otherwise report an appropriate error. */ ++bool ++function_resolver::require_vector_or_scalar_type (unsigned int argno) ++{ ++ tree actual = get_argument_type (argno); ++ if (actual == error_mark_node) ++ return false; ++ ++ if (!scalar_argument_p (argno) && !VECTOR_TYPE_P (actual)) ++ { ++ error_at (location, "passing %qT to argument %d of %qE, which" ++ " expects a vector or scalar type", actual, argno + 1, fndecl); ++ return false; ++ } ++ ++ return true; ++} ++ ++/* Require argument ARGNO to have vector type TYPE, in cases where this ++ requirement holds for all uses of the function. Return true if the ++ argument has the right form, otherwise report an appropriate error. 
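++   For example, every predicated form expects its governing predicate
++   argument to be svbool_t regardless of the data type, so that check
++   passes a fixed TYPE of VECTOR_TYPE_svbool_t rather than an inferred
++   suffix.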
*/ ++bool ++function_resolver::require_vector_type (unsigned int argno, ++ vector_type_index type) ++{ ++ tree expected = acle_vector_types[0][type]; ++ tree actual = get_argument_type (argno); ++ if (actual != error_mark_node ++ && TYPE_MAIN_VARIANT (expected) != TYPE_MAIN_VARIANT (actual)) ++ { ++ error_at (location, "passing %qT to argument %d of %qE, which" ++ " expects %qT", actual, argno + 1, fndecl, expected); ++ return false; ++ } ++ return true; ++} ++ ++/* Like require_vector_type, but TYPE is inferred from previous arguments ++ rather than being a fixed part of the function signature. This changes ++ the nature of the error messages. */ ++bool ++function_resolver::require_matching_vector_type (unsigned int argno, ++ type_suffix_index type) ++{ ++ type_suffix_index new_type = infer_vector_type (argno); ++ if (new_type == NUM_TYPE_SUFFIXES) ++ return false; ++ ++ if (type != new_type) ++ { ++ error_at (location, "passing %qT to argument %d of %qE, but" ++ " previous arguments had type %qT", ++ get_vector_type (new_type), argno + 1, fndecl, ++ get_vector_type (type)); ++ return false; ++ } ++ return true; ++} ++ ++/* Require argument ARGNO to be a vector type with the following properties: ++ ++ - the type class must be the same as FIRST_TYPE's if EXPECTED_TCLASS ++ is SAME_TYPE_CLASS, otherwise it must be EXPECTED_TCLASS itself. ++ ++ - the element size must be: ++ ++ - the same as FIRST_TYPE's if EXPECTED_BITS == SAME_SIZE ++ - half of FIRST_TYPE's if EXPECTED_BITS == HALF_SIZE ++ - a quarter of FIRST_TYPE's if EXPECTED_BITS == QUARTER_SIZE ++ - EXPECTED_BITS itself otherwise ++ ++ Return true if the argument has the required type, otherwise report ++ an appropriate error. ++ ++ FIRST_ARGNO is the first argument that is known to have type FIRST_TYPE. ++ Usually it comes before ARGNO, but sometimes it is more natural to resolve ++ arguments out of order. ++ ++ If the required properties depend on FIRST_TYPE then both FIRST_ARGNO and ++ ARGNO contribute to the resolution process. If the required properties ++ are fixed, only FIRST_ARGNO contributes to the resolution process. ++ ++ This function is a bit of a Swiss army knife. The complication comes ++ from trying to give good error messages when FIRST_ARGNO and ARGNO are ++ inconsistent, since either of them might be wrong. */ ++bool function_resolver:: ++require_derived_vector_type (unsigned int argno, ++ unsigned int first_argno, ++ type_suffix_index first_type, ++ type_class_index expected_tclass, ++ unsigned int expected_bits) ++{ ++ /* If the type needs to match FIRST_ARGNO exactly, use the preferred ++ error message for that case. The VECTOR_TYPE_P test excludes tuple ++ types, which we handle below instead. */ ++ bool both_vectors_p = VECTOR_TYPE_P (get_argument_type (first_argno)); ++ if (both_vectors_p ++ && expected_tclass == SAME_TYPE_CLASS ++ && expected_bits == SAME_SIZE) ++ { ++ /* There's no need to resolve this case out of order. */ ++ gcc_assert (argno > first_argno); ++ return require_matching_vector_type (argno, first_type); ++ } ++ ++ /* Use FIRST_TYPE to get the expected type class and element size. 
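++     For example, if FIRST_TYPE is s32, SAME_TYPE_CLASS keeps the
++     signed class and HALF_SIZE gives 16-bit elements, so the derived
++     suffix would be s16.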
*/ ++ type_class_index orig_expected_tclass = expected_tclass; ++ if (expected_tclass == NUM_TYPE_CLASSES) ++ expected_tclass = type_suffixes[first_type].tclass; ++ ++ unsigned int orig_expected_bits = expected_bits; ++ if (expected_bits == SAME_SIZE) ++ expected_bits = type_suffixes[first_type].element_bits; ++ else if (expected_bits == HALF_SIZE) ++ expected_bits = type_suffixes[first_type].element_bits / 2; ++ else if (expected_bits == QUARTER_SIZE) ++ expected_bits = type_suffixes[first_type].element_bits / 4; ++ ++ /* If the expected type doesn't depend on FIRST_TYPE at all, ++ just check for the fixed choice of vector type. */ ++ if (expected_tclass == orig_expected_tclass ++ && expected_bits == orig_expected_bits) ++ { ++ const type_suffix_info &expected_suffix ++ = type_suffixes[find_type_suffix (expected_tclass, expected_bits)]; ++ return require_vector_type (argno, expected_suffix.vector_type); ++ } ++ ++ /* Require the argument to be some form of SVE vector type, ++ without being specific about the type of vector we want. */ ++ type_suffix_index actual_type = infer_vector_type (argno); ++ if (actual_type == NUM_TYPE_SUFFIXES) ++ return false; ++ ++ /* Exit now if we got the right type. */ ++ bool tclass_ok_p = (type_suffixes[actual_type].tclass == expected_tclass); ++ bool size_ok_p = (type_suffixes[actual_type].element_bits == expected_bits); ++ if (tclass_ok_p && size_ok_p) ++ return true; ++ ++ /* First look for cases in which the actual type contravenes a fixed ++ size requirement, without having to refer to FIRST_TYPE. */ ++ if (!size_ok_p && expected_bits == orig_expected_bits) ++ { ++ error_at (location, "passing %qT to argument %d of %qE, which" ++ " expects a vector of %d-bit elements", ++ get_vector_type (actual_type), argno + 1, fndecl, ++ expected_bits); ++ return false; ++ } ++ ++ /* Likewise for a fixed type class requirement. This is only ever ++ needed for signed and unsigned types, so don't create unnecessary ++ translation work for other type classes. */ ++ if (!tclass_ok_p && orig_expected_tclass == TYPE_signed) ++ { ++ error_at (location, "passing %qT to argument %d of %qE, which" ++ " expects a vector of signed integers", ++ get_vector_type (actual_type), argno + 1, fndecl); ++ return false; ++ } ++ if (!tclass_ok_p && orig_expected_tclass == TYPE_unsigned) ++ { ++ error_at (location, "passing %qT to argument %d of %qE, which" ++ " expects a vector of unsigned integers", ++ get_vector_type (actual_type), argno + 1, fndecl); ++ return false; ++ } ++ ++ /* Make sure that FIRST_TYPE itself is sensible before using it ++ as a basis for an error message. */ ++ if (resolve_to (mode_suffix_id, first_type) == error_mark_node) ++ return false; ++ ++ /* If the arguments have consistent type classes, but a link between ++ the sizes has been broken, try to describe the error in those terms. */ ++ if (both_vectors_p && tclass_ok_p && orig_expected_bits == SAME_SIZE) ++ { ++ if (argno < first_argno) ++ { ++ std::swap (argno, first_argno); ++ std::swap (actual_type, first_type); ++ } ++ error_at (location, "arguments %d and %d of %qE must have the" ++ " same element size, but the values passed here have type" ++ " %qT and %qT respectively", first_argno + 1, argno + 1, ++ fndecl, get_vector_type (first_type), ++ get_vector_type (actual_type)); ++ return false; ++ } ++ ++ /* Likewise in reverse: look for cases in which the sizes are consistent ++ but a link between the type classes has been broken. 
*/ ++ if (both_vectors_p ++ && size_ok_p ++ && orig_expected_tclass == SAME_TYPE_CLASS ++ && type_suffixes[first_type].integer_p ++ && type_suffixes[actual_type].integer_p) ++ { ++ if (argno < first_argno) ++ { ++ std::swap (argno, first_argno); ++ std::swap (actual_type, first_type); ++ } ++ error_at (location, "arguments %d and %d of %qE must have the" ++ " same signedness, but the values passed here have type" ++ " %qT and %qT respectively", first_argno + 1, argno + 1, ++ fndecl, get_vector_type (first_type), ++ get_vector_type (actual_type)); ++ return false; ++ } ++ ++ /* The two arguments are wildly inconsistent. */ ++ type_suffix_index expected_type ++ = find_type_suffix (expected_tclass, expected_bits); ++ error_at (location, "passing %qT instead of the expected %qT to argument" ++ " %d of %qE, after passing %qT to argument %d", ++ get_vector_type (actual_type), get_vector_type (expected_type), ++ argno + 1, fndecl, get_argument_type (first_argno), ++ first_argno + 1); ++ return false; ++} ++ ++/* Require argument ARGNO to match argument FIRST_ARGNO, which was inferred ++ to be a pointer to a scalar element of type TYPE. */ ++bool ++function_resolver::require_matching_pointer_type (unsigned int argno, ++ unsigned int first_argno, ++ type_suffix_index type) ++{ ++ type_suffix_index new_type = infer_pointer_type (argno); ++ if (new_type == NUM_TYPE_SUFFIXES) ++ return false; ++ ++ if (type != new_type) ++ { ++ error_at (location, "passing %qT to argument %d of %qE, but" ++ " argument %d had type %qT", get_argument_type (argno), ++ argno + 1, fndecl, first_argno + 1, ++ get_argument_type (first_argno)); ++ return false; ++ } ++ return true; ++} ++ ++/* Require argument ARGNO to be a (possibly variable) scalar, using EXPECTED ++ as the name of its expected type. Return true if the argument has the ++ right form, otherwise report an appropriate error. */ ++bool ++function_resolver::require_scalar_type (unsigned int argno, ++ const char *expected) ++{ ++ if (!scalar_argument_p (argno)) ++ { ++ error_at (location, "passing %qT to argument %d of %qE, which" ++ " expects %qs", get_argument_type (argno), argno + 1, ++ fndecl, expected); ++ return false; ++ } ++ return true; ++} ++ ++/* Require argument ARGNO to be some form of pointer, without being specific ++ about its target type. Return true if the argument has the right form, ++ otherwise report an appropriate error. */ ++bool ++function_resolver::require_pointer_type (unsigned int argno) ++{ ++ if (!scalar_argument_p (argno)) ++ { ++ error_at (location, "passing %qT to argument %d of %qE, which" ++ " expects a scalar pointer", get_argument_type (argno), ++ argno + 1, fndecl); ++ return false; ++ } ++ return true; ++} ++ ++/* Argument FIRST_ARGNO is a scalar with type EXPECTED_TYPE, and argument ++ ARGNO should be consistent with it. Return true if it is, otherwise ++ report an appropriate error. 
*/ ++bool function_resolver:: ++require_matching_integer_scalar_type (unsigned int argno, ++ unsigned int first_argno, ++ type_suffix_index expected_type) ++{ ++ type_suffix_index actual_type = infer_integer_scalar_type (argno); ++ if (actual_type == NUM_TYPE_SUFFIXES) ++ return false; ++ ++ if (actual_type == expected_type) ++ return true; ++ ++ error_at (location, "call to %qE is ambiguous; argument %d has type" ++ " %qs but argument %d has type %qs", fndecl, ++ first_argno + 1, get_scalar_type_name (expected_type), ++ argno + 1, get_scalar_type_name (actual_type)); ++ return false; ++} ++ ++/* Require argument ARGNO to be a (possibly variable) scalar, expecting it ++ to have the following properties: ++ ++ - the type class must be the same as for type suffix 0 if EXPECTED_TCLASS ++ is SAME_TYPE_CLASS, otherwise it must be EXPECTED_TCLASS itself. ++ ++ - the element size must be the same as for type suffix 0 if EXPECTED_BITS ++ is SAME_TYPE_SIZE, otherwise it must be EXPECTED_BITS itself. ++ ++ Return true if the argument is valid, otherwise report an appropriate error. ++ ++ Note that we don't check whether the scalar type actually has the required ++ properties, since that's subject to implicit promotions and conversions. ++ Instead we just use the expected properties to tune the error message. */ ++bool function_resolver:: ++require_derived_scalar_type (unsigned int argno, ++ type_class_index expected_tclass, ++ unsigned int expected_bits) ++{ ++ gcc_assert (expected_tclass == SAME_TYPE_CLASS ++ || expected_tclass == TYPE_signed ++ || expected_tclass == TYPE_unsigned); ++ ++ /* If the expected type doesn't depend on the type suffix at all, ++ just check for the fixed choice of scalar type. */ ++ if (expected_tclass != SAME_TYPE_CLASS && expected_bits != SAME_SIZE) ++ { ++ type_suffix_index expected_type ++ = find_type_suffix (expected_tclass, expected_bits); ++ return require_scalar_type (argno, get_scalar_type_name (expected_type)); ++ } ++ ++ if (scalar_argument_p (argno)) ++ return true; ++ ++ if (expected_tclass == SAME_TYPE_CLASS) ++ /* It doesn't really matter whether the element is expected to be ++ the same size as type suffix 0. */ ++ error_at (location, "passing %qT to argument %d of %qE, which" ++ " expects a scalar element", get_argument_type (argno), ++ argno + 1, fndecl); ++ else ++ /* It doesn't seem useful to distinguish between signed and unsigned ++ scalars here. */ ++ error_at (location, "passing %qT to argument %d of %qE, which" ++ " expects a scalar integer", get_argument_type (argno), ++ argno + 1, fndecl); ++ return false; ++} ++ ++/* Require argument ARGNO to be suitable for an integer constant expression. ++ Return true if it is, otherwise report an appropriate error. ++ ++ function_checker checks whether the argument is actually constant and ++ has a suitable range. The reason for distinguishing immediate arguments ++ here is because it provides more consistent error messages than ++ require_scalar_type would. */ ++bool ++function_resolver::require_integer_immediate (unsigned int argno) ++{ ++ if (!scalar_argument_p (argno)) ++ { ++ report_non_ice (location, fndecl, argno); ++ return false; ++ } ++ return true; ++} ++ ++/* Require argument ARGNO to be a vector base in a gather-style address. ++ Return its type on success, otherwise return NUM_VECTOR_TYPES. 
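++   For example, an svuint64_t argument selects the unsigned 64-bit base
++   forms, whereas passing an svint64_t (or any other type) is rejected
++   with a message naming svuint32_t and svuint64_t as the valid choices.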
*/ ++vector_type_index ++function_resolver::infer_vector_base_type (unsigned int argno) ++{ ++ type_suffix_index type = infer_vector_type (argno); ++ if (type == NUM_TYPE_SUFFIXES) ++ return NUM_VECTOR_TYPES; ++ ++ if (type == TYPE_SUFFIX_u32 || type == TYPE_SUFFIX_u64) ++ return type_suffixes[type].vector_type; ++ ++ error_at (location, "passing %qT to argument %d of %qE, which" ++ " expects %qs or %qs", get_argument_type (argno), ++ argno + 1, fndecl, "svuint32_t", "svuint64_t"); ++ return NUM_VECTOR_TYPES; ++} ++ ++/* Require argument ARGNO to be a vector displacement in a gather-style ++ address. Return its type on success, otherwise return NUM_VECTOR_TYPES. */ ++vector_type_index ++function_resolver::infer_vector_displacement_type (unsigned int argno) ++{ ++ type_suffix_index type = infer_integer_vector_type (argno); ++ if (type == NUM_TYPE_SUFFIXES) ++ return NUM_VECTOR_TYPES; ++ ++ if (type_suffixes[type].integer_p ++ && (type_suffixes[type].element_bits == 32 ++ || type_suffixes[type].element_bits == 64)) ++ return type_suffixes[type].vector_type; ++ ++ error_at (location, "passing %qT to argument %d of %qE, which" ++ " expects a vector of 32-bit or 64-bit integers", ++ get_argument_type (argno), argno + 1, fndecl); ++ return NUM_VECTOR_TYPES; ++} ++ ++/* Require argument ARGNO to be a vector displacement in a gather-style ++ address. There are three possible uses: ++ ++ - for loading into elements of type TYPE (when LOAD_P is true) ++ - for storing from elements of type TYPE (when LOAD_P is false) ++ - for prefetching data (when TYPE is NUM_TYPE_SUFFIXES) ++ ++ The overloaded function's mode suffix determines the units of the ++ displacement (bytes for "_offset", elements for "_index"). ++ ++ Return the associated mode on success, otherwise report an error ++ and return MODE_none. */ ++mode_suffix_index ++function_resolver::resolve_sv_displacement (unsigned int argno, ++ type_suffix_index type, ++ bool load_p) ++{ ++ if (type == NUM_TYPE_SUFFIXES) ++ { ++ /* For prefetches, the base is a void pointer and the displacement ++ can be any valid offset or index type. */ ++ vector_type_index displacement_vector_type ++ = infer_vector_displacement_type (argno); ++ if (displacement_vector_type == NUM_VECTOR_TYPES) ++ return MODE_none; ++ ++ mode_suffix_index mode = find_mode_suffix (NUM_VECTOR_TYPES, ++ displacement_vector_type, ++ displacement_units ()); ++ gcc_assert (mode != MODE_none); ++ return mode; ++ } ++ ++ unsigned int required_bits = type_suffixes[type].element_bits; ++ if (required_bits == 32 ++ && displacement_units () == UNITS_elements ++ && !lookup_form (MODE_s32index, type) ++ && !lookup_form (MODE_u32index, type)) ++ { ++ if (lookup_form (MODE_u32base_index, type)) ++ { ++ if (type_suffix_ids[0] == NUM_TYPE_SUFFIXES) ++ { ++ gcc_assert (!load_p); ++ error_at (location, "when storing %qT, %qE requires a vector" ++ " base and a scalar index", get_vector_type (type), ++ fndecl); ++ } ++ else ++ error_at (location, "%qE requires a vector base and a scalar" ++ " index", fndecl); ++ } ++ else ++ error_at (location, "%qE does not support 32-bit vector type %qT", ++ fndecl, get_vector_type (type)); ++ return MODE_none; ++ } ++ ++ /* Check for some form of vector type, without naming any in particular ++ as being expected. */ ++ type_suffix_index displacement_type = infer_vector_type (argno); ++ if (displacement_type == NUM_TYPE_SUFFIXES) ++ return MODE_none; ++ ++ /* If the displacement type is consistent with the data vector type, ++ try to find the associated mode suffix. 
This will fall through ++ for non-integral displacement types. */ ++ if (type_suffixes[displacement_type].element_bits == required_bits) ++ { ++ vector_type_index displacement_vector_type ++ = type_suffixes[displacement_type].vector_type; ++ mode_suffix_index mode = find_mode_suffix (NUM_VECTOR_TYPES, ++ displacement_vector_type, ++ displacement_units ()); ++ if (mode != MODE_none) ++ { ++ if (mode == MODE_s32offset ++ && !lookup_form (mode, type) ++ && lookup_form (MODE_u32offset, type)) ++ { ++ if (type_suffix_ids[0] == NUM_TYPE_SUFFIXES) ++ error_at (location, "%qE does not support 32-bit sign-extended" ++ " offsets", fndecl); ++ else ++ error_at (location, "%qE does not support sign-extended" ++ " offsets", fndecl); ++ return MODE_none; ++ } ++ return mode; ++ } ++ } ++ ++ if (type_suffix_ids[0] == NUM_TYPE_SUFFIXES) ++ { ++ /* TYPE has been inferred rather than specified by the user, ++ so mention it in the error messages. */ ++ if (load_p) ++ error_at (location, "passing %qT to argument %d of %qE, which when" ++ " loading %qT expects a vector of %d-bit integers", ++ get_argument_type (argno), argno + 1, fndecl, ++ get_vector_type (type), required_bits); ++ else ++ error_at (location, "passing %qT to argument %d of %qE, which when" ++ " storing %qT expects a vector of %d-bit integers", ++ get_argument_type (argno), argno + 1, fndecl, ++ get_vector_type (type), required_bits); ++ } ++ else ++ /* TYPE is part of the function name. */ ++ error_at (location, "passing %qT to argument %d of %qE, which" ++ " expects a vector of %d-bit integers", ++ get_argument_type (argno), argno + 1, fndecl, required_bits); ++ return MODE_none; ++} ++ ++/* Require the arguments starting at ARGNO to form a gather-style address. ++ There are three possible uses: ++ ++ - for loading into elements of type TYPE (when LOAD_P is true) ++ - for storing from elements of type TYPE (when LOAD_P is false) ++ - for prefetching data (when TYPE is NUM_TYPE_SUFFIXES) ++ ++ The three possible addresses are: ++ ++ - a vector base with no displacement ++ - a vector base and a scalar displacement ++ - a scalar (pointer) base and a vector displacement ++ ++ The overloaded function's mode suffix determines whether there is ++ a displacement, and if so, what units it uses: ++ ++ - MODE_none: no displacement ++ - MODE_offset: the displacement is measured in bytes ++ - MODE_index: the displacement is measured in elements ++ ++ Return the mode of the non-overloaded function on success, otherwise ++ report an error and return MODE_none. */ ++mode_suffix_index ++function_resolver::resolve_gather_address (unsigned int argno, ++ type_suffix_index type, ++ bool load_p) ++{ ++ tree actual = get_argument_type (argno); ++ if (actual == error_mark_node) ++ return MODE_none; ++ ++ if (displacement_units () != UNITS_none) ++ { ++ /* Some form of displacement is needed. First handle a scalar ++ pointer base and a vector displacement. */ ++ if (scalar_argument_p (argno)) ++ /* Don't check the pointer type here, since there's only one valid ++ choice. Leave that to the frontend. */ ++ return resolve_sv_displacement (argno + 1, type, load_p); ++ ++ if (!VECTOR_TYPE_P (actual)) ++ { ++ error_at (location, "passing %qT to argument %d of %qE," ++ " which expects a vector or pointer base address", ++ actual, argno + 1, fndecl); ++ return MODE_none; ++ } ++ } ++ ++ /* Check for the correct choice of vector base type. 
*/ ++ vector_type_index base_vector_type; ++ if (type == NUM_TYPE_SUFFIXES) ++ { ++ /* Since prefetches have no type suffix, there is a free choice ++ between 32-bit and 64-bit base addresses. */ ++ base_vector_type = infer_vector_base_type (argno); ++ if (base_vector_type == NUM_VECTOR_TYPES) ++ return MODE_none; ++ } ++ else ++ { ++ /* Check for some form of vector type, without saying which type ++ we expect. */ ++ type_suffix_index base_type = infer_vector_type (argno); ++ if (base_type == NUM_TYPE_SUFFIXES) ++ return MODE_none; ++ ++ /* Check whether the type is the right one. */ ++ unsigned int required_bits = type_suffixes[type].element_bits; ++ gcc_assert (required_bits == 32 || required_bits == 64); ++ type_suffix_index required_type = (required_bits == 32 ++ ? TYPE_SUFFIX_u32 ++ : TYPE_SUFFIX_u64); ++ if (required_type != base_type) ++ { ++ error_at (location, "passing %qT to argument %d of %qE," ++ " which expects %qT", actual, argno + 1, fndecl, ++ get_vector_type (required_type)); ++ return MODE_none; ++ } ++ base_vector_type = type_suffixes[base_type].vector_type; ++ } ++ ++ /* Check the scalar displacement, if any. */ ++ if (displacement_units () != UNITS_none ++ && !require_scalar_type (argno + 1, "int64_t")) ++ return MODE_none; ++ ++ /* Find the appropriate mode suffix. The checks above should have ++ weeded out all erroneous cases. */ ++ for (unsigned int mode_i = 0; mode_i < ARRAY_SIZE (mode_suffixes); ++mode_i) ++ { ++ const mode_suffix_info &mode = mode_suffixes[mode_i]; ++ if (mode.base_vector_type == base_vector_type ++ && mode.displacement_vector_type == NUM_VECTOR_TYPES ++ && mode.displacement_units == displacement_units ()) ++ return mode_suffix_index (mode_i); ++ } ++ ++ gcc_unreachable (); ++} ++ ++/* Require arguments ARGNO and ARGNO + 1 to form an ADR-style address, ++ i.e. one with a vector of base addresses and a vector of displacements. ++ The overloaded function's mode suffix determines the units of the ++ displacement (bytes for "_offset", elements for "_index"). ++ ++ Return the associated mode suffix on success, otherwise report ++ an error and return MODE_none. */ ++mode_suffix_index ++function_resolver::resolve_adr_address (unsigned int argno) ++{ ++ vector_type_index base_type = infer_vector_base_type (argno); ++ if (base_type == NUM_VECTOR_TYPES) ++ return MODE_none; ++ ++ vector_type_index displacement_type ++ = infer_vector_displacement_type (argno + 1); ++ if (displacement_type == NUM_VECTOR_TYPES) ++ return MODE_none; ++ ++ mode_suffix_index mode = find_mode_suffix (base_type, displacement_type, ++ displacement_units ()); ++ if (mode == MODE_none) ++ { ++ if (mode_suffix_id == MODE_offset) ++ error_at (location, "cannot combine a base of type %qT with" ++ " an offset of type %qT", ++ get_argument_type (argno), get_argument_type (argno + 1)); ++ else ++ error_at (location, "cannot combine a base of type %qT with" ++ " an index of type %qT", ++ get_argument_type (argno), get_argument_type (argno + 1)); ++ } ++ return mode; ++} ++ ++/* Require the function to have exactly EXPECTED arguments. Return true ++ if it does, otherwise report an appropriate error. 
*/ ++bool ++function_resolver::check_num_arguments (unsigned int expected) ++{ ++ if (m_arglist.length () < expected) ++ error_at (location, "too few arguments to function %qE", fndecl); ++ else if (m_arglist.length () > expected) ++ error_at (location, "too many arguments to function %qE", fndecl); ++ return m_arglist.length () == expected; ++} ++ ++/* If the function is predicated, check that the first argument is a ++ suitable governing predicate. Also check that there are NOPS further ++ arguments after any governing predicate, but don't check what they are. ++ ++ Return true on success, otherwise report a suitable error. ++ When returning true: ++ ++ - set I to the number of the first unchecked argument. ++ - set NARGS to the total number of arguments. */ ++bool ++function_resolver::check_gp_argument (unsigned int nops, ++ unsigned int &i, unsigned int &nargs) ++{ ++ i = 0; ++ if (pred != PRED_none) ++ { ++ /* Unary merge operations should use resolve_unary instead. */ ++ gcc_assert (nops != 1 || pred != PRED_m); ++ nargs = nops + 1; ++ if (!check_num_arguments (nargs) ++ || !require_vector_type (i, VECTOR_TYPE_svbool_t)) ++ return false; ++ i += 1; ++ } ++ else ++ { ++ nargs = nops; ++ if (!check_num_arguments (nargs)) ++ return false; ++ } ++ ++ return true; ++} ++ ++/* Finish resolving a function whose final argument can be a vector ++ or a scalar, with the function having an implicit "_n" suffix ++ in the latter case. This "_n" form might only exist for certain ++ type suffixes. ++ ++ ARGNO is the index of the final argument. The inferred type suffix ++ was obtained from argument FIRST_ARGNO, which has type FIRST_TYPE. ++ EXPECTED_TCLASS and EXPECTED_BITS describe the expected properties ++ of the final vector or scalar argument, in the same way as for ++ require_derived_vector_type. INFERRED_TYPE is the inferred type ++ suffix itself, or NUM_TYPE_SUFFIXES if it's the same as FIRST_TYPE. ++ ++ Return the function decl of the resolved function on success, ++ otherwise report a suitable error and return error_mark_node. */ ++tree function_resolver:: ++finish_opt_n_resolution (unsigned int argno, unsigned int first_argno, ++ type_suffix_index first_type, ++ type_class_index expected_tclass, ++ unsigned int expected_bits, ++ type_suffix_index inferred_type) ++{ ++ if (inferred_type == NUM_TYPE_SUFFIXES) ++ inferred_type = first_type; ++ tree scalar_form = lookup_form (MODE_n, inferred_type); ++ ++ /* Allow the final argument to be scalar, if an _n form exists. */ ++ if (scalar_argument_p (argno)) ++ { ++ if (scalar_form) ++ return scalar_form; ++ ++ /* Check the vector form normally. If that succeeds, raise an ++ error about having no corresponding _n form. */ ++ tree res = resolve_to (mode_suffix_id, inferred_type); ++ if (res != error_mark_node) ++ error_at (location, "passing %qT to argument %d of %qE, but its" ++ " %qT form does not accept scalars", ++ get_argument_type (argno), argno + 1, fndecl, ++ get_vector_type (first_type)); ++ return error_mark_node; ++ } ++ ++ /* If an _n form does exist, provide a more accurate message than ++ require_derived_vector_type would for arguments that are neither ++ vectors nor scalars. */ ++ if (scalar_form && !require_vector_or_scalar_type (argno)) ++ return error_mark_node; ++ ++ /* Check for the correct vector type. 
*/ ++ if (!require_derived_vector_type (argno, first_argno, first_type, ++ expected_tclass, expected_bits)) ++ return error_mark_node; ++ ++ return resolve_to (mode_suffix_id, inferred_type); ++} ++ ++/* Resolve a (possibly predicated) unary function. If the function uses ++ merge predication or if TREAT_AS_MERGE_P is true, there is an extra ++ vector argument before the governing predicate that specifies the ++ values of inactive elements. This argument has the following ++ properties: ++ ++ - the type class must be the same as for active elements if MERGE_TCLASS ++ is SAME_TYPE_CLASS, otherwise it must be MERGE_TCLASS itself. ++ ++ - the element size must be the same as for active elements if MERGE_BITS ++ is SAME_TYPE_SIZE, otherwise it must be MERGE_BITS itself. ++ ++ Return the function decl of the resolved function on success, ++ otherwise report a suitable error and return error_mark_node. */ ++tree ++function_resolver::resolve_unary (type_class_index merge_tclass, ++ unsigned int merge_bits, ++ bool treat_as_merge_p) ++{ ++ type_suffix_index type; ++ if (pred == PRED_m || treat_as_merge_p) ++ { ++ if (!check_num_arguments (3)) ++ return error_mark_node; ++ if (merge_tclass == SAME_TYPE_CLASS && merge_bits == SAME_SIZE) ++ { ++ /* The inactive elements are the same as the active elements, ++ so we can use normal left-to-right resolution. */ ++ if ((type = infer_vector_type (0)) == NUM_TYPE_SUFFIXES ++ || !require_vector_type (1, VECTOR_TYPE_svbool_t) ++ || !require_matching_vector_type (2, type)) ++ return error_mark_node; ++ } ++ else ++ { ++ /* The inactive element type is a function of the active one, ++ so resolve the active one first. */ ++ if (!require_vector_type (1, VECTOR_TYPE_svbool_t) ++ || (type = infer_vector_type (2)) == NUM_TYPE_SUFFIXES ++ || !require_derived_vector_type (0, 2, type, merge_tclass, ++ merge_bits)) ++ return error_mark_node; ++ } ++ } ++ else ++ { ++ /* We just need to check the predicate (if any) and the single ++ vector argument. */ ++ unsigned int i, nargs; ++ if (!check_gp_argument (1, i, nargs) ++ || (type = infer_vector_type (i)) == NUM_TYPE_SUFFIXES) ++ return error_mark_node; ++ } ++ ++ /* Handle convert-like functions in which the first type suffix is ++ explicit. */ ++ if (type_suffix_ids[0] != NUM_TYPE_SUFFIXES) ++ return resolve_to (mode_suffix_id, type_suffix_ids[0], type); ++ ++ return resolve_to (mode_suffix_id, type); ++} ++ ++/* Resolve a (possibly predicated) function that takes NOPS like-typed ++ vector arguments followed by NIMM integer immediates. Return the ++ function decl of the resolved function on success, otherwise report ++ a suitable error and return error_mark_node. 
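++   For example (using <arm_sve.h> names purely as an illustration),
++   a call such as
++
++     svuint8_t f (svuint8_t x, svuint8_t y) { return svext (x, y, 3); }
++
++   would be handled with NOPS == 2 and NIMM == 1: both vectors must
++   have the same type and the final argument must be an integer
++   immediate.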
*/ ++tree ++function_resolver::resolve_uniform (unsigned int nops, unsigned int nimm) ++{ ++ unsigned int i, nargs; ++ type_suffix_index type; ++ if (!check_gp_argument (nops + nimm, i, nargs) ++ || (type = infer_vector_type (i)) == NUM_TYPE_SUFFIXES) ++ return error_mark_node; ++ ++ i += 1; ++ for (; i < nargs - nimm; ++i) ++ if (!require_matching_vector_type (i, type)) ++ return error_mark_node; ++ ++ for (; i < nargs; ++i) ++ if (!require_integer_immediate (i)) ++ return error_mark_node; ++ ++ return resolve_to (mode_suffix_id, type); ++} ++ ++/* Resolve a (possibly predicated) function that offers a choice between ++ taking: ++ ++ - NOPS like-typed vector arguments or ++ - NOPS - 1 like-typed vector arguments followed by a scalar argument ++ ++ Return the function decl of the resolved function on success, ++ otherwise report a suitable error and return error_mark_node. */ ++tree ++function_resolver::resolve_uniform_opt_n (unsigned int nops) ++{ ++ unsigned int i, nargs; ++ type_suffix_index type; ++ if (!check_gp_argument (nops, i, nargs) ++ || (type = infer_vector_type (i)) == NUM_TYPE_SUFFIXES) ++ return error_mark_node; ++ ++ unsigned int first_arg = i++; ++ for (; i < nargs - 1; ++i) ++ if (!require_matching_vector_type (i, type)) ++ return error_mark_node; ++ ++ return finish_opt_n_resolution (i, first_arg, type); ++} ++ ++/* If the call is erroneous, report an appropriate error and return ++ error_mark_node. Otherwise, if the function is overloaded, return ++ the decl of the non-overloaded function. Return NULL_TREE otherwise, ++ indicating that the call should be processed in the normal way. */ ++tree ++function_resolver::resolve () ++{ ++ return shape->resolve (*this); ++} ++ ++function_checker::function_checker (location_t location, ++ const function_instance &instance, ++ tree fndecl, tree fntype, ++ unsigned int nargs, tree *args) ++ : function_call_info (location, instance, fndecl), ++ m_fntype (fntype), m_nargs (nargs), m_args (args), ++ /* We don't have to worry about unary _m operations here, since they ++ never have arguments that need checking. */ ++ m_base_arg (pred != PRED_none ? 1 : 0) ++{ ++} ++ ++/* Return true if argument ARGNO exists. which it might not for ++ erroneous calls. It is safe to wave through checks if this ++ function returns false. */ ++bool ++function_checker::argument_exists_p (unsigned int argno) ++{ ++ gcc_assert (argno < (unsigned int) type_num_arguments (m_fntype)); ++ return argno < m_nargs; ++} ++ ++/* Check that argument ARGNO is an integer constant expression and ++ store its value in VALUE_OUT if so. The caller should first ++ check that argument ARGNO exists. */ ++bool ++function_checker::require_immediate (unsigned int argno, ++ HOST_WIDE_INT &value_out) ++{ ++ gcc_assert (argno < m_nargs); ++ tree arg = m_args[argno]; ++ ++ /* The type and range are unsigned, so read the argument as an ++ unsigned rather than signed HWI. */ ++ if (!tree_fits_uhwi_p (arg)) ++ { ++ report_non_ice (location, fndecl, argno); ++ return false; ++ } ++ ++ /* ...but treat VALUE_OUT as signed for error reporting, since printing ++ -1 is more user-friendly than the maximum uint64_t value. */ ++ value_out = tree_to_uhwi (arg); ++ return true; ++} ++ ++/* Check that argument REL_ARGNO is an integer constant expression that ++ has the value VALUE0 or VALUE1. REL_ARGNO counts from the end of the ++ predication arguments. 
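++   For example, the rotation argument of an intrinsic such as svcadd
++   is checked this way: svcadd_x (pg, x, y, 90) is accepted, while a
++   variable rotation or any constant other than 90 or 270 is rejected.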
*/ ++bool ++function_checker::require_immediate_either_or (unsigned int rel_argno, ++ HOST_WIDE_INT value0, ++ HOST_WIDE_INT value1) ++{ ++ unsigned int argno = m_base_arg + rel_argno; ++ if (!argument_exists_p (argno)) ++ return true; ++ ++ HOST_WIDE_INT actual; ++ if (!require_immediate (argno, actual)) ++ return false; ++ ++ if (actual != value0 && actual != value1) ++ { ++ report_neither_nor (location, fndecl, argno, actual, 90, 270); ++ return false; ++ } ++ ++ return true; ++} ++ ++/* Check that argument REL_ARGNO is an integer constant expression that has ++ a valid value for enumeration type TYPE. REL_ARGNO counts from the end ++ of the predication arguments. */ ++bool ++function_checker::require_immediate_enum (unsigned int rel_argno, tree type) ++{ ++ unsigned int argno = m_base_arg + rel_argno; ++ if (!argument_exists_p (argno)) ++ return true; ++ ++ HOST_WIDE_INT actual; ++ if (!require_immediate (argno, actual)) ++ return false; ++ ++ for (tree entry = TYPE_VALUES (type); entry; entry = TREE_CHAIN (entry)) ++ { ++ /* The value is an INTEGER_CST for C and a CONST_DECL wrapper ++ around an INTEGER_CST for C++. */ ++ tree value = TREE_VALUE (entry); ++ if (TREE_CODE (value) == CONST_DECL) ++ value = DECL_INITIAL (value); ++ if (wi::to_widest (value) == actual) ++ return true; ++ } ++ ++ report_not_enum (location, fndecl, argno, actual, type); ++ return false; ++} ++ ++/* Check that argument REL_ARGNO is suitable for indexing argument ++ REL_ARGNO - 1, in groups of GROUP_SIZE elements. REL_ARGNO counts ++ from the end of the predication arguments. */ ++bool ++function_checker::require_immediate_lane_index (unsigned int rel_argno, ++ unsigned int group_size) ++{ ++ unsigned int argno = m_base_arg + rel_argno; ++ if (!argument_exists_p (argno)) ++ return true; ++ ++ /* Get the type of the previous argument. tree_argument_type wants a ++ 1-based number, whereas ARGNO is 0-based. */ ++ machine_mode mode = TYPE_MODE (type_argument_type (m_fntype, argno)); ++ gcc_assert (VECTOR_MODE_P (mode)); ++ unsigned int nlanes = 128 / (group_size * GET_MODE_UNIT_BITSIZE (mode)); ++ return require_immediate_range (rel_argno, 0, nlanes - 1); ++} ++ ++/* Check that argument REL_ARGNO is an integer constant expression that ++ has one of the given values. */ ++bool ++function_checker::require_immediate_one_of (unsigned int rel_argno, ++ HOST_WIDE_INT value0, ++ HOST_WIDE_INT value1, ++ HOST_WIDE_INT value2, ++ HOST_WIDE_INT value3) ++{ ++ unsigned int argno = m_base_arg + rel_argno; ++ if (!argument_exists_p (argno)) ++ return true; ++ ++ HOST_WIDE_INT actual; ++ if (!require_immediate (argno, actual)) ++ return false; ++ ++ if (actual != value0 ++ && actual != value1 ++ && actual != value2 ++ && actual != value3) ++ { ++ report_not_one_of (location, fndecl, argno, actual, ++ value0, value1, value2, value3); ++ return false; ++ } ++ ++ return true; ++} ++ ++/* Check that argument REL_ARGNO is an integer constant expression in the ++ range [MIN, MAX]. REL_ARGNO counts from the end of the predication ++ arguments. */ ++bool ++function_checker::require_immediate_range (unsigned int rel_argno, ++ HOST_WIDE_INT min, ++ HOST_WIDE_INT max) ++{ ++ unsigned int argno = m_base_arg + rel_argno; ++ if (!argument_exists_p (argno)) ++ return true; ++ ++ /* Required because of the tree_to_uhwi -> HOST_WIDE_INT conversion ++ in require_immediate. 
*/ ++ gcc_assert (min >= 0 && min <= max); ++ HOST_WIDE_INT actual; ++ if (!require_immediate (argno, actual)) ++ return false; ++ ++ if (!IN_RANGE (actual, min, max)) ++ { ++ report_out_of_range (location, fndecl, argno, actual, min, max); ++ return false; ++ } ++ ++ return true; ++} ++ ++/* Perform semantic checks on the call. Return true if the call is valid, ++ otherwise report a suitable error. */ ++bool ++function_checker::check () ++{ ++ function_args_iterator iter; ++ tree type; ++ unsigned int i = 0; ++ FOREACH_FUNCTION_ARGS (m_fntype, type, iter) ++ { ++ if (type == void_type_node || i >= m_nargs) ++ break; ++ ++ if (i >= m_base_arg ++ && TREE_CODE (type) == ENUMERAL_TYPE ++ && !require_immediate_enum (i - m_base_arg, type)) ++ return false; ++ ++ i += 1; ++ } ++ ++ return shape->check (*this); ++} ++ ++gimple_folder::gimple_folder (const function_instance &instance, tree fndecl, ++ gimple_stmt_iterator *gsi_in, gcall *call_in) ++ : function_call_info (gimple_location (call_in), instance, fndecl), ++ gsi (gsi_in), call (call_in), lhs (gimple_call_lhs (call_in)) ++{ ++} ++ ++/* Convert predicate argument ARGNO so that it has the type appropriate for ++ an operation on VECTYPE. Add any new statements to STMTS. */ ++tree ++gimple_folder::convert_pred (gimple_seq &stmts, tree vectype, ++ unsigned int argno) ++{ ++ tree pred = gimple_call_arg (call, argno); ++ if (known_eq (TYPE_VECTOR_SUBPARTS (TREE_TYPE (pred)), ++ TYPE_VECTOR_SUBPARTS (vectype))) ++ return pred; ++ ++ return gimple_build (&stmts, VIEW_CONVERT_EXPR, ++ truth_type_for (vectype), pred); ++} ++ ++/* Return a pointer to the address in a contiguous load or store, ++ given that each memory vector has type VECTYPE. Add any new ++ statements to STMTS. */ ++tree ++gimple_folder::fold_contiguous_base (gimple_seq &stmts, tree vectype) ++{ ++ tree base = gimple_call_arg (call, 1); ++ if (mode_suffix_id == MODE_vnum) ++ { ++ tree offset = gimple_call_arg (call, 2); ++ offset = gimple_convert (&stmts, sizetype, offset); ++ offset = gimple_build (&stmts, MULT_EXPR, sizetype, offset, ++ TYPE_SIZE_UNIT (vectype)); ++ base = gimple_build (&stmts, POINTER_PLUS_EXPR, TREE_TYPE (base), ++ base, offset); ++ } ++ return base; ++} ++ ++/* Return the alignment and TBAA argument to an internal load or store ++ function like IFN_MASK_LOAD or IFN_MASK_STORE, given that it accesses ++ memory elements of type TYPE. */ ++tree ++gimple_folder::load_store_cookie (tree type) ++{ ++ return build_int_cst (build_pointer_type (type), TYPE_ALIGN_UNIT (type)); ++} ++ ++/* Fold the call to a call to INSTANCE, with the same arguments. */ ++gimple * ++gimple_folder::redirect_call (const function_instance &instance) ++{ ++ registered_function *rfn ++ = function_table->find_with_hash (instance, instance.hash ()); ++ if (!rfn) ++ return NULL; ++ ++ gimple_call_set_fndecl (call, rfn->decl); ++ return call; ++} ++ ++/* Fold the call to a PTRUE, taking the element size from type suffix 0. */ ++gimple * ++gimple_folder::fold_to_ptrue () ++{ ++ tree svbool_type = TREE_TYPE (lhs); ++ tree bool_type = TREE_TYPE (svbool_type); ++ unsigned int element_bytes = type_suffix (0).element_bytes; ++ ++ /* The return type is svbool_t for all type suffixes, thus for b8 we ++ want { 1, 1, 1, 1, ... }, for b16 we want { 1, 0, 1, 0, ... }, etc. 
*/ ++ tree_vector_builder builder (svbool_type, element_bytes, 1); ++ builder.quick_push (build_all_ones_cst (bool_type)); ++ for (unsigned int i = 1; i < element_bytes; ++i) ++ builder.quick_push (build_zero_cst (bool_type)); ++ return gimple_build_assign (lhs, builder.build ()); ++} ++ ++/* Fold the call to a PFALSE. */ ++gimple * ++gimple_folder::fold_to_pfalse () ++{ ++ return gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs))); ++} ++ ++/* Fold an operation to a constant predicate in which the first VL ++ elements are set and the rest are clear. Take the element size ++ from type suffix 0. */ ++gimple * ++gimple_folder::fold_to_vl_pred (unsigned int vl) ++{ ++ tree vectype = TREE_TYPE (lhs); ++ tree element_type = TREE_TYPE (vectype); ++ tree minus_one = build_all_ones_cst (element_type); ++ tree zero = build_zero_cst (element_type); ++ unsigned int element_bytes = type_suffix (0).element_bytes; ++ ++ /* Construct COUNT elements that contain the ptrue followed by ++ a repeating sequence of COUNT elements. */ ++ unsigned int count = constant_lower_bound (TYPE_VECTOR_SUBPARTS (vectype)); ++ gcc_assert (vl * element_bytes <= count); ++ tree_vector_builder builder (vectype, count, 2); ++ for (unsigned int i = 0; i < count * 2; ++i) ++ { ++ bool bit = (i & (element_bytes - 1)) == 0 && i < vl * element_bytes; ++ builder.quick_push (bit ? minus_one : zero); ++ } ++ return gimple_build_assign (lhs, builder.build ()); ++} ++ ++/* Try to fold the call. Return the new statement on success and null ++ on failure. */ ++gimple * ++gimple_folder::fold () ++{ ++ /* Don't fold anything when SVE is disabled; emit an error during ++ expansion instead. */ ++ if (!TARGET_SVE) ++ return NULL; ++ ++ /* Punt if the function has a return type and no result location is ++ provided. The attributes should allow target-independent code to ++ remove the calls if appropriate. */ ++ if (!lhs && TREE_TYPE (gimple_call_fntype (call)) != void_type_node) ++ return NULL; ++ ++ return base->fold (*this); ++} ++ ++function_expander::function_expander (const function_instance &instance, ++ tree fndecl, tree call_expr_in, ++ rtx possible_target_in) ++ : function_call_info (EXPR_LOCATION (call_expr_in), instance, fndecl), ++ call_expr (call_expr_in), possible_target (possible_target_in) ++{ ++} ++ ++/* Return the handler of direct optab OP for type suffix SUFFIX_I. */ ++insn_code ++function_expander::direct_optab_handler (optab op, unsigned int suffix_i) ++{ ++ return ::direct_optab_handler (op, vector_mode (suffix_i)); ++} ++ ++/* Choose between signed and unsigned direct optabs SIGNED_OP and ++ UNSIGNED_OP based on the signedness of type suffix SUFFIX_I, then ++ pick the appropriate optab handler for the mode. Use MODE as the ++ mode if given, otherwise use the mode of type suffix SUFFIX_I. */ ++insn_code ++function_expander::direct_optab_handler_for_sign (optab signed_op, ++ optab unsigned_op, ++ unsigned int suffix_i, ++ machine_mode mode) ++{ ++ if (mode == VOIDmode) ++ mode = vector_mode (suffix_i); ++ optab op = type_suffix (suffix_i).unsigned_p ? unsigned_op : signed_op; ++ return ::direct_optab_handler (op, mode); ++} ++ ++/* Return true if X overlaps any input. */ ++bool ++function_expander::overlaps_input_p (rtx x) ++{ ++ for (unsigned int i = 0; i < args.length (); ++i) ++ if (reg_overlap_mentioned_p (x, args[i])) ++ return true; ++ return false; ++} ++ ++/* Return the base address for a contiguous load or store function. ++ MEM_MODE is the mode of the addressed memory. 
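++   For example, for a non-extending form such as svld1_vnum (pg, base, 2),
++   the returned address is BASE plus two full vectors, i.e. base plus
++   2 * svcntb () bytes; extending loads instead scale VNUM by the size
++   of the memory vector.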
*/ ++rtx ++function_expander::get_contiguous_base (machine_mode mem_mode) ++{ ++ rtx base = args[1]; ++ if (mode_suffix_id == MODE_vnum) ++ { ++ /* Use the size of the memory mode for extending loads and truncating ++ stores. Use the size of a full vector for non-extending loads ++ and non-truncating stores (including svld[234] and svst[234]). */ ++ poly_int64 size = ordered_min (GET_MODE_SIZE (mem_mode), ++ BYTES_PER_SVE_VECTOR); ++ rtx offset = gen_int_mode (size, Pmode); ++ offset = simplify_gen_binary (MULT, Pmode, args[2], offset); ++ base = simplify_gen_binary (PLUS, Pmode, base, offset); ++ } ++ return base; ++} ++ ++/* For a function that does the equivalent of: ++ ++ OUTPUT = COND ? FN (INPUTS) : FALLBACK; ++ ++ return the value of FALLBACK. ++ ++ MODE is the mode of OUTPUT. NOPS is the number of operands in INPUTS. ++ MERGE_ARGNO is the argument that provides FALLBACK for _m functions, ++ or DEFAULT_MERGE_ARGNO if we should apply the usual rules. ++ ++ ARGNO is the caller's index into args. If the returned value is ++ argument 0 (as for unary _m operations), increment ARGNO past the ++ returned argument. */ ++rtx ++function_expander::get_fallback_value (machine_mode mode, unsigned int nops, ++ unsigned int merge_argno, ++ unsigned int &argno) ++{ ++ if (pred == PRED_z) ++ return CONST0_RTX (mode); ++ ++ gcc_assert (pred == PRED_m || pred == PRED_x); ++ if (merge_argno == DEFAULT_MERGE_ARGNO) ++ merge_argno = nops == 1 && pred == PRED_m ? 0 : 1; ++ ++ if (merge_argno == 0) ++ return args[argno++]; ++ ++ return args[merge_argno]; ++} ++ ++/* Return a REG rtx that can be used for the result of the function, ++ using the preferred target if suitable. */ ++rtx ++function_expander::get_reg_target () ++{ ++ machine_mode target_mode = TYPE_MODE (TREE_TYPE (TREE_TYPE (fndecl))); ++ if (!possible_target || GET_MODE (possible_target) != target_mode) ++ possible_target = gen_reg_rtx (target_mode); ++ return possible_target; ++} ++ ++/* As for get_reg_target, but make sure that the returned REG does not ++ overlap any inputs. */ ++rtx ++function_expander::get_nonoverlapping_reg_target () ++{ ++ if (possible_target && overlaps_input_p (possible_target)) ++ possible_target = NULL_RTX; ++ return get_reg_target (); ++} ++ ++/* Add an output operand to the instruction we're building, which has ++ code ICODE. Bind the output to the preferred target rtx if possible. */ ++void ++function_expander::add_output_operand (insn_code icode) ++{ ++ unsigned int opno = m_ops.length (); ++ machine_mode mode = insn_data[icode].operand[opno].mode; ++ m_ops.safe_grow (opno + 1); ++ create_output_operand (&m_ops.last (), possible_target, mode); ++} ++ ++/* Add an input operand to the instruction we're building, which has ++ code ICODE. Calculate the value of the operand as follows: ++ ++ - If the operand is a vector and X is not, broadcast X to fill a ++ vector of the appropriate mode. ++ ++ - Otherwise, if the operand is a predicate, coerce X to have the ++ mode that the instruction expects. In this case X is known to be ++ VNx16BImode (the mode of svbool_t). ++ ++ - Otherwise use X directly. The expand machinery checks that X has ++ the right mode for the instruction. 
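++   For example (illustration only), when expanding svadd_n_s32_x, the
++   scalar addend is broadcast to a VNx4SI vector, and the svbool_t
++   argument, which always has mode VNx16BI, is converted to the VNx4BI
++   predicate mode that the add pattern expects.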
*/ ++void ++function_expander::add_input_operand (insn_code icode, rtx x) ++{ ++ unsigned int opno = m_ops.length (); ++ const insn_operand_data &operand = insn_data[icode].operand[opno]; ++ machine_mode mode = operand.mode; ++ if (mode == VOIDmode) ++ { ++ /* The only allowable use of VOIDmode is the wildcard ++ aarch64_any_register_operand, which is used to avoid ++ combinatorial explosion in the reinterpret patterns. */ ++ gcc_assert (operand.predicate == aarch64_any_register_operand); ++ mode = GET_MODE (x); ++ } ++ else if (!VECTOR_MODE_P (GET_MODE (x)) && VECTOR_MODE_P (mode)) ++ x = expand_vector_broadcast (mode, x); ++ else if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL) ++ { ++ gcc_assert (GET_MODE (x) == VNx16BImode); ++ x = gen_lowpart (mode, x); ++ } ++ m_ops.safe_grow (m_ops.length () + 1); ++ create_input_operand (&m_ops.last (), x, mode); ++} ++ ++/* Add an integer operand with value X to the instruction. */ ++void ++function_expander::add_integer_operand (HOST_WIDE_INT x) ++{ ++ m_ops.safe_grow (m_ops.length () + 1); ++ create_integer_operand (&m_ops.last (), x); ++} ++ ++/* Add a memory operand with mode MODE and address ADDR. */ ++void ++function_expander::add_mem_operand (machine_mode mode, rtx addr) ++{ ++ /* Exception for OImode for the ld1ro intrinsics. ++ They act on 256 bit octaword data, and it's just easier to use a scalar ++ mode to represent that than add a new vector mode solely for the purpose ++ of this intrinsic. */ ++ gcc_assert (VECTOR_MODE_P (mode) || mode == OImode); ++ rtx mem = gen_rtx_MEM (mode, memory_address (mode, addr)); ++ /* The memory is only guaranteed to be element-aligned. */ ++ set_mem_align (mem, GET_MODE_ALIGNMENT (GET_MODE_INNER (mode))); ++ add_fixed_operand (mem); ++} ++ ++/* Add an address operand with value X. The static operand data says ++ what mode and form the address must have. */ ++void ++function_expander::add_address_operand (rtx x) ++{ ++ m_ops.safe_grow (m_ops.length () + 1); ++ create_address_operand (&m_ops.last (), x); ++} ++ ++/* Add an operand that must be X. The only way of legitimizing an ++ invalid X is to reload the address of a MEM. */ ++void ++function_expander::add_fixed_operand (rtx x) ++{ ++ m_ops.safe_grow (m_ops.length () + 1); ++ create_fixed_operand (&m_ops.last (), x); ++} ++ ++/* Generate instruction ICODE, given that its operands have already ++ been added to M_OPS. Return the value of the first operand. */ ++rtx ++function_expander::generate_insn (insn_code icode) ++{ ++ expand_insn (icode, m_ops.length (), m_ops.address ()); ++ return function_returns_void_p () ? const0_rtx : m_ops[0].value; ++} ++ ++/* Convert the arguments to a gather/scatter function into the ++ associated md operands. Argument ARGNO is the scalar or vector base and ++ argument ARGNO + 1 is the scalar or vector displacement (if applicable). ++ The md pattern expects: ++ ++ - a scalar base ++ - a vector displacement ++ ++ If SCALED_P is true, it also expects: ++ ++ - a const_int that is 1 if the displacement is zero-extended from 32 bits ++ - a scaling multiplier (1 for bytes, 2 for .h indices, etc.). ++ ++ If SCALED_P is false, the displacement is implicitly zero-extended ++ and the scaling multiplier is implicitly 1. 
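++
++   (Illustrative note, not taken from the original source: in ACLE
++   terms an "index" displacement counts elements while an "offset"
++   displacement counts bytes, so
++
++     svld1_gather_s32index_s32 (pg, base, indices)
++
++   scales each index by the 4-byte element size before adding it to
++   BASE, whereas the corresponding _s32offset form adds the
++   displacements unscaled.)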
*/ ++void ++function_expander::prepare_gather_address_operands (unsigned int argno, ++ bool scaled_p) ++{ ++ machine_mode mem_mode = memory_vector_mode (); ++ tree vector_type = base_vector_type (); ++ units_index units = displacement_units (); ++ int shift_idx = -1; ++ if (units == UNITS_none) ++ { ++ /* Vector base, no displacement. Convert to an integer zero base ++ and a vector byte offset. */ ++ args.quick_insert (argno, const0_rtx); ++ units = UNITS_bytes; ++ } ++ else if (vector_type) ++ { ++ /* Vector base, scalar displacement. Convert to a scalar base and ++ a vector byte offset. */ ++ std::swap (args[argno], args[argno + 1]); ++ if (units == UNITS_elements) ++ shift_idx = argno; ++ } ++ else ++ { ++ /* Scalar base, vector displacement. This is the order that the md ++ pattern wants. */ ++ if (Pmode == SImode) ++ args[argno] = simplify_gen_unary (ZERO_EXTEND, DImode, ++ args[argno], SImode); ++ vector_type = displacement_vector_type (); ++ if (units == UNITS_elements && !scaled_p) ++ shift_idx = argno + 1; ++ } ++ tree scalar_displacement_type = TREE_TYPE (vector_type); ++ ++ if (shift_idx >= 0) ++ { ++ machine_mode arg_mode = GET_MODE (args[shift_idx]); ++ if (arg_mode == VOIDmode) ++ arg_mode = DImode; ++ unsigned int elt_bytes = GET_MODE_UNIT_SIZE (mem_mode); ++ rtx shift = gen_int_mode (exact_log2 (elt_bytes), DImode); ++ args[shift_idx] = simplify_gen_binary (ASHIFT, arg_mode, ++ args[shift_idx], shift); ++ units = UNITS_bytes; ++ } ++ ++ bool uxtw_p = (TYPE_PRECISION (scalar_displacement_type) == 64 ++ || TYPE_UNSIGNED (scalar_displacement_type)); ++ unsigned int scale = (units == UNITS_bytes ++ ? 1 : GET_MODE_UNIT_SIZE (mem_mode)); ++ ++ if (scaled_p) ++ { ++ args.quick_insert (argno + 2, GEN_INT (uxtw_p)); ++ args.quick_insert (argno + 3, GEN_INT (scale)); ++ } ++ else ++ gcc_assert (uxtw_p && scale == 1); ++} ++ ++/* The final argument is an immediate svprfop value. Add two fake arguments ++ to represent the rw and locality operands of a PREFETCH rtx. */ ++void ++function_expander::prepare_prefetch_operands () ++{ ++ unsigned int prfop = INTVAL (args.last ()); ++ /* Bit 3 of the prfop selects stores over loads. */ ++ args.quick_push (GEN_INT ((prfop & 8) != 0)); ++ /* Bits 1 and 2 specify the locality; 0-based for svprfop but ++ 1-based for PREFETCH. */ ++ args.quick_push (GEN_INT (((prfop >> 1) & 3) + 1)); ++} ++ ++/* Add a dummy argument to indicate whether predicate argument ARGNO ++ is all-true when interpreted in mode PRED_MODE. The hint goes ++ immediately after ARGNO. */ ++void ++function_expander::add_ptrue_hint (unsigned int argno, machine_mode pred_mode) ++{ ++ rtx pred = gen_lowpart (pred_mode, args[argno]); ++ int hint = (pred == CONSTM1_RTX (pred_mode) ++ ? SVE_KNOWN_PTRUE : SVE_MAYBE_NOT_PTRUE); ++ args.quick_insert (argno + 1, gen_int_mode (hint, SImode)); ++} ++ ++/* Rotate inputs args[START:END] one position to the left, so that ++ args[START] becomes args[END - 1]. */ ++void ++function_expander::rotate_inputs_left (unsigned int start, unsigned int end) ++{ ++ rtx new_last = args[start]; ++ for (unsigned int i = start; i < end - 1; ++i) ++ args[i] = args[i + 1]; ++ args[end - 1] = new_last; ++} ++ ++/* Return true if the negation of argument ARGNO can be folded away, ++ replacing it with the negated value if so. MODE is the associated ++ vector mode, but the argument could be a single element. The main ++ case this handles is constant arguments. 
*/ ++bool ++function_expander::try_negating_argument (unsigned int argno, ++ machine_mode mode) ++{ ++ rtx x = args[argno]; ++ if (!VECTOR_MODE_P (GET_MODE (x))) ++ mode = GET_MODE_INNER (mode); ++ ++ x = simplify_unary_operation (NEG, mode, x, mode); ++ if (!x) ++ return false; ++ ++ args[argno] = x; ++ return true; ++} ++ ++/* Implement the call using instruction ICODE, with a 1:1 mapping between ++ arguments and input operands. */ ++rtx ++function_expander::use_exact_insn (insn_code icode) ++{ ++ unsigned int nops = insn_data[icode].n_operands; ++ if (!function_returns_void_p ()) ++ { ++ add_output_operand (icode); ++ nops -= 1; ++ } ++ for (unsigned int i = 0; i < nops; ++i) ++ add_input_operand (icode, args[i]); ++ return generate_insn (icode); ++} ++ ++/* Implement the call using instruction ICODE, which does not use a ++ governing predicate. We must therefore drop the GP from an _x call. */ ++rtx ++function_expander::use_unpred_insn (insn_code icode) ++{ ++ /* We can't drop the predicate for _z and _m. */ ++ gcc_assert (pred == PRED_x || pred == PRED_none); ++ /* Discount the output operand. */ ++ unsigned int nops = insn_data[icode].n_operands - 1; ++ /* Drop the predicate argument in the case of _x predication. */ ++ unsigned int bias = (pred == PRED_x ? 1 : 0); ++ unsigned int i = 0; ++ ++ add_output_operand (icode); ++ for (; i < nops; ++i) ++ add_input_operand (icode, args[i + bias]); ++ ++ return generate_insn (icode); ++} ++ ++/* Implement the call using instruction ICODE, which is a predicated ++ operation that returns arbitrary values for inactive lanes. */ ++rtx ++function_expander::use_pred_x_insn (insn_code icode) ++{ ++ /* At present we never need to handle PRED_none, which would involve ++ creating a new predicate rather than using one supplied by the user. */ ++ gcc_assert (pred == PRED_x); ++ /* Discount the output operand. */ ++ unsigned int nops = args.length () - 1; ++ ++ bool has_float_operand_p = FLOAT_MODE_P (insn_data[icode].operand[0].mode); ++ ++ /* Add the normal operands. */ ++ add_output_operand (icode); ++ add_input_operand (icode, args[0]); ++ for (unsigned int i = 0; i < nops; ++i) ++ { ++ add_input_operand (icode, args[i + 1]); ++ if (FLOAT_MODE_P (GET_MODE (args[i + 1]))) ++ has_float_operand_p = true; ++ } ++ ++ if (has_float_operand_p) ++ { ++ /* Add a flag that indicates whether unpredicated instructions ++ are allowed. */ ++ rtx pred = m_ops[1].value; ++ if (flag_trapping_math && pred != CONST1_RTX (GET_MODE (pred))) ++ add_integer_operand (SVE_STRICT_GP); ++ else ++ add_integer_operand (SVE_RELAXED_GP); ++ } ++ ++ return generate_insn (icode); ++} ++ ++/* Implement the call using instruction ICODE, which does the equivalent of: ++ ++ OUTPUT = COND ? FN (INPUTS) : FALLBACK; ++ ++ The instruction operands are in the order above: OUTPUT, COND, INPUTS ++ and FALLBACK. MERGE_ARGNO is the argument that provides FALLBACK for _m ++ functions, or DEFAULT_MERGE_ARGNO if we should apply the usual rules. */ ++rtx ++function_expander::use_cond_insn (insn_code icode, unsigned int merge_argno) ++{ ++ /* At present we never need to handle PRED_none, which would involve ++ creating a new predicate rather than using one supplied by the user. */ ++ gcc_assert (pred != PRED_none); ++ /* Discount the output, predicate and fallback value. 
*/ ++ unsigned int nops = insn_data[icode].n_operands - 3; ++ machine_mode mode = insn_data[icode].operand[0].mode; ++ ++ unsigned int opno = 0; ++ rtx fallback_arg = get_fallback_value (mode, nops, merge_argno, opno); ++ rtx pred = args[opno++]; ++ ++ add_output_operand (icode); ++ add_input_operand (icode, pred); ++ for (unsigned int i = 0; i < nops; ++i) ++ add_input_operand (icode, args[opno + i]); ++ add_input_operand (icode, fallback_arg); ++ return generate_insn (icode); ++} ++ ++/* Implement the call using instruction ICODE, which is a select-like ++ operation with the following operands: ++ ++ 0: output ++ 1: true value ++ 2: false value ++ 3: predicate ++ ++ MERGE_ARGNO is the argument that provides the "false" value for _m ++ functions, or DEFAULT_MERGE_ARGNO if we should apply the usual rules. */ ++rtx ++function_expander::use_vcond_mask_insn (insn_code icode, ++ unsigned int merge_argno) ++{ ++ machine_mode mode = vector_mode (0); ++ ++ unsigned int opno = 0; ++ rtx false_arg = get_fallback_value (mode, 1, merge_argno, opno); ++ rtx pred_arg = args[opno++]; ++ rtx true_arg = args[opno++]; ++ ++ add_output_operand (icode); ++ add_input_operand (icode, true_arg); ++ add_input_operand (icode, false_arg); ++ add_input_operand (icode, pred_arg); ++ return generate_insn (icode); ++} ++ ++/* Implement the call using instruction ICODE, which loads memory operand 1 ++ into register operand 0 under the control of predicate operand 2. */ ++rtx ++function_expander::use_contiguous_load_insn (insn_code icode) ++{ ++ machine_mode mem_mode = memory_vector_mode (); ++ ++ add_output_operand (icode); ++ add_mem_operand (mem_mode, get_contiguous_base (mem_mode)); ++ add_input_operand (icode, args[0]); ++ return generate_insn (icode); ++} ++ ++/* Implement the call using instruction ICODE, which prefetches from ++ address operand 1 under the control of predicate operand 0. ++ Operands 2, 3 and 4 respectively specify the svprfop value, ++ the PREFETCH rw flag and the PREFETCH locality. */ ++rtx ++function_expander::use_contiguous_prefetch_insn (insn_code icode) ++{ ++ add_input_operand (icode, args[0]); ++ add_address_operand (get_contiguous_base (VNx16QImode)); ++ for (unsigned int i = args.length () - 3; i < args.length (); ++i) ++ add_input_operand (icode, args[i]); ++ return generate_insn (icode); ++} ++ ++/* Implement the call using instruction ICODE, which stores register operand 1 ++ into memory operand 0 under the control of predicate operand 2. */ ++rtx ++function_expander::use_contiguous_store_insn (insn_code icode) ++{ ++ machine_mode mem_mode = memory_vector_mode (); ++ ++ add_mem_operand (mem_mode, get_contiguous_base (mem_mode)); ++ add_input_operand (icode, args.last ()); ++ add_input_operand (icode, args[0]); ++ return generate_insn (icode); ++} ++ ++/* Implement the call using one of the following strategies, chosen in order: ++ ++ (1) "aarch64_pred__z" for PRED_z predicate functions ++ ++ (2) "aarch64_pred_" for PRED_x functions ++ ++ (3) a normal unpredicated optab for PRED_none and PRED_x functions, ++ dropping the predicate in the latter case ++ ++ (4) "cond_" otherwise ++ ++ where corresponds to: ++ ++ - CODE_FOR_SINT for signed integers ++ - CODE_FOR_UINT for unsigned integers ++ - UNSPEC_FOR_FP for floating-point values ++ ++ MERGE_ARGNO is the argument that provides the values of inactive lanes for ++ _m functions, or DEFAULT_MERGE_ARGNO if we should apply the usual rules. 
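++
++   (Illustrative note, not taken from the original source: for an
++   integer addition the three predication forms behave as
++
++     svadd_s32_z (pg, a, b)   inactive lanes become zero
++     svadd_s32_m (pg, a, b)   inactive lanes are taken from A
++     svadd_s32_x (pg, a, b)   inactive lanes are undefined
++
++   which is why _z and _m need predicated or conditional patterns
++   while _x can often fall back to an unpredicated optab.)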
*/ ++rtx ++function_expander::map_to_rtx_codes (rtx_code code_for_sint, ++ rtx_code code_for_uint, ++ int unspec_for_fp, ++ unsigned int merge_argno) ++{ ++ machine_mode mode = vector_mode (0); ++ rtx_code code = (type_suffix (0).unsigned_p ? code_for_uint : code_for_sint); ++ insn_code icode; ++ ++ /* Handle predicate logic operations, which always use _z predication. */ ++ if (type_suffix (0).tclass == TYPE_bool) ++ { ++ gcc_assert (pred == PRED_z && code_for_uint == code_for_sint); ++ return use_exact_insn (code_for_aarch64_pred_z (code, mode)); ++ } ++ ++ /* First try using UNSPEC_PRED_X patterns for _x predication, ++ if available. */ ++ if (pred == PRED_x) ++ { ++ if (type_suffix (0).integer_p) ++ icode = maybe_code_for_aarch64_pred (code, mode); ++ else ++ icode = maybe_code_for_aarch64_pred (unspec_for_fp, mode); ++ if (icode != CODE_FOR_nothing) ++ return use_pred_x_insn (icode); ++ } ++ ++ /* Otherwise expand PRED_none and PRED_x operations without a predicate. ++ Floating-point operations conventionally use the signed rtx code. */ ++ if (pred == PRED_none || pred == PRED_x) ++ return use_unpred_insn (direct_optab_handler (code_to_optab (code), 0)); ++ ++ /* Don't use cond_*_optabs here, since not all codes have one yet. */ ++ if (type_suffix (0).integer_p) ++ icode = code_for_cond (code, mode); ++ else ++ icode = code_for_cond (unspec_for_fp, mode); ++ return use_cond_insn (icode, merge_argno); ++} ++ ++/* Implement the call using one of the following strategies, chosen in order: ++ ++ (1) "aarch64_pred_" for PRED_x functions; this is a ++ predicated pattern ++ ++ (2) "aarch64_sve_" for PRED_none and PRED_x functions; ++ this is an unpredicated pattern ++ ++ (3) "cond_" otherwise ++ ++ where corresponds to: ++ ++ - UNSPEC_FOR_SINT for signed integers ++ - UNSPEC_FOR_UINT for unsigned integers ++ - UNSPEC_FOR_FP for floating-point values ++ ++ MERGE_ARGNO is the argument that provides the values of inactive lanes for ++ _m functions, or DEFAULT_MERGE_ARGNO if we should apply the usual rules. */ ++rtx ++function_expander::map_to_unspecs (int unspec_for_sint, int unspec_for_uint, ++ int unspec_for_fp, unsigned int merge_argno) ++{ ++ machine_mode mode = vector_mode (0); ++ int unspec = (!type_suffix (0).integer_p ? unspec_for_fp ++ : type_suffix (0).unsigned_p ? unspec_for_uint ++ : unspec_for_sint); ++ ++ if (pred == PRED_x) ++ { ++ insn_code icode = maybe_code_for_aarch64_pred (unspec, mode); ++ if (icode != CODE_FOR_nothing) ++ return use_pred_x_insn (icode); ++ } ++ ++ if (pred == PRED_none || pred == PRED_x) ++ { ++ insn_code icode = maybe_code_for_aarch64_sve (unspec, mode); ++ if (icode != CODE_FOR_nothing) ++ return use_unpred_insn (icode); ++ } ++ ++ insn_code icode = code_for_cond (unspec, vector_mode (0)); ++ return use_cond_insn (icode, merge_argno); ++} ++ ++/* Implement the call using an @aarch64 instruction and the ++ instructions are parameterized by an rtx_code. CODE_FOR_SINT ++ is the rtx_code for signed integer operations, CODE_FOR_UINT ++ is the rtx_code for unsigned integer operations. */ ++rtx ++function_expander::expand_signed_unpred_op (rtx_code code_for_sint, ++ rtx_code code_for_uint) ++{ ++ insn_code icode; ++ if (type_suffix (0).unsigned_p) ++ icode = code_for_aarch64 (code_for_uint, code_for_uint, vector_mode (0)); ++ else ++ icode = code_for_aarch64 (code_for_sint, code_for_sint, vector_mode (0)); ++ return use_unpred_insn (icode); ++} ++ ++/* Expand the call and return its lhs. 
*/ ++rtx ++function_expander::expand () ++{ ++ unsigned int nargs = call_expr_nargs (call_expr); ++ args.reserve (nargs); ++ for (unsigned int i = 0; i < nargs; ++i) ++ args.quick_push (expand_normal (CALL_EXPR_ARG (call_expr, i))); ++ ++ return base->expand (*this); ++} ++ ++/* Register the built-in SVE ABI types, such as __SVBool_t. */ ++static void ++register_builtin_types () ++{ ++#define DEF_SVE_TYPE(ACLE_NAME, NCHARS, ABI_NAME, SCALAR_TYPE) \ ++ scalar_types[VECTOR_TYPE_ ## ACLE_NAME] = SCALAR_TYPE; ++#include "aarch64-sve-builtins.def" ++ ++ for (unsigned int i = 0; i < NUM_VECTOR_TYPES; ++i) ++ { ++ tree eltype = scalar_types[i]; ++ tree vectype; ++ if (eltype == boolean_type_node) ++ { ++ vectype = build_truth_vector_type_for_mode (BYTES_PER_SVE_VECTOR, ++ VNx16BImode); ++ gcc_assert (TYPE_MODE (vectype) == VNx16BImode ++ && TYPE_MODE (vectype) == TYPE_MODE_RAW (vectype) ++ && TYPE_ALIGN (vectype) == 16 ++ && known_eq (wi::to_poly_offset (TYPE_SIZE (vectype)), ++ BYTES_PER_SVE_VECTOR)); ++ } ++ else ++ { ++ unsigned int elbytes = tree_to_uhwi (TYPE_SIZE_UNIT (eltype)); ++ poly_uint64 nunits = exact_div (BYTES_PER_SVE_VECTOR, elbytes); ++ vectype = build_vector_type (eltype, nunits); ++ gcc_assert (VECTOR_MODE_P (TYPE_MODE (vectype)) ++ && TYPE_MODE (vectype) == TYPE_MODE_RAW (vectype) ++ && TYPE_ALIGN (vectype) == 128 ++ && known_eq (wi::to_poly_offset (TYPE_SIZE (vectype)), ++ BITS_PER_SVE_VECTOR)); ++ } ++ vectype = build_distinct_type_copy (vectype); ++ SET_TYPE_STRUCTURAL_EQUALITY (vectype); ++ TYPE_ARTIFICIAL (vectype) = 1; ++ abi_vector_types[i] = vectype; ++ lang_hooks.types.register_builtin_type (vectype, ++ vector_types[i].abi_name); ++ } ++} ++ ++/* Initialize all compiler built-ins related to SVE that should be ++ defined at start-up. */ ++void ++init_builtins () ++{ ++ sve_switcher sve; ++ register_builtin_types (); ++} ++ ++/* Register vector type TYPE under its arm_sve.h name. */ ++static void ++register_vector_type (vector_type_index type) ++{ ++ tree vectype = abi_vector_types[type]; ++ tree id = get_identifier (vector_types[type].acle_name); ++ tree decl = build_decl (input_location, TYPE_DECL, id, vectype); ++ decl = lang_hooks.decls.pushdecl (decl); ++ ++ /* Record the new ACLE type if pushdecl succeeded without error. Use ++ the ABI type otherwise, so that the type we record at least has the ++ right form, even if it doesn't have the right name. This should give ++ better error recovery behavior than installing error_mark_node or ++ installing an incorrect type. */ ++ if (TREE_CODE (decl) == TYPE_DECL ++ && TYPE_MAIN_VARIANT (TREE_TYPE (decl)) == vectype) ++ vectype = TREE_TYPE (decl); ++ acle_vector_types[0][type] = vectype; ++} ++ ++/* Register the tuple type that contains NUM_VECTORS vectors of type TYPE. */ ++static void ++register_tuple_type (unsigned int num_vectors, vector_type_index type) ++{ ++ tree tuple_type = lang_hooks.types.make_type (RECORD_TYPE); ++ ++ /* The contents of the type are opaque, so we can define them in any ++ way that maps to the correct ABI type. ++ ++ Here we choose to use the same layout as for arm_neon.h, but with ++ "__val" instead of "val": ++ ++ struct svfooxN_t { svfoo_t __val[N]; }; ++ ++ (It wouldn't be possible to write that directly in C or C++ for ++ sizeless types, but that's not a problem for this function.) ++ ++ Using arrays simplifies the handling of svget and svset for variable ++ arguments. 
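++
++     (Illustrative note, not taken from the original source: with this
++     layout an access such as svget3_s32 (t, 1) amounts to reading
++     t.__val[1], and svcreate/svset work the same way in reverse.)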
*/ ++ tree vector_type = acle_vector_types[0][type]; ++ tree array_type = build_array_type_nelts (vector_type, num_vectors); ++ gcc_assert (VECTOR_MODE_P (TYPE_MODE (array_type)) ++ && TYPE_MODE_RAW (array_type) == TYPE_MODE (array_type) ++ && TYPE_ALIGN (array_type) == 128); ++ ++ tree field = build_decl (input_location, FIELD_DECL, ++ get_identifier ("__val"), array_type); ++ DECL_FIELD_CONTEXT (field) = tuple_type; ++ TYPE_FIELDS (tuple_type) = field; ++ layout_type (tuple_type); ++ gcc_assert (VECTOR_MODE_P (TYPE_MODE (tuple_type)) ++ && TYPE_MODE_RAW (tuple_type) == TYPE_MODE (tuple_type) ++ && TYPE_ALIGN (tuple_type) == 128); ++ ++ /* Work out the structure name. */ ++ char buffer[sizeof ("svbfloat16x4_t")]; ++ const char *vector_type_name = vector_types[type].acle_name; ++ snprintf (buffer, sizeof (buffer), "%.*sx%d_t", ++ (int) strlen (vector_type_name) - 2, vector_type_name, ++ num_vectors); ++ ++ tree decl = build_decl (input_location, TYPE_DECL, ++ get_identifier (buffer), tuple_type); ++ TYPE_NAME (tuple_type) = decl; ++ TYPE_STUB_DECL (tuple_type) = decl; ++ lang_hooks.decls.pushdecl (decl); ++ /* ??? Undo the effect of set_underlying_type for C. The C frontend ++ doesn't recognize DECL as a built-in because (as intended) the decl has ++ a real location instead of BUILTINS_LOCATION. The frontend therefore ++ treats the decl like a normal C "typedef struct foo foo;", expecting ++ the type for tag "struct foo" to have a dummy unnamed TYPE_DECL instead ++ of the named one we attached above. It then sets DECL_ORIGINAL_TYPE ++ on the supposedly unnamed decl, creating a circularity that upsets ++ dwarf2out. ++ ++ We don't want to follow the normal C model and create "struct foo" ++ tags for tuple types since (a) the types are supposed to be opaque ++ and (b) they couldn't be defined as a real struct anyway. Treating ++ the TYPE_DECLs as "typedef struct foo foo;" without creating ++ "struct foo" would lead to confusing error messages. */ ++ DECL_ORIGINAL_TYPE (decl) = NULL_TREE; ++ ++ acle_vector_types[num_vectors - 1][type] = tuple_type; ++} ++ ++/* Register the svpattern enum. */ ++static void ++register_svpattern () ++{ ++ auto_vec values; ++#define PUSH(UPPER, LOWER, VALUE) \ ++ values.quick_push (string_int_pair ("SV_" #UPPER, VALUE)); ++ AARCH64_FOR_SVPATTERN (PUSH) ++#undef PUSH ++ ++ acle_svpattern = lang_hooks.types.simulate_enum_decl (input_location, ++ "svpattern", values); ++} ++ ++/* Register the svprfop enum. */ ++static void ++register_svprfop () ++{ ++ auto_vec values; ++#define PUSH(UPPER, LOWER, VALUE) \ ++ values.quick_push (string_int_pair ("SV_" #UPPER, VALUE)); ++ AARCH64_FOR_SVPRFOP (PUSH) ++#undef PUSH ++ ++ acle_svprfop = lang_hooks.types.simulate_enum_decl (input_location, ++ "svprfop", values); ++} ++ ++/* Implement #pragma GCC aarch64 "arm_sve.h". */ ++void ++handle_arm_sve_h () ++{ ++ if (function_table) ++ { ++ error ("duplicate definition of %qs", "arm_sve.h"); ++ return; ++ } ++ ++ sve_switcher sve; ++ ++ /* Define the vector and tuple types. */ ++ for (unsigned int type_i = 0; type_i < NUM_VECTOR_TYPES; ++type_i) ++ { ++ vector_type_index type = vector_type_index (type_i); ++ register_vector_type (type); ++ if (type != VECTOR_TYPE_svbool_t) ++ for (unsigned int count = 2; count <= MAX_TUPLE_SIZE; ++count) ++ register_tuple_type (count, type); ++ } ++ ++ /* Define the enums. */ ++ register_svpattern (); ++ register_svprfop (); ++ ++ /* Define the functions. 
*/ ++ function_table = new hash_table (1023); ++ function_builder builder; ++ for (unsigned int i = 0; i < ARRAY_SIZE (function_groups); ++i) ++ builder.register_function_group (function_groups[i]); ++} ++ ++/* Return the function decl with SVE function subcode CODE, or error_mark_node ++ if no such function exists. */ ++tree ++builtin_decl (unsigned int code, bool) ++{ ++ if (code >= vec_safe_length (registered_functions)) ++ return error_mark_node; ++ return (*registered_functions)[code]->decl; ++} ++ ++/* If we're implementing manual overloading, check whether the SVE ++ function with subcode CODE is overloaded, and if so attempt to ++ determine the corresponding non-overloaded function. The call ++ occurs at location LOCATION and has the arguments given by ARGLIST. ++ ++ If the call is erroneous, report an appropriate error and return ++ error_mark_node. Otherwise, if the function is overloaded, return ++ the decl of the non-overloaded function. Return NULL_TREE otherwise, ++ indicating that the call should be processed in the normal way. */ ++tree ++resolve_overloaded_builtin (location_t location, unsigned int code, ++ vec *arglist) ++{ ++ if (code >= vec_safe_length (registered_functions)) ++ return NULL_TREE; ++ ++ registered_function &rfn = *(*registered_functions)[code]; ++ if (rfn.overloaded_p) ++ return function_resolver (location, rfn.instance, rfn.decl, ++ *arglist).resolve (); ++ return NULL_TREE; ++} ++ ++/* Perform any semantic checks needed for a call to the SVE function ++ with subcode CODE, such as testing for integer constant expressions. ++ The call occurs at location LOCATION and has NARGS arguments, ++ given by ARGS. FNDECL is the original function decl, before ++ overload resolution. ++ ++ Return true if the call is valid, otherwise report a suitable error. */ ++bool ++check_builtin_call (location_t location, vec, unsigned int code, ++ tree fndecl, unsigned int nargs, tree *args) ++{ ++ const registered_function &rfn = *(*registered_functions)[code]; ++ if (!check_required_extensions (location, rfn.decl, rfn.required_extensions)) ++ return false; ++ return function_checker (location, rfn.instance, fndecl, ++ TREE_TYPE (rfn.decl), nargs, args).check (); ++} ++ ++/* Attempt to fold STMT, given that it's a call to the SVE function ++ with subcode CODE. Return the new statement on success and null ++ on failure. Insert any other new statements at GSI. */ ++gimple * ++gimple_fold_builtin (unsigned int code, gimple_stmt_iterator *gsi, gcall *stmt) ++{ ++ registered_function &rfn = *(*registered_functions)[code]; ++ return gimple_folder (rfn.instance, rfn.decl, gsi, stmt).fold (); ++} ++ ++/* Expand a call to the SVE function with subcode CODE. EXP is the call ++ expression and TARGET is the preferred location for the result. ++ Return the value of the lhs. */ ++rtx ++expand_builtin (unsigned int code, tree exp, rtx target) ++{ ++ registered_function &rfn = *(*registered_functions)[code]; ++ if (!check_required_extensions (EXPR_LOCATION (exp), rfn.decl, ++ rfn.required_extensions)) ++ return target; ++ return function_expander (rfn.instance, rfn.decl, exp, target).expand (); ++} ++ ++/* Return true if TYPE is the ABI-defined __SVBool_t type. */ ++bool ++svbool_type_p (const_tree type) ++{ ++ tree abi_type = abi_vector_types[VECTOR_TYPE_svbool_t]; ++ return (type != error_mark_node ++ && TYPE_MAIN_VARIANT (type) == TYPE_MAIN_VARIANT (abi_type)); ++} ++ ++/* If TYPE is a built-in type defined by the SVE ABI, return the mangled name, ++ otherwise return NULL. 
*/ ++const char * ++mangle_builtin_type (const_tree type) ++{ ++ if (type == error_mark_node) ++ return NULL; ++ ++ vector_type_index vtype = find_vector_type (type); ++ if (vtype != NUM_VECTOR_TYPES) ++ return vector_types[vtype].mangled_name; ++ ++ return NULL; ++} ++ ++/* If TYPE is one of the ABI-defined SVE vector types, or an ACLE-defined ++ tuple of them, return the number of vectors it contains. Return 0 ++ otherwise. */ ++unsigned int ++nvectors_if_data_type (const_tree type) ++{ ++ if (type == error_mark_node) ++ return 0; ++ ++ type = TYPE_MAIN_VARIANT (type); ++ if (VECTOR_TYPE_P (type)) ++ { ++ vector_type_index type_id = find_vector_type (type); ++ if (type_id != VECTOR_TYPE_svbool_t && type_id != NUM_VECTOR_TYPES) ++ return 1; ++ } ++ else if (TREE_CODE (type) == RECORD_TYPE) ++ { ++ for (unsigned int size_i = 1; size_i < MAX_TUPLE_SIZE; ++size_i) ++ for (unsigned int type_i = 0; type_i < NUM_VECTOR_TYPES; ++type_i) ++ { ++ tree tuple_type = acle_vector_types[size_i][type_i]; ++ if (tuple_type && type == TYPE_MAIN_VARIANT (tuple_type)) ++ return size_i + 1; ++ } ++ } ++ ++ return 0; ++} ++ ++/* Return true if TYPE is a built-in type defined by the SVE ABI. */ ++bool ++builtin_type_p (const_tree type) ++{ ++ return svbool_type_p (type) || nvectors_if_data_type (type) > 0; ++} ++ ++} ++ ++using namespace aarch64_sve; ++ ++inline void ++gt_ggc_mx (function_instance *) ++{ ++} ++ ++inline void ++gt_pch_nx (function_instance *) ++{ ++} ++ ++inline void ++gt_pch_nx (function_instance *, void (*) (void *, void *), void *) ++{ ++} ++ ++#include "gt-aarch64-sve-builtins.h" +diff --git a/gcc/config/aarch64/aarch64-sve-builtins.def b/gcc/config/aarch64/aarch64-sve-builtins.def +new file mode 100644 +index 000000000..83fba0d41 +--- /dev/null ++++ b/gcc/config/aarch64/aarch64-sve-builtins.def +@@ -0,0 +1,100 @@ ++/* Builtin lists for AArch64 SVE ++ Copyright (C) 2018-2019 Free Software Foundation, Inc. ++ ++ This file is part of GCC. ++ ++ GCC is free software; you can redistribute it and/or modify it ++ under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 3, or (at your option) ++ any later version. ++ ++ GCC is distributed in the hope that it will be useful, but ++ WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ General Public License for more details. ++ ++ You should have received a copy of the GNU General Public License ++ along with GCC; see the file COPYING3. If not see ++ . 
*/ ++ ++#ifndef DEF_SVE_MODE ++#define DEF_SVE_MODE(A, B, C, D) ++#endif ++ ++#ifndef DEF_SVE_TYPE ++#define DEF_SVE_TYPE(A, B, C, D) ++#endif ++ ++#ifndef DEF_SVE_TYPE_SUFFIX ++#define DEF_SVE_TYPE_SUFFIX(A, B, C, D, E) ++#endif ++ ++#ifndef DEF_SVE_FUNCTION ++#define DEF_SVE_FUNCTION(A, B, C, D) ++#endif ++ ++DEF_SVE_MODE (n, none, none, none) ++DEF_SVE_MODE (index, none, none, elements) ++DEF_SVE_MODE (offset, none, none, bytes) ++DEF_SVE_MODE (s32index, none, svint32_t, elements) ++DEF_SVE_MODE (s32offset, none, svint32_t, bytes) ++DEF_SVE_MODE (s64index, none, svint64_t, elements) ++DEF_SVE_MODE (s64offset, none, svint64_t, bytes) ++DEF_SVE_MODE (u32base, svuint32_t, none, none) ++DEF_SVE_MODE (u32base_index, svuint32_t, none, elements) ++DEF_SVE_MODE (u32base_offset, svuint32_t, none, bytes) ++DEF_SVE_MODE (u32base_s32index, svuint32_t, svint32_t, elements) ++DEF_SVE_MODE (u32base_s32offset, svuint32_t, svint32_t, bytes) ++DEF_SVE_MODE (u32base_u32index, svuint32_t, svuint32_t, elements) ++DEF_SVE_MODE (u32base_u32offset, svuint32_t, svuint32_t, bytes) ++DEF_SVE_MODE (u32index, none, svuint32_t, elements) ++DEF_SVE_MODE (u32offset, none, svuint32_t, bytes) ++DEF_SVE_MODE (u64base, svuint64_t, none, none) ++DEF_SVE_MODE (u64base_index, svuint64_t, none, elements) ++DEF_SVE_MODE (u64base_offset, svuint64_t, none, bytes) ++DEF_SVE_MODE (u64base_s64index, svuint64_t, svint64_t, elements) ++DEF_SVE_MODE (u64base_s64offset, svuint64_t, svint64_t, bytes) ++DEF_SVE_MODE (u64base_u64index, svuint64_t, svuint64_t, elements) ++DEF_SVE_MODE (u64base_u64offset, svuint64_t, svuint64_t, bytes) ++DEF_SVE_MODE (u64index, none, svuint64_t, elements) ++DEF_SVE_MODE (u64offset, none, svuint64_t, bytes) ++DEF_SVE_MODE (vnum, none, none, vectors) ++ ++DEF_SVE_TYPE (svbool_t, 10, __SVBool_t, boolean_type_node) ++DEF_SVE_TYPE (svbfloat16_t, 14, __SVBfloat16_t, aarch64_bf16_type_node) ++DEF_SVE_TYPE (svfloat16_t, 13, __SVFloat16_t, aarch64_fp16_type_node) ++DEF_SVE_TYPE (svfloat32_t, 13, __SVFloat32_t, float_type_node) ++DEF_SVE_TYPE (svfloat64_t, 13, __SVFloat64_t, double_type_node) ++DEF_SVE_TYPE (svint8_t, 10, __SVInt8_t, intQI_type_node) ++DEF_SVE_TYPE (svint16_t, 11, __SVInt16_t, intHI_type_node) ++DEF_SVE_TYPE (svint32_t, 11, __SVInt32_t, intSI_type_node) ++DEF_SVE_TYPE (svint64_t, 11, __SVInt64_t, intDI_type_node) ++DEF_SVE_TYPE (svuint8_t, 11, __SVUint8_t, unsigned_intQI_type_node) ++DEF_SVE_TYPE (svuint16_t, 12, __SVUint16_t, unsigned_intHI_type_node) ++DEF_SVE_TYPE (svuint32_t, 12, __SVUint32_t, unsigned_intSI_type_node) ++DEF_SVE_TYPE (svuint64_t, 12, __SVUint64_t, unsigned_intDI_type_node) ++ ++DEF_SVE_TYPE_SUFFIX (b, svbool_t, bool, 8, VNx16BImode) ++DEF_SVE_TYPE_SUFFIX (b8, svbool_t, bool, 8, VNx16BImode) ++DEF_SVE_TYPE_SUFFIX (b16, svbool_t, bool, 16, VNx8BImode) ++DEF_SVE_TYPE_SUFFIX (b32, svbool_t, bool, 32, VNx4BImode) ++DEF_SVE_TYPE_SUFFIX (b64, svbool_t, bool, 64, VNx2BImode) ++DEF_SVE_TYPE_SUFFIX (bf16, svbfloat16_t, bfloat, 16, VNx8BFmode) ++DEF_SVE_TYPE_SUFFIX (f16, svfloat16_t, float, 16, VNx8HFmode) ++DEF_SVE_TYPE_SUFFIX (f32, svfloat32_t, float, 32, VNx4SFmode) ++DEF_SVE_TYPE_SUFFIX (f64, svfloat64_t, float, 64, VNx2DFmode) ++DEF_SVE_TYPE_SUFFIX (s8, svint8_t, signed, 8, VNx16QImode) ++DEF_SVE_TYPE_SUFFIX (s16, svint16_t, signed, 16, VNx8HImode) ++DEF_SVE_TYPE_SUFFIX (s32, svint32_t, signed, 32, VNx4SImode) ++DEF_SVE_TYPE_SUFFIX (s64, svint64_t, signed, 64, VNx2DImode) ++DEF_SVE_TYPE_SUFFIX (u8, svuint8_t, unsigned, 8, VNx16QImode) ++DEF_SVE_TYPE_SUFFIX (u16, svuint16_t, unsigned, 
16, VNx8HImode) ++DEF_SVE_TYPE_SUFFIX (u32, svuint32_t, unsigned, 32, VNx4SImode) ++DEF_SVE_TYPE_SUFFIX (u64, svuint64_t, unsigned, 64, VNx2DImode) ++ ++#include "aarch64-sve-builtins-base.def" ++ ++#undef DEF_SVE_FUNCTION ++#undef DEF_SVE_TYPE_SUFFIX ++#undef DEF_SVE_TYPE ++#undef DEF_SVE_MODE +diff --git a/gcc/config/aarch64/aarch64-sve-builtins.h b/gcc/config/aarch64/aarch64-sve-builtins.h +new file mode 100644 +index 000000000..d1aa612b9 +--- /dev/null ++++ b/gcc/config/aarch64/aarch64-sve-builtins.h +@@ -0,0 +1,878 @@ ++/* ACLE support for AArch64 SVE ++ Copyright (C) 2018-2019 Free Software Foundation, Inc. ++ ++ This file is part of GCC. ++ ++ GCC is free software; you can redistribute it and/or modify it ++ under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 3, or (at your option) ++ any later version. ++ ++ GCC is distributed in the hope that it will be useful, but ++ WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ General Public License for more details. ++ ++ You should have received a copy of the GNU General Public License ++ along with GCC; see the file COPYING3. If not see ++ . */ ++ ++#ifndef GCC_AARCH64_SVE_BUILTINS_H ++#define GCC_AARCH64_SVE_BUILTINS_H ++ ++/* The full name of an SVE ACLE function is the concatenation of: ++ ++ - the base name ("svadd", etc.) ++ - the "mode" suffix ("_n", "_index", etc.) ++ - the type suffixes ("_s32", "_b8", etc.) ++ - the predication suffix ("_x", "_z", etc.) ++ ++ Each piece of information is individually useful, so we retain this ++ classification throughout: ++ ++ - function_base represents the base name ++ ++ - mode_suffix_index represents the mode suffix ++ ++ - type_suffix_index represents individual type suffixes, while ++ type_suffix_pair represents a pair of them ++ ++ - prediction_index extends the predication suffix with an additional ++ alternative: PRED_implicit for implicitly-predicated operations ++ ++ In addition to its unique full name, a function may have a shorter ++ overloaded alias. This alias removes pieces of the suffixes that ++ can be inferred from the arguments, such as by shortening the mode ++ suffix or dropping some of the type suffixes. The base name and the ++ predication suffix stay the same. ++ ++ The function_shape class describes what arguments a given function ++ takes and what its overloaded alias is called. In broad terms, ++ function_base describes how the underlying instruction behaves while ++ function_shape describes how that instruction has been presented at ++ the language level. ++ ++ The static list of functions uses function_group to describe a group ++ of related functions. The function_builder class is responsible for ++ expanding this static description into a list of individual functions ++ and registering the associated built-in functions. function_instance ++ describes one of these individual functions in terms of the properties ++ described above. 
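++
++   (Illustrative example, not part of the original comment: under this
++   scheme the full name
++
++     svadd_n_s32_m
++
++   combines the base name "svadd", the mode suffix "_n", the type
++   suffix "_s32" and the predication suffix "_m"; its overloaded alias
++   is plain svadd_m, with "_n" and "_s32" inferred from the argument
++   types.)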
++ ++ The classes involved in compiling a function call are: ++ ++ - function_resolver, which resolves an overloaded function call to a ++ specific function_instance and its associated function decl ++ ++ - function_checker, which checks whether the values of the arguments ++ conform to the ACLE specification ++ ++ - gimple_folder, which tries to fold a function call at the gimple level ++ ++ - function_expander, which expands a function call into rtl instructions ++ ++ function_resolver and function_checker operate at the language level ++ and so are associated with the function_shape. gimple_folder and ++ function_expander are concerned with the behavior of the function ++ and so are associated with the function_base. ++ ++ Note that we've specifically chosen not to fold calls in the frontend, ++ since SVE intrinsics will hardly ever fold a useful language-level ++ constant. */ ++namespace aarch64_sve ++{ ++/* The maximum number of vectors in an ACLE tuple type. */ ++const unsigned int MAX_TUPLE_SIZE = 4; ++ ++/* Used to represent the default merge argument index for _m functions. ++ The actual index depends on how many arguments the function takes. */ ++const unsigned int DEFAULT_MERGE_ARGNO = ~0U; ++ ++/* Flags that describe what a function might do, in addition to reading ++ its arguments and returning a result. */ ++const unsigned int CP_READ_FPCR = 1U << 0; ++const unsigned int CP_RAISE_FP_EXCEPTIONS = 1U << 1; ++const unsigned int CP_READ_MEMORY = 1U << 2; ++const unsigned int CP_PREFETCH_MEMORY = 1U << 3; ++const unsigned int CP_WRITE_MEMORY = 1U << 4; ++const unsigned int CP_READ_FFR = 1U << 5; ++const unsigned int CP_WRITE_FFR = 1U << 6; ++ ++/* Enumerates the SVE predicate and (data) vector types, together called ++ "vector types" for brevity. */ ++enum vector_type_index ++{ ++#define DEF_SVE_TYPE(ACLE_NAME, NCHARS, ABI_NAME, SCALAR_TYPE) \ ++ VECTOR_TYPE_ ## ACLE_NAME, ++#include "aarch64-sve-builtins.def" ++ NUM_VECTOR_TYPES ++}; ++ ++/* Classifies the available measurement units for an address displacement. */ ++enum units_index ++{ ++ UNITS_none, ++ UNITS_bytes, ++ UNITS_elements, ++ UNITS_vectors ++}; ++ ++/* Describes the various uses of a governing predicate. */ ++enum predication_index ++{ ++ /* No governing predicate is present. */ ++ PRED_none, ++ ++ /* A governing predicate is present but there is no predication suffix ++ associated with it. This is used when the result is neither a vector ++ nor a predicate, since the distinction between "zeroing" and "merging" ++ doesn't apply in that case. It is also used when a suffix would be ++ redundant (such as for loads and comparisons, which are inherently ++ zeroing operations). */ ++ PRED_implicit, ++ ++ /* Merging predication: copy inactive lanes from the first data argument ++ to the vector result. */ ++ PRED_m, ++ ++ /* "Don't care" predication: set inactive lanes of the vector result ++ to arbitrary values. */ ++ PRED_x, ++ ++ /* Zero predication: set inactive lanes of the vector result to zero. */ ++ PRED_z, ++ ++ NUM_PREDS ++}; ++ ++/* Classifies element types, based on type suffixes with the bit count ++ removed. */ ++enum type_class_index ++{ ++ TYPE_bool, ++ TYPE_bfloat, ++ TYPE_float, ++ TYPE_signed, ++ TYPE_unsigned, ++ NUM_TYPE_CLASSES ++}; ++ ++/* Classifies an operation into "modes"; for example, to distinguish ++ vector-scalar operations from vector-vector operations, or to ++ distinguish between different addressing modes. 
This classification ++ accounts for the function suffixes that occur between the base name ++ and the first type suffix. */ ++enum mode_suffix_index ++{ ++#define DEF_SVE_MODE(NAME, BASE, DISPLACEMENT, UNITS) MODE_##NAME, ++#include "aarch64-sve-builtins.def" ++ MODE_none ++}; ++ ++/* Enumerates the possible type suffixes. Each suffix is associated with ++ a vector type, but for predicates provides extra information about the ++ element size. */ ++enum type_suffix_index ++{ ++#define DEF_SVE_TYPE_SUFFIX(NAME, ACLE_TYPE, CLASS, BITS, MODE) \ ++ TYPE_SUFFIX_ ## NAME, ++#include "aarch64-sve-builtins.def" ++ NUM_TYPE_SUFFIXES ++}; ++ ++/* Combines two type suffixes. */ ++typedef enum type_suffix_index type_suffix_pair[2]; ++ ++class function_base; ++class function_shape; ++ ++/* Static information about a mode suffix. */ ++struct mode_suffix_info ++{ ++ /* The suffix string itself. */ ++ const char *string; ++ ++ /* The type of the vector base address, or NUM_VECTOR_TYPES if the ++ mode does not include a vector base address. */ ++ vector_type_index base_vector_type; ++ ++ /* The type of the vector displacement, or NUM_VECTOR_TYPES if the ++ mode does not include a vector displacement. (Note that scalar ++ displacements are always int64_t.) */ ++ vector_type_index displacement_vector_type; ++ ++ /* The units in which the vector or scalar displacement is measured, ++ or UNITS_none if the mode doesn't take a displacement. */ ++ units_index displacement_units; ++}; ++ ++/* Static information about a type suffix. */ ++struct type_suffix_info ++{ ++ /* The suffix string itself. */ ++ const char *string; ++ ++ /* The associated ACLE vector or predicate type. */ ++ vector_type_index vector_type : 8; ++ ++ /* What kind of type the suffix represents. */ ++ type_class_index tclass : 8; ++ ++ /* The number of bits and bytes in an element. For predicates this ++ measures the associated data elements. */ ++ unsigned int element_bits : 8; ++ unsigned int element_bytes : 8; ++ ++ /* True if the suffix is for an integer type. */ ++ unsigned int integer_p : 1; ++ /* True if the suffix is for an unsigned type. */ ++ unsigned int unsigned_p : 1; ++ /* True if the suffix is for a floating-point type. */ ++ unsigned int float_p : 1; ++ /* True if the suffix is for a boolean type. */ ++ unsigned int bool_p : 1; ++ unsigned int spare : 12; ++ ++ /* The associated vector or predicate mode. */ ++ machine_mode vector_mode : 16; ++}; ++ ++/* Static information about a set of functions. */ ++struct function_group_info ++{ ++ /* The base name, as a string. */ ++ const char *base_name; ++ ++ /* Describes the behavior associated with the function base name. */ ++ const function_base *const *base; ++ ++ /* The shape of the functions, as described above the class definition. ++ It's possible to have entries with the same base name but different ++ shapes. */ ++ const function_shape *const *shape; ++ ++ /* A list of the available type suffixes, and of the available predication ++ types. The function supports every combination of the two. ++ ++ The list of type suffixes is terminated by two NUM_TYPE_SUFFIXES ++ while the list of predication types is terminated by NUM_PREDS. ++ The list of type suffixes is lexicographically ordered based ++ on the index value. */ ++ const type_suffix_pair *types; ++ const predication_index *preds; ++ ++ /* The architecture extensions that the functions require, as a set of ++ AARCH64_FL_* flags. */ ++ uint64_t required_extensions; ++}; ++ ++/* Describes a single fully-resolved function (i.e. 
one that has a ++ unique full name). */ ++class GTY((user)) function_instance ++{ ++public: ++ function_instance (const char *, const function_base *, ++ const function_shape *, mode_suffix_index, ++ const type_suffix_pair &, predication_index); ++ ++ bool operator== (const function_instance &) const; ++ bool operator!= (const function_instance &) const; ++ hashval_t hash () const; ++ ++ unsigned int call_properties () const; ++ bool reads_global_state_p () const; ++ bool modifies_global_state_p () const; ++ bool could_trap_p () const; ++ ++ unsigned int vectors_per_tuple () const; ++ tree memory_scalar_type () const; ++ machine_mode memory_vector_mode () const; ++ ++ const mode_suffix_info &mode_suffix () const; ++ tree base_vector_type () const; ++ tree displacement_vector_type () const; ++ units_index displacement_units () const; ++ ++ const type_suffix_info &type_suffix (unsigned int) const; ++ tree scalar_type (unsigned int) const; ++ tree vector_type (unsigned int) const; ++ tree tuple_type (unsigned int) const; ++ unsigned int elements_per_vq (unsigned int i) const; ++ machine_mode vector_mode (unsigned int) const; ++ machine_mode gp_mode (unsigned int) const; ++ ++ /* The properties of the function. (The explicit "enum"s are required ++ for gengtype.) */ ++ const char *base_name; ++ const function_base *base; ++ const function_shape *shape; ++ enum mode_suffix_index mode_suffix_id; ++ type_suffix_pair type_suffix_ids; ++ enum predication_index pred; ++}; ++ ++class registered_function; ++ ++/* A class for building and registering function decls. */ ++class function_builder ++{ ++public: ++ function_builder (); ++ ~function_builder (); ++ ++ void add_unique_function (const function_instance &, tree, ++ vec &, uint64_t, bool); ++ void add_overloaded_function (const function_instance &, uint64_t); ++ void add_overloaded_functions (const function_group_info &, ++ mode_suffix_index); ++ ++ void register_function_group (const function_group_info &); ++ ++private: ++ void append_name (const char *); ++ char *finish_name (); ++ ++ char *get_name (const function_instance &, bool); ++ ++ tree get_attributes (const function_instance &); ++ ++ registered_function &add_function (const function_instance &, ++ const char *, tree, tree, uint64_t, bool); ++ ++ /* The function type to use for functions that are resolved by ++ function_resolver. */ ++ tree m_overload_type; ++ ++ /* True if we should create a separate decl for each instance of an ++ overloaded function, instead of using function_resolver. */ ++ bool m_direct_overloads; ++ ++ /* Used for building up function names. */ ++ obstack m_string_obstack; ++ ++ /* Maps all overloaded function names that we've registered so far ++ to their associated function_instances. */ ++ hash_map m_overload_names; ++}; ++ ++/* A base class for handling calls to built-in functions. */ ++class function_call_info : public function_instance ++{ ++public: ++ function_call_info (location_t, const function_instance &, tree); ++ ++ bool function_returns_void_p (); ++ ++ /* The location of the call. */ ++ location_t location; ++ ++ /* The FUNCTION_DECL that is being called. */ ++ tree fndecl; ++}; ++ ++/* A class for resolving an overloaded function call. 
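++
++   (Illustrative note, not taken from the original source: this is the
++   class that resolves a call such as svadd_m (pg, a, b), where A and
++   B are svint32_t values, to the full function svadd_s32_m.)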
*/ ++class function_resolver : public function_call_info ++{ ++public: ++ enum { SAME_SIZE = 256, HALF_SIZE, QUARTER_SIZE }; ++ static const type_class_index SAME_TYPE_CLASS = NUM_TYPE_CLASSES; ++ ++ function_resolver (location_t, const function_instance &, tree, ++ vec &); ++ ++ tree get_vector_type (type_suffix_index); ++ const char *get_scalar_type_name (type_suffix_index); ++ tree get_argument_type (unsigned int); ++ bool scalar_argument_p (unsigned int); ++ ++ tree report_no_such_form (type_suffix_index); ++ tree lookup_form (mode_suffix_index, ++ type_suffix_index = NUM_TYPE_SUFFIXES, ++ type_suffix_index = NUM_TYPE_SUFFIXES); ++ tree resolve_to (mode_suffix_index, ++ type_suffix_index = NUM_TYPE_SUFFIXES, ++ type_suffix_index = NUM_TYPE_SUFFIXES); ++ ++ type_suffix_index infer_integer_scalar_type (unsigned int); ++ type_suffix_index infer_pointer_type (unsigned int, bool = false); ++ type_suffix_index infer_vector_or_tuple_type (unsigned int, unsigned int); ++ type_suffix_index infer_vector_type (unsigned int); ++ type_suffix_index infer_integer_vector_type (unsigned int); ++ type_suffix_index infer_unsigned_vector_type (unsigned int); ++ type_suffix_index infer_sd_vector_type (unsigned int); ++ type_suffix_index infer_tuple_type (unsigned int); ++ ++ bool require_vector_or_scalar_type (unsigned int); ++ ++ bool require_vector_type (unsigned int, vector_type_index); ++ bool require_matching_vector_type (unsigned int, type_suffix_index); ++ bool require_derived_vector_type (unsigned int, unsigned int, ++ type_suffix_index, ++ type_class_index = SAME_TYPE_CLASS, ++ unsigned int = SAME_SIZE); ++ ++ bool require_scalar_type (unsigned int, const char *); ++ bool require_pointer_type (unsigned int); ++ bool require_matching_integer_scalar_type (unsigned int, unsigned int, ++ type_suffix_index); ++ bool require_derived_scalar_type (unsigned int, type_class_index, ++ unsigned int = SAME_SIZE); ++ bool require_matching_pointer_type (unsigned int, unsigned int, ++ type_suffix_index); ++ bool require_integer_immediate (unsigned int); ++ ++ vector_type_index infer_vector_base_type (unsigned int); ++ vector_type_index infer_vector_displacement_type (unsigned int); ++ ++ mode_suffix_index resolve_sv_displacement (unsigned int, ++ type_suffix_index, bool); ++ mode_suffix_index resolve_gather_address (unsigned int, ++ type_suffix_index, bool); ++ mode_suffix_index resolve_adr_address (unsigned int); ++ ++ bool check_num_arguments (unsigned int); ++ bool check_gp_argument (unsigned int, unsigned int &, unsigned int &); ++ tree resolve_unary (type_class_index = SAME_TYPE_CLASS, ++ unsigned int = SAME_SIZE, bool = false); ++ tree resolve_uniform (unsigned int, unsigned int = 0); ++ tree resolve_uniform_opt_n (unsigned int); ++ tree finish_opt_n_resolution (unsigned int, unsigned int, type_suffix_index, ++ type_class_index = SAME_TYPE_CLASS, ++ unsigned int = SAME_SIZE, ++ type_suffix_index = NUM_TYPE_SUFFIXES); ++ ++ tree resolve (); ++ ++private: ++ /* The arguments to the overloaded function. */ ++ vec &m_arglist; ++}; ++ ++/* A class for checking that the semantic constraints on a function call are ++ satisfied, such as arguments being integer constant expressions with ++ a particular range. The parent class's FNDECL is the decl that was ++ called in the original source, before overload resolution. 
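++
++   (Illustrative note, not taken from the original source: a typical
++   check is that the lane argument of svmla_lane_f32 is an integer
++   constant expression in the range [0, 3], i.e. a valid lane index
++   within a 128-bit quadword of float elements.)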
*/ ++class function_checker : public function_call_info ++{ ++public: ++ function_checker (location_t, const function_instance &, tree, ++ tree, unsigned int, tree *); ++ ++ bool require_immediate_either_or (unsigned int, HOST_WIDE_INT, ++ HOST_WIDE_INT); ++ bool require_immediate_enum (unsigned int, tree); ++ bool require_immediate_lane_index (unsigned int, unsigned int = 1); ++ bool require_immediate_one_of (unsigned int, HOST_WIDE_INT, HOST_WIDE_INT, ++ HOST_WIDE_INT, HOST_WIDE_INT); ++ bool require_immediate_range (unsigned int, HOST_WIDE_INT, HOST_WIDE_INT); ++ ++ bool check (); ++ ++private: ++ bool argument_exists_p (unsigned int); ++ ++ bool require_immediate (unsigned int, HOST_WIDE_INT &); ++ ++ /* The type of the resolved function. */ ++ tree m_fntype; ++ ++ /* The arguments to the function. */ ++ unsigned int m_nargs; ++ tree *m_args; ++ ++ /* The first argument not associated with the function's predication ++ type. */ ++ unsigned int m_base_arg; ++}; ++ ++/* A class for folding a gimple function call. */ ++class gimple_folder : public function_call_info ++{ ++public: ++ gimple_folder (const function_instance &, tree, ++ gimple_stmt_iterator *, gcall *); ++ ++ tree convert_pred (gimple_seq &, tree, unsigned int); ++ tree fold_contiguous_base (gimple_seq &, tree); ++ tree load_store_cookie (tree); ++ ++ gimple *redirect_call (const function_instance &); ++ gimple *fold_to_pfalse (); ++ gimple *fold_to_ptrue (); ++ gimple *fold_to_vl_pred (unsigned int); ++ ++ gimple *fold (); ++ ++ /* Where to insert extra statements that feed the final replacement. */ ++ gimple_stmt_iterator *gsi; ++ ++ /* The call we're folding. */ ++ gcall *call; ++ ++ /* The result of the call, or null if none. */ ++ tree lhs; ++}; ++ ++/* A class for expanding a function call into RTL. 
*/ ++class function_expander : public function_call_info ++{ ++public: ++ function_expander (const function_instance &, tree, tree, rtx); ++ rtx expand (); ++ ++ insn_code direct_optab_handler (optab, unsigned int = 0); ++ insn_code direct_optab_handler_for_sign (optab, optab, unsigned int = 0, ++ machine_mode = E_VOIDmode); ++ ++ bool overlaps_input_p (rtx); ++ ++ rtx get_contiguous_base (machine_mode); ++ rtx get_fallback_value (machine_mode, unsigned int, ++ unsigned int, unsigned int &); ++ rtx get_reg_target (); ++ rtx get_nonoverlapping_reg_target (); ++ ++ void add_output_operand (insn_code); ++ void add_input_operand (insn_code, rtx); ++ void add_integer_operand (HOST_WIDE_INT); ++ void add_mem_operand (machine_mode, rtx); ++ void add_address_operand (rtx); ++ void add_fixed_operand (rtx); ++ rtx generate_insn (insn_code); ++ ++ void prepare_gather_address_operands (unsigned int, bool = true); ++ void prepare_prefetch_operands (); ++ void add_ptrue_hint (unsigned int, machine_mode); ++ void rotate_inputs_left (unsigned int, unsigned int); ++ bool try_negating_argument (unsigned int, machine_mode); ++ ++ rtx use_exact_insn (insn_code); ++ rtx use_unpred_insn (insn_code); ++ rtx use_pred_x_insn (insn_code); ++ rtx use_cond_insn (insn_code, unsigned int = DEFAULT_MERGE_ARGNO); ++ rtx use_vcond_mask_insn (insn_code, unsigned int = DEFAULT_MERGE_ARGNO); ++ rtx use_contiguous_load_insn (insn_code); ++ rtx use_contiguous_prefetch_insn (insn_code); ++ rtx use_contiguous_store_insn (insn_code); ++ ++ rtx map_to_rtx_codes (rtx_code, rtx_code, int, ++ unsigned int = DEFAULT_MERGE_ARGNO); ++ rtx map_to_unspecs (int, int, int, unsigned int = DEFAULT_MERGE_ARGNO); ++ rtx expand_signed_unpred_op (rtx_code, rtx_code); ++ ++ /* The function call expression. */ ++ tree call_expr; ++ ++ /* For functions that return a value, this is the preferred location ++ of that value. It could be null or could have a different mode ++ from the function return type. */ ++ rtx possible_target; ++ ++ /* The expanded arguments. */ ++ auto_vec args; ++ ++private: ++ /* Used to build up the operands to an instruction. */ ++ auto_vec m_ops; ++}; ++ ++/* Provides information about a particular function base name, and handles ++ tasks related to the base name. */ ++class function_base ++{ ++public: ++ /* Return a set of CP_* flags that describe what the function might do, ++ in addition to reading its arguments and returning a result. */ ++ virtual unsigned int call_properties (const function_instance &) const; ++ ++ /* If the function operates on tuples of vectors, return the number ++ of vectors in the tuples, otherwise return 1. */ ++ virtual unsigned int vectors_per_tuple () const { return 1; } ++ ++ /* If the function addresses memory, return the type of a single ++ scalar memory element. */ ++ virtual tree ++ memory_scalar_type (const function_instance &) const ++ { ++ gcc_unreachable (); ++ } ++ ++ /* If the function addresses memory, return a vector mode whose ++ GET_MODE_NUNITS is the number of elements addressed and whose ++ GET_MODE_INNER is the mode of a single scalar memory element. */ ++ virtual machine_mode ++ memory_vector_mode (const function_instance &) const ++ { ++ gcc_unreachable (); ++ } ++ ++ /* Try to fold the given gimple call. Return the new gimple statement ++ on success, otherwise return null. */ ++ virtual gimple *fold (gimple_folder &) const { return NULL; } ++ ++ /* Expand the given call into rtl. 
Return the result of the function, ++ or an arbitrary value if the function doesn't return a result. */ ++ virtual rtx expand (function_expander &) const = 0; ++}; ++ ++/* Classifies functions into "shapes". The idea is to take all the ++ type signatures for a set of functions, remove the governing predicate ++ (if any), and classify what's left based on: ++ ++ - the number of arguments ++ ++ - the process of determining the types in the signature from the mode ++ and type suffixes in the function name (including types that are not ++ affected by the suffixes) ++ ++ - which arguments must be integer constant expressions, and what range ++ those arguments have ++ ++ - the process for mapping overloaded names to "full" names. */ ++class function_shape ++{ ++public: ++ virtual bool explicit_type_suffix_p (unsigned int) const = 0; ++ ++ /* Define all functions associated with the given group. */ ++ virtual void build (function_builder &, ++ const function_group_info &) const = 0; ++ ++ /* Try to resolve the overloaded call. Return the non-overloaded ++ function decl on success and error_mark_node on failure. */ ++ virtual tree resolve (function_resolver &) const = 0; ++ ++ /* Check whether the given call is semantically valid. Return true ++ if it is, otherwise report an error and return false. */ ++ virtual bool check (function_checker &) const { return true; } ++}; ++ ++/* RAII class for enabling enough SVE features to define the built-in ++ types and implement the arm_sve.h pragma. */ ++class sve_switcher ++{ ++public: ++ sve_switcher (); ++ ~sve_switcher (); ++ ++private: ++ unsigned long m_old_isa_flags; ++ bool m_old_have_regs_of_mode[MAX_MACHINE_MODE]; ++}; ++ ++extern const type_suffix_info type_suffixes[NUM_TYPE_SUFFIXES + 1]; ++extern const mode_suffix_info mode_suffixes[MODE_none + 1]; ++ ++extern tree scalar_types[NUM_VECTOR_TYPES]; ++extern tree acle_vector_types[MAX_TUPLE_SIZE][NUM_VECTOR_TYPES + 1]; ++extern tree acle_svpattern; ++extern tree acle_svprfop; ++ ++/* Return the ACLE type svbool_t. */ ++inline tree ++get_svbool_t (void) ++{ ++ return acle_vector_types[0][VECTOR_TYPE_svbool_t]; ++} ++ ++/* Try to find a mode with the given mode_suffix_info fields. Return the ++ mode on success or MODE_none on failure. */ ++inline mode_suffix_index ++find_mode_suffix (vector_type_index base_vector_type, ++ vector_type_index displacement_vector_type, ++ units_index displacement_units) ++{ ++ for (unsigned int mode_i = 0; mode_i < ARRAY_SIZE (mode_suffixes); ++mode_i) ++ { ++ const mode_suffix_info &mode = mode_suffixes[mode_i]; ++ if (mode.base_vector_type == base_vector_type ++ && mode.displacement_vector_type == displacement_vector_type ++ && mode.displacement_units == displacement_units) ++ return mode_suffix_index (mode_i); ++ } ++ return MODE_none; ++} ++ ++/* Return the type suffix associated with ELEMENT_BITS-bit elements of type ++ class TCLASS. */ ++inline type_suffix_index ++find_type_suffix (type_class_index tclass, unsigned int element_bits) ++{ ++ for (unsigned int i = 0; i < NUM_TYPE_SUFFIXES; ++i) ++ if (type_suffixes[i].tclass == tclass ++ && type_suffixes[i].element_bits == element_bits) ++ return type_suffix_index (i); ++ gcc_unreachable (); ++} ++ ++/* Return the single field in tuple type TYPE. 
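++
++   (Illustrative note, not taken from the original source: for a tuple
++   type such as svint32x2_t this returns the FIELD_DECL of the "__val"
++   array that register_tuple_type created.)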
*/ ++inline tree ++tuple_type_field (tree type) ++{ ++ for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field)) ++ if (TREE_CODE (field) == FIELD_DECL) ++ return field; ++ gcc_unreachable (); ++} ++ ++inline function_instance:: ++function_instance (const char *base_name_in, ++ const function_base *base_in, ++ const function_shape *shape_in, ++ mode_suffix_index mode_suffix_id_in, ++ const type_suffix_pair &type_suffix_ids_in, ++ predication_index pred_in) ++ : base_name (base_name_in), base (base_in), shape (shape_in), ++ mode_suffix_id (mode_suffix_id_in), pred (pred_in) ++{ ++ memcpy (type_suffix_ids, type_suffix_ids_in, sizeof (type_suffix_ids)); ++} ++ ++inline bool ++function_instance::operator== (const function_instance &other) const ++{ ++ return (base == other.base ++ && shape == other.shape ++ && mode_suffix_id == other.mode_suffix_id ++ && pred == other.pred ++ && type_suffix_ids[0] == other.type_suffix_ids[0] ++ && type_suffix_ids[1] == other.type_suffix_ids[1]); ++} ++ ++inline bool ++function_instance::operator!= (const function_instance &other) const ++{ ++ return !operator== (other); ++} ++ ++/* If the function operates on tuples of vectors, return the number ++ of vectors in the tuples, otherwise return 1. */ ++inline unsigned int ++function_instance::vectors_per_tuple () const ++{ ++ return base->vectors_per_tuple (); ++} ++ ++/* If the function addresses memory, return the type of a single ++ scalar memory element. */ ++inline tree ++function_instance::memory_scalar_type () const ++{ ++ return base->memory_scalar_type (*this); ++} ++ ++/* If the function addresses memory, return a vector mode whose ++ GET_MODE_NUNITS is the number of elements addressed and whose ++ GET_MODE_INNER is the mode of a single scalar memory element. */ ++inline machine_mode ++function_instance::memory_vector_mode () const ++{ ++ return base->memory_vector_mode (*this); ++} ++ ++/* Return information about the function's mode suffix. */ ++inline const mode_suffix_info & ++function_instance::mode_suffix () const ++{ ++ return mode_suffixes[mode_suffix_id]; ++} ++ ++/* Return the type of the function's vector base address argument, ++ or null it doesn't have a vector base address. */ ++inline tree ++function_instance::base_vector_type () const ++{ ++ return acle_vector_types[0][mode_suffix ().base_vector_type]; ++} ++ ++/* Return the type of the function's vector index or offset argument, ++ or null if doesn't have a vector index or offset argument. */ ++inline tree ++function_instance::displacement_vector_type () const ++{ ++ return acle_vector_types[0][mode_suffix ().displacement_vector_type]; ++} ++ ++/* If the function takes a vector or scalar displacement, return the units ++ in which the displacement is measured, otherwise return UNITS_none. */ ++inline units_index ++function_instance::displacement_units () const ++{ ++ return mode_suffix ().displacement_units; ++} ++ ++/* Return information about type suffix I. */ ++inline const type_suffix_info & ++function_instance::type_suffix (unsigned int i) const ++{ ++ return type_suffixes[type_suffix_ids[i]]; ++} ++ ++/* Return the scalar type associated with type suffix I. */ ++inline tree ++function_instance::scalar_type (unsigned int i) const ++{ ++ return scalar_types[type_suffix (i).vector_type]; ++} ++ ++/* Return the vector type associated with type suffix I. 
*/ ++inline tree ++function_instance::vector_type (unsigned int i) const ++{ ++ return acle_vector_types[0][type_suffix (i).vector_type]; ++} ++ ++/* If the function operates on tuples of vectors, return the tuple type ++ associated with type suffix I, otherwise return the vector type associated ++ with type suffix I. */ ++inline tree ++function_instance::tuple_type (unsigned int i) const ++{ ++ unsigned int num_vectors = vectors_per_tuple (); ++ return acle_vector_types[num_vectors - 1][type_suffix (i).vector_type]; ++} ++ ++/* Return the number of elements of type suffix I that fit within a ++ 128-bit block. */ ++inline unsigned int ++function_instance::elements_per_vq (unsigned int i) const ++{ ++ return 128 / type_suffix (i).element_bits; ++} ++ ++/* Return the vector or predicate mode associated with type suffix I. */ ++inline machine_mode ++function_instance::vector_mode (unsigned int i) const ++{ ++ return type_suffix (i).vector_mode; ++} ++ ++/* Return the mode of the governing predicate to use when operating on ++ type suffix I. */ ++inline machine_mode ++function_instance::gp_mode (unsigned int i) const ++{ ++ return aarch64_sve_pred_mode (type_suffix (i).element_bytes).require (); ++} ++ ++/* Return true if the function has no return value. */ ++inline bool ++function_call_info::function_returns_void_p () ++{ ++ return TREE_TYPE (TREE_TYPE (fndecl)) == void_type_node; ++} ++ ++/* Default implementation of function::call_properties, with conservatively ++ correct behavior for floating-point instructions. */ ++inline unsigned int ++function_base::call_properties (const function_instance &instance) const ++{ ++ unsigned int flags = 0; ++ if (instance.type_suffix (0).float_p || instance.type_suffix (1).float_p) ++ flags |= CP_READ_FPCR | CP_RAISE_FP_EXCEPTIONS; ++ return flags; ++} ++ ++} ++ ++#endif +diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md +index 02d33b727..11198e8a9 100644 +--- a/gcc/config/aarch64/aarch64-sve.md ++++ b/gcc/config/aarch64/aarch64-sve.md +@@ -18,8 +18,168 @@ + ;; along with GCC; see the file COPYING3. If not see + ;; . 
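The builtins header above identifies each function by a base name, a mode suffix, type suffixes and a predication suffix, and function_shape/function_resolver map the overloaded ACLE names onto those full names. As a rough user-level illustration of that naming scheme (not part of this patch; the wrapper function names are invented, and it assumes a toolchain that already provides arm_sve.h):

  #include <arm_sve.h>

  /* Overloaded form: the resolver infers the type suffix from the
     argument types, so this call stands for svadd_s32_x.  */
  svint32_t
  add_x (svbool_t pg, svint32_t a, svint32_t b)
  {
    return svadd_x (pg, a, b);
  }

  /* Full name: base "add", type suffix "_s32", predication "_m",
     meaning inactive lanes keep the value of the first vector input.  */
  svint32_t
  add_m (svbool_t pg, svint32_t a, svint32_t b)
  {
    return svadd_s32_m (pg, a, b);
  }

Compiling such code needs SVE enabled, for example with -march=armv8.2-a+sve.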
+ +-;; Note on the handling of big-endian SVE +-;; -------------------------------------- ++;; The file is organised into the following sections (search for the full ++;; line): ++;; ++;; == General notes ++;; ---- Note on the handling of big-endian SVE ++;; ---- Description of UNSPEC_PTEST ++;; ---- Description of UNSPEC_PRED_Z ++;; ---- Note on predicated integer arithemtic and UNSPEC_PRED_X ++;; ---- Note on predicated FP arithmetic patterns and GP "strictness" ++;; ---- Note on FFR handling ++;; ++;; == Moves ++;; ---- Moves of single vectors ++;; ---- Moves of multiple vectors ++;; ---- Moves of predicates ++;; ---- Moves relating to the FFR ++;; ++;; == Loads ++;; ---- Normal contiguous loads ++;; ---- Extending contiguous loads ++;; ---- First-faulting contiguous loads ++;; ---- First-faulting extending contiguous loads ++;; ---- Non-temporal contiguous loads ++;; ---- Normal gather loads ++;; ---- Extending gather loads ++;; ---- First-faulting gather loads ++;; ---- First-faulting extending gather loads ++;; ++;; == Prefetches ++;; ---- Contiguous prefetches ++;; ---- Gather prefetches ++;; ++;; == Stores ++;; ---- Normal contiguous stores ++;; ---- Truncating contiguous stores ++;; ---- Non-temporal contiguous stores ++;; ---- Normal scatter stores ++;; ---- Truncating scatter stores ++;; ++;; == Vector creation ++;; ---- [INT,FP] Duplicate element ++;; ---- [INT,FP] Initialize from individual elements ++;; ---- [INT] Linear series ++;; ---- [PRED] Duplicate element ++;; ++;; == Vector decomposition ++;; ---- [INT,FP] Extract index ++;; ---- [INT,FP] Extract active element ++;; ---- [PRED] Extract index ++;; ++;; == Unary arithmetic ++;; ---- [INT] General unary arithmetic corresponding to rtx codes ++;; ---- [INT] General unary arithmetic corresponding to unspecs ++;; ---- [INT] Sign extension ++;; ---- [INT] Zero extension ++;; ---- [INT] Logical inverse ++;; ---- [FP<-INT] General unary arithmetic that maps to unspecs ++;; ---- [FP] General unary arithmetic corresponding to unspecs ++;; ---- [PRED] Inverse ++ ++;; == Binary arithmetic ++;; ---- [INT] General binary arithmetic corresponding to rtx codes ++;; ---- [INT] Addition ++;; ---- [INT] Subtraction ++;; ---- [INT] Take address ++;; ---- [INT] Absolute difference ++;; ---- [INT] Saturating addition and subtraction ++;; ---- [INT] Highpart multiplication ++;; ---- [INT] Division ++;; ---- [INT] Binary logical operations ++;; ---- [INT] Binary logical operations (inverted second input) ++;; ---- [INT] Shifts (rounding towards -Inf) ++;; ---- [INT] Shifts (rounding towards 0) ++;; ---- [FP<-INT] General binary arithmetic corresponding to unspecs ++;; ---- [FP] General binary arithmetic corresponding to rtx codes ++;; ---- [FP] General binary arithmetic corresponding to unspecs ++;; ---- [FP] Addition ++;; ---- [FP] Complex addition ++;; ---- [FP] Subtraction ++;; ---- [FP] Absolute difference ++;; ---- [FP] Multiplication ++;; ---- [FP] Binary logical operations ++;; ---- [FP] Sign copying ++;; ---- [FP] Maximum and minimum ++;; ---- [PRED] Binary logical operations ++;; ---- [PRED] Binary logical operations (inverted second input) ++;; ---- [PRED] Binary logical operations (inverted result) ++;; ++;; == Ternary arithmetic ++;; ---- [INT] MLA and MAD ++;; ---- [INT] MLS and MSB ++;; ---- [INT] Dot product ++;; ---- [INT] Sum of absolute differences ++;; ---- [INT] Matrix multiply-accumulate ++;; ---- [FP] General ternary arithmetic corresponding to unspecs ++;; ---- [FP] Complex multiply-add ++;; ---- [FP] Trigonometric 
multiply-add ++;; ---- [FP] Bfloat16 long ternary arithmetic (SF,BF,BF) ++;; ---- [FP] Matrix multiply-accumulate ++;; ++;; == Comparisons and selects ++;; ---- [INT,FP] Select based on predicates ++;; ---- [INT,FP] Compare and select ++;; ---- [INT] Comparisons ++;; ---- [INT] While tests ++;; ---- [FP] Direct comparisons ++;; ---- [FP] Absolute comparisons ++;; ---- [PRED] Select ++;; ---- [PRED] Test bits ++;; ++;; == Reductions ++;; ---- [INT,FP] Conditional reductions ++;; ---- [INT] Tree reductions ++;; ---- [FP] Tree reductions ++;; ---- [FP] Left-to-right reductions ++;; ++;; == Permutes ++;; ---- [INT,FP] General permutes ++;; ---- [INT,FP] Special-purpose unary permutes ++;; ---- [INT,FP] Special-purpose binary permutes ++;; ---- [PRED] Special-purpose unary permutes ++;; ---- [PRED] Special-purpose binary permutes ++;; ++;; == Conversions ++;; ---- [INT<-INT] Packs ++;; ---- [INT<-INT] Unpacks ++;; ---- [INT<-FP] Conversions ++;; ---- [INT<-FP] Packs ++;; ---- [INT<-FP] Unpacks ++;; ---- [FP<-INT] Conversions ++;; ---- [FP<-INT] Packs ++;; ---- [FP<-INT] Unpacks ++;; ---- [FP<-FP] Packs ++;; ---- [FP<-FP] Packs (bfloat16) ++;; ---- [FP<-FP] Unpacks ++;; ---- [PRED<-PRED] Packs ++;; ---- [PRED<-PRED] Unpacks ++;; ++;; == Vector partitioning ++;; ---- [PRED] Unary partitioning ++;; ---- [PRED] Binary partitioning ++;; ---- [PRED] Scalarization ++;; ++;; == Counting elements ++;; ---- [INT] Count elements in a pattern (scalar) ++;; ---- [INT] Increment by the number of elements in a pattern (scalar) ++;; ---- [INT] Increment by the number of elements in a pattern (vector) ++;; ---- [INT] Decrement by the number of elements in a pattern (scalar) ++;; ---- [INT] Decrement by the number of elements in a pattern (vector) ++;; ---- [INT] Count elements in a predicate (scalar) ++;; ---- [INT] Increment by the number of elements in a predicate (scalar) ++;; ---- [INT] Increment by the number of elements in a predicate (vector) ++;; ---- [INT] Decrement by the number of elements in a predicate (scalar) ++;; ---- [INT] Decrement by the number of elements in a predicate (vector) ++ ++;; ========================================================================= ++;; == General notes ++;; ========================================================================= ++;; ++;; ------------------------------------------------------------------------- ++;; ---- Note on the handling of big-endian SVE ++;; ------------------------------------------------------------------------- + ;; + ;; On big-endian systems, Advanced SIMD mov patterns act in the + ;; same way as movdi or movti would: the first byte of memory goes +@@ -59,12 +219,339 @@ + ;; the order of the bytes within the elements is different. We instead + ;; access spill slots via LD1 and ST1, using secondary reloads to + ;; reserve a predicate register. ++;; ++;; ------------------------------------------------------------------------- ++;; ---- Description of UNSPEC_PTEST ++;; ------------------------------------------------------------------------- ++;; ++;; SVE provides a PTEST instruction for testing the active lanes of a ++;; predicate and setting the flags based on the result. 
The associated ++;; condition code tests are: ++;; ++;; - any (= ne): at least one active bit is set ++;; - none (= eq): all active bits are clear (*) ++;; - first (= mi): the first active bit is set ++;; - nfrst (= pl): the first active bit is clear (*) ++;; - last (= cc): the last active bit is set ++;; - nlast (= cs): the last active bit is clear (*) ++;; ++;; where the conditions marked (*) are also true when there are no active ++;; lanes (i.e. when the governing predicate is a PFALSE). The flags results ++;; of a PTEST use the condition code mode CC_NZC. ++;; ++;; PTEST is always a .B operation (i.e. it always operates on VNx16BI). ++;; This means that for other predicate modes, we need a governing predicate ++;; in which all bits are defined. ++;; ++;; For example, most predicated .H operations ignore the odd bits of the ++;; governing predicate, so that an active lane is represented by the ++;; bits "1x" and an inactive lane by the bits "0x", where "x" can be ++;; any value. To test a .H predicate, we instead need "10" and "00" ++;; respectively, so that the condition only tests the even bits of the ++;; predicate. ++;; ++;; Several instructions set the flags as a side-effect, in the same way ++;; that a separate PTEST would. It's important for code quality that we ++;; use these flags results as often as possible, particularly in the case ++;; of WHILE* and RDFFR. ++;; ++;; Also, some of the instructions that set the flags are unpredicated ++;; and instead implicitly test all .B, .H, .S or .D elements, as though ++;; they were predicated on a PTRUE of that size. For example, a .S ++;; WHILELO sets the flags in the same way as a PTEST with a .S PTRUE ++;; would. ++;; ++;; We therefore need to represent PTEST operations in a way that ++;; makes it easy to combine them with both predicated and unpredicated ++;; operations, while using a VNx16BI governing predicate for all ++;; predicate modes. We do this using: ++;; ++;; (unspec:CC_NZC [gp cast_gp ptrue_flag op] UNSPEC_PTEST) ++;; ++;; where: ++;; ++;; - GP is the real VNx16BI governing predicate ++;; ++;; - CAST_GP is GP cast to the mode of OP. All bits dropped by casting ++;; GP to CAST_GP are guaranteed to be clear in GP. ++;; ++;; - PTRUE_FLAG is a CONST_INT (conceptually of mode SI) that has the value ++;; SVE_KNOWN_PTRUE if we know that CAST_GP (rather than GP) is all-true and ++;; SVE_MAYBE_NOT_PTRUE otherwise. ++;; ++;; - OP is the predicate we want to test, of the same mode as CAST_GP. ++;; ++;; ------------------------------------------------------------------------- ++;; ---- Description of UNSPEC_PRED_Z ++;; ------------------------------------------------------------------------- ++;; ++;; SVE integer comparisons are predicated and return zero for inactive ++;; lanes. Sometimes we use them with predicates that are all-true and ++;; sometimes we use them with general predicates. ++;; ++;; The integer comparisons also set the flags and so build-in the effect ++;; of a PTEST. We therefore want to be able to combine integer comparison ++;; patterns with PTESTs of the result. One difficulty with doing this is ++;; that (as noted above) the PTEST is always a .B operation and so can place ++;; stronger requirements on the governing predicate than the comparison does. ++;; ++;; For example, when applying a separate PTEST to the result of a full-vector ++;; .H comparison, the PTEST must be predicated on a .H PTRUE instead of a ++;; .B PTRUE. 
In constrast, the comparison might be predicated on either ++;; a .H PTRUE or a .B PTRUE, since the values of odd-indexed predicate ++;; bits don't matter for .H operations. ++;; ++;; We therefore can't rely on a full-vector comparison using the same ++;; predicate register as a following PTEST. We instead need to remember ++;; whether a comparison is known to be a full-vector comparison and use ++;; this information in addition to a check for equal predicate registers. ++;; At the same time, it's useful to have a common representation for all ++;; integer comparisons, so that they can be handled by a single set of ++;; patterns. ++;; ++;; We therefore take a similar approach to UNSPEC_PTEST above and use: ++;; ++;; (unspec: [gp ptrue_flag (code:M op0 op1)] UNSPEC_PRED_Z) ++;; ++;; where: ++;; ++;; - GP is the governing predicate, of mode ++;; ++;; - PTRUE_FLAG is a CONST_INT (conceptually of mode SI) that has the value ++;; SVE_KNOWN_PTRUE if we know that GP is all-true and SVE_MAYBE_NOT_PTRUE ++;; otherwise ++;; ++;; - CODE is the comparison code ++;; ++;; - OP0 and OP1 are the values being compared, of mode M ++;; ++;; The "Z" in UNSPEC_PRED_Z indicates that inactive lanes are zero. ++;; ++;; ------------------------------------------------------------------------- ++;; ---- Note on predicated integer arithemtic and UNSPEC_PRED_X ++;; ------------------------------------------------------------------------- ++;; ++;; Many SVE integer operations are predicated. We can generate them ++;; from four sources: ++;; ++;; (1) Using normal unpredicated optabs. In this case we need to create ++;; an all-true predicate register to act as the governing predicate ++;; for the SVE instruction. There are no inactive lanes, and thus ++;; the values of inactive lanes don't matter. ++;; ++;; (2) Using _x ACLE functions. In this case the function provides a ++;; specific predicate and some lanes might be inactive. However, ++;; as for (1), the values of the inactive lanes don't matter. ++;; We can make extra lanes active without changing the behavior ++;; (although for code-quality reasons we should avoid doing so ++;; needlessly). ++;; ++;; (3) Using cond_* optabs that correspond to IFN_COND_* internal functions. ++;; These optabs have a predicate operand that specifies which lanes are ++;; active and another operand that provides the values of inactive lanes. ++;; ++;; (4) Using _m and _z ACLE functions. These functions map to the same ++;; patterns as (3), with the _z functions setting inactive lanes to zero ++;; and the _m functions setting the inactive lanes to one of the function ++;; arguments. ++;; ++;; For (1) and (2) we need a way of attaching the predicate to a normal ++;; unpredicated integer operation. We do this using: ++;; ++;; (unspec:M [pred (code:M (op0 op1 ...))] UNSPEC_PRED_X) ++;; ++;; where (code:M (op0 op1 ...)) is the normal integer operation and PRED ++;; is a predicate of mode . PRED might or might not be a PTRUE; ++;; it always is for (1), but might not be for (2). ++;; ++;; The unspec as a whole has the same value as (code:M ...) when PRED is ++;; all-true. It is always semantically valid to replace PRED with a PTRUE, ++;; but as noted above, we should only do so if there's a specific benefit. ++;; ++;; (The "_X" in the unspec is named after the ACLE functions in (2).) 
++;; ++;; For (3) and (4) we can simply use the SVE port's normal representation ++;; of a predicate-based select: ++;; ++;; (unspec:M [pred (code:M (op0 op1 ...)) inactive] UNSPEC_SEL) ++;; ++;; where INACTIVE specifies the values of inactive lanes. ++;; ++;; We can also use the UNSPEC_PRED_X wrapper in the UNSPEC_SEL rather ++;; than inserting the integer operation directly. This is mostly useful ++;; if we want the combine pass to merge an integer operation with an explicit ++;; vcond_mask (in other words, with a following SEL instruction). However, ++;; it's generally better to merge such operations at the gimple level ++;; using (3). ++;; ++;; ------------------------------------------------------------------------- ++;; ---- Note on predicated FP arithmetic patterns and GP "strictness" ++;; ------------------------------------------------------------------------- ++;; ++;; Most SVE floating-point operations are predicated. We can generate ++;; them from four sources: ++;; ++;; (1) Using normal unpredicated optabs. In this case we need to create ++;; an all-true predicate register to act as the governing predicate ++;; for the SVE instruction. There are no inactive lanes, and thus ++;; the values of inactive lanes don't matter. ++;; ++;; (2) Using _x ACLE functions. In this case the function provides a ++;; specific predicate and some lanes might be inactive. However, ++;; as for (1), the values of the inactive lanes don't matter. ++;; ++;; The instruction must have the same exception behavior as the ++;; function call unless things like command-line flags specifically ++;; allow otherwise. For example, with -ffast-math, it is OK to ++;; raise exceptions for inactive lanes, but normally it isn't. ++;; ++;; (3) Using cond_* optabs that correspond to IFN_COND_* internal functions. ++;; These optabs have a predicate operand that specifies which lanes are ++;; active and another operand that provides the values of inactive lanes. ++;; ++;; (4) Using _m and _z ACLE functions. These functions map to the same ++;; patterns as (3), with the _z functions setting inactive lanes to zero ++;; and the _m functions setting the inactive lanes to one of the function ++;; arguments. ++;; ++;; So: ++;; ++;; - In (1), the predicate is known to be all true and the pattern can use ++;; unpredicated operations where available. ++;; ++;; - In (2), the predicate might or might not be all true. The pattern can ++;; use unpredicated instructions if the predicate is all-true or if things ++;; like command-line flags allow exceptions for inactive lanes. ++;; ++;; - (3) and (4) represent a native SVE predicated operation. Some lanes ++;; might be inactive and inactive lanes of the result must have specific ++;; values. There is no scope for using unpredicated instructions (and no ++;; reason to want to), so the question about command-line flags doesn't ++;; arise. ++;; ++;; It would be inaccurate to model (2) as an rtx code like (sqrt ...) ++;; in combination with a separate predicate operand, e.g. ++;; ++;; (unspec [(match_operand: 1 "register_operand" "Upl") ++;; (sqrt:SVE_FULL_F 2 "register_operand" "w")] ++;; ....) ++;; ++;; because (sqrt ...) can raise an exception for any lane, including ++;; inactive ones. We therefore need to use an unspec instead. ++;; ++;; Also, (2) requires some way of distinguishing the case in which the ++;; predicate might have inactive lanes and cannot be changed from the ++;; case in which the predicate has no inactive lanes or can be changed. 
++;; This information is also useful when matching combined FP patterns ++;; in which the predicates might not be equal. ++;; ++;; We therefore model FP operations as an unspec of the form: ++;; ++;; (unspec [pred strictness op0 op1 ...] UNSPEC_COND_) ++;; ++;; where: ++;; ++;; - PRED is the governing predicate. ++;; ++;; - STRICTNESS is a CONST_INT that conceptually has mode SI. It has the ++;; value SVE_STRICT_GP if PRED might have inactive lanes and if those ++;; lanes must remain inactive. It has the value SVE_RELAXED_GP otherwise. ++;; ++;; - OP0 OP1 ... are the normal input operands to the operation. ++;; ++;; - MNEMONIC is the mnemonic of the associated SVE instruction. ++;; ++;; ------------------------------------------------------------------------- ++;; ---- Note on FFR handling ++;; ------------------------------------------------------------------------- ++;; ++;; Logically we want to divide FFR-related instructions into regions ++;; that contain exactly one of: ++;; ++;; - a single write to the FFR ++;; - any number of reads from the FFR (but only one read is likely) ++;; - any number of LDFF1 and LDNF1 instructions ++;; ++;; However, LDFF1 and LDNF1 instructions should otherwise behave like ++;; normal loads as far as possible. This means that they should be ++;; schedulable within a region in the same way that LD1 would be, ++;; and they should be deleted as dead if the result is unused. The loads ++;; should therefore not write to the FFR, since that would both serialize ++;; the loads with respect to each other and keep the loads live for any ++;; later RDFFR. ++;; ++;; We get around this by using a fake "FFR token" (FFRT) to help describe ++;; the dependencies. Writing to the FFRT starts a new "FFRT region", ++;; while using the FFRT keeps the instruction within its region. ++;; Specifically: ++;; ++;; - Writes start a new FFRT region as well as setting the FFR: ++;; ++;; W1: parallel (FFRT = , FFR = ) ++;; ++;; - Loads use an LD1-like instruction that also uses the FFRT, so that the ++;; loads stay within the same FFRT region: ++;; ++;; L1: load data while using the FFRT ++;; ++;; In addition, any FFRT region that includes a load also has at least one ++;; instance of: ++;; ++;; L2: FFR = update(FFR, FFRT) [type == no_insn] ++;; ++;; to make it clear that the region both reads from and writes to the FFR. ++;; ++;; - Reads do the following: ++;; ++;; R1: FFRT = FFR [type == no_insn] ++;; R2: read from the FFRT ++;; R3: FFRT = update(FFRT) [type == no_insn] ++;; ++;; R1 and R3 both create new FFRT regions, so that previous LDFF1s and ++;; LDNF1s cannot move forwards across R1 and later LDFF1s and LDNF1s ++;; cannot move backwards across R3. ++;; ++;; This way, writes are only kept alive by later loads or reads, ++;; and write/read pairs fold normally. For two consecutive reads, ++;; the first R3 is made dead by the second R1, which in turn becomes ++;; redundant with the first R1. We then have: ++;; ++;; first R1: FFRT = FFR ++;; first read from the FFRT ++;; second read from the FFRT ++;; second R3: FFRT = update(FFRT) ++;; ++;; i.e. the two FFRT regions collapse into a single one with two ++;; independent reads. ++;; ++;; The model still prevents some valid optimizations though. For example, ++;; if all loads in an FFRT region are deleted as dead, nothing would remove ++;; the L2 instructions. 
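The FFR note above is easier to follow next to the source-level idiom it supports: set the FFR, issue a first-faulting load, then read the FFR back to see which lanes completed. The sketch below is illustrative only (the function name is invented) and assumes a toolchain that provides the SVE ACLE in arm_sve.h:

  #include <arm_sve.h>

  /* One FFR region as described above: svsetffr is the write (W1),
     svldff1_u32 is a first-faulting load inside the region, and
     svrdffr_z reads the FFR back (R1-R3) restricted to the active lanes.  */
  svuint32_t
  ldff1_region (svbool_t pg, const uint32_t *base, svbool_t *loaded)
  {
    svsetffr ();
    svuint32_t data = svldff1_u32 (pg, base);
    *loaded = svrdffr_z (pg);   /* active lanes whose loads completed  */
    return data;
  }

In rtl terms, svsetffr uses the SETFFR alternative of aarch64_wrffr below, the load stays within the FFRT region as described, and svrdffr_z maps to the aarch64_rdffr_z pattern.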
++ ++;; ========================================================================= ++;; == Moves ++;; ========================================================================= ++ ++;; ------------------------------------------------------------------------- ++;; ---- Moves of single vectors ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - MOV (including aliases) ++;; - LD1B (contiguous form) ++;; - LD1D ( " " ) ++;; - LD1H ( " " ) ++;; - LD1W ( " " ) ++;; - LDR ++;; - ST1B (contiguous form) ++;; - ST1D ( " " ) ++;; - ST1H ( " " ) ++;; - ST1W ( " " ) ++;; - STR ++;; ------------------------------------------------------------------------- + +- +-;; SVE data moves. + (define_expand "mov" +- [(set (match_operand:SVE_ALL 0 "nonimmediate_operand") +- (match_operand:SVE_ALL 1 "general_operand"))] ++ [(set (match_operand:SVE_FULL 0 "nonimmediate_operand") ++ (match_operand:SVE_FULL 1 "general_operand"))] + "TARGET_SVE" + { + /* Use the predicated load and store patterns where possible. +@@ -72,7 +559,7 @@ + head of the file) and increases the addressing choices for + little-endian. */ + if ((MEM_P (operands[0]) || MEM_P (operands[1])) +- && can_create_pseudo_p ()) ++ && can_create_pseudo_p ()) + { + aarch64_expand_sve_mem_move (operands[0], operands[1], mode); + DONE; +@@ -80,47 +567,37 @@ + + if (CONSTANT_P (operands[1])) + { +- aarch64_expand_mov_immediate (operands[0], operands[1], +- gen_vec_duplicate); ++ aarch64_expand_mov_immediate (operands[0], operands[1]); + DONE; + } + + /* Optimize subregs on big-endian targets: we can use REV[BHW] + instead of going through memory. */ + if (BYTES_BIG_ENDIAN +- && aarch64_maybe_expand_sve_subreg_move (operands[0], operands[1])) ++ && aarch64_maybe_expand_sve_subreg_move (operands[0], operands[1])) + DONE; + } + ) + +-;; A pattern for optimizing SUBREGs that have a reinterpreting effect +-;; on big-endian targets; see aarch64_maybe_expand_sve_subreg_move +-;; for details. We use a special predicate for operand 2 to reduce +-;; the number of patterns. +-(define_insn_and_split "*aarch64_sve_mov_subreg_be" +- [(set (match_operand:SVE_ALL 0 "aarch64_sve_nonimmediate_operand" "=w") +- (unspec:SVE_ALL +- [(match_operand:VNx16BI 1 "register_operand" "Upl") +- (match_operand 2 "aarch64_any_register_operand" "w")] +- UNSPEC_REV_SUBREG))] +- "TARGET_SVE && BYTES_BIG_ENDIAN" +- "#" +- "&& reload_completed" +- [(const_int 0)] ++(define_expand "movmisalign" ++ [(set (match_operand:SVE_FULL 0 "nonimmediate_operand") ++ (match_operand:SVE_FULL 1 "general_operand"))] ++ "TARGET_SVE" + { +- aarch64_split_sve_subreg_move (operands[0], operands[1], operands[2]); ++ /* Equivalent to a normal move for our purpooses. */ ++ emit_move_insn (operands[0], operands[1]); + DONE; + } + ) + +-;; Unpredicated moves (little-endian). Only allow memory operations +-;; during and after RA; before RA we want the predicated load and +-;; store patterns to be used instead. ++;; Unpredicated moves (bytes or little-endian). Only allow memory operations ++;; during and after RA; before RA we want the predicated load and store ++;; patterns to be used instead. 
+ (define_insn "*aarch64_sve_mov_le" +- [(set (match_operand:SVE_ALL 0 "aarch64_sve_nonimmediate_operand" "=w, Utr, w, w") +- (match_operand:SVE_ALL 1 "aarch64_sve_general_operand" "Utr, w, w, Dn"))] ++ [(set (match_operand:SVE_FULL 0 "aarch64_sve_nonimmediate_operand" "=w, Utr, w, w") ++ (match_operand:SVE_FULL 1 "aarch64_sve_general_operand" "Utr, w, w, Dn"))] + "TARGET_SVE +- && !BYTES_BIG_ENDIAN ++ && (mode == VNx16QImode || !BYTES_BIG_ENDIAN) + && ((lra_in_progress || reload_completed) + || (register_operand (operands[0], mode) + && nonmemory_operand (operands[1], mode)))" +@@ -131,12 +608,12 @@ + * return aarch64_output_sve_mov_immediate (operands[1]);" + ) + +-;; Unpredicated moves (big-endian). Memory accesses require secondary ++;; Unpredicated moves (non-byte big-endian). Memory accesses require secondary + ;; reloads. + (define_insn "*aarch64_sve_mov_be" +- [(set (match_operand:SVE_ALL 0 "register_operand" "=w, w") +- (match_operand:SVE_ALL 1 "aarch64_nonmemory_operand" "w, Dn"))] +- "TARGET_SVE && BYTES_BIG_ENDIAN" ++ [(set (match_operand:SVE_FULL 0 "register_operand" "=w, w") ++ (match_operand:SVE_FULL 1 "aarch64_nonmemory_operand" "w, Dn"))] ++ "TARGET_SVE && BYTES_BIG_ENDIAN && mode != VNx16QImode" + "@ + mov\t%0.d, %1.d + * return aarch64_output_sve_mov_immediate (operands[1]);" +@@ -144,10 +621,11 @@ + + ;; Handle big-endian memory reloads. We use byte PTRUE for all modes + ;; to try to encourage reuse. ++;; This pattern needs constraints due to TARGET_SECONDARY_RELOAD hook. + (define_expand "aarch64_sve_reload_be" + [(parallel + [(set (match_operand 0) +- (match_operand 1)) ++ (match_operand 1)) + (clobber (match_operand:VNx16BI 2 "register_operand" "=Upl"))])] + "TARGET_SVE && BYTES_BIG_ENDIAN" + { +@@ -166,16 +644,15 @@ + } + ) + +-;; A predicated load or store for which the predicate is known to be +-;; all-true. Note that this pattern is generated directly by +-;; aarch64_emit_sve_pred_move, so changes to this pattern will +-;; need changes there as well. ++;; A predicated move in which the predicate is known to be all-true. ++;; Note that this pattern is generated directly by aarch64_emit_sve_pred_move, ++;; so changes to this pattern will need changes there as well. + (define_insn_and_split "@aarch64_pred_mov" +- [(set (match_operand:SVE_ALL 0 "nonimmediate_operand" "=w, w, m") +- (unspec:SVE_ALL ++ [(set (match_operand:SVE_FULL 0 "nonimmediate_operand" "=w, w, m") ++ (unspec:SVE_FULL + [(match_operand: 1 "register_operand" "Upl, Upl, Upl") +- (match_operand:SVE_ALL 2 "nonimmediate_operand" "w, m, w")] +- UNSPEC_MERGE_PTRUE))] ++ (match_operand:SVE_FULL 2 "nonimmediate_operand" "w, m, w")] ++ UNSPEC_PRED_X))] + "TARGET_SVE + && (register_operand (operands[0], mode) + || register_operand (operands[2], mode))" +@@ -188,152 +665,67 @@ + [(set (match_dup 0) (match_dup 2))] + ) + +-(define_expand "movmisalign" +- [(set (match_operand:SVE_ALL 0 "nonimmediate_operand") +- (match_operand:SVE_ALL 1 "general_operand"))] +- "TARGET_SVE" ++;; A pattern for optimizing SUBREGs that have a reinterpreting effect ++;; on big-endian targets; see aarch64_maybe_expand_sve_subreg_move ++;; for details. We use a special predicate for operand 2 to reduce ++;; the number of patterns. 
++(define_insn_and_split "*aarch64_sve_mov_subreg_be" ++ [(set (match_operand:SVE_FULL 0 "aarch64_sve_nonimmediate_operand" "=w") ++ (unspec:SVE_FULL ++ [(match_operand:VNx16BI 1 "register_operand" "Upl") ++ (match_operand 2 "aarch64_any_register_operand" "w")] ++ UNSPEC_REV_SUBREG))] ++ "TARGET_SVE && BYTES_BIG_ENDIAN" ++ "#" ++ "&& reload_completed" ++ [(const_int 0)] + { +- /* Equivalent to a normal move for our purpooses. */ +- emit_move_insn (operands[0], operands[1]); ++ aarch64_split_sve_subreg_move (operands[0], operands[1], operands[2]); + DONE; + } + ) + +-(define_insn "maskload" +- [(set (match_operand:SVE_ALL 0 "register_operand" "=w") +- (unspec:SVE_ALL +- [(match_operand: 2 "register_operand" "Upl") +- (match_operand:SVE_ALL 1 "memory_operand" "m")] +- UNSPEC_LD1_SVE))] +- "TARGET_SVE" +- "ld1\t%0., %2/z, %1" +-) +- +-(define_insn "maskstore" +- [(set (match_operand:SVE_ALL 0 "memory_operand" "+m") +- (unspec:SVE_ALL [(match_operand: 2 "register_operand" "Upl") +- (match_operand:SVE_ALL 1 "register_operand" "w") +- (match_dup 0)] +- UNSPEC_ST1_SVE))] +- "TARGET_SVE" +- "st1\t%1., %2, %0" +-) +- +-;; Unpredicated gather loads. +-(define_expand "gather_load" +- [(set (match_operand:SVE_SD 0 "register_operand") +- (unspec:SVE_SD +- [(match_dup 5) +- (match_operand:DI 1 "aarch64_reg_or_zero") +- (match_operand: 2 "register_operand") +- (match_operand:DI 3 "const_int_operand") +- (match_operand:DI 4 "aarch64_gather_scale_operand_") +- (mem:BLK (scratch))] +- UNSPEC_LD1_GATHER))] ++;; Reinterpret operand 1 in operand 0's mode, without changing its contents. ++;; This is equivalent to a subreg on little-endian targets but not for ++;; big-endian; see the comment at the head of the file for details. ++(define_expand "@aarch64_sve_reinterpret" ++ [(set (match_operand:SVE_FULL 0 "register_operand") ++ (unspec:SVE_FULL ++ [(match_operand 1 "aarch64_any_register_operand")] ++ UNSPEC_REINTERPRET))] + "TARGET_SVE" + { +- operands[5] = force_reg (mode, CONSTM1_RTX (mode)); ++ if (!BYTES_BIG_ENDIAN) ++ { ++ emit_move_insn (operands[0], gen_lowpart (mode, operands[1])); ++ DONE; ++ } + } + ) + +-;; Predicated gather loads for 32-bit elements. Operand 3 is true for +-;; unsigned extension and false for signed extension. +-(define_insn "mask_gather_load" +- [(set (match_operand:SVE_S 0 "register_operand" "=w, w, w, w, w") +- (unspec:SVE_S +- [(match_operand: 5 "register_operand" "Upl, Upl, Upl, Upl, Upl") +- (match_operand:DI 1 "aarch64_reg_or_zero" "Z, rk, rk, rk, rk") +- (match_operand: 2 "register_operand" "w, w, w, w, w") +- (match_operand:DI 3 "const_int_operand" "i, Z, Ui1, Z, Ui1") +- (match_operand:DI 4 "aarch64_gather_scale_operand_w" "Ui1, Ui1, Ui1, i, i") +- (mem:BLK (scratch))] +- UNSPEC_LD1_GATHER))] +- "TARGET_SVE" +- "@ +- ld1w\t%0.s, %5/z, [%2.s] +- ld1w\t%0.s, %5/z, [%1, %2.s, sxtw] +- ld1w\t%0.s, %5/z, [%1, %2.s, uxtw] +- ld1w\t%0.s, %5/z, [%1, %2.s, sxtw %p4] +- ld1w\t%0.s, %5/z, [%1, %2.s, uxtw %p4]" +-) +- +-;; Predicated gather loads for 64-bit elements. The value of operand 3 +-;; doesn't matter in this case. 
+-(define_insn "mask_gather_load" +- [(set (match_operand:SVE_D 0 "register_operand" "=w, w, w") +- (unspec:SVE_D +- [(match_operand: 5 "register_operand" "Upl, Upl, Upl") +- (match_operand:DI 1 "aarch64_reg_or_zero" "Z, rk, rk") +- (match_operand: 2 "register_operand" "w, w, w") +- (match_operand:DI 3 "const_int_operand") +- (match_operand:DI 4 "aarch64_gather_scale_operand_d" "Ui1, Ui1, i") +- (mem:BLK (scratch))] +- UNSPEC_LD1_GATHER))] +- "TARGET_SVE" +- "@ +- ld1d\t%0.d, %5/z, [%2.d] +- ld1d\t%0.d, %5/z, [%1, %2.d] +- ld1d\t%0.d, %5/z, [%1, %2.d, lsl %p4]" +-) +- +-;; Unpredicated scatter store. +-(define_expand "scatter_store" +- [(set (mem:BLK (scratch)) +- (unspec:BLK +- [(match_dup 5) +- (match_operand:DI 0 "aarch64_reg_or_zero") +- (match_operand: 1 "register_operand") +- (match_operand:DI 2 "const_int_operand") +- (match_operand:DI 3 "aarch64_gather_scale_operand_") +- (match_operand:SVE_SD 4 "register_operand")] +- UNSPEC_ST1_SCATTER))] ++;; A pattern for handling type punning on big-endian targets. We use a ++;; special predicate for operand 1 to reduce the number of patterns. ++(define_insn_and_split "*aarch64_sve_reinterpret" ++ [(set (match_operand:SVE_FULL 0 "register_operand" "=w") ++ (unspec:SVE_FULL ++ [(match_operand 1 "aarch64_any_register_operand" "w")] ++ UNSPEC_REINTERPRET))] + "TARGET_SVE" ++ "#" ++ "&& reload_completed" ++ [(set (match_dup 0) (match_dup 1))] + { +- operands[5] = force_reg (mode, CONSTM1_RTX (mode)); ++ operands[1] = aarch64_replace_reg_mode (operands[1], mode); + } + ) + +-;; Predicated scatter stores for 32-bit elements. Operand 2 is true for +-;; unsigned extension and false for signed extension. +-(define_insn "mask_scatter_store" +- [(set (mem:BLK (scratch)) +- (unspec:BLK +- [(match_operand: 5 "register_operand" "Upl, Upl, Upl, Upl, Upl") +- (match_operand:DI 0 "aarch64_reg_or_zero" "Z, rk, rk, rk, rk") +- (match_operand: 1 "register_operand" "w, w, w, w, w") +- (match_operand:DI 2 "const_int_operand" "i, Z, Ui1, Z, Ui1") +- (match_operand:DI 3 "aarch64_gather_scale_operand_w" "Ui1, Ui1, Ui1, i, i") +- (match_operand:SVE_S 4 "register_operand" "w, w, w, w, w")] +- UNSPEC_ST1_SCATTER))] +- "TARGET_SVE" +- "@ +- st1w\t%4.s, %5, [%1.s] +- st1w\t%4.s, %5, [%0, %1.s, sxtw] +- st1w\t%4.s, %5, [%0, %1.s, uxtw] +- st1w\t%4.s, %5, [%0, %1.s, sxtw %p3] +- st1w\t%4.s, %5, [%0, %1.s, uxtw %p3]" +-) +- +-;; Predicated scatter stores for 64-bit elements. The value of operand 2 +-;; doesn't matter in this case. +-(define_insn "mask_scatter_store" +- [(set (mem:BLK (scratch)) +- (unspec:BLK +- [(match_operand: 5 "register_operand" "Upl, Upl, Upl") +- (match_operand:DI 0 "aarch64_reg_or_zero" "Z, rk, rk") +- (match_operand: 1 "register_operand" "w, w, w") +- (match_operand:DI 2 "const_int_operand") +- (match_operand:DI 3 "aarch64_gather_scale_operand_d" "Ui1, Ui1, i") +- (match_operand:SVE_D 4 "register_operand" "w, w, w")] +- UNSPEC_ST1_SCATTER))] +- "TARGET_SVE" +- "@ +- st1d\t%4.d, %5, [%1.d] +- st1d\t%4.d, %5, [%0, %1.d] +- st1d\t%4.d, %5, [%0, %1.d, lsl %p3]" +-) ++;; ------------------------------------------------------------------------- ++;; ---- Moves of multiple vectors ++;; ------------------------------------------------------------------------- ++;; All patterns in this section are synthetic and split to real ++;; instructions after reload. ++;; ------------------------------------------------------------------------- + +-;; SVE structure moves. 
+ (define_expand "mov" + [(set (match_operand:SVE_STRUCT 0 "nonimmediate_operand") + (match_operand:SVE_STRUCT 1 "general_operand"))] +@@ -368,7 +760,7 @@ + + ;; Unpredicated structure moves (big-endian). Memory accesses require + ;; secondary reloads. +-(define_insn "*aarch64_sve_mov_le" ++(define_insn "*aarch64_sve_mov_be" + [(set (match_operand:SVE_STRUCT 0 "register_operand" "=w, w") + (match_operand:SVE_STRUCT 1 "aarch64_nonmemory_operand" "w, Dn"))] + "TARGET_SVE && BYTES_BIG_ENDIAN" +@@ -409,7 +801,7 @@ + (unspec:SVE_STRUCT + [(match_operand: 1 "register_operand" "Upl, Upl, Upl") + (match_operand:SVE_STRUCT 2 "aarch64_sve_struct_nonimmediate_operand" "w, Utx, w")] +- UNSPEC_MERGE_PTRUE))] ++ UNSPEC_PRED_X))] + "TARGET_SVE + && (register_operand (operands[0], mode) + || register_operand (operands[2], mode))" +@@ -432,6 +824,18 @@ + [(set_attr "length" "")] + ) + ++;; ------------------------------------------------------------------------- ++;; ---- Moves of predicates ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - MOV ++;; - LDR ++;; - PFALSE ++;; - PTRUE ++;; - PTRUES ++;; - STR ++;; ------------------------------------------------------------------------- ++ + (define_expand "mov" + [(set (match_operand:PRED_ALL 0 "nonimmediate_operand") + (match_operand:PRED_ALL 1 "general_operand"))] +@@ -439,12 +843,18 @@ + { + if (GET_CODE (operands[0]) == MEM) + operands[1] = force_reg (mode, operands[1]); ++ ++ if (CONSTANT_P (operands[1])) ++ { ++ aarch64_expand_mov_immediate (operands[0], operands[1]); ++ DONE; ++ } + } + ) + + (define_insn "*aarch64_sve_mov" +- [(set (match_operand:PRED_ALL 0 "nonimmediate_operand" "=Upa, m, Upa, Upa, Upa") +- (match_operand:PRED_ALL 1 "general_operand" "Upa, Upa, m, Dz, Dm"))] ++ [(set (match_operand:PRED_ALL 0 "nonimmediate_operand" "=Upa, m, Upa, Upa") ++ (match_operand:PRED_ALL 1 "aarch64_mov_operand" "Upa, Upa, m, Dn"))] + "TARGET_SVE + && (register_operand (operands[0], mode) + || register_operand (operands[1], mode))" +@@ -452,287 +862,296 @@ + mov\t%0.b, %1.b + str\t%1, %0 + ldr\t%0, %1 +- pfalse\t%0.b +- * return aarch64_output_ptrue (mode, '');" ++ * return aarch64_output_sve_mov_immediate (operands[1]);" + ) + +-;; Handle extractions from a predicate by converting to an integer vector +-;; and extracting from there. +-(define_expand "vec_extract" +- [(match_operand: 0 "register_operand") +- (match_operand: 1 "register_operand") +- (match_operand:SI 2 "nonmemory_operand") +- ;; Dummy operand to which we can attach the iterator. +- (reg:SVE_I V0_REGNUM)] ++;; Match PTRUES Pn.B when both the predicate and flags are useful. 
++(define_insn_and_rewrite "*aarch64_sve_ptruevnx16bi_cc" ++ [(set (reg:CC_NZC CC_REGNUM) ++ (unspec:CC_NZC ++ [(match_operand 2) ++ (match_operand 3) ++ (const_int SVE_KNOWN_PTRUE) ++ (match_operator:VNx16BI 1 "aarch64_sve_ptrue_svpattern_immediate" ++ [(unspec:VNx16BI ++ [(match_operand:SI 4 "const_int_operand") ++ (match_operand:VNx16BI 5 "aarch64_simd_imm_zero")] ++ UNSPEC_PTRUE)])] ++ UNSPEC_PTEST)) ++ (set (match_operand:VNx16BI 0 "register_operand" "=Upa") ++ (match_dup 1))] + "TARGET_SVE" + { +- rtx tmp = gen_reg_rtx (mode); +- emit_insn (gen_aarch64_sve_dup_const (tmp, operands[1], +- CONST1_RTX (mode), +- CONST0_RTX (mode))); +- emit_insn (gen_vec_extract (operands[0], tmp, operands[2])); +- DONE; ++ return aarch64_output_sve_ptrues (operands[1]); ++ } ++ "&& (!CONSTANT_P (operands[2]) || !CONSTANT_P (operands[3]))" ++ { ++ operands[2] = operands[3] = CONSTM1_RTX (VNx16BImode); + } + ) + +-(define_expand "vec_extract" +- [(set (match_operand: 0 "register_operand") +- (vec_select: +- (match_operand:SVE_ALL 1 "register_operand") +- (parallel [(match_operand:SI 2 "nonmemory_operand")])))] ++;; Match PTRUES Pn.[HSD] when both the predicate and flags are useful. ++(define_insn_and_rewrite "*aarch64_sve_ptrue_cc" ++ [(set (reg:CC_NZC CC_REGNUM) ++ (unspec:CC_NZC ++ [(match_operand 2) ++ (match_operand 3) ++ (const_int SVE_KNOWN_PTRUE) ++ (subreg:PRED_HSD ++ (match_operator:VNx16BI 1 "aarch64_sve_ptrue_svpattern_immediate" ++ [(unspec:VNx16BI ++ [(match_operand:SI 4 "const_int_operand") ++ (match_operand:PRED_HSD 5 "aarch64_simd_imm_zero")] ++ UNSPEC_PTRUE)]) 0)] ++ UNSPEC_PTEST)) ++ (set (match_operand:VNx16BI 0 "register_operand" "=Upa") ++ (match_dup 1))] + "TARGET_SVE" + { +- poly_int64 val; +- if (poly_int_rtx_p (operands[2], &val) +- && known_eq (val, GET_MODE_NUNITS (mode) - 1)) +- { +- /* The last element can be extracted with a LASTB and a false +- predicate. */ +- rtx sel = force_reg (mode, CONST0_RTX (mode)); +- emit_insn (gen_extract_last_ (operands[0], sel, operands[1])); +- DONE; +- } +- if (!CONST_INT_P (operands[2])) +- { +- /* Create an index with operand[2] as the base and -1 as the step. +- It will then be zero for the element we care about. */ +- rtx index = gen_lowpart (mode, operands[2]); +- index = force_reg (mode, index); +- rtx series = gen_reg_rtx (mode); +- emit_insn (gen_vec_series (series, index, constm1_rtx)); +- +- /* Get a predicate that is true for only that element. */ +- rtx zero = CONST0_RTX (mode); +- rtx cmp = gen_rtx_EQ (mode, series, zero); +- rtx sel = gen_reg_rtx (mode); +- emit_insn (gen_vec_cmp (sel, cmp, series, zero)); +- +- /* Select the element using LASTB. */ +- emit_insn (gen_extract_last_ (operands[0], sel, operands[1])); +- DONE; +- } +- } +-) +- +-;; Extract element zero. This is a special case because we want to force +-;; the registers to be the same for the second alternative, and then +-;; split the instruction into nothing after RA. 
+-(define_insn_and_split "*vec_extract_0" +- [(set (match_operand: 0 "aarch64_simd_nonimmediate_operand" "=r, w, Utv") +- (vec_select: +- (match_operand:SVE_ALL 1 "register_operand" "w, 0, w") +- (parallel [(const_int 0)])))] +- "TARGET_SVE" +- { +- operands[1] = gen_rtx_REG (mode, REGNO (operands[1])); +- switch (which_alternative) +- { +- case 0: +- return "umov\\t%0, %1.[0]"; +- case 1: +- return "#"; +- case 2: +- return "st1\\t{%1.}[0], %0"; +- default: +- gcc_unreachable (); +- } ++ return aarch64_output_sve_ptrues (operands[1]); + } +- "&& reload_completed +- && REG_P (operands[0]) +- && REGNO (operands[0]) == REGNO (operands[1])" +- [(const_int 0)] ++ "&& (!CONSTANT_P (operands[2]) || !CONSTANT_P (operands[3]))" + { +- emit_note (NOTE_INSN_DELETED); +- DONE; ++ operands[2] = CONSTM1_RTX (VNx16BImode); ++ operands[3] = CONSTM1_RTX (mode); + } +- [(set_attr "type" "neon_to_gp_q, untyped, neon_store1_one_lane_q")] + ) + +-;; Extract an element from the Advanced SIMD portion of the register. +-;; We don't just reuse the aarch64-simd.md pattern because we don't +-;; want any change in lane number on big-endian targets. +-(define_insn "*vec_extract_v128" +- [(set (match_operand: 0 "aarch64_simd_nonimmediate_operand" "=r, w, Utv") +- (vec_select: +- (match_operand:SVE_ALL 1 "register_operand" "w, w, w") +- (parallel [(match_operand:SI 2 "const_int_operand")])))] +- "TARGET_SVE +- && IN_RANGE (INTVAL (operands[2]) * GET_MODE_SIZE (mode), 1, 15)" ++;; Match PTRUES Pn.B when only the flags result is useful (which is ++;; a way of testing VL). ++(define_insn_and_rewrite "*aarch64_sve_ptruevnx16bi_ptest" ++ [(set (reg:CC_NZC CC_REGNUM) ++ (unspec:CC_NZC ++ [(match_operand 2) ++ (match_operand 3) ++ (const_int SVE_KNOWN_PTRUE) ++ (match_operator:VNx16BI 1 "aarch64_sve_ptrue_svpattern_immediate" ++ [(unspec:VNx16BI ++ [(match_operand:SI 4 "const_int_operand") ++ (match_operand:VNx16BI 5 "aarch64_simd_imm_zero")] ++ UNSPEC_PTRUE)])] ++ UNSPEC_PTEST)) ++ (clobber (match_scratch:VNx16BI 0 "=Upa"))] ++ "TARGET_SVE" + { +- operands[1] = gen_rtx_REG (mode, REGNO (operands[1])); +- switch (which_alternative) +- { +- case 0: +- return "umov\\t%0, %1.[%2]"; +- case 1: +- return "dup\\t%0, %1.[%2]"; +- case 2: +- return "st1\\t{%1.}[%2], %0"; +- default: +- gcc_unreachable (); +- } ++ return aarch64_output_sve_ptrues (operands[1]); + } +- [(set_attr "type" "neon_to_gp_q, neon_dup_q, neon_store1_one_lane_q")] +-) +- +-;; Extract an element in the range of DUP. This pattern allows the +-;; source and destination to be different. +-(define_insn "*vec_extract_dup" +- [(set (match_operand: 0 "register_operand" "=w") +- (vec_select: +- (match_operand:SVE_ALL 1 "register_operand" "w") +- (parallel [(match_operand:SI 2 "const_int_operand")])))] +- "TARGET_SVE +- && IN_RANGE (INTVAL (operands[2]) * GET_MODE_SIZE (mode), 16, 63)" ++ "&& (!CONSTANT_P (operands[2]) || !CONSTANT_P (operands[3]))" + { +- operands[0] = gen_rtx_REG (mode, REGNO (operands[0])); +- return "dup\t%0., %1.[%2]"; ++ operands[2] = operands[3] = CONSTM1_RTX (VNx16BImode); + } + ) + +-;; Extract an element outside the range of DUP. This pattern requires the +-;; source and destination to be the same. 
+-(define_insn "*vec_extract_ext" +- [(set (match_operand: 0 "register_operand" "=w") +- (vec_select: +- (match_operand:SVE_ALL 1 "register_operand" "0") +- (parallel [(match_operand:SI 2 "const_int_operand")])))] +- "TARGET_SVE && INTVAL (operands[2]) * GET_MODE_SIZE (mode) >= 64" ++;; Match PTRUES Pn.[HWD] when only the flags result is useful (which is ++;; a way of testing VL). ++(define_insn_and_rewrite "*aarch64_sve_ptrue_ptest" ++ [(set (reg:CC_NZC CC_REGNUM) ++ (unspec:CC_NZC ++ [(match_operand 2) ++ (match_operand 3) ++ (const_int SVE_KNOWN_PTRUE) ++ (subreg:PRED_HSD ++ (match_operator:VNx16BI 1 "aarch64_sve_ptrue_svpattern_immediate" ++ [(unspec:VNx16BI ++ [(match_operand:SI 4 "const_int_operand") ++ (match_operand:PRED_HSD 5 "aarch64_simd_imm_zero")] ++ UNSPEC_PTRUE)]) 0)] ++ UNSPEC_PTEST)) ++ (clobber (match_scratch:VNx16BI 0 "=Upa"))] ++ "TARGET_SVE" + { +- operands[0] = gen_rtx_REG (mode, REGNO (operands[0])); +- operands[2] = GEN_INT (INTVAL (operands[2]) * GET_MODE_SIZE (mode)); +- return "ext\t%0.b, %0.b, %0.b, #%2"; ++ return aarch64_output_sve_ptrues (operands[1]); + } +-) +- +-;; Extract the last active element of operand 1 into operand 0. +-;; If no elements are active, extract the last inactive element instead. +-(define_insn "extract_last_" +- [(set (match_operand: 0 "register_operand" "=r, w") +- (unspec: +- [(match_operand: 1 "register_operand" "Upl, Upl") +- (match_operand:SVE_ALL 2 "register_operand" "w, w")] +- UNSPEC_LASTB))] +- "TARGET_SVE" +- "@ +- lastb\t%0, %1, %2. +- lastb\t%0, %1, %2." +-) +- +-(define_expand "vec_duplicate" +- [(parallel +- [(set (match_operand:SVE_ALL 0 "register_operand") +- (vec_duplicate:SVE_ALL +- (match_operand: 1 "aarch64_sve_dup_operand"))) +- (clobber (scratch:))])] +- "TARGET_SVE" ++ "&& (!CONSTANT_P (operands[2]) || !CONSTANT_P (operands[3]))" + { +- if (MEM_P (operands[1])) +- { +- rtx ptrue = force_reg (mode, CONSTM1_RTX (mode)); +- emit_insn (gen_sve_ld1r (operands[0], ptrue, operands[1], +- CONST0_RTX (mode))); +- DONE; +- } ++ operands[2] = CONSTM1_RTX (VNx16BImode); ++ operands[3] = CONSTM1_RTX (mode); + } + ) + +-;; Accept memory operands for the benefit of combine, and also in case +-;; the scalar input gets spilled to memory during RA. We want to split +-;; the load at the first opportunity in order to allow the PTRUE to be +-;; optimized with surrounding code. +-(define_insn_and_split "*vec_duplicate_reg" +- [(set (match_operand:SVE_ALL 0 "register_operand" "=w, w, w") +- (vec_duplicate:SVE_ALL +- (match_operand: 1 "aarch64_sve_dup_operand" "r, w, Uty"))) +- (clobber (match_scratch: 2 "=X, X, Upl"))] ++;; ------------------------------------------------------------------------- ++;; ---- Moves relating to the FFR ++;; ------------------------------------------------------------------------- ++;; RDFFR ++;; RDFFRS ++;; SETFFR ++;; WRFFR ++;; ------------------------------------------------------------------------- ++ ++;; [W1 in the block comment above about FFR handling] ++;; ++;; Write to the FFR and start a new FFRT scheduling region. 
++(define_insn "aarch64_wrffr" ++ [(set (reg:VNx16BI FFR_REGNUM) ++ (match_operand:VNx16BI 0 "aarch64_simd_reg_or_minus_one" "Dm, Upa")) ++ (set (reg:VNx16BI FFRT_REGNUM) ++ (match_dup 0))] + "TARGET_SVE" + "@ +- mov\t%0., %1 +- mov\t%0., %1 +- #" +- "&& MEM_P (operands[1])" +- [(const_int 0)] +- { +- if (GET_CODE (operands[2]) == SCRATCH) +- operands[2] = gen_reg_rtx (mode); +- emit_move_insn (operands[2], CONSTM1_RTX (mode)); +- emit_insn (gen_sve_ld1r (operands[0], operands[2], operands[1], +- CONST0_RTX (mode))); +- DONE; +- } +- [(set_attr "length" "4,4,8")] ++ setffr ++ wrffr\t%0.b" + ) + +-;; This is used for vec_duplicates from memory, but can also +-;; be used by combine to optimize selects of a a vec_duplicate +-;; with zero. +-(define_insn "sve_ld1r" +- [(set (match_operand:SVE_ALL 0 "register_operand" "=w") +- (unspec:SVE_ALL +- [(match_operand: 1 "register_operand" "Upl") +- (vec_duplicate:SVE_ALL +- (match_operand: 2 "aarch64_sve_ld1r_operand" "Uty")) +- (match_operand:SVE_ALL 3 "aarch64_simd_imm_zero")] +- UNSPEC_SEL))] ++;; [L2 in the block comment above about FFR handling] ++;; ++;; Introduce a read from and write to the FFR in the current FFRT region, ++;; so that the FFR value is live on entry to the region and so that the FFR ++;; value visibly changes within the region. This is used (possibly multiple ++;; times) in an FFRT region that includes LDFF1 or LDNF1 instructions. ++(define_insn "aarch64_update_ffr_for_load" ++ [(set (reg:VNx16BI FFR_REGNUM) ++ (unspec:VNx16BI [(reg:VNx16BI FFRT_REGNUM) ++ (reg:VNx16BI FFR_REGNUM)] UNSPEC_UPDATE_FFR))] + "TARGET_SVE" +- "ld1r\t%0., %1/z, %2" ++ "" ++ [(set_attr "type" "no_insn")] + ) + +-;; Load 128 bits from memory and duplicate to fill a vector. Since there +-;; are so few operations on 128-bit "elements", we don't define a VNx1TI +-;; and simply use vectors of bytes instead. +-(define_insn "*sve_ld1rq" +- [(set (match_operand:SVE_ALL 0 "register_operand" "=w") +- (unspec:SVE_ALL +- [(match_operand: 1 "register_operand" "Upl") +- (match_operand:TI 2 "aarch64_sve_ld1r_operand" "Uty")] +- UNSPEC_LD1RQ))] ++;; [R1 in the block comment above about FFR handling] ++;; ++;; Notionally copy the FFR to the FFRT, so that the current FFR value ++;; can be read from there by the RDFFR instructions below. This acts ++;; as a scheduling barrier for earlier LDFF1 and LDNF1 instructions and ++;; creates a natural dependency with earlier writes. ++(define_insn "aarch64_copy_ffr_to_ffrt" ++ [(set (reg:VNx16BI FFRT_REGNUM) ++ (reg:VNx16BI FFR_REGNUM))] + "TARGET_SVE" +- "ld1rq\t%0., %1/z, %2" ++ "" ++ [(set_attr "type" "no_insn")] + ) + +-;; Implement a predicate broadcast by shifting the low bit of the scalar +-;; input into the top bit and using a WHILELO. An alternative would be to +-;; duplicate the input and do a compare with zero. +-(define_expand "vec_duplicate" +- [(set (match_operand:PRED_ALL 0 "register_operand") +- (vec_duplicate:PRED_ALL (match_operand 1 "register_operand")))] ++;; [R2 in the block comment above about FFR handling] ++;; ++;; Read the FFR via the FFRT. ++(define_insn "aarch64_rdffr" ++ [(set (match_operand:VNx16BI 0 "register_operand" "=Upa") ++ (reg:VNx16BI FFRT_REGNUM))] ++ "TARGET_SVE" ++ "rdffr\t%0.b" ++) ++ ++;; Likewise with zero predication. 
++(define_insn "aarch64_rdffr_z" ++ [(set (match_operand:VNx16BI 0 "register_operand" "=Upa") ++ (and:VNx16BI ++ (reg:VNx16BI FFRT_REGNUM) ++ (match_operand:VNx16BI 1 "register_operand" "Upa")))] ++ "TARGET_SVE" ++ "rdffr\t%0.b, %1/z" ++) ++ ++;; Read the FFR to test for a fault, without using the predicate result. ++(define_insn "*aarch64_rdffr_z_ptest" ++ [(set (reg:CC_NZC CC_REGNUM) ++ (unspec:CC_NZC ++ [(match_operand:VNx16BI 1 "register_operand" "Upa") ++ (match_dup 1) ++ (match_operand:SI 2 "aarch64_sve_ptrue_flag") ++ (and:VNx16BI ++ (reg:VNx16BI FFRT_REGNUM) ++ (match_dup 1))] ++ UNSPEC_PTEST)) ++ (clobber (match_scratch:VNx16BI 0 "=Upa"))] ++ "TARGET_SVE" ++ "rdffrs\t%0.b, %1/z" ++) ++ ++;; Same for unpredicated RDFFR when tested with a known PTRUE. ++(define_insn "*aarch64_rdffr_ptest" ++ [(set (reg:CC_NZC CC_REGNUM) ++ (unspec:CC_NZC ++ [(match_operand:VNx16BI 1 "register_operand" "Upa") ++ (match_dup 1) ++ (const_int SVE_KNOWN_PTRUE) ++ (reg:VNx16BI FFRT_REGNUM)] ++ UNSPEC_PTEST)) ++ (clobber (match_scratch:VNx16BI 0 "=Upa"))] ++ "TARGET_SVE" ++ "rdffrs\t%0.b, %1/z" ++) ++ ++;; Read the FFR with zero predication and test the result. ++(define_insn "*aarch64_rdffr_z_cc" ++ [(set (reg:CC_NZC CC_REGNUM) ++ (unspec:CC_NZC ++ [(match_operand:VNx16BI 1 "register_operand" "Upa") ++ (match_dup 1) ++ (match_operand:SI 2 "aarch64_sve_ptrue_flag") ++ (and:VNx16BI ++ (reg:VNx16BI FFRT_REGNUM) ++ (match_dup 1))] ++ UNSPEC_PTEST)) ++ (set (match_operand:VNx16BI 0 "register_operand" "=Upa") ++ (and:VNx16BI ++ (reg:VNx16BI FFRT_REGNUM) ++ (match_dup 1)))] ++ "TARGET_SVE" ++ "rdffrs\t%0.b, %1/z" ++) ++ ++;; Same for unpredicated RDFFR when tested with a known PTRUE. ++(define_insn "*aarch64_rdffr_cc" ++ [(set (reg:CC_NZC CC_REGNUM) ++ (unspec:CC_NZC ++ [(match_operand:VNx16BI 1 "register_operand" "Upa") ++ (match_dup 1) ++ (const_int SVE_KNOWN_PTRUE) ++ (reg:VNx16BI FFRT_REGNUM)] ++ UNSPEC_PTEST)) ++ (set (match_operand:VNx16BI 0 "register_operand" "=Upa") ++ (reg:VNx16BI FFRT_REGNUM))] ++ "TARGET_SVE" ++ "rdffrs\t%0.b, %1/z" ++) ++ ++;; [R3 in the block comment above about FFR handling] ++;; ++;; Arbitrarily update the FFRT after a read from the FFR. This acts as ++;; a scheduling barrier for later LDFF1 and LDNF1 instructions. 
++(define_insn "aarch64_update_ffrt" ++ [(set (reg:VNx16BI FFRT_REGNUM) ++ (unspec:VNx16BI [(reg:VNx16BI FFRT_REGNUM)] UNSPEC_UPDATE_FFRT))] + "TARGET_SVE" +- { +- rtx tmp = gen_reg_rtx (DImode); +- rtx op1 = gen_lowpart (DImode, operands[1]); +- emit_insn (gen_ashldi3 (tmp, op1, gen_int_mode (63, DImode))); +- emit_insn (gen_while_ultdi (operands[0], const0_rtx, tmp)); +- DONE; +- } +-) +- +-(define_insn "vec_series" +- [(set (match_operand:SVE_I 0 "register_operand" "=w, w, w") +- (vec_series:SVE_I +- (match_operand: 1 "aarch64_sve_index_operand" "Usi, r, r") +- (match_operand: 2 "aarch64_sve_index_operand" "r, Usi, r")))] ++ "" ++ [(set_attr "type" "no_insn")] ++) ++ ++;; ========================================================================= ++;; == Loads ++;; ========================================================================= ++ ++;; ------------------------------------------------------------------------- ++;; ---- Normal contiguous loads ++;; ------------------------------------------------------------------------- ++;; Includes contiguous forms of: ++;; - LD1B ++;; - LD1D ++;; - LD1H ++;; - LD1W ++;; - LD2B ++;; - LD2D ++;; - LD2H ++;; - LD2W ++;; - LD3B ++;; - LD3D ++;; - LD3H ++;; - LD3W ++;; - LD4B ++;; - LD4D ++;; - LD4H ++;; - LD4W ++;; ------------------------------------------------------------------------- ++ ++;; Predicated LD1. ++(define_insn "maskload" ++ [(set (match_operand:SVE_FULL 0 "register_operand" "=w") ++ (unspec:SVE_FULL ++ [(match_operand: 2 "register_operand" "Upl") ++ (match_operand:SVE_FULL 1 "memory_operand" "m")] ++ UNSPEC_LD1_SVE))] + "TARGET_SVE" +- "@ +- index\t%0., #%1, %2 +- index\t%0., %1, #%2 +- index\t%0., %1, %2" +-) +- +-;; Optimize {x, x, x, x, ...} + {0, n, 2*n, 3*n, ...} if n is in range +-;; of an INDEX instruction. +-(define_insn "*vec_series_plus" +- [(set (match_operand:SVE_I 0 "register_operand" "=w") +- (plus:SVE_I +- (vec_duplicate:SVE_I +- (match_operand: 1 "register_operand" "r")) +- (match_operand:SVE_I 2 "immediate_operand")))] +- "TARGET_SVE && aarch64_check_zero_based_sve_index_immediate (operands[2])" +- { +- operands[2] = aarch64_check_zero_based_sve_index_immediate (operands[2]); +- return "index\t%0., %1, #%2"; +- } ++ "ld1\t%0., %2/z, %1" + ) + + ;; Unpredicated LD[234]. +@@ -744,7 +1163,7 @@ + UNSPEC_LDN))] + "TARGET_SVE" + { +- operands[2] = force_reg (mode, CONSTM1_RTX (mode)); ++ operands[2] = aarch64_ptrue_reg (mode); + } + ) + +@@ -759,884 +1178,5373 @@ + "ld\t%0, %2/z, %1" + ) + +-;; Unpredicated ST[234]. This is always a full update, so the dependence +-;; on the old value of the memory location (via (match_dup 0)) is redundant. +-;; There doesn't seem to be any obvious benefit to treating the all-true +-;; case differently though. In particular, it's very unlikely that we'll +-;; only find out during RTL that a store_lanes is dead. +-(define_expand "vec_store_lanes" +- [(set (match_operand:SVE_STRUCT 0 "memory_operand") +- (unspec:SVE_STRUCT +- [(match_dup 2) +- (match_operand:SVE_STRUCT 1 "register_operand") +- (match_dup 0)] +- UNSPEC_STN))] ++;; ------------------------------------------------------------------------- ++;; ---- Extending contiguous loads ++;; ------------------------------------------------------------------------- ++;; Includes contiguous forms of: ++;; LD1B ++;; LD1H ++;; LD1SB ++;; LD1SH ++;; LD1SW ++;; LD1W ++;; ------------------------------------------------------------------------- ++ ++;; Predicated load and extend, with 8 elements per 128-bit block. 
++(define_insn "@aarch64_load_" ++ [(set (match_operand:VNx8_WIDE 0 "register_operand" "=w") ++ (ANY_EXTEND:VNx8_WIDE ++ (unspec:VNx8_NARROW ++ [(match_operand:VNx8BI 2 "register_operand" "Upl") ++ (match_operand:VNx8_NARROW 1 "memory_operand" "m")] ++ UNSPEC_LD1_SVE)))] ++ "TARGET_SVE" ++ "ld1\t%0., %2/z, %1" ++) ++ ++;; Predicated load and extend, with 4 elements per 128-bit block. ++(define_insn "@aarch64_load_" ++ [(set (match_operand:VNx4_WIDE 0 "register_operand" "=w") ++ (ANY_EXTEND:VNx4_WIDE ++ (unspec:VNx4_NARROW ++ [(match_operand:VNx4BI 2 "register_operand" "Upl") ++ (match_operand:VNx4_NARROW 1 "memory_operand" "m")] ++ UNSPEC_LD1_SVE)))] ++ "TARGET_SVE" ++ "ld1\t%0., %2/z, %1" ++) ++ ++;; Predicated load and extend, with 2 elements per 128-bit block. ++(define_insn "@aarch64_load_" ++ [(set (match_operand:VNx2_WIDE 0 "register_operand" "=w") ++ (ANY_EXTEND:VNx2_WIDE ++ (unspec:VNx2_NARROW ++ [(match_operand:VNx2BI 2 "register_operand" "Upl") ++ (match_operand:VNx2_NARROW 1 "memory_operand" "m")] ++ UNSPEC_LD1_SVE)))] ++ "TARGET_SVE" ++ "ld1\t%0., %2/z, %1" ++) ++ ++;; ------------------------------------------------------------------------- ++;; ---- First-faulting contiguous loads ++;; ------------------------------------------------------------------------- ++;; Includes contiguous forms of: ++;; - LDFF1B ++;; - LDFF1D ++;; - LDFF1H ++;; - LDFF1W ++;; - LDNF1B ++;; - LDNF1D ++;; - LDNF1H ++;; - LDNF1W ++;; ------------------------------------------------------------------------- ++ ++;; Contiguous non-extending first-faulting or non-faulting loads. ++(define_insn "@aarch64_ldf1" ++ [(set (match_operand:SVE_FULL 0 "register_operand" "=w") ++ (unspec:SVE_FULL ++ [(match_operand: 2 "register_operand" "Upl") ++ (match_operand:SVE_FULL 1 "aarch64_sve_ldf1_operand" "Ut") ++ (reg:VNx16BI FFRT_REGNUM)] ++ SVE_LDFF1_LDNF1))] ++ "TARGET_SVE" ++ "ldf1\t%0., %2/z, %1" ++) ++ ++;; ------------------------------------------------------------------------- ++;; ---- First-faulting extending contiguous loads ++;; ------------------------------------------------------------------------- ++;; Includes contiguous forms of: ++;; - LDFF1B ++;; - LDFF1H ++;; - LDFF1SB ++;; - LDFF1SH ++;; - LDFF1SW ++;; - LDFF1W ++;; - LDNF1B ++;; - LDNF1H ++;; - LDNF1SB ++;; - LDNF1SH ++;; - LDNF1SW ++;; - LDNF1W ++;; ------------------------------------------------------------------------- ++ ++;; Predicated first-faulting or non-faulting load and extend, with 8 elements ++;; per 128-bit block. ++(define_insn "@aarch64_ldf1_" ++ [(set (match_operand:VNx8_WIDE 0 "register_operand" "=w") ++ (ANY_EXTEND:VNx8_WIDE ++ (unspec:VNx8_NARROW ++ [(match_operand:VNx8BI 2 "register_operand" "Upl") ++ (match_operand:VNx8_NARROW 1 "aarch64_sve_ldf1_operand" "Ut") ++ (reg:VNx16BI FFRT_REGNUM)] ++ SVE_LDFF1_LDNF1)))] ++ "TARGET_SVE" ++ "ldf1\t%0., %2/z, %1" ++) ++ ++;; Predicated first-faulting or non-faulting load and extend, with 4 elements ++;; per 128-bit block. ++(define_insn "@aarch64_ldf1_" ++ [(set (match_operand:VNx4_WIDE 0 "register_operand" "=w") ++ (ANY_EXTEND:VNx4_WIDE ++ (unspec:VNx4_NARROW ++ [(match_operand:VNx4BI 2 "register_operand" "Upl") ++ (match_operand:VNx4_NARROW 1 "aarch64_sve_ldf1_operand" "Ut") ++ (reg:VNx16BI FFRT_REGNUM)] ++ SVE_LDFF1_LDNF1)))] ++ "TARGET_SVE" ++ "ldf1\t%0., %2/z, %1" ++) ++ ++;; Predicated first-faulting or non-faulting load and extend, with 2 elements ++;; per 128-bit block. 
++(define_insn "@aarch64_ldf1_" ++ [(set (match_operand:VNx2_WIDE 0 "register_operand" "=w") ++ (ANY_EXTEND:VNx2_WIDE ++ (unspec:VNx2_NARROW ++ [(match_operand:VNx2BI 2 "register_operand" "Upl") ++ (match_operand:VNx2_NARROW 1 "aarch64_sve_ldf1_operand" "Ut") ++ (reg:VNx16BI FFRT_REGNUM)] ++ SVE_LDFF1_LDNF1)))] ++ "TARGET_SVE" ++ "ldf1\t%0., %2/z, %1" ++) ++ ++;; ------------------------------------------------------------------------- ++;; ---- Non-temporal contiguous loads ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - LDNT1B ++;; - LDNT1D ++;; - LDNT1H ++;; - LDNT1W ++;; ------------------------------------------------------------------------- ++ ++;; Predicated contiguous non-temporal load. ++(define_insn "@aarch64_ldnt1" ++ [(set (match_operand:SVE_FULL 0 "register_operand" "=w") ++ (unspec:SVE_FULL ++ [(match_operand: 2 "register_operand" "Upl") ++ (match_operand:SVE_FULL 1 "memory_operand" "m")] ++ UNSPEC_LDNT1_SVE))] ++ "TARGET_SVE" ++ "ldnt1\t%0., %2/z, %1" ++) ++ ++;; ------------------------------------------------------------------------- ++;; ---- Normal gather loads ++;; ------------------------------------------------------------------------- ++;; Includes gather forms of: ++;; - LD1D ++;; - LD1W ++;; ------------------------------------------------------------------------- ++ ++;; Unpredicated gather loads. ++(define_expand "gather_load" ++ [(set (match_operand:SVE_FULL_SD 0 "register_operand") ++ (unspec:SVE_FULL_SD ++ [(match_dup 5) ++ (match_operand:DI 1 "aarch64_sve_gather_offset_") ++ (match_operand: 2 "register_operand") ++ (match_operand:DI 3 "const_int_operand") ++ (match_operand:DI 4 "aarch64_gather_scale_operand_") ++ (mem:BLK (scratch))] ++ UNSPEC_LD1_GATHER))] ++ "TARGET_SVE" ++ { ++ operands[5] = aarch64_ptrue_reg (mode); ++ } ++) ++ ++;; Predicated gather loads for 32-bit elements. Operand 3 is true for ++;; unsigned extension and false for signed extension. ++(define_insn "mask_gather_load" ++ [(set (match_operand:SVE_FULL_S 0 "register_operand" "=w, w, w, w, w, w") ++ (unspec:SVE_FULL_S ++ [(match_operand:VNx4BI 5 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl") ++ (match_operand:DI 1 "aarch64_sve_gather_offset_w" "Z, vgw, rk, rk, rk, rk") ++ (match_operand:VNx4SI 2 "register_operand" "w, w, w, w, w, w") ++ (match_operand:DI 3 "const_int_operand" "Ui1, Ui1, Z, Ui1, Z, Ui1") ++ (match_operand:DI 4 "aarch64_gather_scale_operand_w" "Ui1, Ui1, Ui1, Ui1, i, i") ++ (mem:BLK (scratch))] ++ UNSPEC_LD1_GATHER))] ++ "TARGET_SVE" ++ "@ ++ ld1w\t%0.s, %5/z, [%2.s] ++ ld1w\t%0.s, %5/z, [%2.s, #%1] ++ ld1w\t%0.s, %5/z, [%1, %2.s, sxtw] ++ ld1w\t%0.s, %5/z, [%1, %2.s, uxtw] ++ ld1w\t%0.s, %5/z, [%1, %2.s, sxtw %p4] ++ ld1w\t%0.s, %5/z, [%1, %2.s, uxtw %p4]" ++) ++ ++;; Predicated gather loads for 64-bit elements. The value of operand 3 ++;; doesn't matter in this case. 
++(define_insn "mask_gather_load" ++ [(set (match_operand:SVE_FULL_D 0 "register_operand" "=w, w, w, w") ++ (unspec:SVE_FULL_D ++ [(match_operand:VNx2BI 5 "register_operand" "Upl, Upl, Upl, Upl") ++ (match_operand:DI 1 "aarch64_sve_gather_offset_d" "Z, vgd, rk, rk") ++ (match_operand:VNx2DI 2 "register_operand" "w, w, w, w") ++ (match_operand:DI 3 "const_int_operand") ++ (match_operand:DI 4 "aarch64_gather_scale_operand_d" "Ui1, Ui1, Ui1, i") ++ (mem:BLK (scratch))] ++ UNSPEC_LD1_GATHER))] + "TARGET_SVE" ++ "@ ++ ld1d\t%0.d, %5/z, [%2.d] ++ ld1d\t%0.d, %5/z, [%2.d, #%1] ++ ld1d\t%0.d, %5/z, [%1, %2.d] ++ ld1d\t%0.d, %5/z, [%1, %2.d, lsl %p4]" ++) ++ ++;; Likewise, but with the offset being sign-extended from 32 bits. ++(define_insn "*mask_gather_load_sxtw" ++ [(set (match_operand:SVE_FULL_D 0 "register_operand" "=w, w") ++ (unspec:SVE_FULL_D ++ [(match_operand:VNx2BI 5 "register_operand" "Upl, Upl") ++ (match_operand:DI 1 "register_operand" "rk, rk") ++ (unspec:VNx2DI ++ [(match_dup 5) ++ (sign_extend:VNx2DI ++ (truncate:VNx2SI ++ (match_operand:VNx2DI 2 "register_operand" "w, w")))] ++ UNSPEC_PRED_X) ++ (match_operand:DI 3 "const_int_operand") ++ (match_operand:DI 4 "aarch64_gather_scale_operand_d" "Ui1, i") ++ (mem:BLK (scratch))] ++ UNSPEC_LD1_GATHER))] ++ "TARGET_SVE" ++ "@ ++ ld1d\t%0.d, %5/z, [%1, %2.d, sxtw] ++ ld1d\t%0.d, %5/z, [%1, %2.d, sxtw %p4]" ++) ++ ++;; Likewise, but with the offset being zero-extended from 32 bits. ++(define_insn "*mask_gather_load_uxtw" ++ [(set (match_operand:SVE_FULL_D 0 "register_operand" "=w, w") ++ (unspec:SVE_FULL_D ++ [(match_operand:VNx2BI 5 "register_operand" "Upl, Upl") ++ (match_operand:DI 1 "register_operand" "rk, rk") ++ (and:VNx2DI ++ (match_operand:VNx2DI 2 "register_operand" "w, w") ++ (match_operand:VNx2DI 6 "aarch64_sve_uxtw_immediate")) ++ (match_operand:DI 3 "const_int_operand") ++ (match_operand:DI 4 "aarch64_gather_scale_operand_d" "Ui1, i") ++ (mem:BLK (scratch))] ++ UNSPEC_LD1_GATHER))] ++ "TARGET_SVE" ++ "@ ++ ld1d\t%0.d, %5/z, [%1, %2.d, uxtw] ++ ld1d\t%0.d, %5/z, [%1, %2.d, uxtw %p4]" ++) ++ ++;; ------------------------------------------------------------------------- ++;; ---- Extending gather loads ++;; ------------------------------------------------------------------------- ++;; Includes gather forms of: ++;; - LD1B ++;; - LD1H ++;; - LD1SB ++;; - LD1SH ++;; - LD1SW ++;; - LD1W ++;; ------------------------------------------------------------------------- ++ ++;; Predicated extending gather loads for 32-bit elements. Operand 3 is ++;; true for unsigned extension and false for signed extension. ++(define_insn "@aarch64_gather_load_" ++ [(set (match_operand:VNx4_WIDE 0 "register_operand" "=w, w, w, w, w, w") ++ (ANY_EXTEND:VNx4_WIDE ++ (unspec:VNx4_NARROW ++ [(match_operand:VNx4BI 5 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl") ++ (match_operand:DI 1 "aarch64_sve_gather_offset_" "Z, vg, rk, rk, rk, rk") ++ (match_operand:VNx4_WIDE 2 "register_operand" "w, w, w, w, w, w") ++ (match_operand:DI 3 "const_int_operand" "Ui1, Ui1, Z, Ui1, Z, Ui1") ++ (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, Ui1, Ui1, Ui1, i, i") ++ (mem:BLK (scratch))] ++ UNSPEC_LD1_GATHER)))] ++ "TARGET_SVE" ++ "@ ++ ld1\t%0.s, %5/z, [%2.s] ++ ld1\t%0.s, %5/z, [%2.s, #%1] ++ ld1\t%0.s, %5/z, [%1, %2.s, sxtw] ++ ld1\t%0.s, %5/z, [%1, %2.s, uxtw] ++ ld1\t%0.s, %5/z, [%1, %2.s, sxtw %p4] ++ ld1\t%0.s, %5/z, [%1, %2.s, uxtw %p4]" ++) ++ ++;; Predicated extending gather loads for 64-bit elements. 
The value of ++;; operand 3 doesn't matter in this case. ++(define_insn "@aarch64_gather_load_" ++ [(set (match_operand:VNx2_WIDE 0 "register_operand" "=w, w, w, w") ++ (ANY_EXTEND:VNx2_WIDE ++ (unspec:VNx2_NARROW ++ [(match_operand:VNx2BI 5 "register_operand" "Upl, Upl, Upl, Upl") ++ (match_operand:DI 1 "aarch64_sve_gather_offset_" "Z, vg, rk, rk") ++ (match_operand:VNx2_WIDE 2 "register_operand" "w, w, w, w") ++ (match_operand:DI 3 "const_int_operand") ++ (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, Ui1, Ui1, i") ++ (mem:BLK (scratch))] ++ UNSPEC_LD1_GATHER)))] ++ "TARGET_SVE" ++ "@ ++ ld1\t%0.d, %5/z, [%2.d] ++ ld1\t%0.d, %5/z, [%2.d, #%1] ++ ld1\t%0.d, %5/z, [%1, %2.d] ++ ld1\t%0.d, %5/z, [%1, %2.d, lsl %p4]" ++) ++ ++;; Likewise, but with the offset being sign-extended from 32 bits. ++(define_insn_and_rewrite "*aarch64_gather_load__sxtw" ++ [(set (match_operand:VNx2_WIDE 0 "register_operand" "=w, w") ++ (ANY_EXTEND:VNx2_WIDE ++ (unspec:VNx2_NARROW ++ [(match_operand:VNx2BI 5 "register_operand" "Upl, Upl") ++ (match_operand:DI 1 "aarch64_reg_or_zero" "rk, rk") ++ (unspec:VNx2DI ++ [(match_operand 6) ++ (sign_extend:VNx2DI ++ (truncate:VNx2SI ++ (match_operand:VNx2DI 2 "register_operand" "w, w")))] ++ UNSPEC_PRED_X) ++ (match_operand:DI 3 "const_int_operand") ++ (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, i") ++ (mem:BLK (scratch))] ++ UNSPEC_LD1_GATHER)))] ++ "TARGET_SVE" ++ "@ ++ ld1\t%0.d, %5/z, [%1, %2.d, sxtw] ++ ld1\t%0.d, %5/z, [%1, %2.d, sxtw %p4]" ++ "&& !rtx_equal_p (operands[5], operands[6])" ++ { ++ operands[6] = copy_rtx (operands[5]); ++ } ++) ++ ++;; Likewise, but with the offset being zero-extended from 32 bits. ++(define_insn "*aarch64_gather_load__uxtw" ++ [(set (match_operand:VNx2_WIDE 0 "register_operand" "=w, w") ++ (ANY_EXTEND:VNx2_WIDE ++ (unspec:VNx2_NARROW ++ [(match_operand:VNx2BI 5 "register_operand" "Upl, Upl") ++ (match_operand:DI 1 "aarch64_reg_or_zero" "rk, rk") ++ (and:VNx2DI ++ (match_operand:VNx2DI 2 "register_operand" "w, w") ++ (match_operand:VNx2DI 6 "aarch64_sve_uxtw_immediate")) ++ (match_operand:DI 3 "const_int_operand") ++ (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, i") ++ (mem:BLK (scratch))] ++ UNSPEC_LD1_GATHER)))] ++ "TARGET_SVE" ++ "@ ++ ld1\t%0.d, %5/z, [%1, %2.d, uxtw] ++ ld1\t%0.d, %5/z, [%1, %2.d, uxtw %p4]" ++) ++ ++;; ------------------------------------------------------------------------- ++;; ---- First-faulting gather loads ++;; ------------------------------------------------------------------------- ++;; Includes gather forms of: ++;; - LDFF1D ++;; - LDFF1W ++;; ------------------------------------------------------------------------- ++ ++;; Predicated first-faulting gather loads for 32-bit elements. Operand ++;; 3 is true for unsigned extension and false for signed extension. 
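++;; For illustration (this assumes the overloaded ACLE spelling and is not
++;; taken from the upstream patch), the intrinsic-level equivalent of the
++;; 32-bit form below, with unsigned 32-bit indices, would be:
++;;
++;;   #include <arm_sve.h>
++;;   svint32_t ff_gather (svbool_t pg, const int32_t *base, svuint32_t idx)
++;;   {
++;;     return svldff1_gather_index (pg, base, idx);  /* LDFF1W ... uxtw */
++;;   }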
++(define_insn "@aarch64_ldff1_gather" ++ [(set (match_operand:SVE_FULL_S 0 "register_operand" "=w, w, w, w, w, w") ++ (unspec:SVE_FULL_S ++ [(match_operand:VNx4BI 5 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl") ++ (match_operand:DI 1 "aarch64_sve_gather_offset_w" "Z, vgw, rk, rk, rk, rk") ++ (match_operand:VNx4SI 2 "register_operand" "w, w, w, w, w, w") ++ (match_operand:DI 3 "const_int_operand" "i, i, Z, Ui1, Z, Ui1") ++ (match_operand:DI 4 "aarch64_gather_scale_operand_w" "Ui1, Ui1, Ui1, Ui1, i, i") ++ (mem:BLK (scratch)) ++ (reg:VNx16BI FFRT_REGNUM)] ++ UNSPEC_LDFF1_GATHER))] ++ "TARGET_SVE" ++ "@ ++ ldff1w\t%0.s, %5/z, [%2.s] ++ ldff1w\t%0.s, %5/z, [%2.s, #%1] ++ ldff1w\t%0.s, %5/z, [%1, %2.s, sxtw] ++ ldff1w\t%0.s, %5/z, [%1, %2.s, uxtw] ++ ldff1w\t%0.s, %5/z, [%1, %2.s, sxtw %p4] ++ ldff1w\t%0.s, %5/z, [%1, %2.s, uxtw %p4]" ++) ++ ++;; Predicated first-faulting gather loads for 64-bit elements. The value ++;; of operand 3 doesn't matter in this case. ++(define_insn "@aarch64_ldff1_gather" ++ [(set (match_operand:SVE_FULL_D 0 "register_operand" "=w, w, w, w") ++ (unspec:SVE_FULL_D ++ [(match_operand:VNx2BI 5 "register_operand" "Upl, Upl, Upl, Upl") ++ (match_operand:DI 1 "aarch64_sve_gather_offset_d" "Z, vgd, rk, rk") ++ (match_operand:VNx2DI 2 "register_operand" "w, w, w, w") ++ (match_operand:DI 3 "const_int_operand") ++ (match_operand:DI 4 "aarch64_gather_scale_operand_d" "Ui1, Ui1, Ui1, i") ++ (mem:BLK (scratch)) ++ (reg:VNx16BI FFRT_REGNUM)] ++ UNSPEC_LDFF1_GATHER))] ++ "TARGET_SVE" ++ "@ ++ ldff1d\t%0.d, %5/z, [%2.d] ++ ldff1d\t%0.d, %5/z, [%2.d, #%1] ++ ldff1d\t%0.d, %5/z, [%1, %2.d] ++ ldff1d\t%0.d, %5/z, [%1, %2.d, lsl %p4]" ++) ++ ++;; Likewise, but with the offset being sign-extended from 32 bits. ++(define_insn_and_rewrite "*aarch64_ldff1_gather_sxtw" ++ [(set (match_operand:SVE_FULL_D 0 "register_operand" "=w, w") ++ (unspec:SVE_FULL_D ++ [(match_operand:VNx2BI 5 "register_operand" "Upl, Upl") ++ (match_operand:DI 1 "register_operand" "rk, rk") ++ (unspec:VNx2DI ++ [(match_operand 6) ++ (sign_extend:VNx2DI ++ (truncate:VNx2SI ++ (match_operand:VNx2DI 2 "register_operand" "w, w")))] ++ UNSPEC_PRED_X) ++ (match_operand:DI 3 "const_int_operand") ++ (match_operand:DI 4 "aarch64_gather_scale_operand_d" "Ui1, i") ++ (mem:BLK (scratch)) ++ (reg:VNx16BI FFRT_REGNUM)] ++ UNSPEC_LDFF1_GATHER))] ++ "TARGET_SVE" ++ "@ ++ ldff1d\t%0.d, %5/z, [%1, %2.d, sxtw] ++ ldff1d\t%0.d, %5/z, [%1, %2.d, sxtw %p4]" ++ "&& !rtx_equal_p (operands[5], operands[6])" ++ { ++ operands[6] = copy_rtx (operands[5]); ++ } ++) ++ ++;; Likewise, but with the offset being zero-extended from 32 bits. 
++(define_insn "*aarch64_ldff1_gather_uxtw" ++ [(set (match_operand:SVE_FULL_D 0 "register_operand" "=w, w") ++ (unspec:SVE_FULL_D ++ [(match_operand:VNx2BI 5 "register_operand" "Upl, Upl") ++ (match_operand:DI 1 "register_operand" "rk, rk") ++ (and:VNx2DI ++ (match_operand:VNx2DI 2 "register_operand" "w, w") ++ (match_operand:VNx2DI 6 "aarch64_sve_uxtw_immediate")) ++ (match_operand:DI 3 "const_int_operand") ++ (match_operand:DI 4 "aarch64_gather_scale_operand_d" "Ui1, i") ++ (mem:BLK (scratch)) ++ (reg:VNx16BI FFRT_REGNUM)] ++ UNSPEC_LDFF1_GATHER))] ++ "TARGET_SVE" ++ "@ ++ ldff1d\t%0.d, %5/z, [%1, %2.d, uxtw] ++ ldff1d\t%0.d, %5/z, [%1, %2.d, uxtw %p4]" ++) ++ ++;; ------------------------------------------------------------------------- ++;; ---- First-faulting extending gather loads ++;; ------------------------------------------------------------------------- ++;; Includes gather forms of: ++;; - LDFF1B ++;; - LDFF1H ++;; - LDFF1SB ++;; - LDFF1SH ++;; - LDFF1SW ++;; - LDFF1W ++;; ------------------------------------------------------------------------- ++ ++;; Predicated extending first-faulting gather loads for 32-bit elements. ++;; Operand 3 is true for unsigned extension and false for signed extension. ++(define_insn "@aarch64_ldff1_gather_" ++ [(set (match_operand:VNx4_WIDE 0 "register_operand" "=w, w, w, w, w, w") ++ (ANY_EXTEND:VNx4_WIDE ++ (unspec:VNx4_NARROW ++ [(match_operand:VNx4BI 5 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl") ++ (match_operand:DI 1 "aarch64_sve_gather_offset_" "Z, vg, rk, rk, rk, rk") ++ (match_operand:VNx4_WIDE 2 "register_operand" "w, w, w, w, w, w") ++ (match_operand:DI 3 "const_int_operand" "i, i, Z, Ui1, Z, Ui1") ++ (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, Ui1, Ui1, Ui1, i, i") ++ (mem:BLK (scratch)) ++ (reg:VNx16BI FFRT_REGNUM)] ++ UNSPEC_LDFF1_GATHER)))] ++ "TARGET_SVE" ++ "@ ++ ldff1\t%0.s, %5/z, [%2.s] ++ ldff1\t%0.s, %5/z, [%2.s, #%1] ++ ldff1\t%0.s, %5/z, [%1, %2.s, sxtw] ++ ldff1\t%0.s, %5/z, [%1, %2.s, uxtw] ++ ldff1\t%0.s, %5/z, [%1, %2.s, sxtw %p4] ++ ldff1\t%0.s, %5/z, [%1, %2.s, uxtw %p4]" ++) ++ ++;; Predicated extending first-faulting gather loads for 64-bit elements. ++;; The value of operand 3 doesn't matter in this case. ++(define_insn "@aarch64_ldff1_gather_" ++ [(set (match_operand:VNx2_WIDE 0 "register_operand" "=w, w, w, w") ++ (ANY_EXTEND:VNx2_WIDE ++ (unspec:VNx2_NARROW ++ [(match_operand:VNx2BI 5 "register_operand" "Upl, Upl, Upl, Upl") ++ (match_operand:DI 1 "aarch64_sve_gather_offset_" "Z, vg, rk, rk") ++ (match_operand:VNx2_WIDE 2 "register_operand" "w, w, w, w") ++ (match_operand:DI 3 "const_int_operand") ++ (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, Ui1, Ui1, i") ++ (mem:BLK (scratch)) ++ (reg:VNx16BI FFRT_REGNUM)] ++ UNSPEC_LDFF1_GATHER)))] ++ "TARGET_SVE" ++ "@ ++ ldff1\t%0.d, %5/z, [%2.d] ++ ldff1\t%0.d, %5/z, [%2.d, #%1] ++ ldff1\t%0.d, %5/z, [%1, %2.d] ++ ldff1\t%0.d, %5/z, [%1, %2.d, lsl %p4]" ++) ++ ++;; Likewise, but with the offset being sign-extended from 32 bits. 
++(define_insn_and_rewrite "*aarch64_ldff1_gather__sxtw" ++ [(set (match_operand:VNx2_WIDE 0 "register_operand" "=w, w") ++ (ANY_EXTEND:VNx2_WIDE ++ (unspec:VNx2_NARROW ++ [(match_operand:VNx2BI 5 "register_operand" "Upl, Upl") ++ (match_operand:DI 1 "aarch64_reg_or_zero" "rk, rk") ++ (unspec:VNx2DI ++ [(match_operand 6) ++ (sign_extend:VNx2DI ++ (truncate:VNx2SI ++ (match_operand:VNx2DI 2 "register_operand" "w, w")))] ++ UNSPEC_PRED_X) ++ (match_operand:DI 3 "const_int_operand") ++ (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, i") ++ (mem:BLK (scratch)) ++ (reg:VNx16BI FFRT_REGNUM)] ++ UNSPEC_LDFF1_GATHER)))] ++ "TARGET_SVE" ++ "@ ++ ldff1\t%0.d, %5/z, [%1, %2.d, sxtw] ++ ldff1\t%0.d, %5/z, [%1, %2.d, sxtw %p4]" ++ "&& !rtx_equal_p (operands[5], operands[6])" ++ { ++ operands[6] = copy_rtx (operands[5]); ++ } ++) ++ ++;; Likewise, but with the offset being zero-extended from 32 bits. ++(define_insn "*aarch64_ldff1_gather__uxtw" ++ [(set (match_operand:VNx2_WIDE 0 "register_operand" "=w, w") ++ (ANY_EXTEND:VNx2_WIDE ++ (unspec:VNx2_NARROW ++ [(match_operand:VNx2BI 5 "register_operand" "Upl, Upl") ++ (match_operand:DI 1 "aarch64_reg_or_zero" "rk, rk") ++ (and:VNx2DI ++ (match_operand:VNx2DI 2 "register_operand" "w, w") ++ (match_operand:VNx2DI 6 "aarch64_sve_uxtw_immediate")) ++ (match_operand:DI 3 "const_int_operand") ++ (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, i") ++ (mem:BLK (scratch)) ++ (reg:VNx16BI FFRT_REGNUM)] ++ UNSPEC_LDFF1_GATHER)))] ++ "TARGET_SVE" ++ "@ ++ ldff1\t%0.d, %5/z, [%1, %2.d, uxtw] ++ ldff1\t%0.d, %5/z, [%1, %2.d, uxtw %p4]" ++) ++ ++;; ========================================================================= ++;; == Prefetches ++;; ========================================================================= ++ ++;; ------------------------------------------------------------------------- ++;; ---- Contiguous prefetches ++;; ------------------------------------------------------------------------- ++;; Includes contiguous forms of: ++;; - PRFB ++;; - PRFD ++;; - PRFH ++;; - PRFW ++;; ------------------------------------------------------------------------- ++ ++;; Contiguous predicated prefetches. Operand 2 gives the real prefetch ++;; operation (as an svprfop), with operands 3 and 4 providing distilled ++;; information. ++(define_insn "@aarch64_sve_prefetch" ++ [(prefetch (unspec:DI ++ [(match_operand: 0 "register_operand" "Upl") ++ (match_operand:SVE_FULL_I 1 "aarch64_sve_prefetch_operand" "UP") ++ (match_operand:DI 2 "const_int_operand")] ++ UNSPEC_SVE_PREFETCH) ++ (match_operand:DI 3 "const_int_operand") ++ (match_operand:DI 4 "const_int_operand"))] ++ "TARGET_SVE" ++ { ++ operands[1] = gen_rtx_MEM (mode, operands[1]); ++ return aarch64_output_sve_prefetch ("prf", operands[2], "%0, %1"); ++ } ++) ++ ++;; ------------------------------------------------------------------------- ++;; ---- Gather prefetches ++;; ------------------------------------------------------------------------- ++;; Includes gather forms of: ++;; - PRFB ++;; - PRFD ++;; - PRFH ++;; - PRFW ++;; ------------------------------------------------------------------------- ++ ++;; Predicated gather prefetches for 32-bit bases and offsets. 
The operands ++;; are: ++;; 0: the governing predicate ++;; 1: the scalar component of the address ++;; 2: the vector component of the address ++;; 3: 1 for zero extension, 0 for sign extension ++;; 4: the scale multiplier ++;; 5: a vector zero that identifies the mode of data being accessed ++;; 6: the prefetch operator (an svprfop) ++;; 7: the normal RTL prefetch rw flag ++;; 8: the normal RTL prefetch locality value ++(define_insn "@aarch64_sve_gather_prefetch" ++ [(prefetch (unspec:DI ++ [(match_operand:VNx4BI 0 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl") ++ (match_operand:DI 1 "aarch64_sve_gather_offset_" "Z, vg, rk, rk, rk, rk") ++ (match_operand:VNx4SI_ONLY 2 "register_operand" "w, w, w, w, w, w") ++ (match_operand:DI 3 "const_int_operand" "i, i, Z, Ui1, Z, Ui1") ++ (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, Ui1, Ui1, Ui1, i, i") ++ (match_operand:SVE_FULL_I 5 "aarch64_simd_imm_zero") ++ (match_operand:DI 6 "const_int_operand")] ++ UNSPEC_SVE_PREFETCH_GATHER) ++ (match_operand:DI 7 "const_int_operand") ++ (match_operand:DI 8 "const_int_operand"))] ++ "TARGET_SVE" ++ { ++ static const char *const insns[][2] = { ++ "prf", "%0, [%2.s]", ++ "prf", "%0, [%2.s, #%1]", ++ "prfb", "%0, [%1, %2.s, sxtw]", ++ "prfb", "%0, [%1, %2.s, uxtw]", ++ "prf", "%0, [%1, %2.s, sxtw %p4]", ++ "prf", "%0, [%1, %2.s, uxtw %p4]" ++ }; ++ const char *const *parts = insns[which_alternative]; ++ return aarch64_output_sve_prefetch (parts[0], operands[6], parts[1]); ++ } ++) ++ ++;; Predicated gather prefetches for 64-bit elements. The value of operand 3 ++;; doesn't matter in this case. ++(define_insn "@aarch64_sve_gather_prefetch" ++ [(prefetch (unspec:DI ++ [(match_operand:VNx2BI 0 "register_operand" "Upl, Upl, Upl, Upl") ++ (match_operand:DI 1 "aarch64_sve_gather_offset_" "Z, vg, rk, rk") ++ (match_operand:VNx2DI_ONLY 2 "register_operand" "w, w, w, w") ++ (match_operand:DI 3 "const_int_operand") ++ (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, Ui1, Ui1, i") ++ (match_operand:SVE_FULL_I 5 "aarch64_simd_imm_zero") ++ (match_operand:DI 6 "const_int_operand")] ++ UNSPEC_SVE_PREFETCH_GATHER) ++ (match_operand:DI 7 "const_int_operand") ++ (match_operand:DI 8 "const_int_operand"))] ++ "TARGET_SVE" ++ { ++ static const char *const insns[][2] = { ++ "prf", "%0, [%2.d]", ++ "prf", "%0, [%2.d, #%1]", ++ "prfb", "%0, [%1, %2.d]", ++ "prf", "%0, [%1, %2.d, lsl %p4]" ++ }; ++ const char *const *parts = insns[which_alternative]; ++ return aarch64_output_sve_prefetch (parts[0], operands[6], parts[1]); ++ } ++) ++ ++;; Likewise, but with the offset being sign-extended from 32 bits. 
++(define_insn_and_rewrite "*aarch64_sve_gather_prefetch_sxtw" ++ [(prefetch (unspec:DI ++ [(match_operand:VNx2BI 0 "register_operand" "Upl, Upl") ++ (match_operand:DI 1 "register_operand" "rk, rk") ++ (unspec:VNx2DI_ONLY ++ [(match_operand 9) ++ (sign_extend:VNx2DI ++ (truncate:VNx2SI ++ (match_operand:VNx2DI 2 "register_operand" "w, w")))] ++ UNSPEC_PRED_X) ++ (match_operand:DI 3 "const_int_operand") ++ (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, i") ++ (match_operand:SVE_FULL_I 5 "aarch64_simd_imm_zero") ++ (match_operand:DI 6 "const_int_operand")] ++ UNSPEC_SVE_PREFETCH_GATHER) ++ (match_operand:DI 7 "const_int_operand") ++ (match_operand:DI 8 "const_int_operand"))] ++ "TARGET_SVE" ++ { ++ static const char *const insns[][2] = { ++ "prfb", "%0, [%1, %2.d, sxtw]", ++ "prf", "%0, [%1, %2.d, sxtw %p4]" ++ }; ++ const char *const *parts = insns[which_alternative]; ++ return aarch64_output_sve_prefetch (parts[0], operands[6], parts[1]); ++ } ++ "&& !rtx_equal_p (operands[0], operands[9])" ++ { ++ operands[9] = copy_rtx (operands[0]); ++ } ++) ++ ++;; Likewise, but with the offset being zero-extended from 32 bits. ++(define_insn "*aarch64_sve_gather_prefetch_uxtw" ++ [(prefetch (unspec:DI ++ [(match_operand:VNx2BI 0 "register_operand" "Upl, Upl") ++ (match_operand:DI 1 "register_operand" "rk, rk") ++ (and:VNx2DI_ONLY ++ (match_operand:VNx2DI 2 "register_operand" "w, w") ++ (match_operand:VNx2DI 9 "aarch64_sve_uxtw_immediate")) ++ (match_operand:DI 3 "const_int_operand") ++ (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, i") ++ (match_operand:SVE_FULL_I 5 "aarch64_simd_imm_zero") ++ (match_operand:DI 6 "const_int_operand")] ++ UNSPEC_SVE_PREFETCH_GATHER) ++ (match_operand:DI 7 "const_int_operand") ++ (match_operand:DI 8 "const_int_operand"))] ++ "TARGET_SVE" ++ { ++ static const char *const insns[][2] = { ++ "prfb", "%0, [%1, %2.d, uxtw]", ++ "prf", "%0, [%1, %2.d, uxtw %p4]" ++ }; ++ const char *const *parts = insns[which_alternative]; ++ return aarch64_output_sve_prefetch (parts[0], operands[6], parts[1]); ++ } ++) ++ ++;; ========================================================================= ++;; == Stores ++;; ========================================================================= ++ ++;; ------------------------------------------------------------------------- ++;; ---- Normal contiguous stores ++;; ------------------------------------------------------------------------- ++;; Includes contiguous forms of: ++;; - ST1B ++;; - ST1D ++;; - ST1H ++;; - ST1W ++;; - ST2B ++;; - ST2D ++;; - ST2H ++;; - ST2W ++;; - ST3B ++;; - ST3D ++;; - ST3H ++;; - ST3W ++;; - ST4B ++;; - ST4D ++;; - ST4H ++;; - ST4W ++;; ------------------------------------------------------------------------- ++ ++;; Predicated ST1. ++(define_insn "maskstore" ++ [(set (match_operand:SVE_FULL 0 "memory_operand" "+m") ++ (unspec:SVE_FULL ++ [(match_operand: 2 "register_operand" "Upl") ++ (match_operand:SVE_FULL 1 "register_operand" "w") ++ (match_dup 0)] ++ UNSPEC_ST1_SVE))] ++ "TARGET_SVE" ++ "st1\t%1., %2, %0" ++) ++ ++;; Unpredicated ST[234]. This is always a full update, so the dependence ++;; on the old value of the memory location (via (match_dup 0)) is redundant. ++;; There doesn't seem to be any obvious benefit to treating the all-true ++;; case differently though. In particular, it's very unlikely that we'll ++;; only find out during RTL that a store_lanes is dead. 
++(define_expand "vec_store_lanes" ++ [(set (match_operand:SVE_STRUCT 0 "memory_operand") ++ (unspec:SVE_STRUCT ++ [(match_dup 2) ++ (match_operand:SVE_STRUCT 1 "register_operand") ++ (match_dup 0)] ++ UNSPEC_STN))] ++ "TARGET_SVE" ++ { ++ operands[2] = aarch64_ptrue_reg (mode); ++ } ++) ++ ++;; Predicated ST[234]. ++(define_insn "vec_mask_store_lanes" ++ [(set (match_operand:SVE_STRUCT 0 "memory_operand" "+m") ++ (unspec:SVE_STRUCT ++ [(match_operand: 2 "register_operand" "Upl") ++ (match_operand:SVE_STRUCT 1 "register_operand" "w") ++ (match_dup 0)] ++ UNSPEC_STN))] ++ "TARGET_SVE" ++ "st\t%1, %2, %0" ++) ++ ++;; ------------------------------------------------------------------------- ++;; ---- Truncating contiguous stores ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - ST1B ++;; - ST1H ++;; - ST1W ++;; ------------------------------------------------------------------------- ++ ++;; Predicated truncate and store, with 8 elements per 128-bit block. ++(define_insn "@aarch64_store_trunc" ++ [(set (match_operand:VNx8_NARROW 0 "memory_operand" "+m") ++ (unspec:VNx8_NARROW ++ [(match_operand:VNx8BI 2 "register_operand" "Upl") ++ (truncate:VNx8_NARROW ++ (match_operand:VNx8_WIDE 1 "register_operand" "w")) ++ (match_dup 0)] ++ UNSPEC_ST1_SVE))] ++ "TARGET_SVE" ++ "st1\t%1., %2, %0" ++) ++ ++;; Predicated truncate and store, with 4 elements per 128-bit block. ++(define_insn "@aarch64_store_trunc" ++ [(set (match_operand:VNx4_NARROW 0 "memory_operand" "+m") ++ (unspec:VNx4_NARROW ++ [(match_operand:VNx4BI 2 "register_operand" "Upl") ++ (truncate:VNx4_NARROW ++ (match_operand:VNx4_WIDE 1 "register_operand" "w")) ++ (match_dup 0)] ++ UNSPEC_ST1_SVE))] ++ "TARGET_SVE" ++ "st1\t%1., %2, %0" ++) ++ ++;; Predicated truncate and store, with 2 elements per 128-bit block. ++(define_insn "@aarch64_store_trunc" ++ [(set (match_operand:VNx2_NARROW 0 "memory_operand" "+m") ++ (unspec:VNx2_NARROW ++ [(match_operand:VNx2BI 2 "register_operand" "Upl") ++ (truncate:VNx2_NARROW ++ (match_operand:VNx2_WIDE 1 "register_operand" "w")) ++ (match_dup 0)] ++ UNSPEC_ST1_SVE))] ++ "TARGET_SVE" ++ "st1\t%1., %2, %0" ++) ++ ++;; ------------------------------------------------------------------------- ++;; ---- Non-temporal contiguous stores ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - STNT1B ++;; - STNT1D ++;; - STNT1H ++;; - STNT1W ++;; ------------------------------------------------------------------------- ++ ++(define_insn "@aarch64_stnt1" ++ [(set (match_operand:SVE_FULL 0 "memory_operand" "+m") ++ (unspec:SVE_FULL ++ [(match_operand: 2 "register_operand" "Upl") ++ (match_operand:SVE_FULL 1 "register_operand" "w") ++ (match_dup 0)] ++ UNSPEC_STNT1_SVE))] ++ "TARGET_SVE" ++ "stnt1\t%1., %2, %0" ++) ++ ++;; ------------------------------------------------------------------------- ++;; ---- Normal scatter stores ++;; ------------------------------------------------------------------------- ++;; Includes scatter forms of: ++;; - ST1D ++;; - ST1W ++;; ------------------------------------------------------------------------- ++ ++;; Unpredicated scatter stores. 
++(define_expand "scatter_store" ++ [(set (mem:BLK (scratch)) ++ (unspec:BLK ++ [(match_dup 5) ++ (match_operand:DI 0 "aarch64_sve_gather_offset_") ++ (match_operand: 1 "register_operand") ++ (match_operand:DI 2 "const_int_operand") ++ (match_operand:DI 3 "aarch64_gather_scale_operand_") ++ (match_operand:SVE_FULL_SD 4 "register_operand")] ++ UNSPEC_ST1_SCATTER))] ++ "TARGET_SVE" ++ { ++ operands[5] = aarch64_ptrue_reg (mode); ++ } ++) ++ ++;; Predicated scatter stores for 32-bit elements. Operand 2 is true for ++;; unsigned extension and false for signed extension. ++(define_insn "mask_scatter_store" ++ [(set (mem:BLK (scratch)) ++ (unspec:BLK ++ [(match_operand:VNx4BI 5 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl") ++ (match_operand:DI 0 "aarch64_sve_gather_offset_w" "Z, vgw, rk, rk, rk, rk") ++ (match_operand:VNx4SI 1 "register_operand" "w, w, w, w, w, w") ++ (match_operand:DI 2 "const_int_operand" "Ui1, Ui1, Z, Ui1, Z, Ui1") ++ (match_operand:DI 3 "aarch64_gather_scale_operand_w" "Ui1, Ui1, Ui1, Ui1, i, i") ++ (match_operand:SVE_FULL_S 4 "register_operand" "w, w, w, w, w, w")] ++ UNSPEC_ST1_SCATTER))] ++ "TARGET_SVE" ++ "@ ++ st1w\t%4.s, %5, [%1.s] ++ st1w\t%4.s, %5, [%1.s, #%0] ++ st1w\t%4.s, %5, [%0, %1.s, sxtw] ++ st1w\t%4.s, %5, [%0, %1.s, uxtw] ++ st1w\t%4.s, %5, [%0, %1.s, sxtw %p3] ++ st1w\t%4.s, %5, [%0, %1.s, uxtw %p3]" ++) ++ ++;; Predicated scatter stores for 64-bit elements. The value of operand 2 ++;; doesn't matter in this case. ++(define_insn "mask_scatter_store" ++ [(set (mem:BLK (scratch)) ++ (unspec:BLK ++ [(match_operand:VNx2BI 5 "register_operand" "Upl, Upl, Upl, Upl") ++ (match_operand:DI 0 "aarch64_sve_gather_offset_d" "Z, vgd, rk, rk") ++ (match_operand:VNx2DI 1 "register_operand" "w, w, w, w") ++ (match_operand:DI 2 "const_int_operand") ++ (match_operand:DI 3 "aarch64_gather_scale_operand_d" "Ui1, Ui1, Ui1, i") ++ (match_operand:SVE_FULL_D 4 "register_operand" "w, w, w, w")] ++ UNSPEC_ST1_SCATTER))] ++ "TARGET_SVE" ++ "@ ++ st1d\t%4.d, %5, [%1.d] ++ st1d\t%4.d, %5, [%1.d, #%0] ++ st1d\t%4.d, %5, [%0, %1.d] ++ st1d\t%4.d, %5, [%0, %1.d, lsl %p3]" ++) ++ ++;; Likewise, but with the offset being sign-extended from 32 bits. ++(define_insn_and_rewrite "*mask_scatter_store_sxtw" ++ [(set (mem:BLK (scratch)) ++ (unspec:BLK ++ [(match_operand:VNx2BI 5 "register_operand" "Upl, Upl") ++ (match_operand:DI 0 "register_operand" "rk, rk") ++ (unspec:VNx2DI ++ [(match_operand 6) ++ (sign_extend:VNx2DI ++ (truncate:VNx2SI ++ (match_operand:VNx2DI 1 "register_operand" "w, w")))] ++ UNSPEC_PRED_X) ++ (match_operand:DI 2 "const_int_operand") ++ (match_operand:DI 3 "aarch64_gather_scale_operand_d" "Ui1, i") ++ (match_operand:SVE_FULL_D 4 "register_operand" "w, w")] ++ UNSPEC_ST1_SCATTER))] ++ "TARGET_SVE" ++ "@ ++ st1d\t%4.d, %5, [%0, %1.d, sxtw] ++ st1d\t%4.d, %5, [%0, %1.d, sxtw %p3]" ++ "&& !rtx_equal_p (operands[5], operands[6])" ++ { ++ operands[6] = copy_rtx (operands[5]); ++ } ++) ++ ++;; Likewise, but with the offset being zero-extended from 32 bits. 
++(define_insn "*mask_scatter_store_uxtw" ++ [(set (mem:BLK (scratch)) ++ (unspec:BLK ++ [(match_operand:VNx2BI 5 "register_operand" "Upl, Upl") ++ (match_operand:DI 0 "aarch64_reg_or_zero" "rk, rk") ++ (and:VNx2DI ++ (match_operand:VNx2DI 1 "register_operand" "w, w") ++ (match_operand:VNx2DI 6 "aarch64_sve_uxtw_immediate")) ++ (match_operand:DI 2 "const_int_operand") ++ (match_operand:DI 3 "aarch64_gather_scale_operand_d" "Ui1, i") ++ (match_operand:SVE_FULL_D 4 "register_operand" "w, w")] ++ UNSPEC_ST1_SCATTER))] ++ "TARGET_SVE" ++ "@ ++ st1d\t%4.d, %5, [%0, %1.d, uxtw] ++ st1d\t%4.d, %5, [%0, %1.d, uxtw %p3]" ++) ++ ++;; ------------------------------------------------------------------------- ++;; ---- Truncating scatter stores ++;; ------------------------------------------------------------------------- ++;; Includes scatter forms of: ++;; - ST1B ++;; - ST1H ++;; - ST1W ++;; ------------------------------------------------------------------------- ++ ++;; Predicated truncating scatter stores for 32-bit elements. Operand 2 is ++;; true for unsigned extension and false for signed extension. ++(define_insn "@aarch64_scatter_store_trunc" ++ [(set (mem:BLK (scratch)) ++ (unspec:BLK ++ [(match_operand:VNx4BI 5 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl") ++ (match_operand:DI 0 "aarch64_sve_gather_offset_" "Z, vg, rk, rk, rk, rk") ++ (match_operand:VNx4SI 1 "register_operand" "w, w, w, w, w, w") ++ (match_operand:DI 2 "const_int_operand" "Ui1, Ui1, Z, Ui1, Z, Ui1") ++ (match_operand:DI 3 "aarch64_gather_scale_operand_" "Ui1, Ui1, Ui1, Ui1, i, i") ++ (truncate:VNx4_NARROW ++ (match_operand:VNx4_WIDE 4 "register_operand" "w, w, w, w, w, w"))] ++ UNSPEC_ST1_SCATTER))] ++ "TARGET_SVE" ++ "@ ++ st1\t%4.s, %5, [%1.s] ++ st1\t%4.s, %5, [%1.s, #%0] ++ st1\t%4.s, %5, [%0, %1.s, sxtw] ++ st1\t%4.s, %5, [%0, %1.s, uxtw] ++ st1\t%4.s, %5, [%0, %1.s, sxtw %p3] ++ st1\t%4.s, %5, [%0, %1.s, uxtw %p3]" ++) ++ ++;; Predicated truncating scatter stores for 64-bit elements. The value of ++;; operand 2 doesn't matter in this case. ++(define_insn "@aarch64_scatter_store_trunc" ++ [(set (mem:BLK (scratch)) ++ (unspec:BLK ++ [(match_operand:VNx2BI 5 "register_operand" "Upl, Upl, Upl, Upl") ++ (match_operand:DI 0 "aarch64_sve_gather_offset_" "Z, vg, rk, rk") ++ (match_operand:VNx2DI 1 "register_operand" "w, w, w, w") ++ (match_operand:DI 2 "const_int_operand") ++ (match_operand:DI 3 "aarch64_gather_scale_operand_" "Ui1, Ui1, Ui1, i") ++ (truncate:VNx2_NARROW ++ (match_operand:VNx2_WIDE 4 "register_operand" "w, w, w, w"))] ++ UNSPEC_ST1_SCATTER))] ++ "TARGET_SVE" ++ "@ ++ st1\t%4.d, %5, [%1.d] ++ st1\t%4.d, %5, [%1.d, #%0] ++ st1\t%4.d, %5, [%0, %1.d] ++ st1\t%4.d, %5, [%0, %1.d, lsl %p3]" ++) ++ ++;; Likewise, but with the offset being sign-extended from 32 bits. 
++(define_insn_and_rewrite "*aarch64_scatter_store_trunc_sxtw" ++ [(set (mem:BLK (scratch)) ++ (unspec:BLK ++ [(match_operand:VNx2BI 5 "register_operand" "Upl, Upl") ++ (match_operand:DI 0 "register_operand" "rk, rk") ++ (unspec:VNx2DI ++ [(match_operand 6) ++ (sign_extend:VNx2DI ++ (truncate:VNx2SI ++ (match_operand:VNx2DI 1 "register_operand" "w, w")))] ++ UNSPEC_PRED_X) ++ (match_operand:DI 2 "const_int_operand") ++ (match_operand:DI 3 "aarch64_gather_scale_operand_" "Ui1, i") ++ (truncate:VNx2_NARROW ++ (match_operand:VNx2_WIDE 4 "register_operand" "w, w"))] ++ UNSPEC_ST1_SCATTER))] ++ "TARGET_SVE" ++ "@ ++ st1\t%4.d, %5, [%0, %1.d, sxtw] ++ st1\t%4.d, %5, [%0, %1.d, sxtw %p3]" ++ "&& !rtx_equal_p (operands[5], operands[6])" ++ { ++ operands[6] = copy_rtx (operands[5]); ++ } ++) ++ ++;; Likewise, but with the offset being zero-extended from 32 bits. ++(define_insn "*aarch64_scatter_store_trunc_uxtw" ++ [(set (mem:BLK (scratch)) ++ (unspec:BLK ++ [(match_operand:VNx2BI 5 "register_operand" "Upl, Upl") ++ (match_operand:DI 0 "aarch64_reg_or_zero" "rk, rk") ++ (and:VNx2DI ++ (match_operand:VNx2DI 1 "register_operand" "w, w") ++ (match_operand:VNx2DI 6 "aarch64_sve_uxtw_immediate")) ++ (match_operand:DI 2 "const_int_operand") ++ (match_operand:DI 3 "aarch64_gather_scale_operand_" "Ui1, i") ++ (truncate:VNx2_NARROW ++ (match_operand:VNx2_WIDE 4 "register_operand" "w, w"))] ++ UNSPEC_ST1_SCATTER))] ++ "TARGET_SVE" ++ "@ ++ st1\t%4.d, %5, [%0, %1.d, uxtw] ++ st1\t%4.d, %5, [%0, %1.d, uxtw %p3]" ++) ++ ++;; ========================================================================= ++;; == Vector creation ++;; ========================================================================= ++ ++;; ------------------------------------------------------------------------- ++;; ---- [INT,FP] Duplicate element ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - DUP ++;; - MOV ++;; - LD1RB ++;; - LD1RD ++;; - LD1RH ++;; - LD1RW ++;; - LD1ROB (F64MM) ++;; - LD1ROD (F64MM) ++;; - LD1ROH (F64MM) ++;; - LD1ROW (F64MM) ++;; - LD1RQB ++;; - LD1RQD ++;; - LD1RQH ++;; - LD1RQW ++;; ------------------------------------------------------------------------- ++ ++(define_expand "vec_duplicate" ++ [(parallel ++ [(set (match_operand:SVE_FULL 0 "register_operand") ++ (vec_duplicate:SVE_FULL ++ (match_operand: 1 "aarch64_sve_dup_operand"))) ++ (clobber (scratch:VNx16BI))])] ++ "TARGET_SVE" ++ { ++ if (MEM_P (operands[1])) ++ { ++ rtx ptrue = aarch64_ptrue_reg (mode); ++ emit_insn (gen_sve_ld1r (operands[0], ptrue, operands[1], ++ CONST0_RTX (mode))); ++ DONE; ++ } ++ } ++) ++ ++;; Accept memory operands for the benefit of combine, and also in case ++;; the scalar input gets spilled to memory during RA. We want to split ++;; the load at the first opportunity in order to allow the PTRUE to be ++;; optimized with surrounding code. 
++(define_insn_and_split "*vec_duplicate_reg" ++ [(set (match_operand:SVE_FULL 0 "register_operand" "=w, w, w") ++ (vec_duplicate:SVE_FULL ++ (match_operand: 1 "aarch64_sve_dup_operand" "r, w, Uty"))) ++ (clobber (match_scratch:VNx16BI 2 "=X, X, Upl"))] ++ "TARGET_SVE" ++ "@ ++ mov\t%0., %1 ++ mov\t%0., %1 ++ #" ++ "&& MEM_P (operands[1])" ++ [(const_int 0)] ++ { ++ if (GET_CODE (operands[2]) == SCRATCH) ++ operands[2] = gen_reg_rtx (VNx16BImode); ++ emit_move_insn (operands[2], CONSTM1_RTX (VNx16BImode)); ++ rtx gp = gen_lowpart (mode, operands[2]); ++ emit_insn (gen_sve_ld1r (operands[0], gp, operands[1], ++ CONST0_RTX (mode))); ++ DONE; ++ } ++ [(set_attr "length" "4,4,8")] ++) ++ ++;; Duplicate an Advanced SIMD vector to fill an SVE vector (LE version). ++(define_insn "@aarch64_vec_duplicate_vq_le" ++ [(set (match_operand:SVE_FULL 0 "register_operand" "=w") ++ (vec_duplicate:SVE_FULL ++ (match_operand: 1 "register_operand" "w")))] ++ "TARGET_SVE && !BYTES_BIG_ENDIAN" ++ { ++ operands[1] = gen_rtx_REG (mode, REGNO (operands[1])); ++ return "dup\t%0.q, %1.q[0]"; ++ } ++) ++ ++;; Duplicate an Advanced SIMD vector to fill an SVE vector (BE version). ++;; The SVE register layout puts memory lane N into (architectural) ++;; register lane N, whereas the Advanced SIMD layout puts the memory ++;; lsb into the register lsb. We therefore have to describe this in rtl ++;; terms as a reverse of the V128 vector followed by a duplicate. ++(define_insn "@aarch64_vec_duplicate_vq_be" ++ [(set (match_operand:SVE_FULL 0 "register_operand" "=w") ++ (vec_duplicate:SVE_FULL ++ (vec_select: ++ (match_operand: 1 "register_operand" "w") ++ (match_operand 2 "descending_int_parallel"))))] ++ "TARGET_SVE ++ && BYTES_BIG_ENDIAN ++ && known_eq (INTVAL (XVECEXP (operands[2], 0, 0)), ++ GET_MODE_NUNITS (mode) - 1)" ++ { ++ operands[1] = gen_rtx_REG (mode, REGNO (operands[1])); ++ return "dup\t%0.q, %1.q[0]"; ++ } ++) ++ ++;; This is used for vec_duplicates from memory, but can also ++;; be used by combine to optimize selects of a a vec_duplicate ++;; with zero. ++(define_insn "sve_ld1r" ++ [(set (match_operand:SVE_FULL 0 "register_operand" "=w") ++ (unspec:SVE_FULL ++ [(match_operand: 1 "register_operand" "Upl") ++ (vec_duplicate:SVE_FULL ++ (match_operand: 2 "aarch64_sve_ld1r_operand" "Uty")) ++ (match_operand:SVE_FULL 3 "aarch64_simd_imm_zero")] ++ UNSPEC_SEL))] ++ "TARGET_SVE" ++ "ld1r\t%0., %1/z, %2" ++) ++ ++;; Load 128 bits from memory under predicate control and duplicate to ++;; fill a vector. 
++(define_insn "@aarch64_sve_ld1rq" ++ [(set (match_operand:SVE_FULL 0 "register_operand" "=w") ++ (unspec:SVE_FULL ++ [(match_operand: 2 "register_operand" "Upl") ++ (match_operand: 1 "aarch64_sve_ld1rq_operand" "UtQ")] ++ UNSPEC_LD1RQ))] ++ "TARGET_SVE" ++ { ++ operands[1] = gen_rtx_MEM (mode, XEXP (operands[1], 0)); ++ return "ld1rq\t%0., %2/z, %1"; ++ } ++) ++ ++(define_insn "@aarch64_sve_ld1ro" ++ [(set (match_operand:SVE_FULL 0 "register_operand" "=w") ++ (unspec:SVE_FULL ++ [(match_operand: 2 "register_operand" "Upl") ++ (match_operand:OI 1 "aarch64_sve_ld1ro_operand_" ++ "UO")] ++ UNSPEC_LD1RO))] ++ "TARGET_SVE_F64MM" ++ { ++ operands[1] = gen_rtx_MEM (mode, XEXP (operands[1], 0)); ++ return "ld1ro\t%0., %2/z, %1"; ++ } ++) ++ ++;; ------------------------------------------------------------------------- ++;; ---- [INT,FP] Initialize from individual elements ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - INSR ++;; ------------------------------------------------------------------------- ++ ++(define_expand "vec_init" ++ [(match_operand:SVE_FULL 0 "register_operand") ++ (match_operand 1 "")] ++ "TARGET_SVE" ++ { ++ aarch64_sve_expand_vector_init (operands[0], operands[1]); ++ DONE; ++ } ++) ++ ++;; Shift an SVE vector left and insert a scalar into element 0. ++(define_insn "vec_shl_insert_" ++ [(set (match_operand:SVE_FULL 0 "register_operand" "=?w, w, ??&w, ?&w") ++ (unspec:SVE_FULL ++ [(match_operand:SVE_FULL 1 "register_operand" "0, 0, w, w") ++ (match_operand: 2 "aarch64_reg_or_zero" "rZ, w, rZ, w")] ++ UNSPEC_INSR))] ++ "TARGET_SVE" ++ "@ ++ insr\t%0., %2 ++ insr\t%0., %2 ++ movprfx\t%0, %1\;insr\t%0., %2 ++ movprfx\t%0, %1\;insr\t%0., %2" ++ [(set_attr "movprfx" "*,*,yes,yes")] ++) ++ ++;; ------------------------------------------------------------------------- ++;; ---- [INT] Linear series ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - INDEX ++;; ------------------------------------------------------------------------- ++ ++(define_insn "vec_series" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, w, w") ++ (vec_series:SVE_FULL_I ++ (match_operand: 1 "aarch64_sve_index_operand" "Usi, r, r") ++ (match_operand: 2 "aarch64_sve_index_operand" "r, Usi, r")))] ++ "TARGET_SVE" ++ "@ ++ index\t%0., #%1, %2 ++ index\t%0., %1, #%2 ++ index\t%0., %1, %2" ++) ++ ++;; Optimize {x, x, x, x, ...} + {0, n, 2*n, 3*n, ...} if n is in range ++;; of an INDEX instruction. ++(define_insn "*vec_series_plus" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w") ++ (plus:SVE_FULL_I ++ (vec_duplicate:SVE_FULL_I ++ (match_operand: 1 "register_operand" "r")) ++ (match_operand:SVE_FULL_I 2 "immediate_operand")))] ++ "TARGET_SVE && aarch64_check_zero_based_sve_index_immediate (operands[2])" ++ { ++ operands[2] = aarch64_check_zero_based_sve_index_immediate (operands[2]); ++ return "index\t%0., %1, #%2"; ++ } ++) ++ ++;; ------------------------------------------------------------------------- ++;; ---- [PRED] Duplicate element ++;; ------------------------------------------------------------------------- ++;; The patterns in this section are synthetic. ++;; ------------------------------------------------------------------------- ++ ++;; Implement a predicate broadcast by shifting the low bit of the scalar ++;; input into the top bit and using a WHILELO. An alternative would be to ++;; duplicate the input and do a compare with zero. 
++(define_expand "vec_duplicate" ++ [(set (match_operand:PRED_ALL 0 "register_operand") ++ (vec_duplicate:PRED_ALL (match_operand:QI 1 "register_operand")))] ++ "TARGET_SVE" ++ { ++ rtx tmp = gen_reg_rtx (DImode); ++ rtx op1 = gen_lowpart (DImode, operands[1]); ++ emit_insn (gen_ashldi3 (tmp, op1, gen_int_mode (63, DImode))); ++ emit_insn (gen_while_ultdi (operands[0], const0_rtx, tmp)); ++ DONE; ++ } ++) ++ ++;; ========================================================================= ++;; == Vector decomposition ++;; ========================================================================= ++ ++;; ------------------------------------------------------------------------- ++;; ---- [INT,FP] Extract index ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - DUP (Advanced SIMD) ++;; - DUP (SVE) ++;; - EXT (SVE) ++;; - ST1 (Advanced SIMD) ++;; - UMOV (Advanced SIMD) ++;; ------------------------------------------------------------------------- ++ ++(define_expand "vec_extract" ++ [(set (match_operand: 0 "register_operand") ++ (vec_select: ++ (match_operand:SVE_FULL 1 "register_operand") ++ (parallel [(match_operand:SI 2 "nonmemory_operand")])))] ++ "TARGET_SVE" ++ { ++ poly_int64 val; ++ if (poly_int_rtx_p (operands[2], &val) ++ && known_eq (val, GET_MODE_NUNITS (mode) - 1)) ++ { ++ /* The last element can be extracted with a LASTB and a false ++ predicate. */ ++ rtx sel = aarch64_pfalse_reg (mode); ++ emit_insn (gen_extract_last_ (operands[0], sel, operands[1])); ++ DONE; ++ } ++ if (!CONST_INT_P (operands[2])) ++ { ++ /* Create an index with operand[2] as the base and -1 as the step. ++ It will then be zero for the element we care about. */ ++ rtx index = gen_lowpart (mode, operands[2]); ++ index = force_reg (mode, index); ++ rtx series = gen_reg_rtx (mode); ++ emit_insn (gen_vec_series (series, index, constm1_rtx)); ++ ++ /* Get a predicate that is true for only that element. */ ++ rtx zero = CONST0_RTX (mode); ++ rtx cmp = gen_rtx_EQ (mode, series, zero); ++ rtx sel = gen_reg_rtx (mode); ++ emit_insn (gen_vec_cmp (sel, cmp, series, zero)); ++ ++ /* Select the element using LASTB. */ ++ emit_insn (gen_extract_last_ (operands[0], sel, operands[1])); ++ DONE; ++ } ++ } ++) ++ ++;; Extract element zero. This is a special case because we want to force ++;; the registers to be the same for the second alternative, and then ++;; split the instruction into nothing after RA. ++(define_insn_and_split "*vec_extract_0" ++ [(set (match_operand: 0 "aarch64_simd_nonimmediate_operand" "=r, w, Utv") ++ (vec_select: ++ (match_operand:SVE_FULL 1 "register_operand" "w, 0, w") ++ (parallel [(const_int 0)])))] ++ "TARGET_SVE" ++ { ++ operands[1] = gen_rtx_REG (mode, REGNO (operands[1])); ++ switch (which_alternative) ++ { ++ case 0: ++ return "umov\\t%0, %1.[0]"; ++ case 1: ++ return "#"; ++ case 2: ++ return "st1\\t{%1.}[0], %0"; ++ default: ++ gcc_unreachable (); ++ } ++ } ++ "&& reload_completed ++ && REG_P (operands[0]) ++ && REGNO (operands[0]) == REGNO (operands[1])" ++ [(const_int 0)] ++ { ++ emit_note (NOTE_INSN_DELETED); ++ DONE; ++ } ++ [(set_attr "type" "neon_to_gp_q, untyped, neon_store1_one_lane_q")] ++) ++ ++;; Extract an element from the Advanced SIMD portion of the register. ++;; We don't just reuse the aarch64-simd.md pattern because we don't ++;; want any change in lane number on big-endian targets. 
++(define_insn "*vec_extract_v128" ++ [(set (match_operand: 0 "aarch64_simd_nonimmediate_operand" "=r, w, Utv") ++ (vec_select: ++ (match_operand:SVE_FULL 1 "register_operand" "w, w, w") ++ (parallel [(match_operand:SI 2 "const_int_operand")])))] ++ "TARGET_SVE ++ && IN_RANGE (INTVAL (operands[2]) * GET_MODE_SIZE (mode), 1, 15)" ++ { ++ operands[1] = gen_rtx_REG (mode, REGNO (operands[1])); ++ switch (which_alternative) ++ { ++ case 0: ++ return "umov\\t%0, %1.[%2]"; ++ case 1: ++ return "dup\\t%0, %1.[%2]"; ++ case 2: ++ return "st1\\t{%1.}[%2], %0"; ++ default: ++ gcc_unreachable (); ++ } ++ } ++ [(set_attr "type" "neon_to_gp_q, neon_dup_q, neon_store1_one_lane_q")] ++) ++ ++;; Extract an element in the range of DUP. This pattern allows the ++;; source and destination to be different. ++(define_insn "*vec_extract_dup" ++ [(set (match_operand: 0 "register_operand" "=w") ++ (vec_select: ++ (match_operand:SVE_FULL 1 "register_operand" "w") ++ (parallel [(match_operand:SI 2 "const_int_operand")])))] ++ "TARGET_SVE ++ && IN_RANGE (INTVAL (operands[2]) * GET_MODE_SIZE (mode), 16, 63)" ++ { ++ operands[0] = gen_rtx_REG (mode, REGNO (operands[0])); ++ return "dup\t%0., %1.[%2]"; ++ } ++) ++ ++;; Extract an element outside the range of DUP. This pattern requires the ++;; source and destination to be the same. ++(define_insn "*vec_extract_ext" ++ [(set (match_operand: 0 "register_operand" "=w, ?&w") ++ (vec_select: ++ (match_operand:SVE_FULL 1 "register_operand" "0, w") ++ (parallel [(match_operand:SI 2 "const_int_operand")])))] ++ "TARGET_SVE && INTVAL (operands[2]) * GET_MODE_SIZE (mode) >= 64" ++ { ++ operands[0] = gen_rtx_REG (mode, REGNO (operands[0])); ++ operands[2] = GEN_INT (INTVAL (operands[2]) * GET_MODE_SIZE (mode)); ++ return (which_alternative == 0 ++ ? "ext\t%0.b, %0.b, %0.b, #%2" ++ : "movprfx\t%0, %1\;ext\t%0.b, %0.b, %1.b, #%2"); ++ } ++ [(set_attr "movprfx" "*,yes")] ++) ++ ++;; ------------------------------------------------------------------------- ++;; ---- [INT,FP] Extract active element ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - LASTA ++;; - LASTB ++;; ------------------------------------------------------------------------- ++ ++;; Extract the last active element of operand 1 into operand 0. ++;; If no elements are active, extract the last inactive element instead. ++(define_insn "@extract__" ++ [(set (match_operand: 0 "register_operand" "=?r, w") ++ (unspec: ++ [(match_operand: 1 "register_operand" "Upl, Upl") ++ (match_operand:SVE_FULL 2 "register_operand" "w, w")] ++ LAST))] ++ "TARGET_SVE" ++ "@ ++ last\t%0, %1, %2. ++ last\t%0, %1, %2." ++) ++ ++;; ------------------------------------------------------------------------- ++;; ---- [PRED] Extract index ++;; ------------------------------------------------------------------------- ++;; The patterns in this section are synthetic. ++;; ------------------------------------------------------------------------- ++ ++;; Handle extractions from a predicate by converting to an integer vector ++;; and extracting from there. ++(define_expand "vec_extract" ++ [(match_operand: 0 "register_operand") ++ (match_operand: 1 "register_operand") ++ (match_operand:SI 2 "nonmemory_operand") ++ ;; Dummy operand to which we can attach the iterator. 
++ (reg:SVE_FULL_I V0_REGNUM)] ++ "TARGET_SVE" ++ { ++ rtx tmp = gen_reg_rtx (mode); ++ emit_insn (gen_vcond_mask_ (tmp, operands[1], ++ CONST1_RTX (mode), ++ CONST0_RTX (mode))); ++ emit_insn (gen_vec_extract (operands[0], tmp, operands[2])); ++ DONE; ++ } ++) ++ ++;; ========================================================================= ++;; == Unary arithmetic ++;; ========================================================================= ++ ++;; ------------------------------------------------------------------------- ++;; ---- [INT] General unary arithmetic corresponding to rtx codes ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - ABS ++;; - CLS (= clrsb) ++;; - CLZ ++;; - CNT (= popcount) ++;; - NEG ++;; - NOT ++;; ------------------------------------------------------------------------- ++ ++;; Unpredicated integer unary arithmetic. ++(define_expand "2" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand") ++ (unspec:SVE_FULL_I ++ [(match_dup 2) ++ (SVE_INT_UNARY:SVE_FULL_I ++ (match_operand:SVE_FULL_I 1 "register_operand"))] ++ UNSPEC_PRED_X))] ++ "TARGET_SVE" ++ { ++ operands[2] = aarch64_ptrue_reg (mode); ++ } ++) ++ ++;; Integer unary arithmetic predicated with a PTRUE. ++(define_insn "@aarch64_pred_" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w") ++ (unspec:SVE_FULL_I ++ [(match_operand: 1 "register_operand" "Upl") ++ (SVE_INT_UNARY:SVE_FULL_I ++ (match_operand:SVE_FULL_I 2 "register_operand" "w"))] ++ UNSPEC_PRED_X))] ++ "TARGET_SVE" ++ "\t%0., %1/m, %2." ++) ++ ++;; Predicated integer unary arithmetic with merging. ++(define_expand "@cond_" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand") ++ (unspec:SVE_FULL_I ++ [(match_operand: 1 "register_operand") ++ (SVE_INT_UNARY:SVE_FULL_I ++ (match_operand:SVE_FULL_I 2 "register_operand")) ++ (match_operand:SVE_FULL_I 3 "aarch64_simd_reg_or_zero")] ++ UNSPEC_SEL))] ++ "TARGET_SVE" ++) ++ ++;; Predicated integer unary arithmetic, merging with the first input. ++(define_insn "*cond__2" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w") ++ (unspec:SVE_FULL_I ++ [(match_operand: 1 "register_operand" "Upl, Upl") ++ (SVE_INT_UNARY:SVE_FULL_I ++ (match_operand:SVE_FULL_I 2 "register_operand" "0, w")) ++ (match_dup 2)] ++ UNSPEC_SEL))] ++ "TARGET_SVE" ++ "@ ++ \t%0., %1/m, %0. ++ movprfx\t%0, %2\;\t%0., %1/m, %2." ++ [(set_attr "movprfx" "*,yes")] ++) ++ ++;; Predicated integer unary arithmetic, merging with an independent value. ++;; ++;; The earlyclobber isn't needed for the first alternative, but omitting ++;; it would only help the case in which operands 2 and 3 are the same, ++;; which is handled above rather than here. Marking all the alternatives ++;; as earlyclobber helps to make the instruction more regular to the ++;; register allocator. ++(define_insn "*cond__any" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand" "=&w, ?&w, ?&w") ++ (unspec:SVE_FULL_I ++ [(match_operand: 1 "register_operand" "Upl, Upl, Upl") ++ (SVE_INT_UNARY:SVE_FULL_I ++ (match_operand:SVE_FULL_I 2 "register_operand" "w, w, w")) ++ (match_operand:SVE_FULL_I 3 "aarch64_simd_reg_or_zero" "0, Dz, w")] ++ UNSPEC_SEL))] ++ "TARGET_SVE && !rtx_equal_p (operands[2], operands[3])" ++ "@ ++ \t%0., %1/m, %2. ++ movprfx\t%0., %1/z, %2.\;\t%0., %1/m, %2. ++ movprfx\t%0, %3\;\t%0., %1/m, %2." 
++ [(set_attr "movprfx" "*,yes,yes")] ++) ++ ++;; ------------------------------------------------------------------------- ++;; ---- [INT] General unary arithmetic corresponding to unspecs ++;; ------------------------------------------------------------------------- ++;; Includes ++;; - RBIT ++;; - REVB ++;; - REVH ++;; - REVW ++;; ------------------------------------------------------------------------- ++ ++;; Predicated integer unary operations. ++(define_insn "@aarch64_pred_" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w") ++ (unspec:SVE_FULL_I ++ [(match_operand: 1 "register_operand" "Upl") ++ (unspec:SVE_FULL_I ++ [(match_operand:SVE_FULL_I 2 "register_operand" "w")] ++ SVE_INT_UNARY)] ++ UNSPEC_PRED_X))] ++ "TARGET_SVE && >= " ++ "\t%0., %1/m, %2." ++) ++ ++;; Predicated integer unary operations with merging. ++(define_insn "@cond_" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w, ?&w") ++ (unspec:SVE_FULL_I ++ [(match_operand: 1 "register_operand" "Upl, Upl, Upl") ++ (unspec:SVE_FULL_I ++ [(match_operand:SVE_FULL_I 2 "register_operand" "w, w, w")] ++ SVE_INT_UNARY) ++ (match_operand:SVE_FULL_I 3 "aarch64_simd_reg_or_zero" "0, Dz, w")] ++ UNSPEC_SEL))] ++ "TARGET_SVE && >= " ++ "@ ++ \t%0., %1/m, %2. ++ movprfx\t%0., %1/z, %2.\;\t%0., %1/m, %2. ++ movprfx\t%0, %3\;\t%0., %1/m, %2." ++ [(set_attr "movprfx" "*,yes,yes")] ++) ++ ++;; ------------------------------------------------------------------------- ++;; ---- [INT] Sign extension ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - SXTB ++;; - SXTH ++;; - SXTW ++;; ------------------------------------------------------------------------- ++ ++;; Predicated SXT[BHW]. ++(define_insn "@aarch64_pred_sxt" ++ [(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=w") ++ (unspec:SVE_FULL_HSDI ++ [(match_operand: 1 "register_operand" "Upl") ++ (sign_extend:SVE_FULL_HSDI ++ (truncate:SVE_PARTIAL ++ (match_operand:SVE_FULL_HSDI 2 "register_operand" "w")))] ++ UNSPEC_PRED_X))] ++ "TARGET_SVE && (~ & ) == 0" ++ "sxt\t%0., %1/m, %2." ++) ++ ++;; Predicated SXT[BHW] with merging. ++(define_insn "@aarch64_cond_sxt" ++ [(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=w, ?&w, ?&w") ++ (unspec:SVE_FULL_HSDI ++ [(match_operand: 1 "register_operand" "Upl, Upl, Upl") ++ (sign_extend:SVE_FULL_HSDI ++ (truncate:SVE_PARTIAL ++ (match_operand:SVE_FULL_HSDI 2 "register_operand" "w, w, w"))) ++ (match_operand:SVE_FULL_HSDI 3 "aarch64_simd_reg_or_zero" "0, Dz, w")] ++ UNSPEC_SEL))] ++ "TARGET_SVE && (~ & ) == 0" ++ "@ ++ sxt\t%0., %1/m, %2. ++ movprfx\t%0., %1/z, %2.\;sxt\t%0., %1/m, %2. ++ movprfx\t%0, %3\;sxt\t%0., %1/m, %2." ++ [(set_attr "movprfx" "*,yes,yes")] ++) ++ ++;; ------------------------------------------------------------------------- ++;; ---- [INT] Zero extension ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - UXTB ++;; - UXTH ++;; - UXTW ++;; ------------------------------------------------------------------------- ++ ++;; Match UXT[BHW] as a conditional AND of a constant, merging with the ++;; first input. ++(define_insn "*cond_uxt_2" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w") ++ (unspec:SVE_FULL_I ++ [(match_operand: 1 "register_operand" "Upl, Upl") ++ (and:SVE_FULL_I ++ (match_operand:SVE_FULL_I 2 "register_operand" "0, w") ++ (match_operand:SVE_FULL_I 3 "aarch64_sve_uxt_immediate")) ++ (match_dup 2)] ++ UNSPEC_SEL))] ++ "TARGET_SVE" ++ "@ ++ uxt%e3\t%0., %1/m, %0. 
++ movprfx\t%0, %2\;uxt%e3\t%0., %1/m, %2." ++ [(set_attr "movprfx" "*,yes")] ++) ++ ++;; Match UXT[BHW] as a conditional AND of a constant, merging with an ++;; independent value. ++;; ++;; The earlyclobber isn't needed for the first alternative, but omitting ++;; it would only help the case in which operands 2 and 4 are the same, ++;; which is handled above rather than here. Marking all the alternatives ++;; as early-clobber helps to make the instruction more regular to the ++;; register allocator. ++(define_insn "*cond_uxt_any" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand" "=&w, ?&w, ?&w") ++ (unspec:SVE_FULL_I ++ [(match_operand: 1 "register_operand" "Upl, Upl, Upl") ++ (and:SVE_FULL_I ++ (match_operand:SVE_FULL_I 2 "register_operand" "w, w, w") ++ (match_operand:SVE_FULL_I 3 "aarch64_sve_uxt_immediate")) ++ (match_operand:SVE_FULL_I 4 "aarch64_simd_reg_or_zero" "0, Dz, w")] ++ UNSPEC_SEL))] ++ "TARGET_SVE && !rtx_equal_p (operands[2], operands[4])" ++ "@ ++ uxt%e3\t%0., %1/m, %2. ++ movprfx\t%0., %1/z, %2.\;uxt%e3\t%0., %1/m, %2. ++ movprfx\t%0, %4\;uxt%e3\t%0., %1/m, %2." ++ [(set_attr "movprfx" "*,yes,yes")] ++) ++ ++;; ------------------------------------------------------------------------- ++;; ---- [INT] Logical inverse ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - CNOT ++;; ------------------------------------------------------------------------- ++ ++;; Predicated logical inverse. ++(define_expand "@aarch64_pred_cnot" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand") ++ (unspec:SVE_FULL_I ++ [(unspec: ++ [(match_operand: 1 "register_operand") ++ (match_operand:SI 2 "aarch64_sve_ptrue_flag") ++ (eq: ++ (match_operand:SVE_FULL_I 3 "register_operand") ++ (match_dup 4))] ++ UNSPEC_PRED_Z) ++ (match_dup 5) ++ (match_dup 4)] ++ UNSPEC_SEL))] ++ "TARGET_SVE" ++ { ++ operands[4] = CONST0_RTX (mode); ++ operands[5] = CONST1_RTX (mode); ++ } ++) ++ ++(define_insn "*cnot" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w") ++ (unspec:SVE_FULL_I ++ [(unspec: ++ [(match_operand: 1 "register_operand" "Upl") ++ (match_operand:SI 5 "aarch64_sve_ptrue_flag") ++ (eq: ++ (match_operand:SVE_FULL_I 2 "register_operand" "w") ++ (match_operand:SVE_FULL_I 3 "aarch64_simd_imm_zero"))] ++ UNSPEC_PRED_Z) ++ (match_operand:SVE_FULL_I 4 "aarch64_simd_imm_one") ++ (match_dup 3)] ++ UNSPEC_SEL))] ++ "TARGET_SVE" ++ "cnot\t%0., %1/m, %2." ++) ++ ++;; Predicated logical inverse with merging. ++(define_expand "@cond_cnot" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand") ++ (unspec:SVE_FULL_I ++ [(match_operand: 1 "register_operand") ++ (unspec:SVE_FULL_I ++ [(unspec: ++ [(match_dup 4) ++ (const_int SVE_KNOWN_PTRUE) ++ (eq: ++ (match_operand:SVE_FULL_I 2 "register_operand") ++ (match_dup 5))] ++ UNSPEC_PRED_Z) ++ (match_dup 6) ++ (match_dup 5)] ++ UNSPEC_SEL) ++ (match_operand:SVE_FULL_I 3 "aarch64_simd_reg_or_zero")] ++ UNSPEC_SEL))] ++ "TARGET_SVE" ++ { ++ operands[4] = CONSTM1_RTX (mode); ++ operands[5] = CONST0_RTX (mode); ++ operands[6] = CONST1_RTX (mode); ++ } ++) ++ ++;; Predicated logical inverse, merging with the first input. ++(define_insn_and_rewrite "*cond_cnot_2" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w") ++ (unspec:SVE_FULL_I ++ [(match_operand: 1 "register_operand" "Upl, Upl") ++ ;; Logical inverse of operand 2 (as above). 
++ (unspec:SVE_FULL_I ++ [(unspec: ++ [(match_operand 5) ++ (const_int SVE_KNOWN_PTRUE) ++ (eq: ++ (match_operand:SVE_FULL_I 2 "register_operand" "0, w") ++ (match_operand:SVE_FULL_I 3 "aarch64_simd_imm_zero"))] ++ UNSPEC_PRED_Z) ++ (match_operand:SVE_FULL_I 4 "aarch64_simd_imm_one") ++ (match_dup 3)] ++ UNSPEC_SEL) ++ (match_dup 2)] ++ UNSPEC_SEL))] ++ "TARGET_SVE" ++ "@ ++ cnot\t%0., %1/m, %0. ++ movprfx\t%0, %2\;cnot\t%0., %1/m, %2." ++ "&& !CONSTANT_P (operands[5])" ++ { ++ operands[5] = CONSTM1_RTX (mode); ++ } ++ [(set_attr "movprfx" "*,yes")] ++) ++ ++;; Predicated logical inverse, merging with an independent value. ++;; ++;; The earlyclobber isn't needed for the first alternative, but omitting ++;; it would only help the case in which operands 2 and 6 are the same, ++;; which is handled above rather than here. Marking all the alternatives ++;; as earlyclobber helps to make the instruction more regular to the ++;; register allocator. ++(define_insn_and_rewrite "*cond_cnot_any" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand" "=&w, ?&w, ?&w") ++ (unspec:SVE_FULL_I ++ [(match_operand: 1 "register_operand" "Upl, Upl, Upl") ++ ;; Logical inverse of operand 2 (as above). ++ (unspec:SVE_FULL_I ++ [(unspec: ++ [(match_operand 5) ++ (const_int SVE_KNOWN_PTRUE) ++ (eq: ++ (match_operand:SVE_FULL_I 2 "register_operand" "w, w, w") ++ (match_operand:SVE_FULL_I 3 "aarch64_simd_imm_zero"))] ++ UNSPEC_PRED_Z) ++ (match_operand:SVE_FULL_I 4 "aarch64_simd_imm_one") ++ (match_dup 3)] ++ UNSPEC_SEL) ++ (match_operand:SVE_FULL_I 6 "aarch64_simd_reg_or_zero" "0, Dz, w")] ++ UNSPEC_SEL))] ++ "TARGET_SVE && !rtx_equal_p (operands[2], operands[6])" ++ "@ ++ cnot\t%0., %1/m, %2. ++ movprfx\t%0., %1/z, %2.\;cnot\t%0., %1/m, %2. ++ movprfx\t%0, %6\;cnot\t%0., %1/m, %2." ++ "&& !CONSTANT_P (operands[5])" ++ { ++ operands[5] = CONSTM1_RTX (mode); ++ } ++ [(set_attr "movprfx" "*,yes,yes")] ++) ++ ++;; ------------------------------------------------------------------------- ++;; ---- [FP<-INT] General unary arithmetic that maps to unspecs ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - FEXPA ++;; ------------------------------------------------------------------------- ++ ++;; Unpredicated unary operations that take an integer and return a float. ++(define_insn "@aarch64_sve_" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w") ++ (unspec:SVE_FULL_F ++ [(match_operand: 1 "register_operand" "w")] ++ SVE_FP_UNARY_INT))] ++ "TARGET_SVE" ++ "\t%0., %1." ++) ++ ++;; ------------------------------------------------------------------------- ++;; ---- [FP] General unary arithmetic corresponding to unspecs ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - FABS ++;; - FNEG ++;; - FRECPE ++;; - FRECPX ++;; - FRINTA ++;; - FRINTI ++;; - FRINTM ++;; - FRINTN ++;; - FRINTP ++;; - FRINTX ++;; - FRINTZ ++;; - FRSQRT ++;; - FSQRT ++;; ------------------------------------------------------------------------- ++ ++;; Unpredicated floating-point unary operations. ++(define_insn "@aarch64_sve_" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w") ++ (unspec:SVE_FULL_F ++ [(match_operand:SVE_FULL_F 1 "register_operand" "w")] ++ SVE_FP_UNARY))] ++ "TARGET_SVE" ++ "\t%0., %1." ++) ++ ++;; Unpredicated floating-point unary operations. 
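++;; These wrap the operation in a conditional unspec governed by an all-true
++;; predicate created by aarch64_ptrue_reg, so that a plain vector operation
++;; ends up as a predicated instruction; for example, with .s elements this
++;; would typically be fabs z0.s, p0/m, z1.s (register numbers illustrative).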
++(define_expand "2" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand") ++ (unspec:SVE_FULL_F ++ [(match_dup 2) ++ (const_int SVE_RELAXED_GP) ++ (match_operand:SVE_FULL_F 1 "register_operand")] ++ SVE_COND_FP_UNARY))] ++ "TARGET_SVE" ++ { ++ operands[2] = aarch64_ptrue_reg (mode); ++ } ++) ++ ++;; Predicated floating-point unary operations. ++(define_insn "@aarch64_pred_" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w") ++ (unspec:SVE_FULL_F ++ [(match_operand: 1 "register_operand" "Upl") ++ (match_operand:SI 3 "aarch64_sve_gp_strictness") ++ (match_operand:SVE_FULL_F 2 "register_operand" "w")] ++ SVE_COND_FP_UNARY))] ++ "TARGET_SVE" ++ "\t%0., %1/m, %2." ++) ++ ++;; Predicated floating-point unary arithmetic with merging. ++(define_expand "@cond_" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand") ++ (unspec:SVE_FULL_F ++ [(match_operand: 1 "register_operand") ++ (unspec:SVE_FULL_F ++ [(match_dup 1) ++ (const_int SVE_STRICT_GP) ++ (match_operand:SVE_FULL_F 2 "register_operand")] ++ SVE_COND_FP_UNARY) ++ (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero")] ++ UNSPEC_SEL))] ++ "TARGET_SVE" ++) ++ ++;; Predicated floating-point unary arithmetic, merging with the first input. ++(define_insn_and_rewrite "*cond__2" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") ++ (unspec:SVE_FULL_F ++ [(match_operand: 1 "register_operand" "Upl, Upl") ++ (unspec:SVE_FULL_F ++ [(match_operand 3) ++ (match_operand:SI 4 "aarch64_sve_gp_strictness") ++ (match_operand:SVE_FULL_F 2 "register_operand" "0, w")] ++ SVE_COND_FP_UNARY) ++ (match_dup 2)] ++ UNSPEC_SEL))] ++ "TARGET_SVE && aarch64_sve_pred_dominates_p (&operands[3], operands[1])" ++ "@ ++ \t%0., %1/m, %0. ++ movprfx\t%0, %2\;\t%0., %1/m, %2." ++ "&& !rtx_equal_p (operands[1], operands[3])" ++ { ++ operands[3] = copy_rtx (operands[1]); ++ } ++ [(set_attr "movprfx" "*,yes")] ++) ++ ++;; Predicated floating-point unary arithmetic, merging with an independent ++;; value. ++;; ++;; The earlyclobber isn't needed for the first alternative, but omitting ++;; it would only help the case in which operands 2 and 3 are the same, ++;; which is handled above rather than here. Marking all the alternatives ++;; as earlyclobber helps to make the instruction more regular to the ++;; register allocator. ++(define_insn_and_rewrite "*cond__any" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand" "=&w, ?&w, ?&w") ++ (unspec:SVE_FULL_F ++ [(match_operand: 1 "register_operand" "Upl, Upl, Upl") ++ (unspec:SVE_FULL_F ++ [(match_operand 4) ++ (match_operand:SI 5 "aarch64_sve_gp_strictness") ++ (match_operand:SVE_FULL_F 2 "register_operand" "w, w, w")] ++ SVE_COND_FP_UNARY) ++ (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero" "0, Dz, w")] ++ UNSPEC_SEL))] ++ "TARGET_SVE ++ && !rtx_equal_p (operands[2], operands[3]) ++ && aarch64_sve_pred_dominates_p (&operands[4], operands[1])" ++ "@ ++ \t%0., %1/m, %2. ++ movprfx\t%0., %1/z, %2.\;\t%0., %1/m, %2. ++ movprfx\t%0, %3\;\t%0., %1/m, %2." ++ "&& !rtx_equal_p (operands[1], operands[4])" ++ { ++ operands[4] = copy_rtx (operands[1]); ++ } ++ [(set_attr "movprfx" "*,yes,yes")] ++) ++ ++;; ------------------------------------------------------------------------- ++;; ---- [PRED] Inverse ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - NOT ++;; ------------------------------------------------------------------------- ++ ++;; Unpredicated predicate inverse. 
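++;; A rough example: with an all-true predicate p3 created by
++;; aarch64_ptrue_reg, inverting p1 into p0 becomes
++;; not p0.b, p3/z, p1.b (register numbers illustrative).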
++(define_expand "one_cmpl2" ++ [(set (match_operand:PRED_ALL 0 "register_operand") ++ (and:PRED_ALL ++ (not:PRED_ALL (match_operand:PRED_ALL 1 "register_operand")) ++ (match_dup 2)))] ++ "TARGET_SVE" ++ { ++ operands[2] = aarch64_ptrue_reg (mode); ++ } ++) ++ ++;; Predicated predicate inverse. ++(define_insn "*one_cmpl3" ++ [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa") ++ (and:PRED_ALL ++ (not:PRED_ALL (match_operand:PRED_ALL 2 "register_operand" "Upa")) ++ (match_operand:PRED_ALL 1 "register_operand" "Upa")))] ++ "TARGET_SVE" ++ "not\t%0.b, %1/z, %2.b" ++) ++ ++;; ========================================================================= ++;; == Binary arithmetic ++;; ========================================================================= ++ ++;; ------------------------------------------------------------------------- ++;; ---- [INT] General binary arithmetic corresponding to rtx codes ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - ADD (merging form only) ++;; - AND (merging form only) ++;; - ASR (merging form only) ++;; - EOR (merging form only) ++;; - LSL (merging form only) ++;; - LSR (merging form only) ++;; - MUL ++;; - ORR (merging form only) ++;; - SMAX ++;; - SMIN ++;; - SUB (merging form only) ++;; - UMAX ++;; - UMIN ++;; ------------------------------------------------------------------------- ++ ++;; Unpredicated integer binary operations that have an immediate form. ++(define_expand "3" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand") ++ (unspec:SVE_FULL_I ++ [(match_dup 3) ++ (SVE_INT_BINARY_IMM:SVE_FULL_I ++ (match_operand:SVE_FULL_I 1 "register_operand") ++ (match_operand:SVE_FULL_I 2 "aarch64_sve__operand"))] ++ UNSPEC_PRED_X))] ++ "TARGET_SVE" ++ { ++ operands[3] = aarch64_ptrue_reg (mode); ++ } ++) ++ ++;; Integer binary operations that have an immediate form, predicated ++;; with a PTRUE. We don't actually need the predicate for the first ++;; and third alternatives, but using Upa or X isn't likely to gain much ++;; and would make the instruction seem less uniform to the register ++;; allocator. ++(define_insn_and_split "@aarch64_pred_" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, w, ?&w, ?&w") ++ (unspec:SVE_FULL_I ++ [(match_operand: 1 "register_operand" "Upl, Upl, Upl, Upl") ++ (SVE_INT_BINARY_IMM:SVE_FULL_I ++ (match_operand:SVE_FULL_I 2 "register_operand" "%0, 0, w, w") ++ (match_operand:SVE_FULL_I 3 "aarch64_sve__operand" ", w, , w"))] ++ UNSPEC_PRED_X))] ++ "TARGET_SVE" ++ "@ ++ # ++ \t%0., %1/m, %0., %3. ++ # ++ movprfx\t%0, %2\;\t%0., %1/m, %0., %3." ++ ; Split the unpredicated form after reload, so that we don't have ++ ; the unnecessary PTRUE. ++ "&& reload_completed ++ && !register_operand (operands[3], mode)" ++ [(set (match_dup 0) ++ (SVE_INT_BINARY_IMM:SVE_FULL_I (match_dup 2) (match_dup 3)))] ++ "" ++ [(set_attr "movprfx" "*,*,yes,yes")] ++) ++ ++;; Unpredicated binary operations with a constant (post-RA only). ++;; These are generated by splitting a predicated instruction whose ++;; predicate is unused. ++(define_insn "*post_ra_3" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w") ++ (SVE_INT_BINARY_IMM:SVE_FULL_I ++ (match_operand:SVE_FULL_I 1 "register_operand" "0, w") ++ (match_operand:SVE_FULL_I 2 "aarch64_sve__immediate")))] ++ "TARGET_SVE && reload_completed" ++ "@ ++ \t%0., %0., #%2 ++ movprfx\t%0, %1\;\t%0., %0., #%2" ++ [(set_attr "movprfx" "*,yes")] ++) ++ ++;; Predicated integer operations with merging. 
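++;; The UNSPEC_SEL keeps the result of the operation in active lanes and
++;; operand 4 in inactive lanes.  When the destination is tied to the first
++;; input this needs only a single instruction, e.g.
++;; add z0.s, p0/m, z0.s, z1.s; otherwise the patterns below fall back to
++;; MOVPRFX sequences.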
++(define_expand "@cond_" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand") ++ (unspec:SVE_FULL_I ++ [(match_operand: 1 "register_operand") ++ (SVE_INT_BINARY:SVE_FULL_I ++ (match_operand:SVE_FULL_I 2 "register_operand") ++ (match_operand:SVE_FULL_I 3 "")) ++ (match_operand:SVE_FULL_I 4 "aarch64_simd_reg_or_zero")] ++ UNSPEC_SEL))] ++ "TARGET_SVE" ++) ++ ++;; Predicated integer operations, merging with the first input. ++(define_insn "*cond__2" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w") ++ (unspec:SVE_FULL_I ++ [(match_operand: 1 "register_operand" "Upl, Upl") ++ (SVE_INT_BINARY:SVE_FULL_I ++ (match_operand:SVE_FULL_I 2 "register_operand" "0, w") ++ (match_operand:SVE_FULL_I 3 "register_operand" "w, w")) ++ (match_dup 2)] ++ UNSPEC_SEL))] ++ "TARGET_SVE" ++ "@ ++ \t%0., %1/m, %0., %3. ++ movprfx\t%0, %2\;\t%0., %1/m, %0., %3." ++ [(set_attr "movprfx" "*,yes")] ++) ++ ++;; Predicated integer operations, merging with the second input. ++(define_insn "*cond__3" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w") ++ (unspec:SVE_FULL_I ++ [(match_operand: 1 "register_operand" "Upl, Upl") ++ (SVE_INT_BINARY:SVE_FULL_I ++ (match_operand:SVE_FULL_I 2 "register_operand" "w, w") ++ (match_operand:SVE_FULL_I 3 "register_operand" "0, w")) ++ (match_dup 3)] ++ UNSPEC_SEL))] ++ "TARGET_SVE" ++ "@ ++ \t%0., %1/m, %0., %2. ++ movprfx\t%0, %3\;\t%0., %1/m, %0., %2." ++ [(set_attr "movprfx" "*,yes")] ++) ++ ++;; Predicated integer operations, merging with an independent value. ++(define_insn_and_rewrite "*cond__any" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand" "=&w, &w, &w, &w, ?&w") ++ (unspec:SVE_FULL_I ++ [(match_operand: 1 "register_operand" "Upl, Upl, Upl, Upl, Upl") ++ (SVE_INT_BINARY:SVE_FULL_I ++ (match_operand:SVE_FULL_I 2 "register_operand" "0, w, w, w, w") ++ (match_operand:SVE_FULL_I 3 "register_operand" "w, 0, w, w, w")) ++ (match_operand:SVE_FULL_I 4 "aarch64_simd_reg_or_zero" "Dz, Dz, Dz, 0, w")] ++ UNSPEC_SEL))] ++ "TARGET_SVE ++ && !rtx_equal_p (operands[2], operands[4]) ++ && !rtx_equal_p (operands[3], operands[4])" ++ "@ ++ movprfx\t%0., %1/z, %0.\;\t%0., %1/m, %0., %3. ++ movprfx\t%0., %1/z, %0.\;\t%0., %1/m, %0., %2. ++ movprfx\t%0., %1/z, %2.\;\t%0., %1/m, %0., %3. ++ movprfx\t%0., %1/m, %2.\;\t%0., %1/m, %0., %3. ++ #" ++ "&& reload_completed ++ && register_operand (operands[4], mode) ++ && !rtx_equal_p (operands[0], operands[4])" ++ { ++ emit_insn (gen_vcond_mask_ (operands[0], operands[2], ++ operands[4], operands[1])); ++ operands[4] = operands[2] = operands[0]; ++ } ++ [(set_attr "movprfx" "yes")] ++) ++ ++;; ------------------------------------------------------------------------- ++;; ---- [INT] Addition ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - ADD ++;; - DECB ++;; - DECD ++;; - DECH ++;; - DECW ++;; - INCB ++;; - INCD ++;; - INCH ++;; - INCW ++;; - SUB ++;; ------------------------------------------------------------------------- ++ ++(define_insn "add3" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, w, w, ?w, ?w, w") ++ (plus:SVE_FULL_I ++ (match_operand:SVE_FULL_I 1 "register_operand" "%0, 0, 0, w, w, w") ++ (match_operand:SVE_FULL_I 2 "aarch64_sve_add_operand" "vsa, vsn, vsi, vsa, vsn, w")))] ++ "TARGET_SVE" ++ "@ ++ add\t%0., %0., #%D2 ++ sub\t%0., %0., #%N2 ++ * return aarch64_output_sve_vector_inc_dec (\"%0.\", operands[2]); ++ movprfx\t%0, %1\;add\t%0., %0., #%D2 ++ movprfx\t%0, %1\;sub\t%0., %0., #%N2 ++ add\t%0., %1., %2." 
++ [(set_attr "movprfx" "*,*,*,yes,yes,*")] ++) ++ ++;; Merging forms are handled through SVE_INT_BINARY. ++ ++;; ------------------------------------------------------------------------- ++;; ---- [INT] Subtraction ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - SUB ++;; - SUBR ++;; ------------------------------------------------------------------------- ++ ++(define_insn "sub3" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, w, ?&w") ++ (minus:SVE_FULL_I ++ (match_operand:SVE_FULL_I 1 "aarch64_sve_arith_operand" "w, vsa, vsa") ++ (match_operand:SVE_FULL_I 2 "register_operand" "w, 0, w")))] ++ "TARGET_SVE" ++ "@ ++ sub\t%0., %1., %2. ++ subr\t%0., %0., #%D1 ++ movprfx\t%0, %2\;subr\t%0., %0., #%D1" ++ [(set_attr "movprfx" "*,*,yes")] ++) ++ ++;; Merging forms are handled through SVE_INT_BINARY. ++ ++;; ------------------------------------------------------------------------- ++;; ---- [INT] Take address ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - ADR ++;; ------------------------------------------------------------------------- ++ ++;; An unshifted and unscaled ADR. This is functionally equivalent to an ADD, ++;; but the svadrb intrinsics should preserve the user's choice. ++(define_insn "@aarch64_adr" ++ [(set (match_operand:SVE_FULL_SDI 0 "register_operand" "=w") ++ (unspec:SVE_FULL_SDI ++ [(match_operand:SVE_FULL_SDI 1 "register_operand" "w") ++ (match_operand:SVE_FULL_SDI 2 "register_operand" "w")] ++ UNSPEC_ADR))] ++ "TARGET_SVE" ++ "adr\t%0., [%1., %2.]" ++) ++ ++;; Same, but with the offset being sign-extended from the low 32 bits. ++(define_insn_and_rewrite "*aarch64_adr_sxtw" ++ [(set (match_operand:VNx2DI 0 "register_operand" "=w") ++ (unspec:VNx2DI ++ [(match_operand:VNx2DI 1 "register_operand" "w") ++ (unspec:VNx2DI ++ [(match_operand 3) ++ (sign_extend:VNx2DI ++ (truncate:VNx2SI ++ (match_operand:VNx2DI 2 "register_operand" "w")))] ++ UNSPEC_PRED_X)] ++ UNSPEC_ADR))] ++ "TARGET_SVE" ++ "adr\t%0.d, [%1.d, %2.d, sxtw]" ++ "&& !CONSTANT_P (operands[3])" ++ { ++ operands[3] = CONSTM1_RTX (VNx2BImode); ++ } ++) ++ ++;; Same, but with the offset being zero-extended from the low 32 bits. ++(define_insn "*aarch64_adr_uxtw_unspec" ++ [(set (match_operand:VNx2DI 0 "register_operand" "=w") ++ (unspec:VNx2DI ++ [(match_operand:VNx2DI 1 "register_operand" "w") ++ (and:VNx2DI ++ (match_operand:VNx2DI 2 "register_operand" "w") ++ (match_operand:VNx2DI 3 "aarch64_sve_uxtw_immediate"))] ++ UNSPEC_ADR))] ++ "TARGET_SVE" ++ "adr\t%0.d, [%1.d, %2.d, uxtw]" ++) ++ ++;; Same, matching as a PLUS rather than unspec. ++(define_insn "*aarch64_adr_uxtw_and" ++ [(set (match_operand:VNx2DI 0 "register_operand" "=w") ++ (plus:VNx2DI ++ (and:VNx2DI ++ (match_operand:VNx2DI 2 "register_operand" "w") ++ (match_operand:VNx2DI 3 "aarch64_sve_uxtw_immediate")) ++ (match_operand:VNx2DI 1 "register_operand" "w")))] ++ "TARGET_SVE" ++ "adr\t%0.d, [%1.d, %2.d, uxtw]" ++) ++ ++;; ADR with a nonzero shift. 
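++;; The shift amount (1 to 3) scales the vector offset by 2, 4 or 8 bytes;
++;; for example, adr z0.s, [z1.s, z2.s, lsl 2] adds each element of z2
++;; multiplied by 4.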
++(define_expand "@aarch64_adr_shift" ++ [(set (match_operand:SVE_FULL_SDI 0 "register_operand") ++ (plus:SVE_FULL_SDI ++ (unspec:SVE_FULL_SDI ++ [(match_dup 4) ++ (ashift:SVE_FULL_SDI ++ (match_operand:SVE_FULL_SDI 2 "register_operand") ++ (match_operand:SVE_FULL_SDI 3 "const_1_to_3_operand"))] ++ UNSPEC_PRED_X) ++ (match_operand:SVE_FULL_SDI 1 "register_operand")))] ++ "TARGET_SVE" ++ { ++ operands[4] = CONSTM1_RTX (mode); ++ } ++) ++ ++(define_insn_and_rewrite "*aarch64_adr_shift" ++ [(set (match_operand:SVE_FULL_SDI 0 "register_operand" "=w") ++ (plus:SVE_FULL_SDI ++ (unspec:SVE_FULL_SDI ++ [(match_operand 4) ++ (ashift:SVE_FULL_SDI ++ (match_operand:SVE_FULL_SDI 2 "register_operand" "w") ++ (match_operand:SVE_FULL_SDI 3 "const_1_to_3_operand"))] ++ UNSPEC_PRED_X) ++ (match_operand:SVE_FULL_SDI 1 "register_operand" "w")))] ++ "TARGET_SVE" ++ "adr\t%0., [%1., %2., lsl %3]" ++ "&& !CONSTANT_P (operands[4])" ++ { ++ operands[4] = CONSTM1_RTX (mode); ++ } ++) ++ ++;; Same, but with the index being sign-extended from the low 32 bits. ++(define_insn_and_rewrite "*aarch64_adr_shift_sxtw" ++ [(set (match_operand:VNx2DI 0 "register_operand" "=w") ++ (plus:VNx2DI ++ (unspec:VNx2DI ++ [(match_operand 4) ++ (ashift:VNx2DI ++ (unspec:VNx2DI ++ [(match_operand 5) ++ (sign_extend:VNx2DI ++ (truncate:VNx2SI ++ (match_operand:VNx2DI 2 "register_operand" "w")))] ++ UNSPEC_PRED_X) ++ (match_operand:VNx2DI 3 "const_1_to_3_operand"))] ++ UNSPEC_PRED_X) ++ (match_operand:VNx2DI 1 "register_operand" "w")))] ++ "TARGET_SVE" ++ "adr\t%0.d, [%1.d, %2.d, sxtw %3]" ++ "&& (!CONSTANT_P (operands[4]) || !CONSTANT_P (operands[5]))" ++ { ++ operands[5] = operands[4] = CONSTM1_RTX (VNx2BImode); ++ } ++) ++ ++;; Same, but with the index being zero-extended from the low 32 bits. ++(define_insn_and_rewrite "*aarch64_adr_shift_uxtw" ++ [(set (match_operand:VNx2DI 0 "register_operand" "=w") ++ (plus:VNx2DI ++ (unspec:VNx2DI ++ [(match_operand 5) ++ (ashift:VNx2DI ++ (and:VNx2DI ++ (match_operand:VNx2DI 2 "register_operand" "w") ++ (match_operand:VNx2DI 4 "aarch64_sve_uxtw_immediate")) ++ (match_operand:VNx2DI 3 "const_1_to_3_operand"))] ++ UNSPEC_PRED_X) ++ (match_operand:VNx2DI 1 "register_operand" "w")))] ++ "TARGET_SVE" ++ "adr\t%0.d, [%1.d, %2.d, uxtw %3]" ++ "&& !CONSTANT_P (operands[5])" ++ { ++ operands[5] = CONSTM1_RTX (VNx2BImode); ++ } ++) ++ ++;; ------------------------------------------------------------------------- ++;; ---- [INT] Absolute difference ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - SABD ++;; - UABD ++;; ------------------------------------------------------------------------- ++ ++;; Unpredicated integer absolute difference. ++(define_expand "abd_3" ++ [(use (match_operand:SVE_FULL_I 0 "register_operand")) ++ (USMAX:SVE_FULL_I ++ (match_operand:SVE_FULL_I 1 "register_operand") ++ (match_operand:SVE_FULL_I 2 "register_operand"))] ++ "TARGET_SVE" ++ { ++ rtx pred = aarch64_ptrue_reg (mode); ++ emit_insn (gen_aarch64_pred_abd (operands[0], pred, operands[1], ++ operands[2])); ++ DONE; ++ } ++) ++ ++;; Predicated integer absolute difference. 
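++;; The RTL models ABD as the difference between the maximum and the minimum
++;; of the two inputs; for signed .s elements this would emit, for example,
++;; sabd z0.s, p0/m, z0.s, z1.s.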
++(define_insn "@aarch64_pred_abd" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w") ++ (unspec:SVE_FULL_I ++ [(match_operand: 1 "register_operand" "Upl, Upl") ++ (minus:SVE_FULL_I ++ (USMAX:SVE_FULL_I ++ (match_operand:SVE_FULL_I 2 "register_operand" "%0, w") ++ (match_operand:SVE_FULL_I 3 "register_operand" "w, w")) ++ (:SVE_FULL_I ++ (match_dup 2) ++ (match_dup 3)))] ++ UNSPEC_PRED_X))] ++ "TARGET_SVE" ++ "@ ++ abd\t%0., %1/m, %0., %3. ++ movprfx\t%0, %2\;abd\t%0., %1/m, %0., %3." ++ [(set_attr "movprfx" "*,yes")] ++) ++ ++(define_expand "@aarch64_cond_abd" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand") ++ (unspec:SVE_FULL_I ++ [(match_operand: 1 "register_operand") ++ (minus:SVE_FULL_I ++ (unspec:SVE_FULL_I ++ [(match_dup 1) ++ (USMAX:SVE_FULL_I ++ (match_operand:SVE_FULL_I 2 "register_operand") ++ (match_operand:SVE_FULL_I 3 "register_operand"))] ++ UNSPEC_PRED_X) ++ (unspec:SVE_FULL_I ++ [(match_dup 1) ++ (:SVE_FULL_I ++ (match_dup 2) ++ (match_dup 3))] ++ UNSPEC_PRED_X)) ++ (match_operand:SVE_FULL_I 4 "aarch64_simd_reg_or_zero")] ++ UNSPEC_SEL))] ++ "TARGET_SVE" ++{ ++ if (rtx_equal_p (operands[3], operands[4])) ++ std::swap (operands[2], operands[3]); ++}) ++ ++;; Predicated integer absolute difference, merging with the first input. ++(define_insn_and_rewrite "*aarch64_cond_abd_2" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w") ++ (unspec:SVE_FULL_I ++ [(match_operand: 1 "register_operand" "Upl, Upl") ++ (minus:SVE_FULL_I ++ (unspec:SVE_FULL_I ++ [(match_operand 4) ++ (USMAX:SVE_FULL_I ++ (match_operand:SVE_FULL_I 2 "register_operand" "0, w") ++ (match_operand:SVE_FULL_I 3 "register_operand" "w, w"))] ++ UNSPEC_PRED_X) ++ (unspec:SVE_FULL_I ++ [(match_operand 5) ++ (:SVE_FULL_I ++ (match_dup 2) ++ (match_dup 3))] ++ UNSPEC_PRED_X)) ++ (match_dup 2)] ++ UNSPEC_SEL))] ++ "TARGET_SVE" ++ "@ ++ abd\t%0., %1/m, %0., %3. ++ movprfx\t%0, %2\;abd\t%0., %1/m, %0., %3." ++ "&& (!CONSTANT_P (operands[4]) || !CONSTANT_P (operands[5]))" ++ { ++ operands[4] = operands[5] = CONSTM1_RTX (mode); ++ } ++ [(set_attr "movprfx" "*,yes")] ++) ++ ++;; Predicated integer absolute difference, merging with an independent value. ++(define_insn_and_rewrite "*aarch64_cond_abd_any" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand" "=&w, &w, &w, &w, ?&w") ++ (unspec:SVE_FULL_I ++ [(match_operand: 1 "register_operand" "Upl, Upl, Upl, Upl, Upl") ++ (minus:SVE_FULL_I ++ (unspec:SVE_FULL_I ++ [(match_operand 5) ++ (USMAX:SVE_FULL_I ++ (match_operand:SVE_FULL_I 2 "register_operand" "0, w, w, w, w") ++ (match_operand:SVE_FULL_I 3 "register_operand" "w, 0, w, w, w"))] ++ UNSPEC_PRED_X) ++ (unspec:SVE_FULL_I ++ [(match_operand 6) ++ (:SVE_FULL_I ++ (match_dup 2) ++ (match_dup 3))] ++ UNSPEC_PRED_X)) ++ (match_operand:SVE_FULL_I 4 "aarch64_simd_reg_or_zero" "Dz, Dz, Dz, 0, w")] ++ UNSPEC_SEL))] ++ "TARGET_SVE ++ && !rtx_equal_p (operands[2], operands[4]) ++ && !rtx_equal_p (operands[3], operands[4])" ++ "@ ++ movprfx\t%0., %1/z, %0.\;abd\t%0., %1/m, %0., %3. ++ movprfx\t%0., %1/z, %0.\;abd\t%0., %1/m, %0., %2. ++ movprfx\t%0., %1/z, %2.\;abd\t%0., %1/m, %0., %3. ++ movprfx\t%0., %1/m, %2.\;abd\t%0., %1/m, %0., %3. 
++ #" ++ "&& 1" ++ { ++ if (!CONSTANT_P (operands[5]) || !CONSTANT_P (operands[6])) ++ operands[5] = operands[6] = CONSTM1_RTX (mode); ++ else if (reload_completed ++ && register_operand (operands[4], mode) ++ && !rtx_equal_p (operands[0], operands[4])) ++ { ++ emit_insn (gen_vcond_mask_ (operands[0], operands[2], ++ operands[4], operands[1])); ++ operands[4] = operands[2] = operands[0]; ++ } ++ else ++ FAIL; ++ } ++ [(set_attr "movprfx" "yes")] ++) ++ ++;; ------------------------------------------------------------------------- ++;; ---- [INT] Saturating addition and subtraction ++;; ------------------------------------------------------------------------- ++;; - SQADD ++;; - SQSUB ++;; - UQADD ++;; - UQSUB ++;; ------------------------------------------------------------------------- ++ ++;; Unpredicated saturating signed addition and subtraction. ++(define_insn "@aarch64_" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, w, ?&w, ?&w, w") ++ (SBINQOPS:SVE_FULL_I ++ (match_operand:SVE_FULL_I 1 "register_operand" "0, 0, w, w, w") ++ (match_operand:SVE_FULL_I 2 "aarch64_sve_sqadd_operand" "vsQ, vsS, vsQ, vsS, w")))] ++ "TARGET_SVE" ++ "@ ++ \t%0., %0., #%D2 ++ \t%0., %0., #%N2 ++ movprfx\t%0, %1\;\t%0., %0., #%D2 ++ movprfx\t%0, %1\;\t%0., %0., #%N2 ++ \t%0., %1., %2." ++ [(set_attr "movprfx" "*,*,yes,yes,*")] ++) ++ ++;; Unpredicated saturating unsigned addition and subtraction. ++(define_insn "@aarch64_" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w, w") ++ (UBINQOPS:SVE_FULL_I ++ (match_operand:SVE_FULL_I 1 "register_operand" "0, w, w") ++ (match_operand:SVE_FULL_I 2 "aarch64_sve_arith_operand" "vsa, vsa, w")))] ++ "TARGET_SVE" ++ "@ ++ \t%0., %0., #%D2 ++ movprfx\t%0, %1\;\t%0., %0., #%D2 ++ \t%0., %1., %2." ++ [(set_attr "movprfx" "*,yes,*")] ++) ++ ++;; ------------------------------------------------------------------------- ++;; ---- [INT] Highpart multiplication ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - SMULH ++;; - UMULH ++;; ------------------------------------------------------------------------- ++ ++;; Unpredicated highpart multiplication. ++(define_expand "mul3_highpart" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand") ++ (unspec:SVE_FULL_I ++ [(match_dup 3) ++ (unspec:SVE_FULL_I ++ [(match_operand:SVE_FULL_I 1 "register_operand") ++ (match_operand:SVE_FULL_I 2 "register_operand")] ++ MUL_HIGHPART)] ++ UNSPEC_PRED_X))] ++ "TARGET_SVE" ++ { ++ operands[3] = aarch64_ptrue_reg (mode); ++ } ++) ++ ++;; Predicated highpart multiplication. ++(define_insn "@aarch64_pred_" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w") ++ (unspec:SVE_FULL_I ++ [(match_operand: 1 "register_operand" "Upl, Upl") ++ (unspec:SVE_FULL_I ++ [(match_operand:SVE_FULL_I 2 "register_operand" "%0, w") ++ (match_operand:SVE_FULL_I 3 "register_operand" "w, w")] ++ MUL_HIGHPART)] ++ UNSPEC_PRED_X))] ++ "TARGET_SVE" ++ "@ ++ mulh\t%0., %1/m, %0., %3. ++ movprfx\t%0, %2\;mulh\t%0., %1/m, %0., %3." ++ [(set_attr "movprfx" "*,yes")] ++) ++ ++;; Predicated highpart multiplications with merging. 
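++;; As the expander below notes, only the case in which the fallback value is
++;; the first input (or a constant) needs to be handled.  For example,
++;; smulh z0.s, p0/m, z0.s, z1.s leaves the high 32 bits of each 64-bit
++;; product in z0.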
++(define_expand "@cond_" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand") ++ (unspec:SVE_FULL_I ++ [(match_operand: 1 "register_operand") ++ (unspec:SVE_FULL_I ++ [(match_operand:SVE_FULL_I 2 "register_operand") ++ (match_operand:SVE_FULL_I 3 "register_operand")] ++ MUL_HIGHPART) ++ (match_operand:SVE_FULL_I 4 "aarch64_simd_reg_or_zero")] ++ UNSPEC_SEL))] ++ "TARGET_SVE" ++{ ++ /* Only target code is aware of these operations, so we don't need ++ to handle the fully-general case. */ ++ gcc_assert (rtx_equal_p (operands[2], operands[4]) ++ || CONSTANT_P (operands[4])); ++}) ++ ++;; Predicated highpart multiplications, merging with the first input. ++(define_insn "*cond__2" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w") ++ (unspec:SVE_FULL_I ++ [(match_operand: 1 "register_operand" "Upl, Upl") ++ (unspec:SVE_FULL_I ++ [(match_operand:SVE_FULL_I 2 "register_operand" "0, w") ++ (match_operand:SVE_FULL_I 3 "register_operand" "w, w")] ++ MUL_HIGHPART) ++ (match_dup 2)] ++ UNSPEC_SEL))] ++ "TARGET_SVE" ++ "@ ++ \t%0., %1/m, %0., %3. ++ movprfx\t%0, %2\;\t%0., %1/m, %0., %3." ++ [(set_attr "movprfx" "*,yes")]) ++ ++;; Predicated highpart multiplications, merging with zero. ++(define_insn "*cond__z" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand" "=&w, &w") ++ (unspec:SVE_FULL_I ++ [(match_operand: 1 "register_operand" "Upl, Upl") ++ (unspec:SVE_FULL_I ++ [(match_operand:SVE_FULL_I 2 "register_operand" "%0, w") ++ (match_operand:SVE_FULL_I 3 "register_operand" "w, w")] ++ MUL_HIGHPART) ++ (match_operand:SVE_FULL_I 4 "aarch64_simd_imm_zero")] ++ UNSPEC_SEL))] ++ "TARGET_SVE" ++ "@ ++ movprfx\t%0., %1/z, %0.\;\t%0., %1/m, %0., %3. ++ movprfx\t%0., %1/z, %2.\;\t%0., %1/m, %0., %3." ++ [(set_attr "movprfx" "yes")]) ++ ++;; ------------------------------------------------------------------------- ++;; ---- [INT] Division ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - SDIV ++;; - SDIVR ++;; - UDIV ++;; - UDIVR ++;; ------------------------------------------------------------------------- ++ ++;; Unpredicated integer division. ++(define_expand "3" ++ [(set (match_operand:SVE_FULL_SDI 0 "register_operand") ++ (unspec:SVE_FULL_SDI ++ [(match_dup 3) ++ (SVE_INT_BINARY_SD:SVE_FULL_SDI ++ (match_operand:SVE_FULL_SDI 1 "register_operand") ++ (match_operand:SVE_FULL_SDI 2 "register_operand"))] ++ UNSPEC_PRED_X))] ++ "TARGET_SVE" ++ { ++ operands[3] = aarch64_ptrue_reg (mode); ++ } ++) ++ ++;; Integer division predicated with a PTRUE. ++(define_insn "@aarch64_pred_" ++ [(set (match_operand:SVE_FULL_SDI 0 "register_operand" "=w, w, ?&w") ++ (unspec:SVE_FULL_SDI ++ [(match_operand: 1 "register_operand" "Upl, Upl, Upl") ++ (SVE_INT_BINARY_SD:SVE_FULL_SDI ++ (match_operand:SVE_FULL_SDI 2 "register_operand" "0, w, w") ++ (match_operand:SVE_FULL_SDI 3 "register_operand" "w, 0, w"))] ++ UNSPEC_PRED_X))] ++ "TARGET_SVE" ++ "@ ++ \t%0., %1/m, %0., %3. ++ r\t%0., %1/m, %0., %2. ++ movprfx\t%0, %2\;\t%0., %1/m, %0., %3." ++ [(set_attr "movprfx" "*,*,yes")] ++) ++ ++;; Predicated integer division with merging. 
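++;; SDIV and UDIV only exist for .s and .d elements (hence SVE_FULL_SDI).
++;; When the destination is tied to the divisor, the reversed form is used
++;; instead; e.g. sdivr z0.s, p0/m, z0.s, z1.s computes z1.s / z0.s.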
++(define_expand "@cond_" ++ [(set (match_operand:SVE_FULL_SDI 0 "register_operand") ++ (unspec:SVE_FULL_SDI ++ [(match_operand: 1 "register_operand") ++ (SVE_INT_BINARY_SD:SVE_FULL_SDI ++ (match_operand:SVE_FULL_SDI 2 "register_operand") ++ (match_operand:SVE_FULL_SDI 3 "register_operand")) ++ (match_operand:SVE_FULL_SDI 4 "aarch64_simd_reg_or_zero")] ++ UNSPEC_SEL))] ++ "TARGET_SVE" ++) ++ ++;; Predicated integer division, merging with the first input. ++(define_insn "*cond__2" ++ [(set (match_operand:SVE_FULL_SDI 0 "register_operand" "=w, ?&w") ++ (unspec:SVE_FULL_SDI ++ [(match_operand: 1 "register_operand" "Upl, Upl") ++ (SVE_INT_BINARY_SD:SVE_FULL_SDI ++ (match_operand:SVE_FULL_SDI 2 "register_operand" "0, w") ++ (match_operand:SVE_FULL_SDI 3 "register_operand" "w, w")) ++ (match_dup 2)] ++ UNSPEC_SEL))] ++ "TARGET_SVE" ++ "@ ++ \t%0., %1/m, %0., %3. ++ movprfx\t%0, %2\;\t%0., %1/m, %0., %3." ++ [(set_attr "movprfx" "*,yes")] ++) ++ ++;; Predicated integer division, merging with the second input. ++(define_insn "*cond__3" ++ [(set (match_operand:SVE_FULL_SDI 0 "register_operand" "=w, ?&w") ++ (unspec:SVE_FULL_SDI ++ [(match_operand: 1 "register_operand" "Upl, Upl") ++ (SVE_INT_BINARY_SD:SVE_FULL_SDI ++ (match_operand:SVE_FULL_SDI 2 "register_operand" "w, w") ++ (match_operand:SVE_FULL_SDI 3 "register_operand" "0, w")) ++ (match_dup 3)] ++ UNSPEC_SEL))] ++ "TARGET_SVE" ++ "@ ++ \t%0., %1/m, %0., %2. ++ movprfx\t%0, %3\;\t%0., %1/m, %0., %2." ++ [(set_attr "movprfx" "*,yes")] ++) ++ ++;; Predicated integer division, merging with an independent value. ++(define_insn_and_rewrite "*cond__any" ++ [(set (match_operand:SVE_FULL_SDI 0 "register_operand" "=&w, &w, &w, &w, ?&w") ++ (unspec:SVE_FULL_SDI ++ [(match_operand: 1 "register_operand" "Upl, Upl, Upl, Upl, Upl") ++ (SVE_INT_BINARY_SD:SVE_FULL_SDI ++ (match_operand:SVE_FULL_SDI 2 "register_operand" "0, w, w, w, w") ++ (match_operand:SVE_FULL_SDI 3 "register_operand" "w, 0, w, w, w")) ++ (match_operand:SVE_FULL_SDI 4 "aarch64_simd_reg_or_zero" "Dz, Dz, Dz, 0, w")] ++ UNSPEC_SEL))] ++ "TARGET_SVE ++ && !rtx_equal_p (operands[2], operands[4]) ++ && !rtx_equal_p (operands[3], operands[4])" ++ "@ ++ movprfx\t%0., %1/z, %0.\;\t%0., %1/m, %0., %3. ++ movprfx\t%0., %1/z, %0.\;\t%0., %1/m, %0., %2. ++ movprfx\t%0., %1/z, %2.\;\t%0., %1/m, %0., %3. ++ movprfx\t%0., %1/m, %2.\;\t%0., %1/m, %0., %3. ++ #" ++ "&& reload_completed ++ && register_operand (operands[4], mode) ++ && !rtx_equal_p (operands[0], operands[4])" ++ { ++ emit_insn (gen_vcond_mask_ (operands[0], operands[2], ++ operands[4], operands[1])); ++ operands[4] = operands[2] = operands[0]; ++ } ++ [(set_attr "movprfx" "yes")] ++) ++ ++;; ------------------------------------------------------------------------- ++;; ---- [INT] Binary logical operations ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - AND ++;; - EOR ++;; - ORR ++;; ------------------------------------------------------------------------- ++ ++;; Unpredicated integer binary logical operations. ++(define_insn "3" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?w, w") ++ (LOGICAL:SVE_FULL_I ++ (match_operand:SVE_FULL_I 1 "register_operand" "%0, w, w") ++ (match_operand:SVE_FULL_I 2 "aarch64_sve_logical_operand" "vsl, vsl, w")))] ++ "TARGET_SVE" ++ "@ ++ \t%0., %0., #%C2 ++ movprfx\t%0, %1\;\t%0., %0., #%C2 ++ \t%0.d, %1.d, %2.d" ++ [(set_attr "movprfx" "*,yes,*")] ++) ++ ++;; Merging forms are handled through SVE_INT_BINARY. 
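++;; For example, the predicated AND produced through SVE_INT_BINARY is
++;; and z0.s, p0/m, z0.s, z1.s, while the unpredicated register form above
++;; always uses .d, since bitwise operations are independent of the element
++;; size.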
++ ++;; ------------------------------------------------------------------------- ++;; ---- [INT] Binary logical operations (inverted second input) ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - BIC ++;; ------------------------------------------------------------------------- ++ ++;; Unpredicated BIC. ++(define_expand "@aarch64_bic" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand") ++ (and:SVE_FULL_I ++ (unspec:SVE_FULL_I ++ [(match_dup 3) ++ (not:SVE_FULL_I (match_operand:SVE_FULL_I 2 "register_operand"))] ++ UNSPEC_PRED_X) ++ (match_operand:SVE_FULL_I 1 "register_operand")))] ++ "TARGET_SVE" ++ { ++ operands[3] = CONSTM1_RTX (mode); ++ } ++) ++ ++;; Predicated BIC. ++(define_insn_and_rewrite "*bic3" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w") ++ (and:SVE_FULL_I ++ (unspec:SVE_FULL_I ++ [(match_operand 3) ++ (not:SVE_FULL_I ++ (match_operand:SVE_FULL_I 2 "register_operand" "w"))] ++ UNSPEC_PRED_X) ++ (match_operand:SVE_FULL_I 1 "register_operand" "w")))] ++ "TARGET_SVE" ++ "bic\t%0.d, %1.d, %2.d" ++ "&& !CONSTANT_P (operands[3])" ++ { ++ operands[3] = CONSTM1_RTX (mode); ++ } ++) ++ ++;; Predicated BIC with merging. ++(define_expand "@cond_bic" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand") ++ (unspec:SVE_FULL_I ++ [(match_operand: 1 "register_operand") ++ (and:SVE_FULL_I ++ (not:SVE_FULL_I (match_operand:SVE_FULL_I 3 "register_operand")) ++ (match_operand:SVE_FULL_I 2 "register_operand")) ++ (match_operand:SVE_FULL_I 4 "aarch64_simd_reg_or_zero")] ++ UNSPEC_SEL))] ++ "TARGET_SVE" ++) ++ ++;; Predicated integer BIC, merging with the first input. ++(define_insn "*cond_bic_2" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w") ++ (unspec:SVE_FULL_I ++ [(match_operand: 1 "register_operand" "Upl, Upl") ++ (and:SVE_FULL_I ++ (not:SVE_FULL_I ++ (match_operand:SVE_FULL_I 3 "register_operand" "w, w")) ++ (match_operand:SVE_FULL_I 2 "register_operand" "0, w")) ++ (match_dup 2)] ++ UNSPEC_SEL))] ++ "TARGET_SVE" ++ "@ ++ bic\t%0., %1/m, %0., %3. ++ movprfx\t%0, %2\;bic\t%0., %1/m, %0., %3." ++ [(set_attr "movprfx" "*,yes")] ++) ++ ++;; Predicated integer BIC, merging with an independent value. ++(define_insn_and_rewrite "*cond_bic_any" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand" "=&w, &w, &w, ?&w") ++ (unspec:SVE_FULL_I ++ [(match_operand: 1 "register_operand" "Upl, Upl, Upl, Upl") ++ (and:SVE_FULL_I ++ (not:SVE_FULL_I ++ (match_operand:SVE_FULL_I 3 "register_operand" "w, w, w, w")) ++ (match_operand:SVE_FULL_I 2 "register_operand" "0, w, w, w")) ++ (match_operand:SVE_FULL_I 4 "aarch64_simd_reg_or_zero" "Dz, Dz, 0, w")] ++ UNSPEC_SEL))] ++ "TARGET_SVE && !rtx_equal_p (operands[2], operands[4])" ++ "@ ++ movprfx\t%0., %1/z, %0.\;bic\t%0., %1/m, %0., %3. ++ movprfx\t%0., %1/z, %2.\;bic\t%0., %1/m, %0., %3. ++ movprfx\t%0., %1/m, %2.\;bic\t%0., %1/m, %0., %3. 
++ #" ++ "&& reload_completed ++ && register_operand (operands[4], mode) ++ && !rtx_equal_p (operands[0], operands[4])" ++ { ++ emit_insn (gen_vcond_mask_ (operands[0], operands[2], ++ operands[4], operands[1])); ++ operands[4] = operands[2] = operands[0]; ++ } ++ [(set_attr "movprfx" "yes")] ++) ++ ++;; ------------------------------------------------------------------------- ++;; ---- [INT] Shifts (rounding towards -Inf) ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - ASR ++;; - ASRR ++;; - LSL ++;; - LSLR ++;; - LSR ++;; - LSRR ++;; ------------------------------------------------------------------------- ++ ++;; Unpredicated shift by a scalar, which expands into one of the vector ++;; shifts below. ++(define_expand "3" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand") ++ (ASHIFT:SVE_FULL_I ++ (match_operand:SVE_FULL_I 1 "register_operand") ++ (match_operand: 2 "general_operand")))] ++ "TARGET_SVE" ++ { ++ rtx amount; ++ if (CONST_INT_P (operands[2])) ++ { ++ amount = gen_const_vec_duplicate (mode, operands[2]); ++ if (!aarch64_sve_shift_operand (operands[2], mode)) ++ amount = force_reg (mode, amount); ++ } ++ else ++ { ++ amount = gen_reg_rtx (mode); ++ emit_insn (gen_vec_duplicate (amount, ++ convert_to_mode (mode, ++ operands[2], 0))); ++ } ++ emit_insn (gen_v3 (operands[0], operands[1], amount)); ++ DONE; ++ } ++) ++ ++;; Unpredicated shift by a vector. ++(define_expand "v3" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand") ++ (unspec:SVE_FULL_I ++ [(match_dup 3) ++ (ASHIFT:SVE_FULL_I ++ (match_operand:SVE_FULL_I 1 "register_operand") ++ (match_operand:SVE_FULL_I 2 "aarch64_sve_shift_operand"))] ++ UNSPEC_PRED_X))] ++ "TARGET_SVE" ++ { ++ operands[3] = aarch64_ptrue_reg (mode); ++ } ++) ++ ++;; Shift by a vector, predicated with a PTRUE. We don't actually need ++;; the predicate for the first alternative, but using Upa or X isn't ++;; likely to gain much and would make the instruction seem less uniform ++;; to the register allocator. ++(define_insn_and_split "@aarch64_pred_" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, w, w, ?&w") ++ (unspec:SVE_FULL_I ++ [(match_operand: 1 "register_operand" "Upl, Upl, Upl, Upl") ++ (ASHIFT:SVE_FULL_I ++ (match_operand:SVE_FULL_I 2 "register_operand" "w, 0, w, w") ++ (match_operand:SVE_FULL_I 3 "aarch64_sve_shift_operand" "D, w, 0, w"))] ++ UNSPEC_PRED_X))] ++ "TARGET_SVE" ++ "@ ++ # ++ \t%0., %1/m, %0., %3. ++ r\t%0., %1/m, %3., %2. ++ movprfx\t%0, %2\;\t%0., %1/m, %0., %3." ++ "&& reload_completed ++ && !register_operand (operands[3], mode)" ++ [(set (match_dup 0) (ASHIFT:SVE_FULL_I (match_dup 2) (match_dup 3)))] ++ "" ++ [(set_attr "movprfx" "*,*,*,yes")] ++) ++ ++;; Unpredicated shift operations by a constant (post-RA only). ++;; These are generated by splitting a predicated instruction whose ++;; predicate is unused. ++(define_insn "*post_ra_v3" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w") ++ (ASHIFT:SVE_FULL_I ++ (match_operand:SVE_FULL_I 1 "register_operand" "w") ++ (match_operand:SVE_FULL_I 2 "aarch64_simd_shift_imm")))] ++ "TARGET_SVE && reload_completed" ++ "\t%0., %1., #%2" ++) ++ ++;; Predicated integer shift, merging with the first input. 
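++;; Constant shift amounts are kept as vector immediates so that the merged
++;; form can use the immediate encoding, e.g. asr z0.s, p0/m, z0.s, #3.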
++(define_insn "*cond__2_const" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w") ++ (unspec:SVE_FULL_I ++ [(match_operand: 1 "register_operand" "Upl, Upl") ++ (ASHIFT:SVE_FULL_I ++ (match_operand:SVE_FULL_I 2 "register_operand" "0, w") ++ (match_operand:SVE_FULL_I 3 "aarch64_simd_shift_imm")) ++ (match_dup 2)] ++ UNSPEC_SEL))] ++ "TARGET_SVE" ++ "@ ++ \t%0., %1/m, %0., #%3 ++ movprfx\t%0, %2\;\t%0., %1/m, %0., #%3" ++ [(set_attr "movprfx" "*,yes")] ++) ++ ++;; Predicated integer shift, merging with an independent value. ++(define_insn_and_rewrite "*cond__any_const" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, &w, ?&w") ++ (unspec:SVE_FULL_I ++ [(match_operand: 1 "register_operand" "Upl, Upl, Upl") ++ (ASHIFT:SVE_FULL_I ++ (match_operand:SVE_FULL_I 2 "register_operand" "w, w, w") ++ (match_operand:SVE_FULL_I 3 "aarch64_simd_shift_imm")) ++ (match_operand:SVE_FULL_I 4 "aarch64_simd_reg_or_zero" "Dz, 0, w")] ++ UNSPEC_SEL))] ++ "TARGET_SVE && !rtx_equal_p (operands[2], operands[4])" ++ "@ ++ movprfx\t%0., %1/z, %2.\;\t%0., %1/m, %0., #%3 ++ movprfx\t%0., %1/m, %2.\;\t%0., %1/m, %0., #%3 ++ #" ++ "&& reload_completed ++ && register_operand (operands[4], mode) ++ && !rtx_equal_p (operands[0], operands[4])" ++ { ++ emit_insn (gen_vcond_mask_ (operands[0], operands[2], ++ operands[4], operands[1])); ++ operands[4] = operands[2] = operands[0]; ++ } ++ [(set_attr "movprfx" "yes")] ++) ++ ++;; Unpredicated shifts of narrow elements by 64-bit amounts. ++(define_insn "@aarch64_sve_" ++ [(set (match_operand:SVE_FULL_BHSI 0 "register_operand" "=w") ++ (unspec:SVE_FULL_BHSI ++ [(match_operand:SVE_FULL_BHSI 1 "register_operand" "w") ++ (match_operand:VNx2DI 2 "register_operand" "w")] ++ SVE_SHIFT_WIDE))] ++ "TARGET_SVE" ++ "\t%0., %1., %2.d" ++) ++ ++;; Merging predicated shifts of narrow elements by 64-bit amounts. ++(define_expand "@cond_" ++ [(set (match_operand:SVE_FULL_BHSI 0 "register_operand") ++ (unspec:SVE_FULL_BHSI ++ [(match_operand: 1 "register_operand") ++ (unspec:SVE_FULL_BHSI ++ [(match_operand:SVE_FULL_BHSI 2 "register_operand") ++ (match_operand:VNx2DI 3 "register_operand")] ++ SVE_SHIFT_WIDE) ++ (match_operand:SVE_FULL_BHSI 4 "aarch64_simd_reg_or_zero")] ++ UNSPEC_SEL))] ++ "TARGET_SVE" ++) ++ ++;; Predicated shifts of narrow elements by 64-bit amounts, merging with ++;; the first input. ++(define_insn "*cond__m" ++ [(set (match_operand:SVE_FULL_BHSI 0 "register_operand" "=w, ?&w") ++ (unspec:SVE_FULL_BHSI ++ [(match_operand: 1 "register_operand" "Upl, Upl") ++ (unspec:SVE_FULL_BHSI ++ [(match_operand:SVE_FULL_BHSI 2 "register_operand" "0, w") ++ (match_operand:VNx2DI 3 "register_operand" "w, w")] ++ SVE_SHIFT_WIDE) ++ (match_dup 2)] ++ UNSPEC_SEL))] ++ "TARGET_SVE" ++ "@ ++ \t%0., %1/m, %0., %3.d ++ movprfx\t%0, %2\;\t%0., %1/m, %0., %3.d" ++ [(set_attr "movprfx" "*, yes")]) ++ ++;; Predicated shifts of narrow elements by 64-bit amounts, merging with zero. 
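++;; The zeroing form below prefixes the operation with a zeroing MOVPRFX,
++;; e.g. movprfx z0.s, p0/z, z1.s followed by lsl z0.s, p0/m, z0.s, z2.d
++;; (register numbers illustrative).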
++(define_insn "*cond__z" ++ [(set (match_operand:SVE_FULL_BHSI 0 "register_operand" "=&w, &w") ++ (unspec:SVE_FULL_BHSI ++ [(match_operand: 1 "register_operand" "Upl, Upl") ++ (unspec:SVE_FULL_BHSI ++ [(match_operand:SVE_FULL_BHSI 2 "register_operand" "0, w") ++ (match_operand:VNx2DI 3 "register_operand" "w, w")] ++ SVE_SHIFT_WIDE) ++ (match_operand:SVE_FULL_BHSI 4 "aarch64_simd_imm_zero")] ++ UNSPEC_SEL))] ++ "TARGET_SVE" ++ "@ ++ movprfx\t%0., %1/z, %0.\;\t%0., %1/m, %0., %3.d ++ movprfx\t%0., %1/z, %2.\;\t%0., %1/m, %0., %3.d" ++ [(set_attr "movprfx" "yes")]) ++ ++;; ------------------------------------------------------------------------- ++;; ---- [INT] Shifts (rounding towards 0) ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - ASRD ++;; ------------------------------------------------------------------------- ++ ++;; Unpredicated ASRD. ++(define_expand "sdiv_pow23" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand") ++ (unspec:SVE_FULL_I ++ [(match_dup 3) ++ (unspec:SVE_FULL_I ++ [(match_operand:SVE_FULL_I 1 "register_operand") ++ (match_operand 2 "aarch64_simd_rshift_imm")] ++ UNSPEC_ASRD) ++ (match_dup 1)] ++ UNSPEC_SEL))] ++ "TARGET_SVE" ++ { ++ operands[3] = aarch64_ptrue_reg (mode); ++ } ++) ++ ++;; Predicated ASRD with merging. ++(define_expand "@cond_asrd" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand") ++ (unspec:SVE_FULL_I ++ [(match_operand: 1 "register_operand") ++ (unspec:SVE_FULL_I ++ [(match_operand:SVE_FULL_I 2 "register_operand") ++ (match_operand:SVE_FULL_I 3 "aarch64_simd_rshift_imm")] ++ UNSPEC_ASRD) ++ (match_operand:SVE_FULL_I 4 "aarch64_simd_reg_or_zero")] ++ UNSPEC_SEL))] ++ "TARGET_SVE" ++) ++ ++;; Predicated ASRD, merging with the first input. ++(define_insn "*cond_asrd_2" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w") ++ (unspec:SVE_FULL_I ++ [(match_operand: 1 "register_operand" "Upl, Upl") ++ (unspec:SVE_FULL_I ++ [(match_operand:SVE_FULL_I 2 "register_operand" "0, w") ++ (match_operand:SVE_FULL_I 3 "aarch64_simd_rshift_imm")] ++ UNSPEC_ASRD) ++ (match_dup 2)] ++ UNSPEC_SEL))] ++ "TARGET_SVE" ++ "@ ++ asrd\t%0., %1/m, %0., #%3 ++ movprfx\t%0, %2\;asrd\t%0., %1/m, %0., #%3" ++ [(set_attr "movprfx" "*,yes")]) ++ ++;; Predicated ASRD, merging with zero. ++(define_insn "*cond_asrd_z" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w") ++ (unspec:SVE_FULL_I ++ [(match_operand: 1 "register_operand" "Upl") ++ (unspec:SVE_FULL_I ++ [(match_operand:SVE_FULL_I 2 "register_operand" "w") ++ (match_operand:SVE_FULL_I 3 "aarch64_simd_rshift_imm")] ++ UNSPEC_ASRD) ++ (match_operand:SVE_FULL_I 4 "aarch64_simd_imm_zero")] ++ UNSPEC_SEL))] ++ "TARGET_SVE" ++ "movprfx\t%0., %1/z, %2.\;asrd\t%0., %1/m, %0., #%3" ++ [(set_attr "movprfx" "yes")]) ++ ++;; ------------------------------------------------------------------------- ++;; ---- [FP<-INT] General binary arithmetic corresponding to unspecs ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - FSCALE ++;; - FTSMUL ++;; - FTSSEL ++;; ------------------------------------------------------------------------- ++ ++;; Unpredicated floating-point binary operations that take an integer as ++;; their second operand. 
++(define_insn "@aarch64_sve_" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w") ++ (unspec:SVE_FULL_F ++ [(match_operand:SVE_FULL_F 1 "register_operand" "w") ++ (match_operand: 2 "register_operand" "w")] ++ SVE_FP_BINARY_INT))] ++ "TARGET_SVE" ++ "\t%0., %1., %2." ++) ++ ++;; Predicated floating-point binary operations that take an integer ++;; as their second operand. ++(define_insn "@aarch64_pred_" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") ++ (unspec:SVE_FULL_F ++ [(match_operand: 1 "register_operand" "Upl, Upl") ++ (match_operand:SI 4 "aarch64_sve_gp_strictness") ++ (match_operand:SVE_FULL_F 2 "register_operand" "0, w") ++ (match_operand: 3 "register_operand" "w, w")] ++ SVE_COND_FP_BINARY_INT))] ++ "TARGET_SVE" ++ "@ ++ \t%0., %1/m, %0., %3. ++ movprfx\t%0, %2\;\t%0., %1/m, %0., %3." ++ [(set_attr "movprfx" "*,yes")] ++) ++ ++;; Predicated floating-point binary operations with merging, taking an ++;; integer as their second operand. ++(define_expand "@cond_" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand") ++ (unspec:SVE_FULL_F ++ [(match_operand: 1 "register_operand") ++ (unspec:SVE_FULL_F ++ [(match_dup 1) ++ (const_int SVE_STRICT_GP) ++ (match_operand:SVE_FULL_F 2 "register_operand") ++ (match_operand: 3 "register_operand")] ++ SVE_COND_FP_BINARY_INT) ++ (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")] ++ UNSPEC_SEL))] ++ "TARGET_SVE" ++) ++ ++;; Predicated floating-point binary operations that take an integer as their ++;; second operand, with inactive lanes coming from the first operand. ++(define_insn_and_rewrite "*cond__2" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") ++ (unspec:SVE_FULL_F ++ [(match_operand: 1 "register_operand" "Upl, Upl") ++ (unspec:SVE_FULL_F ++ [(match_operand 4) ++ (match_operand:SI 5 "aarch64_sve_gp_strictness") ++ (match_operand:SVE_FULL_F 2 "register_operand" "0, w") ++ (match_operand: 3 "register_operand" "w, w")] ++ SVE_COND_FP_BINARY_INT) ++ (match_dup 2)] ++ UNSPEC_SEL))] ++ "TARGET_SVE && aarch64_sve_pred_dominates_p (&operands[4], operands[1])" ++ "@ ++ \t%0., %1/m, %0., %3. ++ movprfx\t%0, %2\;\t%0., %1/m, %0., %3." ++ "&& !rtx_equal_p (operands[1], operands[4])" ++ { ++ operands[4] = copy_rtx (operands[1]); ++ } ++ [(set_attr "movprfx" "*,yes")] ++) ++ ++;; Predicated floating-point binary operations that take an integer as ++;; their second operand, with the values of inactive lanes being distinct ++;; from the other inputs. ++(define_insn_and_rewrite "*cond__any" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand" "=&w, &w, &w, ?&w") ++ (unspec:SVE_FULL_F ++ [(match_operand: 1 "register_operand" "Upl, Upl, Upl, Upl") ++ (unspec:SVE_FULL_F ++ [(match_operand 5) ++ (match_operand:SI 6 "aarch64_sve_gp_strictness") ++ (match_operand:SVE_FULL_F 2 "register_operand" "0, w, w, w") ++ (match_operand: 3 "register_operand" "w, w, w, w")] ++ SVE_COND_FP_BINARY_INT) ++ (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero" "Dz, Dz, 0, w")] ++ UNSPEC_SEL))] ++ "TARGET_SVE ++ && !rtx_equal_p (operands[2], operands[4]) ++ && aarch64_sve_pred_dominates_p (&operands[5], operands[1])" ++ "@ ++ movprfx\t%0., %1/z, %2.\;\t%0., %1/m, %0., %3. ++ movprfx\t%0., %1/z, %2.\;\t%0., %1/m, %0., %3. ++ movprfx\t%0., %1/m, %2.\;\t%0., %1/m, %0., %3. 
++ #" ++ "&& 1" ++ { ++ if (reload_completed ++ && register_operand (operands[4], mode) ++ && !rtx_equal_p (operands[0], operands[4])) ++ { ++ emit_insn (gen_vcond_mask_ (operands[0], operands[2], ++ operands[4], operands[1])); ++ operands[4] = operands[2] = operands[0]; ++ } ++ else if (!rtx_equal_p (operands[1], operands[5])) ++ operands[5] = copy_rtx (operands[1]); ++ else ++ FAIL; ++ } ++ [(set_attr "movprfx" "yes")] ++) ++ ++;; ------------------------------------------------------------------------- ++;; ---- [FP] General binary arithmetic corresponding to rtx codes ++;; ------------------------------------------------------------------------- ++;; Includes post-RA forms of: ++;; - FADD ++;; - FMUL ++;; - FSUB ++;; ------------------------------------------------------------------------- ++ ++;; Unpredicated floating-point binary operations (post-RA only). ++;; These are generated by splitting a predicated instruction whose ++;; predicate is unused. ++(define_insn "*post_ra_3" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w") ++ (SVE_UNPRED_FP_BINARY:SVE_FULL_F ++ (match_operand:SVE_FULL_F 1 "register_operand" "w") ++ (match_operand:SVE_FULL_F 2 "register_operand" "w")))] ++ "TARGET_SVE && reload_completed" ++ "\t%0., %1., %2.") ++ ++;; ------------------------------------------------------------------------- ++;; ---- [FP] General binary arithmetic corresponding to unspecs ++;; ------------------------------------------------------------------------- ++;; Includes merging forms of: ++;; - FADD (constant forms handled in the "Addition" section) ++;; - FDIV ++;; - FDIVR ++;; - FMAX ++;; - FMAXNM (including #0.0 and #1.0) ++;; - FMIN ++;; - FMINNM (including #0.0 and #1.0) ++;; - FMUL (including #0.5 and #2.0) ++;; - FMULX ++;; - FRECPS ++;; - FRSQRTS ++;; - FSUB (constant forms handled in the "Addition" section) ++;; - FSUBR (constant forms handled in the "Subtraction" section) ++;; ------------------------------------------------------------------------- ++ ++;; Unpredicated floating-point binary operations. ++(define_insn "@aarch64_sve_" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w") ++ (unspec:SVE_FULL_F ++ [(match_operand:SVE_FULL_F 1 "register_operand" "w") ++ (match_operand:SVE_FULL_F 2 "register_operand" "w")] ++ SVE_FP_BINARY))] ++ "TARGET_SVE" ++ "\t%0., %1., %2." ++) ++ ++;; Unpredicated floating-point binary operations that need to be predicated ++;; for SVE. ++(define_expand "3" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand") ++ (unspec:SVE_FULL_F ++ [(match_dup 3) ++ (const_int SVE_RELAXED_GP) ++ (match_operand:SVE_FULL_F 1 "") ++ (match_operand:SVE_FULL_F 2 "")] ++ SVE_COND_FP_BINARY))] ++ "TARGET_SVE" ++ { ++ operands[3] = aarch64_ptrue_reg (mode); ++ } ++) ++ ++;; Predicated floating-point binary operations that have no immediate forms. ++(define_insn "@aarch64_pred_" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, w, ?&w") ++ (unspec:SVE_FULL_F ++ [(match_operand: 1 "register_operand" "Upl, Upl, Upl") ++ (match_operand:SI 4 "aarch64_sve_gp_strictness") ++ (match_operand:SVE_FULL_F 2 "register_operand" "0, w, w") ++ (match_operand:SVE_FULL_F 3 "register_operand" "w, 0, w")] ++ SVE_COND_FP_BINARY_REG))] ++ "TARGET_SVE" ++ "@ ++ \t%0., %1/m, %0., %3. ++ \t%0., %1/m, %0., %2. ++ movprfx\t%0, %2\;\t%0., %1/m, %0., %3." ++ [(set_attr "movprfx" "*,*,yes")] ++) ++ ++;; Predicated floating-point operations with merging. 
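++;; Merging with the first input gives a single instruction, for example
++;; fadd z0.s, p0/m, z0.s, z1.s; the *_const patterns below additionally
++;; accept the immediate forms, e.g. fmaxnm z0.s, p0/m, z0.s, #1.0.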
++(define_expand "@cond_" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand") ++ (unspec:SVE_FULL_F ++ [(match_operand: 1 "register_operand") ++ (unspec:SVE_FULL_F ++ [(match_dup 1) ++ (const_int SVE_STRICT_GP) ++ (match_operand:SVE_FULL_F 2 "") ++ (match_operand:SVE_FULL_F 3 "")] ++ SVE_COND_FP_BINARY) ++ (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")] ++ UNSPEC_SEL))] ++ "TARGET_SVE" ++) ++ ++;; Predicated floating-point operations, merging with the first input. ++(define_insn_and_rewrite "*cond__2" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") ++ (unspec:SVE_FULL_F ++ [(match_operand: 1 "register_operand" "Upl, Upl") ++ (unspec:SVE_FULL_F ++ [(match_operand 4) ++ (match_operand:SI 5 "aarch64_sve_gp_strictness") ++ (match_operand:SVE_FULL_F 2 "register_operand" "0, w") ++ (match_operand:SVE_FULL_F 3 "register_operand" "w, w")] ++ SVE_COND_FP_BINARY) ++ (match_dup 2)] ++ UNSPEC_SEL))] ++ "TARGET_SVE && aarch64_sve_pred_dominates_p (&operands[4], operands[1])" ++ "@ ++ \t%0., %1/m, %0., %3. ++ movprfx\t%0, %2\;\t%0., %1/m, %0., %3." ++ "&& !rtx_equal_p (operands[1], operands[4])" ++ { ++ operands[4] = copy_rtx (operands[1]); ++ } ++ [(set_attr "movprfx" "*,yes")] ++) ++ ++;; Same for operations that take a 1-bit constant. ++(define_insn_and_rewrite "*cond__2_const" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?w") ++ (unspec:SVE_FULL_F ++ [(match_operand: 1 "register_operand" "Upl, Upl") ++ (unspec:SVE_FULL_F ++ [(match_operand 4) ++ (match_operand:SI 5 "aarch64_sve_gp_strictness") ++ (match_operand:SVE_FULL_F 2 "register_operand" "0, w") ++ (match_operand:SVE_FULL_F 3 "")] ++ SVE_COND_FP_BINARY_I1) ++ (match_dup 2)] ++ UNSPEC_SEL))] ++ "TARGET_SVE && aarch64_sve_pred_dominates_p (&operands[4], operands[1])" ++ "@ ++ \t%0., %1/m, %0., #%3 ++ movprfx\t%0, %2\;\t%0., %1/m, %0., #%3" ++ "&& !rtx_equal_p (operands[1], operands[4])" ++ { ++ operands[4] = copy_rtx (operands[1]); ++ } ++ [(set_attr "movprfx" "*,yes")] ++) ++ ++;; Predicated floating-point operations, merging with the second input. ++(define_insn_and_rewrite "*cond__3" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") ++ (unspec:SVE_FULL_F ++ [(match_operand: 1 "register_operand" "Upl, Upl") ++ (unspec:SVE_FULL_F ++ [(match_operand 4) ++ (match_operand:SI 5 "aarch64_sve_gp_strictness") ++ (match_operand:SVE_FULL_F 2 "register_operand" "w, w") ++ (match_operand:SVE_FULL_F 3 "register_operand" "0, w")] ++ SVE_COND_FP_BINARY) ++ (match_dup 3)] ++ UNSPEC_SEL))] ++ "TARGET_SVE && aarch64_sve_pred_dominates_p (&operands[4], operands[1])" ++ "@ ++ \t%0., %1/m, %0., %2. ++ movprfx\t%0, %3\;\t%0., %1/m, %0., %2." ++ "&& !rtx_equal_p (operands[1], operands[4])" ++ { ++ operands[4] = copy_rtx (operands[1]); ++ } ++ [(set_attr "movprfx" "*,yes")] ++) ++ ++;; Predicated floating-point operations, merging with an independent value. 
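++;; If the fallback value is a separate register, the pattern below is first
++;; matched as "#" and later rewritten into a SEL of the fallback followed by
++;; a merging operation, e.g. sel z0.s, p0, z1.s, z3.s and then
++;; fadd z0.s, p0/m, z0.s, z2.s (register numbers illustrative).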
++(define_insn_and_rewrite "*cond__any" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand" "=&w, &w, &w, &w, ?&w") ++ (unspec:SVE_FULL_F ++ [(match_operand: 1 "register_operand" "Upl, Upl, Upl, Upl, Upl") ++ (unspec:SVE_FULL_F ++ [(match_operand 5) ++ (match_operand:SI 6 "aarch64_sve_gp_strictness") ++ (match_operand:SVE_FULL_F 2 "register_operand" "0, w, w, w, w") ++ (match_operand:SVE_FULL_F 3 "register_operand" "w, 0, w, w, w")] ++ SVE_COND_FP_BINARY) ++ (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero" "Dz, Dz, Dz, 0, w")] ++ UNSPEC_SEL))] ++ "TARGET_SVE ++ && !rtx_equal_p (operands[2], operands[4]) ++ && !rtx_equal_p (operands[3], operands[4]) ++ && aarch64_sve_pred_dominates_p (&operands[5], operands[1])" ++ "@ ++ movprfx\t%0., %1/z, %0.\;\t%0., %1/m, %0., %3. ++ movprfx\t%0., %1/z, %0.\;\t%0., %1/m, %0., %2. ++ movprfx\t%0., %1/z, %2.\;\t%0., %1/m, %0., %3. ++ movprfx\t%0., %1/m, %2.\;\t%0., %1/m, %0., %3. ++ #" ++ "&& 1" ++ { ++ if (reload_completed ++ && register_operand (operands[4], mode) ++ && !rtx_equal_p (operands[0], operands[4])) ++ { ++ emit_insn (gen_vcond_mask_ (operands[0], operands[2], ++ operands[4], operands[1])); ++ operands[4] = operands[2] = operands[0]; ++ } ++ else if (!rtx_equal_p (operands[1], operands[5])) ++ operands[5] = copy_rtx (operands[1]); ++ else ++ FAIL; ++ } ++ [(set_attr "movprfx" "yes")] ++) ++ ++;; Same for operations that take a 1-bit constant. ++(define_insn_and_rewrite "*cond__any_const" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, w, ?w") ++ (unspec:SVE_FULL_F ++ [(match_operand: 1 "register_operand" "Upl, Upl, Upl") ++ (unspec:SVE_FULL_F ++ [(match_operand 5) ++ (match_operand:SI 6 "aarch64_sve_gp_strictness") ++ (match_operand:SVE_FULL_F 2 "register_operand" "w, w, w") ++ (match_operand:SVE_FULL_F 3 "")] ++ SVE_COND_FP_BINARY_I1) ++ (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero" "Dz, 0, w")] ++ UNSPEC_SEL))] ++ "TARGET_SVE ++ && !rtx_equal_p (operands[2], operands[4]) ++ && aarch64_sve_pred_dominates_p (&operands[5], operands[1])" ++ "@ ++ movprfx\t%0., %1/z, %2.\;\t%0., %1/m, %0., #%3 ++ movprfx\t%0., %1/m, %2.\;\t%0., %1/m, %0., #%3 ++ #" ++ "&& 1" ++ { ++ if (reload_completed ++ && register_operand (operands[4], mode) ++ && !rtx_equal_p (operands[0], operands[4])) ++ { ++ emit_insn (gen_vcond_mask_ (operands[0], operands[2], ++ operands[4], operands[1])); ++ operands[4] = operands[2] = operands[0]; ++ } ++ else if (!rtx_equal_p (operands[1], operands[5])) ++ operands[5] = copy_rtx (operands[1]); ++ else ++ FAIL; ++ } ++ [(set_attr "movprfx" "yes")] ++) ++ ++;; ------------------------------------------------------------------------- ++;; ---- [FP] Addition ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - FADD ++;; - FSUB ++;; ------------------------------------------------------------------------- ++ ++;; Predicated floating-point addition. ++(define_insn_and_split "@aarch64_pred_" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, w, w, w, ?&w, ?&w, ?&w") ++ (unspec:SVE_FULL_F ++ [(match_operand: 1 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl, Upl") ++ (match_operand:SI 4 "aarch64_sve_gp_strictness" "i, i, Z, Ui1, i, i, Ui1") ++ (match_operand:SVE_FULL_F 2 "register_operand" "%0, 0, w, 0, w, w, w") ++ (match_operand:SVE_FULL_F 3 "aarch64_sve_float_arith_with_sub_operand" "vsA, vsN, w, w, vsA, vsN, w")] ++ SVE_COND_FP_ADD))] ++ "TARGET_SVE" ++ "@ ++ fadd\t%0., %1/m, %0., #%3 ++ fsub\t%0., %1/m, %0., #%N3 ++ # ++ fadd\t%0., %1/m, %0., %3. 
++ movprfx\t%0, %2\;fadd\t%0., %1/m, %0., #%3 ++ movprfx\t%0, %2\;fsub\t%0., %1/m, %0., #%N3 ++ movprfx\t%0, %2\;fadd\t%0., %1/m, %0., %3." ++ ; Split the unpredicated form after reload, so that we don't have ++ ; the unnecessary PTRUE. ++ "&& reload_completed ++ && register_operand (operands[3], mode) ++ && INTVAL (operands[4]) == SVE_RELAXED_GP" ++ [(set (match_dup 0) (plus:SVE_FULL_F (match_dup 2) (match_dup 3)))] ++ "" ++ [(set_attr "movprfx" "*,*,*,*,yes,yes,yes")] ++) ++ ++;; Predicated floating-point addition of a constant, merging with the ++;; first input. ++(define_insn_and_rewrite "*cond_add_2_const" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, w, ?w, ?w") ++ (unspec:SVE_FULL_F ++ [(match_operand: 1 "register_operand" "Upl, Upl, Upl, Upl") ++ (unspec:SVE_FULL_F ++ [(match_operand 4) ++ (match_operand:SI 5 "aarch64_sve_gp_strictness") ++ (match_operand:SVE_FULL_F 2 "register_operand" "0, 0, w, w") ++ (match_operand:SVE_FULL_F 3 "aarch64_sve_float_arith_with_sub_immediate" "vsA, vsN, vsA, vsN")] ++ UNSPEC_COND_FADD) ++ (match_dup 2)] ++ UNSPEC_SEL))] ++ "TARGET_SVE && aarch64_sve_pred_dominates_p (&operands[4], operands[1])" ++ "@ ++ fadd\t%0., %1/m, %0., #%3 ++ fsub\t%0., %1/m, %0., #%N3 ++ movprfx\t%0, %2\;fadd\t%0., %1/m, %0., #%3 ++ movprfx\t%0, %2\;fsub\t%0., %1/m, %0., #%N3" ++ "&& !rtx_equal_p (operands[1], operands[4])" ++ { ++ operands[4] = copy_rtx (operands[1]); ++ } ++ [(set_attr "movprfx" "*,*,yes,yes")] ++) ++ ++;; Predicated floating-point addition of a constant, merging with an ++;; independent value. ++(define_insn_and_rewrite "*cond_add_any_const" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, w, w, w, ?w, ?w") ++ (unspec:SVE_FULL_F ++ [(match_operand: 1 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl") ++ (unspec:SVE_FULL_F ++ [(match_operand 5) ++ (match_operand:SI 6 "aarch64_sve_gp_strictness") ++ (match_operand:SVE_FULL_F 2 "register_operand" "w, w, w, w, w, w") ++ (match_operand:SVE_FULL_F 3 "aarch64_sve_float_arith_with_sub_immediate" "vsA, vsN, vsA, vsN, vsA, vsN")] ++ UNSPEC_COND_FADD) ++ (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero" "Dz, Dz, 0, 0, w, w")] ++ UNSPEC_SEL))] ++ "TARGET_SVE ++ && !rtx_equal_p (operands[2], operands[4]) ++ && aarch64_sve_pred_dominates_p (&operands[5], operands[1])" ++ "@ ++ movprfx\t%0., %1/z, %2.\;fadd\t%0., %1/m, %0., #%3 ++ movprfx\t%0., %1/z, %2.\;fsub\t%0., %1/m, %0., #%N3 ++ movprfx\t%0., %1/m, %2.\;fadd\t%0., %1/m, %0., #%3 ++ movprfx\t%0., %1/m, %2.\;fsub\t%0., %1/m, %0., #%N3 ++ # ++ #" ++ "&& 1" ++ { ++ if (reload_completed ++ && register_operand (operands[4], mode) ++ && !rtx_equal_p (operands[0], operands[4])) ++ { ++ emit_insn (gen_vcond_mask_ (operands[0], operands[2], ++ operands[4], operands[1])); ++ operands[4] = operands[2] = operands[0]; ++ } ++ else if (!rtx_equal_p (operands[1], operands[5])) ++ operands[5] = copy_rtx (operands[1]); ++ else ++ FAIL; ++ } ++ [(set_attr "movprfx" "yes")] ++) ++ ++;; Register merging forms are handled through SVE_COND_FP_BINARY. ++ ++;; ------------------------------------------------------------------------- ++;; ---- [FP] Complex addition ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - FCADD ++;; ------------------------------------------------------------------------- ++ ++;; Predicated FCADD. 
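The FCADD patterns that follow treat each pair of adjacent lanes as one complex number and add the second operand rotated by 90 or 270 degrees. A rough scalar sketch, assuming the usual Arm definition of the two rotations (the struct and function names are invented for illustration):

/* One-element model of FCADD #90 and FCADD #270.  */
struct cplx { float re, im; };

static struct cplx
fcadd_one (struct cplx a, struct cplx b, int rot)
{
  struct cplx r;
  if (rot == 90)
    {
      r.re = a.re - b.im;
      r.im = a.im + b.re;
    }
  else /* rot == 270 */
    {
      r.re = a.re + b.im;
      r.im = a.im - b.re;
    }
  return r;
}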
++(define_insn "@aarch64_pred_" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") ++ (unspec:SVE_FULL_F ++ [(match_operand: 1 "register_operand" "Upl, Upl") ++ (match_operand:SI 4 "aarch64_sve_gp_strictness") ++ (match_operand:SVE_FULL_F 2 "register_operand" "0, w") ++ (match_operand:SVE_FULL_F 3 "register_operand" "w, w")] ++ SVE_COND_FCADD))] ++ "TARGET_SVE" ++ "@ ++ fcadd\t%0., %1/m, %0., %3., # ++ movprfx\t%0, %2\;fcadd\t%0., %1/m, %0., %3., #" ++ [(set_attr "movprfx" "*,yes")] ++) ++ ++;; Predicated FCADD with merging. ++(define_expand "@cond_" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand") ++ (unspec:SVE_FULL_F ++ [(match_operand: 1 "register_operand") ++ (unspec:SVE_FULL_F ++ [(match_dup 1) ++ (const_int SVE_STRICT_GP) ++ (match_operand:SVE_FULL_F 2 "register_operand") ++ (match_operand:SVE_FULL_F 3 "register_operand")] ++ SVE_COND_FCADD) ++ (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")] ++ UNSPEC_SEL))] ++ "TARGET_SVE" ++) ++ ++;; Predicated FCADD, merging with the first input. ++(define_insn_and_rewrite "*cond__2" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") ++ (unspec:SVE_FULL_F ++ [(match_operand: 1 "register_operand" "Upl, Upl") ++ (unspec:SVE_FULL_F ++ [(match_operand 4) ++ (match_operand:SI 5 "aarch64_sve_gp_strictness") ++ (match_operand:SVE_FULL_F 2 "register_operand" "0, w") ++ (match_operand:SVE_FULL_F 3 "register_operand" "w, w")] ++ SVE_COND_FCADD) ++ (match_dup 2)] ++ UNSPEC_SEL))] ++ "TARGET_SVE && aarch64_sve_pred_dominates_p (&operands[4], operands[1])" ++ "@ ++ fcadd\t%0., %1/m, %0., %3., # ++ movprfx\t%0, %2\;fcadd\t%0., %1/m, %0., %3., #" ++ "&& !rtx_equal_p (operands[1], operands[4])" ++ { ++ operands[4] = copy_rtx (operands[1]); ++ } ++ [(set_attr "movprfx" "*,yes")] ++) ++ ++;; Predicated FCADD, merging with an independent value. ++(define_insn_and_rewrite "*cond__any" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand" "=&w, &w, &w, ?&w") ++ (unspec:SVE_FULL_F ++ [(match_operand: 1 "register_operand" "Upl, Upl, Upl, Upl") ++ (unspec:SVE_FULL_F ++ [(match_operand 5) ++ (match_operand:SI 6 "aarch64_sve_gp_strictness") ++ (match_operand:SVE_FULL_F 2 "register_operand" "w, 0, w, w") ++ (match_operand:SVE_FULL_F 3 "register_operand" "w, w, w, w")] ++ SVE_COND_FCADD) ++ (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero" "Dz, Dz, 0, w")] ++ UNSPEC_SEL))] ++ "TARGET_SVE ++ && !rtx_equal_p (operands[2], operands[4]) ++ && aarch64_sve_pred_dominates_p (&operands[5], operands[1])" ++ "@ ++ movprfx\t%0., %1/z, %2.\;fcadd\t%0., %1/m, %0., %3., # ++ movprfx\t%0., %1/z, %0.\;fcadd\t%0., %1/m, %0., %3., # ++ movprfx\t%0., %1/m, %2.\;fcadd\t%0., %1/m, %0., %3., # ++ #" ++ "&& 1" ++ { ++ if (reload_completed ++ && register_operand (operands[4], mode) ++ && !rtx_equal_p (operands[0], operands[4])) ++ { ++ emit_insn (gen_vcond_mask_ (operands[0], operands[2], ++ operands[4], operands[1])); ++ operands[4] = operands[2] = operands[0]; ++ } ++ else if (!rtx_equal_p (operands[1], operands[5])) ++ operands[5] = copy_rtx (operands[1]); ++ else ++ FAIL; ++ } ++ [(set_attr "movprfx" "yes")] ++) ++ ++;; ------------------------------------------------------------------------- ++;; ---- [FP] Subtraction ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - FSUB ++;; - FSUBR ++;; ------------------------------------------------------------------------- ++ ++;; Predicated floating-point subtraction. 
++(define_insn_and_split "@aarch64_pred_" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, w, w, w, ?&w, ?&w") ++ (unspec:SVE_FULL_F ++ [(match_operand: 1 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl") ++ (match_operand:SI 4 "aarch64_sve_gp_strictness" "i, Z, Ui1, Ui1, i, Ui1") ++ (match_operand:SVE_FULL_F 2 "aarch64_sve_float_arith_operand" "vsA, w, 0, w, vsA, w") ++ (match_operand:SVE_FULL_F 3 "register_operand" "0, w, w, 0, w, w")] ++ SVE_COND_FP_SUB))] ++ "TARGET_SVE" ++ "@ ++ fsubr\t%0., %1/m, %0., #%2 ++ # ++ fsub\t%0., %1/m, %0., %3. ++ fsubr\t%0., %1/m, %0., %2. ++ movprfx\t%0, %3\;fsubr\t%0., %1/m, %0., #%2 ++ movprfx\t%0, %2\;fsub\t%0., %1/m, %0., %3." ++ ; Split the unpredicated form after reload, so that we don't have ++ ; the unnecessary PTRUE. ++ "&& reload_completed ++ && register_operand (operands[2], mode) ++ && INTVAL (operands[4]) == SVE_RELAXED_GP" ++ [(set (match_dup 0) (minus:SVE_FULL_F (match_dup 2) (match_dup 3)))] ++ "" ++ [(set_attr "movprfx" "*,*,*,*,yes,yes")] ++) ++ ++;; Predicated floating-point subtraction from a constant, merging with the ++;; second input. ++(define_insn_and_rewrite "*cond_sub_3_const" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?w") ++ (unspec:SVE_FULL_F ++ [(match_operand: 1 "register_operand" "Upl, Upl") ++ (unspec:SVE_FULL_F ++ [(match_operand 4) ++ (match_operand:SI 5 "aarch64_sve_gp_strictness") ++ (match_operand:SVE_FULL_F 2 "aarch64_sve_float_arith_immediate") ++ (match_operand:SVE_FULL_F 3 "register_operand" "0, w")] ++ UNSPEC_COND_FSUB) ++ (match_dup 3)] ++ UNSPEC_SEL))] ++ "TARGET_SVE && aarch64_sve_pred_dominates_p (&operands[4], operands[1])" ++ "@ ++ fsubr\t%0., %1/m, %0., #%2 ++ movprfx\t%0, %3\;fsubr\t%0., %1/m, %0., #%2" ++ "&& !rtx_equal_p (operands[1], operands[4])" ++ { ++ operands[4] = copy_rtx (operands[1]); ++ } ++ [(set_attr "movprfx" "*,yes")] ++) ++ ++;; Predicated floating-point subtraction from a constant, merging with an ++;; independent value. ++(define_insn_and_rewrite "*cond_sub_any_const" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, w, ?w") ++ (unspec:SVE_FULL_F ++ [(match_operand: 1 "register_operand" "Upl, Upl, Upl") ++ (unspec:SVE_FULL_F ++ [(match_operand 5) ++ (match_operand:SI 6 "aarch64_sve_gp_strictness") ++ (match_operand:SVE_FULL_F 2 "aarch64_sve_float_arith_immediate") ++ (match_operand:SVE_FULL_F 3 "register_operand" "w, w, w")] ++ UNSPEC_COND_FSUB) ++ (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero" "Dz, 0, w")] ++ UNSPEC_SEL))] ++ "TARGET_SVE ++ && !rtx_equal_p (operands[3], operands[4]) ++ && aarch64_sve_pred_dominates_p (&operands[5], operands[1])" ++ "@ ++ movprfx\t%0., %1/z, %3.\;fsubr\t%0., %1/m, %0., #%2 ++ movprfx\t%0., %1/m, %3.\;fsubr\t%0., %1/m, %0., #%2 ++ #" ++ "&& 1" + { +- operands[2] = force_reg (mode, CONSTM1_RTX (mode)); ++ if (reload_completed ++ && register_operand (operands[4], mode) ++ && !rtx_equal_p (operands[0], operands[4])) ++ { ++ emit_insn (gen_vcond_mask_ (operands[0], operands[3], ++ operands[4], operands[1])); ++ operands[4] = operands[3] = operands[0]; ++ } ++ else if (!rtx_equal_p (operands[1], operands[5])) ++ operands[5] = copy_rtx (operands[1]); ++ else ++ FAIL; + } ++ [(set_attr "movprfx" "yes")] + ) + +-;; Predicated ST[234]. 
+-(define_insn "vec_mask_store_lanes" +- [(set (match_operand:SVE_STRUCT 0 "memory_operand" "+m") +- (unspec:SVE_STRUCT +- [(match_operand: 2 "register_operand" "Upl") +- (match_operand:SVE_STRUCT 1 "register_operand" "w") +- (match_dup 0)] +- UNSPEC_STN))] ++;; Register merging forms are handled through SVE_COND_FP_BINARY. ++ ++;; ------------------------------------------------------------------------- ++;; ---- [FP] Absolute difference ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - FABD ++;; ------------------------------------------------------------------------- ++ ++;; Predicated floating-point absolute difference. ++(define_expand "@aarch64_pred_abd" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand") ++ (unspec:SVE_FULL_F ++ [(match_operand: 1 "register_operand") ++ (match_operand:SI 4 "aarch64_sve_gp_strictness") ++ (unspec:SVE_FULL_F ++ [(match_dup 1) ++ (match_dup 4) ++ (match_operand:SVE_FULL_F 2 "register_operand") ++ (match_operand:SVE_FULL_F 3 "register_operand")] ++ UNSPEC_COND_FSUB)] ++ UNSPEC_COND_FABS))] + "TARGET_SVE" +- "st\t%1, %2, %0" + ) + +-(define_expand "vec_perm" +- [(match_operand:SVE_ALL 0 "register_operand") +- (match_operand:SVE_ALL 1 "register_operand") +- (match_operand:SVE_ALL 2 "register_operand") +- (match_operand: 3 "aarch64_sve_vec_perm_operand")] +- "TARGET_SVE && GET_MODE_NUNITS (mode).is_constant ()" ++;; Predicated floating-point absolute difference. ++(define_insn_and_rewrite "*aarch64_pred_abd" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") ++ (unspec:SVE_FULL_F ++ [(match_operand: 1 "register_operand" "Upl, Upl") ++ (match_operand:SI 4 "aarch64_sve_gp_strictness") ++ (unspec:SVE_FULL_F ++ [(match_operand 5) ++ (match_operand:SI 6 "aarch64_sve_gp_strictness") ++ (match_operand:SVE_FULL_F 2 "register_operand" "%0, w") ++ (match_operand:SVE_FULL_F 3 "register_operand" "w, w")] ++ UNSPEC_COND_FSUB)] ++ UNSPEC_COND_FABS))] ++ "TARGET_SVE && aarch64_sve_pred_dominates_p (&operands[5], operands[1])" ++ "@ ++ fabd\t%0., %1/m, %0., %3. ++ movprfx\t%0, %2\;fabd\t%0., %1/m, %0., %3." ++ "&& !rtx_equal_p (operands[1], operands[5])" + { +- aarch64_expand_sve_vec_perm (operands[0], operands[1], +- operands[2], operands[3]); +- DONE; ++ operands[5] = copy_rtx (operands[1]); + } ++ [(set_attr "movprfx" "*,yes")] + ) + +-(define_insn "*aarch64_sve_tbl" +- [(set (match_operand:SVE_ALL 0 "register_operand" "=w") +- (unspec:SVE_ALL +- [(match_operand:SVE_ALL 1 "register_operand" "w") +- (match_operand: 2 "register_operand" "w")] +- UNSPEC_TBL))] ++(define_expand "@aarch64_cond_abd" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand") ++ (unspec:SVE_FULL_F ++ [(match_operand: 1 "register_operand") ++ (unspec:SVE_FULL_F ++ [(match_dup 1) ++ (const_int SVE_STRICT_GP) ++ (unspec:SVE_FULL_F ++ [(match_dup 1) ++ (const_int SVE_STRICT_GP) ++ (match_operand:SVE_FULL_F 2 "register_operand") ++ (match_operand:SVE_FULL_F 3 "register_operand")] ++ UNSPEC_COND_FSUB)] ++ UNSPEC_COND_FABS) ++ (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")] ++ UNSPEC_SEL))] + "TARGET_SVE" +- "tbl\t%0., %1., %2." ++{ ++ if (rtx_equal_p (operands[3], operands[4])) ++ std::swap (operands[2], operands[3]); ++}) ++ ++;; Predicated floating-point absolute difference, merging with the first ++;; input. 
++(define_insn_and_rewrite "*aarch64_cond_abd_2" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") ++ (unspec:SVE_FULL_F ++ [(match_operand: 1 "register_operand" "Upl, Upl") ++ (unspec:SVE_FULL_F ++ [(match_operand 4) ++ (match_operand:SI 5 "aarch64_sve_gp_strictness") ++ (unspec:SVE_FULL_F ++ [(match_operand 6) ++ (match_operand:SI 7 "aarch64_sve_gp_strictness") ++ (match_operand:SVE_FULL_F 2 "register_operand" "0, w") ++ (match_operand:SVE_FULL_F 3 "register_operand" "w, w")] ++ UNSPEC_COND_FSUB)] ++ UNSPEC_COND_FABS) ++ (match_dup 2)] ++ UNSPEC_SEL))] ++ "TARGET_SVE ++ && aarch64_sve_pred_dominates_p (&operands[4], operands[1]) ++ && aarch64_sve_pred_dominates_p (&operands[6], operands[1])" ++ "@ ++ fabd\t%0., %1/m, %0., %3. ++ movprfx\t%0, %2\;fabd\t%0., %1/m, %0., %3." ++ "&& (!rtx_equal_p (operands[1], operands[4]) ++ || !rtx_equal_p (operands[1], operands[6]))" ++ { ++ operands[4] = copy_rtx (operands[1]); ++ operands[6] = copy_rtx (operands[1]); ++ } ++ [(set_attr "movprfx" "*,yes")] + ) + +-(define_insn "*aarch64_sve_" +- [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa") +- (unspec:PRED_ALL [(match_operand:PRED_ALL 1 "register_operand" "Upa") +- (match_operand:PRED_ALL 2 "register_operand" "Upa")] +- PERMUTE))] +- "TARGET_SVE" +- "\t%0., %1., %2." ++;; Predicated floating-point absolute difference, merging with the second ++;; input. ++(define_insn_and_rewrite "*aarch64_cond_abd_3" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") ++ (unspec:SVE_FULL_F ++ [(match_operand: 1 "register_operand" "Upl, Upl") ++ (unspec:SVE_FULL_F ++ [(match_operand 4) ++ (match_operand:SI 5 "aarch64_sve_gp_strictness") ++ (unspec:SVE_FULL_F ++ [(match_operand 6) ++ (match_operand:SI 7 "aarch64_sve_gp_strictness") ++ (match_operand:SVE_FULL_F 2 "register_operand" "w, w") ++ (match_operand:SVE_FULL_F 3 "register_operand" "0, w")] ++ UNSPEC_COND_FSUB)] ++ UNSPEC_COND_FABS) ++ (match_dup 3)] ++ UNSPEC_SEL))] ++ "TARGET_SVE ++ && aarch64_sve_pred_dominates_p (&operands[4], operands[1]) ++ && aarch64_sve_pred_dominates_p (&operands[6], operands[1])" ++ "@ ++ fabd\t%0., %1/m, %0., %2. ++ movprfx\t%0, %3\;fabd\t%0., %1/m, %0., %2." ++ "&& (!rtx_equal_p (operands[1], operands[4]) ++ || !rtx_equal_p (operands[1], operands[6]))" ++ { ++ operands[4] = copy_rtx (operands[1]); ++ operands[6] = copy_rtx (operands[1]); ++ } ++ [(set_attr "movprfx" "*,yes")] ++) ++ ++;; Predicated floating-point absolute difference, merging with an ++;; independent value. ++(define_insn_and_rewrite "*aarch64_cond_abd_any" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand" "=&w, &w, &w, &w, ?&w") ++ (unspec:SVE_FULL_F ++ [(match_operand: 1 "register_operand" "Upl, Upl, Upl, Upl, Upl") ++ (unspec:SVE_FULL_F ++ [(match_operand 5) ++ (match_operand:SI 6 "aarch64_sve_gp_strictness") ++ (unspec:SVE_FULL_F ++ [(match_operand 7) ++ (match_operand:SI 8 "aarch64_sve_gp_strictness") ++ (match_operand:SVE_FULL_F 2 "register_operand" "0, w, w, w, w") ++ (match_operand:SVE_FULL_F 3 "register_operand" "w, 0, w, w, w")] ++ UNSPEC_COND_FSUB)] ++ UNSPEC_COND_FABS) ++ (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero" "Dz, Dz, Dz, 0, w")] ++ UNSPEC_SEL))] ++ "TARGET_SVE ++ && !rtx_equal_p (operands[2], operands[4]) ++ && !rtx_equal_p (operands[3], operands[4]) ++ && aarch64_sve_pred_dominates_p (&operands[5], operands[1]) ++ && aarch64_sve_pred_dominates_p (&operands[7], operands[1])" ++ "@ ++ movprfx\t%0., %1/z, %0.\;fabd\t%0., %1/m, %0., %3. ++ movprfx\t%0., %1/z, %0.\;fabd\t%0., %1/m, %0., %2. 
++ movprfx\t%0., %1/z, %2.\;fabd\t%0., %1/m, %0., %3. ++ movprfx\t%0., %1/m, %2.\;fabd\t%0., %1/m, %0., %3. ++ #" ++ "&& 1" ++ { ++ if (reload_completed ++ && register_operand (operands[4], mode) ++ && !rtx_equal_p (operands[0], operands[4])) ++ { ++ emit_insn (gen_vcond_mask_ (operands[0], operands[3], ++ operands[4], operands[1])); ++ operands[4] = operands[3] = operands[0]; ++ } ++ else if (!rtx_equal_p (operands[1], operands[5]) ++ || !rtx_equal_p (operands[1], operands[7])) ++ { ++ operands[5] = copy_rtx (operands[1]); ++ operands[7] = copy_rtx (operands[1]); ++ } ++ else ++ FAIL; ++ } ++ [(set_attr "movprfx" "yes")] + ) + +-(define_insn "aarch64_sve_" +- [(set (match_operand:SVE_ALL 0 "register_operand" "=w") +- (unspec:SVE_ALL [(match_operand:SVE_ALL 1 "register_operand" "w") +- (match_operand:SVE_ALL 2 "register_operand" "w")] +- PERMUTE))] ++;; ------------------------------------------------------------------------- ++;; ---- [FP] Multiplication ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - FMUL ++;; ------------------------------------------------------------------------- ++ ++;; Predicated floating-point multiplication. ++(define_insn_and_split "@aarch64_pred_" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, w, w, ?&w, ?&w") ++ (unspec:SVE_FULL_F ++ [(match_operand: 1 "register_operand" "Upl, Upl, Upl, Upl, Upl") ++ (match_operand:SI 4 "aarch64_sve_gp_strictness" "i, Z, Ui1, i, Ui1") ++ (match_operand:SVE_FULL_F 2 "register_operand" "%0, w, 0, w, w") ++ (match_operand:SVE_FULL_F 3 "aarch64_sve_float_mul_operand" "vsM, w, w, vsM, w")] ++ SVE_COND_FP_MUL))] + "TARGET_SVE" +- "\t%0., %1., %2." ++ "@ ++ fmul\t%0., %1/m, %0., #%3 ++ # ++ fmul\t%0., %1/m, %0., %3. ++ movprfx\t%0, %2\;fmul\t%0., %1/m, %0., #%3 ++ movprfx\t%0, %2\;fmul\t%0., %1/m, %0., %3." ++ ; Split the unpredicated form after reload, so that we don't have ++ ; the unnecessary PTRUE. ++ "&& reload_completed ++ && register_operand (operands[3], mode) ++ && INTVAL (operands[4]) == SVE_RELAXED_GP" ++ [(set (match_dup 0) (mult:SVE_FULL_F (match_dup 2) (match_dup 3)))] ++ "" ++ [(set_attr "movprfx" "*,*,*,yes,yes")] + ) + +-(define_insn "*aarch64_sve_rev64" +- [(set (match_operand:SVE_BHS 0 "register_operand" "=w") +- (unspec:SVE_BHS +- [(match_operand:VNx2BI 1 "register_operand" "Upl") +- (unspec:SVE_BHS [(match_operand:SVE_BHS 2 "register_operand" "w")] +- UNSPEC_REV64)] +- UNSPEC_MERGE_PTRUE))] ++;; Merging forms are handled through SVE_COND_FP_BINARY and ++;; SVE_COND_FP_BINARY_I1. ++ ++;; Unpredicated multiplication by selected lanes. 
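The multiplication-by-selected-lanes pattern that follows maps onto the indexed form of FMUL. A scalar sketch of the assumed semantics, taking the usual SVE rule that the index selects an element within each 128-bit segment of the second operand (the segment handling and all names here are assumptions made for illustration):

#include <stddef.h>

/* Assumed model of FMUL (indexed) on single-precision lanes: each element
   of A is multiplied by the lane'th element of B within the same
   128-bit (4 x float) segment.  */
static void
fmul_lane_f32 (const float *a, const float *b, float *out,
               size_t n, unsigned lane)
{
  const size_t seg = 4;  /* 32-bit elements per 128-bit segment.  */
  for (size_t i = 0; i < n; i++)
    out[i] = a[i] * b[(i / seg) * seg + lane];
}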
++(define_insn "@aarch64_mul_lane_" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w") ++ (mult:SVE_FULL_F ++ (unspec:SVE_FULL_F ++ [(match_operand:SVE_FULL_F 2 "register_operand" "") ++ (match_operand:SI 3 "const_int_operand")] ++ UNSPEC_SVE_LANE_SELECT) ++ (match_operand:SVE_FULL_F 1 "register_operand" "w")))] + "TARGET_SVE" +- "rev\t%0.d, %1/m, %2.d" ++ "fmul\t%0., %1., %2.[%3]" + ) + +-(define_insn "*aarch64_sve_rev32" +- [(set (match_operand:SVE_BH 0 "register_operand" "=w") +- (unspec:SVE_BH +- [(match_operand:VNx4BI 1 "register_operand" "Upl") +- (unspec:SVE_BH [(match_operand:SVE_BH 2 "register_operand" "w")] +- UNSPEC_REV32)] +- UNSPEC_MERGE_PTRUE))] ++;; ------------------------------------------------------------------------- ++;; ---- [FP] Binary logical operations ++;; ------------------------------------------------------------------------- ++;; Includes ++;; - AND ++;; - EOR ++;; - ORR ++;; ------------------------------------------------------------------------- ++ ++;; Binary logical operations on floating-point modes. We avoid subregs ++;; by providing this, but we need to use UNSPECs since rtx logical ops ++;; aren't defined for floating-point modes. ++(define_insn "*3" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w") ++ (unspec:SVE_FULL_F ++ [(match_operand:SVE_FULL_F 1 "register_operand" "w") ++ (match_operand:SVE_FULL_F 2 "register_operand" "w")] ++ LOGICALF))] + "TARGET_SVE" +- "rev\t%0.s, %1/m, %2.s" ++ "\t%0.d, %1.d, %2.d" + ) + +-(define_insn "*aarch64_sve_rev16vnx16qi" +- [(set (match_operand:VNx16QI 0 "register_operand" "=w") +- (unspec:VNx16QI +- [(match_operand:VNx8BI 1 "register_operand" "Upl") +- (unspec:VNx16QI [(match_operand:VNx16QI 2 "register_operand" "w")] +- UNSPEC_REV16)] +- UNSPEC_MERGE_PTRUE))] ++;; ------------------------------------------------------------------------- ++;; ---- [FP] Sign copying ++;; ------------------------------------------------------------------------- ++;; The patterns in this section are synthetic. 
++;; ------------------------------------------------------------------------- ++ ++(define_expand "copysign3" ++ [(match_operand:SVE_FULL_F 0 "register_operand") ++ (match_operand:SVE_FULL_F 1 "register_operand") ++ (match_operand:SVE_FULL_F 2 "register_operand")] + "TARGET_SVE" +- "revb\t%0.h, %1/m, %2.h" ++ { ++ rtx sign = gen_reg_rtx (mode); ++ rtx mant = gen_reg_rtx (mode); ++ rtx int_res = gen_reg_rtx (mode); ++ int bits = GET_MODE_UNIT_BITSIZE (mode) - 1; ++ ++ rtx arg1 = lowpart_subreg (mode, operands[1], mode); ++ rtx arg2 = lowpart_subreg (mode, operands[2], mode); ++ ++ emit_insn (gen_and3 ++ (sign, arg2, ++ aarch64_simd_gen_const_vector_dup (mode, ++ HOST_WIDE_INT_M1U ++ << bits))); ++ emit_insn (gen_and3 ++ (mant, arg1, ++ aarch64_simd_gen_const_vector_dup (mode, ++ ~(HOST_WIDE_INT_M1U ++ << bits)))); ++ emit_insn (gen_ior3 (int_res, sign, mant)); ++ emit_move_insn (operands[0], gen_lowpart (mode, int_res)); ++ DONE; ++ } + ) + +-(define_insn "*aarch64_sve_rev" +- [(set (match_operand:SVE_ALL 0 "register_operand" "=w") +- (unspec:SVE_ALL [(match_operand:SVE_ALL 1 "register_operand" "w")] +- UNSPEC_REV))] ++(define_expand "xorsign3" ++ [(match_operand:SVE_FULL_F 0 "register_operand") ++ (match_operand:SVE_FULL_F 1 "register_operand") ++ (match_operand:SVE_FULL_F 2 "register_operand")] + "TARGET_SVE" +- "rev\t%0., %1.") ++ { ++ rtx sign = gen_reg_rtx (mode); ++ rtx int_res = gen_reg_rtx (mode); ++ int bits = GET_MODE_UNIT_BITSIZE (mode) - 1; + +-(define_insn "*aarch64_sve_dup_lane" +- [(set (match_operand:SVE_ALL 0 "register_operand" "=w") +- (vec_duplicate:SVE_ALL +- (vec_select: +- (match_operand:SVE_ALL 1 "register_operand" "w") +- (parallel [(match_operand:SI 2 "const_int_operand")]))))] +- "TARGET_SVE +- && IN_RANGE (INTVAL (operands[2]) * GET_MODE_SIZE (mode), 0, 63)" +- "dup\t%0., %1.[%2]" ++ rtx arg1 = lowpart_subreg (mode, operands[1], mode); ++ rtx arg2 = lowpart_subreg (mode, operands[2], mode); ++ ++ emit_insn (gen_and3 ++ (sign, arg2, ++ aarch64_simd_gen_const_vector_dup (mode, ++ HOST_WIDE_INT_M1U ++ << bits))); ++ emit_insn (gen_xor3 (int_res, arg1, sign)); ++ emit_move_insn (operands[0], gen_lowpart (mode, int_res)); ++ DONE; ++ } + ) + +-;; Note that the immediate (third) operand is the lane index not +-;; the byte index. +-(define_insn "*aarch64_sve_ext" +- [(set (match_operand:SVE_ALL 0 "register_operand" "=w") +- (unspec:SVE_ALL [(match_operand:SVE_ALL 1 "register_operand" "0") +- (match_operand:SVE_ALL 2 "register_operand" "w") +- (match_operand:SI 3 "const_int_operand")] +- UNSPEC_EXT))] +- "TARGET_SVE +- && IN_RANGE (INTVAL (operands[3]) * GET_MODE_SIZE (mode), 0, 255)" ++;; ------------------------------------------------------------------------- ++;; ---- [FP] Maximum and minimum ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - FMAX ++;; - FMAXNM ++;; - FMIN ++;; - FMINNM ++;; ------------------------------------------------------------------------- ++ ++;; Unpredicated fmax/fmin (the libm functions). The optabs for the ++;; smin/smax rtx codes are handled in the generic section above. 
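Before the maximum/minimum patterns, a note on the sign-copying expanders just above: as their comments say, the patterns are synthetic, built from integer AND/ORR/EOR on the sign bit rather than from a dedicated FP instruction. A scalar double-precision sketch of the same bit manipulation (helper names invented for illustration):

#include <stdint.h>
#include <string.h>

/* copysign: keep the magnitude of X, take the sign of Y.  */
static double
copysign_bits (double x, double y)
{
  uint64_t ix, iy, sign = UINT64_C (1) << 63;
  memcpy (&ix, &x, sizeof ix);
  memcpy (&iy, &y, sizeof iy);
  ix = (ix & ~sign) | (iy & sign);
  memcpy (&x, &ix, sizeof x);
  return x;
}

/* xorsign: flip the sign of X wherever Y is negative.  */
static double
xorsign_bits (double x, double y)
{
  uint64_t ix, iy, sign = UINT64_C (1) << 63;
  memcpy (&ix, &x, sizeof ix);
  memcpy (&iy, &y, sizeof iy);
  ix ^= iy & sign;
  memcpy (&x, &ix, sizeof x);
  return x;
}

The expanders build the result in integer-mode temporaries because the vector logical instructions operate on bit patterns, and then move it back to the FP mode with a lowpart subreg.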
++(define_expand "3" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand") ++ (unspec:SVE_FULL_F ++ [(match_dup 3) ++ (const_int SVE_RELAXED_GP) ++ (match_operand:SVE_FULL_F 1 "register_operand") ++ (match_operand:SVE_FULL_F 2 "aarch64_sve_float_maxmin_operand")] ++ SVE_COND_FP_MAXMIN_PUBLIC))] ++ "TARGET_SVE" + { +- operands[3] = GEN_INT (INTVAL (operands[3]) * GET_MODE_SIZE (mode)); +- return "ext\\t%0.b, %0.b, %2.b, #%3"; ++ operands[3] = aarch64_ptrue_reg (mode); + } + ) + +-(define_insn "add3" +- [(set (match_operand:SVE_I 0 "register_operand" "=w, w, w, w") +- (plus:SVE_I +- (match_operand:SVE_I 1 "register_operand" "%0, 0, 0, w") +- (match_operand:SVE_I 2 "aarch64_sve_add_operand" "vsa, vsn, vsi, w")))] ++;; Predicated floating-point maximum/minimum. ++(define_insn "@aarch64_pred_" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, w, ?&w, ?&w") ++ (unspec:SVE_FULL_F ++ [(match_operand: 1 "register_operand" "Upl, Upl, Upl, Upl") ++ (match_operand:SI 4 "aarch64_sve_gp_strictness") ++ (match_operand:SVE_FULL_F 2 "register_operand" "%0, 0, w, w") ++ (match_operand:SVE_FULL_F 3 "aarch64_sve_float_maxmin_operand" "vsB, w, vsB, w")] ++ SVE_COND_FP_MAXMIN))] + "TARGET_SVE" + "@ +- add\t%0., %0., #%D2 +- sub\t%0., %0., #%N2 +- * return aarch64_output_sve_inc_dec_immediate (\"%0.\", operands[2]); +- add\t%0., %1., %2." ++ \t%0., %1/m, %0., #%3 ++ \t%0., %1/m, %0., %3. ++ movprfx\t%0, %2\;\t%0., %1/m, %0., #%3 ++ movprfx\t%0, %2\;\t%0., %1/m, %0., %3." ++ [(set_attr "movprfx" "*,*,yes,yes")] + ) + +-(define_insn "sub3" +- [(set (match_operand:SVE_I 0 "register_operand" "=w, w") +- (minus:SVE_I +- (match_operand:SVE_I 1 "aarch64_sve_arith_operand" "w, vsa") +- (match_operand:SVE_I 2 "register_operand" "w, 0")))] ++;; Merging forms are handled through SVE_COND_FP_BINARY and ++;; SVE_COND_FP_BINARY_I1. ++ ++;; ------------------------------------------------------------------------- ++;; ---- [PRED] Binary logical operations ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - AND ++;; - ANDS ++;; - EOR ++;; - EORS ++;; - ORR ++;; - ORRS ++;; ------------------------------------------------------------------------- ++ ++;; Predicate AND. We can reuse one of the inputs as the GP. ++;; Doubling the second operand is the preferred implementation ++;; of the MOV alias, so we use that instead of %1/z, %1, %2. ++(define_insn "and3" ++ [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa") ++ (and:PRED_ALL (match_operand:PRED_ALL 1 "register_operand" "Upa") ++ (match_operand:PRED_ALL 2 "register_operand" "Upa")))] + "TARGET_SVE" +- "@ +- sub\t%0., %1., %2. +- subr\t%0., %0., #%D1" ++ "and\t%0.b, %1/z, %2.b, %2.b" + ) + +-;; Unpredicated multiplication. +-(define_expand "mul3" +- [(set (match_operand:SVE_I 0 "register_operand") +- (unspec:SVE_I +- [(match_dup 3) +- (mult:SVE_I +- (match_operand:SVE_I 1 "register_operand") +- (match_operand:SVE_I 2 "aarch64_sve_mul_operand"))] +- UNSPEC_MERGE_PTRUE))] ++;; Unpredicated predicate EOR and ORR. ++(define_expand "3" ++ [(set (match_operand:PRED_ALL 0 "register_operand") ++ (and:PRED_ALL ++ (LOGICAL_OR:PRED_ALL ++ (match_operand:PRED_ALL 1 "register_operand") ++ (match_operand:PRED_ALL 2 "register_operand")) ++ (match_dup 3)))] + "TARGET_SVE" + { +- operands[3] = force_reg (mode, CONSTM1_RTX (mode)); ++ operands[3] = aarch64_ptrue_reg (mode); + } + ) + +-;; Multiplication predicated with a PTRUE. 
We don't actually need the +-;; predicate for the first alternative, but using Upa or X isn't likely +-;; to gain much and would make the instruction seem less uniform to the +-;; register allocator. +-(define_insn_and_split "*mul3" +- [(set (match_operand:SVE_I 0 "register_operand" "=w, w, ?&w") +- (unspec:SVE_I +- [(match_operand: 1 "register_operand" "Upl, Upl, Upl") +- (mult:SVE_I +- (match_operand:SVE_I 2 "register_operand" "%0, 0, w") +- (match_operand:SVE_I 3 "aarch64_sve_mul_operand" "vsm, w, w"))] +- UNSPEC_MERGE_PTRUE))] ++;; Predicated predicate AND, EOR and ORR. ++(define_insn "@aarch64_pred__z" ++ [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa") ++ (and:PRED_ALL ++ (LOGICAL:PRED_ALL ++ (match_operand:PRED_ALL 2 "register_operand" "Upa") ++ (match_operand:PRED_ALL 3 "register_operand" "Upa")) ++ (match_operand:PRED_ALL 1 "register_operand" "Upa")))] + "TARGET_SVE" +- "@ +- # +- mul\t%0., %1/m, %0., %3. +- movprfx\t%0, %2\;mul\t%0., %1/m, %0., %3." +- ; Split the unpredicated form after reload, so that we don't have +- ; the unnecessary PTRUE. +- "&& reload_completed +- && !register_operand (operands[3], mode)" +- [(set (match_dup 0) (mult:SVE_I (match_dup 2) (match_dup 3)))] +- "" +- [(set_attr "movprfx" "*,*,yes")] +-) +- +-;; Unpredicated multiplications by a constant (post-RA only). +-;; These are generated by splitting a predicated instruction whose +-;; predicate is unused. +-(define_insn "*post_ra_mul3" +- [(set (match_operand:SVE_I 0 "register_operand" "=w") +- (mult:SVE_I +- (match_operand:SVE_I 1 "register_operand" "0") +- (match_operand:SVE_I 2 "aarch64_sve_mul_immediate")))] +- "TARGET_SVE && reload_completed" +- "mul\t%0., %0., #%2" ++ "\t%0.b, %1/z, %2.b, %3.b" + ) + +-(define_insn "*madd" +- [(set (match_operand:SVE_I 0 "register_operand" "=w, w, ?&w") +- (plus:SVE_I +- (unspec:SVE_I +- [(match_operand: 1 "register_operand" "Upl, Upl, Upl") +- (mult:SVE_I (match_operand:SVE_I 2 "register_operand" "%0, w, w") +- (match_operand:SVE_I 3 "register_operand" "w, w, w"))] +- UNSPEC_MERGE_PTRUE) +- (match_operand:SVE_I 4 "register_operand" "w, 0, w")))] ++;; Perform a logical operation on operands 2 and 3, using operand 1 as ++;; the GP. Store the result in operand 0 and set the flags in the same ++;; way as for PTEST. ++(define_insn "*3_cc" ++ [(set (reg:CC_NZC CC_REGNUM) ++ (unspec:CC_NZC ++ [(match_operand:VNx16BI 1 "register_operand" "Upa") ++ (match_operand 4) ++ (match_operand:SI 5 "aarch64_sve_ptrue_flag") ++ (and:PRED_ALL ++ (LOGICAL:PRED_ALL ++ (match_operand:PRED_ALL 2 "register_operand" "Upa") ++ (match_operand:PRED_ALL 3 "register_operand" "Upa")) ++ (match_dup 4))] ++ UNSPEC_PTEST)) ++ (set (match_operand:PRED_ALL 0 "register_operand" "=Upa") ++ (and:PRED_ALL (LOGICAL:PRED_ALL (match_dup 2) (match_dup 3)) ++ (match_dup 4)))] + "TARGET_SVE" +- "@ +- mad\t%0., %1/m, %3., %4. +- mla\t%0., %1/m, %2., %3. +- movprfx\t%0, %4\;mla\t%0., %1/m, %2., %3." +- [(set_attr "movprfx" "*,*,yes")] ++ "s\t%0.b, %1/z, %2.b, %3.b" + ) + +-(define_insn "*msub3" +- [(set (match_operand:SVE_I 0 "register_operand" "=w, w, ?&w") +- (minus:SVE_I +- (match_operand:SVE_I 4 "register_operand" "w, 0, w") +- (unspec:SVE_I +- [(match_operand: 1 "register_operand" "Upl, Upl, Upl") +- (mult:SVE_I (match_operand:SVE_I 2 "register_operand" "%0, w, w") +- (match_operand:SVE_I 3 "register_operand" "w, w, w"))] +- UNSPEC_MERGE_PTRUE)))] ++;; Same with just the flags result. 
++(define_insn "*3_ptest" ++ [(set (reg:CC_NZC CC_REGNUM) ++ (unspec:CC_NZC ++ [(match_operand:VNx16BI 1 "register_operand" "Upa") ++ (match_operand 4) ++ (match_operand:SI 5 "aarch64_sve_ptrue_flag") ++ (and:PRED_ALL ++ (LOGICAL:PRED_ALL ++ (match_operand:PRED_ALL 2 "register_operand" "Upa") ++ (match_operand:PRED_ALL 3 "register_operand" "Upa")) ++ (match_dup 4))] ++ UNSPEC_PTEST)) ++ (clobber (match_scratch:VNx16BI 0 "=Upa"))] + "TARGET_SVE" +- "@ +- msb\t%0., %1/m, %3., %4. +- mls\t%0., %1/m, %2., %3. +- movprfx\t%0, %4\;mls\t%0., %1/m, %2., %3." +- [(set_attr "movprfx" "*,*,yes")] ++ "s\t%0.b, %1/z, %2.b, %3.b" + ) + +-;; Unpredicated highpart multiplication. +-(define_expand "mul3_highpart" +- [(set (match_operand:SVE_I 0 "register_operand") +- (unspec:SVE_I +- [(match_dup 3) +- (unspec:SVE_I [(match_operand:SVE_I 1 "register_operand") +- (match_operand:SVE_I 2 "register_operand")] +- MUL_HIGHPART)] +- UNSPEC_MERGE_PTRUE))] ++;; ------------------------------------------------------------------------- ++;; ---- [PRED] Binary logical operations (inverted second input) ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - BIC ++;; - ORN ++;; ------------------------------------------------------------------------- ++ ++;; Predicated predicate BIC and ORN. ++(define_insn "aarch64_pred__z" ++ [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa") ++ (and:PRED_ALL ++ (NLOGICAL:PRED_ALL ++ (not:PRED_ALL (match_operand:PRED_ALL 3 "register_operand" "Upa")) ++ (match_operand:PRED_ALL 2 "register_operand" "Upa")) ++ (match_operand:PRED_ALL 1 "register_operand" "Upa")))] + "TARGET_SVE" +- { +- operands[3] = force_reg (mode, CONSTM1_RTX (mode)); +- } +-) ++ "\t%0.b, %1/z, %2.b, %3.b" ++) ++ ++;; Same, but set the flags as a side-effect. ++(define_insn "*3_cc" ++ [(set (reg:CC_NZC CC_REGNUM) ++ (unspec:CC_NZC ++ [(match_operand:VNx16BI 1 "register_operand" "Upa") ++ (match_operand 4) ++ (match_operand:SI 5 "aarch64_sve_ptrue_flag") ++ (and:PRED_ALL ++ (NLOGICAL:PRED_ALL ++ (not:PRED_ALL ++ (match_operand:PRED_ALL 3 "register_operand" "Upa")) ++ (match_operand:PRED_ALL 2 "register_operand" "Upa")) ++ (match_dup 4))] ++ UNSPEC_PTEST)) ++ (set (match_operand:PRED_ALL 0 "register_operand" "=Upa") ++ (and:PRED_ALL (NLOGICAL:PRED_ALL ++ (not:PRED_ALL (match_dup 3)) ++ (match_dup 2)) ++ (match_dup 4)))] ++ "TARGET_SVE" ++ "s\t%0.b, %1/z, %2.b, %3.b" ++) ++ ++;; Same with just the flags result. ++(define_insn "*3_ptest" ++ [(set (reg:CC_NZC CC_REGNUM) ++ (unspec:CC_NZC ++ [(match_operand:VNx16BI 1 "register_operand" "Upa") ++ (match_operand 4) ++ (match_operand:SI 5 "aarch64_sve_ptrue_flag") ++ (and:PRED_ALL ++ (NLOGICAL:PRED_ALL ++ (not:PRED_ALL ++ (match_operand:PRED_ALL 3 "register_operand" "Upa")) ++ (match_operand:PRED_ALL 2 "register_operand" "Upa")) ++ (match_dup 4))] ++ UNSPEC_PTEST)) ++ (clobber (match_scratch:VNx16BI 0 "=Upa"))] ++ "TARGET_SVE" ++ "s\t%0.b, %1/z, %2.b, %3.b" ++) ++ ++;; ------------------------------------------------------------------------- ++;; ---- [PRED] Binary logical operations (inverted result) ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - NAND ++;; - NOR ++;; ------------------------------------------------------------------------- + +-;; Predicated highpart multiplication. 
+-(define_insn "*mul3_highpart" +- [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w") +- (unspec:SVE_I +- [(match_operand: 1 "register_operand" "Upl, Upl") +- (unspec:SVE_I [(match_operand:SVE_I 2 "register_operand" "%0, w") +- (match_operand:SVE_I 3 "register_operand" "w, w")] +- MUL_HIGHPART)] +- UNSPEC_MERGE_PTRUE))] ++;; Predicated predicate NAND and NOR. ++(define_insn "aarch64_pred__z" ++ [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa") ++ (and:PRED_ALL ++ (NLOGICAL:PRED_ALL ++ (not:PRED_ALL (match_operand:PRED_ALL 2 "register_operand" "Upa")) ++ (not:PRED_ALL (match_operand:PRED_ALL 3 "register_operand" "Upa"))) ++ (match_operand:PRED_ALL 1 "register_operand" "Upa")))] + "TARGET_SVE" +- "@ +- mulh\t%0., %1/m, %0., %3. +- movprfx\t%0, %2\;mulh\t%0., %1/m, %0., %3." +- [(set_attr "movprfx" "*,yes")] ++ "\t%0.b, %1/z, %2.b, %3.b" + ) + +-;; Unpredicated division. +-(define_expand "3" +- [(set (match_operand:SVE_SDI 0 "register_operand") +- (unspec:SVE_SDI +- [(match_dup 3) +- (SVE_INT_BINARY_SD:SVE_SDI +- (match_operand:SVE_SDI 1 "register_operand") +- (match_operand:SVE_SDI 2 "register_operand"))] +- UNSPEC_MERGE_PTRUE))] ++;; Same, but set the flags as a side-effect. ++(define_insn "*3_cc" ++ [(set (reg:CC_NZC CC_REGNUM) ++ (unspec:CC_NZC ++ [(match_operand:VNx16BI 1 "register_operand" "Upa") ++ (match_operand 4) ++ (match_operand:SI 5 "aarch64_sve_ptrue_flag") ++ (and:PRED_ALL ++ (NLOGICAL:PRED_ALL ++ (not:PRED_ALL ++ (match_operand:PRED_ALL 2 "register_operand" "Upa")) ++ (not:PRED_ALL ++ (match_operand:PRED_ALL 3 "register_operand" "Upa"))) ++ (match_dup 4))] ++ UNSPEC_PTEST)) ++ (set (match_operand:PRED_ALL 0 "register_operand" "=Upa") ++ (and:PRED_ALL (NLOGICAL:PRED_ALL ++ (not:PRED_ALL (match_dup 2)) ++ (not:PRED_ALL (match_dup 3))) ++ (match_dup 4)))] ++ "TARGET_SVE" ++ "s\t%0.b, %1/z, %2.b, %3.b" ++) ++ ++;; Same with just the flags result. ++(define_insn "*3_ptest" ++ [(set (reg:CC_NZC CC_REGNUM) ++ (unspec:CC_NZC ++ [(match_operand:VNx16BI 1 "register_operand" "Upa") ++ (match_operand 4) ++ (match_operand:SI 5 "aarch64_sve_ptrue_flag") ++ (and:PRED_ALL ++ (NLOGICAL:PRED_ALL ++ (not:PRED_ALL ++ (match_operand:PRED_ALL 2 "register_operand" "Upa")) ++ (not:PRED_ALL ++ (match_operand:PRED_ALL 3 "register_operand" "Upa"))) ++ (match_dup 4))] ++ UNSPEC_PTEST)) ++ (clobber (match_scratch:VNx16BI 0 "=Upa"))] ++ "TARGET_SVE" ++ "s\t%0.b, %1/z, %2.b, %3.b" ++) ++ ++;; ========================================================================= ++;; == Ternary arithmetic ++;; ========================================================================= ++ ++;; ------------------------------------------------------------------------- ++;; ---- [INT] MLA and MAD ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - MAD ++;; - MLA ++;; ------------------------------------------------------------------------- ++ ++;; Unpredicated integer addition of product. ++(define_expand "fma4" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand") ++ (plus:SVE_FULL_I ++ (unspec:SVE_FULL_I ++ [(match_dup 4) ++ (mult:SVE_FULL_I ++ (match_operand:SVE_FULL_I 1 "register_operand") ++ (match_operand:SVE_FULL_I 2 "nonmemory_operand"))] ++ UNSPEC_PRED_X) ++ (match_operand:SVE_FULL_I 3 "register_operand")))] + "TARGET_SVE" + { +- operands[3] = force_reg (mode, CONSTM1_RTX (mode)); ++ if (aarch64_prepare_sve_int_fma (operands, PLUS)) ++ DONE; ++ operands[4] = aarch64_ptrue_reg (mode); + } + ) + +-;; Division predicated with a PTRUE. 
+-(define_insn "*3" +- [(set (match_operand:SVE_SDI 0 "register_operand" "=w, w, ?&w") +- (unspec:SVE_SDI +- [(match_operand: 1 "register_operand" "Upl, Upl, Upl") +- (SVE_INT_BINARY_SD:SVE_SDI +- (match_operand:SVE_SDI 2 "register_operand" "0, w, w") +- (match_operand:SVE_SDI 3 "aarch64_sve_mul_operand" "w, 0, w"))] +- UNSPEC_MERGE_PTRUE))] ++;; Predicated integer addition of product. ++(define_insn "@aarch64_pred_fma" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, w, ?&w") ++ (plus:SVE_FULL_I ++ (unspec:SVE_FULL_I ++ [(match_operand: 1 "register_operand" "Upl, Upl, Upl") ++ (mult:SVE_FULL_I ++ (match_operand:SVE_FULL_I 2 "register_operand" "%0, w, w") ++ (match_operand:SVE_FULL_I 3 "register_operand" "w, w, w"))] ++ UNSPEC_PRED_X) ++ (match_operand:SVE_FULL_I 4 "register_operand" "w, 0, w")))] + "TARGET_SVE" + "@ +- \t%0., %1/m, %0., %3. +- r\t%0., %1/m, %0., %2. +- movprfx\t%0, %2\;\t%0., %1/m, %0., %3." ++ mad\t%0., %1/m, %3., %4. ++ mla\t%0., %1/m, %2., %3. ++ movprfx\t%0, %4\;mla\t%0., %1/m, %2., %3." + [(set_attr "movprfx" "*,*,yes")] + ) + +-;; Unpredicated NEG, NOT and POPCOUNT. +-(define_expand "2" +- [(set (match_operand:SVE_I 0 "register_operand") +- (unspec:SVE_I +- [(match_dup 2) +- (SVE_INT_UNARY:SVE_I (match_operand:SVE_I 1 "register_operand"))] +- UNSPEC_MERGE_PTRUE))] ++;; Predicated integer addition of product with merging. ++(define_expand "cond_fma" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand") ++ (unspec:SVE_FULL_I ++ [(match_operand: 1 "register_operand") ++ (plus:SVE_FULL_I ++ (mult:SVE_FULL_I ++ (match_operand:SVE_FULL_I 2 "register_operand") ++ (match_operand:SVE_FULL_I 3 "general_operand")) ++ (match_operand:SVE_FULL_I 4 "register_operand")) ++ (match_operand:SVE_FULL_I 5 "aarch64_simd_reg_or_zero")] ++ UNSPEC_SEL))] + "TARGET_SVE" + { +- operands[2] = force_reg (mode, CONSTM1_RTX (mode)); ++ if (aarch64_prepare_sve_cond_int_fma (operands, PLUS)) ++ DONE; ++ /* Swap the multiplication operands if the fallback value is the ++ second of the two. */ ++ if (rtx_equal_p (operands[3], operands[5])) ++ std::swap (operands[2], operands[3]); + } + ) + +-;; NEG, NOT and POPCOUNT predicated with a PTRUE. +-(define_insn "*2" +- [(set (match_operand:SVE_I 0 "register_operand" "=w") +- (unspec:SVE_I +- [(match_operand: 1 "register_operand" "Upl") +- (SVE_INT_UNARY:SVE_I +- (match_operand:SVE_I 2 "register_operand" "w"))] +- UNSPEC_MERGE_PTRUE))] ++;; Predicated integer addition of product, merging with the first input. ++(define_insn "*cond_fma_2" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w") ++ (unspec:SVE_FULL_I ++ [(match_operand: 1 "register_operand" "Upl, Upl") ++ (plus:SVE_FULL_I ++ (mult:SVE_FULL_I ++ (match_operand:SVE_FULL_I 2 "register_operand" "0, w") ++ (match_operand:SVE_FULL_I 3 "register_operand" "w, w")) ++ (match_operand:SVE_FULL_I 4 "register_operand" "w, w")) ++ (match_dup 2)] ++ UNSPEC_SEL))] + "TARGET_SVE" +- "\t%0., %1/m, %2." ++ "@ ++ mad\t%0., %1/m, %3., %4. ++ movprfx\t%0, %2\;mad\t%0., %1/m, %3., %4." ++ [(set_attr "movprfx" "*,yes")] + ) + +-;; Vector AND, ORR and XOR. +-(define_insn "3" +- [(set (match_operand:SVE_I 0 "register_operand" "=w, w") +- (LOGICAL:SVE_I +- (match_operand:SVE_I 1 "register_operand" "%0, w") +- (match_operand:SVE_I 2 "aarch64_sve_logical_operand" "vsl, w")))] ++;; Predicated integer addition of product, merging with the third input. 
++(define_insn "*cond_fma_4" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w") ++ (unspec:SVE_FULL_I ++ [(match_operand: 1 "register_operand" "Upl, Upl") ++ (plus:SVE_FULL_I ++ (mult:SVE_FULL_I ++ (match_operand:SVE_FULL_I 2 "register_operand" "w, w") ++ (match_operand:SVE_FULL_I 3 "register_operand" "w, w")) ++ (match_operand:SVE_FULL_I 4 "register_operand" "0, w")) ++ (match_dup 4)] ++ UNSPEC_SEL))] + "TARGET_SVE" + "@ +- \t%0., %0., #%C2 +- \t%0.d, %1.d, %2.d" ++ mla\t%0., %1/m, %2., %3. ++ movprfx\t%0, %4\;mla\t%0., %1/m, %2., %3." ++ [(set_attr "movprfx" "*,yes")] + ) + +-;; Vector AND, ORR and XOR on floating-point modes. We avoid subregs +-;; by providing this, but we need to use UNSPECs since rtx logical ops +-;; aren't defined for floating-point modes. +-(define_insn "*3" +- [(set (match_operand:SVE_F 0 "register_operand" "=w") +- (unspec:SVE_F [(match_operand:SVE_F 1 "register_operand" "w") +- (match_operand:SVE_F 2 "register_operand" "w")] +- LOGICALF))] +- "TARGET_SVE" +- "\t%0.d, %1.d, %2.d" ++;; Predicated integer addition of product, merging with an independent value. ++(define_insn_and_rewrite "*cond_fma_any" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand" "=&w, &w, &w, &w, &w, ?&w") ++ (unspec:SVE_FULL_I ++ [(match_operand: 1 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl") ++ (plus:SVE_FULL_I ++ (mult:SVE_FULL_I ++ (match_operand:SVE_FULL_I 2 "register_operand" "w, w, 0, w, w, w") ++ (match_operand:SVE_FULL_I 3 "register_operand" "w, w, w, 0, w, w")) ++ (match_operand:SVE_FULL_I 4 "register_operand" "w, 0, w, w, w, w")) ++ (match_operand:SVE_FULL_I 5 "aarch64_simd_reg_or_zero" "Dz, Dz, Dz, Dz, 0, w")] ++ UNSPEC_SEL))] ++ "TARGET_SVE ++ && !rtx_equal_p (operands[2], operands[5]) ++ && !rtx_equal_p (operands[3], operands[5]) ++ && !rtx_equal_p (operands[4], operands[5])" ++ "@ ++ movprfx\t%0., %1/z, %4.\;mla\t%0., %1/m, %2., %3. ++ movprfx\t%0., %1/z, %0.\;mla\t%0., %1/m, %2., %3. ++ movprfx\t%0., %1/z, %0.\;mad\t%0., %1/m, %3., %4. ++ movprfx\t%0., %1/z, %0.\;mad\t%0., %1/m, %2., %4. ++ movprfx\t%0., %1/m, %4.\;mla\t%0., %1/m, %2., %3. ++ #" ++ "&& reload_completed ++ && register_operand (operands[5], mode) ++ && !rtx_equal_p (operands[0], operands[5])" ++ { ++ emit_insn (gen_vcond_mask_ (operands[0], operands[4], ++ operands[5], operands[1])); ++ operands[5] = operands[4] = operands[0]; ++ } ++ [(set_attr "movprfx" "yes")] + ) + +-;; REG_EQUAL notes on "not3" should ensure that we can generate +-;; this pattern even though the NOT instruction itself is predicated. +-(define_insn "bic3" +- [(set (match_operand:SVE_I 0 "register_operand" "=w") +- (and:SVE_I +- (not:SVE_I (match_operand:SVE_I 1 "register_operand" "w")) +- (match_operand:SVE_I 2 "register_operand" "w")))] ++;; ------------------------------------------------------------------------- ++;; ---- [INT] MLS and MSB ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - MLS ++;; - MSB ++;; ------------------------------------------------------------------------- ++ ++;; Unpredicated integer subtraction of product. 
++(define_expand "fnma4" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand") ++ (minus:SVE_FULL_I ++ (match_operand:SVE_FULL_I 3 "register_operand") ++ (unspec:SVE_FULL_I ++ [(match_dup 4) ++ (mult:SVE_FULL_I ++ (match_operand:SVE_FULL_I 1 "register_operand") ++ (match_operand:SVE_FULL_I 2 "general_operand"))] ++ UNSPEC_PRED_X)))] + "TARGET_SVE" +- "bic\t%0.d, %2.d, %1.d" ++ { ++ if (aarch64_prepare_sve_int_fma (operands, MINUS)) ++ DONE; ++ operands[4] = aarch64_ptrue_reg (mode); ++ } + ) + +-;; Predicate AND. We can reuse one of the inputs as the GP. +-(define_insn "and3" +- [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa") +- (and:PRED_ALL (match_operand:PRED_ALL 1 "register_operand" "Upa") +- (match_operand:PRED_ALL 2 "register_operand" "Upa")))] ++;; Predicated integer subtraction of product. ++(define_insn "@aarch64_pred_fnma" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, w, ?&w") ++ (minus:SVE_FULL_I ++ (match_operand:SVE_FULL_I 4 "register_operand" "w, 0, w") ++ (unspec:SVE_FULL_I ++ [(match_operand: 1 "register_operand" "Upl, Upl, Upl") ++ (mult:SVE_FULL_I ++ (match_operand:SVE_FULL_I 2 "register_operand" "%0, w, w") ++ (match_operand:SVE_FULL_I 3 "register_operand" "w, w, w"))] ++ UNSPEC_PRED_X)))] + "TARGET_SVE" +- "and\t%0.b, %1/z, %1.b, %2.b" ++ "@ ++ msb\t%0., %1/m, %3., %4. ++ mls\t%0., %1/m, %2., %3. ++ movprfx\t%0, %4\;mls\t%0., %1/m, %2., %3." ++ [(set_attr "movprfx" "*,*,yes")] + ) + +-;; Unpredicated predicate ORR and XOR. +-(define_expand "3" +- [(set (match_operand:PRED_ALL 0 "register_operand") +- (and:PRED_ALL +- (LOGICAL_OR:PRED_ALL +- (match_operand:PRED_ALL 1 "register_operand") +- (match_operand:PRED_ALL 2 "register_operand")) +- (match_dup 3)))] ++;; Predicated integer subtraction of product with merging. ++(define_expand "cond_fnma" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand") ++ (unspec:SVE_FULL_I ++ [(match_operand: 1 "register_operand") ++ (minus:SVE_FULL_I ++ (match_operand:SVE_FULL_I 4 "register_operand") ++ (mult:SVE_FULL_I ++ (match_operand:SVE_FULL_I 2 "register_operand") ++ (match_operand:SVE_FULL_I 3 "general_operand"))) ++ (match_operand:SVE_FULL_I 5 "aarch64_simd_reg_or_zero")] ++ UNSPEC_SEL))] + "TARGET_SVE" + { +- operands[3] = force_reg (mode, CONSTM1_RTX (mode)); ++ if (aarch64_prepare_sve_cond_int_fma (operands, MINUS)) ++ DONE; ++ /* Swap the multiplication operands if the fallback value is the ++ second of the two. */ ++ if (rtx_equal_p (operands[3], operands[5])) ++ std::swap (operands[2], operands[3]); + } + ) + +-;; Predicated predicate ORR and XOR. +-(define_insn "pred_3" +- [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa") +- (and:PRED_ALL +- (LOGICAL:PRED_ALL +- (match_operand:PRED_ALL 2 "register_operand" "Upa") +- (match_operand:PRED_ALL 3 "register_operand" "Upa")) +- (match_operand:PRED_ALL 1 "register_operand" "Upa")))] ++;; Predicated integer subtraction of product, merging with the first input. ++(define_insn "*cond_fnma_2" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w") ++ (unspec:SVE_FULL_I ++ [(match_operand: 1 "register_operand" "Upl, Upl") ++ (minus:SVE_FULL_I ++ (match_operand:SVE_FULL_I 4 "register_operand" "w, w") ++ (mult:SVE_FULL_I ++ (match_operand:SVE_FULL_I 2 "register_operand" "0, w") ++ (match_operand:SVE_FULL_I 3 "register_operand" "w, w"))) ++ (match_dup 2)] ++ UNSPEC_SEL))] + "TARGET_SVE" +- "\t%0.b, %1/z, %2.b, %3.b" ++ "@ ++ msb\t%0., %1/m, %3., %4. ++ movprfx\t%0, %2\;msb\t%0., %1/m, %3., %4." 
++ [(set_attr "movprfx" "*,yes")] + ) + +-;; Perform a logical operation on operands 2 and 3, using operand 1 as +-;; the GP (which is known to be a PTRUE). Store the result in operand 0 +-;; and set the flags in the same way as for PTEST. The (and ...) in the +-;; UNSPEC_PTEST_PTRUE is logically redundant, but means that the tested +-;; value is structurally equivalent to rhs of the second set. +-(define_insn "*3_cc" +- [(set (reg:CC CC_REGNUM) +- (compare:CC +- (unspec:SI [(match_operand:PRED_ALL 1 "register_operand" "Upa") +- (and:PRED_ALL +- (LOGICAL:PRED_ALL +- (match_operand:PRED_ALL 2 "register_operand" "Upa") +- (match_operand:PRED_ALL 3 "register_operand" "Upa")) +- (match_dup 1))] +- UNSPEC_PTEST_PTRUE) +- (const_int 0))) +- (set (match_operand:PRED_ALL 0 "register_operand" "=Upa") +- (and:PRED_ALL (LOGICAL:PRED_ALL (match_dup 2) (match_dup 3)) +- (match_dup 1)))] ++;; Predicated integer subtraction of product, merging with the third input. ++(define_insn "*cond_fnma_4" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w") ++ (unspec:SVE_FULL_I ++ [(match_operand: 1 "register_operand" "Upl, Upl") ++ (minus:SVE_FULL_I ++ (match_operand:SVE_FULL_I 4 "register_operand" "0, w") ++ (mult:SVE_FULL_I ++ (match_operand:SVE_FULL_I 2 "register_operand" "w, w") ++ (match_operand:SVE_FULL_I 3 "register_operand" "w, w"))) ++ (match_dup 4)] ++ UNSPEC_SEL))] + "TARGET_SVE" +- "s\t%0.b, %1/z, %2.b, %3.b" ++ "@ ++ mls\t%0., %1/m, %2., %3. ++ movprfx\t%0, %4\;mls\t%0., %1/m, %2., %3." ++ [(set_attr "movprfx" "*,yes")] + ) + +-;; Unpredicated predicate inverse. +-(define_expand "one_cmpl2" +- [(set (match_operand:PRED_ALL 0 "register_operand") +- (and:PRED_ALL +- (not:PRED_ALL (match_operand:PRED_ALL 1 "register_operand")) +- (match_dup 2)))] +- "TARGET_SVE" ++;; Predicated integer subtraction of product, merging with an ++;; independent value. ++(define_insn_and_rewrite "*cond_fnma_any" ++ [(set (match_operand:SVE_FULL_I 0 "register_operand" "=&w, &w, &w, &w, &w, ?&w") ++ (unspec:SVE_FULL_I ++ [(match_operand: 1 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl") ++ (minus:SVE_FULL_I ++ (match_operand:SVE_FULL_I 4 "register_operand" "w, 0, w, w, w, w") ++ (mult:SVE_FULL_I ++ (match_operand:SVE_FULL_I 2 "register_operand" "w, w, 0, w, w, w") ++ (match_operand:SVE_FULL_I 3 "register_operand" "w, w, w, 0, w, w"))) ++ (match_operand:SVE_FULL_I 5 "aarch64_simd_reg_or_zero" "Dz, Dz, Dz, Dz, 0, w")] ++ UNSPEC_SEL))] ++ "TARGET_SVE ++ && !rtx_equal_p (operands[2], operands[5]) ++ && !rtx_equal_p (operands[3], operands[5]) ++ && !rtx_equal_p (operands[4], operands[5])" ++ "@ ++ movprfx\t%0., %1/z, %4.\;mls\t%0., %1/m, %2., %3. ++ movprfx\t%0., %1/z, %0.\;mls\t%0., %1/m, %2., %3. ++ movprfx\t%0., %1/z, %0.\;msb\t%0., %1/m, %3., %4. ++ movprfx\t%0., %1/z, %0.\;msb\t%0., %1/m, %2., %4. ++ movprfx\t%0., %1/m, %4.\;mls\t%0., %1/m, %2., %3. ++ #" ++ "&& reload_completed ++ && register_operand (operands[5], mode) ++ && !rtx_equal_p (operands[0], operands[5])" + { +- operands[2] = force_reg (mode, CONSTM1_RTX (mode)); ++ emit_insn (gen_vcond_mask_ (operands[0], operands[4], ++ operands[5], operands[1])); ++ operands[5] = operands[4] = operands[0]; + } ++ [(set_attr "movprfx" "yes")] + ) + +-;; Predicated predicate inverse. 
+-(define_insn "*one_cmpl3" +- [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa") +- (and:PRED_ALL +- (not:PRED_ALL (match_operand:PRED_ALL 2 "register_operand" "Upa")) +- (match_operand:PRED_ALL 1 "register_operand" "Upa")))] +- "TARGET_SVE" +- "not\t%0.b, %1/z, %2.b" +-) +- +-;; Predicated predicate BIC and ORN. +-(define_insn "*3" +- [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa") +- (and:PRED_ALL +- (NLOGICAL:PRED_ALL +- (not:PRED_ALL (match_operand:PRED_ALL 2 "register_operand" "Upa")) +- (match_operand:PRED_ALL 3 "register_operand" "Upa")) +- (match_operand:PRED_ALL 1 "register_operand" "Upa")))] +- "TARGET_SVE" +- "\t%0.b, %1/z, %3.b, %2.b" +-) ++;; ------------------------------------------------------------------------- ++;; ---- [INT] Dot product ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - SDOT ++;; - SUDOT (I8MM) ++;; - UDOT ++;; - USDOT (I8MM) ++;; ------------------------------------------------------------------------- + +-;; Predicated predicate NAND and NOR. +-(define_insn "*3" +- [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa") +- (and:PRED_ALL +- (NLOGICAL:PRED_ALL +- (not:PRED_ALL (match_operand:PRED_ALL 2 "register_operand" "Upa")) +- (not:PRED_ALL (match_operand:PRED_ALL 3 "register_operand" "Upa"))) +- (match_operand:PRED_ALL 1 "register_operand" "Upa")))] ++;; Four-element integer dot-product with accumulation. ++(define_insn "dot_prod" ++ [(set (match_operand:SVE_FULL_SDI 0 "register_operand" "=w, ?&w") ++ (plus:SVE_FULL_SDI ++ (unspec:SVE_FULL_SDI ++ [(match_operand: 1 "register_operand" "w, w") ++ (match_operand: 2 "register_operand" "w, w")] ++ DOTPROD) ++ (match_operand:SVE_FULL_SDI 3 "register_operand" "0, w")))] + "TARGET_SVE" +- "\t%0.b, %1/z, %2.b, %3.b" ++ "@ ++ dot\\t%0., %1., %2. ++ movprfx\t%0, %3\;dot\\t%0., %1., %2." ++ [(set_attr "movprfx" "*,yes")] + ) + +-;; Unpredicated LSL, LSR and ASR by a vector. +-(define_expand "v3" +- [(set (match_operand:SVE_I 0 "register_operand") +- (unspec:SVE_I +- [(match_dup 3) +- (ASHIFT:SVE_I +- (match_operand:SVE_I 1 "register_operand") +- (match_operand:SVE_I 2 "aarch64_sve_shift_operand"))] +- UNSPEC_MERGE_PTRUE))] ++;; Four-element integer dot-product by selected lanes with accumulation. ++(define_insn "@aarch64_dot_prod_lane" ++ [(set (match_operand:SVE_FULL_SDI 0 "register_operand" "=w, ?&w") ++ (plus:SVE_FULL_SDI ++ (unspec:SVE_FULL_SDI ++ [(match_operand: 1 "register_operand" "w, w") ++ (unspec: ++ [(match_operand: 2 "register_operand" ", ") ++ (match_operand:SI 3 "const_int_operand")] ++ UNSPEC_SVE_LANE_SELECT)] ++ DOTPROD) ++ (match_operand:SVE_FULL_SDI 4 "register_operand" "0, w")))] + "TARGET_SVE" +- { +- operands[3] = force_reg (mode, CONSTM1_RTX (mode)); +- } ++ "@ ++ dot\\t%0., %1., %2.[%3] ++ movprfx\t%0, %4\;dot\\t%0., %1., %2.[%3]" ++ [(set_attr "movprfx" "*,yes")] + ) + +-;; LSL, LSR and ASR by a vector, predicated with a PTRUE. We don't +-;; actually need the predicate for the first alternative, but using Upa +-;; or X isn't likely to gain much and would make the instruction seem +-;; less uniform to the register allocator. 
+-(define_insn_and_split "*v3" +- [(set (match_operand:SVE_I 0 "register_operand" "=w, w, ?&w") +- (unspec:SVE_I +- [(match_operand: 1 "register_operand" "Upl, Upl, Upl") +- (ASHIFT:SVE_I +- (match_operand:SVE_I 2 "register_operand" "w, 0, w") +- (match_operand:SVE_I 3 "aarch64_sve_shift_operand" "D, w, w"))] +- UNSPEC_MERGE_PTRUE))] +- "TARGET_SVE" ++(define_insn "@aarch64_dot_prod" ++ [(set (match_operand:VNx4SI_ONLY 0 "register_operand" "=w, ?&w") ++ (plus:VNx4SI_ONLY ++ (unspec:VNx4SI_ONLY ++ [(match_operand: 1 "register_operand" "w, w") ++ (match_operand: 2 "register_operand" "w, w")] ++ DOTPROD_US_ONLY) ++ (match_operand:VNx4SI_ONLY 3 "register_operand" "0, w")))] ++ "TARGET_SVE_I8MM" + "@ +- # +- \t%0., %1/m, %0., %3. +- movprfx\t%0, %2\;\t%0., %1/m, %0., %3." +- "&& reload_completed +- && !register_operand (operands[3], mode)" +- [(set (match_dup 0) (ASHIFT:SVE_I (match_dup 2) (match_dup 3)))] +- "" +- [(set_attr "movprfx" "*,*,yes")] ++ dot\\t%0.s, %1.b, %2.b ++ movprfx\t%0, %3\;dot\\t%0.s, %1.b, %2.b" ++ [(set_attr "movprfx" "*,yes")] + ) + +-;; Unpredicated shift operations by a constant (post-RA only). +-;; These are generated by splitting a predicated instruction whose +-;; predicate is unused. +-(define_insn "*post_ra_v3" +- [(set (match_operand:SVE_I 0 "register_operand" "=w") +- (ASHIFT:SVE_I +- (match_operand:SVE_I 1 "register_operand" "w") +- (match_operand:SVE_I 2 "aarch64_simd_shift_imm")))] +- "TARGET_SVE && reload_completed" +- "\t%0., %1., #%2" ++(define_insn "@aarch64_dot_prod_lane" ++ [(set (match_operand:VNx4SI_ONLY 0 "register_operand" "=w, ?&w") ++ (plus:VNx4SI_ONLY ++ (unspec:VNx4SI_ONLY ++ [(match_operand: 1 "register_operand" "w, w") ++ (unspec: ++ [(match_operand: 2 "register_operand" "y, y") ++ (match_operand:SI 3 "const_int_operand")] ++ UNSPEC_SVE_LANE_SELECT)] ++ DOTPROD_I8MM) ++ (match_operand:VNx4SI_ONLY 4 "register_operand" "0, w")))] ++ "TARGET_SVE_I8MM" ++ "@ ++ dot\\t%0.s, %1.b, %2.b[%3] ++ movprfx\t%0, %4\;dot\\t%0.s, %1.b, %2.b[%3]" ++ [(set_attr "movprfx" "*,yes")] + ) + +-;; LSL, LSR and ASR by a scalar, which expands into one of the vector +-;; shifts above. +-(define_expand "3" +- [(set (match_operand:SVE_I 0 "register_operand") +- (ASHIFT:SVE_I (match_operand:SVE_I 1 "register_operand") +- (match_operand: 2 "general_operand")))] ++;; ------------------------------------------------------------------------- ++;; ---- [INT] Sum of absolute differences ++;; ------------------------------------------------------------------------- ++;; The patterns in this section are synthetic. ++;; ------------------------------------------------------------------------- ++ ++;; Emit a sequence to produce a sum-of-absolute-differences of the inputs in ++;; operands 1 and 2. The sequence also has to perform a widening reduction of ++;; the difference into a vector and accumulate that into operand 3 before ++;; copying that into the result operand 0. 
++;; Perform that with a sequence of: ++;; MOV ones.b, #1 ++;; [SU]ABD diff.b, p0/m, op1.b, op2.b ++;; MOVPRFX op0, op3 // If necessary ++;; UDOT op0.s, diff.b, ones.b ++(define_expand "sad" ++ [(use (match_operand:SVE_FULL_SDI 0 "register_operand")) ++ (unspec: [(use (match_operand: 1 "register_operand")) ++ (use (match_operand: 2 "register_operand"))] ABAL) ++ (use (match_operand:SVE_FULL_SDI 3 "register_operand"))] + "TARGET_SVE" + { +- rtx amount; +- if (CONST_INT_P (operands[2])) +- { +- amount = gen_const_vec_duplicate (mode, operands[2]); +- if (!aarch64_sve_shift_operand (operands[2], mode)) +- amount = force_reg (mode, amount); +- } +- else +- { +- amount = gen_reg_rtx (mode); +- emit_insn (gen_vec_duplicate (amount, +- convert_to_mode (mode, +- operands[2], 0))); +- } +- emit_insn (gen_v3 (operands[0], operands[1], amount)); ++ rtx ones = force_reg (mode, CONST1_RTX (mode)); ++ rtx diff = gen_reg_rtx (mode); ++ emit_insn (gen_abd_3 (diff, operands[1], operands[2])); ++ emit_insn (gen_udot_prod (operands[0], diff, ones, operands[3])); + DONE; + } + ) + +-;; Test all bits of operand 1. Operand 0 is a GP that is known to hold PTRUE. +-;; +-;; Using UNSPEC_PTEST_PTRUE allows combine patterns to assume that the GP +-;; is a PTRUE even if the optimizers haven't yet been able to propagate +-;; the constant. We would use a separate unspec code for PTESTs involving +-;; GPs that might not be PTRUEs. +-(define_insn "ptest_ptrue" +- [(set (reg:CC CC_REGNUM) +- (compare:CC +- (unspec:SI [(match_operand:PRED_ALL 0 "register_operand" "Upa") +- (match_operand:PRED_ALL 1 "register_operand" "Upa")] +- UNSPEC_PTEST_PTRUE) +- (const_int 0)))] +- "TARGET_SVE" +- "ptest\t%0, %1.b" ++;; ------------------------------------------------------------------------- ++;; ---- [INT] Matrix multiply-accumulate ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - SMMLA (I8MM) ++;; - UMMLA (I8MM) ++;; - USMMLA (I8MM) ++;; ------------------------------------------------------------------------- ++ ++(define_insn "@aarch64_sve_add_" ++ [(set (match_operand:VNx4SI_ONLY 0 "register_operand" "=w, ?&w") ++ (plus:VNx4SI_ONLY ++ (unspec:VNx4SI_ONLY ++ [(match_operand: 2 "register_operand" "w, w") ++ (match_operand: 3 "register_operand" "w, w")] ++ MATMUL) ++ (match_operand:VNx4SI_ONLY 1 "register_operand" "0, w")))] ++ "TARGET_SVE_I8MM" ++ "@ ++ mmla\\t%0.s, %2.b, %3.b ++ movprfx\t%0, %1\;mmla\\t%0.s, %2.b, %3.b" ++ [(set_attr "movprfx" "*,yes")] + ) + +-;; Set element I of the result if operand1 + J < operand2 for all J in [0, I]. +-;; with the comparison being unsigned. +-(define_insn "while_ult" +- [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa") +- (unspec:PRED_ALL [(match_operand:GPI 1 "aarch64_reg_or_zero" "rZ") +- (match_operand:GPI 2 "aarch64_reg_or_zero" "rZ")] +- UNSPEC_WHILE_LO)) +- (clobber (reg:CC CC_REGNUM))] +- "TARGET_SVE" +- "whilelo\t%0., %1, %2" +-) +- +-;; WHILELO sets the flags in the same way as a PTEST with a PTRUE GP. +-;; Handle the case in which both results are useful. The GP operand +-;; to the PTEST isn't needed, so we allow it to be anything. 
+-(define_insn_and_split "while_ult_cc" +- [(set (reg:CC CC_REGNUM) +- (compare:CC +- (unspec:SI [(match_operand:PRED_ALL 1) +- (unspec:PRED_ALL +- [(match_operand:GPI 2 "aarch64_reg_or_zero" "rZ") +- (match_operand:GPI 3 "aarch64_reg_or_zero" "rZ")] +- UNSPEC_WHILE_LO)] +- UNSPEC_PTEST_PTRUE) +- (const_int 0))) +- (set (match_operand:PRED_ALL 0 "register_operand" "=Upa") +- (unspec:PRED_ALL [(match_dup 2) +- (match_dup 3)] +- UNSPEC_WHILE_LO))] ++;; ------------------------------------------------------------------------- ++;; ---- [FP] General ternary arithmetic corresponding to unspecs ++;; ------------------------------------------------------------------------- ++;; Includes merging patterns for: ++;; - FMAD ++;; - FMLA ++;; - FMLS ++;; - FMSB ++;; - FNMAD ++;; - FNMLA ++;; - FNMLS ++;; - FNMSB ++;; ------------------------------------------------------------------------- ++ ++;; Unpredicated floating-point ternary operations. ++(define_expand "4" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand") ++ (unspec:SVE_FULL_F ++ [(match_dup 4) ++ (const_int SVE_RELAXED_GP) ++ (match_operand:SVE_FULL_F 1 "register_operand") ++ (match_operand:SVE_FULL_F 2 "register_operand") ++ (match_operand:SVE_FULL_F 3 "register_operand")] ++ SVE_COND_FP_TERNARY))] + "TARGET_SVE" +- "whilelo\t%0., %2, %3" +- ;; Force the compiler to drop the unused predicate operand, so that we +- ;; don't have an unnecessary PTRUE. +- "&& !CONSTANT_P (operands[1])" +- [(const_int 0)] + { +- emit_insn (gen_while_ult_cc +- (operands[0], CONSTM1_RTX (mode), +- operands[2], operands[3])); +- DONE; ++ operands[4] = aarch64_ptrue_reg (mode); + } + ) + +-;; Integer comparisons predicated with a PTRUE. +-(define_insn "*cmp" +- [(set (match_operand: 0 "register_operand" "=Upa, Upa") +- (unspec: +- [(match_operand: 1 "register_operand" "Upl, Upl") +- (SVE_INT_CMP: +- (match_operand:SVE_I 2 "register_operand" "w, w") +- (match_operand:SVE_I 3 "aarch64_sve_cmp__operand" ", w"))] +- UNSPEC_MERGE_PTRUE)) +- (clobber (reg:CC CC_REGNUM))] ++;; Predicated floating-point ternary operations. ++(define_insn "@aarch64_pred_" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, w, ?&w") ++ (unspec:SVE_FULL_F ++ [(match_operand: 1 "register_operand" "Upl, Upl, Upl") ++ (match_operand:SI 5 "aarch64_sve_gp_strictness") ++ (match_operand:SVE_FULL_F 2 "register_operand" "%w, 0, w") ++ (match_operand:SVE_FULL_F 3 "register_operand" "w, w, w") ++ (match_operand:SVE_FULL_F 4 "register_operand" "0, w, w")] ++ SVE_COND_FP_TERNARY))] + "TARGET_SVE" + "@ +- cmp\t%0., %1/z, %2., #%3 +- cmp\t%0., %1/z, %2., %3." ++ \t%0., %1/m, %2., %3. ++ \t%0., %1/m, %3., %4. ++ movprfx\t%0, %4\;\t%0., %1/m, %2., %3." ++ [(set_attr "movprfx" "*,*,yes")] + ) + +-;; Integer comparisons predicated with a PTRUE in which only the flags result +-;; is interesting. +-(define_insn "*cmp_ptest" +- [(set (reg:CC CC_REGNUM) +- (compare:CC +- (unspec:SI +- [(match_operand: 1 "register_operand" "Upl, Upl") +- (unspec: +- [(match_dup 1) +- (SVE_INT_CMP: +- (match_operand:SVE_I 2 "register_operand" "w, w") +- (match_operand:SVE_I 3 "aarch64_sve_cmp__operand" ", w"))] +- UNSPEC_MERGE_PTRUE)] +- UNSPEC_PTEST_PTRUE) +- (const_int 0))) +- (clobber (match_scratch: 0 "=Upa, Upa"))] ++;; Predicated floating-point ternary operations with merging. 
++(define_expand "@cond_" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand") ++ (unspec:SVE_FULL_F ++ [(match_operand: 1 "register_operand") ++ (unspec:SVE_FULL_F ++ [(match_dup 1) ++ (const_int SVE_STRICT_GP) ++ (match_operand:SVE_FULL_F 2 "register_operand") ++ (match_operand:SVE_FULL_F 3 "register_operand") ++ (match_operand:SVE_FULL_F 4 "register_operand")] ++ SVE_COND_FP_TERNARY) ++ (match_operand:SVE_FULL_F 5 "aarch64_simd_reg_or_zero")] ++ UNSPEC_SEL))] + "TARGET_SVE" ++{ ++ /* Swap the multiplication operands if the fallback value is the ++ second of the two. */ ++ if (rtx_equal_p (operands[3], operands[5])) ++ std::swap (operands[2], operands[3]); ++}) ++ ++;; Predicated floating-point ternary operations, merging with the ++;; first input. ++(define_insn_and_rewrite "*cond__2" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") ++ (unspec:SVE_FULL_F ++ [(match_operand: 1 "register_operand" "Upl, Upl") ++ (unspec:SVE_FULL_F ++ [(match_operand 5) ++ (match_operand:SI 6 "aarch64_sve_gp_strictness") ++ (match_operand:SVE_FULL_F 2 "register_operand" "0, w") ++ (match_operand:SVE_FULL_F 3 "register_operand" "w, w") ++ (match_operand:SVE_FULL_F 4 "register_operand" "w, w")] ++ SVE_COND_FP_TERNARY) ++ (match_dup 2)] ++ UNSPEC_SEL))] ++ "TARGET_SVE && aarch64_sve_pred_dominates_p (&operands[5], operands[1])" + "@ +- cmp\t%0., %1/z, %2., #%3 +- cmp\t%0., %1/z, %2., %3." ++ \t%0., %1/m, %3., %4. ++ movprfx\t%0, %2\;\t%0., %1/m, %3., %4." ++ "&& !rtx_equal_p (operands[1], operands[5])" ++ { ++ operands[5] = copy_rtx (operands[1]); ++ } ++ [(set_attr "movprfx" "*,yes")] + ) + +-;; Integer comparisons predicated with a PTRUE in which both the flag and +-;; predicate results are interesting. +-(define_insn "*cmp_cc" +- [(set (reg:CC CC_REGNUM) +- (compare:CC +- (unspec:SI +- [(match_operand: 1 "register_operand" "Upl, Upl") +- (unspec: +- [(match_dup 1) +- (SVE_INT_CMP: +- (match_operand:SVE_I 2 "register_operand" "w, w") +- (match_operand:SVE_I 3 "aarch64_sve_cmp__operand" ", w"))] +- UNSPEC_MERGE_PTRUE)] +- UNSPEC_PTEST_PTRUE) +- (const_int 0))) +- (set (match_operand: 0 "register_operand" "=Upa, Upa") +- (unspec: +- [(match_dup 1) +- (SVE_INT_CMP: +- (match_dup 2) +- (match_dup 3))] +- UNSPEC_MERGE_PTRUE))] +- "TARGET_SVE" ++;; Predicated floating-point ternary operations, merging with the ++;; third input. ++(define_insn_and_rewrite "*cond__4" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") ++ (unspec:SVE_FULL_F ++ [(match_operand: 1 "register_operand" "Upl, Upl") ++ (unspec:SVE_FULL_F ++ [(match_operand 5) ++ (match_operand:SI 6 "aarch64_sve_gp_strictness") ++ (match_operand:SVE_FULL_F 2 "register_operand" "w, w") ++ (match_operand:SVE_FULL_F 3 "register_operand" "w, w") ++ (match_operand:SVE_FULL_F 4 "register_operand" "0, w")] ++ SVE_COND_FP_TERNARY) ++ (match_dup 4)] ++ UNSPEC_SEL))] ++ "TARGET_SVE && aarch64_sve_pred_dominates_p (&operands[5], operands[1])" + "@ +- cmp\t%0., %1/z, %2., #%3 +- cmp\t%0., %1/z, %2., %3." ++ \t%0., %1/m, %2., %3. ++ movprfx\t%0, %4\;\t%0., %1/m, %2., %3." ++ "&& !rtx_equal_p (operands[1], operands[5])" ++ { ++ operands[5] = copy_rtx (operands[1]); ++ } ++ [(set_attr "movprfx" "*,yes")] + ) + +-;; Predicated integer comparisons, formed by combining a PTRUE-predicated +-;; comparison with an AND. Split the instruction into its preferred form +-;; (below) at the earliest opportunity, in order to get rid of the +-;; redundant operand 1. 
+-(define_insn_and_split "*pred_cmp_combine" +- [(set (match_operand: 0 "register_operand" "=Upa, Upa") +- (and: +- (unspec: +- [(match_operand: 1) +- (SVE_INT_CMP: +- (match_operand:SVE_I 2 "register_operand" "w, w") +- (match_operand:SVE_I 3 "aarch64_sve_cmp__operand" ", w"))] +- UNSPEC_MERGE_PTRUE) +- (match_operand: 4 "register_operand" "Upl, Upl"))) +- (clobber (reg:CC CC_REGNUM))] +- "TARGET_SVE" +- "#" ++;; Predicated floating-point ternary operations, merging with an ++;; independent value. ++(define_insn_and_rewrite "*cond__any" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand" "=&w, &w, &w, &w, &w, ?&w") ++ (unspec:SVE_FULL_F ++ [(match_operand: 1 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl") ++ (unspec:SVE_FULL_F ++ [(match_operand 6) ++ (match_operand:SI 7 "aarch64_sve_gp_strictness") ++ (match_operand:SVE_FULL_F 2 "register_operand" "w, w, 0, w, w, w") ++ (match_operand:SVE_FULL_F 3 "register_operand" "w, w, w, 0, w, w") ++ (match_operand:SVE_FULL_F 4 "register_operand" "w, 0, w, w, w, w")] ++ SVE_COND_FP_TERNARY) ++ (match_operand:SVE_FULL_F 5 "aarch64_simd_reg_or_zero" "Dz, Dz, Dz, Dz, 0, w")] ++ UNSPEC_SEL))] ++ "TARGET_SVE ++ && !rtx_equal_p (operands[2], operands[5]) ++ && !rtx_equal_p (operands[3], operands[5]) ++ && !rtx_equal_p (operands[4], operands[5]) ++ && aarch64_sve_pred_dominates_p (&operands[6], operands[1])" ++ "@ ++ movprfx\t%0., %1/z, %4.\;\t%0., %1/m, %2., %3. ++ movprfx\t%0., %1/z, %0.\;\t%0., %1/m, %2., %3. ++ movprfx\t%0., %1/z, %0.\;\t%0., %1/m, %3., %4. ++ movprfx\t%0., %1/z, %0.\;\t%0., %1/m, %2., %4. ++ movprfx\t%0., %1/m, %4.\;\t%0., %1/m, %2., %3. ++ #" + "&& 1" +- [(parallel +- [(set (match_dup 0) +- (and: +- (SVE_INT_CMP: +- (match_dup 2) +- (match_dup 3)) +- (match_dup 4))) +- (clobber (reg:CC CC_REGNUM))])] ++ { ++ if (reload_completed ++ && register_operand (operands[5], mode) ++ && !rtx_equal_p (operands[0], operands[5])) ++ { ++ emit_insn (gen_vcond_mask_ (operands[0], operands[4], ++ operands[5], operands[1])); ++ operands[5] = operands[4] = operands[0]; ++ } ++ else if (!rtx_equal_p (operands[1], operands[6])) ++ operands[6] = copy_rtx (operands[1]); ++ else ++ FAIL; ++ } ++ [(set_attr "movprfx" "yes")] + ) + +-;; Predicated integer comparisons. +-(define_insn "*pred_cmp" +- [(set (match_operand: 0 "register_operand" "=Upa, Upa") +- (and: +- (SVE_INT_CMP: +- (match_operand:SVE_I 2 "register_operand" "w, w") +- (match_operand:SVE_I 3 "aarch64_sve_cmp__operand" ", w")) +- (match_operand: 1 "register_operand" "Upl, Upl"))) +- (clobber (reg:CC CC_REGNUM))] ++;; Unpredicated FMLA and FMLS by selected lanes. It doesn't seem worth using ++;; (fma ...) since target-independent code won't understand the indexing. ++(define_insn "@aarch64__lane_" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") ++ (unspec:SVE_FULL_F ++ [(match_operand:SVE_FULL_F 1 "register_operand" "w, w") ++ (unspec:SVE_FULL_F ++ [(match_operand:SVE_FULL_F 2 "register_operand" ", ") ++ (match_operand:SI 3 "const_int_operand")] ++ UNSPEC_SVE_LANE_SELECT) ++ (match_operand:SVE_FULL_F 4 "register_operand" "0, w")] ++ SVE_FP_TERNARY_LANE))] + "TARGET_SVE" + "@ +- cmp\t%0., %1/z, %2., #%3 +- cmp\t%0., %1/z, %2., %3." ++ \t%0., %1., %2.[%3] ++ movprfx\t%0, %4\;\t%0., %1., %2.[%3]" ++ [(set_attr "movprfx" "*,yes")] + ) + +-;; Floating-point comparisons predicated with a PTRUE. 
+-(define_insn "*fcm" +- [(set (match_operand: 0 "register_operand" "=Upa, Upa") +- (unspec: ++;; ------------------------------------------------------------------------- ++;; ---- [FP] Complex multiply-add ++;; ------------------------------------------------------------------------- ++;; Includes merging patterns for: ++;; - FCMLA ++;; ------------------------------------------------------------------------- ++ ++;; Predicated FCMLA. ++(define_insn "@aarch64_pred_" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") ++ (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand" "Upl, Upl") +- (SVE_FP_CMP: +- (match_operand:SVE_F 2 "register_operand" "w, w") +- (match_operand:SVE_F 3 "aarch64_simd_reg_or_zero" "Dz, w"))] +- UNSPEC_MERGE_PTRUE))] ++ (match_operand:SI 5 "aarch64_sve_gp_strictness") ++ (match_operand:SVE_FULL_F 2 "register_operand" "w, w") ++ (match_operand:SVE_FULL_F 3 "register_operand" "w, w") ++ (match_operand:SVE_FULL_F 4 "register_operand" "0, w")] ++ SVE_COND_FCMLA))] + "TARGET_SVE" + "@ +- fcm\t%0., %1/z, %2., #0.0 +- fcm\t%0., %1/z, %2., %3." ++ fcmla\t%0., %1/m, %2., %3., # ++ movprfx\t%0, %4\;fcmla\t%0., %1/m, %2., %3., #" ++ [(set_attr "movprfx" "*,yes")] + ) + +-(define_insn "*fcmuo" +- [(set (match_operand: 0 "register_operand" "=Upa") +- (unspec: +- [(match_operand: 1 "register_operand" "Upl") +- (unordered: +- (match_operand:SVE_F 2 "register_operand" "w") +- (match_operand:SVE_F 3 "register_operand" "w"))] +- UNSPEC_MERGE_PTRUE))] ++;; Predicated FCMLA with merging. ++(define_expand "@cond_" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand") ++ (unspec:SVE_FULL_F ++ [(match_operand: 1 "register_operand") ++ (unspec:SVE_FULL_F ++ [(match_dup 1) ++ (const_int SVE_STRICT_GP) ++ (match_operand:SVE_FULL_F 2 "register_operand") ++ (match_operand:SVE_FULL_F 3 "register_operand") ++ (match_operand:SVE_FULL_F 4 "register_operand")] ++ SVE_COND_FCMLA) ++ (match_operand:SVE_FULL_F 5 "aarch64_simd_reg_or_zero")] ++ UNSPEC_SEL))] + "TARGET_SVE" +- "fcmuo\t%0., %1/z, %2., %3." + ) + +-;; Floating-point comparisons predicated on a PTRUE, with the results ANDed +-;; with another predicate P. This does not have the same trapping behavior +-;; as predicating the comparison itself on P, but it's a legitimate fold, +-;; since we can drop any potentially-trapping operations whose results +-;; are not needed. +-;; +-;; Split the instruction into its preferred form (below) at the earliest +-;; opportunity, in order to get rid of the redundant operand 1. +-(define_insn_and_split "*fcm_and_combine" +- [(set (match_operand: 0 "register_operand" "=Upa, Upa") +- (and: +- (unspec: +- [(match_operand: 1) +- (SVE_FP_CMP +- (match_operand:SVE_F 2 "register_operand" "w, w") +- (match_operand:SVE_F 3 "aarch64_simd_reg_or_zero" "Dz, w"))] +- UNSPEC_MERGE_PTRUE) +- (match_operand: 4 "register_operand" "Upl, Upl")))] +- "TARGET_SVE" +- "#" +- "&& 1" +- [(set (match_dup 0) +- (and: +- (SVE_FP_CMP: +- (match_dup 2) +- (match_dup 3)) +- (match_dup 4)))] ++;; Predicated FCMLA, merging with the third input. 
++(define_insn_and_rewrite "*cond__4" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") ++ (unspec:SVE_FULL_F ++ [(match_operand: 1 "register_operand" "Upl, Upl") ++ (unspec:SVE_FULL_F ++ [(match_operand 5) ++ (match_operand:SI 6 "aarch64_sve_gp_strictness") ++ (match_operand:SVE_FULL_F 2 "register_operand" "w, w") ++ (match_operand:SVE_FULL_F 3 "register_operand" "w, w") ++ (match_operand:SVE_FULL_F 4 "register_operand" "0, w")] ++ SVE_COND_FCMLA) ++ (match_dup 4)] ++ UNSPEC_SEL))] ++ "TARGET_SVE && aarch64_sve_pred_dominates_p (&operands[5], operands[1])" ++ "@ ++ fcmla\t%0., %1/m, %2., %3., # ++ movprfx\t%0, %4\;fcmla\t%0., %1/m, %2., %3., #" ++ "&& !rtx_equal_p (operands[1], operands[5])" ++ { ++ operands[5] = copy_rtx (operands[1]); ++ } ++ [(set_attr "movprfx" "*,yes")] + ) + +-(define_insn_and_split "*fcmuo_and_combine" +- [(set (match_operand: 0 "register_operand" "=Upa") +- (and: +- (unspec: +- [(match_operand: 1) +- (unordered +- (match_operand:SVE_F 2 "register_operand" "w") +- (match_operand:SVE_F 3 "register_operand" "w"))] +- UNSPEC_MERGE_PTRUE) +- (match_operand: 4 "register_operand" "Upl")))] +- "TARGET_SVE" +- "#" ++;; Predicated FCMLA, merging with an independent value. ++(define_insn_and_rewrite "*cond__any" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand" "=&w, &w, &w, ?&w") ++ (unspec:SVE_FULL_F ++ [(match_operand: 1 "register_operand" "Upl, Upl, Upl, Upl") ++ (unspec:SVE_FULL_F ++ [(match_operand 6) ++ (match_operand:SI 7 "aarch64_sve_gp_strictness") ++ (match_operand:SVE_FULL_F 2 "register_operand" "w, w, w, w") ++ (match_operand:SVE_FULL_F 3 "register_operand" "w, w, w, w") ++ (match_operand:SVE_FULL_F 4 "register_operand" "w, 0, w, w")] ++ SVE_COND_FCMLA) ++ (match_operand:SVE_FULL_F 5 "aarch64_simd_reg_or_zero" "Dz, Dz, 0, w")] ++ UNSPEC_SEL))] ++ "TARGET_SVE ++ && !rtx_equal_p (operands[4], operands[5]) ++ && aarch64_sve_pred_dominates_p (&operands[6], operands[1])" ++ "@ ++ movprfx\t%0., %1/z, %4.\;fcmla\t%0., %1/m, %2., %3., # ++ movprfx\t%0., %1/z, %0.\;fcmla\t%0., %1/m, %2., %3., # ++ movprfx\t%0., %1/m, %4.\;fcmla\t%0., %1/m, %2., %3., # ++ #" + "&& 1" +- [(set (match_dup 0) +- (and: +- (unordered: +- (match_dup 2) +- (match_dup 3)) +- (match_dup 4)))] ++ { ++ if (reload_completed ++ && register_operand (operands[5], mode) ++ && !rtx_equal_p (operands[0], operands[5])) ++ { ++ emit_insn (gen_vcond_mask_ (operands[0], operands[4], ++ operands[5], operands[1])); ++ operands[5] = operands[4] = operands[0]; ++ } ++ else if (!rtx_equal_p (operands[1], operands[6])) ++ operands[6] = copy_rtx (operands[1]); ++ else ++ FAIL; ++ } ++ [(set_attr "movprfx" "yes")] + ) + +-;; Unpredicated floating-point comparisons, with the results ANDed +-;; with another predicate. This is a valid fold for the same reasons +-;; as above. +-(define_insn "*fcm_and" +- [(set (match_operand: 0 "register_operand" "=Upa, Upa") +- (and: +- (SVE_FP_CMP: +- (match_operand:SVE_F 2 "register_operand" "w, w") +- (match_operand:SVE_F 3 "aarch64_simd_reg_or_zero" "Dz, w")) +- (match_operand: 1 "register_operand" "Upl, Upl")))] ++;; Unpredicated FCMLA with indexing. 
++(define_insn "@aarch64__lane_" ++ [(set (match_operand:SVE_FULL_HSF 0 "register_operand" "=w, ?&w") ++ (unspec:SVE_FULL_HSF ++ [(match_operand:SVE_FULL_HSF 1 "register_operand" "w, w") ++ (unspec:SVE_FULL_HSF ++ [(match_operand:SVE_FULL_HSF 2 "register_operand" ", ") ++ (match_operand:SI 3 "const_int_operand")] ++ UNSPEC_SVE_LANE_SELECT) ++ (match_operand:SVE_FULL_HSF 4 "register_operand" "0, w")] ++ FCMLA))] + "TARGET_SVE" + "@ +- fcm\t%0., %1/z, %2., #0.0 +- fcm\t%0., %1/z, %2., %3." ++ fcmla\t%0., %1., %2.[%3], # ++ movprfx\t%0, %4\;fcmla\t%0., %1., %2.[%3], #" ++ [(set_attr "movprfx" "*,yes")] + ) + +-(define_insn "*fcmuo_and" +- [(set (match_operand: 0 "register_operand" "=Upa") +- (and: +- (unordered: +- (match_operand:SVE_F 2 "register_operand" "w") +- (match_operand:SVE_F 3 "register_operand" "w")) +- (match_operand: 1 "register_operand" "Upl")))] ++;; ------------------------------------------------------------------------- ++;; ---- [FP] Trigonometric multiply-add ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - FTMAD ++;; ------------------------------------------------------------------------- ++ ++(define_insn "@aarch64_sve_tmad" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") ++ (unspec:SVE_FULL_F ++ [(match_operand:SVE_FULL_F 1 "register_operand" "0, w") ++ (match_operand:SVE_FULL_F 2 "register_operand" "w, w") ++ (match_operand:DI 3 "const_int_operand")] ++ UNSPEC_FTMAD))] + "TARGET_SVE" +- "fcmuo\t%0., %1/z, %2., %3." ++ "@ ++ ftmad\t%0., %0., %2., #%3 ++ movprfx\t%0, %1\;ftmad\t%0., %0., %2., #%3" ++ [(set_attr "movprfx" "*,yes")] + ) + +-;; Predicated floating-point comparisons. We don't need a version +-;; of this for unordered comparisons. +-(define_insn "*pred_fcm" +- [(set (match_operand: 0 "register_operand" "=Upa, Upa") +- (unspec: +- [(match_operand: 1 "register_operand" "Upl, Upl") +- (match_operand:SVE_F 2 "register_operand" "w, w") +- (match_operand:SVE_F 3 "aarch64_simd_reg_or_zero" "Dz, w")] +- SVE_COND_FP_CMP))] ++;; ------------------------------------------------------------------------- ++;; ---- [FP] Bfloat16 long ternary arithmetic (SF,BF,BF) ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - BFDOT (BF16) ++;; - BFMLALB (BF16) ++;; - BFMLALT (BF16) ++;; - BFMMLA (BF16) ++;; ------------------------------------------------------------------------- ++ ++(define_insn "@aarch64_sve_vnx4sf" ++ [(set (match_operand:VNx4SF 0 "register_operand" "=w, ?&w") ++ (unspec:VNx4SF ++ [(match_operand:VNx4SF 1 "register_operand" "0, w") ++ (match_operand:VNx8BF 2 "register_operand" "w, w") ++ (match_operand:VNx8BF 3 "register_operand" "w, w")] ++ SVE_BFLOAT_TERNARY_LONG))] ++ "TARGET_SVE_BF16" ++ "@ ++ \t%0.s, %2.h, %3.h ++ movprfx\t%0, %1\;\t%0.s, %2.h, %3.h" ++ [(set_attr "movprfx" "*,yes")] ++) ++ ++;; The immediate range is enforced before generating the instruction. 
++(define_insn "@aarch64_sve__lanevnx4sf" ++ [(set (match_operand:VNx4SF 0 "register_operand" "=w, ?&w") ++ (unspec:VNx4SF ++ [(match_operand:VNx4SF 1 "register_operand" "0, w") ++ (match_operand:VNx8BF 2 "register_operand" "w, w") ++ (match_operand:VNx8BF 3 "register_operand" "y, y") ++ (match_operand:SI 4 "const_int_operand")] ++ SVE_BFLOAT_TERNARY_LONG_LANE))] ++ "TARGET_SVE_BF16" ++ "@ ++ \t%0.s, %2.h, %3.h[%4] ++ movprfx\t%0, %1\;\t%0.s, %2.h, %3.h[%4]" ++ [(set_attr "movprfx" "*,yes")] ++) ++ ++;; ------------------------------------------------------------------------- ++;; ---- [FP] Matrix multiply-accumulate ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - FMMLA (F32MM,F64MM) ++;; ------------------------------------------------------------------------- ++ ++;; The mode iterator enforces the target requirements. ++(define_insn "@aarch64_sve_" ++ [(set (match_operand:SVE_MATMULF 0 "register_operand" "=w, ?&w") ++ (unspec:SVE_MATMULF ++ [(match_operand:SVE_MATMULF 2 "register_operand" "w, w") ++ (match_operand:SVE_MATMULF 3 "register_operand" "w, w") ++ (match_operand:SVE_MATMULF 1 "register_operand" "0, w")] ++ FMMLA))] + "TARGET_SVE" + "@ +- fcm\t%0., %1/z, %2., #0.0 +- fcm\t%0., %1/z, %2., %3." ++ \\t%0., %2., %3. ++ movprfx\t%0, %1\;\\t%0., %2., %3." ++ [(set_attr "movprfx" "*,yes")] + ) + ++;; ========================================================================= ++;; == Comparisons and selects ++;; ========================================================================= ++ ++;; ------------------------------------------------------------------------- ++;; ---- [INT,FP] Select based on predicates ++;; ------------------------------------------------------------------------- ++;; Includes merging patterns for: ++;; - FMOV ++;; - MOV ++;; - SEL ++;; ------------------------------------------------------------------------- ++ + ;; vcond_mask operand order: true, false, mask + ;; UNSPEC_SEL operand order: mask, true, false (as for VEC_COND_EXPR) + ;; SEL operand order: mask, true, false +-(define_insn "vcond_mask_" +- [(set (match_operand:SVE_ALL 0 "register_operand" "=w") +- (unspec:SVE_ALL +- [(match_operand: 3 "register_operand" "Upa") +- (match_operand:SVE_ALL 1 "register_operand" "w") +- (match_operand:SVE_ALL 2 "register_operand" "w")] ++(define_expand "@vcond_mask_" ++ [(set (match_operand:SVE_FULL 0 "register_operand") ++ (unspec:SVE_FULL ++ [(match_operand: 3 "register_operand") ++ (match_operand:SVE_FULL 1 "aarch64_sve_reg_or_dup_imm") ++ (match_operand:SVE_FULL 2 "aarch64_simd_reg_or_zero")] + UNSPEC_SEL))] + "TARGET_SVE" +- "sel\t%0., %3, %1., %2." ++ { ++ if (register_operand (operands[1], mode)) ++ operands[2] = force_reg (mode, operands[2]); ++ } + ) + +-;; Selects between a duplicated immediate and zero. 
+-(define_insn "aarch64_sve_dup_const" +- [(set (match_operand:SVE_I 0 "register_operand" "=w") +- (unspec:SVE_I +- [(match_operand: 1 "register_operand" "Upl") +- (match_operand:SVE_I 2 "aarch64_sve_dup_immediate") +- (match_operand:SVE_I 3 "aarch64_simd_imm_zero")] ++;; Selects between: ++;; - two registers ++;; - a duplicated immediate and a register ++;; - a duplicated immediate and zero ++(define_insn "*vcond_mask_" ++ [(set (match_operand:SVE_FULL 0 "register_operand" "=w, w, w, w, ?w, ?&w, ?&w") ++ (unspec:SVE_FULL ++ [(match_operand: 3 "register_operand" "Upa, Upa, Upa, Upa, Upl, Upl, Upl") ++ (match_operand:SVE_FULL 1 "aarch64_sve_reg_or_dup_imm" "w, vss, vss, Ufc, Ufc, vss, Ufc") ++ (match_operand:SVE_FULL 2 "aarch64_simd_reg_or_zero" "w, 0, Dz, 0, Dz, w, w")] ++ UNSPEC_SEL))] ++ "TARGET_SVE ++ && (!register_operand (operands[1], mode) ++ || register_operand (operands[2], mode))" ++ "@ ++ sel\t%0., %3, %1., %2. ++ mov\t%0., %3/m, #%I1 ++ mov\t%0., %3/z, #%I1 ++ fmov\t%0., %3/m, #%1 ++ movprfx\t%0., %3/z, %0.\;fmov\t%0., %3/m, #%1 ++ movprfx\t%0, %2\;mov\t%0., %3/m, #%I1 ++ movprfx\t%0, %2\;fmov\t%0., %3/m, #%1" ++ [(set_attr "movprfx" "*,*,*,*,yes,yes,yes")] ++) ++ ++;; Optimize selects between a duplicated scalar variable and another vector, ++;; the latter of which can be a zero constant or a variable. Treat duplicates ++;; of GPRs as being more expensive than duplicates of FPRs, since they ++;; involve a cross-file move. ++(define_insn "@aarch64_sel_dup" ++ [(set (match_operand:SVE_FULL 0 "register_operand" "=?w, w, ??w, ?&w, ??&w, ?&w") ++ (unspec:SVE_FULL ++ [(match_operand: 3 "register_operand" "Upa, Upa, Upl, Upl, Upl, Upl") ++ (vec_duplicate:SVE_FULL ++ (match_operand: 1 "register_operand" "r, w, r, w, r, w")) ++ (match_operand:SVE_FULL 2 "aarch64_simd_reg_or_zero" "0, 0, Dz, Dz, w, w")] + UNSPEC_SEL))] + "TARGET_SVE" +- "mov\t%0., %1/z, #%2" ++ "@ ++ mov\t%0., %3/m, %1 ++ mov\t%0., %3/m, %1 ++ movprfx\t%0., %3/z, %0.\;mov\t%0., %3/m, %1 ++ movprfx\t%0., %3/z, %0.\;mov\t%0., %3/m, %1 ++ movprfx\t%0, %2\;mov\t%0., %3/m, %1 ++ movprfx\t%0, %2\;mov\t%0., %3/m, %1" ++ [(set_attr "movprfx" "*,*,yes,yes,yes,yes")] + ) + ++;; ------------------------------------------------------------------------- ++;; ---- [INT,FP] Compare and select ++;; ------------------------------------------------------------------------- ++;; The patterns in this section are synthetic. ++;; ------------------------------------------------------------------------- ++ + ;; Integer (signed) vcond. Don't enforce an immediate range here, since it + ;; depends on the comparison; leave it to aarch64_expand_sve_vcond instead. + (define_expand "vcond" +- [(set (match_operand:SVE_ALL 0 "register_operand") +- (if_then_else:SVE_ALL ++ [(set (match_operand:SVE_FULL 0 "register_operand") ++ (if_then_else:SVE_FULL + (match_operator 3 "comparison_operator" + [(match_operand: 4 "register_operand") + (match_operand: 5 "nonmemory_operand")]) +- (match_operand:SVE_ALL 1 "register_operand") +- (match_operand:SVE_ALL 2 "register_operand")))] ++ (match_operand:SVE_FULL 1 "nonmemory_operand") ++ (match_operand:SVE_FULL 2 "nonmemory_operand")))] + "TARGET_SVE" + { + aarch64_expand_sve_vcond (mode, mode, operands); +@@ -1647,13 +6555,13 @@ + ;; Integer vcondu. Don't enforce an immediate range here, since it + ;; depends on the comparison; leave it to aarch64_expand_sve_vcond instead. 
+ (define_expand "vcondu" +- [(set (match_operand:SVE_ALL 0 "register_operand") +- (if_then_else:SVE_ALL ++ [(set (match_operand:SVE_FULL 0 "register_operand") ++ (if_then_else:SVE_FULL + (match_operator 3 "comparison_operator" + [(match_operand: 4 "register_operand") + (match_operand: 5 "nonmemory_operand")]) +- (match_operand:SVE_ALL 1 "register_operand") +- (match_operand:SVE_ALL 2 "register_operand")))] ++ (match_operand:SVE_FULL 1 "nonmemory_operand") ++ (match_operand:SVE_FULL 2 "nonmemory_operand")))] + "TARGET_SVE" + { + aarch64_expand_sve_vcond (mode, mode, operands); +@@ -1661,17 +6569,16 @@ + } + ) + +-;; Floating-point vcond. All comparisons except FCMUO allow a zero +-;; operand; aarch64_expand_sve_vcond handles the case of an FCMUO +-;; with zero. ++;; Floating-point vcond. All comparisons except FCMUO allow a zero operand; ++;; aarch64_expand_sve_vcond handles the case of an FCMUO with zero. + (define_expand "vcond" +- [(set (match_operand:SVE_SD 0 "register_operand") +- (if_then_else:SVE_SD ++ [(set (match_operand:SVE_FULL_HSD 0 "register_operand") ++ (if_then_else:SVE_FULL_HSD + (match_operator 3 "comparison_operator" + [(match_operand: 4 "register_operand") + (match_operand: 5 "aarch64_simd_reg_or_zero")]) +- (match_operand:SVE_SD 1 "register_operand") +- (match_operand:SVE_SD 2 "register_operand")))] ++ (match_operand:SVE_FULL_HSD 1 "nonmemory_operand") ++ (match_operand:SVE_FULL_HSD 2 "nonmemory_operand")))] + "TARGET_SVE" + { + aarch64_expand_sve_vcond (mode, mode, operands); +@@ -1679,6 +6586,22 @@ + } + ) + ++;; ------------------------------------------------------------------------- ++;; ---- [INT] Comparisons ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - CMPEQ ++;; - CMPGE ++;; - CMPGT ++;; - CMPHI ++;; - CMPHS ++;; - CMPLE ++;; - CMPLO ++;; - CMPLS ++;; - CMPLT ++;; - CMPNE ++;; ------------------------------------------------------------------------- ++ + ;; Signed integer comparisons. Don't enforce an immediate range here, since + ;; it depends on the comparison; leave it to aarch64_expand_sve_vec_cmp_int + ;; instead. +@@ -1686,9 +6609,9 @@ + [(parallel + [(set (match_operand: 0 "register_operand") + (match_operator: 1 "comparison_operator" +- [(match_operand:SVE_I 2 "register_operand") +- (match_operand:SVE_I 3 "nonmemory_operand")])) +- (clobber (reg:CC CC_REGNUM))])] ++ [(match_operand:SVE_FULL_I 2 "register_operand") ++ (match_operand:SVE_FULL_I 3 "nonmemory_operand")])) ++ (clobber (reg:CC_NZC CC_REGNUM))])] + "TARGET_SVE" + { + aarch64_expand_sve_vec_cmp_int (operands[0], GET_CODE (operands[1]), +@@ -1704,9 +6627,9 @@ + [(parallel + [(set (match_operand: 0 "register_operand") + (match_operator: 1 "comparison_operator" +- [(match_operand:SVE_I 2 "register_operand") +- (match_operand:SVE_I 3 "nonmemory_operand")])) +- (clobber (reg:CC CC_REGNUM))])] ++ [(match_operand:SVE_FULL_I 2 "register_operand") ++ (match_operand:SVE_FULL_I 3 "nonmemory_operand")])) ++ (clobber (reg:CC_NZC CC_REGNUM))])] + "TARGET_SVE" + { + aarch64_expand_sve_vec_cmp_int (operands[0], GET_CODE (operands[1]), +@@ -1715,14 +6638,285 @@ + } + ) + ++;; Predicated integer comparisons. 
++(define_insn "@aarch64_pred_cmp" ++ [(set (match_operand: 0 "register_operand" "=Upa, Upa") ++ (unspec: ++ [(match_operand: 1 "register_operand" "Upl, Upl") ++ (match_operand:SI 2 "aarch64_sve_ptrue_flag") ++ (SVE_INT_CMP: ++ (match_operand:SVE_FULL_I 3 "register_operand" "w, w") ++ (match_operand:SVE_FULL_I 4 "aarch64_sve_cmp__operand" ", w"))] ++ UNSPEC_PRED_Z)) ++ (clobber (reg:CC_NZC CC_REGNUM))] ++ "TARGET_SVE" ++ "@ ++ cmp\t%0., %1/z, %3., #%4 ++ cmp\t%0., %1/z, %3., %4." ++) ++ ++;; Predicated integer comparisons in which both the flag and predicate ++;; results are interesting. ++(define_insn_and_rewrite "*cmp_cc" ++ [(set (reg:CC_NZC CC_REGNUM) ++ (unspec:CC_NZC ++ [(match_operand:VNx16BI 1 "register_operand" "Upl, Upl") ++ (match_operand 4) ++ (match_operand:SI 5 "aarch64_sve_ptrue_flag") ++ (unspec: ++ [(match_operand 6) ++ (match_operand:SI 7 "aarch64_sve_ptrue_flag") ++ (SVE_INT_CMP: ++ (match_operand:SVE_FULL_I 2 "register_operand" "w, w") ++ (match_operand:SVE_FULL_I 3 "aarch64_sve_cmp__operand" ", w"))] ++ UNSPEC_PRED_Z)] ++ UNSPEC_PTEST)) ++ (set (match_operand: 0 "register_operand" "=Upa, Upa") ++ (unspec: ++ [(match_dup 6) ++ (match_dup 7) ++ (SVE_INT_CMP: ++ (match_dup 2) ++ (match_dup 3))] ++ UNSPEC_PRED_Z))] ++ "TARGET_SVE ++ && aarch64_sve_same_pred_for_ptest_p (&operands[4], &operands[6])" ++ "@ ++ cmp\t%0., %1/z, %2., #%3 ++ cmp\t%0., %1/z, %2., %3." ++ "&& !rtx_equal_p (operands[4], operands[6])" ++ { ++ operands[6] = copy_rtx (operands[4]); ++ operands[7] = operands[5]; ++ } ++) ++ ++;; Predicated integer comparisons in which only the flags result is ++;; interesting. ++(define_insn_and_rewrite "*cmp_ptest" ++ [(set (reg:CC_NZC CC_REGNUM) ++ (unspec:CC_NZC ++ [(match_operand:VNx16BI 1 "register_operand" "Upl, Upl") ++ (match_operand 4) ++ (match_operand:SI 5 "aarch64_sve_ptrue_flag") ++ (unspec: ++ [(match_operand 6) ++ (match_operand:SI 7 "aarch64_sve_ptrue_flag") ++ (SVE_INT_CMP: ++ (match_operand:SVE_FULL_I 2 "register_operand" "w, w") ++ (match_operand:SVE_FULL_I 3 "aarch64_sve_cmp__operand" ", w"))] ++ UNSPEC_PRED_Z)] ++ UNSPEC_PTEST)) ++ (clobber (match_scratch: 0 "=Upa, Upa"))] ++ "TARGET_SVE ++ && aarch64_sve_same_pred_for_ptest_p (&operands[4], &operands[6])" ++ "@ ++ cmp\t%0., %1/z, %2., #%3 ++ cmp\t%0., %1/z, %2., %3." ++ "&& !rtx_equal_p (operands[4], operands[6])" ++ { ++ operands[6] = copy_rtx (operands[4]); ++ operands[7] = operands[5]; ++ } ++) ++ ++;; Predicated integer comparisons, formed by combining a PTRUE-predicated ++;; comparison with an AND. Split the instruction into its preferred form ++;; at the earliest opportunity, in order to get rid of the redundant ++;; operand 4. ++(define_insn_and_split "*cmp_and" ++ [(set (match_operand: 0 "register_operand" "=Upa, Upa") ++ (and: ++ (unspec: ++ [(match_operand 4) ++ (const_int SVE_KNOWN_PTRUE) ++ (SVE_INT_CMP: ++ (match_operand:SVE_FULL_I 2 "register_operand" "w, w") ++ (match_operand:SVE_FULL_I 3 "aarch64_sve_cmp__operand" ", w"))] ++ UNSPEC_PRED_Z) ++ (match_operand: 1 "register_operand" "Upl, Upl"))) ++ (clobber (reg:CC_NZC CC_REGNUM))] ++ "TARGET_SVE" ++ "#" ++ "&& 1" ++ [(parallel ++ [(set (match_dup 0) ++ (unspec: ++ [(match_dup 1) ++ (const_int SVE_MAYBE_NOT_PTRUE) ++ (SVE_INT_CMP: ++ (match_dup 2) ++ (match_dup 3))] ++ UNSPEC_PRED_Z)) ++ (clobber (reg:CC_NZC CC_REGNUM))])] ++) ++ ++;; Predicated integer wide comparisons. 
++(define_insn "@aarch64_pred_cmp_wide" ++ [(set (match_operand: 0 "register_operand" "=Upa") ++ (unspec: ++ [(match_operand:VNx16BI 1 "register_operand" "Upl") ++ (match_operand:SI 2 "aarch64_sve_ptrue_flag") ++ (unspec: ++ [(match_operand:SVE_FULL_BHSI 3 "register_operand" "w") ++ (match_operand:VNx2DI 4 "register_operand" "w")] ++ SVE_COND_INT_CMP_WIDE)] ++ UNSPEC_PRED_Z)) ++ (clobber (reg:CC_NZC CC_REGNUM))] ++ "TARGET_SVE" ++ "cmp\t%0., %1/z, %3., %4.d" ++) ++ ++;; Predicated integer wide comparisons in which both the flag and ++;; predicate results are interesting. ++(define_insn "*aarch64_pred_cmp_wide_cc" ++ [(set (reg:CC_NZC CC_REGNUM) ++ (unspec:CC_NZC ++ [(match_operand:VNx16BI 1 "register_operand" "Upl") ++ (match_operand 4) ++ (match_operand:SI 5 "aarch64_sve_ptrue_flag") ++ (unspec: ++ [(match_operand:VNx16BI 6 "register_operand" "Upl") ++ (match_operand:SI 7 "aarch64_sve_ptrue_flag") ++ (unspec: ++ [(match_operand:SVE_FULL_BHSI 2 "register_operand" "w") ++ (match_operand:VNx2DI 3 "register_operand" "w")] ++ SVE_COND_INT_CMP_WIDE)] ++ UNSPEC_PRED_Z)] ++ UNSPEC_PTEST)) ++ (set (match_operand: 0 "register_operand" "=Upa") ++ (unspec: ++ [(match_dup 6) ++ (match_dup 7) ++ (unspec: ++ [(match_dup 2) ++ (match_dup 3)] ++ SVE_COND_INT_CMP_WIDE)] ++ UNSPEC_PRED_Z))] ++ "TARGET_SVE ++ && aarch64_sve_same_pred_for_ptest_p (&operands[4], &operands[6])" ++ "cmp\t%0., %1/z, %2., %3.d" ++) ++ ++;; Predicated integer wide comparisons in which only the flags result ++;; is interesting. ++(define_insn "*aarch64_pred_cmp_wide_ptest" ++ [(set (reg:CC_NZC CC_REGNUM) ++ (unspec:CC_NZC ++ [(match_operand:VNx16BI 1 "register_operand" "Upl") ++ (match_operand 4) ++ (match_operand:SI 5 "aarch64_sve_ptrue_flag") ++ (unspec: ++ [(match_operand:VNx16BI 6 "register_operand" "Upl") ++ (match_operand:SI 7 "aarch64_sve_ptrue_flag") ++ (unspec: ++ [(match_operand:SVE_FULL_BHSI 2 "register_operand" "w") ++ (match_operand:VNx2DI 3 "register_operand" "w")] ++ SVE_COND_INT_CMP_WIDE)] ++ UNSPEC_PRED_Z)] ++ UNSPEC_PTEST)) ++ (clobber (match_scratch: 0 "=Upa"))] ++ "TARGET_SVE ++ && aarch64_sve_same_pred_for_ptest_p (&operands[4], &operands[6])" ++ "cmp\t%0., %1/z, %2., %3.d" ++) ++ ++;; ------------------------------------------------------------------------- ++;; ---- [INT] While tests ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - WHILELE ++;; - WHILELO ++;; - WHILELS ++;; - WHILELT ++;; ------------------------------------------------------------------------- ++ ++;; Set element I of the result if (cmp (plus operand1 J) operand2) is ++;; true for all J in [0, I]. ++(define_insn "@while_" ++ [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa") ++ (unspec:PRED_ALL [(match_operand:GPI 1 "aarch64_reg_or_zero" "rZ") ++ (match_operand:GPI 2 "aarch64_reg_or_zero" "rZ")] ++ SVE_WHILE)) ++ (clobber (reg:CC_NZC CC_REGNUM))] ++ "TARGET_SVE" ++ "while\t%0., %1, %2" ++) ++ ++;; The WHILE instructions set the flags in the same way as a PTEST with ++;; a PTRUE GP. Handle the case in which both results are useful. The GP ++;; operands to the PTEST aren't needed, so we allow them to be anything. 
++(define_insn_and_rewrite "*while__cc" ++ [(set (reg:CC_NZC CC_REGNUM) ++ (unspec:CC_NZC ++ [(match_operand 3) ++ (match_operand 4) ++ (const_int SVE_KNOWN_PTRUE) ++ (unspec:PRED_ALL ++ [(match_operand:GPI 1 "aarch64_reg_or_zero" "rZ") ++ (match_operand:GPI 2 "aarch64_reg_or_zero" "rZ")] ++ SVE_WHILE)] ++ UNSPEC_PTEST)) ++ (set (match_operand:PRED_ALL 0 "register_operand" "=Upa") ++ (unspec:PRED_ALL [(match_dup 1) ++ (match_dup 2)] ++ SVE_WHILE))] ++ "TARGET_SVE" ++ "while\t%0., %1, %2" ++ ;; Force the compiler to drop the unused predicate operand, so that we ++ ;; don't have an unnecessary PTRUE. ++ "&& (!CONSTANT_P (operands[3]) || !CONSTANT_P (operands[4]))" ++ { ++ operands[3] = CONSTM1_RTX (VNx16BImode); ++ operands[4] = CONSTM1_RTX (mode); ++ } ++) ++ ++;; Same, but handle the case in which only the flags result is useful. ++(define_insn_and_rewrite "*while__ptest" ++ [(set (reg:CC_NZC CC_REGNUM) ++ (unspec:CC_NZC ++ [(match_operand 3) ++ (match_operand 4) ++ (const_int SVE_KNOWN_PTRUE) ++ (unspec:PRED_ALL ++ [(match_operand:GPI 1 "aarch64_reg_or_zero" "rZ") ++ (match_operand:GPI 2 "aarch64_reg_or_zero" "rZ")] ++ SVE_WHILE)] ++ UNSPEC_PTEST)) ++ (clobber (match_scratch:PRED_ALL 0 "=Upa"))] ++ "TARGET_SVE" ++ "while\t%0., %1, %2" ++ ;; Force the compiler to drop the unused predicate operand, so that we ++ ;; don't have an unnecessary PTRUE. ++ "&& (!CONSTANT_P (operands[3]) || !CONSTANT_P (operands[4]))" ++ { ++ operands[3] = CONSTM1_RTX (VNx16BImode); ++ operands[4] = CONSTM1_RTX (mode); ++ } ++) ++ ++;; ------------------------------------------------------------------------- ++;; ---- [FP] Direct comparisons ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - FCMEQ ++;; - FCMGE ++;; - FCMGT ++;; - FCMLE ++;; - FCMLT ++;; - FCMNE ++;; - FCMUO ++;; ------------------------------------------------------------------------- ++ + ;; Floating-point comparisons. All comparisons except FCMUO allow a zero + ;; operand; aarch64_expand_sve_vec_cmp_float handles the case of an FCMUO + ;; with zero. + (define_expand "vec_cmp" + [(set (match_operand: 0 "register_operand") + (match_operator: 1 "comparison_operator" +- [(match_operand:SVE_F 2 "register_operand") +- (match_operand:SVE_F 3 "aarch64_simd_reg_or_zero")]))] ++ [(match_operand:SVE_FULL_F 2 "register_operand") ++ (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero")]))] + "TARGET_SVE" + { + aarch64_expand_sve_vec_cmp_float (operands[0], GET_CODE (operands[1]), +@@ -1731,6 +6925,172 @@ + } + ) + ++;; Predicated floating-point comparisons. ++(define_insn "@aarch64_pred_fcm" ++ [(set (match_operand: 0 "register_operand" "=Upa, Upa") ++ (unspec: ++ [(match_operand: 1 "register_operand" "Upl, Upl") ++ (match_operand:SI 2 "aarch64_sve_ptrue_flag") ++ (match_operand:SVE_FULL_F 3 "register_operand" "w, w") ++ (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero" "Dz, w")] ++ SVE_COND_FP_CMP_I0))] ++ "TARGET_SVE" ++ "@ ++ fcm\t%0., %1/z, %3., #0.0 ++ fcm\t%0., %1/z, %3., %4." ++) ++ ++;; Same for unordered comparisons. ++(define_insn "@aarch64_pred_fcmuo" ++ [(set (match_operand: 0 "register_operand" "=Upa") ++ (unspec: ++ [(match_operand: 1 "register_operand" "Upl") ++ (match_operand:SI 2 "aarch64_sve_ptrue_flag") ++ (match_operand:SVE_FULL_F 3 "register_operand" "w") ++ (match_operand:SVE_FULL_F 4 "register_operand" "w")] ++ UNSPEC_COND_FCMUO))] ++ "TARGET_SVE" ++ "fcmuo\t%0., %1/z, %3., %4." 
++) ++ ++;; Floating-point comparisons predicated on a PTRUE, with the results ANDed ++;; with another predicate P. This does not have the same trapping behavior ++;; as predicating the comparison itself on P, but it's a legitimate fold, ++;; since we can drop any potentially-trapping operations whose results ++;; are not needed. ++;; ++;; Split the instruction into its preferred form (below) at the earliest ++;; opportunity, in order to get rid of the redundant operand 1. ++(define_insn_and_split "*fcm_and_combine" ++ [(set (match_operand: 0 "register_operand" "=Upa, Upa") ++ (and: ++ (unspec: ++ [(match_operand: 1) ++ (const_int SVE_KNOWN_PTRUE) ++ (match_operand:SVE_FULL_F 2 "register_operand" "w, w") ++ (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero" "Dz, w")] ++ SVE_COND_FP_CMP_I0) ++ (match_operand: 4 "register_operand" "Upl, Upl")))] ++ "TARGET_SVE" ++ "#" ++ "&& 1" ++ [(set (match_dup 0) ++ (unspec: ++ [(match_dup 4) ++ (const_int SVE_MAYBE_NOT_PTRUE) ++ (match_dup 2) ++ (match_dup 3)] ++ SVE_COND_FP_CMP_I0))] ++) ++ ++;; Same for unordered comparisons. ++(define_insn_and_split "*fcmuo_and_combine" ++ [(set (match_operand: 0 "register_operand" "=Upa") ++ (and: ++ (unspec: ++ [(match_operand: 1) ++ (const_int SVE_KNOWN_PTRUE) ++ (match_operand:SVE_FULL_F 2 "register_operand" "w") ++ (match_operand:SVE_FULL_F 3 "register_operand" "w")] ++ UNSPEC_COND_FCMUO) ++ (match_operand: 4 "register_operand" "Upl")))] ++ "TARGET_SVE" ++ "#" ++ "&& 1" ++ [(set (match_dup 0) ++ (unspec: ++ [(match_dup 4) ++ (const_int SVE_MAYBE_NOT_PTRUE) ++ (match_dup 2) ++ (match_dup 3)] ++ UNSPEC_COND_FCMUO))] ++) ++ ++;; ------------------------------------------------------------------------- ++;; ---- [FP] Absolute comparisons ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - FACGE ++;; - FACGT ++;; - FACLE ++;; - FACLT ++;; ------------------------------------------------------------------------- ++ ++;; Predicated floating-point absolute comparisons. ++(define_expand "@aarch64_pred_fac" ++ [(set (match_operand: 0 "register_operand") ++ (unspec: ++ [(match_operand: 1 "register_operand") ++ (match_operand:SI 2 "aarch64_sve_ptrue_flag") ++ (unspec:SVE_FULL_F ++ [(match_dup 1) ++ (match_dup 2) ++ (match_operand:SVE_FULL_F 3 "register_operand")] ++ UNSPEC_COND_FABS) ++ (unspec:SVE_FULL_F ++ [(match_dup 1) ++ (match_dup 2) ++ (match_operand:SVE_FULL_F 4 "register_operand")] ++ UNSPEC_COND_FABS)] ++ SVE_COND_FP_ABS_CMP))] ++ "TARGET_SVE" ++) ++ ++(define_insn_and_rewrite "*aarch64_pred_fac" ++ [(set (match_operand: 0 "register_operand" "=Upa") ++ (unspec: ++ [(match_operand: 1 "register_operand" "Upl") ++ (match_operand:SI 4 "aarch64_sve_ptrue_flag") ++ (unspec:SVE_FULL_F ++ [(match_operand 5) ++ (match_operand:SI 6 "aarch64_sve_gp_strictness") ++ (match_operand:SVE_FULL_F 2 "register_operand" "w")] ++ UNSPEC_COND_FABS) ++ (unspec:SVE_FULL_F ++ [(match_operand 7) ++ (match_operand:SI 8 "aarch64_sve_gp_strictness") ++ (match_operand:SVE_FULL_F 3 "register_operand" "w")] ++ UNSPEC_COND_FABS)] ++ SVE_COND_FP_ABS_CMP))] ++ "TARGET_SVE ++ && aarch64_sve_pred_dominates_p (&operands[5], operands[1]) ++ && aarch64_sve_pred_dominates_p (&operands[7], operands[1])" ++ "fac\t%0., %1/z, %2., %3." 
++ "&& (!rtx_equal_p (operands[1], operands[5]) ++ || !rtx_equal_p (operands[1], operands[7]))" ++ { ++ operands[5] = copy_rtx (operands[1]); ++ operands[7] = copy_rtx (operands[1]); ++ } ++) ++ ++;; ------------------------------------------------------------------------- ++;; ---- [PRED] Select ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - SEL ++;; ------------------------------------------------------------------------- ++ ++(define_insn "@vcond_mask_" ++ [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa") ++ (ior:PRED_ALL ++ (and:PRED_ALL ++ (match_operand:PRED_ALL 3 "register_operand" "Upa") ++ (match_operand:PRED_ALL 1 "register_operand" "Upa")) ++ (and:PRED_ALL ++ (not (match_dup 3)) ++ (match_operand:PRED_ALL 2 "register_operand" "Upa"))))] ++ "TARGET_SVE" ++ "sel\t%0.b, %3, %1.b, %2.b" ++) ++ ++;; ------------------------------------------------------------------------- ++;; ---- [PRED] Test bits ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - PTEST ++;; ------------------------------------------------------------------------- ++ + ;; Branch based on predicate equality or inequality. + (define_expand "cbranch4" + [(set (pc) +@@ -1742,1409 +7102,2120 @@ + (pc)))] + "" + { +- rtx ptrue = force_reg (mode, CONSTM1_RTX (mode)); ++ rtx ptrue = force_reg (VNx16BImode, aarch64_ptrue_all ()); ++ rtx cast_ptrue = gen_lowpart (mode, ptrue); ++ rtx ptrue_flag = gen_int_mode (SVE_KNOWN_PTRUE, SImode); + rtx pred; + if (operands[2] == CONST0_RTX (mode)) + pred = operands[1]; + else + { + pred = gen_reg_rtx (mode); +- emit_insn (gen_pred_xor3 (pred, ptrue, operands[1], +- operands[2])); ++ emit_insn (gen_aarch64_pred_xor_z (pred, cast_ptrue, operands[1], ++ operands[2])); + } +- emit_insn (gen_ptest_ptrue (ptrue, pred)); +- operands[1] = gen_rtx_REG (CCmode, CC_REGNUM); ++ emit_insn (gen_aarch64_ptest (ptrue, cast_ptrue, ptrue_flag, pred)); ++ operands[1] = gen_rtx_REG (CC_NZCmode, CC_REGNUM); + operands[2] = const0_rtx; + } + ) + +-;; Unpredicated integer MIN/MAX. +-(define_expand "3" +- [(set (match_operand:SVE_I 0 "register_operand") +- (unspec:SVE_I +- [(match_dup 3) +- (MAXMIN:SVE_I (match_operand:SVE_I 1 "register_operand") +- (match_operand:SVE_I 2 "register_operand"))] +- UNSPEC_MERGE_PTRUE))] ++;; See "Description of UNSPEC_PTEST" above for details. ++(define_insn "aarch64_ptest" ++ [(set (reg:CC_NZC CC_REGNUM) ++ (unspec:CC_NZC [(match_operand:VNx16BI 0 "register_operand" "Upa") ++ (match_operand 1) ++ (match_operand:SI 2 "aarch64_sve_ptrue_flag") ++ (match_operand:PRED_ALL 3 "register_operand" "Upa")] ++ UNSPEC_PTEST))] ++ "TARGET_SVE" ++ "ptest\t%0, %3.b" ++) ++ ++;; ========================================================================= ++;; == Reductions ++;; ========================================================================= ++ ++;; ------------------------------------------------------------------------- ++;; ---- [INT,FP] Conditional reductions ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - CLASTA ++;; - CLASTB ++;; ------------------------------------------------------------------------- ++ ++;; Set operand 0 to the last active element in operand 3, or to tied ++;; operand 1 if no elements are active. 
++(define_insn "@fold_extract__" ++ [(set (match_operand: 0 "register_operand" "=?r, w") ++ (unspec: ++ [(match_operand: 1 "register_operand" "0, 0") ++ (match_operand: 2 "register_operand" "Upl, Upl") ++ (match_operand:SVE_FULL 3 "register_operand" "w, w")] ++ CLAST))] + "TARGET_SVE" +- { +- operands[3] = force_reg (mode, CONSTM1_RTX (mode)); +- } ++ "@ ++ clast\t%0, %2, %0, %3. ++ clast\t%0, %2, %0, %3." + ) + +-;; Integer MIN/MAX predicated with a PTRUE. +-(define_insn "*3" +- [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w") +- (unspec:SVE_I +- [(match_operand: 1 "register_operand" "Upl, Upl") +- (MAXMIN:SVE_I (match_operand:SVE_I 2 "register_operand" "%0, w") +- (match_operand:SVE_I 3 "register_operand" "w, w"))] +- UNSPEC_MERGE_PTRUE))] ++(define_insn "@aarch64_fold_extract_vector__" ++ [(set (match_operand:SVE_FULL 0 "register_operand" "=w, ?&w") ++ (unspec:SVE_FULL ++ [(match_operand:SVE_FULL 1 "register_operand" "0, w") ++ (match_operand: 2 "register_operand" "Upl, Upl") ++ (match_operand:SVE_FULL 3 "register_operand" "w, w")] ++ CLAST))] + "TARGET_SVE" + "@ +- \t%0., %1/m, %0., %3. +- movprfx\t%0, %2\;\t%0., %1/m, %0., %3." +- [(set_attr "movprfx" "*,yes")] +-) ++ clast\t%0., %2, %0., %3. ++ movprfx\t%0, %1\;clast\t%0., %2, %0., %3." ++) ++ ++;; ------------------------------------------------------------------------- ++;; ---- [INT] Tree reductions ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - ANDV ++;; - EORV ++;; - ORV ++;; - SADDV ++;; - SMAXV ++;; - SMINV ++;; - UADDV ++;; - UMAXV ++;; - UMINV ++;; ------------------------------------------------------------------------- + +-;; Unpredicated floating-point MIN/MAX. +-(define_expand "3" +- [(set (match_operand:SVE_F 0 "register_operand") +- (unspec:SVE_F +- [(match_dup 3) +- (FMAXMIN:SVE_F (match_operand:SVE_F 1 "register_operand") +- (match_operand:SVE_F 2 "register_operand"))] +- UNSPEC_MERGE_PTRUE))] ++;; Unpredicated integer add reduction. ++(define_expand "reduc_plus_scal_" ++ [(match_operand: 0 "register_operand") ++ (match_operand:SVE_FULL_I 1 "register_operand")] + "TARGET_SVE" + { +- operands[3] = force_reg (mode, CONSTM1_RTX (mode)); ++ rtx pred = aarch64_ptrue_reg (mode); ++ rtx tmp = mode == DImode ? operands[0] : gen_reg_rtx (DImode); ++ emit_insn (gen_aarch64_pred_reduc_uadd_ (tmp, pred, operands[1])); ++ if (tmp != operands[0]) ++ emit_move_insn (operands[0], gen_lowpart (mode, tmp)); ++ DONE; + } + ) + +-;; Floating-point MIN/MAX predicated with a PTRUE. +-(define_insn "*3" +- [(set (match_operand:SVE_F 0 "register_operand" "=w, ?&w") +- (unspec:SVE_F +- [(match_operand: 1 "register_operand" "Upl, Upl") +- (FMAXMIN:SVE_F (match_operand:SVE_F 2 "register_operand" "%0, w") +- (match_operand:SVE_F 3 "register_operand" "w, w"))] +- UNSPEC_MERGE_PTRUE))] +- "TARGET_SVE" +- "@ +- fnm\t%0., %1/m, %0., %3. +- movprfx\t%0, %2\;fnm\t%0., %1/m, %0., %3." +- [(set_attr "movprfx" "*,yes")] ++;; Predicated integer add reduction. The result is always 64-bits. ++(define_insn "@aarch64_pred_reduc__" ++ [(set (match_operand:DI 0 "register_operand" "=w") ++ (unspec:DI [(match_operand: 1 "register_operand" "Upl") ++ (match_operand:SVE_FULL_I 2 "register_operand" "w")] ++ SVE_INT_ADDV))] ++ "TARGET_SVE && >= " ++ "addv\t%d0, %1, %2." + ) + +-;; Unpredicated fmin/fmax. 
+-(define_expand "3" +- [(set (match_operand:SVE_F 0 "register_operand") +- (unspec:SVE_F +- [(match_dup 3) +- (unspec:SVE_F [(match_operand:SVE_F 1 "register_operand") +- (match_operand:SVE_F 2 "register_operand")] +- FMAXMIN_UNS)] +- UNSPEC_MERGE_PTRUE))] ++;; Unpredicated integer reductions. ++(define_expand "reduc__scal_" ++ [(set (match_operand: 0 "register_operand") ++ (unspec: [(match_dup 2) ++ (match_operand:SVE_FULL_I 1 "register_operand")] ++ SVE_INT_REDUCTION))] + "TARGET_SVE" + { +- operands[3] = force_reg (mode, CONSTM1_RTX (mode)); ++ operands[2] = aarch64_ptrue_reg (mode); + } + ) + +-;; fmin/fmax predicated with a PTRUE. +-(define_insn "*3" +- [(set (match_operand:SVE_F 0 "register_operand" "=w, ?&w") +- (unspec:SVE_F +- [(match_operand: 1 "register_operand" "Upl, Upl") +- (unspec:SVE_F [(match_operand:SVE_F 2 "register_operand" "%0, w") +- (match_operand:SVE_F 3 "register_operand" "w, w")] +- FMAXMIN_UNS)] +- UNSPEC_MERGE_PTRUE))] ++;; Predicated integer reductions. ++(define_insn "@aarch64_pred_reduc__" ++ [(set (match_operand: 0 "register_operand" "=w") ++ (unspec: [(match_operand: 1 "register_operand" "Upl") ++ (match_operand:SVE_FULL_I 2 "register_operand" "w")] ++ SVE_INT_REDUCTION))] + "TARGET_SVE" +- "@ +- \t%0., %1/m, %0., %3. +- movprfx\t%0, %2\;\t%0., %1/m, %0., %3." +- [(set_attr "movprfx" "*,yes")] ++ "\t%0, %1, %2." + ) + +-;; Predicated integer operations with select. +-(define_expand "cond_" +- [(set (match_operand:SVE_I 0 "register_operand") +- (unspec:SVE_I +- [(match_operand: 1 "register_operand") +- (SVE_INT_BINARY:SVE_I +- (match_operand:SVE_I 2 "register_operand") +- (match_operand:SVE_I 3 "register_operand")) +- (match_operand:SVE_I 4 "aarch64_simd_reg_or_zero")] +- UNSPEC_SEL))] +- "TARGET_SVE" +-) ++;; ------------------------------------------------------------------------- ++;; ---- [FP] Tree reductions ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - FADDV ++;; - FMAXNMV ++;; - FMAXV ++;; - FMINNMV ++;; - FMINV ++;; ------------------------------------------------------------------------- + +-(define_expand "cond_" +- [(set (match_operand:SVE_SDI 0 "register_operand") +- (unspec:SVE_SDI +- [(match_operand: 1 "register_operand") +- (SVE_INT_BINARY_SD:SVE_SDI +- (match_operand:SVE_SDI 2 "register_operand") +- (match_operand:SVE_SDI 3 "register_operand")) +- (match_operand:SVE_SDI 4 "aarch64_simd_reg_or_zero")] +- UNSPEC_SEL))] ++;; Unpredicated floating-point tree reductions. ++(define_expand "reduc__scal_" ++ [(set (match_operand: 0 "register_operand") ++ (unspec: [(match_dup 2) ++ (match_operand:SVE_FULL_F 1 "register_operand")] ++ SVE_FP_REDUCTION))] + "TARGET_SVE" ++ { ++ operands[2] = aarch64_ptrue_reg (mode); ++ } + ) + +-;; Predicated integer operations with select matching the output operand. +-(define_insn "*cond__0" +- [(set (match_operand:SVE_I 0 "register_operand" "+w, w, ?&w") +- (unspec:SVE_I +- [(match_operand: 1 "register_operand" "Upl, Upl, Upl") +- (SVE_INT_BINARY:SVE_I +- (match_operand:SVE_I 2 "register_operand" "0, w, w") +- (match_operand:SVE_I 3 "register_operand" "w, 0, w")) +- (match_dup 0)] +- UNSPEC_SEL))] ++;; Predicated floating-point tree reductions. ++(define_insn "@aarch64_pred_reduc__" ++ [(set (match_operand: 0 "register_operand" "=w") ++ (unspec: [(match_operand: 1 "register_operand" "Upl") ++ (match_operand:SVE_FULL_F 2 "register_operand" "w")] ++ SVE_FP_REDUCTION))] + "TARGET_SVE" +- "@ +- \t%0., %1/m, %0., %3. +- \t%0., %1/m, %0., %2. 
+- movprfx\t%0, %1/m, %2\;\t%0., %1/m, %0., %3." +- [(set_attr "movprfx" "*,*,yes")] ++ "\t%0, %1, %2." + ) + +-(define_insn "*cond__0" +- [(set (match_operand:SVE_SDI 0 "register_operand" "+w, w, ?&w") +- (unspec:SVE_SDI +- [(match_operand: 1 "register_operand" "Upl, Upl, Upl") +- (SVE_INT_BINARY_SD:SVE_SDI +- (match_operand:SVE_SDI 2 "register_operand" "0, w, w") +- (match_operand:SVE_SDI 3 "register_operand" "w, 0, w")) +- (match_dup 0)] +- UNSPEC_SEL))] +- "TARGET_SVE" +- "@ +- \t%0., %1/m, %0., %3. +- \t%0., %1/m, %0., %2. +- movprfx\t%0, %1/m, %2\;\t%0., %1/m, %0., %3." +- [(set_attr "movprfx" "*,*,yes")] +-) ++;; ------------------------------------------------------------------------- ++;; ---- [FP] Left-to-right reductions ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - FADDA ++;; ------------------------------------------------------------------------- + +-;; Predicated integer operations with select matching the first operand. +-(define_insn "*cond__2" +- [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w") +- (unspec:SVE_I +- [(match_operand: 1 "register_operand" "Upl, Upl") +- (SVE_INT_BINARY:SVE_I +- (match_operand:SVE_I 2 "register_operand" "0, w") +- (match_operand:SVE_I 3 "register_operand" "w, w")) +- (match_dup 2)] +- UNSPEC_SEL))] ++;; Unpredicated in-order FP reductions. ++(define_expand "fold_left_plus_" ++ [(set (match_operand: 0 "register_operand") ++ (unspec: [(match_dup 3) ++ (match_operand: 1 "register_operand") ++ (match_operand:SVE_FULL_F 2 "register_operand")] ++ UNSPEC_FADDA))] + "TARGET_SVE" +- "@ +- \t%0., %1/m, %0., %3. +- movprfx\t%0, %2\;\t%0., %1/m, %0., %3." +- [(set_attr "movprfx" "*,yes")] ++ { ++ operands[3] = aarch64_ptrue_reg (mode); ++ } + ) + +-(define_insn "*cond__2" +- [(set (match_operand:SVE_SDI 0 "register_operand" "=w, ?&w") +- (unspec:SVE_SDI +- [(match_operand: 1 "register_operand" "Upl, Upl") +- (SVE_INT_BINARY_SD:SVE_SDI +- (match_operand:SVE_SDI 2 "register_operand" "0, w") +- (match_operand:SVE_SDI 3 "register_operand" "w, w")) +- (match_dup 2)] +- UNSPEC_SEL))] ++;; Predicated in-order FP reductions. ++(define_insn "mask_fold_left_plus_" ++ [(set (match_operand: 0 "register_operand" "=w") ++ (unspec: [(match_operand: 3 "register_operand" "Upl") ++ (match_operand: 1 "register_operand" "0") ++ (match_operand:SVE_FULL_F 2 "register_operand" "w")] ++ UNSPEC_FADDA))] + "TARGET_SVE" +- "@ +- \t%0., %1/m, %0., %3. +- movprfx\t%0, %2\;\t%0., %1/m, %0., %3." +- [(set_attr "movprfx" "*,yes")] ++ "fadda\t%0, %3, %0, %2." + ) + +-;; Predicated integer operations with select matching the second operand. +-(define_insn "*cond__3" +- [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w") +- (unspec:SVE_I +- [(match_operand: 1 "register_operand" "Upl, Upl") +- (SVE_INT_BINARY:SVE_I +- (match_operand:SVE_I 2 "register_operand" "w, w") +- (match_operand:SVE_I 3 "register_operand" "0, w")) +- (match_dup 3)] +- UNSPEC_SEL))] +- "TARGET_SVE" +- "@ +- \t%0., %1/m, %0., %2. +- movprfx\t%0, %3\;\t%0., %1/m, %0., %2." 
+- [(set_attr "movprfx" "*,yes")] +-) ++;; ========================================================================= ++;; == Permutes ++;; ========================================================================= + +-(define_insn "*cond__3" +- [(set (match_operand:SVE_SDI 0 "register_operand" "=w, ?&w") +- (unspec:SVE_SDI +- [(match_operand: 1 "register_operand" "Upl, Upl") +- (SVE_INT_BINARY_SD:SVE_SDI +- (match_operand:SVE_SDI 2 "register_operand" "w, w") +- (match_operand:SVE_SDI 3 "register_operand" "0, w")) +- (match_dup 3)] +- UNSPEC_SEL))] +- "TARGET_SVE" +- "@ +- \t%0., %1/m, %0., %2. +- movprfx\t%0, %3\;\t%0., %1/m, %0., %2." +- [(set_attr "movprfx" "*,yes")] +-) ++;; ------------------------------------------------------------------------- ++;; ---- [INT,FP] General permutes ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - TBL ++;; ------------------------------------------------------------------------- + +-;; Predicated integer operations with select matching zero. +-(define_insn "*cond__z" +- [(set (match_operand:SVE_I 0 "register_operand" "=&w") +- (unspec:SVE_I +- [(match_operand: 1 "register_operand" "Upl") +- (SVE_INT_BINARY:SVE_I +- (match_operand:SVE_I 2 "register_operand" "w") +- (match_operand:SVE_I 3 "register_operand" "w")) +- (match_operand:SVE_I 4 "aarch64_simd_imm_zero")] +- UNSPEC_SEL))] +- "TARGET_SVE" +- "movprfx\t%0., %1/z, %2.\;\t%0., %1/m, %0., %3." +- [(set_attr "movprfx" "yes")] ++(define_expand "vec_perm" ++ [(match_operand:SVE_FULL 0 "register_operand") ++ (match_operand:SVE_FULL 1 "register_operand") ++ (match_operand:SVE_FULL 2 "register_operand") ++ (match_operand: 3 "aarch64_sve_vec_perm_operand")] ++ "TARGET_SVE && GET_MODE_NUNITS (mode).is_constant ()" ++ { ++ aarch64_expand_sve_vec_perm (operands[0], operands[1], ++ operands[2], operands[3]); ++ DONE; ++ } + ) + +-(define_insn "*cond__z" +- [(set (match_operand:SVE_SDI 0 "register_operand" "=&w") +- (unspec:SVE_SDI +- [(match_operand: 1 "register_operand" "Upl") +- (SVE_INT_BINARY_SD:SVE_SDI +- (match_operand:SVE_SDI 2 "register_operand" "w") +- (match_operand:SVE_SDI 3 "register_operand" "w")) +- (match_operand:SVE_SDI 4 "aarch64_simd_imm_zero")] +- UNSPEC_SEL))] ++(define_insn "@aarch64_sve_tbl" ++ [(set (match_operand:SVE_FULL 0 "register_operand" "=w") ++ (unspec:SVE_FULL ++ [(match_operand:SVE_FULL 1 "register_operand" "w") ++ (match_operand: 2 "register_operand" "w")] ++ UNSPEC_TBL))] + "TARGET_SVE" +- "movprfx\t%0., %1/z, %2.\;\t%0., %1/m, %0., %3." +- [(set_attr "movprfx" "yes")] ++ "tbl\t%0., %1., %2." + ) + +-;; Synthetic predications with select unmatched. +-(define_insn "*cond__any" +- [(set (match_operand:SVE_I 0 "register_operand" "=&w") +- (unspec:SVE_I ++;; ------------------------------------------------------------------------- ++;; ---- [INT,FP] Special-purpose unary permutes ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - COMPACT ++;; - DUP ++;; - REV ++;; ------------------------------------------------------------------------- ++ ++;; Compact active elements and pad with zeros. 
++(define_insn "@aarch64_sve_compact" ++ [(set (match_operand:SVE_FULL_SD 0 "register_operand" "=w") ++ (unspec:SVE_FULL_SD + [(match_operand: 1 "register_operand" "Upl") +- (SVE_INT_BINARY:SVE_I +- (match_operand:SVE_I 2 "register_operand" "w") +- (match_operand:SVE_I 3 "register_operand" "w")) +- (match_operand:SVE_I 4 "register_operand" "w")] +- UNSPEC_SEL))] ++ (match_operand:SVE_FULL_SD 2 "register_operand" "w")] ++ UNSPEC_SVE_COMPACT))] + "TARGET_SVE" +- "#" ++ "compact\t%0., %1, %2." + ) + +-(define_insn "*cond__any" +- [(set (match_operand:SVE_SDI 0 "register_operand" "=&w") +- (unspec:SVE_SDI +- [(match_operand: 1 "register_operand" "Upl") +- (SVE_INT_BINARY_SD:SVE_I +- (match_operand:SVE_SDI 2 "register_operand" "w") +- (match_operand:SVE_SDI 3 "register_operand" "w")) +- (match_operand:SVE_SDI 4 "register_operand" "w")] +- UNSPEC_SEL))] +- "TARGET_SVE" +- "#" ++;; Duplicate one element of a vector. ++(define_insn "@aarch64_sve_dup_lane" ++ [(set (match_operand:SVE_FULL 0 "register_operand" "=w") ++ (vec_duplicate:SVE_FULL ++ (vec_select: ++ (match_operand:SVE_FULL 1 "register_operand" "w") ++ (parallel [(match_operand:SI 2 "const_int_operand")]))))] ++ "TARGET_SVE ++ && IN_RANGE (INTVAL (operands[2]) * GET_MODE_SIZE (mode), 0, 63)" ++ "dup\t%0., %1.[%2]" + ) + +-(define_split +- [(set (match_operand:SVE_I 0 "register_operand") +- (unspec:SVE_I +- [(match_operand: 1 "register_operand") +- (match_operator:SVE_I 5 "aarch64_sve_any_binary_operator" +- [(match_operand:SVE_I 2 "register_operand") +- (match_operand:SVE_I 3 "register_operand")]) +- (match_operand:SVE_I 4 "register_operand")] +- UNSPEC_SEL))] +- "TARGET_SVE && reload_completed +- && !(rtx_equal_p (operands[0], operands[4]) +- || rtx_equal_p (operands[2], operands[4]) +- || rtx_equal_p (operands[3], operands[4]))" +- ; Not matchable by any one insn or movprfx insn. We need a separate select. +- [(set (match_dup 0) +- (unspec:SVE_I [(match_dup 1) (match_dup 2) (match_dup 4)] +- UNSPEC_SEL)) +- (set (match_dup 0) +- (unspec:SVE_I +- [(match_dup 1) +- (match_op_dup 5 [(match_dup 0) (match_dup 3)]) +- (match_dup 0)] +- UNSPEC_SEL))] ++;; Use DUP.Q to duplicate a 128-bit segment of a register. ++;; ++;; The vec_select: sets memory lane number N of the V128 to lane ++;; number op2 + N of op1. (We don't need to distinguish between memory ++;; and architectural register lane numbering for op1 or op0, since the ++;; two numbering schemes are the same for SVE.) ++;; ++;; The vec_duplicate:SVE_FULL then copies memory lane number N of the ++;; V128 (and thus lane number op2 + N of op1) to lane numbers N + I * STEP ++;; of op0. We therefore get the correct result for both endiannesses. ++;; ++;; The wrinkle is that for big-endian V128 registers, memory lane numbering ++;; is in the opposite order to architectural register lane numbering. ++;; Thus if we were to do this operation via a V128 temporary register, ++;; the vec_select and vec_duplicate would both involve a reverse operation ++;; for big-endian targets. In this fused pattern the two reverses cancel ++;; each other out. 
++(define_insn "@aarch64_sve_dupq_lane" ++ [(set (match_operand:SVE_FULL 0 "register_operand" "=w") ++ (vec_duplicate:SVE_FULL ++ (vec_select: ++ (match_operand:SVE_FULL 1 "register_operand" "w") ++ (match_operand 2 "ascending_int_parallel"))))] ++ "TARGET_SVE ++ && (INTVAL (XVECEXP (operands[2], 0, 0)) ++ * GET_MODE_SIZE (mode)) % 16 == 0 ++ && IN_RANGE (INTVAL (XVECEXP (operands[2], 0, 0)) ++ * GET_MODE_SIZE (mode), 0, 63)" ++ { ++ unsigned int byte = (INTVAL (XVECEXP (operands[2], 0, 0)) ++ * GET_MODE_SIZE (mode)); ++ operands[2] = gen_int_mode (byte / 16, DImode); ++ return "dup\t%0.q, %1.q[%2]"; ++ } + ) + +-;; Set operand 0 to the last active element in operand 3, or to tied +-;; operand 1 if no elements are active. +-(define_insn "fold_extract_last_" +- [(set (match_operand: 0 "register_operand" "=r, w") +- (unspec: +- [(match_operand: 1 "register_operand" "0, 0") +- (match_operand: 2 "register_operand" "Upl, Upl") +- (match_operand:SVE_ALL 3 "register_operand" "w, w")] +- UNSPEC_CLASTB))] ++;; Reverse the order of elements within a full vector. ++(define_insn "@aarch64_sve_rev" ++ [(set (match_operand:SVE_FULL 0 "register_operand" "=w") ++ (unspec:SVE_FULL ++ [(match_operand:SVE_FULL 1 "register_operand" "w")] ++ UNSPEC_REV))] + "TARGET_SVE" +- "@ +- clastb\t%0, %2, %0, %3. +- clastb\t%0, %2, %0, %3." +-) ++ "rev\t%0., %1.") + +-;; Unpredicated integer add reduction. +-(define_expand "reduc_plus_scal_" +- [(set (match_operand: 0 "register_operand") +- (unspec: [(match_dup 2) +- (match_operand:SVE_I 1 "register_operand")] +- UNSPEC_ADDV))] ++;; ------------------------------------------------------------------------- ++;; ---- [INT,FP] Special-purpose binary permutes ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - SPLICE ++;; - TRN1 ++;; - TRN2 ++;; - UZP1 ++;; - UZP2 ++;; - ZIP1 ++;; - ZIP2 ++;; ------------------------------------------------------------------------- ++ ++;; Like EXT, but start at the first active element. ++(define_insn "@aarch64_sve_splice" ++ [(set (match_operand:SVE_FULL 0 "register_operand" "=w, ?&w") ++ (unspec:SVE_FULL ++ [(match_operand: 1 "register_operand" "Upl, Upl") ++ (match_operand:SVE_FULL 2 "register_operand" "0, w") ++ (match_operand:SVE_FULL 3 "register_operand" "w, w")] ++ UNSPEC_SVE_SPLICE))] + "TARGET_SVE" ++ "@ ++ splice\t%0., %1, %0., %3. ++ movprfx\t%0, %2\;splice\t%0., %1, %0., %3." ++ [(set_attr "movprfx" "*, yes")] ++) ++ ++;; Permutes that take half the elements from one vector and half the ++;; elements from the other. ++(define_insn "@aarch64_sve_" ++ [(set (match_operand:SVE_FULL 0 "register_operand" "=w") ++ (unspec:SVE_FULL ++ [(match_operand:SVE_FULL 1 "register_operand" "w") ++ (match_operand:SVE_FULL 2 "register_operand" "w")] ++ PERMUTE))] ++ "TARGET_SVE" ++ "\t%0., %1., %2." ++) ++ ++;; Apply PERMUTE to 128-bit sequences. The behavior of these patterns ++;; doesn't depend on the mode. ++(define_insn "@aarch64_sve_" ++ [(set (match_operand:SVE_FULL 0 "register_operand" "=w") ++ (unspec:SVE_FULL ++ [(match_operand:SVE_FULL 1 "register_operand" "w") ++ (match_operand:SVE_FULL 2 "register_operand" "w")] ++ PERMUTEQ))] ++ "TARGET_SVE_F64MM" ++ "\t%0.q, %1.q, %2.q" ++) ++ ++;; Concatenate two vectors and extract a subvector. Note that the ++;; immediate (third) operand is the lane index not the byte index. 
++(define_insn "@aarch64_sve_ext" ++ [(set (match_operand:SVE_FULL 0 "register_operand" "=w, ?&w") ++ (unspec:SVE_FULL ++ [(match_operand:SVE_FULL 1 "register_operand" "0, w") ++ (match_operand:SVE_FULL 2 "register_operand" "w, w") ++ (match_operand:SI 3 "const_int_operand")] ++ UNSPEC_EXT))] ++ "TARGET_SVE ++ && IN_RANGE (INTVAL (operands[3]) * GET_MODE_SIZE (mode), 0, 255)" + { +- operands[2] = force_reg (mode, CONSTM1_RTX (mode)); ++ operands[3] = GEN_INT (INTVAL (operands[3]) * GET_MODE_SIZE (mode)); ++ return (which_alternative == 0 ++ ? "ext\\t%0.b, %0.b, %2.b, #%3" ++ : "movprfx\t%0, %1\;ext\\t%0.b, %0.b, %2.b, #%3"); + } ++ [(set_attr "movprfx" "*,yes")] + ) + +-;; Predicated integer add reduction. The result is always 64-bits. +-(define_insn "*reduc_plus_scal_" +- [(set (match_operand: 0 "register_operand" "=w") +- (unspec: [(match_operand: 1 "register_operand" "Upl") +- (match_operand:SVE_I 2 "register_operand" "w")] +- UNSPEC_ADDV))] ++;; ------------------------------------------------------------------------- ++;; ---- [PRED] Special-purpose unary permutes ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - REV ++;; ------------------------------------------------------------------------- ++ ++(define_insn "@aarch64_sve_rev" ++ [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa") ++ (unspec:PRED_ALL [(match_operand:PRED_ALL 1 "register_operand" "Upa")] ++ UNSPEC_REV))] + "TARGET_SVE" +- "uaddv\t%d0, %1, %2." +-) ++ "rev\t%0., %1.") + +-;; Unpredicated floating-point add reduction. +-(define_expand "reduc_plus_scal_" +- [(set (match_operand: 0 "register_operand") +- (unspec: [(match_dup 2) +- (match_operand:SVE_F 1 "register_operand")] +- UNSPEC_FADDV))] ++;; ------------------------------------------------------------------------- ++;; ---- [PRED] Special-purpose binary permutes ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - TRN1 ++;; - TRN2 ++;; - UZP1 ++;; - UZP2 ++;; - ZIP1 ++;; - ZIP2 ++;; ------------------------------------------------------------------------- ++ ++;; Permutes that take half the elements from one vector and half the ++;; elements from the other. ++(define_insn "@aarch64_sve_" ++ [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa") ++ (unspec:PRED_ALL [(match_operand:PRED_ALL 1 "register_operand" "Upa") ++ (match_operand:PRED_ALL 2 "register_operand" "Upa")] ++ PERMUTE))] + "TARGET_SVE" +- { +- operands[2] = force_reg (mode, CONSTM1_RTX (mode)); +- } ++ "\t%0., %1., %2." + ) + +-;; Predicated floating-point add reduction. +-(define_insn "*reduc_plus_scal_" +- [(set (match_operand: 0 "register_operand" "=w") +- (unspec: [(match_operand: 1 "register_operand" "Upl") +- (match_operand:SVE_F 2 "register_operand" "w")] +- UNSPEC_FADDV))] ++;; ========================================================================= ++;; == Conversions ++;; ========================================================================= ++ ++;; ------------------------------------------------------------------------- ++;; ---- [INT<-INT] Packs ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - UZP1 ++;; ------------------------------------------------------------------------- ++ ++;; Integer pack. Use UZP1 on the narrower type, which discards ++;; the high part of each wide element. 
++(define_insn "vec_pack_trunc_" ++ [(set (match_operand:SVE_FULL_BHSI 0 "register_operand" "=w") ++ (unspec:SVE_FULL_BHSI ++ [(match_operand: 1 "register_operand" "w") ++ (match_operand: 2 "register_operand" "w")] ++ UNSPEC_PACK))] + "TARGET_SVE" +- "faddv\t%0, %1, %2." ++ "uzp1\t%0., %1., %2." + ) + +-;; Unpredicated integer MIN/MAX reduction. +-(define_expand "reduc__scal_" +- [(set (match_operand: 0 "register_operand") +- (unspec: [(match_dup 2) +- (match_operand:SVE_I 1 "register_operand")] +- MAXMINV))] ++;; ------------------------------------------------------------------------- ++;; ---- [INT<-INT] Unpacks ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - SUNPKHI ++;; - SUNPKLO ++;; - UUNPKHI ++;; - UUNPKLO ++;; ------------------------------------------------------------------------- ++ ++;; Unpack the low or high half of a vector, where "high" refers to ++;; the low-numbered lanes for big-endian and the high-numbered lanes ++;; for little-endian. ++(define_expand "vec_unpack__" ++ [(match_operand: 0 "register_operand") ++ (unspec: ++ [(match_operand:SVE_FULL_BHSI 1 "register_operand")] UNPACK)] + "TARGET_SVE" + { +- operands[2] = force_reg (mode, CONSTM1_RTX (mode)); ++ emit_insn (( ++ ? gen_aarch64_sve_unpkhi_ ++ : gen_aarch64_sve_unpklo_) ++ (operands[0], operands[1])); ++ DONE; + } + ) + +-;; Predicated integer MIN/MAX reduction. +-(define_insn "*reduc__scal_" +- [(set (match_operand: 0 "register_operand" "=w") +- (unspec: [(match_operand: 1 "register_operand" "Upl") +- (match_operand:SVE_I 2 "register_operand" "w")] +- MAXMINV))] ++(define_insn "@aarch64_sve_unpk_" ++ [(set (match_operand: 0 "register_operand" "=w") ++ (unspec: ++ [(match_operand:SVE_FULL_BHSI 1 "register_operand" "w")] ++ UNPACK))] + "TARGET_SVE" +- "v\t%0, %1, %2." ++ "unpk\t%0., %1." + ) + +-;; Unpredicated floating-point MIN/MAX reduction. +-(define_expand "reduc__scal_" +- [(set (match_operand: 0 "register_operand") +- (unspec: [(match_dup 2) +- (match_operand:SVE_F 1 "register_operand")] +- FMAXMINV))] ++;; ------------------------------------------------------------------------- ++;; ---- [INT<-FP] Conversions ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - FCVTZS ++;; - FCVTZU ++;; ------------------------------------------------------------------------- ++ ++;; Unpredicated conversion of floats to integers of the same size (HF to HI, ++;; SF to SI or DF to DI). ++(define_expand "2" ++ [(set (match_operand: 0 "register_operand") ++ (unspec: ++ [(match_dup 2) ++ (const_int SVE_RELAXED_GP) ++ (match_operand:SVE_FULL_F 1 "register_operand")] ++ SVE_COND_FCVTI))] + "TARGET_SVE" + { +- operands[2] = force_reg (mode, CONSTM1_RTX (mode)); ++ operands[2] = aarch64_ptrue_reg (mode); + } + ) + +-;; Predicated floating-point MIN/MAX reduction. +-(define_insn "*reduc__scal_" +- [(set (match_operand: 0 "register_operand" "=w") +- (unspec: [(match_operand: 1 "register_operand" "Upl") +- (match_operand:SVE_F 2 "register_operand" "w")] +- FMAXMINV))] +- "TARGET_SVE" +- "v\t%0, %1, %2." ++;; Predicated float-to-integer conversion, either to the same width or wider. ++(define_insn "@aarch64_sve__nontrunc" ++ [(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=w") ++ (unspec:SVE_FULL_HSDI ++ [(match_operand: 1 "register_operand" "Upl") ++ (match_operand:SI 3 "aarch64_sve_gp_strictness") ++ (match_operand:SVE_FULL_F 2 "register_operand" "w")] ++ SVE_COND_FCVTI))] ++ "TARGET_SVE && >= " ++ "fcvtz\t%0., %1/m, %2." 
+ ) + +-(define_expand "reduc__scal_" +- [(set (match_operand: 0 "register_operand") +- (unspec: [(match_dup 2) +- (match_operand:SVE_I 1 "register_operand")] +- BITWISEV))] +- "TARGET_SVE" ++;; Predicated narrowing float-to-integer conversion. ++(define_insn "@aarch64_sve__trunc" ++ [(set (match_operand:VNx4SI_ONLY 0 "register_operand" "=w") ++ (unspec:VNx4SI_ONLY ++ [(match_operand:VNx2BI 1 "register_operand" "Upl") ++ (match_operand:SI 3 "aarch64_sve_gp_strictness") ++ (match_operand:VNx2DF_ONLY 2 "register_operand" "w")] ++ SVE_COND_FCVTI))] ++ "TARGET_SVE" ++ "fcvtz\t%0., %1/m, %2." ++) ++ ++;; Predicated float-to-integer conversion with merging, either to the same ++;; width or wider. ++(define_expand "@cond__nontrunc" ++ [(set (match_operand:SVE_FULL_HSDI 0 "register_operand") ++ (unspec:SVE_FULL_HSDI ++ [(match_operand: 1 "register_operand") ++ (unspec:SVE_FULL_HSDI ++ [(match_dup 1) ++ (const_int SVE_STRICT_GP) ++ (match_operand:SVE_FULL_F 2 "register_operand")] ++ SVE_COND_FCVTI) ++ (match_operand:SVE_FULL_HSDI 3 "aarch64_simd_reg_or_zero")] ++ UNSPEC_SEL))] ++ "TARGET_SVE && >= " ++) ++ ++;; The first alternative doesn't need the earlyclobber, but the only case ++;; it would help is the uninteresting one in which operands 2 and 3 are ++;; the same register (despite having different modes). Making all the ++;; alternatives earlyclobber makes things more consistent for the ++;; register allocator. ++(define_insn_and_rewrite "*cond__nontrunc" ++ [(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=&w, &w, ?&w") ++ (unspec:SVE_FULL_HSDI ++ [(match_operand: 1 "register_operand" "Upl, Upl, Upl") ++ (unspec:SVE_FULL_HSDI ++ [(match_operand 4) ++ (match_operand:SI 5 "aarch64_sve_gp_strictness") ++ (match_operand:SVE_FULL_F 2 "register_operand" "w, w, w")] ++ SVE_COND_FCVTI) ++ (match_operand:SVE_FULL_HSDI 3 "aarch64_simd_reg_or_zero" "0, Dz, w")] ++ UNSPEC_SEL))] ++ "TARGET_SVE ++ && >= ++ && aarch64_sve_pred_dominates_p (&operands[4], operands[1])" ++ "@ ++ fcvtz\t%0., %1/m, %2. ++ movprfx\t%0., %1/z, %2.\;fcvtz\t%0., %1/m, %2. ++ movprfx\t%0, %3\;fcvtz\t%0., %1/m, %2." ++ "&& !rtx_equal_p (operands[1], operands[4])" + { +- operands[2] = force_reg (mode, CONSTM1_RTX (mode)); ++ operands[4] = copy_rtx (operands[1]); + } ++ [(set_attr "movprfx" "*,yes,yes")] ++) ++ ++;; Predicated narrowing float-to-integer conversion with merging. ++(define_expand "@cond__trunc" ++ [(set (match_operand:VNx4SI_ONLY 0 "register_operand") ++ (unspec:VNx4SI_ONLY ++ [(match_operand:VNx2BI 1 "register_operand") ++ (unspec:VNx4SI_ONLY ++ [(match_dup 1) ++ (const_int SVE_STRICT_GP) ++ (match_operand:VNx2DF_ONLY 2 "register_operand")] ++ SVE_COND_FCVTI) ++ (match_operand:VNx4SI_ONLY 3 "aarch64_simd_reg_or_zero")] ++ UNSPEC_SEL))] ++ "TARGET_SVE" + ) + +-(define_insn "*reduc__scal_" +- [(set (match_operand: 0 "register_operand" "=w") +- (unspec: [(match_operand: 1 "register_operand" "Upl") +- (match_operand:SVE_I 2 "register_operand" "w")] +- BITWISEV))] ++(define_insn "*cond__trunc" ++ [(set (match_operand:VNx4SI_ONLY 0 "register_operand" "=&w, &w, ?&w") ++ (unspec:VNx4SI_ONLY ++ [(match_operand:VNx2BI 1 "register_operand" "Upl, Upl, Upl") ++ (unspec:VNx4SI_ONLY ++ [(match_dup 1) ++ (match_operand:SI 4 "aarch64_sve_gp_strictness") ++ (match_operand:VNx2DF_ONLY 2 "register_operand" "w, w, w")] ++ SVE_COND_FCVTI) ++ (match_operand:VNx4SI_ONLY 3 "aarch64_simd_reg_or_zero" "0, Dz, w")] ++ UNSPEC_SEL))] + "TARGET_SVE" +- "\t%0, %1, %2." ++ "@ ++ fcvtz\t%0., %1/m, %2. 
++ movprfx\t%0., %1/z, %2.\;fcvtz\t%0., %1/m, %2. ++ movprfx\t%0, %3\;fcvtz\t%0., %1/m, %2." ++ [(set_attr "movprfx" "*,yes,yes")] + ) + +-;; Unpredicated in-order FP reductions. +-(define_expand "fold_left_plus_" +- [(set (match_operand: 0 "register_operand") +- (unspec: [(match_dup 3) +- (match_operand: 1 "register_operand") +- (match_operand:SVE_F 2 "register_operand")] +- UNSPEC_FADDA))] ++;; ------------------------------------------------------------------------- ++;; ---- [INT<-FP] Packs ++;; ------------------------------------------------------------------------- ++;; The patterns in this section are synthetic. ++;; ------------------------------------------------------------------------- ++ ++;; Convert two vectors of DF to SI and pack the results into a single vector. ++(define_expand "vec_pack_fix_trunc_vnx2df" ++ [(set (match_dup 4) ++ (unspec:VNx4SI ++ [(match_dup 3) ++ (const_int SVE_RELAXED_GP) ++ (match_operand:VNx2DF 1 "register_operand")] ++ SVE_COND_FCVTI)) ++ (set (match_dup 5) ++ (unspec:VNx4SI ++ [(match_dup 3) ++ (const_int SVE_RELAXED_GP) ++ (match_operand:VNx2DF 2 "register_operand")] ++ SVE_COND_FCVTI)) ++ (set (match_operand:VNx4SI 0 "register_operand") ++ (unspec:VNx4SI [(match_dup 4) (match_dup 5)] UNSPEC_UZP1))] + "TARGET_SVE" + { +- operands[3] = force_reg (mode, CONSTM1_RTX (mode)); ++ operands[3] = aarch64_ptrue_reg (VNx2BImode); ++ operands[4] = gen_reg_rtx (VNx4SImode); ++ operands[5] = gen_reg_rtx (VNx4SImode); + } + ) + +-;; In-order FP reductions predicated with PTRUE. +-(define_insn "*fold_left_plus_" +- [(set (match_operand: 0 "register_operand" "=w") +- (unspec: [(match_operand: 1 "register_operand" "Upl") +- (match_operand: 2 "register_operand" "0") +- (match_operand:SVE_F 3 "register_operand" "w")] +- UNSPEC_FADDA))] +- "TARGET_SVE" +- "fadda\t%0, %1, %0, %3." +-) ++;; ------------------------------------------------------------------------- ++;; ---- [INT<-FP] Unpacks ++;; ------------------------------------------------------------------------- ++;; No patterns here yet! ++;; ------------------------------------------------------------------------- + +-;; Predicated form of the above in-order reduction. +-(define_insn "*pred_fold_left_plus_" +- [(set (match_operand: 0 "register_operand" "=w") +- (unspec: +- [(match_operand: 1 "register_operand" "0") +- (unspec:SVE_F +- [(match_operand: 2 "register_operand" "Upl") +- (match_operand:SVE_F 3 "register_operand" "w") +- (match_operand:SVE_F 4 "aarch64_simd_imm_zero")] +- UNSPEC_SEL)] +- UNSPEC_FADDA))] +- "TARGET_SVE" +- "fadda\t%0, %2, %0, %3." +-) ++;; ------------------------------------------------------------------------- ++;; ---- [FP<-INT] Conversions ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - SCVTF ++;; - UCVTF ++;; ------------------------------------------------------------------------- + +-;; Unpredicated floating-point addition. +-(define_expand "add3" +- [(set (match_operand:SVE_F 0 "register_operand") +- (unspec:SVE_F +- [(match_dup 3) +- (plus:SVE_F +- (match_operand:SVE_F 1 "register_operand") +- (match_operand:SVE_F 2 "aarch64_sve_float_arith_with_sub_operand"))] +- UNSPEC_MERGE_PTRUE))] ++;; Unpredicated conversion of integers to floats of the same size ++;; (HI to HF, SI to SF or DI to DF). 
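A C-level view of the same-width conversions handled here, as a sketch (assumes arm_sve.h); the expander wraps the operation in a PTRUE so that the predicated SCVTF/UCVTF patterns can be reused:

#include <arm_sve.h>

/* SI to SF conversion over a whole vector.  The _x form, like the
   relaxed governing predicate in the expander, does not care what
   happens to inactive lanes.  */
svfloat32_t
to_float (svint32_t vec)
{
  svbool_t pg = svptrue_b32 ();
  return svcvt_f32_x (pg, vec);   /* expected to map to SCVTF z0.s, p0/m, z1.s */
}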
++(define_expand "2" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand") ++ (unspec:SVE_FULL_F ++ [(match_dup 2) ++ (const_int SVE_RELAXED_GP) ++ (match_operand: 1 "register_operand")] ++ SVE_COND_ICVTF))] + "TARGET_SVE" + { +- operands[3] = force_reg (mode, CONSTM1_RTX (mode)); ++ operands[2] = aarch64_ptrue_reg (mode); + } + ) + +-;; Floating-point addition predicated with a PTRUE. +-(define_insn_and_split "*add3" +- [(set (match_operand:SVE_F 0 "register_operand" "=w, w, w") +- (unspec:SVE_F +- [(match_operand: 1 "register_operand" "Upl, Upl, Upl") +- (plus:SVE_F +- (match_operand:SVE_F 2 "register_operand" "%0, 0, w") +- (match_operand:SVE_F 3 "aarch64_sve_float_arith_with_sub_operand" "vsA, vsN, w"))] +- UNSPEC_MERGE_PTRUE))] +- "TARGET_SVE" +- "@ +- fadd\t%0., %1/m, %0., #%3 +- fsub\t%0., %1/m, %0., #%N3 +- #" +- ; Split the unpredicated form after reload, so that we don't have +- ; the unnecessary PTRUE. +- "&& reload_completed +- && register_operand (operands[3], mode)" +- [(set (match_dup 0) (plus:SVE_F (match_dup 2) (match_dup 3)))] ++;; Predicated integer-to-float conversion, either to the same width or ++;; narrower. ++(define_insn "@aarch64_sve__nonextend" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w") ++ (unspec:SVE_FULL_F ++ [(match_operand: 1 "register_operand" "Upl") ++ (match_operand:SI 3 "aarch64_sve_gp_strictness") ++ (match_operand:SVE_FULL_HSDI 2 "register_operand" "w")] ++ SVE_COND_ICVTF))] ++ "TARGET_SVE && >= " ++ "cvtf\t%0., %1/m, %2." + ) + +-;; Unpredicated floating-point subtraction. +-(define_expand "sub3" +- [(set (match_operand:SVE_F 0 "register_operand") +- (unspec:SVE_F +- [(match_dup 3) +- (minus:SVE_F +- (match_operand:SVE_F 1 "aarch64_sve_float_arith_operand") +- (match_operand:SVE_F 2 "register_operand"))] +- UNSPEC_MERGE_PTRUE))] +- "TARGET_SVE" +- { +- operands[3] = force_reg (mode, CONSTM1_RTX (mode)); +- } ++;; Predicated widening integer-to-float conversion. ++(define_insn "@aarch64_sve__extend" ++ [(set (match_operand:VNx2DF_ONLY 0 "register_operand" "=w") ++ (unspec:VNx2DF_ONLY ++ [(match_operand:VNx2BI 1 "register_operand" "Upl") ++ (match_operand:SI 3 "aarch64_sve_gp_strictness") ++ (match_operand:VNx4SI_ONLY 2 "register_operand" "w")] ++ SVE_COND_ICVTF))] ++ "TARGET_SVE" ++ "cvtf\t%0., %1/m, %2." ++) ++ ++;; Predicated integer-to-float conversion with merging, either to the same ++;; width or narrower. ++(define_expand "@cond__nonextend" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand") ++ (unspec:SVE_FULL_F ++ [(match_operand: 1 "register_operand") ++ (unspec:SVE_FULL_F ++ [(match_dup 1) ++ (const_int SVE_STRICT_GP) ++ (match_operand:SVE_FULL_HSDI 2 "register_operand")] ++ SVE_COND_ICVTF) ++ (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero")] ++ UNSPEC_SEL))] ++ "TARGET_SVE && >= " + ) + +-;; Floating-point subtraction predicated with a PTRUE. +-(define_insn_and_split "*sub3" +- [(set (match_operand:SVE_F 0 "register_operand" "=w, w, w, w") +- (unspec:SVE_F +- [(match_operand: 1 "register_operand" "Upl, Upl, Upl, Upl") +- (minus:SVE_F +- (match_operand:SVE_F 2 "aarch64_sve_float_arith_operand" "0, 0, vsA, w") +- (match_operand:SVE_F 3 "aarch64_sve_float_arith_with_sub_operand" "vsA, vsN, 0, w"))] +- UNSPEC_MERGE_PTRUE))] ++;; The first alternative doesn't need the earlyclobber, but the only case ++;; it would help is the uninteresting one in which operands 2 and 3 are ++;; the same register (despite having different modes). 
Making all the ++;; alternatives earlyclobber makes things more consistent for the ++;; register allocator. ++(define_insn_and_rewrite "*cond__nonextend" ++ [(set (match_operand:SVE_FULL_F 0 "register_operand" "=&w, &w, ?&w") ++ (unspec:SVE_FULL_F ++ [(match_operand: 1 "register_operand" "Upl, Upl, Upl") ++ (unspec:SVE_FULL_F ++ [(match_operand 4) ++ (match_operand:SI 5 "aarch64_sve_gp_strictness") ++ (match_operand:SVE_FULL_HSDI 2 "register_operand" "w, w, w")] ++ SVE_COND_ICVTF) ++ (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero" "0, Dz, w")] ++ UNSPEC_SEL))] + "TARGET_SVE +- && (register_operand (operands[2], mode) +- || register_operand (operands[3], mode))" ++ && >= ++ && aarch64_sve_pred_dominates_p (&operands[4], operands[1])" + "@ +- fsub\t%0., %1/m, %0., #%3 +- fadd\t%0., %1/m, %0., #%N3 +- fsubr\t%0., %1/m, %0., #%2 +- #" +- ; Split the unpredicated form after reload, so that we don't have +- ; the unnecessary PTRUE. +- "&& reload_completed +- && register_operand (operands[2], mode) +- && register_operand (operands[3], mode)" +- [(set (match_dup 0) (minus:SVE_F (match_dup 2) (match_dup 3)))] +-) +- +-;; Unpredicated floating-point multiplication. +-(define_expand "mul3" +- [(set (match_operand:SVE_F 0 "register_operand") +- (unspec:SVE_F +- [(match_dup 3) +- (mult:SVE_F +- (match_operand:SVE_F 1 "register_operand") +- (match_operand:SVE_F 2 "aarch64_sve_float_mul_operand"))] +- UNSPEC_MERGE_PTRUE))] +- "TARGET_SVE" ++ cvtf\t%0., %1/m, %2. ++ movprfx\t%0., %1/z, %2.\;cvtf\t%0., %1/m, %2. ++ movprfx\t%0, %3\;cvtf\t%0., %1/m, %2." ++ "&& !rtx_equal_p (operands[1], operands[4])" + { +- operands[3] = force_reg (mode, CONSTM1_RTX (mode)); ++ operands[4] = copy_rtx (operands[1]); + } ++ [(set_attr "movprfx" "*,yes,yes")] ++) ++ ++;; Predicated widening integer-to-float conversion with merging. ++(define_expand "@cond__extend" ++ [(set (match_operand:VNx2DF_ONLY 0 "register_operand") ++ (unspec:VNx2DF_ONLY ++ [(match_operand:VNx2BI 1 "register_operand") ++ (unspec:VNx2DF_ONLY ++ [(match_dup 1) ++ (const_int SVE_STRICT_GP) ++ (match_operand:VNx4SI_ONLY 2 "register_operand")] ++ SVE_COND_ICVTF) ++ (match_operand:VNx2DF_ONLY 3 "aarch64_simd_reg_or_zero")] ++ UNSPEC_SEL))] ++ "TARGET_SVE" + ) + +-;; Floating-point multiplication predicated with a PTRUE. +-(define_insn_and_split "*mul3" +- [(set (match_operand:SVE_F 0 "register_operand" "=w, w") +- (unspec:SVE_F +- [(match_operand: 1 "register_operand" "Upl, Upl") +- (mult:SVE_F +- (match_operand:SVE_F 2 "register_operand" "%0, w") +- (match_operand:SVE_F 3 "aarch64_sve_float_mul_operand" "vsM, w"))] +- UNSPEC_MERGE_PTRUE))] ++(define_insn "*cond__extend" ++ [(set (match_operand:VNx2DF_ONLY 0 "register_operand" "=w, ?&w, ?&w") ++ (unspec:VNx2DF_ONLY ++ [(match_operand:VNx2BI 1 "register_operand" "Upl, Upl, Upl") ++ (unspec:VNx2DF_ONLY ++ [(match_dup 1) ++ (match_operand:SI 4 "aarch64_sve_gp_strictness") ++ (match_operand:VNx4SI_ONLY 2 "register_operand" "w, w, w")] ++ SVE_COND_ICVTF) ++ (match_operand:VNx2DF_ONLY 3 "aarch64_simd_reg_or_zero" "0, Dz, w")] ++ UNSPEC_SEL))] + "TARGET_SVE" + "@ +- fmul\t%0., %1/m, %0., #%3 +- #" +- ; Split the unpredicated form after reload, so that we don't have +- ; the unnecessary PTRUE. +- "&& reload_completed +- && register_operand (operands[3], mode)" +- [(set (match_dup 0) (mult:SVE_F (match_dup 2) (match_dup 3)))] ++ cvtf\t%0., %1/m, %2. ++ movprfx\t%0., %1/z, %2.\;cvtf\t%0., %1/m, %2. ++ movprfx\t%0, %3\;cvtf\t%0., %1/m, %2." 
++ [(set_attr "movprfx" "*,yes,yes")] + ) + +-;; Unpredicated floating-point binary operations (post-RA only). +-;; These are generated by splitting a predicated instruction whose +-;; predicate is unused. +-(define_insn "*post_ra_3" +- [(set (match_operand:SVE_F 0 "register_operand" "=w") +- (SVE_UNPRED_FP_BINARY:SVE_F +- (match_operand:SVE_F 1 "register_operand" "w") +- (match_operand:SVE_F 2 "register_operand" "w")))] +- "TARGET_SVE && reload_completed" +- "\t%0., %1., %2.") ++;; ------------------------------------------------------------------------- ++;; ---- [FP<-INT] Packs ++;; ------------------------------------------------------------------------- ++;; No patterns here yet! ++;; ------------------------------------------------------------------------- + +-;; Unpredicated fma (%0 = (%1 * %2) + %3). +-(define_expand "fma4" +- [(set (match_operand:SVE_F 0 "register_operand") +- (unspec:SVE_F +- [(match_dup 4) +- (fma:SVE_F (match_operand:SVE_F 1 "register_operand") +- (match_operand:SVE_F 2 "register_operand") +- (match_operand:SVE_F 3 "register_operand"))] +- UNSPEC_MERGE_PTRUE))] ++;; ------------------------------------------------------------------------- ++;; ---- [FP<-INT] Unpacks ++;; ------------------------------------------------------------------------- ++;; The patterns in this section are synthetic. ++;; ------------------------------------------------------------------------- ++ ++;; Unpack one half of a VNx4SI to VNx2DF. First unpack from VNx4SI ++;; to VNx2DI, reinterpret the VNx2DI as a VNx4SI, then convert the ++;; unpacked VNx4SI to VNx2DF. ++(define_expand "vec_unpack_float__vnx4si" ++ [(match_operand:VNx2DF 0 "register_operand") ++ (FLOATUORS:VNx2DF ++ (unspec:VNx2DI [(match_operand:VNx4SI 1 "register_operand")] ++ UNPACK_UNSIGNED))] + "TARGET_SVE" + { +- operands[4] = force_reg (mode, CONSTM1_RTX (mode)); ++ /* Use ZIP to do the unpack, since we don't care about the upper halves ++ and since it has the nice property of not needing any subregs. ++ If using UUNPK* turns out to be preferable, we could model it as ++ a ZIP whose first operand is zero. */ ++ rtx temp = gen_reg_rtx (VNx4SImode); ++ emit_insn (( ++ ? gen_aarch64_sve_zip2vnx4si ++ : gen_aarch64_sve_zip1vnx4si) ++ (temp, operands[1], operands[1])); ++ rtx ptrue = aarch64_ptrue_reg (VNx2BImode); ++ rtx strictness = gen_int_mode (SVE_RELAXED_GP, SImode); ++ emit_insn (gen_aarch64_sve__extendvnx4sivnx2df ++ (operands[0], ptrue, temp, strictness)); ++ DONE; + } + ) + +-;; fma predicated with a PTRUE. +-(define_insn "*fma4" +- [(set (match_operand:SVE_F 0 "register_operand" "=w, w, ?&w") +- (unspec:SVE_F +- [(match_operand: 1 "register_operand" "Upl, Upl, Upl") +- (fma:SVE_F (match_operand:SVE_F 3 "register_operand" "%0, w, w") +- (match_operand:SVE_F 4 "register_operand" "w, w, w") +- (match_operand:SVE_F 2 "register_operand" "w, 0, w"))] +- UNSPEC_MERGE_PTRUE))] +- "TARGET_SVE" +- "@ +- fmad\t%0., %1/m, %4., %2. +- fmla\t%0., %1/m, %3., %4. +- movprfx\t%0, %2\;fmla\t%0., %1/m, %3., %4." +- [(set_attr "movprfx" "*,*,yes")] +-) ++;; ------------------------------------------------------------------------- ++;; ---- [FP<-FP] Packs ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - FCVT ++;; ------------------------------------------------------------------------- + +-;; Unpredicated fnma (%0 = (-%1 * %2) + %3). 
+-(define_expand "fnma4" +- [(set (match_operand:SVE_F 0 "register_operand") +- (unspec:SVE_F +- [(match_dup 4) +- (fma:SVE_F (neg:SVE_F +- (match_operand:SVE_F 1 "register_operand")) +- (match_operand:SVE_F 2 "register_operand") +- (match_operand:SVE_F 3 "register_operand"))] +- UNSPEC_MERGE_PTRUE))] ++;; Convert two vectors of DF to SF, or two vectors of SF to HF, and pack ++;; the results into a single vector. ++(define_expand "vec_pack_trunc_" ++ [(set (match_dup 4) ++ (unspec:SVE_FULL_HSF ++ [(match_dup 3) ++ (const_int SVE_RELAXED_GP) ++ (match_operand: 1 "register_operand")] ++ UNSPEC_COND_FCVT)) ++ (set (match_dup 5) ++ (unspec:SVE_FULL_HSF ++ [(match_dup 3) ++ (const_int SVE_RELAXED_GP) ++ (match_operand: 2 "register_operand")] ++ UNSPEC_COND_FCVT)) ++ (set (match_operand:SVE_FULL_HSF 0 "register_operand") ++ (unspec:SVE_FULL_HSF [(match_dup 4) (match_dup 5)] UNSPEC_UZP1))] + "TARGET_SVE" + { +- operands[4] = force_reg (mode, CONSTM1_RTX (mode)); ++ operands[3] = aarch64_ptrue_reg (mode); ++ operands[4] = gen_reg_rtx (mode); ++ operands[5] = gen_reg_rtx (mode); + } + ) + +-;; fnma predicated with a PTRUE. +-(define_insn "*fnma4" +- [(set (match_operand:SVE_F 0 "register_operand" "=w, w, ?&w") +- (unspec:SVE_F +- [(match_operand: 1 "register_operand" "Upl, Upl, Upl") +- (fma:SVE_F (neg:SVE_F +- (match_operand:SVE_F 3 "register_operand" "%0, w, w")) +- (match_operand:SVE_F 4 "register_operand" "w, w, w") +- (match_operand:SVE_F 2 "register_operand" "w, 0, w"))] +- UNSPEC_MERGE_PTRUE))] +- "TARGET_SVE" ++;; Predicated float-to-float truncation. ++(define_insn "@aarch64_sve__trunc" ++ [(set (match_operand:SVE_FULL_HSF 0 "register_operand" "=w") ++ (unspec:SVE_FULL_HSF ++ [(match_operand: 1 "register_operand" "Upl") ++ (match_operand:SI 3 "aarch64_sve_gp_strictness") ++ (match_operand:SVE_FULL_SDF 2 "register_operand" "w")] ++ SVE_COND_FCVT))] ++ "TARGET_SVE && > " ++ "fcvt\t%0., %1/m, %2." ++) ++ ++;; Predicated float-to-float truncation with merging. ++(define_expand "@cond__trunc" ++ [(set (match_operand:SVE_FULL_HSF 0 "register_operand") ++ (unspec:SVE_FULL_HSF ++ [(match_operand: 1 "register_operand") ++ (unspec:SVE_FULL_HSF ++ [(match_dup 1) ++ (const_int SVE_STRICT_GP) ++ (match_operand:SVE_FULL_SDF 2 "register_operand")] ++ SVE_COND_FCVT) ++ (match_operand:SVE_FULL_HSF 3 "aarch64_simd_reg_or_zero")] ++ UNSPEC_SEL))] ++ "TARGET_SVE && > " ++) ++ ++(define_insn "*cond__trunc" ++ [(set (match_operand:SVE_FULL_HSF 0 "register_operand" "=w, ?&w, ?&w") ++ (unspec:SVE_FULL_HSF ++ [(match_operand: 1 "register_operand" "Upl, Upl, Upl") ++ (unspec:SVE_FULL_HSF ++ [(match_dup 1) ++ (match_operand:SI 4 "aarch64_sve_gp_strictness") ++ (match_operand:SVE_FULL_SDF 2 "register_operand" "w, w, w")] ++ SVE_COND_FCVT) ++ (match_operand:SVE_FULL_HSF 3 "aarch64_simd_reg_or_zero" "0, Dz, w")] ++ UNSPEC_SEL))] ++ "TARGET_SVE && > " + "@ +- fmsb\t%0., %1/m, %4., %2. +- fmls\t%0., %1/m, %3., %4. +- movprfx\t%0, %2\;fmls\t%0., %1/m, %3., %4." +- [(set_attr "movprfx" "*,*,yes")] ++ fcvt\t%0., %1/m, %2. ++ movprfx\t%0., %1/z, %2.\;fcvt\t%0., %1/m, %2. ++ movprfx\t%0, %3\;fcvt\t%0., %1/m, %2." ++ [(set_attr "movprfx" "*,yes,yes")] ++) ++ ++;; ------------------------------------------------------------------------- ++;; ---- [FP<-FP] Packs (bfloat16) ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - BFCVT (BF16) ++;; - BFCVTNT (BF16) ++;; ------------------------------------------------------------------------- ++ ++;; Predicated BFCVT. 
++(define_insn "@aarch64_sve__trunc" ++ [(set (match_operand:VNx8BF_ONLY 0 "register_operand" "=w") ++ (unspec:VNx8BF_ONLY ++ [(match_operand:VNx4BI 1 "register_operand" "Upl") ++ (match_operand:SI 3 "aarch64_sve_gp_strictness") ++ (match_operand:VNx4SF_ONLY 2 "register_operand" "w")] ++ SVE_COND_FCVT))] ++ "TARGET_SVE_BF16" ++ "bfcvt\t%0.h, %1/m, %2.s" ++) ++ ++;; Predicated BFCVT with merging. ++(define_expand "@cond__trunc" ++ [(set (match_operand:VNx8BF_ONLY 0 "register_operand") ++ (unspec:VNx8BF_ONLY ++ [(match_operand:VNx4BI 1 "register_operand") ++ (unspec:VNx8BF_ONLY ++ [(match_dup 1) ++ (const_int SVE_STRICT_GP) ++ (match_operand:VNx4SF_ONLY 2 "register_operand")] ++ SVE_COND_FCVT) ++ (match_operand:VNx8BF_ONLY 3 "aarch64_simd_reg_or_zero")] ++ UNSPEC_SEL))] ++ "TARGET_SVE_BF16" ++) ++ ++(define_insn "*cond__trunc" ++ [(set (match_operand:VNx8BF_ONLY 0 "register_operand" "=w, ?&w, ?&w") ++ (unspec:VNx8BF_ONLY ++ [(match_operand:VNx4BI 1 "register_operand" "Upl, Upl, Upl") ++ (unspec:VNx8BF_ONLY ++ [(match_dup 1) ++ (match_operand:SI 4 "aarch64_sve_gp_strictness") ++ (match_operand:VNx4SF_ONLY 2 "register_operand" "w, w, w")] ++ SVE_COND_FCVT) ++ (match_operand:VNx8BF_ONLY 3 "aarch64_simd_reg_or_zero" "0, Dz, w")] ++ UNSPEC_SEL))] ++ "TARGET_SVE_BF16" ++ "@ ++ bfcvt\t%0.h, %1/m, %2.s ++ movprfx\t%0.s, %1/z, %2.s\;bfcvt\t%0.h, %1/m, %2.s ++ movprfx\t%0, %3\;bfcvt\t%0.h, %1/m, %2.s" ++ [(set_attr "movprfx" "*,yes,yes")] + ) + +-;; Unpredicated fms (%0 = (%1 * %2) - %3). +-(define_expand "fms4" +- [(set (match_operand:SVE_F 0 "register_operand") +- (unspec:SVE_F +- [(match_dup 4) +- (fma:SVE_F (match_operand:SVE_F 1 "register_operand") +- (match_operand:SVE_F 2 "register_operand") +- (neg:SVE_F +- (match_operand:SVE_F 3 "register_operand")))] +- UNSPEC_MERGE_PTRUE))] ++;; Predicated BFCVTNT. This doesn't give a natural aarch64_pred_*/cond_* ++;; pair because the even elements always have to be supplied for active ++;; elements, even if the inactive elements don't matter. ++;; ++;; This instructions does not take MOVPRFX. ++(define_insn "@aarch64_sve_cvtnt" ++ [(set (match_operand:VNx8BF_ONLY 0 "register_operand" "=w") ++ (unspec:VNx8BF_ONLY ++ [(match_operand:VNx4BI 2 "register_operand" "Upl") ++ (const_int SVE_STRICT_GP) ++ (match_operand:VNx8BF_ONLY 1 "register_operand" "0") ++ (match_operand:VNx4SF 3 "register_operand" "w")] ++ UNSPEC_COND_FCVTNT))] ++ "TARGET_SVE_BF16" ++ "bfcvtnt\t%0.h, %2/m, %3.s" ++) ++ ++;; ------------------------------------------------------------------------- ++;; ---- [FP<-FP] Unpacks ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - FCVT ++;; ------------------------------------------------------------------------- ++ ++;; Unpack one half of a VNx4SF to VNx2DF, or one half of a VNx8HF to VNx4SF. ++;; First unpack the source without conversion, then float-convert the ++;; unpacked source. ++(define_expand "vec_unpacks__" ++ [(match_operand: 0 "register_operand") ++ (unspec:SVE_FULL_HSF ++ [(match_operand:SVE_FULL_HSF 1 "register_operand")] ++ UNPACK_UNSIGNED)] + "TARGET_SVE" + { +- operands[4] = force_reg (mode, CONSTM1_RTX (mode)); ++ /* Use ZIP to do the unpack, since we don't care about the upper halves ++ and since it has the nice property of not needing any subregs. ++ If using UUNPK* turns out to be preferable, we could model it as ++ a ZIP whose first operand is zero. */ ++ rtx temp = gen_reg_rtx (mode); ++ emit_insn (( ++ ? 
gen_aarch64_sve_zip2 ++ : gen_aarch64_sve_zip1) ++ (temp, operands[1], operands[1])); ++ rtx ptrue = aarch64_ptrue_reg (mode); ++ rtx strictness = gen_int_mode (SVE_RELAXED_GP, SImode); ++ emit_insn (gen_aarch64_sve_fcvt_nontrunc ++ (operands[0], ptrue, temp, strictness)); ++ DONE; + } + ) + +-;; fms predicated with a PTRUE. +-(define_insn "*fms4" +- [(set (match_operand:SVE_F 0 "register_operand" "=w, w, ?&w") +- (unspec:SVE_F +- [(match_operand: 1 "register_operand" "Upl, Upl, Upl") +- (fma:SVE_F (match_operand:SVE_F 3 "register_operand" "%0, w, w") +- (match_operand:SVE_F 4 "register_operand" "w, w, w") +- (neg:SVE_F +- (match_operand:SVE_F 2 "register_operand" "w, 0, w")))] +- UNSPEC_MERGE_PTRUE))] +- "TARGET_SVE" ++;; Predicated float-to-float extension. ++(define_insn "@aarch64_sve__nontrunc" ++ [(set (match_operand:SVE_FULL_SDF 0 "register_operand" "=w") ++ (unspec:SVE_FULL_SDF ++ [(match_operand: 1 "register_operand" "Upl") ++ (match_operand:SI 3 "aarch64_sve_gp_strictness") ++ (match_operand:SVE_FULL_HSF 2 "register_operand" "w")] ++ SVE_COND_FCVT))] ++ "TARGET_SVE && > " ++ "fcvt\t%0., %1/m, %2." ++) ++ ++;; Predicated float-to-float extension with merging. ++(define_expand "@cond__nontrunc" ++ [(set (match_operand:SVE_FULL_SDF 0 "register_operand") ++ (unspec:SVE_FULL_SDF ++ [(match_operand: 1 "register_operand") ++ (unspec:SVE_FULL_SDF ++ [(match_dup 1) ++ (const_int SVE_STRICT_GP) ++ (match_operand:SVE_FULL_HSF 2 "register_operand")] ++ SVE_COND_FCVT) ++ (match_operand:SVE_FULL_SDF 3 "aarch64_simd_reg_or_zero")] ++ UNSPEC_SEL))] ++ "TARGET_SVE && > " ++) ++ ++(define_insn "*cond__nontrunc" ++ [(set (match_operand:SVE_FULL_SDF 0 "register_operand" "=w, ?&w, ?&w") ++ (unspec:SVE_FULL_SDF ++ [(match_operand: 1 "register_operand" "Upl, Upl, Upl") ++ (unspec:SVE_FULL_SDF ++ [(match_dup 1) ++ (match_operand:SI 4 "aarch64_sve_gp_strictness") ++ (match_operand:SVE_FULL_HSF 2 "register_operand" "w, w, w")] ++ SVE_COND_FCVT) ++ (match_operand:SVE_FULL_SDF 3 "aarch64_simd_reg_or_zero" "0, Dz, w")] ++ UNSPEC_SEL))] ++ "TARGET_SVE && > " + "@ +- fnmsb\t%0., %1/m, %4., %2. +- fnmls\t%0., %1/m, %3., %4. +- movprfx\t%0, %2\;fnmls\t%0., %1/m, %3., %4." +- [(set_attr "movprfx" "*,*,yes")] ++ fcvt\t%0., %1/m, %2. ++ movprfx\t%0., %1/z, %2.\;fcvt\t%0., %1/m, %2. ++ movprfx\t%0, %3\;fcvt\t%0., %1/m, %2." ++ [(set_attr "movprfx" "*,yes,yes")] + ) + +-;; Unpredicated fnms (%0 = (-%1 * %2) - %3). +-(define_expand "fnms4" +- [(set (match_operand:SVE_F 0 "register_operand") +- (unspec:SVE_F +- [(match_dup 4) +- (fma:SVE_F (neg:SVE_F +- (match_operand:SVE_F 1 "register_operand")) +- (match_operand:SVE_F 2 "register_operand") +- (neg:SVE_F +- (match_operand:SVE_F 3 "register_operand")))] +- UNSPEC_MERGE_PTRUE))] +- "TARGET_SVE" +- { +- operands[4] = force_reg (mode, CONSTM1_RTX (mode)); +- } +-) ++;; ------------------------------------------------------------------------- ++;; ---- [PRED<-PRED] Packs ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - UZP1 ++;; ------------------------------------------------------------------------- + +-;; fnms predicated with a PTRUE. 
+-(define_insn "*fnms4" +- [(set (match_operand:SVE_F 0 "register_operand" "=w, w, ?&w") +- (unspec:SVE_F +- [(match_operand: 1 "register_operand" "Upl, Upl, Upl") +- (fma:SVE_F (neg:SVE_F +- (match_operand:SVE_F 3 "register_operand" "%0, w, w")) +- (match_operand:SVE_F 4 "register_operand" "w, w, w") +- (neg:SVE_F +- (match_operand:SVE_F 2 "register_operand" "w, 0, w")))] +- UNSPEC_MERGE_PTRUE))] ++;; Predicate pack. Use UZP1 on the narrower type, which discards ++;; the high part of each wide element. ++(define_insn "vec_pack_trunc_" ++ [(set (match_operand:PRED_BHS 0 "register_operand" "=Upa") ++ (unspec:PRED_BHS ++ [(match_operand: 1 "register_operand" "Upa") ++ (match_operand: 2 "register_operand" "Upa")] ++ UNSPEC_PACK))] + "TARGET_SVE" +- "@ +- fnmad\t%0., %1/m, %4., %2. +- fnmla\t%0., %1/m, %3., %4. +- movprfx\t%0, %2\;fnmla\t%0., %1/m, %3., %4." +- [(set_attr "movprfx" "*,*,yes")] ++ "uzp1\t%0., %1., %2." + ) + +-;; Unpredicated floating-point division. +-(define_expand "div3" +- [(set (match_operand:SVE_F 0 "register_operand") +- (unspec:SVE_F +- [(match_dup 3) +- (div:SVE_F (match_operand:SVE_F 1 "register_operand") +- (match_operand:SVE_F 2 "register_operand"))] +- UNSPEC_MERGE_PTRUE))] ++;; ------------------------------------------------------------------------- ++;; ---- [PRED<-PRED] Unpacks ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - PUNPKHI ++;; - PUNPKLO ++;; ------------------------------------------------------------------------- ++ ++;; Unpack the low or high half of a predicate, where "high" refers to ++;; the low-numbered lanes for big-endian and the high-numbered lanes ++;; for little-endian. ++(define_expand "vec_unpack__" ++ [(match_operand: 0 "register_operand") ++ (unspec: [(match_operand:PRED_BHS 1 "register_operand")] ++ UNPACK)] + "TARGET_SVE" + { +- operands[3] = force_reg (mode, CONSTM1_RTX (mode)); ++ emit_insn (( ++ ? gen_aarch64_sve_punpkhi_ ++ : gen_aarch64_sve_punpklo_) ++ (operands[0], operands[1])); ++ DONE; + } + ) + +-;; Floating-point division predicated with a PTRUE. +-(define_insn "*div3" +- [(set (match_operand:SVE_F 0 "register_operand" "=w, w, ?&w") +- (unspec:SVE_F +- [(match_operand: 1 "register_operand" "Upl, Upl, Upl") +- (div:SVE_F (match_operand:SVE_F 2 "register_operand" "0, w, w") +- (match_operand:SVE_F 3 "register_operand" "w, 0, w"))] +- UNSPEC_MERGE_PTRUE))] ++(define_insn "@aarch64_sve_punpk_" ++ [(set (match_operand: 0 "register_operand" "=Upa") ++ (unspec: [(match_operand:PRED_BHS 1 "register_operand" "Upa")] ++ UNPACK_UNSIGNED))] + "TARGET_SVE" +- "@ +- fdiv\t%0., %1/m, %0., %3. +- fdivr\t%0., %1/m, %0., %2. +- movprfx\t%0, %2\;fdiv\t%0., %1/m, %0., %3." +- [(set_attr "movprfx" "*,*,yes")] ++ "punpk\t%0.h, %1.b" + ) + +-;; Unpredicated FNEG, FABS and FSQRT. 
+-(define_expand "2" +- [(set (match_operand:SVE_F 0 "register_operand") +- (unspec:SVE_F +- [(match_dup 2) +- (SVE_FP_UNARY:SVE_F (match_operand:SVE_F 1 "register_operand"))] +- UNSPEC_MERGE_PTRUE))] ++;; ========================================================================= ++;; == Vector partitioning ++;; ========================================================================= ++ ++;; ------------------------------------------------------------------------- ++;; ---- [PRED] Unary partitioning ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - BRKA ++;; - BRKAS ++;; - BRKB ++;; - BRKBS ++;; ------------------------------------------------------------------------- ++ ++;; Note that unlike most other instructions that have both merging and ++;; zeroing forms, these instructions don't operate elementwise and so ++;; don't fit the IFN_COND model. ++(define_insn "@aarch64_brk" ++ [(set (match_operand:VNx16BI 0 "register_operand" "=Upa, Upa") ++ (unspec:VNx16BI ++ [(match_operand:VNx16BI 1 "register_operand" "Upa, Upa") ++ (match_operand:VNx16BI 2 "register_operand" "Upa, Upa") ++ (match_operand:VNx16BI 3 "aarch64_simd_reg_or_zero" "Dz, 0")] ++ SVE_BRK_UNARY))] ++ "TARGET_SVE" ++ "@ ++ brk\t%0.b, %1/z, %2.b ++ brk\t%0.b, %1/m, %2.b" ++) ++ ++;; Same, but also producing a flags result. ++(define_insn "*aarch64_brk_cc" ++ [(set (reg:CC_NZC CC_REGNUM) ++ (unspec:CC_NZC ++ [(match_operand:VNx16BI 1 "register_operand" "Upa, Upa") ++ (match_dup 1) ++ (match_operand:SI 4 "aarch64_sve_ptrue_flag") ++ (unspec:VNx16BI ++ [(match_dup 1) ++ (match_operand:VNx16BI 2 "register_operand" "Upa, Upa") ++ (match_operand:VNx16BI 3 "aarch64_simd_reg_or_zero" "Dz, 0")] ++ SVE_BRK_UNARY)] ++ UNSPEC_PTEST)) ++ (set (match_operand:VNx16BI 0 "register_operand" "=Upa, Upa") ++ (unspec:VNx16BI ++ [(match_dup 1) ++ (match_dup 2) ++ (match_dup 3)] ++ SVE_BRK_UNARY))] ++ "TARGET_SVE" ++ "@ ++ brks\t%0.b, %1/z, %2.b ++ brks\t%0.b, %1/m, %2.b" ++) ++ ++;; Same, but with only the flags result being interesting. ++(define_insn "*aarch64_brk_ptest" ++ [(set (reg:CC_NZC CC_REGNUM) ++ (unspec:CC_NZC ++ [(match_operand:VNx16BI 1 "register_operand" "Upa, Upa") ++ (match_dup 1) ++ (match_operand:SI 4 "aarch64_sve_ptrue_flag") ++ (unspec:VNx16BI ++ [(match_dup 1) ++ (match_operand:VNx16BI 2 "register_operand" "Upa, Upa") ++ (match_operand:VNx16BI 3 "aarch64_simd_reg_or_zero" "Dz, 0")] ++ SVE_BRK_UNARY)] ++ UNSPEC_PTEST)) ++ (clobber (match_scratch:VNx16BI 0 "=Upa, Upa"))] + "TARGET_SVE" ++ "@ ++ brks\t%0.b, %1/z, %2.b ++ brks\t%0.b, %1/m, %2.b" ++) ++ ++;; ------------------------------------------------------------------------- ++;; ---- [PRED] Binary partitioning ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - BRKN ++;; - BRKNS ++;; - BRKPA ++;; - BRKPAS ++;; - BRKPB ++;; - BRKPBS ++;; ------------------------------------------------------------------------- ++ ++;; Binary BRKs (BRKN, BRKPA, BRKPB). ++(define_insn "@aarch64_brk" ++ [(set (match_operand:VNx16BI 0 "register_operand" "=Upa") ++ (unspec:VNx16BI ++ [(match_operand:VNx16BI 1 "register_operand" "Upa") ++ (match_operand:VNx16BI 2 "register_operand" "Upa") ++ (match_operand:VNx16BI 3 "register_operand" "")] ++ SVE_BRK_BINARY))] ++ "TARGET_SVE" ++ "brk\t%0.b, %1/z, %2.b, %.b" ++) ++ ++;; Same, but also producing a flags result. 
++(define_insn "*aarch64_brk_cc" ++ [(set (reg:CC_NZC CC_REGNUM) ++ (unspec:CC_NZC ++ [(match_operand:VNx16BI 1 "register_operand" "Upa") ++ (match_dup 1) ++ (match_operand:SI 4 "aarch64_sve_ptrue_flag") ++ (unspec:VNx16BI ++ [(match_dup 1) ++ (match_operand:VNx16BI 2 "register_operand" "Upa") ++ (match_operand:VNx16BI 3 "register_operand" "")] ++ SVE_BRK_BINARY)] ++ UNSPEC_PTEST)) ++ (set (match_operand:VNx16BI 0 "register_operand" "=Upa") ++ (unspec:VNx16BI ++ [(match_dup 1) ++ (match_dup 2) ++ (match_dup 3)] ++ SVE_BRK_BINARY))] ++ "TARGET_SVE" ++ "brks\t%0.b, %1/z, %2.b, %.b" ++) ++ ++;; Same, but with only the flags result being interesting. ++(define_insn "*aarch64_brk_ptest" ++ [(set (reg:CC_NZC CC_REGNUM) ++ (unspec:CC_NZC ++ [(match_operand:VNx16BI 1 "register_operand" "Upa") ++ (match_dup 1) ++ (match_operand:SI 4 "aarch64_sve_ptrue_flag") ++ (unspec:VNx16BI ++ [(match_dup 1) ++ (match_operand:VNx16BI 2 "register_operand" "Upa") ++ (match_operand:VNx16BI 3 "register_operand" "")] ++ SVE_BRK_BINARY)] ++ UNSPEC_PTEST)) ++ (clobber (match_scratch:VNx16BI 0 "=Upa"))] ++ "TARGET_SVE" ++ "brks\t%0.b, %1/z, %2.b, %.b" ++) ++ ++;; ------------------------------------------------------------------------- ++;; ---- [PRED] Scalarization ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - PFIRST ++;; - PNEXT ++;; ------------------------------------------------------------------------- ++ ++(define_insn "@aarch64_sve_" ++ [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa") ++ (unspec:PRED_ALL ++ [(match_operand:PRED_ALL 1 "register_operand" "Upa") ++ (match_operand:SI 2 "aarch64_sve_ptrue_flag") ++ (match_operand:PRED_ALL 3 "register_operand" "0")] ++ SVE_PITER)) ++ (clobber (reg:CC_NZC CC_REGNUM))] ++ "TARGET_SVE && >= " ++ "\t%0., %1, %0." ++) ++ ++;; Same, but also producing a flags result. ++(define_insn_and_rewrite "*aarch64_sve__cc" ++ [(set (reg:CC_NZC CC_REGNUM) ++ (unspec:CC_NZC ++ [(match_operand:VNx16BI 1 "register_operand" "Upa") ++ (match_operand 2) ++ (match_operand:SI 3 "aarch64_sve_ptrue_flag") ++ (unspec:PRED_ALL ++ [(match_operand 4) ++ (match_operand:SI 5 "aarch64_sve_ptrue_flag") ++ (match_operand:PRED_ALL 6 "register_operand" "0")] ++ SVE_PITER)] ++ UNSPEC_PTEST)) ++ (set (match_operand:PRED_ALL 0 "register_operand" "=Upa") ++ (unspec:PRED_ALL ++ [(match_dup 4) ++ (match_dup 5) ++ (match_dup 6)] ++ SVE_PITER))] ++ "TARGET_SVE ++ && >= ++ && aarch64_sve_same_pred_for_ptest_p (&operands[2], &operands[4])" ++ "\t%0., %1, %0." ++ "&& !rtx_equal_p (operands[2], operands[4])" + { +- operands[2] = force_reg (mode, CONSTM1_RTX (mode)); ++ operands[4] = operands[2]; ++ operands[5] = operands[3]; + } + ) + +-;; FNEG, FABS and FSQRT predicated with a PTRUE. +-(define_insn "*2" +- [(set (match_operand:SVE_F 0 "register_operand" "=w") +- (unspec:SVE_F +- [(match_operand: 1 "register_operand" "Upl") +- (SVE_FP_UNARY:SVE_F (match_operand:SVE_F 2 "register_operand" "w"))] +- UNSPEC_MERGE_PTRUE))] +- "TARGET_SVE" +- "\t%0., %1/m, %2." +-) +- +-;; Unpredicated FRINTy. +-(define_expand "2" +- [(set (match_operand:SVE_F 0 "register_operand") +- (unspec:SVE_F +- [(match_dup 2) +- (unspec:SVE_F [(match_operand:SVE_F 1 "register_operand")] +- FRINT)] +- UNSPEC_MERGE_PTRUE))] +- "TARGET_SVE" ++;; Same, but with only the flags result being interesting. 
++(define_insn_and_rewrite "*aarch64_sve__ptest" ++ [(set (reg:CC_NZC CC_REGNUM) ++ (unspec:CC_NZC ++ [(match_operand:VNx16BI 1 "register_operand" "Upa") ++ (match_operand 2) ++ (match_operand:SI 3 "aarch64_sve_ptrue_flag") ++ (unspec:PRED_ALL ++ [(match_operand 4) ++ (match_operand:SI 5 "aarch64_sve_ptrue_flag") ++ (match_operand:PRED_ALL 6 "register_operand" "0")] ++ SVE_PITER)] ++ UNSPEC_PTEST)) ++ (clobber (match_scratch:PRED_ALL 0 "=Upa"))] ++ "TARGET_SVE ++ && >= ++ && aarch64_sve_same_pred_for_ptest_p (&operands[2], &operands[4])" ++ "\t%0., %1, %0." ++ "&& !rtx_equal_p (operands[2], operands[4])" + { +- operands[2] = force_reg (mode, CONSTM1_RTX (mode)); ++ operands[4] = operands[2]; ++ operands[5] = operands[3]; + } + ) + +-;; FRINTy predicated with a PTRUE. +-(define_insn "*2" +- [(set (match_operand:SVE_F 0 "register_operand" "=w") +- (unspec:SVE_F +- [(match_operand: 1 "register_operand" "Upl") +- (unspec:SVE_F [(match_operand:SVE_F 2 "register_operand" "w")] +- FRINT)] +- UNSPEC_MERGE_PTRUE))] +- "TARGET_SVE" +- "frint\t%0., %1/m, %2." +-) ++;; ========================================================================= ++;; == Counting elements ++;; ========================================================================= + +-;; Unpredicated conversion of floats to integers of the same size (HF to HI, +-;; SF to SI or DF to DI). +-(define_expand "2" +- [(set (match_operand: 0 "register_operand") +- (unspec: +- [(match_dup 2) +- (FIXUORS: +- (match_operand:SVE_F 1 "register_operand"))] +- UNSPEC_MERGE_PTRUE))] ++;; ------------------------------------------------------------------------- ++;; ---- [INT] Count elements in a pattern (scalar) ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - CNTB ++;; - CNTD ++;; - CNTH ++;; - CNTW ++;; ------------------------------------------------------------------------- ++ ++;; Count the number of elements in an svpattern. Operand 1 is the pattern, ++;; operand 2 is the number of elements that fit in a 128-bit block, and ++;; operand 3 is a multiplier in the range [1, 16]. ++;; ++;; Note that this pattern isn't used for SV_ALL (but would work for that too). ++(define_insn "aarch64_sve_cnt_pat" ++ [(set (match_operand:DI 0 "register_operand" "=r") ++ (zero_extend:DI ++ (unspec:SI [(match_operand:DI 1 "const_int_operand") ++ (match_operand:DI 2 "const_int_operand") ++ (match_operand:DI 3 "const_int_operand")] ++ UNSPEC_SVE_CNT_PAT)))] + "TARGET_SVE" + { +- operands[2] = force_reg (mode, CONSTM1_RTX (mode)); ++ return aarch64_output_sve_cnt_pat_immediate ("cnt", "%x0", operands + 1); + } + ) + +-;; Conversion of SF to DI, SI or HI, predicated with a PTRUE. +-(define_insn "*v16hsf2" +- [(set (match_operand:SVE_HSDI 0 "register_operand" "=w") +- (unspec:SVE_HSDI +- [(match_operand: 1 "register_operand" "Upl") +- (FIXUORS:SVE_HSDI +- (match_operand:VNx8HF 2 "register_operand" "w"))] +- UNSPEC_MERGE_PTRUE))] ++;; ------------------------------------------------------------------------- ++;; ---- [INT] Increment by the number of elements in a pattern (scalar) ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - INC ++;; - SQINC ++;; - UQINC ++;; ------------------------------------------------------------------------- ++ ++;; Increment a DImode register by the number of elements in an svpattern. ++;; See aarch64_sve_cnt_pat for the counting behavior. 
++(define_insn "@aarch64_sve__pat" ++ [(set (match_operand:DI 0 "register_operand" "=r") ++ (ANY_PLUS:DI (zero_extend:DI ++ (unspec:SI [(match_operand:DI 2 "const_int_operand") ++ (match_operand:DI 3 "const_int_operand") ++ (match_operand:DI 4 "const_int_operand")] ++ UNSPEC_SVE_CNT_PAT)) ++ (match_operand:DI_ONLY 1 "register_operand" "0")))] + "TARGET_SVE" +- "fcvtz\t%0., %1/m, %2.h" ++ { ++ return aarch64_output_sve_cnt_pat_immediate ("", "%x0", ++ operands + 2); ++ } + ) + +-;; Conversion of SF to DI or SI, predicated with a PTRUE. +-(define_insn "*vnx4sf2" +- [(set (match_operand:SVE_SDI 0 "register_operand" "=w") +- (unspec:SVE_SDI +- [(match_operand: 1 "register_operand" "Upl") +- (FIXUORS:SVE_SDI +- (match_operand:VNx4SF 2 "register_operand" "w"))] +- UNSPEC_MERGE_PTRUE))] ++;; Increment an SImode register by the number of elements in an svpattern ++;; using modular arithmetic. See aarch64_sve_cnt_pat for the counting ++;; behavior. ++(define_insn "*aarch64_sve_incsi_pat" ++ [(set (match_operand:SI 0 "register_operand" "=r") ++ (plus:SI (unspec:SI [(match_operand:DI 2 "const_int_operand") ++ (match_operand:DI 3 "const_int_operand") ++ (match_operand:DI 4 "const_int_operand")] ++ UNSPEC_SVE_CNT_PAT) ++ (match_operand:SI 1 "register_operand" "0")))] + "TARGET_SVE" +- "fcvtz\t%0., %1/m, %2.s" ++ { ++ return aarch64_output_sve_cnt_pat_immediate ("inc", "%x0", operands + 2); ++ } + ) + +-;; Conversion of DF to DI or SI, predicated with a PTRUE. +-(define_insn "*vnx2df2" +- [(set (match_operand:SVE_SDI 0 "register_operand" "=w") +- (unspec:SVE_SDI +- [(match_operand:VNx2BI 1 "register_operand" "Upl") +- (FIXUORS:SVE_SDI +- (match_operand:VNx2DF 2 "register_operand" "w"))] +- UNSPEC_MERGE_PTRUE))] ++;; Increment an SImode register by the number of elements in an svpattern ++;; using saturating arithmetic, extending the result to 64 bits. ++;; ++;; See aarch64_sve_cnt_pat for the counting behavior. ++(define_insn "@aarch64_sve__pat" ++ [(set (match_operand:DI 0 "register_operand" "=r") ++ (:DI ++ (SAT_PLUS:SI ++ (unspec:SI [(match_operand:DI 2 "const_int_operand") ++ (match_operand:DI 3 "const_int_operand") ++ (match_operand:DI 4 "const_int_operand")] ++ UNSPEC_SVE_CNT_PAT) ++ (match_operand:SI_ONLY 1 "register_operand" "0"))))] + "TARGET_SVE" +- "fcvtz\t%0., %1/m, %2.d" ++ { ++ const char *registers = ( == SS_PLUS ? "%x0, %w0" : "%w0"); ++ return aarch64_output_sve_cnt_pat_immediate ("", registers, ++ operands + 2); ++ } + ) + +-;; Unpredicated conversion of integers to floats of the same size +-;; (HI to HF, SI to SF or DI to DF). +-(define_expand "2" +- [(set (match_operand:SVE_F 0 "register_operand") +- (unspec:SVE_F +- [(match_dup 2) +- (FLOATUORS:SVE_F +- (match_operand: 1 "register_operand"))] +- UNSPEC_MERGE_PTRUE))] ++;; ------------------------------------------------------------------------- ++;; ---- [INT] Increment by the number of elements in a pattern (vector) ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - INC ++;; - SQINC ++;; - UQINC ++;; ------------------------------------------------------------------------- ++ ++;; Increment a vector of DIs by the number of elements in an svpattern. ++;; See aarch64_sve_cnt_pat for the counting behavior. 
++(define_insn "@aarch64_sve__pat" ++ [(set (match_operand:VNx2DI 0 "register_operand" "=w, ?&w") ++ (ANY_PLUS:VNx2DI ++ (vec_duplicate:VNx2DI ++ (zero_extend:DI ++ (unspec:SI [(match_operand:DI 2 "const_int_operand") ++ (match_operand:DI 3 "const_int_operand") ++ (match_operand:DI 4 "const_int_operand")] ++ UNSPEC_SVE_CNT_PAT))) ++ (match_operand:VNx2DI_ONLY 1 "register_operand" "0, w")))] + "TARGET_SVE" + { +- operands[2] = force_reg (mode, CONSTM1_RTX (mode)); ++ if (which_alternative == 1) ++ output_asm_insn ("movprfx\t%0, %1", operands); ++ return aarch64_output_sve_cnt_pat_immediate ("", "%0.", ++ operands + 2); + } ++ [(set_attr "movprfx" "*,yes")] + ) + +-;; Conversion of DI, SI or HI to the same number of HFs, predicated +-;; with a PTRUE. +-(define_insn "*vnx8hf2" +- [(set (match_operand:VNx8HF 0 "register_operand" "=w") +- (unspec:VNx8HF +- [(match_operand: 1 "register_operand" "Upl") +- (FLOATUORS:VNx8HF +- (match_operand:SVE_HSDI 2 "register_operand" "w"))] +- UNSPEC_MERGE_PTRUE))] ++;; Increment a vector of SIs by the number of elements in an svpattern. ++;; See aarch64_sve_cnt_pat for the counting behavior. ++(define_insn "@aarch64_sve__pat" ++ [(set (match_operand:VNx4SI 0 "register_operand" "=w, ?&w") ++ (ANY_PLUS:VNx4SI ++ (vec_duplicate:VNx4SI ++ (unspec:SI [(match_operand:DI 2 "const_int_operand") ++ (match_operand:DI 3 "const_int_operand") ++ (match_operand:DI 4 "const_int_operand")] ++ UNSPEC_SVE_CNT_PAT)) ++ (match_operand:VNx4SI_ONLY 1 "register_operand" "0, w")))] + "TARGET_SVE" +- "cvtf\t%0.h, %1/m, %2." ++ { ++ if (which_alternative == 1) ++ output_asm_insn ("movprfx\t%0, %1", operands); ++ return aarch64_output_sve_cnt_pat_immediate ("", "%0.", ++ operands + 2); ++ } ++ [(set_attr "movprfx" "*,yes")] + ) + +-;; Conversion of DI or SI to the same number of SFs, predicated with a PTRUE. +-(define_insn "*vnx4sf2" +- [(set (match_operand:VNx4SF 0 "register_operand" "=w") +- (unspec:VNx4SF +- [(match_operand: 1 "register_operand" "Upl") +- (FLOATUORS:VNx4SF +- (match_operand:SVE_SDI 2 "register_operand" "w"))] +- UNSPEC_MERGE_PTRUE))] ++;; Increment a vector of HIs by the number of elements in an svpattern. ++;; See aarch64_sve_cnt_pat for the counting behavior. ++(define_expand "@aarch64_sve__pat" ++ [(set (match_operand:VNx8HI 0 "register_operand") ++ (ANY_PLUS:VNx8HI ++ (vec_duplicate:VNx8HI ++ (truncate:HI ++ (unspec:SI [(match_operand:DI 2 "const_int_operand") ++ (match_operand:DI 3 "const_int_operand") ++ (match_operand:DI 4 "const_int_operand")] ++ UNSPEC_SVE_CNT_PAT))) ++ (match_operand:VNx8HI_ONLY 1 "register_operand")))] ++ "TARGET_SVE" ++) ++ ++(define_insn "*aarch64_sve__pat" ++ [(set (match_operand:VNx8HI 0 "register_operand" "=w, ?&w") ++ (ANY_PLUS:VNx8HI ++ (vec_duplicate:VNx8HI ++ (match_operator:HI 5 "subreg_lowpart_operator" ++ [(unspec:SI [(match_operand:DI 2 "const_int_operand") ++ (match_operand:DI 3 "const_int_operand") ++ (match_operand:DI 4 "const_int_operand")] ++ UNSPEC_SVE_CNT_PAT)])) ++ (match_operand:VNx8HI_ONLY 1 "register_operand" "0, w")))] + "TARGET_SVE" +- "cvtf\t%0.s, %1/m, %2." ++ { ++ if (which_alternative == 1) ++ output_asm_insn ("movprfx\t%0, %1", operands); ++ return aarch64_output_sve_cnt_pat_immediate ("", "%0.", ++ operands + 2); ++ } ++ [(set_attr "movprfx" "*,yes")] + ) + +-;; Conversion of DI or SI to DF, predicated with a PTRUE. 
+-(define_insn "aarch64_sve_vnx2df2" +- [(set (match_operand:VNx2DF 0 "register_operand" "=w") +- (unspec:VNx2DF +- [(match_operand:VNx2BI 1 "register_operand" "Upl") +- (FLOATUORS:VNx2DF +- (match_operand:SVE_SDI 2 "register_operand" "w"))] +- UNSPEC_MERGE_PTRUE))] ++;; ------------------------------------------------------------------------- ++;; ---- [INT] Decrement by the number of elements in a pattern (scalar) ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - DEC ++;; - SQDEC ++;; - UQDEC ++;; ------------------------------------------------------------------------- ++ ++;; Decrement a DImode register by the number of elements in an svpattern. ++;; See aarch64_sve_cnt_pat for the counting behavior. ++(define_insn "@aarch64_sve__pat" ++ [(set (match_operand:DI 0 "register_operand" "=r") ++ (ANY_MINUS:DI (match_operand:DI_ONLY 1 "register_operand" "0") ++ (zero_extend:DI ++ (unspec:SI [(match_operand:DI 2 "const_int_operand") ++ (match_operand:DI 3 "const_int_operand") ++ (match_operand:DI 4 "const_int_operand")] ++ UNSPEC_SVE_CNT_PAT))))] + "TARGET_SVE" +- "cvtf\t%0.d, %1/m, %2." ++ { ++ return aarch64_output_sve_cnt_pat_immediate ("", "%x0", ++ operands + 2); ++ } + ) + +-;; Conversion of DFs to the same number of SFs, or SFs to the same number +-;; of HFs. +-(define_insn "*trunc2" +- [(set (match_operand:SVE_HSF 0 "register_operand" "=w") +- (unspec:SVE_HSF +- [(match_operand: 1 "register_operand" "Upl") +- (unspec:SVE_HSF +- [(match_operand: 2 "register_operand" "w")] +- UNSPEC_FLOAT_CONVERT)] +- UNSPEC_MERGE_PTRUE))] ++;; Decrement an SImode register by the number of elements in an svpattern ++;; using modular arithmetic. See aarch64_sve_cnt_pat for the counting ++;; behavior. ++(define_insn "*aarch64_sve_decsi_pat" ++ [(set (match_operand:SI 0 "register_operand" "=r") ++ (minus:SI (match_operand:SI 1 "register_operand" "0") ++ (unspec:SI [(match_operand:DI 2 "const_int_operand") ++ (match_operand:DI 3 "const_int_operand") ++ (match_operand:DI 4 "const_int_operand")] ++ UNSPEC_SVE_CNT_PAT)))] + "TARGET_SVE" +- "fcvt\t%0., %1/m, %2." ++ { ++ return aarch64_output_sve_cnt_pat_immediate ("dec", "%x0", operands + 2); ++ } + ) + +-;; Conversion of SFs to the same number of DFs, or HFs to the same number +-;; of SFs. +-(define_insn "aarch64_sve_extend2" +- [(set (match_operand: 0 "register_operand" "=w") +- (unspec: +- [(match_operand: 1 "register_operand" "Upl") +- (unspec: +- [(match_operand:SVE_HSF 2 "register_operand" "w")] +- UNSPEC_FLOAT_CONVERT)] +- UNSPEC_MERGE_PTRUE))] ++;; Decrement an SImode register by the number of elements in an svpattern ++;; using saturating arithmetic, extending the result to 64 bits. ++;; ++;; See aarch64_sve_cnt_pat for the counting behavior. ++(define_insn "@aarch64_sve__pat" ++ [(set (match_operand:DI 0 "register_operand" "=r") ++ (:DI ++ (SAT_MINUS:SI ++ (match_operand:SI_ONLY 1 "register_operand" "0") ++ (unspec:SI [(match_operand:DI 2 "const_int_operand") ++ (match_operand:DI 3 "const_int_operand") ++ (match_operand:DI 4 "const_int_operand")] ++ UNSPEC_SVE_CNT_PAT))))] + "TARGET_SVE" +- "fcvt\t%0., %1/m, %2." ++ { ++ const char *registers = ( == SS_MINUS ? "%x0, %w0" : "%w0"); ++ return aarch64_output_sve_cnt_pat_immediate ("", registers, ++ operands + 2); ++ } + ) + +-;; Unpack the low or high half of a predicate, where "high" refers to +-;; the low-numbered lanes for big-endian and the high-numbered lanes +-;; for little-endian. 
+-(define_expand "vec_unpack__" +- [(match_operand: 0 "register_operand") +- (unspec: [(match_operand:PRED_BHS 1 "register_operand")] +- UNPACK)] ++;; ------------------------------------------------------------------------- ++;; ---- [INT] Decrement by the number of elements in a pattern (vector) ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - DEC ++;; - SQDEC ++;; - UQDEC ++;; ------------------------------------------------------------------------- ++ ++;; Decrement a vector of DIs by the number of elements in an svpattern. ++;; See aarch64_sve_cnt_pat for the counting behavior. ++(define_insn "@aarch64_sve__pat" ++ [(set (match_operand:VNx2DI 0 "register_operand" "=w, ?&w") ++ (ANY_MINUS:VNx2DI ++ (match_operand:VNx2DI_ONLY 1 "register_operand" "0, w") ++ (vec_duplicate:VNx2DI ++ (zero_extend:DI ++ (unspec:SI [(match_operand:DI 2 "const_int_operand") ++ (match_operand:DI 3 "const_int_operand") ++ (match_operand:DI 4 "const_int_operand")] ++ UNSPEC_SVE_CNT_PAT)))))] + "TARGET_SVE" + { +- emit_insn (( +- ? gen_aarch64_sve_punpkhi_ +- : gen_aarch64_sve_punpklo_) +- (operands[0], operands[1])); +- DONE; ++ if (which_alternative == 1) ++ output_asm_insn ("movprfx\t%0, %1", operands); ++ return aarch64_output_sve_cnt_pat_immediate ("", "%0.", ++ operands + 2); + } ++ [(set_attr "movprfx" "*,yes")] + ) + +-;; PUNPKHI and PUNPKLO. +-(define_insn "aarch64_sve_punpk_" +- [(set (match_operand: 0 "register_operand" "=Upa") +- (unspec: [(match_operand:PRED_BHS 1 "register_operand" "Upa")] +- UNPACK_UNSIGNED))] ++;; Decrement a vector of SIs by the number of elements in an svpattern. ++;; See aarch64_sve_cnt_pat for the counting behavior. ++(define_insn "@aarch64_sve__pat" ++ [(set (match_operand:VNx4SI 0 "register_operand" "=w, ?&w") ++ (ANY_MINUS:VNx4SI ++ (match_operand:VNx4SI_ONLY 1 "register_operand" "0, w") ++ (vec_duplicate:VNx4SI ++ (unspec:SI [(match_operand:DI 2 "const_int_operand") ++ (match_operand:DI 3 "const_int_operand") ++ (match_operand:DI 4 "const_int_operand")] ++ UNSPEC_SVE_CNT_PAT))))] + "TARGET_SVE" +- "punpk\t%0.h, %1.b" ++ { ++ if (which_alternative == 1) ++ output_asm_insn ("movprfx\t%0, %1", operands); ++ return aarch64_output_sve_cnt_pat_immediate ("", "%0.", ++ operands + 2); ++ } ++ [(set_attr "movprfx" "*,yes")] + ) + +-;; Unpack the low or high half of a vector, where "high" refers to +-;; the low-numbered lanes for big-endian and the high-numbered lanes +-;; for little-endian. +-(define_expand "vec_unpack__" +- [(match_operand: 0 "register_operand") +- (unspec: [(match_operand:SVE_BHSI 1 "register_operand")] UNPACK)] ++;; Decrement a vector of HIs by the number of elements in an svpattern. ++;; See aarch64_sve_cnt_pat for the counting behavior. 
++(define_expand "@aarch64_sve__pat" ++ [(set (match_operand:VNx8HI 0 "register_operand") ++ (ANY_MINUS:VNx8HI ++ (match_operand:VNx8HI_ONLY 1 "register_operand") ++ (vec_duplicate:VNx8HI ++ (truncate:HI ++ (unspec:SI [(match_operand:DI 2 "const_int_operand") ++ (match_operand:DI 3 "const_int_operand") ++ (match_operand:DI 4 "const_int_operand")] ++ UNSPEC_SVE_CNT_PAT)))))] ++ "TARGET_SVE" ++) ++ ++(define_insn "*aarch64_sve__pat" ++ [(set (match_operand:VNx8HI 0 "register_operand" "=w, ?&w") ++ (ANY_MINUS:VNx8HI ++ (match_operand:VNx8HI_ONLY 1 "register_operand" "0, w") ++ (vec_duplicate:VNx8HI ++ (match_operator:HI 5 "subreg_lowpart_operator" ++ [(unspec:SI [(match_operand:DI 2 "const_int_operand") ++ (match_operand:DI 3 "const_int_operand") ++ (match_operand:DI 4 "const_int_operand")] ++ UNSPEC_SVE_CNT_PAT)]))))] + "TARGET_SVE" + { +- emit_insn (( +- ? gen_aarch64_sve_unpkhi_ +- : gen_aarch64_sve_unpklo_) +- (operands[0], operands[1])); +- DONE; ++ if (which_alternative == 1) ++ output_asm_insn ("movprfx\t%0, %1", operands); ++ return aarch64_output_sve_cnt_pat_immediate ("", "%0.", ++ operands + 2); + } ++ [(set_attr "movprfx" "*,yes")] + ) + +-;; SUNPKHI, UUNPKHI, SUNPKLO and UUNPKLO. +-(define_insn "aarch64_sve_unpk_" +- [(set (match_operand: 0 "register_operand" "=w") +- (unspec: [(match_operand:SVE_BHSI 1 "register_operand" "w")] +- UNPACK))] ++;; ------------------------------------------------------------------------- ++;; ---- [INT] Count elements in a predicate (scalar) ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - CNTP ++;; ------------------------------------------------------------------------- ++ ++;; Count the number of set bits in a predicate. Operand 3 is true if ++;; operand 1 is known to be all-true. ++(define_insn "@aarch64_pred_cntp" ++ [(set (match_operand:DI 0 "register_operand" "=r") ++ (zero_extend:DI ++ (unspec:SI [(match_operand:PRED_ALL 1 "register_operand" "Upl") ++ (match_operand:SI 2 "aarch64_sve_ptrue_flag") ++ (match_operand:PRED_ALL 3 "register_operand" "Upa")] ++ UNSPEC_CNTP)))] ++ "TARGET_SVE" ++ "cntp\t%x0, %1, %3.") ++ ++;; ------------------------------------------------------------------------- ++;; ---- [INT] Increment by the number of elements in a predicate (scalar) ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - INCP ++;; - SQINCP ++;; - UQINCP ++;; ------------------------------------------------------------------------- ++ ++;; Increment a DImode register by the number of set bits in a predicate. ++;; See aarch64_sve_cntp for a description of the operands. ++(define_expand "@aarch64_sve__cntp" ++ [(set (match_operand:DI 0 "register_operand") ++ (ANY_PLUS:DI ++ (zero_extend:DI ++ (unspec:SI [(match_dup 3) ++ (const_int SVE_KNOWN_PTRUE) ++ (match_operand:PRED_ALL 2 "register_operand")] ++ UNSPEC_CNTP)) ++ (match_operand:DI_ONLY 1 "register_operand")))] + "TARGET_SVE" +- "unpk\t%0., %1." ++ { ++ operands[3] = CONSTM1_RTX (mode); ++ } + ) + +-;; Unpack one half of a VNx4SF to VNx2DF, or one half of a VNx8HF to VNx4SF. +-;; First unpack the source without conversion, then float-convert the +-;; unpacked source. 
+-(define_expand "vec_unpacks__" +- [(match_operand: 0 "register_operand") +- (unspec:SVE_HSF [(match_operand:SVE_HSF 1 "register_operand")] +- UNPACK_UNSIGNED)] ++(define_insn_and_rewrite "*aarch64_sve__cntp" ++ [(set (match_operand:DI 0 "register_operand" "=r") ++ (ANY_PLUS:DI ++ (zero_extend:DI ++ (unspec:SI [(match_operand 3) ++ (const_int SVE_KNOWN_PTRUE) ++ (match_operand:PRED_ALL 2 "register_operand" "Upa")] ++ UNSPEC_CNTP)) ++ (match_operand:DI_ONLY 1 "register_operand" "0")))] + "TARGET_SVE" ++ "p\t%x0, %2." ++ "&& !CONSTANT_P (operands[3])" + { +- /* Use ZIP to do the unpack, since we don't care about the upper halves +- and since it has the nice property of not needing any subregs. +- If using UUNPK* turns out to be preferable, we could model it as +- a ZIP whose first operand is zero. */ +- rtx temp = gen_reg_rtx (mode); +- emit_insn (( +- ? gen_aarch64_sve_zip2 +- : gen_aarch64_sve_zip1) +- (temp, operands[1], operands[1])); +- rtx ptrue = force_reg (mode, CONSTM1_RTX (mode)); +- emit_insn (gen_aarch64_sve_extend2 (operands[0], +- ptrue, temp)); +- DONE; ++ operands[3] = CONSTM1_RTX (mode); + } + ) + +-;; Unpack one half of a VNx4SI to VNx2DF. First unpack from VNx4SI +-;; to VNx2DI, reinterpret the VNx2DI as a VNx4SI, then convert the +-;; unpacked VNx4SI to VNx2DF. +-(define_expand "vec_unpack_float__vnx4si" +- [(match_operand:VNx2DF 0 "register_operand") +- (FLOATUORS:VNx2DF +- (unspec:VNx2DI [(match_operand:VNx4SI 1 "register_operand")] +- UNPACK_UNSIGNED))] +- "TARGET_SVE" ++;; Increment an SImode register by the number of set bits in a predicate ++;; using modular arithmetic. See aarch64_sve_cntp for a description of ++;; the operands. ++(define_insn_and_rewrite "*aarch64_incsi_cntp" ++ [(set (match_operand:SI 0 "register_operand" "=r") ++ (plus:SI ++ (unspec:SI [(match_operand 3) ++ (const_int SVE_KNOWN_PTRUE) ++ (match_operand:PRED_ALL 2 "register_operand" "Upa")] ++ UNSPEC_CNTP) ++ (match_operand:SI 1 "register_operand" "0")))] ++ "TARGET_SVE" ++ "incp\t%x0, %2." ++ "&& !CONSTANT_P (operands[3])" + { +- /* Use ZIP to do the unpack, since we don't care about the upper halves +- and since it has the nice property of not needing any subregs. +- If using UUNPK* turns out to be preferable, we could model it as +- a ZIP whose first operand is zero. */ +- rtx temp = gen_reg_rtx (VNx4SImode); +- emit_insn (( +- ? gen_aarch64_sve_zip2vnx4si +- : gen_aarch64_sve_zip1vnx4si) +- (temp, operands[1], operands[1])); +- rtx ptrue = force_reg (VNx2BImode, CONSTM1_RTX (VNx2BImode)); +- emit_insn (gen_aarch64_sve_vnx4sivnx2df2 (operands[0], +- ptrue, temp)); +- DONE; ++ operands[3] = CONSTM1_RTX (mode); + } + ) + +-;; Predicate pack. Use UZP1 on the narrower type, which discards +-;; the high part of each wide element. +-(define_insn "vec_pack_trunc_" +- [(set (match_operand:PRED_BHS 0 "register_operand" "=Upa") +- (unspec:PRED_BHS +- [(match_operand: 1 "register_operand" "Upa") +- (match_operand: 2 "register_operand" "Upa")] +- UNSPEC_PACK))] ++;; Increment an SImode register by the number of set bits in a predicate ++;; using saturating arithmetic, extending the result to 64 bits. ++;; ++;; See aarch64_sve_cntp for a description of the operands. ++(define_expand "@aarch64_sve__cntp" ++ [(set (match_operand:DI 0 "register_operand") ++ (:DI ++ (SAT_PLUS:SI ++ (unspec:SI [(match_dup 3) ++ (const_int SVE_KNOWN_PTRUE) ++ (match_operand:PRED_ALL 2 "register_operand")] ++ UNSPEC_CNTP) ++ (match_operand:SI_ONLY 1 "register_operand"))))] + "TARGET_SVE" +- "uzp1\t%0., %1., %2." 
++ { ++ operands[3] = CONSTM1_RTX (mode); ++ } + ) + +-;; Integer pack. Use UZP1 on the narrower type, which discards +-;; the high part of each wide element. +-(define_insn "vec_pack_trunc_" +- [(set (match_operand:SVE_BHSI 0 "register_operand" "=w") +- (unspec:SVE_BHSI +- [(match_operand: 1 "register_operand" "w") +- (match_operand: 2 "register_operand" "w")] +- UNSPEC_PACK))] ++(define_insn_and_rewrite "*aarch64_sve__cntp" ++ [(set (match_operand:DI 0 "register_operand" "=r") ++ (:DI ++ (SAT_PLUS:SI ++ (unspec:SI [(match_operand 3) ++ (const_int SVE_KNOWN_PTRUE) ++ (match_operand:PRED_ALL 2 "register_operand" "Upa")] ++ UNSPEC_CNTP) ++ (match_operand:SI_ONLY 1 "register_operand" "0"))))] + "TARGET_SVE" +- "uzp1\t%0., %1., %2." ++ { ++ if ( == SS_PLUS) ++ return "p\t%x0, %2., %w0"; ++ else ++ return "p\t%w0, %2."; ++ } ++ "&& !CONSTANT_P (operands[3])" ++ { ++ operands[3] = CONSTM1_RTX (mode); ++ } + ) + +-;; Convert two vectors of DF to SF, or two vectors of SF to HF, and pack +-;; the results into a single vector. +-(define_expand "vec_pack_trunc_" +- [(set (match_dup 4) +- (unspec:SVE_HSF +- [(match_dup 3) +- (unspec:SVE_HSF [(match_operand: 1 "register_operand")] +- UNSPEC_FLOAT_CONVERT)] +- UNSPEC_MERGE_PTRUE)) +- (set (match_dup 5) +- (unspec:SVE_HSF +- [(match_dup 3) +- (unspec:SVE_HSF [(match_operand: 2 "register_operand")] +- UNSPEC_FLOAT_CONVERT)] +- UNSPEC_MERGE_PTRUE)) +- (set (match_operand:SVE_HSF 0 "register_operand") +- (unspec:SVE_HSF [(match_dup 4) (match_dup 5)] UNSPEC_UZP1))] ++;; ------------------------------------------------------------------------- ++;; ---- [INT] Increment by the number of elements in a predicate (vector) ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - INCP ++;; - SQINCP ++;; - UQINCP ++;; ------------------------------------------------------------------------- ++ ++;; Increment a vector of DIs by the number of set bits in a predicate. ++;; See aarch64_sve_cntp for a description of the operands. ++(define_expand "@aarch64_sve__cntp" ++ [(set (match_operand:VNx2DI 0 "register_operand") ++ (ANY_PLUS:VNx2DI ++ (vec_duplicate:VNx2DI ++ (zero_extend:DI ++ (unspec:SI ++ [(match_dup 3) ++ (const_int SVE_KNOWN_PTRUE) ++ (match_operand: 2 "register_operand")] ++ UNSPEC_CNTP))) ++ (match_operand:VNx2DI_ONLY 1 "register_operand")))] + "TARGET_SVE" + { +- operands[3] = force_reg (mode, CONSTM1_RTX (mode)); +- operands[4] = gen_reg_rtx (mode); +- operands[5] = gen_reg_rtx (mode); ++ operands[3] = CONSTM1_RTX (mode); + } + ) + +-;; Convert two vectors of DF to SI and pack the results into a single vector. 
+-(define_expand "vec_pack_fix_trunc_vnx2df" +- [(set (match_dup 4) +- (unspec:VNx4SI +- [(match_dup 3) +- (FIXUORS:VNx4SI (match_operand:VNx2DF 1 "register_operand"))] +- UNSPEC_MERGE_PTRUE)) +- (set (match_dup 5) +- (unspec:VNx4SI +- [(match_dup 3) +- (FIXUORS:VNx4SI (match_operand:VNx2DF 2 "register_operand"))] +- UNSPEC_MERGE_PTRUE)) +- (set (match_operand:VNx4SI 0 "register_operand") +- (unspec:VNx4SI [(match_dup 4) (match_dup 5)] UNSPEC_UZP1))] ++(define_insn_and_rewrite "*aarch64_sve__cntp" ++ [(set (match_operand:VNx2DI 0 "register_operand" "=w, ?&w") ++ (ANY_PLUS:VNx2DI ++ (vec_duplicate:VNx2DI ++ (zero_extend:DI ++ (unspec:SI ++ [(match_operand 3) ++ (const_int SVE_KNOWN_PTRUE) ++ (match_operand: 2 "register_operand" "Upa, Upa")] ++ UNSPEC_CNTP))) ++ (match_operand:VNx2DI_ONLY 1 "register_operand" "0, w")))] + "TARGET_SVE" ++ "@ ++ p\t%0.d, %2 ++ movprfx\t%0, %1\;p\t%0.d, %2" ++ "&& !CONSTANT_P (operands[3])" + { +- operands[3] = force_reg (VNx2BImode, CONSTM1_RTX (VNx2BImode)); +- operands[4] = gen_reg_rtx (VNx4SImode); +- operands[5] = gen_reg_rtx (VNx4SImode); ++ operands[3] = CONSTM1_RTX (mode); + } ++ [(set_attr "movprfx" "*,yes")] + ) + +-;; Predicated floating-point operations with select. +-(define_expand "cond_" +- [(set (match_operand:SVE_F 0 "register_operand") +- (unspec:SVE_F +- [(match_operand: 1 "register_operand") +- (unspec:SVE_F +- [(match_operand:SVE_F 2 "register_operand") +- (match_operand:SVE_F 3 "register_operand")] +- SVE_COND_FP_BINARY) +- (match_operand:SVE_F 4 "aarch64_simd_reg_or_zero")] +- UNSPEC_SEL))] ++;; Increment a vector of SIs by the number of set bits in a predicate. ++;; See aarch64_sve_cntp for a description of the operands. ++(define_expand "@aarch64_sve__cntp" ++ [(set (match_operand:VNx4SI 0 "register_operand") ++ (ANY_PLUS:VNx4SI ++ (vec_duplicate:VNx4SI ++ (unspec:SI ++ [(match_dup 3) ++ (const_int SVE_KNOWN_PTRUE) ++ (match_operand: 2 "register_operand")] ++ UNSPEC_CNTP)) ++ (match_operand:VNx4SI_ONLY 1 "register_operand")))] + "TARGET_SVE" ++ { ++ operands[3] = CONSTM1_RTX (mode); ++ } + ) + +-;; Predicated floating-point operations with select matching output. +-(define_insn "*cond__0" +- [(set (match_operand:SVE_F 0 "register_operand" "+w, w, ?&w") +- (unspec:SVE_F +- [(match_operand: 1 "register_operand" "Upl, Upl, Upl") +- (unspec:SVE_F +- [(match_operand:SVE_F 2 "register_operand" "0, w, w") +- (match_operand:SVE_F 3 "register_operand" "w, 0, w")] +- SVE_COND_FP_BINARY) +- (match_dup 0)] +- UNSPEC_SEL))] ++(define_insn_and_rewrite "*aarch64_sve__cntp" ++ [(set (match_operand:VNx4SI 0 "register_operand" "=w, ?&w") ++ (ANY_PLUS:VNx4SI ++ (vec_duplicate:VNx4SI ++ (unspec:SI ++ [(match_operand 3) ++ (const_int SVE_KNOWN_PTRUE) ++ (match_operand: 2 "register_operand" "Upa, Upa")] ++ UNSPEC_CNTP)) ++ (match_operand:VNx4SI_ONLY 1 "register_operand" "0, w")))] + "TARGET_SVE" + "@ +- \t%0., %1/m, %0., %3. +- \t%0., %1/m, %0., %2. +- movprfx\t%0, %1/m, %2\;\t%0., %1/m, %0., %3." +- [(set_attr "movprfx" "*,*,yes")] ++ p\t%0.s, %2 ++ movprfx\t%0, %1\;p\t%0.s, %2" ++ "&& !CONSTANT_P (operands[3])" ++ { ++ operands[3] = CONSTM1_RTX (mode); ++ } ++ [(set_attr "movprfx" "*,yes")] + ) + +-;; Predicated floating-point operations with select matching first operand. 
+-(define_insn "*cond__2" +- [(set (match_operand:SVE_F 0 "register_operand" "=w, ?&w") +- (unspec:SVE_F +- [(match_operand: 1 "register_operand" "Upl, Upl") +- (unspec:SVE_F +- [(match_operand:SVE_F 2 "register_operand" "0, w") +- (match_operand:SVE_F 3 "register_operand" "w, w")] +- SVE_COND_FP_BINARY) +- (match_dup 2)] +- UNSPEC_SEL))] ++;; Increment a vector of HIs by the number of set bits in a predicate. ++;; See aarch64_sve_cntp for a description of the operands. ++(define_expand "@aarch64_sve__cntp" ++ [(set (match_operand:VNx8HI 0 "register_operand") ++ (ANY_PLUS:VNx8HI ++ (vec_duplicate:VNx8HI ++ (truncate:HI ++ (unspec:SI ++ [(match_dup 3) ++ (const_int SVE_KNOWN_PTRUE) ++ (match_operand: 2 "register_operand")] ++ UNSPEC_CNTP))) ++ (match_operand:VNx8HI_ONLY 1 "register_operand")))] + "TARGET_SVE" +- "@ +- \t%0., %1/m, %0., %3. +- movprfx\t%0, %2\;\t%0., %1/m, %0., %3." +- [(set_attr "movprfx" "*,yes")] ++ { ++ operands[3] = CONSTM1_RTX (mode); ++ } + ) + +-;; Predicated floating-point operations with select matching second operand. +-(define_insn "*cond__3" +- [(set (match_operand:SVE_F 0 "register_operand" "=w, ?&w") +- (unspec:SVE_F +- [(match_operand: 1 "register_operand" "Upl, Upl") +- (unspec:SVE_F +- [(match_operand:SVE_F 2 "register_operand" "w, w") +- (match_operand:SVE_F 3 "register_operand" "0, w")] +- SVE_COND_FP_BINARY) +- (match_dup 3)] +- UNSPEC_SEL))] ++(define_insn_and_rewrite "*aarch64_sve__cntp" ++ [(set (match_operand:VNx8HI 0 "register_operand" "=w, ?&w") ++ (ANY_PLUS:VNx8HI ++ (vec_duplicate:VNx8HI ++ (match_operator:HI 3 "subreg_lowpart_operator" ++ [(unspec:SI ++ [(match_operand 4) ++ (const_int SVE_KNOWN_PTRUE) ++ (match_operand: 2 "register_operand" "Upa, Upa")] ++ UNSPEC_CNTP)])) ++ (match_operand:VNx8HI_ONLY 1 "register_operand" "0, w")))] + "TARGET_SVE" + "@ +- \t%0., %1/m, %0., %2. +- movprfx\t%0, %3\;\t%0., %1/m, %0., %2." ++ p\t%0.h, %2 ++ movprfx\t%0, %1\;p\t%0.h, %2" ++ "&& !CONSTANT_P (operands[4])" ++ { ++ operands[4] = CONSTM1_RTX (mode); ++ } + [(set_attr "movprfx" "*,yes")] + ) + +-;; Predicated floating-point operations with select matching zero. +-(define_insn "*cond__z" +- [(set (match_operand:SVE_F 0 "register_operand" "=&w") +- (unspec:SVE_F +- [(match_operand: 1 "register_operand" "Upl") +- (unspec:SVE_F +- [(match_operand:SVE_F 2 "register_operand" "w") +- (match_operand:SVE_F 3 "register_operand" "w")] +- SVE_COND_FP_BINARY) +- (match_operand:SVE_F 4 "aarch64_simd_imm_zero")] +- UNSPEC_SEL))] ++;; ------------------------------------------------------------------------- ++;; ---- [INT] Decrement by the number of elements in a predicate (scalar) ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - DECP ++;; - SQDECP ++;; - UQDECP ++;; ------------------------------------------------------------------------- ++ ++;; Decrement a DImode register by the number of set bits in a predicate. ++;; See aarch64_sve_cntp for a description of the operands. ++(define_expand "@aarch64_sve__cntp" ++ [(set (match_operand:DI 0 "register_operand") ++ (ANY_MINUS:DI ++ (match_operand:DI_ONLY 1 "register_operand") ++ (zero_extend:DI ++ (unspec:SI [(match_dup 3) ++ (const_int SVE_KNOWN_PTRUE) ++ (match_operand:PRED_ALL 2 "register_operand")] ++ UNSPEC_CNTP))))] + "TARGET_SVE" +- "movprfx\t%0., %1/z, %2.\;\t%0., %1/m, %0., %3." +- [(set_attr "movprfx" "yes")] ++ { ++ operands[3] = CONSTM1_RTX (mode); ++ } + ) + +-;; Synthetic predication of floating-point operations with select unmatched. 
+-(define_insn_and_split "*cond__any" +- [(set (match_operand:SVE_F 0 "register_operand" "=&w") +- (unspec:SVE_F +- [(match_operand: 1 "register_operand" "Upl") +- (unspec:SVE_F +- [(match_operand:SVE_F 2 "register_operand" "w") +- (match_operand:SVE_F 3 "register_operand" "w")] +- SVE_COND_FP_BINARY) +- (match_operand:SVE_F 4 "register_operand" "w")] +- UNSPEC_SEL))] ++(define_insn_and_rewrite "*aarch64_sve__cntp" ++ [(set (match_operand:DI 0 "register_operand" "=r") ++ (ANY_MINUS:DI ++ (match_operand:DI_ONLY 1 "register_operand" "0") ++ (zero_extend:DI ++ (unspec:SI [(match_operand 3) ++ (const_int SVE_KNOWN_PTRUE) ++ (match_operand:PRED_ALL 2 "register_operand" "Upa")] ++ UNSPEC_CNTP))))] + "TARGET_SVE" +- "#" +- "&& reload_completed +- && !(rtx_equal_p (operands[0], operands[4]) +- || rtx_equal_p (operands[2], operands[4]) +- || rtx_equal_p (operands[3], operands[4]))" +- ; Not matchable by any one insn or movprfx insn. We need a separate select. +- [(set (match_dup 0) +- (unspec:SVE_F [(match_dup 1) (match_dup 2) (match_dup 4)] UNSPEC_SEL)) +- (set (match_dup 0) +- (unspec:SVE_F +- [(match_dup 1) +- (unspec:SVE_F [(match_dup 0) (match_dup 3)] SVE_COND_FP_BINARY) +- (match_dup 0)] +- UNSPEC_SEL))] ++ "p\t%x0, %2." ++ "&& !CONSTANT_P (operands[3])" ++ { ++ operands[3] = CONSTM1_RTX (mode); ++ } + ) + +-;; Predicated floating-point ternary operations with select. +-(define_expand "cond_" +- [(set (match_operand:SVE_F 0 "register_operand") +- (unspec:SVE_F +- [(match_operand: 1 "register_operand") +- (unspec:SVE_F +- [(match_operand:SVE_F 2 "register_operand") +- (match_operand:SVE_F 3 "register_operand") +- (match_operand:SVE_F 4 "register_operand")] +- SVE_COND_FP_TERNARY) +- (match_operand:SVE_F 5 "aarch64_simd_reg_or_zero")] +- UNSPEC_SEL))] +- "TARGET_SVE" +-{ +- /* Swap the multiplication operands if the fallback value is the +- second of the two. */ +- if (rtx_equal_p (operands[3], operands[5])) +- std::swap (operands[2], operands[3]); +-}) ++;; Decrement an SImode register by the number of set bits in a predicate ++;; using modular arithmetic. See aarch64_sve_cntp for a description of the ++;; operands. ++(define_insn_and_rewrite "*aarch64_decsi_cntp" ++ [(set (match_operand:SI 0 "register_operand" "=r") ++ (minus:SI ++ (match_operand:SI 1 "register_operand" "0") ++ (unspec:SI [(match_operand 3) ++ (const_int SVE_KNOWN_PTRUE) ++ (match_operand:PRED_ALL 2 "register_operand" "Upa")] ++ UNSPEC_CNTP)))] ++ "TARGET_SVE" ++ "decp\t%x0, %2." ++ "&& !CONSTANT_P (operands[3])" ++ { ++ operands[3] = CONSTM1_RTX (mode); ++ } ++) + +-;; Predicated floating-point ternary operations using the FMAD-like form. +-(define_insn "*cond__2" +- [(set (match_operand:SVE_F 0 "register_operand" "=w, ?&w") +- (unspec:SVE_F +- [(match_operand: 1 "register_operand" "Upl, Upl") +- (unspec:SVE_F +- [(match_operand:SVE_F 2 "register_operand" "0, w") +- (match_operand:SVE_F 3 "register_operand" "w, w") +- (match_operand:SVE_F 4 "register_operand" "w, w")] +- SVE_COND_FP_TERNARY) +- (match_dup 2)] +- UNSPEC_SEL))] ++;; Decrement an SImode register by the number of set bits in a predicate ++;; using saturating arithmetic, extending the result to 64 bits. ++;; ++;; See aarch64_sve_cntp for a description of the operands. 
++(define_expand "@aarch64_sve__cntp" ++ [(set (match_operand:DI 0 "register_operand") ++ (:DI ++ (SAT_MINUS:SI ++ (match_operand:SI_ONLY 1 "register_operand") ++ (unspec:SI [(match_dup 3) ++ (const_int SVE_KNOWN_PTRUE) ++ (match_operand:PRED_ALL 2 "register_operand")] ++ UNSPEC_CNTP))))] + "TARGET_SVE" +- "@ +- \t%0., %1/m, %3., %4. +- movprfx\t%0, %2\;\t%0., %1/m, %3., %4." +- [(set_attr "movprfx" "*,yes")] ++ { ++ operands[3] = CONSTM1_RTX (mode); ++ } + ) + +-;; Predicated floating-point ternary operations using the FMLA-like form. +-(define_insn "*cond__4" +- [(set (match_operand:SVE_F 0 "register_operand" "=w, ?&w") +- (unspec:SVE_F +- [(match_operand: 1 "register_operand" "Upl, Upl") +- (unspec:SVE_F +- [(match_operand:SVE_F 2 "register_operand" "w, w") +- (match_operand:SVE_F 3 "register_operand" "w, w") +- (match_operand:SVE_F 4 "register_operand" "0, w")] +- SVE_COND_FP_TERNARY) +- (match_dup 4)] +- UNSPEC_SEL))] ++(define_insn_and_rewrite "*aarch64_sve__cntp" ++ [(set (match_operand:DI 0 "register_operand" "=r") ++ (:DI ++ (SAT_MINUS:SI ++ (match_operand:SI_ONLY 1 "register_operand" "0") ++ (unspec:SI [(match_operand 3) ++ (const_int SVE_KNOWN_PTRUE) ++ (match_operand:PRED_ALL 2 "register_operand" "Upa")] ++ UNSPEC_CNTP))))] + "TARGET_SVE" +- "@ +- \t%0., %1/m, %2., %3. +- movprfx\t%0, %4\;\t%0., %1/m, %2., %3." +- [(set_attr "movprfx" "*,yes")] ++ { ++ if ( == SS_MINUS) ++ return "p\t%x0, %2., %w0"; ++ else ++ return "p\t%w0, %2."; ++ } ++ "&& !CONSTANT_P (operands[3])" ++ { ++ operands[3] = CONSTM1_RTX (mode); ++ } + ) + +-;; Predicated floating-point ternary operations in which the value for +-;; inactive lanes is distinct from the other inputs. +-(define_insn_and_split "*cond__any" +- [(set (match_operand:SVE_F 0 "register_operand" "=&w, &w, ?&w") +- (unspec:SVE_F +- [(match_operand: 1 "register_operand" "Upl, Upl, Upl") +- (unspec:SVE_F +- [(match_operand:SVE_F 2 "register_operand" "w, w, w") +- (match_operand:SVE_F 3 "register_operand" "w, w, w") +- (match_operand:SVE_F 4 "register_operand" "w, w, w")] +- SVE_COND_FP_TERNARY) +- (match_operand:SVE_F 5 "aarch64_simd_reg_or_zero" "Dz, 0, w")] +- UNSPEC_SEL))] +- "TARGET_SVE +- && !rtx_equal_p (operands[2], operands[5]) +- && !rtx_equal_p (operands[3], operands[5]) +- && !rtx_equal_p (operands[4], operands[5])" +- "@ +- movprfx\t%0., %1/z, %4.\;\t%0., %1/m, %2., %3. +- movprfx\t%0., %1/m, %4.\;\t%0., %1/m, %2., %3. +- #" +- "&& reload_completed +- && !CONSTANT_P (operands[5]) +- && !rtx_equal_p (operands[0], operands[5])" +- [(set (match_dup 0) +- (unspec:SVE_F [(match_dup 1) (match_dup 4) (match_dup 5)] UNSPEC_SEL)) +- (set (match_dup 0) +- (unspec:SVE_F +- [(match_dup 1) +- (unspec:SVE_F [(match_dup 2) (match_dup 3) (match_dup 0)] +- SVE_COND_FP_TERNARY) +- (match_dup 0)] +- UNSPEC_SEL))] +- "" +- [(set_attr "movprfx" "yes")] ++;; ------------------------------------------------------------------------- ++;; ---- [INT] Decrement by the number of elements in a predicate (vector) ++;; ------------------------------------------------------------------------- ++;; Includes: ++;; - DECP ++;; - SQDECP ++;; - UQDECP ++;; ------------------------------------------------------------------------- ++ ++;; Decrement a vector of DIs by the number of set bits in a predicate. ++;; See aarch64_sve_cntp for a description of the operands. 
++(define_expand "@aarch64_sve__cntp" ++ [(set (match_operand:VNx2DI 0 "register_operand") ++ (ANY_MINUS:VNx2DI ++ (match_operand:VNx2DI_ONLY 1 "register_operand") ++ (vec_duplicate:VNx2DI ++ (zero_extend:DI ++ (unspec:SI ++ [(match_dup 3) ++ (const_int SVE_KNOWN_PTRUE) ++ (match_operand: 2 "register_operand")] ++ UNSPEC_CNTP)))))] ++ "TARGET_SVE" ++ { ++ operands[3] = CONSTM1_RTX (mode); ++ } + ) + +-;; Shift an SVE vector left and insert a scalar into element 0. +-(define_insn "vec_shl_insert_" +- [(set (match_operand:SVE_ALL 0 "register_operand" "=w, w") +- (unspec:SVE_ALL +- [(match_operand:SVE_ALL 1 "register_operand" "0, 0") +- (match_operand: 2 "register_operand" "rZ, w")] +- UNSPEC_INSR))] ++(define_insn_and_rewrite "*aarch64_sve__cntp" ++ [(set (match_operand:VNx2DI 0 "register_operand" "=w, ?&w") ++ (ANY_MINUS:VNx2DI ++ (match_operand:VNx2DI_ONLY 1 "register_operand" "0, w") ++ (vec_duplicate:VNx2DI ++ (zero_extend:DI ++ (unspec:SI ++ [(match_operand 3) ++ (const_int SVE_KNOWN_PTRUE) ++ (match_operand: 2 "register_operand" "Upa, Upa")] ++ UNSPEC_CNTP)))))] + "TARGET_SVE" + "@ +- insr\t%0., %2 +- insr\t%0., %2" ++ p\t%0.d, %2 ++ movprfx\t%0, %1\;p\t%0.d, %2" ++ "&& !CONSTANT_P (operands[3])" ++ { ++ operands[3] = CONSTM1_RTX (mode); ++ } ++ [(set_attr "movprfx" "*,yes")] + ) + +-(define_expand "copysign3" +- [(match_operand:SVE_F 0 "register_operand") +- (match_operand:SVE_F 1 "register_operand") +- (match_operand:SVE_F 2 "register_operand")] ++;; Decrement a vector of SIs by the number of set bits in a predicate. ++;; See aarch64_sve_cntp for a description of the operands. ++(define_expand "@aarch64_sve__cntp" ++ [(set (match_operand:VNx4SI 0 "register_operand") ++ (ANY_MINUS:VNx4SI ++ (match_operand:VNx4SI_ONLY 1 "register_operand") ++ (vec_duplicate:VNx4SI ++ (unspec:SI ++ [(match_dup 3) ++ (const_int SVE_KNOWN_PTRUE) ++ (match_operand: 2 "register_operand")] ++ UNSPEC_CNTP))))] + "TARGET_SVE" + { +- rtx sign = gen_reg_rtx (mode); +- rtx mant = gen_reg_rtx (mode); +- rtx int_res = gen_reg_rtx (mode); +- int bits = GET_MODE_UNIT_BITSIZE (mode) - 1; +- +- rtx arg1 = lowpart_subreg (mode, operands[1], mode); +- rtx arg2 = lowpart_subreg (mode, operands[2], mode); +- +- emit_insn (gen_and3 +- (sign, arg2, +- aarch64_simd_gen_const_vector_dup (mode, +- HOST_WIDE_INT_M1U +- << bits))); +- emit_insn (gen_and3 +- (mant, arg1, +- aarch64_simd_gen_const_vector_dup (mode, +- ~(HOST_WIDE_INT_M1U +- << bits)))); +- emit_insn (gen_ior3 (int_res, sign, mant)); +- emit_move_insn (operands[0], gen_lowpart (mode, int_res)); +- DONE; ++ operands[3] = CONSTM1_RTX (mode); + } + ) + +-(define_expand "xorsign3" +- [(match_operand:SVE_F 0 "register_operand") +- (match_operand:SVE_F 1 "register_operand") +- (match_operand:SVE_F 2 "register_operand")] ++(define_insn_and_rewrite "*aarch64_sve__cntp" ++ [(set (match_operand:VNx4SI 0 "register_operand" "=w, ?&w") ++ (ANY_MINUS:VNx4SI ++ (match_operand:VNx4SI_ONLY 1 "register_operand" "0, w") ++ (vec_duplicate:VNx4SI ++ (unspec:SI ++ [(match_operand 3) ++ (const_int SVE_KNOWN_PTRUE) ++ (match_operand: 2 "register_operand" "Upa, Upa")] ++ UNSPEC_CNTP))))] + "TARGET_SVE" ++ "@ ++ p\t%0.s, %2 ++ movprfx\t%0, %1\;p\t%0.s, %2" ++ "&& !CONSTANT_P (operands[3])" + { +- rtx sign = gen_reg_rtx (mode); +- rtx int_res = gen_reg_rtx (mode); +- int bits = GET_MODE_UNIT_BITSIZE (mode) - 1; +- +- rtx arg1 = lowpart_subreg (mode, operands[1], mode); +- rtx arg2 = lowpart_subreg (mode, operands[2], mode); ++ operands[3] = CONSTM1_RTX (mode); ++ } ++ [(set_attr "movprfx" 
"*,yes")] ++) + +- emit_insn (gen_and3 +- (sign, arg2, +- aarch64_simd_gen_const_vector_dup (mode, +- HOST_WIDE_INT_M1U +- << bits))); +- emit_insn (gen_xor3 (int_res, arg1, sign)); +- emit_move_insn (operands[0], gen_lowpart (mode, int_res)); +- DONE; ++;; Decrement a vector of HIs by the number of set bits in a predicate. ++;; See aarch64_sve_cntp for a description of the operands. ++(define_expand "@aarch64_sve__cntp" ++ [(set (match_operand:VNx8HI 0 "register_operand") ++ (ANY_MINUS:VNx8HI ++ (match_operand:VNx8HI_ONLY 1 "register_operand") ++ (vec_duplicate:VNx8HI ++ (truncate:HI ++ (unspec:SI ++ [(match_dup 3) ++ (const_int SVE_KNOWN_PTRUE) ++ (match_operand: 2 "register_operand")] ++ UNSPEC_CNTP)))))] ++ "TARGET_SVE" ++ { ++ operands[3] = CONSTM1_RTX (mode); + } + ) + +-;; Unpredicated DOT product. +-(define_insn "dot_prod" +- [(set (match_operand:SVE_SDI 0 "register_operand" "=w, ?&w") +- (plus:SVE_SDI +- (unspec:SVE_SDI +- [(match_operand: 1 "register_operand" "w, w") +- (match_operand: 2 "register_operand" "w, w")] +- DOTPROD) +- (match_operand:SVE_SDI 3 "register_operand" "0, w")))] ++(define_insn_and_rewrite "*aarch64_sve__cntp" ++ [(set (match_operand:VNx8HI 0 "register_operand" "=w, ?&w") ++ (ANY_MINUS:VNx8HI ++ (match_operand:VNx8HI_ONLY 1 "register_operand" "0, w") ++ (vec_duplicate:VNx8HI ++ (match_operator:HI 3 "subreg_lowpart_operator" ++ [(unspec:SI ++ [(match_operand 4) ++ (const_int SVE_KNOWN_PTRUE) ++ (match_operand: 2 "register_operand" "Upa, Upa")] ++ UNSPEC_CNTP)]))))] + "TARGET_SVE" + "@ +- dot\\t%0., %1., %2. +- movprfx\t%0, %3\;dot\\t%0., %1., %2." ++ p\t%0.h, %2 ++ movprfx\t%0, %1\;p\t%0.h, %2" ++ "&& !CONSTANT_P (operands[4])" ++ { ++ operands[4] = CONSTM1_RTX (mode); ++ } + [(set_attr "movprfx" "*,yes")] + ) +diff --git a/gcc/config/aarch64/aarch64-tune.md b/gcc/config/aarch64/aarch64-tune.md +index 2b1ec85ae..a6a14b7fc 100644 +--- a/gcc/config/aarch64/aarch64-tune.md ++++ b/gcc/config/aarch64/aarch64-tune.md +@@ -1,5 +1,5 @@ + ;; -*- buffer-read-only: t -*- + ;; Generated automatically by gentune.sh from aarch64-cores.def + (define_attr "tune" +- "cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,ares,neoversen1,neoversee1,tsv110,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55" ++ "cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa65,cortexa65ae,ares,neoversen1,neoversee1,tsv110,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55" + (const (symbol_ref "((enum attr_tune) aarch64_tune)"))) +diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c +index 5dfcaf57e..ee85bb4e2 100644 +--- a/gcc/config/aarch64/aarch64.c ++++ b/gcc/config/aarch64/aarch64.c +@@ -73,6 +73,8 @@ + #include "selftest-rtl.h" + #include "rtx-vector-builder.h" + #include "intl.h" ++#include "expmed.h" ++#include "function-abi.h" + + /* This file should be included last. */ + #include "target-def.h" +@@ -83,7 +85,7 @@ + /* Information about a legitimate vector immediate operand. 
*/ + struct simd_immediate_info + { +- enum insn_type { MOV, MVN }; ++ enum insn_type { MOV, MVN, INDEX, PTRUE }; + enum modifier_type { LSL, MSL }; + + simd_immediate_info () {} +@@ -92,33 +94,51 @@ struct simd_immediate_info + insn_type = MOV, modifier_type = LSL, + unsigned int = 0); + simd_immediate_info (scalar_mode, rtx, rtx); ++ simd_immediate_info (scalar_int_mode, aarch64_svpattern); + + /* The mode of the elements. */ + scalar_mode elt_mode; + +- /* The value of each element if all elements are the same, or the +- first value if the constant is a series. */ +- rtx value; +- +- /* The value of the step if the constant is a series, null otherwise. */ +- rtx step; +- + /* The instruction to use to move the immediate into a vector. */ + insn_type insn; + +- /* The kind of shift modifier to use, and the number of bits to shift. +- This is (LSL, 0) if no shift is needed. */ +- modifier_type modifier; +- unsigned int shift; ++ union ++ { ++ /* For MOV and MVN. */ ++ struct ++ { ++ /* The value of each element. */ ++ rtx value; ++ ++ /* The kind of shift modifier to use, and the number of bits to shift. ++ This is (LSL, 0) if no shift is needed. */ ++ modifier_type modifier; ++ unsigned int shift; ++ } mov; ++ ++ /* For INDEX. */ ++ struct ++ { ++ /* The value of the first element and the step to be added for each ++ subsequent element. */ ++ rtx base, step; ++ } index; ++ ++ /* For PTRUE. */ ++ aarch64_svpattern pattern; ++ } u; + }; + + /* Construct a floating-point immediate in which each element has mode + ELT_MODE_IN and value VALUE_IN. */ + inline simd_immediate_info + ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in) +- : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV), +- modifier (LSL), shift (0) +-{} ++ : elt_mode (elt_mode_in), insn (MOV) ++{ ++ u.mov.value = value_in; ++ u.mov.modifier = LSL; ++ u.mov.shift = 0; ++} + + /* Construct an integer immediate in which each element has mode ELT_MODE_IN + and value VALUE_IN. The other parameters are as for the structure +@@ -128,17 +148,32 @@ inline simd_immediate_info + unsigned HOST_WIDE_INT value_in, + insn_type insn_in, modifier_type modifier_in, + unsigned int shift_in) +- : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)), +- step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in) +-{} ++ : elt_mode (elt_mode_in), insn (insn_in) ++{ ++ u.mov.value = gen_int_mode (value_in, elt_mode_in); ++ u.mov.modifier = modifier_in; ++ u.mov.shift = shift_in; ++} + + /* Construct an integer immediate in which each element has mode ELT_MODE_IN +- and where element I is equal to VALUE_IN + I * STEP_IN. */ ++ and where element I is equal to BASE_IN + I * STEP_IN. */ ++inline simd_immediate_info ++::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in) ++ : elt_mode (elt_mode_in), insn (INDEX) ++{ ++ u.index.base = base_in; ++ u.index.step = step_in; ++} ++ ++/* Construct a predicate that controls elements of mode ELT_MODE_IN ++ and has PTRUE pattern PATTERN_IN. */ + inline simd_immediate_info +-::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in) +- : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV), +- modifier (LSL), shift (0) +-{} ++::simd_immediate_info (scalar_int_mode elt_mode_in, ++ aarch64_svpattern pattern_in) ++ : elt_mode (elt_mode_in), insn (PTRUE) ++{ ++ u.pattern = pattern_in; ++} + + /* The current code model. 
*/ + enum aarch64_code_model aarch64_cmodel; +@@ -177,7 +212,7 @@ unsigned aarch64_architecture_version; + enum aarch64_processor aarch64_tune = cortexa53; + + /* Mask to specify which instruction scheduling options should be used. */ +-unsigned long aarch64_tune_flags = 0; ++uint64_t aarch64_tune_flags = 0; + + /* Global flag for PC relative loads. */ + bool aarch64_pcrelative_literal_loads; +@@ -693,7 +728,7 @@ static const struct tune_params generic_tunings = + 4, /* memmov_cost */ + 2, /* issue_rate */ + (AARCH64_FUSE_AES_AESMC), /* fusible_ops */ +- "8", /* function_align. */ ++ "16:12", /* function_align. */ + "4", /* jump_align. */ + "8", /* loop_align. */ + 2, /* int_reassoc_width. */ +@@ -1139,7 +1174,7 @@ struct processor + enum aarch64_processor sched_core; + enum aarch64_arch arch; + unsigned architecture_version; +- const unsigned long flags; ++ const uint64_t flags; + const struct tune_params *const tune; + }; + +@@ -1172,15 +1207,46 @@ static const struct processor *selected_arch; + static const struct processor *selected_cpu; + static const struct processor *selected_tune; + ++enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A; ++ + /* The current tuning set. */ + struct tune_params aarch64_tune_params = generic_tunings; + ++/* Check whether an 'aarch64_vector_pcs' attribute is valid. */ ++ ++static tree ++handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree, ++ int, bool *no_add_attrs) ++{ ++ /* Since we set fn_type_req to true, the caller should have checked ++ this for us. */ ++ gcc_assert (FUNC_OR_METHOD_TYPE_P (*node)); ++ switch ((arm_pcs) fntype_abi (*node).id ()) ++ { ++ case ARM_PCS_AAPCS64: ++ case ARM_PCS_SIMD: ++ return NULL_TREE; ++ ++ case ARM_PCS_SVE: ++ error ("the %qE attribute cannot be applied to an SVE function type", ++ name); ++ *no_add_attrs = true; ++ return NULL_TREE; ++ ++ case ARM_PCS_TLSDESC: ++ case ARM_PCS_UNKNOWN: ++ break; ++ } ++ gcc_unreachable (); ++} ++ + /* Table of machine attributes. 
*/ + static const struct attribute_spec aarch64_attribute_table[] = + { + /* { name, min_len, max_len, decl_req, type_req, fn_type_req, + affects_type_identity, handler, exclude } */ +- { "aarch64_vector_pcs", 0, 0, false, true, true, true, NULL, NULL }, ++ { "aarch64_vector_pcs", 0, 0, false, true, true, true, ++ handle_aarch64_vector_pcs_attribute, NULL }, + { NULL, 0, 0, false, false, false, false, NULL, NULL } + }; + +@@ -1241,6 +1307,7 @@ static enum aarch64_parse_opt_result + aarch64_handle_standard_branch_protection (char* str, char* rest) + { + aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF; ++ aarch64_ra_sign_key = AARCH64_KEY_A; + aarch64_enable_bti = 1; + if (rest) + { +@@ -1255,6 +1322,7 @@ aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED, + char* rest ATTRIBUTE_UNUSED) + { + aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF; ++ aarch64_ra_sign_key = AARCH64_KEY_A; + return AARCH64_PARSE_OK; + } + +@@ -1266,6 +1334,14 @@ aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED, + return AARCH64_PARSE_OK; + } + ++static enum aarch64_parse_opt_result ++aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED, ++ char* rest ATTRIBUTE_UNUSED) ++{ ++ aarch64_ra_sign_key = AARCH64_KEY_B; ++ return AARCH64_PARSE_OK; ++} ++ + static enum aarch64_parse_opt_result + aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED, + char* rest ATTRIBUTE_UNUSED) +@@ -1276,6 +1352,7 @@ aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED, + + static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = { + { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 }, ++ { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 }, + { NULL, NULL, NULL, 0 } + }; + +@@ -1295,6 +1372,66 @@ static const char * const aarch64_condition_codes[] = + "hi", "ls", "ge", "lt", "gt", "le", "al", "nv" + }; + ++/* The preferred condition codes for SVE conditions. */ ++static const char *const aarch64_sve_condition_codes[] = ++{ ++ "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc", ++ "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv" ++}; ++ ++/* Return the assembly token for svpattern value VALUE. */ ++ ++static const char * ++svpattern_token (enum aarch64_svpattern pattern) ++{ ++ switch (pattern) ++ { ++#define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER; ++ AARCH64_FOR_SVPATTERN (CASE) ++#undef CASE ++ case AARCH64_NUM_SVPATTERNS: ++ break; ++ } ++ gcc_unreachable (); ++} ++ ++/* Return the descriptor of the SIMD ABI. */ ++ ++static const predefined_function_abi & ++aarch64_simd_abi (void) ++{ ++ predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD]; ++ if (!simd_abi.initialized_p ()) ++ { ++ HARD_REG_SET full_reg_clobbers ++ = default_function_abi.full_reg_clobbers (); ++ for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) ++ if (FP_SIMD_SAVED_REGNUM_P (regno)) ++ CLEAR_HARD_REG_BIT (full_reg_clobbers, regno); ++ simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers); ++ } ++ return simd_abi; ++} ++ ++/* Return the descriptor of the SVE PCS. 
*/ ++ ++static const predefined_function_abi & ++aarch64_sve_abi (void) ++{ ++ predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE]; ++ if (!sve_abi.initialized_p ()) ++ { ++ HARD_REG_SET full_reg_clobbers ++ = default_function_abi.full_reg_clobbers (); ++ for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno) ++ CLEAR_HARD_REG_BIT (full_reg_clobbers, regno); ++ for (int regno = P4_REGNUM; regno <= P11_REGNUM; ++regno) ++ CLEAR_HARD_REG_BIT (full_reg_clobbers, regno); ++ sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers); ++ } ++ return sve_abi; ++} ++ + /* Generate code to enable conditional branches in functions over 1 MiB. */ + const char * + aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest, +@@ -1337,6 +1474,14 @@ aarch64_err_no_fpadvsimd (machine_mode mode) + " vector types", "+nofp"); + } + ++/* Return true if REGNO is P0-P15 or one of the special FFR-related ++ registers. */ ++inline bool ++pr_or_ffr_regnum_p (unsigned int regno) ++{ ++ return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM; ++} ++ + /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS. + The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and + GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much +@@ -1413,6 +1558,16 @@ aarch64_dbx_register_number (unsigned regno) + return DWARF_FRAME_REGISTERS; + } + ++/* If X is a CONST_DOUBLE, return its bit representation as a constant ++ integer, otherwise return X unmodified. */ ++static rtx ++aarch64_bit_representation (rtx x) ++{ ++ if (CONST_DOUBLE_P (x)) ++ x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x); ++ return x; ++} ++ + /* Return true if MODE is any of the Advanced SIMD structure modes. */ + static bool + aarch64_advsimd_struct_mode_p (machine_mode mode) +@@ -1439,6 +1594,9 @@ const unsigned int VEC_SVE_PRED = 4; + /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate + a structure of 2, 3 or 4 vectors. */ + const unsigned int VEC_STRUCT = 8; ++/* Can be used in combination with VEC_SVE_DATA to indicate that the ++ vector has fewer significant bytes than a full SVE vector. */ ++const unsigned int VEC_PARTIAL = 16; + /* Useful combinations of the above. */ + const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED; + const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA; +@@ -1454,34 +1612,84 @@ aarch64_classify_vector_mode (machine_mode mode) + if (aarch64_sve_pred_mode_p (mode)) + return VEC_SVE_PRED; + +- scalar_mode inner = GET_MODE_INNER (mode); +- if (VECTOR_MODE_P (mode) +- && (inner == QImode +- || inner == HImode +- || inner == HFmode +- || inner == SImode +- || inner == SFmode +- || inner == DImode +- || inner == DFmode)) +- { +- if (TARGET_SVE) +- { +- if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR)) +- return VEC_SVE_DATA; +- if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2) +- || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3) +- || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4)) +- return VEC_SVE_DATA | VEC_STRUCT; +- } ++ /* Make the decision based on the mode's enum value rather than its ++ properties, so that we keep the correct classification regardless ++ of -msve-vector-bits. */ ++ switch (mode) ++ { ++ /* Partial SVE QI vectors. */ ++ case E_VNx2QImode: ++ case E_VNx4QImode: ++ case E_VNx8QImode: ++ /* Partial SVE HI vectors. */ ++ case E_VNx2HImode: ++ case E_VNx4HImode: ++ /* Partial SVE SI vector. */ ++ case E_VNx2SImode: ++ return TARGET_SVE ? 
VEC_SVE_DATA | VEC_PARTIAL : 0; ++ ++ case E_VNx16QImode: ++ case E_VNx8HImode: ++ case E_VNx4SImode: ++ case E_VNx2DImode: ++ case E_VNx8BFmode: ++ case E_VNx8HFmode: ++ case E_VNx4SFmode: ++ case E_VNx2DFmode: ++ return TARGET_SVE ? VEC_SVE_DATA : 0; ++ ++ /* x2 SVE vectors. */ ++ case E_VNx32QImode: ++ case E_VNx16HImode: ++ case E_VNx8SImode: ++ case E_VNx4DImode: ++ case E_VNx16BFmode: ++ case E_VNx16HFmode: ++ case E_VNx8SFmode: ++ case E_VNx4DFmode: ++ /* x3 SVE vectors. */ ++ case E_VNx48QImode: ++ case E_VNx24HImode: ++ case E_VNx12SImode: ++ case E_VNx6DImode: ++ case E_VNx24BFmode: ++ case E_VNx24HFmode: ++ case E_VNx12SFmode: ++ case E_VNx6DFmode: ++ /* x4 SVE vectors. */ ++ case E_VNx64QImode: ++ case E_VNx32HImode: ++ case E_VNx16SImode: ++ case E_VNx8DImode: ++ case E_VNx32BFmode: ++ case E_VNx32HFmode: ++ case E_VNx16SFmode: ++ case E_VNx8DFmode: ++ return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0; ++ ++ /* 64-bit Advanced SIMD vectors. */ ++ case E_V8QImode: ++ case E_V4HImode: ++ case E_V2SImode: ++ /* ...E_V1DImode doesn't exist. */ ++ case E_V4HFmode: ++ case E_V4BFmode: ++ case E_V2SFmode: ++ case E_V1DFmode: ++ /* 128-bit Advanced SIMD vectors. */ ++ case E_V16QImode: ++ case E_V8HImode: ++ case E_V4SImode: ++ case E_V2DImode: ++ case E_V8HFmode: ++ case E_V8BFmode: ++ case E_V4SFmode: ++ case E_V2DFmode: ++ return TARGET_SIMD ? VEC_ADVSIMD : 0; + +- /* This includes V1DF but not V1DI (which doesn't exist). */ +- if (TARGET_SIMD +- && (known_eq (GET_MODE_BITSIZE (mode), 64) +- || known_eq (GET_MODE_BITSIZE (mode), 128))) +- return VEC_ADVSIMD; ++ default: ++ return 0; + } +- +- return 0; + } + + /* Return true if MODE is any of the data vector modes, including +@@ -1492,6 +1700,14 @@ aarch64_vector_data_mode_p (machine_mode mode) + return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA; + } + ++/* Return true if MODE is any form of SVE mode, including predicates, ++ vectors and structures. */ ++bool ++aarch64_sve_mode_p (machine_mode mode) ++{ ++ return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE; ++} ++ + /* Return true if MODE is an SVE data vector mode; either a single vector + or a structure of vectors. */ + static bool +@@ -1500,6 +1716,24 @@ aarch64_sve_data_mode_p (machine_mode mode) + return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA; + } + ++/* Return the number of defined bytes in one constituent vector of ++ SVE mode MODE, which has vector flags VEC_FLAGS. */ ++static poly_int64 ++aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags) ++{ ++ if (vec_flags & VEC_PARTIAL) ++ /* A single partial vector. */ ++ return GET_MODE_SIZE (mode); ++ ++ if (vec_flags & VEC_SVE_DATA) ++ /* A single vector or a tuple. */ ++ return BYTES_PER_SVE_VECTOR; ++ ++ /* A single predicate. */ ++ gcc_assert (vec_flags & VEC_SVE_PRED); ++ return BYTES_PER_SVE_PRED; ++} ++ + /* Implement target hook TARGET_ARRAY_MODE. */ + static opt_machine_mode + aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems) +@@ -1582,6 +1816,43 @@ aarch64_vectorize_related_mode (machine_mode vector_mode, + return default_vectorize_related_mode (vector_mode, element_mode, nunits); + } + ++/* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */ ++ ++opt_machine_mode ++aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits) ++{ ++ enum mode_class mclass = (is_a (inner_mode) ++ ? 
MODE_VECTOR_FLOAT : MODE_VECTOR_INT); ++ machine_mode mode; ++ FOR_EACH_MODE_IN_CLASS (mode, mclass) ++ if (inner_mode == GET_MODE_INNER (mode) ++ && known_eq (nunits, GET_MODE_NUNITS (mode)) ++ && aarch64_sve_data_mode_p (mode)) ++ return mode; ++ return opt_machine_mode (); ++} ++ ++/* Return the integer element mode associated with SVE mode MODE. */ ++ ++static scalar_int_mode ++aarch64_sve_element_int_mode (machine_mode mode) ++{ ++ unsigned int elt_bits = vector_element_size (BITS_PER_SVE_VECTOR, ++ GET_MODE_NUNITS (mode)); ++ return int_mode_for_size (elt_bits, 0).require (); ++} ++ ++/* Return the integer vector mode associated with SVE mode MODE. ++ Unlike mode_for_int_vector, this can handle the case in which ++ MODE is a predicate (and thus has a different total size). */ ++ ++machine_mode ++aarch64_sve_int_mode (machine_mode mode) ++{ ++ scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode); ++ return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require (); ++} ++ + /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations, + prefer to use the first arithmetic operand as the else value if + the else value doesn't matter, since that exactly matches the SVE +@@ -1610,13 +1881,19 @@ aarch64_hard_regno_nregs (unsigned regno, machine_mode mode) + { + case FP_REGS: + case FP_LO_REGS: +- if (aarch64_sve_data_mode_p (mode)) +- return exact_div (GET_MODE_SIZE (mode), +- BYTES_PER_SVE_VECTOR).to_constant (); +- return CEIL (lowest_size, UNITS_PER_VREG); ++ case FP_LO8_REGS: ++ { ++ unsigned int vec_flags = aarch64_classify_vector_mode (mode); ++ if (vec_flags & VEC_SVE_DATA) ++ return exact_div (GET_MODE_SIZE (mode), ++ aarch64_vl_bytes (mode, vec_flags)).to_constant (); ++ return CEIL (lowest_size, UNITS_PER_VREG); ++ } + case PR_REGS: + case PR_LO_REGS: + case PR_HI_REGS: ++ case FFR_REGS: ++ case PR_AND_FFR_REGS: + return 1; + default: + return CEIL (lowest_size, UNITS_PER_WORD); +@@ -1637,11 +1914,16 @@ aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode) + return mode == DImode; + + unsigned int vec_flags = aarch64_classify_vector_mode (mode); ++ /* At the moment, partial vector modes are only useful for memory ++ references, but that could change in future. */ ++ if (vec_flags & VEC_PARTIAL) ++ return false; ++ + if (vec_flags & VEC_SVE_PRED) +- return PR_REGNUM_P (regno); ++ return pr_or_ffr_regnum_p (regno); + +- if (PR_REGNUM_P (regno)) +- return 0; ++ if (pr_or_ffr_regnum_p (regno)) ++ return false; + + if (regno == SP_REGNUM) + /* The purpose of comparing with ptr_mode is to support the +@@ -1670,102 +1952,184 @@ aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode) + return false; + } + +-/* Return true if this is a definition of a vectorized simd function. */ ++/* Return true if TYPE is a type that should be passed or returned in ++ SVE registers, assuming enough registers are available. When returning ++ true, set *NUM_ZR and *NUM_PR to the number of required Z and P registers ++ respectively. */ + + static bool +-aarch64_simd_decl_p (tree fndecl) ++aarch64_sve_argument_p (const_tree type, unsigned int *num_zr, ++ unsigned int *num_pr) + { +- tree fntype; +- +- if (fndecl == NULL) +- return false; +- fntype = TREE_TYPE (fndecl); +- if (fntype == NULL) +- return false; ++ if (aarch64_sve::svbool_type_p (type)) ++ { ++ *num_pr = 1; ++ *num_zr = 0; ++ return true; ++ } + +- /* Functions with the aarch64_vector_pcs attribute use the simd ABI. 
*/ +- if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL) +- return true; ++ if (unsigned int nvectors = aarch64_sve::nvectors_if_data_type (type)) ++ { ++ *num_pr = 0; ++ *num_zr = nvectors; ++ return true; ++ } + + return false; + } + +-/* Return the mode a register save/restore should use. DImode for integer +- registers, DFmode for FP registers in non-SIMD functions (they only save +- the bottom half of a 128 bit register), or TFmode for FP registers in +- SIMD functions. */ ++/* Return true if a function with type FNTYPE returns its value in ++ SVE vector or predicate registers. */ + +-static machine_mode +-aarch64_reg_save_mode (tree fndecl, unsigned regno) ++static bool ++aarch64_returns_value_in_sve_regs_p (const_tree fntype) + { +- return GP_REGNUM_P (regno) +- ? E_DImode +- : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode); ++ unsigned int num_zr, num_pr; ++ tree return_type = TREE_TYPE (fntype); ++ return (return_type != error_mark_node ++ && aarch64_sve_argument_p (return_type, &num_zr, &num_pr)); + } + +-/* Return true if the instruction is a call to a SIMD function, false +- if it is not a SIMD function or if we do not know anything about +- the function. */ ++/* Return true if a function with type FNTYPE takes arguments in ++ SVE vector or predicate registers. */ + + static bool +-aarch64_simd_call_p (rtx_insn *insn) ++aarch64_takes_arguments_in_sve_regs_p (const_tree fntype) + { +- rtx symbol; +- rtx call; +- tree fndecl; ++ CUMULATIVE_ARGS args_so_far_v; ++ aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX, ++ NULL_TREE, 0, true); ++ cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v); + +- gcc_assert (CALL_P (insn)); +- call = get_call_rtx_from (insn); +- symbol = XEXP (XEXP (call, 0), 0); +- if (GET_CODE (symbol) != SYMBOL_REF) +- return false; +- fndecl = SYMBOL_REF_DECL (symbol); +- if (!fndecl) +- return false; ++ for (tree chain = TYPE_ARG_TYPES (fntype); ++ chain && chain != void_list_node; ++ chain = TREE_CHAIN (chain)) ++ { ++ tree arg_type = TREE_VALUE (chain); ++ if (arg_type == error_mark_node) ++ return false; ++ ++ function_arg_info arg (arg_type, /*named=*/true); ++ apply_pass_by_reference_rules (&args_so_far_v, arg); ++ unsigned int num_zr, num_pr; ++ if (aarch64_sve_argument_p (arg.type, &num_zr, &num_pr)) ++ return true; + +- return aarch64_simd_decl_p (fndecl); ++ targetm.calls.function_arg_advance (args_so_far, arg); ++ } ++ return false; + } + +-/* Implement TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS. If INSN calls +- a function that uses the SIMD ABI, take advantage of the extra +- call-preserved registers that the ABI provides. */ ++/* Implement TARGET_FNTYPE_ABI. */ + +-void +-aarch64_remove_extra_call_preserved_regs (rtx_insn *insn, +- HARD_REG_SET *return_set) ++static const predefined_function_abi & ++aarch64_fntype_abi (const_tree fntype) + { +- if (aarch64_simd_call_p (insn)) +- { +- for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) +- if (FP_SIMD_SAVED_REGNUM_P (regno)) +- CLEAR_HARD_REG_BIT (*return_set, regno); +- } ++ if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype))) ++ return aarch64_simd_abi (); ++ ++ if (aarch64_returns_value_in_sve_regs_p (fntype) ++ || aarch64_takes_arguments_in_sve_regs_p (fntype)) ++ return aarch64_sve_abi (); ++ ++ return default_function_abi; + } + +-/* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves +- the lower 64 bits of a 128-bit register. 
Tell the compiler the callee +- clobbers the top 64 bits when restoring the bottom 64 bits. */ ++/* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P. */ + + static bool +-aarch64_hard_regno_call_part_clobbered (rtx_insn *insn, unsigned int regno, +- machine_mode mode) ++aarch64_compatible_vector_types_p (const_tree type1, const_tree type2) ++{ ++ return (aarch64_sve::builtin_type_p (type1) ++ == aarch64_sve::builtin_type_p (type2)); ++} ++ ++/* Return true if we should emit CFI for register REGNO. */ ++ ++static bool ++aarch64_emit_cfi_for_reg_p (unsigned int regno) + { +- bool simd_p = insn && CALL_P (insn) && aarch64_simd_call_p (insn); +- return FP_REGNUM_P (regno) +- && maybe_gt (GET_MODE_SIZE (mode), simd_p ? 16 : 8); ++ return (GP_REGNUM_P (regno) ++ || !default_function_abi.clobbers_full_reg_p (regno)); + } + +-/* Implement TARGET_RETURN_CALL_WITH_MAX_CLOBBERS. */ ++/* Return the mode we should use to save and restore register REGNO. */ + +-rtx_insn * +-aarch64_return_call_with_max_clobbers (rtx_insn *call_1, rtx_insn *call_2) ++static machine_mode ++aarch64_reg_save_mode (unsigned int regno) + { +- gcc_assert (CALL_P (call_1) && CALL_P (call_2)); ++ if (GP_REGNUM_P (regno)) ++ return DImode; + +- if (!aarch64_simd_call_p (call_1) || aarch64_simd_call_p (call_2)) +- return call_1; +- else +- return call_2; ++ if (FP_REGNUM_P (regno)) ++ switch (crtl->abi->id ()) ++ { ++ case ARM_PCS_AAPCS64: ++ /* Only the low 64 bits are saved by the base PCS. */ ++ return DFmode; ++ ++ case ARM_PCS_SIMD: ++ /* The vector PCS saves the low 128 bits (which is the full ++ register on non-SVE targets). */ ++ return TFmode; ++ ++ case ARM_PCS_SVE: ++ /* Use vectors of DImode for registers that need frame ++ information, so that the first 64 bytes of the save slot ++ are always the equivalent of what storing D would give. */ ++ if (aarch64_emit_cfi_for_reg_p (regno)) ++ return VNx2DImode; ++ ++ /* Use vectors of bytes otherwise, so that the layout is ++ endian-agnostic, and so that we can use LDR and STR for ++ big-endian targets. */ ++ return VNx16QImode; ++ ++ case ARM_PCS_TLSDESC: ++ case ARM_PCS_UNKNOWN: ++ break; ++ } ++ ++ if (PR_REGNUM_P (regno)) ++ /* Save the full predicate register. */ ++ return VNx16BImode; ++ ++ gcc_unreachable (); ++} ++ ++/* Implement TARGET_INSN_CALLEE_ABI. */ ++ ++const predefined_function_abi & ++aarch64_insn_callee_abi (const rtx_insn *insn) ++{ ++ rtx pat = PATTERN (insn); ++ gcc_assert (GET_CODE (pat) == PARALLEL); ++ rtx unspec = XVECEXP (pat, 0, 1); ++ gcc_assert (GET_CODE (unspec) == UNSPEC ++ && XINT (unspec, 1) == UNSPEC_CALLEE_ABI); ++ return function_abis[INTVAL (XVECEXP (unspec, 0, 0))]; ++} ++ ++/* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves ++ the lower 64 bits of a 128-bit register. Tell the compiler the callee ++ clobbers the top 64 bits when restoring the bottom 64 bits. */ ++ ++static bool ++aarch64_hard_regno_call_part_clobbered (unsigned int abi_id, ++ unsigned int regno, ++ machine_mode mode) ++{ ++ if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE) ++ { ++ poly_int64 per_register_size = GET_MODE_SIZE (mode); ++ unsigned int nregs = hard_regno_nregs (regno, mode); ++ if (nregs > 1) ++ per_register_size = exact_div (per_register_size, nregs); ++ if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC) ++ return maybe_gt (per_register_size, 16); ++ return maybe_gt (per_register_size, 8); ++ } ++ return false; + } + + /* Implement REGMODE_NATURAL_SIZE. 
*/ +@@ -1899,10 +2263,33 @@ emit_set_insn (rtx x, rtx y) + rtx + aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y) + { +- machine_mode mode = SELECT_CC_MODE (code, x, y); +- rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM); ++ machine_mode cmp_mode = GET_MODE (x); ++ machine_mode cc_mode; ++ rtx cc_reg; ++ ++ if (cmp_mode == TImode) ++ { ++ gcc_assert (code == NE); ++ ++ cc_mode = CCmode; ++ cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM); + +- emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y)); ++ rtx x_lo = operand_subword (x, 0, 0, TImode); ++ rtx y_lo = operand_subword (y, 0, 0, TImode); ++ emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo)); ++ ++ rtx x_hi = operand_subword (x, 1, 0, TImode); ++ rtx y_hi = operand_subword (y, 1, 0, TImode); ++ emit_insn (gen_ccmpdi (cc_reg, cc_reg, x_hi, y_hi, ++ gen_rtx_EQ (cc_mode, cc_reg, const0_rtx), ++ GEN_INT (AARCH64_EQ))); ++ } ++ else ++ { ++ cc_mode = SELECT_CC_MODE (code, x, y); ++ cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM); ++ emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y)); ++ } + return cc_reg; + } + +@@ -2466,7 +2853,36 @@ aarch64_zero_extend_const_eq (machine_mode xmode, rtx x, + gcc_assert (r != NULL); + return rtx_equal_p (x, r); + } +- ++ ++/* Return TARGET if it is nonnull and a register of mode MODE. ++ Otherwise, return a fresh register of mode MODE if we can, ++ or TARGET reinterpreted as MODE if we can't. */ ++ ++static rtx ++aarch64_target_reg (rtx target, machine_mode mode) ++{ ++ if (target && REG_P (target) && GET_MODE (target) == mode) ++ return target; ++ if (!can_create_pseudo_p ()) ++ { ++ gcc_assert (target); ++ return gen_lowpart (mode, target); ++ } ++ return gen_reg_rtx (mode); ++} ++ ++/* Return a register that contains the constant in BUILDER, given that ++ the constant is a legitimate move operand. Use TARGET as the register ++ if it is nonnull and convenient. */ ++ ++static rtx ++aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder) ++{ ++ rtx src = builder.build (); ++ target = aarch64_target_reg (target, GET_MODE (src)); ++ emit_insn (gen_rtx_SET (target, src)); ++ return target; ++} + + static rtx + aarch64_force_temporary (machine_mode mode, rtx x, rtx value) +@@ -2481,82 +2897,474 @@ aarch64_force_temporary (machine_mode mode, rtx x, rtx value) + } + } + +-/* Return true if we can move VALUE into a register using a single +- CNT[BHWD] instruction. */ ++/* Return true if predicate value X is a constant in which every element ++ is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI ++ value, i.e. as a predicate in which all bits are significant. */ + + static bool +-aarch64_sve_cnt_immediate_p (poly_int64 value) ++aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x) + { +- HOST_WIDE_INT factor = value.coeffs[0]; +- /* The coefficient must be [1, 16] * {2, 4, 8, 16}. 
*/ +- return (value.coeffs[1] == factor +- && IN_RANGE (factor, 2, 16 * 16) +- && (factor & 1) == 0 +- && factor <= 16 * (factor & -factor)); ++ if (GET_CODE (x) != CONST_VECTOR) ++ return false; ++ ++ unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode), ++ GET_MODE_NUNITS (GET_MODE (x))); ++ unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor; ++ unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x); ++ builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern); ++ ++ unsigned int nelts = const_vector_encoded_nelts (x); ++ for (unsigned int i = 0; i < nelts; ++i) ++ { ++ rtx elt = CONST_VECTOR_ENCODED_ELT (x, i); ++ if (!CONST_INT_P (elt)) ++ return false; ++ ++ builder.quick_push (elt); ++ for (unsigned int j = 1; j < factor; ++j) ++ builder.quick_push (const0_rtx); ++ } ++ builder.finalize (); ++ return true; + } + +-/* Likewise for rtx X. */ ++/* BUILDER contains a predicate constant of mode VNx16BI. Return the ++ widest predicate element size it can have (that is, the largest size ++ for which each element would still be 0 or 1). */ + +-bool +-aarch64_sve_cnt_immediate_p (rtx x) ++unsigned int ++aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder) + { +- poly_int64 value; +- return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value); ++ /* Start with the most optimistic assumption: that we only need ++ one bit per pattern. This is what we will use if only the first ++ bit in each pattern is ever set. */ ++ unsigned int mask = GET_MODE_SIZE (DImode); ++ mask |= builder.npatterns (); ++ ++ /* Look for set bits. */ ++ unsigned int nelts = builder.encoded_nelts (); ++ for (unsigned int i = 1; i < nelts; ++i) ++ if (INTVAL (builder.elt (i)) != 0) ++ { ++ if (i & 1) ++ return 1; ++ mask |= i; ++ } ++ return mask & -mask; + } + +-/* Return the asm string for an instruction with a CNT-like vector size +- operand (a vector pattern followed by a multiplier in the range [1, 16]). +- PREFIX is the mnemonic without the size suffix and OPERANDS is the +- first part of the operands template (the part that comes before the +- vector size itself). FACTOR is the number of quadwords. +- NELTS_PER_VQ, if nonzero, is the number of elements in each quadword. +- If it is zero, we can use any element size. */ ++/* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode, ++ return that predicate mode, otherwise return opt_machine_mode (). */ + +-static char * +-aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands, +- unsigned int factor, +- unsigned int nelts_per_vq) ++opt_machine_mode ++aarch64_ptrue_all_mode (rtx x) + { +- static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")]; ++ gcc_assert (GET_MODE (x) == VNx16BImode); ++ if (GET_CODE (x) != CONST_VECTOR ++ || !CONST_VECTOR_DUPLICATE_P (x) ++ || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0)) ++ || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0) ++ return opt_machine_mode (); + +- if (nelts_per_vq == 0) +- /* There is some overlap in the ranges of the four CNT instructions. +- Here we always use the smallest possible element size, so that the +- multiplier is 1 whereever possible. 
*/ +- nelts_per_vq = factor & -factor; +- int shift = std::min (exact_log2 (nelts_per_vq), 4); +- gcc_assert (IN_RANGE (shift, 1, 4)); +- char suffix = "dwhb"[shift - 1]; ++ unsigned int nelts = const_vector_encoded_nelts (x); ++ for (unsigned int i = 1; i < nelts; ++i) ++ if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx) ++ return opt_machine_mode (); + +- factor >>= shift; +- unsigned int written; +- if (factor == 1) +- written = snprintf (buffer, sizeof (buffer), "%s%c\t%s", +- prefix, suffix, operands); +- else +- written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d", +- prefix, suffix, operands, factor); +- gcc_assert (written < sizeof (buffer)); +- return buffer; ++ return aarch64_sve_pred_mode (nelts); + } + +-/* Return the asm string for an instruction with a CNT-like vector size +- operand (a vector pattern followed by a multiplier in the range [1, 16]). +- PREFIX is the mnemonic without the size suffix and OPERANDS is the +- first part of the operands template (the part that comes before the +- vector size itself). X is the value of the vector size operand, +- as a polynomial integer rtx. */ ++/* BUILDER is a predicate constant of mode VNx16BI. Consider the value ++ that the constant would have with predicate element size ELT_SIZE ++ (ignoring the upper bits in each element) and return: + +-char * ++ * -1 if all bits are set ++ * N if the predicate has N leading set bits followed by all clear bits ++ * 0 if the predicate does not have any of these forms. */ ++ ++int ++aarch64_partial_ptrue_length (rtx_vector_builder &builder, ++ unsigned int elt_size) ++{ ++ /* If nelts_per_pattern is 3, we have set bits followed by clear bits ++ followed by set bits. */ ++ if (builder.nelts_per_pattern () == 3) ++ return 0; ++ ++ /* Skip over leading set bits. */ ++ unsigned int nelts = builder.encoded_nelts (); ++ unsigned int i = 0; ++ for (; i < nelts; i += elt_size) ++ if (INTVAL (builder.elt (i)) == 0) ++ break; ++ unsigned int vl = i / elt_size; ++ ++ /* Check for the all-true case. */ ++ if (i == nelts) ++ return -1; ++ ++ /* If nelts_per_pattern is 1, then either VL is zero, or we have a ++ repeating pattern of set bits followed by clear bits. */ ++ if (builder.nelts_per_pattern () != 2) ++ return 0; ++ ++ /* We have a "foreground" value and a duplicated "background" value. ++ If the background might repeat and the last set bit belongs to it, ++ we might have set bits followed by clear bits followed by set bits. */ ++ if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ())) ++ return 0; ++ ++ /* Make sure that the rest are all clear. */ ++ for (; i < nelts; i += elt_size) ++ if (INTVAL (builder.elt (i)) != 0) ++ return 0; ++ ++ return vl; ++} ++ ++/* See if there is an svpattern that encodes an SVE predicate of mode ++ PRED_MODE in which the first VL bits are set and the rest are clear. ++ Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS. ++ A VL of -1 indicates an all-true vector. 
*/ ++ ++aarch64_svpattern ++aarch64_svpattern_for_vl (machine_mode pred_mode, int vl) ++{ ++ if (vl < 0) ++ return AARCH64_SV_ALL; ++ ++ if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode))) ++ return AARCH64_NUM_SVPATTERNS; ++ ++ if (vl >= 1 && vl <= 8) ++ return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1)); ++ ++ if (vl >= 16 && vl <= 256 && pow2p_hwi (vl)) ++ return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4)); ++ ++ int max_vl; ++ if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl)) ++ { ++ if (vl == (max_vl / 3) * 3) ++ return AARCH64_SV_MUL3; ++ /* These would only trigger for non-power-of-2 lengths. */ ++ if (vl == (max_vl & -4)) ++ return AARCH64_SV_MUL4; ++ if (vl == (1 << floor_log2 (max_vl))) ++ return AARCH64_SV_POW2; ++ if (vl == max_vl) ++ return AARCH64_SV_ALL; ++ } ++ return AARCH64_NUM_SVPATTERNS; ++} ++ ++/* Return a VNx16BImode constant in which every sequence of ELT_SIZE ++ bits has the lowest bit set and the upper bits clear. This is the ++ VNx16BImode equivalent of a PTRUE for controlling elements of ++ ELT_SIZE bytes. However, because the constant is VNx16BImode, ++ all bits are significant, even the upper zeros. */ ++ ++rtx ++aarch64_ptrue_all (unsigned int elt_size) ++{ ++ rtx_vector_builder builder (VNx16BImode, elt_size, 1); ++ builder.quick_push (const1_rtx); ++ for (unsigned int i = 1; i < elt_size; ++i) ++ builder.quick_push (const0_rtx); ++ return builder.build (); ++} ++ ++/* Return an all-true predicate register of mode MODE. */ ++ ++rtx ++aarch64_ptrue_reg (machine_mode mode) ++{ ++ gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL); ++ rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode)); ++ return gen_lowpart (mode, reg); ++} ++ ++/* Return an all-false predicate register of mode MODE. */ ++ ++rtx ++aarch64_pfalse_reg (machine_mode mode) ++{ ++ gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL); ++ rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode)); ++ return gen_lowpart (mode, reg); ++} ++ ++/* Return true if predicate PRED1[0] is true whenever predicate PRED2 is ++ true, or alternatively if we know that the operation predicated by ++ PRED1[0] is safe to perform whenever PRED2 is true. PRED1[1] is a ++ aarch64_sve_gp_strictness operand that describes the operation ++ predicated by PRED1[0]. */ ++ ++bool ++aarch64_sve_pred_dominates_p (rtx *pred1, rtx pred2) ++{ ++ machine_mode mode = GET_MODE (pred2); ++ gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL ++ && mode == GET_MODE (pred1[0]) ++ && aarch64_sve_gp_strictness (pred1[1], SImode)); ++ return (pred1[0] == CONSTM1_RTX (mode) ++ || INTVAL (pred1[1]) == SVE_RELAXED_GP ++ || rtx_equal_p (pred1[0], pred2)); ++} ++ ++/* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag ++ for it. PRED2[0] is the predicate for the instruction whose result ++ is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag ++ for it. Return true if we can prove that the two predicates are ++ equivalent for PTEST purposes; that is, if we can replace PRED2[0] ++ with PRED1[0] without changing behavior. 
*/ ++ ++bool ++aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2) ++{ ++ machine_mode mode = GET_MODE (pred1[0]); ++ gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL ++ && mode == GET_MODE (pred2[0]) ++ && aarch64_sve_ptrue_flag (pred1[1], SImode) ++ && aarch64_sve_ptrue_flag (pred2[1], SImode)); ++ ++ bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode) ++ || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE); ++ bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode) ++ || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE); ++ return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]); ++} ++ ++/* Emit a comparison CMP between OP0 and OP1, both of which have mode ++ DATA_MODE, and return the result in a predicate of mode PRED_MODE. ++ Use TARGET as the target register if nonnull and convenient. */ ++ ++static rtx ++aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp, ++ machine_mode data_mode, rtx op1, rtx op2) ++{ ++ insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode); ++ expand_operand ops[5]; ++ create_output_operand (&ops[0], target, pred_mode); ++ create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode); ++ create_integer_operand (&ops[2], SVE_KNOWN_PTRUE); ++ create_input_operand (&ops[3], op1, data_mode); ++ create_input_operand (&ops[4], op2, data_mode); ++ expand_insn (icode, 5, ops); ++ return ops[0].value; ++} ++ ++/* Use a comparison to convert integer vector SRC into MODE, which is ++ the corresponding SVE predicate mode. Use TARGET for the result ++ if it's nonnull and convenient. */ ++ ++rtx ++aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src) ++{ ++ machine_mode src_mode = GET_MODE (src); ++ return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode, ++ src, CONST0_RTX (src_mode)); ++} ++ ++/* Return the assembly token for svprfop value PRFOP. */ ++ ++static const char * ++svprfop_token (enum aarch64_svprfop prfop) ++{ ++ switch (prfop) ++ { ++#define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER; ++ AARCH64_FOR_SVPRFOP (CASE) ++#undef CASE ++ case AARCH64_NUM_SVPRFOPS: ++ break; ++ } ++ gcc_unreachable (); ++} ++ ++/* Return the assembly string for an SVE prefetch operation with ++ mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation ++ and that SUFFIX is the format for the remaining operands. */ ++ ++char * ++aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx, ++ const char *suffix) ++{ ++ static char buffer[128]; ++ aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx); ++ unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s", ++ mnemonic, svprfop_token (prfop), suffix); ++ gcc_assert (written < sizeof (buffer)); ++ return buffer; ++} ++ ++/* Check whether we can calculate the number of elements in PATTERN ++ at compile time, given that there are NELTS_PER_VQ elements per ++ 128-bit block. Return the value if so, otherwise return -1. */ ++ ++HOST_WIDE_INT ++aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq) ++{ ++ unsigned int vl, const_vg; ++ if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8) ++ vl = 1 + (pattern - AARCH64_SV_VL1); ++ else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256) ++ vl = 16 << (pattern - AARCH64_SV_VL16); ++ else if (aarch64_sve_vg.is_constant (&const_vg)) ++ { ++ /* There are two vector granules per quadword. 
*/ ++ unsigned int nelts = (const_vg / 2) * nelts_per_vq; ++ switch (pattern) ++ { ++ case AARCH64_SV_POW2: return 1 << floor_log2 (nelts); ++ case AARCH64_SV_MUL4: return nelts & -4; ++ case AARCH64_SV_MUL3: return (nelts / 3) * 3; ++ case AARCH64_SV_ALL: return nelts; ++ default: gcc_unreachable (); ++ } ++ } ++ else ++ return -1; ++ ++ /* There are two vector granules per quadword. */ ++ poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq; ++ if (known_le (vl, nelts_all)) ++ return vl; ++ ++ /* Requesting more elements than are available results in a PFALSE. */ ++ if (known_gt (vl, nelts_all)) ++ return 0; ++ ++ return -1; ++} ++ ++/* Return true if we can move VALUE into a register using a single ++ CNT[BHWD] instruction. */ ++ ++static bool ++aarch64_sve_cnt_immediate_p (poly_int64 value) ++{ ++ HOST_WIDE_INT factor = value.coeffs[0]; ++ /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */ ++ return (value.coeffs[1] == factor ++ && IN_RANGE (factor, 2, 16 * 16) ++ && (factor & 1) == 0 ++ && factor <= 16 * (factor & -factor)); ++} ++ ++/* Likewise for rtx X. */ ++ ++bool ++aarch64_sve_cnt_immediate_p (rtx x) ++{ ++ poly_int64 value; ++ return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value); ++} ++ ++/* Return the asm string for an instruction with a CNT-like vector size ++ operand (a vector pattern followed by a multiplier in the range [1, 16]). ++ PREFIX is the mnemonic without the size suffix and OPERANDS is the ++ first part of the operands template (the part that comes before the ++ vector size itself). PATTERN is the pattern to use. FACTOR is the ++ number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements ++ in each quadword. If it is zero, we can use any element size. */ ++ ++static char * ++aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands, ++ aarch64_svpattern pattern, ++ unsigned int factor, ++ unsigned int nelts_per_vq) ++{ ++ static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")]; ++ ++ if (nelts_per_vq == 0) ++ /* There is some overlap in the ranges of the four CNT instructions. ++ Here we always use the smallest possible element size, so that the ++ multiplier is 1 whereever possible. */ ++ nelts_per_vq = factor & -factor; ++ int shift = std::min (exact_log2 (nelts_per_vq), 4); ++ gcc_assert (IN_RANGE (shift, 1, 4)); ++ char suffix = "dwhb"[shift - 1]; ++ ++ factor >>= shift; ++ unsigned int written; ++ if (pattern == AARCH64_SV_ALL && factor == 1) ++ written = snprintf (buffer, sizeof (buffer), "%s%c\t%s", ++ prefix, suffix, operands); ++ else if (factor == 1) ++ written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s", ++ prefix, suffix, operands, svpattern_token (pattern)); ++ else ++ written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d", ++ prefix, suffix, operands, svpattern_token (pattern), ++ factor); ++ gcc_assert (written < sizeof (buffer)); ++ return buffer; ++} ++ ++/* Return the asm string for an instruction with a CNT-like vector size ++ operand (a vector pattern followed by a multiplier in the range [1, 16]). ++ PREFIX is the mnemonic without the size suffix and OPERANDS is the ++ first part of the operands template (the part that comes before the ++ vector size itself). X is the value of the vector size operand, ++ as a polynomial integer rtx; we need to convert this into an "all" ++ pattern with a multiplier. 
*/ ++ ++char * + aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands, + rtx x) + { + poly_int64 value = rtx_to_poly_int64 (x); + gcc_assert (aarch64_sve_cnt_immediate_p (value)); +- return aarch64_output_sve_cnt_immediate (prefix, operands, ++ return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL, + value.coeffs[1], 0); + } + ++/* Return the asm string for an instruction with a CNT-like vector size ++ operand (a vector pattern followed by a multiplier in the range [1, 16]). ++ PREFIX is the mnemonic without the size suffix and OPERANDS is the ++ first part of the operands template (the part that comes before the ++ vector size itself). CNT_PAT[0..2] are the operands of the ++ UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details. */ ++ ++char * ++aarch64_output_sve_cnt_pat_immediate (const char *prefix, ++ const char *operands, rtx *cnt_pat) ++{ ++ aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]); ++ unsigned int nelts_per_vq = INTVAL (cnt_pat[1]); ++ unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq; ++ return aarch64_output_sve_cnt_immediate (prefix, operands, pattern, ++ factor, nelts_per_vq); ++} ++ ++/* Return true if we can add X using a single SVE INC or DEC instruction. */ ++ ++bool ++aarch64_sve_scalar_inc_dec_immediate_p (rtx x) ++{ ++ poly_int64 value; ++ return (poly_int_rtx_p (x, &value) ++ && (aarch64_sve_cnt_immediate_p (value) ++ || aarch64_sve_cnt_immediate_p (-value))); ++} ++ ++/* Return the asm string for adding SVE INC/DEC immediate OFFSET to ++ operand 0. */ ++ ++char * ++aarch64_output_sve_scalar_inc_dec (rtx offset) ++{ ++ poly_int64 offset_value = rtx_to_poly_int64 (offset); ++ gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]); ++ if (offset_value.coeffs[1] > 0) ++ return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL, ++ offset_value.coeffs[1], 0); ++ else ++ return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL, ++ -offset_value.coeffs[1], 0); ++} ++ + /* Return true if we can add VALUE to a register using a single ADDVL + or ADDPL instruction. */ + +@@ -2582,27 +3390,16 @@ aarch64_sve_addvl_addpl_immediate_p (rtx x) + && aarch64_sve_addvl_addpl_immediate_p (value)); + } + +-/* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1 +- and storing the result in operand 0. */ ++/* Return the asm string for adding ADDVL or ADDPL immediate OFFSET ++ to operand 1 and storing the result in operand 0. */ + + char * +-aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset) ++aarch64_output_sve_addvl_addpl (rtx offset) + { + static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)]; + poly_int64 offset_value = rtx_to_poly_int64 (offset); + gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value)); + +- /* Use INC or DEC if possible. */ +- if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest))) +- { +- if (aarch64_sve_cnt_immediate_p (offset_value)) +- return aarch64_output_sve_cnt_immediate ("inc", "%x0", +- offset_value.coeffs[1], 0); +- if (aarch64_sve_cnt_immediate_p (-offset_value)) +- return aarch64_output_sve_cnt_immediate ("dec", "%x0", +- -offset_value.coeffs[1], 0); +- } +- + int factor = offset_value.coeffs[1]; + if ((factor & 15) == 0) + snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16); +@@ -2617,8 +3414,8 @@ aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset) + factor in *FACTOR_OUT (if nonnull). 
*/ + + bool +-aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out, +- unsigned int *nelts_per_vq_out) ++aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out, ++ unsigned int *nelts_per_vq_out) + { + rtx elt; + poly_int64 value; +@@ -2652,9 +3449,9 @@ aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out, + instruction. */ + + bool +-aarch64_sve_inc_dec_immediate_p (rtx x) ++aarch64_sve_vector_inc_dec_immediate_p (rtx x) + { +- return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL); ++ return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL); + } + + /* Return the asm template for an SVE vector INC or DEC instruction. +@@ -2662,18 +3459,18 @@ aarch64_sve_inc_dec_immediate_p (rtx x) + value of the vector count operand itself. */ + + char * +-aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x) ++aarch64_output_sve_vector_inc_dec (const char *operands, rtx x) + { + int factor; + unsigned int nelts_per_vq; +- if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq)) ++ if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq)) + gcc_unreachable (); + if (factor < 0) +- return aarch64_output_sve_cnt_immediate ("dec", operands, -factor, +- nelts_per_vq); ++ return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL, ++ -factor, nelts_per_vq); + else +- return aarch64_output_sve_cnt_immediate ("inc", operands, factor, +- nelts_per_vq); ++ return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL, ++ factor, nelts_per_vq); + } + + static int +@@ -3056,20 +3853,36 @@ aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src, + } + else + { +- /* Use CNTD, then multiply it by FACTOR. */ +- val = gen_int_mode (poly_int64 (2, 2), mode); ++ /* Base the factor on LOW_BIT if we can calculate LOW_BIT ++ directly, since that should increase the chances of being ++ able to use a shift and add sequence. If LOW_BIT itself ++ is out of range, just use CNTD. */ ++ if (low_bit <= 16 * 8) ++ factor /= low_bit; ++ else ++ low_bit = 1; ++ ++ val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode); + val = aarch64_force_temporary (mode, temp1, val); + +- /* Go back to using a negative multiplication factor if we have +- no register from which to subtract. */ +- if (code == MINUS && src == const0_rtx) ++ if (can_create_pseudo_p ()) ++ { ++ rtx coeff1 = gen_int_mode (factor, mode); ++ val = expand_mult (mode, val, coeff1, NULL_RTX, false, true); ++ } ++ else + { +- factor = -factor; +- code = PLUS; ++ /* Go back to using a negative multiplication factor if we have ++ no register from which to subtract. */ ++ if (code == MINUS && src == const0_rtx) ++ { ++ factor = -factor; ++ code = PLUS; ++ } ++ rtx coeff1 = gen_int_mode (factor, mode); ++ coeff1 = aarch64_force_temporary (mode, temp2, coeff1); ++ val = gen_rtx_MULT (mode, val, coeff1); + } +- rtx coeff1 = gen_int_mode (factor, mode); +- coeff1 = aarch64_force_temporary (mode, temp2, coeff1); +- val = gen_rtx_MULT (mode, val, coeff1); + } + + if (shift > 0) +@@ -3176,32 +3989,55 @@ aarch64_expand_vec_series (rtx dest, rtx base, rtx step) + emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step)); + } + +-/* Try to duplicate SRC into SVE register DEST, given that SRC is an +- integer of mode INT_MODE. Return true on success. */ ++/* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE ++ register of mode MODE. Use TARGET for the result if it's nonnull ++ and convenient. 
+ +-static bool +-aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode, +- rtx src) +-{ +- /* If the constant is smaller than 128 bits, we can do the move +- using a vector of SRC_MODEs. */ +- if (src_mode != TImode) +- { +- poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)), +- GET_MODE_SIZE (src_mode)); +- machine_mode dup_mode = mode_for_vector (src_mode, count).require (); +- emit_move_insn (gen_lowpart (dup_mode, dest), +- gen_const_vec_duplicate (dup_mode, src)); +- return true; ++ The two vector modes must have the same element mode. The behavior ++ is to duplicate architectural lane N of SRC into architectural lanes ++ N + I * STEP of the result. On big-endian targets, architectural ++ lane 0 of an Advanced SIMD vector is the last element of the vector ++ in memory layout, so for big-endian targets this operation has the ++ effect of reversing SRC before duplicating it. Callers need to ++ account for this. */ ++ ++rtx ++aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src) ++{ ++ machine_mode src_mode = GET_MODE (src); ++ gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode)); ++ insn_code icode = (BYTES_BIG_ENDIAN ++ ? code_for_aarch64_vec_duplicate_vq_be (mode) ++ : code_for_aarch64_vec_duplicate_vq_le (mode)); ++ ++ unsigned int i = 0; ++ expand_operand ops[3]; ++ create_output_operand (&ops[i++], target, mode); ++ create_output_operand (&ops[i++], src, src_mode); ++ if (BYTES_BIG_ENDIAN) ++ { ++ /* Create a PARALLEL describing the reversal of SRC. */ ++ unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode); ++ rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq, ++ nelts_per_vq - 1, -1); ++ create_fixed_operand (&ops[i++], sel); + } ++ expand_insn (icode, i, ops); ++ return ops[0].value; ++} ++ ++/* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch ++ the memory image into DEST. Return true on success. */ + +- /* Use LD1RQ[BHWD] to load the 128 bits from memory. */ +- src = force_const_mem (src_mode, src); ++static bool ++aarch64_expand_sve_ld1rq (rtx dest, rtx src) ++{ ++ src = force_const_mem (GET_MODE (src), src); + if (!src) + return false; + + /* Make sure that the address is legitimate. */ +- if (!aarch64_sve_ld1r_operand_p (src)) ++ if (!aarch64_sve_ld1rq_operand_p (src)) + { + rtx addr = force_reg (Pmode, XEXP (src, 0)); + src = replace_equiv_address (src, addr); +@@ -3210,47 +4046,128 @@ aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode, + machine_mode mode = GET_MODE (dest); + unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode); + machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require (); +- rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode)); +- src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ); +- emit_insn (gen_rtx_SET (dest, src)); ++ rtx ptrue = aarch64_ptrue_reg (pred_mode); ++ emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue)); + return true; + } + +-/* Expand a move of general CONST_VECTOR SRC into DEST, given that it +- isn't a simple duplicate or series. */ ++/* Return a register containing CONST_VECTOR SRC, given that SRC has an ++ SVE data mode and isn't a legitimate constant. Use TARGET for the ++ result if convenient. + +-static void +-aarch64_expand_sve_const_vector (rtx dest, rtx src) ++ The returned register can have whatever mode seems most natural ++ given the contents of SRC. 
*/ ++ ++static rtx ++aarch64_expand_sve_const_vector (rtx target, rtx src) + { + machine_mode mode = GET_MODE (src); + unsigned int npatterns = CONST_VECTOR_NPATTERNS (src); + unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src); +- gcc_assert (npatterns > 1); ++ scalar_mode elt_mode = GET_MODE_INNER (mode); ++ unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode); ++ unsigned int encoded_bits = npatterns * nelts_per_pattern * elt_bits; ++ ++ if (nelts_per_pattern == 1 && encoded_bits == 128) ++ { ++ /* The constant is a duplicated quadword but can't be narrowed ++ beyond a quadword. Get the memory image of the first quadword ++ as a 128-bit vector and try using LD1RQ to load it from memory. ++ ++ The effect for both endiannesses is to load memory lane N into ++ architectural lanes N + I * STEP of the result. On big-endian ++ targets, the layout of the 128-bit vector in an Advanced SIMD ++ register would be different from its layout in an SVE register, ++ but this 128-bit vector is a memory value only. */ ++ machine_mode vq_mode = aarch64_vq_mode (elt_mode).require (); ++ rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0); ++ if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value)) ++ return target; ++ } ++ ++ if (nelts_per_pattern == 1 && encoded_bits < 128) ++ { ++ /* The vector is a repeating sequence of 64 bits or fewer. ++ See if we can load them using an Advanced SIMD move and then ++ duplicate it to fill a vector. This is better than using a GPR ++ move because it keeps everything in the same register file. */ ++ machine_mode vq_mode = aarch64_vq_mode (elt_mode).require (); ++ rtx_vector_builder builder (vq_mode, npatterns, 1); ++ for (unsigned int i = 0; i < npatterns; ++i) ++ { ++ /* We want memory lane N to go into architectural lane N, ++ so reverse for big-endian targets. The DUP .Q pattern ++ has a compensating reverse built-in. */ ++ unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i; ++ builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci)); ++ } ++ rtx vq_src = builder.build (); ++ if (aarch64_simd_valid_immediate (vq_src, NULL)) ++ { ++ vq_src = force_reg (vq_mode, vq_src); ++ return aarch64_expand_sve_dupq (target, mode, vq_src); ++ } + +- if (nelts_per_pattern == 1) +- { +- /* The constant is a repeating seqeuence of at least two elements, +- where the repeating elements occupy no more than 128 bits. +- Get an integer representation of the replicated value. */ +- scalar_int_mode int_mode; +- if (BYTES_BIG_ENDIAN) +- /* For now, always use LD1RQ to load the value on big-endian +- targets, since the handling of smaller integers includes a +- subreg that is semantically an element reverse. */ +- int_mode = TImode; +- else ++ /* Get an integer representation of the repeating part of Advanced ++ SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC, ++ which for big-endian targets is lane-swapped wrt a normal ++ Advanced SIMD vector. This means that for both endiannesses, ++ memory lane N of SVE vector SRC corresponds to architectural ++ lane N of a register holding VQ_SRC. This in turn means that ++ memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed ++ as a single 128-bit value) and thus that memory lane 0 of SRC is ++ in the lsb of the integer. Duplicating the integer therefore ++ ensures that memory lane N of SRC goes into architectural lane ++ N + I * INDEX of the SVE register. 
*/ ++ scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require (); ++ rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0); ++ if (elt_value) + { +- unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns; +- gcc_assert (int_bits <= 128); +- int_mode = int_mode_for_size (int_bits, 0).require (); ++ /* Pretend that we had a vector of INT_MODE to start with. */ ++ elt_mode = int_mode; ++ mode = aarch64_full_sve_mode (int_mode).require (); ++ ++ /* If the integer can be moved into a general register by a ++ single instruction, do that and duplicate the result. */ ++ if (CONST_INT_P (elt_value) ++ && aarch64_move_imm (INTVAL (elt_value), elt_mode)) ++ { ++ elt_value = force_reg (elt_mode, elt_value); ++ return expand_vector_broadcast (mode, elt_value); ++ } ++ } ++ else if (npatterns == 1) ++ /* We're duplicating a single value, but can't do better than ++ force it to memory and load from there. This handles things ++ like symbolic constants. */ ++ elt_value = CONST_VECTOR_ENCODED_ELT (src, 0); ++ ++ if (elt_value) ++ { ++ /* Load the element from memory if we can, otherwise move it into ++ a register and use a DUP. */ ++ rtx op = force_const_mem (elt_mode, elt_value); ++ if (!op) ++ op = force_reg (elt_mode, elt_value); ++ return expand_vector_broadcast (mode, op); + } +- rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0); +- if (int_value +- && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value)) +- return; + } + ++ /* Try using INDEX. */ ++ rtx base, step; ++ if (const_vec_series_p (src, &base, &step)) ++ { ++ aarch64_expand_vec_series (target, base, step); ++ return target; ++ } ++ ++ /* From here on, it's better to force the whole constant to memory ++ if we can. */ ++ if (GET_MODE_NUNITS (mode).is_constant ()) ++ return NULL_RTX; ++ + /* Expand each pattern individually. */ ++ gcc_assert (npatterns > 1); + rtx_vector_builder builder; + auto_vec vectors (npatterns); + for (unsigned int i = 0; i < npatterns; ++i) +@@ -3267,22 +4184,263 @@ aarch64_expand_sve_const_vector (rtx dest, rtx src) + npatterns /= 2; + for (unsigned int i = 0; i < npatterns; ++i) + { +- rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode)); ++ rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode)); + rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]); + emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1)); + vectors[i] = tmp; + } + } +- gcc_assert (vectors[0] == dest); ++ gcc_assert (vectors[0] == target); ++ return target; ++} ++ ++/* Use WHILE to set a predicate register of mode MODE in which the first ++ VL bits are set and the rest are clear. Use TARGET for the register ++ if it's nonnull and convenient. */ ++ ++static rtx ++aarch64_sve_move_pred_via_while (rtx target, machine_mode mode, ++ unsigned int vl) ++{ ++ rtx limit = force_reg (DImode, gen_int_mode (vl, DImode)); ++ target = aarch64_target_reg (target, mode); ++ emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode, ++ target, const0_rtx, limit)); ++ return target; ++} ++ ++static rtx ++aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool); ++ ++/* BUILDER is a constant predicate in which the index of every set bit ++ is a multiple of ELT_SIZE (which is <= 8). Try to load the constant ++ by inverting every element at a multiple of ELT_SIZE and EORing the ++ result with an ELT_SIZE PTRUE. ++ ++ Return a register that contains the constant on success, otherwise ++ return null. Use TARGET as the register if it is nonnull and ++ convenient. 
*/ ++ ++static rtx ++aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder, ++ unsigned int elt_size) ++{ ++ /* Invert every element at a multiple of ELT_SIZE, keeping the ++ other bits zero. */ ++ rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (), ++ builder.nelts_per_pattern ()); ++ for (unsigned int i = 0; i < builder.encoded_nelts (); ++i) ++ if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0) ++ inv_builder.quick_push (const1_rtx); ++ else ++ inv_builder.quick_push (const0_rtx); ++ inv_builder.finalize (); ++ ++ /* See if we can load the constant cheaply. */ ++ rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false); ++ if (!inv) ++ return NULL_RTX; ++ ++ /* EOR the result with an ELT_SIZE PTRUE. */ ++ rtx mask = aarch64_ptrue_all (elt_size); ++ mask = force_reg (VNx16BImode, mask); ++ target = aarch64_target_reg (target, VNx16BImode); ++ emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask)); ++ return target; ++} ++ ++/* BUILDER is a constant predicate in which the index of every set bit ++ is a multiple of ELT_SIZE (which is <= 8). Try to load the constant ++ using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the ++ register on success, otherwise return null. Use TARGET as the register ++ if nonnull and convenient. */ ++ ++static rtx ++aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder, ++ unsigned int elt_size, ++ unsigned int permute_size) ++{ ++ /* We're going to split the constant into two new constants A and B, ++ with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0 ++ and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1: ++ ++ A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ } ++ B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ } ++ ++ where _ indicates elements that will be discarded by the permute. ++ ++ First calculate the ELT_SIZEs for A and B. */ ++ unsigned int a_elt_size = GET_MODE_SIZE (DImode); ++ unsigned int b_elt_size = GET_MODE_SIZE (DImode); ++ for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size) ++ if (INTVAL (builder.elt (i)) != 0) ++ { ++ if (i & permute_size) ++ b_elt_size |= i - permute_size; ++ else ++ a_elt_size |= i; ++ } ++ a_elt_size &= -a_elt_size; ++ b_elt_size &= -b_elt_size; ++ ++ /* Now construct the vectors themselves. */ ++ rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (), ++ builder.nelts_per_pattern ()); ++ rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (), ++ builder.nelts_per_pattern ()); ++ unsigned int nelts = builder.encoded_nelts (); ++ for (unsigned int i = 0; i < nelts; ++i) ++ if (i & (elt_size - 1)) ++ { ++ a_builder.quick_push (const0_rtx); ++ b_builder.quick_push (const0_rtx); ++ } ++ else if ((i & permute_size) == 0) ++ { ++ /* The A and B elements are significant. */ ++ a_builder.quick_push (builder.elt (i)); ++ b_builder.quick_push (builder.elt (i + permute_size)); ++ } ++ else ++ { ++ /* The A and B elements are going to be discarded, so pick whatever ++ is likely to give a nice constant. We are targeting element ++ sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively, ++ with the aim of each being a sequence of ones followed by ++ a sequence of zeros. So: ++ ++ * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to ++ duplicate the last X_ELT_SIZE element, to extend the ++ current sequence of ones or zeros. 
++ ++ * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a ++ zero, so that the constant really does have X_ELT_SIZE and ++ not a smaller size. */ ++ if (a_elt_size > permute_size) ++ a_builder.quick_push (const0_rtx); ++ else ++ a_builder.quick_push (a_builder.elt (i - a_elt_size)); ++ if (b_elt_size > permute_size) ++ b_builder.quick_push (const0_rtx); ++ else ++ b_builder.quick_push (b_builder.elt (i - b_elt_size)); ++ } ++ a_builder.finalize (); ++ b_builder.finalize (); ++ ++ /* Try loading A into a register. */ ++ rtx_insn *last = get_last_insn (); ++ rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false); ++ if (!a) ++ return NULL_RTX; ++ ++ /* Try loading B into a register. */ ++ rtx b = a; ++ if (a_builder != b_builder) ++ { ++ b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false); ++ if (!b) ++ { ++ delete_insns_since (last); ++ return NULL_RTX; ++ } ++ } ++ ++ /* Emit the TRN1 itself. */ ++ machine_mode mode = aarch64_sve_pred_mode (permute_size).require (); ++ target = aarch64_target_reg (target, mode); ++ emit_insn (gen_aarch64_sve (UNSPEC_TRN1, mode, target, ++ gen_lowpart (mode, a), ++ gen_lowpart (mode, b))); ++ return target; ++} ++ ++/* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI ++ constant in BUILDER into an SVE predicate register. Return the register ++ on success, otherwise return null. Use TARGET for the register if ++ nonnull and convenient. ++ ++ ALLOW_RECURSE_P is true if we can use methods that would call this ++ function recursively. */ ++ ++static rtx ++aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder, ++ bool allow_recurse_p) ++{ ++ if (builder.encoded_nelts () == 1) ++ /* A PFALSE or a PTRUE .B ALL. */ ++ return aarch64_emit_set_immediate (target, builder); ++ ++ unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder); ++ if (int vl = aarch64_partial_ptrue_length (builder, elt_size)) ++ { ++ /* If we can load the constant using PTRUE, use it as-is. */ ++ machine_mode mode = aarch64_sve_pred_mode (elt_size).require (); ++ if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS) ++ return aarch64_emit_set_immediate (target, builder); ++ ++ /* Otherwise use WHILE to set the first VL bits. */ ++ return aarch64_sve_move_pred_via_while (target, mode, vl); ++ } ++ ++ if (!allow_recurse_p) ++ return NULL_RTX; ++ ++ /* Try inverting the vector in element size ELT_SIZE and then EORing ++ the result with an ELT_SIZE PTRUE. */ ++ if (INTVAL (builder.elt (0)) == 0) ++ if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder, ++ elt_size)) ++ return res; ++ ++ /* Try using TRN1 to permute two simpler constants. */ ++ for (unsigned int i = elt_size; i <= 8; i *= 2) ++ if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder, ++ elt_size, i)) ++ return res; ++ ++ return NULL_RTX; + } + +-/* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE +- is a pattern that can be used to set DEST to a replicated scalar +- element. */ ++/* Return an SVE predicate register that contains the VNx16BImode ++ constant in BUILDER, without going through the move expanders. ++ ++ The returned register can have whatever mode seems most natural ++ given the contents of BUILDER. Use TARGET for the result if ++ convenient. */ ++ ++static rtx ++aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder) ++{ ++ /* Try loading the constant using pure predicate operations. 
*/ ++ if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true)) ++ return res; ++ ++ /* Try forcing the constant to memory. */ ++ if (builder.full_nelts ().is_constant ()) ++ if (rtx mem = force_const_mem (VNx16BImode, builder.build ())) ++ { ++ target = aarch64_target_reg (target, VNx16BImode); ++ emit_move_insn (target, mem); ++ return target; ++ } ++ ++ /* The last resort is to load the constant as an integer and then ++ compare it against zero. Use -1 for set bits in order to increase ++ the changes of using SVE DUPM or an Advanced SIMD byte mask. */ ++ rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (), ++ builder.nelts_per_pattern ()); ++ for (unsigned int i = 0; i < builder.encoded_nelts (); ++i) ++ int_builder.quick_push (INTVAL (builder.elt (i)) ++ ? constm1_rtx : const0_rtx); ++ return aarch64_convert_sve_data_to_pred (target, VNx16BImode, ++ int_builder.build ()); ++} ++ ++/* Set DEST to immediate IMM. */ + + void +-aarch64_expand_mov_immediate (rtx dest, rtx imm, +- rtx (*gen_vec_duplicate) (rtx, rtx)) ++aarch64_expand_mov_immediate (rtx dest, rtx imm) + { + machine_mode mode = GET_MODE (dest); + +@@ -3405,38 +4563,50 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm, + + if (!CONST_INT_P (imm)) + { +- rtx base, step, value; +- if (GET_CODE (imm) == HIGH +- || aarch64_simd_valid_immediate (imm, NULL)) +- emit_insn (gen_rtx_SET (dest, imm)); +- else if (const_vec_series_p (imm, &base, &step)) +- aarch64_expand_vec_series (dest, base, step); +- else if (const_vec_duplicate_p (imm, &value)) ++ if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL) + { +- /* If the constant is out of range of an SVE vector move, +- load it from memory if we can, otherwise move it into +- a register and use a DUP. */ +- scalar_mode inner_mode = GET_MODE_INNER (mode); +- rtx op = force_const_mem (inner_mode, value); +- if (!op) +- op = force_reg (inner_mode, value); +- else if (!aarch64_sve_ld1r_operand_p (op)) ++ /* Only the low bit of each .H, .S and .D element is defined, ++ so we can set the upper bits to whatever we like. If the ++ predicate is all-true in MODE, prefer to set all the undefined ++ bits as well, so that we can share a single .B predicate for ++ all modes. */ ++ if (imm == CONSTM1_RTX (mode)) ++ imm = CONSTM1_RTX (VNx16BImode); ++ ++ /* All methods for constructing predicate modes wider than VNx16BI ++ will set the upper bits of each element to zero. Expose this ++ by moving such constants as a VNx16BI, so that all bits are ++ significant and so that constants for different modes can be ++ shared. The wider constant will still be available as a ++ REG_EQUAL note. 
*/ ++ rtx_vector_builder builder; ++ if (aarch64_get_sve_pred_bits (builder, imm)) + { +- rtx addr = force_reg (Pmode, XEXP (op, 0)); +- op = replace_equiv_address (op, addr); ++ rtx res = aarch64_expand_sve_const_pred (dest, builder); ++ if (dest != res) ++ emit_move_insn (dest, gen_lowpart (mode, res)); ++ return; + } +- emit_insn (gen_vec_duplicate (dest, op)); + } +- else if (GET_CODE (imm) == CONST_VECTOR +- && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ()) +- aarch64_expand_sve_const_vector (dest, imm); +- else ++ ++ if (GET_CODE (imm) == HIGH ++ || aarch64_simd_valid_immediate (imm, NULL)) + { +- rtx mem = force_const_mem (mode, imm); +- gcc_assert (mem); +- emit_move_insn (dest, mem); ++ emit_insn (gen_rtx_SET (dest, imm)); ++ return; + } + ++ if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode)) ++ if (rtx res = aarch64_expand_sve_const_vector (dest, imm)) ++ { ++ if (dest != res) ++ emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res)); ++ return; ++ } ++ ++ rtx mem = force_const_mem (mode, imm); ++ gcc_assert (mem); ++ emit_move_insn (dest, mem); + return; + } + +@@ -3455,6 +4625,7 @@ aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src) + create_output_operand (&ops[0], dest, mode); + create_input_operand (&ops[1], pred, GET_MODE(pred)); + create_input_operand (&ops[2], src, mode); ++ temporary_volatile_ok v (true); + expand_insn (code_for_aarch64_pred_mov (mode), 3, ops); + } + +@@ -3471,7 +4642,7 @@ void + aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode) + { + machine_mode mode = GET_MODE (dest); +- rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode)); ++ rtx ptrue = aarch64_ptrue_reg (pred_mode); + if (!register_operand (src, mode) + && !register_operand (dest, mode)) + { +@@ -3535,7 +4706,7 @@ aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src) + return false; + + /* Generate *aarch64_sve_mov_subreg_be. */ +- rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode)); ++ rtx ptrue = aarch64_ptrue_reg (VNx16BImode); + rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src), + UNSPEC_REV_SUBREG); + emit_insn (gen_rtx_SET (dest, unspec)); +@@ -3557,14 +4728,29 @@ aarch64_replace_reg_mode (rtx x, machine_mode mode) + return x; + } + ++/* Return the SVE REV[BHW] unspec for reversing quantites of mode MODE ++ stored in wider integer containers. */ ++ ++static unsigned int ++aarch64_sve_rev_unspec (machine_mode mode) ++{ ++ switch (GET_MODE_UNIT_SIZE (mode)) ++ { ++ case 1: return UNSPEC_REVB; ++ case 2: return UNSPEC_REVH; ++ case 4: return UNSPEC_REVW; ++ } ++ gcc_unreachable (); ++} ++ + /* Split a *aarch64_sve_mov_subreg_be pattern with the given + operands. */ + + void + aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src) + { +- /* Decide which REV operation we need. The mode with narrower elements +- determines the mode of the operands and the mode with the wider ++ /* Decide which REV operation we need. The mode with wider elements ++ determines the mode of the operands and the mode with the narrower + elements determines the reverse width. 
*/ + machine_mode mode_with_wider_elts = GET_MODE (dest); + machine_mode mode_with_narrower_elts = GET_MODE (src); +@@ -3572,38 +4758,22 @@ aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src) + < GET_MODE_UNIT_SIZE (mode_with_narrower_elts)) + std::swap (mode_with_wider_elts, mode_with_narrower_elts); + ++ unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts); + unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts); +- unsigned int unspec; +- if (wider_bytes == 8) +- unspec = UNSPEC_REV64; +- else if (wider_bytes == 4) +- unspec = UNSPEC_REV32; +- else if (wider_bytes == 2) +- unspec = UNSPEC_REV16; +- else +- gcc_unreachable (); + machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require (); + +- /* Emit: +- +- (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV)] +- UNSPEC_MERGE_PTRUE)) +- +- with the appropriate modes. */ ++ /* Get the operands in the appropriate modes and emit the instruction. */ + ptrue = gen_lowpart (pred_mode, ptrue); +- dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts); +- src = aarch64_replace_reg_mode (src, mode_with_narrower_elts); +- src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec); +- src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src), +- UNSPEC_MERGE_PTRUE); +- emit_insn (gen_rtx_SET (dest, src)); ++ dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts); ++ src = aarch64_replace_reg_mode (src, mode_with_wider_elts); ++ emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts, ++ dest, ptrue, src)); + } + + static bool +-aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED, +- tree exp ATTRIBUTE_UNUSED) ++aarch64_function_ok_for_sibcall (tree, tree exp) + { +- if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl)) ++ if (crtl->abi->id () != expr_callee_abi (exp).id ()) + return false; + + return true; +@@ -3612,35 +4782,48 @@ aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED, + /* Implement TARGET_PASS_BY_REFERENCE. */ + + static bool +-aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED, +- machine_mode mode, +- const_tree type, +- bool named ATTRIBUTE_UNUSED) ++aarch64_pass_by_reference (cumulative_args_t pcum_v, ++ const function_arg_info &arg) + { ++ CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v); + HOST_WIDE_INT size; + machine_mode dummymode; + int nregs; + ++ unsigned int num_zr, num_pr; ++ if (arg.type && aarch64_sve_argument_p (arg.type, &num_zr, &num_pr)) ++ { ++ if (pcum && !pcum->silent_p && !TARGET_SVE) ++ /* We can't gracefully recover at this point, so make this a ++ fatal error. */ ++ fatal_error (input_location, "arguments of type %qT require" ++ " the SVE ISA extension", arg.type); ++ ++ /* Variadic SVE types are passed by reference. Normal non-variadic ++ arguments are too if we've run out of registers. */ ++ return (!arg.named ++ || pcum->aapcs_nvrn + num_zr > NUM_FP_ARG_REGS ++ || pcum->aapcs_nprn + num_pr > NUM_PR_ARG_REGS); ++ } ++ + /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */ +- if (mode == BLKmode && type) +- size = int_size_in_bytes (type); ++ if (arg.mode == BLKmode && arg.type) ++ size = int_size_in_bytes (arg.type); + else + /* No frontends can create types with variable-sized modes, so we + shouldn't be asked to pass or return them. */ +- size = GET_MODE_SIZE (mode).to_constant (); ++ size = GET_MODE_SIZE (arg.mode).to_constant (); + + /* Aggregates are passed by reference based on their size. 
*/ +- if (type && AGGREGATE_TYPE_P (type)) +- { +- size = int_size_in_bytes (type); +- } ++ if (arg.aggregate_type_p ()) ++ size = int_size_in_bytes (arg.type); + + /* Variable sized arguments are always returned by reference. */ + if (size < 0) + return true; + + /* Can this be a candidate to be passed in fp/simd register(s)? */ +- if (aarch64_vfp_is_call_or_return_candidate (mode, type, ++ if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type, + &dummymode, &nregs, + NULL)) + return false; +@@ -3696,6 +4879,29 @@ aarch64_function_value (const_tree type, const_tree func, + if (INTEGRAL_TYPE_P (type)) + mode = promote_function_mode (type, mode, &unsignedp, func, 1); + ++ unsigned int num_zr, num_pr; ++ if (type && aarch64_sve_argument_p (type, &num_zr, &num_pr)) ++ { ++ /* Don't raise an error here if we're called when SVE is disabled, ++ since this is really just a query function. Other code must ++ do that where appropriate. */ ++ mode = TYPE_MODE_RAW (type); ++ gcc_assert (VECTOR_MODE_P (mode) ++ && (!TARGET_SVE || aarch64_sve_mode_p (mode))); ++ ++ if (num_zr > 0 && num_pr == 0) ++ return gen_rtx_REG (mode, V0_REGNUM); ++ ++ if (num_zr == 0 && num_pr == 1) ++ return gen_rtx_REG (mode, P0_REGNUM); ++ ++ gcc_unreachable (); ++ } ++ ++ /* Generic vectors that map to SVE modes with -msve-vector-bits=N are ++ returned in memory, not by value. */ ++ gcc_assert (!aarch64_sve_mode_p (mode)); ++ + if (aarch64_return_in_msb (type)) + { + HOST_WIDE_INT size = int_size_in_bytes (type); +@@ -3778,6 +4984,16 @@ aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED) + /* Simple scalar types always returned in registers. */ + return false; + ++ unsigned int num_zr, num_pr; ++ if (type && aarch64_sve_argument_p (type, &num_zr, &num_pr)) ++ { ++ /* All SVE types we support fit in registers. For example, it isn't ++ yet possible to define an aggregate of 9+ SVE vectors or 5+ SVE ++ predicates. */ ++ gcc_assert (num_zr <= NUM_FP_ARG_REGS && num_pr <= NUM_PR_ARG_REGS); ++ return false; ++ } ++ + if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), + type, + &ag_mode, +@@ -3853,11 +5069,11 @@ aarch64_function_arg_alignment (machine_mode mode, const_tree type, + numbers refer to the rule numbers in the AAPCS64. */ + + static void +-aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode, +- const_tree type, +- bool named ATTRIBUTE_UNUSED) ++aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg) + { + CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v); ++ tree type = arg.type; ++ machine_mode mode = arg.mode; + int ncrn, nvrn, nregs; + bool allocate_ncrn, allocate_nvrn; + HOST_WIDE_INT size; +@@ -3869,6 +5085,46 @@ aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode, + + pcum->aapcs_arg_processed = true; + ++ unsigned int num_zr, num_pr; ++ if (type && aarch64_sve_argument_p (type, &num_zr, &num_pr)) ++ { ++ /* The PCS says that it is invalid to pass an SVE value to an ++ unprototyped function. There is no ABI-defined location we ++ can return in this case, so we have no real choice but to raise ++ an error immediately, even though this is only a query function. */ ++ if (arg.named && pcum->pcs_variant != ARM_PCS_SVE) ++ { ++ gcc_assert (!pcum->silent_p); ++ error ("SVE type %qT cannot be passed to an unprototyped function", ++ arg.type); ++ /* Avoid repeating the message, and avoid tripping the assert ++ below. 
*/ ++ pcum->pcs_variant = ARM_PCS_SVE; ++ } ++ ++ /* We would have converted the argument into pass-by-reference ++ form if it didn't fit in registers. */ ++ pcum->aapcs_nextnvrn = pcum->aapcs_nvrn + num_zr; ++ pcum->aapcs_nextnprn = pcum->aapcs_nprn + num_pr; ++ gcc_assert (arg.named ++ && pcum->pcs_variant == ARM_PCS_SVE ++ && aarch64_sve_mode_p (mode) ++ && pcum->aapcs_nextnvrn <= NUM_FP_ARG_REGS ++ && pcum->aapcs_nextnprn <= NUM_PR_ARG_REGS); ++ ++ if (num_zr > 0 && num_pr == 0) ++ pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + pcum->aapcs_nvrn); ++ else if (num_zr == 0 && num_pr == 1) ++ pcum->aapcs_reg = gen_rtx_REG (mode, P0_REGNUM + pcum->aapcs_nprn); ++ else ++ gcc_unreachable (); ++ return; ++ } ++ ++ /* Generic vectors that map to SVE modes with -msve-vector-bits=N are ++ passed by reference, not by value. */ ++ gcc_assert (!aarch64_sve_mode_p (mode)); ++ + /* Size in bytes, rounded to the nearest multiple of 8 bytes. */ + if (type) + size = int_size_in_bytes (type); +@@ -3893,7 +5149,7 @@ aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode, + and homogenous short-vector aggregates (HVA). */ + if (allocate_nvrn) + { +- if (!TARGET_FLOAT) ++ if (!pcum->silent_p && !TARGET_FLOAT) + aarch64_err_no_fpadvsimd (mode); + + if (nvrn + nregs <= NUM_FP_ARG_REGS) +@@ -4009,37 +5265,46 @@ on_stack: + /* Implement TARGET_FUNCTION_ARG. */ + + static rtx +-aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode, +- const_tree type, bool named) ++aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg) + { + CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v); +- gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64); ++ gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64 ++ || pcum->pcs_variant == ARM_PCS_SIMD ++ || pcum->pcs_variant == ARM_PCS_SVE); + +- if (mode == VOIDmode) +- return NULL_RTX; ++ if (arg.end_marker_p ()) ++ return gen_int_mode (pcum->pcs_variant, DImode); + +- aarch64_layout_arg (pcum_v, mode, type, named); ++ aarch64_layout_arg (pcum_v, arg); + return pcum->aapcs_reg; + } + + void + aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum, +- const_tree fntype ATTRIBUTE_UNUSED, +- rtx libname ATTRIBUTE_UNUSED, +- const_tree fndecl ATTRIBUTE_UNUSED, +- unsigned n_named ATTRIBUTE_UNUSED) ++ const_tree fntype, ++ rtx libname ATTRIBUTE_UNUSED, ++ const_tree fndecl ATTRIBUTE_UNUSED, ++ unsigned n_named ATTRIBUTE_UNUSED, ++ bool silent_p) + { + pcum->aapcs_ncrn = 0; + pcum->aapcs_nvrn = 0; ++ pcum->aapcs_nprn = 0; + pcum->aapcs_nextncrn = 0; + pcum->aapcs_nextnvrn = 0; +- pcum->pcs_variant = ARM_PCS_AAPCS64; ++ pcum->aapcs_nextnprn = 0; ++ if (fntype) ++ pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id (); ++ else ++ pcum->pcs_variant = ARM_PCS_AAPCS64; + pcum->aapcs_reg = NULL_RTX; + pcum->aapcs_arg_processed = false; + pcum->aapcs_stack_words = 0; + pcum->aapcs_stack_size = 0; ++ pcum->silent_p = silent_p; + +- if (!TARGET_FLOAT ++ if (!silent_p ++ && !TARGET_FLOAT + && fndecl && TREE_PUBLIC (fndecl) + && fntype && fntype != error_mark_node) + { +@@ -4050,24 +5315,38 @@ aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum, + &mode, &nregs, NULL)) + aarch64_err_no_fpadvsimd (TYPE_MODE (type)); + } +- return; ++ ++ if (!silent_p ++ && !TARGET_SVE ++ && pcum->pcs_variant == ARM_PCS_SVE) ++ { ++ /* We can't gracefully recover at this point, so make this a ++ fatal error. 
*/ ++ if (fndecl) ++ fatal_error (input_location, "%qE requires the SVE ISA extension", ++ fndecl); ++ else ++ fatal_error (input_location, "calls to functions of type %qT require" ++ " the SVE ISA extension", fntype); ++ } + } + + static void + aarch64_function_arg_advance (cumulative_args_t pcum_v, +- machine_mode mode, +- const_tree type, +- bool named) ++ const function_arg_info &arg) + { + CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v); +- if (pcum->pcs_variant == ARM_PCS_AAPCS64) ++ if (pcum->pcs_variant == ARM_PCS_AAPCS64 ++ || pcum->pcs_variant == ARM_PCS_SIMD ++ || pcum->pcs_variant == ARM_PCS_SVE) + { +- aarch64_layout_arg (pcum_v, mode, type, named); ++ aarch64_layout_arg (pcum_v, arg); + gcc_assert ((pcum->aapcs_reg != NULL_RTX) + != (pcum->aapcs_stack_words != 0)); + pcum->aapcs_arg_processed = false; + pcum->aapcs_ncrn = pcum->aapcs_nextncrn; + pcum->aapcs_nvrn = pcum->aapcs_nextnvrn; ++ pcum->aapcs_nprn = pcum->aapcs_nextnprn; + pcum->aapcs_stack_size += pcum->aapcs_stack_words; + pcum->aapcs_stack_words = 0; + pcum->aapcs_reg = NULL_RTX; +@@ -4500,11 +5779,14 @@ aarch64_needs_frame_chain (void) + static void + aarch64_layout_frame (void) + { +- HOST_WIDE_INT offset = 0; ++ poly_int64 offset = 0; + int regno, last_fp_reg = INVALID_REGNUM; +- bool simd_function = aarch64_simd_decl_p (cfun->decl); ++ machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM); ++ poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode); ++ bool frame_related_fp_reg_p = false; ++ aarch64_frame &frame = cfun->machine->frame; + +- cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain (); ++ frame.emit_frame_chain = aarch64_needs_frame_chain (); + + /* Adjust the outgoing arguments size if required. Keep it in sync with what + the mid-end is doing. */ +@@ -4513,184 +5795,264 @@ aarch64_layout_frame (void) + #define SLOT_NOT_REQUIRED (-2) + #define SLOT_REQUIRED (-1) + +- cfun->machine->frame.wb_candidate1 = INVALID_REGNUM; +- cfun->machine->frame.wb_candidate2 = INVALID_REGNUM; +- +- /* If this is a non-leaf simd function with calls we assume that +- at least one of those calls is to a non-simd function and thus +- we must save V8 to V23 in the prologue. */ +- +- if (simd_function && !crtl->is_leaf) +- { +- for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++) +- if (FP_SIMD_SAVED_REGNUM_P (regno)) +- df_set_regs_ever_live (regno, true); +- } ++ frame.wb_candidate1 = INVALID_REGNUM; ++ frame.wb_candidate2 = INVALID_REGNUM; ++ frame.spare_pred_reg = INVALID_REGNUM; + + /* First mark all the registers that really need to be saved... */ +- for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++) +- cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED; +- +- for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++) +- cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED; ++ for (regno = 0; regno <= LAST_SAVED_REGNUM; regno++) ++ frame.reg_offset[regno] = SLOT_NOT_REQUIRED; + + /* ... that includes the eh data registers (if needed)... */ + if (crtl->calls_eh_return) + for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++) +- cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] +- = SLOT_REQUIRED; ++ frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] = SLOT_REQUIRED; + + /* ... and any callee saved register that dataflow says is live. 
*/ + for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++) + if (df_regs_ever_live_p (regno) ++ && !fixed_regs[regno] + && (regno == R30_REGNUM +- || !call_used_regs[regno])) +- cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED; ++ || !crtl->abi->clobbers_full_reg_p (regno))) ++ frame.reg_offset[regno] = SLOT_REQUIRED; + + for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++) + if (df_regs_ever_live_p (regno) +- && (!call_used_regs[regno] +- || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))) ++ && !fixed_regs[regno] ++ && !crtl->abi->clobbers_full_reg_p (regno)) + { +- cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED; ++ frame.reg_offset[regno] = SLOT_REQUIRED; + last_fp_reg = regno; ++ if (aarch64_emit_cfi_for_reg_p (regno)) ++ frame_related_fp_reg_p = true; + } + +- if (cfun->machine->frame.emit_frame_chain) +- { +- /* FP and LR are placed in the linkage record. */ +- cfun->machine->frame.reg_offset[R29_REGNUM] = 0; +- cfun->machine->frame.wb_candidate1 = R29_REGNUM; +- cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD; +- cfun->machine->frame.wb_candidate2 = R30_REGNUM; +- offset = 2 * UNITS_PER_WORD; ++ /* Big-endian SVE frames need a spare predicate register in order ++ to save Z8-Z15. Decide which register they should use. Prefer ++ an unused argument register if possible, so that we don't force P4 ++ to be saved unnecessarily. */ ++ if (frame_related_fp_reg_p ++ && crtl->abi->id () == ARM_PCS_SVE ++ && BYTES_BIG_ENDIAN) ++ { ++ bitmap live1 = df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun)); ++ bitmap live2 = df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun)); ++ for (regno = P0_REGNUM; regno <= P7_REGNUM; regno++) ++ if (!bitmap_bit_p (live1, regno) && !bitmap_bit_p (live2, regno)) ++ break; ++ gcc_assert (regno <= P7_REGNUM); ++ frame.spare_pred_reg = regno; ++ df_set_regs_ever_live (regno, true); + } + ++ for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++) ++ if (df_regs_ever_live_p (regno) ++ && !fixed_regs[regno] ++ && !crtl->abi->clobbers_full_reg_p (regno)) ++ frame.reg_offset[regno] = SLOT_REQUIRED; ++ + /* With stack-clash, LR must be saved in non-leaf functions. */ + gcc_assert (crtl->is_leaf +- || (cfun->machine->frame.reg_offset[R30_REGNUM] +- != SLOT_NOT_REQUIRED)); ++ || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED)); ++ ++ /* Now assign stack slots for the registers. Start with the predicate ++ registers, since predicate LDR and STR have a relatively small ++ offset range. These saves happen below the hard frame pointer. */ ++ for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++) ++ if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED)) ++ { ++ frame.reg_offset[regno] = offset; ++ offset += BYTES_PER_SVE_PRED; ++ } ++ ++ /* We save a maximum of 8 predicate registers, and since vector ++ registers are 8 times the size of a predicate register, all the ++ saved predicates fit within a single vector. Doing this also ++ rounds the offset to a 128-bit boundary. */ ++ if (maybe_ne (offset, 0)) ++ { ++ gcc_assert (known_le (offset, vector_save_size)); ++ offset = vector_save_size; ++ } ++ ++ /* If we need to save any SVE vector registers, add them next. */ ++ if (last_fp_reg != (int) INVALID_REGNUM && crtl->abi->id () == ARM_PCS_SVE) ++ for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++) ++ if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED)) ++ { ++ frame.reg_offset[regno] = offset; ++ offset += vector_save_size; ++ } ++ ++ /* OFFSET is now the offset of the hard frame pointer from the bottom ++ of the callee save area. 
*/ ++ bool saves_below_hard_fp_p = maybe_ne (offset, 0); ++ frame.below_hard_fp_saved_regs_size = offset; ++ if (frame.emit_frame_chain) ++ { ++ /* FP and LR are placed in the linkage record. */ ++ frame.reg_offset[R29_REGNUM] = offset; ++ frame.wb_candidate1 = R29_REGNUM; ++ frame.reg_offset[R30_REGNUM] = offset + UNITS_PER_WORD; ++ frame.wb_candidate2 = R30_REGNUM; ++ offset += 2 * UNITS_PER_WORD; ++ } + +- /* Now assign stack slots for them. */ + for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++) +- if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED) ++ if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED)) + { +- cfun->machine->frame.reg_offset[regno] = offset; +- if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM) +- cfun->machine->frame.wb_candidate1 = regno; +- else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM) +- cfun->machine->frame.wb_candidate2 = regno; ++ frame.reg_offset[regno] = offset; ++ if (frame.wb_candidate1 == INVALID_REGNUM) ++ frame.wb_candidate1 = regno; ++ else if (frame.wb_candidate2 == INVALID_REGNUM) ++ frame.wb_candidate2 = regno; + offset += UNITS_PER_WORD; + } + +- HOST_WIDE_INT max_int_offset = offset; +- offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT); +- bool has_align_gap = offset != max_int_offset; ++ poly_int64 max_int_offset = offset; ++ offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT); ++ bool has_align_gap = maybe_ne (offset, max_int_offset); + + for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++) +- if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED) ++ if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED)) + { + /* If there is an alignment gap between integer and fp callee-saves, + allocate the last fp register to it if possible. */ + if (regno == last_fp_reg + && has_align_gap +- && !simd_function +- && (offset & 8) == 0) ++ && known_eq (vector_save_size, 8) ++ && multiple_p (offset, 16)) + { +- cfun->machine->frame.reg_offset[regno] = max_int_offset; ++ frame.reg_offset[regno] = max_int_offset; + break; + } + +- cfun->machine->frame.reg_offset[regno] = offset; +- if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM) +- cfun->machine->frame.wb_candidate1 = regno; +- else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM +- && cfun->machine->frame.wb_candidate1 >= V0_REGNUM) +- cfun->machine->frame.wb_candidate2 = regno; +- offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD; ++ frame.reg_offset[regno] = offset; ++ if (frame.wb_candidate1 == INVALID_REGNUM) ++ frame.wb_candidate1 = regno; ++ else if (frame.wb_candidate2 == INVALID_REGNUM ++ && frame.wb_candidate1 >= V0_REGNUM) ++ frame.wb_candidate2 = regno; ++ offset += vector_save_size; + } + +- offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT); ++ offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT); + +- cfun->machine->frame.saved_regs_size = offset; ++ frame.saved_regs_size = offset; + +- HOST_WIDE_INT varargs_and_saved_regs_size +- = offset + cfun->machine->frame.saved_varargs_size; ++ poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size; + +- cfun->machine->frame.hard_fp_offset ++ poly_int64 above_outgoing_args + = aligned_upper_bound (varargs_and_saved_regs_size + + get_frame_size (), + STACK_BOUNDARY / BITS_PER_UNIT); + ++ frame.hard_fp_offset ++ = above_outgoing_args - frame.below_hard_fp_saved_regs_size; ++ + /* Both these values are already aligned. 
*/ + gcc_assert (multiple_p (crtl->outgoing_args_size, + STACK_BOUNDARY / BITS_PER_UNIT)); +- cfun->machine->frame.frame_size +- = (cfun->machine->frame.hard_fp_offset +- + crtl->outgoing_args_size); ++ frame.frame_size = above_outgoing_args + crtl->outgoing_args_size; + +- cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size; ++ frame.locals_offset = frame.saved_varargs_size; + +- cfun->machine->frame.initial_adjust = 0; +- cfun->machine->frame.final_adjust = 0; +- cfun->machine->frame.callee_adjust = 0; +- cfun->machine->frame.callee_offset = 0; ++ frame.initial_adjust = 0; ++ frame.final_adjust = 0; ++ frame.callee_adjust = 0; ++ frame.sve_callee_adjust = 0; ++ frame.callee_offset = 0; + + HOST_WIDE_INT max_push_offset = 0; +- if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM) ++ if (frame.wb_candidate2 != INVALID_REGNUM) + max_push_offset = 512; +- else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM) ++ else if (frame.wb_candidate1 != INVALID_REGNUM) + max_push_offset = 256; + +- HOST_WIDE_INT const_size, const_fp_offset; +- if (cfun->machine->frame.frame_size.is_constant (&const_size) ++ HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset; ++ HOST_WIDE_INT const_saved_regs_size; ++ if (frame.frame_size.is_constant (&const_size) + && const_size < max_push_offset +- && known_eq (crtl->outgoing_args_size, 0)) ++ && known_eq (frame.hard_fp_offset, const_size)) + { + /* Simple, small frame with no outgoing arguments: ++ + stp reg1, reg2, [sp, -frame_size]! + stp reg3, reg4, [sp, 16] */ +- cfun->machine->frame.callee_adjust = const_size; +- } +- else if (known_lt (crtl->outgoing_args_size +- + cfun->machine->frame.saved_regs_size, 512) ++ frame.callee_adjust = const_size; ++ } ++ else if (crtl->outgoing_args_size.is_constant (&const_outgoing_args_size) ++ && frame.saved_regs_size.is_constant (&const_saved_regs_size) ++ && const_outgoing_args_size + const_saved_regs_size < 512 ++ /* We could handle this case even with outgoing args, provided ++ that the number of args left us with valid offsets for all ++ predicate and vector save slots. It's such a rare case that ++ it hardly seems worth the effort though. 
*/ ++ && (!saves_below_hard_fp_p || const_outgoing_args_size == 0) + && !(cfun->calls_alloca +- && known_lt (cfun->machine->frame.hard_fp_offset, +- max_push_offset))) ++ && frame.hard_fp_offset.is_constant (&const_fp_offset) ++ && const_fp_offset < max_push_offset)) + { + /* Frame with small outgoing arguments: ++ + sub sp, sp, frame_size + stp reg1, reg2, [sp, outgoing_args_size] + stp reg3, reg4, [sp, outgoing_args_size + 16] */ +- cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size; +- cfun->machine->frame.callee_offset +- = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset; ++ frame.initial_adjust = frame.frame_size; ++ frame.callee_offset = const_outgoing_args_size; + } +- else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset) ++ else if (saves_below_hard_fp_p ++ && known_eq (frame.saved_regs_size, ++ frame.below_hard_fp_saved_regs_size)) ++ { ++ /* Frame in which all saves are SVE saves: ++ ++ sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size ++ save SVE registers relative to SP ++ sub sp, sp, outgoing_args_size */ ++ frame.initial_adjust = (frame.hard_fp_offset ++ + frame.below_hard_fp_saved_regs_size); ++ frame.final_adjust = crtl->outgoing_args_size; ++ } ++ else if (frame.hard_fp_offset.is_constant (&const_fp_offset) + && const_fp_offset < max_push_offset) + { +- /* Frame with large outgoing arguments but a small local area: ++ /* Frame with large outgoing arguments or SVE saves, but with ++ a small local area: ++ + stp reg1, reg2, [sp, -hard_fp_offset]! + stp reg3, reg4, [sp, 16] ++ [sub sp, sp, below_hard_fp_saved_regs_size] ++ [save SVE registers relative to SP] + sub sp, sp, outgoing_args_size */ +- cfun->machine->frame.callee_adjust = const_fp_offset; +- cfun->machine->frame.final_adjust +- = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust; ++ frame.callee_adjust = const_fp_offset; ++ frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size; ++ frame.final_adjust = crtl->outgoing_args_size; + } + else + { +- /* Frame with large local area and outgoing arguments using frame pointer: ++ /* Frame with large local area and outgoing arguments or SVE saves, ++ using frame pointer: ++ + sub sp, sp, hard_fp_offset + stp x29, x30, [sp, 0] + add x29, sp, 0 + stp reg3, reg4, [sp, 16] ++ [sub sp, sp, below_hard_fp_saved_regs_size] ++ [save SVE registers relative to SP] + sub sp, sp, outgoing_args_size */ +- cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset; +- cfun->machine->frame.final_adjust +- = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust; ++ frame.initial_adjust = frame.hard_fp_offset; ++ frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size; ++ frame.final_adjust = crtl->outgoing_args_size; + } + +- cfun->machine->frame.laid_out = true; ++ /* Make sure the individual adjustments add up to the full frame size. 
*/ ++ gcc_assert (known_eq (frame.initial_adjust ++ + frame.callee_adjust ++ + frame.sve_callee_adjust ++ + frame.final_adjust, frame.frame_size)); ++ ++ frame.laid_out = true; + } + + /* Return true if the register REGNO is saved on entry to +@@ -4699,7 +6061,7 @@ aarch64_layout_frame (void) + static bool + aarch64_register_saved_on_entry (int regno) + { +- return cfun->machine->frame.reg_offset[regno] >= 0; ++ return known_ge (cfun->machine->frame.reg_offset[regno], 0); + } + + /* Return the next register up from REGNO up to LIMIT for the callee +@@ -4766,7 +6128,7 @@ static void + aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment) + { + rtx_insn *insn; +- machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1); ++ machine_mode mode = aarch64_reg_save_mode (regno1); + + if (regno2 == INVALID_REGNUM) + return aarch64_pushwb_single_reg (mode, regno1, adjustment); +@@ -4812,7 +6174,7 @@ static void + aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment, + rtx *cfi_ops) + { +- machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1); ++ machine_mode mode = aarch64_reg_save_mode (regno1); + rtx reg1 = gen_rtx_REG (mode, regno1); + + *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops); +@@ -4888,10 +6250,10 @@ aarch64_return_address_signing_enabled (void) + gcc_assert (cfun->machine->frame.laid_out); + + /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function +- if it's LR is pushed onto stack. */ ++ if its LR is pushed onto stack. */ + return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL + || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF +- && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0)); ++ && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0))); + } + + /* Return TRUE if Branch Target Identification Mechanism is enabled. */ +@@ -4901,17 +6263,75 @@ aarch64_bti_enabled (void) + return (aarch64_enable_bti == 1); + } + ++/* The caller is going to use ST1D or LD1D to save or restore an SVE ++ register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in ++ the range [1, 16] * GET_MODE_SIZE (MODE). Prepare for this by: ++ ++ (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D ++ or LD1D address ++ ++ (2) setting PRED to a valid predicate register for the ST1D or LD1D, ++ if the variable isn't already nonnull ++ ++ (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE). ++ Handle this case using a temporary base register that is suitable for ++ all offsets in that range. Use ANCHOR_REG as this base register if it ++ is nonnull, otherwise create a new register and store it in ANCHOR_REG. */ ++ ++static inline void ++aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx, ++ rtx &anchor_reg, poly_int64 &offset, ++ rtx &ptrue) ++{ ++ if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode))) ++ { ++ /* This is the maximum valid offset of the anchor from the base. ++ Lower values would be valid too. 
*/ ++ poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode); ++ if (!anchor_reg) ++ { ++ anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM); ++ emit_insn (gen_add3_insn (anchor_reg, base_rtx, ++ gen_int_mode (anchor_offset, Pmode))); ++ } ++ base_rtx = anchor_reg; ++ offset -= anchor_offset; ++ } ++ if (!ptrue) ++ { ++ int pred_reg = cfun->machine->frame.spare_pred_reg; ++ emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg), ++ CONSTM1_RTX (VNx16BImode)); ++ ptrue = gen_rtx_REG (VNx2BImode, pred_reg); ++ } ++} ++ ++/* Add a REG_CFA_EXPRESSION note to INSN to say that register REG ++ is saved at BASE + OFFSET. */ ++ ++static void ++aarch64_add_cfa_expression (rtx_insn *insn, rtx reg, ++ rtx base, poly_int64 offset) ++{ ++ rtx mem = gen_frame_mem (GET_MODE (reg), ++ plus_constant (Pmode, base, offset)); ++ add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg)); ++} ++ + /* Emit code to save the callee-saved registers from register number START + to LIMIT to the stack at the location starting at offset START_OFFSET, +- skipping any write-back candidates if SKIP_WB is true. */ ++ skipping any write-back candidates if SKIP_WB is true. HARD_FP_VALID_P ++ is true if the hard frame pointer has been set up. */ + + static void +-aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset, +- unsigned start, unsigned limit, bool skip_wb) ++aarch64_save_callee_saves (poly_int64 start_offset, ++ unsigned start, unsigned limit, bool skip_wb, ++ bool hard_fp_valid_p) + { + rtx_insn *insn; + unsigned regno; + unsigned regno2; ++ rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX; + + for (regno = aarch64_next_callee_save (start, limit); + regno <= limit; +@@ -4919,7 +6339,7 @@ aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset, + { + rtx reg, mem; + poly_int64 offset; +- int offset_diff; ++ bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno); + + if (skip_wb + && (regno == cfun->machine->frame.wb_candidate1 +@@ -4927,27 +6347,53 @@ aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset, + continue; + + if (cfun->machine->reg_is_wrapped_separately[regno]) +- continue; ++ continue; + ++ machine_mode mode = aarch64_reg_save_mode (regno); + reg = gen_rtx_REG (mode, regno); + offset = start_offset + cfun->machine->frame.reg_offset[regno]; +- mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx, +- offset)); ++ rtx base_rtx = stack_pointer_rtx; ++ poly_int64 sp_offset = offset; + +- regno2 = aarch64_next_callee_save (regno + 1, limit); +- offset_diff = cfun->machine->frame.reg_offset[regno2] +- - cfun->machine->frame.reg_offset[regno]; ++ HOST_WIDE_INT const_offset; ++ if (mode == VNx2DImode && BYTES_BIG_ENDIAN) ++ aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg, ++ offset, ptrue); ++ else if (GP_REGNUM_P (regno) ++ && (!offset.is_constant (&const_offset) || const_offset >= 512)) ++ { ++ gcc_assert (known_eq (start_offset, 0)); ++ poly_int64 fp_offset ++ = cfun->machine->frame.below_hard_fp_saved_regs_size; ++ if (hard_fp_valid_p) ++ base_rtx = hard_frame_pointer_rtx; ++ else ++ { ++ if (!anchor_reg) ++ { ++ anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM); ++ emit_insn (gen_add3_insn (anchor_reg, base_rtx, ++ gen_int_mode (fp_offset, Pmode))); ++ } ++ base_rtx = anchor_reg; ++ } ++ offset -= fp_offset; ++ } ++ mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset)); ++ bool need_cfa_note_p = (base_rtx != stack_pointer_rtx); + +- if (regno2 <= limit ++ if (!aarch64_sve_mode_p (mode) ++ && (regno2 = 
aarch64_next_callee_save (regno + 1, limit)) <= limit + && !cfun->machine->reg_is_wrapped_separately[regno2] +- && known_eq (GET_MODE_SIZE (mode), offset_diff)) ++ && known_eq (GET_MODE_SIZE (mode), ++ cfun->machine->frame.reg_offset[regno2] ++ - cfun->machine->frame.reg_offset[regno])) + { + rtx reg2 = gen_rtx_REG (mode, regno2); + rtx mem2; + +- offset = start_offset + cfun->machine->frame.reg_offset[regno2]; +- mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx, +- offset)); ++ offset += GET_MODE_SIZE (mode); ++ mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset)); + insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, + reg2)); + +@@ -4955,71 +6401,96 @@ aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset, + always assumed to be relevant to the frame + calculations; subsequent parts, are only + frame-related if explicitly marked. */ +- RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1; ++ if (aarch64_emit_cfi_for_reg_p (regno2)) ++ { ++ if (need_cfa_note_p) ++ aarch64_add_cfa_expression (insn, reg2, stack_pointer_rtx, ++ sp_offset + GET_MODE_SIZE (mode)); ++ else ++ RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1; ++ } ++ + regno = regno2; + } ++ else if (mode == VNx2DImode && BYTES_BIG_ENDIAN) ++ { ++ insn = emit_insn (gen_aarch64_pred_mov (mode, mem, ptrue, reg)); ++ need_cfa_note_p = true; ++ } ++ else if (aarch64_sve_mode_p (mode)) ++ insn = emit_insn (gen_rtx_SET (mem, reg)); + else + insn = emit_move_insn (mem, reg); + +- RTX_FRAME_RELATED_P (insn) = 1; ++ RTX_FRAME_RELATED_P (insn) = frame_related_p; ++ if (frame_related_p && need_cfa_note_p) ++ aarch64_add_cfa_expression (insn, reg, stack_pointer_rtx, sp_offset); + } + } + +-/* Emit code to restore the callee registers of mode MODE from register +- number START up to and including LIMIT. Restore from the stack offset +- START_OFFSET, skipping any write-back candidates if SKIP_WB is true. +- Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */ ++/* Emit code to restore the callee registers from register number START ++ up to and including LIMIT. Restore from the stack offset START_OFFSET, ++ skipping any write-back candidates if SKIP_WB is true. Write the ++ appropriate REG_CFA_RESTORE notes into CFI_OPS. 
*/ + + static void +-aarch64_restore_callee_saves (machine_mode mode, +- poly_int64 start_offset, unsigned start, ++aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start, + unsigned limit, bool skip_wb, rtx *cfi_ops) + { +- rtx base_rtx = stack_pointer_rtx; + unsigned regno; + unsigned regno2; + poly_int64 offset; ++ rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX; + + for (regno = aarch64_next_callee_save (start, limit); + regno <= limit; + regno = aarch64_next_callee_save (regno + 1, limit)) + { ++ bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno); + if (cfun->machine->reg_is_wrapped_separately[regno]) +- continue; ++ continue; + + rtx reg, mem; +- int offset_diff; + + if (skip_wb + && (regno == cfun->machine->frame.wb_candidate1 + || regno == cfun->machine->frame.wb_candidate2)) + continue; + ++ machine_mode mode = aarch64_reg_save_mode (regno); + reg = gen_rtx_REG (mode, regno); + offset = start_offset + cfun->machine->frame.reg_offset[regno]; ++ rtx base_rtx = stack_pointer_rtx; ++ if (mode == VNx2DImode && BYTES_BIG_ENDIAN) ++ aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg, ++ offset, ptrue); + mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset)); + +- regno2 = aarch64_next_callee_save (regno + 1, limit); +- offset_diff = cfun->machine->frame.reg_offset[regno2] +- - cfun->machine->frame.reg_offset[regno]; +- +- if (regno2 <= limit ++ if (!aarch64_sve_mode_p (mode) ++ && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit + && !cfun->machine->reg_is_wrapped_separately[regno2] +- && known_eq (GET_MODE_SIZE (mode), offset_diff)) ++ && known_eq (GET_MODE_SIZE (mode), ++ cfun->machine->frame.reg_offset[regno2] ++ - cfun->machine->frame.reg_offset[regno])) + { + rtx reg2 = gen_rtx_REG (mode, regno2); + rtx mem2; + +- offset = start_offset + cfun->machine->frame.reg_offset[regno2]; ++ offset += GET_MODE_SIZE (mode); + mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset)); + emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2)); + + *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops); + regno = regno2; + } ++ else if (mode == VNx2DImode && BYTES_BIG_ENDIAN) ++ emit_insn (gen_aarch64_pred_mov (mode, reg, ptrue, mem)); ++ else if (aarch64_sve_mode_p (mode)) ++ emit_insn (gen_rtx_SET (reg, mem)); + else + emit_move_insn (reg, mem); +- *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops); ++ if (frame_related_p) ++ *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops); + } + } + +@@ -5101,13 +6572,35 @@ aarch64_get_separate_components (void) + for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++) + if (aarch64_register_saved_on_entry (regno)) + { ++ /* Punt on saves and restores that use ST1D and LD1D. We could ++ try to be smarter, but it would involve making sure that the ++ spare predicate register itself is safe to use at the save ++ and restore points. Also, when a frame pointer is being used, ++ the slots are often out of reach of ST1D and LD1D anyway. */ ++ machine_mode mode = aarch64_reg_save_mode (regno); ++ if (mode == VNx2DImode && BYTES_BIG_ENDIAN) ++ continue; ++ + poly_int64 offset = cfun->machine->frame.reg_offset[regno]; +- if (!frame_pointer_needed) +- offset += cfun->machine->frame.frame_size +- - cfun->machine->frame.hard_fp_offset; ++ ++ /* If the register is saved in the first SVE save slot, we use ++ it as a stack probe for -fstack-clash-protection. 
*/ ++ if (flag_stack_clash_protection ++ && maybe_ne (cfun->machine->frame.below_hard_fp_saved_regs_size, 0) ++ && known_eq (offset, 0)) ++ continue; ++ ++ /* Get the offset relative to the register we'll use. */ ++ if (frame_pointer_needed) ++ offset -= cfun->machine->frame.below_hard_fp_saved_regs_size; ++ else ++ offset += crtl->outgoing_args_size; ++ + /* Check that we can access the stack slot of the register with one + direct load with no adjustments needed. */ +- if (offset_12bit_unsigned_scaled_p (DImode, offset)) ++ if (aarch64_sve_mode_p (mode) ++ ? offset_9bit_signed_scaled_p (mode, offset) ++ : offset_12bit_unsigned_scaled_p (mode, offset)) + bitmap_set_bit (components, regno); + } + +@@ -5115,6 +6608,12 @@ aarch64_get_separate_components (void) + if (frame_pointer_needed) + bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM); + ++ /* If the spare predicate register used by big-endian SVE code ++ is call-preserved, it must be saved in the main prologue ++ before any saves that use it. */ ++ if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM) ++ bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg); ++ + unsigned reg1 = cfun->machine->frame.wb_candidate1; + unsigned reg2 = cfun->machine->frame.wb_candidate2; + /* If registers have been chosen to be stored/restored with +@@ -5139,31 +6638,48 @@ aarch64_components_for_bb (basic_block bb) + bitmap in = DF_LIVE_IN (bb); + bitmap gen = &DF_LIVE_BB_INFO (bb)->gen; + bitmap kill = &DF_LIVE_BB_INFO (bb)->kill; +- bool simd_function = aarch64_simd_decl_p (cfun->decl); + + sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1); + bitmap_clear (components); + ++ /* Clobbered registers don't generate values in any meaningful sense, ++ since nothing after the clobber can rely on their value. And we can't ++ say that partially-clobbered registers are unconditionally killed, ++ because whether they're killed or not depends on the mode of the ++ value they're holding. Thus partially call-clobbered registers ++ appear in neither the kill set nor the gen set. ++ ++ Check manually for any calls that clobber more of a register than the ++ current function can. */ ++ function_abi_aggregator callee_abis; ++ rtx_insn *insn; ++ FOR_BB_INSNS (bb, insn) ++ if (CALL_P (insn)) ++ callee_abis.note_callee_abi (insn_callee_abi (insn)); ++ HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi); ++ + /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */ + for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++) +- if ((!call_used_regs[regno] +- || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))) +- && (bitmap_bit_p (in, regno) +- || bitmap_bit_p (gen, regno) +- || bitmap_bit_p (kill, regno))) ++ if (!fixed_regs[regno] ++ && !crtl->abi->clobbers_full_reg_p (regno) ++ && (TEST_HARD_REG_BIT (extra_caller_saves, regno) ++ || bitmap_bit_p (in, regno) ++ || bitmap_bit_p (gen, regno) ++ || bitmap_bit_p (kill, regno))) + { +- unsigned regno2, offset, offset2; + bitmap_set_bit (components, regno); + + /* If there is a callee-save at an adjacent offset, add it too + to increase the use of LDP/STP. */ +- offset = cfun->machine->frame.reg_offset[regno]; +- regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1; ++ poly_int64 offset = cfun->machine->frame.reg_offset[regno]; ++ unsigned regno2 = multiple_p (offset, 16) ? 
regno + 1 : regno - 1; + + if (regno2 <= LAST_SAVED_REGNUM) + { +- offset2 = cfun->machine->frame.reg_offset[regno2]; +- if ((offset & ~8) == (offset2 & ~8)) ++ poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2]; ++ if (regno < regno2 ++ ? known_eq (offset + 8, offset2) ++ : multiple_p (offset2, 16) && known_eq (offset2 + 8, offset)) + bitmap_set_bit (components, regno2); + } + } +@@ -5218,16 +6734,16 @@ aarch64_process_components (sbitmap components, bool prologue_p) + + while (regno != last_regno) + { +- /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved +- so DFmode for the vector registers is enough. For simd functions +- we want to save the low 128 bits. */ +- machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno); ++ bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno); ++ machine_mode mode = aarch64_reg_save_mode (regno); + + rtx reg = gen_rtx_REG (mode, regno); + poly_int64 offset = cfun->machine->frame.reg_offset[regno]; +- if (!frame_pointer_needed) +- offset += cfun->machine->frame.frame_size +- - cfun->machine->frame.hard_fp_offset; ++ if (frame_pointer_needed) ++ offset -= cfun->machine->frame.below_hard_fp_saved_regs_size; ++ else ++ offset += crtl->outgoing_args_size; ++ + rtx addr = plus_constant (Pmode, ptr_reg, offset); + rtx mem = gen_frame_mem (mode, addr); + +@@ -5238,39 +6754,49 @@ aarch64_process_components (sbitmap components, bool prologue_p) + if (regno2 == last_regno) + { + insn = emit_insn (set); +- RTX_FRAME_RELATED_P (insn) = 1; +- if (prologue_p) +- add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set)); +- else +- add_reg_note (insn, REG_CFA_RESTORE, reg); ++ if (frame_related_p) ++ { ++ RTX_FRAME_RELATED_P (insn) = 1; ++ if (prologue_p) ++ add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set)); ++ else ++ add_reg_note (insn, REG_CFA_RESTORE, reg); ++ } + break; + } + + poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2]; + /* The next register is not of the same class or its offset is not + mergeable with the current one into a pair. */ +- if (!satisfies_constraint_Ump (mem) ++ if (aarch64_sve_mode_p (mode) ++ || !satisfies_constraint_Ump (mem) + || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2) +- || (aarch64_simd_decl_p (cfun->decl) && FP_REGNUM_P (regno)) ++ || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno)) + || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]), + GET_MODE_SIZE (mode))) + { + insn = emit_insn (set); +- RTX_FRAME_RELATED_P (insn) = 1; +- if (prologue_p) +- add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set)); +- else +- add_reg_note (insn, REG_CFA_RESTORE, reg); ++ if (frame_related_p) ++ { ++ RTX_FRAME_RELATED_P (insn) = 1; ++ if (prologue_p) ++ add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set)); ++ else ++ add_reg_note (insn, REG_CFA_RESTORE, reg); ++ } + + regno = regno2; + continue; + } + ++ bool frame_related2_p = aarch64_emit_cfi_for_reg_p (regno2); ++ + /* REGNO2 can be saved/restored in a pair with REGNO. */ + rtx reg2 = gen_rtx_REG (mode, regno2); +- if (!frame_pointer_needed) +- offset2 += cfun->machine->frame.frame_size +- - cfun->machine->frame.hard_fp_offset; ++ if (frame_pointer_needed) ++ offset2 -= cfun->machine->frame.below_hard_fp_saved_regs_size; ++ else ++ offset2 += crtl->outgoing_args_size; + rtx addr2 = plus_constant (Pmode, ptr_reg, offset2); + rtx mem2 = gen_frame_mem (mode, addr2); + rtx set2 = prologue_p ? 
gen_rtx_SET (mem2, reg2) +@@ -5281,16 +6807,23 @@ aarch64_process_components (sbitmap components, bool prologue_p) + else + insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2)); + +- RTX_FRAME_RELATED_P (insn) = 1; +- if (prologue_p) +- { +- add_reg_note (insn, REG_CFA_OFFSET, set); +- add_reg_note (insn, REG_CFA_OFFSET, set2); +- } +- else ++ if (frame_related_p || frame_related2_p) + { +- add_reg_note (insn, REG_CFA_RESTORE, reg); +- add_reg_note (insn, REG_CFA_RESTORE, reg2); ++ RTX_FRAME_RELATED_P (insn) = 1; ++ if (prologue_p) ++ { ++ if (frame_related_p) ++ add_reg_note (insn, REG_CFA_OFFSET, set); ++ if (frame_related2_p) ++ add_reg_note (insn, REG_CFA_OFFSET, set2); ++ } ++ else ++ { ++ if (frame_related_p) ++ add_reg_note (insn, REG_CFA_RESTORE, reg); ++ if (frame_related2_p) ++ add_reg_note (insn, REG_CFA_RESTORE, reg2); ++ } + } + + regno = aarch64_get_next_set_bit (components, regno2 + 1); +@@ -5359,15 +6892,31 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, + HOST_WIDE_INT guard_size + = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE); + HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD; +- /* When doing the final adjustment for the outgoing argument size we can't +- assume that LR was saved at position 0. So subtract it's offset from the +- ABI safe buffer so that we don't accidentally allow an adjustment that +- would result in an allocation larger than the ABI buffer without +- probing. */ + HOST_WIDE_INT min_probe_threshold +- = final_adjustment_p +- ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM] +- : guard_size - guard_used_by_caller; ++ = (final_adjustment_p ++ ? guard_used_by_caller ++ : guard_size - guard_used_by_caller); ++ /* When doing the final adjustment for the outgoing arguments, take into ++ account any unprobed space there is above the current SP. There are ++ two cases: ++ ++ - When saving SVE registers below the hard frame pointer, we force ++ the lowest save to take place in the prologue before doing the final ++ adjustment (i.e. we don't allow the save to be shrink-wrapped). ++ This acts as a probe at SP, so there is no unprobed space. ++ ++ - When there are no SVE register saves, we use the store of the link ++ register as a probe. We can't assume that LR was saved at position 0 ++ though, so treat any space below it as unprobed. 
*/ ++ if (final_adjustment_p ++ && known_eq (cfun->machine->frame.below_hard_fp_saved_regs_size, 0)) ++ { ++ poly_int64 lr_offset = cfun->machine->frame.reg_offset[LR_REGNUM]; ++ if (known_ge (lr_offset, 0)) ++ min_probe_threshold -= lr_offset.to_constant (); ++ else ++ gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0)); ++ } + + poly_int64 frame_size = cfun->machine->frame.frame_size; + +@@ -5377,13 +6926,15 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, + if (flag_stack_clash_protection && !final_adjustment_p) + { + poly_int64 initial_adjust = cfun->machine->frame.initial_adjust; ++ poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust; + poly_int64 final_adjust = cfun->machine->frame.final_adjust; + + if (known_eq (frame_size, 0)) + { + dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false); + } +- else if (known_lt (initial_adjust, guard_size - guard_used_by_caller) ++ else if (known_lt (initial_adjust + sve_callee_adjust, ++ guard_size - guard_used_by_caller) + && known_lt (final_adjust, guard_used_by_caller)) + { + dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true); +@@ -5583,24 +7134,10 @@ aarch64_epilogue_uses (int regno) + { + if (regno == LR_REGNUM) + return 1; +- if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno)) +- return 1; + } + return 0; + } + +-/* Add a REG_CFA_EXPRESSION note to INSN to say that register REG +- is saved at BASE + OFFSET. */ +- +-static void +-aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg, +- rtx base, poly_int64 offset) +-{ +- rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset)); +- add_reg_note (insn, REG_CFA_EXPRESSION, +- gen_rtx_SET (mem, regno_reg_rtx[reg])); +-} +- + /* AArch64 stack frames generated by this compiler look like: + + +-------------------------------+ +@@ -5622,8 +7159,12 @@ aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg, + +-------------------------------+ | + | LR' | | + +-------------------------------+ | +- | FP' | / <- hard_frame_pointer_rtx (aligned) +- +-------------------------------+ ++ | FP' | | ++ +-------------------------------+ |<- hard_frame_pointer_rtx (aligned) ++ | SVE vector registers | | \ ++ +-------------------------------+ | | below_hard_fp_saved_regs_size ++ | SVE predicate registers | / / ++ +-------------------------------+ + | dynamic allocation | + +-------------------------------+ + | padding | +@@ -5656,7 +7197,8 @@ aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg, + The following registers are reserved during frame layout and should not be + used for any other purpose: + +- - r11: Used by stack clash protection when SVE is enabled. ++ - r11: Used by stack clash protection when SVE is enabled, and also ++ as an anchor register when saving and restoring registers + - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment. + - r14 and r15: Used for speculation tracking. + - r16(IP0), r17(IP1): Used by indirect tailcalls. 
+@@ -5679,15 +7221,37 @@ aarch64_expand_prologue (void) + HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust; + poly_int64 final_adjust = cfun->machine->frame.final_adjust; + poly_int64 callee_offset = cfun->machine->frame.callee_offset; ++ poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust; ++ poly_int64 below_hard_fp_saved_regs_size ++ = cfun->machine->frame.below_hard_fp_saved_regs_size; + unsigned reg1 = cfun->machine->frame.wb_candidate1; + unsigned reg2 = cfun->machine->frame.wb_candidate2; + bool emit_frame_chain = cfun->machine->frame.emit_frame_chain; + rtx_insn *insn; + ++ if (flag_stack_clash_protection && known_eq (callee_adjust, 0)) ++ { ++ /* Fold the SVE allocation into the initial allocation. ++ We don't do this in aarch64_layout_arg to avoid pessimizing ++ the epilogue code. */ ++ initial_adjust += sve_callee_adjust; ++ sve_callee_adjust = 0; ++ } ++ + /* Sign return address for functions. */ + if (aarch64_return_address_signing_enabled ()) + { +- insn = emit_insn (gen_pacisp ()); ++ switch (aarch64_ra_sign_key) ++ { ++ case AARCH64_KEY_A: ++ insn = emit_insn (gen_paciasp ()); ++ break; ++ case AARCH64_KEY_B: ++ insn = emit_insn (gen_pacibsp ()); ++ break; ++ default: ++ gcc_unreachable (); ++ } + add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx); + RTX_FRAME_RELATED_P (insn) = 1; + } +@@ -5726,18 +7290,27 @@ aarch64_expand_prologue (void) + if (callee_adjust != 0) + aarch64_push_regs (reg1, reg2, callee_adjust); + ++ /* The offset of the frame chain record (if any) from the current SP. */ ++ poly_int64 chain_offset = (initial_adjust + callee_adjust ++ - cfun->machine->frame.hard_fp_offset); ++ gcc_assert (known_ge (chain_offset, 0)); ++ ++ /* The offset of the bottom of the save area from the current SP. */ ++ poly_int64 saved_regs_offset = chain_offset - below_hard_fp_saved_regs_size; ++ + if (emit_frame_chain) + { +- poly_int64 reg_offset = callee_adjust; + if (callee_adjust == 0) + { + reg1 = R29_REGNUM; + reg2 = R30_REGNUM; +- reg_offset = callee_offset; +- aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false); ++ aarch64_save_callee_saves (saved_regs_offset, reg1, reg2, ++ false, false); + } ++ else ++ gcc_assert (known_eq (chain_offset, 0)); + aarch64_add_offset (Pmode, hard_frame_pointer_rtx, +- stack_pointer_rtx, callee_offset, ++ stack_pointer_rtx, chain_offset, + tmp1_rtx, tmp0_rtx, frame_pointer_needed); + if (frame_pointer_needed && !frame_size.is_constant ()) + { +@@ -5764,23 +7337,31 @@ aarch64_expand_prologue (void) + + /* Change the save slot expressions for the registers that + we've already saved. 
*/ +- reg_offset -= callee_offset; +- aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx, +- reg_offset + UNITS_PER_WORD); +- aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx, +- reg_offset); ++ aarch64_add_cfa_expression (insn, regno_reg_rtx[reg2], ++ hard_frame_pointer_rtx, UNITS_PER_WORD); ++ aarch64_add_cfa_expression (insn, regno_reg_rtx[reg1], ++ hard_frame_pointer_rtx, 0); + } + emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx)); + } + +- aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM, +- callee_adjust != 0 || emit_frame_chain); +- if (aarch64_simd_decl_p (cfun->decl)) +- aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM, +- callee_adjust != 0 || emit_frame_chain); +- else +- aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM, +- callee_adjust != 0 || emit_frame_chain); ++ aarch64_save_callee_saves (saved_regs_offset, R0_REGNUM, R30_REGNUM, ++ callee_adjust != 0 || emit_frame_chain, ++ emit_frame_chain); ++ if (maybe_ne (sve_callee_adjust, 0)) ++ { ++ gcc_assert (!flag_stack_clash_protection ++ || known_eq (initial_adjust, 0)); ++ aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, ++ sve_callee_adjust, ++ !frame_pointer_needed, false); ++ saved_regs_offset += sve_callee_adjust; ++ } ++ aarch64_save_callee_saves (saved_regs_offset, P0_REGNUM, P15_REGNUM, ++ false, emit_frame_chain); ++ aarch64_save_callee_saves (saved_regs_offset, V0_REGNUM, V31_REGNUM, ++ callee_adjust != 0 || emit_frame_chain, ++ emit_frame_chain); + + /* We may need to probe the final adjustment if it is larger than the guard + that is assumed by the called. */ +@@ -5806,19 +7387,6 @@ aarch64_use_return_insn_p (void) + return known_eq (cfun->machine->frame.frame_size, 0); + } + +-/* Return false for non-leaf SIMD functions in order to avoid +- shrink-wrapping them. Doing this will lose the necessary +- save/restore of FP registers. */ +- +-bool +-aarch64_use_simple_return_insn_p (void) +-{ +- if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf) +- return false; +- +- return true; +-} +- + /* Generate the epilogue instructions for returning from a function. + This is almost exactly the reverse of the prolog sequence, except + that we need to insert barriers to avoid scheduling loads that read +@@ -5831,6 +7399,9 @@ aarch64_expand_epilogue (bool for_sibcall) + HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust; + poly_int64 final_adjust = cfun->machine->frame.final_adjust; + poly_int64 callee_offset = cfun->machine->frame.callee_offset; ++ poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust; ++ poly_int64 below_hard_fp_saved_regs_size ++ = cfun->machine->frame.below_hard_fp_saved_regs_size; + unsigned reg1 = cfun->machine->frame.wb_candidate1; + unsigned reg2 = cfun->machine->frame.wb_candidate2; + rtx cfi_ops = NULL; +@@ -5844,15 +7415,23 @@ aarch64_expand_epilogue (bool for_sibcall) + = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE); + HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD; + +- /* We can re-use the registers when the allocation amount is smaller than +- guard_size - guard_used_by_caller because we won't be doing any probes +- then. 
In such situations the register should remain live with the correct ++ /* We can re-use the registers when: ++ ++ (a) the deallocation amount is the same as the corresponding ++ allocation amount (which is false if we combine the initial ++ and SVE callee save allocations in the prologue); and ++ ++ (b) the allocation amount doesn't need a probe (which is false ++ if the amount is guard_size - guard_used_by_caller or greater). ++ ++ In such situations the register should remain live with the correct + value. */ + bool can_inherit_p = (initial_adjust.is_constant () +- && final_adjust.is_constant ()) ++ && final_adjust.is_constant () + && (!flag_stack_clash_protection +- || known_lt (initial_adjust, +- guard_size - guard_used_by_caller)); ++ || (known_lt (initial_adjust, ++ guard_size - guard_used_by_caller) ++ && known_eq (sve_callee_adjust, 0)))); + + /* We need to add memory barrier to prevent read from deallocated stack. */ + bool need_barrier_p +@@ -5877,7 +7456,8 @@ aarch64_expand_epilogue (bool for_sibcall) + /* If writeback is used when restoring callee-saves, the CFA + is restored on the instruction doing the writeback. */ + aarch64_add_offset (Pmode, stack_pointer_rtx, +- hard_frame_pointer_rtx, -callee_offset, ++ hard_frame_pointer_rtx, ++ -callee_offset - below_hard_fp_saved_regs_size, + tmp1_rtx, tmp0_rtx, callee_adjust == 0); + else + /* The case where we need to re-use the register here is very rare, so +@@ -5885,14 +7465,17 @@ aarch64_expand_epilogue (bool for_sibcall) + immediate doesn't fit. */ + aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true); + +- aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM, ++ /* Restore the vector registers before the predicate registers, ++ so that we can use P4 as a temporary for big-endian SVE frames. */ ++ aarch64_restore_callee_saves (callee_offset, V0_REGNUM, V31_REGNUM, ++ callee_adjust != 0, &cfi_ops); ++ aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM, ++ false, &cfi_ops); ++ if (maybe_ne (sve_callee_adjust, 0)) ++ aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true); ++ aarch64_restore_callee_saves (callee_offset - sve_callee_adjust, ++ R0_REGNUM, R30_REGNUM, + callee_adjust != 0, &cfi_ops); +- if (aarch64_simd_decl_p (cfun->decl)) +- aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM, +- callee_adjust != 0, &cfi_ops); +- else +- aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM, +- callee_adjust != 0, &cfi_ops); + + if (need_barrier_p) + emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx)); +@@ -5943,13 +7526,23 @@ aarch64_expand_epilogue (bool for_sibcall) + if (aarch64_return_address_signing_enabled () + && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return)) + { +- insn = emit_insn (gen_autisp ()); ++ switch (aarch64_ra_sign_key) ++ { ++ case AARCH64_KEY_A: ++ insn = emit_insn (gen_autiasp ()); ++ break; ++ case AARCH64_KEY_B: ++ insn = emit_insn (gen_autibsp ()); ++ break; ++ default: ++ gcc_unreachable (); ++ } + add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx); + RTX_FRAME_RELATED_P (insn) = 1; + } + + /* Stack adjustment for exception handler. */ +- if (crtl->calls_eh_return) ++ if (crtl->calls_eh_return && !for_sibcall) + { + /* We need to unwind the stack by the offset computed by + EH_RETURN_STACKADJ_RTX. 
We have already reset the CFA +@@ -6015,6 +7608,7 @@ aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED, + int this_regno = R0_REGNUM; + rtx this_rtx, temp0, temp1, addr, funexp; + rtx_insn *insn; ++ const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk)); + + if (aarch64_bti_enabled ()) + emit_insn (gen_bti_c()); +@@ -6077,14 +7671,18 @@ aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED, + } + funexp = XEXP (DECL_RTL (function), 0); + funexp = gen_rtx_MEM (FUNCTION_MODE, funexp); +- insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX)); ++ rtx callee_abi = gen_int_mode (fndecl_abi (function).id (), DImode); ++ insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi)); + SIBLING_CALL_P (insn) = 1; + + insn = get_insns (); + shorten_branches (insn); ++ ++ assemble_start_function (thunk, fnname); + final_start_function (insn, file, 1); + final (insn, file, 1); + final_end_function (); ++ assemble_end_function (thunk, fnname); + + /* Stop pretending to be a post-reload pass. */ + reload_completed = 0; +@@ -6608,9 +8206,15 @@ aarch64_classify_address (struct aarch64_address_info *info, + + HOST_WIDE_INT const_size; + ++ /* Whether a vector mode is partial doesn't affect address legitimacy. ++ Partial vectors like VNx8QImode allow the same indexed addressing ++ mode and MUL VL addressing mode as full vectors like VNx16QImode; ++ in both cases, MUL VL counts multiples of GET_MODE_SIZE. */ ++ unsigned int vec_flags = aarch64_classify_vector_mode (mode); ++ vec_flags &= ~VEC_PARTIAL; ++ + /* On BE, we use load/store pair for all large int mode load/stores. + TI/TFmode may also use a load/store pair. */ +- unsigned int vec_flags = aarch64_classify_vector_mode (mode); + bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT)); + bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP + || type == ADDR_QUERY_LDP_STP_N +@@ -6628,7 +8232,7 @@ aarch64_classify_address (struct aarch64_address_info *info, + bool allow_reg_index_p = (!load_store_pair_p + && (known_lt (GET_MODE_SIZE (mode), 16) + || vec_flags == VEC_ADVSIMD +- || vec_flags == VEC_SVE_DATA)); ++ || vec_flags & VEC_SVE_DATA)); + + /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and + [Rn, #offset, MUL VL]. */ +@@ -7152,11 +8756,12 @@ aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2) + RESULT is the register in which the result is returned. It's NULL for + "call" and "sibcall". + MEM is the location of the function call. ++ CALLEE_ABI is a const_int that gives the arm_pcs of the callee. + SIBCALL indicates whether this function call is normal call or sibling call. + It will generate different pattern accordingly. 
*/ + + void +-aarch64_expand_call (rtx result, rtx mem, bool sibcall) ++aarch64_expand_call (rtx result, rtx mem, rtx callee_abi, bool sibcall) + { + rtx call, callee, tmp; + rtvec vec; +@@ -7186,7 +8791,11 @@ aarch64_expand_call (rtx result, rtx mem, bool sibcall) + else + tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM)); + +- vec = gen_rtvec (2, call, tmp); ++ gcc_assert (CONST_INT_P (callee_abi)); ++ callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi), ++ UNSPEC_CALLEE_ABI); ++ ++ vec = gen_rtvec (3, call, callee_abi, tmp); + call = gen_rtx_PARALLEL (VOIDmode, vec); + + aarch64_emit_call_insn (call); +@@ -7382,6 +8991,21 @@ aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code) + } + break; + ++ case E_CC_NZCmode: ++ switch (comp_code) ++ { ++ case NE: return AARCH64_NE; /* = any */ ++ case EQ: return AARCH64_EQ; /* = none */ ++ case GE: return AARCH64_PL; /* = nfrst */ ++ case LT: return AARCH64_MI; /* = first */ ++ case GEU: return AARCH64_CS; /* = nlast */ ++ case GTU: return AARCH64_HI; /* = pmore */ ++ case LEU: return AARCH64_LS; /* = plast */ ++ case LTU: return AARCH64_CC; /* = last */ ++ default: return -1; ++ } ++ break; ++ + case E_CC_NZmode: + switch (comp_code) + { +@@ -7524,15 +9148,24 @@ aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate) + if (negate) + r = real_value_negate (&r); + +- /* We only handle the SVE single-bit immediates here. */ ++ /* Handle the SVE single-bit immediates specially, since they have a ++ fixed form in the assembly syntax. */ + if (real_equal (&r, &dconst0)) + asm_fprintf (f, "0.0"); ++ else if (real_equal (&r, &dconst2)) ++ asm_fprintf (f, "2.0"); + else if (real_equal (&r, &dconst1)) + asm_fprintf (f, "1.0"); + else if (real_equal (&r, &dconsthalf)) + asm_fprintf (f, "0.5"); + else +- return false; ++ { ++ const int buf_size = 20; ++ char float_buf[buf_size] = {'\0'}; ++ real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size, ++ 1, GET_MODE (elt)); ++ asm_fprintf (f, "%s", float_buf); ++ } + + return true; + } +@@ -7560,7 +9193,13 @@ sizetochar (int size) + 'D': Take the duplicated element in a vector constant + and print it as an unsigned integer, in decimal. + 'e': Print the sign/zero-extend size as a character 8->b, +- 16->h, 32->w. ++ 16->h, 32->w. Can also be used for masks: ++ 0xff->b, 0xffff->h, 0xffffffff->w. ++ 'I': If the operand is a duplicated vector constant, ++ replace it with the duplicated scalar. If the ++ operand is then a floating-point constant, replace ++ it with the integer bit representation. Print the ++ transformed constant as a signed decimal number. + 'p': Prints N such that 2^N == X (X must be power of 2 and + const int). + 'P': Print the number of non-zero bits in X (a const_int). +@@ -7574,7 +9213,7 @@ sizetochar (int size) + 'S/T/U/V': Print a FP/SIMD register name for a register list. + The register printed is the FP/SIMD register name + of X + 0/1/2/3 for S/T/U/V. +- 'R': Print a scalar FP/SIMD register name + 1. ++ 'R': Print a scalar Integer/FP/SIMD register name + 1. + 'X': Print bottom 16 bits of integer constant in hex. + 'w/x': Print a general register name or the zero register + (32-bit or 64-bit). 
+@@ -7626,27 +9265,22 @@ aarch64_print_operand (FILE *f, rtx x, int code) + + case 'e': + { +- int n; +- +- if (!CONST_INT_P (x) +- || (n = exact_log2 (INTVAL (x) & ~7)) <= 0) ++ x = unwrap_const_vec_duplicate (x); ++ if (!CONST_INT_P (x)) + { + output_operand_lossage ("invalid operand for '%%%c'", code); + return; + } + +- switch (n) ++ HOST_WIDE_INT val = INTVAL (x); ++ if ((val & ~7) == 8 || val == 0xff) ++ fputc ('b', f); ++ else if ((val & ~7) == 16 || val == 0xffff) ++ fputc ('h', f); ++ else if ((val & ~7) == 32 || val == 0xffffffff) ++ fputc ('w', f); ++ else + { +- case 3: +- fputc ('b', f); +- break; +- case 4: +- fputc ('h', f); +- break; +- case 5: +- fputc ('w', f); +- break; +- default: + output_operand_lossage ("invalid operand for '%%%c'", code); + return; + } +@@ -7693,6 +9327,19 @@ aarch64_print_operand (FILE *f, rtx x, int code) + asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]); + break; + ++ case 'I': ++ { ++ x = aarch64_bit_representation (unwrap_const_vec_duplicate (x)); ++ if (CONST_INT_P (x)) ++ asm_fprintf (f, "%wd", INTVAL (x)); ++ else ++ { ++ output_operand_lossage ("invalid operand for '%%%c'", code); ++ return; ++ } ++ break; ++ } ++ + case 'M': + case 'm': + { +@@ -7715,7 +9362,10 @@ aarch64_print_operand (FILE *f, rtx x, int code) + gcc_assert (cond_code >= 0); + if (code == 'M') + cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code); +- fputs (aarch64_condition_codes[cond_code], f); ++ if (GET_MODE (XEXP (x, 0)) == CC_NZCmode) ++ fputs (aarch64_sve_condition_codes[cond_code], f); ++ else ++ fputs (aarch64_condition_codes[cond_code], f); + } + break; + +@@ -7766,12 +9416,13 @@ aarch64_print_operand (FILE *f, rtx x, int code) + break; + + case 'R': +- if (!REG_P (x) || !FP_REGNUM_P (REGNO (x))) +- { +- output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code); +- return; +- } +- asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1); ++ if (REG_P (x) && FP_REGNUM_P (REGNO (x))) ++ asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1); ++ else if (REG_P (x) && GP_REGNUM_P (REGNO (x))) ++ asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1); ++ else ++ output_operand_lossage ("incompatible register operand for '%%%c'", ++ code); + break; + + case 'X': +@@ -8068,7 +9719,7 @@ aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x, + aarch64_addr_query_type type) + { + struct aarch64_address_info addr; +- unsigned int size; ++ unsigned int size, vec_flags; + + /* Check all addresses are Pmode - including ILP32. 
*/ + if (GET_MODE (x) != Pmode +@@ -8084,26 +9735,24 @@ aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x, + { + case ADDRESS_REG_IMM: + if (known_eq (addr.const_offset, 0)) +- asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]); +- else if (aarch64_sve_data_mode_p (mode)) + { +- HOST_WIDE_INT vnum +- = exact_div (addr.const_offset, +- BYTES_PER_SVE_VECTOR).to_constant (); +- asm_fprintf (f, "[%s, #%wd, mul vl]", +- reg_names[REGNO (addr.base)], vnum); ++ asm_fprintf (f, "[%s]", reg_names[REGNO (addr.base)]); ++ return true; + } +- else if (aarch64_sve_pred_mode_p (mode)) ++ ++ vec_flags = aarch64_classify_vector_mode (mode); ++ if (vec_flags & VEC_ANY_SVE) + { + HOST_WIDE_INT vnum + = exact_div (addr.const_offset, +- BYTES_PER_SVE_PRED).to_constant (); ++ aarch64_vl_bytes (mode, vec_flags)).to_constant (); + asm_fprintf (f, "[%s, #%wd, mul vl]", + reg_names[REGNO (addr.base)], vnum); ++ return true; + } +- else +- asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)], +- INTVAL (addr.offset)); ++ ++ asm_fprintf (f, "[%s, %wd]", reg_names[REGNO (addr.base)], ++ INTVAL (addr.offset)); + return true; + + case ADDRESS_REG_REG: +@@ -8234,11 +9883,15 @@ aarch64_regno_regclass (unsigned regno) + return POINTER_REGS; + + if (FP_REGNUM_P (regno)) +- return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS; ++ return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS ++ : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS); + + if (PR_REGNUM_P (regno)) + return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS; + ++ if (regno == FFR_REGNUM || regno == FFRT_REGNUM) ++ return FFR_REGS; ++ + return NO_REGS; + } + +@@ -8348,13 +10001,14 @@ aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x, + secondary_reload_info *sri) + { + /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled +- directly by the *aarch64_sve_mov_be move pattern. See the ++ directly by the *aarch64_sve_mov_[lb]e move patterns. See the + comment at the head of aarch64-sve.md for more details about the + big-endian handling. */ + if (BYTES_BIG_ENDIAN + && reg_class_subset_p (rclass, FP_REGS) + && !((REG_P (x) && HARD_REGISTER_P (x)) + || aarch64_simd_valid_immediate (x, NULL)) ++ && mode != VNx16QImode + && aarch64_sve_data_mode_p (mode)) + { + sri->icode = CODE_FOR_aarch64_sve_reload_be; +@@ -8514,7 +10168,7 @@ aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode) + can hold MODE, but at the moment we need to handle all modes. + Just ignore any runtime parts for registers that can't store them. */ + HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode)); +- unsigned int nregs; ++ unsigned int nregs, vec_flags; + switch (regclass) + { + case TAILCALL_ADDR_REGS: +@@ -8524,17 +10178,21 @@ aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode) + case POINTER_AND_FP_REGS: + case FP_REGS: + case FP_LO_REGS: +- if (aarch64_sve_data_mode_p (mode) ++ case FP_LO8_REGS: ++ vec_flags = aarch64_classify_vector_mode (mode); ++ if ((vec_flags & VEC_SVE_DATA) + && constant_multiple_p (GET_MODE_SIZE (mode), +- BYTES_PER_SVE_VECTOR, &nregs)) ++ aarch64_vl_bytes (mode, vec_flags), &nregs)) + return nregs; +- return (aarch64_vector_data_mode_p (mode) ++ return (vec_flags & VEC_ADVSIMD + ? 
CEIL (lowest_size, UNITS_PER_VREG) + : CEIL (lowest_size, UNITS_PER_WORD)); + case STACK_REG: + case PR_REGS: + case PR_LO_REGS: + case PR_HI_REGS: ++ case FFR_REGS: ++ case PR_AND_FFR_REGS: + return 1; + + case NO_REGS: +@@ -10715,6 +12373,14 @@ aarch64_register_move_cost (machine_mode mode, + if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS) + from = GENERAL_REGS; + ++ /* Make RDFFR very expensive. In particular, if we know that the FFR ++ contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR ++ as a way of obtaining a PTRUE. */ ++ if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL ++ && hard_reg_set_subset_p (reg_class_contents[from_i], ++ reg_class_contents[FFR_REGS])) ++ return 80; ++ + /* Moving between GPR and stack cost is the same as GP2GP. */ + if ((from == GENERAL_REGS && to == STACK_REG) + || (to == GENERAL_REGS && from == STACK_REG)) +@@ -10764,6 +12430,93 @@ aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED, + return aarch64_tune_params.memmov_cost; + } + ++/* Implement TARGET_INIT_BUILTINS. */ ++static void ++aarch64_init_builtins () ++{ ++ aarch64_general_init_builtins (); ++ aarch64_sve::init_builtins (); ++} ++ ++/* Implement TARGET_FOLD_BUILTIN. */ ++static tree ++aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool) ++{ ++ unsigned int code = DECL_MD_FUNCTION_CODE (fndecl); ++ unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT; ++ tree type = TREE_TYPE (TREE_TYPE (fndecl)); ++ switch (code & AARCH64_BUILTIN_CLASS) ++ { ++ case AARCH64_BUILTIN_GENERAL: ++ return aarch64_general_fold_builtin (subcode, type, nargs, args); ++ ++ case AARCH64_BUILTIN_SVE: ++ return NULL_TREE; ++ } ++ gcc_unreachable (); ++} ++ ++/* Implement TARGET_GIMPLE_FOLD_BUILTIN. */ ++static bool ++aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi) ++{ ++ gcall *stmt = as_a (gsi_stmt (*gsi)); ++ tree fndecl = gimple_call_fndecl (stmt); ++ unsigned int code = DECL_MD_FUNCTION_CODE (fndecl); ++ unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT; ++ gimple *new_stmt = NULL; ++ switch (code & AARCH64_BUILTIN_CLASS) ++ { ++ case AARCH64_BUILTIN_GENERAL: ++ new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt); ++ break; ++ ++ case AARCH64_BUILTIN_SVE: ++ new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt); ++ break; ++ } ++ ++ if (!new_stmt) ++ return false; ++ ++ gsi_replace (gsi, new_stmt, true); ++ return true; ++} ++ ++/* Implement TARGET_EXPAND_BUILTIN. */ ++static rtx ++aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore) ++{ ++ tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0); ++ unsigned int code = DECL_MD_FUNCTION_CODE (fndecl); ++ unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT; ++ switch (code & AARCH64_BUILTIN_CLASS) ++ { ++ case AARCH64_BUILTIN_GENERAL: ++ return aarch64_general_expand_builtin (subcode, exp, target, ignore); ++ ++ case AARCH64_BUILTIN_SVE: ++ return aarch64_sve::expand_builtin (subcode, exp, target); ++ } ++ gcc_unreachable (); ++} ++ ++/* Implement TARGET_BUILTIN_DECL. */ ++static tree ++aarch64_builtin_decl (unsigned int code, bool initialize_p) ++{ ++ unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT; ++ switch (code & AARCH64_BUILTIN_CLASS) ++ { ++ case AARCH64_BUILTIN_GENERAL: ++ return aarch64_general_builtin_decl (subcode, initialize_p); ++ ++ case AARCH64_BUILTIN_SVE: ++ return aarch64_sve::builtin_decl (subcode, initialize_p); ++ } ++ gcc_unreachable (); ++} ++ + /* Return true if it is safe and beneficial to use the approximate rsqrt optabs + to optimize 1.0/sqrt. 
*/ + +@@ -10787,7 +12540,17 @@ aarch64_builtin_reciprocal (tree fndecl) + + if (!use_rsqrt_p (mode)) + return NULL_TREE; +- return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl)); ++ unsigned int code = DECL_MD_FUNCTION_CODE (fndecl); ++ unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT; ++ switch (code & AARCH64_BUILTIN_CLASS) ++ { ++ case AARCH64_BUILTIN_GENERAL: ++ return aarch64_general_builtin_rsqrt (subcode); ++ ++ case AARCH64_BUILTIN_SVE: ++ return NULL_TREE; ++ } ++ gcc_unreachable (); + } + + /* Emit instruction sequence to compute either the approximate square root +@@ -11096,7 +12859,7 @@ static void initialize_aarch64_code_model (struct gcc_options *); + + static enum aarch64_parse_opt_result + aarch64_parse_arch (const char *to_parse, const struct processor **res, +- unsigned long *isa_flags, std::string *invalid_extension) ++ uint64_t *isa_flags, std::string *invalid_extension) + { + const char *ext; + const struct processor *arch; +@@ -11119,7 +12882,7 @@ aarch64_parse_arch (const char *to_parse, const struct processor **res, + if (strlen (arch->name) == len + && strncmp (arch->name, to_parse, len) == 0) + { +- unsigned long isa_temp = arch->flags; ++ uint64_t isa_temp = arch->flags; + + if (ext != NULL) + { +@@ -11151,7 +12914,7 @@ aarch64_parse_arch (const char *to_parse, const struct processor **res, + + static enum aarch64_parse_opt_result + aarch64_parse_cpu (const char *to_parse, const struct processor **res, +- unsigned long *isa_flags, std::string *invalid_extension) ++ uint64_t *isa_flags, std::string *invalid_extension) + { + const char *ext; + const struct processor *cpu; +@@ -11173,7 +12936,7 @@ aarch64_parse_cpu (const char *to_parse, const struct processor **res, + { + if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0) + { +- unsigned long isa_temp = cpu->flags; ++ uint64_t isa_temp = cpu->flags; + + + if (ext != NULL) +@@ -11758,7 +13521,7 @@ aarch64_print_hint_for_extensions (const std::string &str) + + static bool + aarch64_validate_mcpu (const char *str, const struct processor **res, +- unsigned long *isa_flags) ++ uint64_t *isa_flags) + { + std::string invalid_extension; + enum aarch64_parse_opt_result parse_res +@@ -11885,9 +13648,9 @@ aarch64_validate_mbranch_protection (const char *const_str) + enum aarch64_parse_opt_result res = + aarch64_parse_branch_protection (const_str, &str); + if (res == AARCH64_PARSE_INVALID_ARG) +- error ("invalid arg %<%s%> for %<-mbranch-protection=%>", str); ++ error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str); + else if (res == AARCH64_PARSE_MISSING_ARG) +- error ("missing arg for %<-mbranch-protection=%>"); ++ error ("missing argument for %<-mbranch-protection=%>"); + free (str); + return res == AARCH64_PARSE_OK; + } +@@ -11899,7 +13662,7 @@ aarch64_validate_mbranch_protection (const char *const_str) + + static bool + aarch64_validate_march (const char *str, const struct processor **res, +- unsigned long *isa_flags) ++ uint64_t *isa_flags) + { + std::string invalid_extension; + enum aarch64_parse_opt_result parse_res +@@ -12014,8 +13777,8 @@ aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value) + static void + aarch64_override_options (void) + { +- unsigned long cpu_isa = 0; +- unsigned long arch_isa = 0; ++ uint64_t cpu_isa = 0; ++ uint64_t arch_isa = 0; + aarch64_isa_flags = 0; + + bool valid_cpu = true; +@@ -12255,7 +14018,7 @@ aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr) + { + const struct processor *cpu + = 
aarch64_get_tune_cpu (ptr->x_explicit_tune_core); +- unsigned long isa_flags = ptr->x_aarch64_isa_flags; ++ uint64_t isa_flags = ptr->x_aarch64_isa_flags; + const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch); + std::string extension + = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags); +@@ -12508,7 +14271,7 @@ static bool + aarch64_handle_attr_isa_flags (char *str) + { + enum aarch64_parse_opt_result parse_res; +- unsigned long isa_flags = aarch64_isa_flags; ++ uint64_t isa_flags = aarch64_isa_flags; + + /* We allow "+nothing" in the beginning to clear out all architectural + features if the user wants to handpick specific features. */ +@@ -12999,6 +14762,26 @@ aarch64_can_inline_p (tree caller, tree callee) + return true; + } + ++/* Return the ID of the TLDESC ABI, initializing the descriptor if hasn't ++ been already. */ ++ ++unsigned int ++aarch64_tlsdesc_abi_id () ++{ ++ predefined_function_abi &tlsdesc_abi = function_abis[ARM_PCS_TLSDESC]; ++ if (!tlsdesc_abi.initialized_p ()) ++ { ++ HARD_REG_SET full_reg_clobbers; ++ CLEAR_HARD_REG_SET (full_reg_clobbers); ++ SET_HARD_REG_BIT (full_reg_clobbers, R0_REGNUM); ++ SET_HARD_REG_BIT (full_reg_clobbers, CC_REGNUM); ++ for (int regno = P0_REGNUM; regno <= P15_REGNUM; ++regno) ++ SET_HARD_REG_BIT (full_reg_clobbers, regno); ++ tlsdesc_abi.initialize (ARM_PCS_TLSDESC, full_reg_clobbers); ++ } ++ return tlsdesc_abi.id (); ++} ++ + /* Return true if SYMBOL_REF X binds locally. */ + + static bool +@@ -13104,26 +14887,31 @@ aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset) + the offset does not cause overflow of the final address. But + we have no way of knowing the address of symbol at compile time + so we can't accurately say if the distance between the PC and +- symbol + offset is outside the addressible range of +/-1M in the +- TINY code model. So we rely on images not being greater than +- 1M and cap the offset at 1M and anything beyond 1M will have to +- be loaded using an alternative mechanism. Furthermore if the +- symbol is a weak reference to something that isn't known to +- resolve to a symbol in this module, then force to memory. */ +- if ((SYMBOL_REF_WEAK (x) +- && !aarch64_symbol_binds_local_p (x)) +- || !IN_RANGE (offset, -1048575, 1048575)) ++ symbol + offset is outside the addressible range of +/-1MB in the ++ TINY code model. So we limit the maximum offset to +/-64KB and ++ assume the offset to the symbol is not larger than +/-(1MB - 64KB). ++ If offset_within_block_p is true we allow larger offsets. ++ Furthermore force to memory if the symbol is a weak reference to ++ something that doesn't resolve to a symbol in this module. */ ++ ++ if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x)) ++ return SYMBOL_FORCE_TO_MEM; ++ if (!(IN_RANGE (offset, -0x10000, 0x10000) ++ || offset_within_block_p (x, offset))) + return SYMBOL_FORCE_TO_MEM; ++ + return SYMBOL_TINY_ABSOLUTE; + + case AARCH64_CMODEL_SMALL: + /* Same reasoning as the tiny code model, but the offset cap here is +- 4G. */ +- if ((SYMBOL_REF_WEAK (x) +- && !aarch64_symbol_binds_local_p (x)) +- || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263), +- HOST_WIDE_INT_C (4294967264))) ++ 1MB, allowing +/-3.9GB for the offset to the symbol. 
*/ ++ ++ if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x)) + return SYMBOL_FORCE_TO_MEM; ++ if (!(IN_RANGE (offset, -0x100000, 0x100000) ++ || offset_within_block_p (x, offset))) ++ return SYMBOL_FORCE_TO_MEM; ++ + return SYMBOL_SMALL_ABSOLUTE; + + case AARCH64_CMODEL_TINY_PIC: +@@ -13432,7 +15220,7 @@ aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p, + HOST_WIDE_INT size, rsize, adjust, align; + tree t, u, cond1, cond2; + +- indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false); ++ indirect_p = pass_va_arg_by_reference (type); + if (indirect_p) + type = build_pointer_type (type); + +@@ -13626,6 +15414,10 @@ aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p, + field_t = aarch64_fp16_type_node; + field_ptr_t = aarch64_fp16_ptr_type_node; + break; ++ case E_BFmode: ++ field_t = aarch64_bf16_type_node; ++ field_ptr_t = aarch64_bf16_ptr_type_node; ++ break; + case E_V2SImode: + case E_V4SImode: + { +@@ -13677,9 +15469,9 @@ aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p, + /* Implement TARGET_SETUP_INCOMING_VARARGS. */ + + static void +-aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode, +- tree type, int *pretend_size ATTRIBUTE_UNUSED, +- int no_rtl) ++aarch64_setup_incoming_varargs (cumulative_args_t cum_v, ++ const function_arg_info &arg, ++ int *pretend_size ATTRIBUTE_UNUSED, int no_rtl) + { + CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); + CUMULATIVE_ARGS local_cum; +@@ -13690,7 +15482,7 @@ aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode, + argument. Advance a local copy of CUM past the last "real" named + argument, to find out how many registers are left over. */ + local_cum = *cum; +- aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true); ++ aarch64_function_arg_advance (pack_cumulative_args(&local_cum), arg); + + /* Found out how many registers we need to save. + Honor tree-stdvar analysis results. */ +@@ -13777,6 +15569,10 @@ aarch64_conditional_register_usage (void) + call_used_regs[i] = 1; + } + ++ /* Only allow the FFR and FFRT to be accessed via special patterns. */ ++ CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM); ++ CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM); ++ + /* When tracking speculation, we need a couple of call-clobbered registers + to track the speculation state. It would be nice to just use + IP0 and IP1, but currently there are numerous places that just +@@ -13802,6 +15598,10 @@ aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep) + machine_mode mode; + HOST_WIDE_INT size; + ++ /* SVE types (and types containing SVE types) must be handled ++ before calling this function. 
*/ ++ gcc_assert (!aarch64_sve::builtin_type_p (type)); ++ + switch (TREE_CODE (type)) + { + case REAL_TYPE: +@@ -13973,6 +15773,9 @@ aarch64_short_vector_p (const_tree type, + { + poly_int64 size = -1; + ++ if (type && aarch64_sve::builtin_type_p (type)) ++ return false; ++ + if (type && TREE_CODE (type) == VECTOR_TYPE) + size = int_size_in_bytes (type); + else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT +@@ -14033,11 +15836,14 @@ aarch64_vfp_is_call_or_return_candidate (machine_mode mode, + int *count, + bool *is_ha) + { ++ if (is_ha != NULL) *is_ha = false; ++ ++ if (type && aarch64_sve::builtin_type_p (type)) ++ return false; ++ + machine_mode new_mode = VOIDmode; + bool composite_p = aarch64_composite_type_p (type, mode); + +- if (is_ha != NULL) *is_ha = false; +- + if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT) + || aarch64_short_vector_p (type, mode)) + { +@@ -14083,7 +15889,63 @@ static bool + aarch64_vector_mode_supported_p (machine_mode mode) + { + unsigned int vec_flags = aarch64_classify_vector_mode (mode); +- return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0; ++ return vec_flags != 0 && (vec_flags & (VEC_STRUCT | VEC_PARTIAL)) == 0; ++} ++ ++/* Return the full-width SVE vector mode for element mode MODE, if one ++ exists. */ ++opt_machine_mode ++aarch64_full_sve_mode (scalar_mode mode) ++{ ++ switch (mode) ++ { ++ case E_DFmode: ++ return VNx2DFmode; ++ case E_SFmode: ++ return VNx4SFmode; ++ case E_HFmode: ++ return VNx8HFmode; ++ case E_BFmode: ++ return VNx8BFmode; ++ case E_DImode: ++ return VNx2DImode; ++ case E_SImode: ++ return VNx4SImode; ++ case E_HImode: ++ return VNx8HImode; ++ case E_QImode: ++ return VNx16QImode; ++ default: ++ return opt_machine_mode (); ++ } ++} ++ ++/* Return the 128-bit Advanced SIMD vector mode for element mode MODE, ++ if it exists. 
*/ ++opt_machine_mode ++aarch64_vq_mode (scalar_mode mode) ++{ ++ switch (mode) ++ { ++ case E_DFmode: ++ return V2DFmode; ++ case E_SFmode: ++ return V4SFmode; ++ case E_HFmode: ++ return V8HFmode; ++ case E_BFmode: ++ return V8BFmode; ++ case E_SImode: ++ return V4SImode; ++ case E_HImode: ++ return V8HImode; ++ case E_QImode: ++ return V16QImode; ++ case E_DImode: ++ return V2DImode; ++ default: ++ return opt_machine_mode (); ++ } + } + + /* Return appropriate SIMD container +@@ -14092,49 +15954,13 @@ static machine_mode + aarch64_simd_container_mode (scalar_mode mode, poly_int64 width) + { + if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR)) +- switch (mode) +- { +- case E_DFmode: +- return VNx2DFmode; +- case E_SFmode: +- return VNx4SFmode; +- case E_HFmode: +- return VNx8HFmode; +- case E_DImode: +- return VNx2DImode; +- case E_SImode: +- return VNx4SImode; +- case E_HImode: +- return VNx8HImode; +- case E_QImode: +- return VNx16QImode; +- default: +- return word_mode; +- } ++ return aarch64_full_sve_mode (mode).else_mode (word_mode); + + gcc_assert (known_eq (width, 64) || known_eq (width, 128)); + if (TARGET_SIMD) + { + if (known_eq (width, 128)) +- switch (mode) +- { +- case E_DFmode: +- return V2DFmode; +- case E_SFmode: +- return V4SFmode; +- case E_HFmode: +- return V8HFmode; +- case E_SImode: +- return V4SImode; +- case E_HImode: +- return V8HImode; +- case E_QImode: +- return V16QImode; +- case E_DImode: +- return V2DImode; +- default: +- break; +- } ++ return aarch64_vq_mode (mode).else_mode (word_mode); + else + switch (mode) + { +@@ -14142,6 +15968,8 @@ aarch64_simd_container_mode (scalar_mode mode, poly_int64 width) + return V2SFmode; + case E_HFmode: + return V4HFmode; ++ case E_BFmode: ++ return V4BFmode; + case E_SImode: + return V2SImode; + case E_HImode: +@@ -14205,14 +16033,24 @@ aarch64_mangle_type (const_tree type) + if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type)) + return "St9__va_list"; + +- /* Half-precision float. */ ++ /* Half-precision floating point types. */ + if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16) +- return "Dh"; ++ { ++ if (TYPE_MODE (type) == BFmode) ++ return "u6__bf16"; ++ else ++ return "Dh"; ++ } + + /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for + builtin types. */ + if (TYPE_NAME (type) != NULL) +- return aarch64_mangle_builtin_type (type); ++ { ++ const char *res; ++ if ((res = aarch64_general_mangle_builtin_type (type)) ++ || (res = aarch64_sve::mangle_builtin_type (type))) ++ return res; ++ } + + /* Use the default mangling. */ + return NULL; +@@ -14370,6 +16208,27 @@ aarch64_sve_arith_immediate_p (rtx x, bool negate_p) + return IN_RANGE (val, 0, 0xff00); + } + ++/* Return true if X is a valid immediate for the SVE SQADD and SQSUB ++ instructions. Negate X first if NEGATE_P is true. */ ++ ++bool ++aarch64_sve_sqadd_sqsub_immediate_p (rtx x, bool negate_p) ++{ ++ rtx elt; ++ ++ if (!const_vec_duplicate_p (x, &elt) ++ || !CONST_INT_P (elt)) ++ return false; ++ ++ if (!aarch64_sve_arith_immediate_p (x, negate_p)) ++ return false; ++ ++ /* After the optional negation, the immediate must be nonnegative. ++ E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127 ++ instead of SQADD Zn.B, Zn.B, #129. */ ++ return negate_p == (INTVAL (elt) < 0); ++} ++ + /* Return true if X is a valid immediate operand for an SVE logical + instruction such as AND. 
*/ + +@@ -14390,13 +16249,11 @@ aarch64_sve_bitmask_immediate_p (rtx x) + bool + aarch64_sve_dup_immediate_p (rtx x) + { +- rtx elt; +- +- if (!const_vec_duplicate_p (x, &elt) +- || !CONST_INT_P (elt)) ++ x = aarch64_bit_representation (unwrap_const_vec_duplicate (x)); ++ if (!CONST_INT_P (x)) + return false; + +- HOST_WIDE_INT val = INTVAL (elt); ++ HOST_WIDE_INT val = INTVAL (x); + if (val & 0xff) + return IN_RANGE (val, -0x80, 0x7f); + return IN_RANGE (val, -0x8000, 0x7f00); +@@ -14408,13 +16265,11 @@ aarch64_sve_dup_immediate_p (rtx x) + bool + aarch64_sve_cmp_immediate_p (rtx x, bool signed_p) + { +- rtx elt; +- +- return (const_vec_duplicate_p (x, &elt) +- && CONST_INT_P (elt) ++ x = unwrap_const_vec_duplicate (x); ++ return (CONST_INT_P (x) + && (signed_p +- ? IN_RANGE (INTVAL (elt), -16, 15) +- : IN_RANGE (INTVAL (elt), 0, 127))); ++ ? IN_RANGE (INTVAL (x), -16, 15) ++ : IN_RANGE (INTVAL (x), 0, 127))); + } + + /* Return true if X is a valid immediate operand for an SVE FADD or FSUB +@@ -14450,11 +16305,10 @@ aarch64_sve_float_mul_immediate_p (rtx x) + { + rtx elt; + +- /* GCC will never generate a multiply with an immediate of 2, so there is no +- point testing for it (even though it is a valid constant). */ + return (const_vec_duplicate_p (x, &elt) + && GET_CODE (elt) == CONST_DOUBLE +- && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)); ++ && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf) ++ || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2))); + } + + /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate +@@ -14607,6 +16461,77 @@ aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64, + return false; + } + ++/* Return true if X is an UNSPEC_PTRUE constant of the form: ++ ++ (const (unspec [PATTERN ZERO] UNSPEC_PTRUE)) ++ ++ where PATTERN is the svpattern as a CONST_INT and where ZERO ++ is a zero constant of the required PTRUE mode (which can have ++ fewer elements than X's mode, if zero bits are significant). ++ ++ If so, and if INFO is nonnull, describe the immediate in INFO. */ ++bool ++aarch64_sve_ptrue_svpattern_p (rtx x, struct simd_immediate_info *info) ++{ ++ if (GET_CODE (x) != CONST) ++ return false; ++ ++ x = XEXP (x, 0); ++ if (GET_CODE (x) != UNSPEC || XINT (x, 1) != UNSPEC_PTRUE) ++ return false; ++ ++ if (info) ++ { ++ aarch64_svpattern pattern ++ = (aarch64_svpattern) INTVAL (XVECEXP (x, 0, 0)); ++ machine_mode pred_mode = GET_MODE (XVECEXP (x, 0, 1)); ++ scalar_int_mode int_mode = aarch64_sve_element_int_mode (pred_mode); ++ *info = simd_immediate_info (int_mode, pattern); ++ } ++ return true; ++} ++ ++/* Return true if X is a valid SVE predicate. If INFO is nonnull, use ++ it to describe valid immediates. */ ++ ++static bool ++aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info) ++{ ++ if (aarch64_sve_ptrue_svpattern_p (x, info)) ++ return true; ++ ++ if (x == CONST0_RTX (GET_MODE (x))) ++ { ++ if (info) ++ *info = simd_immediate_info (DImode, 0); ++ return true; ++ } ++ ++ /* Analyze the value as a VNx16BImode. This should be relatively ++ efficient, since rtx_vector_builder has enough built-in capacity ++ to store all VLA predicate constants without needing the heap. 
*/ ++ rtx_vector_builder builder; ++ if (!aarch64_get_sve_pred_bits (builder, x)) ++ return false; ++ ++ unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder); ++ if (int vl = aarch64_partial_ptrue_length (builder, elt_size)) ++ { ++ machine_mode mode = aarch64_sve_pred_mode (elt_size).require (); ++ aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl); ++ if (pattern != AARCH64_NUM_SVPATTERNS) ++ { ++ if (info) ++ { ++ scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode); ++ *info = simd_immediate_info (int_mode, pattern); ++ } ++ return true; ++ } ++ } ++ return false; ++} ++ + /* Return true if OP is a valid SIMD immediate for the operation + described by WHICH. If INFO is nonnull, use it to describe valid + immediates. */ +@@ -14619,6 +16544,9 @@ aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info, + if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT)) + return false; + ++ if (vec_flags & VEC_SVE_PRED) ++ return aarch64_sve_pred_valid_immediate (op, info); ++ + scalar_mode elt_mode = GET_MODE_INNER (mode); + rtx base, step; + unsigned int n_elts; +@@ -14643,11 +16571,6 @@ aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info, + else + return false; + +- /* Handle PFALSE and PTRUE. */ +- if (vec_flags & VEC_SVE_PRED) +- return (op == CONST0_RTX (mode) +- || op == CONSTM1_RTX (mode)); +- + scalar_float_mode elt_float_mode; + if (n_elts == 1 + && is_a (elt_mode, &elt_float_mode)) +@@ -14731,11 +16654,14 @@ aarch64_check_zero_based_sve_index_immediate (rtx x) + bool + aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left) + { ++ x = unwrap_const_vec_duplicate (x); ++ if (!CONST_INT_P (x)) ++ return false; + int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT; + if (left) +- return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1); ++ return IN_RANGE (INTVAL (x), 0, bit_width - 1); + else +- return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width); ++ return IN_RANGE (INTVAL (x), 1, bit_width); + } + + /* Return the bitmask CONST_INT to select the bits required by a zero extract +@@ -14763,7 +16689,17 @@ aarch64_mov_operand_p (rtx x, machine_mode mode) + return true; + + if (VECTOR_MODE_P (GET_MODE (x))) +- return aarch64_simd_valid_immediate (x, NULL); ++ { ++ /* Require predicate constants to be VNx16BI before RA, so that we ++ force everything to have a canonical form. */ ++ if (!lra_in_progress ++ && !reload_completed ++ && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL ++ && GET_MODE (x) != VNx16BImode) ++ return false; ++ ++ return aarch64_simd_valid_immediate (x, NULL); ++ } + + if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x)) + return true; +@@ -14953,6 +16889,72 @@ aarch64_sve_ld1r_operand_p (rtx op) + && offset_6bit_unsigned_scaled_p (mode, addr.const_offset)); + } + ++/* Return true if OP is a valid MEM operand for an SVE LD1R{Q,O} instruction ++ where the size of the read data is specified by `mode` and the size of the ++ vector elements are specified by `elem_mode`. 
*/ ++bool ++aarch64_sve_ld1rq_ld1ro_operand_p (rtx op, machine_mode mode, ++ scalar_mode elem_mode) ++{ ++ struct aarch64_address_info addr; ++ if (!MEM_P (op) ++ || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false)) ++ return false; ++ ++ if (addr.type == ADDRESS_REG_IMM) ++ return offset_4bit_signed_scaled_p (mode, addr.const_offset); ++ ++ if (addr.type == ADDRESS_REG_REG) ++ return (1U << addr.shift) == GET_MODE_SIZE (elem_mode); ++ ++ return false; ++} ++ ++/* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */ ++bool ++aarch64_sve_ld1rq_operand_p (rtx op) ++{ ++ return aarch64_sve_ld1rq_ld1ro_operand_p (op, TImode, ++ GET_MODE_INNER (GET_MODE (op))); ++} ++ ++/* Return true if OP is a valid MEM operand for an SVE LD1RO instruction for ++ accessing a vector where the element size is specified by `elem_mode`. */ ++bool ++aarch64_sve_ld1ro_operand_p (rtx op, scalar_mode elem_mode) ++{ ++ return aarch64_sve_ld1rq_ld1ro_operand_p (op, OImode, elem_mode); ++} ++ ++/* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction. */ ++bool ++aarch64_sve_ldff1_operand_p (rtx op) ++{ ++ if (!MEM_P (op)) ++ return false; ++ ++ struct aarch64_address_info addr; ++ if (!aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op), false)) ++ return false; ++ ++ if (addr.type == ADDRESS_REG_IMM) ++ return known_eq (addr.const_offset, 0); ++ ++ return addr.type == ADDRESS_REG_REG; ++} ++ ++/* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction. */ ++bool ++aarch64_sve_ldnf1_operand_p (rtx op) ++{ ++ struct aarch64_address_info addr; ++ ++ return (MEM_P (op) ++ && aarch64_classify_address (&addr, XEXP (op, 0), ++ GET_MODE (op), false) ++ && addr.type == ADDRESS_REG_IMM); ++} ++ + /* Return true if OP is a valid MEM operand for an SVE LDR instruction. + The conditions for STR are the same. */ + bool +@@ -14966,6 +16968,21 @@ aarch64_sve_ldr_operand_p (rtx op) + && addr.type == ADDRESS_REG_IMM); + } + ++/* Return true if OP is a valid address for an SVE PRF[BHWD] instruction, ++ addressing memory of mode MODE. */ ++bool ++aarch64_sve_prefetch_operand_p (rtx op, machine_mode mode) ++{ ++ struct aarch64_address_info addr; ++ if (!aarch64_classify_address (&addr, op, mode, false)) ++ return false; ++ ++ if (addr.type == ADDRESS_REG_IMM) ++ return known_eq (addr.const_offset, 0); ++ ++ return addr.type == ADDRESS_REG_REG; ++} ++ + /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode. + We need to be able to access the individual pieces, so the range + is different from LD[234] and ST[234]. */ +@@ -15027,11 +17044,13 @@ aarch64_simd_attr_length_rglist (machine_mode mode) + static HOST_WIDE_INT + aarch64_simd_vector_alignment (const_tree type) + { ++ /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can ++ be set for non-predicate vectors of booleans. Modes are the most ++ direct way we have of identifying real SVE predicate types. */ ++ if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL) ++ return 16; + if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST) +- /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can +- be set for non-predicate vectors of booleans. Modes are the most +- direct way we have of identifying real SVE predicate types. */ +- return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 
16 : 128; ++ return 128; + return wi::umin (wi::to_wide (TYPE_SIZE (type)), 128).to_uhwi (); + } + +@@ -15361,34 +17380,383 @@ aarch64_expand_vector_init (rtx target, rtx vals) + (see aarch64_simd_valid_immediate). */ + for (int i = 0; i < n_elts; i++) + { +- rtx x = XVECEXP (vals, 0, i); +- if (CONST_INT_P (x) || CONST_DOUBLE_P (x)) +- continue; +- rtx subst = any_const; +- for (int bit = n_elts / 2; bit > 0; bit /= 2) +- { +- /* Look in the copied vector, as more elements are const. */ +- rtx test = XVECEXP (copy, 0, i ^ bit); +- if (CONST_INT_P (test) || CONST_DOUBLE_P (test)) +- { +- subst = test; +- break; +- } +- } +- XVECEXP (copy, 0, i) = subst; ++ rtx x = XVECEXP (vals, 0, i); ++ if (CONST_INT_P (x) || CONST_DOUBLE_P (x)) ++ continue; ++ rtx subst = any_const; ++ for (int bit = n_elts / 2; bit > 0; bit /= 2) ++ { ++ /* Look in the copied vector, as more elements are const. */ ++ rtx test = XVECEXP (copy, 0, i ^ bit); ++ if (CONST_INT_P (test) || CONST_DOUBLE_P (test)) ++ { ++ subst = test; ++ break; ++ } ++ } ++ XVECEXP (copy, 0, i) = subst; ++ } ++ aarch64_expand_vector_init (target, copy); ++ } ++ ++ /* Insert the variable lanes directly. */ ++ for (int i = 0; i < n_elts; i++) ++ { ++ rtx x = XVECEXP (vals, 0, i); ++ if (CONST_INT_P (x) || CONST_DOUBLE_P (x)) ++ continue; ++ x = copy_to_mode_reg (inner_mode, x); ++ emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i))); ++ } ++} ++ ++/* Emit RTL corresponding to: ++ insr TARGET, ELEM. */ ++ ++static void ++emit_insr (rtx target, rtx elem) ++{ ++ machine_mode mode = GET_MODE (target); ++ scalar_mode elem_mode = GET_MODE_INNER (mode); ++ elem = force_reg (elem_mode, elem); ++ ++ insn_code icode = optab_handler (vec_shl_insert_optab, mode); ++ gcc_assert (icode != CODE_FOR_nothing); ++ emit_insn (GEN_FCN (icode) (target, target, elem)); ++} ++ ++/* Subroutine of aarch64_sve_expand_vector_init for handling ++ trailing constants. ++ This function works as follows: ++ (a) Create a new vector consisting of trailing constants. ++ (b) Initialize TARGET with the constant vector using emit_move_insn. ++ (c) Insert remaining elements in TARGET using insr. ++ NELTS is the total number of elements in original vector while ++ while NELTS_REQD is the number of elements that are actually ++ significant. ++ ++ ??? The heuristic used is to do above only if number of constants ++ is at least half the total number of elements. May need fine tuning. */ ++ ++static bool ++aarch64_sve_expand_vector_init_handle_trailing_constants ++ (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd) ++{ ++ machine_mode mode = GET_MODE (target); ++ scalar_mode elem_mode = GET_MODE_INNER (mode); ++ int n_trailing_constants = 0; ++ ++ for (int i = nelts_reqd - 1; ++ i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i)); ++ i--) ++ n_trailing_constants++; ++ ++ if (n_trailing_constants >= nelts_reqd / 2) ++ { ++ rtx_vector_builder v (mode, 1, nelts); ++ for (int i = 0; i < nelts; i++) ++ v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants)); ++ rtx const_vec = v.build (); ++ emit_move_insn (target, const_vec); ++ ++ for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--) ++ emit_insr (target, builder.elt (i)); ++ ++ return true; ++ } ++ ++ return false; ++} ++ ++/* Subroutine of aarch64_sve_expand_vector_init. ++ Works as follows: ++ (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER. ++ (b) Skip trailing elements from BUILDER, which are the same as ++ element NELTS_REQD - 1. 
++ (c) Insert earlier elements in reverse order in TARGET using insr. */ ++ ++static void ++aarch64_sve_expand_vector_init_insert_elems (rtx target, ++ const rtx_vector_builder &builder, ++ int nelts_reqd) ++{ ++ machine_mode mode = GET_MODE (target); ++ scalar_mode elem_mode = GET_MODE_INNER (mode); ++ ++ struct expand_operand ops[2]; ++ enum insn_code icode = optab_handler (vec_duplicate_optab, mode); ++ gcc_assert (icode != CODE_FOR_nothing); ++ ++ create_output_operand (&ops[0], target, mode); ++ create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode); ++ expand_insn (icode, 2, ops); ++ ++ int ndups = builder.count_dups (nelts_reqd - 1, -1, -1); ++ for (int i = nelts_reqd - ndups - 1; i >= 0; i--) ++ emit_insr (target, builder.elt (i)); ++} ++ ++/* Subroutine of aarch64_sve_expand_vector_init to handle case ++ when all trailing elements of builder are same. ++ This works as follows: ++ (a) Use expand_insn interface to broadcast last vector element in TARGET. ++ (b) Insert remaining elements in TARGET using insr. ++ ++ ??? The heuristic used is to do above if number of same trailing elements ++ is at least 3/4 of total number of elements, loosely based on ++ heuristic from mostly_zeros_p. May need fine-tuning. */ ++ ++static bool ++aarch64_sve_expand_vector_init_handle_trailing_same_elem ++ (rtx target, const rtx_vector_builder &builder, int nelts_reqd) ++{ ++ int ndups = builder.count_dups (nelts_reqd - 1, -1, -1); ++ if (ndups >= (3 * nelts_reqd) / 4) ++ { ++ aarch64_sve_expand_vector_init_insert_elems (target, builder, ++ nelts_reqd - ndups + 1); ++ return true; ++ } ++ ++ return false; ++} ++ ++/* Initialize register TARGET from BUILDER. NELTS is the constant number ++ of elements in BUILDER. ++ ++ The function tries to initialize TARGET from BUILDER if it fits one ++ of the special cases outlined below. ++ ++ Failing that, the function divides BUILDER into two sub-vectors: ++ v_even = even elements of BUILDER; ++ v_odd = odd elements of BUILDER; ++ ++ and recursively calls itself with v_even and v_odd. ++ ++ if (recursive call succeeded for v_even or v_odd) ++ TARGET = zip (v_even, v_odd) ++ ++ The function returns true if it managed to build TARGET from BUILDER ++ with one of the special cases, false otherwise. ++ ++ Example: {a, 1, b, 2, c, 3, d, 4} ++ ++ The vector gets divided into: ++ v_even = {a, b, c, d} ++ v_odd = {1, 2, 3, 4} ++ ++ aarch64_sve_expand_vector_init(v_odd) hits case 1 and ++ initialize tmp2 from constant vector v_odd using emit_move_insn. ++ ++ aarch64_sve_expand_vector_init(v_even) fails since v_even contains ++ 4 elements, so we construct tmp1 from v_even using insr: ++ tmp1 = dup(d) ++ insr tmp1, c ++ insr tmp1, b ++ insr tmp1, a ++ ++ And finally: ++ TARGET = zip (tmp1, tmp2) ++ which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */ ++ ++static bool ++aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder, ++ int nelts, int nelts_reqd) ++{ ++ machine_mode mode = GET_MODE (target); ++ ++ /* Case 1: Vector contains trailing constants. */ ++ ++ if (aarch64_sve_expand_vector_init_handle_trailing_constants ++ (target, builder, nelts, nelts_reqd)) ++ return true; ++ ++ /* Case 2: Vector contains leading constants. 
*/ ++ ++ rtx_vector_builder rev_builder (mode, 1, nelts_reqd); ++ for (int i = 0; i < nelts_reqd; i++) ++ rev_builder.quick_push (builder.elt (nelts_reqd - i - 1)); ++ rev_builder.finalize (); ++ ++ if (aarch64_sve_expand_vector_init_handle_trailing_constants ++ (target, rev_builder, nelts, nelts_reqd)) ++ { ++ emit_insn (gen_aarch64_sve_rev (mode, target, target)); ++ return true; ++ } ++ ++ /* Case 3: Vector contains trailing same element. */ ++ ++ if (aarch64_sve_expand_vector_init_handle_trailing_same_elem ++ (target, builder, nelts_reqd)) ++ return true; ++ ++ /* Case 4: Vector contains leading same element. */ ++ ++ if (aarch64_sve_expand_vector_init_handle_trailing_same_elem ++ (target, rev_builder, nelts_reqd) && nelts_reqd == nelts) ++ { ++ emit_insn (gen_aarch64_sve_rev (mode, target, target)); ++ return true; ++ } ++ ++ /* Avoid recursing below 4-elements. ++ ??? The threshold 4 may need fine-tuning. */ ++ ++ if (nelts_reqd <= 4) ++ return false; ++ ++ rtx_vector_builder v_even (mode, 1, nelts); ++ rtx_vector_builder v_odd (mode, 1, nelts); ++ ++ for (int i = 0; i < nelts * 2; i += 2) ++ { ++ v_even.quick_push (builder.elt (i)); ++ v_odd.quick_push (builder.elt (i + 1)); ++ } ++ ++ v_even.finalize (); ++ v_odd.finalize (); ++ ++ rtx tmp1 = gen_reg_rtx (mode); ++ bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even, ++ nelts, nelts_reqd / 2); ++ ++ rtx tmp2 = gen_reg_rtx (mode); ++ bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd, ++ nelts, nelts_reqd / 2); ++ ++ if (!did_even_p && !did_odd_p) ++ return false; ++ ++ /* Initialize v_even and v_odd using INSR if it didn't match any of the ++ special cases and zip v_even, v_odd. */ ++ ++ if (!did_even_p) ++ aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2); ++ ++ if (!did_odd_p) ++ aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2); ++ ++ rtvec v = gen_rtvec (2, tmp1, tmp2); ++ emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1)); ++ return true; ++} ++ ++/* Initialize register TARGET from the elements in PARALLEL rtx VALS. */ ++ ++void ++aarch64_sve_expand_vector_init (rtx target, rtx vals) ++{ ++ machine_mode mode = GET_MODE (target); ++ int nelts = XVECLEN (vals, 0); ++ ++ rtx_vector_builder v (mode, 1, nelts); ++ for (int i = 0; i < nelts; i++) ++ v.quick_push (XVECEXP (vals, 0, i)); ++ v.finalize (); ++ ++ /* If neither sub-vectors of v could be initialized specially, ++ then use INSR to insert all elements from v into TARGET. ++ ??? This might not be optimal for vectors with large ++ initializers like 16-element or above. ++ For nelts < 4, it probably isn't useful to handle specially. */ ++ ++ if (nelts < 4 ++ || !aarch64_sve_expand_vector_init (target, v, nelts, nelts)) ++ aarch64_sve_expand_vector_init_insert_elems (target, v, nelts); ++} ++ ++/* Check whether VALUE is a vector constant in which every element ++ is either a power of 2 or a negated power of 2. If so, return ++ a constant vector of log2s, and flip CODE between PLUS and MINUS ++ if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */ ++ ++static rtx ++aarch64_convert_mult_to_shift (rtx value, rtx_code &code) ++{ ++ if (GET_CODE (value) != CONST_VECTOR) ++ return NULL_RTX; ++ ++ rtx_vector_builder builder; ++ if (!builder.new_unary_operation (GET_MODE (value), value, false)) ++ return NULL_RTX; ++ ++ scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value)); ++ /* 1 if the result of the multiplication must be negated, ++ 0 if it mustn't, or -1 if we don't yet care. 
*/ ++ int negate = -1; ++ unsigned int encoded_nelts = const_vector_encoded_nelts (value); ++ for (unsigned int i = 0; i < encoded_nelts; ++i) ++ { ++ rtx elt = CONST_VECTOR_ENCODED_ELT (value, i); ++ if (!CONST_SCALAR_INT_P (elt)) ++ return NULL_RTX; ++ rtx_mode_t val (elt, int_mode); ++ wide_int pow2 = wi::neg (val); ++ if (val != pow2) ++ { ++ /* It matters whether we negate or not. Make that choice, ++ and make sure that it's consistent with previous elements. */ ++ if (negate == !wi::neg_p (val)) ++ return NULL_RTX; ++ negate = wi::neg_p (val); ++ if (!negate) ++ pow2 = val; + } +- aarch64_expand_vector_init (target, copy); ++ /* POW2 is now the value that we want to be a power of 2. */ ++ int shift = wi::exact_log2 (pow2); ++ if (shift < 0) ++ return NULL_RTX; ++ builder.quick_push (gen_int_mode (shift, int_mode)); ++ } ++ if (negate == -1) ++ /* PLUS and MINUS are equivalent; canonicalize on PLUS. */ ++ code = PLUS; ++ else if (negate == 1) ++ code = code == PLUS ? MINUS : PLUS; ++ return builder.build (); ++} ++ ++/* Prepare for an integer SVE multiply-add or multiply-subtract pattern; ++ CODE is PLUS for the former and MINUS for the latter. OPERANDS is the ++ operands array, in the same order as for fma_optab. Return true if ++ the function emitted all the necessary instructions, false if the caller ++ should generate the pattern normally with the new OPERANDS array. */ ++ ++bool ++aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code) ++{ ++ machine_mode mode = GET_MODE (operands[0]); ++ if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code)) ++ { ++ rtx product = expand_binop (mode, vashl_optab, operands[1], shifts, ++ NULL_RTX, true, OPTAB_DIRECT); ++ force_expand_binop (mode, code == PLUS ? add_optab : sub_optab, ++ operands[3], product, operands[0], true, ++ OPTAB_DIRECT); ++ return true; + } ++ operands[2] = force_reg (mode, operands[2]); ++ return false; ++} + +- /* Insert the variable lanes directly. */ +- for (int i = 0; i < n_elts; i++) ++/* Likewise, but for a conditional pattern. 
*/ ++ ++bool ++aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code) ++{ ++ machine_mode mode = GET_MODE (operands[0]); ++ if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code)) + { +- rtx x = XVECEXP (vals, 0, i); +- if (CONST_INT_P (x) || CONST_DOUBLE_P (x)) +- continue; +- x = copy_to_mode_reg (inner_mode, x); +- emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i))); ++ rtx product = expand_binop (mode, vashl_optab, operands[2], shifts, ++ NULL_RTX, true, OPTAB_DIRECT); ++ emit_insn (gen_cond (code, mode, operands[0], operands[1], ++ operands[4], product, operands[5])); ++ return true; + } ++ operands[3] = force_reg (mode, operands[3]); ++ return false; + } + + static unsigned HOST_WIDE_INT +@@ -15428,11 +17796,15 @@ aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global) + static void + aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name) + { +- if (aarch64_simd_decl_p (decl)) ++ if (TREE_CODE (decl) == FUNCTION_DECL) + { +- fprintf (stream, "\t.variant_pcs\t"); +- assemble_name (stream, name); +- fprintf (stream, "\n"); ++ arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id (); ++ if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE) ++ { ++ fprintf (stream, "\t.variant_pcs\t"); ++ assemble_name (stream, name); ++ fprintf (stream, "\n"); ++ } + } + } + +@@ -15459,7 +17831,7 @@ aarch64_declare_function_name (FILE *stream, const char* name, + const struct processor *this_arch + = aarch64_get_arch (targ_options->x_explicit_arch); + +- unsigned long isa_flags = targ_options->x_aarch64_isa_flags; ++ uint64_t isa_flags = targ_options->x_aarch64_isa_flags; + std::string extension + = aarch64_get_extension_string_for_isa_flags (isa_flags, + this_arch->flags); +@@ -15541,6 +17913,18 @@ aarch64_asm_output_external (FILE *stream, tree decl, const char* name) + aarch64_asm_output_variant_pcs (stream, decl, name); + } + ++/* Triggered after a .cfi_startproc directive is emitted into the assembly file. ++ Used to output the .cfi_b_key_frame directive when signing the current ++ function with the B key. */ ++ ++void ++aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED) ++{ ++ if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled () ++ && aarch64_ra_sign_key == AARCH64_KEY_B) ++ asm_fprintf (f, "\t.cfi_b_key_frame\n"); ++} ++ + /* Implements TARGET_ASM_FILE_START. Output the assembly header. */ + + static void +@@ -15551,7 +17935,7 @@ aarch64_start_file (void) + + const struct processor *default_arch + = aarch64_get_arch (default_options->x_explicit_arch); +- unsigned long default_isa_flags = default_options->x_aarch64_isa_flags; ++ uint64_t default_isa_flags = default_options->x_aarch64_isa_flags; + std::string extension + = aarch64_get_extension_string_for_isa_flags (default_isa_flags, + default_arch->flags); +@@ -15570,16 +17954,26 @@ static void + aarch64_emit_load_exclusive (machine_mode mode, rtx rval, + rtx mem, rtx model_rtx) + { +- emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx)); ++ if (mode == TImode) ++ emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval), ++ gen_highpart (DImode, rval), ++ mem, model_rtx)); ++ else ++ emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx)); + } + + /* Emit store exclusive. 
*/ + + static void + aarch64_emit_store_exclusive (machine_mode mode, rtx bval, +- rtx rval, rtx mem, rtx model_rtx) ++ rtx mem, rtx rval, rtx model_rtx) + { +- emit_insn (gen_aarch64_store_exclusive (mode, bval, rval, mem, model_rtx)); ++ if (mode == TImode) ++ emit_insn (gen_aarch64_store_exclusive_pair ++ (bval, mem, operand_subword (rval, 0, 0, TImode), ++ operand_subword (rval, 1, 0, TImode), model_rtx)); ++ else ++ emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx)); + } + + /* Mark the previous jump instruction as unlikely. */ +@@ -15591,6 +17985,82 @@ aarch64_emit_unlikely_jump (rtx insn) + add_reg_br_prob_note (jump, profile_probability::very_unlikely ()); + } + ++/* We store the names of the various atomic helpers in a 5x4 array. ++ Return the libcall function given MODE, MODEL and NAMES. */ ++ ++rtx ++aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx, ++ const atomic_ool_names *names) ++{ ++ memmodel model = memmodel_base (INTVAL (model_rtx)); ++ int mode_idx, model_idx; ++ ++ switch (mode) ++ { ++ case E_QImode: ++ mode_idx = 0; ++ break; ++ case E_HImode: ++ mode_idx = 1; ++ break; ++ case E_SImode: ++ mode_idx = 2; ++ break; ++ case E_DImode: ++ mode_idx = 3; ++ break; ++ case E_TImode: ++ mode_idx = 4; ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ ++ switch (model) ++ { ++ case MEMMODEL_RELAXED: ++ model_idx = 0; ++ break; ++ case MEMMODEL_CONSUME: ++ case MEMMODEL_ACQUIRE: ++ model_idx = 1; ++ break; ++ case MEMMODEL_RELEASE: ++ model_idx = 2; ++ break; ++ case MEMMODEL_ACQ_REL: ++ case MEMMODEL_SEQ_CST: ++ model_idx = 3; ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ ++ return init_one_libfunc_visibility (names->str[mode_idx][model_idx], ++ VISIBILITY_HIDDEN); ++} ++ ++#define DEF0(B, N) \ ++ { "__aarch64_" #B #N "_relax", \ ++ "__aarch64_" #B #N "_acq", \ ++ "__aarch64_" #B #N "_rel", \ ++ "__aarch64_" #B #N "_acq_rel" } ++ ++#define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \ ++ { NULL, NULL, NULL, NULL } ++#define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16) ++ ++static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } }; ++const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } }; ++const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } }; ++const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } }; ++const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } }; ++const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } }; ++ ++#undef DEF0 ++#undef DEF4 ++#undef DEF5 ++ + /* Expand a compare and swap pattern. */ + + void +@@ -15637,6 +18107,17 @@ aarch64_expand_compare_and_swap (rtx operands[]) + newval, mod_s)); + cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode); + } ++ else if (TARGET_OUTLINE_ATOMICS) ++ { ++ /* Oldval must satisfy compare afterward. */ ++ if (!aarch64_plus_operand (oldval, mode)) ++ oldval = force_reg (mode, oldval); ++ rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names); ++ rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode, ++ oldval, mode, newval, mode, ++ XEXP (mem, 0), Pmode); ++ cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode); ++ } + else + { + /* The oldval predicate varies by mode. Test it and force to reg. */ +@@ -15682,13 +18163,11 @@ aarch64_split_compare_and_swap (rtx operands[]) + /* Split after prolog/epilog to avoid interactions with shrinkwrapping. 
*/ + gcc_assert (epilogue_completed); + +- rtx rval, mem, oldval, newval, scratch; ++ rtx rval, mem, oldval, newval, scratch, x, model_rtx; + machine_mode mode; + bool is_weak; + rtx_code_label *label1, *label2; +- rtx x, cond; + enum memmodel model; +- rtx model_rtx; + + rval = operands[0]; + mem = operands[1]; +@@ -15709,7 +18188,8 @@ aarch64_split_compare_and_swap (rtx operands[]) + CBNZ scratch, .label1 + .label2: + CMP rval, 0. */ +- bool strong_zero_p = !is_weak && oldval == const0_rtx; ++ bool strong_zero_p = (!is_weak && !aarch64_track_speculation && ++ oldval == const0_rtx && mode != TImode); + + label1 = NULL; + if (!is_weak) +@@ -15722,35 +18202,20 @@ aarch64_split_compare_and_swap (rtx operands[]) + /* The initial load can be relaxed for a __sync operation since a final + barrier will be emitted to stop code hoisting. */ + if (is_mm_sync (model)) +- aarch64_emit_load_exclusive (mode, rval, mem, +- GEN_INT (MEMMODEL_RELAXED)); ++ aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED)); + else + aarch64_emit_load_exclusive (mode, rval, mem, model_rtx); + + if (strong_zero_p) +- { +- if (aarch64_track_speculation) +- { +- /* Emit an explicit compare instruction, so that we can correctly +- track the condition codes. */ +- rtx cc_reg = aarch64_gen_compare_reg (NE, rval, const0_rtx); +- x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx); +- } +- else +- x = gen_rtx_NE (VOIDmode, rval, const0_rtx); +- +- x = gen_rtx_IF_THEN_ELSE (VOIDmode, x, +- gen_rtx_LABEL_REF (Pmode, label2), pc_rtx); +- aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x)); +- } ++ x = gen_rtx_NE (VOIDmode, rval, const0_rtx); + else + { +- cond = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode); +- x = gen_rtx_NE (VOIDmode, cond, const0_rtx); +- x = gen_rtx_IF_THEN_ELSE (VOIDmode, x, +- gen_rtx_LABEL_REF (Pmode, label2), pc_rtx); +- aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x)); ++ rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode); ++ x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx); + } ++ x = gen_rtx_IF_THEN_ELSE (VOIDmode, x, ++ gen_rtx_LABEL_REF (Pmode, label2), pc_rtx); ++ aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x)); + + aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx); + +@@ -15771,22 +18236,16 @@ aarch64_split_compare_and_swap (rtx operands[]) + aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x)); + } + else +- { +- cond = gen_rtx_REG (CCmode, CC_REGNUM); +- x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx); +- emit_insn (gen_rtx_SET (cond, x)); +- } ++ aarch64_gen_compare_reg (NE, scratch, const0_rtx); + + emit_label (label2); ++ + /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL + to set the condition flags. If this is not used it will be removed by + later passes. */ + if (strong_zero_p) +- { +- cond = gen_rtx_REG (CCmode, CC_REGNUM); +- x = gen_rtx_COMPARE (CCmode, rval, const0_rtx); +- emit_insn (gen_rtx_SET (cond, x)); +- } ++ aarch64_gen_compare_reg (NE, rval, const0_rtx); ++ + /* Emit any final barrier needed for a __sync operation. 
*/ + if (is_mm_sync (model)) + aarch64_emit_post_barrier (model); +@@ -15939,6 +18398,7 @@ aarch64_float_const_representable_p (rtx x) + REAL_VALUE_TYPE r, m; + bool fail; + ++ x = unwrap_const_vec_duplicate (x); + if (!CONST_DOUBLE_P (x)) + return false; + +@@ -16034,17 +18494,18 @@ aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width, + + if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT) + { +- gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV); ++ gcc_assert (info.insn == simd_immediate_info::MOV ++ && info.u.mov.shift == 0); + /* For FP zero change it to a CONST_INT 0 and use the integer SIMD + move immediate path. */ +- if (aarch64_float_const_zero_rtx_p (info.value)) +- info.value = GEN_INT (0); ++ if (aarch64_float_const_zero_rtx_p (info.u.mov.value)) ++ info.u.mov.value = GEN_INT (0); + else + { + const unsigned int buf_size = 20; + char float_buf[buf_size] = {'\0'}; + real_to_decimal_for_mode (float_buf, +- CONST_DOUBLE_REAL_VALUE (info.value), ++ CONST_DOUBLE_REAL_VALUE (info.u.mov.value), + buf_size, buf_size, 1, info.elt_mode); + + if (lane_count == 1) +@@ -16056,36 +18517,39 @@ aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width, + } + } + +- gcc_assert (CONST_INT_P (info.value)); ++ gcc_assert (CONST_INT_P (info.u.mov.value)); + + if (which == AARCH64_CHECK_MOV) + { + mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi"; +- shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl"; ++ shift_op = (info.u.mov.modifier == simd_immediate_info::MSL ++ ? "msl" : "lsl"); + if (lane_count == 1) + snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX, +- mnemonic, UINTVAL (info.value)); +- else if (info.shift) ++ mnemonic, UINTVAL (info.u.mov.value)); ++ else if (info.u.mov.shift) + snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " + HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count, +- element_char, UINTVAL (info.value), shift_op, info.shift); ++ element_char, UINTVAL (info.u.mov.value), shift_op, ++ info.u.mov.shift); + else + snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " + HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count, +- element_char, UINTVAL (info.value)); ++ element_char, UINTVAL (info.u.mov.value)); + } + else + { + /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */ + mnemonic = info.insn == simd_immediate_info::MVN ? 
"bic" : "orr"; +- if (info.shift) ++ if (info.u.mov.shift) + snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #" + HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count, +- element_char, UINTVAL (info.value), "lsl", info.shift); ++ element_char, UINTVAL (info.u.mov.value), "lsl", ++ info.u.mov.shift); + else + snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #" + HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count, +- element_char, UINTVAL (info.value)); ++ element_char, UINTVAL (info.u.mov.value)); + } + return templ; + } +@@ -16129,24 +18593,49 @@ aarch64_output_sve_mov_immediate (rtx const_vector) + + element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode)); + +- if (info.step) ++ machine_mode vec_mode = GET_MODE (const_vector); ++ if (aarch64_sve_pred_mode_p (vec_mode)) ++ { ++ static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")]; ++ if (info.insn == simd_immediate_info::MOV) ++ { ++ gcc_assert (info.u.mov.value == const0_rtx); ++ snprintf (buf, sizeof (buf), "pfalse\t%%0.b"); ++ } ++ else ++ { ++ gcc_assert (info.insn == simd_immediate_info::PTRUE); ++ unsigned int total_bytes; ++ if (info.u.pattern == AARCH64_SV_ALL ++ && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes)) ++ snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char, ++ total_bytes / GET_MODE_SIZE (info.elt_mode)); ++ else ++ snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char, ++ svpattern_token (info.u.pattern)); ++ } ++ return buf; ++ } ++ ++ if (info.insn == simd_immediate_info::INDEX) + { + snprintf (templ, sizeof (templ), "index\t%%0.%c, #" + HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC, +- element_char, INTVAL (info.value), INTVAL (info.step)); ++ element_char, INTVAL (info.u.index.base), ++ INTVAL (info.u.index.step)); + return templ; + } + + if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT) + { +- if (aarch64_float_const_zero_rtx_p (info.value)) +- info.value = GEN_INT (0); ++ if (aarch64_float_const_zero_rtx_p (info.u.mov.value)) ++ info.u.mov.value = GEN_INT (0); + else + { + const int buf_size = 20; + char float_buf[buf_size] = {}; + real_to_decimal_for_mode (float_buf, +- CONST_DOUBLE_REAL_VALUE (info.value), ++ CONST_DOUBLE_REAL_VALUE (info.u.mov.value), + buf_size, buf_size, 1, info.elt_mode); + + snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s", +@@ -16156,23 +18645,27 @@ aarch64_output_sve_mov_immediate (rtx const_vector) + } + + snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC, +- element_char, INTVAL (info.value)); ++ element_char, INTVAL (info.u.mov.value)); + return templ; + } + +-/* Return the asm format for a PTRUE instruction whose destination has +- mode MODE. SUFFIX is the element size suffix. */ ++/* Return the asm template for a PTRUES. CONST_UNSPEC is the ++ aarch64_sve_ptrue_svpattern_immediate that describes the predicate ++ pattern. 
*/ + + char * +-aarch64_output_ptrue (machine_mode mode, char suffix) ++aarch64_output_sve_ptrues (rtx const_unspec) + { +- unsigned int nunits; +- static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")]; +- if (GET_MODE_NUNITS (mode).is_constant (&nunits)) +- snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits); +- else +- snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix); +- return buf; ++ static char templ[40]; ++ ++ struct simd_immediate_info info; ++ bool is_valid = aarch64_simd_valid_immediate (const_unspec, &info); ++ gcc_assert (is_valid && info.insn == simd_immediate_info::PTRUE); ++ ++ char element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode)); ++ snprintf (templ, sizeof (templ), "ptrues\t%%0.%c, %s", element_char, ++ svpattern_token (info.u.pattern)); ++ return templ; + } + + /* Split operands into moves from op[1] + op[2] into op[0]. */ +@@ -16590,13 +19083,31 @@ aarch64_evpc_rev_local (struct expand_vec_perm_d *d) + if (d->testing_p) + return true; + +- rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec); + if (d->vec_flags == VEC_SVE_DATA) + { +- rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode)); +- src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src), +- UNSPEC_MERGE_PTRUE); ++ machine_mode int_mode = aarch64_sve_int_mode (pred_mode); ++ rtx target = gen_reg_rtx (int_mode); ++ if (BYTES_BIG_ENDIAN) ++ /* The act of taking a subreg between INT_MODE and d->vmode ++ is itself a reversing operation on big-endian targets; ++ see the comment at the head of aarch64-sve.md for details. ++ First reinterpret OP0 as INT_MODE without using a subreg ++ and without changing the contents. */ ++ emit_insn (gen_aarch64_sve_reinterpret (int_mode, target, d->op0)); ++ else ++ { ++ /* For SVE we use REV[BHW] unspecs derived from the element size ++ of v->mode and vector modes whose elements have SIZE bytes. ++ This ensures that the vector modes match the predicate modes. */ ++ int unspec = aarch64_sve_rev_unspec (d->vmode); ++ rtx pred = aarch64_ptrue_reg (pred_mode); ++ emit_insn (gen_aarch64_pred (unspec, int_mode, target, pred, ++ gen_lowpart (int_mode, d->op0))); ++ } ++ emit_move_insn (d->target, gen_lowpart (d->vmode, target)); ++ return true; + } ++ rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec); + emit_set_insn (d->target, src); + return true; + } +@@ -16609,7 +19120,7 @@ aarch64_evpc_rev_global (struct expand_vec_perm_d *d) + { + poly_uint64 nelt = d->perm.length (); + +- if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA) ++ if (!d->one_vector_p || d->vec_flags == VEC_ADVSIMD) + return false; + + if (!d->perm.series_p (0, 1, nelt - 1, -1)) +@@ -16722,6 +19233,50 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d) + return true; + } + ++/* Try to implement D using SVE SEL instruction. 
*/ ++ ++static bool ++aarch64_evpc_sel (struct expand_vec_perm_d *d) ++{ ++ machine_mode vmode = d->vmode; ++ int unit_size = GET_MODE_UNIT_SIZE (vmode); ++ ++ if (d->vec_flags != VEC_SVE_DATA ++ || unit_size > 8) ++ return false; ++ ++ int n_patterns = d->perm.encoding ().npatterns (); ++ poly_int64 vec_len = d->perm.length (); ++ ++ for (int i = 0; i < n_patterns; ++i) ++ if (!known_eq (d->perm[i], i) ++ && !known_eq (d->perm[i], vec_len + i)) ++ return false; ++ ++ for (int i = n_patterns; i < n_patterns * 2; i++) ++ if (!d->perm.series_p (i, n_patterns, i, n_patterns) ++ && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns)) ++ return false; ++ ++ if (d->testing_p) ++ return true; ++ ++ machine_mode pred_mode = aarch64_sve_pred_mode (unit_size).require (); ++ ++ rtx_vector_builder builder (pred_mode, n_patterns, 2); ++ for (int i = 0; i < n_patterns * 2; i++) ++ { ++ rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode) ++ : CONST0_RTX (BImode); ++ builder.quick_push (elem); ++ } ++ ++ rtx const_vec = builder.build (); ++ rtx pred = force_reg (pred_mode, const_vec); ++ emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op1, d->op0, pred)); ++ return true; ++} ++ + static bool + aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) + { +@@ -16754,6 +19309,8 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) + return true; + else if (aarch64_evpc_trn (d)) + return true; ++ else if (aarch64_evpc_sel (d)) ++ return true; + if (d->vec_flags == VEC_SVE_DATA) + return aarch64_evpc_sve_tbl (d); + else if (d->vec_flags == VEC_ADVSIMD) +@@ -16829,60 +19386,19 @@ aarch64_reverse_mask (machine_mode mode, unsigned int nunits) + return force_reg (V16QImode, mask); + } + +-/* Return true if X is a valid second operand for the SVE instruction +- that implements integer comparison OP_CODE. */ +- +-static bool +-aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x) +-{ +- if (register_operand (x, VOIDmode)) +- return true; +- +- switch (op_code) +- { +- case LTU: +- case LEU: +- case GEU: +- case GTU: +- return aarch64_sve_cmp_immediate_p (x, false); +- case LT: +- case LE: +- case GE: +- case GT: +- case NE: +- case EQ: +- return aarch64_sve_cmp_immediate_p (x, true); +- default: +- gcc_unreachable (); +- } +-} +- +-/* Use predicated SVE instructions to implement the equivalent of: +- +- (set TARGET OP) +- +- given that PTRUE is an all-true predicate of the appropriate mode. */ +- +-static void +-aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op) +-{ +- rtx unspec = gen_rtx_UNSPEC (GET_MODE (target), +- gen_rtvec (2, ptrue, op), +- UNSPEC_MERGE_PTRUE); +- rtx_insn *insn = emit_set_insn (target, unspec); +- set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op)); +-} ++/* Expand an SVE integer comparison using the SVE equivalent of: + +-/* Likewise, but also clobber the condition codes. */ ++ (set TARGET (CODE OP0 OP1)). 
*/ + +-static void +-aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op) ++void ++aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1) + { +- rtx unspec = gen_rtx_UNSPEC (GET_MODE (target), +- gen_rtvec (2, ptrue, op), +- UNSPEC_MERGE_PTRUE); +- rtx_insn *insn = emit_insn (gen_set_clobber_cc (target, unspec)); +- set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op)); ++ machine_mode pred_mode = GET_MODE (target); ++ machine_mode data_mode = GET_MODE (op0); ++ rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode, ++ op0, op1); ++ if (!rtx_equal_p (target, res)) ++ emit_move_insn (target, res); + } + + /* Return the UNSPEC_COND_* code for comparison CODE. */ +@@ -16893,17 +19409,19 @@ aarch64_unspec_cond_code (rtx_code code) + switch (code) + { + case NE: +- return UNSPEC_COND_NE; ++ return UNSPEC_COND_FCMNE; + case EQ: +- return UNSPEC_COND_EQ; ++ return UNSPEC_COND_FCMEQ; + case LT: +- return UNSPEC_COND_LT; ++ return UNSPEC_COND_FCMLT; + case GT: +- return UNSPEC_COND_GT; ++ return UNSPEC_COND_FCMGT; + case LE: +- return UNSPEC_COND_LE; ++ return UNSPEC_COND_FCMLE; + case GE: +- return UNSPEC_COND_GE; ++ return UNSPEC_COND_FCMGE; ++ case UNORDERED: ++ return UNSPEC_COND_FCMUO; + default: + gcc_unreachable (); + } +@@ -16911,78 +19429,58 @@ aarch64_unspec_cond_code (rtx_code code) + + /* Emit: + +- (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_)) ++ (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_)) + +- where is the operation associated with comparison CODE. This form +- of instruction is used when (and (CODE OP0 OP1) PRED) would have different +- semantics, such as when PRED might not be all-true and when comparing +- inactive lanes could have side effects. */ ++ where is the operation associated with comparison CODE. ++ KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */ + + static void +-aarch64_emit_sve_predicated_cond (rtx target, rtx_code code, +- rtx pred, rtx op0, rtx op1) ++aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred, ++ bool known_ptrue_p, rtx op0, rtx op1) + { ++ rtx flag = gen_int_mode (known_ptrue_p, SImode); + rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred), +- gen_rtvec (3, pred, op0, op1), ++ gen_rtvec (4, pred, flag, op0, op1), + aarch64_unspec_cond_code (code)); + emit_set_insn (target, unspec); + } + +-/* Expand an SVE integer comparison using the SVE equivalent of: +- +- (set TARGET (CODE OP0 OP1)). */ +- +-void +-aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1) +-{ +- machine_mode pred_mode = GET_MODE (target); +- machine_mode data_mode = GET_MODE (op0); +- +- if (!aarch64_sve_cmp_operand_p (code, op1)) +- op1 = force_reg (data_mode, op1); +- +- rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode)); +- rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1); +- aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond); +-} +- + /* Emit the SVE equivalent of: + +- (set TMP1 (CODE1 OP0 OP1)) +- (set TMP2 (CODE2 OP0 OP1)) ++ (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_)) ++ (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_)) + (set TARGET (ior:PRED_MODE TMP1 TMP2)) + +- PTRUE is an all-true predicate with the same mode as TARGET. */ ++ where is the operation associated with comparison CODEi. ++ KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. 
*/ + + static void +-aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2, +- rtx ptrue, rtx op0, rtx op1) ++aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2, ++ rtx pred, bool known_ptrue_p, rtx op0, rtx op1) + { +- machine_mode pred_mode = GET_MODE (ptrue); ++ machine_mode pred_mode = GET_MODE (pred); + rtx tmp1 = gen_reg_rtx (pred_mode); +- aarch64_emit_sve_ptrue_op (tmp1, ptrue, +- gen_rtx_fmt_ee (code1, pred_mode, op0, op1)); ++ aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1); + rtx tmp2 = gen_reg_rtx (pred_mode); +- aarch64_emit_sve_ptrue_op (tmp2, ptrue, +- gen_rtx_fmt_ee (code2, pred_mode, op0, op1)); ++ aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1); + aarch64_emit_binop (target, ior_optab, tmp1, tmp2); + } + + /* Emit the SVE equivalent of: + +- (set TMP (CODE OP0 OP1)) ++ (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_)) + (set TARGET (not TMP)) + +- PTRUE is an all-true predicate with the same mode as TARGET. */ ++ where is the operation associated with comparison CODE. ++ KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */ + + static void +-aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code, +- rtx op0, rtx op1) ++aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred, ++ bool known_ptrue_p, rtx op0, rtx op1) + { +- machine_mode pred_mode = GET_MODE (ptrue); ++ machine_mode pred_mode = GET_MODE (pred); + rtx tmp = gen_reg_rtx (pred_mode); +- aarch64_emit_sve_ptrue_op (tmp, ptrue, +- gen_rtx_fmt_ee (code, pred_mode, op0, op1)); ++ aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1); + aarch64_emit_unop (target, one_cmpl_optab, tmp); + } + +@@ -17000,7 +19498,7 @@ aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code, + machine_mode pred_mode = GET_MODE (target); + machine_mode data_mode = GET_MODE (op0); + +- rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode)); ++ rtx ptrue = aarch64_ptrue_reg (pred_mode); + switch (code) + { + case UNORDERED: +@@ -17015,14 +19513,13 @@ aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code, + case NE: + { + /* There is native support for the comparison. */ +- rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1); +- aarch64_emit_sve_ptrue_op (target, ptrue, cond); ++ aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1); + return false; + } + + case LTGT: + /* This is a trapping operation (LT or GT). */ +- aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1); ++ aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1); + return false; + + case UNEQ: +@@ -17030,7 +19527,8 @@ aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code, + { + /* This would trap for signaling NaNs. */ + op1 = force_reg (data_mode, op1); +- aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1); ++ aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ, ++ ptrue, true, op0, op1); + return false; + } + /* fall through */ +@@ -17043,7 +19541,8 @@ aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code, + /* Work out which elements are ordered. */ + rtx ordered = gen_reg_rtx (pred_mode); + op1 = force_reg (data_mode, op1); +- aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1); ++ aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED, ++ ptrue, true, op0, op1); + + /* Test the opposite condition for the ordered elements, + then invert the result. 
*/ +@@ -17053,13 +19552,12 @@ aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code, + code = reverse_condition_maybe_unordered (code); + if (can_invert_p) + { +- aarch64_emit_sve_predicated_cond (target, code, +- ordered, op0, op1); ++ aarch64_emit_sve_fp_cond (target, code, ++ ordered, false, op0, op1); + return true; + } +- rtx tmp = gen_reg_rtx (pred_mode); +- aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1); +- aarch64_emit_unop (target, one_cmpl_optab, tmp); ++ aarch64_emit_sve_invert_fp_cond (target, code, ++ ordered, false, op0, op1); + return false; + } + break; +@@ -17077,11 +19575,10 @@ aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code, + code = reverse_condition_maybe_unordered (code); + if (can_invert_p) + { +- rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1); +- aarch64_emit_sve_ptrue_op (target, ptrue, cond); ++ aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1); + return true; + } +- aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1); ++ aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1); + return false; + } + +@@ -17104,6 +19601,13 @@ aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode, + else + aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]); + ++ if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode)) ++ ops[1] = force_reg (data_mode, ops[1]); ++ /* The "false" value can only be zero if the "true" value is a constant. */ ++ if (register_operand (ops[1], data_mode) ++ || !aarch64_simd_reg_or_zero (ops[2], data_mode)) ++ ops[2] = force_reg (data_mode, ops[2]); ++ + rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]); + emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL)); + } +@@ -17181,11 +19685,11 @@ aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst, + *dst = aarch64_progress_pointer (*dst); + } + +-/* Expand movmem, as if from a __builtin_memcpy. Return true if ++/* Expand cpymem, as if from a __builtin_memcpy. Return true if + we succeed, otherwise return false. */ + + bool +-aarch64_expand_movmem (rtx *operands) ++aarch64_expand_cpymem (rtx *operands) + { + int n, mode_bits; + rtx dst = operands[0]; +@@ -17452,7 +19956,10 @@ aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1, + static unsigned HOST_WIDE_INT + aarch64_asan_shadow_offset (void) + { +- return (HOST_WIDE_INT_1 << 36); ++ if (TARGET_ILP32) ++ return (HOST_WIDE_INT_1 << 29); ++ else ++ return (HOST_WIDE_INT_1 << 36); + } + + static rtx +@@ -17758,10 +20265,6 @@ aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr) + } + } + +- if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC) +- && aarch_crypto_can_dual_issue (prev, curr)) +- return true; +- + if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH) + && any_condjump_p (curr)) + { +@@ -18545,6 +21048,29 @@ aarch64_fpconst_pow_of_2 (rtx x) + return exact_log2 (real_to_integer (r)); + } + ++/* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a ++ power of 2 (i.e 1/2^n) return the number of float bits. e.g. for x==(1/2^n) ++ return n. Otherwise return -1. 
*/ ++ ++int ++aarch64_fpconst_pow2_recip (rtx x) ++{ ++ REAL_VALUE_TYPE r0; ++ ++ if (!CONST_DOUBLE_P (x)) ++ return -1; ++ ++ r0 = *CONST_DOUBLE_REAL_VALUE (x); ++ if (exact_real_inverse (DFmode, &r0) ++ && !REAL_VALUE_NEGATIVE (r0)) ++ { ++ int ret = exact_log2 (real_to_integer (&r0)); ++ if (ret >= 1 && ret <= 32) ++ return ret; ++ } ++ return -1; ++} ++ + /* If X is a vector of equal CONST_DOUBLE values and that value is + Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */ + +@@ -18765,12 +21291,8 @@ aarch64_select_early_remat_modes (sbitmap modes) + /* SVE values are not normally live across a call, so it should be + worth doing early rematerialization even in VL-specific mode. */ + for (int i = 0; i < NUM_MACHINE_MODES; ++i) +- { +- machine_mode mode = (machine_mode) i; +- unsigned int vec_flags = aarch64_classify_vector_mode (mode); +- if (vec_flags & VEC_ANY_SVE) +- bitmap_set_bit (modes, i); +- } ++ if (aarch64_sve_mode_p ((machine_mode) i)) ++ bitmap_set_bit (modes, i); + } + + /* Override the default target speculation_safe_value. */ +@@ -18994,6 +21516,55 @@ aarch64_stack_protect_guard (void) + return NULL_TREE; + } + ++/* Return the diagnostic message string if conversion from FROMTYPE to ++ TOTYPE is not allowed, NULL otherwise. */ ++ ++static const char * ++aarch64_invalid_conversion (const_tree fromtype, const_tree totype) ++{ ++ if (element_mode (fromtype) != element_mode (totype)) ++ { ++ /* Do no allow conversions to/from BFmode scalar types. */ ++ if (TYPE_MODE (fromtype) == BFmode) ++ return N_("invalid conversion from type %"); ++ if (TYPE_MODE (totype) == BFmode) ++ return N_("invalid conversion to type %"); ++ } ++ ++ /* Conversion allowed. */ ++ return NULL; ++} ++ ++/* Return the diagnostic message string if the unary operation OP is ++ not permitted on TYPE, NULL otherwise. */ ++ ++static const char * ++aarch64_invalid_unary_op (int op, const_tree type) ++{ ++ /* Reject all single-operand operations on BFmode except for &. */ ++ if (element_mode (type) == BFmode && op != ADDR_EXPR) ++ return N_("operation not permitted on type %"); ++ ++ /* Operation allowed. */ ++ return NULL; ++} ++ ++/* Return the diagnostic message string if the binary operation OP is ++ not permitted on TYPE1 and TYPE2, NULL otherwise. */ ++ ++static const char * ++aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1, ++ const_tree type2) ++{ ++ /* Reject all 2-operand operations on BFmode. */ ++ if (element_mode (type1) == BFmode ++ || element_mode (type2) == BFmode) ++ return N_("operation not permitted on type %"); ++ ++ /* Operation allowed. */ ++ return NULL; ++} ++ + /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE + section at the end if needed. 
*/ + #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000 +@@ -19137,7 +21708,7 @@ aarch64_run_selftests (void) + #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list + + #undef TARGET_CALLEE_COPIES +-#define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false ++#define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false + + #undef TARGET_CAN_ELIMINATE + #define TARGET_CAN_ELIMINATE aarch64_can_eliminate +@@ -19247,6 +21818,15 @@ aarch64_libgcc_floating_mode_supported_p + #undef TARGET_MANGLE_TYPE + #define TARGET_MANGLE_TYPE aarch64_mangle_type + ++#undef TARGET_INVALID_CONVERSION ++#define TARGET_INVALID_CONVERSION aarch64_invalid_conversion ++ ++#undef TARGET_INVALID_UNARY_OP ++#define TARGET_INVALID_UNARY_OP aarch64_invalid_unary_op ++ ++#undef TARGET_INVALID_BINARY_OP ++#define TARGET_INVALID_BINARY_OP aarch64_invalid_binary_op ++ + #undef TARGET_MEMORY_MOVE_COST + #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost + +@@ -19370,6 +21950,9 @@ aarch64_libgcc_floating_mode_supported_p + #undef TARGET_VECTOR_MODE_SUPPORTED_P + #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p + ++#undef TARGET_COMPATIBLE_VECTOR_TYPES_P ++#define TARGET_COMPATIBLE_VECTOR_TYPES_P aarch64_compatible_vector_types_p ++ + #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT + #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \ + aarch64_builtin_support_vector_misalignment +@@ -19517,13 +22100,8 @@ aarch64_libgcc_floating_mode_supported_p + #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \ + aarch64_hard_regno_call_part_clobbered + +-#undef TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS +-#define TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS \ +- aarch64_remove_extra_call_preserved_regs +- +-#undef TARGET_RETURN_CALL_WITH_MAX_CLOBBERS +-#define TARGET_RETURN_CALL_WITH_MAX_CLOBBERS \ +- aarch64_return_call_with_max_clobbers ++#undef TARGET_INSN_CALLEE_ABI ++#define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi + + #undef TARGET_CONSTANT_ALIGNMENT + #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment +@@ -19566,11 +22144,20 @@ aarch64_libgcc_floating_mode_supported_p + #undef TARGET_GET_MULTILIB_ABI_NAME + #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name + ++#undef TARGET_FNTYPE_ABI ++#define TARGET_FNTYPE_ABI aarch64_fntype_abi ++ + #if CHECKING_P + #undef TARGET_RUN_TARGET_SELFTESTS + #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests + #endif /* #if CHECKING_P */ + ++#undef TARGET_ASM_POST_CFI_STARTPROC ++#define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc ++ ++#undef TARGET_STRICT_ARGUMENT_NAMING ++#define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true ++ + struct gcc_target targetm = TARGET_INITIALIZER; + + #include "gt-aarch64.h" +diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h +index 772a97296..d5341656f 100644 +--- a/gcc/config/aarch64/aarch64.h ++++ b/gcc/config/aarch64/aarch64.h +@@ -192,6 +192,31 @@ extern unsigned aarch64_architecture_version; + /* Execution and Data Prediction Restriction instructions supported. */ + #define AARCH64_FL_PREDRES (1 << 27) + ++/* SVE2 instruction supported. */ ++#define AARCH64_FL_SVE2 (1 << 28) ++#define AARCH64_FL_SVE2_AES (1 << 29) ++#define AARCH64_FL_SVE2_SM4 (1 << 30) ++#define AARCH64_FL_SVE2_SHA3 (1ULL << 31) ++#define AARCH64_FL_SVE2_BITPERM (1ULL << 32) ++ ++/* Transactional Memory Extension. */ ++#define AARCH64_FL_TME (1ULL << 33) /* Has TME instructions. */ ++ ++/* Armv8.6-A architecture extensions. 
*/ ++#define AARCH64_FL_V8_6 (1ULL << 34) ++ ++/* 8-bit Integer Matrix Multiply (I8MM) extensions. */ ++#define AARCH64_FL_I8MM (1ULL << 35) ++ ++/* Brain half-precision floating-point (BFloat16) Extension. */ ++#define AARCH64_FL_BF16 (1ULL << 36) ++ ++/* 32-bit Floating-point Matrix Multiply (F32MM) extensions. */ ++#define AARCH64_FL_F32MM (1ULL << 37) ++ ++/* 64-bit Floating-point Matrix Multiply (F64MM) extensions. */ ++#define AARCH64_FL_F64MM (1ULL << 38) ++ + /* Has FP and SIMD. */ + #define AARCH64_FL_FPSIMD (AARCH64_FL_FP | AARCH64_FL_SIMD) + +@@ -213,6 +238,9 @@ extern unsigned aarch64_architecture_version; + #define AARCH64_FL_FOR_ARCH8_5 \ + (AARCH64_FL_FOR_ARCH8_4 | AARCH64_FL_V8_5 \ + | AARCH64_FL_SB | AARCH64_FL_SSBS | AARCH64_FL_PREDRES) ++#define AARCH64_FL_FOR_ARCH8_6 \ ++ (AARCH64_FL_FOR_ARCH8_5 | AARCH64_FL_V8_6 | AARCH64_FL_FPSIMD \ ++ | AARCH64_FL_I8MM | AARCH64_FL_BF16) + + /* Macros to test ISA flags. */ + +@@ -225,6 +253,7 @@ extern unsigned aarch64_architecture_version; + #define AARCH64_ISA_V8_2 (aarch64_isa_flags & AARCH64_FL_V8_2) + #define AARCH64_ISA_F16 (aarch64_isa_flags & AARCH64_FL_F16) + #define AARCH64_ISA_SVE (aarch64_isa_flags & AARCH64_FL_SVE) ++#define AARCH64_ISA_SVE2 (aarch64_isa_flags & AARCH64_FL_SVE2) + #define AARCH64_ISA_V8_3 (aarch64_isa_flags & AARCH64_FL_V8_3) + #define AARCH64_ISA_DOTPROD (aarch64_isa_flags & AARCH64_FL_DOTPROD) + #define AARCH64_ISA_AES (aarch64_isa_flags & AARCH64_FL_AES) +@@ -234,7 +263,14 @@ extern unsigned aarch64_architecture_version; + #define AARCH64_ISA_SHA3 (aarch64_isa_flags & AARCH64_FL_SHA3) + #define AARCH64_ISA_F16FML (aarch64_isa_flags & AARCH64_FL_F16FML) + #define AARCH64_ISA_RCPC8_4 (aarch64_isa_flags & AARCH64_FL_RCPC8_4) ++#define AARCH64_ISA_RNG (aarch64_isa_flags & AARCH64_FL_RNG) + #define AARCH64_ISA_V8_5 (aarch64_isa_flags & AARCH64_FL_V8_5) ++#define AARCH64_ISA_TME (aarch64_isa_flags & AARCH64_FL_TME) ++#define AARCH64_ISA_V8_6 (aarch64_isa_flags & AARCH64_FL_V8_6) ++#define AARCH64_ISA_I8MM (aarch64_isa_flags & AARCH64_FL_I8MM) ++#define AARCH64_ISA_F32MM (aarch64_isa_flags & AARCH64_FL_F32MM) ++#define AARCH64_ISA_F64MM (aarch64_isa_flags & AARCH64_FL_F64MM) ++#define AARCH64_ISA_BF16 (aarch64_isa_flags & AARCH64_FL_BF16) + + /* Crypto is an optional extension to AdvSIMD. */ + #define TARGET_CRYPTO (TARGET_SIMD && AARCH64_ISA_CRYPTO) +@@ -270,12 +306,44 @@ extern unsigned aarch64_architecture_version; + /* SVE instructions, enabled through +sve. */ + #define TARGET_SVE (AARCH64_ISA_SVE) + ++/* SVE2 instructions, enabled through +sve2. */ ++#define TARGET_SVE2 (AARCH64_ISA_SVE2) ++ + /* ARMv8.3-A features. */ + #define TARGET_ARMV8_3 (AARCH64_ISA_V8_3) + ++/* Javascript conversion instruction from Armv8.3-a. */ ++#define TARGET_JSCVT (TARGET_FLOAT && AARCH64_ISA_V8_3) ++ + /* Armv8.3-a Complex number extension to AdvSIMD extensions. */ + #define TARGET_COMPLEX (TARGET_SIMD && TARGET_ARMV8_3) + ++/* Floating-point rounding instructions from Armv8.5-a. */ ++#define TARGET_FRINT (AARCH64_ISA_V8_5 && TARGET_FLOAT) ++ ++/* TME instructions are enabled. */ ++#define TARGET_TME (AARCH64_ISA_TME) ++ ++/* Random number instructions from Armv8.5-a. */ ++#define TARGET_RNG (AARCH64_ISA_RNG) ++ ++/* I8MM instructions are enabled through +i8mm. */ ++#define TARGET_I8MM (AARCH64_ISA_I8MM) ++#define TARGET_SVE_I8MM (TARGET_SVE && AARCH64_ISA_I8MM) ++ ++/* F32MM instructions are enabled through +f32mm. 
*/ ++#define TARGET_F32MM (AARCH64_ISA_F32MM) ++#define TARGET_SVE_F32MM (TARGET_SVE && AARCH64_ISA_F32MM) ++ ++/* F64MM instructions are enabled through +f64mm. */ ++#define TARGET_F64MM (AARCH64_ISA_F64MM) ++#define TARGET_SVE_F64MM (TARGET_SVE && AARCH64_ISA_F64MM) ++ ++/* BF16 instructions are enabled through +bf16. */ ++#define TARGET_BF16_FP (AARCH64_ISA_BF16) ++#define TARGET_BF16_SIMD (AARCH64_ISA_BF16 && TARGET_SIMD) ++#define TARGET_SVE_BF16 (TARGET_SVE && AARCH64_ISA_BF16) ++ + /* Make sure this is always defined so we don't have to check for ifdefs + but rather use normal ifs. */ + #ifndef TARGET_FIX_ERR_A53_835769_DEFAULT +@@ -338,6 +406,9 @@ extern unsigned aarch64_architecture_version; + P0-P7 Predicate low registers: valid in all predicate contexts + P8-P15 Predicate high registers: used as scratch space + ++ FFR First Fault Register, a fixed-use SVE predicate register ++ FFRT FFR token: a fake register used for modelling dependencies ++ + VG Pseudo "vector granules" register + + VG is the number of 64-bit elements in an SVE vector. We define +@@ -358,6 +429,7 @@ extern unsigned aarch64_architecture_version; + 1, 1, 1, 1, /* SFP, AP, CC, VG */ \ + 0, 0, 0, 0, 0, 0, 0, 0, /* P0 - P7 */ \ + 0, 0, 0, 0, 0, 0, 0, 0, /* P8 - P15 */ \ ++ 1, 1 /* FFR and FFRT */ \ + } + + /* X30 is marked as caller-saved which is in line with regular function call +@@ -380,6 +452,7 @@ extern unsigned aarch64_architecture_version; + 1, 1, 1, 1, /* SFP, AP, CC, VG */ \ + 1, 1, 1, 1, 1, 1, 1, 1, /* P0 - P7 */ \ + 1, 1, 1, 1, 1, 1, 1, 1, /* P8 - P15 */ \ ++ 1, 1 /* FFR and FFRT */ \ + } + + #define REGISTER_NAMES \ +@@ -395,6 +468,7 @@ extern unsigned aarch64_architecture_version; + "sfp", "ap", "cc", "vg", \ + "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", \ + "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", \ ++ "ffr", "ffrt" \ + } + + /* Generate the register aliases for core register N */ +@@ -443,11 +517,12 @@ extern unsigned aarch64_architecture_version; + #define FRAME_POINTER_REGNUM SFP_REGNUM + #define STACK_POINTER_REGNUM SP_REGNUM + #define ARG_POINTER_REGNUM AP_REGNUM +-#define FIRST_PSEUDO_REGISTER (P15_REGNUM + 1) ++#define FIRST_PSEUDO_REGISTER (FFRT_REGNUM + 1) + +-/* The number of (integer) argument register available. */ ++/* The number of argument registers available for each class. */ + #define NUM_ARG_REGS 8 + #define NUM_FP_ARG_REGS 8 ++#define NUM_PR_ARG_REGS 4 + + /* A Homogeneous Floating-Point or Short-Vector Aggregate may have at most + four members. */ +@@ -514,6 +589,9 @@ extern unsigned aarch64_architecture_version; + #define ASM_OUTPUT_EXTERNAL(STR, DECL, NAME) \ + aarch64_asm_output_external (STR, DECL, NAME) + ++/* Output assembly strings after .cfi_startproc is emitted. */ ++#define ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc ++ + /* For EH returns X4 contains the stack adjustment. 
*/ + #define EH_RETURN_STACKADJ_RTX gen_rtx_REG (Pmode, R4_REGNUM) + #define EH_RETURN_HANDLER_RTX aarch64_eh_return_handler_rtx () +@@ -542,6 +620,9 @@ extern unsigned aarch64_architecture_version; + #define FP_LO_REGNUM_P(REGNO) \ + (((unsigned) (REGNO - V0_REGNUM)) <= (V15_REGNUM - V0_REGNUM)) + ++#define FP_LO8_REGNUM_P(REGNO) \ ++ (((unsigned) (REGNO - V0_REGNUM)) <= (V7_REGNUM - V0_REGNUM)) ++ + #define PR_REGNUM_P(REGNO)\ + (((unsigned) (REGNO - P0_REGNUM)) <= (P15_REGNUM - P0_REGNUM)) + +@@ -560,12 +641,15 @@ enum reg_class + GENERAL_REGS, + STACK_REG, + POINTER_REGS, ++ FP_LO8_REGS, + FP_LO_REGS, + FP_REGS, + POINTER_AND_FP_REGS, + PR_LO_REGS, + PR_HI_REGS, + PR_REGS, ++ FFR_REGS, ++ PR_AND_FFR_REGS, + ALL_REGS, + LIM_REG_CLASSES /* Last */ + }; +@@ -579,12 +663,15 @@ enum reg_class + "GENERAL_REGS", \ + "STACK_REG", \ + "POINTER_REGS", \ ++ "FP_LO8_REGS", \ + "FP_LO_REGS", \ + "FP_REGS", \ + "POINTER_AND_FP_REGS", \ + "PR_LO_REGS", \ + "PR_HI_REGS", \ + "PR_REGS", \ ++ "FFR_REGS", \ ++ "PR_AND_FFR_REGS", \ + "ALL_REGS" \ + } + +@@ -595,12 +682,15 @@ enum reg_class + { 0x7fffffff, 0x00000000, 0x00000003 }, /* GENERAL_REGS */ \ + { 0x80000000, 0x00000000, 0x00000000 }, /* STACK_REG */ \ + { 0xffffffff, 0x00000000, 0x00000003 }, /* POINTER_REGS */ \ ++ { 0x00000000, 0x000000ff, 0x00000000 }, /* FP_LO8_REGS */ \ + { 0x00000000, 0x0000ffff, 0x00000000 }, /* FP_LO_REGS */ \ + { 0x00000000, 0xffffffff, 0x00000000 }, /* FP_REGS */ \ + { 0xffffffff, 0xffffffff, 0x00000003 }, /* POINTER_AND_FP_REGS */\ + { 0x00000000, 0x00000000, 0x00000ff0 }, /* PR_LO_REGS */ \ + { 0x00000000, 0x00000000, 0x000ff000 }, /* PR_HI_REGS */ \ + { 0x00000000, 0x00000000, 0x000ffff0 }, /* PR_REGS */ \ ++ { 0x00000000, 0x00000000, 0x00300000 }, /* FFR_REGS */ \ ++ { 0x00000000, 0x00000000, 0x003ffff0 }, /* PR_AND_FFR_REGS */ \ + { 0xffffffff, 0xffffffff, 0x000fffff } /* ALL_REGS */ \ + } + +@@ -676,7 +766,7 @@ extern enum aarch64_processor aarch64_tune; + #ifdef HAVE_POLY_INT_H + struct GTY (()) aarch64_frame + { +- HOST_WIDE_INT reg_offset[FIRST_PSEUDO_REGISTER]; ++ poly_int64 reg_offset[LAST_SAVED_REGNUM + 1]; + + /* The number of extra stack bytes taken up by register varargs. + This area is allocated by the callee at the very top of the +@@ -684,9 +774,12 @@ struct GTY (()) aarch64_frame + STACK_BOUNDARY. */ + HOST_WIDE_INT saved_varargs_size; + +- /* The size of the saved callee-save int/FP registers. */ ++ /* The size of the callee-save registers with a slot in REG_OFFSET. */ ++ poly_int64 saved_regs_size; + +- HOST_WIDE_INT saved_regs_size; ++ /* The size of the callee-save registers with a slot in REG_OFFSET that ++ are saved below the hard frame pointer. */ ++ poly_int64 below_hard_fp_saved_regs_size; + + /* Offset from the base of the frame (incomming SP) to the + top of the locals area. This value is always a multiple of +@@ -714,6 +807,10 @@ struct GTY (()) aarch64_frame + It may be non-zero if no push is used (ie. callee_adjust == 0). */ + poly_int64 callee_offset; + ++ /* The size of the stack adjustment before saving or after restoring ++ SVE registers. */ ++ poly_int64 sve_callee_adjust; ++ + /* The size of the stack adjustment after saving callee-saves. */ + poly_int64 final_adjust; + +@@ -723,6 +820,11 @@ struct GTY (()) aarch64_frame + unsigned wb_candidate1; + unsigned wb_candidate2; + ++ /* Big-endian SVE frames need a spare predicate register in order ++ to save vector registers in the correct layout for unwinding. ++ This is the register they should use. 
*/ ++ unsigned spare_pred_reg; ++ + bool laid_out; + }; + +@@ -751,6 +853,10 @@ enum aarch64_abi_type + enum arm_pcs + { + ARM_PCS_AAPCS64, /* Base standard AAPCS for 64 bit. */ ++ ARM_PCS_SIMD, /* For aarch64_vector_pcs functions. */ ++ ARM_PCS_SVE, /* For functions that pass or return ++ values in SVE registers. */ ++ ARM_PCS_TLSDESC, /* For targets of tlsdesc calls. */ + ARM_PCS_UNKNOWN + }; + +@@ -777,6 +883,8 @@ typedef struct + int aapcs_nextncrn; /* Next next core register number. */ + int aapcs_nvrn; /* Next Vector register number. */ + int aapcs_nextnvrn; /* Next Next Vector register number. */ ++ int aapcs_nprn; /* Next Predicate register number. */ ++ int aapcs_nextnprn; /* Next Next Predicate register number. */ + rtx aapcs_reg; /* Register assigned to this argument. This + is NULL_RTX if this parameter goes on + the stack. */ +@@ -787,6 +895,8 @@ typedef struct + aapcs_reg == NULL_RTX. */ + int aapcs_stack_size; /* The total size (in words, per 8 byte) of the + stack arg area so far. */ ++ bool silent_p; /* True if we should act silently, rather than ++ raise an error for invalid calls. */ + } CUMULATIVE_ARGS; + #endif + +@@ -842,7 +952,7 @@ typedef struct + /* MOVE_RATIO dictates when we will use the move_by_pieces infrastructure. + move_by_pieces will continually copy the largest safe chunks. So a + 7-byte copy is a 4-byte + 2-byte + byte copy. This proves inefficient +- for both size and speed of copy, so we will instead use the "movmem" ++ for both size and speed of copy, so we will instead use the "cpymem" + standard name to implement the copy. This logic does not apply when + targeting -mstrict-align, so keep a sensible default in that case. */ + #define MOVE_RATIO(speed) \ +@@ -1025,13 +1135,13 @@ extern enum aarch64_code_model aarch64_cmodel; + #define AARCH64_VALID_SIMD_DREG_MODE(MODE) \ + ((MODE) == V2SImode || (MODE) == V4HImode || (MODE) == V8QImode \ + || (MODE) == V2SFmode || (MODE) == V4HFmode || (MODE) == DImode \ +- || (MODE) == DFmode) ++ || (MODE) == DFmode || (MODE) == V4BFmode) + + /* Modes valid for AdvSIMD Q registers. */ + #define AARCH64_VALID_SIMD_QREG_MODE(MODE) \ + ((MODE) == V4SImode || (MODE) == V8HImode || (MODE) == V16QImode \ + || (MODE) == V4SFmode || (MODE) == V8HFmode || (MODE) == V2DImode \ +- || (MODE) == V2DFmode) ++ || (MODE) == V2DFmode || (MODE) == V8BFmode) + + #define ENDIAN_LANE_N(NUNITS, N) \ + (BYTES_BIG_ENDIAN ? NUNITS - 1 - N : N) +@@ -1079,6 +1189,11 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); + extern tree aarch64_fp16_type_node; + extern tree aarch64_fp16_ptr_type_node; + ++/* This type is the user-visible __bf16, and a pointer to that type. Defined ++ in aarch64-builtins.c. */ ++extern tree aarch64_bf16_type_node; ++extern tree aarch64_bf16_ptr_type_node; ++ + /* The generic unwind code in libgcc does not initialize the frame pointer. + So in order to unwind a function using a frame pointer, the very first + function that is unwound must save the frame pointer. That way the frame +@@ -1094,7 +1209,8 @@ extern poly_uint16 aarch64_sve_vg; + #define BITS_PER_SVE_VECTOR (poly_uint16 (aarch64_sve_vg * 64)) + #define BYTES_PER_SVE_VECTOR (poly_uint16 (aarch64_sve_vg * 8)) + +-/* The number of bytes in an SVE predicate. */ ++/* The number of bits and bytes in an SVE predicate. */ ++#define BITS_PER_SVE_PRED BYTES_PER_SVE_VECTOR + #define BYTES_PER_SVE_PRED aarch64_sve_vg + + /* The SVE mode for a vector of bytes. 
*/ +diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md +index 73c34a227..34cccc7cd 100644 +--- a/gcc/config/aarch64/aarch64.md ++++ b/gcc/config/aarch64/aarch64.md +@@ -85,7 +85,6 @@ + (V29_REGNUM 61) + (V30_REGNUM 62) + (V31_REGNUM 63) +- (LAST_SAVED_REGNUM 63) + (SFP_REGNUM 64) + (AP_REGNUM 65) + (CC_REGNUM 66) +@@ -107,6 +106,11 @@ + (P13_REGNUM 81) + (P14_REGNUM 82) + (P15_REGNUM 83) ++ (LAST_SAVED_REGNUM 83) ++ (FFR_REGNUM 84) ++ ;; "FFR token": a fake register used for representing the scheduling ++ ;; restrictions on FFR-related operations. ++ (FFRT_REGNUM 85) + ;; Scratch register used by stack clash protection to calculate + ;; SVE CFA offsets during probing. + (STACK_CLASH_SVE_CFA_REGNUM 11) +@@ -120,13 +124,17 @@ + ;; Scratch registers used in frame layout. + (IP0_REGNUM 16) + (IP1_REGNUM 17) ++ (FP_REGNUM 29) + (LR_REGNUM 30) + ] + ) + + (define_c_enum "unspec" [ +- UNSPEC_AUTI1716 +- UNSPEC_AUTISP ++ UNSPEC_AUTIA1716 ++ UNSPEC_AUTIB1716 ++ UNSPEC_AUTIASP ++ UNSPEC_AUTIBSP ++ UNSPEC_CALLEE_ABI + UNSPEC_CASESI + UNSPEC_CRC32B + UNSPEC_CRC32CB +@@ -138,6 +146,11 @@ + UNSPEC_CRC32X + UNSPEC_FCVTZS + UNSPEC_FCVTZU ++ UNSPEC_FJCVTZS ++ UNSPEC_FRINT32Z ++ UNSPEC_FRINT32X ++ UNSPEC_FRINT64Z ++ UNSPEC_FRINT64X + UNSPEC_URECPE + UNSPEC_FRECPE + UNSPEC_FRECPS +@@ -169,8 +182,10 @@ + UNSPEC_LD4_LANE + UNSPEC_MB + UNSPEC_NOP +- UNSPEC_PACI1716 +- UNSPEC_PACISP ++ UNSPEC_PACIA1716 ++ UNSPEC_PACIB1716 ++ UNSPEC_PACIASP ++ UNSPEC_PACIBSP + UNSPEC_PRLG_STK + UNSPEC_REV + UNSPEC_RBIT +@@ -211,26 +226,49 @@ + UNSPEC_XPACLRI + UNSPEC_LD1_SVE + UNSPEC_ST1_SVE ++ UNSPEC_LDNT1_SVE ++ UNSPEC_STNT1_SVE + UNSPEC_LD1RQ + UNSPEC_LD1_GATHER ++ UNSPEC_LDFF1_GATHER + UNSPEC_ST1_SCATTER +- UNSPEC_MERGE_PTRUE +- UNSPEC_PTEST_PTRUE ++ UNSPEC_PRED_X ++ UNSPEC_PRED_Z ++ UNSPEC_PTEST ++ UNSPEC_PTRUE + UNSPEC_UNPACKSHI + UNSPEC_UNPACKUHI + UNSPEC_UNPACKSLO + UNSPEC_UNPACKULO + UNSPEC_PACK +- UNSPEC_FLOAT_CONVERT +- UNSPEC_WHILE_LO ++ UNSPEC_WHILELE ++ UNSPEC_WHILELO ++ UNSPEC_WHILELS ++ UNSPEC_WHILELT + UNSPEC_LDN + UNSPEC_STN + UNSPEC_INSR ++ UNSPEC_CLASTA + UNSPEC_CLASTB + UNSPEC_FADDA + UNSPEC_REV_SUBREG ++ UNSPEC_REINTERPRET + UNSPEC_SPECULATION_TRACKER + UNSPEC_COPYSIGN ++ UNSPEC_TTEST ; Represent transaction test. ++ UNSPEC_UPDATE_FFR ++ UNSPEC_UPDATE_FFRT ++ UNSPEC_RDFFR ++ UNSPEC_WRFFR ++ ;; Represents an SVE-style lane index, in which the indexing applies ++ ;; within the containing 128-bit block. ++ UNSPEC_SVE_LANE_SELECT ++ UNSPEC_SVE_CNT_PAT ++ UNSPEC_SVE_PREFETCH ++ UNSPEC_SVE_PREFETCH_GATHER ++ UNSPEC_SVE_COMPACT ++ UNSPEC_SVE_SPLICE ++ UNSPEC_LD1RO + ]) + + (define_c_enum "unspecv" [ +@@ -246,9 +284,35 @@ + UNSPECV_BTI_C ; Represent BTI c. + UNSPECV_BTI_J ; Represent BTI j. + UNSPECV_BTI_JC ; Represent BTI jc. ++ UNSPECV_TSTART ; Represent transaction start. ++ UNSPECV_TCOMMIT ; Represent transaction commit. ++ UNSPECV_TCANCEL ; Represent transaction cancel. ++ UNSPEC_RNDR ; Represent RNDR ++ UNSPEC_RNDRRS ; Represent RNDRRS + ] + ) + ++;; These constants are used as a const_int in various SVE unspecs ++;; to indicate whether the governing predicate is known to be a PTRUE. ++(define_constants ++ [; Indicates that the predicate might not be a PTRUE. ++ (SVE_MAYBE_NOT_PTRUE 0) ++ ++ ; Indicates that the predicate is known to be a PTRUE. 
++ (SVE_KNOWN_PTRUE 1)]) ++ ++;; These constants are used as a const_int in predicated SVE FP arithmetic ++;; to indicate whether the operation is allowed to make additional lanes ++;; active without worrying about the effect on faulting behavior. ++(define_constants ++ [; Indicates either that all lanes are active or that the instruction may ++ ; operate on inactive inputs even if doing so could induce a fault. ++ (SVE_RELAXED_GP 0) ++ ++ ; Indicates that some lanes might be inactive and that the instruction ++ ; must not operate on inactive inputs if doing so could induce a fault. ++ (SVE_STRICT_GP 1)]) ++ + ;; If further include files are added the defintion of MD_INCLUDES + ;; must be updated. + +@@ -383,8 +447,8 @@ + + (define_expand "cbranch4" + [(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator" +- [(match_operand:GPI 1 "register_operand" "") +- (match_operand:GPI 2 "aarch64_plus_operand" "")]) ++ [(match_operand:GPI 1 "register_operand") ++ (match_operand:GPI 2 "aarch64_plus_operand")]) + (label_ref (match_operand 3 "" "")) + (pc)))] + "" +@@ -397,8 +461,8 @@ + + (define_expand "cbranch4" + [(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator" +- [(match_operand:GPF 1 "register_operand" "") +- (match_operand:GPF 2 "aarch64_fp_compare_operand" "")]) ++ [(match_operand:GPF 1 "register_operand") ++ (match_operand:GPF 2 "aarch64_fp_compare_operand")]) + (label_ref (match_operand 3 "" "")) + (pc)))] + "" +@@ -412,7 +476,7 @@ + (define_expand "cbranchcc4" + [(set (pc) (if_then_else + (match_operator 0 "aarch64_comparison_operator" +- [(match_operand 1 "cc_register" "") ++ [(match_operand 1 "cc_register") + (match_operand 2 "const0_operand")]) + (label_ref (match_operand 3 "" "")) + (pc)))] +@@ -475,9 +539,9 @@ + ;; csneg x0, x0, x1, mi + + (define_expand "mod3" +- [(match_operand:GPI 0 "register_operand" "") +- (match_operand:GPI 1 "register_operand" "") +- (match_operand:GPI 2 "const_int_operand" "")] ++ [(match_operand:GPI 0 "register_operand") ++ (match_operand:GPI 1 "register_operand") ++ (match_operand:GPI 2 "const_int_operand")] + "" + { + HOST_WIDE_INT val = INTVAL (operands[2]); +@@ -530,10 +594,14 @@ + (pc)))] + "" + { ++ /* GCC's traditional style has been to use "beq" instead of "b.eq", etc., ++ but the "." is required for SVE conditions. */ ++ bool use_dot_p = GET_MODE (operands[1]) == CC_NZCmode; + if (get_attr_length (insn) == 8) +- return aarch64_gen_far_branch (operands, 2, "Lbcond", "b%M0\\t"); ++ return aarch64_gen_far_branch (operands, 2, "Lbcond", ++ use_dot_p ? "b.%M0\\t" : "b%M0\\t"); + else +- return "b%m0\\t%l2"; ++ return use_dot_p ? 
"b.%m0\\t%l2" : "b%m0\\t%l2"; + } + [(set_attr "type" "branch") + (set (attr "length") +@@ -558,14 +626,14 @@ + ;; sub x0, x1, #(CST & 0xfff000) + ;; subs x0, x0, #(CST & 0x000fff) + ;; b .Label +-(define_insn_and_split "*compare_condjump" ++(define_insn_and_split "*compare_condjump" + [(set (pc) (if_then_else (EQL + (match_operand:GPI 0 "register_operand" "r") + (match_operand:GPI 1 "aarch64_imm24" "n")) + (label_ref:P (match_operand 2 "" "")) + (pc)))] +- "!aarch64_move_imm (INTVAL (operands[1]), mode) +- && !aarch64_plus_operand (operands[1], mode) ++ "!aarch64_move_imm (INTVAL (operands[1]), mode) ++ && !aarch64_plus_operand (operands[1], mode) + && !reload_completed" + "#" + "&& true" +@@ -573,20 +641,21 @@ + { + HOST_WIDE_INT lo_imm = UINTVAL (operands[1]) & 0xfff; + HOST_WIDE_INT hi_imm = UINTVAL (operands[1]) & 0xfff000; +- rtx tmp = gen_reg_rtx (mode); +- emit_insn (gen_add3 (tmp, operands[0], GEN_INT (-hi_imm))); +- emit_insn (gen_add3_compare0 (tmp, tmp, GEN_INT (-lo_imm))); ++ rtx tmp = gen_reg_rtx (mode); ++ emit_insn (gen_add3 (tmp, operands[0], GEN_INT (-hi_imm))); ++ emit_insn (gen_add3_compare0 (tmp, tmp, GEN_INT (-lo_imm))); + rtx cc_reg = gen_rtx_REG (CC_NZmode, CC_REGNUM); +- rtx cmp_rtx = gen_rtx_fmt_ee (, mode, cc_reg, const0_rtx); ++ rtx cmp_rtx = gen_rtx_fmt_ee (, mode, ++ cc_reg, const0_rtx); + emit_jump_insn (gen_condjump (cmp_rtx, cc_reg, operands[2])); + DONE; + } + ) + + (define_expand "casesi" +- [(match_operand:SI 0 "register_operand" "") ; Index +- (match_operand:SI 1 "const_int_operand" "") ; Lower bound +- (match_operand:SI 2 "const_int_operand" "") ; Total range ++ [(match_operand:SI 0 "register_operand") ; Index ++ (match_operand:SI 1 "const_int_operand") ; Lower bound ++ (match_operand:SI 2 "const_int_operand") ; Total range + (match_operand:DI 3 "" "") ; Table label + (match_operand:DI 4 "" "")] ; Out of range label + "" +@@ -739,8 +808,12 @@ + if (aarch64_return_address_signing_enabled () + && TARGET_ARMV8_3 + && !crtl->calls_eh_return) +- return "retaa"; +- ++ { ++ if (aarch64_ra_sign_key == AARCH64_KEY_B) ++ return "retab"; ++ else ++ return "retaa"; ++ } + return "ret"; + } + [(set_attr "type" "branch")] +@@ -754,7 +827,7 @@ + + (define_insn "simple_return" + [(simple_return)] +- "aarch64_use_simple_return_insn_p ()" ++ "" + "ret" + [(set_attr "type" "branch")] + ) +@@ -868,14 +941,15 @@ + ;; ------------------------------------------------------------------- + + (define_expand "call" +- [(parallel [(call (match_operand 0 "memory_operand" "") +- (match_operand 1 "general_operand" "")) +- (use (match_operand 2 "" "")) +- (clobber (reg:DI LR_REGNUM))])] ++ [(parallel ++ [(call (match_operand 0 "memory_operand") ++ (match_operand 1 "general_operand")) ++ (unspec:DI [(match_operand 2 "const_int_operand")] UNSPEC_CALLEE_ABI) ++ (clobber (reg:DI LR_REGNUM))])] + "" + " + { +- aarch64_expand_call (NULL_RTX, operands[0], false); ++ aarch64_expand_call (NULL_RTX, operands[0], operands[2], false); + DONE; + }" + ) +@@ -883,6 +957,7 @@ + (define_insn "*call_insn" + [(call (mem:DI (match_operand:DI 0 "aarch64_call_insn_operand" "r, Usf")) + (match_operand 1 "" "")) ++ (unspec:DI [(match_operand:DI 2 "const_int_operand")] UNSPEC_CALLEE_ABI) + (clobber (reg:DI LR_REGNUM))] + "" + "@ +@@ -892,15 +967,16 @@ + ) + + (define_expand "call_value" +- [(parallel [(set (match_operand 0 "" "") +- (call (match_operand 1 "memory_operand" "") +- (match_operand 2 "general_operand" ""))) +- (use (match_operand 3 "" "")) +- (clobber (reg:DI LR_REGNUM))])] ++ [(parallel ++ [(set 
(match_operand 0 "") ++ (call (match_operand 1 "memory_operand") ++ (match_operand 2 "general_operand"))) ++ (unspec:DI [(match_operand 3 "const_int_operand")] UNSPEC_CALLEE_ABI) ++ (clobber (reg:DI LR_REGNUM))])] + "" + " + { +- aarch64_expand_call (operands[0], operands[1], false); ++ aarch64_expand_call (operands[0], operands[1], operands[3], false); + DONE; + }" + ) +@@ -909,6 +985,7 @@ + [(set (match_operand 0 "" "") + (call (mem:DI (match_operand:DI 1 "aarch64_call_insn_operand" "r, Usf")) + (match_operand 2 "" ""))) ++ (unspec:DI [(match_operand:DI 3 "const_int_operand")] UNSPEC_CALLEE_ABI) + (clobber (reg:DI LR_REGNUM))] + "" + "@ +@@ -918,33 +995,36 @@ + ) + + (define_expand "sibcall" +- [(parallel [(call (match_operand 0 "memory_operand" "") +- (match_operand 1 "general_operand" "")) +- (return) +- (use (match_operand 2 "" ""))])] ++ [(parallel ++ [(call (match_operand 0 "memory_operand") ++ (match_operand 1 "general_operand")) ++ (unspec:DI [(match_operand 2 "const_int_operand")] UNSPEC_CALLEE_ABI) ++ (return)])] + "" + { +- aarch64_expand_call (NULL_RTX, operands[0], true); ++ aarch64_expand_call (NULL_RTX, operands[0], operands[2], true); + DONE; + } + ) + + (define_expand "sibcall_value" +- [(parallel [(set (match_operand 0 "" "") +- (call (match_operand 1 "memory_operand" "") +- (match_operand 2 "general_operand" ""))) +- (return) +- (use (match_operand 3 "" ""))])] ++ [(parallel ++ [(set (match_operand 0 "") ++ (call (match_operand 1 "memory_operand") ++ (match_operand 2 "general_operand"))) ++ (unspec:DI [(match_operand 3 "const_int_operand")] UNSPEC_CALLEE_ABI) ++ (return)])] + "" + { +- aarch64_expand_call (operands[0], operands[1], true); ++ aarch64_expand_call (operands[0], operands[1], operands[3], true); + DONE; + } + ) + + (define_insn "*sibcall_insn" + [(call (mem:DI (match_operand:DI 0 "aarch64_call_insn_operand" "Ucs, Usf")) +- (match_operand 1 "" "")) ++ (match_operand 1 "")) ++ (unspec:DI [(match_operand:DI 2 "const_int_operand")] UNSPEC_CALLEE_ABI) + (return)] + "SIBLING_CALL_P (insn)" + "@ +@@ -954,10 +1034,11 @@ + ) + + (define_insn "*sibcall_value_insn" +- [(set (match_operand 0 "" "") ++ [(set (match_operand 0 "") + (call (mem:DI + (match_operand:DI 1 "aarch64_call_insn_operand" "Ucs, Usf")) +- (match_operand 2 "" ""))) ++ (match_operand 2 ""))) ++ (unspec:DI [(match_operand:DI 3 "const_int_operand")] UNSPEC_CALLEE_ABI) + (return)] + "SIBLING_CALL_P (insn)" + "@ +@@ -977,7 +1058,9 @@ + { + int i; + +- emit_call_insn (gen_call (operands[0], const0_rtx, NULL)); ++ /* Untyped calls always use the default ABI. It's only possible to use ++ ABI variants if we know the type of the target function. 
*/ ++ emit_call_insn (gen_call (operands[0], const0_rtx, const0_rtx)); + + for (i = 0; i < XVECLEN (operands[2], 0); i++) + { +@@ -998,8 +1081,8 @@ + ;; ------------------------------------------------------------------- + + (define_expand "mov" +- [(set (match_operand:SHORT 0 "nonimmediate_operand" "") +- (match_operand:SHORT 1 "general_operand" ""))] ++ [(set (match_operand:SHORT 0 "nonimmediate_operand") ++ (match_operand:SHORT 1 "general_operand"))] + "" + " + if (GET_CODE (operands[0]) == MEM && operands[1] != const0_rtx) +@@ -1055,8 +1138,8 @@ + ) + + (define_expand "mov" +- [(set (match_operand:GPI 0 "nonimmediate_operand" "") +- (match_operand:GPI 1 "general_operand" ""))] ++ [(set (match_operand:GPI 0 "nonimmediate_operand") ++ (match_operand:GPI 1 "general_operand"))] + "" + " + if (MEM_P (operands[0]) && !MEM_VOLATILE_P (operands[0]) +@@ -1162,8 +1245,8 @@ + ) + + (define_expand "movti" +- [(set (match_operand:TI 0 "nonimmediate_operand" "") +- (match_operand:TI 1 "general_operand" ""))] ++ [(set (match_operand:TI 0 "nonimmediate_operand") ++ (match_operand:TI 1 "general_operand"))] + "" + " + if (GET_CODE (operands[0]) == MEM && operands[1] != const0_rtx) +@@ -1217,8 +1300,8 @@ + }) + + (define_expand "mov" +- [(set (match_operand:GPF_TF_F16 0 "nonimmediate_operand" "") +- (match_operand:GPF_TF_F16 1 "general_operand" ""))] ++ [(set (match_operand:GPF_TF_F16_MOV 0 "nonimmediate_operand") ++ (match_operand:GPF_TF_F16_MOV 1 "general_operand"))] + "" + { + if (!TARGET_FLOAT) +@@ -1234,11 +1317,11 @@ + } + ) + +-(define_insn "*movhf_aarch64" +- [(set (match_operand:HF 0 "nonimmediate_operand" "=w,w , w,?r,w,w ,w ,w,m,r,m ,r") +- (match_operand:HF 1 "general_operand" "Y ,?rY,?r, w,w,Ufc,Uvi,m,w,m,rY,r"))] +- "TARGET_FLOAT && (register_operand (operands[0], HFmode) +- || aarch64_reg_or_fp_zero (operands[1], HFmode))" ++(define_insn "*mov_aarch64" ++ [(set (match_operand:HFBF 0 "nonimmediate_operand" "=w,w , w,?r,w,w ,w ,w,m,r,m ,r") ++ (match_operand:HFBF 1 "general_operand" "Y ,?rY,?r, w,w,Ufc,Uvi,m,w,m,rY,r"))] ++ "TARGET_FLOAT && (register_operand (operands[0], mode) ++ || aarch64_reg_or_fp_zero (operands[1], mode))" + "@ + movi\\t%0.4h, #0 + fmov\\t%h0, %w1 +@@ -1363,17 +1446,17 @@ + + ;; 0 is dst + ;; 1 is src +-;; 2 is size of move in bytes ++;; 2 is size of copy in bytes + ;; 3 is alignment + +-(define_expand "movmemdi" ++(define_expand "cpymemdi" + [(match_operand:BLK 0 "memory_operand") + (match_operand:BLK 1 "memory_operand") + (match_operand:DI 2 "immediate_operand") + (match_operand:DI 3 "immediate_operand")] + "!STRICT_ALIGNMENT" + { +- if (aarch64_expand_movmem (operands)) ++ if (aarch64_expand_cpymem (operands)) + DONE; + FAIL; + } +@@ -1492,8 +1575,8 @@ + (mem:GPI (plus:P (match_dup 1) + (match_operand:P 5 "const_int_operand" "n"))))])] + "INTVAL (operands[5]) == GET_MODE_SIZE (mode)" +- "ldp\\t%2, %3, [%1], %4" +- [(set_attr "type" "load_")] ++ "ldp\\t%2, %3, [%1], %4" ++ [(set_attr "type" "load_")] + ) + + (define_insn "loadwb_pair_" +@@ -1507,7 +1590,7 @@ + (mem:GPF (plus:P (match_dup 1) + (match_operand:P 5 "const_int_operand" "n"))))])] + "INTVAL (operands[5]) == GET_MODE_SIZE (mode)" +- "ldp\\t%2, %3, [%1], %4" ++ "ldp\\t%2, %3, [%1], %4" + [(set_attr "type" "neon_load1_2reg")] + ) + +@@ -1540,8 +1623,8 @@ + (match_operand:P 5 "const_int_operand" "n"))) + (match_operand:GPI 3 "register_operand" "r"))])] + "INTVAL (operands[5]) == INTVAL (operands[4]) + GET_MODE_SIZE (mode)" +- "stp\\t%2, %3, [%0, %4]!" +- [(set_attr "type" "store_")] ++ "stp\\t%2, %3, [%0, %4]!" 
++ [(set_attr "type" "store_")] + ) + + (define_insn "storewb_pair_" +@@ -1556,7 +1639,7 @@ + (match_operand:P 5 "const_int_operand" "n"))) + (match_operand:GPF 3 "register_operand" "w"))])] + "INTVAL (operands[5]) == INTVAL (operands[4]) + GET_MODE_SIZE (mode)" +- "stp\\t%2, %3, [%0, %4]!" ++ "stp\\t%2, %3, [%0, %4]!" + [(set_attr "type" "neon_store1_2reg")] + ) + +@@ -1702,9 +1785,9 @@ + + (define_expand "add3" + [(set +- (match_operand:GPI 0 "register_operand" "") +- (plus:GPI (match_operand:GPI 1 "register_operand" "") +- (match_operand:GPI 2 "aarch64_pluslong_or_poly_operand" "")))] ++ (match_operand:GPI 0 "register_operand") ++ (plus:GPI (match_operand:GPI 1 "register_operand") ++ (match_operand:GPI 2 "aarch64_pluslong_or_poly_operand")))] + "" + { + /* If operands[1] is a subreg extract the inner RTX. */ +@@ -1713,6 +1796,7 @@ + /* If the constant is too large for a single instruction and isn't frame + based, split off the immediate so it is available for CSE. */ + if (!aarch64_plus_immediate (operands[2], mode) ++ && !(TARGET_SVE && aarch64_sve_plus_immediate (operands[2], mode)) + && can_create_pseudo_p () + && (!REG_P (op1) + || !REGNO_PTR_FRAME_P (REGNO (op1)))) +@@ -1730,10 +1814,10 @@ + + (define_insn "*add3_aarch64" + [(set +- (match_operand:GPI 0 "register_operand" "=rk,rk,w,rk,r,rk") ++ (match_operand:GPI 0 "register_operand" "=rk,rk,w,rk,r,r,rk") + (plus:GPI +- (match_operand:GPI 1 "register_operand" "%rk,rk,w,rk,rk,rk") +- (match_operand:GPI 2 "aarch64_pluslong_operand" "I,r,w,J,Uaa,Uav")))] ++ (match_operand:GPI 1 "register_operand" "%rk,rk,w,rk,rk,0,rk") ++ (match_operand:GPI 2 "aarch64_pluslong_operand" "I,r,w,J,Uaa,Uai,Uav")))] + "" + "@ + add\\t%0, %1, %2 +@@ -1741,10 +1825,11 @@ + add\\t%0, %1, %2 + sub\\t%0, %1, #%n2 + # +- * return aarch64_output_sve_addvl_addpl (operands[0], operands[1], operands[2]);" +- ;; The "alu_imm" type for ADDVL/ADDPL is just a placeholder. +- [(set_attr "type" "alu_imm,alu_sreg,neon_add,alu_imm,multiple,alu_imm") +- (set_attr "arch" "*,*,simd,*,*,*")] ++ * return aarch64_output_sve_scalar_inc_dec (operands[2]); ++ * return aarch64_output_sve_addvl_addpl (operands[2]);" ++ ;; The "alu_imm" types for INC/DEC and ADDVL/ADDPL are just placeholders. ++ [(set_attr "type" "alu_imm,alu_sreg,neon_add,alu_imm,multiple,alu_imm,alu_imm") ++ (set_attr "arch" "*,*,simd,*,*,sve,sve")] + ) + + ;; zero_extend version of above +@@ -1823,17 +1908,18 @@ + ;; this pattern. + (define_insn_and_split "*add3_poly_1" + [(set +- (match_operand:GPI 0 "register_operand" "=r,r,r,r,r,&r") ++ (match_operand:GPI 0 "register_operand" "=r,r,r,r,r,r,&r") + (plus:GPI +- (match_operand:GPI 1 "register_operand" "%rk,rk,rk,rk,rk,rk") +- (match_operand:GPI 2 "aarch64_pluslong_or_poly_operand" "I,r,J,Uaa,Uav,Uat")))] ++ (match_operand:GPI 1 "register_operand" "%rk,rk,rk,rk,rk,0,rk") ++ (match_operand:GPI 2 "aarch64_pluslong_or_poly_operand" "I,r,J,Uaa,Uav,Uai,Uat")))] + "TARGET_SVE && operands[0] != stack_pointer_rtx" + "@ + add\\t%0, %1, %2 + add\\t%0, %1, %2 + sub\\t%0, %1, #%n2 + # +- * return aarch64_output_sve_addvl_addpl (operands[0], operands[1], operands[2]); ++ * return aarch64_output_sve_scalar_inc_dec (operands[2]); ++ * return aarch64_output_sve_addvl_addpl (operands[2]); + #" + "&& epilogue_completed + && !reg_overlap_mentioned_p (operands[0], operands[1]) +@@ -1844,8 +1930,8 @@ + operands[2], operands[0], NULL_RTX); + DONE; + } +- ;; The "alu_imm" type for ADDVL/ADDPL is just a placeholder. 
+- [(set_attr "type" "alu_imm,alu_sreg,alu_imm,multiple,alu_imm,multiple")] ++ ;; The "alu_imm" types for INC/DEC and ADDVL/ADDPL are just placeholders. ++ [(set_attr "type" "alu_imm,alu_sreg,alu_imm,multiple,alu_imm,alu_imm,multiple")] + ) + + (define_split +@@ -1897,9 +1983,9 @@ + }) + + (define_expand "addti3" +- [(set (match_operand:TI 0 "register_operand" "") +- (plus:TI (match_operand:TI 1 "register_operand" "") +- (match_operand:TI 2 "aarch64_reg_or_imm" "")))] ++ [(set (match_operand:TI 0 "register_operand") ++ (plus:TI (match_operand:TI 1 "register_operand") ++ (match_operand:TI 2 "aarch64_reg_or_imm")))] + "" + { + rtx low_dest, op1_low, op2_low, high_dest, op1_high, op2_high; +@@ -1930,9 +2016,9 @@ + }) + + (define_expand "addvti4" +- [(match_operand:TI 0 "register_operand" "") +- (match_operand:TI 1 "register_operand" "") +- (match_operand:TI 2 "aarch64_reg_or_imm" "") ++ [(match_operand:TI 0 "register_operand") ++ (match_operand:TI 1 "register_operand") ++ (match_operand:TI 2 "aarch64_reg_or_imm") + (label_ref (match_operand 3 "" ""))] + "" + { +@@ -1964,9 +2050,9 @@ + }) + + (define_expand "uaddvti4" +- [(match_operand:TI 0 "register_operand" "") +- (match_operand:TI 1 "register_operand" "") +- (match_operand:TI 2 "aarch64_reg_or_imm" "") ++ [(match_operand:TI 0 "register_operand") ++ (match_operand:TI 1 "register_operand") ++ (match_operand:TI 2 "aarch64_reg_or_imm") + (label_ref (match_operand 3 "" ""))] + "" + { +@@ -2501,9 +2587,9 @@ + (plus: + (match_dup 4) + (zero_extend: +- (match_operand:GPI 1 "register_operand" ""))) ++ (match_operand:GPI 1 "register_operand"))) + (zero_extend: +- (match_operand:GPI 2 "register_operand" ""))) ++ (match_operand:GPI 2 "register_operand"))) + (match_dup 6))) + (set (match_operand:GPI 0 "register_operand") + (plus:GPI +@@ -2564,9 +2650,9 @@ + (plus: + (match_dup 3) + (sign_extend: +- (match_operand:GPI 1 "register_operand" ""))) ++ (match_operand:GPI 1 "register_operand"))) + (sign_extend: +- (match_operand:GPI 2 "register_operand" ""))) ++ (match_operand:GPI 2 "register_operand"))) + (sign_extend: + (plus:GPI + (plus:GPI (match_dup 4) (match_dup 1)) +@@ -2835,9 +2921,9 @@ + }) + + (define_expand "subti3" +- [(set (match_operand:TI 0 "register_operand" "") +- (minus:TI (match_operand:TI 1 "aarch64_reg_or_zero" "") +- (match_operand:TI 2 "register_operand" "")))] ++ [(set (match_operand:TI 0 "register_operand") ++ (minus:TI (match_operand:TI 1 "aarch64_reg_or_zero") ++ (match_operand:TI 2 "register_operand")))] + "" + { + rtx low_dest, op1_low, op2_low, high_dest, op1_high, op2_high; +@@ -3285,12 +3371,12 @@ + [(set (reg:CC CC_REGNUM) + (compare:CC + (zero_extend: +- (match_operand:GPI 1 "aarch64_reg_or_zero" "")) ++ (match_operand:GPI 1 "aarch64_reg_or_zero")) + (plus: + (zero_extend: +- (match_operand:GPI 2 "register_operand" "")) ++ (match_operand:GPI 2 "register_operand")) + (ltu: (reg:CC CC_REGNUM) (const_int 0))))) +- (set (match_operand:GPI 0 "register_operand" "") ++ (set (match_operand:GPI 0 "register_operand") + (minus:GPI + (minus:GPI (match_dup 1) (match_dup 2)) + (ltu:GPI (reg:CC CC_REGNUM) (const_int 0))))])] +@@ -3353,16 +3439,16 @@ + (compare:CC_V + (minus: + (sign_extend: +- (match_operand:GPI 1 "aarch64_reg_or_zero" "")) ++ (match_operand:GPI 1 "aarch64_reg_or_zero")) + (plus: + (sign_extend: +- (match_operand:GPI 2 "register_operand" "")) ++ (match_operand:GPI 2 "register_operand")) + (ltu: (reg:CC CC_REGNUM) (const_int 0)))) + (sign_extend: + (minus:GPI (match_dup 1) + (plus:GPI (ltu:GPI (reg:CC CC_REGNUM) (const_int 
0)) + (match_dup 2)))))) +- (set (match_operand:GPI 0 "register_operand" "") ++ (set (match_operand:GPI 0 "register_operand") + (minus:GPI + (minus:GPI (match_dup 1) (match_dup 2)) + (ltu:GPI (reg:CC CC_REGNUM) (const_int 0))))])] +@@ -3475,8 +3561,8 @@ + ) + + (define_expand "abs2" +- [(match_operand:GPI 0 "register_operand" "") +- (match_operand:GPI 1 "register_operand" "")] ++ [(match_operand:GPI 0 "register_operand") ++ (match_operand:GPI 1 "register_operand")] + "" + { + rtx ccreg = aarch64_gen_compare_reg (LT, operands[1], const0_rtx); +@@ -3889,10 +3975,10 @@ + ;; ------------------------------------------------------------------- + + (define_expand "cstore4" +- [(set (match_operand:SI 0 "register_operand" "") ++ [(set (match_operand:SI 0 "register_operand") + (match_operator:SI 1 "aarch64_comparison_operator" +- [(match_operand:GPI 2 "register_operand" "") +- (match_operand:GPI 3 "aarch64_plus_operand" "")]))] ++ [(match_operand:GPI 2 "register_operand") ++ (match_operand:GPI 3 "aarch64_plus_operand")]))] + "" + " + operands[2] = aarch64_gen_compare_reg (GET_CODE (operands[1]), operands[2], +@@ -3914,10 +4000,10 @@ + + + (define_expand "cstore4" +- [(set (match_operand:SI 0 "register_operand" "") ++ [(set (match_operand:SI 0 "register_operand") + (match_operator:SI 1 "aarch64_comparison_operator_mode" +- [(match_operand:GPF 2 "register_operand" "") +- (match_operand:GPF 3 "aarch64_fp_compare_operand" "")]))] ++ [(match_operand:GPF 2 "register_operand") ++ (match_operand:GPF 3 "aarch64_fp_compare_operand")]))] + "" + " + operands[2] = aarch64_gen_compare_reg (GET_CODE (operands[1]), operands[2], +@@ -4002,13 +4088,13 @@ + ) + + (define_expand "cmov6" +- [(set (match_operand:GPI 0 "register_operand" "") ++ [(set (match_operand:GPI 0 "register_operand") + (if_then_else:GPI + (match_operator 1 "aarch64_comparison_operator" +- [(match_operand:GPI 2 "register_operand" "") +- (match_operand:GPI 3 "aarch64_plus_operand" "")]) +- (match_operand:GPI 4 "register_operand" "") +- (match_operand:GPI 5 "register_operand" "")))] ++ [(match_operand:GPI 2 "register_operand") ++ (match_operand:GPI 3 "aarch64_plus_operand")]) ++ (match_operand:GPI 4 "register_operand") ++ (match_operand:GPI 5 "register_operand")))] + "" + " + operands[2] = aarch64_gen_compare_reg (GET_CODE (operands[1]), operands[2], +@@ -4018,13 +4104,13 @@ + ) + + (define_expand "cmov6" +- [(set (match_operand:GPF 0 "register_operand" "") ++ [(set (match_operand:GPF 0 "register_operand") + (if_then_else:GPF + (match_operator 1 "aarch64_comparison_operator" +- [(match_operand:GPF 2 "register_operand" "") +- (match_operand:GPF 3 "aarch64_fp_compare_operand" "")]) +- (match_operand:GPF 4 "register_operand" "") +- (match_operand:GPF 5 "register_operand" "")))] ++ [(match_operand:GPF 2 "register_operand") ++ (match_operand:GPF 3 "aarch64_fp_compare_operand")]) ++ (match_operand:GPF 4 "register_operand") ++ (match_operand:GPF 5 "register_operand")))] + "" + " + operands[2] = aarch64_gen_compare_reg (GET_CODE (operands[1]), operands[2], +@@ -4102,10 +4188,10 @@ + ) + + (define_expand "movcc" +- [(set (match_operand:ALLI 0 "register_operand" "") +- (if_then_else:ALLI (match_operand 1 "aarch64_comparison_operator" "") +- (match_operand:ALLI 2 "register_operand" "") +- (match_operand:ALLI 3 "register_operand" "")))] ++ [(set (match_operand:ALLI 0 "register_operand") ++ (if_then_else:ALLI (match_operand 1 "aarch64_comparison_operator") ++ (match_operand:ALLI 2 "register_operand") ++ (match_operand:ALLI 3 "register_operand")))] + "" + { + rtx 
ccreg; +@@ -4121,10 +4207,10 @@ + ) + + (define_expand "movcc" +- [(set (match_operand:GPI 0 "register_operand" "") +- (if_then_else:GPI (match_operand 1 "aarch64_comparison_operator" "") +- (match_operand:GPF 2 "register_operand" "") +- (match_operand:GPF 3 "register_operand" "")))] ++ [(set (match_operand:GPI 0 "register_operand") ++ (if_then_else:GPI (match_operand 1 "aarch64_comparison_operator") ++ (match_operand:GPF 2 "register_operand") ++ (match_operand:GPF 3 "register_operand")))] + "" + { + rtx ccreg; +@@ -4140,10 +4226,10 @@ + ) + + (define_expand "movcc" +- [(set (match_operand:GPF 0 "register_operand" "") +- (if_then_else:GPF (match_operand 1 "aarch64_comparison_operator" "") +- (match_operand:GPF 2 "register_operand" "") +- (match_operand:GPF 3 "register_operand" "")))] ++ [(set (match_operand:GPF 0 "register_operand") ++ (if_then_else:GPF (match_operand 1 "aarch64_comparison_operator") ++ (match_operand:GPF 2 "register_operand") ++ (match_operand:GPF 3 "register_operand")))] + "" + { + rtx ccreg; +@@ -4159,10 +4245,10 @@ + ) + + (define_expand "cc" +- [(set (match_operand:GPI 0 "register_operand" "") +- (if_then_else:GPI (match_operand 1 "aarch64_comparison_operator" "") +- (NEG_NOT:GPI (match_operand:GPI 2 "register_operand" "")) +- (match_operand:GPI 3 "register_operand" "")))] ++ [(set (match_operand:GPI 0 "register_operand") ++ (if_then_else:GPI (match_operand 1 "aarch64_comparison_operator") ++ (NEG_NOT:GPI (match_operand:GPI 2 "register_operand")) ++ (match_operand:GPI 3 "register_operand")))] + "" + { + rtx ccreg; +@@ -4769,7 +4855,7 @@ + [(set_attr "type" "alus_imm")] + ) + +-(define_insn "*ands_compare0" ++(define_insn "*ands_compare0" + [(set (reg:CC_NZ CC_REGNUM) + (compare:CC_NZ + (zero_extend:GPI (match_operand:SHORT 1 "register_operand" "r")) +@@ -5391,7 +5477,7 @@ + ;; ------------------------------------------------------------------- + + (define_expand "" +- [(set (match_operand:DI 0 "register_operand" "=r") ++ [(set (match_operand:DI 0 "register_operand") + (ANY_EXTRACT:DI (match_operand:DI 1 "register_operand") + (match_operand 2 + "aarch64_simd_shift_imm_offset_di") +@@ -5647,6 +5733,21 @@ + [(set_attr "type" "bfx")] + ) + ++(define_insn "*ashiftsi_extvdi_bfiz" ++ [(set (match_operand:SI 0 "register_operand" "=r") ++ (ashift:SI ++ (match_operator:SI 4 "subreg_lowpart_operator" ++ [(sign_extract:DI ++ (match_operand:DI 1 "register_operand" "r") ++ (match_operand 2 "aarch64_simd_shift_imm_offset_si") ++ (const_int 0))]) ++ (match_operand 3 "aarch64_simd_shift_imm_si")))] ++ "IN_RANGE (INTVAL (operands[2]) + INTVAL (operands[3]), ++ 1, GET_MODE_BITSIZE (SImode) - 1)" ++ "sbfiz\\t%w0, %w1, %3, %2" ++ [(set_attr "type" "bfx")] ++) ++ + ;; When the bit position and width of the equivalent extraction add up to 32 + ;; we can use a W-reg LSL instruction taking advantage of the implicit + ;; zero-extension of the X-reg. +@@ -6008,6 +6109,44 @@ + [(set_attr "type" "f_cvtf2i")] + ) + ++;; Equal width integer to fp and multiply combine. 
++(define_insn "*aarch64_cvtf2_mult" ++ [(set (match_operand:GPF 0 "register_operand" "=w,w") ++ (mult:GPF (FLOATUORS:GPF ++ (match_operand: 1 "register_operand" "w,?r")) ++ (match_operand:GPF 2 "aarch64_fp_pow2_recip" "Dt,Dt")))] ++ "TARGET_FLOAT" ++ { ++ operands[2] = GEN_INT (aarch64_fpconst_pow2_recip (operands[2])); ++ switch (which_alternative) ++ { ++ case 0: ++ return "cvtf\t%0, %1, #%2"; ++ case 1: ++ return "cvtf\t%0, %1, #%2"; ++ default: ++ gcc_unreachable (); ++ } ++ } ++ [(set_attr "type" "neon_int_to_fp_,f_cvti2f") ++ (set_attr "arch" "simd,fp")] ++) ++ ++;; Unequal width integer to fp and multiply combine. ++(define_insn "*aarch64_cvtf2_mult" ++ [(set (match_operand:GPF 0 "register_operand" "=w") ++ (mult:GPF (FLOATUORS:GPF ++ (match_operand: 1 "register_operand" "r")) ++ (match_operand:GPF 2 "aarch64_fp_pow2_recip" "Dt")))] ++ "TARGET_FLOAT" ++ { ++ operands[2] = GEN_INT (aarch64_fpconst_pow2_recip (operands[2])); ++ return "cvtf\t%0, %1, #%2"; ++ } ++ [(set_attr "type" "f_cvti2f")] ++) ++ ++;; Equal width integer to fp conversion. + (define_insn "2" + [(set (match_operand:GPF 0 "register_operand" "=w,w") + (FLOATUORS:GPF (match_operand: 1 "register_operand" "w,?r")))] +@@ -6019,6 +6158,7 @@ + (set_attr "arch" "simd,fp")] + ) + ++;; Unequal width integer to fp conversions. + (define_insn "2" + [(set (match_operand:GPF 0 "register_operand" "=w") + (FLOATUORS:GPF (match_operand: 1 "register_operand" "r")))] +@@ -6241,8 +6381,8 @@ + ) + + (define_expand "sqrt2" +- [(set (match_operand:GPF_F16 0 "register_operand" "=w") +- (sqrt:GPF_F16 (match_operand:GPF_F16 1 "register_operand" "w")))] ++ [(set (match_operand:GPF_F16 0 "register_operand") ++ (sqrt:GPF_F16 (match_operand:GPF_F16 1 "register_operand")))] + "TARGET_FLOAT" + { + if (aarch64_emit_approx_sqrt (operands[0], operands[1], false)) +@@ -6401,6 +6541,7 @@ + ;; ------------------------------------------------------------------- + ;; Reload Scalar Floating point modes from constant pool. + ;; The AArch64 port doesn't have __int128 constant move support. ++;; The patterns need constraints due to TARGET_SECONDARY_RELOAD hook. + (define_expand "@aarch64_reload_movcp" + [(set (match_operand:GPF_TF 0 "register_operand" "=w") + (mem:GPF_TF (match_operand 1 "aarch64_constant_pool_symref" "S"))) +@@ -6501,9 +6642,9 @@ + ;; rodata section. + + (define_expand "add_losym" +- [(set (match_operand 0 "register_operand" "=r") +- (lo_sum (match_operand 1 "register_operand" "r") +- (match_operand 2 "aarch64_valid_symref" "S")))] ++ [(set (match_operand 0 "register_operand") ++ (lo_sum (match_operand 1 "register_operand") ++ (match_operand 2 "aarch64_valid_symref")))] + "" + { + machine_mode mode = GET_MODE (operands[0]); +@@ -6602,9 +6743,10 @@ + ;; instructions in the TLS stubs, in order to enable linker relaxation. + ;; Therefore we treat the stubs as an atomic sequence. 
+ (define_expand "tlsgd_small_" +- [(parallel [(set (match_operand 0 "register_operand" "") ++ [(parallel [(set (match_operand 0 "register_operand") + (call (mem:DI (match_dup 2)) (const_int 1))) +- (unspec:DI [(match_operand:PTR 1 "aarch64_valid_symref" "")] UNSPEC_GOTSMALLTLS) ++ (unspec:DI [(const_int 0)] UNSPEC_CALLEE_ABI) ++ (unspec:DI [(match_operand:PTR 1 "aarch64_valid_symref")] UNSPEC_GOTSMALLTLS) + (clobber (reg:DI LR_REGNUM))])] + "" + { +@@ -6614,6 +6756,7 @@ + (define_insn "*tlsgd_small_" + [(set (match_operand 0 "register_operand" "") + (call (mem:DI (match_operand:DI 2 "" "")) (const_int 1))) ++ (unspec:DI [(const_int 0)] UNSPEC_CALLEE_ABI) + (unspec:DI [(match_operand:PTR 1 "aarch64_valid_symref" "S")] UNSPEC_GOTSMALLTLS) + (clobber (reg:DI LR_REGNUM)) + ] +@@ -6714,7 +6857,12 @@ + "TARGET_TLS_DESC" + { + if (TARGET_SVE) +- emit_insn (gen_tlsdesc_small_sve_ (operands[0])); ++ { ++ rtx abi = gen_int_mode (aarch64_tlsdesc_abi_id (), DImode); ++ rtx_insn *call ++ = emit_call_insn (gen_tlsdesc_small_sve_ (operands[0], abi)); ++ RTL_CONST_CALL_P (call) = 1; ++ } + else + emit_insn (gen_tlsdesc_small_advsimd_ (operands[0])); + DONE; +@@ -6729,72 +6877,27 @@ + UNSPEC_TLSDESC)) + (clobber (reg:DI LR_REGNUM)) + (clobber (reg:CC CC_REGNUM)) +- (clobber (match_scratch:DI 1 "=r"))] ++ (clobber (match_scratch:DI 1 "=r")) ++ (use (reg:DI FP_REGNUM))] + "TARGET_TLS_DESC && !TARGET_SVE" + "adrp\\tx0, %A0\;ldr\\t%1, [x0, #%L0]\;add\\t0, 0, %L0\;.tlsdesccall\\t%0\;blr\\t%1" + [(set_attr "type" "call") + (set_attr "length" "16")]) + +-;; For SVE, model tlsdesc calls as clobbering the lower 128 bits of +-;; all vector registers, and clobber all predicate registers, on +-;; top of the usual R0 and LR. ++;; For SVE, model tlsdesc calls as normal calls, with the callee ABI ++;; describing the extra call-preserved guarantees. This would work ++;; for non-SVE too, but avoiding a call is probably better if we can. 
+ (define_insn "tlsdesc_small_sve_" + [(set (reg:PTR R0_REGNUM) +- (unspec:PTR [(match_operand 0 "aarch64_valid_symref" "S")] +- UNSPEC_TLSDESC)) ++ (call (mem:DI (unspec:PTR ++ [(match_operand 0 "aarch64_valid_symref")] ++ UNSPEC_TLSDESC)) ++ (const_int 0))) ++ (unspec:DI [(match_operand:DI 1 "const_int_operand")] UNSPEC_CALLEE_ABI) + (clobber (reg:DI LR_REGNUM)) +- (clobber (reg:CC CC_REGNUM)) +- (clobber_high (reg:TI V0_REGNUM)) +- (clobber_high (reg:TI V1_REGNUM)) +- (clobber_high (reg:TI V2_REGNUM)) +- (clobber_high (reg:TI V3_REGNUM)) +- (clobber_high (reg:TI V4_REGNUM)) +- (clobber_high (reg:TI V5_REGNUM)) +- (clobber_high (reg:TI V6_REGNUM)) +- (clobber_high (reg:TI V7_REGNUM)) +- (clobber_high (reg:TI V8_REGNUM)) +- (clobber_high (reg:TI V9_REGNUM)) +- (clobber_high (reg:TI V10_REGNUM)) +- (clobber_high (reg:TI V11_REGNUM)) +- (clobber_high (reg:TI V12_REGNUM)) +- (clobber_high (reg:TI V13_REGNUM)) +- (clobber_high (reg:TI V14_REGNUM)) +- (clobber_high (reg:TI V15_REGNUM)) +- (clobber_high (reg:TI V16_REGNUM)) +- (clobber_high (reg:TI V17_REGNUM)) +- (clobber_high (reg:TI V18_REGNUM)) +- (clobber_high (reg:TI V19_REGNUM)) +- (clobber_high (reg:TI V20_REGNUM)) +- (clobber_high (reg:TI V21_REGNUM)) +- (clobber_high (reg:TI V22_REGNUM)) +- (clobber_high (reg:TI V23_REGNUM)) +- (clobber_high (reg:TI V24_REGNUM)) +- (clobber_high (reg:TI V25_REGNUM)) +- (clobber_high (reg:TI V26_REGNUM)) +- (clobber_high (reg:TI V27_REGNUM)) +- (clobber_high (reg:TI V28_REGNUM)) +- (clobber_high (reg:TI V29_REGNUM)) +- (clobber_high (reg:TI V30_REGNUM)) +- (clobber_high (reg:TI V31_REGNUM)) +- (clobber (reg:VNx2BI P0_REGNUM)) +- (clobber (reg:VNx2BI P1_REGNUM)) +- (clobber (reg:VNx2BI P2_REGNUM)) +- (clobber (reg:VNx2BI P3_REGNUM)) +- (clobber (reg:VNx2BI P4_REGNUM)) +- (clobber (reg:VNx2BI P5_REGNUM)) +- (clobber (reg:VNx2BI P6_REGNUM)) +- (clobber (reg:VNx2BI P7_REGNUM)) +- (clobber (reg:VNx2BI P8_REGNUM)) +- (clobber (reg:VNx2BI P9_REGNUM)) +- (clobber (reg:VNx2BI P10_REGNUM)) +- (clobber (reg:VNx2BI P11_REGNUM)) +- (clobber (reg:VNx2BI P12_REGNUM)) +- (clobber (reg:VNx2BI P13_REGNUM)) +- (clobber (reg:VNx2BI P14_REGNUM)) +- (clobber (reg:VNx2BI P15_REGNUM)) +- (clobber (match_scratch:DI 1 "=r"))] ++ (clobber (match_scratch:DI 2 "=r"))] + "TARGET_TLS_DESC && TARGET_SVE" +- "adrp\\tx0, %A0\;ldr\\t%1, [x0, #%L0]\;add\\t0, 0, %L0\;.tlsdesccall\\t%0\;blr\\t%1" ++ "adrp\\tx0, %A0\;ldr\\t%2, [x0, #%L0]\;add\\t0, 0, %L0\;.tlsdesccall\\t%0\;blr\\t%2" + [(set_attr "type" "call") + (set_attr "length" "16")]) + +@@ -6808,6 +6911,15 @@ + [(set_attr "length" "0")] + ) + ++(define_insn "aarch64_fjcvtzs" ++ [(set (match_operand:SI 0 "register_operand" "=r") ++ (unspec:SI [(match_operand:DF 1 "register_operand" "w")] ++ UNSPEC_FJCVTZS))] ++ "TARGET_JSCVT" ++ "fjcvtzs\\t%w0, %d1" ++ [(set_attr "type" "f_cvtf2i")] ++) ++ + ;; Pointer authentication patterns are always provided. In architecture + ;; revisions prior to ARMv8.3-A these HINT instructions operate as NOPs. + ;; This lets the user write portable software which authenticates pointers +@@ -6821,7 +6933,7 @@ + [(set (reg:DI R30_REGNUM) + (unspec:DI [(reg:DI R30_REGNUM) (reg:DI SP_REGNUM)] PAUTH_LR_SP))] + "" +- "hint\t // asp"; ++ "hint\t // sp"; + ) + + ;; Signing/Authenticating X17 using X16 as the salt. +@@ -6830,7 +6942,7 @@ + [(set (reg:DI R17_REGNUM) + (unspec:DI [(reg:DI R17_REGNUM) (reg:DI R16_REGNUM)] PAUTH_17_16))] + "" +- "hint\t // a1716"; ++ "hint\t // 1716"; + ) + + ;; Stripping the signature in R30. 
+@@ -6885,7 +6997,7 @@ + + ;; Named pattern for expanding thread pointer reference. + (define_expand "get_thread_pointerdi" +- [(match_operand:DI 0 "register_operand" "=r")] ++ [(match_operand:DI 0 "register_operand")] + "" + { + rtx tmp = aarch64_load_tp (operands[0]); +@@ -6941,13 +7053,15 @@ + } + [(set_attr "type" "mrs")]) + ++;; DO NOT SPLIT THIS PATTERN. It is important for security reasons that the ++;; canary value does not live beyond the life of this sequence. + (define_insn "stack_protect_set_" + [(set (match_operand:PTR 0 "memory_operand" "=m") + (unspec:PTR [(match_operand:PTR 1 "memory_operand" "m")] + UNSPEC_SP_SET)) + (set (match_scratch:PTR 2 "=&r") (const_int 0))] + "" +- "ldr\\t%2, %1\;str\\t%2, %0\;mov\t%2,0" ++ "ldr\\t%2, %1\;str\\t%2, %0\;mov\t%2, 0" + [(set_attr "length" "12") + (set_attr "type" "multiple")]) + +@@ -7122,12 +7236,6 @@ + [(set_attr "type" "no_insn")] + ) + +-;; Helper for aarch64.c code. +-(define_expand "set_clobber_cc" +- [(parallel [(set (match_operand 0) +- (match_operand 1)) +- (clobber (reg:CC CC_REGNUM))])]) +- + ;; Hard speculation barrier. + (define_insn "speculation_barrier" + [(unspec_volatile [(const_int 0)] UNSPECV_SPECULATION_BARRIER)] +@@ -7142,10 +7250,10 @@ + ;; tracking enabled. Use the speculation tracker to decide whether to + ;; copy operand 1 to the target, or to copy the fail value (operand 2). + (define_expand "@despeculate_copy" +- [(set (match_operand:ALLI_TI 0 "register_operand" "=r") ++ [(set (match_operand:ALLI_TI 0 "register_operand") + (unspec_volatile:ALLI_TI +- [(match_operand:ALLI_TI 1 "register_operand" "r") +- (match_operand:ALLI_TI 2 "aarch64_reg_or_zero" "rZ") ++ [(match_operand:ALLI_TI 1 "register_operand") ++ (match_operand:ALLI_TI 2 "aarch64_reg_or_zero") + (use (reg:DI SPECULATION_TRACKER_REGNUM)) + (clobber (reg:CC CC_REGNUM))] UNSPECV_SPECULATION_BARRIER))] + "" +@@ -7235,6 +7343,73 @@ + (set_attr "speculation_barrier" "true")] + ) + ++(define_insn "aarch64_" ++ [(set (match_operand:VSFDF 0 "register_operand" "=w") ++ (unspec:VSFDF [(match_operand:VSFDF 1 "register_operand" "w")] ++ FRINTNZX))] ++ "TARGET_FRINT && TARGET_FLOAT ++ && !(VECTOR_MODE_P (mode) && !TARGET_SIMD)" ++ "\\t%0, %1" ++ [(set_attr "type" "f_rint")] ++) ++ ++;; Transactional Memory Extension (TME) instructions. 
++ ++(define_insn "tstart" ++ [(set (match_operand:DI 0 "register_operand" "=r") ++ (unspec_volatile:DI [(const_int 0)] UNSPECV_TSTART)) ++ (clobber (mem:BLK (scratch)))] ++ "TARGET_TME" ++ "tstart\\t%0" ++ [(set_attr "type" "tme")] ++) ++ ++(define_insn "ttest" ++ [(set (match_operand:DI 0 "register_operand" "=r") ++ (unspec_volatile:DI [(const_int 0)] UNSPEC_TTEST)) ++ (clobber (mem:BLK (scratch)))] ++ "TARGET_TME" ++ "ttest\\t%0" ++ [(set_attr "type" "tme")] ++) ++ ++(define_insn "tcommit" ++ [(unspec_volatile:BLK [(const_int 0)] UNSPECV_TCOMMIT) ++ (clobber (mem:BLK (scratch)))] ++ "TARGET_TME" ++ "tcommit" ++ [(set_attr "type" "tme")] ++) ++ ++(define_insn "tcancel" ++ [(unspec_volatile:BLK ++ [(match_operand 0 "const_int_operand" "n")] UNSPECV_TCANCEL) ++ (clobber (mem:BLK (scratch)))] ++ "TARGET_TME && (UINTVAL (operands[0]) <= 65535)" ++ "tcancel\\t#%0" ++ [(set_attr "type" "tme")] ++) ++ ++(define_insn "aarch64_rndr" ++ [(set (match_operand:DI 0 "register_operand" "=r") ++ (unspec_volatile:DI [(const_int 0)] UNSPEC_RNDR)) ++ (set (reg:CC_Z CC_REGNUM) ++ (unspec_volatile:CC_Z [(const_int 0)] UNSPEC_RNDR))] ++ "TARGET_RNG" ++ "mrs\t%0, RNDR" ++ [(set_attr "type" "mrs")] ++) ++ ++(define_insn "aarch64_rndrrs" ++ [(set (match_operand:DI 0 "register_operand" "=r") ++ (unspec_volatile:DI [(const_int 0)] UNSPEC_RNDRRS)) ++ (set (reg:CC_Z CC_REGNUM) ++ (unspec_volatile:CC_Z [(const_int 0)] UNSPEC_RNDRRS))] ++ "TARGET_RNG" ++ "mrs\t%0, RNDRRS" ++ [(set_attr "type" "mrs")] ++) ++ + ;; AdvSIMD Stuff + (include "aarch64-simd.md") + +diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt +index d2cb41be6..e2be8ff6f 100644 +--- a/gcc/config/aarch64/aarch64.opt ++++ b/gcc/config/aarch64/aarch64.opt +@@ -31,7 +31,7 @@ TargetSave + const char *x_aarch64_override_tune_string + + TargetVariable +-unsigned long aarch64_isa_flags = 0 ++uint64_t aarch64_isa_flags = 0 + + TargetVariable + unsigned aarch64_enable_bti = 2 +@@ -261,3 +261,6 @@ user-land code. + TargetVariable + long aarch64_stack_protector_guard_offset = 0 + ++moutline-atomics ++Target Report Mask(OUTLINE_ATOMICS) Save ++Generate local calls to out-of-line atomic operations. 
+diff --git a/gcc/config/aarch64/arm_acle.h b/gcc/config/aarch64/arm_acle.h +index 534a989c3..2284e7164 100644 +--- a/gcc/config/aarch64/arm_acle.h ++++ b/gcc/config/aarch64/arm_acle.h +@@ -29,14 +29,77 @@ + + #include + +-#pragma GCC push_options +- +-#pragma GCC target ("+nothing+crc") +- + #ifdef __cplusplus + extern "C" { + #endif + ++#pragma GCC push_options ++#pragma GCC target ("arch=armv8.3-a") ++__extension__ static __inline int32_t __attribute__ ((__always_inline__)) ++__jcvt (double __a) ++{ ++ return __builtin_aarch64_jcvtzs (__a); ++} ++ ++#pragma GCC pop_options ++ ++#pragma GCC push_options ++#pragma GCC target ("arch=armv8.5-a") ++__extension__ static __inline float __attribute__ ((__always_inline__)) ++__rint32zf (float __a) ++{ ++ return __builtin_aarch64_frint32zsf (__a); ++} ++ ++__extension__ static __inline double __attribute__ ((__always_inline__)) ++__rint32z (double __a) ++{ ++ return __builtin_aarch64_frint32zdf (__a); ++} ++ ++__extension__ static __inline float __attribute__ ((__always_inline__)) ++__rint64zf (float __a) ++{ ++ return __builtin_aarch64_frint64zsf (__a); ++} ++ ++__extension__ static __inline double __attribute__ ((__always_inline__)) ++__rint64z (double __a) ++{ ++ return __builtin_aarch64_frint64zdf (__a); ++} ++ ++__extension__ static __inline float __attribute__ ((__always_inline__)) ++__rint32xf (float __a) ++{ ++ return __builtin_aarch64_frint32xsf (__a); ++} ++ ++__extension__ static __inline double __attribute__ ((__always_inline__)) ++__rint32x (double __a) ++{ ++ return __builtin_aarch64_frint32xdf (__a); ++} ++ ++__extension__ static __inline float __attribute__ ((__always_inline__)) ++__rint64xf (float __a) ++{ ++ return __builtin_aarch64_frint64xsf (__a); ++} ++ ++__extension__ static __inline double __attribute__ ((__always_inline__)) ++__rint64x (double __a) ++{ ++ return __builtin_aarch64_frint64xdf (__a); ++} ++ ++ ++#pragma GCC pop_options ++ ++#pragma GCC push_options ++ ++#pragma GCC target ("+nothing+crc") ++ + __extension__ static __inline uint32_t __attribute__ ((__always_inline__)) + __crc32b (uint32_t __a, uint8_t __b) + { +@@ -85,10 +148,69 @@ __crc32d (uint32_t __a, uint64_t __b) + return __builtin_aarch64_crc32x (__a, __b); + } + +-#ifdef __cplusplus ++#pragma GCC pop_options ++ ++#ifdef __ARM_FEATURE_TME ++#pragma GCC push_options ++#pragma GCC target ("+nothing+tme") ++ ++#define _TMFAILURE_REASON 0x00007fffu ++#define _TMFAILURE_RTRY 0x00008000u ++#define _TMFAILURE_CNCL 0x00010000u ++#define _TMFAILURE_MEM 0x00020000u ++#define _TMFAILURE_IMP 0x00040000u ++#define _TMFAILURE_ERR 0x00080000u ++#define _TMFAILURE_SIZE 0x00100000u ++#define _TMFAILURE_NEST 0x00200000u ++#define _TMFAILURE_DBG 0x00400000u ++#define _TMFAILURE_INT 0x00800000u ++#define _TMFAILURE_TRIVIAL 0x01000000u ++ ++__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) ++__tstart (void) ++{ ++ return __builtin_aarch64_tstart (); ++} ++ ++__extension__ static __inline void __attribute__ ((__always_inline__)) ++__tcommit (void) ++{ ++ __builtin_aarch64_tcommit (); ++} ++ ++__extension__ static __inline void __attribute__ ((__always_inline__)) ++__tcancel (const uint64_t __reason) ++{ ++ __builtin_aarch64_tcancel (__reason); + } ++ ++__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) ++__ttest (void) ++{ ++ return __builtin_aarch64_ttest (); ++} ++ ++#pragma GCC pop_options + #endif + ++#pragma GCC push_options ++#pragma GCC target ("+nothing+rng") ++__extension__ static __inline int __attribute__ 
((__always_inline__)) ++__rndr (uint64_t *__res) ++{ ++ return __builtin_aarch64_rndr (__res); ++} ++ ++__extension__ static __inline int __attribute__ ((__always_inline__)) ++__rndrrs (uint64_t *__res) ++{ ++ return __builtin_aarch64_rndrrs (__res); ++} ++ + #pragma GCC pop_options + ++#ifdef __cplusplus ++} ++#endif ++ + #endif +diff --git a/gcc/config/aarch64/arm_bf16.h b/gcc/config/aarch64/arm_bf16.h +new file mode 100644 +index 000000000..984875dcc +--- /dev/null ++++ b/gcc/config/aarch64/arm_bf16.h +@@ -0,0 +1,45 @@ ++/* Arm BF16 instrinsics include file. ++ ++ Copyright (C) 2019-2020 Free Software Foundation, Inc. ++ Contributed by Arm. ++ ++ This file is part of GCC. ++ ++ GCC is free software; you can redistribute it and/or modify it ++ under the terms of the GNU General Public License as published ++ by the Free Software Foundation; either version 3, or (at your ++ option) any later version. ++ ++ GCC is distributed in the hope that it will be useful, but WITHOUT ++ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public ++ License for more details. ++ ++ Under Section 7 of GPL version 3, you are granted additional ++ permissions described in the GCC Runtime Library Exception, version ++ 3.1, as published by the Free Software Foundation. ++ ++ You should have received a copy of the GNU General Public License and ++ a copy of the GCC Runtime Library Exception along with this program; ++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see ++ . */ ++ ++#ifndef _AARCH64_BF16_H_ ++#define _AARCH64_BF16_H_ ++ ++typedef __bf16 bfloat16_t; ++typedef float float32_t; ++ ++#pragma GCC push_options ++#pragma GCC target ("+nothing+bf16+nosimd") ++ ++__extension__ extern __inline bfloat16_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vcvth_bf16_f32 (float32_t __a) ++{ ++ return __builtin_aarch64_bfcvtbf (__a); ++} ++ ++#pragma GCC pop_options ++ ++#endif +diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h +index 314ef3018..7435905ff 100644 +--- a/gcc/config/aarch64/arm_neon.h ++++ b/gcc/config/aarch64/arm_neon.h +@@ -73,6 +73,39 @@ typedef __fp16 float16_t; + typedef float float32_t; + typedef double float64_t; + ++typedef __Bfloat16x4_t bfloat16x4_t; ++typedef __Bfloat16x8_t bfloat16x8_t; ++ ++typedef struct bfloat16x4x2_t ++{ ++ bfloat16x4_t val[2]; ++} bfloat16x4x2_t; ++ ++typedef struct bfloat16x8x2_t ++{ ++ bfloat16x8_t val[2]; ++} bfloat16x8x2_t; ++ ++typedef struct bfloat16x4x3_t ++{ ++ bfloat16x4_t val[3]; ++} bfloat16x4x3_t; ++ ++typedef struct bfloat16x8x3_t ++{ ++ bfloat16x8_t val[3]; ++} bfloat16x8x3_t; ++ ++typedef struct bfloat16x4x4_t ++{ ++ bfloat16x4_t val[4]; ++} bfloat16x4x4_t; ++ ++typedef struct bfloat16x8x4_t ++{ ++ bfloat16x8_t val[4]; ++} bfloat16x8x4_t; ++ + typedef struct int8x8x2_t + { + int8x8_t val[2]; +@@ -6572,867 +6605,867 @@ vcombine_p64 (poly64x1_t __a, poly64x1_t __b) + + __extension__ extern __inline int8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vaba_s8 (int8x8_t a, int8x8_t b, int8x8_t c) ++vaba_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c) + { +- int8x8_t result; ++ int8x8_t __result; + __asm__ ("saba %0.8b,%2.8b,%3.8b" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int16x4_t + __attribute__ ((__always_inline__, 
__gnu_inline__, __artificial__)) +-vaba_s16 (int16x4_t a, int16x4_t b, int16x4_t c) ++vaba_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c) + { +- int16x4_t result; ++ int16x4_t __result; + __asm__ ("saba %0.4h,%2.4h,%3.4h" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int32x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vaba_s32 (int32x2_t a, int32x2_t b, int32x2_t c) ++vaba_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c) + { +- int32x2_t result; ++ int32x2_t __result; + __asm__ ("saba %0.2s,%2.2s,%3.2s" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vaba_u8 (uint8x8_t a, uint8x8_t b, uint8x8_t c) ++vaba_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c) + { +- uint8x8_t result; ++ uint8x8_t __result; + __asm__ ("uaba %0.8b,%2.8b,%3.8b" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint16x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vaba_u16 (uint16x4_t a, uint16x4_t b, uint16x4_t c) ++vaba_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c) + { +- uint16x4_t result; ++ uint16x4_t __result; + __asm__ ("uaba %0.4h,%2.4h,%3.4h" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint32x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vaba_u32 (uint32x2_t a, uint32x2_t b, uint32x2_t c) ++vaba_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c) + { +- uint32x2_t result; ++ uint32x2_t __result; + __asm__ ("uaba %0.2s,%2.2s,%3.2s" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vabal_high_s8 (int16x8_t a, int8x16_t b, int8x16_t c) ++vabal_high_s8 (int16x8_t __a, int8x16_t __b, int8x16_t __c) + { +- int16x8_t result; ++ int16x8_t __result; + __asm__ ("sabal2 %0.8h,%2.16b,%3.16b" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vabal_high_s16 (int32x4_t a, int16x8_t b, int16x8_t c) ++vabal_high_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c) + { +- int32x4_t result; ++ int32x4_t __result; + __asm__ ("sabal2 %0.4s,%2.8h,%3.8h" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vabal_high_s32 (int64x2_t a, int32x4_t b, int32x4_t c) ++vabal_high_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c) + { +- int64x2_t result; ++ int64x2_t __result; + 
__asm__ ("sabal2 %0.2d,%2.4s,%3.4s" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vabal_high_u8 (uint16x8_t a, uint8x16_t b, uint8x16_t c) ++vabal_high_u8 (uint16x8_t __a, uint8x16_t __b, uint8x16_t __c) + { +- uint16x8_t result; ++ uint16x8_t __result; + __asm__ ("uabal2 %0.8h,%2.16b,%3.16b" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vabal_high_u16 (uint32x4_t a, uint16x8_t b, uint16x8_t c) ++vabal_high_u16 (uint32x4_t __a, uint16x8_t __b, uint16x8_t __c) + { +- uint32x4_t result; ++ uint32x4_t __result; + __asm__ ("uabal2 %0.4s,%2.8h,%3.8h" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vabal_high_u32 (uint64x2_t a, uint32x4_t b, uint32x4_t c) ++vabal_high_u32 (uint64x2_t __a, uint32x4_t __b, uint32x4_t __c) + { +- uint64x2_t result; ++ uint64x2_t __result; + __asm__ ("uabal2 %0.2d,%2.4s,%3.4s" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vabal_s8 (int16x8_t a, int8x8_t b, int8x8_t c) ++vabal_s8 (int16x8_t __a, int8x8_t __b, int8x8_t __c) + { +- int16x8_t result; ++ int16x8_t __result; + __asm__ ("sabal %0.8h,%2.8b,%3.8b" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vabal_s16 (int32x4_t a, int16x4_t b, int16x4_t c) ++vabal_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c) + { +- int32x4_t result; ++ int32x4_t __result; + __asm__ ("sabal %0.4s,%2.4h,%3.4h" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vabal_s32 (int64x2_t a, int32x2_t b, int32x2_t c) ++vabal_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c) + { +- int64x2_t result; ++ int64x2_t __result; + __asm__ ("sabal %0.2d,%2.2s,%3.2s" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vabal_u8 (uint16x8_t a, uint8x8_t b, uint8x8_t c) ++vabal_u8 (uint16x8_t __a, uint8x8_t __b, uint8x8_t __c) + { +- uint16x8_t result; ++ uint16x8_t __result; + __asm__ ("uabal %0.8h,%2.8b,%3.8b" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return 
result; ++ return __result; + } + + __extension__ extern __inline uint32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vabal_u16 (uint32x4_t a, uint16x4_t b, uint16x4_t c) ++vabal_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c) + { +- uint32x4_t result; ++ uint32x4_t __result; + __asm__ ("uabal %0.4s,%2.4h,%3.4h" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vabal_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c) ++vabal_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c) + { +- uint64x2_t result; ++ uint64x2_t __result; + __asm__ ("uabal %0.2d,%2.2s,%3.2s" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vabaq_s8 (int8x16_t a, int8x16_t b, int8x16_t c) ++vabaq_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c) + { +- int8x16_t result; ++ int8x16_t __result; + __asm__ ("saba %0.16b,%2.16b,%3.16b" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vabaq_s16 (int16x8_t a, int16x8_t b, int16x8_t c) ++vabaq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c) + { +- int16x8_t result; ++ int16x8_t __result; + __asm__ ("saba %0.8h,%2.8h,%3.8h" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vabaq_s32 (int32x4_t a, int32x4_t b, int32x4_t c) ++vabaq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c) + { +- int32x4_t result; ++ int32x4_t __result; + __asm__ ("saba %0.4s,%2.4s,%3.4s" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vabaq_u8 (uint8x16_t a, uint8x16_t b, uint8x16_t c) ++vabaq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c) + { +- uint8x16_t result; ++ uint8x16_t __result; + __asm__ ("uaba %0.16b,%2.16b,%3.16b" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vabaq_u16 (uint16x8_t a, uint16x8_t b, uint16x8_t c) ++vabaq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c) + { +- uint16x8_t result; ++ uint16x8_t __result; + __asm__ ("uaba %0.8h,%2.8h,%3.8h" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vabaq_u32 (uint32x4_t a, uint32x4_t b, 
uint32x4_t c) ++vabaq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c) + { +- uint32x4_t result; ++ uint32x4_t __result; + __asm__ ("uaba %0.4s,%2.4s,%3.4s" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vabd_s8 (int8x8_t a, int8x8_t b) ++vabd_s8 (int8x8_t __a, int8x8_t __b) + { +- int8x8_t result; ++ int8x8_t __result; + __asm__ ("sabd %0.8b, %1.8b, %2.8b" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int16x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vabd_s16 (int16x4_t a, int16x4_t b) ++vabd_s16 (int16x4_t __a, int16x4_t __b) + { +- int16x4_t result; ++ int16x4_t __result; + __asm__ ("sabd %0.4h, %1.4h, %2.4h" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int32x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vabd_s32 (int32x2_t a, int32x2_t b) ++vabd_s32 (int32x2_t __a, int32x2_t __b) + { +- int32x2_t result; ++ int32x2_t __result; + __asm__ ("sabd %0.2s, %1.2s, %2.2s" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vabd_u8 (uint8x8_t a, uint8x8_t b) ++vabd_u8 (uint8x8_t __a, uint8x8_t __b) + { +- uint8x8_t result; ++ uint8x8_t __result; + __asm__ ("uabd %0.8b, %1.8b, %2.8b" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint16x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vabd_u16 (uint16x4_t a, uint16x4_t b) ++vabd_u16 (uint16x4_t __a, uint16x4_t __b) + { +- uint16x4_t result; ++ uint16x4_t __result; + __asm__ ("uabd %0.4h, %1.4h, %2.4h" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint32x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vabd_u32 (uint32x2_t a, uint32x2_t b) ++vabd_u32 (uint32x2_t __a, uint32x2_t __b) + { +- uint32x2_t result; ++ uint32x2_t __result; + __asm__ ("uabd %0.2s, %1.2s, %2.2s" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vabdl_high_s8 (int8x16_t a, int8x16_t b) ++vabdl_high_s8 (int8x16_t __a, int8x16_t __b) + { +- int16x8_t result; ++ int16x8_t __result; + __asm__ ("sabdl2 %0.8h,%1.16b,%2.16b" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vabdl_high_s16 (int16x8_t a, int16x8_t b) ++vabdl_high_s16 (int16x8_t __a, int16x8_t __b) + { +- 
int32x4_t result; ++ int32x4_t __result; + __asm__ ("sabdl2 %0.4s,%1.8h,%2.8h" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vabdl_high_s32 (int32x4_t a, int32x4_t b) ++vabdl_high_s32 (int32x4_t __a, int32x4_t __b) + { +- int64x2_t result; ++ int64x2_t __result; + __asm__ ("sabdl2 %0.2d,%1.4s,%2.4s" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vabdl_high_u8 (uint8x16_t a, uint8x16_t b) ++vabdl_high_u8 (uint8x16_t __a, uint8x16_t __b) + { +- uint16x8_t result; ++ uint16x8_t __result; + __asm__ ("uabdl2 %0.8h,%1.16b,%2.16b" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vabdl_high_u16 (uint16x8_t a, uint16x8_t b) ++vabdl_high_u16 (uint16x8_t __a, uint16x8_t __b) + { +- uint32x4_t result; ++ uint32x4_t __result; + __asm__ ("uabdl2 %0.4s,%1.8h,%2.8h" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vabdl_high_u32 (uint32x4_t a, uint32x4_t b) ++vabdl_high_u32 (uint32x4_t __a, uint32x4_t __b) + { +- uint64x2_t result; ++ uint64x2_t __result; + __asm__ ("uabdl2 %0.2d,%1.4s,%2.4s" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vabdl_s8 (int8x8_t a, int8x8_t b) ++vabdl_s8 (int8x8_t __a, int8x8_t __b) + { +- int16x8_t result; ++ int16x8_t __result; + __asm__ ("sabdl %0.8h, %1.8b, %2.8b" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vabdl_s16 (int16x4_t a, int16x4_t b) ++vabdl_s16 (int16x4_t __a, int16x4_t __b) + { +- int32x4_t result; ++ int32x4_t __result; + __asm__ ("sabdl %0.4s, %1.4h, %2.4h" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vabdl_s32 (int32x2_t a, int32x2_t b) ++vabdl_s32 (int32x2_t __a, int32x2_t __b) + { +- int64x2_t result; ++ int64x2_t __result; + __asm__ ("sabdl %0.2d, %1.2s, %2.2s" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vabdl_u8 (uint8x8_t a, uint8x8_t b) ++vabdl_u8 (uint8x8_t __a, uint8x8_t __b) + { +- uint16x8_t result; ++ uint16x8_t __result; + __asm__ 
("uabdl %0.8h, %1.8b, %2.8b" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vabdl_u16 (uint16x4_t a, uint16x4_t b) ++vabdl_u16 (uint16x4_t __a, uint16x4_t __b) + { +- uint32x4_t result; ++ uint32x4_t __result; + __asm__ ("uabdl %0.4s, %1.4h, %2.4h" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vabdl_u32 (uint32x2_t a, uint32x2_t b) ++vabdl_u32 (uint32x2_t __a, uint32x2_t __b) + { +- uint64x2_t result; ++ uint64x2_t __result; + __asm__ ("uabdl %0.2d, %1.2s, %2.2s" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vabdq_s8 (int8x16_t a, int8x16_t b) ++vabdq_s8 (int8x16_t __a, int8x16_t __b) + { +- int8x16_t result; ++ int8x16_t __result; + __asm__ ("sabd %0.16b, %1.16b, %2.16b" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vabdq_s16 (int16x8_t a, int16x8_t b) ++vabdq_s16 (int16x8_t __a, int16x8_t __b) + { +- int16x8_t result; ++ int16x8_t __result; + __asm__ ("sabd %0.8h, %1.8h, %2.8h" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vabdq_s32 (int32x4_t a, int32x4_t b) ++vabdq_s32 (int32x4_t __a, int32x4_t __b) + { +- int32x4_t result; ++ int32x4_t __result; + __asm__ ("sabd %0.4s, %1.4s, %2.4s" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vabdq_u8 (uint8x16_t a, uint8x16_t b) ++vabdq_u8 (uint8x16_t __a, uint8x16_t __b) + { +- uint8x16_t result; ++ uint8x16_t __result; + __asm__ ("uabd %0.16b, %1.16b, %2.16b" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vabdq_u16 (uint16x8_t a, uint16x8_t b) ++vabdq_u16 (uint16x8_t __a, uint16x8_t __b) + { +- uint16x8_t result; ++ uint16x8_t __result; + __asm__ ("uabd %0.8h, %1.8h, %2.8h" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vabdq_u32 (uint32x4_t a, uint32x4_t b) ++vabdq_u32 (uint32x4_t __a, uint32x4_t __b) + { +- uint32x4_t result; ++ uint32x4_t __result; + __asm__ ("uabd %0.4s, %1.4s, %2.4s" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : 
"=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vaddlv_s8 (int8x8_t a) ++vaddlv_s8 (int8x8_t __a) + { +- int16_t result; ++ int16_t __result; + __asm__ ("saddlv %h0,%1.8b" +- : "=w"(result) +- : "w"(a) ++ : "=w"(__result) ++ : "w"(__a) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int32_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vaddlv_s16 (int16x4_t a) ++vaddlv_s16 (int16x4_t __a) + { +- int32_t result; ++ int32_t __result; + __asm__ ("saddlv %s0,%1.4h" +- : "=w"(result) +- : "w"(a) ++ : "=w"(__result) ++ : "w"(__a) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vaddlv_u8 (uint8x8_t a) ++vaddlv_u8 (uint8x8_t __a) + { +- uint16_t result; ++ uint16_t __result; + __asm__ ("uaddlv %h0,%1.8b" +- : "=w"(result) +- : "w"(a) ++ : "=w"(__result) ++ : "w"(__a) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint32_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vaddlv_u16 (uint16x4_t a) ++vaddlv_u16 (uint16x4_t __a) + { +- uint32_t result; ++ uint32_t __result; + __asm__ ("uaddlv %s0,%1.4h" +- : "=w"(result) +- : "w"(a) ++ : "=w"(__result) ++ : "w"(__a) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vaddlvq_s8 (int8x16_t a) ++vaddlvq_s8 (int8x16_t __a) + { +- int16_t result; ++ int16_t __result; + __asm__ ("saddlv %h0,%1.16b" +- : "=w"(result) +- : "w"(a) ++ : "=w"(__result) ++ : "w"(__a) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int32_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vaddlvq_s16 (int16x8_t a) ++vaddlvq_s16 (int16x8_t __a) + { +- int32_t result; ++ int32_t __result; + __asm__ ("saddlv %s0,%1.8h" +- : "=w"(result) +- : "w"(a) ++ : "=w"(__result) ++ : "w"(__a) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int64_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vaddlvq_s32 (int32x4_t a) ++vaddlvq_s32 (int32x4_t __a) + { +- int64_t result; ++ int64_t __result; + __asm__ ("saddlv %d0,%1.4s" +- : "=w"(result) +- : "w"(a) ++ : "=w"(__result) ++ : "w"(__a) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vaddlvq_u8 (uint8x16_t a) ++vaddlvq_u8 (uint8x16_t __a) + { +- uint16_t result; ++ uint16_t __result; + __asm__ ("uaddlv %h0,%1.16b" +- : "=w"(result) +- : "w"(a) ++ : "=w"(__result) ++ : "w"(__a) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint32_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vaddlvq_u16 (uint16x8_t a) ++vaddlvq_u16 (uint16x8_t __a) + { +- uint32_t result; ++ uint32_t __result; + __asm__ ("uaddlv %s0,%1.8h" +- : "=w"(result) +- : "w"(a) ++ : "=w"(__result) ++ : "w"(__a) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint64_t + __attribute__ ((__always_inline__, 
__gnu_inline__, __artificial__)) +-vaddlvq_u32 (uint32x4_t a) ++vaddlvq_u32 (uint32x4_t __a) + { +- uint64_t result; ++ uint64_t __result; + __asm__ ("uaddlv %d0,%1.4s" +- : "=w"(result) +- : "w"(a) ++ : "=w"(__result) ++ : "w"(__a) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline float32x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vcvtx_f32_f64 (float64x2_t a) ++vcvtx_f32_f64 (float64x2_t __a) + { +- float32x2_t result; ++ float32x2_t __result; + __asm__ ("fcvtxn %0.2s,%1.2d" +- : "=w"(result) +- : "w"(a) ++ : "=w"(__result) ++ : "w"(__a) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline float32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vcvtx_high_f32_f64 (float32x2_t a, float64x2_t b) ++vcvtx_high_f32_f64 (float32x2_t __a, float64x2_t __b) + { +- float32x4_t result; ++ float32x4_t __result; + __asm__ ("fcvtxn2 %0.4s,%1.2d" +- : "=w"(result) +- : "w" (b), "0"(a) ++ : "=w"(__result) ++ : "w" (__b), "0"(__a) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline float32_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vcvtxd_f32_f64 (float64_t a) ++vcvtxd_f32_f64 (float64_t __a) + { +- float32_t result; ++ float32_t __result; + __asm__ ("fcvtxn %s0,%d1" +- : "=w"(result) +- : "w"(a) ++ : "=w"(__result) ++ : "w"(__a) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline float32x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmla_n_f32 (float32x2_t a, float32x2_t b, float32_t c) ++vmla_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c) + { +- float32x2_t result; +- float32x2_t t1; ++ float32x2_t __result; ++ float32x2_t __t1; + __asm__ ("fmul %1.2s, %3.2s, %4.s[0]; fadd %0.2s, %0.2s, %1.2s" +- : "=w"(result), "=w"(t1) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result), "=w"(__t1) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int16x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmla_n_s16 (int16x4_t a, int16x4_t b, int16_t c) ++vmla_n_s16 (int16x4_t __a, int16x4_t __b, int16_t __c) + { +- int16x4_t result; ++ int16x4_t __result; + __asm__ ("mla %0.4h,%2.4h,%3.h[0]" +- : "=w"(result) +- : "0"(a), "w"(b), "x"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "x"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int32x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmla_n_s32 (int32x2_t a, int32x2_t b, int32_t c) ++vmla_n_s32 (int32x2_t __a, int32x2_t __b, int32_t __c) + { +- int32x2_t result; ++ int32x2_t __result; + __asm__ ("mla %0.2s,%2.2s,%3.s[0]" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint16x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmla_n_u16 (uint16x4_t a, uint16x4_t b, uint16_t c) ++vmla_n_u16 (uint16x4_t __a, uint16x4_t __b, uint16_t __c) + { +- uint16x4_t result; ++ uint16x4_t __result; + __asm__ ("mla %0.4h,%2.4h,%3.h[0]" +- : "=w"(result) +- : "0"(a), "w"(b), "x"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "x"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + 
__extension__ extern __inline uint32x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmla_n_u32 (uint32x2_t a, uint32x2_t b, uint32_t c) ++vmla_n_u32 (uint32x2_t __a, uint32x2_t __b, uint32_t __c) + { +- uint32x2_t result; ++ uint32x2_t __result; + __asm__ ("mla %0.2s,%2.2s,%3.s[0]" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmla_s8 (int8x8_t a, int8x8_t b, int8x8_t c) ++vmla_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c) + { +- int8x8_t result; ++ int8x8_t __result; + __asm__ ("mla %0.8b, %2.8b, %3.8b" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int16x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmla_s16 (int16x4_t a, int16x4_t b, int16x4_t c) ++vmla_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c) + { +- int16x4_t result; ++ int16x4_t __result; + __asm__ ("mla %0.4h, %2.4h, %3.4h" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int32x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmla_s32 (int32x2_t a, int32x2_t b, int32x2_t c) ++vmla_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c) + { +- int32x2_t result; ++ int32x2_t __result; + __asm__ ("mla %0.2s, %2.2s, %3.2s" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmla_u8 (uint8x8_t a, uint8x8_t b, uint8x8_t c) ++vmla_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c) + { +- uint8x8_t result; ++ uint8x8_t __result; + __asm__ ("mla %0.8b, %2.8b, %3.8b" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint16x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmla_u16 (uint16x4_t a, uint16x4_t b, uint16x4_t c) ++vmla_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c) + { +- uint16x4_t result; ++ uint16x4_t __result; + __asm__ ("mla %0.4h, %2.4h, %3.4h" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint32x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmla_u32 (uint32x2_t a, uint32x2_t b, uint32x2_t c) ++vmla_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c) + { +- uint32x2_t result; ++ uint32x2_t __result; + __asm__ ("mla %0.2s, %2.2s, %3.2s" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + #define vmlal_high_lane_s16(a, b, c, d) \ +@@ -7549,122 +7582,122 @@ vmla_u32 (uint32x2_t a, uint32x2_t b, uint32x2_t c) + + __extension__ extern __inline int32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, 
__artificial__)) +-vmlal_high_n_s16 (int32x4_t a, int16x8_t b, int16_t c) ++vmlal_high_n_s16 (int32x4_t __a, int16x8_t __b, int16_t __c) + { +- int32x4_t result; ++ int32x4_t __result; + __asm__ ("smlal2 %0.4s,%2.8h,%3.h[0]" +- : "=w"(result) +- : "0"(a), "w"(b), "x"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "x"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlal_high_n_s32 (int64x2_t a, int32x4_t b, int32_t c) ++vmlal_high_n_s32 (int64x2_t __a, int32x4_t __b, int32_t __c) + { +- int64x2_t result; ++ int64x2_t __result; + __asm__ ("smlal2 %0.2d,%2.4s,%3.s[0]" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlal_high_n_u16 (uint32x4_t a, uint16x8_t b, uint16_t c) ++vmlal_high_n_u16 (uint32x4_t __a, uint16x8_t __b, uint16_t __c) + { +- uint32x4_t result; ++ uint32x4_t __result; + __asm__ ("umlal2 %0.4s,%2.8h,%3.h[0]" +- : "=w"(result) +- : "0"(a), "w"(b), "x"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "x"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlal_high_n_u32 (uint64x2_t a, uint32x4_t b, uint32_t c) ++vmlal_high_n_u32 (uint64x2_t __a, uint32x4_t __b, uint32_t __c) + { +- uint64x2_t result; ++ uint64x2_t __result; + __asm__ ("umlal2 %0.2d,%2.4s,%3.s[0]" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlal_high_s8 (int16x8_t a, int8x16_t b, int8x16_t c) ++vmlal_high_s8 (int16x8_t __a, int8x16_t __b, int8x16_t __c) + { +- int16x8_t result; ++ int16x8_t __result; + __asm__ ("smlal2 %0.8h,%2.16b,%3.16b" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlal_high_s16 (int32x4_t a, int16x8_t b, int16x8_t c) ++vmlal_high_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c) + { +- int32x4_t result; ++ int32x4_t __result; + __asm__ ("smlal2 %0.4s,%2.8h,%3.8h" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlal_high_s32 (int64x2_t a, int32x4_t b, int32x4_t c) ++vmlal_high_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c) + { +- int64x2_t result; ++ int64x2_t __result; + __asm__ ("smlal2 %0.2d,%2.4s,%3.4s" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlal_high_u8 (uint16x8_t a, uint8x16_t b, uint8x16_t c) ++vmlal_high_u8 (uint16x8_t __a, uint8x16_t __b, 
uint8x16_t __c) + { +- uint16x8_t result; ++ uint16x8_t __result; + __asm__ ("umlal2 %0.8h,%2.16b,%3.16b" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlal_high_u16 (uint32x4_t a, uint16x8_t b, uint16x8_t c) ++vmlal_high_u16 (uint32x4_t __a, uint16x8_t __b, uint16x8_t __c) + { +- uint32x4_t result; ++ uint32x4_t __result; + __asm__ ("umlal2 %0.4s,%2.8h,%3.8h" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlal_high_u32 (uint64x2_t a, uint32x4_t b, uint32x4_t c) ++vmlal_high_u32 (uint64x2_t __a, uint32x4_t __b, uint32x4_t __c) + { +- uint64x2_t result; ++ uint64x2_t __result; + __asm__ ("umlal2 %0.2d,%2.4s,%3.4s" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + #define vmlal_lane_s16(a, b, c, d) \ +@@ -7781,388 +7814,388 @@ vmlal_high_u32 (uint64x2_t a, uint32x4_t b, uint32x4_t c) + + __extension__ extern __inline int32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlal_n_s16 (int32x4_t a, int16x4_t b, int16_t c) ++vmlal_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c) + { +- int32x4_t result; ++ int32x4_t __result; + __asm__ ("smlal %0.4s,%2.4h,%3.h[0]" +- : "=w"(result) +- : "0"(a), "w"(b), "x"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "x"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlal_n_s32 (int64x2_t a, int32x2_t b, int32_t c) ++vmlal_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c) + { +- int64x2_t result; ++ int64x2_t __result; + __asm__ ("smlal %0.2d,%2.2s,%3.s[0]" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlal_n_u16 (uint32x4_t a, uint16x4_t b, uint16_t c) ++vmlal_n_u16 (uint32x4_t __a, uint16x4_t __b, uint16_t __c) + { +- uint32x4_t result; ++ uint32x4_t __result; + __asm__ ("umlal %0.4s,%2.4h,%3.h[0]" +- : "=w"(result) +- : "0"(a), "w"(b), "x"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "x"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlal_n_u32 (uint64x2_t a, uint32x2_t b, uint32_t c) ++vmlal_n_u32 (uint64x2_t __a, uint32x2_t __b, uint32_t __c) + { +- uint64x2_t result; ++ uint64x2_t __result; + __asm__ ("umlal %0.2d,%2.2s,%3.s[0]" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlal_s8 (int16x8_t a, int8x8_t b, int8x8_t c) ++vmlal_s8 (int16x8_t __a, int8x8_t __b, int8x8_t __c) + { +- int16x8_t 
result; ++ int16x8_t __result; + __asm__ ("smlal %0.8h,%2.8b,%3.8b" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlal_s16 (int32x4_t a, int16x4_t b, int16x4_t c) ++vmlal_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c) + { +- int32x4_t result; ++ int32x4_t __result; + __asm__ ("smlal %0.4s,%2.4h,%3.4h" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlal_s32 (int64x2_t a, int32x2_t b, int32x2_t c) ++vmlal_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c) + { +- int64x2_t result; ++ int64x2_t __result; + __asm__ ("smlal %0.2d,%2.2s,%3.2s" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlal_u8 (uint16x8_t a, uint8x8_t b, uint8x8_t c) ++vmlal_u8 (uint16x8_t __a, uint8x8_t __b, uint8x8_t __c) + { +- uint16x8_t result; ++ uint16x8_t __result; + __asm__ ("umlal %0.8h,%2.8b,%3.8b" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlal_u16 (uint32x4_t a, uint16x4_t b, uint16x4_t c) ++vmlal_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c) + { +- uint32x4_t result; ++ uint32x4_t __result; + __asm__ ("umlal %0.4s,%2.4h,%3.4h" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlal_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c) ++vmlal_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c) + { +- uint64x2_t result; ++ uint64x2_t __result; + __asm__ ("umlal %0.2d,%2.2s,%3.2s" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline float32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlaq_n_f32 (float32x4_t a, float32x4_t b, float32_t c) ++vmlaq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c) + { +- float32x4_t result; +- float32x4_t t1; ++ float32x4_t __result; ++ float32x4_t __t1; + __asm__ ("fmul %1.4s, %3.4s, %4.s[0]; fadd %0.4s, %0.4s, %1.4s" +- : "=w"(result), "=w"(t1) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result), "=w"(__t1) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlaq_n_s16 (int16x8_t a, int16x8_t b, int16_t c) ++vmlaq_n_s16 (int16x8_t __a, int16x8_t __b, int16_t __c) + { +- int16x8_t result; ++ int16x8_t __result; + __asm__ ("mla %0.8h,%2.8h,%3.h[0]" +- : "=w"(result) +- : "0"(a), 
"w"(b), "x"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "x"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlaq_n_s32 (int32x4_t a, int32x4_t b, int32_t c) ++vmlaq_n_s32 (int32x4_t __a, int32x4_t __b, int32_t __c) + { +- int32x4_t result; ++ int32x4_t __result; + __asm__ ("mla %0.4s,%2.4s,%3.s[0]" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlaq_n_u16 (uint16x8_t a, uint16x8_t b, uint16_t c) ++vmlaq_n_u16 (uint16x8_t __a, uint16x8_t __b, uint16_t __c) + { +- uint16x8_t result; ++ uint16x8_t __result; + __asm__ ("mla %0.8h,%2.8h,%3.h[0]" +- : "=w"(result) +- : "0"(a), "w"(b), "x"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "x"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlaq_n_u32 (uint32x4_t a, uint32x4_t b, uint32_t c) ++vmlaq_n_u32 (uint32x4_t __a, uint32x4_t __b, uint32_t __c) + { +- uint32x4_t result; ++ uint32x4_t __result; + __asm__ ("mla %0.4s,%2.4s,%3.s[0]" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlaq_s8 (int8x16_t a, int8x16_t b, int8x16_t c) ++vmlaq_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c) + { +- int8x16_t result; ++ int8x16_t __result; + __asm__ ("mla %0.16b, %2.16b, %3.16b" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlaq_s16 (int16x8_t a, int16x8_t b, int16x8_t c) ++vmlaq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c) + { +- int16x8_t result; ++ int16x8_t __result; + __asm__ ("mla %0.8h, %2.8h, %3.8h" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlaq_s32 (int32x4_t a, int32x4_t b, int32x4_t c) ++vmlaq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c) + { +- int32x4_t result; ++ int32x4_t __result; + __asm__ ("mla %0.4s, %2.4s, %3.4s" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlaq_u8 (uint8x16_t a, uint8x16_t b, uint8x16_t c) ++vmlaq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c) + { +- uint8x16_t result; ++ uint8x16_t __result; + __asm__ ("mla %0.16b, %2.16b, %3.16b" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint16x8_t + __attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) +-vmlaq_u16 (uint16x8_t a, uint16x8_t b, uint16x8_t c) ++vmlaq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c) + { +- uint16x8_t result; ++ uint16x8_t __result; + __asm__ ("mla %0.8h, %2.8h, %3.8h" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlaq_u32 (uint32x4_t a, uint32x4_t b, uint32x4_t c) ++vmlaq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c) + { +- uint32x4_t result; ++ uint32x4_t __result; + __asm__ ("mla %0.4s, %2.4s, %3.4s" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline float32x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmls_n_f32 (float32x2_t a, float32x2_t b, float32_t c) ++vmls_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c) + { +- float32x2_t result; +- float32x2_t t1; ++ float32x2_t __result; ++ float32x2_t __t1; + __asm__ ("fmul %1.2s, %3.2s, %4.s[0]; fsub %0.2s, %0.2s, %1.2s" +- : "=w"(result), "=w"(t1) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result), "=w"(__t1) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int16x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmls_n_s16 (int16x4_t a, int16x4_t b, int16_t c) ++vmls_n_s16 (int16x4_t __a, int16x4_t __b, int16_t __c) + { +- int16x4_t result; ++ int16x4_t __result; + __asm__ ("mls %0.4h, %2.4h, %3.h[0]" +- : "=w"(result) +- : "0"(a), "w"(b), "x"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "x"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int32x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmls_n_s32 (int32x2_t a, int32x2_t b, int32_t c) ++vmls_n_s32 (int32x2_t __a, int32x2_t __b, int32_t __c) + { +- int32x2_t result; ++ int32x2_t __result; + __asm__ ("mls %0.2s, %2.2s, %3.s[0]" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint16x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmls_n_u16 (uint16x4_t a, uint16x4_t b, uint16_t c) ++vmls_n_u16 (uint16x4_t __a, uint16x4_t __b, uint16_t __c) + { +- uint16x4_t result; ++ uint16x4_t __result; + __asm__ ("mls %0.4h, %2.4h, %3.h[0]" +- : "=w"(result) +- : "0"(a), "w"(b), "x"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "x"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint32x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmls_n_u32 (uint32x2_t a, uint32x2_t b, uint32_t c) ++vmls_n_u32 (uint32x2_t __a, uint32x2_t __b, uint32_t __c) + { +- uint32x2_t result; ++ uint32x2_t __result; + __asm__ ("mls %0.2s, %2.2s, %3.s[0]" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmls_s8 (int8x8_t a, int8x8_t b, 
int8x8_t c) ++vmls_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c) + { +- int8x8_t result; ++ int8x8_t __result; + __asm__ ("mls %0.8b,%2.8b,%3.8b" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int16x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmls_s16 (int16x4_t a, int16x4_t b, int16x4_t c) ++vmls_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c) + { +- int16x4_t result; ++ int16x4_t __result; + __asm__ ("mls %0.4h,%2.4h,%3.4h" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int32x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmls_s32 (int32x2_t a, int32x2_t b, int32x2_t c) ++vmls_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c) + { +- int32x2_t result; ++ int32x2_t __result; + __asm__ ("mls %0.2s,%2.2s,%3.2s" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmls_u8 (uint8x8_t a, uint8x8_t b, uint8x8_t c) ++vmls_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c) + { +- uint8x8_t result; ++ uint8x8_t __result; + __asm__ ("mls %0.8b,%2.8b,%3.8b" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint16x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmls_u16 (uint16x4_t a, uint16x4_t b, uint16x4_t c) ++vmls_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c) + { +- uint16x4_t result; ++ uint16x4_t __result; + __asm__ ("mls %0.4h,%2.4h,%3.4h" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint32x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmls_u32 (uint32x2_t a, uint32x2_t b, uint32x2_t c) ++vmls_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c) + { +- uint32x2_t result; ++ uint32x2_t __result; + __asm__ ("mls %0.2s,%2.2s,%3.2s" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + #define vmlsl_high_lane_s16(a, b, c, d) \ +@@ -8279,122 +8312,122 @@ vmls_u32 (uint32x2_t a, uint32x2_t b, uint32x2_t c) + + __extension__ extern __inline int32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlsl_high_n_s16 (int32x4_t a, int16x8_t b, int16_t c) ++vmlsl_high_n_s16 (int32x4_t __a, int16x8_t __b, int16_t __c) + { +- int32x4_t result; ++ int32x4_t __result; + __asm__ ("smlsl2 %0.4s, %2.8h, %3.h[0]" +- : "=w"(result) +- : "0"(a), "w"(b), "x"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "x"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlsl_high_n_s32 (int64x2_t a, int32x4_t b, int32_t c) ++vmlsl_high_n_s32 (int64x2_t __a, int32x4_t __b, int32_t __c) + { +- int64x2_t 
result; ++ int64x2_t __result; + __asm__ ("smlsl2 %0.2d, %2.4s, %3.s[0]" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlsl_high_n_u16 (uint32x4_t a, uint16x8_t b, uint16_t c) ++vmlsl_high_n_u16 (uint32x4_t __a, uint16x8_t __b, uint16_t __c) + { +- uint32x4_t result; ++ uint32x4_t __result; + __asm__ ("umlsl2 %0.4s, %2.8h, %3.h[0]" +- : "=w"(result) +- : "0"(a), "w"(b), "x"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "x"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlsl_high_n_u32 (uint64x2_t a, uint32x4_t b, uint32_t c) ++vmlsl_high_n_u32 (uint64x2_t __a, uint32x4_t __b, uint32_t __c) + { +- uint64x2_t result; ++ uint64x2_t __result; + __asm__ ("umlsl2 %0.2d, %2.4s, %3.s[0]" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlsl_high_s8 (int16x8_t a, int8x16_t b, int8x16_t c) ++vmlsl_high_s8 (int16x8_t __a, int8x16_t __b, int8x16_t __c) + { +- int16x8_t result; ++ int16x8_t __result; + __asm__ ("smlsl2 %0.8h,%2.16b,%3.16b" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlsl_high_s16 (int32x4_t a, int16x8_t b, int16x8_t c) ++vmlsl_high_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c) + { +- int32x4_t result; ++ int32x4_t __result; + __asm__ ("smlsl2 %0.4s,%2.8h,%3.8h" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlsl_high_s32 (int64x2_t a, int32x4_t b, int32x4_t c) ++vmlsl_high_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c) + { +- int64x2_t result; ++ int64x2_t __result; + __asm__ ("smlsl2 %0.2d,%2.4s,%3.4s" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlsl_high_u8 (uint16x8_t a, uint8x16_t b, uint8x16_t c) ++vmlsl_high_u8 (uint16x8_t __a, uint8x16_t __b, uint8x16_t __c) + { +- uint16x8_t result; ++ uint16x8_t __result; + __asm__ ("umlsl2 %0.8h,%2.16b,%3.16b" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlsl_high_u16 (uint32x4_t a, uint16x8_t b, uint16x8_t c) ++vmlsl_high_u16 (uint32x4_t __a, uint16x8_t __b, uint16x8_t __c) + { +- uint32x4_t result; ++ uint32x4_t __result; + __asm__ ("umlsl2 %0.4s,%2.8h,%3.8h" +- : "=w"(result) +- : "0"(a), 
"w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlsl_high_u32 (uint64x2_t a, uint32x4_t b, uint32x4_t c) ++vmlsl_high_u32 (uint64x2_t __a, uint32x4_t __b, uint32x4_t __c) + { +- uint64x2_t result; ++ uint64x2_t __result; + __asm__ ("umlsl2 %0.2d,%2.4s,%3.4s" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + #define vmlsl_lane_s16(a, b, c, d) \ +@@ -8511,543 +8544,543 @@ vmlsl_high_u32 (uint64x2_t a, uint32x4_t b, uint32x4_t c) + + __extension__ extern __inline int32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlsl_n_s16 (int32x4_t a, int16x4_t b, int16_t c) ++vmlsl_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c) + { +- int32x4_t result; ++ int32x4_t __result; + __asm__ ("smlsl %0.4s, %2.4h, %3.h[0]" +- : "=w"(result) +- : "0"(a), "w"(b), "x"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "x"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlsl_n_s32 (int64x2_t a, int32x2_t b, int32_t c) ++vmlsl_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c) + { +- int64x2_t result; ++ int64x2_t __result; + __asm__ ("smlsl %0.2d, %2.2s, %3.s[0]" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlsl_n_u16 (uint32x4_t a, uint16x4_t b, uint16_t c) ++vmlsl_n_u16 (uint32x4_t __a, uint16x4_t __b, uint16_t __c) + { +- uint32x4_t result; ++ uint32x4_t __result; + __asm__ ("umlsl %0.4s, %2.4h, %3.h[0]" +- : "=w"(result) +- : "0"(a), "w"(b), "x"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "x"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlsl_n_u32 (uint64x2_t a, uint32x2_t b, uint32_t c) ++vmlsl_n_u32 (uint64x2_t __a, uint32x2_t __b, uint32_t __c) + { +- uint64x2_t result; ++ uint64x2_t __result; + __asm__ ("umlsl %0.2d, %2.2s, %3.s[0]" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlsl_s8 (int16x8_t a, int8x8_t b, int8x8_t c) ++vmlsl_s8 (int16x8_t __a, int8x8_t __b, int8x8_t __c) + { +- int16x8_t result; ++ int16x8_t __result; + __asm__ ("smlsl %0.8h, %2.8b, %3.8b" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlsl_s16 (int32x4_t a, int16x4_t b, int16x4_t c) ++vmlsl_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c) + { +- int32x4_t result; ++ int32x4_t __result; + __asm__ ("smlsl %0.4s, %2.4h, %3.4h" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), 
"w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlsl_s32 (int64x2_t a, int32x2_t b, int32x2_t c) ++vmlsl_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c) + { +- int64x2_t result; ++ int64x2_t __result; + __asm__ ("smlsl %0.2d, %2.2s, %3.2s" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlsl_u8 (uint16x8_t a, uint8x8_t b, uint8x8_t c) ++vmlsl_u8 (uint16x8_t __a, uint8x8_t __b, uint8x8_t __c) + { +- uint16x8_t result; ++ uint16x8_t __result; + __asm__ ("umlsl %0.8h, %2.8b, %3.8b" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlsl_u16 (uint32x4_t a, uint16x4_t b, uint16x4_t c) ++vmlsl_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c) + { +- uint32x4_t result; ++ uint32x4_t __result; + __asm__ ("umlsl %0.4s, %2.4h, %3.4h" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlsl_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c) ++vmlsl_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c) + { +- uint64x2_t result; ++ uint64x2_t __result; + __asm__ ("umlsl %0.2d, %2.2s, %3.2s" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline float32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlsq_n_f32 (float32x4_t a, float32x4_t b, float32_t c) ++vmlsq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c) + { +- float32x4_t result; +- float32x4_t t1; ++ float32x4_t __result; ++ float32x4_t __t1; + __asm__ ("fmul %1.4s, %3.4s, %4.s[0]; fsub %0.4s, %0.4s, %1.4s" +- : "=w"(result), "=w"(t1) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result), "=w"(__t1) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlsq_n_s16 (int16x8_t a, int16x8_t b, int16_t c) ++vmlsq_n_s16 (int16x8_t __a, int16x8_t __b, int16_t __c) + { +- int16x8_t result; ++ int16x8_t __result; + __asm__ ("mls %0.8h, %2.8h, %3.h[0]" +- : "=w"(result) +- : "0"(a), "w"(b), "x"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "x"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlsq_n_s32 (int32x4_t a, int32x4_t b, int32_t c) ++vmlsq_n_s32 (int32x4_t __a, int32x4_t __b, int32_t __c) + { +- int32x4_t result; ++ int32x4_t __result; + __asm__ ("mls %0.4s, %2.4s, %3.s[0]" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + 
+ __extension__ extern __inline uint16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlsq_n_u16 (uint16x8_t a, uint16x8_t b, uint16_t c) ++vmlsq_n_u16 (uint16x8_t __a, uint16x8_t __b, uint16_t __c) + { +- uint16x8_t result; ++ uint16x8_t __result; + __asm__ ("mls %0.8h, %2.8h, %3.h[0]" +- : "=w"(result) +- : "0"(a), "w"(b), "x"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "x"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlsq_n_u32 (uint32x4_t a, uint32x4_t b, uint32_t c) ++vmlsq_n_u32 (uint32x4_t __a, uint32x4_t __b, uint32_t __c) + { +- uint32x4_t result; ++ uint32x4_t __result; + __asm__ ("mls %0.4s, %2.4s, %3.s[0]" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlsq_s8 (int8x16_t a, int8x16_t b, int8x16_t c) ++vmlsq_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c) + { +- int8x16_t result; ++ int8x16_t __result; + __asm__ ("mls %0.16b,%2.16b,%3.16b" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlsq_s16 (int16x8_t a, int16x8_t b, int16x8_t c) ++vmlsq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c) + { +- int16x8_t result; ++ int16x8_t __result; + __asm__ ("mls %0.8h,%2.8h,%3.8h" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlsq_s32 (int32x4_t a, int32x4_t b, int32x4_t c) ++vmlsq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c) + { +- int32x4_t result; ++ int32x4_t __result; + __asm__ ("mls %0.4s,%2.4s,%3.4s" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlsq_u8 (uint8x16_t a, uint8x16_t b, uint8x16_t c) ++vmlsq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c) + { +- uint8x16_t result; ++ uint8x16_t __result; + __asm__ ("mls %0.16b,%2.16b,%3.16b" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlsq_u16 (uint16x8_t a, uint16x8_t b, uint16x8_t c) ++vmlsq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c) + { +- uint16x8_t result; ++ uint16x8_t __result; + __asm__ ("mls %0.8h,%2.8h,%3.8h" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlsq_u32 (uint32x4_t a, uint32x4_t b, uint32x4_t c) ++vmlsq_u32 (uint32x4_t __a, 
uint32x4_t __b, uint32x4_t __c) + { +- uint32x4_t result; ++ uint32x4_t __result; + __asm__ ("mls %0.4s,%2.4s,%3.4s" +- : "=w"(result) +- : "0"(a), "w"(b), "w"(c) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b), "w"(__c) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmovl_high_s8 (int8x16_t a) ++vmovl_high_s8 (int8x16_t __a) + { +- int16x8_t result; ++ int16x8_t __result; + __asm__ ("sshll2 %0.8h,%1.16b,#0" +- : "=w"(result) +- : "w"(a) ++ : "=w"(__result) ++ : "w"(__a) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmovl_high_s16 (int16x8_t a) ++vmovl_high_s16 (int16x8_t __a) + { +- int32x4_t result; ++ int32x4_t __result; + __asm__ ("sshll2 %0.4s,%1.8h,#0" +- : "=w"(result) +- : "w"(a) ++ : "=w"(__result) ++ : "w"(__a) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmovl_high_s32 (int32x4_t a) ++vmovl_high_s32 (int32x4_t __a) + { +- int64x2_t result; ++ int64x2_t __result; + __asm__ ("sshll2 %0.2d,%1.4s,#0" +- : "=w"(result) +- : "w"(a) ++ : "=w"(__result) ++ : "w"(__a) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmovl_high_u8 (uint8x16_t a) ++vmovl_high_u8 (uint8x16_t __a) + { +- uint16x8_t result; ++ uint16x8_t __result; + __asm__ ("ushll2 %0.8h,%1.16b,#0" +- : "=w"(result) +- : "w"(a) ++ : "=w"(__result) ++ : "w"(__a) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmovl_high_u16 (uint16x8_t a) ++vmovl_high_u16 (uint16x8_t __a) + { +- uint32x4_t result; ++ uint32x4_t __result; + __asm__ ("ushll2 %0.4s,%1.8h,#0" +- : "=w"(result) +- : "w"(a) ++ : "=w"(__result) ++ : "w"(__a) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmovl_high_u32 (uint32x4_t a) ++vmovl_high_u32 (uint32x4_t __a) + { +- uint64x2_t result; ++ uint64x2_t __result; + __asm__ ("ushll2 %0.2d,%1.4s,#0" +- : "=w"(result) +- : "w"(a) ++ : "=w"(__result) ++ : "w"(__a) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmovl_s8 (int8x8_t a) ++vmovl_s8 (int8x8_t __a) + { +- int16x8_t result; ++ int16x8_t __result; + __asm__ ("sshll %0.8h,%1.8b,#0" +- : "=w"(result) +- : "w"(a) ++ : "=w"(__result) ++ : "w"(__a) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmovl_s16 (int16x4_t a) ++vmovl_s16 (int16x4_t __a) + { +- int32x4_t result; ++ int32x4_t __result; + __asm__ ("sshll %0.4s,%1.4h,#0" +- : "=w"(result) +- : "w"(a) ++ : "=w"(__result) ++ : "w"(__a) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmovl_s32 (int32x2_t a) ++vmovl_s32 
(int32x2_t __a) + { +- int64x2_t result; ++ int64x2_t __result; + __asm__ ("sshll %0.2d,%1.2s,#0" +- : "=w"(result) +- : "w"(a) ++ : "=w"(__result) ++ : "w"(__a) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmovl_u8 (uint8x8_t a) ++vmovl_u8 (uint8x8_t __a) + { +- uint16x8_t result; ++ uint16x8_t __result; + __asm__ ("ushll %0.8h,%1.8b,#0" +- : "=w"(result) +- : "w"(a) ++ : "=w"(__result) ++ : "w"(__a) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmovl_u16 (uint16x4_t a) ++vmovl_u16 (uint16x4_t __a) + { +- uint32x4_t result; ++ uint32x4_t __result; + __asm__ ("ushll %0.4s,%1.4h,#0" +- : "=w"(result) +- : "w"(a) ++ : "=w"(__result) ++ : "w"(__a) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmovl_u32 (uint32x2_t a) ++vmovl_u32 (uint32x2_t __a) + { +- uint64x2_t result; ++ uint64x2_t __result; + __asm__ ("ushll %0.2d,%1.2s,#0" +- : "=w"(result) +- : "w"(a) ++ : "=w"(__result) ++ : "w"(__a) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmovn_high_s16 (int8x8_t a, int16x8_t b) ++vmovn_high_s16 (int8x8_t __a, int16x8_t __b) + { +- int8x16_t result = vcombine_s8 (a, vcreate_s8 (__AARCH64_UINT64_C (0x0))); ++ int8x16_t __result = vcombine_s8 (__a, vcreate_s8 (__AARCH64_UINT64_C (0x0))); + __asm__ ("xtn2 %0.16b,%1.8h" +- : "+w"(result) +- : "w"(b) ++ : "+w"(__result) ++ : "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmovn_high_s32 (int16x4_t a, int32x4_t b) ++vmovn_high_s32 (int16x4_t __a, int32x4_t __b) + { +- int16x8_t result = vcombine_s16 (a, vcreate_s16 (__AARCH64_UINT64_C (0x0))); ++ int16x8_t __result = vcombine_s16 (__a, vcreate_s16 (__AARCH64_UINT64_C (0x0))); + __asm__ ("xtn2 %0.8h,%1.4s" +- : "+w"(result) +- : "w"(b) ++ : "+w"(__result) ++ : "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmovn_high_s64 (int32x2_t a, int64x2_t b) ++vmovn_high_s64 (int32x2_t __a, int64x2_t __b) + { +- int32x4_t result = vcombine_s32 (a, vcreate_s32 (__AARCH64_UINT64_C (0x0))); ++ int32x4_t __result = vcombine_s32 (__a, vcreate_s32 (__AARCH64_UINT64_C (0x0))); + __asm__ ("xtn2 %0.4s,%1.2d" +- : "+w"(result) +- : "w"(b) ++ : "+w"(__result) ++ : "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmovn_high_u16 (uint8x8_t a, uint16x8_t b) ++vmovn_high_u16 (uint8x8_t __a, uint16x8_t __b) + { +- uint8x16_t result = vcombine_u8 (a, vcreate_u8 (__AARCH64_UINT64_C (0x0))); ++ uint8x16_t __result = vcombine_u8 (__a, vcreate_u8 (__AARCH64_UINT64_C (0x0))); + __asm__ ("xtn2 %0.16b,%1.8h" +- : "+w"(result) +- : "w"(b) ++ : "+w"(__result) ++ : "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline 
uint16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmovn_high_u32 (uint16x4_t a, uint32x4_t b) ++vmovn_high_u32 (uint16x4_t __a, uint32x4_t __b) + { +- uint16x8_t result = vcombine_u16 (a, vcreate_u16 (__AARCH64_UINT64_C (0x0))); ++ uint16x8_t __result = vcombine_u16 (__a, vcreate_u16 (__AARCH64_UINT64_C (0x0))); + __asm__ ("xtn2 %0.8h,%1.4s" +- : "+w"(result) +- : "w"(b) ++ : "+w"(__result) ++ : "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmovn_high_u64 (uint32x2_t a, uint64x2_t b) ++vmovn_high_u64 (uint32x2_t __a, uint64x2_t __b) + { +- uint32x4_t result = vcombine_u32 (a, vcreate_u32 (__AARCH64_UINT64_C (0x0))); ++ uint32x4_t __result = vcombine_u32 (__a, vcreate_u32 (__AARCH64_UINT64_C (0x0))); + __asm__ ("xtn2 %0.4s,%1.2d" +- : "+w"(result) +- : "w"(b) ++ : "+w"(__result) ++ : "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmovn_s16 (int16x8_t a) ++vmovn_s16 (int16x8_t __a) + { +- int8x8_t result; ++ int8x8_t __result; + __asm__ ("xtn %0.8b,%1.8h" +- : "=w"(result) +- : "w"(a) ++ : "=w"(__result) ++ : "w"(__a) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int16x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmovn_s32 (int32x4_t a) ++vmovn_s32 (int32x4_t __a) + { +- int16x4_t result; ++ int16x4_t __result; + __asm__ ("xtn %0.4h,%1.4s" +- : "=w"(result) +- : "w"(a) ++ : "=w"(__result) ++ : "w"(__a) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int32x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmovn_s64 (int64x2_t a) ++vmovn_s64 (int64x2_t __a) + { +- int32x2_t result; ++ int32x2_t __result; + __asm__ ("xtn %0.2s,%1.2d" +- : "=w"(result) +- : "w"(a) ++ : "=w"(__result) ++ : "w"(__a) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmovn_u16 (uint16x8_t a) ++vmovn_u16 (uint16x8_t __a) + { +- uint8x8_t result; ++ uint8x8_t __result; + __asm__ ("xtn %0.8b,%1.8h" +- : "=w"(result) +- : "w"(a) ++ : "=w"(__result) ++ : "w"(__a) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint16x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmovn_u32 (uint32x4_t a) ++vmovn_u32 (uint32x4_t __a) + { +- uint16x4_t result; ++ uint16x4_t __result; + __asm__ ("xtn %0.4h,%1.4s" +- : "=w"(result) +- : "w"(a) ++ : "=w"(__result) ++ : "w"(__a) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint32x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmovn_u64 (uint64x2_t a) ++vmovn_u64 (uint64x2_t __a) + { +- uint32x2_t result; ++ uint32x2_t __result; + __asm__ ("xtn %0.2s,%1.2d" +- : "=w"(result) +- : "w"(a) ++ : "=w"(__result) ++ : "w"(__a) + : /* No clobbers */); +- return result; ++ return __result; + } + + #define vmull_high_lane_s16(a, b, c) \ +@@ -9156,134 +9189,134 @@ vmovn_u64 (uint64x2_t a) + + __extension__ extern __inline int32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmull_high_n_s16 (int16x8_t a, 
int16_t b) ++vmull_high_n_s16 (int16x8_t __a, int16_t __b) + { +- int32x4_t result; ++ int32x4_t __result; + __asm__ ("smull2 %0.4s,%1.8h,%2.h[0]" +- : "=w"(result) +- : "w"(a), "x"(b) ++ : "=w"(__result) ++ : "w"(__a), "x"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmull_high_n_s32 (int32x4_t a, int32_t b) ++vmull_high_n_s32 (int32x4_t __a, int32_t __b) + { +- int64x2_t result; ++ int64x2_t __result; + __asm__ ("smull2 %0.2d,%1.4s,%2.s[0]" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmull_high_n_u16 (uint16x8_t a, uint16_t b) ++vmull_high_n_u16 (uint16x8_t __a, uint16_t __b) + { +- uint32x4_t result; ++ uint32x4_t __result; + __asm__ ("umull2 %0.4s,%1.8h,%2.h[0]" +- : "=w"(result) +- : "w"(a), "x"(b) ++ : "=w"(__result) ++ : "w"(__a), "x"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmull_high_n_u32 (uint32x4_t a, uint32_t b) ++vmull_high_n_u32 (uint32x4_t __a, uint32_t __b) + { +- uint64x2_t result; ++ uint64x2_t __result; + __asm__ ("umull2 %0.2d,%1.4s,%2.s[0]" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline poly16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmull_high_p8 (poly8x16_t a, poly8x16_t b) ++vmull_high_p8 (poly8x16_t __a, poly8x16_t __b) + { +- poly16x8_t result; ++ poly16x8_t __result; + __asm__ ("pmull2 %0.8h,%1.16b,%2.16b" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmull_high_s8 (int8x16_t a, int8x16_t b) ++vmull_high_s8 (int8x16_t __a, int8x16_t __b) + { +- int16x8_t result; ++ int16x8_t __result; + __asm__ ("smull2 %0.8h,%1.16b,%2.16b" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmull_high_s16 (int16x8_t a, int16x8_t b) ++vmull_high_s16 (int16x8_t __a, int16x8_t __b) + { +- int32x4_t result; ++ int32x4_t __result; + __asm__ ("smull2 %0.4s,%1.8h,%2.8h" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmull_high_s32 (int32x4_t a, int32x4_t b) ++vmull_high_s32 (int32x4_t __a, int32x4_t __b) + { +- int64x2_t result; ++ int64x2_t __result; + __asm__ ("smull2 %0.2d,%1.4s,%2.4s" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmull_high_u8 (uint8x16_t a, 
uint8x16_t b) ++vmull_high_u8 (uint8x16_t __a, uint8x16_t __b) + { +- uint16x8_t result; ++ uint16x8_t __result; + __asm__ ("umull2 %0.8h,%1.16b,%2.16b" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmull_high_u16 (uint16x8_t a, uint16x8_t b) ++vmull_high_u16 (uint16x8_t __a, uint16x8_t __b) + { +- uint32x4_t result; ++ uint32x4_t __result; + __asm__ ("umull2 %0.4s,%1.8h,%2.8h" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmull_high_u32 (uint32x4_t a, uint32x4_t b) ++vmull_high_u32 (uint32x4_t __a, uint32x4_t __b) + { +- uint64x2_t result; ++ uint64x2_t __result; + __asm__ ("umull2 %0.2d,%1.4s,%2.4s" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + #define vmull_lane_s16(a, b, c) \ +@@ -9392,722 +9425,722 @@ vmull_high_u32 (uint32x4_t a, uint32x4_t b) + + __extension__ extern __inline int32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmull_n_s16 (int16x4_t a, int16_t b) ++vmull_n_s16 (int16x4_t __a, int16_t __b) + { +- int32x4_t result; ++ int32x4_t __result; + __asm__ ("smull %0.4s,%1.4h,%2.h[0]" +- : "=w"(result) +- : "w"(a), "x"(b) ++ : "=w"(__result) ++ : "w"(__a), "x"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmull_n_s32 (int32x2_t a, int32_t b) ++vmull_n_s32 (int32x2_t __a, int32_t __b) + { +- int64x2_t result; ++ int64x2_t __result; + __asm__ ("smull %0.2d,%1.2s,%2.s[0]" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmull_n_u16 (uint16x4_t a, uint16_t b) ++vmull_n_u16 (uint16x4_t __a, uint16_t __b) + { +- uint32x4_t result; ++ uint32x4_t __result; + __asm__ ("umull %0.4s,%1.4h,%2.h[0]" +- : "=w"(result) +- : "w"(a), "x"(b) ++ : "=w"(__result) ++ : "w"(__a), "x"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmull_n_u32 (uint32x2_t a, uint32_t b) ++vmull_n_u32 (uint32x2_t __a, uint32_t __b) + { +- uint64x2_t result; ++ uint64x2_t __result; + __asm__ ("umull %0.2d,%1.2s,%2.s[0]" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline poly16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmull_p8 (poly8x8_t a, poly8x8_t b) ++vmull_p8 (poly8x8_t __a, poly8x8_t __b) + { +- poly16x8_t result; ++ poly16x8_t __result; + __asm__ ("pmull %0.8h, %1.8b, %2.8b" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int16x8_t + __attribute__ ((__always_inline__, 
__gnu_inline__, __artificial__)) +-vmull_s8 (int8x8_t a, int8x8_t b) ++vmull_s8 (int8x8_t __a, int8x8_t __b) + { +- int16x8_t result; ++ int16x8_t __result; + __asm__ ("smull %0.8h, %1.8b, %2.8b" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmull_s16 (int16x4_t a, int16x4_t b) ++vmull_s16 (int16x4_t __a, int16x4_t __b) + { +- int32x4_t result; ++ int32x4_t __result; + __asm__ ("smull %0.4s, %1.4h, %2.4h" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmull_s32 (int32x2_t a, int32x2_t b) ++vmull_s32 (int32x2_t __a, int32x2_t __b) + { +- int64x2_t result; ++ int64x2_t __result; + __asm__ ("smull %0.2d, %1.2s, %2.2s" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmull_u8 (uint8x8_t a, uint8x8_t b) ++vmull_u8 (uint8x8_t __a, uint8x8_t __b) + { +- uint16x8_t result; ++ uint16x8_t __result; + __asm__ ("umull %0.8h, %1.8b, %2.8b" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmull_u16 (uint16x4_t a, uint16x4_t b) ++vmull_u16 (uint16x4_t __a, uint16x4_t __b) + { +- uint32x4_t result; ++ uint32x4_t __result; + __asm__ ("umull %0.4s, %1.4h, %2.4h" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmull_u32 (uint32x2_t a, uint32x2_t b) ++vmull_u32 (uint32x2_t __a, uint32x2_t __b) + { +- uint64x2_t result; ++ uint64x2_t __result; + __asm__ ("umull %0.2d, %1.2s, %2.2s" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int16x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpadal_s8 (int16x4_t a, int8x8_t b) ++vpadal_s8 (int16x4_t __a, int8x8_t __b) + { +- int16x4_t result; ++ int16x4_t __result; + __asm__ ("sadalp %0.4h,%2.8b" +- : "=w"(result) +- : "0"(a), "w"(b) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int32x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpadal_s16 (int32x2_t a, int16x4_t b) ++vpadal_s16 (int32x2_t __a, int16x4_t __b) + { +- int32x2_t result; ++ int32x2_t __result; + __asm__ ("sadalp %0.2s,%2.4h" +- : "=w"(result) +- : "0"(a), "w"(b) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int64x1_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpadal_s32 (int64x1_t a, int32x2_t b) ++vpadal_s32 (int64x1_t __a, int32x2_t 
__b) + { +- int64x1_t result; ++ int64x1_t __result; + __asm__ ("sadalp %0.1d,%2.2s" +- : "=w"(result) +- : "0"(a), "w"(b) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint16x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpadal_u8 (uint16x4_t a, uint8x8_t b) ++vpadal_u8 (uint16x4_t __a, uint8x8_t __b) + { +- uint16x4_t result; ++ uint16x4_t __result; + __asm__ ("uadalp %0.4h,%2.8b" +- : "=w"(result) +- : "0"(a), "w"(b) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint32x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpadal_u16 (uint32x2_t a, uint16x4_t b) ++vpadal_u16 (uint32x2_t __a, uint16x4_t __b) + { +- uint32x2_t result; ++ uint32x2_t __result; + __asm__ ("uadalp %0.2s,%2.4h" +- : "=w"(result) +- : "0"(a), "w"(b) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint64x1_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpadal_u32 (uint64x1_t a, uint32x2_t b) ++vpadal_u32 (uint64x1_t __a, uint32x2_t __b) + { +- uint64x1_t result; ++ uint64x1_t __result; + __asm__ ("uadalp %0.1d,%2.2s" +- : "=w"(result) +- : "0"(a), "w"(b) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpadalq_s8 (int16x8_t a, int8x16_t b) ++vpadalq_s8 (int16x8_t __a, int8x16_t __b) + { +- int16x8_t result; ++ int16x8_t __result; + __asm__ ("sadalp %0.8h,%2.16b" +- : "=w"(result) +- : "0"(a), "w"(b) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpadalq_s16 (int32x4_t a, int16x8_t b) ++vpadalq_s16 (int32x4_t __a, int16x8_t __b) + { +- int32x4_t result; ++ int32x4_t __result; + __asm__ ("sadalp %0.4s,%2.8h" +- : "=w"(result) +- : "0"(a), "w"(b) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpadalq_s32 (int64x2_t a, int32x4_t b) ++vpadalq_s32 (int64x2_t __a, int32x4_t __b) + { +- int64x2_t result; ++ int64x2_t __result; + __asm__ ("sadalp %0.2d,%2.4s" +- : "=w"(result) +- : "0"(a), "w"(b) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpadalq_u8 (uint16x8_t a, uint8x16_t b) ++vpadalq_u8 (uint16x8_t __a, uint8x16_t __b) + { +- uint16x8_t result; ++ uint16x8_t __result; + __asm__ ("uadalp %0.8h,%2.16b" +- : "=w"(result) +- : "0"(a), "w"(b) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpadalq_u16 (uint32x4_t a, uint16x8_t b) ++vpadalq_u16 (uint32x4_t __a, uint16x8_t __b) + { +- uint32x4_t result; ++ uint32x4_t __result; + __asm__ ("uadalp %0.4s,%2.8h" +- : "=w"(result) +- : 
"0"(a), "w"(b) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpadalq_u32 (uint64x2_t a, uint32x4_t b) ++vpadalq_u32 (uint64x2_t __a, uint32x4_t __b) + { +- uint64x2_t result; ++ uint64x2_t __result; + __asm__ ("uadalp %0.2d,%2.4s" +- : "=w"(result) +- : "0"(a), "w"(b) ++ : "=w"(__result) ++ : "0"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int16x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpaddl_s8 (int8x8_t a) ++vpaddl_s8 (int8x8_t __a) + { +- int16x4_t result; ++ int16x4_t __result; + __asm__ ("saddlp %0.4h,%1.8b" +- : "=w"(result) +- : "w"(a) ++ : "=w"(__result) ++ : "w"(__a) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int32x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpaddl_s16 (int16x4_t a) ++vpaddl_s16 (int16x4_t __a) + { +- int32x2_t result; ++ int32x2_t __result; + __asm__ ("saddlp %0.2s,%1.4h" +- : "=w"(result) +- : "w"(a) ++ : "=w"(__result) ++ : "w"(__a) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int64x1_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpaddl_s32 (int32x2_t a) ++vpaddl_s32 (int32x2_t __a) + { +- int64x1_t result; ++ int64x1_t __result; + __asm__ ("saddlp %0.1d,%1.2s" +- : "=w"(result) +- : "w"(a) ++ : "=w"(__result) ++ : "w"(__a) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint16x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpaddl_u8 (uint8x8_t a) ++vpaddl_u8 (uint8x8_t __a) + { +- uint16x4_t result; ++ uint16x4_t __result; + __asm__ ("uaddlp %0.4h,%1.8b" +- : "=w"(result) +- : "w"(a) ++ : "=w"(__result) ++ : "w"(__a) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint32x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpaddl_u16 (uint16x4_t a) ++vpaddl_u16 (uint16x4_t __a) + { +- uint32x2_t result; ++ uint32x2_t __result; + __asm__ ("uaddlp %0.2s,%1.4h" +- : "=w"(result) +- : "w"(a) ++ : "=w"(__result) ++ : "w"(__a) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint64x1_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpaddl_u32 (uint32x2_t a) ++vpaddl_u32 (uint32x2_t __a) + { +- uint64x1_t result; ++ uint64x1_t __result; + __asm__ ("uaddlp %0.1d,%1.2s" +- : "=w"(result) +- : "w"(a) ++ : "=w"(__result) ++ : "w"(__a) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpaddlq_s8 (int8x16_t a) ++vpaddlq_s8 (int8x16_t __a) + { +- int16x8_t result; ++ int16x8_t __result; + __asm__ ("saddlp %0.8h,%1.16b" +- : "=w"(result) +- : "w"(a) ++ : "=w"(__result) ++ : "w"(__a) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpaddlq_s16 (int16x8_t a) ++vpaddlq_s16 (int16x8_t __a) + { +- int32x4_t result; ++ int32x4_t __result; + __asm__ ("saddlp %0.4s,%1.8h" +- : "=w"(result) +- : "w"(a) ++ : "=w"(__result) ++ : "w"(__a) + : /* No 
clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpaddlq_s32 (int32x4_t a) ++vpaddlq_s32 (int32x4_t __a) + { +- int64x2_t result; ++ int64x2_t __result; + __asm__ ("saddlp %0.2d,%1.4s" +- : "=w"(result) +- : "w"(a) ++ : "=w"(__result) ++ : "w"(__a) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpaddlq_u8 (uint8x16_t a) ++vpaddlq_u8 (uint8x16_t __a) + { +- uint16x8_t result; ++ uint16x8_t __result; + __asm__ ("uaddlp %0.8h,%1.16b" +- : "=w"(result) +- : "w"(a) ++ : "=w"(__result) ++ : "w"(__a) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpaddlq_u16 (uint16x8_t a) ++vpaddlq_u16 (uint16x8_t __a) + { +- uint32x4_t result; ++ uint32x4_t __result; + __asm__ ("uaddlp %0.4s,%1.8h" +- : "=w"(result) +- : "w"(a) ++ : "=w"(__result) ++ : "w"(__a) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpaddlq_u32 (uint32x4_t a) ++vpaddlq_u32 (uint32x4_t __a) + { +- uint64x2_t result; ++ uint64x2_t __result; + __asm__ ("uaddlp %0.2d,%1.4s" +- : "=w"(result) +- : "w"(a) ++ : "=w"(__result) ++ : "w"(__a) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpaddq_s8 (int8x16_t a, int8x16_t b) ++vpaddq_s8 (int8x16_t __a, int8x16_t __b) + { +- int8x16_t result; ++ int8x16_t __result; + __asm__ ("addp %0.16b,%1.16b,%2.16b" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpaddq_s16 (int16x8_t a, int16x8_t b) ++vpaddq_s16 (int16x8_t __a, int16x8_t __b) + { +- int16x8_t result; ++ int16x8_t __result; + __asm__ ("addp %0.8h,%1.8h,%2.8h" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpaddq_s32 (int32x4_t a, int32x4_t b) ++vpaddq_s32 (int32x4_t __a, int32x4_t __b) + { +- int32x4_t result; ++ int32x4_t __result; + __asm__ ("addp %0.4s,%1.4s,%2.4s" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpaddq_s64 (int64x2_t a, int64x2_t b) ++vpaddq_s64 (int64x2_t __a, int64x2_t __b) + { +- int64x2_t result; ++ int64x2_t __result; + __asm__ ("addp %0.2d,%1.2d,%2.2d" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpaddq_u8 (uint8x16_t a, uint8x16_t b) ++vpaddq_u8 (uint8x16_t __a, uint8x16_t __b) + { +- uint8x16_t result; ++ 
uint8x16_t __result; + __asm__ ("addp %0.16b,%1.16b,%2.16b" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpaddq_u16 (uint16x8_t a, uint16x8_t b) ++vpaddq_u16 (uint16x8_t __a, uint16x8_t __b) + { +- uint16x8_t result; ++ uint16x8_t __result; + __asm__ ("addp %0.8h,%1.8h,%2.8h" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpaddq_u32 (uint32x4_t a, uint32x4_t b) ++vpaddq_u32 (uint32x4_t __a, uint32x4_t __b) + { +- uint32x4_t result; ++ uint32x4_t __result; + __asm__ ("addp %0.4s,%1.4s,%2.4s" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpaddq_u64 (uint64x2_t a, uint64x2_t b) ++vpaddq_u64 (uint64x2_t __a, uint64x2_t __b) + { +- uint64x2_t result; ++ uint64x2_t __result; + __asm__ ("addp %0.2d,%1.2d,%2.2d" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int16x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqdmulh_n_s16 (int16x4_t a, int16_t b) ++vqdmulh_n_s16 (int16x4_t __a, int16_t __b) + { +- int16x4_t result; ++ int16x4_t __result; + __asm__ ("sqdmulh %0.4h,%1.4h,%2.h[0]" +- : "=w"(result) +- : "w"(a), "x"(b) ++ : "=w"(__result) ++ : "w"(__a), "x"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int32x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqdmulh_n_s32 (int32x2_t a, int32_t b) ++vqdmulh_n_s32 (int32x2_t __a, int32_t __b) + { +- int32x2_t result; ++ int32x2_t __result; + __asm__ ("sqdmulh %0.2s,%1.2s,%2.s[0]" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqdmulhq_n_s16 (int16x8_t a, int16_t b) ++vqdmulhq_n_s16 (int16x8_t __a, int16_t __b) + { +- int16x8_t result; ++ int16x8_t __result; + __asm__ ("sqdmulh %0.8h,%1.8h,%2.h[0]" +- : "=w"(result) +- : "w"(a), "x"(b) ++ : "=w"(__result) ++ : "w"(__a), "x"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqdmulhq_n_s32 (int32x4_t a, int32_t b) ++vqdmulhq_n_s32 (int32x4_t __a, int32_t __b) + { +- int32x4_t result; ++ int32x4_t __result; + __asm__ ("sqdmulh %0.4s,%1.4s,%2.s[0]" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqmovn_high_s16 (int8x8_t a, int16x8_t b) ++vqmovn_high_s16 (int8x8_t __a, int16x8_t __b) + { +- int8x16_t result = vcombine_s8 (a, vcreate_s8 (__AARCH64_UINT64_C 
(0x0))); ++ int8x16_t __result = vcombine_s8 (__a, vcreate_s8 (__AARCH64_UINT64_C (0x0))); + __asm__ ("sqxtn2 %0.16b, %1.8h" +- : "+w"(result) +- : "w"(b) ++ : "+w"(__result) ++ : "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqmovn_high_s32 (int16x4_t a, int32x4_t b) ++vqmovn_high_s32 (int16x4_t __a, int32x4_t __b) + { +- int16x8_t result = vcombine_s16 (a, vcreate_s16 (__AARCH64_UINT64_C (0x0))); ++ int16x8_t __result = vcombine_s16 (__a, vcreate_s16 (__AARCH64_UINT64_C (0x0))); + __asm__ ("sqxtn2 %0.8h, %1.4s" +- : "+w"(result) +- : "w"(b) ++ : "+w"(__result) ++ : "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqmovn_high_s64 (int32x2_t a, int64x2_t b) ++vqmovn_high_s64 (int32x2_t __a, int64x2_t __b) + { +- int32x4_t result = vcombine_s32 (a, vcreate_s32 (__AARCH64_UINT64_C (0x0))); ++ int32x4_t __result = vcombine_s32 (__a, vcreate_s32 (__AARCH64_UINT64_C (0x0))); + __asm__ ("sqxtn2 %0.4s, %1.2d" +- : "+w"(result) +- : "w"(b) ++ : "+w"(__result) ++ : "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqmovn_high_u16 (uint8x8_t a, uint16x8_t b) ++vqmovn_high_u16 (uint8x8_t __a, uint16x8_t __b) + { +- uint8x16_t result = vcombine_u8 (a, vcreate_u8 (__AARCH64_UINT64_C (0x0))); ++ uint8x16_t __result = vcombine_u8 (__a, vcreate_u8 (__AARCH64_UINT64_C (0x0))); + __asm__ ("uqxtn2 %0.16b, %1.8h" +- : "+w"(result) +- : "w"(b) ++ : "+w"(__result) ++ : "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqmovn_high_u32 (uint16x4_t a, uint32x4_t b) ++vqmovn_high_u32 (uint16x4_t __a, uint32x4_t __b) + { +- uint16x8_t result = vcombine_u16 (a, vcreate_u16 (__AARCH64_UINT64_C (0x0))); ++ uint16x8_t __result = vcombine_u16 (__a, vcreate_u16 (__AARCH64_UINT64_C (0x0))); + __asm__ ("uqxtn2 %0.8h, %1.4s" +- : "+w"(result) +- : "w"(b) ++ : "+w"(__result) ++ : "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqmovn_high_u64 (uint32x2_t a, uint64x2_t b) ++vqmovn_high_u64 (uint32x2_t __a, uint64x2_t __b) + { +- uint32x4_t result = vcombine_u32 (a, vcreate_u32 (__AARCH64_UINT64_C (0x0))); ++ uint32x4_t __result = vcombine_u32 (__a, vcreate_u32 (__AARCH64_UINT64_C (0x0))); + __asm__ ("uqxtn2 %0.4s, %1.2d" +- : "+w"(result) +- : "w"(b) ++ : "+w"(__result) ++ : "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqmovun_high_s16 (uint8x8_t a, int16x8_t b) ++vqmovun_high_s16 (uint8x8_t __a, int16x8_t __b) + { +- uint8x16_t result = vcombine_u8 (a, vcreate_u8 (__AARCH64_UINT64_C (0x0))); ++ uint8x16_t __result = vcombine_u8 (__a, vcreate_u8 (__AARCH64_UINT64_C (0x0))); + __asm__ ("sqxtun2 %0.16b, %1.8h" +- : "+w"(result) +- : "w"(b) ++ : "+w"(__result) ++ : "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint16x8_t + 
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqmovun_high_s32 (uint16x4_t a, int32x4_t b) ++vqmovun_high_s32 (uint16x4_t __a, int32x4_t __b) + { +- uint16x8_t result = vcombine_u16 (a, vcreate_u16 (__AARCH64_UINT64_C (0x0))); ++ uint16x8_t __result = vcombine_u16 (__a, vcreate_u16 (__AARCH64_UINT64_C (0x0))); + __asm__ ("sqxtun2 %0.8h, %1.4s" +- : "+w"(result) +- : "w"(b) ++ : "+w"(__result) ++ : "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqmovun_high_s64 (uint32x2_t a, int64x2_t b) ++vqmovun_high_s64 (uint32x2_t __a, int64x2_t __b) + { +- uint32x4_t result = vcombine_u32 (a, vcreate_u32 (__AARCH64_UINT64_C (0x0))); ++ uint32x4_t __result = vcombine_u32 (__a, vcreate_u32 (__AARCH64_UINT64_C (0x0))); + __asm__ ("sqxtun2 %0.4s, %1.2d" +- : "+w"(result) +- : "w"(b) ++ : "+w"(__result) ++ : "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int16x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqrdmulh_n_s16 (int16x4_t a, int16_t b) ++vqrdmulh_n_s16 (int16x4_t __a, int16_t __b) + { +- int16x4_t result; ++ int16x4_t __result; + __asm__ ("sqrdmulh %0.4h,%1.4h,%2.h[0]" +- : "=w"(result) +- : "w"(a), "x"(b) ++ : "=w"(__result) ++ : "w"(__a), "x"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int32x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqrdmulh_n_s32 (int32x2_t a, int32_t b) ++vqrdmulh_n_s32 (int32x2_t __a, int32_t __b) + { +- int32x2_t result; ++ int32x2_t __result; + __asm__ ("sqrdmulh %0.2s,%1.2s,%2.s[0]" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqrdmulhq_n_s16 (int16x8_t a, int16_t b) ++vqrdmulhq_n_s16 (int16x8_t __a, int16_t __b) + { +- int16x8_t result; ++ int16x8_t __result; + __asm__ ("sqrdmulh %0.8h,%1.8h,%2.h[0]" +- : "=w"(result) +- : "w"(a), "x"(b) ++ : "=w"(__result) ++ : "w"(__a), "x"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqrdmulhq_n_s32 (int32x4_t a, int32_t b) ++vqrdmulhq_n_s32 (int32x4_t __a, int32_t __b) + { +- int32x4_t result; ++ int32x4_t __result; + __asm__ ("sqrdmulh %0.4s,%1.4s,%2.s[0]" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + #define vqrshrn_high_n_s16(a, b, c) \ +@@ -10544,26 +10577,26 @@ vqrdmulhq_n_s32 (int32x4_t a, int32_t b) + + __extension__ extern __inline uint32x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vrsqrte_u32 (uint32x2_t a) ++vrsqrte_u32 (uint32x2_t __a) + { +- uint32x2_t result; ++ uint32x2_t __result; + __asm__ ("ursqrte %0.2s,%1.2s" +- : "=w"(result) +- : "w"(a) ++ : "=w"(__result) ++ : "w"(__a) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vrsqrteq_u32 (uint32x4_t a) ++vrsqrteq_u32 (uint32x4_t __a) + { +- uint32x4_t result; ++ uint32x4_t __result; + __asm__ 
("ursqrte %0.4s,%1.4s" +- : "=w"(result) +- : "w"(a) ++ : "=w"(__result) ++ : "w"(__a) + : /* No clobbers */); +- return result; ++ return __result; + } + + #define vshrn_high_n_s16(a, b, c) \ +@@ -10860,48 +10893,48 @@ vrsqrteq_u32 (uint32x4_t a) + + __extension__ extern __inline uint8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vtst_p8 (poly8x8_t a, poly8x8_t b) ++vtst_p8 (poly8x8_t __a, poly8x8_t __b) + { +- return (uint8x8_t) ((((uint8x8_t) a) & ((uint8x8_t) b)) ++ return (uint8x8_t) ((((uint8x8_t) __a) & ((uint8x8_t) __b)) + != 0); + } + + __extension__ extern __inline uint16x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vtst_p16 (poly16x4_t a, poly16x4_t b) ++vtst_p16 (poly16x4_t __a, poly16x4_t __b) + { +- return (uint16x4_t) ((((uint16x4_t) a) & ((uint16x4_t) b)) ++ return (uint16x4_t) ((((uint16x4_t) __a) & ((uint16x4_t) __b)) + != 0); + } + + __extension__ extern __inline uint64x1_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vtst_p64 (poly64x1_t a, poly64x1_t b) ++vtst_p64 (poly64x1_t __a, poly64x1_t __b) + { +- return (uint64x1_t) ((a & b) != __AARCH64_INT64_C (0)); ++ return (uint64x1_t) ((__a & __b) != __AARCH64_INT64_C (0)); + } + + __extension__ extern __inline uint8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vtstq_p8 (poly8x16_t a, poly8x16_t b) ++vtstq_p8 (poly8x16_t __a, poly8x16_t __b) + { +- return (uint8x16_t) ((((uint8x16_t) a) & ((uint8x16_t) b)) ++ return (uint8x16_t) ((((uint8x16_t) __a) & ((uint8x16_t) __b)) + != 0); + } + + __extension__ extern __inline uint16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vtstq_p16 (poly16x8_t a, poly16x8_t b) ++vtstq_p16 (poly16x8_t __a, poly16x8_t __b) + { +- return (uint16x8_t) ((((uint16x8_t) a) & ((uint16x8_t) b)) ++ return (uint16x8_t) ((((uint16x8_t) __a) & ((uint16x8_t) __b)) + != 0); + } + + __extension__ extern __inline uint64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vtstq_p64 (poly64x2_t a, poly64x2_t b) ++vtstq_p64 (poly64x2_t __a, poly64x2_t __b) + { +- return (uint64x2_t) ((((uint64x2_t) a) & ((uint64x2_t) b)) ++ return (uint64x2_t) ((((uint64x2_t) __a) & ((uint64x2_t) __b)) + != __AARCH64_INT64_C (0)); + } + +@@ -11248,20 +11281,20 @@ __ST4_LANE_FUNC (uint64x2x4_t, uint64_t, v2di, di, u64) + + __extension__ extern __inline int64_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vaddlv_s32 (int32x2_t a) ++vaddlv_s32 (int32x2_t __a) + { +- int64_t result; +- __asm__ ("saddlp %0.1d, %1.2s" : "=w"(result) : "w"(a) : ); +- return result; ++ int64_t __result; ++ __asm__ ("saddlp %0.1d, %1.2s" : "=w"(__result) : "w"(__a) : ); ++ return __result; + } + + __extension__ extern __inline uint64_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vaddlv_u32 (uint32x2_t a) ++vaddlv_u32 (uint32x2_t __a) + { +- uint64_t result; +- __asm__ ("uaddlp %0.1d, %1.2s" : "=w"(result) : "w"(a) : ); +- return result; ++ uint64_t __result; ++ __asm__ ("uaddlp %0.1d, %1.2s" : "=w"(__result) : "w"(__a) : ); ++ return __result; + } + + __extension__ extern __inline int16x4_t +@@ -11324,367 +11357,367 @@ vqrdmulhq_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __c) + + __extension__ extern __inline poly8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqtbl1_p8 (poly8x16_t a, uint8x8_t b) ++vqtbl1_p8 (poly8x16_t __a, uint8x8_t __b) + { +- poly8x8_t result; ++ poly8x8_t __result; + 
__asm__ ("tbl %0.8b, {%1.16b}, %2.8b" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqtbl1_s8 (int8x16_t a, uint8x8_t b) ++vqtbl1_s8 (int8x16_t __a, uint8x8_t __b) + { +- int8x8_t result; ++ int8x8_t __result; + __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqtbl1_u8 (uint8x16_t a, uint8x8_t b) ++vqtbl1_u8 (uint8x16_t __a, uint8x8_t __b) + { +- uint8x8_t result; ++ uint8x8_t __result; + __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline poly8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqtbl1q_p8 (poly8x16_t a, uint8x16_t b) ++vqtbl1q_p8 (poly8x16_t __a, uint8x16_t __b) + { +- poly8x16_t result; ++ poly8x16_t __result; + __asm__ ("tbl %0.16b, {%1.16b}, %2.16b" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqtbl1q_s8 (int8x16_t a, uint8x16_t b) ++vqtbl1q_s8 (int8x16_t __a, uint8x16_t __b) + { +- int8x16_t result; ++ int8x16_t __result; + __asm__ ("tbl %0.16b, {%1.16b}, %2.16b" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqtbl1q_u8 (uint8x16_t a, uint8x16_t b) ++vqtbl1q_u8 (uint8x16_t __a, uint8x16_t __b) + { +- uint8x16_t result; ++ uint8x16_t __result; + __asm__ ("tbl %0.16b, {%1.16b}, %2.16b" +- : "=w"(result) +- : "w"(a), "w"(b) ++ : "=w"(__result) ++ : "w"(__a), "w"(__b) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqtbx1_s8 (int8x8_t r, int8x16_t tab, uint8x8_t idx) ++vqtbx1_s8 (int8x8_t __r, int8x16_t __tab, uint8x8_t __idx) + { +- int8x8_t result = r; ++ int8x8_t __result = __r; + __asm__ ("tbx %0.8b,{%1.16b},%2.8b" +- : "+w"(result) +- : "w"(tab), "w"(idx) ++ : "+w"(__result) ++ : "w"(__tab), "w"(__idx) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqtbx1_u8 (uint8x8_t r, uint8x16_t tab, uint8x8_t idx) ++vqtbx1_u8 (uint8x8_t __r, uint8x16_t __tab, uint8x8_t __idx) + { +- uint8x8_t result = r; ++ uint8x8_t __result = __r; + __asm__ ("tbx %0.8b,{%1.16b},%2.8b" +- : "+w"(result) +- : "w"(tab), "w"(idx) ++ : "+w"(__result) ++ : "w"(__tab), "w"(__idx) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline poly8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqtbx1_p8 (poly8x8_t r, poly8x16_t tab, uint8x8_t idx) ++vqtbx1_p8 (poly8x8_t __r, poly8x16_t __tab, 
uint8x8_t __idx) + { +- poly8x8_t result = r; ++ poly8x8_t __result = __r; + __asm__ ("tbx %0.8b,{%1.16b},%2.8b" +- : "+w"(result) +- : "w"(tab), "w"(idx) ++ : "+w"(__result) ++ : "w"(__tab), "w"(__idx) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqtbx1q_s8 (int8x16_t r, int8x16_t tab, uint8x16_t idx) ++vqtbx1q_s8 (int8x16_t __r, int8x16_t __tab, uint8x16_t __idx) + { +- int8x16_t result = r; ++ int8x16_t __result = __r; + __asm__ ("tbx %0.16b,{%1.16b},%2.16b" +- : "+w"(result) +- : "w"(tab), "w"(idx) ++ : "+w"(__result) ++ : "w"(__tab), "w"(__idx) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqtbx1q_u8 (uint8x16_t r, uint8x16_t tab, uint8x16_t idx) ++vqtbx1q_u8 (uint8x16_t __r, uint8x16_t __tab, uint8x16_t __idx) + { +- uint8x16_t result = r; ++ uint8x16_t __result = __r; + __asm__ ("tbx %0.16b,{%1.16b},%2.16b" +- : "+w"(result) +- : "w"(tab), "w"(idx) ++ : "+w"(__result) ++ : "w"(__tab), "w"(__idx) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline poly8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqtbx1q_p8 (poly8x16_t r, poly8x16_t tab, uint8x16_t idx) ++vqtbx1q_p8 (poly8x16_t __r, poly8x16_t __tab, uint8x16_t __idx) + { +- poly8x16_t result = r; ++ poly8x16_t __result = __r; + __asm__ ("tbx %0.16b,{%1.16b},%2.16b" +- : "+w"(result) +- : "w"(tab), "w"(idx) ++ : "+w"(__result) ++ : "w"(__tab), "w"(__idx) + : /* No clobbers */); +- return result; ++ return __result; + } + + /* V7 legacy table intrinsics. 
*/ + + __extension__ extern __inline int8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vtbl1_s8 (int8x8_t tab, int8x8_t idx) ++vtbl1_s8 (int8x8_t __tab, int8x8_t __idx) + { +- int8x8_t result; +- int8x16_t temp = vcombine_s8 (tab, vcreate_s8 (__AARCH64_UINT64_C (0x0))); ++ int8x8_t __result; ++ int8x16_t __temp = vcombine_s8 (__tab, vcreate_s8 (__AARCH64_UINT64_C (0x0))); + __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" +- : "=w"(result) +- : "w"(temp), "w"(idx) ++ : "=w"(__result) ++ : "w"(__temp), "w"(__idx) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vtbl1_u8 (uint8x8_t tab, uint8x8_t idx) ++vtbl1_u8 (uint8x8_t __tab, uint8x8_t __idx) + { +- uint8x8_t result; +- uint8x16_t temp = vcombine_u8 (tab, vcreate_u8 (__AARCH64_UINT64_C (0x0))); ++ uint8x8_t __result; ++ uint8x16_t __temp = vcombine_u8 (__tab, vcreate_u8 (__AARCH64_UINT64_C (0x0))); + __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" +- : "=w"(result) +- : "w"(temp), "w"(idx) ++ : "=w"(__result) ++ : "w"(__temp), "w"(__idx) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline poly8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vtbl1_p8 (poly8x8_t tab, uint8x8_t idx) ++vtbl1_p8 (poly8x8_t __tab, uint8x8_t __idx) + { +- poly8x8_t result; +- poly8x16_t temp = vcombine_p8 (tab, vcreate_p8 (__AARCH64_UINT64_C (0x0))); ++ poly8x8_t __result; ++ poly8x16_t __temp = vcombine_p8 (__tab, vcreate_p8 (__AARCH64_UINT64_C (0x0))); + __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" +- : "=w"(result) +- : "w"(temp), "w"(idx) ++ : "=w"(__result) ++ : "w"(__temp), "w"(__idx) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline int8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vtbl2_s8 (int8x8x2_t tab, int8x8_t idx) ++vtbl2_s8 (int8x8x2_t __tab, int8x8_t __idx) + { +- int8x8_t result; +- int8x16_t temp = vcombine_s8 (tab.val[0], tab.val[1]); ++ int8x8_t __result; ++ int8x16_t __temp = vcombine_s8 (__tab.val[0], __tab.val[1]); + __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" +- : "=w"(result) +- : "w"(temp), "w"(idx) ++ : "=w"(__result) ++ : "w"(__temp), "w"(__idx) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vtbl2_u8 (uint8x8x2_t tab, uint8x8_t idx) ++vtbl2_u8 (uint8x8x2_t __tab, uint8x8_t __idx) + { +- uint8x8_t result; +- uint8x16_t temp = vcombine_u8 (tab.val[0], tab.val[1]); ++ uint8x8_t __result; ++ uint8x16_t __temp = vcombine_u8 (__tab.val[0], __tab.val[1]); + __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" +- : "=w"(result) +- : "w"(temp), "w"(idx) ++ : "=w"(__result) ++ : "w"(__temp), "w"(__idx) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline poly8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vtbl2_p8 (poly8x8x2_t tab, uint8x8_t idx) ++vtbl2_p8 (poly8x8x2_t __tab, uint8x8_t __idx) + { +- poly8x8_t result; +- poly8x16_t temp = vcombine_p8 (tab.val[0], tab.val[1]); ++ poly8x8_t __result; ++ poly8x16_t __temp = vcombine_p8 (__tab.val[0], __tab.val[1]); + __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" +- : "=w"(result) +- : "w"(temp), "w"(idx) ++ : "=w"(__result) ++ : "w"(__temp), "w"(__idx) + : /* No clobbers */); +- return result; ++ return __result; + } + + 
__extension__ extern __inline int8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vtbl3_s8 (int8x8x3_t tab, int8x8_t idx) ++vtbl3_s8 (int8x8x3_t __tab, int8x8_t __idx) + { +- int8x8_t result; +- int8x16x2_t temp; ++ int8x8_t __result; ++ int8x16x2_t __temp; + __builtin_aarch64_simd_oi __o; +- temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]); +- temp.val[1] = vcombine_s8 (tab.val[2], vcreate_s8 (__AARCH64_UINT64_C (0x0))); ++ __temp.val[0] = vcombine_s8 (__tab.val[0], __tab.val[1]); ++ __temp.val[1] = vcombine_s8 (__tab.val[2], vcreate_s8 (__AARCH64_UINT64_C (0x0))); + __o = __builtin_aarch64_set_qregoiv16qi (__o, +- (int8x16_t) temp.val[0], 0); ++ (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, +- (int8x16_t) temp.val[1], 1); +- result = __builtin_aarch64_tbl3v8qi (__o, idx); +- return result; ++ (int8x16_t) __temp.val[1], 1); ++ __result = __builtin_aarch64_tbl3v8qi (__o, __idx); ++ return __result; + } + + __extension__ extern __inline uint8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vtbl3_u8 (uint8x8x3_t tab, uint8x8_t idx) ++vtbl3_u8 (uint8x8x3_t __tab, uint8x8_t __idx) + { +- uint8x8_t result; +- uint8x16x2_t temp; ++ uint8x8_t __result; ++ uint8x16x2_t __temp; + __builtin_aarch64_simd_oi __o; +- temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]); +- temp.val[1] = vcombine_u8 (tab.val[2], vcreate_u8 (__AARCH64_UINT64_C (0x0))); ++ __temp.val[0] = vcombine_u8 (__tab.val[0], __tab.val[1]); ++ __temp.val[1] = vcombine_u8 (__tab.val[2], vcreate_u8 (__AARCH64_UINT64_C (0x0))); + __o = __builtin_aarch64_set_qregoiv16qi (__o, +- (int8x16_t) temp.val[0], 0); ++ (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, +- (int8x16_t) temp.val[1], 1); +- result = (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx); +- return result; ++ (int8x16_t) __temp.val[1], 1); ++ __result = (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)__idx); ++ return __result; + } + + __extension__ extern __inline poly8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vtbl3_p8 (poly8x8x3_t tab, uint8x8_t idx) ++vtbl3_p8 (poly8x8x3_t __tab, uint8x8_t __idx) + { +- poly8x8_t result; +- poly8x16x2_t temp; ++ poly8x8_t __result; ++ poly8x16x2_t __temp; + __builtin_aarch64_simd_oi __o; +- temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]); +- temp.val[1] = vcombine_p8 (tab.val[2], vcreate_p8 (__AARCH64_UINT64_C (0x0))); ++ __temp.val[0] = vcombine_p8 (__tab.val[0], __tab.val[1]); ++ __temp.val[1] = vcombine_p8 (__tab.val[2], vcreate_p8 (__AARCH64_UINT64_C (0x0))); + __o = __builtin_aarch64_set_qregoiv16qi (__o, +- (int8x16_t) temp.val[0], 0); ++ (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, +- (int8x16_t) temp.val[1], 1); +- result = (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx); +- return result; ++ (int8x16_t) __temp.val[1], 1); ++ __result = (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)__idx); ++ return __result; + } + + __extension__ extern __inline int8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vtbl4_s8 (int8x8x4_t tab, int8x8_t idx) ++vtbl4_s8 (int8x8x4_t __tab, int8x8_t __idx) + { +- int8x8_t result; +- int8x16x2_t temp; ++ int8x8_t __result; ++ int8x16x2_t __temp; + __builtin_aarch64_simd_oi __o; +- temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]); +- temp.val[1] = vcombine_s8 (tab.val[2], tab.val[3]); ++ __temp.val[0] = vcombine_s8 (__tab.val[0], __tab.val[1]); ++ 
__temp.val[1] = vcombine_s8 (__tab.val[2], __tab.val[3]); + __o = __builtin_aarch64_set_qregoiv16qi (__o, +- (int8x16_t) temp.val[0], 0); ++ (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, +- (int8x16_t) temp.val[1], 1); +- result = __builtin_aarch64_tbl3v8qi (__o, idx); +- return result; ++ (int8x16_t) __temp.val[1], 1); ++ __result = __builtin_aarch64_tbl3v8qi (__o, __idx); ++ return __result; + } + + __extension__ extern __inline uint8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vtbl4_u8 (uint8x8x4_t tab, uint8x8_t idx) ++vtbl4_u8 (uint8x8x4_t __tab, uint8x8_t __idx) + { +- uint8x8_t result; +- uint8x16x2_t temp; ++ uint8x8_t __result; ++ uint8x16x2_t __temp; + __builtin_aarch64_simd_oi __o; +- temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]); +- temp.val[1] = vcombine_u8 (tab.val[2], tab.val[3]); ++ __temp.val[0] = vcombine_u8 (__tab.val[0], __tab.val[1]); ++ __temp.val[1] = vcombine_u8 (__tab.val[2], __tab.val[3]); + __o = __builtin_aarch64_set_qregoiv16qi (__o, +- (int8x16_t) temp.val[0], 0); ++ (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, +- (int8x16_t) temp.val[1], 1); +- result = (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx); +- return result; ++ (int8x16_t) __temp.val[1], 1); ++ __result = (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)__idx); ++ return __result; + } + + __extension__ extern __inline poly8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vtbl4_p8 (poly8x8x4_t tab, uint8x8_t idx) ++vtbl4_p8 (poly8x8x4_t __tab, uint8x8_t __idx) + { +- poly8x8_t result; +- poly8x16x2_t temp; ++ poly8x8_t __result; ++ poly8x16x2_t __temp; + __builtin_aarch64_simd_oi __o; +- temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]); +- temp.val[1] = vcombine_p8 (tab.val[2], tab.val[3]); ++ __temp.val[0] = vcombine_p8 (__tab.val[0], __tab.val[1]); ++ __temp.val[1] = vcombine_p8 (__tab.val[2], __tab.val[3]); + __o = __builtin_aarch64_set_qregoiv16qi (__o, +- (int8x16_t) temp.val[0], 0); ++ (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, +- (int8x16_t) temp.val[1], 1); +- result = (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx); +- return result; ++ (int8x16_t) __temp.val[1], 1); ++ __result = (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)__idx); ++ return __result; + } + + __extension__ extern __inline int8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vtbx2_s8 (int8x8_t r, int8x8x2_t tab, int8x8_t idx) ++vtbx2_s8 (int8x8_t __r, int8x8x2_t __tab, int8x8_t __idx) + { +- int8x8_t result = r; +- int8x16_t temp = vcombine_s8 (tab.val[0], tab.val[1]); ++ int8x8_t __result = __r; ++ int8x16_t __temp = vcombine_s8 (__tab.val[0], __tab.val[1]); + __asm__ ("tbx %0.8b, {%1.16b}, %2.8b" +- : "+w"(result) +- : "w"(temp), "w"(idx) ++ : "+w"(__result) ++ : "w"(__temp), "w"(__idx) + : /* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline uint8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vtbx2_u8 (uint8x8_t r, uint8x8x2_t tab, uint8x8_t idx) ++vtbx2_u8 (uint8x8_t __r, uint8x8x2_t __tab, uint8x8_t __idx) + { +- uint8x8_t result = r; +- uint8x16_t temp = vcombine_u8 (tab.val[0], tab.val[1]); ++ uint8x8_t __result = __r; ++ uint8x16_t __temp = vcombine_u8 (__tab.val[0], __tab.val[1]); + __asm__ ("tbx %0.8b, {%1.16b}, %2.8b" +- : "+w"(result) +- : "w"(temp), "w"(idx) ++ : "+w"(__result) ++ : "w"(__temp), "w"(__idx) + : 
/* No clobbers */); +- return result; ++ return __result; + } + + __extension__ extern __inline poly8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vtbx2_p8 (poly8x8_t r, poly8x8x2_t tab, uint8x8_t idx) ++vtbx2_p8 (poly8x8_t __r, poly8x8x2_t __tab, uint8x8_t __idx) + { +- poly8x8_t result = r; +- poly8x16_t temp = vcombine_p8 (tab.val[0], tab.val[1]); ++ poly8x8_t __result = __r; ++ poly8x16_t __temp = vcombine_p8 (__tab.val[0], __tab.val[1]); + __asm__ ("tbx %0.8b, {%1.16b}, %2.8b" +- : "+w"(result) +- : "w"(temp), "w"(idx) ++ : "+w"(__result) ++ : "w"(__temp), "w"(__idx) + : /* No clobbers */); +- return result; ++ return __result; + } + + /* End of temporary inline asm. */ +@@ -17063,98 +17096,98 @@ vld1_f16 (const float16_t *__a) + + __extension__ extern __inline float32x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vld1_f32 (const float32_t *a) ++vld1_f32 (const float32_t *__a) + { +- return __builtin_aarch64_ld1v2sf ((const __builtin_aarch64_simd_sf *) a); ++ return __builtin_aarch64_ld1v2sf ((const __builtin_aarch64_simd_sf *) __a); + } + + __extension__ extern __inline float64x1_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vld1_f64 (const float64_t *a) ++vld1_f64 (const float64_t *__a) + { +- return (float64x1_t) {*a}; ++ return (float64x1_t) {*__a}; + } + + __extension__ extern __inline poly8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vld1_p8 (const poly8_t *a) ++vld1_p8 (const poly8_t *__a) + { + return (poly8x8_t) +- __builtin_aarch64_ld1v8qi ((const __builtin_aarch64_simd_qi *) a); ++ __builtin_aarch64_ld1v8qi ((const __builtin_aarch64_simd_qi *) __a); + } + + __extension__ extern __inline poly16x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vld1_p16 (const poly16_t *a) ++vld1_p16 (const poly16_t *__a) + { + return (poly16x4_t) +- __builtin_aarch64_ld1v4hi ((const __builtin_aarch64_simd_hi *) a); ++ __builtin_aarch64_ld1v4hi ((const __builtin_aarch64_simd_hi *) __a); + } + + __extension__ extern __inline poly64x1_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vld1_p64 (const poly64_t *a) ++vld1_p64 (const poly64_t *__a) + { +- return (poly64x1_t) {*a}; ++ return (poly64x1_t) {*__a}; + } + + __extension__ extern __inline int8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vld1_s8 (const int8_t *a) ++vld1_s8 (const int8_t *__a) + { +- return __builtin_aarch64_ld1v8qi ((const __builtin_aarch64_simd_qi *) a); ++ return __builtin_aarch64_ld1v8qi ((const __builtin_aarch64_simd_qi *) __a); + } + + __extension__ extern __inline int16x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vld1_s16 (const int16_t *a) ++vld1_s16 (const int16_t *__a) + { +- return __builtin_aarch64_ld1v4hi ((const __builtin_aarch64_simd_hi *) a); ++ return __builtin_aarch64_ld1v4hi ((const __builtin_aarch64_simd_hi *) __a); + } + + __extension__ extern __inline int32x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vld1_s32 (const int32_t *a) ++vld1_s32 (const int32_t *__a) + { +- return __builtin_aarch64_ld1v2si ((const __builtin_aarch64_simd_si *) a); ++ return __builtin_aarch64_ld1v2si ((const __builtin_aarch64_simd_si *) __a); + } + + __extension__ extern __inline int64x1_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vld1_s64 (const int64_t *a) ++vld1_s64 (const int64_t *__a) + { +- return (int64x1_t) {*a}; ++ return 
(int64x1_t) {*__a}; + } + + __extension__ extern __inline uint8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vld1_u8 (const uint8_t *a) ++vld1_u8 (const uint8_t *__a) + { + return (uint8x8_t) +- __builtin_aarch64_ld1v8qi ((const __builtin_aarch64_simd_qi *) a); ++ __builtin_aarch64_ld1v8qi ((const __builtin_aarch64_simd_qi *) __a); + } + + __extension__ extern __inline uint16x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vld1_u16 (const uint16_t *a) ++vld1_u16 (const uint16_t *__a) + { + return (uint16x4_t) +- __builtin_aarch64_ld1v4hi ((const __builtin_aarch64_simd_hi *) a); ++ __builtin_aarch64_ld1v4hi ((const __builtin_aarch64_simd_hi *) __a); + } + + __extension__ extern __inline uint32x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vld1_u32 (const uint32_t *a) ++vld1_u32 (const uint32_t *__a) + { + return (uint32x2_t) +- __builtin_aarch64_ld1v2si ((const __builtin_aarch64_simd_si *) a); ++ __builtin_aarch64_ld1v2si ((const __builtin_aarch64_simd_si *) __a); + } + + __extension__ extern __inline uint64x1_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vld1_u64 (const uint64_t *a) ++vld1_u64 (const uint64_t *__a) + { +- return (uint64x1_t) {*a}; ++ return (uint64x1_t) {*__a}; + } + + /* vld1x3 */ +@@ -17536,76 +17569,76 @@ vld1q_f16 (const float16_t *__a) + + __extension__ extern __inline float32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vld1q_f32 (const float32_t *a) ++vld1q_f32 (const float32_t *__a) + { +- return __builtin_aarch64_ld1v4sf ((const __builtin_aarch64_simd_sf *) a); ++ return __builtin_aarch64_ld1v4sf ((const __builtin_aarch64_simd_sf *) __a); + } + + __extension__ extern __inline float64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vld1q_f64 (const float64_t *a) ++vld1q_f64 (const float64_t *__a) + { +- return __builtin_aarch64_ld1v2df ((const __builtin_aarch64_simd_df *) a); ++ return __builtin_aarch64_ld1v2df ((const __builtin_aarch64_simd_df *) __a); + } + + __extension__ extern __inline poly8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vld1q_p8 (const poly8_t *a) ++vld1q_p8 (const poly8_t *__a) + { + return (poly8x16_t) +- __builtin_aarch64_ld1v16qi ((const __builtin_aarch64_simd_qi *) a); ++ __builtin_aarch64_ld1v16qi ((const __builtin_aarch64_simd_qi *) __a); + } + + __extension__ extern __inline poly16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vld1q_p16 (const poly16_t *a) ++vld1q_p16 (const poly16_t *__a) + { + return (poly16x8_t) +- __builtin_aarch64_ld1v8hi ((const __builtin_aarch64_simd_hi *) a); ++ __builtin_aarch64_ld1v8hi ((const __builtin_aarch64_simd_hi *) __a); + } + + __extension__ extern __inline poly64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vld1q_p64 (const poly64_t *a) ++vld1q_p64 (const poly64_t *__a) + { + return (poly64x2_t) +- __builtin_aarch64_ld1v2di ((const __builtin_aarch64_simd_di *) a); ++ __builtin_aarch64_ld1v2di ((const __builtin_aarch64_simd_di *) __a); + } + + __extension__ extern __inline int8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vld1q_s8 (const int8_t *a) ++vld1q_s8 (const int8_t *__a) + { +- return __builtin_aarch64_ld1v16qi ((const __builtin_aarch64_simd_qi *) a); ++ return __builtin_aarch64_ld1v16qi ((const __builtin_aarch64_simd_qi *) __a); + } + + __extension__ extern __inline int16x8_t + __attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) +-vld1q_s16 (const int16_t *a) ++vld1q_s16 (const int16_t *__a) + { +- return __builtin_aarch64_ld1v8hi ((const __builtin_aarch64_simd_hi *) a); ++ return __builtin_aarch64_ld1v8hi ((const __builtin_aarch64_simd_hi *) __a); + } + + __extension__ extern __inline int32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vld1q_s32 (const int32_t *a) ++vld1q_s32 (const int32_t *__a) + { +- return __builtin_aarch64_ld1v4si ((const __builtin_aarch64_simd_si *) a); ++ return __builtin_aarch64_ld1v4si ((const __builtin_aarch64_simd_si *) __a); + } + + __extension__ extern __inline int64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vld1q_s64 (const int64_t *a) ++vld1q_s64 (const int64_t *__a) + { +- return __builtin_aarch64_ld1v2di ((const __builtin_aarch64_simd_di *) a); ++ return __builtin_aarch64_ld1v2di ((const __builtin_aarch64_simd_di *) __a); + } + + __extension__ extern __inline uint8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vld1q_u8 (const uint8_t *a) ++vld1q_u8 (const uint8_t *__a) + { + return (uint8x16_t) +- __builtin_aarch64_ld1v16qi ((const __builtin_aarch64_simd_qi *) a); ++ __builtin_aarch64_ld1v16qi ((const __builtin_aarch64_simd_qi *) __a); + } + + __extension__ extern __inline uint8x8x2_t +@@ -17946,26 +17979,308 @@ vld1q_p64_x2 (const poly64_t *__a) + + __extension__ extern __inline uint16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vld1q_u16 (const uint16_t *a) ++vld1q_u16 (const uint16_t *__a) + { + return (uint16x8_t) +- __builtin_aarch64_ld1v8hi ((const __builtin_aarch64_simd_hi *) a); ++ __builtin_aarch64_ld1v8hi ((const __builtin_aarch64_simd_hi *) __a); + } + + __extension__ extern __inline uint32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vld1q_u32 (const uint32_t *a) ++vld1q_u32 (const uint32_t *__a) + { + return (uint32x4_t) +- __builtin_aarch64_ld1v4si ((const __builtin_aarch64_simd_si *) a); ++ __builtin_aarch64_ld1v4si ((const __builtin_aarch64_simd_si *) __a); + } + + __extension__ extern __inline uint64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vld1q_u64 (const uint64_t *a) ++vld1q_u64 (const uint64_t *__a) + { + return (uint64x2_t) +- __builtin_aarch64_ld1v2di ((const __builtin_aarch64_simd_di *) a); ++ __builtin_aarch64_ld1v2di ((const __builtin_aarch64_simd_di *) __a); ++} ++ ++/* vld1(q)_x4. 
*/ ++ ++__extension__ extern __inline int8x8x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld1_s8_x4 (const int8_t *__a) ++{ ++ union { int8x8x4_t __i; __builtin_aarch64_simd_xi __o; } __au; ++ __au.__o ++ = __builtin_aarch64_ld1x4v8qi ((const __builtin_aarch64_simd_qi *) __a); ++ return __au.__i; ++} ++ ++__extension__ extern __inline int8x16x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld1q_s8_x4 (const int8_t *__a) ++{ ++ union { int8x16x4_t __i; __builtin_aarch64_simd_xi __o; } __au; ++ __au.__o ++ = __builtin_aarch64_ld1x4v16qi ((const __builtin_aarch64_simd_qi *) __a); ++ return __au.__i; ++} ++ ++__extension__ extern __inline int16x4x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld1_s16_x4 (const int16_t *__a) ++{ ++ union { int16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __au; ++ __au.__o ++ = __builtin_aarch64_ld1x4v4hi ((const __builtin_aarch64_simd_hi *) __a); ++ return __au.__i; ++} ++ ++__extension__ extern __inline int16x8x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld1q_s16_x4 (const int16_t *__a) ++{ ++ union { int16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __au; ++ __au.__o ++ = __builtin_aarch64_ld1x4v8hi ((const __builtin_aarch64_simd_hi *) __a); ++ return __au.__i; ++} ++ ++__extension__ extern __inline int32x2x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld1_s32_x4 (const int32_t *__a) ++{ ++ union { int32x2x4_t __i; __builtin_aarch64_simd_xi __o; } __au; ++ __au.__o ++ = __builtin_aarch64_ld1x4v2si ((const __builtin_aarch64_simd_si *) __a); ++ return __au.__i; ++} ++ ++__extension__ extern __inline int32x4x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld1q_s32_x4 (const int32_t *__a) ++{ ++ union { int32x4x4_t __i; __builtin_aarch64_simd_xi __o; } __au; ++ __au.__o ++ = __builtin_aarch64_ld1x4v4si ((const __builtin_aarch64_simd_si *) __a); ++ return __au.__i; ++} ++ ++__extension__ extern __inline uint8x8x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld1_u8_x4 (const uint8_t *__a) ++{ ++ union { uint8x8x4_t __i; __builtin_aarch64_simd_xi __o; } __au; ++ __au.__o ++ = __builtin_aarch64_ld1x4v8qi ((const __builtin_aarch64_simd_qi *) __a); ++ return __au.__i; ++} ++ ++__extension__ extern __inline uint8x16x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld1q_u8_x4 (const uint8_t *__a) ++{ ++ union { uint8x16x4_t __i; __builtin_aarch64_simd_xi __o; } __au; ++ __au.__o ++ = __builtin_aarch64_ld1x4v16qi ((const __builtin_aarch64_simd_qi *) __a); ++ return __au.__i; ++} ++ ++__extension__ extern __inline uint16x4x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld1_u16_x4 (const uint16_t *__a) ++{ ++ union { uint16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __au; ++ __au.__o ++ = __builtin_aarch64_ld1x4v4hi ((const __builtin_aarch64_simd_hi *) __a); ++ return __au.__i; ++} ++ ++__extension__ extern __inline uint16x8x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld1q_u16_x4 (const uint16_t *__a) ++{ ++ union { uint16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __au; ++ __au.__o ++ = __builtin_aarch64_ld1x4v8hi ((const __builtin_aarch64_simd_hi *) __a); ++ return __au.__i; ++} ++ ++__extension__ extern __inline uint32x2x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld1_u32_x4 (const uint32_t *__a) ++{ ++ union { uint32x2x4_t __i; 
__builtin_aarch64_simd_xi __o; } __au; ++ __au.__o ++ = __builtin_aarch64_ld1x4v2si ((const __builtin_aarch64_simd_si *) __a); ++ return __au.__i; ++} ++ ++__extension__ extern __inline uint32x4x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld1q_u32_x4 (const uint32_t *__a) ++{ ++ union { uint32x4x4_t __i; __builtin_aarch64_simd_xi __o; } __au; ++ __au.__o ++ = __builtin_aarch64_ld1x4v4si ((const __builtin_aarch64_simd_si *) __a); ++ return __au.__i; ++} ++ ++__extension__ extern __inline float16x4x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld1_f16_x4 (const float16_t *__a) ++{ ++ union { float16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __au; ++ __au.__o ++ = __builtin_aarch64_ld1x4v4hf ((const __builtin_aarch64_simd_hf *) __a); ++ return __au.__i; ++} ++ ++__extension__ extern __inline float16x8x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld1q_f16_x4 (const float16_t *__a) ++{ ++ union { float16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __au; ++ __au.__o ++ = __builtin_aarch64_ld1x4v8hf ((const __builtin_aarch64_simd_hf *) __a); ++ return __au.__i; ++} ++ ++__extension__ extern __inline float32x2x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld1_f32_x4 (const float32_t *__a) ++{ ++ union { float32x2x4_t __i; __builtin_aarch64_simd_xi __o; } __au; ++ __au.__o ++ = __builtin_aarch64_ld1x4v2sf ((const __builtin_aarch64_simd_sf *) __a); ++ return __au.__i; ++} ++ ++__extension__ extern __inline float32x4x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld1q_f32_x4 (const float32_t *__a) ++{ ++ union { float32x4x4_t __i; __builtin_aarch64_simd_xi __o; } __au; ++ __au.__o ++ = __builtin_aarch64_ld1x4v4sf ((const __builtin_aarch64_simd_sf *) __a); ++ return __au.__i; ++} ++ ++__extension__ extern __inline poly8x8x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld1_p8_x4 (const poly8_t *__a) ++{ ++ union { poly8x8x4_t __i; __builtin_aarch64_simd_xi __o; } __au; ++ __au.__o ++ = __builtin_aarch64_ld1x4v8qi ((const __builtin_aarch64_simd_qi *) __a); ++ return __au.__i; ++} ++ ++__extension__ extern __inline poly8x16x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld1q_p8_x4 (const poly8_t *__a) ++{ ++ union { poly8x16x4_t __i; __builtin_aarch64_simd_xi __o; } __au; ++ __au.__o ++ = __builtin_aarch64_ld1x4v16qi ((const __builtin_aarch64_simd_qi *) __a); ++ return __au.__i; ++} ++ ++__extension__ extern __inline poly16x4x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld1_p16_x4 (const poly16_t *__a) ++{ ++ union { poly16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __au; ++ __au.__o ++ = __builtin_aarch64_ld1x4v4hi ((const __builtin_aarch64_simd_hi *) __a); ++ return __au.__i; ++} ++ ++__extension__ extern __inline poly16x8x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld1q_p16_x4 (const poly16_t *__a) ++{ ++ union { poly16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __au; ++ __au.__o ++ = __builtin_aarch64_ld1x4v8hi ((const __builtin_aarch64_simd_hi *) __a); ++ return __au.__i; ++} ++ ++__extension__ extern __inline int64x1x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld1_s64_x4 (const int64_t *__a) ++{ ++ union { int64x1x4_t __i; __builtin_aarch64_simd_xi __o; } __au; ++ __au.__o ++ = __builtin_aarch64_ld1x4di ((const __builtin_aarch64_simd_di *) __a); ++ return __au.__i; ++} ++ 
++__extension__ extern __inline uint64x1x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld1_u64_x4 (const uint64_t *__a) ++{ ++ union { uint64x1x4_t __i; __builtin_aarch64_simd_xi __o; } __au; ++ __au.__o ++ = __builtin_aarch64_ld1x4di ((const __builtin_aarch64_simd_di *) __a); ++ return __au.__i; ++} ++ ++__extension__ extern __inline poly64x1x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld1_p64_x4 (const poly64_t *__a) ++{ ++ union { poly64x1x4_t __i; __builtin_aarch64_simd_xi __o; } __au; ++ __au.__o ++ = __builtin_aarch64_ld1x4di ((const __builtin_aarch64_simd_di *) __a); ++ return __au.__i; ++} ++ ++__extension__ extern __inline int64x2x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld1q_s64_x4 (const int64_t *__a) ++{ ++ union { int64x2x4_t __i; __builtin_aarch64_simd_xi __o; } __au; ++ __au.__o ++ = __builtin_aarch64_ld1x4v2di ((const __builtin_aarch64_simd_di *) __a); ++ return __au.__i; ++} ++ ++__extension__ extern __inline uint64x2x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld1q_u64_x4 (const uint64_t *__a) ++{ ++ union { uint64x2x4_t __i; __builtin_aarch64_simd_xi __o; } __au; ++ __au.__o ++ = __builtin_aarch64_ld1x4v2di ((const __builtin_aarch64_simd_di *) __a); ++ return __au.__i; ++} ++ ++__extension__ extern __inline poly64x2x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld1q_p64_x4 (const poly64_t *__a) ++{ ++ union { poly64x2x4_t __i; __builtin_aarch64_simd_xi __o; } __au; ++ __au.__o ++ = __builtin_aarch64_ld1x4v2di ((const __builtin_aarch64_simd_di *) __a); ++ return __au.__i; ++} ++ ++__extension__ extern __inline float64x1x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld1_f64_x4 (const float64_t *__a) ++{ ++ union { float64x1x4_t __i; __builtin_aarch64_simd_xi __o; } __au; ++ __au.__o ++ = __builtin_aarch64_ld1x4df ((const __builtin_aarch64_simd_df *) __a); ++ return __au.__i; ++} ++ ++__extension__ extern __inline float64x2x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld1q_f64_x4 (const float64_t *__a) ++{ ++ union { float64x2x4_t __i; __builtin_aarch64_simd_xi __o; } __au; ++ __au.__o ++ = __builtin_aarch64_ld1x4v2df ((const __builtin_aarch64_simd_df *) __a); ++ return __au.__i; + } + + /* vld1_dup */ +@@ -21115,328 +21430,328 @@ vmulxd_laneq_f64 (float64_t __a, float64x2_t __v, const int __lane) + + __extension__ extern __inline int8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpmax_s8 (int8x8_t a, int8x8_t b) ++vpmax_s8 (int8x8_t __a, int8x8_t __b) + { +- return __builtin_aarch64_smaxpv8qi (a, b); ++ return __builtin_aarch64_smaxpv8qi (__a, __b); + } + + __extension__ extern __inline int16x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpmax_s16 (int16x4_t a, int16x4_t b) ++vpmax_s16 (int16x4_t __a, int16x4_t __b) + { +- return __builtin_aarch64_smaxpv4hi (a, b); ++ return __builtin_aarch64_smaxpv4hi (__a, __b); + } + + __extension__ extern __inline int32x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpmax_s32 (int32x2_t a, int32x2_t b) ++vpmax_s32 (int32x2_t __a, int32x2_t __b) + { +- return __builtin_aarch64_smaxpv2si (a, b); ++ return __builtin_aarch64_smaxpv2si (__a, __b); + } + + __extension__ extern __inline uint8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpmax_u8 (uint8x8_t a, uint8x8_t b) ++vpmax_u8 (uint8x8_t __a, 
uint8x8_t __b) + { +- return (uint8x8_t) __builtin_aarch64_umaxpv8qi ((int8x8_t) a, +- (int8x8_t) b); ++ return (uint8x8_t) __builtin_aarch64_umaxpv8qi ((int8x8_t) __a, ++ (int8x8_t) __b); + } + + __extension__ extern __inline uint16x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpmax_u16 (uint16x4_t a, uint16x4_t b) ++vpmax_u16 (uint16x4_t __a, uint16x4_t __b) + { +- return (uint16x4_t) __builtin_aarch64_umaxpv4hi ((int16x4_t) a, +- (int16x4_t) b); ++ return (uint16x4_t) __builtin_aarch64_umaxpv4hi ((int16x4_t) __a, ++ (int16x4_t) __b); + } + + __extension__ extern __inline uint32x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpmax_u32 (uint32x2_t a, uint32x2_t b) ++vpmax_u32 (uint32x2_t __a, uint32x2_t __b) + { +- return (uint32x2_t) __builtin_aarch64_umaxpv2si ((int32x2_t) a, +- (int32x2_t) b); ++ return (uint32x2_t) __builtin_aarch64_umaxpv2si ((int32x2_t) __a, ++ (int32x2_t) __b); + } + + __extension__ extern __inline int8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpmaxq_s8 (int8x16_t a, int8x16_t b) ++vpmaxq_s8 (int8x16_t __a, int8x16_t __b) + { +- return __builtin_aarch64_smaxpv16qi (a, b); ++ return __builtin_aarch64_smaxpv16qi (__a, __b); + } + + __extension__ extern __inline int16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpmaxq_s16 (int16x8_t a, int16x8_t b) ++vpmaxq_s16 (int16x8_t __a, int16x8_t __b) + { +- return __builtin_aarch64_smaxpv8hi (a, b); ++ return __builtin_aarch64_smaxpv8hi (__a, __b); + } + + __extension__ extern __inline int32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpmaxq_s32 (int32x4_t a, int32x4_t b) ++vpmaxq_s32 (int32x4_t __a, int32x4_t __b) + { +- return __builtin_aarch64_smaxpv4si (a, b); ++ return __builtin_aarch64_smaxpv4si (__a, __b); + } + + __extension__ extern __inline uint8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpmaxq_u8 (uint8x16_t a, uint8x16_t b) ++vpmaxq_u8 (uint8x16_t __a, uint8x16_t __b) + { +- return (uint8x16_t) __builtin_aarch64_umaxpv16qi ((int8x16_t) a, +- (int8x16_t) b); ++ return (uint8x16_t) __builtin_aarch64_umaxpv16qi ((int8x16_t) __a, ++ (int8x16_t) __b); + } + + __extension__ extern __inline uint16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpmaxq_u16 (uint16x8_t a, uint16x8_t b) ++vpmaxq_u16 (uint16x8_t __a, uint16x8_t __b) + { +- return (uint16x8_t) __builtin_aarch64_umaxpv8hi ((int16x8_t) a, +- (int16x8_t) b); ++ return (uint16x8_t) __builtin_aarch64_umaxpv8hi ((int16x8_t) __a, ++ (int16x8_t) __b); + } + + __extension__ extern __inline uint32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpmaxq_u32 (uint32x4_t a, uint32x4_t b) ++vpmaxq_u32 (uint32x4_t __a, uint32x4_t __b) + { +- return (uint32x4_t) __builtin_aarch64_umaxpv4si ((int32x4_t) a, +- (int32x4_t) b); ++ return (uint32x4_t) __builtin_aarch64_umaxpv4si ((int32x4_t) __a, ++ (int32x4_t) __b); + } + + __extension__ extern __inline float32x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpmax_f32 (float32x2_t a, float32x2_t b) ++vpmax_f32 (float32x2_t __a, float32x2_t __b) + { +- return __builtin_aarch64_smax_nanpv2sf (a, b); ++ return __builtin_aarch64_smax_nanpv2sf (__a, __b); + } + + __extension__ extern __inline float32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpmaxq_f32 (float32x4_t a, float32x4_t b) ++vpmaxq_f32 (float32x4_t __a, float32x4_t 
__b) + { +- return __builtin_aarch64_smax_nanpv4sf (a, b); ++ return __builtin_aarch64_smax_nanpv4sf (__a, __b); + } + + __extension__ extern __inline float64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpmaxq_f64 (float64x2_t a, float64x2_t b) ++vpmaxq_f64 (float64x2_t __a, float64x2_t __b) + { +- return __builtin_aarch64_smax_nanpv2df (a, b); ++ return __builtin_aarch64_smax_nanpv2df (__a, __b); + } + + __extension__ extern __inline float64_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpmaxqd_f64 (float64x2_t a) ++vpmaxqd_f64 (float64x2_t __a) + { +- return __builtin_aarch64_reduc_smax_nan_scal_v2df (a); ++ return __builtin_aarch64_reduc_smax_nan_scal_v2df (__a); + } + + __extension__ extern __inline float32_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpmaxs_f32 (float32x2_t a) ++vpmaxs_f32 (float32x2_t __a) + { +- return __builtin_aarch64_reduc_smax_nan_scal_v2sf (a); ++ return __builtin_aarch64_reduc_smax_nan_scal_v2sf (__a); + } + + /* vpmaxnm */ + + __extension__ extern __inline float32x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpmaxnm_f32 (float32x2_t a, float32x2_t b) ++vpmaxnm_f32 (float32x2_t __a, float32x2_t __b) + { +- return __builtin_aarch64_smaxpv2sf (a, b); ++ return __builtin_aarch64_smaxpv2sf (__a, __b); + } + + __extension__ extern __inline float32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpmaxnmq_f32 (float32x4_t a, float32x4_t b) ++vpmaxnmq_f32 (float32x4_t __a, float32x4_t __b) + { +- return __builtin_aarch64_smaxpv4sf (a, b); ++ return __builtin_aarch64_smaxpv4sf (__a, __b); + } + + __extension__ extern __inline float64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpmaxnmq_f64 (float64x2_t a, float64x2_t b) ++vpmaxnmq_f64 (float64x2_t __a, float64x2_t __b) + { +- return __builtin_aarch64_smaxpv2df (a, b); ++ return __builtin_aarch64_smaxpv2df (__a, __b); + } + + __extension__ extern __inline float64_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpmaxnmqd_f64 (float64x2_t a) ++vpmaxnmqd_f64 (float64x2_t __a) + { +- return __builtin_aarch64_reduc_smax_scal_v2df (a); ++ return __builtin_aarch64_reduc_smax_scal_v2df (__a); + } + + __extension__ extern __inline float32_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpmaxnms_f32 (float32x2_t a) ++vpmaxnms_f32 (float32x2_t __a) + { +- return __builtin_aarch64_reduc_smax_scal_v2sf (a); ++ return __builtin_aarch64_reduc_smax_scal_v2sf (__a); + } + + /* vpmin */ + + __extension__ extern __inline int8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpmin_s8 (int8x8_t a, int8x8_t b) ++vpmin_s8 (int8x8_t __a, int8x8_t __b) + { +- return __builtin_aarch64_sminpv8qi (a, b); ++ return __builtin_aarch64_sminpv8qi (__a, __b); + } + + __extension__ extern __inline int16x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpmin_s16 (int16x4_t a, int16x4_t b) ++vpmin_s16 (int16x4_t __a, int16x4_t __b) + { +- return __builtin_aarch64_sminpv4hi (a, b); ++ return __builtin_aarch64_sminpv4hi (__a, __b); + } + + __extension__ extern __inline int32x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpmin_s32 (int32x2_t a, int32x2_t b) ++vpmin_s32 (int32x2_t __a, int32x2_t __b) + { +- return __builtin_aarch64_sminpv2si (a, b); ++ return __builtin_aarch64_sminpv2si (__a, __b); + } + + __extension__ extern __inline uint8x8_t + 
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpmin_u8 (uint8x8_t a, uint8x8_t b) ++vpmin_u8 (uint8x8_t __a, uint8x8_t __b) + { +- return (uint8x8_t) __builtin_aarch64_uminpv8qi ((int8x8_t) a, +- (int8x8_t) b); ++ return (uint8x8_t) __builtin_aarch64_uminpv8qi ((int8x8_t) __a, ++ (int8x8_t) __b); + } + + __extension__ extern __inline uint16x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpmin_u16 (uint16x4_t a, uint16x4_t b) ++vpmin_u16 (uint16x4_t __a, uint16x4_t __b) + { +- return (uint16x4_t) __builtin_aarch64_uminpv4hi ((int16x4_t) a, +- (int16x4_t) b); ++ return (uint16x4_t) __builtin_aarch64_uminpv4hi ((int16x4_t) __a, ++ (int16x4_t) __b); + } + + __extension__ extern __inline uint32x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpmin_u32 (uint32x2_t a, uint32x2_t b) ++vpmin_u32 (uint32x2_t __a, uint32x2_t __b) + { +- return (uint32x2_t) __builtin_aarch64_uminpv2si ((int32x2_t) a, +- (int32x2_t) b); ++ return (uint32x2_t) __builtin_aarch64_uminpv2si ((int32x2_t) __a, ++ (int32x2_t) __b); + } + + __extension__ extern __inline int8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpminq_s8 (int8x16_t a, int8x16_t b) ++vpminq_s8 (int8x16_t __a, int8x16_t __b) + { +- return __builtin_aarch64_sminpv16qi (a, b); ++ return __builtin_aarch64_sminpv16qi (__a, __b); + } + + __extension__ extern __inline int16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpminq_s16 (int16x8_t a, int16x8_t b) ++vpminq_s16 (int16x8_t __a, int16x8_t __b) + { +- return __builtin_aarch64_sminpv8hi (a, b); ++ return __builtin_aarch64_sminpv8hi (__a, __b); + } + + __extension__ extern __inline int32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpminq_s32 (int32x4_t a, int32x4_t b) ++vpminq_s32 (int32x4_t __a, int32x4_t __b) + { +- return __builtin_aarch64_sminpv4si (a, b); ++ return __builtin_aarch64_sminpv4si (__a, __b); + } + + __extension__ extern __inline uint8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpminq_u8 (uint8x16_t a, uint8x16_t b) ++vpminq_u8 (uint8x16_t __a, uint8x16_t __b) + { +- return (uint8x16_t) __builtin_aarch64_uminpv16qi ((int8x16_t) a, +- (int8x16_t) b); ++ return (uint8x16_t) __builtin_aarch64_uminpv16qi ((int8x16_t) __a, ++ (int8x16_t) __b); + } + + __extension__ extern __inline uint16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpminq_u16 (uint16x8_t a, uint16x8_t b) ++vpminq_u16 (uint16x8_t __a, uint16x8_t __b) + { +- return (uint16x8_t) __builtin_aarch64_uminpv8hi ((int16x8_t) a, +- (int16x8_t) b); ++ return (uint16x8_t) __builtin_aarch64_uminpv8hi ((int16x8_t) __a, ++ (int16x8_t) __b); + } + + __extension__ extern __inline uint32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpminq_u32 (uint32x4_t a, uint32x4_t b) ++vpminq_u32 (uint32x4_t __a, uint32x4_t __b) + { +- return (uint32x4_t) __builtin_aarch64_uminpv4si ((int32x4_t) a, +- (int32x4_t) b); ++ return (uint32x4_t) __builtin_aarch64_uminpv4si ((int32x4_t) __a, ++ (int32x4_t) __b); + } + + __extension__ extern __inline float32x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpmin_f32 (float32x2_t a, float32x2_t b) ++vpmin_f32 (float32x2_t __a, float32x2_t __b) + { +- return __builtin_aarch64_smin_nanpv2sf (a, b); ++ return __builtin_aarch64_smin_nanpv2sf (__a, __b); + } + + __extension__ extern __inline float32x4_t + __attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) +-vpminq_f32 (float32x4_t a, float32x4_t b) ++vpminq_f32 (float32x4_t __a, float32x4_t __b) + { +- return __builtin_aarch64_smin_nanpv4sf (a, b); ++ return __builtin_aarch64_smin_nanpv4sf (__a, __b); + } + + __extension__ extern __inline float64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpminq_f64 (float64x2_t a, float64x2_t b) ++vpminq_f64 (float64x2_t __a, float64x2_t __b) + { +- return __builtin_aarch64_smin_nanpv2df (a, b); ++ return __builtin_aarch64_smin_nanpv2df (__a, __b); + } + + __extension__ extern __inline float64_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpminqd_f64 (float64x2_t a) ++vpminqd_f64 (float64x2_t __a) + { +- return __builtin_aarch64_reduc_smin_nan_scal_v2df (a); ++ return __builtin_aarch64_reduc_smin_nan_scal_v2df (__a); + } + + __extension__ extern __inline float32_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpmins_f32 (float32x2_t a) ++vpmins_f32 (float32x2_t __a) + { +- return __builtin_aarch64_reduc_smin_nan_scal_v2sf (a); ++ return __builtin_aarch64_reduc_smin_nan_scal_v2sf (__a); + } + + /* vpminnm */ + + __extension__ extern __inline float32x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpminnm_f32 (float32x2_t a, float32x2_t b) ++vpminnm_f32 (float32x2_t __a, float32x2_t __b) + { +- return __builtin_aarch64_sminpv2sf (a, b); ++ return __builtin_aarch64_sminpv2sf (__a, __b); + } + + __extension__ extern __inline float32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpminnmq_f32 (float32x4_t a, float32x4_t b) ++vpminnmq_f32 (float32x4_t __a, float32x4_t __b) + { +- return __builtin_aarch64_sminpv4sf (a, b); ++ return __builtin_aarch64_sminpv4sf (__a, __b); + } + + __extension__ extern __inline float64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpminnmq_f64 (float64x2_t a, float64x2_t b) ++vpminnmq_f64 (float64x2_t __a, float64x2_t __b) + { +- return __builtin_aarch64_sminpv2df (a, b); ++ return __builtin_aarch64_sminpv2df (__a, __b); + } + + __extension__ extern __inline float64_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpminnmqd_f64 (float64x2_t a) ++vpminnmqd_f64 (float64x2_t __a) + { +- return __builtin_aarch64_reduc_smin_scal_v2df (a); ++ return __builtin_aarch64_reduc_smin_scal_v2df (__a); + } + + __extension__ extern __inline float32_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpminnms_f32 (float32x2_t a) ++vpminnms_f32 (float32x2_t __a) + { +- return __builtin_aarch64_reduc_smin_scal_v2sf (a); ++ return __builtin_aarch64_reduc_smin_scal_v2sf (__a); + } + + /* vmaxnm */ +@@ -21889,9 +22204,9 @@ vminnmvq_f64 (float64x2_t __a) + + __extension__ extern __inline float32x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmla_f32 (float32x2_t a, float32x2_t b, float32x2_t c) ++vmla_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c) + { +- return a + b * c; ++ return __a + __b * __c; + } + + __extension__ extern __inline float64x1_t +@@ -21903,16 +22218,16 @@ vmla_f64 (float64x1_t __a, float64x1_t __b, float64x1_t __c) + + __extension__ extern __inline float32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlaq_f32 (float32x4_t a, float32x4_t b, float32x4_t c) ++vmlaq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c) + { +- return a + b * c; ++ return __a + __b * __c; + } + + __extension__ 
extern __inline float64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlaq_f64 (float64x2_t a, float64x2_t b, float64x2_t c) ++vmlaq_f64 (float64x2_t __a, float64x2_t __b, float64x2_t __c) + { +- return a + b * c; ++ return __a + __b * __c; + } + + /* vmla_lane */ +@@ -22087,9 +22402,9 @@ vmlaq_laneq_u32 (uint32x4_t __a, uint32x4_t __b, + + __extension__ extern __inline float32x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmls_f32 (float32x2_t a, float32x2_t b, float32x2_t c) ++vmls_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c) + { +- return a - b * c; ++ return __a - __b * __c; + } + + __extension__ extern __inline float64x1_t +@@ -22101,16 +22416,16 @@ vmls_f64 (float64x1_t __a, float64x1_t __b, float64x1_t __c) + + __extension__ extern __inline float32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlsq_f32 (float32x4_t a, float32x4_t b, float32x4_t c) ++vmlsq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c) + { +- return a - b * c; ++ return __a - __b * __c; + } + + __extension__ extern __inline float64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmlsq_f64 (float64x2_t a, float64x2_t b, float64x2_t c) ++vmlsq_f64 (float64x2_t __a, float64x2_t __b, float64x2_t __c) + { +- return a - b * c; ++ return __a - __b * __c; + } + + /* vmls_lane */ +@@ -24874,419 +25189,419 @@ vqsubd_u64 (uint64_t __a, uint64_t __b) + + __extension__ extern __inline int8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqtbl2_s8 (int8x16x2_t tab, uint8x8_t idx) ++vqtbl2_s8 (int8x16x2_t __tab, uint8x8_t __idx) + { + __builtin_aarch64_simd_oi __o; +- __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[1], 1); +- return __builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx); ++ __o = __builtin_aarch64_set_qregoiv16qi (__o, __tab.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv16qi (__o, __tab.val[1], 1); ++ return __builtin_aarch64_tbl3v8qi (__o, (int8x8_t)__idx); + } + + __extension__ extern __inline uint8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqtbl2_u8 (uint8x16x2_t tab, uint8x8_t idx) ++vqtbl2_u8 (uint8x16x2_t __tab, uint8x8_t __idx) + { + __builtin_aarch64_simd_oi __o; +- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1); +- return (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx); ++ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1); ++ return (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)__idx); + } + + __extension__ extern __inline poly8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqtbl2_p8 (poly8x16x2_t tab, uint8x8_t idx) ++vqtbl2_p8 (poly8x16x2_t __tab, uint8x8_t __idx) + { + __builtin_aarch64_simd_oi __o; +- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1); +- return (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx); ++ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1); ++ return (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)__idx); + } + + __extension__ extern __inline int8x16_t + 
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqtbl2q_s8 (int8x16x2_t tab, uint8x16_t idx) ++vqtbl2q_s8 (int8x16x2_t __tab, uint8x16_t __idx) + { + __builtin_aarch64_simd_oi __o; +- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1); +- return __builtin_aarch64_tbl3v16qi (__o, (int8x16_t)idx); ++ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1); ++ return __builtin_aarch64_tbl3v16qi (__o, (int8x16_t)__idx); + } + + __extension__ extern __inline uint8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqtbl2q_u8 (uint8x16x2_t tab, uint8x16_t idx) ++vqtbl2q_u8 (uint8x16x2_t __tab, uint8x16_t __idx) + { + __builtin_aarch64_simd_oi __o; +- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1); +- return (uint8x16_t)__builtin_aarch64_tbl3v16qi (__o, (int8x16_t)idx); ++ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1); ++ return (uint8x16_t)__builtin_aarch64_tbl3v16qi (__o, (int8x16_t)__idx); + } + + __extension__ extern __inline poly8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqtbl2q_p8 (poly8x16x2_t tab, uint8x16_t idx) ++vqtbl2q_p8 (poly8x16x2_t __tab, uint8x16_t __idx) + { + __builtin_aarch64_simd_oi __o; +- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1); +- return (poly8x16_t)__builtin_aarch64_tbl3v16qi (__o, (int8x16_t)idx); ++ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1); ++ return (poly8x16_t)__builtin_aarch64_tbl3v16qi (__o, (int8x16_t)__idx); + } + + /* vqtbl3 */ + + __extension__ extern __inline int8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqtbl3_s8 (int8x16x3_t tab, uint8x8_t idx) ++vqtbl3_s8 (int8x16x3_t __tab, uint8x8_t __idx) + { + __builtin_aarch64_simd_ci __o; +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); +- return __builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)idx); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2); ++ return __builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)__idx); + } + + __extension__ extern __inline uint8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqtbl3_u8 (uint8x16x3_t tab, uint8x8_t idx) ++vqtbl3_u8 (uint8x16x3_t __tab, uint8x8_t __idx) + { + __builtin_aarch64_simd_ci __o; +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); +- return (uint8x8_t)__builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)idx); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0); ++ __o = 
__builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2); ++ return (uint8x8_t)__builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)__idx); + } + + __extension__ extern __inline poly8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqtbl3_p8 (poly8x16x3_t tab, uint8x8_t idx) ++vqtbl3_p8 (poly8x16x3_t __tab, uint8x8_t __idx) + { + __builtin_aarch64_simd_ci __o; +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); +- return (poly8x8_t)__builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)idx); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2); ++ return (poly8x8_t)__builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)__idx); + } + + __extension__ extern __inline int8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqtbl3q_s8 (int8x16x3_t tab, uint8x16_t idx) ++vqtbl3q_s8 (int8x16x3_t __tab, uint8x16_t __idx) + { + __builtin_aarch64_simd_ci __o; +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); +- return __builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)idx); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2); ++ return __builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)__idx); + } + + __extension__ extern __inline uint8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqtbl3q_u8 (uint8x16x3_t tab, uint8x16_t idx) ++vqtbl3q_u8 (uint8x16x3_t __tab, uint8x16_t __idx) + { + __builtin_aarch64_simd_ci __o; +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); +- return (uint8x16_t)__builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)idx); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2); ++ return (uint8x16_t)__builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)__idx); + } + + __extension__ extern __inline poly8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqtbl3q_p8 (poly8x16x3_t tab, uint8x16_t idx) ++vqtbl3q_p8 (poly8x16x3_t __tab, uint8x16_t __idx) + { + __builtin_aarch64_simd_ci __o; +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); +- return (poly8x16_t)__builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)idx); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, 
(int8x16_t)__tab.val[2], 2); ++ return (poly8x16_t)__builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)__idx); + } + + /* vqtbl4 */ + + __extension__ extern __inline int8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqtbl4_s8 (int8x16x4_t tab, uint8x8_t idx) ++vqtbl4_s8 (int8x16x4_t __tab, uint8x8_t __idx) + { + __builtin_aarch64_simd_xi __o; +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); +- return __builtin_aarch64_qtbl4v8qi (__o, (int8x8_t)idx); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3); ++ return __builtin_aarch64_qtbl4v8qi (__o, (int8x8_t)__idx); + } + + __extension__ extern __inline uint8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqtbl4_u8 (uint8x16x4_t tab, uint8x8_t idx) ++vqtbl4_u8 (uint8x16x4_t __tab, uint8x8_t __idx) + { + __builtin_aarch64_simd_xi __o; +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); +- return (uint8x8_t)__builtin_aarch64_qtbl4v8qi (__o, (int8x8_t)idx); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3); ++ return (uint8x8_t)__builtin_aarch64_qtbl4v8qi (__o, (int8x8_t)__idx); + } + + __extension__ extern __inline poly8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqtbl4_p8 (poly8x16x4_t tab, uint8x8_t idx) ++vqtbl4_p8 (poly8x16x4_t __tab, uint8x8_t __idx) + { + __builtin_aarch64_simd_xi __o; +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); +- return (poly8x8_t)__builtin_aarch64_qtbl4v8qi (__o, (int8x8_t)idx); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3); ++ return (poly8x8_t)__builtin_aarch64_qtbl4v8qi (__o, (int8x8_t)__idx); + } + + __extension__ extern __inline int8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqtbl4q_s8 (int8x16x4_t tab, uint8x16_t idx) ++vqtbl4q_s8 (int8x16x4_t __tab, uint8x16_t __idx) + { + __builtin_aarch64_simd_xi __o; +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); +- __o = 
__builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); +- return __builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)idx); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3); ++ return __builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)__idx); + } + + __extension__ extern __inline uint8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqtbl4q_u8 (uint8x16x4_t tab, uint8x16_t idx) ++vqtbl4q_u8 (uint8x16x4_t __tab, uint8x16_t __idx) + { + __builtin_aarch64_simd_xi __o; +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); +- return (uint8x16_t)__builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)idx); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3); ++ return (uint8x16_t)__builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)__idx); + } + + __extension__ extern __inline poly8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqtbl4q_p8 (poly8x16x4_t tab, uint8x16_t idx) ++vqtbl4q_p8 (poly8x16x4_t __tab, uint8x16_t __idx) + { + __builtin_aarch64_simd_xi __o; +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); +- return (poly8x16_t)__builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)idx); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3); ++ return (poly8x16_t)__builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)__idx); + } + + + /* vqtbx2 */ + __extension__ extern __inline int8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqtbx2_s8 (int8x8_t r, int8x16x2_t tab, uint8x8_t idx) ++vqtbx2_s8 (int8x8_t __r, int8x16x2_t __tab, uint8x8_t __idx) + { + __builtin_aarch64_simd_oi __o; +- __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[1], 1); +- return __builtin_aarch64_tbx4v8qi (r, __o, (int8x8_t)idx); ++ __o = __builtin_aarch64_set_qregoiv16qi (__o, __tab.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv16qi (__o, __tab.val[1], 1); ++ return __builtin_aarch64_tbx4v8qi (__r, __o, (int8x8_t)__idx); + } + + __extension__ extern __inline uint8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqtbx2_u8 (uint8x8_t r, uint8x16x2_t tab, uint8x8_t idx) ++vqtbx2_u8 (uint8x8_t __r, uint8x16x2_t __tab, uint8x8_t __idx) + { 
+ __builtin_aarch64_simd_oi __o; +- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1); +- return (uint8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)r, __o, +- (int8x8_t)idx); ++ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1); ++ return (uint8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)__r, __o, ++ (int8x8_t)__idx); + } + + __extension__ extern __inline poly8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqtbx2_p8 (poly8x8_t r, poly8x16x2_t tab, uint8x8_t idx) ++vqtbx2_p8 (poly8x8_t __r, poly8x16x2_t __tab, uint8x8_t __idx) + { + __builtin_aarch64_simd_oi __o; +- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1); +- return (poly8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)r, __o, +- (int8x8_t)idx); ++ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1); ++ return (poly8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)__r, __o, ++ (int8x8_t)__idx); + } + + __extension__ extern __inline int8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqtbx2q_s8 (int8x16_t r, int8x16x2_t tab, uint8x16_t idx) ++vqtbx2q_s8 (int8x16_t __r, int8x16x2_t __tab, uint8x16_t __idx) + { + __builtin_aarch64_simd_oi __o; +- __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[1], 1); +- return __builtin_aarch64_tbx4v16qi (r, __o, (int8x16_t)idx); ++ __o = __builtin_aarch64_set_qregoiv16qi (__o, __tab.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv16qi (__o, __tab.val[1], 1); ++ return __builtin_aarch64_tbx4v16qi (__r, __o, (int8x16_t)__idx); + } + + __extension__ extern __inline uint8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqtbx2q_u8 (uint8x16_t r, uint8x16x2_t tab, uint8x16_t idx) ++vqtbx2q_u8 (uint8x16_t __r, uint8x16x2_t __tab, uint8x16_t __idx) + { + __builtin_aarch64_simd_oi __o; +- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1); +- return (uint8x16_t)__builtin_aarch64_tbx4v16qi ((int8x16_t)r, __o, +- (int8x16_t)idx); ++ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1); ++ return (uint8x16_t)__builtin_aarch64_tbx4v16qi ((int8x16_t)__r, __o, ++ (int8x16_t)__idx); + } + + __extension__ extern __inline poly8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqtbx2q_p8 (poly8x16_t r, poly8x16x2_t tab, uint8x16_t idx) ++vqtbx2q_p8 (poly8x16_t __r, poly8x16x2_t __tab, uint8x16_t __idx) + { + __builtin_aarch64_simd_oi __o; +- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1); +- return (poly8x16_t)__builtin_aarch64_tbx4v16qi ((int8x16_t)r, __o, +- (int8x16_t)idx); ++ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1); ++ return (poly8x16_t)__builtin_aarch64_tbx4v16qi ((int8x16_t)__r, __o, ++ (int8x16_t)__idx); + } + + /* vqtbx3 */ + __extension__ 
extern __inline int8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqtbx3_s8 (int8x8_t r, int8x16x3_t tab, uint8x8_t idx) ++vqtbx3_s8 (int8x8_t __r, int8x16x3_t __tab, uint8x8_t __idx) + { + __builtin_aarch64_simd_ci __o; +- __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[0], 0); +- __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[1], 1); +- __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[2], 2); +- return __builtin_aarch64_qtbx3v8qi (r, __o, (int8x8_t)idx); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, __tab.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, __tab.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, __tab.val[2], 2); ++ return __builtin_aarch64_qtbx3v8qi (__r, __o, (int8x8_t)__idx); + } + + __extension__ extern __inline uint8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqtbx3_u8 (uint8x8_t r, uint8x16x3_t tab, uint8x8_t idx) ++vqtbx3_u8 (uint8x8_t __r, uint8x16x3_t __tab, uint8x8_t __idx) + { + __builtin_aarch64_simd_ci __o; +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); +- return (uint8x8_t)__builtin_aarch64_qtbx3v8qi ((int8x8_t)r, __o, +- (int8x8_t)idx); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2); ++ return (uint8x8_t)__builtin_aarch64_qtbx3v8qi ((int8x8_t)__r, __o, ++ (int8x8_t)__idx); + } + + __extension__ extern __inline poly8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqtbx3_p8 (poly8x8_t r, poly8x16x3_t tab, uint8x8_t idx) ++vqtbx3_p8 (poly8x8_t __r, poly8x16x3_t __tab, uint8x8_t __idx) + { + __builtin_aarch64_simd_ci __o; +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); +- return (poly8x8_t)__builtin_aarch64_qtbx3v8qi ((int8x8_t)r, __o, +- (int8x8_t)idx); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2); ++ return (poly8x8_t)__builtin_aarch64_qtbx3v8qi ((int8x8_t)__r, __o, ++ (int8x8_t)__idx); + } + + __extension__ extern __inline int8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqtbx3q_s8 (int8x16_t r, int8x16x3_t tab, uint8x16_t idx) ++vqtbx3q_s8 (int8x16_t __r, int8x16x3_t __tab, uint8x16_t __idx) + { + __builtin_aarch64_simd_ci __o; +- __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[0], 0); +- __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[1], 1); +- __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[2], 2); +- return __builtin_aarch64_qtbx3v16qi (r, __o, (int8x16_t)idx); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, __tab.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, __tab.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, __tab.val[2], 2); ++ return __builtin_aarch64_qtbx3v16qi (__r, __o, (int8x16_t)__idx); + } + + __extension__ extern __inline uint8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) 
+-vqtbx3q_u8 (uint8x16_t r, uint8x16x3_t tab, uint8x16_t idx) ++vqtbx3q_u8 (uint8x16_t __r, uint8x16x3_t __tab, uint8x16_t __idx) + { + __builtin_aarch64_simd_ci __o; +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); +- return (uint8x16_t)__builtin_aarch64_qtbx3v16qi ((int8x16_t)r, __o, +- (int8x16_t)idx); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2); ++ return (uint8x16_t)__builtin_aarch64_qtbx3v16qi ((int8x16_t)__r, __o, ++ (int8x16_t)__idx); + } + + __extension__ extern __inline poly8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqtbx3q_p8 (poly8x16_t r, poly8x16x3_t tab, uint8x16_t idx) ++vqtbx3q_p8 (poly8x16_t __r, poly8x16x3_t __tab, uint8x16_t __idx) + { + __builtin_aarch64_simd_ci __o; +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); +- return (poly8x16_t)__builtin_aarch64_qtbx3v16qi ((int8x16_t)r, __o, +- (int8x16_t)idx); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2); ++ return (poly8x16_t)__builtin_aarch64_qtbx3v16qi ((int8x16_t)__r, __o, ++ (int8x16_t)__idx); + } + + /* vqtbx4 */ + + __extension__ extern __inline int8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqtbx4_s8 (int8x8_t r, int8x16x4_t tab, uint8x8_t idx) ++vqtbx4_s8 (int8x8_t __r, int8x16x4_t __tab, uint8x8_t __idx) + { + __builtin_aarch64_simd_xi __o; +- __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[0], 0); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[1], 1); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[2], 2); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[3], 3); +- return __builtin_aarch64_qtbx4v8qi (r, __o, (int8x8_t)idx); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, __tab.val[0], 0); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, __tab.val[1], 1); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, __tab.val[2], 2); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, __tab.val[3], 3); ++ return __builtin_aarch64_qtbx4v8qi (__r, __o, (int8x8_t)__idx); + } + + __extension__ extern __inline uint8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqtbx4_u8 (uint8x8_t r, uint8x16x4_t tab, uint8x8_t idx) ++vqtbx4_u8 (uint8x8_t __r, uint8x16x4_t __tab, uint8x8_t __idx) + { + __builtin_aarch64_simd_xi __o; +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); +- return (uint8x8_t)__builtin_aarch64_qtbx4v8qi ((int8x8_t)r, __o, +- (int8x8_t)idx); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1); ++ __o = 
__builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3); ++ return (uint8x8_t)__builtin_aarch64_qtbx4v8qi ((int8x8_t)__r, __o, ++ (int8x8_t)__idx); + } + + __extension__ extern __inline poly8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqtbx4_p8 (poly8x8_t r, poly8x16x4_t tab, uint8x8_t idx) ++vqtbx4_p8 (poly8x8_t __r, poly8x16x4_t __tab, uint8x8_t __idx) + { + __builtin_aarch64_simd_xi __o; +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); +- return (poly8x8_t)__builtin_aarch64_qtbx4v8qi ((int8x8_t)r, __o, +- (int8x8_t)idx); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3); ++ return (poly8x8_t)__builtin_aarch64_qtbx4v8qi ((int8x8_t)__r, __o, ++ (int8x8_t)__idx); + } + + __extension__ extern __inline int8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqtbx4q_s8 (int8x16_t r, int8x16x4_t tab, uint8x16_t idx) ++vqtbx4q_s8 (int8x16_t __r, int8x16x4_t __tab, uint8x16_t __idx) + { + __builtin_aarch64_simd_xi __o; +- __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[0], 0); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[1], 1); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[2], 2); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[3], 3); +- return __builtin_aarch64_qtbx4v16qi (r, __o, (int8x16_t)idx); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, __tab.val[0], 0); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, __tab.val[1], 1); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, __tab.val[2], 2); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, __tab.val[3], 3); ++ return __builtin_aarch64_qtbx4v16qi (__r, __o, (int8x16_t)__idx); + } + + __extension__ extern __inline uint8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqtbx4q_u8 (uint8x16_t r, uint8x16x4_t tab, uint8x16_t idx) ++vqtbx4q_u8 (uint8x16_t __r, uint8x16x4_t __tab, uint8x16_t __idx) + { + __builtin_aarch64_simd_xi __o; +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); +- return (uint8x16_t)__builtin_aarch64_qtbx4v16qi ((int8x16_t)r, __o, +- (int8x16_t)idx); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3); ++ return (uint8x16_t)__builtin_aarch64_qtbx4v16qi ((int8x16_t)__r, __o, ++ (int8x16_t)__idx); + } + + __extension__ extern __inline poly8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vqtbx4q_p8 (poly8x16_t r, poly8x16x4_t tab, uint8x16_t idx) ++vqtbx4q_p8 (poly8x16_t __r, 
poly8x16x4_t __tab, uint8x16_t __idx) + { + __builtin_aarch64_simd_xi __o; +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); +- return (poly8x16_t)__builtin_aarch64_qtbx4v16qi ((int8x16_t)r, __o, +- (int8x16_t)idx); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3); ++ return (poly8x16_t)__builtin_aarch64_qtbx4v16qi ((int8x16_t)__r, __o, ++ (int8x16_t)__idx); + } + + /* vrbit */ +@@ -25457,134 +25772,134 @@ vrecpxd_f64 (float64_t __a) + + __extension__ extern __inline poly8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vrev16_p8 (poly8x8_t a) ++vrev16_p8 (poly8x8_t __a) + { +- return __builtin_shuffle (a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 }); ++ return __builtin_shuffle (__a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 }); + } + + __extension__ extern __inline int8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vrev16_s8 (int8x8_t a) ++vrev16_s8 (int8x8_t __a) + { +- return __builtin_shuffle (a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 }); ++ return __builtin_shuffle (__a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 }); + } + + __extension__ extern __inline uint8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vrev16_u8 (uint8x8_t a) ++vrev16_u8 (uint8x8_t __a) + { +- return __builtin_shuffle (a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 }); ++ return __builtin_shuffle (__a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 }); + } + + __extension__ extern __inline poly8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vrev16q_p8 (poly8x16_t a) ++vrev16q_p8 (poly8x16_t __a) + { +- return __builtin_shuffle (a, ++ return __builtin_shuffle (__a, + (uint8x16_t) { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 }); + } + + __extension__ extern __inline int8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vrev16q_s8 (int8x16_t a) ++vrev16q_s8 (int8x16_t __a) + { +- return __builtin_shuffle (a, ++ return __builtin_shuffle (__a, + (uint8x16_t) { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 }); + } + + __extension__ extern __inline uint8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vrev16q_u8 (uint8x16_t a) ++vrev16q_u8 (uint8x16_t __a) + { +- return __builtin_shuffle (a, ++ return __builtin_shuffle (__a, + (uint8x16_t) { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 }); + } + + __extension__ extern __inline poly8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vrev32_p8 (poly8x8_t a) ++vrev32_p8 (poly8x8_t __a) + { +- return __builtin_shuffle (a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); ++ return __builtin_shuffle (__a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); + } + + __extension__ extern __inline poly16x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vrev32_p16 (poly16x4_t a) ++vrev32_p16 (poly16x4_t __a) + { +- return __builtin_shuffle (a, (uint16x4_t) { 1, 0, 3, 2 }); ++ return __builtin_shuffle (__a, (uint16x4_t) { 1, 0, 3, 2 }); + } + + __extension__ extern __inline 
int8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vrev32_s8 (int8x8_t a) ++vrev32_s8 (int8x8_t __a) + { +- return __builtin_shuffle (a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); ++ return __builtin_shuffle (__a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); + } + + __extension__ extern __inline int16x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vrev32_s16 (int16x4_t a) ++vrev32_s16 (int16x4_t __a) + { +- return __builtin_shuffle (a, (uint16x4_t) { 1, 0, 3, 2 }); ++ return __builtin_shuffle (__a, (uint16x4_t) { 1, 0, 3, 2 }); + } + + __extension__ extern __inline uint8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vrev32_u8 (uint8x8_t a) ++vrev32_u8 (uint8x8_t __a) + { +- return __builtin_shuffle (a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); ++ return __builtin_shuffle (__a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); + } + + __extension__ extern __inline uint16x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vrev32_u16 (uint16x4_t a) ++vrev32_u16 (uint16x4_t __a) + { +- return __builtin_shuffle (a, (uint16x4_t) { 1, 0, 3, 2 }); ++ return __builtin_shuffle (__a, (uint16x4_t) { 1, 0, 3, 2 }); + } + + __extension__ extern __inline poly8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vrev32q_p8 (poly8x16_t a) ++vrev32q_p8 (poly8x16_t __a) + { +- return __builtin_shuffle (a, ++ return __builtin_shuffle (__a, + (uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }); + } + + __extension__ extern __inline poly16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vrev32q_p16 (poly16x8_t a) ++vrev32q_p16 (poly16x8_t __a) + { +- return __builtin_shuffle (a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 }); ++ return __builtin_shuffle (__a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 }); + } + + __extension__ extern __inline int8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vrev32q_s8 (int8x16_t a) ++vrev32q_s8 (int8x16_t __a) + { +- return __builtin_shuffle (a, ++ return __builtin_shuffle (__a, + (uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }); + } + + __extension__ extern __inline int16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vrev32q_s16 (int16x8_t a) ++vrev32q_s16 (int16x8_t __a) + { +- return __builtin_shuffle (a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 }); ++ return __builtin_shuffle (__a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 }); + } + + __extension__ extern __inline uint8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vrev32q_u8 (uint8x16_t a) ++vrev32q_u8 (uint8x16_t __a) + { +- return __builtin_shuffle (a, ++ return __builtin_shuffle (__a, + (uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }); + } + + __extension__ extern __inline uint16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vrev32q_u16 (uint16x8_t a) ++vrev32q_u16 (uint16x8_t __a) + { +- return __builtin_shuffle (a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 }); ++ return __builtin_shuffle (__a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 }); + } + + __extension__ extern __inline float16x4_t +@@ -25596,65 +25911,65 @@ vrev64_f16 (float16x4_t __a) + + __extension__ extern __inline float32x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vrev64_f32 (float32x2_t a) ++vrev64_f32 (float32x2_t __a) + { +- return __builtin_shuffle (a, (uint32x2_t) { 1, 0 }); ++ return __builtin_shuffle (__a, 
(uint32x2_t) { 1, 0 }); + } + + __extension__ extern __inline poly8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vrev64_p8 (poly8x8_t a) ++vrev64_p8 (poly8x8_t __a) + { +- return __builtin_shuffle (a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 }); ++ return __builtin_shuffle (__a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 }); + } + + __extension__ extern __inline poly16x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vrev64_p16 (poly16x4_t a) ++vrev64_p16 (poly16x4_t __a) + { +- return __builtin_shuffle (a, (uint16x4_t) { 3, 2, 1, 0 }); ++ return __builtin_shuffle (__a, (uint16x4_t) { 3, 2, 1, 0 }); + } + + __extension__ extern __inline int8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vrev64_s8 (int8x8_t a) ++vrev64_s8 (int8x8_t __a) + { +- return __builtin_shuffle (a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 }); ++ return __builtin_shuffle (__a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 }); + } + + __extension__ extern __inline int16x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vrev64_s16 (int16x4_t a) ++vrev64_s16 (int16x4_t __a) + { +- return __builtin_shuffle (a, (uint16x4_t) { 3, 2, 1, 0 }); ++ return __builtin_shuffle (__a, (uint16x4_t) { 3, 2, 1, 0 }); + } + + __extension__ extern __inline int32x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vrev64_s32 (int32x2_t a) ++vrev64_s32 (int32x2_t __a) + { +- return __builtin_shuffle (a, (uint32x2_t) { 1, 0 }); ++ return __builtin_shuffle (__a, (uint32x2_t) { 1, 0 }); + } + + __extension__ extern __inline uint8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vrev64_u8 (uint8x8_t a) ++vrev64_u8 (uint8x8_t __a) + { +- return __builtin_shuffle (a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 }); ++ return __builtin_shuffle (__a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 }); + } + + __extension__ extern __inline uint16x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vrev64_u16 (uint16x4_t a) ++vrev64_u16 (uint16x4_t __a) + { +- return __builtin_shuffle (a, (uint16x4_t) { 3, 2, 1, 0 }); ++ return __builtin_shuffle (__a, (uint16x4_t) { 3, 2, 1, 0 }); + } + + __extension__ extern __inline uint32x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vrev64_u32 (uint32x2_t a) ++vrev64_u32 (uint32x2_t __a) + { +- return __builtin_shuffle (a, (uint32x2_t) { 1, 0 }); ++ return __builtin_shuffle (__a, (uint32x2_t) { 1, 0 }); + } + + __extension__ extern __inline float16x8_t +@@ -25666,68 +25981,68 @@ vrev64q_f16 (float16x8_t __a) + + __extension__ extern __inline float32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vrev64q_f32 (float32x4_t a) ++vrev64q_f32 (float32x4_t __a) + { +- return __builtin_shuffle (a, (uint32x4_t) { 1, 0, 3, 2 }); ++ return __builtin_shuffle (__a, (uint32x4_t) { 1, 0, 3, 2 }); + } + + __extension__ extern __inline poly8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vrev64q_p8 (poly8x16_t a) ++vrev64q_p8 (poly8x16_t __a) + { +- return __builtin_shuffle (a, ++ return __builtin_shuffle (__a, + (uint8x16_t) { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 }); + } + + __extension__ extern __inline poly16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vrev64q_p16 (poly16x8_t a) ++vrev64q_p16 (poly16x8_t __a) + { +- return __builtin_shuffle (a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); ++ return __builtin_shuffle (__a, (uint16x8_t) { 3, 2, 1, 0, 7, 
6, 5, 4 }); + } + + __extension__ extern __inline int8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vrev64q_s8 (int8x16_t a) ++vrev64q_s8 (int8x16_t __a) + { +- return __builtin_shuffle (a, ++ return __builtin_shuffle (__a, + (uint8x16_t) { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 }); + } + + __extension__ extern __inline int16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vrev64q_s16 (int16x8_t a) ++vrev64q_s16 (int16x8_t __a) + { +- return __builtin_shuffle (a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); ++ return __builtin_shuffle (__a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); + } + + __extension__ extern __inline int32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vrev64q_s32 (int32x4_t a) ++vrev64q_s32 (int32x4_t __a) + { +- return __builtin_shuffle (a, (uint32x4_t) { 1, 0, 3, 2 }); ++ return __builtin_shuffle (__a, (uint32x4_t) { 1, 0, 3, 2 }); + } + + __extension__ extern __inline uint8x16_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vrev64q_u8 (uint8x16_t a) ++vrev64q_u8 (uint8x16_t __a) + { +- return __builtin_shuffle (a, ++ return __builtin_shuffle (__a, + (uint8x16_t) { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 }); + } + + __extension__ extern __inline uint16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vrev64q_u16 (uint16x8_t a) ++vrev64q_u16 (uint16x8_t __a) + { +- return __builtin_shuffle (a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); ++ return __builtin_shuffle (__a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); + } + + __extension__ extern __inline uint32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vrev64q_u32 (uint32x4_t a) ++vrev64q_u32 (uint32x4_t __a) + { +- return __builtin_shuffle (a, (uint32x4_t) { 1, 0, 3, 2 }); ++ return __builtin_shuffle (__a, (uint32x4_t) { 1, 0, 3, 2 }); + } + + /* vrnd */ +@@ -26420,87 +26735,90 @@ vrsrad_n_u64 (uint64_t __a, uint64_t __b, const int __c) + + __extension__ extern __inline uint32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vsha1cq_u32 (uint32x4_t hash_abcd, uint32_t hash_e, uint32x4_t wk) ++vsha1cq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk) + { +- return __builtin_aarch64_crypto_sha1cv4si_uuuu (hash_abcd, hash_e, wk); ++ return __builtin_aarch64_crypto_sha1cv4si_uuuu (__hash_abcd, __hash_e, __wk); + } + + __extension__ extern __inline uint32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vsha1mq_u32 (uint32x4_t hash_abcd, uint32_t hash_e, uint32x4_t wk) ++vsha1mq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk) + { +- return __builtin_aarch64_crypto_sha1mv4si_uuuu (hash_abcd, hash_e, wk); ++ return __builtin_aarch64_crypto_sha1mv4si_uuuu (__hash_abcd, __hash_e, __wk); + } + + __extension__ extern __inline uint32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vsha1pq_u32 (uint32x4_t hash_abcd, uint32_t hash_e, uint32x4_t wk) ++vsha1pq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk) + { +- return __builtin_aarch64_crypto_sha1pv4si_uuuu (hash_abcd, hash_e, wk); ++ return __builtin_aarch64_crypto_sha1pv4si_uuuu (__hash_abcd, __hash_e, __wk); + } + + __extension__ extern __inline uint32_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vsha1h_u32 (uint32_t hash_e) ++vsha1h_u32 (uint32_t __hash_e) + { +- return __builtin_aarch64_crypto_sha1hsi_uu (hash_e); ++ return 
__builtin_aarch64_crypto_sha1hsi_uu (__hash_e); + } + + __extension__ extern __inline uint32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vsha1su0q_u32 (uint32x4_t w0_3, uint32x4_t w4_7, uint32x4_t w8_11) ++vsha1su0q_u32 (uint32x4_t __w0_3, uint32x4_t __w4_7, uint32x4_t __w8_11) + { +- return __builtin_aarch64_crypto_sha1su0v4si_uuuu (w0_3, w4_7, w8_11); ++ return __builtin_aarch64_crypto_sha1su0v4si_uuuu (__w0_3, __w4_7, __w8_11); + } + + __extension__ extern __inline uint32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vsha1su1q_u32 (uint32x4_t tw0_3, uint32x4_t w12_15) ++vsha1su1q_u32 (uint32x4_t __tw0_3, uint32x4_t __w12_15) + { +- return __builtin_aarch64_crypto_sha1su1v4si_uuu (tw0_3, w12_15); ++ return __builtin_aarch64_crypto_sha1su1v4si_uuu (__tw0_3, __w12_15); + } + + __extension__ extern __inline uint32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vsha256hq_u32 (uint32x4_t hash_abcd, uint32x4_t hash_efgh, uint32x4_t wk) ++vsha256hq_u32 (uint32x4_t __hash_abcd, uint32x4_t __hash_efgh, uint32x4_t __wk) + { +- return __builtin_aarch64_crypto_sha256hv4si_uuuu (hash_abcd, hash_efgh, wk); ++ return __builtin_aarch64_crypto_sha256hv4si_uuuu (__hash_abcd, __hash_efgh, ++ __wk); + } + + __extension__ extern __inline uint32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vsha256h2q_u32 (uint32x4_t hash_efgh, uint32x4_t hash_abcd, uint32x4_t wk) ++vsha256h2q_u32 (uint32x4_t __hash_efgh, uint32x4_t __hash_abcd, uint32x4_t __wk) + { +- return __builtin_aarch64_crypto_sha256h2v4si_uuuu (hash_efgh, hash_abcd, wk); ++ return __builtin_aarch64_crypto_sha256h2v4si_uuuu (__hash_efgh, __hash_abcd, ++ __wk); + } + + __extension__ extern __inline uint32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vsha256su0q_u32 (uint32x4_t w0_3, uint32x4_t w4_7) ++vsha256su0q_u32 (uint32x4_t __w0_3, uint32x4_t __w4_7) + { +- return __builtin_aarch64_crypto_sha256su0v4si_uuu (w0_3, w4_7); ++ return __builtin_aarch64_crypto_sha256su0v4si_uuu (__w0_3, __w4_7); + } + + __extension__ extern __inline uint32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vsha256su1q_u32 (uint32x4_t tw0_3, uint32x4_t w8_11, uint32x4_t w12_15) ++vsha256su1q_u32 (uint32x4_t __tw0_3, uint32x4_t __w8_11, uint32x4_t __w12_15) + { +- return __builtin_aarch64_crypto_sha256su1v4si_uuuu (tw0_3, w8_11, w12_15); ++ return __builtin_aarch64_crypto_sha256su1v4si_uuuu (__tw0_3, __w8_11, ++ __w12_15); + } + + __extension__ extern __inline poly128_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmull_p64 (poly64_t a, poly64_t b) ++vmull_p64 (poly64_t __a, poly64_t __b) + { + return +- __builtin_aarch64_crypto_pmulldi_ppp (a, b); ++ __builtin_aarch64_crypto_pmulldi_ppp (__a, __b); + } + + __extension__ extern __inline poly128_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vmull_high_p64 (poly64x2_t a, poly64x2_t b) ++vmull_high_p64 (poly64x2_t __a, poly64x2_t __b) + { +- return __builtin_aarch64_crypto_pmullv2di_ppp (a, b); ++ return __builtin_aarch64_crypto_pmullv2di_ppp (__a, __b); + } + + #pragma GCC pop_options +@@ -27202,30 +27520,30 @@ vsqaddd_u64 (uint64_t __a, int64_t __b) + /* vsqrt */ + __extension__ extern __inline float32x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vsqrt_f32 (float32x2_t a) ++vsqrt_f32 (float32x2_t __a) + { +- return __builtin_aarch64_sqrtv2sf (a); ++ return 
__builtin_aarch64_sqrtv2sf (__a); + } + + __extension__ extern __inline float32x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vsqrtq_f32 (float32x4_t a) ++vsqrtq_f32 (float32x4_t __a) + { +- return __builtin_aarch64_sqrtv4sf (a); ++ return __builtin_aarch64_sqrtv4sf (__a); + } + + __extension__ extern __inline float64x1_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vsqrt_f64 (float64x1_t a) ++vsqrt_f64 (float64x1_t __a) + { +- return (float64x1_t) { __builtin_aarch64_sqrtdf (a[0]) }; ++ return (float64x1_t) { __builtin_aarch64_sqrtdf (__a[0]) }; + } + + __extension__ extern __inline float64x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vsqrtq_f64 (float64x2_t a) ++vsqrtq_f64 (float64x2_t __a) + { +- return __builtin_aarch64_sqrtv2df (a); ++ return __builtin_aarch64_sqrtv2df (__a); + } + + /* vsra */ +@@ -27495,98 +27813,98 @@ vst1_f16 (float16_t *__a, float16x4_t __b) + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1_f32 (float32_t *a, float32x2_t b) ++vst1_f32 (float32_t *__a, float32x2_t __b) + { +- __builtin_aarch64_st1v2sf ((__builtin_aarch64_simd_sf *) a, b); ++ __builtin_aarch64_st1v2sf ((__builtin_aarch64_simd_sf *) __a, __b); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1_f64 (float64_t *a, float64x1_t b) ++vst1_f64 (float64_t *__a, float64x1_t __b) + { +- *a = b[0]; ++ *__a = __b[0]; + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1_p8 (poly8_t *a, poly8x8_t b) ++vst1_p8 (poly8_t *__a, poly8x8_t __b) + { +- __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) a, +- (int8x8_t) b); ++ __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) __a, ++ (int8x8_t) __b); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1_p16 (poly16_t *a, poly16x4_t b) ++vst1_p16 (poly16_t *__a, poly16x4_t __b) + { +- __builtin_aarch64_st1v4hi ((__builtin_aarch64_simd_hi *) a, +- (int16x4_t) b); ++ __builtin_aarch64_st1v4hi ((__builtin_aarch64_simd_hi *) __a, ++ (int16x4_t) __b); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1_p64 (poly64_t *a, poly64x1_t b) ++vst1_p64 (poly64_t *__a, poly64x1_t __b) + { +- *a = b[0]; ++ *__a = __b[0]; + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1_s8 (int8_t *a, int8x8_t b) ++vst1_s8 (int8_t *__a, int8x8_t __b) + { +- __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) a, b); ++ __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) __a, __b); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1_s16 (int16_t *a, int16x4_t b) ++vst1_s16 (int16_t *__a, int16x4_t __b) + { +- __builtin_aarch64_st1v4hi ((__builtin_aarch64_simd_hi *) a, b); ++ __builtin_aarch64_st1v4hi ((__builtin_aarch64_simd_hi *) __a, __b); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1_s32 (int32_t *a, int32x2_t b) ++vst1_s32 (int32_t *__a, int32x2_t __b) + { +- __builtin_aarch64_st1v2si ((__builtin_aarch64_simd_si *) a, b); ++ __builtin_aarch64_st1v2si ((__builtin_aarch64_simd_si *) __a, __b); + } + + __extension__ extern __inline void + __attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) +-vst1_s64 (int64_t *a, int64x1_t b) ++vst1_s64 (int64_t *__a, int64x1_t __b) + { +- *a = b[0]; ++ *__a = __b[0]; + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1_u8 (uint8_t *a, uint8x8_t b) ++vst1_u8 (uint8_t *__a, uint8x8_t __b) + { +- __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) a, +- (int8x8_t) b); ++ __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) __a, ++ (int8x8_t) __b); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1_u16 (uint16_t *a, uint16x4_t b) ++vst1_u16 (uint16_t *__a, uint16x4_t __b) + { +- __builtin_aarch64_st1v4hi ((__builtin_aarch64_simd_hi *) a, +- (int16x4_t) b); ++ __builtin_aarch64_st1v4hi ((__builtin_aarch64_simd_hi *) __a, ++ (int16x4_t) __b); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1_u32 (uint32_t *a, uint32x2_t b) ++vst1_u32 (uint32_t *__a, uint32x2_t __b) + { +- __builtin_aarch64_st1v2si ((__builtin_aarch64_simd_si *) a, +- (int32x2_t) b); ++ __builtin_aarch64_st1v2si ((__builtin_aarch64_simd_si *) __a, ++ (int32x2_t) __b); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1_u64 (uint64_t *a, uint64x1_t b) ++vst1_u64 (uint64_t *__a, uint64x1_t __b) + { +- *a = b[0]; ++ *__a = __b[0]; + } + + /* vst1q */ +@@ -27600,100 +27918,100 @@ vst1q_f16 (float16_t *__a, float16x8_t __b) + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1q_f32 (float32_t *a, float32x4_t b) ++vst1q_f32 (float32_t *__a, float32x4_t __b) + { +- __builtin_aarch64_st1v4sf ((__builtin_aarch64_simd_sf *) a, b); ++ __builtin_aarch64_st1v4sf ((__builtin_aarch64_simd_sf *) __a, __b); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1q_f64 (float64_t *a, float64x2_t b) ++vst1q_f64 (float64_t *__a, float64x2_t __b) + { +- __builtin_aarch64_st1v2df ((__builtin_aarch64_simd_df *) a, b); ++ __builtin_aarch64_st1v2df ((__builtin_aarch64_simd_df *) __a, __b); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1q_p8 (poly8_t *a, poly8x16_t b) ++vst1q_p8 (poly8_t *__a, poly8x16_t __b) + { +- __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) a, +- (int8x16_t) b); ++ __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) __a, ++ (int8x16_t) __b); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1q_p16 (poly16_t *a, poly16x8_t b) ++vst1q_p16 (poly16_t *__a, poly16x8_t __b) + { +- __builtin_aarch64_st1v8hi ((__builtin_aarch64_simd_hi *) a, +- (int16x8_t) b); ++ __builtin_aarch64_st1v8hi ((__builtin_aarch64_simd_hi *) __a, ++ (int16x8_t) __b); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1q_p64 (poly64_t *a, poly64x2_t b) ++vst1q_p64 (poly64_t *__a, poly64x2_t __b) + { +- __builtin_aarch64_st1v2di_sp ((__builtin_aarch64_simd_di *) a, +- (poly64x2_t) b); ++ __builtin_aarch64_st1v2di_sp ((__builtin_aarch64_simd_di *) __a, ++ (poly64x2_t) __b); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1q_s8 (int8_t *a, int8x16_t b) ++vst1q_s8 (int8_t *__a, 
int8x16_t __b) + { +- __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) a, b); ++ __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) __a, __b); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1q_s16 (int16_t *a, int16x8_t b) ++vst1q_s16 (int16_t *__a, int16x8_t __b) + { +- __builtin_aarch64_st1v8hi ((__builtin_aarch64_simd_hi *) a, b); ++ __builtin_aarch64_st1v8hi ((__builtin_aarch64_simd_hi *) __a, __b); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1q_s32 (int32_t *a, int32x4_t b) ++vst1q_s32 (int32_t *__a, int32x4_t __b) + { +- __builtin_aarch64_st1v4si ((__builtin_aarch64_simd_si *) a, b); ++ __builtin_aarch64_st1v4si ((__builtin_aarch64_simd_si *) __a, __b); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1q_s64 (int64_t *a, int64x2_t b) ++vst1q_s64 (int64_t *__a, int64x2_t __b) + { +- __builtin_aarch64_st1v2di ((__builtin_aarch64_simd_di *) a, b); ++ __builtin_aarch64_st1v2di ((__builtin_aarch64_simd_di *) __a, __b); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1q_u8 (uint8_t *a, uint8x16_t b) ++vst1q_u8 (uint8_t *__a, uint8x16_t __b) + { +- __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) a, +- (int8x16_t) b); ++ __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) __a, ++ (int8x16_t) __b); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1q_u16 (uint16_t *a, uint16x8_t b) ++vst1q_u16 (uint16_t *__a, uint16x8_t __b) + { +- __builtin_aarch64_st1v8hi ((__builtin_aarch64_simd_hi *) a, +- (int16x8_t) b); ++ __builtin_aarch64_st1v8hi ((__builtin_aarch64_simd_hi *) __a, ++ (int16x8_t) __b); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1q_u32 (uint32_t *a, uint32x4_t b) ++vst1q_u32 (uint32_t *__a, uint32x4_t __b) + { +- __builtin_aarch64_st1v4si ((__builtin_aarch64_simd_si *) a, +- (int32x4_t) b); ++ __builtin_aarch64_st1v4si ((__builtin_aarch64_simd_si *) __a, ++ (int32x4_t) __b); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1q_u64 (uint64_t *a, uint64x2_t b) ++vst1q_u64 (uint64_t *__a, uint64x2_t __b) + { +- __builtin_aarch64_st1v2di ((__builtin_aarch64_simd_di *) a, +- (int64x2_t) b); ++ __builtin_aarch64_st1v2di ((__builtin_aarch64_simd_di *) __a, ++ (int64x2_t) __b); + } + + /* vst1_lane */ +@@ -27900,327 +28218,343 @@ vst1q_lane_u64 (uint64_t *__a, uint64x2_t __b, const int __lane) + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1_s64_x2 (int64_t * __a, int64x1x2_t val) ++vst1_s64_x2 (int64_t * __a, int64x1x2_t __val) + { + __builtin_aarch64_simd_oi __o; +- int64x2x2_t temp; +- temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (__AARCH64_INT64_C (0))); +- temp.val[1] = vcombine_s64 (val.val[1], vcreate_s64 (__AARCH64_INT64_C (0))); +- __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[1], 1); ++ int64x2x2_t __temp; ++ __temp.val[0] ++ = vcombine_s64 (__val.val[0], vcreate_s64 (__AARCH64_INT64_C (0))); ++ __temp.val[1] ++ = vcombine_s64 (__val.val[1], vcreate_s64 (__AARCH64_INT64_C (0))); ++ __o = 
__builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __temp.val[1], 1); + __builtin_aarch64_st1x2di ((__builtin_aarch64_simd_di *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1_u64_x2 (uint64_t * __a, uint64x1x2_t val) ++vst1_u64_x2 (uint64_t * __a, uint64x1x2_t __val) + { + __builtin_aarch64_simd_oi __o; +- uint64x2x2_t temp; +- temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_u64 (val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0))); +- __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[1], 1); ++ uint64x2x2_t __temp; ++ __temp.val[0] ++ = vcombine_u64 (__val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] ++ = vcombine_u64 (__val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __temp.val[1], 1); + __builtin_aarch64_st1x2di ((__builtin_aarch64_simd_di *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1_f64_x2 (float64_t * __a, float64x1x2_t val) ++vst1_f64_x2 (float64_t * __a, float64x1x2_t __val) + { + __builtin_aarch64_simd_oi __o; +- float64x2x2_t temp; +- temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_f64 (val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0))); +- __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) temp.val[1], 1); ++ float64x2x2_t __temp; ++ __temp.val[0] ++ = vcombine_f64 (__val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] ++ = vcombine_f64 (__val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) __temp.val[1], 1); + __builtin_aarch64_st1x2df ((__builtin_aarch64_simd_df *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1_s8_x2 (int8_t * __a, int8x8x2_t val) ++vst1_s8_x2 (int8_t * __a, int8x8x2_t __val) + { + __builtin_aarch64_simd_oi __o; +- int8x16x2_t temp; +- temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (__AARCH64_INT64_C (0))); +- temp.val[1] = vcombine_s8 (val.val[1], vcreate_s8 (__AARCH64_INT64_C (0))); +- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1); ++ int8x16x2_t __temp; ++ __temp.val[0] ++ = vcombine_s8 (__val.val[0], vcreate_s8 (__AARCH64_INT64_C (0))); ++ __temp.val[1] ++ = vcombine_s8 (__val.val[1], vcreate_s8 (__AARCH64_INT64_C (0))); ++ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[1], 1); + __builtin_aarch64_st1x2v8qi ((__builtin_aarch64_simd_qi *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1_p8_x2 (poly8_t * __a, poly8x8x2_t val) ++vst1_p8_x2 (poly8_t * __a, poly8x8x2_t __val) + { + __builtin_aarch64_simd_oi __o; +- poly8x16x2_t temp; +- 
temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_p8 (val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0))); +- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1); ++ poly8x16x2_t __temp; ++ __temp.val[0] ++ = vcombine_p8 (__val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] ++ = vcombine_p8 (__val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[1], 1); + __builtin_aarch64_st1x2v8qi ((__builtin_aarch64_simd_qi *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1_s16_x2 (int16_t * __a, int16x4x2_t val) ++vst1_s16_x2 (int16_t * __a, int16x4x2_t __val) + { + __builtin_aarch64_simd_oi __o; +- int16x8x2_t temp; +- temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (__AARCH64_INT64_C (0))); +- temp.val[1] = vcombine_s16 (val.val[1], vcreate_s16 (__AARCH64_INT64_C (0))); +- __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1); ++ int16x8x2_t __temp; ++ __temp.val[0] ++ = vcombine_s16 (__val.val[0], vcreate_s16 (__AARCH64_INT64_C (0))); ++ __temp.val[1] ++ = vcombine_s16 (__val.val[1], vcreate_s16 (__AARCH64_INT64_C (0))); ++ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[1], 1); + __builtin_aarch64_st1x2v4hi ((__builtin_aarch64_simd_hi *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1_p16_x2 (poly16_t * __a, poly16x4x2_t val) ++vst1_p16_x2 (poly16_t * __a, poly16x4x2_t __val) + { + __builtin_aarch64_simd_oi __o; +- poly16x8x2_t temp; +- temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_p16 (val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0))); +- __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1); ++ poly16x8x2_t __temp; ++ __temp.val[0] ++ = vcombine_p16 (__val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] ++ = vcombine_p16 (__val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[1], 1); + __builtin_aarch64_st1x2v4hi ((__builtin_aarch64_simd_hi *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1_s32_x2 (int32_t * __a, int32x2x2_t val) ++vst1_s32_x2 (int32_t * __a, int32x2x2_t __val) + { + __builtin_aarch64_simd_oi __o; +- int32x4x2_t temp; +- temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (__AARCH64_INT64_C (0))); +- temp.val[1] = vcombine_s32 (val.val[1], vcreate_s32 (__AARCH64_INT64_C (0))); +- __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[1], 1); ++ int32x4x2_t __temp; ++ __temp.val[0] ++ = vcombine_s32 (__val.val[0], vcreate_s32 (__AARCH64_INT64_C (0))); ++ __temp.val[1] ++ = vcombine_s32 (__val.val[1], vcreate_s32 (__AARCH64_INT64_C 
(0))); ++ __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __temp.val[1], 1); + __builtin_aarch64_st1x2v2si ((__builtin_aarch64_simd_si *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1_u8_x2 (uint8_t * __a, uint8x8x2_t val) ++vst1_u8_x2 (uint8_t * __a, uint8x8x2_t __val) + { + __builtin_aarch64_simd_oi __o; +- uint8x16x2_t temp; +- temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_u8 (val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0))); +- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1); ++ uint8x16x2_t __temp; ++ __temp.val[0] = vcombine_u8 (__val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_u8 (__val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[1], 1); + __builtin_aarch64_st1x2v8qi ((__builtin_aarch64_simd_qi *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1_u16_x2 (uint16_t * __a, uint16x4x2_t val) ++vst1_u16_x2 (uint16_t * __a, uint16x4x2_t __val) + { + __builtin_aarch64_simd_oi __o; +- uint16x8x2_t temp; +- temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_u16 (val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0))); +- __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1); ++ uint16x8x2_t __temp; ++ __temp.val[0] = vcombine_u16 (__val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_u16 (__val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[1], 1); + __builtin_aarch64_st1x2v4hi ((__builtin_aarch64_simd_hi *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1_u32_x2 (uint32_t * __a, uint32x2x2_t val) ++vst1_u32_x2 (uint32_t * __a, uint32x2x2_t __val) + { + __builtin_aarch64_simd_oi __o; +- uint32x4x2_t temp; +- temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_u32 (val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0))); +- __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[1], 1); ++ uint32x4x2_t __temp; ++ __temp.val[0] = vcombine_u32 (__val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_u32 (__val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __temp.val[1], 1); + __builtin_aarch64_st1x2v2si ((__builtin_aarch64_simd_si *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1_f16_x2 (float16_t * __a, float16x4x2_t val) ++vst1_f16_x2 (float16_t * __a, float16x4x2_t __val) + { + __builtin_aarch64_simd_oi __o; +- float16x8x2_t temp; +- 
temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0))); +- __o = __builtin_aarch64_set_qregoiv8hf (__o, temp.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv8hf (__o, temp.val[1], 1); ++ float16x8x2_t __temp; ++ __temp.val[0] = vcombine_f16 (__val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_f16 (__val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregoiv8hf (__o, __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv8hf (__o, __temp.val[1], 1); + __builtin_aarch64_st1x2v4hf (__a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1_f32_x2 (float32_t * __a, float32x2x2_t val) ++vst1_f32_x2 (float32_t * __a, float32x2x2_t __val) + { + __builtin_aarch64_simd_oi __o; +- float32x4x2_t temp; +- temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_f32 (val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0))); +- __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) temp.val[1], 1); ++ float32x4x2_t __temp; ++ __temp.val[0] = vcombine_f32 (__val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_f32 (__val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) __temp.val[1], 1); + __builtin_aarch64_st1x2v2sf ((__builtin_aarch64_simd_sf *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1_p64_x2 (poly64_t * __a, poly64x1x2_t val) ++vst1_p64_x2 (poly64_t * __a, poly64x1x2_t __val) + { + __builtin_aarch64_simd_oi __o; +- poly64x2x2_t temp; +- temp.val[0] = vcombine_p64 (val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_p64 (val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0))); ++ poly64x2x2_t __temp; ++ __temp.val[0] = vcombine_p64 (__val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_p64 (__val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv2di_ssps (__o, +- (poly64x2_t) temp.val[0], 0); ++ (poly64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv2di_ssps (__o, +- (poly64x2_t) temp.val[1], 1); ++ (poly64x2_t) __temp.val[1], 1); + __builtin_aarch64_st1x2di ((__builtin_aarch64_simd_di *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1q_s8_x2 (int8_t * __a, int8x16x2_t val) ++vst1q_s8_x2 (int8_t * __a, int8x16x2_t __val) + { + __builtin_aarch64_simd_oi __o; +- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1); ++ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __val.val[1], 1); + __builtin_aarch64_st1x2v16qi ((__builtin_aarch64_simd_qi *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1q_p8_x2 (poly8_t * __a, poly8x16x2_t val) ++vst1q_p8_x2 (poly8_t * __a, poly8x16x2_t __val) + { + __builtin_aarch64_simd_oi __o; +- __o = 
__builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1); ++ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __val.val[1], 1); + __builtin_aarch64_st1x2v16qi ((__builtin_aarch64_simd_qi *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1q_s16_x2 (int16_t * __a, int16x8x2_t val) ++vst1q_s16_x2 (int16_t * __a, int16x8x2_t __val) + { + __builtin_aarch64_simd_oi __o; +- __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1); ++ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __val.val[1], 1); + __builtin_aarch64_st1x2v8hi ((__builtin_aarch64_simd_hi *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1q_p16_x2 (poly16_t * __a, poly16x8x2_t val) ++vst1q_p16_x2 (poly16_t * __a, poly16x8x2_t __val) + { + __builtin_aarch64_simd_oi __o; +- __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1); ++ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __val.val[1], 1); + __builtin_aarch64_st1x2v8hi ((__builtin_aarch64_simd_hi *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1q_s32_x2 (int32_t * __a, int32x4x2_t val) ++vst1q_s32_x2 (int32_t * __a, int32x4x2_t __val) + { + __builtin_aarch64_simd_oi __o; +- __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[1], 1); ++ __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __val.val[1], 1); + __builtin_aarch64_st1x2v4si ((__builtin_aarch64_simd_si *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1q_s64_x2 (int64_t * __a, int64x2x2_t val) ++vst1q_s64_x2 (int64_t * __a, int64x2x2_t __val) + { + __builtin_aarch64_simd_oi __o; +- __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[1], 1); ++ __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __val.val[1], 1); + __builtin_aarch64_st1x2v2di ((__builtin_aarch64_simd_di *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1q_u8_x2 (uint8_t * __a, uint8x16x2_t val) ++vst1q_u8_x2 (uint8_t * __a, uint8x16x2_t __val) + { + __builtin_aarch64_simd_oi __o; +- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1); ++ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __val.val[1], 1); + __builtin_aarch64_st1x2v16qi ((__builtin_aarch64_simd_qi *) __a, __o); + } + + __extension__ extern 
__inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1q_u16_x2 (uint16_t * __a, uint16x8x2_t val) ++vst1q_u16_x2 (uint16_t * __a, uint16x8x2_t __val) + { + __builtin_aarch64_simd_oi __o; +- __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1); ++ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __val.val[1], 1); + __builtin_aarch64_st1x2v8hi ((__builtin_aarch64_simd_hi *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1q_u32_x2 (uint32_t * __a, uint32x4x2_t val) ++vst1q_u32_x2 (uint32_t * __a, uint32x4x2_t __val) + { + __builtin_aarch64_simd_oi __o; +- __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[1], 1); ++ __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __val.val[1], 1); + __builtin_aarch64_st1x2v4si ((__builtin_aarch64_simd_si *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1q_u64_x2 (uint64_t * __a, uint64x2x2_t val) ++vst1q_u64_x2 (uint64_t * __a, uint64x2x2_t __val) + { + __builtin_aarch64_simd_oi __o; +- __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[1], 1); ++ __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __val.val[1], 1); + __builtin_aarch64_st1x2v2di ((__builtin_aarch64_simd_di *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1q_f16_x2 (float16_t * __a, float16x8x2_t val) ++vst1q_f16_x2 (float16_t * __a, float16x8x2_t __val) + { + __builtin_aarch64_simd_oi __o; +- __o = __builtin_aarch64_set_qregoiv8hf (__o, val.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv8hf (__o, val.val[1], 1); ++ __o = __builtin_aarch64_set_qregoiv8hf (__o, __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv8hf (__o, __val.val[1], 1); + __builtin_aarch64_st1x2v8hf (__a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1q_f32_x2 (float32_t * __a, float32x4x2_t val) ++vst1q_f32_x2 (float32_t * __a, float32x4x2_t __val) + { + __builtin_aarch64_simd_oi __o; +- __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) val.val[1], 1); ++ __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) __val.val[1], 1); + __builtin_aarch64_st1x2v4sf ((__builtin_aarch64_simd_sf *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1q_f64_x2 (float64_t * __a, float64x2x2_t val) ++vst1q_f64_x2 (float64_t * __a, float64x2x2_t __val) + { + __builtin_aarch64_simd_oi __o; +- __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) val.val[1], 1); ++ __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) 
__val.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) __val.val[1], 1); + __builtin_aarch64_st1x2v2df ((__builtin_aarch64_simd_df *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1q_p64_x2 (poly64_t * __a, poly64x2x2_t val) ++vst1q_p64_x2 (poly64_t * __a, poly64x2x2_t __val) + { + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv2di_ssps (__o, +- (poly64x2_t) val.val[0], 0); ++ (poly64x2_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv2di_ssps (__o, +- (poly64x2_t) val.val[1], 1); ++ (poly64x2_t) __val.val[1], 1); + __builtin_aarch64_st1x2v2di ((__builtin_aarch64_simd_di *) __a, __o); + } + +@@ -28228,1483 +28562,1709 @@ vst1q_p64_x2 (poly64_t * __a, poly64x2x2_t val) + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1_s64_x3 (int64_t * __a, int64x1x3_t val) ++vst1_s64_x3 (int64_t * __a, int64x1x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- int64x2x3_t temp; +- temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (__AARCH64_INT64_C (0))); +- temp.val[1] = vcombine_s64 (val.val[1], vcreate_s64 (__AARCH64_INT64_C (0))); +- temp.val[2] = vcombine_s64 (val.val[2], vcreate_s64 (__AARCH64_INT64_C (0))); +- __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[1], 1); +- __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[2], 2); ++ int64x2x3_t __temp; ++ __temp.val[0] = vcombine_s64 (__val.val[0], vcreate_s64 (__AARCH64_INT64_C (0))); ++ __temp.val[1] = vcombine_s64 (__val.val[1], vcreate_s64 (__AARCH64_INT64_C (0))); ++ __temp.val[2] = vcombine_s64 (__val.val[2], vcreate_s64 (__AARCH64_INT64_C (0))); ++ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[2], 2); + __builtin_aarch64_st1x3di ((__builtin_aarch64_simd_di *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1_u64_x3 (uint64_t * __a, uint64x1x3_t val) ++vst1_u64_x3 (uint64_t * __a, uint64x1x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- uint64x2x3_t temp; +- temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_u64 (val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0))); +- temp.val[2] = vcombine_u64 (val.val[2], vcreate_u64 (__AARCH64_UINT64_C (0))); +- __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[1], 1); +- __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[2], 2); ++ uint64x2x3_t __temp; ++ __temp.val[0] = vcombine_u64 (__val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_u64 (__val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0))); ++ __temp.val[2] = vcombine_u64 (__val.val[2], vcreate_u64 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[2], 2); + __builtin_aarch64_st1x3di ((__builtin_aarch64_simd_di *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, 
__gnu_inline__, __artificial__)) +-vst1_f64_x3 (float64_t * __a, float64x1x3_t val) ++vst1_f64_x3 (float64_t * __a, float64x1x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- float64x2x3_t temp; +- temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_f64 (val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0))); +- temp.val[2] = vcombine_f64 (val.val[2], vcreate_f64 (__AARCH64_UINT64_C (0))); +- __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[1], 1); +- __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[2], 2); ++ float64x2x3_t __temp; ++ __temp.val[0] = vcombine_f64 (__val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_f64 (__val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0))); ++ __temp.val[2] = vcombine_f64 (__val.val[2], vcreate_f64 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __temp.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __temp.val[2], 2); + __builtin_aarch64_st1x3df ((__builtin_aarch64_simd_df *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1_s8_x3 (int8_t * __a, int8x8x3_t val) ++vst1_s8_x3 (int8_t * __a, int8x8x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- int8x16x3_t temp; +- temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (__AARCH64_INT64_C (0))); +- temp.val[1] = vcombine_s8 (val.val[1], vcreate_s8 (__AARCH64_INT64_C (0))); +- temp.val[2] = vcombine_s8 (val.val[2], vcreate_s8 (__AARCH64_INT64_C (0))); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2); ++ int8x16x3_t __temp; ++ __temp.val[0] = vcombine_s8 (__val.val[0], vcreate_s8 (__AARCH64_INT64_C (0))); ++ __temp.val[1] = vcombine_s8 (__val.val[1], vcreate_s8 (__AARCH64_INT64_C (0))); ++ __temp.val[2] = vcombine_s8 (__val.val[2], vcreate_s8 (__AARCH64_INT64_C (0))); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[2], 2); + __builtin_aarch64_st1x3v8qi ((__builtin_aarch64_simd_qi *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1_p8_x3 (poly8_t * __a, poly8x8x3_t val) ++vst1_p8_x3 (poly8_t * __a, poly8x8x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- poly8x16x3_t temp; +- temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_p8 (val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0))); +- temp.val[2] = vcombine_p8 (val.val[2], vcreate_p8 (__AARCH64_UINT64_C (0))); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2); ++ poly8x16x3_t __temp; ++ __temp.val[0] = vcombine_p8 (__val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_p8 (__val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0))); ++ __temp.val[2] = vcombine_p8 
(__val.val[2], vcreate_p8 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[2], 2); + __builtin_aarch64_st1x3v8qi ((__builtin_aarch64_simd_qi *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1_s16_x3 (int16_t * __a, int16x4x3_t val) ++vst1_s16_x3 (int16_t * __a, int16x4x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- int16x8x3_t temp; +- temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (__AARCH64_INT64_C (0))); +- temp.val[1] = vcombine_s16 (val.val[1], vcreate_s16 (__AARCH64_INT64_C (0))); +- temp.val[2] = vcombine_s16 (val.val[2], vcreate_s16 (__AARCH64_INT64_C (0))); +- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1); +- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2); ++ int16x8x3_t __temp; ++ __temp.val[0] = vcombine_s16 (__val.val[0], vcreate_s16 (__AARCH64_INT64_C (0))); ++ __temp.val[1] = vcombine_s16 (__val.val[1], vcreate_s16 (__AARCH64_INT64_C (0))); ++ __temp.val[2] = vcombine_s16 (__val.val[2], vcreate_s16 (__AARCH64_INT64_C (0))); ++ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[2], 2); + __builtin_aarch64_st1x3v4hi ((__builtin_aarch64_simd_hi *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1_p16_x3 (poly16_t * __a, poly16x4x3_t val) ++vst1_p16_x3 (poly16_t * __a, poly16x4x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- poly16x8x3_t temp; +- temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_p16 (val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0))); +- temp.val[2] = vcombine_p16 (val.val[2], vcreate_p16 (__AARCH64_UINT64_C (0))); +- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1); +- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2); ++ poly16x8x3_t __temp; ++ __temp.val[0] = vcombine_p16 (__val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_p16 (__val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0))); ++ __temp.val[2] = vcombine_p16 (__val.val[2], vcreate_p16 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[2], 2); + __builtin_aarch64_st1x3v4hi ((__builtin_aarch64_simd_hi *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1_s32_x3 (int32_t * __a, int32x2x3_t val) ++vst1_s32_x3 (int32_t * __a, int32x2x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- int32x4x3_t temp; +- temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (__AARCH64_INT64_C (0))); +- temp.val[1] = vcombine_s32 (val.val[1], vcreate_s32 (__AARCH64_INT64_C (0))); +- temp.val[2] = vcombine_s32 (val.val[2], vcreate_s32 (__AARCH64_INT64_C (0))); +- __o = 
__builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[1], 1); +- __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[2], 2); ++ int32x4x3_t __temp; ++ __temp.val[0] = vcombine_s32 (__val.val[0], vcreate_s32 (__AARCH64_INT64_C (0))); ++ __temp.val[1] = vcombine_s32 (__val.val[1], vcreate_s32 (__AARCH64_INT64_C (0))); ++ __temp.val[2] = vcombine_s32 (__val.val[2], vcreate_s32 (__AARCH64_INT64_C (0))); ++ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[2], 2); + __builtin_aarch64_st1x3v2si ((__builtin_aarch64_simd_si *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1_u8_x3 (uint8_t * __a, uint8x8x3_t val) ++vst1_u8_x3 (uint8_t * __a, uint8x8x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- uint8x16x3_t temp; +- temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_u8 (val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0))); +- temp.val[2] = vcombine_u8 (val.val[2], vcreate_u8 (__AARCH64_UINT64_C (0))); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2); ++ uint8x16x3_t __temp; ++ __temp.val[0] = vcombine_u8 (__val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_u8 (__val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0))); ++ __temp.val[2] = vcombine_u8 (__val.val[2], vcreate_u8 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[2], 2); + __builtin_aarch64_st1x3v8qi ((__builtin_aarch64_simd_qi *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1_u16_x3 (uint16_t * __a, uint16x4x3_t val) ++vst1_u16_x3 (uint16_t * __a, uint16x4x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- uint16x8x3_t temp; +- temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_u16 (val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0))); +- temp.val[2] = vcombine_u16 (val.val[2], vcreate_u16 (__AARCH64_UINT64_C (0))); +- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1); +- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2); ++ uint16x8x3_t __temp; ++ __temp.val[0] = vcombine_u16 (__val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_u16 (__val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0))); ++ __temp.val[2] = vcombine_u16 (__val.val[2], vcreate_u16 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[2], 2); + __builtin_aarch64_st1x3v4hi ((__builtin_aarch64_simd_hi *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) +-vst1_u32_x3 (uint32_t * __a, uint32x2x3_t val) ++vst1_u32_x3 (uint32_t * __a, uint32x2x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- uint32x4x3_t temp; +- temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_u32 (val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0))); +- temp.val[2] = vcombine_u32 (val.val[2], vcreate_u32 (__AARCH64_UINT64_C (0))); +- __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[1], 1); +- __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[2], 2); ++ uint32x4x3_t __temp; ++ __temp.val[0] = vcombine_u32 (__val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_u32 (__val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0))); ++ __temp.val[2] = vcombine_u32 (__val.val[2], vcreate_u32 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[2], 2); + __builtin_aarch64_st1x3v2si ((__builtin_aarch64_simd_si *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1_f16_x3 (float16_t * __a, float16x4x3_t val) ++vst1_f16_x3 (float16_t * __a, float16x4x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- float16x8x3_t temp; +- temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0))); +- temp.val[2] = vcombine_f16 (val.val[2], vcreate_f16 (__AARCH64_UINT64_C (0))); +- __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[1], 1); +- __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[2], 2); ++ float16x8x3_t __temp; ++ __temp.val[0] = vcombine_f16 (__val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_f16 (__val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0))); ++ __temp.val[2] = vcombine_f16 (__val.val[2], vcreate_f16 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __temp.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __temp.val[2], 2); + __builtin_aarch64_st1x3v4hf ((__builtin_aarch64_simd_hf *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1_f32_x3 (float32_t * __a, float32x2x3_t val) ++vst1_f32_x3 (float32_t * __a, float32x2x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- float32x4x3_t temp; +- temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_f32 (val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0))); +- temp.val[2] = vcombine_f32 (val.val[2], vcreate_f32 (__AARCH64_UINT64_C (0))); +- __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[1], 1); +- __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[2], 2); ++ float32x4x3_t __temp; ++ __temp.val[0] = vcombine_f32 (__val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_f32 
(__val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0))); ++ __temp.val[2] = vcombine_f32 (__val.val[2], vcreate_f32 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __temp.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __temp.val[2], 2); + __builtin_aarch64_st1x3v2sf ((__builtin_aarch64_simd_sf *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1_p64_x3 (poly64_t * __a, poly64x1x3_t val) ++vst1_p64_x3 (poly64_t * __a, poly64x1x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- poly64x2x3_t temp; +- temp.val[0] = vcombine_p64 (val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_p64 (val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0))); +- temp.val[2] = vcombine_p64 (val.val[2], vcreate_p64 (__AARCH64_UINT64_C (0))); ++ poly64x2x3_t __temp; ++ __temp.val[0] = vcombine_p64 (__val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_p64 (__val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0))); ++ __temp.val[2] = vcombine_p64 (__val.val[2], vcreate_p64 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv2di_ssps (__o, +- (poly64x2_t) temp.val[0], 0); ++ (poly64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv2di_ssps (__o, +- (poly64x2_t) temp.val[1], 1); ++ (poly64x2_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv2di_ssps (__o, +- (poly64x2_t) temp.val[2], 2); ++ (poly64x2_t) __temp.val[2], 2); + __builtin_aarch64_st1x3di ((__builtin_aarch64_simd_di *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1q_s8_x3 (int8_t * __a, int8x16x3_t val) ++vst1q_s8_x3 (int8_t * __a, int8x16x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[2], 2); + __builtin_aarch64_st1x3v16qi ((__builtin_aarch64_simd_qi *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1q_p8_x3 (poly8_t * __a, poly8x16x3_t val) ++vst1q_p8_x3 (poly8_t * __a, poly8x16x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[2], 2); + __builtin_aarch64_st1x3v16qi ((__builtin_aarch64_simd_qi *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1q_s16_x3 (int16_t * __a, int16x8x3_t val) ++vst1q_s16_x3 (int16_t * __a, int16x8x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- __o = __builtin_aarch64_set_qregciv8hi (__o, 
(int16x8_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1); +- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2); ++ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[2], 2); + __builtin_aarch64_st1x3v8hi ((__builtin_aarch64_simd_hi *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1q_p16_x3 (poly16_t * __a, poly16x8x3_t val) ++vst1q_p16_x3 (poly16_t * __a, poly16x8x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1); +- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2); ++ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[2], 2); + __builtin_aarch64_st1x3v8hi ((__builtin_aarch64_simd_hi *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1q_s32_x3 (int32_t * __a, int32x4x3_t val) ++vst1q_s32_x3 (int32_t * __a, int32x4x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[1], 1); +- __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[2], 2); ++ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __val.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __val.val[2], 2); + __builtin_aarch64_st1x3v4si ((__builtin_aarch64_simd_si *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1q_s64_x3 (int64_t * __a, int64x2x3_t val) ++vst1q_s64_x3 (int64_t * __a, int64x2x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[1], 1); +- __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[2], 2); ++ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __val.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __val.val[2], 2); + __builtin_aarch64_st1x3v2di ((__builtin_aarch64_simd_di *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1q_u8_x3 (uint8_t * __a, uint8x16x3_t val) ++vst1q_u8_x3 (uint8_t * __a, uint8x16x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[2], 2); + 
__builtin_aarch64_st1x3v16qi ((__builtin_aarch64_simd_qi *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1q_u16_x3 (uint16_t * __a, uint16x8x3_t val) ++vst1q_u16_x3 (uint16_t * __a, uint16x8x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1); +- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2); ++ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[2], 2); + __builtin_aarch64_st1x3v8hi ((__builtin_aarch64_simd_hi *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1q_u32_x3 (uint32_t * __a, uint32x4x3_t val) ++vst1q_u32_x3 (uint32_t * __a, uint32x4x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[1], 1); +- __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[2], 2); ++ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __val.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __val.val[2], 2); + __builtin_aarch64_st1x3v4si ((__builtin_aarch64_simd_si *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1q_u64_x3 (uint64_t * __a, uint64x2x3_t val) ++vst1q_u64_x3 (uint64_t * __a, uint64x2x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[1], 1); +- __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[2], 2); ++ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __val.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __val.val[2], 2); + __builtin_aarch64_st1x3v2di ((__builtin_aarch64_simd_di *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1q_f16_x3 (float16_t * __a, float16x8x3_t val) ++vst1q_f16_x3 (float16_t * __a, float16x8x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[1], 1); +- __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[2], 2); ++ __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __val.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __val.val[2], 2); + __builtin_aarch64_st1x3v8hf ((__builtin_aarch64_simd_hf *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1q_f32_x3 (float32_t * __a, float32x4x3_t val) ++vst1q_f32_x3 (float32_t * __a, float32x4x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) 
val.val[0], 0); +- __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[1], 1); +- __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[2], 2); ++ __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __val.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __val.val[2], 2); + __builtin_aarch64_st1x3v4sf ((__builtin_aarch64_simd_sf *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1q_f64_x3 (float64_t * __a, float64x2x3_t val) ++vst1q_f64_x3 (float64_t * __a, float64x2x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[1], 1); +- __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[2], 2); ++ __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __val.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __val.val[2], 2); + __builtin_aarch64_st1x3v2df ((__builtin_aarch64_simd_df *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst1q_p64_x3 (poly64_t * __a, poly64x2x3_t val) ++vst1q_p64_x3 (poly64_t * __a, poly64x2x3_t __val) + { + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv2di_ssps (__o, +- (poly64x2_t) val.val[0], 0); ++ (poly64x2_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv2di_ssps (__o, +- (poly64x2_t) val.val[1], 1); ++ (poly64x2_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv2di_ssps (__o, +- (poly64x2_t) val.val[2], 2); ++ (poly64x2_t) __val.val[2], 2); + __builtin_aarch64_st1x3v2di ((__builtin_aarch64_simd_di *) __a, __o); + } + +-/* vstn */ ++/* vst1(q)_x4. 
*/ + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst2_s64 (int64_t * __a, int64x1x2_t val) ++vst1_s8_x4 (int8_t * __a, int8x8x4_t val) + { +- __builtin_aarch64_simd_oi __o; +- int64x2x2_t temp; +- temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (__AARCH64_INT64_C (0))); +- temp.val[1] = vcombine_s64 (val.val[1], vcreate_s64 (__AARCH64_INT64_C (0))); +- __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[1], 1); +- __builtin_aarch64_st2di ((__builtin_aarch64_simd_di *) __a, __o); ++ union { int8x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; ++ __builtin_aarch64_st1x4v8qi ((__builtin_aarch64_simd_qi *) __a, __u.__o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst2_u64 (uint64_t * __a, uint64x1x2_t val) ++vst1q_s8_x4 (int8_t * __a, int8x16x4_t val) + { +- __builtin_aarch64_simd_oi __o; +- uint64x2x2_t temp; +- temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_u64 (val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0))); +- __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[1], 1); +- __builtin_aarch64_st2di ((__builtin_aarch64_simd_di *) __a, __o); ++ union { int8x16x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; ++ __builtin_aarch64_st1x4v16qi ((__builtin_aarch64_simd_qi *) __a, __u.__o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst2_f64 (float64_t * __a, float64x1x2_t val) ++vst1_s16_x4 (int16_t * __a, int16x4x4_t val) + { +- __builtin_aarch64_simd_oi __o; +- float64x2x2_t temp; +- temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_f64 (val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0))); +- __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) temp.val[1], 1); +- __builtin_aarch64_st2df ((__builtin_aarch64_simd_df *) __a, __o); ++ union { int16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; ++ __builtin_aarch64_st1x4v4hi ((__builtin_aarch64_simd_hi *) __a, __u.__o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst2_s8 (int8_t * __a, int8x8x2_t val) ++vst1q_s16_x4 (int16_t * __a, int16x8x4_t val) + { +- __builtin_aarch64_simd_oi __o; +- int8x16x2_t temp; +- temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (__AARCH64_INT64_C (0))); +- temp.val[1] = vcombine_s8 (val.val[1], vcreate_s8 (__AARCH64_INT64_C (0))); +- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1); +- __builtin_aarch64_st2v8qi ((__builtin_aarch64_simd_qi *) __a, __o); ++ union { int16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; ++ __builtin_aarch64_st1x4v8hi ((__builtin_aarch64_simd_hi *) __a, __u.__o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst2_p8 (poly8_t * __a, poly8x8x2_t val) ++vst1_s32_x4 (int32_t * __a, int32x2x4_t val) + { +- __builtin_aarch64_simd_oi __o; +- poly8x16x2_t temp; +- temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (__AARCH64_UINT64_C 
(0))); +- temp.val[1] = vcombine_p8 (val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0))); +- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1); +- __builtin_aarch64_st2v8qi ((__builtin_aarch64_simd_qi *) __a, __o); ++ union { int32x2x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; ++ __builtin_aarch64_st1x4v2si ((__builtin_aarch64_simd_si *) __a, __u.__o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst2_s16 (int16_t * __a, int16x4x2_t val) ++vst1q_s32_x4 (int32_t * __a, int32x4x4_t val) + { +- __builtin_aarch64_simd_oi __o; +- int16x8x2_t temp; +- temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (__AARCH64_INT64_C (0))); +- temp.val[1] = vcombine_s16 (val.val[1], vcreate_s16 (__AARCH64_INT64_C (0))); +- __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1); +- __builtin_aarch64_st2v4hi ((__builtin_aarch64_simd_hi *) __a, __o); ++ union { int32x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; ++ __builtin_aarch64_st1x4v4si ((__builtin_aarch64_simd_si *) __a, __u.__o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst2_p16 (poly16_t * __a, poly16x4x2_t val) ++vst1_u8_x4 (uint8_t * __a, uint8x8x4_t val) + { +- __builtin_aarch64_simd_oi __o; +- poly16x8x2_t temp; +- temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_p16 (val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0))); +- __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1); +- __builtin_aarch64_st2v4hi ((__builtin_aarch64_simd_hi *) __a, __o); ++ union { uint8x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; ++ __builtin_aarch64_st1x4v8qi ((__builtin_aarch64_simd_qi *) __a, __u.__o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst2_s32 (int32_t * __a, int32x2x2_t val) ++vst1q_u8_x4 (uint8_t * __a, uint8x16x4_t val) + { +- __builtin_aarch64_simd_oi __o; +- int32x4x2_t temp; +- temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (__AARCH64_INT64_C (0))); +- temp.val[1] = vcombine_s32 (val.val[1], vcreate_s32 (__AARCH64_INT64_C (0))); +- __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[1], 1); +- __builtin_aarch64_st2v2si ((__builtin_aarch64_simd_si *) __a, __o); ++ union { uint8x16x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; ++ __builtin_aarch64_st1x4v16qi ((__builtin_aarch64_simd_qi *) __a, __u.__o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst2_u8 (uint8_t * __a, uint8x8x2_t val) ++vst1_u16_x4 (uint16_t * __a, uint16x4x4_t val) + { +- __builtin_aarch64_simd_oi __o; +- uint8x16x2_t temp; +- temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_u8 (val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0))); +- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1); +- __builtin_aarch64_st2v8qi ((__builtin_aarch64_simd_qi *) __a, __o); ++ union { 
uint16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; ++ __builtin_aarch64_st1x4v4hi ((__builtin_aarch64_simd_hi *) __a, __u.__o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst2_u16 (uint16_t * __a, uint16x4x2_t val) ++vst1q_u16_x4 (uint16_t * __a, uint16x8x4_t val) + { +- __builtin_aarch64_simd_oi __o; +- uint16x8x2_t temp; +- temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_u16 (val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0))); +- __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1); +- __builtin_aarch64_st2v4hi ((__builtin_aarch64_simd_hi *) __a, __o); ++ union { uint16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; ++ __builtin_aarch64_st1x4v8hi ((__builtin_aarch64_simd_hi *) __a, __u.__o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst2_u32 (uint32_t * __a, uint32x2x2_t val) ++vst1_u32_x4 (uint32_t * __a, uint32x2x4_t val) + { +- __builtin_aarch64_simd_oi __o; +- uint32x4x2_t temp; +- temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_u32 (val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0))); +- __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[1], 1); +- __builtin_aarch64_st2v2si ((__builtin_aarch64_simd_si *) __a, __o); ++ union { uint32x2x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; ++ __builtin_aarch64_st1x4v2si ((__builtin_aarch64_simd_si *) __a, __u.__o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst2_f16 (float16_t * __a, float16x4x2_t val) ++vst1q_u32_x4 (uint32_t * __a, uint32x4x4_t val) + { +- __builtin_aarch64_simd_oi __o; +- float16x8x2_t temp; +- temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0))); +- __o = __builtin_aarch64_set_qregoiv8hf (__o, temp.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv8hf (__o, temp.val[1], 1); +- __builtin_aarch64_st2v4hf (__a, __o); ++ union { uint32x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; ++ __builtin_aarch64_st1x4v4si ((__builtin_aarch64_simd_si *) __a, __u.__o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst2_f32 (float32_t * __a, float32x2x2_t val) ++vst1_f16_x4 (float16_t * __a, float16x4x4_t val) + { +- __builtin_aarch64_simd_oi __o; +- float32x4x2_t temp; +- temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_f32 (val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0))); +- __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) temp.val[1], 1); +- __builtin_aarch64_st2v2sf ((__builtin_aarch64_simd_sf *) __a, __o); ++ union { float16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; ++ __builtin_aarch64_st1x4v4hf ((__builtin_aarch64_simd_hf *) __a, __u.__o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst2_p64 (poly64_t * __a, poly64x1x2_t val) ++vst1q_f16_x4 (float16_t * __a, float16x8x4_t 
val) + { +- __builtin_aarch64_simd_oi __o; +- poly64x2x2_t temp; +- temp.val[0] = vcombine_p64 (val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_p64 (val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0))); +- __o = __builtin_aarch64_set_qregoiv2di_ssps (__o, +- (poly64x2_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv2di_ssps (__o, +- (poly64x2_t) temp.val[1], 1); +- __builtin_aarch64_st2di ((__builtin_aarch64_simd_di *) __a, __o); ++ union { float16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; ++ __builtin_aarch64_st1x4v8hf ((__builtin_aarch64_simd_hf *) __a, __u.__o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst2q_s8 (int8_t * __a, int8x16x2_t val) ++vst1_f32_x4 (float32_t * __a, float32x2x4_t val) + { +- __builtin_aarch64_simd_oi __o; +- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1); +- __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o); ++ union { float32x2x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; ++ __builtin_aarch64_st1x4v2sf ((__builtin_aarch64_simd_sf *) __a, __u.__o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst2q_p8 (poly8_t * __a, poly8x16x2_t val) ++vst1q_f32_x4 (float32_t * __a, float32x4x4_t val) + { +- __builtin_aarch64_simd_oi __o; +- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1); +- __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o); ++ union { float32x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; ++ __builtin_aarch64_st1x4v4sf ((__builtin_aarch64_simd_sf *) __a, __u.__o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst2q_s16 (int16_t * __a, int16x8x2_t val) ++vst1_p8_x4 (poly8_t * __a, poly8x8x4_t val) + { +- __builtin_aarch64_simd_oi __o; +- __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1); +- __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o); ++ union { poly8x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; ++ __builtin_aarch64_st1x4v8qi ((__builtin_aarch64_simd_qi *) __a, __u.__o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst2q_p16 (poly16_t * __a, poly16x8x2_t val) ++vst1q_p8_x4 (poly8_t * __a, poly8x16x4_t val) + { +- __builtin_aarch64_simd_oi __o; +- __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1); +- __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o); ++ union { poly8x16x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; ++ __builtin_aarch64_st1x4v16qi ((__builtin_aarch64_simd_qi *) __a, __u.__o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst2q_s32 (int32_t * __a, int32x4x2_t val) ++vst1_p16_x4 (poly16_t * __a, poly16x4x4_t val) + { +- __builtin_aarch64_simd_oi __o; +- __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[1], 1); +- __builtin_aarch64_st2v4si 
((__builtin_aarch64_simd_si *) __a, __o); ++ union { poly16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; ++ __builtin_aarch64_st1x4v4hi ((__builtin_aarch64_simd_hi *) __a, __u.__o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst2q_s64 (int64_t * __a, int64x2x2_t val) ++vst1q_p16_x4 (poly16_t * __a, poly16x8x4_t val) + { +- __builtin_aarch64_simd_oi __o; +- __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[1], 1); +- __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __o); ++ union { poly16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; ++ __builtin_aarch64_st1x4v8hi ((__builtin_aarch64_simd_hi *) __a, __u.__o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst2q_u8 (uint8_t * __a, uint8x16x2_t val) ++vst1_s64_x4 (int64_t * __a, int64x1x4_t val) + { +- __builtin_aarch64_simd_oi __o; +- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1); +- __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o); ++ union { int64x1x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; ++ __builtin_aarch64_st1x4di ((__builtin_aarch64_simd_di *) __a, __u.__o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst2q_u16 (uint16_t * __a, uint16x8x2_t val) ++vst1_u64_x4 (uint64_t * __a, uint64x1x4_t val) + { +- __builtin_aarch64_simd_oi __o; +- __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1); +- __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o); ++ union { uint64x1x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; ++ __builtin_aarch64_st1x4di ((__builtin_aarch64_simd_di *) __a, __u.__o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst2q_u32 (uint32_t * __a, uint32x4x2_t val) ++vst1_p64_x4 (poly64_t * __a, poly64x1x4_t val) + { +- __builtin_aarch64_simd_oi __o; +- __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[1], 1); +- __builtin_aarch64_st2v4si ((__builtin_aarch64_simd_si *) __a, __o); ++ union { poly64x1x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; ++ __builtin_aarch64_st1x4di ((__builtin_aarch64_simd_di *) __a, __u.__o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst2q_u64 (uint64_t * __a, uint64x2x2_t val) ++vst1q_s64_x4 (int64_t * __a, int64x2x4_t val) + { +- __builtin_aarch64_simd_oi __o; +- __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[1], 1); +- __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __o); ++ union { int64x2x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; ++ __builtin_aarch64_st1x4v2di ((__builtin_aarch64_simd_di *) __a, __u.__o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst2q_f16 (float16_t * __a, float16x8x2_t val) ++vst1q_u64_x4 (uint64_t * __a, uint64x2x4_t val) + { +- __builtin_aarch64_simd_oi __o; 
+- __o = __builtin_aarch64_set_qregoiv8hf (__o, val.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv8hf (__o, val.val[1], 1); +- __builtin_aarch64_st2v8hf (__a, __o); ++ union { uint64x2x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; ++ __builtin_aarch64_st1x4v2di ((__builtin_aarch64_simd_di *) __a, __u.__o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst2q_f32 (float32_t * __a, float32x4x2_t val) ++vst1q_p64_x4 (poly64_t * __a, poly64x2x4_t val) + { +- __builtin_aarch64_simd_oi __o; +- __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) val.val[1], 1); +- __builtin_aarch64_st2v4sf ((__builtin_aarch64_simd_sf *) __a, __o); ++ union { poly64x2x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; ++ __builtin_aarch64_st1x4v2di ((__builtin_aarch64_simd_di *) __a, __u.__o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst2q_f64 (float64_t * __a, float64x2x2_t val) ++vst1_f64_x4 (float64_t * __a, float64x1x4_t val) + { +- __builtin_aarch64_simd_oi __o; +- __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) val.val[1], 1); +- __builtin_aarch64_st2v2df ((__builtin_aarch64_simd_df *) __a, __o); ++ union { float64x1x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; ++ __builtin_aarch64_st1x4df ((__builtin_aarch64_simd_df *) __a, __u.__o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst2q_p64 (poly64_t * __a, poly64x2x2_t val) ++vst1q_f64_x4 (float64_t * __a, float64x2x4_t val) + { +- __builtin_aarch64_simd_oi __o; +- __o = __builtin_aarch64_set_qregoiv2di_ssps (__o, +- (poly64x2_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregoiv2di_ssps (__o, +- (poly64x2_t) val.val[1], 1); +- __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __o); ++ union { float64x2x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; ++ __builtin_aarch64_st1x4v2df ((__builtin_aarch64_simd_df *) __a, __u.__o); + } + ++/* vstn */ ++ + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst3_s64 (int64_t * __a, int64x1x3_t val) ++vst2_s64 (int64_t * __a, int64x1x2_t __val) + { +- __builtin_aarch64_simd_ci __o; +- int64x2x3_t temp; +- temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (__AARCH64_INT64_C (0))); +- temp.val[1] = vcombine_s64 (val.val[1], vcreate_s64 (__AARCH64_INT64_C (0))); +- temp.val[2] = vcombine_s64 (val.val[2], vcreate_s64 (__AARCH64_INT64_C (0))); +- __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[1], 1); +- __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[2], 2); +- __builtin_aarch64_st3di ((__builtin_aarch64_simd_di *) __a, __o); ++ __builtin_aarch64_simd_oi __o; ++ int64x2x2_t __temp; ++ __temp.val[0] = vcombine_s64 (__val.val[0], vcreate_s64 (__AARCH64_INT64_C (0))); ++ __temp.val[1] = vcombine_s64 (__val.val[1], vcreate_s64 (__AARCH64_INT64_C (0))); ++ __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __temp.val[1], 1); ++ __builtin_aarch64_st2di ((__builtin_aarch64_simd_di *) __a, __o); + } + + __extension__ extern __inline void + 
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst3_u64 (uint64_t * __a, uint64x1x3_t val) ++vst2_u64 (uint64_t * __a, uint64x1x2_t __val) + { +- __builtin_aarch64_simd_ci __o; +- uint64x2x3_t temp; +- temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_u64 (val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0))); +- temp.val[2] = vcombine_u64 (val.val[2], vcreate_u64 (__AARCH64_UINT64_C (0))); +- __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[1], 1); +- __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[2], 2); +- __builtin_aarch64_st3di ((__builtin_aarch64_simd_di *) __a, __o); ++ __builtin_aarch64_simd_oi __o; ++ uint64x2x2_t __temp; ++ __temp.val[0] = vcombine_u64 (__val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_u64 (__val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __temp.val[1], 1); ++ __builtin_aarch64_st2di ((__builtin_aarch64_simd_di *) __a, __o); ++} ++ ++__extension__ extern __inline void ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vst2_f64 (float64_t * __a, float64x1x2_t __val) ++{ ++ __builtin_aarch64_simd_oi __o; ++ float64x2x2_t __temp; ++ __temp.val[0] = vcombine_f64 (__val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_f64 (__val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) __temp.val[1], 1); ++ __builtin_aarch64_st2df ((__builtin_aarch64_simd_df *) __a, __o); ++} ++ ++__extension__ extern __inline void ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vst2_s8 (int8_t * __a, int8x8x2_t __val) ++{ ++ __builtin_aarch64_simd_oi __o; ++ int8x16x2_t __temp; ++ __temp.val[0] = vcombine_s8 (__val.val[0], vcreate_s8 (__AARCH64_INT64_C (0))); ++ __temp.val[1] = vcombine_s8 (__val.val[1], vcreate_s8 (__AARCH64_INT64_C (0))); ++ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[1], 1); ++ __builtin_aarch64_st2v8qi ((__builtin_aarch64_simd_qi *) __a, __o); ++} ++ ++__extension__ extern __inline void ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vst2_p8 (poly8_t * __a, poly8x8x2_t __val) ++{ ++ __builtin_aarch64_simd_oi __o; ++ poly8x16x2_t __temp; ++ __temp.val[0] = vcombine_p8 (__val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_p8 (__val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[1], 1); ++ __builtin_aarch64_st2v8qi ((__builtin_aarch64_simd_qi *) __a, __o); ++} ++ ++__extension__ extern __inline void ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vst2_s16 (int16_t * __a, int16x4x2_t __val) ++{ ++ __builtin_aarch64_simd_oi __o; ++ int16x8x2_t __temp; ++ __temp.val[0] = vcombine_s16 (__val.val[0], vcreate_s16 (__AARCH64_INT64_C (0))); ++ __temp.val[1] = vcombine_s16 (__val.val[1], vcreate_s16 (__AARCH64_INT64_C (0))); ++ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) 
__temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[1], 1); ++ __builtin_aarch64_st2v4hi ((__builtin_aarch64_simd_hi *) __a, __o); ++} ++ ++__extension__ extern __inline void ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vst2_p16 (poly16_t * __a, poly16x4x2_t __val) ++{ ++ __builtin_aarch64_simd_oi __o; ++ poly16x8x2_t __temp; ++ __temp.val[0] = vcombine_p16 (__val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_p16 (__val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[1], 1); ++ __builtin_aarch64_st2v4hi ((__builtin_aarch64_simd_hi *) __a, __o); ++} ++ ++__extension__ extern __inline void ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vst2_s32 (int32_t * __a, int32x2x2_t __val) ++{ ++ __builtin_aarch64_simd_oi __o; ++ int32x4x2_t __temp; ++ __temp.val[0] = vcombine_s32 (__val.val[0], vcreate_s32 (__AARCH64_INT64_C (0))); ++ __temp.val[1] = vcombine_s32 (__val.val[1], vcreate_s32 (__AARCH64_INT64_C (0))); ++ __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __temp.val[1], 1); ++ __builtin_aarch64_st2v2si ((__builtin_aarch64_simd_si *) __a, __o); ++} ++ ++__extension__ extern __inline void ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vst2_u8 (uint8_t * __a, uint8x8x2_t __val) ++{ ++ __builtin_aarch64_simd_oi __o; ++ uint8x16x2_t __temp; ++ __temp.val[0] = vcombine_u8 (__val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_u8 (__val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[1], 1); ++ __builtin_aarch64_st2v8qi ((__builtin_aarch64_simd_qi *) __a, __o); ++} ++ ++__extension__ extern __inline void ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vst2_u16 (uint16_t * __a, uint16x4x2_t __val) ++{ ++ __builtin_aarch64_simd_oi __o; ++ uint16x8x2_t __temp; ++ __temp.val[0] = vcombine_u16 (__val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_u16 (__val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[1], 1); ++ __builtin_aarch64_st2v4hi ((__builtin_aarch64_simd_hi *) __a, __o); ++} ++ ++__extension__ extern __inline void ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vst2_u32 (uint32_t * __a, uint32x2x2_t __val) ++{ ++ __builtin_aarch64_simd_oi __o; ++ uint32x4x2_t __temp; ++ __temp.val[0] = vcombine_u32 (__val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_u32 (__val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __temp.val[1], 1); ++ __builtin_aarch64_st2v2si ((__builtin_aarch64_simd_si *) __a, __o); ++} ++ ++__extension__ extern __inline void ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vst2_f16 (float16_t * __a, float16x4x2_t __val) ++{ ++ __builtin_aarch64_simd_oi __o; ++ float16x8x2_t __temp; ++ __temp.val[0] = vcombine_f16 
(__val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_f16 (__val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregoiv8hf (__o, __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv8hf (__o, __temp.val[1], 1); ++ __builtin_aarch64_st2v4hf (__a, __o); ++} ++ ++__extension__ extern __inline void ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vst2_f32 (float32_t * __a, float32x2x2_t __val) ++{ ++ __builtin_aarch64_simd_oi __o; ++ float32x4x2_t __temp; ++ __temp.val[0] = vcombine_f32 (__val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_f32 (__val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) __temp.val[1], 1); ++ __builtin_aarch64_st2v2sf ((__builtin_aarch64_simd_sf *) __a, __o); ++} ++ ++__extension__ extern __inline void ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vst2_p64 (poly64_t * __a, poly64x1x2_t __val) ++{ ++ __builtin_aarch64_simd_oi __o; ++ poly64x2x2_t __temp; ++ __temp.val[0] = vcombine_p64 (__val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_p64 (__val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregoiv2di_ssps (__o, ++ (poly64x2_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv2di_ssps (__o, ++ (poly64x2_t) __temp.val[1], 1); ++ __builtin_aarch64_st2di ((__builtin_aarch64_simd_di *) __a, __o); ++} ++ ++__extension__ extern __inline void ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vst2q_s8 (int8_t * __a, int8x16x2_t __val) ++{ ++ __builtin_aarch64_simd_oi __o; ++ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __val.val[1], 1); ++ __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o); ++} ++ ++__extension__ extern __inline void ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vst2q_p8 (poly8_t * __a, poly8x16x2_t __val) ++{ ++ __builtin_aarch64_simd_oi __o; ++ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __val.val[1], 1); ++ __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o); ++} ++ ++__extension__ extern __inline void ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vst2q_s16 (int16_t * __a, int16x8x2_t __val) ++{ ++ __builtin_aarch64_simd_oi __o; ++ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __val.val[1], 1); ++ __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o); ++} ++ ++__extension__ extern __inline void ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vst2q_p16 (poly16_t * __a, poly16x8x2_t __val) ++{ ++ __builtin_aarch64_simd_oi __o; ++ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __val.val[1], 1); ++ __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o); ++} ++ ++__extension__ extern __inline void ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vst2q_s32 (int32_t * __a, int32x4x2_t __val) ++{ ++ __builtin_aarch64_simd_oi __o; ++ __o = __builtin_aarch64_set_qregoiv4si (__o, 
(int32x4_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __val.val[1], 1); ++ __builtin_aarch64_st2v4si ((__builtin_aarch64_simd_si *) __a, __o); ++} ++ ++__extension__ extern __inline void ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vst2q_s64 (int64_t * __a, int64x2x2_t __val) ++{ ++ __builtin_aarch64_simd_oi __o; ++ __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __val.val[1], 1); ++ __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __o); ++} ++ ++__extension__ extern __inline void ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vst2q_u8 (uint8_t * __a, uint8x16x2_t __val) ++{ ++ __builtin_aarch64_simd_oi __o; ++ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __val.val[1], 1); ++ __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o); ++} ++ ++__extension__ extern __inline void ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vst2q_u16 (uint16_t * __a, uint16x8x2_t __val) ++{ ++ __builtin_aarch64_simd_oi __o; ++ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __val.val[1], 1); ++ __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o); ++} ++ ++__extension__ extern __inline void ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vst2q_u32 (uint32_t * __a, uint32x4x2_t __val) ++{ ++ __builtin_aarch64_simd_oi __o; ++ __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __val.val[1], 1); ++ __builtin_aarch64_st2v4si ((__builtin_aarch64_simd_si *) __a, __o); ++} ++ ++__extension__ extern __inline void ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vst2q_u64 (uint64_t * __a, uint64x2x2_t __val) ++{ ++ __builtin_aarch64_simd_oi __o; ++ __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __val.val[1], 1); ++ __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __o); ++} ++ ++__extension__ extern __inline void ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vst2q_f16 (float16_t * __a, float16x8x2_t __val) ++{ ++ __builtin_aarch64_simd_oi __o; ++ __o = __builtin_aarch64_set_qregoiv8hf (__o, __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv8hf (__o, __val.val[1], 1); ++ __builtin_aarch64_st2v8hf (__a, __o); ++} ++ ++__extension__ extern __inline void ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vst2q_f32 (float32_t * __a, float32x4x2_t __val) ++{ ++ __builtin_aarch64_simd_oi __o; ++ __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) __val.val[1], 1); ++ __builtin_aarch64_st2v4sf ((__builtin_aarch64_simd_sf *) __a, __o); ++} ++ ++__extension__ extern __inline void ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vst2q_f64 (float64_t * __a, float64x2x2_t __val) ++{ ++ __builtin_aarch64_simd_oi __o; ++ __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) __val.val[1], 1); ++ __builtin_aarch64_st2v2df ((__builtin_aarch64_simd_df *) 
__a, __o); ++} ++ ++__extension__ extern __inline void ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vst2q_p64 (poly64_t * __a, poly64x2x2_t __val) ++{ ++ __builtin_aarch64_simd_oi __o; ++ __o = __builtin_aarch64_set_qregoiv2di_ssps (__o, ++ (poly64x2_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv2di_ssps (__o, ++ (poly64x2_t) __val.val[1], 1); ++ __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __o); ++} ++ ++__extension__ extern __inline void ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vst3_s64 (int64_t * __a, int64x1x3_t __val) ++{ ++ __builtin_aarch64_simd_ci __o; ++ int64x2x3_t __temp; ++ __temp.val[0] = vcombine_s64 (__val.val[0], vcreate_s64 (__AARCH64_INT64_C (0))); ++ __temp.val[1] = vcombine_s64 (__val.val[1], vcreate_s64 (__AARCH64_INT64_C (0))); ++ __temp.val[2] = vcombine_s64 (__val.val[2], vcreate_s64 (__AARCH64_INT64_C (0))); ++ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[2], 2); ++ __builtin_aarch64_st3di ((__builtin_aarch64_simd_di *) __a, __o); ++} ++ ++__extension__ extern __inline void ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vst3_u64 (uint64_t * __a, uint64x1x3_t __val) ++{ ++ __builtin_aarch64_simd_ci __o; ++ uint64x2x3_t __temp; ++ __temp.val[0] = vcombine_u64 (__val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_u64 (__val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0))); ++ __temp.val[2] = vcombine_u64 (__val.val[2], vcreate_u64 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[2], 2); ++ __builtin_aarch64_st3di ((__builtin_aarch64_simd_di *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst3_f64 (float64_t * __a, float64x1x3_t val) ++vst3_f64 (float64_t * __a, float64x1x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- float64x2x3_t temp; +- temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_f64 (val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0))); +- temp.val[2] = vcombine_f64 (val.val[2], vcreate_f64 (__AARCH64_UINT64_C (0))); +- __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[1], 1); +- __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[2], 2); ++ float64x2x3_t __temp; ++ __temp.val[0] = vcombine_f64 (__val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_f64 (__val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0))); ++ __temp.val[2] = vcombine_f64 (__val.val[2], vcreate_f64 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __temp.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __temp.val[2], 2); + __builtin_aarch64_st3df ((__builtin_aarch64_simd_df *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst3_s8 (int8_t * __a, int8x8x3_t val) ++vst3_s8 (int8_t * __a, 
int8x8x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- int8x16x3_t temp; +- temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (__AARCH64_INT64_C (0))); +- temp.val[1] = vcombine_s8 (val.val[1], vcreate_s8 (__AARCH64_INT64_C (0))); +- temp.val[2] = vcombine_s8 (val.val[2], vcreate_s8 (__AARCH64_INT64_C (0))); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2); ++ int8x16x3_t __temp; ++ __temp.val[0] = vcombine_s8 (__val.val[0], vcreate_s8 (__AARCH64_INT64_C (0))); ++ __temp.val[1] = vcombine_s8 (__val.val[1], vcreate_s8 (__AARCH64_INT64_C (0))); ++ __temp.val[2] = vcombine_s8 (__val.val[2], vcreate_s8 (__AARCH64_INT64_C (0))); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[2], 2); + __builtin_aarch64_st3v8qi ((__builtin_aarch64_simd_qi *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst3_p8 (poly8_t * __a, poly8x8x3_t val) ++vst3_p8 (poly8_t * __a, poly8x8x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- poly8x16x3_t temp; +- temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_p8 (val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0))); +- temp.val[2] = vcombine_p8 (val.val[2], vcreate_p8 (__AARCH64_UINT64_C (0))); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2); ++ poly8x16x3_t __temp; ++ __temp.val[0] = vcombine_p8 (__val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_p8 (__val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0))); ++ __temp.val[2] = vcombine_p8 (__val.val[2], vcreate_p8 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[2], 2); + __builtin_aarch64_st3v8qi ((__builtin_aarch64_simd_qi *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst3_s16 (int16_t * __a, int16x4x3_t val) ++vst3_s16 (int16_t * __a, int16x4x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- int16x8x3_t temp; +- temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (__AARCH64_INT64_C (0))); +- temp.val[1] = vcombine_s16 (val.val[1], vcreate_s16 (__AARCH64_INT64_C (0))); +- temp.val[2] = vcombine_s16 (val.val[2], vcreate_s16 (__AARCH64_INT64_C (0))); +- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1); +- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2); ++ int16x8x3_t __temp; ++ __temp.val[0] = vcombine_s16 (__val.val[0], vcreate_s16 (__AARCH64_INT64_C (0))); ++ __temp.val[1] = vcombine_s16 (__val.val[1], vcreate_s16 (__AARCH64_INT64_C (0))); ++ __temp.val[2] = vcombine_s16 (__val.val[2], vcreate_s16 (__AARCH64_INT64_C (0))); ++ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[0], 0); ++ __o = 
__builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[2], 2); + __builtin_aarch64_st3v4hi ((__builtin_aarch64_simd_hi *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst3_p16 (poly16_t * __a, poly16x4x3_t val) ++vst3_p16 (poly16_t * __a, poly16x4x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- poly16x8x3_t temp; +- temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_p16 (val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0))); +- temp.val[2] = vcombine_p16 (val.val[2], vcreate_p16 (__AARCH64_UINT64_C (0))); +- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1); +- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2); ++ poly16x8x3_t __temp; ++ __temp.val[0] = vcombine_p16 (__val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_p16 (__val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0))); ++ __temp.val[2] = vcombine_p16 (__val.val[2], vcreate_p16 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[2], 2); + __builtin_aarch64_st3v4hi ((__builtin_aarch64_simd_hi *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst3_s32 (int32_t * __a, int32x2x3_t val) ++vst3_s32 (int32_t * __a, int32x2x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- int32x4x3_t temp; +- temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (__AARCH64_INT64_C (0))); +- temp.val[1] = vcombine_s32 (val.val[1], vcreate_s32 (__AARCH64_INT64_C (0))); +- temp.val[2] = vcombine_s32 (val.val[2], vcreate_s32 (__AARCH64_INT64_C (0))); +- __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[1], 1); +- __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[2], 2); ++ int32x4x3_t __temp; ++ __temp.val[0] = vcombine_s32 (__val.val[0], vcreate_s32 (__AARCH64_INT64_C (0))); ++ __temp.val[1] = vcombine_s32 (__val.val[1], vcreate_s32 (__AARCH64_INT64_C (0))); ++ __temp.val[2] = vcombine_s32 (__val.val[2], vcreate_s32 (__AARCH64_INT64_C (0))); ++ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[2], 2); + __builtin_aarch64_st3v2si ((__builtin_aarch64_simd_si *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst3_u8 (uint8_t * __a, uint8x8x3_t val) ++vst3_u8 (uint8_t * __a, uint8x8x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- uint8x16x3_t temp; +- temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_u8 (val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0))); +- temp.val[2] = vcombine_u8 (val.val[2], vcreate_u8 (__AARCH64_UINT64_C (0))); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1); +- __o = 
__builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2); ++ uint8x16x3_t __temp; ++ __temp.val[0] = vcombine_u8 (__val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_u8 (__val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0))); ++ __temp.val[2] = vcombine_u8 (__val.val[2], vcreate_u8 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[2], 2); + __builtin_aarch64_st3v8qi ((__builtin_aarch64_simd_qi *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst3_u16 (uint16_t * __a, uint16x4x3_t val) ++vst3_u16 (uint16_t * __a, uint16x4x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- uint16x8x3_t temp; +- temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_u16 (val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0))); +- temp.val[2] = vcombine_u16 (val.val[2], vcreate_u16 (__AARCH64_UINT64_C (0))); +- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1); +- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2); ++ uint16x8x3_t __temp; ++ __temp.val[0] = vcombine_u16 (__val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_u16 (__val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0))); ++ __temp.val[2] = vcombine_u16 (__val.val[2], vcreate_u16 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[2], 2); + __builtin_aarch64_st3v4hi ((__builtin_aarch64_simd_hi *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst3_u32 (uint32_t * __a, uint32x2x3_t val) ++vst3_u32 (uint32_t * __a, uint32x2x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- uint32x4x3_t temp; +- temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_u32 (val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0))); +- temp.val[2] = vcombine_u32 (val.val[2], vcreate_u32 (__AARCH64_UINT64_C (0))); +- __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[1], 1); +- __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[2], 2); ++ uint32x4x3_t __temp; ++ __temp.val[0] = vcombine_u32 (__val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_u32 (__val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0))); ++ __temp.val[2] = vcombine_u32 (__val.val[2], vcreate_u32 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[2], 2); + __builtin_aarch64_st3v2si ((__builtin_aarch64_simd_si *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst3_f16 (float16_t * __a, float16x4x3_t val) ++vst3_f16 (float16_t * __a, float16x4x3_t __val) + { + 
__builtin_aarch64_simd_ci __o; +- float16x8x3_t temp; +- temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0))); +- temp.val[2] = vcombine_f16 (val.val[2], vcreate_f16 (__AARCH64_UINT64_C (0))); +- __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[1], 1); +- __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[2], 2); ++ float16x8x3_t __temp; ++ __temp.val[0] = vcombine_f16 (__val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_f16 (__val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0))); ++ __temp.val[2] = vcombine_f16 (__val.val[2], vcreate_f16 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __temp.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __temp.val[2], 2); + __builtin_aarch64_st3v4hf ((__builtin_aarch64_simd_hf *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst3_f32 (float32_t * __a, float32x2x3_t val) ++vst3_f32 (float32_t * __a, float32x2x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- float32x4x3_t temp; +- temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_f32 (val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0))); +- temp.val[2] = vcombine_f32 (val.val[2], vcreate_f32 (__AARCH64_UINT64_C (0))); +- __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[1], 1); +- __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[2], 2); ++ float32x4x3_t __temp; ++ __temp.val[0] = vcombine_f32 (__val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_f32 (__val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0))); ++ __temp.val[2] = vcombine_f32 (__val.val[2], vcreate_f32 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __temp.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __temp.val[2], 2); + __builtin_aarch64_st3v2sf ((__builtin_aarch64_simd_sf *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst3_p64 (poly64_t * __a, poly64x1x3_t val) ++vst3_p64 (poly64_t * __a, poly64x1x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- poly64x2x3_t temp; +- temp.val[0] = vcombine_p64 (val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_p64 (val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0))); +- temp.val[2] = vcombine_p64 (val.val[2], vcreate_p64 (__AARCH64_UINT64_C (0))); ++ poly64x2x3_t __temp; ++ __temp.val[0] = vcombine_p64 (__val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_p64 (__val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0))); ++ __temp.val[2] = vcombine_p64 (__val.val[2], vcreate_p64 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv2di_ssps (__o, +- (poly64x2_t) temp.val[0], 0); ++ (poly64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv2di_ssps (__o, +- (poly64x2_t) temp.val[1], 1); ++ (poly64x2_t) __temp.val[1], 1); + __o = 
__builtin_aarch64_set_qregciv2di_ssps (__o, +- (poly64x2_t) temp.val[2], 2); ++ (poly64x2_t) __temp.val[2], 2); + __builtin_aarch64_st3di ((__builtin_aarch64_simd_di *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst3q_s8 (int8_t * __a, int8x16x3_t val) ++vst3q_s8 (int8_t * __a, int8x16x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[2], 2); + __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst3q_p8 (poly8_t * __a, poly8x16x3_t val) ++vst3q_p8 (poly8_t * __a, poly8x16x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[2], 2); + __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst3q_s16 (int16_t * __a, int16x8x3_t val) ++vst3q_s16 (int16_t * __a, int16x8x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1); +- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2); ++ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[2], 2); + __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst3q_p16 (poly16_t * __a, poly16x8x3_t val) ++vst3q_p16 (poly16_t * __a, poly16x8x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1); +- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2); ++ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[2], 2); + __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst3q_s32 (int32_t * __a, int32x4x3_t val) ++vst3q_s32 (int32_t * __a, int32x4x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- __o = 
__builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[1], 1); +- __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[2], 2); ++ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __val.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __val.val[2], 2); + __builtin_aarch64_st3v4si ((__builtin_aarch64_simd_si *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst3q_s64 (int64_t * __a, int64x2x3_t val) ++vst3q_s64 (int64_t * __a, int64x2x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[1], 1); +- __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[2], 2); ++ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __val.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __val.val[2], 2); + __builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst3q_u8 (uint8_t * __a, uint8x16x3_t val) ++vst3q_u8 (uint8_t * __a, uint8x16x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1); +- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[2], 2); + __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst3q_u16 (uint16_t * __a, uint16x8x3_t val) ++vst3q_u16 (uint16_t * __a, uint16x8x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1); +- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2); ++ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[2], 2); + __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst3q_u32 (uint32_t * __a, uint32x4x3_t val) ++vst3q_u32 (uint32_t * __a, uint32x4x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[1], 1); +- __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[2], 2); ++ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __val.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) 
__val.val[2], 2); + __builtin_aarch64_st3v4si ((__builtin_aarch64_simd_si *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst3q_u64 (uint64_t * __a, uint64x2x3_t val) ++vst3q_u64 (uint64_t * __a, uint64x2x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[1], 1); +- __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[2], 2); ++ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __val.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __val.val[2], 2); + __builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst3q_f16 (float16_t * __a, float16x8x3_t val) ++vst3q_f16 (float16_t * __a, float16x8x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[1], 1); +- __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[2], 2); ++ __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __val.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __val.val[2], 2); + __builtin_aarch64_st3v8hf ((__builtin_aarch64_simd_hf *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst3q_f32 (float32_t * __a, float32x4x3_t val) ++vst3q_f32 (float32_t * __a, float32x4x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[1], 1); +- __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[2], 2); ++ __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __val.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __val.val[2], 2); + __builtin_aarch64_st3v4sf ((__builtin_aarch64_simd_sf *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst3q_f64 (float64_t * __a, float64x2x3_t val) ++vst3q_f64 (float64_t * __a, float64x2x3_t __val) + { + __builtin_aarch64_simd_ci __o; +- __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[1], 1); +- __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[2], 2); ++ __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __val.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __val.val[2], 2); + __builtin_aarch64_st3v2df ((__builtin_aarch64_simd_df *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst3q_p64 (poly64_t * __a, poly64x2x3_t val) ++vst3q_p64 (poly64_t * __a, poly64x2x3_t __val) + { + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv2di_ssps (__o, +- 
(poly64x2_t) val.val[0], 0); ++ (poly64x2_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv2di_ssps (__o, +- (poly64x2_t) val.val[1], 1); ++ (poly64x2_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv2di_ssps (__o, +- (poly64x2_t) val.val[2], 2); ++ (poly64x2_t) __val.val[2], 2); + __builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst4_s64 (int64_t * __a, int64x1x4_t val) ++vst4_s64 (int64_t * __a, int64x1x4_t __val) + { + __builtin_aarch64_simd_xi __o; +- int64x2x4_t temp; +- temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (__AARCH64_INT64_C (0))); +- temp.val[1] = vcombine_s64 (val.val[1], vcreate_s64 (__AARCH64_INT64_C (0))); +- temp.val[2] = vcombine_s64 (val.val[2], vcreate_s64 (__AARCH64_INT64_C (0))); +- temp.val[3] = vcombine_s64 (val.val[3], vcreate_s64 (__AARCH64_INT64_C (0))); +- __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[1], 1); +- __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[2], 2); +- __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[3], 3); ++ int64x2x4_t __temp; ++ __temp.val[0] = vcombine_s64 (__val.val[0], vcreate_s64 (__AARCH64_INT64_C (0))); ++ __temp.val[1] = vcombine_s64 (__val.val[1], vcreate_s64 (__AARCH64_INT64_C (0))); ++ __temp.val[2] = vcombine_s64 (__val.val[2], vcreate_s64 (__AARCH64_INT64_C (0))); ++ __temp.val[3] = vcombine_s64 (__val.val[3], vcreate_s64 (__AARCH64_INT64_C (0))); ++ __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[1], 1); ++ __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[2], 2); ++ __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[3], 3); + __builtin_aarch64_st4di ((__builtin_aarch64_simd_di *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst4_u64 (uint64_t * __a, uint64x1x4_t val) ++vst4_u64 (uint64_t * __a, uint64x1x4_t __val) + { + __builtin_aarch64_simd_xi __o; +- uint64x2x4_t temp; +- temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_u64 (val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0))); +- temp.val[2] = vcombine_u64 (val.val[2], vcreate_u64 (__AARCH64_UINT64_C (0))); +- temp.val[3] = vcombine_u64 (val.val[3], vcreate_u64 (__AARCH64_UINT64_C (0))); +- __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[1], 1); +- __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[2], 2); +- __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[3], 3); ++ uint64x2x4_t __temp; ++ __temp.val[0] = vcombine_u64 (__val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_u64 (__val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0))); ++ __temp.val[2] = vcombine_u64 (__val.val[2], vcreate_u64 (__AARCH64_UINT64_C (0))); ++ __temp.val[3] = vcombine_u64 (__val.val[3], vcreate_u64 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[1], 1); ++ __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[2], 2); ++ __o = 
__builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[3], 3); + __builtin_aarch64_st4di ((__builtin_aarch64_simd_di *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst4_f64 (float64_t * __a, float64x1x4_t val) ++vst4_f64 (float64_t * __a, float64x1x4_t __val) + { + __builtin_aarch64_simd_xi __o; +- float64x2x4_t temp; +- temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_f64 (val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0))); +- temp.val[2] = vcombine_f64 (val.val[2], vcreate_f64 (__AARCH64_UINT64_C (0))); +- temp.val[3] = vcombine_f64 (val.val[3], vcreate_f64 (__AARCH64_UINT64_C (0))); +- __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) temp.val[1], 1); +- __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) temp.val[2], 2); +- __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) temp.val[3], 3); ++ float64x2x4_t __temp; ++ __temp.val[0] = vcombine_f64 (__val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_f64 (__val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0))); ++ __temp.val[2] = vcombine_f64 (__val.val[2], vcreate_f64 (__AARCH64_UINT64_C (0))); ++ __temp.val[3] = vcombine_f64 (__val.val[3], vcreate_f64 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) __temp.val[1], 1); ++ __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) __temp.val[2], 2); ++ __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) __temp.val[3], 3); + __builtin_aarch64_st4df ((__builtin_aarch64_simd_df *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst4_s8 (int8_t * __a, int8x8x4_t val) ++vst4_s8 (int8_t * __a, int8x8x4_t __val) + { + __builtin_aarch64_simd_xi __o; +- int8x16x4_t temp; +- temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (__AARCH64_INT64_C (0))); +- temp.val[1] = vcombine_s8 (val.val[1], vcreate_s8 (__AARCH64_INT64_C (0))); +- temp.val[2] = vcombine_s8 (val.val[2], vcreate_s8 (__AARCH64_INT64_C (0))); +- temp.val[3] = vcombine_s8 (val.val[3], vcreate_s8 (__AARCH64_INT64_C (0))); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[1], 1); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[2], 2); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[3], 3); ++ int8x16x4_t __temp; ++ __temp.val[0] = vcombine_s8 (__val.val[0], vcreate_s8 (__AARCH64_INT64_C (0))); ++ __temp.val[1] = vcombine_s8 (__val.val[1], vcreate_s8 (__AARCH64_INT64_C (0))); ++ __temp.val[2] = vcombine_s8 (__val.val[2], vcreate_s8 (__AARCH64_INT64_C (0))); ++ __temp.val[3] = vcombine_s8 (__val.val[3], vcreate_s8 (__AARCH64_INT64_C (0))); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[1], 1); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[2], 2); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[3], 3); + __builtin_aarch64_st4v8qi ((__builtin_aarch64_simd_qi *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) 
+-vst4_p8 (poly8_t * __a, poly8x8x4_t val) ++vst4_p8 (poly8_t * __a, poly8x8x4_t __val) + { + __builtin_aarch64_simd_xi __o; +- poly8x16x4_t temp; +- temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_p8 (val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0))); +- temp.val[2] = vcombine_p8 (val.val[2], vcreate_p8 (__AARCH64_UINT64_C (0))); +- temp.val[3] = vcombine_p8 (val.val[3], vcreate_p8 (__AARCH64_UINT64_C (0))); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[1], 1); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[2], 2); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[3], 3); ++ poly8x16x4_t __temp; ++ __temp.val[0] = vcombine_p8 (__val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_p8 (__val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0))); ++ __temp.val[2] = vcombine_p8 (__val.val[2], vcreate_p8 (__AARCH64_UINT64_C (0))); ++ __temp.val[3] = vcombine_p8 (__val.val[3], vcreate_p8 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[1], 1); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[2], 2); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[3], 3); + __builtin_aarch64_st4v8qi ((__builtin_aarch64_simd_qi *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst4_s16 (int16_t * __a, int16x4x4_t val) ++vst4_s16 (int16_t * __a, int16x4x4_t __val) + { + __builtin_aarch64_simd_xi __o; +- int16x8x4_t temp; +- temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (__AARCH64_INT64_C (0))); +- temp.val[1] = vcombine_s16 (val.val[1], vcreate_s16 (__AARCH64_INT64_C (0))); +- temp.val[2] = vcombine_s16 (val.val[2], vcreate_s16 (__AARCH64_INT64_C (0))); +- temp.val[3] = vcombine_s16 (val.val[3], vcreate_s16 (__AARCH64_INT64_C (0))); +- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[1], 1); +- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[2], 2); +- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[3], 3); ++ int16x8x4_t __temp; ++ __temp.val[0] = vcombine_s16 (__val.val[0], vcreate_s16 (__AARCH64_INT64_C (0))); ++ __temp.val[1] = vcombine_s16 (__val.val[1], vcreate_s16 (__AARCH64_INT64_C (0))); ++ __temp.val[2] = vcombine_s16 (__val.val[2], vcreate_s16 (__AARCH64_INT64_C (0))); ++ __temp.val[3] = vcombine_s16 (__val.val[3], vcreate_s16 (__AARCH64_INT64_C (0))); ++ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[1], 1); ++ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[2], 2); ++ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[3], 3); + __builtin_aarch64_st4v4hi ((__builtin_aarch64_simd_hi *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst4_p16 (poly16_t * __a, poly16x4x4_t val) ++vst4_p16 (poly16_t * __a, poly16x4x4_t __val) + { + __builtin_aarch64_simd_xi __o; +- poly16x8x4_t temp; +- temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_p16 
(val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0))); +- temp.val[2] = vcombine_p16 (val.val[2], vcreate_p16 (__AARCH64_UINT64_C (0))); +- temp.val[3] = vcombine_p16 (val.val[3], vcreate_p16 (__AARCH64_UINT64_C (0))); +- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[1], 1); +- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[2], 2); +- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[3], 3); ++ poly16x8x4_t __temp; ++ __temp.val[0] = vcombine_p16 (__val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_p16 (__val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0))); ++ __temp.val[2] = vcombine_p16 (__val.val[2], vcreate_p16 (__AARCH64_UINT64_C (0))); ++ __temp.val[3] = vcombine_p16 (__val.val[3], vcreate_p16 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[1], 1); ++ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[2], 2); ++ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[3], 3); + __builtin_aarch64_st4v4hi ((__builtin_aarch64_simd_hi *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst4_s32 (int32_t * __a, int32x2x4_t val) ++vst4_s32 (int32_t * __a, int32x2x4_t __val) + { + __builtin_aarch64_simd_xi __o; +- int32x4x4_t temp; +- temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (__AARCH64_INT64_C (0))); +- temp.val[1] = vcombine_s32 (val.val[1], vcreate_s32 (__AARCH64_INT64_C (0))); +- temp.val[2] = vcombine_s32 (val.val[2], vcreate_s32 (__AARCH64_INT64_C (0))); +- temp.val[3] = vcombine_s32 (val.val[3], vcreate_s32 (__AARCH64_INT64_C (0))); +- __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[1], 1); +- __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[2], 2); +- __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[3], 3); ++ int32x4x4_t __temp; ++ __temp.val[0] = vcombine_s32 (__val.val[0], vcreate_s32 (__AARCH64_INT64_C (0))); ++ __temp.val[1] = vcombine_s32 (__val.val[1], vcreate_s32 (__AARCH64_INT64_C (0))); ++ __temp.val[2] = vcombine_s32 (__val.val[2], vcreate_s32 (__AARCH64_INT64_C (0))); ++ __temp.val[3] = vcombine_s32 (__val.val[3], vcreate_s32 (__AARCH64_INT64_C (0))); ++ __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[1], 1); ++ __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[2], 2); ++ __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[3], 3); + __builtin_aarch64_st4v2si ((__builtin_aarch64_simd_si *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst4_u8 (uint8_t * __a, uint8x8x4_t val) ++vst4_u8 (uint8_t * __a, uint8x8x4_t __val) + { + __builtin_aarch64_simd_xi __o; +- uint8x16x4_t temp; +- temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_u8 (val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0))); +- temp.val[2] = vcombine_u8 (val.val[2], vcreate_u8 (__AARCH64_UINT64_C (0))); +- temp.val[3] = vcombine_u8 (val.val[3], vcreate_u8 (__AARCH64_UINT64_C (0))); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, 
(int8x16_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[1], 1); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[2], 2); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[3], 3); ++ uint8x16x4_t __temp; ++ __temp.val[0] = vcombine_u8 (__val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_u8 (__val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0))); ++ __temp.val[2] = vcombine_u8 (__val.val[2], vcreate_u8 (__AARCH64_UINT64_C (0))); ++ __temp.val[3] = vcombine_u8 (__val.val[3], vcreate_u8 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[1], 1); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[2], 2); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[3], 3); + __builtin_aarch64_st4v8qi ((__builtin_aarch64_simd_qi *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst4_u16 (uint16_t * __a, uint16x4x4_t val) ++vst4_u16 (uint16_t * __a, uint16x4x4_t __val) + { + __builtin_aarch64_simd_xi __o; +- uint16x8x4_t temp; +- temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_u16 (val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0))); +- temp.val[2] = vcombine_u16 (val.val[2], vcreate_u16 (__AARCH64_UINT64_C (0))); +- temp.val[3] = vcombine_u16 (val.val[3], vcreate_u16 (__AARCH64_UINT64_C (0))); +- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[1], 1); +- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[2], 2); +- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[3], 3); ++ uint16x8x4_t __temp; ++ __temp.val[0] = vcombine_u16 (__val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_u16 (__val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0))); ++ __temp.val[2] = vcombine_u16 (__val.val[2], vcreate_u16 (__AARCH64_UINT64_C (0))); ++ __temp.val[3] = vcombine_u16 (__val.val[3], vcreate_u16 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[1], 1); ++ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[2], 2); ++ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[3], 3); + __builtin_aarch64_st4v4hi ((__builtin_aarch64_simd_hi *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst4_u32 (uint32_t * __a, uint32x2x4_t val) ++vst4_u32 (uint32_t * __a, uint32x2x4_t __val) + { + __builtin_aarch64_simd_xi __o; +- uint32x4x4_t temp; +- temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_u32 (val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0))); +- temp.val[2] = vcombine_u32 (val.val[2], vcreate_u32 (__AARCH64_UINT64_C (0))); +- temp.val[3] = vcombine_u32 (val.val[3], vcreate_u32 (__AARCH64_UINT64_C (0))); +- __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[1], 1); +- __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[2], 2); +- __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) 
temp.val[3], 3); ++ uint32x4x4_t __temp; ++ __temp.val[0] = vcombine_u32 (__val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_u32 (__val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0))); ++ __temp.val[2] = vcombine_u32 (__val.val[2], vcreate_u32 (__AARCH64_UINT64_C (0))); ++ __temp.val[3] = vcombine_u32 (__val.val[3], vcreate_u32 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[1], 1); ++ __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[2], 2); ++ __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[3], 3); + __builtin_aarch64_st4v2si ((__builtin_aarch64_simd_si *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst4_f16 (float16_t * __a, float16x4x4_t val) ++vst4_f16 (float16_t * __a, float16x4x4_t __val) + { + __builtin_aarch64_simd_xi __o; +- float16x8x4_t temp; +- temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0))); +- temp.val[2] = vcombine_f16 (val.val[2], vcreate_f16 (__AARCH64_UINT64_C (0))); +- temp.val[3] = vcombine_f16 (val.val[3], vcreate_f16 (__AARCH64_UINT64_C (0))); +- __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) temp.val[1], 1); +- __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) temp.val[2], 2); +- __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) temp.val[3], 3); ++ float16x8x4_t __temp; ++ __temp.val[0] = vcombine_f16 (__val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_f16 (__val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0))); ++ __temp.val[2] = vcombine_f16 (__val.val[2], vcreate_f16 (__AARCH64_UINT64_C (0))); ++ __temp.val[3] = vcombine_f16 (__val.val[3], vcreate_f16 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) __temp.val[1], 1); ++ __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) __temp.val[2], 2); ++ __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) __temp.val[3], 3); + __builtin_aarch64_st4v4hf ((__builtin_aarch64_simd_hf *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst4_f32 (float32_t * __a, float32x2x4_t val) ++vst4_f32 (float32_t * __a, float32x2x4_t __val) + { + __builtin_aarch64_simd_xi __o; +- float32x4x4_t temp; +- temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_f32 (val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0))); +- temp.val[2] = vcombine_f32 (val.val[2], vcreate_f32 (__AARCH64_UINT64_C (0))); +- temp.val[3] = vcombine_f32 (val.val[3], vcreate_f32 (__AARCH64_UINT64_C (0))); +- __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) temp.val[0], 0); +- __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) temp.val[1], 1); +- __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) temp.val[2], 2); +- __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) temp.val[3], 3); ++ float32x4x4_t __temp; ++ __temp.val[0] = vcombine_f32 (__val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_f32 (__val.val[1], vcreate_f32 (__AARCH64_UINT64_C 
(0))); ++ __temp.val[2] = vcombine_f32 (__val.val[2], vcreate_f32 (__AARCH64_UINT64_C (0))); ++ __temp.val[3] = vcombine_f32 (__val.val[3], vcreate_f32 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) __temp.val[1], 1); ++ __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) __temp.val[2], 2); ++ __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) __temp.val[3], 3); + __builtin_aarch64_st4v2sf ((__builtin_aarch64_simd_sf *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst4_p64 (poly64_t * __a, poly64x1x4_t val) ++vst4_p64 (poly64_t * __a, poly64x1x4_t __val) + { + __builtin_aarch64_simd_xi __o; +- poly64x2x4_t temp; +- temp.val[0] = vcombine_p64 (val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0))); +- temp.val[1] = vcombine_p64 (val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0))); +- temp.val[2] = vcombine_p64 (val.val[2], vcreate_p64 (__AARCH64_UINT64_C (0))); +- temp.val[3] = vcombine_p64 (val.val[3], vcreate_p64 (__AARCH64_UINT64_C (0))); ++ poly64x2x4_t __temp; ++ __temp.val[0] = vcombine_p64 (__val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_p64 (__val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0))); ++ __temp.val[2] = vcombine_p64 (__val.val[2], vcreate_p64 (__AARCH64_UINT64_C (0))); ++ __temp.val[3] = vcombine_p64 (__val.val[3], vcreate_p64 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, +- (poly64x2_t) temp.val[0], 0); ++ (poly64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, +- (poly64x2_t) temp.val[1], 1); ++ (poly64x2_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, +- (poly64x2_t) temp.val[2], 2); ++ (poly64x2_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, +- (poly64x2_t) temp.val[3], 3); ++ (poly64x2_t) __temp.val[3], 3); + __builtin_aarch64_st4di ((__builtin_aarch64_simd_di *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst4q_s8 (int8_t * __a, int8x16x4_t val) ++vst4q_s8 (int8_t * __a, int8x16x4_t __val) + { + __builtin_aarch64_simd_xi __o; +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[1], 1); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[2], 2); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[3], 3); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __val.val[1], 1); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __val.val[2], 2); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __val.val[3], 3); + __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst4q_p8 (poly8_t * __a, poly8x16x4_t val) ++vst4q_p8 (poly8_t * __a, poly8x16x4_t __val) + { + __builtin_aarch64_simd_xi __o; +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[1], 1); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[2], 2); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) 
val.val[3], 3); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __val.val[1], 1); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __val.val[2], 2); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __val.val[3], 3); + __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst4q_s16 (int16_t * __a, int16x8x4_t val) ++vst4q_s16 (int16_t * __a, int16x8x4_t __val) + { + __builtin_aarch64_simd_xi __o; +- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[1], 1); +- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[2], 2); +- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[3], 3); ++ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __val.val[1], 1); ++ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __val.val[2], 2); ++ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __val.val[3], 3); + __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst4q_p16 (poly16_t * __a, poly16x8x4_t val) ++vst4q_p16 (poly16_t * __a, poly16x8x4_t __val) + { + __builtin_aarch64_simd_xi __o; +- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[1], 1); +- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[2], 2); +- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[3], 3); ++ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __val.val[1], 1); ++ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __val.val[2], 2); ++ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __val.val[3], 3); + __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst4q_s32 (int32_t * __a, int32x4x4_t val) ++vst4q_s32 (int32_t * __a, int32x4x4_t __val) + { + __builtin_aarch64_simd_xi __o; +- __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[1], 1); +- __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[2], 2); +- __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[3], 3); ++ __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __val.val[1], 1); ++ __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __val.val[2], 2); ++ __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __val.val[3], 3); + __builtin_aarch64_st4v4si ((__builtin_aarch64_simd_si *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst4q_s64 (int64_t * __a, int64x2x4_t val) ++vst4q_s64 (int64_t * __a, int64x2x4_t __val) + { + __builtin_aarch64_simd_xi __o; +- __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregxiv2di 
(__o, (int64x2_t) val.val[1], 1); +- __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[2], 2); +- __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[3], 3); ++ __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __val.val[1], 1); ++ __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __val.val[2], 2); ++ __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __val.val[3], 3); + __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst4q_u8 (uint8_t * __a, uint8x16x4_t val) ++vst4q_u8 (uint8_t * __a, uint8x16x4_t __val) + { + __builtin_aarch64_simd_xi __o; +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[1], 1); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[2], 2); +- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[3], 3); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __val.val[1], 1); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __val.val[2], 2); ++ __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __val.val[3], 3); + __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst4q_u16 (uint16_t * __a, uint16x8x4_t val) ++vst4q_u16 (uint16_t * __a, uint16x8x4_t __val) + { + __builtin_aarch64_simd_xi __o; +- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[1], 1); +- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[2], 2); +- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[3], 3); ++ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __val.val[1], 1); ++ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __val.val[2], 2); ++ __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __val.val[3], 3); + __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst4q_u32 (uint32_t * __a, uint32x4x4_t val) ++vst4q_u32 (uint32_t * __a, uint32x4x4_t __val) + { + __builtin_aarch64_simd_xi __o; +- __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[1], 1); +- __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[2], 2); +- __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[3], 3); ++ __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __val.val[1], 1); ++ __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __val.val[2], 2); ++ __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __val.val[3], 3); + __builtin_aarch64_st4v4si ((__builtin_aarch64_simd_si *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst4q_u64 (uint64_t * __a, uint64x2x4_t val) ++vst4q_u64 (uint64_t * 
__a, uint64x2x4_t __val) + { + __builtin_aarch64_simd_xi __o; +- __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[1], 1); +- __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[2], 2); +- __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[3], 3); ++ __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __val.val[1], 1); ++ __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __val.val[2], 2); ++ __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __val.val[3], 3); + __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst4q_f16 (float16_t * __a, float16x8x4_t val) ++vst4q_f16 (float16_t * __a, float16x8x4_t __val) + { + __builtin_aarch64_simd_xi __o; +- __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[1], 1); +- __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[2], 2); +- __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[3], 3); ++ __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) __val.val[1], 1); ++ __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) __val.val[2], 2); ++ __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) __val.val[3], 3); + __builtin_aarch64_st4v8hf ((__builtin_aarch64_simd_hf *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst4q_f32 (float32_t * __a, float32x4x4_t val) ++vst4q_f32 (float32_t * __a, float32x4x4_t __val) + { + __builtin_aarch64_simd_xi __o; +- __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[1], 1); +- __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[2], 2); +- __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[3], 3); ++ __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) __val.val[1], 1); ++ __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) __val.val[2], 2); ++ __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) __val.val[3], 3); + __builtin_aarch64_st4v4sf ((__builtin_aarch64_simd_sf *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst4q_f64 (float64_t * __a, float64x2x4_t val) ++vst4q_f64 (float64_t * __a, float64x2x4_t __val) + { + __builtin_aarch64_simd_xi __o; +- __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[0], 0); +- __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[1], 1); +- __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[2], 2); +- __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[3], 3); ++ __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) __val.val[1], 1); ++ __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) __val.val[2], 2); ++ __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) __val.val[3], 3); + 
__builtin_aarch64_st4v2df ((__builtin_aarch64_simd_df *) __a, __o); + } + + __extension__ extern __inline void + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vst4q_p64 (poly64_t * __a, poly64x2x4_t val) ++vst4q_p64 (poly64_t * __a, poly64x2x4_t __val) + { + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, +- (poly64x2_t) val.val[0], 0); ++ (poly64x2_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, +- (poly64x2_t) val.val[1], 1); ++ (poly64x2_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, +- (poly64x2_t) val.val[2], 2); ++ (poly64x2_t) __val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, +- (poly64x2_t) val.val[3], 3); ++ (poly64x2_t) __val.val[3], 3); + __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __o); + } + +@@ -29796,53 +30356,53 @@ __extension__ extern __inline int8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) + vtbx4_s8 (int8x8_t __r, int8x8x4_t __tab, int8x8_t __idx) + { +- int8x8_t result; +- int8x16x2_t temp; ++ int8x8_t __result; ++ int8x16x2_t __temp; + __builtin_aarch64_simd_oi __o; +- temp.val[0] = vcombine_s8 (__tab.val[0], __tab.val[1]); +- temp.val[1] = vcombine_s8 (__tab.val[2], __tab.val[3]); ++ __temp.val[0] = vcombine_s8 (__tab.val[0], __tab.val[1]); ++ __temp.val[1] = vcombine_s8 (__tab.val[2], __tab.val[3]); + __o = __builtin_aarch64_set_qregoiv16qi (__o, +- (int8x16_t) temp.val[0], 0); ++ (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, +- (int8x16_t) temp.val[1], 1); +- result = __builtin_aarch64_tbx4v8qi (__r, __o, __idx); +- return result; ++ (int8x16_t) __temp.val[1], 1); ++ __result = __builtin_aarch64_tbx4v8qi (__r, __o, __idx); ++ return __result; + } + + __extension__ extern __inline uint8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) + vtbx4_u8 (uint8x8_t __r, uint8x8x4_t __tab, uint8x8_t __idx) + { +- uint8x8_t result; +- uint8x16x2_t temp; ++ uint8x8_t __result; ++ uint8x16x2_t __temp; + __builtin_aarch64_simd_oi __o; +- temp.val[0] = vcombine_u8 (__tab.val[0], __tab.val[1]); +- temp.val[1] = vcombine_u8 (__tab.val[2], __tab.val[3]); ++ __temp.val[0] = vcombine_u8 (__tab.val[0], __tab.val[1]); ++ __temp.val[1] = vcombine_u8 (__tab.val[2], __tab.val[3]); + __o = __builtin_aarch64_set_qregoiv16qi (__o, +- (int8x16_t) temp.val[0], 0); ++ (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, +- (int8x16_t) temp.val[1], 1); +- result = (uint8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)__r, __o, ++ (int8x16_t) __temp.val[1], 1); ++ __result = (uint8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)__r, __o, + (int8x8_t)__idx); +- return result; ++ return __result; + } + + __extension__ extern __inline poly8x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) + vtbx4_p8 (poly8x8_t __r, poly8x8x4_t __tab, uint8x8_t __idx) + { +- poly8x8_t result; +- poly8x16x2_t temp; ++ poly8x8_t __result; ++ poly8x16x2_t __temp; + __builtin_aarch64_simd_oi __o; +- temp.val[0] = vcombine_p8 (__tab.val[0], __tab.val[1]); +- temp.val[1] = vcombine_p8 (__tab.val[2], __tab.val[3]); ++ __temp.val[0] = vcombine_p8 (__tab.val[0], __tab.val[1]); ++ __temp.val[1] = vcombine_p8 (__tab.val[2], __tab.val[3]); + __o = __builtin_aarch64_set_qregoiv16qi (__o, +- (int8x16_t) temp.val[0], 0); ++ (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, +- (int8x16_t) temp.val[1], 1); +- result = 
(poly8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)__r, __o, ++ (int8x16_t) __temp.val[1], 1); ++ __result = (poly8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)__r, __o, + (int8x8_t)__idx); +- return result; ++ return __result; + } + + /* vtrn */ +@@ -30374,65 +30934,65 @@ vtrn_f16 (float16x4_t __a, float16x4_t __b) + + __extension__ extern __inline float32x2x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vtrn_f32 (float32x2_t a, float32x2_t b) ++vtrn_f32 (float32x2_t __a, float32x2_t __b) + { +- return (float32x2x2_t) {vtrn1_f32 (a, b), vtrn2_f32 (a, b)}; ++ return (float32x2x2_t) {vtrn1_f32 (__a, __b), vtrn2_f32 (__a, __b)}; + } + + __extension__ extern __inline poly8x8x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vtrn_p8 (poly8x8_t a, poly8x8_t b) ++vtrn_p8 (poly8x8_t __a, poly8x8_t __b) + { +- return (poly8x8x2_t) {vtrn1_p8 (a, b), vtrn2_p8 (a, b)}; ++ return (poly8x8x2_t) {vtrn1_p8 (__a, __b), vtrn2_p8 (__a, __b)}; + } + + __extension__ extern __inline poly16x4x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vtrn_p16 (poly16x4_t a, poly16x4_t b) ++vtrn_p16 (poly16x4_t __a, poly16x4_t __b) + { +- return (poly16x4x2_t) {vtrn1_p16 (a, b), vtrn2_p16 (a, b)}; ++ return (poly16x4x2_t) {vtrn1_p16 (__a, __b), vtrn2_p16 (__a, __b)}; + } + + __extension__ extern __inline int8x8x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vtrn_s8 (int8x8_t a, int8x8_t b) ++vtrn_s8 (int8x8_t __a, int8x8_t __b) + { +- return (int8x8x2_t) {vtrn1_s8 (a, b), vtrn2_s8 (a, b)}; ++ return (int8x8x2_t) {vtrn1_s8 (__a, __b), vtrn2_s8 (__a, __b)}; + } + + __extension__ extern __inline int16x4x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vtrn_s16 (int16x4_t a, int16x4_t b) ++vtrn_s16 (int16x4_t __a, int16x4_t __b) + { +- return (int16x4x2_t) {vtrn1_s16 (a, b), vtrn2_s16 (a, b)}; ++ return (int16x4x2_t) {vtrn1_s16 (__a, __b), vtrn2_s16 (__a, __b)}; + } + + __extension__ extern __inline int32x2x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vtrn_s32 (int32x2_t a, int32x2_t b) ++vtrn_s32 (int32x2_t __a, int32x2_t __b) + { +- return (int32x2x2_t) {vtrn1_s32 (a, b), vtrn2_s32 (a, b)}; ++ return (int32x2x2_t) {vtrn1_s32 (__a, __b), vtrn2_s32 (__a, __b)}; + } + + __extension__ extern __inline uint8x8x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vtrn_u8 (uint8x8_t a, uint8x8_t b) ++vtrn_u8 (uint8x8_t __a, uint8x8_t __b) + { +- return (uint8x8x2_t) {vtrn1_u8 (a, b), vtrn2_u8 (a, b)}; ++ return (uint8x8x2_t) {vtrn1_u8 (__a, __b), vtrn2_u8 (__a, __b)}; + } + + __extension__ extern __inline uint16x4x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vtrn_u16 (uint16x4_t a, uint16x4_t b) ++vtrn_u16 (uint16x4_t __a, uint16x4_t __b) + { +- return (uint16x4x2_t) {vtrn1_u16 (a, b), vtrn2_u16 (a, b)}; ++ return (uint16x4x2_t) {vtrn1_u16 (__a, __b), vtrn2_u16 (__a, __b)}; + } + + __extension__ extern __inline uint32x2x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vtrn_u32 (uint32x2_t a, uint32x2_t b) ++vtrn_u32 (uint32x2_t __a, uint32x2_t __b) + { +- return (uint32x2x2_t) {vtrn1_u32 (a, b), vtrn2_u32 (a, b)}; ++ return (uint32x2x2_t) {vtrn1_u32 (__a, __b), vtrn2_u32 (__a, __b)}; + } + + __extension__ extern __inline float16x8x2_t +@@ -30444,65 +31004,65 @@ vtrnq_f16 (float16x8_t __a, float16x8_t __b) + + __extension__ extern __inline float32x4x2_t + __attribute__ ((__always_inline__, 
__gnu_inline__, __artificial__)) +-vtrnq_f32 (float32x4_t a, float32x4_t b) ++vtrnq_f32 (float32x4_t __a, float32x4_t __b) + { +- return (float32x4x2_t) {vtrn1q_f32 (a, b), vtrn2q_f32 (a, b)}; ++ return (float32x4x2_t) {vtrn1q_f32 (__a, __b), vtrn2q_f32 (__a, __b)}; + } + + __extension__ extern __inline poly8x16x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vtrnq_p8 (poly8x16_t a, poly8x16_t b) ++vtrnq_p8 (poly8x16_t __a, poly8x16_t __b) + { +- return (poly8x16x2_t) {vtrn1q_p8 (a, b), vtrn2q_p8 (a, b)}; ++ return (poly8x16x2_t) {vtrn1q_p8 (__a, __b), vtrn2q_p8 (__a, __b)}; + } + + __extension__ extern __inline poly16x8x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vtrnq_p16 (poly16x8_t a, poly16x8_t b) ++vtrnq_p16 (poly16x8_t __a, poly16x8_t __b) + { +- return (poly16x8x2_t) {vtrn1q_p16 (a, b), vtrn2q_p16 (a, b)}; ++ return (poly16x8x2_t) {vtrn1q_p16 (__a, __b), vtrn2q_p16 (__a, __b)}; + } + + __extension__ extern __inline int8x16x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vtrnq_s8 (int8x16_t a, int8x16_t b) ++vtrnq_s8 (int8x16_t __a, int8x16_t __b) + { +- return (int8x16x2_t) {vtrn1q_s8 (a, b), vtrn2q_s8 (a, b)}; ++ return (int8x16x2_t) {vtrn1q_s8 (__a, __b), vtrn2q_s8 (__a, __b)}; + } + + __extension__ extern __inline int16x8x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vtrnq_s16 (int16x8_t a, int16x8_t b) ++vtrnq_s16 (int16x8_t __a, int16x8_t __b) + { +- return (int16x8x2_t) {vtrn1q_s16 (a, b), vtrn2q_s16 (a, b)}; ++ return (int16x8x2_t) {vtrn1q_s16 (__a, __b), vtrn2q_s16 (__a, __b)}; + } + + __extension__ extern __inline int32x4x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vtrnq_s32 (int32x4_t a, int32x4_t b) ++vtrnq_s32 (int32x4_t __a, int32x4_t __b) + { +- return (int32x4x2_t) {vtrn1q_s32 (a, b), vtrn2q_s32 (a, b)}; ++ return (int32x4x2_t) {vtrn1q_s32 (__a, __b), vtrn2q_s32 (__a, __b)}; + } + + __extension__ extern __inline uint8x16x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vtrnq_u8 (uint8x16_t a, uint8x16_t b) ++vtrnq_u8 (uint8x16_t __a, uint8x16_t __b) + { +- return (uint8x16x2_t) {vtrn1q_u8 (a, b), vtrn2q_u8 (a, b)}; ++ return (uint8x16x2_t) {vtrn1q_u8 (__a, __b), vtrn2q_u8 (__a, __b)}; + } + + __extension__ extern __inline uint16x8x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vtrnq_u16 (uint16x8_t a, uint16x8_t b) ++vtrnq_u16 (uint16x8_t __a, uint16x8_t __b) + { +- return (uint16x8x2_t) {vtrn1q_u16 (a, b), vtrn2q_u16 (a, b)}; ++ return (uint16x8x2_t) {vtrn1q_u16 (__a, __b), vtrn2q_u16 (__a, __b)}; + } + + __extension__ extern __inline uint32x4x2_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vtrnq_u32 (uint32x4_t a, uint32x4_t b) ++vtrnq_u32 (uint32x4_t __a, uint32x4_t __b) + { +- return (uint32x4x2_t) {vtrn1q_u32 (a, b), vtrn2q_u32 (a, b)}; ++ return (uint32x4x2_t) {vtrn1q_u32 (__a, __b), vtrn2q_u32 (__a, __b)}; + } + + /* vtst */ +@@ -32200,30 +32760,30 @@ vrndxq_f16 (float16x8_t __a) + + __extension__ extern __inline float16x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vrsqrte_f16 (float16x4_t a) ++vrsqrte_f16 (float16x4_t __a) + { +- return __builtin_aarch64_rsqrtev4hf (a); ++ return __builtin_aarch64_rsqrtev4hf (__a); + } + + __extension__ extern __inline float16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vrsqrteq_f16 (float16x8_t a) ++vrsqrteq_f16 (float16x8_t __a) + { 
+- return __builtin_aarch64_rsqrtev8hf (a); ++ return __builtin_aarch64_rsqrtev8hf (__a); + } + + __extension__ extern __inline float16x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vsqrt_f16 (float16x4_t a) ++vsqrt_f16 (float16x4_t __a) + { +- return __builtin_aarch64_sqrtv4hf (a); ++ return __builtin_aarch64_sqrtv4hf (__a); + } + + __extension__ extern __inline float16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vsqrtq_f16 (float16x8_t a) ++vsqrtq_f16 (float16x8_t __a) + { +- return __builtin_aarch64_sqrtv8hf (a); ++ return __builtin_aarch64_sqrtv8hf (__a); + } + + /* ARMv8.2-A FP16 two operands vector intrinsics. */ +@@ -32244,16 +32804,16 @@ vaddq_f16 (float16x8_t __a, float16x8_t __b) + + __extension__ extern __inline float16x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vabd_f16 (float16x4_t a, float16x4_t b) ++vabd_f16 (float16x4_t __a, float16x4_t __b) + { +- return __builtin_aarch64_fabdv4hf (a, b); ++ return __builtin_aarch64_fabdv4hf (__a, __b); + } + + __extension__ extern __inline float16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vabdq_f16 (float16x8_t a, float16x8_t b) ++vabdq_f16 (float16x8_t __a, float16x8_t __b) + { +- return __builtin_aarch64_fabdv8hf (a, b); ++ return __builtin_aarch64_fabdv8hf (__a, __b); + } + + __extension__ extern __inline uint16x4_t +@@ -32538,72 +33098,72 @@ vmulxq_f16 (float16x8_t __a, float16x8_t __b) + + __extension__ extern __inline float16x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpadd_f16 (float16x4_t a, float16x4_t b) ++vpadd_f16 (float16x4_t __a, float16x4_t __b) + { +- return __builtin_aarch64_faddpv4hf (a, b); ++ return __builtin_aarch64_faddpv4hf (__a, __b); + } + + __extension__ extern __inline float16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpaddq_f16 (float16x8_t a, float16x8_t b) ++vpaddq_f16 (float16x8_t __a, float16x8_t __b) + { +- return __builtin_aarch64_faddpv8hf (a, b); ++ return __builtin_aarch64_faddpv8hf (__a, __b); + } + + __extension__ extern __inline float16x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpmax_f16 (float16x4_t a, float16x4_t b) ++vpmax_f16 (float16x4_t __a, float16x4_t __b) + { +- return __builtin_aarch64_smax_nanpv4hf (a, b); ++ return __builtin_aarch64_smax_nanpv4hf (__a, __b); + } + + __extension__ extern __inline float16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpmaxq_f16 (float16x8_t a, float16x8_t b) ++vpmaxq_f16 (float16x8_t __a, float16x8_t __b) + { +- return __builtin_aarch64_smax_nanpv8hf (a, b); ++ return __builtin_aarch64_smax_nanpv8hf (__a, __b); + } + + __extension__ extern __inline float16x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpmaxnm_f16 (float16x4_t a, float16x4_t b) ++vpmaxnm_f16 (float16x4_t __a, float16x4_t __b) + { +- return __builtin_aarch64_smaxpv4hf (a, b); ++ return __builtin_aarch64_smaxpv4hf (__a, __b); + } + + __extension__ extern __inline float16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpmaxnmq_f16 (float16x8_t a, float16x8_t b) ++vpmaxnmq_f16 (float16x8_t __a, float16x8_t __b) + { +- return __builtin_aarch64_smaxpv8hf (a, b); ++ return __builtin_aarch64_smaxpv8hf (__a, __b); + } + + __extension__ extern __inline float16x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpmin_f16 (float16x4_t a, float16x4_t b) ++vpmin_f16 
(float16x4_t __a, float16x4_t __b) + { +- return __builtin_aarch64_smin_nanpv4hf (a, b); ++ return __builtin_aarch64_smin_nanpv4hf (__a, __b); + } + + __extension__ extern __inline float16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpminq_f16 (float16x8_t a, float16x8_t b) ++vpminq_f16 (float16x8_t __a, float16x8_t __b) + { +- return __builtin_aarch64_smin_nanpv8hf (a, b); ++ return __builtin_aarch64_smin_nanpv8hf (__a, __b); + } + + __extension__ extern __inline float16x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpminnm_f16 (float16x4_t a, float16x4_t b) ++vpminnm_f16 (float16x4_t __a, float16x4_t __b) + { +- return __builtin_aarch64_sminpv4hf (a, b); ++ return __builtin_aarch64_sminpv4hf (__a, __b); + } + + __extension__ extern __inline float16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vpminnmq_f16 (float16x8_t a, float16x8_t b) ++vpminnmq_f16 (float16x8_t __a, float16x8_t __b) + { +- return __builtin_aarch64_sminpv8hf (a, b); ++ return __builtin_aarch64_sminpv8hf (__a, __b); + } + + __extension__ extern __inline float16x4_t +@@ -32622,16 +33182,16 @@ vrecpsq_f16 (float16x8_t __a, float16x8_t __b) + + __extension__ extern __inline float16x4_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vrsqrts_f16 (float16x4_t a, float16x4_t b) ++vrsqrts_f16 (float16x4_t __a, float16x4_t __b) + { +- return __builtin_aarch64_rsqrtsv4hf (a, b); ++ return __builtin_aarch64_rsqrtsv4hf (__a, __b); + } + + __extension__ extern __inline float16x8_t + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +-vrsqrtsq_f16 (float16x8_t a, float16x8_t b) ++vrsqrtsq_f16 (float16x8_t __a, float16x8_t __b) + { +- return __builtin_aarch64_rsqrtsv8hf (a, b); ++ return __builtin_aarch64_rsqrtsv8hf (__a, __b); + } + + __extension__ extern __inline float16x4_t +@@ -33961,6 +34521,1308 @@ vfmlslq_laneq_high_f16 (float32x4_t __r, float16x8_t __a, float16x8_t __b, + + #pragma GCC pop_options + ++#pragma GCC push_options ++#pragma GCC target ("arch=armv8.5-a") ++ ++__extension__ extern __inline float32x2_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vrnd32z_f32 (float32x2_t __a) ++{ ++ return __builtin_aarch64_frint32zv2sf (__a); ++} ++ ++__extension__ extern __inline float32x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vrnd32zq_f32 (float32x4_t __a) ++{ ++ return __builtin_aarch64_frint32zv4sf (__a); ++} ++ ++__extension__ extern __inline float64x1_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vrnd32z_f64 (float64x1_t __a) ++{ ++ return (float64x1_t) ++ {__builtin_aarch64_frint32zdf (vget_lane_f64 (__a, 0))}; ++} ++ ++__extension__ extern __inline float64x2_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vrnd32zq_f64 (float64x2_t __a) ++{ ++ return __builtin_aarch64_frint32zv2df (__a); ++} ++ ++__extension__ extern __inline float32x2_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vrnd32x_f32 (float32x2_t __a) ++{ ++ return __builtin_aarch64_frint32xv2sf (__a); ++} ++ ++__extension__ extern __inline float32x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vrnd32xq_f32 (float32x4_t __a) ++{ ++ return __builtin_aarch64_frint32xv4sf (__a); ++} ++ ++__extension__ extern __inline float64x1_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vrnd32x_f64 (float64x1_t __a) ++{ ++ return (float64x1_t) 
{__builtin_aarch64_frint32xdf (vget_lane_f64 (__a, 0))}; ++} ++ ++__extension__ extern __inline float64x2_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vrnd32xq_f64 (float64x2_t __a) ++{ ++ return __builtin_aarch64_frint32xv2df (__a); ++} ++ ++__extension__ extern __inline float32x2_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vrnd64z_f32 (float32x2_t __a) ++{ ++ return __builtin_aarch64_frint64zv2sf (__a); ++} ++ ++__extension__ extern __inline float32x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vrnd64zq_f32 (float32x4_t __a) ++{ ++ return __builtin_aarch64_frint64zv4sf (__a); ++} ++ ++__extension__ extern __inline float64x1_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vrnd64z_f64 (float64x1_t __a) ++{ ++ return (float64x1_t) {__builtin_aarch64_frint64zdf (vget_lane_f64 (__a, 0))}; ++} ++ ++__extension__ extern __inline float64x2_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vrnd64zq_f64 (float64x2_t __a) ++{ ++ return __builtin_aarch64_frint64zv2df (__a); ++} ++ ++__extension__ extern __inline float32x2_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vrnd64x_f32 (float32x2_t __a) ++{ ++ return __builtin_aarch64_frint64xv2sf (__a); ++} ++ ++__extension__ extern __inline float32x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vrnd64xq_f32 (float32x4_t __a) ++{ ++ return __builtin_aarch64_frint64xv4sf (__a); ++} ++ ++__extension__ extern __inline float64x1_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vrnd64x_f64 (float64x1_t __a) ++{ ++ return (float64x1_t) {__builtin_aarch64_frint64xdf (vget_lane_f64 (__a, 0))}; ++} ++ ++__extension__ extern __inline float64x2_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vrnd64xq_f64 (float64x2_t __a) ++{ ++ return __builtin_aarch64_frint64xv2df (__a); ++} ++ ++#pragma GCC pop_options ++ ++#include "arm_bf16.h" ++ ++#pragma GCC push_options ++#pragma GCC target ("arch=armv8.2-a+bf16") ++ ++__extension__ extern __inline bfloat16x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vset_lane_bf16 (bfloat16_t __elem, bfloat16x4_t __vec, const int __index) ++{ ++ return __aarch64_vset_lane_any (__elem, __vec, __index); ++} ++ ++__extension__ extern __inline bfloat16x8_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vsetq_lane_bf16 (bfloat16_t __elem, bfloat16x8_t __vec, const int __index) ++{ ++ return __aarch64_vset_lane_any (__elem, __vec, __index); ++} ++ ++__extension__ extern __inline bfloat16_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vget_lane_bf16 (bfloat16x4_t __a, const int __b) ++{ ++ return __aarch64_vget_lane_any (__a, __b); ++} ++ ++__extension__ extern __inline bfloat16_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vgetq_lane_bf16 (bfloat16x8_t __a, const int __b) ++{ ++ return __aarch64_vget_lane_any (__a, __b); ++} ++ ++__extension__ extern __inline bfloat16x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vcreate_bf16 (uint64_t __a) ++{ ++ return (bfloat16x4_t) __a; ++} ++ ++__extension__ extern __inline bfloat16x8_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vcombine_bf16 (bfloat16x4_t __a, bfloat16x4_t __b) ++{ ++ return (bfloat16x8_t)__builtin_aarch64_combinev4bf (__a, __b); ++} ++ ++/* vdup */ ++ ++__extension__ extern 
__inline bfloat16x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vdup_n_bf16 (bfloat16_t __a) ++{ ++ return (bfloat16x4_t) {__a, __a, __a, __a}; ++} ++ ++__extension__ extern __inline bfloat16x8_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vdupq_n_bf16 (bfloat16_t __a) ++{ ++ return (bfloat16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; ++} ++ ++__extension__ extern __inline bfloat16x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vdup_lane_bf16 (bfloat16x4_t __a, const int __b) ++{ ++ return vdup_n_bf16 (__aarch64_vget_lane_any (__a, __b)); ++} ++ ++__extension__ extern __inline bfloat16x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vdup_laneq_bf16 (bfloat16x8_t __a, const int __b) ++{ ++ return vdup_n_bf16 (__aarch64_vget_lane_any (__a, __b)); ++} ++ ++__extension__ extern __inline bfloat16x8_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vdupq_lane_bf16 (bfloat16x4_t __a, const int __b) ++{ ++ return vdupq_n_bf16 (__aarch64_vget_lane_any (__a, __b)); ++} ++ ++__extension__ extern __inline bfloat16x8_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vdupq_laneq_bf16 (bfloat16x8_t __a, const int __b) ++{ ++ return vdupq_n_bf16 (__aarch64_vget_lane_any (__a, __b)); ++} ++ ++__extension__ extern __inline bfloat16_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vduph_lane_bf16 (bfloat16x4_t __a, const int __b) ++{ ++ return __aarch64_vget_lane_any (__a, __b); ++} ++ ++__extension__ extern __inline bfloat16_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vduph_laneq_bf16 (bfloat16x8_t __a, const int __b) ++{ ++ return __aarch64_vget_lane_any (__a, __b); ++} ++ ++/* vld */ ++ ++__extension__ extern __inline bfloat16x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld1_bf16 (const bfloat16_t *__a) ++{ ++ return (bfloat16x4_t) __builtin_aarch64_ld1v4bf (__a); ++} ++ ++__extension__ extern __inline bfloat16x8_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld1q_bf16 (const bfloat16_t *__a) ++{ ++ return __builtin_aarch64_ld1v8bf (__a); ++} ++ ++__extension__ extern __inline bfloat16x4x2_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld1_bf16_x2 (const bfloat16_t *__a) ++{ ++ bfloat16x4x2_t ret; ++ __builtin_aarch64_simd_oi __o; ++ __o = __builtin_aarch64_ld1x2v4bf ((const __builtin_aarch64_simd_bf *) __a); ++ ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregoiv4bf (__o, 0); ++ ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregoiv4bf (__o, 1); ++ return ret; ++} ++ ++__extension__ extern __inline bfloat16x8x2_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld1q_bf16_x2 (const bfloat16_t *__a) ++{ ++ bfloat16x8x2_t ret; ++ __builtin_aarch64_simd_oi __o; ++ __o = __builtin_aarch64_ld1x2v8bf ((const __builtin_aarch64_simd_bf *) __a); ++ ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregoiv8bf (__o, 0); ++ ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregoiv8bf (__o, 1); ++ return ret; ++} ++ ++__extension__ extern __inline bfloat16x4x3_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld1_bf16_x3 (const bfloat16_t *__a) ++{ ++ bfloat16x4x3_t __i; ++ __builtin_aarch64_simd_ci __o; ++ __o = __builtin_aarch64_ld1x3v4bf ((const __builtin_aarch64_simd_bf *) __a); ++ __i.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 0); ++ 
__i.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 1); ++ __i.val[2] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 2); ++ return __i; ++} ++ ++__extension__ extern __inline bfloat16x8x3_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld1q_bf16_x3 (const bfloat16_t *__a) ++{ ++ bfloat16x8x3_t __i; ++ __builtin_aarch64_simd_ci __o; ++ __o = __builtin_aarch64_ld1x3v8bf ((const __builtin_aarch64_simd_bf *) __a); ++ __i.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 0); ++ __i.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 1); ++ __i.val[2] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 2); ++ return __i; ++} ++__extension__ extern __inline bfloat16x4x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld1_bf16_x4 (const bfloat16_t *__a) ++{ ++ union { bfloat16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __au; ++ __au.__o ++ = __builtin_aarch64_ld1x4v4bf ((const __builtin_aarch64_simd_bf *) __a); ++ return __au.__i; ++} ++ ++__extension__ extern __inline bfloat16x8x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld1q_bf16_x4 (const bfloat16_t *__a) ++{ ++ union { bfloat16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __au; ++ __au.__o ++ = __builtin_aarch64_ld1x4v8bf ((const __builtin_aarch64_simd_bf *) __a); ++ return __au.__i; ++} ++ ++__extension__ extern __inline bfloat16x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld1_lane_bf16 (const bfloat16_t *__src, bfloat16x4_t __vec, const int __lane) ++{ ++ return __aarch64_vset_lane_any (*__src, __vec, __lane); ++} ++ ++__extension__ extern __inline bfloat16x8_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld1q_lane_bf16 (const bfloat16_t *__src, bfloat16x8_t __vec, const int __lane) ++{ ++ return __aarch64_vset_lane_any (*__src, __vec, __lane); ++} ++ ++__extension__ extern __inline bfloat16x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld1_dup_bf16 (const bfloat16_t* __a) ++{ ++ return vdup_n_bf16 (*__a); ++} ++ ++__extension__ extern __inline bfloat16x8_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld1q_dup_bf16 (const bfloat16_t* __a) ++{ ++ return vdupq_n_bf16 (*__a); ++} ++ ++__extension__ extern __inline bfloat16x4x2_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld2_bf16 (const bfloat16_t * __a) ++{ ++ bfloat16x4x2_t ret; ++ __builtin_aarch64_simd_oi __o; ++ __o = __builtin_aarch64_ld2v4bf (__a); ++ ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregoiv4bf (__o, 0); ++ ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregoiv4bf (__o, 1); ++ return ret; ++} ++ ++__extension__ extern __inline bfloat16x8x2_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld2q_bf16 (const bfloat16_t * __a) ++{ ++ bfloat16x8x2_t ret; ++ __builtin_aarch64_simd_oi __o; ++ __o = __builtin_aarch64_ld2v8bf ((const __builtin_aarch64_simd_bf *) __a); ++ ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregoiv8bf (__o, 0); ++ ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregoiv8bf (__o, 1); ++ return ret; ++} ++ ++__extension__ extern __inline bfloat16x4x2_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld2_dup_bf16 (const bfloat16_t * __a) ++{ ++ bfloat16x4x2_t ret; ++ __builtin_aarch64_simd_oi __o; ++ __o = __builtin_aarch64_ld2rv4bf ((const __builtin_aarch64_simd_bf *) __a); ++ ret.val[0] = (bfloat16x4_t) 
__builtin_aarch64_get_dregoiv4bf (__o, 0); ++ ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregoiv4bf (__o, 1); ++ return ret; ++} ++ ++__extension__ extern __inline bfloat16x8x2_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld2q_dup_bf16 (const bfloat16_t * __a) ++{ ++ bfloat16x8x2_t ret; ++ __builtin_aarch64_simd_oi __o; ++ __o = __builtin_aarch64_ld2rv8bf ((const __builtin_aarch64_simd_bf *) __a); ++ ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregoiv8bf (__o, 0); ++ ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregoiv8bf (__o, 1); ++ return ret; ++} ++ ++__extension__ extern __inline bfloat16x4x3_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld3_bf16 (const bfloat16_t * __a) ++{ ++ bfloat16x4x3_t ret; ++ __builtin_aarch64_simd_ci __o; ++ __o = __builtin_aarch64_ld3v4bf ((const __builtin_aarch64_simd_bf *) __a); ++ ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 0); ++ ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 1); ++ ret.val[2] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 2); ++ return ret; ++} ++ ++__extension__ extern __inline bfloat16x8x3_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld3q_bf16 (const bfloat16_t * __a) ++{ ++ bfloat16x8x3_t ret; ++ __builtin_aarch64_simd_ci __o; ++ __o = __builtin_aarch64_ld3v8bf ((const __builtin_aarch64_simd_bf *) __a); ++ ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 0); ++ ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 1); ++ ret.val[2] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 2); ++ return ret; ++} ++ ++__extension__ extern __inline bfloat16x4x3_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld3_dup_bf16 (const bfloat16_t * __a) ++{ ++ bfloat16x4x3_t ret; ++ __builtin_aarch64_simd_ci __o; ++ __o = __builtin_aarch64_ld3rv4bf ((const __builtin_aarch64_simd_bf *) __a); ++ ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 0); ++ ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 1); ++ ret.val[2] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 2); ++ return ret; ++} ++ ++__extension__ extern __inline bfloat16x8x3_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld3q_dup_bf16 (const bfloat16_t * __a) ++{ ++ bfloat16x8x3_t ret; ++ __builtin_aarch64_simd_ci __o; ++ __o = __builtin_aarch64_ld3rv8bf ((const __builtin_aarch64_simd_bf *) __a); ++ ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 0); ++ ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 1); ++ ret.val[2] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 2); ++ return ret; ++} ++ ++__extension__ extern __inline bfloat16x4x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld4_bf16 (const bfloat16_t * __a) ++{ ++ bfloat16x4x4_t ret; ++ __builtin_aarch64_simd_xi __o; ++ __o = __builtin_aarch64_ld4v4bf ((const __builtin_aarch64_simd_bf *) __a); ++ ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 0); ++ ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 1); ++ ret.val[2] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 2); ++ ret.val[3] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 3); ++ return ret; ++} ++ ++__extension__ extern __inline bfloat16x8x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld4q_bf16 (const bfloat16_t * __a) ++{ ++ 
bfloat16x8x4_t ret; ++ __builtin_aarch64_simd_xi __o; ++ __o = __builtin_aarch64_ld4v8bf ((const __builtin_aarch64_simd_bf *) __a); ++ ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 0); ++ ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 1); ++ ret.val[2] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 2); ++ ret.val[3] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 3); ++ return ret; ++} ++ ++__extension__ extern __inline bfloat16x4x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld4_dup_bf16 (const bfloat16_t * __a) ++{ ++ bfloat16x4x4_t ret; ++ __builtin_aarch64_simd_xi __o; ++ __o = __builtin_aarch64_ld4rv4bf ((const __builtin_aarch64_simd_bf *) __a); ++ ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 0); ++ ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 1); ++ ret.val[2] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 2); ++ ret.val[3] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 3); ++ return ret; ++} ++ ++__extension__ extern __inline bfloat16x8x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vld4q_dup_bf16 (const bfloat16_t * __a) ++{ ++ bfloat16x8x4_t ret; ++ __builtin_aarch64_simd_xi __o; ++ __o = __builtin_aarch64_ld4rv8bf ((const __builtin_aarch64_simd_bf *) __a); ++ ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 0); ++ ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 1); ++ ret.val[2] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 2); ++ ret.val[3] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 3); ++ return ret; ++} ++ ++/* vst */ ++ ++__extension__ extern __inline void ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vst1_bf16 (bfloat16_t *__a, bfloat16x4_t __b) ++{ ++ __builtin_aarch64_st1v4bf (__a, __b); ++} ++ ++__extension__ extern __inline void ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vst1_bf16_x2 (bfloat16_t * __a, bfloat16x4x2_t __val) ++{ ++ __builtin_aarch64_simd_oi __o; ++ bfloat16x8x2_t __temp; ++ __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregoiv8bf (__o, __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv8bf (__o, __temp.val[1], 1); ++ __builtin_aarch64_st1x2v4bf (__a, __o); ++} ++ ++__extension__ extern __inline void ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vst1q_bf16_x2 (bfloat16_t * __a, bfloat16x8x2_t __val) ++{ ++ __builtin_aarch64_simd_oi __o; ++ __o = __builtin_aarch64_set_qregoiv8bf (__o, __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv8bf (__o, __val.val[1], 1); ++ __builtin_aarch64_st1x2v8bf (__a, __o); ++} ++ ++__extension__ extern __inline void ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vst1_bf16_x3 (bfloat16_t * __a, bfloat16x4x3_t __val) ++{ ++ __builtin_aarch64_simd_ci __o; ++ bfloat16x8x3_t __temp; ++ __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0))); ++ __temp.val[2] = vcombine_bf16 (__val.val[2], vcreate_bf16 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[1], 1); ++ __o = 
__builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[2], 2); ++ __builtin_aarch64_st1x3v4bf ((__builtin_aarch64_simd_bf *) __a, __o); ++} ++ ++__extension__ extern __inline void ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vst1q_bf16_x3 (bfloat16_t * __a, bfloat16x8x3_t __val) ++{ ++ __builtin_aarch64_simd_ci __o; ++ __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[2], 2); ++ __builtin_aarch64_st1x3v8bf ((__builtin_aarch64_simd_bf *) __a, __o); ++} ++ ++__extension__ extern __inline void ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vst1_bf16_x4 (bfloat16_t * __a, bfloat16x4x4_t val) ++{ ++ union { bfloat16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; ++ __builtin_aarch64_st1x4v4bf ((__builtin_aarch64_simd_bf *) __a, __u.__o); ++} ++ ++__extension__ extern __inline void ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vst1q_bf16_x4 (bfloat16_t * __a, bfloat16x8x4_t val) ++{ ++ union { bfloat16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; ++ __builtin_aarch64_st1x4v8bf ((__builtin_aarch64_simd_bf *) __a, __u.__o); ++} ++ ++__extension__ extern __inline void ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vst1q_bf16 (bfloat16_t *__a, bfloat16x8_t __b) ++{ ++ __builtin_aarch64_st1v8bf (__a, __b); ++} ++ ++__extension__ extern __inline void ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vst1_lane_bf16 (bfloat16_t *__a, bfloat16x4_t __b, const int __lane) ++{ ++ *__a = __aarch64_vget_lane_any (__b, __lane); ++} ++ ++__extension__ extern __inline void ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vst1q_lane_bf16 (bfloat16_t *__a, bfloat16x8_t __b, const int __lane) ++{ ++ *__a = __aarch64_vget_lane_any (__b, __lane); ++} ++ ++__extension__ extern __inline void ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vst2_bf16 (bfloat16_t * __a, bfloat16x4x2_t __val) ++{ ++ __builtin_aarch64_simd_oi __o; ++ bfloat16x8x2_t __temp; ++ __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregoiv8bf (__o, __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv8bf (__o, __temp.val[1], 1); ++ __builtin_aarch64_st2v4bf (__a, __o); ++} ++ ++__extension__ extern __inline void ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vst2q_bf16 (bfloat16_t * __a, bfloat16x8x2_t __val) ++{ ++ __builtin_aarch64_simd_oi __o; ++ __o = __builtin_aarch64_set_qregoiv8bf (__o, __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregoiv8bf (__o, __val.val[1], 1); ++ __builtin_aarch64_st2v8bf (__a, __o); ++} ++ ++__extension__ extern __inline void ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vst3_bf16 (bfloat16_t * __a, bfloat16x4x3_t __val) ++{ ++ __builtin_aarch64_simd_ci __o; ++ bfloat16x8x3_t __temp; ++ __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0))); ++ __temp.val[2] = vcombine_bf16 (__val.val[2], vcreate_bf16 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[0], 0); ++ 
__o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[2], 2); ++ __builtin_aarch64_st3v4bf ((__builtin_aarch64_simd_bf *) __a, __o); ++} ++ ++__extension__ extern __inline void ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vst3q_bf16 (bfloat16_t * __a, bfloat16x8x3_t __val) ++{ ++ __builtin_aarch64_simd_ci __o; ++ __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[1], 1); ++ __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[2], 2); ++ __builtin_aarch64_st3v8bf ((__builtin_aarch64_simd_bf *) __a, __o); ++} ++ ++__extension__ extern __inline void ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vst4_bf16 (bfloat16_t * __a, bfloat16x4x4_t __val) ++{ ++ __builtin_aarch64_simd_xi __o; ++ bfloat16x8x4_t __temp; ++ __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0))); ++ __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0))); ++ __temp.val[2] = vcombine_bf16 (__val.val[2], vcreate_bf16 (__AARCH64_UINT64_C (0))); ++ __temp.val[3] = vcombine_bf16 (__val.val[3], vcreate_bf16 (__AARCH64_UINT64_C (0))); ++ __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[0], 0); ++ __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[1], 1); ++ __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[2], 2); ++ __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[3], 3); ++ __builtin_aarch64_st4v4bf ((__builtin_aarch64_simd_bf *) __a, __o); ++} ++ ++__extension__ extern __inline void ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vst4q_bf16 (bfloat16_t * __a, bfloat16x8x4_t __val) ++{ ++ __builtin_aarch64_simd_xi __o; ++ __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __val.val[0], 0); ++ __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __val.val[1], 1); ++ __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __val.val[2], 2); ++ __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __val.val[3], 3); ++ __builtin_aarch64_st4v8bf ((__builtin_aarch64_simd_bf *) __a, __o); ++} ++ ++/* vreinterpret */ ++ ++__extension__ extern __inline bfloat16x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpret_bf16_u8 (uint8x8_t __a) ++{ ++ return (bfloat16x4_t)__a; ++} ++ ++__extension__ extern __inline bfloat16x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpret_bf16_u16 (uint16x4_t __a) ++{ ++ return (bfloat16x4_t)__a; ++} ++ ++__extension__ extern __inline bfloat16x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpret_bf16_u32 (uint32x2_t __a) ++{ ++ return (bfloat16x4_t)__a; ++} ++ ++__extension__ extern __inline bfloat16x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpret_bf16_u64 (uint64x1_t __a) ++{ ++ return (bfloat16x4_t)__a; ++} ++ ++__extension__ extern __inline bfloat16x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpret_bf16_s8 (int8x8_t __a) ++{ ++ return (bfloat16x4_t)__a; ++} ++ ++__extension__ extern __inline bfloat16x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpret_bf16_s16 (int16x4_t __a) ++{ ++ return (bfloat16x4_t)__a; ++} ++ ++__extension__ 
extern __inline bfloat16x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpret_bf16_s32 (int32x2_t __a) ++{ ++ return (bfloat16x4_t)__a; ++} ++ ++__extension__ extern __inline bfloat16x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpret_bf16_s64 (int64x1_t __a) ++{ ++ return (bfloat16x4_t)__a; ++} ++ ++__extension__ extern __inline bfloat16x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpret_bf16_p8 (poly8x8_t __a) ++{ ++ return (bfloat16x4_t)__a; ++} ++ ++__extension__ extern __inline bfloat16x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpret_bf16_p16 (poly16x4_t __a) ++{ ++ return (bfloat16x4_t)__a; ++} ++ ++__extension__ extern __inline bfloat16x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpret_bf16_p64 (poly64x1_t __a) ++{ ++ return (bfloat16x4_t)__a; ++} ++ ++__extension__ extern __inline bfloat16x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpret_bf16_f16 (float16x4_t __a) ++{ ++ return (bfloat16x4_t)__a; ++} ++ ++__extension__ extern __inline bfloat16x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpret_bf16_f32 (float32x2_t __a) ++{ ++ return (bfloat16x4_t)__a; ++} ++ ++__extension__ extern __inline bfloat16x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpret_bf16_f64 (float64x1_t __a) ++{ ++ return (bfloat16x4_t)__a; ++} ++ ++__extension__ extern __inline bfloat16x8_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpretq_bf16_u8 (uint8x16_t __a) ++{ ++ return (bfloat16x8_t)__a; ++} ++ ++__extension__ extern __inline bfloat16x8_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpretq_bf16_u16 (uint16x8_t __a) ++{ ++ return (bfloat16x8_t)__a; ++} ++ ++__extension__ extern __inline bfloat16x8_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpretq_bf16_u32 (uint32x4_t __a) ++{ ++ return (bfloat16x8_t)__a; ++} ++ ++__extension__ extern __inline bfloat16x8_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpretq_bf16_u64 (uint64x2_t __a) ++{ ++ return (bfloat16x8_t)__a; ++} ++ ++__extension__ extern __inline bfloat16x8_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpretq_bf16_s8 (int8x16_t __a) ++{ ++ return (bfloat16x8_t)__a; ++} ++ ++__extension__ extern __inline bfloat16x8_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpretq_bf16_s16 (int16x8_t __a) ++{ ++ return (bfloat16x8_t)__a; ++} ++ ++__extension__ extern __inline bfloat16x8_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpretq_bf16_s32 (int32x4_t __a) ++{ ++ return (bfloat16x8_t)__a; ++} ++ ++__extension__ extern __inline bfloat16x8_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpretq_bf16_s64 (int64x2_t __a) ++{ ++ return (bfloat16x8_t)__a; ++} ++ ++__extension__ extern __inline bfloat16x8_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpretq_bf16_p8 (poly8x16_t __a) ++{ ++ return (bfloat16x8_t)__a; ++} ++ ++__extension__ extern __inline bfloat16x8_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpretq_bf16_p16 (poly16x8_t __a) ++{ ++ return (bfloat16x8_t)__a; ++} ++ ++__extension__ extern __inline 
bfloat16x8_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpretq_bf16_p64 (poly64x2_t __a) ++{ ++ return (bfloat16x8_t)__a; ++} ++ ++__extension__ extern __inline bfloat16x8_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpretq_bf16_p128 (poly128_t __a) ++{ ++ return (bfloat16x8_t)__a; ++} ++ ++__extension__ extern __inline bfloat16x8_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpretq_bf16_f16 (float16x8_t __a) ++{ ++ return (bfloat16x8_t)__a; ++} ++ ++__extension__ extern __inline bfloat16x8_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpretq_bf16_f32 (float32x4_t __a) ++{ ++ return (bfloat16x8_t)__a; ++} ++ ++__extension__ extern __inline bfloat16x8_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpretq_bf16_f64 (float64x2_t __a) ++{ ++ return (bfloat16x8_t)__a; ++} ++ ++__extension__ extern __inline int8x8_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpret_s8_bf16 (bfloat16x4_t __a) ++{ ++ return (int8x8_t)__a; ++} ++ ++__extension__ extern __inline int16x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpret_s16_bf16 (bfloat16x4_t __a) ++{ ++ return (int16x4_t)__a; ++} ++ ++__extension__ extern __inline int32x2_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpret_s32_bf16 (bfloat16x4_t __a) ++{ ++ return (int32x2_t)__a; ++} ++ ++__extension__ extern __inline int64x1_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpret_s64_bf16 (bfloat16x4_t __a) ++{ ++ return (int64x1_t)__a; ++} ++ ++__extension__ extern __inline uint8x8_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpret_u8_bf16 (bfloat16x4_t __a) ++{ ++ return (uint8x8_t)__a; ++} ++ ++__extension__ extern __inline uint16x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpret_u16_bf16 (bfloat16x4_t __a) ++{ ++ return (uint16x4_t)__a; ++} ++ ++__extension__ extern __inline uint32x2_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpret_u32_bf16 (bfloat16x4_t __a) ++{ ++ return (uint32x2_t)__a; ++} ++ ++__extension__ extern __inline uint64x1_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpret_u64_bf16 (bfloat16x4_t __a) ++{ ++ return (uint64x1_t)__a; ++} ++ ++__extension__ extern __inline float16x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpret_f16_bf16 (bfloat16x4_t __a) ++{ ++ return (float16x4_t)__a; ++} ++ ++__extension__ extern __inline float32x2_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpret_f32_bf16 (bfloat16x4_t __a) ++{ ++ return (float32x2_t)__a; ++} ++ ++__extension__ extern __inline float64x1_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpret_f64_bf16 (bfloat16x4_t __a) ++{ ++ return (float64x1_t)__a; ++} ++ ++__extension__ extern __inline poly8x8_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpret_p8_bf16 (bfloat16x4_t __a) ++{ ++ return (poly8x8_t)__a; ++} ++ ++__extension__ extern __inline poly16x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpret_p16_bf16 (bfloat16x4_t __a) ++{ ++ return (poly16x4_t)__a; ++} ++ ++__extension__ extern __inline poly64x1_t ++__attribute__ ((__always_inline__, 
__gnu_inline__, __artificial__)) ++vreinterpret_p64_bf16 (bfloat16x4_t __a) ++{ ++ return (poly64x1_t)__a; ++} ++ ++__extension__ extern __inline int8x16_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpretq_s8_bf16 (bfloat16x8_t __a) ++{ ++ return (int8x16_t)__a; ++} ++ ++__extension__ extern __inline int16x8_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpretq_s16_bf16 (bfloat16x8_t __a) ++{ ++ return (int16x8_t)__a; ++} ++ ++__extension__ extern __inline int32x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpretq_s32_bf16 (bfloat16x8_t __a) ++{ ++ return (int32x4_t)__a; ++} ++ ++__extension__ extern __inline int64x2_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpretq_s64_bf16 (bfloat16x8_t __a) ++{ ++ return (int64x2_t)__a; ++} ++ ++__extension__ extern __inline uint8x16_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpretq_u8_bf16 (bfloat16x8_t __a) ++{ ++ return (uint8x16_t)__a; ++} ++ ++__extension__ extern __inline uint16x8_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpretq_u16_bf16 (bfloat16x8_t __a) ++{ ++ return (uint16x8_t)__a; ++} ++ ++__extension__ extern __inline uint32x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpretq_u32_bf16 (bfloat16x8_t __a) ++{ ++ return (uint32x4_t)__a; ++} ++ ++__extension__ extern __inline uint64x2_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpretq_u64_bf16 (bfloat16x8_t __a) ++{ ++ return (uint64x2_t)__a; ++} ++ ++__extension__ extern __inline float16x8_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpretq_f16_bf16 (bfloat16x8_t __a) ++{ ++ return (float16x8_t)__a; ++} ++ ++__extension__ extern __inline float32x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpretq_f32_bf16 (bfloat16x8_t __a) ++{ ++ return (float32x4_t)__a; ++} ++ ++__extension__ extern __inline float64x2_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpretq_f64_bf16 (bfloat16x8_t __a) ++{ ++ return (float64x2_t)__a; ++} ++ ++__extension__ extern __inline poly8x16_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpretq_p8_bf16 (bfloat16x8_t __a) ++{ ++ return (poly8x16_t)__a; ++} ++ ++__extension__ extern __inline poly16x8_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpretq_p16_bf16 (bfloat16x8_t __a) ++{ ++ return (poly16x8_t)__a; ++} ++ ++__extension__ extern __inline poly64x2_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpretq_p64_bf16 (bfloat16x8_t __a) ++{ ++ return (poly64x2_t)__a; ++} ++ ++__extension__ extern __inline poly128_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vreinterpretq_p128_bf16 (bfloat16x8_t __a) ++{ ++ return (poly128_t)__a; ++} ++ ++__extension__ extern __inline float32x2_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vbfdot_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b) ++{ ++ return __builtin_aarch64_bfdotv2sf (__r, __a, __b); ++} ++ ++__extension__ extern __inline float32x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vbfdotq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b) ++{ ++ return __builtin_aarch64_bfdotv4sf (__r, __a, __b); ++} ++ ++__extension__ extern 
__inline float32x2_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vbfdot_lane_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b, ++ const int __index) ++{ ++ return __builtin_aarch64_bfdot_lanev2sf (__r, __a, __b, __index); ++} ++ ++__extension__ extern __inline float32x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vbfdotq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b, ++ const int __index) ++{ ++ return __builtin_aarch64_bfdot_lanev4sf (__r, __a, __b, __index); ++} ++ ++__extension__ extern __inline float32x2_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vbfdot_laneq_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x8_t __b, ++ const int __index) ++{ ++ return __builtin_aarch64_bfdot_laneqv2sf (__r, __a, __b, __index); ++} ++ ++__extension__ extern __inline float32x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vbfdotq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b, ++ const int __index) ++{ ++ return __builtin_aarch64_bfdot_laneqv4sf (__r, __a, __b, __index); ++} ++ ++__extension__ extern __inline float32x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vbfmmlaq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b) ++ ++{ ++ return __builtin_aarch64_bfmmlaqv4sf (__r, __a, __b); ++} ++ ++__extension__ extern __inline float32x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vbfmlalbq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b) ++{ ++ return __builtin_aarch64_bfmlalbv4sf (__r, __a, __b); ++} ++ ++__extension__ extern __inline float32x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vbfmlaltq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b) ++{ ++ return __builtin_aarch64_bfmlaltv4sf (__r, __a, __b); ++} ++ ++__extension__ extern __inline float32x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vbfmlalbq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b, ++ const int __index) ++{ ++ return __builtin_aarch64_bfmlalb_lanev4sf (__r, __a, __b, __index); ++} ++ ++__extension__ extern __inline float32x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vbfmlaltq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b, ++ const int __index) ++{ ++ return __builtin_aarch64_bfmlalt_lanev4sf (__r, __a, __b, __index); ++} ++ ++__extension__ extern __inline float32x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vbfmlalbq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b, ++ const int __index) ++{ ++ return __builtin_aarch64_bfmlalb_lane_qv4sf (__r, __a, __b, __index); ++} ++ ++__extension__ extern __inline float32x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vbfmlaltq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b, ++ const int __index) ++{ ++ return __builtin_aarch64_bfmlalt_lane_qv4sf (__r, __a, __b, __index); ++} ++ ++__extension__ extern __inline bfloat16x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vcvt_bf16_f32 (float32x4_t __a) ++{ ++ return __builtin_aarch64_bfcvtnv4bf (__a); ++} ++ ++__extension__ extern __inline bfloat16x8_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vcvtq_low_bf16_f32 (float32x4_t __a) ++{ ++ return __builtin_aarch64_bfcvtn_qv8bf (__a); ++} ++ ++__extension__ extern __inline bfloat16x8_t ++__attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) ++vcvtq_high_bf16_f32 (bfloat16x8_t __inactive, float32x4_t __a) ++{ ++ return __builtin_aarch64_bfcvtn2v8bf (__inactive, __a); ++} ++ ++#pragma GCC pop_options ++ ++/* AdvSIMD 8-bit Integer Matrix Multiply (I8MM) intrinsics. */ ++ ++#pragma GCC push_options ++#pragma GCC target ("arch=armv8.2-a+i8mm") ++ ++__extension__ extern __inline int32x2_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vusdot_s32 (int32x2_t __r, uint8x8_t __a, int8x8_t __b) ++{ ++ return __builtin_aarch64_usdotv8qi_ssus (__r, __a, __b); ++} ++ ++__extension__ extern __inline int32x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vusdotq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b) ++{ ++ return __builtin_aarch64_usdotv16qi_ssus (__r, __a, __b); ++} ++ ++__extension__ extern __inline int32x2_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vusdot_lane_s32 (int32x2_t __r, uint8x8_t __a, int8x8_t __b, const int __index) ++{ ++ return __builtin_aarch64_usdot_lanev8qi_ssuss (__r, __a, __b, __index); ++} ++ ++__extension__ extern __inline int32x2_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vusdot_laneq_s32 (int32x2_t __r, uint8x8_t __a, int8x16_t __b, ++ const int __index) ++{ ++ return __builtin_aarch64_usdot_laneqv8qi_ssuss (__r, __a, __b, __index); ++} ++ ++__extension__ extern __inline int32x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vusdotq_lane_s32 (int32x4_t __r, uint8x16_t __a, int8x8_t __b, ++ const int __index) ++{ ++ return __builtin_aarch64_usdot_lanev16qi_ssuss (__r, __a, __b, __index); ++} ++ ++__extension__ extern __inline int32x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vusdotq_laneq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b, ++ const int __index) ++{ ++ return __builtin_aarch64_usdot_laneqv16qi_ssuss (__r, __a, __b, __index); ++} ++ ++__extension__ extern __inline int32x2_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vsudot_lane_s32 (int32x2_t __r, int8x8_t __a, uint8x8_t __b, const int __index) ++{ ++ return __builtin_aarch64_sudot_lanev8qi_sssus (__r, __a, __b, __index); ++} ++ ++__extension__ extern __inline int32x2_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vsudot_laneq_s32 (int32x2_t __r, int8x8_t __a, uint8x16_t __b, ++ const int __index) ++{ ++ return __builtin_aarch64_sudot_laneqv8qi_sssus (__r, __a, __b, __index); ++} ++ ++__extension__ extern __inline int32x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vsudotq_lane_s32 (int32x4_t __r, int8x16_t __a, uint8x8_t __b, ++ const int __index) ++{ ++ return __builtin_aarch64_sudot_lanev16qi_sssus (__r, __a, __b, __index); ++} ++ ++__extension__ extern __inline int32x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vsudotq_laneq_s32 (int32x4_t __r, int8x16_t __a, uint8x16_t __b, ++ const int __index) ++{ ++ return __builtin_aarch64_sudot_laneqv16qi_sssus (__r, __a, __b, __index); ++} ++ ++/* Matrix Multiply-Accumulate. 
*/ ++ ++__extension__ extern __inline int32x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vmmlaq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b) ++{ ++ return __builtin_aarch64_simd_smmlav16qi (__r, __a, __b); ++} ++ ++__extension__ extern __inline uint32x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vmmlaq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b) ++{ ++ return __builtin_aarch64_simd_ummlav16qi_uuuu (__r, __a, __b); ++} ++ ++__extension__ extern __inline int32x4_t ++__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) ++vusmmlaq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b) ++{ ++ return __builtin_aarch64_simd_usmmlav16qi_ssus (__r, __a, __b); ++} ++ ++#pragma GCC pop_options ++ + #undef __aarch64_vget_lane_any + + #undef __aarch64_vdup_lane_any +diff --git a/gcc/config/aarch64/arm_sve.h b/gcc/config/aarch64/arm_sve.h +new file mode 100644 +index 000000000..0a316c0a0 +--- /dev/null ++++ b/gcc/config/aarch64/arm_sve.h +@@ -0,0 +1,37 @@ ++/* AArch64 SVE intrinsics include file. ++ Copyright (C) 2018-2019 Free Software Foundation, Inc. ++ ++ This file is part of GCC. ++ ++ GCC is free software; you can redistribute it and/or modify it ++ under the terms of the GNU General Public License as published ++ by the Free Software Foundation; either version 3, or (at your ++ option) any later version. ++ ++ GCC is distributed in the hope that it will be useful, but WITHOUT ++ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public ++ License for more details. ++ ++ Under Section 7 of GPL version 3, you are granted additional ++ permissions described in the GCC Runtime Library Exception, version ++ 3.1, as published by the Free Software Foundation. ++ ++ You should have received a copy of the GNU General Public License and ++ a copy of the GCC Runtime Library Exception along with this program; ++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see ++ . 
*/ ++ ++#ifndef _ARM_SVE_H_ ++#define _ARM_SVE_H_ ++ ++#include ++#include ++ ++typedef __fp16 float16_t; ++typedef float float32_t; ++typedef double float64_t; ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++#endif +diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md +index 0f357662a..002e91d2b 100644 +--- a/gcc/config/aarch64/atomics.md ++++ b/gcc/config/aarch64/atomics.md +@@ -22,10 +22,10 @@ + + (define_expand "@atomic_compare_and_swap" + [(match_operand:SI 0 "register_operand" "") ;; bool out +- (match_operand:ALLI 1 "register_operand" "") ;; val out +- (match_operand:ALLI 2 "aarch64_sync_memory_operand" "") ;; memory +- (match_operand:ALLI 3 "nonmemory_operand" "") ;; expected +- (match_operand:ALLI 4 "aarch64_reg_or_zero" "") ;; desired ++ (match_operand:ALLI_TI 1 "register_operand" "") ;; val out ++ (match_operand:ALLI_TI 2 "aarch64_sync_memory_operand" "") ;; memory ++ (match_operand:ALLI_TI 3 "nonmemory_operand" "") ;; expected ++ (match_operand:ALLI_TI 4 "aarch64_reg_or_zero" "") ;; desired + (match_operand:SI 5 "const_int_operand") ;; is_weak + (match_operand:SI 6 "const_int_operand") ;; mod_s + (match_operand:SI 7 "const_int_operand")] ;; mod_f +@@ -88,6 +88,30 @@ + } + ) + ++(define_insn_and_split "@aarch64_compare_and_swap" ++ [(set (reg:CC CC_REGNUM) ;; bool out ++ (unspec_volatile:CC [(const_int 0)] UNSPECV_ATOMIC_CMPSW)) ++ (set (match_operand:JUST_TI 0 "register_operand" "=&r") ;; val out ++ (match_operand:JUST_TI 1 "aarch64_sync_memory_operand" "+Q")) ;; memory ++ (set (match_dup 1) ++ (unspec_volatile:JUST_TI ++ [(match_operand:JUST_TI 2 "aarch64_reg_or_zero" "rZ") ;; expect ++ (match_operand:JUST_TI 3 "aarch64_reg_or_zero" "rZ") ;; desired ++ (match_operand:SI 4 "const_int_operand") ;; is_weak ++ (match_operand:SI 5 "const_int_operand") ;; mod_s ++ (match_operand:SI 6 "const_int_operand")] ;; mod_f ++ UNSPECV_ATOMIC_CMPSW)) ++ (clobber (match_scratch:SI 7 "=&r"))] ++ "" ++ "#" ++ "&& epilogue_completed" ++ [(const_int 0)] ++ { ++ aarch64_split_compare_and_swap (operands); ++ DONE; ++ } ++) ++ + (define_insn "@aarch64_compare_and_swap_lse" + [(set (match_operand:SI 0 "register_operand" "+r") ;; val out + (zero_extend:SI +@@ -133,23 +157,56 @@ + return "casal\t%0, %2, %1"; + }) + ++(define_insn "@aarch64_compare_and_swap_lse" ++ [(set (match_operand:JUST_TI 0 "register_operand" "+r") ;; val out ++ (match_operand:JUST_TI 1 "aarch64_sync_memory_operand" "+Q")) ;; memory ++ (set (match_dup 1) ++ (unspec_volatile:JUST_TI ++ [(match_dup 0) ;; expect ++ (match_operand:JUST_TI 2 "register_operand" "r") ;; desired ++ (match_operand:SI 3 "const_int_operand")] ;; mod_s ++ UNSPECV_ATOMIC_CMPSW))] ++ "TARGET_LSE" ++{ ++ enum memmodel model = memmodel_from_int (INTVAL (operands[3])); ++ if (is_mm_relaxed (model)) ++ return "casp\t%0, %R0, %2, %R2, %1"; ++ else if (is_mm_acquire (model) || is_mm_consume (model)) ++ return "caspa\t%0, %R0, %2, %R2, %1"; ++ else if (is_mm_release (model)) ++ return "caspl\t%0, %R0, %2, %R2, %1"; ++ else ++ return "caspal\t%0, %R0, %2, %R2, %1"; ++}) ++ + (define_expand "atomic_exchange" +- [(match_operand:ALLI 0 "register_operand" "") +- (match_operand:ALLI 1 "aarch64_sync_memory_operand" "") +- (match_operand:ALLI 2 "aarch64_reg_or_zero" "") +- (match_operand:SI 3 "const_int_operand" "")] ++ [(match_operand:ALLI 0 "register_operand") ++ (match_operand:ALLI 1 "aarch64_sync_memory_operand") ++ (match_operand:ALLI 2 "aarch64_reg_or_zero") ++ (match_operand:SI 3 "const_int_operand")] + "" + { +- rtx (*gen) (rtx, rtx, rtx, rtx); +- + /* 
Use an atomic SWP when available. */ + if (TARGET_LSE) +- gen = gen_aarch64_atomic_exchange_lse; ++ { ++ emit_insn (gen_aarch64_atomic_exchange_lse ++ (operands[0], operands[1], operands[2], operands[3])); ++ } ++ else if (TARGET_OUTLINE_ATOMICS) ++ { ++ machine_mode mode = mode; ++ rtx func = aarch64_atomic_ool_func (mode, operands[3], ++ &aarch64_ool_swp_names); ++ rtx rval = emit_library_call_value (func, operands[0], LCT_NORMAL, ++ mode, operands[2], mode, ++ XEXP (operands[1], 0), Pmode); ++ emit_move_insn (operands[0], rval); ++ } + else +- gen = gen_aarch64_atomic_exchange; +- +- emit_insn (gen (operands[0], operands[1], operands[2], operands[3])); +- ++ { ++ emit_insn (gen_aarch64_atomic_exchange ++ (operands[0], operands[1], operands[2], operands[3])); ++ } + DONE; + } + ) +@@ -198,9 +255,9 @@ + ) + + (define_expand "atomic_" +- [(match_operand:ALLI 0 "aarch64_sync_memory_operand" "") ++ [(match_operand:ALLI 0 "aarch64_sync_memory_operand") + (atomic_op:ALLI +- (match_operand:ALLI 1 "" "") ++ (match_operand:ALLI 1 "") + (match_operand:SI 2 "const_int_operand"))] + "" + { +@@ -234,6 +291,39 @@ + } + operands[1] = force_reg (mode, operands[1]); + } ++ else if (TARGET_OUTLINE_ATOMICS) ++ { ++ const atomic_ool_names *names; ++ switch () ++ { ++ case MINUS: ++ operands[1] = expand_simple_unop (mode, NEG, operands[1], ++ NULL, 1); ++ /* fallthru */ ++ case PLUS: ++ names = &aarch64_ool_ldadd_names; ++ break; ++ case IOR: ++ names = &aarch64_ool_ldset_names; ++ break; ++ case XOR: ++ names = &aarch64_ool_ldeor_names; ++ break; ++ case AND: ++ operands[1] = expand_simple_unop (mode, NOT, operands[1], ++ NULL, 1); ++ names = &aarch64_ool_ldclr_names; ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ machine_mode mode = mode; ++ rtx func = aarch64_atomic_ool_func (mode, operands[2], names); ++ emit_library_call_value (func, NULL_RTX, LCT_NORMAL, mode, ++ operands[1], mode, ++ XEXP (operands[0], 0), Pmode); ++ DONE; ++ } + else + gen = gen_aarch64_atomic_; + +@@ -322,10 +412,10 @@ + ;; Load-operate-store, returning the original memory data. + + (define_expand "atomic_fetch_" +- [(match_operand:ALLI 0 "register_operand" "") +- (match_operand:ALLI 1 "aarch64_sync_memory_operand" "") ++ [(match_operand:ALLI 0 "register_operand") ++ (match_operand:ALLI 1 "aarch64_sync_memory_operand") + (atomic_op:ALLI +- (match_operand:ALLI 2 "" "") ++ (match_operand:ALLI 2 "") + (match_operand:SI 3 "const_int_operand"))] + "" + { +@@ -359,6 +449,40 @@ + } + operands[2] = force_reg (mode, operands[2]); + } ++ else if (TARGET_OUTLINE_ATOMICS) ++ { ++ const atomic_ool_names *names; ++ switch () ++ { ++ case MINUS: ++ operands[2] = expand_simple_unop (mode, NEG, operands[2], ++ NULL, 1); ++ /* fallthru */ ++ case PLUS: ++ names = &aarch64_ool_ldadd_names; ++ break; ++ case IOR: ++ names = &aarch64_ool_ldset_names; ++ break; ++ case XOR: ++ names = &aarch64_ool_ldeor_names; ++ break; ++ case AND: ++ operands[2] = expand_simple_unop (mode, NOT, operands[2], ++ NULL, 1); ++ names = &aarch64_ool_ldclr_names; ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ machine_mode mode = mode; ++ rtx func = aarch64_atomic_ool_func (mode, operands[3], names); ++ rtx rval = emit_library_call_value (func, operands[0], LCT_NORMAL, mode, ++ operands[2], mode, ++ XEXP (operands[1], 0), Pmode); ++ emit_move_insn (operands[0], rval); ++ DONE; ++ } + else + gen = gen_aarch64_atomic_fetch_; + +@@ -439,16 +563,16 @@ + ;; Load-operate-store, returning the updated memory data. 
+ + (define_expand "atomic__fetch" +- [(match_operand:ALLI 0 "register_operand" "") ++ [(match_operand:ALLI 0 "register_operand") + (atomic_op:ALLI +- (match_operand:ALLI 1 "aarch64_sync_memory_operand" "") +- (match_operand:ALLI 2 "" "")) ++ (match_operand:ALLI 1 "aarch64_sync_memory_operand") ++ (match_operand:ALLI 2 "")) + (match_operand:SI 3 "const_int_operand")] + "" + { + /* Use an atomic load-operate instruction when possible. In this case + we will re-compute the result from the original mem value. */ +- if (TARGET_LSE) ++ if (TARGET_LSE || TARGET_OUTLINE_ATOMICS) + { + rtx tmp = gen_reg_rtx (mode); + operands[2] = force_reg (mode, operands[2]); +@@ -581,6 +705,24 @@ + } + ) + ++(define_insn "aarch64_load_exclusive_pair" ++ [(set (match_operand:DI 0 "register_operand" "=r") ++ (unspec_volatile:DI ++ [(match_operand:TI 2 "aarch64_sync_memory_operand" "Q") ++ (match_operand:SI 3 "const_int_operand")] ++ UNSPECV_LX)) ++ (set (match_operand:DI 1 "register_operand" "=r") ++ (unspec_volatile:DI [(match_dup 2) (match_dup 3)] UNSPECV_LX))] ++ "" ++ { ++ enum memmodel model = memmodel_from_int (INTVAL (operands[3])); ++ if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_release (model)) ++ return "ldxp\t%0, %1, %2"; ++ else ++ return "ldaxp\t%0, %1, %2"; ++ } ++) ++ + (define_insn "@aarch64_store_exclusive" + [(set (match_operand:SI 0 "register_operand" "=&r") + (unspec_volatile:SI [(const_int 0)] UNSPECV_SX)) +@@ -599,8 +741,27 @@ + } + ) + ++(define_insn "aarch64_store_exclusive_pair" ++ [(set (match_operand:SI 0 "register_operand" "=&r") ++ (unspec_volatile:SI [(const_int 0)] UNSPECV_SX)) ++ (set (match_operand:TI 1 "aarch64_sync_memory_operand" "=Q") ++ (unspec_volatile:TI ++ [(match_operand:DI 2 "aarch64_reg_or_zero" "rZ") ++ (match_operand:DI 3 "aarch64_reg_or_zero" "rZ") ++ (match_operand:SI 4 "const_int_operand")] ++ UNSPECV_SX))] ++ "" ++ { ++ enum memmodel model = memmodel_from_int (INTVAL (operands[4])); ++ if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_acquire (model)) ++ return "stxp\t%w0, %x2, %x3, %1"; ++ else ++ return "stlxp\t%w0, %x2, %x3, %1"; ++ } ++) ++ + (define_expand "mem_thread_fence" +- [(match_operand:SI 0 "const_int_operand" "")] ++ [(match_operand:SI 0 "const_int_operand")] + "" + { + enum memmodel model = memmodel_from_int (INTVAL (operands[0])); +diff --git a/gcc/config/aarch64/check-sve-md.awk b/gcc/config/aarch64/check-sve-md.awk +new file mode 100644 +index 000000000..3da78f3dd +--- /dev/null ++++ b/gcc/config/aarch64/check-sve-md.awk +@@ -0,0 +1,66 @@ ++#!/usr/bin/awk -f ++# Copyright (C) 2019 Free Software Foundation, Inc. ++# ++# This program is free software; you can redistribute it and/or modify it ++# under the terms of the GNU General Public License as published by the ++# Free Software Foundation; either version 3, or (at your option) any ++# later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with this program; see the file COPYING3. If not see ++# . ++ ++# This awk script checks that aarch64-sve.md (passed either on the ++# command line or via stdin) has an up-to-date contents section. ++ ++BEGIN { ++ seen1 = 0 ++ seen2 = 0 ++ errors = 0 ++} ++ ++# The headings in the comments use a two-level hierarchy: ";; == ..." 
++# for major sections and ";; ---- ..." for minor sections. Each section ++# heading must be unique. ++# ++# The contents section should list all the section headings, using the ++# same text and in the same order. We should therefore see exactly two ++# copies of the section list. ++/^;; == / || /^;; ---- / { ++ if ($0 in seen || seen2 > 0) ++ { ++ if (seen2 >= seen1) ++ { ++ printf "error: line not in contents: %s\n", $0 > "/dev/stderr" ++ errors += 1 ++ exit(1) ++ } ++ if ($0 != order[seen2]) ++ { ++ printf "error: mismatched contents\n saw: %s\nexpected: %s\n", \ ++ $0, order[seen2] > "/dev/stderr" ++ errors += 1 ++ exit(1) ++ } ++ seen2 += 1 ++ } ++ else ++ { ++ seen[$0] = 1 ++ order[seen1] = $0 ++ seen1 += 1 ++ } ++} ++ ++END { ++ if (seen2 < seen1 && errors == 0) ++ { ++ printf "error: line only in contents: %s\n", order[seen2] > "/dev/stderr" ++ exit(1) ++ } ++} +diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md +index 21f9549e6..191c996c1 100644 +--- a/gcc/config/aarch64/constraints.md ++++ b/gcc/config/aarch64/constraints.md +@@ -36,6 +36,9 @@ + (define_register_constraint "x" "FP_LO_REGS" + "Floating point and SIMD vector registers V0 - V15.") + ++(define_register_constraint "y" "FP_LO8_REGS" ++ "Floating point and SIMD vector registers V0 - V7.") ++ + (define_constraint "I" + "A constant that can be used with an ADD operation." + (and (match_code "const_int") +@@ -46,6 +49,12 @@ + (and (match_code "const_int") + (match_test "aarch64_pluslong_strict_immedate (op, VOIDmode)"))) + ++(define_constraint "Uai" ++ "@internal ++ A constraint that matches a VG-based constant that can be added by ++ a single INC or DEC." ++ (match_operand 0 "aarch64_sve_scalar_inc_dec_immediate")) ++ + (define_constraint "Uav" + "@internal + A constraint that matches a VG-based constant that can be added by +@@ -114,8 +123,8 @@ + (match_test "aarch64_float_const_zero_rtx_p (op)"))) + + (define_constraint "Z" +- "Integer constant zero." +- (match_test "op == const0_rtx")) ++ "Integer or floating-point constant zero." ++ (match_test "op == CONST0_RTX (GET_MODE (op))")) + + (define_constraint "Ush" + "A constraint that matches an absolute symbolic address high part." +@@ -248,6 +257,38 @@ + true, + ADDR_QUERY_LDP_STP_N)"))) + ++(define_address_constraint "UPb" ++ "@internal ++ An address valid for SVE PRFB instructions." ++ (match_test "aarch64_sve_prefetch_operand_p (op, VNx16QImode)")) ++ ++(define_address_constraint "UPd" ++ "@internal ++ An address valid for SVE PRFD instructions." ++ (match_test "aarch64_sve_prefetch_operand_p (op, VNx2DImode)")) ++ ++(define_address_constraint "UPh" ++ "@internal ++ An address valid for SVE PRFH instructions." ++ (match_test "aarch64_sve_prefetch_operand_p (op, VNx8HImode)")) ++ ++(define_address_constraint "UPw" ++ "@internal ++ An address valid for SVE PRFW instructions." ++ (match_test "aarch64_sve_prefetch_operand_p (op, VNx4SImode)")) ++ ++(define_memory_constraint "Utf" ++ "@internal ++ An address valid for SVE LDFF1 instructions." ++ (and (match_code "mem") ++ (match_test "aarch64_sve_ldff1_operand_p (op)"))) ++ ++(define_memory_constraint "Utn" ++ "@internal ++ An address valid for SVE LDNF1 instructions." 
++ (and (match_code "mem") ++ (match_test "aarch64_sve_ldnf1_operand_p (op)"))) ++ + (define_memory_constraint "Utr" + "@internal + An address valid for SVE LDR and STR instructions (as distinct from +@@ -269,6 +310,37 @@ + (match_test "aarch64_legitimate_address_p (V2DImode, + XEXP (op, 0), 1)"))) + ++(define_memory_constraint "UtQ" ++ "@internal ++ An address valid for SVE LD1RQs." ++ (and (match_code "mem") ++ (match_test "aarch64_sve_ld1rq_operand_p (op)"))) ++ ++(define_memory_constraint "UOb" ++ "@internal ++ An address valid for SVE LD1ROH." ++ (and (match_code "mem") ++ (match_test "aarch64_sve_ld1ro_operand_p (op, QImode)"))) ++ ++(define_memory_constraint "UOh" ++ "@internal ++ An address valid for SVE LD1ROH." ++ (and (match_code "mem") ++ (match_test "aarch64_sve_ld1ro_operand_p (op, HImode)"))) ++ ++ ++(define_memory_constraint "UOw" ++ "@internal ++ An address valid for SVE LD1ROW." ++ (and (match_code "mem") ++ (match_test "aarch64_sve_ld1ro_operand_p (op, SImode)"))) ++ ++(define_memory_constraint "UOd" ++ "@internal ++ An address valid for SVE LD1ROD." ++ (and (match_code "mem") ++ (match_test "aarch64_sve_ld1ro_operand_p (op, DImode)"))) ++ + (define_memory_constraint "Uty" + "@internal + An address valid for SVE LD1Rs." +@@ -284,7 +356,7 @@ + (define_constraint "Ufc" + "A floating point constant which can be used with an\ + FMOV immediate operation." +- (and (match_code "const_double") ++ (and (match_code "const_double,const_vector") + (match_test "aarch64_float_const_representable_p (op)"))) + + (define_constraint "Uvi" +@@ -329,6 +401,13 @@ + (match_test "aarch64_simd_scalar_immediate_valid_for_move (op, + QImode)"))) + ++(define_constraint "Dt" ++ "@internal ++ A const_double which is the reciprocal of an exact power of two, can be ++ used in an scvtf with fract bits operation" ++ (and (match_code "const_double") ++ (match_test "aarch64_fpconst_pow2_recip (op) > 0"))) ++ + (define_constraint "Dl" + "@internal + A constraint that matches vector of immediates for left shifts." +@@ -373,18 +452,54 @@ + An address valid for a prefetch instruction." + (match_test "aarch64_address_valid_for_prefetch_p (op, true)")) + ++(define_constraint "vgb" ++ "@internal ++ A constraint that matches an immediate offset valid for SVE LD1B ++ gather instructions." ++ (match_operand 0 "aarch64_sve_gather_immediate_b")) ++ ++(define_constraint "vgd" ++ "@internal ++ A constraint that matches an immediate offset valid for SVE LD1D ++ gather instructions." ++ (match_operand 0 "aarch64_sve_gather_immediate_d")) ++ ++(define_constraint "vgh" ++ "@internal ++ A constraint that matches an immediate offset valid for SVE LD1H ++ gather instructions." ++ (match_operand 0 "aarch64_sve_gather_immediate_h")) ++ ++(define_constraint "vgw" ++ "@internal ++ A constraint that matches an immediate offset valid for SVE LD1W ++ gather instructions." ++ (match_operand 0 "aarch64_sve_gather_immediate_w")) ++ + (define_constraint "vsa" + "@internal + A constraint that matches an immediate operand valid for SVE + arithmetic instructions." + (match_operand 0 "aarch64_sve_arith_immediate")) + ++(define_constraint "vsb" ++ "@internal ++ A constraint that matches an immediate operand valid for SVE UMAX ++ and UMIN operations." ++ (match_operand 0 "aarch64_sve_vsb_immediate")) ++ + (define_constraint "vsc" + "@internal + A constraint that matches a signed immediate operand valid for SVE + CMP instructions." 
+ (match_operand 0 "aarch64_sve_cmp_vsc_immediate")) + ++(define_constraint "vss" ++ "@internal ++ A constraint that matches a signed immediate operand valid for SVE ++ DUP instructions." ++ (match_test "aarch64_sve_dup_immediate_p (op)")) ++ + (define_constraint "vsd" + "@internal + A constraint that matches an unsigned immediate operand valid for SVE +@@ -395,7 +510,7 @@ + "@internal + A constraint that matches a vector count operand valid for SVE INC and + DEC instructions." +- (match_operand 0 "aarch64_sve_inc_dec_immediate")) ++ (match_operand 0 "aarch64_sve_vector_inc_dec_immediate")) + + (define_constraint "vsn" + "@internal +@@ -403,6 +518,18 @@ + is valid for SVE SUB instructions." + (match_operand 0 "aarch64_sve_sub_arith_immediate")) + ++(define_constraint "vsQ" ++ "@internal ++ Like vsa, but additionally check that the immediate is nonnegative ++ when interpreted as a signed value." ++ (match_operand 0 "aarch64_sve_qadd_immediate")) ++ ++(define_constraint "vsS" ++ "@internal ++ Like vsn, but additionally check that the immediate is negative ++ when interpreted as a signed value." ++ (match_operand 0 "aarch64_sve_qsub_immediate")) ++ + (define_constraint "vsl" + "@internal + A constraint that matches an immediate operand valid for SVE logical +@@ -411,9 +538,9 @@ + + (define_constraint "vsm" + "@internal +- A constraint that matches an immediate operand valid for SVE MUL +- operations." +- (match_operand 0 "aarch64_sve_mul_immediate")) ++ A constraint that matches an immediate operand valid for SVE MUL, ++ SMAX and SMIN operations." ++ (match_operand 0 "aarch64_sve_vsm_immediate")) + + (define_constraint "vsA" + "@internal +@@ -421,13 +548,20 @@ + and FSUB operations." + (match_operand 0 "aarch64_sve_float_arith_immediate")) + ++;; "B" for "bound". ++(define_constraint "vsB" ++ "@internal ++ A constraint that matches an immediate operand valid for SVE FMAX ++ and FMIN operations." ++ (match_operand 0 "aarch64_sve_float_maxmin_immediate")) ++ + (define_constraint "vsM" + "@internal +- A constraint that matches an imediate operand valid for SVE FMUL ++ A constraint that matches an immediate operand valid for SVE FMUL + operations." 
+ (match_operand 0 "aarch64_sve_float_mul_immediate")) + + (define_constraint "vsN" + "@internal + A constraint that matches the negative of vsA" +- (match_operand 0 "aarch64_sve_float_arith_with_sub_immediate")) ++ (match_operand 0 "aarch64_sve_float_negated_arith_immediate")) +diff --git a/gcc/config/aarch64/cortex-a57-fma-steering.c b/gcc/config/aarch64/cortex-a57-fma-steering.c +index eb91662b6..d8e6038d1 100644 +--- a/gcc/config/aarch64/cortex-a57-fma-steering.c ++++ b/gcc/config/aarch64/cortex-a57-fma-steering.c +@@ -37,6 +37,7 @@ + #include "insn-attr.h" + #include "context.h" + #include "tree-pass.h" ++#include "function-abi.h" + #include "regrename.h" + #include "aarch64-protos.h" + +@@ -267,7 +268,7 @@ rename_single_chain (du_head_p head, HARD_REG_SET *unavailable) + if (DEBUG_INSN_P (tmp->insn)) + continue; + n_uses++; +- IOR_COMPL_HARD_REG_SET (*unavailable, reg_class_contents[tmp->cl]); ++ *unavailable |= ~reg_class_contents[tmp->cl]; + super_class = reg_class_superunion[(int) super_class][(int) tmp->cl]; + } + +@@ -281,7 +282,7 @@ rename_single_chain (du_head_p head, HARD_REG_SET *unavailable) + { + fprintf (dump_file, "Register %s in insn %d", reg_names[reg], + INSN_UID (head->first->insn)); +- if (head->need_caller_save_reg) ++ if (head->call_abis) + fprintf (dump_file, " crosses a call"); + } + +diff --git a/gcc/config/aarch64/driver-aarch64.c b/gcc/config/aarch64/driver-aarch64.c +index 6f16775f4..ef4f18352 100644 +--- a/gcc/config/aarch64/driver-aarch64.c ++++ b/gcc/config/aarch64/driver-aarch64.c +@@ -32,7 +32,7 @@ std::string aarch64_get_extension_string_for_isa_flags (unsigned long, + struct aarch64_arch_extension + { + const char *ext; +- unsigned int flag; ++ uint64_t flag; + const char *feat_string; + }; + +@@ -52,7 +52,7 @@ struct aarch64_core_data + unsigned char implementer_id; /* Exactly 8 bits */ + unsigned int part_no; /* 12 bits + 12 bits */ + unsigned variant; +- const unsigned long flags; ++ const uint64_t flags; + }; + + #define AARCH64_BIG_LITTLE(BIG, LITTLE) \ +@@ -75,7 +75,7 @@ struct aarch64_arch_driver_info + { + const char* id; + const char* name; +- const unsigned long flags; ++ const uint64_t flags; + }; + + #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \ +@@ -179,8 +179,8 @@ host_detect_local_cpu (int argc, const char **argv) + unsigned int variants[2] = { ALL_VARIANTS, ALL_VARIANTS }; + unsigned int n_variants = 0; + bool processed_exts = false; +- unsigned long extension_flags = 0; +- unsigned long default_flags = 0; ++ uint64_t extension_flags = 0; ++ uint64_t default_flags = 0; + + gcc_assert (argc); + +diff --git a/gcc/config/aarch64/falkor-tag-collision-avoidance.c b/gcc/config/aarch64/falkor-tag-collision-avoidance.c +index 779dee81f..35ca79232 100644 +--- a/gcc/config/aarch64/falkor-tag-collision-avoidance.c ++++ b/gcc/config/aarch64/falkor-tag-collision-avoidance.c +@@ -38,6 +38,7 @@ + #include "optabs.h" + #include "regs.h" + #include "recog.h" ++#include "function-abi.h" + #include "regrename.h" + #include "print-rtl.h" + +@@ -229,7 +230,7 @@ init_unavailable (tag_insn_info *insn_info, tag_map_t &tag_map, du_head_p head, + if (DEBUG_INSN_P (tmp->insn)) + continue; + +- IOR_COMPL_HARD_REG_SET (*unavailable, reg_class_contents[tmp->cl]); ++ *unavailable |= ~reg_class_contents[tmp->cl]; + super_class = reg_class_superunion[(int) super_class][(int) tmp->cl]; + } + +diff --git a/gcc/config/aarch64/falkor.md b/gcc/config/aarch64/falkor.md +index 41955af81..2bcc661e5 100644 +--- a/gcc/config/aarch64/falkor.md ++++ 
b/gcc/config/aarch64/falkor.md +@@ -648,7 +648,7 @@ + + (define_insn_reservation "falkor_other_0_nothing" 0 + (and (eq_attr "tune" "falkor") +- (eq_attr "type" "no_insn,trap,block")) ++ (eq_attr "type" "trap,block")) + "nothing") + + (define_insn_reservation "falkor_other_2_z" 2 +diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md +index c7ccd5bf6..7b6456961 100644 +--- a/gcc/config/aarch64/iterators.md ++++ b/gcc/config/aarch64/iterators.md +@@ -29,9 +29,16 @@ + ;; Iterator for HI, SI, DI, some instructions can only work on these modes. + (define_mode_iterator GPI_I16 [(HI "AARCH64_ISA_F16") SI DI]) + ++;; "Iterator" for just TI -- features like @pattern only work with iterators. ++(define_mode_iterator JUST_TI [TI]) ++ + ;; Iterator for QI and HI modes + (define_mode_iterator SHORT [QI HI]) + ++;; Iterators for single modes, for "@" patterns. ++(define_mode_iterator SI_ONLY [SI]) ++(define_mode_iterator DI_ONLY [DI]) ++ + ;; Iterator for all integer modes (up to 64-bit) + (define_mode_iterator ALLI [QI HI SI DI]) + +@@ -50,9 +57,16 @@ + ;; Iterator for all scalar floating point modes (HF, SF, DF) + (define_mode_iterator GPF_HF [HF SF DF]) + ++;; Iterator for all 16-bit scalar floating point modes (HF, BF) ++(define_mode_iterator HFBF [HF BF]) ++ + ;; Iterator for all scalar floating point modes (HF, SF, DF and TF) + (define_mode_iterator GPF_TF_F16 [HF SF DF TF]) + ++;; Iterator for all scalar floating point modes suitable for moving, including ++;; special BF type (HF, SF, DF, TF and BF) ++(define_mode_iterator GPF_TF_F16_MOV [HF BF SF DF TF]) ++ + ;; Double vector modes. + (define_mode_iterator VDF [V2SF V4HF]) + +@@ -70,7 +84,10 @@ + (define_mode_iterator VSDQ_I_DI [V8QI V16QI V4HI V8HI V2SI V4SI V2DI DI]) + + ;; Double vector modes. +-(define_mode_iterator VD [V8QI V4HI V4HF V2SI V2SF]) ++(define_mode_iterator VD [V8QI V4HI V4HF V2SI V2SF V4BF]) ++ ++;; Double vector modes suitable for moving. Includes BFmode. ++(define_mode_iterator VDMOV [V8QI V4HI V4HF V4BF V2SI V2SF]) + + ;; All modes stored in registers d0-d31. + (define_mode_iterator DREG [V8QI V4HI V4HF V2SI V2SF DF]) +@@ -85,20 +102,29 @@ + (define_mode_iterator VDQ_BHSI [V8QI V16QI V4HI V8HI V2SI V4SI]) + + ;; Quad vector modes. +-(define_mode_iterator VQ [V16QI V8HI V4SI V2DI V8HF V4SF V2DF]) ++(define_mode_iterator VQ [V16QI V8HI V4SI V2DI V8HF V4SF V2DF V8BF]) + + ;; Copy of the above. +-(define_mode_iterator VQ2 [V16QI V8HI V4SI V2DI V8HF V4SF V2DF]) ++(define_mode_iterator VQ2 [V16QI V8HI V4SI V2DI V8HF V8BF V4SF V2DF]) ++ ++;; Quad vector modes suitable for moving. Includes BFmode. ++(define_mode_iterator VQMOV [V16QI V8HI V4SI V2DI V8HF V8BF V4SF V2DF]) ++ ++;; VQMOV without 2-element modes. ++(define_mode_iterator VQMOV_NO2E [V16QI V8HI V4SI V8HF V8BF V4SF]) + + ;; Quad integer vector modes. + (define_mode_iterator VQ_I [V16QI V8HI V4SI V2DI]) + + ;; VQ without 2 element modes. +-(define_mode_iterator VQ_NO2E [V16QI V8HI V4SI V8HF V4SF]) ++(define_mode_iterator VQ_NO2E [V16QI V8HI V4SI V8HF V4SF V8BF]) + + ;; Quad vector with only 2 element modes. + (define_mode_iterator VQ_2E [V2DI V2DF]) + ++;; BFmode vector modes. ++(define_mode_iterator VBF [V4BF V8BF]) ++ + ;; This mode iterator allows :P to be used for patterns that operate on + ;; addresses in different modes. In LP64, only DI will match, while in + ;; ILP32, either can match. 
+@@ -110,7 +136,8 @@ + (define_mode_iterator PTR [(SI "ptr_mode == SImode") (DI "ptr_mode == DImode")]) + + ;; Advanced SIMD Float modes suitable for moving, loading and storing. +-(define_mode_iterator VDQF_F16 [V4HF V8HF V2SF V4SF V2DF]) ++(define_mode_iterator VDQF_F16 [V4HF V8HF V2SF V4SF V2DF ++ V4BF V8BF]) + + ;; Advanced SIMD Float modes. + (define_mode_iterator VDQF [V2SF V4SF V2DF]) +@@ -128,6 +155,9 @@ + (HF "TARGET_SIMD_F16INST") + SF DF]) + ++;; Scalar and vetor modes for SF, DF. ++(define_mode_iterator VSFDF [V2SF V4SF V2DF DF SF]) ++ + ;; Advanced SIMD single Float modes. + (define_mode_iterator VDQSF [V2SF V4SF]) + +@@ -148,7 +178,12 @@ + + ;; All Advanced SIMD modes suitable for moving, loading, and storing. + (define_mode_iterator VALL_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI +- V4HF V8HF V2SF V4SF V2DF]) ++ V4HF V8HF V4BF V8BF V2SF V4SF V2DF]) ++ ++;; All Advanced SIMD modes suitable for moving, loading, and storing, ++;; including special Bfloat vector types. ++(define_mode_iterator VALL_F16MOV [V8QI V16QI V4HI V8HI V2SI V4SI V2DI ++ V4HF V8HF V4BF V8BF V2SF V4SF V2DF]) + + ;; The VALL_F16 modes except the 128-bit 2-element ones. + (define_mode_iterator VALL_F16_NO_V2Q [V8QI V16QI V4HI V8HI V2SI V4SI +@@ -159,10 +194,10 @@ + + ;; All Advanced SIMD modes and DI. + (define_mode_iterator VALLDI_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI +- V4HF V8HF V2SF V4SF V2DF DI]) ++ V4HF V8HF V4BF V8BF V2SF V4SF V2DF DI]) + + ;; All Advanced SIMD modes, plus DI and DF. +-(define_mode_iterator VALLDIF [V8QI V16QI V4HI V8HI V2SI V4SI ++(define_mode_iterator VALLDIF [V8QI V16QI V4HI V8HI V2SI V4SI V4BF V8BF + V2DI V4HF V8HF V2SF V4SF V2DF DI DF]) + + ;; Advanced SIMD modes for Integer reduction across lanes. +@@ -185,7 +220,7 @@ + (define_mode_iterator VQW [V16QI V8HI V4SI]) + + ;; Double vector modes for combines. +-(define_mode_iterator VDC [V8QI V4HI V4HF V2SI V2SF DI DF]) ++(define_mode_iterator VDC [V8QI V4HI V4BF V4HF V2SI V2SF DI DF]) + + ;; Advanced SIMD modes except double int. + (define_mode_iterator VDQIF [V8QI V16QI V4HI V8HI V2SI V4SI V2SF V4SF V2DF]) +@@ -274,50 +309,85 @@ + ;; count. + (define_mode_iterator VMUL_CHANGE_NLANES [V4HI V8HI V2SI V4SI V2SF V4SF]) + +-;; All SVE vector modes. +-(define_mode_iterator SVE_ALL [VNx16QI VNx8HI VNx4SI VNx2DI +- VNx8HF VNx4SF VNx2DF]) ++;; Iterators for single modes, for "@" patterns. ++(define_mode_iterator VNx8HI_ONLY [VNx8HI]) ++(define_mode_iterator VNx8BF_ONLY [VNx8BF]) ++(define_mode_iterator VNx4SI_ONLY [VNx4SI]) ++(define_mode_iterator VNx4SF_ONLY [VNx4SF]) ++(define_mode_iterator VNx2DI_ONLY [VNx2DI]) ++(define_mode_iterator VNx2DF_ONLY [VNx2DF]) + + ;; All SVE vector structure modes. + (define_mode_iterator SVE_STRUCT [VNx32QI VNx16HI VNx8SI VNx4DI +- VNx16HF VNx8SF VNx4DF ++ VNx16BF VNx16HF VNx8SF VNx4DF + VNx48QI VNx24HI VNx12SI VNx6DI +- VNx24HF VNx12SF VNx6DF ++ VNx24BF VNx24HF VNx12SF VNx6DF + VNx64QI VNx32HI VNx16SI VNx8DI +- VNx32HF VNx16SF VNx8DF]) ++ VNx32BF VNx32HF VNx16SF VNx8DF]) + +-;; All SVE vector modes that have 8-bit or 16-bit elements. +-(define_mode_iterator SVE_BH [VNx16QI VNx8HI VNx8HF]) ++;; All fully-packed SVE vector modes. ++(define_mode_iterator SVE_FULL [VNx16QI VNx8HI VNx4SI VNx2DI ++ VNx8BF VNx8HF VNx4SF VNx2DF]) + +-;; All SVE vector modes that have 8-bit, 16-bit or 32-bit elements. +-(define_mode_iterator SVE_BHS [VNx16QI VNx8HI VNx4SI VNx8HF VNx4SF]) ++;; All fully-packed SVE integer vector modes. 
++(define_mode_iterator SVE_FULL_I [VNx16QI VNx8HI VNx4SI VNx2DI]) + +-;; All SVE integer vector modes that have 8-bit, 16-bit or 32-bit elements. +-(define_mode_iterator SVE_BHSI [VNx16QI VNx8HI VNx4SI]) ++;; All fully-packed SVE floating-point vector modes. ++(define_mode_iterator SVE_FULL_F [VNx8HF VNx4SF VNx2DF]) + +-;; All SVE integer vector modes that have 16-bit, 32-bit or 64-bit elements. +-(define_mode_iterator SVE_HSDI [VNx16QI VNx8HI VNx4SI]) ++;; Fully-packed SVE integer vector modes that have 8-bit, 16-bit or 32-bit ++;; elements. ++(define_mode_iterator SVE_FULL_BHSI [VNx16QI VNx8HI VNx4SI]) + +-;; All SVE floating-point vector modes that have 16-bit or 32-bit elements. +-(define_mode_iterator SVE_HSF [VNx8HF VNx4SF]) ++;; Fully-packed SVE vector modes that have 16-bit, 32-bit or 64-bit elements. ++(define_mode_iterator SVE_FULL_HSD [VNx8HI VNx4SI VNx2DI ++ VNx8BF VNx8HF VNx4SF VNx2DF]) + +-;; All SVE vector modes that have 32-bit or 64-bit elements. +-(define_mode_iterator SVE_SD [VNx4SI VNx2DI VNx4SF VNx2DF]) ++;; Fully-packed SVE integer vector modes that have 16-bit, 32-bit or 64-bit ++;; elements. ++(define_mode_iterator SVE_FULL_HSDI [VNx8HI VNx4SI VNx2DI]) + +-;; All SVE vector modes that have 32-bit elements. +-(define_mode_iterator SVE_S [VNx4SI VNx4SF]) ++;; Fully-packed SVE floating-point vector modes that have 16-bit or 32-bit ++;; elements. ++(define_mode_iterator SVE_FULL_HSF [VNx8HF VNx4SF]) + +-;; All SVE vector modes that have 64-bit elements. +-(define_mode_iterator SVE_D [VNx2DI VNx2DF]) ++;; Fully-packed SVE vector modes that have 32-bit or 64-bit elements. ++(define_mode_iterator SVE_FULL_SD [VNx4SI VNx2DI VNx4SF VNx2DF]) + +-;; All SVE integer vector modes that have 32-bit or 64-bit elements. +-(define_mode_iterator SVE_SDI [VNx4SI VNx2DI]) ++;; Fully-packed SVE integer vector modes that have 32-bit or 64-bit elements. ++(define_mode_iterator SVE_FULL_SDI [VNx4SI VNx2DI]) + +-;; All SVE integer vector modes. +-(define_mode_iterator SVE_I [VNx16QI VNx8HI VNx4SI VNx2DI]) ++;; Fully-packed SVE floating-point vector modes that have 32-bit or 64-bit ++;; elements. ++(define_mode_iterator SVE_FULL_SDF [VNx4SF VNx2DF]) + +-;; All SVE floating-point vector modes. +-(define_mode_iterator SVE_F [VNx8HF VNx4SF VNx2DF]) ++;; Same, but with the appropriate conditions for FMMLA support. ++(define_mode_iterator SVE_MATMULF [(VNx4SF "TARGET_SVE_F32MM") ++ (VNx2DF "TARGET_SVE_F64MM")]) ++ ++;; Fully-packed SVE vector modes that have 32-bit elements. ++(define_mode_iterator SVE_FULL_S [VNx4SI VNx4SF]) ++ ++;; Fully-packed SVE vector modes that have 64-bit elements. ++(define_mode_iterator SVE_FULL_D [VNx2DI VNx2DF]) ++ ++;; All partial SVE modes. ++(define_mode_iterator SVE_PARTIAL [VNx2QI ++ VNx4QI VNx2HI ++ VNx8QI VNx4HI VNx2SI]) ++ ++;; Modes involved in extending or truncating SVE data, for 8 elements per ++;; 128-bit block. ++(define_mode_iterator VNx8_NARROW [VNx8QI]) ++(define_mode_iterator VNx8_WIDE [VNx8HI]) ++ ++;; ...same for 4 elements per 128-bit block. ++(define_mode_iterator VNx4_NARROW [VNx4QI VNx4HI]) ++(define_mode_iterator VNx4_WIDE [VNx4SI]) ++ ++;; ...same for 2 elements per 128-bit block. ++(define_mode_iterator VNx2_NARROW [VNx2QI VNx2HI VNx2SI]) ++(define_mode_iterator VNx2_WIDE [VNx2DI]) + + ;; All SVE predicate modes. + (define_mode_iterator PRED_ALL [VNx16BI VNx8BI VNx4BI VNx2BI]) +@@ -325,6 +395,12 @@ + ;; SVE predicate modes that control 8-bit, 16-bit or 32-bit elements. 
+ (define_mode_iterator PRED_BHS [VNx16BI VNx8BI VNx4BI]) + ++;; SVE predicate modes that control 16-bit, 32-bit or 64-bit elements. ++(define_mode_iterator PRED_HSD [VNx8BI VNx4BI VNx2BI]) ++ ++;; Bfloat16 modes to which V4SF can be converted ++(define_mode_iterator V4SF_TO_BF [V4BF V8BF]) ++ + ;; ------------------------------------------------------------------ + ;; Unspec enumerations for Advance SIMD. These could well go into + ;; aarch64.md but for their use in int_iterators here. +@@ -365,6 +441,10 @@ + UNSPEC_RSUBHN2 ; Used in aarch64-simd.md. + UNSPEC_SQDMULH ; Used in aarch64-simd.md. + UNSPEC_SQRDMULH ; Used in aarch64-simd.md. ++ UNSPEC_SMULLB ; Used in aarch64-sve2.md. ++ UNSPEC_SMULLT ; Used in aarch64-sve2.md. ++ UNSPEC_UMULLB ; Used in aarch64-sve2.md. ++ UNSPEC_UMULLT ; Used in aarch64-sve2.md. + UNSPEC_PMUL ; Used in aarch64-simd.md. + UNSPEC_FMULX ; Used in aarch64-simd.md. + UNSPEC_USQADD ; Used in aarch64-simd.md. +@@ -387,6 +467,10 @@ + UNSPEC_UQSHRN ; Used in aarch64-simd.md. + UNSPEC_SQRSHRN ; Used in aarch64-simd.md. + UNSPEC_UQRSHRN ; Used in aarch64-simd.md. ++ UNSPEC_SHRNB ; Used in aarch64-sve2.md. ++ UNSPEC_SHRNT ; Used in aarch64-sve2.md. ++ UNSPEC_RSHRNB ; Used in aarch64-sve2.md. ++ UNSPEC_RSHRNT ; Used in aarch64-sve2.md. + UNSPEC_SSHL ; Used in aarch64-simd.md. + UNSPEC_USHL ; Used in aarch64-simd.md. + UNSPEC_SRSHL ; Used in aarch64-simd.md. +@@ -459,38 +543,126 @@ + UNSPEC_FMLSL ; Used in aarch64-simd.md. + UNSPEC_FMLAL2 ; Used in aarch64-simd.md. + UNSPEC_FMLSL2 ; Used in aarch64-simd.md. ++ UNSPEC_ADR ; Used in aarch64-sve.md. + UNSPEC_SEL ; Used in aarch64-sve.md. ++ UNSPEC_BRKA ; Used in aarch64-sve.md. ++ UNSPEC_BRKB ; Used in aarch64-sve.md. ++ UNSPEC_BRKN ; Used in aarch64-sve.md. ++ UNSPEC_BRKPA ; Used in aarch64-sve.md. ++ UNSPEC_BRKPB ; Used in aarch64-sve.md. ++ UNSPEC_PFIRST ; Used in aarch64-sve.md. ++ UNSPEC_PNEXT ; Used in aarch64-sve.md. ++ UNSPEC_CNTP ; Used in aarch64-sve.md. ++ UNSPEC_SADDV ; Used in aarch64-sve.md. ++ UNSPEC_UADDV ; Used in aarch64-sve.md. + UNSPEC_ANDV ; Used in aarch64-sve.md. + UNSPEC_IORV ; Used in aarch64-sve.md. + UNSPEC_XORV ; Used in aarch64-sve.md. + UNSPEC_ANDF ; Used in aarch64-sve.md. + UNSPEC_IORF ; Used in aarch64-sve.md. + UNSPEC_XORF ; Used in aarch64-sve.md. ++ UNSPEC_REVB ; Used in aarch64-sve.md. ++ UNSPEC_REVH ; Used in aarch64-sve.md. ++ UNSPEC_REVW ; Used in aarch64-sve.md. + UNSPEC_SMUL_HIGHPART ; Used in aarch64-sve.md. + UNSPEC_UMUL_HIGHPART ; Used in aarch64-sve.md. +- UNSPEC_COND_ADD ; Used in aarch64-sve.md. +- UNSPEC_COND_SUB ; Used in aarch64-sve.md. +- UNSPEC_COND_MUL ; Used in aarch64-sve.md. +- UNSPEC_COND_DIV ; Used in aarch64-sve.md. +- UNSPEC_COND_MAX ; Used in aarch64-sve.md. +- UNSPEC_COND_MIN ; Used in aarch64-sve.md. ++ UNSPEC_FMLA ; Used in aarch64-sve.md. ++ UNSPEC_FMLS ; Used in aarch64-sve.md. ++ UNSPEC_FEXPA ; Used in aarch64-sve.md. ++ UNSPEC_FMMLA ; Used in aarch64-sve.md. ++ UNSPEC_FTMAD ; Used in aarch64-sve.md. ++ UNSPEC_FTSMUL ; Used in aarch64-sve.md. ++ UNSPEC_FTSSEL ; Used in aarch64-sve.md. ++ UNSPEC_SMATMUL ; Used in aarch64-sve.md. ++ UNSPEC_UMATMUL ; Used in aarch64-sve.md. ++ UNSPEC_USMATMUL ; Used in aarch64-sve.md. ++ UNSPEC_TRN1Q ; Used in aarch64-sve.md. ++ UNSPEC_TRN2Q ; Used in aarch64-sve.md. ++ UNSPEC_UZP1Q ; Used in aarch64-sve.md. ++ UNSPEC_UZP2Q ; Used in aarch64-sve.md. ++ UNSPEC_ZIP1Q ; Used in aarch64-sve.md. ++ UNSPEC_ZIP2Q ; Used in aarch64-sve.md. ++ UNSPEC_COND_CMPEQ_WIDE ; Used in aarch64-sve.md. 
++ UNSPEC_COND_CMPGE_WIDE ; Used in aarch64-sve.md. ++ UNSPEC_COND_CMPGT_WIDE ; Used in aarch64-sve.md. ++ UNSPEC_COND_CMPHI_WIDE ; Used in aarch64-sve.md. ++ UNSPEC_COND_CMPHS_WIDE ; Used in aarch64-sve.md. ++ UNSPEC_COND_CMPLE_WIDE ; Used in aarch64-sve.md. ++ UNSPEC_COND_CMPLO_WIDE ; Used in aarch64-sve.md. ++ UNSPEC_COND_CMPLS_WIDE ; Used in aarch64-sve.md. ++ UNSPEC_COND_CMPLT_WIDE ; Used in aarch64-sve.md. ++ UNSPEC_COND_CMPNE_WIDE ; Used in aarch64-sve.md. ++ UNSPEC_COND_FABS ; Used in aarch64-sve.md. ++ UNSPEC_COND_FADD ; Used in aarch64-sve.md. ++ UNSPEC_COND_FCADD90 ; Used in aarch64-sve.md. ++ UNSPEC_COND_FCADD270 ; Used in aarch64-sve.md. ++ UNSPEC_COND_FCMEQ ; Used in aarch64-sve.md. ++ UNSPEC_COND_FCMGE ; Used in aarch64-sve.md. ++ UNSPEC_COND_FCMGT ; Used in aarch64-sve.md. ++ UNSPEC_COND_FCMLA ; Used in aarch64-sve.md. ++ UNSPEC_COND_FCMLA90 ; Used in aarch64-sve.md. ++ UNSPEC_COND_FCMLA180 ; Used in aarch64-sve.md. ++ UNSPEC_COND_FCMLA270 ; Used in aarch64-sve.md. ++ UNSPEC_COND_FCMLE ; Used in aarch64-sve.md. ++ UNSPEC_COND_FCMLT ; Used in aarch64-sve.md. ++ UNSPEC_COND_FCMNE ; Used in aarch64-sve.md. ++ UNSPEC_COND_FCMUO ; Used in aarch64-sve.md. ++ UNSPEC_COND_FCVT ; Used in aarch64-sve.md. ++ UNSPEC_COND_FCVTZS ; Used in aarch64-sve.md. ++ UNSPEC_COND_FCVTZU ; Used in aarch64-sve.md. ++ UNSPEC_COND_FDIV ; Used in aarch64-sve.md. ++ UNSPEC_COND_FMAX ; Used in aarch64-sve.md. ++ UNSPEC_COND_FMAXNM ; Used in aarch64-sve.md. ++ UNSPEC_COND_FMIN ; Used in aarch64-sve.md. ++ UNSPEC_COND_FMINNM ; Used in aarch64-sve.md. + UNSPEC_COND_FMLA ; Used in aarch64-sve.md. + UNSPEC_COND_FMLS ; Used in aarch64-sve.md. ++ UNSPEC_COND_FMUL ; Used in aarch64-sve.md. ++ UNSPEC_COND_FMULX ; Used in aarch64-sve.md. ++ UNSPEC_COND_FNEG ; Used in aarch64-sve.md. + UNSPEC_COND_FNMLA ; Used in aarch64-sve.md. + UNSPEC_COND_FNMLS ; Used in aarch64-sve.md. +- UNSPEC_COND_LT ; Used in aarch64-sve.md. +- UNSPEC_COND_LE ; Used in aarch64-sve.md. +- UNSPEC_COND_EQ ; Used in aarch64-sve.md. +- UNSPEC_COND_NE ; Used in aarch64-sve.md. +- UNSPEC_COND_GE ; Used in aarch64-sve.md. +- UNSPEC_COND_GT ; Used in aarch64-sve.md. ++ UNSPEC_COND_FRECPX ; Used in aarch64-sve.md. ++ UNSPEC_COND_FRINTA ; Used in aarch64-sve.md. ++ UNSPEC_COND_FRINTI ; Used in aarch64-sve.md. ++ UNSPEC_COND_FRINTM ; Used in aarch64-sve.md. ++ UNSPEC_COND_FRINTN ; Used in aarch64-sve.md. ++ UNSPEC_COND_FRINTP ; Used in aarch64-sve.md. ++ UNSPEC_COND_FRINTX ; Used in aarch64-sve.md. ++ UNSPEC_COND_FRINTZ ; Used in aarch64-sve.md. ++ UNSPEC_COND_FSCALE ; Used in aarch64-sve.md. ++ UNSPEC_COND_FSQRT ; Used in aarch64-sve.md. ++ UNSPEC_COND_FSUB ; Used in aarch64-sve.md. ++ UNSPEC_COND_SCVTF ; Used in aarch64-sve.md. ++ UNSPEC_COND_UCVTF ; Used in aarch64-sve.md. ++ UNSPEC_LASTA ; Used in aarch64-sve.md. + UNSPEC_LASTB ; Used in aarch64-sve.md. ++ UNSPEC_ASHIFT_WIDE ; Used in aarch64-sve.md. ++ UNSPEC_ASHIFTRT_WIDE ; Used in aarch64-sve.md. ++ UNSPEC_LSHIFTRT_WIDE ; Used in aarch64-sve.md. ++ UNSPEC_LDFF1 ; Used in aarch64-sve.md. ++ UNSPEC_LDNF1 ; Used in aarch64-sve.md. + UNSPEC_FCADD90 ; Used in aarch64-simd.md. + UNSPEC_FCADD270 ; Used in aarch64-simd.md. + UNSPEC_FCMLA ; Used in aarch64-simd.md. + UNSPEC_FCMLA90 ; Used in aarch64-simd.md. + UNSPEC_FCMLA180 ; Used in aarch64-simd.md. + UNSPEC_FCMLA270 ; Used in aarch64-simd.md. ++ UNSPEC_COND_FCVTNT ; Used in aarch64-sve2.md. ++ UNSPEC_SMULHS ; Used in aarch64-sve2.md. ++ UNSPEC_SMULHRS ; Used in aarch64-sve2.md. ++ UNSPEC_UMULHS ; Used in aarch64-sve2.md. 
++ UNSPEC_UMULHRS ; Used in aarch64-sve2.md. ++ UNSPEC_ASRD ; Used in aarch64-sve.md. ++ UNSPEC_USDOT ; Used in aarch64-simd.md. ++ UNSPEC_SUDOT ; Used in aarch64-simd.md. ++ UNSPEC_BFDOT ; Used in aarch64-simd.md. ++ UNSPEC_BFMLALB ; Used in aarch64-sve.md. ++ UNSPEC_BFMLALT ; Used in aarch64-sve.md. ++ UNSPEC_BFMMLA ; Used in aarch64-sve.md. ++ UNSPEC_BFCVTN ; Used in aarch64-simd.md. ++ UNSPEC_BFCVTN2 ; Used in aarch64-simd.md. ++ UNSPEC_BFCVT ; Used in aarch64-simd.md. + ]) + + ;; ------------------------------------------------------------------ +@@ -586,6 +758,7 @@ + (V2SI "2") (V4SI "4") + (V2DI "2") + (V4HF "4") (V8HF "8") ++ (V4BF "4") (V8BF "8") + (V2SF "2") (V4SF "4") + (V1DF "1") (V2DF "2") + (DI "1") (DF "1")]) +@@ -610,6 +783,14 @@ + (define_mode_attr sizem1 [(QI "#7") (HI "#15") (SI "#31") (DI "#63") + (HF "#15") (SF "#31") (DF "#63")]) + ++;; The number of bits in a vector element, or controlled by a predicate ++;; element. ++(define_mode_attr elem_bits [(VNx16BI "8") (VNx8BI "16") ++ (VNx4BI "32") (VNx2BI "64") ++ (VNx16QI "8") (VNx8HI "16") ++ (VNx4SI "32") (VNx2DI "64") ++ (VNx8HF "16") (VNx4SF "32") (VNx2DF "64")]) ++ + ;; Attribute to describe constants acceptable in logical operations + (define_mode_attr lconst [(SI "K") (DI "L")]) + +@@ -624,6 +805,7 @@ + + (define_mode_attr Vtype [(V8QI "8b") (V16QI "16b") + (V4HI "4h") (V8HI "8h") ++ (V4BF "4h") (V8BF "8h") + (V2SI "2s") (V4SI "4s") + (DI "1d") (DF "1d") + (V2DI "2d") (V2SF "2s") +@@ -637,7 +819,8 @@ + (V4HI ".4h") (V8HI ".8h") + (V2SI ".2s") (V4SI ".4s") + (V2DI ".2d") (V4HF ".4h") +- (V8HF ".8h") (V2SF ".2s") ++ (V8HF ".8h") (V4BF ".4h") ++ (V8BF ".8h") (V2SF ".2s") + (V4SF ".4s") (V2DF ".2d") + (DI "") (SI "") + (HI "") (QI "") +@@ -655,9 +838,10 @@ + (V4HI "h") (V8HI "h") (VNx8HI "h") (VNx8BI "h") + (V2SI "s") (V4SI "s") (VNx4SI "s") (VNx4BI "s") + (V2DI "d") (VNx2DI "d") (VNx2BI "d") +- (V4HF "h") (V8HF "h") (VNx8HF "h") ++ (V4HF "h") (V8HF "h") (VNx8HF "h") (VNx8BF "h") + (V2SF "s") (V4SF "s") (VNx4SF "s") + (V2DF "d") (VNx2DF "d") ++ (BF "h") (V4BF "h") (V8BF "h") + (HF "h") + (SF "s") (DF "d") + (QI "b") (HI "h") +@@ -667,13 +851,17 @@ + (define_mode_attr Vetype_fourth [(VNx4SI "b") (VNx2DI "h")]) + + ;; Equivalent of "size" for a vector element. +-(define_mode_attr Vesize [(VNx16QI "b") +- (VNx8HI "h") (VNx8HF "h") +- (VNx4SI "w") (VNx4SF "w") ++(define_mode_attr Vesize [(VNx16QI "b") (VNx8QI "b") ++ (VNx4QI "b") (VNx2QI "b") ++ (VNx8HI "h") (VNx4HI "h") ++ (VNx2HI "h") (VNx8HF "h") ++ (VNx4SI "w") (VNx2SI "w") (VNx4SF "w") + (VNx2DI "d") (VNx2DF "d") + (VNx32QI "b") (VNx48QI "b") (VNx64QI "b") + (VNx16HI "h") (VNx24HI "h") (VNx32HI "h") + (VNx16HF "h") (VNx24HF "h") (VNx32HF "h") ++ (VNx16BF "h") (VNx24BF "h") (VNx32BF "h") ++ (VNx8BF "h") + (VNx8SI "w") (VNx12SI "w") (VNx16SI "w") + (VNx8SF "w") (VNx12SF "w") (VNx16SF "w") + (VNx4DI "d") (VNx6DI "d") (VNx8DI "d") +@@ -697,13 +885,16 @@ + (V8HF "16b") (V2SF "8b") + (V4SF "16b") (V2DF "16b") + (DI "8b") (DF "8b") +- (SI "8b") (SF "8b")]) ++ (SI "8b") (SF "8b") ++ (V4BF "8b") (V8BF "16b")]) + + ;; Define element mode for each vector mode. + (define_mode_attr VEL [(V8QI "QI") (V16QI "QI") (VNx16QI "QI") + (V4HI "HI") (V8HI "HI") (VNx8HI "HI") + (V2SI "SI") (V4SI "SI") (VNx4SI "SI") ++ (VNx8BF "BF") + (DI "DI") (V2DI "DI") (VNx2DI "DI") ++ (V4BF "BF") (V8BF "BF") + (V4HF "HF") (V8HF "HF") (VNx8HF "HF") + (V2SF "SF") (V4SF "SF") (VNx4SF "SF") + (DF "DF") (V2DF "DF") (VNx2DF "DF") +@@ -713,8 +904,10 @@ + ;; Define element mode for each vector mode (lower case). 
+ (define_mode_attr Vel [(V8QI "qi") (V16QI "qi") (VNx16QI "qi") + (V4HI "hi") (V8HI "hi") (VNx8HI "hi") ++ (VNx8BF "bf") + (V2SI "si") (V4SI "si") (VNx4SI "si") + (DI "di") (V2DI "di") (VNx2DI "di") ++ (V4BF "bf") (V8BF "bf") + (V4HF "hf") (V8HF "hf") (VNx8HF "hf") + (V2SF "sf") (V4SF "sf") (VNx4SF "sf") + (V2DF "df") (DF "df") (VNx2DF "df") +@@ -723,19 +916,19 @@ + + ;; Element mode with floating-point values replaced by like-sized integers. + (define_mode_attr VEL_INT [(VNx16QI "QI") +- (VNx8HI "HI") (VNx8HF "HI") ++ (VNx8HI "HI") (VNx8HF "HI") (VNx8BF "HI") + (VNx4SI "SI") (VNx4SF "SI") + (VNx2DI "DI") (VNx2DF "DI")]) + + ;; Gives the mode of the 128-bit lowpart of an SVE vector. + (define_mode_attr V128 [(VNx16QI "V16QI") +- (VNx8HI "V8HI") (VNx8HF "V8HF") ++ (VNx8HI "V8HI") (VNx8HF "V8HF") (VNx8BF "V8BF") + (VNx4SI "V4SI") (VNx4SF "V4SF") + (VNx2DI "V2DI") (VNx2DF "V2DF")]) + + ;; ...and again in lower case. + (define_mode_attr v128 [(VNx16QI "v16qi") +- (VNx8HI "v8hi") (VNx8HF "v8hf") ++ (VNx8HI "v8hi") (VNx8HF "v8hf") (VNx8BF "v8bf") + (VNx4SI "v4si") (VNx4SF "v4sf") + (VNx2DI "v2di") (VNx2DF "v2df")]) + +@@ -763,19 +956,20 @@ + (V2SI "SI") (V4SI "V2SI") + (V2DI "DI") (V2SF "SF") + (V4SF "V2SF") (V4HF "V2HF") +- (V8HF "V4HF") (V2DF "DF")]) ++ (V8HF "V4HF") (V2DF "DF") ++ (V8BF "V4BF")]) + + ;; Half modes of all vector modes, in lower-case. + (define_mode_attr Vhalf [(V8QI "v4qi") (V16QI "v8qi") + (V4HI "v2hi") (V8HI "v4hi") +- (V8HF "v4hf") ++ (V8HF "v4hf") (V8BF "v4bf") + (V2SI "si") (V4SI "v2si") + (V2DI "di") (V2SF "sf") + (V4SF "v2sf") (V2DF "df")]) + + ;; Double modes of vector modes. + (define_mode_attr VDBL [(V8QI "V16QI") (V4HI "V8HI") +- (V4HF "V8HF") ++ (V4HF "V8HF") (V4BF "V8BF") + (V2SI "V4SI") (V2SF "V4SF") + (SI "V2SI") (DI "V2DI") + (DF "V2DF")]) +@@ -785,7 +979,7 @@ + + ;; Double modes of vector modes (lower case). + (define_mode_attr Vdbl [(V8QI "v16qi") (V4HI "v8hi") +- (V4HF "v8hf") ++ (V4HF "v8hf") (V4BF "v8bf") + (V2SI "v4si") (V2SF "v4sf") + (SI "v2si") (DI "v2di") + (DF "v2df")]) +@@ -879,6 +1073,7 @@ + ;; variation on mapping FP modes to GP regs. + (define_mode_attr vwcore [(V8QI "w") (V16QI "w") (VNx16QI "w") + (V4HI "w") (V8HI "w") (VNx8HI "w") ++ (VNx8BF "w") + (V2SI "w") (V4SI "w") (VNx4SI "w") + (DI "x") (V2DI "x") (VNx2DI "x") + (V4HF "w") (V8HF "w") (VNx8HF "w") +@@ -894,12 +1089,14 @@ + (V2SI "V2SI") (V4SI "V4SI") + (DI "DI") (V2DI "V2DI") + (V4HF "V4HI") (V8HF "V8HI") ++ (V4BF "V4HI") (V8BF "V8HI") + (V2SF "V2SI") (V4SF "V4SI") + (DF "DI") (V2DF "V2DI") + (SF "SI") (SI "SI") + (HF "HI") + (VNx16QI "VNx16QI") + (VNx8HI "VNx8HI") (VNx8HF "VNx8HI") ++ (VNx8BF "VNx8HI") + (VNx4SI "VNx4SI") (VNx4SF "VNx4SI") + (VNx2DI "VNx2DI") (VNx2DF "VNx2DI") + ]) +@@ -910,19 +1107,25 @@ + (V2SI "v2si") (V4SI "v4si") + (DI "di") (V2DI "v2di") + (V4HF "v4hi") (V8HF "v8hi") ++ (V4BF "v4hi") (V8BF "v8hi") + (V2SF "v2si") (V4SF "v4si") + (DF "di") (V2DF "v2di") + (SF "si") + (VNx16QI "vnx16qi") + (VNx8HI "vnx8hi") (VNx8HF "vnx8hi") ++ (VNx8BF "vnx8hi") + (VNx4SI "vnx4si") (VNx4SF "vnx4si") + (VNx2DI "vnx2di") (VNx2DF "vnx2di") + ]) + + ;; Floating-point equivalent of selected modes. 
+-(define_mode_attr V_FP_EQUIV [(VNx4SI "VNx4SF") (VNx4SF "VNx4SF") ++(define_mode_attr V_FP_EQUIV [(VNx8HI "VNx8HF") (VNx8HF "VNx8HF") ++ (VNx8BF "VNx8HF") ++ (VNx4SI "VNx4SF") (VNx4SF "VNx4SF") + (VNx2DI "VNx2DF") (VNx2DF "VNx2DF")]) +-(define_mode_attr v_fp_equiv [(VNx4SI "vnx4sf") (VNx4SF "vnx4sf") ++(define_mode_attr v_fp_equiv [(VNx8HI "vnx8hf") (VNx8HF "vnx8hf") ++ (VNx8BF "vnx8hf") ++ (VNx4SI "vnx4sf") (VNx4SF "vnx4sf") + (VNx2DI "vnx2df") (VNx2DF "vnx2df")]) + + ;; Mode for vector conditional operations where the comparison has +@@ -976,6 +1179,7 @@ + + (define_mode_attr VSWAP_WIDTH [(V8QI "V16QI") (V16QI "V8QI") + (V4HI "V8HI") (V8HI "V4HI") ++ (V8BF "V4BF") (V4BF "V8BF") + (V2SI "V4SI") (V4SI "V2SI") + (DI "V2DI") (V2DI "DI") + (V2SF "V4SF") (V4SF "V2SF") +@@ -988,6 +1192,7 @@ + (DI "to_128") (V2DI "to_64") + (V4HF "to_128") (V8HF "to_64") + (V2SF "to_128") (V4SF "to_64") ++ (V4BF "to_128") (V8BF "to_64") + (DF "to_128") (V2DF "to_64")]) + + ;; For certain vector-by-element multiplication instructions we must +@@ -1021,9 +1226,11 @@ + ;; Defined to '_q' for 128-bit types. + (define_mode_attr q [(V8QI "") (V16QI "_q") + (V4HI "") (V8HI "_q") ++ (V4BF "") (V8BF "_q") + (V2SI "") (V4SI "_q") + (DI "") (V2DI "_q") + (V4HF "") (V8HF "_q") ++ (V4BF "") (V8BF "_q") + (V2SF "") (V4SF "_q") + (V2DF "_q") + (QI "") (HI "") (SI "") (DI "") (HF "") (SF "") (DF "")]) +@@ -1044,6 +1251,9 @@ + ;; Register suffix for DOTPROD input types from the return type. + (define_mode_attr Vdottype [(V2SI "8b") (V4SI "16b")]) + ++;; Register suffix for BFDOT input types from the return type. ++(define_mode_attr Vbfdottype [(V2SF "4h") (V4SF "8h")]) ++ + ;; Sum of lengths of instructions needed to move vector registers of a mode. + (define_mode_attr insn_count [(OI "8") (CI "12") (XI "16")]) + +@@ -1054,63 +1264,83 @@ + ;; Width of 2nd and 3rd arguments to fp16 vector multiply add/sub + (define_mode_attr VFMLA_W [(V2SF "V4HF") (V4SF "V8HF")]) + ++;; Width of 2nd and 3rd arguments to bf16 vector multiply add/sub ++(define_mode_attr VBFMLA_W [(V2SF "V4BF") (V4SF "V8BF")]) ++ + (define_mode_attr VFMLA_SEL_W [(V2SF "V2HF") (V4SF "V4HF")]) + + (define_mode_attr f16quad [(V2SF "") (V4SF "q")]) + ++(define_mode_attr isquadop [(V8QI "") (V16QI "q") (V4BF "") (V8BF "q")]) ++ + (define_code_attr f16mac [(plus "a") (minus "s")]) + + ;; Map smax to smin and umax to umin. + (define_code_attr max_opp [(smax "smin") (umax "umin")]) + ++;; Same as above, but louder. ++(define_code_attr MAX_OPP [(smax "SMIN") (umax "UMIN")]) ++ + ;; The number of subvectors in an SVE_STRUCT. + (define_mode_attr vector_count [(VNx32QI "2") (VNx16HI "2") + (VNx8SI "2") (VNx4DI "2") ++ (VNx16BF "2") + (VNx16HF "2") (VNx8SF "2") (VNx4DF "2") + (VNx48QI "3") (VNx24HI "3") + (VNx12SI "3") (VNx6DI "3") ++ (VNx24BF "3") + (VNx24HF "3") (VNx12SF "3") (VNx6DF "3") + (VNx64QI "4") (VNx32HI "4") + (VNx16SI "4") (VNx8DI "4") ++ (VNx32BF "4") + (VNx32HF "4") (VNx16SF "4") (VNx8DF "4")]) + + ;; The number of instruction bytes needed for an SVE_STRUCT move. This is + ;; equal to vector_count * 4. + (define_mode_attr insn_length [(VNx32QI "8") (VNx16HI "8") + (VNx8SI "8") (VNx4DI "8") ++ (VNx16BF "8") + (VNx16HF "8") (VNx8SF "8") (VNx4DF "8") + (VNx48QI "12") (VNx24HI "12") + (VNx12SI "12") (VNx6DI "12") ++ (VNx24BF "12") + (VNx24HF "12") (VNx12SF "12") (VNx6DF "12") + (VNx64QI "16") (VNx32HI "16") + (VNx16SI "16") (VNx8DI "16") ++ (VNx32BF "16") + (VNx32HF "16") (VNx16SF "16") (VNx8DF "16")]) + + ;; The type of a subvector in an SVE_STRUCT. 
+ (define_mode_attr VSINGLE [(VNx32QI "VNx16QI") + (VNx16HI "VNx8HI") (VNx16HF "VNx8HF") ++ (VNx16BF "VNx8BF") + (VNx8SI "VNx4SI") (VNx8SF "VNx4SF") + (VNx4DI "VNx2DI") (VNx4DF "VNx2DF") + (VNx48QI "VNx16QI") + (VNx24HI "VNx8HI") (VNx24HF "VNx8HF") ++ (VNx24BF "VNx8BF") + (VNx12SI "VNx4SI") (VNx12SF "VNx4SF") + (VNx6DI "VNx2DI") (VNx6DF "VNx2DF") + (VNx64QI "VNx16QI") + (VNx32HI "VNx8HI") (VNx32HF "VNx8HF") ++ (VNx32BF "VNx8BF") + (VNx16SI "VNx4SI") (VNx16SF "VNx4SF") + (VNx8DI "VNx2DI") (VNx8DF "VNx2DF")]) + + ;; ...and again in lower case. + (define_mode_attr vsingle [(VNx32QI "vnx16qi") + (VNx16HI "vnx8hi") (VNx16HF "vnx8hf") ++ (VNx16BF "vnx8bf") + (VNx8SI "vnx4si") (VNx8SF "vnx4sf") + (VNx4DI "vnx2di") (VNx4DF "vnx2df") + (VNx48QI "vnx16qi") + (VNx24HI "vnx8hi") (VNx24HF "vnx8hf") ++ (VNx24BF "vnx8bf") + (VNx12SI "vnx4si") (VNx12SF "vnx4sf") + (VNx6DI "vnx2di") (VNx6DF "vnx2df") + (VNx64QI "vnx16qi") + (VNx32HI "vnx8hi") (VNx32HF "vnx8hf") ++ (VNx32BF "vnx8bf") + (VNx16SI "vnx4si") (VNx16SF "vnx4sf") + (VNx8DI "vnx2di") (VNx8DF "vnx2df")]) + +@@ -1118,36 +1348,44 @@ + ;; this is equivalent to the of the subvector mode. + (define_mode_attr VPRED [(VNx16QI "VNx16BI") + (VNx8HI "VNx8BI") (VNx8HF "VNx8BI") ++ (VNx8BF "VNx8BI") + (VNx4SI "VNx4BI") (VNx4SF "VNx4BI") + (VNx2DI "VNx2BI") (VNx2DF "VNx2BI") + (VNx32QI "VNx16BI") + (VNx16HI "VNx8BI") (VNx16HF "VNx8BI") ++ (VNx16BF "VNx8BI") + (VNx8SI "VNx4BI") (VNx8SF "VNx4BI") + (VNx4DI "VNx2BI") (VNx4DF "VNx2BI") + (VNx48QI "VNx16BI") + (VNx24HI "VNx8BI") (VNx24HF "VNx8BI") ++ (VNx24BF "VNx8BI") + (VNx12SI "VNx4BI") (VNx12SF "VNx4BI") + (VNx6DI "VNx2BI") (VNx6DF "VNx2BI") + (VNx64QI "VNx16BI") + (VNx32HI "VNx8BI") (VNx32HF "VNx8BI") ++ (VNx32BF "VNx8BI") + (VNx16SI "VNx4BI") (VNx16SF "VNx4BI") + (VNx8DI "VNx2BI") (VNx8DF "VNx2BI")]) + + ;; ...and again in lower case. + (define_mode_attr vpred [(VNx16QI "vnx16bi") + (VNx8HI "vnx8bi") (VNx8HF "vnx8bi") ++ (VNx8BF "vnx8bi") + (VNx4SI "vnx4bi") (VNx4SF "vnx4bi") + (VNx2DI "vnx2bi") (VNx2DF "vnx2bi") + (VNx32QI "vnx16bi") + (VNx16HI "vnx8bi") (VNx16HF "vnx8bi") ++ (VNx16BF "vnx8bi") + (VNx8SI "vnx4bi") (VNx8SF "vnx4bi") + (VNx4DI "vnx2bi") (VNx4DF "vnx2bi") + (VNx48QI "vnx16bi") + (VNx24HI "vnx8bi") (VNx24HF "vnx8bi") ++ (VNx24BF "vnx8bi") + (VNx12SI "vnx4bi") (VNx12SF "vnx4bi") + (VNx6DI "vnx2bi") (VNx6DF "vnx2bi") + (VNx64QI "vnx16bi") + (VNx32HI "vnx8bi") (VNx32HF "vnx4bi") ++ (VNx32BF "vnx8bi") + (VNx16SI "vnx4bi") (VNx16SF "vnx4bi") + (VNx8DI "vnx2bi") (VNx8DF "vnx2bi")]) + +@@ -1158,6 +1396,30 @@ + (V4HF "[%4]") (V8HF "[%4]") + ]) + ++;; The number of bytes controlled by a predicate ++(define_mode_attr data_bytes [(VNx16BI "1") (VNx8BI "2") ++ (VNx4BI "4") (VNx2BI "8")]) ++ ++;; Two-nybble mask for partial vector modes: nunits, byte size. ++(define_mode_attr self_mask [(VNx8QI "0x81") ++ (VNx4QI "0x41") ++ (VNx2QI "0x21") ++ (VNx4HI "0x42") ++ (VNx2HI "0x22") ++ (VNx2SI "0x24")]) ++ ++;; For full vector modes, the mask of narrower modes, encoded as above. ++(define_mode_attr narrower_mask [(VNx8HI "0x81") ++ (VNx4SI "0x43") ++ (VNx2DI "0x27")]) ++ ++;; The constraint to use for an SVE [SU]DOT, FMUL, FMLA or FMLS lane index. ++(define_mode_attr sve_lane_con [(VNx4SI "y") (VNx2DI "x") ++ (VNx8HF "y") (VNx4SF "y") (VNx2DF "x")]) ++ ++;; The constraint to use for an SVE FCMLA lane index. 
++(define_mode_attr sve_lane_pair_con [(VNx8HF "y") (VNx4SF "x")]) ++ + ;; ------------------------------------------------------------------- + ;; Code Iterators + ;; ------------------------------------------------------------------- +@@ -1168,6 +1430,8 @@ + ;; This code iterator allows the shifts supported in arithmetic instructions + (define_code_iterator ASHIFT [ashift ashiftrt lshiftrt]) + ++(define_code_iterator SHIFTRT [ashiftrt lshiftrt]) ++ + ;; Code iterator for logical operations + (define_code_iterator LOGICAL [and ior xor]) + +@@ -1214,7 +1478,7 @@ + ;; Signed and unsigned max operations. + (define_code_iterator USMAX [smax umax]) + +-;; Code iterator for variants of vector max and min. ++;; Code iterator for plus and minus. + (define_code_iterator ADDSUB [plus minus]) + + ;; Code iterator for variants of vector saturating binary ops. +@@ -1226,6 +1490,21 @@ + ;; Code iterator for signed variants of vector saturating binary ops. + (define_code_iterator SBINQOPS [ss_plus ss_minus]) + ++;; Code iterator for unsigned variants of vector saturating binary ops. ++(define_code_iterator UBINQOPS [us_plus us_minus]) ++ ++;; Modular and saturating addition. ++(define_code_iterator ANY_PLUS [plus ss_plus us_plus]) ++ ++;; Saturating addition. ++(define_code_iterator SAT_PLUS [ss_plus us_plus]) ++ ++;; Modular and saturating subtraction. ++(define_code_iterator ANY_MINUS [minus ss_minus us_minus]) ++ ++;; Saturating subtraction. ++(define_code_iterator SAT_MINUS [ss_minus us_minus]) ++ + ;; Comparison operators for CM. + (define_code_iterator COMPARISONS [lt le eq ge gt]) + +@@ -1236,27 +1515,25 @@ + (define_code_iterator FAC_COMPARISONS [lt le ge gt]) + + ;; SVE integer unary operations. +-(define_code_iterator SVE_INT_UNARY [abs neg not popcount]) +- +-;; SVE floating-point unary operations. +-(define_code_iterator SVE_FP_UNARY [abs neg sqrt]) ++(define_code_iterator SVE_INT_UNARY [abs neg not clrsb clz popcount]) + + ;; SVE integer binary operations. + (define_code_iterator SVE_INT_BINARY [plus minus mult smax umax smin umin ++ ashift ashiftrt lshiftrt + and ior xor]) + + ;; SVE integer binary division operations. + (define_code_iterator SVE_INT_BINARY_SD [div udiv]) + ++;; SVE integer binary operations that have an immediate form. ++(define_code_iterator SVE_INT_BINARY_IMM [mult smax smin umax umin]) ++ + ;; SVE floating-point operations with an unpredicated all-register form. + (define_code_iterator SVE_UNPRED_FP_BINARY [plus minus mult]) + + ;; SVE integer comparisons. + (define_code_iterator SVE_INT_CMP [lt le eq ne ge gt ltu leu geu gtu]) + +-;; SVE floating-point comparisons. +-(define_code_iterator SVE_FP_CMP [lt le eq ne ge gt]) +- + ;; ------------------------------------------------------------------- + ;; Code Attributes + ;; ------------------------------------------------------------------- +@@ -1273,6 +1550,8 @@ + (unsigned_fix "fixuns") + (float "float") + (unsigned_float "floatuns") ++ (clrsb "clrsb") ++ (clz "clz") + (popcount "popcount") + (and "and") + (ior "ior") +@@ -1304,8 +1583,7 @@ + (leu "leu") + (geu "geu") + (gtu "gtu") +- (abs "abs") +- (sqrt "sqrt")]) ++ (abs "abs")]) + + ;; For comparison operators we use the FCM* and CM* instructions. + ;; As there are no CMLE or CMLT instructions which act on 3 vector +@@ -1350,6 +1628,9 @@ + (define_code_attr shift [(ashift "lsl") (ashiftrt "asr") + (lshiftrt "lsr") (rotatert "ror")]) + ++;; Op prefix for shift right and accumulate. 
++(define_code_attr sra_op [(ashiftrt "s") (lshiftrt "u")]) ++ + ;; Map shift operators onto underlying bit-field instructions + (define_code_attr bfshift [(ashift "ubfiz") (ashiftrt "sbfx") + (lshiftrt "ubfx") (rotatert "extr")]) +@@ -1374,6 +1655,15 @@ + (smax "s") (umax "u") + (smin "s") (umin "u")]) + ++;; "s" for signed ops, empty for unsigned ones. ++(define_code_attr s [(sign_extend "s") (zero_extend "")]) ++ ++;; Map signed/unsigned ops to the corresponding extension. ++(define_code_attr paired_extend [(ss_plus "sign_extend") ++ (us_plus "zero_extend") ++ (ss_minus "sign_extend") ++ (us_minus "zero_extend")]) ++ + ;; Whether a shift is left or right. + (define_code_attr lr [(ashift "l") (ashiftrt "r") (lshiftrt "r")]) + +@@ -1434,35 +1724,45 @@ + (smax "smax") + (umin "umin") + (umax "umax") ++ (ashift "lsl") ++ (ashiftrt "asr") ++ (lshiftrt "lsr") + (and "and") + (ior "orr") + (xor "eor") + (not "not") ++ (clrsb "cls") ++ (clz "clz") + (popcount "cnt")]) + + (define_code_attr sve_int_op_rev [(plus "add") +- (minus "subr") +- (mult "mul") +- (div "sdivr") +- (udiv "udivr") +- (smin "smin") +- (smax "smax") +- (umin "umin") +- (umax "umax") +- (and "and") +- (ior "orr") +- (xor "eor")]) ++ (minus "subr") ++ (mult "mul") ++ (div "sdivr") ++ (udiv "udivr") ++ (smin "smin") ++ (smax "smax") ++ (umin "umin") ++ (umax "umax") ++ (ashift "lslr") ++ (ashiftrt "asrr") ++ (lshiftrt "lsrr") ++ (and "and") ++ (ior "orr") ++ (xor "eor")]) + + ;; The floating-point SVE instruction that implements an rtx code. + (define_code_attr sve_fp_op [(plus "fadd") + (minus "fsub") +- (mult "fmul") +- (neg "fneg") +- (abs "fabs") +- (sqrt "fsqrt")]) ++ (mult "fmul")]) + + ;; The SVE immediate constraint to use for an rtl code. +-(define_code_attr sve_imm_con [(eq "vsc") ++(define_code_attr sve_imm_con [(mult "vsm") ++ (smax "vsm") ++ (smin "vsm") ++ (umax "vsb") ++ (umin "vsb") ++ (eq "vsc") + (ne "vsc") + (lt "vsc") + (ge "vsc") +@@ -1473,6 +1773,33 @@ + (geu "vsd") + (gtu "vsd")]) + ++;; The prefix letter to use when printing an immediate operand. ++(define_code_attr sve_imm_prefix [(mult "") ++ (smax "") ++ (smin "") ++ (umax "D") ++ (umin "D")]) ++ ++;; The predicate to use for the second input operand in a cond_ ++;; pattern. ++(define_code_attr sve_pred_int_rhs2_operand ++ [(plus "register_operand") ++ (minus "register_operand") ++ (mult "register_operand") ++ (smax "register_operand") ++ (umax "register_operand") ++ (smin "register_operand") ++ (umin "register_operand") ++ (ashift "aarch64_sve_lshift_operand") ++ (ashiftrt "aarch64_sve_rshift_operand") ++ (lshiftrt "aarch64_sve_rshift_operand") ++ (and "aarch64_sve_pred_and_operand") ++ (ior "register_operand") ++ (xor "register_operand")]) ++ ++(define_code_attr inc_dec [(minus "dec") (ss_minus "sqdec") (us_minus "uqdec") ++ (plus "inc") (ss_plus "sqinc") (us_plus "uqinc")]) ++ + ;; ------------------------------------------------------------------- + ;; Int Iterators. 
+ ;; ------------------------------------------------------------------- +@@ -1492,7 +1819,7 @@ + (define_int_iterator FMAXMINV [UNSPEC_FMAXV UNSPEC_FMINV + UNSPEC_FMAXNMV UNSPEC_FMINNMV]) + +-(define_int_iterator BITWISEV [UNSPEC_ANDV UNSPEC_IORV UNSPEC_XORV]) ++(define_int_iterator SVE_INT_ADDV [UNSPEC_SADDV UNSPEC_UADDV]) + + (define_int_iterator LOGICALF [UNSPEC_ANDF UNSPEC_IORF UNSPEC_XORF]) + +@@ -1505,8 +1832,20 @@ + + (define_int_iterator RHADD [UNSPEC_SRHADD UNSPEC_URHADD]) + ++(define_int_iterator MULLBT [UNSPEC_SMULLB UNSPEC_UMULLB ++ UNSPEC_SMULLT UNSPEC_UMULLT]) ++ ++(define_int_iterator SHRNB [UNSPEC_SHRNB UNSPEC_RSHRNB]) ++ ++(define_int_iterator SHRNT [UNSPEC_SHRNT UNSPEC_RSHRNT]) ++ ++(define_int_iterator BSL_DUP [1 2]) ++ + (define_int_iterator DOTPROD [UNSPEC_SDOT UNSPEC_UDOT]) + ++(define_int_iterator DOTPROD_I8MM [UNSPEC_USDOT UNSPEC_SUDOT]) ++(define_int_iterator DOTPROD_US_ONLY [UNSPEC_USDOT]) ++ + (define_int_iterator ADDSUBHN [UNSPEC_ADDHN UNSPEC_RADDHN + UNSPEC_SUBHN UNSPEC_RSUBHN]) + +@@ -1516,12 +1855,17 @@ + (define_int_iterator FMAXMIN_UNS [UNSPEC_FMAX UNSPEC_FMIN + UNSPEC_FMAXNM UNSPEC_FMINNM]) + +-(define_int_iterator PAUTH_LR_SP [UNSPEC_PACISP UNSPEC_AUTISP]) ++(define_int_iterator PAUTH_LR_SP [UNSPEC_PACIASP UNSPEC_AUTIASP ++ UNSPEC_PACIBSP UNSPEC_AUTIBSP]) + +-(define_int_iterator PAUTH_17_16 [UNSPEC_PACI1716 UNSPEC_AUTI1716]) ++(define_int_iterator PAUTH_17_16 [UNSPEC_PACIA1716 UNSPEC_AUTIA1716 ++ UNSPEC_PACIB1716 UNSPEC_AUTIB1716]) + + (define_int_iterator VQDMULH [UNSPEC_SQDMULH UNSPEC_SQRDMULH]) + ++(define_int_iterator MULHRS [UNSPEC_SMULHS UNSPEC_UMULHS ++ UNSPEC_SMULHRS UNSPEC_UMULHRS]) ++ + (define_int_iterator USSUQADD [UNSPEC_SUQADD UNSPEC_USQADD]) + + (define_int_iterator SUQMOVN [UNSPEC_SQXTN UNSPEC_UQXTN]) +@@ -1555,6 +1899,10 @@ + UNSPEC_TRN1 UNSPEC_TRN2 + UNSPEC_UZP1 UNSPEC_UZP2]) + ++(define_int_iterator PERMUTEQ [UNSPEC_ZIP1Q UNSPEC_ZIP2Q ++ UNSPEC_TRN1Q UNSPEC_TRN2Q ++ UNSPEC_UZP1Q UNSPEC_UZP2Q]) ++ + (define_int_iterator OPTAB_PERMUTE [UNSPEC_ZIP1 UNSPEC_ZIP2 + UNSPEC_UZP1 UNSPEC_UZP2]) + +@@ -1601,18 +1949,144 @@ + + (define_int_iterator MUL_HIGHPART [UNSPEC_SMUL_HIGHPART UNSPEC_UMUL_HIGHPART]) + +-(define_int_iterator SVE_COND_FP_BINARY [UNSPEC_COND_ADD UNSPEC_COND_SUB +- UNSPEC_COND_MUL UNSPEC_COND_DIV +- UNSPEC_COND_MAX UNSPEC_COND_MIN]) ++(define_int_iterator CLAST [UNSPEC_CLASTA UNSPEC_CLASTB]) ++ ++(define_int_iterator LAST [UNSPEC_LASTA UNSPEC_LASTB]) ++ ++(define_int_iterator SVE_INT_UNARY [UNSPEC_RBIT UNSPEC_REVB ++ UNSPEC_REVH UNSPEC_REVW]) ++ ++(define_int_iterator SVE_FP_UNARY [UNSPEC_FRECPE UNSPEC_RSQRTE]) ++ ++(define_int_iterator SVE_FP_UNARY_INT [UNSPEC_FEXPA]) ++ ++(define_int_iterator SVE_FP_BINARY [UNSPEC_FRECPS UNSPEC_RSQRTS]) ++ ++(define_int_iterator SVE_FP_BINARY_INT [UNSPEC_FTSMUL UNSPEC_FTSSEL]) ++ ++(define_int_iterator SVE_BFLOAT_TERNARY_LONG [UNSPEC_BFDOT ++ UNSPEC_BFMLALB ++ UNSPEC_BFMLALT ++ UNSPEC_BFMMLA]) ++ ++(define_int_iterator SVE_BFLOAT_TERNARY_LONG_LANE [UNSPEC_BFDOT ++ UNSPEC_BFMLALB ++ UNSPEC_BFMLALT]) ++ ++(define_int_iterator SVE_INT_REDUCTION [UNSPEC_ANDV ++ UNSPEC_IORV ++ UNSPEC_SMAXV ++ UNSPEC_SMINV ++ UNSPEC_UMAXV ++ UNSPEC_UMINV ++ UNSPEC_XORV]) ++ ++(define_int_iterator SVE_FP_REDUCTION [UNSPEC_FADDV ++ UNSPEC_FMAXV ++ UNSPEC_FMAXNMV ++ UNSPEC_FMINV ++ UNSPEC_FMINNMV]) ++ ++(define_int_iterator SVE_COND_FP_UNARY [UNSPEC_COND_FABS ++ UNSPEC_COND_FNEG ++ UNSPEC_COND_FRECPX ++ UNSPEC_COND_FRINTA ++ UNSPEC_COND_FRINTI ++ UNSPEC_COND_FRINTM ++ UNSPEC_COND_FRINTN ++ UNSPEC_COND_FRINTP ++ 
UNSPEC_COND_FRINTX ++ UNSPEC_COND_FRINTZ ++ UNSPEC_COND_FSQRT]) ++ ++(define_int_iterator SVE_COND_FCVT [UNSPEC_COND_FCVT]) ++(define_int_iterator SVE_COND_FCVTI [UNSPEC_COND_FCVTZS UNSPEC_COND_FCVTZU]) ++(define_int_iterator SVE_COND_ICVTF [UNSPEC_COND_SCVTF UNSPEC_COND_UCVTF]) ++ ++(define_int_iterator SVE_COND_FP_BINARY [UNSPEC_COND_FADD ++ UNSPEC_COND_FDIV ++ UNSPEC_COND_FMAX ++ UNSPEC_COND_FMAXNM ++ UNSPEC_COND_FMIN ++ UNSPEC_COND_FMINNM ++ UNSPEC_COND_FMUL ++ UNSPEC_COND_FMULX ++ UNSPEC_COND_FSUB]) ++ ++(define_int_iterator SVE_COND_FP_BINARY_INT [UNSPEC_COND_FSCALE]) ++ ++(define_int_iterator SVE_COND_FP_ADD [UNSPEC_COND_FADD]) ++(define_int_iterator SVE_COND_FP_SUB [UNSPEC_COND_FSUB]) ++(define_int_iterator SVE_COND_FP_MUL [UNSPEC_COND_FMUL]) ++ ++(define_int_iterator SVE_COND_FP_BINARY_I1 [UNSPEC_COND_FMAX ++ UNSPEC_COND_FMAXNM ++ UNSPEC_COND_FMIN ++ UNSPEC_COND_FMINNM ++ UNSPEC_COND_FMUL]) ++ ++(define_int_iterator SVE_COND_FP_BINARY_REG [UNSPEC_COND_FDIV ++ UNSPEC_COND_FMULX]) ++ ++(define_int_iterator SVE_COND_FCADD [UNSPEC_COND_FCADD90 ++ UNSPEC_COND_FCADD270]) ++ ++(define_int_iterator SVE_COND_FP_MAXMIN [UNSPEC_COND_FMAX ++ UNSPEC_COND_FMAXNM ++ UNSPEC_COND_FMIN ++ UNSPEC_COND_FMINNM]) ++ ++;; Floating-point max/min operations that correspond to optabs, ++;; as opposed to those that are internal to the port. ++(define_int_iterator SVE_COND_FP_MAXMIN_PUBLIC [UNSPEC_COND_FMAXNM ++ UNSPEC_COND_FMINNM]) + + (define_int_iterator SVE_COND_FP_TERNARY [UNSPEC_COND_FMLA + UNSPEC_COND_FMLS + UNSPEC_COND_FNMLA + UNSPEC_COND_FNMLS]) + +-(define_int_iterator SVE_COND_FP_CMP [UNSPEC_COND_LT UNSPEC_COND_LE +- UNSPEC_COND_EQ UNSPEC_COND_NE +- UNSPEC_COND_GE UNSPEC_COND_GT]) ++(define_int_iterator SVE_COND_FCMLA [UNSPEC_COND_FCMLA ++ UNSPEC_COND_FCMLA90 ++ UNSPEC_COND_FCMLA180 ++ UNSPEC_COND_FCMLA270]) ++ ++(define_int_iterator SVE_COND_INT_CMP_WIDE [UNSPEC_COND_CMPEQ_WIDE ++ UNSPEC_COND_CMPGE_WIDE ++ UNSPEC_COND_CMPGT_WIDE ++ UNSPEC_COND_CMPHI_WIDE ++ UNSPEC_COND_CMPHS_WIDE ++ UNSPEC_COND_CMPLE_WIDE ++ UNSPEC_COND_CMPLO_WIDE ++ UNSPEC_COND_CMPLS_WIDE ++ UNSPEC_COND_CMPLT_WIDE ++ UNSPEC_COND_CMPNE_WIDE]) ++ ++;; SVE FP comparisons that accept #0.0. 
++(define_int_iterator SVE_COND_FP_CMP_I0 [UNSPEC_COND_FCMEQ ++ UNSPEC_COND_FCMGE ++ UNSPEC_COND_FCMGT ++ UNSPEC_COND_FCMLE ++ UNSPEC_COND_FCMLT ++ UNSPEC_COND_FCMNE]) ++ ++(define_int_iterator SVE_COND_FP_ABS_CMP [UNSPEC_COND_FCMGE ++ UNSPEC_COND_FCMGT ++ UNSPEC_COND_FCMLE ++ UNSPEC_COND_FCMLT]) ++ ++(define_int_iterator SVE_FP_TERNARY_LANE [UNSPEC_FMLA UNSPEC_FMLS]) ++ ++(define_int_iterator SVE_CFP_TERNARY_LANE [UNSPEC_FCMLA UNSPEC_FCMLA90 ++ UNSPEC_FCMLA180 UNSPEC_FCMLA270]) ++ ++(define_int_iterator SVE_WHILE [UNSPEC_WHILELE UNSPEC_WHILELO UNSPEC_WHILELS UNSPEC_WHILELT]) ++ ++(define_int_iterator SVE_SHIFT_WIDE [UNSPEC_ASHIFT_WIDE ++ UNSPEC_ASHIFTRT_WIDE ++ UNSPEC_LSHIFTRT_WIDE]) ++ ++(define_int_iterator SVE_LDFF1_LDNF1 [UNSPEC_LDFF1 UNSPEC_LDNF1]) + + (define_int_iterator FCADD [UNSPEC_FCADD90 + UNSPEC_FCADD270]) +@@ -1622,6 +2096,23 @@ + UNSPEC_FCMLA180 + UNSPEC_FCMLA270]) + ++(define_int_iterator FRINTNZX [UNSPEC_FRINT32Z UNSPEC_FRINT32X ++ UNSPEC_FRINT64Z UNSPEC_FRINT64X]) ++ ++(define_int_iterator SVE_BRK_UNARY [UNSPEC_BRKA UNSPEC_BRKB]) ++ ++(define_int_iterator SVE_BRK_BINARY [UNSPEC_BRKN UNSPEC_BRKPA UNSPEC_BRKPB]) ++ ++(define_int_iterator SVE_PITER [UNSPEC_PFIRST UNSPEC_PNEXT]) ++ ++(define_int_iterator MATMUL [UNSPEC_SMATMUL UNSPEC_UMATMUL ++ UNSPEC_USMATMUL]) ++ ++(define_int_iterator FMMLA [UNSPEC_FMMLA]) ++ ++(define_int_iterator BF_MLA [UNSPEC_BFMLALB ++ UNSPEC_BFMLALT]) ++ + ;; Iterators for atomic operations. + + (define_int_iterator ATOMIC_LDOP +@@ -1646,19 +2137,84 @@ + (define_int_attr optab [(UNSPEC_ANDF "and") + (UNSPEC_IORF "ior") + (UNSPEC_XORF "xor") ++ (UNSPEC_SADDV "sadd") ++ (UNSPEC_UADDV "uadd") + (UNSPEC_ANDV "and") + (UNSPEC_IORV "ior") + (UNSPEC_XORV "xor") +- (UNSPEC_COND_ADD "add") +- (UNSPEC_COND_SUB "sub") +- (UNSPEC_COND_MUL "mul") +- (UNSPEC_COND_DIV "div") +- (UNSPEC_COND_MAX "smax") +- (UNSPEC_COND_MIN "smin") ++ (UNSPEC_FRECPE "frecpe") ++ (UNSPEC_FRECPS "frecps") ++ (UNSPEC_RSQRTE "frsqrte") ++ (UNSPEC_RSQRTS "frsqrts") ++ (UNSPEC_RBIT "rbit") ++ (UNSPEC_REVB "revb") ++ (UNSPEC_REVH "revh") ++ (UNSPEC_REVW "revw") ++ (UNSPEC_UMAXV "umax") ++ (UNSPEC_UMINV "umin") ++ (UNSPEC_SMAXV "smax") ++ (UNSPEC_SMINV "smin") ++ (UNSPEC_FADDV "plus") ++ (UNSPEC_FMAXNMV "smax") ++ (UNSPEC_FMAXV "smax_nan") ++ (UNSPEC_FMINNMV "smin") ++ (UNSPEC_FMINV "smin_nan") ++ (UNSPEC_SMUL_HIGHPART "smulh") ++ (UNSPEC_UMUL_HIGHPART "umulh") ++ (UNSPEC_FMLA "fma") ++ (UNSPEC_FMLS "fnma") ++ (UNSPEC_FCMLA "fcmla") ++ (UNSPEC_FCMLA90 "fcmla90") ++ (UNSPEC_FCMLA180 "fcmla180") ++ (UNSPEC_FCMLA270 "fcmla270") ++ (UNSPEC_FEXPA "fexpa") ++ (UNSPEC_FTSMUL "ftsmul") ++ (UNSPEC_FTSSEL "ftssel") ++ (UNSPEC_SMATMUL "smatmul") ++ (UNSPEC_TRN1Q "trn1q") ++ (UNSPEC_TRN2Q "trn2q") ++ (UNSPEC_UMATMUL "umatmul") ++ (UNSPEC_USMATMUL "usmatmul") ++ (UNSPEC_UZP1Q "uzp1q") ++ (UNSPEC_UZP2Q "uzp2q") ++ (UNSPEC_ZIP1Q "zip1q") ++ (UNSPEC_ZIP2Q "zip2q") ++ (UNSPEC_COND_FABS "abs") ++ (UNSPEC_COND_FADD "add") ++ (UNSPEC_COND_FCADD90 "cadd90") ++ (UNSPEC_COND_FCADD270 "cadd270") ++ (UNSPEC_COND_FCMLA "fcmla") ++ (UNSPEC_COND_FCMLA90 "fcmla90") ++ (UNSPEC_COND_FCMLA180 "fcmla180") ++ (UNSPEC_COND_FCMLA270 "fcmla270") ++ (UNSPEC_COND_FCVT "fcvt") ++ (UNSPEC_COND_FCVTZS "fix_trunc") ++ (UNSPEC_COND_FCVTZU "fixuns_trunc") ++ (UNSPEC_COND_FDIV "div") ++ (UNSPEC_COND_FMAX "smax_nan") ++ (UNSPEC_COND_FMAXNM "smax") ++ (UNSPEC_COND_FMIN "smin_nan") ++ (UNSPEC_COND_FMINNM "smin") + (UNSPEC_COND_FMLA "fma") + (UNSPEC_COND_FMLS "fnma") ++ (UNSPEC_COND_FMUL "mul") ++ (UNSPEC_COND_FMULX "mulx") ++ 
(UNSPEC_COND_FNEG "neg") + (UNSPEC_COND_FNMLA "fnms") +- (UNSPEC_COND_FNMLS "fms")]) ++ (UNSPEC_COND_FNMLS "fms") ++ (UNSPEC_COND_FRECPX "frecpx") ++ (UNSPEC_COND_FRINTA "round") ++ (UNSPEC_COND_FRINTI "nearbyint") ++ (UNSPEC_COND_FRINTM "floor") ++ (UNSPEC_COND_FRINTN "frintn") ++ (UNSPEC_COND_FRINTP "ceil") ++ (UNSPEC_COND_FRINTX "rint") ++ (UNSPEC_COND_FRINTZ "btrunc") ++ (UNSPEC_COND_FSCALE "fscale") ++ (UNSPEC_COND_FSQRT "sqrt") ++ (UNSPEC_COND_FSUB "sub") ++ (UNSPEC_COND_SCVTF "float") ++ (UNSPEC_COND_UCVTF "floatuns")]) + + (define_int_attr maxmin_uns [(UNSPEC_UMAXV "umax") + (UNSPEC_UMINV "umin") +@@ -1671,7 +2227,11 @@ + (UNSPEC_FMINNMV "smin") + (UNSPEC_FMINV "smin_nan") + (UNSPEC_FMAXNM "fmax") +- (UNSPEC_FMINNM "fmin")]) ++ (UNSPEC_FMINNM "fmin") ++ (UNSPEC_COND_FMAX "fmax_nan") ++ (UNSPEC_COND_FMAXNM "fmax") ++ (UNSPEC_COND_FMIN "fmin_nan") ++ (UNSPEC_COND_FMINNM "fmin")]) + + (define_int_attr maxmin_uns_op [(UNSPEC_UMAXV "umax") + (UNSPEC_UMINV "umin") +@@ -1686,22 +2246,41 @@ + (UNSPEC_FMAXNM "fmaxnm") + (UNSPEC_FMINNM "fminnm")]) + +-(define_int_attr bit_reduc_op [(UNSPEC_ANDV "andv") +- (UNSPEC_IORV "orv") +- (UNSPEC_XORV "eorv")]) ++(define_code_attr binqops_op [(ss_plus "sqadd") ++ (us_plus "uqadd") ++ (ss_minus "sqsub") ++ (us_minus "uqsub")]) ++ ++(define_code_attr binqops_op_rev [(ss_plus "sqsub") ++ (ss_minus "sqadd")]) + + ;; The SVE logical instruction that implements an unspec. + (define_int_attr logicalf_op [(UNSPEC_ANDF "and") + (UNSPEC_IORF "orr") + (UNSPEC_XORF "eor")]) + ++(define_int_attr last_op [(UNSPEC_CLASTA "after_last") ++ (UNSPEC_CLASTB "last") ++ (UNSPEC_LASTA "after_last") ++ (UNSPEC_LASTB "last")]) ++ + ;; "s" for signed operations and "u" for unsigned ones. +-(define_int_attr su [(UNSPEC_UNPACKSHI "s") ++(define_int_attr su [(UNSPEC_SADDV "s") ++ (UNSPEC_UADDV "u") ++ (UNSPEC_UNPACKSHI "s") + (UNSPEC_UNPACKUHI "u") + (UNSPEC_UNPACKSLO "s") + (UNSPEC_UNPACKULO "u") + (UNSPEC_SMUL_HIGHPART "s") +- (UNSPEC_UMUL_HIGHPART "u")]) ++ (UNSPEC_UMUL_HIGHPART "u") ++ (UNSPEC_COND_FCVTZS "s") ++ (UNSPEC_COND_FCVTZU "u") ++ (UNSPEC_COND_SCVTF "s") ++ (UNSPEC_COND_UCVTF "u") ++ (UNSPEC_SMULLB "s") (UNSPEC_UMULLB "u") ++ (UNSPEC_SMULLT "s") (UNSPEC_UMULLT "u") ++ (UNSPEC_SMULHS "s") (UNSPEC_UMULHS "u") ++ (UNSPEC_SMULHRS "s") (UNSPEC_UMULHRS "u")]) + + (define_int_attr sur [(UNSPEC_SHADD "s") (UNSPEC_UHADD "u") + (UNSPEC_SRHADD "sr") (UNSPEC_URHADD "ur") +@@ -1731,6 +2310,9 @@ + (UNSPEC_URSHL "ur") (UNSPEC_SRSHL "sr") + (UNSPEC_UQRSHL "u") (UNSPEC_SQRSHL "s") + (UNSPEC_SDOT "s") (UNSPEC_UDOT "u") ++ (UNSPEC_USDOT "us") (UNSPEC_SUDOT "su") ++ (UNSPEC_SMATMUL "s") (UNSPEC_UMATMUL "u") ++ (UNSPEC_USMATMUL "us") + ]) + + (define_int_attr r [(UNSPEC_SQDMULH "") (UNSPEC_SQRDMULH "r") +@@ -1739,6 +2321,10 @@ + (UNSPEC_SQRSHRN "r") (UNSPEC_UQRSHRN "r") + (UNSPEC_SQSHL "") (UNSPEC_UQSHL "") + (UNSPEC_SQRSHL "r")(UNSPEC_UQRSHL "r") ++ (UNSPEC_SHRNB "") (UNSPEC_SHRNT "") ++ (UNSPEC_RSHRNB "r") (UNSPEC_RSHRNT "r") ++ (UNSPEC_SMULHS "") (UNSPEC_UMULHS "") ++ (UNSPEC_SMULHRS "r") (UNSPEC_UMULHRS "r") + ]) + + (define_int_attr lr [(UNSPEC_SSLI "l") (UNSPEC_USLI "l") +@@ -1751,6 +2337,13 @@ + (UNSPEC_SHADD "") (UNSPEC_UHADD "u") + (UNSPEC_SRHADD "") (UNSPEC_URHADD "u")]) + ++(define_int_attr fn [(UNSPEC_LDFF1 "f") (UNSPEC_LDNF1 "n")]) ++ ++(define_int_attr ab [(UNSPEC_CLASTA "a") (UNSPEC_CLASTB "b") ++ (UNSPEC_LASTA "a") (UNSPEC_LASTB "b")]) ++ ++(define_int_attr bt [(UNSPEC_BFMLALB "b") (UNSPEC_BFMLALT "t")]) ++ + (define_int_attr addsub [(UNSPEC_SHADD "add") + (UNSPEC_UHADD 
"add") + (UNSPEC_SRHADD "add") +@@ -1768,6 +2361,18 @@ + (UNSPEC_RADDHN2 "add") + (UNSPEC_RSUBHN2 "sub")]) + ++;; BSL variants: first commutative operand. ++(define_int_attr bsl_1st [(1 "w") (2 "0")]) ++ ++;; BSL variants: second commutative operand. ++(define_int_attr bsl_2nd [(1 "0") (2 "w")]) ++ ++;; BSL variants: duplicated input operand. ++(define_int_attr bsl_dup [(1 "1") (2 "2")]) ++ ++;; BSL variants: operand which requires preserving via movprfx. ++(define_int_attr bsl_mov [(1 "2") (2 "1")]) ++ + (define_int_attr offsetlr [(UNSPEC_SSLI "") (UNSPEC_USLI "") + (UNSPEC_SSRI "offset_") + (UNSPEC_USRI "offset_")]) +@@ -1797,29 +2402,47 @@ + (UNSPEC_FCVTZU "fcvtzu")]) + + ;; Pointer authentication mnemonic prefix. +-(define_int_attr pauth_mnem_prefix [(UNSPEC_PACISP "paci") +- (UNSPEC_AUTISP "auti") +- (UNSPEC_PACI1716 "paci") +- (UNSPEC_AUTI1716 "auti")]) +- +-;; Pointer authentication HINT number for NOP space instructions using A Key. +-(define_int_attr pauth_hint_num_a [(UNSPEC_PACISP "25") +- (UNSPEC_AUTISP "29") +- (UNSPEC_PACI1716 "8") +- (UNSPEC_AUTI1716 "12")]) +- +-(define_int_attr perm_insn [(UNSPEC_ZIP1 "zip") (UNSPEC_ZIP2 "zip") +- (UNSPEC_TRN1 "trn") (UNSPEC_TRN2 "trn") +- (UNSPEC_UZP1 "uzp") (UNSPEC_UZP2 "uzp")]) ++(define_int_attr pauth_mnem_prefix [(UNSPEC_PACIASP "pacia") ++ (UNSPEC_PACIBSP "pacib") ++ (UNSPEC_PACIA1716 "pacia") ++ (UNSPEC_PACIB1716 "pacib") ++ (UNSPEC_AUTIASP "autia") ++ (UNSPEC_AUTIBSP "autib") ++ (UNSPEC_AUTIA1716 "autia") ++ (UNSPEC_AUTIB1716 "autib")]) ++ ++(define_int_attr pauth_key [(UNSPEC_PACIASP "AARCH64_KEY_A") ++ (UNSPEC_PACIBSP "AARCH64_KEY_B") ++ (UNSPEC_PACIA1716 "AARCH64_KEY_A") ++ (UNSPEC_PACIB1716 "AARCH64_KEY_B") ++ (UNSPEC_AUTIASP "AARCH64_KEY_A") ++ (UNSPEC_AUTIBSP "AARCH64_KEY_B") ++ (UNSPEC_AUTIA1716 "AARCH64_KEY_A") ++ (UNSPEC_AUTIB1716 "AARCH64_KEY_B")]) ++ ++;; Pointer authentication HINT number for NOP space instructions using A and ++;; B key. ++(define_int_attr pauth_hint_num [(UNSPEC_PACIASP "25") ++ (UNSPEC_PACIBSP "27") ++ (UNSPEC_AUTIASP "29") ++ (UNSPEC_AUTIBSP "31") ++ (UNSPEC_PACIA1716 "8") ++ (UNSPEC_PACIB1716 "10") ++ (UNSPEC_AUTIA1716 "12") ++ (UNSPEC_AUTIB1716 "14")]) ++ ++(define_int_attr perm_insn [(UNSPEC_ZIP1 "zip1") (UNSPEC_ZIP2 "zip2") ++ (UNSPEC_ZIP1Q "zip1") (UNSPEC_ZIP2Q "zip2") ++ (UNSPEC_TRN1 "trn1") (UNSPEC_TRN2 "trn2") ++ (UNSPEC_TRN1Q "trn1") (UNSPEC_TRN2Q "trn2") ++ (UNSPEC_UZP1 "uzp1") (UNSPEC_UZP2 "uzp2") ++ (UNSPEC_UZP1Q "uzp1") (UNSPEC_UZP2Q "uzp2")]) + + ; op code for REV instructions (size within which elements are reversed). + (define_int_attr rev_op [(UNSPEC_REV64 "64") (UNSPEC_REV32 "32") + (UNSPEC_REV16 "16")]) + +-(define_int_attr perm_hilo [(UNSPEC_ZIP1 "1") (UNSPEC_ZIP2 "2") +- (UNSPEC_TRN1 "1") (UNSPEC_TRN2 "2") +- (UNSPEC_UZP1 "1") (UNSPEC_UZP2 "2") +- (UNSPEC_UNPACKSHI "hi") (UNSPEC_UNPACKUHI "hi") ++(define_int_attr perm_hilo [(UNSPEC_UNPACKSHI "hi") (UNSPEC_UNPACKUHI "hi") + (UNSPEC_UNPACKSLO "lo") (UNSPEC_UNPACKULO "lo")]) + + ;; Return true if the associated optab refers to the high-numbered lanes, +@@ -1861,34 +2484,122 @@ + (define_int_attr f16mac1 [(UNSPEC_FMLAL "a") (UNSPEC_FMLSL "s") + (UNSPEC_FMLAL2 "a") (UNSPEC_FMLSL2 "s")]) + ++(define_int_attr frintnzs_op [(UNSPEC_FRINT32Z "frint32z") (UNSPEC_FRINT32X "frint32x") ++ (UNSPEC_FRINT64Z "frint64z") (UNSPEC_FRINT64X "frint64x")]) ++ + ;; The condition associated with an UNSPEC_COND_. 
+-(define_int_attr cmp_op [(UNSPEC_COND_LT "lt") +- (UNSPEC_COND_LE "le") +- (UNSPEC_COND_EQ "eq") +- (UNSPEC_COND_NE "ne") +- (UNSPEC_COND_GE "ge") +- (UNSPEC_COND_GT "gt")]) +- +-(define_int_attr sve_fp_op [(UNSPEC_COND_ADD "fadd") +- (UNSPEC_COND_SUB "fsub") +- (UNSPEC_COND_MUL "fmul") +- (UNSPEC_COND_DIV "fdiv") +- (UNSPEC_COND_MAX "fmaxnm") +- (UNSPEC_COND_MIN "fminnm")]) +- +-(define_int_attr sve_fp_op_rev [(UNSPEC_COND_ADD "fadd") +- (UNSPEC_COND_SUB "fsubr") +- (UNSPEC_COND_MUL "fmul") +- (UNSPEC_COND_DIV "fdivr") +- (UNSPEC_COND_MAX "fmaxnm") +- (UNSPEC_COND_MIN "fminnm")]) ++(define_int_attr cmp_op [(UNSPEC_COND_CMPEQ_WIDE "eq") ++ (UNSPEC_COND_CMPGE_WIDE "ge") ++ (UNSPEC_COND_CMPGT_WIDE "gt") ++ (UNSPEC_COND_CMPHI_WIDE "hi") ++ (UNSPEC_COND_CMPHS_WIDE "hs") ++ (UNSPEC_COND_CMPLE_WIDE "le") ++ (UNSPEC_COND_CMPLO_WIDE "lo") ++ (UNSPEC_COND_CMPLS_WIDE "ls") ++ (UNSPEC_COND_CMPLT_WIDE "lt") ++ (UNSPEC_COND_CMPNE_WIDE "ne") ++ (UNSPEC_COND_FCMEQ "eq") ++ (UNSPEC_COND_FCMGE "ge") ++ (UNSPEC_COND_FCMGT "gt") ++ (UNSPEC_COND_FCMLE "le") ++ (UNSPEC_COND_FCMLT "lt") ++ (UNSPEC_COND_FCMNE "ne") ++ (UNSPEC_WHILELE "le") ++ (UNSPEC_WHILELO "lo") ++ (UNSPEC_WHILELS "ls") ++ (UNSPEC_WHILELT "lt")]) ++ ++(define_int_attr while_optab_cmp [(UNSPEC_WHILELE "le") ++ (UNSPEC_WHILELO "ult") ++ (UNSPEC_WHILELS "ule") ++ (UNSPEC_WHILELT "lt")]) ++ ++(define_int_attr brk_op [(UNSPEC_BRKA "a") (UNSPEC_BRKB "b") ++ (UNSPEC_BRKN "n") ++ (UNSPEC_BRKPA "pa") (UNSPEC_BRKPB "pb")]) ++ ++(define_int_attr sve_pred_op [(UNSPEC_PFIRST "pfirst") (UNSPEC_PNEXT "pnext")]) ++ ++(define_int_attr sve_int_op [(UNSPEC_ANDV "andv") ++ (UNSPEC_IORV "orv") ++ (UNSPEC_XORV "eorv") ++ (UNSPEC_UMAXV "umaxv") ++ (UNSPEC_UMINV "uminv") ++ (UNSPEC_SMAXV "smaxv") ++ (UNSPEC_SMINV "sminv") ++ (UNSPEC_SMUL_HIGHPART "smulh") ++ (UNSPEC_UMUL_HIGHPART "umulh") ++ (UNSPEC_ASHIFT_WIDE "lsl") ++ (UNSPEC_ASHIFTRT_WIDE "asr") ++ (UNSPEC_LSHIFTRT_WIDE "lsr") ++ (UNSPEC_RBIT "rbit") ++ (UNSPEC_REVB "revb") ++ (UNSPEC_REVH "revh") ++ (UNSPEC_REVW "revw")]) ++ ++(define_int_attr sve_fp_op [(UNSPEC_BFDOT "bfdot") ++ (UNSPEC_BFMLALB "bfmlalb") ++ (UNSPEC_BFMLALT "bfmlalt") ++ (UNSPEC_BFMMLA "bfmmla") ++ (UNSPEC_FRECPE "frecpe") ++ (UNSPEC_FRECPS "frecps") ++ (UNSPEC_RSQRTE "frsqrte") ++ (UNSPEC_RSQRTS "frsqrts") ++ (UNSPEC_FADDV "faddv") ++ (UNSPEC_FEXPA "fexpa") ++ (UNSPEC_FMAXNMV "fmaxnmv") ++ (UNSPEC_FMAXV "fmaxv") ++ (UNSPEC_FMINNMV "fminnmv") ++ (UNSPEC_FMINV "fminv") ++ (UNSPEC_FMLA "fmla") ++ (UNSPEC_FMLS "fmls") ++ (UNSPEC_FMMLA "fmmla") ++ (UNSPEC_FTSMUL "ftsmul") ++ (UNSPEC_FTSSEL "ftssel") ++ (UNSPEC_COND_FABS "fabs") ++ (UNSPEC_COND_FADD "fadd") ++ (UNSPEC_COND_FDIV "fdiv") ++ (UNSPEC_COND_FMAX "fmax") ++ (UNSPEC_COND_FMAXNM "fmaxnm") ++ (UNSPEC_COND_FMIN "fmin") ++ (UNSPEC_COND_FMINNM "fminnm") ++ (UNSPEC_COND_FMUL "fmul") ++ (UNSPEC_COND_FMULX "fmulx") ++ (UNSPEC_COND_FNEG "fneg") ++ (UNSPEC_COND_FRECPX "frecpx") ++ (UNSPEC_COND_FRINTA "frinta") ++ (UNSPEC_COND_FRINTI "frinti") ++ (UNSPEC_COND_FRINTM "frintm") ++ (UNSPEC_COND_FRINTN "frintn") ++ (UNSPEC_COND_FRINTP "frintp") ++ (UNSPEC_COND_FRINTX "frintx") ++ (UNSPEC_COND_FRINTZ "frintz") ++ (UNSPEC_COND_FSCALE "fscale") ++ (UNSPEC_COND_FSQRT "fsqrt") ++ (UNSPEC_COND_FSUB "fsub")]) ++ ++(define_int_attr sve_fp_op_rev [(UNSPEC_COND_FADD "fadd") ++ (UNSPEC_COND_FDIV "fdivr") ++ (UNSPEC_COND_FMAX "fmax") ++ (UNSPEC_COND_FMAXNM "fmaxnm") ++ (UNSPEC_COND_FMIN "fmin") ++ (UNSPEC_COND_FMINNM "fminnm") ++ (UNSPEC_COND_FMUL "fmul") ++ (UNSPEC_COND_FMULX "fmulx") ++ (UNSPEC_COND_FSUB 
"fsubr")]) + + (define_int_attr rot [(UNSPEC_FCADD90 "90") + (UNSPEC_FCADD270 "270") + (UNSPEC_FCMLA "0") + (UNSPEC_FCMLA90 "90") + (UNSPEC_FCMLA180 "180") +- (UNSPEC_FCMLA270 "270")]) ++ (UNSPEC_FCMLA270 "270") ++ (UNSPEC_COND_FCADD90 "90") ++ (UNSPEC_COND_FCADD270 "270") ++ (UNSPEC_COND_FCMLA "0") ++ (UNSPEC_COND_FCMLA90 "90") ++ (UNSPEC_COND_FCMLA180 "180") ++ (UNSPEC_COND_FCMLA270 "270")]) + + (define_int_attr sve_fmla_op [(UNSPEC_COND_FMLA "fmla") + (UNSPEC_COND_FMLS "fmls") +@@ -1900,9 +2611,54 @@ + (UNSPEC_COND_FNMLA "fnmad") + (UNSPEC_COND_FNMLS "fnmsb")]) + +-(define_int_attr commutative [(UNSPEC_COND_ADD "true") +- (UNSPEC_COND_SUB "false") +- (UNSPEC_COND_MUL "true") +- (UNSPEC_COND_DIV "false") +- (UNSPEC_COND_MIN "true") +- (UNSPEC_COND_MAX "true")]) ++;; The register constraint to use for the final operand in a binary BRK. ++(define_int_attr brk_reg_con [(UNSPEC_BRKN "0") ++ (UNSPEC_BRKPA "Upa") (UNSPEC_BRKPB "Upa")]) ++ ++;; The register number to print for the above. ++(define_int_attr brk_reg_opno [(UNSPEC_BRKN "0") ++ (UNSPEC_BRKPA "3") (UNSPEC_BRKPB "3")]) ++ ++;; The predicate to use for the first input operand in a floating-point ++;; 3 pattern. ++(define_int_attr sve_pred_fp_rhs1_operand ++ [(UNSPEC_COND_FADD "register_operand") ++ (UNSPEC_COND_FDIV "register_operand") ++ (UNSPEC_COND_FMAX "register_operand") ++ (UNSPEC_COND_FMAXNM "register_operand") ++ (UNSPEC_COND_FMIN "register_operand") ++ (UNSPEC_COND_FMINNM "register_operand") ++ (UNSPEC_COND_FMUL "register_operand") ++ (UNSPEC_COND_FMULX "register_operand") ++ (UNSPEC_COND_FSUB "aarch64_sve_float_arith_operand")]) ++ ++;; The predicate to use for the second input operand in a floating-point ++;; 3 pattern. ++(define_int_attr sve_pred_fp_rhs2_operand ++ [(UNSPEC_COND_FADD "aarch64_sve_float_arith_with_sub_operand") ++ (UNSPEC_COND_FDIV "register_operand") ++ (UNSPEC_COND_FMAX "aarch64_sve_float_maxmin_operand") ++ (UNSPEC_COND_FMAXNM "aarch64_sve_float_maxmin_operand") ++ (UNSPEC_COND_FMIN "aarch64_sve_float_maxmin_operand") ++ (UNSPEC_COND_FMINNM "aarch64_sve_float_maxmin_operand") ++ (UNSPEC_COND_FMUL "aarch64_sve_float_mul_operand") ++ (UNSPEC_COND_FMULX "register_operand") ++ (UNSPEC_COND_FSUB "register_operand")]) ++ ++;; Likewise for immediates only. ++(define_int_attr sve_pred_fp_rhs2_immediate ++ [(UNSPEC_COND_FMAX "aarch64_sve_float_maxmin_immediate") ++ (UNSPEC_COND_FMAXNM "aarch64_sve_float_maxmin_immediate") ++ (UNSPEC_COND_FMIN "aarch64_sve_float_maxmin_immediate") ++ (UNSPEC_COND_FMINNM "aarch64_sve_float_maxmin_immediate") ++ (UNSPEC_COND_FMUL "aarch64_sve_float_mul_immediate")]) ++ ++;; The maximum number of element bits that an instruction can handle. ++(define_int_attr max_elem_bits [(UNSPEC_UADDV "64") (UNSPEC_SADDV "32") ++ (UNSPEC_PFIRST "8") (UNSPEC_PNEXT "64")]) ++ ++;; The minimum number of element bits that an instruction can handle. 
++(define_int_attr min_elem_bits [(UNSPEC_RBIT "8") ++ (UNSPEC_REVB "16") ++ (UNSPEC_REVH "32") ++ (UNSPEC_REVW "64")]) +diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md +index 5f7f281e2..0b6bf6172 100644 +--- a/gcc/config/aarch64/predicates.md ++++ b/gcc/config/aarch64/predicates.md +@@ -39,9 +39,17 @@ + (and (match_code "const_int") + (match_test "op == CONST0_RTX (mode)"))) + +-(define_special_predicate "subreg_lowpart_operator" +- (and (match_code "subreg") +- (match_test "subreg_lowpart_p (op)"))) ++(define_predicate "const_1_to_3_operand" ++ (match_code "const_int,const_vector") ++{ ++ op = unwrap_const_vec_duplicate (op); ++ return CONST_INT_P (op) && IN_RANGE (INTVAL (op), 1, 3); ++}) ++ ++(define_predicate "subreg_lowpart_operator" ++ (ior (match_code "truncate") ++ (and (match_code "subreg") ++ (match_test "subreg_lowpart_p (op)")))) + + (define_predicate "aarch64_ccmp_immediate" + (and (match_code "const_int") +@@ -53,13 +61,12 @@ + + (define_predicate "aarch64_simd_register" + (and (match_code "reg") +- (ior (match_test "REGNO_REG_CLASS (REGNO (op)) == FP_LO_REGS") +- (match_test "REGNO_REG_CLASS (REGNO (op)) == FP_REGS")))) ++ (match_test "FP_REGNUM_P (REGNO (op))"))) + + (define_predicate "aarch64_reg_or_zero" +- (and (match_code "reg,subreg,const_int") ++ (and (match_code "reg,subreg,const_int,const_double") + (ior (match_operand 0 "register_operand") +- (match_test "op == const0_rtx")))) ++ (match_test "op == CONST0_RTX (GET_MODE (op))")))) + + (define_predicate "aarch64_reg_or_fp_zero" + (ior (match_operand 0 "register_operand") +@@ -98,6 +105,10 @@ + (and (match_code "const_double") + (match_test "aarch64_fpconst_pow_of_2 (op) > 0"))) + ++(define_predicate "aarch64_fp_pow2_recip" ++ (and (match_code "const_double") ++ (match_test "aarch64_fpconst_pow2_recip (op) > 0"))) ++ + (define_predicate "aarch64_fp_vec_pow2" + (match_test "aarch64_vec_fpconst_pow_of_2 (op) > 0")) + +@@ -138,10 +149,18 @@ + (and (match_operand 0 "aarch64_pluslong_immediate") + (not (match_operand 0 "aarch64_plus_immediate")))) + ++(define_predicate "aarch64_sve_scalar_inc_dec_immediate" ++ (and (match_code "const_poly_int") ++ (match_test "aarch64_sve_scalar_inc_dec_immediate_p (op)"))) ++ + (define_predicate "aarch64_sve_addvl_addpl_immediate" + (and (match_code "const_poly_int") + (match_test "aarch64_sve_addvl_addpl_immediate_p (op)"))) + ++(define_predicate "aarch64_sve_plus_immediate" ++ (ior (match_operand 0 "aarch64_sve_scalar_inc_dec_immediate") ++ (match_operand 0 "aarch64_sve_addvl_addpl_immediate"))) ++ + (define_predicate "aarch64_split_add_offset_immediate" + (and (match_code "const_poly_int") + (match_test "aarch64_add_offset_temporaries (op) == 1"))) +@@ -149,7 +168,8 @@ + (define_predicate "aarch64_pluslong_operand" + (ior (match_operand 0 "register_operand") + (match_operand 0 "aarch64_pluslong_immediate") +- (match_operand 0 "aarch64_sve_addvl_addpl_immediate"))) ++ (and (match_test "TARGET_SVE") ++ (match_operand 0 "aarch64_sve_plus_immediate")))) + + (define_predicate "aarch64_pluslong_or_poly_operand" + (ior (match_operand 0 "aarch64_pluslong_operand") +@@ -323,12 +343,6 @@ + (ior (match_operand 0 "register_operand") + (match_operand 0 "const_scalar_int_operand"))) + +-(define_predicate "aarch64_smin" +- (match_code "smin")) +- +-(define_predicate "aarch64_umin" +- (match_code "umin")) +- + ;; True for integer comparisons and for FP comparisons other than LTGT or UNEQ. 
+ (define_special_predicate "aarch64_comparison_operator" + (match_code "eq,ne,le,lt,ge,gt,geu,gtu,leu,ltu,unordered, +@@ -444,6 +458,12 @@ + return aarch64_stepped_int_parallel_p (op, 1); + }) + ++(define_predicate "descending_int_parallel" ++ (match_code "parallel") ++{ ++ return aarch64_stepped_int_parallel_p (op, -1); ++}) ++ + (define_special_predicate "aarch64_simd_lshift_imm" + (match_code "const,const_vector") + { +@@ -460,6 +480,10 @@ + (and (match_code "const,const_vector") + (match_test "op == CONST0_RTX (GET_MODE (op))"))) + ++(define_predicate "aarch64_simd_imm_one" ++ (and (match_code "const_vector") ++ (match_test "op == CONST1_RTX (GET_MODE (op))"))) ++ + (define_predicate "aarch64_simd_or_scalar_imm_zero" + (and (match_code "const_int,const_double,const,const_vector") + (match_test "op == CONST0_RTX (GET_MODE (op))"))) +@@ -474,6 +498,10 @@ + (match_test "op == const0_rtx") + (match_operand 0 "aarch64_simd_or_scalar_imm_zero")))) + ++(define_predicate "aarch64_simd_reg_or_minus_one" ++ (ior (match_operand 0 "register_operand") ++ (match_operand 0 "aarch64_simd_imm_minus_one"))) ++ + (define_predicate "aarch64_simd_struct_operand" + (and (match_code "mem") + (match_test "TARGET_SIMD && aarch64_simd_mem_operand_p (op)"))) +@@ -556,12 +584,44 @@ + (and (match_operand 0 "memory_operand") + (match_test "aarch64_sve_ld1r_operand_p (op)"))) + ++(define_predicate "aarch64_sve_ld1rq_operand" ++ (and (match_code "mem") ++ (match_test "aarch64_sve_ld1rq_operand_p (op)"))) ++ ++(define_predicate "aarch64_sve_ld1ro_operand_b" ++ (and (match_code "mem") ++ (match_test "aarch64_sve_ld1ro_operand_p (op, QImode)"))) ++ ++(define_predicate "aarch64_sve_ld1ro_operand_h" ++ (and (match_code "mem") ++ (match_test "aarch64_sve_ld1ro_operand_p (op, HImode)"))) ++ ++(define_predicate "aarch64_sve_ld1ro_operand_w" ++ (and (match_code "mem") ++ (match_test "aarch64_sve_ld1ro_operand_p (op, SImode)"))) ++ ++(define_predicate "aarch64_sve_ld1ro_operand_d" ++ (and (match_code "mem") ++ (match_test "aarch64_sve_ld1ro_operand_p (op, DImode)"))) ++ ++(define_predicate "aarch64_sve_ldff1_operand" ++ (and (match_code "mem") ++ (match_test "aarch64_sve_ldff1_operand_p (op)"))) ++ ++(define_predicate "aarch64_sve_ldnf1_operand" ++ (and (match_code "mem") ++ (match_test "aarch64_sve_ldnf1_operand_p (op)"))) ++ + ;; Like memory_operand, but restricted to addresses that are valid for + ;; SVE LDR and STR instructions. 
+ (define_predicate "aarch64_sve_ldr_operand" + (and (match_code "mem") + (match_test "aarch64_sve_ldr_operand_p (op)"))) + ++(define_special_predicate "aarch64_sve_prefetch_operand" ++ (and (match_code "reg, plus") ++ (match_test "aarch64_sve_prefetch_operand_p (op, mode)"))) ++ + (define_predicate "aarch64_sve_nonimmediate_operand" + (ior (match_operand 0 "register_operand") + (match_operand 0 "aarch64_sve_ldr_operand"))) +@@ -586,6 +646,10 @@ + (ior (match_operand 0 "register_operand") + (match_operand 0 "aarch64_sve_ld1r_operand"))) + ++(define_predicate "aarch64_sve_ptrue_svpattern_immediate" ++ (and (match_code "const") ++ (match_test "aarch64_sve_ptrue_svpattern_p (op, NULL)"))) ++ + (define_predicate "aarch64_sve_arith_immediate" + (and (match_code "const,const_vector") + (match_test "aarch64_sve_arith_immediate_p (op, false)"))) +@@ -594,28 +658,84 @@ + (and (match_code "const,const_vector") + (match_test "aarch64_sve_arith_immediate_p (op, true)"))) + +-(define_predicate "aarch64_sve_inc_dec_immediate" ++(define_predicate "aarch64_sve_qadd_immediate" + (and (match_code "const,const_vector") +- (match_test "aarch64_sve_inc_dec_immediate_p (op)"))) ++ (match_test "aarch64_sve_sqadd_sqsub_immediate_p (op, false)"))) ++ ++(define_predicate "aarch64_sve_qsub_immediate" ++ (and (match_code "const,const_vector") ++ (match_test "aarch64_sve_sqadd_sqsub_immediate_p (op, true)"))) ++ ++(define_predicate "aarch64_sve_vector_inc_dec_immediate" ++ (and (match_code "const,const_vector") ++ (match_test "aarch64_sve_vector_inc_dec_immediate_p (op)"))) ++ ++(define_predicate "aarch64_sve_gather_immediate_b" ++ (and (match_code "const_int") ++ (match_test "IN_RANGE (INTVAL (op), 0, 31)"))) ++ ++(define_predicate "aarch64_sve_gather_immediate_h" ++ (and (match_code "const_int") ++ (match_test "IN_RANGE (INTVAL (op), 0, 62)") ++ (match_test "(INTVAL (op) & 1) == 0"))) ++ ++(define_predicate "aarch64_sve_gather_immediate_w" ++ (and (match_code "const_int") ++ (match_test "IN_RANGE (INTVAL (op), 0, 124)") ++ (match_test "(INTVAL (op) & 3) == 0"))) ++ ++(define_predicate "aarch64_sve_gather_immediate_d" ++ (and (match_code "const_int") ++ (match_test "IN_RANGE (INTVAL (op), 0, 248)") ++ (match_test "(INTVAL (op) & 7) == 0"))) ++ ++(define_predicate "aarch64_sve_uxtb_immediate" ++ (and (match_code "const_vector") ++ (match_test "GET_MODE_UNIT_BITSIZE (GET_MODE (op)) > 8") ++ (match_test "aarch64_const_vec_all_same_int_p (op, 0xff)"))) ++ ++(define_predicate "aarch64_sve_uxth_immediate" ++ (and (match_code "const_vector") ++ (match_test "GET_MODE_UNIT_BITSIZE (GET_MODE (op)) > 16") ++ (match_test "aarch64_const_vec_all_same_int_p (op, 0xffff)"))) ++ ++(define_predicate "aarch64_sve_uxtw_immediate" ++ (and (match_code "const_vector") ++ (match_test "GET_MODE_UNIT_BITSIZE (GET_MODE (op)) > 32") ++ (match_test "aarch64_const_vec_all_same_int_p (op, 0xffffffff)"))) ++ ++(define_predicate "aarch64_sve_uxt_immediate" ++ (ior (match_operand 0 "aarch64_sve_uxtb_immediate") ++ (match_operand 0 "aarch64_sve_uxth_immediate") ++ (match_operand 0 "aarch64_sve_uxtw_immediate"))) + + (define_predicate "aarch64_sve_logical_immediate" + (and (match_code "const,const_vector") + (match_test "aarch64_sve_bitmask_immediate_p (op)"))) + +-(define_predicate "aarch64_sve_mul_immediate" ++;; Used for SVE UMAX and UMIN. ++(define_predicate "aarch64_sve_vsb_immediate" ++ (and (match_code "const_vector") ++ (match_test "GET_MODE_INNER (GET_MODE (op)) == QImode ++ ? 
aarch64_const_vec_all_same_in_range_p (op, -128, 127) ++ : aarch64_const_vec_all_same_in_range_p (op, 0, 255)"))) ++ ++;; Used for SVE MUL, SMAX and SMIN. ++(define_predicate "aarch64_sve_vsm_immediate" + (and (match_code "const,const_vector") + (match_test "aarch64_const_vec_all_same_in_range_p (op, -128, 127)"))) + + (define_predicate "aarch64_sve_dup_immediate" + (and (match_code "const,const_vector") +- (match_test "aarch64_sve_dup_immediate_p (op)"))) ++ (ior (match_test "aarch64_sve_dup_immediate_p (op)") ++ (match_test "aarch64_float_const_representable_p (op)")))) + + (define_predicate "aarch64_sve_cmp_vsc_immediate" +- (and (match_code "const,const_vector") ++ (and (match_code "const_int,const_vector") + (match_test "aarch64_sve_cmp_immediate_p (op, true)"))) + + (define_predicate "aarch64_sve_cmp_vsd_immediate" +- (and (match_code "const,const_vector") ++ (and (match_code "const_int,const_vector") + (match_test "aarch64_sve_cmp_immediate_p (op, false)"))) + + (define_predicate "aarch64_sve_index_immediate" +@@ -626,14 +746,23 @@ + (and (match_code "const,const_vector") + (match_test "aarch64_sve_float_arith_immediate_p (op, false)"))) + +-(define_predicate "aarch64_sve_float_arith_with_sub_immediate" ++(define_predicate "aarch64_sve_float_negated_arith_immediate" + (and (match_code "const,const_vector") + (match_test "aarch64_sve_float_arith_immediate_p (op, true)"))) + ++(define_predicate "aarch64_sve_float_arith_with_sub_immediate" ++ (ior (match_operand 0 "aarch64_sve_float_arith_immediate") ++ (match_operand 0 "aarch64_sve_float_negated_arith_immediate"))) ++ + (define_predicate "aarch64_sve_float_mul_immediate" + (and (match_code "const,const_vector") + (match_test "aarch64_sve_float_mul_immediate_p (op)"))) + ++(define_predicate "aarch64_sve_float_maxmin_immediate" ++ (and (match_code "const_vector") ++ (ior (match_test "op == CONST0_RTX (GET_MODE (op))") ++ (match_test "op == CONST1_RTX (GET_MODE (op))")))) ++ + (define_predicate "aarch64_sve_arith_operand" + (ior (match_operand 0 "register_operand") + (match_operand 0 "aarch64_sve_arith_immediate"))) +@@ -641,12 +770,37 @@ + (define_predicate "aarch64_sve_add_operand" + (ior (match_operand 0 "aarch64_sve_arith_operand") + (match_operand 0 "aarch64_sve_sub_arith_immediate") +- (match_operand 0 "aarch64_sve_inc_dec_immediate"))) ++ (match_operand 0 "aarch64_sve_vector_inc_dec_immediate"))) ++ ++(define_predicate "aarch64_sve_sqadd_operand" ++ (ior (match_operand 0 "register_operand") ++ (match_operand 0 "aarch64_sve_qadd_immediate") ++ (match_operand 0 "aarch64_sve_qsub_immediate"))) ++ ++(define_predicate "aarch64_sve_pred_and_operand" ++ (ior (match_operand 0 "register_operand") ++ (match_operand 0 "aarch64_sve_uxt_immediate"))) + + (define_predicate "aarch64_sve_logical_operand" + (ior (match_operand 0 "register_operand") + (match_operand 0 "aarch64_sve_logical_immediate"))) + ++(define_predicate "aarch64_sve_gather_offset_b" ++ (ior (match_operand 0 "register_operand") ++ (match_operand 0 "aarch64_sve_gather_immediate_b"))) ++ ++(define_predicate "aarch64_sve_gather_offset_h" ++ (ior (match_operand 0 "register_operand") ++ (match_operand 0 "aarch64_sve_gather_immediate_h"))) ++ ++(define_predicate "aarch64_sve_gather_offset_w" ++ (ior (match_operand 0 "register_operand") ++ (match_operand 0 "aarch64_sve_gather_immediate_w"))) ++ ++(define_predicate "aarch64_sve_gather_offset_d" ++ (ior (match_operand 0 "register_operand") ++ (match_operand 0 "aarch64_sve_gather_immediate_d"))) ++ + (define_predicate 
"aarch64_sve_lshift_operand" + (ior (match_operand 0 "register_operand") + (match_operand 0 "aarch64_simd_lshift_imm"))) +@@ -655,9 +809,17 @@ + (ior (match_operand 0 "register_operand") + (match_operand 0 "aarch64_simd_rshift_imm"))) + +-(define_predicate "aarch64_sve_mul_operand" ++(define_predicate "aarch64_sve_vsb_operand" + (ior (match_operand 0 "register_operand") +- (match_operand 0 "aarch64_sve_mul_immediate"))) ++ (match_operand 0 "aarch64_sve_vsb_immediate"))) ++ ++(define_predicate "aarch64_sve_vsm_operand" ++ (ior (match_operand 0 "register_operand") ++ (match_operand 0 "aarch64_sve_vsm_immediate"))) ++ ++(define_predicate "aarch64_sve_reg_or_dup_imm" ++ (ior (match_operand 0 "register_operand") ++ (match_operand 0 "aarch64_sve_dup_immediate"))) + + (define_predicate "aarch64_sve_cmp_vsc_operand" + (ior (match_operand 0 "register_operand") +@@ -676,17 +838,39 @@ + (match_operand 0 "aarch64_sve_float_arith_immediate"))) + + (define_predicate "aarch64_sve_float_arith_with_sub_operand" +- (ior (match_operand 0 "aarch64_sve_float_arith_operand") ++ (ior (match_operand 0 "register_operand") + (match_operand 0 "aarch64_sve_float_arith_with_sub_immediate"))) + + (define_predicate "aarch64_sve_float_mul_operand" + (ior (match_operand 0 "register_operand") + (match_operand 0 "aarch64_sve_float_mul_immediate"))) + ++(define_predicate "aarch64_sve_float_maxmin_operand" ++ (ior (match_operand 0 "register_operand") ++ (match_operand 0 "aarch64_sve_float_maxmin_immediate"))) ++ + (define_predicate "aarch64_sve_vec_perm_operand" + (ior (match_operand 0 "register_operand") + (match_operand 0 "aarch64_constant_vector_operand"))) + ++(define_predicate "aarch64_sve_ptrue_flag" ++ (and (match_code "const_int") ++ (ior (match_test "INTVAL (op) == SVE_MAYBE_NOT_PTRUE") ++ (match_test "INTVAL (op) == SVE_KNOWN_PTRUE")))) ++ ++(define_predicate "aarch64_sve_gp_strictness" ++ (and (match_code "const_int") ++ (ior (match_test "INTVAL (op) == SVE_RELAXED_GP") ++ (match_test "INTVAL (op) == SVE_STRICT_GP")))) ++ ++(define_predicate "aarch64_gather_scale_operand_b" ++ (and (match_code "const_int") ++ (match_test "INTVAL (op) == 1"))) ++ ++(define_predicate "aarch64_gather_scale_operand_h" ++ (and (match_code "const_int") ++ (match_test "INTVAL (op) == 1 || INTVAL (op) == 2"))) ++ + (define_predicate "aarch64_gather_scale_operand_w" + (and (match_code "const_int") + (match_test "INTVAL (op) == 1 || INTVAL (op) == 4"))) +diff --git a/gcc/config/aarch64/saphira.md b/gcc/config/aarch64/saphira.md +index 853deeef0..3cc7bc410 100644 +--- a/gcc/config/aarch64/saphira.md ++++ b/gcc/config/aarch64/saphira.md +@@ -520,7 +520,7 @@ + + (define_insn_reservation "saphira_other_0_nothing" 0 + (and (eq_attr "tune" "saphira") +- (eq_attr "type" "no_insn,trap,block")) ++ (eq_attr "type" "trap,block")) + "nothing") + + (define_insn_reservation "saphira_other_2_ld" 2 +diff --git a/gcc/config/aarch64/t-aarch64 b/gcc/config/aarch64/t-aarch64 +index ee471f898..28e1c7aec 100644 +--- a/gcc/config/aarch64/t-aarch64 ++++ b/gcc/config/aarch64/t-aarch64 +@@ -40,6 +40,43 @@ aarch64-builtins.o: $(srcdir)/config/aarch64/aarch64-builtins.c $(CONFIG_H) \ + $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \ + $(srcdir)/config/aarch64/aarch64-builtins.c + ++aarch64-sve-builtins.o: $(srcdir)/config/aarch64/aarch64-sve-builtins.cc \ ++ $(srcdir)/config/aarch64/aarch64-sve-builtins.def \ ++ $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) $(TREE_H) $(RTL_H) \ ++ $(TM_P_H) memmodel.h insn-codes.h $(OPTABS_H) $(RECOG_H) $(DIAGNOSTIC_H) 
\ ++ $(EXPR_H) $(BASIC_BLOCK_H) $(FUNCTION_H) fold-const.h $(GIMPLE_H) \ ++ gimple-iterator.h gimplify.h explow.h $(EMIT_RTL_H) tree-vector-builder.h \ ++ stor-layout.h $(REG_H) alias.h gimple-fold.h langhooks.h \ ++ stringpool.h \ ++ $(srcdir)/config/aarch64/aarch64-sve-builtins.h \ ++ $(srcdir)/config/aarch64/aarch64-sve-builtins-shapes.h \ ++ $(srcdir)/config/aarch64/aarch64-sve-builtins-base.h ++ $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \ ++ $(srcdir)/config/aarch64/aarch64-sve-builtins.cc ++ ++aarch64-sve-builtins-shapes.o: \ ++ $(srcdir)/config/aarch64/aarch64-sve-builtins-shapes.cc \ ++ $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) $(TREE_H) $(RTL_H) \ ++ $(TM_P_H) memmodel.h insn-codes.h $(OPTABS_H) \ ++ $(srcdir)/config/aarch64/aarch64-sve-builtins.h \ ++ $(srcdir)/config/aarch64/aarch64-sve-builtins-shapes.h ++ $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \ ++ $(srcdir)/config/aarch64/aarch64-sve-builtins-shapes.cc ++ ++aarch64-sve-builtins-base.o: \ ++ $(srcdir)/config/aarch64/aarch64-sve-builtins-base.cc \ ++ $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) $(TREE_H) $(RTL_H) \ ++ $(TM_P_H) memmodel.h insn-codes.h $(OPTABS_H) $(RECOG_H) \ ++ $(EXPR_H) $(BASIC_BLOCK_H) $(FUNCTION_H) fold-const.h $(GIMPLE_H) \ ++ gimple-iterator.h gimplify.h explow.h $(EMIT_RTL_H) tree-vector-builder.h \ ++ rtx-vector-builder.h vec-perm-indices.h \ ++ $(srcdir)/config/aarch64/aarch64-sve-builtins.h \ ++ $(srcdir)/config/aarch64/aarch64-sve-builtins-shapes.h \ ++ $(srcdir)/config/aarch64/aarch64-sve-builtins-base.h \ ++ $(srcdir)/config/aarch64/aarch64-sve-builtins-functions.h ++ $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \ ++ $(srcdir)/config/aarch64/aarch64-sve-builtins-base.cc ++ + aarch64-builtin-iterators.h: $(srcdir)/config/aarch64/geniterators.sh \ + $(srcdir)/config/aarch64/iterators.md + $(SHELL) $(srcdir)/config/aarch64/geniterators.sh \ +@@ -103,3 +140,10 @@ aarch64-bti-insert.o: $(srcdir)/config/aarch64/aarch64-bti-insert.c \ + comma=, + MULTILIB_OPTIONS = $(subst $(comma),/, $(patsubst %, mabi=%, $(subst $(comma),$(comma)mabi=,$(TM_MULTILIB_CONFIG)))) + MULTILIB_DIRNAMES = $(subst $(comma), ,$(TM_MULTILIB_CONFIG)) ++ ++insn-conditions.md: s-check-sve-md ++s-check-sve-md: $(srcdir)/config/aarch64/check-sve-md.awk \ ++ $(srcdir)/config/aarch64/aarch64-sve.md ++ $(AWK) -f $(srcdir)/config/aarch64/check-sve-md.awk \ ++ $(srcdir)/config/aarch64/aarch64-sve.md ++ $(STAMP) s-check-sve-md +diff --git a/gcc/config/aarch64/t-aarch64-netbsd b/gcc/config/aarch64/t-aarch64-netbsd +new file mode 100644 +index 000000000..aa447d0f6 +--- /dev/null ++++ b/gcc/config/aarch64/t-aarch64-netbsd +@@ -0,0 +1,21 @@ ++# Machine description for AArch64 architecture. ++# Copyright (C) 2016-2019 Free Software Foundation, Inc. ++# ++# This file is part of GCC. ++# ++# GCC is free software; you can redistribute it and/or modify it ++# under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 3, or (at your option) ++# any later version. ++# ++# GCC is distributed in the hope that it will be useful, but ++# WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++# General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with GCC; see the file COPYING3. If not see ++# . 
++ ++LIB1ASMSRC = aarch64/lib1funcs.asm ++LIB1ASMFUNCS = _aarch64_sync_cache_range +diff --git a/gcc/config/aarch64/thunderx2t99.md b/gcc/config/aarch64/thunderx2t99.md +index c43c39ecd..bb6e0abb0 100644 +--- a/gcc/config/aarch64/thunderx2t99.md ++++ b/gcc/config/aarch64/thunderx2t99.md +@@ -74,7 +74,7 @@ + + (define_insn_reservation "thunderx2t99_nothing" 0 + (and (eq_attr "tune" "thunderx2t99") +- (eq_attr "type" "no_insn,block")) ++ (eq_attr "type" "block")) + "nothing") + + (define_insn_reservation "thunderx2t99_mrs" 0 +diff --git a/gcc/config/aarch64/tsv110.md b/gcc/config/aarch64/tsv110.md +index 680c48a68..f20055dae 100644 +--- a/gcc/config/aarch64/tsv110.md ++++ b/gcc/config/aarch64/tsv110.md +@@ -281,7 +281,7 @@ + shift_imm,shift_reg,\ + mov_imm,mov_reg,\ + mvn_imm,mvn_reg,\ +- mrs,multiple,no_insn")) ++ mrs,multiple")) + "tsv110_alu1|tsv110_alu2|tsv110_alu3") + + (define_insn_reservation "tsv110_alus" 1 +diff --git a/gcc/config/alpha/alpha.c b/gcc/config/alpha/alpha.c +index 524379d37..cd6aa117c 100644 +--- a/gcc/config/alpha/alpha.c ++++ b/gcc/config/alpha/alpha.c +@@ -6380,7 +6380,7 @@ alpha_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p, + t = fold_convert (build_nonstandard_integer_type (64, 0), offset_field); + offset = get_initialized_tmp_var (t, pre_p, NULL); + +- indirect = pass_by_reference (NULL, TYPE_MODE (type), type, false); ++ indirect = pass_va_arg_by_reference (type); + + if (indirect) + { +diff --git a/gcc/config/alpha/alpha.h b/gcc/config/alpha/alpha.h +index e2008202a..68eafe194 100644 +--- a/gcc/config/alpha/alpha.h ++++ b/gcc/config/alpha/alpha.h +@@ -759,7 +759,7 @@ do { \ + #define MOVE_MAX 8 + + /* If a memory-to-memory move would take MOVE_RATIO or more simple +- move-instruction pairs, we will do a movmem or libcall instead. ++ move-instruction pairs, we will do a cpymem or libcall instead. + + Without byte/word accesses, we want no more than four instructions; + with, several single byte accesses are better. 
*/ +diff --git a/gcc/config/alpha/alpha.md b/gcc/config/alpha/alpha.md +index dd340a08e..228dee44c 100644 +--- a/gcc/config/alpha/alpha.md ++++ b/gcc/config/alpha/alpha.md +@@ -4673,7 +4673,7 @@ + ;; Argument 2 is the length + ;; Argument 3 is the alignment + +-(define_expand "movmemqi" ++(define_expand "cpymemqi" + [(parallel [(set (match_operand:BLK 0 "memory_operand") + (match_operand:BLK 1 "memory_operand")) + (use (match_operand:DI 2 "immediate_operand")) +@@ -4686,7 +4686,7 @@ + FAIL; + }) + +-(define_expand "movmemdi" ++(define_expand "cpymemdi" + [(parallel [(set (match_operand:BLK 0 "memory_operand") + (match_operand:BLK 1 "memory_operand")) + (use (match_operand:DI 2 "immediate_operand")) +@@ -4703,7 +4703,7 @@ + "TARGET_ABI_OPEN_VMS" + "operands[4] = gen_rtx_SYMBOL_REF (Pmode, \"OTS$MOVE\");") + +-(define_insn "*movmemdi_1" ++(define_insn "*cpymemdi_1" + [(set (match_operand:BLK 0 "memory_operand" "=m,m") + (match_operand:BLK 1 "memory_operand" "m,m")) + (use (match_operand:DI 2 "nonmemory_operand" "r,i")) +diff --git a/gcc/config/arc/arc-protos.h b/gcc/config/arc/arc-protos.h +index ac0de6b28..00d2dd2c6 100644 +--- a/gcc/config/arc/arc-protos.h ++++ b/gcc/config/arc/arc-protos.h +@@ -35,7 +35,7 @@ extern void arc_final_prescan_insn (rtx_insn *, rtx *, int); + extern const char *arc_output_libcall (const char *); + extern int arc_output_addsi (rtx *operands, bool, bool); + extern int arc_output_commutative_cond_exec (rtx *operands, bool); +-extern bool arc_expand_movmem (rtx *operands); ++extern bool arc_expand_cpymem (rtx *operands); + extern bool prepare_move_operands (rtx *operands, machine_mode mode); + extern void emit_shift (enum rtx_code, rtx, rtx, rtx); + extern void arc_expand_atomic_op (enum rtx_code, rtx, rtx, rtx, rtx, rtx); +diff --git a/gcc/config/arc/arc.c b/gcc/config/arc/arc.c +index 325dd3cea..c0f13ebe7 100644 +--- a/gcc/config/arc/arc.c ++++ b/gcc/config/arc/arc.c +@@ -8791,7 +8791,7 @@ arc_output_commutative_cond_exec (rtx *operands, bool output_p) + return 8; + } + +-/* Helper function of arc_expand_movmem. ADDR points to a chunk of memory. ++/* Helper function of arc_expand_cpymem. ADDR points to a chunk of memory. + Emit code and return an potentially modified address such that offsets + up to SIZE are can be added to yield a legitimate address. + if REUSE is set, ADDR is a register that may be modified. */ +@@ -8825,7 +8825,7 @@ force_offsettable (rtx addr, HOST_WIDE_INT size, bool reuse) + offset ranges. Return true on success. */ + + bool +-arc_expand_movmem (rtx *operands) ++arc_expand_cpymem (rtx *operands) + { + rtx dst = operands[0]; + rtx src = operands[1]; +@@ -10335,7 +10335,7 @@ arc_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size, + enum by_pieces_operation op, + bool speed_p) + { +- /* Let the movmem expander handle small block moves. */ ++ /* Let the cpymem expander handle small block moves. */ + if (op == MOVE_BY_PIECES) + return false; + +diff --git a/gcc/config/arc/arc.h b/gcc/config/arc/arc.h +index 00fc3e471..7ae10a666 100644 +--- a/gcc/config/arc/arc.h ++++ b/gcc/config/arc/arc.h +@@ -1423,7 +1423,7 @@ do { \ + in one reasonably fast instruction. */ + #define MOVE_MAX 4 + +-/* Undo the effects of the movmem pattern presence on STORE_BY_PIECES_P . */ ++/* Undo the effects of the cpymem pattern presence on STORE_BY_PIECES_P . */ + #define MOVE_RATIO(SPEED) ((SPEED) ? 
15 : 3) + + /* Define this to be nonzero if shift instructions ignore all but the +diff --git a/gcc/config/arc/arc.md b/gcc/config/arc/arc.md +index 34e8248bc..2cfcf8bdd 100644 +--- a/gcc/config/arc/arc.md ++++ b/gcc/config/arc/arc.md +@@ -5114,13 +5114,13 @@ core_3, archs4x, archs4xd, archs4xd_slow" + (set_attr "type" "loop_end") + (set_attr "length" "4,20")]) + +-(define_expand "movmemsi" ++(define_expand "cpymemsi" + [(match_operand:BLK 0 "" "") + (match_operand:BLK 1 "" "") + (match_operand:SI 2 "nonmemory_operand" "") + (match_operand 3 "immediate_operand" "")] + "" +- "if (arc_expand_movmem (operands)) DONE; else FAIL;") ++ "if (arc_expand_cpymem (operands)) DONE; else FAIL;") + + ;; Close http://gcc.gnu.org/bugzilla/show_bug.cgi?id=35803 if this works + ;; to the point that we can generate cmove instructions. +diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h +index 98beb6109..dd1f32798 100644 +--- a/gcc/config/arm/arm-protos.h ++++ b/gcc/config/arm/arm-protos.h +@@ -127,8 +127,8 @@ extern bool offset_ok_for_ldrd_strd (HOST_WIDE_INT); + extern bool operands_ok_ldrd_strd (rtx, rtx, rtx, HOST_WIDE_INT, bool, bool); + extern bool gen_operands_ldrd_strd (rtx *, bool, bool, bool); + extern bool valid_operands_ldrd_strd (rtx *, bool); +-extern int arm_gen_movmemqi (rtx *); +-extern bool gen_movmem_ldrd_strd (rtx *); ++extern int arm_gen_cpymemqi (rtx *); ++extern bool gen_cpymem_ldrd_strd (rtx *); + extern machine_mode arm_select_cc_mode (RTX_CODE, rtx, rtx); + extern machine_mode arm_select_dominance_cc_mode (rtx, rtx, + HOST_WIDE_INT); +@@ -204,7 +204,7 @@ extern void thumb2_final_prescan_insn (rtx_insn *); + extern const char *thumb_load_double_from_address (rtx *); + extern const char *thumb_output_move_mem_multiple (int, rtx *); + extern const char *thumb_call_via_reg (rtx); +-extern void thumb_expand_movmemqi (rtx *); ++extern void thumb_expand_cpymemqi (rtx *); + extern rtx arm_return_addr (int, rtx); + extern void thumb_reload_out_hi (rtx *); + extern void thumb_set_return_address (rtx, rtx); +diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c +index eba26011e..c8a09329a 100644 +--- a/gcc/config/arm/arm.c ++++ b/gcc/config/arm/arm.c +@@ -14426,7 +14426,7 @@ arm_block_move_unaligned_loop (rtx dest, rtx src, HOST_WIDE_INT length, + core type, optimize_size setting, etc. */ + + static int +-arm_movmemqi_unaligned (rtx *operands) ++arm_cpymemqi_unaligned (rtx *operands) + { + HOST_WIDE_INT length = INTVAL (operands[2]); + +@@ -14463,7 +14463,7 @@ arm_movmemqi_unaligned (rtx *operands) + } + + int +-arm_gen_movmemqi (rtx *operands) ++arm_gen_cpymemqi (rtx *operands) + { + HOST_WIDE_INT in_words_to_go, out_words_to_go, last_bytes; + HOST_WIDE_INT srcoffset, dstoffset; +@@ -14477,7 +14477,7 @@ arm_gen_movmemqi (rtx *operands) + return 0; + + if (unaligned_access && (INTVAL (operands[3]) & 3) != 0) +- return arm_movmemqi_unaligned (operands); ++ return arm_cpymemqi_unaligned (operands); + + if (INTVAL (operands[3]) & 3) + return 0; +@@ -14611,7 +14611,7 @@ arm_gen_movmemqi (rtx *operands) + return 1; + } + +-/* Helper for gen_movmem_ldrd_strd. Increase the address of memory rtx ++/* Helper for gen_cpymem_ldrd_strd. Increase the address of memory rtx + by mode size. */ + inline static rtx + next_consecutive_mem (rtx mem) +@@ -14626,7 +14626,7 @@ next_consecutive_mem (rtx mem) + /* Copy using LDRD/STRD instructions whenever possible. + Returns true upon success. 
*/ + bool +-gen_movmem_ldrd_strd (rtx *operands) ++gen_cpymem_ldrd_strd (rtx *operands) + { + unsigned HOST_WIDE_INT len; + HOST_WIDE_INT align; +@@ -14670,7 +14670,7 @@ gen_movmem_ldrd_strd (rtx *operands) + + /* If we cannot generate any LDRD/STRD, try to generate LDM/STM. */ + if (!(dst_aligned || src_aligned)) +- return arm_gen_movmemqi (operands); ++ return arm_gen_cpymemqi (operands); + + /* If the either src or dst is unaligned we'll be accessing it as pairs + of unaligned SImode accesses. Otherwise we can generate DImode +@@ -26472,7 +26472,7 @@ thumb_call_via_reg (rtx reg) + + /* Routines for generating rtl. */ + void +-thumb_expand_movmemqi (rtx *operands) ++thumb_expand_cpymemqi (rtx *operands) + { + rtx out = copy_to_mode_reg (SImode, XEXP (operands[0], 0)); + rtx in = copy_to_mode_reg (SImode, XEXP (operands[1], 0)); +@@ -26481,13 +26481,13 @@ thumb_expand_movmemqi (rtx *operands) + + while (len >= 12) + { +- emit_insn (gen_movmem12b (out, in, out, in)); ++ emit_insn (gen_cpymem12b (out, in, out, in)); + len -= 12; + } + + if (len >= 8) + { +- emit_insn (gen_movmem8b (out, in, out, in)); ++ emit_insn (gen_cpymem8b (out, in, out, in)); + len -= 8; + } + +diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md +index 53e54874c..a1b9d9fac 100644 +--- a/gcc/config/arm/arm.md ++++ b/gcc/config/arm/arm.md +@@ -7260,7 +7260,7 @@ + ;; We could let this apply for blocks of less than this, but it clobbers so + ;; many registers that there is then probably a better way. + +-(define_expand "movmemqi" ++(define_expand "cpymemqi" + [(match_operand:BLK 0 "general_operand" "") + (match_operand:BLK 1 "general_operand" "") + (match_operand:SI 2 "const_int_operand" "") +@@ -7272,12 +7272,12 @@ + if (TARGET_LDRD && current_tune->prefer_ldrd_strd + && !optimize_function_for_size_p (cfun)) + { +- if (gen_movmem_ldrd_strd (operands)) ++ if (gen_cpymem_ldrd_strd (operands)) + DONE; + FAIL; + } + +- if (arm_gen_movmemqi (operands)) ++ if (arm_gen_cpymemqi (operands)) + DONE; + FAIL; + } +@@ -7287,7 +7287,7 @@ + || INTVAL (operands[2]) > 48) + FAIL; + +- thumb_expand_movmemqi (operands); ++ thumb_expand_cpymemqi (operands); + DONE; + } + " +@@ -8807,6 +8807,8 @@ + [(set_attr "arch" "t1,32")] + ) + ++;; DO NOT SPLIT THIS INSN. It's important for security reasons that the ++;; canary value does not live beyond the life of this sequence. 
+ (define_insn "*stack_protect_set_insn" + [(set (match_operand:SI 0 "memory_operand" "=m,m") + (unspec:SI [(mem:SI (match_operand:SI 1 "register_operand" "+&l,&r"))] +@@ -8814,8 +8816,8 @@ + (clobber (match_dup 1))] + "" + "@ +- ldr\\t%1, [%1]\;str\\t%1, %0\;movs\t%1,#0 +- ldr\\t%1, [%1]\;str\\t%1, %0\;mov\t%1,#0" ++ ldr\\t%1, [%1]\;str\\t%1, %0\;movs\t%1, #0 ++ ldr\\t%1, [%1]\;str\\t%1, %0\;mov\t%1, #0" + [(set_attr "length" "8,12") + (set_attr "conds" "clob,nocond") + (set_attr "type" "multiple") +diff --git a/gcc/config/arm/arm1020e.md b/gcc/config/arm/arm1020e.md +index b835cbaaa..c4c038b04 100644 +--- a/gcc/config/arm/arm1020e.md ++++ b/gcc/config/arm/arm1020e.md +@@ -72,7 +72,7 @@ + adr,bfm,rev,\ + shift_imm,shift_reg,\ + mov_imm,mov_reg,mvn_imm,mvn_reg,\ +- multiple,no_insn")) ++ multiple")) + "1020a_e,1020a_m,1020a_w") + + ;; ALU operations with a shift-by-constant operand +diff --git a/gcc/config/arm/arm1026ejs.md b/gcc/config/arm/arm1026ejs.md +index 05f4d724f..88546872a 100644 +--- a/gcc/config/arm/arm1026ejs.md ++++ b/gcc/config/arm/arm1026ejs.md +@@ -72,7 +72,7 @@ + adr,bfm,rev,\ + shift_imm,shift_reg,\ + mov_imm,mov_reg,mvn_imm,mvn_reg,\ +- multiple,no_insn")) ++ multiple")) + "a_e,a_m,a_w") + + ;; ALU operations with a shift-by-constant operand +diff --git a/gcc/config/arm/arm1136jfs.md b/gcc/config/arm/arm1136jfs.md +index ae0b54f5e..e7fd53afe 100644 +--- a/gcc/config/arm/arm1136jfs.md ++++ b/gcc/config/arm/arm1136jfs.md +@@ -81,7 +81,7 @@ + adr,bfm,rev,\ + shift_imm,shift_reg,\ + mov_imm,mov_reg,mvn_imm,mvn_reg,\ +- multiple,no_insn")) ++ multiple")) + "e_1,e_2,e_3,e_wb") + + ;; ALU operations with a shift-by-constant operand +diff --git a/gcc/config/arm/arm926ejs.md b/gcc/config/arm/arm926ejs.md +index db4c7db8c..b4f503159 100644 +--- a/gcc/config/arm/arm926ejs.md ++++ b/gcc/config/arm/arm926ejs.md +@@ -67,7 +67,7 @@ + shift_imm,shift_reg,extend,\ + mov_imm,mov_reg,mov_shift,\ + mvn_imm,mvn_reg,mvn_shift,\ +- multiple,no_insn")) ++ multiple")) + "e,m,w") + + ;; ALU operations with a shift-by-register operand +diff --git a/gcc/config/arm/cortex-a15.md b/gcc/config/arm/cortex-a15.md +index f57f98675..26765c3db 100644 +--- a/gcc/config/arm/cortex-a15.md ++++ b/gcc/config/arm/cortex-a15.md +@@ -68,7 +68,7 @@ + shift_imm,shift_reg,\ + mov_imm,mov_reg,\ + mvn_imm,mvn_reg,\ +- mrs,multiple,no_insn")) ++ mrs,multiple")) + "ca15_issue1,(ca15_sx1,ca15_sx1_alu)|(ca15_sx2,ca15_sx2_alu)") + + ;; ALU ops with immediate shift +diff --git a/gcc/config/arm/cortex-a17.md b/gcc/config/arm/cortex-a17.md +index a0c6e5141..97b716414 100644 +--- a/gcc/config/arm/cortex-a17.md ++++ b/gcc/config/arm/cortex-a17.md +@@ -42,7 +42,7 @@ + adc_imm,adcs_imm,adc_reg,adcs_reg,\ + adr, mov_imm,mov_reg,\ + mvn_imm,mvn_reg,extend,\ +- mrs,multiple,no_insn")) ++ mrs,multiple")) + "ca17_alu") + + (define_insn_reservation "cortex_a17_alu_shiftimm" 2 +diff --git a/gcc/config/arm/cortex-a5.md b/gcc/config/arm/cortex-a5.md +index efced646a..08aa90856 100644 +--- a/gcc/config/arm/cortex-a5.md ++++ b/gcc/config/arm/cortex-a5.md +@@ -64,7 +64,7 @@ + adr,bfm,clz,rbit,rev,alu_dsp_reg,\ + shift_imm,shift_reg,\ + mov_imm,mov_reg,mvn_imm,mvn_reg,\ +- mrs,multiple,no_insn")) ++ mrs,multiple")) + "cortex_a5_ex1") + + (define_insn_reservation "cortex_a5_alu_shift" 2 +diff --git a/gcc/config/arm/cortex-a53.md b/gcc/config/arm/cortex-a53.md +index b55d34e91..9b29f3874 100644 +--- a/gcc/config/arm/cortex-a53.md ++++ b/gcc/config/arm/cortex-a53.md +@@ -86,7 +86,7 @@ + alu_sreg,alus_sreg,logic_reg,logics_reg, + 
adc_imm,adcs_imm,adc_reg,adcs_reg, + csel,clz,rbit,rev,alu_dsp_reg, +- mov_reg,mvn_reg,mrs,multiple,no_insn")) ++ mov_reg,mvn_reg,mrs,multiple")) + "cortex_a53_slot_any") + + (define_insn_reservation "cortex_a53_alu_shift" 3 +diff --git a/gcc/config/arm/cortex-a57.md b/gcc/config/arm/cortex-a57.md +index 577dc8d7f..49654bf18 100644 +--- a/gcc/config/arm/cortex-a57.md ++++ b/gcc/config/arm/cortex-a57.md +@@ -301,7 +301,7 @@ + rotate_imm,shift_imm,shift_reg,\ + mov_imm,mov_reg,\ + mvn_imm,mvn_reg,\ +- mrs,multiple,no_insn")) ++ mrs,multiple")) + "ca57_sx1|ca57_sx2") + + ;; ALU ops with immediate shift +diff --git a/gcc/config/arm/cortex-a7.md b/gcc/config/arm/cortex-a7.md +index 1f9d6414e..f1b60aa27 100644 +--- a/gcc/config/arm/cortex-a7.md ++++ b/gcc/config/arm/cortex-a7.md +@@ -149,7 +149,7 @@ + logic_shift_reg,logics_shift_reg,\ + mov_shift,mov_shift_reg,\ + mvn_shift,mvn_shift_reg,\ +- mrs,multiple,no_insn")) ++ mrs,multiple")) + "cortex_a7_ex1") + + ;; Forwarding path for unshifted operands. +diff --git a/gcc/config/arm/cortex-a8.md b/gcc/config/arm/cortex-a8.md +index 980aed86e..e3372453d 100644 +--- a/gcc/config/arm/cortex-a8.md ++++ b/gcc/config/arm/cortex-a8.md +@@ -90,7 +90,7 @@ + adc_imm,adcs_imm,adc_reg,adcs_reg,\ + adr,bfm,clz,rbit,rev,alu_dsp_reg,\ + shift_imm,shift_reg,\ +- multiple,no_insn")) ++ multiple")) + "cortex_a8_default") + + (define_insn_reservation "cortex_a8_alu_shift" 2 +diff --git a/gcc/config/arm/cortex-a9.md b/gcc/config/arm/cortex-a9.md +index 6402a4438..c8474152c 100644 +--- a/gcc/config/arm/cortex-a9.md ++++ b/gcc/config/arm/cortex-a9.md +@@ -87,7 +87,7 @@ cortex_a9_p1_e2 + cortex_a9_p0_e1 + cortex_a9_p1_e1") + shift_imm,shift_reg,\ + mov_imm,mov_reg,mvn_imm,mvn_reg,\ + mov_shift_reg,mov_shift,\ +- mrs,multiple,no_insn")) ++ mrs,multiple")) + "cortex_a9_p0_default|cortex_a9_p1_default") + + ;; An instruction using the shifter will go down E1. +diff --git a/gcc/config/arm/cortex-m4.md b/gcc/config/arm/cortex-m4.md +index 60038c1e7..f8efcfcfc 100644 +--- a/gcc/config/arm/cortex-m4.md ++++ b/gcc/config/arm/cortex-m4.md +@@ -42,7 +42,7 @@ + logic_shift_reg,logics_shift_reg,\ + mov_imm,mov_reg,mov_shift,mov_shift_reg,\ + mvn_imm,mvn_reg,mvn_shift,mvn_shift_reg,\ +- mrs,multiple,no_insn") ++ mrs,multiple") + (ior (eq_attr "mul32" "yes") + (eq_attr "widen_mul64" "yes")))) + "cortex_m4_ex") +diff --git a/gcc/config/arm/cortex-m7.md b/gcc/config/arm/cortex-m7.md +index e4695ad66..dfe9a742c 100644 +--- a/gcc/config/arm/cortex-m7.md ++++ b/gcc/config/arm/cortex-m7.md +@@ -48,7 +48,7 @@ + logic_shift_imm,logics_shift_imm,\ + alu_shift_reg,alus_shift_reg,\ + logic_shift_reg,logics_shift_reg,\ +- mrs,clz,f_mcr,f_mrc,multiple,no_insn")) ++ mrs,clz,f_mcr,f_mrc,multiple")) + "cm7_i0|cm7_i1,cm7_a0|cm7_a1") + + ;; Simple alu with inline shift operation. +diff --git a/gcc/config/arm/cortex-r4.md b/gcc/config/arm/cortex-r4.md +index d7c0135fc..af5db23a6 100644 +--- a/gcc/config/arm/cortex-r4.md ++++ b/gcc/config/arm/cortex-r4.md +@@ -102,7 +102,7 @@ + (eq_attr "type" "alu_shift_reg,alus_shift_reg,\ + logic_shift_reg,logics_shift_reg,\ + mov_shift_reg,mvn_shift_reg,\ +- mrs,multiple,no_insn")) ++ mrs,multiple")) + "cortex_r4_alu_shift_reg") + + ;; An ALU instruction followed by an ALU instruction with no early dep. 
+diff --git a/gcc/config/arm/fa526.md b/gcc/config/arm/fa526.md +index e6625b011..294b79692 100644 +--- a/gcc/config/arm/fa526.md ++++ b/gcc/config/arm/fa526.md +@@ -68,7 +68,7 @@ + adr,bfm,rev,\ + shift_imm,shift_reg,\ + mov_imm,mov_reg,mvn_imm,mvn_reg,\ +- mrs,multiple,no_insn")) ++ mrs,multiple")) + "fa526_core") + + (define_insn_reservation "526_alu_shift_op" 2 +diff --git a/gcc/config/arm/fa606te.md b/gcc/config/arm/fa606te.md +index f2c104fb1..9007050ed 100644 +--- a/gcc/config/arm/fa606te.md ++++ b/gcc/config/arm/fa606te.md +@@ -73,7 +73,7 @@ + logic_shift_reg,logics_shift_reg,\ + mov_imm,mov_reg,mov_shift,mov_shift_reg,\ + mvn_imm,mvn_reg,mvn_shift,mvn_shift_reg,\ +- mrs,multiple,no_insn")) ++ mrs,multiple")) + "fa606te_core") + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +diff --git a/gcc/config/arm/fa626te.md b/gcc/config/arm/fa626te.md +index 880090fd7..6bdc2e8b5 100644 +--- a/gcc/config/arm/fa626te.md ++++ b/gcc/config/arm/fa626te.md +@@ -74,7 +74,7 @@ + adr,bfm,rev,\ + shift_imm,shift_reg,\ + mov_imm,mov_reg,mvn_imm,mvn_reg,\ +- mrs,multiple,no_insn")) ++ mrs,multiple")) + "fa626te_core") + + (define_insn_reservation "626te_alu_shift_op" 2 +diff --git a/gcc/config/arm/fa726te.md b/gcc/config/arm/fa726te.md +index cb5fbaf99..f6f2531c8 100644 +--- a/gcc/config/arm/fa726te.md ++++ b/gcc/config/arm/fa726te.md +@@ -91,7 +91,7 @@ + adc_imm,adcs_imm,adc_reg,adcs_reg,\ + adr,bfm,rev,\ + shift_imm,shift_reg,\ +- mrs,multiple,no_insn")) ++ mrs,multiple")) + "fa726te_issue+(fa726te_alu0_pipe|fa726te_alu1_pipe)") + + ;; ALU operations with a shift-by-register operand. +diff --git a/gcc/config/arm/thumb1.md b/gcc/config/arm/thumb1.md +index 041e2db34..f8eb732ac 100644 +--- a/gcc/config/arm/thumb1.md ++++ b/gcc/config/arm/thumb1.md +@@ -985,7 +985,7 @@ + + ;; Thumb block-move insns + +-(define_insn "movmem12b" ++(define_insn "cpymem12b" + [(set (mem:SI (match_operand:SI 2 "register_operand" "0")) + (mem:SI (match_operand:SI 3 "register_operand" "1"))) + (set (mem:SI (plus:SI (match_dup 2) (const_int 4))) +@@ -1007,7 +1007,7 @@ + (set_attr "type" "store_12")] + ) + +-(define_insn "movmem8b" ++(define_insn "cpymem8b" + [(set (mem:SI (match_operand:SI 2 "register_operand" "0")) + (mem:SI (match_operand:SI 3 "register_operand" "1"))) + (set (mem:SI (plus:SI (match_dup 2) (const_int 4))) +diff --git a/gcc/config/arm/types.md b/gcc/config/arm/types.md +index f8f8dd090..60faad659 100644 +--- a/gcc/config/arm/types.md ++++ b/gcc/config/arm/types.md +@@ -546,6 +546,10 @@ + ; The classification below is for coprocessor instructions + ; + ; coproc ++; ++; The classification below is for TME instructions ++; ++; tme + + (define_attr "type" + "adc_imm,\ +@@ -1091,7 +1095,8 @@ + crypto_sha3,\ + crypto_sm3,\ + crypto_sm4,\ +- coproc" ++ coproc,\ ++ tme" + (const_string "untyped")) + + ; Is this an (integer side) multiply with a 32-bit (or smaller) result? 
+@@ -1215,3 +1220,7 @@ + crypto_sha256_fast, crypto_sha256_slow") + (const_string "yes") + (const_string "no"))) ++ ++(define_insn_reservation "no_reservation" 0 ++ (eq_attr "type" "no_insn") ++ "nothing") +diff --git a/gcc/config/arm/xgene1.md b/gcc/config/arm/xgene1.md +index 14156421d..81498daa0 100644 +--- a/gcc/config/arm/xgene1.md ++++ b/gcc/config/arm/xgene1.md +@@ -64,11 +64,6 @@ + (eq_attr "type" "branch")) + "xgene1_decode1op") + +-(define_insn_reservation "xgene1_nop" 1 +- (and (eq_attr "tune" "xgene1") +- (eq_attr "type" "no_insn")) +- "xgene1_decode1op") +- + (define_insn_reservation "xgene1_call" 1 + (and (eq_attr "tune" "xgene1") + (eq_attr "type" "call")) +diff --git a/gcc/config/avr/avr-protos.h b/gcc/config/avr/avr-protos.h +index dd0babbd7..31fe3a66d 100644 +--- a/gcc/config/avr/avr-protos.h ++++ b/gcc/config/avr/avr-protos.h +@@ -82,7 +82,7 @@ extern rtx avr_to_int_mode (rtx); + + extern void avr_expand_prologue (void); + extern void avr_expand_epilogue (bool); +-extern bool avr_emit_movmemhi (rtx*); ++extern bool avr_emit_cpymemhi (rtx*); + extern int avr_epilogue_uses (int regno); + + extern void avr_output_addr_vec (rtx_insn*, rtx); +@@ -92,7 +92,7 @@ extern const char* avr_out_plus (rtx, rtx*, int* =NULL, int* =NULL, bool =true); + extern const char* avr_out_round (rtx_insn *, rtx*, int* =NULL); + extern const char* avr_out_addto_sp (rtx*, int*); + extern const char* avr_out_xload (rtx_insn *, rtx*, int*); +-extern const char* avr_out_movmem (rtx_insn *, rtx*, int*); ++extern const char* avr_out_cpymem (rtx_insn *, rtx*, int*); + extern const char* avr_out_insert_bits (rtx*, int*); + extern bool avr_popcount_each_byte (rtx, int, int); + extern bool avr_has_nibble_0xf (rtx); +diff --git a/gcc/config/avr/avr.c b/gcc/config/avr/avr.c +index cb4b14ae3..3e6e5d2ee 100644 +--- a/gcc/config/avr/avr.c ++++ b/gcc/config/avr/avr.c +@@ -9421,7 +9421,7 @@ avr_adjust_insn_length (rtx_insn *insn, int len) + case ADJUST_LEN_MOV16: output_movhi (insn, op, &len); break; + case ADJUST_LEN_MOV24: avr_out_movpsi (insn, op, &len); break; + case ADJUST_LEN_MOV32: output_movsisf (insn, op, &len); break; +- case ADJUST_LEN_MOVMEM: avr_out_movmem (insn, op, &len); break; ++ case ADJUST_LEN_CPYMEM: avr_out_cpymem (insn, op, &len); break; + case ADJUST_LEN_XLOAD: avr_out_xload (insn, op, &len); break; + case ADJUST_LEN_SEXT: avr_out_sign_extend (insn, op, &len); break; + +@@ -13338,7 +13338,7 @@ avr_emit3_fix_outputs (rtx (*gen)(rtx,rtx,rtx), rtx *op, + } + + +-/* Worker function for movmemhi expander. ++/* Worker function for cpymemhi expander. + XOP[0] Destination as MEM:BLK + XOP[1] Source " " + XOP[2] # Bytes to copy +@@ -13347,7 +13347,7 @@ avr_emit3_fix_outputs (rtx (*gen)(rtx,rtx,rtx), rtx *op, + Return FALSE if the operand compination is not supported. */ + + bool +-avr_emit_movmemhi (rtx *xop) ++avr_emit_cpymemhi (rtx *xop) + { + HOST_WIDE_INT count; + machine_mode loop_mode; +@@ -13424,14 +13424,14 @@ avr_emit_movmemhi (rtx *xop) + Do the copy-loop inline. */ + + rtx (*fun) (rtx, rtx, rtx) +- = QImode == loop_mode ? gen_movmem_qi : gen_movmem_hi; ++ = QImode == loop_mode ? gen_cpymem_qi : gen_cpymem_hi; + + insn = fun (xas, loop_reg, loop_reg); + } + else + { + rtx (*fun) (rtx, rtx) +- = QImode == loop_mode ? gen_movmemx_qi : gen_movmemx_hi; ++ = QImode == loop_mode ? gen_cpymemx_qi : gen_cpymemx_hi; + + emit_move_insn (gen_rtx_REG (QImode, 23), a_hi8); + +@@ -13445,7 +13445,7 @@ avr_emit_movmemhi (rtx *xop) + } + + +-/* Print assembler for movmem_qi, movmem_hi insns... 
++/* Print assembler for cpymem_qi, cpymem_hi insns... + $0 : Address Space + $1, $2 : Loop register + Z : Source address +@@ -13453,7 +13453,7 @@ avr_emit_movmemhi (rtx *xop) + */ + + const char* +-avr_out_movmem (rtx_insn *insn ATTRIBUTE_UNUSED, rtx *op, int *plen) ++avr_out_cpymem (rtx_insn *insn ATTRIBUTE_UNUSED, rtx *op, int *plen) + { + addr_space_t as = (addr_space_t) INTVAL (op[0]); + machine_mode loop_mode = GET_MODE (op[1]); +diff --git a/gcc/config/avr/avr.md b/gcc/config/avr/avr.md +index f263b693c..e85bf4963 100644 +--- a/gcc/config/avr/avr.md ++++ b/gcc/config/avr/avr.md +@@ -70,7 +70,7 @@ + + (define_c_enum "unspec" + [UNSPEC_STRLEN +- UNSPEC_MOVMEM ++ UNSPEC_CPYMEM + UNSPEC_INDEX_JMP + UNSPEC_FMUL + UNSPEC_FMULS +@@ -158,7 +158,7 @@ + tsthi, tstpsi, tstsi, compare, compare64, call, + mov8, mov16, mov24, mov32, reload_in16, reload_in24, reload_in32, + ufract, sfract, round, +- xload, movmem, ++ xload, cpymem, + ashlqi, ashrqi, lshrqi, + ashlhi, ashrhi, lshrhi, + ashlsi, ashrsi, lshrsi, +@@ -992,20 +992,20 @@ + ;;========================================================================= + ;; move string (like memcpy) + +-(define_expand "movmemhi" ++(define_expand "cpymemhi" + [(parallel [(set (match_operand:BLK 0 "memory_operand" "") + (match_operand:BLK 1 "memory_operand" "")) + (use (match_operand:HI 2 "const_int_operand" "")) + (use (match_operand:HI 3 "const_int_operand" ""))])] + "" + { +- if (avr_emit_movmemhi (operands)) ++ if (avr_emit_cpymemhi (operands)) + DONE; + + FAIL; + }) + +-(define_mode_attr MOVMEM_r_d [(QI "r") ++(define_mode_attr CPYMEM_r_d [(QI "r") + (HI "wd")]) + + ;; $0 : Address Space +@@ -1013,23 +1013,23 @@ + ;; R30 : source address + ;; R26 : destination address + +-;; "movmem_qi" +-;; "movmem_hi" +-(define_insn "movmem_" ++;; "cpymem_qi" ++;; "cpymem_hi" ++(define_insn "cpymem_" + [(set (mem:BLK (reg:HI REG_X)) + (mem:BLK (reg:HI REG_Z))) + (unspec [(match_operand:QI 0 "const_int_operand" "n")] +- UNSPEC_MOVMEM) +- (use (match_operand:QIHI 1 "register_operand" "")) ++ UNSPEC_CPYMEM) ++ (use (match_operand:QIHI 1 "register_operand" "")) + (clobber (reg:HI REG_X)) + (clobber (reg:HI REG_Z)) + (clobber (reg:QI LPM_REGNO)) + (clobber (match_operand:QIHI 2 "register_operand" "=1"))] + "" + { +- return avr_out_movmem (insn, operands, NULL); ++ return avr_out_cpymem (insn, operands, NULL); + } +- [(set_attr "adjust_len" "movmem") ++ [(set_attr "adjust_len" "cpymem") + (set_attr "cc" "clobber")]) + + +@@ -1039,14 +1039,14 @@ + ;; R23:Z : 24-bit source address + ;; R26 : 16-bit destination address + +-;; "movmemx_qi" +-;; "movmemx_hi" +-(define_insn "movmemx_" ++;; "cpymemx_qi" ++;; "cpymemx_hi" ++(define_insn "cpymemx_" + [(set (mem:BLK (reg:HI REG_X)) + (mem:BLK (lo_sum:PSI (reg:QI 23) + (reg:HI REG_Z)))) + (unspec [(match_operand:QI 0 "const_int_operand" "n")] +- UNSPEC_MOVMEM) ++ UNSPEC_CPYMEM) + (use (reg:QIHI 24)) + (clobber (reg:HI REG_X)) + (clobber (reg:HI REG_Z)) +diff --git a/gcc/config/bfin/bfin-protos.h b/gcc/config/bfin/bfin-protos.h +index 64a184275..7d0f705e0 100644 +--- a/gcc/config/bfin/bfin-protos.h ++++ b/gcc/config/bfin/bfin-protos.h +@@ -81,7 +81,7 @@ extern bool expand_move (rtx *, machine_mode); + extern void bfin_expand_call (rtx, rtx, rtx, rtx, int); + extern bool bfin_longcall_p (rtx, int); + extern bool bfin_dsp_memref_p (rtx); +-extern bool bfin_expand_movmem (rtx, rtx, rtx, rtx); ++extern bool bfin_expand_cpymem (rtx, rtx, rtx, rtx); + + extern enum reg_class secondary_input_reload_class (enum reg_class, + machine_mode, +diff 
--git a/gcc/config/bfin/bfin.c b/gcc/config/bfin/bfin.c +index 97c2c12d5..288a2ff59 100644 +--- a/gcc/config/bfin/bfin.c ++++ b/gcc/config/bfin/bfin.c +@@ -3208,7 +3208,7 @@ output_pop_multiple (rtx insn, rtx *operands) + /* Adjust DST and SRC by OFFSET bytes, and generate one move in mode MODE. */ + + static void +-single_move_for_movmem (rtx dst, rtx src, machine_mode mode, HOST_WIDE_INT offset) ++single_move_for_cpymem (rtx dst, rtx src, machine_mode mode, HOST_WIDE_INT offset) + { + rtx scratch = gen_reg_rtx (mode); + rtx srcmem, dstmem; +@@ -3224,7 +3224,7 @@ single_move_for_movmem (rtx dst, rtx src, machine_mode mode, HOST_WIDE_INT offse + back on a different method. */ + + bool +-bfin_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp) ++bfin_expand_cpymem (rtx dst, rtx src, rtx count_exp, rtx align_exp) + { + rtx srcreg, destreg, countreg; + HOST_WIDE_INT align = 0; +@@ -3269,7 +3269,7 @@ bfin_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp) + { + if ((count & ~3) == 4) + { +- single_move_for_movmem (dst, src, SImode, offset); ++ single_move_for_cpymem (dst, src, SImode, offset); + offset = 4; + } + else if (count & ~3) +@@ -3282,7 +3282,7 @@ bfin_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp) + } + if (count & 2) + { +- single_move_for_movmem (dst, src, HImode, offset); ++ single_move_for_cpymem (dst, src, HImode, offset); + offset += 2; + } + } +@@ -3290,7 +3290,7 @@ bfin_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp) + { + if ((count & ~1) == 2) + { +- single_move_for_movmem (dst, src, HImode, offset); ++ single_move_for_cpymem (dst, src, HImode, offset); + offset = 2; + } + else if (count & ~1) +@@ -3304,7 +3304,7 @@ bfin_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp) + } + if (count & 1) + { +- single_move_for_movmem (dst, src, QImode, offset); ++ single_move_for_cpymem (dst, src, QImode, offset); + } + return true; + } +diff --git a/gcc/config/bfin/bfin.h b/gcc/config/bfin/bfin.h +index 19b7f819d..4aba596f6 100644 +--- a/gcc/config/bfin/bfin.h ++++ b/gcc/config/bfin/bfin.h +@@ -793,7 +793,7 @@ typedef struct { + #define MOVE_MAX UNITS_PER_WORD + + /* If a memory-to-memory move would take MOVE_RATIO or more simple +- move-instruction pairs, we will do a movmem or libcall instead. */ ++ move-instruction pairs, we will do a cpymem or libcall instead. 
*/ + + #define MOVE_RATIO(speed) 5 + +diff --git a/gcc/config/bfin/bfin.md b/gcc/config/bfin/bfin.md +index ac5892424..6ac208d04 100644 +--- a/gcc/config/bfin/bfin.md ++++ b/gcc/config/bfin/bfin.md +@@ -2316,14 +2316,14 @@ + (set_attr "length" "16") + (set_attr "seq_insns" "multi")]) + +-(define_expand "movmemsi" ++(define_expand "cpymemsi" + [(match_operand:BLK 0 "general_operand" "") + (match_operand:BLK 1 "general_operand" "") + (match_operand:SI 2 "const_int_operand" "") + (match_operand:SI 3 "const_int_operand" "")] + "" + { +- if (bfin_expand_movmem (operands[0], operands[1], operands[2], operands[3])) ++ if (bfin_expand_cpymem (operands[0], operands[1], operands[2], operands[3])) + DONE; + FAIL; + }) +diff --git a/gcc/config/c6x/c6x-protos.h b/gcc/config/c6x/c6x-protos.h +index a657969a2..8c04c315a 100644 +--- a/gcc/config/c6x/c6x-protos.h ++++ b/gcc/config/c6x/c6x-protos.h +@@ -35,7 +35,7 @@ extern bool c6x_long_call_p (rtx); + extern void c6x_expand_call (rtx, rtx, bool); + extern rtx c6x_expand_compare (rtx, machine_mode); + extern bool c6x_force_op_for_comparison_p (enum rtx_code, rtx); +-extern bool c6x_expand_movmem (rtx, rtx, rtx, rtx, rtx, rtx); ++extern bool c6x_expand_cpymem (rtx, rtx, rtx, rtx, rtx, rtx); + + extern rtx c6x_subword (rtx, bool); + extern void split_di (rtx *, int, rtx *, rtx *); +diff --git a/gcc/config/c6x/c6x.c b/gcc/config/c6x/c6x.c +index 9a07c4013..e4176774b 100644 +--- a/gcc/config/c6x/c6x.c ++++ b/gcc/config/c6x/c6x.c +@@ -1683,10 +1683,10 @@ c6x_valid_mask_p (HOST_WIDE_INT val) + return true; + } + +-/* Expand a block move for a movmemM pattern. */ ++/* Expand a block move for a cpymemM pattern. */ + + bool +-c6x_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp, ++c6x_expand_cpymem (rtx dst, rtx src, rtx count_exp, rtx align_exp, + rtx expected_align_exp ATTRIBUTE_UNUSED, + rtx expected_size_exp ATTRIBUTE_UNUSED) + { +diff --git a/gcc/config/c6x/c6x.md b/gcc/config/c6x/c6x.md +index 8218e1dad..f9bf9ba99 100644 +--- a/gcc/config/c6x/c6x.md ++++ b/gcc/config/c6x/c6x.md +@@ -2844,7 +2844,7 @@ + ;; Block moves + ;; ------------------------------------------------------------------------- + +-(define_expand "movmemsi" ++(define_expand "cpymemsi" + [(use (match_operand:BLK 0 "memory_operand" "")) + (use (match_operand:BLK 1 "memory_operand" "")) + (use (match_operand:SI 2 "nonmemory_operand" "")) +@@ -2853,7 +2853,7 @@ + (use (match_operand:SI 5 "const_int_operand" ""))] + "" + { +- if (c6x_expand_movmem (operands[0], operands[1], operands[2], operands[3], ++ if (c6x_expand_cpymem (operands[0], operands[1], operands[2], operands[3], + operands[4], operands[5])) + DONE; + else +diff --git a/gcc/config/darwin.c b/gcc/config/darwin.c +index a7610829f..dcd69698f 100644 +--- a/gcc/config/darwin.c ++++ b/gcc/config/darwin.c +@@ -2150,7 +2150,7 @@ darwin_emit_unwind_label (FILE *file, tree decl, int for_eh, int empty) + if (! for_eh || ! ld_needs_eh_markers) + return; + +- /* FIXME: This only works when the eh for all sections of a function are ++ /* FIXME: This only works when the eh for all sections of a function are + emitted at the same time. If that changes, we would need to use a lookup + table of some form to determine what to do. Also, we should emit the + unadorned label for the partition containing the public label for a +@@ -3325,7 +3325,7 @@ darwin_override_options (void) + + /* Linkers >= ld64-62.1 (at least) are capable of making the necessary PIC + indirections and we no longer need to emit pic symbol stubs. 
+- However, if we are generating code for earlier ones (or for use in the ++ However, if we are generating code for earlier ones (or for use in the + kernel) the stubs might still be required, and this will be set true. + If the user sets it on or off - then that takes precedence. + +@@ -3334,18 +3334,18 @@ darwin_override_options (void) + + if (!global_options_set.x_darwin_symbol_stubs) + { +- if (darwin_target_linker) ++ if (darwin_target_linker) + { + if (strverscmp (darwin_target_linker, MIN_LD64_OMIT_STUBS) < 0) + { + darwin_symbol_stubs = true; + ld_needs_eh_markers = true; + } +- } ++ } + else if (generating_for_darwin_version < 9) + { + /* If we don't know the linker version and we're targeting an old +- system, we know no better than to assume the use of an earlier ++ system, we know no better than to assume the use of an earlier + linker. */ + darwin_symbol_stubs = true; + ld_needs_eh_markers = true; +@@ -3354,7 +3354,7 @@ darwin_override_options (void) + else if (DARWIN_X86 && darwin_symbol_stubs && TARGET_64BIT) + { + inform (input_location, +- "%<-msymbol-stubs%> is not required for 64b code (ignored)"); ++ "%<-mpic-symbol-stubs%> is not required for 64b code (ignored)"); + darwin_symbol_stubs = false; + } + +diff --git a/gcc/config/frv/frv.md b/gcc/config/frv/frv.md +index 064bf53ea..6e8db59fd 100644 +--- a/gcc/config/frv/frv.md ++++ b/gcc/config/frv/frv.md +@@ -1887,7 +1887,7 @@ + ;; Argument 2 is the length + ;; Argument 3 is the alignment + +-(define_expand "movmemsi" ++(define_expand "cpymemsi" + [(parallel [(set (match_operand:BLK 0 "" "") + (match_operand:BLK 1 "" "")) + (use (match_operand:SI 2 "" "")) +diff --git a/gcc/config/ft32/ft32.md b/gcc/config/ft32/ft32.md +index de2394644..9e31f2ca7 100644 +--- a/gcc/config/ft32/ft32.md ++++ b/gcc/config/ft32/ft32.md +@@ -851,7 +851,7 @@ + "stpcpy %b1,%b2 # %0 %b1 %b2" + ) + +-(define_insn "movmemsi" ++(define_insn "cpymemsi" + [(set (match_operand:BLK 0 "memory_operand" "=W,BW") + (match_operand:BLK 1 "memory_operand" "W,BW")) + (use (match_operand:SI 2 "ft32_imm_operand" "KA,KA")) +diff --git a/gcc/config/gcn/gcn.c b/gcc/config/gcn/gcn.c +index eb06ff9e0..480bb22ee 100644 +--- a/gcc/config/gcn/gcn.c ++++ b/gcc/config/gcn/gcn.c +@@ -2495,7 +2495,7 @@ gcn_gimplify_va_arg_expr (tree valist, tree type, + tree t, u; + bool indirect; + +- indirect = pass_by_reference (NULL, TYPE_MODE (type), type, 0); ++ indirect = pass_va_arg_by_reference (type); + if (indirect) + { + type = ptr; +diff --git a/gcc/config/h8300/h8300.md b/gcc/config/h8300/h8300.md +index eb0ae835f..42610fddb 100644 +--- a/gcc/config/h8300/h8300.md ++++ b/gcc/config/h8300/h8300.md +@@ -474,11 +474,11 @@ + (set_attr "length_table" "*,movl") + (set_attr "cc" "set_zn,set_znv")]) + +-;; Implement block moves using movmd. Defining movmemsi allows the full ++;; Implement block copies using movmd. Defining cpymemsi allows the full + ;; range of constant lengths (up to 0x40000 bytes when using movmd.l). + ;; See h8sx_emit_movmd for details. + +-(define_expand "movmemsi" ++(define_expand "cpymemsi" + [(use (match_operand:BLK 0 "memory_operand" "")) + (use (match_operand:BLK 1 "memory_operand" "")) + (use (match_operand:SI 2 "" "")) +diff --git a/gcc/config/i386/i386-builtins.c b/gcc/config/i386/i386-builtins.c +new file mode 100644 +index 000000000..6afb246eb +--- /dev/null ++++ b/gcc/config/i386/i386-builtins.c +@@ -0,0 +1,2539 @@ ++/* Copyright (C) 1988-2019 Free Software Foundation, Inc. ++ ++This file is part of GCC. 
++ ++GCC is free software; you can redistribute it and/or modify ++it under the terms of the GNU General Public License as published by ++the Free Software Foundation; either version 3, or (at your option) ++any later version. ++ ++GCC is distributed in the hope that it will be useful, ++but WITHOUT ANY WARRANTY; without even the implied warranty of ++MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++GNU General Public License for more details. ++ ++You should have received a copy of the GNU General Public License ++along with GCC; see the file COPYING3. If not see ++. */ ++ ++#define IN_TARGET_CODE 1 ++ ++#include "config.h" ++#include "system.h" ++#include "coretypes.h" ++#include "backend.h" ++#include "rtl.h" ++#include "tree.h" ++#include "memmodel.h" ++#include "gimple.h" ++#include "cfghooks.h" ++#include "cfgloop.h" ++#include "df.h" ++#include "tm_p.h" ++#include "stringpool.h" ++#include "expmed.h" ++#include "optabs.h" ++#include "regs.h" ++#include "emit-rtl.h" ++#include "recog.h" ++#include "cgraph.h" ++#include "diagnostic.h" ++#include "cfgbuild.h" ++#include "alias.h" ++#include "fold-const.h" ++#include "attribs.h" ++#include "calls.h" ++#include "stor-layout.h" ++#include "varasm.h" ++#include "output.h" ++#include "insn-attr.h" ++#include "flags.h" ++#include "except.h" ++#include "explow.h" ++#include "expr.h" ++#include "cfgrtl.h" ++#include "common/common-target.h" ++#include "langhooks.h" ++#include "reload.h" ++#include "gimplify.h" ++#include "dwarf2.h" ++#include "tm-constrs.h" ++#include "params.h" ++#include "cselib.h" ++#include "sched-int.h" ++#include "opts.h" ++#include "tree-pass.h" ++#include "context.h" ++#include "pass_manager.h" ++#include "target-globals.h" ++#include "gimple-iterator.h" ++#include "tree-vectorizer.h" ++#include "shrink-wrap.h" ++#include "builtins.h" ++#include "rtl-iter.h" ++#include "tree-iterator.h" ++#include "dbgcnt.h" ++#include "case-cfn-macros.h" ++#include "dojump.h" ++#include "fold-const-call.h" ++#include "tree-vrp.h" ++#include "tree-ssanames.h" ++#include "selftest.h" ++#include "selftest-rtl.h" ++#include "print-rtl.h" ++#include "intl.h" ++#include "ifcvt.h" ++#include "symbol-summary.h" ++#include "ipa-prop.h" ++#include "ipa-fnsummary.h" ++#include "wide-int-bitmask.h" ++#include "tree-vector-builder.h" ++#include "debug.h" ++#include "dwarf2out.h" ++#include "i386-builtins.h" ++ ++#undef BDESC ++#undef BDESC_FIRST ++#undef BDESC_END ++ ++/* Macros for verification of enum ix86_builtins order. 
*/ ++#define BDESC_VERIFY(x, y, z) \ ++ gcc_checking_assert ((x) == (enum ix86_builtins) ((y) + (z))) ++#define BDESC_VERIFYS(x, y, z) \ ++ STATIC_ASSERT ((x) == (enum ix86_builtins) ((y) + (z))) ++ ++BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_FIRST, ++ IX86_BUILTIN__BDESC_COMI_LAST, 1); ++BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_FIRST, ++ IX86_BUILTIN__BDESC_PCMPESTR_LAST, 1); ++BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, ++ IX86_BUILTIN__BDESC_PCMPISTR_LAST, 1); ++BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_FIRST, ++ IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST, 1); ++BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, ++ IX86_BUILTIN__BDESC_ARGS_LAST, 1); ++BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, ++ IX86_BUILTIN__BDESC_ROUND_ARGS_LAST, 1); ++BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_FIRST, ++ IX86_BUILTIN__BDESC_MULTI_ARG_LAST, 1); ++BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_FIRST, ++ IX86_BUILTIN__BDESC_CET_LAST, 1); ++BDESC_VERIFYS (IX86_BUILTIN_MAX, ++ IX86_BUILTIN__BDESC_CET_NORMAL_LAST, 1); ++ ++ ++/* Table for the ix86 builtin non-function types. */ ++static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1]; ++ ++/* Retrieve an element from the above table, building some of ++ the types lazily. */ ++ ++static tree ++ix86_get_builtin_type (enum ix86_builtin_type tcode) ++{ ++ unsigned int index; ++ tree type, itype; ++ ++ gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab)); ++ ++ type = ix86_builtin_type_tab[(int) tcode]; ++ if (type != NULL) ++ return type; ++ ++ gcc_assert (tcode > IX86_BT_LAST_PRIM); ++ if (tcode <= IX86_BT_LAST_VECT) ++ { ++ machine_mode mode; ++ ++ index = tcode - IX86_BT_LAST_PRIM - 1; ++ itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]); ++ mode = ix86_builtin_type_vect_mode[index]; ++ ++ type = build_vector_type_for_mode (itype, mode); ++ } ++ else ++ { ++ int quals; ++ ++ index = tcode - IX86_BT_LAST_VECT - 1; ++ if (tcode <= IX86_BT_LAST_PTR) ++ quals = TYPE_UNQUALIFIED; ++ else ++ quals = TYPE_QUAL_CONST; ++ ++ itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]); ++ if (quals != TYPE_UNQUALIFIED) ++ itype = build_qualified_type (itype, quals); ++ ++ type = build_pointer_type (itype); ++ } ++ ++ ix86_builtin_type_tab[(int) tcode] = type; ++ return type; ++} ++ ++/* Table for the ix86 builtin function types. */ ++static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1]; ++ ++/* Retrieve an element from the above table, building some of ++ the types lazily. 
*/ ++ ++static tree ++ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode) ++{ ++ tree type; ++ ++ gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab)); ++ ++ type = ix86_builtin_func_type_tab[(int) tcode]; ++ if (type != NULL) ++ return type; ++ ++ if (tcode <= IX86_BT_LAST_FUNC) ++ { ++ unsigned start = ix86_builtin_func_start[(int) tcode]; ++ unsigned after = ix86_builtin_func_start[(int) tcode + 1]; ++ tree rtype, atype, args = void_list_node; ++ unsigned i; ++ ++ rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]); ++ for (i = after - 1; i > start; --i) ++ { ++ atype = ix86_get_builtin_type (ix86_builtin_func_args[i]); ++ args = tree_cons (NULL, atype, args); ++ } ++ ++ type = build_function_type (rtype, args); ++ } ++ else ++ { ++ unsigned index = tcode - IX86_BT_LAST_FUNC - 1; ++ enum ix86_builtin_func_type icode; ++ ++ icode = ix86_builtin_func_alias_base[index]; ++ type = ix86_get_builtin_func_type (icode); ++ } ++ ++ ix86_builtin_func_type_tab[(int) tcode] = type; ++ return type; ++} ++ ++/* Table for the ix86 builtin decls. */ ++static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX]; ++ ++struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX]; ++ ++tree get_ix86_builtin (enum ix86_builtins c) ++{ ++ return ix86_builtins[c]; ++} ++ ++/* Bits that can still enable any inclusion of a builtin. */ ++HOST_WIDE_INT deferred_isa_values = 0; ++HOST_WIDE_INT deferred_isa_values2 = 0; ++ ++/* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the ++ MASK and MASK2 of which isa_flags and ix86_isa_flags2 to use in the ++ ix86_builtins_isa array. Stores the function decl in the ix86_builtins ++ array. Returns the function decl or NULL_TREE, if the builtin was not ++ added. ++ ++ If the front end has a special hook for builtin functions, delay adding ++ builtin functions that aren't in the current ISA until the ISA is changed ++ with function specific optimization. Doing so, can save about 300K for the ++ default compiler. When the builtin is expanded, check at that time whether ++ it is valid. ++ ++ If the front end doesn't have a special hook, record all builtins, even if ++ it isn't an instruction set in the current ISA in case the user uses ++ function specific options for a different ISA, so that we don't get scope ++ errors if a builtin is added in the middle of a function scope. */ ++ ++static inline tree ++def_builtin (HOST_WIDE_INT mask, HOST_WIDE_INT mask2, ++ const char *name, ++ enum ix86_builtin_func_type tcode, ++ enum ix86_builtins code) ++{ ++ tree decl = NULL_TREE; ++ ++ /* An instruction may be 64bit only regardless of ISAs. */ ++ if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT) ++ { ++ ix86_builtins_isa[(int) code].isa = mask; ++ ix86_builtins_isa[(int) code].isa2 = mask2; ++ ++ mask &= ~OPTION_MASK_ISA_64BIT; ++ ++ /* Filter out the masks most often ored together with others. 
*/ ++ if ((mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512VL) ++ && mask != OPTION_MASK_ISA_AVX512VL) ++ mask &= ~OPTION_MASK_ISA_AVX512VL; ++ if ((mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512BW) ++ && mask != OPTION_MASK_ISA_AVX512BW) ++ mask &= ~OPTION_MASK_ISA_AVX512BW; ++ ++ if (((mask2 == 0 || (mask2 & ix86_isa_flags2) != 0) ++ && (mask == 0 || (mask & ix86_isa_flags) != 0)) ++ || (lang_hooks.builtin_function ++ == lang_hooks.builtin_function_ext_scope)) ++ { ++ tree type = ix86_get_builtin_func_type (tcode); ++ decl = add_builtin_function (name, type, code, BUILT_IN_MD, ++ NULL, NULL_TREE); ++ ix86_builtins[(int) code] = decl; ++ ix86_builtins_isa[(int) code].set_and_not_built_p = false; ++ } ++ else ++ { ++ /* Just MASK and MASK2 where set_and_not_built_p == true can potentially ++ include a builtin. */ ++ deferred_isa_values |= mask; ++ deferred_isa_values2 |= mask2; ++ ix86_builtins[(int) code] = NULL_TREE; ++ ix86_builtins_isa[(int) code].tcode = tcode; ++ ix86_builtins_isa[(int) code].name = name; ++ ix86_builtins_isa[(int) code].const_p = false; ++ ix86_builtins_isa[(int) code].pure_p = false; ++ ix86_builtins_isa[(int) code].set_and_not_built_p = true; ++ } ++ } ++ ++ return decl; ++} ++ ++/* Like def_builtin, but also marks the function decl "const". */ ++ ++static inline tree ++def_builtin_const (HOST_WIDE_INT mask, HOST_WIDE_INT mask2, const char *name, ++ enum ix86_builtin_func_type tcode, enum ix86_builtins code) ++{ ++ tree decl = def_builtin (mask, mask2, name, tcode, code); ++ if (decl) ++ TREE_READONLY (decl) = 1; ++ else ++ ix86_builtins_isa[(int) code].const_p = true; ++ ++ return decl; ++} ++ ++/* Like def_builtin, but also marks the function decl "pure". */ ++ ++static inline tree ++def_builtin_pure (HOST_WIDE_INT mask, HOST_WIDE_INT mask2, const char *name, ++ enum ix86_builtin_func_type tcode, enum ix86_builtins code) ++{ ++ tree decl = def_builtin (mask, mask2, name, tcode, code); ++ if (decl) ++ DECL_PURE_P (decl) = 1; ++ else ++ ix86_builtins_isa[(int) code].pure_p = true; ++ ++ return decl; ++} ++ ++/* Add any new builtin functions for a given ISA that may not have been ++ declared. This saves a bit of space compared to adding all of the ++ declarations to the tree, even if we didn't use them. */ ++ ++void ++ix86_add_new_builtins (HOST_WIDE_INT isa, HOST_WIDE_INT isa2) ++{ ++ isa &= ~OPTION_MASK_ISA_64BIT; ++ ++ if ((isa & deferred_isa_values) == 0 ++ && (isa2 & deferred_isa_values2) == 0) ++ return; ++ ++ /* Bits in ISA value can be removed from potential isa values. */ ++ deferred_isa_values &= ~isa; ++ deferred_isa_values2 &= ~isa2; ++ ++ int i; ++ tree saved_current_target_pragma = current_target_pragma; ++ current_target_pragma = NULL_TREE; ++ ++ for (i = 0; i < (int)IX86_BUILTIN_MAX; i++) ++ { ++ if (((ix86_builtins_isa[i].isa & isa) != 0 ++ || (ix86_builtins_isa[i].isa2 & isa2) != 0) ++ && ix86_builtins_isa[i].set_and_not_built_p) ++ { ++ tree decl, type; ++ ++ /* Don't define the builtin again. */ ++ ix86_builtins_isa[i].set_and_not_built_p = false; ++ ++ type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode); ++ decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name, ++ type, i, BUILT_IN_MD, NULL, ++ NULL_TREE); ++ ++ ix86_builtins[i] = decl; ++ if (ix86_builtins_isa[i].const_p) ++ TREE_READONLY (decl) = 1; ++ } ++ } ++ ++ current_target_pragma = saved_current_target_pragma; ++} ++ ++/* TM vector builtins. */ ++ ++/* Reuse the existing x86-specific `struct builtin_description' cause ++ we're lazy. Add casts to make them fit. 
*/ ++static const struct builtin_description bdesc_tm[] = ++{ ++ { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI }, ++ { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI }, ++ { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI }, ++ { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI }, ++ { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI }, ++ { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI }, ++ { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI }, ++ ++ { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF }, ++ { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF }, ++ { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF }, ++ { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF }, ++ { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF }, ++ { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF }, ++ { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF }, ++ ++ { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF }, ++ { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF }, ++ { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF }, ++ { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF }, ++ { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF }, ++ { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF }, ++ { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF }, ++ ++ { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID }, ++ { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID }, ++ { OPTION_MASK_ISA_AVX, 0, 
CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID }, ++}; ++ ++/* Initialize the transactional memory vector load/store builtins. */ ++ ++static void ++ix86_init_tm_builtins (void) ++{ ++ enum ix86_builtin_func_type ftype; ++ const struct builtin_description *d; ++ size_t i; ++ tree decl; ++ tree attrs_load, attrs_type_load, attrs_store, attrs_type_store; ++ tree attrs_log, attrs_type_log; ++ ++ if (!flag_tm) ++ return; ++ ++ /* If there are no builtins defined, we must be compiling in a ++ language without trans-mem support. */ ++ if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1)) ++ return; ++ ++ /* Use whatever attributes a normal TM load has. */ ++ decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1); ++ attrs_load = DECL_ATTRIBUTES (decl); ++ attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl)); ++ /* Use whatever attributes a normal TM store has. */ ++ decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1); ++ attrs_store = DECL_ATTRIBUTES (decl); ++ attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl)); ++ /* Use whatever attributes a normal TM log has. */ ++ decl = builtin_decl_explicit (BUILT_IN_TM_LOG); ++ attrs_log = DECL_ATTRIBUTES (decl); ++ attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl)); ++ ++ for (i = 0, d = bdesc_tm; ++ i < ARRAY_SIZE (bdesc_tm); ++ i++, d++) ++ { ++ if ((d->mask & ix86_isa_flags) != 0 ++ || (lang_hooks.builtin_function ++ == lang_hooks.builtin_function_ext_scope)) ++ { ++ tree type, attrs, attrs_type; ++ enum built_in_function code = (enum built_in_function) d->code; ++ ++ ftype = (enum ix86_builtin_func_type) d->flag; ++ type = ix86_get_builtin_func_type (ftype); ++ ++ if (BUILTIN_TM_LOAD_P (code)) ++ { ++ attrs = attrs_load; ++ attrs_type = attrs_type_load; ++ } ++ else if (BUILTIN_TM_STORE_P (code)) ++ { ++ attrs = attrs_store; ++ attrs_type = attrs_type_store; ++ } ++ else ++ { ++ attrs = attrs_log; ++ attrs_type = attrs_type_log; ++ } ++ decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL, ++ /* The builtin without the prefix for ++ calling it directly. */ ++ d->name + strlen ("__builtin_"), ++ attrs); ++ /* add_builtin_function() will set the DECL_ATTRIBUTES, now ++ set the TYPE_ATTRIBUTES. */ ++ decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN); ++ ++ set_builtin_decl (code, decl, false); ++ } ++ } ++} ++ ++/* Set up all the MMX/SSE builtins, even builtins for instructions that are not ++ in the current target ISA to allow the user to compile particular modules ++ with different target specific options that differ from the command line ++ options. */ ++static void ++ix86_init_mmx_sse_builtins (void) ++{ ++ const struct builtin_description * d; ++ enum ix86_builtin_func_type ftype; ++ size_t i; ++ ++ /* Add all special builtins with variable number of operands. */ ++ for (i = 0, d = bdesc_special_args; ++ i < ARRAY_SIZE (bdesc_special_args); ++ i++, d++) ++ { ++ BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, i); ++ if (d->name == 0) ++ continue; ++ ++ ftype = (enum ix86_builtin_func_type) d->flag; ++ def_builtin (d->mask, d->mask2, d->name, ftype, d->code); ++ } ++ BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST, ++ IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, ++ ARRAY_SIZE (bdesc_special_args) - 1); ++ ++ /* Add all builtins with variable number of operands. 
*/ ++ for (i = 0, d = bdesc_args; ++ i < ARRAY_SIZE (bdesc_args); ++ i++, d++) ++ { ++ BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS_FIRST, i); ++ if (d->name == 0) ++ continue; ++ ++ ftype = (enum ix86_builtin_func_type) d->flag; ++ def_builtin_const (d->mask, d->mask2, d->name, ftype, d->code); ++ } ++ BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_LAST, ++ IX86_BUILTIN__BDESC_ARGS_FIRST, ++ ARRAY_SIZE (bdesc_args) - 1); ++ ++ /* Add all builtins with rounding. */ ++ for (i = 0, d = bdesc_round_args; ++ i < ARRAY_SIZE (bdesc_round_args); ++ i++, d++) ++ { ++ BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, i); ++ if (d->name == 0) ++ continue; ++ ++ ftype = (enum ix86_builtin_func_type) d->flag; ++ def_builtin_const (d->mask, d->mask2, d->name, ftype, d->code); ++ } ++ BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_LAST, ++ IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, ++ ARRAY_SIZE (bdesc_round_args) - 1); ++ ++ /* pcmpestr[im] insns. */ ++ for (i = 0, d = bdesc_pcmpestr; ++ i < ARRAY_SIZE (bdesc_pcmpestr); ++ i++, d++) ++ { ++ BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPESTR_FIRST, i); ++ if (d->code == IX86_BUILTIN_PCMPESTRM128) ++ ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT; ++ else ++ ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT; ++ def_builtin_const (d->mask, d->mask2, d->name, ftype, d->code); ++ } ++ BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_LAST, ++ IX86_BUILTIN__BDESC_PCMPESTR_FIRST, ++ ARRAY_SIZE (bdesc_pcmpestr) - 1); ++ ++ /* pcmpistr[im] insns. */ ++ for (i = 0, d = bdesc_pcmpistr; ++ i < ARRAY_SIZE (bdesc_pcmpistr); ++ i++, d++) ++ { ++ BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPISTR_FIRST, i); ++ if (d->code == IX86_BUILTIN_PCMPISTRM128) ++ ftype = V16QI_FTYPE_V16QI_V16QI_INT; ++ else ++ ftype = INT_FTYPE_V16QI_V16QI_INT; ++ def_builtin_const (d->mask, d->mask2, d->name, ftype, d->code); ++ } ++ BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_LAST, ++ IX86_BUILTIN__BDESC_PCMPISTR_FIRST, ++ ARRAY_SIZE (bdesc_pcmpistr) - 1); ++ ++ /* comi/ucomi insns. */ ++ for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++) ++ { ++ BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_COMI_FIRST, i); ++ if (d->mask == OPTION_MASK_ISA_SSE2) ++ ftype = INT_FTYPE_V2DF_V2DF; ++ else ++ ftype = INT_FTYPE_V4SF_V4SF; ++ def_builtin_const (d->mask, d->mask2, d->name, ftype, d->code); ++ } ++ BDESC_VERIFYS (IX86_BUILTIN__BDESC_COMI_LAST, ++ IX86_BUILTIN__BDESC_COMI_FIRST, ++ ARRAY_SIZE (bdesc_comi) - 1); ++ ++ /* SSE */ ++ def_builtin (OPTION_MASK_ISA_SSE, 0, "__builtin_ia32_ldmxcsr", ++ VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR); ++ def_builtin_pure (OPTION_MASK_ISA_SSE, 0, "__builtin_ia32_stmxcsr", ++ UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR); ++ ++ /* SSE or 3DNow!A */ ++ def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A ++ /* As it uses V4HImode, we have to require -mmmx too. */ ++ | OPTION_MASK_ISA_MMX, 0, ++ "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR, ++ IX86_BUILTIN_MASKMOVQ); ++ ++ /* SSE2 */ ++ def_builtin (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_maskmovdqu", ++ VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU); ++ ++ def_builtin (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_clflush", ++ VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH); ++ x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_mfence", ++ VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE); ++ ++ /* SSE3. 
*/ ++ def_builtin (OPTION_MASK_ISA_SSE3, 0, "__builtin_ia32_monitor", ++ VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR); ++ def_builtin (OPTION_MASK_ISA_SSE3, 0, "__builtin_ia32_mwait", ++ VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT); ++ ++ /* AES */ ++ def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0, ++ "__builtin_ia32_aesenc128", ++ V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128); ++ def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0, ++ "__builtin_ia32_aesenclast128", ++ V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128); ++ def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0, ++ "__builtin_ia32_aesdec128", ++ V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128); ++ def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0, ++ "__builtin_ia32_aesdeclast128", ++ V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128); ++ def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0, ++ "__builtin_ia32_aesimc128", ++ V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128); ++ def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0, ++ "__builtin_ia32_aeskeygenassist128", ++ V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128); ++ ++ /* PCLMUL */ ++ def_builtin_const (OPTION_MASK_ISA_PCLMUL | OPTION_MASK_ISA_SSE2, 0, ++ "__builtin_ia32_pclmulqdq128", ++ V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128); ++ ++ /* RDRND */ ++ def_builtin (OPTION_MASK_ISA_RDRND, 0, "__builtin_ia32_rdrand16_step", ++ INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP); ++ def_builtin (OPTION_MASK_ISA_RDRND, 0, "__builtin_ia32_rdrand32_step", ++ INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP); ++ def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT, 0, ++ "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG, ++ IX86_BUILTIN_RDRAND64_STEP); ++ ++ /* AVX2 */ ++ def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv2df", ++ V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT, ++ IX86_BUILTIN_GATHERSIV2DF); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv4df", ++ V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT, ++ IX86_BUILTIN_GATHERSIV4DF); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv2df", ++ V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT, ++ IX86_BUILTIN_GATHERDIV2DF); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv4df", ++ V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT, ++ IX86_BUILTIN_GATHERDIV4DF); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv4sf", ++ V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT, ++ IX86_BUILTIN_GATHERSIV4SF); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv8sf", ++ V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT, ++ IX86_BUILTIN_GATHERSIV8SF); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv4sf", ++ V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT, ++ IX86_BUILTIN_GATHERDIV4SF); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv4sf256", ++ V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT, ++ IX86_BUILTIN_GATHERDIV8SF); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv2di", ++ V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT, ++ IX86_BUILTIN_GATHERSIV2DI); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv4di", ++ V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT, ++ IX86_BUILTIN_GATHERSIV4DI); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv2di", ++ V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT, ++ IX86_BUILTIN_GATHERDIV2DI); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX2, 
0, "__builtin_ia32_gatherdiv4di", ++ V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT, ++ IX86_BUILTIN_GATHERDIV4DI); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv4si", ++ V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT, ++ IX86_BUILTIN_GATHERSIV4SI); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv8si", ++ V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT, ++ IX86_BUILTIN_GATHERSIV8SI); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv4si", ++ V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT, ++ IX86_BUILTIN_GATHERDIV4SI); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv4si256", ++ V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT, ++ IX86_BUILTIN_GATHERDIV8SI); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatheraltsiv4df ", ++ V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT, ++ IX86_BUILTIN_GATHERALTSIV4DF); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatheraltdiv8sf ", ++ V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT, ++ IX86_BUILTIN_GATHERALTDIV8SF); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatheraltsiv4di ", ++ V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT, ++ IX86_BUILTIN_GATHERALTSIV4DI); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatheraltdiv8si ", ++ V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT, ++ IX86_BUILTIN_GATHERALTDIV8SI); ++ ++ /* AVX512F */ ++ def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gathersiv16sf", ++ V16SF_FTYPE_V16SF_PCVOID_V16SI_HI_INT, ++ IX86_BUILTIN_GATHER3SIV16SF); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gathersiv8df", ++ V8DF_FTYPE_V8DF_PCVOID_V8SI_QI_INT, ++ IX86_BUILTIN_GATHER3SIV8DF); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gatherdiv16sf", ++ V8SF_FTYPE_V8SF_PCVOID_V8DI_QI_INT, ++ IX86_BUILTIN_GATHER3DIV16SF); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gatherdiv8df", ++ V8DF_FTYPE_V8DF_PCVOID_V8DI_QI_INT, ++ IX86_BUILTIN_GATHER3DIV8DF); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gathersiv16si", ++ V16SI_FTYPE_V16SI_PCVOID_V16SI_HI_INT, ++ IX86_BUILTIN_GATHER3SIV16SI); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gathersiv8di", ++ V8DI_FTYPE_V8DI_PCVOID_V8SI_QI_INT, ++ IX86_BUILTIN_GATHER3SIV8DI); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gatherdiv16si", ++ V8SI_FTYPE_V8SI_PCVOID_V8DI_QI_INT, ++ IX86_BUILTIN_GATHER3DIV16SI); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gatherdiv8di", ++ V8DI_FTYPE_V8DI_PCVOID_V8DI_QI_INT, ++ IX86_BUILTIN_GATHER3DIV8DI); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gather3altsiv8df ", ++ V8DF_FTYPE_V8DF_PCDOUBLE_V16SI_QI_INT, ++ IX86_BUILTIN_GATHER3ALTSIV8DF); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gather3altdiv16sf ", ++ V16SF_FTYPE_V16SF_PCFLOAT_V8DI_HI_INT, ++ IX86_BUILTIN_GATHER3ALTDIV16SF); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gather3altsiv8di ", ++ V8DI_FTYPE_V8DI_PCINT64_V16SI_QI_INT, ++ IX86_BUILTIN_GATHER3ALTSIV8DI); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gather3altdiv16si ", ++ V16SI_FTYPE_V16SI_PCINT_V8DI_HI_INT, ++ IX86_BUILTIN_GATHER3ALTDIV16SI); ++ ++ def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scattersiv16sf", ++ VOID_FTYPE_PVOID_HI_V16SI_V16SF_INT, ++ IX86_BUILTIN_SCATTERSIV16SF); ++ ++ def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scattersiv8df", ++ VOID_FTYPE_PVOID_QI_V8SI_V8DF_INT, ++ 
IX86_BUILTIN_SCATTERSIV8DF); ++ ++ def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatterdiv16sf", ++ VOID_FTYPE_PVOID_QI_V8DI_V8SF_INT, ++ IX86_BUILTIN_SCATTERDIV16SF); ++ ++ def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatterdiv8df", ++ VOID_FTYPE_PVOID_QI_V8DI_V8DF_INT, ++ IX86_BUILTIN_SCATTERDIV8DF); ++ ++ def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scattersiv16si", ++ VOID_FTYPE_PVOID_HI_V16SI_V16SI_INT, ++ IX86_BUILTIN_SCATTERSIV16SI); ++ ++ def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scattersiv8di", ++ VOID_FTYPE_PVOID_QI_V8SI_V8DI_INT, ++ IX86_BUILTIN_SCATTERSIV8DI); ++ ++ def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatterdiv16si", ++ VOID_FTYPE_PVOID_QI_V8DI_V8SI_INT, ++ IX86_BUILTIN_SCATTERDIV16SI); ++ ++ def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatterdiv8di", ++ VOID_FTYPE_PVOID_QI_V8DI_V8DI_INT, ++ IX86_BUILTIN_SCATTERDIV8DI); ++ ++ /* AVX512VL */ ++ def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv2df", ++ V2DF_FTYPE_V2DF_PCVOID_V4SI_QI_INT, ++ IX86_BUILTIN_GATHER3SIV2DF); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv4df", ++ V4DF_FTYPE_V4DF_PCVOID_V4SI_QI_INT, ++ IX86_BUILTIN_GATHER3SIV4DF); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div2df", ++ V2DF_FTYPE_V2DF_PCVOID_V2DI_QI_INT, ++ IX86_BUILTIN_GATHER3DIV2DF); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div4df", ++ V4DF_FTYPE_V4DF_PCVOID_V4DI_QI_INT, ++ IX86_BUILTIN_GATHER3DIV4DF); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv4sf", ++ V4SF_FTYPE_V4SF_PCVOID_V4SI_QI_INT, ++ IX86_BUILTIN_GATHER3SIV4SF); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv8sf", ++ V8SF_FTYPE_V8SF_PCVOID_V8SI_QI_INT, ++ IX86_BUILTIN_GATHER3SIV8SF); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div4sf", ++ V4SF_FTYPE_V4SF_PCVOID_V2DI_QI_INT, ++ IX86_BUILTIN_GATHER3DIV4SF); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div8sf", ++ V4SF_FTYPE_V4SF_PCVOID_V4DI_QI_INT, ++ IX86_BUILTIN_GATHER3DIV8SF); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv2di", ++ V2DI_FTYPE_V2DI_PCVOID_V4SI_QI_INT, ++ IX86_BUILTIN_GATHER3SIV2DI); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv4di", ++ V4DI_FTYPE_V4DI_PCVOID_V4SI_QI_INT, ++ IX86_BUILTIN_GATHER3SIV4DI); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div2di", ++ V2DI_FTYPE_V2DI_PCVOID_V2DI_QI_INT, ++ IX86_BUILTIN_GATHER3DIV2DI); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div4di", ++ V4DI_FTYPE_V4DI_PCVOID_V4DI_QI_INT, ++ IX86_BUILTIN_GATHER3DIV4DI); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv4si", ++ V4SI_FTYPE_V4SI_PCVOID_V4SI_QI_INT, ++ IX86_BUILTIN_GATHER3SIV4SI); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv8si", ++ V8SI_FTYPE_V8SI_PCVOID_V8SI_QI_INT, ++ IX86_BUILTIN_GATHER3SIV8SI); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div4si", ++ V4SI_FTYPE_V4SI_PCVOID_V2DI_QI_INT, ++ IX86_BUILTIN_GATHER3DIV4SI); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div8si", ++ V4SI_FTYPE_V4SI_PCVOID_V4DI_QI_INT, ++ IX86_BUILTIN_GATHER3DIV8SI); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3altsiv4df ", ++ 
V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_QI_INT, ++ IX86_BUILTIN_GATHER3ALTSIV4DF); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3altdiv8sf ", ++ V8SF_FTYPE_V8SF_PCFLOAT_V4DI_QI_INT, ++ IX86_BUILTIN_GATHER3ALTDIV8SF); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3altsiv4di ", ++ V4DI_FTYPE_V4DI_PCINT64_V8SI_QI_INT, ++ IX86_BUILTIN_GATHER3ALTSIV4DI); ++ ++ def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3altdiv8si ", ++ V8SI_FTYPE_V8SI_PCINT_V4DI_QI_INT, ++ IX86_BUILTIN_GATHER3ALTDIV8SI); ++ ++ def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv8sf", ++ VOID_FTYPE_PVOID_QI_V8SI_V8SF_INT, ++ IX86_BUILTIN_SCATTERSIV8SF); ++ ++ def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv4sf", ++ VOID_FTYPE_PVOID_QI_V4SI_V4SF_INT, ++ IX86_BUILTIN_SCATTERSIV4SF); ++ ++ def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv4df", ++ VOID_FTYPE_PVOID_QI_V4SI_V4DF_INT, ++ IX86_BUILTIN_SCATTERSIV4DF); ++ ++ def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv2df", ++ VOID_FTYPE_PVOID_QI_V4SI_V2DF_INT, ++ IX86_BUILTIN_SCATTERSIV2DF); ++ ++ def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv8sf", ++ VOID_FTYPE_PVOID_QI_V4DI_V4SF_INT, ++ IX86_BUILTIN_SCATTERDIV8SF); ++ ++ def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv4sf", ++ VOID_FTYPE_PVOID_QI_V2DI_V4SF_INT, ++ IX86_BUILTIN_SCATTERDIV4SF); ++ ++ def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv4df", ++ VOID_FTYPE_PVOID_QI_V4DI_V4DF_INT, ++ IX86_BUILTIN_SCATTERDIV4DF); ++ ++ def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv2df", ++ VOID_FTYPE_PVOID_QI_V2DI_V2DF_INT, ++ IX86_BUILTIN_SCATTERDIV2DF); ++ ++ def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv8si", ++ VOID_FTYPE_PVOID_QI_V8SI_V8SI_INT, ++ IX86_BUILTIN_SCATTERSIV8SI); ++ ++ def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv4si", ++ VOID_FTYPE_PVOID_QI_V4SI_V4SI_INT, ++ IX86_BUILTIN_SCATTERSIV4SI); ++ ++ def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv4di", ++ VOID_FTYPE_PVOID_QI_V4SI_V4DI_INT, ++ IX86_BUILTIN_SCATTERSIV4DI); ++ ++ def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv2di", ++ VOID_FTYPE_PVOID_QI_V4SI_V2DI_INT, ++ IX86_BUILTIN_SCATTERSIV2DI); ++ ++ def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv8si", ++ VOID_FTYPE_PVOID_QI_V4DI_V4SI_INT, ++ IX86_BUILTIN_SCATTERDIV8SI); ++ ++ def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv4si", ++ VOID_FTYPE_PVOID_QI_V2DI_V4SI_INT, ++ IX86_BUILTIN_SCATTERDIV4SI); ++ ++ def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv4di", ++ VOID_FTYPE_PVOID_QI_V4DI_V4DI_INT, ++ IX86_BUILTIN_SCATTERDIV4DI); ++ ++ def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv2di", ++ VOID_FTYPE_PVOID_QI_V2DI_V2DI_INT, ++ IX86_BUILTIN_SCATTERDIV2DI); ++ ++ def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatteraltsiv8df ", ++ VOID_FTYPE_PDOUBLE_QI_V16SI_V8DF_INT, ++ IX86_BUILTIN_SCATTERALTSIV8DF); ++ ++ def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatteraltdiv16sf ", ++ VOID_FTYPE_PFLOAT_HI_V8DI_V16SF_INT, ++ IX86_BUILTIN_SCATTERALTDIV16SF); ++ ++ def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatteraltsiv8di ", ++ VOID_FTYPE_PLONGLONG_QI_V16SI_V8DI_INT, ++ IX86_BUILTIN_SCATTERALTSIV8DI); ++ ++ def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatteraltdiv16si ", ++ 
VOID_FTYPE_PINT_HI_V8DI_V16SI_INT, ++ IX86_BUILTIN_SCATTERALTDIV16SI); ++ ++ def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltsiv4df ", ++ VOID_FTYPE_PDOUBLE_QI_V8SI_V4DF_INT, ++ IX86_BUILTIN_SCATTERALTSIV4DF); ++ ++ def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltdiv8sf ", ++ VOID_FTYPE_PFLOAT_QI_V4DI_V8SF_INT, ++ IX86_BUILTIN_SCATTERALTDIV8SF); ++ ++ def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltsiv4di ", ++ VOID_FTYPE_PLONGLONG_QI_V8SI_V4DI_INT, ++ IX86_BUILTIN_SCATTERALTSIV4DI); ++ ++ def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltdiv8si ", ++ VOID_FTYPE_PINT_QI_V4DI_V8SI_INT, ++ IX86_BUILTIN_SCATTERALTDIV8SI); ++ ++ def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltsiv2df ", ++ VOID_FTYPE_PDOUBLE_QI_V4SI_V2DF_INT, ++ IX86_BUILTIN_SCATTERALTSIV2DF); ++ ++ def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltdiv4sf ", ++ VOID_FTYPE_PFLOAT_QI_V2DI_V4SF_INT, ++ IX86_BUILTIN_SCATTERALTDIV4SF); ++ ++ def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltsiv2di ", ++ VOID_FTYPE_PLONGLONG_QI_V4SI_V2DI_INT, ++ IX86_BUILTIN_SCATTERALTSIV2DI); ++ ++ def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltdiv4si ", ++ VOID_FTYPE_PINT_QI_V2DI_V4SI_INT, ++ IX86_BUILTIN_SCATTERALTDIV4SI); ++ ++ /* AVX512PF */ ++ def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_gatherpfdpd", ++ VOID_FTYPE_QI_V8SI_PCVOID_INT_INT, ++ IX86_BUILTIN_GATHERPFDPD); ++ def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_gatherpfdps", ++ VOID_FTYPE_HI_V16SI_PCVOID_INT_INT, ++ IX86_BUILTIN_GATHERPFDPS); ++ def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_gatherpfqpd", ++ VOID_FTYPE_QI_V8DI_PCVOID_INT_INT, ++ IX86_BUILTIN_GATHERPFQPD); ++ def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_gatherpfqps", ++ VOID_FTYPE_QI_V8DI_PCVOID_INT_INT, ++ IX86_BUILTIN_GATHERPFQPS); ++ def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_scatterpfdpd", ++ VOID_FTYPE_QI_V8SI_PCVOID_INT_INT, ++ IX86_BUILTIN_SCATTERPFDPD); ++ def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_scatterpfdps", ++ VOID_FTYPE_HI_V16SI_PCVOID_INT_INT, ++ IX86_BUILTIN_SCATTERPFDPS); ++ def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_scatterpfqpd", ++ VOID_FTYPE_QI_V8DI_PCVOID_INT_INT, ++ IX86_BUILTIN_SCATTERPFQPD); ++ def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_scatterpfqps", ++ VOID_FTYPE_QI_V8DI_PCVOID_INT_INT, ++ IX86_BUILTIN_SCATTERPFQPS); ++ ++ /* SHA */ ++ def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha1msg1", ++ V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG1); ++ def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha1msg2", ++ V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG2); ++ def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha1nexte", ++ V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1NEXTE); ++ def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha1rnds4", ++ V4SI_FTYPE_V4SI_V4SI_INT, IX86_BUILTIN_SHA1RNDS4); ++ def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha256msg1", ++ V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG1); ++ def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha256msg2", ++ V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG2); ++ def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha256rnds2", ++ V4SI_FTYPE_V4SI_V4SI_V4SI, IX86_BUILTIN_SHA256RNDS2); ++ ++ /* RTM. 
*/ ++ def_builtin (OPTION_MASK_ISA_RTM, 0, "__builtin_ia32_xabort", ++ VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT); ++ ++ /* MMX access to the vec_init patterns. */ ++ def_builtin_const (OPTION_MASK_ISA_MMX, 0, "__builtin_ia32_vec_init_v2si", ++ V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI); ++ ++ def_builtin_const (OPTION_MASK_ISA_MMX, 0, "__builtin_ia32_vec_init_v4hi", ++ V4HI_FTYPE_HI_HI_HI_HI, ++ IX86_BUILTIN_VEC_INIT_V4HI); ++ ++ def_builtin_const (OPTION_MASK_ISA_MMX, 0, "__builtin_ia32_vec_init_v8qi", ++ V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI, ++ IX86_BUILTIN_VEC_INIT_V8QI); ++ ++ /* Access to the vec_extract patterns. */ ++ def_builtin_const (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_vec_ext_v2df", ++ DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF); ++ def_builtin_const (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_vec_ext_v2di", ++ DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI); ++ def_builtin_const (OPTION_MASK_ISA_SSE, 0, "__builtin_ia32_vec_ext_v4sf", ++ FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF); ++ def_builtin_const (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_vec_ext_v4si", ++ SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI); ++ def_builtin_const (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_vec_ext_v8hi", ++ HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI); ++ ++ def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A ++ /* As it uses V4HImode, we have to require -mmmx too. */ ++ | OPTION_MASK_ISA_MMX, 0, ++ "__builtin_ia32_vec_ext_v4hi", ++ HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI); ++ ++ def_builtin_const (OPTION_MASK_ISA_MMX, 0, "__builtin_ia32_vec_ext_v2si", ++ SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI); ++ ++ def_builtin_const (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_vec_ext_v16qi", ++ QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI); ++ ++ /* Access to the vec_set patterns. */ ++ def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT, 0, ++ "__builtin_ia32_vec_set_v2di", ++ V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI); ++ ++ def_builtin_const (OPTION_MASK_ISA_SSE4_1, 0, "__builtin_ia32_vec_set_v4sf", ++ V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF); ++ ++ def_builtin_const (OPTION_MASK_ISA_SSE4_1, 0, "__builtin_ia32_vec_set_v4si", ++ V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI); ++ ++ def_builtin_const (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_vec_set_v8hi", ++ V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI); ++ ++ def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A ++ /* As it uses V4HImode, we have to require -mmmx too. 
*/ ++ | OPTION_MASK_ISA_MMX, 0, ++ "__builtin_ia32_vec_set_v4hi", ++ V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI); ++ ++ def_builtin_const (OPTION_MASK_ISA_SSE4_1, 0, "__builtin_ia32_vec_set_v16qi", ++ V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI); ++ ++ /* RDSEED */ ++ def_builtin (OPTION_MASK_ISA_RDSEED, 0, "__builtin_ia32_rdseed_hi_step", ++ INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP); ++ def_builtin (OPTION_MASK_ISA_RDSEED, 0, "__builtin_ia32_rdseed_si_step", ++ INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP); ++ def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT, 0, ++ "__builtin_ia32_rdseed_di_step", ++ INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP); ++ ++ /* ADCX */ ++ def_builtin (0, 0, "__builtin_ia32_addcarryx_u32", ++ UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32); ++ def_builtin (OPTION_MASK_ISA_64BIT, 0, ++ "__builtin_ia32_addcarryx_u64", ++ UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG, ++ IX86_BUILTIN_ADDCARRYX64); ++ ++ /* SBB */ ++ def_builtin (0, 0, "__builtin_ia32_sbb_u32", ++ UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_SBB32); ++ def_builtin (OPTION_MASK_ISA_64BIT, 0, ++ "__builtin_ia32_sbb_u64", ++ UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG, ++ IX86_BUILTIN_SBB64); ++ ++ /* Read/write FLAGS. */ ++ if (TARGET_64BIT) ++ { ++ def_builtin (OPTION_MASK_ISA_64BIT, 0, "__builtin_ia32_readeflags_u64", ++ UINT64_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS); ++ def_builtin (OPTION_MASK_ISA_64BIT, 0, "__builtin_ia32_writeeflags_u64", ++ VOID_FTYPE_UINT64, IX86_BUILTIN_WRITE_FLAGS); ++ } ++ else ++ { ++ def_builtin (0, 0, "__builtin_ia32_readeflags_u32", ++ UNSIGNED_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS); ++ def_builtin (0, 0, "__builtin_ia32_writeeflags_u32", ++ VOID_FTYPE_UNSIGNED, IX86_BUILTIN_WRITE_FLAGS); ++ } ++ ++ /* CLFLUSHOPT. */ ++ def_builtin (OPTION_MASK_ISA_CLFLUSHOPT, 0, "__builtin_ia32_clflushopt", ++ VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSHOPT); ++ ++ /* CLWB. */ ++ def_builtin (OPTION_MASK_ISA_CLWB, 0, "__builtin_ia32_clwb", ++ VOID_FTYPE_PCVOID, IX86_BUILTIN_CLWB); ++ ++ /* MONITORX and MWAITX. */ ++ def_builtin (0, OPTION_MASK_ISA_MWAITX, "__builtin_ia32_monitorx", ++ VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITORX); ++ def_builtin (0, OPTION_MASK_ISA_MWAITX, "__builtin_ia32_mwaitx", ++ VOID_FTYPE_UNSIGNED_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAITX); ++ ++ /* CLZERO. */ ++ def_builtin (0, OPTION_MASK_ISA_CLZERO, "__builtin_ia32_clzero", ++ VOID_FTYPE_PCVOID, IX86_BUILTIN_CLZERO); ++ ++ /* WAITPKG. */ ++ def_builtin (0, OPTION_MASK_ISA_WAITPKG, "__builtin_ia32_umonitor", ++ VOID_FTYPE_PVOID, IX86_BUILTIN_UMONITOR); ++ def_builtin (0, OPTION_MASK_ISA_WAITPKG, "__builtin_ia32_umwait", ++ UINT8_FTYPE_UNSIGNED_UINT64, IX86_BUILTIN_UMWAIT); ++ def_builtin (0, OPTION_MASK_ISA_WAITPKG, "__builtin_ia32_tpause", ++ UINT8_FTYPE_UNSIGNED_UINT64, IX86_BUILTIN_TPAUSE); ++ ++ /* CLDEMOTE. 
*/ ++ def_builtin (0, OPTION_MASK_ISA_CLDEMOTE, "__builtin_ia32_cldemote", ++ VOID_FTYPE_PCVOID, IX86_BUILTIN_CLDEMOTE); ++ ++ /* Add FMA4 multi-arg argument instructions */ ++ for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++) ++ { ++ BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, i); ++ if (d->name == 0) ++ continue; ++ ++ ftype = (enum ix86_builtin_func_type) d->flag; ++ def_builtin_const (d->mask, d->mask2, d->name, ftype, d->code); ++ } ++ BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_LAST, ++ IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, ++ ARRAY_SIZE (bdesc_multi_arg) - 1); ++ ++ /* Add CET intrinsics. */ ++ for (i = 0, d = bdesc_cet; i < ARRAY_SIZE (bdesc_cet); i++, d++) ++ { ++ BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_FIRST, i); ++ if (d->name == 0) ++ continue; ++ ++ ftype = (enum ix86_builtin_func_type) d->flag; ++ def_builtin (d->mask, d->mask2, d->name, ftype, d->code); ++ } ++ BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_LAST, ++ IX86_BUILTIN__BDESC_CET_FIRST, ++ ARRAY_SIZE (bdesc_cet) - 1); ++ ++ for (i = 0, d = bdesc_cet_rdssp; ++ i < ARRAY_SIZE (bdesc_cet_rdssp); ++ i++, d++) ++ { ++ BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_NORMAL_FIRST, i); ++ if (d->name == 0) ++ continue; ++ ++ ftype = (enum ix86_builtin_func_type) d->flag; ++ def_builtin (d->mask, d->mask2, d->name, ftype, d->code); ++ } ++ BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_LAST, ++ IX86_BUILTIN__BDESC_CET_NORMAL_FIRST, ++ ARRAY_SIZE (bdesc_cet_rdssp) - 1); ++} ++ ++#undef BDESC_VERIFY ++#undef BDESC_VERIFYS ++ ++/* Make builtins to detect cpu type and features supported. NAME is ++ the builtin name, CODE is the builtin code, and FTYPE is the function ++ type of the builtin. */ ++ ++static void ++make_cpu_type_builtin (const char* name, int code, ++ enum ix86_builtin_func_type ftype, bool is_const) ++{ ++ tree decl; ++ tree type; ++ ++ type = ix86_get_builtin_func_type (ftype); ++ decl = add_builtin_function (name, type, code, BUILT_IN_MD, ++ NULL, NULL_TREE); ++ gcc_assert (decl != NULL_TREE); ++ ix86_builtins[(int) code] = decl; ++ TREE_READONLY (decl) = is_const; ++} ++ ++/* Make builtins to get CPU type and features supported. The created ++ builtins are : ++ ++ __builtin_cpu_init (), to detect cpu type and features, ++ __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>, ++ __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>. ++ */ ++ ++static void ++ix86_init_platform_type_builtins (void) ++{ ++ make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT, ++ INT_FTYPE_VOID, false); ++ make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS, ++ INT_FTYPE_PCCHAR, true); ++ make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS, ++ INT_FTYPE_PCCHAR, true); ++} ++ ++/* Internal method for ix86_init_builtins. 
*/ ++ ++static void ++ix86_init_builtins_va_builtins_abi (void) ++{ ++ tree ms_va_ref, sysv_va_ref; ++ tree fnvoid_va_end_ms, fnvoid_va_end_sysv; ++ tree fnvoid_va_start_ms, fnvoid_va_start_sysv; ++ tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv; ++ tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE; ++ ++ if (!TARGET_64BIT) ++ return; ++ fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE); ++ fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE); ++ ms_va_ref = build_reference_type (ms_va_list_type_node); ++ sysv_va_ref = build_pointer_type (TREE_TYPE (sysv_va_list_type_node)); ++ ++ fnvoid_va_end_ms = build_function_type_list (void_type_node, ms_va_ref, ++ NULL_TREE); ++ fnvoid_va_start_ms ++ = build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE); ++ fnvoid_va_end_sysv ++ = build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE); ++ fnvoid_va_start_sysv ++ = build_varargs_function_type_list (void_type_node, sysv_va_ref, ++ NULL_TREE); ++ fnvoid_va_copy_ms ++ = build_function_type_list (void_type_node, ms_va_ref, ++ ms_va_list_type_node, NULL_TREE); ++ fnvoid_va_copy_sysv ++ = build_function_type_list (void_type_node, sysv_va_ref, ++ sysv_va_ref, NULL_TREE); ++ ++ add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms, ++ BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms); ++ add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms, ++ BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms); ++ add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms, ++ BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms); ++ add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv, ++ BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv); ++ add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv, ++ BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv); ++ add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv, ++ BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv); ++} ++ ++static void ++ix86_init_builtin_types (void) ++{ ++ tree float80_type_node, const_string_type_node; ++ ++ /* The __float80 type. */ ++ float80_type_node = long_double_type_node; ++ if (TYPE_MODE (float80_type_node) != XFmode) ++ { ++ if (float64x_type_node != NULL_TREE ++ && TYPE_MODE (float64x_type_node) == XFmode) ++ float80_type_node = float64x_type_node; ++ else ++ { ++ /* The __float80 type. */ ++ float80_type_node = make_node (REAL_TYPE); ++ ++ TYPE_PRECISION (float80_type_node) = 80; ++ layout_type (float80_type_node); ++ } ++ } ++ lang_hooks.types.register_builtin_type (float80_type_node, "__float80"); ++ ++ /* The __float128 type. The node has already been created as ++ _Float128, so we only need to register the __float128 name for ++ it. */ ++ lang_hooks.types.register_builtin_type (float128_type_node, "__float128"); ++ ++ const_string_type_node ++ = build_pointer_type (build_qualified_type ++ (char_type_node, TYPE_QUAL_CONST)); ++ ++ /* This macro is built by i386-builtin-types.awk. */ ++ DEFINE_BUILTIN_PRIMITIVE_TYPES; ++} ++ ++void ++ix86_init_builtins (void) ++{ ++ tree ftype, decl; ++ ++ ix86_init_builtin_types (); ++ ++ /* Builtins to get CPU type and features. */ ++ ix86_init_platform_type_builtins (); ++ ++ /* TFmode support builtins. 
*/ ++ def_builtin_const (0, 0, "__builtin_infq", ++ FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ); ++ def_builtin_const (0, 0, "__builtin_huge_valq", ++ FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ); ++ ++ ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_CONST_STRING); ++ decl = add_builtin_function ("__builtin_nanq", ftype, IX86_BUILTIN_NANQ, ++ BUILT_IN_MD, "nanq", NULL_TREE); ++ TREE_READONLY (decl) = 1; ++ ix86_builtins[(int) IX86_BUILTIN_NANQ] = decl; ++ ++ decl = add_builtin_function ("__builtin_nansq", ftype, IX86_BUILTIN_NANSQ, ++ BUILT_IN_MD, "nansq", NULL_TREE); ++ TREE_READONLY (decl) = 1; ++ ix86_builtins[(int) IX86_BUILTIN_NANSQ] = decl; ++ ++ /* We will expand them to normal call if SSE isn't available since ++ they are used by libgcc. */ ++ ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128); ++ decl = add_builtin_function ("__builtin_fabsq", ftype, IX86_BUILTIN_FABSQ, ++ BUILT_IN_MD, "__fabstf2", NULL_TREE); ++ TREE_READONLY (decl) = 1; ++ ix86_builtins[(int) IX86_BUILTIN_FABSQ] = decl; ++ ++ ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128); ++ decl = add_builtin_function ("__builtin_copysignq", ftype, ++ IX86_BUILTIN_COPYSIGNQ, BUILT_IN_MD, ++ "__copysigntf3", NULL_TREE); ++ TREE_READONLY (decl) = 1; ++ ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = decl; ++ ++ ix86_init_tm_builtins (); ++ ix86_init_mmx_sse_builtins (); ++ ++ if (TARGET_LP64) ++ ix86_init_builtins_va_builtins_abi (); ++ ++#ifdef SUBTARGET_INIT_BUILTINS ++ SUBTARGET_INIT_BUILTINS; ++#endif ++} ++ ++/* Return the ix86 builtin for CODE. */ ++ ++tree ++ix86_builtin_decl (unsigned code, bool) ++{ ++ if (code >= IX86_BUILTIN_MAX) ++ return error_mark_node; ++ ++ return ix86_builtins[code]; ++} ++ ++/* This returns the target-specific builtin with code CODE if ++ current_function_decl has visibility on this builtin, which is checked ++ using isa flags. Returns NULL_TREE otherwise. */ ++ ++static tree ix86_get_builtin (enum ix86_builtins code) ++{ ++ struct cl_target_option *opts; ++ tree target_tree = NULL_TREE; ++ ++ /* Determine the isa flags of current_function_decl. */ ++ ++ if (current_function_decl) ++ target_tree = DECL_FUNCTION_SPECIFIC_TARGET (current_function_decl); ++ ++ if (target_tree == NULL) ++ target_tree = target_option_default_node; ++ ++ opts = TREE_TARGET_OPTION (target_tree); ++ ++ if ((ix86_builtins_isa[(int) code].isa & opts->x_ix86_isa_flags) ++ || (ix86_builtins_isa[(int) code].isa2 & opts->x_ix86_isa_flags2)) ++ return ix86_builtin_decl (code, true); ++ else ++ return NULL_TREE; ++} ++ ++/* Vectorization library interface and handlers. */ ++tree (*ix86_veclib_handler) (combined_fn, tree, tree); ++ ++/* Returns a function decl for a vectorized version of the combined function ++ with combined_fn code FN and the result vector type TYPE, or NULL_TREE ++ if it is not available. 
*/ ++ ++tree ++ix86_builtin_vectorized_function (unsigned int fn, tree type_out, ++ tree type_in) ++{ ++ machine_mode in_mode, out_mode; ++ int in_n, out_n; ++ ++ if (TREE_CODE (type_out) != VECTOR_TYPE ++ || TREE_CODE (type_in) != VECTOR_TYPE) ++ return NULL_TREE; ++ ++ out_mode = TYPE_MODE (TREE_TYPE (type_out)); ++ out_n = TYPE_VECTOR_SUBPARTS (type_out); ++ in_mode = TYPE_MODE (TREE_TYPE (type_in)); ++ in_n = TYPE_VECTOR_SUBPARTS (type_in); ++ ++ switch (fn) ++ { ++ CASE_CFN_EXP2: ++ if (out_mode == SFmode && in_mode == SFmode) ++ { ++ if (out_n == 16 && in_n == 16) ++ return ix86_get_builtin (IX86_BUILTIN_EXP2PS); ++ } ++ break; ++ ++ CASE_CFN_IFLOOR: ++ CASE_CFN_LFLOOR: ++ CASE_CFN_LLFLOOR: ++ /* The round insn does not trap on denormals. */ ++ if (flag_trapping_math || !TARGET_SSE4_1) ++ break; ++ ++ if (out_mode == SImode && in_mode == DFmode) ++ { ++ if (out_n == 4 && in_n == 2) ++ return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX); ++ else if (out_n == 8 && in_n == 4) ++ return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256); ++ else if (out_n == 16 && in_n == 8) ++ return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512); ++ } ++ if (out_mode == SImode && in_mode == SFmode) ++ { ++ if (out_n == 4 && in_n == 4) ++ return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX); ++ else if (out_n == 8 && in_n == 8) ++ return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX256); ++ else if (out_n == 16 && in_n == 16) ++ return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX512); ++ } ++ break; ++ ++ CASE_CFN_ICEIL: ++ CASE_CFN_LCEIL: ++ CASE_CFN_LLCEIL: ++ /* The round insn does not trap on denormals. */ ++ if (flag_trapping_math || !TARGET_SSE4_1) ++ break; ++ ++ if (out_mode == SImode && in_mode == DFmode) ++ { ++ if (out_n == 4 && in_n == 2) ++ return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX); ++ else if (out_n == 8 && in_n == 4) ++ return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256); ++ else if (out_n == 16 && in_n == 8) ++ return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512); ++ } ++ if (out_mode == SImode && in_mode == SFmode) ++ { ++ if (out_n == 4 && in_n == 4) ++ return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX); ++ else if (out_n == 8 && in_n == 8) ++ return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX256); ++ else if (out_n == 16 && in_n == 16) ++ return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX512); ++ } ++ break; ++ ++ CASE_CFN_IRINT: ++ CASE_CFN_LRINT: ++ CASE_CFN_LLRINT: ++ if (out_mode == SImode && in_mode == DFmode) ++ { ++ if (out_n == 4 && in_n == 2) ++ return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX); ++ else if (out_n == 8 && in_n == 4) ++ return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX256); ++ else if (out_n == 16 && in_n == 8) ++ return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX512); ++ } ++ if (out_mode == SImode && in_mode == SFmode) ++ { ++ if (out_n == 4 && in_n == 4) ++ return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ); ++ else if (out_n == 8 && in_n == 8) ++ return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ256); ++ else if (out_n == 16 && in_n == 16) ++ return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ512); ++ } ++ break; ++ ++ CASE_CFN_IROUND: ++ CASE_CFN_LROUND: ++ CASE_CFN_LLROUND: ++ /* The round insn does not trap on denormals. 
*/ ++ if (flag_trapping_math || !TARGET_SSE4_1) ++ break; ++ ++ if (out_mode == SImode && in_mode == DFmode) ++ { ++ if (out_n == 4 && in_n == 2) ++ return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX); ++ else if (out_n == 8 && in_n == 4) ++ return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256); ++ else if (out_n == 16 && in_n == 8) ++ return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512); ++ } ++ if (out_mode == SImode && in_mode == SFmode) ++ { ++ if (out_n == 4 && in_n == 4) ++ return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX); ++ else if (out_n == 8 && in_n == 8) ++ return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX256); ++ else if (out_n == 16 && in_n == 16) ++ return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX512); ++ } ++ break; ++ ++ CASE_CFN_FLOOR: ++ /* The round insn does not trap on denormals. */ ++ if (flag_trapping_math || !TARGET_SSE4_1) ++ break; ++ ++ if (out_mode == DFmode && in_mode == DFmode) ++ { ++ if (out_n == 2 && in_n == 2) ++ return ix86_get_builtin (IX86_BUILTIN_FLOORPD); ++ else if (out_n == 4 && in_n == 4) ++ return ix86_get_builtin (IX86_BUILTIN_FLOORPD256); ++ else if (out_n == 8 && in_n == 8) ++ return ix86_get_builtin (IX86_BUILTIN_FLOORPD512); ++ } ++ if (out_mode == SFmode && in_mode == SFmode) ++ { ++ if (out_n == 4 && in_n == 4) ++ return ix86_get_builtin (IX86_BUILTIN_FLOORPS); ++ else if (out_n == 8 && in_n == 8) ++ return ix86_get_builtin (IX86_BUILTIN_FLOORPS256); ++ else if (out_n == 16 && in_n == 16) ++ return ix86_get_builtin (IX86_BUILTIN_FLOORPS512); ++ } ++ break; ++ ++ CASE_CFN_CEIL: ++ /* The round insn does not trap on denormals. */ ++ if (flag_trapping_math || !TARGET_SSE4_1) ++ break; ++ ++ if (out_mode == DFmode && in_mode == DFmode) ++ { ++ if (out_n == 2 && in_n == 2) ++ return ix86_get_builtin (IX86_BUILTIN_CEILPD); ++ else if (out_n == 4 && in_n == 4) ++ return ix86_get_builtin (IX86_BUILTIN_CEILPD256); ++ else if (out_n == 8 && in_n == 8) ++ return ix86_get_builtin (IX86_BUILTIN_CEILPD512); ++ } ++ if (out_mode == SFmode && in_mode == SFmode) ++ { ++ if (out_n == 4 && in_n == 4) ++ return ix86_get_builtin (IX86_BUILTIN_CEILPS); ++ else if (out_n == 8 && in_n == 8) ++ return ix86_get_builtin (IX86_BUILTIN_CEILPS256); ++ else if (out_n == 16 && in_n == 16) ++ return ix86_get_builtin (IX86_BUILTIN_CEILPS512); ++ } ++ break; ++ ++ CASE_CFN_TRUNC: ++ /* The round insn does not trap on denormals. */ ++ if (flag_trapping_math || !TARGET_SSE4_1) ++ break; ++ ++ if (out_mode == DFmode && in_mode == DFmode) ++ { ++ if (out_n == 2 && in_n == 2) ++ return ix86_get_builtin (IX86_BUILTIN_TRUNCPD); ++ else if (out_n == 4 && in_n == 4) ++ return ix86_get_builtin (IX86_BUILTIN_TRUNCPD256); ++ else if (out_n == 8 && in_n == 8) ++ return ix86_get_builtin (IX86_BUILTIN_TRUNCPD512); ++ } ++ if (out_mode == SFmode && in_mode == SFmode) ++ { ++ if (out_n == 4 && in_n == 4) ++ return ix86_get_builtin (IX86_BUILTIN_TRUNCPS); ++ else if (out_n == 8 && in_n == 8) ++ return ix86_get_builtin (IX86_BUILTIN_TRUNCPS256); ++ else if (out_n == 16 && in_n == 16) ++ return ix86_get_builtin (IX86_BUILTIN_TRUNCPS512); ++ } ++ break; ++ ++ CASE_CFN_RINT: ++ /* The round insn does not trap on denormals. 
*/ ++ if (flag_trapping_math || !TARGET_SSE4_1) ++ break; ++ ++ if (out_mode == DFmode && in_mode == DFmode) ++ { ++ if (out_n == 2 && in_n == 2) ++ return ix86_get_builtin (IX86_BUILTIN_RINTPD); ++ else if (out_n == 4 && in_n == 4) ++ return ix86_get_builtin (IX86_BUILTIN_RINTPD256); ++ } ++ if (out_mode == SFmode && in_mode == SFmode) ++ { ++ if (out_n == 4 && in_n == 4) ++ return ix86_get_builtin (IX86_BUILTIN_RINTPS); ++ else if (out_n == 8 && in_n == 8) ++ return ix86_get_builtin (IX86_BUILTIN_RINTPS256); ++ } ++ break; ++ ++ CASE_CFN_FMA: ++ if (out_mode == DFmode && in_mode == DFmode) ++ { ++ if (out_n == 2 && in_n == 2) ++ return ix86_get_builtin (IX86_BUILTIN_VFMADDPD); ++ if (out_n == 4 && in_n == 4) ++ return ix86_get_builtin (IX86_BUILTIN_VFMADDPD256); ++ } ++ if (out_mode == SFmode && in_mode == SFmode) ++ { ++ if (out_n == 4 && in_n == 4) ++ return ix86_get_builtin (IX86_BUILTIN_VFMADDPS); ++ if (out_n == 8 && in_n == 8) ++ return ix86_get_builtin (IX86_BUILTIN_VFMADDPS256); ++ } ++ break; ++ ++ default: ++ break; ++ } ++ ++ /* Dispatch to a handler for a vectorization library. */ ++ if (ix86_veclib_handler) ++ return ix86_veclib_handler (combined_fn (fn), type_out, type_in); ++ ++ return NULL_TREE; ++} ++ ++/* Returns a decl of a function that implements gather load with ++ memory type MEM_VECTYPE and index type INDEX_VECTYPE and SCALE. ++ Return NULL_TREE if it is not available. */ ++ ++tree ++ix86_vectorize_builtin_gather (const_tree mem_vectype, ++ const_tree index_type, int scale) ++{ ++ bool si; ++ enum ix86_builtins code; ++ ++ if (! TARGET_AVX2 || !TARGET_USE_GATHER) ++ return NULL_TREE; ++ ++ if ((TREE_CODE (index_type) != INTEGER_TYPE ++ && !POINTER_TYPE_P (index_type)) ++ || (TYPE_MODE (index_type) != SImode ++ && TYPE_MODE (index_type) != DImode)) ++ return NULL_TREE; ++ ++ if (TYPE_PRECISION (index_type) > POINTER_SIZE) ++ return NULL_TREE; ++ ++ /* v*gather* insn sign extends index to pointer mode. */ ++ if (TYPE_PRECISION (index_type) < POINTER_SIZE ++ && TYPE_UNSIGNED (index_type)) ++ return NULL_TREE; ++ ++ if (scale <= 0 ++ || scale > 8 ++ || (scale & (scale - 1)) != 0) ++ return NULL_TREE; ++ ++ si = TYPE_MODE (index_type) == SImode; ++ switch (TYPE_MODE (mem_vectype)) ++ { ++ case E_V2DFmode: ++ if (TARGET_AVX512VL) ++ code = si ? IX86_BUILTIN_GATHER3SIV2DF : IX86_BUILTIN_GATHER3DIV2DF; ++ else ++ code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF; ++ break; ++ case E_V4DFmode: ++ if (TARGET_AVX512VL) ++ code = si ? IX86_BUILTIN_GATHER3ALTSIV4DF : IX86_BUILTIN_GATHER3DIV4DF; ++ else ++ code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF; ++ break; ++ case E_V2DImode: ++ if (TARGET_AVX512VL) ++ code = si ? IX86_BUILTIN_GATHER3SIV2DI : IX86_BUILTIN_GATHER3DIV2DI; ++ else ++ code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI; ++ break; ++ case E_V4DImode: ++ if (TARGET_AVX512VL) ++ code = si ? IX86_BUILTIN_GATHER3ALTSIV4DI : IX86_BUILTIN_GATHER3DIV4DI; ++ else ++ code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI; ++ break; ++ case E_V4SFmode: ++ if (TARGET_AVX512VL) ++ code = si ? IX86_BUILTIN_GATHER3SIV4SF : IX86_BUILTIN_GATHER3DIV4SF; ++ else ++ code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF; ++ break; ++ case E_V8SFmode: ++ if (TARGET_AVX512VL) ++ code = si ? IX86_BUILTIN_GATHER3SIV8SF : IX86_BUILTIN_GATHER3ALTDIV8SF; ++ else ++ code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF; ++ break; ++ case E_V4SImode: ++ if (TARGET_AVX512VL) ++ code = si ? 
IX86_BUILTIN_GATHER3SIV4SI : IX86_BUILTIN_GATHER3DIV4SI; ++ else ++ code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI; ++ break; ++ case E_V8SImode: ++ if (TARGET_AVX512VL) ++ code = si ? IX86_BUILTIN_GATHER3SIV8SI : IX86_BUILTIN_GATHER3ALTDIV8SI; ++ else ++ code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI; ++ break; ++ case E_V8DFmode: ++ if (TARGET_AVX512F) ++ code = si ? IX86_BUILTIN_GATHER3ALTSIV8DF : IX86_BUILTIN_GATHER3DIV8DF; ++ else ++ return NULL_TREE; ++ break; ++ case E_V8DImode: ++ if (TARGET_AVX512F) ++ code = si ? IX86_BUILTIN_GATHER3ALTSIV8DI : IX86_BUILTIN_GATHER3DIV8DI; ++ else ++ return NULL_TREE; ++ break; ++ case E_V16SFmode: ++ if (TARGET_AVX512F) ++ code = si ? IX86_BUILTIN_GATHER3SIV16SF : IX86_BUILTIN_GATHER3ALTDIV16SF; ++ else ++ return NULL_TREE; ++ break; ++ case E_V16SImode: ++ if (TARGET_AVX512F) ++ code = si ? IX86_BUILTIN_GATHER3SIV16SI : IX86_BUILTIN_GATHER3ALTDIV16SI; ++ else ++ return NULL_TREE; ++ break; ++ default: ++ return NULL_TREE; ++ } ++ ++ return ix86_get_builtin (code); ++} ++ ++/* Returns a code for a target-specific builtin that implements ++ reciprocal of the function, or NULL_TREE if not available. */ ++ ++tree ++ix86_builtin_reciprocal (tree fndecl) ++{ ++ enum ix86_builtins fn_code ++ = (enum ix86_builtins) DECL_MD_FUNCTION_CODE (fndecl); ++ switch (fn_code) ++ { ++ /* Vectorized version of sqrt to rsqrt conversion. */ ++ case IX86_BUILTIN_SQRTPS_NR: ++ return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR); ++ ++ case IX86_BUILTIN_SQRTPS_NR256: ++ return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR256); ++ ++ default: ++ return NULL_TREE; ++ } ++} ++ ++/* Priority of i386 features, greater value is higher priority. This is ++ used to decide the order in which function dispatch must happen. For ++ instance, a version specialized for SSE4.2 should be checked for dispatch ++ before a version for SSE3, as SSE4.2 implies SSE3. */ ++enum feature_priority ++{ ++ P_ZERO = 0, ++ P_MMX, ++ P_SSE, ++ P_SSE2, ++ P_SSE3, ++ P_SSSE3, ++ P_PROC_SSSE3, ++ P_SSE4_A, ++ P_PROC_SSE4_A, ++ P_SSE4_1, ++ P_SSE4_2, ++ P_PROC_SSE4_2, ++ P_POPCNT, ++ P_AES, ++ P_PCLMUL, ++ P_AVX, ++ P_PROC_AVX, ++ P_BMI, ++ P_PROC_BMI, ++ P_FMA4, ++ P_XOP, ++ P_PROC_XOP, ++ P_FMA, ++ P_PROC_FMA, ++ P_BMI2, ++ P_AVX2, ++ P_PROC_AVX2, ++ P_AVX512F, ++ P_PROC_AVX512F ++}; ++ ++/* This is the order of bit-fields in __processor_features in cpuinfo.c */ ++enum processor_features ++{ ++ F_CMOV = 0, ++ F_MMX, ++ F_POPCNT, ++ F_SSE, ++ F_SSE2, ++ F_SSE3, ++ F_SSSE3, ++ F_SSE4_1, ++ F_SSE4_2, ++ F_AVX, ++ F_AVX2, ++ F_SSE4_A, ++ F_FMA4, ++ F_XOP, ++ F_FMA, ++ F_AVX512F, ++ F_BMI, ++ F_BMI2, ++ F_AES, ++ F_PCLMUL, ++ F_AVX512VL, ++ F_AVX512BW, ++ F_AVX512DQ, ++ F_AVX512CD, ++ F_AVX512ER, ++ F_AVX512PF, ++ F_AVX512VBMI, ++ F_AVX512IFMA, ++ F_AVX5124VNNIW, ++ F_AVX5124FMAPS, ++ F_AVX512VPOPCNTDQ, ++ F_AVX512VBMI2, ++ F_GFNI, ++ F_VPCLMULQDQ, ++ F_AVX512VNNI, ++ F_AVX512BITALG, ++ F_MAX ++}; ++ ++/* These are the values for vendor types and cpu types and subtypes ++ in cpuinfo.c. Cpu types and subtypes should be subtracted by ++ the corresponding start value. 
*/ ++enum processor_model ++{ ++ M_INTEL = 1, ++ M_AMD, ++ M_CPU_TYPE_START, ++ M_INTEL_BONNELL, ++ M_INTEL_CORE2, ++ M_INTEL_COREI7, ++ M_AMDFAM10H, ++ M_AMDFAM15H, ++ M_INTEL_SILVERMONT, ++ M_INTEL_KNL, ++ M_AMD_BTVER1, ++ M_AMD_BTVER2, ++ M_AMDFAM17H, ++ M_INTEL_KNM, ++ M_INTEL_GOLDMONT, ++ M_INTEL_GOLDMONT_PLUS, ++ M_INTEL_TREMONT, ++ M_CPU_SUBTYPE_START, ++ M_INTEL_COREI7_NEHALEM, ++ M_INTEL_COREI7_WESTMERE, ++ M_INTEL_COREI7_SANDYBRIDGE, ++ M_AMDFAM10H_BARCELONA, ++ M_AMDFAM10H_SHANGHAI, ++ M_AMDFAM10H_ISTANBUL, ++ M_AMDFAM15H_BDVER1, ++ M_AMDFAM15H_BDVER2, ++ M_AMDFAM15H_BDVER3, ++ M_AMDFAM15H_BDVER4, ++ M_AMDFAM17H_ZNVER1, ++ M_INTEL_COREI7_IVYBRIDGE, ++ M_INTEL_COREI7_HASWELL, ++ M_INTEL_COREI7_BROADWELL, ++ M_INTEL_COREI7_SKYLAKE, ++ M_INTEL_COREI7_SKYLAKE_AVX512, ++ M_INTEL_COREI7_CANNONLAKE, ++ M_INTEL_COREI7_ICELAKE_CLIENT, ++ M_INTEL_COREI7_ICELAKE_SERVER, ++ M_AMDFAM17H_ZNVER2, ++ M_INTEL_COREI7_CASCADELAKE ++}; ++ ++struct _arch_names_table ++{ ++ const char *const name; ++ const enum processor_model model; ++}; ++ ++static const _arch_names_table arch_names_table[] = ++{ ++ {"amd", M_AMD}, ++ {"intel", M_INTEL}, ++ {"atom", M_INTEL_BONNELL}, ++ {"slm", M_INTEL_SILVERMONT}, ++ {"core2", M_INTEL_CORE2}, ++ {"corei7", M_INTEL_COREI7}, ++ {"nehalem", M_INTEL_COREI7_NEHALEM}, ++ {"westmere", M_INTEL_COREI7_WESTMERE}, ++ {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE}, ++ {"ivybridge", M_INTEL_COREI7_IVYBRIDGE}, ++ {"haswell", M_INTEL_COREI7_HASWELL}, ++ {"broadwell", M_INTEL_COREI7_BROADWELL}, ++ {"skylake", M_INTEL_COREI7_SKYLAKE}, ++ {"skylake-avx512", M_INTEL_COREI7_SKYLAKE_AVX512}, ++ {"cannonlake", M_INTEL_COREI7_CANNONLAKE}, ++ {"icelake-client", M_INTEL_COREI7_ICELAKE_CLIENT}, ++ {"icelake-server", M_INTEL_COREI7_ICELAKE_SERVER}, ++ {"cascadelake", M_INTEL_COREI7_CASCADELAKE}, ++ {"bonnell", M_INTEL_BONNELL}, ++ {"silvermont", M_INTEL_SILVERMONT}, ++ {"goldmont", M_INTEL_GOLDMONT}, ++ {"goldmont-plus", M_INTEL_GOLDMONT_PLUS}, ++ {"tremont", M_INTEL_TREMONT}, ++ {"knl", M_INTEL_KNL}, ++ {"knm", M_INTEL_KNM}, ++ {"amdfam10h", M_AMDFAM10H}, ++ {"barcelona", M_AMDFAM10H_BARCELONA}, ++ {"shanghai", M_AMDFAM10H_SHANGHAI}, ++ {"istanbul", M_AMDFAM10H_ISTANBUL}, ++ {"btver1", M_AMD_BTVER1}, ++ {"amdfam15h", M_AMDFAM15H}, ++ {"bdver1", M_AMDFAM15H_BDVER1}, ++ {"bdver2", M_AMDFAM15H_BDVER2}, ++ {"bdver3", M_AMDFAM15H_BDVER3}, ++ {"bdver4", M_AMDFAM15H_BDVER4}, ++ {"btver2", M_AMD_BTVER2}, ++ {"amdfam17h", M_AMDFAM17H}, ++ {"znver1", M_AMDFAM17H_ZNVER1}, ++ {"znver2", M_AMDFAM17H_ZNVER2}, ++}; ++ ++/* These are the target attribute strings for which a dispatcher is ++ available, from fold_builtin_cpu. 
*/ ++struct _isa_names_table ++{ ++ const char *const name; ++ const enum processor_features feature; ++ const enum feature_priority priority; ++}; ++ ++static const _isa_names_table isa_names_table[] = ++{ ++ {"cmov", F_CMOV, P_ZERO}, ++ {"mmx", F_MMX, P_MMX}, ++ {"popcnt", F_POPCNT, P_POPCNT}, ++ {"sse", F_SSE, P_SSE}, ++ {"sse2", F_SSE2, P_SSE2}, ++ {"sse3", F_SSE3, P_SSE3}, ++ {"ssse3", F_SSSE3, P_SSSE3}, ++ {"sse4a", F_SSE4_A, P_SSE4_A}, ++ {"sse4.1", F_SSE4_1, P_SSE4_1}, ++ {"sse4.2", F_SSE4_2, P_SSE4_2}, ++ {"avx", F_AVX, P_AVX}, ++ {"fma4", F_FMA4, P_FMA4}, ++ {"xop", F_XOP, P_XOP}, ++ {"fma", F_FMA, P_FMA}, ++ {"avx2", F_AVX2, P_AVX2}, ++ {"avx512f", F_AVX512F, P_AVX512F}, ++ {"bmi", F_BMI, P_BMI}, ++ {"bmi2", F_BMI2, P_BMI2}, ++ {"aes", F_AES, P_AES}, ++ {"pclmul", F_PCLMUL, P_PCLMUL}, ++ {"avx512vl",F_AVX512VL, P_ZERO}, ++ {"avx512bw",F_AVX512BW, P_ZERO}, ++ {"avx512dq",F_AVX512DQ, P_ZERO}, ++ {"avx512cd",F_AVX512CD, P_ZERO}, ++ {"avx512er",F_AVX512ER, P_ZERO}, ++ {"avx512pf",F_AVX512PF, P_ZERO}, ++ {"avx512vbmi",F_AVX512VBMI, P_ZERO}, ++ {"avx512ifma",F_AVX512IFMA, P_ZERO}, ++ {"avx5124vnniw",F_AVX5124VNNIW, P_ZERO}, ++ {"avx5124fmaps",F_AVX5124FMAPS, P_ZERO}, ++ {"avx512vpopcntdq",F_AVX512VPOPCNTDQ, P_ZERO}, ++ {"avx512vbmi2", F_AVX512VBMI2, P_ZERO}, ++ {"gfni", F_GFNI, P_ZERO}, ++ {"vpclmulqdq", F_VPCLMULQDQ, P_ZERO}, ++ {"avx512vnni", F_AVX512VNNI, P_ZERO}, ++ {"avx512bitalg", F_AVX512BITALG, P_ZERO} ++}; ++ ++/* This parses the attribute arguments to target in DECL and determines ++ the right builtin to use to match the platform specification. ++ It returns the priority value for this version decl. If PREDICATE_LIST ++ is not NULL, it stores the list of cpu features that need to be checked ++ before dispatching this function. */ ++ ++unsigned int ++get_builtin_code_for_version (tree decl, tree *predicate_list) ++{ ++ tree attrs; ++ struct cl_target_option cur_target; ++ tree target_node; ++ struct cl_target_option *new_target; ++ const char *arg_str = NULL; ++ const char *attrs_str = NULL; ++ char *tok_str = NULL; ++ char *token; ++ ++ enum feature_priority priority = P_ZERO; ++ ++ static unsigned int NUM_FEATURES ++ = sizeof (isa_names_table) / sizeof (_isa_names_table); ++ ++ unsigned int i; ++ ++ tree predicate_chain = NULL_TREE; ++ tree predicate_decl, predicate_arg; ++ ++ attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl)); ++ gcc_assert (attrs != NULL); ++ ++ attrs = TREE_VALUE (TREE_VALUE (attrs)); ++ ++ gcc_assert (TREE_CODE (attrs) == STRING_CST); ++ attrs_str = TREE_STRING_POINTER (attrs); ++ ++ /* Return priority zero for default function. */ ++ if (strcmp (attrs_str, "default") == 0) ++ return 0; ++ ++ /* Handle arch= if specified. For priority, set it to be 1 more than ++ the best instruction set the processor can handle. For instance, if ++ there is a version for atom and a version for ssse3 (the highest ISA ++ priority for atom), the atom version must be checked for dispatch ++ before the ssse3 version. 
*/ ++ if (strstr (attrs_str, "arch=") != NULL) ++ { ++ cl_target_option_save (&cur_target, &global_options); ++ target_node ++ = ix86_valid_target_attribute_tree (decl, attrs, &global_options, ++ &global_options_set, 0); ++ ++ gcc_assert (target_node); ++ if (target_node == error_mark_node) ++ return 0; ++ new_target = TREE_TARGET_OPTION (target_node); ++ gcc_assert (new_target); ++ ++ if (new_target->arch_specified && new_target->arch > 0) ++ { ++ switch (new_target->arch) ++ { ++ case PROCESSOR_CORE2: ++ arg_str = "core2"; ++ priority = P_PROC_SSSE3; ++ break; ++ case PROCESSOR_NEHALEM: ++ if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_PCLMUL) ++ { ++ arg_str = "westmere"; ++ priority = P_PCLMUL; ++ } ++ else ++ { ++ /* We translate "arch=corei7" and "arch=nehalem" to ++ "corei7" so that it will be mapped to M_INTEL_COREI7 ++ as cpu type to cover all M_INTEL_COREI7_XXXs. */ ++ arg_str = "corei7"; ++ priority = P_PROC_SSE4_2; ++ } ++ break; ++ case PROCESSOR_SANDYBRIDGE: ++ if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_F16C) ++ arg_str = "ivybridge"; ++ else ++ arg_str = "sandybridge"; ++ priority = P_PROC_AVX; ++ break; ++ case PROCESSOR_HASWELL: ++ if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_ADX) ++ arg_str = "broadwell"; ++ else ++ arg_str = "haswell"; ++ priority = P_PROC_AVX2; ++ break; ++ case PROCESSOR_SKYLAKE: ++ arg_str = "skylake"; ++ priority = P_PROC_AVX2; ++ break; ++ case PROCESSOR_SKYLAKE_AVX512: ++ arg_str = "skylake-avx512"; ++ priority = P_PROC_AVX512F; ++ break; ++ case PROCESSOR_CANNONLAKE: ++ arg_str = "cannonlake"; ++ priority = P_PROC_AVX512F; ++ break; ++ case PROCESSOR_ICELAKE_CLIENT: ++ arg_str = "icelake-client"; ++ priority = P_PROC_AVX512F; ++ break; ++ case PROCESSOR_ICELAKE_SERVER: ++ arg_str = "icelake-server"; ++ priority = P_PROC_AVX512F; ++ break; ++ case PROCESSOR_CASCADELAKE: ++ arg_str = "cascadelake"; ++ priority = P_PROC_AVX512F; ++ break; ++ case PROCESSOR_BONNELL: ++ arg_str = "bonnell"; ++ priority = P_PROC_SSSE3; ++ break; ++ case PROCESSOR_KNL: ++ arg_str = "knl"; ++ priority = P_PROC_AVX512F; ++ break; ++ case PROCESSOR_KNM: ++ arg_str = "knm"; ++ priority = P_PROC_AVX512F; ++ break; ++ case PROCESSOR_SILVERMONT: ++ arg_str = "silvermont"; ++ priority = P_PROC_SSE4_2; ++ break; ++ case PROCESSOR_GOLDMONT: ++ arg_str = "goldmont"; ++ priority = P_PROC_SSE4_2; ++ break; ++ case PROCESSOR_GOLDMONT_PLUS: ++ arg_str = "goldmont-plus"; ++ priority = P_PROC_SSE4_2; ++ break; ++ case PROCESSOR_TREMONT: ++ arg_str = "tremont"; ++ priority = P_PROC_SSE4_2; ++ break; ++ case PROCESSOR_AMDFAM10: ++ arg_str = "amdfam10h"; ++ priority = P_PROC_SSE4_A; ++ break; ++ case PROCESSOR_BTVER1: ++ arg_str = "btver1"; ++ priority = P_PROC_SSE4_A; ++ break; ++ case PROCESSOR_BTVER2: ++ arg_str = "btver2"; ++ priority = P_PROC_BMI; ++ break; ++ case PROCESSOR_BDVER1: ++ arg_str = "bdver1"; ++ priority = P_PROC_XOP; ++ break; ++ case PROCESSOR_BDVER2: ++ arg_str = "bdver2"; ++ priority = P_PROC_FMA; ++ break; ++ case PROCESSOR_BDVER3: ++ arg_str = "bdver3"; ++ priority = P_PROC_FMA; ++ break; ++ case PROCESSOR_BDVER4: ++ arg_str = "bdver4"; ++ priority = P_PROC_AVX2; ++ break; ++ case PROCESSOR_ZNVER1: ++ arg_str = "znver1"; ++ priority = P_PROC_AVX2; ++ break; ++ case PROCESSOR_ZNVER2: ++ arg_str = "znver2"; ++ priority = P_PROC_AVX2; ++ break; ++ } ++ } ++ ++ cl_target_option_restore (&global_options, &cur_target); ++ ++ if (predicate_list && arg_str == NULL) ++ { ++ error_at (DECL_SOURCE_LOCATION (decl), ++ "no dispatcher found for the versioning 
attributes"); ++ return 0; ++ } ++ ++ if (predicate_list) ++ { ++ predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS]; ++ /* For a C string literal the length includes the trailing NULL. */ ++ predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str); ++ predicate_chain = tree_cons (predicate_decl, predicate_arg, ++ predicate_chain); ++ } ++ } ++ ++ /* Process feature name. */ ++ tok_str = (char *) xmalloc (strlen (attrs_str) + 1); ++ strcpy (tok_str, attrs_str); ++ token = strtok (tok_str, ","); ++ predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS]; ++ ++ while (token != NULL) ++ { ++ /* Do not process "arch=" */ ++ if (strncmp (token, "arch=", 5) == 0) ++ { ++ token = strtok (NULL, ","); ++ continue; ++ } ++ for (i = 0; i < NUM_FEATURES; ++i) ++ { ++ if (strcmp (token, isa_names_table[i].name) == 0) ++ { ++ if (predicate_list) ++ { ++ predicate_arg = build_string_literal ( ++ strlen (isa_names_table[i].name) + 1, ++ isa_names_table[i].name); ++ predicate_chain = tree_cons (predicate_decl, predicate_arg, ++ predicate_chain); ++ } ++ /* Find the maximum priority feature. */ ++ if (isa_names_table[i].priority > priority) ++ priority = isa_names_table[i].priority; ++ ++ break; ++ } ++ } ++ if (predicate_list && priority == P_ZERO) ++ { ++ error_at (DECL_SOURCE_LOCATION (decl), ++ "ISA %qs is not supported in %<target()%> attribute, " ++ "use %<arch=%> syntax", token); ++ return 0; ++ } ++ token = strtok (NULL, ","); ++ } ++ free (tok_str); ++ ++ if (predicate_list && predicate_chain == NULL_TREE) ++ { ++ error_at (DECL_SOURCE_LOCATION (decl), ++ "no dispatcher found for the versioning attributes: %s", ++ attrs_str); ++ return 0; ++ } ++ else if (predicate_list) ++ { ++ predicate_chain = nreverse (predicate_chain); ++ *predicate_list = predicate_chain; ++ } ++ ++ return priority; ++} ++ ++/* This builds the processor_model struct type defined in ++ libgcc/config/i386/cpuinfo.c */ ++ ++static tree ++build_processor_model_struct (void) ++{ ++ const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype", ++ "__cpu_features"}; ++ tree field = NULL_TREE, field_chain = NULL_TREE; ++ int i; ++ tree type = make_node (RECORD_TYPE); ++ ++ /* The first 3 fields are unsigned int. */ ++ for (i = 0; i < 3; ++i) ++ { ++ field = build_decl (UNKNOWN_LOCATION, FIELD_DECL, ++ get_identifier (field_name[i]), unsigned_type_node); ++ if (field_chain != NULL_TREE) ++ DECL_CHAIN (field) = field_chain; ++ field_chain = field; ++ } ++ ++ /* The last field is an array of unsigned integers of size one. */ ++ field = build_decl (UNKNOWN_LOCATION, FIELD_DECL, ++ get_identifier (field_name[3]), ++ build_array_type (unsigned_type_node, ++ build_index_type (size_one_node))); ++ if (field_chain != NULL_TREE) ++ DECL_CHAIN (field) = field_chain; ++ field_chain = field; ++ ++ finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE); ++ return type; ++} ++ ++/* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. 
*/ ++ ++static tree ++make_var_decl (tree type, const char *name) ++{ ++ tree new_decl; ++ ++ new_decl = build_decl (UNKNOWN_LOCATION, ++ VAR_DECL, ++ get_identifier(name), ++ type); ++ ++ DECL_EXTERNAL (new_decl) = 1; ++ TREE_STATIC (new_decl) = 1; ++ TREE_PUBLIC (new_decl) = 1; ++ DECL_INITIAL (new_decl) = 0; ++ DECL_ARTIFICIAL (new_decl) = 0; ++ DECL_PRESERVE_P (new_decl) = 1; ++ ++ make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl)); ++ assemble_variable (new_decl, 0, 0, 0); ++ ++ return new_decl; ++} ++ ++/* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded ++ into an integer defined in libgcc/config/i386/cpuinfo.c */ ++ ++tree ++fold_builtin_cpu (tree fndecl, tree *args) ++{ ++ unsigned int i; ++ enum ix86_builtins fn_code ++ = (enum ix86_builtins) DECL_MD_FUNCTION_CODE (fndecl); ++ tree param_string_cst = NULL; ++ ++ tree __processor_model_type = build_processor_model_struct (); ++ tree __cpu_model_var = make_var_decl (__processor_model_type, ++ "__cpu_model"); ++ ++ ++ varpool_node::add (__cpu_model_var); ++ ++ gcc_assert ((args != NULL) && (*args != NULL)); ++ ++ param_string_cst = *args; ++ while (param_string_cst ++ && TREE_CODE (param_string_cst) != STRING_CST) ++ { ++ /* *args must be a expr that can contain other EXPRS leading to a ++ STRING_CST. */ ++ if (!EXPR_P (param_string_cst)) ++ { ++ error ("parameter to builtin must be a string constant or literal"); ++ return integer_zero_node; ++ } ++ param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0); ++ } ++ ++ gcc_assert (param_string_cst); ++ ++ if (fn_code == IX86_BUILTIN_CPU_IS) ++ { ++ tree ref; ++ tree field; ++ tree final; ++ ++ unsigned int field_val = 0; ++ unsigned int NUM_ARCH_NAMES ++ = sizeof (arch_names_table) / sizeof (struct _arch_names_table); ++ ++ for (i = 0; i < NUM_ARCH_NAMES; i++) ++ if (strcmp (arch_names_table[i].name, ++ TREE_STRING_POINTER (param_string_cst)) == 0) ++ break; ++ ++ if (i == NUM_ARCH_NAMES) ++ { ++ error ("parameter to builtin not valid: %s", ++ TREE_STRING_POINTER (param_string_cst)); ++ return integer_zero_node; ++ } ++ ++ field = TYPE_FIELDS (__processor_model_type); ++ field_val = arch_names_table[i].model; ++ ++ /* CPU types are stored in the next field. */ ++ if (field_val > M_CPU_TYPE_START ++ && field_val < M_CPU_SUBTYPE_START) ++ { ++ field = DECL_CHAIN (field); ++ field_val -= M_CPU_TYPE_START; ++ } ++ ++ /* CPU subtypes are stored in the next field. */ ++ if (field_val > M_CPU_SUBTYPE_START) ++ { ++ field = DECL_CHAIN ( DECL_CHAIN (field)); ++ field_val -= M_CPU_SUBTYPE_START; ++ } ++ ++ /* Get the appropriate field in __cpu_model. */ ++ ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var, ++ field, NULL_TREE); ++ ++ /* Check the value. 
*/ ++ final = build2 (EQ_EXPR, unsigned_type_node, ref, ++ build_int_cstu (unsigned_type_node, field_val)); ++ return build1 (CONVERT_EXPR, integer_type_node, final); ++ } ++ else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS) ++ { ++ tree ref; ++ tree array_elt; ++ tree field; ++ tree final; ++ ++ unsigned int field_val = 0; ++ unsigned int NUM_ISA_NAMES ++ = sizeof (isa_names_table) / sizeof (struct _isa_names_table); ++ ++ for (i = 0; i < NUM_ISA_NAMES; i++) ++ if (strcmp (isa_names_table[i].name, ++ TREE_STRING_POINTER (param_string_cst)) == 0) ++ break; ++ ++ if (i == NUM_ISA_NAMES) ++ { ++ error ("parameter to builtin not valid: %s", ++ TREE_STRING_POINTER (param_string_cst)); ++ return integer_zero_node; ++ } ++ ++ if (isa_names_table[i].feature >= 32) ++ { ++ tree __cpu_features2_var = make_var_decl (unsigned_type_node, ++ "__cpu_features2"); ++ ++ varpool_node::add (__cpu_features2_var); ++ field_val = (1U << (isa_names_table[i].feature - 32)); ++ /* Return __cpu_features2 & field_val */ ++ final = build2 (BIT_AND_EXPR, unsigned_type_node, ++ __cpu_features2_var, ++ build_int_cstu (unsigned_type_node, field_val)); ++ return build1 (CONVERT_EXPR, integer_type_node, final); ++ } ++ ++ field = TYPE_FIELDS (__processor_model_type); ++ /* Get the last field, which is __cpu_features. */ ++ while (DECL_CHAIN (field)) ++ field = DECL_CHAIN (field); ++ ++ /* Get the appropriate field: __cpu_model.__cpu_features */ ++ ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var, ++ field, NULL_TREE); ++ ++ /* Access the 0th element of __cpu_features array. */ ++ array_elt = build4 (ARRAY_REF, unsigned_type_node, ref, ++ integer_zero_node, NULL_TREE, NULL_TREE); ++ ++ field_val = (1U << isa_names_table[i].feature); ++ /* Return __cpu_model.__cpu_features[0] & field_val */ ++ final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt, ++ build_int_cstu (unsigned_type_node, field_val)); ++ return build1 (CONVERT_EXPR, integer_type_node, final); ++ } ++ gcc_unreachable (); ++} ++ ++#include "gt-i386-builtins.h" +diff --git a/gcc/config/i386/i386-builtins.h b/gcc/config/i386/i386-builtins.h +new file mode 100644 +index 000000000..c0264e5bf +--- /dev/null ++++ b/gcc/config/i386/i386-builtins.h +@@ -0,0 +1,330 @@ ++/* Copyright (C) 1988-2019 Free Software Foundation, Inc. ++ ++This file is part of GCC. ++ ++GCC is free software; you can redistribute it and/or modify ++it under the terms of the GNU General Public License as published by ++the Free Software Foundation; either version 3, or (at your option) ++any later version. ++ ++GCC is distributed in the hope that it will be useful, ++but WITHOUT ANY WARRANTY; without even the implied warranty of ++MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++GNU General Public License for more details. ++ ++You should have received a copy of the GNU General Public License ++along with GCC; see the file COPYING3. If not see ++<http://www.gnu.org/licenses/>. */ ++ ++#ifndef GCC_I386_BUILTINS_H ++#define GCC_I386_BUILTINS_H ++ ++/* The following file contains several enumerations and data structures ++ built from the definitions in i386-builtin-types.def. */ ++ ++#include "i386-builtin-types.inc" ++ ++/* Codes for all the SSE/MMX builtins. Builtins not mentioned in any ++ bdesc_* arrays below should come first, then builtins for each bdesc_* ++ array in ascending order, so that we can use direct array accesses. 
*/ ++enum ix86_builtins ++{ ++ IX86_BUILTIN_MASKMOVQ, ++ IX86_BUILTIN_LDMXCSR, ++ IX86_BUILTIN_STMXCSR, ++ IX86_BUILTIN_MASKMOVDQU, ++ IX86_BUILTIN_PSLLDQ128, ++ IX86_BUILTIN_CLFLUSH, ++ IX86_BUILTIN_MONITOR, ++ IX86_BUILTIN_MWAIT, ++ IX86_BUILTIN_UMONITOR, ++ IX86_BUILTIN_UMWAIT, ++ IX86_BUILTIN_TPAUSE, ++ IX86_BUILTIN_CLZERO, ++ IX86_BUILTIN_CLDEMOTE, ++ IX86_BUILTIN_VEC_INIT_V2SI, ++ IX86_BUILTIN_VEC_INIT_V4HI, ++ IX86_BUILTIN_VEC_INIT_V8QI, ++ IX86_BUILTIN_VEC_EXT_V2DF, ++ IX86_BUILTIN_VEC_EXT_V2DI, ++ IX86_BUILTIN_VEC_EXT_V4SF, ++ IX86_BUILTIN_VEC_EXT_V4SI, ++ IX86_BUILTIN_VEC_EXT_V8HI, ++ IX86_BUILTIN_VEC_EXT_V2SI, ++ IX86_BUILTIN_VEC_EXT_V4HI, ++ IX86_BUILTIN_VEC_EXT_V16QI, ++ IX86_BUILTIN_VEC_SET_V2DI, ++ IX86_BUILTIN_VEC_SET_V4SF, ++ IX86_BUILTIN_VEC_SET_V4SI, ++ IX86_BUILTIN_VEC_SET_V8HI, ++ IX86_BUILTIN_VEC_SET_V4HI, ++ IX86_BUILTIN_VEC_SET_V16QI, ++ IX86_BUILTIN_GATHERSIV2DF, ++ IX86_BUILTIN_GATHERSIV4DF, ++ IX86_BUILTIN_GATHERDIV2DF, ++ IX86_BUILTIN_GATHERDIV4DF, ++ IX86_BUILTIN_GATHERSIV4SF, ++ IX86_BUILTIN_GATHERSIV8SF, ++ IX86_BUILTIN_GATHERDIV4SF, ++ IX86_BUILTIN_GATHERDIV8SF, ++ IX86_BUILTIN_GATHERSIV2DI, ++ IX86_BUILTIN_GATHERSIV4DI, ++ IX86_BUILTIN_GATHERDIV2DI, ++ IX86_BUILTIN_GATHERDIV4DI, ++ IX86_BUILTIN_GATHERSIV4SI, ++ IX86_BUILTIN_GATHERSIV8SI, ++ IX86_BUILTIN_GATHERDIV4SI, ++ IX86_BUILTIN_GATHERDIV8SI, ++ IX86_BUILTIN_GATHER3SIV8SF, ++ IX86_BUILTIN_GATHER3SIV4SF, ++ IX86_BUILTIN_GATHER3SIV4DF, ++ IX86_BUILTIN_GATHER3SIV2DF, ++ IX86_BUILTIN_GATHER3DIV8SF, ++ IX86_BUILTIN_GATHER3DIV4SF, ++ IX86_BUILTIN_GATHER3DIV4DF, ++ IX86_BUILTIN_GATHER3DIV2DF, ++ IX86_BUILTIN_GATHER3SIV8SI, ++ IX86_BUILTIN_GATHER3SIV4SI, ++ IX86_BUILTIN_GATHER3SIV4DI, ++ IX86_BUILTIN_GATHER3SIV2DI, ++ IX86_BUILTIN_GATHER3DIV8SI, ++ IX86_BUILTIN_GATHER3DIV4SI, ++ IX86_BUILTIN_GATHER3DIV4DI, ++ IX86_BUILTIN_GATHER3DIV2DI, ++ IX86_BUILTIN_SCATTERSIV8SF, ++ IX86_BUILTIN_SCATTERSIV4SF, ++ IX86_BUILTIN_SCATTERSIV4DF, ++ IX86_BUILTIN_SCATTERSIV2DF, ++ IX86_BUILTIN_SCATTERDIV8SF, ++ IX86_BUILTIN_SCATTERDIV4SF, ++ IX86_BUILTIN_SCATTERDIV4DF, ++ IX86_BUILTIN_SCATTERDIV2DF, ++ IX86_BUILTIN_SCATTERSIV8SI, ++ IX86_BUILTIN_SCATTERSIV4SI, ++ IX86_BUILTIN_SCATTERSIV4DI, ++ IX86_BUILTIN_SCATTERSIV2DI, ++ IX86_BUILTIN_SCATTERDIV8SI, ++ IX86_BUILTIN_SCATTERDIV4SI, ++ IX86_BUILTIN_SCATTERDIV4DI, ++ IX86_BUILTIN_SCATTERDIV2DI, ++ /* Alternate 4 and 8 element gather/scatter for the vectorizer ++ where all operands are 32-byte or 64-byte wide respectively. 
*/ ++ IX86_BUILTIN_GATHERALTSIV4DF, ++ IX86_BUILTIN_GATHERALTDIV8SF, ++ IX86_BUILTIN_GATHERALTSIV4DI, ++ IX86_BUILTIN_GATHERALTDIV8SI, ++ IX86_BUILTIN_GATHER3ALTDIV16SF, ++ IX86_BUILTIN_GATHER3ALTDIV16SI, ++ IX86_BUILTIN_GATHER3ALTSIV4DF, ++ IX86_BUILTIN_GATHER3ALTDIV8SF, ++ IX86_BUILTIN_GATHER3ALTSIV4DI, ++ IX86_BUILTIN_GATHER3ALTDIV8SI, ++ IX86_BUILTIN_GATHER3ALTSIV8DF, ++ IX86_BUILTIN_GATHER3ALTSIV8DI, ++ IX86_BUILTIN_GATHER3DIV16SF, ++ IX86_BUILTIN_GATHER3DIV16SI, ++ IX86_BUILTIN_GATHER3DIV8DF, ++ IX86_BUILTIN_GATHER3DIV8DI, ++ IX86_BUILTIN_GATHER3SIV16SF, ++ IX86_BUILTIN_GATHER3SIV16SI, ++ IX86_BUILTIN_GATHER3SIV8DF, ++ IX86_BUILTIN_GATHER3SIV8DI, ++ IX86_BUILTIN_SCATTERALTSIV8DF, ++ IX86_BUILTIN_SCATTERALTDIV16SF, ++ IX86_BUILTIN_SCATTERALTSIV8DI, ++ IX86_BUILTIN_SCATTERALTDIV16SI, ++ IX86_BUILTIN_SCATTERALTSIV4DF, ++ IX86_BUILTIN_SCATTERALTDIV8SF, ++ IX86_BUILTIN_SCATTERALTSIV4DI, ++ IX86_BUILTIN_SCATTERALTDIV8SI, ++ IX86_BUILTIN_SCATTERALTSIV2DF, ++ IX86_BUILTIN_SCATTERALTDIV4SF, ++ IX86_BUILTIN_SCATTERALTSIV2DI, ++ IX86_BUILTIN_SCATTERALTDIV4SI, ++ IX86_BUILTIN_SCATTERDIV16SF, ++ IX86_BUILTIN_SCATTERDIV16SI, ++ IX86_BUILTIN_SCATTERDIV8DF, ++ IX86_BUILTIN_SCATTERDIV8DI, ++ IX86_BUILTIN_SCATTERSIV16SF, ++ IX86_BUILTIN_SCATTERSIV16SI, ++ IX86_BUILTIN_SCATTERSIV8DF, ++ IX86_BUILTIN_SCATTERSIV8DI, ++ IX86_BUILTIN_GATHERPFQPD, ++ IX86_BUILTIN_GATHERPFDPS, ++ IX86_BUILTIN_GATHERPFDPD, ++ IX86_BUILTIN_GATHERPFQPS, ++ IX86_BUILTIN_SCATTERPFDPD, ++ IX86_BUILTIN_SCATTERPFDPS, ++ IX86_BUILTIN_SCATTERPFQPD, ++ IX86_BUILTIN_SCATTERPFQPS, ++ IX86_BUILTIN_CLWB, ++ IX86_BUILTIN_CLFLUSHOPT, ++ IX86_BUILTIN_INFQ, ++ IX86_BUILTIN_HUGE_VALQ, ++ IX86_BUILTIN_NANQ, ++ IX86_BUILTIN_NANSQ, ++ IX86_BUILTIN_XABORT, ++ IX86_BUILTIN_ADDCARRYX32, ++ IX86_BUILTIN_ADDCARRYX64, ++ IX86_BUILTIN_SBB32, ++ IX86_BUILTIN_SBB64, ++ IX86_BUILTIN_RDRAND16_STEP, ++ IX86_BUILTIN_RDRAND32_STEP, ++ IX86_BUILTIN_RDRAND64_STEP, ++ IX86_BUILTIN_RDSEED16_STEP, ++ IX86_BUILTIN_RDSEED32_STEP, ++ IX86_BUILTIN_RDSEED64_STEP, ++ IX86_BUILTIN_MONITORX, ++ IX86_BUILTIN_MWAITX, ++ IX86_BUILTIN_CFSTRING, ++ IX86_BUILTIN_CPU_INIT, ++ IX86_BUILTIN_CPU_IS, ++ IX86_BUILTIN_CPU_SUPPORTS, ++ IX86_BUILTIN_READ_FLAGS, ++ IX86_BUILTIN_WRITE_FLAGS, ++ ++ /* All the remaining builtins are tracked in bdesc_* arrays in ++ i386-builtin.def. Don't add any IX86_BUILTIN_* enumerators after ++ this point. */ ++#define BDESC(mask, mask2, icode, name, code, comparison, flag) \ ++ code, ++#define BDESC_FIRST(kind, kindu, mask, mask2, icode, name, code, comparison, flag) \ ++ code, \ ++ IX86_BUILTIN__BDESC_##kindu##_FIRST = code, ++#define BDESC_END(kind, next_kind) ++ ++#include "i386-builtin.def" ++ ++#undef BDESC ++#undef BDESC_FIRST ++#undef BDESC_END ++ ++ IX86_BUILTIN_MAX, ++ ++ IX86_BUILTIN__BDESC_MAX_FIRST = IX86_BUILTIN_MAX, ++ ++ /* Now just the aliases for bdesc_* start/end. */ ++#define BDESC(mask, mask2, icode, name, code, comparison, flag) ++#define BDESC_FIRST(kind, kindu, mask, mask2, icode, name, code, comparison, flag) ++#define BDESC_END(kind, next_kind) \ ++ IX86_BUILTIN__BDESC_##kind##_LAST \ ++ = IX86_BUILTIN__BDESC_##next_kind##_FIRST - 1, ++ ++#include "i386-builtin.def" ++ ++#undef BDESC ++#undef BDESC_FIRST ++#undef BDESC_END ++ ++ /* Just to make sure there is no comma after the last enumerator. */ ++ IX86_BUILTIN__BDESC_MAX_LAST = IX86_BUILTIN__BDESC_MAX_FIRST ++}; ++ ++/* Table of all of the builtin functions that are possible with different ISA's ++ but are waiting to be built until a function is declared to use that ++ ISA. 
*/ ++struct builtin_isa { ++ HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */ ++ HOST_WIDE_INT isa2; /* additional isa_flags this builtin is defined for */ ++ const char *name; /* function name */ ++ enum ix86_builtin_func_type tcode; /* type to use in the declaration */ ++ unsigned char const_p:1; /* true if the declaration is constant */ ++ unsigned char pure_p:1; /* true if the declaration has pure attribute */ ++ bool set_and_not_built_p; ++}; ++ ++/* Bits for builtin_description.flag. */ ++ ++/* Set when we don't support the comparison natively, and should ++ swap_comparison in order to support it. */ ++#define BUILTIN_DESC_SWAP_OPERANDS 1 ++ ++struct builtin_description ++{ ++ const HOST_WIDE_INT mask; ++ const HOST_WIDE_INT mask2; ++ const enum insn_code icode; ++ const char *const name; ++ const enum ix86_builtins code; ++ const enum rtx_code comparison; ++ const int flag; ++}; ++ ++#define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT ++#define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT ++#define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT ++#define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT ++#define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF ++#define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF ++#define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF ++#define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF ++#define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI ++#define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI ++#define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI ++#define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI ++#define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI ++#define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI ++#define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI ++#define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI ++#define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI ++#define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI ++#define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF ++#define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF ++#define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI ++#define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI ++#define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI ++#define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI ++#define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI ++#define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI ++#define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI ++#define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI ++#define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP ++#define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP ++#define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP ++#define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP ++#define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF ++#define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF ++#define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF ++#define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF ++#define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF ++#define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF ++#define MULTI_ARG_1_SF V4SF_FTYPE_V4SF ++#define MULTI_ARG_1_DF V2DF_FTYPE_V2DF ++#define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF ++#define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF ++#define MULTI_ARG_1_DI V2DI_FTYPE_V2DI ++#define MULTI_ARG_1_SI V4SI_FTYPE_V4SI ++#define MULTI_ARG_1_HI V8HI_FTYPE_V8HI ++#define MULTI_ARG_1_QI V16QI_FTYPE_V16QI ++#define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI ++#define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI ++#define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI ++#define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI ++#define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI ++#define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI ++ ++#define BDESC(mask, mask2, icode, name, 
code, comparison, flag) \ ++ { mask, mask2, icode, name, code, comparison, flag }, ++#define BDESC_FIRST(kind, kindu, mask, mask2, icode, name, code, comparison, flag) \ ++static const struct builtin_description bdesc_##kind[] = \ ++{ \ ++ BDESC (mask, mask2, icode, name, code, comparison, flag) ++#define BDESC_END(kind, next_kind) \ ++}; ++ ++#include "i386-builtin.def" ++ ++extern builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX]; ++ ++tree ix86_builtin_vectorized_function (unsigned int fn, tree type_out, ++ tree type_in); ++void ix86_init_builtins (void); ++tree ix86_vectorize_builtin_gather (const_tree mem_vectype, ++ const_tree index_type, int scale); ++tree ix86_builtin_decl (unsigned code, bool); ++tree ix86_builtin_reciprocal (tree fndecl); ++unsigned int get_builtin_code_for_version (tree decl, tree *predicate_list); ++tree fold_builtin_cpu (tree fndecl, tree *args); ++tree get_ix86_builtin (enum ix86_builtins c); ++ ++#endif /* GCC_I386_BUILTINS_H */ +diff --git a/gcc/config/i386/i386-c.c b/gcc/config/i386/i386-c.c +index 5e7e46fce..50cac3b1a 100644 +--- a/gcc/config/i386/i386-c.c ++++ b/gcc/config/i386/i386-c.c +@@ -586,8 +586,9 @@ ix86_pragma_target_parse (tree args, tree pop_target) + } + else + { +- cur_tree = ix86_valid_target_attribute_tree (args, &global_options, +- &global_options_set); ++ cur_tree = ix86_valid_target_attribute_tree (NULL_TREE, args, ++ &global_options, ++ &global_options_set, 0); + if (!cur_tree || cur_tree == error_mark_node) + { + cl_target_option_restore (&global_options, +diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c +new file mode 100644 +index 000000000..01f38b9ea +--- /dev/null ++++ b/gcc/config/i386/i386-expand.c +@@ -0,0 +1,19842 @@ ++/* Copyright (C) 1988-2019 Free Software Foundation, Inc. ++ ++This file is part of GCC. ++ ++GCC is free software; you can redistribute it and/or modify ++it under the terms of the GNU General Public License as published by ++the Free Software Foundation; either version 3, or (at your option) ++any later version. ++ ++GCC is distributed in the hope that it will be useful, ++but WITHOUT ANY WARRANTY; without even the implied warranty of ++MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++GNU General Public License for more details. ++ ++You should have received a copy of the GNU General Public License ++along with GCC; see the file COPYING3. If not see ++<http://www.gnu.org/licenses/>. 
*/ ++ ++#define IN_TARGET_CODE 1 ++ ++#include "config.h" ++#include "system.h" ++#include "coretypes.h" ++#include "backend.h" ++#include "rtl.h" ++#include "tree.h" ++#include "memmodel.h" ++#include "gimple.h" ++#include "cfghooks.h" ++#include "cfgloop.h" ++#include "df.h" ++#include "tm_p.h" ++#include "stringpool.h" ++#include "expmed.h" ++#include "optabs.h" ++#include "regs.h" ++#include "emit-rtl.h" ++#include "recog.h" ++#include "cgraph.h" ++#include "diagnostic.h" ++#include "cfgbuild.h" ++#include "alias.h" ++#include "fold-const.h" ++#include "attribs.h" ++#include "calls.h" ++#include "stor-layout.h" ++#include "varasm.h" ++#include "output.h" ++#include "insn-attr.h" ++#include "flags.h" ++#include "except.h" ++#include "explow.h" ++#include "expr.h" ++#include "cfgrtl.h" ++#include "common/common-target.h" ++#include "langhooks.h" ++#include "reload.h" ++#include "gimplify.h" ++#include "dwarf2.h" ++#include "tm-constrs.h" ++#include "params.h" ++#include "cselib.h" ++#include "sched-int.h" ++#include "opts.h" ++#include "tree-pass.h" ++#include "context.h" ++#include "pass_manager.h" ++#include "target-globals.h" ++#include "gimple-iterator.h" ++#include "tree-vectorizer.h" ++#include "shrink-wrap.h" ++#include "builtins.h" ++#include "rtl-iter.h" ++#include "tree-iterator.h" ++#include "dbgcnt.h" ++#include "case-cfn-macros.h" ++#include "dojump.h" ++#include "fold-const-call.h" ++#include "tree-vrp.h" ++#include "tree-ssanames.h" ++#include "selftest.h" ++#include "selftest-rtl.h" ++#include "print-rtl.h" ++#include "intl.h" ++#include "ifcvt.h" ++#include "symbol-summary.h" ++#include "ipa-prop.h" ++#include "ipa-fnsummary.h" ++#include "wide-int-bitmask.h" ++#include "tree-vector-builder.h" ++#include "debug.h" ++#include "dwarf2out.h" ++#include "i386-options.h" ++#include "i386-builtins.h" ++#include "i386-expand.h" ++ ++/* Split one or more double-mode RTL references into pairs of half-mode ++ references. The RTL can be REG, offsettable MEM, integer constant, or ++ CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to ++ split and "num" is its length. lo_half and hi_half are output arrays ++ that parallel "operands". */ ++ ++void ++split_double_mode (machine_mode mode, rtx operands[], ++ int num, rtx lo_half[], rtx hi_half[]) ++{ ++ machine_mode half_mode; ++ unsigned int byte; ++ ++ switch (mode) ++ { ++ case E_TImode: ++ half_mode = DImode; ++ break; ++ case E_DImode: ++ half_mode = SImode; ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ ++ byte = GET_MODE_SIZE (half_mode); ++ ++ while (num--) ++ { ++ rtx op = operands[num]; ++ ++ /* simplify_subreg refuse to split volatile memory addresses, ++ but we still have to handle it. */ ++ if (MEM_P (op)) ++ { ++ lo_half[num] = adjust_address (op, half_mode, 0); ++ hi_half[num] = adjust_address (op, half_mode, byte); ++ } ++ else ++ { ++ lo_half[num] = simplify_gen_subreg (half_mode, op, ++ GET_MODE (op) == VOIDmode ++ ? mode : GET_MODE (op), 0); ++ hi_half[num] = simplify_gen_subreg (half_mode, op, ++ GET_MODE (op) == VOIDmode ++ ? mode : GET_MODE (op), byte); ++ } ++ } ++} ++ ++/* Generate either "mov $0, reg" or "xor reg, reg", as appropriate ++ for the target. */ ++ ++void ++ix86_expand_clear (rtx dest) ++{ ++ rtx tmp; ++ ++ /* We play register width games, which are only valid after reload. */ ++ gcc_assert (reload_completed); ++ ++ /* Avoid HImode and its attendant prefix byte. 
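++ For example, clearing %ax with "xorw %ax, %ax" needs a 0x66 operand-size prefix, while the equivalent "xorl %eax, %eax" does not, so sub-SImode destinations are widened to SImode here.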
*/ ++ if (GET_MODE_SIZE (GET_MODE (dest)) < 4) ++ dest = gen_rtx_REG (SImode, REGNO (dest)); ++ tmp = gen_rtx_SET (dest, const0_rtx); ++ ++ if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ()) ++ { ++ rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); ++ tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob)); ++ } ++ ++ emit_insn (tmp); ++} ++ ++void ++ix86_expand_move (machine_mode mode, rtx operands[]) ++{ ++ rtx op0, op1; ++ rtx tmp, addend = NULL_RTX; ++ enum tls_model model; ++ ++ op0 = operands[0]; ++ op1 = operands[1]; ++ ++ switch (GET_CODE (op1)) ++ { ++ case CONST: ++ tmp = XEXP (op1, 0); ++ ++ if (GET_CODE (tmp) != PLUS ++ || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF) ++ break; ++ ++ op1 = XEXP (tmp, 0); ++ addend = XEXP (tmp, 1); ++ /* FALLTHRU */ ++ ++ case SYMBOL_REF: ++ model = SYMBOL_REF_TLS_MODEL (op1); ++ ++ if (model) ++ op1 = legitimize_tls_address (op1, model, true); ++ else if (ix86_force_load_from_GOT_p (op1)) ++ { ++ /* Load the external function address via GOT slot to avoid PLT. */ ++ op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1), ++ (TARGET_64BIT ++ ? UNSPEC_GOTPCREL ++ : UNSPEC_GOT)); ++ op1 = gen_rtx_CONST (Pmode, op1); ++ op1 = gen_const_mem (Pmode, op1); ++ set_mem_alias_set (op1, ix86_GOT_alias_set ()); ++ } ++ else ++ { ++ tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX); ++ if (tmp) ++ { ++ op1 = tmp; ++ if (!addend) ++ break; ++ } ++ else ++ { ++ op1 = operands[1]; ++ break; ++ } ++ } ++ ++ if (addend) ++ { ++ op1 = force_operand (op1, NULL_RTX); ++ op1 = expand_simple_binop (Pmode, PLUS, op1, addend, ++ op0, 1, OPTAB_DIRECT); ++ } ++ else ++ op1 = force_operand (op1, op0); ++ ++ if (op1 == op0) ++ return; ++ ++ op1 = convert_to_mode (mode, op1, 1); ++ ++ default: ++ break; ++ } ++ ++ if ((flag_pic || MACHOPIC_INDIRECT) ++ && symbolic_operand (op1, mode)) ++ { ++ if (TARGET_MACHO && !TARGET_64BIT) ++ { ++#if TARGET_MACHO ++ /* dynamic-no-pic */ ++ if (MACHOPIC_INDIRECT) ++ { ++ rtx temp = (op0 && REG_P (op0) && mode == Pmode) ++ ? op0 : gen_reg_rtx (Pmode); ++ op1 = machopic_indirect_data_reference (op1, temp); ++ if (MACHOPIC_PURE) ++ op1 = machopic_legitimize_pic_address (op1, mode, ++ temp == op1 ? 0 : temp); ++ } ++ if (op0 != op1 && GET_CODE (op0) != MEM) ++ { ++ rtx insn = gen_rtx_SET (op0, op1); ++ emit_insn (insn); ++ return; ++ } ++ if (GET_CODE (op0) == MEM) ++ op1 = force_reg (Pmode, op1); ++ else ++ { ++ rtx temp = op0; ++ if (GET_CODE (temp) != REG) ++ temp = gen_reg_rtx (Pmode); ++ temp = legitimize_pic_address (op1, temp); ++ if (temp == op0) ++ return; ++ op1 = temp; ++ } ++ /* dynamic-no-pic */ ++#endif ++ } ++ else ++ { ++ if (MEM_P (op0)) ++ op1 = force_reg (mode, op1); ++ else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode))) ++ { ++ rtx reg = can_create_pseudo_p () ? NULL_RTX : op0; ++ op1 = legitimize_pic_address (op1, reg); ++ if (op0 == op1) ++ return; ++ op1 = convert_to_mode (mode, op1, 1); ++ } ++ } ++ } ++ else ++ { ++ if (MEM_P (op0) ++ && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode) ++ || !push_operand (op0, mode)) ++ && MEM_P (op1)) ++ op1 = force_reg (mode, op1); ++ ++ if (push_operand (op0, mode) ++ && ! general_no_elim_operand (op1, mode)) ++ op1 = copy_to_mode_reg (mode, op1); ++ ++ /* Force large constants in 64bit compilation into register ++ to get them CSEed. 
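++ This applies to DImode immediates that do not fit a zero-extended 32-bit encoding; keeping such a constant in a register lets CSE reuse a single load instead of repeating the constant at every use.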
*/ ++ if (can_create_pseudo_p () ++ && (mode == DImode) && TARGET_64BIT ++ && immediate_operand (op1, mode) ++ && !x86_64_zext_immediate_operand (op1, VOIDmode) ++ && !register_operand (op0, mode) ++ && optimize) ++ op1 = copy_to_mode_reg (mode, op1); ++ ++ if (can_create_pseudo_p () ++ && CONST_DOUBLE_P (op1)) ++ { ++ /* If we are loading a floating point constant to a register, ++ force the value to memory now, since we'll get better code ++ out the back end. */ ++ ++ op1 = validize_mem (force_const_mem (mode, op1)); ++ if (!register_operand (op0, mode)) ++ { ++ rtx temp = gen_reg_rtx (mode); ++ emit_insn (gen_rtx_SET (temp, op1)); ++ emit_move_insn (op0, temp); ++ return; ++ } ++ } ++ } ++ ++ emit_insn (gen_rtx_SET (op0, op1)); ++} ++ ++void ++ix86_expand_vector_move (machine_mode mode, rtx operands[]) ++{ ++ rtx op0 = operands[0], op1 = operands[1]; ++ /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU ++ psABI since the biggest alignment is 4 byte for IA MCU psABI. */ ++ unsigned int align = (TARGET_IAMCU ++ ? GET_MODE_BITSIZE (mode) ++ : GET_MODE_ALIGNMENT (mode)); ++ ++ if (push_operand (op0, VOIDmode)) ++ op0 = emit_move_resolve_push (mode, op0); ++ ++ /* Force constants other than zero into memory. We do not know how ++ the instructions used to build constants modify the upper 64 bits ++ of the register, once we have that information we may be able ++ to handle some of them more efficiently. */ ++ if (can_create_pseudo_p () ++ && (CONSTANT_P (op1) ++ || (SUBREG_P (op1) ++ && CONSTANT_P (SUBREG_REG (op1)))) ++ && ((register_operand (op0, mode) ++ && !standard_sse_constant_p (op1, mode)) ++ /* ix86_expand_vector_move_misalign() does not like constants. */ ++ || (SSE_REG_MODE_P (mode) ++ && MEM_P (op0) ++ && MEM_ALIGN (op0) < align))) ++ { ++ if (SUBREG_P (op1)) ++ { ++ machine_mode imode = GET_MODE (SUBREG_REG (op1)); ++ rtx r = force_const_mem (imode, SUBREG_REG (op1)); ++ if (r) ++ r = validize_mem (r); ++ else ++ r = force_reg (imode, SUBREG_REG (op1)); ++ op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1)); ++ } ++ else ++ op1 = validize_mem (force_const_mem (mode, op1)); ++ } ++ ++ /* We need to check memory alignment for SSE mode since attribute ++ can make operands unaligned. */ ++ if (can_create_pseudo_p () ++ && SSE_REG_MODE_P (mode) ++ && ((MEM_P (op0) && (MEM_ALIGN (op0) < align)) ++ || (MEM_P (op1) && (MEM_ALIGN (op1) < align)))) ++ { ++ rtx tmp[2]; ++ ++ /* ix86_expand_vector_move_misalign() does not like both ++ arguments in memory. */ ++ if (!register_operand (op0, mode) ++ && !register_operand (op1, mode)) ++ op1 = force_reg (mode, op1); ++ ++ tmp[0] = op0; tmp[1] = op1; ++ ix86_expand_vector_move_misalign (mode, tmp); ++ return; ++ } ++ ++ /* Make operand1 a register if it isn't already. */ ++ if (can_create_pseudo_p () ++ && !register_operand (op0, mode) ++ && !register_operand (op1, mode)) ++ { ++ emit_move_insn (op0, force_reg (GET_MODE (op0), op1)); ++ return; ++ } ++ ++ emit_insn (gen_rtx_SET (op0, op1)); ++} ++ ++/* Split 32-byte AVX unaligned load and store if needed. 
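++ Whether to split is governed by the TARGET_AVX256_SPLIT_UNALIGNED_LOAD and TARGET_AVX256_SPLIT_UNALIGNED_STORE tuning flags; on some microarchitectures two 16-byte accesses are faster than one unaligned 32-byte access.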
*/ ++ ++static void ++ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1) ++{ ++ rtx m; ++ rtx (*extract) (rtx, rtx, rtx); ++ machine_mode mode; ++ ++ if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD) ++ || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE)) ++ { ++ emit_insn (gen_rtx_SET (op0, op1)); ++ return; ++ } ++ ++ rtx orig_op0 = NULL_RTX; ++ mode = GET_MODE (op0); ++ switch (GET_MODE_CLASS (mode)) ++ { ++ case MODE_VECTOR_INT: ++ case MODE_INT: ++ if (mode != V32QImode) ++ { ++ if (!MEM_P (op0)) ++ { ++ orig_op0 = op0; ++ op0 = gen_reg_rtx (V32QImode); ++ } ++ else ++ op0 = gen_lowpart (V32QImode, op0); ++ op1 = gen_lowpart (V32QImode, op1); ++ mode = V32QImode; ++ } ++ break; ++ case MODE_VECTOR_FLOAT: ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ ++ switch (mode) ++ { ++ default: ++ gcc_unreachable (); ++ case E_V32QImode: ++ extract = gen_avx_vextractf128v32qi; ++ mode = V16QImode; ++ break; ++ case E_V8SFmode: ++ extract = gen_avx_vextractf128v8sf; ++ mode = V4SFmode; ++ break; ++ case E_V4DFmode: ++ extract = gen_avx_vextractf128v4df; ++ mode = V2DFmode; ++ break; ++ } ++ ++ if (MEM_P (op1)) ++ { ++ rtx r = gen_reg_rtx (mode); ++ m = adjust_address (op1, mode, 0); ++ emit_move_insn (r, m); ++ m = adjust_address (op1, mode, 16); ++ r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m); ++ emit_move_insn (op0, r); ++ } ++ else if (MEM_P (op0)) ++ { ++ m = adjust_address (op0, mode, 0); ++ emit_insn (extract (m, op1, const0_rtx)); ++ m = adjust_address (op0, mode, 16); ++ emit_insn (extract (m, copy_rtx (op1), const1_rtx)); ++ } ++ else ++ gcc_unreachable (); ++ ++ if (orig_op0) ++ emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0)); ++} ++ ++/* Implement the movmisalign patterns for SSE. Non-SSE modes go ++ straight to ix86_expand_vector_move. */ ++/* Code generation for scalar reg-reg moves of single and double precision data: ++ if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true) ++ movaps reg, reg ++ else ++ movss reg, reg ++ if (x86_sse_partial_reg_dependency == true) ++ movapd reg, reg ++ else ++ movsd reg, reg ++ ++ Code generation for scalar loads of double precision data: ++ if (x86_sse_split_regs == true) ++ movlpd mem, reg (gas syntax) ++ else ++ movsd mem, reg ++ ++ Code generation for unaligned packed loads of single precision data ++ (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency): ++ if (x86_sse_unaligned_move_optimal) ++ movups mem, reg ++ ++ if (x86_sse_partial_reg_dependency == true) ++ { ++ xorps reg, reg ++ movlps mem, reg ++ movhps mem+8, reg ++ } ++ else ++ { ++ movlps mem, reg ++ movhps mem+8, reg ++ } ++ ++ Code generation for unaligned packed loads of double precision data ++ (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs): ++ if (x86_sse_unaligned_move_optimal) ++ movupd mem, reg ++ ++ if (x86_sse_split_regs == true) ++ { ++ movlpd mem, reg ++ movhpd mem+8, reg ++ } ++ else ++ { ++ movsd mem, reg ++ movhpd mem+8, reg ++ } ++ */ ++ ++void ++ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[]) ++{ ++ rtx op0, op1, m; ++ ++ op0 = operands[0]; ++ op1 = operands[1]; ++ ++ /* Use unaligned load/store for AVX512 or when optimizing for size. */ ++ if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ()) ++ { ++ emit_insn (gen_rtx_SET (op0, op1)); ++ return; ++ } ++ ++ if (TARGET_AVX) ++ { ++ if (GET_MODE_SIZE (mode) == 32) ++ ix86_avx256_split_vector_move_misalign (op0, op1); ++ else ++ /* Always use 128-bit mov_internal pattern for AVX. 
*/ ++ emit_insn (gen_rtx_SET (op0, op1)); ++ return; ++ } ++ ++ if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL ++ || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) ++ { ++ emit_insn (gen_rtx_SET (op0, op1)); ++ return; ++ } ++ ++ /* ??? If we have typed data, then it would appear that using ++ movdqu is the only way to get unaligned data loaded with ++ integer type. */ ++ if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT) ++ { ++ emit_insn (gen_rtx_SET (op0, op1)); ++ return; ++ } ++ ++ if (MEM_P (op1)) ++ { ++ if (TARGET_SSE2 && mode == V2DFmode) ++ { ++ rtx zero; ++ ++ /* When SSE registers are split into halves, we can avoid ++ writing to the top half twice. */ ++ if (TARGET_SSE_SPLIT_REGS) ++ { ++ emit_clobber (op0); ++ zero = op0; ++ } ++ else ++ { ++ /* ??? Not sure about the best option for the Intel chips. ++ The following would seem to satisfy; the register is ++ entirely cleared, breaking the dependency chain. We ++ then store to the upper half, with a dependency depth ++ of one. A rumor has it that Intel recommends two movsd ++ followed by an unpacklpd, but this is unconfirmed. And ++ given that the dependency depth of the unpacklpd would ++ still be one, I'm not sure why this would be better. */ ++ zero = CONST0_RTX (V2DFmode); ++ } ++ ++ m = adjust_address (op1, DFmode, 0); ++ emit_insn (gen_sse2_loadlpd (op0, zero, m)); ++ m = adjust_address (op1, DFmode, 8); ++ emit_insn (gen_sse2_loadhpd (op0, op0, m)); ++ } ++ else ++ { ++ rtx t; ++ ++ if (mode != V4SFmode) ++ t = gen_reg_rtx (V4SFmode); ++ else ++ t = op0; ++ ++ if (TARGET_SSE_PARTIAL_REG_DEPENDENCY) ++ emit_move_insn (t, CONST0_RTX (V4SFmode)); ++ else ++ emit_clobber (t); ++ ++ m = adjust_address (op1, V2SFmode, 0); ++ emit_insn (gen_sse_loadlps (t, t, m)); ++ m = adjust_address (op1, V2SFmode, 8); ++ emit_insn (gen_sse_loadhps (t, t, m)); ++ if (mode != V4SFmode) ++ emit_move_insn (op0, gen_lowpart (mode, t)); ++ } ++ } ++ else if (MEM_P (op0)) ++ { ++ if (TARGET_SSE2 && mode == V2DFmode) ++ { ++ m = adjust_address (op0, DFmode, 0); ++ emit_insn (gen_sse2_storelpd (m, op1)); ++ m = adjust_address (op0, DFmode, 8); ++ emit_insn (gen_sse2_storehpd (m, op1)); ++ } ++ else ++ { ++ if (mode != V4SFmode) ++ op1 = gen_lowpart (V4SFmode, op1); ++ ++ m = adjust_address (op0, V2SFmode, 0); ++ emit_insn (gen_sse_storelps (m, op1)); ++ m = adjust_address (op0, V2SFmode, 8); ++ emit_insn (gen_sse_storehps (m, copy_rtx (op1))); ++ } ++ } ++ else ++ gcc_unreachable (); ++} ++ ++/* Helper function of ix86_fixup_binary_operands to canonicalize ++ operand order. Returns true if the operands should be swapped. */ ++ ++static bool ++ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode, ++ rtx operands[]) ++{ ++ rtx dst = operands[0]; ++ rtx src1 = operands[1]; ++ rtx src2 = operands[2]; ++ ++ /* If the operation is not commutative, we can't do anything. */ ++ if (GET_RTX_CLASS (code) != RTX_COMM_ARITH ++ && GET_RTX_CLASS (code) != RTX_COMM_COMPARE) ++ return false; ++ ++ /* Highest priority is that src1 should match dst. */ ++ if (rtx_equal_p (dst, src1)) ++ return false; ++ if (rtx_equal_p (dst, src2)) ++ return true; ++ ++ /* Next highest priority is that immediate constants come second. */ ++ if (immediate_operand (src2, mode)) ++ return false; ++ if (immediate_operand (src1, mode)) ++ return true; ++ ++ /* Lowest priority is that memory references should come second. 
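++ Keeping the memory operand in the second slot leaves the first operand free to be tied to the destination, matching the register-destination, memory-source instruction forms.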
*/ ++ if (MEM_P (src2)) ++ return false; ++ if (MEM_P (src1)) ++ return true; ++ ++ return false; ++} ++ ++ ++/* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the ++ destination to use for the operation. If different from the true ++ destination in operands[0], a copy operation will be required. */ ++ ++rtx ++ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode, ++ rtx operands[]) ++{ ++ rtx dst = operands[0]; ++ rtx src1 = operands[1]; ++ rtx src2 = operands[2]; ++ ++ /* Canonicalize operand order. */ ++ if (ix86_swap_binary_operands_p (code, mode, operands)) ++ { ++ /* It is invalid to swap operands of different modes. */ ++ gcc_assert (GET_MODE (src1) == GET_MODE (src2)); ++ ++ std::swap (src1, src2); ++ } ++ ++ /* Both source operands cannot be in memory. */ ++ if (MEM_P (src1) && MEM_P (src2)) ++ { ++ /* Optimization: Only read from memory once. */ ++ if (rtx_equal_p (src1, src2)) ++ { ++ src2 = force_reg (mode, src2); ++ src1 = src2; ++ } ++ else if (rtx_equal_p (dst, src1)) ++ src2 = force_reg (mode, src2); ++ else ++ src1 = force_reg (mode, src1); ++ } ++ ++ /* If the destination is memory, and we do not have matching source ++ operands, do things in registers. */ ++ if (MEM_P (dst) && !rtx_equal_p (dst, src1)) ++ dst = gen_reg_rtx (mode); ++ ++ /* Source 1 cannot be a constant. */ ++ if (CONSTANT_P (src1)) ++ src1 = force_reg (mode, src1); ++ ++ /* Source 1 cannot be a non-matching memory. */ ++ if (MEM_P (src1) && !rtx_equal_p (dst, src1)) ++ src1 = force_reg (mode, src1); ++ ++ /* Improve address combine. */ ++ if (code == PLUS ++ && GET_MODE_CLASS (mode) == MODE_INT ++ && MEM_P (src2)) ++ src2 = force_reg (mode, src2); ++ ++ operands[1] = src1; ++ operands[2] = src2; ++ return dst; ++} ++ ++/* Similarly, but assume that the destination has already been ++ set up properly. */ ++ ++void ++ix86_fixup_binary_operands_no_copy (enum rtx_code code, ++ machine_mode mode, rtx operands[]) ++{ ++ rtx dst = ix86_fixup_binary_operands (code, mode, operands); ++ gcc_assert (dst == operands[0]); ++} ++ ++/* Attempt to expand a binary operator. Make the expansion closer to the ++ actual machine, then just general_operand, which will allow 3 separate ++ memory references (one output, two input) in a single insn. */ ++ ++void ++ix86_expand_binary_operator (enum rtx_code code, machine_mode mode, ++ rtx operands[]) ++{ ++ rtx src1, src2, dst, op, clob; ++ ++ dst = ix86_fixup_binary_operands (code, mode, operands); ++ src1 = operands[1]; ++ src2 = operands[2]; ++ ++ /* Emit the instruction. */ ++ ++ op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2)); ++ ++ if (reload_completed ++ && code == PLUS ++ && !rtx_equal_p (dst, src1)) ++ { ++ /* This is going to be an LEA; avoid splitting it later. */ ++ emit_insn (op); ++ } ++ else ++ { ++ clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); ++ emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob))); ++ } ++ ++ /* Fix up the destination if needed. */ ++ if (dst != operands[0]) ++ emit_move_insn (operands[0], dst); ++} ++ ++/* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with ++ the given OPERANDS. 
*/ ++ ++void ++ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode, ++ rtx operands[]) ++{ ++ rtx op1 = NULL_RTX, op2 = NULL_RTX; ++ if (SUBREG_P (operands[1])) ++ { ++ op1 = operands[1]; ++ op2 = operands[2]; ++ } ++ else if (SUBREG_P (operands[2])) ++ { ++ op1 = operands[2]; ++ op2 = operands[1]; ++ } ++ /* Optimize (__m128i) d | (__m128i) e and similar code ++ when d and e are float vectors into float vector logical ++ insn. In C/C++ without using intrinsics there is no other way ++ to express vector logical operation on float vectors than ++ to cast them temporarily to integer vectors. */ ++ if (op1 ++ && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL ++ && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR) ++ && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT ++ && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode) ++ && SUBREG_BYTE (op1) == 0 ++ && (GET_CODE (op2) == CONST_VECTOR ++ || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2)) ++ && SUBREG_BYTE (op2) == 0)) ++ && can_create_pseudo_p ()) ++ { ++ rtx dst; ++ switch (GET_MODE (SUBREG_REG (op1))) ++ { ++ case E_V4SFmode: ++ case E_V8SFmode: ++ case E_V16SFmode: ++ case E_V2DFmode: ++ case E_V4DFmode: ++ case E_V8DFmode: ++ dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1))); ++ if (GET_CODE (op2) == CONST_VECTOR) ++ { ++ op2 = gen_lowpart (GET_MODE (dst), op2); ++ op2 = force_reg (GET_MODE (dst), op2); ++ } ++ else ++ { ++ op1 = operands[1]; ++ op2 = SUBREG_REG (operands[2]); ++ if (!vector_operand (op2, GET_MODE (dst))) ++ op2 = force_reg (GET_MODE (dst), op2); ++ } ++ op1 = SUBREG_REG (op1); ++ if (!vector_operand (op1, GET_MODE (dst))) ++ op1 = force_reg (GET_MODE (dst), op1); ++ emit_insn (gen_rtx_SET (dst, ++ gen_rtx_fmt_ee (code, GET_MODE (dst), ++ op1, op2))); ++ emit_move_insn (operands[0], gen_lowpart (mode, dst)); ++ return; ++ default: ++ break; ++ } ++ } ++ if (!vector_operand (operands[1], mode)) ++ operands[1] = force_reg (mode, operands[1]); ++ if (!vector_operand (operands[2], mode)) ++ operands[2] = force_reg (mode, operands[2]); ++ ix86_fixup_binary_operands_no_copy (code, mode, operands); ++ emit_insn (gen_rtx_SET (operands[0], ++ gen_rtx_fmt_ee (code, mode, operands[1], ++ operands[2]))); ++} ++ ++/* Return TRUE or FALSE depending on whether the binary operator meets the ++ appropriate constraints. */ ++ ++bool ++ix86_binary_operator_ok (enum rtx_code code, machine_mode mode, ++ rtx operands[3]) ++{ ++ rtx dst = operands[0]; ++ rtx src1 = operands[1]; ++ rtx src2 = operands[2]; ++ ++ /* Both source operands cannot be in memory. */ ++ if (MEM_P (src1) && MEM_P (src2)) ++ return false; ++ ++ /* Canonicalize operand order for commutative operators. */ ++ if (ix86_swap_binary_operands_p (code, mode, operands)) ++ std::swap (src1, src2); ++ ++ /* If the destination is memory, we must have a matching source operand. */ ++ if (MEM_P (dst) && !rtx_equal_p (dst, src1)) ++ return false; ++ ++ /* Source 1 cannot be a constant. */ ++ if (CONSTANT_P (src1)) ++ return false; ++ ++ /* Source 1 cannot be a non-matching memory. */ ++ if (MEM_P (src1) && !rtx_equal_p (dst, src1)) ++ /* Support "andhi/andsi/anddi" as a zero-extending move. */ ++ return (code == AND ++ && (mode == HImode ++ || mode == SImode ++ || (TARGET_64BIT && mode == DImode)) ++ && satisfies_constraint_L (src2)); ++ ++ return true; ++} ++ ++/* Attempt to expand a unary operator. 
Make the expansion closer to the ++ actual machine, then just general_operand, which will allow 2 separate ++ memory references (one output, one input) in a single insn. */ ++ ++void ++ix86_expand_unary_operator (enum rtx_code code, machine_mode mode, ++ rtx operands[]) ++{ ++ bool matching_memory = false; ++ rtx src, dst, op, clob; ++ ++ dst = operands[0]; ++ src = operands[1]; ++ ++ /* If the destination is memory, and we do not have matching source ++ operands, do things in registers. */ ++ if (MEM_P (dst)) ++ { ++ if (rtx_equal_p (dst, src)) ++ matching_memory = true; ++ else ++ dst = gen_reg_rtx (mode); ++ } ++ ++ /* When source operand is memory, destination must match. */ ++ if (MEM_P (src) && !matching_memory) ++ src = force_reg (mode, src); ++ ++ /* Emit the instruction. */ ++ ++ op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src)); ++ ++ if (code == NOT) ++ emit_insn (op); ++ else ++ { ++ clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); ++ emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob))); ++ } ++ ++ /* Fix up the destination if needed. */ ++ if (dst != operands[0]) ++ emit_move_insn (operands[0], dst); ++} ++ ++/* Predict just emitted jump instruction to be taken with probability PROB. */ ++ ++static void ++predict_jump (int prob) ++{ ++ rtx_insn *insn = get_last_insn (); ++ gcc_assert (JUMP_P (insn)); ++ add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob)); ++} ++ ++/* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and ++ divisor are within the range [0-255]. */ ++ ++void ++ix86_split_idivmod (machine_mode mode, rtx operands[], ++ bool signed_p) ++{ ++ rtx_code_label *end_label, *qimode_label; ++ rtx div, mod; ++ rtx_insn *insn; ++ rtx scratch, tmp0, tmp1, tmp2; ++ rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx); ++ rtx (*gen_zero_extend) (rtx, rtx); ++ rtx (*gen_test_ccno_1) (rtx, rtx); ++ ++ switch (mode) ++ { ++ case E_SImode: ++ if (GET_MODE (operands[0]) == SImode) ++ { ++ if (GET_MODE (operands[1]) == SImode) ++ gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1; ++ else ++ gen_divmod4_1 ++ = signed_p ? gen_divmodsi4_zext_2 : gen_udivmodsi4_zext_2; ++ gen_zero_extend = gen_zero_extendqisi2; ++ } ++ else ++ { ++ gen_divmod4_1 ++ = signed_p ? gen_divmodsi4_zext_1 : gen_udivmodsi4_zext_1; ++ gen_zero_extend = gen_zero_extendqidi2; ++ } ++ gen_test_ccno_1 = gen_testsi_ccno_1; ++ break; ++ case E_DImode: ++ gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1; ++ gen_test_ccno_1 = gen_testdi_ccno_1; ++ gen_zero_extend = gen_zero_extendqidi2; ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ ++ end_label = gen_label_rtx (); ++ qimode_label = gen_label_rtx (); ++ ++ scratch = gen_reg_rtx (mode); ++ ++ /* Use 8bit unsigned divimod if dividend and divisor are within ++ the range [0-255]. */ ++ emit_move_insn (scratch, operands[2]); ++ scratch = expand_simple_binop (mode, IOR, scratch, operands[3], ++ scratch, 1, OPTAB_DIRECT); ++ emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100))); ++ tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG); ++ tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx); ++ tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0, ++ gen_rtx_LABEL_REF (VOIDmode, qimode_label), ++ pc_rtx); ++ insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0)); ++ predict_jump (REG_BR_PROB_BASE * 50 / 100); ++ JUMP_LABEL (insn) = qimode_label; ++ ++ /* Generate original signed/unsigned divimod. 
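++ This is the fall-through path: the test above found bits set beyond the low eight in the dividend or the divisor, so a full-width divide is required.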
*/ ++ div = gen_divmod4_1 (operands[0], operands[1], ++ operands[2], operands[3]); ++ emit_insn (div); ++ ++ /* Branch to the end. */ ++ emit_jump_insn (gen_jump (end_label)); ++ emit_barrier (); ++ ++ /* Generate 8bit unsigned divide. */ ++ emit_label (qimode_label); ++ /* Don't use operands[0] for result of 8bit divide since not all ++ registers support QImode ZERO_EXTRACT. */ ++ tmp0 = lowpart_subreg (HImode, scratch, mode); ++ tmp1 = lowpart_subreg (HImode, operands[2], mode); ++ tmp2 = lowpart_subreg (QImode, operands[3], mode); ++ emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2)); ++ ++ if (signed_p) ++ { ++ div = gen_rtx_DIV (mode, operands[2], operands[3]); ++ mod = gen_rtx_MOD (mode, operands[2], operands[3]); ++ } ++ else ++ { ++ div = gen_rtx_UDIV (mode, operands[2], operands[3]); ++ mod = gen_rtx_UMOD (mode, operands[2], operands[3]); ++ } ++ if (mode == SImode) ++ { ++ if (GET_MODE (operands[0]) != SImode) ++ div = gen_rtx_ZERO_EXTEND (DImode, div); ++ if (GET_MODE (operands[1]) != SImode) ++ mod = gen_rtx_ZERO_EXTEND (DImode, mod); ++ } ++ ++ /* Extract remainder from AH. */ ++ tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]), ++ tmp0, GEN_INT (8), GEN_INT (8)); ++ if (REG_P (operands[1])) ++ insn = emit_move_insn (operands[1], tmp1); ++ else ++ { ++ /* Need a new scratch register since the old one has result ++ of 8bit divide. */ ++ scratch = gen_reg_rtx (GET_MODE (operands[1])); ++ emit_move_insn (scratch, tmp1); ++ insn = emit_move_insn (operands[1], scratch); ++ } ++ set_unique_reg_note (insn, REG_EQUAL, mod); ++ ++ /* Zero extend quotient from AL. */ ++ tmp1 = gen_lowpart (QImode, tmp0); ++ insn = emit_insn (gen_zero_extend (operands[0], tmp1)); ++ set_unique_reg_note (insn, REG_EQUAL, div); ++ ++ emit_label (end_label); ++} ++ ++/* Emit x86 binary operand CODE in mode MODE, where the first operand ++ matches destination. RTX includes clobber of FLAGS_REG. */ ++ ++void ++ix86_emit_binop (enum rtx_code code, machine_mode mode, ++ rtx dst, rtx src) ++{ ++ rtx op, clob; ++ ++ op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src)); ++ clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); ++ ++ emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob))); ++} ++ ++/* Return true if regno1 def is nearest to the insn. */ ++ ++static bool ++find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2) ++{ ++ rtx_insn *prev = insn; ++ rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn)); ++ ++ if (insn == start) ++ return false; ++ while (prev && prev != start) ++ { ++ if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev)) ++ { ++ prev = PREV_INSN (prev); ++ continue; ++ } ++ if (insn_defines_reg (regno1, INVALID_REGNUM, prev)) ++ return true; ++ else if (insn_defines_reg (regno2, INVALID_REGNUM, prev)) ++ return false; ++ prev = PREV_INSN (prev); ++ } ++ ++ /* None of the regs is defined in the bb. */ ++ return false; ++} ++ ++/* Split lea instructions into a sequence of instructions ++ which are executed on ALU to avoid AGU stalls. ++ It is assumed that it is allowed to clobber flags register ++ at lea position. 
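++ For example, "leal 8(%ebx,%ecx,4), %eax" can be rewritten as "movl %ecx, %eax; sall $2, %eax; addl %ebx, %eax; addl $8, %eax".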
*/ ++ ++void ++ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode) ++{ ++ unsigned int regno0, regno1, regno2; ++ struct ix86_address parts; ++ rtx target, tmp; ++ int ok, adds; ++ ++ ok = ix86_decompose_address (operands[1], &parts); ++ gcc_assert (ok); ++ ++ target = gen_lowpart (mode, operands[0]); ++ ++ regno0 = true_regnum (target); ++ regno1 = INVALID_REGNUM; ++ regno2 = INVALID_REGNUM; ++ ++ if (parts.base) ++ { ++ parts.base = gen_lowpart (mode, parts.base); ++ regno1 = true_regnum (parts.base); ++ } ++ ++ if (parts.index) ++ { ++ parts.index = gen_lowpart (mode, parts.index); ++ regno2 = true_regnum (parts.index); ++ } ++ ++ if (parts.disp) ++ parts.disp = gen_lowpart (mode, parts.disp); ++ ++ if (parts.scale > 1) ++ { ++ /* Case r1 = r1 + ... */ ++ if (regno1 == regno0) ++ { ++ /* If we have a case r1 = r1 + C * r2 then we ++ should use multiplication which is very ++ expensive. Assume cost model is wrong if we ++ have such case here. */ ++ gcc_assert (regno2 != regno0); ++ ++ for (adds = parts.scale; adds > 0; adds--) ++ ix86_emit_binop (PLUS, mode, target, parts.index); ++ } ++ else ++ { ++ /* r1 = r2 + r3 * C case. Need to move r3 into r1. */ ++ if (regno0 != regno2) ++ emit_insn (gen_rtx_SET (target, parts.index)); ++ ++ /* Use shift for scaling. */ ++ ix86_emit_binop (ASHIFT, mode, target, ++ GEN_INT (exact_log2 (parts.scale))); ++ ++ if (parts.base) ++ ix86_emit_binop (PLUS, mode, target, parts.base); ++ ++ if (parts.disp && parts.disp != const0_rtx) ++ ix86_emit_binop (PLUS, mode, target, parts.disp); ++ } ++ } ++ else if (!parts.base && !parts.index) ++ { ++ gcc_assert(parts.disp); ++ emit_insn (gen_rtx_SET (target, parts.disp)); ++ } ++ else ++ { ++ if (!parts.base) ++ { ++ if (regno0 != regno2) ++ emit_insn (gen_rtx_SET (target, parts.index)); ++ } ++ else if (!parts.index) ++ { ++ if (regno0 != regno1) ++ emit_insn (gen_rtx_SET (target, parts.base)); ++ } ++ else ++ { ++ if (regno0 == regno1) ++ tmp = parts.index; ++ else if (regno0 == regno2) ++ tmp = parts.base; ++ else ++ { ++ rtx tmp1; ++ ++ /* Find better operand for SET instruction, depending ++ on which definition is farther from the insn. */ ++ if (find_nearest_reg_def (insn, regno1, regno2)) ++ tmp = parts.index, tmp1 = parts.base; ++ else ++ tmp = parts.base, tmp1 = parts.index; ++ ++ emit_insn (gen_rtx_SET (target, tmp)); ++ ++ if (parts.disp && parts.disp != const0_rtx) ++ ix86_emit_binop (PLUS, mode, target, parts.disp); ++ ++ ix86_emit_binop (PLUS, mode, target, tmp1); ++ return; ++ } ++ ++ ix86_emit_binop (PLUS, mode, target, tmp); ++ } ++ ++ if (parts.disp && parts.disp != const0_rtx) ++ ix86_emit_binop (PLUS, mode, target, parts.disp); ++ } ++} ++ ++/* Post-reload splitter for converting an SF or DFmode value in an ++ SSE register into an unsigned SImode. */ ++ ++void ++ix86_split_convert_uns_si_sse (rtx operands[]) ++{ ++ machine_mode vecmode; ++ rtx value, large, zero_or_two31, input, two31, x; ++ ++ large = operands[1]; ++ zero_or_two31 = operands[2]; ++ input = operands[3]; ++ two31 = operands[4]; ++ vecmode = GET_MODE (large); ++ value = gen_rtx_REG (vecmode, REGNO (operands[0])); ++ ++ /* Load up the value into the low element. We must ensure that the other ++ elements are valid floats -- zero is the easiest such value. 
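++ Garbage bits in the upper lanes could otherwise be NaNs or denormals and perturb the packed compare and subtract below.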
*/ ++ if (MEM_P (input)) ++ { ++ if (vecmode == V4SFmode) ++ emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input)); ++ else ++ emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input)); ++ } ++ else ++ { ++ input = gen_rtx_REG (vecmode, REGNO (input)); ++ emit_move_insn (value, CONST0_RTX (vecmode)); ++ if (vecmode == V4SFmode) ++ emit_insn (gen_sse_movss (value, value, input)); ++ else ++ emit_insn (gen_sse2_movsd (value, value, input)); ++ } ++ ++ emit_move_insn (large, two31); ++ emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31); ++ ++ x = gen_rtx_fmt_ee (LE, vecmode, large, value); ++ emit_insn (gen_rtx_SET (large, x)); ++ ++ x = gen_rtx_AND (vecmode, zero_or_two31, large); ++ emit_insn (gen_rtx_SET (zero_or_two31, x)); ++ ++ x = gen_rtx_MINUS (vecmode, value, zero_or_two31); ++ emit_insn (gen_rtx_SET (value, x)); ++ ++ large = gen_rtx_REG (V4SImode, REGNO (large)); ++ emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31))); ++ ++ x = gen_rtx_REG (V4SImode, REGNO (value)); ++ if (vecmode == V4SFmode) ++ emit_insn (gen_fix_truncv4sfv4si2 (x, value)); ++ else ++ emit_insn (gen_sse2_cvttpd2dq (x, value)); ++ value = x; ++ ++ emit_insn (gen_xorv4si3 (value, value, large)); ++} ++ ++static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok, ++ machine_mode mode, rtx target, ++ rtx var, int one_var); ++ ++/* Convert an unsigned DImode value into a DFmode, using only SSE. ++ Expects the 64-bit DImode to be supplied in a pair of integral ++ registers. Requires SSE2; will use SSE3 if available. For x86_32, ++ -mfpmath=sse, !optimize_size only. */ ++ ++void ++ix86_expand_convert_uns_didf_sse (rtx target, rtx input) ++{ ++ REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt; ++ rtx int_xmm, fp_xmm; ++ rtx biases, exponents; ++ rtx x; ++ ++ int_xmm = gen_reg_rtx (V4SImode); ++ if (TARGET_INTER_UNIT_MOVES_TO_VEC) ++ emit_insn (gen_movdi_to_sse (int_xmm, input)); ++ else if (TARGET_SSE_SPLIT_REGS) ++ { ++ emit_clobber (int_xmm); ++ emit_move_insn (gen_lowpart (DImode, int_xmm), input); ++ } ++ else ++ { ++ x = gen_reg_rtx (V2DImode); ++ ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0); ++ emit_move_insn (int_xmm, gen_lowpart (V4SImode, x)); ++ } ++ ++ x = gen_rtx_CONST_VECTOR (V4SImode, ++ gen_rtvec (4, GEN_INT (0x43300000UL), ++ GEN_INT (0x45300000UL), ++ const0_rtx, const0_rtx)); ++ exponents = validize_mem (force_const_mem (V4SImode, x)); ++ ++ /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */ ++ emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents)); ++ ++ /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm) ++ yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)). ++ Similarly (0x45300000UL ## fp_value_hi_xmm) yields ++ (0x1.0p84 + double(fp_value_hi_xmm)). ++ Note these exponents differ by 32. */ ++ ++ fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm)); ++ ++ /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values ++ in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */ ++ real_ldexp (&bias_lo_rvt, &dconst1, 52); ++ real_ldexp (&bias_hi_rvt, &dconst1, 84); ++ biases = const_double_from_real_value (bias_lo_rvt, DFmode); ++ x = const_double_from_real_value (bias_hi_rvt, DFmode); ++ biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x)); ++ biases = validize_mem (force_const_mem (V2DFmode, biases)); ++ emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases)); ++ ++ /* Add the upper and lower DFmode values together. 
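++ With SSE3 a single haddv2df does the horizontal add; otherwise the high element is duplicated with unpckhpd and added to a saved copy of the vector, leaving lo + hi in element 0.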
*/ ++ if (TARGET_SSE3) ++ emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm)); ++ else ++ { ++ x = copy_to_mode_reg (V2DFmode, fp_xmm); ++ emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm)); ++ emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x)); ++ } ++ ++ ix86_expand_vector_extract (false, target, fp_xmm, 0); ++} ++ ++/* Not used, but eases macroization of patterns. */ ++void ++ix86_expand_convert_uns_sixf_sse (rtx, rtx) ++{ ++ gcc_unreachable (); ++} ++ ++/* Convert an unsigned SImode value into a DFmode. Only currently used ++ for SSE, but applicable anywhere. */ ++ ++void ++ix86_expand_convert_uns_sidf_sse (rtx target, rtx input) ++{ ++ REAL_VALUE_TYPE TWO31r; ++ rtx x, fp; ++ ++ x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1), ++ NULL, 1, OPTAB_DIRECT); ++ ++ fp = gen_reg_rtx (DFmode); ++ emit_insn (gen_floatsidf2 (fp, x)); ++ ++ real_ldexp (&TWO31r, &dconst1, 31); ++ x = const_double_from_real_value (TWO31r, DFmode); ++ ++ x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT); ++ if (x != target) ++ emit_move_insn (target, x); ++} ++ ++/* Convert a signed DImode value into a DFmode. Only used for SSE in ++ 32-bit mode; otherwise we have a direct convert instruction. */ ++ ++void ++ix86_expand_convert_sign_didf_sse (rtx target, rtx input) ++{ ++ REAL_VALUE_TYPE TWO32r; ++ rtx fp_lo, fp_hi, x; ++ ++ fp_lo = gen_reg_rtx (DFmode); ++ fp_hi = gen_reg_rtx (DFmode); ++ ++ emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input))); ++ ++ real_ldexp (&TWO32r, &dconst1, 32); ++ x = const_double_from_real_value (TWO32r, DFmode); ++ fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT); ++ ++ ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input)); ++ ++ x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target, ++ 0, OPTAB_DIRECT); ++ if (x != target) ++ emit_move_insn (target, x); ++} ++ ++/* Convert an unsigned SImode value into a SFmode, using only SSE. ++ For x86_32, -mfpmath=sse, !optimize_size only. */ ++void ++ix86_expand_convert_uns_sisf_sse (rtx target, rtx input) ++{ ++ REAL_VALUE_TYPE ONE16r; ++ rtx fp_hi, fp_lo, int_hi, int_lo, x; ++ ++ real_ldexp (&ONE16r, &dconst1, 16); ++ x = const_double_from_real_value (ONE16r, SFmode); ++ int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff), ++ NULL, 0, OPTAB_DIRECT); ++ int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16), ++ NULL, 0, OPTAB_DIRECT); ++ fp_hi = gen_reg_rtx (SFmode); ++ fp_lo = gen_reg_rtx (SFmode); ++ emit_insn (gen_floatsisf2 (fp_hi, int_hi)); ++ emit_insn (gen_floatsisf2 (fp_lo, int_lo)); ++ fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi, ++ 0, OPTAB_DIRECT); ++ fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target, ++ 0, OPTAB_DIRECT); ++ if (!rtx_equal_p (target, fp_hi)) ++ emit_move_insn (target, fp_hi); ++} ++ ++/* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert ++ a vector of unsigned ints VAL to vector of floats TARGET. 
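++ Each element is split into its low and high 16-bit halves, both halves are converted with the signed cvtdq2ps (they are in [0, 0xffff], so the sign does not matter), and the result is recombined as lo + hi * 0x1p16.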
*/ ++ ++void ++ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val) ++{ ++ rtx tmp[8]; ++ REAL_VALUE_TYPE TWO16r; ++ machine_mode intmode = GET_MODE (val); ++ machine_mode fltmode = GET_MODE (target); ++ rtx (*cvt) (rtx, rtx); ++ ++ if (intmode == V4SImode) ++ cvt = gen_floatv4siv4sf2; ++ else ++ cvt = gen_floatv8siv8sf2; ++ tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff)); ++ tmp[0] = force_reg (intmode, tmp[0]); ++ tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1, ++ OPTAB_DIRECT); ++ tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16), ++ NULL_RTX, 1, OPTAB_DIRECT); ++ tmp[3] = gen_reg_rtx (fltmode); ++ emit_insn (cvt (tmp[3], tmp[1])); ++ tmp[4] = gen_reg_rtx (fltmode); ++ emit_insn (cvt (tmp[4], tmp[2])); ++ real_ldexp (&TWO16r, &dconst1, 16); ++ tmp[5] = const_double_from_real_value (TWO16r, SFmode); ++ tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5])); ++ tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1, ++ OPTAB_DIRECT); ++ tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1, ++ OPTAB_DIRECT); ++ if (tmp[7] != target) ++ emit_move_insn (target, tmp[7]); ++} ++ ++/* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc* ++ pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*. ++ This is done by doing just signed conversion if < 0x1p31, and otherwise by ++ subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */ ++ ++rtx ++ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp) ++{ ++ REAL_VALUE_TYPE TWO31r; ++ rtx two31r, tmp[4]; ++ machine_mode mode = GET_MODE (val); ++ machine_mode scalarmode = GET_MODE_INNER (mode); ++ machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode; ++ rtx (*cmp) (rtx, rtx, rtx, rtx); ++ int i; ++ ++ for (i = 0; i < 3; i++) ++ tmp[i] = gen_reg_rtx (mode); ++ real_ldexp (&TWO31r, &dconst1, 31); ++ two31r = const_double_from_real_value (TWO31r, scalarmode); ++ two31r = ix86_build_const_vector (mode, 1, two31r); ++ two31r = force_reg (mode, two31r); ++ switch (mode) ++ { ++ case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break; ++ case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break; ++ case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break; ++ case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break; ++ default: gcc_unreachable (); ++ } ++ tmp[3] = gen_rtx_LE (mode, two31r, val); ++ emit_insn (cmp (tmp[0], two31r, val, tmp[3])); ++ tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1], ++ 0, OPTAB_DIRECT); ++ if (intmode == V4SImode || TARGET_AVX2) ++ *xorp = expand_simple_binop (intmode, ASHIFT, ++ gen_lowpart (intmode, tmp[0]), ++ GEN_INT (31), NULL_RTX, 0, ++ OPTAB_DIRECT); ++ else ++ { ++ rtx two31 = GEN_INT (HOST_WIDE_INT_1U << 31); ++ two31 = ix86_build_const_vector (intmode, 1, two31); ++ *xorp = expand_simple_binop (intmode, AND, ++ gen_lowpart (intmode, tmp[0]), ++ two31, NULL_RTX, 0, ++ OPTAB_DIRECT); ++ } ++ return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2], ++ 0, OPTAB_DIRECT); ++} ++ ++/* Generate code for floating point ABS or NEG. 
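++ With SSE both reduce to bit operations on the sign bit: NEG xors with a sign-bit mask, ABS ands with the inverted mask; ix86_build_signbit_mask supplies the constant.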
*/ ++ ++void ++ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode, ++ rtx operands[]) ++{ ++ rtx mask, set, dst, src; ++ bool use_sse = false; ++ bool vector_mode = VECTOR_MODE_P (mode); ++ machine_mode vmode = mode; ++ ++ if (vector_mode) ++ use_sse = true; ++ else if (mode == TFmode) ++ use_sse = true; ++ else if (TARGET_SSE_MATH) ++ { ++ use_sse = SSE_FLOAT_MODE_P (mode); ++ if (mode == SFmode) ++ vmode = V4SFmode; ++ else if (mode == DFmode) ++ vmode = V2DFmode; ++ } ++ ++ /* NEG and ABS performed with SSE use bitwise mask operations. ++ Create the appropriate mask now. */ ++ if (use_sse) ++ mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS); ++ else ++ mask = NULL_RTX; ++ ++ dst = operands[0]; ++ src = operands[1]; ++ ++ set = gen_rtx_fmt_e (code, mode, src); ++ set = gen_rtx_SET (dst, set); ++ ++ if (mask) ++ { ++ rtx use, clob; ++ rtvec par; ++ ++ use = gen_rtx_USE (VOIDmode, mask); ++ if (vector_mode) ++ par = gen_rtvec (2, set, use); ++ else ++ { ++ clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); ++ par = gen_rtvec (3, set, use, clob); ++ } ++ emit_insn (gen_rtx_PARALLEL (VOIDmode, par)); ++ } ++ else ++ emit_insn (set); ++} ++ ++/* Expand a copysign operation. Special case operand 0 being a constant. */ ++ ++void ++ix86_expand_copysign (rtx operands[]) ++{ ++ machine_mode mode, vmode; ++ rtx dest, op0, op1, mask, nmask; ++ ++ dest = operands[0]; ++ op0 = operands[1]; ++ op1 = operands[2]; ++ ++ mode = GET_MODE (dest); ++ ++ if (mode == SFmode) ++ vmode = V4SFmode; ++ else if (mode == DFmode) ++ vmode = V2DFmode; ++ else ++ vmode = mode; ++ ++ if (CONST_DOUBLE_P (op0)) ++ { ++ rtx (*copysign_insn)(rtx, rtx, rtx, rtx); ++ ++ if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0))) ++ op0 = simplify_unary_operation (ABS, mode, op0, mode); ++ ++ if (mode == SFmode || mode == DFmode) ++ { ++ if (op0 == CONST0_RTX (mode)) ++ op0 = CONST0_RTX (vmode); ++ else ++ { ++ rtx v = ix86_build_const_vector (vmode, false, op0); ++ ++ op0 = force_reg (vmode, v); ++ } ++ } ++ else if (op0 != CONST0_RTX (mode)) ++ op0 = force_reg (mode, op0); ++ ++ mask = ix86_build_signbit_mask (vmode, 0, 0); ++ ++ if (mode == SFmode) ++ copysign_insn = gen_copysignsf3_const; ++ else if (mode == DFmode) ++ copysign_insn = gen_copysigndf3_const; ++ else ++ copysign_insn = gen_copysigntf3_const; ++ ++ emit_insn (copysign_insn (dest, op0, op1, mask)); ++ } ++ else ++ { ++ rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx); ++ ++ nmask = ix86_build_signbit_mask (vmode, 0, 1); ++ mask = ix86_build_signbit_mask (vmode, 0, 0); ++ ++ if (mode == SFmode) ++ copysign_insn = gen_copysignsf3_var; ++ else if (mode == DFmode) ++ copysign_insn = gen_copysigndf3_var; ++ else ++ copysign_insn = gen_copysigntf3_var; ++ ++ emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask)); ++ } ++} ++ ++/* Deconstruct a copysign operation into bit masks. Operand 0 is known to ++ be a constant, and so has already been expanded into a vector constant. 
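++ The result is formed as (DEST & sign-bit mask) | op0: only the sign bit of the value already in DEST (the sign source) survives, and the constant magnitude is ORed in.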
*/ ++ ++void ++ix86_split_copysign_const (rtx operands[]) ++{ ++ machine_mode mode, vmode; ++ rtx dest, op0, mask, x; ++ ++ dest = operands[0]; ++ op0 = operands[1]; ++ mask = operands[3]; ++ ++ mode = GET_MODE (dest); ++ vmode = GET_MODE (mask); ++ ++ dest = lowpart_subreg (vmode, dest, mode); ++ x = gen_rtx_AND (vmode, dest, mask); ++ emit_insn (gen_rtx_SET (dest, x)); ++ ++ if (op0 != CONST0_RTX (vmode)) ++ { ++ x = gen_rtx_IOR (vmode, dest, op0); ++ emit_insn (gen_rtx_SET (dest, x)); ++ } ++} ++ ++/* Deconstruct a copysign operation into bit masks. Operand 0 is variable, ++ so we have to do two masks. */ ++ ++void ++ix86_split_copysign_var (rtx operands[]) ++{ ++ machine_mode mode, vmode; ++ rtx dest, scratch, op0, op1, mask, nmask, x; ++ ++ dest = operands[0]; ++ scratch = operands[1]; ++ op0 = operands[2]; ++ op1 = operands[3]; ++ nmask = operands[4]; ++ mask = operands[5]; ++ ++ mode = GET_MODE (dest); ++ vmode = GET_MODE (mask); ++ ++ if (rtx_equal_p (op0, op1)) ++ { ++ /* Shouldn't happen often (it's useless, obviously), but when it does ++ we'd generate incorrect code if we continue below. */ ++ emit_move_insn (dest, op0); ++ return; ++ } ++ ++ if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */ ++ { ++ gcc_assert (REGNO (op1) == REGNO (scratch)); ++ ++ x = gen_rtx_AND (vmode, scratch, mask); ++ emit_insn (gen_rtx_SET (scratch, x)); ++ ++ dest = mask; ++ op0 = lowpart_subreg (vmode, op0, mode); ++ x = gen_rtx_NOT (vmode, dest); ++ x = gen_rtx_AND (vmode, x, op0); ++ emit_insn (gen_rtx_SET (dest, x)); ++ } ++ else ++ { ++ if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */ ++ { ++ x = gen_rtx_AND (vmode, scratch, mask); ++ } ++ else /* alternative 2,4 */ ++ { ++ gcc_assert (REGNO (mask) == REGNO (scratch)); ++ op1 = lowpart_subreg (vmode, op1, mode); ++ x = gen_rtx_AND (vmode, scratch, op1); ++ } ++ emit_insn (gen_rtx_SET (scratch, x)); ++ ++ if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */ ++ { ++ dest = lowpart_subreg (vmode, op0, mode); ++ x = gen_rtx_AND (vmode, dest, nmask); ++ } ++ else /* alternative 3,4 */ ++ { ++ gcc_assert (REGNO (nmask) == REGNO (dest)); ++ dest = nmask; ++ op0 = lowpart_subreg (vmode, op0, mode); ++ x = gen_rtx_AND (vmode, dest, op0); ++ } ++ emit_insn (gen_rtx_SET (dest, x)); ++ } ++ ++ x = gen_rtx_IOR (vmode, dest, scratch); ++ emit_insn (gen_rtx_SET (dest, x)); ++} ++ ++/* Expand an xorsign operation. */ ++ ++void ++ix86_expand_xorsign (rtx operands[]) ++{ ++ rtx (*xorsign_insn)(rtx, rtx, rtx, rtx); ++ machine_mode mode, vmode; ++ rtx dest, op0, op1, mask; ++ ++ dest = operands[0]; ++ op0 = operands[1]; ++ op1 = operands[2]; ++ ++ mode = GET_MODE (dest); ++ ++ if (mode == SFmode) ++ { ++ xorsign_insn = gen_xorsignsf3_1; ++ vmode = V4SFmode; ++ } ++ else if (mode == DFmode) ++ { ++ xorsign_insn = gen_xorsigndf3_1; ++ vmode = V2DFmode; ++ } ++ else ++ gcc_unreachable (); ++ ++ mask = ix86_build_signbit_mask (vmode, 0, 0); ++ ++ emit_insn (xorsign_insn (dest, op0, op1, mask)); ++} ++ ++/* Deconstruct an xorsign operation into bit masks. 
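++ The result is (DEST & sign-bit mask) ^ op0, i.e. op0 with its sign flipped when the sign source held in DEST is negative.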
*/ ++ ++void ++ix86_split_xorsign (rtx operands[]) ++{ ++ machine_mode mode, vmode; ++ rtx dest, op0, mask, x; ++ ++ dest = operands[0]; ++ op0 = operands[1]; ++ mask = operands[3]; ++ ++ mode = GET_MODE (dest); ++ vmode = GET_MODE (mask); ++ ++ dest = lowpart_subreg (vmode, dest, mode); ++ x = gen_rtx_AND (vmode, dest, mask); ++ emit_insn (gen_rtx_SET (dest, x)); ++ ++ op0 = lowpart_subreg (vmode, op0, mode); ++ x = gen_rtx_XOR (vmode, dest, op0); ++ emit_insn (gen_rtx_SET (dest, x)); ++} ++ ++static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1); ++ ++void ++ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label) ++{ ++ machine_mode mode = GET_MODE (op0); ++ rtx tmp; ++ ++ /* Handle special case - vector comparsion with boolean result, transform ++ it using ptest instruction. */ ++ if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) ++ { ++ rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG); ++ machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode; ++ ++ gcc_assert (code == EQ || code == NE); ++ /* Generate XOR since we can't check that one operand is zero vector. */ ++ tmp = gen_reg_rtx (mode); ++ emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1))); ++ tmp = gen_lowpart (p_mode, tmp); ++ emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG), ++ gen_rtx_UNSPEC (CCmode, ++ gen_rtvec (2, tmp, tmp), ++ UNSPEC_PTEST))); ++ tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx); ++ tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, ++ gen_rtx_LABEL_REF (VOIDmode, label), ++ pc_rtx); ++ emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); ++ return; ++ } ++ ++ switch (mode) ++ { ++ case E_SFmode: ++ case E_DFmode: ++ case E_XFmode: ++ case E_QImode: ++ case E_HImode: ++ case E_SImode: ++ simple: ++ tmp = ix86_expand_compare (code, op0, op1); ++ tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, ++ gen_rtx_LABEL_REF (VOIDmode, label), ++ pc_rtx); ++ emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); ++ return; ++ ++ case E_DImode: ++ if (TARGET_64BIT) ++ goto simple; ++ /* For 32-bit target DI comparison may be performed on ++ SSE registers. To allow this we should avoid split ++ to SI mode which is achieved by doing xor in DI mode ++ and then comparing with zero (which is recognized by ++ STV pass). We don't compare using xor when optimizing ++ for size. */ ++ if (!optimize_insn_for_size_p () ++ && TARGET_STV ++ && (code == EQ || code == NE)) ++ { ++ op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1)); ++ op1 = const0_rtx; ++ } ++ /* FALLTHRU */ ++ case E_TImode: ++ /* Expand DImode branch into multiple compare+branch. */ ++ { ++ rtx lo[2], hi[2]; ++ rtx_code_label *label2; ++ enum rtx_code code1, code2, code3; ++ machine_mode submode; ++ ++ if (CONSTANT_P (op0) && !CONSTANT_P (op1)) ++ { ++ std::swap (op0, op1); ++ code = swap_condition (code); ++ } ++ ++ split_double_mode (mode, &op0, 1, lo+0, hi+0); ++ split_double_mode (mode, &op1, 1, lo+1, hi+1); ++ ++ submode = mode == DImode ? SImode : DImode; ++ ++ /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to ++ avoid two branches. This costs one extra insn, so disable when ++ optimizing for size. 
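++ (Unless the high or low half of the second operand is a literal zero, in which case the corresponding xor disappears and the sequence is no bigger.)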
*/ ++ ++ if ((code == EQ || code == NE) ++ && (!optimize_insn_for_size_p () ++ || hi[1] == const0_rtx || lo[1] == const0_rtx)) ++ { ++ rtx xor0, xor1; ++ ++ xor1 = hi[0]; ++ if (hi[1] != const0_rtx) ++ xor1 = expand_binop (submode, xor_optab, xor1, hi[1], ++ NULL_RTX, 0, OPTAB_WIDEN); ++ ++ xor0 = lo[0]; ++ if (lo[1] != const0_rtx) ++ xor0 = expand_binop (submode, xor_optab, xor0, lo[1], ++ NULL_RTX, 0, OPTAB_WIDEN); ++ ++ tmp = expand_binop (submode, ior_optab, xor1, xor0, ++ NULL_RTX, 0, OPTAB_WIDEN); ++ ++ ix86_expand_branch (code, tmp, const0_rtx, label); ++ return; ++ } ++ ++ /* Otherwise, if we are doing less-than or greater-or-equal-than, ++ op1 is a constant and the low word is zero, then we can just ++ examine the high word. Similarly for low word -1 and ++ less-or-equal-than or greater-than. */ ++ ++ if (CONST_INT_P (hi[1])) ++ switch (code) ++ { ++ case LT: case LTU: case GE: case GEU: ++ if (lo[1] == const0_rtx) ++ { ++ ix86_expand_branch (code, hi[0], hi[1], label); ++ return; ++ } ++ break; ++ case LE: case LEU: case GT: case GTU: ++ if (lo[1] == constm1_rtx) ++ { ++ ix86_expand_branch (code, hi[0], hi[1], label); ++ return; ++ } ++ break; ++ default: ++ break; ++ } ++ ++ /* Emulate comparisons that do not depend on Zero flag with ++ double-word subtraction. Note that only Overflow, Sign ++ and Carry flags are valid, so swap arguments and condition ++ of comparisons that would otherwise test Zero flag. */ ++ ++ switch (code) ++ { ++ case LE: case LEU: case GT: case GTU: ++ std::swap (lo[0], lo[1]); ++ std::swap (hi[0], hi[1]); ++ code = swap_condition (code); ++ /* FALLTHRU */ ++ ++ case LT: case LTU: case GE: case GEU: ++ { ++ rtx (*cmp_insn) (rtx, rtx); ++ rtx (*sbb_insn) (rtx, rtx, rtx); ++ bool uns = (code == LTU || code == GEU); ++ ++ if (TARGET_64BIT) ++ { ++ cmp_insn = gen_cmpdi_1; ++ sbb_insn ++ = uns ? gen_subdi3_carry_ccc : gen_subdi3_carry_ccgz; ++ } ++ else ++ { ++ cmp_insn = gen_cmpsi_1; ++ sbb_insn ++ = uns ? gen_subsi3_carry_ccc : gen_subsi3_carry_ccgz; ++ } ++ ++ if (!nonimmediate_operand (lo[0], submode)) ++ lo[0] = force_reg (submode, lo[0]); ++ if (!x86_64_general_operand (lo[1], submode)) ++ lo[1] = force_reg (submode, lo[1]); ++ ++ if (!register_operand (hi[0], submode)) ++ hi[0] = force_reg (submode, hi[0]); ++ if ((uns && !nonimmediate_operand (hi[1], submode)) ++ || (!uns && !x86_64_general_operand (hi[1], submode))) ++ hi[1] = force_reg (submode, hi[1]); ++ ++ emit_insn (cmp_insn (lo[0], lo[1])); ++ emit_insn (sbb_insn (gen_rtx_SCRATCH (submode), hi[0], hi[1])); ++ ++ tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG); ++ ++ ix86_expand_branch (code, tmp, const0_rtx, label); ++ return; ++ } ++ ++ default: ++ break; ++ } ++ ++ /* Otherwise, we need two or three jumps. 
*/ ++ ++ label2 = gen_label_rtx (); ++ ++ code1 = code; ++ code2 = swap_condition (code); ++ code3 = unsigned_condition (code); ++ ++ switch (code) ++ { ++ case LT: case GT: case LTU: case GTU: ++ break; ++ ++ case LE: code1 = LT; code2 = GT; break; ++ case GE: code1 = GT; code2 = LT; break; ++ case LEU: code1 = LTU; code2 = GTU; break; ++ case GEU: code1 = GTU; code2 = LTU; break; ++ ++ case EQ: code1 = UNKNOWN; code2 = NE; break; ++ case NE: code2 = UNKNOWN; break; ++ ++ default: ++ gcc_unreachable (); ++ } ++ ++ /* ++ * a < b => ++ * if (hi(a) < hi(b)) goto true; ++ * if (hi(a) > hi(b)) goto false; ++ * if (lo(a) < lo(b)) goto true; ++ * false: ++ */ ++ ++ if (code1 != UNKNOWN) ++ ix86_expand_branch (code1, hi[0], hi[1], label); ++ if (code2 != UNKNOWN) ++ ix86_expand_branch (code2, hi[0], hi[1], label2); ++ ++ ix86_expand_branch (code3, lo[0], lo[1], label); ++ ++ if (code2 != UNKNOWN) ++ emit_label (label2); ++ return; ++ } ++ ++ default: ++ gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC); ++ goto simple; ++ } ++} ++ ++/* Figure out whether to use unordered fp comparisons. */ ++ ++static bool ++ix86_unordered_fp_compare (enum rtx_code code) ++{ ++ if (!TARGET_IEEE_FP) ++ return false; ++ ++ switch (code) ++ { ++ case GT: ++ case GE: ++ case LT: ++ case LE: ++ return false; ++ ++ case EQ: ++ case NE: ++ ++ case LTGT: ++ case UNORDERED: ++ case ORDERED: ++ case UNLT: ++ case UNLE: ++ case UNGT: ++ case UNGE: ++ case UNEQ: ++ return true; ++ ++ default: ++ gcc_unreachable (); ++ } ++} ++ ++/* Return a comparison we can do and that it is equivalent to ++ swap_condition (code) apart possibly from orderedness. ++ But, never change orderedness if TARGET_IEEE_FP, returning ++ UNKNOWN in that case if necessary. */ ++ ++static enum rtx_code ++ix86_fp_swap_condition (enum rtx_code code) ++{ ++ switch (code) ++ { ++ case GT: /* GTU - CF=0 & ZF=0 */ ++ return TARGET_IEEE_FP ? UNKNOWN : UNLT; ++ case GE: /* GEU - CF=0 */ ++ return TARGET_IEEE_FP ? UNKNOWN : UNLE; ++ case UNLT: /* LTU - CF=1 */ ++ return TARGET_IEEE_FP ? UNKNOWN : GT; ++ case UNLE: /* LEU - CF=1 | ZF=1 */ ++ return TARGET_IEEE_FP ? UNKNOWN : GE; ++ default: ++ return swap_condition (code); ++ } ++} ++ ++/* Return cost of comparison CODE using the best strategy for performance. ++ All following functions do use number of instructions as a cost metrics. ++ In future this should be tweaked to compute bytes for optimize_size and ++ take into account performance of various instructions on various CPUs. */ ++ ++static int ++ix86_fp_comparison_cost (enum rtx_code code) ++{ ++ int arith_cost; ++ ++ /* The cost of code using bit-twiddling on %ah. */ ++ switch (code) ++ { ++ case UNLE: ++ case UNLT: ++ case LTGT: ++ case GT: ++ case GE: ++ case UNORDERED: ++ case ORDERED: ++ case UNEQ: ++ arith_cost = 4; ++ break; ++ case LT: ++ case NE: ++ case EQ: ++ case UNGE: ++ arith_cost = TARGET_IEEE_FP ? 5 : 4; ++ break; ++ case LE: ++ case UNGT: ++ arith_cost = TARGET_IEEE_FP ? 6 : 4; ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ ++ switch (ix86_fp_comparison_strategy (code)) ++ { ++ case IX86_FPCMP_COMI: ++ return arith_cost > 4 ? 3 : 2; ++ case IX86_FPCMP_SAHF: ++ return arith_cost > 4 ? 4 : 3; ++ default: ++ return arith_cost; ++ } ++} ++ ++/* Swap, force into registers, or otherwise massage the two operands ++ to a fp comparison. The operands are updated in place; the new ++ comparison code is returned. 
*/ ++ ++static enum rtx_code ++ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1) ++{ ++ bool unordered_compare = ix86_unordered_fp_compare (code); ++ rtx op0 = *pop0, op1 = *pop1; ++ machine_mode op_mode = GET_MODE (op0); ++ bool is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode); ++ ++ /* All of the unordered compare instructions only work on registers. ++ The same is true of the fcomi compare instructions. The XFmode ++ compare instructions require registers except when comparing ++ against zero or when converting operand 1 from fixed point to ++ floating point. */ ++ ++ if (!is_sse ++ && (unordered_compare ++ || (op_mode == XFmode ++ && ! (standard_80387_constant_p (op0) == 1 ++ || standard_80387_constant_p (op1) == 1) ++ && GET_CODE (op1) != FLOAT) ++ || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI)) ++ { ++ op0 = force_reg (op_mode, op0); ++ op1 = force_reg (op_mode, op1); ++ } ++ else ++ { ++ /* %%% We only allow op1 in memory; op0 must be st(0). So swap ++ things around if they appear profitable, otherwise force op0 ++ into a register. */ ++ ++ if (standard_80387_constant_p (op0) == 0 ++ || (MEM_P (op0) ++ && ! (standard_80387_constant_p (op1) == 0 ++ || MEM_P (op1)))) ++ { ++ enum rtx_code new_code = ix86_fp_swap_condition (code); ++ if (new_code != UNKNOWN) ++ { ++ std::swap (op0, op1); ++ code = new_code; ++ } ++ } ++ ++ if (!REG_P (op0)) ++ op0 = force_reg (op_mode, op0); ++ ++ if (CONSTANT_P (op1)) ++ { ++ int tmp = standard_80387_constant_p (op1); ++ if (tmp == 0) ++ op1 = validize_mem (force_const_mem (op_mode, op1)); ++ else if (tmp == 1) ++ { ++ if (TARGET_CMOVE) ++ op1 = force_reg (op_mode, op1); ++ } ++ else ++ op1 = force_reg (op_mode, op1); ++ } ++ } ++ ++ /* Try to rearrange the comparison to make it cheaper. */ ++ if (ix86_fp_comparison_cost (code) ++ > ix86_fp_comparison_cost (swap_condition (code)) ++ && (REG_P (op1) || can_create_pseudo_p ())) ++ { ++ std::swap (op0, op1); ++ code = swap_condition (code); ++ if (!REG_P (op0)) ++ op0 = force_reg (op_mode, op0); ++ } ++ ++ *pop0 = op0; ++ *pop1 = op1; ++ return code; ++} ++ ++/* Generate insn patterns to do a floating point compare of OPERANDS. */ ++ ++static rtx ++ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1) ++{ ++ bool unordered_compare = ix86_unordered_fp_compare (code); ++ machine_mode cmp_mode; ++ rtx tmp, scratch; ++ ++ code = ix86_prepare_fp_compare_args (code, &op0, &op1); ++ ++ tmp = gen_rtx_COMPARE (CCFPmode, op0, op1); ++ if (unordered_compare) ++ tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP); ++ ++ /* Do fcomi/sahf based test when profitable. */ ++ switch (ix86_fp_comparison_strategy (code)) ++ { ++ case IX86_FPCMP_COMI: ++ cmp_mode = CCFPmode; ++ emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp)); ++ break; ++ ++ case IX86_FPCMP_SAHF: ++ cmp_mode = CCFPmode; ++ tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW); ++ scratch = gen_reg_rtx (HImode); ++ emit_insn (gen_rtx_SET (scratch, tmp)); ++ emit_insn (gen_x86_sahf_1 (scratch)); ++ break; ++ ++ case IX86_FPCMP_ARITH: ++ cmp_mode = CCNOmode; ++ tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW); ++ scratch = gen_reg_rtx (HImode); ++ emit_insn (gen_rtx_SET (scratch, tmp)); ++ ++ /* In the unordered case, we have to check C2 for NaN's, which ++ doesn't happen to work out to anything nice combination-wise. ++ So do some bit twiddling on the value we've got in AH to come ++ up with an appropriate set of condition codes. 
*/ ++ ++ switch (code) ++ { ++ case GT: ++ case UNGT: ++ if (code == GT || !TARGET_IEEE_FP) ++ { ++ emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45))); ++ code = EQ; ++ } ++ else ++ { ++ emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45))); ++ emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx)); ++ emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44))); ++ cmp_mode = CCmode; ++ code = GEU; ++ } ++ break; ++ case LT: ++ case UNLT: ++ if (code == LT && TARGET_IEEE_FP) ++ { ++ emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45))); ++ emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx)); ++ cmp_mode = CCmode; ++ code = EQ; ++ } ++ else ++ { ++ emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx)); ++ code = NE; ++ } ++ break; ++ case GE: ++ case UNGE: ++ if (code == GE || !TARGET_IEEE_FP) ++ { ++ emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05))); ++ code = EQ; ++ } ++ else ++ { ++ emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45))); ++ emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx)); ++ code = NE; ++ } ++ break; ++ case LE: ++ case UNLE: ++ if (code == LE && TARGET_IEEE_FP) ++ { ++ emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45))); ++ emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx)); ++ emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40))); ++ cmp_mode = CCmode; ++ code = LTU; ++ } ++ else ++ { ++ emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45))); ++ code = NE; ++ } ++ break; ++ case EQ: ++ case UNEQ: ++ if (code == EQ && TARGET_IEEE_FP) ++ { ++ emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45))); ++ emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40))); ++ cmp_mode = CCmode; ++ code = EQ; ++ } ++ else ++ { ++ emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40))); ++ code = NE; ++ } ++ break; ++ case NE: ++ case LTGT: ++ if (code == NE && TARGET_IEEE_FP) ++ { ++ emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45))); ++ emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, ++ GEN_INT (0x40))); ++ code = NE; ++ } ++ else ++ { ++ emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40))); ++ code = EQ; ++ } ++ break; ++ ++ case UNORDERED: ++ emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04))); ++ code = NE; ++ break; ++ case ORDERED: ++ emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04))); ++ code = EQ; ++ break; ++ ++ default: ++ gcc_unreachable (); ++ } ++ break; ++ ++ default: ++ gcc_unreachable(); ++ } ++ ++ /* Return the test that should be put into the flags user, i.e. ++ the bcc, scc, or cmov instruction. */ ++ return gen_rtx_fmt_ee (code, VOIDmode, ++ gen_rtx_REG (cmp_mode, FLAGS_REG), ++ const0_rtx); ++} ++ ++/* Generate insn patterns to do an integer compare of OPERANDS. */ ++ ++static rtx ++ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1) ++{ ++ machine_mode cmpmode; ++ rtx tmp, flags; ++ ++ cmpmode = SELECT_CC_MODE (code, op0, op1); ++ flags = gen_rtx_REG (cmpmode, FLAGS_REG); ++ ++ /* This is very simple, but making the interface the same as in the ++ FP case makes the rest of the code easier. */ ++ tmp = gen_rtx_COMPARE (cmpmode, op0, op1); ++ emit_insn (gen_rtx_SET (flags, tmp)); ++ ++ /* Return the test that should be put into the flags user, i.e. ++ the bcc, scc, or cmov instruction. 
*/ ++ return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx); ++} ++ ++static rtx ++ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1) ++{ ++ rtx ret; ++ ++ if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC) ++ ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1); ++ ++ else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0))) ++ { ++ gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0))); ++ ret = ix86_expand_fp_compare (code, op0, op1); ++ } ++ else ++ ret = ix86_expand_int_compare (code, op0, op1); ++ ++ return ret; ++} ++ ++void ++ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1) ++{ ++ rtx ret; ++ ++ gcc_assert (GET_MODE (dest) == QImode); ++ ++ ret = ix86_expand_compare (code, op0, op1); ++ PUT_MODE (ret, QImode); ++ emit_insn (gen_rtx_SET (dest, ret)); ++} ++ ++/* Expand comparison setting or clearing carry flag. Return true when ++ successful and set pop for the operation. */ ++static bool ++ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop) ++{ ++ machine_mode mode ++ = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1); ++ ++ /* Do not handle double-mode compares that go through special path. */ ++ if (mode == (TARGET_64BIT ? TImode : DImode)) ++ return false; ++ ++ if (SCALAR_FLOAT_MODE_P (mode)) ++ { ++ rtx compare_op; ++ rtx_insn *compare_seq; ++ ++ gcc_assert (!DECIMAL_FLOAT_MODE_P (mode)); ++ ++ /* Shortcut: following common codes never translate ++ into carry flag compares. */ ++ if (code == EQ || code == NE || code == UNEQ || code == LTGT ++ || code == ORDERED || code == UNORDERED) ++ return false; ++ ++ /* These comparisons require zero flag; swap operands so they won't. */ ++ if ((code == GT || code == UNLE || code == LE || code == UNGT) ++ && !TARGET_IEEE_FP) ++ { ++ std::swap (op0, op1); ++ code = swap_condition (code); ++ } ++ ++ /* Try to expand the comparison and verify that we end up with ++ carry flag based comparison. This fails to be true only when ++ we decide to expand comparison using arithmetic that is not ++ too common scenario. */ ++ start_sequence (); ++ compare_op = ix86_expand_fp_compare (code, op0, op1); ++ compare_seq = get_insns (); ++ end_sequence (); ++ ++ if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode) ++ code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op)); ++ else ++ code = GET_CODE (compare_op); ++ ++ if (code != LTU && code != GEU) ++ return false; ++ ++ emit_insn (compare_seq); ++ *pop = compare_op; ++ return true; ++ } ++ ++ if (!INTEGRAL_MODE_P (mode)) ++ return false; ++ ++ switch (code) ++ { ++ case LTU: ++ case GEU: ++ break; ++ ++ /* Convert a==0 into (unsigned)a<1. */ ++ case EQ: ++ case NE: ++ if (op1 != const0_rtx) ++ return false; ++ op1 = const1_rtx; ++ code = (code == EQ ? LTU : GEU); ++ break; ++ ++ /* Convert a>b into b=b-1. */ ++ case GTU: ++ case LEU: ++ if (CONST_INT_P (op1)) ++ { ++ op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0)); ++ /* Bail out on overflow. We still can swap operands but that ++ would force loading of the constant into register. */ ++ if (op1 == const0_rtx ++ || !x86_64_immediate_operand (op1, GET_MODE (op1))) ++ return false; ++ code = (code == GTU ? GEU : LTU); ++ } ++ else ++ { ++ std::swap (op0, op1); ++ code = (code == GTU ? LTU : GEU); ++ } ++ break; ++ ++ /* Convert a>=0 into (unsigned)a<0x80000000. */ ++ case LT: ++ case GE: ++ if (mode == DImode || op1 != const0_rtx) ++ return false; ++ op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode); ++ code = (code == LT ? 
GEU : LTU); ++ break; ++ case LE: ++ case GT: ++ if (mode == DImode || op1 != constm1_rtx) ++ return false; ++ op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode); ++ code = (code == LE ? GEU : LTU); ++ break; ++ ++ default: ++ return false; ++ } ++ /* Swapping operands may cause constant to appear as first operand. */ ++ if (!nonimmediate_operand (op0, VOIDmode)) ++ { ++ if (!can_create_pseudo_p ()) ++ return false; ++ op0 = force_reg (mode, op0); ++ } ++ *pop = ix86_expand_compare (code, op0, op1); ++ gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU); ++ return true; ++} ++ ++/* Expand conditional increment or decrement using adb/sbb instructions. ++ The default case using setcc followed by the conditional move can be ++ done by generic code. */ ++bool ++ix86_expand_int_addcc (rtx operands[]) ++{ ++ enum rtx_code code = GET_CODE (operands[1]); ++ rtx flags; ++ rtx (*insn)(rtx, rtx, rtx, rtx, rtx); ++ rtx compare_op; ++ rtx val = const0_rtx; ++ bool fpcmp = false; ++ machine_mode mode; ++ rtx op0 = XEXP (operands[1], 0); ++ rtx op1 = XEXP (operands[1], 1); ++ ++ if (operands[3] != const1_rtx ++ && operands[3] != constm1_rtx) ++ return false; ++ if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op)) ++ return false; ++ code = GET_CODE (compare_op); ++ ++ flags = XEXP (compare_op, 0); ++ ++ if (GET_MODE (flags) == CCFPmode) ++ { ++ fpcmp = true; ++ code = ix86_fp_compare_code_to_integer (code); ++ } ++ ++ if (code != LTU) ++ { ++ val = constm1_rtx; ++ if (fpcmp) ++ PUT_CODE (compare_op, ++ reverse_condition_maybe_unordered ++ (GET_CODE (compare_op))); ++ else ++ PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op))); ++ } ++ ++ mode = GET_MODE (operands[0]); ++ ++ /* Construct either adc or sbb insn. */ ++ if ((code == LTU) == (operands[3] == constm1_rtx)) ++ { ++ switch (mode) ++ { ++ case E_QImode: ++ insn = gen_subqi3_carry; ++ break; ++ case E_HImode: ++ insn = gen_subhi3_carry; ++ break; ++ case E_SImode: ++ insn = gen_subsi3_carry; ++ break; ++ case E_DImode: ++ insn = gen_subdi3_carry; ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ } ++ else ++ { ++ switch (mode) ++ { ++ case E_QImode: ++ insn = gen_addqi3_carry; ++ break; ++ case E_HImode: ++ insn = gen_addhi3_carry; ++ break; ++ case E_SImode: ++ insn = gen_addsi3_carry; ++ break; ++ case E_DImode: ++ insn = gen_adddi3_carry; ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ } ++ emit_insn (insn (operands[0], operands[2], val, flags, compare_op)); ++ ++ return true; ++} ++ ++bool ++ix86_expand_int_movcc (rtx operands[]) ++{ ++ enum rtx_code code = GET_CODE (operands[1]), compare_code; ++ rtx_insn *compare_seq; ++ rtx compare_op; ++ machine_mode mode = GET_MODE (operands[0]); ++ bool sign_bit_compare_p = false; ++ rtx op0 = XEXP (operands[1], 0); ++ rtx op1 = XEXP (operands[1], 1); ++ ++ if (GET_MODE (op0) == TImode ++ || (GET_MODE (op0) == DImode ++ && !TARGET_64BIT)) ++ return false; ++ ++ start_sequence (); ++ compare_op = ix86_expand_compare (code, op0, op1); ++ compare_seq = get_insns (); ++ end_sequence (); ++ ++ compare_code = GET_CODE (compare_op); ++ ++ if ((op1 == const0_rtx && (code == GE || code == LT)) ++ || (op1 == constm1_rtx && (code == GT || code == LE))) ++ sign_bit_compare_p = true; ++ ++ /* Don't attempt mode expansion here -- if we had to expand 5 or 6 ++ HImode insns, we'd be swallowed in word prefix ops. */ ++ ++ if ((mode != HImode || TARGET_FAST_PREFIX) ++ && (mode != (TARGET_64BIT ? 
TImode : DImode)) ++ && CONST_INT_P (operands[2]) ++ && CONST_INT_P (operands[3])) ++ { ++ rtx out = operands[0]; ++ HOST_WIDE_INT ct = INTVAL (operands[2]); ++ HOST_WIDE_INT cf = INTVAL (operands[3]); ++ HOST_WIDE_INT diff; ++ ++ diff = ct - cf; ++ /* Sign bit compares are better done using shifts than we do by using ++ sbb. */ ++ if (sign_bit_compare_p ++ || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op)) ++ { ++ /* Detect overlap between destination and compare sources. */ ++ rtx tmp = out; ++ ++ if (!sign_bit_compare_p) ++ { ++ rtx flags; ++ bool fpcmp = false; ++ ++ compare_code = GET_CODE (compare_op); ++ ++ flags = XEXP (compare_op, 0); ++ ++ if (GET_MODE (flags) == CCFPmode) ++ { ++ fpcmp = true; ++ compare_code ++ = ix86_fp_compare_code_to_integer (compare_code); ++ } ++ ++ /* To simplify rest of code, restrict to the GEU case. */ ++ if (compare_code == LTU) ++ { ++ std::swap (ct, cf); ++ compare_code = reverse_condition (compare_code); ++ code = reverse_condition (code); ++ } ++ else ++ { ++ if (fpcmp) ++ PUT_CODE (compare_op, ++ reverse_condition_maybe_unordered ++ (GET_CODE (compare_op))); ++ else ++ PUT_CODE (compare_op, ++ reverse_condition (GET_CODE (compare_op))); ++ } ++ diff = ct - cf; ++ ++ if (reg_overlap_mentioned_p (out, op0) ++ || reg_overlap_mentioned_p (out, op1)) ++ tmp = gen_reg_rtx (mode); ++ ++ if (mode == DImode) ++ emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op)); ++ else ++ emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp), ++ flags, compare_op)); ++ } ++ else ++ { ++ if (code == GT || code == GE) ++ code = reverse_condition (code); ++ else ++ { ++ std::swap (ct, cf); ++ diff = ct - cf; ++ } ++ tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1); ++ } ++ ++ if (diff == 1) ++ { ++ /* ++ * cmpl op0,op1 ++ * sbbl dest,dest ++ * [addl dest, ct] ++ * ++ * Size 5 - 8. ++ */ ++ if (ct) ++ tmp = expand_simple_binop (mode, PLUS, ++ tmp, GEN_INT (ct), ++ copy_rtx (tmp), 1, OPTAB_DIRECT); ++ } ++ else if (cf == -1) ++ { ++ /* ++ * cmpl op0,op1 ++ * sbbl dest,dest ++ * orl $ct, dest ++ * ++ * Size 8. ++ */ ++ tmp = expand_simple_binop (mode, IOR, ++ tmp, GEN_INT (ct), ++ copy_rtx (tmp), 1, OPTAB_DIRECT); ++ } ++ else if (diff == -1 && ct) ++ { ++ /* ++ * cmpl op0,op1 ++ * sbbl dest,dest ++ * notl dest ++ * [addl dest, cf] ++ * ++ * Size 8 - 11. ++ */ ++ tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1); ++ if (cf) ++ tmp = expand_simple_binop (mode, PLUS, ++ copy_rtx (tmp), GEN_INT (cf), ++ copy_rtx (tmp), 1, OPTAB_DIRECT); ++ } ++ else ++ { ++ /* ++ * cmpl op0,op1 ++ * sbbl dest,dest ++ * [notl dest] ++ * andl cf - ct, dest ++ * [addl dest, ct] ++ * ++ * Size 8 - 11. 
++ */ ++ ++ if (cf == 0) ++ { ++ cf = ct; ++ ct = 0; ++ tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1); ++ } ++ ++ tmp = expand_simple_binop (mode, AND, ++ copy_rtx (tmp), ++ gen_int_mode (cf - ct, mode), ++ copy_rtx (tmp), 1, OPTAB_DIRECT); ++ if (ct) ++ tmp = expand_simple_binop (mode, PLUS, ++ copy_rtx (tmp), GEN_INT (ct), ++ copy_rtx (tmp), 1, OPTAB_DIRECT); ++ } ++ ++ if (!rtx_equal_p (tmp, out)) ++ emit_move_insn (copy_rtx (out), copy_rtx (tmp)); ++ ++ return true; ++ } ++ ++ if (diff < 0) ++ { ++ machine_mode cmp_mode = GET_MODE (op0); ++ enum rtx_code new_code; ++ ++ if (SCALAR_FLOAT_MODE_P (cmp_mode)) ++ { ++ gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode)); ++ ++ /* We may be reversing unordered compare to normal compare, that ++ is not valid in general (we may convert non-trapping condition ++ to trapping one), however on i386 we currently emit all ++ comparisons unordered. */ ++ new_code = reverse_condition_maybe_unordered (code); ++ } ++ else ++ new_code = ix86_reverse_condition (code, cmp_mode); ++ if (new_code != UNKNOWN) ++ { ++ std::swap (ct, cf); ++ diff = -diff; ++ code = new_code; ++ } ++ } ++ ++ compare_code = UNKNOWN; ++ if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT ++ && CONST_INT_P (op1)) ++ { ++ if (op1 == const0_rtx ++ && (code == LT || code == GE)) ++ compare_code = code; ++ else if (op1 == constm1_rtx) ++ { ++ if (code == LE) ++ compare_code = LT; ++ else if (code == GT) ++ compare_code = GE; ++ } ++ } ++ ++ /* Optimize dest = (op0 < 0) ? -1 : cf. */ ++ if (compare_code != UNKNOWN ++ && GET_MODE (op0) == GET_MODE (out) ++ && (cf == -1 || ct == -1)) ++ { ++ /* If lea code below could be used, only optimize ++ if it results in a 2 insn sequence. */ ++ ++ if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8 ++ || diff == 3 || diff == 5 || diff == 9) ++ || (compare_code == LT && ct == -1) ++ || (compare_code == GE && cf == -1)) ++ { ++ /* ++ * notl op1 (if necessary) ++ * sarl $31, op1 ++ * orl cf, op1 ++ */ ++ if (ct != -1) ++ { ++ cf = ct; ++ ct = -1; ++ code = reverse_condition (code); ++ } ++ ++ out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1); ++ ++ out = expand_simple_binop (mode, IOR, ++ out, GEN_INT (cf), ++ out, 1, OPTAB_DIRECT); ++ if (out != operands[0]) ++ emit_move_insn (operands[0], out); ++ ++ return true; ++ } ++ } ++ ++ ++ if ((diff == 1 || diff == 2 || diff == 4 || diff == 8 ++ || diff == 3 || diff == 5 || diff == 9) ++ && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL) ++ && (mode != DImode ++ || x86_64_immediate_operand (GEN_INT (cf), VOIDmode))) ++ { ++ /* ++ * xorl dest,dest ++ * cmpl op1,op2 ++ * setcc dest ++ * lea cf(dest*(ct-cf)),dest ++ * ++ * Size 14. ++ * ++ * This also catches the degenerate setcc-only case. ++ */ ++ ++ rtx tmp; ++ int nops; ++ ++ out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1); ++ ++ nops = 0; ++ /* On x86_64 the lea instruction operates on Pmode, so we need ++ to get arithmetics done in proper mode to match. 
*/ ++ if (diff == 1) ++ tmp = copy_rtx (out); ++ else ++ { ++ rtx out1; ++ out1 = copy_rtx (out); ++ tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1)); ++ nops++; ++ if (diff & 1) ++ { ++ tmp = gen_rtx_PLUS (mode, tmp, out1); ++ nops++; ++ } ++ } ++ if (cf != 0) ++ { ++ tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf)); ++ nops++; ++ } ++ if (!rtx_equal_p (tmp, out)) ++ { ++ if (nops == 1) ++ out = force_operand (tmp, copy_rtx (out)); ++ else ++ emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp))); ++ } ++ if (!rtx_equal_p (out, operands[0])) ++ emit_move_insn (operands[0], copy_rtx (out)); ++ ++ return true; ++ } ++ ++ /* ++ * General case: Jumpful: ++ * xorl dest,dest cmpl op1, op2 ++ * cmpl op1, op2 movl ct, dest ++ * setcc dest jcc 1f ++ * decl dest movl cf, dest ++ * andl (cf-ct),dest 1: ++ * addl ct,dest ++ * ++ * Size 20. Size 14. ++ * ++ * This is reasonably steep, but branch mispredict costs are ++ * high on modern cpus, so consider failing only if optimizing ++ * for space. ++ */ ++ ++ if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL)) ++ && BRANCH_COST (optimize_insn_for_speed_p (), ++ false) >= 2) ++ { ++ if (cf == 0) ++ { ++ machine_mode cmp_mode = GET_MODE (op0); ++ enum rtx_code new_code; ++ ++ if (SCALAR_FLOAT_MODE_P (cmp_mode)) ++ { ++ gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode)); ++ ++ /* We may be reversing unordered compare to normal compare, ++ that is not valid in general (we may convert non-trapping ++ condition to trapping one), however on i386 we currently ++ emit all comparisons unordered. */ ++ new_code = reverse_condition_maybe_unordered (code); ++ } ++ else ++ { ++ new_code = ix86_reverse_condition (code, cmp_mode); ++ if (compare_code != UNKNOWN && new_code != UNKNOWN) ++ compare_code = reverse_condition (compare_code); ++ } ++ ++ if (new_code != UNKNOWN) ++ { ++ cf = ct; ++ ct = 0; ++ code = new_code; ++ } ++ } ++ ++ if (compare_code != UNKNOWN) ++ { ++ /* notl op1 (if needed) ++ sarl $31, op1 ++ andl (cf-ct), op1 ++ addl ct, op1 ++ ++ For x < 0 (resp. x <= -1) there will be no notl, ++ so if possible swap the constants to get rid of the ++ complement. ++ True/false will be -1/0 while code below (store flag ++ followed by decrement) is 0/-1, so the constants need ++ to be exchanged once more. */ ++ ++ if (compare_code == GE || !cf) ++ { ++ code = reverse_condition (code); ++ compare_code = LT; ++ } ++ else ++ std::swap (ct, cf); ++ ++ out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1); ++ } ++ else ++ { ++ out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1); ++ ++ out = expand_simple_binop (mode, PLUS, copy_rtx (out), ++ constm1_rtx, ++ copy_rtx (out), 1, OPTAB_DIRECT); ++ } ++ ++ out = expand_simple_binop (mode, AND, copy_rtx (out), ++ gen_int_mode (cf - ct, mode), ++ copy_rtx (out), 1, OPTAB_DIRECT); ++ if (ct) ++ out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct), ++ copy_rtx (out), 1, OPTAB_DIRECT); ++ if (!rtx_equal_p (out, operands[0])) ++ emit_move_insn (operands[0], copy_rtx (out)); ++ ++ return true; ++ } ++ } ++ ++ if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL)) ++ { ++ /* Try a few things more with specific constants and a variable. */ ++ ++ optab op; ++ rtx var, orig_out, out, tmp; ++ ++ if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2) ++ return false; ++ ++ /* If one of the two operands is an interesting constant, load a ++ constant with the above and mask it in with a logical operation. 
*/ ++ ++ if (CONST_INT_P (operands[2])) ++ { ++ var = operands[3]; ++ if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx) ++ operands[3] = constm1_rtx, op = and_optab; ++ else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx) ++ operands[3] = const0_rtx, op = ior_optab; ++ else ++ return false; ++ } ++ else if (CONST_INT_P (operands[3])) ++ { ++ var = operands[2]; ++ if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx) ++ operands[2] = constm1_rtx, op = and_optab; ++ else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx) ++ operands[2] = const0_rtx, op = ior_optab; ++ else ++ return false; ++ } ++ else ++ return false; ++ ++ orig_out = operands[0]; ++ tmp = gen_reg_rtx (mode); ++ operands[0] = tmp; ++ ++ /* Recurse to get the constant loaded. */ ++ if (!ix86_expand_int_movcc (operands)) ++ return false; ++ ++ /* Mask in the interesting variable. */ ++ out = expand_binop (mode, op, var, tmp, orig_out, 0, ++ OPTAB_WIDEN); ++ if (!rtx_equal_p (out, orig_out)) ++ emit_move_insn (copy_rtx (orig_out), copy_rtx (out)); ++ ++ return true; ++ } ++ ++ /* ++ * For comparison with above, ++ * ++ * movl cf,dest ++ * movl ct,tmp ++ * cmpl op1,op2 ++ * cmovcc tmp,dest ++ * ++ * Size 15. ++ */ ++ ++ if (! nonimmediate_operand (operands[2], mode)) ++ operands[2] = force_reg (mode, operands[2]); ++ if (! nonimmediate_operand (operands[3], mode)) ++ operands[3] = force_reg (mode, operands[3]); ++ ++ if (! register_operand (operands[2], VOIDmode) ++ && (mode == QImode ++ || ! register_operand (operands[3], VOIDmode))) ++ operands[2] = force_reg (mode, operands[2]); ++ ++ if (mode == QImode ++ && ! register_operand (operands[3], VOIDmode)) ++ operands[3] = force_reg (mode, operands[3]); ++ ++ emit_insn (compare_seq); ++ emit_insn (gen_rtx_SET (operands[0], ++ gen_rtx_IF_THEN_ELSE (mode, ++ compare_op, operands[2], ++ operands[3]))); ++ return true; ++} ++ ++/* Detect conditional moves that exactly match min/max operational ++ semantics. Note that this is IEEE safe, as long as we don't ++ interchange the operands. ++ ++ Returns FALSE if this conditional move doesn't match a MIN/MAX, ++ and TRUE if the operation is successful and instructions are emitted. */ ++ ++static bool ++ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0, ++ rtx cmp_op1, rtx if_true, rtx if_false) ++{ ++ machine_mode mode; ++ bool is_min; ++ rtx tmp; ++ ++ if (code == LT) ++ ; ++ else if (code == UNGE) ++ std::swap (if_true, if_false); ++ else ++ return false; ++ ++ if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false)) ++ is_min = true; ++ else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false)) ++ is_min = false; ++ else ++ return false; ++ ++ mode = GET_MODE (dest); ++ ++ /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here, ++ but MODE may be a vector mode and thus not appropriate. */ ++ if (!flag_finite_math_only || flag_signed_zeros) ++ { ++ int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX; ++ rtvec v; ++ ++ if_true = force_reg (mode, if_true); ++ v = gen_rtvec (2, if_true, if_false); ++ tmp = gen_rtx_UNSPEC (mode, v, u); ++ } ++ else ++ { ++ code = is_min ? SMIN : SMAX; ++ if (MEM_P (if_true) && MEM_P (if_false)) ++ if_true = force_reg (mode, if_true); ++ tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false); ++ } ++ ++ emit_insn (gen_rtx_SET (dest, tmp)); ++ return true; ++} ++ ++/* Expand an SSE comparison. Return the register with the result. 
*/ ++ ++static rtx ++ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1, ++ rtx op_true, rtx op_false) ++{ ++ machine_mode mode = GET_MODE (dest); ++ machine_mode cmp_ops_mode = GET_MODE (cmp_op0); ++ ++ /* In general case result of comparison can differ from operands' type. */ ++ machine_mode cmp_mode; ++ ++ /* In AVX512F the result of comparison is an integer mask. */ ++ bool maskcmp = false; ++ rtx x; ++ ++ if (GET_MODE_SIZE (cmp_ops_mode) == 64) ++ { ++ unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode); ++ cmp_mode = int_mode_for_size (nbits, 0).require (); ++ maskcmp = true; ++ } ++ else ++ cmp_mode = cmp_ops_mode; ++ ++ cmp_op0 = force_reg (cmp_ops_mode, cmp_op0); ++ ++ int (*op1_predicate)(rtx, machine_mode) ++ = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand; ++ ++ if (!op1_predicate (cmp_op1, cmp_ops_mode)) ++ cmp_op1 = force_reg (cmp_ops_mode, cmp_op1); ++ ++ if (optimize ++ || (maskcmp && cmp_mode != mode) ++ || (op_true && reg_overlap_mentioned_p (dest, op_true)) ++ || (op_false && reg_overlap_mentioned_p (dest, op_false))) ++ dest = gen_reg_rtx (maskcmp ? cmp_mode : mode); ++ ++ /* Compare patterns for int modes are unspec in AVX512F only. */ ++ if (maskcmp && (code == GT || code == EQ)) ++ { ++ rtx (*gen)(rtx, rtx, rtx); ++ ++ switch (cmp_ops_mode) ++ { ++ case E_V64QImode: ++ gcc_assert (TARGET_AVX512BW); ++ gen = code == GT ? gen_avx512bw_gtv64qi3 : gen_avx512bw_eqv64qi3_1; ++ break; ++ case E_V32HImode: ++ gcc_assert (TARGET_AVX512BW); ++ gen = code == GT ? gen_avx512bw_gtv32hi3 : gen_avx512bw_eqv32hi3_1; ++ break; ++ case E_V16SImode: ++ gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1; ++ break; ++ case E_V8DImode: ++ gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1; ++ break; ++ default: ++ gen = NULL; ++ } ++ ++ if (gen) ++ { ++ emit_insn (gen (dest, cmp_op0, cmp_op1)); ++ return dest; ++ } ++ } ++ x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1); ++ ++ if (cmp_mode != mode && !maskcmp) ++ { ++ x = force_reg (cmp_ops_mode, x); ++ convert_move (dest, x, false); ++ } ++ else ++ emit_insn (gen_rtx_SET (dest, x)); ++ ++ return dest; ++} ++ ++/* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical ++ operations. This is used for both scalar and vector conditional moves. */ ++ ++void ++ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) ++{ ++ machine_mode mode = GET_MODE (dest); ++ machine_mode cmpmode = GET_MODE (cmp); ++ ++ /* In AVX512F the result of comparison is an integer mask. */ ++ bool maskcmp = (mode != cmpmode && TARGET_AVX512F); ++ ++ rtx t2, t3, x; ++ ++ /* If we have an integer mask and FP value then we need ++ to cast mask to FP mode. 
*/ ++ if (mode != cmpmode && VECTOR_MODE_P (cmpmode)) ++ { ++ cmp = force_reg (cmpmode, cmp); ++ cmp = gen_rtx_SUBREG (mode, cmp, 0); ++ } ++ ++ if (maskcmp) ++ { ++ rtx (*gen) (rtx, rtx) = NULL; ++ if ((op_true == CONST0_RTX (mode) ++ && vector_all_ones_operand (op_false, mode)) ++ || (op_false == CONST0_RTX (mode) ++ && vector_all_ones_operand (op_true, mode))) ++ switch (mode) ++ { ++ case E_V64QImode: ++ if (TARGET_AVX512BW) ++ gen = gen_avx512bw_cvtmask2bv64qi; ++ break; ++ case E_V32QImode: ++ if (TARGET_AVX512VL && TARGET_AVX512BW) ++ gen = gen_avx512vl_cvtmask2bv32qi; ++ break; ++ case E_V16QImode: ++ if (TARGET_AVX512VL && TARGET_AVX512BW) ++ gen = gen_avx512vl_cvtmask2bv16qi; ++ break; ++ case E_V32HImode: ++ if (TARGET_AVX512BW) ++ gen = gen_avx512bw_cvtmask2wv32hi; ++ break; ++ case E_V16HImode: ++ if (TARGET_AVX512VL && TARGET_AVX512BW) ++ gen = gen_avx512vl_cvtmask2wv16hi; ++ break; ++ case E_V8HImode: ++ if (TARGET_AVX512VL && TARGET_AVX512BW) ++ gen = gen_avx512vl_cvtmask2wv8hi; ++ break; ++ case E_V16SImode: ++ if (TARGET_AVX512DQ) ++ gen = gen_avx512f_cvtmask2dv16si; ++ break; ++ case E_V8SImode: ++ if (TARGET_AVX512VL && TARGET_AVX512DQ) ++ gen = gen_avx512vl_cvtmask2dv8si; ++ break; ++ case E_V4SImode: ++ if (TARGET_AVX512VL && TARGET_AVX512DQ) ++ gen = gen_avx512vl_cvtmask2dv4si; ++ break; ++ case E_V8DImode: ++ if (TARGET_AVX512DQ) ++ gen = gen_avx512f_cvtmask2qv8di; ++ break; ++ case E_V4DImode: ++ if (TARGET_AVX512VL && TARGET_AVX512DQ) ++ gen = gen_avx512vl_cvtmask2qv4di; ++ break; ++ case E_V2DImode: ++ if (TARGET_AVX512VL && TARGET_AVX512DQ) ++ gen = gen_avx512vl_cvtmask2qv2di; ++ break; ++ default: ++ break; ++ } ++ if (gen && SCALAR_INT_MODE_P (cmpmode)) ++ { ++ cmp = force_reg (cmpmode, cmp); ++ if (op_true == CONST0_RTX (mode)) ++ { ++ rtx (*gen_not) (rtx, rtx); ++ switch (cmpmode) ++ { ++ case E_QImode: gen_not = gen_knotqi; break; ++ case E_HImode: gen_not = gen_knothi; break; ++ case E_SImode: gen_not = gen_knotsi; break; ++ case E_DImode: gen_not = gen_knotdi; break; ++ default: gcc_unreachable (); ++ } ++ rtx n = gen_reg_rtx (cmpmode); ++ emit_insn (gen_not (n, cmp)); ++ cmp = n; ++ } ++ emit_insn (gen (dest, cmp)); ++ return; ++ } ++ } ++ else if (vector_all_ones_operand (op_true, mode) ++ && op_false == CONST0_RTX (mode)) ++ { ++ emit_insn (gen_rtx_SET (dest, cmp)); ++ return; ++ } ++ else if (op_false == CONST0_RTX (mode)) ++ { ++ op_true = force_reg (mode, op_true); ++ x = gen_rtx_AND (mode, cmp, op_true); ++ emit_insn (gen_rtx_SET (dest, x)); ++ return; ++ } ++ else if (op_true == CONST0_RTX (mode)) ++ { ++ op_false = force_reg (mode, op_false); ++ x = gen_rtx_NOT (mode, cmp); ++ x = gen_rtx_AND (mode, x, op_false); ++ emit_insn (gen_rtx_SET (dest, x)); ++ return; ++ } ++ else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode)) ++ { ++ op_false = force_reg (mode, op_false); ++ x = gen_rtx_IOR (mode, cmp, op_false); ++ emit_insn (gen_rtx_SET (dest, x)); ++ return; ++ } ++ else if (TARGET_XOP) ++ { ++ op_true = force_reg (mode, op_true); ++ ++ if (!nonimmediate_operand (op_false, mode)) ++ op_false = force_reg (mode, op_false); ++ ++ emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp, ++ op_true, ++ op_false))); ++ return; ++ } ++ ++ rtx (*gen) (rtx, rtx, rtx, rtx) = NULL; ++ rtx d = dest; ++ ++ if (!vector_operand (op_true, mode)) ++ op_true = force_reg (mode, op_true); ++ ++ op_false = force_reg (mode, op_false); ++ ++ switch (mode) ++ { ++ case E_V4SFmode: ++ if (TARGET_SSE4_1) ++ gen = gen_sse4_1_blendvps; ++ break; ++ case 
E_V2DFmode: ++ if (TARGET_SSE4_1) ++ gen = gen_sse4_1_blendvpd; ++ break; ++ case E_SFmode: ++ if (TARGET_SSE4_1) ++ { ++ gen = gen_sse4_1_blendvss; ++ op_true = force_reg (mode, op_true); ++ } ++ break; ++ case E_DFmode: ++ if (TARGET_SSE4_1) ++ { ++ gen = gen_sse4_1_blendvsd; ++ op_true = force_reg (mode, op_true); ++ } ++ break; ++ case E_V16QImode: ++ case E_V8HImode: ++ case E_V4SImode: ++ case E_V2DImode: ++ if (TARGET_SSE4_1) ++ { ++ gen = gen_sse4_1_pblendvb; ++ if (mode != V16QImode) ++ d = gen_reg_rtx (V16QImode); ++ op_false = gen_lowpart (V16QImode, op_false); ++ op_true = gen_lowpart (V16QImode, op_true); ++ cmp = gen_lowpart (V16QImode, cmp); ++ } ++ break; ++ case E_V8SFmode: ++ if (TARGET_AVX) ++ gen = gen_avx_blendvps256; ++ break; ++ case E_V4DFmode: ++ if (TARGET_AVX) ++ gen = gen_avx_blendvpd256; ++ break; ++ case E_V32QImode: ++ case E_V16HImode: ++ case E_V8SImode: ++ case E_V4DImode: ++ if (TARGET_AVX2) ++ { ++ gen = gen_avx2_pblendvb; ++ if (mode != V32QImode) ++ d = gen_reg_rtx (V32QImode); ++ op_false = gen_lowpart (V32QImode, op_false); ++ op_true = gen_lowpart (V32QImode, op_true); ++ cmp = gen_lowpart (V32QImode, cmp); ++ } ++ break; ++ ++ case E_V64QImode: ++ gen = gen_avx512bw_blendmv64qi; ++ break; ++ case E_V32HImode: ++ gen = gen_avx512bw_blendmv32hi; ++ break; ++ case E_V16SImode: ++ gen = gen_avx512f_blendmv16si; ++ break; ++ case E_V8DImode: ++ gen = gen_avx512f_blendmv8di; ++ break; ++ case E_V8DFmode: ++ gen = gen_avx512f_blendmv8df; ++ break; ++ case E_V16SFmode: ++ gen = gen_avx512f_blendmv16sf; ++ break; ++ ++ default: ++ break; ++ } ++ ++ if (gen != NULL) ++ { ++ emit_insn (gen (d, op_false, op_true, cmp)); ++ if (d != dest) ++ emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d)); ++ } ++ else ++ { ++ op_true = force_reg (mode, op_true); ++ ++ t2 = gen_reg_rtx (mode); ++ if (optimize) ++ t3 = gen_reg_rtx (mode); ++ else ++ t3 = dest; ++ ++ x = gen_rtx_AND (mode, op_true, cmp); ++ emit_insn (gen_rtx_SET (t2, x)); ++ ++ x = gen_rtx_NOT (mode, cmp); ++ x = gen_rtx_AND (mode, x, op_false); ++ emit_insn (gen_rtx_SET (t3, x)); ++ ++ x = gen_rtx_IOR (mode, t3, t2); ++ emit_insn (gen_rtx_SET (dest, x)); ++ } ++} ++ ++/* Swap, force into registers, or otherwise massage the two operands ++ to an sse comparison with a mask result. Thus we differ a bit from ++ ix86_prepare_fp_compare_args which expects to produce a flags result. ++ ++ The DEST operand exists to help determine whether to commute commutative ++ operators. The POP0/POP1 operands are updated in place. The new ++ comparison code is returned, or UNKNOWN if not implementable. */ ++ ++static enum rtx_code ++ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code, ++ rtx *pop0, rtx *pop1) ++{ ++ switch (code) ++ { ++ case LTGT: ++ case UNEQ: ++ /* AVX supports all the needed comparisons. */ ++ if (TARGET_AVX) ++ break; ++ /* We have no LTGT as an operator. We could implement it with ++ NE & ORDERED, but this requires an extra temporary. It's ++ not clear that it's worth it. */ ++ return UNKNOWN; ++ ++ case LT: ++ case LE: ++ case UNGT: ++ case UNGE: ++ /* These are supported directly. */ ++ break; ++ ++ case EQ: ++ case NE: ++ case UNORDERED: ++ case ORDERED: ++ /* AVX has 3 operand comparisons, no need to swap anything. */ ++ if (TARGET_AVX) ++ break; ++ /* For commutative operators, try to canonicalize the destination ++ operand to be first in the comparison - this helps reload to ++ avoid extra moves. 
*/ ++ if (!dest || !rtx_equal_p (dest, *pop1)) ++ break; ++ /* FALLTHRU */ ++ ++ case GE: ++ case GT: ++ case UNLE: ++ case UNLT: ++ /* These are not supported directly before AVX, and furthermore ++ ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the ++ comparison operands to transform into something that is ++ supported. */ ++ std::swap (*pop0, *pop1); ++ code = swap_condition (code); ++ break; ++ ++ default: ++ gcc_unreachable (); ++ } ++ ++ return code; ++} ++ ++/* Expand a floating-point conditional move. Return true if successful. */ ++ ++bool ++ix86_expand_fp_movcc (rtx operands[]) ++{ ++ machine_mode mode = GET_MODE (operands[0]); ++ enum rtx_code code = GET_CODE (operands[1]); ++ rtx tmp, compare_op; ++ rtx op0 = XEXP (operands[1], 0); ++ rtx op1 = XEXP (operands[1], 1); ++ ++ if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode)) ++ { ++ machine_mode cmode; ++ ++ /* Since we've no cmove for sse registers, don't force bad register ++ allocation just to gain access to it. Deny movcc when the ++ comparison mode doesn't match the move mode. */ ++ cmode = GET_MODE (op0); ++ if (cmode == VOIDmode) ++ cmode = GET_MODE (op1); ++ if (cmode != mode) ++ return false; ++ ++ code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1); ++ if (code == UNKNOWN) ++ return false; ++ ++ if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1, ++ operands[2], operands[3])) ++ return true; ++ ++ tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1, ++ operands[2], operands[3]); ++ ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]); ++ return true; ++ } ++ ++ if (GET_MODE (op0) == TImode ++ || (GET_MODE (op0) == DImode ++ && !TARGET_64BIT)) ++ return false; ++ ++ /* The floating point conditional move instructions don't directly ++ support conditions resulting from a signed integer comparison. */ ++ ++ compare_op = ix86_expand_compare (code, op0, op1); ++ if (!fcmov_comparison_operator (compare_op, VOIDmode)) ++ { ++ tmp = gen_reg_rtx (QImode); ++ ix86_expand_setcc (tmp, code, op0, op1); ++ ++ compare_op = ix86_expand_compare (NE, tmp, const0_rtx); ++ } ++ ++ emit_insn (gen_rtx_SET (operands[0], ++ gen_rtx_IF_THEN_ELSE (mode, compare_op, ++ operands[2], operands[3]))); ++ ++ return true; ++} ++ ++/* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */ ++ ++static int ++ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code) ++{ ++ switch (code) ++ { ++ case EQ: ++ return 0; ++ case LT: ++ case LTU: ++ return 1; ++ case LE: ++ case LEU: ++ return 2; ++ case NE: ++ return 4; ++ case GE: ++ case GEU: ++ return 5; ++ case GT: ++ case GTU: ++ return 6; ++ default: ++ gcc_unreachable (); ++ } ++} ++ ++/* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */ ++ ++static int ++ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code) ++{ ++ switch (code) ++ { ++ case EQ: ++ return 0x00; ++ case NE: ++ return 0x04; ++ case GT: ++ return 0x0e; ++ case LE: ++ return 0x02; ++ case GE: ++ return 0x0d; ++ case LT: ++ return 0x01; ++ case UNLE: ++ return 0x0a; ++ case UNLT: ++ return 0x09; ++ case UNGE: ++ return 0x05; ++ case UNGT: ++ return 0x06; ++ case UNEQ: ++ return 0x18; ++ case LTGT: ++ return 0x0c; ++ case ORDERED: ++ return 0x07; ++ case UNORDERED: ++ return 0x03; ++ default: ++ gcc_unreachable (); ++ } ++} ++ ++/* Return immediate value to be used in UNSPEC_PCMP ++ for comparison CODE in MODE. 
*/ ++ ++static int ++ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode) ++{ ++ if (FLOAT_MODE_P (mode)) ++ return ix86_fp_cmp_code_to_pcmp_immediate (code); ++ return ix86_int_cmp_code_to_pcmp_immediate (code); ++} ++ ++/* Expand AVX-512 vector comparison. */ ++ ++bool ++ix86_expand_mask_vec_cmp (rtx operands[]) ++{ ++ machine_mode mask_mode = GET_MODE (operands[0]); ++ machine_mode cmp_mode = GET_MODE (operands[2]); ++ enum rtx_code code = GET_CODE (operands[1]); ++ rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode)); ++ int unspec_code; ++ rtx unspec; ++ ++ switch (code) ++ { ++ case LEU: ++ case GTU: ++ case GEU: ++ case LTU: ++ unspec_code = UNSPEC_UNSIGNED_PCMP; ++ break; ++ ++ default: ++ unspec_code = UNSPEC_PCMP; ++ } ++ ++ unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2], ++ operands[3], imm), ++ unspec_code); ++ emit_insn (gen_rtx_SET (operands[0], unspec)); ++ ++ return true; ++} ++ ++/* Expand fp vector comparison. */ ++ ++bool ++ix86_expand_fp_vec_cmp (rtx operands[]) ++{ ++ enum rtx_code code = GET_CODE (operands[1]); ++ rtx cmp; ++ ++ code = ix86_prepare_sse_fp_compare_args (operands[0], code, ++ &operands[2], &operands[3]); ++ if (code == UNKNOWN) ++ { ++ rtx temp; ++ switch (GET_CODE (operands[1])) ++ { ++ case LTGT: ++ temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2], ++ operands[3], NULL, NULL); ++ cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2], ++ operands[3], NULL, NULL); ++ code = AND; ++ break; ++ case UNEQ: ++ temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2], ++ operands[3], NULL, NULL); ++ cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2], ++ operands[3], NULL, NULL); ++ code = IOR; ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1, ++ OPTAB_DIRECT); ++ } ++ else ++ cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3], ++ operands[1], operands[2]); ++ ++ if (operands[0] != cmp) ++ emit_move_insn (operands[0], cmp); ++ ++ return true; ++} ++ ++static rtx ++ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1, ++ rtx op_true, rtx op_false, bool *negate) ++{ ++ machine_mode data_mode = GET_MODE (dest); ++ machine_mode mode = GET_MODE (cop0); ++ rtx x; ++ ++ *negate = false; ++ ++ /* XOP supports all of the comparisons on all 128-bit vector int types. */ ++ if (TARGET_XOP ++ && (mode == V16QImode || mode == V8HImode ++ || mode == V4SImode || mode == V2DImode)) ++ ; ++ else ++ { ++ /* Canonicalize the comparison to EQ, GT, GTU. */ ++ switch (code) ++ { ++ case EQ: ++ case GT: ++ case GTU: ++ break; ++ ++ case NE: ++ case LE: ++ case LEU: ++ code = reverse_condition (code); ++ *negate = true; ++ break; ++ ++ case GE: ++ case GEU: ++ code = reverse_condition (code); ++ *negate = true; ++ /* FALLTHRU */ ++ ++ case LT: ++ case LTU: ++ std::swap (cop0, cop1); ++ code = swap_condition (code); ++ break; ++ ++ default: ++ gcc_unreachable (); ++ } ++ ++ /* Only SSE4.1/SSE4.2 supports V2DImode. */ ++ if (mode == V2DImode) ++ { ++ switch (code) ++ { ++ case EQ: ++ /* SSE4.1 supports EQ. */ ++ if (!TARGET_SSE4_1) ++ return NULL; ++ break; ++ ++ case GT: ++ case GTU: ++ /* SSE4.2 supports GT/GTU. */ ++ if (!TARGET_SSE4_2) ++ return NULL; ++ break; ++ ++ default: ++ gcc_unreachable (); ++ } ++ } ++ ++ rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode); ++ rtx opfalse = op_false ? 
op_false : CONST0_RTX (data_mode); ++ if (*negate) ++ std::swap (optrue, opfalse); ++ ++ /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when ++ not using integer masks into min (x, y) == x ? -1 : 0 (i.e. ++ min (x, y) == x). While we add one instruction (the minimum), ++ we remove the need for two instructions in the negation, as the ++ result is done this way. ++ When using masks, do it for SI/DImode element types, as it is shorter ++ than the two subtractions. */ ++ if ((code != EQ ++ && GET_MODE_SIZE (mode) != 64 ++ && vector_all_ones_operand (opfalse, data_mode) ++ && optrue == CONST0_RTX (data_mode)) ++ || (code == GTU ++ && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4 ++ /* Don't do it if not using integer masks and we'd end up with ++ the right values in the registers though. */ ++ && (GET_MODE_SIZE (mode) == 64 ++ || !vector_all_ones_operand (optrue, data_mode) ++ || opfalse != CONST0_RTX (data_mode)))) ++ { ++ rtx (*gen) (rtx, rtx, rtx) = NULL; ++ ++ switch (mode) ++ { ++ case E_V16SImode: ++ gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3; ++ break; ++ case E_V8DImode: ++ gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3; ++ cop0 = force_reg (mode, cop0); ++ cop1 = force_reg (mode, cop1); ++ break; ++ case E_V32QImode: ++ if (TARGET_AVX2) ++ gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3; ++ break; ++ case E_V16HImode: ++ if (TARGET_AVX2) ++ gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3; ++ break; ++ case E_V8SImode: ++ if (TARGET_AVX2) ++ gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3; ++ break; ++ case E_V4DImode: ++ if (TARGET_AVX512VL) ++ { ++ gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3; ++ cop0 = force_reg (mode, cop0); ++ cop1 = force_reg (mode, cop1); ++ } ++ break; ++ case E_V16QImode: ++ if (code == GTU && TARGET_SSE2) ++ gen = gen_uminv16qi3; ++ else if (code == GT && TARGET_SSE4_1) ++ gen = gen_sminv16qi3; ++ break; ++ case E_V8HImode: ++ if (code == GTU && TARGET_SSE4_1) ++ gen = gen_uminv8hi3; ++ else if (code == GT && TARGET_SSE2) ++ gen = gen_sminv8hi3; ++ break; ++ case E_V4SImode: ++ if (TARGET_SSE4_1) ++ gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3; ++ break; ++ case E_V2DImode: ++ if (TARGET_AVX512VL) ++ { ++ gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3; ++ cop0 = force_reg (mode, cop0); ++ cop1 = force_reg (mode, cop1); ++ } ++ break; ++ default: ++ break; ++ } ++ ++ if (gen) ++ { ++ rtx tem = gen_reg_rtx (mode); ++ if (!vector_operand (cop0, mode)) ++ cop0 = force_reg (mode, cop0); ++ if (!vector_operand (cop1, mode)) ++ cop1 = force_reg (mode, cop1); ++ *negate = !*negate; ++ emit_insn (gen (tem, cop0, cop1)); ++ cop1 = tem; ++ code = EQ; ++ } ++ } ++ ++ /* Unsigned parallel compare is not supported by the hardware. ++ Play some tricks to turn this into a signed comparison ++ against 0. */ ++ if (code == GTU) ++ { ++ cop0 = force_reg (mode, cop0); ++ ++ switch (mode) ++ { ++ case E_V16SImode: ++ case E_V8DImode: ++ case E_V8SImode: ++ case E_V4DImode: ++ case E_V4SImode: ++ case E_V2DImode: ++ { ++ rtx t1, t2, mask; ++ rtx (*gen_sub3) (rtx, rtx, rtx); ++ ++ switch (mode) ++ { ++ case E_V16SImode: gen_sub3 = gen_subv16si3; break; ++ case E_V8DImode: gen_sub3 = gen_subv8di3; break; ++ case E_V8SImode: gen_sub3 = gen_subv8si3; break; ++ case E_V4DImode: gen_sub3 = gen_subv4di3; break; ++ case E_V4SImode: gen_sub3 = gen_subv4si3; break; ++ case E_V2DImode: gen_sub3 = gen_subv2di3; break; ++ default: ++ gcc_unreachable (); ++ } ++ /* Subtract (-(INT MAX) - 1) from both operands to make ++ them signed. 
*/ ++ mask = ix86_build_signbit_mask (mode, true, false); ++ t1 = gen_reg_rtx (mode); ++ emit_insn (gen_sub3 (t1, cop0, mask)); ++ ++ t2 = gen_reg_rtx (mode); ++ emit_insn (gen_sub3 (t2, cop1, mask)); ++ ++ cop0 = t1; ++ cop1 = t2; ++ code = GT; ++ } ++ break; ++ ++ case E_V64QImode: ++ case E_V32HImode: ++ case E_V32QImode: ++ case E_V16HImode: ++ case E_V16QImode: ++ case E_V8HImode: ++ /* Perform a parallel unsigned saturating subtraction. */ ++ x = gen_reg_rtx (mode); ++ emit_insn (gen_rtx_SET (x, gen_rtx_US_MINUS (mode, cop0, ++ cop1))); ++ ++ cop0 = x; ++ cop1 = CONST0_RTX (mode); ++ code = EQ; ++ *negate = !*negate; ++ break; ++ ++ default: ++ gcc_unreachable (); ++ } ++ } ++ } ++ ++ if (*negate) ++ std::swap (op_true, op_false); ++ ++ /* Allow the comparison to be done in one mode, but the movcc to ++ happen in another mode. */ ++ if (data_mode == mode) ++ { ++ x = ix86_expand_sse_cmp (dest, code, cop0, cop1, ++ op_true, op_false); ++ } ++ else ++ { ++ gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode)); ++ x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1, ++ op_true, op_false); ++ if (GET_MODE (x) == mode) ++ x = gen_lowpart (data_mode, x); ++ } ++ ++ return x; ++} ++ ++/* Expand integer vector comparison. */ ++ ++bool ++ix86_expand_int_vec_cmp (rtx operands[]) ++{ ++ rtx_code code = GET_CODE (operands[1]); ++ bool negate = false; ++ rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2], ++ operands[3], NULL, NULL, &negate); ++ ++ if (!cmp) ++ return false; ++ ++ if (negate) ++ cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp, ++ CONST0_RTX (GET_MODE (cmp)), ++ NULL, NULL, &negate); ++ ++ gcc_assert (!negate); ++ ++ if (operands[0] != cmp) ++ emit_move_insn (operands[0], cmp); ++ ++ return true; ++} ++ ++/* Expand a floating-point vector conditional move; a vcond operation ++ rather than a movcc operation. */ ++ ++bool ++ix86_expand_fp_vcond (rtx operands[]) ++{ ++ enum rtx_code code = GET_CODE (operands[3]); ++ rtx cmp; ++ ++ code = ix86_prepare_sse_fp_compare_args (operands[0], code, ++ &operands[4], &operands[5]); ++ if (code == UNKNOWN) ++ { ++ rtx temp; ++ switch (GET_CODE (operands[3])) ++ { ++ case LTGT: ++ temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4], ++ operands[5], operands[0], operands[0]); ++ cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4], ++ operands[5], operands[1], operands[2]); ++ code = AND; ++ break; ++ case UNEQ: ++ temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4], ++ operands[5], operands[0], operands[0]); ++ cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4], ++ operands[5], operands[1], operands[2]); ++ code = IOR; ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1, ++ OPTAB_DIRECT); ++ ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]); ++ return true; ++ } ++ ++ if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4], ++ operands[5], operands[1], operands[2])) ++ return true; ++ ++ cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5], ++ operands[1], operands[2]); ++ ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]); ++ return true; ++} ++ ++/* Expand a signed/unsigned integral vector conditional move. 
*/ ++ ++bool ++ix86_expand_int_vcond (rtx operands[]) ++{ ++ machine_mode data_mode = GET_MODE (operands[0]); ++ machine_mode mode = GET_MODE (operands[4]); ++ enum rtx_code code = GET_CODE (operands[3]); ++ bool negate = false; ++ rtx x, cop0, cop1; ++ ++ cop0 = operands[4]; ++ cop1 = operands[5]; ++ ++ /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31 ++ and x < 0 ? 1 : 0 into (unsigned) x >> 31. */ ++ if ((code == LT || code == GE) ++ && data_mode == mode ++ && cop1 == CONST0_RTX (mode) ++ && operands[1 + (code == LT)] == CONST0_RTX (data_mode) ++ && GET_MODE_UNIT_SIZE (data_mode) > 1 ++ && GET_MODE_UNIT_SIZE (data_mode) <= 8 ++ && (GET_MODE_SIZE (data_mode) == 16 ++ || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32))) ++ { ++ rtx negop = operands[2 - (code == LT)]; ++ int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1; ++ if (negop == CONST1_RTX (data_mode)) ++ { ++ rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift), ++ operands[0], 1, OPTAB_DIRECT); ++ if (res != operands[0]) ++ emit_move_insn (operands[0], res); ++ return true; ++ } ++ else if (GET_MODE_INNER (data_mode) != DImode ++ && vector_all_ones_operand (negop, data_mode)) ++ { ++ rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift), ++ operands[0], 0, OPTAB_DIRECT); ++ if (res != operands[0]) ++ emit_move_insn (operands[0], res); ++ return true; ++ } ++ } ++ ++ if (!nonimmediate_operand (cop1, mode)) ++ cop1 = force_reg (mode, cop1); ++ if (!general_operand (operands[1], data_mode)) ++ operands[1] = force_reg (data_mode, operands[1]); ++ if (!general_operand (operands[2], data_mode)) ++ operands[2] = force_reg (data_mode, operands[2]); ++ ++ x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1, ++ operands[1], operands[2], &negate); ++ ++ if (!x) ++ return false; ++ ++ ix86_expand_sse_movcc (operands[0], x, operands[1+negate], ++ operands[2-negate]); ++ return true; ++} ++ ++static bool ++ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1, ++ struct expand_vec_perm_d *d) ++{ ++ /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const ++ expander, so args are either in d, or in op0, op1 etc. */ ++ machine_mode mode = GET_MODE (d ? 
d->op0 : op0); ++ machine_mode maskmode = mode; ++ rtx (*gen) (rtx, rtx, rtx, rtx) = NULL; ++ ++ switch (mode) ++ { ++ case E_V8HImode: ++ if (TARGET_AVX512VL && TARGET_AVX512BW) ++ gen = gen_avx512vl_vpermt2varv8hi3; ++ break; ++ case E_V16HImode: ++ if (TARGET_AVX512VL && TARGET_AVX512BW) ++ gen = gen_avx512vl_vpermt2varv16hi3; ++ break; ++ case E_V64QImode: ++ if (TARGET_AVX512VBMI) ++ gen = gen_avx512bw_vpermt2varv64qi3; ++ break; ++ case E_V32HImode: ++ if (TARGET_AVX512BW) ++ gen = gen_avx512bw_vpermt2varv32hi3; ++ break; ++ case E_V4SImode: ++ if (TARGET_AVX512VL) ++ gen = gen_avx512vl_vpermt2varv4si3; ++ break; ++ case E_V8SImode: ++ if (TARGET_AVX512VL) ++ gen = gen_avx512vl_vpermt2varv8si3; ++ break; ++ case E_V16SImode: ++ if (TARGET_AVX512F) ++ gen = gen_avx512f_vpermt2varv16si3; ++ break; ++ case E_V4SFmode: ++ if (TARGET_AVX512VL) ++ { ++ gen = gen_avx512vl_vpermt2varv4sf3; ++ maskmode = V4SImode; ++ } ++ break; ++ case E_V8SFmode: ++ if (TARGET_AVX512VL) ++ { ++ gen = gen_avx512vl_vpermt2varv8sf3; ++ maskmode = V8SImode; ++ } ++ break; ++ case E_V16SFmode: ++ if (TARGET_AVX512F) ++ { ++ gen = gen_avx512f_vpermt2varv16sf3; ++ maskmode = V16SImode; ++ } ++ break; ++ case E_V2DImode: ++ if (TARGET_AVX512VL) ++ gen = gen_avx512vl_vpermt2varv2di3; ++ break; ++ case E_V4DImode: ++ if (TARGET_AVX512VL) ++ gen = gen_avx512vl_vpermt2varv4di3; ++ break; ++ case E_V8DImode: ++ if (TARGET_AVX512F) ++ gen = gen_avx512f_vpermt2varv8di3; ++ break; ++ case E_V2DFmode: ++ if (TARGET_AVX512VL) ++ { ++ gen = gen_avx512vl_vpermt2varv2df3; ++ maskmode = V2DImode; ++ } ++ break; ++ case E_V4DFmode: ++ if (TARGET_AVX512VL) ++ { ++ gen = gen_avx512vl_vpermt2varv4df3; ++ maskmode = V4DImode; ++ } ++ break; ++ case E_V8DFmode: ++ if (TARGET_AVX512F) ++ { ++ gen = gen_avx512f_vpermt2varv8df3; ++ maskmode = V8DImode; ++ } ++ break; ++ default: ++ break; ++ } ++ ++ if (gen == NULL) ++ return false; ++ ++ /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const ++ expander, so args are either in d, or in op0, op1 etc. */ ++ if (d) ++ { ++ rtx vec[64]; ++ target = d->target; ++ op0 = d->op0; ++ op1 = d->op1; ++ for (int i = 0; i < d->nelt; ++i) ++ vec[i] = GEN_INT (d->perm[i]); ++ mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec)); ++ } ++ ++ emit_insn (gen (target, force_reg (maskmode, mask), op0, op1)); ++ return true; ++} ++ ++/* Expand a variable vector permutation. */ ++ ++void ++ix86_expand_vec_perm (rtx operands[]) ++{ ++ rtx target = operands[0]; ++ rtx op0 = operands[1]; ++ rtx op1 = operands[2]; ++ rtx mask = operands[3]; ++ rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32]; ++ machine_mode mode = GET_MODE (op0); ++ machine_mode maskmode = GET_MODE (mask); ++ int w, e, i; ++ bool one_operand_shuffle = rtx_equal_p (op0, op1); ++ ++ /* Number of elements in the vector. 
*/ ++ w = GET_MODE_NUNITS (mode); ++ e = GET_MODE_UNIT_SIZE (mode); ++ gcc_assert (w <= 64); ++ ++ if (TARGET_AVX512F && one_operand_shuffle) ++ { ++ rtx (*gen) (rtx, rtx, rtx) = NULL; ++ switch (mode) ++ { ++ case E_V16SImode: ++ gen =gen_avx512f_permvarv16si; ++ break; ++ case E_V16SFmode: ++ gen = gen_avx512f_permvarv16sf; ++ break; ++ case E_V8DImode: ++ gen = gen_avx512f_permvarv8di; ++ break; ++ case E_V8DFmode: ++ gen = gen_avx512f_permvarv8df; ++ break; ++ default: ++ break; ++ } ++ if (gen != NULL) ++ { ++ emit_insn (gen (target, op0, mask)); ++ return; ++ } ++ } ++ ++ if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL)) ++ return; ++ ++ if (TARGET_AVX2) ++ { ++ if (mode == V4DImode || mode == V4DFmode || mode == V16HImode) ++ { ++ /* Unfortunately, the VPERMQ and VPERMPD instructions only support ++ an constant shuffle operand. With a tiny bit of effort we can ++ use VPERMD instead. A re-interpretation stall for V4DFmode is ++ unfortunate but there's no avoiding it. ++ Similarly for V16HImode we don't have instructions for variable ++ shuffling, while for V32QImode we can use after preparing suitable ++ masks vpshufb; vpshufb; vpermq; vpor. */ ++ ++ if (mode == V16HImode) ++ { ++ maskmode = mode = V32QImode; ++ w = 32; ++ e = 1; ++ } ++ else ++ { ++ maskmode = mode = V8SImode; ++ w = 8; ++ e = 4; ++ } ++ t1 = gen_reg_rtx (maskmode); ++ ++ /* Replicate the low bits of the V4DImode mask into V8SImode: ++ mask = { A B C D } ++ t1 = { A A B B C C D D }. */ ++ for (i = 0; i < w / 2; ++i) ++ vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2); ++ vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec)); ++ vt = force_reg (maskmode, vt); ++ mask = gen_lowpart (maskmode, mask); ++ if (maskmode == V8SImode) ++ emit_insn (gen_avx2_permvarv8si (t1, mask, vt)); ++ else ++ emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt)); ++ ++ /* Multiply the shuffle indicies by two. */ ++ t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1, ++ OPTAB_DIRECT); ++ ++ /* Add one to the odd shuffle indicies: ++ t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */ ++ for (i = 0; i < w / 2; ++i) ++ { ++ vec[i * 2] = const0_rtx; ++ vec[i * 2 + 1] = const1_rtx; ++ } ++ vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec)); ++ vt = validize_mem (force_const_mem (maskmode, vt)); ++ t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1, ++ OPTAB_DIRECT); ++ ++ /* Continue as if V8SImode (resp. V32QImode) was used initially. */ ++ operands[3] = mask = t1; ++ target = gen_reg_rtx (mode); ++ op0 = gen_lowpart (mode, op0); ++ op1 = gen_lowpart (mode, op1); ++ } ++ ++ switch (mode) ++ { ++ case E_V8SImode: ++ /* The VPERMD and VPERMPS instructions already properly ignore ++ the high bits of the shuffle elements. No need for us to ++ perform an AND ourselves. 
*/ ++ if (one_operand_shuffle) ++ { ++ emit_insn (gen_avx2_permvarv8si (target, op0, mask)); ++ if (target != operands[0]) ++ emit_move_insn (operands[0], ++ gen_lowpart (GET_MODE (operands[0]), target)); ++ } ++ else ++ { ++ t1 = gen_reg_rtx (V8SImode); ++ t2 = gen_reg_rtx (V8SImode); ++ emit_insn (gen_avx2_permvarv8si (t1, op0, mask)); ++ emit_insn (gen_avx2_permvarv8si (t2, op1, mask)); ++ goto merge_two; ++ } ++ return; ++ ++ case E_V8SFmode: ++ mask = gen_lowpart (V8SImode, mask); ++ if (one_operand_shuffle) ++ emit_insn (gen_avx2_permvarv8sf (target, op0, mask)); ++ else ++ { ++ t1 = gen_reg_rtx (V8SFmode); ++ t2 = gen_reg_rtx (V8SFmode); ++ emit_insn (gen_avx2_permvarv8sf (t1, op0, mask)); ++ emit_insn (gen_avx2_permvarv8sf (t2, op1, mask)); ++ goto merge_two; ++ } ++ return; ++ ++ case E_V4SImode: ++ /* By combining the two 128-bit input vectors into one 256-bit ++ input vector, we can use VPERMD and VPERMPS for the full ++ two-operand shuffle. */ ++ t1 = gen_reg_rtx (V8SImode); ++ t2 = gen_reg_rtx (V8SImode); ++ emit_insn (gen_avx_vec_concatv8si (t1, op0, op1)); ++ emit_insn (gen_avx_vec_concatv8si (t2, mask, mask)); ++ emit_insn (gen_avx2_permvarv8si (t1, t1, t2)); ++ emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx)); ++ return; ++ ++ case E_V4SFmode: ++ t1 = gen_reg_rtx (V8SFmode); ++ t2 = gen_reg_rtx (V8SImode); ++ mask = gen_lowpart (V4SImode, mask); ++ emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1)); ++ emit_insn (gen_avx_vec_concatv8si (t2, mask, mask)); ++ emit_insn (gen_avx2_permvarv8sf (t1, t1, t2)); ++ emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx)); ++ return; ++ ++ case E_V32QImode: ++ t1 = gen_reg_rtx (V32QImode); ++ t2 = gen_reg_rtx (V32QImode); ++ t3 = gen_reg_rtx (V32QImode); ++ vt2 = GEN_INT (-128); ++ vt = gen_const_vec_duplicate (V32QImode, vt2); ++ vt = force_reg (V32QImode, vt); ++ for (i = 0; i < 32; i++) ++ vec[i] = i < 16 ? vt2 : const0_rtx; ++ vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec)); ++ vt2 = force_reg (V32QImode, vt2); ++ /* From mask create two adjusted masks, which contain the same ++ bits as mask in the low 7 bits of each vector element. ++ The first mask will have the most significant bit clear ++ if it requests element from the same 128-bit lane ++ and MSB set if it requests element from the other 128-bit lane. ++ The second mask will have the opposite values of the MSB, ++ and additionally will have its 128-bit lanes swapped. ++ E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have ++ t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and ++ t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ... ++ stands for other 12 bytes. */ ++ /* The bit whether element is from the same lane or the other ++ lane is bit 4, so shift it up by 3 to the MSB position. */ ++ t5 = gen_reg_rtx (V4DImode); ++ emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask), ++ GEN_INT (3))); ++ /* Clear MSB bits from the mask just in case it had them set. */ ++ emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask)); ++ /* After this t1 will have MSB set for elements from other lane. */ ++ emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2)); ++ /* Clear bits other than MSB. */ ++ emit_insn (gen_andv32qi3 (t1, t1, vt)); ++ /* Or in the lower bits from mask into t3. */ ++ emit_insn (gen_iorv32qi3 (t3, t1, t2)); ++ /* And invert MSB bits in t1, so MSB is set for elements from the same ++ lane. */ ++ emit_insn (gen_xorv32qi3 (t1, t1, vt)); ++ /* Swap 128-bit lanes in t3. 
*/ ++ t6 = gen_reg_rtx (V4DImode); ++ emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3), ++ const2_rtx, GEN_INT (3), ++ const0_rtx, const1_rtx)); ++ /* And or in the lower bits from mask into t1. */ ++ emit_insn (gen_iorv32qi3 (t1, t1, t2)); ++ if (one_operand_shuffle) ++ { ++ /* Each of these shuffles will put 0s in places where ++ element from the other 128-bit lane is needed, otherwise ++ will shuffle in the requested value. */ ++ emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, ++ gen_lowpart (V32QImode, t6))); ++ emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1)); ++ /* For t3 the 128-bit lanes are swapped again. */ ++ t7 = gen_reg_rtx (V4DImode); ++ emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3), ++ const2_rtx, GEN_INT (3), ++ const0_rtx, const1_rtx)); ++ /* And oring both together leads to the result. */ ++ emit_insn (gen_iorv32qi3 (target, t1, ++ gen_lowpart (V32QImode, t7))); ++ if (target != operands[0]) ++ emit_move_insn (operands[0], ++ gen_lowpart (GET_MODE (operands[0]), target)); ++ return; ++ } ++ ++ t4 = gen_reg_rtx (V32QImode); ++ /* Similarly to the above one_operand_shuffle code, ++ just for repeated twice for each operand. merge_two: ++ code will merge the two results together. */ ++ emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, ++ gen_lowpart (V32QImode, t6))); ++ emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, ++ gen_lowpart (V32QImode, t6))); ++ emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1)); ++ emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1)); ++ t7 = gen_reg_rtx (V4DImode); ++ emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4), ++ const2_rtx, GEN_INT (3), ++ const0_rtx, const1_rtx)); ++ t8 = gen_reg_rtx (V4DImode); ++ emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3), ++ const2_rtx, GEN_INT (3), ++ const0_rtx, const1_rtx)); ++ emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7))); ++ emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8))); ++ t1 = t4; ++ t2 = t3; ++ goto merge_two; ++ ++ default: ++ gcc_assert (GET_MODE_SIZE (mode) <= 16); ++ break; ++ } ++ } ++ ++ if (TARGET_XOP) ++ { ++ /* The XOP VPPERM insn supports three inputs. By ignoring the ++ one_operand_shuffle special case, we avoid creating another ++ set of constant vectors in memory. */ ++ one_operand_shuffle = false; ++ ++ /* mask = mask & {2*w-1, ...} */ ++ vt = GEN_INT (2*w - 1); ++ } ++ else ++ { ++ /* mask = mask & {w-1, ...} */ ++ vt = GEN_INT (w - 1); ++ } ++ ++ vt = gen_const_vec_duplicate (maskmode, vt); ++ mask = expand_simple_binop (maskmode, AND, mask, vt, ++ NULL_RTX, 0, OPTAB_DIRECT); ++ ++ /* For non-QImode operations, convert the word permutation control ++ into a byte permutation control. */ ++ if (mode != V16QImode) ++ { ++ mask = expand_simple_binop (maskmode, ASHIFT, mask, ++ GEN_INT (exact_log2 (e)), ++ NULL_RTX, 0, OPTAB_DIRECT); ++ ++ /* Convert mask to vector of chars. */ ++ mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask)); ++ ++ /* Replicate each of the input bytes into byte positions: ++ (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8} ++ (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12} ++ (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. 
*/ ++ for (i = 0; i < 16; ++i) ++ vec[i] = GEN_INT (i/e * e); ++ vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec)); ++ vt = validize_mem (force_const_mem (V16QImode, vt)); ++ if (TARGET_XOP) ++ emit_insn (gen_xop_pperm (mask, mask, mask, vt)); ++ else ++ emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt)); ++ ++ /* Convert it into the byte positions by doing ++ mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */ ++ for (i = 0; i < 16; ++i) ++ vec[i] = GEN_INT (i % e); ++ vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec)); ++ vt = validize_mem (force_const_mem (V16QImode, vt)); ++ emit_insn (gen_addv16qi3 (mask, mask, vt)); ++ } ++ ++ /* The actual shuffle operations all operate on V16QImode. */ ++ op0 = gen_lowpart (V16QImode, op0); ++ op1 = gen_lowpart (V16QImode, op1); ++ ++ if (TARGET_XOP) ++ { ++ if (GET_MODE (target) != V16QImode) ++ target = gen_reg_rtx (V16QImode); ++ emit_insn (gen_xop_pperm (target, op0, op1, mask)); ++ if (target != operands[0]) ++ emit_move_insn (operands[0], ++ gen_lowpart (GET_MODE (operands[0]), target)); ++ } ++ else if (one_operand_shuffle) ++ { ++ if (GET_MODE (target) != V16QImode) ++ target = gen_reg_rtx (V16QImode); ++ emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask)); ++ if (target != operands[0]) ++ emit_move_insn (operands[0], ++ gen_lowpart (GET_MODE (operands[0]), target)); ++ } ++ else ++ { ++ rtx xops[6]; ++ bool ok; ++ ++ /* Shuffle the two input vectors independently. */ ++ t1 = gen_reg_rtx (V16QImode); ++ t2 = gen_reg_rtx (V16QImode); ++ emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask)); ++ emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask)); ++ ++ merge_two: ++ /* Then merge them together. The key is whether any given control ++ element contained a bit set that indicates the second word. */ ++ mask = operands[3]; ++ vt = GEN_INT (w); ++ if (maskmode == V2DImode && !TARGET_SSE4_1) ++ { ++ /* Without SSE4.1, we don't have V2DImode EQ. Perform one ++ more shuffle to convert the V2DI input mask into a V4SI ++ input mask. At which point the masking that expand_int_vcond ++ will work as desired. */ ++ rtx t3 = gen_reg_rtx (V4SImode); ++ emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask), ++ const0_rtx, const0_rtx, ++ const2_rtx, const2_rtx)); ++ mask = t3; ++ maskmode = V4SImode; ++ e = w = 4; ++ } ++ ++ vt = gen_const_vec_duplicate (maskmode, vt); ++ vt = force_reg (maskmode, vt); ++ mask = expand_simple_binop (maskmode, AND, mask, vt, ++ NULL_RTX, 0, OPTAB_DIRECT); ++ ++ if (GET_MODE (target) != mode) ++ target = gen_reg_rtx (mode); ++ xops[0] = target; ++ xops[1] = gen_lowpart (mode, t2); ++ xops[2] = gen_lowpart (mode, t1); ++ xops[3] = gen_rtx_EQ (maskmode, mask, vt); ++ xops[4] = mask; ++ xops[5] = vt; ++ ok = ix86_expand_int_vcond (xops); ++ gcc_assert (ok); ++ if (target != operands[0]) ++ emit_move_insn (operands[0], ++ gen_lowpart (GET_MODE (operands[0]), target)); ++ } ++} ++ ++/* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is ++ true if we should do zero extension, else sign extension. HIGH_P is ++ true if we want the N/2 high elements, else the low elements. 
*/ ++ ++void ++ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p) ++{ ++ machine_mode imode = GET_MODE (src); ++ rtx tmp; ++ ++ if (TARGET_SSE4_1) ++ { ++ rtx (*unpack)(rtx, rtx); ++ rtx (*extract)(rtx, rtx) = NULL; ++ machine_mode halfmode = BLKmode; ++ ++ switch (imode) ++ { ++ case E_V64QImode: ++ if (unsigned_p) ++ unpack = gen_avx512bw_zero_extendv32qiv32hi2; ++ else ++ unpack = gen_avx512bw_sign_extendv32qiv32hi2; ++ halfmode = V32QImode; ++ extract ++ = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi; ++ break; ++ case E_V32QImode: ++ if (unsigned_p) ++ unpack = gen_avx2_zero_extendv16qiv16hi2; ++ else ++ unpack = gen_avx2_sign_extendv16qiv16hi2; ++ halfmode = V16QImode; ++ extract ++ = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi; ++ break; ++ case E_V32HImode: ++ if (unsigned_p) ++ unpack = gen_avx512f_zero_extendv16hiv16si2; ++ else ++ unpack = gen_avx512f_sign_extendv16hiv16si2; ++ halfmode = V16HImode; ++ extract ++ = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi; ++ break; ++ case E_V16HImode: ++ if (unsigned_p) ++ unpack = gen_avx2_zero_extendv8hiv8si2; ++ else ++ unpack = gen_avx2_sign_extendv8hiv8si2; ++ halfmode = V8HImode; ++ extract ++ = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi; ++ break; ++ case E_V16SImode: ++ if (unsigned_p) ++ unpack = gen_avx512f_zero_extendv8siv8di2; ++ else ++ unpack = gen_avx512f_sign_extendv8siv8di2; ++ halfmode = V8SImode; ++ extract ++ = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si; ++ break; ++ case E_V8SImode: ++ if (unsigned_p) ++ unpack = gen_avx2_zero_extendv4siv4di2; ++ else ++ unpack = gen_avx2_sign_extendv4siv4di2; ++ halfmode = V4SImode; ++ extract ++ = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si; ++ break; ++ case E_V16QImode: ++ if (unsigned_p) ++ unpack = gen_sse4_1_zero_extendv8qiv8hi2; ++ else ++ unpack = gen_sse4_1_sign_extendv8qiv8hi2; ++ break; ++ case E_V8HImode: ++ if (unsigned_p) ++ unpack = gen_sse4_1_zero_extendv4hiv4si2; ++ else ++ unpack = gen_sse4_1_sign_extendv4hiv4si2; ++ break; ++ case E_V4SImode: ++ if (unsigned_p) ++ unpack = gen_sse4_1_zero_extendv2siv2di2; ++ else ++ unpack = gen_sse4_1_sign_extendv2siv2di2; ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ ++ if (GET_MODE_SIZE (imode) >= 32) ++ { ++ tmp = gen_reg_rtx (halfmode); ++ emit_insn (extract (tmp, src)); ++ } ++ else if (high_p) ++ { ++ /* Shift higher 8 bytes to lower 8 bytes. 
*/ ++ tmp = gen_reg_rtx (V1TImode); ++ emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src), ++ GEN_INT (64))); ++ tmp = gen_lowpart (imode, tmp); ++ } ++ else ++ tmp = src; ++ ++ emit_insn (unpack (dest, tmp)); ++ } ++ else ++ { ++ rtx (*unpack)(rtx, rtx, rtx); ++ ++ switch (imode) ++ { ++ case E_V16QImode: ++ if (high_p) ++ unpack = gen_vec_interleave_highv16qi; ++ else ++ unpack = gen_vec_interleave_lowv16qi; ++ break; ++ case E_V8HImode: ++ if (high_p) ++ unpack = gen_vec_interleave_highv8hi; ++ else ++ unpack = gen_vec_interleave_lowv8hi; ++ break; ++ case E_V4SImode: ++ if (high_p) ++ unpack = gen_vec_interleave_highv4si; ++ else ++ unpack = gen_vec_interleave_lowv4si; ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ ++ if (unsigned_p) ++ tmp = force_reg (imode, CONST0_RTX (imode)); ++ else ++ tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode), ++ src, pc_rtx, pc_rtx); ++ ++ rtx tmp2 = gen_reg_rtx (imode); ++ emit_insn (unpack (tmp2, src, tmp)); ++ emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2)); ++ } ++} ++ ++/* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode, ++ but works for floating pointer parameters and nonoffsetable memories. ++ For pushes, it returns just stack offsets; the values will be saved ++ in the right order. Maximally three parts are generated. */ ++ ++static int ++ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode) ++{ ++ int size; ++ ++ if (!TARGET_64BIT) ++ size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4; ++ else ++ size = (GET_MODE_SIZE (mode) + 4) / 8; ++ ++ gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand))); ++ gcc_assert (size >= 2 && size <= 4); ++ ++ /* Optimize constant pool reference to immediates. This is used by fp ++ moves, that force all constants to memory to allow combining. */ ++ if (MEM_P (operand) && MEM_READONLY_P (operand)) ++ operand = avoid_constant_pool_reference (operand); ++ ++ if (MEM_P (operand) && !offsettable_memref_p (operand)) ++ { ++ /* The only non-offsetable memories we handle are pushes. */ ++ int ok = push_operand (operand, VOIDmode); ++ ++ gcc_assert (ok); ++ ++ operand = copy_rtx (operand); ++ PUT_MODE (operand, word_mode); ++ parts[0] = parts[1] = parts[2] = parts[3] = operand; ++ return size; ++ } ++ ++ if (GET_CODE (operand) == CONST_VECTOR) ++ { ++ scalar_int_mode imode = int_mode_for_mode (mode).require (); ++ /* Caution: if we looked through a constant pool memory above, ++ the operand may actually have a different mode now. That's ++ ok, since we want to pun this all the way back to an integer. 
*/ ++ operand = simplify_subreg (imode, operand, GET_MODE (operand), 0); ++ gcc_assert (operand != NULL); ++ mode = imode; ++ } ++ ++ if (!TARGET_64BIT) ++ { ++ if (mode == DImode) ++ split_double_mode (mode, &operand, 1, &parts[0], &parts[1]); ++ else ++ { ++ int i; ++ ++ if (REG_P (operand)) ++ { ++ gcc_assert (reload_completed); ++ for (i = 0; i < size; i++) ++ parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i); ++ } ++ else if (offsettable_memref_p (operand)) ++ { ++ operand = adjust_address (operand, SImode, 0); ++ parts[0] = operand; ++ for (i = 1; i < size; i++) ++ parts[i] = adjust_address (operand, SImode, 4 * i); ++ } ++ else if (CONST_DOUBLE_P (operand)) ++ { ++ const REAL_VALUE_TYPE *r; ++ long l[4]; ++ ++ r = CONST_DOUBLE_REAL_VALUE (operand); ++ switch (mode) ++ { ++ case E_TFmode: ++ real_to_target (l, r, mode); ++ parts[3] = gen_int_mode (l[3], SImode); ++ parts[2] = gen_int_mode (l[2], SImode); ++ break; ++ case E_XFmode: ++ /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since ++ long double may not be 80-bit. */ ++ real_to_target (l, r, mode); ++ parts[2] = gen_int_mode (l[2], SImode); ++ break; ++ case E_DFmode: ++ REAL_VALUE_TO_TARGET_DOUBLE (*r, l); ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ parts[1] = gen_int_mode (l[1], SImode); ++ parts[0] = gen_int_mode (l[0], SImode); ++ } ++ else ++ gcc_unreachable (); ++ } ++ } ++ else ++ { ++ if (mode == TImode) ++ split_double_mode (mode, &operand, 1, &parts[0], &parts[1]); ++ if (mode == XFmode || mode == TFmode) ++ { ++ machine_mode upper_mode = mode==XFmode ? SImode : DImode; ++ if (REG_P (operand)) ++ { ++ gcc_assert (reload_completed); ++ parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0); ++ parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1); ++ } ++ else if (offsettable_memref_p (operand)) ++ { ++ operand = adjust_address (operand, DImode, 0); ++ parts[0] = operand; ++ parts[1] = adjust_address (operand, upper_mode, 8); ++ } ++ else if (CONST_DOUBLE_P (operand)) ++ { ++ long l[4]; ++ ++ real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode); ++ ++ /* real_to_target puts 32-bit pieces in each long. */ ++ parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff)) ++ | ((l[1] & HOST_WIDE_INT_C (0xffffffff)) ++ << 32), DImode); ++ ++ if (upper_mode == SImode) ++ parts[1] = gen_int_mode (l[2], SImode); ++ else ++ parts[1] ++ = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff)) ++ | ((l[3] & HOST_WIDE_INT_C (0xffffffff)) ++ << 32), DImode); ++ } ++ else ++ gcc_unreachable (); ++ } ++ } ++ ++ return size; ++} ++ ++/* Emit insns to perform a move or push of DI, DF, XF, and TF values. ++ Return false when normal moves are needed; true when all required ++ insns have been emitted. Operands 2-4 contain the input values ++ int the correct order; operands 5-7 contain the output values. */ ++ ++void ++ix86_split_long_move (rtx operands[]) ++{ ++ rtx part[2][4]; ++ int nparts, i, j; ++ int push = 0; ++ int collisions = 0; ++ machine_mode mode = GET_MODE (operands[0]); ++ bool collisionparts[4]; ++ ++ /* The DFmode expanders may ask us to move double. ++ For 64bit target this is single move. By hiding the fact ++ here we simplify i386.md splitters. */ ++ if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8) ++ { ++ /* Optimize constant pool reference to immediates. This is used by ++ fp moves, that force all constants to memory to allow combining. 
*/ ++ ++ if (MEM_P (operands[1]) ++ && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF ++ && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0))) ++ operands[1] = get_pool_constant (XEXP (operands[1], 0)); ++ if (push_operand (operands[0], VOIDmode)) ++ { ++ operands[0] = copy_rtx (operands[0]); ++ PUT_MODE (operands[0], word_mode); ++ } ++ else ++ operands[0] = gen_lowpart (DImode, operands[0]); ++ operands[1] = gen_lowpart (DImode, operands[1]); ++ emit_move_insn (operands[0], operands[1]); ++ return; ++ } ++ ++ /* The only non-offsettable memory we handle is push. */ ++ if (push_operand (operands[0], VOIDmode)) ++ push = 1; ++ else ++ gcc_assert (!MEM_P (operands[0]) ++ || offsettable_memref_p (operands[0])); ++ ++ nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0])); ++ ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0])); ++ ++ /* When emitting push, take care for source operands on the stack. */ ++ if (push && MEM_P (operands[1]) ++ && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1])) ++ { ++ rtx src_base = XEXP (part[1][nparts - 1], 0); ++ ++ /* Compensate for the stack decrement by 4. */ ++ if (!TARGET_64BIT && nparts == 3 ++ && mode == XFmode && TARGET_128BIT_LONG_DOUBLE) ++ src_base = plus_constant (Pmode, src_base, 4); ++ ++ /* src_base refers to the stack pointer and is ++ automatically decreased by emitted push. */ ++ for (i = 0; i < nparts; i++) ++ part[1][i] = change_address (part[1][i], ++ GET_MODE (part[1][i]), src_base); ++ } ++ ++ /* We need to do copy in the right order in case an address register ++ of the source overlaps the destination. */ ++ if (REG_P (part[0][0]) && MEM_P (part[1][0])) ++ { ++ rtx tmp; ++ ++ for (i = 0; i < nparts; i++) ++ { ++ collisionparts[i] ++ = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0)); ++ if (collisionparts[i]) ++ collisions++; ++ } ++ ++ /* Collision in the middle part can be handled by reordering. */ ++ if (collisions == 1 && nparts == 3 && collisionparts [1]) ++ { ++ std::swap (part[0][1], part[0][2]); ++ std::swap (part[1][1], part[1][2]); ++ } ++ else if (collisions == 1 ++ && nparts == 4 ++ && (collisionparts [1] || collisionparts [2])) ++ { ++ if (collisionparts [1]) ++ { ++ std::swap (part[0][1], part[0][2]); ++ std::swap (part[1][1], part[1][2]); ++ } ++ else ++ { ++ std::swap (part[0][2], part[0][3]); ++ std::swap (part[1][2], part[1][3]); ++ } ++ } ++ ++ /* If there are more collisions, we can't handle it by reordering. ++ Do an lea to the last part and use only one colliding move. */ ++ else if (collisions > 1) ++ { ++ rtx base, addr; ++ ++ collisions = 1; ++ ++ base = part[0][nparts - 1]; ++ ++ /* Handle the case when the last part isn't valid for lea. ++ Happens in 64-bit mode storing the 12-byte XFmode. */ ++ if (GET_MODE (base) != Pmode) ++ base = gen_rtx_REG (Pmode, REGNO (base)); ++ ++ addr = XEXP (part[1][0], 0); ++ if (TARGET_TLS_DIRECT_SEG_REFS) ++ { ++ struct ix86_address parts; ++ int ok = ix86_decompose_address (addr, &parts); ++ gcc_assert (ok); ++ /* It is not valid to use %gs: or %fs: in lea. 
*/ ++ gcc_assert (parts.seg == ADDR_SPACE_GENERIC); ++ } ++ emit_insn (gen_rtx_SET (base, addr)); ++ part[1][0] = replace_equiv_address (part[1][0], base); ++ for (i = 1; i < nparts; i++) ++ { ++ tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i); ++ part[1][i] = replace_equiv_address (part[1][i], tmp); ++ } ++ } ++ } ++ ++ if (push) ++ { ++ if (!TARGET_64BIT) ++ { ++ if (nparts == 3) ++ { ++ if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode) ++ emit_insn (ix86_gen_add3 (stack_pointer_rtx, ++ stack_pointer_rtx, GEN_INT (-4))); ++ emit_move_insn (part[0][2], part[1][2]); ++ } ++ else if (nparts == 4) ++ { ++ emit_move_insn (part[0][3], part[1][3]); ++ emit_move_insn (part[0][2], part[1][2]); ++ } ++ } ++ else ++ { ++ /* In 64bit mode we don't have 32bit push available. In case this is ++ register, it is OK - we will just use larger counterpart. We also ++ retype memory - these comes from attempt to avoid REX prefix on ++ moving of second half of TFmode value. */ ++ if (GET_MODE (part[1][1]) == SImode) ++ { ++ switch (GET_CODE (part[1][1])) ++ { ++ case MEM: ++ part[1][1] = adjust_address (part[1][1], DImode, 0); ++ break; ++ ++ case REG: ++ part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1])); ++ break; ++ ++ default: ++ gcc_unreachable (); ++ } ++ ++ if (GET_MODE (part[1][0]) == SImode) ++ part[1][0] = part[1][1]; ++ } ++ } ++ emit_move_insn (part[0][1], part[1][1]); ++ emit_move_insn (part[0][0], part[1][0]); ++ return; ++ } ++ ++ /* Choose correct order to not overwrite the source before it is copied. */ ++ if ((REG_P (part[0][0]) ++ && REG_P (part[1][1]) ++ && (REGNO (part[0][0]) == REGNO (part[1][1]) ++ || (nparts == 3 ++ && REGNO (part[0][0]) == REGNO (part[1][2])) ++ || (nparts == 4 ++ && REGNO (part[0][0]) == REGNO (part[1][3])))) ++ || (collisions > 0 ++ && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0)))) ++ { ++ for (i = 0, j = nparts - 1; i < nparts; i++, j--) ++ { ++ operands[2 + i] = part[0][j]; ++ operands[6 + i] = part[1][j]; ++ } ++ } ++ else ++ { ++ for (i = 0; i < nparts; i++) ++ { ++ operands[2 + i] = part[0][i]; ++ operands[6 + i] = part[1][i]; ++ } ++ } ++ ++ /* If optimizing for size, attempt to locally unCSE nonzero constants. */ ++ if (optimize_insn_for_size_p ()) ++ { ++ for (j = 0; j < nparts - 1; j++) ++ if (CONST_INT_P (operands[6 + j]) ++ && operands[6 + j] != const0_rtx ++ && REG_P (operands[2 + j])) ++ for (i = j; i < nparts - 1; i++) ++ if (CONST_INT_P (operands[7 + i]) ++ && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j])) ++ operands[7 + i] = operands[2 + j]; ++ } ++ ++ for (i = 0; i < nparts; i++) ++ emit_move_insn (operands[2 + i], operands[6 + i]); ++ ++ return; ++} ++ ++/* Helper function of ix86_split_ashl used to generate an SImode/DImode ++ left shift by a constant, either using a single shift or ++ a sequence of add instructions. */ ++ ++static void ++ix86_expand_ashl_const (rtx operand, int count, machine_mode mode) ++{ ++ rtx (*insn)(rtx, rtx, rtx); ++ ++ if (count == 1 ++ || (count * ix86_cost->add <= ix86_cost->shift_const ++ && !optimize_insn_for_size_p ())) ++ { ++ insn = mode == DImode ? gen_addsi3 : gen_adddi3; ++ while (count-- > 0) ++ emit_insn (insn (operand, operand, operand)); ++ } ++ else ++ { ++ insn = mode == DImode ? 
gen_ashlsi3 : gen_ashldi3; ++ emit_insn (insn (operand, operand, GEN_INT (count))); ++ } ++} ++ ++void ++ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode) ++{ ++ rtx (*gen_ashl3)(rtx, rtx, rtx); ++ rtx (*gen_shld)(rtx, rtx, rtx); ++ int half_width = GET_MODE_BITSIZE (mode) >> 1; ++ ++ rtx low[2], high[2]; ++ int count; ++ ++ if (CONST_INT_P (operands[2])) ++ { ++ split_double_mode (mode, operands, 2, low, high); ++ count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1); ++ ++ if (count >= half_width) ++ { ++ emit_move_insn (high[0], low[1]); ++ emit_move_insn (low[0], const0_rtx); ++ ++ if (count > half_width) ++ ix86_expand_ashl_const (high[0], count - half_width, mode); ++ } ++ else ++ { ++ gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld; ++ ++ if (!rtx_equal_p (operands[0], operands[1])) ++ emit_move_insn (operands[0], operands[1]); ++ ++ emit_insn (gen_shld (high[0], low[0], GEN_INT (count))); ++ ix86_expand_ashl_const (low[0], count, mode); ++ } ++ return; ++ } ++ ++ split_double_mode (mode, operands, 1, low, high); ++ ++ gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3; ++ ++ if (operands[1] == const1_rtx) ++ { ++ /* Assuming we've chosen a QImode capable registers, then 1 << N ++ can be done with two 32/64-bit shifts, no branches, no cmoves. */ ++ if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0])) ++ { ++ rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG); ++ ++ ix86_expand_clear (low[0]); ++ ix86_expand_clear (high[0]); ++ emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width))); ++ ++ d = gen_lowpart (QImode, low[0]); ++ d = gen_rtx_STRICT_LOW_PART (VOIDmode, d); ++ s = gen_rtx_EQ (QImode, flags, const0_rtx); ++ emit_insn (gen_rtx_SET (d, s)); ++ ++ d = gen_lowpart (QImode, high[0]); ++ d = gen_rtx_STRICT_LOW_PART (VOIDmode, d); ++ s = gen_rtx_NE (QImode, flags, const0_rtx); ++ emit_insn (gen_rtx_SET (d, s)); ++ } ++ ++ /* Otherwise, we can get the same results by manually performing ++ a bit extract operation on bit 5/6, and then performing the two ++ shifts. The two methods of getting 0/1 into low/high are exactly ++ the same size. Avoiding the shift in the bit extract case helps ++ pentium4 a bit; no one else seems to care much either way. */ ++ else ++ { ++ machine_mode half_mode; ++ rtx (*gen_lshr3)(rtx, rtx, rtx); ++ rtx (*gen_and3)(rtx, rtx, rtx); ++ rtx (*gen_xor3)(rtx, rtx, rtx); ++ HOST_WIDE_INT bits; ++ rtx x; ++ ++ if (mode == DImode) ++ { ++ half_mode = SImode; ++ gen_lshr3 = gen_lshrsi3; ++ gen_and3 = gen_andsi3; ++ gen_xor3 = gen_xorsi3; ++ bits = 5; ++ } ++ else ++ { ++ half_mode = DImode; ++ gen_lshr3 = gen_lshrdi3; ++ gen_and3 = gen_anddi3; ++ gen_xor3 = gen_xordi3; ++ bits = 6; ++ } ++ ++ if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ()) ++ x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]); ++ else ++ x = gen_lowpart (half_mode, operands[2]); ++ emit_insn (gen_rtx_SET (high[0], x)); ++ ++ emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits))); ++ emit_insn (gen_and3 (high[0], high[0], const1_rtx)); ++ emit_move_insn (low[0], high[0]); ++ emit_insn (gen_xor3 (low[0], low[0], const1_rtx)); ++ } ++ ++ emit_insn (gen_ashl3 (low[0], low[0], operands[2])); ++ emit_insn (gen_ashl3 (high[0], high[0], operands[2])); ++ return; ++ } ++ ++ if (operands[1] == constm1_rtx) ++ { ++ /* For -1 << N, we can avoid the shld instruction, because we ++ know that we're shifting 0...31/63 ones into a -1. 
*/ ++ emit_move_insn (low[0], constm1_rtx); ++ if (optimize_insn_for_size_p ()) ++ emit_move_insn (high[0], low[0]); ++ else ++ emit_move_insn (high[0], constm1_rtx); ++ } ++ else ++ { ++ gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld; ++ ++ if (!rtx_equal_p (operands[0], operands[1])) ++ emit_move_insn (operands[0], operands[1]); ++ ++ split_double_mode (mode, operands, 1, low, high); ++ emit_insn (gen_shld (high[0], low[0], operands[2])); ++ } ++ ++ emit_insn (gen_ashl3 (low[0], low[0], operands[2])); ++ ++ if (TARGET_CMOVE && scratch) ++ { ++ rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx) ++ = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1; ++ ++ ix86_expand_clear (scratch); ++ emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch)); ++ } ++ else ++ { ++ rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx) ++ = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2; ++ ++ emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2])); ++ } ++} ++ ++void ++ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode) ++{ ++ rtx (*gen_ashr3)(rtx, rtx, rtx) ++ = mode == DImode ? gen_ashrsi3 : gen_ashrdi3; ++ rtx (*gen_shrd)(rtx, rtx, rtx); ++ int half_width = GET_MODE_BITSIZE (mode) >> 1; ++ ++ rtx low[2], high[2]; ++ int count; ++ ++ if (CONST_INT_P (operands[2])) ++ { ++ split_double_mode (mode, operands, 2, low, high); ++ count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1); ++ ++ if (count == GET_MODE_BITSIZE (mode) - 1) ++ { ++ emit_move_insn (high[0], high[1]); ++ emit_insn (gen_ashr3 (high[0], high[0], ++ GEN_INT (half_width - 1))); ++ emit_move_insn (low[0], high[0]); ++ ++ } ++ else if (count >= half_width) ++ { ++ emit_move_insn (low[0], high[1]); ++ emit_move_insn (high[0], low[0]); ++ emit_insn (gen_ashr3 (high[0], high[0], ++ GEN_INT (half_width - 1))); ++ ++ if (count > half_width) ++ emit_insn (gen_ashr3 (low[0], low[0], ++ GEN_INT (count - half_width))); ++ } ++ else ++ { ++ gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd; ++ ++ if (!rtx_equal_p (operands[0], operands[1])) ++ emit_move_insn (operands[0], operands[1]); ++ ++ emit_insn (gen_shrd (low[0], high[0], GEN_INT (count))); ++ emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count))); ++ } ++ } ++ else ++ { ++ gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd; ++ ++ if (!rtx_equal_p (operands[0], operands[1])) ++ emit_move_insn (operands[0], operands[1]); ++ ++ split_double_mode (mode, operands, 1, low, high); ++ ++ emit_insn (gen_shrd (low[0], high[0], operands[2])); ++ emit_insn (gen_ashr3 (high[0], high[0], operands[2])); ++ ++ if (TARGET_CMOVE && scratch) ++ { ++ rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx) ++ = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1; ++ ++ emit_move_insn (scratch, high[0]); ++ emit_insn (gen_ashr3 (scratch, scratch, ++ GEN_INT (half_width - 1))); ++ emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2], ++ scratch)); ++ } ++ else ++ { ++ rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx) ++ = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3; ++ ++ emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2])); ++ } ++ } ++} ++ ++void ++ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode) ++{ ++ rtx (*gen_lshr3)(rtx, rtx, rtx) ++ = mode == DImode ? 
gen_lshrsi3 : gen_lshrdi3; ++ rtx (*gen_shrd)(rtx, rtx, rtx); ++ int half_width = GET_MODE_BITSIZE (mode) >> 1; ++ ++ rtx low[2], high[2]; ++ int count; ++ ++ if (CONST_INT_P (operands[2])) ++ { ++ split_double_mode (mode, operands, 2, low, high); ++ count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1); ++ ++ if (count >= half_width) ++ { ++ emit_move_insn (low[0], high[1]); ++ ix86_expand_clear (high[0]); ++ ++ if (count > half_width) ++ emit_insn (gen_lshr3 (low[0], low[0], ++ GEN_INT (count - half_width))); ++ } ++ else ++ { ++ gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd; ++ ++ if (!rtx_equal_p (operands[0], operands[1])) ++ emit_move_insn (operands[0], operands[1]); ++ ++ emit_insn (gen_shrd (low[0], high[0], GEN_INT (count))); ++ emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count))); ++ } ++ } ++ else ++ { ++ gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd; ++ ++ if (!rtx_equal_p (operands[0], operands[1])) ++ emit_move_insn (operands[0], operands[1]); ++ ++ split_double_mode (mode, operands, 1, low, high); ++ ++ emit_insn (gen_shrd (low[0], high[0], operands[2])); ++ emit_insn (gen_lshr3 (high[0], high[0], operands[2])); ++ ++ if (TARGET_CMOVE && scratch) ++ { ++ rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx) ++ = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1; ++ ++ ix86_expand_clear (scratch); ++ emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2], ++ scratch)); ++ } ++ else ++ { ++ rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx) ++ = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2; ++ ++ emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2])); ++ } ++ } ++} ++ ++/* Return mode for the memcpy/memset loop counter. Prefer SImode over ++ DImode for constant loop counts. */ ++ ++static machine_mode ++counter_mode (rtx count_exp) ++{ ++ if (GET_MODE (count_exp) != VOIDmode) ++ return GET_MODE (count_exp); ++ if (!CONST_INT_P (count_exp)) ++ return Pmode; ++ if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff)) ++ return DImode; ++ return SImode; ++} ++ ++/* When ISSETMEM is FALSE, output simple loop to move memory pointer to SRCPTR ++ to DESTPTR via chunks of MODE unrolled UNROLL times, overall size is COUNT ++ specified in bytes. When ISSETMEM is TRUE, output the equivalent loop to set ++ memory by VALUE (supposed to be in MODE). ++ ++ The size is rounded down to whole number of chunk size moved at once. ++ SRCMEM and DESTMEM provide MEMrtx to feed proper aliasing info. */ ++ ++ ++static void ++expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem, ++ rtx destptr, rtx srcptr, rtx value, ++ rtx count, machine_mode mode, int unroll, ++ int expected_size, bool issetmem) ++{ ++ rtx_code_label *out_label, *top_label; ++ rtx iter, tmp; ++ machine_mode iter_mode = counter_mode (count); ++ int piece_size_n = GET_MODE_SIZE (mode) * unroll; ++ rtx piece_size = GEN_INT (piece_size_n); ++ rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1)); ++ rtx size; ++ int i; ++ ++ top_label = gen_label_rtx (); ++ out_label = gen_label_rtx (); ++ iter = gen_reg_rtx (iter_mode); ++ ++ size = expand_simple_binop (iter_mode, AND, count, piece_size_mask, ++ NULL, 1, OPTAB_DIRECT); ++ /* Those two should combine. 
*/ ++ if (piece_size == const1_rtx) ++ { ++ emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode, ++ true, out_label); ++ predict_jump (REG_BR_PROB_BASE * 10 / 100); ++ } ++ emit_move_insn (iter, const0_rtx); ++ ++ emit_label (top_label); ++ ++ tmp = convert_modes (Pmode, iter_mode, iter, true); ++ ++ /* This assert could be relaxed - in this case we'll need to compute ++ smallest power of two, containing in PIECE_SIZE_N and pass it to ++ offset_address. */ ++ gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0); ++ destmem = offset_address (destmem, tmp, piece_size_n); ++ destmem = adjust_address (destmem, mode, 0); ++ ++ if (!issetmem) ++ { ++ srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n); ++ srcmem = adjust_address (srcmem, mode, 0); ++ ++ /* When unrolling for chips that reorder memory reads and writes, ++ we can save registers by using single temporary. ++ Also using 4 temporaries is overkill in 32bit mode. */ ++ if (!TARGET_64BIT && 0) ++ { ++ for (i = 0; i < unroll; i++) ++ { ++ if (i) ++ { ++ destmem = adjust_address (copy_rtx (destmem), mode, ++ GET_MODE_SIZE (mode)); ++ srcmem = adjust_address (copy_rtx (srcmem), mode, ++ GET_MODE_SIZE (mode)); ++ } ++ emit_move_insn (destmem, srcmem); ++ } ++ } ++ else ++ { ++ rtx tmpreg[4]; ++ gcc_assert (unroll <= 4); ++ for (i = 0; i < unroll; i++) ++ { ++ tmpreg[i] = gen_reg_rtx (mode); ++ if (i) ++ srcmem = adjust_address (copy_rtx (srcmem), mode, ++ GET_MODE_SIZE (mode)); ++ emit_move_insn (tmpreg[i], srcmem); ++ } ++ for (i = 0; i < unroll; i++) ++ { ++ if (i) ++ destmem = adjust_address (copy_rtx (destmem), mode, ++ GET_MODE_SIZE (mode)); ++ emit_move_insn (destmem, tmpreg[i]); ++ } ++ } ++ } ++ else ++ for (i = 0; i < unroll; i++) ++ { ++ if (i) ++ destmem = adjust_address (copy_rtx (destmem), mode, ++ GET_MODE_SIZE (mode)); ++ emit_move_insn (destmem, value); ++ } ++ ++ tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter, ++ true, OPTAB_LIB_WIDEN); ++ if (tmp != iter) ++ emit_move_insn (iter, tmp); ++ ++ emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode, ++ true, top_label); ++ if (expected_size != -1) ++ { ++ expected_size /= GET_MODE_SIZE (mode) * unroll; ++ if (expected_size == 0) ++ predict_jump (0); ++ else if (expected_size > REG_BR_PROB_BASE) ++ predict_jump (REG_BR_PROB_BASE - 1); ++ else ++ predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) ++ / expected_size); ++ } ++ else ++ predict_jump (REG_BR_PROB_BASE * 80 / 100); ++ iter = ix86_zero_extend_to_Pmode (iter); ++ tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr, ++ true, OPTAB_LIB_WIDEN); ++ if (tmp != destptr) ++ emit_move_insn (destptr, tmp); ++ if (!issetmem) ++ { ++ tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr, ++ true, OPTAB_LIB_WIDEN); ++ if (tmp != srcptr) ++ emit_move_insn (srcptr, tmp); ++ } ++ emit_label (out_label); ++} ++ ++/* Divide COUNTREG by SCALE. */ ++static rtx ++scale_counter (rtx countreg, int scale) ++{ ++ rtx sc; ++ ++ if (scale == 1) ++ return countreg; ++ if (CONST_INT_P (countreg)) ++ return GEN_INT (INTVAL (countreg) / scale); ++ gcc_assert (REG_P (countreg)); ++ ++ sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg, ++ GEN_INT (exact_log2 (scale)), ++ NULL, 1, OPTAB_DIRECT); ++ return sc; ++} ++ ++/* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument. ++ When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored. ++ When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored. 
++ For setmem case, VALUE is a promoted to a wider size ORIG_VALUE. ++ ORIG_VALUE is the original value passed to memset to fill the memory with. ++ Other arguments have same meaning as for previous function. */ ++ ++static void ++expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem, ++ rtx destptr, rtx srcptr, rtx value, rtx orig_value, ++ rtx count, ++ machine_mode mode, bool issetmem) ++{ ++ rtx destexp; ++ rtx srcexp; ++ rtx countreg; ++ HOST_WIDE_INT rounded_count; ++ ++ /* If possible, it is shorter to use rep movs. ++ TODO: Maybe it is better to move this logic to decide_alg. */ ++ if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3) ++ && (!issetmem || orig_value == const0_rtx)) ++ mode = SImode; ++ ++ if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode) ++ destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0); ++ ++ countreg = ix86_zero_extend_to_Pmode (scale_counter (count, ++ GET_MODE_SIZE (mode))); ++ if (mode != QImode) ++ { ++ destexp = gen_rtx_ASHIFT (Pmode, countreg, ++ GEN_INT (exact_log2 (GET_MODE_SIZE (mode)))); ++ destexp = gen_rtx_PLUS (Pmode, destexp, destptr); ++ } ++ else ++ destexp = gen_rtx_PLUS (Pmode, destptr, countreg); ++ if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count)) ++ { ++ rounded_count ++ = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode)); ++ destmem = shallow_copy_rtx (destmem); ++ set_mem_size (destmem, rounded_count); ++ } ++ else if (MEM_SIZE_KNOWN_P (destmem)) ++ clear_mem_size (destmem); ++ ++ if (issetmem) ++ { ++ value = force_reg (mode, gen_lowpart (mode, value)); ++ emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp)); ++ } ++ else ++ { ++ if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode) ++ srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0); ++ if (mode != QImode) ++ { ++ srcexp = gen_rtx_ASHIFT (Pmode, countreg, ++ GEN_INT (exact_log2 (GET_MODE_SIZE (mode)))); ++ srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr); ++ } ++ else ++ srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg); ++ if (CONST_INT_P (count)) ++ { ++ rounded_count ++ = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode)); ++ srcmem = shallow_copy_rtx (srcmem); ++ set_mem_size (srcmem, rounded_count); ++ } ++ else ++ { ++ if (MEM_SIZE_KNOWN_P (srcmem)) ++ clear_mem_size (srcmem); ++ } ++ emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg, ++ destexp, srcexp)); ++ } ++} ++ ++/* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to ++ DESTMEM. ++ SRC is passed by pointer to be updated on return. ++ Return value is updated DST. */ ++static rtx ++emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr, ++ HOST_WIDE_INT size_to_move) ++{ ++ rtx dst = destmem, src = *srcmem, adjust, tempreg; ++ enum insn_code code; ++ machine_mode move_mode; ++ int piece_size, i; ++ ++ /* Find the widest mode in which we could perform moves. ++ Start with the biggest power of 2 less than SIZE_TO_MOVE and half ++ it until move of such size is supported. */ ++ piece_size = 1 << floor_log2 (size_to_move); ++ while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode) ++ || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing) ++ { ++ gcc_assert (piece_size > 1); ++ piece_size >>= 1; ++ } ++ ++ /* Find the corresponding vector mode with the same size as MOVE_MODE. ++ MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). 
*/ ++ if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode)) ++ { ++ int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode); ++ if (!mode_for_vector (word_mode, nunits).exists (&move_mode) ++ || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing) ++ { ++ move_mode = word_mode; ++ piece_size = GET_MODE_SIZE (move_mode); ++ code = optab_handler (mov_optab, move_mode); ++ } ++ } ++ gcc_assert (code != CODE_FOR_nothing); ++ ++ dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0); ++ src = adjust_automodify_address_nv (src, move_mode, srcptr, 0); ++ ++ /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */ ++ gcc_assert (size_to_move % piece_size == 0); ++ adjust = GEN_INT (piece_size); ++ for (i = 0; i < size_to_move; i += piece_size) ++ { ++ /* We move from memory to memory, so we'll need to do it via ++ a temporary register. */ ++ tempreg = gen_reg_rtx (move_mode); ++ emit_insn (GEN_FCN (code) (tempreg, src)); ++ emit_insn (GEN_FCN (code) (dst, tempreg)); ++ ++ emit_move_insn (destptr, ++ gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust)); ++ emit_move_insn (srcptr, ++ gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust)); ++ ++ dst = adjust_automodify_address_nv (dst, move_mode, destptr, ++ piece_size); ++ src = adjust_automodify_address_nv (src, move_mode, srcptr, ++ piece_size); ++ } ++ ++ /* Update DST and SRC rtx. */ ++ *srcmem = src; ++ return dst; ++} ++ ++/* Helper function for the string operations below. Dest VARIABLE whether ++ it is aligned to VALUE bytes. If true, jump to the label. */ ++ ++static rtx_code_label * ++ix86_expand_aligntest (rtx variable, int value, bool epilogue) ++{ ++ rtx_code_label *label = gen_label_rtx (); ++ rtx tmpcount = gen_reg_rtx (GET_MODE (variable)); ++ if (GET_MODE (variable) == DImode) ++ emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value))); ++ else ++ emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value))); ++ emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable), ++ 1, label); ++ if (epilogue) ++ predict_jump (REG_BR_PROB_BASE * 50 / 100); ++ else ++ predict_jump (REG_BR_PROB_BASE * 90 / 100); ++ return label; ++} ++ ++ ++/* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */ ++ ++static void ++expand_cpymem_epilogue (rtx destmem, rtx srcmem, ++ rtx destptr, rtx srcptr, rtx count, int max_size) ++{ ++ rtx src, dest; ++ if (CONST_INT_P (count)) ++ { ++ HOST_WIDE_INT countval = INTVAL (count); ++ HOST_WIDE_INT epilogue_size = countval % max_size; ++ int i; ++ ++ /* For now MAX_SIZE should be a power of 2. This assert could be ++ relaxed, but it'll require a bit more complicated epilogue ++ expanding. */ ++ gcc_assert ((max_size & (max_size - 1)) == 0); ++ for (i = max_size; i >= 1; i >>= 1) ++ { ++ if (epilogue_size & i) ++ destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i); ++ } ++ return; ++ } ++ if (max_size > 8) ++ { ++ count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1), ++ count, 1, OPTAB_DIRECT); ++ expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL, ++ count, QImode, 1, 4, false); ++ return; ++ } ++ ++ /* When there are stringops, we can cheaply increase dest and src pointers. ++ Otherwise we save code size by maintaining offset (zero is readily ++ available from preceding rep operation) and using x86 addressing modes. 
++ */ ++ if (TARGET_SINGLE_STRINGOP) ++ { ++ if (max_size > 4) ++ { ++ rtx_code_label *label = ix86_expand_aligntest (count, 4, true); ++ src = change_address (srcmem, SImode, srcptr); ++ dest = change_address (destmem, SImode, destptr); ++ emit_insn (gen_strmov (destptr, dest, srcptr, src)); ++ emit_label (label); ++ LABEL_NUSES (label) = 1; ++ } ++ if (max_size > 2) ++ { ++ rtx_code_label *label = ix86_expand_aligntest (count, 2, true); ++ src = change_address (srcmem, HImode, srcptr); ++ dest = change_address (destmem, HImode, destptr); ++ emit_insn (gen_strmov (destptr, dest, srcptr, src)); ++ emit_label (label); ++ LABEL_NUSES (label) = 1; ++ } ++ if (max_size > 1) ++ { ++ rtx_code_label *label = ix86_expand_aligntest (count, 1, true); ++ src = change_address (srcmem, QImode, srcptr); ++ dest = change_address (destmem, QImode, destptr); ++ emit_insn (gen_strmov (destptr, dest, srcptr, src)); ++ emit_label (label); ++ LABEL_NUSES (label) = 1; ++ } ++ } ++ else ++ { ++ rtx offset = force_reg (Pmode, const0_rtx); ++ rtx tmp; ++ ++ if (max_size > 4) ++ { ++ rtx_code_label *label = ix86_expand_aligntest (count, 4, true); ++ src = change_address (srcmem, SImode, srcptr); ++ dest = change_address (destmem, SImode, destptr); ++ emit_move_insn (dest, src); ++ tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL, ++ true, OPTAB_LIB_WIDEN); ++ if (tmp != offset) ++ emit_move_insn (offset, tmp); ++ emit_label (label); ++ LABEL_NUSES (label) = 1; ++ } ++ if (max_size > 2) ++ { ++ rtx_code_label *label = ix86_expand_aligntest (count, 2, true); ++ tmp = gen_rtx_PLUS (Pmode, srcptr, offset); ++ src = change_address (srcmem, HImode, tmp); ++ tmp = gen_rtx_PLUS (Pmode, destptr, offset); ++ dest = change_address (destmem, HImode, tmp); ++ emit_move_insn (dest, src); ++ tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp, ++ true, OPTAB_LIB_WIDEN); ++ if (tmp != offset) ++ emit_move_insn (offset, tmp); ++ emit_label (label); ++ LABEL_NUSES (label) = 1; ++ } ++ if (max_size > 1) ++ { ++ rtx_code_label *label = ix86_expand_aligntest (count, 1, true); ++ tmp = gen_rtx_PLUS (Pmode, srcptr, offset); ++ src = change_address (srcmem, QImode, tmp); ++ tmp = gen_rtx_PLUS (Pmode, destptr, offset); ++ dest = change_address (destmem, QImode, tmp); ++ emit_move_insn (dest, src); ++ emit_label (label); ++ LABEL_NUSES (label) = 1; ++ } ++ } ++} ++ ++/* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM ++ with value PROMOTED_VAL. ++ SRC is passed by pointer to be updated on return. ++ Return value is updated DST. */ ++static rtx ++emit_memset (rtx destmem, rtx destptr, rtx promoted_val, ++ HOST_WIDE_INT size_to_move) ++{ ++ rtx dst = destmem, adjust; ++ enum insn_code code; ++ machine_mode move_mode; ++ int piece_size, i; ++ ++ /* Find the widest mode in which we could perform moves. ++ Start with the biggest power of 2 less than SIZE_TO_MOVE and half ++ it until move of such size is supported. */ ++ move_mode = GET_MODE (promoted_val); ++ if (move_mode == VOIDmode) ++ move_mode = QImode; ++ if (size_to_move < GET_MODE_SIZE (move_mode)) ++ { ++ unsigned int move_bits = size_to_move * BITS_PER_UNIT; ++ move_mode = int_mode_for_size (move_bits, 0).require (); ++ promoted_val = gen_lowpart (move_mode, promoted_val); ++ } ++ piece_size = GET_MODE_SIZE (move_mode); ++ code = optab_handler (mov_optab, move_mode); ++ gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX); ++ ++ dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0); ++ ++ /* Emit moves. 
We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */ ++ gcc_assert (size_to_move % piece_size == 0); ++ adjust = GEN_INT (piece_size); ++ for (i = 0; i < size_to_move; i += piece_size) ++ { ++ if (piece_size <= GET_MODE_SIZE (word_mode)) ++ { ++ emit_insn (gen_strset (destptr, dst, promoted_val)); ++ dst = adjust_automodify_address_nv (dst, move_mode, destptr, ++ piece_size); ++ continue; ++ } ++ ++ emit_insn (GEN_FCN (code) (dst, promoted_val)); ++ ++ emit_move_insn (destptr, ++ gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust)); ++ ++ dst = adjust_automodify_address_nv (dst, move_mode, destptr, ++ piece_size); ++ } ++ ++ /* Update DST rtx. */ ++ return dst; ++} ++/* Output code to set at most count & (max_size - 1) bytes starting by DEST. */ ++static void ++expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value, ++ rtx count, int max_size) ++{ ++ count = expand_simple_binop (counter_mode (count), AND, count, ++ GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT); ++ expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL, ++ gen_lowpart (QImode, value), count, QImode, ++ 1, max_size / 2, true); ++} ++ ++/* Output code to set at most count & (max_size - 1) bytes starting by DEST. */ ++static void ++expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value, ++ rtx count, int max_size) ++{ ++ rtx dest; ++ ++ if (CONST_INT_P (count)) ++ { ++ HOST_WIDE_INT countval = INTVAL (count); ++ HOST_WIDE_INT epilogue_size = countval % max_size; ++ int i; ++ ++ /* For now MAX_SIZE should be a power of 2. This assert could be ++ relaxed, but it'll require a bit more complicated epilogue ++ expanding. */ ++ gcc_assert ((max_size & (max_size - 1)) == 0); ++ for (i = max_size; i >= 1; i >>= 1) ++ { ++ if (epilogue_size & i) ++ { ++ if (vec_value && i > GET_MODE_SIZE (GET_MODE (value))) ++ destmem = emit_memset (destmem, destptr, vec_value, i); ++ else ++ destmem = emit_memset (destmem, destptr, value, i); ++ } ++ } ++ return; ++ } ++ if (max_size > 32) ++ { ++ expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size); ++ return; ++ } ++ if (max_size > 16) ++ { ++ rtx_code_label *label = ix86_expand_aligntest (count, 16, true); ++ if (TARGET_64BIT) ++ { ++ dest = change_address (destmem, DImode, destptr); ++ emit_insn (gen_strset (destptr, dest, value)); ++ dest = adjust_automodify_address_nv (dest, DImode, destptr, 8); ++ emit_insn (gen_strset (destptr, dest, value)); ++ } ++ else ++ { ++ dest = change_address (destmem, SImode, destptr); ++ emit_insn (gen_strset (destptr, dest, value)); ++ dest = adjust_automodify_address_nv (dest, SImode, destptr, 4); ++ emit_insn (gen_strset (destptr, dest, value)); ++ dest = adjust_automodify_address_nv (dest, SImode, destptr, 8); ++ emit_insn (gen_strset (destptr, dest, value)); ++ dest = adjust_automodify_address_nv (dest, SImode, destptr, 12); ++ emit_insn (gen_strset (destptr, dest, value)); ++ } ++ emit_label (label); ++ LABEL_NUSES (label) = 1; ++ } ++ if (max_size > 8) ++ { ++ rtx_code_label *label = ix86_expand_aligntest (count, 8, true); ++ if (TARGET_64BIT) ++ { ++ dest = change_address (destmem, DImode, destptr); ++ emit_insn (gen_strset (destptr, dest, value)); ++ } ++ else ++ { ++ dest = change_address (destmem, SImode, destptr); ++ emit_insn (gen_strset (destptr, dest, value)); ++ dest = adjust_automodify_address_nv (dest, SImode, destptr, 4); ++ emit_insn (gen_strset (destptr, dest, value)); ++ } ++ emit_label (label); ++ LABEL_NUSES (label) = 1; ++ } ++ if (max_size > 4) ++ { ++ rtx_code_label *label = 
ix86_expand_aligntest (count, 4, true); ++ dest = change_address (destmem, SImode, destptr); ++ emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value))); ++ emit_label (label); ++ LABEL_NUSES (label) = 1; ++ } ++ if (max_size > 2) ++ { ++ rtx_code_label *label = ix86_expand_aligntest (count, 2, true); ++ dest = change_address (destmem, HImode, destptr); ++ emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value))); ++ emit_label (label); ++ LABEL_NUSES (label) = 1; ++ } ++ if (max_size > 1) ++ { ++ rtx_code_label *label = ix86_expand_aligntest (count, 1, true); ++ dest = change_address (destmem, QImode, destptr); ++ emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value))); ++ emit_label (label); ++ LABEL_NUSES (label) = 1; ++ } ++} ++ ++/* Adjust COUNTER by the VALUE. */ ++static void ++ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value) ++{ ++ rtx (*gen_add)(rtx, rtx, rtx) ++ = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3; ++ ++ emit_insn (gen_add (countreg, countreg, GEN_INT (-value))); ++} ++ ++/* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to ++ DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN. ++ Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are ++ ignored. ++ Return value is updated DESTMEM. */ ++ ++static rtx ++expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem, ++ rtx destptr, rtx srcptr, rtx value, ++ rtx vec_value, rtx count, int align, ++ int desired_alignment, bool issetmem) ++{ ++ int i; ++ for (i = 1; i < desired_alignment; i <<= 1) ++ { ++ if (align <= i) ++ { ++ rtx_code_label *label = ix86_expand_aligntest (destptr, i, false); ++ if (issetmem) ++ { ++ if (vec_value && i > GET_MODE_SIZE (GET_MODE (value))) ++ destmem = emit_memset (destmem, destptr, vec_value, i); ++ else ++ destmem = emit_memset (destmem, destptr, value, i); ++ } ++ else ++ destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i); ++ ix86_adjust_counter (count, i); ++ emit_label (label); ++ LABEL_NUSES (label) = 1; ++ set_mem_align (destmem, i * 2 * BITS_PER_UNIT); ++ } ++ } ++ return destmem; ++} ++ ++/* Test if COUNT&SIZE is nonzero and if so, expand movme ++ or setmem sequence that is valid for SIZE..2*SIZE-1 bytes ++ and jump to DONE_LABEL. */ ++static void ++expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem, ++ rtx destptr, rtx srcptr, ++ rtx value, rtx vec_value, ++ rtx count, int size, ++ rtx done_label, bool issetmem) ++{ ++ rtx_code_label *label = ix86_expand_aligntest (count, size, false); ++ machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk (); ++ rtx modesize; ++ int n; ++ ++ /* If we do not have vector value to copy, we must reduce size. */ ++ if (issetmem) ++ { ++ if (!vec_value) ++ { ++ if (GET_MODE (value) == VOIDmode && size > 8) ++ mode = Pmode; ++ else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value))) ++ mode = GET_MODE (value); ++ } ++ else ++ mode = GET_MODE (vec_value), value = vec_value; ++ } ++ else ++ { ++ /* Choose appropriate vector mode. */ ++ if (size >= 32) ++ mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode; ++ else if (size >= 16) ++ mode = TARGET_SSE ? 
V16QImode : DImode; ++ srcmem = change_address (srcmem, mode, srcptr); ++ } ++ destmem = change_address (destmem, mode, destptr); ++ modesize = GEN_INT (GET_MODE_SIZE (mode)); ++ gcc_assert (GET_MODE_SIZE (mode) <= size); ++ for (n = 0; n * GET_MODE_SIZE (mode) < size; n++) ++ { ++ if (issetmem) ++ emit_move_insn (destmem, gen_lowpart (mode, value)); ++ else ++ { ++ emit_move_insn (destmem, srcmem); ++ srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode)); ++ } ++ destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode)); ++ } ++ ++ destmem = offset_address (destmem, count, 1); ++ destmem = offset_address (destmem, GEN_INT (-2 * size), ++ GET_MODE_SIZE (mode)); ++ if (!issetmem) ++ { ++ srcmem = offset_address (srcmem, count, 1); ++ srcmem = offset_address (srcmem, GEN_INT (-2 * size), ++ GET_MODE_SIZE (mode)); ++ } ++ for (n = 0; n * GET_MODE_SIZE (mode) < size; n++) ++ { ++ if (issetmem) ++ emit_move_insn (destmem, gen_lowpart (mode, value)); ++ else ++ { ++ emit_move_insn (destmem, srcmem); ++ srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode)); ++ } ++ destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode)); ++ } ++ emit_jump_insn (gen_jump (done_label)); ++ emit_barrier (); ++ ++ emit_label (label); ++ LABEL_NUSES (label) = 1; ++} ++ ++/* Handle small memcpy (up to SIZE that is supposed to be small power of 2. ++ and get ready for the main memcpy loop by copying iniital DESIRED_ALIGN-ALIGN ++ bytes and last SIZE bytes adjusitng DESTPTR/SRCPTR/COUNT in a way we can ++ proceed with an loop copying SIZE bytes at once. Do moves in MODE. ++ DONE_LABEL is a label after the whole copying sequence. The label is created ++ on demand if *DONE_LABEL is NULL. ++ MIN_SIZE is minimal size of block copied. This value gets adjusted for new ++ bounds after the initial copies. ++ ++ DESTMEM/SRCMEM are memory expressions pointing to the copies block, ++ DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicate whether ++ we will dispatch to a library call for large blocks. ++ ++ In pseudocode we do: ++ ++ if (COUNT < SIZE) ++ { ++ Assume that SIZE is 4. 
Bigger sizes are handled analogously ++ if (COUNT & 4) ++ { ++ copy 4 bytes from SRCPTR to DESTPTR ++ copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4 ++ goto done_label ++ } ++ if (!COUNT) ++ goto done_label; ++ copy 1 byte from SRCPTR to DESTPTR ++ if (COUNT & 2) ++ { ++ copy 2 bytes from SRCPTR to DESTPTR ++ copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2 ++ } ++ } ++ else ++ { ++ copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR ++ copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE ++ ++ OLD_DESPTR = DESTPTR; ++ Align DESTPTR up to DESIRED_ALIGN ++ SRCPTR += DESTPTR - OLD_DESTPTR ++ COUNT -= DEST_PTR - OLD_DESTPTR ++ if (DYNAMIC_CHECK) ++ Round COUNT down to multiple of SIZE ++ << optional caller supplied zero size guard is here >> ++ << optional caller supplied dynamic check is here >> ++ << caller supplied main copy loop is here >> ++ } ++ done_label: ++ */ ++static void ++expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem, ++ rtx *destptr, rtx *srcptr, ++ machine_mode mode, ++ rtx value, rtx vec_value, ++ rtx *count, ++ rtx_code_label **done_label, ++ int size, ++ int desired_align, ++ int align, ++ unsigned HOST_WIDE_INT *min_size, ++ bool dynamic_check, ++ bool issetmem) ++{ ++ rtx_code_label *loop_label = NULL, *label; ++ int n; ++ rtx modesize; ++ int prolog_size = 0; ++ rtx mode_value; ++ ++ /* Chose proper value to copy. */ ++ if (issetmem && VECTOR_MODE_P (mode)) ++ mode_value = vec_value; ++ else ++ mode_value = value; ++ gcc_assert (GET_MODE_SIZE (mode) <= size); ++ ++ /* See if block is big or small, handle small blocks. */ ++ if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size) ++ { ++ int size2 = size; ++ loop_label = gen_label_rtx (); ++ ++ if (!*done_label) ++ *done_label = gen_label_rtx (); ++ ++ emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count), ++ 1, loop_label); ++ size2 >>= 1; ++ ++ /* Handle sizes > 3. */ ++ for (;size2 > 2; size2 >>= 1) ++ expand_small_cpymem_or_setmem (destmem, srcmem, ++ *destptr, *srcptr, ++ value, vec_value, ++ *count, ++ size2, *done_label, issetmem); ++ /* Nothing to copy? Jump to DONE_LABEL if so */ ++ emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count), ++ 1, *done_label); ++ ++ /* Do a byte copy. */ ++ destmem = change_address (destmem, QImode, *destptr); ++ if (issetmem) ++ emit_move_insn (destmem, gen_lowpart (QImode, value)); ++ else ++ { ++ srcmem = change_address (srcmem, QImode, *srcptr); ++ emit_move_insn (destmem, srcmem); ++ } ++ ++ /* Handle sizes 2 and 3. */ ++ label = ix86_expand_aligntest (*count, 2, false); ++ destmem = change_address (destmem, HImode, *destptr); ++ destmem = offset_address (destmem, *count, 1); ++ destmem = offset_address (destmem, GEN_INT (-2), 2); ++ if (issetmem) ++ emit_move_insn (destmem, gen_lowpart (HImode, value)); ++ else ++ { ++ srcmem = change_address (srcmem, HImode, *srcptr); ++ srcmem = offset_address (srcmem, *count, 1); ++ srcmem = offset_address (srcmem, GEN_INT (-2), 2); ++ emit_move_insn (destmem, srcmem); ++ } ++ ++ emit_label (label); ++ LABEL_NUSES (label) = 1; ++ emit_jump_insn (gen_jump (*done_label)); ++ emit_barrier (); ++ } ++ else ++ gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size ++ || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size); ++ ++ /* Start memcpy for COUNT >= SIZE. */ ++ if (loop_label) ++ { ++ emit_label (loop_label); ++ LABEL_NUSES (loop_label) = 1; ++ } ++ ++ /* Copy first desired_align bytes. 
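The pseudocode above handles any block of SIZE..2*SIZE-1 bytes with just two moves: one covering the first SIZE bytes and one, possibly overlapping, covering the last SIZE bytes, so no byte loop is needed. A minimal standalone C sketch of that trick; the helper name and the 4-byte chunk size are only for illustration:

#include <stdint.h>
#include <string.h>

/* Illustrative only: copy N bytes, 4 <= N < 8, with two possibly
   overlapping 4-byte moves, mirroring the "copy 4 from the front,
   4 from the back" branch of the pseudocode.  */
static void
copy_4_to_7_bytes (unsigned char *dst, const unsigned char *src, size_t n)
{
  uint32_t head, tail;

  memcpy (&head, src, 4);           /* first 4 bytes */
  memcpy (&tail, src + n - 4, 4);   /* last 4 bytes, may overlap the first */
  memcpy (dst, &head, 4);
  memcpy (dst + n - 4, &tail, 4);
}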
*/ ++ if (!issetmem) ++ srcmem = change_address (srcmem, mode, *srcptr); ++ destmem = change_address (destmem, mode, *destptr); ++ modesize = GEN_INT (GET_MODE_SIZE (mode)); ++ for (n = 0; prolog_size < desired_align - align; n++) ++ { ++ if (issetmem) ++ emit_move_insn (destmem, mode_value); ++ else ++ { ++ emit_move_insn (destmem, srcmem); ++ srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode)); ++ } ++ destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode)); ++ prolog_size += GET_MODE_SIZE (mode); ++ } ++ ++ ++ /* Copy last SIZE bytes. */ ++ destmem = offset_address (destmem, *count, 1); ++ destmem = offset_address (destmem, ++ GEN_INT (-size - prolog_size), ++ 1); ++ if (issetmem) ++ emit_move_insn (destmem, mode_value); ++ else ++ { ++ srcmem = offset_address (srcmem, *count, 1); ++ srcmem = offset_address (srcmem, ++ GEN_INT (-size - prolog_size), ++ 1); ++ emit_move_insn (destmem, srcmem); ++ } ++ for (n = 1; n * GET_MODE_SIZE (mode) < size; n++) ++ { ++ destmem = offset_address (destmem, modesize, 1); ++ if (issetmem) ++ emit_move_insn (destmem, mode_value); ++ else ++ { ++ srcmem = offset_address (srcmem, modesize, 1); ++ emit_move_insn (destmem, srcmem); ++ } ++ } ++ ++ /* Align destination. */ ++ if (desired_align > 1 && desired_align > align) ++ { ++ rtx saveddest = *destptr; ++ ++ gcc_assert (desired_align <= size); ++ /* Align destptr up, place it to new register. */ ++ *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr, ++ GEN_INT (prolog_size), ++ NULL_RTX, 1, OPTAB_DIRECT); ++ if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest)) ++ REG_POINTER (*destptr) = 1; ++ *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr, ++ GEN_INT (-desired_align), ++ *destptr, 1, OPTAB_DIRECT); ++ /* See how many bytes we skipped. */ ++ saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest, ++ *destptr, ++ saveddest, 1, OPTAB_DIRECT); ++ /* Adjust srcptr and count. */ ++ if (!issetmem) ++ *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr, ++ saveddest, *srcptr, 1, OPTAB_DIRECT); ++ *count = expand_simple_binop (GET_MODE (*count), PLUS, *count, ++ saveddest, *count, 1, OPTAB_DIRECT); ++ /* We copied at most size + prolog_size. */ ++ if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size)) ++ *min_size ++ = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size); ++ else ++ *min_size = 0; ++ ++ /* Our loops always round down the block size, but for dispatch to ++ library we need precise value. */ ++ if (dynamic_check) ++ *count = expand_simple_binop (GET_MODE (*count), AND, *count, ++ GEN_INT (-size), *count, 1, OPTAB_DIRECT); ++ } ++ else ++ { ++ gcc_assert (prolog_size == 0); ++ /* Decrease count, so we won't end up copying last word twice. */ ++ if (!CONST_INT_P (*count)) ++ *count = expand_simple_binop (GET_MODE (*count), PLUS, *count, ++ constm1_rtx, *count, 1, OPTAB_DIRECT); ++ else ++ *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1, ++ (unsigned HOST_WIDE_INT)size)); ++ if (*min_size) ++ *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size); ++ } ++} ++ ++ ++/* This function is like the previous one, except here we know how many bytes ++ need to be copied. That allows us to update alignment not only of DST, which ++ is returned, but also of SRC, which is passed as a pointer for that ++ reason. 
*/ ++static rtx ++expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg, ++ rtx srcreg, rtx value, rtx vec_value, ++ int desired_align, int align_bytes, ++ bool issetmem) ++{ ++ rtx src = NULL; ++ rtx orig_dst = dst; ++ rtx orig_src = NULL; ++ int piece_size = 1; ++ int copied_bytes = 0; ++ ++ if (!issetmem) ++ { ++ gcc_assert (srcp != NULL); ++ src = *srcp; ++ orig_src = src; ++ } ++ ++ for (piece_size = 1; ++ piece_size <= desired_align && copied_bytes < align_bytes; ++ piece_size <<= 1) ++ { ++ if (align_bytes & piece_size) ++ { ++ if (issetmem) ++ { ++ if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value))) ++ dst = emit_memset (dst, destreg, vec_value, piece_size); ++ else ++ dst = emit_memset (dst, destreg, value, piece_size); ++ } ++ else ++ dst = emit_memmov (dst, &src, destreg, srcreg, piece_size); ++ copied_bytes += piece_size; ++ } ++ } ++ if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT) ++ set_mem_align (dst, desired_align * BITS_PER_UNIT); ++ if (MEM_SIZE_KNOWN_P (orig_dst)) ++ set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes); ++ ++ if (!issetmem) ++ { ++ int src_align_bytes = get_mem_align_offset (src, desired_align ++ * BITS_PER_UNIT); ++ if (src_align_bytes >= 0) ++ src_align_bytes = desired_align - src_align_bytes; ++ if (src_align_bytes >= 0) ++ { ++ unsigned int src_align; ++ for (src_align = desired_align; src_align >= 2; src_align >>= 1) ++ { ++ if ((src_align_bytes & (src_align - 1)) ++ == (align_bytes & (src_align - 1))) ++ break; ++ } ++ if (src_align > (unsigned int) desired_align) ++ src_align = desired_align; ++ if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT) ++ set_mem_align (src, src_align * BITS_PER_UNIT); ++ } ++ if (MEM_SIZE_KNOWN_P (orig_src)) ++ set_mem_size (src, MEM_SIZE (orig_src) - align_bytes); ++ *srcp = src; ++ } ++ ++ return dst; ++} ++ ++/* Return true if ALG can be used in current context. ++ Assume we expand memset if MEMSET is true. */ ++static bool ++alg_usable_p (enum stringop_alg alg, bool memset, bool have_as) ++{ ++ if (alg == no_stringop) ++ return false; ++ if (alg == vector_loop) ++ return TARGET_SSE || TARGET_AVX; ++ /* Algorithms using the rep prefix want at least edi and ecx; ++ additionally, memset wants eax and memcpy wants esi. Don't ++ consider such algorithms if the user has appropriated those ++ registers for their own purposes, or if we have a non-default ++ address space, since some string insns cannot override the segment. */ ++ if (alg == rep_prefix_1_byte ++ || alg == rep_prefix_4_byte ++ || alg == rep_prefix_8_byte) ++ { ++ if (have_as) ++ return false; ++ if (fixed_regs[CX_REG] ++ || fixed_regs[DI_REG] ++ || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG])) ++ return false; ++ } ++ return true; ++} ++ ++/* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */ ++static enum stringop_alg ++decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, ++ unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size, ++ bool memset, bool zero_memset, bool have_as, ++ int *dynamic_check, bool *noalign, bool recur) ++{ ++ const struct stringop_algs *algs; ++ bool optimize_for_speed; ++ int max = 0; ++ const struct processor_costs *cost; ++ int i; ++ bool any_alg_usable_p = false; ++ ++ *noalign = false; ++ *dynamic_check = -1; ++ ++ /* Even if the string operation call is cold, we still might spend a lot ++ of time processing large blocks. 
*/ ++ if (optimize_function_for_size_p (cfun) ++ || (optimize_insn_for_size_p () ++ && (max_size < 256 ++ || (expected_size != -1 && expected_size < 256)))) ++ optimize_for_speed = false; ++ else ++ optimize_for_speed = true; ++ ++ cost = optimize_for_speed ? ix86_cost : &ix86_size_cost; ++ if (memset) ++ algs = &cost->memset[TARGET_64BIT != 0]; ++ else ++ algs = &cost->memcpy[TARGET_64BIT != 0]; ++ ++ /* See maximal size for user defined algorithm. */ ++ for (i = 0; i < MAX_STRINGOP_ALGS; i++) ++ { ++ enum stringop_alg candidate = algs->size[i].alg; ++ bool usable = alg_usable_p (candidate, memset, have_as); ++ any_alg_usable_p |= usable; ++ ++ if (candidate != libcall && candidate && usable) ++ max = algs->size[i].max; ++ } ++ ++ /* If expected size is not known but max size is small enough ++ so inline version is a win, set expected size into ++ the range. */ ++ if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1) ++ && expected_size == -1) ++ expected_size = min_size / 2 + max_size / 2; ++ ++ /* If user specified the algorithm, honor it if possible. */ ++ if (ix86_stringop_alg != no_stringop ++ && alg_usable_p (ix86_stringop_alg, memset, have_as)) ++ return ix86_stringop_alg; ++ /* rep; movq or rep; movl is the smallest variant. */ ++ else if (!optimize_for_speed) ++ { ++ *noalign = true; ++ if (!count || (count & 3) || (memset && !zero_memset)) ++ return alg_usable_p (rep_prefix_1_byte, memset, have_as) ++ ? rep_prefix_1_byte : loop_1_byte; ++ else ++ return alg_usable_p (rep_prefix_4_byte, memset, have_as) ++ ? rep_prefix_4_byte : loop; ++ } ++ /* Very tiny blocks are best handled via the loop, REP is expensive to ++ setup. */ ++ else if (expected_size != -1 && expected_size < 4) ++ return loop_1_byte; ++ else if (expected_size != -1) ++ { ++ enum stringop_alg alg = libcall; ++ bool alg_noalign = false; ++ for (i = 0; i < MAX_STRINGOP_ALGS; i++) ++ { ++ /* We get here if the algorithms that were not libcall-based ++ were rep-prefix based and we are unable to use rep prefixes ++ based on global register usage. Break out of the loop and ++ use the heuristic below. */ ++ if (algs->size[i].max == 0) ++ break; ++ if (algs->size[i].max >= expected_size || algs->size[i].max == -1) ++ { ++ enum stringop_alg candidate = algs->size[i].alg; ++ ++ if (candidate != libcall ++ && alg_usable_p (candidate, memset, have_as)) ++ { ++ alg = candidate; ++ alg_noalign = algs->size[i].noalign; ++ } ++ /* Honor TARGET_INLINE_ALL_STRINGOPS by picking ++ last non-libcall inline algorithm. */ ++ if (TARGET_INLINE_ALL_STRINGOPS) ++ { ++ /* When the current size is best to be copied by a libcall, ++ but we are still forced to inline, run the heuristic below ++ that will pick code for medium sized blocks. */ ++ if (alg != libcall) ++ { ++ *noalign = alg_noalign; ++ return alg; ++ } ++ else if (!any_alg_usable_p) ++ break; ++ } ++ else if (alg_usable_p (candidate, memset, have_as)) ++ { ++ *noalign = algs->size[i].noalign; ++ return candidate; ++ } ++ } ++ } ++ } ++ /* When asked to inline the call anyway, try to pick meaningful choice. ++ We look for maximal size of block that is faster to copy by hand and ++ take blocks of at most of that size guessing that average size will ++ be roughly half of the block. ++ ++ If this turns out to be bad, we might simply specify the preferred ++ choice in ix86_costs. 
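The loop above is a size-class walk: each table entry pairs an upper size bound with an algorithm, and the first usable entry whose bound covers the expected size wins. A standalone sketch of that lookup; the thresholds and enum values below are invented and only the shape of the walk matches the code:

/* Illustrative size-class lookup, not the thresholds GCC uses.  */
enum alg_kind { ALG_LOOP_1_BYTE, ALG_LOOP, ALG_REP_PREFIX, ALG_LIBCALL };

struct size_class
{
  long max;            /* largest size handled by ALG, -1 = unbounded */
  enum alg_kind alg;
};

static enum alg_kind
pick_alg (const struct size_class *table, int n, long expected_size)
{
  for (int i = 0; i < n; i++)
    if (table[i].max == -1 || table[i].max >= expected_size)
      return table[i].alg;
  return ALG_LIBCALL;
}

With a table such as { {16, ALG_LOOP_1_BYTE}, {128, ALG_LOOP}, {-1, ALG_REP_PREFIX} }, an expected size of 100 selects ALG_LOOP, and anything above 128 falls through to ALG_REP_PREFIX.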
*/ ++ if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY) ++ && (algs->unknown_size == libcall ++ || !alg_usable_p (algs->unknown_size, memset, have_as))) ++ { ++ enum stringop_alg alg; ++ HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2; ++ ++ /* If there aren't any usable algorithms or if recursing already, ++ then recursing on smaller sizes or same size isn't going to ++ find anything. Just return the simple byte-at-a-time copy loop. */ ++ if (!any_alg_usable_p || recur) ++ { ++ /* Pick something reasonable. */ ++ if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur) ++ *dynamic_check = 128; ++ return loop_1_byte; ++ } ++ alg = decide_alg (count, new_expected_size, min_size, max_size, memset, ++ zero_memset, have_as, dynamic_check, noalign, true); ++ gcc_assert (*dynamic_check == -1); ++ if (TARGET_INLINE_STRINGOPS_DYNAMICALLY) ++ *dynamic_check = max; ++ else ++ gcc_assert (alg != libcall); ++ return alg; ++ } ++ return (alg_usable_p (algs->unknown_size, memset, have_as) ++ ? algs->unknown_size : libcall); ++} ++ ++/* Decide on alignment. We know that the operand is already aligned to ALIGN ++ (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */ ++static int ++decide_alignment (int align, ++ enum stringop_alg alg, ++ int expected_size, ++ machine_mode move_mode) ++{ ++ int desired_align = 0; ++ ++ gcc_assert (alg != no_stringop); ++ ++ if (alg == libcall) ++ return 0; ++ if (move_mode == VOIDmode) ++ return 0; ++ ++ desired_align = GET_MODE_SIZE (move_mode); ++ /* PentiumPro has special logic triggering for 8 byte aligned blocks. ++ copying whole cacheline at once. */ ++ if (TARGET_PENTIUMPRO ++ && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte)) ++ desired_align = 8; ++ ++ if (optimize_size) ++ desired_align = 1; ++ if (desired_align < align) ++ desired_align = align; ++ if (expected_size != -1 && expected_size < 4) ++ desired_align = align; ++ ++ return desired_align; ++} ++ ++ ++/* Helper function for memcpy. For QImode value 0xXY produce ++ 0xXYXYXYXY of wide specified by MODE. This is essentially ++ a * 0x10101010, but we can do slightly better than ++ synth_mult by unwinding the sequence by hand on CPUs with ++ slow multiply. */ ++static rtx ++promote_duplicated_reg (machine_mode mode, rtx val) ++{ ++ machine_mode valmode = GET_MODE (val); ++ rtx tmp; ++ int nops = mode == DImode ? 3 : 2; ++ ++ gcc_assert (mode == SImode || mode == DImode || val == const0_rtx); ++ if (val == const0_rtx) ++ return copy_to_mode_reg (mode, CONST0_RTX (mode)); ++ if (CONST_INT_P (val)) ++ { ++ HOST_WIDE_INT v = INTVAL (val) & 255; ++ ++ v |= v << 8; ++ v |= v << 16; ++ if (mode == DImode) ++ v |= (v << 16) << 16; ++ return copy_to_mode_reg (mode, gen_int_mode (v, mode)); ++ } ++ ++ if (valmode == VOIDmode) ++ valmode = QImode; ++ if (valmode != QImode) ++ val = gen_lowpart (QImode, val); ++ if (mode == QImode) ++ return val; ++ if (!TARGET_PARTIAL_REG_STALL) ++ nops--; ++ if (ix86_cost->mult_init[mode == DImode ? 3 : 2] ++ + ix86_cost->mult_bit * (mode == DImode ? 
8 : 4) ++ <= (ix86_cost->shift_const + ix86_cost->add) * nops ++ + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0))) ++ { ++ rtx reg = convert_modes (mode, QImode, val, true); ++ tmp = promote_duplicated_reg (mode, const1_rtx); ++ return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1, ++ OPTAB_DIRECT); ++ } ++ else ++ { ++ rtx reg = convert_modes (mode, QImode, val, true); ++ ++ if (!TARGET_PARTIAL_REG_STALL) ++ if (mode == SImode) ++ emit_insn (gen_insvsi_1 (reg, reg)); ++ else ++ emit_insn (gen_insvdi_1 (reg, reg)); ++ else ++ { ++ tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8), ++ NULL, 1, OPTAB_DIRECT); ++ reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, ++ OPTAB_DIRECT); ++ } ++ tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16), ++ NULL, 1, OPTAB_DIRECT); ++ reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT); ++ if (mode == SImode) ++ return reg; ++ tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32), ++ NULL, 1, OPTAB_DIRECT); ++ reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT); ++ return reg; ++ } ++} ++ ++/* Duplicate value VAL using promote_duplicated_reg into maximal size that will ++ be needed by main loop copying SIZE_NEEDED chunks and prologue getting ++ alignment from ALIGN to DESIRED_ALIGN. */ ++static rtx ++promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, ++ int align) ++{ ++ rtx promoted_val; ++ ++ if (TARGET_64BIT ++ && (size_needed > 4 || (desired_align > align && desired_align > 4))) ++ promoted_val = promote_duplicated_reg (DImode, val); ++ else if (size_needed > 2 || (desired_align > align && desired_align > 2)) ++ promoted_val = promote_duplicated_reg (SImode, val); ++ else if (size_needed > 1 || (desired_align > align && desired_align > 1)) ++ promoted_val = promote_duplicated_reg (HImode, val); ++ else ++ promoted_val = val; ++ ++ return promoted_val; ++} ++ ++/* Copy the address to a Pmode register. This is used for x32 to ++ truncate DImode TLS address to a SImode register. */ ++ ++static rtx ++ix86_copy_addr_to_reg (rtx addr) ++{ ++ rtx reg; ++ if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode) ++ { ++ reg = copy_addr_to_reg (addr); ++ REG_POINTER (reg) = 1; ++ return reg; ++ } ++ else ++ { ++ gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode); ++ reg = copy_to_mode_reg (DImode, addr); ++ REG_POINTER (reg) = 1; ++ return gen_rtx_SUBREG (SImode, reg, 0); ++ } ++} ++ ++/* Expand string move (memcpy) ot store (memset) operation. Use i386 string ++ operations when profitable. The code depends upon architecture, block size ++ and alignment, but always has one of the following overall structures: ++ ++ Aligned move sequence: ++ ++ 1) Prologue guard: Conditional that jumps up to epilogues for small ++ blocks that can be handled by epilogue alone. This is faster ++ but also needed for correctness, since prologue assume the block ++ is larger than the desired alignment. ++ ++ Optional dynamic check for size and libcall for large ++ blocks is emitted here too, with -minline-stringops-dynamically. ++ ++ 2) Prologue: copy first few bytes in order to get destination ++ aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less ++ than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be ++ copied. We emit either a jump tree on power of two sized ++ blocks, or a byte loop. ++ ++ 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks ++ with specified algorithm. 
++ ++ 4) Epilogue: code copying tail of the block that is too small to be ++ handled by main body (or up to size guarded by prologue guard). ++ ++ Misaligned move sequence ++ ++ 1) missaligned move prologue/epilogue containing: ++ a) Prologue handling small memory blocks and jumping to done_label ++ (skipped if blocks are known to be large enough) ++ b) Signle move copying first DESIRED_ALIGN-ALIGN bytes if alignment is ++ needed by single possibly misaligned move ++ (skipped if alignment is not needed) ++ c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves ++ ++ 2) Zero size guard dispatching to done_label, if needed ++ ++ 3) dispatch to library call, if needed, ++ ++ 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks ++ with specified algorithm. */ ++bool ++ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp, ++ rtx align_exp, rtx expected_align_exp, ++ rtx expected_size_exp, rtx min_size_exp, ++ rtx max_size_exp, rtx probable_max_size_exp, ++ bool issetmem) ++{ ++ rtx destreg; ++ rtx srcreg = NULL; ++ rtx_code_label *label = NULL; ++ rtx tmp; ++ rtx_code_label *jump_around_label = NULL; ++ HOST_WIDE_INT align = 1; ++ unsigned HOST_WIDE_INT count = 0; ++ HOST_WIDE_INT expected_size = -1; ++ int size_needed = 0, epilogue_size_needed; ++ int desired_align = 0, align_bytes = 0; ++ enum stringop_alg alg; ++ rtx promoted_val = NULL; ++ rtx vec_promoted_val = NULL; ++ bool force_loopy_epilogue = false; ++ int dynamic_check; ++ bool need_zero_guard = false; ++ bool noalign; ++ machine_mode move_mode = VOIDmode; ++ machine_mode wider_mode; ++ int unroll_factor = 1; ++ /* TODO: Once value ranges are available, fill in proper data. */ ++ unsigned HOST_WIDE_INT min_size = 0; ++ unsigned HOST_WIDE_INT max_size = -1; ++ unsigned HOST_WIDE_INT probable_max_size = -1; ++ bool misaligned_prologue_used = false; ++ bool have_as; ++ ++ if (CONST_INT_P (align_exp)) ++ align = INTVAL (align_exp); ++ /* i386 can do misaligned access on reasonably increased cost. */ ++ if (CONST_INT_P (expected_align_exp) ++ && INTVAL (expected_align_exp) > align) ++ align = INTVAL (expected_align_exp); ++ /* ALIGN is the minimum of destination and source alignment, but we care here ++ just about destination alignment. */ ++ else if (!issetmem ++ && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT) ++ align = MEM_ALIGN (dst) / BITS_PER_UNIT; ++ ++ if (CONST_INT_P (count_exp)) ++ { ++ min_size = max_size = probable_max_size = count = expected_size ++ = INTVAL (count_exp); ++ /* When COUNT is 0, there is nothing to do. */ ++ if (!count) ++ return true; ++ } ++ else ++ { ++ if (min_size_exp) ++ min_size = INTVAL (min_size_exp); ++ if (max_size_exp) ++ max_size = INTVAL (max_size_exp); ++ if (probable_max_size_exp) ++ probable_max_size = INTVAL (probable_max_size_exp); ++ if (CONST_INT_P (expected_size_exp)) ++ expected_size = INTVAL (expected_size_exp); ++ } ++ ++ /* Make sure we don't need to care about overflow later on. */ ++ if (count > (HOST_WIDE_INT_1U << 30)) ++ return false; ++ ++ have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst)); ++ if (!issetmem) ++ have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src)); ++ ++ /* Step 0: Decide on preferred algorithm, desired alignment and ++ size of chunks to be copied by main loop. 
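At the C level, the aligned sequence described in the comment above boils down to the following shape for a simple word-wide memset. This is only an illustration of steps 1-4, with no unrolling, rep-prefix or vector paths; the promoted value is built with the same shift-and-or replication that promote_duplicated_reg uses for the constant case:

#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* Rough shape of the aligned expansion: 1) guard so tiny blocks go
   straight to the epilogue, 2) byte prologue aligning DST, 3) word-wide
   main loop using the promoted value, 4) byte epilogue for the tail.  */
static void
setmem_shape (unsigned char *dst, unsigned char val, size_t count)
{
  /* Promoted value: replicate 0xXY into 0xXYXY...XY.  */
  uint64_t word = val;
  word |= word << 8;
  word |= word << 16;
  word |= word << 32;

  if (count >= 2 * sizeof (uint64_t))           /* 1) prologue guard */
    {
      while ((uintptr_t) dst & 7)               /* 2) alignment prologue */
        {
          *dst++ = val;
          count--;
        }
      while (count >= sizeof (uint64_t))        /* 3) main loop */
        {
          memcpy (dst, &word, sizeof word);
          dst += sizeof (uint64_t);
          count -= sizeof (uint64_t);
        }
    }
  while (count--)                               /* 4) epilogue */
    *dst++ = val;
}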
*/ ++ alg = decide_alg (count, expected_size, min_size, probable_max_size, ++ issetmem, ++ issetmem && val_exp == const0_rtx, have_as, ++ &dynamic_check, &noalign, false); ++ ++ if (dump_file) ++ fprintf (dump_file, "Selected stringop expansion strategy: %s\n", ++ stringop_alg_names[alg]); ++ ++ if (alg == libcall) ++ return false; ++ gcc_assert (alg != no_stringop); ++ ++ /* For now vector-version of memset is generated only for memory zeroing, as ++ creating of promoted vector value is very cheap in this case. */ ++ if (issetmem && alg == vector_loop && val_exp != const0_rtx) ++ alg = unrolled_loop; ++ ++ if (!count) ++ count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp); ++ destreg = ix86_copy_addr_to_reg (XEXP (dst, 0)); ++ if (!issetmem) ++ srcreg = ix86_copy_addr_to_reg (XEXP (src, 0)); ++ ++ unroll_factor = 1; ++ move_mode = word_mode; ++ switch (alg) ++ { ++ case libcall: ++ case no_stringop: ++ case last_alg: ++ gcc_unreachable (); ++ case loop_1_byte: ++ need_zero_guard = true; ++ move_mode = QImode; ++ break; ++ case loop: ++ need_zero_guard = true; ++ break; ++ case unrolled_loop: ++ need_zero_guard = true; ++ unroll_factor = (TARGET_64BIT ? 4 : 2); ++ break; ++ case vector_loop: ++ need_zero_guard = true; ++ unroll_factor = 4; ++ /* Find the widest supported mode. */ ++ move_mode = word_mode; ++ while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode) ++ && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing) ++ move_mode = wider_mode; ++ ++ if (TARGET_AVX128_OPTIMAL && GET_MODE_BITSIZE (move_mode) > 128) ++ move_mode = TImode; ++ ++ /* Find the corresponding vector mode with the same size as MOVE_MODE. ++ MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */ ++ if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode)) ++ { ++ int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode); ++ if (!mode_for_vector (word_mode, nunits).exists (&move_mode) ++ || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing) ++ move_mode = word_mode; ++ } ++ gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing); ++ break; ++ case rep_prefix_8_byte: ++ move_mode = DImode; ++ break; ++ case rep_prefix_4_byte: ++ move_mode = SImode; ++ break; ++ case rep_prefix_1_byte: ++ move_mode = QImode; ++ break; ++ } ++ size_needed = GET_MODE_SIZE (move_mode) * unroll_factor; ++ epilogue_size_needed = size_needed; ++ ++ /* If we are going to call any library calls conditionally, make sure any ++ pending stack adjustment happen before the first conditional branch, ++ otherwise they will be emitted before the library call only and won't ++ happen from the other branches. */ ++ if (dynamic_check != -1) ++ do_pending_stack_adjust (); ++ ++ desired_align = decide_alignment (align, alg, expected_size, move_mode); ++ if (!TARGET_ALIGN_STRINGOPS || noalign) ++ align = desired_align; ++ ++ /* Step 1: Prologue guard. */ ++ ++ /* Alignment code needs count to be in register. */ ++ if (CONST_INT_P (count_exp) && desired_align > align) ++ { ++ if (INTVAL (count_exp) > desired_align ++ && INTVAL (count_exp) > size_needed) ++ { ++ align_bytes ++ = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT); ++ if (align_bytes <= 0) ++ align_bytes = 0; ++ else ++ align_bytes = desired_align - align_bytes; ++ } ++ if (align_bytes == 0) ++ count_exp = force_reg (counter_mode (count_exp), count_exp); ++ } ++ gcc_assert (desired_align >= 1 && align >= 1); ++ ++ /* Misaligned move sequences handle both prologue and epilogue at once. 
++ Default code generation results in a smaller code for large alignments ++ and also avoids redundant job when sizes are known precisely. */ ++ misaligned_prologue_used ++ = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES ++ && MAX (desired_align, epilogue_size_needed) <= 32 ++ && desired_align <= epilogue_size_needed ++ && ((desired_align > align && !align_bytes) ++ || (!count && epilogue_size_needed > 1))); ++ ++ /* Do the cheap promotion to allow better CSE across the ++ main loop and epilogue (ie one load of the big constant in the ++ front of all code. ++ For now the misaligned move sequences do not have fast path ++ without broadcasting. */ ++ if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used))) ++ { ++ if (alg == vector_loop) ++ { ++ gcc_assert (val_exp == const0_rtx); ++ vec_promoted_val = promote_duplicated_reg (move_mode, val_exp); ++ promoted_val = promote_duplicated_reg_to_size (val_exp, ++ GET_MODE_SIZE (word_mode), ++ desired_align, align); ++ } ++ else ++ { ++ promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed, ++ desired_align, align); ++ } ++ } ++ /* Misaligned move sequences handles both prologues and epilogues at once. ++ Default code generation results in smaller code for large alignments and ++ also avoids redundant job when sizes are known precisely. */ ++ if (misaligned_prologue_used) ++ { ++ /* Misaligned move prologue handled small blocks by itself. */ ++ expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves ++ (dst, src, &destreg, &srcreg, ++ move_mode, promoted_val, vec_promoted_val, ++ &count_exp, ++ &jump_around_label, ++ desired_align < align ++ ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed, ++ desired_align, align, &min_size, dynamic_check, issetmem); ++ if (!issetmem) ++ src = change_address (src, BLKmode, srcreg); ++ dst = change_address (dst, BLKmode, destreg); ++ set_mem_align (dst, desired_align * BITS_PER_UNIT); ++ epilogue_size_needed = 0; ++ if (need_zero_guard ++ && min_size < (unsigned HOST_WIDE_INT) size_needed) ++ { ++ /* It is possible that we copied enough so the main loop will not ++ execute. */ ++ gcc_assert (size_needed > 1); ++ if (jump_around_label == NULL_RTX) ++ jump_around_label = gen_label_rtx (); ++ emit_cmp_and_jump_insns (count_exp, ++ GEN_INT (size_needed), ++ LTU, 0, counter_mode (count_exp), 1, jump_around_label); ++ if (expected_size == -1 ++ || expected_size < (desired_align - align) / 2 + size_needed) ++ predict_jump (REG_BR_PROB_BASE * 20 / 100); ++ else ++ predict_jump (REG_BR_PROB_BASE * 60 / 100); ++ } ++ } ++ /* Ensure that alignment prologue won't copy past end of block. */ ++ else if (size_needed > 1 || (desired_align > 1 && desired_align > align)) ++ { ++ epilogue_size_needed = MAX (size_needed - 1, desired_align - align); ++ /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes. ++ Make sure it is power of 2. */ ++ epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1); ++ ++ /* To improve performance of small blocks, we jump around the VAL ++ promoting mode. This mean that if the promoted VAL is not constant, ++ we might not use it in the epilogue and have to use byte ++ loop variant. */ ++ if (issetmem && epilogue_size_needed > 2 && !promoted_val) ++ force_loopy_epilogue = true; ++ if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed) ++ || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed) ++ { ++ /* If main algorithm works on QImode, no epilogue is needed. ++ For small sizes just don't align anything. 
*/ ++ if (size_needed == 1) ++ desired_align = align; ++ else ++ goto epilogue; ++ } ++ else if (!count ++ && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed) ++ { ++ label = gen_label_rtx (); ++ emit_cmp_and_jump_insns (count_exp, ++ GEN_INT (epilogue_size_needed), ++ LTU, 0, counter_mode (count_exp), 1, label); ++ if (expected_size == -1 || expected_size < epilogue_size_needed) ++ predict_jump (REG_BR_PROB_BASE * 60 / 100); ++ else ++ predict_jump (REG_BR_PROB_BASE * 20 / 100); ++ } ++ } ++ ++ /* Emit code to decide on runtime whether library call or inline should be ++ used. */ ++ if (dynamic_check != -1) ++ { ++ if (!issetmem && CONST_INT_P (count_exp)) ++ { ++ if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check) ++ { ++ emit_block_copy_via_libcall (dst, src, count_exp); ++ count_exp = const0_rtx; ++ goto epilogue; ++ } ++ } ++ else ++ { ++ rtx_code_label *hot_label = gen_label_rtx (); ++ if (jump_around_label == NULL_RTX) ++ jump_around_label = gen_label_rtx (); ++ emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1), ++ LEU, 0, counter_mode (count_exp), ++ 1, hot_label); ++ predict_jump (REG_BR_PROB_BASE * 90 / 100); ++ if (issetmem) ++ set_storage_via_libcall (dst, count_exp, val_exp); ++ else ++ emit_block_copy_via_libcall (dst, src, count_exp); ++ emit_jump (jump_around_label); ++ emit_label (hot_label); ++ } ++ } ++ ++ /* Step 2: Alignment prologue. */ ++ /* Do the expensive promotion once we branched off the small blocks. */ ++ if (issetmem && !promoted_val) ++ promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed, ++ desired_align, align); ++ ++ if (desired_align > align && !misaligned_prologue_used) ++ { ++ if (align_bytes == 0) ++ { ++ /* Except for the first move in prologue, we no longer know ++ constant offset in aliasing info. It don't seems to worth ++ the pain to maintain it for the first move, so throw away ++ the info early. */ ++ dst = change_address (dst, BLKmode, destreg); ++ if (!issetmem) ++ src = change_address (src, BLKmode, srcreg); ++ dst = expand_set_or_cpymem_prologue (dst, src, destreg, srcreg, ++ promoted_val, vec_promoted_val, ++ count_exp, align, desired_align, ++ issetmem); ++ /* At most desired_align - align bytes are copied. */ ++ if (min_size < (unsigned)(desired_align - align)) ++ min_size = 0; ++ else ++ min_size -= desired_align - align; ++ } ++ else ++ { ++ /* If we know how many bytes need to be stored before dst is ++ sufficiently aligned, maintain aliasing info accurately. */ ++ dst = expand_set_or_cpymem_constant_prologue (dst, &src, destreg, ++ srcreg, ++ promoted_val, ++ vec_promoted_val, ++ desired_align, ++ align_bytes, ++ issetmem); ++ ++ count_exp = plus_constant (counter_mode (count_exp), ++ count_exp, -align_bytes); ++ count -= align_bytes; ++ min_size -= align_bytes; ++ max_size -= align_bytes; ++ } ++ if (need_zero_guard ++ && min_size < (unsigned HOST_WIDE_INT) size_needed ++ && (count < (unsigned HOST_WIDE_INT) size_needed ++ || (align_bytes == 0 ++ && count < ((unsigned HOST_WIDE_INT) size_needed ++ + desired_align - align)))) ++ { ++ /* It is possible that we copied enough so the main loop will not ++ execute. 
*/ ++ gcc_assert (size_needed > 1); ++ if (label == NULL_RTX) ++ label = gen_label_rtx (); ++ emit_cmp_and_jump_insns (count_exp, ++ GEN_INT (size_needed), ++ LTU, 0, counter_mode (count_exp), 1, label); ++ if (expected_size == -1 ++ || expected_size < (desired_align - align) / 2 + size_needed) ++ predict_jump (REG_BR_PROB_BASE * 20 / 100); ++ else ++ predict_jump (REG_BR_PROB_BASE * 60 / 100); ++ } ++ } ++ if (label && size_needed == 1) ++ { ++ emit_label (label); ++ LABEL_NUSES (label) = 1; ++ label = NULL; ++ epilogue_size_needed = 1; ++ if (issetmem) ++ promoted_val = val_exp; ++ } ++ else if (label == NULL_RTX && !misaligned_prologue_used) ++ epilogue_size_needed = size_needed; ++ ++ /* Step 3: Main loop. */ ++ ++ switch (alg) ++ { ++ case libcall: ++ case no_stringop: ++ case last_alg: ++ gcc_unreachable (); ++ case loop_1_byte: ++ case loop: ++ case unrolled_loop: ++ expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg, promoted_val, ++ count_exp, move_mode, unroll_factor, ++ expected_size, issetmem); ++ break; ++ case vector_loop: ++ expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg, ++ vec_promoted_val, count_exp, move_mode, ++ unroll_factor, expected_size, issetmem); ++ break; ++ case rep_prefix_8_byte: ++ case rep_prefix_4_byte: ++ case rep_prefix_1_byte: ++ expand_set_or_cpymem_via_rep (dst, src, destreg, srcreg, promoted_val, ++ val_exp, count_exp, move_mode, issetmem); ++ break; ++ } ++ /* Adjust properly the offset of src and dest memory for aliasing. */ ++ if (CONST_INT_P (count_exp)) ++ { ++ if (!issetmem) ++ src = adjust_automodify_address_nv (src, BLKmode, srcreg, ++ (count / size_needed) * size_needed); ++ dst = adjust_automodify_address_nv (dst, BLKmode, destreg, ++ (count / size_needed) * size_needed); ++ } ++ else ++ { ++ if (!issetmem) ++ src = change_address (src, BLKmode, srcreg); ++ dst = change_address (dst, BLKmode, destreg); ++ } ++ ++ /* Step 4: Epilogue to copy the remaining bytes. */ ++ epilogue: ++ if (label) ++ { ++ /* When the main loop is done, COUNT_EXP might hold original count, ++ while we want to copy only COUNT_EXP & SIZE_NEEDED bytes. ++ Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED ++ bytes. Compensate if needed. */ ++ ++ if (size_needed < epilogue_size_needed) ++ { ++ tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp, ++ GEN_INT (size_needed - 1), count_exp, 1, ++ OPTAB_DIRECT); ++ if (tmp != count_exp) ++ emit_move_insn (count_exp, tmp); ++ } ++ emit_label (label); ++ LABEL_NUSES (label) = 1; ++ } ++ ++ if (count_exp != const0_rtx && epilogue_size_needed > 1) ++ { ++ if (force_loopy_epilogue) ++ expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp, ++ epilogue_size_needed); ++ else ++ { ++ if (issetmem) ++ expand_setmem_epilogue (dst, destreg, promoted_val, ++ vec_promoted_val, count_exp, ++ epilogue_size_needed); ++ else ++ expand_cpymem_epilogue (dst, src, destreg, srcreg, count_exp, ++ epilogue_size_needed); ++ } ++ } ++ if (jump_around_label) ++ emit_label (jump_around_label); ++ return true; ++} ++ ++ ++/* Expand the appropriate insns for doing strlen if not just doing ++ repnz; scasb ++ ++ out = result, initialized with the start address ++ align_rtx = alignment of the address. ++ scratch = scratch register, initialized with the startaddress when ++ not aligned, otherwise undefined ++ ++ This is just the body. It needs the initializations mentioned above and ++ some address computing at the end. These things are done in i386.md. 
*/ ++ ++static void ++ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx) ++{ ++ int align; ++ rtx tmp; ++ rtx_code_label *align_2_label = NULL; ++ rtx_code_label *align_3_label = NULL; ++ rtx_code_label *align_4_label = gen_label_rtx (); ++ rtx_code_label *end_0_label = gen_label_rtx (); ++ rtx mem; ++ rtx tmpreg = gen_reg_rtx (SImode); ++ rtx scratch = gen_reg_rtx (SImode); ++ rtx cmp; ++ ++ align = 0; ++ if (CONST_INT_P (align_rtx)) ++ align = INTVAL (align_rtx); ++ ++ /* Loop to check 1..3 bytes for null to get an aligned pointer. */ ++ ++ /* Is there a known alignment and is it less than 4? */ ++ if (align < 4) ++ { ++ rtx scratch1 = gen_reg_rtx (Pmode); ++ emit_move_insn (scratch1, out); ++ /* Is there a known alignment and is it not 2? */ ++ if (align != 2) ++ { ++ align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */ ++ align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */ ++ ++ /* Leave just the 3 lower bits. */ ++ align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3), ++ NULL_RTX, 0, OPTAB_WIDEN); ++ ++ emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL, ++ Pmode, 1, align_4_label); ++ emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL, ++ Pmode, 1, align_2_label); ++ emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL, ++ Pmode, 1, align_3_label); ++ } ++ else ++ { ++ /* Since the alignment is 2, we have to check 2 or 0 bytes; ++ check if is aligned to 4 - byte. */ ++ ++ align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx, ++ NULL_RTX, 0, OPTAB_WIDEN); ++ ++ emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL, ++ Pmode, 1, align_4_label); ++ } ++ ++ mem = change_address (src, QImode, out); ++ ++ /* Now compare the bytes. */ ++ ++ /* Compare the first n unaligned byte on a byte per byte basis. */ ++ emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, ++ QImode, 1, end_0_label); ++ ++ /* Increment the address. */ ++ emit_insn (ix86_gen_add3 (out, out, const1_rtx)); ++ ++ /* Not needed with an alignment of 2 */ ++ if (align != 2) ++ { ++ emit_label (align_2_label); ++ ++ emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1, ++ end_0_label); ++ ++ emit_insn (ix86_gen_add3 (out, out, const1_rtx)); ++ ++ emit_label (align_3_label); ++ } ++ ++ emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1, ++ end_0_label); ++ ++ emit_insn (ix86_gen_add3 (out, out, const1_rtx)); ++ } ++ ++ /* Generate loop to check 4 bytes at a time. It is not a good idea to ++ align this loop. It gives only huge programs, but does not help to ++ speed up. */ ++ emit_label (align_4_label); ++ ++ mem = change_address (src, SImode, out); ++ emit_move_insn (scratch, mem); ++ emit_insn (ix86_gen_add3 (out, out, GEN_INT (4))); ++ ++ /* This formula yields a nonzero result iff one of the bytes is zero. ++ This saves three branches inside loop and many cycles. */ ++ ++ emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101))); ++ emit_insn (gen_one_cmplsi2 (scratch, scratch)); ++ emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch)); ++ emit_insn (gen_andsi3 (tmpreg, tmpreg, ++ gen_int_mode (0x80808080, SImode))); ++ emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1, ++ align_4_label); ++ ++ if (TARGET_CMOVE) ++ { ++ rtx reg = gen_reg_rtx (SImode); ++ rtx reg2 = gen_reg_rtx (Pmode); ++ emit_move_insn (reg, tmpreg); ++ emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16))); ++ ++ /* If zero is not in the first two bytes, move two bytes forward. 
*/ ++ emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080))); ++ tmp = gen_rtx_REG (CCNOmode, FLAGS_REG); ++ tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx); ++ emit_insn (gen_rtx_SET (tmpreg, ++ gen_rtx_IF_THEN_ELSE (SImode, tmp, ++ reg, ++ tmpreg))); ++ /* Emit lea manually to avoid clobbering of flags. */ ++ emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx))); ++ ++ tmp = gen_rtx_REG (CCNOmode, FLAGS_REG); ++ tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx); ++ emit_insn (gen_rtx_SET (out, ++ gen_rtx_IF_THEN_ELSE (Pmode, tmp, ++ reg2, ++ out))); ++ } ++ else ++ { ++ rtx_code_label *end_2_label = gen_label_rtx (); ++ /* Is zero in the first two bytes? */ ++ ++ emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080))); ++ tmp = gen_rtx_REG (CCNOmode, FLAGS_REG); ++ tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx); ++ tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, ++ gen_rtx_LABEL_REF (VOIDmode, end_2_label), ++ pc_rtx); ++ tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); ++ JUMP_LABEL (tmp) = end_2_label; ++ ++ /* Not in the first two. Move two bytes forward. */ ++ emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16))); ++ emit_insn (ix86_gen_add3 (out, out, const2_rtx)); ++ ++ emit_label (end_2_label); ++ ++ } ++ ++ /* Avoid branch in fixing the byte. */ ++ tmpreg = gen_lowpart (QImode, tmpreg); ++ emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg)); ++ tmp = gen_rtx_REG (CCmode, FLAGS_REG); ++ cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx); ++ emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp)); ++ ++ emit_label (end_0_label); ++} ++ ++/* Expand strlen. */ ++ ++bool ++ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align) ++{ ++if (TARGET_UNROLL_STRLEN ++ && TARGET_INLINE_ALL_STRINGOPS ++ && eoschar == const0_rtx ++ && optimize > 1) ++ { ++ /* The generic case of strlen expander is long. Avoid it's ++ expanding unless TARGET_INLINE_ALL_STRINGOPS. */ ++ rtx addr = force_reg (Pmode, XEXP (src, 0)); ++ /* Well it seems that some optimizer does not combine a call like ++ foo(strlen(bar), strlen(bar)); ++ when the move and the subtraction is done here. It does calculate ++ the length just once when these instructions are done inside of ++ output_strlen_unroll(). But I think since &bar[strlen(bar)] is ++ often used and I use one fewer register for the lifetime of ++ output_strlen_unroll() this is better. */ ++ ++ emit_move_insn (out, addr); ++ ++ ix86_expand_strlensi_unroll_1 (out, src, align); ++ ++ /* strlensi_unroll_1 returns the address of the zero at the end of ++ the string, like memchr(), so compute the length by subtracting ++ the start address. */ ++ emit_insn (ix86_gen_sub3 (out, out, addr)); ++ return true; ++ } ++ else ++ return false; ++} ++ ++/* For given symbol (function) construct code to compute address of it's PLT ++ entry in large x86-64 PIC model. */ ++ ++static rtx ++construct_plt_address (rtx symbol) ++{ ++ rtx tmp, unspec; ++ ++ gcc_assert (GET_CODE (symbol) == SYMBOL_REF); ++ gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF); ++ gcc_assert (Pmode == DImode); ++ ++ tmp = gen_reg_rtx (Pmode); ++ unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF); ++ ++ emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec)); ++ emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx)); ++ return tmp; ++} ++ ++/* Additional registers that are clobbered by SYSV calls. 
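The unrolled strlen loop above relies on the classic word-at-a-time zero-byte test: (x - 0x01010101) & ~x & 0x80808080 is nonzero exactly when some byte of x is zero. When no byte is zero the subtraction produces no borrows, so no byte position can have its top bit set in both x - 0x01010101 and ~x; a zero byte, by contrast, borrows into its top bit while its complement keeps that bit set. A self-contained version of the check:

#include <stdint.h>

/* Nonzero iff some byte of X is zero -- the same add/not/and formula the
   unrolled loop computes on each 4-byte chunk.  */
static int
has_zero_byte (uint32_t x)
{
  return ((x - 0x01010101u) & ~x & 0x80808080u) != 0;
}

For example, has_zero_byte (0x64636261) is 0, while has_zero_byte (0x00636261) is nonzero.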
*/ ++ ++static int const x86_64_ms_sysv_extra_clobbered_registers ++ [NUM_X86_64_MS_CLOBBERED_REGS] = ++{ ++ SI_REG, DI_REG, ++ XMM6_REG, XMM7_REG, ++ XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG, ++ XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG ++}; ++ ++rtx_insn * ++ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1, ++ rtx callarg2, ++ rtx pop, bool sibcall) ++{ ++ rtx vec[3]; ++ rtx use = NULL, call; ++ unsigned int vec_len = 0; ++ tree fndecl; ++ ++ if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF) ++ { ++ fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0)); ++ if (fndecl ++ && (lookup_attribute ("interrupt", ++ TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))) ++ error ("interrupt service routine cannot be called directly"); ++ } ++ else ++ fndecl = NULL_TREE; ++ ++ if (pop == const0_rtx) ++ pop = NULL; ++ gcc_assert (!TARGET_64BIT || !pop); ++ ++ if (TARGET_MACHO && !TARGET_64BIT) ++ { ++#if TARGET_MACHO ++ if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF) ++ fnaddr = machopic_indirect_call_target (fnaddr); ++#endif ++ } ++ else ++ { ++ /* Static functions and indirect calls don't need the pic register. Also, ++ check if PLT was explicitly avoided via no-plt or "noplt" attribute, making ++ it an indirect call. */ ++ rtx addr = XEXP (fnaddr, 0); ++ if (flag_pic ++ && GET_CODE (addr) == SYMBOL_REF ++ && !SYMBOL_REF_LOCAL_P (addr)) ++ { ++ if (flag_plt ++ && (SYMBOL_REF_DECL (addr) == NULL_TREE ++ || !lookup_attribute ("noplt", ++ DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr))))) ++ { ++ if (!TARGET_64BIT ++ || (ix86_cmodel == CM_LARGE_PIC ++ && DEFAULT_ABI != MS_ABI)) ++ { ++ use_reg (&use, gen_rtx_REG (Pmode, ++ REAL_PIC_OFFSET_TABLE_REGNUM)); ++ if (ix86_use_pseudo_pic_reg ()) ++ emit_move_insn (gen_rtx_REG (Pmode, ++ REAL_PIC_OFFSET_TABLE_REGNUM), ++ pic_offset_table_rtx); ++ } ++ } ++ else if (!TARGET_PECOFF && !TARGET_MACHO) ++ { ++ if (TARGET_64BIT) ++ { ++ fnaddr = gen_rtx_UNSPEC (Pmode, ++ gen_rtvec (1, addr), ++ UNSPEC_GOTPCREL); ++ fnaddr = gen_rtx_CONST (Pmode, fnaddr); ++ } ++ else ++ { ++ fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), ++ UNSPEC_GOT); ++ fnaddr = gen_rtx_CONST (Pmode, fnaddr); ++ fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, ++ fnaddr); ++ } ++ fnaddr = gen_const_mem (Pmode, fnaddr); ++ /* Pmode may not be the same as word_mode for x32, which ++ doesn't support indirect branch via 32-bit memory slot. ++ Since x32 GOT slot is 64 bit with zero upper 32 bits, ++ indirect branch via x32 GOT slot is OK. */ ++ if (GET_MODE (fnaddr) != word_mode) ++ fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr); ++ fnaddr = gen_rtx_MEM (QImode, fnaddr); ++ } ++ } ++ } ++ ++ /* Skip setting up RAX register for -mskip-rax-setup when there are no ++ parameters passed in vector registers. */ ++ if (TARGET_64BIT ++ && (INTVAL (callarg2) > 0 ++ || (INTVAL (callarg2) == 0 ++ && (TARGET_SSE || !flag_skip_rax_setup)))) ++ { ++ rtx al = gen_rtx_REG (QImode, AX_REG); ++ emit_move_insn (al, callarg2); ++ use_reg (&use, al); ++ } ++ ++ if (ix86_cmodel == CM_LARGE_PIC ++ && !TARGET_PECOFF ++ && MEM_P (fnaddr) ++ && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF ++ && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode)) ++ fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0))); ++ /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect ++ branch via x32 GOT slot is OK. */ ++ else if (!(TARGET_X32 ++ && MEM_P (fnaddr) ++ && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND ++ && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode)) ++ && (sibcall ++ ? 
!sibcall_insn_operand (XEXP (fnaddr, 0), word_mode) ++ : !call_insn_operand (XEXP (fnaddr, 0), word_mode))) ++ { ++ fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1); ++ fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr)); ++ } ++ ++ call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1); ++ ++ if (retval) ++ call = gen_rtx_SET (retval, call); ++ vec[vec_len++] = call; ++ ++ if (pop) ++ { ++ pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop); ++ pop = gen_rtx_SET (stack_pointer_rtx, pop); ++ vec[vec_len++] = pop; ++ } ++ ++ if (cfun->machine->no_caller_saved_registers ++ && (!fndecl ++ || (!TREE_THIS_VOLATILE (fndecl) ++ && !lookup_attribute ("no_caller_saved_registers", ++ TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))) ++ { ++ static const char ix86_call_used_regs[] = CALL_USED_REGISTERS; ++ bool is_64bit_ms_abi = (TARGET_64BIT ++ && ix86_function_abi (fndecl) == MS_ABI); ++ char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi); ++ ++ /* If there are no caller-saved registers, add all registers ++ that are clobbered by the call which returns. */ ++ for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++) ++ if (!fixed_regs[i] ++ && (ix86_call_used_regs[i] == 1 ++ || (ix86_call_used_regs[i] & c_mask)) ++ && !STACK_REGNO_P (i) ++ && !MMX_REGNO_P (i)) ++ clobber_reg (&use, ++ gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i)); ++ } ++ else if (TARGET_64BIT_MS_ABI ++ && (!callarg2 || INTVAL (callarg2) != -2)) ++ { ++ unsigned i; ++ ++ for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++) ++ { ++ int regno = x86_64_ms_sysv_extra_clobbered_registers[i]; ++ machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode; ++ ++ clobber_reg (&use, gen_rtx_REG (mode, regno)); ++ } ++ ++ /* Set here, but it may get cleared later. */ ++ if (TARGET_CALL_MS2SYSV_XLOGUES) ++ { ++ if (!TARGET_SSE) ++ ; ++ ++ /* Don't break hot-patched functions. */ ++ else if (ix86_function_ms_hook_prologue (current_function_decl)) ++ ; ++ ++ /* TODO: Cases not yet examined. */ ++ else if (flag_split_stack) ++ warn_once_call_ms2sysv_xlogues ("-fsplit-stack"); ++ ++ else ++ { ++ gcc_assert (!reload_completed); ++ cfun->machine->call_ms2sysv = true; ++ } ++ } ++ } ++ ++ if (vec_len > 1) ++ call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec)); ++ rtx_insn *call_insn = emit_call_insn (call); ++ if (use) ++ CALL_INSN_FUNCTION_USAGE (call_insn) = use; ++ ++ return call_insn; ++} ++ ++/* Split simple return with popping POPC bytes from stack to indirect ++ branch with stack adjustment . */ ++ ++void ++ix86_split_simple_return_pop_internal (rtx popc) ++{ ++ struct machine_function *m = cfun->machine; ++ rtx ecx = gen_rtx_REG (SImode, CX_REG); ++ rtx_insn *insn; ++ ++ /* There is no "pascal" calling convention in any 64bit ABI. */ ++ gcc_assert (!TARGET_64BIT); ++ ++ insn = emit_insn (gen_pop (ecx)); ++ m->fs.cfa_offset -= UNITS_PER_WORD; ++ m->fs.sp_offset -= UNITS_PER_WORD; ++ ++ rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD); ++ x = gen_rtx_SET (stack_pointer_rtx, x); ++ add_reg_note (insn, REG_CFA_ADJUST_CFA, x); ++ add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx)); ++ RTX_FRAME_RELATED_P (insn) = 1; ++ ++ x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc); ++ x = gen_rtx_SET (stack_pointer_rtx, x); ++ insn = emit_insn (x); ++ add_reg_note (insn, REG_CFA_ADJUST_CFA, x); ++ RTX_FRAME_RELATED_P (insn) = 1; ++ ++ /* Now return address is in ECX. 
*/ ++ emit_jump_insn (gen_simple_return_indirect_internal (ecx)); ++} ++ ++/* Errors in the source file can cause expand_expr to return const0_rtx ++ where we expect a vector. To avoid crashing, use one of the vector ++ clear instructions. */ ++ ++static rtx ++safe_vector_operand (rtx x, machine_mode mode) ++{ ++ if (x == const0_rtx) ++ x = CONST0_RTX (mode); ++ return x; ++} ++ ++/* Subroutine of ix86_expand_builtin to take care of binop insns. */ ++ ++static rtx ++ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target) ++{ ++ rtx pat; ++ tree arg0 = CALL_EXPR_ARG (exp, 0); ++ tree arg1 = CALL_EXPR_ARG (exp, 1); ++ rtx op0 = expand_normal (arg0); ++ rtx op1 = expand_normal (arg1); ++ machine_mode tmode = insn_data[icode].operand[0].mode; ++ machine_mode mode0 = insn_data[icode].operand[1].mode; ++ machine_mode mode1 = insn_data[icode].operand[2].mode; ++ ++ if (VECTOR_MODE_P (mode0)) ++ op0 = safe_vector_operand (op0, mode0); ++ if (VECTOR_MODE_P (mode1)) ++ op1 = safe_vector_operand (op1, mode1); ++ ++ if (optimize || !target ++ || GET_MODE (target) != tmode ++ || !insn_data[icode].operand[0].predicate (target, tmode)) ++ target = gen_reg_rtx (tmode); ++ ++ if (GET_MODE (op1) == SImode && mode1 == TImode) ++ { ++ rtx x = gen_reg_rtx (V4SImode); ++ emit_insn (gen_sse2_loadd (x, op1)); ++ op1 = gen_lowpart (TImode, x); ++ } ++ ++ if (!insn_data[icode].operand[1].predicate (op0, mode0)) ++ op0 = copy_to_mode_reg (mode0, op0); ++ if (!insn_data[icode].operand[2].predicate (op1, mode1)) ++ op1 = copy_to_mode_reg (mode1, op1); ++ ++ pat = GEN_FCN (icode) (target, op0, op1); ++ if (! pat) ++ return 0; ++ ++ emit_insn (pat); ++ ++ return target; ++} ++ ++/* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */ ++ ++static rtx ++ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target, ++ enum ix86_builtin_func_type m_type, ++ enum rtx_code sub_code) ++{ ++ rtx pat; ++ int i; ++ int nargs; ++ bool comparison_p = false; ++ bool tf_p = false; ++ bool last_arg_constant = false; ++ int num_memory = 0; ++ struct { ++ rtx op; ++ machine_mode mode; ++ } args[4]; ++ ++ machine_mode tmode = insn_data[icode].operand[0].mode; ++ ++ switch (m_type) ++ { ++ case MULTI_ARG_4_DF2_DI_I: ++ case MULTI_ARG_4_DF2_DI_I1: ++ case MULTI_ARG_4_SF2_SI_I: ++ case MULTI_ARG_4_SF2_SI_I1: ++ nargs = 4; ++ last_arg_constant = true; ++ break; ++ ++ case MULTI_ARG_3_SF: ++ case MULTI_ARG_3_DF: ++ case MULTI_ARG_3_SF2: ++ case MULTI_ARG_3_DF2: ++ case MULTI_ARG_3_DI: ++ case MULTI_ARG_3_SI: ++ case MULTI_ARG_3_SI_DI: ++ case MULTI_ARG_3_HI: ++ case MULTI_ARG_3_HI_SI: ++ case MULTI_ARG_3_QI: ++ case MULTI_ARG_3_DI2: ++ case MULTI_ARG_3_SI2: ++ case MULTI_ARG_3_HI2: ++ case MULTI_ARG_3_QI2: ++ nargs = 3; ++ break; ++ ++ case MULTI_ARG_2_SF: ++ case MULTI_ARG_2_DF: ++ case MULTI_ARG_2_DI: ++ case MULTI_ARG_2_SI: ++ case MULTI_ARG_2_HI: ++ case MULTI_ARG_2_QI: ++ nargs = 2; ++ break; ++ ++ case MULTI_ARG_2_DI_IMM: ++ case MULTI_ARG_2_SI_IMM: ++ case MULTI_ARG_2_HI_IMM: ++ case MULTI_ARG_2_QI_IMM: ++ nargs = 2; ++ last_arg_constant = true; ++ break; ++ ++ case MULTI_ARG_1_SF: ++ case MULTI_ARG_1_DF: ++ case MULTI_ARG_1_SF2: ++ case MULTI_ARG_1_DF2: ++ case MULTI_ARG_1_DI: ++ case MULTI_ARG_1_SI: ++ case MULTI_ARG_1_HI: ++ case MULTI_ARG_1_QI: ++ case MULTI_ARG_1_SI_DI: ++ case MULTI_ARG_1_HI_DI: ++ case MULTI_ARG_1_HI_SI: ++ case MULTI_ARG_1_QI_DI: ++ case MULTI_ARG_1_QI_SI: ++ case MULTI_ARG_1_QI_HI: ++ nargs = 1; ++ break; ++ ++ case MULTI_ARG_2_DI_CMP: ++ case MULTI_ARG_2_SI_CMP: 
++ case MULTI_ARG_2_HI_CMP: ++ case MULTI_ARG_2_QI_CMP: ++ nargs = 2; ++ comparison_p = true; ++ break; ++ ++ case MULTI_ARG_2_SF_TF: ++ case MULTI_ARG_2_DF_TF: ++ case MULTI_ARG_2_DI_TF: ++ case MULTI_ARG_2_SI_TF: ++ case MULTI_ARG_2_HI_TF: ++ case MULTI_ARG_2_QI_TF: ++ nargs = 2; ++ tf_p = true; ++ break; ++ ++ default: ++ gcc_unreachable (); ++ } ++ ++ if (optimize || !target ++ || GET_MODE (target) != tmode ++ || !insn_data[icode].operand[0].predicate (target, tmode)) ++ target = gen_reg_rtx (tmode); ++ else if (memory_operand (target, tmode)) ++ num_memory++; ++ ++ gcc_assert (nargs <= 4); ++ ++ for (i = 0; i < nargs; i++) ++ { ++ tree arg = CALL_EXPR_ARG (exp, i); ++ rtx op = expand_normal (arg); ++ int adjust = (comparison_p) ? 1 : 0; ++ machine_mode mode = insn_data[icode].operand[i+adjust+1].mode; ++ ++ if (last_arg_constant && i == nargs - 1) ++ { ++ if (!insn_data[icode].operand[i + 1].predicate (op, mode)) ++ { ++ enum insn_code new_icode = icode; ++ switch (icode) ++ { ++ case CODE_FOR_xop_vpermil2v2df3: ++ case CODE_FOR_xop_vpermil2v4sf3: ++ case CODE_FOR_xop_vpermil2v4df3: ++ case CODE_FOR_xop_vpermil2v8sf3: ++ error ("the last argument must be a 2-bit immediate"); ++ return gen_reg_rtx (tmode); ++ case CODE_FOR_xop_rotlv2di3: ++ new_icode = CODE_FOR_rotlv2di3; ++ goto xop_rotl; ++ case CODE_FOR_xop_rotlv4si3: ++ new_icode = CODE_FOR_rotlv4si3; ++ goto xop_rotl; ++ case CODE_FOR_xop_rotlv8hi3: ++ new_icode = CODE_FOR_rotlv8hi3; ++ goto xop_rotl; ++ case CODE_FOR_xop_rotlv16qi3: ++ new_icode = CODE_FOR_rotlv16qi3; ++ xop_rotl: ++ if (CONST_INT_P (op)) ++ { ++ int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1; ++ op = GEN_INT (INTVAL (op) & mask); ++ gcc_checking_assert ++ (insn_data[icode].operand[i + 1].predicate (op, mode)); ++ } ++ else ++ { ++ gcc_checking_assert ++ (nargs == 2 ++ && insn_data[new_icode].operand[0].mode == tmode ++ && insn_data[new_icode].operand[1].mode == tmode ++ && insn_data[new_icode].operand[2].mode == mode ++ && insn_data[new_icode].operand[0].predicate ++ == insn_data[icode].operand[0].predicate ++ && insn_data[new_icode].operand[1].predicate ++ == insn_data[icode].operand[1].predicate); ++ icode = new_icode; ++ goto non_constant; ++ } ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ } ++ } ++ else ++ { ++ non_constant: ++ if (VECTOR_MODE_P (mode)) ++ op = safe_vector_operand (op, mode); ++ ++ /* If we aren't optimizing, only allow one memory operand to be ++ generated. */ ++ if (memory_operand (op, mode)) ++ num_memory++; ++ ++ gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode); ++ ++ if (optimize ++ || !insn_data[icode].operand[i+adjust+1].predicate (op, mode) ++ || num_memory > 1) ++ op = force_reg (mode, op); ++ } ++ ++ args[i].op = op; ++ args[i].mode = mode; ++ } ++ ++ switch (nargs) ++ { ++ case 1: ++ pat = GEN_FCN (icode) (target, args[0].op); ++ break; ++ ++ case 2: ++ if (tf_p) ++ pat = GEN_FCN (icode) (target, args[0].op, args[1].op, ++ GEN_INT ((int)sub_code)); ++ else if (! comparison_p) ++ pat = GEN_FCN (icode) (target, args[0].op, args[1].op); ++ else ++ { ++ rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target), ++ args[0].op, ++ args[1].op); ++ ++ pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op); ++ } ++ break; ++ ++ case 3: ++ pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op); ++ break; ++ ++ case 4: ++ pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op); ++ break; ++ ++ default: ++ gcc_unreachable (); ++ } ++ ++ if (! 
pat) ++ return 0; ++ ++ emit_insn (pat); ++ return target; ++} ++ ++/* Subroutine of ix86_expand_args_builtin to take care of scalar unop ++ insns with vec_merge. */ ++ ++static rtx ++ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp, ++ rtx target) ++{ ++ rtx pat; ++ tree arg0 = CALL_EXPR_ARG (exp, 0); ++ rtx op1, op0 = expand_normal (arg0); ++ machine_mode tmode = insn_data[icode].operand[0].mode; ++ machine_mode mode0 = insn_data[icode].operand[1].mode; ++ ++ if (optimize || !target ++ || GET_MODE (target) != tmode ++ || !insn_data[icode].operand[0].predicate (target, tmode)) ++ target = gen_reg_rtx (tmode); ++ ++ if (VECTOR_MODE_P (mode0)) ++ op0 = safe_vector_operand (op0, mode0); ++ ++ if ((optimize && !register_operand (op0, mode0)) ++ || !insn_data[icode].operand[1].predicate (op0, mode0)) ++ op0 = copy_to_mode_reg (mode0, op0); ++ ++ op1 = op0; ++ if (!insn_data[icode].operand[2].predicate (op1, mode0)) ++ op1 = copy_to_mode_reg (mode0, op1); ++ ++ pat = GEN_FCN (icode) (target, op0, op1); ++ if (! pat) ++ return 0; ++ emit_insn (pat); ++ return target; ++} ++ ++/* Subroutine of ix86_expand_builtin to take care of comparison insns. */ ++ ++static rtx ++ix86_expand_sse_compare (const struct builtin_description *d, ++ tree exp, rtx target, bool swap) ++{ ++ rtx pat; ++ tree arg0 = CALL_EXPR_ARG (exp, 0); ++ tree arg1 = CALL_EXPR_ARG (exp, 1); ++ rtx op0 = expand_normal (arg0); ++ rtx op1 = expand_normal (arg1); ++ rtx op2; ++ machine_mode tmode = insn_data[d->icode].operand[0].mode; ++ machine_mode mode0 = insn_data[d->icode].operand[1].mode; ++ machine_mode mode1 = insn_data[d->icode].operand[2].mode; ++ enum rtx_code comparison = d->comparison; ++ ++ if (VECTOR_MODE_P (mode0)) ++ op0 = safe_vector_operand (op0, mode0); ++ if (VECTOR_MODE_P (mode1)) ++ op1 = safe_vector_operand (op1, mode1); ++ ++ /* Swap operands if we have a comparison that isn't available in ++ hardware. */ ++ if (swap) ++ std::swap (op0, op1); ++ ++ if (optimize || !target ++ || GET_MODE (target) != tmode ++ || !insn_data[d->icode].operand[0].predicate (target, tmode)) ++ target = gen_reg_rtx (tmode); ++ ++ if ((optimize && !register_operand (op0, mode0)) ++ || !insn_data[d->icode].operand[1].predicate (op0, mode0)) ++ op0 = copy_to_mode_reg (mode0, op0); ++ if ((optimize && !register_operand (op1, mode1)) ++ || !insn_data[d->icode].operand[2].predicate (op1, mode1)) ++ op1 = copy_to_mode_reg (mode1, op1); ++ ++ op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1); ++ pat = GEN_FCN (d->icode) (target, op0, op1, op2); ++ if (! pat) ++ return 0; ++ emit_insn (pat); ++ return target; ++} ++ ++/* Subroutine of ix86_expand_builtin to take care of comi insns. */ ++ ++static rtx ++ix86_expand_sse_comi (const struct builtin_description *d, tree exp, ++ rtx target) ++{ ++ rtx pat; ++ tree arg0 = CALL_EXPR_ARG (exp, 0); ++ tree arg1 = CALL_EXPR_ARG (exp, 1); ++ rtx op0 = expand_normal (arg0); ++ rtx op1 = expand_normal (arg1); ++ machine_mode mode0 = insn_data[d->icode].operand[0].mode; ++ machine_mode mode1 = insn_data[d->icode].operand[1].mode; ++ enum rtx_code comparison = d->comparison; ++ ++ if (VECTOR_MODE_P (mode0)) ++ op0 = safe_vector_operand (op0, mode0); ++ if (VECTOR_MODE_P (mode1)) ++ op1 = safe_vector_operand (op1, mode1); ++ ++ /* Swap operands if we have a comparison that isn't available in ++ hardware. 
*/ ++ if (d->flag & BUILTIN_DESC_SWAP_OPERANDS) ++ std::swap (op0, op1); ++ ++ target = gen_reg_rtx (SImode); ++ emit_move_insn (target, const0_rtx); ++ target = gen_rtx_SUBREG (QImode, target, 0); ++ ++ if ((optimize && !register_operand (op0, mode0)) ++ || !insn_data[d->icode].operand[0].predicate (op0, mode0)) ++ op0 = copy_to_mode_reg (mode0, op0); ++ if ((optimize && !register_operand (op1, mode1)) ++ || !insn_data[d->icode].operand[1].predicate (op1, mode1)) ++ op1 = copy_to_mode_reg (mode1, op1); ++ ++ pat = GEN_FCN (d->icode) (op0, op1); ++ if (! pat) ++ return 0; ++ emit_insn (pat); ++ emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target), ++ gen_rtx_fmt_ee (comparison, QImode, ++ SET_DEST (pat), ++ const0_rtx))); ++ ++ return SUBREG_REG (target); ++} ++ ++/* Subroutines of ix86_expand_args_builtin to take care of round insns. */ ++ ++static rtx ++ix86_expand_sse_round (const struct builtin_description *d, tree exp, ++ rtx target) ++{ ++ rtx pat; ++ tree arg0 = CALL_EXPR_ARG (exp, 0); ++ rtx op1, op0 = expand_normal (arg0); ++ machine_mode tmode = insn_data[d->icode].operand[0].mode; ++ machine_mode mode0 = insn_data[d->icode].operand[1].mode; ++ ++ if (optimize || target == 0 ++ || GET_MODE (target) != tmode ++ || !insn_data[d->icode].operand[0].predicate (target, tmode)) ++ target = gen_reg_rtx (tmode); ++ ++ if (VECTOR_MODE_P (mode0)) ++ op0 = safe_vector_operand (op0, mode0); ++ ++ if ((optimize && !register_operand (op0, mode0)) ++ || !insn_data[d->icode].operand[0].predicate (op0, mode0)) ++ op0 = copy_to_mode_reg (mode0, op0); ++ ++ op1 = GEN_INT (d->comparison); ++ ++ pat = GEN_FCN (d->icode) (target, op0, op1); ++ if (! pat) ++ return 0; ++ emit_insn (pat); ++ return target; ++} ++ ++static rtx ++ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d, ++ tree exp, rtx target) ++{ ++ rtx pat; ++ tree arg0 = CALL_EXPR_ARG (exp, 0); ++ tree arg1 = CALL_EXPR_ARG (exp, 1); ++ rtx op0 = expand_normal (arg0); ++ rtx op1 = expand_normal (arg1); ++ rtx op2; ++ machine_mode tmode = insn_data[d->icode].operand[0].mode; ++ machine_mode mode0 = insn_data[d->icode].operand[1].mode; ++ machine_mode mode1 = insn_data[d->icode].operand[2].mode; ++ ++ if (optimize || target == 0 ++ || GET_MODE (target) != tmode ++ || !insn_data[d->icode].operand[0].predicate (target, tmode)) ++ target = gen_reg_rtx (tmode); ++ ++ op0 = safe_vector_operand (op0, mode0); ++ op1 = safe_vector_operand (op1, mode1); ++ ++ if ((optimize && !register_operand (op0, mode0)) ++ || !insn_data[d->icode].operand[0].predicate (op0, mode0)) ++ op0 = copy_to_mode_reg (mode0, op0); ++ if ((optimize && !register_operand (op1, mode1)) ++ || !insn_data[d->icode].operand[1].predicate (op1, mode1)) ++ op1 = copy_to_mode_reg (mode1, op1); ++ ++ op2 = GEN_INT (d->comparison); ++ ++ pat = GEN_FCN (d->icode) (target, op0, op1, op2); ++ if (! pat) ++ return 0; ++ emit_insn (pat); ++ return target; ++} ++ ++/* Subroutine of ix86_expand_builtin to take care of ptest insns. 
*/ ++ ++static rtx ++ix86_expand_sse_ptest (const struct builtin_description *d, tree exp, ++ rtx target) ++{ ++ rtx pat; ++ tree arg0 = CALL_EXPR_ARG (exp, 0); ++ tree arg1 = CALL_EXPR_ARG (exp, 1); ++ rtx op0 = expand_normal (arg0); ++ rtx op1 = expand_normal (arg1); ++ machine_mode mode0 = insn_data[d->icode].operand[0].mode; ++ machine_mode mode1 = insn_data[d->icode].operand[1].mode; ++ enum rtx_code comparison = d->comparison; ++ ++ if (VECTOR_MODE_P (mode0)) ++ op0 = safe_vector_operand (op0, mode0); ++ if (VECTOR_MODE_P (mode1)) ++ op1 = safe_vector_operand (op1, mode1); ++ ++ target = gen_reg_rtx (SImode); ++ emit_move_insn (target, const0_rtx); ++ target = gen_rtx_SUBREG (QImode, target, 0); ++ ++ if ((optimize && !register_operand (op0, mode0)) ++ || !insn_data[d->icode].operand[0].predicate (op0, mode0)) ++ op0 = copy_to_mode_reg (mode0, op0); ++ if ((optimize && !register_operand (op1, mode1)) ++ || !insn_data[d->icode].operand[1].predicate (op1, mode1)) ++ op1 = copy_to_mode_reg (mode1, op1); ++ ++ pat = GEN_FCN (d->icode) (op0, op1); ++ if (! pat) ++ return 0; ++ emit_insn (pat); ++ emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target), ++ gen_rtx_fmt_ee (comparison, QImode, ++ SET_DEST (pat), ++ const0_rtx))); ++ ++ return SUBREG_REG (target); ++} ++ ++/* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */ ++ ++static rtx ++ix86_expand_sse_pcmpestr (const struct builtin_description *d, ++ tree exp, rtx target) ++{ ++ rtx pat; ++ tree arg0 = CALL_EXPR_ARG (exp, 0); ++ tree arg1 = CALL_EXPR_ARG (exp, 1); ++ tree arg2 = CALL_EXPR_ARG (exp, 2); ++ tree arg3 = CALL_EXPR_ARG (exp, 3); ++ tree arg4 = CALL_EXPR_ARG (exp, 4); ++ rtx scratch0, scratch1; ++ rtx op0 = expand_normal (arg0); ++ rtx op1 = expand_normal (arg1); ++ rtx op2 = expand_normal (arg2); ++ rtx op3 = expand_normal (arg3); ++ rtx op4 = expand_normal (arg4); ++ machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm; ++ ++ tmode0 = insn_data[d->icode].operand[0].mode; ++ tmode1 = insn_data[d->icode].operand[1].mode; ++ modev2 = insn_data[d->icode].operand[2].mode; ++ modei3 = insn_data[d->icode].operand[3].mode; ++ modev4 = insn_data[d->icode].operand[4].mode; ++ modei5 = insn_data[d->icode].operand[5].mode; ++ modeimm = insn_data[d->icode].operand[6].mode; ++ ++ if (VECTOR_MODE_P (modev2)) ++ op0 = safe_vector_operand (op0, modev2); ++ if (VECTOR_MODE_P (modev4)) ++ op2 = safe_vector_operand (op2, modev4); ++ ++ if (!insn_data[d->icode].operand[2].predicate (op0, modev2)) ++ op0 = copy_to_mode_reg (modev2, op0); ++ if (!insn_data[d->icode].operand[3].predicate (op1, modei3)) ++ op1 = copy_to_mode_reg (modei3, op1); ++ if ((optimize && !register_operand (op2, modev4)) ++ || !insn_data[d->icode].operand[4].predicate (op2, modev4)) ++ op2 = copy_to_mode_reg (modev4, op2); ++ if (!insn_data[d->icode].operand[5].predicate (op3, modei5)) ++ op3 = copy_to_mode_reg (modei5, op3); ++ ++ if (!insn_data[d->icode].operand[6].predicate (op4, modeimm)) ++ { ++ error ("the fifth argument must be an 8-bit immediate"); ++ return const0_rtx; ++ } ++ ++ if (d->code == IX86_BUILTIN_PCMPESTRI128) ++ { ++ if (optimize || !target ++ || GET_MODE (target) != tmode0 ++ || !insn_data[d->icode].operand[0].predicate (target, tmode0)) ++ target = gen_reg_rtx (tmode0); ++ ++ scratch1 = gen_reg_rtx (tmode1); ++ ++ pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4); ++ } ++ else if (d->code == IX86_BUILTIN_PCMPESTRM128) ++ { ++ if (optimize || !target ++ || GET_MODE (target) != tmode1 
++ || !insn_data[d->icode].operand[1].predicate (target, tmode1)) ++ target = gen_reg_rtx (tmode1); ++ ++ scratch0 = gen_reg_rtx (tmode0); ++ ++ pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4); ++ } ++ else ++ { ++ gcc_assert (d->flag); ++ ++ scratch0 = gen_reg_rtx (tmode0); ++ scratch1 = gen_reg_rtx (tmode1); ++ ++ pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4); ++ } ++ ++ if (! pat) ++ return 0; ++ ++ emit_insn (pat); ++ ++ if (d->flag) ++ { ++ target = gen_reg_rtx (SImode); ++ emit_move_insn (target, const0_rtx); ++ target = gen_rtx_SUBREG (QImode, target, 0); ++ ++ emit_insn ++ (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target), ++ gen_rtx_fmt_ee (EQ, QImode, ++ gen_rtx_REG ((machine_mode) d->flag, ++ FLAGS_REG), ++ const0_rtx))); ++ return SUBREG_REG (target); ++ } ++ else ++ return target; ++} ++ ++ ++/* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */ ++ ++static rtx ++ix86_expand_sse_pcmpistr (const struct builtin_description *d, ++ tree exp, rtx target) ++{ ++ rtx pat; ++ tree arg0 = CALL_EXPR_ARG (exp, 0); ++ tree arg1 = CALL_EXPR_ARG (exp, 1); ++ tree arg2 = CALL_EXPR_ARG (exp, 2); ++ rtx scratch0, scratch1; ++ rtx op0 = expand_normal (arg0); ++ rtx op1 = expand_normal (arg1); ++ rtx op2 = expand_normal (arg2); ++ machine_mode tmode0, tmode1, modev2, modev3, modeimm; ++ ++ tmode0 = insn_data[d->icode].operand[0].mode; ++ tmode1 = insn_data[d->icode].operand[1].mode; ++ modev2 = insn_data[d->icode].operand[2].mode; ++ modev3 = insn_data[d->icode].operand[3].mode; ++ modeimm = insn_data[d->icode].operand[4].mode; ++ ++ if (VECTOR_MODE_P (modev2)) ++ op0 = safe_vector_operand (op0, modev2); ++ if (VECTOR_MODE_P (modev3)) ++ op1 = safe_vector_operand (op1, modev3); ++ ++ if (!insn_data[d->icode].operand[2].predicate (op0, modev2)) ++ op0 = copy_to_mode_reg (modev2, op0); ++ if ((optimize && !register_operand (op1, modev3)) ++ || !insn_data[d->icode].operand[3].predicate (op1, modev3)) ++ op1 = copy_to_mode_reg (modev3, op1); ++ ++ if (!insn_data[d->icode].operand[4].predicate (op2, modeimm)) ++ { ++ error ("the third argument must be an 8-bit immediate"); ++ return const0_rtx; ++ } ++ ++ if (d->code == IX86_BUILTIN_PCMPISTRI128) ++ { ++ if (optimize || !target ++ || GET_MODE (target) != tmode0 ++ || !insn_data[d->icode].operand[0].predicate (target, tmode0)) ++ target = gen_reg_rtx (tmode0); ++ ++ scratch1 = gen_reg_rtx (tmode1); ++ ++ pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2); ++ } ++ else if (d->code == IX86_BUILTIN_PCMPISTRM128) ++ { ++ if (optimize || !target ++ || GET_MODE (target) != tmode1 ++ || !insn_data[d->icode].operand[1].predicate (target, tmode1)) ++ target = gen_reg_rtx (tmode1); ++ ++ scratch0 = gen_reg_rtx (tmode0); ++ ++ pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2); ++ } ++ else ++ { ++ gcc_assert (d->flag); ++ ++ scratch0 = gen_reg_rtx (tmode0); ++ scratch1 = gen_reg_rtx (tmode1); ++ ++ pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2); ++ } ++ ++ if (! pat) ++ return 0; ++ ++ emit_insn (pat); ++ ++ if (d->flag) ++ { ++ target = gen_reg_rtx (SImode); ++ emit_move_insn (target, const0_rtx); ++ target = gen_rtx_SUBREG (QImode, target, 0); ++ ++ emit_insn ++ (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target), ++ gen_rtx_fmt_ee (EQ, QImode, ++ gen_rtx_REG ((machine_mode) d->flag, ++ FLAGS_REG), ++ const0_rtx))); ++ return SUBREG_REG (target); ++ } ++ else ++ return target; ++} ++ ++/* Fixup modeless constants to fit required mode. 
*/ ++ ++static rtx ++fixup_modeless_constant (rtx x, machine_mode mode) ++{ ++ if (GET_MODE (x) == VOIDmode) ++ x = convert_to_mode (mode, x, 1); ++ return x; ++} ++ ++/* Subroutine of ix86_expand_builtin to take care of insns with ++ variable number of operands. */ ++ ++static rtx ++ix86_expand_args_builtin (const struct builtin_description *d, ++ tree exp, rtx target) ++{ ++ rtx pat, real_target; ++ unsigned int i, nargs; ++ unsigned int nargs_constant = 0; ++ unsigned int mask_pos = 0; ++ int num_memory = 0; ++ struct ++ { ++ rtx op; ++ machine_mode mode; ++ } args[6]; ++ bool second_arg_count = false; ++ enum insn_code icode = d->icode; ++ const struct insn_data_d *insn_p = &insn_data[icode]; ++ machine_mode tmode = insn_p->operand[0].mode; ++ machine_mode rmode = VOIDmode; ++ bool swap = false; ++ enum rtx_code comparison = d->comparison; ++ ++ switch ((enum ix86_builtin_func_type) d->flag) ++ { ++ case V2DF_FTYPE_V2DF_ROUND: ++ case V4DF_FTYPE_V4DF_ROUND: ++ case V8DF_FTYPE_V8DF_ROUND: ++ case V4SF_FTYPE_V4SF_ROUND: ++ case V8SF_FTYPE_V8SF_ROUND: ++ case V16SF_FTYPE_V16SF_ROUND: ++ case V4SI_FTYPE_V4SF_ROUND: ++ case V8SI_FTYPE_V8SF_ROUND: ++ case V16SI_FTYPE_V16SF_ROUND: ++ return ix86_expand_sse_round (d, exp, target); ++ case V4SI_FTYPE_V2DF_V2DF_ROUND: ++ case V8SI_FTYPE_V4DF_V4DF_ROUND: ++ case V16SI_FTYPE_V8DF_V8DF_ROUND: ++ return ix86_expand_sse_round_vec_pack_sfix (d, exp, target); ++ case INT_FTYPE_V8SF_V8SF_PTEST: ++ case INT_FTYPE_V4DI_V4DI_PTEST: ++ case INT_FTYPE_V4DF_V4DF_PTEST: ++ case INT_FTYPE_V4SF_V4SF_PTEST: ++ case INT_FTYPE_V2DI_V2DI_PTEST: ++ case INT_FTYPE_V2DF_V2DF_PTEST: ++ return ix86_expand_sse_ptest (d, exp, target); ++ case FLOAT128_FTYPE_FLOAT128: ++ case FLOAT_FTYPE_FLOAT: ++ case INT_FTYPE_INT: ++ case UINT_FTYPE_UINT: ++ case UINT16_FTYPE_UINT16: ++ case UINT64_FTYPE_INT: ++ case UINT64_FTYPE_UINT64: ++ case INT64_FTYPE_INT64: ++ case INT64_FTYPE_V4SF: ++ case INT64_FTYPE_V2DF: ++ case INT_FTYPE_V16QI: ++ case INT_FTYPE_V8QI: ++ case INT_FTYPE_V8SF: ++ case INT_FTYPE_V4DF: ++ case INT_FTYPE_V4SF: ++ case INT_FTYPE_V2DF: ++ case INT_FTYPE_V32QI: ++ case V16QI_FTYPE_V16QI: ++ case V8SI_FTYPE_V8SF: ++ case V8SI_FTYPE_V4SI: ++ case V8HI_FTYPE_V8HI: ++ case V8HI_FTYPE_V16QI: ++ case V8QI_FTYPE_V8QI: ++ case V8SF_FTYPE_V8SF: ++ case V8SF_FTYPE_V8SI: ++ case V8SF_FTYPE_V4SF: ++ case V8SF_FTYPE_V8HI: ++ case V4SI_FTYPE_V4SI: ++ case V4SI_FTYPE_V16QI: ++ case V4SI_FTYPE_V4SF: ++ case V4SI_FTYPE_V8SI: ++ case V4SI_FTYPE_V8HI: ++ case V4SI_FTYPE_V4DF: ++ case V4SI_FTYPE_V2DF: ++ case V4HI_FTYPE_V4HI: ++ case V4DF_FTYPE_V4DF: ++ case V4DF_FTYPE_V4SI: ++ case V4DF_FTYPE_V4SF: ++ case V4DF_FTYPE_V2DF: ++ case V4SF_FTYPE_V4SF: ++ case V4SF_FTYPE_V4SI: ++ case V4SF_FTYPE_V8SF: ++ case V4SF_FTYPE_V4DF: ++ case V4SF_FTYPE_V8HI: ++ case V4SF_FTYPE_V2DF: ++ case V2DI_FTYPE_V2DI: ++ case V2DI_FTYPE_V16QI: ++ case V2DI_FTYPE_V8HI: ++ case V2DI_FTYPE_V4SI: ++ case V2DF_FTYPE_V2DF: ++ case V2DF_FTYPE_V4SI: ++ case V2DF_FTYPE_V4DF: ++ case V2DF_FTYPE_V4SF: ++ case V2DF_FTYPE_V2SI: ++ case V2SI_FTYPE_V2SI: ++ case V2SI_FTYPE_V4SF: ++ case V2SI_FTYPE_V2SF: ++ case V2SI_FTYPE_V2DF: ++ case V2SF_FTYPE_V2SF: ++ case V2SF_FTYPE_V2SI: ++ case V32QI_FTYPE_V32QI: ++ case V32QI_FTYPE_V16QI: ++ case V16HI_FTYPE_V16HI: ++ case V16HI_FTYPE_V8HI: ++ case V8SI_FTYPE_V8SI: ++ case V16HI_FTYPE_V16QI: ++ case V8SI_FTYPE_V16QI: ++ case V4DI_FTYPE_V16QI: ++ case V8SI_FTYPE_V8HI: ++ case V4DI_FTYPE_V8HI: ++ case V4DI_FTYPE_V4SI: ++ case V4DI_FTYPE_V2DI: ++ case UQI_FTYPE_UQI: ++ case 
UHI_FTYPE_UHI: ++ case USI_FTYPE_USI: ++ case USI_FTYPE_UQI: ++ case USI_FTYPE_UHI: ++ case UDI_FTYPE_UDI: ++ case UHI_FTYPE_V16QI: ++ case USI_FTYPE_V32QI: ++ case UDI_FTYPE_V64QI: ++ case V16QI_FTYPE_UHI: ++ case V32QI_FTYPE_USI: ++ case V64QI_FTYPE_UDI: ++ case V8HI_FTYPE_UQI: ++ case V16HI_FTYPE_UHI: ++ case V32HI_FTYPE_USI: ++ case V4SI_FTYPE_UQI: ++ case V8SI_FTYPE_UQI: ++ case V4SI_FTYPE_UHI: ++ case V8SI_FTYPE_UHI: ++ case UQI_FTYPE_V8HI: ++ case UHI_FTYPE_V16HI: ++ case USI_FTYPE_V32HI: ++ case UQI_FTYPE_V4SI: ++ case UQI_FTYPE_V8SI: ++ case UHI_FTYPE_V16SI: ++ case UQI_FTYPE_V2DI: ++ case UQI_FTYPE_V4DI: ++ case UQI_FTYPE_V8DI: ++ case V16SI_FTYPE_UHI: ++ case V2DI_FTYPE_UQI: ++ case V4DI_FTYPE_UQI: ++ case V16SI_FTYPE_INT: ++ case V16SF_FTYPE_V8SF: ++ case V16SI_FTYPE_V8SI: ++ case V16SF_FTYPE_V4SF: ++ case V16SI_FTYPE_V4SI: ++ case V16SI_FTYPE_V16SF: ++ case V16SI_FTYPE_V16SI: ++ case V64QI_FTYPE_V64QI: ++ case V32HI_FTYPE_V32HI: ++ case V16SF_FTYPE_V16SF: ++ case V8DI_FTYPE_UQI: ++ case V8DI_FTYPE_V8DI: ++ case V8DF_FTYPE_V4DF: ++ case V8DF_FTYPE_V2DF: ++ case V8DF_FTYPE_V8DF: ++ case V4DI_FTYPE_V4DI: ++ nargs = 1; ++ break; ++ case V4SF_FTYPE_V4SF_VEC_MERGE: ++ case V2DF_FTYPE_V2DF_VEC_MERGE: ++ return ix86_expand_unop_vec_merge_builtin (icode, exp, target); ++ case FLOAT128_FTYPE_FLOAT128_FLOAT128: ++ case V16QI_FTYPE_V16QI_V16QI: ++ case V16QI_FTYPE_V8HI_V8HI: ++ case V16SF_FTYPE_V16SF_V16SF: ++ case V8QI_FTYPE_V8QI_V8QI: ++ case V8QI_FTYPE_V4HI_V4HI: ++ case V8HI_FTYPE_V8HI_V8HI: ++ case V8HI_FTYPE_V16QI_V16QI: ++ case V8HI_FTYPE_V4SI_V4SI: ++ case V8SF_FTYPE_V8SF_V8SF: ++ case V8SF_FTYPE_V8SF_V8SI: ++ case V8DF_FTYPE_V8DF_V8DF: ++ case V4SI_FTYPE_V4SI_V4SI: ++ case V4SI_FTYPE_V8HI_V8HI: ++ case V4SI_FTYPE_V2DF_V2DF: ++ case V4HI_FTYPE_V4HI_V4HI: ++ case V4HI_FTYPE_V8QI_V8QI: ++ case V4HI_FTYPE_V2SI_V2SI: ++ case V4DF_FTYPE_V4DF_V4DF: ++ case V4DF_FTYPE_V4DF_V4DI: ++ case V4SF_FTYPE_V4SF_V4SF: ++ case V4SF_FTYPE_V4SF_V4SI: ++ case V4SF_FTYPE_V4SF_V2SI: ++ case V4SF_FTYPE_V4SF_V2DF: ++ case V4SF_FTYPE_V4SF_UINT: ++ case V4SF_FTYPE_V4SF_DI: ++ case V4SF_FTYPE_V4SF_SI: ++ case V2DI_FTYPE_V2DI_V2DI: ++ case V2DI_FTYPE_V16QI_V16QI: ++ case V2DI_FTYPE_V4SI_V4SI: ++ case V2DI_FTYPE_V2DI_V16QI: ++ case V2SI_FTYPE_V2SI_V2SI: ++ case V2SI_FTYPE_V4HI_V4HI: ++ case V2SI_FTYPE_V2SF_V2SF: ++ case V2DF_FTYPE_V2DF_V2DF: ++ case V2DF_FTYPE_V2DF_V4SF: ++ case V2DF_FTYPE_V2DF_V2DI: ++ case V2DF_FTYPE_V2DF_DI: ++ case V2DF_FTYPE_V2DF_SI: ++ case V2DF_FTYPE_V2DF_UINT: ++ case V2SF_FTYPE_V2SF_V2SF: ++ case V1DI_FTYPE_V1DI_V1DI: ++ case V1DI_FTYPE_V8QI_V8QI: ++ case V1DI_FTYPE_V2SI_V2SI: ++ case V32QI_FTYPE_V16HI_V16HI: ++ case V16HI_FTYPE_V8SI_V8SI: ++ case V64QI_FTYPE_V64QI_V64QI: ++ case V32QI_FTYPE_V32QI_V32QI: ++ case V16HI_FTYPE_V32QI_V32QI: ++ case V16HI_FTYPE_V16HI_V16HI: ++ case V8SI_FTYPE_V4DF_V4DF: ++ case V8SI_FTYPE_V8SI_V8SI: ++ case V8SI_FTYPE_V16HI_V16HI: ++ case V4DI_FTYPE_V4DI_V4DI: ++ case V4DI_FTYPE_V8SI_V8SI: ++ case V8DI_FTYPE_V64QI_V64QI: ++ if (comparison == UNKNOWN) ++ return ix86_expand_binop_builtin (icode, exp, target); ++ nargs = 2; ++ break; ++ case V4SF_FTYPE_V4SF_V4SF_SWAP: ++ case V2DF_FTYPE_V2DF_V2DF_SWAP: ++ gcc_assert (comparison != UNKNOWN); ++ nargs = 2; ++ swap = true; ++ break; ++ case V16HI_FTYPE_V16HI_V8HI_COUNT: ++ case V16HI_FTYPE_V16HI_SI_COUNT: ++ case V8SI_FTYPE_V8SI_V4SI_COUNT: ++ case V8SI_FTYPE_V8SI_SI_COUNT: ++ case V4DI_FTYPE_V4DI_V2DI_COUNT: ++ case V4DI_FTYPE_V4DI_INT_COUNT: ++ case V8HI_FTYPE_V8HI_V8HI_COUNT: ++ case V8HI_FTYPE_V8HI_SI_COUNT: 
++ case V4SI_FTYPE_V4SI_V4SI_COUNT: ++ case V4SI_FTYPE_V4SI_SI_COUNT: ++ case V4HI_FTYPE_V4HI_V4HI_COUNT: ++ case V4HI_FTYPE_V4HI_SI_COUNT: ++ case V2DI_FTYPE_V2DI_V2DI_COUNT: ++ case V2DI_FTYPE_V2DI_SI_COUNT: ++ case V2SI_FTYPE_V2SI_V2SI_COUNT: ++ case V2SI_FTYPE_V2SI_SI_COUNT: ++ case V1DI_FTYPE_V1DI_V1DI_COUNT: ++ case V1DI_FTYPE_V1DI_SI_COUNT: ++ nargs = 2; ++ second_arg_count = true; ++ break; ++ case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT: ++ case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT: ++ case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT: ++ case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT: ++ case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT: ++ case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT: ++ case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT: ++ case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT: ++ case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT: ++ case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT: ++ case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT: ++ case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT: ++ case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT: ++ case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT: ++ case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT: ++ case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT: ++ case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT: ++ case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT: ++ nargs = 4; ++ second_arg_count = true; ++ break; ++ case UINT64_FTYPE_UINT64_UINT64: ++ case UINT_FTYPE_UINT_UINT: ++ case UINT_FTYPE_UINT_USHORT: ++ case UINT_FTYPE_UINT_UCHAR: ++ case UINT16_FTYPE_UINT16_INT: ++ case UINT8_FTYPE_UINT8_INT: ++ case UQI_FTYPE_UQI_UQI: ++ case UHI_FTYPE_UHI_UHI: ++ case USI_FTYPE_USI_USI: ++ case UDI_FTYPE_UDI_UDI: ++ case V16SI_FTYPE_V8DF_V8DF: ++ nargs = 2; ++ break; ++ case V2DI_FTYPE_V2DI_INT_CONVERT: ++ nargs = 2; ++ rmode = V1TImode; ++ nargs_constant = 1; ++ break; ++ case V4DI_FTYPE_V4DI_INT_CONVERT: ++ nargs = 2; ++ rmode = V2TImode; ++ nargs_constant = 1; ++ break; ++ case V8DI_FTYPE_V8DI_INT_CONVERT: ++ nargs = 2; ++ rmode = V4TImode; ++ nargs_constant = 1; ++ break; ++ case V8HI_FTYPE_V8HI_INT: ++ case V8HI_FTYPE_V8SF_INT: ++ case V16HI_FTYPE_V16SF_INT: ++ case V8HI_FTYPE_V4SF_INT: ++ case V8SF_FTYPE_V8SF_INT: ++ case V4SF_FTYPE_V16SF_INT: ++ case V16SF_FTYPE_V16SF_INT: ++ case V4SI_FTYPE_V4SI_INT: ++ case V4SI_FTYPE_V8SI_INT: ++ case V4HI_FTYPE_V4HI_INT: ++ case V4DF_FTYPE_V4DF_INT: ++ case V4DF_FTYPE_V8DF_INT: ++ case V4SF_FTYPE_V4SF_INT: ++ case V4SF_FTYPE_V8SF_INT: ++ case V2DI_FTYPE_V2DI_INT: ++ case V2DF_FTYPE_V2DF_INT: ++ case V2DF_FTYPE_V4DF_INT: ++ case V16HI_FTYPE_V16HI_INT: ++ case V8SI_FTYPE_V8SI_INT: ++ case V16SI_FTYPE_V16SI_INT: ++ case V4SI_FTYPE_V16SI_INT: ++ case V4DI_FTYPE_V4DI_INT: ++ case V2DI_FTYPE_V4DI_INT: ++ case V4DI_FTYPE_V8DI_INT: ++ case QI_FTYPE_V4SF_INT: ++ case QI_FTYPE_V2DF_INT: ++ case UQI_FTYPE_UQI_UQI_CONST: ++ case UHI_FTYPE_UHI_UQI: ++ case USI_FTYPE_USI_UQI: ++ case UDI_FTYPE_UDI_UQI: ++ nargs = 2; ++ nargs_constant = 1; ++ break; ++ case V16QI_FTYPE_V16QI_V16QI_V16QI: ++ case V8SF_FTYPE_V8SF_V8SF_V8SF: ++ case V4DF_FTYPE_V4DF_V4DF_V4DF: ++ case V4SF_FTYPE_V4SF_V4SF_V4SF: ++ case V2DF_FTYPE_V2DF_V2DF_V2DF: ++ case V32QI_FTYPE_V32QI_V32QI_V32QI: ++ case UHI_FTYPE_V16SI_V16SI_UHI: ++ case UQI_FTYPE_V8DI_V8DI_UQI: ++ case V16HI_FTYPE_V16SI_V16HI_UHI: ++ case V16QI_FTYPE_V16SI_V16QI_UHI: ++ case V16QI_FTYPE_V8DI_V16QI_UQI: ++ case V16SF_FTYPE_V16SF_V16SF_UHI: ++ case V16SF_FTYPE_V4SF_V16SF_UHI: ++ case V16SI_FTYPE_SI_V16SI_UHI: ++ case V16SI_FTYPE_V16HI_V16SI_UHI: ++ case V16SI_FTYPE_V16QI_V16SI_UHI: ++ case V8SF_FTYPE_V4SF_V8SF_UQI: ++ case V4DF_FTYPE_V2DF_V4DF_UQI: ++ case V8SI_FTYPE_V4SI_V8SI_UQI: ++ case 
V8SI_FTYPE_SI_V8SI_UQI: ++ case V4SI_FTYPE_V4SI_V4SI_UQI: ++ case V4SI_FTYPE_SI_V4SI_UQI: ++ case V4DI_FTYPE_V2DI_V4DI_UQI: ++ case V4DI_FTYPE_DI_V4DI_UQI: ++ case V2DI_FTYPE_V2DI_V2DI_UQI: ++ case V2DI_FTYPE_DI_V2DI_UQI: ++ case V64QI_FTYPE_V64QI_V64QI_UDI: ++ case V64QI_FTYPE_V16QI_V64QI_UDI: ++ case V64QI_FTYPE_QI_V64QI_UDI: ++ case V32QI_FTYPE_V32QI_V32QI_USI: ++ case V32QI_FTYPE_V16QI_V32QI_USI: ++ case V32QI_FTYPE_QI_V32QI_USI: ++ case V16QI_FTYPE_V16QI_V16QI_UHI: ++ case V16QI_FTYPE_QI_V16QI_UHI: ++ case V32HI_FTYPE_V8HI_V32HI_USI: ++ case V32HI_FTYPE_HI_V32HI_USI: ++ case V16HI_FTYPE_V8HI_V16HI_UHI: ++ case V16HI_FTYPE_HI_V16HI_UHI: ++ case V8HI_FTYPE_V8HI_V8HI_UQI: ++ case V8HI_FTYPE_HI_V8HI_UQI: ++ case V8SF_FTYPE_V8HI_V8SF_UQI: ++ case V4SF_FTYPE_V8HI_V4SF_UQI: ++ case V8SI_FTYPE_V8SF_V8SI_UQI: ++ case V4SI_FTYPE_V4SF_V4SI_UQI: ++ case V4DI_FTYPE_V4SF_V4DI_UQI: ++ case V2DI_FTYPE_V4SF_V2DI_UQI: ++ case V4SF_FTYPE_V4DI_V4SF_UQI: ++ case V4SF_FTYPE_V2DI_V4SF_UQI: ++ case V4DF_FTYPE_V4DI_V4DF_UQI: ++ case V2DF_FTYPE_V2DI_V2DF_UQI: ++ case V16QI_FTYPE_V8HI_V16QI_UQI: ++ case V16QI_FTYPE_V16HI_V16QI_UHI: ++ case V16QI_FTYPE_V4SI_V16QI_UQI: ++ case V16QI_FTYPE_V8SI_V16QI_UQI: ++ case V8HI_FTYPE_V4SI_V8HI_UQI: ++ case V8HI_FTYPE_V8SI_V8HI_UQI: ++ case V16QI_FTYPE_V2DI_V16QI_UQI: ++ case V16QI_FTYPE_V4DI_V16QI_UQI: ++ case V8HI_FTYPE_V2DI_V8HI_UQI: ++ case V8HI_FTYPE_V4DI_V8HI_UQI: ++ case V4SI_FTYPE_V2DI_V4SI_UQI: ++ case V4SI_FTYPE_V4DI_V4SI_UQI: ++ case V32QI_FTYPE_V32HI_V32QI_USI: ++ case UHI_FTYPE_V16QI_V16QI_UHI: ++ case USI_FTYPE_V32QI_V32QI_USI: ++ case UDI_FTYPE_V64QI_V64QI_UDI: ++ case UQI_FTYPE_V8HI_V8HI_UQI: ++ case UHI_FTYPE_V16HI_V16HI_UHI: ++ case USI_FTYPE_V32HI_V32HI_USI: ++ case UQI_FTYPE_V4SI_V4SI_UQI: ++ case UQI_FTYPE_V8SI_V8SI_UQI: ++ case UQI_FTYPE_V2DI_V2DI_UQI: ++ case UQI_FTYPE_V4DI_V4DI_UQI: ++ case V4SF_FTYPE_V2DF_V4SF_UQI: ++ case V4SF_FTYPE_V4DF_V4SF_UQI: ++ case V16SI_FTYPE_V16SI_V16SI_UHI: ++ case V16SI_FTYPE_V4SI_V16SI_UHI: ++ case V2DI_FTYPE_V4SI_V2DI_UQI: ++ case V2DI_FTYPE_V8HI_V2DI_UQI: ++ case V2DI_FTYPE_V16QI_V2DI_UQI: ++ case V4DI_FTYPE_V4DI_V4DI_UQI: ++ case V4DI_FTYPE_V4SI_V4DI_UQI: ++ case V4DI_FTYPE_V8HI_V4DI_UQI: ++ case V4DI_FTYPE_V16QI_V4DI_UQI: ++ case V4DI_FTYPE_V4DF_V4DI_UQI: ++ case V2DI_FTYPE_V2DF_V2DI_UQI: ++ case V4SI_FTYPE_V4DF_V4SI_UQI: ++ case V4SI_FTYPE_V2DF_V4SI_UQI: ++ case V4SI_FTYPE_V8HI_V4SI_UQI: ++ case V4SI_FTYPE_V16QI_V4SI_UQI: ++ case V4DI_FTYPE_V4DI_V4DI_V4DI: ++ case V8DF_FTYPE_V2DF_V8DF_UQI: ++ case V8DF_FTYPE_V4DF_V8DF_UQI: ++ case V8DF_FTYPE_V8DF_V8DF_UQI: ++ case V8SF_FTYPE_V8SF_V8SF_UQI: ++ case V8SF_FTYPE_V8SI_V8SF_UQI: ++ case V4DF_FTYPE_V4DF_V4DF_UQI: ++ case V4SF_FTYPE_V4SF_V4SF_UQI: ++ case V2DF_FTYPE_V2DF_V2DF_UQI: ++ case V2DF_FTYPE_V4SF_V2DF_UQI: ++ case V2DF_FTYPE_V4SI_V2DF_UQI: ++ case V4SF_FTYPE_V4SI_V4SF_UQI: ++ case V4DF_FTYPE_V4SF_V4DF_UQI: ++ case V4DF_FTYPE_V4SI_V4DF_UQI: ++ case V8SI_FTYPE_V8SI_V8SI_UQI: ++ case V8SI_FTYPE_V8HI_V8SI_UQI: ++ case V8SI_FTYPE_V16QI_V8SI_UQI: ++ case V8DF_FTYPE_V8SI_V8DF_UQI: ++ case V8DI_FTYPE_DI_V8DI_UQI: ++ case V16SF_FTYPE_V8SF_V16SF_UHI: ++ case V16SI_FTYPE_V8SI_V16SI_UHI: ++ case V16HI_FTYPE_V16HI_V16HI_UHI: ++ case V8HI_FTYPE_V16QI_V8HI_UQI: ++ case V16HI_FTYPE_V16QI_V16HI_UHI: ++ case V32HI_FTYPE_V32HI_V32HI_USI: ++ case V32HI_FTYPE_V32QI_V32HI_USI: ++ case V8DI_FTYPE_V16QI_V8DI_UQI: ++ case V8DI_FTYPE_V2DI_V8DI_UQI: ++ case V8DI_FTYPE_V4DI_V8DI_UQI: ++ case V8DI_FTYPE_V8DI_V8DI_UQI: ++ case V8DI_FTYPE_V8HI_V8DI_UQI: ++ case V8DI_FTYPE_V8SI_V8DI_UQI: ++ case 
V8HI_FTYPE_V8DI_V8HI_UQI: ++ case V8SI_FTYPE_V8DI_V8SI_UQI: ++ case V4SI_FTYPE_V4SI_V4SI_V4SI: ++ case V16SI_FTYPE_V16SI_V16SI_V16SI: ++ case V8DI_FTYPE_V8DI_V8DI_V8DI: ++ case V32HI_FTYPE_V32HI_V32HI_V32HI: ++ case V2DI_FTYPE_V2DI_V2DI_V2DI: ++ case V16HI_FTYPE_V16HI_V16HI_V16HI: ++ case V8SI_FTYPE_V8SI_V8SI_V8SI: ++ case V8HI_FTYPE_V8HI_V8HI_V8HI: ++ nargs = 3; ++ break; ++ case V32QI_FTYPE_V32QI_V32QI_INT: ++ case V16HI_FTYPE_V16HI_V16HI_INT: ++ case V16QI_FTYPE_V16QI_V16QI_INT: ++ case V4DI_FTYPE_V4DI_V4DI_INT: ++ case V8HI_FTYPE_V8HI_V8HI_INT: ++ case V8SI_FTYPE_V8SI_V8SI_INT: ++ case V8SI_FTYPE_V8SI_V4SI_INT: ++ case V8SF_FTYPE_V8SF_V8SF_INT: ++ case V8SF_FTYPE_V8SF_V4SF_INT: ++ case V4SI_FTYPE_V4SI_V4SI_INT: ++ case V4DF_FTYPE_V4DF_V4DF_INT: ++ case V16SF_FTYPE_V16SF_V16SF_INT: ++ case V16SF_FTYPE_V16SF_V4SF_INT: ++ case V16SI_FTYPE_V16SI_V4SI_INT: ++ case V4DF_FTYPE_V4DF_V2DF_INT: ++ case V4SF_FTYPE_V4SF_V4SF_INT: ++ case V2DI_FTYPE_V2DI_V2DI_INT: ++ case V4DI_FTYPE_V4DI_V2DI_INT: ++ case V2DF_FTYPE_V2DF_V2DF_INT: ++ case UQI_FTYPE_V8DI_V8UDI_INT: ++ case UQI_FTYPE_V8DF_V8DF_INT: ++ case UQI_FTYPE_V2DF_V2DF_INT: ++ case UQI_FTYPE_V4SF_V4SF_INT: ++ case UHI_FTYPE_V16SI_V16SI_INT: ++ case UHI_FTYPE_V16SF_V16SF_INT: ++ case V64QI_FTYPE_V64QI_V64QI_INT: ++ case V32HI_FTYPE_V32HI_V32HI_INT: ++ case V16SI_FTYPE_V16SI_V16SI_INT: ++ case V8DI_FTYPE_V8DI_V8DI_INT: ++ nargs = 3; ++ nargs_constant = 1; ++ break; ++ case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT: ++ nargs = 3; ++ rmode = V4DImode; ++ nargs_constant = 1; ++ break; ++ case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT: ++ nargs = 3; ++ rmode = V2DImode; ++ nargs_constant = 1; ++ break; ++ case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT: ++ nargs = 3; ++ rmode = DImode; ++ nargs_constant = 1; ++ break; ++ case V2DI_FTYPE_V2DI_UINT_UINT: ++ nargs = 3; ++ nargs_constant = 2; ++ break; ++ case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT: ++ nargs = 3; ++ rmode = V8DImode; ++ nargs_constant = 1; ++ break; ++ case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT: ++ nargs = 5; ++ rmode = V8DImode; ++ mask_pos = 2; ++ nargs_constant = 1; ++ break; ++ case QI_FTYPE_V8DF_INT_UQI: ++ case QI_FTYPE_V4DF_INT_UQI: ++ case QI_FTYPE_V2DF_INT_UQI: ++ case HI_FTYPE_V16SF_INT_UHI: ++ case QI_FTYPE_V8SF_INT_UQI: ++ case QI_FTYPE_V4SF_INT_UQI: ++ case V4SI_FTYPE_V4SI_V4SI_UHI: ++ case V8SI_FTYPE_V8SI_V8SI_UHI: ++ nargs = 3; ++ mask_pos = 1; ++ nargs_constant = 1; ++ break; ++ case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT: ++ nargs = 5; ++ rmode = V4DImode; ++ mask_pos = 2; ++ nargs_constant = 1; ++ break; ++ case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT: ++ nargs = 5; ++ rmode = V2DImode; ++ mask_pos = 2; ++ nargs_constant = 1; ++ break; ++ case V32QI_FTYPE_V32QI_V32QI_V32QI_USI: ++ case V32HI_FTYPE_V32HI_V32HI_V32HI_USI: ++ case V32HI_FTYPE_V64QI_V64QI_V32HI_USI: ++ case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI: ++ case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI: ++ case V32HI_FTYPE_V32HI_V8HI_V32HI_USI: ++ case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI: ++ case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI: ++ case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI: ++ case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI: ++ case V32QI_FTYPE_V16HI_V16HI_V32QI_USI: ++ case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI: ++ case V32HI_FTYPE_V16SI_V16SI_V32HI_USI: ++ case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI: ++ case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI: ++ case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI: ++ case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI: ++ case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI: ++ case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI: ++ case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI: ++ case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI: ++ case 
V4DF_FTYPE_V4DI_V4DF_V4DF_UQI: ++ case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI: ++ case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI: ++ case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI: ++ case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI: ++ case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI: ++ case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI: ++ case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI: ++ case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI: ++ case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI: ++ case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI: ++ case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI: ++ case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI: ++ case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI: ++ case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI: ++ case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI: ++ case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI: ++ case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI: ++ case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI: ++ case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI: ++ case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI: ++ case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI: ++ case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI: ++ case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI: ++ case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI: ++ case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI: ++ case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI: ++ case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI: ++ case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI: ++ case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI: ++ nargs = 4; ++ break; ++ case V2DF_FTYPE_V2DF_V2DF_V2DI_INT: ++ case V4DF_FTYPE_V4DF_V4DF_V4DI_INT: ++ case V4SF_FTYPE_V4SF_V4SF_V4SI_INT: ++ case V8SF_FTYPE_V8SF_V8SF_V8SI_INT: ++ case V16SF_FTYPE_V16SF_V16SF_V16SI_INT: ++ nargs = 4; ++ nargs_constant = 1; ++ break; ++ case UQI_FTYPE_V4DI_V4DI_INT_UQI: ++ case UQI_FTYPE_V8SI_V8SI_INT_UQI: ++ case QI_FTYPE_V4DF_V4DF_INT_UQI: ++ case QI_FTYPE_V8SF_V8SF_INT_UQI: ++ case UQI_FTYPE_V2DI_V2DI_INT_UQI: ++ case UQI_FTYPE_V4SI_V4SI_INT_UQI: ++ case UQI_FTYPE_V2DF_V2DF_INT_UQI: ++ case UQI_FTYPE_V4SF_V4SF_INT_UQI: ++ case UDI_FTYPE_V64QI_V64QI_INT_UDI: ++ case USI_FTYPE_V32QI_V32QI_INT_USI: ++ case UHI_FTYPE_V16QI_V16QI_INT_UHI: ++ case USI_FTYPE_V32HI_V32HI_INT_USI: ++ case UHI_FTYPE_V16HI_V16HI_INT_UHI: ++ case UQI_FTYPE_V8HI_V8HI_INT_UQI: ++ case V32HI_FTYPE_V32HI_V32HI_V32HI_INT: ++ case V16HI_FTYPE_V16HI_V16HI_V16HI_INT: ++ case V8HI_FTYPE_V8HI_V8HI_V8HI_INT: ++ case V8SI_FTYPE_V8SI_V8SI_V8SI_INT: ++ case V4DI_FTYPE_V4DI_V4DI_V4DI_INT: ++ case V8DI_FTYPE_V8DI_V8DI_V8DI_INT: ++ case V16SI_FTYPE_V16SI_V16SI_V16SI_INT: ++ case V2DI_FTYPE_V2DI_V2DI_V2DI_INT: ++ case V4SI_FTYPE_V4SI_V4SI_V4SI_INT: ++ nargs = 4; ++ mask_pos = 1; ++ nargs_constant = 1; ++ break; ++ case V2DI_FTYPE_V2DI_V2DI_UINT_UINT: ++ nargs = 4; ++ nargs_constant = 2; ++ break; ++ case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED: ++ case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG: ++ nargs = 4; ++ break; ++ case UQI_FTYPE_V8DI_V8DI_INT_UQI: ++ case UHI_FTYPE_V16SI_V16SI_INT_UHI: ++ mask_pos = 1; ++ nargs = 4; ++ nargs_constant = 1; ++ break; ++ case V8SF_FTYPE_V8SF_INT_V8SF_UQI: ++ case V4SF_FTYPE_V4SF_INT_V4SF_UQI: ++ case V2DF_FTYPE_V4DF_INT_V2DF_UQI: ++ case V2DI_FTYPE_V4DI_INT_V2DI_UQI: ++ case V8SF_FTYPE_V16SF_INT_V8SF_UQI: ++ case V8SI_FTYPE_V16SI_INT_V8SI_UQI: ++ case V2DF_FTYPE_V8DF_INT_V2DF_UQI: ++ case V2DI_FTYPE_V8DI_INT_V2DI_UQI: ++ case V4SF_FTYPE_V8SF_INT_V4SF_UQI: ++ case V4SI_FTYPE_V8SI_INT_V4SI_UQI: ++ case V8HI_FTYPE_V8SF_INT_V8HI_UQI: ++ case V8HI_FTYPE_V4SF_INT_V8HI_UQI: ++ case V32HI_FTYPE_V32HI_INT_V32HI_USI: ++ case V16HI_FTYPE_V16HI_INT_V16HI_UHI: ++ case V8HI_FTYPE_V8HI_INT_V8HI_UQI: ++ case V4DI_FTYPE_V4DI_INT_V4DI_UQI: ++ case V2DI_FTYPE_V2DI_INT_V2DI_UQI: ++ case V8SI_FTYPE_V8SI_INT_V8SI_UQI: ++ case V4SI_FTYPE_V4SI_INT_V4SI_UQI: ++ case V4DF_FTYPE_V4DF_INT_V4DF_UQI: ++ case 
V2DF_FTYPE_V2DF_INT_V2DF_UQI: ++ case V8DF_FTYPE_V8DF_INT_V8DF_UQI: ++ case V16SF_FTYPE_V16SF_INT_V16SF_UHI: ++ case V16HI_FTYPE_V16SF_INT_V16HI_UHI: ++ case V16SI_FTYPE_V16SI_INT_V16SI_UHI: ++ case V4SI_FTYPE_V16SI_INT_V4SI_UQI: ++ case V4DI_FTYPE_V8DI_INT_V4DI_UQI: ++ case V4DF_FTYPE_V8DF_INT_V4DF_UQI: ++ case V4SF_FTYPE_V16SF_INT_V4SF_UQI: ++ case V8DI_FTYPE_V8DI_INT_V8DI_UQI: ++ nargs = 4; ++ mask_pos = 2; ++ nargs_constant = 1; ++ break; ++ case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI: ++ case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI: ++ case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI: ++ case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI: ++ case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI: ++ case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI: ++ case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI: ++ case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI: ++ case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI: ++ case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI: ++ case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI: ++ case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI: ++ case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI: ++ case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI: ++ case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI: ++ case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI: ++ case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI: ++ case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI: ++ case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI: ++ case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI: ++ case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI: ++ case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI: ++ case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI: ++ case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI: ++ case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI: ++ case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI: ++ case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI: ++ nargs = 5; ++ mask_pos = 2; ++ nargs_constant = 1; ++ break; ++ case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI: ++ case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI: ++ case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI: ++ case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI: ++ case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI: ++ case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI: ++ case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI: ++ case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI: ++ case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI: ++ case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI: ++ nargs = 5; ++ mask_pos = 1; ++ nargs_constant = 1; ++ break; ++ case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI: ++ case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI: ++ case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI: ++ case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT: ++ case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT: ++ case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT: ++ case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT: ++ case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT: ++ case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT: ++ case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT: ++ case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT: ++ case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT: ++ nargs = 5; ++ mask_pos = 1; ++ nargs_constant = 2; ++ break; ++ ++ default: ++ gcc_unreachable (); ++ } ++ ++ gcc_assert (nargs <= ARRAY_SIZE (args)); ++ ++ if (comparison != UNKNOWN) ++ { ++ gcc_assert (nargs == 2); ++ return ix86_expand_sse_compare (d, exp, target, swap); ++ } ++ ++ if (rmode == VOIDmode || rmode == tmode) ++ { ++ if (optimize ++ || target == 0 ++ || GET_MODE (target) != tmode ++ || !insn_p->operand[0].predicate (target, tmode)) ++ target = gen_reg_rtx (tmode); ++ else if (memory_operand (target, tmode)) ++ num_memory++; ++ real_target = target; ++ } ++ else ++ { ++ real_target = gen_reg_rtx (tmode); ++ target = lowpart_subreg (rmode, real_target, tmode); ++ } ++ ++ for (i = 0; i < nargs; i++) ++ { ++ tree arg = CALL_EXPR_ARG (exp, i); ++ rtx op = expand_normal (arg); ++ 
machine_mode mode = insn_p->operand[i + 1].mode; ++ bool match = insn_p->operand[i + 1].predicate (op, mode); ++ ++ if (second_arg_count && i == 1) ++ { ++ /* SIMD shift insns take either an 8-bit immediate or ++ register as count. But builtin functions take int as ++ count. If count doesn't match, we put it in register. ++ The instructions are using 64-bit count, if op is just ++ 32-bit, zero-extend it, as negative shift counts ++ are undefined behavior and zero-extension is more ++ efficient. */ ++ if (!match) ++ { ++ if (SCALAR_INT_MODE_P (GET_MODE (op))) ++ op = convert_modes (mode, GET_MODE (op), op, 1); ++ else ++ op = lowpart_subreg (mode, op, GET_MODE (op)); ++ if (!insn_p->operand[i + 1].predicate (op, mode)) ++ op = copy_to_reg (op); ++ } ++ } ++ else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) || ++ (!mask_pos && (nargs - i) <= nargs_constant)) ++ { ++ if (!match) ++ switch (icode) ++ { ++ case CODE_FOR_avx_vinsertf128v4di: ++ case CODE_FOR_avx_vextractf128v4di: ++ error ("the last argument must be an 1-bit immediate"); ++ return const0_rtx; ++ ++ case CODE_FOR_avx512f_cmpv8di3_mask: ++ case CODE_FOR_avx512f_cmpv16si3_mask: ++ case CODE_FOR_avx512f_ucmpv8di3_mask: ++ case CODE_FOR_avx512f_ucmpv16si3_mask: ++ case CODE_FOR_avx512vl_cmpv4di3_mask: ++ case CODE_FOR_avx512vl_cmpv8si3_mask: ++ case CODE_FOR_avx512vl_ucmpv4di3_mask: ++ case CODE_FOR_avx512vl_ucmpv8si3_mask: ++ case CODE_FOR_avx512vl_cmpv2di3_mask: ++ case CODE_FOR_avx512vl_cmpv4si3_mask: ++ case CODE_FOR_avx512vl_ucmpv2di3_mask: ++ case CODE_FOR_avx512vl_ucmpv4si3_mask: ++ error ("the last argument must be a 3-bit immediate"); ++ return const0_rtx; ++ ++ case CODE_FOR_sse4_1_roundsd: ++ case CODE_FOR_sse4_1_roundss: ++ ++ case CODE_FOR_sse4_1_roundpd: ++ case CODE_FOR_sse4_1_roundps: ++ case CODE_FOR_avx_roundpd256: ++ case CODE_FOR_avx_roundps256: ++ ++ case CODE_FOR_sse4_1_roundpd_vec_pack_sfix: ++ case CODE_FOR_sse4_1_roundps_sfix: ++ case CODE_FOR_avx_roundpd_vec_pack_sfix256: ++ case CODE_FOR_avx_roundps_sfix256: ++ ++ case CODE_FOR_sse4_1_blendps: ++ case CODE_FOR_avx_blendpd256: ++ case CODE_FOR_avx_vpermilv4df: ++ case CODE_FOR_avx_vpermilv4df_mask: ++ case CODE_FOR_avx512f_getmantv8df_mask: ++ case CODE_FOR_avx512f_getmantv16sf_mask: ++ case CODE_FOR_avx512vl_getmantv8sf_mask: ++ case CODE_FOR_avx512vl_getmantv4df_mask: ++ case CODE_FOR_avx512vl_getmantv4sf_mask: ++ case CODE_FOR_avx512vl_getmantv2df_mask: ++ case CODE_FOR_avx512dq_rangepv8df_mask_round: ++ case CODE_FOR_avx512dq_rangepv16sf_mask_round: ++ case CODE_FOR_avx512dq_rangepv4df_mask: ++ case CODE_FOR_avx512dq_rangepv8sf_mask: ++ case CODE_FOR_avx512dq_rangepv2df_mask: ++ case CODE_FOR_avx512dq_rangepv4sf_mask: ++ case CODE_FOR_avx_shufpd256_mask: ++ error ("the last argument must be a 4-bit immediate"); ++ return const0_rtx; ++ ++ case CODE_FOR_sha1rnds4: ++ case CODE_FOR_sse4_1_blendpd: ++ case CODE_FOR_avx_vpermilv2df: ++ case CODE_FOR_avx_vpermilv2df_mask: ++ case CODE_FOR_xop_vpermil2v2df3: ++ case CODE_FOR_xop_vpermil2v4sf3: ++ case CODE_FOR_xop_vpermil2v4df3: ++ case CODE_FOR_xop_vpermil2v8sf3: ++ case CODE_FOR_avx512f_vinsertf32x4_mask: ++ case CODE_FOR_avx512f_vinserti32x4_mask: ++ case CODE_FOR_avx512f_vextractf32x4_mask: ++ case CODE_FOR_avx512f_vextracti32x4_mask: ++ case CODE_FOR_sse2_shufpd: ++ case CODE_FOR_sse2_shufpd_mask: ++ case CODE_FOR_avx512dq_shuf_f64x2_mask: ++ case CODE_FOR_avx512dq_shuf_i64x2_mask: ++ case CODE_FOR_avx512vl_shuf_i32x4_mask: ++ case CODE_FOR_avx512vl_shuf_f32x4_mask: ++ error ("the last 
argument must be a 2-bit immediate"); ++ return const0_rtx; ++ ++ case CODE_FOR_avx_vextractf128v4df: ++ case CODE_FOR_avx_vextractf128v8sf: ++ case CODE_FOR_avx_vextractf128v8si: ++ case CODE_FOR_avx_vinsertf128v4df: ++ case CODE_FOR_avx_vinsertf128v8sf: ++ case CODE_FOR_avx_vinsertf128v8si: ++ case CODE_FOR_avx512f_vinsertf64x4_mask: ++ case CODE_FOR_avx512f_vinserti64x4_mask: ++ case CODE_FOR_avx512f_vextractf64x4_mask: ++ case CODE_FOR_avx512f_vextracti64x4_mask: ++ case CODE_FOR_avx512dq_vinsertf32x8_mask: ++ case CODE_FOR_avx512dq_vinserti32x8_mask: ++ case CODE_FOR_avx512vl_vinsertv4df: ++ case CODE_FOR_avx512vl_vinsertv4di: ++ case CODE_FOR_avx512vl_vinsertv8sf: ++ case CODE_FOR_avx512vl_vinsertv8si: ++ error ("the last argument must be a 1-bit immediate"); ++ return const0_rtx; ++ ++ case CODE_FOR_avx_vmcmpv2df3: ++ case CODE_FOR_avx_vmcmpv4sf3: ++ case CODE_FOR_avx_cmpv2df3: ++ case CODE_FOR_avx_cmpv4sf3: ++ case CODE_FOR_avx_cmpv4df3: ++ case CODE_FOR_avx_cmpv8sf3: ++ case CODE_FOR_avx512f_cmpv8df3_mask: ++ case CODE_FOR_avx512f_cmpv16sf3_mask: ++ case CODE_FOR_avx512f_vmcmpv2df3_mask: ++ case CODE_FOR_avx512f_vmcmpv4sf3_mask: ++ error ("the last argument must be a 5-bit immediate"); ++ return const0_rtx; ++ ++ default: ++ switch (nargs_constant) ++ { ++ case 2: ++ if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) || ++ (!mask_pos && (nargs - i) == nargs_constant)) ++ { ++ error ("the next to last argument must be an 8-bit immediate"); ++ break; ++ } ++ /* FALLTHRU */ ++ case 1: ++ error ("the last argument must be an 8-bit immediate"); ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ return const0_rtx; ++ } ++ } ++ else ++ { ++ if (VECTOR_MODE_P (mode)) ++ op = safe_vector_operand (op, mode); ++ ++ /* If we aren't optimizing, only allow one memory operand to ++ be generated. */ ++ if (memory_operand (op, mode)) ++ num_memory++; ++ ++ op = fixup_modeless_constant (op, mode); ++ ++ if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode) ++ { ++ if (optimize || !match || num_memory > 1) ++ op = copy_to_mode_reg (mode, op); ++ } ++ else ++ { ++ op = copy_to_reg (op); ++ op = lowpart_subreg (mode, op, GET_MODE (op)); ++ } ++ } ++ ++ args[i].op = op; ++ args[i].mode = mode; ++ } ++ ++ switch (nargs) ++ { ++ case 1: ++ pat = GEN_FCN (icode) (real_target, args[0].op); ++ break; ++ case 2: ++ pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op); ++ break; ++ case 3: ++ pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op, ++ args[2].op); ++ break; ++ case 4: ++ pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op, ++ args[2].op, args[3].op); ++ break; ++ case 5: ++ pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op, ++ args[2].op, args[3].op, args[4].op); ++ break; ++ case 6: ++ pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op, ++ args[2].op, args[3].op, args[4].op, ++ args[5].op); ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ ++ if (! 
pat) ++ return 0; ++ ++ emit_insn (pat); ++ return target; ++} ++ ++/* Transform pattern of following layout: ++ (set A ++ (unspec [B C] UNSPEC_EMBEDDED_ROUNDING)) ++ ) ++ into: ++ (set (A B)) */ ++ ++static rtx ++ix86_erase_embedded_rounding (rtx pat) ++{ ++ if (GET_CODE (pat) == INSN) ++ pat = PATTERN (pat); ++ ++ gcc_assert (GET_CODE (pat) == SET); ++ rtx src = SET_SRC (pat); ++ gcc_assert (XVECLEN (src, 0) == 2); ++ rtx p0 = XVECEXP (src, 0, 0); ++ gcc_assert (GET_CODE (src) == UNSPEC ++ && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING); ++ rtx res = gen_rtx_SET (SET_DEST (pat), p0); ++ return res; ++} ++ ++/* Subroutine of ix86_expand_round_builtin to take care of comi insns ++ with rounding. */ ++static rtx ++ix86_expand_sse_comi_round (const struct builtin_description *d, ++ tree exp, rtx target) ++{ ++ rtx pat, set_dst; ++ tree arg0 = CALL_EXPR_ARG (exp, 0); ++ tree arg1 = CALL_EXPR_ARG (exp, 1); ++ tree arg2 = CALL_EXPR_ARG (exp, 2); ++ tree arg3 = CALL_EXPR_ARG (exp, 3); ++ rtx op0 = expand_normal (arg0); ++ rtx op1 = expand_normal (arg1); ++ rtx op2 = expand_normal (arg2); ++ rtx op3 = expand_normal (arg3); ++ enum insn_code icode = d->icode; ++ const struct insn_data_d *insn_p = &insn_data[icode]; ++ machine_mode mode0 = insn_p->operand[0].mode; ++ machine_mode mode1 = insn_p->operand[1].mode; ++ enum rtx_code comparison = UNEQ; ++ bool need_ucomi = false; ++ ++ /* See avxintrin.h for values. */ ++ enum rtx_code comi_comparisons[32] = ++ { ++ UNEQ, GT, GE, UNORDERED, LTGT, UNLE, UNLT, ORDERED, UNEQ, UNLT, ++ UNLE, LT, LTGT, GE, GT, LT, UNEQ, GT, GE, UNORDERED, LTGT, UNLE, ++ UNLT, ORDERED, UNEQ, UNLT, UNLE, LT, LTGT, GE, GT, LT ++ }; ++ bool need_ucomi_values[32] = ++ { ++ true, false, false, true, true, false, false, true, ++ true, false, false, true, true, false, false, true, ++ false, true, true, false, false, true, true, false, ++ false, true, true, false, false, true, true, false ++ }; ++ ++ if (!CONST_INT_P (op2)) ++ { ++ error ("the third argument must be comparison constant"); ++ return const0_rtx; ++ } ++ if (INTVAL (op2) < 0 || INTVAL (op2) >= 32) ++ { ++ error ("incorrect comparison mode"); ++ return const0_rtx; ++ } ++ ++ if (!insn_p->operand[2].predicate (op3, SImode)) ++ { ++ error ("incorrect rounding operand"); ++ return const0_rtx; ++ } ++ ++ comparison = comi_comparisons[INTVAL (op2)]; ++ need_ucomi = need_ucomi_values[INTVAL (op2)]; ++ ++ if (VECTOR_MODE_P (mode0)) ++ op0 = safe_vector_operand (op0, mode0); ++ if (VECTOR_MODE_P (mode1)) ++ op1 = safe_vector_operand (op1, mode1); ++ ++ target = gen_reg_rtx (SImode); ++ emit_move_insn (target, const0_rtx); ++ target = gen_rtx_SUBREG (QImode, target, 0); ++ ++ if ((optimize && !register_operand (op0, mode0)) ++ || !insn_p->operand[0].predicate (op0, mode0)) ++ op0 = copy_to_mode_reg (mode0, op0); ++ if ((optimize && !register_operand (op1, mode1)) ++ || !insn_p->operand[1].predicate (op1, mode1)) ++ op1 = copy_to_mode_reg (mode1, op1); ++ ++ if (need_ucomi) ++ icode = icode == CODE_FOR_sse_comi_round ++ ? CODE_FOR_sse_ucomi_round ++ : CODE_FOR_sse2_ucomi_round; ++ ++ pat = GEN_FCN (icode) (op0, op1, op3); ++ if (! pat) ++ return 0; ++ ++ /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */ ++ if (INTVAL (op3) == NO_ROUND) ++ { ++ pat = ix86_erase_embedded_rounding (pat); ++ if (! 
pat) ++ return 0; ++ ++ set_dst = SET_DEST (pat); ++ } ++ else ++ { ++ gcc_assert (GET_CODE (pat) == SET); ++ set_dst = SET_DEST (pat); ++ } ++ ++ emit_insn (pat); ++ emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target), ++ gen_rtx_fmt_ee (comparison, QImode, ++ set_dst, ++ const0_rtx))); ++ ++ return SUBREG_REG (target); ++} ++ ++static rtx ++ix86_expand_round_builtin (const struct builtin_description *d, ++ tree exp, rtx target) ++{ ++ rtx pat; ++ unsigned int i, nargs; ++ struct ++ { ++ rtx op; ++ machine_mode mode; ++ } args[6]; ++ enum insn_code icode = d->icode; ++ const struct insn_data_d *insn_p = &insn_data[icode]; ++ machine_mode tmode = insn_p->operand[0].mode; ++ unsigned int nargs_constant = 0; ++ unsigned int redundant_embed_rnd = 0; ++ ++ switch ((enum ix86_builtin_func_type) d->flag) ++ { ++ case UINT64_FTYPE_V2DF_INT: ++ case UINT64_FTYPE_V4SF_INT: ++ case UINT_FTYPE_V2DF_INT: ++ case UINT_FTYPE_V4SF_INT: ++ case INT64_FTYPE_V2DF_INT: ++ case INT64_FTYPE_V4SF_INT: ++ case INT_FTYPE_V2DF_INT: ++ case INT_FTYPE_V4SF_INT: ++ nargs = 2; ++ break; ++ case V4SF_FTYPE_V4SF_UINT_INT: ++ case V4SF_FTYPE_V4SF_UINT64_INT: ++ case V2DF_FTYPE_V2DF_UINT64_INT: ++ case V4SF_FTYPE_V4SF_INT_INT: ++ case V4SF_FTYPE_V4SF_INT64_INT: ++ case V2DF_FTYPE_V2DF_INT64_INT: ++ case V4SF_FTYPE_V4SF_V4SF_INT: ++ case V2DF_FTYPE_V2DF_V2DF_INT: ++ case V4SF_FTYPE_V4SF_V2DF_INT: ++ case V2DF_FTYPE_V2DF_V4SF_INT: ++ nargs = 3; ++ break; ++ case V8SF_FTYPE_V8DF_V8SF_QI_INT: ++ case V8DF_FTYPE_V8DF_V8DF_QI_INT: ++ case V8SI_FTYPE_V8DF_V8SI_QI_INT: ++ case V8DI_FTYPE_V8DF_V8DI_QI_INT: ++ case V8SF_FTYPE_V8DI_V8SF_QI_INT: ++ case V8DF_FTYPE_V8DI_V8DF_QI_INT: ++ case V16SF_FTYPE_V16SF_V16SF_HI_INT: ++ case V8DI_FTYPE_V8SF_V8DI_QI_INT: ++ case V16SF_FTYPE_V16SI_V16SF_HI_INT: ++ case V16SI_FTYPE_V16SF_V16SI_HI_INT: ++ case V8DF_FTYPE_V8SF_V8DF_QI_INT: ++ case V16SF_FTYPE_V16HI_V16SF_HI_INT: ++ case V2DF_FTYPE_V2DF_V2DF_V2DF_INT: ++ case V4SF_FTYPE_V4SF_V4SF_V4SF_INT: ++ nargs = 4; ++ break; ++ case V4SF_FTYPE_V4SF_V4SF_INT_INT: ++ case V2DF_FTYPE_V2DF_V2DF_INT_INT: ++ nargs_constant = 2; ++ nargs = 4; ++ break; ++ case INT_FTYPE_V4SF_V4SF_INT_INT: ++ case INT_FTYPE_V2DF_V2DF_INT_INT: ++ return ix86_expand_sse_comi_round (d, exp, target); ++ case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT: ++ case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT: ++ case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT: ++ case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT: ++ case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT: ++ case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT: ++ case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT: ++ case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT: ++ nargs = 5; ++ break; ++ case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT: ++ case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT: ++ nargs_constant = 4; ++ nargs = 5; ++ break; ++ case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT: ++ case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT: ++ case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT: ++ case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT: ++ nargs_constant = 3; ++ nargs = 5; ++ break; ++ case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT: ++ case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT: ++ case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT: ++ case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT: ++ case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT: ++ case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT: ++ nargs = 6; ++ nargs_constant = 4; ++ break; ++ case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT: ++ case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT: ++ case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT: ++ case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT: ++ nargs = 6; ++ nargs_constant = 3; ++ break; ++ 
default: ++ gcc_unreachable (); ++ } ++ gcc_assert (nargs <= ARRAY_SIZE (args)); ++ ++ if (optimize ++ || target == 0 ++ || GET_MODE (target) != tmode ++ || !insn_p->operand[0].predicate (target, tmode)) ++ target = gen_reg_rtx (tmode); ++ ++ for (i = 0; i < nargs; i++) ++ { ++ tree arg = CALL_EXPR_ARG (exp, i); ++ rtx op = expand_normal (arg); ++ machine_mode mode = insn_p->operand[i + 1].mode; ++ bool match = insn_p->operand[i + 1].predicate (op, mode); ++ ++ if (i == nargs - nargs_constant) ++ { ++ if (!match) ++ { ++ switch (icode) ++ { ++ case CODE_FOR_avx512f_getmantv8df_mask_round: ++ case CODE_FOR_avx512f_getmantv16sf_mask_round: ++ case CODE_FOR_avx512f_vgetmantv2df_round: ++ case CODE_FOR_avx512f_vgetmantv2df_mask_round: ++ case CODE_FOR_avx512f_vgetmantv4sf_round: ++ case CODE_FOR_avx512f_vgetmantv4sf_mask_round: ++ error ("the immediate argument must be a 4-bit immediate"); ++ return const0_rtx; ++ case CODE_FOR_avx512f_cmpv8df3_mask_round: ++ case CODE_FOR_avx512f_cmpv16sf3_mask_round: ++ case CODE_FOR_avx512f_vmcmpv2df3_mask_round: ++ case CODE_FOR_avx512f_vmcmpv4sf3_mask_round: ++ error ("the immediate argument must be a 5-bit immediate"); ++ return const0_rtx; ++ default: ++ error ("the immediate argument must be an 8-bit immediate"); ++ return const0_rtx; ++ } ++ } ++ } ++ else if (i == nargs-1) ++ { ++ if (!insn_p->operand[nargs].predicate (op, SImode)) ++ { ++ error ("incorrect rounding operand"); ++ return const0_rtx; ++ } ++ ++ /* If there is no rounding use normal version of the pattern. */ ++ if (INTVAL (op) == NO_ROUND) ++ redundant_embed_rnd = 1; ++ } ++ else ++ { ++ if (VECTOR_MODE_P (mode)) ++ op = safe_vector_operand (op, mode); ++ ++ op = fixup_modeless_constant (op, mode); ++ ++ if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode) ++ { ++ if (optimize || !match) ++ op = copy_to_mode_reg (mode, op); ++ } ++ else ++ { ++ op = copy_to_reg (op); ++ op = lowpart_subreg (mode, op, GET_MODE (op)); ++ } ++ } ++ ++ args[i].op = op; ++ args[i].mode = mode; ++ } ++ ++ switch (nargs) ++ { ++ case 1: ++ pat = GEN_FCN (icode) (target, args[0].op); ++ break; ++ case 2: ++ pat = GEN_FCN (icode) (target, args[0].op, args[1].op); ++ break; ++ case 3: ++ pat = GEN_FCN (icode) (target, args[0].op, args[1].op, ++ args[2].op); ++ break; ++ case 4: ++ pat = GEN_FCN (icode) (target, args[0].op, args[1].op, ++ args[2].op, args[3].op); ++ break; ++ case 5: ++ pat = GEN_FCN (icode) (target, args[0].op, args[1].op, ++ args[2].op, args[3].op, args[4].op); ++ break; ++ case 6: ++ pat = GEN_FCN (icode) (target, args[0].op, args[1].op, ++ args[2].op, args[3].op, args[4].op, ++ args[5].op); ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ ++ if (!pat) ++ return 0; ++ ++ if (redundant_embed_rnd) ++ pat = ix86_erase_embedded_rounding (pat); ++ ++ emit_insn (pat); ++ return target; ++} ++ ++/* Subroutine of ix86_expand_builtin to take care of special insns ++ with variable number of operands. 
*/ ++ ++static rtx ++ix86_expand_special_args_builtin (const struct builtin_description *d, ++ tree exp, rtx target) ++{ ++ tree arg; ++ rtx pat, op; ++ unsigned int i, nargs, arg_adjust, memory; ++ bool aligned_mem = false; ++ struct ++ { ++ rtx op; ++ machine_mode mode; ++ } args[3]; ++ enum insn_code icode = d->icode; ++ bool last_arg_constant = false; ++ const struct insn_data_d *insn_p = &insn_data[icode]; ++ machine_mode tmode = insn_p->operand[0].mode; ++ enum { load, store } klass; ++ ++ switch ((enum ix86_builtin_func_type) d->flag) ++ { ++ case VOID_FTYPE_VOID: ++ emit_insn (GEN_FCN (icode) (target)); ++ return 0; ++ case VOID_FTYPE_UINT64: ++ case VOID_FTYPE_UNSIGNED: ++ nargs = 0; ++ klass = store; ++ memory = 0; ++ break; ++ ++ case INT_FTYPE_VOID: ++ case USHORT_FTYPE_VOID: ++ case UINT64_FTYPE_VOID: ++ case UINT_FTYPE_VOID: ++ case UNSIGNED_FTYPE_VOID: ++ nargs = 0; ++ klass = load; ++ memory = 0; ++ break; ++ case UINT64_FTYPE_PUNSIGNED: ++ case V2DI_FTYPE_PV2DI: ++ case V4DI_FTYPE_PV4DI: ++ case V32QI_FTYPE_PCCHAR: ++ case V16QI_FTYPE_PCCHAR: ++ case V8SF_FTYPE_PCV4SF: ++ case V8SF_FTYPE_PCFLOAT: ++ case V4SF_FTYPE_PCFLOAT: ++ case V4DF_FTYPE_PCV2DF: ++ case V4DF_FTYPE_PCDOUBLE: ++ case V2DF_FTYPE_PCDOUBLE: ++ case VOID_FTYPE_PVOID: ++ case V8DI_FTYPE_PV8DI: ++ nargs = 1; ++ klass = load; ++ memory = 0; ++ switch (icode) ++ { ++ case CODE_FOR_sse4_1_movntdqa: ++ case CODE_FOR_avx2_movntdqa: ++ case CODE_FOR_avx512f_movntdqa: ++ aligned_mem = true; ++ break; ++ default: ++ break; ++ } ++ break; ++ case VOID_FTYPE_PV2SF_V4SF: ++ case VOID_FTYPE_PV8DI_V8DI: ++ case VOID_FTYPE_PV4DI_V4DI: ++ case VOID_FTYPE_PV2DI_V2DI: ++ case VOID_FTYPE_PCHAR_V32QI: ++ case VOID_FTYPE_PCHAR_V16QI: ++ case VOID_FTYPE_PFLOAT_V16SF: ++ case VOID_FTYPE_PFLOAT_V8SF: ++ case VOID_FTYPE_PFLOAT_V4SF: ++ case VOID_FTYPE_PDOUBLE_V8DF: ++ case VOID_FTYPE_PDOUBLE_V4DF: ++ case VOID_FTYPE_PDOUBLE_V2DF: ++ case VOID_FTYPE_PLONGLONG_LONGLONG: ++ case VOID_FTYPE_PULONGLONG_ULONGLONG: ++ case VOID_FTYPE_PUNSIGNED_UNSIGNED: ++ case VOID_FTYPE_PINT_INT: ++ nargs = 1; ++ klass = store; ++ /* Reserve memory operand for target. */ ++ memory = ARRAY_SIZE (args); ++ switch (icode) ++ { ++ /* These builtins and instructions require the memory ++ to be properly aligned. 
*/ ++ case CODE_FOR_avx_movntv4di: ++ case CODE_FOR_sse2_movntv2di: ++ case CODE_FOR_avx_movntv8sf: ++ case CODE_FOR_sse_movntv4sf: ++ case CODE_FOR_sse4a_vmmovntv4sf: ++ case CODE_FOR_avx_movntv4df: ++ case CODE_FOR_sse2_movntv2df: ++ case CODE_FOR_sse4a_vmmovntv2df: ++ case CODE_FOR_sse2_movntidi: ++ case CODE_FOR_sse_movntq: ++ case CODE_FOR_sse2_movntisi: ++ case CODE_FOR_avx512f_movntv16sf: ++ case CODE_FOR_avx512f_movntv8df: ++ case CODE_FOR_avx512f_movntv8di: ++ aligned_mem = true; ++ break; ++ default: ++ break; ++ } ++ break; ++ case VOID_FTYPE_PVOID_PCVOID: ++ nargs = 1; ++ klass = store; ++ memory = 0; ++ ++ break; ++ case V4SF_FTYPE_V4SF_PCV2SF: ++ case V2DF_FTYPE_V2DF_PCDOUBLE: ++ nargs = 2; ++ klass = load; ++ memory = 1; ++ break; ++ case V8SF_FTYPE_PCV8SF_V8SI: ++ case V4DF_FTYPE_PCV4DF_V4DI: ++ case V4SF_FTYPE_PCV4SF_V4SI: ++ case V2DF_FTYPE_PCV2DF_V2DI: ++ case V8SI_FTYPE_PCV8SI_V8SI: ++ case V4DI_FTYPE_PCV4DI_V4DI: ++ case V4SI_FTYPE_PCV4SI_V4SI: ++ case V2DI_FTYPE_PCV2DI_V2DI: ++ case VOID_FTYPE_INT_INT64: ++ nargs = 2; ++ klass = load; ++ memory = 0; ++ break; ++ case VOID_FTYPE_PV8DF_V8DF_UQI: ++ case VOID_FTYPE_PV4DF_V4DF_UQI: ++ case VOID_FTYPE_PV2DF_V2DF_UQI: ++ case VOID_FTYPE_PV16SF_V16SF_UHI: ++ case VOID_FTYPE_PV8SF_V8SF_UQI: ++ case VOID_FTYPE_PV4SF_V4SF_UQI: ++ case VOID_FTYPE_PV8DI_V8DI_UQI: ++ case VOID_FTYPE_PV4DI_V4DI_UQI: ++ case VOID_FTYPE_PV2DI_V2DI_UQI: ++ case VOID_FTYPE_PV16SI_V16SI_UHI: ++ case VOID_FTYPE_PV8SI_V8SI_UQI: ++ case VOID_FTYPE_PV4SI_V4SI_UQI: ++ case VOID_FTYPE_PV64QI_V64QI_UDI: ++ case VOID_FTYPE_PV32HI_V32HI_USI: ++ case VOID_FTYPE_PV32QI_V32QI_USI: ++ case VOID_FTYPE_PV16QI_V16QI_UHI: ++ case VOID_FTYPE_PV16HI_V16HI_UHI: ++ case VOID_FTYPE_PV8HI_V8HI_UQI: ++ switch (icode) ++ { ++ /* These builtins and instructions require the memory ++ to be properly aligned. 
*/ ++ case CODE_FOR_avx512f_storev16sf_mask: ++ case CODE_FOR_avx512f_storev16si_mask: ++ case CODE_FOR_avx512f_storev8df_mask: ++ case CODE_FOR_avx512f_storev8di_mask: ++ case CODE_FOR_avx512vl_storev8sf_mask: ++ case CODE_FOR_avx512vl_storev8si_mask: ++ case CODE_FOR_avx512vl_storev4df_mask: ++ case CODE_FOR_avx512vl_storev4di_mask: ++ case CODE_FOR_avx512vl_storev4sf_mask: ++ case CODE_FOR_avx512vl_storev4si_mask: ++ case CODE_FOR_avx512vl_storev2df_mask: ++ case CODE_FOR_avx512vl_storev2di_mask: ++ aligned_mem = true; ++ break; ++ default: ++ break; ++ } ++ /* FALLTHRU */ ++ case VOID_FTYPE_PV8SF_V8SI_V8SF: ++ case VOID_FTYPE_PV4DF_V4DI_V4DF: ++ case VOID_FTYPE_PV4SF_V4SI_V4SF: ++ case VOID_FTYPE_PV2DF_V2DI_V2DF: ++ case VOID_FTYPE_PV8SI_V8SI_V8SI: ++ case VOID_FTYPE_PV4DI_V4DI_V4DI: ++ case VOID_FTYPE_PV4SI_V4SI_V4SI: ++ case VOID_FTYPE_PV2DI_V2DI_V2DI: ++ case VOID_FTYPE_PV8SI_V8DI_UQI: ++ case VOID_FTYPE_PV8HI_V8DI_UQI: ++ case VOID_FTYPE_PV16HI_V16SI_UHI: ++ case VOID_FTYPE_PV16QI_V8DI_UQI: ++ case VOID_FTYPE_PV16QI_V16SI_UHI: ++ case VOID_FTYPE_PV4SI_V4DI_UQI: ++ case VOID_FTYPE_PV4SI_V2DI_UQI: ++ case VOID_FTYPE_PV8HI_V4DI_UQI: ++ case VOID_FTYPE_PV8HI_V2DI_UQI: ++ case VOID_FTYPE_PV8HI_V8SI_UQI: ++ case VOID_FTYPE_PV8HI_V4SI_UQI: ++ case VOID_FTYPE_PV16QI_V4DI_UQI: ++ case VOID_FTYPE_PV16QI_V2DI_UQI: ++ case VOID_FTYPE_PV16QI_V8SI_UQI: ++ case VOID_FTYPE_PV16QI_V4SI_UQI: ++ case VOID_FTYPE_PCHAR_V64QI_UDI: ++ case VOID_FTYPE_PCHAR_V32QI_USI: ++ case VOID_FTYPE_PCHAR_V16QI_UHI: ++ case VOID_FTYPE_PSHORT_V32HI_USI: ++ case VOID_FTYPE_PSHORT_V16HI_UHI: ++ case VOID_FTYPE_PSHORT_V8HI_UQI: ++ case VOID_FTYPE_PINT_V16SI_UHI: ++ case VOID_FTYPE_PINT_V8SI_UQI: ++ case VOID_FTYPE_PINT_V4SI_UQI: ++ case VOID_FTYPE_PINT64_V8DI_UQI: ++ case VOID_FTYPE_PINT64_V4DI_UQI: ++ case VOID_FTYPE_PINT64_V2DI_UQI: ++ case VOID_FTYPE_PDOUBLE_V8DF_UQI: ++ case VOID_FTYPE_PDOUBLE_V4DF_UQI: ++ case VOID_FTYPE_PDOUBLE_V2DF_UQI: ++ case VOID_FTYPE_PFLOAT_V16SF_UHI: ++ case VOID_FTYPE_PFLOAT_V8SF_UQI: ++ case VOID_FTYPE_PFLOAT_V4SF_UQI: ++ case VOID_FTYPE_PV32QI_V32HI_USI: ++ case VOID_FTYPE_PV16QI_V16HI_UHI: ++ case VOID_FTYPE_PV8QI_V8HI_UQI: ++ nargs = 2; ++ klass = store; ++ /* Reserve memory operand for target. */ ++ memory = ARRAY_SIZE (args); ++ break; ++ case V4SF_FTYPE_PCV4SF_V4SF_UQI: ++ case V8SF_FTYPE_PCV8SF_V8SF_UQI: ++ case V16SF_FTYPE_PCV16SF_V16SF_UHI: ++ case V4SI_FTYPE_PCV4SI_V4SI_UQI: ++ case V8SI_FTYPE_PCV8SI_V8SI_UQI: ++ case V16SI_FTYPE_PCV16SI_V16SI_UHI: ++ case V2DF_FTYPE_PCV2DF_V2DF_UQI: ++ case V4DF_FTYPE_PCV4DF_V4DF_UQI: ++ case V8DF_FTYPE_PCV8DF_V8DF_UQI: ++ case V2DI_FTYPE_PCV2DI_V2DI_UQI: ++ case V4DI_FTYPE_PCV4DI_V4DI_UQI: ++ case V8DI_FTYPE_PCV8DI_V8DI_UQI: ++ case V64QI_FTYPE_PCV64QI_V64QI_UDI: ++ case V32HI_FTYPE_PCV32HI_V32HI_USI: ++ case V32QI_FTYPE_PCV32QI_V32QI_USI: ++ case V16QI_FTYPE_PCV16QI_V16QI_UHI: ++ case V16HI_FTYPE_PCV16HI_V16HI_UHI: ++ case V8HI_FTYPE_PCV8HI_V8HI_UQI: ++ switch (icode) ++ { ++ /* These builtins and instructions require the memory ++ to be properly aligned. 
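Likewise for the masked loads handled next (user-level sketch, not from the patch itself; assumes <immintrin.h>, -mavx512f at compile time and AVX-512 hardware): the aligned load builtins listed below expect the source to carry the vector mode's natural alignment, e.g. 64 bytes for the 512-bit forms, while the unaligned loadu variants do not:

    #include <immintrin.h>
    #include <stdio.h>

    int
    main (void)
    {
      static double src[8] __attribute__ ((aligned (64))) =
        { 1, 2, 3, 4, 5, 6, 7, 8 };
      __m512d fallthru = _mm512_set1_pd (0.0);
      /* Mask 0x55 loads the even elements; the rest come from FALLTHRU.  */
      __m512d v = _mm512_mask_load_pd (fallthru, (__mmask8) 0x55, src);
      double out[8];
      _mm512_storeu_pd (out, v);
      printf ("%g %g\n", out[0], out[1]);   /* prints: 1 0 */
      return 0;
    }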
*/ ++ case CODE_FOR_avx512f_loadv16sf_mask: ++ case CODE_FOR_avx512f_loadv16si_mask: ++ case CODE_FOR_avx512f_loadv8df_mask: ++ case CODE_FOR_avx512f_loadv8di_mask: ++ case CODE_FOR_avx512vl_loadv8sf_mask: ++ case CODE_FOR_avx512vl_loadv8si_mask: ++ case CODE_FOR_avx512vl_loadv4df_mask: ++ case CODE_FOR_avx512vl_loadv4di_mask: ++ case CODE_FOR_avx512vl_loadv4sf_mask: ++ case CODE_FOR_avx512vl_loadv4si_mask: ++ case CODE_FOR_avx512vl_loadv2df_mask: ++ case CODE_FOR_avx512vl_loadv2di_mask: ++ case CODE_FOR_avx512bw_loadv64qi_mask: ++ case CODE_FOR_avx512vl_loadv32qi_mask: ++ case CODE_FOR_avx512vl_loadv16qi_mask: ++ case CODE_FOR_avx512bw_loadv32hi_mask: ++ case CODE_FOR_avx512vl_loadv16hi_mask: ++ case CODE_FOR_avx512vl_loadv8hi_mask: ++ aligned_mem = true; ++ break; ++ default: ++ break; ++ } ++ /* FALLTHRU */ ++ case V64QI_FTYPE_PCCHAR_V64QI_UDI: ++ case V32QI_FTYPE_PCCHAR_V32QI_USI: ++ case V16QI_FTYPE_PCCHAR_V16QI_UHI: ++ case V32HI_FTYPE_PCSHORT_V32HI_USI: ++ case V16HI_FTYPE_PCSHORT_V16HI_UHI: ++ case V8HI_FTYPE_PCSHORT_V8HI_UQI: ++ case V16SI_FTYPE_PCINT_V16SI_UHI: ++ case V8SI_FTYPE_PCINT_V8SI_UQI: ++ case V4SI_FTYPE_PCINT_V4SI_UQI: ++ case V8DI_FTYPE_PCINT64_V8DI_UQI: ++ case V4DI_FTYPE_PCINT64_V4DI_UQI: ++ case V2DI_FTYPE_PCINT64_V2DI_UQI: ++ case V8DF_FTYPE_PCDOUBLE_V8DF_UQI: ++ case V4DF_FTYPE_PCDOUBLE_V4DF_UQI: ++ case V2DF_FTYPE_PCDOUBLE_V2DF_UQI: ++ case V16SF_FTYPE_PCFLOAT_V16SF_UHI: ++ case V8SF_FTYPE_PCFLOAT_V8SF_UQI: ++ case V4SF_FTYPE_PCFLOAT_V4SF_UQI: ++ nargs = 3; ++ klass = load; ++ memory = 0; ++ break; ++ case VOID_FTYPE_UINT_UINT_UINT: ++ case VOID_FTYPE_UINT64_UINT_UINT: ++ case UCHAR_FTYPE_UINT_UINT_UINT: ++ case UCHAR_FTYPE_UINT64_UINT_UINT: ++ nargs = 3; ++ klass = load; ++ memory = ARRAY_SIZE (args); ++ last_arg_constant = true; ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ ++ gcc_assert (nargs <= ARRAY_SIZE (args)); ++ ++ if (klass == store) ++ { ++ arg = CALL_EXPR_ARG (exp, 0); ++ op = expand_normal (arg); ++ gcc_assert (target == 0); ++ if (memory) ++ { ++ op = ix86_zero_extend_to_Pmode (op); ++ target = gen_rtx_MEM (tmode, op); ++ /* target at this point has just BITS_PER_UNIT MEM_ALIGN ++ on it. Try to improve it using get_pointer_alignment, ++ and if the special builtin is one that requires strict ++ mode alignment, also from it's GET_MODE_ALIGNMENT. ++ Failure to do so could lead to ix86_legitimate_combined_insn ++ rejecting all changes to such insns. 
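A small plain-C sketch (not from the patch itself) of the alignment decision the comment above describes: keep whatever get_pointer_alignment can prove, and for strict-alignment builtins raise it to the mode's natural alignment before set_mem_align is considered:

    #include <stdio.h>

    /* ptr_align stands in for get_pointer_alignment, mode_align for
       GET_MODE_ALIGNMENT and strict_p for the aligned_mem flag;
       values are in bits, as for MEM_ALIGN.  */
    static unsigned int
    choose_mem_align (unsigned int ptr_align, unsigned int mode_align,
                      int strict_p)
    {
      unsigned int align = ptr_align;
      if (strict_p && align < mode_align)
        align = mode_align;
      return align;
    }

    int
    main (void)
    {
      printf ("%u\n", choose_mem_align (8, 256, 1));   /* movnt-style store: 256 */
      printf ("%u\n", choose_mem_align (8, 256, 0));   /* ordinary store: 8 */
      return 0;
    }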
*/ ++ unsigned int align = get_pointer_alignment (arg); ++ if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode)) ++ align = GET_MODE_ALIGNMENT (tmode); ++ if (MEM_ALIGN (target) < align) ++ set_mem_align (target, align); ++ } ++ else ++ target = force_reg (tmode, op); ++ arg_adjust = 1; ++ } ++ else ++ { ++ arg_adjust = 0; ++ if (optimize ++ || target == 0 ++ || !register_operand (target, tmode) ++ || GET_MODE (target) != tmode) ++ target = gen_reg_rtx (tmode); ++ } ++ ++ for (i = 0; i < nargs; i++) ++ { ++ machine_mode mode = insn_p->operand[i + 1].mode; ++ bool match; ++ ++ arg = CALL_EXPR_ARG (exp, i + arg_adjust); ++ op = expand_normal (arg); ++ match = insn_p->operand[i + 1].predicate (op, mode); ++ ++ if (last_arg_constant && (i + 1) == nargs) ++ { ++ if (!match) ++ { ++ if (icode == CODE_FOR_lwp_lwpvalsi3 ++ || icode == CODE_FOR_lwp_lwpinssi3 ++ || icode == CODE_FOR_lwp_lwpvaldi3 ++ || icode == CODE_FOR_lwp_lwpinsdi3) ++ error ("the last argument must be a 32-bit immediate"); ++ else ++ error ("the last argument must be an 8-bit immediate"); ++ return const0_rtx; ++ } ++ } ++ else ++ { ++ if (i == memory) ++ { ++ /* This must be the memory operand. */ ++ op = ix86_zero_extend_to_Pmode (op); ++ op = gen_rtx_MEM (mode, op); ++ /* op at this point has just BITS_PER_UNIT MEM_ALIGN ++ on it. Try to improve it using get_pointer_alignment, ++ and if the special builtin is one that requires strict ++ mode alignment, also from it's GET_MODE_ALIGNMENT. ++ Failure to do so could lead to ix86_legitimate_combined_insn ++ rejecting all changes to such insns. */ ++ unsigned int align = get_pointer_alignment (arg); ++ if (aligned_mem && align < GET_MODE_ALIGNMENT (mode)) ++ align = GET_MODE_ALIGNMENT (mode); ++ if (MEM_ALIGN (op) < align) ++ set_mem_align (op, align); ++ } ++ else ++ { ++ /* This must be register. */ ++ if (VECTOR_MODE_P (mode)) ++ op = safe_vector_operand (op, mode); ++ ++ op = fixup_modeless_constant (op, mode); ++ ++ if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode) ++ op = copy_to_mode_reg (mode, op); ++ else ++ { ++ op = copy_to_reg (op); ++ op = lowpart_subreg (mode, op, GET_MODE (op)); ++ } ++ } ++ } ++ ++ args[i].op = op; ++ args[i].mode = mode; ++ } ++ ++ switch (nargs) ++ { ++ case 0: ++ pat = GEN_FCN (icode) (target); ++ break; ++ case 1: ++ pat = GEN_FCN (icode) (target, args[0].op); ++ break; ++ case 2: ++ pat = GEN_FCN (icode) (target, args[0].op, args[1].op); ++ break; ++ case 3: ++ pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op); ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ ++ if (! pat) ++ return 0; ++ emit_insn (pat); ++ return klass == store ? 0 : target; ++} ++ ++/* Return the integer constant in ARG. Constrain it to be in the range ++ of the subparts of VEC_TYPE; issue an error if not. */ ++ ++static int ++get_element_number (tree vec_type, tree arg) ++{ ++ unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1; ++ ++ if (!tree_fits_uhwi_p (arg) ++ || (elt = tree_to_uhwi (arg), elt > max)) ++ { ++ error ("selector must be an integer constant in the range " ++ "[0, %wi]", max); ++ return 0; ++ } ++ ++ return elt; ++} ++ ++/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around ++ ix86_expand_vector_init. We DO have language-level syntax for this, in ++ the form of (type){ init-list }. Except that since we can't place emms ++ instructions from inside the compiler, we can't allow the use of MMX ++ registers unless the user explicitly asks for it. 
So we do *not* define ++ vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead ++ we have builtins invoked by mmintrin.h that gives us license to emit ++ these sorts of instructions. */ ++ ++static rtx ++ix86_expand_vec_init_builtin (tree type, tree exp, rtx target) ++{ ++ machine_mode tmode = TYPE_MODE (type); ++ machine_mode inner_mode = GET_MODE_INNER (tmode); ++ int i, n_elt = GET_MODE_NUNITS (tmode); ++ rtvec v = rtvec_alloc (n_elt); ++ ++ gcc_assert (VECTOR_MODE_P (tmode)); ++ gcc_assert (call_expr_nargs (exp) == n_elt); ++ ++ for (i = 0; i < n_elt; ++i) ++ { ++ rtx x = expand_normal (CALL_EXPR_ARG (exp, i)); ++ RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x); ++ } ++ ++ if (!target || !register_operand (target, tmode)) ++ target = gen_reg_rtx (tmode); ++ ++ ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v)); ++ return target; ++} ++ ++/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around ++ ix86_expand_vector_extract. They would be redundant (for non-MMX) if we ++ had a language-level syntax for referencing vector elements. */ ++ ++static rtx ++ix86_expand_vec_ext_builtin (tree exp, rtx target) ++{ ++ machine_mode tmode, mode0; ++ tree arg0, arg1; ++ int elt; ++ rtx op0; ++ ++ arg0 = CALL_EXPR_ARG (exp, 0); ++ arg1 = CALL_EXPR_ARG (exp, 1); ++ ++ op0 = expand_normal (arg0); ++ elt = get_element_number (TREE_TYPE (arg0), arg1); ++ ++ tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0))); ++ mode0 = TYPE_MODE (TREE_TYPE (arg0)); ++ gcc_assert (VECTOR_MODE_P (mode0)); ++ ++ op0 = force_reg (mode0, op0); ++ ++ if (optimize || !target || !register_operand (target, tmode)) ++ target = gen_reg_rtx (tmode); ++ ++ ix86_expand_vector_extract (true, target, op0, elt); ++ ++ return target; ++} ++ ++/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around ++ ix86_expand_vector_set. They would be redundant (for non-MMX) if we had ++ a language-level syntax for referencing vector elements. */ ++ ++static rtx ++ix86_expand_vec_set_builtin (tree exp) ++{ ++ machine_mode tmode, mode1; ++ tree arg0, arg1, arg2; ++ int elt; ++ rtx op0, op1, target; ++ ++ arg0 = CALL_EXPR_ARG (exp, 0); ++ arg1 = CALL_EXPR_ARG (exp, 1); ++ arg2 = CALL_EXPR_ARG (exp, 2); ++ ++ tmode = TYPE_MODE (TREE_TYPE (arg0)); ++ mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0))); ++ gcc_assert (VECTOR_MODE_P (tmode)); ++ ++ op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL); ++ op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL); ++ elt = get_element_number (TREE_TYPE (arg0), arg2); ++ ++ if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode) ++ op1 = convert_modes (mode1, GET_MODE (op1), op1, true); ++ ++ op0 = force_reg (tmode, op0); ++ op1 = force_reg (mode1, op1); ++ ++ /* OP0 is the source of these builtin functions and shouldn't be ++ modified. Create a copy, use it and return it as target. */ ++ target = gen_reg_rtx (tmode); ++ emit_move_insn (target, op0); ++ ix86_expand_vector_set (true, target, op1, elt); ++ ++ return target; ++} ++ ++/* Expand an expression EXP that calls a built-in function, ++ with result going to TARGET if that's convenient ++ (and in mode MODE if that's convenient). ++ SUBTARGET may be used as the target for computing one of EXP's operands. ++ IGNORE is nonzero if the value is to be ignored. 
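For illustration (not from the patch itself; NELTS stands in for TYPE_VECTOR_SUBPARTS), a plain-C sketch of the selector check get_element_number performs for the vec_ext/vec_set wrappers:

    #include <stdio.h>

    static int
    element_number (unsigned long long elt, unsigned long long nelts)
    {
      if (elt > nelts - 1)
        {
          fprintf (stderr,
                   "selector must be an integer constant in the range [0, %llu]\n",
                   nelts - 1);
          return 0;               /* the expander falls back to element 0 */
        }
      return (int) elt;
    }

    int
    main (void)
    {
      printf ("%d\n", element_number (3, 4));   /* valid: 3 */
      printf ("%d\n", element_number (4, 4));   /* out of range: error, 0 */
      return 0;
    }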
*/ ++ ++rtx ++ix86_expand_builtin (tree exp, rtx target, rtx subtarget, ++ machine_mode mode, int ignore) ++{ ++ size_t i; ++ enum insn_code icode, icode2; ++ tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0); ++ tree arg0, arg1, arg2, arg3, arg4; ++ rtx op0, op1, op2, op3, op4, pat, pat2, insn; ++ machine_mode mode0, mode1, mode2, mode3, mode4; ++ unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl); ++ ++ /* For CPU builtins that can be folded, fold first and expand the fold. */ ++ switch (fcode) ++ { ++ case IX86_BUILTIN_CPU_INIT: ++ { ++ /* Make it call __cpu_indicator_init in libgcc. */ ++ tree call_expr, fndecl, type; ++ type = build_function_type_list (integer_type_node, NULL_TREE); ++ fndecl = build_fn_decl ("__cpu_indicator_init", type); ++ call_expr = build_call_expr (fndecl, 0); ++ return expand_expr (call_expr, target, mode, EXPAND_NORMAL); ++ } ++ case IX86_BUILTIN_CPU_IS: ++ case IX86_BUILTIN_CPU_SUPPORTS: ++ { ++ tree arg0 = CALL_EXPR_ARG (exp, 0); ++ tree fold_expr = fold_builtin_cpu (fndecl, &arg0); ++ gcc_assert (fold_expr != NULL_TREE); ++ return expand_expr (fold_expr, target, mode, EXPAND_NORMAL); ++ } ++ } ++ ++ HOST_WIDE_INT isa = ix86_isa_flags; ++ HOST_WIDE_INT isa2 = ix86_isa_flags2; ++ HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa; ++ HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2; ++ /* The general case is we require all the ISAs specified in bisa{,2} ++ to be enabled. ++ The exceptions are: ++ OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A ++ OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 ++ OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4 ++ where for each this pair it is sufficient if either of the ISAs is ++ enabled, plus if it is ored with other options also those others. */ ++ if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) ++ == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) ++ && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0) ++ isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A); ++ if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) ++ == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) ++ && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0) ++ isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32); ++ if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) ++ == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) ++ && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0) ++ isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4); ++ if ((bisa & isa) != bisa || (bisa2 & isa2) != bisa2) ++ { ++ bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT; ++ if (TARGET_ABI_X32) ++ bisa |= OPTION_MASK_ABI_X32; ++ else ++ bisa |= OPTION_MASK_ABI_64; ++ char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL, ++ (enum fpmath_unit) 0, false, add_abi_p); ++ if (!opts) ++ error ("%qE needs unknown isa option", fndecl); ++ else ++ { ++ gcc_assert (opts != NULL); ++ error ("%qE needs isa option %s", fndecl, opts); ++ free (opts); ++ } ++ return expand_call (exp, target, ignore); ++ } ++ ++ switch (fcode) ++ { ++ case IX86_BUILTIN_MASKMOVQ: ++ case IX86_BUILTIN_MASKMOVDQU: ++ icode = (fcode == IX86_BUILTIN_MASKMOVQ ++ ? CODE_FOR_mmx_maskmovq ++ : CODE_FOR_sse2_maskmovdqu); ++ /* Note the arg order is different from the operand order. 
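A minimal plain-C sketch (not from the patch itself; the mask values are invented for the example) of the OR-able ISA pair rule in the comment above, where a builtin that requires a pair such as SSE|3DNOW_A is satisfied by either member being enabled:

    #include <stdio.h>

    #define ISA_SSE      0x1ULL
    #define ISA_3DNOW_A  0x2ULL

    static unsigned long long
    relax_pair (unsigned long long isa, unsigned long long bisa,
                unsigned long long pair)
    {
      if ((bisa & pair) == pair && (isa & pair) != 0)
        isa |= pair;
      return isa;
    }

    int
    main (void)
    {
      unsigned long long isa  = ISA_SSE;                  /* target enables SSE only */
      unsigned long long bisa = ISA_SSE | ISA_3DNOW_A;    /* builtin asks for the pair */
      isa = relax_pair (isa, bisa, ISA_SSE | ISA_3DNOW_A);
      puts ((bisa & isa) == bisa ? "builtin allowed" : "needs isa option");
      return 0;
    }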
*/ ++ arg1 = CALL_EXPR_ARG (exp, 0); ++ arg2 = CALL_EXPR_ARG (exp, 1); ++ arg0 = CALL_EXPR_ARG (exp, 2); ++ op0 = expand_normal (arg0); ++ op1 = expand_normal (arg1); ++ op2 = expand_normal (arg2); ++ mode0 = insn_data[icode].operand[0].mode; ++ mode1 = insn_data[icode].operand[1].mode; ++ mode2 = insn_data[icode].operand[2].mode; ++ ++ op0 = ix86_zero_extend_to_Pmode (op0); ++ op0 = gen_rtx_MEM (mode1, op0); ++ ++ if (!insn_data[icode].operand[0].predicate (op0, mode0)) ++ op0 = copy_to_mode_reg (mode0, op0); ++ if (!insn_data[icode].operand[1].predicate (op1, mode1)) ++ op1 = copy_to_mode_reg (mode1, op1); ++ if (!insn_data[icode].operand[2].predicate (op2, mode2)) ++ op2 = copy_to_mode_reg (mode2, op2); ++ pat = GEN_FCN (icode) (op0, op1, op2); ++ if (! pat) ++ return 0; ++ emit_insn (pat); ++ return 0; ++ ++ case IX86_BUILTIN_LDMXCSR: ++ op0 = expand_normal (CALL_EXPR_ARG (exp, 0)); ++ target = assign_386_stack_local (SImode, SLOT_TEMP); ++ emit_move_insn (target, op0); ++ emit_insn (gen_sse_ldmxcsr (target)); ++ return 0; ++ ++ case IX86_BUILTIN_STMXCSR: ++ target = assign_386_stack_local (SImode, SLOT_TEMP); ++ emit_insn (gen_sse_stmxcsr (target)); ++ return copy_to_mode_reg (SImode, target); ++ ++ case IX86_BUILTIN_CLFLUSH: ++ arg0 = CALL_EXPR_ARG (exp, 0); ++ op0 = expand_normal (arg0); ++ icode = CODE_FOR_sse2_clflush; ++ if (!insn_data[icode].operand[0].predicate (op0, Pmode)) ++ op0 = ix86_zero_extend_to_Pmode (op0); ++ ++ emit_insn (gen_sse2_clflush (op0)); ++ return 0; ++ ++ case IX86_BUILTIN_CLWB: ++ arg0 = CALL_EXPR_ARG (exp, 0); ++ op0 = expand_normal (arg0); ++ icode = CODE_FOR_clwb; ++ if (!insn_data[icode].operand[0].predicate (op0, Pmode)) ++ op0 = ix86_zero_extend_to_Pmode (op0); ++ ++ emit_insn (gen_clwb (op0)); ++ return 0; ++ ++ case IX86_BUILTIN_CLFLUSHOPT: ++ arg0 = CALL_EXPR_ARG (exp, 0); ++ op0 = expand_normal (arg0); ++ icode = CODE_FOR_clflushopt; ++ if (!insn_data[icode].operand[0].predicate (op0, Pmode)) ++ op0 = ix86_zero_extend_to_Pmode (op0); ++ ++ emit_insn (gen_clflushopt (op0)); ++ return 0; ++ ++ case IX86_BUILTIN_MONITOR: ++ case IX86_BUILTIN_MONITORX: ++ arg0 = CALL_EXPR_ARG (exp, 0); ++ arg1 = CALL_EXPR_ARG (exp, 1); ++ arg2 = CALL_EXPR_ARG (exp, 2); ++ op0 = expand_normal (arg0); ++ op1 = expand_normal (arg1); ++ op2 = expand_normal (arg2); ++ if (!REG_P (op0)) ++ op0 = ix86_zero_extend_to_Pmode (op0); ++ if (!REG_P (op1)) ++ op1 = copy_to_mode_reg (SImode, op1); ++ if (!REG_P (op2)) ++ op2 = copy_to_mode_reg (SImode, op2); ++ ++ emit_insn (fcode == IX86_BUILTIN_MONITOR ++ ? 
ix86_gen_monitor (op0, op1, op2) ++ : ix86_gen_monitorx (op0, op1, op2)); ++ return 0; ++ ++ case IX86_BUILTIN_MWAIT: ++ arg0 = CALL_EXPR_ARG (exp, 0); ++ arg1 = CALL_EXPR_ARG (exp, 1); ++ op0 = expand_normal (arg0); ++ op1 = expand_normal (arg1); ++ if (!REG_P (op0)) ++ op0 = copy_to_mode_reg (SImode, op0); ++ if (!REG_P (op1)) ++ op1 = copy_to_mode_reg (SImode, op1); ++ emit_insn (gen_sse3_mwait (op0, op1)); ++ return 0; ++ ++ case IX86_BUILTIN_MWAITX: ++ arg0 = CALL_EXPR_ARG (exp, 0); ++ arg1 = CALL_EXPR_ARG (exp, 1); ++ arg2 = CALL_EXPR_ARG (exp, 2); ++ op0 = expand_normal (arg0); ++ op1 = expand_normal (arg1); ++ op2 = expand_normal (arg2); ++ if (!REG_P (op0)) ++ op0 = copy_to_mode_reg (SImode, op0); ++ if (!REG_P (op1)) ++ op1 = copy_to_mode_reg (SImode, op1); ++ if (!REG_P (op2)) ++ op2 = copy_to_mode_reg (SImode, op2); ++ emit_insn (gen_mwaitx (op0, op1, op2)); ++ return 0; ++ ++ case IX86_BUILTIN_UMONITOR: ++ arg0 = CALL_EXPR_ARG (exp, 0); ++ op0 = expand_normal (arg0); ++ ++ op0 = ix86_zero_extend_to_Pmode (op0); ++ ++ insn = (TARGET_64BIT ++ ? gen_umonitor_di (op0) ++ : gen_umonitor_si (op0)); ++ ++ emit_insn (insn); ++ return 0; ++ ++ case IX86_BUILTIN_UMWAIT: ++ case IX86_BUILTIN_TPAUSE: ++ arg0 = CALL_EXPR_ARG (exp, 0); ++ arg1 = CALL_EXPR_ARG (exp, 1); ++ op0 = expand_normal (arg0); ++ op1 = expand_normal (arg1); ++ ++ if (!REG_P (op0)) ++ op0 = copy_to_mode_reg (SImode, op0); ++ ++ op1 = force_reg (DImode, op1); ++ ++ if (TARGET_64BIT) ++ { ++ op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32), ++ NULL, 1, OPTAB_DIRECT); ++ switch (fcode) ++ { ++ case IX86_BUILTIN_UMWAIT: ++ icode = CODE_FOR_umwait_rex64; ++ break; ++ case IX86_BUILTIN_TPAUSE: ++ icode = CODE_FOR_tpause_rex64; ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ ++ op2 = gen_lowpart (SImode, op2); ++ op1 = gen_lowpart (SImode, op1); ++ pat = GEN_FCN (icode) (op0, op1, op2); ++ } ++ else ++ { ++ switch (fcode) ++ { ++ case IX86_BUILTIN_UMWAIT: ++ icode = CODE_FOR_umwait; ++ break; ++ case IX86_BUILTIN_TPAUSE: ++ icode = CODE_FOR_tpause; ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ pat = GEN_FCN (icode) (op0, op1); ++ } ++ ++ if (!pat) ++ return 0; ++ ++ emit_insn (pat); ++ ++ if (target == 0 ++ || !register_operand (target, QImode)) ++ target = gen_reg_rtx (QImode); ++ ++ pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG), ++ const0_rtx); ++ emit_insn (gen_rtx_SET (target, pat)); ++ ++ return target; ++ ++ case IX86_BUILTIN_CLZERO: ++ arg0 = CALL_EXPR_ARG (exp, 0); ++ op0 = expand_normal (arg0); ++ if (!REG_P (op0)) ++ op0 = ix86_zero_extend_to_Pmode (op0); ++ emit_insn (ix86_gen_clzero (op0)); ++ return 0; ++ ++ case IX86_BUILTIN_CLDEMOTE: ++ arg0 = CALL_EXPR_ARG (exp, 0); ++ op0 = expand_normal (arg0); ++ icode = CODE_FOR_cldemote; ++ if (!insn_data[icode].operand[0].predicate (op0, Pmode)) ++ op0 = ix86_zero_extend_to_Pmode (op0); ++ ++ emit_insn (gen_cldemote (op0)); ++ return 0; ++ ++ case IX86_BUILTIN_VEC_INIT_V2SI: ++ case IX86_BUILTIN_VEC_INIT_V4HI: ++ case IX86_BUILTIN_VEC_INIT_V8QI: ++ return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target); ++ ++ case IX86_BUILTIN_VEC_EXT_V2DF: ++ case IX86_BUILTIN_VEC_EXT_V2DI: ++ case IX86_BUILTIN_VEC_EXT_V4SF: ++ case IX86_BUILTIN_VEC_EXT_V4SI: ++ case IX86_BUILTIN_VEC_EXT_V8HI: ++ case IX86_BUILTIN_VEC_EXT_V2SI: ++ case IX86_BUILTIN_VEC_EXT_V4HI: ++ case IX86_BUILTIN_VEC_EXT_V16QI: ++ return ix86_expand_vec_ext_builtin (exp, target); ++ ++ case IX86_BUILTIN_VEC_SET_V2DI: ++ case IX86_BUILTIN_VEC_SET_V4SF: ++ case 
IX86_BUILTIN_VEC_SET_V4SI: ++ case IX86_BUILTIN_VEC_SET_V8HI: ++ case IX86_BUILTIN_VEC_SET_V4HI: ++ case IX86_BUILTIN_VEC_SET_V16QI: ++ return ix86_expand_vec_set_builtin (exp); ++ ++ case IX86_BUILTIN_NANQ: ++ case IX86_BUILTIN_NANSQ: ++ return expand_call (exp, target, ignore); ++ ++ case IX86_BUILTIN_RDPID: ++ ++ op0 = gen_reg_rtx (word_mode); ++ ++ if (TARGET_64BIT) ++ { ++ insn = gen_rdpid_rex64 (op0); ++ op0 = convert_to_mode (SImode, op0, 1); ++ } ++ else ++ insn = gen_rdpid (op0); ++ ++ emit_insn (insn); ++ ++ if (target == 0 ++ || !register_operand (target, SImode)) ++ target = gen_reg_rtx (SImode); ++ ++ emit_move_insn (target, op0); ++ return target; ++ ++ case IX86_BUILTIN_RDPMC: ++ case IX86_BUILTIN_RDTSC: ++ case IX86_BUILTIN_RDTSCP: ++ case IX86_BUILTIN_XGETBV: ++ ++ op0 = gen_reg_rtx (DImode); ++ op1 = gen_reg_rtx (DImode); ++ ++ if (fcode == IX86_BUILTIN_RDPMC) ++ { ++ arg0 = CALL_EXPR_ARG (exp, 0); ++ op2 = expand_normal (arg0); ++ if (!register_operand (op2, SImode)) ++ op2 = copy_to_mode_reg (SImode, op2); ++ ++ insn = (TARGET_64BIT ++ ? gen_rdpmc_rex64 (op0, op1, op2) ++ : gen_rdpmc (op0, op2)); ++ emit_insn (insn); ++ } ++ else if (fcode == IX86_BUILTIN_XGETBV) ++ { ++ arg0 = CALL_EXPR_ARG (exp, 0); ++ op2 = expand_normal (arg0); ++ if (!register_operand (op2, SImode)) ++ op2 = copy_to_mode_reg (SImode, op2); ++ ++ insn = (TARGET_64BIT ++ ? gen_xgetbv_rex64 (op0, op1, op2) ++ : gen_xgetbv (op0, op2)); ++ emit_insn (insn); ++ } ++ else if (fcode == IX86_BUILTIN_RDTSC) ++ { ++ insn = (TARGET_64BIT ++ ? gen_rdtsc_rex64 (op0, op1) ++ : gen_rdtsc (op0)); ++ emit_insn (insn); ++ } ++ else ++ { ++ op2 = gen_reg_rtx (SImode); ++ ++ insn = (TARGET_64BIT ++ ? gen_rdtscp_rex64 (op0, op1, op2) ++ : gen_rdtscp (op0, op2)); ++ emit_insn (insn); ++ ++ arg0 = CALL_EXPR_ARG (exp, 0); ++ op4 = expand_normal (arg0); ++ if (!address_operand (op4, VOIDmode)) ++ { ++ op4 = convert_memory_address (Pmode, op4); ++ op4 = copy_addr_to_reg (op4); ++ } ++ emit_move_insn (gen_rtx_MEM (SImode, op4), op2); ++ } ++ ++ if (target == 0 ++ || !register_operand (target, DImode)) ++ target = gen_reg_rtx (DImode); ++ ++ if (TARGET_64BIT) ++ { ++ op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32), ++ op1, 1, OPTAB_DIRECT); ++ op0 = expand_simple_binop (DImode, IOR, op0, op1, ++ op0, 1, OPTAB_DIRECT); ++ } ++ ++ emit_move_insn (target, op0); ++ return target; ++ ++ case IX86_BUILTIN_MOVDIR64B: ++ ++ arg0 = CALL_EXPR_ARG (exp, 0); ++ arg1 = CALL_EXPR_ARG (exp, 1); ++ op0 = expand_normal (arg0); ++ op1 = expand_normal (arg1); ++ ++ op0 = ix86_zero_extend_to_Pmode (op0); ++ if (!address_operand (op1, VOIDmode)) ++ { ++ op1 = convert_memory_address (Pmode, op1); ++ op1 = copy_addr_to_reg (op1); ++ } ++ op1 = gen_rtx_MEM (XImode, op1); ++ ++ insn = (TARGET_64BIT ++ ? 
gen_movdir64b_di (op0, op1) ++ : gen_movdir64b_si (op0, op1)); ++ emit_insn (insn); ++ return 0; ++ ++ case IX86_BUILTIN_FXSAVE: ++ case IX86_BUILTIN_FXRSTOR: ++ case IX86_BUILTIN_FXSAVE64: ++ case IX86_BUILTIN_FXRSTOR64: ++ case IX86_BUILTIN_FNSTENV: ++ case IX86_BUILTIN_FLDENV: ++ mode0 = BLKmode; ++ switch (fcode) ++ { ++ case IX86_BUILTIN_FXSAVE: ++ icode = CODE_FOR_fxsave; ++ break; ++ case IX86_BUILTIN_FXRSTOR: ++ icode = CODE_FOR_fxrstor; ++ break; ++ case IX86_BUILTIN_FXSAVE64: ++ icode = CODE_FOR_fxsave64; ++ break; ++ case IX86_BUILTIN_FXRSTOR64: ++ icode = CODE_FOR_fxrstor64; ++ break; ++ case IX86_BUILTIN_FNSTENV: ++ icode = CODE_FOR_fnstenv; ++ break; ++ case IX86_BUILTIN_FLDENV: ++ icode = CODE_FOR_fldenv; ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ ++ arg0 = CALL_EXPR_ARG (exp, 0); ++ op0 = expand_normal (arg0); ++ ++ if (!address_operand (op0, VOIDmode)) ++ { ++ op0 = convert_memory_address (Pmode, op0); ++ op0 = copy_addr_to_reg (op0); ++ } ++ op0 = gen_rtx_MEM (mode0, op0); ++ ++ pat = GEN_FCN (icode) (op0); ++ if (pat) ++ emit_insn (pat); ++ return 0; ++ ++ case IX86_BUILTIN_XSETBV: ++ arg0 = CALL_EXPR_ARG (exp, 0); ++ arg1 = CALL_EXPR_ARG (exp, 1); ++ op0 = expand_normal (arg0); ++ op1 = expand_normal (arg1); ++ ++ if (!REG_P (op0)) ++ op0 = copy_to_mode_reg (SImode, op0); ++ ++ op1 = force_reg (DImode, op1); ++ ++ if (TARGET_64BIT) ++ { ++ op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32), ++ NULL, 1, OPTAB_DIRECT); ++ ++ icode = CODE_FOR_xsetbv_rex64; ++ ++ op2 = gen_lowpart (SImode, op2); ++ op1 = gen_lowpart (SImode, op1); ++ pat = GEN_FCN (icode) (op0, op1, op2); ++ } ++ else ++ { ++ icode = CODE_FOR_xsetbv; ++ ++ pat = GEN_FCN (icode) (op0, op1); ++ } ++ if (pat) ++ emit_insn (pat); ++ return 0; ++ ++ case IX86_BUILTIN_XSAVE: ++ case IX86_BUILTIN_XRSTOR: ++ case IX86_BUILTIN_XSAVE64: ++ case IX86_BUILTIN_XRSTOR64: ++ case IX86_BUILTIN_XSAVEOPT: ++ case IX86_BUILTIN_XSAVEOPT64: ++ case IX86_BUILTIN_XSAVES: ++ case IX86_BUILTIN_XRSTORS: ++ case IX86_BUILTIN_XSAVES64: ++ case IX86_BUILTIN_XRSTORS64: ++ case IX86_BUILTIN_XSAVEC: ++ case IX86_BUILTIN_XSAVEC64: ++ arg0 = CALL_EXPR_ARG (exp, 0); ++ arg1 = CALL_EXPR_ARG (exp, 1); ++ op0 = expand_normal (arg0); ++ op1 = expand_normal (arg1); ++ ++ if (!address_operand (op0, VOIDmode)) ++ { ++ op0 = convert_memory_address (Pmode, op0); ++ op0 = copy_addr_to_reg (op0); ++ } ++ op0 = gen_rtx_MEM (BLKmode, op0); ++ ++ op1 = force_reg (DImode, op1); ++ ++ if (TARGET_64BIT) ++ { ++ op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32), ++ NULL, 1, OPTAB_DIRECT); ++ switch (fcode) ++ { ++ case IX86_BUILTIN_XSAVE: ++ icode = CODE_FOR_xsave_rex64; ++ break; ++ case IX86_BUILTIN_XRSTOR: ++ icode = CODE_FOR_xrstor_rex64; ++ break; ++ case IX86_BUILTIN_XSAVE64: ++ icode = CODE_FOR_xsave64; ++ break; ++ case IX86_BUILTIN_XRSTOR64: ++ icode = CODE_FOR_xrstor64; ++ break; ++ case IX86_BUILTIN_XSAVEOPT: ++ icode = CODE_FOR_xsaveopt_rex64; ++ break; ++ case IX86_BUILTIN_XSAVEOPT64: ++ icode = CODE_FOR_xsaveopt64; ++ break; ++ case IX86_BUILTIN_XSAVES: ++ icode = CODE_FOR_xsaves_rex64; ++ break; ++ case IX86_BUILTIN_XRSTORS: ++ icode = CODE_FOR_xrstors_rex64; ++ break; ++ case IX86_BUILTIN_XSAVES64: ++ icode = CODE_FOR_xsaves64; ++ break; ++ case IX86_BUILTIN_XRSTORS64: ++ icode = CODE_FOR_xrstors64; ++ break; ++ case IX86_BUILTIN_XSAVEC: ++ icode = CODE_FOR_xsavec_rex64; ++ break; ++ case IX86_BUILTIN_XSAVEC64: ++ icode = CODE_FOR_xsavec64; ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ ++ op2 = 
gen_lowpart (SImode, op2); ++ op1 = gen_lowpart (SImode, op1); ++ pat = GEN_FCN (icode) (op0, op1, op2); ++ } ++ else ++ { ++ switch (fcode) ++ { ++ case IX86_BUILTIN_XSAVE: ++ icode = CODE_FOR_xsave; ++ break; ++ case IX86_BUILTIN_XRSTOR: ++ icode = CODE_FOR_xrstor; ++ break; ++ case IX86_BUILTIN_XSAVEOPT: ++ icode = CODE_FOR_xsaveopt; ++ break; ++ case IX86_BUILTIN_XSAVES: ++ icode = CODE_FOR_xsaves; ++ break; ++ case IX86_BUILTIN_XRSTORS: ++ icode = CODE_FOR_xrstors; ++ break; ++ case IX86_BUILTIN_XSAVEC: ++ icode = CODE_FOR_xsavec; ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ pat = GEN_FCN (icode) (op0, op1); ++ } ++ ++ if (pat) ++ emit_insn (pat); ++ return 0; ++ ++ case IX86_BUILTIN_LLWPCB: ++ arg0 = CALL_EXPR_ARG (exp, 0); ++ op0 = expand_normal (arg0); ++ icode = CODE_FOR_lwp_llwpcb; ++ if (!insn_data[icode].operand[0].predicate (op0, Pmode)) ++ op0 = ix86_zero_extend_to_Pmode (op0); ++ emit_insn (gen_lwp_llwpcb (op0)); ++ return 0; ++ ++ case IX86_BUILTIN_SLWPCB: ++ icode = CODE_FOR_lwp_slwpcb; ++ if (!target ++ || !insn_data[icode].operand[0].predicate (target, Pmode)) ++ target = gen_reg_rtx (Pmode); ++ emit_insn (gen_lwp_slwpcb (target)); ++ return target; ++ ++ case IX86_BUILTIN_BEXTRI32: ++ case IX86_BUILTIN_BEXTRI64: ++ arg0 = CALL_EXPR_ARG (exp, 0); ++ arg1 = CALL_EXPR_ARG (exp, 1); ++ op0 = expand_normal (arg0); ++ op1 = expand_normal (arg1); ++ icode = (fcode == IX86_BUILTIN_BEXTRI32 ++ ? CODE_FOR_tbm_bextri_si ++ : CODE_FOR_tbm_bextri_di); ++ if (!CONST_INT_P (op1)) ++ { ++ error ("last argument must be an immediate"); ++ return const0_rtx; ++ } ++ else ++ { ++ unsigned char length = (INTVAL (op1) >> 8) & 0xFF; ++ unsigned char lsb_index = INTVAL (op1) & 0xFF; ++ op1 = GEN_INT (length); ++ op2 = GEN_INT (lsb_index); ++ ++ mode1 = insn_data[icode].operand[1].mode; ++ if (!insn_data[icode].operand[1].predicate (op0, mode1)) ++ op0 = copy_to_mode_reg (mode1, op0); ++ ++ mode0 = insn_data[icode].operand[0].mode; ++ if (target == 0 ++ || !register_operand (target, mode0)) ++ target = gen_reg_rtx (mode0); ++ ++ pat = GEN_FCN (icode) (target, op0, op1, op2); ++ if (pat) ++ emit_insn (pat); ++ return target; ++ } ++ ++ case IX86_BUILTIN_RDRAND16_STEP: ++ icode = CODE_FOR_rdrandhi_1; ++ mode0 = HImode; ++ goto rdrand_step; ++ ++ case IX86_BUILTIN_RDRAND32_STEP: ++ icode = CODE_FOR_rdrandsi_1; ++ mode0 = SImode; ++ goto rdrand_step; ++ ++ case IX86_BUILTIN_RDRAND64_STEP: ++ icode = CODE_FOR_rdranddi_1; ++ mode0 = DImode; ++ ++rdrand_step: ++ arg0 = CALL_EXPR_ARG (exp, 0); ++ op1 = expand_normal (arg0); ++ if (!address_operand (op1, VOIDmode)) ++ { ++ op1 = convert_memory_address (Pmode, op1); ++ op1 = copy_addr_to_reg (op1); ++ } ++ ++ op0 = gen_reg_rtx (mode0); ++ emit_insn (GEN_FCN (icode) (op0)); ++ ++ emit_move_insn (gen_rtx_MEM (mode0, op1), op0); ++ ++ op1 = gen_reg_rtx (SImode); ++ emit_move_insn (op1, CONST1_RTX (SImode)); ++ ++ /* Emit SImode conditional move. 
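As a user-level counterpart (not from the patch itself; assumes <immintrin.h>, -mrdrnd at compile time and RDRAND-capable hardware), the rdrand_step expansion here is what backs _rdrand32_step, whose return value reflects the carry-flag success protocol modelled by the conditional move:

    #include <immintrin.h>
    #include <stdio.h>

    int
    main (void)
    {
      unsigned int value;
      int ok = 0;
      /* The hardware may transiently fail, so retry a few times.  */
      for (int tries = 0; tries < 10 && !ok; tries++)
        ok = _rdrand32_step (&value);        /* returns 1 when CF was set */
      if (ok)
        printf ("random: %u\n", value);
      return ok ? 0 : 1;
    }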
*/ ++ if (mode0 == HImode) ++ { ++ if (TARGET_ZERO_EXTEND_WITH_AND ++ && optimize_function_for_speed_p (cfun)) ++ { ++ op2 = force_reg (SImode, const0_rtx); ++ ++ emit_insn (gen_movstricthi ++ (gen_lowpart (HImode, op2), op0)); ++ } ++ else ++ { ++ op2 = gen_reg_rtx (SImode); ++ ++ emit_insn (gen_zero_extendhisi2 (op2, op0)); ++ } ++ } ++ else if (mode0 == SImode) ++ op2 = op0; ++ else ++ op2 = gen_rtx_SUBREG (SImode, op0, 0); ++ ++ if (target == 0 ++ || !register_operand (target, SImode)) ++ target = gen_reg_rtx (SImode); ++ ++ pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG), ++ const0_rtx); ++ emit_insn (gen_rtx_SET (target, ++ gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1))); ++ return target; ++ ++ case IX86_BUILTIN_RDSEED16_STEP: ++ icode = CODE_FOR_rdseedhi_1; ++ mode0 = HImode; ++ goto rdseed_step; ++ ++ case IX86_BUILTIN_RDSEED32_STEP: ++ icode = CODE_FOR_rdseedsi_1; ++ mode0 = SImode; ++ goto rdseed_step; ++ ++ case IX86_BUILTIN_RDSEED64_STEP: ++ icode = CODE_FOR_rdseeddi_1; ++ mode0 = DImode; ++ ++rdseed_step: ++ arg0 = CALL_EXPR_ARG (exp, 0); ++ op1 = expand_normal (arg0); ++ if (!address_operand (op1, VOIDmode)) ++ { ++ op1 = convert_memory_address (Pmode, op1); ++ op1 = copy_addr_to_reg (op1); ++ } ++ ++ op0 = gen_reg_rtx (mode0); ++ emit_insn (GEN_FCN (icode) (op0)); ++ ++ emit_move_insn (gen_rtx_MEM (mode0, op1), op0); ++ ++ op2 = gen_reg_rtx (QImode); ++ ++ pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG), ++ const0_rtx); ++ emit_insn (gen_rtx_SET (op2, pat)); ++ ++ if (target == 0 ++ || !register_operand (target, SImode)) ++ target = gen_reg_rtx (SImode); ++ ++ emit_insn (gen_zero_extendqisi2 (target, op2)); ++ return target; ++ ++ case IX86_BUILTIN_SBB32: ++ icode = CODE_FOR_subborrowsi; ++ icode2 = CODE_FOR_subborrowsi_0; ++ mode0 = SImode; ++ mode1 = DImode; ++ mode2 = CCmode; ++ goto handlecarry; ++ ++ case IX86_BUILTIN_SBB64: ++ icode = CODE_FOR_subborrowdi; ++ icode2 = CODE_FOR_subborrowdi_0; ++ mode0 = DImode; ++ mode1 = TImode; ++ mode2 = CCmode; ++ goto handlecarry; ++ ++ case IX86_BUILTIN_ADDCARRYX32: ++ icode = CODE_FOR_addcarrysi; ++ icode2 = CODE_FOR_addcarrysi_0; ++ mode0 = SImode; ++ mode1 = DImode; ++ mode2 = CCCmode; ++ goto handlecarry; ++ ++ case IX86_BUILTIN_ADDCARRYX64: ++ icode = CODE_FOR_addcarrydi; ++ icode2 = CODE_FOR_addcarrydi_0; ++ mode0 = DImode; ++ mode1 = TImode; ++ mode2 = CCCmode; ++ ++ handlecarry: ++ arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */ ++ arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */ ++ arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */ ++ arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */ ++ ++ op1 = expand_normal (arg0); ++ if (!integer_zerop (arg0)) ++ op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1)); ++ ++ op2 = expand_normal (arg1); ++ if (!register_operand (op2, mode0)) ++ op2 = copy_to_mode_reg (mode0, op2); ++ ++ op3 = expand_normal (arg2); ++ if (!register_operand (op3, mode0)) ++ op3 = copy_to_mode_reg (mode0, op3); ++ ++ op4 = expand_normal (arg3); ++ if (!address_operand (op4, VOIDmode)) ++ { ++ op4 = convert_memory_address (Pmode, op4); ++ op4 = copy_addr_to_reg (op4); ++ } ++ ++ op0 = gen_reg_rtx (mode0); ++ if (integer_zerop (arg0)) ++ { ++ /* If arg0 is 0, optimize right away into add or sub ++ instruction that sets CCCmode flags. */ ++ op1 = gen_rtx_REG (mode2, FLAGS_REG); ++ emit_insn (GEN_FCN (icode2) (op0, op2, op3)); ++ } ++ else ++ { ++ /* Generate CF from input operand. 
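As a user-level counterpart (not from the patch itself; assumes the usual <immintrin.h>/adxintrin.h declarations), the handlecarry expansion backs _addcarry_u32 and friends; a two-limb addition shows the carry-in/carry-out chaining being set up:

    #include <immintrin.h>
    #include <stdio.h>

    int
    main (void)
    {
      unsigned int lo, hi;
      unsigned char c = _addcarry_u32 (0, 0xffffffffu, 1u, &lo);  /* overflows */
      c = _addcarry_u32 (c, 0u, 0u, &hi);                         /* consumes CF */
      printf ("hi=%u lo=%u carry=%u\n", hi, lo, (unsigned int) c);
      /* prints: hi=1 lo=0 carry=0 */
      return 0;
    }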
*/ ++ emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx)); ++ ++ /* Generate instruction that consumes CF. */ ++ op1 = gen_rtx_REG (CCCmode, FLAGS_REG); ++ pat = gen_rtx_LTU (mode1, op1, const0_rtx); ++ pat2 = gen_rtx_LTU (mode0, op1, const0_rtx); ++ emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2)); ++ } ++ ++ /* Return current CF value. */ ++ if (target == 0) ++ target = gen_reg_rtx (QImode); ++ ++ pat = gen_rtx_LTU (QImode, op1, const0_rtx); ++ emit_insn (gen_rtx_SET (target, pat)); ++ ++ /* Store the result. */ ++ emit_move_insn (gen_rtx_MEM (mode0, op4), op0); ++ ++ return target; ++ ++ case IX86_BUILTIN_READ_FLAGS: ++ emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG))); ++ ++ if (optimize ++ || target == NULL_RTX ++ || !nonimmediate_operand (target, word_mode) ++ || GET_MODE (target) != word_mode) ++ target = gen_reg_rtx (word_mode); ++ ++ emit_insn (gen_pop (target)); ++ return target; ++ ++ case IX86_BUILTIN_WRITE_FLAGS: ++ ++ arg0 = CALL_EXPR_ARG (exp, 0); ++ op0 = expand_normal (arg0); ++ if (!general_no_elim_operand (op0, word_mode)) ++ op0 = copy_to_mode_reg (word_mode, op0); ++ ++ emit_insn (gen_push (op0)); ++ emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG))); ++ return 0; ++ ++ case IX86_BUILTIN_KTESTC8: ++ icode = CODE_FOR_ktestqi; ++ mode3 = CCCmode; ++ goto kortest; ++ ++ case IX86_BUILTIN_KTESTZ8: ++ icode = CODE_FOR_ktestqi; ++ mode3 = CCZmode; ++ goto kortest; ++ ++ case IX86_BUILTIN_KTESTC16: ++ icode = CODE_FOR_ktesthi; ++ mode3 = CCCmode; ++ goto kortest; ++ ++ case IX86_BUILTIN_KTESTZ16: ++ icode = CODE_FOR_ktesthi; ++ mode3 = CCZmode; ++ goto kortest; ++ ++ case IX86_BUILTIN_KTESTC32: ++ icode = CODE_FOR_ktestsi; ++ mode3 = CCCmode; ++ goto kortest; ++ ++ case IX86_BUILTIN_KTESTZ32: ++ icode = CODE_FOR_ktestsi; ++ mode3 = CCZmode; ++ goto kortest; ++ ++ case IX86_BUILTIN_KTESTC64: ++ icode = CODE_FOR_ktestdi; ++ mode3 = CCCmode; ++ goto kortest; ++ ++ case IX86_BUILTIN_KTESTZ64: ++ icode = CODE_FOR_ktestdi; ++ mode3 = CCZmode; ++ goto kortest; ++ ++ case IX86_BUILTIN_KORTESTC8: ++ icode = CODE_FOR_kortestqi; ++ mode3 = CCCmode; ++ goto kortest; ++ ++ case IX86_BUILTIN_KORTESTZ8: ++ icode = CODE_FOR_kortestqi; ++ mode3 = CCZmode; ++ goto kortest; ++ ++ case IX86_BUILTIN_KORTESTC16: ++ icode = CODE_FOR_kortesthi; ++ mode3 = CCCmode; ++ goto kortest; ++ ++ case IX86_BUILTIN_KORTESTZ16: ++ icode = CODE_FOR_kortesthi; ++ mode3 = CCZmode; ++ goto kortest; ++ ++ case IX86_BUILTIN_KORTESTC32: ++ icode = CODE_FOR_kortestsi; ++ mode3 = CCCmode; ++ goto kortest; ++ ++ case IX86_BUILTIN_KORTESTZ32: ++ icode = CODE_FOR_kortestsi; ++ mode3 = CCZmode; ++ goto kortest; ++ ++ case IX86_BUILTIN_KORTESTC64: ++ icode = CODE_FOR_kortestdi; ++ mode3 = CCCmode; ++ goto kortest; ++ ++ case IX86_BUILTIN_KORTESTZ64: ++ icode = CODE_FOR_kortestdi; ++ mode3 = CCZmode; ++ ++ kortest: ++ arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */ ++ arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. 
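A small plain-C sketch (not from the patch itself) of what the CCZmode/CCCmode selection above models: KORTEST raises ZF when the OR of the two masks is all zeros and CF when it is all ones, and each builtin then reads exactly one of those flags via setcc:

    #include <stdio.h>

    static int
    kortest_z (unsigned short a, unsigned short b)   /* kortestz-style result */
    {
      return (unsigned short) (a | b) == 0x0000;
    }

    static int
    kortest_c (unsigned short a, unsigned short b)   /* kortestc-style result */
    {
      return (unsigned short) (a | b) == 0xffff;
    }

    int
    main (void)
    {
      printf ("%d %d\n", kortest_z (0, 0), kortest_c (0xff00, 0x00ff));  /* 1 1 */
      return 0;
    }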
*/ ++ op0 = expand_normal (arg0); ++ op1 = expand_normal (arg1); ++ ++ mode0 = insn_data[icode].operand[0].mode; ++ mode1 = insn_data[icode].operand[1].mode; ++ ++ if (GET_MODE (op0) != VOIDmode) ++ op0 = force_reg (GET_MODE (op0), op0); ++ ++ op0 = gen_lowpart (mode0, op0); ++ ++ if (!insn_data[icode].operand[0].predicate (op0, mode0)) ++ op0 = copy_to_mode_reg (mode0, op0); ++ ++ if (GET_MODE (op1) != VOIDmode) ++ op1 = force_reg (GET_MODE (op1), op1); ++ ++ op1 = gen_lowpart (mode1, op1); ++ ++ if (!insn_data[icode].operand[1].predicate (op1, mode1)) ++ op1 = copy_to_mode_reg (mode1, op1); ++ ++ target = gen_reg_rtx (QImode); ++ ++ /* Emit kortest. */ ++ emit_insn (GEN_FCN (icode) (op0, op1)); ++ /* And use setcc to return result from flags. */ ++ ix86_expand_setcc (target, EQ, ++ gen_rtx_REG (mode3, FLAGS_REG), const0_rtx); ++ return target; ++ ++ case IX86_BUILTIN_GATHERSIV2DF: ++ icode = CODE_FOR_avx2_gathersiv2df; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHERSIV4DF: ++ icode = CODE_FOR_avx2_gathersiv4df; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHERDIV2DF: ++ icode = CODE_FOR_avx2_gatherdiv2df; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHERDIV4DF: ++ icode = CODE_FOR_avx2_gatherdiv4df; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHERSIV4SF: ++ icode = CODE_FOR_avx2_gathersiv4sf; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHERSIV8SF: ++ icode = CODE_FOR_avx2_gathersiv8sf; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHERDIV4SF: ++ icode = CODE_FOR_avx2_gatherdiv4sf; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHERDIV8SF: ++ icode = CODE_FOR_avx2_gatherdiv8sf; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHERSIV2DI: ++ icode = CODE_FOR_avx2_gathersiv2di; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHERSIV4DI: ++ icode = CODE_FOR_avx2_gathersiv4di; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHERDIV2DI: ++ icode = CODE_FOR_avx2_gatherdiv2di; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHERDIV4DI: ++ icode = CODE_FOR_avx2_gatherdiv4di; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHERSIV4SI: ++ icode = CODE_FOR_avx2_gathersiv4si; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHERSIV8SI: ++ icode = CODE_FOR_avx2_gathersiv8si; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHERDIV4SI: ++ icode = CODE_FOR_avx2_gatherdiv4si; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHERDIV8SI: ++ icode = CODE_FOR_avx2_gatherdiv8si; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHERALTSIV4DF: ++ icode = CODE_FOR_avx2_gathersiv4df; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHERALTDIV8SF: ++ icode = CODE_FOR_avx2_gatherdiv8sf; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHERALTSIV4DI: ++ icode = CODE_FOR_avx2_gathersiv4di; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHERALTDIV8SI: ++ icode = CODE_FOR_avx2_gatherdiv8si; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHER3SIV16SF: ++ icode = CODE_FOR_avx512f_gathersiv16sf; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHER3SIV8DF: ++ icode = CODE_FOR_avx512f_gathersiv8df; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHER3DIV16SF: ++ icode = CODE_FOR_avx512f_gatherdiv16sf; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHER3DIV8DF: ++ icode = CODE_FOR_avx512f_gatherdiv8df; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHER3SIV16SI: ++ icode = CODE_FOR_avx512f_gathersiv16si; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHER3SIV8DI: ++ icode = CODE_FOR_avx512f_gathersiv8di; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHER3DIV16SI: ++ icode = CODE_FOR_avx512f_gatherdiv16si; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHER3DIV8DI: ++ icode = CODE_FOR_avx512f_gatherdiv8di; ++ goto gather_gen; ++ case 
IX86_BUILTIN_GATHER3ALTSIV8DF: ++ icode = CODE_FOR_avx512f_gathersiv8df; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHER3ALTDIV16SF: ++ icode = CODE_FOR_avx512f_gatherdiv16sf; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHER3ALTSIV8DI: ++ icode = CODE_FOR_avx512f_gathersiv8di; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHER3ALTDIV16SI: ++ icode = CODE_FOR_avx512f_gatherdiv16si; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHER3SIV2DF: ++ icode = CODE_FOR_avx512vl_gathersiv2df; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHER3SIV4DF: ++ icode = CODE_FOR_avx512vl_gathersiv4df; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHER3DIV2DF: ++ icode = CODE_FOR_avx512vl_gatherdiv2df; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHER3DIV4DF: ++ icode = CODE_FOR_avx512vl_gatherdiv4df; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHER3SIV4SF: ++ icode = CODE_FOR_avx512vl_gathersiv4sf; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHER3SIV8SF: ++ icode = CODE_FOR_avx512vl_gathersiv8sf; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHER3DIV4SF: ++ icode = CODE_FOR_avx512vl_gatherdiv4sf; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHER3DIV8SF: ++ icode = CODE_FOR_avx512vl_gatherdiv8sf; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHER3SIV2DI: ++ icode = CODE_FOR_avx512vl_gathersiv2di; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHER3SIV4DI: ++ icode = CODE_FOR_avx512vl_gathersiv4di; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHER3DIV2DI: ++ icode = CODE_FOR_avx512vl_gatherdiv2di; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHER3DIV4DI: ++ icode = CODE_FOR_avx512vl_gatherdiv4di; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHER3SIV4SI: ++ icode = CODE_FOR_avx512vl_gathersiv4si; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHER3SIV8SI: ++ icode = CODE_FOR_avx512vl_gathersiv8si; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHER3DIV4SI: ++ icode = CODE_FOR_avx512vl_gatherdiv4si; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHER3DIV8SI: ++ icode = CODE_FOR_avx512vl_gatherdiv8si; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHER3ALTSIV4DF: ++ icode = CODE_FOR_avx512vl_gathersiv4df; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHER3ALTDIV8SF: ++ icode = CODE_FOR_avx512vl_gatherdiv8sf; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHER3ALTSIV4DI: ++ icode = CODE_FOR_avx512vl_gathersiv4di; ++ goto gather_gen; ++ case IX86_BUILTIN_GATHER3ALTDIV8SI: ++ icode = CODE_FOR_avx512vl_gatherdiv8si; ++ goto gather_gen; ++ case IX86_BUILTIN_SCATTERSIV16SF: ++ icode = CODE_FOR_avx512f_scattersiv16sf; ++ goto scatter_gen; ++ case IX86_BUILTIN_SCATTERSIV8DF: ++ icode = CODE_FOR_avx512f_scattersiv8df; ++ goto scatter_gen; ++ case IX86_BUILTIN_SCATTERDIV16SF: ++ icode = CODE_FOR_avx512f_scatterdiv16sf; ++ goto scatter_gen; ++ case IX86_BUILTIN_SCATTERDIV8DF: ++ icode = CODE_FOR_avx512f_scatterdiv8df; ++ goto scatter_gen; ++ case IX86_BUILTIN_SCATTERSIV16SI: ++ icode = CODE_FOR_avx512f_scattersiv16si; ++ goto scatter_gen; ++ case IX86_BUILTIN_SCATTERSIV8DI: ++ icode = CODE_FOR_avx512f_scattersiv8di; ++ goto scatter_gen; ++ case IX86_BUILTIN_SCATTERDIV16SI: ++ icode = CODE_FOR_avx512f_scatterdiv16si; ++ goto scatter_gen; ++ case IX86_BUILTIN_SCATTERDIV8DI: ++ icode = CODE_FOR_avx512f_scatterdiv8di; ++ goto scatter_gen; ++ case IX86_BUILTIN_SCATTERSIV8SF: ++ icode = CODE_FOR_avx512vl_scattersiv8sf; ++ goto scatter_gen; ++ case IX86_BUILTIN_SCATTERSIV4SF: ++ icode = CODE_FOR_avx512vl_scattersiv4sf; ++ goto scatter_gen; ++ case IX86_BUILTIN_SCATTERSIV4DF: ++ icode = CODE_FOR_avx512vl_scattersiv4df; ++ goto scatter_gen; ++ case IX86_BUILTIN_SCATTERSIV2DF: ++ icode = 
CODE_FOR_avx512vl_scattersiv2df; ++ goto scatter_gen; ++ case IX86_BUILTIN_SCATTERDIV8SF: ++ icode = CODE_FOR_avx512vl_scatterdiv8sf; ++ goto scatter_gen; ++ case IX86_BUILTIN_SCATTERDIV4SF: ++ icode = CODE_FOR_avx512vl_scatterdiv4sf; ++ goto scatter_gen; ++ case IX86_BUILTIN_SCATTERDIV4DF: ++ icode = CODE_FOR_avx512vl_scatterdiv4df; ++ goto scatter_gen; ++ case IX86_BUILTIN_SCATTERDIV2DF: ++ icode = CODE_FOR_avx512vl_scatterdiv2df; ++ goto scatter_gen; ++ case IX86_BUILTIN_SCATTERSIV8SI: ++ icode = CODE_FOR_avx512vl_scattersiv8si; ++ goto scatter_gen; ++ case IX86_BUILTIN_SCATTERSIV4SI: ++ icode = CODE_FOR_avx512vl_scattersiv4si; ++ goto scatter_gen; ++ case IX86_BUILTIN_SCATTERSIV4DI: ++ icode = CODE_FOR_avx512vl_scattersiv4di; ++ goto scatter_gen; ++ case IX86_BUILTIN_SCATTERSIV2DI: ++ icode = CODE_FOR_avx512vl_scattersiv2di; ++ goto scatter_gen; ++ case IX86_BUILTIN_SCATTERDIV8SI: ++ icode = CODE_FOR_avx512vl_scatterdiv8si; ++ goto scatter_gen; ++ case IX86_BUILTIN_SCATTERDIV4SI: ++ icode = CODE_FOR_avx512vl_scatterdiv4si; ++ goto scatter_gen; ++ case IX86_BUILTIN_SCATTERDIV4DI: ++ icode = CODE_FOR_avx512vl_scatterdiv4di; ++ goto scatter_gen; ++ case IX86_BUILTIN_SCATTERDIV2DI: ++ icode = CODE_FOR_avx512vl_scatterdiv2di; ++ goto scatter_gen; ++ case IX86_BUILTIN_GATHERPFDPD: ++ icode = CODE_FOR_avx512pf_gatherpfv8sidf; ++ goto vec_prefetch_gen; ++ case IX86_BUILTIN_SCATTERALTSIV8DF: ++ icode = CODE_FOR_avx512f_scattersiv8df; ++ goto scatter_gen; ++ case IX86_BUILTIN_SCATTERALTDIV16SF: ++ icode = CODE_FOR_avx512f_scatterdiv16sf; ++ goto scatter_gen; ++ case IX86_BUILTIN_SCATTERALTSIV8DI: ++ icode = CODE_FOR_avx512f_scattersiv8di; ++ goto scatter_gen; ++ case IX86_BUILTIN_SCATTERALTDIV16SI: ++ icode = CODE_FOR_avx512f_scatterdiv16si; ++ goto scatter_gen; ++ case IX86_BUILTIN_SCATTERALTSIV4DF: ++ icode = CODE_FOR_avx512vl_scattersiv4df; ++ goto scatter_gen; ++ case IX86_BUILTIN_SCATTERALTDIV8SF: ++ icode = CODE_FOR_avx512vl_scatterdiv8sf; ++ goto scatter_gen; ++ case IX86_BUILTIN_SCATTERALTSIV4DI: ++ icode = CODE_FOR_avx512vl_scattersiv4di; ++ goto scatter_gen; ++ case IX86_BUILTIN_SCATTERALTDIV8SI: ++ icode = CODE_FOR_avx512vl_scatterdiv8si; ++ goto scatter_gen; ++ case IX86_BUILTIN_SCATTERALTSIV2DF: ++ icode = CODE_FOR_avx512vl_scattersiv2df; ++ goto scatter_gen; ++ case IX86_BUILTIN_SCATTERALTDIV4SF: ++ icode = CODE_FOR_avx512vl_scatterdiv4sf; ++ goto scatter_gen; ++ case IX86_BUILTIN_SCATTERALTSIV2DI: ++ icode = CODE_FOR_avx512vl_scattersiv2di; ++ goto scatter_gen; ++ case IX86_BUILTIN_SCATTERALTDIV4SI: ++ icode = CODE_FOR_avx512vl_scatterdiv4si; ++ goto scatter_gen; ++ case IX86_BUILTIN_GATHERPFDPS: ++ icode = CODE_FOR_avx512pf_gatherpfv16sisf; ++ goto vec_prefetch_gen; ++ case IX86_BUILTIN_GATHERPFQPD: ++ icode = CODE_FOR_avx512pf_gatherpfv8didf; ++ goto vec_prefetch_gen; ++ case IX86_BUILTIN_GATHERPFQPS: ++ icode = CODE_FOR_avx512pf_gatherpfv8disf; ++ goto vec_prefetch_gen; ++ case IX86_BUILTIN_SCATTERPFDPD: ++ icode = CODE_FOR_avx512pf_scatterpfv8sidf; ++ goto vec_prefetch_gen; ++ case IX86_BUILTIN_SCATTERPFDPS: ++ icode = CODE_FOR_avx512pf_scatterpfv16sisf; ++ goto vec_prefetch_gen; ++ case IX86_BUILTIN_SCATTERPFQPD: ++ icode = CODE_FOR_avx512pf_scatterpfv8didf; ++ goto vec_prefetch_gen; ++ case IX86_BUILTIN_SCATTERPFQPS: ++ icode = CODE_FOR_avx512pf_scatterpfv8disf; ++ goto vec_prefetch_gen; ++ ++ gather_gen: ++ rtx half; ++ rtx (*gen) (rtx, rtx); ++ ++ arg0 = CALL_EXPR_ARG (exp, 0); ++ arg1 = CALL_EXPR_ARG (exp, 1); ++ arg2 = CALL_EXPR_ARG (exp, 2); ++ arg3 = CALL_EXPR_ARG 
(exp, 3); ++ arg4 = CALL_EXPR_ARG (exp, 4); ++ op0 = expand_normal (arg0); ++ op1 = expand_normal (arg1); ++ op2 = expand_normal (arg2); ++ op3 = expand_normal (arg3); ++ op4 = expand_normal (arg4); ++ /* Note the arg order is different from the operand order. */ ++ mode0 = insn_data[icode].operand[1].mode; ++ mode2 = insn_data[icode].operand[3].mode; ++ mode3 = insn_data[icode].operand[4].mode; ++ mode4 = insn_data[icode].operand[5].mode; ++ ++ if (target == NULL_RTX ++ || GET_MODE (target) != insn_data[icode].operand[0].mode ++ || !insn_data[icode].operand[0].predicate (target, ++ GET_MODE (target))) ++ subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode); ++ else ++ subtarget = target; ++ ++ switch (fcode) ++ { ++ case IX86_BUILTIN_GATHER3ALTSIV8DF: ++ case IX86_BUILTIN_GATHER3ALTSIV8DI: ++ half = gen_reg_rtx (V8SImode); ++ if (!nonimmediate_operand (op2, V16SImode)) ++ op2 = copy_to_mode_reg (V16SImode, op2); ++ emit_insn (gen_vec_extract_lo_v16si (half, op2)); ++ op2 = half; ++ break; ++ case IX86_BUILTIN_GATHER3ALTSIV4DF: ++ case IX86_BUILTIN_GATHER3ALTSIV4DI: ++ case IX86_BUILTIN_GATHERALTSIV4DF: ++ case IX86_BUILTIN_GATHERALTSIV4DI: ++ half = gen_reg_rtx (V4SImode); ++ if (!nonimmediate_operand (op2, V8SImode)) ++ op2 = copy_to_mode_reg (V8SImode, op2); ++ emit_insn (gen_vec_extract_lo_v8si (half, op2)); ++ op2 = half; ++ break; ++ case IX86_BUILTIN_GATHER3ALTDIV16SF: ++ case IX86_BUILTIN_GATHER3ALTDIV16SI: ++ half = gen_reg_rtx (mode0); ++ if (mode0 == V8SFmode) ++ gen = gen_vec_extract_lo_v16sf; ++ else ++ gen = gen_vec_extract_lo_v16si; ++ if (!nonimmediate_operand (op0, GET_MODE (op0))) ++ op0 = copy_to_mode_reg (GET_MODE (op0), op0); ++ emit_insn (gen (half, op0)); ++ op0 = half; ++ op3 = lowpart_subreg (QImode, op3, HImode); ++ break; ++ case IX86_BUILTIN_GATHER3ALTDIV8SF: ++ case IX86_BUILTIN_GATHER3ALTDIV8SI: ++ case IX86_BUILTIN_GATHERALTDIV8SF: ++ case IX86_BUILTIN_GATHERALTDIV8SI: ++ half = gen_reg_rtx (mode0); ++ if (mode0 == V4SFmode) ++ gen = gen_vec_extract_lo_v8sf; ++ else ++ gen = gen_vec_extract_lo_v8si; ++ if (!nonimmediate_operand (op0, GET_MODE (op0))) ++ op0 = copy_to_mode_reg (GET_MODE (op0), op0); ++ emit_insn (gen (half, op0)); ++ op0 = half; ++ if (VECTOR_MODE_P (GET_MODE (op3))) ++ { ++ half = gen_reg_rtx (mode0); ++ if (!nonimmediate_operand (op3, GET_MODE (op3))) ++ op3 = copy_to_mode_reg (GET_MODE (op3), op3); ++ emit_insn (gen (half, op3)); ++ op3 = half; ++ } ++ break; ++ default: ++ break; ++ } ++ ++ /* Force memory operand only with base register here. But we ++ don't want to do it on memory operand for other builtin ++ functions. */ ++ op1 = ix86_zero_extend_to_Pmode (op1); ++ ++ if (!insn_data[icode].operand[1].predicate (op0, mode0)) ++ op0 = copy_to_mode_reg (mode0, op0); ++ if (!insn_data[icode].operand[2].predicate (op1, Pmode)) ++ op1 = copy_to_mode_reg (Pmode, op1); ++ if (!insn_data[icode].operand[3].predicate (op2, mode2)) ++ op2 = copy_to_mode_reg (mode2, op2); ++ ++ op3 = fixup_modeless_constant (op3, mode3); ++ ++ if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode) ++ { ++ if (!insn_data[icode].operand[4].predicate (op3, mode3)) ++ op3 = copy_to_mode_reg (mode3, op3); ++ } ++ else ++ { ++ op3 = copy_to_reg (op3); ++ op3 = lowpart_subreg (mode3, op3, GET_MODE (op3)); ++ } ++ if (!insn_data[icode].operand[5].predicate (op4, mode4)) ++ { ++ error ("the last argument must be scale 1, 2, 4, 8"); ++ return const0_rtx; ++ } ++ ++ /* Optimize. 
If mask is known to have all high bits set, ++ replace op0 with pc_rtx to signal that the instruction ++ overwrites the whole destination and doesn't use its ++ previous contents. */ ++ if (optimize) ++ { ++ if (TREE_CODE (arg3) == INTEGER_CST) ++ { ++ if (integer_all_onesp (arg3)) ++ op0 = pc_rtx; ++ } ++ else if (TREE_CODE (arg3) == VECTOR_CST) ++ { ++ unsigned int negative = 0; ++ for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i) ++ { ++ tree cst = VECTOR_CST_ELT (arg3, i); ++ if (TREE_CODE (cst) == INTEGER_CST ++ && tree_int_cst_sign_bit (cst)) ++ negative++; ++ else if (TREE_CODE (cst) == REAL_CST ++ && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst))) ++ negative++; ++ } ++ if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3))) ++ op0 = pc_rtx; ++ } ++ else if (TREE_CODE (arg3) == SSA_NAME ++ && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE) ++ { ++ /* Recognize also when mask is like: ++ __v2df src = _mm_setzero_pd (); ++ __v2df mask = _mm_cmpeq_pd (src, src); ++ or ++ __v8sf src = _mm256_setzero_ps (); ++ __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ); ++ as that is a cheaper way to load all ones into ++ a register than having to load a constant from ++ memory. */ ++ gimple *def_stmt = SSA_NAME_DEF_STMT (arg3); ++ if (is_gimple_call (def_stmt)) ++ { ++ tree fndecl = gimple_call_fndecl (def_stmt); ++ if (fndecl ++ && fndecl_built_in_p (fndecl, BUILT_IN_MD)) ++ switch (DECL_MD_FUNCTION_CODE (fndecl)) ++ { ++ case IX86_BUILTIN_CMPPD: ++ case IX86_BUILTIN_CMPPS: ++ case IX86_BUILTIN_CMPPD256: ++ case IX86_BUILTIN_CMPPS256: ++ if (!integer_zerop (gimple_call_arg (def_stmt, 2))) ++ break; ++ /* FALLTHRU */ ++ case IX86_BUILTIN_CMPEQPD: ++ case IX86_BUILTIN_CMPEQPS: ++ if (initializer_zerop (gimple_call_arg (def_stmt, 0)) ++ && initializer_zerop (gimple_call_arg (def_stmt, ++ 1))) ++ op0 = pc_rtx; ++ break; ++ default: ++ break; ++ } ++ } ++ } ++ } ++ ++ pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4); ++ if (! pat) ++ return const0_rtx; ++ emit_insn (pat); ++ ++ switch (fcode) ++ { ++ case IX86_BUILTIN_GATHER3DIV16SF: ++ if (target == NULL_RTX) ++ target = gen_reg_rtx (V8SFmode); ++ emit_insn (gen_vec_extract_lo_v16sf (target, subtarget)); ++ break; ++ case IX86_BUILTIN_GATHER3DIV16SI: ++ if (target == NULL_RTX) ++ target = gen_reg_rtx (V8SImode); ++ emit_insn (gen_vec_extract_lo_v16si (target, subtarget)); ++ break; ++ case IX86_BUILTIN_GATHER3DIV8SF: ++ case IX86_BUILTIN_GATHERDIV8SF: ++ if (target == NULL_RTX) ++ target = gen_reg_rtx (V4SFmode); ++ emit_insn (gen_vec_extract_lo_v8sf (target, subtarget)); ++ break; ++ case IX86_BUILTIN_GATHER3DIV8SI: ++ case IX86_BUILTIN_GATHERDIV8SI: ++ if (target == NULL_RTX) ++ target = gen_reg_rtx (V4SImode); ++ emit_insn (gen_vec_extract_lo_v8si (target, subtarget)); ++ break; ++ default: ++ target = subtarget; ++ break; ++ } ++ return target; ++ ++ scatter_gen: ++ arg0 = CALL_EXPR_ARG (exp, 0); ++ arg1 = CALL_EXPR_ARG (exp, 1); ++ arg2 = CALL_EXPR_ARG (exp, 2); ++ arg3 = CALL_EXPR_ARG (exp, 3); ++ arg4 = CALL_EXPR_ARG (exp, 4); ++ op0 = expand_normal (arg0); ++ op1 = expand_normal (arg1); ++ op2 = expand_normal (arg2); ++ op3 = expand_normal (arg3); ++ op4 = expand_normal (arg4); ++ mode1 = insn_data[icode].operand[1].mode; ++ mode2 = insn_data[icode].operand[2].mode; ++ mode3 = insn_data[icode].operand[3].mode; ++ mode4 = insn_data[icode].operand[4].mode; ++ ++ /* Scatter instruction stores operand op3 to memory with ++ indices from op2 and scale from op4 under writemask op1. 
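As a user-level illustration (not from the patch itself; assumes <immintrin.h>, -mavx2 at compile time and AVX2 hardware), the mask recognition above is aimed at exactly this idiom: comparing a zeroed vector with itself is a cheap way to obtain the all-ones gather mask, and the trailing scale argument must be a literal 1, 2, 4 or 8:

    #include <immintrin.h>
    #include <stdio.h>

    int
    main (void)
    {
      double table[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
      __m128i idx  = _mm_setr_epi32 (7, 0, 3, 1);
      __m256d src  = _mm256_setzero_pd ();
      __m256d mask = _mm256_cmp_pd (src, src, _CMP_EQ_OQ);   /* all ones */
      __m256d g = _mm256_mask_i32gather_pd (src, table, idx, mask, 8);
      double out[4];
      _mm256_storeu_pd (out, g);
      printf ("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  /* 7 0 3 1 */
      return 0;
    }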
++ If index operand op2 has more elements then source operand ++ op3 one need to use only its low half. And vice versa. */ ++ switch (fcode) ++ { ++ case IX86_BUILTIN_SCATTERALTSIV8DF: ++ case IX86_BUILTIN_SCATTERALTSIV8DI: ++ half = gen_reg_rtx (V8SImode); ++ if (!nonimmediate_operand (op2, V16SImode)) ++ op2 = copy_to_mode_reg (V16SImode, op2); ++ emit_insn (gen_vec_extract_lo_v16si (half, op2)); ++ op2 = half; ++ break; ++ case IX86_BUILTIN_SCATTERALTDIV16SF: ++ case IX86_BUILTIN_SCATTERALTDIV16SI: ++ half = gen_reg_rtx (mode3); ++ if (mode3 == V8SFmode) ++ gen = gen_vec_extract_lo_v16sf; ++ else ++ gen = gen_vec_extract_lo_v16si; ++ if (!nonimmediate_operand (op3, GET_MODE (op3))) ++ op3 = copy_to_mode_reg (GET_MODE (op3), op3); ++ emit_insn (gen (half, op3)); ++ op3 = half; ++ break; ++ case IX86_BUILTIN_SCATTERALTSIV4DF: ++ case IX86_BUILTIN_SCATTERALTSIV4DI: ++ half = gen_reg_rtx (V4SImode); ++ if (!nonimmediate_operand (op2, V8SImode)) ++ op2 = copy_to_mode_reg (V8SImode, op2); ++ emit_insn (gen_vec_extract_lo_v8si (half, op2)); ++ op2 = half; ++ break; ++ case IX86_BUILTIN_SCATTERALTDIV8SF: ++ case IX86_BUILTIN_SCATTERALTDIV8SI: ++ half = gen_reg_rtx (mode3); ++ if (mode3 == V4SFmode) ++ gen = gen_vec_extract_lo_v8sf; ++ else ++ gen = gen_vec_extract_lo_v8si; ++ if (!nonimmediate_operand (op3, GET_MODE (op3))) ++ op3 = copy_to_mode_reg (GET_MODE (op3), op3); ++ emit_insn (gen (half, op3)); ++ op3 = half; ++ break; ++ case IX86_BUILTIN_SCATTERALTSIV2DF: ++ case IX86_BUILTIN_SCATTERALTSIV2DI: ++ if (!nonimmediate_operand (op2, V4SImode)) ++ op2 = copy_to_mode_reg (V4SImode, op2); ++ break; ++ case IX86_BUILTIN_SCATTERALTDIV4SF: ++ case IX86_BUILTIN_SCATTERALTDIV4SI: ++ if (!nonimmediate_operand (op3, GET_MODE (op3))) ++ op3 = copy_to_mode_reg (GET_MODE (op3), op3); ++ break; ++ default: ++ break; ++ } ++ ++ /* Force memory operand only with base register here. But we ++ don't want to do it on memory operand for other builtin ++ functions. */ ++ op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1)); ++ ++ if (!insn_data[icode].operand[0].predicate (op0, Pmode)) ++ op0 = copy_to_mode_reg (Pmode, op0); ++ ++ op1 = fixup_modeless_constant (op1, mode1); ++ ++ if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode) ++ { ++ if (!insn_data[icode].operand[1].predicate (op1, mode1)) ++ op1 = copy_to_mode_reg (mode1, op1); ++ } ++ else ++ { ++ op1 = copy_to_reg (op1); ++ op1 = lowpart_subreg (mode1, op1, GET_MODE (op1)); ++ } ++ ++ if (!insn_data[icode].operand[2].predicate (op2, mode2)) ++ op2 = copy_to_mode_reg (mode2, op2); ++ ++ if (!insn_data[icode].operand[3].predicate (op3, mode3)) ++ op3 = copy_to_mode_reg (mode3, op3); ++ ++ if (!insn_data[icode].operand[4].predicate (op4, mode4)) ++ { ++ error ("the last argument must be scale 1, 2, 4, 8"); ++ return const0_rtx; ++ } ++ ++ pat = GEN_FCN (icode) (op0, op1, op2, op3, op4); ++ if (! 
pat) ++ return const0_rtx; ++ ++ emit_insn (pat); ++ return 0; ++ ++ vec_prefetch_gen: ++ arg0 = CALL_EXPR_ARG (exp, 0); ++ arg1 = CALL_EXPR_ARG (exp, 1); ++ arg2 = CALL_EXPR_ARG (exp, 2); ++ arg3 = CALL_EXPR_ARG (exp, 3); ++ arg4 = CALL_EXPR_ARG (exp, 4); ++ op0 = expand_normal (arg0); ++ op1 = expand_normal (arg1); ++ op2 = expand_normal (arg2); ++ op3 = expand_normal (arg3); ++ op4 = expand_normal (arg4); ++ mode0 = insn_data[icode].operand[0].mode; ++ mode1 = insn_data[icode].operand[1].mode; ++ mode3 = insn_data[icode].operand[3].mode; ++ mode4 = insn_data[icode].operand[4].mode; ++ ++ op0 = fixup_modeless_constant (op0, mode0); ++ ++ if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode) ++ { ++ if (!insn_data[icode].operand[0].predicate (op0, mode0)) ++ op0 = copy_to_mode_reg (mode0, op0); ++ } ++ else ++ { ++ op0 = copy_to_reg (op0); ++ op0 = lowpart_subreg (mode0, op0, GET_MODE (op0)); ++ } ++ ++ if (!insn_data[icode].operand[1].predicate (op1, mode1)) ++ op1 = copy_to_mode_reg (mode1, op1); ++ ++ /* Force memory operand only with base register here. But we ++ don't want to do it on memory operand for other builtin ++ functions. */ ++ op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1)); ++ ++ if (!insn_data[icode].operand[2].predicate (op2, Pmode)) ++ op2 = copy_to_mode_reg (Pmode, op2); ++ ++ if (!insn_data[icode].operand[3].predicate (op3, mode3)) ++ { ++ error ("the forth argument must be scale 1, 2, 4, 8"); ++ return const0_rtx; ++ } ++ ++ if (!insn_data[icode].operand[4].predicate (op4, mode4)) ++ { ++ error ("incorrect hint operand"); ++ return const0_rtx; ++ } ++ ++ pat = GEN_FCN (icode) (op0, op1, op2, op3, op4); ++ if (! pat) ++ return const0_rtx; ++ ++ emit_insn (pat); ++ ++ return 0; ++ ++ case IX86_BUILTIN_XABORT: ++ icode = CODE_FOR_xabort; ++ arg0 = CALL_EXPR_ARG (exp, 0); ++ op0 = expand_normal (arg0); ++ mode0 = insn_data[icode].operand[0].mode; ++ if (!insn_data[icode].operand[0].predicate (op0, mode0)) ++ { ++ error ("the argument to % intrinsic must " ++ "be an 8-bit immediate"); ++ return const0_rtx; ++ } ++ emit_insn (gen_xabort (op0)); ++ return 0; ++ ++ case IX86_BUILTIN_RSTORSSP: ++ case IX86_BUILTIN_CLRSSBSY: ++ arg0 = CALL_EXPR_ARG (exp, 0); ++ op0 = expand_normal (arg0); ++ icode = (fcode == IX86_BUILTIN_RSTORSSP ++ ? 
CODE_FOR_rstorssp ++ : CODE_FOR_clrssbsy); ++ if (!address_operand (op0, VOIDmode)) ++ { ++ op1 = convert_memory_address (Pmode, op0); ++ op0 = copy_addr_to_reg (op1); ++ } ++ emit_insn (GEN_FCN (icode) (gen_rtx_MEM (Pmode, op0))); ++ return 0; ++ ++ case IX86_BUILTIN_WRSSD: ++ case IX86_BUILTIN_WRSSQ: ++ case IX86_BUILTIN_WRUSSD: ++ case IX86_BUILTIN_WRUSSQ: ++ arg0 = CALL_EXPR_ARG (exp, 0); ++ op0 = expand_normal (arg0); ++ arg1 = CALL_EXPR_ARG (exp, 1); ++ op1 = expand_normal (arg1); ++ switch (fcode) ++ { ++ case IX86_BUILTIN_WRSSD: ++ icode = CODE_FOR_wrsssi; ++ mode = SImode; ++ break; ++ case IX86_BUILTIN_WRSSQ: ++ icode = CODE_FOR_wrssdi; ++ mode = DImode; ++ break; ++ case IX86_BUILTIN_WRUSSD: ++ icode = CODE_FOR_wrusssi; ++ mode = SImode; ++ break; ++ case IX86_BUILTIN_WRUSSQ: ++ icode = CODE_FOR_wrussdi; ++ mode = DImode; ++ break; ++ } ++ op0 = force_reg (mode, op0); ++ if (!address_operand (op1, VOIDmode)) ++ { ++ op2 = convert_memory_address (Pmode, op1); ++ op1 = copy_addr_to_reg (op2); ++ } ++ emit_insn (GEN_FCN (icode) (op0, gen_rtx_MEM (mode, op1))); ++ return 0; ++ ++ default: ++ break; ++ } ++ ++ if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST ++ && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST) ++ { ++ i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST; ++ return ix86_expand_special_args_builtin (bdesc_special_args + i, exp, ++ target); ++ } ++ ++ if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST ++ && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST) ++ { ++ i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST; ++ rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL; ++ rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx); ++ rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx); ++ int masked = 1; ++ machine_mode mode, wide_mode, nar_mode; ++ ++ nar_mode = V4SFmode; ++ mode = V16SFmode; ++ wide_mode = V64SFmode; ++ fcn_mask = gen_avx5124fmaddps_4fmaddps_mask; ++ fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz; ++ ++ switch (fcode) ++ { ++ case IX86_BUILTIN_4FMAPS: ++ fcn = gen_avx5124fmaddps_4fmaddps; ++ masked = 0; ++ goto v4fma_expand; ++ ++ case IX86_BUILTIN_4DPWSSD: ++ nar_mode = V4SImode; ++ mode = V16SImode; ++ wide_mode = V64SImode; ++ fcn = gen_avx5124vnniw_vp4dpwssd; ++ masked = 0; ++ goto v4fma_expand; ++ ++ case IX86_BUILTIN_4DPWSSDS: ++ nar_mode = V4SImode; ++ mode = V16SImode; ++ wide_mode = V64SImode; ++ fcn = gen_avx5124vnniw_vp4dpwssds; ++ masked = 0; ++ goto v4fma_expand; ++ ++ case IX86_BUILTIN_4FNMAPS: ++ fcn = gen_avx5124fmaddps_4fnmaddps; ++ masked = 0; ++ goto v4fma_expand; ++ ++ case IX86_BUILTIN_4FNMAPS_MASK: ++ fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask; ++ fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz; ++ goto v4fma_expand; ++ ++ case IX86_BUILTIN_4DPWSSD_MASK: ++ nar_mode = V4SImode; ++ mode = V16SImode; ++ wide_mode = V64SImode; ++ fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask; ++ fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz; ++ goto v4fma_expand; ++ ++ case IX86_BUILTIN_4DPWSSDS_MASK: ++ nar_mode = V4SImode; ++ mode = V16SImode; ++ wide_mode = V64SImode; ++ fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask; ++ fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz; ++ goto v4fma_expand; ++ ++ case IX86_BUILTIN_4FMAPS_MASK: ++ { ++ tree args[4]; ++ rtx ops[4]; ++ rtx wide_reg; ++ rtx accum; ++ rtx addr; ++ rtx mem; ++ ++v4fma_expand: ++ wide_reg = gen_reg_rtx (wide_mode); ++ for (i = 0; i < 4; i++) ++ { ++ args[i] = CALL_EXPR_ARG (exp, i); ++ ops[i] = expand_normal (args[i]); ++ ++ emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64), ++ ops[i]); ++ } ++ ++ accum = expand_normal (CALL_EXPR_ARG (exp, 4)); 
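/* Illustrative sketch, not part of this patch: the masked branch just
   below picks between a zero-masking and a merge-masking form of the
   4FMA patterns.  The per-element writemask behaviour it relies on can
   be modelled in plain C roughly as follows; `model_writemask`,
   `computed`, `merge` and `nelts` are made-up names for illustration
   only, not identifiers from GCC.  */

static void
model_writemask (float *dst, const float *computed, const float *merge,
                 unsigned short mask, int nelts)
{
  for (int j = 0; j < nelts; j++)
    /* Bit j of the mask selects the freshly computed element; a clear
       bit keeps the merge source (merge-masking) or, when the merge
       vector is all zeros, yields zero (zero-masking).  */
    dst[j] = ((mask >> j) & 1) ? computed[j] : merge[j];
}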
++ accum = force_reg (mode, accum); ++ ++ addr = expand_normal (CALL_EXPR_ARG (exp, 5)); ++ addr = force_reg (Pmode, addr); ++ ++ mem = gen_rtx_MEM (nar_mode, addr); ++ ++ target = gen_reg_rtx (mode); ++ ++ emit_move_insn (target, accum); ++ ++ if (! masked) ++ emit_insn (fcn (target, accum, wide_reg, mem)); ++ else ++ { ++ rtx merge, mask; ++ merge = expand_normal (CALL_EXPR_ARG (exp, 6)); ++ ++ mask = expand_normal (CALL_EXPR_ARG (exp, 7)); ++ ++ if (CONST_INT_P (mask)) ++ mask = fixup_modeless_constant (mask, HImode); ++ ++ mask = force_reg (HImode, mask); ++ ++ if (GET_MODE (mask) != HImode) ++ mask = gen_rtx_SUBREG (HImode, mask, 0); ++ ++ /* If merge is 0 then we're about to emit z-masked variant. */ ++ if (const0_operand (merge, mode)) ++ emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask)); ++ /* If merge is the same as accum then emit merge-masked variant. */ ++ else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4)) ++ { ++ merge = force_reg (mode, merge); ++ emit_insn (fcn_mask (target, wide_reg, mem, merge, mask)); ++ } ++ /* Merge with something unknown might happen if we z-mask w/ -O0. */ ++ else ++ { ++ target = gen_reg_rtx (mode); ++ emit_move_insn (target, merge); ++ emit_insn (fcn_mask (target, wide_reg, mem, target, mask)); ++ } ++ } ++ return target; ++ } ++ ++ case IX86_BUILTIN_4FNMASS: ++ fcn = gen_avx5124fmaddps_4fnmaddss; ++ masked = 0; ++ goto s4fma_expand; ++ ++ case IX86_BUILTIN_4FMASS: ++ fcn = gen_avx5124fmaddps_4fmaddss; ++ masked = 0; ++ goto s4fma_expand; ++ ++ case IX86_BUILTIN_4FNMASS_MASK: ++ fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask; ++ fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz; ++ goto s4fma_expand; ++ ++ case IX86_BUILTIN_4FMASS_MASK: ++ { ++ tree args[4]; ++ rtx ops[4]; ++ rtx wide_reg; ++ rtx accum; ++ rtx addr; ++ rtx mem; ++ ++ fcn_mask = gen_avx5124fmaddps_4fmaddss_mask; ++ fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz; ++ ++s4fma_expand: ++ mode = V4SFmode; ++ wide_reg = gen_reg_rtx (V64SFmode); ++ for (i = 0; i < 4; i++) ++ { ++ rtx tmp; ++ args[i] = CALL_EXPR_ARG (exp, i); ++ ops[i] = expand_normal (args[i]); ++ ++ tmp = gen_reg_rtx (SFmode); ++ emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0)); ++ ++ emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64), ++ gen_rtx_SUBREG (V16SFmode, tmp, 0)); ++ } ++ ++ accum = expand_normal (CALL_EXPR_ARG (exp, 4)); ++ accum = force_reg (V4SFmode, accum); ++ ++ addr = expand_normal (CALL_EXPR_ARG (exp, 5)); ++ addr = force_reg (Pmode, addr); ++ ++ mem = gen_rtx_MEM (V4SFmode, addr); ++ ++ target = gen_reg_rtx (V4SFmode); ++ ++ emit_move_insn (target, accum); ++ ++ if (! masked) ++ emit_insn (fcn (target, accum, wide_reg, mem)); ++ else ++ { ++ rtx merge, mask; ++ merge = expand_normal (CALL_EXPR_ARG (exp, 6)); ++ ++ mask = expand_normal (CALL_EXPR_ARG (exp, 7)); ++ ++ if (CONST_INT_P (mask)) ++ mask = fixup_modeless_constant (mask, QImode); ++ ++ mask = force_reg (QImode, mask); ++ ++ if (GET_MODE (mask) != QImode) ++ mask = gen_rtx_SUBREG (QImode, mask, 0); ++ ++ /* If merge is 0 then we're about to emit z-masked variant. */ ++ if (const0_operand (merge, mode)) ++ emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask)); ++ /* If merge is the same as accum then emit merge-masked ++ variant. */ ++ else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4)) ++ { ++ merge = force_reg (mode, merge); ++ emit_insn (fcn_mask (target, wide_reg, mem, merge, mask)); ++ } ++ /* Merge with something unknown might happen if we z-mask ++ w/ -O0. 
*/ ++ else ++ { ++ target = gen_reg_rtx (mode); ++ emit_move_insn (target, merge); ++ emit_insn (fcn_mask (target, wide_reg, mem, target, mask)); ++ } ++ } ++ return target; ++ } ++ case IX86_BUILTIN_RDPID: ++ return ix86_expand_special_args_builtin (bdesc_args + i, exp, ++ target); ++ case IX86_BUILTIN_FABSQ: ++ case IX86_BUILTIN_COPYSIGNQ: ++ if (!TARGET_SSE) ++ /* Emit a normal call if SSE isn't available. */ ++ return expand_call (exp, target, ignore); ++ /* FALLTHRU */ ++ default: ++ return ix86_expand_args_builtin (bdesc_args + i, exp, target); ++ } ++ } ++ ++ if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST ++ && fcode <= IX86_BUILTIN__BDESC_COMI_LAST) ++ { ++ i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST; ++ return ix86_expand_sse_comi (bdesc_comi + i, exp, target); ++ } ++ ++ if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST ++ && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST) ++ { ++ i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST; ++ return ix86_expand_round_builtin (bdesc_round_args + i, exp, target); ++ } ++ ++ if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST ++ && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST) ++ { ++ i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST; ++ return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target); ++ } ++ ++ if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST ++ && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST) ++ { ++ i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST; ++ return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target); ++ } ++ ++ if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST ++ && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST) ++ { ++ i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST; ++ const struct builtin_description *d = bdesc_multi_arg + i; ++ return ix86_expand_multi_arg_builtin (d->icode, exp, target, ++ (enum ix86_builtin_func_type) ++ d->flag, d->comparison); ++ } ++ ++ if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST ++ && fcode <= IX86_BUILTIN__BDESC_CET_LAST) ++ { ++ i = fcode - IX86_BUILTIN__BDESC_CET_FIRST; ++ return ix86_expand_special_args_builtin (bdesc_cet + i, exp, ++ target); ++ } ++ ++ if (fcode >= IX86_BUILTIN__BDESC_CET_NORMAL_FIRST ++ && fcode <= IX86_BUILTIN__BDESC_CET_NORMAL_LAST) ++ { ++ i = fcode - IX86_BUILTIN__BDESC_CET_NORMAL_FIRST; ++ return ix86_expand_special_args_builtin (bdesc_cet_rdssp + i, exp, ++ target); ++ } ++ ++ gcc_unreachable (); ++} ++ ++/* A subroutine of ix86_expand_vector_init_duplicate. Tries to ++ fill target with val via vec_duplicate. */ ++ ++static bool ++ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val) ++{ ++ bool ok; ++ rtx_insn *insn; ++ rtx dup; ++ ++ /* First attempt to recognize VAL as-is. */ ++ dup = gen_vec_duplicate (mode, val); ++ insn = emit_insn (gen_rtx_SET (target, dup)); ++ if (recog_memoized (insn) < 0) ++ { ++ rtx_insn *seq; ++ machine_mode innermode = GET_MODE_INNER (mode); ++ rtx reg; ++ ++ /* If that fails, force VAL into a register. */ ++ ++ start_sequence (); ++ reg = force_reg (innermode, val); ++ if (GET_MODE (reg) != innermode) ++ reg = gen_lowpart (innermode, reg); ++ SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg); ++ seq = get_insns (); ++ end_sequence (); ++ if (seq) ++ emit_insn_before (seq, insn); ++ ++ ok = recog_memoized (insn) >= 0; ++ gcc_assert (ok); ++ } ++ return true; ++} ++ ++/* Get a vector mode of the same size as the original but with elements ++ twice as wide. This is only guaranteed to apply to integral vectors. */ ++ ++static machine_mode ++get_mode_wider_vector (machine_mode o) ++{ ++ /* ??? 
Rely on the ordering that genmodes.c gives to vectors. */ ++ machine_mode n = GET_MODE_WIDER_MODE (o).require (); ++ gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2); ++ gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n)); ++ return n; ++} ++ ++static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d); ++static bool expand_vec_perm_1 (struct expand_vec_perm_d *d); ++ ++/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector ++ with all elements equal to VAR. Return true if successful. */ ++ ++static bool ++ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode, ++ rtx target, rtx val) ++{ ++ bool ok; ++ ++ switch (mode) ++ { ++ case E_V2SImode: ++ case E_V2SFmode: ++ if (!mmx_ok) ++ return false; ++ /* FALLTHRU */ ++ ++ case E_V4DFmode: ++ case E_V4DImode: ++ case E_V8SFmode: ++ case E_V8SImode: ++ case E_V2DFmode: ++ case E_V2DImode: ++ case E_V4SFmode: ++ case E_V4SImode: ++ case E_V16SImode: ++ case E_V8DImode: ++ case E_V16SFmode: ++ case E_V8DFmode: ++ return ix86_vector_duplicate_value (mode, target, val); ++ ++ case E_V4HImode: ++ if (!mmx_ok) ++ return false; ++ if (TARGET_SSE || TARGET_3DNOW_A) ++ { ++ rtx x; ++ ++ val = gen_lowpart (SImode, val); ++ x = gen_rtx_TRUNCATE (HImode, val); ++ x = gen_rtx_VEC_DUPLICATE (mode, x); ++ emit_insn (gen_rtx_SET (target, x)); ++ return true; ++ } ++ goto widen; ++ ++ case E_V8QImode: ++ if (!mmx_ok) ++ return false; ++ goto widen; ++ ++ case E_V8HImode: ++ if (TARGET_AVX2) ++ return ix86_vector_duplicate_value (mode, target, val); ++ ++ if (TARGET_SSE2) ++ { ++ struct expand_vec_perm_d dperm; ++ rtx tmp1, tmp2; ++ ++ permute: ++ memset (&dperm, 0, sizeof (dperm)); ++ dperm.target = target; ++ dperm.vmode = mode; ++ dperm.nelt = GET_MODE_NUNITS (mode); ++ dperm.op0 = dperm.op1 = gen_reg_rtx (mode); ++ dperm.one_operand_p = true; ++ ++ /* Extend to SImode using a paradoxical SUBREG. */ ++ tmp1 = gen_reg_rtx (SImode); ++ emit_move_insn (tmp1, gen_lowpart (SImode, val)); ++ ++ /* Insert the SImode value as low element of a V4SImode vector. */ ++ tmp2 = gen_reg_rtx (V4SImode); ++ emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1)); ++ emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2)); ++ ++ ok = (expand_vec_perm_1 (&dperm) ++ || expand_vec_perm_broadcast_1 (&dperm)); ++ gcc_assert (ok); ++ return ok; ++ } ++ goto widen; ++ ++ case E_V16QImode: ++ if (TARGET_AVX2) ++ return ix86_vector_duplicate_value (mode, target, val); ++ ++ if (TARGET_SSE2) ++ goto permute; ++ goto widen; ++ ++ widen: ++ /* Replicate the value once into the next wider mode and recurse. */ ++ { ++ machine_mode smode, wsmode, wvmode; ++ rtx x; ++ ++ smode = GET_MODE_INNER (mode); ++ wvmode = get_mode_wider_vector (mode); ++ wsmode = GET_MODE_INNER (wvmode); ++ ++ val = convert_modes (wsmode, smode, val, true); ++ x = expand_simple_binop (wsmode, ASHIFT, val, ++ GEN_INT (GET_MODE_BITSIZE (smode)), ++ NULL_RTX, 1, OPTAB_LIB_WIDEN); ++ val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN); ++ ++ x = gen_reg_rtx (wvmode); ++ ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val); ++ gcc_assert (ok); ++ emit_move_insn (target, gen_lowpart (GET_MODE (target), x)); ++ return ok; ++ } ++ ++ case E_V16HImode: ++ case E_V32QImode: ++ if (TARGET_AVX2) ++ return ix86_vector_duplicate_value (mode, target, val); ++ else ++ { ++ machine_mode hvmode = (mode == V16HImode ? 
V8HImode : V16QImode); ++ rtx x = gen_reg_rtx (hvmode); ++ ++ ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val); ++ gcc_assert (ok); ++ ++ x = gen_rtx_VEC_CONCAT (mode, x, x); ++ emit_insn (gen_rtx_SET (target, x)); ++ } ++ return true; ++ ++ case E_V64QImode: ++ case E_V32HImode: ++ if (TARGET_AVX512BW) ++ return ix86_vector_duplicate_value (mode, target, val); ++ else ++ { ++ machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode); ++ rtx x = gen_reg_rtx (hvmode); ++ ++ ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val); ++ gcc_assert (ok); ++ ++ x = gen_rtx_VEC_CONCAT (mode, x, x); ++ emit_insn (gen_rtx_SET (target, x)); ++ } ++ return true; ++ ++ default: ++ return false; ++ } ++} ++ ++/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector ++ whose ONE_VAR element is VAR, and other elements are zero. Return true ++ if successful. */ ++ ++static bool ++ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode, ++ rtx target, rtx var, int one_var) ++{ ++ machine_mode vsimode; ++ rtx new_target; ++ rtx x, tmp; ++ bool use_vector_set = false; ++ rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL; ++ ++ switch (mode) ++ { ++ case E_V2DImode: ++ /* For SSE4.1, we normally use vector set. But if the second ++ element is zero and inter-unit moves are OK, we use movq ++ instead. */ ++ use_vector_set = (TARGET_64BIT && TARGET_SSE4_1 ++ && !(TARGET_INTER_UNIT_MOVES_TO_VEC ++ && one_var == 0)); ++ break; ++ case E_V16QImode: ++ case E_V4SImode: ++ case E_V4SFmode: ++ use_vector_set = TARGET_SSE4_1; ++ break; ++ case E_V8HImode: ++ use_vector_set = TARGET_SSE2; ++ break; ++ case E_V4HImode: ++ use_vector_set = TARGET_SSE || TARGET_3DNOW_A; ++ break; ++ case E_V32QImode: ++ case E_V16HImode: ++ use_vector_set = TARGET_AVX; ++ break; ++ case E_V8SImode: ++ use_vector_set = TARGET_AVX; ++ gen_vec_set_0 = gen_vec_setv8si_0; ++ break; ++ case E_V8SFmode: ++ use_vector_set = TARGET_AVX; ++ gen_vec_set_0 = gen_vec_setv8sf_0; ++ break; ++ case E_V4DFmode: ++ use_vector_set = TARGET_AVX; ++ gen_vec_set_0 = gen_vec_setv4df_0; ++ break; ++ case E_V4DImode: ++ /* Use ix86_expand_vector_set in 64bit mode only. */ ++ use_vector_set = TARGET_AVX && TARGET_64BIT; ++ gen_vec_set_0 = gen_vec_setv4di_0; ++ break; ++ case E_V16SImode: ++ use_vector_set = TARGET_AVX512F && one_var == 0; ++ gen_vec_set_0 = gen_vec_setv16si_0; ++ break; ++ case E_V16SFmode: ++ use_vector_set = TARGET_AVX512F && one_var == 0; ++ gen_vec_set_0 = gen_vec_setv16sf_0; ++ break; ++ case E_V8DFmode: ++ use_vector_set = TARGET_AVX512F && one_var == 0; ++ gen_vec_set_0 = gen_vec_setv8df_0; ++ break; ++ case E_V8DImode: ++ /* Use ix86_expand_vector_set in 64bit mode only. 
*/ ++ use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0; ++ gen_vec_set_0 = gen_vec_setv8di_0; ++ break; ++ default: ++ break; ++ } ++ ++ if (use_vector_set) ++ { ++ if (gen_vec_set_0 && one_var == 0) ++ { ++ var = force_reg (GET_MODE_INNER (mode), var); ++ emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var)); ++ return true; ++ } ++ emit_insn (gen_rtx_SET (target, CONST0_RTX (mode))); ++ var = force_reg (GET_MODE_INNER (mode), var); ++ ix86_expand_vector_set (mmx_ok, target, var, one_var); ++ return true; ++ } ++ ++ switch (mode) ++ { ++ case E_V2SFmode: ++ case E_V2SImode: ++ if (!mmx_ok) ++ return false; ++ /* FALLTHRU */ ++ ++ case E_V2DFmode: ++ case E_V2DImode: ++ if (one_var != 0) ++ return false; ++ var = force_reg (GET_MODE_INNER (mode), var); ++ x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode))); ++ emit_insn (gen_rtx_SET (target, x)); ++ return true; ++ ++ case E_V4SFmode: ++ case E_V4SImode: ++ if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER) ++ new_target = gen_reg_rtx (mode); ++ else ++ new_target = target; ++ var = force_reg (GET_MODE_INNER (mode), var); ++ x = gen_rtx_VEC_DUPLICATE (mode, var); ++ x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx); ++ emit_insn (gen_rtx_SET (new_target, x)); ++ if (one_var != 0) ++ { ++ /* We need to shuffle the value to the correct position, so ++ create a new pseudo to store the intermediate result. */ ++ ++ /* With SSE2, we can use the integer shuffle insns. */ ++ if (mode != V4SFmode && TARGET_SSE2) ++ { ++ emit_insn (gen_sse2_pshufd_1 (new_target, new_target, ++ const1_rtx, ++ GEN_INT (one_var == 1 ? 0 : 1), ++ GEN_INT (one_var == 2 ? 0 : 1), ++ GEN_INT (one_var == 3 ? 0 : 1))); ++ if (target != new_target) ++ emit_move_insn (target, new_target); ++ return true; ++ } ++ ++ /* Otherwise convert the intermediate result to V4SFmode and ++ use the SSE1 shuffle instructions. */ ++ if (mode != V4SFmode) ++ { ++ tmp = gen_reg_rtx (V4SFmode); ++ emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target)); ++ } ++ else ++ tmp = new_target; ++ ++ emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp, ++ const1_rtx, ++ GEN_INT (one_var == 1 ? 0 : 1), ++ GEN_INT (one_var == 2 ? 0+4 : 1+4), ++ GEN_INT (one_var == 3 ? 0+4 : 1+4))); ++ ++ if (mode != V4SFmode) ++ emit_move_insn (target, gen_lowpart (V4SImode, tmp)); ++ else if (tmp != target) ++ emit_move_insn (target, tmp); ++ } ++ else if (target != new_target) ++ emit_move_insn (target, new_target); ++ return true; ++ ++ case E_V8HImode: ++ case E_V16QImode: ++ vsimode = V4SImode; ++ goto widen; ++ case E_V4HImode: ++ case E_V8QImode: ++ if (!mmx_ok) ++ return false; ++ vsimode = V2SImode; ++ goto widen; ++ widen: ++ if (one_var != 0) ++ return false; ++ ++ /* Zero extend the variable element to SImode and recurse. */ ++ var = convert_modes (SImode, GET_MODE_INNER (mode), var, true); ++ ++ x = gen_reg_rtx (vsimode); ++ if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x, ++ var, one_var)) ++ gcc_unreachable (); ++ ++ emit_move_insn (target, gen_lowpart (mode, x)); ++ return true; ++ ++ default: ++ return false; ++ } ++} ++ ++/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector ++ consisting of the values in VALS. It is known that all elements ++ except ONE_VAR are constants. Return true if successful. 
*/ ++ ++static bool ++ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode, ++ rtx target, rtx vals, int one_var) ++{ ++ rtx var = XVECEXP (vals, 0, one_var); ++ machine_mode wmode; ++ rtx const_vec, x; ++ ++ const_vec = copy_rtx (vals); ++ XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode)); ++ const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0)); ++ ++ switch (mode) ++ { ++ case E_V2DFmode: ++ case E_V2DImode: ++ case E_V2SFmode: ++ case E_V2SImode: ++ /* For the two element vectors, it's just as easy to use ++ the general case. */ ++ return false; ++ ++ case E_V4DImode: ++ /* Use ix86_expand_vector_set in 64bit mode only. */ ++ if (!TARGET_64BIT) ++ return false; ++ /* FALLTHRU */ ++ case E_V4DFmode: ++ case E_V8SFmode: ++ case E_V8SImode: ++ case E_V16HImode: ++ case E_V32QImode: ++ case E_V4SFmode: ++ case E_V4SImode: ++ case E_V8HImode: ++ case E_V4HImode: ++ break; ++ ++ case E_V16QImode: ++ if (TARGET_SSE4_1) ++ break; ++ wmode = V8HImode; ++ goto widen; ++ case E_V8QImode: ++ wmode = V4HImode; ++ goto widen; ++ widen: ++ /* There's no way to set one QImode entry easily. Combine ++ the variable value with its adjacent constant value, and ++ promote to an HImode set. */ ++ x = XVECEXP (vals, 0, one_var ^ 1); ++ if (one_var & 1) ++ { ++ var = convert_modes (HImode, QImode, var, true); ++ var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8), ++ NULL_RTX, 1, OPTAB_LIB_WIDEN); ++ x = GEN_INT (INTVAL (x) & 0xff); ++ } ++ else ++ { ++ var = convert_modes (HImode, QImode, var, true); ++ x = gen_int_mode (UINTVAL (x) << 8, HImode); ++ } ++ if (x != const0_rtx) ++ var = expand_simple_binop (HImode, IOR, var, x, var, ++ 1, OPTAB_LIB_WIDEN); ++ ++ x = gen_reg_rtx (wmode); ++ emit_move_insn (x, gen_lowpart (wmode, const_vec)); ++ ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1); ++ ++ emit_move_insn (target, gen_lowpart (mode, x)); ++ return true; ++ ++ default: ++ return false; ++ } ++ ++ emit_move_insn (target, const_vec); ++ ix86_expand_vector_set (mmx_ok, target, var, one_var); ++ return true; ++} ++ ++/* A subroutine of ix86_expand_vector_init_general. Use vector ++ concatenate to handle the most general case: all values variable, ++ and none identical. 
*/ ++ ++static void ++ix86_expand_vector_init_concat (machine_mode mode, ++ rtx target, rtx *ops, int n) ++{ ++ machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode; ++ rtx first[16], second[8], third[4]; ++ rtvec v; ++ int i, j; ++ ++ switch (n) ++ { ++ case 2: ++ switch (mode) ++ { ++ case E_V16SImode: ++ cmode = V8SImode; ++ break; ++ case E_V16SFmode: ++ cmode = V8SFmode; ++ break; ++ case E_V8DImode: ++ cmode = V4DImode; ++ break; ++ case E_V8DFmode: ++ cmode = V4DFmode; ++ break; ++ case E_V8SImode: ++ cmode = V4SImode; ++ break; ++ case E_V8SFmode: ++ cmode = V4SFmode; ++ break; ++ case E_V4DImode: ++ cmode = V2DImode; ++ break; ++ case E_V4DFmode: ++ cmode = V2DFmode; ++ break; ++ case E_V4SImode: ++ cmode = V2SImode; ++ break; ++ case E_V4SFmode: ++ cmode = V2SFmode; ++ break; ++ case E_V2DImode: ++ cmode = DImode; ++ break; ++ case E_V2SImode: ++ cmode = SImode; ++ break; ++ case E_V2DFmode: ++ cmode = DFmode; ++ break; ++ case E_V2SFmode: ++ cmode = SFmode; ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ ++ if (!register_operand (ops[1], cmode)) ++ ops[1] = force_reg (cmode, ops[1]); ++ if (!register_operand (ops[0], cmode)) ++ ops[0] = force_reg (cmode, ops[0]); ++ emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0], ++ ops[1]))); ++ break; ++ ++ case 4: ++ switch (mode) ++ { ++ case E_V4DImode: ++ cmode = V2DImode; ++ break; ++ case E_V4DFmode: ++ cmode = V2DFmode; ++ break; ++ case E_V4SImode: ++ cmode = V2SImode; ++ break; ++ case E_V4SFmode: ++ cmode = V2SFmode; ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ goto half; ++ ++ case 8: ++ switch (mode) ++ { ++ case E_V8DImode: ++ cmode = V2DImode; ++ hmode = V4DImode; ++ break; ++ case E_V8DFmode: ++ cmode = V2DFmode; ++ hmode = V4DFmode; ++ break; ++ case E_V8SImode: ++ cmode = V2SImode; ++ hmode = V4SImode; ++ break; ++ case E_V8SFmode: ++ cmode = V2SFmode; ++ hmode = V4SFmode; ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ goto half; ++ ++ case 16: ++ switch (mode) ++ { ++ case E_V16SImode: ++ cmode = V2SImode; ++ hmode = V4SImode; ++ gmode = V8SImode; ++ break; ++ case E_V16SFmode: ++ cmode = V2SFmode; ++ hmode = V4SFmode; ++ gmode = V8SFmode; ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ goto half; ++ ++half: ++ /* FIXME: We process inputs backward to help RA. PR 36222. */ ++ i = n - 1; ++ j = (n >> 1) - 1; ++ for (; i > 0; i -= 2, j--) ++ { ++ first[j] = gen_reg_rtx (cmode); ++ v = gen_rtvec (2, ops[i - 1], ops[i]); ++ ix86_expand_vector_init (false, first[j], ++ gen_rtx_PARALLEL (cmode, v)); ++ } ++ ++ n >>= 1; ++ if (n > 4) ++ { ++ gcc_assert (hmode != VOIDmode); ++ gcc_assert (gmode != VOIDmode); ++ for (i = j = 0; i < n; i += 2, j++) ++ { ++ second[j] = gen_reg_rtx (hmode); ++ ix86_expand_vector_init_concat (hmode, second [j], ++ &first [i], 2); ++ } ++ n >>= 1; ++ for (i = j = 0; i < n; i += 2, j++) ++ { ++ third[j] = gen_reg_rtx (gmode); ++ ix86_expand_vector_init_concat (gmode, third[j], ++ &second[i], 2); ++ } ++ n >>= 1; ++ ix86_expand_vector_init_concat (mode, target, third, n); ++ } ++ else if (n > 2) ++ { ++ gcc_assert (hmode != VOIDmode); ++ for (i = j = 0; i < n; i += 2, j++) ++ { ++ second[j] = gen_reg_rtx (hmode); ++ ix86_expand_vector_init_concat (hmode, second [j], ++ &first [i], 2); ++ } ++ n >>= 1; ++ ix86_expand_vector_init_concat (mode, target, second, n); ++ } ++ else ++ ix86_expand_vector_init_concat (mode, target, first, n); ++ break; ++ ++ default: ++ gcc_unreachable (); ++ } ++} ++ ++/* A subroutine of ix86_expand_vector_init_general. 
Use vector ++ interleave to handle the most general case: all values variable, ++ and none identical. */ ++ ++static void ++ix86_expand_vector_init_interleave (machine_mode mode, ++ rtx target, rtx *ops, int n) ++{ ++ machine_mode first_imode, second_imode, third_imode, inner_mode; ++ int i, j; ++ rtx op0, op1; ++ rtx (*gen_load_even) (rtx, rtx, rtx); ++ rtx (*gen_interleave_first_low) (rtx, rtx, rtx); ++ rtx (*gen_interleave_second_low) (rtx, rtx, rtx); ++ ++ switch (mode) ++ { ++ case E_V8HImode: ++ gen_load_even = gen_vec_setv8hi; ++ gen_interleave_first_low = gen_vec_interleave_lowv4si; ++ gen_interleave_second_low = gen_vec_interleave_lowv2di; ++ inner_mode = HImode; ++ first_imode = V4SImode; ++ second_imode = V2DImode; ++ third_imode = VOIDmode; ++ break; ++ case E_V16QImode: ++ gen_load_even = gen_vec_setv16qi; ++ gen_interleave_first_low = gen_vec_interleave_lowv8hi; ++ gen_interleave_second_low = gen_vec_interleave_lowv4si; ++ inner_mode = QImode; ++ first_imode = V8HImode; ++ second_imode = V4SImode; ++ third_imode = V2DImode; ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ ++ for (i = 0; i < n; i++) ++ { ++ /* Extend the odd elment to SImode using a paradoxical SUBREG. */ ++ op0 = gen_reg_rtx (SImode); ++ emit_move_insn (op0, gen_lowpart (SImode, ops [i + i])); ++ ++ /* Insert the SImode value as low element of V4SImode vector. */ ++ op1 = gen_reg_rtx (V4SImode); ++ op0 = gen_rtx_VEC_MERGE (V4SImode, ++ gen_rtx_VEC_DUPLICATE (V4SImode, ++ op0), ++ CONST0_RTX (V4SImode), ++ const1_rtx); ++ emit_insn (gen_rtx_SET (op1, op0)); ++ ++ /* Cast the V4SImode vector back to a vector in orignal mode. */ ++ op0 = gen_reg_rtx (mode); ++ emit_move_insn (op0, gen_lowpart (mode, op1)); ++ ++ /* Load even elements into the second position. */ ++ emit_insn (gen_load_even (op0, ++ force_reg (inner_mode, ++ ops [i + i + 1]), ++ const1_rtx)); ++ ++ /* Cast vector to FIRST_IMODE vector. */ ++ ops[i] = gen_reg_rtx (first_imode); ++ emit_move_insn (ops[i], gen_lowpart (first_imode, op0)); ++ } ++ ++ /* Interleave low FIRST_IMODE vectors. */ ++ for (i = j = 0; i < n; i += 2, j++) ++ { ++ op0 = gen_reg_rtx (first_imode); ++ emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1])); ++ ++ /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */ ++ ops[j] = gen_reg_rtx (second_imode); ++ emit_move_insn (ops[j], gen_lowpart (second_imode, op0)); ++ } ++ ++ /* Interleave low SECOND_IMODE vectors. */ ++ switch (second_imode) ++ { ++ case E_V4SImode: ++ for (i = j = 0; i < n / 2; i += 2, j++) ++ { ++ op0 = gen_reg_rtx (second_imode); ++ emit_insn (gen_interleave_second_low (op0, ops[i], ++ ops[i + 1])); ++ ++ /* Cast the SECOND_IMODE vector to the THIRD_IMODE ++ vector. */ ++ ops[j] = gen_reg_rtx (third_imode); ++ emit_move_insn (ops[j], gen_lowpart (third_imode, op0)); ++ } ++ second_imode = V2DImode; ++ gen_interleave_second_low = gen_vec_interleave_lowv2di; ++ /* FALLTHRU */ ++ ++ case E_V2DImode: ++ op0 = gen_reg_rtx (second_imode); ++ emit_insn (gen_interleave_second_low (op0, ops[0], ++ ops[1])); ++ ++ /* Cast the SECOND_IMODE vector back to a vector on original ++ mode. */ ++ emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0))); ++ break; ++ ++ default: ++ gcc_unreachable (); ++ } ++} ++ ++/* A subroutine of ix86_expand_vector_init. Handle the most general case: ++ all values variable, and none identical. 
*/ ++ ++static void ++ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode, ++ rtx target, rtx vals) ++{ ++ rtx ops[64], op0, op1, op2, op3, op4, op5; ++ machine_mode half_mode = VOIDmode; ++ machine_mode quarter_mode = VOIDmode; ++ int n, i; ++ ++ switch (mode) ++ { ++ case E_V2SFmode: ++ case E_V2SImode: ++ if (!mmx_ok && !TARGET_SSE) ++ break; ++ /* FALLTHRU */ ++ ++ case E_V16SImode: ++ case E_V16SFmode: ++ case E_V8DFmode: ++ case E_V8DImode: ++ case E_V8SFmode: ++ case E_V8SImode: ++ case E_V4DFmode: ++ case E_V4DImode: ++ case E_V4SFmode: ++ case E_V4SImode: ++ case E_V2DFmode: ++ case E_V2DImode: ++ n = GET_MODE_NUNITS (mode); ++ for (i = 0; i < n; i++) ++ ops[i] = XVECEXP (vals, 0, i); ++ ix86_expand_vector_init_concat (mode, target, ops, n); ++ return; ++ ++ case E_V2TImode: ++ for (i = 0; i < 2; i++) ++ ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i)); ++ op0 = gen_reg_rtx (V4DImode); ++ ix86_expand_vector_init_concat (V4DImode, op0, ops, 2); ++ emit_move_insn (target, gen_lowpart (GET_MODE (target), op0)); ++ return; ++ ++ case E_V4TImode: ++ for (i = 0; i < 4; i++) ++ ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i)); ++ ops[4] = gen_reg_rtx (V4DImode); ++ ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2); ++ ops[5] = gen_reg_rtx (V4DImode); ++ ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2); ++ op0 = gen_reg_rtx (V8DImode); ++ ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2); ++ emit_move_insn (target, gen_lowpart (GET_MODE (target), op0)); ++ return; ++ ++ case E_V32QImode: ++ half_mode = V16QImode; ++ goto half; ++ ++ case E_V16HImode: ++ half_mode = V8HImode; ++ goto half; ++ ++half: ++ n = GET_MODE_NUNITS (mode); ++ for (i = 0; i < n; i++) ++ ops[i] = XVECEXP (vals, 0, i); ++ op0 = gen_reg_rtx (half_mode); ++ op1 = gen_reg_rtx (half_mode); ++ ix86_expand_vector_init_interleave (half_mode, op0, ops, ++ n >> 2); ++ ix86_expand_vector_init_interleave (half_mode, op1, ++ &ops [n >> 1], n >> 2); ++ emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1))); ++ return; ++ ++ case E_V64QImode: ++ quarter_mode = V16QImode; ++ half_mode = V32QImode; ++ goto quarter; ++ ++ case E_V32HImode: ++ quarter_mode = V8HImode; ++ half_mode = V16HImode; ++ goto quarter; ++ ++quarter: ++ n = GET_MODE_NUNITS (mode); ++ for (i = 0; i < n; i++) ++ ops[i] = XVECEXP (vals, 0, i); ++ op0 = gen_reg_rtx (quarter_mode); ++ op1 = gen_reg_rtx (quarter_mode); ++ op2 = gen_reg_rtx (quarter_mode); ++ op3 = gen_reg_rtx (quarter_mode); ++ op4 = gen_reg_rtx (half_mode); ++ op5 = gen_reg_rtx (half_mode); ++ ix86_expand_vector_init_interleave (quarter_mode, op0, ops, ++ n >> 3); ++ ix86_expand_vector_init_interleave (quarter_mode, op1, ++ &ops [n >> 2], n >> 3); ++ ix86_expand_vector_init_interleave (quarter_mode, op2, ++ &ops [n >> 1], n >> 3); ++ ix86_expand_vector_init_interleave (quarter_mode, op3, ++ &ops [(n >> 1) | (n >> 2)], n >> 3); ++ emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1))); ++ emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3))); ++ emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5))); ++ return; ++ ++ case E_V16QImode: ++ if (!TARGET_SSE4_1) ++ break; ++ /* FALLTHRU */ ++ ++ case E_V8HImode: ++ if (!TARGET_SSE2) ++ break; ++ ++ /* Don't use ix86_expand_vector_init_interleave if we can't ++ move from GPR to SSE register directly. 
*/ ++ if (!TARGET_INTER_UNIT_MOVES_TO_VEC) ++ break; ++ ++ n = GET_MODE_NUNITS (mode); ++ for (i = 0; i < n; i++) ++ ops[i] = XVECEXP (vals, 0, i); ++ ix86_expand_vector_init_interleave (mode, target, ops, n >> 1); ++ return; ++ ++ case E_V4HImode: ++ case E_V8QImode: ++ break; ++ ++ default: ++ gcc_unreachable (); ++ } ++ ++ { ++ int i, j, n_elts, n_words, n_elt_per_word; ++ machine_mode inner_mode; ++ rtx words[4], shift; ++ ++ inner_mode = GET_MODE_INNER (mode); ++ n_elts = GET_MODE_NUNITS (mode); ++ n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD; ++ n_elt_per_word = n_elts / n_words; ++ shift = GEN_INT (GET_MODE_BITSIZE (inner_mode)); ++ ++ for (i = 0; i < n_words; ++i) ++ { ++ rtx word = NULL_RTX; ++ ++ for (j = 0; j < n_elt_per_word; ++j) ++ { ++ rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1); ++ elt = convert_modes (word_mode, inner_mode, elt, true); ++ ++ if (j == 0) ++ word = elt; ++ else ++ { ++ word = expand_simple_binop (word_mode, ASHIFT, word, shift, ++ word, 1, OPTAB_LIB_WIDEN); ++ word = expand_simple_binop (word_mode, IOR, word, elt, ++ word, 1, OPTAB_LIB_WIDEN); ++ } ++ } ++ ++ words[i] = word; ++ } ++ ++ if (n_words == 1) ++ emit_move_insn (target, gen_lowpart (mode, words[0])); ++ else if (n_words == 2) ++ { ++ rtx tmp = gen_reg_rtx (mode); ++ emit_clobber (tmp); ++ emit_move_insn (gen_lowpart (word_mode, tmp), words[0]); ++ emit_move_insn (gen_highpart (word_mode, tmp), words[1]); ++ emit_move_insn (target, tmp); ++ } ++ else if (n_words == 4) ++ { ++ rtx tmp = gen_reg_rtx (V4SImode); ++ gcc_assert (word_mode == SImode); ++ vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words)); ++ ix86_expand_vector_init_general (false, V4SImode, tmp, vals); ++ emit_move_insn (target, gen_lowpart (mode, tmp)); ++ } ++ else ++ gcc_unreachable (); ++ } ++} ++ ++/* Initialize vector TARGET via VALS. Suppress the use of MMX ++ instructions unless MMX_OK is true. */ ++ ++void ++ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals) ++{ ++ machine_mode mode = GET_MODE (target); ++ machine_mode inner_mode = GET_MODE_INNER (mode); ++ int n_elts = GET_MODE_NUNITS (mode); ++ int n_var = 0, one_var = -1; ++ bool all_same = true, all_const_zero = true; ++ int i; ++ rtx x; ++ ++ /* Handle first initialization from vector elts. */ ++ if (n_elts != XVECLEN (vals, 0)) ++ { ++ rtx subtarget = target; ++ x = XVECEXP (vals, 0, 0); ++ gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode); ++ if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts) ++ { ++ rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) }; ++ if (inner_mode == QImode || inner_mode == HImode) ++ { ++ unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode); ++ mode = mode_for_vector (SImode, n_bits / 4).require (); ++ inner_mode = mode_for_vector (SImode, n_bits / 8).require (); ++ ops[0] = gen_lowpart (inner_mode, ops[0]); ++ ops[1] = gen_lowpart (inner_mode, ops[1]); ++ subtarget = gen_reg_rtx (mode); ++ } ++ ix86_expand_vector_init_concat (mode, subtarget, ops, 2); ++ if (subtarget != target) ++ emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget)); ++ return; ++ } ++ gcc_unreachable (); ++ } ++ ++ for (i = 0; i < n_elts; ++i) ++ { ++ x = XVECEXP (vals, 0, i); ++ if (!(CONST_SCALAR_INT_P (x) ++ || CONST_DOUBLE_P (x) ++ || CONST_FIXED_P (x))) ++ n_var++, one_var = i; ++ else if (x != CONST0_RTX (inner_mode)) ++ all_const_zero = false; ++ if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0))) ++ all_same = false; ++ } ++ ++ /* Constants are best loaded from the constant pool. 
*/ ++ if (n_var == 0) ++ { ++ emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0))); ++ return; ++ } ++ ++ /* If all values are identical, broadcast the value. */ ++ if (all_same ++ && ix86_expand_vector_init_duplicate (mmx_ok, mode, target, ++ XVECEXP (vals, 0, 0))) ++ return; ++ ++ /* Values where only one field is non-constant are best loaded from ++ the pool and overwritten via move later. */ ++ if (n_var == 1) ++ { ++ if (all_const_zero ++ && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target, ++ XVECEXP (vals, 0, one_var), ++ one_var)) ++ return; ++ ++ if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var)) ++ return; ++ } ++ ++ ix86_expand_vector_init_general (mmx_ok, mode, target, vals); ++} ++ ++void ++ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt) ++{ ++ machine_mode mode = GET_MODE (target); ++ machine_mode inner_mode = GET_MODE_INNER (mode); ++ machine_mode half_mode; ++ bool use_vec_merge = false; ++ rtx tmp; ++ static rtx (*gen_extract[6][2]) (rtx, rtx) ++ = { ++ { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi }, ++ { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi }, ++ { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si }, ++ { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di }, ++ { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf }, ++ { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df } ++ }; ++ static rtx (*gen_insert[6][2]) (rtx, rtx, rtx) ++ = { ++ { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi }, ++ { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi }, ++ { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si }, ++ { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di }, ++ { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf }, ++ { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df } ++ }; ++ int i, j, n; ++ machine_mode mmode = VOIDmode; ++ rtx (*gen_blendm) (rtx, rtx, rtx, rtx); ++ ++ switch (mode) ++ { ++ case E_V2SFmode: ++ case E_V2SImode: ++ if (mmx_ok) ++ { ++ tmp = gen_reg_rtx (GET_MODE_INNER (mode)); ++ ix86_expand_vector_extract (true, tmp, target, 1 - elt); ++ if (elt == 0) ++ tmp = gen_rtx_VEC_CONCAT (mode, val, tmp); ++ else ++ tmp = gen_rtx_VEC_CONCAT (mode, tmp, val); ++ emit_insn (gen_rtx_SET (target, tmp)); ++ return; ++ } ++ break; ++ ++ case E_V2DImode: ++ use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT; ++ if (use_vec_merge) ++ break; ++ ++ tmp = gen_reg_rtx (GET_MODE_INNER (mode)); ++ ix86_expand_vector_extract (false, tmp, target, 1 - elt); ++ if (elt == 0) ++ tmp = gen_rtx_VEC_CONCAT (mode, val, tmp); ++ else ++ tmp = gen_rtx_VEC_CONCAT (mode, tmp, val); ++ emit_insn (gen_rtx_SET (target, tmp)); ++ return; ++ ++ case E_V2DFmode: ++ { ++ rtx op0, op1; ++ ++ /* For the two element vectors, we implement a VEC_CONCAT with ++ the extraction of the other element. 
*/ ++ ++ tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt))); ++ tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp); ++ ++ if (elt == 0) ++ op0 = val, op1 = tmp; ++ else ++ op0 = tmp, op1 = val; ++ ++ tmp = gen_rtx_VEC_CONCAT (mode, op0, op1); ++ emit_insn (gen_rtx_SET (target, tmp)); ++ } ++ return; ++ ++ case E_V4SFmode: ++ use_vec_merge = TARGET_SSE4_1; ++ if (use_vec_merge) ++ break; ++ ++ switch (elt) ++ { ++ case 0: ++ use_vec_merge = true; ++ break; ++ ++ case 1: ++ /* tmp = target = A B C D */ ++ tmp = copy_to_reg (target); ++ /* target = A A B B */ ++ emit_insn (gen_vec_interleave_lowv4sf (target, target, target)); ++ /* target = X A B B */ ++ ix86_expand_vector_set (false, target, val, 0); ++ /* target = A X C D */ ++ emit_insn (gen_sse_shufps_v4sf (target, target, tmp, ++ const1_rtx, const0_rtx, ++ GEN_INT (2+4), GEN_INT (3+4))); ++ return; ++ ++ case 2: ++ /* tmp = target = A B C D */ ++ tmp = copy_to_reg (target); ++ /* tmp = X B C D */ ++ ix86_expand_vector_set (false, tmp, val, 0); ++ /* target = A B X D */ ++ emit_insn (gen_sse_shufps_v4sf (target, target, tmp, ++ const0_rtx, const1_rtx, ++ GEN_INT (0+4), GEN_INT (3+4))); ++ return; ++ ++ case 3: ++ /* tmp = target = A B C D */ ++ tmp = copy_to_reg (target); ++ /* tmp = X B C D */ ++ ix86_expand_vector_set (false, tmp, val, 0); ++ /* target = A B X D */ ++ emit_insn (gen_sse_shufps_v4sf (target, target, tmp, ++ const0_rtx, const1_rtx, ++ GEN_INT (2+4), GEN_INT (0+4))); ++ return; ++ ++ default: ++ gcc_unreachable (); ++ } ++ break; ++ ++ case E_V4SImode: ++ use_vec_merge = TARGET_SSE4_1; ++ if (use_vec_merge) ++ break; ++ ++ /* Element 0 handled by vec_merge below. */ ++ if (elt == 0) ++ { ++ use_vec_merge = true; ++ break; ++ } ++ ++ if (TARGET_SSE2) ++ { ++ /* With SSE2, use integer shuffles to swap element 0 and ELT, ++ store into element 0, then shuffle them back. */ ++ ++ rtx order[4]; ++ ++ order[0] = GEN_INT (elt); ++ order[1] = const1_rtx; ++ order[2] = const2_rtx; ++ order[3] = GEN_INT (3); ++ order[elt] = const0_rtx; ++ ++ emit_insn (gen_sse2_pshufd_1 (target, target, order[0], ++ order[1], order[2], order[3])); ++ ++ ix86_expand_vector_set (false, target, val, 0); ++ ++ emit_insn (gen_sse2_pshufd_1 (target, target, order[0], ++ order[1], order[2], order[3])); ++ } ++ else ++ { ++ /* For SSE1, we have to reuse the V4SF code. */ ++ rtx t = gen_reg_rtx (V4SFmode); ++ emit_move_insn (t, gen_lowpart (V4SFmode, target)); ++ ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt); ++ emit_move_insn (target, gen_lowpart (mode, t)); ++ } ++ return; ++ ++ case E_V8HImode: ++ use_vec_merge = TARGET_SSE2; ++ break; ++ case E_V4HImode: ++ use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A); ++ break; ++ ++ case E_V16QImode: ++ use_vec_merge = TARGET_SSE4_1; ++ break; ++ ++ case E_V8QImode: ++ break; ++ ++ case E_V32QImode: ++ half_mode = V16QImode; ++ j = 0; ++ n = 16; ++ goto half; ++ ++ case E_V16HImode: ++ half_mode = V8HImode; ++ j = 1; ++ n = 8; ++ goto half; ++ ++ case E_V8SImode: ++ half_mode = V4SImode; ++ j = 2; ++ n = 4; ++ goto half; ++ ++ case E_V4DImode: ++ half_mode = V2DImode; ++ j = 3; ++ n = 2; ++ goto half; ++ ++ case E_V8SFmode: ++ half_mode = V4SFmode; ++ j = 4; ++ n = 4; ++ goto half; ++ ++ case E_V4DFmode: ++ half_mode = V2DFmode; ++ j = 5; ++ n = 2; ++ goto half; ++ ++half: ++ /* Compute offset. */ ++ i = elt / n; ++ elt %= n; ++ ++ gcc_assert (i <= 1); ++ ++ /* Extract the half. 
*/ ++ tmp = gen_reg_rtx (half_mode); ++ emit_insn (gen_extract[j][i] (tmp, target)); ++ ++ /* Put val in tmp at elt. */ ++ ix86_expand_vector_set (false, tmp, val, elt); ++ ++ /* Put it back. */ ++ emit_insn (gen_insert[j][i] (target, target, tmp)); ++ return; ++ ++ case E_V8DFmode: ++ if (TARGET_AVX512F) ++ { ++ mmode = QImode; ++ gen_blendm = gen_avx512f_blendmv8df; ++ } ++ break; ++ ++ case E_V8DImode: ++ if (TARGET_AVX512F) ++ { ++ mmode = QImode; ++ gen_blendm = gen_avx512f_blendmv8di; ++ } ++ break; ++ ++ case E_V16SFmode: ++ if (TARGET_AVX512F) ++ { ++ mmode = HImode; ++ gen_blendm = gen_avx512f_blendmv16sf; ++ } ++ break; ++ ++ case E_V16SImode: ++ if (TARGET_AVX512F) ++ { ++ mmode = HImode; ++ gen_blendm = gen_avx512f_blendmv16si; ++ } ++ break; ++ ++ case E_V32HImode: ++ if (TARGET_AVX512BW) ++ { ++ mmode = SImode; ++ gen_blendm = gen_avx512bw_blendmv32hi; ++ } ++ else if (TARGET_AVX512F) ++ { ++ half_mode = E_V8HImode; ++ n = 8; ++ goto quarter; ++ } ++ break; ++ ++ case E_V64QImode: ++ if (TARGET_AVX512BW) ++ { ++ mmode = DImode; ++ gen_blendm = gen_avx512bw_blendmv64qi; ++ } ++ else if (TARGET_AVX512F) ++ { ++ half_mode = E_V16QImode; ++ n = 16; ++ goto quarter; ++ } ++ break; ++ ++quarter: ++ /* Compute offset. */ ++ i = elt / n; ++ elt %= n; ++ ++ gcc_assert (i <= 3); ++ ++ { ++ /* Extract the quarter. */ ++ tmp = gen_reg_rtx (V4SImode); ++ rtx tmp2 = gen_lowpart (V16SImode, target); ++ rtx mask = gen_reg_rtx (QImode); ++ ++ emit_move_insn (mask, constm1_rtx); ++ emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i), ++ tmp, mask)); ++ ++ tmp2 = gen_reg_rtx (half_mode); ++ emit_move_insn (tmp2, gen_lowpart (half_mode, tmp)); ++ tmp = tmp2; ++ ++ /* Put val in tmp at elt. */ ++ ix86_expand_vector_set (false, tmp, val, elt); ++ ++ /* Put it back. */ ++ tmp2 = gen_reg_rtx (V16SImode); ++ rtx tmp3 = gen_lowpart (V16SImode, target); ++ mask = gen_reg_rtx (HImode); ++ emit_move_insn (mask, constm1_rtx); ++ tmp = gen_lowpart (V4SImode, tmp); ++ emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i), ++ tmp3, mask)); ++ emit_move_insn (target, gen_lowpart (mode, tmp2)); ++ } ++ return; ++ ++ default: ++ break; ++ } ++ ++ if (mmode != VOIDmode) ++ { ++ tmp = gen_reg_rtx (mode); ++ emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val))); ++ /* The avx512*_blendm expanders have different operand order ++ from VEC_MERGE. In VEC_MERGE, the first input operand is used for ++ elements where the mask is set and second input operand otherwise, ++ in {sse,avx}*_*blend* the first input operand is used for elements ++ where the mask is clear and second input operand otherwise. 
*/ ++ emit_insn (gen_blendm (target, target, tmp, ++ force_reg (mmode, ++ gen_int_mode (HOST_WIDE_INT_1U << elt, ++ mmode)))); ++ } ++ else if (use_vec_merge) ++ { ++ tmp = gen_rtx_VEC_DUPLICATE (mode, val); ++ tmp = gen_rtx_VEC_MERGE (mode, tmp, target, ++ GEN_INT (HOST_WIDE_INT_1U << elt)); ++ emit_insn (gen_rtx_SET (target, tmp)); ++ } ++ else ++ { ++ rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode)); ++ ++ emit_move_insn (mem, target); ++ ++ tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode)); ++ emit_move_insn (tmp, val); ++ ++ emit_move_insn (target, mem); ++ } ++} ++ ++void ++ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt) ++{ ++ machine_mode mode = GET_MODE (vec); ++ machine_mode inner_mode = GET_MODE_INNER (mode); ++ bool use_vec_extr = false; ++ rtx tmp; ++ ++ switch (mode) ++ { ++ case E_V2SImode: ++ case E_V2SFmode: ++ if (!mmx_ok) ++ break; ++ /* FALLTHRU */ ++ ++ case E_V2DFmode: ++ case E_V2DImode: ++ case E_V2TImode: ++ case E_V4TImode: ++ use_vec_extr = true; ++ break; ++ ++ case E_V4SFmode: ++ use_vec_extr = TARGET_SSE4_1; ++ if (use_vec_extr) ++ break; ++ ++ switch (elt) ++ { ++ case 0: ++ tmp = vec; ++ break; ++ ++ case 1: ++ case 3: ++ tmp = gen_reg_rtx (mode); ++ emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec, ++ GEN_INT (elt), GEN_INT (elt), ++ GEN_INT (elt+4), GEN_INT (elt+4))); ++ break; ++ ++ case 2: ++ tmp = gen_reg_rtx (mode); ++ emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec)); ++ break; ++ ++ default: ++ gcc_unreachable (); ++ } ++ vec = tmp; ++ use_vec_extr = true; ++ elt = 0; ++ break; ++ ++ case E_V4SImode: ++ use_vec_extr = TARGET_SSE4_1; ++ if (use_vec_extr) ++ break; ++ ++ if (TARGET_SSE2) ++ { ++ switch (elt) ++ { ++ case 0: ++ tmp = vec; ++ break; ++ ++ case 1: ++ case 3: ++ tmp = gen_reg_rtx (mode); ++ emit_insn (gen_sse2_pshufd_1 (tmp, vec, ++ GEN_INT (elt), GEN_INT (elt), ++ GEN_INT (elt), GEN_INT (elt))); ++ break; ++ ++ case 2: ++ tmp = gen_reg_rtx (mode); ++ emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec)); ++ break; ++ ++ default: ++ gcc_unreachable (); ++ } ++ vec = tmp; ++ use_vec_extr = true; ++ elt = 0; ++ } ++ else ++ { ++ /* For SSE1, we have to reuse the V4SF code. 
*/ ++ ix86_expand_vector_extract (false, gen_lowpart (SFmode, target), ++ gen_lowpart (V4SFmode, vec), elt); ++ return; ++ } ++ break; ++ ++ case E_V8HImode: ++ use_vec_extr = TARGET_SSE2; ++ break; ++ case E_V4HImode: ++ use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A); ++ break; ++ ++ case E_V16QImode: ++ use_vec_extr = TARGET_SSE4_1; ++ break; ++ ++ case E_V8SFmode: ++ if (TARGET_AVX) ++ { ++ tmp = gen_reg_rtx (V4SFmode); ++ if (elt < 4) ++ emit_insn (gen_vec_extract_lo_v8sf (tmp, vec)); ++ else ++ emit_insn (gen_vec_extract_hi_v8sf (tmp, vec)); ++ ix86_expand_vector_extract (false, target, tmp, elt & 3); ++ return; ++ } ++ break; ++ ++ case E_V4DFmode: ++ if (TARGET_AVX) ++ { ++ tmp = gen_reg_rtx (V2DFmode); ++ if (elt < 2) ++ emit_insn (gen_vec_extract_lo_v4df (tmp, vec)); ++ else ++ emit_insn (gen_vec_extract_hi_v4df (tmp, vec)); ++ ix86_expand_vector_extract (false, target, tmp, elt & 1); ++ return; ++ } ++ break; ++ ++ case E_V32QImode: ++ if (TARGET_AVX) ++ { ++ tmp = gen_reg_rtx (V16QImode); ++ if (elt < 16) ++ emit_insn (gen_vec_extract_lo_v32qi (tmp, vec)); ++ else ++ emit_insn (gen_vec_extract_hi_v32qi (tmp, vec)); ++ ix86_expand_vector_extract (false, target, tmp, elt & 15); ++ return; ++ } ++ break; ++ ++ case E_V16HImode: ++ if (TARGET_AVX) ++ { ++ tmp = gen_reg_rtx (V8HImode); ++ if (elt < 8) ++ emit_insn (gen_vec_extract_lo_v16hi (tmp, vec)); ++ else ++ emit_insn (gen_vec_extract_hi_v16hi (tmp, vec)); ++ ix86_expand_vector_extract (false, target, tmp, elt & 7); ++ return; ++ } ++ break; ++ ++ case E_V8SImode: ++ if (TARGET_AVX) ++ { ++ tmp = gen_reg_rtx (V4SImode); ++ if (elt < 4) ++ emit_insn (gen_vec_extract_lo_v8si (tmp, vec)); ++ else ++ emit_insn (gen_vec_extract_hi_v8si (tmp, vec)); ++ ix86_expand_vector_extract (false, target, tmp, elt & 3); ++ return; ++ } ++ break; ++ ++ case E_V4DImode: ++ if (TARGET_AVX) ++ { ++ tmp = gen_reg_rtx (V2DImode); ++ if (elt < 2) ++ emit_insn (gen_vec_extract_lo_v4di (tmp, vec)); ++ else ++ emit_insn (gen_vec_extract_hi_v4di (tmp, vec)); ++ ix86_expand_vector_extract (false, target, tmp, elt & 1); ++ return; ++ } ++ break; ++ ++ case E_V32HImode: ++ if (TARGET_AVX512BW) ++ { ++ tmp = gen_reg_rtx (V16HImode); ++ if (elt < 16) ++ emit_insn (gen_vec_extract_lo_v32hi (tmp, vec)); ++ else ++ emit_insn (gen_vec_extract_hi_v32hi (tmp, vec)); ++ ix86_expand_vector_extract (false, target, tmp, elt & 15); ++ return; ++ } ++ break; ++ ++ case E_V64QImode: ++ if (TARGET_AVX512BW) ++ { ++ tmp = gen_reg_rtx (V32QImode); ++ if (elt < 32) ++ emit_insn (gen_vec_extract_lo_v64qi (tmp, vec)); ++ else ++ emit_insn (gen_vec_extract_hi_v64qi (tmp, vec)); ++ ix86_expand_vector_extract (false, target, tmp, elt & 31); ++ return; ++ } ++ break; ++ ++ case E_V16SFmode: ++ tmp = gen_reg_rtx (V8SFmode); ++ if (elt < 8) ++ emit_insn (gen_vec_extract_lo_v16sf (tmp, vec)); ++ else ++ emit_insn (gen_vec_extract_hi_v16sf (tmp, vec)); ++ ix86_expand_vector_extract (false, target, tmp, elt & 7); ++ return; ++ ++ case E_V8DFmode: ++ tmp = gen_reg_rtx (V4DFmode); ++ if (elt < 4) ++ emit_insn (gen_vec_extract_lo_v8df (tmp, vec)); ++ else ++ emit_insn (gen_vec_extract_hi_v8df (tmp, vec)); ++ ix86_expand_vector_extract (false, target, tmp, elt & 3); ++ return; ++ ++ case E_V16SImode: ++ tmp = gen_reg_rtx (V8SImode); ++ if (elt < 8) ++ emit_insn (gen_vec_extract_lo_v16si (tmp, vec)); ++ else ++ emit_insn (gen_vec_extract_hi_v16si (tmp, vec)); ++ ix86_expand_vector_extract (false, target, tmp, elt & 7); ++ return; ++ ++ case E_V8DImode: ++ tmp = gen_reg_rtx (V4DImode); 
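/* Illustrative sketch, not part of this patch: the AVX/AVX-512 extract
   cases around this point all use the same recursive-halving scheme --
   extract the low or high half of the vector, then look the element up
   in that half with the index reduced modulo the half width (elt & 3,
   elt & 7, ...).  A scalar model of the idea, with made-up names:  */

static double
model_extract (const double *vec, int nelts, int elt)
{
  if (nelts == 1)
    return vec[0];
  /* Choose the half containing ELT, mirroring the
     gen_vec_extract_lo/hi selection, then recurse into it.  */
  const double *half = (elt < nelts / 2) ? vec : vec + nelts / 2;
  return model_extract (half, nelts / 2, elt & (nelts / 2 - 1));
}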
++ if (elt < 4) ++ emit_insn (gen_vec_extract_lo_v8di (tmp, vec)); ++ else ++ emit_insn (gen_vec_extract_hi_v8di (tmp, vec)); ++ ix86_expand_vector_extract (false, target, tmp, elt & 3); ++ return; ++ ++ case E_V8QImode: ++ /* ??? Could extract the appropriate HImode element and shift. */ ++ default: ++ break; ++ } ++ ++ if (use_vec_extr) ++ { ++ tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt))); ++ tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp); ++ ++ /* Let the rtl optimizers know about the zero extension performed. */ ++ if (inner_mode == QImode || inner_mode == HImode) ++ { ++ tmp = gen_rtx_ZERO_EXTEND (SImode, tmp); ++ target = gen_lowpart (SImode, target); ++ } ++ ++ emit_insn (gen_rtx_SET (target, tmp)); ++ } ++ else ++ { ++ rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode)); ++ ++ emit_move_insn (mem, vec); ++ ++ tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode)); ++ emit_move_insn (target, tmp); ++ } ++} ++ ++/* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC ++ to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode. ++ The upper bits of DEST are undefined, though they shouldn't cause ++ exceptions (some bits from src or all zeros are ok). */ ++ ++static void ++emit_reduc_half (rtx dest, rtx src, int i) ++{ ++ rtx tem, d = dest; ++ switch (GET_MODE (src)) ++ { ++ case E_V4SFmode: ++ if (i == 128) ++ tem = gen_sse_movhlps (dest, src, src); ++ else ++ tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx, ++ GEN_INT (1 + 4), GEN_INT (1 + 4)); ++ break; ++ case E_V2DFmode: ++ tem = gen_vec_interleave_highv2df (dest, src, src); ++ break; ++ case E_V16QImode: ++ case E_V8HImode: ++ case E_V4SImode: ++ case E_V2DImode: ++ d = gen_reg_rtx (V1TImode); ++ tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src), ++ GEN_INT (i / 2)); ++ break; ++ case E_V8SFmode: ++ if (i == 256) ++ tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx); ++ else ++ tem = gen_avx_shufps256 (dest, src, src, ++ GEN_INT (i == 128 ? 2 + (3 << 2) : 1)); ++ break; ++ case E_V4DFmode: ++ if (i == 256) ++ tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx); ++ else ++ tem = gen_avx_shufpd256 (dest, src, src, const1_rtx); ++ break; ++ case E_V32QImode: ++ case E_V16HImode: ++ case E_V8SImode: ++ case E_V4DImode: ++ if (i == 256) ++ { ++ if (GET_MODE (dest) != V4DImode) ++ d = gen_reg_rtx (V4DImode); ++ tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src), ++ gen_lowpart (V4DImode, src), ++ const1_rtx); ++ } ++ else ++ { ++ d = gen_reg_rtx (V2TImode); ++ tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src), ++ GEN_INT (i / 2)); ++ } ++ break; ++ case E_V64QImode: ++ case E_V32HImode: ++ case E_V16SImode: ++ case E_V16SFmode: ++ case E_V8DImode: ++ case E_V8DFmode: ++ if (i > 128) ++ tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest), ++ gen_lowpart (V16SImode, src), ++ gen_lowpart (V16SImode, src), ++ GEN_INT (0x4 + (i == 512 ? 4 : 0)), ++ GEN_INT (0x5 + (i == 512 ? 4 : 0)), ++ GEN_INT (0x6 + (i == 512 ? 4 : 0)), ++ GEN_INT (0x7 + (i == 512 ? 4 : 0)), ++ GEN_INT (0xC), GEN_INT (0xD), ++ GEN_INT (0xE), GEN_INT (0xF), ++ GEN_INT (0x10), GEN_INT (0x11), ++ GEN_INT (0x12), GEN_INT (0x13), ++ GEN_INT (0x14), GEN_INT (0x15), ++ GEN_INT (0x16), GEN_INT (0x17)); ++ else ++ tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest), ++ gen_lowpart (V16SImode, src), ++ GEN_INT (i == 128 ? 0x2 : 0x1), ++ GEN_INT (0x3), ++ GEN_INT (0x3), ++ GEN_INT (0x3), ++ GEN_INT (i == 128 ? 
0x6 : 0x5), ++ GEN_INT (0x7), ++ GEN_INT (0x7), ++ GEN_INT (0x7), ++ GEN_INT (i == 128 ? 0xA : 0x9), ++ GEN_INT (0xB), ++ GEN_INT (0xB), ++ GEN_INT (0xB), ++ GEN_INT (i == 128 ? 0xE : 0xD), ++ GEN_INT (0xF), ++ GEN_INT (0xF), ++ GEN_INT (0xF)); ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ emit_insn (tem); ++ if (d != dest) ++ emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d)); ++} ++ ++/* Expand a vector reduction. FN is the binary pattern to reduce; ++ DEST is the destination; IN is the input vector. */ ++ ++void ++ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in) ++{ ++ rtx half, dst, vec = in; ++ machine_mode mode = GET_MODE (in); ++ int i; ++ ++ /* SSE4 has a special instruction for V8HImode UMIN reduction. */ ++ if (TARGET_SSE4_1 ++ && mode == V8HImode ++ && fn == gen_uminv8hi3) ++ { ++ emit_insn (gen_sse4_1_phminposuw (dest, in)); ++ return; ++ } ++ ++ for (i = GET_MODE_BITSIZE (mode); ++ i > GET_MODE_UNIT_BITSIZE (mode); ++ i >>= 1) ++ { ++ half = gen_reg_rtx (mode); ++ emit_reduc_half (half, vec, i); ++ if (i == GET_MODE_UNIT_BITSIZE (mode) * 2) ++ dst = dest; ++ else ++ dst = gen_reg_rtx (mode); ++ emit_insn (fn (dst, half, vec)); ++ vec = dst; ++ } ++} ++ ++/* Output code to perform a conditional jump to LABEL, if C2 flag in ++ FP status register is set. */ ++ ++void ++ix86_emit_fp_unordered_jump (rtx label) ++{ ++ rtx reg = gen_reg_rtx (HImode); ++ rtx_insn *insn; ++ rtx temp; ++ ++ emit_insn (gen_x86_fnstsw_1 (reg)); ++ ++ if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ())) ++ { ++ emit_insn (gen_x86_sahf_1 (reg)); ++ ++ temp = gen_rtx_REG (CCmode, FLAGS_REG); ++ temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx); ++ } ++ else ++ { ++ emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04))); ++ ++ temp = gen_rtx_REG (CCNOmode, FLAGS_REG); ++ temp = gen_rtx_NE (VOIDmode, temp, const0_rtx); ++ } ++ ++ temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp, ++ gen_rtx_LABEL_REF (VOIDmode, label), ++ pc_rtx); ++ insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp)); ++ predict_jump (REG_BR_PROB_BASE * 10 / 100); ++ JUMP_LABEL (insn) = label; ++} ++ ++/* Output code to perform an sinh XFmode calculation. 
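For context, ix86_expand_reduc above reduces a whole vector in log2(nelt) steps: each emit_reduc_half call moves the upper half of the still-live elements down to the bottom of a fresh register, and the caller folds it into the lower half with the supplied binary pattern (the lone shortcut is SSE4.1 phminposuw for a V8HImode unsigned-minimum reduction). A scalar sketch of the same halving idea, not part of the patch and assuming a power-of-two element count:

#include <stdio.h>

/* Scalar model of the halving reduction: fold the upper half of the
   live elements into the lower half until one element remains.
   N is assumed to be a power of two.  */
static int
reduce_max (int *v, int n)
{
  for (int width = n; width > 1; width /= 2)
    for (int j = 0; j < width / 2; j++)
      if (v[j + width / 2] > v[j])
        v[j] = v[j + width / 2];
  return v[0];
}

int
main (void)
{
  int v[8] = { 3, 9, 1, 7, 4, 8, 2, 6 };
  printf ("max = %d\n", reduce_max (v, 8));   /* prints: max = 9 */
  return 0;
}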
*/ ++ ++void ix86_emit_i387_sinh (rtx op0, rtx op1) ++{ ++ rtx e1 = gen_reg_rtx (XFmode); ++ rtx e2 = gen_reg_rtx (XFmode); ++ rtx scratch = gen_reg_rtx (HImode); ++ rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG); ++ rtx half = const_double_from_real_value (dconsthalf, XFmode); ++ rtx cst1, tmp; ++ rtx_code_label *jump_label = gen_label_rtx (); ++ rtx_insn *insn; ++ ++ /* scratch = fxam (op1) */ ++ emit_insn (gen_fxamxf2_i387 (scratch, op1)); ++ ++ /* e1 = expm1 (|op1|) */ ++ emit_insn (gen_absxf2 (e2, op1)); ++ emit_insn (gen_expm1xf2 (e1, e2)); ++ ++ /* e2 = e1 / (e1 + 1.0) + e1 */ ++ cst1 = force_reg (XFmode, CONST1_RTX (XFmode)); ++ emit_insn (gen_addxf3 (e2, e1, cst1)); ++ emit_insn (gen_divxf3 (e2, e1, e2)); ++ emit_insn (gen_addxf3 (e2, e2, e1)); ++ ++ /* flags = signbit (op1) */ ++ emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02))); ++ ++ /* if (flags) then e2 = -e2 */ ++ tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, ++ gen_rtx_EQ (VOIDmode, flags, const0_rtx), ++ gen_rtx_LABEL_REF (VOIDmode, jump_label), ++ pc_rtx); ++ insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); ++ predict_jump (REG_BR_PROB_BASE * 50 / 100); ++ JUMP_LABEL (insn) = jump_label; ++ ++ emit_insn (gen_negxf2 (e2, e2)); ++ ++ emit_label (jump_label); ++ LABEL_NUSES (jump_label) = 1; ++ ++ /* op0 = 0.5 * e2 */ ++ half = force_reg (XFmode, half); ++ emit_insn (gen_mulxf3 (op0, e2, half)); ++} ++ ++/* Output code to perform an cosh XFmode calculation. */ ++ ++void ix86_emit_i387_cosh (rtx op0, rtx op1) ++{ ++ rtx e1 = gen_reg_rtx (XFmode); ++ rtx e2 = gen_reg_rtx (XFmode); ++ rtx half = const_double_from_real_value (dconsthalf, XFmode); ++ rtx cst1; ++ ++ /* e1 = exp (op1) */ ++ emit_insn (gen_expxf2 (e1, op1)); ++ ++ /* e2 = e1 + 1.0 / e1 */ ++ cst1 = force_reg (XFmode, CONST1_RTX (XFmode)); ++ emit_insn (gen_divxf3 (e2, cst1, e1)); ++ emit_insn (gen_addxf3 (e2, e1, e2)); ++ ++ /* op0 = 0.5 * e2 */ ++ half = force_reg (XFmode, half); ++ emit_insn (gen_mulxf3 (op0, e2, half)); ++} ++ ++/* Output code to perform an tanh XFmode calculation. */ ++ ++void ix86_emit_i387_tanh (rtx op0, rtx op1) ++{ ++ rtx e1 = gen_reg_rtx (XFmode); ++ rtx e2 = gen_reg_rtx (XFmode); ++ rtx scratch = gen_reg_rtx (HImode); ++ rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG); ++ rtx cst2, tmp; ++ rtx_code_label *jump_label = gen_label_rtx (); ++ rtx_insn *insn; ++ ++ /* scratch = fxam (op1) */ ++ emit_insn (gen_fxamxf2_i387 (scratch, op1)); ++ ++ /* e1 = expm1 (-|2 * op1|) */ ++ emit_insn (gen_addxf3 (e2, op1, op1)); ++ emit_insn (gen_absxf2 (e2, e2)); ++ emit_insn (gen_negxf2 (e2, e2)); ++ emit_insn (gen_expm1xf2 (e1, e2)); ++ ++ /* e2 = e1 / (e1 + 2.0) */ ++ cst2 = force_reg (XFmode, CONST2_RTX (XFmode)); ++ emit_insn (gen_addxf3 (e2, e1, cst2)); ++ emit_insn (gen_divxf3 (e2, e1, e2)); ++ ++ /* flags = signbit (op1) */ ++ emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02))); ++ ++ /* if (!flags) then e2 = -e2 */ ++ tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, ++ gen_rtx_NE (VOIDmode, flags, const0_rtx), ++ gen_rtx_LABEL_REF (VOIDmode, jump_label), ++ pc_rtx); ++ insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); ++ predict_jump (REG_BR_PROB_BASE * 50 / 100); ++ JUMP_LABEL (insn) = jump_label; ++ ++ emit_insn (gen_negxf2 (e2, e2)); ++ ++ emit_label (jump_label); ++ LABEL_NUSES (jump_label) = 1; ++ ++ emit_move_insn (op0, e2); ++} ++ ++/* Output code to perform an asinh XFmode calculation. 
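The three i387 expanders above lean on standard identities: with e1 = expm1(|x|), e1/(e1+1) + e1 equals 2*sinh(|x|); cosh(x) is 0.5*(exp(x) + 1/exp(x)); and with e1 = expm1(-|2x|), e1/(e1+2) equals -tanh(|x|). The fxam/fnstsw test only restores the sign afterwards. A quick numerical check of those identities (an illustrative sketch, not part of the patch):

#include <math.h>
#include <stdio.h>

static double
sinh_via_expm1 (double x)
{
  double e1 = expm1 (fabs (x));
  double e2 = e1 / (e1 + 1.0) + e1;     /* == 2 * sinh (|x|) */
  return copysign (0.5 * e2, x);
}

static double
cosh_via_exp (double x)
{
  double e1 = exp (x);
  return 0.5 * (e1 + 1.0 / e1);
}

static double
tanh_via_expm1 (double x)
{
  double e1 = expm1 (-fabs (2.0 * x));
  double e2 = e1 / (e1 + 2.0);          /* == -tanh (|x|) */
  return x < 0.0 ? e2 : -e2;
}

int
main (void)
{
  double x = -0.73;
  printf ("sinh: %.17g vs %.17g\n", sinh_via_expm1 (x), sinh (x));
  printf ("cosh: %.17g vs %.17g\n", cosh_via_exp (x), cosh (x));
  printf ("tanh: %.17g vs %.17g\n", tanh_via_expm1 (x), tanh (x));
  return 0;
}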
*/ ++ ++void ix86_emit_i387_asinh (rtx op0, rtx op1) ++{ ++ rtx e1 = gen_reg_rtx (XFmode); ++ rtx e2 = gen_reg_rtx (XFmode); ++ rtx scratch = gen_reg_rtx (HImode); ++ rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG); ++ rtx cst1, tmp; ++ rtx_code_label *jump_label = gen_label_rtx (); ++ rtx_insn *insn; ++ ++ /* e2 = sqrt (op1^2 + 1.0) + 1.0 */ ++ emit_insn (gen_mulxf3 (e1, op1, op1)); ++ cst1 = force_reg (XFmode, CONST1_RTX (XFmode)); ++ emit_insn (gen_addxf3 (e2, e1, cst1)); ++ emit_insn (gen_sqrtxf2 (e2, e2)); ++ emit_insn (gen_addxf3 (e2, e2, cst1)); ++ ++ /* e1 = e1 / e2 */ ++ emit_insn (gen_divxf3 (e1, e1, e2)); ++ ++ /* scratch = fxam (op1) */ ++ emit_insn (gen_fxamxf2_i387 (scratch, op1)); ++ ++ /* e1 = e1 + |op1| */ ++ emit_insn (gen_absxf2 (e2, op1)); ++ emit_insn (gen_addxf3 (e1, e1, e2)); ++ ++ /* e2 = log1p (e1) */ ++ ix86_emit_i387_log1p (e2, e1); ++ ++ /* flags = signbit (op1) */ ++ emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02))); ++ ++ /* if (flags) then e2 = -e2 */ ++ tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, ++ gen_rtx_EQ (VOIDmode, flags, const0_rtx), ++ gen_rtx_LABEL_REF (VOIDmode, jump_label), ++ pc_rtx); ++ insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); ++ predict_jump (REG_BR_PROB_BASE * 50 / 100); ++ JUMP_LABEL (insn) = jump_label; ++ ++ emit_insn (gen_negxf2 (e2, e2)); ++ ++ emit_label (jump_label); ++ LABEL_NUSES (jump_label) = 1; ++ ++ emit_move_insn (op0, e2); ++} ++ ++/* Output code to perform an acosh XFmode calculation. */ ++ ++void ix86_emit_i387_acosh (rtx op0, rtx op1) ++{ ++ rtx e1 = gen_reg_rtx (XFmode); ++ rtx e2 = gen_reg_rtx (XFmode); ++ rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode)); ++ ++ /* e2 = sqrt (op1 + 1.0) */ ++ emit_insn (gen_addxf3 (e2, op1, cst1)); ++ emit_insn (gen_sqrtxf2 (e2, e2)); ++ ++ /* e1 = sqrt (op1 - 1.0) */ ++ emit_insn (gen_subxf3 (e1, op1, cst1)); ++ emit_insn (gen_sqrtxf2 (e1, e1)); ++ ++ /* e1 = e1 * e2 */ ++ emit_insn (gen_mulxf3 (e1, e1, e2)); ++ ++ /* e1 = e1 + op1 */ ++ emit_insn (gen_addxf3 (e1, e1, op1)); ++ ++ /* op0 = log (e1) */ ++ emit_insn (gen_logxf2 (op0, e1)); ++} ++ ++/* Output code to perform an atanh XFmode calculation. 
*/ ++ ++void ix86_emit_i387_atanh (rtx op0, rtx op1) ++{ ++ rtx e1 = gen_reg_rtx (XFmode); ++ rtx e2 = gen_reg_rtx (XFmode); ++ rtx scratch = gen_reg_rtx (HImode); ++ rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG); ++ rtx half = const_double_from_real_value (dconsthalf, XFmode); ++ rtx cst1, tmp; ++ rtx_code_label *jump_label = gen_label_rtx (); ++ rtx_insn *insn; ++ ++ /* scratch = fxam (op1) */ ++ emit_insn (gen_fxamxf2_i387 (scratch, op1)); ++ ++ /* e2 = |op1| */ ++ emit_insn (gen_absxf2 (e2, op1)); ++ ++ /* e1 = -(e2 + e2) / (e2 + 1.0) */ ++ cst1 = force_reg (XFmode, CONST1_RTX (XFmode)); ++ emit_insn (gen_addxf3 (e1, e2, cst1)); ++ emit_insn (gen_addxf3 (e2, e2, e2)); ++ emit_insn (gen_negxf2 (e2, e2)); ++ emit_insn (gen_divxf3 (e1, e2, e1)); ++ ++ /* e2 = log1p (e1) */ ++ ix86_emit_i387_log1p (e2, e1); ++ ++ /* flags = signbit (op1) */ ++ emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02))); ++ ++ /* if (!flags) then e2 = -e2 */ ++ tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, ++ gen_rtx_NE (VOIDmode, flags, const0_rtx), ++ gen_rtx_LABEL_REF (VOIDmode, jump_label), ++ pc_rtx); ++ insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); ++ predict_jump (REG_BR_PROB_BASE * 50 / 100); ++ JUMP_LABEL (insn) = jump_label; ++ ++ emit_insn (gen_negxf2 (e2, e2)); ++ ++ emit_label (jump_label); ++ LABEL_NUSES (jump_label) = 1; ++ ++ /* op0 = 0.5 * e2 */ ++ half = force_reg (XFmode, half); ++ emit_insn (gen_mulxf3 (op0, e2, half)); ++} ++ ++/* Output code to perform a log1p XFmode calculation. */ ++ ++void ix86_emit_i387_log1p (rtx op0, rtx op1) ++{ ++ rtx_code_label *label1 = gen_label_rtx (); ++ rtx_code_label *label2 = gen_label_rtx (); ++ ++ rtx tmp = gen_reg_rtx (XFmode); ++ rtx res = gen_reg_rtx (XFmode); ++ rtx cst, cstln2, cst1; ++ rtx_insn *insn; ++ ++ cst = const_double_from_real_value ++ (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode); ++ cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */ ++ ++ emit_insn (gen_absxf2 (tmp, op1)); ++ ++ cst = force_reg (XFmode, cst); ++ ix86_expand_branch (GE, tmp, cst, label1); ++ predict_jump (REG_BR_PROB_BASE * 10 / 100); ++ insn = get_last_insn (); ++ JUMP_LABEL (insn) = label1; ++ ++ emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2)); ++ emit_jump (label2); ++ ++ emit_label (label1); ++ LABEL_NUSES (label1) = 1; ++ ++ cst1 = force_reg (XFmode, CONST1_RTX (XFmode)); ++ emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1))); ++ emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2)); ++ ++ emit_label (label2); ++ LABEL_NUSES (label2) = 1; ++ ++ emit_move_insn (op0, res); ++} ++ ++/* Emit code for round calculation. 
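ix86_emit_i387_atanh above relies on log1p(-2|x|/(|x|+1)) = log((1-|x|)/(1+|x|)) = -2*atanh(|x|), restoring the sign from the fxam bit afterwards, and ix86_emit_i387_log1p only feeds fyl2xp1 when |arg| stays below 1 - sqrt(2)/2 (the 0.29289... constant), the range documented for that instruction; larger arguments go through fyl2x on 1 + arg instead. A numerical check of the identity (sketch only, assumes |x| < 1):

#include <math.h>
#include <stdio.h>

static double
atanh_via_log1p (double x)
{
  double a = fabs (x);
  double e1 = -(a + a) / (a + 1.0);
  double e2 = log1p (e1);               /* == -2 * atanh (|x|) */
  if (!signbit (x))
    e2 = -e2;
  return 0.5 * e2;
}

int
main (void)
{
  /* fyl2xp1 is only used below this threshold.  */
  printf ("1 - sqrt(2)/2 = %.17g\n", 1.0 - sqrt (2.0) / 2.0);

  double tests[] = { -0.9, -0.3, 0.2, 0.7 };
  for (unsigned i = 0; i < sizeof tests / sizeof *tests; i++)
    printf ("x = % .1f: %.17g vs %.17g\n", tests[i],
            atanh_via_log1p (tests[i]), atanh (tests[i]));
  return 0;
}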
*/ ++void ix86_emit_i387_round (rtx op0, rtx op1) ++{ ++ machine_mode inmode = GET_MODE (op1); ++ machine_mode outmode = GET_MODE (op0); ++ rtx e1 = gen_reg_rtx (XFmode); ++ rtx e2 = gen_reg_rtx (XFmode); ++ rtx scratch = gen_reg_rtx (HImode); ++ rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG); ++ rtx half = const_double_from_real_value (dconsthalf, XFmode); ++ rtx res = gen_reg_rtx (outmode); ++ rtx_code_label *jump_label = gen_label_rtx (); ++ rtx (*floor_insn) (rtx, rtx); ++ rtx (*neg_insn) (rtx, rtx); ++ rtx_insn *insn; ++ rtx tmp; ++ ++ switch (inmode) ++ { ++ case E_SFmode: ++ case E_DFmode: ++ tmp = gen_reg_rtx (XFmode); ++ ++ emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1))); ++ op1 = tmp; ++ break; ++ case E_XFmode: ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ ++ switch (outmode) ++ { ++ case E_SFmode: ++ floor_insn = gen_frndintxf2_floor; ++ neg_insn = gen_negsf2; ++ break; ++ case E_DFmode: ++ floor_insn = gen_frndintxf2_floor; ++ neg_insn = gen_negdf2; ++ break; ++ case E_XFmode: ++ floor_insn = gen_frndintxf2_floor; ++ neg_insn = gen_negxf2; ++ break; ++ case E_HImode: ++ floor_insn = gen_lfloorxfhi2; ++ neg_insn = gen_neghi2; ++ break; ++ case E_SImode: ++ floor_insn = gen_lfloorxfsi2; ++ neg_insn = gen_negsi2; ++ break; ++ case E_DImode: ++ floor_insn = gen_lfloorxfdi2; ++ neg_insn = gen_negdi2; ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ ++ /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */ ++ ++ /* scratch = fxam(op1) */ ++ emit_insn (gen_fxamxf2_i387 (scratch, op1)); ++ ++ /* e1 = fabs(op1) */ ++ emit_insn (gen_absxf2 (e1, op1)); ++ ++ /* e2 = e1 + 0.5 */ ++ half = force_reg (XFmode, half); ++ emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half))); ++ ++ /* res = floor(e2) */ ++ switch (outmode) ++ { ++ case E_SFmode: ++ case E_DFmode: ++ { ++ tmp = gen_reg_rtx (XFmode); ++ ++ emit_insn (floor_insn (tmp, e2)); ++ emit_insn (gen_rtx_SET (res, ++ gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp), ++ UNSPEC_TRUNC_NOOP))); ++ } ++ break; ++ default: ++ emit_insn (floor_insn (res, e2)); ++ } ++ ++ /* flags = signbit(a) */ ++ emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02))); ++ ++ /* if (flags) then res = -res */ ++ tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, ++ gen_rtx_EQ (VOIDmode, flags, const0_rtx), ++ gen_rtx_LABEL_REF (VOIDmode, jump_label), ++ pc_rtx); ++ insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); ++ predict_jump (REG_BR_PROB_BASE * 50 / 100); ++ JUMP_LABEL (insn) = jump_label; ++ ++ emit_insn (neg_insn (res, res)); ++ ++ emit_label (jump_label); ++ LABEL_NUSES (jump_label) = 1; ++ ++ emit_move_insn (op0, res); ++} ++ ++/* Output code to perform a Newton-Rhapson approximation of a single precision ++ floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. 
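ix86_emit_i387_round above follows the recipe in its comment, round(a) = sgn(a) * floor(fabs(a) + 0.5): the floor runs on the absolute value (frndintxf2_floor or the lfloorxf patterns, depending on the output mode), and the stored fxam sign bit decides whether the result is negated at the end. A scalar sketch of the same recipe, not part of the patch:

#include <math.h>
#include <stdio.h>

/* round (a) = sgn (a) * floor (fabs (a) + 0.5), as in ix86_emit_i387_round
   (sketch; the real expander reads the sign from the fxam status word).  */
static double
round_via_floor (double a)
{
  double r = floor (fabs (a) + 0.5);
  return signbit (a) ? -r : r;
}

int
main (void)
{
  double tests[] = { 2.5, -2.5, 0.49, -0.51, -0.0 };
  for (unsigned i = 0; i < sizeof tests / sizeof *tests; i++)
    printf ("% .2f -> % .1f (libm round: % .1f)\n",
            tests[i], round_via_floor (tests[i]), round (tests[i]));
  return 0;
}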
*/ ++ ++void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode) ++{ ++ rtx x0, x1, e0, e1; ++ ++ x0 = gen_reg_rtx (mode); ++ e0 = gen_reg_rtx (mode); ++ e1 = gen_reg_rtx (mode); ++ x1 = gen_reg_rtx (mode); ++ ++ /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */ ++ ++ b = force_reg (mode, b); ++ ++ /* x0 = rcp(b) estimate */ ++ if (mode == V16SFmode || mode == V8DFmode) ++ { ++ if (TARGET_AVX512ER) ++ { ++ emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b), ++ UNSPEC_RCP28))); ++ /* res = a * x0 */ ++ emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0))); ++ return; ++ } ++ else ++ emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b), ++ UNSPEC_RCP14))); ++ } ++ else ++ emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b), ++ UNSPEC_RCP))); ++ ++ /* e0 = x0 * b */ ++ emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b))); ++ ++ /* e0 = x0 * e0 */ ++ emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0))); ++ ++ /* e1 = x0 + x0 */ ++ emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0))); ++ ++ /* x1 = e1 - e0 */ ++ emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0))); ++ ++ /* res = a * x1 */ ++ emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1))); ++} ++ ++/* Output code to perform a Newton-Rhapson approximation of a ++ single precision floating point [reciprocal] square root. */ ++ ++void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip) ++{ ++ rtx x0, e0, e1, e2, e3, mthree, mhalf; ++ REAL_VALUE_TYPE r; ++ int unspec; ++ ++ x0 = gen_reg_rtx (mode); ++ e0 = gen_reg_rtx (mode); ++ e1 = gen_reg_rtx (mode); ++ e2 = gen_reg_rtx (mode); ++ e3 = gen_reg_rtx (mode); ++ ++ if (TARGET_AVX512ER && mode == V16SFmode) ++ { ++ if (recip) ++ /* res = rsqrt28(a) estimate */ ++ emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a), ++ UNSPEC_RSQRT28))); ++ else ++ { ++ /* x0 = rsqrt28(a) estimate */ ++ emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a), ++ UNSPEC_RSQRT28))); ++ /* res = rcp28(x0) estimate */ ++ emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0), ++ UNSPEC_RCP28))); ++ } ++ return; ++ } ++ ++ real_from_integer (&r, VOIDmode, -3, SIGNED); ++ mthree = const_double_from_real_value (r, SFmode); ++ ++ real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL); ++ mhalf = const_double_from_real_value (r, SFmode); ++ unspec = UNSPEC_RSQRT; ++ ++ if (VECTOR_MODE_P (mode)) ++ { ++ mthree = ix86_build_const_vector (mode, true, mthree); ++ mhalf = ix86_build_const_vector (mode, true, mhalf); ++ /* There is no 512-bit rsqrt. There is however rsqrt14. */ ++ if (GET_MODE_SIZE (mode) == 64) ++ unspec = UNSPEC_RSQRT14; ++ } ++ ++ /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) ++ rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */ ++ ++ a = force_reg (mode, a); ++ ++ /* x0 = rsqrt(a) estimate */ ++ emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a), ++ unspec))); ++ ++ /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0). */ ++ if (!recip) ++ { ++ rtx zero = force_reg (mode, CONST0_RTX(mode)); ++ rtx mask; ++ ++ /* Handle masked compare. */ ++ if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64) ++ { ++ mask = gen_reg_rtx (HImode); ++ /* Imm value 0x4 corresponds to not-equal comparison. 
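ix86_emit_swdivsf and ix86_emit_swsqrtsf above refine the rough rcp/rsqrt hardware estimates with a single Newton-Raphson step, which roughly squares the relative error: x1 = 2*x0 - b*x0*x0 improves x0 ~ 1/b, and x1 = 0.5*x0*(3 - a*x0*x0) improves x0 ~ 1/sqrt(a) (the comment above writes this as -0.5 and (... - 3.0), the same expression). The AVX-512ER rcp28/rsqrt28 paths return early because their estimates are already accurate enough. A sketch showing the quadratic error reduction, with a deliberately perturbed starting estimate standing in for the hardware value:

#include <math.h>
#include <stdio.h>

int
main (void)
{
  double b = 7.0, a = 7.0;

  /* Stand-ins for the low-precision rcp/rsqrt hardware estimates.  */
  double x0 = (1.0 / b) * (1.0 + 3e-4);
  double y0 = (1.0 / sqrt (a)) * (1.0 - 3e-4);

  /* One Newton-Raphson step each, as in the expanders above.  */
  double x1 = 2.0 * x0 - b * x0 * x0;           /* refines 1/b */
  double y1 = 0.5 * y0 * (3.0 - a * y0 * y0);   /* refines 1/sqrt(a) */

  printf ("1/b   relative error: %.1e -> %.1e\n",
          fabs (x0 * b - 1.0), fabs (x1 * b - 1.0));
  printf ("rsqrt relative error: %.1e -> %.1e\n",
          fabs (y0 * sqrt (a) - 1.0), fabs (y1 * sqrt (a) - 1.0));
  return 0;
}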
*/ ++ emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4))); ++ emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask)); ++ } ++ else ++ { ++ mask = gen_reg_rtx (mode); ++ emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a))); ++ emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask))); ++ } ++ } ++ ++ /* e0 = x0 * a */ ++ emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a))); ++ /* e1 = e0 * x0 */ ++ emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0))); ++ ++ /* e2 = e1 - 3. */ ++ mthree = force_reg (mode, mthree); ++ emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree))); ++ ++ mhalf = force_reg (mode, mhalf); ++ if (recip) ++ /* e3 = -.5 * x0 */ ++ emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf))); ++ else ++ /* e3 = -.5 * e0 */ ++ emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf))); ++ /* ret = e2 * e3 */ ++ emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3))); ++} ++ ++/* Expand fabs (OP0) and return a new rtx that holds the result. The ++ mask for masking out the sign-bit is stored in *SMASK, if that is ++ non-null. */ ++ ++static rtx ++ix86_expand_sse_fabs (rtx op0, rtx *smask) ++{ ++ machine_mode vmode, mode = GET_MODE (op0); ++ rtx xa, mask; ++ ++ xa = gen_reg_rtx (mode); ++ if (mode == SFmode) ++ vmode = V4SFmode; ++ else if (mode == DFmode) ++ vmode = V2DFmode; ++ else ++ vmode = mode; ++ mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true); ++ if (!VECTOR_MODE_P (mode)) ++ { ++ /* We need to generate a scalar mode mask in this case. */ ++ rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx)); ++ tmp = gen_rtx_VEC_SELECT (mode, mask, tmp); ++ mask = gen_reg_rtx (mode); ++ emit_insn (gen_rtx_SET (mask, tmp)); ++ } ++ emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask))); ++ ++ if (smask) ++ *smask = mask; ++ ++ return xa; ++} ++ ++/* Expands a comparison of OP0 with OP1 using comparison code CODE, ++ swapping the operands if SWAP_OPERANDS is true. The expanded ++ code is a forward jump to a newly created label in case the ++ comparison is true. The generated label rtx is returned. */ ++static rtx_code_label * ++ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1, ++ bool swap_operands) ++{ ++ bool unordered_compare = ix86_unordered_fp_compare (code); ++ rtx_code_label *label; ++ rtx tmp, reg; ++ ++ if (swap_operands) ++ std::swap (op0, op1); ++ ++ label = gen_label_rtx (); ++ tmp = gen_rtx_COMPARE (CCFPmode, op0, op1); ++ if (unordered_compare) ++ tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP); ++ reg = gen_rtx_REG (CCFPmode, FLAGS_REG); ++ emit_insn (gen_rtx_SET (reg, tmp)); ++ tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx); ++ tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, ++ gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx); ++ tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); ++ JUMP_LABEL (tmp) = label; ++ ++ return label; ++} ++ ++/* Expand a mask generating SSE comparison instruction comparing OP0 with OP1 ++ using comparison code CODE. Operands are swapped for the comparison if ++ SWAP_OPERANDS is true. Returns a rtx for the generated mask. */ ++static rtx ++ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1, ++ bool swap_operands) ++{ ++ rtx (*insn)(rtx, rtx, rtx, rtx); ++ machine_mode mode = GET_MODE (op0); ++ rtx mask = gen_reg_rtx (mode); ++ ++ if (swap_operands) ++ std::swap (op0, op1); ++ ++ insn = mode == DFmode ? 
gen_setcc_df_sse : gen_setcc_sf_sse; ++ ++ emit_insn (insn (mask, op0, op1, ++ gen_rtx_fmt_ee (code, mode, op0, op1))); ++ return mask; ++} ++ ++/* Expand copysign from SIGN to the positive value ABS_VALUE ++ storing in RESULT. If MASK is non-null, it shall be a mask to mask out ++ the sign-bit. */ ++ ++static void ++ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask) ++{ ++ machine_mode mode = GET_MODE (sign); ++ rtx sgn = gen_reg_rtx (mode); ++ if (mask == NULL_RTX) ++ { ++ machine_mode vmode; ++ ++ if (mode == SFmode) ++ vmode = V4SFmode; ++ else if (mode == DFmode) ++ vmode = V2DFmode; ++ else ++ vmode = mode; ++ ++ mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false); ++ if (!VECTOR_MODE_P (mode)) ++ { ++ /* We need to generate a scalar mode mask in this case. */ ++ rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx)); ++ tmp = gen_rtx_VEC_SELECT (mode, mask, tmp); ++ mask = gen_reg_rtx (mode); ++ emit_insn (gen_rtx_SET (mask, tmp)); ++ } ++ } ++ else ++ mask = gen_rtx_NOT (mode, mask); ++ emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign))); ++ emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn))); ++} ++ ++/* Expand SSE sequence for computing lround from OP1 storing ++ into OP0. */ ++ ++void ++ix86_expand_lround (rtx op0, rtx op1) ++{ ++ /* C code for the stuff we're doing below: ++ tmp = op1 + copysign (nextafter (0.5, 0.0), op1) ++ return (long)tmp; ++ */ ++ machine_mode mode = GET_MODE (op1); ++ const struct real_format *fmt; ++ REAL_VALUE_TYPE pred_half, half_minus_pred_half; ++ rtx adj; ++ ++ /* load nextafter (0.5, 0.0) */ ++ fmt = REAL_MODE_FORMAT (mode); ++ real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode); ++ real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half); ++ ++ /* adj = copysign (0.5, op1) */ ++ adj = force_reg (mode, const_double_from_real_value (pred_half, mode)); ++ ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX); ++ ++ /* adj = op1 + adj */ ++ adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT); ++ ++ /* op0 = (imode)adj */ ++ expand_fix (op0, adj, 0); ++} ++ ++/* Expand SSE2 sequence for computing lround from OPERAND1 storing ++ into OPERAND0. */ ++ ++void ++ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor) ++{ ++ /* C code for the stuff we're doing below (for do_floor): ++ xi = (long)op1; ++ xi -= (double)xi > op1 ? 1 : 0; ++ return xi; ++ */ ++ machine_mode fmode = GET_MODE (op1); ++ machine_mode imode = GET_MODE (op0); ++ rtx ireg, freg, tmp; ++ rtx_code_label *label; ++ ++ /* reg = (long)op1 */ ++ ireg = gen_reg_rtx (imode); ++ expand_fix (ireg, op1, 0); ++ ++ /* freg = (double)reg */ ++ freg = gen_reg_rtx (fmode); ++ expand_float (freg, ireg, 0); ++ ++ /* ireg = (freg > op1) ? ireg - 1 : ireg */ ++ label = ix86_expand_sse_compare_and_jump (UNLE, ++ freg, op1, !do_floor); ++ tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS, ++ ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT); ++ emit_move_insn (ireg, tmp); ++ ++ emit_label (label); ++ LABEL_NUSES (label) = 1; ++ ++ emit_move_insn (op0, ireg); ++} ++ ++/* Generate and return a rtx of mode MODE for 2**n where n is the number ++ of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */ ++ ++static rtx ++ix86_gen_TWO52 (machine_mode mode) ++{ ++ REAL_VALUE_TYPE TWO52r; ++ rtx TWO52; ++ ++ real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 
52 : 23); ++ TWO52 = const_double_from_real_value (TWO52r, mode); ++ TWO52 = force_reg (mode, TWO52); ++ ++ return TWO52; ++} ++ ++/* Expand rint rounding OPERAND1 and storing the result in OPERAND0. */ ++ ++void ++ix86_expand_rint (rtx operand0, rtx operand1) ++{ ++ /* C code for the stuff we're doing below: ++ xa = fabs (operand1); ++ if (!isless (xa, 2**52)) ++ return operand1; ++ two52 = 2**52; ++ if (flag_rounding_math) ++ { ++ two52 = copysign (two52, operand1); ++ xa = operand1; ++ } ++ xa = xa + two52 - two52; ++ return copysign (xa, operand1); ++ */ ++ machine_mode mode = GET_MODE (operand0); ++ rtx res, xa, TWO52, two52, mask; ++ rtx_code_label *label; ++ ++ res = gen_reg_rtx (mode); ++ emit_move_insn (res, operand1); ++ ++ /* xa = abs (operand1) */ ++ xa = ix86_expand_sse_fabs (res, &mask); ++ ++ /* if (!isless (xa, TWO52)) goto label; */ ++ TWO52 = ix86_gen_TWO52 (mode); ++ label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); ++ ++ two52 = TWO52; ++ if (flag_rounding_math) ++ { ++ two52 = gen_reg_rtx (mode); ++ ix86_sse_copysign_to_positive (two52, TWO52, res, mask); ++ xa = res; ++ } ++ ++ xa = expand_simple_binop (mode, PLUS, xa, two52, NULL_RTX, 0, OPTAB_DIRECT); ++ xa = expand_simple_binop (mode, MINUS, xa, two52, xa, 0, OPTAB_DIRECT); ++ ++ ix86_sse_copysign_to_positive (res, xa, res, mask); ++ ++ emit_label (label); ++ LABEL_NUSES (label) = 1; ++ ++ emit_move_insn (operand0, res); ++} ++ ++/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing ++ into OPERAND0. */ ++void ++ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor) ++{ ++ /* C code for the stuff we expand below. ++ double xa = fabs (x), x2; ++ if (!isless (xa, TWO52)) ++ return x; ++ xa = xa + TWO52 - TWO52; ++ x2 = copysign (xa, x); ++ Compensate. Floor: ++ if (x2 > x) ++ x2 -= 1; ++ Compensate. Ceil: ++ if (x2 < x) ++ x2 -= -1; ++ return x2; ++ */ ++ machine_mode mode = GET_MODE (operand0); ++ rtx xa, TWO52, tmp, one, res, mask; ++ rtx_code_label *label; ++ ++ TWO52 = ix86_gen_TWO52 (mode); ++ ++ /* Temporary for holding the result, initialized to the input ++ operand to ease control flow. */ ++ res = gen_reg_rtx (mode); ++ emit_move_insn (res, operand1); ++ ++ /* xa = abs (operand1) */ ++ xa = ix86_expand_sse_fabs (res, &mask); ++ ++ /* if (!isless (xa, TWO52)) goto label; */ ++ label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); ++ ++ /* xa = xa + TWO52 - TWO52; */ ++ xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT); ++ xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT); ++ ++ /* xa = copysign (xa, operand1) */ ++ ix86_sse_copysign_to_positive (xa, xa, res, mask); ++ ++ /* generate 1.0 or -1.0 */ ++ one = force_reg (mode, ++ const_double_from_real_value (do_floor ++ ? dconst1 : dconstm1, mode)); ++ ++ /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */ ++ tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor); ++ emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp))); ++ /* We always need to subtract here to preserve signed zero. */ ++ tmp = expand_simple_binop (mode, MINUS, ++ xa, tmp, NULL_RTX, 0, OPTAB_DIRECT); ++ emit_move_insn (res, tmp); ++ ++ emit_label (label); ++ LABEL_NUSES (label) = 1; ++ ++ emit_move_insn (operand0, res); ++} ++ ++/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing ++ into OPERAND0. */ ++void ++ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor) ++{ ++ /* C code for the stuff we expand below. 
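The SSE fallbacks in this block all build on one trick: for |x| < 2^52, computing (|x| + 2^52) - 2^52 in double precision pushes the fraction bits out of the significand, so the result is |x| rounded to an integer in the current rounding mode. The UNLE compare against TWO52 branches over the computation when |x| is already at least 2^52 (or NaN), since such values are necessarily integral, and the sign is put back with the copysign helper; the floor/ceil and round variants then compensate by comparing the rounded value with the original. A minimal sketch of the core identity, assuming round-to-nearest:

#include <math.h>
#include <stdio.h>

int
main (void)
{
  const double TWO52 = 4503599627370496.0;      /* 2^52 */
  double tests[] = { 2.3, 2.7, -1.2, -2.5, 1e10 + 0.49 };

  for (unsigned i = 0; i < sizeof tests / sizeof *tests; i++)
    {
      double x = tests[i];
      volatile double t = fabs (x) + TWO52;     /* volatile keeps the rounding */
      double r = copysign (t - TWO52, x);
      printf ("% 16.2f -> % .1f (libm rint: % .1f)\n", x, r, rint (x));
    }
  return 0;
}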
++ double xa = fabs (x), x2; ++ if (!isless (xa, TWO52)) ++ return x; ++ x2 = (double)(long)x; ++ Compensate. Floor: ++ if (x2 > x) ++ x2 -= 1; ++ Compensate. Ceil: ++ if (x2 < x) ++ x2 += 1; ++ if (HONOR_SIGNED_ZEROS (mode)) ++ return copysign (x2, x); ++ return x2; ++ */ ++ machine_mode mode = GET_MODE (operand0); ++ rtx xa, xi, TWO52, tmp, one, res, mask; ++ rtx_code_label *label; ++ ++ TWO52 = ix86_gen_TWO52 (mode); ++ ++ /* Temporary for holding the result, initialized to the input ++ operand to ease control flow. */ ++ res = gen_reg_rtx (mode); ++ emit_move_insn (res, operand1); ++ ++ /* xa = abs (operand1) */ ++ xa = ix86_expand_sse_fabs (res, &mask); ++ ++ /* if (!isless (xa, TWO52)) goto label; */ ++ label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); ++ ++ /* xa = (double)(long)x */ ++ xi = gen_reg_rtx (mode == DFmode ? DImode : SImode); ++ expand_fix (xi, res, 0); ++ expand_float (xa, xi, 0); ++ ++ /* generate 1.0 */ ++ one = force_reg (mode, const_double_from_real_value (dconst1, mode)); ++ ++ /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */ ++ tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor); ++ emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp))); ++ tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS, ++ xa, tmp, NULL_RTX, 0, OPTAB_DIRECT); ++ emit_move_insn (res, tmp); ++ ++ if (HONOR_SIGNED_ZEROS (mode)) ++ ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask); ++ ++ emit_label (label); ++ LABEL_NUSES (label) = 1; ++ ++ emit_move_insn (operand0, res); ++} ++ ++/* Expand SSE sequence for computing round from OPERAND1 storing ++ into OPERAND0. Sequence that works without relying on DImode truncation ++ via cvttsd2siq that is only available on 64bit targets. */ ++void ++ix86_expand_rounddf_32 (rtx operand0, rtx operand1) ++{ ++ /* C code for the stuff we expand below. ++ double xa = fabs (x), xa2, x2; ++ if (!isless (xa, TWO52)) ++ return x; ++ Using the absolute value and copying back sign makes ++ -0.0 -> -0.0 correct. ++ xa2 = xa + TWO52 - TWO52; ++ Compensate. ++ dxa = xa2 - xa; ++ if (dxa <= -0.5) ++ xa2 += 1; ++ else if (dxa > 0.5) ++ xa2 -= 1; ++ x2 = copysign (xa2, x); ++ return x2; ++ */ ++ machine_mode mode = GET_MODE (operand0); ++ rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask; ++ rtx_code_label *label; ++ ++ TWO52 = ix86_gen_TWO52 (mode); ++ ++ /* Temporary for holding the result, initialized to the input ++ operand to ease control flow. */ ++ res = gen_reg_rtx (mode); ++ emit_move_insn (res, operand1); ++ ++ /* xa = abs (operand1) */ ++ xa = ix86_expand_sse_fabs (res, &mask); ++ ++ /* if (!isless (xa, TWO52)) goto label; */ ++ label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); ++ ++ /* xa2 = xa + TWO52 - TWO52; */ ++ xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT); ++ xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT); ++ ++ /* dxa = xa2 - xa; */ ++ dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT); ++ ++ /* generate 0.5, 1.0 and -0.5 */ ++ half = force_reg (mode, const_double_from_real_value (dconsthalf, mode)); ++ one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT); ++ mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX, ++ 0, OPTAB_DIRECT); ++ ++ /* Compensate. */ ++ tmp = gen_reg_rtx (mode); ++ /* xa2 = xa2 - (dxa > 0.5 ? 
1 : 0) */ ++ tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false); ++ emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp))); ++ xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT); ++ /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */ ++ tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false); ++ emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp))); ++ xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT); ++ ++ /* res = copysign (xa2, operand1) */ ++ ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask); ++ ++ emit_label (label); ++ LABEL_NUSES (label) = 1; ++ ++ emit_move_insn (operand0, res); ++} ++ ++/* Expand SSE sequence for computing trunc from OPERAND1 storing ++ into OPERAND0. */ ++void ++ix86_expand_trunc (rtx operand0, rtx operand1) ++{ ++ /* C code for SSE variant we expand below. ++ double xa = fabs (x), x2; ++ if (!isless (xa, TWO52)) ++ return x; ++ x2 = (double)(long)x; ++ if (HONOR_SIGNED_ZEROS (mode)) ++ return copysign (x2, x); ++ return x2; ++ */ ++ machine_mode mode = GET_MODE (operand0); ++ rtx xa, xi, TWO52, res, mask; ++ rtx_code_label *label; ++ ++ TWO52 = ix86_gen_TWO52 (mode); ++ ++ /* Temporary for holding the result, initialized to the input ++ operand to ease control flow. */ ++ res = gen_reg_rtx (mode); ++ emit_move_insn (res, operand1); ++ ++ /* xa = abs (operand1) */ ++ xa = ix86_expand_sse_fabs (res, &mask); ++ ++ /* if (!isless (xa, TWO52)) goto label; */ ++ label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); ++ ++ /* x = (double)(long)x */ ++ xi = gen_reg_rtx (mode == DFmode ? DImode : SImode); ++ expand_fix (xi, res, 0); ++ expand_float (res, xi, 0); ++ ++ if (HONOR_SIGNED_ZEROS (mode)) ++ ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask); ++ ++ emit_label (label); ++ LABEL_NUSES (label) = 1; ++ ++ emit_move_insn (operand0, res); ++} ++ ++/* Expand SSE sequence for computing trunc from OPERAND1 storing ++ into OPERAND0. */ ++void ++ix86_expand_truncdf_32 (rtx operand0, rtx operand1) ++{ ++ machine_mode mode = GET_MODE (operand0); ++ rtx xa, mask, TWO52, one, res, smask, tmp; ++ rtx_code_label *label; ++ ++ /* C code for SSE variant we expand below. ++ double xa = fabs (x), x2; ++ if (!isless (xa, TWO52)) ++ return x; ++ xa2 = xa + TWO52 - TWO52; ++ Compensate: ++ if (xa2 > xa) ++ xa2 -= 1.0; ++ x2 = copysign (xa2, x); ++ return x2; ++ */ ++ ++ TWO52 = ix86_gen_TWO52 (mode); ++ ++ /* Temporary for holding the result, initialized to the input ++ operand to ease control flow. */ ++ res = gen_reg_rtx (mode); ++ emit_move_insn (res, operand1); ++ ++ /* xa = abs (operand1) */ ++ xa = ix86_expand_sse_fabs (res, &smask); ++ ++ /* if (!isless (xa, TWO52)) goto label; */ ++ label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); ++ ++ /* res = xa + TWO52 - TWO52; */ ++ tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT); ++ tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT); ++ emit_move_insn (res, tmp); ++ ++ /* generate 1.0 */ ++ one = force_reg (mode, const_double_from_real_value (dconst1, mode)); ++ ++ /* Compensate: res = xa2 - (res > xa ? 
1 : 0) */ ++ mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false); ++ emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one))); ++ tmp = expand_simple_binop (mode, MINUS, ++ res, mask, NULL_RTX, 0, OPTAB_DIRECT); ++ emit_move_insn (res, tmp); ++ ++ /* res = copysign (res, operand1) */ ++ ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask); ++ ++ emit_label (label); ++ LABEL_NUSES (label) = 1; ++ ++ emit_move_insn (operand0, res); ++} ++ ++/* Expand SSE sequence for computing round from OPERAND1 storing ++ into OPERAND0. */ ++void ++ix86_expand_round (rtx operand0, rtx operand1) ++{ ++ /* C code for the stuff we're doing below: ++ double xa = fabs (x); ++ if (!isless (xa, TWO52)) ++ return x; ++ xa = (double)(long)(xa + nextafter (0.5, 0.0)); ++ return copysign (xa, x); ++ */ ++ machine_mode mode = GET_MODE (operand0); ++ rtx res, TWO52, xa, xi, half, mask; ++ rtx_code_label *label; ++ const struct real_format *fmt; ++ REAL_VALUE_TYPE pred_half, half_minus_pred_half; ++ ++ /* Temporary for holding the result, initialized to the input ++ operand to ease control flow. */ ++ res = gen_reg_rtx (mode); ++ emit_move_insn (res, operand1); ++ ++ TWO52 = ix86_gen_TWO52 (mode); ++ xa = ix86_expand_sse_fabs (res, &mask); ++ label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); ++ ++ /* load nextafter (0.5, 0.0) */ ++ fmt = REAL_MODE_FORMAT (mode); ++ real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode); ++ real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half); ++ ++ /* xa = xa + 0.5 */ ++ half = force_reg (mode, const_double_from_real_value (pred_half, mode)); ++ xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT); ++ ++ /* xa = (double)(int64_t)xa */ ++ xi = gen_reg_rtx (mode == DFmode ? DImode : SImode); ++ expand_fix (xi, xa, 0); ++ expand_float (xa, xi, 0); ++ ++ /* res = copysign (xa, operand1) */ ++ ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask); ++ ++ emit_label (label); ++ LABEL_NUSES (label) = 1; ++ ++ emit_move_insn (operand0, res); ++} ++ ++/* Expand SSE sequence for computing round ++ from OP1 storing into OP0 using sse4 round insn. 
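ix86_expand_round above (and ix86_expand_round_sse4 just below) adds copysign(nextafter(0.5, 0.0), x) rather than a plain 0.5 before truncating. The reason: for the largest double just below 0.5, x + 0.5 rounds up to exactly 1.0, so trunc(x + 0.5) would give 1 where round(x) is 0; adding the predecessor of 0.5 keeps the sum strictly below 1. A small demonstration, not part of the patch:

#include <math.h>
#include <stdio.h>

int
main (void)
{
  /* The largest double strictly below 0.5, used both as the test input
     and as the adjustment the expanders actually load.  */
  double x = nextafter (0.5, 0.0);
  double pred_half = nextafter (0.5, 0.0);

  printf ("trunc (x + 0.5)       = %.1f\n", trunc (x + 0.5));        /* 1 - wrong */
  printf ("trunc (x + pred_half) = %.1f\n", trunc (x + pred_half));  /* 0 - right */
  printf ("round (x)             = %.1f\n", round (x));              /* 0 */
  return 0;
}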
*/ ++void ++ix86_expand_round_sse4 (rtx op0, rtx op1) ++{ ++ machine_mode mode = GET_MODE (op0); ++ rtx e1, e2, res, half; ++ const struct real_format *fmt; ++ REAL_VALUE_TYPE pred_half, half_minus_pred_half; ++ rtx (*gen_copysign) (rtx, rtx, rtx); ++ rtx (*gen_round) (rtx, rtx, rtx); ++ ++ switch (mode) ++ { ++ case E_SFmode: ++ gen_copysign = gen_copysignsf3; ++ gen_round = gen_sse4_1_roundsf2; ++ break; ++ case E_DFmode: ++ gen_copysign = gen_copysigndf3; ++ gen_round = gen_sse4_1_rounddf2; ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ ++ /* round (a) = trunc (a + copysign (0.5, a)) */ ++ ++ /* load nextafter (0.5, 0.0) */ ++ fmt = REAL_MODE_FORMAT (mode); ++ real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode); ++ real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half); ++ half = const_double_from_real_value (pred_half, mode); ++ ++ /* e1 = copysign (0.5, op1) */ ++ e1 = gen_reg_rtx (mode); ++ emit_insn (gen_copysign (e1, half, op1)); ++ ++ /* e2 = op1 + e1 */ ++ e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT); ++ ++ /* res = trunc (e2) */ ++ res = gen_reg_rtx (mode); ++ emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC))); ++ ++ emit_move_insn (op0, res); ++} ++ ++/* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel []))) ++ insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh ++ insn every time. */ ++ ++static GTY(()) rtx_insn *vselect_insn; ++ ++/* Initialize vselect_insn. */ ++ ++static void ++init_vselect_insn (void) ++{ ++ unsigned i; ++ rtx x; ++ ++ x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN)); ++ for (i = 0; i < MAX_VECT_LEN; ++i) ++ XVECEXP (x, 0, i) = const0_rtx; ++ x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx, ++ const0_rtx), x); ++ x = gen_rtx_SET (const0_rtx, x); ++ start_sequence (); ++ vselect_insn = emit_insn (x); ++ end_sequence (); ++} ++ ++/* Construct (set target (vec_select op0 (parallel perm))) and ++ return true if that's a valid instruction in the active ISA. */ ++ ++static bool ++expand_vselect (rtx target, rtx op0, const unsigned char *perm, ++ unsigned nelt, bool testing_p) ++{ ++ unsigned int i; ++ rtx x, save_vconcat; ++ int icode; ++ ++ if (vselect_insn == NULL_RTX) ++ init_vselect_insn (); ++ ++ x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1); ++ PUT_NUM_ELEM (XVEC (x, 0), nelt); ++ for (i = 0; i < nelt; ++i) ++ XVECEXP (x, 0, i) = GEN_INT (perm[i]); ++ save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0); ++ XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0; ++ PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target)); ++ SET_DEST (PATTERN (vselect_insn)) = target; ++ icode = recog_memoized (vselect_insn); ++ ++ if (icode >= 0 && !testing_p) ++ emit_insn (copy_rtx (PATTERN (vselect_insn))); ++ ++ SET_DEST (PATTERN (vselect_insn)) = const0_rtx; ++ XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat; ++ INSN_CODE (vselect_insn) = -1; ++ ++ return icode >= 0; ++} ++ ++/* Similar, but generate a vec_concat from op0 and op1 as well. 
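expand_vselect above builds (set target (vec_select op0 (parallel perm))) inside the single cached vselect_insn and asks recog_memoized whether some existing pattern in sse.md accepts it; expand_vselect_vconcat, which follows, wraps the two operands in a vec_concat first, so indices nelt and above select from op1. What that RTL computes is simply an indexed gather from the concatenation of the two operands; a scalar model (illustrative only):

#include <stdio.h>

/* Scalar model of (vec_select (vec_concat op0 op1) (parallel perm)):
   indices 0..nelt-1 pick from op0, indices nelt..2*nelt-1 from op1.  */
static void
vselect_vconcat (int *dst, const int *op0, const int *op1,
                 const unsigned char *perm, int nelt)
{
  for (int i = 0; i < nelt; i++)
    dst[i] = perm[i] < nelt ? op0[perm[i]] : op1[perm[i] - nelt];
}

int
main (void)
{
  int op0[4] = { 0, 1, 2, 3 }, op1[4] = { 40, 41, 42, 43 }, dst[4];
  const unsigned char perm[4] = { 2, 6, 3, 7 };   /* an interleave-high */

  vselect_vconcat (dst, op0, op1, perm, 4);
  for (int i = 0; i < 4; i++)
    printf ("%d ", dst[i]);                       /* prints: 2 42 3 43 */
  printf ("\n");
  return 0;
}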
*/ ++ ++static bool ++expand_vselect_vconcat (rtx target, rtx op0, rtx op1, ++ const unsigned char *perm, unsigned nelt, ++ bool testing_p) ++{ ++ machine_mode v2mode; ++ rtx x; ++ bool ok; ++ ++ if (vselect_insn == NULL_RTX) ++ init_vselect_insn (); ++ ++ if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode)) ++ return false; ++ x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0); ++ PUT_MODE (x, v2mode); ++ XEXP (x, 0) = op0; ++ XEXP (x, 1) = op1; ++ ok = expand_vselect (target, x, perm, nelt, testing_p); ++ XEXP (x, 0) = const0_rtx; ++ XEXP (x, 1) = const0_rtx; ++ return ok; ++} ++ ++/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D ++ using movss or movsd. */ ++static bool ++expand_vec_perm_movs (struct expand_vec_perm_d *d) ++{ ++ machine_mode vmode = d->vmode; ++ unsigned i, nelt = d->nelt; ++ rtx x; ++ ++ if (d->one_operand_p) ++ return false; ++ ++ if (!(TARGET_SSE && vmode == V4SFmode) ++ && !(TARGET_SSE2 && vmode == V2DFmode)) ++ return false; ++ ++ /* Only the first element is changed. */ ++ if (d->perm[0] != nelt && d->perm[0] != 0) ++ return false; ++ for (i = 1; i < nelt; ++i) ++ if (d->perm[i] != i + nelt - d->perm[0]) ++ return false; ++ ++ if (d->testing_p) ++ return true; ++ ++ if (d->perm[0] == nelt) ++ x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1)); ++ else ++ x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1)); ++ ++ emit_insn (gen_rtx_SET (d->target, x)); ++ ++ return true; ++} ++ ++/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D ++ in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */ ++ ++static bool ++expand_vec_perm_blend (struct expand_vec_perm_d *d) ++{ ++ machine_mode mmode, vmode = d->vmode; ++ unsigned i, nelt = d->nelt; ++ unsigned HOST_WIDE_INT mask; ++ rtx target, op0, op1, maskop, x; ++ rtx rperm[32], vperm; ++ ++ if (d->one_operand_p) ++ return false; ++ if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64 ++ && (TARGET_AVX512BW ++ || GET_MODE_UNIT_SIZE (vmode) >= 4)) ++ ; ++ else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32) ++ ; ++ else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode)) ++ ; ++ else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16) ++ ; ++ else ++ return false; ++ ++ /* This is a blend, not a permute. Elements must stay in their ++ respective lanes. */ ++ for (i = 0; i < nelt; ++i) ++ { ++ unsigned e = d->perm[i]; ++ if (!(e == i || e == i + nelt)) ++ return false; ++ } ++ ++ if (d->testing_p) ++ return true; ++ ++ /* ??? Without SSE4.1, we could implement this with and/andn/or. This ++ decision should be extracted elsewhere, so that we only try that ++ sequence once all budget==3 options have been tried. */ ++ target = d->target; ++ op0 = d->op0; ++ op1 = d->op1; ++ mask = 0; ++ ++ switch (vmode) ++ { ++ case E_V8DFmode: ++ case E_V16SFmode: ++ case E_V4DFmode: ++ case E_V8SFmode: ++ case E_V2DFmode: ++ case E_V4SFmode: ++ case E_V8HImode: ++ case E_V8SImode: ++ case E_V32HImode: ++ case E_V64QImode: ++ case E_V16SImode: ++ case E_V8DImode: ++ for (i = 0; i < nelt; ++i) ++ mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i; ++ break; ++ ++ case E_V2DImode: ++ for (i = 0; i < 2; ++i) ++ mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4); ++ vmode = V8HImode; ++ goto do_subreg; ++ ++ case E_V4SImode: ++ for (i = 0; i < 4; ++i) ++ mask |= (d->perm[i] >= 4 ? 
3 : 0) << (i * 2); ++ vmode = V8HImode; ++ goto do_subreg; ++ ++ case E_V16QImode: ++ /* See if bytes move in pairs so we can use pblendw with ++ an immediate argument, rather than pblendvb with a vector ++ argument. */ ++ for (i = 0; i < 16; i += 2) ++ if (d->perm[i] + 1 != d->perm[i + 1]) ++ { ++ use_pblendvb: ++ for (i = 0; i < nelt; ++i) ++ rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx); ++ ++ finish_pblendvb: ++ vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm)); ++ vperm = force_reg (vmode, vperm); ++ ++ if (GET_MODE_SIZE (vmode) == 16) ++ emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm)); ++ else ++ emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm)); ++ if (target != d->target) ++ emit_move_insn (d->target, gen_lowpart (d->vmode, target)); ++ return true; ++ } ++ ++ for (i = 0; i < 8; ++i) ++ mask |= (d->perm[i * 2] >= 16) << i; ++ vmode = V8HImode; ++ /* FALLTHRU */ ++ ++ do_subreg: ++ target = gen_reg_rtx (vmode); ++ op0 = gen_lowpart (vmode, op0); ++ op1 = gen_lowpart (vmode, op1); ++ break; ++ ++ case E_V32QImode: ++ /* See if bytes move in pairs. If not, vpblendvb must be used. */ ++ for (i = 0; i < 32; i += 2) ++ if (d->perm[i] + 1 != d->perm[i + 1]) ++ goto use_pblendvb; ++ /* See if bytes move in quadruplets. If yes, vpblendd ++ with immediate can be used. */ ++ for (i = 0; i < 32; i += 4) ++ if (d->perm[i] + 2 != d->perm[i + 2]) ++ break; ++ if (i < 32) ++ { ++ /* See if bytes move the same in both lanes. If yes, ++ vpblendw with immediate can be used. */ ++ for (i = 0; i < 16; i += 2) ++ if (d->perm[i] + 16 != d->perm[i + 16]) ++ goto use_pblendvb; ++ ++ /* Use vpblendw. */ ++ for (i = 0; i < 16; ++i) ++ mask |= (d->perm[i * 2] >= 32) << i; ++ vmode = V16HImode; ++ goto do_subreg; ++ } ++ ++ /* Use vpblendd. */ ++ for (i = 0; i < 8; ++i) ++ mask |= (d->perm[i * 4] >= 32) << i; ++ vmode = V8SImode; ++ goto do_subreg; ++ ++ case E_V16HImode: ++ /* See if words move in pairs. If yes, vpblendd can be used. */ ++ for (i = 0; i < 16; i += 2) ++ if (d->perm[i] + 1 != d->perm[i + 1]) ++ break; ++ if (i < 16) ++ { ++ /* See if words move the same in both lanes. If not, ++ vpblendvb must be used. */ ++ for (i = 0; i < 8; i++) ++ if (d->perm[i] + 8 != d->perm[i + 8]) ++ { ++ /* Use vpblendvb. */ ++ for (i = 0; i < 32; ++i) ++ rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx); ++ ++ vmode = V32QImode; ++ nelt = 32; ++ target = gen_reg_rtx (vmode); ++ op0 = gen_lowpart (vmode, op0); ++ op1 = gen_lowpart (vmode, op1); ++ goto finish_pblendvb; ++ } ++ ++ /* Use vpblendw. */ ++ for (i = 0; i < 16; ++i) ++ mask |= (d->perm[i] >= 16) << i; ++ break; ++ } ++ ++ /* Use vpblendd. */ ++ for (i = 0; i < 8; ++i) ++ mask |= (d->perm[i * 2] >= 16) << i; ++ vmode = V8SImode; ++ goto do_subreg; ++ ++ case E_V4DImode: ++ /* Use vpblendd. */ ++ for (i = 0; i < 4; ++i) ++ mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2); ++ vmode = V8SImode; ++ goto do_subreg; ++ ++ default: ++ gcc_unreachable (); ++ } ++ ++ switch (vmode) ++ { ++ case E_V8DFmode: ++ case E_V8DImode: ++ mmode = QImode; ++ break; ++ case E_V16SFmode: ++ case E_V16SImode: ++ mmode = HImode; ++ break; ++ case E_V32HImode: ++ mmode = SImode; ++ break; ++ case E_V64QImode: ++ mmode = DImode; ++ break; ++ default: ++ mmode = VOIDmode; ++ } ++ ++ if (mmode != VOIDmode) ++ maskop = force_reg (mmode, gen_int_mode (mask, mmode)); ++ else ++ maskop = GEN_INT (mask); ++ ++ /* This matches five different patterns with the different modes. 
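expand_vec_perm_blend above encodes a lane-preserving two-source permutation (every perm[i] is either i or i + nelt) as an immediate bit mask with bit i set when element i comes from the second operand. For byte and word vectors it first checks whether elements move in pairs or quadruplets so the immediate pblendw/vpblendd forms can be used; only when that fails does it fall back to pblendvb/vpblendvb with a constant vector mask. A scalar model of the immediate-mask blend (sketch only):

#include <stdio.h>

/* Scalar model of an immediate-mask blend: bit i of MASK selects
   op1[i] over op0[i], mirroring how expand_vec_perm_blend builds
   the pblend* immediate from d->perm.  */
static void
blend (int *dst, const int *op0, const int *op1, unsigned mask, int nelt)
{
  for (int i = 0; i < nelt; i++)
    dst[i] = (mask >> i) & 1 ? op1[i] : op0[i];
}

int
main (void)
{
  int op0[4] = { 0, 1, 2, 3 }, op1[4] = { 40, 41, 42, 43 }, dst[4];
  /* perm = { 0, 5, 2, 7 }: elements 1 and 3 come from op1 -> mask 0b1010.  */
  unsigned perm[4] = { 0, 5, 2, 7 }, mask = 0;
  for (int i = 0; i < 4; i++)
    mask |= (perm[i] >= 4) << i;

  blend (dst, op0, op1, mask, 4);
  for (int i = 0; i < 4; i++)
    printf ("%d ", dst[i]);              /* prints: 0 41 2 43 */
  printf ("(mask = 0x%x)\n", mask);
  return 0;
}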
*/ ++ x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop); ++ x = gen_rtx_SET (target, x); ++ emit_insn (x); ++ if (target != d->target) ++ emit_move_insn (d->target, gen_lowpart (d->vmode, target)); ++ ++ return true; ++} ++ ++/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D ++ in terms of the variable form of vpermilps. ++ ++ Note that we will have already failed the immediate input vpermilps, ++ which requires that the high and low part shuffle be identical; the ++ variable form doesn't require that. */ ++ ++static bool ++expand_vec_perm_vpermil (struct expand_vec_perm_d *d) ++{ ++ rtx rperm[8], vperm; ++ unsigned i; ++ ++ if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p) ++ return false; ++ ++ /* We can only permute within the 128-bit lane. */ ++ for (i = 0; i < 8; ++i) ++ { ++ unsigned e = d->perm[i]; ++ if (i < 4 ? e >= 4 : e < 4) ++ return false; ++ } ++ ++ if (d->testing_p) ++ return true; ++ ++ for (i = 0; i < 8; ++i) ++ { ++ unsigned e = d->perm[i]; ++ ++ /* Within each 128-bit lane, the elements of op0 are numbered ++ from 0 and the elements of op1 are numbered from 4. */ ++ if (e >= 8 + 4) ++ e -= 8; ++ else if (e >= 4) ++ e -= 4; ++ ++ rperm[i] = GEN_INT (e); ++ } ++ ++ vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm)); ++ vperm = force_reg (V8SImode, vperm); ++ emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm)); ++ ++ return true; ++} ++ ++/* Return true if permutation D can be performed as VMODE permutation ++ instead. */ ++ ++static bool ++valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d) ++{ ++ unsigned int i, j, chunk; ++ ++ if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT ++ || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT ++ || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode)) ++ return false; ++ ++ if (GET_MODE_NUNITS (vmode) >= d->nelt) ++ return true; ++ ++ chunk = d->nelt / GET_MODE_NUNITS (vmode); ++ for (i = 0; i < d->nelt; i += chunk) ++ if (d->perm[i] & (chunk - 1)) ++ return false; ++ else ++ for (j = 1; j < chunk; ++j) ++ if (d->perm[i] + j != d->perm[i + j]) ++ return false; ++ ++ return true; ++} ++ ++/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D ++ in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */ ++ ++static bool ++expand_vec_perm_pshufb (struct expand_vec_perm_d *d) ++{ ++ unsigned i, nelt, eltsz, mask; ++ unsigned char perm[64]; ++ machine_mode vmode = V16QImode; ++ rtx rperm[64], vperm, target, op0, op1; ++ ++ nelt = d->nelt; ++ ++ if (!d->one_operand_p) ++ { ++ if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16) ++ { ++ if (TARGET_AVX2 ++ && valid_perm_using_mode_p (V2TImode, d)) ++ { ++ if (d->testing_p) ++ return true; ++ ++ /* Use vperm2i128 insn. The pattern uses ++ V4DImode instead of V2TImode. 
*/ ++ target = d->target; ++ if (d->vmode != V4DImode) ++ target = gen_reg_rtx (V4DImode); ++ op0 = gen_lowpart (V4DImode, d->op0); ++ op1 = gen_lowpart (V4DImode, d->op1); ++ rperm[0] ++ = GEN_INT ((d->perm[0] / (nelt / 2)) ++ | ((d->perm[nelt / 2] / (nelt / 2)) * 16)); ++ emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0])); ++ if (target != d->target) ++ emit_move_insn (d->target, gen_lowpart (d->vmode, target)); ++ return true; ++ } ++ return false; ++ } ++ } ++ else ++ { ++ if (GET_MODE_SIZE (d->vmode) == 16) ++ { ++ if (!TARGET_SSSE3) ++ return false; ++ } ++ else if (GET_MODE_SIZE (d->vmode) == 32) ++ { ++ if (!TARGET_AVX2) ++ return false; ++ ++ /* V4DImode should be already handled through ++ expand_vselect by vpermq instruction. */ ++ gcc_assert (d->vmode != V4DImode); ++ ++ vmode = V32QImode; ++ if (d->vmode == V8SImode ++ || d->vmode == V16HImode ++ || d->vmode == V32QImode) ++ { ++ /* First see if vpermq can be used for ++ V8SImode/V16HImode/V32QImode. */ ++ if (valid_perm_using_mode_p (V4DImode, d)) ++ { ++ for (i = 0; i < 4; i++) ++ perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3; ++ if (d->testing_p) ++ return true; ++ target = gen_reg_rtx (V4DImode); ++ if (expand_vselect (target, gen_lowpart (V4DImode, d->op0), ++ perm, 4, false)) ++ { ++ emit_move_insn (d->target, ++ gen_lowpart (d->vmode, target)); ++ return true; ++ } ++ return false; ++ } ++ ++ /* Next see if vpermd can be used. */ ++ if (valid_perm_using_mode_p (V8SImode, d)) ++ vmode = V8SImode; ++ } ++ /* Or if vpermps can be used. */ ++ else if (d->vmode == V8SFmode) ++ vmode = V8SImode; ++ ++ if (vmode == V32QImode) ++ { ++ /* vpshufb only works intra lanes, it is not ++ possible to shuffle bytes in between the lanes. */ ++ for (i = 0; i < nelt; ++i) ++ if ((d->perm[i] ^ i) & (nelt / 2)) ++ return false; ++ } ++ } ++ else if (GET_MODE_SIZE (d->vmode) == 64) ++ { ++ if (!TARGET_AVX512BW) ++ return false; ++ ++ /* If vpermq didn't work, vpshufb won't work either. */ ++ if (d->vmode == V8DFmode || d->vmode == V8DImode) ++ return false; ++ ++ vmode = V64QImode; ++ if (d->vmode == V16SImode ++ || d->vmode == V32HImode ++ || d->vmode == V64QImode) ++ { ++ /* First see if vpermq can be used for ++ V16SImode/V32HImode/V64QImode. */ ++ if (valid_perm_using_mode_p (V8DImode, d)) ++ { ++ for (i = 0; i < 8; i++) ++ perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7; ++ if (d->testing_p) ++ return true; ++ target = gen_reg_rtx (V8DImode); ++ if (expand_vselect (target, gen_lowpart (V8DImode, d->op0), ++ perm, 8, false)) ++ { ++ emit_move_insn (d->target, ++ gen_lowpart (d->vmode, target)); ++ return true; ++ } ++ return false; ++ } ++ ++ /* Next see if vpermd can be used. */ ++ if (valid_perm_using_mode_p (V16SImode, d)) ++ vmode = V16SImode; ++ } ++ /* Or if vpermps can be used. */ ++ else if (d->vmode == V16SFmode) ++ vmode = V16SImode; ++ if (vmode == V64QImode) ++ { ++ /* vpshufb only works intra lanes, it is not ++ possible to shuffle bytes in between the lanes. 
*/ ++ for (i = 0; i < nelt; ++i) ++ if ((d->perm[i] ^ i) & (nelt / 4)) ++ return false; ++ } ++ } ++ else ++ return false; ++ } ++ ++ if (d->testing_p) ++ return true; ++ ++ if (vmode == V8SImode) ++ for (i = 0; i < 8; ++i) ++ rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7); ++ else if (vmode == V16SImode) ++ for (i = 0; i < 16; ++i) ++ rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15); ++ else ++ { ++ eltsz = GET_MODE_UNIT_SIZE (d->vmode); ++ if (!d->one_operand_p) ++ mask = 2 * nelt - 1; ++ else if (vmode == V16QImode) ++ mask = nelt - 1; ++ else if (vmode == V64QImode) ++ mask = nelt / 4 - 1; ++ else ++ mask = nelt / 2 - 1; ++ ++ for (i = 0; i < nelt; ++i) ++ { ++ unsigned j, e = d->perm[i] & mask; ++ for (j = 0; j < eltsz; ++j) ++ rperm[i * eltsz + j] = GEN_INT (e * eltsz + j); ++ } ++ } ++ ++ vperm = gen_rtx_CONST_VECTOR (vmode, ++ gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm)); ++ vperm = force_reg (vmode, vperm); ++ ++ target = d->target; ++ if (d->vmode != vmode) ++ target = gen_reg_rtx (vmode); ++ op0 = gen_lowpart (vmode, d->op0); ++ if (d->one_operand_p) ++ { ++ if (vmode == V16QImode) ++ emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm)); ++ else if (vmode == V32QImode) ++ emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm)); ++ else if (vmode == V64QImode) ++ emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm)); ++ else if (vmode == V8SFmode) ++ emit_insn (gen_avx2_permvarv8sf (target, op0, vperm)); ++ else if (vmode == V8SImode) ++ emit_insn (gen_avx2_permvarv8si (target, op0, vperm)); ++ else if (vmode == V16SFmode) ++ emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm)); ++ else if (vmode == V16SImode) ++ emit_insn (gen_avx512f_permvarv16si (target, op0, vperm)); ++ else ++ gcc_unreachable (); ++ } ++ else ++ { ++ op1 = gen_lowpart (vmode, d->op1); ++ emit_insn (gen_xop_pperm (target, op0, op1, vperm)); ++ } ++ if (target != d->target) ++ emit_move_insn (d->target, gen_lowpart (d->vmode, target)); ++ ++ return true; ++} ++ ++/* For V*[QHS]Imode permutations, check if the same permutation ++ can't be performed in a 2x, 4x or 8x wider inner mode. 
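canonicalize_vector_int_perm above notices when a permutation only moves elements in aligned pairs (perm[i] even and perm[i+1] == perm[i] + 1) and rewrites it as the same shuffle on elements twice as wide, repeating until the test fails or DImode elements are reached; the wider form frequently matches a single-instruction pattern that the narrow one cannot. A small sketch of the pair test and the halved permutation (illustrative, single-operand indices assumed):

#include <stdbool.h>
#include <stdio.h>

/* If PERM (NELT elements) moves elements in aligned pairs, write the
   equivalent half-length permutation on double-width elements into
   WIDE and return true.  Mirrors the test in canonicalize_vector_int_perm.  */
static bool
widen_perm (const unsigned char *perm, int nelt, unsigned char *wide)
{
  for (int i = 0; i < nelt; i += 2)
    if ((perm[i] & 1) || perm[i + 1] != perm[i] + 1)
      return false;
  for (int i = 0; i < nelt / 2; i++)
    wide[i] = perm[2 * i] / 2;
  return true;
}

int
main (void)
{
  /* A V8HI shuffle that is really a V4SI shuffle in disguise.  */
  unsigned char perm[8] = { 4, 5, 0, 1, 6, 7, 2, 3 }, wide[4];
  if (widen_perm (perm, 8, wide))
    printf ("wide perm: %d %d %d %d\n", wide[0], wide[1], wide[2], wide[3]);
  return 0;
}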
*/ ++ ++static bool ++canonicalize_vector_int_perm (const struct expand_vec_perm_d *d, ++ struct expand_vec_perm_d *nd) ++{ ++ int i; ++ machine_mode mode = VOIDmode; ++ ++ switch (d->vmode) ++ { ++ case E_V16QImode: mode = V8HImode; break; ++ case E_V32QImode: mode = V16HImode; break; ++ case E_V64QImode: mode = V32HImode; break; ++ case E_V8HImode: mode = V4SImode; break; ++ case E_V16HImode: mode = V8SImode; break; ++ case E_V32HImode: mode = V16SImode; break; ++ case E_V4SImode: mode = V2DImode; break; ++ case E_V8SImode: mode = V4DImode; break; ++ case E_V16SImode: mode = V8DImode; break; ++ default: return false; ++ } ++ for (i = 0; i < d->nelt; i += 2) ++ if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1) ++ return false; ++ nd->vmode = mode; ++ nd->nelt = d->nelt / 2; ++ for (i = 0; i < nd->nelt; i++) ++ nd->perm[i] = d->perm[2 * i] / 2; ++ if (GET_MODE_INNER (mode) != DImode) ++ canonicalize_vector_int_perm (nd, nd); ++ if (nd != d) ++ { ++ nd->one_operand_p = d->one_operand_p; ++ nd->testing_p = d->testing_p; ++ if (d->op0 == d->op1) ++ nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0); ++ else ++ { ++ nd->op0 = gen_lowpart (nd->vmode, d->op0); ++ nd->op1 = gen_lowpart (nd->vmode, d->op1); ++ } ++ if (d->testing_p) ++ nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1); ++ else ++ nd->target = gen_reg_rtx (nd->vmode); ++ } ++ return true; ++} ++ ++/* Try to expand one-operand permutation with constant mask. */ ++ ++static bool ++ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d) ++{ ++ machine_mode mode = GET_MODE (d->op0); ++ machine_mode maskmode = mode; ++ rtx (*gen) (rtx, rtx, rtx) = NULL; ++ rtx target, op0, mask; ++ rtx vec[64]; ++ ++ if (!rtx_equal_p (d->op0, d->op1)) ++ return false; ++ ++ if (!TARGET_AVX512F) ++ return false; ++ ++ switch (mode) ++ { ++ case E_V16SImode: ++ gen = gen_avx512f_permvarv16si; ++ break; ++ case E_V16SFmode: ++ gen = gen_avx512f_permvarv16sf; ++ maskmode = V16SImode; ++ break; ++ case E_V8DImode: ++ gen = gen_avx512f_permvarv8di; ++ break; ++ case E_V8DFmode: ++ gen = gen_avx512f_permvarv8df; ++ maskmode = V8DImode; ++ break; ++ default: ++ return false; ++ } ++ ++ target = d->target; ++ op0 = d->op0; ++ for (int i = 0; i < d->nelt; ++i) ++ vec[i] = GEN_INT (d->perm[i]); ++ mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec)); ++ emit_insn (gen (target, op0, force_reg (maskmode, mask))); ++ return true; ++} ++ ++static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool); ++ ++/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D ++ in a single instruction. */ ++ ++static bool ++expand_vec_perm_1 (struct expand_vec_perm_d *d) ++{ ++ unsigned i, nelt = d->nelt; ++ struct expand_vec_perm_d nd; ++ ++ /* Check plain VEC_SELECT first, because AVX has instructions that could ++ match both SEL and SEL+CONCAT, but the plain SEL will allow a memory ++ input where SEL+CONCAT may not. */ ++ if (d->one_operand_p) ++ { ++ int mask = nelt - 1; ++ bool identity_perm = true; ++ bool broadcast_perm = true; ++ ++ for (i = 0; i < nelt; i++) ++ { ++ nd.perm[i] = d->perm[i] & mask; ++ if (nd.perm[i] != i) ++ identity_perm = false; ++ if (nd.perm[i]) ++ broadcast_perm = false; ++ } ++ ++ if (identity_perm) ++ { ++ if (!d->testing_p) ++ emit_move_insn (d->target, d->op0); ++ return true; ++ } ++ else if (broadcast_perm && TARGET_AVX2) ++ { ++ /* Use vpbroadcast{b,w,d}. 
*/ ++ rtx (*gen) (rtx, rtx) = NULL; ++ switch (d->vmode) ++ { ++ case E_V64QImode: ++ if (TARGET_AVX512BW) ++ gen = gen_avx512bw_vec_dupv64qi_1; ++ break; ++ case E_V32QImode: ++ gen = gen_avx2_pbroadcastv32qi_1; ++ break; ++ case E_V32HImode: ++ if (TARGET_AVX512BW) ++ gen = gen_avx512bw_vec_dupv32hi_1; ++ break; ++ case E_V16HImode: ++ gen = gen_avx2_pbroadcastv16hi_1; ++ break; ++ case E_V16SImode: ++ if (TARGET_AVX512F) ++ gen = gen_avx512f_vec_dupv16si_1; ++ break; ++ case E_V8SImode: ++ gen = gen_avx2_pbroadcastv8si_1; ++ break; ++ case E_V16QImode: ++ gen = gen_avx2_pbroadcastv16qi; ++ break; ++ case E_V8HImode: ++ gen = gen_avx2_pbroadcastv8hi; ++ break; ++ case E_V16SFmode: ++ if (TARGET_AVX512F) ++ gen = gen_avx512f_vec_dupv16sf_1; ++ break; ++ case E_V8SFmode: ++ gen = gen_avx2_vec_dupv8sf_1; ++ break; ++ case E_V8DFmode: ++ if (TARGET_AVX512F) ++ gen = gen_avx512f_vec_dupv8df_1; ++ break; ++ case E_V8DImode: ++ if (TARGET_AVX512F) ++ gen = gen_avx512f_vec_dupv8di_1; ++ break; ++ /* For other modes prefer other shuffles this function creates. */ ++ default: break; ++ } ++ if (gen != NULL) ++ { ++ if (!d->testing_p) ++ emit_insn (gen (d->target, d->op0)); ++ return true; ++ } ++ } ++ ++ if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p)) ++ return true; ++ ++ /* There are plenty of patterns in sse.md that are written for ++ SEL+CONCAT and are not replicated for a single op. Perhaps ++ that should be changed, to avoid the nastiness here. */ ++ ++ /* Recognize interleave style patterns, which means incrementing ++ every other permutation operand. */ ++ for (i = 0; i < nelt; i += 2) ++ { ++ nd.perm[i] = d->perm[i] & mask; ++ nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt; ++ } ++ if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt, ++ d->testing_p)) ++ return true; ++ ++ /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */ ++ if (nelt >= 4) ++ { ++ for (i = 0; i < nelt; i += 4) ++ { ++ nd.perm[i + 0] = d->perm[i + 0] & mask; ++ nd.perm[i + 1] = d->perm[i + 1] & mask; ++ nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt; ++ nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt; ++ } ++ ++ if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt, ++ d->testing_p)) ++ return true; ++ } ++ } ++ ++ /* Try movss/movsd instructions. */ ++ if (expand_vec_perm_movs (d)) ++ return true; ++ ++ /* Finally, try the fully general two operand permute. */ ++ if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt, ++ d->testing_p)) ++ return true; ++ ++ /* Recognize interleave style patterns with reversed operands. */ ++ if (!d->one_operand_p) ++ { ++ for (i = 0; i < nelt; ++i) ++ { ++ unsigned e = d->perm[i]; ++ if (e >= nelt) ++ e -= nelt; ++ else ++ e += nelt; ++ nd.perm[i] = e; ++ } ++ ++ if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt, ++ d->testing_p)) ++ return true; ++ } ++ ++ /* Try the SSE4.1 blend variable merge instructions. */ ++ if (expand_vec_perm_blend (d)) ++ return true; ++ ++ /* Try one of the AVX vpermil variable permutations. */ ++ if (expand_vec_perm_vpermil (d)) ++ return true; ++ ++ /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128, ++ vpshufb, vpermd, vpermps or vpermq variable permutation. */ ++ if (expand_vec_perm_pshufb (d)) ++ return true; ++ ++ /* Try the AVX2 vpalignr instruction. */ ++ if (expand_vec_perm_palignr (d, true)) ++ return true; ++ ++ /* Try the AVX512F vperm{s,d} instructions. 
*/ ++ if (ix86_expand_vec_one_operand_perm_avx512 (d)) ++ return true; ++ ++ /* Try the AVX512F vpermt2/vpermi2 instructions. */ ++ if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d)) ++ return true; ++ ++ /* See if we can get the same permutation in different vector integer ++ mode. */ ++ if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd)) ++ { ++ if (!d->testing_p) ++ emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target)); ++ return true; ++ } ++ return false; ++} ++ ++/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D ++ in terms of a pair of pshuflw + pshufhw instructions. */ ++ ++static bool ++expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d) ++{ ++ unsigned char perm2[MAX_VECT_LEN]; ++ unsigned i; ++ bool ok; ++ ++ if (d->vmode != V8HImode || !d->one_operand_p) ++ return false; ++ ++ /* The two permutations only operate in 64-bit lanes. */ ++ for (i = 0; i < 4; ++i) ++ if (d->perm[i] >= 4) ++ return false; ++ for (i = 4; i < 8; ++i) ++ if (d->perm[i] < 4) ++ return false; ++ ++ if (d->testing_p) ++ return true; ++ ++ /* Emit the pshuflw. */ ++ memcpy (perm2, d->perm, 4); ++ for (i = 4; i < 8; ++i) ++ perm2[i] = i; ++ ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p); ++ gcc_assert (ok); ++ ++ /* Emit the pshufhw. */ ++ memcpy (perm2 + 4, d->perm + 4, 4); ++ for (i = 0; i < 4; ++i) ++ perm2[i] = i; ++ ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p); ++ gcc_assert (ok); ++ ++ return true; ++} ++ ++/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify ++ the permutation using the SSSE3 palignr instruction. This succeeds ++ when all of the elements in PERM fit within one vector and we merely ++ need to shift them down so that a single vector permutation has a ++ chance to succeed. If SINGLE_INSN_ONLY_P, succeed if only ++ the vpalignr instruction itself can perform the requested permutation. */ ++ ++static bool ++expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p) ++{ ++ unsigned i, nelt = d->nelt; ++ unsigned min, max, minswap, maxswap; ++ bool in_order, ok, swap = false; ++ rtx shift, target; ++ struct expand_vec_perm_d dcopy; ++ ++ /* Even with AVX, palignr only operates on 128-bit vectors, ++ in AVX2 palignr operates on both 128-bit lanes. */ ++ if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16) ++ && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32)) ++ return false; ++ ++ min = 2 * nelt; ++ max = 0; ++ minswap = 2 * nelt; ++ maxswap = 0; ++ for (i = 0; i < nelt; ++i) ++ { ++ unsigned e = d->perm[i]; ++ unsigned eswap = d->perm[i] ^ nelt; ++ if (GET_MODE_SIZE (d->vmode) == 32) ++ { ++ e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1); ++ eswap = e ^ (nelt / 2); ++ } ++ if (e < min) ++ min = e; ++ if (e > max) ++ max = e; ++ if (eswap < minswap) ++ minswap = eswap; ++ if (eswap > maxswap) ++ maxswap = eswap; ++ } ++ if (min == 0 ++ || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt)) ++ { ++ if (d->one_operand_p ++ || minswap == 0 ++ || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32 ++ ? nelt / 2 : nelt)) ++ return false; ++ swap = true; ++ min = minswap; ++ max = maxswap; ++ } ++ ++ /* Given that we have SSSE3, we know we'll be able to implement the ++ single operand permutation after the palignr with pshufb for ++ 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed ++ first. 
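[Editorial sketch] The window analysis above boils down to index arithmetic: if every selected element of the 2*nelt-element concatenation lies in one contiguous window of nelt elements starting at min, rotating the concatenation down by min positions (the palignr) leaves only the one-operand shuffle perm[i] - min. A rough scalar model for one 16-byte lane, with hypothetical names:

#include <string.h>

#define NELT 16   /* byte elements in one 128-bit lane */

/* Scalar model of the palignr reduction for one 16-byte lane.  OP1:OP0 is
   viewed as a single 32-byte buffer indexed by PERM.  If every index falls
   inside a window [min, min + NELT), extract that window (what palignr by
   MIN bytes produces) and apply the rebased one-operand shuffle.
   Returns 0 when the trick does not apply.  */
static int
perm_via_align (const unsigned char *op0, const unsigned char *op1,
                const unsigned char *perm, unsigned char *out)
{
  unsigned char concat[2 * NELT], window[NELT];
  unsigned min = 2 * NELT, max = 0;

  memcpy (concat, op0, NELT);
  memcpy (concat + NELT, op1, NELT);

  for (unsigned i = 0; i < NELT; i++)
    {
      if (perm[i] < min)
        min = perm[i];
      if (perm[i] > max)
        max = perm[i];
    }
  if (min == 0 || max - min >= NELT)
    return 0;   /* already aligned (handled elsewhere) or window too wide */

  memcpy (window, concat + min, NELT);   /* the palignr step              */
  for (unsigned i = 0; i < NELT; i++)
    out[i] = window[perm[i] - min];      /* residual one-operand shuffle  */
  return 1;
}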
*/ ++ if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p) ++ return true; ++ ++ dcopy = *d; ++ if (swap) ++ { ++ dcopy.op0 = d->op1; ++ dcopy.op1 = d->op0; ++ for (i = 0; i < nelt; ++i) ++ dcopy.perm[i] ^= nelt; ++ } ++ ++ in_order = true; ++ for (i = 0; i < nelt; ++i) ++ { ++ unsigned e = dcopy.perm[i]; ++ if (GET_MODE_SIZE (d->vmode) == 32 ++ && e >= nelt ++ && (e & (nelt / 2 - 1)) < min) ++ e = e - min - (nelt / 2); ++ else ++ e = e - min; ++ if (e != i) ++ in_order = false; ++ dcopy.perm[i] = e; ++ } ++ dcopy.one_operand_p = true; ++ ++ if (single_insn_only_p && !in_order) ++ return false; ++ ++ /* For AVX2, test whether we can permute the result in one instruction. */ ++ if (d->testing_p) ++ { ++ if (in_order) ++ return true; ++ dcopy.op1 = dcopy.op0; ++ return expand_vec_perm_1 (&dcopy); ++ } ++ ++ shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode)); ++ if (GET_MODE_SIZE (d->vmode) == 16) ++ { ++ target = gen_reg_rtx (TImode); ++ emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1), ++ gen_lowpart (TImode, dcopy.op0), shift)); ++ } ++ else ++ { ++ target = gen_reg_rtx (V2TImode); ++ emit_insn (gen_avx2_palignrv2ti (target, ++ gen_lowpart (V2TImode, dcopy.op1), ++ gen_lowpart (V2TImode, dcopy.op0), ++ shift)); ++ } ++ ++ dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target); ++ ++ /* Test for the degenerate case where the alignment by itself ++ produces the desired permutation. */ ++ if (in_order) ++ { ++ emit_move_insn (d->target, dcopy.op0); ++ return true; ++ } ++ ++ ok = expand_vec_perm_1 (&dcopy); ++ gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32); ++ ++ return ok; ++} ++ ++/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify ++ the permutation using the SSE4_1 pblendv instruction. Potentially ++ reduces permutation from 2 pshufb and or to 1 pshufb and pblendv. */ ++ ++static bool ++expand_vec_perm_pblendv (struct expand_vec_perm_d *d) ++{ ++ unsigned i, which, nelt = d->nelt; ++ struct expand_vec_perm_d dcopy, dcopy1; ++ machine_mode vmode = d->vmode; ++ bool ok; ++ ++ /* Use the same checks as in expand_vec_perm_blend. */ ++ if (d->one_operand_p) ++ return false; ++ if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32) ++ ; ++ else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode)) ++ ; ++ else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16) ++ ; ++ else ++ return false; ++ ++ /* Figure out where permutation elements stay not in their ++ respective lanes. */ ++ for (i = 0, which = 0; i < nelt; ++i) ++ { ++ unsigned e = d->perm[i]; ++ if (e != i) ++ which |= (e < nelt ? 1 : 2); ++ } ++ /* We can pblend the part where elements stay not in their ++ respective lanes only when these elements are all in one ++ half of a permutation. ++ {0 1 8 3 4 5 9 7} is ok as 8, 9 are at not at their respective ++ lanes, but both 8 and 9 >= 8 ++ {0 1 8 3 4 5 2 7} is not ok as 2 and 8 are not at their ++ respective lanes and 8 >= 8, but 2 not. */ ++ if (which != 1 && which != 2) ++ return false; ++ if (d->testing_p && GET_MODE_SIZE (vmode) == 16) ++ return true; ++ ++ /* First we apply one operand permutation to the part where ++ elements stay not in their respective lanes. 
*/ ++ dcopy = *d; ++ if (which == 2) ++ dcopy.op0 = dcopy.op1 = d->op1; ++ else ++ dcopy.op0 = dcopy.op1 = d->op0; ++ if (!d->testing_p) ++ dcopy.target = gen_reg_rtx (vmode); ++ dcopy.one_operand_p = true; ++ ++ for (i = 0; i < nelt; ++i) ++ dcopy.perm[i] = d->perm[i] & (nelt - 1); ++ ++ ok = expand_vec_perm_1 (&dcopy); ++ if (GET_MODE_SIZE (vmode) != 16 && !ok) ++ return false; ++ else ++ gcc_assert (ok); ++ if (d->testing_p) ++ return true; ++ ++ /* Next we put permuted elements into their positions. */ ++ dcopy1 = *d; ++ if (which == 2) ++ dcopy1.op1 = dcopy.target; ++ else ++ dcopy1.op0 = dcopy.target; ++ ++ for (i = 0; i < nelt; ++i) ++ dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i); ++ ++ ok = expand_vec_perm_blend (&dcopy1); ++ gcc_assert (ok); ++ ++ return true; ++} ++ ++static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d); ++ ++/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify ++ a two vector permutation into a single vector permutation by using ++ an interleave operation to merge the vectors. */ ++ ++static bool ++expand_vec_perm_interleave2 (struct expand_vec_perm_d *d) ++{ ++ struct expand_vec_perm_d dremap, dfinal; ++ unsigned i, nelt = d->nelt, nelt2 = nelt / 2; ++ unsigned HOST_WIDE_INT contents; ++ unsigned char remap[2 * MAX_VECT_LEN]; ++ rtx_insn *seq; ++ bool ok, same_halves = false; ++ ++ if (GET_MODE_SIZE (d->vmode) == 16) ++ { ++ if (d->one_operand_p) ++ return false; ++ } ++ else if (GET_MODE_SIZE (d->vmode) == 32) ++ { ++ if (!TARGET_AVX) ++ return false; ++ /* For 32-byte modes allow even d->one_operand_p. ++ The lack of cross-lane shuffling in some instructions ++ might prevent a single insn shuffle. */ ++ dfinal = *d; ++ dfinal.testing_p = true; ++ /* If expand_vec_perm_interleave3 can expand this into ++ a 3 insn sequence, give up and let it be expanded as ++ 3 insn sequence. While that is one insn longer, ++ it doesn't need a memory operand and in the common ++ case that both interleave low and high permutations ++ with the same operands are adjacent needs 4 insns ++ for both after CSE. */ ++ if (expand_vec_perm_interleave3 (&dfinal)) ++ return false; ++ } ++ else ++ return false; ++ ++ /* Examine from whence the elements come. */ ++ contents = 0; ++ for (i = 0; i < nelt; ++i) ++ contents |= HOST_WIDE_INT_1U << d->perm[i]; ++ ++ memset (remap, 0xff, sizeof (remap)); ++ dremap = *d; ++ ++ if (GET_MODE_SIZE (d->vmode) == 16) ++ { ++ unsigned HOST_WIDE_INT h1, h2, h3, h4; ++ ++ /* Split the two input vectors into 4 halves. */ ++ h1 = (HOST_WIDE_INT_1U << nelt2) - 1; ++ h2 = h1 << nelt2; ++ h3 = h2 << nelt2; ++ h4 = h3 << nelt2; ++ ++ /* If the elements from the low halves use interleave low, and similarly ++ for interleave high. If the elements are from mis-matched halves, we ++ can use shufps for V4SF/V4SI or do a DImode shuffle. 
*/ ++ if ((contents & (h1 | h3)) == contents) ++ { ++ /* punpckl* */ ++ for (i = 0; i < nelt2; ++i) ++ { ++ remap[i] = i * 2; ++ remap[i + nelt] = i * 2 + 1; ++ dremap.perm[i * 2] = i; ++ dremap.perm[i * 2 + 1] = i + nelt; ++ } ++ if (!TARGET_SSE2 && d->vmode == V4SImode) ++ dremap.vmode = V4SFmode; ++ } ++ else if ((contents & (h2 | h4)) == contents) ++ { ++ /* punpckh* */ ++ for (i = 0; i < nelt2; ++i) ++ { ++ remap[i + nelt2] = i * 2; ++ remap[i + nelt + nelt2] = i * 2 + 1; ++ dremap.perm[i * 2] = i + nelt2; ++ dremap.perm[i * 2 + 1] = i + nelt + nelt2; ++ } ++ if (!TARGET_SSE2 && d->vmode == V4SImode) ++ dremap.vmode = V4SFmode; ++ } ++ else if ((contents & (h1 | h4)) == contents) ++ { ++ /* shufps */ ++ for (i = 0; i < nelt2; ++i) ++ { ++ remap[i] = i; ++ remap[i + nelt + nelt2] = i + nelt2; ++ dremap.perm[i] = i; ++ dremap.perm[i + nelt2] = i + nelt + nelt2; ++ } ++ if (nelt != 4) ++ { ++ /* shufpd */ ++ dremap.vmode = V2DImode; ++ dremap.nelt = 2; ++ dremap.perm[0] = 0; ++ dremap.perm[1] = 3; ++ } ++ } ++ else if ((contents & (h2 | h3)) == contents) ++ { ++ /* shufps */ ++ for (i = 0; i < nelt2; ++i) ++ { ++ remap[i + nelt2] = i; ++ remap[i + nelt] = i + nelt2; ++ dremap.perm[i] = i + nelt2; ++ dremap.perm[i + nelt2] = i + nelt; ++ } ++ if (nelt != 4) ++ { ++ /* shufpd */ ++ dremap.vmode = V2DImode; ++ dremap.nelt = 2; ++ dremap.perm[0] = 1; ++ dremap.perm[1] = 2; ++ } ++ } ++ else ++ return false; ++ } ++ else ++ { ++ unsigned int nelt4 = nelt / 4, nzcnt = 0; ++ unsigned HOST_WIDE_INT q[8]; ++ unsigned int nonzero_halves[4]; ++ ++ /* Split the two input vectors into 8 quarters. */ ++ q[0] = (HOST_WIDE_INT_1U << nelt4) - 1; ++ for (i = 1; i < 8; ++i) ++ q[i] = q[0] << (nelt4 * i); ++ for (i = 0; i < 4; ++i) ++ if (((q[2 * i] | q[2 * i + 1]) & contents) != 0) ++ { ++ nonzero_halves[nzcnt] = i; ++ ++nzcnt; ++ } ++ ++ if (nzcnt == 1) ++ { ++ gcc_assert (d->one_operand_p); ++ nonzero_halves[1] = nonzero_halves[0]; ++ same_halves = true; ++ } ++ else if (d->one_operand_p) ++ { ++ gcc_assert (nonzero_halves[0] == 0); ++ gcc_assert (nonzero_halves[1] == 1); ++ } ++ ++ if (nzcnt <= 2) ++ { ++ if (d->perm[0] / nelt2 == nonzero_halves[1]) ++ { ++ /* Attempt to increase the likelihood that dfinal ++ shuffle will be intra-lane. */ ++ std::swap (nonzero_halves[0], nonzero_halves[1]); ++ } ++ ++ /* vperm2f128 or vperm2i128. 
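[Editorial sketch] For the 16-byte path, the case analysis above only needs a bitmask recording which halves of the two inputs are referenced. A compact scalar model of that bookkeeping; the helper name is ours:

#include <stddef.h>

/* Classify a two-operand permutation of NELT elements by which halves of
   the 2*NELT-element concatenation it reads, mirroring the h1..h4 masks in
   the patch.  Returns the shuffle family that applies, or NULL when all
   four halves are referenced and the interleave trick cannot be used.  */
static const char *
classify_halves (const unsigned char *perm, unsigned nelt)
{
  unsigned long long contents = 0, h1, h2, h3, h4;
  unsigned nelt2 = nelt / 2;

  for (unsigned i = 0; i < nelt; i++)
    contents |= 1ULL << perm[i];

  h1 = (1ULL << nelt2) - 1;   /* low half of op0   */
  h2 = h1 << nelt2;           /* high half of op0  */
  h3 = h2 << nelt2;           /* low half of op1   */
  h4 = h3 << nelt2;           /* high half of op1  */

  if ((contents & (h1 | h3)) == contents)
    return "punpckl*";
  if ((contents & (h2 | h4)) == contents)
    return "punpckh*";
  if ((contents & (h1 | h4)) == contents)
    return "shufps (low of op0, high of op1)";
  if ((contents & (h2 | h3)) == contents)
    return "shufps (high of op0, low of op1)";
  return NULL;   /* needs all four halves; other strategies take over */
}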
*/ ++ for (i = 0; i < nelt2; ++i) ++ { ++ remap[i + nonzero_halves[1] * nelt2] = i + nelt2; ++ remap[i + nonzero_halves[0] * nelt2] = i; ++ dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2; ++ dremap.perm[i] = i + nonzero_halves[0] * nelt2; ++ } ++ ++ if (d->vmode != V8SFmode ++ && d->vmode != V4DFmode ++ && d->vmode != V8SImode) ++ { ++ dremap.vmode = V8SImode; ++ dremap.nelt = 8; ++ for (i = 0; i < 4; ++i) ++ { ++ dremap.perm[i] = i + nonzero_halves[0] * 4; ++ dremap.perm[i + 4] = i + nonzero_halves[1] * 4; ++ } ++ } ++ } ++ else if (d->one_operand_p) ++ return false; ++ else if (TARGET_AVX2 ++ && (contents & (q[0] | q[2] | q[4] | q[6])) == contents) ++ { ++ /* vpunpckl* */ ++ for (i = 0; i < nelt4; ++i) ++ { ++ remap[i] = i * 2; ++ remap[i + nelt] = i * 2 + 1; ++ remap[i + nelt2] = i * 2 + nelt2; ++ remap[i + nelt + nelt2] = i * 2 + nelt2 + 1; ++ dremap.perm[i * 2] = i; ++ dremap.perm[i * 2 + 1] = i + nelt; ++ dremap.perm[i * 2 + nelt2] = i + nelt2; ++ dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2; ++ } ++ } ++ else if (TARGET_AVX2 ++ && (contents & (q[1] | q[3] | q[5] | q[7])) == contents) ++ { ++ /* vpunpckh* */ ++ for (i = 0; i < nelt4; ++i) ++ { ++ remap[i + nelt4] = i * 2; ++ remap[i + nelt + nelt4] = i * 2 + 1; ++ remap[i + nelt2 + nelt4] = i * 2 + nelt2; ++ remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1; ++ dremap.perm[i * 2] = i + nelt4; ++ dremap.perm[i * 2 + 1] = i + nelt + nelt4; ++ dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4; ++ dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4; ++ } ++ } ++ else ++ return false; ++ } ++ ++ /* Use the remapping array set up above to move the elements from their ++ swizzled locations into their final destinations. */ ++ dfinal = *d; ++ for (i = 0; i < nelt; ++i) ++ { ++ unsigned e = remap[d->perm[i]]; ++ gcc_assert (e < nelt); ++ /* If same_halves is true, both halves of the remapped vector are the ++ same. Avoid cross-lane accesses if possible. */ ++ if (same_halves && i >= nelt2) ++ { ++ gcc_assert (e < nelt2); ++ dfinal.perm[i] = e + nelt2; ++ } ++ else ++ dfinal.perm[i] = e; ++ } ++ if (!d->testing_p) ++ { ++ dremap.target = gen_reg_rtx (dremap.vmode); ++ dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target); ++ } ++ dfinal.op1 = dfinal.op0; ++ dfinal.one_operand_p = true; ++ ++ /* Test if the final remap can be done with a single insn. For V4SFmode or ++ V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */ ++ start_sequence (); ++ ok = expand_vec_perm_1 (&dfinal); ++ seq = get_insns (); ++ end_sequence (); ++ ++ if (!ok) ++ return false; ++ ++ if (d->testing_p) ++ return true; ++ ++ if (dremap.vmode != dfinal.vmode) ++ { ++ dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0); ++ dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1); ++ } ++ ++ ok = expand_vec_perm_1 (&dremap); ++ gcc_assert (ok); ++ ++ emit_insn (seq); ++ return true; ++} ++ ++/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify ++ a single vector cross-lane permutation into vpermq followed ++ by any of the single insn permutations. 
*/ ++ ++static bool ++expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d) ++{ ++ struct expand_vec_perm_d dremap, dfinal; ++ unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4; ++ unsigned contents[2]; ++ bool ok; ++ ++ if (!(TARGET_AVX2 ++ && (d->vmode == V32QImode || d->vmode == V16HImode) ++ && d->one_operand_p)) ++ return false; ++ ++ contents[0] = 0; ++ contents[1] = 0; ++ for (i = 0; i < nelt2; ++i) ++ { ++ contents[0] |= 1u << (d->perm[i] / nelt4); ++ contents[1] |= 1u << (d->perm[i + nelt2] / nelt4); ++ } ++ ++ for (i = 0; i < 2; ++i) ++ { ++ unsigned int cnt = 0; ++ for (j = 0; j < 4; ++j) ++ if ((contents[i] & (1u << j)) != 0 && ++cnt > 2) ++ return false; ++ } ++ ++ if (d->testing_p) ++ return true; ++ ++ dremap = *d; ++ dremap.vmode = V4DImode; ++ dremap.nelt = 4; ++ dremap.target = gen_reg_rtx (V4DImode); ++ dremap.op0 = gen_lowpart (V4DImode, d->op0); ++ dremap.op1 = dremap.op0; ++ dremap.one_operand_p = true; ++ for (i = 0; i < 2; ++i) ++ { ++ unsigned int cnt = 0; ++ for (j = 0; j < 4; ++j) ++ if ((contents[i] & (1u << j)) != 0) ++ dremap.perm[2 * i + cnt++] = j; ++ for (; cnt < 2; ++cnt) ++ dremap.perm[2 * i + cnt] = 0; ++ } ++ ++ dfinal = *d; ++ dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target); ++ dfinal.op1 = dfinal.op0; ++ dfinal.one_operand_p = true; ++ for (i = 0, j = 0; i < nelt; ++i) ++ { ++ if (i == nelt2) ++ j = 2; ++ dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0); ++ if ((d->perm[i] / nelt4) == dremap.perm[j]) ++ ; ++ else if ((d->perm[i] / nelt4) == dremap.perm[j + 1]) ++ dfinal.perm[i] |= nelt4; ++ else ++ gcc_unreachable (); ++ } ++ ++ ok = expand_vec_perm_1 (&dremap); ++ gcc_assert (ok); ++ ++ ok = expand_vec_perm_1 (&dfinal); ++ gcc_assert (ok); ++ ++ return true; ++} ++ ++static bool canonicalize_perm (struct expand_vec_perm_d *d); ++ ++/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand ++ a vector permutation using two instructions, vperm2f128 resp. ++ vperm2i128 followed by any single in-lane permutation. */ ++ ++static bool ++expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d) ++{ ++ struct expand_vec_perm_d dfirst, dsecond; ++ unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm; ++ bool ok; ++ ++ if (!TARGET_AVX ++ || GET_MODE_SIZE (d->vmode) != 32 ++ || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2)) ++ return false; ++ ++ dsecond = *d; ++ dsecond.one_operand_p = false; ++ dsecond.testing_p = true; ++ ++ /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128 ++ immediate. For perm < 16 the second permutation uses ++ d->op0 as first operand, for perm >= 16 it uses d->op1 ++ as first operand. The second operand is the result of ++ vperm2[fi]128. */ ++ for (perm = 0; perm < 32; perm++) ++ { ++ /* Ignore permutations which do not move anything cross-lane. */ ++ if (perm < 16) ++ { ++ /* The second shuffle for e.g. V4DFmode has ++ 0123 and ABCD operands. ++ Ignore AB23, as 23 is already in the second lane ++ of the first operand. */ ++ if ((perm & 0xc) == (1 << 2)) continue; ++ /* And 01CD, as 01 is in the first lane of the first ++ operand. */ ++ if ((perm & 3) == 0) continue; ++ /* And 4567, as then the vperm2[fi]128 doesn't change ++ anything on the original 4567 second operand. */ ++ if ((perm & 0xf) == ((3 << 2) | 2)) continue; ++ } ++ else ++ { ++ /* The second shuffle for e.g. V4DFmode has ++ 4567 and ABCD operands. ++ Ignore AB67, as 67 is already in the second lane ++ of the first operand. 
*/ ++ if ((perm & 0xc) == (3 << 2)) continue; ++ /* And 45CD, as 45 is in the first lane of the first ++ operand. */ ++ if ((perm & 3) == 2) continue; ++ /* And 0123, as then the vperm2[fi]128 doesn't change ++ anything on the original 0123 first operand. */ ++ if ((perm & 0xf) == (1 << 2)) continue; ++ } ++ ++ for (i = 0; i < nelt; i++) ++ { ++ j = d->perm[i] / nelt2; ++ if (j == ((perm >> (2 * (i >= nelt2))) & 3)) ++ dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1)); ++ else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16)) ++ dsecond.perm[i] = d->perm[i] & (nelt - 1); ++ else ++ break; ++ } ++ ++ if (i == nelt) ++ { ++ start_sequence (); ++ ok = expand_vec_perm_1 (&dsecond); ++ end_sequence (); ++ } ++ else ++ ok = false; ++ ++ if (ok) ++ { ++ if (d->testing_p) ++ return true; ++ ++ /* Found a usable second shuffle. dfirst will be ++ vperm2f128 on d->op0 and d->op1. */ ++ dsecond.testing_p = false; ++ dfirst = *d; ++ dfirst.target = gen_reg_rtx (d->vmode); ++ for (i = 0; i < nelt; i++) ++ dfirst.perm[i] = (i & (nelt2 - 1)) ++ + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2; ++ ++ canonicalize_perm (&dfirst); ++ ok = expand_vec_perm_1 (&dfirst); ++ gcc_assert (ok); ++ ++ /* And dsecond is some single insn shuffle, taking ++ d->op0 and result of vperm2f128 (if perm < 16) or ++ d->op1 and result of vperm2f128 (otherwise). */ ++ if (perm >= 16) ++ dsecond.op0 = dsecond.op1; ++ dsecond.op1 = dfirst.target; ++ ++ ok = expand_vec_perm_1 (&dsecond); ++ gcc_assert (ok); ++ ++ return true; ++ } ++ ++ /* For one operand, the only useful vperm2f128 permutation is 0x01 ++ aka lanes swap. */ ++ if (d->one_operand_p) ++ return false; ++ } ++ ++ return false; ++} ++ ++/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify ++ a two vector permutation using 2 intra-lane interleave insns ++ and cross-lane shuffle for 32-byte vectors. */ ++ ++static bool ++expand_vec_perm_interleave3 (struct expand_vec_perm_d *d) ++{ ++ unsigned i, nelt; ++ rtx (*gen) (rtx, rtx, rtx); ++ ++ if (d->one_operand_p) ++ return false; ++ if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32) ++ ; ++ else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode)) ++ ; ++ else ++ return false; ++ ++ nelt = d->nelt; ++ if (d->perm[0] != 0 && d->perm[0] != nelt / 2) ++ return false; ++ for (i = 0; i < nelt; i += 2) ++ if (d->perm[i] != d->perm[0] + i / 2 ++ || d->perm[i + 1] != d->perm[0] + i / 2 + nelt) ++ return false; ++ ++ if (d->testing_p) ++ return true; ++ ++ switch (d->vmode) ++ { ++ case E_V32QImode: ++ if (d->perm[0]) ++ gen = gen_vec_interleave_highv32qi; ++ else ++ gen = gen_vec_interleave_lowv32qi; ++ break; ++ case E_V16HImode: ++ if (d->perm[0]) ++ gen = gen_vec_interleave_highv16hi; ++ else ++ gen = gen_vec_interleave_lowv16hi; ++ break; ++ case E_V8SImode: ++ if (d->perm[0]) ++ gen = gen_vec_interleave_highv8si; ++ else ++ gen = gen_vec_interleave_lowv8si; ++ break; ++ case E_V4DImode: ++ if (d->perm[0]) ++ gen = gen_vec_interleave_highv4di; ++ else ++ gen = gen_vec_interleave_lowv4di; ++ break; ++ case E_V8SFmode: ++ if (d->perm[0]) ++ gen = gen_vec_interleave_highv8sf; ++ else ++ gen = gen_vec_interleave_lowv8sf; ++ break; ++ case E_V4DFmode: ++ if (d->perm[0]) ++ gen = gen_vec_interleave_highv4df; ++ else ++ gen = gen_vec_interleave_lowv4df; ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ ++ emit_insn (gen (d->target, d->op0, d->op1)); ++ return true; ++} ++ ++/* A subroutine of ix86_expand_vec_perm_builtin_1. 
Try to implement ++ a single vector permutation using a single intra-lane vector ++ permutation, vperm2f128 swapping the lanes and vblend* insn blending ++ the non-swapped and swapped vectors together. */ ++ ++static bool ++expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d) ++{ ++ struct expand_vec_perm_d dfirst, dsecond; ++ unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2; ++ rtx_insn *seq; ++ bool ok; ++ rtx (*blend) (rtx, rtx, rtx, rtx) = NULL; ++ ++ if (!TARGET_AVX ++ || TARGET_AVX2 ++ || (d->vmode != V8SFmode && d->vmode != V4DFmode) ++ || !d->one_operand_p) ++ return false; ++ ++ dfirst = *d; ++ for (i = 0; i < nelt; i++) ++ dfirst.perm[i] = 0xff; ++ for (i = 0, msk = 0; i < nelt; i++) ++ { ++ j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2; ++ if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i]) ++ return false; ++ dfirst.perm[j] = d->perm[i]; ++ if (j != i) ++ msk |= (1 << i); ++ } ++ for (i = 0; i < nelt; i++) ++ if (dfirst.perm[i] == 0xff) ++ dfirst.perm[i] = i; ++ ++ if (!d->testing_p) ++ dfirst.target = gen_reg_rtx (dfirst.vmode); ++ ++ start_sequence (); ++ ok = expand_vec_perm_1 (&dfirst); ++ seq = get_insns (); ++ end_sequence (); ++ ++ if (!ok) ++ return false; ++ ++ if (d->testing_p) ++ return true; ++ ++ emit_insn (seq); ++ ++ dsecond = *d; ++ dsecond.op0 = dfirst.target; ++ dsecond.op1 = dfirst.target; ++ dsecond.one_operand_p = true; ++ dsecond.target = gen_reg_rtx (dsecond.vmode); ++ for (i = 0; i < nelt; i++) ++ dsecond.perm[i] = i ^ nelt2; ++ ++ ok = expand_vec_perm_1 (&dsecond); ++ gcc_assert (ok); ++ ++ blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256; ++ emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk))); ++ return true; ++} ++ ++/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF ++ permutation using two vperm2f128, followed by a vshufpd insn blending ++ the two vectors together. */ ++ ++static bool ++expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d) ++{ ++ struct expand_vec_perm_d dfirst, dsecond, dthird; ++ bool ok; ++ ++ if (!TARGET_AVX || (d->vmode != V4DFmode)) ++ return false; ++ ++ if (d->testing_p) ++ return true; ++ ++ dfirst = *d; ++ dsecond = *d; ++ dthird = *d; ++ ++ dfirst.perm[0] = (d->perm[0] & ~1); ++ dfirst.perm[1] = (d->perm[0] & ~1) + 1; ++ dfirst.perm[2] = (d->perm[2] & ~1); ++ dfirst.perm[3] = (d->perm[2] & ~1) + 1; ++ dsecond.perm[0] = (d->perm[1] & ~1); ++ dsecond.perm[1] = (d->perm[1] & ~1) + 1; ++ dsecond.perm[2] = (d->perm[3] & ~1); ++ dsecond.perm[3] = (d->perm[3] & ~1) + 1; ++ dthird.perm[0] = (d->perm[0] % 2); ++ dthird.perm[1] = (d->perm[1] % 2) + 4; ++ dthird.perm[2] = (d->perm[2] % 2) + 2; ++ dthird.perm[3] = (d->perm[3] % 2) + 6; ++ ++ dfirst.target = gen_reg_rtx (dfirst.vmode); ++ dsecond.target = gen_reg_rtx (dsecond.vmode); ++ dthird.op0 = dfirst.target; ++ dthird.op1 = dsecond.target; ++ dthird.one_operand_p = false; ++ ++ canonicalize_perm (&dfirst); ++ canonicalize_perm (&dsecond); ++ ++ ok = expand_vec_perm_1 (&dfirst) ++ && expand_vec_perm_1 (&dsecond) ++ && expand_vec_perm_1 (&dthird); ++ ++ gcc_assert (ok); ++ ++ return true; ++} ++ ++/* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word ++ permutation with two pshufb insns and an ior. We should have already ++ failed all two instruction sequences. 
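[Editorial sketch] The two-mask-plus-ior scheme of expand_vec_perm_pshufb2 below can be checked against a scalar pshufb model: a mask byte with bit 7 set produces zero, so each input contributes only the lanes it owns and the final OR merges the two halves. A self-contained sketch under that model; the names are ours:

#include <stdint.h>

/* Scalar model of PSHUFB on one 16-byte register: an index byte with
   bit 7 set produces 0, otherwise its low four bits select a source byte.  */
static void
pshufb16 (const uint8_t *src, const int8_t *mask, uint8_t *dst)
{
  for (int i = 0; i < 16; i++)
    dst[i] = (mask[i] < 0) ? 0 : src[mask[i] & 15];
}

/* Two-operand permutation PERM (indices 0..31 into OP0|OP1) done as two
   PSHUFBs and an OR, the shape used in the patch: each mask keeps the
   lanes served by its own operand and zeroes the rest with -128.  */
static void
perm2_via_pshufb (const uint8_t *op0, const uint8_t *op1,
                  const unsigned char *perm, uint8_t *out)
{
  int8_t m0[16], m1[16];
  uint8_t l[16], h[16];

  for (int i = 0; i < 16; i++)
    if (perm[i] < 16)
      {
        m0[i] = (int8_t) perm[i];
        m1[i] = -128;
      }
    else
      {
        m0[i] = -128;
        m1[i] = (int8_t) (perm[i] - 16);
      }

  pshufb16 (op0, m0, l);
  pshufb16 (op1, m1, h);
  for (int i = 0; i < 16; i++)
    out[i] = l[i] | h[i];
}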
*/ ++ ++static bool ++expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d) ++{ ++ rtx rperm[2][16], vperm, l, h, op, m128; ++ unsigned int i, nelt, eltsz; ++ ++ if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16) ++ return false; ++ gcc_assert (!d->one_operand_p); ++ ++ if (d->testing_p) ++ return true; ++ ++ nelt = d->nelt; ++ eltsz = GET_MODE_UNIT_SIZE (d->vmode); ++ ++ /* Generate two permutation masks. If the required element is within ++ the given vector it is shuffled into the proper lane. If the required ++ element is in the other vector, force a zero into the lane by setting ++ bit 7 in the permutation mask. */ ++ m128 = GEN_INT (-128); ++ for (i = 0; i < nelt; ++i) ++ { ++ unsigned j, e = d->perm[i]; ++ unsigned which = (e >= nelt); ++ if (e >= nelt) ++ e -= nelt; ++ ++ for (j = 0; j < eltsz; ++j) ++ { ++ rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j); ++ rperm[1-which][i*eltsz + j] = m128; ++ } ++ } ++ ++ vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0])); ++ vperm = force_reg (V16QImode, vperm); ++ ++ l = gen_reg_rtx (V16QImode); ++ op = gen_lowpart (V16QImode, d->op0); ++ emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm)); ++ ++ vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1])); ++ vperm = force_reg (V16QImode, vperm); ++ ++ h = gen_reg_rtx (V16QImode); ++ op = gen_lowpart (V16QImode, d->op1); ++ emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm)); ++ ++ op = d->target; ++ if (d->vmode != V16QImode) ++ op = gen_reg_rtx (V16QImode); ++ emit_insn (gen_iorv16qi3 (op, l, h)); ++ if (op != d->target) ++ emit_move_insn (d->target, gen_lowpart (d->vmode, op)); ++ ++ return true; ++} ++ ++/* Implement arbitrary permutation of one V32QImode and V16QImode operand ++ with two vpshufb insns, vpermq and vpor. We should have already failed ++ all two or three instruction sequences. */ ++ ++static bool ++expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d) ++{ ++ rtx rperm[2][32], vperm, l, h, hp, op, m128; ++ unsigned int i, nelt, eltsz; ++ ++ if (!TARGET_AVX2 ++ || !d->one_operand_p ++ || (d->vmode != V32QImode && d->vmode != V16HImode)) ++ return false; ++ ++ if (d->testing_p) ++ return true; ++ ++ nelt = d->nelt; ++ eltsz = GET_MODE_UNIT_SIZE (d->vmode); ++ ++ /* Generate two permutation masks. If the required element is within ++ the same lane, it is shuffled in. If the required element from the ++ other lane, force a zero by setting bit 7 in the permutation mask. ++ In the other mask the mask has non-negative elements if element ++ is requested from the other lane, but also moved to the other lane, ++ so that the result of vpshufb can have the two V2TImode halves ++ swapped. */ ++ m128 = GEN_INT (-128); ++ for (i = 0; i < nelt; ++i) ++ { ++ unsigned j, e = d->perm[i] & (nelt / 2 - 1); ++ unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz; ++ ++ for (j = 0; j < eltsz; ++j) ++ { ++ rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j); ++ rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128; ++ } ++ } ++ ++ vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1])); ++ vperm = force_reg (V32QImode, vperm); ++ ++ h = gen_reg_rtx (V32QImode); ++ op = gen_lowpart (V32QImode, d->op0); ++ emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm)); ++ ++ /* Swap the 128-byte lanes of h into hp. 
*/ ++ hp = gen_reg_rtx (V4DImode); ++ op = gen_lowpart (V4DImode, h); ++ emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx, ++ const1_rtx)); ++ ++ vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0])); ++ vperm = force_reg (V32QImode, vperm); ++ ++ l = gen_reg_rtx (V32QImode); ++ op = gen_lowpart (V32QImode, d->op0); ++ emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm)); ++ ++ op = d->target; ++ if (d->vmode != V32QImode) ++ op = gen_reg_rtx (V32QImode); ++ emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp))); ++ if (op != d->target) ++ emit_move_insn (d->target, gen_lowpart (d->vmode, op)); ++ ++ return true; ++} ++ ++/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even ++ and extract-odd permutations of two V32QImode and V16QImode operand ++ with two vpshufb insns, vpor and vpermq. We should have already ++ failed all two or three instruction sequences. */ ++ ++static bool ++expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d) ++{ ++ rtx rperm[2][32], vperm, l, h, ior, op, m128; ++ unsigned int i, nelt, eltsz; ++ ++ if (!TARGET_AVX2 ++ || d->one_operand_p ++ || (d->vmode != V32QImode && d->vmode != V16HImode)) ++ return false; ++ ++ for (i = 0; i < d->nelt; ++i) ++ if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2)) ++ return false; ++ ++ if (d->testing_p) ++ return true; ++ ++ nelt = d->nelt; ++ eltsz = GET_MODE_UNIT_SIZE (d->vmode); ++ ++ /* Generate two permutation masks. In the first permutation mask ++ the first quarter will contain indexes for the first half ++ of the op0, the second quarter will contain bit 7 set, third quarter ++ will contain indexes for the second half of the op0 and the ++ last quarter bit 7 set. In the second permutation mask ++ the first quarter will contain bit 7 set, the second quarter ++ indexes for the first half of the op1, the third quarter bit 7 set ++ and last quarter indexes for the second half of the op1. ++ I.e. the first mask e.g. for V32QImode extract even will be: ++ 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128 ++ (all values masked with 0xf except for -128) and second mask ++ for extract even will be ++ -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */ ++ m128 = GEN_INT (-128); ++ for (i = 0; i < nelt; ++i) ++ { ++ unsigned j, e = d->perm[i] & (nelt / 2 - 1); ++ unsigned which = d->perm[i] >= nelt; ++ unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0; ++ ++ for (j = 0; j < eltsz; ++j) ++ { ++ rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j); ++ rperm[1 - which][(i * eltsz + j) ^ xorv] = m128; ++ } ++ } ++ ++ vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0])); ++ vperm = force_reg (V32QImode, vperm); ++ ++ l = gen_reg_rtx (V32QImode); ++ op = gen_lowpart (V32QImode, d->op0); ++ emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm)); ++ ++ vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1])); ++ vperm = force_reg (V32QImode, vperm); ++ ++ h = gen_reg_rtx (V32QImode); ++ op = gen_lowpart (V32QImode, d->op1); ++ emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm)); ++ ++ ior = gen_reg_rtx (V32QImode); ++ emit_insn (gen_iorv32qi3 (ior, l, h)); ++ ++ /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. 
*/ ++ op = gen_reg_rtx (V4DImode); ++ ior = gen_lowpart (V4DImode, ior); ++ emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx, ++ const1_rtx, GEN_INT (3))); ++ emit_move_insn (d->target, gen_lowpart (d->vmode, op)); ++ ++ return true; ++} ++ ++/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even ++ and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands ++ with two "and" and "pack" or two "shift" and "pack" insns. We should ++ have already failed all two instruction sequences. */ ++ ++static bool ++expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d) ++{ ++ rtx op, dop0, dop1, t; ++ unsigned i, odd, c, s, nelt = d->nelt; ++ bool end_perm = false; ++ machine_mode half_mode; ++ rtx (*gen_and) (rtx, rtx, rtx); ++ rtx (*gen_pack) (rtx, rtx, rtx); ++ rtx (*gen_shift) (rtx, rtx, rtx); ++ ++ if (d->one_operand_p) ++ return false; ++ ++ switch (d->vmode) ++ { ++ case E_V8HImode: ++ /* Required for "pack". */ ++ if (!TARGET_SSE4_1) ++ return false; ++ c = 0xffff; ++ s = 16; ++ half_mode = V4SImode; ++ gen_and = gen_andv4si3; ++ gen_pack = gen_sse4_1_packusdw; ++ gen_shift = gen_lshrv4si3; ++ break; ++ case E_V16QImode: ++ /* No check as all instructions are SSE2. */ ++ c = 0xff; ++ s = 8; ++ half_mode = V8HImode; ++ gen_and = gen_andv8hi3; ++ gen_pack = gen_sse2_packuswb; ++ gen_shift = gen_lshrv8hi3; ++ break; ++ case E_V16HImode: ++ if (!TARGET_AVX2) ++ return false; ++ c = 0xffff; ++ s = 16; ++ half_mode = V8SImode; ++ gen_and = gen_andv8si3; ++ gen_pack = gen_avx2_packusdw; ++ gen_shift = gen_lshrv8si3; ++ end_perm = true; ++ break; ++ case E_V32QImode: ++ if (!TARGET_AVX2) ++ return false; ++ c = 0xff; ++ s = 8; ++ half_mode = V16HImode; ++ gen_and = gen_andv16hi3; ++ gen_pack = gen_avx2_packuswb; ++ gen_shift = gen_lshrv16hi3; ++ end_perm = true; ++ break; ++ default: ++ /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than ++ general shuffles. */ ++ return false; ++ } ++ ++ /* Check that permutation is even or odd. */ ++ odd = d->perm[0]; ++ if (odd > 1) ++ return false; ++ ++ for (i = 1; i < nelt; ++i) ++ if (d->perm[i] != 2 * i + odd) ++ return false; ++ ++ if (d->testing_p) ++ return true; ++ ++ dop0 = gen_reg_rtx (half_mode); ++ dop1 = gen_reg_rtx (half_mode); ++ if (odd == 0) ++ { ++ t = gen_const_vec_duplicate (half_mode, GEN_INT (c)); ++ t = force_reg (half_mode, t); ++ emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0))); ++ emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1))); ++ } ++ else ++ { ++ emit_insn (gen_shift (dop0, ++ gen_lowpart (half_mode, d->op0), ++ GEN_INT (s))); ++ emit_insn (gen_shift (dop1, ++ gen_lowpart (half_mode, d->op1), ++ GEN_INT (s))); ++ } ++ /* In AVX2 for 256 bit case we need to permute pack result. */ ++ if (TARGET_AVX2 && end_perm) ++ { ++ op = gen_reg_rtx (d->vmode); ++ t = gen_reg_rtx (V4DImode); ++ emit_insn (gen_pack (op, dop0, dop1)); ++ emit_insn (gen_avx2_permv4di_1 (t, ++ gen_lowpart (V4DImode, op), ++ const0_rtx, ++ const2_rtx, ++ const1_rtx, ++ GEN_INT (3))); ++ emit_move_insn (d->target, gen_lowpart (d->vmode, t)); ++ } ++ else ++ emit_insn (gen_pack (d->target, dop0, dop1)); ++ ++ return true; ++} ++ ++/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even ++ and extract-odd permutations of two V64QI operands ++ with two "shifts", two "truncs" and one "concat" insns for "odd" ++ and two "truncs" and one concat insn for "even." ++ Have already failed all two instruction sequences. 
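[Editorial sketch] Both the and/shift-plus-pack strategy above and the shift/truncate variant that follows reduce to the same word-level picture: mask (even) or shift (odd) each 16-bit lane, then narrow back to bytes. A scalar model for the 128-bit two-operand case; the saturation in packuswb never triggers because every word is already in 0..255. Names are ours:

#include <stdint.h>

/* Model of extract-even / extract-odd over two 16-byte vectors using the
   patch's and/shift + pack idea, viewing each input as eight 16-bit words.
   ODD selects the parity: 0 = even bytes, 1 = odd bytes.  */
static void
even_odd_via_pack (const uint8_t *op0, const uint8_t *op1, int odd,
                   uint8_t *out /* 16 bytes */)
{
  uint16_t w0[8], w1[8];

  for (int i = 0; i < 8; i++)
    {
      /* Little-endian load of each word, as the hardware sees it.  */
      uint16_t a = (uint16_t) (op0[2 * i] | (op0[2 * i + 1] << 8));
      uint16_t b = (uint16_t) (op1[2 * i] | (op1[2 * i + 1] << 8));
      w0[i] = odd ? (uint16_t) (a >> 8) : (uint16_t) (a & 0xff);
      w1[i] = odd ? (uint16_t) (b >> 8) : (uint16_t) (b & 0xff);
    }

  /* packuswb: low eight results come from the first operand, high eight
     from the second; no saturation occurs since every word is <= 255.  */
  for (int i = 0; i < 8; i++)
    {
      out[i]     = (uint8_t) w0[i];
      out[i + 8] = (uint8_t) w1[i];
    }
}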
*/ ++ ++static bool ++expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d) ++{ ++ rtx t1, t2, t3, t4; ++ unsigned i, odd, nelt = d->nelt; ++ ++ if (!TARGET_AVX512BW ++ || d->one_operand_p ++ || d->vmode != V64QImode) ++ return false; ++ ++ /* Check that permutation is even or odd. */ ++ odd = d->perm[0]; ++ if (odd > 1) ++ return false; ++ ++ for (i = 1; i < nelt; ++i) ++ if (d->perm[i] != 2 * i + odd) ++ return false; ++ ++ if (d->testing_p) ++ return true; ++ ++ ++ if (odd) ++ { ++ t1 = gen_reg_rtx (V32HImode); ++ t2 = gen_reg_rtx (V32HImode); ++ emit_insn (gen_lshrv32hi3 (t1, ++ gen_lowpart (V32HImode, d->op0), ++ GEN_INT (8))); ++ emit_insn (gen_lshrv32hi3 (t2, ++ gen_lowpart (V32HImode, d->op1), ++ GEN_INT (8))); ++ } ++ else ++ { ++ t1 = gen_lowpart (V32HImode, d->op0); ++ t2 = gen_lowpart (V32HImode, d->op1); ++ } ++ ++ t3 = gen_reg_rtx (V32QImode); ++ t4 = gen_reg_rtx (V32QImode); ++ emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1)); ++ emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2)); ++ emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4)); ++ ++ return true; ++} ++ ++/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even ++ and extract-odd permutations. */ ++ ++static bool ++expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd) ++{ ++ rtx t1, t2, t3, t4, t5; ++ ++ switch (d->vmode) ++ { ++ case E_V4DFmode: ++ if (d->testing_p) ++ break; ++ t1 = gen_reg_rtx (V4DFmode); ++ t2 = gen_reg_rtx (V4DFmode); ++ ++ /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */ ++ emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20))); ++ emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31))); ++ ++ /* Now an unpck[lh]pd will produce the result required. */ ++ if (odd) ++ t3 = gen_avx_unpckhpd256 (d->target, t1, t2); ++ else ++ t3 = gen_avx_unpcklpd256 (d->target, t1, t2); ++ emit_insn (t3); ++ break; ++ ++ case E_V8SFmode: ++ { ++ int mask = odd ? 0xdd : 0x88; ++ ++ if (d->testing_p) ++ break; ++ t1 = gen_reg_rtx (V8SFmode); ++ t2 = gen_reg_rtx (V8SFmode); ++ t3 = gen_reg_rtx (V8SFmode); ++ ++ /* Shuffle within the 128-bit lanes to produce: ++ { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */ ++ emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1, ++ GEN_INT (mask))); ++ ++ /* Shuffle the lanes around to produce: ++ { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */ ++ emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1, ++ GEN_INT (0x3))); ++ ++ /* Shuffle within the 128-bit lanes to produce: ++ { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */ ++ emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44))); ++ ++ /* Shuffle within the 128-bit lanes to produce: ++ { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */ ++ emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee))); ++ ++ /* Shuffle the lanes around to produce: ++ { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */ ++ emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2, ++ GEN_INT (0x20))); ++ } ++ break; ++ ++ case E_V2DFmode: ++ case E_V4SFmode: ++ case E_V2DImode: ++ case E_V4SImode: ++ /* These are always directly implementable by expand_vec_perm_1. */ ++ gcc_unreachable (); ++ ++ case E_V8HImode: ++ if (TARGET_SSE4_1) ++ return expand_vec_perm_even_odd_pack (d); ++ else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB) ++ return expand_vec_perm_pshufb2 (d); ++ else ++ { ++ if (d->testing_p) ++ break; ++ /* We need 2*log2(N)-1 operations to achieve odd/even ++ with interleave. 
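[Editorial sketch] The 2*log2(N)-1 bound quoted above can be verified with a scalar simulation of the interleave ladder for N = 8: four interleaves plus one final low or high interleave separate the even or odd elements. A small check program with our own helper names:

#include <stdio.h>

/* Scalar models of punpcklwd / punpckhwd on 8-element vectors.  */
static void
il (const int *a, const int *b, int *r)
{
  for (int i = 0; i < 4; i++)
    {
      r[2 * i] = a[i];
      r[2 * i + 1] = b[i];
    }
}

static void
ih (const int *a, const int *b, int *r)
{
  for (int i = 0; i < 4; i++)
    {
      r[2 * i] = a[i + 4];
      r[2 * i + 1] = b[i + 4];
    }
}

int
main (void)
{
  int op0[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
  int op1[8] = { 8, 9, 10, 11, 12, 13, 14, 15 };
  int t1[8], t2[8], d1[8], d2[8], even[8], odd[8];

  ih (op0, op1, t1);   /* step 1 */
  il (op0, op1, d1);   /* step 2 */
  ih (d1, t1, t2);     /* step 3 */
  il (d1, t1, d2);     /* step 4 */
  il (d2, t2, even);   /* step 5 for the even extraction ...      */
  ih (d2, t2, odd);    /* ... or this one for the odd extraction  */

  for (int i = 0; i < 8; i++)
    printf ("%d %d\n", even[i], odd[i]);   /* prints 0..14 and 1..15 */
  return 0;
}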
*/ ++ t1 = gen_reg_rtx (V8HImode); ++ t2 = gen_reg_rtx (V8HImode); ++ emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1)); ++ emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1)); ++ emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1)); ++ emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1)); ++ if (odd) ++ t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2); ++ else ++ t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2); ++ emit_insn (t3); ++ } ++ break; ++ ++ case E_V16QImode: ++ return expand_vec_perm_even_odd_pack (d); ++ ++ case E_V16HImode: ++ case E_V32QImode: ++ return expand_vec_perm_even_odd_pack (d); ++ ++ case E_V64QImode: ++ return expand_vec_perm_even_odd_trunc (d); ++ ++ case E_V4DImode: ++ if (!TARGET_AVX2) ++ { ++ struct expand_vec_perm_d d_copy = *d; ++ d_copy.vmode = V4DFmode; ++ if (d->testing_p) ++ d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1); ++ else ++ d_copy.target = gen_reg_rtx (V4DFmode); ++ d_copy.op0 = gen_lowpart (V4DFmode, d->op0); ++ d_copy.op1 = gen_lowpart (V4DFmode, d->op1); ++ if (expand_vec_perm_even_odd_1 (&d_copy, odd)) ++ { ++ if (!d->testing_p) ++ emit_move_insn (d->target, ++ gen_lowpart (V4DImode, d_copy.target)); ++ return true; ++ } ++ return false; ++ } ++ ++ if (d->testing_p) ++ break; ++ ++ t1 = gen_reg_rtx (V4DImode); ++ t2 = gen_reg_rtx (V4DImode); ++ ++ /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */ ++ emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20))); ++ emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31))); ++ ++ /* Now an vpunpck[lh]qdq will produce the result required. */ ++ if (odd) ++ t3 = gen_avx2_interleave_highv4di (d->target, t1, t2); ++ else ++ t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2); ++ emit_insn (t3); ++ break; ++ ++ case E_V8SImode: ++ if (!TARGET_AVX2) ++ { ++ struct expand_vec_perm_d d_copy = *d; ++ d_copy.vmode = V8SFmode; ++ if (d->testing_p) ++ d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1); ++ else ++ d_copy.target = gen_reg_rtx (V8SFmode); ++ d_copy.op0 = gen_lowpart (V8SFmode, d->op0); ++ d_copy.op1 = gen_lowpart (V8SFmode, d->op1); ++ if (expand_vec_perm_even_odd_1 (&d_copy, odd)) ++ { ++ if (!d->testing_p) ++ emit_move_insn (d->target, ++ gen_lowpart (V8SImode, d_copy.target)); ++ return true; ++ } ++ return false; ++ } ++ ++ if (d->testing_p) ++ break; ++ ++ t1 = gen_reg_rtx (V8SImode); ++ t2 = gen_reg_rtx (V8SImode); ++ t3 = gen_reg_rtx (V4DImode); ++ t4 = gen_reg_rtx (V4DImode); ++ t5 = gen_reg_rtx (V4DImode); ++ ++ /* Shuffle the lanes around into ++ { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */ ++ emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0), ++ gen_lowpart (V4DImode, d->op1), ++ GEN_INT (0x20))); ++ emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0), ++ gen_lowpart (V4DImode, d->op1), ++ GEN_INT (0x31))); ++ ++ /* Swap the 2nd and 3rd position in each lane into ++ { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */ ++ emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3), ++ GEN_INT (2 * 4 + 1 * 16 + 3 * 64))); ++ emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4), ++ GEN_INT (2 * 4 + 1 * 16 + 3 * 64))); ++ ++ /* Now an vpunpck[lh]qdq will produce ++ { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. 
*/ ++ if (odd) ++ t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1), ++ gen_lowpart (V4DImode, t2)); ++ else ++ t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1), ++ gen_lowpart (V4DImode, t2)); ++ emit_insn (t3); ++ emit_move_insn (d->target, gen_lowpart (V8SImode, t5)); ++ break; ++ ++ default: ++ gcc_unreachable (); ++ } ++ ++ return true; ++} ++ ++/* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match ++ extract-even and extract-odd permutations. */ ++ ++static bool ++expand_vec_perm_even_odd (struct expand_vec_perm_d *d) ++{ ++ unsigned i, odd, nelt = d->nelt; ++ ++ odd = d->perm[0]; ++ if (odd != 0 && odd != 1) ++ return false; ++ ++ for (i = 1; i < nelt; ++i) ++ if (d->perm[i] != 2 * i + odd) ++ return false; ++ ++ return expand_vec_perm_even_odd_1 (d, odd); ++} ++ ++/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast ++ permutations. We assume that expand_vec_perm_1 has already failed. */ ++ ++static bool ++expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d) ++{ ++ unsigned elt = d->perm[0], nelt2 = d->nelt / 2; ++ machine_mode vmode = d->vmode; ++ unsigned char perm2[4]; ++ rtx op0 = d->op0, dest; ++ bool ok; ++ ++ switch (vmode) ++ { ++ case E_V4DFmode: ++ case E_V8SFmode: ++ /* These are special-cased in sse.md so that we can optionally ++ use the vbroadcast instruction. They expand to two insns ++ if the input happens to be in a register. */ ++ gcc_unreachable (); ++ ++ case E_V2DFmode: ++ case E_V2DImode: ++ case E_V4SFmode: ++ case E_V4SImode: ++ /* These are always implementable using standard shuffle patterns. */ ++ gcc_unreachable (); ++ ++ case E_V8HImode: ++ case E_V16QImode: ++ /* These can be implemented via interleave. We save one insn by ++ stopping once we have promoted to V4SImode and then use pshufd. */ ++ if (d->testing_p) ++ return true; ++ do ++ { ++ rtx dest; ++ rtx (*gen) (rtx, rtx, rtx) ++ = vmode == V16QImode ? gen_vec_interleave_lowv16qi ++ : gen_vec_interleave_lowv8hi; ++ ++ if (elt >= nelt2) ++ { ++ gen = vmode == V16QImode ? gen_vec_interleave_highv16qi ++ : gen_vec_interleave_highv8hi; ++ elt -= nelt2; ++ } ++ nelt2 /= 2; ++ ++ dest = gen_reg_rtx (vmode); ++ emit_insn (gen (dest, op0, op0)); ++ vmode = get_mode_wider_vector (vmode); ++ op0 = gen_lowpart (vmode, dest); ++ } ++ while (vmode != V4SImode); ++ ++ memset (perm2, elt, 4); ++ dest = gen_reg_rtx (V4SImode); ++ ok = expand_vselect (dest, op0, perm2, 4, d->testing_p); ++ gcc_assert (ok); ++ if (!d->testing_p) ++ emit_move_insn (d->target, gen_lowpart (d->vmode, dest)); ++ return true; ++ ++ case E_V64QImode: ++ case E_V32QImode: ++ case E_V16HImode: ++ case E_V8SImode: ++ case E_V4DImode: ++ /* For AVX2 broadcasts of the first element vpbroadcast* or ++ vpermq should be used by expand_vec_perm_1. */ ++ gcc_assert (!TARGET_AVX2 || d->perm[0]); ++ return false; ++ ++ default: ++ gcc_unreachable (); ++ } ++} ++ ++/* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match ++ broadcast permutations. */ ++ ++static bool ++expand_vec_perm_broadcast (struct expand_vec_perm_d *d) ++{ ++ unsigned i, elt, nelt = d->nelt; ++ ++ if (!d->one_operand_p) ++ return false; ++ ++ elt = d->perm[0]; ++ for (i = 1; i < nelt; ++i) ++ if (d->perm[i] != elt) ++ return false; ++ ++ return expand_vec_perm_broadcast_1 (d); ++} ++ ++/* Implement arbitrary permutations of two V64QImode operands ++ with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. 
*/ ++static bool ++expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d) ++{ ++ if (!TARGET_AVX512BW || !(d->vmode == V64QImode)) ++ return false; ++ ++ if (d->testing_p) ++ return true; ++ ++ struct expand_vec_perm_d ds[2]; ++ rtx rperm[128], vperm, target0, target1; ++ unsigned int i, nelt; ++ machine_mode vmode; ++ ++ nelt = d->nelt; ++ vmode = V64QImode; ++ ++ for (i = 0; i < 2; i++) ++ { ++ ds[i] = *d; ++ ds[i].vmode = V32HImode; ++ ds[i].nelt = 32; ++ ds[i].target = gen_reg_rtx (V32HImode); ++ ds[i].op0 = gen_lowpart (V32HImode, d->op0); ++ ds[i].op1 = gen_lowpart (V32HImode, d->op1); ++ } ++ ++ /* Prepare permutations such that the first one takes care of ++ putting the even bytes into the right positions or one higher ++ positions (ds[0]) and the second one takes care of ++ putting the odd bytes into the right positions or one below ++ (ds[1]). */ ++ ++ for (i = 0; i < nelt; i++) ++ { ++ ds[i & 1].perm[i / 2] = d->perm[i] / 2; ++ if (i & 1) ++ { ++ rperm[i] = constm1_rtx; ++ rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1)); ++ } ++ else ++ { ++ rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1)); ++ rperm[i + 64] = constm1_rtx; ++ } ++ } ++ ++ bool ok = expand_vec_perm_1 (&ds[0]); ++ gcc_assert (ok); ++ ds[0].target = gen_lowpart (V64QImode, ds[0].target); ++ ++ ok = expand_vec_perm_1 (&ds[1]); ++ gcc_assert (ok); ++ ds[1].target = gen_lowpart (V64QImode, ds[1].target); ++ ++ vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm)); ++ vperm = force_reg (vmode, vperm); ++ target0 = gen_reg_rtx (V64QImode); ++ emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm)); ++ ++ vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64)); ++ vperm = force_reg (vmode, vperm); ++ target1 = gen_reg_rtx (V64QImode); ++ emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm)); ++ ++ emit_insn (gen_iorv64qi3 (d->target, target0, target1)); ++ return true; ++} ++ ++/* Implement arbitrary permutation of two V32QImode and V16QImode operands ++ with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed ++ all the shorter instruction sequences. */ ++ ++static bool ++expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d) ++{ ++ rtx rperm[4][32], vperm, l[2], h[2], op, m128; ++ unsigned int i, nelt, eltsz; ++ bool used[4]; ++ ++ if (!TARGET_AVX2 ++ || d->one_operand_p ++ || (d->vmode != V32QImode && d->vmode != V16HImode)) ++ return false; ++ ++ if (d->testing_p) ++ return true; ++ ++ nelt = d->nelt; ++ eltsz = GET_MODE_UNIT_SIZE (d->vmode); ++ ++ /* Generate 4 permutation masks. If the required element is within ++ the same lane, it is shuffled in. If the required element from the ++ other lane, force a zero by setting bit 7 in the permutation mask. ++ In the other mask the mask has non-negative elements if element ++ is requested from the other lane, but also moved to the other lane, ++ so that the result of vpshufb can have the two V2TImode halves ++ swapped. */ ++ m128 = GEN_INT (-128); ++ for (i = 0; i < 32; ++i) ++ { ++ rperm[0][i] = m128; ++ rperm[1][i] = m128; ++ rperm[2][i] = m128; ++ rperm[3][i] = m128; ++ } ++ used[0] = false; ++ used[1] = false; ++ used[2] = false; ++ used[3] = false; ++ for (i = 0; i < nelt; ++i) ++ { ++ unsigned j, e = d->perm[i] & (nelt / 2 - 1); ++ unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz; ++ unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 
1 : 0); ++ ++ for (j = 0; j < eltsz; ++j) ++ rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j); ++ used[which] = true; ++ } ++ ++ for (i = 0; i < 2; ++i) ++ { ++ if (!used[2 * i + 1]) ++ { ++ h[i] = NULL_RTX; ++ continue; ++ } ++ vperm = gen_rtx_CONST_VECTOR (V32QImode, ++ gen_rtvec_v (32, rperm[2 * i + 1])); ++ vperm = force_reg (V32QImode, vperm); ++ h[i] = gen_reg_rtx (V32QImode); ++ op = gen_lowpart (V32QImode, i ? d->op1 : d->op0); ++ emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm)); ++ } ++ ++ /* Swap the 128-byte lanes of h[X]. */ ++ for (i = 0; i < 2; ++i) ++ { ++ if (h[i] == NULL_RTX) ++ continue; ++ op = gen_reg_rtx (V4DImode); ++ emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]), ++ const2_rtx, GEN_INT (3), const0_rtx, ++ const1_rtx)); ++ h[i] = gen_lowpart (V32QImode, op); ++ } ++ ++ for (i = 0; i < 2; ++i) ++ { ++ if (!used[2 * i]) ++ { ++ l[i] = NULL_RTX; ++ continue; ++ } ++ vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i])); ++ vperm = force_reg (V32QImode, vperm); ++ l[i] = gen_reg_rtx (V32QImode); ++ op = gen_lowpart (V32QImode, i ? d->op1 : d->op0); ++ emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm)); ++ } ++ ++ for (i = 0; i < 2; ++i) ++ { ++ if (h[i] && l[i]) ++ { ++ op = gen_reg_rtx (V32QImode); ++ emit_insn (gen_iorv32qi3 (op, l[i], h[i])); ++ l[i] = op; ++ } ++ else if (h[i]) ++ l[i] = h[i]; ++ } ++ ++ gcc_assert (l[0] && l[1]); ++ op = d->target; ++ if (d->vmode != V32QImode) ++ op = gen_reg_rtx (V32QImode); ++ emit_insn (gen_iorv32qi3 (op, l[0], l[1])); ++ if (op != d->target) ++ emit_move_insn (d->target, gen_lowpart (d->vmode, op)); ++ return true; ++} ++ ++/* The guts of ix86_vectorize_vec_perm_const. With all of the interface bits ++ taken care of, perform the expansion in D and return true on success. */ ++ ++static bool ++ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) ++{ ++ /* Try a single instruction expansion. */ ++ if (expand_vec_perm_1 (d)) ++ return true; ++ ++ /* Try sequences of two instructions. */ ++ ++ if (expand_vec_perm_pshuflw_pshufhw (d)) ++ return true; ++ ++ if (expand_vec_perm_palignr (d, false)) ++ return true; ++ ++ if (expand_vec_perm_interleave2 (d)) ++ return true; ++ ++ if (expand_vec_perm_broadcast (d)) ++ return true; ++ ++ if (expand_vec_perm_vpermq_perm_1 (d)) ++ return true; ++ ++ if (expand_vec_perm_vperm2f128 (d)) ++ return true; ++ ++ if (expand_vec_perm_pblendv (d)) ++ return true; ++ ++ /* Try sequences of three instructions. */ ++ ++ if (expand_vec_perm_even_odd_pack (d)) ++ return true; ++ ++ if (expand_vec_perm_2vperm2f128_vshuf (d)) ++ return true; ++ ++ if (expand_vec_perm_pshufb2 (d)) ++ return true; ++ ++ if (expand_vec_perm_interleave3 (d)) ++ return true; ++ ++ if (expand_vec_perm_vperm2f128_vblend (d)) ++ return true; ++ ++ /* Try sequences of four instructions. */ ++ ++ if (expand_vec_perm_even_odd_trunc (d)) ++ return true; ++ if (expand_vec_perm_vpshufb2_vpermq (d)) ++ return true; ++ ++ if (expand_vec_perm_vpshufb2_vpermq_even_odd (d)) ++ return true; ++ ++ if (expand_vec_perm_vpermt2_vpshub2 (d)) ++ return true; ++ ++ /* ??? Look for narrow permutations whose element orderings would ++ allow the promotion to a wider mode. */ ++ ++ /* ??? Look for sequences of interleave or a wider permute that place ++ the data into the correct lanes for a half-vector shuffle like ++ pshuf[lh]w or vpermilps. */ ++ ++ /* ??? Look for sequences of interleave that produce the desired results. ++ The combinatorics of punpck[lh] get pretty ugly... 
*/ ++ ++ if (expand_vec_perm_even_odd (d)) ++ return true; ++ ++ /* Even longer sequences. */ ++ if (expand_vec_perm_vpshufb4_vpermq2 (d)) ++ return true; ++ ++ /* See if we can get the same permutation in different vector integer ++ mode. */ ++ struct expand_vec_perm_d nd; ++ if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd)) ++ { ++ if (!d->testing_p) ++ emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target)); ++ return true; ++ } ++ ++ return false; ++} ++ ++/* If a permutation only uses one operand, make it clear. Returns true ++ if the permutation references both operands. */ ++ ++static bool ++canonicalize_perm (struct expand_vec_perm_d *d) ++{ ++ int i, which, nelt = d->nelt; ++ ++ for (i = which = 0; i < nelt; ++i) ++ which |= (d->perm[i] < nelt ? 1 : 2); ++ ++ d->one_operand_p = true; ++ switch (which) ++ { ++ default: ++ gcc_unreachable(); ++ ++ case 3: ++ if (!rtx_equal_p (d->op0, d->op1)) ++ { ++ d->one_operand_p = false; ++ break; ++ } ++ /* The elements of PERM do not suggest that only the first operand ++ is used, but both operands are identical. Allow easier matching ++ of the permutation by folding the permutation into the single ++ input vector. */ ++ /* FALLTHRU */ ++ ++ case 2: ++ for (i = 0; i < nelt; ++i) ++ d->perm[i] &= nelt - 1; ++ d->op0 = d->op1; ++ break; ++ ++ case 1: ++ d->op1 = d->op0; ++ break; ++ } ++ ++ return (which == 3); ++} ++ ++/* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */ ++ ++bool ++ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, ++ rtx op1, const vec_perm_indices &sel) ++{ ++ struct expand_vec_perm_d d; ++ unsigned char perm[MAX_VECT_LEN]; ++ unsigned int i, nelt, which; ++ bool two_args; ++ ++ d.target = target; ++ d.op0 = op0; ++ d.op1 = op1; ++ ++ d.vmode = vmode; ++ gcc_assert (VECTOR_MODE_P (d.vmode)); ++ d.nelt = nelt = GET_MODE_NUNITS (d.vmode); ++ d.testing_p = !target; ++ ++ gcc_assert (sel.length () == nelt); ++ gcc_checking_assert (sizeof (d.perm) == sizeof (perm)); ++ ++ /* Given sufficient ISA support we can just return true here ++ for selected vector modes. */ ++ switch (d.vmode) ++ { ++ case E_V16SFmode: ++ case E_V16SImode: ++ case E_V8DImode: ++ case E_V8DFmode: ++ if (!TARGET_AVX512F) ++ return false; ++ /* All implementable with a single vperm[it]2 insn. */ ++ if (d.testing_p) ++ return true; ++ break; ++ case E_V32HImode: ++ if (!TARGET_AVX512BW) ++ return false; ++ if (d.testing_p) ++ /* All implementable with a single vperm[it]2 insn. */ ++ return true; ++ break; ++ case E_V64QImode: ++ if (!TARGET_AVX512BW) ++ return false; ++ if (d.testing_p) ++ /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */ ++ return true; ++ break; ++ case E_V8SImode: ++ case E_V8SFmode: ++ case E_V4DFmode: ++ case E_V4DImode: ++ if (!TARGET_AVX) ++ return false; ++ if (d.testing_p && TARGET_AVX512VL) ++ /* All implementable with a single vperm[it]2 insn. */ ++ return true; ++ break; ++ case E_V16HImode: ++ if (!TARGET_SSE2) ++ return false; ++ if (d.testing_p && TARGET_AVX2) ++ /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */ ++ return true; ++ break; ++ case E_V32QImode: ++ if (!TARGET_SSE2) ++ return false; ++ if (d.testing_p && TARGET_AVX2) ++ /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */ ++ return true; ++ break; ++ case E_V8HImode: ++ case E_V16QImode: ++ if (!TARGET_SSE2) ++ return false; ++ /* Fall through. */ ++ case E_V4SImode: ++ case E_V4SFmode: ++ if (!TARGET_SSE) ++ return false; ++ /* All implementable with a single vpperm insn. 
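[Editorial sketch] The operand folding done by canonicalize_perm above (and repeated by the driver below) is a two-bit usage mask: bit 0 if any index reads the first input, bit 1 if any index reads the second, with indices rebased whenever only the second input is used. A minimal scalar sketch; the helper name is ours:

/* Model of the operand-usage analysis in canonicalize_perm.  Folds the
   NELT-element permutation onto a single operand when possible and
   returns nonzero only when both operands are genuinely needed.  */
static int
fold_single_operand (unsigned char *perm, unsigned nelt)
{
  int which = 0;

  for (unsigned i = 0; i < nelt; i++)
    which |= (perm[i] < nelt) ? 1 : 2;

  if (which == 2)
    for (unsigned i = 0; i < nelt; i++)
      perm[i] &= nelt - 1;       /* rebase onto the only operand used */

  return which == 3;             /* true when both operands are read  */
}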
*/ ++ if (d.testing_p && TARGET_XOP) ++ return true; ++ /* All implementable with 2 pshufb + 1 ior. */ ++ if (d.testing_p && TARGET_SSSE3) ++ return true; ++ break; ++ case E_V2DImode: ++ case E_V2DFmode: ++ if (!TARGET_SSE) ++ return false; ++ /* All implementable with shufpd or unpck[lh]pd. */ ++ if (d.testing_p) ++ return true; ++ break; ++ default: ++ return false; ++ } ++ ++ for (i = which = 0; i < nelt; ++i) ++ { ++ unsigned char e = sel[i]; ++ gcc_assert (e < 2 * nelt); ++ d.perm[i] = e; ++ perm[i] = e; ++ which |= (e < nelt ? 1 : 2); ++ } ++ ++ if (d.testing_p) ++ { ++ /* For all elements from second vector, fold the elements to first. */ ++ if (which == 2) ++ for (i = 0; i < nelt; ++i) ++ d.perm[i] -= nelt; ++ ++ /* Check whether the mask can be applied to the vector type. */ ++ d.one_operand_p = (which != 3); ++ ++ /* Implementable with shufps or pshufd. */ ++ if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode)) ++ return true; ++ ++ /* Otherwise we have to go through the motions and see if we can ++ figure out how to generate the requested permutation. */ ++ d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1); ++ d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2); ++ if (!d.one_operand_p) ++ d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3); ++ ++ start_sequence (); ++ bool ret = ix86_expand_vec_perm_const_1 (&d); ++ end_sequence (); ++ ++ return ret; ++ } ++ ++ two_args = canonicalize_perm (&d); ++ ++ if (ix86_expand_vec_perm_const_1 (&d)) ++ return true; ++ ++ /* If the selector says both arguments are needed, but the operands are the ++ same, the above tried to expand with one_operand_p and flattened selector. ++ If that didn't work, retry without one_operand_p; we succeeded with that ++ during testing. */ ++ if (two_args && d.one_operand_p) ++ { ++ d.one_operand_p = false; ++ memcpy (d.perm, perm, sizeof (perm)); ++ return ix86_expand_vec_perm_const_1 (&d); ++ } ++ ++ return false; ++} ++ ++void ++ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd) ++{ ++ struct expand_vec_perm_d d; ++ unsigned i, nelt; ++ ++ d.target = targ; ++ d.op0 = op0; ++ d.op1 = op1; ++ d.vmode = GET_MODE (targ); ++ d.nelt = nelt = GET_MODE_NUNITS (d.vmode); ++ d.one_operand_p = false; ++ d.testing_p = false; ++ ++ for (i = 0; i < nelt; ++i) ++ d.perm[i] = i * 2 + odd; ++ ++ /* We'll either be able to implement the permutation directly... */ ++ if (expand_vec_perm_1 (&d)) ++ return; ++ ++ /* ... or we use the special-case patterns. */ ++ expand_vec_perm_even_odd_1 (&d, odd); ++} ++ ++static void ++ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p) ++{ ++ struct expand_vec_perm_d d; ++ unsigned i, nelt, base; ++ bool ok; ++ ++ d.target = targ; ++ d.op0 = op0; ++ d.op1 = op1; ++ d.vmode = GET_MODE (targ); ++ d.nelt = nelt = GET_MODE_NUNITS (d.vmode); ++ d.one_operand_p = false; ++ d.testing_p = false; ++ ++ base = high_p ? nelt / 2 : 0; ++ for (i = 0; i < nelt / 2; ++i) ++ { ++ d.perm[i * 2] = i + base; ++ d.perm[i * 2 + 1] = i + base + nelt; ++ } ++ ++ /* Note that for AVX this isn't one instruction. */ ++ ok = ix86_expand_vec_perm_const_1 (&d); ++ gcc_assert (ok); ++} ++ ++ ++/* Expand a vector operation CODE for a V*QImode in terms of the ++ same operation on V*HImode. 
*/ ++ ++void ++ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2) ++{ ++ machine_mode qimode = GET_MODE (dest); ++ machine_mode himode; ++ rtx (*gen_il) (rtx, rtx, rtx); ++ rtx (*gen_ih) (rtx, rtx, rtx); ++ rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h; ++ struct expand_vec_perm_d d; ++ bool ok, full_interleave; ++ bool uns_p = false; ++ int i; ++ ++ switch (qimode) ++ { ++ case E_V16QImode: ++ himode = V8HImode; ++ gen_il = gen_vec_interleave_lowv16qi; ++ gen_ih = gen_vec_interleave_highv16qi; ++ break; ++ case E_V32QImode: ++ himode = V16HImode; ++ gen_il = gen_avx2_interleave_lowv32qi; ++ gen_ih = gen_avx2_interleave_highv32qi; ++ break; ++ case E_V64QImode: ++ himode = V32HImode; ++ gen_il = gen_avx512bw_interleave_lowv64qi; ++ gen_ih = gen_avx512bw_interleave_highv64qi; ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ ++ op2_l = op2_h = op2; ++ switch (code) ++ { ++ case MULT: ++ /* Unpack data such that we've got a source byte in each low byte of ++ each word. We don't care what goes into the high byte of each word. ++ Rather than trying to get zero in there, most convenient is to let ++ it be a copy of the low byte. */ ++ op2_l = gen_reg_rtx (qimode); ++ op2_h = gen_reg_rtx (qimode); ++ emit_insn (gen_il (op2_l, op2, op2)); ++ emit_insn (gen_ih (op2_h, op2, op2)); ++ ++ op1_l = gen_reg_rtx (qimode); ++ op1_h = gen_reg_rtx (qimode); ++ emit_insn (gen_il (op1_l, op1, op1)); ++ emit_insn (gen_ih (op1_h, op1, op1)); ++ full_interleave = qimode == V16QImode; ++ break; ++ ++ case ASHIFT: ++ case LSHIFTRT: ++ uns_p = true; ++ /* FALLTHRU */ ++ case ASHIFTRT: ++ op1_l = gen_reg_rtx (himode); ++ op1_h = gen_reg_rtx (himode); ++ ix86_expand_sse_unpack (op1_l, op1, uns_p, false); ++ ix86_expand_sse_unpack (op1_h, op1, uns_p, true); ++ full_interleave = true; ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ ++ /* Perform the operation. */ ++ res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX, ++ 1, OPTAB_DIRECT); ++ res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX, ++ 1, OPTAB_DIRECT); ++ gcc_assert (res_l && res_h); ++ ++ /* Merge the data back into the right place. */ ++ d.target = dest; ++ d.op0 = gen_lowpart (qimode, res_l); ++ d.op1 = gen_lowpart (qimode, res_h); ++ d.vmode = qimode; ++ d.nelt = GET_MODE_NUNITS (qimode); ++ d.one_operand_p = false; ++ d.testing_p = false; ++ ++ if (full_interleave) ++ { ++ /* For SSE2, we used an full interleave, so the desired ++ results are in the even elements. */ ++ for (i = 0; i < d.nelt; ++i) ++ d.perm[i] = i * 2; ++ } ++ else ++ { ++ /* For AVX, the interleave used above was not cross-lane. So the ++ extraction is evens but with the second and third quarter swapped. ++ Happily, that is even one insn shorter than even extraction. ++ For AVX512BW we have 4 lanes. We extract evens from within a lane, ++ always first from the first and then from the second source operand, ++ the index bits above the low 4 bits remains the same. ++ Thus, for d.nelt == 32 we want permutation ++ 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62 ++ and for d.nelt == 64 we want permutation ++ 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94, ++ 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */ ++ for (i = 0; i < d.nelt; ++i) ++ d.perm[i] = ((i * 2) & 14) + ((i & 8) ? 
d.nelt : 0) + (i & ~15); ++ } ++ ++ ok = ix86_expand_vec_perm_const_1 (&d); ++ gcc_assert (ok); ++ ++ set_unique_reg_note (get_last_insn (), REG_EQUAL, ++ gen_rtx_fmt_ee (code, qimode, op1, op2)); ++} ++ ++/* Helper function of ix86_expand_mul_widen_evenodd. Return true ++ if op is CONST_VECTOR with all odd elements equal to their ++ preceding element. */ ++ ++static bool ++const_vector_equal_evenodd_p (rtx op) ++{ ++ machine_mode mode = GET_MODE (op); ++ int i, nunits = GET_MODE_NUNITS (mode); ++ if (GET_CODE (op) != CONST_VECTOR ++ || nunits != CONST_VECTOR_NUNITS (op)) ++ return false; ++ for (i = 0; i < nunits; i += 2) ++ if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1)) ++ return false; ++ return true; ++} ++ ++void ++ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2, ++ bool uns_p, bool odd_p) ++{ ++ machine_mode mode = GET_MODE (op1); ++ machine_mode wmode = GET_MODE (dest); ++ rtx x; ++ rtx orig_op1 = op1, orig_op2 = op2; ++ ++ if (!nonimmediate_operand (op1, mode)) ++ op1 = force_reg (mode, op1); ++ if (!nonimmediate_operand (op2, mode)) ++ op2 = force_reg (mode, op2); ++ ++ /* We only play even/odd games with vectors of SImode. */ ++ gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode); ++ ++ /* If we're looking for the odd results, shift those members down to ++ the even slots. For some cpus this is faster than a PSHUFD. */ ++ if (odd_p) ++ { ++ /* For XOP use vpmacsdqh, but only for smult, as it is only ++ signed. */ ++ if (TARGET_XOP && mode == V4SImode && !uns_p) ++ { ++ x = force_reg (wmode, CONST0_RTX (wmode)); ++ emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x)); ++ return; ++ } ++ ++ x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode)); ++ if (!const_vector_equal_evenodd_p (orig_op1)) ++ op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1), ++ x, NULL, 1, OPTAB_DIRECT); ++ if (!const_vector_equal_evenodd_p (orig_op2)) ++ op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2), ++ x, NULL, 1, OPTAB_DIRECT); ++ op1 = gen_lowpart (mode, op1); ++ op2 = gen_lowpart (mode, op2); ++ } ++ ++ if (mode == V16SImode) ++ { ++ if (uns_p) ++ x = gen_vec_widen_umult_even_v16si (dest, op1, op2); ++ else ++ x = gen_vec_widen_smult_even_v16si (dest, op1, op2); ++ } ++ else if (mode == V8SImode) ++ { ++ if (uns_p) ++ x = gen_vec_widen_umult_even_v8si (dest, op1, op2); ++ else ++ x = gen_vec_widen_smult_even_v8si (dest, op1, op2); ++ } ++ else if (uns_p) ++ x = gen_vec_widen_umult_even_v4si (dest, op1, op2); ++ else if (TARGET_SSE4_1) ++ x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2); ++ else ++ { ++ rtx s1, s2, t0, t1, t2; ++ ++ /* The easiest way to implement this without PMULDQ is to go through ++ the motions as if we are performing a full 64-bit multiply. With ++ the exception that we need to do less shuffling of the elements. */ ++ ++ /* Compute the sign-extension, aka highparts, of the two operands. */ ++ s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode), ++ op1, pc_rtx, pc_rtx); ++ s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode), ++ op2, pc_rtx, pc_rtx); ++ ++ /* Multiply LO(A) * HI(B), and vice-versa. */ ++ t1 = gen_reg_rtx (wmode); ++ t2 = gen_reg_rtx (wmode); ++ emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2)); ++ emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1)); ++ ++ /* Multiply LO(A) * LO(B). */ ++ t0 = gen_reg_rtx (wmode); ++ emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2)); ++ ++ /* Combine and shift the highparts into place. 
*/ ++ t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT); ++ t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1, ++ 1, OPTAB_DIRECT); ++ ++ /* Combine high and low parts. */ ++ force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT); ++ return; ++ } ++ emit_insn (x); ++} ++ ++void ++ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2, ++ bool uns_p, bool high_p) ++{ ++ machine_mode wmode = GET_MODE (dest); ++ machine_mode mode = GET_MODE (op1); ++ rtx t1, t2, t3, t4, mask; ++ ++ switch (mode) ++ { ++ case E_V4SImode: ++ t1 = gen_reg_rtx (mode); ++ t2 = gen_reg_rtx (mode); ++ if (TARGET_XOP && !uns_p) ++ { ++ /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case, ++ shuffle the elements once so that all elements are in the right ++ place for immediate use: { A C B D }. */ ++ emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx, ++ const1_rtx, GEN_INT (3))); ++ emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx, ++ const1_rtx, GEN_INT (3))); ++ } ++ else ++ { ++ /* Put the elements into place for the multiply. */ ++ ix86_expand_vec_interleave (t1, op1, op1, high_p); ++ ix86_expand_vec_interleave (t2, op2, op2, high_p); ++ high_p = false; ++ } ++ ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p); ++ break; ++ ++ case E_V8SImode: ++ /* Shuffle the elements between the lanes. After this we ++ have { A B E F | C D G H } for each operand. */ ++ t1 = gen_reg_rtx (V4DImode); ++ t2 = gen_reg_rtx (V4DImode); ++ emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1), ++ const0_rtx, const2_rtx, ++ const1_rtx, GEN_INT (3))); ++ emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2), ++ const0_rtx, const2_rtx, ++ const1_rtx, GEN_INT (3))); ++ ++ /* Shuffle the elements within the lanes. After this we ++ have { A A B B | C C D D } or { E E F F | G G H H }. */ ++ t3 = gen_reg_rtx (V8SImode); ++ t4 = gen_reg_rtx (V8SImode); ++ mask = GEN_INT (high_p ++ ? 2 + (2 << 2) + (3 << 4) + (3 << 6) ++ : 0 + (0 << 2) + (1 << 4) + (1 << 6)); ++ emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask)); ++ emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask)); ++ ++ ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false); ++ break; ++ ++ case E_V8HImode: ++ case E_V16HImode: ++ t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX, ++ uns_p, OPTAB_DIRECT); ++ t2 = expand_binop (mode, ++ uns_p ? 
umul_highpart_optab : smul_highpart_optab, ++ op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT); ++ gcc_assert (t1 && t2); ++ ++ t3 = gen_reg_rtx (mode); ++ ix86_expand_vec_interleave (t3, t1, t2, high_p); ++ emit_move_insn (dest, gen_lowpart (wmode, t3)); ++ break; ++ ++ case E_V16QImode: ++ case E_V32QImode: ++ case E_V32HImode: ++ case E_V16SImode: ++ case E_V64QImode: ++ t1 = gen_reg_rtx (wmode); ++ t2 = gen_reg_rtx (wmode); ++ ix86_expand_sse_unpack (t1, op1, uns_p, high_p); ++ ix86_expand_sse_unpack (t2, op2, uns_p, high_p); ++ ++ emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2))); ++ break; ++ ++ default: ++ gcc_unreachable (); ++ } ++} ++ ++void ++ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2) ++{ ++ rtx res_1, res_2, res_3, res_4; ++ ++ res_1 = gen_reg_rtx (V4SImode); ++ res_2 = gen_reg_rtx (V4SImode); ++ res_3 = gen_reg_rtx (V2DImode); ++ res_4 = gen_reg_rtx (V2DImode); ++ ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false); ++ ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true); ++ ++ /* Move the results in element 2 down to element 1; we don't care ++ what goes in elements 2 and 3. Then we can merge the parts ++ back together with an interleave. ++ ++ Note that two other sequences were tried: ++ (1) Use interleaves at the start instead of psrldq, which allows ++ us to use a single shufps to merge things back at the end. ++ (2) Use shufps here to combine the two vectors, then pshufd to ++ put the elements in the correct order. ++ In both cases the cost of the reformatting stall was too high ++ and the overall sequence slower. */ ++ ++ emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3), ++ const0_rtx, const2_rtx, ++ const0_rtx, const0_rtx)); ++ emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4), ++ const0_rtx, const2_rtx, ++ const0_rtx, const0_rtx)); ++ res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2)); ++ ++ set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2)); ++} ++ ++void ++ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2) ++{ ++ machine_mode mode = GET_MODE (op0); ++ rtx t1, t2, t3, t4, t5, t6; ++ ++ if (TARGET_AVX512DQ && mode == V8DImode) ++ emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2)); ++ else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode) ++ emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2)); ++ else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode) ++ emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2)); ++ else if (TARGET_XOP && mode == V2DImode) ++ { ++ /* op1: A,B,C,D, op2: E,F,G,H */ ++ op1 = gen_lowpart (V4SImode, op1); ++ op2 = gen_lowpart (V4SImode, op2); ++ ++ t1 = gen_reg_rtx (V4SImode); ++ t2 = gen_reg_rtx (V4SImode); ++ t3 = gen_reg_rtx (V2DImode); ++ t4 = gen_reg_rtx (V2DImode); ++ ++ /* t1: B,A,D,C */ ++ emit_insn (gen_sse2_pshufd_1 (t1, op1, ++ GEN_INT (1), ++ GEN_INT (0), ++ GEN_INT (3), ++ GEN_INT (2))); ++ ++ /* t2: (B*E),(A*F),(D*G),(C*H) */ ++ emit_insn (gen_mulv4si3 (t2, t1, op2)); ++ ++ /* t3: (B*E)+(A*F), (D*G)+(C*H) */ ++ emit_insn (gen_xop_phadddq (t3, t2)); ++ ++ /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */ ++ emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32))); ++ ++ /* Multiply lower parts and add all */ ++ t5 = gen_reg_rtx (V2DImode); ++ emit_insn (gen_vec_widen_umult_even_v4si (t5, ++ gen_lowpart (V4SImode, op1), ++ gen_lowpart (V4SImode, op2))); ++ op0 = expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT); ++ ++ } ++ else ++ { ++ machine_mode nmode; ++ rtx (*umul) (rtx, rtx, rtx); ++ ++ if (mode == V2DImode) ++ 
{ ++ umul = gen_vec_widen_umult_even_v4si; ++ nmode = V4SImode; ++ } ++ else if (mode == V4DImode) ++ { ++ umul = gen_vec_widen_umult_even_v8si; ++ nmode = V8SImode; ++ } ++ else if (mode == V8DImode) ++ { ++ umul = gen_vec_widen_umult_even_v16si; ++ nmode = V16SImode; ++ } ++ else ++ gcc_unreachable (); ++ ++ ++ /* Multiply low parts. */ ++ t1 = gen_reg_rtx (mode); ++ emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2))); ++ ++ /* Shift input vectors right 32 bits so we can multiply high parts. */ ++ t6 = GEN_INT (32); ++ t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT); ++ t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT); ++ ++ /* Multiply high parts by low parts. */ ++ t4 = gen_reg_rtx (mode); ++ t5 = gen_reg_rtx (mode); ++ emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2))); ++ emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1))); ++ ++ /* Combine and shift the highparts back. */ ++ t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT); ++ t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT); ++ ++ /* Combine high and low parts. */ ++ force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT); ++ } ++ ++ set_unique_reg_note (get_last_insn (), REG_EQUAL, ++ gen_rtx_MULT (mode, op1, op2)); ++} ++ ++/* Return 1 if control tansfer instruction INSN ++ should be encoded with notrack prefix. */ ++ ++bool ++ix86_notrack_prefixed_insn_p (rtx insn) ++{ ++ if (!insn || !((flag_cf_protection & CF_BRANCH))) ++ return false; ++ ++ if (CALL_P (insn)) ++ { ++ rtx call = get_call_rtx_from (insn); ++ gcc_assert (call != NULL_RTX); ++ rtx addr = XEXP (call, 0); ++ ++ /* Do not emit 'notrack' if it's not an indirect call. */ ++ if (MEM_P (addr) ++ && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF) ++ return false; ++ else ++ return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0); ++ } ++ ++ if (JUMP_P (insn) && !flag_cet_switch) ++ { ++ rtx target = JUMP_LABEL (insn); ++ if (target == NULL_RTX || ANY_RETURN_P (target)) ++ return false; ++ ++ /* Check the jump is a switch table. */ ++ rtx_insn *label = as_a (target); ++ rtx_insn *table = next_insn (label); ++ if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table)) ++ return false; ++ else ++ return true; ++ } ++ return false; ++} ++ ++/* Calculate integer abs() using only SSE2 instructions. */ ++ ++void ++ix86_expand_sse2_abs (rtx target, rtx input) ++{ ++ machine_mode mode = GET_MODE (target); ++ rtx tmp0, tmp1, x; ++ ++ switch (mode) ++ { ++ case E_V2DImode: ++ case E_V4DImode: ++ /* For 64-bit signed integer X, with SSE4.2 use ++ pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X. ++ Otherwise handle it similarly to V4SImode, except use 64 as W instead of ++ 32 and use logical instead of arithmetic right shift (which is ++ unimplemented) and subtract. 
*/ ++ if (TARGET_SSE4_2) ++ { ++ tmp0 = gen_reg_rtx (mode); ++ tmp1 = gen_reg_rtx (mode); ++ emit_move_insn (tmp1, CONST0_RTX (mode)); ++ if (mode == E_V2DImode) ++ emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input)); ++ else ++ emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input)); ++ } ++ else ++ { ++ tmp0 = expand_simple_binop (mode, LSHIFTRT, input, ++ GEN_INT (GET_MODE_UNIT_BITSIZE (mode) ++ - 1), NULL, 0, OPTAB_DIRECT); ++ tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false); ++ } ++ ++ tmp1 = expand_simple_binop (mode, XOR, tmp0, input, ++ NULL, 0, OPTAB_DIRECT); ++ x = expand_simple_binop (mode, MINUS, tmp1, tmp0, ++ target, 0, OPTAB_DIRECT); ++ break; ++ ++ case E_V4SImode: ++ /* For 32-bit signed integer X, the best way to calculate the absolute ++ value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */ ++ tmp0 = expand_simple_binop (mode, ASHIFTRT, input, ++ GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1), ++ NULL, 0, OPTAB_DIRECT); ++ tmp1 = expand_simple_binop (mode, XOR, tmp0, input, ++ NULL, 0, OPTAB_DIRECT); ++ x = expand_simple_binop (mode, MINUS, tmp1, tmp0, ++ target, 0, OPTAB_DIRECT); ++ break; ++ ++ case E_V8HImode: ++ /* For 16-bit signed integer X, the best way to calculate the absolute ++ value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */ ++ tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0); ++ ++ x = expand_simple_binop (mode, SMAX, tmp0, input, ++ target, 0, OPTAB_DIRECT); ++ break; ++ ++ case E_V16QImode: ++ /* For 8-bit signed integer X, the best way to calculate the absolute ++ value of X is min ((unsigned char) X, (unsigned char) (-X)), ++ as SSE2 provides the PMINUB insn. */ ++ tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0); ++ ++ x = expand_simple_binop (V16QImode, UMIN, tmp0, input, ++ target, 0, OPTAB_DIRECT); ++ break; ++ ++ default: ++ gcc_unreachable (); ++ } ++ ++ if (x != target) ++ emit_move_insn (target, x); ++} ++ ++/* Expand an extract from a vector register through pextr insn. ++ Return true if successful. */ ++ ++bool ++ix86_expand_pextr (rtx *operands) ++{ ++ rtx dst = operands[0]; ++ rtx src = operands[1]; ++ ++ unsigned int size = INTVAL (operands[2]); ++ unsigned int pos = INTVAL (operands[3]); ++ ++ if (SUBREG_P (dst)) ++ { ++ /* Reject non-lowpart subregs. */ ++ if (SUBREG_BYTE (dst) > 0) ++ return false; ++ dst = SUBREG_REG (dst); ++ } ++ ++ if (SUBREG_P (src)) ++ { ++ pos += SUBREG_BYTE (src) * BITS_PER_UNIT; ++ src = SUBREG_REG (src); ++ } ++ ++ switch (GET_MODE (src)) ++ { ++ case E_V16QImode: ++ case E_V8HImode: ++ case E_V4SImode: ++ case E_V2DImode: ++ case E_V1TImode: ++ case E_TImode: ++ { ++ machine_mode srcmode, dstmode; ++ rtx d, pat; ++ ++ if (!int_mode_for_size (size, 0).exists (&dstmode)) ++ return false; ++ ++ switch (dstmode) ++ { ++ case E_QImode: ++ if (!TARGET_SSE4_1) ++ return false; ++ srcmode = V16QImode; ++ break; ++ ++ case E_HImode: ++ if (!TARGET_SSE2) ++ return false; ++ srcmode = V8HImode; ++ break; ++ ++ case E_SImode: ++ if (!TARGET_SSE4_1) ++ return false; ++ srcmode = V4SImode; ++ break; ++ ++ case E_DImode: ++ gcc_assert (TARGET_64BIT); ++ if (!TARGET_SSE4_1) ++ return false; ++ srcmode = V2DImode; ++ break; ++ ++ default: ++ return false; ++ } ++ ++ /* Reject extractions from misaligned positions. */ ++ if (pos & (size-1)) ++ return false; ++ ++ if (GET_MODE (dst) == dstmode) ++ d = dst; ++ else ++ d = gen_reg_rtx (dstmode); ++ ++ /* Construct insn pattern. 
*/ ++ pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size))); ++ pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat); ++ ++ /* Let the rtl optimizers know about the zero extension performed. */ ++ if (dstmode == QImode || dstmode == HImode) ++ { ++ pat = gen_rtx_ZERO_EXTEND (SImode, pat); ++ d = gen_lowpart (SImode, d); ++ } ++ ++ emit_insn (gen_rtx_SET (d, pat)); ++ ++ if (d != dst) ++ emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d)); ++ return true; ++ } ++ ++ default: ++ return false; ++ } ++} ++ ++/* Expand an insert into a vector register through pinsr insn. ++ Return true if successful. */ ++ ++bool ++ix86_expand_pinsr (rtx *operands) ++{ ++ rtx dst = operands[0]; ++ rtx src = operands[3]; ++ ++ unsigned int size = INTVAL (operands[1]); ++ unsigned int pos = INTVAL (operands[2]); ++ ++ if (SUBREG_P (dst)) ++ { ++ pos += SUBREG_BYTE (dst) * BITS_PER_UNIT; ++ dst = SUBREG_REG (dst); ++ } ++ ++ switch (GET_MODE (dst)) ++ { ++ case E_V16QImode: ++ case E_V8HImode: ++ case E_V4SImode: ++ case E_V2DImode: ++ case E_V1TImode: ++ case E_TImode: ++ { ++ machine_mode srcmode, dstmode; ++ rtx (*pinsr)(rtx, rtx, rtx, rtx); ++ rtx d; ++ ++ if (!int_mode_for_size (size, 0).exists (&srcmode)) ++ return false; ++ ++ switch (srcmode) ++ { ++ case E_QImode: ++ if (!TARGET_SSE4_1) ++ return false; ++ dstmode = V16QImode; ++ pinsr = gen_sse4_1_pinsrb; ++ break; ++ ++ case E_HImode: ++ if (!TARGET_SSE2) ++ return false; ++ dstmode = V8HImode; ++ pinsr = gen_sse2_pinsrw; ++ break; ++ ++ case E_SImode: ++ if (!TARGET_SSE4_1) ++ return false; ++ dstmode = V4SImode; ++ pinsr = gen_sse4_1_pinsrd; ++ break; ++ ++ case E_DImode: ++ gcc_assert (TARGET_64BIT); ++ if (!TARGET_SSE4_1) ++ return false; ++ dstmode = V2DImode; ++ pinsr = gen_sse4_1_pinsrq; ++ break; ++ ++ default: ++ return false; ++ } ++ ++ /* Reject insertions to misaligned positions. */ ++ if (pos & (size-1)) ++ return false; ++ ++ if (SUBREG_P (src)) ++ { ++ unsigned int srcpos = SUBREG_BYTE (src); ++ ++ if (srcpos > 0) ++ { ++ rtx extr_ops[4]; ++ ++ extr_ops[0] = gen_reg_rtx (srcmode); ++ extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src)); ++ extr_ops[2] = GEN_INT (size); ++ extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT); ++ ++ if (!ix86_expand_pextr (extr_ops)) ++ return false; ++ ++ src = extr_ops[0]; ++ } ++ else ++ src = gen_lowpart (srcmode, SUBREG_REG (src)); ++ } ++ ++ if (GET_MODE (dst) == dstmode) ++ d = dst; ++ else ++ d = gen_reg_rtx (dstmode); ++ ++ emit_insn (pinsr (d, gen_lowpart (dstmode, dst), ++ gen_lowpart (srcmode, src), ++ GEN_INT (1 << (pos / size)))); ++ if (d != dst) ++ emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d)); ++ return true; ++ } ++ ++ default: ++ return false; ++ } ++} ++ ++/* All CPUs prefer to avoid cross-lane operations so perform reductions ++ upper against lower halves up to SSE reg size. */ ++ ++machine_mode ++ix86_split_reduction (machine_mode mode) ++{ ++ /* Reduce lowpart against highpart until we reach SSE reg width to ++ avoid cross-lane operations. */ ++ switch (mode) ++ { ++ case E_V8DImode: ++ case E_V4DImode: ++ return V2DImode; ++ case E_V16SImode: ++ case E_V8SImode: ++ return V4SImode; ++ case E_V32HImode: ++ case E_V16HImode: ++ return V8HImode; ++ case E_V64QImode: ++ case E_V32QImode: ++ return V16QImode; ++ case E_V16SFmode: ++ case E_V8SFmode: ++ return V4SFmode; ++ case E_V8DFmode: ++ case E_V4DFmode: ++ return V2DFmode; ++ default: ++ return mode; ++ } ++} ++ ++/* Generate call to __divmoddi4. 
*/ ++ ++void ++ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode, ++ rtx op0, rtx op1, ++ rtx *quot_p, rtx *rem_p) ++{ ++ rtx rem = assign_386_stack_local (mode, SLOT_TEMP); ++ ++ rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL, ++ mode, op0, mode, op1, mode, ++ XEXP (rem, 0), Pmode); ++ *quot_p = quot; ++ *rem_p = rem; ++} ++ ++#include "gt-i386-expand.h" +diff --git a/gcc/config/i386/i386-expand.h b/gcc/config/i386/i386-expand.h +new file mode 100644 +index 000000000..9271bb85a +--- /dev/null ++++ b/gcc/config/i386/i386-expand.h +@@ -0,0 +1,58 @@ ++/* Copyright (C) 1988-2019 Free Software Foundation, Inc. ++ ++This file is part of GCC. ++ ++GCC is free software; you can redistribute it and/or modify ++it under the terms of the GNU General Public License as published by ++the Free Software Foundation; either version 3, or (at your option) ++any later version. ++ ++GCC is distributed in the hope that it will be useful, ++but WITHOUT ANY WARRANTY; without even the implied warranty of ++MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++GNU General Public License for more details. ++ ++You should have received a copy of the GNU General Public License ++along with GCC; see the file COPYING3. If not see ++. */ ++ ++#ifndef GCC_I386_EXPAND_H ++#define GCC_I386_EXPAND_H ++ ++/* AVX512F does support 64-byte integer vector operations, ++ thus the longest vector we are faced with is V64QImode. */ ++#define MAX_VECT_LEN 64 ++ ++struct expand_vec_perm_d ++{ ++ rtx target, op0, op1; ++ unsigned char perm[MAX_VECT_LEN]; ++ machine_mode vmode; ++ unsigned char nelt; ++ bool one_operand_p; ++ bool testing_p; ++}; ++ ++rtx legitimize_tls_address (rtx x, enum tls_model model, bool for_mov); ++alias_set_type ix86_GOT_alias_set (void); ++rtx legitimize_pic_address (rtx orig, rtx reg); ++rtx legitimize_pe_coff_symbol (rtx addr, bool inreg); ++ ++bool insn_defines_reg (unsigned int regno1, unsigned int regno2, ++ rtx_insn *insn); ++void ix86_emit_binop (enum rtx_code code, machine_mode mode, rtx dst, rtx src); ++enum calling_abi ix86_function_abi (const_tree fndecl); ++bool ix86_function_ms_hook_prologue (const_tree fn); ++void warn_once_call_ms2sysv_xlogues (const char *feature); ++rtx gen_push (rtx arg); ++rtx gen_pop (rtx arg); ++rtx ix86_expand_builtin (tree exp, rtx target, rtx subtarget, ++ machine_mode mode, int ignore); ++bool ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, ++ rtx op1, const vec_perm_indices &sel); ++bool ix86_notrack_prefixed_insn_p (rtx insn); ++machine_mode ix86_split_reduction (machine_mode mode); ++void ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode, rtx op0, ++ rtx op1, rtx *quot_p, rtx *rem_p); ++ ++#endif /* GCC_I386_EXPAND_H */ +diff --git a/gcc/config/i386/i386-features.c b/gcc/config/i386/i386-features.c +new file mode 100644 +index 000000000..60a120f4d +--- /dev/null ++++ b/gcc/config/i386/i386-features.c +@@ -0,0 +1,2742 @@ ++/* Copyright (C) 1988-2019 Free Software Foundation, Inc. ++ ++This file is part of GCC. ++ ++GCC is free software; you can redistribute it and/or modify ++it under the terms of the GNU General Public License as published by ++the Free Software Foundation; either version 3, or (at your option) ++any later version. ++ ++GCC is distributed in the hope that it will be useful, ++but WITHOUT ANY WARRANTY; without even the implied warranty of ++MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++GNU General Public License for more details. 
++ ++You should have received a copy of the GNU General Public License ++along with GCC; see the file COPYING3. If not see ++. */ ++ ++#define IN_TARGET_CODE 1 ++ ++#include "config.h" ++#include "system.h" ++#include "coretypes.h" ++#include "backend.h" ++#include "rtl.h" ++#include "tree.h" ++#include "memmodel.h" ++#include "gimple.h" ++#include "cfghooks.h" ++#include "cfgloop.h" ++#include "df.h" ++#include "tm_p.h" ++#include "stringpool.h" ++#include "expmed.h" ++#include "optabs.h" ++#include "regs.h" ++#include "emit-rtl.h" ++#include "recog.h" ++#include "cgraph.h" ++#include "diagnostic.h" ++#include "cfgbuild.h" ++#include "alias.h" ++#include "fold-const.h" ++#include "attribs.h" ++#include "calls.h" ++#include "stor-layout.h" ++#include "varasm.h" ++#include "output.h" ++#include "insn-attr.h" ++#include "flags.h" ++#include "except.h" ++#include "explow.h" ++#include "expr.h" ++#include "cfgrtl.h" ++#include "common/common-target.h" ++#include "langhooks.h" ++#include "reload.h" ++#include "gimplify.h" ++#include "dwarf2.h" ++#include "tm-constrs.h" ++#include "params.h" ++#include "cselib.h" ++#include "sched-int.h" ++#include "opts.h" ++#include "tree-pass.h" ++#include "context.h" ++#include "pass_manager.h" ++#include "target-globals.h" ++#include "gimple-iterator.h" ++#include "tree-vectorizer.h" ++#include "shrink-wrap.h" ++#include "builtins.h" ++#include "rtl-iter.h" ++#include "tree-iterator.h" ++#include "dbgcnt.h" ++#include "case-cfn-macros.h" ++#include "dojump.h" ++#include "fold-const-call.h" ++#include "tree-vrp.h" ++#include "tree-ssanames.h" ++#include "selftest.h" ++#include "selftest-rtl.h" ++#include "print-rtl.h" ++#include "intl.h" ++#include "ifcvt.h" ++#include "symbol-summary.h" ++#include "ipa-prop.h" ++#include "ipa-fnsummary.h" ++#include "wide-int-bitmask.h" ++#include "tree-vector-builder.h" ++#include "debug.h" ++#include "dwarf2out.h" ++#include "i386-builtins.h" ++#include "i386-features.h" ++ ++const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = { ++ "savms64", ++ "resms64", ++ "resms64x", ++ "savms64f", ++ "resms64f", ++ "resms64fx" ++}; ++ ++const unsigned xlogue_layout::REG_ORDER[xlogue_layout::MAX_REGS] = { ++/* The below offset values are where each register is stored for the layout ++ relative to incoming stack pointer. The value of each m_regs[].offset will ++ be relative to the incoming base pointer (rax or rsi) used by the stub. ++ ++ s_instances: 0 1 2 3 ++ Offset: realigned or aligned + 8 ++ Register aligned aligned + 8 aligned w/HFP w/HFP */ ++ XMM15_REG, /* 0x10 0x18 0x10 0x18 */ ++ XMM14_REG, /* 0x20 0x28 0x20 0x28 */ ++ XMM13_REG, /* 0x30 0x38 0x30 0x38 */ ++ XMM12_REG, /* 0x40 0x48 0x40 0x48 */ ++ XMM11_REG, /* 0x50 0x58 0x50 0x58 */ ++ XMM10_REG, /* 0x60 0x68 0x60 0x68 */ ++ XMM9_REG, /* 0x70 0x78 0x70 0x78 */ ++ XMM8_REG, /* 0x80 0x88 0x80 0x88 */ ++ XMM7_REG, /* 0x90 0x98 0x90 0x98 */ ++ XMM6_REG, /* 0xa0 0xa8 0xa0 0xa8 */ ++ SI_REG, /* 0xa8 0xb0 0xa8 0xb0 */ ++ DI_REG, /* 0xb0 0xb8 0xb0 0xb8 */ ++ BX_REG, /* 0xb8 0xc0 0xb8 0xc0 */ ++ BP_REG, /* 0xc0 0xc8 N/A N/A */ ++ R12_REG, /* 0xc8 0xd0 0xc0 0xc8 */ ++ R13_REG, /* 0xd0 0xd8 0xc8 0xd0 */ ++ R14_REG, /* 0xd8 0xe0 0xd0 0xd8 */ ++ R15_REG, /* 0xe0 0xe8 0xd8 0xe0 */ ++}; ++ ++/* Instantiate static const values. 
*/ ++const HOST_WIDE_INT xlogue_layout::STUB_INDEX_OFFSET; ++const unsigned xlogue_layout::MIN_REGS; ++const unsigned xlogue_layout::MAX_REGS; ++const unsigned xlogue_layout::MAX_EXTRA_REGS; ++const unsigned xlogue_layout::VARIANT_COUNT; ++const unsigned xlogue_layout::STUB_NAME_MAX_LEN; ++ ++/* Initialize xlogue_layout::s_stub_names to zero. */ ++char xlogue_layout::s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT] ++ [STUB_NAME_MAX_LEN]; ++ ++/* Instantiates all xlogue_layout instances. */ ++const xlogue_layout xlogue_layout::s_instances[XLOGUE_SET_COUNT] = { ++ xlogue_layout (0, false), ++ xlogue_layout (8, false), ++ xlogue_layout (0, true), ++ xlogue_layout (8, true) ++}; ++ ++/* Return an appropriate const instance of xlogue_layout based upon values ++ in cfun->machine and crtl. */ ++const struct xlogue_layout & ++xlogue_layout::get_instance () ++{ ++ enum xlogue_stub_sets stub_set; ++ bool aligned_plus_8 = cfun->machine->call_ms2sysv_pad_in; ++ ++ if (stack_realign_fp) ++ stub_set = XLOGUE_SET_HFP_ALIGNED_OR_REALIGN; ++ else if (frame_pointer_needed) ++ stub_set = aligned_plus_8 ++ ? XLOGUE_SET_HFP_ALIGNED_PLUS_8 ++ : XLOGUE_SET_HFP_ALIGNED_OR_REALIGN; ++ else ++ stub_set = aligned_plus_8 ? XLOGUE_SET_ALIGNED_PLUS_8 : XLOGUE_SET_ALIGNED; ++ ++ return s_instances[stub_set]; ++} ++ ++/* Determine how many clobbered registers can be saved by the stub. ++ Returns the count of registers the stub will save and restore. */ ++unsigned ++xlogue_layout::count_stub_managed_regs () ++{ ++ bool hfp = frame_pointer_needed || stack_realign_fp; ++ unsigned i, count; ++ unsigned regno; ++ ++ for (count = i = MIN_REGS; i < MAX_REGS; ++i) ++ { ++ regno = REG_ORDER[i]; ++ if (regno == BP_REG && hfp) ++ continue; ++ if (!ix86_save_reg (regno, false, false)) ++ break; ++ ++count; ++ } ++ return count; ++} ++ ++/* Determine if register REGNO is a stub managed register given the ++ total COUNT of stub managed registers. */ ++bool ++xlogue_layout::is_stub_managed_reg (unsigned regno, unsigned count) ++{ ++ bool hfp = frame_pointer_needed || stack_realign_fp; ++ unsigned i; ++ ++ for (i = 0; i < count; ++i) ++ { ++ gcc_assert (i < MAX_REGS); ++ if (REG_ORDER[i] == BP_REG && hfp) ++ ++count; ++ else if (REG_ORDER[i] == regno) ++ return true; ++ } ++ return false; ++} ++ ++/* Constructor for xlogue_layout. */ ++xlogue_layout::xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp) ++ : m_hfp (hfp) , m_nregs (hfp ? 17 : 18), ++ m_stack_align_off_in (stack_align_off_in) ++{ ++ HOST_WIDE_INT offset = stack_align_off_in; ++ unsigned i, j; ++ ++ for (i = j = 0; i < MAX_REGS; ++i) ++ { ++ unsigned regno = REG_ORDER[i]; ++ ++ if (regno == BP_REG && hfp) ++ continue; ++ if (SSE_REGNO_P (regno)) ++ { ++ offset += 16; ++ /* Verify that SSE regs are always aligned. */ ++ gcc_assert (!((stack_align_off_in + offset) & 15)); ++ } ++ else ++ offset += 8; ++ ++ m_regs[j].regno = regno; ++ m_regs[j++].offset = offset - STUB_INDEX_OFFSET; ++ } ++ gcc_assert (j == m_nregs); ++} ++ ++const char * ++xlogue_layout::get_stub_name (enum xlogue_stub stub, ++ unsigned n_extra_regs) ++{ ++ const int have_avx = TARGET_AVX; ++ char *name = s_stub_names[!!have_avx][stub][n_extra_regs]; ++ ++ /* Lazy init */ ++ if (!*name) ++ { ++ int res = snprintf (name, STUB_NAME_MAX_LEN, "__%s_%s_%u", ++ (have_avx ? 
"avx" : "sse"), ++ STUB_BASE_NAMES[stub], ++ MIN_REGS + n_extra_regs); ++ gcc_checking_assert (res < (int)STUB_NAME_MAX_LEN); ++ } ++ ++ return name; ++} ++ ++/* Return rtx of a symbol ref for the entry point (based upon ++ cfun->machine->call_ms2sysv_extra_regs) of the specified stub. */ ++rtx ++xlogue_layout::get_stub_rtx (enum xlogue_stub stub) ++{ ++ const unsigned n_extra_regs = cfun->machine->call_ms2sysv_extra_regs; ++ gcc_checking_assert (n_extra_regs <= MAX_EXTRA_REGS); ++ gcc_assert (stub < XLOGUE_STUB_COUNT); ++ gcc_assert (crtl->stack_realign_finalized); ++ ++ return gen_rtx_SYMBOL_REF (Pmode, get_stub_name (stub, n_extra_regs)); ++} ++ ++unsigned scalar_chain::max_id = 0; ++ ++/* Initialize new chain. */ ++ ++scalar_chain::scalar_chain () ++{ ++ chain_id = ++max_id; ++ ++ if (dump_file) ++ fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id); ++ ++ bitmap_obstack_initialize (NULL); ++ insns = BITMAP_ALLOC (NULL); ++ defs = BITMAP_ALLOC (NULL); ++ defs_conv = BITMAP_ALLOC (NULL); ++ queue = NULL; ++} ++ ++/* Free chain's data. */ ++ ++scalar_chain::~scalar_chain () ++{ ++ BITMAP_FREE (insns); ++ BITMAP_FREE (defs); ++ BITMAP_FREE (defs_conv); ++ bitmap_obstack_release (NULL); ++} ++ ++/* Add instruction into chains' queue. */ ++ ++void ++scalar_chain::add_to_queue (unsigned insn_uid) ++{ ++ if (bitmap_bit_p (insns, insn_uid) ++ || bitmap_bit_p (queue, insn_uid)) ++ return; ++ ++ if (dump_file) ++ fprintf (dump_file, " Adding insn %d into chain's #%d queue\n", ++ insn_uid, chain_id); ++ bitmap_set_bit (queue, insn_uid); ++} ++ ++/* For DImode conversion, mark register defined by DEF as requiring ++ conversion. */ ++ ++void ++dimode_scalar_chain::mark_dual_mode_def (df_ref def) ++{ ++ gcc_assert (DF_REF_REG_DEF_P (def)); ++ ++ if (bitmap_bit_p (defs_conv, DF_REF_REGNO (def))) ++ return; ++ ++ if (dump_file) ++ fprintf (dump_file, ++ " Mark r%d def in insn %d as requiring both modes in chain #%d\n", ++ DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id); ++ ++ bitmap_set_bit (defs_conv, DF_REF_REGNO (def)); ++} ++ ++/* For TImode conversion, it is unused. */ ++ ++void ++timode_scalar_chain::mark_dual_mode_def (df_ref) ++{ ++ gcc_unreachable (); ++} ++ ++/* Check REF's chain to add new insns into a queue ++ and find registers requiring conversion. */ ++ ++void ++scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref) ++{ ++ df_link *chain; ++ ++ gcc_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)) ++ || bitmap_bit_p (candidates, DF_REF_INSN_UID (ref))); ++ add_to_queue (DF_REF_INSN_UID (ref)); ++ ++ for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next) ++ { ++ unsigned uid = DF_REF_INSN_UID (chain->ref); ++ ++ if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref))) ++ continue; ++ ++ if (!DF_REF_REG_MEM_P (chain->ref)) ++ { ++ if (bitmap_bit_p (insns, uid)) ++ continue; ++ ++ if (bitmap_bit_p (candidates, uid)) ++ { ++ add_to_queue (uid); ++ continue; ++ } ++ } ++ ++ if (DF_REF_REG_DEF_P (chain->ref)) ++ { ++ if (dump_file) ++ fprintf (dump_file, " r%d def in insn %d isn't convertible\n", ++ DF_REF_REGNO (chain->ref), uid); ++ mark_dual_mode_def (chain->ref); ++ } ++ else ++ { ++ if (dump_file) ++ fprintf (dump_file, " r%d use in insn %d isn't convertible\n", ++ DF_REF_REGNO (chain->ref), uid); ++ mark_dual_mode_def (ref); ++ } ++ } ++} ++ ++/* Add instruction into a chain. 
*/ ++ ++void ++scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid) ++{ ++ if (bitmap_bit_p (insns, insn_uid)) ++ return; ++ ++ if (dump_file) ++ fprintf (dump_file, " Adding insn %d to chain #%d\n", insn_uid, chain_id); ++ ++ bitmap_set_bit (insns, insn_uid); ++ ++ rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn; ++ rtx def_set = single_set (insn); ++ if (def_set && REG_P (SET_DEST (def_set)) ++ && !HARD_REGISTER_P (SET_DEST (def_set))) ++ bitmap_set_bit (defs, REGNO (SET_DEST (def_set))); ++ ++ df_ref ref; ++ df_ref def; ++ for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref)) ++ if (!HARD_REGISTER_P (DF_REF_REG (ref))) ++ for (def = DF_REG_DEF_CHAIN (DF_REF_REGNO (ref)); ++ def; ++ def = DF_REF_NEXT_REG (def)) ++ analyze_register_chain (candidates, def); ++ for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref)) ++ if (!DF_REF_REG_MEM_P (ref)) ++ analyze_register_chain (candidates, ref); ++} ++ ++/* Build new chain starting from insn INSN_UID recursively ++ adding all dependent uses and definitions. */ ++ ++void ++scalar_chain::build (bitmap candidates, unsigned insn_uid) ++{ ++ queue = BITMAP_ALLOC (NULL); ++ bitmap_set_bit (queue, insn_uid); ++ ++ if (dump_file) ++ fprintf (dump_file, "Building chain #%d...\n", chain_id); ++ ++ while (!bitmap_empty_p (queue)) ++ { ++ insn_uid = bitmap_first_set_bit (queue); ++ bitmap_clear_bit (queue, insn_uid); ++ bitmap_clear_bit (candidates, insn_uid); ++ add_insn (candidates, insn_uid); ++ } ++ ++ if (dump_file) ++ { ++ fprintf (dump_file, "Collected chain #%d...\n", chain_id); ++ fprintf (dump_file, " insns: "); ++ dump_bitmap (dump_file, insns); ++ if (!bitmap_empty_p (defs_conv)) ++ { ++ bitmap_iterator bi; ++ unsigned id; ++ const char *comma = ""; ++ fprintf (dump_file, " defs to convert: "); ++ EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi) ++ { ++ fprintf (dump_file, "%sr%d", comma, id); ++ comma = ", "; ++ } ++ fprintf (dump_file, "\n"); ++ } ++ } ++ ++ BITMAP_FREE (queue); ++} ++ ++/* Return a cost of building a vector costant ++ instead of using a scalar one. */ ++ ++int ++dimode_scalar_chain::vector_const_cost (rtx exp) ++{ ++ gcc_assert (CONST_INT_P (exp)); ++ ++ if (standard_sse_constant_p (exp, V2DImode)) ++ return COSTS_N_INSNS (1); ++ return ix86_cost->sse_load[1]; ++} ++ ++/* Compute a gain for chain conversion. 
*/ ++ ++int ++dimode_scalar_chain::compute_convert_gain () ++{ ++ bitmap_iterator bi; ++ unsigned insn_uid; ++ int gain = 0; ++ int cost = 0; ++ ++ if (dump_file) ++ fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id); ++ ++ EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi) ++ { ++ rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn; ++ rtx def_set = single_set (insn); ++ rtx src = SET_SRC (def_set); ++ rtx dst = SET_DEST (def_set); ++ ++ if (REG_P (src) && REG_P (dst)) ++ gain += COSTS_N_INSNS (2) - ix86_cost->xmm_move; ++ else if (REG_P (src) && MEM_P (dst)) ++ gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1]; ++ else if (MEM_P (src) && REG_P (dst)) ++ gain += 2 * ix86_cost->int_load[2] - ix86_cost->sse_load[1]; ++ else if (GET_CODE (src) == ASHIFT ++ || GET_CODE (src) == ASHIFTRT ++ || GET_CODE (src) == LSHIFTRT) ++ { ++ if (CONST_INT_P (XEXP (src, 0))) ++ gain -= vector_const_cost (XEXP (src, 0)); ++ gain += ix86_cost->shift_const; ++ if (INTVAL (XEXP (src, 1)) >= 32) ++ gain -= COSTS_N_INSNS (1); ++ } ++ else if (GET_CODE (src) == PLUS ++ || GET_CODE (src) == MINUS ++ || GET_CODE (src) == IOR ++ || GET_CODE (src) == XOR ++ || GET_CODE (src) == AND) ++ { ++ gain += ix86_cost->add; ++ /* Additional gain for andnot for targets without BMI. */ ++ if (GET_CODE (XEXP (src, 0)) == NOT ++ && !TARGET_BMI) ++ gain += 2 * ix86_cost->add; ++ ++ if (CONST_INT_P (XEXP (src, 0))) ++ gain -= vector_const_cost (XEXP (src, 0)); ++ if (CONST_INT_P (XEXP (src, 1))) ++ gain -= vector_const_cost (XEXP (src, 1)); ++ } ++ else if (GET_CODE (src) == NEG ++ || GET_CODE (src) == NOT) ++ gain += ix86_cost->add - COSTS_N_INSNS (1); ++ else if (GET_CODE (src) == COMPARE) ++ { ++ /* Assume comparison cost is the same. */ ++ } ++ else if (CONST_INT_P (src)) ++ { ++ if (REG_P (dst)) ++ gain += COSTS_N_INSNS (2); ++ else if (MEM_P (dst)) ++ gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1]; ++ gain -= vector_const_cost (src); ++ } ++ else ++ gcc_unreachable (); ++ } ++ ++ if (dump_file) ++ fprintf (dump_file, " Instruction conversion gain: %d\n", gain); ++ ++ EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, insn_uid, bi) ++ cost += DF_REG_DEF_COUNT (insn_uid) * ix86_cost->mmxsse_to_integer; ++ ++ if (dump_file) ++ fprintf (dump_file, " Registers conversion cost: %d\n", cost); ++ ++ gain -= cost; ++ ++ if (dump_file) ++ fprintf (dump_file, " Total gain: %d\n", gain); ++ ++ return gain; ++} ++ ++/* Replace REG in X with a V2DI subreg of NEW_REG. */ ++ ++rtx ++dimode_scalar_chain::replace_with_subreg (rtx x, rtx reg, rtx new_reg) ++{ ++ if (x == reg) ++ return gen_rtx_SUBREG (V2DImode, new_reg, 0); ++ ++ const char *fmt = GET_RTX_FORMAT (GET_CODE (x)); ++ int i, j; ++ for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--) ++ { ++ if (fmt[i] == 'e') ++ XEXP (x, i) = replace_with_subreg (XEXP (x, i), reg, new_reg); ++ else if (fmt[i] == 'E') ++ for (j = XVECLEN (x, i) - 1; j >= 0; j--) ++ XVECEXP (x, i, j) = replace_with_subreg (XVECEXP (x, i, j), ++ reg, new_reg); ++ } ++ ++ return x; ++} ++ ++/* Replace REG in INSN with a V2DI subreg of NEW_REG. */ ++ ++void ++dimode_scalar_chain::replace_with_subreg_in_insn (rtx_insn *insn, ++ rtx reg, rtx new_reg) ++{ ++ replace_with_subreg (single_set (insn), reg, new_reg); ++} ++ ++/* Insert generated conversion instruction sequence INSNS ++ after instruction AFTER. New BB may be required in case ++ instruction has EH region attached. 
*/ ++ ++void ++scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after) ++{ ++ if (!control_flow_insn_p (after)) ++ { ++ emit_insn_after (insns, after); ++ return; ++ } ++ ++ basic_block bb = BLOCK_FOR_INSN (after); ++ edge e = find_fallthru_edge (bb->succs); ++ gcc_assert (e); ++ ++ basic_block new_bb = split_edge (e); ++ emit_insn_after (insns, BB_HEAD (new_bb)); ++} ++ ++/* Make vector copies for all register REGNO definitions ++ and replace its uses in a chain. */ ++ ++void ++dimode_scalar_chain::make_vector_copies (unsigned regno) ++{ ++ rtx reg = regno_reg_rtx[regno]; ++ rtx vreg = gen_reg_rtx (DImode); ++ df_ref ref; ++ ++ for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref)) ++ if (!bitmap_bit_p (insns, DF_REF_INSN_UID (ref))) ++ { ++ start_sequence (); ++ if (!TARGET_INTER_UNIT_MOVES_TO_VEC) ++ { ++ rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP); ++ emit_move_insn (adjust_address (tmp, SImode, 0), ++ gen_rtx_SUBREG (SImode, reg, 0)); ++ emit_move_insn (adjust_address (tmp, SImode, 4), ++ gen_rtx_SUBREG (SImode, reg, 4)); ++ emit_move_insn (vreg, tmp); ++ } ++ else if (TARGET_SSE4_1) ++ { ++ emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0), ++ CONST0_RTX (V4SImode), ++ gen_rtx_SUBREG (SImode, reg, 0))); ++ emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0), ++ gen_rtx_SUBREG (V4SImode, vreg, 0), ++ gen_rtx_SUBREG (SImode, reg, 4), ++ GEN_INT (2))); ++ } ++ else ++ { ++ rtx tmp = gen_reg_rtx (DImode); ++ emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0), ++ CONST0_RTX (V4SImode), ++ gen_rtx_SUBREG (SImode, reg, 0))); ++ emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0), ++ CONST0_RTX (V4SImode), ++ gen_rtx_SUBREG (SImode, reg, 4))); ++ emit_insn (gen_vec_interleave_lowv4si ++ (gen_rtx_SUBREG (V4SImode, vreg, 0), ++ gen_rtx_SUBREG (V4SImode, vreg, 0), ++ gen_rtx_SUBREG (V4SImode, tmp, 0))); ++ } ++ rtx_insn *seq = get_insns (); ++ end_sequence (); ++ rtx_insn *insn = DF_REF_INSN (ref); ++ emit_conversion_insns (seq, insn); ++ ++ if (dump_file) ++ fprintf (dump_file, ++ " Copied r%d to a vector register r%d for insn %d\n", ++ regno, REGNO (vreg), INSN_UID (insn)); ++ } ++ ++ for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref)) ++ if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))) ++ { ++ rtx_insn *insn = DF_REF_INSN (ref); ++ replace_with_subreg_in_insn (insn, reg, vreg); ++ ++ if (dump_file) ++ fprintf (dump_file, " Replaced r%d with r%d in insn %d\n", ++ regno, REGNO (vreg), INSN_UID (insn)); ++ } ++} ++ ++/* Convert all definitions of register REGNO ++ and fix its uses. Scalar copies may be created ++ in case register is used in not convertible insn. 
*/ ++ ++void ++dimode_scalar_chain::convert_reg (unsigned regno) ++{ ++ bool scalar_copy = bitmap_bit_p (defs_conv, regno); ++ rtx reg = regno_reg_rtx[regno]; ++ rtx scopy = NULL_RTX; ++ df_ref ref; ++ bitmap conv; ++ ++ conv = BITMAP_ALLOC (NULL); ++ bitmap_copy (conv, insns); ++ ++ if (scalar_copy) ++ scopy = gen_reg_rtx (DImode); ++ ++ for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref)) ++ { ++ rtx_insn *insn = DF_REF_INSN (ref); ++ rtx def_set = single_set (insn); ++ rtx src = SET_SRC (def_set); ++ rtx reg = DF_REF_REG (ref); ++ ++ if (!MEM_P (src)) ++ { ++ replace_with_subreg_in_insn (insn, reg, reg); ++ bitmap_clear_bit (conv, INSN_UID (insn)); ++ } ++ ++ if (scalar_copy) ++ { ++ start_sequence (); ++ if (!TARGET_INTER_UNIT_MOVES_FROM_VEC) ++ { ++ rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP); ++ emit_move_insn (tmp, reg); ++ emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0), ++ adjust_address (tmp, SImode, 0)); ++ emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4), ++ adjust_address (tmp, SImode, 4)); ++ } ++ else if (TARGET_SSE4_1) ++ { ++ rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx)); ++ emit_insn ++ (gen_rtx_SET ++ (gen_rtx_SUBREG (SImode, scopy, 0), ++ gen_rtx_VEC_SELECT (SImode, ++ gen_rtx_SUBREG (V4SImode, reg, 0), tmp))); ++ ++ tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx)); ++ emit_insn ++ (gen_rtx_SET ++ (gen_rtx_SUBREG (SImode, scopy, 4), ++ gen_rtx_VEC_SELECT (SImode, ++ gen_rtx_SUBREG (V4SImode, reg, 0), tmp))); ++ } ++ else ++ { ++ rtx vcopy = gen_reg_rtx (V2DImode); ++ emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, reg, 0)); ++ emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0), ++ gen_rtx_SUBREG (SImode, vcopy, 0)); ++ emit_move_insn (vcopy, ++ gen_rtx_LSHIFTRT (V2DImode, vcopy, GEN_INT (32))); ++ emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4), ++ gen_rtx_SUBREG (SImode, vcopy, 0)); ++ } ++ rtx_insn *seq = get_insns (); ++ end_sequence (); ++ emit_conversion_insns (seq, insn); ++ ++ if (dump_file) ++ fprintf (dump_file, ++ " Copied r%d to a scalar register r%d for insn %d\n", ++ regno, REGNO (scopy), INSN_UID (insn)); ++ } ++ } ++ ++ for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref)) ++ if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))) ++ { ++ if (bitmap_bit_p (conv, DF_REF_INSN_UID (ref))) ++ { ++ rtx_insn *insn = DF_REF_INSN (ref); ++ ++ rtx def_set = single_set (insn); ++ gcc_assert (def_set); ++ ++ rtx src = SET_SRC (def_set); ++ rtx dst = SET_DEST (def_set); ++ ++ if (!MEM_P (dst) || !REG_P (src)) ++ replace_with_subreg_in_insn (insn, reg, reg); ++ ++ bitmap_clear_bit (conv, INSN_UID (insn)); ++ } ++ } ++ /* Skip debug insns and uninitialized uses. */ ++ else if (DF_REF_CHAIN (ref) ++ && NONDEBUG_INSN_P (DF_REF_INSN (ref))) ++ { ++ gcc_assert (scopy); ++ replace_rtx (DF_REF_INSN (ref), reg, scopy); ++ df_insn_rescan (DF_REF_INSN (ref)); ++ } ++ ++ BITMAP_FREE (conv); ++} ++ ++/* Convert operand OP in INSN. We should handle ++ memory operands and uninitialized registers. ++ All other register uses are converted during ++ registers conversion. 
*/ ++ ++void ++dimode_scalar_chain::convert_op (rtx *op, rtx_insn *insn) ++{ ++ *op = copy_rtx_if_shared (*op); ++ ++ if (GET_CODE (*op) == NOT) ++ { ++ convert_op (&XEXP (*op, 0), insn); ++ PUT_MODE (*op, V2DImode); ++ } ++ else if (MEM_P (*op)) ++ { ++ rtx tmp = gen_reg_rtx (DImode); ++ ++ emit_insn_before (gen_move_insn (tmp, *op), insn); ++ *op = gen_rtx_SUBREG (V2DImode, tmp, 0); ++ ++ if (dump_file) ++ fprintf (dump_file, " Preloading operand for insn %d into r%d\n", ++ INSN_UID (insn), REGNO (tmp)); ++ } ++ else if (REG_P (*op)) ++ { ++ /* We may have not converted register usage in case ++ this register has no definition. Otherwise it ++ should be converted in convert_reg. */ ++ df_ref ref; ++ FOR_EACH_INSN_USE (ref, insn) ++ if (DF_REF_REGNO (ref) == REGNO (*op)) ++ { ++ gcc_assert (!DF_REF_CHAIN (ref)); ++ break; ++ } ++ *op = gen_rtx_SUBREG (V2DImode, *op, 0); ++ } ++ else if (CONST_INT_P (*op)) ++ { ++ rtx vec_cst; ++ rtx tmp = gen_rtx_SUBREG (V2DImode, gen_reg_rtx (DImode), 0); ++ ++ /* Prefer all ones vector in case of -1. */ ++ if (constm1_operand (*op, GET_MODE (*op))) ++ vec_cst = CONSTM1_RTX (V2DImode); ++ else ++ vec_cst = gen_rtx_CONST_VECTOR (V2DImode, ++ gen_rtvec (2, *op, const0_rtx)); ++ ++ if (!standard_sse_constant_p (vec_cst, V2DImode)) ++ { ++ start_sequence (); ++ vec_cst = validize_mem (force_const_mem (V2DImode, vec_cst)); ++ rtx_insn *seq = get_insns (); ++ end_sequence (); ++ emit_insn_before (seq, insn); ++ } ++ ++ emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn); ++ *op = tmp; ++ } ++ else ++ { ++ gcc_assert (SUBREG_P (*op)); ++ gcc_assert (GET_MODE (*op) == V2DImode); ++ } ++} ++ ++/* Convert INSN to vector mode. */ ++ ++void ++dimode_scalar_chain::convert_insn (rtx_insn *insn) ++{ ++ rtx def_set = single_set (insn); ++ rtx src = SET_SRC (def_set); ++ rtx dst = SET_DEST (def_set); ++ rtx subreg; ++ ++ if (MEM_P (dst) && !REG_P (src)) ++ { ++ /* There are no scalar integer instructions and therefore ++ temporary register usage is required. 
*/ ++ rtx tmp = gen_reg_rtx (DImode); ++ emit_conversion_insns (gen_move_insn (dst, tmp), insn); ++ dst = gen_rtx_SUBREG (V2DImode, tmp, 0); ++ } ++ ++ switch (GET_CODE (src)) ++ { ++ case ASHIFT: ++ case ASHIFTRT: ++ case LSHIFTRT: ++ convert_op (&XEXP (src, 0), insn); ++ PUT_MODE (src, V2DImode); ++ break; ++ ++ case PLUS: ++ case MINUS: ++ case IOR: ++ case XOR: ++ case AND: ++ convert_op (&XEXP (src, 0), insn); ++ convert_op (&XEXP (src, 1), insn); ++ PUT_MODE (src, V2DImode); ++ break; ++ ++ case NEG: ++ src = XEXP (src, 0); ++ convert_op (&src, insn); ++ subreg = gen_reg_rtx (V2DImode); ++ emit_insn_before (gen_move_insn (subreg, CONST0_RTX (V2DImode)), insn); ++ src = gen_rtx_MINUS (V2DImode, subreg, src); ++ break; ++ ++ case NOT: ++ src = XEXP (src, 0); ++ convert_op (&src, insn); ++ subreg = gen_reg_rtx (V2DImode); ++ emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (V2DImode)), insn); ++ src = gen_rtx_XOR (V2DImode, src, subreg); ++ break; ++ ++ case MEM: ++ if (!REG_P (dst)) ++ convert_op (&src, insn); ++ break; ++ ++ case REG: ++ if (!MEM_P (dst)) ++ convert_op (&src, insn); ++ break; ++ ++ case SUBREG: ++ gcc_assert (GET_MODE (src) == V2DImode); ++ break; ++ ++ case COMPARE: ++ src = SUBREG_REG (XEXP (XEXP (src, 0), 0)); ++ ++ gcc_assert ((REG_P (src) && GET_MODE (src) == DImode) ++ || (SUBREG_P (src) && GET_MODE (src) == V2DImode)); ++ ++ if (REG_P (src)) ++ subreg = gen_rtx_SUBREG (V2DImode, src, 0); ++ else ++ subreg = copy_rtx_if_shared (src); ++ emit_insn_before (gen_vec_interleave_lowv2di (copy_rtx_if_shared (subreg), ++ copy_rtx_if_shared (subreg), ++ copy_rtx_if_shared (subreg)), ++ insn); ++ dst = gen_rtx_REG (CCmode, FLAGS_REG); ++ src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (src), ++ copy_rtx_if_shared (src)), ++ UNSPEC_PTEST); ++ break; ++ ++ case CONST_INT: ++ convert_op (&src, insn); ++ break; ++ ++ default: ++ gcc_unreachable (); ++ } ++ ++ SET_SRC (def_set) = src; ++ SET_DEST (def_set) = dst; ++ ++ /* Drop possible dead definitions. */ ++ PATTERN (insn) = def_set; ++ ++ INSN_CODE (insn) = -1; ++ recog_memoized (insn); ++ df_insn_rescan (insn); ++} ++ ++/* Fix uses of converted REG in debug insns. */ ++ ++void ++timode_scalar_chain::fix_debug_reg_uses (rtx reg) ++{ ++ if (!flag_var_tracking) ++ return; ++ ++ df_ref ref, next; ++ for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next) ++ { ++ rtx_insn *insn = DF_REF_INSN (ref); ++ /* Make sure the next ref is for a different instruction, ++ so that we're not affected by the rescan. */ ++ next = DF_REF_NEXT_REG (ref); ++ while (next && DF_REF_INSN (next) == insn) ++ next = DF_REF_NEXT_REG (next); ++ ++ if (DEBUG_INSN_P (insn)) ++ { ++ /* It may be a debug insn with a TImode variable in ++ register. */ ++ bool changed = false; ++ for (; ref != next; ref = DF_REF_NEXT_REG (ref)) ++ { ++ rtx *loc = DF_REF_LOC (ref); ++ if (REG_P (*loc) && GET_MODE (*loc) == V1TImode) ++ { ++ *loc = gen_rtx_SUBREG (TImode, *loc, 0); ++ changed = true; ++ } ++ } ++ if (changed) ++ df_insn_rescan (insn); ++ } ++ } ++} ++ ++/* Convert INSN from TImode to V1T1mode. 
*/ ++ ++void ++timode_scalar_chain::convert_insn (rtx_insn *insn) ++{ ++ rtx def_set = single_set (insn); ++ rtx src = SET_SRC (def_set); ++ rtx dst = SET_DEST (def_set); ++ ++ switch (GET_CODE (dst)) ++ { ++ case REG: ++ { ++ rtx tmp = find_reg_equal_equiv_note (insn); ++ if (tmp) ++ PUT_MODE (XEXP (tmp, 0), V1TImode); ++ PUT_MODE (dst, V1TImode); ++ fix_debug_reg_uses (dst); ++ } ++ break; ++ case MEM: ++ PUT_MODE (dst, V1TImode); ++ break; ++ ++ default: ++ gcc_unreachable (); ++ } ++ ++ switch (GET_CODE (src)) ++ { ++ case REG: ++ PUT_MODE (src, V1TImode); ++ /* Call fix_debug_reg_uses only if SRC is never defined. */ ++ if (!DF_REG_DEF_CHAIN (REGNO (src))) ++ fix_debug_reg_uses (src); ++ break; ++ ++ case MEM: ++ PUT_MODE (src, V1TImode); ++ break; ++ ++ case CONST_WIDE_INT: ++ if (NONDEBUG_INSN_P (insn)) ++ { ++ /* Since there are no instructions to store 128-bit constant, ++ temporary register usage is required. */ ++ rtx tmp = gen_reg_rtx (V1TImode); ++ start_sequence (); ++ src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src)); ++ src = validize_mem (force_const_mem (V1TImode, src)); ++ rtx_insn *seq = get_insns (); ++ end_sequence (); ++ if (seq) ++ emit_insn_before (seq, insn); ++ emit_conversion_insns (gen_rtx_SET (dst, tmp), insn); ++ dst = tmp; ++ } ++ break; ++ ++ case CONST_INT: ++ switch (standard_sse_constant_p (src, TImode)) ++ { ++ case 1: ++ src = CONST0_RTX (GET_MODE (dst)); ++ break; ++ case 2: ++ src = CONSTM1_RTX (GET_MODE (dst)); ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ if (NONDEBUG_INSN_P (insn)) ++ { ++ rtx tmp = gen_reg_rtx (V1TImode); ++ /* Since there are no instructions to store standard SSE ++ constant, temporary register usage is required. */ ++ emit_conversion_insns (gen_rtx_SET (dst, tmp), insn); ++ dst = tmp; ++ } ++ break; ++ ++ default: ++ gcc_unreachable (); ++ } ++ ++ SET_SRC (def_set) = src; ++ SET_DEST (def_set) = dst; ++ ++ /* Drop possible dead definitions. */ ++ PATTERN (insn) = def_set; ++ ++ INSN_CODE (insn) = -1; ++ recog_memoized (insn); ++ df_insn_rescan (insn); ++} ++ ++void ++dimode_scalar_chain::convert_registers () ++{ ++ bitmap_iterator bi; ++ unsigned id; ++ ++ EXECUTE_IF_SET_IN_BITMAP (defs, 0, id, bi) ++ convert_reg (id); ++ ++ EXECUTE_IF_AND_COMPL_IN_BITMAP (defs_conv, defs, 0, id, bi) ++ make_vector_copies (id); ++} ++ ++/* Convert whole chain creating required register ++ conversions and copies. */ ++ ++int ++scalar_chain::convert () ++{ ++ bitmap_iterator bi; ++ unsigned id; ++ int converted_insns = 0; ++ ++ if (!dbg_cnt (stv_conversion)) ++ return 0; ++ ++ if (dump_file) ++ fprintf (dump_file, "Converting chain #%d...\n", chain_id); ++ ++ convert_registers (); ++ ++ EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi) ++ { ++ convert_insn (DF_INSN_UID_GET (id)->insn); ++ converted_insns++; ++ } ++ ++ return converted_insns; ++} ++ ++/* Return 1 if INSN uses or defines a hard register. ++ Hard register uses in a memory address are ignored. ++ Clobbers and flags definitions are ignored. */ ++ ++static bool ++has_non_address_hard_reg (rtx_insn *insn) ++{ ++ df_ref ref; ++ FOR_EACH_INSN_DEF (ref, insn) ++ if (HARD_REGISTER_P (DF_REF_REAL_REG (ref)) ++ && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER) ++ && DF_REF_REGNO (ref) != FLAGS_REG) ++ return true; ++ ++ FOR_EACH_INSN_USE (ref, insn) ++ if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref))) ++ return true; ++ ++ return false; ++} ++ ++/* Check if comparison INSN may be transformed ++ into vector comparison. 
Currently we transform ++ zero checks only which look like: ++ ++ (set (reg:CCZ 17 flags) ++ (compare:CCZ (ior:SI (subreg:SI (reg:DI x) 4) ++ (subreg:SI (reg:DI x) 0)) ++ (const_int 0 [0]))) */ ++ ++static bool ++convertible_comparison_p (rtx_insn *insn) ++{ ++ if (!TARGET_SSE4_1) ++ return false; ++ ++ rtx def_set = single_set (insn); ++ ++ gcc_assert (def_set); ++ ++ rtx src = SET_SRC (def_set); ++ rtx dst = SET_DEST (def_set); ++ ++ gcc_assert (GET_CODE (src) == COMPARE); ++ ++ if (GET_CODE (dst) != REG ++ || REGNO (dst) != FLAGS_REG ++ || GET_MODE (dst) != CCZmode) ++ return false; ++ ++ rtx op1 = XEXP (src, 0); ++ rtx op2 = XEXP (src, 1); ++ ++ if (op2 != CONST0_RTX (GET_MODE (op2))) ++ return false; ++ ++ if (GET_CODE (op1) != IOR) ++ return false; ++ ++ op2 = XEXP (op1, 1); ++ op1 = XEXP (op1, 0); ++ ++ if (!SUBREG_P (op1) ++ || !SUBREG_P (op2) ++ || GET_MODE (op1) != SImode ++ || GET_MODE (op2) != SImode ++ || ((SUBREG_BYTE (op1) != 0 ++ || SUBREG_BYTE (op2) != GET_MODE_SIZE (SImode)) ++ && (SUBREG_BYTE (op2) != 0 ++ || SUBREG_BYTE (op1) != GET_MODE_SIZE (SImode)))) ++ return false; ++ ++ op1 = SUBREG_REG (op1); ++ op2 = SUBREG_REG (op2); ++ ++ if (op1 != op2 ++ || !REG_P (op1) ++ || GET_MODE (op1) != DImode) ++ return false; ++ ++ return true; ++} ++ ++/* The DImode version of scalar_to_vector_candidate_p. */ ++ ++static bool ++dimode_scalar_to_vector_candidate_p (rtx_insn *insn) ++{ ++ rtx def_set = single_set (insn); ++ ++ if (!def_set) ++ return false; ++ ++ if (has_non_address_hard_reg (insn)) ++ return false; ++ ++ rtx src = SET_SRC (def_set); ++ rtx dst = SET_DEST (def_set); ++ ++ if (GET_CODE (src) == COMPARE) ++ return convertible_comparison_p (insn); ++ ++ /* We are interested in DImode promotion only. */ ++ if ((GET_MODE (src) != DImode ++ && !CONST_INT_P (src)) ++ || GET_MODE (dst) != DImode) ++ return false; ++ ++ if (!REG_P (dst) && !MEM_P (dst)) ++ return false; ++ ++ switch (GET_CODE (src)) ++ { ++ case ASHIFTRT: ++ if (!TARGET_AVX512VL) ++ return false; ++ /* FALLTHRU */ ++ ++ case ASHIFT: ++ case LSHIFTRT: ++ if (!CONST_INT_P (XEXP (src, 1)) ++ || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, 63)) ++ return false; ++ break; ++ ++ case PLUS: ++ case MINUS: ++ case IOR: ++ case XOR: ++ case AND: ++ if (!REG_P (XEXP (src, 1)) ++ && !MEM_P (XEXP (src, 1)) ++ && !CONST_INT_P (XEXP (src, 1))) ++ return false; ++ ++ if (GET_MODE (XEXP (src, 1)) != DImode ++ && !CONST_INT_P (XEXP (src, 1))) ++ return false; ++ break; ++ ++ case NEG: ++ case NOT: ++ break; ++ ++ case REG: ++ return true; ++ ++ case MEM: ++ case CONST_INT: ++ return REG_P (dst); ++ ++ default: ++ return false; ++ } ++ ++ if (!REG_P (XEXP (src, 0)) ++ && !MEM_P (XEXP (src, 0)) ++ && !CONST_INT_P (XEXP (src, 0)) ++ /* Check for andnot case. */ ++ && (GET_CODE (src) != AND ++ || GET_CODE (XEXP (src, 0)) != NOT ++ || !REG_P (XEXP (XEXP (src, 0), 0)))) ++ return false; ++ ++ if (GET_MODE (XEXP (src, 0)) != DImode ++ && !CONST_INT_P (XEXP (src, 0))) ++ return false; ++ ++ return true; ++} ++ ++/* The TImode version of scalar_to_vector_candidate_p. */ ++ ++static bool ++timode_scalar_to_vector_candidate_p (rtx_insn *insn) ++{ ++ rtx def_set = single_set (insn); ++ ++ if (!def_set) ++ return false; ++ ++ if (has_non_address_hard_reg (insn)) ++ return false; ++ ++ rtx src = SET_SRC (def_set); ++ rtx dst = SET_DEST (def_set); ++ ++ /* Only TImode load and store are allowed. */ ++ if (GET_MODE (dst) != TImode) ++ return false; ++ ++ if (MEM_P (dst)) ++ { ++ /* Check for store. 
Memory must be aligned or unaligned store ++ is optimal. Only support store from register, standard SSE ++ constant or CONST_WIDE_INT generated from piecewise store. ++ ++ ??? Verify performance impact before enabling CONST_INT for ++ __int128 store. */ ++ if (misaligned_operand (dst, TImode) ++ && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL) ++ return false; ++ ++ switch (GET_CODE (src)) ++ { ++ default: ++ return false; ++ ++ case REG: ++ case CONST_WIDE_INT: ++ return true; ++ ++ case CONST_INT: ++ return standard_sse_constant_p (src, TImode); ++ } ++ } ++ else if (MEM_P (src)) ++ { ++ /* Check for load. Memory must be aligned or unaligned load is ++ optimal. */ ++ return (REG_P (dst) ++ && (!misaligned_operand (src, TImode) ++ || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)); ++ } ++ ++ return false; ++} ++ ++/* Return 1 if INSN may be converted into vector ++ instruction. */ ++ ++static bool ++scalar_to_vector_candidate_p (rtx_insn *insn) ++{ ++ if (TARGET_64BIT) ++ return timode_scalar_to_vector_candidate_p (insn); ++ else ++ return dimode_scalar_to_vector_candidate_p (insn); ++} ++ ++/* The DImode version of remove_non_convertible_regs. */ ++ ++static void ++dimode_remove_non_convertible_regs (bitmap candidates) ++{ ++ bitmap_iterator bi; ++ unsigned id; ++ bitmap regs = BITMAP_ALLOC (NULL); ++ ++ EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi) ++ { ++ rtx def_set = single_set (DF_INSN_UID_GET (id)->insn); ++ rtx reg = SET_DEST (def_set); ++ ++ if (!REG_P (reg) ++ || bitmap_bit_p (regs, REGNO (reg)) ++ || HARD_REGISTER_P (reg)) ++ continue; ++ ++ for (df_ref def = DF_REG_DEF_CHAIN (REGNO (reg)); ++ def; ++ def = DF_REF_NEXT_REG (def)) ++ { ++ if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def))) ++ { ++ if (dump_file) ++ fprintf (dump_file, ++ "r%d has non convertible definition in insn %d\n", ++ REGNO (reg), DF_REF_INSN_UID (def)); ++ ++ bitmap_set_bit (regs, REGNO (reg)); ++ break; ++ } ++ } ++ } ++ ++ EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi) ++ { ++ for (df_ref def = DF_REG_DEF_CHAIN (id); ++ def; ++ def = DF_REF_NEXT_REG (def)) ++ if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def))) ++ { ++ if (dump_file) ++ fprintf (dump_file, "Removing insn %d from candidates list\n", ++ DF_REF_INSN_UID (def)); ++ ++ bitmap_clear_bit (candidates, DF_REF_INSN_UID (def)); ++ } ++ } ++ ++ BITMAP_FREE (regs); ++} ++ ++/* For a register REGNO, scan instructions for its defs and uses. ++ Put REGNO in REGS if a def or use isn't in CANDIDATES. */ ++ ++static void ++timode_check_non_convertible_regs (bitmap candidates, bitmap regs, ++ unsigned int regno) ++{ ++ for (df_ref def = DF_REG_DEF_CHAIN (regno); ++ def; ++ def = DF_REF_NEXT_REG (def)) ++ { ++ if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def))) ++ { ++ if (dump_file) ++ fprintf (dump_file, ++ "r%d has non convertible def in insn %d\n", ++ regno, DF_REF_INSN_UID (def)); ++ ++ bitmap_set_bit (regs, regno); ++ break; ++ } ++ } ++ ++ for (df_ref ref = DF_REG_USE_CHAIN (regno); ++ ref; ++ ref = DF_REF_NEXT_REG (ref)) ++ { ++ /* Debug instructions are skipped. */ ++ if (NONDEBUG_INSN_P (DF_REF_INSN (ref)) ++ && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref))) ++ { ++ if (dump_file) ++ fprintf (dump_file, ++ "r%d has non convertible use in insn %d\n", ++ regno, DF_REF_INSN_UID (ref)); ++ ++ bitmap_set_bit (regs, regno); ++ break; ++ } ++ } ++} ++ ++/* The TImode version of remove_non_convertible_regs. 
*/ ++ ++static void ++timode_remove_non_convertible_regs (bitmap candidates) ++{ ++ bitmap_iterator bi; ++ unsigned id; ++ bitmap regs = BITMAP_ALLOC (NULL); ++ ++ EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi) ++ { ++ rtx def_set = single_set (DF_INSN_UID_GET (id)->insn); ++ rtx dest = SET_DEST (def_set); ++ rtx src = SET_SRC (def_set); ++ ++ if ((!REG_P (dest) ++ || bitmap_bit_p (regs, REGNO (dest)) ++ || HARD_REGISTER_P (dest)) ++ && (!REG_P (src) ++ || bitmap_bit_p (regs, REGNO (src)) ++ || HARD_REGISTER_P (src))) ++ continue; ++ ++ if (REG_P (dest)) ++ timode_check_non_convertible_regs (candidates, regs, ++ REGNO (dest)); ++ ++ if (REG_P (src)) ++ timode_check_non_convertible_regs (candidates, regs, ++ REGNO (src)); ++ } ++ ++ EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi) ++ { ++ for (df_ref def = DF_REG_DEF_CHAIN (id); ++ def; ++ def = DF_REF_NEXT_REG (def)) ++ if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def))) ++ { ++ if (dump_file) ++ fprintf (dump_file, "Removing insn %d from candidates list\n", ++ DF_REF_INSN_UID (def)); ++ ++ bitmap_clear_bit (candidates, DF_REF_INSN_UID (def)); ++ } ++ ++ for (df_ref ref = DF_REG_USE_CHAIN (id); ++ ref; ++ ref = DF_REF_NEXT_REG (ref)) ++ if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref))) ++ { ++ if (dump_file) ++ fprintf (dump_file, "Removing insn %d from candidates list\n", ++ DF_REF_INSN_UID (ref)); ++ ++ bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref)); ++ } ++ } ++ ++ BITMAP_FREE (regs); ++} ++ ++/* For a given bitmap of insn UIDs scans all instruction and ++ remove insn from CANDIDATES in case it has both convertible ++ and not convertible definitions. ++ ++ All insns in a bitmap are conversion candidates according to ++ scalar_to_vector_candidate_p. Currently it implies all insns ++ are single_set. */ ++ ++static void ++remove_non_convertible_regs (bitmap candidates) ++{ ++ if (TARGET_64BIT) ++ timode_remove_non_convertible_regs (candidates); ++ else ++ dimode_remove_non_convertible_regs (candidates); ++} ++ ++/* Main STV pass function. Find and convert scalar ++ instructions into vector mode when profitable. */ ++ ++static unsigned int ++convert_scalars_to_vector () ++{ ++ basic_block bb; ++ bitmap candidates; ++ int converted_insns = 0; ++ ++ bitmap_obstack_initialize (NULL); ++ candidates = BITMAP_ALLOC (NULL); ++ ++ calculate_dominance_info (CDI_DOMINATORS); ++ df_set_flags (DF_DEFER_INSN_RESCAN); ++ df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN); ++ df_md_add_problem (); ++ df_analyze (); ++ ++ /* Find all instructions we want to convert into vector mode. */ ++ if (dump_file) ++ fprintf (dump_file, "Searching for mode conversion candidates...\n"); ++ ++ FOR_EACH_BB_FN (bb, cfun) ++ { ++ rtx_insn *insn; ++ FOR_BB_INSNS (bb, insn) ++ if (scalar_to_vector_candidate_p (insn)) ++ { ++ if (dump_file) ++ fprintf (dump_file, " insn %d is marked as a candidate\n", ++ INSN_UID (insn)); ++ ++ bitmap_set_bit (candidates, INSN_UID (insn)); ++ } ++ } ++ ++ remove_non_convertible_regs (candidates); ++ ++ if (bitmap_empty_p (candidates)) ++ if (dump_file) ++ fprintf (dump_file, "There are no candidates for optimization.\n"); ++ ++ while (!bitmap_empty_p (candidates)) ++ { ++ unsigned uid = bitmap_first_set_bit (candidates); ++ scalar_chain *chain; ++ ++ if (TARGET_64BIT) ++ chain = new timode_scalar_chain; ++ else ++ chain = new dimode_scalar_chain; ++ ++ /* Find instructions chain we want to convert to vector mode. ++ Check all uses and definitions to estimate all required ++ conversions. 
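++      For example, when compiling for 32-bit x86 with -msse2 -mstv at -O2,
++      a DImode operation such as
++
++        long long f (long long a, long long b) { return a & b; }  // illustrative only
++
++      is a candidate: the (and:DI ...) insn together with the definitions
++      and uses of its operands forms one chain, and the chain is only
++      converted (here to a single pand on xmm registers) when
++      compute_convert_gain estimates the change to be profitable.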
*/ ++ chain->build (candidates, uid); ++ ++ if (chain->compute_convert_gain () > 0) ++ converted_insns += chain->convert (); ++ else ++ if (dump_file) ++ fprintf (dump_file, "Chain #%d conversion is not profitable\n", ++ chain->chain_id); ++ ++ delete chain; ++ } ++ ++ if (dump_file) ++ fprintf (dump_file, "Total insns converted: %d\n", converted_insns); ++ ++ BITMAP_FREE (candidates); ++ bitmap_obstack_release (NULL); ++ df_process_deferred_rescans (); ++ ++ /* Conversion means we may have 128bit register spills/fills ++ which require aligned stack. */ ++ if (converted_insns) ++ { ++ if (crtl->stack_alignment_needed < 128) ++ crtl->stack_alignment_needed = 128; ++ if (crtl->stack_alignment_estimated < 128) ++ crtl->stack_alignment_estimated = 128; ++ /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments. */ ++ if (TARGET_64BIT) ++ for (tree parm = DECL_ARGUMENTS (current_function_decl); ++ parm; parm = DECL_CHAIN (parm)) ++ { ++ if (TYPE_MODE (TREE_TYPE (parm)) != TImode) ++ continue; ++ if (DECL_RTL_SET_P (parm) ++ && GET_MODE (DECL_RTL (parm)) == V1TImode) ++ { ++ rtx r = DECL_RTL (parm); ++ if (REG_P (r)) ++ SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0)); ++ } ++ if (DECL_INCOMING_RTL (parm) ++ && GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode) ++ { ++ rtx r = DECL_INCOMING_RTL (parm); ++ if (REG_P (r)) ++ DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0); ++ } ++ } ++ } ++ ++ return 0; ++} ++ ++static unsigned int ++rest_of_handle_insert_vzeroupper (void) ++{ ++ int i; ++ ++ /* vzeroupper instructions are inserted immediately after reload to ++ account for possible spills from 256bit or 512bit registers. The pass ++ reuses mode switching infrastructure by re-running mode insertion ++ pass, so disable entities that have already been processed. */ ++ for (i = 0; i < MAX_386_ENTITIES; i++) ++ ix86_optimize_mode_switching[i] = 0; ++ ++ ix86_optimize_mode_switching[AVX_U128] = 1; ++ ++ /* Call optimize_mode_switching. 
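++      vzeroupper clears the upper bits of the ymm registers so that later
++      SSE code does not pay an AVX-SSE transition penalty.  Illustrative
++      example (assuming -O2 -mavx -mvzeroupper; names are for exposition):
++
++        #include <immintrin.h>
++        extern void sse_callee (void);
++        void f (float *p)
++        {
++          __m256 v = _mm256_loadu_ps (p);
++          _mm256_storeu_ps (p, _mm256_add_ps (v, v));
++          sse_callee ();   // a vzeroupper is expected before this call
++        }
++
++      The re-run of the mode-switching pass below decides where such
++      instructions are needed after reload.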
*/ ++ g->get_passes ()->execute_pass_mode_switching (); ++ return 0; ++} ++ ++namespace { ++ ++const pass_data pass_data_insert_vzeroupper = ++{ ++ RTL_PASS, /* type */ ++ "vzeroupper", /* name */ ++ OPTGROUP_NONE, /* optinfo_flags */ ++ TV_MACH_DEP, /* tv_id */ ++ 0, /* properties_required */ ++ 0, /* properties_provided */ ++ 0, /* properties_destroyed */ ++ 0, /* todo_flags_start */ ++ TODO_df_finish, /* todo_flags_finish */ ++}; ++ ++class pass_insert_vzeroupper : public rtl_opt_pass ++{ ++public: ++ pass_insert_vzeroupper(gcc::context *ctxt) ++ : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt) ++ {} ++ ++ /* opt_pass methods: */ ++ virtual bool gate (function *) ++ { ++ return TARGET_AVX ++ && TARGET_VZEROUPPER && flag_expensive_optimizations ++ && !optimize_size; ++ } ++ ++ virtual unsigned int execute (function *) ++ { ++ return rest_of_handle_insert_vzeroupper (); ++ } ++ ++}; // class pass_insert_vzeroupper ++ ++const pass_data pass_data_stv = ++{ ++ RTL_PASS, /* type */ ++ "stv", /* name */ ++ OPTGROUP_NONE, /* optinfo_flags */ ++ TV_MACH_DEP, /* tv_id */ ++ 0, /* properties_required */ ++ 0, /* properties_provided */ ++ 0, /* properties_destroyed */ ++ 0, /* todo_flags_start */ ++ TODO_df_finish, /* todo_flags_finish */ ++}; ++ ++class pass_stv : public rtl_opt_pass ++{ ++public: ++ pass_stv (gcc::context *ctxt) ++ : rtl_opt_pass (pass_data_stv, ctxt), ++ timode_p (false) ++ {} ++ ++ /* opt_pass methods: */ ++ virtual bool gate (function *) ++ { ++ return (timode_p == !!TARGET_64BIT ++ && TARGET_STV && TARGET_SSE2 && optimize > 1); ++ } ++ ++ virtual unsigned int execute (function *) ++ { ++ return convert_scalars_to_vector (); ++ } ++ ++ opt_pass *clone () ++ { ++ return new pass_stv (m_ctxt); ++ } ++ ++ void set_pass_param (unsigned int n, bool param) ++ { ++ gcc_assert (n == 0); ++ timode_p = param; ++ } ++ ++private: ++ bool timode_p; ++}; // class pass_stv ++ ++} // anon namespace ++ ++rtl_opt_pass * ++make_pass_insert_vzeroupper (gcc::context *ctxt) ++{ ++ return new pass_insert_vzeroupper (ctxt); ++} ++ ++rtl_opt_pass * ++make_pass_stv (gcc::context *ctxt) ++{ ++ return new pass_stv (ctxt); ++} ++ ++/* Inserting ENDBRANCH instructions. */ ++ ++static unsigned int ++rest_of_insert_endbranch (void) ++{ ++ timevar_push (TV_MACH_DEP); ++ ++ rtx cet_eb; ++ rtx_insn *insn; ++ basic_block bb; ++ ++ /* Currently emit EB if it's a tracking function, i.e. 'nocf_check' is ++ absent among function attributes. Later an optimization will be ++ introduced to make analysis if an address of a static function is ++ taken. A static function whose address is not taken will get a ++ nocf_check attribute. This will allow to reduce the number of EB. */ ++ ++ if (!lookup_attribute ("nocf_check", ++ TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl))) ++ && (!flag_manual_endbr ++ || lookup_attribute ("cf_check", ++ DECL_ATTRIBUTES (cfun->decl))) ++ && !cgraph_node::get (cfun->decl)->only_called_directly_p ()) ++ { ++ /* Queue ENDBR insertion to x86_function_profiler. 
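++      With -fcf-protection=branch, a function that may be reached through
++      an indirect call or jump must begin with an ENDBR instruction.  When
++      profiling with -pg -mfentry the __fentry__ call is itself placed at
++      the function entrance, so the ENDBR cannot simply be emitted here;
++      it is queued and x86_function_profiler outputs it ahead of the
++      __fentry__ call.  Illustrative example (options as stated):
++
++        // gcc -O2 -pg -mfentry -fcf-protection=branch
++        void (*handler) (void);
++        void run (void) { handler (); }  // entry: endbr64, then call __fentry__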
*/ ++ if (crtl->profile && flag_fentry) ++ cfun->machine->endbr_queued_at_entrance = true; ++ else ++ { ++ cet_eb = gen_nop_endbr (); ++ ++ bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb; ++ insn = BB_HEAD (bb); ++ emit_insn_before (cet_eb, insn); ++ } ++ } ++ ++ bb = 0; ++ FOR_EACH_BB_FN (bb, cfun) ++ { ++ for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb)); ++ insn = NEXT_INSN (insn)) ++ { ++ if (CALL_P (insn)) ++ { ++ bool need_endbr; ++ need_endbr = find_reg_note (insn, REG_SETJMP, NULL) != NULL; ++ if (!need_endbr && !SIBLING_CALL_P (insn)) ++ { ++ rtx call = get_call_rtx_from (insn); ++ rtx fnaddr = XEXP (call, 0); ++ tree fndecl = NULL_TREE; ++ ++ /* Also generate ENDBRANCH for non-tail call which ++ may return via indirect branch. */ ++ if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF) ++ fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0)); ++ if (fndecl == NULL_TREE) ++ fndecl = MEM_EXPR (fnaddr); ++ if (fndecl ++ && TREE_CODE (TREE_TYPE (fndecl)) != FUNCTION_TYPE ++ && TREE_CODE (TREE_TYPE (fndecl)) != METHOD_TYPE) ++ fndecl = NULL_TREE; ++ if (fndecl && TYPE_ARG_TYPES (TREE_TYPE (fndecl))) ++ { ++ tree fntype = TREE_TYPE (fndecl); ++ if (lookup_attribute ("indirect_return", ++ TYPE_ATTRIBUTES (fntype))) ++ need_endbr = true; ++ } ++ } ++ if (!need_endbr) ++ continue; ++ /* Generate ENDBRANCH after CALL, which can return more than ++ twice, setjmp-like functions. */ ++ ++ cet_eb = gen_nop_endbr (); ++ emit_insn_after_setloc (cet_eb, insn, INSN_LOCATION (insn)); ++ continue; ++ } ++ ++ if (JUMP_P (insn) && flag_cet_switch) ++ { ++ rtx target = JUMP_LABEL (insn); ++ if (target == NULL_RTX || ANY_RETURN_P (target)) ++ continue; ++ ++ /* Check the jump is a switch table. */ ++ rtx_insn *label = as_a (target); ++ rtx_insn *table = next_insn (label); ++ if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table)) ++ continue; ++ ++ /* For the indirect jump find out all places it jumps and insert ++ ENDBRANCH there. It should be done under a special flag to ++ control ENDBRANCH generation for switch stmts. */ ++ edge_iterator ei; ++ edge e; ++ basic_block dest_blk; ++ ++ FOR_EACH_EDGE (e, ei, bb->succs) ++ { ++ rtx_insn *insn; ++ ++ dest_blk = e->dest; ++ insn = BB_HEAD (dest_blk); ++ gcc_assert (LABEL_P (insn)); ++ cet_eb = gen_nop_endbr (); ++ emit_insn_after (cet_eb, insn); ++ } ++ continue; ++ } ++ ++ if ((LABEL_P (insn) && LABEL_PRESERVE_P (insn)) ++ || (NOTE_P (insn) ++ && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)) ++ /* TODO. Check /s bit also. */ ++ { ++ cet_eb = gen_nop_endbr (); ++ emit_insn_after (cet_eb, insn); ++ continue; ++ } ++ } ++ } ++ ++ timevar_pop (TV_MACH_DEP); ++ return 0; ++} ++ ++namespace { ++ ++const pass_data pass_data_insert_endbranch = ++{ ++ RTL_PASS, /* type. */ ++ "cet", /* name. */ ++ OPTGROUP_NONE, /* optinfo_flags. */ ++ TV_MACH_DEP, /* tv_id. */ ++ 0, /* properties_required. */ ++ 0, /* properties_provided. */ ++ 0, /* properties_destroyed. */ ++ 0, /* todo_flags_start. */ ++ 0, /* todo_flags_finish. 
*/ ++}; ++ ++class pass_insert_endbranch : public rtl_opt_pass ++{ ++public: ++ pass_insert_endbranch (gcc::context *ctxt) ++ : rtl_opt_pass (pass_data_insert_endbranch, ctxt) ++ {} ++ ++ /* opt_pass methods: */ ++ virtual bool gate (function *) ++ { ++ return ((flag_cf_protection & CF_BRANCH)); ++ } ++ ++ virtual unsigned int execute (function *) ++ { ++ return rest_of_insert_endbranch (); ++ } ++ ++}; // class pass_insert_endbranch ++ ++} // anon namespace ++ ++rtl_opt_pass * ++make_pass_insert_endbranch (gcc::context *ctxt) ++{ ++ return new pass_insert_endbranch (ctxt); ++} ++ ++/* At entry of the nearest common dominator for basic blocks with ++ conversions, generate a single ++ vxorps %xmmN, %xmmN, %xmmN ++ for all ++ vcvtss2sd op, %xmmN, %xmmX ++ vcvtsd2ss op, %xmmN, %xmmX ++ vcvtsi2ss op, %xmmN, %xmmX ++ vcvtsi2sd op, %xmmN, %xmmX ++ ++ NB: We want to generate only a single vxorps to cover the whole ++ function. The LCM algorithm isn't appropriate here since it may ++ place a vxorps inside the loop. */ ++ ++static unsigned int ++remove_partial_avx_dependency (void) ++{ ++ timevar_push (TV_MACH_DEP); ++ ++ bitmap_obstack_initialize (NULL); ++ bitmap convert_bbs = BITMAP_ALLOC (NULL); ++ ++ basic_block bb; ++ rtx_insn *insn, *set_insn; ++ rtx set; ++ rtx v4sf_const0 = NULL_RTX; ++ ++ auto_vec control_flow_insns; ++ ++ FOR_EACH_BB_FN (bb, cfun) ++ { ++ FOR_BB_INSNS (bb, insn) ++ { ++ if (!NONDEBUG_INSN_P (insn)) ++ continue; ++ ++ set = single_set (insn); ++ if (!set) ++ continue; ++ ++ if (get_attr_avx_partial_xmm_update (insn) ++ != AVX_PARTIAL_XMM_UPDATE_TRUE) ++ continue; ++ ++ if (!v4sf_const0) ++ { ++ calculate_dominance_info (CDI_DOMINATORS); ++ df_set_flags (DF_DEFER_INSN_RESCAN); ++ df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN); ++ df_md_add_problem (); ++ df_analyze (); ++ v4sf_const0 = gen_reg_rtx (V4SFmode); ++ } ++ ++ /* Convert PARTIAL_XMM_UPDATE_TRUE insns, DF -> SF, SF -> DF, ++ SI -> SF, SI -> DF, DI -> SF, DI -> DF, to vec_dup and ++ vec_merge with subreg. */ ++ rtx src = SET_SRC (set); ++ rtx dest = SET_DEST (set); ++ machine_mode dest_mode = GET_MODE (dest); ++ ++ rtx zero; ++ machine_mode dest_vecmode; ++ if (dest_mode == E_SFmode) ++ { ++ dest_vecmode = V4SFmode; ++ zero = v4sf_const0; ++ } ++ else ++ { ++ dest_vecmode = V2DFmode; ++ zero = gen_rtx_SUBREG (V2DFmode, v4sf_const0, 0); ++ } ++ ++ /* Change source to vector mode. */ ++ src = gen_rtx_VEC_DUPLICATE (dest_vecmode, src); ++ src = gen_rtx_VEC_MERGE (dest_vecmode, src, zero, ++ GEN_INT (HOST_WIDE_INT_1U)); ++ /* Change destination to vector mode. */ ++ rtx vec = gen_reg_rtx (dest_vecmode); ++ /* Generate an XMM vector SET. */ ++ set = gen_rtx_SET (vec, src); ++ set_insn = emit_insn_before (set, insn); ++ df_insn_rescan (set_insn); ++ ++ if (cfun->can_throw_non_call_exceptions) ++ { ++ /* Handle REG_EH_REGION note. */ ++ rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX); ++ if (note) ++ { ++ control_flow_insns.safe_push (set_insn); ++ add_reg_note (set_insn, REG_EH_REGION, XEXP (note, 0)); ++ } ++ } ++ ++ src = gen_rtx_SUBREG (dest_mode, vec, 0); ++ set = gen_rtx_SET (dest, src); ++ ++ /* Drop possible dead definitions. */ ++ PATTERN (insn) = set; ++ ++ INSN_CODE (insn) = -1; ++ recog_memoized (insn); ++ df_insn_rescan (insn); ++ bitmap_set_bit (convert_bbs, bb->index); ++ } ++ } ++ ++ if (v4sf_const0) ++ { ++ /* (Re-)discover loops so that bb->loop_father can be used in the ++ analysis below. 
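++      Illustrative example of a function with two conversion sites (not
++      taken from this patch):
++
++        double g (int c, int x, float y)
++        {
++          if (c)
++            return (double) x;   // vcvtsi2sd writes only part of an xmm reg
++          return (double) y;     // vcvtss2sd likewise
++        }
++
++      Both conversions can share one zeroing vxorps, so it is emitted once
++      at the nearest common dominator of the blocks that need it, outside
++      of any loop.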
*/ ++ loop_optimizer_init (AVOID_CFG_MODIFICATIONS); ++ ++ /* Generate a vxorps at entry of the nearest dominator for basic ++ blocks with conversions, which is in the the fake loop that ++ contains the whole function, so that there is only a single ++ vxorps in the whole function. */ ++ bb = nearest_common_dominator_for_set (CDI_DOMINATORS, ++ convert_bbs); ++ while (bb->loop_father->latch ++ != EXIT_BLOCK_PTR_FOR_FN (cfun)) ++ bb = get_immediate_dominator (CDI_DOMINATORS, ++ bb->loop_father->header); ++ ++ set = gen_rtx_SET (v4sf_const0, CONST0_RTX (V4SFmode)); ++ ++ insn = BB_HEAD (bb); ++ while (insn && !NONDEBUG_INSN_P (insn)) ++ { ++ if (insn == BB_END (bb)) ++ { ++ insn = NULL; ++ break; ++ } ++ insn = NEXT_INSN (insn); ++ } ++ if (insn == BB_HEAD (bb)) ++ set_insn = emit_insn_before (set, insn); ++ else ++ set_insn = emit_insn_after (set, ++ insn ? PREV_INSN (insn) : BB_END (bb)); ++ df_insn_rescan (set_insn); ++ df_process_deferred_rescans (); ++ loop_optimizer_finalize (); ++ ++ if (!control_flow_insns.is_empty ()) ++ { ++ free_dominance_info (CDI_DOMINATORS); ++ ++ unsigned int i; ++ FOR_EACH_VEC_ELT (control_flow_insns, i, insn) ++ if (control_flow_insn_p (insn)) ++ { ++ /* Split the block after insn. There will be a fallthru ++ edge, which is OK so we keep it. We have to create ++ the exception edges ourselves. */ ++ bb = BLOCK_FOR_INSN (insn); ++ split_block (bb, insn); ++ rtl_make_eh_edge (NULL, bb, BB_END (bb)); ++ } ++ } ++ } ++ ++ bitmap_obstack_release (NULL); ++ BITMAP_FREE (convert_bbs); ++ ++ timevar_pop (TV_MACH_DEP); ++ return 0; ++} ++ ++namespace { ++ ++const pass_data pass_data_remove_partial_avx_dependency = ++{ ++ RTL_PASS, /* type */ ++ "rpad", /* name */ ++ OPTGROUP_NONE, /* optinfo_flags */ ++ TV_MACH_DEP, /* tv_id */ ++ 0, /* properties_required */ ++ 0, /* properties_provided */ ++ 0, /* properties_destroyed */ ++ 0, /* todo_flags_start */ ++ TODO_df_finish, /* todo_flags_finish */ ++}; ++ ++class pass_remove_partial_avx_dependency : public rtl_opt_pass ++{ ++public: ++ pass_remove_partial_avx_dependency (gcc::context *ctxt) ++ : rtl_opt_pass (pass_data_remove_partial_avx_dependency, ctxt) ++ {} ++ ++ /* opt_pass methods: */ ++ virtual bool gate (function *) ++ { ++ return (TARGET_AVX ++ && TARGET_SSE_PARTIAL_REG_DEPENDENCY ++ && TARGET_SSE_MATH ++ && optimize ++ && optimize_function_for_speed_p (cfun)); ++ } ++ ++ virtual unsigned int execute (function *) ++ { ++ return remove_partial_avx_dependency (); ++ } ++}; // class pass_rpad ++ ++} // anon namespace ++ ++rtl_opt_pass * ++make_pass_remove_partial_avx_dependency (gcc::context *ctxt) ++{ ++ return new pass_remove_partial_avx_dependency (ctxt); ++} ++ ++/* This compares the priority of target features in function DECL1 ++ and DECL2. It returns positive value if DECL1 is higher priority, ++ negative value if DECL2 is higher priority and 0 if they are the ++ same. */ ++ ++int ++ix86_compare_version_priority (tree decl1, tree decl2) ++{ ++ unsigned int priority1 = get_builtin_code_for_version (decl1, NULL); ++ unsigned int priority2 = get_builtin_code_for_version (decl2, NULL); ++ ++ return (int)priority1 - (int)priority2; ++} ++ ++/* V1 and V2 point to function versions with different priorities ++ based on the target ISA. This function compares their priorities. 
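++   For example (C++, illustrative only):
++
++     __attribute__ ((target ("default"))) int foo () { return 0; }
++     __attribute__ ((target ("sse4.2"))) int foo () { return 1; }
++     __attribute__ ((target ("avx2")))   int foo () { return 2; }
++
++   The dispatcher must test the avx2 version before the sse4.2 one, so
++   the versions are sorted by descending dispatch priority (see
++   feature_compare below) before their runtime checks are emitted.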
*/ ++ ++static int ++feature_compare (const void *v1, const void *v2) ++{ ++ typedef struct _function_version_info ++ { ++ tree version_decl; ++ tree predicate_chain; ++ unsigned int dispatch_priority; ++ } function_version_info; ++ ++ const function_version_info c1 = *(const function_version_info *)v1; ++ const function_version_info c2 = *(const function_version_info *)v2; ++ return (c2.dispatch_priority - c1.dispatch_priority); ++} ++ ++/* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL ++ to return a pointer to VERSION_DECL if the outcome of the expression ++ formed by PREDICATE_CHAIN is true. This function will be called during ++ version dispatch to decide which function version to execute. It returns ++ the basic block at the end, to which more conditions can be added. */ ++ ++static basic_block ++add_condition_to_bb (tree function_decl, tree version_decl, ++ tree predicate_chain, basic_block new_bb) ++{ ++ gimple *return_stmt; ++ tree convert_expr, result_var; ++ gimple *convert_stmt; ++ gimple *call_cond_stmt; ++ gimple *if_else_stmt; ++ ++ basic_block bb1, bb2, bb3; ++ edge e12, e23; ++ ++ tree cond_var, and_expr_var = NULL_TREE; ++ gimple_seq gseq; ++ ++ tree predicate_decl, predicate_arg; ++ ++ push_cfun (DECL_STRUCT_FUNCTION (function_decl)); ++ ++ gcc_assert (new_bb != NULL); ++ gseq = bb_seq (new_bb); ++ ++ ++ convert_expr = build1 (CONVERT_EXPR, ptr_type_node, ++ build_fold_addr_expr (version_decl)); ++ result_var = create_tmp_var (ptr_type_node); ++ convert_stmt = gimple_build_assign (result_var, convert_expr); ++ return_stmt = gimple_build_return (result_var); ++ ++ if (predicate_chain == NULL_TREE) ++ { ++ gimple_seq_add_stmt (&gseq, convert_stmt); ++ gimple_seq_add_stmt (&gseq, return_stmt); ++ set_bb_seq (new_bb, gseq); ++ gimple_set_bb (convert_stmt, new_bb); ++ gimple_set_bb (return_stmt, new_bb); ++ pop_cfun (); ++ return new_bb; ++ } ++ ++ while (predicate_chain != NULL) ++ { ++ cond_var = create_tmp_var (integer_type_node); ++ predicate_decl = TREE_PURPOSE (predicate_chain); ++ predicate_arg = TREE_VALUE (predicate_chain); ++ call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg); ++ gimple_call_set_lhs (call_cond_stmt, cond_var); ++ ++ gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl)); ++ gimple_set_bb (call_cond_stmt, new_bb); ++ gimple_seq_add_stmt (&gseq, call_cond_stmt); ++ ++ predicate_chain = TREE_CHAIN (predicate_chain); ++ ++ if (and_expr_var == NULL) ++ and_expr_var = cond_var; ++ else ++ { ++ gimple *assign_stmt; ++ /* Use MIN_EXPR to check if any integer is zero?. 
++ and_expr_var = min_expr */ ++ assign_stmt = gimple_build_assign (and_expr_var, ++ build2 (MIN_EXPR, integer_type_node, ++ cond_var, and_expr_var)); ++ ++ gimple_set_block (assign_stmt, DECL_INITIAL (function_decl)); ++ gimple_set_bb (assign_stmt, new_bb); ++ gimple_seq_add_stmt (&gseq, assign_stmt); ++ } ++ } ++ ++ if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var, ++ integer_zero_node, ++ NULL_TREE, NULL_TREE); ++ gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl)); ++ gimple_set_bb (if_else_stmt, new_bb); ++ gimple_seq_add_stmt (&gseq, if_else_stmt); ++ ++ gimple_seq_add_stmt (&gseq, convert_stmt); ++ gimple_seq_add_stmt (&gseq, return_stmt); ++ set_bb_seq (new_bb, gseq); ++ ++ bb1 = new_bb; ++ e12 = split_block (bb1, if_else_stmt); ++ bb2 = e12->dest; ++ e12->flags &= ~EDGE_FALLTHRU; ++ e12->flags |= EDGE_TRUE_VALUE; ++ ++ e23 = split_block (bb2, return_stmt); ++ ++ gimple_set_bb (convert_stmt, bb2); ++ gimple_set_bb (return_stmt, bb2); ++ ++ bb3 = e23->dest; ++ make_edge (bb1, bb3, EDGE_FALSE_VALUE); ++ ++ remove_edge (e23); ++ make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0); ++ ++ pop_cfun (); ++ ++ return bb3; ++} ++ ++/* This function generates the dispatch function for ++ multi-versioned functions. DISPATCH_DECL is the function which will ++ contain the dispatch logic. FNDECLS are the function choices for ++ dispatch, and is a tree chain. EMPTY_BB is the basic block pointer ++ in DISPATCH_DECL in which the dispatch code is generated. */ ++ ++static int ++dispatch_function_versions (tree dispatch_decl, ++ void *fndecls_p, ++ basic_block *empty_bb) ++{ ++ tree default_decl; ++ gimple *ifunc_cpu_init_stmt; ++ gimple_seq gseq; ++ int ix; ++ tree ele; ++ vec *fndecls; ++ unsigned int num_versions = 0; ++ unsigned int actual_versions = 0; ++ unsigned int i; ++ ++ struct _function_version_info ++ { ++ tree version_decl; ++ tree predicate_chain; ++ unsigned int dispatch_priority; ++ }*function_version_info; ++ ++ gcc_assert (dispatch_decl != NULL ++ && fndecls_p != NULL ++ && empty_bb != NULL); ++ ++ /*fndecls_p is actually a vector. */ ++ fndecls = static_cast *> (fndecls_p); ++ ++ /* At least one more version other than the default. */ ++ num_versions = fndecls->length (); ++ gcc_assert (num_versions >= 2); ++ ++ function_version_info = (struct _function_version_info *) ++ XNEWVEC (struct _function_version_info, (num_versions - 1)); ++ ++ /* The first version in the vector is the default decl. */ ++ default_decl = (*fndecls)[0]; ++ ++ push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl)); ++ ++ gseq = bb_seq (*empty_bb); ++ /* Function version dispatch is via IFUNC. IFUNC resolvers fire before ++ constructors, so explicity call __builtin_cpu_init here. */ ++ ifunc_cpu_init_stmt ++ = gimple_build_call_vec (get_ix86_builtin (IX86_BUILTIN_CPU_INIT), vNULL); ++ gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt); ++ gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb); ++ set_bb_seq (*empty_bb, gseq); ++ ++ pop_cfun (); ++ ++ ++ for (ix = 1; fndecls->iterate (ix, &ele); ++ix) ++ { ++ tree version_decl = ele; ++ tree predicate_chain = NULL_TREE; ++ unsigned int priority; ++ /* Get attribute string, parse it and find the right predicate decl. ++ The predicate function could be a lengthy combination of many ++ features, like arch-type and various isa-variants. 
*/ ++ priority = get_builtin_code_for_version (version_decl, ++ &predicate_chain); ++ ++ if (predicate_chain == NULL_TREE) ++ continue; ++ ++ function_version_info [actual_versions].version_decl = version_decl; ++ function_version_info [actual_versions].predicate_chain ++ = predicate_chain; ++ function_version_info [actual_versions].dispatch_priority = priority; ++ actual_versions++; ++ } ++ ++ /* Sort the versions according to descending order of dispatch priority. The ++ priority is based on the ISA. This is not a perfect solution. There ++ could still be ambiguity. If more than one function version is suitable ++ to execute, which one should be dispatched? In future, allow the user ++ to specify a dispatch priority next to the version. */ ++ qsort (function_version_info, actual_versions, ++ sizeof (struct _function_version_info), feature_compare); ++ ++ for (i = 0; i < actual_versions; ++i) ++ *empty_bb = add_condition_to_bb (dispatch_decl, ++ function_version_info[i].version_decl, ++ function_version_info[i].predicate_chain, ++ *empty_bb); ++ ++ /* dispatch default version at the end. */ ++ *empty_bb = add_condition_to_bb (dispatch_decl, default_decl, ++ NULL, *empty_bb); ++ ++ free (function_version_info); ++ return 0; ++} ++ ++/* This function changes the assembler name for functions that are ++ versions. If DECL is a function version and has a "target" ++ attribute, it appends the attribute string to its assembler name. */ ++ ++static tree ++ix86_mangle_function_version_assembler_name (tree decl, tree id) ++{ ++ tree version_attr; ++ const char *orig_name, *version_string; ++ char *attr_str, *assembler_name; ++ ++ if (DECL_DECLARED_INLINE_P (decl) ++ && lookup_attribute ("gnu_inline", ++ DECL_ATTRIBUTES (decl))) ++ error_at (DECL_SOURCE_LOCATION (decl), ++ "function versions cannot be marked as %," ++ " bodies have to be generated"); ++ ++ if (DECL_VIRTUAL_P (decl) ++ || DECL_VINDEX (decl)) ++ sorry ("virtual function multiversioning not supported"); ++ ++ version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl)); ++ ++ /* target attribute string cannot be NULL. */ ++ gcc_assert (version_attr != NULL_TREE); ++ ++ orig_name = IDENTIFIER_POINTER (id); ++ version_string ++ = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr))); ++ ++ if (strcmp (version_string, "default") == 0) ++ return id; ++ ++ attr_str = sorted_attr_string (TREE_VALUE (version_attr)); ++ assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2); ++ ++ sprintf (assembler_name, "%s.%s", orig_name, attr_str); ++ ++ /* Allow assembler name to be modified if already set. */ ++ if (DECL_ASSEMBLER_NAME_SET_P (decl)) ++ SET_DECL_RTL (decl, NULL); ++ ++ tree ret = get_identifier (assembler_name); ++ XDELETEVEC (attr_str); ++ XDELETEVEC (assembler_name); ++ return ret; ++} ++ ++tree ++ix86_mangle_decl_assembler_name (tree decl, tree id) ++{ ++ /* For function version, add the target suffix to the assembler name. */ ++ if (TREE_CODE (decl) == FUNCTION_DECL ++ && DECL_FUNCTION_VERSIONED (decl)) ++ id = ix86_mangle_function_version_assembler_name (decl, id); ++#ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME ++ id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id); ++#endif ++ ++ return id; ++} ++ ++/* Make a dispatcher declaration for the multi-versioned function DECL. ++ Calls to DECL function will be replaced with calls to the dispatcher ++ by the front-end. Returns the decl of the dispatcher function. 
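++   The dispatcher is emitted as an ifunc whose resolver is roughly
++   equivalent to the following hand-written code (the names are only for
++   illustration):
++
++     extern int foo_avx2 (void), foo_sse42 (void), foo_default (void);
++
++     void *foo_resolver (void)
++     {
++       __builtin_cpu_init ();
++       if (__builtin_cpu_supports ("avx2"))
++         return (void *) foo_avx2;
++       if (__builtin_cpu_supports ("sse4.2"))
++         return (void *) foo_sse42;
++       return (void *) foo_default;
++     }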
*/ ++ ++tree ++ix86_get_function_versions_dispatcher (void *decl) ++{ ++ tree fn = (tree) decl; ++ struct cgraph_node *node = NULL; ++ struct cgraph_node *default_node = NULL; ++ struct cgraph_function_version_info *node_v = NULL; ++ struct cgraph_function_version_info *first_v = NULL; ++ ++ tree dispatch_decl = NULL; ++ ++ struct cgraph_function_version_info *default_version_info = NULL; ++ ++ gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn)); ++ ++ node = cgraph_node::get (fn); ++ gcc_assert (node != NULL); ++ ++ node_v = node->function_version (); ++ gcc_assert (node_v != NULL); ++ ++ if (node_v->dispatcher_resolver != NULL) ++ return node_v->dispatcher_resolver; ++ ++ /* Find the default version and make it the first node. */ ++ first_v = node_v; ++ /* Go to the beginning of the chain. */ ++ while (first_v->prev != NULL) ++ first_v = first_v->prev; ++ default_version_info = first_v; ++ while (default_version_info != NULL) ++ { ++ if (is_function_default_version ++ (default_version_info->this_node->decl)) ++ break; ++ default_version_info = default_version_info->next; ++ } ++ ++ /* If there is no default node, just return NULL. */ ++ if (default_version_info == NULL) ++ return NULL; ++ ++ /* Make default info the first node. */ ++ if (first_v != default_version_info) ++ { ++ default_version_info->prev->next = default_version_info->next; ++ if (default_version_info->next) ++ default_version_info->next->prev = default_version_info->prev; ++ first_v->prev = default_version_info; ++ default_version_info->next = first_v; ++ default_version_info->prev = NULL; ++ } ++ ++ default_node = default_version_info->this_node; ++ ++#if defined (ASM_OUTPUT_TYPE_DIRECTIVE) ++ if (targetm.has_ifunc_p ()) ++ { ++ struct cgraph_function_version_info *it_v = NULL; ++ struct cgraph_node *dispatcher_node = NULL; ++ struct cgraph_function_version_info *dispatcher_version_info = NULL; ++ ++ /* Right now, the dispatching is done via ifunc. */ ++ dispatch_decl = make_dispatcher_decl (default_node->decl); ++ ++ dispatcher_node = cgraph_node::get_create (dispatch_decl); ++ gcc_assert (dispatcher_node != NULL); ++ dispatcher_node->dispatcher_function = 1; ++ dispatcher_version_info ++ = dispatcher_node->insert_new_function_version (); ++ dispatcher_version_info->next = default_version_info; ++ dispatcher_node->definition = 1; ++ ++ /* Set the dispatcher for all the versions. */ ++ it_v = default_version_info; ++ while (it_v != NULL) ++ { ++ it_v->dispatcher_resolver = dispatch_decl; ++ it_v = it_v->next; ++ } ++ } ++ else ++#endif ++ { ++ error_at (DECL_SOURCE_LOCATION (default_node->decl), ++ "multiversioning needs ifunc which is not supported " ++ "on this target"); ++ } ++ ++ return dispatch_decl; ++} ++ ++/* Make the resolver function decl to dispatch the versions of ++ a multi-versioned function, DEFAULT_DECL. IFUNC_ALIAS_DECL is ++ ifunc alias that will point to the created resolver. Create an ++ empty basic block in the resolver and store the pointer in ++ EMPTY_BB. Return the decl of the resolver function. */ ++ ++static tree ++make_resolver_func (const tree default_decl, ++ const tree ifunc_alias_decl, ++ basic_block *empty_bb) ++{ ++ char *resolver_name; ++ tree decl, type, decl_name, t; ++ ++ /* IFUNC's have to be globally visible. So, if the default_decl is ++ not, then the name of the IFUNC should be made unique. 
*/ ++ if (TREE_PUBLIC (default_decl) == 0) ++ { ++ char *ifunc_name = make_unique_name (default_decl, "ifunc", true); ++ symtab->change_decl_assembler_name (ifunc_alias_decl, ++ get_identifier (ifunc_name)); ++ XDELETEVEC (ifunc_name); ++ } ++ ++ resolver_name = make_unique_name (default_decl, "resolver", false); ++ ++ /* The resolver function should return a (void *). */ ++ type = build_function_type_list (ptr_type_node, NULL_TREE); ++ ++ decl = build_fn_decl (resolver_name, type); ++ decl_name = get_identifier (resolver_name); ++ SET_DECL_ASSEMBLER_NAME (decl, decl_name); ++ ++ DECL_NAME (decl) = decl_name; ++ TREE_USED (decl) = 1; ++ DECL_ARTIFICIAL (decl) = 1; ++ DECL_IGNORED_P (decl) = 1; ++ TREE_PUBLIC (decl) = 0; ++ DECL_UNINLINABLE (decl) = 1; ++ ++ /* Resolver is not external, body is generated. */ ++ DECL_EXTERNAL (decl) = 0; ++ DECL_EXTERNAL (ifunc_alias_decl) = 0; ++ ++ DECL_CONTEXT (decl) = NULL_TREE; ++ DECL_INITIAL (decl) = make_node (BLOCK); ++ DECL_STATIC_CONSTRUCTOR (decl) = 0; ++ ++ if (DECL_COMDAT_GROUP (default_decl) ++ || TREE_PUBLIC (default_decl)) ++ { ++ /* In this case, each translation unit with a call to this ++ versioned function will put out a resolver. Ensure it ++ is comdat to keep just one copy. */ ++ DECL_COMDAT (decl) = 1; ++ make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl)); ++ } ++ /* Build result decl and add to function_decl. */ ++ t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node); ++ DECL_CONTEXT (t) = decl; ++ DECL_ARTIFICIAL (t) = 1; ++ DECL_IGNORED_P (t) = 1; ++ DECL_RESULT (decl) = t; ++ ++ gimplify_function_tree (decl); ++ push_cfun (DECL_STRUCT_FUNCTION (decl)); ++ *empty_bb = init_lowered_empty_function (decl, false, ++ profile_count::uninitialized ()); ++ ++ cgraph_node::add_new_function (decl, true); ++ symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl)); ++ ++ pop_cfun (); ++ ++ gcc_assert (ifunc_alias_decl != NULL); ++ /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name. */ ++ DECL_ATTRIBUTES (ifunc_alias_decl) ++ = make_attribute ("ifunc", resolver_name, ++ DECL_ATTRIBUTES (ifunc_alias_decl)); ++ ++ /* Create the alias for dispatch to resolver here. */ ++ cgraph_node::create_same_body_alias (ifunc_alias_decl, decl); ++ XDELETEVEC (resolver_name); ++ return decl; ++} ++ ++/* Generate the dispatching code body to dispatch multi-versioned function ++ DECL. The target hook is called to process the "target" attributes and ++ provide the code to dispatch the right function at run-time. NODE points ++ to the dispatcher decl whose body will be created. */ ++ ++tree ++ix86_generate_version_dispatcher_body (void *node_p) ++{ ++ tree resolver_decl; ++ basic_block empty_bb; ++ tree default_ver_decl; ++ struct cgraph_node *versn; ++ struct cgraph_node *node; ++ ++ struct cgraph_function_version_info *node_version_info = NULL; ++ struct cgraph_function_version_info *versn_info = NULL; ++ ++ node = (cgraph_node *)node_p; ++ ++ node_version_info = node->function_version (); ++ gcc_assert (node->dispatcher_function ++ && node_version_info != NULL); ++ ++ if (node_version_info->dispatcher_resolver) ++ return node_version_info->dispatcher_resolver; ++ ++ /* The first version in the chain corresponds to the default version. */ ++ default_ver_decl = node_version_info->next->this_node->decl; ++ ++ /* node is going to be an alias, so remove the finalized bit. 
*/ ++ node->definition = false; ++ ++ resolver_decl = make_resolver_func (default_ver_decl, ++ node->decl, &empty_bb); ++ ++ node_version_info->dispatcher_resolver = resolver_decl; ++ ++ push_cfun (DECL_STRUCT_FUNCTION (resolver_decl)); ++ ++ auto_vec fn_ver_vec; ++ ++ for (versn_info = node_version_info->next; versn_info; ++ versn_info = versn_info->next) ++ { ++ versn = versn_info->this_node; ++ /* Check for virtual functions here again, as by this time it should ++ have been determined if this function needs a vtable index or ++ not. This happens for methods in derived classes that override ++ virtual methods in base classes but are not explicitly marked as ++ virtual. */ ++ if (DECL_VINDEX (versn->decl)) ++ sorry ("virtual function multiversioning not supported"); ++ ++ fn_ver_vec.safe_push (versn->decl); ++ } ++ ++ dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb); ++ cgraph_edge::rebuild_edges (); ++ pop_cfun (); ++ return resolver_decl; ++} ++ ++ +diff --git a/gcc/config/i386/i386-features.h b/gcc/config/i386/i386-features.h +new file mode 100644 +index 000000000..358122249 +--- /dev/null ++++ b/gcc/config/i386/i386-features.h +@@ -0,0 +1,201 @@ ++/* Copyright (C) 1988-2019 Free Software Foundation, Inc. ++ ++This file is part of GCC. ++ ++GCC is free software; you can redistribute it and/or modify ++it under the terms of the GNU General Public License as published by ++the Free Software Foundation; either version 3, or (at your option) ++any later version. ++ ++GCC is distributed in the hope that it will be useful, ++but WITHOUT ANY WARRANTY; without even the implied warranty of ++MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++GNU General Public License for more details. ++ ++You should have received a copy of the GNU General Public License ++along with GCC; see the file COPYING3. If not see ++. */ ++ ++#ifndef GCC_I386_FEATURES_H ++#define GCC_I386_FEATURES_H ++ ++enum xlogue_stub { ++ XLOGUE_STUB_SAVE, ++ XLOGUE_STUB_RESTORE, ++ XLOGUE_STUB_RESTORE_TAIL, ++ XLOGUE_STUB_SAVE_HFP, ++ XLOGUE_STUB_RESTORE_HFP, ++ XLOGUE_STUB_RESTORE_HFP_TAIL, ++ ++ XLOGUE_STUB_COUNT ++}; ++ ++enum xlogue_stub_sets { ++ XLOGUE_SET_ALIGNED, ++ XLOGUE_SET_ALIGNED_PLUS_8, ++ XLOGUE_SET_HFP_ALIGNED_OR_REALIGN, ++ XLOGUE_SET_HFP_ALIGNED_PLUS_8, ++ ++ XLOGUE_SET_COUNT ++}; ++ ++/* Register save/restore layout used by out-of-line stubs. */ ++class xlogue_layout { ++public: ++ struct reginfo ++ { ++ unsigned regno; ++ HOST_WIDE_INT offset; /* Offset used by stub base pointer (rax or ++ rsi) to where each register is stored. */ ++ }; ++ ++ unsigned get_nregs () const {return m_nregs;} ++ HOST_WIDE_INT get_stack_align_off_in () const {return m_stack_align_off_in;} ++ ++ const reginfo &get_reginfo (unsigned reg) const ++ { ++ gcc_assert (reg < m_nregs); ++ return m_regs[reg]; ++ } ++ ++ static const char *get_stub_name (enum xlogue_stub stub, ++ unsigned n_extra_args); ++ ++ /* Returns an rtx for the stub's symbol based upon ++ 1.) the specified stub (save, restore or restore_ret) and ++ 2.) the value of cfun->machine->call_ms2sysv_extra_regs and ++ 3.) rather or not stack alignment is being performed. */ ++ static rtx get_stub_rtx (enum xlogue_stub stub); ++ ++ /* Returns the amount of stack space (including padding) that the stub ++ needs to store registers based upon data in the machine_function. 
*/ ++ HOST_WIDE_INT get_stack_space_used () const ++ { ++ const struct machine_function *m = cfun->machine; ++ unsigned last_reg = m->call_ms2sysv_extra_regs + MIN_REGS - 1; ++ ++ gcc_assert (m->call_ms2sysv_extra_regs <= MAX_EXTRA_REGS); ++ return m_regs[last_reg].offset + STUB_INDEX_OFFSET; ++ } ++ ++ /* Returns the offset for the base pointer used by the stub. */ ++ HOST_WIDE_INT get_stub_ptr_offset () const ++ { ++ return STUB_INDEX_OFFSET + m_stack_align_off_in; ++ } ++ ++ static const struct xlogue_layout &get_instance (); ++ static unsigned count_stub_managed_regs (); ++ static bool is_stub_managed_reg (unsigned regno, unsigned count); ++ ++ static const HOST_WIDE_INT STUB_INDEX_OFFSET = 0x70; ++ static const unsigned MIN_REGS = NUM_X86_64_MS_CLOBBERED_REGS; ++ static const unsigned MAX_REGS = 18; ++ static const unsigned MAX_EXTRA_REGS = MAX_REGS - MIN_REGS; ++ static const unsigned VARIANT_COUNT = MAX_EXTRA_REGS + 1; ++ static const unsigned STUB_NAME_MAX_LEN = 20; ++ static const char * const STUB_BASE_NAMES[XLOGUE_STUB_COUNT]; ++ static const unsigned REG_ORDER[MAX_REGS]; ++ static const unsigned REG_ORDER_REALIGN[MAX_REGS]; ++ ++private: ++ xlogue_layout (); ++ xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp); ++ xlogue_layout (const xlogue_layout &); ++ ++ /* True if hard frame pointer is used. */ ++ bool m_hfp; ++ ++ /* Max number of register this layout manages. */ ++ unsigned m_nregs; ++ ++ /* Incoming offset from 16-byte alignment. */ ++ HOST_WIDE_INT m_stack_align_off_in; ++ ++ /* Register order and offsets. */ ++ struct reginfo m_regs[MAX_REGS]; ++ ++ /* Lazy-inited cache of symbol names for stubs. */ ++ static char s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT] ++ [STUB_NAME_MAX_LEN]; ++ ++ static const xlogue_layout s_instances[XLOGUE_SET_COUNT]; ++}; ++ ++namespace { ++ ++class scalar_chain ++{ ++ public: ++ scalar_chain (); ++ virtual ~scalar_chain (); ++ ++ static unsigned max_id; ++ ++ /* ID of a chain. */ ++ unsigned int chain_id; ++ /* A queue of instructions to be included into a chain. */ ++ bitmap queue; ++ /* Instructions included into a chain. */ ++ bitmap insns; ++ /* All registers defined by a chain. */ ++ bitmap defs; ++ /* Registers used in both vector and sclar modes. */ ++ bitmap defs_conv; ++ ++ void build (bitmap candidates, unsigned insn_uid); ++ virtual int compute_convert_gain () = 0; ++ int convert (); ++ ++ protected: ++ void add_to_queue (unsigned insn_uid); ++ void emit_conversion_insns (rtx insns, rtx_insn *pos); ++ ++ private: ++ void add_insn (bitmap candidates, unsigned insn_uid); ++ void analyze_register_chain (bitmap candidates, df_ref ref); ++ virtual void mark_dual_mode_def (df_ref def) = 0; ++ virtual void convert_insn (rtx_insn *insn) = 0; ++ virtual void convert_registers () = 0; ++}; ++ ++class dimode_scalar_chain : public scalar_chain ++{ ++ public: ++ int compute_convert_gain (); ++ private: ++ void mark_dual_mode_def (df_ref def); ++ rtx replace_with_subreg (rtx x, rtx reg, rtx subreg); ++ void replace_with_subreg_in_insn (rtx_insn *insn, rtx reg, rtx subreg); ++ void convert_insn (rtx_insn *insn); ++ void convert_op (rtx *op, rtx_insn *insn); ++ void convert_reg (unsigned regno); ++ void make_vector_copies (unsigned regno); ++ void convert_registers (); ++ int vector_const_cost (rtx exp); ++}; ++ ++class timode_scalar_chain : public scalar_chain ++{ ++ public: ++ /* Convert from TImode to V1TImode is always faster. 
*/ ++ int compute_convert_gain () { return 1; } ++ ++ private: ++ void mark_dual_mode_def (df_ref def); ++ void fix_debug_reg_uses (rtx reg); ++ void convert_insn (rtx_insn *insn); ++ /* We don't convert registers to difference size. */ ++ void convert_registers () {} ++}; ++ ++} // anon namespace ++ ++bool ix86_save_reg (unsigned int regno, bool maybe_eh_return, bool ignore_outlined); ++int ix86_compare_version_priority (tree decl1, tree decl2); ++tree ix86_generate_version_dispatcher_body (void *node_p); ++tree ix86_get_function_versions_dispatcher (void *decl); ++tree ix86_mangle_decl_assembler_name (tree decl, tree id); ++ ++ ++#endif /* GCC_I386_FEATURES_H */ +diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c +new file mode 100644 +index 000000000..4a03bead8 +--- /dev/null ++++ b/gcc/config/i386/i386-options.c +@@ -0,0 +1,3707 @@ ++/* Copyright (C) 1988-2019 Free Software Foundation, Inc. ++ ++This file is part of GCC. ++ ++GCC is free software; you can redistribute it and/or modify ++it under the terms of the GNU General Public License as published by ++the Free Software Foundation; either version 3, or (at your option) ++any later version. ++ ++GCC is distributed in the hope that it will be useful, ++but WITHOUT ANY WARRANTY; without even the implied warranty of ++MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++GNU General Public License for more details. ++ ++You should have received a copy of the GNU General Public License ++along with GCC; see the file COPYING3. If not see ++. */ ++ ++#define IN_TARGET_CODE 1 ++ ++#include "config.h" ++#include "system.h" ++#include "coretypes.h" ++#include "backend.h" ++#include "rtl.h" ++#include "tree.h" ++#include "memmodel.h" ++#include "gimple.h" ++#include "cfghooks.h" ++#include "cfgloop.h" ++#include "df.h" ++#include "tm_p.h" ++#include "stringpool.h" ++#include "expmed.h" ++#include "optabs.h" ++#include "regs.h" ++#include "emit-rtl.h" ++#include "recog.h" ++#include "cgraph.h" ++#include "diagnostic.h" ++#include "cfgbuild.h" ++#include "alias.h" ++#include "fold-const.h" ++#include "attribs.h" ++#include "calls.h" ++#include "stor-layout.h" ++#include "varasm.h" ++#include "output.h" ++#include "insn-attr.h" ++#include "flags.h" ++#include "except.h" ++#include "explow.h" ++#include "expr.h" ++#include "cfgrtl.h" ++#include "common/common-target.h" ++#include "langhooks.h" ++#include "reload.h" ++#include "gimplify.h" ++#include "dwarf2.h" ++#include "tm-constrs.h" ++#include "params.h" ++#include "cselib.h" ++#include "sched-int.h" ++#include "opts.h" ++#include "tree-pass.h" ++#include "context.h" ++#include "pass_manager.h" ++#include "target-globals.h" ++#include "gimple-iterator.h" ++#include "tree-vectorizer.h" ++#include "shrink-wrap.h" ++#include "builtins.h" ++#include "rtl-iter.h" ++#include "tree-iterator.h" ++#include "dbgcnt.h" ++#include "case-cfn-macros.h" ++#include "dojump.h" ++#include "fold-const-call.h" ++#include "tree-vrp.h" ++#include "tree-ssanames.h" ++#include "selftest.h" ++#include "selftest-rtl.h" ++#include "print-rtl.h" ++#include "intl.h" ++#include "ifcvt.h" ++#include "symbol-summary.h" ++#include "ipa-prop.h" ++#include "ipa-fnsummary.h" ++#include "wide-int-bitmask.h" ++#include "tree-vector-builder.h" ++#include "debug.h" ++#include "dwarf2out.h" ++#include "i386-options.h" ++ ++#include "x86-tune-costs.h" ++ ++#ifndef SUBTARGET32_DEFAULT_CPU ++#define SUBTARGET32_DEFAULT_CPU "i386" ++#endif ++ ++/* Processor feature/optimization bitmasks. 
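++   Each m_<CPU> mask is the bit for one enum processor_type value, roughly
++
++     #define m_386  (HOST_WIDE_INT_1U << PROCESSOR_I386)   // illustrative form
++
++   so a tuning entry can name the processors it applies to as a bitwise OR
++   such as (m_SKYLAKE | m_GENERIC).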
*/ ++#define m_386 (HOST_WIDE_INT_1U< 70) ++ { ++ *ptr++ = '\\'; ++ *ptr++ = '\n'; ++ line_len = 0; ++ } ++ } ++ ++ for (j = 0; j < 2; j++) ++ if (opts[i][j]) ++ { ++ memcpy (ptr, opts[i][j], len2[j]); ++ ptr += len2[j]; ++ line_len += len2[j]; ++ } ++ } ++ ++ *ptr = '\0'; ++ gcc_assert (ret + len >= ptr); ++ ++ return ret; ++} ++ ++/* Function that is callable from the debugger to print the current ++ options. */ ++void ATTRIBUTE_UNUSED ++ix86_debug_options (void) ++{ ++ char *opts = ix86_target_string (ix86_isa_flags, ix86_isa_flags2, ++ target_flags, ix86_target_flags, ++ ix86_arch_string,ix86_tune_string, ++ ix86_fpmath, true, true); ++ ++ if (opts) ++ { ++ fprintf (stderr, "%s\n\n", opts); ++ free (opts); ++ } ++ else ++ fputs ("\n\n", stderr); ++ ++ return; ++} ++ ++/* Save the current options */ ++ ++void ++ix86_function_specific_save (struct cl_target_option *ptr, ++ struct gcc_options *opts) ++{ ++ ptr->arch = ix86_arch; ++ ptr->schedule = ix86_schedule; ++ ptr->prefetch_sse = x86_prefetch_sse; ++ ptr->tune = ix86_tune; ++ ptr->branch_cost = ix86_branch_cost; ++ ptr->tune_defaulted = ix86_tune_defaulted; ++ ptr->arch_specified = ix86_arch_specified; ++ ptr->x_ix86_isa_flags_explicit = opts->x_ix86_isa_flags_explicit; ++ ptr->x_ix86_isa_flags2_explicit = opts->x_ix86_isa_flags2_explicit; ++ ptr->x_recip_mask_explicit = opts->x_recip_mask_explicit; ++ ptr->x_ix86_arch_string = opts->x_ix86_arch_string; ++ ptr->x_ix86_tune_string = opts->x_ix86_tune_string; ++ ptr->x_ix86_cmodel = opts->x_ix86_cmodel; ++ ptr->x_ix86_abi = opts->x_ix86_abi; ++ ptr->x_ix86_asm_dialect = opts->x_ix86_asm_dialect; ++ ptr->x_ix86_branch_cost = opts->x_ix86_branch_cost; ++ ptr->x_ix86_dump_tunes = opts->x_ix86_dump_tunes; ++ ptr->x_ix86_force_align_arg_pointer = opts->x_ix86_force_align_arg_pointer; ++ ptr->x_ix86_force_drap = opts->x_ix86_force_drap; ++ ptr->x_ix86_incoming_stack_boundary_arg = opts->x_ix86_incoming_stack_boundary_arg; ++ ptr->x_ix86_pmode = opts->x_ix86_pmode; ++ ptr->x_ix86_preferred_stack_boundary_arg = opts->x_ix86_preferred_stack_boundary_arg; ++ ptr->x_ix86_recip_name = opts->x_ix86_recip_name; ++ ptr->x_ix86_regparm = opts->x_ix86_regparm; ++ ptr->x_ix86_section_threshold = opts->x_ix86_section_threshold; ++ ptr->x_ix86_sse2avx = opts->x_ix86_sse2avx; ++ ptr->x_ix86_stack_protector_guard = opts->x_ix86_stack_protector_guard; ++ ptr->x_ix86_stringop_alg = opts->x_ix86_stringop_alg; ++ ptr->x_ix86_tls_dialect = opts->x_ix86_tls_dialect; ++ ptr->x_ix86_tune_ctrl_string = opts->x_ix86_tune_ctrl_string; ++ ptr->x_ix86_tune_memcpy_strategy = opts->x_ix86_tune_memcpy_strategy; ++ ptr->x_ix86_tune_memset_strategy = opts->x_ix86_tune_memset_strategy; ++ ptr->x_ix86_tune_no_default = opts->x_ix86_tune_no_default; ++ ptr->x_ix86_veclibabi_type = opts->x_ix86_veclibabi_type; ++ ++ /* The fields are char but the variables are not; make sure the ++ values fit in the fields. */ ++ gcc_assert (ptr->arch == ix86_arch); ++ gcc_assert (ptr->schedule == ix86_schedule); ++ gcc_assert (ptr->tune == ix86_tune); ++ gcc_assert (ptr->branch_cost == ix86_branch_cost); ++} ++ ++/* Feature tests against the various architecture variations, used to create ++ ix86_arch_features based on the processor mask. */ ++static unsigned HOST_WIDE_INT initial_ix86_arch_features[X86_ARCH_LAST] = { ++ /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */ ++ ~(m_386 | m_486 | m_PENT | m_LAKEMONT | m_K6), ++ ++ /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. 
*/ ++ ~m_386, ++ ++ /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */ ++ ~(m_386 | m_486), ++ ++ /* X86_ARCH_XADD: Exchange and add was added for 80486. */ ++ ~m_386, ++ ++ /* X86_ARCH_BSWAP: Byteswap was added for 80486. */ ++ ~m_386, ++}; ++ ++/* This table must be in sync with enum processor_type in i386.h. */ ++static const struct processor_costs *processor_cost_table[] = ++{ ++ &generic_cost, ++ &i386_cost, ++ &i486_cost, ++ &pentium_cost, ++ &lakemont_cost, ++ &pentiumpro_cost, ++ &pentium4_cost, ++ &nocona_cost, ++ &core_cost, ++ &core_cost, ++ &core_cost, ++ &core_cost, ++ &atom_cost, ++ &slm_cost, ++ &slm_cost, ++ &slm_cost, ++ &slm_cost, ++ &slm_cost, ++ &slm_cost, ++ &skylake_cost, ++ &skylake_cost, ++ &skylake_cost, ++ &skylake_cost, ++ &skylake_cost, ++ &skylake_cost, ++ &intel_cost, ++ &geode_cost, ++ &k6_cost, ++ &athlon_cost, ++ &k8_cost, ++ &amdfam10_cost, ++ &bdver_cost, ++ &bdver_cost, ++ &bdver_cost, ++ &bdver_cost, ++ &btver1_cost, ++ &btver2_cost, ++ &znver1_cost, ++ &znver2_cost ++}; ++ ++/* Guarantee that the array is aligned with enum processor_type. */ ++STATIC_ASSERT (ARRAY_SIZE (processor_cost_table) == PROCESSOR_max); ++ ++static bool ++ix86_option_override_internal (bool main_args_p, ++ struct gcc_options *opts, ++ struct gcc_options *opts_set); ++static void ++set_ix86_tune_features (enum processor_type ix86_tune, bool dump); ++ ++/* Restore the current options */ ++ ++void ++ix86_function_specific_restore (struct gcc_options *opts, ++ struct cl_target_option *ptr) ++{ ++ enum processor_type old_tune = ix86_tune; ++ enum processor_type old_arch = ix86_arch; ++ unsigned HOST_WIDE_INT ix86_arch_mask; ++ int i; ++ ++ /* We don't change -fPIC. */ ++ opts->x_flag_pic = flag_pic; ++ ++ ix86_arch = (enum processor_type) ptr->arch; ++ ix86_schedule = (enum attr_cpu) ptr->schedule; ++ ix86_tune = (enum processor_type) ptr->tune; ++ x86_prefetch_sse = ptr->prefetch_sse; ++ opts->x_ix86_branch_cost = ptr->branch_cost; ++ ix86_tune_defaulted = ptr->tune_defaulted; ++ ix86_arch_specified = ptr->arch_specified; ++ opts->x_ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit; ++ opts->x_ix86_isa_flags2_explicit = ptr->x_ix86_isa_flags2_explicit; ++ opts->x_recip_mask_explicit = ptr->x_recip_mask_explicit; ++ opts->x_ix86_arch_string = ptr->x_ix86_arch_string; ++ opts->x_ix86_tune_string = ptr->x_ix86_tune_string; ++ opts->x_ix86_cmodel = ptr->x_ix86_cmodel; ++ opts->x_ix86_abi = ptr->x_ix86_abi; ++ opts->x_ix86_asm_dialect = ptr->x_ix86_asm_dialect; ++ opts->x_ix86_branch_cost = ptr->x_ix86_branch_cost; ++ opts->x_ix86_dump_tunes = ptr->x_ix86_dump_tunes; ++ opts->x_ix86_force_align_arg_pointer = ptr->x_ix86_force_align_arg_pointer; ++ opts->x_ix86_force_drap = ptr->x_ix86_force_drap; ++ opts->x_ix86_incoming_stack_boundary_arg = ptr->x_ix86_incoming_stack_boundary_arg; ++ opts->x_ix86_pmode = ptr->x_ix86_pmode; ++ opts->x_ix86_preferred_stack_boundary_arg = ptr->x_ix86_preferred_stack_boundary_arg; ++ opts->x_ix86_recip_name = ptr->x_ix86_recip_name; ++ opts->x_ix86_regparm = ptr->x_ix86_regparm; ++ opts->x_ix86_section_threshold = ptr->x_ix86_section_threshold; ++ opts->x_ix86_sse2avx = ptr->x_ix86_sse2avx; ++ opts->x_ix86_stack_protector_guard = ptr->x_ix86_stack_protector_guard; ++ opts->x_ix86_stringop_alg = ptr->x_ix86_stringop_alg; ++ opts->x_ix86_tls_dialect = ptr->x_ix86_tls_dialect; ++ opts->x_ix86_tune_ctrl_string = ptr->x_ix86_tune_ctrl_string; ++ opts->x_ix86_tune_memcpy_strategy = ptr->x_ix86_tune_memcpy_strategy; ++ 
opts->x_ix86_tune_memset_strategy = ptr->x_ix86_tune_memset_strategy; ++ opts->x_ix86_tune_no_default = ptr->x_ix86_tune_no_default; ++ opts->x_ix86_veclibabi_type = ptr->x_ix86_veclibabi_type; ++ ix86_tune_cost = processor_cost_table[ix86_tune]; ++ /* TODO: ix86_cost should be chosen at instruction or function granuality ++ so for cold code we use size_cost even in !optimize_size compilation. */ ++ if (opts->x_optimize_size) ++ ix86_cost = &ix86_size_cost; ++ else ++ ix86_cost = ix86_tune_cost; ++ ++ /* Recreate the arch feature tests if the arch changed */ ++ if (old_arch != ix86_arch) ++ { ++ ix86_arch_mask = HOST_WIDE_INT_1U << ix86_arch; ++ for (i = 0; i < X86_ARCH_LAST; ++i) ++ ix86_arch_features[i] ++ = !!(initial_ix86_arch_features[i] & ix86_arch_mask); ++ } ++ ++ /* Recreate the tune optimization tests */ ++ if (old_tune != ix86_tune) ++ set_ix86_tune_features (ix86_tune, false); ++} ++ ++/* Adjust target options after streaming them in. This is mainly about ++ reconciling them with global options. */ ++ ++void ++ix86_function_specific_post_stream_in (struct cl_target_option *ptr) ++{ ++ /* flag_pic is a global option, but ix86_cmodel is target saved option ++ partly computed from flag_pic. If flag_pic is on, adjust x_ix86_cmodel ++ for PIC, or error out. */ ++ if (flag_pic) ++ switch (ptr->x_ix86_cmodel) ++ { ++ case CM_SMALL: ++ ptr->x_ix86_cmodel = CM_SMALL_PIC; ++ break; ++ ++ case CM_MEDIUM: ++ ptr->x_ix86_cmodel = CM_MEDIUM_PIC; ++ break; ++ ++ case CM_LARGE: ++ ptr->x_ix86_cmodel = CM_LARGE_PIC; ++ break; ++ ++ case CM_KERNEL: ++ error ("code model %s does not support PIC mode", "kernel"); ++ break; ++ ++ default: ++ break; ++ } ++ else ++ switch (ptr->x_ix86_cmodel) ++ { ++ case CM_SMALL_PIC: ++ ptr->x_ix86_cmodel = CM_SMALL; ++ break; ++ ++ case CM_MEDIUM_PIC: ++ ptr->x_ix86_cmodel = CM_MEDIUM; ++ break; ++ ++ case CM_LARGE_PIC: ++ ptr->x_ix86_cmodel = CM_LARGE; ++ break; ++ ++ default: ++ break; ++ } ++} ++ ++/* Print the current options */ ++ ++void ++ix86_function_specific_print (FILE *file, int indent, ++ struct cl_target_option *ptr) ++{ ++ char *target_string ++ = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_ix86_isa_flags2, ++ ptr->x_target_flags, ptr->x_ix86_target_flags, ++ NULL, NULL, ptr->x_ix86_fpmath, false, true); ++ ++ gcc_assert (ptr->arch < PROCESSOR_max); ++ fprintf (file, "%*sarch = %d (%s)\n", ++ indent, "", ++ ptr->arch, processor_names[ptr->arch]); ++ ++ gcc_assert (ptr->tune < PROCESSOR_max); ++ fprintf (file, "%*stune = %d (%s)\n", ++ indent, "", ++ ptr->tune, processor_names[ptr->tune]); ++ ++ fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost); ++ ++ if (target_string) ++ { ++ fprintf (file, "%*s%s\n", indent, "", target_string); ++ free (target_string); ++ } ++} ++ ++ ++/* Inner function to process the attribute((target(...))), take an argument and ++ set the current options from the argument. If we have a list, recursively go ++ over the list. 
*/ ++ ++static bool ++ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[], ++ struct gcc_options *opts, ++ struct gcc_options *opts_set, ++ struct gcc_options *enum_opts_set, ++ bool target_clone_attr) ++{ ++ char *next_optstr; ++ bool ret = true; ++ ++#define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 } ++#define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 } ++#define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 } ++#define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M } ++#define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M } ++ ++ enum ix86_opt_type ++ { ++ ix86_opt_unknown, ++ ix86_opt_yes, ++ ix86_opt_no, ++ ix86_opt_str, ++ ix86_opt_enum, ++ ix86_opt_isa ++ }; ++ ++ static const struct ++ { ++ const char *string; ++ size_t len; ++ enum ix86_opt_type type; ++ int opt; ++ int mask; ++ } attrs[] = { ++ /* isa options */ ++ IX86_ATTR_ISA ("pconfig", OPT_mpconfig), ++ IX86_ATTR_ISA ("wbnoinvd", OPT_mwbnoinvd), ++ IX86_ATTR_ISA ("sgx", OPT_msgx), ++ IX86_ATTR_ISA ("avx5124fmaps", OPT_mavx5124fmaps), ++ IX86_ATTR_ISA ("avx5124vnniw", OPT_mavx5124vnniw), ++ IX86_ATTR_ISA ("avx512vpopcntdq", OPT_mavx512vpopcntdq), ++ IX86_ATTR_ISA ("avx512vbmi2", OPT_mavx512vbmi2), ++ IX86_ATTR_ISA ("avx512vnni", OPT_mavx512vnni), ++ IX86_ATTR_ISA ("avx512bitalg", OPT_mavx512bitalg), ++ ++ IX86_ATTR_ISA ("avx512vbmi", OPT_mavx512vbmi), ++ IX86_ATTR_ISA ("avx512ifma", OPT_mavx512ifma), ++ IX86_ATTR_ISA ("avx512vl", OPT_mavx512vl), ++ IX86_ATTR_ISA ("avx512bw", OPT_mavx512bw), ++ IX86_ATTR_ISA ("avx512dq", OPT_mavx512dq), ++ IX86_ATTR_ISA ("avx512er", OPT_mavx512er), ++ IX86_ATTR_ISA ("avx512pf", OPT_mavx512pf), ++ IX86_ATTR_ISA ("avx512cd", OPT_mavx512cd), ++ IX86_ATTR_ISA ("avx512f", OPT_mavx512f), ++ IX86_ATTR_ISA ("avx2", OPT_mavx2), ++ IX86_ATTR_ISA ("fma", OPT_mfma), ++ IX86_ATTR_ISA ("xop", OPT_mxop), ++ IX86_ATTR_ISA ("fma4", OPT_mfma4), ++ IX86_ATTR_ISA ("f16c", OPT_mf16c), ++ IX86_ATTR_ISA ("avx", OPT_mavx), ++ IX86_ATTR_ISA ("sse4", OPT_msse4), ++ IX86_ATTR_ISA ("sse4.2", OPT_msse4_2), ++ IX86_ATTR_ISA ("sse4.1", OPT_msse4_1), ++ IX86_ATTR_ISA ("sse4a", OPT_msse4a), ++ IX86_ATTR_ISA ("ssse3", OPT_mssse3), ++ IX86_ATTR_ISA ("sse3", OPT_msse3), ++ IX86_ATTR_ISA ("aes", OPT_maes), ++ IX86_ATTR_ISA ("sha", OPT_msha), ++ IX86_ATTR_ISA ("pclmul", OPT_mpclmul), ++ IX86_ATTR_ISA ("sse2", OPT_msse2), ++ IX86_ATTR_ISA ("sse", OPT_msse), ++ IX86_ATTR_ISA ("3dnowa", OPT_m3dnowa), ++ IX86_ATTR_ISA ("3dnow", OPT_m3dnow), ++ IX86_ATTR_ISA ("mmx", OPT_mmmx), ++ IX86_ATTR_ISA ("rtm", OPT_mrtm), ++ IX86_ATTR_ISA ("prfchw", OPT_mprfchw), ++ IX86_ATTR_ISA ("rdseed", OPT_mrdseed), ++ IX86_ATTR_ISA ("adx", OPT_madx), ++ IX86_ATTR_ISA ("prefetchwt1", OPT_mprefetchwt1), ++ IX86_ATTR_ISA ("clflushopt", OPT_mclflushopt), ++ IX86_ATTR_ISA ("xsaves", OPT_mxsaves), ++ IX86_ATTR_ISA ("xsavec", OPT_mxsavec), ++ IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt), ++ IX86_ATTR_ISA ("xsave", OPT_mxsave), ++ IX86_ATTR_ISA ("abm", OPT_mabm), ++ IX86_ATTR_ISA ("bmi", OPT_mbmi), ++ IX86_ATTR_ISA ("bmi2", OPT_mbmi2), ++ IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt), ++ IX86_ATTR_ISA ("tbm", OPT_mtbm), ++ IX86_ATTR_ISA ("popcnt", OPT_mpopcnt), ++ IX86_ATTR_ISA ("cx16", OPT_mcx16), ++ IX86_ATTR_ISA ("sahf", OPT_msahf), ++ IX86_ATTR_ISA ("movbe", OPT_mmovbe), ++ IX86_ATTR_ISA ("crc32", OPT_mcrc32), ++ IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase), ++ IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd), ++ IX86_ATTR_ISA ("mwaitx", OPT_mmwaitx), ++ IX86_ATTR_ISA ("clzero", OPT_mclzero), ++ 
IX86_ATTR_ISA ("pku", OPT_mpku), ++ IX86_ATTR_ISA ("lwp", OPT_mlwp), ++ IX86_ATTR_ISA ("hle", OPT_mhle), ++ IX86_ATTR_ISA ("fxsr", OPT_mfxsr), ++ IX86_ATTR_ISA ("clwb", OPT_mclwb), ++ IX86_ATTR_ISA ("rdpid", OPT_mrdpid), ++ IX86_ATTR_ISA ("gfni", OPT_mgfni), ++ IX86_ATTR_ISA ("shstk", OPT_mshstk), ++ IX86_ATTR_ISA ("vaes", OPT_mvaes), ++ IX86_ATTR_ISA ("vpclmulqdq", OPT_mvpclmulqdq), ++ IX86_ATTR_ISA ("movdiri", OPT_mmovdiri), ++ IX86_ATTR_ISA ("movdir64b", OPT_mmovdir64b), ++ IX86_ATTR_ISA ("waitpkg", OPT_mwaitpkg), ++ IX86_ATTR_ISA ("cldemote", OPT_mcldemote), ++ IX86_ATTR_ISA ("ptwrite", OPT_mptwrite), ++ ++ /* enum options */ ++ IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_), ++ ++ /* string options */ ++ IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH), ++ IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE), ++ ++ /* flag options */ ++ IX86_ATTR_YES ("cld", ++ OPT_mcld, ++ MASK_CLD), ++ ++ IX86_ATTR_NO ("fancy-math-387", ++ OPT_mfancy_math_387, ++ MASK_NO_FANCY_MATH_387), ++ ++ IX86_ATTR_YES ("ieee-fp", ++ OPT_mieee_fp, ++ MASK_IEEE_FP), ++ ++ IX86_ATTR_YES ("inline-all-stringops", ++ OPT_minline_all_stringops, ++ MASK_INLINE_ALL_STRINGOPS), ++ ++ IX86_ATTR_YES ("inline-stringops-dynamically", ++ OPT_minline_stringops_dynamically, ++ MASK_INLINE_STRINGOPS_DYNAMICALLY), ++ ++ IX86_ATTR_NO ("align-stringops", ++ OPT_mno_align_stringops, ++ MASK_NO_ALIGN_STRINGOPS), ++ ++ IX86_ATTR_YES ("recip", ++ OPT_mrecip, ++ MASK_RECIP), ++ }; ++ ++ location_t loc ++ = fndecl == NULL ? UNKNOWN_LOCATION : DECL_SOURCE_LOCATION (fndecl); ++ const char *attr_name = target_clone_attr ? "target_clone" : "target"; ++ ++ /* If this is a list, recurse to get the options. */ ++ if (TREE_CODE (args) == TREE_LIST) ++ { ++ bool ret = true; ++ ++ for (; args; args = TREE_CHAIN (args)) ++ if (TREE_VALUE (args) ++ && !ix86_valid_target_attribute_inner_p (fndecl, TREE_VALUE (args), ++ p_strings, opts, opts_set, ++ enum_opts_set, ++ target_clone_attr)) ++ ret = false; ++ ++ return ret; ++ } ++ ++ else if (TREE_CODE (args) != STRING_CST) ++ { ++ error_at (loc, "attribute %qs argument is not a string", attr_name); ++ return false; ++ } ++ ++ /* Handle multiple arguments separated by commas. */ ++ next_optstr = ASTRDUP (TREE_STRING_POINTER (args)); ++ ++ while (next_optstr && *next_optstr != '\0') ++ { ++ char *p = next_optstr; ++ char *orig_p = p; ++ char *comma = strchr (next_optstr, ','); ++ size_t len, opt_len; ++ int opt; ++ bool opt_set_p; ++ char ch; ++ unsigned i; ++ enum ix86_opt_type type = ix86_opt_unknown; ++ int mask = 0; ++ ++ if (comma) ++ { ++ *comma = '\0'; ++ len = comma - next_optstr; ++ next_optstr = comma + 1; ++ } ++ else ++ { ++ len = strlen (p); ++ next_optstr = NULL; ++ } ++ ++ /* Recognize no-xxx. */ ++ if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-') ++ { ++ opt_set_p = false; ++ p += 3; ++ len -= 3; ++ } ++ else ++ opt_set_p = true; ++ ++ /* Find the option. */ ++ ch = *p; ++ opt = N_OPTS; ++ for (i = 0; i < ARRAY_SIZE (attrs); i++) ++ { ++ type = attrs[i].type; ++ opt_len = attrs[i].len; ++ if (ch == attrs[i].string[0] ++ && ((type != ix86_opt_str && type != ix86_opt_enum) ++ ? len == opt_len ++ : len > opt_len) ++ && memcmp (p, attrs[i].string, opt_len) == 0) ++ { ++ opt = attrs[i].opt; ++ mask = attrs[i].mask; ++ break; ++ } ++ } ++ ++ /* Process the option. 
*/ ++ if (opt == N_OPTS) ++ { ++ error_at (loc, "attribute %qs argument %qs is unknown", ++ orig_p, attr_name); ++ ret = false; ++ } ++ ++ else if (type == ix86_opt_isa) ++ { ++ struct cl_decoded_option decoded; ++ ++ generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded); ++ ix86_handle_option (opts, opts_set, ++ &decoded, input_location); ++ } ++ ++ else if (type == ix86_opt_yes || type == ix86_opt_no) ++ { ++ if (type == ix86_opt_no) ++ opt_set_p = !opt_set_p; ++ ++ if (opt_set_p) ++ opts->x_target_flags |= mask; ++ else ++ opts->x_target_flags &= ~mask; ++ } ++ ++ else if (type == ix86_opt_str) ++ { ++ if (p_strings[opt]) ++ { ++ error_at (loc, "attribute value %qs was already specified " ++ "in %qs attribute", orig_p, attr_name); ++ ret = false; ++ } ++ else ++ { ++ p_strings[opt] = xstrdup (p + opt_len); ++ if (opt == IX86_FUNCTION_SPECIFIC_ARCH) ++ { ++ /* If arch= is set, clear all bits in x_ix86_isa_flags, ++ except for ISA_64BIT, ABI_64, ABI_X32, and CODE16 ++ and all bits in x_ix86_isa_flags2. */ ++ opts->x_ix86_isa_flags &= (OPTION_MASK_ISA_64BIT ++ | OPTION_MASK_ABI_64 ++ | OPTION_MASK_ABI_X32 ++ | OPTION_MASK_CODE16); ++ opts->x_ix86_isa_flags_explicit &= (OPTION_MASK_ISA_64BIT ++ | OPTION_MASK_ABI_64 ++ | OPTION_MASK_ABI_X32 ++ | OPTION_MASK_CODE16); ++ opts->x_ix86_isa_flags2 = 0; ++ opts->x_ix86_isa_flags2_explicit = 0; ++ } ++ } ++ } ++ ++ else if (type == ix86_opt_enum) ++ { ++ bool arg_ok; ++ int value; ++ ++ arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET); ++ if (arg_ok) ++ set_option (opts, enum_opts_set, opt, value, ++ p + opt_len, DK_UNSPECIFIED, input_location, ++ global_dc); ++ else ++ { ++ error_at (loc, "attribute value %qs is unknown in %qs attribute", ++ orig_p, attr_name); ++ ret = false; ++ } ++ } ++ ++ else ++ gcc_unreachable (); ++ } ++ ++ return ret; ++} ++ ++/* Release allocated strings. */ ++static void ++release_options_strings (char **option_strings) ++{ ++ /* Free up memory allocated to hold the strings */ ++ for (unsigned i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++) ++ free (option_strings[i]); ++} ++ ++/* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */ ++ ++tree ++ix86_valid_target_attribute_tree (tree fndecl, tree args, ++ struct gcc_options *opts, ++ struct gcc_options *opts_set, ++ bool target_clone_attr) ++{ ++ const char *orig_arch_string = opts->x_ix86_arch_string; ++ const char *orig_tune_string = opts->x_ix86_tune_string; ++ enum fpmath_unit orig_fpmath_set = opts_set->x_ix86_fpmath; ++ int orig_tune_defaulted = ix86_tune_defaulted; ++ int orig_arch_specified = ix86_arch_specified; ++ char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL }; ++ tree t = NULL_TREE; ++ struct cl_target_option *def ++ = TREE_TARGET_OPTION (target_option_default_node); ++ struct gcc_options enum_opts_set; ++ ++ memset (&enum_opts_set, 0, sizeof (enum_opts_set)); ++ ++ /* Process each of the options on the chain. */ ++ if (!ix86_valid_target_attribute_inner_p (fndecl, args, option_strings, opts, ++ opts_set, &enum_opts_set, ++ target_clone_attr)) ++ return error_mark_node; ++ ++ /* If the changed options are different from the default, rerun ++ ix86_option_override_internal, and then save the options away. ++ The string options are attribute options, and will be undone ++ when we copy the save structure. 
*/ ++ if (opts->x_ix86_isa_flags != def->x_ix86_isa_flags ++ || opts->x_ix86_isa_flags2 != def->x_ix86_isa_flags2 ++ || opts->x_target_flags != def->x_target_flags ++ || option_strings[IX86_FUNCTION_SPECIFIC_ARCH] ++ || option_strings[IX86_FUNCTION_SPECIFIC_TUNE] ++ || enum_opts_set.x_ix86_fpmath) ++ { ++ /* If we are using the default tune= or arch=, undo the string assigned, ++ and use the default. */ ++ if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]) ++ opts->x_ix86_arch_string ++ = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]); ++ else if (!orig_arch_specified) ++ opts->x_ix86_arch_string = NULL; ++ ++ if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]) ++ opts->x_ix86_tune_string ++ = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]); ++ else if (orig_tune_defaulted) ++ opts->x_ix86_tune_string = NULL; ++ ++ /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */ ++ if (enum_opts_set.x_ix86_fpmath) ++ opts_set->x_ix86_fpmath = (enum fpmath_unit) 1; ++ ++ /* Do any overrides, such as arch=xxx, or tune=xxx support. */ ++ bool r = ix86_option_override_internal (false, opts, opts_set); ++ if (!r) ++ { ++ release_options_strings (option_strings); ++ return error_mark_node; ++ } ++ ++ /* Add any builtin functions with the new isa if any. */ ++ ix86_add_new_builtins (opts->x_ix86_isa_flags, opts->x_ix86_isa_flags2); ++ ++ /* Save the current options unless we are validating options for ++ #pragma. */ ++ t = build_target_option_node (opts); ++ ++ opts->x_ix86_arch_string = orig_arch_string; ++ opts->x_ix86_tune_string = orig_tune_string; ++ opts_set->x_ix86_fpmath = orig_fpmath_set; ++ ++ release_options_strings (option_strings); ++ } ++ ++ return t; ++} ++ ++/* Hook to validate attribute((target("string"))). */ ++ ++bool ++ix86_valid_target_attribute_p (tree fndecl, ++ tree ARG_UNUSED (name), ++ tree args, ++ int flags) ++{ ++ struct gcc_options func_options; ++ tree new_target, new_optimize; ++ bool ret = true; ++ ++ /* attribute((target("default"))) does nothing, beyond ++ affecting multi-versioning. */ ++ if (TREE_VALUE (args) ++ && TREE_CODE (TREE_VALUE (args)) == STRING_CST ++ && TREE_CHAIN (args) == NULL_TREE ++ && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0) ++ return true; ++ ++ tree old_optimize = build_optimization_node (&global_options); ++ ++ /* Get the optimization options of the current function. */ ++ tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl); ++ ++ if (!func_optimize) ++ func_optimize = old_optimize; ++ ++ /* Init func_options. */ ++ memset (&func_options, 0, sizeof (func_options)); ++ init_options_struct (&func_options, NULL); ++ lang_hooks.init_options_struct (&func_options); ++ ++ cl_optimization_restore (&func_options, ++ TREE_OPTIMIZATION (func_optimize)); ++ ++ /* Initialize func_options to the default before its target options can ++ be set. */ ++ cl_target_option_restore (&func_options, ++ TREE_TARGET_OPTION (target_option_default_node)); ++ ++ /* FLAGS == 1 is used for target_clones attribute. 
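For orientation, these are the strings that arrive here from user code; the declarations below are ordinary GCC usage examples (not taken from the patch) of the target and target_clones attributes that this hook and ix86_valid_target_attribute_tree validate.

    /* Per-function ISA overrides: enable AVX2, disable SSE4A for one function.  */
    __attribute__ ((target ("avx2,no-sse4a")))
    void fast_path (float *dst, const float *src, int n)
    {
      for (int i = 0; i < n; i++)
        dst[i] = src[i] * 2.0f;
    }

    /* arch= and tune= carry a string argument after the '='.  */
    __attribute__ ((target ("arch=haswell,tune=generic")))
    void tuned_path (void)
    {
    }

    /* target_clones goes through the same parser with target_clone_attr set;
       it additionally needs ifunc support at link time.  */
    __attribute__ ((target_clones ("default", "avx2")))
    void cloned_path (void)
    {
    }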
*/ ++ new_target ++ = ix86_valid_target_attribute_tree (fndecl, args, &func_options, ++ &global_options_set, flags == 1); ++ ++ new_optimize = build_optimization_node (&func_options); ++ ++ if (new_target == error_mark_node) ++ ret = false; ++ ++ else if (fndecl && new_target) ++ { ++ DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target; ++ ++ if (old_optimize != new_optimize) ++ DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize; ++ } ++ ++ finalize_options_struct (&func_options); ++ ++ return ret; ++} ++ ++const char *stringop_alg_names[] = { ++#define DEF_ENUM ++#define DEF_ALG(alg, name) #name, ++#include "stringop.def" ++#undef DEF_ENUM ++#undef DEF_ALG ++}; ++ ++/* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=. ++ The string is of the following form (or comma separated list of it): ++ ++ strategy_alg:max_size:[align|noalign] ++ ++ where the full size range for the strategy is either [0, max_size] or ++ [min_size, max_size], in which min_size is the max_size + 1 of the ++ preceding range. The last size range must have max_size == -1. ++ ++ Examples: ++ ++ 1. ++ -mmemcpy-strategy=libcall:-1:noalign ++ ++ this is equivalent to (for known size memcpy) -mstringop-strategy=libcall ++ ++ ++ 2. ++ -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign ++ ++ This is to tell the compiler to use the following strategy for memset ++ 1) when the expected size is between [1, 16], use rep_8byte strategy; ++ 2) when the size is between [17, 2048], use vector_loop; ++ 3) when the size is > 2048, use libcall. */ ++ ++struct stringop_size_range ++{ ++ int max; ++ stringop_alg alg; ++ bool noalign; ++}; ++ ++static void ++ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset) ++{ ++ const struct stringop_algs *default_algs; ++ stringop_size_range input_ranges[MAX_STRINGOP_ALGS]; ++ char *curr_range_str, *next_range_str; ++ const char *opt = is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="; ++ int i = 0, n = 0; ++ ++ if (is_memset) ++ default_algs = &ix86_cost->memset[TARGET_64BIT != 0]; ++ else ++ default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0]; ++ ++ curr_range_str = strategy_str; ++ ++ do ++ { ++ int maxs; ++ char alg_name[128]; ++ char align[16]; ++ next_range_str = strchr (curr_range_str, ','); ++ if (next_range_str) ++ *next_range_str++ = '\0'; ++ ++ if (sscanf (curr_range_str, "%20[^:]:%d:%10s", alg_name, &maxs, ++ align) != 3) ++ { ++ error ("wrong argument %qs to option %qs", curr_range_str, opt); ++ return; ++ } ++ ++ if (n > 0 && (maxs < (input_ranges[n - 1].max + 1) && maxs != -1)) ++ { ++ error ("size ranges of option %qs should be increasing", opt); ++ return; ++ } ++ ++ for (i = 0; i < last_alg; i++) ++ if (!strcmp (alg_name, stringop_alg_names[i])) ++ break; ++ ++ if (i == last_alg) ++ { ++ error ("wrong strategy name %qs specified for option %qs", ++ alg_name, opt); ++ ++ auto_vec candidates; ++ for (i = 0; i < last_alg; i++) ++ if ((stringop_alg) i != rep_prefix_8_byte || TARGET_64BIT) ++ candidates.safe_push (stringop_alg_names[i]); ++ ++ char *s; ++ const char *hint ++ = candidates_list_and_hint (alg_name, s, candidates); ++ if (hint) ++ inform (input_location, ++ "valid arguments to %qs are: %s; did you mean %qs?", ++ opt, s, hint); ++ else ++ inform (input_location, "valid arguments to %qs are: %s", ++ opt, s); ++ XDELETEVEC (s); ++ return; ++ } ++ ++ if ((stringop_alg) i == rep_prefix_8_byte ++ && !TARGET_64BIT) ++ { ++ /* rep; movq isn't available in 32-bit code. 
*/ ++ error ("strategy name %qs specified for option %qs " ++ "not supported for 32-bit code", alg_name, opt); ++ return; ++ } ++ ++ input_ranges[n].max = maxs; ++ input_ranges[n].alg = (stringop_alg) i; ++ if (!strcmp (align, "align")) ++ input_ranges[n].noalign = false; ++ else if (!strcmp (align, "noalign")) ++ input_ranges[n].noalign = true; ++ else ++ { ++ error ("unknown alignment %qs specified for option %qs", align, opt); ++ return; ++ } ++ n++; ++ curr_range_str = next_range_str; ++ } ++ while (curr_range_str); ++ ++ if (input_ranges[n - 1].max != -1) ++ { ++ error ("the max value for the last size range should be -1" ++ " for option %qs", opt); ++ return; ++ } ++ ++ if (n > MAX_STRINGOP_ALGS) ++ { ++ error ("too many size ranges specified in option %qs", opt); ++ return; ++ } ++ ++ /* Now override the default algs array. */ ++ for (i = 0; i < n; i++) ++ { ++ *const_cast(&default_algs->size[i].max) = input_ranges[i].max; ++ *const_cast(&default_algs->size[i].alg) ++ = input_ranges[i].alg; ++ *const_cast(&default_algs->size[i].noalign) ++ = input_ranges[i].noalign; ++ } ++} ++ ++ ++/* parse -mtune-ctrl= option. When DUMP is true, ++ print the features that are explicitly set. */ ++ ++static void ++parse_mtune_ctrl_str (bool dump) ++{ ++ if (!ix86_tune_ctrl_string) ++ return; ++ ++ char *next_feature_string = NULL; ++ char *curr_feature_string = xstrdup (ix86_tune_ctrl_string); ++ char *orig = curr_feature_string; ++ int i; ++ do ++ { ++ bool clear = false; ++ ++ next_feature_string = strchr (curr_feature_string, ','); ++ if (next_feature_string) ++ *next_feature_string++ = '\0'; ++ if (*curr_feature_string == '^') ++ { ++ curr_feature_string++; ++ clear = true; ++ } ++ for (i = 0; i < X86_TUNE_LAST; i++) ++ { ++ if (!strcmp (curr_feature_string, ix86_tune_feature_names[i])) ++ { ++ ix86_tune_features[i] = !clear; ++ if (dump) ++ fprintf (stderr, "Explicitly %s feature %s\n", ++ clear ? "clear" : "set", ix86_tune_feature_names[i]); ++ break; ++ } ++ } ++ if (i == X86_TUNE_LAST) ++ error ("unknown parameter to option %<-mtune-ctrl%>: %s", ++ clear ? curr_feature_string - 1 : curr_feature_string); ++ curr_feature_string = next_feature_string; ++ } ++ while (curr_feature_string); ++ free (orig); ++} ++ ++/* Helper function to set ix86_tune_features. IX86_TUNE is the ++ processor type. */ ++ ++static void ++set_ix86_tune_features (enum processor_type ix86_tune, bool dump) ++{ ++ unsigned HOST_WIDE_INT ix86_tune_mask = HOST_WIDE_INT_1U << ix86_tune; ++ int i; ++ ++ for (i = 0; i < X86_TUNE_LAST; ++i) ++ { ++ if (ix86_tune_no_default) ++ ix86_tune_features[i] = 0; ++ else ++ ix86_tune_features[i] ++ = !!(initial_ix86_tune_features[i] & ix86_tune_mask); ++ } ++ ++ if (dump) ++ { ++ fprintf (stderr, "List of x86 specific tuning parameter names:\n"); ++ for (i = 0; i < X86_TUNE_LAST; i++) ++ fprintf (stderr, "%s : %s\n", ix86_tune_feature_names[i], ++ ix86_tune_features[i] ? "on" : "off"); ++ } ++ ++ parse_mtune_ctrl_str (dump); ++} ++ ++ ++/* Default align_* from the processor table. */ ++ ++static void ++ix86_default_align (struct gcc_options *opts) ++{ ++ /* -falign-foo without argument: supply one. 
*/ ++ if (opts->x_flag_align_loops && !opts->x_str_align_loops) ++ opts->x_str_align_loops = processor_cost_table[ix86_tune]->align_loop; ++ if (opts->x_flag_align_jumps && !opts->x_str_align_jumps) ++ opts->x_str_align_jumps = processor_cost_table[ix86_tune]->align_jump; ++ if (opts->x_flag_align_labels && !opts->x_str_align_labels) ++ opts->x_str_align_labels = processor_cost_table[ix86_tune]->align_label; ++ if (opts->x_flag_align_functions && !opts->x_str_align_functions) ++ opts->x_str_align_functions = processor_cost_table[ix86_tune]->align_func; ++} ++ ++/* Implement TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE hook. */ ++ ++void ++ix86_override_options_after_change (void) ++{ ++ ix86_default_align (&global_options); ++} ++ ++/* Clear stack slot assignments remembered from previous functions. ++ This is called from INIT_EXPANDERS once before RTL is emitted for each ++ function. */ ++ ++static struct machine_function * ++ix86_init_machine_status (void) ++{ ++ struct machine_function *f; ++ ++ f = ggc_cleared_alloc (); ++ f->call_abi = ix86_abi; ++ ++ return f; ++} ++ ++/* Override various settings based on options. If MAIN_ARGS_P, the ++ options are from the command line, otherwise they are from ++ attributes. Return true if there's an error related to march ++ option. */ ++ ++static bool ++ix86_option_override_internal (bool main_args_p, ++ struct gcc_options *opts, ++ struct gcc_options *opts_set) ++{ ++ int i; ++ unsigned HOST_WIDE_INT ix86_arch_mask; ++ const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL); ++ ++ /* -mrecip options. */ ++ static struct ++ { ++ const char *string; /* option name */ ++ unsigned int mask; /* mask bits to set */ ++ } ++ const recip_options[] = ++ { ++ { "all", RECIP_MASK_ALL }, ++ { "none", RECIP_MASK_NONE }, ++ { "div", RECIP_MASK_DIV }, ++ { "sqrt", RECIP_MASK_SQRT }, ++ { "vec-div", RECIP_MASK_VEC_DIV }, ++ { "vec-sqrt", RECIP_MASK_VEC_SQRT }, ++ }; ++ ++ ++ /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if ++ TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */ ++ if (TARGET_64BIT_DEFAULT && !TARGET_64BIT_P (opts->x_ix86_isa_flags)) ++ opts->x_ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32); ++#ifdef TARGET_BI_ARCH ++ else ++ { ++#if TARGET_BI_ARCH == 1 ++ /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64 ++ is on and OPTION_MASK_ABI_X32 is off. We turn off ++ OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by ++ -mx32. */ ++ if (TARGET_X32_P (opts->x_ix86_isa_flags)) ++ opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64; ++#else ++ /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is ++ on and OPTION_MASK_ABI_64 is off. We turn off ++ OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by ++ -m64 or OPTION_MASK_CODE16 is turned on by -m16. */ ++ if (TARGET_LP64_P (opts->x_ix86_isa_flags) ++ || TARGET_16BIT_P (opts->x_ix86_isa_flags)) ++ opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32; ++#endif ++ if (TARGET_64BIT_P (opts->x_ix86_isa_flags) ++ && TARGET_IAMCU_P (opts->x_target_flags)) ++ sorry ("Intel MCU psABI isn%'t supported in %s mode", ++ TARGET_X32_P (opts->x_ix86_isa_flags) ? "x32" : "64-bit"); ++ } ++#endif ++ ++ if (TARGET_X32_P (opts->x_ix86_isa_flags)) ++ { ++ /* Always turn on OPTION_MASK_ISA_64BIT and turn off ++ OPTION_MASK_ABI_64 for TARGET_X32. 
*/ ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT; ++ opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64; ++ } ++ else if (TARGET_16BIT_P (opts->x_ix86_isa_flags)) ++ opts->x_ix86_isa_flags &= ~(OPTION_MASK_ISA_64BIT ++ | OPTION_MASK_ABI_X32 ++ | OPTION_MASK_ABI_64); ++ else if (TARGET_LP64_P (opts->x_ix86_isa_flags)) ++ { ++ /* Always turn on OPTION_MASK_ISA_64BIT and turn off ++ OPTION_MASK_ABI_X32 for TARGET_LP64. */ ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT; ++ opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32; ++ } ++ ++#ifdef SUBTARGET_OVERRIDE_OPTIONS ++ SUBTARGET_OVERRIDE_OPTIONS; ++#endif ++ ++#ifdef SUBSUBTARGET_OVERRIDE_OPTIONS ++ SUBSUBTARGET_OVERRIDE_OPTIONS; ++#endif ++ ++ /* -fPIC is the default for x86_64. */ ++ if (TARGET_MACHO && TARGET_64BIT_P (opts->x_ix86_isa_flags)) ++ opts->x_flag_pic = 2; ++ ++ /* Need to check -mtune=generic first. */ ++ if (opts->x_ix86_tune_string) ++ { ++ /* As special support for cross compilers we read -mtune=native ++ as -mtune=generic. With native compilers we won't see the ++ -mtune=native, as it was changed by the driver. */ ++ if (!strcmp (opts->x_ix86_tune_string, "native")) ++ { ++ opts->x_ix86_tune_string = "generic"; ++ } ++ else if (!strcmp (opts->x_ix86_tune_string, "x86-64")) ++ warning (OPT_Wdeprecated, ++ main_args_p ++ ? G_("%<-mtune=x86-64%> is deprecated; use %<-mtune=k8%> " ++ "or %<-mtune=generic%> instead as appropriate") ++ : G_("% is deprecated; use " ++ "% or %" ++ " instead as appropriate")); ++ } ++ else ++ { ++ if (opts->x_ix86_arch_string) ++ opts->x_ix86_tune_string = opts->x_ix86_arch_string; ++ if (!opts->x_ix86_tune_string) ++ { ++ opts->x_ix86_tune_string = processor_names[TARGET_CPU_DEFAULT]; ++ ix86_tune_defaulted = 1; ++ } ++ ++ /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string ++ or defaulted. We need to use a sensible tune option. */ ++ if (!strcmp (opts->x_ix86_tune_string, "x86-64")) ++ { ++ opts->x_ix86_tune_string = "generic"; ++ } ++ } ++ ++ if (opts->x_ix86_stringop_alg == rep_prefix_8_byte ++ && !TARGET_64BIT_P (opts->x_ix86_isa_flags)) ++ { ++ /* rep; movq isn't available in 32-bit code. */ ++ error ("%<-mstringop-strategy=rep_8byte%> not supported for 32-bit code"); ++ opts->x_ix86_stringop_alg = no_stringop; ++ } ++ ++ if (!opts->x_ix86_arch_string) ++ opts->x_ix86_arch_string ++ = TARGET_64BIT_P (opts->x_ix86_isa_flags) ++ ? "x86-64" : SUBTARGET32_DEFAULT_CPU; ++ else ++ ix86_arch_specified = 1; ++ ++ if (opts_set->x_ix86_pmode) ++ { ++ if ((TARGET_LP64_P (opts->x_ix86_isa_flags) ++ && opts->x_ix86_pmode == PMODE_SI) ++ || (!TARGET_64BIT_P (opts->x_ix86_isa_flags) ++ && opts->x_ix86_pmode == PMODE_DI)) ++ error ("address mode %qs not supported in the %s bit mode", ++ TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "short" : "long", ++ TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "64" : "32"); ++ } ++ else ++ opts->x_ix86_pmode = TARGET_LP64_P (opts->x_ix86_isa_flags) ++ ? PMODE_DI : PMODE_SI; ++ ++ if (!opts_set->x_ix86_abi) ++ opts->x_ix86_abi = DEFAULT_ABI; ++ ++ if (opts->x_ix86_abi == MS_ABI && TARGET_X32_P (opts->x_ix86_isa_flags)) ++ error ("%<-mabi=ms%> not supported with X32 ABI"); ++ gcc_assert (opts->x_ix86_abi == SYSV_ABI || opts->x_ix86_abi == MS_ABI); ++ ++ const char *abi_name = opts->x_ix86_abi == MS_ABI ? 
"ms" : "sysv"; ++ if ((opts->x_flag_sanitize & SANITIZE_USER_ADDRESS) ++ && opts->x_ix86_abi != DEFAULT_ABI) ++ error ("%<-mabi=%s%> not supported with %<-fsanitize=address%>", abi_name); ++ if ((opts->x_flag_sanitize & SANITIZE_KERNEL_ADDRESS) ++ && opts->x_ix86_abi != DEFAULT_ABI) ++ error ("%<-mabi=%s%> not supported with %<-fsanitize=kernel-address%>", ++ abi_name); ++ if ((opts->x_flag_sanitize & SANITIZE_THREAD) ++ && opts->x_ix86_abi != DEFAULT_ABI) ++ error ("%<-mabi=%s%> not supported with %<-fsanitize=thread%>", abi_name); ++ ++ /* For targets using ms ABI enable ms-extensions, if not ++ explicit turned off. For non-ms ABI we turn off this ++ option. */ ++ if (!opts_set->x_flag_ms_extensions) ++ opts->x_flag_ms_extensions = (MS_ABI == DEFAULT_ABI); ++ ++ if (opts_set->x_ix86_cmodel) ++ { ++ switch (opts->x_ix86_cmodel) ++ { ++ case CM_SMALL: ++ case CM_SMALL_PIC: ++ if (opts->x_flag_pic) ++ opts->x_ix86_cmodel = CM_SMALL_PIC; ++ if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)) ++ error ("code model %qs not supported in the %s bit mode", ++ "small", "32"); ++ break; ++ ++ case CM_MEDIUM: ++ case CM_MEDIUM_PIC: ++ if (opts->x_flag_pic) ++ opts->x_ix86_cmodel = CM_MEDIUM_PIC; ++ if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)) ++ error ("code model %qs not supported in the %s bit mode", ++ "medium", "32"); ++ else if (TARGET_X32_P (opts->x_ix86_isa_flags)) ++ error ("code model %qs not supported in x32 mode", ++ "medium"); ++ break; ++ ++ case CM_LARGE: ++ case CM_LARGE_PIC: ++ if (opts->x_flag_pic) ++ opts->x_ix86_cmodel = CM_LARGE_PIC; ++ if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)) ++ error ("code model %qs not supported in the %s bit mode", ++ "large", "32"); ++ else if (TARGET_X32_P (opts->x_ix86_isa_flags)) ++ error ("code model %qs not supported in x32 mode", ++ "large"); ++ break; ++ ++ case CM_32: ++ if (opts->x_flag_pic) ++ error ("code model %s does not support PIC mode", "32"); ++ if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) ++ error ("code model %qs not supported in the %s bit mode", ++ "32", "64"); ++ break; ++ ++ case CM_KERNEL: ++ if (opts->x_flag_pic) ++ { ++ error ("code model %s does not support PIC mode", "kernel"); ++ opts->x_ix86_cmodel = CM_32; ++ } ++ if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)) ++ error ("code model %qs not supported in the %s bit mode", ++ "kernel", "32"); ++ break; ++ ++ default: ++ gcc_unreachable (); ++ } ++ } ++ else ++ { ++ /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the ++ use of rip-relative addressing. This eliminates fixups that ++ would otherwise be needed if this object is to be placed in a ++ DLL, and is essentially just as efficient as direct addressing. */ ++ if (TARGET_64BIT_P (opts->x_ix86_isa_flags) ++ && (TARGET_RDOS || TARGET_PECOFF)) ++ opts->x_ix86_cmodel = CM_MEDIUM_PIC, opts->x_flag_pic = 1; ++ else if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) ++ opts->x_ix86_cmodel = opts->x_flag_pic ? CM_SMALL_PIC : CM_SMALL; ++ else ++ opts->x_ix86_cmodel = CM_32; ++ } ++ if (TARGET_MACHO && opts->x_ix86_asm_dialect == ASM_INTEL) ++ { ++ error ("%<-masm=intel%> not supported in this configuration"); ++ opts->x_ix86_asm_dialect = ASM_ATT; ++ } ++ if ((TARGET_64BIT_P (opts->x_ix86_isa_flags) != 0) ++ != ((opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0)) ++ sorry ("%i-bit mode not compiled in", ++ (opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32); ++ ++ for (i = 0; i < pta_size; i++) ++ if (! 
strcmp (opts->x_ix86_arch_string, processor_alias_table[i].name)) ++ { ++ if (!strcmp (opts->x_ix86_arch_string, "generic")) ++ { ++ error (main_args_p ++ ? G_("% CPU can be used only for %<-mtune=%> " ++ "switch") ++ : G_("% CPU can be used only for " ++ "% attribute")); ++ return false; ++ } ++ else if (!strcmp (opts->x_ix86_arch_string, "intel")) ++ { ++ error (main_args_p ++ ? G_("% CPU can be used only for %<-mtune=%> " ++ "switch") ++ : G_("% CPU can be used only for " ++ "% attribute")); ++ return false; ++ } ++ ++ if (TARGET_64BIT_P (opts->x_ix86_isa_flags) ++ && !((processor_alias_table[i].flags & PTA_64BIT) != 0)) ++ { ++ error ("CPU you selected does not support x86-64 " ++ "instruction set"); ++ return false; ++ } ++ ++ ix86_schedule = processor_alias_table[i].schedule; ++ ix86_arch = processor_alias_table[i].processor; ++ /* Default cpu tuning to the architecture. */ ++ ix86_tune = ix86_arch; ++ ++ if (((processor_alias_table[i].flags & PTA_MMX) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX; ++ if (((processor_alias_table[i].flags & PTA_3DNOW) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW; ++ if (((processor_alias_table[i].flags & PTA_3DNOW_A) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A; ++ if (((processor_alias_table[i].flags & PTA_SSE) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE; ++ if (((processor_alias_table[i].flags & PTA_SSE2) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2; ++ if (((processor_alias_table[i].flags & PTA_SSE3) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3; ++ if (((processor_alias_table[i].flags & PTA_SSSE3) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3; ++ if (((processor_alias_table[i].flags & PTA_SSE4_1) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1; ++ if (((processor_alias_table[i].flags & PTA_SSE4_2) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2; ++ if (((processor_alias_table[i].flags & PTA_AVX) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX; ++ if (((processor_alias_table[i].flags & PTA_AVX2) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2; ++ if (((processor_alias_table[i].flags & PTA_FMA) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA; ++ if (((processor_alias_table[i].flags & PTA_SSE4A) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A; ++ if (((processor_alias_table[i].flags & PTA_FMA4) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4; ++ if (((processor_alias_table[i].flags & PTA_XOP) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP; ++ if 
(((processor_alias_table[i].flags & PTA_LWP) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP; ++ if (((processor_alias_table[i].flags & PTA_ABM) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM; ++ if (((processor_alias_table[i].flags & PTA_BMI) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI; ++ if (((processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LZCNT; ++ if (((processor_alias_table[i].flags & PTA_TBM) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM; ++ if (((processor_alias_table[i].flags & PTA_BMI2) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2; ++ if (((processor_alias_table[i].flags & PTA_CX16) != 0) ++ && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_CX16)) ++ opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_CX16; ++ if (((processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT; ++ if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags) ++ && ((processor_alias_table[i].flags & PTA_NO_SAHF) != 0)) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF; ++ if (((processor_alias_table[i].flags & PTA_MOVBE) != 0) ++ && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_MOVBE)) ++ opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_MOVBE; ++ if (((processor_alias_table[i].flags & PTA_AES) != 0) ++ && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES)) ++ ix86_isa_flags |= OPTION_MASK_ISA_AES; ++ if (((processor_alias_table[i].flags & PTA_SHA) != 0) ++ && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SHA)) ++ ix86_isa_flags |= OPTION_MASK_ISA_SHA; ++ if (((processor_alias_table[i].flags & PTA_PCLMUL) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL; ++ if (((processor_alias_table[i].flags & PTA_FSGSBASE) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE; ++ if (((processor_alias_table[i].flags & PTA_RDRND) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND; ++ if (((processor_alias_table[i].flags & PTA_F16C) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C; ++ if (((processor_alias_table[i].flags & PTA_RTM) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RTM; ++ if (((processor_alias_table[i].flags & PTA_HLE) != 0) ++ && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_HLE)) ++ opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_HLE; ++ if (((processor_alias_table[i].flags & PTA_PRFCHW) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW; ++ if (((processor_alias_table[i].flags & PTA_RDSEED) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDSEED; ++ if 
(((processor_alias_table[i].flags & PTA_ADX) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ADX; ++ if (((processor_alias_table[i].flags & PTA_FXSR) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FXSR; ++ if (((processor_alias_table[i].flags & PTA_XSAVE) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVE; ++ if (((processor_alias_table[i].flags & PTA_XSAVEOPT) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT; ++ if (((processor_alias_table[i].flags & PTA_AVX512F) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F; ++ if (((processor_alias_table[i].flags & PTA_AVX512ER) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512ER)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER; ++ if (((processor_alias_table[i].flags & PTA_AVX512PF) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF; ++ if (((processor_alias_table[i].flags & PTA_AVX512CD) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD; ++ if (((processor_alias_table[i].flags & PTA_PREFETCHWT1) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PREFETCHWT1)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PREFETCHWT1; ++ if (((processor_alias_table[i].flags & PTA_CLWB) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLWB)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLWB; ++ if (((processor_alias_table[i].flags & PTA_CLFLUSHOPT) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLFLUSHOPT)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLFLUSHOPT; ++ if (((processor_alias_table[i].flags & PTA_CLZERO) != 0) ++ && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_CLZERO)) ++ opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_CLZERO; ++ if (((processor_alias_table[i].flags & PTA_XSAVEC) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEC)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEC; ++ if (((processor_alias_table[i].flags & PTA_XSAVES) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVES)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVES; ++ if (((processor_alias_table[i].flags & PTA_AVX512DQ) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512DQ)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512DQ; ++ if (((processor_alias_table[i].flags & PTA_AVX512BW) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512BW)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BW; ++ if (((processor_alias_table[i].flags & PTA_AVX512VL) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VL)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VL; ++ if (((processor_alias_table[i].flags & PTA_AVX512VBMI) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VBMI)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI; ++ if (((processor_alias_table[i].flags & PTA_AVX512IFMA) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512IFMA)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512IFMA; ++ if (((processor_alias_table[i].flags & PTA_AVX512VNNI) != 0) ++ 
&& !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VNNI)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VNNI; ++ if (((processor_alias_table[i].flags & PTA_GFNI) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_GFNI)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_GFNI; ++ if (((processor_alias_table[i].flags & PTA_AVX512VBMI2) != 0) ++ && !(opts->x_ix86_isa_flags_explicit ++ & OPTION_MASK_ISA_AVX512VBMI2)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI2; ++ if (((processor_alias_table[i].flags & PTA_VPCLMULQDQ) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_VPCLMULQDQ)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_VPCLMULQDQ; ++ if (((processor_alias_table[i].flags & PTA_AVX512BITALG) != 0) ++ && !(opts->x_ix86_isa_flags_explicit ++ & OPTION_MASK_ISA_AVX512BITALG)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BITALG; ++ ++ if (((processor_alias_table[i].flags & PTA_AVX5124VNNIW) != 0) ++ && !(opts->x_ix86_isa_flags2_explicit ++ & OPTION_MASK_ISA_AVX5124VNNIW)) ++ opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124VNNIW; ++ if (((processor_alias_table[i].flags & PTA_AVX5124FMAPS) != 0) ++ && !(opts->x_ix86_isa_flags2_explicit ++ & OPTION_MASK_ISA_AVX5124FMAPS)) ++ opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124FMAPS; ++ if (((processor_alias_table[i].flags & PTA_AVX512VPOPCNTDQ) != 0) ++ && !(opts->x_ix86_isa_flags_explicit ++ & OPTION_MASK_ISA_AVX512VPOPCNTDQ)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VPOPCNTDQ; ++ if (((processor_alias_table[i].flags & PTA_SGX) != 0) ++ && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_SGX)) ++ opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_SGX; ++ if (((processor_alias_table[i].flags & PTA_VAES) != 0) ++ && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_VAES)) ++ opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_VAES; ++ if (((processor_alias_table[i].flags & PTA_RDPID) != 0) ++ && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_RDPID)) ++ opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_RDPID; ++ if (((processor_alias_table[i].flags & PTA_PCONFIG) != 0) ++ && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_PCONFIG)) ++ opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_PCONFIG; ++ if (((processor_alias_table[i].flags & PTA_WBNOINVD) != 0) ++ && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_WBNOINVD)) ++ opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_WBNOINVD; ++ if (((processor_alias_table[i].flags & PTA_PTWRITE) != 0) ++ && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_PTWRITE)) ++ opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_PTWRITE; ++ ++ if ((processor_alias_table[i].flags ++ & (PTA_PREFETCH_SSE | PTA_SSE)) != 0) ++ x86_prefetch_sse = true; ++ if (((processor_alias_table[i].flags & PTA_MWAITX) != 0) ++ && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_MWAITX)) ++ opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_MWAITX; ++ if (((processor_alias_table[i].flags & PTA_PKU) != 0) ++ && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PKU)) ++ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PKU; ++ ++ /* Don't enable x87 instructions if only ++ general registers are allowed. */ ++ if (!(opts_set->x_ix86_target_flags & OPTION_MASK_GENERAL_REGS_ONLY) ++ && !(opts_set->x_target_flags & MASK_80387)) ++ { ++ if (((processor_alias_table[i].flags & PTA_NO_80387) != 0)) ++ opts->x_target_flags &= ~MASK_80387; ++ else ++ opts->x_target_flags |= MASK_80387; ++ } ++ break; ++ } ++ ++ if (i == pta_size) ++ { ++ error (main_args_p ++ ? 
G_("bad value (%qs) for %<-march=%> switch") ++ : G_("bad value (%qs) for % attribute"), ++ opts->x_ix86_arch_string); ++ ++ auto_vec candidates; ++ for (i = 0; i < pta_size; i++) ++ if (strcmp (processor_alias_table[i].name, "generic") ++ && strcmp (processor_alias_table[i].name, "intel") ++ && (!TARGET_64BIT_P (opts->x_ix86_isa_flags) ++ || ((processor_alias_table[i].flags & PTA_64BIT) != 0))) ++ candidates.safe_push (processor_alias_table[i].name); ++ ++#ifdef HAVE_LOCAL_CPU_DETECT ++ /* Add also "native" as possible value. */ ++ candidates.safe_push ("native"); ++#endif ++ ++ char *s; ++ const char *hint ++ = candidates_list_and_hint (opts->x_ix86_arch_string, s, candidates); ++ if (hint) ++ inform (input_location, ++ main_args_p ++ ? G_("valid arguments to %<-march=%> switch are: " ++ "%s; did you mean %qs?") ++ : G_("valid arguments to % attribute are: " ++ "%s; did you mean %qs?"), s, hint); ++ else ++ inform (input_location, ++ main_args_p ++ ? G_("valid arguments to %<-march=%> switch are: %s") ++ : G_("valid arguments to % attribute " ++ "are: %s"), s); ++ XDELETEVEC (s); ++ } ++ ++ ix86_arch_mask = HOST_WIDE_INT_1U << ix86_arch; ++ for (i = 0; i < X86_ARCH_LAST; ++i) ++ ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask); ++ ++ for (i = 0; i < pta_size; i++) ++ if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name)) ++ { ++ ix86_schedule = processor_alias_table[i].schedule; ++ ix86_tune = processor_alias_table[i].processor; ++ if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) ++ { ++ if (!((processor_alias_table[i].flags & PTA_64BIT) != 0)) ++ { ++ if (ix86_tune_defaulted) ++ { ++ opts->x_ix86_tune_string = "x86-64"; ++ for (i = 0; i < pta_size; i++) ++ if (! strcmp (opts->x_ix86_tune_string, ++ processor_alias_table[i].name)) ++ break; ++ ix86_schedule = processor_alias_table[i].schedule; ++ ix86_tune = processor_alias_table[i].processor; ++ } ++ else ++ error ("CPU you selected does not support x86-64 " ++ "instruction set"); ++ } ++ } ++ /* Intel CPUs have always interpreted SSE prefetch instructions as ++ NOPs; so, we can enable SSE prefetch instructions even when ++ -mtune (rather than -march) points us to a processor that has them. ++ However, the VIA C3 gives a SIGILL, so we only do that for i686 and ++ higher processors. */ ++ if (TARGET_CMOV ++ && ((processor_alias_table[i].flags ++ & (PTA_PREFETCH_SSE | PTA_SSE)) != 0)) ++ x86_prefetch_sse = true; ++ break; ++ } ++ ++ if (ix86_tune_specified && i == pta_size) ++ { ++ error (main_args_p ++ ? G_("bad value (%qs) for %<-mtune=%> switch") ++ : G_("bad value (%qs) for % attribute"), ++ opts->x_ix86_tune_string); ++ ++ auto_vec candidates; ++ for (i = 0; i < pta_size; i++) ++ if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) ++ || ((processor_alias_table[i].flags & PTA_64BIT) != 0)) ++ candidates.safe_push (processor_alias_table[i].name); ++ ++#ifdef HAVE_LOCAL_CPU_DETECT ++ /* Add also "native" as possible value. */ ++ candidates.safe_push ("native"); ++#endif ++ ++ char *s; ++ const char *hint ++ = candidates_list_and_hint (opts->x_ix86_tune_string, s, candidates); ++ if (hint) ++ inform (input_location, ++ main_args_p ++ ? G_("valid arguments to %<-mtune=%> switch are: " ++ "%s; did you mean %qs?") ++ : G_("valid arguments to % attribute are: " ++ "%s; did you mean %qs?"), s, hint); ++ else ++ inform (input_location, ++ main_args_p ++ ? 
G_("valid arguments to %<-mtune=%> switch are: %s") ++ : G_("valid arguments to % attribute " ++ "are: %s"), s); ++ XDELETEVEC (s); ++ } ++ ++ set_ix86_tune_features (ix86_tune, opts->x_ix86_dump_tunes); ++ ++#ifndef USE_IX86_FRAME_POINTER ++#define USE_IX86_FRAME_POINTER 0 ++#endif ++ ++#ifndef USE_X86_64_FRAME_POINTER ++#define USE_X86_64_FRAME_POINTER 0 ++#endif ++ ++ /* Set the default values for switches whose default depends on TARGET_64BIT ++ in case they weren't overwritten by command line options. */ ++ if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) ++ { ++ if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer) ++ opts->x_flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER; ++ if (opts->x_flag_asynchronous_unwind_tables ++ && !opts_set->x_flag_unwind_tables ++ && TARGET_64BIT_MS_ABI) ++ opts->x_flag_unwind_tables = 1; ++ if (opts->x_flag_asynchronous_unwind_tables == 2) ++ opts->x_flag_unwind_tables ++ = opts->x_flag_asynchronous_unwind_tables = 1; ++ if (opts->x_flag_pcc_struct_return == 2) ++ opts->x_flag_pcc_struct_return = 0; ++ } ++ else ++ { ++ if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer) ++ opts->x_flag_omit_frame_pointer ++ = !(USE_IX86_FRAME_POINTER || opts->x_optimize_size); ++ if (opts->x_flag_asynchronous_unwind_tables == 2) ++ opts->x_flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER; ++ if (opts->x_flag_pcc_struct_return == 2) ++ { ++ /* Intel MCU psABI specifies that -freg-struct-return should ++ be on. Instead of setting DEFAULT_PCC_STRUCT_RETURN to 1, ++ we check -miamcu so that -freg-struct-return is always ++ turned on if -miamcu is used. */ ++ if (TARGET_IAMCU_P (opts->x_target_flags)) ++ opts->x_flag_pcc_struct_return = 0; ++ else ++ opts->x_flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN; ++ } ++ } ++ ++ ix86_tune_cost = processor_cost_table[ix86_tune]; ++ /* TODO: ix86_cost should be chosen at instruction or function granuality ++ so for cold code we use size_cost even in !optimize_size compilation. */ ++ if (opts->x_optimize_size) ++ ix86_cost = &ix86_size_cost; ++ else ++ ix86_cost = ix86_tune_cost; ++ ++ /* Arrange to set up i386_stack_locals for all functions. */ ++ init_machine_status = ix86_init_machine_status; ++ ++ /* Validate -mregparm= value. */ ++ if (opts_set->x_ix86_regparm) ++ { ++ if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) ++ warning (0, "%<-mregparm%> is ignored in 64-bit mode"); ++ else if (TARGET_IAMCU_P (opts->x_target_flags)) ++ warning (0, "%<-mregparm%> is ignored for Intel MCU psABI"); ++ if (opts->x_ix86_regparm > REGPARM_MAX) ++ { ++ error ("%<-mregparm=%d%> is not between 0 and %d", ++ opts->x_ix86_regparm, REGPARM_MAX); ++ opts->x_ix86_regparm = 0; ++ } ++ } ++ if (TARGET_IAMCU_P (opts->x_target_flags) ++ || TARGET_64BIT_P (opts->x_ix86_isa_flags)) ++ opts->x_ix86_regparm = REGPARM_MAX; ++ ++ /* Default align_* from the processor table. */ ++ ix86_default_align (opts); ++ ++ /* Provide default for -mbranch-cost= value. */ ++ if (!opts_set->x_ix86_branch_cost) ++ opts->x_ix86_branch_cost = ix86_tune_cost->branch_cost; ++ ++ if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) ++ { ++ opts->x_target_flags ++ |= TARGET_SUBTARGET64_DEFAULT & ~opts_set->x_target_flags; ++ ++ if (!ix86_arch_specified) ++ opts->x_ix86_isa_flags ++ |= TARGET_SUBTARGET64_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit; ++ ++ if (TARGET_RTD_P (opts->x_target_flags)) ++ warning (0, ++ main_args_p ++ ? 
G_("%<-mrtd%> is ignored in 64bit mode") ++ : G_("% is ignored in 64bit mode")); ++ } ++ else ++ { ++ opts->x_target_flags ++ |= TARGET_SUBTARGET32_DEFAULT & ~opts_set->x_target_flags; ++ ++ if (!ix86_arch_specified) ++ opts->x_ix86_isa_flags ++ |= TARGET_SUBTARGET32_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit; ++ ++ /* i386 ABI does not specify red zone. It still makes sense to use it ++ when programmer takes care to stack from being destroyed. */ ++ if (!(opts_set->x_target_flags & MASK_NO_RED_ZONE)) ++ opts->x_target_flags |= MASK_NO_RED_ZONE; ++ } ++ ++ /* Keep nonleaf frame pointers. */ ++ if (opts->x_flag_omit_frame_pointer) ++ opts->x_target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER; ++ else if (TARGET_OMIT_LEAF_FRAME_POINTER_P (opts->x_target_flags)) ++ opts->x_flag_omit_frame_pointer = 1; ++ ++ /* If we're doing fast math, we don't care about comparison order ++ wrt NaNs. This lets us use a shorter comparison sequence. */ ++ if (opts->x_flag_finite_math_only) ++ opts->x_target_flags &= ~MASK_IEEE_FP; ++ ++ /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387, ++ since the insns won't need emulation. */ ++ if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387]) ++ opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387; ++ ++ /* Likewise, if the target doesn't have a 387, or we've specified ++ software floating point, don't use 387 inline intrinsics. */ ++ if (!TARGET_80387_P (opts->x_target_flags)) ++ opts->x_target_flags |= MASK_NO_FANCY_MATH_387; ++ ++ /* Turn on MMX builtins for -msse. */ ++ if (TARGET_SSE_P (opts->x_ix86_isa_flags)) ++ opts->x_ix86_isa_flags ++ |= OPTION_MASK_ISA_MMX & ~opts->x_ix86_isa_flags_explicit; ++ ++ /* Enable SSE prefetch. */ ++ if (TARGET_SSE_P (opts->x_ix86_isa_flags) ++ || (TARGET_PRFCHW_P (opts->x_ix86_isa_flags) ++ && !TARGET_3DNOW_P (opts->x_ix86_isa_flags)) ++ || TARGET_PREFETCHWT1_P (opts->x_ix86_isa_flags)) ++ x86_prefetch_sse = true; ++ ++ /* Enable popcnt instruction for -msse4.2 or -mabm. */ ++ if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags) ++ || TARGET_ABM_P (opts->x_ix86_isa_flags)) ++ opts->x_ix86_isa_flags ++ |= OPTION_MASK_ISA_POPCNT & ~opts->x_ix86_isa_flags_explicit; ++ ++ /* Enable lzcnt instruction for -mabm. */ ++ if (TARGET_ABM_P(opts->x_ix86_isa_flags)) ++ opts->x_ix86_isa_flags ++ |= OPTION_MASK_ISA_LZCNT & ~opts->x_ix86_isa_flags_explicit; ++ ++ /* Disable BMI, BMI2 and TBM instructions for -m16. */ ++ if (TARGET_16BIT_P(opts->x_ix86_isa_flags)) ++ opts->x_ix86_isa_flags ++ &= ~((OPTION_MASK_ISA_BMI | OPTION_MASK_ISA_BMI2 | OPTION_MASK_ISA_TBM) ++ & ~opts->x_ix86_isa_flags_explicit); ++ ++ /* Validate -mpreferred-stack-boundary= value or default it to ++ PREFERRED_STACK_BOUNDARY_DEFAULT. */ ++ ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT; ++ if (opts_set->x_ix86_preferred_stack_boundary_arg) ++ { ++ int min = TARGET_64BIT_P (opts->x_ix86_isa_flags)? 3 : 2; ++ int max = TARGET_SEH ? 4 : 12; ++ ++ if (opts->x_ix86_preferred_stack_boundary_arg < min ++ || opts->x_ix86_preferred_stack_boundary_arg > max) ++ { ++ if (min == max) ++ error ("%<-mpreferred-stack-boundary%> is not supported " ++ "for this target"); ++ else ++ error ("%<-mpreferred-stack-boundary=%d%> is not between %d and %d", ++ opts->x_ix86_preferred_stack_boundary_arg, min, max); ++ } ++ else ++ ix86_preferred_stack_boundary ++ = (1 << opts->x_ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT; ++ } ++ ++ /* Set the default value for -mstackrealign. 
*/ ++ if (!opts_set->x_ix86_force_align_arg_pointer) ++ opts->x_ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT; ++ ++ ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY; ++ ++ /* Validate -mincoming-stack-boundary= value or default it to ++ MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */ ++ ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary; ++ if (opts_set->x_ix86_incoming_stack_boundary_arg) ++ { ++ int min = TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 3 : 2; ++ ++ if (opts->x_ix86_incoming_stack_boundary_arg < min ++ || opts->x_ix86_incoming_stack_boundary_arg > 12) ++ error ("%<-mincoming-stack-boundary=%d%> is not between %d and 12", ++ opts->x_ix86_incoming_stack_boundary_arg, min); ++ else ++ { ++ ix86_user_incoming_stack_boundary ++ = (1 << opts->x_ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT; ++ ix86_incoming_stack_boundary ++ = ix86_user_incoming_stack_boundary; ++ } ++ } ++ ++#ifndef NO_PROFILE_COUNTERS ++ if (flag_nop_mcount) ++ error ("%<-mnop-mcount%> is not compatible with this target"); ++#endif ++ if (flag_nop_mcount && flag_pic) ++ error ("%<-mnop-mcount%> is not implemented for %<-fPIC%>"); ++ ++ /* Accept -msseregparm only if at least SSE support is enabled. */ ++ if (TARGET_SSEREGPARM_P (opts->x_target_flags) ++ && ! TARGET_SSE_P (opts->x_ix86_isa_flags)) ++ error (main_args_p ++ ? G_("%<-msseregparm%> used without SSE enabled") ++ : G_("%<target(\"sseregparm\")%> used without SSE enabled")); ++ ++ if (opts_set->x_ix86_fpmath) ++ { ++ if (opts->x_ix86_fpmath & FPMATH_SSE) ++ { ++ if (!TARGET_SSE_P (opts->x_ix86_isa_flags)) ++ { ++ if (TARGET_80387_P (opts->x_target_flags)) ++ { ++ warning (0, "SSE instruction set disabled, using 387 arithmetics"); ++ opts->x_ix86_fpmath = FPMATH_387; ++ } ++ } ++ else if ((opts->x_ix86_fpmath & FPMATH_387) ++ && !TARGET_80387_P (opts->x_target_flags)) ++ { ++ warning (0, "387 instruction set disabled, using SSE arithmetics"); ++ opts->x_ix86_fpmath = FPMATH_SSE; ++ } ++ } ++ } ++ /* For all chips supporting SSE2, -mfpmath=sse performs better than ++ fpmath=387. The second is however default at many targets since the ++ extra 80bit precision of temporaries is considered to be part of ABI. ++ Overwrite the default at least for -ffast-math. ++ TODO: -mfpmath=both seems to produce same performing code with bit ++ smaller binaries. It is however not clear if register allocation is ++ ready for this setting. ++ Also -mfpmath=387 is overall a lot more compact (bout 4-5%) than SSE ++ codegen. We may switch to 387 with -ffast-math for size optimized ++ functions. */ ++ else if (fast_math_flags_set_p (&global_options) ++ && TARGET_SSE2_P (opts->x_ix86_isa_flags)) ++ opts->x_ix86_fpmath = FPMATH_SSE; ++ else ++ opts->x_ix86_fpmath = TARGET_FPMATH_DEFAULT_P (opts->x_ix86_isa_flags); ++ ++ /* Use external vectorized library in vectorizing intrinsics. 
*/ ++ if (opts_set->x_ix86_veclibabi_type) ++ switch (opts->x_ix86_veclibabi_type) ++ { ++ case ix86_veclibabi_type_svml: ++ ix86_veclib_handler = &ix86_veclibabi_svml; ++ break; ++ ++ case ix86_veclibabi_type_acml: ++ ix86_veclib_handler = &ix86_veclibabi_acml; ++ break; ++ ++ default: ++ gcc_unreachable (); ++ } ++ ++ if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS] ++ && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)) ++ opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS; ++ ++ /* If stack probes are required, the space used for large function ++ arguments on the stack must also be probed, so enable ++ -maccumulate-outgoing-args so this happens in the prologue. */ ++ if (TARGET_STACK_PROBE_P (opts->x_target_flags) ++ && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)) ++ { ++ if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS) ++ warning (0, ++ main_args_p ++ ? G_("stack probing requires %<-maccumulate-outgoing-args%> " ++ "for correctness") ++ : G_("stack probing requires " ++ "%<target(\"accumulate-outgoing-args\")%> for " ++ "correctness")); ++ opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS; ++ } ++ ++ /* Stack realignment without -maccumulate-outgoing-args requires %ebp, ++ so enable -maccumulate-outgoing-args when %ebp is fixed. */ ++ if (fixed_regs[BP_REG] ++ && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)) ++ { ++ if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS) ++ warning (0, ++ main_args_p ++ ? G_("fixed ebp register requires " ++ "%<-maccumulate-outgoing-args%>") ++ : G_("fixed ebp register requires " ++ "%<target(\"accumulate-outgoing-args\")%>")); ++ opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS; ++ } ++ ++ /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */ ++ { ++ char *p; ++ ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0); ++ p = strchr (internal_label_prefix, 'X'); ++ internal_label_prefix_len = p - internal_label_prefix; ++ *p = '\0'; ++ } ++ ++ /* When scheduling description is not available, disable scheduler pass ++ so it won't slow down the compilation and make x87 code slower. */ ++ if (!TARGET_SCHEDULE) ++ opts->x_flag_schedule_insns_after_reload = opts->x_flag_schedule_insns = 0; ++ ++ maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES, ++ ix86_tune_cost->simultaneous_prefetches, ++ opts->x_param_values, ++ opts_set->x_param_values); ++ maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE, ++ ix86_tune_cost->prefetch_block, ++ opts->x_param_values, ++ opts_set->x_param_values); ++ maybe_set_param_value (PARAM_L1_CACHE_SIZE, ++ ix86_tune_cost->l1_cache_size, ++ opts->x_param_values, ++ opts_set->x_param_values); ++ maybe_set_param_value (PARAM_L2_CACHE_SIZE, ++ ix86_tune_cost->l2_cache_size, ++ opts->x_param_values, ++ opts_set->x_param_values); ++ ++ /* Enable sw prefetching at -O3 for CPUS that prefetching is helpful. */ ++ if (opts->x_flag_prefetch_loop_arrays < 0 ++ && HAVE_prefetch ++ && (opts->x_optimize >= 3 || opts->x_flag_profile_use) ++ && !opts->x_optimize_size ++ && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL) ++ opts->x_flag_prefetch_loop_arrays = 1; ++ ++ /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0) ++ can be opts->x_optimized to ap = __builtin_next_arg (0). 
*/ ++ if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && !opts->x_flag_split_stack) ++ targetm.expand_builtin_va_start = NULL; ++ ++ if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) ++ { ++ ix86_gen_leave = gen_leave_rex64; ++ if (Pmode == DImode) ++ { ++ ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di; ++ ix86_gen_tls_local_dynamic_base_64 ++ = gen_tls_local_dynamic_base_64_di; ++ } ++ else ++ { ++ ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si; ++ ix86_gen_tls_local_dynamic_base_64 ++ = gen_tls_local_dynamic_base_64_si; ++ } ++ } ++ else ++ ix86_gen_leave = gen_leave; ++ ++ if (Pmode == DImode) ++ { ++ ix86_gen_add3 = gen_adddi3; ++ ix86_gen_sub3 = gen_subdi3; ++ ix86_gen_sub3_carry = gen_subdi3_carry; ++ ix86_gen_one_cmpl2 = gen_one_cmpldi2; ++ ix86_gen_andsp = gen_anddi3; ++ ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di; ++ ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi; ++ ix86_gen_probe_stack_range = gen_probe_stack_rangedi; ++ ix86_gen_monitor = gen_sse3_monitor_di; ++ ix86_gen_monitorx = gen_monitorx_di; ++ ix86_gen_clzero = gen_clzero_di; ++ } ++ else ++ { ++ ix86_gen_add3 = gen_addsi3; ++ ix86_gen_sub3 = gen_subsi3; ++ ix86_gen_sub3_carry = gen_subsi3_carry; ++ ix86_gen_one_cmpl2 = gen_one_cmplsi2; ++ ix86_gen_andsp = gen_andsi3; ++ ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si; ++ ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi; ++ ix86_gen_probe_stack_range = gen_probe_stack_rangesi; ++ ix86_gen_monitor = gen_sse3_monitor_si; ++ ix86_gen_monitorx = gen_monitorx_si; ++ ix86_gen_clzero = gen_clzero_si; ++ } ++ ++#ifdef USE_IX86_CLD ++ /* Use -mcld by default for 32-bit code if configured with --enable-cld. */ ++ if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)) ++ opts->x_target_flags |= MASK_CLD & ~opts_set->x_target_flags; ++#endif ++ ++ /* Set the default value for -mfentry. */ ++ if (!opts_set->x_flag_fentry) ++ opts->x_flag_fentry = TARGET_SEH; ++ else ++ { ++ if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic ++ && opts->x_flag_fentry) ++ sorry ("%<-mfentry%> isn%'t supported for 32-bit in combination " ++ "with %<-fpic%>"); ++ else if (TARGET_SEH && !opts->x_flag_fentry) ++ sorry ("%<-mno-fentry%> isn%'t compatible with SEH"); ++ } ++ ++ if (TARGET_SEH && TARGET_CALL_MS2SYSV_XLOGUES) ++ sorry ("%<-mcall-ms2sysv-xlogues%> isn%'t currently supported with SEH"); ++ ++ if (!(opts_set->x_target_flags & MASK_VZEROUPPER) ++ && TARGET_EMIT_VZEROUPPER) ++ opts->x_target_flags |= MASK_VZEROUPPER; ++ if (!(opts_set->x_target_flags & MASK_STV)) ++ opts->x_target_flags |= MASK_STV; ++ /* Disable STV if -mpreferred-stack-boundary={2,3} or ++ -mincoming-stack-boundary={2,3} or -mstackrealign - the needed ++ stack realignment will be extra cost the pass doesn't take into ++ account and the pass can't realign the stack. */ ++ if (ix86_preferred_stack_boundary < 128 ++ || ix86_incoming_stack_boundary < 128 ++ || opts->x_ix86_force_align_arg_pointer) ++ opts->x_target_flags &= ~MASK_STV; ++ if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL] ++ && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD)) ++ opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD; ++ if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL] ++ && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE)) ++ opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE; ++ ++ /* Enable 128-bit AVX instruction generation ++ for the auto-vectorizer. 
*/ ++ if (TARGET_AVX128_OPTIMAL ++ && (opts_set->x_prefer_vector_width_type == PVW_NONE)) ++ opts->x_prefer_vector_width_type = PVW_AVX128; ++ ++ /* Use 256-bit AVX instruction generation ++ in the auto-vectorizer. */ ++ if (ix86_tune_features[X86_TUNE_AVX256_OPTIMAL] ++ && (opts_set->x_prefer_vector_width_type == PVW_NONE)) ++ opts->x_prefer_vector_width_type = PVW_AVX256; ++ ++ if (opts->x_ix86_recip_name) ++ { ++ char *p = ASTRDUP (opts->x_ix86_recip_name); ++ char *q; ++ unsigned int mask, i; ++ bool invert; ++ ++ while ((q = strtok (p, ",")) != NULL) ++ { ++ p = NULL; ++ if (*q == '!') ++ { ++ invert = true; ++ q++; ++ } ++ else ++ invert = false; ++ ++ if (!strcmp (q, "default")) ++ mask = RECIP_MASK_ALL; ++ else ++ { ++ for (i = 0; i < ARRAY_SIZE (recip_options); i++) ++ if (!strcmp (q, recip_options[i].string)) ++ { ++ mask = recip_options[i].mask; ++ break; ++ } ++ ++ if (i == ARRAY_SIZE (recip_options)) ++ { ++ error ("unknown option for %<-mrecip=%s%>", q); ++ invert = false; ++ mask = RECIP_MASK_NONE; ++ } ++ } ++ ++ opts->x_recip_mask_explicit |= mask; ++ if (invert) ++ opts->x_recip_mask &= ~mask; ++ else ++ opts->x_recip_mask |= mask; ++ } ++ } ++ ++ if (TARGET_RECIP_P (opts->x_target_flags)) ++ opts->x_recip_mask |= RECIP_MASK_ALL & ~opts->x_recip_mask_explicit; ++ else if (opts_set->x_target_flags & MASK_RECIP) ++ opts->x_recip_mask &= ~(RECIP_MASK_ALL & ~opts->x_recip_mask_explicit); ++ ++ /* Default long double to 64-bit for 32-bit Bionic and to __float128 ++ for 64-bit Bionic. Also default long double to 64-bit for Intel ++ MCU psABI. */ ++ if ((TARGET_HAS_BIONIC || TARGET_IAMCU) ++ && !(opts_set->x_target_flags ++ & (MASK_LONG_DOUBLE_64 | MASK_LONG_DOUBLE_128))) ++ opts->x_target_flags |= (TARGET_64BIT ++ ? MASK_LONG_DOUBLE_128 ++ : MASK_LONG_DOUBLE_64); ++ ++ /* Only one of them can be active. */ ++ gcc_assert ((opts->x_target_flags & MASK_LONG_DOUBLE_64) == 0 ++ || (opts->x_target_flags & MASK_LONG_DOUBLE_128) == 0); ++ ++ /* Handle stack protector */ ++ if (!opts_set->x_ix86_stack_protector_guard) ++ { ++#ifdef TARGET_THREAD_SSP_OFFSET ++ if (!TARGET_HAS_BIONIC) ++ opts->x_ix86_stack_protector_guard = SSP_TLS; ++ else ++#endif ++ opts->x_ix86_stack_protector_guard = SSP_GLOBAL; ++ } ++ ++ if (opts_set->x_ix86_stack_protector_guard_offset_str) ++ { ++ char *endp; ++ const char *str = opts->x_ix86_stack_protector_guard_offset_str; ++ ++ errno = 0; ++ int64_t offset; ++ ++#if defined(INT64_T_IS_LONG) ++ offset = strtol (str, &endp, 0); ++#else ++ offset = strtoll (str, &endp, 0); ++#endif ++ ++ if (!*str || *endp || errno) ++ error ("%qs is not a valid number " ++ "in %<-mstack-protector-guard-offset=%>", str); ++ ++ if (!IN_RANGE (offset, HOST_WIDE_INT_C (-0x80000000), ++ HOST_WIDE_INT_C (0x7fffffff))) ++ error ("%qs is not a valid offset " ++ "in %<-mstack-protector-guard-offset=%>", str); ++ ++ opts->x_ix86_stack_protector_guard_offset = offset; ++ } ++#ifdef TARGET_THREAD_SSP_OFFSET ++ else ++ opts->x_ix86_stack_protector_guard_offset = TARGET_THREAD_SSP_OFFSET; ++#endif ++ ++ if (opts_set->x_ix86_stack_protector_guard_reg_str) ++ { ++ const char *str = opts->x_ix86_stack_protector_guard_reg_str; ++ addr_space_t seg = ADDR_SPACE_GENERIC; ++ ++ /* Discard optional register prefix. 
*/ ++ if (str[0] == '%') ++ str++; ++ ++ if (strlen (str) == 2 && str[1] == 's') ++ { ++ if (str[0] == 'f') ++ seg = ADDR_SPACE_SEG_FS; ++ else if (str[0] == 'g') ++ seg = ADDR_SPACE_SEG_GS; ++ } ++ ++ if (seg == ADDR_SPACE_GENERIC) ++ error ("%qs is not a valid base register " ++ "in %<-mstack-protector-guard-reg=%>", ++ opts->x_ix86_stack_protector_guard_reg_str); ++ ++ opts->x_ix86_stack_protector_guard_reg = seg; ++ } ++ else ++ { ++ opts->x_ix86_stack_protector_guard_reg = DEFAULT_TLS_SEG_REG; ++ ++ /* The kernel uses a different segment register for performance ++ reasons; a system call would not have to trash the userspace ++ segment register, which would be expensive. */ ++ if (opts->x_ix86_cmodel == CM_KERNEL) ++ opts->x_ix86_stack_protector_guard_reg = ADDR_SPACE_SEG_GS; ++ } ++ ++ /* Handle -mmemcpy-strategy= and -mmemset-strategy= */ ++ if (opts->x_ix86_tune_memcpy_strategy) ++ { ++ char *str = xstrdup (opts->x_ix86_tune_memcpy_strategy); ++ ix86_parse_stringop_strategy_string (str, false); ++ free (str); ++ } ++ ++ if (opts->x_ix86_tune_memset_strategy) ++ { ++ char *str = xstrdup (opts->x_ix86_tune_memset_strategy); ++ ix86_parse_stringop_strategy_string (str, true); ++ free (str); ++ } ++ ++ /* Save the initial options in case the user does function specific ++ options. */ ++ if (main_args_p) ++ target_option_default_node = target_option_current_node ++ = build_target_option_node (opts); ++ ++ if (opts->x_flag_cf_protection != CF_NONE) ++ opts->x_flag_cf_protection ++ = (cf_protection_level) (opts->x_flag_cf_protection | CF_SET); ++ ++ if (ix86_tune_features [X86_TUNE_AVOID_256FMA_CHAINS]) ++ maybe_set_param_value (PARAM_AVOID_FMA_MAX_BITS, 256, ++ opts->x_param_values, ++ opts_set->x_param_values); ++ else if (ix86_tune_features [X86_TUNE_AVOID_128FMA_CHAINS]) ++ maybe_set_param_value (PARAM_AVOID_FMA_MAX_BITS, 128, ++ opts->x_param_values, ++ opts_set->x_param_values); ++ ++ /* PR86952: jump table usage with retpolines is slow. ++ The PR provides some numbers about the slowness. */ ++ if (ix86_indirect_branch != indirect_branch_keep ++ && !opts_set->x_flag_jump_tables) ++ opts->x_flag_jump_tables = 0; ++ ++ return true; ++} ++ ++/* Implement the TARGET_OPTION_OVERRIDE hook. */ ++ ++void ++ix86_option_override (void) ++{ ++ ix86_option_override_internal (true, &global_options, &global_options_set); ++} ++ ++/* Remember the last target of ix86_set_current_function. */ ++static GTY(()) tree ix86_previous_fndecl; ++ ++/* Set targets globals to the default (or current #pragma GCC target ++ if active). Invalidate ix86_previous_fndecl cache. */ ++ ++void ++ix86_reset_previous_fndecl (void) ++{ ++ tree new_tree = target_option_current_node; ++ cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree)); ++ if (TREE_TARGET_GLOBALS (new_tree)) ++ restore_target_globals (TREE_TARGET_GLOBALS (new_tree)); ++ else if (new_tree == target_option_default_node) ++ restore_target_globals (&default_target_globals); ++ else ++ TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts (); ++ ix86_previous_fndecl = NULL_TREE; ++} ++ ++/* Add target attribute to SIMD clone NODE if needed. */ ++ ++void ++ix86_simd_clone_adjust (struct cgraph_node *node) ++{ ++ const char *str = NULL; ++ ++ /* Attributes need to be adjusted for definitions, not declarations. 
*/ ++ if (!node->definition) ++ return; ++ ++ gcc_assert (node->decl == cfun->decl); ++ switch (node->simdclone->vecsize_mangle) ++ { ++ case 'b': ++ if (!TARGET_SSE2) ++ str = "sse2"; ++ break; ++ case 'c': ++ if (!TARGET_AVX) ++ str = "avx"; ++ break; ++ case 'd': ++ if (!TARGET_AVX2) ++ str = "avx2"; ++ break; ++ case 'e': ++ if (!TARGET_AVX512F) ++ str = "avx512f"; ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ if (str == NULL) ++ return; ++ push_cfun (NULL); ++ tree args = build_tree_list (NULL_TREE, build_string (strlen (str), str)); ++ bool ok = ix86_valid_target_attribute_p (node->decl, NULL, args, 0); ++ gcc_assert (ok); ++ pop_cfun (); ++ ix86_reset_previous_fndecl (); ++ ix86_set_current_function (node->decl); ++} ++ ++ ++ ++/* Set the func_type field from the function FNDECL. */ ++ ++static void ++ix86_set_func_type (tree fndecl) ++{ ++ if (cfun->machine->func_type == TYPE_UNKNOWN) ++ { ++ if (lookup_attribute ("interrupt", ++ TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))) ++ { ++ if (ix86_function_naked (fndecl)) ++ error_at (DECL_SOURCE_LOCATION (fndecl), ++ "interrupt and naked attributes are not compatible"); ++ ++ int nargs = 0; ++ for (tree arg = DECL_ARGUMENTS (fndecl); ++ arg; ++ arg = TREE_CHAIN (arg)) ++ nargs++; ++ cfun->machine->no_caller_saved_registers = true; ++ cfun->machine->func_type ++ = nargs == 2 ? TYPE_EXCEPTION : TYPE_INTERRUPT; ++ ++ ix86_optimize_mode_switching[X86_DIRFLAG] = 1; ++ ++ /* Only dwarf2out.c can handle -WORD(AP) as a pointer argument. */ ++ if (write_symbols != NO_DEBUG && write_symbols != DWARF2_DEBUG) ++ sorry ("only DWARF debug format is supported for interrupt " ++ "service routine"); ++ } ++ else ++ { ++ cfun->machine->func_type = TYPE_NORMAL; ++ if (lookup_attribute ("no_caller_saved_registers", ++ TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))) ++ cfun->machine->no_caller_saved_registers = true; ++ } ++ } ++} ++ ++/* Set the indirect_branch_type field from the function FNDECL. */ ++ ++static void ++ix86_set_indirect_branch_type (tree fndecl) ++{ ++ if (cfun->machine->indirect_branch_type == indirect_branch_unset) ++ { ++ tree attr = lookup_attribute ("indirect_branch", ++ DECL_ATTRIBUTES (fndecl)); ++ if (attr != NULL) ++ { ++ tree args = TREE_VALUE (attr); ++ if (args == NULL) ++ gcc_unreachable (); ++ tree cst = TREE_VALUE (args); ++ if (strcmp (TREE_STRING_POINTER (cst), "keep") == 0) ++ cfun->machine->indirect_branch_type = indirect_branch_keep; ++ else if (strcmp (TREE_STRING_POINTER (cst), "thunk") == 0) ++ cfun->machine->indirect_branch_type = indirect_branch_thunk; ++ else if (strcmp (TREE_STRING_POINTER (cst), "thunk-inline") == 0) ++ cfun->machine->indirect_branch_type = indirect_branch_thunk_inline; ++ else if (strcmp (TREE_STRING_POINTER (cst), "thunk-extern") == 0) ++ cfun->machine->indirect_branch_type = indirect_branch_thunk_extern; ++ else ++ gcc_unreachable (); ++ } ++ else ++ cfun->machine->indirect_branch_type = ix86_indirect_branch; ++ ++ /* -mcmodel=large is not compatible with -mindirect-branch=thunk ++ nor -mindirect-branch=thunk-extern. */ ++ if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC) ++ && ((cfun->machine->indirect_branch_type ++ == indirect_branch_thunk_extern) ++ || (cfun->machine->indirect_branch_type ++ == indirect_branch_thunk))) ++ error ("%<-mindirect-branch=%s%> and %<-mcmodel=large%> are not " ++ "compatible", ++ ((cfun->machine->indirect_branch_type ++ == indirect_branch_thunk_extern) ++ ? 
"thunk-extern" : "thunk")); ++ ++ if (cfun->machine->indirect_branch_type != indirect_branch_keep ++ && (flag_cf_protection & CF_RETURN)) ++ error ("%<-mindirect-branch%> and %<-fcf-protection%> are not " ++ "compatible"); ++ } ++ ++ if (cfun->machine->function_return_type == indirect_branch_unset) ++ { ++ tree attr = lookup_attribute ("function_return", ++ DECL_ATTRIBUTES (fndecl)); ++ if (attr != NULL) ++ { ++ tree args = TREE_VALUE (attr); ++ if (args == NULL) ++ gcc_unreachable (); ++ tree cst = TREE_VALUE (args); ++ if (strcmp (TREE_STRING_POINTER (cst), "keep") == 0) ++ cfun->machine->function_return_type = indirect_branch_keep; ++ else if (strcmp (TREE_STRING_POINTER (cst), "thunk") == 0) ++ cfun->machine->function_return_type = indirect_branch_thunk; ++ else if (strcmp (TREE_STRING_POINTER (cst), "thunk-inline") == 0) ++ cfun->machine->function_return_type = indirect_branch_thunk_inline; ++ else if (strcmp (TREE_STRING_POINTER (cst), "thunk-extern") == 0) ++ cfun->machine->function_return_type = indirect_branch_thunk_extern; ++ else ++ gcc_unreachable (); ++ } ++ else ++ cfun->machine->function_return_type = ix86_function_return; ++ ++ /* -mcmodel=large is not compatible with -mfunction-return=thunk ++ nor -mfunction-return=thunk-extern. */ ++ if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC) ++ && ((cfun->machine->function_return_type ++ == indirect_branch_thunk_extern) ++ || (cfun->machine->function_return_type ++ == indirect_branch_thunk))) ++ error ("%<-mfunction-return=%s%> and %<-mcmodel=large%> are not " ++ "compatible", ++ ((cfun->machine->function_return_type ++ == indirect_branch_thunk_extern) ++ ? "thunk-extern" : "thunk")); ++ ++ if (cfun->machine->function_return_type != indirect_branch_keep ++ && (flag_cf_protection & CF_RETURN)) ++ error ("%<-mfunction-return%> and %<-fcf-protection%> are not " ++ "compatible"); ++ } ++} ++ ++/* Establish appropriate back-end context for processing the function ++ FNDECL. The argument might be NULL to indicate processing at top ++ level, outside of any function scope. */ ++void ++ix86_set_current_function (tree fndecl) ++{ ++ /* Only change the context if the function changes. This hook is called ++ several times in the course of compiling a function, and we don't want to ++ slow things down too much or call target_reinit when it isn't safe. */ ++ if (fndecl == ix86_previous_fndecl) ++ { ++ /* There may be 2 function bodies for the same function FNDECL, ++ one is extern inline and one isn't. Call ix86_set_func_type ++ to set the func_type field. 
*/ ++ if (fndecl != NULL_TREE) ++ { ++ ix86_set_func_type (fndecl); ++ ix86_set_indirect_branch_type (fndecl); ++ } ++ return; ++ } ++ ++ tree old_tree; ++ if (ix86_previous_fndecl == NULL_TREE) ++ old_tree = target_option_current_node; ++ else if (DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)) ++ old_tree = DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl); ++ else ++ old_tree = target_option_default_node; ++ ++ if (fndecl == NULL_TREE) ++ { ++ if (old_tree != target_option_current_node) ++ ix86_reset_previous_fndecl (); ++ return; ++ } ++ ++ ix86_set_func_type (fndecl); ++ ix86_set_indirect_branch_type (fndecl); ++ ++ tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl); ++ if (new_tree == NULL_TREE) ++ new_tree = target_option_default_node; ++ ++ if (old_tree != new_tree) ++ { ++ cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree)); ++ if (TREE_TARGET_GLOBALS (new_tree)) ++ restore_target_globals (TREE_TARGET_GLOBALS (new_tree)); ++ else if (new_tree == target_option_default_node) ++ restore_target_globals (&default_target_globals); ++ else ++ TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts (); ++ } ++ ix86_previous_fndecl = fndecl; ++ ++ static bool prev_no_caller_saved_registers; ++ ++ /* 64-bit MS and SYSV ABI have different set of call used registers. ++ Avoid expensive re-initialization of init_regs each time we switch ++ function context. */ ++ if (TARGET_64BIT ++ && (call_used_or_fixed_reg_p (SI_REG) ++ == (cfun->machine->call_abi == MS_ABI))) ++ reinit_regs (); ++ /* Need to re-initialize init_regs if caller-saved registers are ++ changed. */ ++ else if (prev_no_caller_saved_registers ++ != cfun->machine->no_caller_saved_registers) ++ reinit_regs (); ++ ++ if (cfun->machine->func_type != TYPE_NORMAL ++ || cfun->machine->no_caller_saved_registers) ++ { ++ /* Don't allow SSE, MMX nor x87 instructions since they ++ may change processor state. */ ++ const char *isa; ++ if (TARGET_SSE) ++ isa = "SSE"; ++ else if (TARGET_MMX) ++ isa = "MMX/3Dnow"; ++ else if (TARGET_80387) ++ isa = "80387"; ++ else ++ isa = NULL; ++ if (isa != NULL) ++ { ++ if (cfun->machine->func_type != TYPE_NORMAL) ++ sorry (cfun->machine->func_type == TYPE_EXCEPTION ++ ? G_("%s instructions aren%'t allowed in an" ++ " exception service routine") ++ : G_("%s instructions aren%'t allowed in an" ++ " interrupt service routine"), ++ isa); ++ else ++ sorry ("%s instructions aren%'t allowed in a function with " ++ "the % attribute", isa); ++ /* Don't issue the same error twice. */ ++ cfun->machine->func_type = TYPE_NORMAL; ++ cfun->machine->no_caller_saved_registers = false; ++ } ++ } ++ ++ prev_no_caller_saved_registers ++ = cfun->machine->no_caller_saved_registers; ++} ++ ++/* Implement the TARGET_OFFLOAD_OPTIONS hook. */ ++char * ++ix86_offload_options (void) ++{ ++ if (TARGET_LP64) ++ return xstrdup ("-foffload-abi=lp64"); ++ return xstrdup ("-foffload-abi=ilp32"); ++} ++ ++/* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall", ++ and "sseregparm" calling convention attributes; ++ arguments as in struct attribute_spec.handler. 
*/ ++ ++static tree ++ix86_handle_cconv_attribute (tree *node, tree name, tree args, int, ++ bool *no_add_attrs) ++{ ++ if (TREE_CODE (*node) != FUNCTION_TYPE ++ && TREE_CODE (*node) != METHOD_TYPE ++ && TREE_CODE (*node) != FIELD_DECL ++ && TREE_CODE (*node) != TYPE_DECL) ++ { ++ warning (OPT_Wattributes, "%qE attribute only applies to functions", ++ name); ++ *no_add_attrs = true; ++ return NULL_TREE; ++ } ++ ++ /* Can combine regparm with all attributes but fastcall, and thiscall. */ ++ if (is_attribute_p ("regparm", name)) ++ { ++ tree cst; ++ ++ if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node))) ++ { ++ error ("fastcall and regparm attributes are not compatible"); ++ } ++ ++ if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node))) ++ { ++ error ("regparam and thiscall attributes are not compatible"); ++ } ++ ++ cst = TREE_VALUE (args); ++ if (TREE_CODE (cst) != INTEGER_CST) ++ { ++ warning (OPT_Wattributes, ++ "%qE attribute requires an integer constant argument", ++ name); ++ *no_add_attrs = true; ++ } ++ else if (compare_tree_int (cst, REGPARM_MAX) > 0) ++ { ++ warning (OPT_Wattributes, "argument to %qE attribute larger than %d", ++ name, REGPARM_MAX); ++ *no_add_attrs = true; ++ } ++ ++ return NULL_TREE; ++ } ++ ++ if (TARGET_64BIT) ++ { ++ /* Do not warn when emulating the MS ABI. */ ++ if ((TREE_CODE (*node) != FUNCTION_TYPE ++ && TREE_CODE (*node) != METHOD_TYPE) ++ || ix86_function_type_abi (*node) != MS_ABI) ++ warning (OPT_Wattributes, "%qE attribute ignored", ++ name); ++ *no_add_attrs = true; ++ return NULL_TREE; ++ } ++ ++ /* Can combine fastcall with stdcall (redundant) and sseregparm. */ ++ if (is_attribute_p ("fastcall", name)) ++ { ++ if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node))) ++ { ++ error ("fastcall and cdecl attributes are not compatible"); ++ } ++ if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node))) ++ { ++ error ("fastcall and stdcall attributes are not compatible"); ++ } ++ if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node))) ++ { ++ error ("fastcall and regparm attributes are not compatible"); ++ } ++ if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node))) ++ { ++ error ("fastcall and thiscall attributes are not compatible"); ++ } ++ } ++ ++ /* Can combine stdcall with fastcall (redundant), regparm and ++ sseregparm. */ ++ else if (is_attribute_p ("stdcall", name)) ++ { ++ if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node))) ++ { ++ error ("stdcall and cdecl attributes are not compatible"); ++ } ++ if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node))) ++ { ++ error ("stdcall and fastcall attributes are not compatible"); ++ } ++ if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node))) ++ { ++ error ("stdcall and thiscall attributes are not compatible"); ++ } ++ } ++ ++ /* Can combine cdecl with regparm and sseregparm. 
*/ ++ else if (is_attribute_p ("cdecl", name)) ++ { ++ if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node))) ++ { ++ error ("stdcall and cdecl attributes are not compatible"); ++ } ++ if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node))) ++ { ++ error ("fastcall and cdecl attributes are not compatible"); ++ } ++ if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node))) ++ { ++ error ("cdecl and thiscall attributes are not compatible"); ++ } ++ } ++ else if (is_attribute_p ("thiscall", name)) ++ { ++ if (TREE_CODE (*node) != METHOD_TYPE && pedantic) ++ warning (OPT_Wattributes, "%qE attribute is used for non-class method", ++ name); ++ if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node))) ++ { ++ error ("stdcall and thiscall attributes are not compatible"); ++ } ++ if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node))) ++ { ++ error ("fastcall and thiscall attributes are not compatible"); ++ } ++ if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node))) ++ { ++ error ("cdecl and thiscall attributes are not compatible"); ++ } ++ } ++ ++ /* Can combine sseregparm with all attributes. */ ++ ++ return NULL_TREE; ++} ++ ++#ifndef CHECK_STACK_LIMIT ++#define CHECK_STACK_LIMIT (-1) ++#endif ++ ++/* The transactional memory builtins are implicitly regparm or fastcall ++ depending on the ABI. Override the generic do-nothing attribute that ++ these builtins were declared with, and replace it with one of the two ++ attributes that we expect elsewhere. */ ++ ++static tree ++ix86_handle_tm_regparm_attribute (tree *node, tree, tree, ++ int flags, bool *no_add_attrs) ++{ ++ tree alt; ++ ++ /* In no case do we want to add the placeholder attribute. */ ++ *no_add_attrs = true; ++ ++ /* The 64-bit ABI is unchanged for transactional memory. */ ++ if (TARGET_64BIT) ++ return NULL_TREE; ++ ++ /* ??? Is there a better way to validate 32-bit windows? We have ++ cfun->machine->call_abi, but that seems to be set only for 64-bit. */ ++ if (CHECK_STACK_LIMIT > 0) ++ alt = tree_cons (get_identifier ("fastcall"), NULL, NULL); ++ else ++ { ++ alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL); ++ alt = tree_cons (get_identifier ("regparm"), alt, NULL); ++ } ++ decl_attributes (node, alt, flags); ++ ++ return NULL_TREE; ++} ++ ++/* Handle a "force_align_arg_pointer" attribute. */ ++ ++static tree ++ix86_handle_force_align_arg_pointer_attribute (tree *node, tree name, ++ tree, int, bool *no_add_attrs) ++{ ++ if (TREE_CODE (*node) != FUNCTION_TYPE ++ && TREE_CODE (*node) != METHOD_TYPE ++ && TREE_CODE (*node) != FIELD_DECL ++ && TREE_CODE (*node) != TYPE_DECL) ++ { ++ warning (OPT_Wattributes, "%qE attribute only applies to functions", ++ name); ++ *no_add_attrs = true; ++ } ++ ++ return NULL_TREE; ++} ++ ++/* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in ++ struct attribute_spec.handler. 
*/ ++ ++static tree ++ix86_handle_struct_attribute (tree *node, tree name, tree, int, ++ bool *no_add_attrs) ++{ ++ tree *type = NULL; ++ if (DECL_P (*node)) ++ { ++ if (TREE_CODE (*node) == TYPE_DECL) ++ type = &TREE_TYPE (*node); ++ } ++ else ++ type = node; ++ ++ if (!(type && RECORD_OR_UNION_TYPE_P (*type))) ++ { ++ warning (OPT_Wattributes, "%qE attribute ignored", ++ name); ++ *no_add_attrs = true; ++ } ++ ++ else if ((is_attribute_p ("ms_struct", name) ++ && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type))) ++ || ((is_attribute_p ("gcc_struct", name) ++ && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type))))) ++ { ++ warning (OPT_Wattributes, "%qE incompatible attribute ignored", ++ name); ++ *no_add_attrs = true; ++ } ++ ++ return NULL_TREE; ++} ++ ++/* Handle a "callee_pop_aggregate_return" attribute; arguments as ++ in struct attribute_spec handler. */ ++ ++static tree ++ix86_handle_callee_pop_aggregate_return (tree *node, tree name, tree args, int, ++ bool *no_add_attrs) ++{ ++ if (TREE_CODE (*node) != FUNCTION_TYPE ++ && TREE_CODE (*node) != METHOD_TYPE ++ && TREE_CODE (*node) != FIELD_DECL ++ && TREE_CODE (*node) != TYPE_DECL) ++ { ++ warning (OPT_Wattributes, "%qE attribute only applies to functions", ++ name); ++ *no_add_attrs = true; ++ return NULL_TREE; ++ } ++ if (TARGET_64BIT) ++ { ++ warning (OPT_Wattributes, "%qE attribute only available for 32-bit", ++ name); ++ *no_add_attrs = true; ++ return NULL_TREE; ++ } ++ if (is_attribute_p ("callee_pop_aggregate_return", name)) ++ { ++ tree cst; ++ ++ cst = TREE_VALUE (args); ++ if (TREE_CODE (cst) != INTEGER_CST) ++ { ++ warning (OPT_Wattributes, ++ "%qE attribute requires an integer constant argument", ++ name); ++ *no_add_attrs = true; ++ } ++ else if (compare_tree_int (cst, 0) != 0 ++ && compare_tree_int (cst, 1) != 0) ++ { ++ warning (OPT_Wattributes, ++ "argument to %qE attribute is neither zero, nor one", ++ name); ++ *no_add_attrs = true; ++ } ++ ++ return NULL_TREE; ++ } ++ ++ return NULL_TREE; ++} ++ ++/* Handle a "ms_abi" or "sysv" attribute; arguments as in ++ struct attribute_spec.handler. */ ++ ++static tree ++ix86_handle_abi_attribute (tree *node, tree name, tree, int, ++ bool *no_add_attrs) ++{ ++ if (TREE_CODE (*node) != FUNCTION_TYPE ++ && TREE_CODE (*node) != METHOD_TYPE ++ && TREE_CODE (*node) != FIELD_DECL ++ && TREE_CODE (*node) != TYPE_DECL) ++ { ++ warning (OPT_Wattributes, "%qE attribute only applies to functions", ++ name); ++ *no_add_attrs = true; ++ return NULL_TREE; ++ } ++ ++ /* Can combine regparm with all attributes but fastcall. 
*/ ++ if (is_attribute_p ("ms_abi", name)) ++ { ++ if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node))) ++ { ++ error ("%qs and %qs attributes are not compatible", ++ "ms_abi", "sysv_abi"); ++ } ++ ++ return NULL_TREE; ++ } ++ else if (is_attribute_p ("sysv_abi", name)) ++ { ++ if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node))) ++ { ++ error ("%qs and %qs attributes are not compatible", ++ "ms_abi", "sysv_abi"); ++ } ++ ++ return NULL_TREE; ++ } ++ ++ return NULL_TREE; ++} ++ ++static tree ++ix86_handle_fndecl_attribute (tree *node, tree name, tree args, int, ++ bool *no_add_attrs) ++{ ++ if (TREE_CODE (*node) != FUNCTION_DECL) ++ { ++ warning (OPT_Wattributes, "%qE attribute only applies to functions", ++ name); ++ *no_add_attrs = true; ++ } ++ ++ if (is_attribute_p ("indirect_branch", name)) ++ { ++ tree cst = TREE_VALUE (args); ++ if (TREE_CODE (cst) != STRING_CST) ++ { ++ warning (OPT_Wattributes, ++ "%qE attribute requires a string constant argument", ++ name); ++ *no_add_attrs = true; ++ } ++ else if (strcmp (TREE_STRING_POINTER (cst), "keep") != 0 ++ && strcmp (TREE_STRING_POINTER (cst), "thunk") != 0 ++ && strcmp (TREE_STRING_POINTER (cst), "thunk-inline") != 0 ++ && strcmp (TREE_STRING_POINTER (cst), "thunk-extern") != 0) ++ { ++ warning (OPT_Wattributes, ++ "argument to %qE attribute is not " ++ "(keep|thunk|thunk-inline|thunk-extern)", name); ++ *no_add_attrs = true; ++ } ++ } ++ ++ if (is_attribute_p ("function_return", name)) ++ { ++ tree cst = TREE_VALUE (args); ++ if (TREE_CODE (cst) != STRING_CST) ++ { ++ warning (OPT_Wattributes, ++ "%qE attribute requires a string constant argument", ++ name); ++ *no_add_attrs = true; ++ } ++ else if (strcmp (TREE_STRING_POINTER (cst), "keep") != 0 ++ && strcmp (TREE_STRING_POINTER (cst), "thunk") != 0 ++ && strcmp (TREE_STRING_POINTER (cst), "thunk-inline") != 0 ++ && strcmp (TREE_STRING_POINTER (cst), "thunk-extern") != 0) ++ { ++ warning (OPT_Wattributes, ++ "argument to %qE attribute is not " ++ "(keep|thunk|thunk-inline|thunk-extern)", name); ++ *no_add_attrs = true; ++ } ++ } ++ ++ return NULL_TREE; ++} ++ ++static tree ++ix86_handle_no_caller_saved_registers_attribute (tree *, tree, tree, ++ int, bool *) ++{ ++ return NULL_TREE; ++} ++ ++static tree ++ix86_handle_interrupt_attribute (tree *node, tree, tree, int, bool *) ++{ ++ /* DECL_RESULT and DECL_ARGUMENTS do not exist there yet, ++ but the function type contains args and return type data. */ ++ tree func_type = *node; ++ tree return_type = TREE_TYPE (func_type); ++ ++ int nargs = 0; ++ tree current_arg_type = TYPE_ARG_TYPES (func_type); ++ while (current_arg_type ++ && ! VOID_TYPE_P (TREE_VALUE (current_arg_type))) ++ { ++ if (nargs == 0) ++ { ++ if (! POINTER_TYPE_P (TREE_VALUE (current_arg_type))) ++ error ("interrupt service routine should have a pointer " ++ "as the first argument"); ++ } ++ else if (nargs == 1) ++ { ++ if (TREE_CODE (TREE_VALUE (current_arg_type)) != INTEGER_TYPE ++ || TYPE_MODE (TREE_VALUE (current_arg_type)) != word_mode) ++ error ("interrupt service routine should have %qs " ++ "as the second argument", ++ TARGET_64BIT ++ ? (TARGET_X32 ? "unsigned long long int" ++ : "unsigned long int") ++ : "unsigned int"); ++ } ++ nargs++; ++ current_arg_type = TREE_CHAIN (current_arg_type); ++ } ++ if (!nargs || nargs > 2) ++ error ("interrupt service routine can only have a pointer argument " ++ "and an optional integer argument"); ++ if (! 
VOID_TYPE_P (return_type)) ++ error ("interrupt service routine must return %"); ++ ++ return NULL_TREE; ++} ++ ++/* Handle fentry_name / fentry_section attribute. */ ++ ++static tree ++ix86_handle_fentry_name (tree *node, tree name, tree args, ++ int, bool *no_add_attrs) ++{ ++ if (TREE_CODE (*node) == FUNCTION_DECL ++ && TREE_CODE (TREE_VALUE (args)) == STRING_CST) ++ /* Do nothing else, just set the attribute. We'll get at ++ it later with lookup_attribute. */ ++ ; ++ else ++ { ++ warning (OPT_Wattributes, "%qE attribute ignored", name); ++ *no_add_attrs = true; ++ } ++ ++ return NULL_TREE; ++} ++ ++/* Table of valid machine attributes. */ ++const struct attribute_spec ix86_attribute_table[] = ++{ ++ /* { name, min_len, max_len, decl_req, type_req, fn_type_req, ++ affects_type_identity, handler, exclude } */ ++ /* Stdcall attribute says callee is responsible for popping arguments ++ if they are not variable. */ ++ { "stdcall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute, ++ NULL }, ++ /* Fastcall attribute says callee is responsible for popping arguments ++ if they are not variable. */ ++ { "fastcall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute, ++ NULL }, ++ /* Thiscall attribute says callee is responsible for popping arguments ++ if they are not variable. */ ++ { "thiscall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute, ++ NULL }, ++ /* Cdecl attribute says the callee is a normal C declaration */ ++ { "cdecl", 0, 0, false, true, true, true, ix86_handle_cconv_attribute, ++ NULL }, ++ /* Regparm attribute specifies how many integer arguments are to be ++ passed in registers. */ ++ { "regparm", 1, 1, false, true, true, true, ix86_handle_cconv_attribute, ++ NULL }, ++ /* Sseregparm attribute says we are using x86_64 calling conventions ++ for FP arguments. */ ++ { "sseregparm", 0, 0, false, true, true, true, ix86_handle_cconv_attribute, ++ NULL }, ++ /* The transactional memory builtins are implicitly regparm or fastcall ++ depending on the ABI. Override the generic do-nothing attribute that ++ these builtins were declared with. */ ++ { "*tm regparm", 0, 0, false, true, true, true, ++ ix86_handle_tm_regparm_attribute, NULL }, ++ /* force_align_arg_pointer says this function realigns the stack at entry. */ ++ { "force_align_arg_pointer", 0, 0, ++ false, true, true, false, ix86_handle_force_align_arg_pointer_attribute, ++ NULL }, ++#if TARGET_DLLIMPORT_DECL_ATTRIBUTES ++ { "dllimport", 0, 0, false, false, false, false, handle_dll_attribute, ++ NULL }, ++ { "dllexport", 0, 0, false, false, false, false, handle_dll_attribute, ++ NULL }, ++ { "shared", 0, 0, true, false, false, false, ++ ix86_handle_shared_attribute, NULL }, ++#endif ++ { "ms_struct", 0, 0, false, false, false, false, ++ ix86_handle_struct_attribute, NULL }, ++ { "gcc_struct", 0, 0, false, false, false, false, ++ ix86_handle_struct_attribute, NULL }, ++#ifdef SUBTARGET_ATTRIBUTE_TABLE ++ SUBTARGET_ATTRIBUTE_TABLE, ++#endif ++ /* ms_abi and sysv_abi calling convention function attributes. 
*/ ++ { "ms_abi", 0, 0, false, true, true, true, ix86_handle_abi_attribute, NULL }, ++ { "sysv_abi", 0, 0, false, true, true, true, ix86_handle_abi_attribute, ++ NULL }, ++ { "ms_abi va_list", 0, 0, false, false, false, false, NULL, NULL }, ++ { "sysv_abi va_list", 0, 0, false, false, false, false, NULL, NULL }, ++ { "ms_hook_prologue", 0, 0, true, false, false, false, ++ ix86_handle_fndecl_attribute, NULL }, ++ { "callee_pop_aggregate_return", 1, 1, false, true, true, true, ++ ix86_handle_callee_pop_aggregate_return, NULL }, ++ { "interrupt", 0, 0, false, true, true, false, ++ ix86_handle_interrupt_attribute, NULL }, ++ { "no_caller_saved_registers", 0, 0, false, true, true, false, ++ ix86_handle_no_caller_saved_registers_attribute, NULL }, ++ { "naked", 0, 0, true, false, false, false, ++ ix86_handle_fndecl_attribute, NULL }, ++ { "indirect_branch", 1, 1, true, false, false, false, ++ ix86_handle_fndecl_attribute, NULL }, ++ { "function_return", 1, 1, true, false, false, false, ++ ix86_handle_fndecl_attribute, NULL }, ++ { "indirect_return", 0, 0, false, true, true, false, ++ NULL, NULL }, ++ { "fentry_name", 1, 1, true, false, false, false, ++ ix86_handle_fentry_name, NULL }, ++ { "fentry_section", 1, 1, true, false, false, false, ++ ix86_handle_fentry_name, NULL }, ++ { "cf_check", 0, 0, true, false, false, false, ++ ix86_handle_fndecl_attribute, NULL }, ++ ++ /* End element. */ ++ { NULL, 0, 0, false, false, false, false, NULL, NULL } ++}; ++ ++#include "gt-i386-options.h" +diff --git a/gcc/config/i386/i386-options.h b/gcc/config/i386/i386-options.h +new file mode 100644 +index 000000000..817ddda5c +--- /dev/null ++++ b/gcc/config/i386/i386-options.h +@@ -0,0 +1,95 @@ ++/* Copyright (C) 1988-2019 Free Software Foundation, Inc. ++ ++This file is part of GCC. ++ ++GCC is free software; you can redistribute it and/or modify ++it under the terms of the GNU General Public License as published by ++the Free Software Foundation; either version 3, or (at your option) ++any later version. ++ ++GCC is distributed in the hope that it will be useful, ++but WITHOUT ANY WARRANTY; without even the implied warranty of ++MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++GNU General Public License for more details. ++ ++You should have received a copy of the GNU General Public License ++along with GCC; see the file COPYING3. If not see ++. 
*/ ++ ++#ifndef GCC_I386_OPTIONS_H ++#define GCC_I386_OPTIONS_H ++ ++char *ix86_target_string (HOST_WIDE_INT isa, HOST_WIDE_INT isa2, ++ int flags, int flags2, ++ const char *arch, const char *tune, ++ enum fpmath_unit fpmath, bool add_nl_p, ++ bool add_abi_p); ++ ++extern enum attr_cpu ix86_schedule; ++ ++extern enum processor_type ix86_tune; ++extern enum processor_type ix86_arch; ++extern unsigned char x86_prefetch_sse; ++extern const struct processor_costs *ix86_tune_cost; ++ ++extern int ix86_tune_defaulted; ++extern int ix86_arch_specified; ++ ++extern unsigned int ix86_default_incoming_stack_boundary; ++extern HOST_WIDE_INT deferred_isa_values; ++extern HOST_WIDE_INT deferred_isa_values2; ++ ++extern unsigned int ix86_preferred_stack_boundary; ++extern unsigned int ix86_user_incoming_stack_boundary; ++extern unsigned int ix86_default_incoming_stack_boundary; ++extern unsigned int ix86_incoming_stack_boundary; ++ ++extern char *ix86_offload_options (void); ++extern void ix86_option_override (void); ++extern void ix86_override_options_after_change (void); ++void ix86_set_current_function (tree fndecl); ++bool ix86_function_naked (const_tree fn); ++void ix86_simd_clone_adjust (struct cgraph_node *node); ++ ++extern tree (*ix86_veclib_handler) (combined_fn, tree, tree); ++extern tree ix86_veclibabi_svml (combined_fn, tree, tree); ++extern tree ix86_veclibabi_acml (combined_fn, tree, tree); ++ ++extern rtx (*ix86_gen_leave) (void); ++extern rtx (*ix86_gen_add3) (rtx, rtx, rtx); ++extern rtx (*ix86_gen_sub3) (rtx, rtx, rtx); ++extern rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx); ++extern rtx (*ix86_gen_one_cmpl2) (rtx, rtx); ++extern rtx (*ix86_gen_monitor) (rtx, rtx, rtx); ++extern rtx (*ix86_gen_monitorx) (rtx, rtx, rtx); ++extern rtx (*ix86_gen_clzero) (rtx); ++extern rtx (*ix86_gen_andsp) (rtx, rtx, rtx); ++extern rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx); ++extern rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx); ++extern rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx); ++extern rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx); ++extern rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx); ++ ++enum ix86_function_specific_strings ++{ ++ IX86_FUNCTION_SPECIFIC_ARCH, ++ IX86_FUNCTION_SPECIFIC_TUNE, ++ IX86_FUNCTION_SPECIFIC_MAX ++}; ++ ++extern const char *stringop_alg_names[]; ++ ++void ix86_add_new_builtins (HOST_WIDE_INT isa, HOST_WIDE_INT isa2); ++void ix86_function_specific_save (struct cl_target_option *, ++ struct gcc_options *opts); ++void ix86_function_specific_restore (struct gcc_options *opts, ++ struct cl_target_option *); ++void ix86_function_specific_post_stream_in (struct cl_target_option *); ++void ix86_function_specific_print (FILE *, int, ++ struct cl_target_option *); ++bool ix86_valid_target_attribute_p (tree, tree, tree, int); ++ ++extern const struct attribute_spec ix86_attribute_table[]; ++ ++ ++#endif /* GCC_I386_OPTIONS_H */ +diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h +index 83645e89a..4afba5bc2 100644 +--- a/gcc/config/i386/i386-protos.h ++++ b/gcc/config/i386/i386-protos.h +@@ -65,7 +65,7 @@ extern int avx_vpermilp_parallel (rtx par, machine_mode mode); + extern int avx_vperm2f128_parallel (rtx par, machine_mode mode); + + extern bool ix86_expand_strlen (rtx, rtx, rtx, rtx); +-extern bool ix86_expand_set_or_movmem (rtx, rtx, rtx, rtx, rtx, rtx, ++extern bool ix86_expand_set_or_cpymem (rtx, rtx, rtx, rtx, rtx, rtx, + rtx, rtx, rtx, rtx, bool); + + extern bool constant_address_p (rtx); +@@ -207,7 
+207,7 @@ extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int); + #endif /* RTX_CODE */ + + #ifdef TREE_CODE +-extern int ix86_data_alignment (tree, int, bool); ++extern int ix86_data_alignment (tree, unsigned int, bool); + extern unsigned int ix86_local_alignment (tree, machine_mode, + unsigned int); + extern unsigned int ix86_minimum_alignment (tree, machine_mode, +@@ -215,9 +215,9 @@ extern unsigned int ix86_minimum_alignment (tree, machine_mode, + extern tree ix86_handle_shared_attribute (tree *, tree, tree, int, bool *); + extern tree ix86_handle_selectany_attribute (tree *, tree, tree, int, bool *); + extern int x86_field_alignment (tree, int); +-extern tree ix86_valid_target_attribute_tree (tree, ++extern tree ix86_valid_target_attribute_tree (tree, tree, + struct gcc_options *, +- struct gcc_options *); ++ struct gcc_options *, bool); + extern unsigned int ix86_get_callcvt (const_tree); + + #endif +diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c +index 5a0f8a0eb..9282a8fb6 100644 +--- a/gcc/config/i386/i386.c ++++ b/gcc/config/i386/i386.c +@@ -91,19 +91,17 @@ along with GCC; see the file COPYING3. If not see + #include "tree-vector-builder.h" + #include "debug.h" + #include "dwarf2out.h" ++#include "i386-options.h" ++#include "i386-builtins.h" ++#include "i386-expand.h" ++#include "i386-features.h" + + /* This file should be included last. */ + #include "target-def.h" + +-#include "x86-tune-costs.h" +- + static rtx legitimize_dllimport_symbol (rtx, bool); + static rtx legitimize_pe_coff_extern_decl (rtx, bool); +-static rtx legitimize_pe_coff_symbol (rtx, bool); + static void ix86_print_operand_address_as (FILE *, rtx, addr_space_t, bool); +-static bool ix86_save_reg (unsigned int, bool, bool); +-static bool ix86_function_naked (const_tree); +-static bool ix86_notrack_prefixed_insn_p (rtx); + static void ix86_emit_restore_reg_using_pop (rtx); + + +@@ -126,102 +124,6 @@ const struct processor_costs *ix86_tune_cost = NULL; + /* Set by -mtune or -Os. */ + const struct processor_costs *ix86_cost = NULL; + +-/* Processor feature/optimization bitmasks. */ +-#define m_386 (HOST_WIDE_INT_1U<machine->call_ms2sysv_extra_regs and +- 3.) rather or not stack alignment is being performed. */ +- static rtx get_stub_rtx (enum xlogue_stub stub); +- +- /* Returns the amount of stack space (including padding) that the stub +- needs to store registers based upon data in the machine_function. */ +- HOST_WIDE_INT get_stack_space_used () const +- { +- const struct machine_function *m = cfun->machine; +- unsigned last_reg = m->call_ms2sysv_extra_regs + MIN_REGS - 1; +- +- gcc_assert (m->call_ms2sysv_extra_regs <= MAX_EXTRA_REGS); +- return m_regs[last_reg].offset + STUB_INDEX_OFFSET; +- } +- +- /* Returns the offset for the base pointer used by the stub. 
*/ +- HOST_WIDE_INT get_stub_ptr_offset () const +- { +- return STUB_INDEX_OFFSET + m_stack_align_off_in; +- } +- +- static const struct xlogue_layout &get_instance (); +- static unsigned count_stub_managed_regs (); +- static bool is_stub_managed_reg (unsigned regno, unsigned count); +- +- static const HOST_WIDE_INT STUB_INDEX_OFFSET = 0x70; +- static const unsigned MIN_REGS = NUM_X86_64_MS_CLOBBERED_REGS; +- static const unsigned MAX_REGS = 18; +- static const unsigned MAX_EXTRA_REGS = MAX_REGS - MIN_REGS; +- static const unsigned VARIANT_COUNT = MAX_EXTRA_REGS + 1; +- static const unsigned STUB_NAME_MAX_LEN = 20; +- static const char * const STUB_BASE_NAMES[XLOGUE_STUB_COUNT]; +- static const unsigned REG_ORDER[MAX_REGS]; +- static const unsigned REG_ORDER_REALIGN[MAX_REGS]; +- +-private: +- xlogue_layout (); +- xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp); +- xlogue_layout (const xlogue_layout &); +- +- /* True if hard frame pointer is used. */ +- bool m_hfp; +- +- /* Max number of register this layout manages. */ +- unsigned m_nregs; +- +- /* Incoming offset from 16-byte alignment. */ +- HOST_WIDE_INT m_stack_align_off_in; +- +- /* Register order and offsets. */ +- struct reginfo m_regs[MAX_REGS]; +- +- /* Lazy-inited cache of symbol names for stubs. */ +- static char s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT] +- [STUB_NAME_MAX_LEN]; +- +- static const xlogue_layout s_instances[XLOGUE_SET_COUNT]; +-}; +- +-const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = { +- "savms64", +- "resms64", +- "resms64x", +- "savms64f", +- "resms64f", +- "resms64fx" +-}; +- +-const unsigned xlogue_layout::REG_ORDER[xlogue_layout::MAX_REGS] = { +-/* The below offset values are where each register is stored for the layout +- relative to incoming stack pointer. The value of each m_regs[].offset will +- be relative to the incoming base pointer (rax or rsi) used by the stub. +- +- s_instances: 0 1 2 3 +- Offset: realigned or aligned + 8 +- Register aligned aligned + 8 aligned w/HFP w/HFP */ +- XMM15_REG, /* 0x10 0x18 0x10 0x18 */ +- XMM14_REG, /* 0x20 0x28 0x20 0x28 */ +- XMM13_REG, /* 0x30 0x38 0x30 0x38 */ +- XMM12_REG, /* 0x40 0x48 0x40 0x48 */ +- XMM11_REG, /* 0x50 0x58 0x50 0x58 */ +- XMM10_REG, /* 0x60 0x68 0x60 0x68 */ +- XMM9_REG, /* 0x70 0x78 0x70 0x78 */ +- XMM8_REG, /* 0x80 0x88 0x80 0x88 */ +- XMM7_REG, /* 0x90 0x98 0x90 0x98 */ +- XMM6_REG, /* 0xa0 0xa8 0xa0 0xa8 */ +- SI_REG, /* 0xa8 0xb0 0xa8 0xb0 */ +- DI_REG, /* 0xb0 0xb8 0xb0 0xb8 */ +- BX_REG, /* 0xb8 0xc0 0xb8 0xc0 */ +- BP_REG, /* 0xc0 0xc8 N/A N/A */ +- R12_REG, /* 0xc8 0xd0 0xc0 0xc8 */ +- R13_REG, /* 0xd0 0xd8 0xc8 0xd0 */ +- R14_REG, /* 0xd8 0xe0 0xd0 0xd8 */ +- R15_REG, /* 0xe0 0xe8 0xd8 0xe0 */ +-}; +- +-/* Instantiate static const values. */ +-const HOST_WIDE_INT xlogue_layout::STUB_INDEX_OFFSET; +-const unsigned xlogue_layout::MIN_REGS; +-const unsigned xlogue_layout::MAX_REGS; +-const unsigned xlogue_layout::MAX_EXTRA_REGS; +-const unsigned xlogue_layout::VARIANT_COUNT; +-const unsigned xlogue_layout::STUB_NAME_MAX_LEN; +- +-/* Initialize xlogue_layout::s_stub_names to zero. */ +-char xlogue_layout::s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT] +- [STUB_NAME_MAX_LEN]; +- +-/* Instantiates all xlogue_layout instances. 
*/ +-const xlogue_layout xlogue_layout::s_instances[XLOGUE_SET_COUNT] = { +- xlogue_layout (0, false), +- xlogue_layout (8, false), +- xlogue_layout (0, true), +- xlogue_layout (8, true) +-}; +- +-/* Return an appropriate const instance of xlogue_layout based upon values +- in cfun->machine and crtl. */ +-const struct xlogue_layout & +-xlogue_layout::get_instance () +-{ +- enum xlogue_stub_sets stub_set; +- bool aligned_plus_8 = cfun->machine->call_ms2sysv_pad_in; +- +- if (stack_realign_fp) +- stub_set = XLOGUE_SET_HFP_ALIGNED_OR_REALIGN; +- else if (frame_pointer_needed) +- stub_set = aligned_plus_8 +- ? XLOGUE_SET_HFP_ALIGNED_PLUS_8 +- : XLOGUE_SET_HFP_ALIGNED_OR_REALIGN; +- else +- stub_set = aligned_plus_8 ? XLOGUE_SET_ALIGNED_PLUS_8 : XLOGUE_SET_ALIGNED; +- +- return s_instances[stub_set]; +-} +- +-/* Determine how many clobbered registers can be saved by the stub. +- Returns the count of registers the stub will save and restore. */ +-unsigned +-xlogue_layout::count_stub_managed_regs () +-{ +- bool hfp = frame_pointer_needed || stack_realign_fp; +- unsigned i, count; +- unsigned regno; +- +- for (count = i = MIN_REGS; i < MAX_REGS; ++i) +- { +- regno = REG_ORDER[i]; +- if (regno == BP_REG && hfp) +- continue; +- if (!ix86_save_reg (regno, false, false)) +- break; +- ++count; +- } +- return count; +-} +- +-/* Determine if register REGNO is a stub managed register given the +- total COUNT of stub managed registers. */ +-bool +-xlogue_layout::is_stub_managed_reg (unsigned regno, unsigned count) +-{ +- bool hfp = frame_pointer_needed || stack_realign_fp; +- unsigned i; +- +- for (i = 0; i < count; ++i) +- { +- gcc_assert (i < MAX_REGS); +- if (REG_ORDER[i] == BP_REG && hfp) +- ++count; +- else if (REG_ORDER[i] == regno) +- return true; +- } +- return false; +-} +- +-/* Constructor for xlogue_layout. */ +-xlogue_layout::xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp) +- : m_hfp (hfp) , m_nregs (hfp ? 17 : 18), +- m_stack_align_off_in (stack_align_off_in) +-{ +- HOST_WIDE_INT offset = stack_align_off_in; +- unsigned i, j; +- +- for (i = j = 0; i < MAX_REGS; ++i) +- { +- unsigned regno = REG_ORDER[i]; +- +- if (regno == BP_REG && hfp) +- continue; +- if (SSE_REGNO_P (regno)) +- { +- offset += 16; +- /* Verify that SSE regs are always aligned. */ +- gcc_assert (!((stack_align_off_in + offset) & 15)); +- } +- else +- offset += 8; +- +- m_regs[j].regno = regno; +- m_regs[j++].offset = offset - STUB_INDEX_OFFSET; +- } +- gcc_assert (j == m_nregs); +-} +- +-const char * +-xlogue_layout::get_stub_name (enum xlogue_stub stub, +- unsigned n_extra_regs) +-{ +- const int have_avx = TARGET_AVX; +- char *name = s_stub_names[!!have_avx][stub][n_extra_regs]; +- +- /* Lazy init */ +- if (!*name) +- { +- int res = snprintf (name, STUB_NAME_MAX_LEN, "__%s_%s_%u", +- (have_avx ? "avx" : "sse"), +- STUB_BASE_NAMES[stub], +- MIN_REGS + n_extra_regs); +- gcc_checking_assert (res < (int)STUB_NAME_MAX_LEN); +- } +- +- return name; +-} +- +-/* Return rtx of a symbol ref for the entry point (based upon +- cfun->machine->call_ms2sysv_extra_regs) of the specified stub. */ +-rtx +-xlogue_layout::get_stub_rtx (enum xlogue_stub stub) +-{ +- const unsigned n_extra_regs = cfun->machine->call_ms2sysv_extra_regs; +- gcc_checking_assert (n_extra_regs <= MAX_EXTRA_REGS); +- gcc_assert (stub < XLOGUE_STUB_COUNT); +- gcc_assert (crtl->stack_realign_finalized); +- +- return gen_rtx_SYMBOL_REF (Pmode, get_stub_name (stub, n_extra_regs)); +-} +- + /* Define the structure for the machine field in struct function. 
*/ + + struct GTY(()) stack_local_entry { +@@ -741,41 +349,37 @@ enum processor_type ix86_arch; + /* True if processor has SSE prefetch instruction. */ + unsigned char x86_prefetch_sse; + +-/* -mstackrealign option */ +-static const char ix86_force_align_arg_pointer_string[] +- = "force_align_arg_pointer"; +- +-static rtx (*ix86_gen_leave) (void); +-static rtx (*ix86_gen_add3) (rtx, rtx, rtx); +-static rtx (*ix86_gen_sub3) (rtx, rtx, rtx); +-static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx); +-static rtx (*ix86_gen_one_cmpl2) (rtx, rtx); +-static rtx (*ix86_gen_monitor) (rtx, rtx, rtx); +-static rtx (*ix86_gen_monitorx) (rtx, rtx, rtx); +-static rtx (*ix86_gen_clzero) (rtx); +-static rtx (*ix86_gen_andsp) (rtx, rtx, rtx); +-static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx); +-static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx); +-static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx); +-static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx); +-static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx); ++rtx (*ix86_gen_leave) (void); ++rtx (*ix86_gen_add3) (rtx, rtx, rtx); ++rtx (*ix86_gen_sub3) (rtx, rtx, rtx); ++rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx); ++rtx (*ix86_gen_one_cmpl2) (rtx, rtx); ++rtx (*ix86_gen_monitor) (rtx, rtx, rtx); ++rtx (*ix86_gen_monitorx) (rtx, rtx, rtx); ++rtx (*ix86_gen_clzero) (rtx); ++rtx (*ix86_gen_andsp) (rtx, rtx, rtx); ++rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx); ++rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx); ++rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx); ++rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx); ++rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx); + + /* Preferred alignment for stack boundary in bits. */ + unsigned int ix86_preferred_stack_boundary; + + /* Alignment for incoming stack boundary in bits specified at + command line. */ +-static unsigned int ix86_user_incoming_stack_boundary; ++unsigned int ix86_user_incoming_stack_boundary; + + /* Default alignment for incoming stack boundary in bits. */ +-static unsigned int ix86_default_incoming_stack_boundary; ++unsigned int ix86_default_incoming_stack_boundary; + + /* Alignment for incoming stack boundary in bits. */ + unsigned int ix86_incoming_stack_boundary; + + /* Calling abi specific va_list type nodes. */ +-static GTY(()) tree sysv_va_list_type_node; +-static GTY(()) tree ms_va_list_type_node; ++tree sysv_va_list_type_node; ++tree ms_va_list_type_node; + + /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. 
*/ + char internal_label_prefix[16]; +@@ -813,7 +417,6 @@ static REAL_VALUE_TYPE ext_80387_constants_table [5]; + static bool ext_80387_constants_init; + + +-static struct machine_function * ix86_init_machine_status (void); + static rtx ix86_function_value (const_tree, const_tree, bool); + static bool ix86_function_value_regno_p (const unsigned int); + static unsigned int ix86_function_arg_boundary (machine_mode, +@@ -821,49173 +424,20710 @@ static unsigned int ix86_function_arg_boundary (machine_mode, + static rtx ix86_static_chain (const_tree, bool); + static int ix86_function_regparm (const_tree, const_tree); + static void ix86_compute_frame_layout (void); +-static bool ix86_expand_vector_init_one_nonzero (bool, machine_mode, +- rtx, rtx, int); +-static void ix86_add_new_builtins (HOST_WIDE_INT, HOST_WIDE_INT); + static tree ix86_canonical_va_list_type (tree); +-static void predict_jump (int); + static unsigned int split_stack_prologue_scratch_regno (void); + static bool i386_asm_output_addr_const_extra (FILE *, rtx); + +-enum ix86_function_specific_strings +-{ +- IX86_FUNCTION_SPECIFIC_ARCH, +- IX86_FUNCTION_SPECIFIC_TUNE, +- IX86_FUNCTION_SPECIFIC_MAX +-}; +- +-static char *ix86_target_string (HOST_WIDE_INT, HOST_WIDE_INT, int, int, +- const char *, const char *, enum fpmath_unit, +- bool, bool); +-static void ix86_function_specific_save (struct cl_target_option *, +- struct gcc_options *opts); +-static void ix86_function_specific_restore (struct gcc_options *opts, +- struct cl_target_option *); +-static void ix86_function_specific_post_stream_in (struct cl_target_option *); +-static void ix86_function_specific_print (FILE *, int, +- struct cl_target_option *); +-static bool ix86_valid_target_attribute_p (tree, tree, tree, int); +-static bool ix86_valid_target_attribute_inner_p (tree, char *[], +- struct gcc_options *, +- struct gcc_options *, +- struct gcc_options *); + static bool ix86_can_inline_p (tree, tree); +-static void ix86_set_current_function (tree); + static unsigned int ix86_minimum_incoming_stack_boundary (bool); + +-static enum calling_abi ix86_function_abi (const_tree); +- + +-#ifndef SUBTARGET32_DEFAULT_CPU +-#define SUBTARGET32_DEFAULT_CPU "i386" +-#endif +- + /* Whether -mtune= or -march= were specified */ +-static int ix86_tune_defaulted; +-static int ix86_arch_specified; +- +-/* Vectorization library interface and handlers. */ +-static tree (*ix86_veclib_handler) (combined_fn, tree, tree); +- +-static tree ix86_veclibabi_svml (combined_fn, tree, tree); +-static tree ix86_veclibabi_acml (combined_fn, tree, tree); +- +-/* This table must be in sync with enum processor_type in i386.h. */ +-static const struct processor_costs *processor_cost_table[] = +-{ +- &generic_cost, +- &i386_cost, +- &i486_cost, +- &pentium_cost, +- &lakemont_cost, +- &pentiumpro_cost, +- &pentium4_cost, +- &nocona_cost, +- &core_cost, +- &core_cost, +- &core_cost, +- &core_cost, +- &atom_cost, +- &slm_cost, +- &slm_cost, +- &slm_cost, +- &slm_cost, +- &slm_cost, +- &slm_cost, +- &skylake_cost, +- &skylake_cost, +- &skylake_cost, +- &skylake_cost, +- &skylake_cost, +- &skylake_cost, +- &intel_cost, +- &geode_cost, +- &k6_cost, +- &athlon_cost, +- &k8_cost, +- &amdfam10_cost, +- &bdver_cost, +- &bdver_cost, +- &bdver_cost, +- &bdver_cost, +- &btver1_cost, +- &btver2_cost, +- &znver1_cost, +- &znver2_cost +-}; +- +-/* Guarantee that the array is aligned with enum processor_type. 
*/ +-STATIC_ASSERT (ARRAY_SIZE (processor_cost_table) == PROCESSOR_max); ++int ix86_tune_defaulted; ++int ix86_arch_specified; + +-static unsigned int +-rest_of_handle_insert_vzeroupper (void) +-{ +- int i; +- +- /* vzeroupper instructions are inserted immediately after reload to +- account for possible spills from 256bit or 512bit registers. The pass +- reuses mode switching infrastructure by re-running mode insertion +- pass, so disable entities that have already been processed. */ +- for (i = 0; i < MAX_386_ENTITIES; i++) +- ix86_optimize_mode_switching[i] = 0; ++/* Return true if a red-zone is in use. We can't use red-zone when ++ there are local indirect jumps, like "indirect_jump" or "tablejump", ++ which jumps to another place in the function, since "call" in the ++ indirect thunk pushes the return address onto stack, destroying ++ red-zone. + +- ix86_optimize_mode_switching[AVX_U128] = 1; ++ TODO: If we can reserve the first 2 WORDs, for PUSH and, another ++ for CALL, in red-zone, we can allow local indirect jumps with ++ indirect thunk. */ + +- /* Call optimize_mode_switching. */ +- g->get_passes ()->execute_pass_mode_switching (); +- return 0; ++bool ++ix86_using_red_zone (void) ++{ ++ return (TARGET_RED_ZONE ++ && !TARGET_64BIT_MS_ABI ++ && (!cfun->machine->has_local_indirect_jump ++ || cfun->machine->indirect_branch_type == indirect_branch_keep)); + } +- +-/* Return 1 if INSN uses or defines a hard register. +- Hard register uses in a memory address are ignored. +- Clobbers and flags definitions are ignored. */ +- ++ ++/* Return true, if profiling code should be emitted before ++ prologue. Otherwise it returns false. ++ Note: For x86 with "hotfix" it is sorried. */ + static bool +-has_non_address_hard_reg (rtx_insn *insn) ++ix86_profile_before_prologue (void) + { +- df_ref ref; +- FOR_EACH_INSN_DEF (ref, insn) +- if (HARD_REGISTER_P (DF_REF_REAL_REG (ref)) +- && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER) +- && DF_REF_REGNO (ref) != FLAGS_REG) +- return true; +- +- FOR_EACH_INSN_USE (ref, insn) +- if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref))) +- return true; +- +- return false; ++ return flag_fentry != 0; + } + +-/* Check if comparison INSN may be transformed +- into vector comparison. Currently we transform +- zero checks only which look like: +- +- (set (reg:CCZ 17 flags) +- (compare:CCZ (ior:SI (subreg:SI (reg:DI x) 4) +- (subreg:SI (reg:DI x) 0)) +- (const_int 0 [0]))) */ ++/* Update register usage after having seen the compiler flags. */ + +-static bool +-convertible_comparison_p (rtx_insn *insn) ++static void ++ix86_conditional_register_usage (void) + { +- if (!TARGET_SSE4_1) +- return false; ++ int i, c_mask; + +- rtx def_set = single_set (insn); ++ /* If there are no caller-saved registers, preserve all registers. ++ except fixed_regs and registers used for function return value ++ since aggregate_value_p checks call_used_regs[regno] on return ++ value. */ ++ if (cfun && cfun->machine->no_caller_saved_registers) ++ for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) ++ if (!fixed_regs[i] && !ix86_function_value_regno_p (i)) ++ call_used_regs[i] = 0; + +- gcc_assert (def_set); ++ /* For 32-bit targets, disable the REX registers. */ ++ if (! 
TARGET_64BIT) ++ { ++ for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++) ++ CLEAR_HARD_REG_BIT (accessible_reg_set, i); ++ for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++) ++ CLEAR_HARD_REG_BIT (accessible_reg_set, i); ++ for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++) ++ CLEAR_HARD_REG_BIT (accessible_reg_set, i); ++ } + +- rtx src = SET_SRC (def_set); +- rtx dst = SET_DEST (def_set); ++ /* See the definition of CALL_USED_REGISTERS in i386.h. */ ++ c_mask = CALL_USED_REGISTERS_MASK (TARGET_64BIT_MS_ABI); ++ ++ CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]); + +- gcc_assert (GET_CODE (src) == COMPARE); ++ for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) ++ { ++ /* Set/reset conditionally defined registers from ++ CALL_USED_REGISTERS initializer. */ ++ if (call_used_regs[i] > 1) ++ call_used_regs[i] = !!(call_used_regs[i] & c_mask); + +- if (GET_CODE (dst) != REG +- || REGNO (dst) != FLAGS_REG +- || GET_MODE (dst) != CCZmode) +- return false; ++ /* Calculate registers of CLOBBERED_REGS register set ++ as call used registers from GENERAL_REGS register set. */ ++ if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i) ++ && call_used_regs[i]) ++ SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i); ++ } + +- rtx op1 = XEXP (src, 0); +- rtx op2 = XEXP (src, 1); ++ /* If MMX is disabled, disable the registers. */ ++ if (! TARGET_MMX) ++ accessible_reg_set &= ~reg_class_contents[MMX_REGS]; + +- if (op2 != CONST0_RTX (GET_MODE (op2))) +- return false; ++ /* If SSE is disabled, disable the registers. */ ++ if (! TARGET_SSE) ++ accessible_reg_set &= ~reg_class_contents[ALL_SSE_REGS]; + +- if (GET_CODE (op1) != IOR) +- return false; ++ /* If the FPU is disabled, disable the registers. */ ++ if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387)) ++ accessible_reg_set &= ~reg_class_contents[FLOAT_REGS]; + +- op2 = XEXP (op1, 1); +- op1 = XEXP (op1, 0); +- +- if (!SUBREG_P (op1) +- || !SUBREG_P (op2) +- || GET_MODE (op1) != SImode +- || GET_MODE (op2) != SImode +- || ((SUBREG_BYTE (op1) != 0 +- || SUBREG_BYTE (op2) != GET_MODE_SIZE (SImode)) +- && (SUBREG_BYTE (op2) != 0 +- || SUBREG_BYTE (op1) != GET_MODE_SIZE (SImode)))) +- return false; ++ /* If AVX512F is disabled, disable the registers. */ ++ if (! TARGET_AVX512F) ++ { ++ for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++) ++ CLEAR_HARD_REG_BIT (accessible_reg_set, i); + +- op1 = SUBREG_REG (op1); +- op2 = SUBREG_REG (op2); ++ accessible_reg_set &= ~reg_class_contents[ALL_MASK_REGS]; ++ } ++} + +- if (op1 != op2 +- || !REG_P (op1) +- || GET_MODE (op1) != DImode) +- return false; ++/* Canonicalize a comparison from one we don't have to one we do have. */ + +- return true; +-} ++static void ++ix86_canonicalize_comparison (int *code, rtx *op0, rtx *op1, ++ bool op0_preserve_value) ++{ ++ /* The order of operands in x87 ficom compare is forced by combine in ++ simplify_comparison () function. Float operator is treated as RTX_OBJ ++ with a precedence over other operators and is always put in the first ++ place. Swap condition and operands to match ficom instruction. */ ++ if (!op0_preserve_value ++ && GET_CODE (*op0) == FLOAT && MEM_P (XEXP (*op0, 0)) && REG_P (*op1)) ++ { ++ enum rtx_code scode = swap_condition ((enum rtx_code) *code); + +-/* The DImode version of scalar_to_vector_candidate_p. */ ++ /* We are called only for compares that are split to SAHF instruction. ++ Ensure that we have setcc/jcc insn for the swapped condition. 
*/ ++ if (ix86_fp_compare_code_to_integer (scode) != UNKNOWN) ++ { ++ std::swap (*op0, *op1); ++ *code = (int) scode; ++ } ++ } ++} ++ ++ ++/* Hook to determine if one function can safely inline another. */ + + static bool +-dimode_scalar_to_vector_candidate_p (rtx_insn *insn) ++ix86_can_inline_p (tree caller, tree callee) + { +- rtx def_set = single_set (insn); +- +- if (!def_set) +- return false; ++ tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller); ++ tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee); + +- if (has_non_address_hard_reg (insn)) +- return false; ++ /* Changes of those flags can be tolerated for always inlines. Lets hope ++ user knows what he is doing. */ ++ const unsigned HOST_WIDE_INT always_inline_safe_mask ++ = (MASK_USE_8BIT_IDIV | MASK_ACCUMULATE_OUTGOING_ARGS ++ | MASK_NO_ALIGN_STRINGOPS | MASK_AVX256_SPLIT_UNALIGNED_LOAD ++ | MASK_AVX256_SPLIT_UNALIGNED_STORE | MASK_CLD ++ | MASK_NO_FANCY_MATH_387 | MASK_IEEE_FP | MASK_INLINE_ALL_STRINGOPS ++ | MASK_INLINE_STRINGOPS_DYNAMICALLY | MASK_RECIP | MASK_STACK_PROBE ++ | MASK_STV | MASK_TLS_DIRECT_SEG_REFS | MASK_VZEROUPPER ++ | MASK_NO_PUSH_ARGS | MASK_OMIT_LEAF_FRAME_POINTER); + +- rtx src = SET_SRC (def_set); +- rtx dst = SET_DEST (def_set); + +- if (GET_CODE (src) == COMPARE) +- return convertible_comparison_p (insn); ++ if (!callee_tree) ++ callee_tree = target_option_default_node; ++ if (!caller_tree) ++ caller_tree = target_option_default_node; ++ if (callee_tree == caller_tree) ++ return true; + +- /* We are interested in DImode promotion only. */ +- if ((GET_MODE (src) != DImode +- && !CONST_INT_P (src)) +- || GET_MODE (dst) != DImode) +- return false; ++ struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree); ++ struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree); ++ bool ret = false; ++ bool always_inline ++ = (DECL_DISREGARD_INLINE_LIMITS (callee) ++ && lookup_attribute ("always_inline", ++ DECL_ATTRIBUTES (callee))); + +- if (!REG_P (dst) && !MEM_P (dst)) +- return false; +- +- switch (GET_CODE (src)) +- { +- case ASHIFTRT: +- if (!TARGET_AVX512VL) +- return false; +- /* FALLTHRU */ +- +- case ASHIFT: +- case LSHIFTRT: +- if (!CONST_INT_P (XEXP (src, 1)) +- || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, 63)) +- return false; +- break; +- +- case PLUS: +- case MINUS: +- case IOR: +- case XOR: +- case AND: +- if (!REG_P (XEXP (src, 1)) +- && !MEM_P (XEXP (src, 1)) +- && !CONST_INT_P (XEXP (src, 1))) +- return false; +- +- if (GET_MODE (XEXP (src, 1)) != DImode +- && !CONST_INT_P (XEXP (src, 1))) +- return false; +- break; ++ cgraph_node *callee_node = cgraph_node::get (callee); ++ /* Callee's isa options should be a subset of the caller's, i.e. a SSE4 ++ function can inline a SSE2 function but a SSE2 function can't inline ++ a SSE4 function. */ ++ if (((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags) ++ != callee_opts->x_ix86_isa_flags) ++ || ((caller_opts->x_ix86_isa_flags2 & callee_opts->x_ix86_isa_flags2) ++ != callee_opts->x_ix86_isa_flags2)) ++ ret = false; + +- case NEG: +- case NOT: +- break; ++ /* See if we have the same non-isa options. */ ++ else if ((!always_inline ++ && caller_opts->x_target_flags != callee_opts->x_target_flags) ++ || (caller_opts->x_target_flags & ~always_inline_safe_mask) ++ != (callee_opts->x_target_flags & ~always_inline_safe_mask)) ++ ret = false; + +- case REG: +- return true; ++ /* See if arch, tune, etc. are the same. 
*/ ++ else if (caller_opts->arch != callee_opts->arch) ++ ret = false; + +- case MEM: +- case CONST_INT: +- return REG_P (dst); ++ else if (!always_inline && caller_opts->tune != callee_opts->tune) ++ ret = false; + +- default: +- return false; +- } ++ else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath ++ /* If the calle doesn't use FP expressions differences in ++ ix86_fpmath can be ignored. We are called from FEs ++ for multi-versioning call optimization, so beware of ++ ipa_fn_summaries not available. */ ++ && (! ipa_fn_summaries ++ || ipa_fn_summaries->get (callee_node) == NULL ++ || ipa_fn_summaries->get (callee_node)->fp_expressions)) ++ ret = false; + +- if (!REG_P (XEXP (src, 0)) +- && !MEM_P (XEXP (src, 0)) +- && !CONST_INT_P (XEXP (src, 0)) +- /* Check for andnot case. */ +- && (GET_CODE (src) != AND +- || GET_CODE (XEXP (src, 0)) != NOT +- || !REG_P (XEXP (XEXP (src, 0), 0)))) +- return false; ++ else if (!always_inline ++ && caller_opts->branch_cost != callee_opts->branch_cost) ++ ret = false; + +- if (GET_MODE (XEXP (src, 0)) != DImode +- && !CONST_INT_P (XEXP (src, 0))) +- return false; ++ else ++ ret = true; + +- return true; ++ return ret; + } +- +-/* The TImode version of scalar_to_vector_candidate_p. */ ++ ++/* Return true if this goes in large data/bss. */ + + static bool +-timode_scalar_to_vector_candidate_p (rtx_insn *insn) ++ix86_in_large_data_p (tree exp) + { +- rtx def_set = single_set (insn); +- +- if (!def_set) ++ if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC) + return false; + +- if (has_non_address_hard_reg (insn)) ++ if (exp == NULL_TREE) + return false; + +- rtx src = SET_SRC (def_set); +- rtx dst = SET_DEST (def_set); +- +- /* Only TImode load and store are allowed. */ +- if (GET_MODE (dst) != TImode) ++ /* Functions are never large data. */ ++ if (TREE_CODE (exp) == FUNCTION_DECL) + return false; + +- if (MEM_P (dst)) +- { +- /* Check for store. Memory must be aligned or unaligned store +- is optimal. Only support store from register, standard SSE +- constant or CONST_WIDE_INT generated from piecewise store. +- +- ??? Verify performance impact before enabling CONST_INT for +- __int128 store. */ +- if (misaligned_operand (dst, TImode) +- && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL) +- return false; +- +- switch (GET_CODE (src)) +- { +- default: +- return false; +- +- case REG: +- case CONST_WIDE_INT: +- return true; ++ /* Automatic variables are never large data. */ ++ if (VAR_P (exp) && !is_global_var (exp)) ++ return false; + +- case CONST_INT: +- return standard_sse_constant_p (src, TImode); +- } +- } +- else if (MEM_P (src)) ++ if (VAR_P (exp) && DECL_SECTION_NAME (exp)) + { +- /* Check for load. Memory must be aligned or unaligned load is +- optimal. */ +- return (REG_P (dst) +- && (!misaligned_operand (src, TImode) +- || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)); ++ const char *section = DECL_SECTION_NAME (exp); ++ if (strcmp (section, ".ldata") == 0 ++ || strcmp (section, ".lbss") == 0) ++ return true; ++ return false; + } +- +- return false; +-} +- +-/* Return 1 if INSN may be converted into vector +- instruction. */ +- +-static bool +-scalar_to_vector_candidate_p (rtx_insn *insn) +-{ +- if (TARGET_64BIT) +- return timode_scalar_to_vector_candidate_p (insn); + else +- return dimode_scalar_to_vector_candidate_p (insn); +-} +- +-/* The DImode version of remove_non_convertible_regs. 
*/ +- +-static void +-dimode_remove_non_convertible_regs (bitmap candidates) +-{ +- bitmap_iterator bi; +- unsigned id; +- bitmap regs = BITMAP_ALLOC (NULL); +- +- EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi) +- { +- rtx def_set = single_set (DF_INSN_UID_GET (id)->insn); +- rtx reg = SET_DEST (def_set); +- +- if (!REG_P (reg) +- || bitmap_bit_p (regs, REGNO (reg)) +- || HARD_REGISTER_P (reg)) +- continue; +- +- for (df_ref def = DF_REG_DEF_CHAIN (REGNO (reg)); +- def; +- def = DF_REF_NEXT_REG (def)) +- { +- if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def))) +- { +- if (dump_file) +- fprintf (dump_file, +- "r%d has non convertible definition in insn %d\n", +- REGNO (reg), DF_REF_INSN_UID (def)); +- +- bitmap_set_bit (regs, REGNO (reg)); +- break; +- } +- } +- } +- +- EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi) + { +- for (df_ref def = DF_REG_DEF_CHAIN (id); +- def; +- def = DF_REF_NEXT_REG (def)) +- if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def))) +- { +- if (dump_file) +- fprintf (dump_file, "Removing insn %d from candidates list\n", +- DF_REF_INSN_UID (def)); ++ HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp)); + +- bitmap_clear_bit (candidates, DF_REF_INSN_UID (def)); +- } ++ /* If this is an incomplete type with size 0, then we can't put it ++ in data because it might be too big when completed. Also, ++ int_size_in_bytes returns -1 if size can vary or is larger than ++ an integer in which case also it is safer to assume that it goes in ++ large data. */ ++ if (size <= 0 || size > ix86_section_threshold) ++ return true; + } + +- BITMAP_FREE (regs); ++ return false; + } + +-/* For a register REGNO, scan instructions for its defs and uses. +- Put REGNO in REGS if a def or use isn't in CANDIDATES. */ ++/* i386-specific section flag to mark large sections. */ ++#define SECTION_LARGE SECTION_MACH_DEP ++ ++/* Switch to the appropriate section for output of DECL. ++ DECL is either a `VAR_DECL' node or a constant of some sort. ++ RELOC indicates whether forming the initial value of DECL requires ++ link-time relocations. 
*/ + +-static void +-timode_check_non_convertible_regs (bitmap candidates, bitmap regs, +- unsigned int regno) ++ATTRIBUTE_UNUSED static section * ++x86_64_elf_select_section (tree decl, int reloc, ++ unsigned HOST_WIDE_INT align) + { +- for (df_ref def = DF_REG_DEF_CHAIN (regno); +- def; +- def = DF_REF_NEXT_REG (def)) ++ if (ix86_in_large_data_p (decl)) + { +- if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def))) ++ const char *sname = NULL; ++ unsigned int flags = SECTION_WRITE | SECTION_LARGE; ++ switch (categorize_decl_for_section (decl, reloc)) + { +- if (dump_file) +- fprintf (dump_file, +- "r%d has non convertible def in insn %d\n", +- regno, DF_REF_INSN_UID (def)); +- +- bitmap_set_bit (regs, regno); ++ case SECCAT_DATA: ++ sname = ".ldata"; ++ break; ++ case SECCAT_DATA_REL: ++ sname = ".ldata.rel"; ++ break; ++ case SECCAT_DATA_REL_LOCAL: ++ sname = ".ldata.rel.local"; ++ break; ++ case SECCAT_DATA_REL_RO: ++ sname = ".ldata.rel.ro"; ++ break; ++ case SECCAT_DATA_REL_RO_LOCAL: ++ sname = ".ldata.rel.ro.local"; ++ break; ++ case SECCAT_BSS: ++ sname = ".lbss"; ++ flags |= SECTION_BSS; ++ break; ++ case SECCAT_RODATA: ++ case SECCAT_RODATA_MERGE_STR: ++ case SECCAT_RODATA_MERGE_STR_INIT: ++ case SECCAT_RODATA_MERGE_CONST: ++ sname = ".lrodata"; ++ flags &= ~SECTION_WRITE; ++ break; ++ case SECCAT_SRODATA: ++ case SECCAT_SDATA: ++ case SECCAT_SBSS: ++ gcc_unreachable (); ++ case SECCAT_TEXT: ++ case SECCAT_TDATA: ++ case SECCAT_TBSS: ++ /* We don't split these for medium model. Place them into ++ default sections and hope for best. */ + break; + } +- } +- +- for (df_ref ref = DF_REG_USE_CHAIN (regno); +- ref; +- ref = DF_REF_NEXT_REG (ref)) +- { +- /* Debug instructions are skipped. */ +- if (NONDEBUG_INSN_P (DF_REF_INSN (ref)) +- && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref))) ++ if (sname) + { +- if (dump_file) +- fprintf (dump_file, +- "r%d has non convertible use in insn %d\n", +- regno, DF_REF_INSN_UID (ref)); +- +- bitmap_set_bit (regs, regno); +- break; ++ /* We might get called with string constants, but get_named_section ++ doesn't like them as they are not DECLs. Also, we need to set ++ flags in that case. */ ++ if (!DECL_P (decl)) ++ return get_section (sname, flags, NULL); ++ return get_named_section (decl, sname, reloc); + } + } ++ return default_elf_select_section (decl, reloc, align); + } + +-/* The TImode version of remove_non_convertible_regs. */ ++/* Select a set of attributes for section NAME based on the properties ++ of DECL and whether or not RELOC indicates that DECL's initializer ++ might contain runtime relocations. 
*/ + +-static void +-timode_remove_non_convertible_regs (bitmap candidates) ++static unsigned int ATTRIBUTE_UNUSED ++x86_64_elf_section_type_flags (tree decl, const char *name, int reloc) + { +- bitmap_iterator bi; +- unsigned id; +- bitmap regs = BITMAP_ALLOC (NULL); +- +- EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi) +- { +- rtx def_set = single_set (DF_INSN_UID_GET (id)->insn); +- rtx dest = SET_DEST (def_set); +- rtx src = SET_SRC (def_set); +- +- if ((!REG_P (dest) +- || bitmap_bit_p (regs, REGNO (dest)) +- || HARD_REGISTER_P (dest)) +- && (!REG_P (src) +- || bitmap_bit_p (regs, REGNO (src)) +- || HARD_REGISTER_P (src))) +- continue; +- +- if (REG_P (dest)) +- timode_check_non_convertible_regs (candidates, regs, +- REGNO (dest)); +- +- if (REG_P (src)) +- timode_check_non_convertible_regs (candidates, regs, +- REGNO (src)); +- } +- +- EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi) +- { +- for (df_ref def = DF_REG_DEF_CHAIN (id); +- def; +- def = DF_REF_NEXT_REG (def)) +- if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def))) +- { +- if (dump_file) +- fprintf (dump_file, "Removing insn %d from candidates list\n", +- DF_REF_INSN_UID (def)); ++ unsigned int flags = default_section_type_flags (decl, name, reloc); + +- bitmap_clear_bit (candidates, DF_REF_INSN_UID (def)); +- } ++ if (ix86_in_large_data_p (decl)) ++ flags |= SECTION_LARGE; + +- for (df_ref ref = DF_REG_USE_CHAIN (id); +- ref; +- ref = DF_REF_NEXT_REG (ref)) +- if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref))) +- { +- if (dump_file) +- fprintf (dump_file, "Removing insn %d from candidates list\n", +- DF_REF_INSN_UID (ref)); ++ if (decl == NULL_TREE ++ && (strcmp (name, ".ldata.rel.ro") == 0 ++ || strcmp (name, ".ldata.rel.ro.local") == 0)) ++ flags |= SECTION_RELRO; + +- bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref)); +- } +- } ++ if (strcmp (name, ".lbss") == 0 ++ || strncmp (name, ".lbss.", 5) == 0 ++ || strncmp (name, ".gnu.linkonce.lb.", 16) == 0) ++ flags |= SECTION_BSS; + +- BITMAP_FREE (regs); ++ return flags; + } + +-/* For a given bitmap of insn UIDs scans all instruction and +- remove insn from CANDIDATES in case it has both convertible +- and not convertible definitions. +- +- All insns in a bitmap are conversion candidates according to +- scalar_to_vector_candidate_p. Currently it implies all insns +- are single_set. */ +- +-static void +-remove_non_convertible_regs (bitmap candidates) +-{ +- if (TARGET_64BIT) +- timode_remove_non_convertible_regs (candidates); +- else +- dimode_remove_non_convertible_regs (candidates); +-} +- +-class scalar_chain +-{ +- public: +- scalar_chain (); +- virtual ~scalar_chain (); +- +- static unsigned max_id; +- +- /* ID of a chain. */ +- unsigned int chain_id; +- /* A queue of instructions to be included into a chain. */ +- bitmap queue; +- /* Instructions included into a chain. */ +- bitmap insns; +- /* All registers defined by a chain. */ +- bitmap defs; +- /* Registers used in both vector and sclar modes. 
*/ +- bitmap defs_conv; +- +- void build (bitmap candidates, unsigned insn_uid); +- virtual int compute_convert_gain () = 0; +- int convert (); +- +- protected: +- void add_to_queue (unsigned insn_uid); +- void emit_conversion_insns (rtx insns, rtx_insn *pos); +- +- private: +- void add_insn (bitmap candidates, unsigned insn_uid); +- void analyze_register_chain (bitmap candidates, df_ref ref); +- virtual void mark_dual_mode_def (df_ref def) = 0; +- virtual void convert_insn (rtx_insn *insn) = 0; +- virtual void convert_registers () = 0; +-}; +- +-class dimode_scalar_chain : public scalar_chain +-{ +- public: +- int compute_convert_gain (); +- private: +- void mark_dual_mode_def (df_ref def); +- rtx replace_with_subreg (rtx x, rtx reg, rtx subreg); +- void replace_with_subreg_in_insn (rtx_insn *insn, rtx reg, rtx subreg); +- void convert_insn (rtx_insn *insn); +- void convert_op (rtx *op, rtx_insn *insn); +- void convert_reg (unsigned regno); +- void make_vector_copies (unsigned regno); +- void convert_registers (); +- int vector_const_cost (rtx exp); +-}; ++/* Build up a unique section name, expressed as a ++ STRING_CST node, and assign it to DECL_SECTION_NAME (decl). ++ RELOC indicates whether the initial value of EXP requires ++ link-time relocations. */ + +-class timode_scalar_chain : public scalar_chain ++static void ATTRIBUTE_UNUSED ++x86_64_elf_unique_section (tree decl, int reloc) + { +- public: +- /* Convert from TImode to V1TImode is always faster. */ +- int compute_convert_gain () { return 1; } +- +- private: +- void mark_dual_mode_def (df_ref def); +- void fix_debug_reg_uses (rtx reg); +- void convert_insn (rtx_insn *insn); +- /* We don't convert registers to difference size. */ +- void convert_registers () {} +-}; +- +-unsigned scalar_chain::max_id = 0; +- +-/* Initialize new chain. */ ++ if (ix86_in_large_data_p (decl)) ++ { ++ const char *prefix = NULL; ++ /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */ ++ bool one_only = DECL_COMDAT_GROUP (decl) && !HAVE_COMDAT_GROUP; + +-scalar_chain::scalar_chain () +-{ +- chain_id = ++max_id; ++ switch (categorize_decl_for_section (decl, reloc)) ++ { ++ case SECCAT_DATA: ++ case SECCAT_DATA_REL: ++ case SECCAT_DATA_REL_LOCAL: ++ case SECCAT_DATA_REL_RO: ++ case SECCAT_DATA_REL_RO_LOCAL: ++ prefix = one_only ? ".ld" : ".ldata"; ++ break; ++ case SECCAT_BSS: ++ prefix = one_only ? ".lb" : ".lbss"; ++ break; ++ case SECCAT_RODATA: ++ case SECCAT_RODATA_MERGE_STR: ++ case SECCAT_RODATA_MERGE_STR_INIT: ++ case SECCAT_RODATA_MERGE_CONST: ++ prefix = one_only ? ".lr" : ".lrodata"; ++ break; ++ case SECCAT_SRODATA: ++ case SECCAT_SDATA: ++ case SECCAT_SBSS: ++ gcc_unreachable (); ++ case SECCAT_TEXT: ++ case SECCAT_TDATA: ++ case SECCAT_TBSS: ++ /* We don't split these for medium model. Place them into ++ default sections and hope for best. */ ++ break; ++ } ++ if (prefix) ++ { ++ const char *name, *linkonce; ++ char *string; + +- if (dump_file) +- fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id); ++ name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)); ++ name = targetm.strip_name_encoding (name); + +- bitmap_obstack_initialize (NULL); +- insns = BITMAP_ALLOC (NULL); +- defs = BITMAP_ALLOC (NULL); +- defs_conv = BITMAP_ALLOC (NULL); +- queue = NULL; +-} ++ /* If we're using one_only, then there needs to be a .gnu.linkonce ++ prefix to the section name. */ ++ linkonce = one_only ? ".gnu.linkonce" : ""; + +-/* Free chain's data. 
*/ ++ string = ACONCAT ((linkonce, prefix, ".", name, NULL)); + +-scalar_chain::~scalar_chain () +-{ +- BITMAP_FREE (insns); +- BITMAP_FREE (defs); +- BITMAP_FREE (defs_conv); +- bitmap_obstack_release (NULL); ++ set_decl_section_name (decl, string); ++ return; ++ } ++ } ++ default_unique_section (decl, reloc); + } + +-/* Add instruction into chains' queue. */ +- +-void +-scalar_chain::add_to_queue (unsigned insn_uid) +-{ +- if (bitmap_bit_p (insns, insn_uid) +- || bitmap_bit_p (queue, insn_uid)) +- return; ++#ifdef COMMON_ASM_OP + +- if (dump_file) +- fprintf (dump_file, " Adding insn %d into chain's #%d queue\n", +- insn_uid, chain_id); +- bitmap_set_bit (queue, insn_uid); +-} ++#ifndef LARGECOMM_SECTION_ASM_OP ++#define LARGECOMM_SECTION_ASM_OP "\t.largecomm\t" ++#endif + +-/* For DImode conversion, mark register defined by DEF as requiring +- conversion. */ ++/* This says how to output assembler code to declare an ++ uninitialized external linkage data object. + ++ For medium model x86-64 we need to use LARGECOMM_SECTION_ASM_OP opcode for ++ large objects. */ + void +-dimode_scalar_chain::mark_dual_mode_def (df_ref def) ++x86_elf_aligned_decl_common (FILE *file, tree decl, ++ const char *name, unsigned HOST_WIDE_INT size, ++ int align) + { +- gcc_assert (DF_REF_REG_DEF_P (def)); +- +- if (bitmap_bit_p (defs_conv, DF_REF_REGNO (def))) +- return; +- +- if (dump_file) +- fprintf (dump_file, +- " Mark r%d def in insn %d as requiring both modes in chain #%d\n", +- DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id); +- +- bitmap_set_bit (defs_conv, DF_REF_REGNO (def)); ++ if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC) ++ && size > (unsigned int)ix86_section_threshold) ++ { ++ switch_to_section (get_named_section (decl, ".lbss", 0)); ++ fputs (LARGECOMM_SECTION_ASM_OP, file); ++ } ++ else ++ fputs (COMMON_ASM_OP, file); ++ assemble_name (file, name); ++ fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n", ++ size, align / BITS_PER_UNIT); + } ++#endif + +-/* For TImode conversion, it is unused. */ ++/* Utility function for targets to use in implementing ++ ASM_OUTPUT_ALIGNED_BSS. */ + + void +-timode_scalar_chain::mark_dual_mode_def (df_ref) ++x86_output_aligned_bss (FILE *file, tree decl, const char *name, ++ unsigned HOST_WIDE_INT size, int align) + { +- gcc_unreachable (); ++ if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC) ++ && size > (unsigned int)ix86_section_threshold) ++ switch_to_section (get_named_section (decl, ".lbss", 0)); ++ else ++ switch_to_section (bss_section); ++ ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT)); ++#ifdef ASM_DECLARE_OBJECT_NAME ++ last_assemble_variable_decl = decl; ++ ASM_DECLARE_OBJECT_NAME (file, name, decl); ++#else ++ /* Standard thing is just output label for the object. */ ++ ASM_OUTPUT_LABEL (file, name); ++#endif /* ASM_DECLARE_OBJECT_NAME */ ++ ASM_OUTPUT_SKIP (file, size ? size : 1); + } ++ ++/* Decide whether we must probe the stack before any space allocation ++ on this target. It's essentially TARGET_STACK_PROBE except when ++ -fstack-check causes the stack to be already probed differently. */ + +-/* Check REF's chain to add new insns into a queue +- and find registers requiring conversion. 
*/ +- +-void +-scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref) ++bool ++ix86_target_stack_probe (void) + { +- df_link *chain; +- +- gcc_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)) +- || bitmap_bit_p (candidates, DF_REF_INSN_UID (ref))); +- add_to_queue (DF_REF_INSN_UID (ref)); +- +- for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next) +- { +- unsigned uid = DF_REF_INSN_UID (chain->ref); +- +- if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref))) +- continue; +- +- if (!DF_REF_REG_MEM_P (chain->ref)) +- { +- if (bitmap_bit_p (insns, uid)) +- continue; +- +- if (bitmap_bit_p (candidates, uid)) +- { +- add_to_queue (uid); +- continue; +- } +- } ++ /* Do not probe the stack twice if static stack checking is enabled. */ ++ if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK) ++ return false; + +- if (DF_REF_REG_DEF_P (chain->ref)) +- { +- if (dump_file) +- fprintf (dump_file, " r%d def in insn %d isn't convertible\n", +- DF_REF_REGNO (chain->ref), uid); +- mark_dual_mode_def (chain->ref); +- } +- else +- { +- if (dump_file) +- fprintf (dump_file, " r%d use in insn %d isn't convertible\n", +- DF_REF_REGNO (chain->ref), uid); +- mark_dual_mode_def (ref); +- } +- } ++ return TARGET_STACK_PROBE; + } ++ ++/* Decide whether we can make a sibling call to a function. DECL is the ++ declaration of the function being targeted by the call and EXP is the ++ CALL_EXPR representing the call. */ + +-/* Add instruction into a chain. */ +- +-void +-scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid) ++static bool ++ix86_function_ok_for_sibcall (tree decl, tree exp) + { +- if (bitmap_bit_p (insns, insn_uid)) +- return; +- +- if (dump_file) +- fprintf (dump_file, " Adding insn %d to chain #%d\n", insn_uid, chain_id); +- +- bitmap_set_bit (insns, insn_uid); ++ tree type, decl_or_type; ++ rtx a, b; ++ bool bind_global = decl && !targetm.binds_local_p (decl); + +- rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn; +- rtx def_set = single_set (insn); +- if (def_set && REG_P (SET_DEST (def_set)) +- && !HARD_REGISTER_P (SET_DEST (def_set))) +- bitmap_set_bit (defs, REGNO (SET_DEST (def_set))); ++ if (ix86_function_naked (current_function_decl)) ++ return false; + +- df_ref ref; +- df_ref def; +- for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref)) +- if (!HARD_REGISTER_P (DF_REF_REG (ref))) +- for (def = DF_REG_DEF_CHAIN (DF_REF_REGNO (ref)); +- def; +- def = DF_REF_NEXT_REG (def)) +- analyze_register_chain (candidates, def); +- for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref)) +- if (!DF_REF_REG_MEM_P (ref)) +- analyze_register_chain (candidates, ref); +-} ++ /* Sibling call isn't OK if there are no caller-saved registers ++ since all registers must be preserved before return. */ ++ if (cfun->machine->no_caller_saved_registers) ++ return false; + +-/* Build new chain starting from insn INSN_UID recursively +- adding all dependent uses and definitions. */ ++ /* If we are generating position-independent code, we cannot sibcall ++ optimize direct calls to global functions, as the PLT requires ++ %ebx be live. (Darwin does not have a PLT.) */ ++ if (!TARGET_MACHO ++ && !TARGET_64BIT ++ && flag_pic ++ && flag_plt ++ && bind_global) ++ return false; + +-void +-scalar_chain::build (bitmap candidates, unsigned insn_uid) +-{ +- queue = BITMAP_ALLOC (NULL); +- bitmap_set_bit (queue, insn_uid); ++ /* If we need to align the outgoing stack, then sibcalling would ++ unalign the stack, which may break the called function. 
*/ ++ if (ix86_minimum_incoming_stack_boundary (true) ++ < PREFERRED_STACK_BOUNDARY) ++ return false; + +- if (dump_file) +- fprintf (dump_file, "Building chain #%d...\n", chain_id); ++ if (decl) ++ { ++ decl_or_type = decl; ++ type = TREE_TYPE (decl); ++ } ++ else ++ { ++ /* We're looking at the CALL_EXPR, we need the type of the function. */ ++ type = CALL_EXPR_FN (exp); /* pointer expression */ ++ type = TREE_TYPE (type); /* pointer type */ ++ type = TREE_TYPE (type); /* function type */ ++ decl_or_type = type; ++ } + +- while (!bitmap_empty_p (queue)) ++ /* Check that the return value locations are the same. Like ++ if we are returning floats on the 80387 register stack, we cannot ++ make a sibcall from a function that doesn't return a float to a ++ function that does or, conversely, from a function that does return ++ a float to a function that doesn't; the necessary stack adjustment ++ would not be executed. This is also the place we notice ++ differences in the return value ABI. Note that it is ok for one ++ of the functions to have void return type as long as the return ++ value of the other is passed in a register. */ ++ a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false); ++ b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)), ++ cfun->decl, false); ++ if (STACK_REG_P (a) || STACK_REG_P (b)) + { +- insn_uid = bitmap_first_set_bit (queue); +- bitmap_clear_bit (queue, insn_uid); +- bitmap_clear_bit (candidates, insn_uid); +- add_insn (candidates, insn_uid); ++ if (!rtx_equal_p (a, b)) ++ return false; + } ++ else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl)))) ++ ; ++ else if (!rtx_equal_p (a, b)) ++ return false; + +- if (dump_file) ++ if (TARGET_64BIT) ++ { ++ /* The SYSV ABI has more call-clobbered registers; ++ disallow sibcalls from MS to SYSV. */ ++ if (cfun->machine->call_abi == MS_ABI ++ && ix86_function_type_abi (type) == SYSV_ABI) ++ return false; ++ } ++ else + { +- fprintf (dump_file, "Collected chain #%d...\n", chain_id); +- fprintf (dump_file, " insns: "); +- dump_bitmap (dump_file, insns); +- if (!bitmap_empty_p (defs_conv)) ++ /* If this call is indirect, we'll need to be able to use a ++ call-clobbered register for the address of the target function. ++ Make sure that all such registers are not used for passing ++ parameters. Note that DLLIMPORT functions and call to global ++ function via GOT slot are indirect. */ ++ if (!decl ++ || (bind_global && flag_pic && !flag_plt) ++ || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)) ++ || flag_force_indirect_call) + { +- bitmap_iterator bi; +- unsigned id; +- const char *comma = ""; +- fprintf (dump_file, " defs to convert: "); +- EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi) +- { +- fprintf (dump_file, "%sr%d", comma, id); +- comma = ", "; +- } +- fprintf (dump_file, "\n"); ++ /* Check if regparm >= 3 since arg_reg_available is set to ++ false if regparm == 0. If regparm is 1 or 2, there is ++ always a call-clobbered register available. ++ ++ ??? The symbol indirect call doesn't need a call-clobbered ++ register. But we don't know if this is a symbol indirect ++ call or not here. */ ++ if (ix86_function_regparm (type, decl) >= 3 ++ && !cfun->machine->arg_reg_available) ++ return false; + } + } + +- BITMAP_FREE (queue); ++ /* Otherwise okay. That also includes certain types of indirect calls. */ ++ return true; + } + +-/* Return a cost of building a vector costant +- instead of using a scalar one. */ ++/* This function determines from TYPE the calling-convention. 
*/ + +-int +-dimode_scalar_chain::vector_const_cost (rtx exp) ++unsigned int ++ix86_get_callcvt (const_tree type) + { +- gcc_assert (CONST_INT_P (exp)); ++ unsigned int ret = 0; ++ bool is_stdarg; ++ tree attrs; + +- if (standard_sse_constant_p (exp, V2DImode)) +- return COSTS_N_INSNS (1); +- return ix86_cost->sse_load[1]; +-} ++ if (TARGET_64BIT) ++ return IX86_CALLCVT_CDECL; + +-/* Compute a gain for chain conversion. */ ++ attrs = TYPE_ATTRIBUTES (type); ++ if (attrs != NULL_TREE) ++ { ++ if (lookup_attribute ("cdecl", attrs)) ++ ret |= IX86_CALLCVT_CDECL; ++ else if (lookup_attribute ("stdcall", attrs)) ++ ret |= IX86_CALLCVT_STDCALL; ++ else if (lookup_attribute ("fastcall", attrs)) ++ ret |= IX86_CALLCVT_FASTCALL; ++ else if (lookup_attribute ("thiscall", attrs)) ++ ret |= IX86_CALLCVT_THISCALL; + +-int +-dimode_scalar_chain::compute_convert_gain () +-{ +- bitmap_iterator bi; +- unsigned insn_uid; +- int gain = 0; +- int cost = 0; +- +- if (dump_file) +- fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id); +- +- EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi) +- { +- rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn; +- rtx def_set = single_set (insn); +- rtx src = SET_SRC (def_set); +- rtx dst = SET_DEST (def_set); +- +- if (REG_P (src) && REG_P (dst)) +- gain += COSTS_N_INSNS (2) - ix86_cost->xmm_move; +- else if (REG_P (src) && MEM_P (dst)) +- gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1]; +- else if (MEM_P (src) && REG_P (dst)) +- gain += 2 * ix86_cost->int_load[2] - ix86_cost->sse_load[1]; +- else if (GET_CODE (src) == ASHIFT +- || GET_CODE (src) == ASHIFTRT +- || GET_CODE (src) == LSHIFTRT) +- { +- if (CONST_INT_P (XEXP (src, 0))) +- gain -= vector_const_cost (XEXP (src, 0)); +- +- gain += ix86_cost->shift_const; +- if (INTVAL (XEXP (src, 1)) >= 32) +- gain -= COSTS_N_INSNS (1); +- } +- else if (GET_CODE (src) == PLUS +- || GET_CODE (src) == MINUS +- || GET_CODE (src) == IOR +- || GET_CODE (src) == XOR +- || GET_CODE (src) == AND) +- { +- gain += ix86_cost->add; +- /* Additional gain for andnot for targets without BMI. */ +- if (GET_CODE (XEXP (src, 0)) == NOT +- && !TARGET_BMI) +- gain += 2 * ix86_cost->add; +- +- if (CONST_INT_P (XEXP (src, 0))) +- gain -= vector_const_cost (XEXP (src, 0)); +- if (CONST_INT_P (XEXP (src, 1))) +- gain -= vector_const_cost (XEXP (src, 1)); +- } +- else if (GET_CODE (src) == NEG +- || GET_CODE (src) == NOT) +- gain += ix86_cost->add - COSTS_N_INSNS (1); +- else if (GET_CODE (src) == COMPARE) +- { +- /* Assume comparison cost is the same. */ +- } +- else if (CONST_INT_P (src)) +- { +- if (REG_P (dst)) +- gain += COSTS_N_INSNS (2); +- else if (MEM_P (dst)) +- gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1]; +- gain -= vector_const_cost (src); +- } +- else +- gcc_unreachable (); +- } +- +- if (dump_file) +- fprintf (dump_file, " Instruction conversion gain: %d\n", gain); +- +- EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, insn_uid, bi) +- cost += DF_REG_DEF_COUNT (insn_uid) * ix86_cost->mmxsse_to_integer; ++ /* Regparam isn't allowed for thiscall and fastcall. 
*/ ++ if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0) ++ { ++ if (lookup_attribute ("regparm", attrs)) ++ ret |= IX86_CALLCVT_REGPARM; ++ if (lookup_attribute ("sseregparm", attrs)) ++ ret |= IX86_CALLCVT_SSEREGPARM; ++ } + +- if (dump_file) +- fprintf (dump_file, " Registers conversion cost: %d\n", cost); ++ if (IX86_BASE_CALLCVT(ret) != 0) ++ return ret; ++ } + +- gain -= cost; ++ is_stdarg = stdarg_p (type); ++ if (TARGET_RTD && !is_stdarg) ++ return IX86_CALLCVT_STDCALL | ret; + +- if (dump_file) +- fprintf (dump_file, " Total gain: %d\n", gain); ++ if (ret != 0 ++ || is_stdarg ++ || TREE_CODE (type) != METHOD_TYPE ++ || ix86_function_type_abi (type) != MS_ABI) ++ return IX86_CALLCVT_CDECL | ret; + +- return gain; ++ return IX86_CALLCVT_THISCALL; + } + +-/* Replace REG in X with a V2DI subreg of NEW_REG. */ ++/* Return 0 if the attributes for two types are incompatible, 1 if they ++ are compatible, and 2 if they are nearly compatible (which causes a ++ warning to be generated). */ + +-rtx +-dimode_scalar_chain::replace_with_subreg (rtx x, rtx reg, rtx new_reg) ++static int ++ix86_comp_type_attributes (const_tree type1, const_tree type2) + { +- if (x == reg) +- return gen_rtx_SUBREG (V2DImode, new_reg, 0); ++ unsigned int ccvt1, ccvt2; + +- const char *fmt = GET_RTX_FORMAT (GET_CODE (x)); +- int i, j; +- for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--) +- { +- if (fmt[i] == 'e') +- XEXP (x, i) = replace_with_subreg (XEXP (x, i), reg, new_reg); +- else if (fmt[i] == 'E') +- for (j = XVECLEN (x, i) - 1; j >= 0; j--) +- XVECEXP (x, i, j) = replace_with_subreg (XVECEXP (x, i, j), +- reg, new_reg); +- } ++ if (TREE_CODE (type1) != FUNCTION_TYPE ++ && TREE_CODE (type1) != METHOD_TYPE) ++ return 1; + +- return x; +-} ++ ccvt1 = ix86_get_callcvt (type1); ++ ccvt2 = ix86_get_callcvt (type2); ++ if (ccvt1 != ccvt2) ++ return 0; ++ if (ix86_function_regparm (type1, NULL) ++ != ix86_function_regparm (type2, NULL)) ++ return 0; + +-/* Replace REG in INSN with a V2DI subreg of NEW_REG. */ ++ return 1; ++} ++ ++/* Return the regparm value for a function with the indicated TYPE and DECL. ++ DECL may be NULL when calling function indirectly ++ or considering a libcall. */ + +-void +-dimode_scalar_chain::replace_with_subreg_in_insn (rtx_insn *insn, +- rtx reg, rtx new_reg) ++static int ++ix86_function_regparm (const_tree type, const_tree decl) + { +- replace_with_subreg (single_set (insn), reg, new_reg); +-} ++ tree attr; ++ int regparm; ++ unsigned int ccvt; + +-/* Insert generated conversion instruction sequence INSNS +- after instruction AFTER. New BB may be required in case +- instruction has EH region attached. */ ++ if (TARGET_64BIT) ++ return (ix86_function_type_abi (type) == SYSV_ABI ++ ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX); ++ ccvt = ix86_get_callcvt (type); ++ regparm = ix86_regparm; + +-void +-scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after) +-{ +- if (!control_flow_insn_p (after)) ++ if ((ccvt & IX86_CALLCVT_REGPARM) != 0) + { +- emit_insn_after (insns, after); +- return; ++ attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type)); ++ if (attr) ++ { ++ regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))); ++ return regparm; ++ } + } ++ else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0) ++ return 2; ++ else if ((ccvt & IX86_CALLCVT_THISCALL) != 0) ++ return 1; + +- basic_block bb = BLOCK_FOR_INSN (after); +- edge e = find_fallthru_edge (bb->succs); +- gcc_assert (e); ++ /* Use register calling convention for local functions when possible. 
*/ ++ if (decl ++ && TREE_CODE (decl) == FUNCTION_DECL) ++ { ++ cgraph_node *target = cgraph_node::get (decl); ++ if (target) ++ target = target->function_symbol (); + +- basic_block new_bb = split_edge (e); +- emit_insn_after (insns, BB_HEAD (new_bb)); +-} ++ /* Caller and callee must agree on the calling convention, so ++ checking here just optimize means that with ++ __attribute__((optimize (...))) caller could use regparm convention ++ and callee not, or vice versa. Instead look at whether the callee ++ is optimized or not. */ ++ if (target && opt_for_fn (target->decl, optimize) ++ && !(profile_flag && !flag_fentry)) ++ { ++ cgraph_local_info *i = &target->local; ++ if (i && i->local && i->can_change_signature) ++ { ++ int local_regparm, globals = 0, regno; + +-/* Make vector copies for all register REGNO definitions +- and replace its uses in a chain. */ ++ /* Make sure no regparm register is taken by a ++ fixed register variable. */ ++ for (local_regparm = 0; local_regparm < REGPARM_MAX; ++ local_regparm++) ++ if (fixed_regs[local_regparm]) ++ break; + +-void +-dimode_scalar_chain::make_vector_copies (unsigned regno) +-{ +- rtx reg = regno_reg_rtx[regno]; +- rtx vreg = gen_reg_rtx (DImode); +- df_ref ref; ++ /* We don't want to use regparm(3) for nested functions as ++ these use a static chain pointer in the third argument. */ ++ if (local_regparm == 3 && DECL_STATIC_CHAIN (target->decl)) ++ local_regparm = 2; + +- for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref)) +- if (!bitmap_bit_p (insns, DF_REF_INSN_UID (ref))) +- { +- start_sequence (); ++ /* Save a register for the split stack. */ ++ if (flag_split_stack) ++ { ++ if (local_regparm == 3) ++ local_regparm = 2; ++ else if (local_regparm == 2 ++ && DECL_STATIC_CHAIN (target->decl)) ++ local_regparm = 1; ++ } + +- if (!TARGET_INTER_UNIT_MOVES_TO_VEC) +- { +- rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP); +- emit_move_insn (adjust_address (tmp, SImode, 0), +- gen_rtx_SUBREG (SImode, reg, 0)); +- emit_move_insn (adjust_address (tmp, SImode, 4), +- gen_rtx_SUBREG (SImode, reg, 4)); +- emit_move_insn (vreg, tmp); +- } +- else if (TARGET_SSE4_1) +- { +- emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0), +- CONST0_RTX (V4SImode), +- gen_rtx_SUBREG (SImode, reg, 0))); +- emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0), +- gen_rtx_SUBREG (V4SImode, vreg, 0), +- gen_rtx_SUBREG (SImode, reg, 4), +- GEN_INT (2))); +- } +- else +- { +- rtx tmp = gen_reg_rtx (DImode); +- emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0), +- CONST0_RTX (V4SImode), +- gen_rtx_SUBREG (SImode, reg, 0))); +- emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0), +- CONST0_RTX (V4SImode), +- gen_rtx_SUBREG (SImode, reg, 4))); +- emit_insn (gen_vec_interleave_lowv4si +- (gen_rtx_SUBREG (V4SImode, vreg, 0), +- gen_rtx_SUBREG (V4SImode, vreg, 0), +- gen_rtx_SUBREG (V4SImode, tmp, 0))); +- } +- rtx_insn *seq = get_insns (); +- end_sequence (); +- rtx_insn *insn = DF_REF_INSN (ref); +- emit_conversion_insns (seq, insn); +- +- if (dump_file) +- fprintf (dump_file, +- " Copied r%d to a vector register r%d for insn %d\n", +- regno, REGNO (vreg), INSN_UID (insn)); +- } ++ /* Each fixed register usage increases register pressure, ++ so less registers should be used for argument passing. ++ This functionality can be overriden by an explicit ++ regparm value. 
*/ ++ for (regno = AX_REG; regno <= DI_REG; regno++) ++ if (fixed_regs[regno]) ++ globals++; + +- for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref)) +- if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))) +- { +- rtx_insn *insn = DF_REF_INSN (ref); ++ local_regparm ++ = globals < local_regparm ? local_regparm - globals : 0; + +- replace_with_subreg_in_insn (insn, reg, vreg); ++ if (local_regparm > regparm) ++ regparm = local_regparm; ++ } ++ } ++ } + +- if (dump_file) +- fprintf (dump_file, " Replaced r%d with r%d in insn %d\n", +- regno, REGNO (vreg), INSN_UID (insn)); +- } ++ return regparm; + } + +-/* Convert all definitions of register REGNO +- and fix its uses. Scalar copies may be created +- in case register is used in not convertible insn. */ ++/* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and ++ DFmode (2) arguments in SSE registers for a function with the ++ indicated TYPE and DECL. DECL may be NULL when calling function ++ indirectly or considering a libcall. Return -1 if any FP parameter ++ should be rejected by error. This is used in siutation we imply SSE ++ calling convetion but the function is called from another function with ++ SSE disabled. Otherwise return 0. */ + +-void +-dimode_scalar_chain::convert_reg (unsigned regno) ++static int ++ix86_function_sseregparm (const_tree type, const_tree decl, bool warn) + { +- bool scalar_copy = bitmap_bit_p (defs_conv, regno); +- rtx reg = regno_reg_rtx[regno]; +- rtx scopy = NULL_RTX; +- df_ref ref; +- bitmap conv; +- +- conv = BITMAP_ALLOC (NULL); +- bitmap_copy (conv, insns); +- +- if (scalar_copy) +- scopy = gen_reg_rtx (DImode); ++ gcc_assert (!TARGET_64BIT); + +- for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref)) ++ /* Use SSE registers to pass SFmode and DFmode arguments if requested ++ by the sseregparm attribute. 
*/ ++ if (TARGET_SSEREGPARM ++ || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type)))) + { +- rtx_insn *insn = DF_REF_INSN (ref); +- rtx def_set = single_set (insn); +- rtx src = SET_SRC (def_set); +- rtx reg = DF_REF_REG (ref); +- +- if (!MEM_P (src)) +- { +- replace_with_subreg_in_insn (insn, reg, reg); +- bitmap_clear_bit (conv, INSN_UID (insn)); +- } +- +- if (scalar_copy) ++ if (!TARGET_SSE) + { +- start_sequence (); +- if (!TARGET_INTER_UNIT_MOVES_FROM_VEC) +- { +- rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP); +- emit_move_insn (tmp, reg); +- emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0), +- adjust_address (tmp, SImode, 0)); +- emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4), +- adjust_address (tmp, SImode, 4)); +- } +- else if (TARGET_SSE4_1) +- { +- rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx)); +- emit_insn +- (gen_rtx_SET +- (gen_rtx_SUBREG (SImode, scopy, 0), +- gen_rtx_VEC_SELECT (SImode, +- gen_rtx_SUBREG (V4SImode, reg, 0), tmp))); +- +- tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx)); +- emit_insn +- (gen_rtx_SET +- (gen_rtx_SUBREG (SImode, scopy, 4), +- gen_rtx_VEC_SELECT (SImode, +- gen_rtx_SUBREG (V4SImode, reg, 0), tmp))); +- } +- else ++ if (warn) + { +- rtx vcopy = gen_reg_rtx (V2DImode); +- emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, reg, 0)); +- emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0), +- gen_rtx_SUBREG (SImode, vcopy, 0)); +- emit_move_insn (vcopy, +- gen_rtx_LSHIFTRT (V2DImode, vcopy, GEN_INT (32))); +- emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4), +- gen_rtx_SUBREG (SImode, vcopy, 0)); ++ if (decl) ++ error ("calling %qD with attribute sseregparm without " ++ "SSE/SSE2 enabled", decl); ++ else ++ error ("calling %qT with attribute sseregparm without " ++ "SSE/SSE2 enabled", type); + } +- rtx_insn *seq = get_insns (); +- end_sequence (); +- emit_conversion_insns (seq, insn); +- +- if (dump_file) +- fprintf (dump_file, +- " Copied r%d to a scalar register r%d for insn %d\n", +- regno, REGNO (scopy), INSN_UID (insn)); ++ return 0; + } ++ ++ return 2; + } + +- for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref)) +- if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))) +- { +- if (bitmap_bit_p (conv, DF_REF_INSN_UID (ref))) +- { +- rtx_insn *insn = DF_REF_INSN (ref); ++ if (!decl) ++ return 0; + +- rtx def_set = single_set (insn); +- gcc_assert (def_set); ++ cgraph_node *target = cgraph_node::get (decl); ++ if (target) ++ target = target->function_symbol (); + +- rtx src = SET_SRC (def_set); +- rtx dst = SET_DEST (def_set); ++ /* For local functions, pass up to SSE_REGPARM_MAX SFmode ++ (and DFmode for SSE2) arguments in SSE registers. */ ++ if (target ++ /* TARGET_SSE_MATH */ ++ && (target_opts_for_fn (target->decl)->x_ix86_fpmath & FPMATH_SSE) ++ && opt_for_fn (target->decl, optimize) ++ && !(profile_flag && !flag_fentry)) ++ { ++ cgraph_local_info *i = &target->local; ++ if (i && i->local && i->can_change_signature) ++ { ++ /* Refuse to produce wrong code when local function with SSE enabled ++ is called from SSE disabled function. ++ FIXME: We need a way to detect these cases cross-ltrans partition ++ and avoid using SSE calling conventions on local functions called ++ from function with SSE disabled. For now at least delay the ++ warning until we know we are going to produce wrong code. ++ See PR66047 */ ++ if (!TARGET_SSE && warn) ++ return -1; ++ return TARGET_SSE2_P (target_opts_for_fn (target->decl) ++ ->x_ix86_isa_flags) ? 
2 : 1; ++ } ++ } + +- if (!MEM_P (dst) || !REG_P (src)) +- replace_with_subreg_in_insn (insn, reg, reg); ++ return 0; ++} + +- bitmap_clear_bit (conv, INSN_UID (insn)); +- } +- } +- /* Skip debug insns and uninitialized uses. */ +- else if (DF_REF_CHAIN (ref) +- && NONDEBUG_INSN_P (DF_REF_INSN (ref))) +- { +- gcc_assert (scopy); +- replace_rtx (DF_REF_INSN (ref), reg, scopy); +- df_insn_rescan (DF_REF_INSN (ref)); +- } ++/* Return true if EAX is live at the start of the function. Used by ++ ix86_expand_prologue to determine if we need special help before ++ calling allocate_stack_worker. */ + +- BITMAP_FREE (conv); ++static bool ++ix86_eax_live_at_start_p (void) ++{ ++ /* Cheat. Don't bother working forward from ix86_function_regparm ++ to the function type to whether an actual argument is located in ++ eax. Instead just look at cfg info, which is still close enough ++ to correct at this point. This gives false positives for broken ++ functions that might use uninitialized data that happens to be ++ allocated in eax, but who cares? */ ++ return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun)), 0); + } + +-/* Convert operand OP in INSN. We should handle +- memory operands and uninitialized registers. +- All other register uses are converted during +- registers conversion. */ +- +-void +-dimode_scalar_chain::convert_op (rtx *op, rtx_insn *insn) ++static bool ++ix86_keep_aggregate_return_pointer (tree fntype) + { +- *op = copy_rtx_if_shared (*op); ++ tree attr; + +- if (GET_CODE (*op) == NOT) +- { +- convert_op (&XEXP (*op, 0), insn); +- PUT_MODE (*op, V2DImode); +- } +- else if (MEM_P (*op)) ++ if (!TARGET_64BIT) + { +- rtx tmp = gen_reg_rtx (DImode); +- +- emit_insn_before (gen_move_insn (tmp, *op), insn); +- *op = gen_rtx_SUBREG (V2DImode, tmp, 0); ++ attr = lookup_attribute ("callee_pop_aggregate_return", ++ TYPE_ATTRIBUTES (fntype)); ++ if (attr) ++ return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0); + +- if (dump_file) +- fprintf (dump_file, " Preloading operand for insn %d into r%d\n", +- INSN_UID (insn), REGNO (tmp)); +- } +- else if (REG_P (*op)) +- { +- /* We may have not converted register usage in case +- this register has no definition. Otherwise it +- should be converted in convert_reg. */ +- df_ref ref; +- FOR_EACH_INSN_USE (ref, insn) +- if (DF_REF_REGNO (ref) == REGNO (*op)) +- { +- gcc_assert (!DF_REF_CHAIN (ref)); +- break; +- } +- *op = gen_rtx_SUBREG (V2DImode, *op, 0); +- } +- else if (CONST_INT_P (*op)) +- { +- rtx vec_cst; +- rtx tmp = gen_rtx_SUBREG (V2DImode, gen_reg_rtx (DImode), 0); +- +- /* Prefer all ones vector in case of -1. */ +- if (constm1_operand (*op, GET_MODE (*op))) +- vec_cst = CONSTM1_RTX (V2DImode); +- else +- vec_cst = gen_rtx_CONST_VECTOR (V2DImode, +- gen_rtvec (2, *op, const0_rtx)); +- +- if (!standard_sse_constant_p (vec_cst, V2DImode)) +- { +- start_sequence (); +- vec_cst = validize_mem (force_const_mem (V2DImode, vec_cst)); +- rtx_insn *seq = get_insns (); +- end_sequence (); +- emit_insn_before (seq, insn); +- } +- +- emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn); +- *op = tmp; +- } +- else +- { +- gcc_assert (SUBREG_P (*op)); +- gcc_assert (GET_MODE (*op) == V2DImode); ++ /* For 32-bit MS-ABI the default is to keep aggregate ++ return pointer. */ ++ if (ix86_function_type_abi (fntype) == MS_ABI) ++ return true; + } ++ return KEEP_AGGREGATE_RETURN_POINTER != 0; + } + +-/* Convert INSN to vector mode. 
*/ +- +-void +-dimode_scalar_chain::convert_insn (rtx_insn *insn) +-{ +- rtx def_set = single_set (insn); +- rtx src = SET_SRC (def_set); +- rtx dst = SET_DEST (def_set); +- rtx subreg; ++/* Value is the number of bytes of arguments automatically ++ popped when returning from a subroutine call. ++ FUNDECL is the declaration node of the function (as a tree), ++ FUNTYPE is the data type of the function (as a tree), ++ or for a library call it is an identifier node for the subroutine name. ++ SIZE is the number of bytes of arguments passed on the stack. + +- if (MEM_P (dst) && !REG_P (src)) +- { +- /* There are no scalar integer instructions and therefore +- temporary register usage is required. */ +- rtx tmp = gen_reg_rtx (DImode); +- emit_conversion_insns (gen_move_insn (dst, tmp), insn); +- dst = gen_rtx_SUBREG (V2DImode, tmp, 0); +- } ++ On the 80386, the RTD insn may be used to pop them if the number ++ of args is fixed, but if the number is variable then the caller ++ must pop them all. RTD can't be used for library calls now ++ because the library is compiled with the Unix compiler. ++ Use of RTD is a selectable option, since it is incompatible with ++ standard Unix calling sequences. If the option is not selected, ++ the caller must always pop the args. + +- switch (GET_CODE (src)) +- { +- case ASHIFT: +- case ASHIFTRT: +- case LSHIFTRT: +- convert_op (&XEXP (src, 0), insn); +- PUT_MODE (src, V2DImode); +- break; ++ The attribute stdcall is equivalent to RTD on a per module basis. */ + +- case PLUS: +- case MINUS: +- case IOR: +- case XOR: +- case AND: +- convert_op (&XEXP (src, 0), insn); +- convert_op (&XEXP (src, 1), insn); +- PUT_MODE (src, V2DImode); +- break; ++static poly_int64 ++ix86_return_pops_args (tree fundecl, tree funtype, poly_int64 size) ++{ ++ unsigned int ccvt; + +- case NEG: +- src = XEXP (src, 0); +- convert_op (&src, insn); +- subreg = gen_reg_rtx (V2DImode); +- emit_insn_before (gen_move_insn (subreg, CONST0_RTX (V2DImode)), insn); +- src = gen_rtx_MINUS (V2DImode, subreg, src); +- break; ++ /* None of the 64-bit ABIs pop arguments. */ ++ if (TARGET_64BIT) ++ return 0; + +- case NOT: +- src = XEXP (src, 0); +- convert_op (&src, insn); +- subreg = gen_reg_rtx (V2DImode); +- emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (V2DImode)), insn); +- src = gen_rtx_XOR (V2DImode, src, subreg); +- break; ++ ccvt = ix86_get_callcvt (funtype); + +- case MEM: +- if (!REG_P (dst)) +- convert_op (&src, insn); +- break; ++ if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL ++ | IX86_CALLCVT_THISCALL)) != 0 ++ && ! stdarg_p (funtype)) ++ return size; + +- case REG: +- if (!MEM_P (dst)) +- convert_op (&src, insn); +- break; ++ /* Lose any fake structure return argument if it is passed on the stack. */ ++ if (aggregate_value_p (TREE_TYPE (funtype), fundecl) ++ && !ix86_keep_aggregate_return_pointer (funtype)) ++ { ++ int nregs = ix86_function_regparm (funtype, fundecl); ++ if (nregs == 0) ++ return GET_MODE_SIZE (Pmode); ++ } + +- case SUBREG: +- gcc_assert (GET_MODE (src) == V2DImode); +- break; ++ return 0; ++} + +- case COMPARE: +- src = SUBREG_REG (XEXP (XEXP (src, 0), 0)); ++/* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. 
*/ + +- gcc_assert ((REG_P (src) && GET_MODE (src) == DImode) +- || (SUBREG_P (src) && GET_MODE (src) == V2DImode)); ++static bool ++ix86_legitimate_combined_insn (rtx_insn *insn) ++{ ++ int i; + +- if (REG_P (src)) +- subreg = gen_rtx_SUBREG (V2DImode, src, 0); +- else +- subreg = copy_rtx_if_shared (src); +- emit_insn_before (gen_vec_interleave_lowv2di (copy_rtx_if_shared (subreg), +- copy_rtx_if_shared (subreg), +- copy_rtx_if_shared (subreg)), +- insn); +- dst = gen_rtx_REG (CCmode, FLAGS_REG); +- src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (src), +- copy_rtx_if_shared (src)), +- UNSPEC_PTEST); +- break; ++ /* Check operand constraints in case hard registers were propagated ++ into insn pattern. This check prevents combine pass from ++ generating insn patterns with invalid hard register operands. ++ These invalid insns can eventually confuse reload to error out ++ with a spill failure. See also PRs 46829 and 46843. */ + +- case CONST_INT: +- convert_op (&src, insn); +- break; ++ gcc_assert (INSN_CODE (insn) >= 0); + +- default: +- gcc_unreachable (); +- } ++ extract_insn (insn); ++ preprocess_constraints (insn); + +- SET_SRC (def_set) = src; +- SET_DEST (def_set) = dst; ++ int n_operands = recog_data.n_operands; ++ int n_alternatives = recog_data.n_alternatives; ++ for (i = 0; i < n_operands; i++) ++ { ++ rtx op = recog_data.operand[i]; ++ machine_mode mode = GET_MODE (op); ++ const operand_alternative *op_alt; ++ int offset = 0; ++ bool win; ++ int j; + +- /* Drop possible dead definitions. */ +- PATTERN (insn) = def_set; ++ /* A unary operator may be accepted by the predicate, but it ++ is irrelevant for matching constraints. */ ++ if (UNARY_P (op)) ++ op = XEXP (op, 0); + +- INSN_CODE (insn) = -1; +- recog_memoized (insn); +- df_insn_rescan (insn); +-} ++ if (SUBREG_P (op)) ++ { ++ if (REG_P (SUBREG_REG (op)) ++ && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER) ++ offset = subreg_regno_offset (REGNO (SUBREG_REG (op)), ++ GET_MODE (SUBREG_REG (op)), ++ SUBREG_BYTE (op), ++ GET_MODE (op)); ++ op = SUBREG_REG (op); ++ } + +-/* Fix uses of converted REG in debug insns. */ ++ if (!(REG_P (op) && HARD_REGISTER_P (op))) ++ continue; + +-void +-timode_scalar_chain::fix_debug_reg_uses (rtx reg) +-{ +- if (!flag_var_tracking) +- return; ++ op_alt = recog_op_alt; + +- df_ref ref, next; +- for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next) +- { +- rtx_insn *insn = DF_REF_INSN (ref); +- /* Make sure the next ref is for a different instruction, +- so that we're not affected by the rescan. */ +- next = DF_REF_NEXT_REG (ref); +- while (next && DF_REF_INSN (next) == insn) +- next = DF_REF_NEXT_REG (next); ++ /* Operand has no constraints, anything is OK. */ ++ win = !n_alternatives; + +- if (DEBUG_INSN_P (insn)) ++ alternative_mask preferred = get_preferred_alternatives (insn); ++ for (j = 0; j < n_alternatives; j++, op_alt += n_operands) + { +- /* It may be a debug insn with a TImode variable in +- register. 
*/ +- bool changed = false; +- for (; ref != next; ref = DF_REF_NEXT_REG (ref)) ++ if (!TEST_BIT (preferred, j)) ++ continue; ++ if (op_alt[i].anything_ok ++ || (op_alt[i].matches != -1 ++ && operands_match_p ++ (recog_data.operand[i], ++ recog_data.operand[op_alt[i].matches])) ++ || reg_fits_class_p (op, op_alt[i].cl, offset, mode)) + { +- rtx *loc = DF_REF_LOC (ref); +- if (REG_P (*loc) && GET_MODE (*loc) == V1TImode) +- { +- *loc = gen_rtx_SUBREG (TImode, *loc, 0); +- changed = true; +- } ++ win = true; ++ break; + } +- if (changed) +- df_insn_rescan (insn); + } ++ ++ if (!win) ++ return false; + } ++ ++ return true; + } ++ ++/* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */ + +-/* Convert INSN from TImode to V1T1mode. */ ++static unsigned HOST_WIDE_INT ++ix86_asan_shadow_offset (void) ++{ ++ return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44) ++ : HOST_WIDE_INT_C (0x7fff8000)) ++ : (HOST_WIDE_INT_1 << 29); ++} ++ ++/* Argument support functions. */ + +-void +-timode_scalar_chain::convert_insn (rtx_insn *insn) ++/* Return true when register may be used to pass function parameters. */ ++bool ++ix86_function_arg_regno_p (int regno) + { +- rtx def_set = single_set (insn); +- rtx src = SET_SRC (def_set); +- rtx dst = SET_DEST (def_set); ++ int i; ++ enum calling_abi call_abi; ++ const int *parm_regs; + +- switch (GET_CODE (dst)) ++ if (!TARGET_64BIT) + { +- case REG: +- { +- rtx tmp = find_reg_equal_equiv_note (insn); +- if (tmp) +- PUT_MODE (XEXP (tmp, 0), V1TImode); +- PUT_MODE (dst, V1TImode); +- fix_debug_reg_uses (dst); +- } +- break; +- case MEM: +- PUT_MODE (dst, V1TImode); +- break; +- +- default: +- gcc_unreachable (); ++ if (TARGET_MACHO) ++ return (regno < REGPARM_MAX ++ || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno])); ++ else ++ return (regno < REGPARM_MAX ++ || (TARGET_MMX && MMX_REGNO_P (regno) ++ && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX)) ++ || (TARGET_SSE && SSE_REGNO_P (regno) ++ && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))); + } + +- switch (GET_CODE (src)) +- { +- case REG: +- PUT_MODE (src, V1TImode); +- /* Call fix_debug_reg_uses only if SRC is never defined. */ +- if (!DF_REG_DEF_CHAIN (REGNO (src))) +- fix_debug_reg_uses (src); +- break; +- +- case MEM: +- PUT_MODE (src, V1TImode); +- break; +- +- case CONST_WIDE_INT: +- if (NONDEBUG_INSN_P (insn)) +- { +- /* Since there are no instructions to store 128-bit constant, +- temporary register usage is required. */ +- rtx tmp = gen_reg_rtx (V1TImode); +- start_sequence (); +- src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src)); +- src = validize_mem (force_const_mem (V1TImode, src)); +- rtx_insn *seq = get_insns (); +- end_sequence (); +- if (seq) +- emit_insn_before (seq, insn); +- emit_conversion_insns (gen_rtx_SET (dst, tmp), insn); +- dst = tmp; +- } +- break; +- +- case CONST_INT: +- switch (standard_sse_constant_p (src, TImode)) +- { +- case 1: +- src = CONST0_RTX (GET_MODE (dst)); +- break; +- case 2: +- src = CONSTM1_RTX (GET_MODE (dst)); +- break; +- default: +- gcc_unreachable (); +- } +- if (NONDEBUG_INSN_P (insn)) +- { +- rtx tmp = gen_reg_rtx (V1TImode); +- /* Since there are no instructions to store standard SSE +- constant, temporary register usage is required. 
*/ +- emit_conversion_insns (gen_rtx_SET (dst, tmp), insn); +- dst = tmp; +- } +- break; ++ if (TARGET_SSE && SSE_REGNO_P (regno) ++ && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)) ++ return true; + +- default: +- gcc_unreachable (); +- } ++ /* TODO: The function should depend on current function ABI but ++ builtins.c would need updating then. Therefore we use the ++ default ABI. */ ++ call_abi = ix86_cfun_abi (); + +- SET_SRC (def_set) = src; +- SET_DEST (def_set) = dst; ++ /* RAX is used as hidden argument to va_arg functions. */ ++ if (call_abi == SYSV_ABI && regno == AX_REG) ++ return true; + +- /* Drop possible dead definitions. */ +- PATTERN (insn) = def_set; ++ if (call_abi == MS_ABI) ++ parm_regs = x86_64_ms_abi_int_parameter_registers; ++ else ++ parm_regs = x86_64_int_parameter_registers; + +- INSN_CODE (insn) = -1; +- recog_memoized (insn); +- df_insn_rescan (insn); ++ for (i = 0; i < (call_abi == MS_ABI ++ ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++) ++ if (regno == parm_regs[i]) ++ return true; ++ return false; + } + +-void +-dimode_scalar_chain::convert_registers () +-{ +- bitmap_iterator bi; +- unsigned id; ++/* Return if we do not know how to pass ARG solely in registers. */ + +- EXECUTE_IF_SET_IN_BITMAP (defs, 0, id, bi) +- convert_reg (id); ++static bool ++ix86_must_pass_in_stack (const function_arg_info &arg) ++{ ++ if (must_pass_in_stack_var_size_or_pad (arg)) ++ return true; + +- EXECUTE_IF_AND_COMPL_IN_BITMAP (defs_conv, defs, 0, id, bi) +- make_vector_copies (id); ++ /* For 32-bit, we want TImode aggregates to go on the stack. But watch out! ++ The layout_type routine is crafty and tries to trick us into passing ++ currently unsupported vector types on the stack by using TImode. */ ++ return (!TARGET_64BIT && arg.mode == TImode ++ && arg.type && TREE_CODE (arg.type) != VECTOR_TYPE); + } + +-/* Convert whole chain creating required register +- conversions and copies. */ +- ++/* It returns the size, in bytes, of the area reserved for arguments passed ++ in registers for the function represented by fndecl dependent to the used ++ abi format. */ + int +-scalar_chain::convert () ++ix86_reg_parm_stack_space (const_tree fndecl) + { +- bitmap_iterator bi; +- unsigned id; +- int converted_insns = 0; +- +- if (!dbg_cnt (stv_conversion)) +- return 0; +- +- if (dump_file) +- fprintf (dump_file, "Converting chain #%d...\n", chain_id); ++ enum calling_abi call_abi = SYSV_ABI; ++ if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL) ++ call_abi = ix86_function_abi (fndecl); ++ else ++ call_abi = ix86_function_type_abi (fndecl); ++ if (TARGET_64BIT && call_abi == MS_ABI) ++ return 32; ++ return 0; ++} + +- convert_registers (); +- +- EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi) +- { +- convert_insn (DF_INSN_UID_GET (id)->insn); +- converted_insns++; +- } +- +- return converted_insns; ++/* We add this as a workaround in order to use libc_has_function ++ hook in i386.md. */ ++bool ++ix86_libc_has_function (enum function_class fn_class) ++{ ++ return targetm.libc_has_function (fn_class); + } + +-/* Main STV pass function. Find and convert scalar +- instructions into vector mode when profitable. */ +- +-static unsigned int +-convert_scalars_to_vector () ++/* Returns value SYSV_ABI, MS_ABI dependent on fntype, ++ specifying the call abi used. 
*/ ++enum calling_abi ++ix86_function_type_abi (const_tree fntype) + { +- basic_block bb; +- bitmap candidates; +- int converted_insns = 0; +- +- bitmap_obstack_initialize (NULL); +- candidates = BITMAP_ALLOC (NULL); +- +- calculate_dominance_info (CDI_DOMINATORS); +- df_set_flags (DF_DEFER_INSN_RESCAN); +- df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN); +- df_md_add_problem (); +- df_analyze (); +- +- /* Find all instructions we want to convert into vector mode. */ +- if (dump_file) +- fprintf (dump_file, "Searching for mode conversion candidates...\n"); +- +- FOR_EACH_BB_FN (bb, cfun) +- { +- rtx_insn *insn; +- FOR_BB_INSNS (bb, insn) +- if (scalar_to_vector_candidate_p (insn)) +- { +- if (dump_file) +- fprintf (dump_file, " insn %d is marked as a candidate\n", +- INSN_UID (insn)); +- +- bitmap_set_bit (candidates, INSN_UID (insn)); +- } +- } +- +- remove_non_convertible_regs (candidates); ++ enum calling_abi abi = ix86_abi; + +- if (bitmap_empty_p (candidates)) +- if (dump_file) +- fprintf (dump_file, "There are no candidates for optimization.\n"); ++ if (fntype == NULL_TREE || TYPE_ATTRIBUTES (fntype) == NULL_TREE) ++ return abi; + +- while (!bitmap_empty_p (candidates)) ++ if (abi == SYSV_ABI ++ && lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype))) + { +- unsigned uid = bitmap_first_set_bit (candidates); +- scalar_chain *chain; +- +- if (TARGET_64BIT) +- chain = new timode_scalar_chain; +- else +- chain = new dimode_scalar_chain; +- +- /* Find instructions chain we want to convert to vector mode. +- Check all uses and definitions to estimate all required +- conversions. */ +- chain->build (candidates, uid); +- +- if (chain->compute_convert_gain () > 0) +- converted_insns += chain->convert (); +- else +- if (dump_file) +- fprintf (dump_file, "Chain #%d conversion is not profitable\n", +- chain->chain_id); +- +- delete chain; +- } +- +- if (dump_file) +- fprintf (dump_file, "Total insns converted: %d\n", converted_insns); +- +- BITMAP_FREE (candidates); +- bitmap_obstack_release (NULL); +- df_process_deferred_rescans (); ++ static int warned; ++ if (TARGET_X32 && !warned) ++ { ++ error ("X32 does not support % attribute"); ++ warned = 1; ++ } + +- /* Conversion means we may have 128bit register spills/fills +- which require aligned stack. */ +- if (converted_insns) +- { +- if (crtl->stack_alignment_needed < 128) +- crtl->stack_alignment_needed = 128; +- if (crtl->stack_alignment_estimated < 128) +- crtl->stack_alignment_estimated = 128; +- /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments. 
*/ +- if (TARGET_64BIT) +- for (tree parm = DECL_ARGUMENTS (current_function_decl); +- parm; parm = DECL_CHAIN (parm)) +- { +- if (TYPE_MODE (TREE_TYPE (parm)) != TImode) +- continue; +- if (DECL_RTL_SET_P (parm) +- && GET_MODE (DECL_RTL (parm)) == V1TImode) +- { +- rtx r = DECL_RTL (parm); +- if (REG_P (r)) +- SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0)); +- } +- if (DECL_INCOMING_RTL (parm) +- && GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode) +- { +- rtx r = DECL_INCOMING_RTL (parm); +- if (REG_P (r)) +- DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0); +- } +- } ++ abi = MS_ABI; + } ++ else if (abi == MS_ABI ++ && lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype))) ++ abi = SYSV_ABI; + +- return 0; ++ return abi; + } + +-namespace { +- +-const pass_data pass_data_insert_vzeroupper = ++enum calling_abi ++ix86_function_abi (const_tree fndecl) + { +- RTL_PASS, /* type */ +- "vzeroupper", /* name */ +- OPTGROUP_NONE, /* optinfo_flags */ +- TV_MACH_DEP, /* tv_id */ +- 0, /* properties_required */ +- 0, /* properties_provided */ +- 0, /* properties_destroyed */ +- 0, /* todo_flags_start */ +- TODO_df_finish, /* todo_flags_finish */ +-}; ++ return fndecl ? ix86_function_type_abi (TREE_TYPE (fndecl)) : ix86_abi; ++} + +-class pass_insert_vzeroupper : public rtl_opt_pass ++/* Returns value SYSV_ABI, MS_ABI dependent on cfun, ++ specifying the call abi used. */ ++enum calling_abi ++ix86_cfun_abi (void) + { +-public: +- pass_insert_vzeroupper(gcc::context *ctxt) +- : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt) +- {} ++ return cfun ? cfun->machine->call_abi : ix86_abi; ++} + +- /* opt_pass methods: */ +- virtual bool gate (function *) ++bool ++ix86_function_ms_hook_prologue (const_tree fn) ++{ ++ if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn))) + { +- return TARGET_AVX +- && TARGET_VZEROUPPER && flag_expensive_optimizations +- && !optimize_size; ++ if (decl_function_context (fn) != NULL_TREE) ++ error_at (DECL_SOURCE_LOCATION (fn), ++ "% attribute is not compatible " ++ "with nested function"); ++ else ++ return true; + } ++ return false; ++} + +- virtual unsigned int execute (function *) +- { +- return rest_of_handle_insert_vzeroupper (); +- } ++bool ++ix86_function_naked (const_tree fn) ++{ ++ if (fn && lookup_attribute ("naked", DECL_ATTRIBUTES (fn))) ++ return true; + +-}; // class pass_insert_vzeroupper ++ return false; ++} + +-const pass_data pass_data_stv = +-{ +- RTL_PASS, /* type */ +- "stv", /* name */ +- OPTGROUP_NONE, /* optinfo_flags */ +- TV_MACH_DEP, /* tv_id */ +- 0, /* properties_required */ +- 0, /* properties_provided */ +- 0, /* properties_destroyed */ +- 0, /* todo_flags_start */ +- TODO_df_finish, /* todo_flags_finish */ +-}; ++/* Write the extra assembler code needed to declare a function properly. */ + +-class pass_stv : public rtl_opt_pass ++void ++ix86_asm_output_function_label (FILE *asm_out_file, const char *fname, ++ tree decl) + { +-public: +- pass_stv (gcc::context *ctxt) +- : rtl_opt_pass (pass_data_stv, ctxt), +- timode_p (false) +- {} ++ bool is_ms_hook = ix86_function_ms_hook_prologue (decl); + +- /* opt_pass methods: */ +- virtual bool gate (function *) ++ if (is_ms_hook) + { +- return (timode_p == !!TARGET_64BIT +- && TARGET_STV && TARGET_SSE2 && optimize > 1); +- } ++ int i, filler_count = (TARGET_64BIT ? 
32 : 16); ++ unsigned int filler_cc = 0xcccccccc; + +- virtual unsigned int execute (function *) +- { +- return convert_scalars_to_vector (); ++ for (i = 0; i < filler_count; i += 4) ++ fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc); + } + +- opt_pass *clone () +- { +- return new pass_stv (m_ctxt); +- } ++#ifdef SUBTARGET_ASM_UNWIND_INIT ++ SUBTARGET_ASM_UNWIND_INIT (asm_out_file); ++#endif ++ ++ ASM_OUTPUT_LABEL (asm_out_file, fname); + +- void set_pass_param (unsigned int n, bool param) ++ /* Output magic byte marker, if hot-patch attribute is set. */ ++ if (is_ms_hook) + { +- gcc_assert (n == 0); +- timode_p = param; ++ if (TARGET_64BIT) ++ { ++ /* leaq [%rsp + 0], %rsp */ ++ fputs (ASM_BYTE "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n", ++ asm_out_file); ++ } ++ else ++ { ++ /* movl.s %edi, %edi ++ push %ebp ++ movl.s %esp, %ebp */ ++ fputs (ASM_BYTE "0x8b, 0xff, 0x55, 0x8b, 0xec\n", asm_out_file); ++ } + } ++} + +-private: +- bool timode_p; +-}; // class pass_stv +- +-} // anon namespace +- +-rtl_opt_pass * +-make_pass_insert_vzeroupper (gcc::context *ctxt) ++/* Implementation of call abi switching target hook. Specific to FNDECL ++ the specific call register sets are set. See also ++ ix86_conditional_register_usage for more details. */ ++void ++ix86_call_abi_override (const_tree fndecl) + { +- return new pass_insert_vzeroupper (ctxt); ++ cfun->machine->call_abi = ix86_function_abi (fndecl); + } + +-rtl_opt_pass * +-make_pass_stv (gcc::context *ctxt) ++/* Return 1 if pseudo register should be created and used to hold ++ GOT address for PIC code. */ ++bool ++ix86_use_pseudo_pic_reg (void) + { +- return new pass_stv (ctxt); ++ if ((TARGET_64BIT ++ && (ix86_cmodel == CM_SMALL_PIC ++ || TARGET_PECOFF)) ++ || !flag_pic) ++ return false; ++ return true; + } + +-/* Inserting ENDBRANCH instructions. */ ++/* Initialize large model PIC register. */ + +-static unsigned int +-rest_of_insert_endbranch (void) ++static void ++ix86_init_large_pic_reg (unsigned int tmp_regno) + { +- timevar_push (TV_MACH_DEP); +- +- rtx cet_eb; +- rtx_insn *insn; +- basic_block bb; +- +- /* Currently emit EB if it's a tracking function, i.e. 'nocf_check' is +- absent among function attributes. Later an optimization will be +- introduced to make analysis if an address of a static function is +- taken. A static function whose address is not taken will get a +- nocf_check attribute. This will allow to reduce the number of EB. */ +- +- if (!lookup_attribute ("nocf_check", +- TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl))) +- && (!flag_manual_endbr +- || lookup_attribute ("cf_check", +- DECL_ATTRIBUTES (cfun->decl))) +- && !cgraph_node::get (cfun->decl)->only_called_directly_p ()) +- { +- /* Queue ENDBR insertion to x86_function_profiler. */ +- if (crtl->profile && flag_fentry) +- cfun->machine->endbr_queued_at_entrance = true; +- else +- { +- cet_eb = gen_nop_endbr (); +- +- bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb; +- insn = BB_HEAD (bb); +- emit_insn_before (cet_eb, insn); +- } +- } +- +- bb = 0; +- FOR_EACH_BB_FN (bb, cfun) +- { +- for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb)); +- insn = NEXT_INSN (insn)) +- { +- if (CALL_P (insn)) +- { +- bool need_endbr; +- need_endbr = find_reg_note (insn, REG_SETJMP, NULL) != NULL; +- if (!need_endbr && !SIBLING_CALL_P (insn)) +- { +- rtx call = get_call_rtx_from (insn); +- rtx fnaddr = XEXP (call, 0); +- tree fndecl = NULL_TREE; +- +- /* Also generate ENDBRANCH for non-tail call which +- may return via indirect branch. 
*/ +- if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF) +- fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0)); +- if (fndecl == NULL_TREE) +- fndecl = MEM_EXPR (fnaddr); +- if (fndecl +- && TREE_CODE (TREE_TYPE (fndecl)) != FUNCTION_TYPE +- && TREE_CODE (TREE_TYPE (fndecl)) != METHOD_TYPE) +- fndecl = NULL_TREE; +- if (fndecl && TYPE_ARG_TYPES (TREE_TYPE (fndecl))) +- { +- tree fntype = TREE_TYPE (fndecl); +- if (lookup_attribute ("indirect_return", +- TYPE_ATTRIBUTES (fntype))) +- need_endbr = true; +- } +- } +- if (!need_endbr) +- continue; +- /* Generate ENDBRANCH after CALL, which can return more than +- twice, setjmp-like functions. */ +- +- cet_eb = gen_nop_endbr (); +- emit_insn_after_setloc (cet_eb, insn, INSN_LOCATION (insn)); +- continue; +- } +- +- if (JUMP_P (insn) && flag_cet_switch) +- { +- rtx target = JUMP_LABEL (insn); +- if (target == NULL_RTX || ANY_RETURN_P (target)) +- continue; +- +- /* Check the jump is a switch table. */ +- rtx_insn *label = as_a (target); +- rtx_insn *table = next_insn (label); +- if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table)) +- continue; +- +- /* For the indirect jump find out all places it jumps and insert +- ENDBRANCH there. It should be done under a special flag to +- control ENDBRANCH generation for switch stmts. */ +- edge_iterator ei; +- edge e; +- basic_block dest_blk; +- +- FOR_EACH_EDGE (e, ei, bb->succs) +- { +- rtx_insn *insn; +- +- dest_blk = e->dest; +- insn = BB_HEAD (dest_blk); +- gcc_assert (LABEL_P (insn)); +- cet_eb = gen_nop_endbr (); +- emit_insn_after (cet_eb, insn); +- } +- continue; +- } +- +- if ((LABEL_P (insn) && LABEL_PRESERVE_P (insn)) +- || (NOTE_P (insn) +- && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)) +- /* TODO. Check /s bit also. */ +- { +- cet_eb = gen_nop_endbr (); +- emit_insn_after (cet_eb, insn); +- continue; +- } +- } +- } ++ rtx_code_label *label; ++ rtx tmp_reg; + +- timevar_pop (TV_MACH_DEP); +- return 0; ++ gcc_assert (Pmode == DImode); ++ label = gen_label_rtx (); ++ emit_label (label); ++ LABEL_PRESERVE_P (label) = 1; ++ tmp_reg = gen_rtx_REG (Pmode, tmp_regno); ++ gcc_assert (REGNO (pic_offset_table_rtx) != tmp_regno); ++ emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx, ++ label)); ++ emit_insn (gen_set_got_offset_rex64 (tmp_reg, label)); ++ emit_insn (ix86_gen_add3 (pic_offset_table_rtx, ++ pic_offset_table_rtx, tmp_reg)); ++ const char *name = LABEL_NAME (label); ++ PUT_CODE (label, NOTE); ++ NOTE_KIND (label) = NOTE_INSN_DELETED_LABEL; ++ NOTE_DELETED_LABEL_NAME (label) = name; + } + +-namespace { +- +-const pass_data pass_data_insert_endbranch = ++/* Create and initialize PIC register if required. */ ++static void ++ix86_init_pic_reg (void) + { +- RTL_PASS, /* type. */ +- "cet", /* name. */ +- OPTGROUP_NONE, /* optinfo_flags. */ +- TV_MACH_DEP, /* tv_id. */ +- 0, /* properties_required. */ +- 0, /* properties_provided. */ +- 0, /* properties_destroyed. */ +- 0, /* todo_flags_start. */ +- 0, /* todo_flags_finish. 
*/ +-}; ++ edge entry_edge; ++ rtx_insn *seq; + +-class pass_insert_endbranch : public rtl_opt_pass +-{ +-public: +- pass_insert_endbranch (gcc::context *ctxt) +- : rtl_opt_pass (pass_data_insert_endbranch, ctxt) +- {} ++ if (!ix86_use_pseudo_pic_reg ()) ++ return; ++ ++ start_sequence (); + +- /* opt_pass methods: */ +- virtual bool gate (function *) ++ if (TARGET_64BIT) + { +- return ((flag_cf_protection & CF_BRANCH)); ++ if (ix86_cmodel == CM_LARGE_PIC) ++ ix86_init_large_pic_reg (R11_REG); ++ else ++ emit_insn (gen_set_got_rex64 (pic_offset_table_rtx)); + } +- +- virtual unsigned int execute (function *) ++ else + { +- return rest_of_insert_endbranch (); ++ /* If there is future mcount call in the function it is more profitable ++ to emit SET_GOT into ABI defined REAL_PIC_OFFSET_TABLE_REGNUM. */ ++ rtx reg = crtl->profile ++ ? gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM) ++ : pic_offset_table_rtx; ++ rtx_insn *insn = emit_insn (gen_set_got (reg)); ++ RTX_FRAME_RELATED_P (insn) = 1; ++ if (crtl->profile) ++ emit_move_insn (pic_offset_table_rtx, reg); ++ add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX); + } + +-}; // class pass_insert_endbranch +- +-} // anon namespace ++ seq = get_insns (); ++ end_sequence (); + +-rtl_opt_pass * +-make_pass_insert_endbranch (gcc::context *ctxt) +-{ +- return new pass_insert_endbranch (ctxt); ++ entry_edge = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun)); ++ insert_insn_on_edge (seq, entry_edge); ++ commit_one_edge_insertion (entry_edge); + } + +-/* At entry of the nearest common dominator for basic blocks with +- conversions, generate a single +- vxorps %xmmN, %xmmN, %xmmN +- for all +- vcvtss2sd op, %xmmN, %xmmX +- vcvtsd2ss op, %xmmN, %xmmX +- vcvtsi2ss op, %xmmN, %xmmX +- vcvtsi2sd op, %xmmN, %xmmX +- +- NB: We want to generate only a single vxorps to cover the whole +- function. The LCM algorithm isn't appropriate here since it may +- place a vxorps inside the loop. */ +- +-static unsigned int +-remove_partial_avx_dependency (void) +-{ +- timevar_push (TV_MACH_DEP); +- +- bitmap_obstack_initialize (NULL); +- bitmap convert_bbs = BITMAP_ALLOC (NULL); ++/* Initialize a variable CUM of type CUMULATIVE_ARGS ++ for a call to a function whose data type is FNTYPE. ++ For a library call, FNTYPE is 0. 
*/ + +- basic_block bb; +- rtx_insn *insn, *set_insn; +- rtx set; +- rtx v4sf_const0 = NULL_RTX; ++void ++init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */ ++ tree fntype, /* tree ptr for function decl */ ++ rtx libname, /* SYMBOL_REF of library name or 0 */ ++ tree fndecl, ++ int caller) ++{ ++ struct cgraph_local_info *i = NULL; ++ struct cgraph_node *target = NULL; + +- auto_vec control_flow_insns; ++ memset (cum, 0, sizeof (*cum)); + +- FOR_EACH_BB_FN (bb, cfun) ++ if (fndecl) + { +- FOR_BB_INSNS (bb, insn) ++ target = cgraph_node::get (fndecl); ++ if (target) + { +- if (!NONDEBUG_INSN_P (insn)) +- continue; +- +- set = single_set (insn); +- if (!set) +- continue; ++ target = target->function_symbol (); ++ i = cgraph_node::local_info (target->decl); ++ cum->call_abi = ix86_function_abi (target->decl); ++ } ++ else ++ cum->call_abi = ix86_function_abi (fndecl); ++ } ++ else ++ cum->call_abi = ix86_function_type_abi (fntype); + +- if (get_attr_avx_partial_xmm_update (insn) +- != AVX_PARTIAL_XMM_UPDATE_TRUE) +- continue; ++ cum->caller = caller; + +- if (!v4sf_const0) +- { +- calculate_dominance_info (CDI_DOMINATORS); +- df_set_flags (DF_DEFER_INSN_RESCAN); +- df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN); +- df_md_add_problem (); +- df_analyze (); +- v4sf_const0 = gen_reg_rtx (V4SFmode); +- } ++ /* Set up the number of registers to use for passing arguments. */ ++ cum->nregs = ix86_regparm; ++ if (TARGET_64BIT) ++ { ++ cum->nregs = (cum->call_abi == SYSV_ABI ++ ? X86_64_REGPARM_MAX ++ : X86_64_MS_REGPARM_MAX); ++ } ++ if (TARGET_SSE) ++ { ++ cum->sse_nregs = SSE_REGPARM_MAX; ++ if (TARGET_64BIT) ++ { ++ cum->sse_nregs = (cum->call_abi == SYSV_ABI ++ ? X86_64_SSE_REGPARM_MAX ++ : X86_64_MS_SSE_REGPARM_MAX); ++ } ++ } ++ if (TARGET_MMX) ++ cum->mmx_nregs = MMX_REGPARM_MAX; ++ cum->warn_avx512f = true; ++ cum->warn_avx = true; ++ cum->warn_sse = true; ++ cum->warn_mmx = true; + +- /* Convert PARTIAL_XMM_UPDATE_TRUE insns, DF -> SF, SF -> DF, +- SI -> SF, SI -> DF, DI -> SF, DI -> DF, to vec_dup and +- vec_merge with subreg. */ +- rtx src = SET_SRC (set); +- rtx dest = SET_DEST (set); +- machine_mode dest_mode = GET_MODE (dest); ++ /* Because type might mismatch in between caller and callee, we need to ++ use actual type of function for local calls. ++ FIXME: cgraph_analyze can be told to actually record if function uses ++ va_start so for local functions maybe_vaarg can be made aggressive ++ helping K&R code. ++ FIXME: once typesytem is fixed, we won't need this code anymore. */ ++ if (i && i->local && i->can_change_signature) ++ fntype = TREE_TYPE (target->decl); ++ cum->stdarg = stdarg_p (fntype); ++ cum->maybe_vaarg = (fntype ++ ? (!prototype_p (fntype) || stdarg_p (fntype)) ++ : !libname); + +- rtx zero; +- machine_mode dest_vecmode; +- if (dest_mode == E_SFmode) +- { +- dest_vecmode = V4SFmode; +- zero = v4sf_const0; +- } +- else +- { +- dest_vecmode = V2DFmode; +- zero = gen_rtx_SUBREG (V2DFmode, v4sf_const0, 0); +- } ++ cum->decl = fndecl; + +- /* Change source to vector mode. */ +- src = gen_rtx_VEC_DUPLICATE (dest_vecmode, src); +- src = gen_rtx_VEC_MERGE (dest_vecmode, src, zero, +- GEN_INT (HOST_WIDE_INT_1U)); +- /* Change destination to vector mode. */ +- rtx vec = gen_reg_rtx (dest_vecmode); +- /* Generate an XMM vector SET. 
*/ +- set = gen_rtx_SET (vec, src); +- set_insn = emit_insn_before (set, insn); +- df_insn_rescan (set_insn); +- +- if (cfun->can_throw_non_call_exceptions) ++ cum->warn_empty = !warn_abi || cum->stdarg; ++ if (!cum->warn_empty && fntype) ++ { ++ function_args_iterator iter; ++ tree argtype; ++ bool seen_empty_type = false; ++ FOREACH_FUNCTION_ARGS (fntype, argtype, iter) ++ { ++ if (argtype == error_mark_node || VOID_TYPE_P (argtype)) ++ break; ++ if (TYPE_EMPTY_P (argtype)) ++ seen_empty_type = true; ++ else if (seen_empty_type) + { +- /* Handle REG_EH_REGION note. */ +- rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX); +- if (note) +- { +- control_flow_insns.safe_push (set_insn); +- add_reg_note (set_insn, REG_EH_REGION, XEXP (note, 0)); +- } ++ cum->warn_empty = true; ++ break; + } +- +- src = gen_rtx_SUBREG (dest_mode, vec, 0); +- set = gen_rtx_SET (dest, src); +- +- /* Drop possible dead definitions. */ +- PATTERN (insn) = set; +- +- INSN_CODE (insn) = -1; +- recog_memoized (insn); +- df_insn_rescan (insn); +- bitmap_set_bit (convert_bbs, bb->index); + } + } + +- if (v4sf_const0) ++ if (!TARGET_64BIT) + { +- /* (Re-)discover loops so that bb->loop_father can be used in the +- analysis below. */ +- loop_optimizer_init (AVOID_CFG_MODIFICATIONS); +- +- /* Generate a vxorps at entry of the nearest dominator for basic +- blocks with conversions, which is in the the fake loop that +- contains the whole function, so that there is only a single +- vxorps in the whole function. */ +- bb = nearest_common_dominator_for_set (CDI_DOMINATORS, +- convert_bbs); +- while (bb->loop_father->latch +- != EXIT_BLOCK_PTR_FOR_FN (cfun)) +- bb = get_immediate_dominator (CDI_DOMINATORS, +- bb->loop_father->header); +- +- set = gen_rtx_SET (v4sf_const0, CONST0_RTX (V4SFmode)); ++ /* If there are variable arguments, then we won't pass anything ++ in registers in 32-bit mode. */ ++ if (stdarg_p (fntype)) ++ { ++ cum->nregs = 0; ++ /* Since in 32-bit, variable arguments are always passed on ++ stack, there is scratch register available for indirect ++ sibcall. */ ++ cfun->machine->arg_reg_available = true; ++ cum->sse_nregs = 0; ++ cum->mmx_nregs = 0; ++ cum->warn_avx512f = false; ++ cum->warn_avx = false; ++ cum->warn_sse = false; ++ cum->warn_mmx = false; ++ return; ++ } + +- insn = BB_HEAD (bb); +- while (insn && !NONDEBUG_INSN_P (insn)) ++ /* Use ecx and edx registers if function has fastcall attribute, ++ else look for regparm information. */ ++ if (fntype) + { +- if (insn == BB_END (bb)) ++ unsigned int ccvt = ix86_get_callcvt (fntype); ++ if ((ccvt & IX86_CALLCVT_THISCALL) != 0) + { +- insn = NULL; +- break; ++ cum->nregs = 1; ++ cum->fastcall = 1; /* Same first register as in fastcall. */ ++ } ++ else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0) ++ { ++ cum->nregs = 2; ++ cum->fastcall = 1; + } +- insn = NEXT_INSN (insn); ++ else ++ cum->nregs = ix86_function_regparm (fntype, fndecl); + } +- if (insn == BB_HEAD (bb)) +- set_insn = emit_insn_before (set, insn); +- else +- set_insn = emit_insn_after (set, +- insn ? PREV_INSN (insn) : BB_END (bb)); +- df_insn_rescan (set_insn); +- df_process_deferred_rescans (); +- loop_optimizer_finalize (); +- +- if (!control_flow_insns.is_empty ()) +- { +- free_dominance_info (CDI_DOMINATORS); + +- unsigned int i; +- FOR_EACH_VEC_ELT (control_flow_insns, i, insn) +- if (control_flow_insn_p (insn)) +- { +- /* Split the block after insn. There will be a fallthru +- edge, which is OK so we keep it. We have to create +- the exception edges ourselves. 
*/ +- bb = BLOCK_FOR_INSN (insn); +- split_block (bb, insn); +- rtl_make_eh_edge (NULL, bb, BB_END (bb)); +- } +- } ++ /* Set up the number of SSE registers used for passing SFmode ++ and DFmode arguments. Warn for mismatching ABI. */ ++ cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true); + } + +- bitmap_obstack_release (NULL); +- BITMAP_FREE (convert_bbs); +- +- timevar_pop (TV_MACH_DEP); +- return 0; ++ cfun->machine->arg_reg_available = (cum->nregs > 0); + } + +-namespace { +- +-const pass_data pass_data_remove_partial_avx_dependency = +-{ +- RTL_PASS, /* type */ +- "rpad", /* name */ +- OPTGROUP_NONE, /* optinfo_flags */ +- TV_MACH_DEP, /* tv_id */ +- 0, /* properties_required */ +- 0, /* properties_provided */ +- 0, /* properties_destroyed */ +- 0, /* todo_flags_start */ +- TODO_df_finish, /* todo_flags_finish */ +-}; +- +-class pass_remove_partial_avx_dependency : public rtl_opt_pass +-{ +-public: +- pass_remove_partial_avx_dependency (gcc::context *ctxt) +- : rtl_opt_pass (pass_data_remove_partial_avx_dependency, ctxt) +- {} +- +- /* opt_pass methods: */ +- virtual bool gate (function *) +- { +- return (TARGET_AVX +- && TARGET_SSE_PARTIAL_REG_DEPENDENCY +- && TARGET_SSE_MATH +- && optimize +- && optimize_function_for_speed_p (cfun)); +- } +- +- virtual unsigned int execute (function *) +- { +- return remove_partial_avx_dependency (); +- } +-}; // class pass_rpad +- +-} // anon namespace ++/* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE. ++ But in the case of vector types, it is some vector mode. + +-rtl_opt_pass * +-make_pass_remove_partial_avx_dependency (gcc::context *ctxt) +-{ +- return new pass_remove_partial_avx_dependency (ctxt); +-} ++ When we have only some of our vector isa extensions enabled, then there ++ are some modes for which vector_mode_supported_p is false. For these ++ modes, the generic vector support in gcc will choose some non-vector mode ++ in order to implement the type. By computing the natural mode, we'll ++ select the proper ABI location for the operand and not depend on whatever ++ the middle-end decides to do with these vector types. + +-/* Return true if a red-zone is in use. We can't use red-zone when +- there are local indirect jumps, like "indirect_jump" or "tablejump", +- which jumps to another place in the function, since "call" in the +- indirect thunk pushes the return address onto stack, destroying +- red-zone. ++ The midde-end can't deal with the vector types > 16 bytes. In this ++ case, we return the original mode and warn ABI change if CUM isn't ++ NULL. + +- TODO: If we can reserve the first 2 WORDs, for PUSH and, another +- for CALL, in red-zone, we can allow local indirect jumps with +- indirect thunk. */ ++ If INT_RETURN is true, warn ABI change if the vector mode isn't ++ available for function return value. */ + +-bool +-ix86_using_red_zone (void) ++static machine_mode ++type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum, ++ bool in_return) + { +- return (TARGET_RED_ZONE +- && !TARGET_64BIT_MS_ABI +- && (!cfun->machine->has_local_indirect_jump +- || cfun->machine->indirect_branch_type == indirect_branch_keep)); +-} +- +-/* Return a string that documents the current -m options. The caller is +- responsible for freeing the string. 
*/ ++ machine_mode mode = TYPE_MODE (type); + +-static char * +-ix86_target_string (HOST_WIDE_INT isa, HOST_WIDE_INT isa2, +- int flags, int flags2, +- const char *arch, const char *tune, +- enum fpmath_unit fpmath, bool add_nl_p, bool add_abi_p) +-{ +- struct ix86_target_opts +- { +- const char *option; /* option string */ +- HOST_WIDE_INT mask; /* isa mask options */ +- }; ++ if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode)) ++ { ++ HOST_WIDE_INT size = int_size_in_bytes (type); ++ if ((size == 8 || size == 16 || size == 32 || size == 64) ++ /* ??? Generic code allows us to create width 1 vectors. Ignore. */ ++ && TYPE_VECTOR_SUBPARTS (type) > 1) ++ { ++ machine_mode innermode = TYPE_MODE (TREE_TYPE (type)); + +- /* This table is ordered so that options like -msse4.2 that imply other +- ISAs come first. Target string will be displayed in the same order. */ +- static struct ix86_target_opts isa2_opts[] = +- { +- { "-mcx16", OPTION_MASK_ISA_CX16 }, +- { "-mvaes", OPTION_MASK_ISA_VAES }, +- { "-mrdpid", OPTION_MASK_ISA_RDPID }, +- { "-mpconfig", OPTION_MASK_ISA_PCONFIG }, +- { "-mwbnoinvd", OPTION_MASK_ISA_WBNOINVD }, +- { "-msgx", OPTION_MASK_ISA_SGX }, +- { "-mavx5124vnniw", OPTION_MASK_ISA_AVX5124VNNIW }, +- { "-mavx5124fmaps", OPTION_MASK_ISA_AVX5124FMAPS }, +- { "-mhle", OPTION_MASK_ISA_HLE }, +- { "-mmovbe", OPTION_MASK_ISA_MOVBE }, +- { "-mclzero", OPTION_MASK_ISA_CLZERO }, +- { "-mmwaitx", OPTION_MASK_ISA_MWAITX }, +- { "-mmovdir64b", OPTION_MASK_ISA_MOVDIR64B }, +- { "-mwaitpkg", OPTION_MASK_ISA_WAITPKG }, +- { "-mcldemote", OPTION_MASK_ISA_CLDEMOTE }, +- { "-mptwrite", OPTION_MASK_ISA_PTWRITE } +- }; +- static struct ix86_target_opts isa_opts[] = +- { +- { "-mavx512vpopcntdq", OPTION_MASK_ISA_AVX512VPOPCNTDQ }, +- { "-mavx512bitalg", OPTION_MASK_ISA_AVX512BITALG }, +- { "-mvpclmulqdq", OPTION_MASK_ISA_VPCLMULQDQ }, +- { "-mgfni", OPTION_MASK_ISA_GFNI }, +- { "-mavx512vnni", OPTION_MASK_ISA_AVX512VNNI }, +- { "-mavx512vbmi2", OPTION_MASK_ISA_AVX512VBMI2 }, +- { "-mavx512vbmi", OPTION_MASK_ISA_AVX512VBMI }, +- { "-mavx512ifma", OPTION_MASK_ISA_AVX512IFMA }, +- { "-mavx512vl", OPTION_MASK_ISA_AVX512VL }, +- { "-mavx512bw", OPTION_MASK_ISA_AVX512BW }, +- { "-mavx512dq", OPTION_MASK_ISA_AVX512DQ }, +- { "-mavx512er", OPTION_MASK_ISA_AVX512ER }, +- { "-mavx512pf", OPTION_MASK_ISA_AVX512PF }, +- { "-mavx512cd", OPTION_MASK_ISA_AVX512CD }, +- { "-mavx512f", OPTION_MASK_ISA_AVX512F }, +- { "-mavx2", OPTION_MASK_ISA_AVX2 }, +- { "-mfma", OPTION_MASK_ISA_FMA }, +- { "-mxop", OPTION_MASK_ISA_XOP }, +- { "-mfma4", OPTION_MASK_ISA_FMA4 }, +- { "-mf16c", OPTION_MASK_ISA_F16C }, +- { "-mavx", OPTION_MASK_ISA_AVX }, +-/* { "-msse4" OPTION_MASK_ISA_SSE4 }, */ +- { "-msse4.2", OPTION_MASK_ISA_SSE4_2 }, +- { "-msse4.1", OPTION_MASK_ISA_SSE4_1 }, +- { "-msse4a", OPTION_MASK_ISA_SSE4A }, +- { "-mssse3", OPTION_MASK_ISA_SSSE3 }, +- { "-msse3", OPTION_MASK_ISA_SSE3 }, +- { "-maes", OPTION_MASK_ISA_AES }, +- { "-msha", OPTION_MASK_ISA_SHA }, +- { "-mpclmul", OPTION_MASK_ISA_PCLMUL }, +- { "-msse2", OPTION_MASK_ISA_SSE2 }, +- { "-msse", OPTION_MASK_ISA_SSE }, +- { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A }, +- { "-m3dnow", OPTION_MASK_ISA_3DNOW }, +- { "-mmmx", OPTION_MASK_ISA_MMX }, +- { "-mrtm", OPTION_MASK_ISA_RTM }, +- { "-mprfchw", OPTION_MASK_ISA_PRFCHW }, +- { "-mrdseed", OPTION_MASK_ISA_RDSEED }, +- { "-madx", OPTION_MASK_ISA_ADX }, +- { "-mprefetchwt1", OPTION_MASK_ISA_PREFETCHWT1 }, +- { "-mclflushopt", OPTION_MASK_ISA_CLFLUSHOPT }, +- { "-mxsaves", OPTION_MASK_ISA_XSAVES }, +- { 
"-mxsavec", OPTION_MASK_ISA_XSAVEC }, +- { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT }, +- { "-mxsave", OPTION_MASK_ISA_XSAVE }, +- { "-mabm", OPTION_MASK_ISA_ABM }, +- { "-mbmi", OPTION_MASK_ISA_BMI }, +- { "-mbmi2", OPTION_MASK_ISA_BMI2 }, +- { "-mlzcnt", OPTION_MASK_ISA_LZCNT }, +- { "-mtbm", OPTION_MASK_ISA_TBM }, +- { "-mpopcnt", OPTION_MASK_ISA_POPCNT }, +- { "-msahf", OPTION_MASK_ISA_SAHF }, +- { "-mcrc32", OPTION_MASK_ISA_CRC32 }, +- { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE }, +- { "-mrdrnd", OPTION_MASK_ISA_RDRND }, +- { "-mpku", OPTION_MASK_ISA_PKU }, +- { "-mlwp", OPTION_MASK_ISA_LWP }, +- { "-mfxsr", OPTION_MASK_ISA_FXSR }, +- { "-mclwb", OPTION_MASK_ISA_CLWB }, +- { "-mshstk", OPTION_MASK_ISA_SHSTK }, +- { "-mmovdiri", OPTION_MASK_ISA_MOVDIRI } +- }; ++ /* There are no XFmode vector modes. */ ++ if (innermode == XFmode) ++ return mode; + +- /* Flag options. */ +- static struct ix86_target_opts flag_opts[] = +- { +- { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE }, +- { "-mlong-double-128", MASK_LONG_DOUBLE_128 }, +- { "-mlong-double-64", MASK_LONG_DOUBLE_64 }, +- { "-m80387", MASK_80387 }, +- { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS }, +- { "-malign-double", MASK_ALIGN_DOUBLE }, +- { "-mcld", MASK_CLD }, +- { "-mfp-ret-in-387", MASK_FLOAT_RETURNS }, +- { "-mieee-fp", MASK_IEEE_FP }, +- { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS }, +- { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY }, +- { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT }, +- { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS }, +- { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 }, +- { "-mno-push-args", MASK_NO_PUSH_ARGS }, +- { "-mno-red-zone", MASK_NO_RED_ZONE }, +- { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER }, +- { "-mrecip", MASK_RECIP }, +- { "-mrtd", MASK_RTD }, +- { "-msseregparm", MASK_SSEREGPARM }, +- { "-mstack-arg-probe", MASK_STACK_PROBE }, +- { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS }, +- { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS }, +- { "-m8bit-idiv", MASK_USE_8BIT_IDIV }, +- { "-mvzeroupper", MASK_VZEROUPPER }, +- { "-mstv", MASK_STV }, +- { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD }, +- { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE }, +- { "-mcall-ms2sysv-xlogues", MASK_CALL_MS2SYSV_XLOGUES } +- }; ++ if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE) ++ mode = MIN_MODE_VECTOR_FLOAT; ++ else ++ mode = MIN_MODE_VECTOR_INT; + +- /* Additional flag options. */ +- static struct ix86_target_opts flag2_opts[] = +- { +- { "-mgeneral-regs-only", OPTION_MASK_GENERAL_REGS_ONLY } +- }; ++ /* Get the mode which has this inner mode and number of units. 
*/ ++ FOR_EACH_MODE_FROM (mode, mode) ++ if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type) ++ && GET_MODE_INNER (mode) == innermode) ++ { ++ if (size == 64 && !TARGET_AVX512F && !TARGET_IAMCU) ++ { ++ static bool warnedavx512f; ++ static bool warnedavx512f_ret; + +- const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (isa2_opts) +- + ARRAY_SIZE (flag_opts) + ARRAY_SIZE (flag2_opts) + 6][2]; ++ if (cum && cum->warn_avx512f && !warnedavx512f) ++ { ++ if (warning (OPT_Wpsabi, "AVX512F vector argument " ++ "without AVX512F enabled changes the ABI")) ++ warnedavx512f = true; ++ } ++ else if (in_return && !warnedavx512f_ret) ++ { ++ if (warning (OPT_Wpsabi, "AVX512F vector return " ++ "without AVX512F enabled changes the ABI")) ++ warnedavx512f_ret = true; ++ } + +- char isa_other[40]; +- char isa2_other[40]; +- char flags_other[40]; +- char flags2_other[40]; +- unsigned num = 0; +- unsigned i, j; +- char *ret; +- char *ptr; +- size_t len; +- size_t line_len; +- size_t sep_len; +- const char *abi; ++ return TYPE_MODE (type); ++ } ++ else if (size == 32 && !TARGET_AVX && !TARGET_IAMCU) ++ { ++ static bool warnedavx; ++ static bool warnedavx_ret; + +- memset (opts, '\0', sizeof (opts)); ++ if (cum && cum->warn_avx && !warnedavx) ++ { ++ if (warning (OPT_Wpsabi, "AVX vector argument " ++ "without AVX enabled changes the ABI")) ++ warnedavx = true; ++ } ++ else if (in_return && !warnedavx_ret) ++ { ++ if (warning (OPT_Wpsabi, "AVX vector return " ++ "without AVX enabled changes the ABI")) ++ warnedavx_ret = true; ++ } + +- /* Add -march= option. */ +- if (arch) +- { +- opts[num][0] = "-march="; +- opts[num++][1] = arch; +- } +- +- /* Add -mtune= option. */ +- if (tune) +- { +- opts[num][0] = "-mtune="; +- opts[num++][1] = tune; +- } +- +- /* Add -m32/-m64/-mx32. */ +- if (add_abi_p) +- { +- if ((isa & OPTION_MASK_ISA_64BIT) != 0) +- { +- if ((isa & OPTION_MASK_ABI_64) != 0) +- abi = "-m64"; +- else +- abi = "-mx32"; +- } +- else +- abi = "-m32"; +- opts[num++][0] = abi; +- } +- isa &= ~(OPTION_MASK_ISA_64BIT | OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32); +- +- /* Pick out the options in isa2 options. */ +- for (i = 0; i < ARRAY_SIZE (isa2_opts); i++) +- { +- if ((isa2 & isa2_opts[i].mask) != 0) +- { +- opts[num++][0] = isa2_opts[i].option; +- isa2 &= ~ isa2_opts[i].mask; +- } +- } +- +- if (isa2 && add_nl_p) +- { +- opts[num++][0] = isa2_other; +- sprintf (isa2_other, "(other isa2: %#" HOST_WIDE_INT_PRINT "x)", isa2); +- } +- +- /* Pick out the options in isa options. */ +- for (i = 0; i < ARRAY_SIZE (isa_opts); i++) +- { +- if ((isa & isa_opts[i].mask) != 0) +- { +- opts[num++][0] = isa_opts[i].option; +- isa &= ~ isa_opts[i].mask; +- } +- } +- +- if (isa && add_nl_p) +- { +- opts[num++][0] = isa_other; +- sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)", isa); +- } +- +- /* Add flag options. */ +- for (i = 0; i < ARRAY_SIZE (flag_opts); i++) +- { +- if ((flags & flag_opts[i].mask) != 0) +- { +- opts[num++][0] = flag_opts[i].option; +- flags &= ~ flag_opts[i].mask; +- } +- } +- +- if (flags && add_nl_p) +- { +- opts[num++][0] = flags_other; +- sprintf (flags_other, "(other flags: %#x)", flags); +- } +- +- /* Add additional flag options. 
*/ +- for (i = 0; i < ARRAY_SIZE (flag2_opts); i++) +- { +- if ((flags2 & flag2_opts[i].mask) != 0) +- { +- opts[num++][0] = flag2_opts[i].option; +- flags2 &= ~ flag2_opts[i].mask; +- } +- } +- +- if (flags2 && add_nl_p) +- { +- opts[num++][0] = flags2_other; +- sprintf (flags2_other, "(other flags2: %#x)", flags2); +- } +- +- /* Add -fpmath= option. */ +- if (fpmath) +- { +- opts[num][0] = "-mfpmath="; +- switch ((int) fpmath) +- { +- case FPMATH_387: +- opts[num++][1] = "387"; +- break; ++ return TYPE_MODE (type); ++ } ++ else if (((size == 8 && TARGET_64BIT) || size == 16) ++ && !TARGET_SSE ++ && !TARGET_IAMCU) ++ { ++ static bool warnedsse; ++ static bool warnedsse_ret; + +- case FPMATH_SSE: +- opts[num++][1] = "sse"; +- break; ++ if (cum && cum->warn_sse && !warnedsse) ++ { ++ if (warning (OPT_Wpsabi, "SSE vector argument " ++ "without SSE enabled changes the ABI")) ++ warnedsse = true; ++ } ++ else if (!TARGET_64BIT && in_return && !warnedsse_ret) ++ { ++ if (warning (OPT_Wpsabi, "SSE vector return " ++ "without SSE enabled changes the ABI")) ++ warnedsse_ret = true; ++ } ++ } ++ else if ((size == 8 && !TARGET_64BIT) ++ && (!cfun ++ || cfun->machine->func_type == TYPE_NORMAL) ++ && !TARGET_MMX ++ && !TARGET_IAMCU) ++ { ++ static bool warnedmmx; ++ static bool warnedmmx_ret; + +- case FPMATH_387 | FPMATH_SSE: +- opts[num++][1] = "sse+387"; +- break; ++ if (cum && cum->warn_mmx && !warnedmmx) ++ { ++ if (warning (OPT_Wpsabi, "MMX vector argument " ++ "without MMX enabled changes the ABI")) ++ warnedmmx = true; ++ } ++ else if (in_return && !warnedmmx_ret) ++ { ++ if (warning (OPT_Wpsabi, "MMX vector return " ++ "without MMX enabled changes the ABI")) ++ warnedmmx_ret = true; ++ } ++ } ++ return mode; ++ } + +- default: + gcc_unreachable (); + } + } + +- /* Any options? */ +- if (num == 0) +- return NULL; +- +- gcc_assert (num < ARRAY_SIZE (opts)); +- +- /* Size the string. */ +- len = 0; +- sep_len = (add_nl_p) ? 3 : 1; +- for (i = 0; i < num; i++) +- { +- len += sep_len; +- for (j = 0; j < 2; j++) +- if (opts[i][j]) +- len += strlen (opts[i][j]); +- } +- +- /* Build the string. */ +- ret = ptr = (char *) xmalloc (len); +- line_len = 0; +- +- for (i = 0; i < num; i++) +- { +- size_t len2[2]; +- +- for (j = 0; j < 2; j++) +- len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0; +- +- if (i != 0) +- { +- *ptr++ = ' '; +- line_len++; +- +- if (add_nl_p && line_len + len2[0] + len2[1] > 70) +- { +- *ptr++ = '\\'; +- *ptr++ = '\n'; +- line_len = 0; +- } +- } +- +- for (j = 0; j < 2; j++) +- if (opts[i][j]) +- { +- memcpy (ptr, opts[i][j], len2[j]); +- ptr += len2[j]; +- line_len += len2[j]; +- } +- } +- +- *ptr = '\0'; +- gcc_assert (ret + len >= ptr); +- +- return ret; ++ return mode; + } + +-/* Return true, if profiling code should be emitted before +- prologue. Otherwise it returns false. +- Note: For x86 with "hotfix" it is sorried. */ +-static bool +-ix86_profile_before_prologue (void) +-{ +- return flag_fentry != 0; +-} ++/* We want to pass a value in REGNO whose "natural" mode is MODE. However, ++ this may not agree with the mode that the type system has chosen for the ++ register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can ++ go ahead and use it. Otherwise we have to build a PARALLEL instead. */ + +-/* Function that is callable from the debugger to print the current +- options. 
*/ +-void ATTRIBUTE_UNUSED +-ix86_debug_options (void) ++static rtx ++gen_reg_or_parallel (machine_mode mode, machine_mode orig_mode, ++ unsigned int regno) + { +- char *opts = ix86_target_string (ix86_isa_flags, ix86_isa_flags2, +- target_flags, ix86_target_flags, +- ix86_arch_string,ix86_tune_string, +- ix86_fpmath, true, true); ++ rtx tmp; + +- if (opts) ++ if (orig_mode != BLKmode) ++ tmp = gen_rtx_REG (orig_mode, regno); ++ else + { +- fprintf (stderr, "%s\n\n", opts); +- free (opts); ++ tmp = gen_rtx_REG (mode, regno); ++ tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx); ++ tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp)); + } +- else +- fputs ("\n\n", stderr); + +- return; ++ return tmp; + } + +-static const char *stringop_alg_names[] = { +-#define DEF_ENUM +-#define DEF_ALG(alg, name) #name, +-#include "stringop.def" +-#undef DEF_ENUM +-#undef DEF_ALG +-}; ++/* x86-64 register passing implementation. See x86-64 ABI for details. Goal ++ of this code is to classify each 8bytes of incoming argument by the register ++ class and assign registers accordingly. */ + +-/* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=. +- The string is of the following form (or comma separated list of it): ++/* Return the union class of CLASS1 and CLASS2. ++ See the x86-64 PS ABI for details. */ + +- strategy_alg:max_size:[align|noalign] ++static enum x86_64_reg_class ++merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2) ++{ ++ /* Rule #1: If both classes are equal, this is the resulting class. */ ++ if (class1 == class2) ++ return class1; + +- where the full size range for the strategy is either [0, max_size] or +- [min_size, max_size], in which min_size is the max_size + 1 of the +- preceding range. The last size range must have max_size == -1. ++ /* Rule #2: If one of the classes is NO_CLASS, the resulting class is ++ the other class. */ ++ if (class1 == X86_64_NO_CLASS) ++ return class2; ++ if (class2 == X86_64_NO_CLASS) ++ return class1; + +- Examples: ++ /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */ ++ if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS) ++ return X86_64_MEMORY_CLASS; + +- 1. +- -mmemcpy-strategy=libcall:-1:noalign ++ /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */ ++ if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS) ++ || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS)) ++ return X86_64_INTEGERSI_CLASS; ++ if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS ++ || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS) ++ return X86_64_INTEGER_CLASS; + +- this is equivalent to (for known size memcpy) -mstringop-strategy=libcall ++ /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class, ++ MEMORY is used. */ ++ if (class1 == X86_64_X87_CLASS ++ || class1 == X86_64_X87UP_CLASS ++ || class1 == X86_64_COMPLEX_X87_CLASS ++ || class2 == X86_64_X87_CLASS ++ || class2 == X86_64_X87UP_CLASS ++ || class2 == X86_64_COMPLEX_X87_CLASS) ++ return X86_64_MEMORY_CLASS; + ++ /* Rule #6: Otherwise class SSE is used. */ ++ return X86_64_SSE_CLASS; ++} + +- 2. +- -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign ++/* Classify the argument of type TYPE and mode MODE. ++ CLASSES will be filled by the register class used to pass each word ++ of the operand. The number of words is returned. In case the parameter ++ should be passed in memory, 0 is returned. 
As a special case for zero ++ sized containers, classes[0] will be NO_CLASS and 1 is returned. + +- This is to tell the compiler to use the following strategy for memset +- 1) when the expected size is between [1, 16], use rep_8byte strategy; +- 2) when the size is between [17, 2048], use vector_loop; +- 3) when the size is > 2048, use libcall. */ ++ BIT_OFFSET is used internally for handling records and specifies offset ++ of the offset in bits modulo 512 to avoid overflow cases. + +-struct stringop_size_range +-{ +- int max; +- stringop_alg alg; +- bool noalign; +-}; ++ See the x86-64 PS ABI for details. ++*/ + +-static void +-ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset) ++static int ++classify_argument (machine_mode mode, const_tree type, ++ enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset) + { +- const struct stringop_algs *default_algs; +- stringop_size_range input_ranges[MAX_STRINGOP_ALGS]; +- char *curr_range_str, *next_range_str; +- const char *opt = is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="; +- int i = 0, n = 0; +- +- if (is_memset) +- default_algs = &ix86_cost->memset[TARGET_64BIT != 0]; +- else +- default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0]; ++ HOST_WIDE_INT bytes ++ = mode == BLKmode ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode); ++ int words = CEIL (bytes + (bit_offset % 64) / 8, UNITS_PER_WORD); + +- curr_range_str = strategy_str; ++ /* Variable sized entities are always passed/returned in memory. */ ++ if (bytes < 0) ++ return 0; + +- do ++ if (mode != VOIDmode) + { +- int maxs; +- char alg_name[128]; +- char align[16]; +- next_range_str = strchr (curr_range_str, ','); +- if (next_range_str) +- *next_range_str++ = '\0'; +- +- if (sscanf (curr_range_str, "%20[^:]:%d:%10s", alg_name, &maxs, +- align) != 3) +- { +- error ("wrong argument %qs to option %qs", curr_range_str, opt); +- return; +- } ++ /* The value of "named" doesn't matter. */ ++ function_arg_info arg (const_cast (type), mode, /*named=*/true); ++ if (targetm.calls.must_pass_in_stack (arg)) ++ return 0; ++ } + +- if (n > 0 && (maxs < (input_ranges[n - 1].max + 1) && maxs != -1)) +- { +- error ("size ranges of option %qs should be increasing", opt); +- return; +- } ++ if (type && AGGREGATE_TYPE_P (type)) ++ { ++ int i; ++ tree field; ++ enum x86_64_reg_class subclasses[MAX_CLASSES]; + +- for (i = 0; i < last_alg; i++) +- if (!strcmp (alg_name, stringop_alg_names[i])) +- break; ++ /* On x86-64 we pass structures larger than 64 bytes on the stack. */ ++ if (bytes > 64) ++ return 0; + +- if (i == last_alg) +- { +- error ("wrong strategy name %qs specified for option %qs", +- alg_name, opt); +- +- auto_vec candidates; +- for (i = 0; i < last_alg; i++) +- if ((stringop_alg) i != rep_prefix_8_byte || TARGET_64BIT) +- candidates.safe_push (stringop_alg_names[i]); +- +- char *s; +- const char *hint +- = candidates_list_and_hint (alg_name, s, candidates); +- if (hint) +- inform (input_location, +- "valid arguments to %qs are: %s; did you mean %qs?", +- opt, s, hint); +- else +- inform (input_location, "valid arguments to %qs are: %s", +- opt, s); +- XDELETEVEC (s); +- return; +- } ++ for (i = 0; i < words; i++) ++ classes[i] = X86_64_NO_CLASS; + +- if ((stringop_alg) i == rep_prefix_8_byte +- && !TARGET_64BIT) ++ /* Zero sized arrays or structures are NO_CLASS. We return 0 to ++ signalize memory class, so handle it as special case. */ ++ if (!words) + { +- /* rep; movq isn't available in 32-bit code. 
*/ +- error ("strategy name %qs specified for option %qs " +- "not supported for 32-bit code", alg_name, opt); +- return; ++ classes[0] = X86_64_NO_CLASS; ++ return 1; + } + +- input_ranges[n].max = maxs; +- input_ranges[n].alg = (stringop_alg) i; +- if (!strcmp (align, "align")) +- input_ranges[n].noalign = false; +- else if (!strcmp (align, "noalign")) +- input_ranges[n].noalign = true; +- else +- { +- error ("unknown alignment %qs specified for option %qs", align, opt); +- return; +- } +- n++; +- curr_range_str = next_range_str; +- } +- while (curr_range_str); +- +- if (input_ranges[n - 1].max != -1) +- { +- error ("the max value for the last size range should be -1" +- " for option %qs", opt); +- return; +- } ++ /* Classify each field of record and merge classes. */ ++ switch (TREE_CODE (type)) ++ { ++ case RECORD_TYPE: ++ /* And now merge the fields of structure. */ ++ for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field)) ++ { ++ if (TREE_CODE (field) == FIELD_DECL) ++ { ++ int num; + +- if (n > MAX_STRINGOP_ALGS) +- { +- error ("too many size ranges specified in option %qs", opt); +- return; +- } ++ if (TREE_TYPE (field) == error_mark_node) ++ continue; + +- /* Now override the default algs array. */ +- for (i = 0; i < n; i++) +- { +- *const_cast(&default_algs->size[i].max) = input_ranges[i].max; +- *const_cast(&default_algs->size[i].alg) +- = input_ranges[i].alg; +- *const_cast(&default_algs->size[i].noalign) +- = input_ranges[i].noalign; +- } +-} +- +- +-/* parse -mtune-ctrl= option. When DUMP is true, +- print the features that are explicitly set. */ +- +-static void +-parse_mtune_ctrl_str (bool dump) +-{ +- if (!ix86_tune_ctrl_string) +- return; +- +- char *next_feature_string = NULL; +- char *curr_feature_string = xstrdup (ix86_tune_ctrl_string); +- char *orig = curr_feature_string; +- int i; +- do +- { +- bool clear = false; +- +- next_feature_string = strchr (curr_feature_string, ','); +- if (next_feature_string) +- *next_feature_string++ = '\0'; +- if (*curr_feature_string == '^') +- { +- curr_feature_string++; +- clear = true; +- } +- for (i = 0; i < X86_TUNE_LAST; i++) +- { +- if (!strcmp (curr_feature_string, ix86_tune_feature_names[i])) +- { +- ix86_tune_features[i] = !clear; +- if (dump) +- fprintf (stderr, "Explicitly %s feature %s\n", +- clear ? "clear" : "set", ix86_tune_feature_names[i]); +- break; +- } +- } +- if (i == X86_TUNE_LAST) +- error ("unknown parameter to option %<-mtune-ctrl%>: %s", +- clear ? curr_feature_string - 1 : curr_feature_string); +- curr_feature_string = next_feature_string; +- } +- while (curr_feature_string); +- free (orig); +-} +- +-/* Helper function to set ix86_tune_features. IX86_TUNE is the +- processor type. */ +- +-static void +-set_ix86_tune_features (enum processor_type ix86_tune, bool dump) +-{ +- unsigned HOST_WIDE_INT ix86_tune_mask = HOST_WIDE_INT_1U << ix86_tune; +- int i; +- +- for (i = 0; i < X86_TUNE_LAST; ++i) +- { +- if (ix86_tune_no_default) +- ix86_tune_features[i] = 0; +- else +- ix86_tune_features[i] +- = !!(initial_ix86_tune_features[i] & ix86_tune_mask); +- } +- +- if (dump) +- { +- fprintf (stderr, "List of x86 specific tuning parameter names:\n"); +- for (i = 0; i < X86_TUNE_LAST; i++) +- fprintf (stderr, "%s : %s\n", ix86_tune_feature_names[i], +- ix86_tune_features[i] ? "on" : "off"); +- } +- +- parse_mtune_ctrl_str (dump); +-} +- +- +-/* Default align_* from the processor table. */ ++ /* Bitfields are always classified as integer. 
Handle them ++ early, since later code would consider them to be ++ misaligned integers. */ ++ if (DECL_BIT_FIELD (field)) ++ { ++ for (i = (int_bit_position (field) ++ + (bit_offset % 64)) / 8 / 8; ++ i < ((int_bit_position (field) + (bit_offset % 64)) ++ + tree_to_shwi (DECL_SIZE (field)) ++ + 63) / 8 / 8; i++) ++ classes[i] ++ = merge_classes (X86_64_INTEGER_CLASS, classes[i]); ++ } ++ else ++ { ++ int pos; + +-static void +-ix86_default_align (struct gcc_options *opts) +-{ +- /* -falign-foo without argument: supply one. */ +- if (opts->x_flag_align_loops && !opts->x_str_align_loops) +- opts->x_str_align_loops = processor_cost_table[ix86_tune]->align_loop; +- if (opts->x_flag_align_jumps && !opts->x_str_align_jumps) +- opts->x_str_align_jumps = processor_cost_table[ix86_tune]->align_jump; +- if (opts->x_flag_align_labels && !opts->x_str_align_labels) +- opts->x_str_align_labels = processor_cost_table[ix86_tune]->align_label; +- if (opts->x_flag_align_functions && !opts->x_str_align_functions) +- opts->x_str_align_functions = processor_cost_table[ix86_tune]->align_func; +-} ++ type = TREE_TYPE (field); + +-/* Implement TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE hook. */ ++ /* Flexible array member is ignored. */ ++ if (TYPE_MODE (type) == BLKmode ++ && TREE_CODE (type) == ARRAY_TYPE ++ && TYPE_SIZE (type) == NULL_TREE ++ && TYPE_DOMAIN (type) != NULL_TREE ++ && (TYPE_MAX_VALUE (TYPE_DOMAIN (type)) ++ == NULL_TREE)) ++ { ++ static bool warned; + +-static void +-ix86_override_options_after_change (void) +-{ +- ix86_default_align (&global_options); +-} ++ if (!warned && warn_psabi) ++ { ++ warned = true; ++ inform (input_location, ++ "the ABI of passing struct with" ++ " a flexible array member has" ++ " changed in GCC 4.4"); ++ } ++ continue; ++ } ++ num = classify_argument (TYPE_MODE (type), type, ++ subclasses, ++ (int_bit_position (field) ++ + bit_offset) % 512); ++ if (!num) ++ return 0; ++ pos = (int_bit_position (field) ++ + (bit_offset % 64)) / 8 / 8; ++ for (i = 0; i < num && (i + pos) < words; i++) ++ classes[i + pos] ++ = merge_classes (subclasses[i], classes[i + pos]); ++ } ++ } ++ } ++ break; + ++ case ARRAY_TYPE: ++ /* Arrays are handled as small records. */ ++ { ++ int num; ++ num = classify_argument (TYPE_MODE (TREE_TYPE (type)), ++ TREE_TYPE (type), subclasses, bit_offset); ++ if (!num) ++ return 0; + ++ /* The partial classes are now full classes. */ ++ if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4) ++ subclasses[0] = X86_64_SSE_CLASS; ++ if (subclasses[0] == X86_64_INTEGERSI_CLASS ++ && !((bit_offset % 64) == 0 && bytes == 4)) ++ subclasses[0] = X86_64_INTEGER_CLASS; + +-/* Override various settings based on options. If MAIN_ARGS_P, the +- options are from the command line, otherwise they are from +- attributes. Return true if there's an error related to march +- option. */ ++ for (i = 0; i < words; i++) ++ classes[i] = subclasses[i % num]; + +-static bool +-ix86_option_override_internal (bool main_args_p, +- struct gcc_options *opts, +- struct gcc_options *opts_set) +-{ +- int i; +- unsigned HOST_WIDE_INT ix86_arch_mask; +- const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL); ++ break; ++ } ++ case UNION_TYPE: ++ case QUAL_UNION_TYPE: ++ /* Unions are similar to RECORD_TYPE but offset is always 0. ++ */ ++ for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field)) ++ { ++ if (TREE_CODE (field) == FIELD_DECL) ++ { ++ int num; + +- /* -mrecip options. 
*/ +- static struct +- { +- const char *string; /* option name */ +- unsigned int mask; /* mask bits to set */ +- } +- const recip_options[] = +- { +- { "all", RECIP_MASK_ALL }, +- { "none", RECIP_MASK_NONE }, +- { "div", RECIP_MASK_DIV }, +- { "sqrt", RECIP_MASK_SQRT }, +- { "vec-div", RECIP_MASK_VEC_DIV }, +- { "vec-sqrt", RECIP_MASK_VEC_SQRT }, +- }; ++ if (TREE_TYPE (field) == error_mark_node) ++ continue; + ++ num = classify_argument (TYPE_MODE (TREE_TYPE (field)), ++ TREE_TYPE (field), subclasses, ++ bit_offset); ++ if (!num) ++ return 0; ++ for (i = 0; i < num && i < words; i++) ++ classes[i] = merge_classes (subclasses[i], classes[i]); ++ } ++ } ++ break; + +- /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if +- TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */ +- if (TARGET_64BIT_DEFAULT && !TARGET_64BIT_P (opts->x_ix86_isa_flags)) +- opts->x_ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32); +-#ifdef TARGET_BI_ARCH +- else +- { +-#if TARGET_BI_ARCH == 1 +- /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64 +- is on and OPTION_MASK_ABI_X32 is off. We turn off +- OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by +- -mx32. */ +- if (TARGET_X32_P (opts->x_ix86_isa_flags)) +- opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64; +-#else +- /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is +- on and OPTION_MASK_ABI_64 is off. We turn off +- OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by +- -m64 or OPTION_MASK_CODE16 is turned on by -m16. */ +- if (TARGET_LP64_P (opts->x_ix86_isa_flags) +- || TARGET_16BIT_P (opts->x_ix86_isa_flags)) +- opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32; +-#endif +- if (TARGET_64BIT_P (opts->x_ix86_isa_flags) +- && TARGET_IAMCU_P (opts->x_target_flags)) +- sorry ("Intel MCU psABI isn%'t supported in %s mode", +- TARGET_X32_P (opts->x_ix86_isa_flags) ? "x32" : "64-bit"); +- } +-#endif ++ default: ++ gcc_unreachable (); ++ } + +- if (TARGET_X32_P (opts->x_ix86_isa_flags)) +- { +- /* Always turn on OPTION_MASK_ISA_64BIT and turn off +- OPTION_MASK_ABI_64 for TARGET_X32. */ +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT; +- opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64; +- } +- else if (TARGET_16BIT_P (opts->x_ix86_isa_flags)) +- opts->x_ix86_isa_flags &= ~(OPTION_MASK_ISA_64BIT +- | OPTION_MASK_ABI_X32 +- | OPTION_MASK_ABI_64); +- else if (TARGET_LP64_P (opts->x_ix86_isa_flags)) +- { +- /* Always turn on OPTION_MASK_ISA_64BIT and turn off +- OPTION_MASK_ABI_X32 for TARGET_LP64. */ +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT; +- opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32; +- } ++ if (words > 2) ++ { ++ /* When size > 16 bytes, if the first one isn't ++ X86_64_SSE_CLASS or any other ones aren't ++ X86_64_SSEUP_CLASS, everything should be passed in ++ memory. */ ++ if (classes[0] != X86_64_SSE_CLASS) ++ return 0; + +-#ifdef SUBTARGET_OVERRIDE_OPTIONS +- SUBTARGET_OVERRIDE_OPTIONS; +-#endif ++ for (i = 1; i < words; i++) ++ if (classes[i] != X86_64_SSEUP_CLASS) ++ return 0; ++ } + +-#ifdef SUBSUBTARGET_OVERRIDE_OPTIONS +- SUBSUBTARGET_OVERRIDE_OPTIONS; +-#endif ++ /* Final merger cleanup. */ ++ for (i = 0; i < words; i++) ++ { ++ /* If one class is MEMORY, everything should be passed in ++ memory. */ ++ if (classes[i] == X86_64_MEMORY_CLASS) ++ return 0; + +- /* -fPIC is the default for x86_64. 
*/ +- if (TARGET_MACHO && TARGET_64BIT_P (opts->x_ix86_isa_flags)) +- opts->x_flag_pic = 2; ++ /* The X86_64_SSEUP_CLASS should be always preceded by ++ X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */ ++ if (classes[i] == X86_64_SSEUP_CLASS ++ && classes[i - 1] != X86_64_SSE_CLASS ++ && classes[i - 1] != X86_64_SSEUP_CLASS) ++ { ++ /* The first one should never be X86_64_SSEUP_CLASS. */ ++ gcc_assert (i != 0); ++ classes[i] = X86_64_SSE_CLASS; ++ } + +- /* Need to check -mtune=generic first. */ +- if (opts->x_ix86_tune_string) +- { +- /* As special support for cross compilers we read -mtune=native +- as -mtune=generic. With native compilers we won't see the +- -mtune=native, as it was changed by the driver. */ +- if (!strcmp (opts->x_ix86_tune_string, "native")) +- { +- opts->x_ix86_tune_string = "generic"; +- } +- else if (!strcmp (opts->x_ix86_tune_string, "x86-64")) +- warning (OPT_Wdeprecated, +- main_args_p +- ? G_("%<-mtune=x86-64%> is deprecated; use %<-mtune=k8%> " +- "or %<-mtune=generic%> instead as appropriate") +- : G_("% is deprecated; use " +- "% or %" +- " instead as appropriate")); +- } +- else +- { +- if (opts->x_ix86_arch_string) +- opts->x_ix86_tune_string = opts->x_ix86_arch_string; +- if (!opts->x_ix86_tune_string) +- { +- opts->x_ix86_tune_string = processor_names[TARGET_CPU_DEFAULT]; +- ix86_tune_defaulted = 1; +- } ++ /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS, ++ everything should be passed in memory. */ ++ if (classes[i] == X86_64_X87UP_CLASS ++ && (classes[i - 1] != X86_64_X87_CLASS)) ++ { ++ static bool warned; + +- /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string +- or defaulted. We need to use a sensible tune option. */ +- if (!strcmp (opts->x_ix86_tune_string, "x86-64")) +- { +- opts->x_ix86_tune_string = "generic"; ++ /* The first one should never be X86_64_X87UP_CLASS. */ ++ gcc_assert (i != 0); ++ if (!warned && warn_psabi) ++ { ++ warned = true; ++ inform (input_location, ++ "the ABI of passing union with %" ++ " has changed in GCC 4.4"); ++ } ++ return 0; ++ } + } ++ return words; + } + +- if (opts->x_ix86_stringop_alg == rep_prefix_8_byte +- && !TARGET_64BIT_P (opts->x_ix86_isa_flags)) ++ /* Compute alignment needed. We align all types to natural boundaries with ++ exception of XFmode that is aligned to 64bits. */ ++ if (mode != VOIDmode && mode != BLKmode) + { +- /* rep; movq isn't available in 32-bit code. */ +- error ("%<-mstringop-strategy=rep_8byte%> not supported for 32-bit code"); +- opts->x_ix86_stringop_alg = no_stringop; +- } +- +- if (!opts->x_ix86_arch_string) +- opts->x_ix86_arch_string +- = TARGET_64BIT_P (opts->x_ix86_isa_flags) +- ? "x86-64" : SUBTARGET32_DEFAULT_CPU; +- else +- ix86_arch_specified = 1; ++ int mode_alignment = GET_MODE_BITSIZE (mode); + +- if (opts_set->x_ix86_pmode) +- { +- if ((TARGET_LP64_P (opts->x_ix86_isa_flags) +- && opts->x_ix86_pmode == PMODE_SI) +- || (!TARGET_64BIT_P (opts->x_ix86_isa_flags) +- && opts->x_ix86_pmode == PMODE_DI)) +- error ("address mode %qs not supported in the %s bit mode", +- TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "short" : "long", +- TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "64" : "32"); ++ if (mode == XFmode) ++ mode_alignment = 128; ++ else if (mode == XCmode) ++ mode_alignment = 256; ++ if (COMPLEX_MODE_P (mode)) ++ mode_alignment /= 2; ++ /* Misaligned fields are always returned in memory. */ ++ if (bit_offset % mode_alignment) ++ return 0; + } +- else +- opts->x_ix86_pmode = TARGET_LP64_P (opts->x_ix86_isa_flags) +- ? 
PMODE_DI : PMODE_SI; +- +- if (!opts_set->x_ix86_abi) +- opts->x_ix86_abi = DEFAULT_ABI; +- +- if (opts->x_ix86_abi == MS_ABI && TARGET_X32_P (opts->x_ix86_isa_flags)) +- error ("%<-mabi=ms%> not supported with X32 ABI"); +- gcc_assert (opts->x_ix86_abi == SYSV_ABI || opts->x_ix86_abi == MS_ABI); +- +- const char *abi_name = opts->x_ix86_abi == MS_ABI ? "ms" : "sysv"; +- if ((opts->x_flag_sanitize & SANITIZE_USER_ADDRESS) +- && opts->x_ix86_abi != DEFAULT_ABI) +- error ("%<-mabi=%s%> not supported with %<-fsanitize=address%>", abi_name); +- if ((opts->x_flag_sanitize & SANITIZE_KERNEL_ADDRESS) +- && opts->x_ix86_abi != DEFAULT_ABI) +- error ("%<-mabi=%s%> not supported with %<-fsanitize=kernel-address%>", +- abi_name); +- if ((opts->x_flag_sanitize & SANITIZE_THREAD) +- && opts->x_ix86_abi != DEFAULT_ABI) +- error ("%<-mabi=%s%> not supported with %<-fsanitize=thread%>", abi_name); +- +- /* For targets using ms ABI enable ms-extensions, if not +- explicit turned off. For non-ms ABI we turn off this +- option. */ +- if (!opts_set->x_flag_ms_extensions) +- opts->x_flag_ms_extensions = (MS_ABI == DEFAULT_ABI); +- +- if (opts_set->x_ix86_cmodel) +- { +- switch (opts->x_ix86_cmodel) +- { +- case CM_SMALL: +- case CM_SMALL_PIC: +- if (opts->x_flag_pic) +- opts->x_ix86_cmodel = CM_SMALL_PIC; +- if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)) +- error ("code model %qs not supported in the %s bit mode", +- "small", "32"); +- break; + +- case CM_MEDIUM: +- case CM_MEDIUM_PIC: +- if (opts->x_flag_pic) +- opts->x_ix86_cmodel = CM_MEDIUM_PIC; +- if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)) +- error ("code model %qs not supported in the %s bit mode", +- "medium", "32"); +- else if (TARGET_X32_P (opts->x_ix86_isa_flags)) +- error ("code model %qs not supported in x32 mode", +- "medium"); +- break; ++ /* for V1xx modes, just use the base mode */ ++ if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode ++ && GET_MODE_UNIT_SIZE (mode) == bytes) ++ mode = GET_MODE_INNER (mode); + +- case CM_LARGE: +- case CM_LARGE_PIC: +- if (opts->x_flag_pic) +- opts->x_ix86_cmodel = CM_LARGE_PIC; +- if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)) +- error ("code model %qs not supported in the %s bit mode", +- "large", "32"); +- else if (TARGET_X32_P (opts->x_ix86_isa_flags)) +- error ("code model %qs not supported in x32 mode", +- "large"); +- break; ++ /* Classification of atomic types. */ ++ switch (mode) ++ { ++ case E_SDmode: ++ case E_DDmode: ++ classes[0] = X86_64_SSE_CLASS; ++ return 1; ++ case E_TDmode: ++ classes[0] = X86_64_SSE_CLASS; ++ classes[1] = X86_64_SSEUP_CLASS; ++ return 2; ++ case E_DImode: ++ case E_SImode: ++ case E_HImode: ++ case E_QImode: ++ case E_CSImode: ++ case E_CHImode: ++ case E_CQImode: ++ { ++ int size = bit_offset + (int) GET_MODE_BITSIZE (mode); + +- case CM_32: +- if (opts->x_flag_pic) +- error ("code model %s does not support PIC mode", "32"); +- if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) +- error ("code model %qs not supported in the %s bit mode", +- "32", "64"); +- break; ++ /* Analyze last 128 bits only. 
*/ ++ size = (size - 1) & 0x7f; + +- case CM_KERNEL: +- if (opts->x_flag_pic) +- { +- error ("code model %s does not support PIC mode", "kernel"); +- opts->x_ix86_cmodel = CM_32; +- } +- if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)) +- error ("code model %qs not supported in the %s bit mode", +- "kernel", "32"); +- break; +- +- default: +- gcc_unreachable (); +- } +- } +- else +- { +- /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the +- use of rip-relative addressing. This eliminates fixups that +- would otherwise be needed if this object is to be placed in a +- DLL, and is essentially just as efficient as direct addressing. */ +- if (TARGET_64BIT_P (opts->x_ix86_isa_flags) +- && (TARGET_RDOS || TARGET_PECOFF)) +- opts->x_ix86_cmodel = CM_MEDIUM_PIC, opts->x_flag_pic = 1; +- else if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) +- opts->x_ix86_cmodel = opts->x_flag_pic ? CM_SMALL_PIC : CM_SMALL; +- else +- opts->x_ix86_cmodel = CM_32; +- } +- if (TARGET_MACHO && opts->x_ix86_asm_dialect == ASM_INTEL) +- { +- error ("%<-masm=intel%> not supported in this configuration"); +- opts->x_ix86_asm_dialect = ASM_ATT; +- } +- if ((TARGET_64BIT_P (opts->x_ix86_isa_flags) != 0) +- != ((opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0)) +- sorry ("%i-bit mode not compiled in", +- (opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32); +- +- for (i = 0; i < pta_size; i++) +- if (! strcmp (opts->x_ix86_arch_string, processor_alias_table[i].name)) +- { +- if (!strcmp (opts->x_ix86_arch_string, "generic")) ++ if (size < 32) + { +- error (main_args_p +- ? G_("% CPU can be used only for %<-mtune=%> " +- "switch") +- : G_("% CPU can be used only for " +- "% attribute")); +- return false; ++ classes[0] = X86_64_INTEGERSI_CLASS; ++ return 1; + } +- else if (!strcmp (opts->x_ix86_arch_string, "intel")) ++ else if (size < 64) + { +- error (main_args_p +- ? G_("% CPU can be used only for %<-mtune=%> " +- "switch") +- : G_("% CPU can be used only for " +- "% attribute")); +- return false; ++ classes[0] = X86_64_INTEGER_CLASS; ++ return 1; + } +- +- if (TARGET_64BIT_P (opts->x_ix86_isa_flags) +- && !((processor_alias_table[i].flags & PTA_64BIT) != 0)) ++ else if (size < 64+32) + { +- error ("CPU you selected does not support x86-64 " +- "instruction set"); +- return false; ++ classes[0] = X86_64_INTEGER_CLASS; ++ classes[1] = X86_64_INTEGERSI_CLASS; ++ return 2; + } +- +- ix86_schedule = processor_alias_table[i].schedule; +- ix86_arch = processor_alias_table[i].processor; +- /* Default cpu tuning to the architecture. 
*/ +- ix86_tune = ix86_arch; +- +- if (((processor_alias_table[i].flags & PTA_MMX) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX; +- if (((processor_alias_table[i].flags & PTA_3DNOW) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW; +- if (((processor_alias_table[i].flags & PTA_3DNOW_A) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A; +- if (((processor_alias_table[i].flags & PTA_SSE) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE; +- if (((processor_alias_table[i].flags & PTA_SSE2) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2; +- if (((processor_alias_table[i].flags & PTA_SSE3) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3; +- if (((processor_alias_table[i].flags & PTA_SSSE3) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3; +- if (((processor_alias_table[i].flags & PTA_SSE4_1) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1; +- if (((processor_alias_table[i].flags & PTA_SSE4_2) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2; +- if (((processor_alias_table[i].flags & PTA_AVX) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX; +- if (((processor_alias_table[i].flags & PTA_AVX2) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2; +- if (((processor_alias_table[i].flags & PTA_FMA) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA; +- if (((processor_alias_table[i].flags & PTA_SSE4A) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A; +- if (((processor_alias_table[i].flags & PTA_FMA4) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4; +- if (((processor_alias_table[i].flags & PTA_XOP) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP; +- if (((processor_alias_table[i].flags & PTA_LWP) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP; +- if (((processor_alias_table[i].flags & PTA_ABM) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM; +- if (((processor_alias_table[i].flags & PTA_BMI) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI; +- if (((processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LZCNT; +- if (((processor_alias_table[i].flags & PTA_TBM) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM; +- if 
(((processor_alias_table[i].flags & PTA_BMI2) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2; +- if (((processor_alias_table[i].flags & PTA_CX16) != 0) +- && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_CX16)) +- opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_CX16; +- if (((processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT; +- if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags) +- && ((processor_alias_table[i].flags & PTA_NO_SAHF) != 0)) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF; +- if (((processor_alias_table[i].flags & PTA_MOVBE) != 0) +- && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_MOVBE)) +- opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_MOVBE; +- if (((processor_alias_table[i].flags & PTA_AES) != 0) +- && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES)) +- ix86_isa_flags |= OPTION_MASK_ISA_AES; +- if (((processor_alias_table[i].flags & PTA_SHA) != 0) +- && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SHA)) +- ix86_isa_flags |= OPTION_MASK_ISA_SHA; +- if (((processor_alias_table[i].flags & PTA_PCLMUL) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL; +- if (((processor_alias_table[i].flags & PTA_FSGSBASE) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE; +- if (((processor_alias_table[i].flags & PTA_RDRND) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND; +- if (((processor_alias_table[i].flags & PTA_F16C) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C; +- if (((processor_alias_table[i].flags & PTA_RTM) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RTM; +- if (((processor_alias_table[i].flags & PTA_HLE) != 0) +- && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_HLE)) +- opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_HLE; +- if (((processor_alias_table[i].flags & PTA_PRFCHW) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW; +- if (((processor_alias_table[i].flags & PTA_RDSEED) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDSEED; +- if (((processor_alias_table[i].flags & PTA_ADX) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ADX; +- if (((processor_alias_table[i].flags & PTA_FXSR) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FXSR; +- if (((processor_alias_table[i].flags & PTA_XSAVE) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVE; +- if (((processor_alias_table[i].flags & PTA_XSAVEOPT) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT; +- if (((processor_alias_table[i].flags & PTA_AVX512F) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F)) +- opts->x_ix86_isa_flags |= 
OPTION_MASK_ISA_AVX512F; +- if (((processor_alias_table[i].flags & PTA_AVX512ER) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512ER)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER; +- if (((processor_alias_table[i].flags & PTA_AVX512PF) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF; +- if (((processor_alias_table[i].flags & PTA_AVX512CD) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD; +- if (((processor_alias_table[i].flags & PTA_PREFETCHWT1) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PREFETCHWT1)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PREFETCHWT1; +- if (((processor_alias_table[i].flags & PTA_CLWB) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLWB)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLWB; +- if (((processor_alias_table[i].flags & PTA_CLFLUSHOPT) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLFLUSHOPT)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLFLUSHOPT; +- if (((processor_alias_table[i].flags & PTA_CLZERO) != 0) +- && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_CLZERO)) +- opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_CLZERO; +- if (((processor_alias_table[i].flags & PTA_XSAVEC) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEC)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEC; +- if (((processor_alias_table[i].flags & PTA_XSAVES) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVES)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVES; +- if (((processor_alias_table[i].flags & PTA_AVX512DQ) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512DQ)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512DQ; +- if (((processor_alias_table[i].flags & PTA_AVX512BW) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512BW)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BW; +- if (((processor_alias_table[i].flags & PTA_AVX512VL) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VL)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VL; +- if (((processor_alias_table[i].flags & PTA_AVX512VBMI) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VBMI)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI; +- if (((processor_alias_table[i].flags & PTA_AVX512IFMA) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512IFMA)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512IFMA; +- if (((processor_alias_table[i].flags & PTA_AVX512VNNI) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VNNI)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VNNI; +- if (((processor_alias_table[i].flags & PTA_GFNI) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_GFNI)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_GFNI; +- if (((processor_alias_table[i].flags & PTA_AVX512VBMI2) != 0) +- && !(opts->x_ix86_isa_flags_explicit +- & OPTION_MASK_ISA_AVX512VBMI2)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI2; +- if (((processor_alias_table[i].flags & PTA_VPCLMULQDQ) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_VPCLMULQDQ)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_VPCLMULQDQ; +- if (((processor_alias_table[i].flags & PTA_AVX512BITALG) != 0) +- && !(opts->x_ix86_isa_flags_explicit +- & OPTION_MASK_ISA_AVX512BITALG)) +- opts->x_ix86_isa_flags |= 
OPTION_MASK_ISA_AVX512BITALG; +- +- if (((processor_alias_table[i].flags & PTA_AVX5124VNNIW) != 0) +- && !(opts->x_ix86_isa_flags2_explicit +- & OPTION_MASK_ISA_AVX5124VNNIW)) +- opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124VNNIW; +- if (((processor_alias_table[i].flags & PTA_AVX5124FMAPS) != 0) +- && !(opts->x_ix86_isa_flags2_explicit +- & OPTION_MASK_ISA_AVX5124FMAPS)) +- opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124FMAPS; +- if (((processor_alias_table[i].flags & PTA_AVX512VPOPCNTDQ) != 0) +- && !(opts->x_ix86_isa_flags_explicit +- & OPTION_MASK_ISA_AVX512VPOPCNTDQ)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VPOPCNTDQ; +- if (((processor_alias_table[i].flags & PTA_SGX) != 0) +- && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_SGX)) +- opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_SGX; +- if (((processor_alias_table[i].flags & PTA_VAES) != 0) +- && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_VAES)) +- opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_VAES; +- if (((processor_alias_table[i].flags & PTA_RDPID) != 0) +- && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_RDPID)) +- opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_RDPID; +- if (((processor_alias_table[i].flags & PTA_PCONFIG) != 0) +- && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_PCONFIG)) +- opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_PCONFIG; +- if (((processor_alias_table[i].flags & PTA_WBNOINVD) != 0) +- && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_WBNOINVD)) +- opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_WBNOINVD; +- if (((processor_alias_table[i].flags & PTA_PTWRITE) != 0) +- && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_PTWRITE)) +- opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_PTWRITE; +- +- if ((processor_alias_table[i].flags +- & (PTA_PREFETCH_SSE | PTA_SSE)) != 0) +- x86_prefetch_sse = true; +- if (((processor_alias_table[i].flags & PTA_MWAITX) != 0) +- && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_MWAITX)) +- opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_MWAITX; +- if (((processor_alias_table[i].flags & PTA_PKU) != 0) +- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PKU)) +- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PKU; +- +- /* Don't enable x87 instructions if only +- general registers are allowed. */ +- if (!(opts_set->x_ix86_target_flags & OPTION_MASK_GENERAL_REGS_ONLY) +- && !(opts_set->x_target_flags & MASK_80387)) ++ else if (size < 64+64) + { +- if (((processor_alias_table[i].flags & PTA_NO_80387) != 0)) +- opts->x_target_flags &= ~MASK_80387; +- else +- opts->x_target_flags |= MASK_80387; ++ classes[0] = classes[1] = X86_64_INTEGER_CLASS; ++ return 2; + } +- break; ++ else ++ gcc_unreachable (); + } ++ case E_CDImode: ++ case E_TImode: ++ classes[0] = classes[1] = X86_64_INTEGER_CLASS; ++ return 2; ++ case E_COImode: ++ case E_OImode: ++ /* OImode shouldn't be used directly. */ ++ gcc_unreachable (); ++ case E_CTImode: ++ return 0; ++ case E_SFmode: ++ if (!(bit_offset % 64)) ++ classes[0] = X86_64_SSESF_CLASS; ++ else ++ classes[0] = X86_64_SSE_CLASS; ++ return 1; ++ case E_DFmode: ++ classes[0] = X86_64_SSEDF_CLASS; ++ return 1; ++ case E_XFmode: ++ classes[0] = X86_64_X87_CLASS; ++ classes[1] = X86_64_X87UP_CLASS; ++ return 2; ++ case E_TFmode: ++ classes[0] = X86_64_SSE_CLASS; ++ classes[1] = X86_64_SSEUP_CLASS; ++ return 2; ++ case E_SCmode: ++ classes[0] = X86_64_SSE_CLASS; ++ if (!(bit_offset % 64)) ++ return 1; ++ else ++ { ++ static bool warned; + +- if (i == pta_size) +- { +- error (main_args_p +- ? 
G_("bad value (%qs) for %<-march=%> switch") +- : G_("bad value (%qs) for % attribute"), +- opts->x_ix86_arch_string); ++ if (!warned && warn_psabi) ++ { ++ warned = true; ++ inform (input_location, ++ "the ABI of passing structure with %" ++ " member has changed in GCC 4.4"); ++ } ++ classes[1] = X86_64_SSESF_CLASS; ++ return 2; ++ } ++ case E_DCmode: ++ classes[0] = X86_64_SSEDF_CLASS; ++ classes[1] = X86_64_SSEDF_CLASS; ++ return 2; ++ case E_XCmode: ++ classes[0] = X86_64_COMPLEX_X87_CLASS; ++ return 1; ++ case E_TCmode: ++ /* This modes is larger than 16 bytes. */ ++ return 0; ++ case E_V8SFmode: ++ case E_V8SImode: ++ case E_V32QImode: ++ case E_V16HImode: ++ case E_V4DFmode: ++ case E_V4DImode: ++ classes[0] = X86_64_SSE_CLASS; ++ classes[1] = X86_64_SSEUP_CLASS; ++ classes[2] = X86_64_SSEUP_CLASS; ++ classes[3] = X86_64_SSEUP_CLASS; ++ return 4; ++ case E_V8DFmode: ++ case E_V16SFmode: ++ case E_V8DImode: ++ case E_V16SImode: ++ case E_V32HImode: ++ case E_V64QImode: ++ classes[0] = X86_64_SSE_CLASS; ++ classes[1] = X86_64_SSEUP_CLASS; ++ classes[2] = X86_64_SSEUP_CLASS; ++ classes[3] = X86_64_SSEUP_CLASS; ++ classes[4] = X86_64_SSEUP_CLASS; ++ classes[5] = X86_64_SSEUP_CLASS; ++ classes[6] = X86_64_SSEUP_CLASS; ++ classes[7] = X86_64_SSEUP_CLASS; ++ return 8; ++ case E_V4SFmode: ++ case E_V4SImode: ++ case E_V16QImode: ++ case E_V8HImode: ++ case E_V2DFmode: ++ case E_V2DImode: ++ classes[0] = X86_64_SSE_CLASS; ++ classes[1] = X86_64_SSEUP_CLASS; ++ return 2; ++ case E_V1TImode: ++ case E_V1DImode: ++ case E_V2SFmode: ++ case E_V2SImode: ++ case E_V4HImode: ++ case E_V8QImode: ++ classes[0] = X86_64_SSE_CLASS; ++ return 1; ++ case E_BLKmode: ++ case E_VOIDmode: ++ return 0; ++ default: ++ gcc_assert (VECTOR_MODE_P (mode)); + +- auto_vec candidates; +- for (i = 0; i < pta_size; i++) +- if (strcmp (processor_alias_table[i].name, "generic") +- && strcmp (processor_alias_table[i].name, "intel") +- && (!TARGET_64BIT_P (opts->x_ix86_isa_flags) +- || ((processor_alias_table[i].flags & PTA_64BIT) != 0))) +- candidates.safe_push (processor_alias_table[i].name); ++ if (bytes > 16) ++ return 0; + +-#ifdef HAVE_LOCAL_CPU_DETECT +- /* Add also "native" as possible value. */ +- candidates.safe_push ("native"); +-#endif ++ gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT); + +- char *s; +- const char *hint +- = candidates_list_and_hint (opts->x_ix86_arch_string, s, candidates); +- if (hint) +- inform (input_location, +- main_args_p +- ? G_("valid arguments to %<-march=%> switch are: " +- "%s; did you mean %qs?") +- : G_("valid arguments to % attribute are: " +- "%s; did you mean %qs?"), s, hint); ++ if (bit_offset + GET_MODE_BITSIZE (mode) <= 32) ++ classes[0] = X86_64_INTEGERSI_CLASS; + else +- inform (input_location, +- main_args_p +- ? G_("valid arguments to %<-march=%> switch are: %s") +- : G_("valid arguments to % attribute " +- "are: %s"), s); +- XDELETEVEC (s); ++ classes[0] = X86_64_INTEGER_CLASS; ++ classes[1] = X86_64_INTEGER_CLASS; ++ return 1 + (bytes > 8); + } ++} ++ ++/* Examine the argument and return set number of register required in each ++ class. Return true iff parameter should be passed in memory. 
*/ ++ ++static bool ++examine_argument (machine_mode mode, const_tree type, int in_return, ++ int *int_nregs, int *sse_nregs) ++{ ++ enum x86_64_reg_class regclass[MAX_CLASSES]; ++ int n = classify_argument (mode, type, regclass, 0); + +- ix86_arch_mask = HOST_WIDE_INT_1U << ix86_arch; +- for (i = 0; i < X86_ARCH_LAST; ++i) +- ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask); ++ *int_nregs = 0; ++ *sse_nregs = 0; + +- for (i = 0; i < pta_size; i++) +- if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name)) ++ if (!n) ++ return true; ++ for (n--; n >= 0; n--) ++ switch (regclass[n]) + { +- ix86_schedule = processor_alias_table[i].schedule; +- ix86_tune = processor_alias_table[i].processor; +- if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) +- { +- if (!((processor_alias_table[i].flags & PTA_64BIT) != 0)) +- { +- if (ix86_tune_defaulted) +- { +- opts->x_ix86_tune_string = "x86-64"; +- for (i = 0; i < pta_size; i++) +- if (! strcmp (opts->x_ix86_tune_string, +- processor_alias_table[i].name)) +- break; +- ix86_schedule = processor_alias_table[i].schedule; +- ix86_tune = processor_alias_table[i].processor; +- } +- else +- error ("CPU you selected does not support x86-64 " +- "instruction set"); +- } +- } +- /* Intel CPUs have always interpreted SSE prefetch instructions as +- NOPs; so, we can enable SSE prefetch instructions even when +- -mtune (rather than -march) points us to a processor that has them. +- However, the VIA C3 gives a SIGILL, so we only do that for i686 and +- higher processors. */ +- if (TARGET_CMOV +- && ((processor_alias_table[i].flags +- & (PTA_PREFETCH_SSE | PTA_SSE)) != 0)) +- x86_prefetch_sse = true; ++ case X86_64_INTEGER_CLASS: ++ case X86_64_INTEGERSI_CLASS: ++ (*int_nregs)++; + break; ++ case X86_64_SSE_CLASS: ++ case X86_64_SSESF_CLASS: ++ case X86_64_SSEDF_CLASS: ++ (*sse_nregs)++; ++ break; ++ case X86_64_NO_CLASS: ++ case X86_64_SSEUP_CLASS: ++ break; ++ case X86_64_X87_CLASS: ++ case X86_64_X87UP_CLASS: ++ case X86_64_COMPLEX_X87_CLASS: ++ if (!in_return) ++ return true; ++ break; ++ case X86_64_MEMORY_CLASS: ++ gcc_unreachable (); + } + +- if (ix86_tune_specified && i == pta_size) +- { +- error (main_args_p +- ? G_("bad value (%qs) for %<-mtune=%> switch") +- : G_("bad value (%qs) for % attribute"), +- opts->x_ix86_tune_string); +- +- auto_vec candidates; +- for (i = 0; i < pta_size; i++) +- if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) +- || ((processor_alias_table[i].flags & PTA_64BIT) != 0)) +- candidates.safe_push (processor_alias_table[i].name); +- +-#ifdef HAVE_LOCAL_CPU_DETECT +- /* Add also "native" as possible value. */ +- candidates.safe_push ("native"); +-#endif ++ return false; ++} + +- char *s; +- const char *hint +- = candidates_list_and_hint (opts->x_ix86_tune_string, s, candidates); +- if (hint) +- inform (input_location, +- main_args_p +- ? G_("valid arguments to %<-mtune=%> switch are: " +- "%s; did you mean %qs?") +- : G_("valid arguments to % attribute are: " +- "%s; did you mean %qs?"), s, hint); +- else +- inform (input_location, +- main_args_p +- ? G_("valid arguments to %<-mtune=%> switch are: %s") +- : G_("valid arguments to % attribute " +- "are: %s"), s); +- XDELETEVEC (s); +- } ++/* Construct container for the argument used by GCC interface. See ++ FUNCTION_ARG for the detailed description. 
*/ + +- set_ix86_tune_features (ix86_tune, opts->x_ix86_dump_tunes); ++static rtx ++construct_container (machine_mode mode, machine_mode orig_mode, ++ const_tree type, int in_return, int nintregs, int nsseregs, ++ const int *intreg, int sse_regno) ++{ ++ /* The following variables hold the static issued_error state. */ ++ static bool issued_sse_arg_error; ++ static bool issued_sse_ret_error; ++ static bool issued_x87_ret_error; + +-#ifndef USE_IX86_FRAME_POINTER +-#define USE_IX86_FRAME_POINTER 0 +-#endif ++ machine_mode tmpmode; ++ int bytes ++ = mode == BLKmode ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode); ++ enum x86_64_reg_class regclass[MAX_CLASSES]; ++ int n; ++ int i; ++ int nexps = 0; ++ int needed_sseregs, needed_intregs; ++ rtx exp[MAX_CLASSES]; ++ rtx ret; + +-#ifndef USE_X86_64_FRAME_POINTER +-#define USE_X86_64_FRAME_POINTER 0 +-#endif ++ n = classify_argument (mode, type, regclass, 0); ++ if (!n) ++ return NULL; ++ if (examine_argument (mode, type, in_return, &needed_intregs, ++ &needed_sseregs)) ++ return NULL; ++ if (needed_intregs > nintregs || needed_sseregs > nsseregs) ++ return NULL; + +- /* Set the default values for switches whose default depends on TARGET_64BIT +- in case they weren't overwritten by command line options. */ +- if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) +- { +- if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer) +- opts->x_flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER; +- if (opts->x_flag_asynchronous_unwind_tables +- && !opts_set->x_flag_unwind_tables +- && TARGET_64BIT_MS_ABI) +- opts->x_flag_unwind_tables = 1; +- if (opts->x_flag_asynchronous_unwind_tables == 2) +- opts->x_flag_unwind_tables +- = opts->x_flag_asynchronous_unwind_tables = 1; +- if (opts->x_flag_pcc_struct_return == 2) +- opts->x_flag_pcc_struct_return = 0; +- } +- else ++ /* We allowed the user to turn off SSE for kernel mode. Don't crash if ++ some less clueful developer tries to use floating-point anyway. */ ++ if (needed_sseregs && !TARGET_SSE) + { +- if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer) +- opts->x_flag_omit_frame_pointer +- = !(USE_IX86_FRAME_POINTER || opts->x_optimize_size); +- if (opts->x_flag_asynchronous_unwind_tables == 2) +- opts->x_flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER; +- if (opts->x_flag_pcc_struct_return == 2) +- { +- /* Intel MCU psABI specifies that -freg-struct-return should +- be on. Instead of setting DEFAULT_PCC_STRUCT_RETURN to 1, +- we check -miamcu so that -freg-struct-return is always +- turned on if -miamcu is used. */ +- if (TARGET_IAMCU_P (opts->x_target_flags)) +- opts->x_flag_pcc_struct_return = 0; +- else +- opts->x_flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN; ++ if (in_return) ++ { ++ if (!issued_sse_ret_error) ++ { ++ error ("SSE register return with SSE disabled"); ++ issued_sse_ret_error = true; ++ } + } +- } +- +- ix86_tune_cost = processor_cost_table[ix86_tune]; +- /* TODO: ix86_cost should be chosen at instruction or function granuality +- so for cold code we use size_cost even in !optimize_size compilation. */ +- if (opts->x_optimize_size) +- ix86_cost = &ix86_size_cost; +- else +- ix86_cost = ix86_tune_cost; +- +- /* Arrange to set up i386_stack_locals for all functions. */ +- init_machine_status = ix86_init_machine_status; +- +- /* Validate -mregparm= value. 
*/ +- if (opts_set->x_ix86_regparm) +- { +- if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) +- warning (0, "%<-mregparm%> is ignored in 64-bit mode"); +- else if (TARGET_IAMCU_P (opts->x_target_flags)) +- warning (0, "%<-mregparm%> is ignored for Intel MCU psABI"); +- if (opts->x_ix86_regparm > REGPARM_MAX) ++ else if (!issued_sse_arg_error) + { +- error ("%<-mregparm=%d%> is not between 0 and %d", +- opts->x_ix86_regparm, REGPARM_MAX); +- opts->x_ix86_regparm = 0; ++ error ("SSE register argument with SSE disabled"); ++ issued_sse_arg_error = true; + } ++ return NULL; + } +- if (TARGET_IAMCU_P (opts->x_target_flags) +- || TARGET_64BIT_P (opts->x_ix86_isa_flags)) +- opts->x_ix86_regparm = REGPARM_MAX; +- +- /* Default align_* from the processor table. */ +- ix86_default_align (opts); +- +- /* Provide default for -mbranch-cost= value. */ +- if (!opts_set->x_ix86_branch_cost) +- opts->x_ix86_branch_cost = ix86_tune_cost->branch_cost; + +- if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) +- { +- opts->x_target_flags +- |= TARGET_SUBTARGET64_DEFAULT & ~opts_set->x_target_flags; +- +- if (!ix86_arch_specified) +- opts->x_ix86_isa_flags +- |= TARGET_SUBTARGET64_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit; +- +- if (TARGET_RTD_P (opts->x_target_flags)) +- warning (0, +- main_args_p +- ? G_("%<-mrtd%> is ignored in 64bit mode") +- : G_("% is ignored in 64bit mode")); +- } +- else +- { +- opts->x_target_flags +- |= TARGET_SUBTARGET32_DEFAULT & ~opts_set->x_target_flags; +- +- if (!ix86_arch_specified) +- opts->x_ix86_isa_flags +- |= TARGET_SUBTARGET32_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit; +- +- /* i386 ABI does not specify red zone. It still makes sense to use it +- when programmer takes care to stack from being destroyed. */ +- if (!(opts_set->x_target_flags & MASK_NO_RED_ZONE)) +- opts->x_target_flags |= MASK_NO_RED_ZONE; +- } +- +- /* Keep nonleaf frame pointers. */ +- if (opts->x_flag_omit_frame_pointer) +- opts->x_target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER; +- else if (TARGET_OMIT_LEAF_FRAME_POINTER_P (opts->x_target_flags)) +- opts->x_flag_omit_frame_pointer = 1; +- +- /* If we're doing fast math, we don't care about comparison order +- wrt NaNs. This lets us use a shorter comparison sequence. */ +- if (opts->x_flag_finite_math_only) +- opts->x_target_flags &= ~MASK_IEEE_FP; +- +- /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387, +- since the insns won't need emulation. */ +- if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387]) +- opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387; +- +- /* Likewise, if the target doesn't have a 387, or we've specified +- software floating point, don't use 387 inline intrinsics. */ +- if (!TARGET_80387_P (opts->x_target_flags)) +- opts->x_target_flags |= MASK_NO_FANCY_MATH_387; +- +- /* Turn on MMX builtins for -msse. */ +- if (TARGET_SSE_P (opts->x_ix86_isa_flags)) +- opts->x_ix86_isa_flags +- |= OPTION_MASK_ISA_MMX & ~opts->x_ix86_isa_flags_explicit; +- +- /* Enable SSE prefetch. */ +- if (TARGET_SSE_P (opts->x_ix86_isa_flags) +- || (TARGET_PRFCHW_P (opts->x_ix86_isa_flags) +- && !TARGET_3DNOW_P (opts->x_ix86_isa_flags)) +- || TARGET_PREFETCHWT1_P (opts->x_ix86_isa_flags)) +- x86_prefetch_sse = true; +- +- /* Enable popcnt instruction for -msse4.2 or -mabm. */ +- if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags) +- || TARGET_ABM_P (opts->x_ix86_isa_flags)) +- opts->x_ix86_isa_flags +- |= OPTION_MASK_ISA_POPCNT & ~opts->x_ix86_isa_flags_explicit; +- +- /* Enable lzcnt instruction for -mabm. 
*/ +- if (TARGET_ABM_P(opts->x_ix86_isa_flags)) +- opts->x_ix86_isa_flags +- |= OPTION_MASK_ISA_LZCNT & ~opts->x_ix86_isa_flags_explicit; +- +- /* Disable BMI, BMI2 and TBM instructions for -m16. */ +- if (TARGET_16BIT_P(opts->x_ix86_isa_flags)) +- opts->x_ix86_isa_flags +- &= ~((OPTION_MASK_ISA_BMI | OPTION_MASK_ISA_BMI2 | OPTION_MASK_ISA_TBM) +- & ~opts->x_ix86_isa_flags_explicit); +- +- /* Validate -mpreferred-stack-boundary= value or default it to +- PREFERRED_STACK_BOUNDARY_DEFAULT. */ +- ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT; +- if (opts_set->x_ix86_preferred_stack_boundary_arg) +- { +- int min = TARGET_64BIT_P (opts->x_ix86_isa_flags)? 3 : 2; +- int max = TARGET_SEH ? 4 : 12; +- +- if (opts->x_ix86_preferred_stack_boundary_arg < min +- || opts->x_ix86_preferred_stack_boundary_arg > max) +- { +- if (min == max) +- error ("%<-mpreferred-stack-boundary%> is not supported " +- "for this target"); +- else +- error ("%<-mpreferred-stack-boundary=%d%> is not between %d and %d", +- opts->x_ix86_preferred_stack_boundary_arg, min, max); +- } +- else +- ix86_preferred_stack_boundary +- = (1 << opts->x_ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT; +- } +- +- /* Set the default value for -mstackrealign. */ +- if (!opts_set->x_ix86_force_align_arg_pointer) +- opts->x_ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT; +- +- ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY; +- +- /* Validate -mincoming-stack-boundary= value or default it to +- MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */ +- ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary; +- if (opts_set->x_ix86_incoming_stack_boundary_arg) +- { +- int min = TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 3 : 2; +- +- if (opts->x_ix86_incoming_stack_boundary_arg < min +- || opts->x_ix86_incoming_stack_boundary_arg > 12) +- error ("%<-mincoming-stack-boundary=%d%> is not between %d and 12", +- opts->x_ix86_incoming_stack_boundary_arg, min); +- else +- { +- ix86_user_incoming_stack_boundary +- = (1 << opts->x_ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT; +- ix86_incoming_stack_boundary +- = ix86_user_incoming_stack_boundary; +- } +- } +- +-#ifndef NO_PROFILE_COUNTERS +- if (flag_nop_mcount) +- error ("%<-mnop-mcount%> is not compatible with this target"); +-#endif +- if (flag_nop_mcount && flag_pic) +- error ("%<-mnop-mcount%> is not implemented for %<-fPIC%>"); +- +- /* Accept -msseregparm only if at least SSE support is enabled. */ +- if (TARGET_SSEREGPARM_P (opts->x_target_flags) +- && ! TARGET_SSE_P (opts->x_ix86_isa_flags)) +- error (main_args_p +- ? G_("%<-msseregparm%> used without SSE enabled") +- : G_("% used without SSE enabled")); +- +- if (opts_set->x_ix86_fpmath) +- { +- if (opts->x_ix86_fpmath & FPMATH_SSE) ++ /* Likewise, error if the ABI requires us to return values in the ++ x87 registers and the user specified -mno-80387. 
*/ ++ if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return) ++ for (i = 0; i < n; i++) ++ if (regclass[i] == X86_64_X87_CLASS ++ || regclass[i] == X86_64_X87UP_CLASS ++ || regclass[i] == X86_64_COMPLEX_X87_CLASS) + { +- if (!TARGET_SSE_P (opts->x_ix86_isa_flags)) +- { +- if (TARGET_80387_P (opts->x_target_flags)) +- { +- warning (0, "SSE instruction set disabled, using 387 arithmetics"); +- opts->x_ix86_fpmath = FPMATH_387; +- } +- } +- else if ((opts->x_ix86_fpmath & FPMATH_387) +- && !TARGET_80387_P (opts->x_target_flags)) ++ if (!issued_x87_ret_error) + { +- warning (0, "387 instruction set disabled, using SSE arithmetics"); +- opts->x_ix86_fpmath = FPMATH_SSE; ++ error ("x87 register return with x87 disabled"); ++ issued_x87_ret_error = true; + } ++ return NULL; + } +- } +- /* For all chips supporting SSE2, -mfpmath=sse performs better than +- fpmath=387. The second is however default at many targets since the +- extra 80bit precision of temporaries is considered to be part of ABI. +- Overwrite the default at least for -ffast-math. +- TODO: -mfpmath=both seems to produce same performing code with bit +- smaller binaries. It is however not clear if register allocation is +- ready for this setting. +- Also -mfpmath=387 is overall a lot more compact (bout 4-5%) than SSE +- codegen. We may switch to 387 with -ffast-math for size optimized +- functions. */ +- else if (fast_math_flags_set_p (&global_options) +- && TARGET_SSE2_P (opts->x_ix86_isa_flags)) +- opts->x_ix86_fpmath = FPMATH_SSE; +- else +- opts->x_ix86_fpmath = TARGET_FPMATH_DEFAULT_P (opts->x_ix86_isa_flags); + +- /* Use external vectorized library in vectorizing intrinsics. */ +- if (opts_set->x_ix86_veclibabi_type) +- switch (opts->x_ix86_veclibabi_type) ++ /* First construct simple cases. Avoid SCmode, since we want to use ++ single register to pass this type. */ ++ if (n == 1 && mode != SCmode) ++ switch (regclass[0]) + { +- case ix86_veclibabi_type_svml: +- ix86_veclib_handler = ix86_veclibabi_svml; +- break; +- +- case ix86_veclibabi_type_acml: +- ix86_veclib_handler = ix86_veclibabi_acml; ++ case X86_64_INTEGER_CLASS: ++ case X86_64_INTEGERSI_CLASS: ++ return gen_rtx_REG (mode, intreg[0]); ++ case X86_64_SSE_CLASS: ++ case X86_64_SSESF_CLASS: ++ case X86_64_SSEDF_CLASS: ++ if (mode != BLKmode) ++ return gen_reg_or_parallel (mode, orig_mode, ++ GET_SSE_REGNO (sse_regno)); + break; +- ++ case X86_64_X87_CLASS: ++ case X86_64_COMPLEX_X87_CLASS: ++ return gen_rtx_REG (mode, FIRST_STACK_REG); ++ case X86_64_NO_CLASS: ++ /* Zero sized array, struct or class. 
*/ ++ return NULL; + default: + gcc_unreachable (); + } ++ if (n == 2 ++ && regclass[0] == X86_64_SSE_CLASS ++ && regclass[1] == X86_64_SSEUP_CLASS ++ && mode != BLKmode) ++ return gen_reg_or_parallel (mode, orig_mode, ++ GET_SSE_REGNO (sse_regno)); ++ if (n == 4 ++ && regclass[0] == X86_64_SSE_CLASS ++ && regclass[1] == X86_64_SSEUP_CLASS ++ && regclass[2] == X86_64_SSEUP_CLASS ++ && regclass[3] == X86_64_SSEUP_CLASS ++ && mode != BLKmode) ++ return gen_reg_or_parallel (mode, orig_mode, ++ GET_SSE_REGNO (sse_regno)); ++ if (n == 8 ++ && regclass[0] == X86_64_SSE_CLASS ++ && regclass[1] == X86_64_SSEUP_CLASS ++ && regclass[2] == X86_64_SSEUP_CLASS ++ && regclass[3] == X86_64_SSEUP_CLASS ++ && regclass[4] == X86_64_SSEUP_CLASS ++ && regclass[5] == X86_64_SSEUP_CLASS ++ && regclass[6] == X86_64_SSEUP_CLASS ++ && regclass[7] == X86_64_SSEUP_CLASS ++ && mode != BLKmode) ++ return gen_reg_or_parallel (mode, orig_mode, ++ GET_SSE_REGNO (sse_regno)); ++ if (n == 2 ++ && regclass[0] == X86_64_X87_CLASS ++ && regclass[1] == X86_64_X87UP_CLASS) ++ return gen_rtx_REG (XFmode, FIRST_STACK_REG); + +- if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS] +- && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)) +- opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS; +- +- /* If stack probes are required, the space used for large function +- arguments on the stack must also be probed, so enable +- -maccumulate-outgoing-args so this happens in the prologue. */ +- if (TARGET_STACK_PROBE_P (opts->x_target_flags) +- && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)) +- { +- if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS) +- warning (0, +- main_args_p +- ? G_("stack probing requires %<-maccumulate-outgoing-args%> " +- "for correctness") +- : G_("stack probing requires " +- "% for " +- "correctness")); +- opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS; +- } +- +- /* Stack realignment without -maccumulate-outgoing-args requires %ebp, +- so enable -maccumulate-outgoing-args when %ebp is fixed. */ +- if (fixed_regs[BP_REG] +- && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)) +- { +- if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS) +- warning (0, +- main_args_p +- ? G_("fixed ebp register requires " +- "%<-maccumulate-outgoing-args%>") +- : G_("fixed ebp register requires " +- "%")); +- opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS; +- } +- +- /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */ +- { +- char *p; +- ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0); +- p = strchr (internal_label_prefix, 'X'); +- internal_label_prefix_len = p - internal_label_prefix; +- *p = '\0'; +- } +- +- /* When scheduling description is not available, disable scheduler pass +- so it won't slow down the compilation and make x87 code slower. 
*/ +- if (!TARGET_SCHEDULE) +- opts->x_flag_schedule_insns_after_reload = opts->x_flag_schedule_insns = 0; +- +- maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES, +- ix86_tune_cost->simultaneous_prefetches, +- opts->x_param_values, +- opts_set->x_param_values); +- maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE, +- ix86_tune_cost->prefetch_block, +- opts->x_param_values, +- opts_set->x_param_values); +- maybe_set_param_value (PARAM_L1_CACHE_SIZE, +- ix86_tune_cost->l1_cache_size, +- opts->x_param_values, +- opts_set->x_param_values); +- maybe_set_param_value (PARAM_L2_CACHE_SIZE, +- ix86_tune_cost->l2_cache_size, +- opts->x_param_values, +- opts_set->x_param_values); +- +- /* Enable sw prefetching at -O3 for CPUS that prefetching is helpful. */ +- if (opts->x_flag_prefetch_loop_arrays < 0 +- && HAVE_prefetch +- && (opts->x_optimize >= 3 || opts->x_flag_profile_use) +- && !opts->x_optimize_size +- && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL) +- opts->x_flag_prefetch_loop_arrays = 1; +- +- /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0) +- can be opts->x_optimized to ap = __builtin_next_arg (0). */ +- if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && !opts->x_flag_split_stack) +- targetm.expand_builtin_va_start = NULL; +- +- if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) +- { +- ix86_gen_leave = gen_leave_rex64; +- if (Pmode == DImode) +- { +- ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di; +- ix86_gen_tls_local_dynamic_base_64 +- = gen_tls_local_dynamic_base_64_di; +- } +- else ++ if (n == 2 ++ && regclass[0] == X86_64_INTEGER_CLASS ++ && regclass[1] == X86_64_INTEGER_CLASS ++ && (mode == CDImode || mode == TImode || mode == BLKmode) ++ && intreg[0] + 1 == intreg[1]) ++ { ++ if (mode == BLKmode) + { +- ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si; +- ix86_gen_tls_local_dynamic_base_64 +- = gen_tls_local_dynamic_base_64_si; ++ /* Use TImode for BLKmode values in 2 integer registers. */ ++ exp[0] = gen_rtx_EXPR_LIST (VOIDmode, ++ gen_rtx_REG (TImode, intreg[0]), ++ GEN_INT (0)); ++ ret = gen_rtx_PARALLEL (mode, rtvec_alloc (1)); ++ XVECEXP (ret, 0, 0) = exp[0]; ++ return ret; + } ++ else ++ return gen_rtx_REG (mode, intreg[0]); + } +- else +- ix86_gen_leave = gen_leave; +- +- if (Pmode == DImode) +- { +- ix86_gen_add3 = gen_adddi3; +- ix86_gen_sub3 = gen_subdi3; +- ix86_gen_sub3_carry = gen_subdi3_carry; +- ix86_gen_one_cmpl2 = gen_one_cmpldi2; +- ix86_gen_andsp = gen_anddi3; +- ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di; +- ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi; +- ix86_gen_probe_stack_range = gen_probe_stack_rangedi; +- ix86_gen_monitor = gen_sse3_monitor_di; +- ix86_gen_monitorx = gen_monitorx_di; +- ix86_gen_clzero = gen_clzero_di; +- } +- else +- { +- ix86_gen_add3 = gen_addsi3; +- ix86_gen_sub3 = gen_subsi3; +- ix86_gen_sub3_carry = gen_subsi3_carry; +- ix86_gen_one_cmpl2 = gen_one_cmplsi2; +- ix86_gen_andsp = gen_andsi3; +- ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si; +- ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi; +- ix86_gen_probe_stack_range = gen_probe_stack_rangesi; +- ix86_gen_monitor = gen_sse3_monitor_si; +- ix86_gen_monitorx = gen_monitorx_si; +- ix86_gen_clzero = gen_clzero_si; +- } +- +-#ifdef USE_IX86_CLD +- /* Use -mcld by default for 32-bit code if configured with --enable-cld. 
*/ +- if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)) +- opts->x_target_flags |= MASK_CLD & ~opts_set->x_target_flags; +-#endif + +- /* Set the default value for -mfentry. */ +- if (!opts_set->x_flag_fentry) +- opts->x_flag_fentry = TARGET_SEH; +- else ++ /* Otherwise figure out the entries of the PARALLEL. */ ++ for (i = 0; i < n; i++) + { +- if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic +- && opts->x_flag_fentry) +- sorry ("%<-mfentry%> isn%'t supported for 32-bit in combination " +- "with %<-fpic%>"); +- else if (TARGET_SEH && !opts->x_flag_fentry) +- sorry ("%<-mno-fentry%> isn%'t compatible with SEH"); +- } +- +- if (TARGET_SEH && TARGET_CALL_MS2SYSV_XLOGUES) +- sorry ("%<-mcall-ms2sysv-xlogues%> isn%'t currently supported with SEH"); +- +- if (!(opts_set->x_target_flags & MASK_VZEROUPPER) +- && TARGET_EMIT_VZEROUPPER) +- opts->x_target_flags |= MASK_VZEROUPPER; +- if (!(opts_set->x_target_flags & MASK_STV)) +- opts->x_target_flags |= MASK_STV; +- /* Disable STV if -mpreferred-stack-boundary={2,3} or +- -mincoming-stack-boundary={2,3} or -mstackrealign - the needed +- stack realignment will be extra cost the pass doesn't take into +- account and the pass can't realign the stack. */ +- if (ix86_preferred_stack_boundary < 128 +- || ix86_incoming_stack_boundary < 128 +- || opts->x_ix86_force_align_arg_pointer) +- opts->x_target_flags &= ~MASK_STV; +- if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL] +- && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD)) +- opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD; +- if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL] +- && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE)) +- opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE; +- +- /* Enable 128-bit AVX instruction generation +- for the auto-vectorizer. */ +- if (TARGET_AVX128_OPTIMAL +- && (opts_set->x_prefer_vector_width_type == PVW_NONE)) +- opts->x_prefer_vector_width_type = PVW_AVX128; +- +- /* Use 256-bit AVX instruction generation +- in the auto-vectorizer. */ +- if (ix86_tune_features[X86_TUNE_AVX256_OPTIMAL] +- && (opts_set->x_prefer_vector_width_type == PVW_NONE)) +- opts->x_prefer_vector_width_type = PVW_AVX256; +- +- if (opts->x_ix86_recip_name) +- { +- char *p = ASTRDUP (opts->x_ix86_recip_name); +- char *q; +- unsigned int mask, i; +- bool invert; +- +- while ((q = strtok (p, ",")) != NULL) +- { +- p = NULL; +- if (*q == '!') +- { +- invert = true; +- q++; +- } +- else +- invert = false; ++ int pos; + +- if (!strcmp (q, "default")) +- mask = RECIP_MASK_ALL; +- else +- { +- for (i = 0; i < ARRAY_SIZE (recip_options); i++) +- if (!strcmp (q, recip_options[i].string)) ++ switch (regclass[i]) ++ { ++ case X86_64_NO_CLASS: ++ break; ++ case X86_64_INTEGER_CLASS: ++ case X86_64_INTEGERSI_CLASS: ++ /* Merge TImodes on aligned occasions here too. */ ++ if (i * 8 + 8 > bytes) ++ { ++ unsigned int tmpbits = (bytes - i * 8) * BITS_PER_UNIT; ++ if (!int_mode_for_size (tmpbits, 0).exists (&tmpmode)) ++ /* We've requested 24 bytes we ++ don't have mode for. Use DImode. 
*/ ++ tmpmode = DImode; ++ } ++ else if (regclass[i] == X86_64_INTEGERSI_CLASS) ++ tmpmode = SImode; ++ else ++ tmpmode = DImode; ++ exp [nexps++] ++ = gen_rtx_EXPR_LIST (VOIDmode, ++ gen_rtx_REG (tmpmode, *intreg), ++ GEN_INT (i*8)); ++ intreg++; ++ break; ++ case X86_64_SSESF_CLASS: ++ exp [nexps++] ++ = gen_rtx_EXPR_LIST (VOIDmode, ++ gen_rtx_REG (SFmode, ++ GET_SSE_REGNO (sse_regno)), ++ GEN_INT (i*8)); ++ sse_regno++; ++ break; ++ case X86_64_SSEDF_CLASS: ++ exp [nexps++] ++ = gen_rtx_EXPR_LIST (VOIDmode, ++ gen_rtx_REG (DFmode, ++ GET_SSE_REGNO (sse_regno)), ++ GEN_INT (i*8)); ++ sse_regno++; ++ break; ++ case X86_64_SSE_CLASS: ++ pos = i; ++ switch (n) ++ { ++ case 1: ++ tmpmode = DImode; ++ break; ++ case 2: ++ if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS) + { +- mask = recip_options[i].mask; +- break; ++ tmpmode = TImode; ++ i++; + } +- +- if (i == ARRAY_SIZE (recip_options)) +- { +- error ("unknown option for %<-mrecip=%s%>", q); +- invert = false; +- mask = RECIP_MASK_NONE; +- } +- } +- +- opts->x_recip_mask_explicit |= mask; +- if (invert) +- opts->x_recip_mask &= ~mask; +- else +- opts->x_recip_mask |= mask; ++ else ++ tmpmode = DImode; ++ break; ++ case 4: ++ gcc_assert (i == 0 ++ && regclass[1] == X86_64_SSEUP_CLASS ++ && regclass[2] == X86_64_SSEUP_CLASS ++ && regclass[3] == X86_64_SSEUP_CLASS); ++ tmpmode = OImode; ++ i += 3; ++ break; ++ case 8: ++ gcc_assert (i == 0 ++ && regclass[1] == X86_64_SSEUP_CLASS ++ && regclass[2] == X86_64_SSEUP_CLASS ++ && regclass[3] == X86_64_SSEUP_CLASS ++ && regclass[4] == X86_64_SSEUP_CLASS ++ && regclass[5] == X86_64_SSEUP_CLASS ++ && regclass[6] == X86_64_SSEUP_CLASS ++ && regclass[7] == X86_64_SSEUP_CLASS); ++ tmpmode = XImode; ++ i += 7; ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ exp [nexps++] ++ = gen_rtx_EXPR_LIST (VOIDmode, ++ gen_rtx_REG (tmpmode, ++ GET_SSE_REGNO (sse_regno)), ++ GEN_INT (pos*8)); ++ sse_regno++; ++ break; ++ default: ++ gcc_unreachable (); + } + } + +- if (TARGET_RECIP_P (opts->x_target_flags)) +- opts->x_recip_mask |= RECIP_MASK_ALL & ~opts->x_recip_mask_explicit; +- else if (opts_set->x_target_flags & MASK_RECIP) +- opts->x_recip_mask &= ~(RECIP_MASK_ALL & ~opts->x_recip_mask_explicit); ++ /* Empty aligned struct, union or class. */ ++ if (nexps == 0) ++ return NULL; ++ ++ ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps)); ++ for (i = 0; i < nexps; i++) ++ XVECEXP (ret, 0, i) = exp [i]; ++ return ret; ++} ++ ++/* Update the data in CUM to advance over an argument of mode MODE ++ and data type TYPE. (TYPE is null for libcalls where that information ++ may not be available.) + +- /* Default long double to 64-bit for 32-bit Bionic and to __float128 +- for 64-bit Bionic. Also default long double to 64-bit for Intel +- MCU psABI. */ +- if ((TARGET_HAS_BIONIC || TARGET_IAMCU) +- && !(opts_set->x_target_flags +- & (MASK_LONG_DOUBLE_64 | MASK_LONG_DOUBLE_128))) +- opts->x_target_flags |= (TARGET_64BIT +- ? MASK_LONG_DOUBLE_128 +- : MASK_LONG_DOUBLE_64); ++ Return a number of integer regsiters advanced over. */ + +- /* Only one of them can be active. 
*/ +- gcc_assert ((opts->x_target_flags & MASK_LONG_DOUBLE_64) == 0 +- || (opts->x_target_flags & MASK_LONG_DOUBLE_128) == 0); ++static int ++function_arg_advance_32 (CUMULATIVE_ARGS *cum, machine_mode mode, ++ const_tree type, HOST_WIDE_INT bytes, ++ HOST_WIDE_INT words) ++{ ++ int res = 0; ++ bool error_p = false; + +- /* Handle stack protector */ +- if (!opts_set->x_ix86_stack_protector_guard) ++ if (TARGET_IAMCU) + { +-#ifdef TARGET_THREAD_SSP_OFFSET +- if (!TARGET_HAS_BIONIC) +- opts->x_ix86_stack_protector_guard = SSP_TLS; +- else +-#endif +- opts->x_ix86_stack_protector_guard = SSP_GLOBAL; ++ /* Intel MCU psABI passes scalars and aggregates no larger than 8 ++ bytes in registers. */ ++ if (!VECTOR_MODE_P (mode) && bytes <= 8) ++ goto pass_in_reg; ++ return res; + } + +- if (opts_set->x_ix86_stack_protector_guard_offset_str) ++ switch (mode) + { +- char *endp; +- const char *str = opts->x_ix86_stack_protector_guard_offset_str; +- +- errno = 0; +- int64_t offset; +- +-#if defined(INT64_T_IS_LONG) +- offset = strtol (str, &endp, 0); +-#else +- offset = strtoll (str, &endp, 0); +-#endif +- +- if (!*str || *endp || errno) +- error ("%qs is not a valid number " +- "in %<-mstack-protector-guard-offset=%>", str); +- +- if (!IN_RANGE (offset, HOST_WIDE_INT_C (-0x80000000), +- HOST_WIDE_INT_C (0x7fffffff))) +- error ("%qs is not a valid offset " +- "in %<-mstack-protector-guard-offset=%>", str); +- +- opts->x_ix86_stack_protector_guard_offset = offset; +- } +-#ifdef TARGET_THREAD_SSP_OFFSET +- else +- opts->x_ix86_stack_protector_guard_offset = TARGET_THREAD_SSP_OFFSET; +-#endif +- +- if (opts_set->x_ix86_stack_protector_guard_reg_str) +- { +- const char *str = opts->x_ix86_stack_protector_guard_reg_str; +- addr_space_t seg = ADDR_SPACE_GENERIC; ++ default: ++ break; + +- /* Discard optional register prefix. */ +- if (str[0] == '%') +- str++; ++ case E_BLKmode: ++ if (bytes < 0) ++ break; ++ /* FALLTHRU */ + +- if (strlen (str) == 2 && str[1] == 's') ++ case E_DImode: ++ case E_SImode: ++ case E_HImode: ++ case E_QImode: ++pass_in_reg: ++ cum->words += words; ++ cum->nregs -= words; ++ cum->regno += words; ++ if (cum->nregs >= 0) ++ res = words; ++ if (cum->nregs <= 0) + { +- if (str[0] == 'f') +- seg = ADDR_SPACE_SEG_FS; +- else if (str[0] == 'g') +- seg = ADDR_SPACE_SEG_GS; ++ cum->nregs = 0; ++ cfun->machine->arg_reg_available = false; ++ cum->regno = 0; + } ++ break; + +- if (seg == ADDR_SPACE_GENERIC) +- error ("%qs is not a valid base register " +- "in %<-mstack-protector-guard-reg=%>", +- opts->x_ix86_stack_protector_guard_reg_str); ++ case E_OImode: ++ /* OImode shouldn't be used directly. */ ++ gcc_unreachable (); + +- opts->x_ix86_stack_protector_guard_reg = seg; +- } +- else +- { +- opts->x_ix86_stack_protector_guard_reg = DEFAULT_TLS_SEG_REG; ++ case E_DFmode: ++ if (cum->float_in_sse == -1) ++ error_p = true; ++ if (cum->float_in_sse < 2) ++ break; ++ /* FALLTHRU */ ++ case E_SFmode: ++ if (cum->float_in_sse == -1) ++ error_p = true; ++ if (cum->float_in_sse < 1) ++ break; ++ /* FALLTHRU */ + +- /* The kernel uses a different segment register for performance +- reasons; a system call would not have to trash the userspace +- segment register, which would be expensive. 
*/ +- if (opts->x_ix86_cmodel == CM_KERNEL) +- opts->x_ix86_stack_protector_guard_reg = ADDR_SPACE_SEG_GS; +- } ++ case E_V8SFmode: ++ case E_V8SImode: ++ case E_V64QImode: ++ case E_V32HImode: ++ case E_V16SImode: ++ case E_V8DImode: ++ case E_V16SFmode: ++ case E_V8DFmode: ++ case E_V32QImode: ++ case E_V16HImode: ++ case E_V4DFmode: ++ case E_V4DImode: ++ case E_TImode: ++ case E_V16QImode: ++ case E_V8HImode: ++ case E_V4SImode: ++ case E_V2DImode: ++ case E_V4SFmode: ++ case E_V2DFmode: ++ if (!type || !AGGREGATE_TYPE_P (type)) ++ { ++ cum->sse_words += words; ++ cum->sse_nregs -= 1; ++ cum->sse_regno += 1; ++ if (cum->sse_nregs <= 0) ++ { ++ cum->sse_nregs = 0; ++ cum->sse_regno = 0; ++ } ++ } ++ break; + +- /* Handle -mmemcpy-strategy= and -mmemset-strategy= */ +- if (opts->x_ix86_tune_memcpy_strategy) +- { +- char *str = xstrdup (opts->x_ix86_tune_memcpy_strategy); +- ix86_parse_stringop_strategy_string (str, false); +- free (str); ++ case E_V8QImode: ++ case E_V4HImode: ++ case E_V2SImode: ++ case E_V2SFmode: ++ case E_V1TImode: ++ case E_V1DImode: ++ if (!type || !AGGREGATE_TYPE_P (type)) ++ { ++ cum->mmx_words += words; ++ cum->mmx_nregs -= 1; ++ cum->mmx_regno += 1; ++ if (cum->mmx_nregs <= 0) ++ { ++ cum->mmx_nregs = 0; ++ cum->mmx_regno = 0; ++ } ++ } ++ break; + } +- +- if (opts->x_ix86_tune_memset_strategy) ++ if (error_p) + { +- char *str = xstrdup (opts->x_ix86_tune_memset_strategy); +- ix86_parse_stringop_strategy_string (str, true); +- free (str); ++ cum->float_in_sse = 0; ++ error ("calling %qD with SSE calling convention without " ++ "SSE/SSE2 enabled", cum->decl); ++ sorry ("this is a GCC bug that can be worked around by adding " ++ "attribute used to function called"); + } + +- /* Save the initial options in case the user does function specific +- options. */ +- if (main_args_p) +- target_option_default_node = target_option_current_node +- = build_target_option_node (opts); +- +- if (opts->x_flag_cf_protection != CF_NONE) +- opts->x_flag_cf_protection +- = (cf_protection_level) (opts->x_flag_cf_protection | CF_SET); +- +- if (ix86_tune_features [X86_TUNE_AVOID_256FMA_CHAINS]) +- maybe_set_param_value (PARAM_AVOID_FMA_MAX_BITS, 256, +- opts->x_param_values, +- opts_set->x_param_values); +- else if (ix86_tune_features [X86_TUNE_AVOID_128FMA_CHAINS]) +- maybe_set_param_value (PARAM_AVOID_FMA_MAX_BITS, 128, +- opts->x_param_values, +- opts_set->x_param_values); +- +- /* PR86952: jump table usage with retpolines is slow. +- The PR provides some numbers about the slowness. */ +- if (ix86_indirect_branch != indirect_branch_keep +- && !opts_set->x_flag_jump_tables) +- opts->x_flag_jump_tables = 0; +- +- return true; +-} +- +-/* Implement the TARGET_OPTION_OVERRIDE hook. */ +- +-static void +-ix86_option_override (void) +-{ +- ix86_option_override_internal (true, &global_options, &global_options_set); +-} +- +-/* Implement the TARGET_OFFLOAD_OPTIONS hook. */ +-static char * +-ix86_offload_options (void) +-{ +- if (TARGET_LP64) +- return xstrdup ("-foffload-abi=lp64"); +- return xstrdup ("-foffload-abi=ilp32"); ++ return res; + } + +-/* Update register usage after having seen the compiler flags. */ +- +-static void +-ix86_conditional_register_usage (void) ++static int ++function_arg_advance_64 (CUMULATIVE_ARGS *cum, machine_mode mode, ++ const_tree type, HOST_WIDE_INT words, bool named) + { +- int i, c_mask; ++ int int_nregs, sse_nregs; + +- /* If there are no caller-saved registers, preserve all registers. 
+- except fixed_regs and registers used for function return value +- since aggregate_value_p checks call_used_regs[regno] on return +- value. */ +- if (cfun && cfun->machine->no_caller_saved_registers) +- for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) +- if (!fixed_regs[i] && !ix86_function_value_regno_p (i)) +- call_used_regs[i] = 0; ++ /* Unnamed 512 and 256bit vector mode parameters are passed on stack. */ ++ if (!named && (VALID_AVX512F_REG_MODE (mode) ++ || VALID_AVX256_REG_MODE (mode))) ++ return 0; + +- /* For 32-bit targets, squash the REX registers. */ +- if (! TARGET_64BIT) ++ if (!examine_argument (mode, type, 0, &int_nregs, &sse_nregs) ++ && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs) + { +- for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++) +- fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = ""; +- for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++) +- fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = ""; +- for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++) +- fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = ""; ++ cum->nregs -= int_nregs; ++ cum->sse_nregs -= sse_nregs; ++ cum->regno += int_nregs; ++ cum->sse_regno += sse_nregs; ++ return int_nregs; + } +- +- /* See the definition of CALL_USED_REGISTERS in i386.h. */ +- c_mask = CALL_USED_REGISTERS_MASK (TARGET_64BIT_MS_ABI); +- +- CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]); +- +- for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) ++ else + { +- /* Set/reset conditionally defined registers from +- CALL_USED_REGISTERS initializer. */ +- if (call_used_regs[i] > 1) +- call_used_regs[i] = !!(call_used_regs[i] & c_mask); +- +- /* Calculate registers of CLOBBERED_REGS register set +- as call used registers from GENERAL_REGS register set. */ +- if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i) +- && call_used_regs[i]) +- SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i); ++ int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD; ++ cum->words = ROUND_UP (cum->words, align); ++ cum->words += words; ++ return 0; + } ++} + +- /* If MMX is disabled, squash the registers. */ +- if (! TARGET_MMX) +- for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) +- if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i)) +- fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = ""; +- +- /* If SSE is disabled, squash the registers. */ +- if (! TARGET_SSE) +- for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) +- if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i)) +- fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = ""; +- +- /* If the FPU is disabled, squash the registers. */ +- if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387)) +- for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) +- if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i)) +- fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = ""; ++static int ++function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes, ++ HOST_WIDE_INT words) ++{ ++ /* Otherwise, this should be passed indirect. */ ++ gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8); + +- /* If AVX512F is disabled, squash the registers. */ +- if (! 
TARGET_AVX512F) ++ cum->words += words; ++ if (cum->nregs > 0) + { +- for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++) +- fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = ""; +- +- for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++) +- fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = ""; ++ cum->nregs -= 1; ++ cum->regno += 1; ++ return 1; + } ++ return 0; + } + +-/* Canonicalize a comparison from one we don't have to one we do have. */ ++/* Update the data in CUM to advance over argument ARG. */ + + static void +-ix86_canonicalize_comparison (int *code, rtx *op0, rtx *op1, +- bool op0_preserve_value) ++ix86_function_arg_advance (cumulative_args_t cum_v, ++ const function_arg_info &arg) + { +- /* The order of operands in x87 ficom compare is forced by combine in +- simplify_comparison () function. Float operator is treated as RTX_OBJ +- with a precedence over other operators and is always put in the first +- place. Swap condition and operands to match ficom instruction. */ +- if (!op0_preserve_value +- && GET_CODE (*op0) == FLOAT && MEM_P (XEXP (*op0, 0)) && REG_P (*op1)) +- { +- enum rtx_code scode = swap_condition ((enum rtx_code) *code); ++ CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); ++ machine_mode mode = arg.mode; ++ HOST_WIDE_INT bytes, words; ++ int nregs; + +- /* We are called only for compares that are split to SAHF instruction. +- Ensure that we have setcc/jcc insn for the swapped condition. */ +- if (ix86_fp_compare_code_to_integer (scode) != UNKNOWN) +- { +- std::swap (*op0, *op1); +- *code = (int) scode; +- } +- } +-} +- +-/* Save the current options */ ++ /* The argument of interrupt handler is a special case and is ++ handled in ix86_function_arg. */ ++ if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL) ++ return; + +-static void +-ix86_function_specific_save (struct cl_target_option *ptr, +- struct gcc_options *opts) +-{ +- ptr->arch = ix86_arch; +- ptr->schedule = ix86_schedule; +- ptr->prefetch_sse = x86_prefetch_sse; +- ptr->tune = ix86_tune; +- ptr->branch_cost = ix86_branch_cost; +- ptr->tune_defaulted = ix86_tune_defaulted; +- ptr->arch_specified = ix86_arch_specified; +- ptr->x_ix86_isa_flags_explicit = opts->x_ix86_isa_flags_explicit; +- ptr->x_ix86_isa_flags2_explicit = opts->x_ix86_isa_flags2_explicit; +- ptr->x_recip_mask_explicit = opts->x_recip_mask_explicit; +- ptr->x_ix86_arch_string = opts->x_ix86_arch_string; +- ptr->x_ix86_tune_string = opts->x_ix86_tune_string; +- ptr->x_ix86_cmodel = opts->x_ix86_cmodel; +- ptr->x_ix86_abi = opts->x_ix86_abi; +- ptr->x_ix86_asm_dialect = opts->x_ix86_asm_dialect; +- ptr->x_ix86_branch_cost = opts->x_ix86_branch_cost; +- ptr->x_ix86_dump_tunes = opts->x_ix86_dump_tunes; +- ptr->x_ix86_force_align_arg_pointer = opts->x_ix86_force_align_arg_pointer; +- ptr->x_ix86_force_drap = opts->x_ix86_force_drap; +- ptr->x_ix86_incoming_stack_boundary_arg = opts->x_ix86_incoming_stack_boundary_arg; +- ptr->x_ix86_pmode = opts->x_ix86_pmode; +- ptr->x_ix86_preferred_stack_boundary_arg = opts->x_ix86_preferred_stack_boundary_arg; +- ptr->x_ix86_recip_name = opts->x_ix86_recip_name; +- ptr->x_ix86_regparm = opts->x_ix86_regparm; +- ptr->x_ix86_section_threshold = opts->x_ix86_section_threshold; +- ptr->x_ix86_sse2avx = opts->x_ix86_sse2avx; +- ptr->x_ix86_stack_protector_guard = opts->x_ix86_stack_protector_guard; +- ptr->x_ix86_stringop_alg = opts->x_ix86_stringop_alg; +- ptr->x_ix86_tls_dialect = opts->x_ix86_tls_dialect; +- ptr->x_ix86_tune_ctrl_string = opts->x_ix86_tune_ctrl_string; +- 
ptr->x_ix86_tune_memcpy_strategy = opts->x_ix86_tune_memcpy_strategy; +- ptr->x_ix86_tune_memset_strategy = opts->x_ix86_tune_memset_strategy; +- ptr->x_ix86_tune_no_default = opts->x_ix86_tune_no_default; +- ptr->x_ix86_veclibabi_type = opts->x_ix86_veclibabi_type; +- +- /* The fields are char but the variables are not; make sure the +- values fit in the fields. */ +- gcc_assert (ptr->arch == ix86_arch); +- gcc_assert (ptr->schedule == ix86_schedule); +- gcc_assert (ptr->tune == ix86_tune); +- gcc_assert (ptr->branch_cost == ix86_branch_cost); +-} +- +-/* Restore the current options */ ++ bytes = arg.promoted_size_in_bytes (); ++ words = CEIL (bytes, UNITS_PER_WORD); + +-static void +-ix86_function_specific_restore (struct gcc_options *opts, +- struct cl_target_option *ptr) +-{ +- enum processor_type old_tune = ix86_tune; +- enum processor_type old_arch = ix86_arch; +- unsigned HOST_WIDE_INT ix86_arch_mask; +- int i; ++ if (arg.type) ++ mode = type_natural_mode (arg.type, NULL, false); ++ ++ if (TARGET_64BIT) ++ { ++ enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi; + +- /* We don't change -fPIC. */ +- opts->x_flag_pic = flag_pic; +- +- ix86_arch = (enum processor_type) ptr->arch; +- ix86_schedule = (enum attr_cpu) ptr->schedule; +- ix86_tune = (enum processor_type) ptr->tune; +- x86_prefetch_sse = ptr->prefetch_sse; +- opts->x_ix86_branch_cost = ptr->branch_cost; +- ix86_tune_defaulted = ptr->tune_defaulted; +- ix86_arch_specified = ptr->arch_specified; +- opts->x_ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit; +- opts->x_ix86_isa_flags2_explicit = ptr->x_ix86_isa_flags2_explicit; +- opts->x_recip_mask_explicit = ptr->x_recip_mask_explicit; +- opts->x_ix86_arch_string = ptr->x_ix86_arch_string; +- opts->x_ix86_tune_string = ptr->x_ix86_tune_string; +- opts->x_ix86_cmodel = ptr->x_ix86_cmodel; +- opts->x_ix86_abi = ptr->x_ix86_abi; +- opts->x_ix86_asm_dialect = ptr->x_ix86_asm_dialect; +- opts->x_ix86_branch_cost = ptr->x_ix86_branch_cost; +- opts->x_ix86_dump_tunes = ptr->x_ix86_dump_tunes; +- opts->x_ix86_force_align_arg_pointer = ptr->x_ix86_force_align_arg_pointer; +- opts->x_ix86_force_drap = ptr->x_ix86_force_drap; +- opts->x_ix86_incoming_stack_boundary_arg = ptr->x_ix86_incoming_stack_boundary_arg; +- opts->x_ix86_pmode = ptr->x_ix86_pmode; +- opts->x_ix86_preferred_stack_boundary_arg = ptr->x_ix86_preferred_stack_boundary_arg; +- opts->x_ix86_recip_name = ptr->x_ix86_recip_name; +- opts->x_ix86_regparm = ptr->x_ix86_regparm; +- opts->x_ix86_section_threshold = ptr->x_ix86_section_threshold; +- opts->x_ix86_sse2avx = ptr->x_ix86_sse2avx; +- opts->x_ix86_stack_protector_guard = ptr->x_ix86_stack_protector_guard; +- opts->x_ix86_stringop_alg = ptr->x_ix86_stringop_alg; +- opts->x_ix86_tls_dialect = ptr->x_ix86_tls_dialect; +- opts->x_ix86_tune_ctrl_string = ptr->x_ix86_tune_ctrl_string; +- opts->x_ix86_tune_memcpy_strategy = ptr->x_ix86_tune_memcpy_strategy; +- opts->x_ix86_tune_memset_strategy = ptr->x_ix86_tune_memset_strategy; +- opts->x_ix86_tune_no_default = ptr->x_ix86_tune_no_default; +- opts->x_ix86_veclibabi_type = ptr->x_ix86_veclibabi_type; +- ix86_tune_cost = processor_cost_table[ix86_tune]; +- /* TODO: ix86_cost should be chosen at instruction or function granuality +- so for cold code we use size_cost even in !optimize_size compilation. 
*/ +- if (opts->x_optimize_size) +- ix86_cost = &ix86_size_cost; ++ if (call_abi == MS_ABI) ++ nregs = function_arg_advance_ms_64 (cum, bytes, words); ++ else ++ nregs = function_arg_advance_64 (cum, mode, arg.type, words, ++ arg.named); ++ } + else +- ix86_cost = ix86_tune_cost; ++ nregs = function_arg_advance_32 (cum, mode, arg.type, bytes, words); + +- /* Recreate the arch feature tests if the arch changed */ +- if (old_arch != ix86_arch) ++ if (!nregs) + { +- ix86_arch_mask = HOST_WIDE_INT_1U << ix86_arch; +- for (i = 0; i < X86_ARCH_LAST; ++i) +- ix86_arch_features[i] +- = !!(initial_ix86_arch_features[i] & ix86_arch_mask); ++ /* Track if there are outgoing arguments on stack. */ ++ if (cum->caller) ++ cfun->machine->outgoing_args_on_stack = true; + } +- +- /* Recreate the tune optimization tests */ +- if (old_tune != ix86_tune) +- set_ix86_tune_features (ix86_tune, false); + } + +-/* Adjust target options after streaming them in. This is mainly about +- reconciling them with global options. */ +- +-static void +-ix86_function_specific_post_stream_in (struct cl_target_option *ptr) +-{ +- /* flag_pic is a global option, but ix86_cmodel is target saved option +- partly computed from flag_pic. If flag_pic is on, adjust x_ix86_cmodel +- for PIC, or error out. */ +- if (flag_pic) +- switch (ptr->x_ix86_cmodel) +- { +- case CM_SMALL: +- ptr->x_ix86_cmodel = CM_SMALL_PIC; +- break; ++/* Define where to put the arguments to a function. ++ Value is zero to push the argument on the stack, ++ or a hard register in which to store the argument. + +- case CM_MEDIUM: +- ptr->x_ix86_cmodel = CM_MEDIUM_PIC; +- break; +- +- case CM_LARGE: +- ptr->x_ix86_cmodel = CM_LARGE_PIC; +- break; +- +- case CM_KERNEL: +- error ("code model %s does not support PIC mode", "kernel"); +- break; +- +- default: +- break; +- } +- else +- switch (ptr->x_ix86_cmodel) +- { +- case CM_SMALL_PIC: +- ptr->x_ix86_cmodel = CM_SMALL; +- break; +- +- case CM_MEDIUM_PIC: +- ptr->x_ix86_cmodel = CM_MEDIUM; +- break; +- +- case CM_LARGE_PIC: +- ptr->x_ix86_cmodel = CM_LARGE; +- break; +- +- default: +- break; +- } +-} +- +-/* Print the current options */ +- +-static void +-ix86_function_specific_print (FILE *file, int indent, +- struct cl_target_option *ptr) +-{ +- char *target_string +- = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_ix86_isa_flags2, +- ptr->x_target_flags, ptr->x_ix86_target_flags, +- NULL, NULL, ptr->x_ix86_fpmath, false, true); +- +- gcc_assert (ptr->arch < PROCESSOR_max); +- fprintf (file, "%*sarch = %d (%s)\n", +- indent, "", +- ptr->arch, processor_names[ptr->arch]); +- +- gcc_assert (ptr->tune < PROCESSOR_max); +- fprintf (file, "%*stune = %d (%s)\n", +- indent, "", +- ptr->tune, processor_names[ptr->tune]); +- +- fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost); +- +- if (target_string) +- { +- fprintf (file, "%*s%s\n", indent, "", target_string); +- free (target_string); +- } +-} +- +- +-/* Inner function to process the attribute((target(...))), take an argument and +- set the current options from the argument. If we have a list, recursively go +- over the list. */ ++ MODE is the argument's machine mode. ++ TYPE is the data type of the argument (as a tree). ++ This is null for libcalls where that information may ++ not be available. ++ CUM is a variable of type CUMULATIVE_ARGS which gives info about ++ the preceding args and about the function being called. ++ NAMED is nonzero if this argument is a named parameter ++ (otherwise it is an extra parameter matching an ellipsis). 
*/ + +-static bool +-ix86_valid_target_attribute_inner_p (tree args, char *p_strings[], +- struct gcc_options *opts, +- struct gcc_options *opts_set, +- struct gcc_options *enum_opts_set) ++static rtx ++function_arg_32 (CUMULATIVE_ARGS *cum, machine_mode mode, ++ machine_mode orig_mode, const_tree type, ++ HOST_WIDE_INT bytes, HOST_WIDE_INT words) + { +- char *next_optstr; +- bool ret = true; +- +-#define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 } +-#define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 } +-#define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 } +-#define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M } +-#define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M } +- +- enum ix86_opt_type +- { +- ix86_opt_unknown, +- ix86_opt_yes, +- ix86_opt_no, +- ix86_opt_str, +- ix86_opt_enum, +- ix86_opt_isa +- }; +- +- static const struct +- { +- const char *string; +- size_t len; +- enum ix86_opt_type type; +- int opt; +- int mask; +- } attrs[] = { +- /* isa options */ +- IX86_ATTR_ISA ("pconfig", OPT_mpconfig), +- IX86_ATTR_ISA ("wbnoinvd", OPT_mwbnoinvd), +- IX86_ATTR_ISA ("sgx", OPT_msgx), +- IX86_ATTR_ISA ("avx5124fmaps", OPT_mavx5124fmaps), +- IX86_ATTR_ISA ("avx5124vnniw", OPT_mavx5124vnniw), +- IX86_ATTR_ISA ("avx512vpopcntdq", OPT_mavx512vpopcntdq), +- IX86_ATTR_ISA ("avx512vbmi2", OPT_mavx512vbmi2), +- IX86_ATTR_ISA ("avx512vnni", OPT_mavx512vnni), +- IX86_ATTR_ISA ("avx512bitalg", OPT_mavx512bitalg), +- +- IX86_ATTR_ISA ("avx512vbmi", OPT_mavx512vbmi), +- IX86_ATTR_ISA ("avx512ifma", OPT_mavx512ifma), +- IX86_ATTR_ISA ("avx512vl", OPT_mavx512vl), +- IX86_ATTR_ISA ("avx512bw", OPT_mavx512bw), +- IX86_ATTR_ISA ("avx512dq", OPT_mavx512dq), +- IX86_ATTR_ISA ("avx512er", OPT_mavx512er), +- IX86_ATTR_ISA ("avx512pf", OPT_mavx512pf), +- IX86_ATTR_ISA ("avx512cd", OPT_mavx512cd), +- IX86_ATTR_ISA ("avx512f", OPT_mavx512f), +- IX86_ATTR_ISA ("avx2", OPT_mavx2), +- IX86_ATTR_ISA ("fma", OPT_mfma), +- IX86_ATTR_ISA ("xop", OPT_mxop), +- IX86_ATTR_ISA ("fma4", OPT_mfma4), +- IX86_ATTR_ISA ("f16c", OPT_mf16c), +- IX86_ATTR_ISA ("avx", OPT_mavx), +- IX86_ATTR_ISA ("sse4", OPT_msse4), +- IX86_ATTR_ISA ("sse4.2", OPT_msse4_2), +- IX86_ATTR_ISA ("sse4.1", OPT_msse4_1), +- IX86_ATTR_ISA ("sse4a", OPT_msse4a), +- IX86_ATTR_ISA ("ssse3", OPT_mssse3), +- IX86_ATTR_ISA ("sse3", OPT_msse3), +- IX86_ATTR_ISA ("aes", OPT_maes), +- IX86_ATTR_ISA ("sha", OPT_msha), +- IX86_ATTR_ISA ("pclmul", OPT_mpclmul), +- IX86_ATTR_ISA ("sse2", OPT_msse2), +- IX86_ATTR_ISA ("sse", OPT_msse), +- IX86_ATTR_ISA ("3dnowa", OPT_m3dnowa), +- IX86_ATTR_ISA ("3dnow", OPT_m3dnow), +- IX86_ATTR_ISA ("mmx", OPT_mmmx), +- IX86_ATTR_ISA ("rtm", OPT_mrtm), +- IX86_ATTR_ISA ("prfchw", OPT_mprfchw), +- IX86_ATTR_ISA ("rdseed", OPT_mrdseed), +- IX86_ATTR_ISA ("adx", OPT_madx), +- IX86_ATTR_ISA ("prefetchwt1", OPT_mprefetchwt1), +- IX86_ATTR_ISA ("clflushopt", OPT_mclflushopt), +- IX86_ATTR_ISA ("xsaves", OPT_mxsaves), +- IX86_ATTR_ISA ("xsavec", OPT_mxsavec), +- IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt), +- IX86_ATTR_ISA ("xsave", OPT_mxsave), +- IX86_ATTR_ISA ("abm", OPT_mabm), +- IX86_ATTR_ISA ("bmi", OPT_mbmi), +- IX86_ATTR_ISA ("bmi2", OPT_mbmi2), +- IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt), +- IX86_ATTR_ISA ("tbm", OPT_mtbm), +- IX86_ATTR_ISA ("popcnt", OPT_mpopcnt), +- IX86_ATTR_ISA ("cx16", OPT_mcx16), +- IX86_ATTR_ISA ("sahf", OPT_msahf), +- IX86_ATTR_ISA ("movbe", OPT_mmovbe), +- IX86_ATTR_ISA ("crc32", OPT_mcrc32), +- IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase), +- 
IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd), +- IX86_ATTR_ISA ("mwaitx", OPT_mmwaitx), +- IX86_ATTR_ISA ("clzero", OPT_mclzero), +- IX86_ATTR_ISA ("pku", OPT_mpku), +- IX86_ATTR_ISA ("lwp", OPT_mlwp), +- IX86_ATTR_ISA ("hle", OPT_mhle), +- IX86_ATTR_ISA ("fxsr", OPT_mfxsr), +- IX86_ATTR_ISA ("clwb", OPT_mclwb), +- IX86_ATTR_ISA ("rdpid", OPT_mrdpid), +- IX86_ATTR_ISA ("gfni", OPT_mgfni), +- IX86_ATTR_ISA ("shstk", OPT_mshstk), +- IX86_ATTR_ISA ("vaes", OPT_mvaes), +- IX86_ATTR_ISA ("vpclmulqdq", OPT_mvpclmulqdq), +- IX86_ATTR_ISA ("movdiri", OPT_mmovdiri), +- IX86_ATTR_ISA ("movdir64b", OPT_mmovdir64b), +- IX86_ATTR_ISA ("waitpkg", OPT_mwaitpkg), +- IX86_ATTR_ISA ("cldemote", OPT_mcldemote), +- IX86_ATTR_ISA ("ptwrite", OPT_mptwrite), +- +- /* enum options */ +- IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_), +- +- /* string options */ +- IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH), +- IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE), +- +- /* flag options */ +- IX86_ATTR_YES ("cld", +- OPT_mcld, +- MASK_CLD), +- +- IX86_ATTR_NO ("fancy-math-387", +- OPT_mfancy_math_387, +- MASK_NO_FANCY_MATH_387), +- +- IX86_ATTR_YES ("ieee-fp", +- OPT_mieee_fp, +- MASK_IEEE_FP), +- +- IX86_ATTR_YES ("inline-all-stringops", +- OPT_minline_all_stringops, +- MASK_INLINE_ALL_STRINGOPS), +- +- IX86_ATTR_YES ("inline-stringops-dynamically", +- OPT_minline_stringops_dynamically, +- MASK_INLINE_STRINGOPS_DYNAMICALLY), +- +- IX86_ATTR_NO ("align-stringops", +- OPT_mno_align_stringops, +- MASK_NO_ALIGN_STRINGOPS), +- +- IX86_ATTR_YES ("recip", +- OPT_mrecip, +- MASK_RECIP), +- +- }; +- +- /* If this is a list, recurse to get the options. */ +- if (TREE_CODE (args) == TREE_LIST) +- { +- bool ret = true; +- +- for (; args; args = TREE_CHAIN (args)) +- if (TREE_VALUE (args) +- && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args), +- p_strings, opts, opts_set, +- enum_opts_set)) +- ret = false; ++ bool error_p = false; + +- return ret; +- } ++ /* Avoid the AL settings for the Unix64 ABI. */ ++ if (mode == VOIDmode) ++ return constm1_rtx; + +- else if (TREE_CODE (args) != STRING_CST) ++ if (TARGET_IAMCU) + { +- error ("attribute % argument not a string"); +- return false; ++ /* Intel MCU psABI passes scalars and aggregates no larger than 8 ++ bytes in registers. */ ++ if (!VECTOR_MODE_P (mode) && bytes <= 8) ++ goto pass_in_reg; ++ return NULL_RTX; + } + +- /* Handle multiple arguments separated by commas. */ +- next_optstr = ASTRDUP (TREE_STRING_POINTER (args)); +- +- while (next_optstr && *next_optstr != '\0') ++ switch (mode) + { +- char *p = next_optstr; +- char *orig_p = p; +- char *comma = strchr (next_optstr, ','); +- const char *opt_string; +- size_t len, opt_len; +- int opt; +- bool opt_set_p; +- char ch; +- unsigned i; +- enum ix86_opt_type type = ix86_opt_unknown; +- int mask = 0; ++ default: ++ break; + +- if (comma) +- { +- *comma = '\0'; +- len = comma - next_optstr; +- next_optstr = comma + 1; +- } +- else ++ case E_BLKmode: ++ if (bytes < 0) ++ break; ++ /* FALLTHRU */ ++ case E_DImode: ++ case E_SImode: ++ case E_HImode: ++ case E_QImode: ++pass_in_reg: ++ if (words <= cum->nregs) + { +- len = strlen (p); +- next_optstr = NULL; +- } ++ int regno = cum->regno; + +- /* Recognize no-xxx. */ +- if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-') +- { +- opt_set_p = false; +- p += 3; +- len -= 3; +- } +- else +- opt_set_p = true; +- +- /* Find the option. 
*/ +- ch = *p; +- opt = N_OPTS; +- for (i = 0; i < ARRAY_SIZE (attrs); i++) +- { +- type = attrs[i].type; +- opt_len = attrs[i].len; +- if (ch == attrs[i].string[0] +- && ((type != ix86_opt_str && type != ix86_opt_enum) +- ? len == opt_len +- : len > opt_len) +- && memcmp (p, attrs[i].string, opt_len) == 0) ++ /* Fastcall allocates the first two DWORD (SImode) or ++ smaller arguments to ECX and EDX if it isn't an ++ aggregate type . */ ++ if (cum->fastcall) + { +- opt = attrs[i].opt; +- mask = attrs[i].mask; +- opt_string = attrs[i].string; +- break; +- } +- } ++ if (mode == BLKmode ++ || mode == DImode ++ || (type && AGGREGATE_TYPE_P (type))) ++ break; + +- /* Process the option. */ +- if (opt == N_OPTS) +- { +- error ("attribute(target(\"%s\")) is unknown", orig_p); +- ret = false; ++ /* ECX not EAX is the first allocated register. */ ++ if (regno == AX_REG) ++ regno = CX_REG; ++ } ++ return gen_rtx_REG (mode, regno); + } ++ break; + +- else if (type == ix86_opt_isa) ++ case E_DFmode: ++ if (cum->float_in_sse == -1) ++ error_p = true; ++ if (cum->float_in_sse < 2) ++ break; ++ /* FALLTHRU */ ++ case E_SFmode: ++ if (cum->float_in_sse == -1) ++ error_p = true; ++ if (cum->float_in_sse < 1) ++ break; ++ /* FALLTHRU */ ++ case E_TImode: ++ /* In 32bit, we pass TImode in xmm registers. */ ++ case E_V16QImode: ++ case E_V8HImode: ++ case E_V4SImode: ++ case E_V2DImode: ++ case E_V4SFmode: ++ case E_V2DFmode: ++ if (!type || !AGGREGATE_TYPE_P (type)) + { +- struct cl_decoded_option decoded; +- +- generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded); +- ix86_handle_option (opts, opts_set, +- &decoded, input_location); ++ if (cum->sse_nregs) ++ return gen_reg_or_parallel (mode, orig_mode, ++ cum->sse_regno + FIRST_SSE_REG); + } ++ break; + +- else if (type == ix86_opt_yes || type == ix86_opt_no) +- { +- if (type == ix86_opt_no) +- opt_set_p = !opt_set_p; +- +- if (opt_set_p) +- opts->x_target_flags |= mask; +- else +- opts->x_target_flags &= ~mask; +- } ++ case E_OImode: ++ case E_XImode: ++ /* OImode and XImode shouldn't be used directly. */ ++ gcc_unreachable (); + +- else if (type == ix86_opt_str) ++ case E_V64QImode: ++ case E_V32HImode: ++ case E_V16SImode: ++ case E_V8DImode: ++ case E_V16SFmode: ++ case E_V8DFmode: ++ case E_V8SFmode: ++ case E_V8SImode: ++ case E_V32QImode: ++ case E_V16HImode: ++ case E_V4DFmode: ++ case E_V4DImode: ++ if (!type || !AGGREGATE_TYPE_P (type)) + { +- if (p_strings[opt]) +- { +- error ("option(\"%s\") was already specified", opt_string); +- ret = false; +- } +- else +- { +- p_strings[opt] = xstrdup (p + opt_len); +- if (opt == IX86_FUNCTION_SPECIFIC_ARCH) +- { +- /* If arch= is set, clear all bits in x_ix86_isa_flags, +- except for ISA_64BIT, ABI_64, ABI_X32, and CODE16 +- and all bits in x_ix86_isa_flags2. 
*/ +- opts->x_ix86_isa_flags &= (OPTION_MASK_ISA_64BIT +- | OPTION_MASK_ABI_64 +- | OPTION_MASK_ABI_X32 +- | OPTION_MASK_CODE16); +- opts->x_ix86_isa_flags_explicit &= (OPTION_MASK_ISA_64BIT +- | OPTION_MASK_ABI_64 +- | OPTION_MASK_ABI_X32 +- | OPTION_MASK_CODE16); +- opts->x_ix86_isa_flags2 = 0; +- opts->x_ix86_isa_flags2_explicit = 0; +- } +- } ++ if (cum->sse_nregs) ++ return gen_reg_or_parallel (mode, orig_mode, ++ cum->sse_regno + FIRST_SSE_REG); + } ++ break; + +- else if (type == ix86_opt_enum) ++ case E_V8QImode: ++ case E_V4HImode: ++ case E_V2SImode: ++ case E_V2SFmode: ++ case E_V1TImode: ++ case E_V1DImode: ++ if (!type || !AGGREGATE_TYPE_P (type)) + { +- bool arg_ok; +- int value; +- +- arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET); +- if (arg_ok) +- set_option (opts, enum_opts_set, opt, value, +- p + opt_len, DK_UNSPECIFIED, input_location, +- global_dc); +- else +- { +- error ("attribute(target(\"%s\")) is unknown", orig_p); +- ret = false; +- } ++ if (cum->mmx_nregs) ++ return gen_reg_or_parallel (mode, orig_mode, ++ cum->mmx_regno + FIRST_MMX_REG); + } +- +- else +- gcc_unreachable (); ++ break; ++ } ++ if (error_p) ++ { ++ cum->float_in_sse = 0; ++ error ("calling %qD with SSE calling convention without " ++ "SSE/SSE2 enabled", cum->decl); ++ sorry ("this is a GCC bug that can be worked around by adding " ++ "attribute used to function called"); + } + +- return ret; +-} +- +-/* Release allocated strings. */ +-static void +-release_options_strings (char **option_strings) +-{ +- /* Free up memory allocated to hold the strings */ +- for (unsigned i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++) +- free (option_strings[i]); ++ return NULL_RTX; + } + +-/* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */ +- +-tree +-ix86_valid_target_attribute_tree (tree args, +- struct gcc_options *opts, +- struct gcc_options *opts_set) ++static rtx ++function_arg_64 (const CUMULATIVE_ARGS *cum, machine_mode mode, ++ machine_mode orig_mode, const_tree type, bool named) + { +- const char *orig_arch_string = opts->x_ix86_arch_string; +- const char *orig_tune_string = opts->x_ix86_tune_string; +- enum fpmath_unit orig_fpmath_set = opts_set->x_ix86_fpmath; +- int orig_tune_defaulted = ix86_tune_defaulted; +- int orig_arch_specified = ix86_arch_specified; +- char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL }; +- tree t = NULL_TREE; +- struct cl_target_option *def +- = TREE_TARGET_OPTION (target_option_default_node); +- struct gcc_options enum_opts_set; +- +- memset (&enum_opts_set, 0, sizeof (enum_opts_set)); +- +- /* Process each of the options on the chain. */ +- if (! ix86_valid_target_attribute_inner_p (args, option_strings, opts, +- opts_set, &enum_opts_set)) +- return error_mark_node; ++ /* Handle a hidden AL argument containing number of registers ++ for varargs x86-64 functions. */ ++ if (mode == VOIDmode) ++ return GEN_INT (cum->maybe_vaarg ++ ? (cum->sse_nregs < 0 ++ ? X86_64_SSE_REGPARM_MAX ++ : cum->sse_regno) ++ : -1); + +- /* If the changed options are different from the default, rerun +- ix86_option_override_internal, and then save the options away. +- The string options are attribute options, and will be undone +- when we copy the save structure. 
*/ +- if (opts->x_ix86_isa_flags != def->x_ix86_isa_flags +- || opts->x_ix86_isa_flags2 != def->x_ix86_isa_flags2 +- || opts->x_target_flags != def->x_target_flags +- || option_strings[IX86_FUNCTION_SPECIFIC_ARCH] +- || option_strings[IX86_FUNCTION_SPECIFIC_TUNE] +- || enum_opts_set.x_ix86_fpmath) ++ switch (mode) + { +- /* If we are using the default tune= or arch=, undo the string assigned, +- and use the default. */ +- if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]) +- opts->x_ix86_arch_string +- = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]); +- else if (!orig_arch_specified) +- opts->x_ix86_arch_string = NULL; +- +- if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]) +- opts->x_ix86_tune_string +- = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]); +- else if (orig_tune_defaulted) +- opts->x_ix86_tune_string = NULL; +- +- /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */ +- if (enum_opts_set.x_ix86_fpmath) +- opts_set->x_ix86_fpmath = (enum fpmath_unit) 1; +- +- /* Do any overrides, such as arch=xxx, or tune=xxx support. */ +- bool r = ix86_option_override_internal (false, opts, opts_set); +- if (!r) +- { +- release_options_strings (option_strings); +- return error_mark_node; +- } +- +- /* Add any builtin functions with the new isa if any. */ +- ix86_add_new_builtins (opts->x_ix86_isa_flags, opts->x_ix86_isa_flags2); +- +- /* Save the current options unless we are validating options for +- #pragma. */ +- t = build_target_option_node (opts); +- +- opts->x_ix86_arch_string = orig_arch_string; +- opts->x_ix86_tune_string = orig_tune_string; +- opts_set->x_ix86_fpmath = orig_fpmath_set; ++ default: ++ break; + +- release_options_strings (option_strings); ++ case E_V8SFmode: ++ case E_V8SImode: ++ case E_V32QImode: ++ case E_V16HImode: ++ case E_V4DFmode: ++ case E_V4DImode: ++ case E_V16SFmode: ++ case E_V16SImode: ++ case E_V64QImode: ++ case E_V32HImode: ++ case E_V8DFmode: ++ case E_V8DImode: ++ /* Unnamed 256 and 512bit vector mode parameters are passed on stack. */ ++ if (!named) ++ return NULL; ++ break; + } + +- return t; ++ return construct_container (mode, orig_mode, type, 0, cum->nregs, ++ cum->sse_nregs, ++ &x86_64_int_parameter_registers [cum->regno], ++ cum->sse_regno); + } + +-/* Hook to validate attribute((target("string"))). */ +- +-static bool +-ix86_valid_target_attribute_p (tree fndecl, +- tree ARG_UNUSED (name), +- tree args, +- int ARG_UNUSED (flags)) +-{ +- struct gcc_options func_options; +- tree new_target, new_optimize; +- bool ret = true; +- +- /* attribute((target("default"))) does nothing, beyond +- affecting multi-versioning. */ +- if (TREE_VALUE (args) +- && TREE_CODE (TREE_VALUE (args)) == STRING_CST +- && TREE_CHAIN (args) == NULL_TREE +- && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0) +- return true; ++static rtx ++function_arg_ms_64 (const CUMULATIVE_ARGS *cum, machine_mode mode, ++ machine_mode orig_mode, bool named, const_tree type, ++ HOST_WIDE_INT bytes) ++{ ++ unsigned int regno; + +- tree old_optimize = build_optimization_node (&global_options); +- +- /* Get the optimization options of the current function. */ +- tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl); +- +- if (!func_optimize) +- func_optimize = old_optimize; +- +- /* Init func_options. 
*/ +- memset (&func_options, 0, sizeof (func_options)); +- init_options_struct (&func_options, NULL); +- lang_hooks.init_options_struct (&func_options); +- +- cl_optimization_restore (&func_options, +- TREE_OPTIMIZATION (func_optimize)); +- +- /* Initialize func_options to the default before its target options can +- be set. */ +- cl_target_option_restore (&func_options, +- TREE_TARGET_OPTION (target_option_default_node)); +- +- new_target = ix86_valid_target_attribute_tree (args, &func_options, +- &global_options_set); +- +- new_optimize = build_optimization_node (&func_options); +- +- if (new_target == error_mark_node) +- ret = false; +- +- else if (fndecl && new_target) +- { +- DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target; +- +- if (old_optimize != new_optimize) +- DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize; +- } +- +- finalize_options_struct (&func_options); +- +- return ret; +-} +- +- +-/* Hook to determine if one function can safely inline another. */ +- +-static bool +-ix86_can_inline_p (tree caller, tree callee) +-{ +- tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller); +- tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee); +- +- /* Changes of those flags can be tolerated for always inlines. Lets hope +- user knows what he is doing. */ +- const unsigned HOST_WIDE_INT always_inline_safe_mask +- = (MASK_USE_8BIT_IDIV | MASK_ACCUMULATE_OUTGOING_ARGS +- | MASK_NO_ALIGN_STRINGOPS | MASK_AVX256_SPLIT_UNALIGNED_LOAD +- | MASK_AVX256_SPLIT_UNALIGNED_STORE | MASK_CLD +- | MASK_NO_FANCY_MATH_387 | MASK_IEEE_FP | MASK_INLINE_ALL_STRINGOPS +- | MASK_INLINE_STRINGOPS_DYNAMICALLY | MASK_RECIP | MASK_STACK_PROBE +- | MASK_STV | MASK_TLS_DIRECT_SEG_REFS | MASK_VZEROUPPER +- | MASK_NO_PUSH_ARGS | MASK_OMIT_LEAF_FRAME_POINTER); +- +- +- if (!callee_tree) +- callee_tree = target_option_default_node; +- if (!caller_tree) +- caller_tree = target_option_default_node; +- if (callee_tree == caller_tree) +- return true; +- +- struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree); +- struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree); +- bool ret = false; +- bool always_inline +- = (DECL_DISREGARD_INLINE_LIMITS (callee) +- && lookup_attribute ("always_inline", +- DECL_ATTRIBUTES (callee))); +- +- cgraph_node *callee_node = cgraph_node::get (callee); +- /* Callee's isa options should be a subset of the caller's, i.e. a SSE4 +- function can inline a SSE2 function but a SSE2 function can't inline +- a SSE4 function. */ +- if (((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags) +- != callee_opts->x_ix86_isa_flags) +- || ((caller_opts->x_ix86_isa_flags2 & callee_opts->x_ix86_isa_flags2) +- != callee_opts->x_ix86_isa_flags2)) +- ret = false; +- +- /* See if we have the same non-isa options. */ +- else if ((!always_inline +- && caller_opts->x_target_flags != callee_opts->x_target_flags) +- || (caller_opts->x_target_flags & ~always_inline_safe_mask) +- != (callee_opts->x_target_flags & ~always_inline_safe_mask)) +- ret = false; +- +- /* See if arch, tune, etc. are the same. */ +- else if (caller_opts->arch != callee_opts->arch) +- ret = false; +- +- else if (!always_inline && caller_opts->tune != callee_opts->tune) +- ret = false; +- +- else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath +- /* If the calle doesn't use FP expressions differences in +- ix86_fpmath can be ignored. We are called from FEs +- for multi-versioning call optimization, so beware of +- ipa_fn_summaries not available. */ +- && (! 
ipa_fn_summaries +- || ipa_fn_summaries->get (callee_node) == NULL +- || ipa_fn_summaries->get (callee_node)->fp_expressions)) +- ret = false; +- +- else if (!always_inline +- && caller_opts->branch_cost != callee_opts->branch_cost) +- ret = false; +- +- else +- ret = true; +- +- return ret; +-} +- +- +-/* Remember the last target of ix86_set_current_function. */ +-static GTY(()) tree ix86_previous_fndecl; +- +-/* Set targets globals to the default (or current #pragma GCC target +- if active). Invalidate ix86_previous_fndecl cache. */ ++ /* We need to add clobber for MS_ABI->SYSV ABI calls in expand_call. ++ We use value of -2 to specify that current function call is MSABI. */ ++ if (mode == VOIDmode) ++ return GEN_INT (-2); + +-void +-ix86_reset_previous_fndecl (void) +-{ +- tree new_tree = target_option_current_node; +- cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree)); +- if (TREE_TARGET_GLOBALS (new_tree)) +- restore_target_globals (TREE_TARGET_GLOBALS (new_tree)); +- else if (new_tree == target_option_default_node) +- restore_target_globals (&default_target_globals); +- else +- TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts (); +- ix86_previous_fndecl = NULL_TREE; +-} ++ /* If we've run out of registers, it goes on the stack. */ ++ if (cum->nregs == 0) ++ return NULL_RTX; + +-/* Set the func_type field from the function FNDECL. */ ++ regno = x86_64_ms_abi_int_parameter_registers[cum->regno]; + +-static void +-ix86_set_func_type (tree fndecl) +-{ +- if (cfun->machine->func_type == TYPE_UNKNOWN) ++ /* Only floating point modes are passed in anything but integer regs. */ ++ if (TARGET_SSE && (mode == SFmode || mode == DFmode)) + { +- if (lookup_attribute ("interrupt", +- TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))) ++ if (named) + { +- if (ix86_function_naked (fndecl)) +- error_at (DECL_SOURCE_LOCATION (fndecl), +- "interrupt and naked attributes are not compatible"); +- +- int nargs = 0; +- for (tree arg = DECL_ARGUMENTS (fndecl); +- arg; +- arg = TREE_CHAIN (arg)) +- nargs++; +- cfun->machine->no_caller_saved_registers = true; +- cfun->machine->func_type +- = nargs == 2 ? TYPE_EXCEPTION : TYPE_INTERRUPT; +- +- ix86_optimize_mode_switching[X86_DIRFLAG] = 1; +- +- /* Only dwarf2out.c can handle -WORD(AP) as a pointer argument. */ +- if (write_symbols != NO_DEBUG && write_symbols != DWARF2_DEBUG) +- sorry ("only DWARF debug format is supported for interrupt " +- "service routine"); ++ if (type == NULL_TREE || !AGGREGATE_TYPE_P (type)) ++ regno = cum->regno + FIRST_SSE_REG; + } + else + { +- cfun->machine->func_type = TYPE_NORMAL; +- if (lookup_attribute ("no_caller_saved_registers", +- TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))) +- cfun->machine->no_caller_saved_registers = true; ++ rtx t1, t2; ++ ++ /* Unnamed floating parameters are passed in both the ++ SSE and integer registers. */ ++ t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG); ++ t2 = gen_rtx_REG (mode, regno); ++ t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx); ++ t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx); ++ return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2)); + } + } ++ /* Handle aggregated types passed in register. */ ++ if (orig_mode == BLKmode) ++ { ++ if (bytes > 0 && bytes <= 8) ++ mode = (bytes > 4 ? DImode : SImode); ++ if (mode == BLKmode) ++ mode = DImode; ++ } ++ ++ return gen_reg_or_parallel (mode, orig_mode, regno); + } + +-/* Set the indirect_branch_type field from the function FNDECL. */ ++/* Return where to put the arguments to a function. 
++ Return zero to push the argument on the stack, or a hard register in which to store the argument. + +-static void +-ix86_set_indirect_branch_type (tree fndecl) ++ ARG describes the argument while CUM gives information about the ++ preceding args and about the function being called. */ ++ ++static rtx ++ix86_function_arg (cumulative_args_t cum_v, const function_arg_info &arg) + { +- if (cfun->machine->indirect_branch_type == indirect_branch_unset) ++ CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); ++ machine_mode mode = arg.mode; ++ HOST_WIDE_INT bytes, words; ++ rtx reg; ++ ++ if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL) + { +- tree attr = lookup_attribute ("indirect_branch", +- DECL_ATTRIBUTES (fndecl)); +- if (attr != NULL) ++ gcc_assert (arg.type != NULL_TREE); ++ if (POINTER_TYPE_P (arg.type)) + { +- tree args = TREE_VALUE (attr); +- if (args == NULL) +- gcc_unreachable (); +- tree cst = TREE_VALUE (args); +- if (strcmp (TREE_STRING_POINTER (cst), "keep") == 0) +- cfun->machine->indirect_branch_type = indirect_branch_keep; +- else if (strcmp (TREE_STRING_POINTER (cst), "thunk") == 0) +- cfun->machine->indirect_branch_type = indirect_branch_thunk; +- else if (strcmp (TREE_STRING_POINTER (cst), "thunk-inline") == 0) +- cfun->machine->indirect_branch_type = indirect_branch_thunk_inline; +- else if (strcmp (TREE_STRING_POINTER (cst), "thunk-extern") == 0) +- cfun->machine->indirect_branch_type = indirect_branch_thunk_extern; +- else +- gcc_unreachable (); +- } +- else +- cfun->machine->indirect_branch_type = ix86_indirect_branch; +- +- /* -mcmodel=large is not compatible with -mindirect-branch=thunk +- nor -mindirect-branch=thunk-extern. */ +- if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC) +- && ((cfun->machine->indirect_branch_type +- == indirect_branch_thunk_extern) +- || (cfun->machine->indirect_branch_type +- == indirect_branch_thunk))) +- error ("%<-mindirect-branch=%s%> and %<-mcmodel=large%> are not " +- "compatible", +- ((cfun->machine->indirect_branch_type +- == indirect_branch_thunk_extern) +- ? "thunk-extern" : "thunk")); +- +- if (cfun->machine->indirect_branch_type != indirect_branch_keep +- && (flag_cf_protection & CF_RETURN)) +- error ("%<-mindirect-branch%> and %<-fcf-protection%> are not " +- "compatible"); +- } +- +- if (cfun->machine->function_return_type == indirect_branch_unset) +- { +- tree attr = lookup_attribute ("function_return", +- DECL_ATTRIBUTES (fndecl)); +- if (attr != NULL) +- { +- tree args = TREE_VALUE (attr); +- if (args == NULL) +- gcc_unreachable (); +- tree cst = TREE_VALUE (args); +- if (strcmp (TREE_STRING_POINTER (cst), "keep") == 0) +- cfun->machine->function_return_type = indirect_branch_keep; +- else if (strcmp (TREE_STRING_POINTER (cst), "thunk") == 0) +- cfun->machine->function_return_type = indirect_branch_thunk; +- else if (strcmp (TREE_STRING_POINTER (cst), "thunk-inline") == 0) +- cfun->machine->function_return_type = indirect_branch_thunk_inline; +- else if (strcmp (TREE_STRING_POINTER (cst), "thunk-extern") == 0) +- cfun->machine->function_return_type = indirect_branch_thunk_extern; +- else +- gcc_unreachable (); ++ /* This is the pointer argument. */ ++ gcc_assert (TYPE_MODE (arg.type) == Pmode); ++ /* It is at -WORD(AP) in the current frame in interrupt and ++ exception handlers. 
*/ ++ reg = plus_constant (Pmode, arg_pointer_rtx, -UNITS_PER_WORD); + } + else +- cfun->machine->function_return_type = ix86_function_return; +- +- /* -mcmodel=large is not compatible with -mfunction-return=thunk +- nor -mfunction-return=thunk-extern. */ +- if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC) +- && ((cfun->machine->function_return_type +- == indirect_branch_thunk_extern) +- || (cfun->machine->function_return_type +- == indirect_branch_thunk))) +- error ("%<-mfunction-return=%s%> and %<-mcmodel=large%> are not " +- "compatible", +- ((cfun->machine->function_return_type +- == indirect_branch_thunk_extern) +- ? "thunk-extern" : "thunk")); +- +- if (cfun->machine->function_return_type != indirect_branch_keep +- && (flag_cf_protection & CF_RETURN)) +- error ("%<-mfunction-return%> and %<-fcf-protection%> are not " +- "compatible"); +- } +-} +- +-/* Establish appropriate back-end context for processing the function +- FNDECL. The argument might be NULL to indicate processing at top +- level, outside of any function scope. */ +-static void +-ix86_set_current_function (tree fndecl) +-{ +- /* Only change the context if the function changes. This hook is called +- several times in the course of compiling a function, and we don't want to +- slow things down too much or call target_reinit when it isn't safe. */ +- if (fndecl == ix86_previous_fndecl) +- { +- /* There may be 2 function bodies for the same function FNDECL, +- one is extern inline and one isn't. Call ix86_set_func_type +- to set the func_type field. */ +- if (fndecl != NULL_TREE) + { +- ix86_set_func_type (fndecl); +- ix86_set_indirect_branch_type (fndecl); ++ gcc_assert (cfun->machine->func_type == TYPE_EXCEPTION ++ && TREE_CODE (arg.type) == INTEGER_TYPE ++ && TYPE_MODE (arg.type) == word_mode); ++ /* The error code is the word-mode integer argument at ++ -2 * WORD(AP) in the current frame of the exception ++ handler. */ ++ reg = gen_rtx_MEM (word_mode, ++ plus_constant (Pmode, ++ arg_pointer_rtx, ++ -2 * UNITS_PER_WORD)); + } +- return; +- } +- +- tree old_tree; +- if (ix86_previous_fndecl == NULL_TREE) +- old_tree = target_option_current_node; +- else if (DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)) +- old_tree = DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl); +- else +- old_tree = target_option_default_node; +- +- if (fndecl == NULL_TREE) +- { +- if (old_tree != target_option_current_node) +- ix86_reset_previous_fndecl (); +- return; ++ return reg; + } + +- ix86_set_func_type (fndecl); +- ix86_set_indirect_branch_type (fndecl); ++ bytes = arg.promoted_size_in_bytes (); ++ words = CEIL (bytes, UNITS_PER_WORD); + +- tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl); +- if (new_tree == NULL_TREE) +- new_tree = target_option_default_node; ++ /* To simplify the code below, represent vector types with a vector mode ++ even if MMX/SSE are not active. */ ++ if (arg.type && TREE_CODE (arg.type) == VECTOR_TYPE) ++ mode = type_natural_mode (arg.type, cum, false); + +- if (old_tree != new_tree) ++ if (TARGET_64BIT) + { +- cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree)); +- if (TREE_TARGET_GLOBALS (new_tree)) +- restore_target_globals (TREE_TARGET_GLOBALS (new_tree)); +- else if (new_tree == target_option_default_node) +- restore_target_globals (&default_target_globals); +- else +- TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts (); +- } +- ix86_previous_fndecl = fndecl; +- +- static bool prev_no_caller_saved_registers; ++ enum calling_abi call_abi = cum ? 
cum->call_abi : ix86_abi; + +- /* 64-bit MS and SYSV ABI have different set of call used registers. +- Avoid expensive re-initialization of init_regs each time we switch +- function context. */ +- if (TARGET_64BIT +- && (call_used_regs[SI_REG] +- == (cfun->machine->call_abi == MS_ABI))) +- reinit_regs (); +- /* Need to re-initialize init_regs if caller-saved registers are +- changed. */ +- else if (prev_no_caller_saved_registers +- != cfun->machine->no_caller_saved_registers) +- reinit_regs (); +- +- if (cfun->machine->func_type != TYPE_NORMAL +- || cfun->machine->no_caller_saved_registers) +- { +- /* Don't allow SSE, MMX nor x87 instructions since they +- may change processor state. */ +- const char *isa; +- if (TARGET_SSE) +- isa = "SSE"; +- else if (TARGET_MMX) +- isa = "MMX/3Dnow"; +- else if (TARGET_80387) +- isa = "80387"; ++ if (call_abi == MS_ABI) ++ reg = function_arg_ms_64 (cum, mode, arg.mode, arg.named, ++ arg.type, bytes); + else +- isa = NULL; +- if (isa != NULL) +- { +- if (cfun->machine->func_type != TYPE_NORMAL) +- sorry (cfun->machine->func_type == TYPE_EXCEPTION +- ? G_("%s instructions aren%'t allowed in an" +- " exception service routine") +- : G_("%s instructions aren%'t allowed in an" +- " interrupt service routine"), +- isa); +- else +- sorry ("%s instructions aren%'t allowed in a function with " +- "the % attribute", isa); +- /* Don't issue the same error twice. */ +- cfun->machine->func_type = TYPE_NORMAL; +- cfun->machine->no_caller_saved_registers = false; +- } ++ reg = function_arg_64 (cum, mode, arg.mode, arg.type, arg.named); + } ++ else ++ reg = function_arg_32 (cum, mode, arg.mode, arg.type, bytes, words); + +- prev_no_caller_saved_registers +- = cfun->machine->no_caller_saved_registers; ++ /* Track if there are outgoing arguments on stack. */ ++ if (reg == NULL_RTX && cum->caller) ++ cfun->machine->outgoing_args_on_stack = true; ++ ++ return reg; + } + +- +-/* Return true if this goes in large data/bss. */ ++/* A C expression that indicates when an argument must be passed by ++ reference. If nonzero for an argument, a copy of that argument is ++ made in memory and a pointer to the argument is passed instead of ++ the argument itself. The pointer is passed in whatever way is ++ appropriate for passing a pointer to that type. */ + + static bool +-ix86_in_large_data_p (tree exp) ++ix86_pass_by_reference (cumulative_args_t cum_v, const function_arg_info &arg) + { +- if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC) +- return false; ++ CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); + +- if (exp == NULL_TREE) +- return false; ++ if (TARGET_64BIT) ++ { ++ enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi; + +- /* Functions are never large data. */ +- if (TREE_CODE (exp) == FUNCTION_DECL) +- return false; ++ /* See Windows x64 Software Convention. */ ++ if (call_abi == MS_ABI) ++ { ++ HOST_WIDE_INT msize = GET_MODE_SIZE (arg.mode); + +- /* Automatic variables are never large data. */ +- if (VAR_P (exp) && !is_global_var (exp)) +- return false; ++ if (tree type = arg.type) ++ { ++ /* Arrays are passed by reference. 
*/ ++ if (TREE_CODE (type) == ARRAY_TYPE) ++ return true; + +- if (VAR_P (exp) && DECL_SECTION_NAME (exp)) +- { +- const char *section = DECL_SECTION_NAME (exp); +- if (strcmp (section, ".ldata") == 0 +- || strcmp (section, ".lbss") == 0) +- return true; +- return false; +- } +- else +- { +- HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp)); ++ if (RECORD_OR_UNION_TYPE_P (type)) ++ { ++ /* Structs/unions of sizes other than 8, 16, 32, or 64 bits ++ are passed by reference. */ ++ msize = int_size_in_bytes (type); ++ } ++ } + +- /* If this is an incomplete type with size 0, then we can't put it +- in data because it might be too big when completed. Also, +- int_size_in_bytes returns -1 if size can vary or is larger than +- an integer in which case also it is safer to assume that it goes in +- large data. */ +- if (size <= 0 || size > ix86_section_threshold) ++ /* __m128 is passed by reference. */ ++ return msize != 1 && msize != 2 && msize != 4 && msize != 8; ++ } ++ else if (arg.type && int_size_in_bytes (arg.type) == -1) + return true; + } + + return false; + } + +-/* i386-specific section flag to mark large sections. */ +-#define SECTION_LARGE SECTION_MACH_DEP +- +-/* Switch to the appropriate section for output of DECL. +- DECL is either a `VAR_DECL' node or a constant of some sort. +- RELOC indicates whether forming the initial value of DECL requires +- link-time relocations. */ ++/* Return true when TYPE should be 128bit aligned for 32bit argument ++ passing ABI. XXX: This function is obsolete and is only used for ++ checking psABI compatibility with previous versions of GCC. */ + +-ATTRIBUTE_UNUSED static section * +-x86_64_elf_select_section (tree decl, int reloc, +- unsigned HOST_WIDE_INT align) ++static bool ++ix86_compat_aligned_value_p (const_tree type) + { +- if (ix86_in_large_data_p (decl)) ++ machine_mode mode = TYPE_MODE (type); ++ if (((TARGET_SSE && SSE_REG_MODE_P (mode)) ++ || mode == TDmode ++ || mode == TFmode ++ || mode == TCmode) ++ && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128)) ++ return true; ++ if (TYPE_ALIGN (type) < 128) ++ return false; ++ ++ if (AGGREGATE_TYPE_P (type)) + { +- const char *sname = NULL; +- unsigned int flags = SECTION_WRITE | SECTION_LARGE; +- switch (categorize_decl_for_section (decl, reloc)) ++ /* Walk the aggregates recursively. */ ++ switch (TREE_CODE (type)) + { +- case SECCAT_DATA: +- sname = ".ldata"; +- break; +- case SECCAT_DATA_REL: +- sname = ".ldata.rel"; +- break; +- case SECCAT_DATA_REL_LOCAL: +- sname = ".ldata.rel.local"; +- break; +- case SECCAT_DATA_REL_RO: +- sname = ".ldata.rel.ro"; +- break; +- case SECCAT_DATA_REL_RO_LOCAL: +- sname = ".ldata.rel.ro.local"; +- break; +- case SECCAT_BSS: +- sname = ".lbss"; +- flags |= SECTION_BSS; +- break; +- case SECCAT_RODATA: +- case SECCAT_RODATA_MERGE_STR: +- case SECCAT_RODATA_MERGE_STR_INIT: +- case SECCAT_RODATA_MERGE_CONST: +- sname = ".lrodata"; +- flags &= ~SECTION_WRITE; ++ case RECORD_TYPE: ++ case UNION_TYPE: ++ case QUAL_UNION_TYPE: ++ { ++ tree field; ++ ++ /* Walk all the structure fields. */ ++ for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field)) ++ { ++ if (TREE_CODE (field) == FIELD_DECL ++ && ix86_compat_aligned_value_p (TREE_TYPE (field))) ++ return true; ++ } ++ break; ++ } ++ ++ case ARRAY_TYPE: ++ /* Just for use if some languages passes arrays by value. 
*/ ++ if (ix86_compat_aligned_value_p (TREE_TYPE (type))) ++ return true; + break; +- case SECCAT_SRODATA: +- case SECCAT_SDATA: +- case SECCAT_SBSS: ++ ++ default: + gcc_unreachable (); +- case SECCAT_TEXT: +- case SECCAT_TDATA: +- case SECCAT_TBSS: +- /* We don't split these for medium model. Place them into +- default sections and hope for best. */ +- break; +- } +- if (sname) +- { +- /* We might get called with string constants, but get_named_section +- doesn't like them as they are not DECLs. Also, we need to set +- flags in that case. */ +- if (!DECL_P (decl)) +- return get_section (sname, flags, NULL); +- return get_named_section (decl, sname, reloc); + } + } +- return default_elf_select_section (decl, reloc, align); ++ return false; + } + +-/* Select a set of attributes for section NAME based on the properties +- of DECL and whether or not RELOC indicates that DECL's initializer +- might contain runtime relocations. */ ++/* Return the alignment boundary for MODE and TYPE with alignment ALIGN. ++ XXX: This function is obsolete and is only used for checking psABI ++ compatibility with previous versions of GCC. */ + +-static unsigned int ATTRIBUTE_UNUSED +-x86_64_elf_section_type_flags (tree decl, const char *name, int reloc) ++static unsigned int ++ix86_compat_function_arg_boundary (machine_mode mode, ++ const_tree type, unsigned int align) + { +- unsigned int flags = default_section_type_flags (decl, name, reloc); ++ /* In 32bit, only _Decimal128 and __float128 are aligned to their ++ natural boundaries. */ ++ if (!TARGET_64BIT && mode != TDmode && mode != TFmode) ++ { ++ /* i386 ABI defines all arguments to be 4 byte aligned. We have to ++ make an exception for SSE modes since these require 128bit ++ alignment. + +- if (ix86_in_large_data_p (decl)) +- flags |= SECTION_LARGE; ++ The handling here differs from field_alignment. ICC aligns MMX ++ arguments to 4 byte boundaries, while structure fields are aligned ++ to 8 byte boundaries. */ ++ if (!type) ++ { ++ if (!(TARGET_SSE && SSE_REG_MODE_P (mode))) ++ align = PARM_BOUNDARY; ++ } ++ else ++ { ++ if (!ix86_compat_aligned_value_p (type)) ++ align = PARM_BOUNDARY; ++ } ++ } ++ if (align > BIGGEST_ALIGNMENT) ++ align = BIGGEST_ALIGNMENT; ++ return align; ++} + +- if (decl == NULL_TREE +- && (strcmp (name, ".ldata.rel.ro") == 0 +- || strcmp (name, ".ldata.rel.ro.local") == 0)) +- flags |= SECTION_RELRO; ++/* Return true when TYPE should be 128bit aligned for 32bit argument ++ passing ABI. */ + +- if (strcmp (name, ".lbss") == 0 +- || strncmp (name, ".lbss.", 5) == 0 +- || strncmp (name, ".gnu.linkonce.lb.", 16) == 0) +- flags |= SECTION_BSS; ++static bool ++ix86_contains_aligned_value_p (const_tree type) ++{ ++ machine_mode mode = TYPE_MODE (type); + +- return flags; +-} ++ if (mode == XFmode || mode == XCmode) ++ return false; + +-/* Build up a unique section name, expressed as a +- STRING_CST node, and assign it to DECL_SECTION_NAME (decl). +- RELOC indicates whether the initial value of EXP requires +- link-time relocations. */ ++ if (TYPE_ALIGN (type) < 128) ++ return false; + +-static void ATTRIBUTE_UNUSED +-x86_64_elf_unique_section (tree decl, int reloc) +-{ +- if (ix86_in_large_data_p (decl)) ++ if (AGGREGATE_TYPE_P (type)) + { +- const char *prefix = NULL; +- /* We only need to use .gnu.linkonce if we don't have COMDAT groups. 
*/ +- bool one_only = DECL_COMDAT_GROUP (decl) && !HAVE_COMDAT_GROUP; +- +- switch (categorize_decl_for_section (decl, reloc)) +- { +- case SECCAT_DATA: +- case SECCAT_DATA_REL: +- case SECCAT_DATA_REL_LOCAL: +- case SECCAT_DATA_REL_RO: +- case SECCAT_DATA_REL_RO_LOCAL: +- prefix = one_only ? ".ld" : ".ldata"; +- break; +- case SECCAT_BSS: +- prefix = one_only ? ".lb" : ".lbss"; +- break; +- case SECCAT_RODATA: +- case SECCAT_RODATA_MERGE_STR: +- case SECCAT_RODATA_MERGE_STR_INIT: +- case SECCAT_RODATA_MERGE_CONST: +- prefix = one_only ? ".lr" : ".lrodata"; +- break; +- case SECCAT_SRODATA: +- case SECCAT_SDATA: +- case SECCAT_SBSS: +- gcc_unreachable (); +- case SECCAT_TEXT: +- case SECCAT_TDATA: +- case SECCAT_TBSS: +- /* We don't split these for medium model. Place them into +- default sections and hope for best. */ +- break; +- } +- if (prefix) ++ /* Walk the aggregates recursively. */ ++ switch (TREE_CODE (type)) + { +- const char *name, *linkonce; +- char *string; +- +- name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)); +- name = targetm.strip_name_encoding (name); ++ case RECORD_TYPE: ++ case UNION_TYPE: ++ case QUAL_UNION_TYPE: ++ { ++ tree field; + +- /* If we're using one_only, then there needs to be a .gnu.linkonce +- prefix to the section name. */ +- linkonce = one_only ? ".gnu.linkonce" : ""; ++ /* Walk all the structure fields. */ ++ for (field = TYPE_FIELDS (type); ++ field; ++ field = DECL_CHAIN (field)) ++ { ++ if (TREE_CODE (field) == FIELD_DECL ++ && ix86_contains_aligned_value_p (TREE_TYPE (field))) ++ return true; ++ } ++ break; ++ } + +- string = ACONCAT ((linkonce, prefix, ".", name, NULL)); ++ case ARRAY_TYPE: ++ /* Just for use if some languages passes arrays by value. */ ++ if (ix86_contains_aligned_value_p (TREE_TYPE (type))) ++ return true; ++ break; + +- set_decl_section_name (decl, string); +- return; ++ default: ++ gcc_unreachable (); + } + } +- default_unique_section (decl, reloc); +-} +- +-#ifdef COMMON_ASM_OP ++ else ++ return TYPE_ALIGN (type) >= 128; + +-#ifndef LARGECOMM_SECTION_ASM_OP +-#define LARGECOMM_SECTION_ASM_OP "\t.largecomm\t" +-#endif ++ return false; ++} + +-/* This says how to output assembler code to declare an +- uninitialized external linkage data object. ++/* Gives the alignment boundary, in bits, of an argument with the ++ specified mode and type. */ + +- For medium model x86-64 we need to use LARGECOMM_SECTION_ASM_OP opcode for +- large objects. */ +-void +-x86_elf_aligned_decl_common (FILE *file, tree decl, +- const char *name, unsigned HOST_WIDE_INT size, +- int align) ++static unsigned int ++ix86_function_arg_boundary (machine_mode mode, const_tree type) + { +- if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC) +- && size > (unsigned int)ix86_section_threshold) ++ unsigned int align; ++ if (type) + { +- switch_to_section (get_named_section (decl, ".lbss", 0)); +- fputs (LARGECOMM_SECTION_ASM_OP, file); ++ /* Since the main variant type is used for call, we convert it to ++ the main variant type. 
*/ ++ type = TYPE_MAIN_VARIANT (type); ++ align = TYPE_ALIGN (type); ++ if (TYPE_EMPTY_P (type)) ++ return PARM_BOUNDARY; + } + else +- fputs (COMMON_ASM_OP, file); +- assemble_name (file, name); +- fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n", +- size, align / BITS_PER_UNIT); +-} +-#endif ++ align = GET_MODE_ALIGNMENT (mode); ++ if (align < PARM_BOUNDARY) ++ align = PARM_BOUNDARY; ++ else ++ { ++ static bool warned; ++ unsigned int saved_align = align; + +-/* Utility function for targets to use in implementing +- ASM_OUTPUT_ALIGNED_BSS. */ ++ if (!TARGET_64BIT) ++ { ++ /* i386 ABI defines XFmode arguments to be 4 byte aligned. */ ++ if (!type) ++ { ++ if (mode == XFmode || mode == XCmode) ++ align = PARM_BOUNDARY; ++ } ++ else if (!ix86_contains_aligned_value_p (type)) ++ align = PARM_BOUNDARY; + +-void +-x86_output_aligned_bss (FILE *file, tree decl, const char *name, +- unsigned HOST_WIDE_INT size, int align) +-{ +- if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC) +- && size > (unsigned int)ix86_section_threshold) +- switch_to_section (get_named_section (decl, ".lbss", 0)); +- else +- switch_to_section (bss_section); +- ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT)); +-#ifdef ASM_DECLARE_OBJECT_NAME +- last_assemble_variable_decl = decl; +- ASM_DECLARE_OBJECT_NAME (file, name, decl); +-#else +- /* Standard thing is just output label for the object. */ +- ASM_OUTPUT_LABEL (file, name); +-#endif /* ASM_DECLARE_OBJECT_NAME */ +- ASM_OUTPUT_SKIP (file, size ? size : 1); +-} +- +-/* Decide whether we must probe the stack before any space allocation +- on this target. It's essentially TARGET_STACK_PROBE except when +- -fstack-check causes the stack to be already probed differently. */ ++ if (align < 128) ++ align = PARM_BOUNDARY; ++ } + +-bool +-ix86_target_stack_probe (void) +-{ +- /* Do not probe the stack twice if static stack checking is enabled. */ +- if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK) +- return false; ++ if (warn_psabi ++ && !warned ++ && align != ix86_compat_function_arg_boundary (mode, type, ++ saved_align)) ++ { ++ warned = true; ++ inform (input_location, ++ "the ABI for passing parameters with %d-byte" ++ " alignment has changed in GCC 4.6", ++ align / BITS_PER_UNIT); ++ } ++ } + +- return TARGET_STACK_PROBE; ++ return align; + } +- +-/* Decide whether we can make a sibling call to a function. DECL is the +- declaration of the function being targeted by the call and EXP is the +- CALL_EXPR representing the call. */ ++ ++/* Return true if N is a possible register number of function value. */ + + static bool +-ix86_function_ok_for_sibcall (tree decl, tree exp) ++ix86_function_value_regno_p (const unsigned int regno) + { +- tree type, decl_or_type; +- rtx a, b; +- bool bind_global = decl && !targetm.binds_local_p (decl); +- +- if (ix86_function_naked (current_function_decl)) +- return false; +- +- /* Sibling call isn't OK if there are no caller-saved registers +- since all registers must be preserved before return. */ +- if (cfun->machine->no_caller_saved_registers) +- return false; +- +- /* If we are generating position-independent code, we cannot sibcall +- optimize direct calls to global functions, as the PLT requires +- %ebx be live. (Darwin does not have a PLT.) */ +- if (!TARGET_MACHO +- && !TARGET_64BIT +- && flag_pic +- && flag_plt +- && bind_global) +- return false; +- +- /* If we need to align the outgoing stack, then sibcalling would +- unalign the stack, which may break the called function. 
*/ +- if (ix86_minimum_incoming_stack_boundary (true) +- < PREFERRED_STACK_BOUNDARY) +- return false; +- +- if (decl) +- { +- decl_or_type = decl; +- type = TREE_TYPE (decl); +- } +- else ++ switch (regno) + { +- /* We're looking at the CALL_EXPR, we need the type of the function. */ +- type = CALL_EXPR_FN (exp); /* pointer expression */ +- type = TREE_TYPE (type); /* pointer type */ +- type = TREE_TYPE (type); /* function type */ +- decl_or_type = type; +- } ++ case AX_REG: ++ return true; ++ case DX_REG: ++ return (!TARGET_64BIT || ix86_cfun_abi () != MS_ABI); ++ case DI_REG: ++ case SI_REG: ++ return TARGET_64BIT && ix86_cfun_abi () != MS_ABI; + +- /* Check that the return value locations are the same. Like +- if we are returning floats on the 80387 register stack, we cannot +- make a sibcall from a function that doesn't return a float to a +- function that does or, conversely, from a function that does return +- a float to a function that doesn't; the necessary stack adjustment +- would not be executed. This is also the place we notice +- differences in the return value ABI. Note that it is ok for one +- of the functions to have void return type as long as the return +- value of the other is passed in a register. */ +- a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false); +- b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)), +- cfun->decl, false); +- if (STACK_REG_P (a) || STACK_REG_P (b)) +- { +- if (!rtx_equal_p (a, b)) ++ /* Complex values are returned in %st(0)/%st(1) pair. */ ++ case ST0_REG: ++ case ST1_REG: ++ /* TODO: The function should depend on current function ABI but ++ builtins.c would need updating then. Therefore we use the ++ default ABI. */ ++ if (TARGET_64BIT && ix86_cfun_abi () == MS_ABI) + return false; +- } +- else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl)))) +- ; +- else if (!rtx_equal_p (a, b)) +- return false; ++ return TARGET_FLOAT_RETURNS_IN_80387; + +- if (TARGET_64BIT) +- { +- /* The SYSV ABI has more call-clobbered registers; +- disallow sibcalls from MS to SYSV. */ +- if (cfun->machine->call_abi == MS_ABI +- && ix86_function_type_abi (type) == SYSV_ABI) +- return false; +- } +- else +- { +- /* If this call is indirect, we'll need to be able to use a +- call-clobbered register for the address of the target function. +- Make sure that all such registers are not used for passing +- parameters. Note that DLLIMPORT functions and call to global +- function via GOT slot are indirect. */ +- if (!decl +- || (bind_global && flag_pic && !flag_plt) +- || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)) +- || flag_force_indirect_call) +- { +- /* Check if regparm >= 3 since arg_reg_available is set to +- false if regparm == 0. If regparm is 1 or 2, there is +- always a call-clobbered register available. ++ /* Complex values are returned in %xmm0/%xmm1 pair. */ ++ case XMM0_REG: ++ case XMM1_REG: ++ return TARGET_SSE; + +- ??? The symbol indirect call doesn't need a call-clobbered +- register. But we don't know if this is a symbol indirect +- call or not here. */ +- if (ix86_function_regparm (type, decl) >= 3 +- && !cfun->machine->arg_reg_available) +- return false; +- } ++ case MM0_REG: ++ if (TARGET_MACHO || TARGET_64BIT) ++ return false; ++ return TARGET_MMX; + } + +- /* Otherwise okay. That also includes certain types of indirect calls. 
*/ +- return true; ++ return false; + } + +-/* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall", +- and "sseregparm" calling convention attributes; +- arguments as in struct attribute_spec.handler. */ ++/* Define how to find the value returned by a function. ++ VALTYPE is the data type of the value (as a tree). ++ If the precise function being called is known, FUNC is its FUNCTION_DECL; ++ otherwise, FUNC is 0. */ + +-static tree +-ix86_handle_cconv_attribute (tree *node, tree name, tree args, int, +- bool *no_add_attrs) ++static rtx ++function_value_32 (machine_mode orig_mode, machine_mode mode, ++ const_tree fntype, const_tree fn) + { +- if (TREE_CODE (*node) != FUNCTION_TYPE +- && TREE_CODE (*node) != METHOD_TYPE +- && TREE_CODE (*node) != FIELD_DECL +- && TREE_CODE (*node) != TYPE_DECL) +- { +- warning (OPT_Wattributes, "%qE attribute only applies to functions", +- name); +- *no_add_attrs = true; +- return NULL_TREE; +- } +- +- /* Can combine regparm with all attributes but fastcall, and thiscall. */ +- if (is_attribute_p ("regparm", name)) +- { +- tree cst; ++ unsigned int regno; + +- if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node))) +- { +- error ("fastcall and regparm attributes are not compatible"); +- } ++ /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where ++ we normally prevent this case when mmx is not available. However ++ some ABIs may require the result to be returned like DImode. */ ++ if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8) ++ regno = FIRST_MMX_REG; + +- if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node))) +- { +- error ("regparam and thiscall attributes are not compatible"); +- } ++ /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where ++ we prevent this case when sse is not available. However some ABIs ++ may require the result to be returned like integer TImode. */ ++ else if (mode == TImode ++ || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16)) ++ regno = FIRST_SSE_REG; + +- cst = TREE_VALUE (args); +- if (TREE_CODE (cst) != INTEGER_CST) +- { +- warning (OPT_Wattributes, +- "%qE attribute requires an integer constant argument", +- name); +- *no_add_attrs = true; +- } +- else if (compare_tree_int (cst, REGPARM_MAX) > 0) +- { +- warning (OPT_Wattributes, "argument to %qE attribute larger than %d", +- name, REGPARM_MAX); +- *no_add_attrs = true; +- } ++ /* 32-byte vector modes in %ymm0. */ ++ else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32) ++ regno = FIRST_SSE_REG; + +- return NULL_TREE; +- } ++ /* 64-byte vector modes in %zmm0. */ ++ else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64) ++ regno = FIRST_SSE_REG; + +- if (TARGET_64BIT) +- { +- /* Do not warn when emulating the MS ABI. */ +- if ((TREE_CODE (*node) != FUNCTION_TYPE +- && TREE_CODE (*node) != METHOD_TYPE) +- || ix86_function_type_abi (*node) != MS_ABI) +- warning (OPT_Wattributes, "%qE attribute ignored", +- name); +- *no_add_attrs = true; +- return NULL_TREE; +- } ++ /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */ ++ else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387) ++ regno = FIRST_FLOAT_REG; ++ else ++ /* Most things go in %eax. */ ++ regno = AX_REG; + +- /* Can combine fastcall with stdcall (redundant) and sseregparm. */ +- if (is_attribute_p ("fastcall", name)) ++ /* Override FP return register with %xmm0 for local functions when ++ SSE math is enabled or for functions with sseregparm attribute. 
*/ ++ if ((fn || fntype) && (mode == SFmode || mode == DFmode)) + { +- if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node))) +- { +- error ("fastcall and cdecl attributes are not compatible"); +- } +- if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node))) +- { +- error ("fastcall and stdcall attributes are not compatible"); +- } +- if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node))) +- { +- error ("fastcall and regparm attributes are not compatible"); +- } +- if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node))) ++ int sse_level = ix86_function_sseregparm (fntype, fn, false); ++ if (sse_level == -1) + { +- error ("fastcall and thiscall attributes are not compatible"); ++ error ("calling %qD with SSE calling convention without " ++ "SSE/SSE2 enabled", fn); ++ sorry ("this is a GCC bug that can be worked around by adding " ++ "attribute used to function called"); + } ++ else if ((sse_level >= 1 && mode == SFmode) ++ || (sse_level == 2 && mode == DFmode)) ++ regno = FIRST_SSE_REG; + } + +- /* Can combine stdcall with fastcall (redundant), regparm and +- sseregparm. */ +- else if (is_attribute_p ("stdcall", name)) +- { +- if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node))) +- { +- error ("stdcall and cdecl attributes are not compatible"); +- } +- if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node))) +- { +- error ("stdcall and fastcall attributes are not compatible"); +- } +- if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node))) +- { +- error ("stdcall and thiscall attributes are not compatible"); +- } +- } ++ /* OImode shouldn't be used directly. */ ++ gcc_assert (mode != OImode); ++ ++ return gen_rtx_REG (orig_mode, regno); ++} ++ ++static rtx ++function_value_64 (machine_mode orig_mode, machine_mode mode, ++ const_tree valtype) ++{ ++ rtx ret; + +- /* Can combine cdecl with regparm and sseregparm. */ +- else if (is_attribute_p ("cdecl", name)) ++ /* Handle libcalls, which don't provide a type node. */ ++ if (valtype == NULL) + { +- if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node))) +- { +- error ("stdcall and cdecl attributes are not compatible"); +- } +- if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node))) +- { +- error ("fastcall and cdecl attributes are not compatible"); +- } +- if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node))) ++ unsigned int regno; ++ ++ switch (mode) + { +- error ("cdecl and thiscall attributes are not compatible"); ++ case E_SFmode: ++ case E_SCmode: ++ case E_DFmode: ++ case E_DCmode: ++ case E_TFmode: ++ case E_SDmode: ++ case E_DDmode: ++ case E_TDmode: ++ regno = FIRST_SSE_REG; ++ break; ++ case E_XFmode: ++ case E_XCmode: ++ regno = FIRST_FLOAT_REG; ++ break; ++ case E_TCmode: ++ return NULL; ++ default: ++ regno = AX_REG; + } ++ ++ return gen_rtx_REG (mode, regno); + } +- else if (is_attribute_p ("thiscall", name)) ++ else if (POINTER_TYPE_P (valtype)) + { +- if (TREE_CODE (*node) != METHOD_TYPE && pedantic) +- warning (OPT_Wattributes, "%qE attribute is used for non-class method", +- name); +- if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node))) +- { +- error ("stdcall and thiscall attributes are not compatible"); +- } +- if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node))) +- { +- error ("fastcall and thiscall attributes are not compatible"); +- } +- if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node))) +- { +- error ("cdecl and thiscall attributes are not compatible"); +- } ++ /* Pointers are always returned in word_mode. 
*/ ++ mode = word_mode; + } + +- /* Can combine sseregparm with all attributes. */ ++ ret = construct_container (mode, orig_mode, valtype, 1, ++ X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX, ++ x86_64_int_return_registers, 0); + +- return NULL_TREE; +-} ++ /* For zero sized structures, construct_container returns NULL, but we ++ need to keep rest of compiler happy by returning meaningful value. */ ++ if (!ret) ++ ret = gen_rtx_REG (orig_mode, AX_REG); + +-/* The transactional memory builtins are implicitly regparm or fastcall +- depending on the ABI. Override the generic do-nothing attribute that +- these builtins were declared with, and replace it with one of the two +- attributes that we expect elsewhere. */ ++ return ret; ++} + +-static tree +-ix86_handle_tm_regparm_attribute (tree *node, tree, tree, +- int flags, bool *no_add_attrs) ++static rtx ++function_value_ms_32 (machine_mode orig_mode, machine_mode mode, ++ const_tree fntype, const_tree fn, const_tree valtype) + { +- tree alt; ++ unsigned int regno; + +- /* In no case do we want to add the placeholder attribute. */ +- *no_add_attrs = true; ++ /* Floating point return values in %st(0) ++ (unless -mno-fp-ret-in-387 or aggregate type of up to 8 bytes). */ ++ if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387 ++ && (GET_MODE_SIZE (mode) > 8 ++ || valtype == NULL_TREE || !AGGREGATE_TYPE_P (valtype))) ++ { ++ regno = FIRST_FLOAT_REG; ++ return gen_rtx_REG (orig_mode, regno); ++ } ++ else ++ return function_value_32(orig_mode, mode, fntype,fn); ++} + +- /* The 64-bit ABI is unchanged for transactional memory. */ +- if (TARGET_64BIT) +- return NULL_TREE; ++static rtx ++function_value_ms_64 (machine_mode orig_mode, machine_mode mode, ++ const_tree valtype) ++{ ++ unsigned int regno = AX_REG; + +- /* ??? Is there a better way to validate 32-bit windows? We have +- cfun->machine->call_abi, but that seems to be set only for 64-bit. */ +- if (CHECK_STACK_LIMIT > 0) +- alt = tree_cons (get_identifier ("fastcall"), NULL, NULL); +- else ++ if (TARGET_SSE) + { +- alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL); +- alt = tree_cons (get_identifier ("regparm"), alt, NULL); ++ switch (GET_MODE_SIZE (mode)) ++ { ++ case 16: ++ if (valtype != NULL_TREE ++ && !VECTOR_INTEGER_TYPE_P (valtype) ++ && !VECTOR_INTEGER_TYPE_P (valtype) ++ && !INTEGRAL_TYPE_P (valtype) ++ && !VECTOR_FLOAT_TYPE_P (valtype)) ++ break; ++ if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode)) ++ && !COMPLEX_MODE_P (mode)) ++ regno = FIRST_SSE_REG; ++ break; ++ case 8: ++ case 4: ++ if (valtype != NULL_TREE && AGGREGATE_TYPE_P (valtype)) ++ break; ++ if (mode == SFmode || mode == DFmode) ++ regno = FIRST_SSE_REG; ++ break; ++ default: ++ break; ++ } + } +- decl_attributes (node, alt, flags); +- +- return NULL_TREE; ++ return gen_rtx_REG (orig_mode, regno); + } + +-/* This function determines from TYPE the calling-convention. */ +- +-unsigned int +-ix86_get_callcvt (const_tree type) ++static rtx ++ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl, ++ machine_mode orig_mode, machine_mode mode) + { +- unsigned int ret = 0; +- bool is_stdarg; +- tree attrs; +- +- if (TARGET_64BIT) +- return IX86_CALLCVT_CDECL; ++ const_tree fn, fntype; + +- attrs = TYPE_ATTRIBUTES (type); +- if (attrs != NULL_TREE) ++ fn = NULL_TREE; ++ if (fntype_or_decl && DECL_P (fntype_or_decl)) ++ fn = fntype_or_decl; ++ fntype = fn ? 
TREE_TYPE (fn) : fntype_or_decl; ++ ++ if (ix86_function_type_abi (fntype) == MS_ABI) + { +- if (lookup_attribute ("cdecl", attrs)) +- ret |= IX86_CALLCVT_CDECL; +- else if (lookup_attribute ("stdcall", attrs)) +- ret |= IX86_CALLCVT_STDCALL; +- else if (lookup_attribute ("fastcall", attrs)) +- ret |= IX86_CALLCVT_FASTCALL; +- else if (lookup_attribute ("thiscall", attrs)) +- ret |= IX86_CALLCVT_THISCALL; +- +- /* Regparam isn't allowed for thiscall and fastcall. */ +- if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0) +- { +- if (lookup_attribute ("regparm", attrs)) +- ret |= IX86_CALLCVT_REGPARM; +- if (lookup_attribute ("sseregparm", attrs)) +- ret |= IX86_CALLCVT_SSEREGPARM; +- } +- +- if (IX86_BASE_CALLCVT(ret) != 0) +- return ret; ++ if (TARGET_64BIT) ++ return function_value_ms_64 (orig_mode, mode, valtype); ++ else ++ return function_value_ms_32 (orig_mode, mode, fntype, fn, valtype); + } ++ else if (TARGET_64BIT) ++ return function_value_64 (orig_mode, mode, valtype); ++ else ++ return function_value_32 (orig_mode, mode, fntype, fn); ++} + +- is_stdarg = stdarg_p (type); +- if (TARGET_RTD && !is_stdarg) +- return IX86_CALLCVT_STDCALL | ret; +- +- if (ret != 0 +- || is_stdarg +- || TREE_CODE (type) != METHOD_TYPE +- || ix86_function_type_abi (type) != MS_ABI) +- return IX86_CALLCVT_CDECL | ret; ++static rtx ++ix86_function_value (const_tree valtype, const_tree fntype_or_decl, bool) ++{ ++ machine_mode mode, orig_mode; + +- return IX86_CALLCVT_THISCALL; ++ orig_mode = TYPE_MODE (valtype); ++ mode = type_natural_mode (valtype, NULL, true); ++ return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode); + } + +-/* Return 0 if the attributes for two types are incompatible, 1 if they +- are compatible, and 2 if they are nearly compatible (which causes a +- warning to be generated). */ ++/* Pointer function arguments and return values are promoted to ++ word_mode for normal functions. */ + +-static int +-ix86_comp_type_attributes (const_tree type1, const_tree type2) ++static machine_mode ++ix86_promote_function_mode (const_tree type, machine_mode mode, ++ int *punsignedp, const_tree fntype, ++ int for_return) + { +- unsigned int ccvt1, ccvt2; +- +- if (TREE_CODE (type1) != FUNCTION_TYPE +- && TREE_CODE (type1) != METHOD_TYPE) +- return 1; ++ if (cfun->machine->func_type == TYPE_NORMAL ++ && type != NULL_TREE ++ && POINTER_TYPE_P (type)) ++ { ++ *punsignedp = POINTERS_EXTEND_UNSIGNED; ++ return word_mode; ++ } ++ return default_promote_function_mode (type, mode, punsignedp, fntype, ++ for_return); ++} + +- ccvt1 = ix86_get_callcvt (type1); +- ccvt2 = ix86_get_callcvt (type2); +- if (ccvt1 != ccvt2) +- return 0; +- if (ix86_function_regparm (type1, NULL) +- != ix86_function_regparm (type2, NULL)) +- return 0; ++/* Return true if a structure, union or array with MODE containing FIELD ++ should be accessed using BLKmode. */ + +- return 1; ++static bool ++ix86_member_type_forces_blk (const_tree field, machine_mode mode) ++{ ++ /* Union with XFmode must be in BLKmode. */ ++ return (mode == XFmode ++ && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE ++ || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE)); + } +- +-/* Return the regparm value for a function with the indicated TYPE and DECL. +- DECL may be NULL when calling function indirectly +- or considering a libcall. 
*/ + +-static int +-ix86_function_regparm (const_tree type, const_tree decl) ++rtx ++ix86_libcall_value (machine_mode mode) + { +- tree attr; +- int regparm; +- unsigned int ccvt; ++ return ix86_function_value_1 (NULL, NULL, mode, mode); ++} + +- if (TARGET_64BIT) +- return (ix86_function_type_abi (type) == SYSV_ABI +- ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX); +- ccvt = ix86_get_callcvt (type); +- regparm = ix86_regparm; ++/* Return true iff type is returned in memory. */ + +- if ((ccvt & IX86_CALLCVT_REGPARM) != 0) ++static bool ++ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED) ++{ ++#ifdef SUBTARGET_RETURN_IN_MEMORY ++ return SUBTARGET_RETURN_IN_MEMORY (type, fntype); ++#else ++ const machine_mode mode = type_natural_mode (type, NULL, true); ++ HOST_WIDE_INT size; ++ ++ if (TARGET_64BIT) + { +- attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type)); +- if (attr) ++ if (ix86_function_type_abi (fntype) == MS_ABI) + { +- regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))); +- return regparm; +- } +- } +- else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0) +- return 2; +- else if ((ccvt & IX86_CALLCVT_THISCALL) != 0) +- return 1; ++ size = int_size_in_bytes (type); + +- /* Use register calling convention for local functions when possible. */ +- if (decl +- && TREE_CODE (decl) == FUNCTION_DECL) +- { +- cgraph_node *target = cgraph_node::get (decl); +- if (target) +- target = target->function_symbol (); ++ /* __m128 is returned in xmm0. */ ++ if ((!type || VECTOR_INTEGER_TYPE_P (type) ++ || INTEGRAL_TYPE_P (type) ++ || VECTOR_FLOAT_TYPE_P (type)) ++ && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode)) ++ && !COMPLEX_MODE_P (mode) ++ && (GET_MODE_SIZE (mode) == 16 || size == 16)) ++ return false; + +- /* Caller and callee must agree on the calling convention, so +- checking here just optimize means that with +- __attribute__((optimize (...))) caller could use regparm convention +- and callee not, or vice versa. Instead look at whether the callee +- is optimized or not. */ +- if (target && opt_for_fn (target->decl, optimize) +- && !(profile_flag && !flag_fentry)) ++ /* Otherwise, the size must be exactly in [1248]. */ ++ return size != 1 && size != 2 && size != 4 && size != 8; ++ } ++ else + { +- cgraph_local_info *i = &target->local; +- if (i && i->local && i->can_change_signature) +- { +- int local_regparm, globals = 0, regno; +- +- /* Make sure no regparm register is taken by a +- fixed register variable. */ +- for (local_regparm = 0; local_regparm < REGPARM_MAX; +- local_regparm++) +- if (fixed_regs[local_regparm]) +- break; ++ int needed_intregs, needed_sseregs; + +- /* We don't want to use regparm(3) for nested functions as +- these use a static chain pointer in the third argument. */ +- if (local_regparm == 3 && DECL_STATIC_CHAIN (target->decl)) +- local_regparm = 2; ++ return examine_argument (mode, type, 1, ++ &needed_intregs, &needed_sseregs); ++ } ++ } ++ else ++ { ++ size = int_size_in_bytes (type); + +- /* Save a register for the split stack. */ +- if (flag_split_stack) +- { +- if (local_regparm == 3) +- local_regparm = 2; +- else if (local_regparm == 2 +- && DECL_STATIC_CHAIN (target->decl)) +- local_regparm = 1; +- } ++ /* Intel MCU psABI returns scalars and aggregates no larger than 8 ++ bytes in registers. */ ++ if (TARGET_IAMCU) ++ return VECTOR_MODE_P (mode) || size < 0 || size > 8; + +- /* Each fixed register usage increases register pressure, +- so less registers should be used for argument passing. 
+- This functionality can be overriden by an explicit +- regparm value. */ +- for (regno = AX_REG; regno <= DI_REG; regno++) +- if (fixed_regs[regno]) +- globals++; ++ if (mode == BLKmode) ++ return true; + +- local_regparm +- = globals < local_regparm ? local_regparm - globals : 0; ++ if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8) ++ return false; + +- if (local_regparm > regparm) +- regparm = local_regparm; +- } +- } +- } ++ if (VECTOR_MODE_P (mode) || mode == TImode) ++ { ++ /* User-created vectors small enough to fit in EAX. */ ++ if (size < 8) ++ return false; + +- return regparm; +-} ++ /* Unless ABI prescibes otherwise, ++ MMX/3dNow values are returned in MM0 if available. */ ++ ++ if (size == 8) ++ return TARGET_VECT8_RETURNS || !TARGET_MMX; + +-/* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and +- DFmode (2) arguments in SSE registers for a function with the +- indicated TYPE and DECL. DECL may be NULL when calling function +- indirectly or considering a libcall. Return -1 if any FP parameter +- should be rejected by error. This is used in siutation we imply SSE +- calling convetion but the function is called from another function with +- SSE disabled. Otherwise return 0. */ ++ /* SSE values are returned in XMM0 if available. */ ++ if (size == 16) ++ return !TARGET_SSE; + +-static int +-ix86_function_sseregparm (const_tree type, const_tree decl, bool warn) +-{ +- gcc_assert (!TARGET_64BIT); ++ /* AVX values are returned in YMM0 if available. */ ++ if (size == 32) ++ return !TARGET_AVX; + +- /* Use SSE registers to pass SFmode and DFmode arguments if requested +- by the sseregparm attribute. */ +- if (TARGET_SSEREGPARM +- || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type)))) +- { +- if (!TARGET_SSE) +- { +- if (warn) +- { +- if (decl) +- error ("calling %qD with attribute sseregparm without " +- "SSE/SSE2 enabled", decl); +- else +- error ("calling %qT with attribute sseregparm without " +- "SSE/SSE2 enabled", type); +- } +- return 0; ++ /* AVX512F values are returned in ZMM0 if available. */ ++ if (size == 64) ++ return !TARGET_AVX512F; + } + +- return 2; +- } ++ if (mode == XFmode) ++ return false; + +- if (!decl) +- return 0; ++ if (size > 12) ++ return true; + +- cgraph_node *target = cgraph_node::get (decl); +- if (target) +- target = target->function_symbol (); ++ /* OImode shouldn't be used directly. */ ++ gcc_assert (mode != OImode); + +- /* For local functions, pass up to SSE_REGPARM_MAX SFmode +- (and DFmode for SSE2) arguments in SSE registers. */ +- if (target +- /* TARGET_SSE_MATH */ +- && (target_opts_for_fn (target->decl)->x_ix86_fpmath & FPMATH_SSE) +- && opt_for_fn (target->decl, optimize) +- && !(profile_flag && !flag_fentry)) +- { +- cgraph_local_info *i = &target->local; +- if (i && i->local && i->can_change_signature) +- { +- /* Refuse to produce wrong code when local function with SSE enabled +- is called from SSE disabled function. +- FIXME: We need a way to detect these cases cross-ltrans partition +- and avoid using SSE calling conventions on local functions called +- from function with SSE disabled. For now at least delay the +- warning until we know we are going to produce wrong code. +- See PR66047 */ +- if (!TARGET_SSE && warn) +- return -1; +- return TARGET_SSE2_P (target_opts_for_fn (target->decl) +- ->x_ix86_isa_flags) ? 2 : 1; +- } ++ return false; + } +- +- return 0; ++#endif + } + +-/* Return true if EAX is live at the start of the function. 
Used by +- ix86_expand_prologue to determine if we need special help before +- calling allocate_stack_worker. */ ++ ++/* Create the va_list data type. */ + +-static bool +-ix86_eax_live_at_start_p (void) ++static tree ++ix86_build_builtin_va_list_64 (void) + { +- /* Cheat. Don't bother working forward from ix86_function_regparm +- to the function type to whether an actual argument is located in +- eax. Instead just look at cfg info, which is still close enough +- to correct at this point. This gives false positives for broken +- functions that might use uninitialized data that happens to be +- allocated in eax, but who cares? */ +- return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun)), 0); +-} ++ tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl; + +-static bool +-ix86_keep_aggregate_return_pointer (tree fntype) +-{ +- tree attr; ++ record = lang_hooks.types.make_type (RECORD_TYPE); ++ type_decl = build_decl (BUILTINS_LOCATION, ++ TYPE_DECL, get_identifier ("__va_list_tag"), record); + +- if (!TARGET_64BIT) +- { +- attr = lookup_attribute ("callee_pop_aggregate_return", +- TYPE_ATTRIBUTES (fntype)); +- if (attr) +- return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0); ++ f_gpr = build_decl (BUILTINS_LOCATION, ++ FIELD_DECL, get_identifier ("gp_offset"), ++ unsigned_type_node); ++ f_fpr = build_decl (BUILTINS_LOCATION, ++ FIELD_DECL, get_identifier ("fp_offset"), ++ unsigned_type_node); ++ f_ovf = build_decl (BUILTINS_LOCATION, ++ FIELD_DECL, get_identifier ("overflow_arg_area"), ++ ptr_type_node); ++ f_sav = build_decl (BUILTINS_LOCATION, ++ FIELD_DECL, get_identifier ("reg_save_area"), ++ ptr_type_node); + +- /* For 32-bit MS-ABI the default is to keep aggregate +- return pointer. */ +- if (ix86_function_type_abi (fntype) == MS_ABI) +- return true; +- } +- return KEEP_AGGREGATE_RETURN_POINTER != 0; +-} ++ va_list_gpr_counter_field = f_gpr; ++ va_list_fpr_counter_field = f_fpr; + +-/* Value is the number of bytes of arguments automatically +- popped when returning from a subroutine call. +- FUNDECL is the declaration node of the function (as a tree), +- FUNTYPE is the data type of the function (as a tree), +- or for a library call it is an identifier node for the subroutine name. +- SIZE is the number of bytes of arguments passed on the stack. ++ DECL_FIELD_CONTEXT (f_gpr) = record; ++ DECL_FIELD_CONTEXT (f_fpr) = record; ++ DECL_FIELD_CONTEXT (f_ovf) = record; ++ DECL_FIELD_CONTEXT (f_sav) = record; + +- On the 80386, the RTD insn may be used to pop them if the number +- of args is fixed, but if the number is variable then the caller +- must pop them all. RTD can't be used for library calls now +- because the library is compiled with the Unix compiler. +- Use of RTD is a selectable option, since it is incompatible with +- standard Unix calling sequences. If the option is not selected, +- the caller must always pop the args. ++ TYPE_STUB_DECL (record) = type_decl; ++ TYPE_NAME (record) = type_decl; ++ TYPE_FIELDS (record) = f_gpr; ++ DECL_CHAIN (f_gpr) = f_fpr; ++ DECL_CHAIN (f_fpr) = f_ovf; ++ DECL_CHAIN (f_ovf) = f_sav; + +- The attribute stdcall is equivalent to RTD on a per module basis. */ ++ layout_type (record); + +-static poly_int64 +-ix86_return_pops_args (tree fundecl, tree funtype, poly_int64 size) +-{ +- unsigned int ccvt; ++ TYPE_ATTRIBUTES (record) = tree_cons (get_identifier ("sysv_abi va_list"), ++ NULL_TREE, TYPE_ATTRIBUTES (record)); + +- /* None of the 64-bit ABIs pop arguments. */ ++ /* The correct type is an array type of one element. 
*/ ++ return build_array_type (record, build_index_type (size_zero_node)); ++} ++ ++/* Setup the builtin va_list data type and for 64-bit the additional ++ calling convention specific va_list data types. */ ++ ++static tree ++ix86_build_builtin_va_list (void) ++{ + if (TARGET_64BIT) +- return 0; ++ { ++ /* Initialize ABI specific va_list builtin types. + +- ccvt = ix86_get_callcvt (funtype); ++ In lto1, we can encounter two va_list types: ++ - one as a result of the type-merge across TUs, and ++ - the one constructed here. ++ These two types will not have the same TYPE_MAIN_VARIANT, and therefore ++ a type identity check in canonical_va_list_type based on ++ TYPE_MAIN_VARIANT (which we used to have) will not work. ++ Instead, we tag each va_list_type_node with its unique attribute, and ++ look for the attribute in the type identity check in ++ canonical_va_list_type. + +- if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL +- | IX86_CALLCVT_THISCALL)) != 0 +- && ! stdarg_p (funtype)) +- return size; ++ Tagging sysv_va_list_type_node directly with the attribute is ++ problematic since it's a array of one record, which will degrade into a ++ pointer to record when used as parameter (see build_va_arg comments for ++ an example), dropping the attribute in the process. So we tag the ++ record instead. */ + +- /* Lose any fake structure return argument if it is passed on the stack. */ +- if (aggregate_value_p (TREE_TYPE (funtype), fundecl) +- && !ix86_keep_aggregate_return_pointer (funtype)) ++ /* For SYSV_ABI we use an array of one record. */ ++ sysv_va_list_type_node = ix86_build_builtin_va_list_64 (); ++ ++ /* For MS_ABI we use plain pointer to argument area. */ ++ tree char_ptr_type = build_pointer_type (char_type_node); ++ tree attr = tree_cons (get_identifier ("ms_abi va_list"), NULL_TREE, ++ TYPE_ATTRIBUTES (char_ptr_type)); ++ ms_va_list_type_node = build_type_attribute_variant (char_ptr_type, attr); ++ ++ return ((ix86_abi == MS_ABI) ++ ? ms_va_list_type_node ++ : sysv_va_list_type_node); ++ } ++ else + { +- int nregs = ix86_function_regparm (funtype, fundecl); +- if (nregs == 0) +- return GET_MODE_SIZE (Pmode); ++ /* For i386 we use plain pointer to argument area. */ ++ return build_pointer_type (char_type_node); + } +- +- return 0; + } + +-/* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */ ++/* Worker function for TARGET_SETUP_INCOMING_VARARGS. */ + +-static bool +-ix86_legitimate_combined_insn (rtx_insn *insn) ++static void ++setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum) + { +- int i; ++ rtx save_area, mem; ++ alias_set_type set; ++ int i, max; + +- /* Check operand constraints in case hard registers were propagated +- into insn pattern. This check prevents combine pass from +- generating insn patterns with invalid hard register operands. +- These invalid insns can eventually confuse reload to error out +- with a spill failure. See also PRs 46829 and 46843. */ ++ /* GPR size of varargs save area. */ ++ if (cfun->va_list_gpr_size) ++ ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD; ++ else ++ ix86_varargs_gpr_size = 0; + +- gcc_assert (INSN_CODE (insn) >= 0); ++ /* FPR size of varargs save area. We don't need it if we don't pass ++ anything in SSE registers. */ ++ if (TARGET_SSE && cfun->va_list_fpr_size) ++ ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16; ++ else ++ ix86_varargs_fpr_size = 0; + +- extract_insn (insn); +- preprocess_constraints (insn); ++ if (! ix86_varargs_gpr_size && ! 
ix86_varargs_fpr_size) ++ return; + +- int n_operands = recog_data.n_operands; +- int n_alternatives = recog_data.n_alternatives; +- for (i = 0; i < n_operands; i++) ++ save_area = frame_pointer_rtx; ++ set = get_varargs_alias_set (); ++ ++ max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD; ++ if (max > X86_64_REGPARM_MAX) ++ max = X86_64_REGPARM_MAX; ++ ++ for (i = cum->regno; i < max; i++) + { +- rtx op = recog_data.operand[i]; +- machine_mode mode = GET_MODE (op); +- const operand_alternative *op_alt; +- int offset = 0; +- bool win; +- int j; ++ mem = gen_rtx_MEM (word_mode, ++ plus_constant (Pmode, save_area, i * UNITS_PER_WORD)); ++ MEM_NOTRAP_P (mem) = 1; ++ set_mem_alias_set (mem, set); ++ emit_move_insn (mem, ++ gen_rtx_REG (word_mode, ++ x86_64_int_parameter_registers[i])); ++ } + +- /* A unary operator may be accepted by the predicate, but it +- is irrelevant for matching constraints. */ +- if (UNARY_P (op)) +- op = XEXP (op, 0); ++ if (ix86_varargs_fpr_size) ++ { ++ machine_mode smode; ++ rtx_code_label *label; ++ rtx test; + +- if (SUBREG_P (op)) +- { +- if (REG_P (SUBREG_REG (op)) +- && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER) +- offset = subreg_regno_offset (REGNO (SUBREG_REG (op)), +- GET_MODE (SUBREG_REG (op)), +- SUBREG_BYTE (op), +- GET_MODE (op)); +- op = SUBREG_REG (op); +- } ++ /* Now emit code to save SSE registers. The AX parameter contains number ++ of SSE parameter registers used to call this function, though all we ++ actually check here is the zero/non-zero status. */ + +- if (!(REG_P (op) && HARD_REGISTER_P (op))) +- continue; ++ label = gen_label_rtx (); ++ test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx); ++ emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1), ++ label)); + +- op_alt = recog_op_alt; ++ /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if ++ we used movdqa (i.e. TImode) instead? Perhaps even better would ++ be if we could determine the real mode of the data, via a hook ++ into pass_stdarg. Ignore all that for now. */ ++ smode = V4SFmode; ++ if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode)) ++ crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode); + +- /* Operand has no constraints, anything is OK. */ +- win = !n_alternatives; ++ max = cum->sse_regno + cfun->va_list_fpr_size / 16; ++ if (max > X86_64_SSE_REGPARM_MAX) ++ max = X86_64_SSE_REGPARM_MAX; + +- alternative_mask preferred = get_preferred_alternatives (insn); +- for (j = 0; j < n_alternatives; j++, op_alt += n_operands) ++ for (i = cum->sse_regno; i < max; ++i) + { +- if (!TEST_BIT (preferred, j)) +- continue; +- if (op_alt[i].anything_ok +- || (op_alt[i].matches != -1 +- && operands_match_p +- (recog_data.operand[i], +- recog_data.operand[op_alt[i].matches])) +- || reg_fits_class_p (op, op_alt[i].cl, offset, mode)) +- { +- win = true; +- break; +- } ++ mem = plus_constant (Pmode, save_area, ++ i * 16 + ix86_varargs_gpr_size); ++ mem = gen_rtx_MEM (smode, mem); ++ MEM_NOTRAP_P (mem) = 1; ++ set_mem_alias_set (mem, set); ++ set_mem_align (mem, GET_MODE_ALIGNMENT (smode)); ++ ++ emit_move_insn (mem, gen_rtx_REG (smode, GET_SSE_REGNO (i))); + } + +- if (!win) +- return false; ++ emit_label (label); + } +- +- return true; +-} +- +-/* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */ +- +-static unsigned HOST_WIDE_INT +-ix86_asan_shadow_offset (void) +-{ +- return TARGET_LP64 ? (TARGET_MACHO ? 
(HOST_WIDE_INT_1 << 44) +- : HOST_WIDE_INT_C (0x7fff8000)) +- : (HOST_WIDE_INT_1 << 29); + } +- +-/* Argument support functions. */ + +-/* Return true when register may be used to pass function parameters. */ +-bool +-ix86_function_arg_regno_p (int regno) ++static void ++setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum) + { ++ alias_set_type set = get_varargs_alias_set (); + int i; +- enum calling_abi call_abi; +- const int *parm_regs; + +- if (!TARGET_64BIT) +- { +- if (TARGET_MACHO) +- return (regno < REGPARM_MAX +- || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno])); +- else +- return (regno < REGPARM_MAX +- || (TARGET_MMX && MMX_REGNO_P (regno) +- && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX)) +- || (TARGET_SSE && SSE_REGNO_P (regno) +- && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))); +- } ++ /* Reset to zero, as there might be a sysv vaarg used ++ before. */ ++ ix86_varargs_gpr_size = 0; ++ ix86_varargs_fpr_size = 0; + +- if (TARGET_SSE && SSE_REGNO_P (regno) +- && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)) +- return true; +- +- /* TODO: The function should depend on current function ABI but +- builtins.c would need updating then. Therefore we use the +- default ABI. */ +- call_abi = ix86_cfun_abi (); +- +- /* RAX is used as hidden argument to va_arg functions. */ +- if (call_abi == SYSV_ABI && regno == AX_REG) +- return true; +- +- if (call_abi == MS_ABI) +- parm_regs = x86_64_ms_abi_int_parameter_registers; +- else +- parm_regs = x86_64_int_parameter_registers; +- +- for (i = 0; i < (call_abi == MS_ABI +- ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++) +- if (regno == parm_regs[i]) +- return true; +- return false; +-} +- +-/* Return if we do not know how to pass TYPE solely in registers. */ +- +-static bool +-ix86_must_pass_in_stack (machine_mode mode, const_tree type) +-{ +- if (must_pass_in_stack_var_size_or_pad (mode, type)) +- return true; +- +- /* For 32-bit, we want TImode aggregates to go on the stack. But watch out! +- The layout_type routine is crafty and tries to trick us into passing +- currently unsupported vector types on the stack by using TImode. */ +- return (!TARGET_64BIT && mode == TImode +- && type && TREE_CODE (type) != VECTOR_TYPE); +-} ++ for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++) ++ { ++ rtx reg, mem; + +-/* It returns the size, in bytes, of the area reserved for arguments passed +- in registers for the function represented by fndecl dependent to the used +- abi format. */ +-int +-ix86_reg_parm_stack_space (const_tree fndecl) +-{ +- enum calling_abi call_abi = SYSV_ABI; +- if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL) +- call_abi = ix86_function_abi (fndecl); +- else +- call_abi = ix86_function_type_abi (fndecl); +- if (TARGET_64BIT && call_abi == MS_ABI) +- return 32; +- return 0; +-} ++ mem = gen_rtx_MEM (Pmode, ++ plus_constant (Pmode, virtual_incoming_args_rtx, ++ i * UNITS_PER_WORD)); ++ MEM_NOTRAP_P (mem) = 1; ++ set_mem_alias_set (mem, set); + +-/* We add this as a workaround in order to use libc_has_function +- hook in i386.md. */ +-bool +-ix86_libc_has_function (enum function_class fn_class) +-{ +- return targetm.libc_has_function (fn_class); ++ reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]); ++ emit_move_insn (mem, reg); ++ } + } + +-/* Returns value SYSV_ABI, MS_ABI dependent on fntype, +- specifying the call abi used. 
*/ +-enum calling_abi +-ix86_function_type_abi (const_tree fntype) ++static void ++ix86_setup_incoming_varargs (cumulative_args_t cum_v, ++ const function_arg_info &arg, ++ int *, int no_rtl) + { +- enum calling_abi abi = ix86_abi; ++ CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); ++ CUMULATIVE_ARGS next_cum; ++ tree fntype; + +- if (fntype == NULL_TREE || TYPE_ATTRIBUTES (fntype) == NULL_TREE) +- return abi; ++ /* This argument doesn't appear to be used anymore. Which is good, ++ because the old code here didn't suppress rtl generation. */ ++ gcc_assert (!no_rtl); + +- if (abi == SYSV_ABI +- && lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype))) +- { +- static int warned; +- if (TARGET_X32 && !warned) +- { +- error ("X32 does not support ms_abi attribute"); +- warned = 1; +- } ++ if (!TARGET_64BIT) ++ return; + +- abi = MS_ABI; +- } +- else if (abi == MS_ABI +- && lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype))) +- abi = SYSV_ABI; ++ fntype = TREE_TYPE (current_function_decl); + +- return abi; +-} ++ /* For varargs, we do not want to skip the dummy va_dcl argument. ++ For stdargs, we do want to skip the last named argument. */ ++ next_cum = *cum; ++ if (stdarg_p (fntype)) ++ ix86_function_arg_advance (pack_cumulative_args (&next_cum), arg); + +-static enum calling_abi +-ix86_function_abi (const_tree fndecl) +-{ +- return fndecl ? ix86_function_type_abi (TREE_TYPE (fndecl)) : ix86_abi; ++ if (cum->call_abi == MS_ABI) ++ setup_incoming_varargs_ms_64 (&next_cum); ++ else ++ setup_incoming_varargs_64 (&next_cum); + } + +-/* Returns value SYSV_ABI, MS_ABI dependent on cfun, +- specifying the call abi used. */ +-enum calling_abi +-ix86_cfun_abi (void) +-{ +- return cfun ? cfun->machine->call_abi : ix86_abi; +-} ++/* Checks if TYPE is of kind va_list char *. */ + + static bool +-ix86_function_ms_hook_prologue (const_tree fn) ++is_va_list_char_pointer (tree type) + { +- if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn))) +- { +- if (decl_function_context (fn) != NULL_TREE) +- error_at (DECL_SOURCE_LOCATION (fn), +- "ms_hook_prologue is not compatible with nested function"); +- else +- return true; +- } +- return false; +-} ++ tree canonic; + +-static bool +-ix86_function_naked (const_tree fn) +-{ +- if (fn && lookup_attribute ("naked", DECL_ATTRIBUTES (fn))) ++ /* For 32-bit it is always true. */ ++ if (!TARGET_64BIT) + return true; +- +- return false; ++ canonic = ix86_canonical_va_list_type (type); ++ return (canonic == ms_va_list_type_node ++ || (ix86_abi == MS_ABI && canonic == va_list_type_node)); + } + +-/* Write the extra assembler code needed to declare a function properly. */ ++/* Implement va_start. */ + +-void +-ix86_asm_output_function_label (FILE *asm_out_file, const char *fname, +- tree decl) ++static void ++ix86_va_start (tree valist, rtx nextarg) + { +- bool is_ms_hook = ix86_function_ms_hook_prologue (decl); ++ HOST_WIDE_INT words, n_gpr, n_fpr; ++ tree f_gpr, f_fpr, f_ovf, f_sav; ++ tree gpr, fpr, ovf, sav, t; ++ tree type; ++ rtx ovf_rtx; + +- if (is_ms_hook) ++ if (flag_split_stack ++ && cfun->machine->split_stack_varargs_pointer == NULL_RTX) + { +- int i, filler_count = (TARGET_64BIT ? 32 : 16); +- unsigned int filler_cc = 0xcccccccc; ++ unsigned int scratch_regno; + +- for (i = 0; i < filler_count; i += 4) +- fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc); +- } ++ /* When we are splitting the stack, we can't refer to the stack ++ arguments using internal_arg_pointer, because they may be on ++ the old stack. 
The split stack prologue will arrange to ++ leave a pointer to the old stack arguments in a scratch ++ register, which we here copy to a pseudo-register. The split ++ stack prologue can't set the pseudo-register directly because ++ it (the prologue) runs before any registers have been saved. */ + +-#ifdef SUBTARGET_ASM_UNWIND_INIT +- SUBTARGET_ASM_UNWIND_INIT (asm_out_file); +-#endif ++ scratch_regno = split_stack_prologue_scratch_regno (); ++ if (scratch_regno != INVALID_REGNUM) ++ { ++ rtx reg; ++ rtx_insn *seq; + +- ASM_OUTPUT_LABEL (asm_out_file, fname); ++ reg = gen_reg_rtx (Pmode); ++ cfun->machine->split_stack_varargs_pointer = reg; + +- /* Output magic byte marker, if hot-patch attribute is set. */ +- if (is_ms_hook) +- { +- if (TARGET_64BIT) +- { +- /* leaq [%rsp + 0], %rsp */ +- fputs (ASM_BYTE "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n", +- asm_out_file); ++ start_sequence (); ++ emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno)); ++ seq = get_insns (); ++ end_sequence (); ++ ++ push_topmost_sequence (); ++ emit_insn_after (seq, entry_of_function ()); ++ pop_topmost_sequence (); + } ++ } ++ ++ /* Only 64bit target needs something special. */ ++ if (is_va_list_char_pointer (TREE_TYPE (valist))) ++ { ++ if (cfun->machine->split_stack_varargs_pointer == NULL_RTX) ++ std_expand_builtin_va_start (valist, nextarg); + else + { +- /* movl.s %edi, %edi +- push %ebp +- movl.s %esp, %ebp */ +- fputs (ASM_BYTE "0x8b, 0xff, 0x55, 0x8b, 0xec\n", asm_out_file); ++ rtx va_r, next; ++ ++ va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE); ++ next = expand_binop (ptr_mode, add_optab, ++ cfun->machine->split_stack_varargs_pointer, ++ crtl->args.arg_offset_rtx, ++ NULL_RTX, 0, OPTAB_LIB_WIDEN); ++ convert_move (va_r, next, 0); + } ++ return; + } +-} + +-/* Implementation of call abi switching target hook. Specific to FNDECL +- the specific call register sets are set. See also +- ix86_conditional_register_usage for more details. */ +-void +-ix86_call_abi_override (const_tree fndecl) +-{ +- cfun->machine->call_abi = ix86_function_abi (fndecl); +-} ++ f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node)); ++ f_fpr = DECL_CHAIN (f_gpr); ++ f_ovf = DECL_CHAIN (f_fpr); ++ f_sav = DECL_CHAIN (f_ovf); + +-/* Return 1 if pseudo register should be created and used to hold +- GOT address for PIC code. */ +-bool +-ix86_use_pseudo_pic_reg (void) +-{ +- if ((TARGET_64BIT +- && (ix86_cmodel == CM_SMALL_PIC +- || TARGET_PECOFF)) +- || !flag_pic) +- return false; +- return true; +-} ++ valist = build_simple_mem_ref (valist); ++ TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node); ++ /* The following should be folded into the MEM_REF offset. */ ++ gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist), ++ f_gpr, NULL_TREE); ++ fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist), ++ f_fpr, NULL_TREE); ++ ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist), ++ f_ovf, NULL_TREE); ++ sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist), ++ f_sav, NULL_TREE); + +-/* Initialize large model PIC register. */ ++ /* Count number of gp and fp argument registers used. 
*/ ++ words = crtl->args.info.words; ++ n_gpr = crtl->args.info.regno; ++ n_fpr = crtl->args.info.sse_regno; + +-static void +-ix86_init_large_pic_reg (unsigned int tmp_regno) +-{ +- rtx_code_label *label; +- rtx tmp_reg; +- +- gcc_assert (Pmode == DImode); +- label = gen_label_rtx (); +- emit_label (label); +- LABEL_PRESERVE_P (label) = 1; +- tmp_reg = gen_rtx_REG (Pmode, tmp_regno); +- gcc_assert (REGNO (pic_offset_table_rtx) != tmp_regno); +- emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx, +- label)); +- emit_insn (gen_set_got_offset_rex64 (tmp_reg, label)); +- emit_insn (ix86_gen_add3 (pic_offset_table_rtx, +- pic_offset_table_rtx, tmp_reg)); +- const char *name = LABEL_NAME (label); +- PUT_CODE (label, NOTE); +- NOTE_KIND (label) = NOTE_INSN_DELETED_LABEL; +- NOTE_DELETED_LABEL_NAME (label) = name; +-} +- +-/* Create and initialize PIC register if required. */ +-static void +-ix86_init_pic_reg (void) +-{ +- edge entry_edge; +- rtx_insn *seq; +- +- if (!ix86_use_pseudo_pic_reg ()) +- return; +- +- start_sequence (); +- +- if (TARGET_64BIT) ++ if (cfun->va_list_gpr_size) + { +- if (ix86_cmodel == CM_LARGE_PIC) +- ix86_init_large_pic_reg (R11_REG); +- else +- emit_insn (gen_set_got_rex64 (pic_offset_table_rtx)); ++ type = TREE_TYPE (gpr); ++ t = build2 (MODIFY_EXPR, type, ++ gpr, build_int_cst (type, n_gpr * 8)); ++ TREE_SIDE_EFFECTS (t) = 1; ++ expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); + } +- else ++ ++ if (TARGET_SSE && cfun->va_list_fpr_size) + { +- /* If there is future mcount call in the function it is more profitable +- to emit SET_GOT into ABI defined REAL_PIC_OFFSET_TABLE_REGNUM. */ +- rtx reg = crtl->profile +- ? gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM) +- : pic_offset_table_rtx; +- rtx_insn *insn = emit_insn (gen_set_got (reg)); +- RTX_FRAME_RELATED_P (insn) = 1; +- if (crtl->profile) +- emit_move_insn (pic_offset_table_rtx, reg); +- add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX); ++ type = TREE_TYPE (fpr); ++ t = build2 (MODIFY_EXPR, type, fpr, ++ build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX)); ++ TREE_SIDE_EFFECTS (t) = 1; ++ expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); + } + +- seq = get_insns (); +- end_sequence (); ++ /* Find the overflow area. */ ++ type = TREE_TYPE (ovf); ++ if (cfun->machine->split_stack_varargs_pointer == NULL_RTX) ++ ovf_rtx = crtl->args.internal_arg_pointer; ++ else ++ ovf_rtx = cfun->machine->split_stack_varargs_pointer; ++ t = make_tree (type, ovf_rtx); ++ if (words != 0) ++ t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD); + +- entry_edge = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun)); +- insert_insn_on_edge (seq, entry_edge); +- commit_one_edge_insertion (entry_edge); ++ t = build2 (MODIFY_EXPR, type, ovf, t); ++ TREE_SIDE_EFFECTS (t) = 1; ++ expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); ++ ++ if (ix86_varargs_gpr_size || ix86_varargs_fpr_size) ++ { ++ /* Find the register save area. ++ Prologue of the function save it right above stack frame. */ ++ type = TREE_TYPE (sav); ++ t = make_tree (type, frame_pointer_rtx); ++ if (!ix86_varargs_gpr_size) ++ t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX); ++ ++ t = build2 (MODIFY_EXPR, type, sav, t); ++ TREE_SIDE_EFFECTS (t) = 1; ++ expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); ++ } + } + +-/* Initialize a variable CUM of type CUMULATIVE_ARGS +- for a call to a function whose data type is FNTYPE. +- For a library call, FNTYPE is 0. */ ++/* Implement va_arg. 
*/ + +-void +-init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */ +- tree fntype, /* tree ptr for function decl */ +- rtx libname, /* SYMBOL_REF of library name or 0 */ +- tree fndecl, +- int caller) ++static tree ++ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p, ++ gimple_seq *post_p) + { +- struct cgraph_local_info *i = NULL; +- struct cgraph_node *target = NULL; ++ static const int intreg[6] = { 0, 1, 2, 3, 4, 5 }; ++ tree f_gpr, f_fpr, f_ovf, f_sav; ++ tree gpr, fpr, ovf, sav, t; ++ int size, rsize; ++ tree lab_false, lab_over = NULL_TREE; ++ tree addr, t2; ++ rtx container; ++ int indirect_p = 0; ++ tree ptrtype; ++ machine_mode nat_mode; ++ unsigned int arg_boundary; ++ unsigned int type_align; + +- memset (cum, 0, sizeof (*cum)); ++ /* Only 64bit target needs something special. */ ++ if (is_va_list_char_pointer (TREE_TYPE (valist))) ++ return std_gimplify_va_arg_expr (valist, type, pre_p, post_p); + +- if (fndecl) ++ f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node)); ++ f_fpr = DECL_CHAIN (f_gpr); ++ f_ovf = DECL_CHAIN (f_fpr); ++ f_sav = DECL_CHAIN (f_ovf); ++ ++ gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), ++ valist, f_gpr, NULL_TREE); ++ ++ fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE); ++ ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE); ++ sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE); ++ ++ indirect_p = pass_va_arg_by_reference (type); ++ if (indirect_p) ++ type = build_pointer_type (type); ++ size = arg_int_size_in_bytes (type); ++ rsize = CEIL (size, UNITS_PER_WORD); ++ ++ nat_mode = type_natural_mode (type, NULL, false); ++ switch (nat_mode) + { +- target = cgraph_node::get (fndecl); +- if (target) ++ case E_V8SFmode: ++ case E_V8SImode: ++ case E_V32QImode: ++ case E_V16HImode: ++ case E_V4DFmode: ++ case E_V4DImode: ++ case E_V16SFmode: ++ case E_V16SImode: ++ case E_V64QImode: ++ case E_V32HImode: ++ case E_V8DFmode: ++ case E_V8DImode: ++ /* Unnamed 256 and 512bit vector mode parameters are passed on stack. */ ++ if (!TARGET_64BIT_MS_ABI) + { +- target = target->function_symbol (); +- i = cgraph_node::local_info (target->decl); +- cum->call_abi = ix86_function_abi (target->decl); ++ container = NULL; ++ break; + } +- else +- cum->call_abi = ix86_function_abi (fndecl); ++ /* FALLTHRU */ ++ ++ default: ++ container = construct_container (nat_mode, TYPE_MODE (type), ++ type, 0, X86_64_REGPARM_MAX, ++ X86_64_SSE_REGPARM_MAX, intreg, ++ 0); ++ break; + } +- else +- cum->call_abi = ix86_function_type_abi (fntype); + +- cum->caller = caller; ++ /* Pull the value out of the saved registers. */ + +- /* Set up the number of registers to use for passing arguments. */ +- cum->nregs = ix86_regparm; +- if (TARGET_64BIT) +- { +- cum->nregs = (cum->call_abi == SYSV_ABI +- ? X86_64_REGPARM_MAX +- : X86_64_MS_REGPARM_MAX); +- } +- if (TARGET_SSE) ++ addr = create_tmp_var (ptr_type_node, "addr"); ++ type_align = TYPE_ALIGN (type); ++ ++ if (container) + { +- cum->sse_nregs = SSE_REGPARM_MAX; +- if (TARGET_64BIT) +- { +- cum->sse_nregs = (cum->call_abi == SYSV_ABI +- ? 
X86_64_SSE_REGPARM_MAX +- : X86_64_MS_SSE_REGPARM_MAX); +- } +- } +- if (TARGET_MMX) +- cum->mmx_nregs = MMX_REGPARM_MAX; +- cum->warn_avx512f = true; +- cum->warn_avx = true; +- cum->warn_sse = true; +- cum->warn_mmx = true; ++ int needed_intregs, needed_sseregs; ++ bool need_temp; ++ tree int_addr, sse_addr; + +- /* Because type might mismatch in between caller and callee, we need to +- use actual type of function for local calls. +- FIXME: cgraph_analyze can be told to actually record if function uses +- va_start so for local functions maybe_vaarg can be made aggressive +- helping K&R code. +- FIXME: once typesytem is fixed, we won't need this code anymore. */ +- if (i && i->local && i->can_change_signature) +- fntype = TREE_TYPE (target->decl); +- cum->stdarg = stdarg_p (fntype); +- cum->maybe_vaarg = (fntype +- ? (!prototype_p (fntype) || stdarg_p (fntype)) +- : !libname); ++ lab_false = create_artificial_label (UNKNOWN_LOCATION); ++ lab_over = create_artificial_label (UNKNOWN_LOCATION); + +- cum->decl = fndecl; ++ examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs); + +- cum->warn_empty = !warn_abi || cum->stdarg; +- if (!cum->warn_empty && fntype) +- { +- function_args_iterator iter; +- tree argtype; +- bool seen_empty_type = false; +- FOREACH_FUNCTION_ARGS (fntype, argtype, iter) ++ need_temp = (!REG_P (container) ++ && ((needed_intregs && TYPE_ALIGN (type) > 64) ++ || TYPE_ALIGN (type) > 128)); ++ ++ /* In case we are passing structure, verify that it is consecutive block ++ on the register save area. If not we need to do moves. */ ++ if (!need_temp && !REG_P (container)) + { +- if (argtype == error_mark_node || VOID_TYPE_P (argtype)) +- break; +- if (TYPE_EMPTY_P (argtype)) +- seen_empty_type = true; +- else if (seen_empty_type) ++ /* Verify that all registers are strictly consecutive */ ++ if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0)))) + { +- cum->warn_empty = true; +- break; +- } +- } +- } ++ int i; + +- if (!TARGET_64BIT) +- { +- /* If there are variable arguments, then we won't pass anything +- in registers in 32-bit mode. */ +- if (stdarg_p (fntype)) ++ for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++) ++ { ++ rtx slot = XVECEXP (container, 0, i); ++ if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i ++ || INTVAL (XEXP (slot, 1)) != i * 16) ++ need_temp = true; ++ } ++ } ++ else ++ { ++ int i; ++ ++ for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++) ++ { ++ rtx slot = XVECEXP (container, 0, i); ++ if (REGNO (XEXP (slot, 0)) != (unsigned int) i ++ || INTVAL (XEXP (slot, 1)) != i * 8) ++ need_temp = true; ++ } ++ } ++ } ++ if (!need_temp) + { +- cum->nregs = 0; +- /* Since in 32-bit, variable arguments are always passed on +- stack, there is scratch register available for indirect +- sibcall. */ +- cfun->machine->arg_reg_available = true; +- cum->sse_nregs = 0; +- cum->mmx_nregs = 0; +- cum->warn_avx512f = false; +- cum->warn_avx = false; +- cum->warn_sse = false; +- cum->warn_mmx = false; +- return; ++ int_addr = addr; ++ sse_addr = addr; ++ } ++ else ++ { ++ int_addr = create_tmp_var (ptr_type_node, "int_addr"); ++ sse_addr = create_tmp_var (ptr_type_node, "sse_addr"); + } + +- /* Use ecx and edx registers if function has fastcall attribute, +- else look for regparm information. */ +- if (fntype) ++ /* First ensure that we fit completely in registers. 
*/ ++ if (needed_intregs) + { +- unsigned int ccvt = ix86_get_callcvt (fntype); +- if ((ccvt & IX86_CALLCVT_THISCALL) != 0) +- { +- cum->nregs = 1; +- cum->fastcall = 1; /* Same first register as in fastcall. */ +- } +- else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0) +- { +- cum->nregs = 2; +- cum->fastcall = 1; +- } +- else +- cum->nregs = ix86_function_regparm (fntype, fndecl); ++ t = build_int_cst (TREE_TYPE (gpr), ++ (X86_64_REGPARM_MAX - needed_intregs + 1) * 8); ++ t = build2 (GE_EXPR, boolean_type_node, gpr, t); ++ t2 = build1 (GOTO_EXPR, void_type_node, lab_false); ++ t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE); ++ gimplify_and_add (t, pre_p); ++ } ++ if (needed_sseregs) ++ { ++ t = build_int_cst (TREE_TYPE (fpr), ++ (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16 ++ + X86_64_REGPARM_MAX * 8); ++ t = build2 (GE_EXPR, boolean_type_node, fpr, t); ++ t2 = build1 (GOTO_EXPR, void_type_node, lab_false); ++ t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE); ++ gimplify_and_add (t, pre_p); + } + +- /* Set up the number of SSE registers used for passing SFmode +- and DFmode arguments. Warn for mismatching ABI. */ +- cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true); +- } ++ /* Compute index to start of area used for integer regs. */ ++ if (needed_intregs) ++ { ++ /* int_addr = gpr + sav; */ ++ t = fold_build_pointer_plus (sav, gpr); ++ gimplify_assign (int_addr, t, pre_p); ++ } ++ if (needed_sseregs) ++ { ++ /* sse_addr = fpr + sav; */ ++ t = fold_build_pointer_plus (sav, fpr); ++ gimplify_assign (sse_addr, t, pre_p); ++ } ++ if (need_temp) ++ { ++ int i, prev_size = 0; ++ tree temp = create_tmp_var (type, "va_arg_tmp"); + +- cfun->machine->arg_reg_available = (cum->nregs > 0); +-} ++ /* addr = &temp; */ ++ t = build1 (ADDR_EXPR, build_pointer_type (type), temp); ++ gimplify_assign (addr, t, pre_p); + +-/* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE. +- But in the case of vector types, it is some vector mode. ++ for (i = 0; i < XVECLEN (container, 0); i++) ++ { ++ rtx slot = XVECEXP (container, 0, i); ++ rtx reg = XEXP (slot, 0); ++ machine_mode mode = GET_MODE (reg); ++ tree piece_type; ++ tree addr_type; ++ tree daddr_type; ++ tree src_addr, src; ++ int src_offset; ++ tree dest_addr, dest; ++ int cur_size = GET_MODE_SIZE (mode); + +- When we have only some of our vector isa extensions enabled, then there +- are some modes for which vector_mode_supported_p is false. For these +- modes, the generic vector support in gcc will choose some non-vector mode +- in order to implement the type. By computing the natural mode, we'll +- select the proper ABI location for the operand and not depend on whatever +- the middle-end decides to do with these vector types. ++ gcc_assert (prev_size <= INTVAL (XEXP (slot, 1))); ++ prev_size = INTVAL (XEXP (slot, 1)); ++ if (prev_size + cur_size > size) ++ { ++ cur_size = size - prev_size; ++ unsigned int nbits = cur_size * BITS_PER_UNIT; ++ if (!int_mode_for_size (nbits, 1).exists (&mode)) ++ mode = QImode; ++ } ++ piece_type = lang_hooks.types.type_for_mode (mode, 1); ++ if (mode == GET_MODE (reg)) ++ addr_type = build_pointer_type (piece_type); ++ else ++ addr_type = build_pointer_type_for_mode (piece_type, ptr_mode, ++ true); ++ daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode, ++ true); + +- The midde-end can't deal with the vector types > 16 bytes. In this +- case, we return the original mode and warn ABI change if CUM isn't +- NULL. 
++ if (SSE_REGNO_P (REGNO (reg))) ++ { ++ src_addr = sse_addr; ++ src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16; ++ } ++ else ++ { ++ src_addr = int_addr; ++ src_offset = REGNO (reg) * 8; ++ } ++ src_addr = fold_convert (addr_type, src_addr); ++ src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset); + +- If INT_RETURN is true, warn ABI change if the vector mode isn't +- available for function return value. */ ++ dest_addr = fold_convert (daddr_type, addr); ++ dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size); ++ if (cur_size == GET_MODE_SIZE (mode)) ++ { ++ src = build_va_arg_indirect_ref (src_addr); ++ dest = build_va_arg_indirect_ref (dest_addr); + +-static machine_mode +-type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum, +- bool in_return) +-{ +- machine_mode mode = TYPE_MODE (type); ++ gimplify_assign (dest, src, pre_p); ++ } ++ else ++ { ++ tree copy ++ = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY), ++ 3, dest_addr, src_addr, ++ size_int (cur_size)); ++ gimplify_and_add (copy, pre_p); ++ } ++ prev_size += cur_size; ++ } ++ } + +- if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode)) +- { +- HOST_WIDE_INT size = int_size_in_bytes (type); +- if ((size == 8 || size == 16 || size == 32 || size == 64) +- /* ??? Generic code allows us to create width 1 vectors. Ignore. */ +- && TYPE_VECTOR_SUBPARTS (type) > 1) ++ if (needed_intregs) + { +- machine_mode innermode = TYPE_MODE (TREE_TYPE (type)); ++ t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr, ++ build_int_cst (TREE_TYPE (gpr), needed_intregs * 8)); ++ gimplify_assign (gpr, t, pre_p); ++ /* The GPR save area guarantees only 8-byte alignment. */ ++ if (!need_temp) ++ type_align = MIN (type_align, 64); ++ } + +- /* There are no XFmode vector modes. */ +- if (innermode == XFmode) +- return mode; ++ if (needed_sseregs) ++ { ++ t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr, ++ build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16)); ++ gimplify_assign (unshare_expr (fpr), t, pre_p); ++ } + +- if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE) +- mode = MIN_MODE_VECTOR_FLOAT; +- else +- mode = MIN_MODE_VECTOR_INT; ++ gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over)); + +- /* Get the mode which has this inner mode and number of units. */ +- FOR_EACH_MODE_FROM (mode, mode) +- if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type) +- && GET_MODE_INNER (mode) == innermode) +- { +- if (size == 64 && !TARGET_AVX512F && !TARGET_IAMCU) +- { +- static bool warnedavx512f; +- static bool warnedavx512f_ret; ++ gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false)); ++ } + +- if (cum && cum->warn_avx512f && !warnedavx512f) +- { +- if (warning (OPT_Wpsabi, "AVX512F vector argument " +- "without AVX512F enabled changes the ABI")) +- warnedavx512f = true; +- } +- else if (in_return && !warnedavx512f_ret) +- { +- if (warning (OPT_Wpsabi, "AVX512F vector return " +- "without AVX512F enabled changes the ABI")) +- warnedavx512f_ret = true; +- } ++ /* ... otherwise out of the overflow area. */ + +- return TYPE_MODE (type); +- } +- else if (size == 32 && !TARGET_AVX && !TARGET_IAMCU) +- { +- static bool warnedavx; +- static bool warnedavx_ret; ++ /* When we align parameter on stack for caller, if the parameter ++ alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will be ++ aligned at MAX_SUPPORTED_STACK_ALIGNMENT. We will match callee ++ here with caller. 
*/ ++ arg_boundary = ix86_function_arg_boundary (VOIDmode, type); ++ if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT) ++ arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT; + +- if (cum && cum->warn_avx && !warnedavx) +- { +- if (warning (OPT_Wpsabi, "AVX vector argument " +- "without AVX enabled changes the ABI")) +- warnedavx = true; +- } +- else if (in_return && !warnedavx_ret) +- { +- if (warning (OPT_Wpsabi, "AVX vector return " +- "without AVX enabled changes the ABI")) +- warnedavx_ret = true; +- } ++ /* Care for on-stack alignment if needed. */ ++ if (arg_boundary <= 64 || size == 0) ++ t = ovf; ++ else ++ { ++ HOST_WIDE_INT align = arg_boundary / 8; ++ t = fold_build_pointer_plus_hwi (ovf, align - 1); ++ t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t, ++ build_int_cst (TREE_TYPE (t), -align)); ++ } + +- return TYPE_MODE (type); +- } +- else if (((size == 8 && TARGET_64BIT) || size == 16) +- && !TARGET_SSE +- && !TARGET_IAMCU) +- { +- static bool warnedsse; +- static bool warnedsse_ret; ++ gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue); ++ gimplify_assign (addr, t, pre_p); + +- if (cum && cum->warn_sse && !warnedsse) +- { +- if (warning (OPT_Wpsabi, "SSE vector argument " +- "without SSE enabled changes the ABI")) +- warnedsse = true; +- } +- else if (!TARGET_64BIT && in_return && !warnedsse_ret) +- { +- if (warning (OPT_Wpsabi, "SSE vector return " +- "without SSE enabled changes the ABI")) +- warnedsse_ret = true; +- } +- } +- else if ((size == 8 && !TARGET_64BIT) +- && (!cfun +- || cfun->machine->func_type == TYPE_NORMAL) +- && !TARGET_MMX +- && !TARGET_IAMCU) +- { +- static bool warnedmmx; +- static bool warnedmmx_ret; ++ t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD); ++ gimplify_assign (unshare_expr (ovf), t, pre_p); + +- if (cum && cum->warn_mmx && !warnedmmx) +- { +- if (warning (OPT_Wpsabi, "MMX vector argument " +- "without MMX enabled changes the ABI")) +- warnedmmx = true; +- } +- else if (in_return && !warnedmmx_ret) +- { +- if (warning (OPT_Wpsabi, "MMX vector return " +- "without MMX enabled changes the ABI")) +- warnedmmx_ret = true; +- } +- } +- return mode; +- } ++ if (container) ++ gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over)); + +- gcc_unreachable (); +- } +- } ++ type = build_aligned_type (type, type_align); ++ ptrtype = build_pointer_type_for_mode (type, ptr_mode, true); ++ addr = fold_convert (ptrtype, addr); + +- return mode; ++ if (indirect_p) ++ addr = build_va_arg_indirect_ref (addr); ++ return build_va_arg_indirect_ref (addr); + } ++ ++/* Return true if OPNUM's MEM should be matched ++ in movabs* patterns. */ + +-/* We want to pass a value in REGNO whose "natural" mode is MODE. However, +- this may not agree with the mode that the type system has chosen for the +- register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can +- go ahead and use it. Otherwise we have to build a PARALLEL instead. 
*/ +- +-static rtx +-gen_reg_or_parallel (machine_mode mode, machine_mode orig_mode, +- unsigned int regno) ++bool ++ix86_check_movabs (rtx insn, int opnum) + { +- rtx tmp; ++ rtx set, mem; + +- if (orig_mode != BLKmode) +- tmp = gen_rtx_REG (orig_mode, regno); +- else ++ set = PATTERN (insn); ++ if (GET_CODE (set) == PARALLEL) ++ set = XVECEXP (set, 0, 0); ++ gcc_assert (GET_CODE (set) == SET); ++ mem = XEXP (set, opnum); ++ while (SUBREG_P (mem)) ++ mem = SUBREG_REG (mem); ++ gcc_assert (MEM_P (mem)); ++ return volatile_ok || !MEM_VOLATILE_P (mem); ++} ++ ++/* Return false if INSN contains a MEM with a non-default address space. */ ++bool ++ix86_check_no_addr_space (rtx insn) ++{ ++ subrtx_var_iterator::array_type array; ++ FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (insn), ALL) + { +- tmp = gen_rtx_REG (mode, regno); +- tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx); +- tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp)); ++ rtx x = *iter; ++ if (MEM_P (x) && !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (x))) ++ return false; + } +- +- return tmp; ++ return true; + } ++ ++/* Initialize the table of extra 80387 mathematical constants. */ + +-/* x86-64 register passing implementation. See x86-64 ABI for details. Goal +- of this code is to classify each 8bytes of incoming argument by the register +- class and assign registers accordingly. */ +- +-/* Return the union class of CLASS1 and CLASS2. +- See the x86-64 PS ABI for details. */ +- +-static enum x86_64_reg_class +-merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2) ++static void ++init_ext_80387_constants (void) + { +- /* Rule #1: If both classes are equal, this is the resulting class. */ +- if (class1 == class2) +- return class1; +- +- /* Rule #2: If one of the classes is NO_CLASS, the resulting class is +- the other class. */ +- if (class1 == X86_64_NO_CLASS) +- return class2; +- if (class2 == X86_64_NO_CLASS) +- return class1; +- +- /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */ +- if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS) +- return X86_64_MEMORY_CLASS; +- +- /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */ +- if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS) +- || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS)) +- return X86_64_INTEGERSI_CLASS; +- if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS +- || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS) +- return X86_64_INTEGER_CLASS; ++ static const char * cst[5] = ++ { ++ "0.3010299956639811952256464283594894482", /* 0: fldlg2 */ ++ "0.6931471805599453094286904741849753009", /* 1: fldln2 */ ++ "1.4426950408889634073876517827983434472", /* 2: fldl2e */ ++ "3.3219280948873623478083405569094566090", /* 3: fldl2t */ ++ "3.1415926535897932385128089594061862044", /* 4: fldpi */ ++ }; ++ int i; + +- /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class, +- MEMORY is used. */ +- if (class1 == X86_64_X87_CLASS +- || class1 == X86_64_X87UP_CLASS +- || class1 == X86_64_COMPLEX_X87_CLASS +- || class2 == X86_64_X87_CLASS +- || class2 == X86_64_X87UP_CLASS +- || class2 == X86_64_COMPLEX_X87_CLASS) +- return X86_64_MEMORY_CLASS; ++ for (i = 0; i < 5; i++) ++ { ++ real_from_string (&ext_80387_constants_table[i], cst[i]); ++ /* Ensure each constant is rounded to XFmode precision. 
*/ ++ real_convert (&ext_80387_constants_table[i], ++ XFmode, &ext_80387_constants_table[i]); ++ } + +- /* Rule #6: Otherwise class SSE is used. */ +- return X86_64_SSE_CLASS; ++ ext_80387_constants_init = 1; + } + +-/* Classify the argument of type TYPE and mode MODE. +- CLASSES will be filled by the register class used to pass each word +- of the operand. The number of words is returned. In case the parameter +- should be passed in memory, 0 is returned. As a special case for zero +- sized containers, classes[0] will be NO_CLASS and 1 is returned. ++/* Return non-zero if the constant is something that ++ can be loaded with a special instruction. */ + +- BIT_OFFSET is used internally for handling records and specifies offset +- of the offset in bits modulo 512 to avoid overflow cases. ++int ++standard_80387_constant_p (rtx x) ++{ ++ machine_mode mode = GET_MODE (x); + +- See the x86-64 PS ABI for details. +-*/ ++ const REAL_VALUE_TYPE *r; + +-static int +-classify_argument (machine_mode mode, const_tree type, +- enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset) +-{ +- HOST_WIDE_INT bytes +- = mode == BLKmode ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode); +- int words = CEIL (bytes + (bit_offset % 64) / 8, UNITS_PER_WORD); ++ if (!(CONST_DOUBLE_P (x) && X87_FLOAT_MODE_P (mode))) ++ return -1; + +- /* Variable sized entities are always passed/returned in memory. */ +- if (bytes < 0) +- return 0; ++ if (x == CONST0_RTX (mode)) ++ return 1; ++ if (x == CONST1_RTX (mode)) ++ return 2; + +- if (mode != VOIDmode +- && targetm.calls.must_pass_in_stack (mode, type)) +- return 0; ++ r = CONST_DOUBLE_REAL_VALUE (x); + +- if (type && AGGREGATE_TYPE_P (type)) ++ /* For XFmode constants, try to find a special 80387 instruction when ++ optimizing for size or on those CPUs that benefit from them. */ ++ if (mode == XFmode ++ && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS)) + { + int i; +- tree field; +- enum x86_64_reg_class subclasses[MAX_CLASSES]; + +- /* On x86-64 we pass structures larger than 64 bytes on the stack. */ +- if (bytes > 64) +- return 0; ++ if (! ext_80387_constants_init) ++ init_ext_80387_constants (); + +- for (i = 0; i < words; i++) +- classes[i] = X86_64_NO_CLASS; ++ for (i = 0; i < 5; i++) ++ if (real_identical (r, &ext_80387_constants_table[i])) ++ return i + 3; ++ } + +- /* Zero sized arrays or structures are NO_CLASS. We return 0 to +- signalize memory class, so handle it as special case. */ +- if (!words) +- { +- classes[0] = X86_64_NO_CLASS; +- return 1; +- } ++ /* Load of the constant -0.0 or -1.0 will be split as ++ fldz;fchs or fld1;fchs sequence. */ ++ if (real_isnegzero (r)) ++ return 8; ++ if (real_identical (r, &dconstm1)) ++ return 9; + +- /* Classify each field of record and merge classes. */ +- switch (TREE_CODE (type)) +- { +- case RECORD_TYPE: +- /* And now merge the fields of structure. */ +- for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field)) +- { +- if (TREE_CODE (field) == FIELD_DECL) +- { +- int num; ++ return 0; ++} + +- if (TREE_TYPE (field) == error_mark_node) +- continue; ++/* Return the opcode of the special instruction to be used to load ++ the constant X. */ + +- /* Bitfields are always classified as integer. Handle them +- early, since later code would consider them to be +- misaligned integers. 
*/ +- if (DECL_BIT_FIELD (field)) +- { +- for (i = (int_bit_position (field) +- + (bit_offset % 64)) / 8 / 8; +- i < ((int_bit_position (field) + (bit_offset % 64)) +- + tree_to_shwi (DECL_SIZE (field)) +- + 63) / 8 / 8; i++) +- classes[i] +- = merge_classes (X86_64_INTEGER_CLASS, classes[i]); +- } +- else +- { +- int pos; ++const char * ++standard_80387_constant_opcode (rtx x) ++{ ++ switch (standard_80387_constant_p (x)) ++ { ++ case 1: ++ return "fldz"; ++ case 2: ++ return "fld1"; ++ case 3: ++ return "fldlg2"; ++ case 4: ++ return "fldln2"; ++ case 5: ++ return "fldl2e"; ++ case 6: ++ return "fldl2t"; ++ case 7: ++ return "fldpi"; ++ case 8: ++ case 9: ++ return "#"; ++ default: ++ gcc_unreachable (); ++ } ++} + +- type = TREE_TYPE (field); ++/* Return the CONST_DOUBLE representing the 80387 constant that is ++ loaded by the specified special instruction. The argument IDX ++ matches the return value from standard_80387_constant_p. */ + +- /* Flexible array member is ignored. */ +- if (TYPE_MODE (type) == BLKmode +- && TREE_CODE (type) == ARRAY_TYPE +- && TYPE_SIZE (type) == NULL_TREE +- && TYPE_DOMAIN (type) != NULL_TREE +- && (TYPE_MAX_VALUE (TYPE_DOMAIN (type)) +- == NULL_TREE)) +- { +- static bool warned; ++rtx ++standard_80387_constant_rtx (int idx) ++{ ++ int i; + +- if (!warned && warn_psabi) +- { +- warned = true; +- inform (input_location, +- "the ABI of passing struct with" +- " a flexible array member has" +- " changed in GCC 4.4"); +- } +- continue; +- } +- num = classify_argument (TYPE_MODE (type), type, +- subclasses, +- (int_bit_position (field) +- + bit_offset) % 512); +- if (!num) +- return 0; +- pos = (int_bit_position (field) +- + (bit_offset % 64)) / 8 / 8; +- for (i = 0; i < num && (i + pos) < words; i++) +- classes[i + pos] +- = merge_classes (subclasses[i], classes[i + pos]); +- } +- } +- } +- break; ++ if (! ext_80387_constants_init) ++ init_ext_80387_constants (); + +- case ARRAY_TYPE: +- /* Arrays are handled as small records. */ +- { +- int num; +- num = classify_argument (TYPE_MODE (TREE_TYPE (type)), +- TREE_TYPE (type), subclasses, bit_offset); +- if (!num) +- return 0; ++ switch (idx) ++ { ++ case 3: ++ case 4: ++ case 5: ++ case 6: ++ case 7: ++ i = idx - 3; ++ break; + +- /* The partial classes are now full classes. */ +- if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4) +- subclasses[0] = X86_64_SSE_CLASS; +- if (subclasses[0] == X86_64_INTEGERSI_CLASS +- && !((bit_offset % 64) == 0 && bytes == 4)) +- subclasses[0] = X86_64_INTEGER_CLASS; ++ default: ++ gcc_unreachable (); ++ } + +- for (i = 0; i < words; i++) +- classes[i] = subclasses[i % num]; ++ return const_double_from_real_value (ext_80387_constants_table[i], ++ XFmode); ++} + +- break; +- } +- case UNION_TYPE: +- case QUAL_UNION_TYPE: +- /* Unions are similar to RECORD_TYPE but offset is always 0. +- */ +- for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field)) +- { +- if (TREE_CODE (field) == FIELD_DECL) +- { +- int num; ++/* Return 1 if X is all bits 0 and 2 if X is all bits 1 ++ in supported SSE/AVX vector mode. 
*/ + +- if (TREE_TYPE (field) == error_mark_node) +- continue; ++int ++standard_sse_constant_p (rtx x, machine_mode pred_mode) ++{ ++ machine_mode mode; + +- num = classify_argument (TYPE_MODE (TREE_TYPE (field)), +- TREE_TYPE (field), subclasses, +- bit_offset); +- if (!num) +- return 0; +- for (i = 0; i < num && i < words; i++) +- classes[i] = merge_classes (subclasses[i], classes[i]); +- } +- } +- break; ++ if (!TARGET_SSE) ++ return 0; + +- default: +- gcc_unreachable (); +- } ++ mode = GET_MODE (x); + +- if (words > 2) +- { +- /* When size > 16 bytes, if the first one isn't +- X86_64_SSE_CLASS or any other ones aren't +- X86_64_SSEUP_CLASS, everything should be passed in +- memory. */ +- if (classes[0] != X86_64_SSE_CLASS) +- return 0; ++ if (x == const0_rtx || const0_operand (x, mode)) ++ return 1; + +- for (i = 1; i < words; i++) +- if (classes[i] != X86_64_SSEUP_CLASS) +- return 0; +- } ++ if (x == constm1_rtx || vector_all_ones_operand (x, mode)) ++ { ++ /* VOIDmode integer constant, get mode from the predicate. */ ++ if (mode == VOIDmode) ++ mode = pred_mode; + +- /* Final merger cleanup. */ +- for (i = 0; i < words; i++) ++ switch (GET_MODE_SIZE (mode)) + { +- /* If one class is MEMORY, everything should be passed in +- memory. */ +- if (classes[i] == X86_64_MEMORY_CLASS) +- return 0; +- +- /* The X86_64_SSEUP_CLASS should be always preceded by +- X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */ +- if (classes[i] == X86_64_SSEUP_CLASS +- && classes[i - 1] != X86_64_SSE_CLASS +- && classes[i - 1] != X86_64_SSEUP_CLASS) +- { +- /* The first one should never be X86_64_SSEUP_CLASS. */ +- gcc_assert (i != 0); +- classes[i] = X86_64_SSE_CLASS; +- } +- +- /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS, +- everything should be passed in memory. */ +- if (classes[i] == X86_64_X87UP_CLASS +- && (classes[i - 1] != X86_64_X87_CLASS)) +- { +- static bool warned; +- +- /* The first one should never be X86_64_X87UP_CLASS. */ +- gcc_assert (i != 0); +- if (!warned && warn_psabi) +- { +- warned = true; +- inform (input_location, +- "the ABI of passing union with long double" +- " has changed in GCC 4.4"); +- } +- return 0; +- } ++ case 64: ++ if (TARGET_AVX512F) ++ return 2; ++ break; ++ case 32: ++ if (TARGET_AVX2) ++ return 2; ++ break; ++ case 16: ++ if (TARGET_SSE2) ++ return 2; ++ break; ++ case 0: ++ /* VOIDmode */ ++ gcc_unreachable (); ++ default: ++ break; + } +- return words; + } + +- /* Compute alignment needed. We align all types to natural boundaries with +- exception of XFmode that is aligned to 64bits. */ +- if (mode != VOIDmode && mode != BLKmode) +- { +- int mode_alignment = GET_MODE_BITSIZE (mode); +- +- if (mode == XFmode) +- mode_alignment = 128; +- else if (mode == XCmode) +- mode_alignment = 256; +- if (COMPLEX_MODE_P (mode)) +- mode_alignment /= 2; +- /* Misaligned fields are always returned in memory. */ +- if (bit_offset % mode_alignment) +- return 0; +- } ++ return 0; ++} + +- /* for V1xx modes, just use the base mode */ +- if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode +- && GET_MODE_UNIT_SIZE (mode) == bytes) +- mode = GET_MODE_INNER (mode); ++/* Return the opcode of the special instruction to be used to load ++ the constant operands[1] into operands[0]. */ + +- /* Classification of atomic types. 
*/ +- switch (mode) +- { +- case E_SDmode: +- case E_DDmode: +- classes[0] = X86_64_SSE_CLASS; +- return 1; +- case E_TDmode: +- classes[0] = X86_64_SSE_CLASS; +- classes[1] = X86_64_SSEUP_CLASS; +- return 2; +- case E_DImode: +- case E_SImode: +- case E_HImode: +- case E_QImode: +- case E_CSImode: +- case E_CHImode: +- case E_CQImode: +- { +- int size = bit_offset + (int) GET_MODE_BITSIZE (mode); ++const char * ++standard_sse_constant_opcode (rtx_insn *insn, rtx *operands) ++{ ++ machine_mode mode; ++ rtx x = operands[1]; + +- /* Analyze last 128 bits only. */ +- size = (size - 1) & 0x7f; ++ gcc_assert (TARGET_SSE); + +- if (size < 32) +- { +- classes[0] = X86_64_INTEGERSI_CLASS; +- return 1; +- } +- else if (size < 64) +- { +- classes[0] = X86_64_INTEGER_CLASS; +- return 1; +- } +- else if (size < 64+32) +- { +- classes[0] = X86_64_INTEGER_CLASS; +- classes[1] = X86_64_INTEGERSI_CLASS; +- return 2; +- } +- else if (size < 64+64) +- { +- classes[0] = classes[1] = X86_64_INTEGER_CLASS; +- return 2; +- } +- else +- gcc_unreachable (); +- } +- case E_CDImode: +- case E_TImode: +- classes[0] = classes[1] = X86_64_INTEGER_CLASS; +- return 2; +- case E_COImode: +- case E_OImode: +- /* OImode shouldn't be used directly. */ +- gcc_unreachable (); +- case E_CTImode: +- return 0; +- case E_SFmode: +- if (!(bit_offset % 64)) +- classes[0] = X86_64_SSESF_CLASS; +- else +- classes[0] = X86_64_SSE_CLASS; +- return 1; +- case E_DFmode: +- classes[0] = X86_64_SSEDF_CLASS; +- return 1; +- case E_XFmode: +- classes[0] = X86_64_X87_CLASS; +- classes[1] = X86_64_X87UP_CLASS; +- return 2; +- case E_TFmode: +- classes[0] = X86_64_SSE_CLASS; +- classes[1] = X86_64_SSEUP_CLASS; +- return 2; +- case E_SCmode: +- classes[0] = X86_64_SSE_CLASS; +- if (!(bit_offset % 64)) +- return 1; +- else ++ mode = GET_MODE (x); ++ ++ if (x == const0_rtx || const0_operand (x, mode)) ++ { ++ switch (get_attr_mode (insn)) + { +- static bool warned; ++ case MODE_TI: ++ if (!EXT_REX_SSE_REG_P (operands[0])) ++ return "%vpxor\t%0, %d0"; ++ /* FALLTHRU */ ++ case MODE_XI: ++ case MODE_OI: ++ if (EXT_REX_SSE_REG_P (operands[0])) ++ return (TARGET_AVX512VL ++ ? "vpxord\t%x0, %x0, %x0" ++ : "vpxord\t%g0, %g0, %g0"); ++ return "vpxor\t%x0, %x0, %x0"; + +- if (!warned && warn_psabi) +- { +- warned = true; +- inform (input_location, +- "the ABI of passing structure with complex float" +- " member has changed in GCC 4.4"); +- } +- classes[1] = X86_64_SSESF_CLASS; +- return 2; ++ case MODE_V2DF: ++ if (!EXT_REX_SSE_REG_P (operands[0])) ++ return "%vxorpd\t%0, %d0"; ++ /* FALLTHRU */ ++ case MODE_V8DF: ++ case MODE_V4DF: ++ if (!EXT_REX_SSE_REG_P (operands[0])) ++ return "vxorpd\t%x0, %x0, %x0"; ++ else if (TARGET_AVX512DQ) ++ return (TARGET_AVX512VL ++ ? "vxorpd\t%x0, %x0, %x0" ++ : "vxorpd\t%g0, %g0, %g0"); ++ else ++ return (TARGET_AVX512VL ++ ? "vpxorq\t%x0, %x0, %x0" ++ : "vpxorq\t%g0, %g0, %g0"); ++ ++ case MODE_V4SF: ++ if (!EXT_REX_SSE_REG_P (operands[0])) ++ return "%vxorps\t%0, %d0"; ++ /* FALLTHRU */ ++ case MODE_V16SF: ++ case MODE_V8SF: ++ if (!EXT_REX_SSE_REG_P (operands[0])) ++ return "vxorps\t%x0, %x0, %x0"; ++ else if (TARGET_AVX512DQ) ++ return (TARGET_AVX512VL ++ ? "vxorps\t%x0, %x0, %x0" ++ : "vxorps\t%g0, %g0, %g0"); ++ else ++ return (TARGET_AVX512VL ++ ? 
"vpxord\t%x0, %x0, %x0" ++ : "vpxord\t%g0, %g0, %g0"); ++ ++ default: ++ gcc_unreachable (); + } +- case E_DCmode: +- classes[0] = X86_64_SSEDF_CLASS; +- classes[1] = X86_64_SSEDF_CLASS; +- return 2; +- case E_XCmode: +- classes[0] = X86_64_COMPLEX_X87_CLASS; +- return 1; +- case E_TCmode: +- /* This modes is larger than 16 bytes. */ +- return 0; +- case E_V8SFmode: +- case E_V8SImode: +- case E_V32QImode: +- case E_V16HImode: +- case E_V4DFmode: +- case E_V4DImode: +- classes[0] = X86_64_SSE_CLASS; +- classes[1] = X86_64_SSEUP_CLASS; +- classes[2] = X86_64_SSEUP_CLASS; +- classes[3] = X86_64_SSEUP_CLASS; +- return 4; +- case E_V8DFmode: +- case E_V16SFmode: +- case E_V8DImode: +- case E_V16SImode: +- case E_V32HImode: +- case E_V64QImode: +- classes[0] = X86_64_SSE_CLASS; +- classes[1] = X86_64_SSEUP_CLASS; +- classes[2] = X86_64_SSEUP_CLASS; +- classes[3] = X86_64_SSEUP_CLASS; +- classes[4] = X86_64_SSEUP_CLASS; +- classes[5] = X86_64_SSEUP_CLASS; +- classes[6] = X86_64_SSEUP_CLASS; +- classes[7] = X86_64_SSEUP_CLASS; +- return 8; +- case E_V4SFmode: +- case E_V4SImode: +- case E_V16QImode: +- case E_V8HImode: +- case E_V2DFmode: +- case E_V2DImode: +- classes[0] = X86_64_SSE_CLASS; +- classes[1] = X86_64_SSEUP_CLASS; +- return 2; +- case E_V1TImode: +- case E_V1DImode: +- case E_V2SFmode: +- case E_V2SImode: +- case E_V4HImode: +- case E_V8QImode: +- classes[0] = X86_64_SSE_CLASS; +- return 1; +- case E_BLKmode: +- case E_VOIDmode: +- return 0; +- default: +- gcc_assert (VECTOR_MODE_P (mode)); ++ } ++ else if (x == constm1_rtx || vector_all_ones_operand (x, mode)) ++ { ++ enum attr_mode insn_mode = get_attr_mode (insn); ++ ++ switch (insn_mode) ++ { ++ case MODE_XI: ++ case MODE_V8DF: ++ case MODE_V16SF: ++ gcc_assert (TARGET_AVX512F); ++ return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}"; + +- if (bytes > 16) +- return 0; ++ case MODE_OI: ++ case MODE_V4DF: ++ case MODE_V8SF: ++ gcc_assert (TARGET_AVX2); ++ /* FALLTHRU */ ++ case MODE_TI: ++ case MODE_V2DF: ++ case MODE_V4SF: ++ gcc_assert (TARGET_SSE2); ++ if (!EXT_REX_SSE_REG_P (operands[0])) ++ return (TARGET_AVX ++ ? "vpcmpeqd\t%0, %0, %0" ++ : "pcmpeqd\t%0, %0"); ++ else if (TARGET_AVX512VL) ++ return "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}"; ++ else ++ return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}"; + +- gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT); ++ default: ++ gcc_unreachable (); ++ } ++ } + +- if (bit_offset + GET_MODE_BITSIZE (mode) <= 32) +- classes[0] = X86_64_INTEGERSI_CLASS; +- else +- classes[0] = X86_64_INTEGER_CLASS; +- classes[1] = X86_64_INTEGER_CLASS; +- return 1 + (bytes > 8); +- } ++ gcc_unreachable (); + } + +-/* Examine the argument and return set number of register required in each +- class. Return true iff parameter should be passed in memory. */ ++/* Returns true if INSN can be transformed from a memory load ++ to a supported FP constant load. 
*/ + +-static bool +-examine_argument (machine_mode mode, const_tree type, int in_return, +- int *int_nregs, int *sse_nregs) ++bool ++ix86_standard_x87sse_constant_load_p (const rtx_insn *insn, rtx dst) + { +- enum x86_64_reg_class regclass[MAX_CLASSES]; +- int n = classify_argument (mode, type, regclass, 0); ++ rtx src = find_constant_src (insn); + +- *int_nregs = 0; +- *sse_nregs = 0; ++ gcc_assert (REG_P (dst)); + +- if (!n) +- return true; +- for (n--; n >= 0; n--) +- switch (regclass[n]) +- { +- case X86_64_INTEGER_CLASS: +- case X86_64_INTEGERSI_CLASS: +- (*int_nregs)++; +- break; +- case X86_64_SSE_CLASS: +- case X86_64_SSESF_CLASS: +- case X86_64_SSEDF_CLASS: +- (*sse_nregs)++; +- break; +- case X86_64_NO_CLASS: +- case X86_64_SSEUP_CLASS: +- break; +- case X86_64_X87_CLASS: +- case X86_64_X87UP_CLASS: +- case X86_64_COMPLEX_X87_CLASS: +- if (!in_return) +- return true; +- break; +- case X86_64_MEMORY_CLASS: +- gcc_unreachable (); +- } ++ if (src == NULL ++ || (SSE_REGNO_P (REGNO (dst)) ++ && standard_sse_constant_p (src, GET_MODE (dst)) != 1) ++ || (STACK_REGNO_P (REGNO (dst)) ++ && standard_80387_constant_p (src) < 1)) ++ return false; + +- return false; ++ return true; + } + +-/* Construct container for the argument used by GCC interface. See +- FUNCTION_ARG for the detailed description. */ ++/* Returns true if OP contains a symbol reference */ + +-static rtx +-construct_container (machine_mode mode, machine_mode orig_mode, +- const_tree type, int in_return, int nintregs, int nsseregs, +- const int *intreg, int sse_regno) ++bool ++symbolic_reference_mentioned_p (rtx op) + { +- /* The following variables hold the static issued_error state. */ +- static bool issued_sse_arg_error; +- static bool issued_sse_ret_error; +- static bool issued_x87_ret_error; +- +- machine_mode tmpmode; +- int bytes +- = mode == BLKmode ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode); +- enum x86_64_reg_class regclass[MAX_CLASSES]; +- int n; ++ const char *fmt; + int i; +- int nexps = 0; +- int needed_sseregs, needed_intregs; +- rtx exp[MAX_CLASSES]; +- rtx ret; + +- n = classify_argument (mode, type, regclass, 0); +- if (!n) +- return NULL; +- if (examine_argument (mode, type, in_return, &needed_intregs, +- &needed_sseregs)) +- return NULL; +- if (needed_intregs > nintregs || needed_sseregs > nsseregs) +- return NULL; ++ if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF) ++ return true; + +- /* We allowed the user to turn off SSE for kernel mode. Don't crash if +- some less clueful developer tries to use floating-point anyway. */ +- if (needed_sseregs && !TARGET_SSE) ++ fmt = GET_RTX_FORMAT (GET_CODE (op)); ++ for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--) + { +- if (in_return) +- { +- if (!issued_sse_ret_error) +- { +- error ("SSE register return with SSE disabled"); +- issued_sse_ret_error = true; +- } +- } +- else if (!issued_sse_arg_error) ++ if (fmt[i] == 'E') + { +- error ("SSE register argument with SSE disabled"); +- issued_sse_arg_error = true; ++ int j; ++ ++ for (j = XVECLEN (op, i) - 1; j >= 0; j--) ++ if (symbolic_reference_mentioned_p (XVECEXP (op, i, j))) ++ return true; + } +- return NULL; ++ ++ else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i))) ++ return true; + } + +- /* Likewise, error if the ABI requires us to return values in the +- x87 registers and the user specified -mno-80387. 
*/ +- if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return) +- for (i = 0; i < n; i++) +- if (regclass[i] == X86_64_X87_CLASS +- || regclass[i] == X86_64_X87UP_CLASS +- || regclass[i] == X86_64_COMPLEX_X87_CLASS) +- { +- if (!issued_x87_ret_error) +- { +- error ("x87 register return with x87 disabled"); +- issued_x87_ret_error = true; +- } +- return NULL; +- } ++ return false; ++} + +- /* First construct simple cases. Avoid SCmode, since we want to use +- single register to pass this type. */ +- if (n == 1 && mode != SCmode) +- switch (regclass[0]) +- { +- case X86_64_INTEGER_CLASS: +- case X86_64_INTEGERSI_CLASS: +- return gen_rtx_REG (mode, intreg[0]); +- case X86_64_SSE_CLASS: +- case X86_64_SSESF_CLASS: +- case X86_64_SSEDF_CLASS: +- if (mode != BLKmode) +- return gen_reg_or_parallel (mode, orig_mode, +- GET_SSE_REGNO (sse_regno)); +- break; +- case X86_64_X87_CLASS: +- case X86_64_COMPLEX_X87_CLASS: +- return gen_rtx_REG (mode, FIRST_STACK_REG); +- case X86_64_NO_CLASS: +- /* Zero sized array, struct or class. */ +- return NULL; +- default: +- gcc_unreachable (); +- } +- if (n == 2 +- && regclass[0] == X86_64_SSE_CLASS +- && regclass[1] == X86_64_SSEUP_CLASS +- && mode != BLKmode) +- return gen_reg_or_parallel (mode, orig_mode, +- GET_SSE_REGNO (sse_regno)); +- if (n == 4 +- && regclass[0] == X86_64_SSE_CLASS +- && regclass[1] == X86_64_SSEUP_CLASS +- && regclass[2] == X86_64_SSEUP_CLASS +- && regclass[3] == X86_64_SSEUP_CLASS +- && mode != BLKmode) +- return gen_reg_or_parallel (mode, orig_mode, +- GET_SSE_REGNO (sse_regno)); +- if (n == 8 +- && regclass[0] == X86_64_SSE_CLASS +- && regclass[1] == X86_64_SSEUP_CLASS +- && regclass[2] == X86_64_SSEUP_CLASS +- && regclass[3] == X86_64_SSEUP_CLASS +- && regclass[4] == X86_64_SSEUP_CLASS +- && regclass[5] == X86_64_SSEUP_CLASS +- && regclass[6] == X86_64_SSEUP_CLASS +- && regclass[7] == X86_64_SSEUP_CLASS +- && mode != BLKmode) +- return gen_reg_or_parallel (mode, orig_mode, +- GET_SSE_REGNO (sse_regno)); +- if (n == 2 +- && regclass[0] == X86_64_X87_CLASS +- && regclass[1] == X86_64_X87UP_CLASS) +- return gen_rtx_REG (XFmode, FIRST_STACK_REG); ++/* Return true if it is appropriate to emit `ret' instructions in the ++ body of a function. Do this only if the epilogue is simple, needing a ++ couple of insns. Prior to reloading, we can't tell how many registers ++ must be saved, so return false then. Return false if there is no frame ++ marker to de-allocate. */ + +- if (n == 2 +- && regclass[0] == X86_64_INTEGER_CLASS +- && regclass[1] == X86_64_INTEGER_CLASS +- && (mode == CDImode || mode == TImode || mode == BLKmode) +- && intreg[0] + 1 == intreg[1]) +- { +- if (mode == BLKmode) +- { +- /* Use TImode for BLKmode values in 2 integer registers. */ +- exp[0] = gen_rtx_EXPR_LIST (VOIDmode, +- gen_rtx_REG (TImode, intreg[0]), +- GEN_INT (0)); +- ret = gen_rtx_PARALLEL (mode, rtvec_alloc (1)); +- XVECEXP (ret, 0, 0) = exp[0]; +- return ret; +- } +- else +- return gen_rtx_REG (mode, intreg[0]); +- } ++bool ++ix86_can_use_return_insn_p (void) ++{ ++ if (ix86_function_naked (current_function_decl)) ++ return false; + +- /* Otherwise figure out the entries of the PARALLEL. */ +- for (i = 0; i < n; i++) +- { +- int pos; ++ /* Don't use `ret' instruction in interrupt handler. */ ++ if (! 
reload_completed ++ || frame_pointer_needed ++ || cfun->machine->func_type != TYPE_NORMAL) ++ return 0; + +- switch (regclass[i]) +- { +- case X86_64_NO_CLASS: +- break; +- case X86_64_INTEGER_CLASS: +- case X86_64_INTEGERSI_CLASS: +- /* Merge TImodes on aligned occasions here too. */ +- if (i * 8 + 8 > bytes) +- { +- unsigned int tmpbits = (bytes - i * 8) * BITS_PER_UNIT; +- if (!int_mode_for_size (tmpbits, 0).exists (&tmpmode)) +- /* We've requested 24 bytes we +- don't have mode for. Use DImode. */ +- tmpmode = DImode; +- } +- else if (regclass[i] == X86_64_INTEGERSI_CLASS) +- tmpmode = SImode; +- else +- tmpmode = DImode; +- exp [nexps++] +- = gen_rtx_EXPR_LIST (VOIDmode, +- gen_rtx_REG (tmpmode, *intreg), +- GEN_INT (i*8)); +- intreg++; +- break; +- case X86_64_SSESF_CLASS: +- exp [nexps++] +- = gen_rtx_EXPR_LIST (VOIDmode, +- gen_rtx_REG (SFmode, +- GET_SSE_REGNO (sse_regno)), +- GEN_INT (i*8)); +- sse_regno++; +- break; +- case X86_64_SSEDF_CLASS: +- exp [nexps++] +- = gen_rtx_EXPR_LIST (VOIDmode, +- gen_rtx_REG (DFmode, +- GET_SSE_REGNO (sse_regno)), +- GEN_INT (i*8)); +- sse_regno++; +- break; +- case X86_64_SSE_CLASS: +- pos = i; +- switch (n) +- { +- case 1: +- tmpmode = DImode; +- break; +- case 2: +- if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS) +- { +- tmpmode = TImode; +- i++; +- } +- else +- tmpmode = DImode; +- break; +- case 4: +- gcc_assert (i == 0 +- && regclass[1] == X86_64_SSEUP_CLASS +- && regclass[2] == X86_64_SSEUP_CLASS +- && regclass[3] == X86_64_SSEUP_CLASS); +- tmpmode = OImode; +- i += 3; +- break; +- case 8: +- gcc_assert (i == 0 +- && regclass[1] == X86_64_SSEUP_CLASS +- && regclass[2] == X86_64_SSEUP_CLASS +- && regclass[3] == X86_64_SSEUP_CLASS +- && regclass[4] == X86_64_SSEUP_CLASS +- && regclass[5] == X86_64_SSEUP_CLASS +- && regclass[6] == X86_64_SSEUP_CLASS +- && regclass[7] == X86_64_SSEUP_CLASS); +- tmpmode = XImode; +- i += 7; +- break; +- default: +- gcc_unreachable (); +- } +- exp [nexps++] +- = gen_rtx_EXPR_LIST (VOIDmode, +- gen_rtx_REG (tmpmode, +- GET_SSE_REGNO (sse_regno)), +- GEN_INT (pos*8)); +- sse_regno++; +- break; +- default: +- gcc_unreachable (); +- } +- } ++ /* Don't allow more than 32k pop, since that's all we can do ++ with one instruction. */ ++ if (crtl->args.pops_args && crtl->args.size >= 32768) ++ return 0; + +- /* Empty aligned struct, union or class. */ +- if (nexps == 0) +- return NULL; ++ struct ix86_frame &frame = cfun->machine->frame; ++ return (frame.stack_pointer_offset == UNITS_PER_WORD ++ && (frame.nregs + frame.nsseregs) == 0); ++} ++ ++/* Value should be nonzero if functions must have frame pointers. ++ Zero means the frame pointer need not be set up (and parms may ++ be accessed via the stack pointer) in functions that seem suitable. */ + +- ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps)); +- for (i = 0; i < nexps; i++) +- XVECEXP (ret, 0, i) = exp [i]; +- return ret; ++static bool ++ix86_frame_pointer_required (void) ++{ ++ /* If we accessed previous frames, then the generated code expects ++ to be able to access the saved ebp value in our frame. */ ++ if (cfun->machine->accesses_prev_frame) ++ return true; ++ ++ /* Several x86 os'es need a frame pointer for other reasons, ++ usually pertaining to setjmp. */ ++ if (SUBTARGET_FRAME_POINTER_REQUIRED) ++ return true; ++ ++ /* For older 32-bit runtimes setjmp requires valid frame-pointer. */ ++ if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp) ++ return true; ++ ++ /* Win64 SEH, very large frames need a frame-pointer as maximum stack ++ allocation is 4GB. 
*/ ++ if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE) ++ return true; ++ ++ /* SSE saves require frame-pointer when stack is misaligned. */ ++ if (TARGET_64BIT_MS_ABI && ix86_incoming_stack_boundary < 128) ++ return true; ++ ++ /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER ++ turns off the frame pointer by default. Turn it back on now if ++ we've not got a leaf function. */ ++ if (TARGET_OMIT_LEAF_FRAME_POINTER ++ && (!crtl->is_leaf ++ || ix86_current_function_calls_tls_descriptor)) ++ return true; ++ ++ if (crtl->profile && !flag_fentry) ++ return true; ++ ++ return false; + } + +-/* Update the data in CUM to advance over an argument of mode MODE +- and data type TYPE. (TYPE is null for libcalls where that information +- may not be available.) ++/* Record that the current function accesses previous call frames. */ + +- Return a number of integer regsiters advanced over. */ ++void ++ix86_setup_frame_addresses (void) ++{ ++ cfun->machine->accesses_prev_frame = 1; ++} ++ ++#ifndef USE_HIDDEN_LINKONCE ++# if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0) ++# define USE_HIDDEN_LINKONCE 1 ++# else ++# define USE_HIDDEN_LINKONCE 0 ++# endif ++#endif + +-static int +-function_arg_advance_32 (CUMULATIVE_ARGS *cum, machine_mode mode, +- const_tree type, HOST_WIDE_INT bytes, +- HOST_WIDE_INT words) ++/* Label count for call and return thunks. It is used to make unique ++ labels in call and return thunks. */ ++static int indirectlabelno; ++ ++/* True if call thunk function is needed. */ ++static bool indirect_thunk_needed = false; ++ ++/* Bit masks of integer registers, which contain branch target, used ++ by call thunk functions. */ ++static int indirect_thunks_used; ++ ++/* True if return thunk function is needed. */ ++static bool indirect_return_needed = false; ++ ++/* True if return thunk function via CX is needed. */ ++static bool indirect_return_via_cx; ++ ++#ifndef INDIRECT_LABEL ++# define INDIRECT_LABEL "LIND" ++#endif ++ ++/* Indicate what prefix is needed for an indirect branch. */ ++enum indirect_thunk_prefix + { +- int res = 0; +- bool error_p = false; ++ indirect_thunk_prefix_none, ++ indirect_thunk_prefix_nt ++}; + +- if (TARGET_IAMCU) ++/* Return the prefix needed for an indirect branch INSN. */ ++ ++enum indirect_thunk_prefix ++indirect_thunk_need_prefix (rtx_insn *insn) ++{ ++ enum indirect_thunk_prefix need_prefix; ++ if ((cfun->machine->indirect_branch_type ++ == indirect_branch_thunk_extern) ++ && ix86_notrack_prefixed_insn_p (insn)) + { +- /* Intel MCU psABI passes scalars and aggregates no larger than 8 +- bytes in registers. */ +- if (!VECTOR_MODE_P (mode) && bytes <= 8) +- goto pass_in_reg; +- return res; ++ /* NOTRACK prefix is only used with external thunk so that it ++ can be properly updated to support CET at run-time. */ ++ need_prefix = indirect_thunk_prefix_nt; + } ++ else ++ need_prefix = indirect_thunk_prefix_none; ++ return need_prefix; ++} + +- switch (mode) +- { +- default: +- break; ++/* Fills in the label name that should be used for the indirect thunk. 
*/ + +- case E_BLKmode: +- if (bytes < 0) +- break; +- /* FALLTHRU */ ++static void ++indirect_thunk_name (char name[32], unsigned int regno, ++ enum indirect_thunk_prefix need_prefix, ++ bool ret_p) ++{ ++ if (regno != INVALID_REGNUM && regno != CX_REG && ret_p) ++ gcc_unreachable (); + +- case E_DImode: +- case E_SImode: +- case E_HImode: +- case E_QImode: +-pass_in_reg: +- cum->words += words; +- cum->nregs -= words; +- cum->regno += words; +- if (cum->nregs >= 0) +- res = words; +- if (cum->nregs <= 0) ++ if (USE_HIDDEN_LINKONCE) ++ { ++ const char *prefix; ++ ++ if (need_prefix == indirect_thunk_prefix_nt ++ && regno != INVALID_REGNUM) + { +- cum->nregs = 0; +- cfun->machine->arg_reg_available = false; +- cum->regno = 0; ++ /* NOTRACK prefix is only used with external thunk via ++ register so that NOTRACK prefix can be added to indirect ++ branch via register to support CET at run-time. */ ++ prefix = "_nt"; + } +- break; +- +- case E_OImode: +- /* OImode shouldn't be used directly. */ +- gcc_unreachable (); ++ else ++ prefix = ""; + +- case E_DFmode: +- if (cum->float_in_sse == -1) +- error_p = true; +- if (cum->float_in_sse < 2) +- break; +- /* FALLTHRU */ +- case E_SFmode: +- if (cum->float_in_sse == -1) +- error_p = true; +- if (cum->float_in_sse < 1) +- break; +- /* FALLTHRU */ ++ const char *ret = ret_p ? "return" : "indirect"; + +- case E_V8SFmode: +- case E_V8SImode: +- case E_V64QImode: +- case E_V32HImode: +- case E_V16SImode: +- case E_V8DImode: +- case E_V16SFmode: +- case E_V8DFmode: +- case E_V32QImode: +- case E_V16HImode: +- case E_V4DFmode: +- case E_V4DImode: +- case E_TImode: +- case E_V16QImode: +- case E_V8HImode: +- case E_V4SImode: +- case E_V2DImode: +- case E_V4SFmode: +- case E_V2DFmode: +- if (!type || !AGGREGATE_TYPE_P (type)) ++ if (regno != INVALID_REGNUM) + { +- cum->sse_words += words; +- cum->sse_nregs -= 1; +- cum->sse_regno += 1; +- if (cum->sse_nregs <= 0) +- { +- cum->sse_nregs = 0; +- cum->sse_regno = 0; +- } +- } +- break; +- +- case E_V8QImode: +- case E_V4HImode: +- case E_V2SImode: +- case E_V2SFmode: +- case E_V1TImode: +- case E_V1DImode: +- if (!type || !AGGREGATE_TYPE_P (type)) +- { +- cum->mmx_words += words; +- cum->mmx_nregs -= 1; +- cum->mmx_regno += 1; +- if (cum->mmx_nregs <= 0) +- { +- cum->mmx_nregs = 0; +- cum->mmx_regno = 0; +- } ++ const char *reg_prefix; ++ if (LEGACY_INT_REGNO_P (regno)) ++ reg_prefix = TARGET_64BIT ? "r" : "e"; ++ else ++ reg_prefix = ""; ++ sprintf (name, "__x86_%s_thunk%s_%s%s", ++ ret, prefix, reg_prefix, reg_names[regno]); + } +- break; ++ else ++ sprintf (name, "__x86_%s_thunk%s", ret, prefix); + } +- if (error_p) ++ else + { +- cum->float_in_sse = 0; +- error ("calling %qD with SSE calling convention without " +- "SSE/SSE2 enabled", cum->decl); +- sorry ("this is a GCC bug that can be worked around by adding " +- "attribute used to function called"); ++ if (regno != INVALID_REGNUM) ++ ASM_GENERATE_INTERNAL_LABEL (name, "LITR", regno); ++ else ++ { ++ if (ret_p) ++ ASM_GENERATE_INTERNAL_LABEL (name, "LRT", 0); ++ else ++ ASM_GENERATE_INTERNAL_LABEL (name, "LIT", 0); ++ } + } +- +- return res; + } + +-static int +-function_arg_advance_64 (CUMULATIVE_ARGS *cum, machine_mode mode, +- const_tree type, HOST_WIDE_INT words, bool named) +-{ +- int int_nregs, sse_nregs; ++/* Output a call and return thunk for indirect branch. If REGNO != -1, ++ the function address is in REGNO and the call and return thunk looks like: + +- /* Unnamed 512 and 256bit vector mode parameters are passed on stack. 
*/ +- if (!named && (VALID_AVX512F_REG_MODE (mode) +- || VALID_AVX256_REG_MODE (mode))) +- return 0; ++ call L2 ++ L1: ++ pause ++ lfence ++ jmp L1 ++ L2: ++ mov %REG, (%sp) ++ ret + +- if (!examine_argument (mode, type, 0, &int_nregs, &sse_nregs) +- && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs) +- { +- cum->nregs -= int_nregs; +- cum->sse_nregs -= sse_nregs; +- cum->regno += int_nregs; +- cum->sse_regno += sse_nregs; +- return int_nregs; +- } +- else +- { +- int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD; +- cum->words = ROUND_UP (cum->words, align); +- cum->words += words; +- return 0; +- } +-} ++ Otherwise, the function address is on the top of stack and the ++ call and return thunk looks like: + +-static int +-function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes, +- HOST_WIDE_INT words) ++ call L2 ++ L1: ++ pause ++ lfence ++ jmp L1 ++ L2: ++ lea WORD_SIZE(%sp), %sp ++ ret ++ */ ++ ++static void ++output_indirect_thunk (unsigned int regno) + { +- /* Otherwise, this should be passed indirect. */ +- gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8); ++ char indirectlabel1[32]; ++ char indirectlabel2[32]; + +- cum->words += words; +- if (cum->nregs > 0) +- { +- cum->nregs -= 1; +- cum->regno += 1; +- return 1; +- } +- return 0; +-} ++ ASM_GENERATE_INTERNAL_LABEL (indirectlabel1, INDIRECT_LABEL, ++ indirectlabelno++); ++ ASM_GENERATE_INTERNAL_LABEL (indirectlabel2, INDIRECT_LABEL, ++ indirectlabelno++); + +-/* Update the data in CUM to advance over an argument of mode MODE and +- data type TYPE. (TYPE is null for libcalls where that information +- may not be available.) */ ++ /* Call */ ++ fputs ("\tcall\t", asm_out_file); ++ assemble_name_raw (asm_out_file, indirectlabel2); ++ fputc ('\n', asm_out_file); + +-static void +-ix86_function_arg_advance (cumulative_args_t cum_v, machine_mode mode, +- const_tree type, bool named) +-{ +- CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); +- HOST_WIDE_INT bytes, words; +- int nregs; ++ ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel1); + +- /* The argument of interrupt handler is a special case and is +- handled in ix86_function_arg. */ +- if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL) +- return; ++ /* AMD and Intel CPUs prefer each a different instruction as loop filler. ++ Usage of both pause + lfence is compromise solution. */ ++ fprintf (asm_out_file, "\tpause\n\tlfence\n"); + +- if (mode == BLKmode) +- bytes = int_size_in_bytes (type); +- else +- bytes = GET_MODE_SIZE (mode); +- words = CEIL (bytes, UNITS_PER_WORD); ++ /* Jump. */ ++ fputs ("\tjmp\t", asm_out_file); ++ assemble_name_raw (asm_out_file, indirectlabel1); ++ fputc ('\n', asm_out_file); + +- if (type) +- mode = type_natural_mode (type, NULL, false); ++ ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel2); + +- if (TARGET_64BIT) ++ /* The above call insn pushed a word to stack. Adjust CFI info. */ ++ if (flag_asynchronous_unwind_tables && dwarf2out_do_frame ()) + { +- enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi; ++ if (! 
dwarf2out_do_cfi_asm ()) ++ { ++ dw_cfi_ref xcfi = ggc_cleared_alloc (); ++ xcfi->dw_cfi_opc = DW_CFA_advance_loc4; ++ xcfi->dw_cfi_oprnd1.dw_cfi_addr = ggc_strdup (indirectlabel2); ++ vec_safe_push (cfun->fde->dw_fde_cfi, xcfi); ++ } ++ dw_cfi_ref xcfi = ggc_cleared_alloc (); ++ xcfi->dw_cfi_opc = DW_CFA_def_cfa_offset; ++ xcfi->dw_cfi_oprnd1.dw_cfi_offset = 2 * UNITS_PER_WORD; ++ vec_safe_push (cfun->fde->dw_fde_cfi, xcfi); ++ dwarf2out_emit_cfi (xcfi); ++ } + +- if (call_abi == MS_ABI) +- nregs = function_arg_advance_ms_64 (cum, bytes, words); +- else +- nregs = function_arg_advance_64 (cum, mode, type, words, named); ++ if (regno != INVALID_REGNUM) ++ { ++ /* MOV. */ ++ rtx xops[2]; ++ xops[0] = gen_rtx_MEM (word_mode, stack_pointer_rtx); ++ xops[1] = gen_rtx_REG (word_mode, regno); ++ output_asm_insn ("mov\t{%1, %0|%0, %1}", xops); + } + else +- nregs = function_arg_advance_32 (cum, mode, type, bytes, words); +- +- if (!nregs) + { +- /* Track if there are outgoing arguments on stack. */ +- if (cum->caller) +- cfun->machine->outgoing_args_on_stack = true; ++ /* LEA. */ ++ rtx xops[2]; ++ xops[0] = stack_pointer_rtx; ++ xops[1] = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD); ++ output_asm_insn ("lea\t{%E1, %0|%0, %E1}", xops); + } +-} + +-/* Define where to put the arguments to a function. +- Value is zero to push the argument on the stack, +- or a hard register in which to store the argument. ++ fputs ("\tret\n", asm_out_file); ++} + +- MODE is the argument's machine mode. +- TYPE is the data type of the argument (as a tree). +- This is null for libcalls where that information may +- not be available. +- CUM is a variable of type CUMULATIVE_ARGS which gives info about +- the preceding args and about the function being called. +- NAMED is nonzero if this argument is a named parameter +- (otherwise it is an extra parameter matching an ellipsis). */ ++/* Output a funtion with a call and return thunk for indirect branch. ++ If REGNO != INVALID_REGNUM, the function address is in REGNO. ++ Otherwise, the function address is on the top of stack. Thunk is ++ used for function return if RET_P is true. */ + +-static rtx +-function_arg_32 (CUMULATIVE_ARGS *cum, machine_mode mode, +- machine_mode orig_mode, const_tree type, +- HOST_WIDE_INT bytes, HOST_WIDE_INT words) ++static void ++output_indirect_thunk_function (enum indirect_thunk_prefix need_prefix, ++ unsigned int regno, bool ret_p) + { +- bool error_p = false; ++ char name[32]; ++ tree decl; + +- /* Avoid the AL settings for the Unix64 ABI. */ +- if (mode == VOIDmode) +- return constm1_rtx; ++ /* Create __x86_indirect_thunk. */ ++ indirect_thunk_name (name, regno, need_prefix, ret_p); ++ decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL, ++ get_identifier (name), ++ build_function_type_list (void_type_node, NULL_TREE)); ++ DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL, ++ NULL_TREE, void_type_node); ++ TREE_PUBLIC (decl) = 1; ++ TREE_STATIC (decl) = 1; ++ DECL_IGNORED_P (decl) = 1; + +- if (TARGET_IAMCU) ++#if TARGET_MACHO ++ if (TARGET_MACHO) + { +- /* Intel MCU psABI passes scalars and aggregates no larger than 8 +- bytes in registers. 
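The added hunks above carry the retpoline machinery: indirect_thunk_name builds the __x86_{indirect,return}_thunk[_nt][_REG] symbol names and output_indirect_thunk emits the call/pause/lfence capture loop sketched in its comment; these are the helpers behind -mindirect-branch= and -mfunction-return=. A minimal naming sketch (illustrative C, not taken from the patch; the real code derives the register suffix from reg_names[] plus an "r"/"e" prefix via LEGACY_INT_REGNO_P):

    #include <stdio.h>

    /* Illustrative helper mirroring the sprintf formats used by
       indirect_thunk_name above in the USE_HIDDEN_LINKONCE case.  It takes
       a full register name directly instead of deriving the prefix.  */
    static void
    thunk_name (char name[32], const char *reg, int notrack, int ret_p)
    {
      const char *kind = ret_p ? "return" : "indirect";
      const char *prefix = notrack ? "_nt" : "";

      if (reg)
        snprintf (name, 32, "__x86_%s_thunk%s_%s", kind, prefix, reg);
      else
        snprintf (name, 32, "__x86_%s_thunk%s", kind, prefix);
    }

    int
    main (void)
    {
      char buf[32];

      thunk_name (buf, "rax", 0, 0);   /* __x86_indirect_thunk_rax */
      puts (buf);
      thunk_name (buf, NULL, 0, 0);    /* __x86_indirect_thunk (operand on stack) */
      puts (buf);
      thunk_name (buf, "rcx", 0, 1);   /* return thunk through %rcx */
      puts (buf);
      return 0;
    }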
*/ +- if (!VECTOR_MODE_P (mode) && bytes <= 8) +- goto pass_in_reg; +- return NULL_RTX; ++ switch_to_section (darwin_sections[picbase_thunk_section]); ++ fputs ("\t.weak_definition\t", asm_out_file); ++ assemble_name (asm_out_file, name); ++ fputs ("\n\t.private_extern\t", asm_out_file); ++ assemble_name (asm_out_file, name); ++ putc ('\n', asm_out_file); ++ ASM_OUTPUT_LABEL (asm_out_file, name); ++ DECL_WEAK (decl) = 1; + } ++ else ++#endif ++ if (USE_HIDDEN_LINKONCE) ++ { ++ cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl)); + +- switch (mode) +- { +- default: +- break; ++ targetm.asm_out.unique_section (decl, 0); ++ switch_to_section (get_named_section (decl, NULL, 0)); + +- case E_BLKmode: +- if (bytes < 0) +- break; +- /* FALLTHRU */ +- case E_DImode: +- case E_SImode: +- case E_HImode: +- case E_QImode: +-pass_in_reg: +- if (words <= cum->nregs) +- { +- int regno = cum->regno; ++ targetm.asm_out.globalize_label (asm_out_file, name); ++ fputs ("\t.hidden\t", asm_out_file); ++ assemble_name (asm_out_file, name); ++ putc ('\n', asm_out_file); ++ ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl); ++ } ++ else ++ { ++ switch_to_section (text_section); ++ ASM_OUTPUT_LABEL (asm_out_file, name); ++ } + +- /* Fastcall allocates the first two DWORD (SImode) or +- smaller arguments to ECX and EDX if it isn't an +- aggregate type . */ +- if (cum->fastcall) +- { +- if (mode == BLKmode +- || mode == DImode +- || (type && AGGREGATE_TYPE_P (type))) +- break; ++ DECL_INITIAL (decl) = make_node (BLOCK); ++ current_function_decl = decl; ++ allocate_struct_function (decl, false); ++ init_function_start (decl); ++ /* We're about to hide the function body from callees of final_* by ++ emitting it directly; tell them we're a thunk, if they care. */ ++ cfun->is_thunk = true; ++ first_function_block_is_cold = false; ++ /* Make sure unwind info is emitted for the thunk if needed. */ ++ final_start_function (emit_barrier (), asm_out_file, 1); + +- /* ECX not EAX is the first allocated register. */ +- if (regno == AX_REG) +- regno = CX_REG; +- } +- return gen_rtx_REG (mode, regno); +- } +- break; ++ output_indirect_thunk (regno); + +- case E_DFmode: +- if (cum->float_in_sse == -1) +- error_p = true; +- if (cum->float_in_sse < 2) +- break; +- /* FALLTHRU */ +- case E_SFmode: +- if (cum->float_in_sse == -1) +- error_p = true; +- if (cum->float_in_sse < 1) +- break; +- /* FALLTHRU */ +- case E_TImode: +- /* In 32bit, we pass TImode in xmm registers. */ +- case E_V16QImode: +- case E_V8HImode: +- case E_V4SImode: +- case E_V2DImode: +- case E_V4SFmode: +- case E_V2DFmode: +- if (!type || !AGGREGATE_TYPE_P (type)) +- { +- if (cum->sse_nregs) +- return gen_reg_or_parallel (mode, orig_mode, +- cum->sse_regno + FIRST_SSE_REG); +- } +- break; ++ final_end_function (); ++ init_insn_lengths (); ++ free_after_compilation (cfun); ++ set_cfun (NULL); ++ current_function_decl = NULL; ++} + +- case E_OImode: +- case E_XImode: +- /* OImode and XImode shouldn't be used directly. 
*/ +- gcc_unreachable (); ++static int pic_labels_used; + +- case E_V64QImode: +- case E_V32HImode: +- case E_V16SImode: +- case E_V8DImode: +- case E_V16SFmode: +- case E_V8DFmode: +- case E_V8SFmode: +- case E_V8SImode: +- case E_V32QImode: +- case E_V16HImode: +- case E_V4DFmode: +- case E_V4DImode: +- if (!type || !AGGREGATE_TYPE_P (type)) +- { +- if (cum->sse_nregs) +- return gen_reg_or_parallel (mode, orig_mode, +- cum->sse_regno + FIRST_SSE_REG); +- } +- break; ++/* Fills in the label name that should be used for a pc thunk for ++ the given register. */ + +- case E_V8QImode: +- case E_V4HImode: +- case E_V2SImode: +- case E_V2SFmode: +- case E_V1TImode: +- case E_V1DImode: +- if (!type || !AGGREGATE_TYPE_P (type)) +- { +- if (cum->mmx_nregs) +- return gen_reg_or_parallel (mode, orig_mode, +- cum->mmx_regno + FIRST_MMX_REG); +- } +- break; +- } +- if (error_p) +- { +- cum->float_in_sse = 0; +- error ("calling %qD with SSE calling convention without " +- "SSE/SSE2 enabled", cum->decl); +- sorry ("this is a GCC bug that can be worked around by adding " +- "attribute used to function called"); +- } ++static void ++get_pc_thunk_name (char name[32], unsigned int regno) ++{ ++ gcc_assert (!TARGET_64BIT); + +- return NULL_RTX; ++ if (USE_HIDDEN_LINKONCE) ++ sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]); ++ else ++ ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno); + } + +-static rtx +-function_arg_64 (const CUMULATIVE_ARGS *cum, machine_mode mode, +- machine_mode orig_mode, const_tree type, bool named) ++ ++/* This function generates code for -fpic that loads %ebx with ++ the return address of the caller and then returns. */ ++ ++static void ++ix86_code_end (void) + { +- /* Handle a hidden AL argument containing number of registers +- for varargs x86-64 functions. */ +- if (mode == VOIDmode) +- return GEN_INT (cum->maybe_vaarg +- ? (cum->sse_nregs < 0 +- ? X86_64_SSE_REGPARM_MAX +- : cum->sse_regno) +- : -1); ++ rtx xops[2]; ++ unsigned int regno; + +- switch (mode) +- { +- default: +- break; ++ if (indirect_return_needed) ++ output_indirect_thunk_function (indirect_thunk_prefix_none, ++ INVALID_REGNUM, true); ++ if (indirect_return_via_cx) ++ output_indirect_thunk_function (indirect_thunk_prefix_none, ++ CX_REG, true); ++ if (indirect_thunk_needed) ++ output_indirect_thunk_function (indirect_thunk_prefix_none, ++ INVALID_REGNUM, false); + +- case E_V8SFmode: +- case E_V8SImode: +- case E_V32QImode: +- case E_V16HImode: +- case E_V4DFmode: +- case E_V4DImode: +- case E_V16SFmode: +- case E_V16SImode: +- case E_V64QImode: +- case E_V32HImode: +- case E_V8DFmode: +- case E_V8DImode: +- /* Unnamed 256 and 512bit vector mode parameters are passed on stack. 
*/ +- if (!named) +- return NULL; +- break; ++ for (regno = FIRST_REX_INT_REG; regno <= LAST_REX_INT_REG; regno++) ++ { ++ unsigned int i = regno - FIRST_REX_INT_REG + LAST_INT_REG + 1; ++ if ((indirect_thunks_used & (1 << i))) ++ output_indirect_thunk_function (indirect_thunk_prefix_none, ++ regno, false); + } + +- return construct_container (mode, orig_mode, type, 0, cum->nregs, +- cum->sse_nregs, +- &x86_64_int_parameter_registers [cum->regno], +- cum->sse_regno); +-} ++ for (regno = FIRST_INT_REG; regno <= LAST_INT_REG; regno++) ++ { ++ char name[32]; ++ tree decl; + +-static rtx +-function_arg_ms_64 (const CUMULATIVE_ARGS *cum, machine_mode mode, +- machine_mode orig_mode, bool named, const_tree type, +- HOST_WIDE_INT bytes) +-{ +- unsigned int regno; ++ if ((indirect_thunks_used & (1 << regno))) ++ output_indirect_thunk_function (indirect_thunk_prefix_none, ++ regno, false); + +- /* We need to add clobber for MS_ABI->SYSV ABI calls in expand_call. +- We use value of -2 to specify that current function call is MSABI. */ +- if (mode == VOIDmode) +- return GEN_INT (-2); ++ if (!(pic_labels_used & (1 << regno))) ++ continue; + +- /* If we've run out of registers, it goes on the stack. */ +- if (cum->nregs == 0) +- return NULL_RTX; ++ get_pc_thunk_name (name, regno); + +- regno = x86_64_ms_abi_int_parameter_registers[cum->regno]; ++ decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL, ++ get_identifier (name), ++ build_function_type_list (void_type_node, NULL_TREE)); ++ DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL, ++ NULL_TREE, void_type_node); ++ TREE_PUBLIC (decl) = 1; ++ TREE_STATIC (decl) = 1; ++ DECL_IGNORED_P (decl) = 1; + +- /* Only floating point modes are passed in anything but integer regs. */ +- if (TARGET_SSE && (mode == SFmode || mode == DFmode)) +- { +- if (named) ++#if TARGET_MACHO ++ if (TARGET_MACHO) + { +- if (type == NULL_TREE || !AGGREGATE_TYPE_P (type)) +- regno = cum->regno + FIRST_SSE_REG; ++ switch_to_section (darwin_sections[picbase_thunk_section]); ++ fputs ("\t.weak_definition\t", asm_out_file); ++ assemble_name (asm_out_file, name); ++ fputs ("\n\t.private_extern\t", asm_out_file); ++ assemble_name (asm_out_file, name); ++ putc ('\n', asm_out_file); ++ ASM_OUTPUT_LABEL (asm_out_file, name); ++ DECL_WEAK (decl) = 1; + } + else ++#endif ++ if (USE_HIDDEN_LINKONCE) + { +- rtx t1, t2; ++ cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl)); + +- /* Unnamed floating parameters are passed in both the +- SSE and integer registers. */ +- t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG); +- t2 = gen_rtx_REG (mode, regno); +- t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx); +- t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx); +- return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2)); +- } +- } +- /* Handle aggregated types passed in register. */ +- if (orig_mode == BLKmode) +- { +- if (bytes > 0 && bytes <= 8) +- mode = (bytes > 4 ? DImode : SImode); +- if (mode == BLKmode) +- mode = DImode; +- } +- +- return gen_reg_or_parallel (mode, orig_mode, regno); +-} +- +-/* Return where to put the arguments to a function. +- Return zero to push the argument on the stack, or a hard register in which to store the argument. +- +- MODE is the argument's machine mode. TYPE is the data type of the +- argument. It is null for libcalls where that information may not be +- available. CUM gives information about the preceding args and about +- the function being called. 
NAMED is nonzero if this argument is a +- named parameter (otherwise it is an extra parameter matching an +- ellipsis). */ +- +-static rtx +-ix86_function_arg (cumulative_args_t cum_v, machine_mode omode, +- const_tree type, bool named) +-{ +- CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); +- machine_mode mode = omode; +- HOST_WIDE_INT bytes, words; +- rtx arg; ++ targetm.asm_out.unique_section (decl, 0); ++ switch_to_section (get_named_section (decl, NULL, 0)); + +- if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL) +- { +- gcc_assert (type != NULL_TREE); +- if (POINTER_TYPE_P (type)) +- { +- /* This is the pointer argument. */ +- gcc_assert (TYPE_MODE (type) == Pmode); +- /* It is at -WORD(AP) in the current frame in interrupt and +- exception handlers. */ +- arg = plus_constant (Pmode, arg_pointer_rtx, -UNITS_PER_WORD); ++ targetm.asm_out.globalize_label (asm_out_file, name); ++ fputs ("\t.hidden\t", asm_out_file); ++ assemble_name (asm_out_file, name); ++ putc ('\n', asm_out_file); ++ ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl); + } + else + { +- gcc_assert (cfun->machine->func_type == TYPE_EXCEPTION +- && TREE_CODE (type) == INTEGER_TYPE +- && TYPE_MODE (type) == word_mode); +- /* The error code is the word-mode integer argument at +- -2 * WORD(AP) in the current frame of the exception +- handler. */ +- arg = gen_rtx_MEM (word_mode, +- plus_constant (Pmode, +- arg_pointer_rtx, +- -2 * UNITS_PER_WORD)); ++ switch_to_section (text_section); ++ ASM_OUTPUT_LABEL (asm_out_file, name); + } +- return arg; +- } + +- if (mode == BLKmode) +- bytes = int_size_in_bytes (type); +- else +- bytes = GET_MODE_SIZE (mode); +- words = CEIL (bytes, UNITS_PER_WORD); ++ DECL_INITIAL (decl) = make_node (BLOCK); ++ current_function_decl = decl; ++ allocate_struct_function (decl, false); ++ init_function_start (decl); ++ /* We're about to hide the function body from callees of final_* by ++ emitting it directly; tell them we're a thunk, if they care. */ ++ cfun->is_thunk = true; ++ first_function_block_is_cold = false; ++ /* Make sure unwind info is emitted for the thunk if needed. */ ++ final_start_function (emit_barrier (), asm_out_file, 1); + +- /* To simplify the code below, represent vector types with a vector mode +- even if MMX/SSE are not active. */ +- if (type && TREE_CODE (type) == VECTOR_TYPE) +- mode = type_natural_mode (type, cum, false); ++ /* Pad stack IP move with 4 instructions (two NOPs count ++ as one instruction). */ ++ if (TARGET_PAD_SHORT_FUNCTION) ++ { ++ int i = 8; + +- if (TARGET_64BIT) +- { +- enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi; ++ while (i--) ++ fputs ("\tnop\n", asm_out_file); ++ } + +- if (call_abi == MS_ABI) +- arg = function_arg_ms_64 (cum, mode, omode, named, type, bytes); +- else +- arg = function_arg_64 (cum, mode, omode, type, named); ++ xops[0] = gen_rtx_REG (Pmode, regno); ++ xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx); ++ output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops); ++ output_asm_insn ("%!ret", NULL); ++ final_end_function (); ++ init_insn_lengths (); ++ free_after_compilation (cfun); ++ set_cfun (NULL); ++ current_function_decl = NULL; + } +- else +- arg = function_arg_32 (cum, mode, omode, type, bytes, words); + +- /* Track if there are outgoing arguments on stack. 
*/ +- if (arg == NULL_RTX && cum->caller) +- cfun->machine->outgoing_args_on_stack = true; +- +- return arg; ++ if (flag_split_stack) ++ file_end_indicate_split_stack (); + } + +-/* A C expression that indicates when an argument must be passed by +- reference. If nonzero for an argument, a copy of that argument is +- made in memory and a pointer to the argument is passed instead of +- the argument itself. The pointer is passed in whatever way is +- appropriate for passing a pointer to that type. */ ++/* Emit code for the SET_GOT patterns. */ + +-static bool +-ix86_pass_by_reference (cumulative_args_t cum_v, machine_mode mode, +- const_tree type, bool) ++const char * ++output_set_got (rtx dest, rtx label) + { +- CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); ++ rtx xops[3]; + +- if (TARGET_64BIT) ++ xops[0] = dest; ++ ++ if (TARGET_VXWORKS_RTP && flag_pic) + { +- enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi; ++ /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */ ++ xops[2] = gen_rtx_MEM (Pmode, ++ gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE)); ++ output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops); + +- /* See Windows x64 Software Convention. */ +- if (call_abi == MS_ABI) +- { +- HOST_WIDE_INT msize = GET_MODE_SIZE (mode); ++ /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register. ++ Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as ++ an unadorned address. */ ++ xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX); ++ SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL; ++ output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops); ++ return ""; ++ } + +- if (type) +- { +- /* Arrays are passed by reference. */ +- if (TREE_CODE (type) == ARRAY_TYPE) +- return true; ++ xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME); + +- if (RECORD_OR_UNION_TYPE_P (type)) +- { +- /* Structs/unions of sizes other than 8, 16, 32, or 64 bits +- are passed by reference. */ +- msize = int_size_in_bytes (type); +- } +- } ++ if (flag_pic) ++ { ++ char name[32]; ++ get_pc_thunk_name (name, REGNO (dest)); ++ pic_labels_used |= 1 << REGNO (dest); + +- /* __m128 is passed by reference. */ +- return msize != 1 && msize != 2 && msize != 4 && msize != 8; +- } +- else if (type && int_size_in_bytes (type) == -1) +- return true; ++ xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name)); ++ xops[2] = gen_rtx_MEM (QImode, xops[2]); ++ output_asm_insn ("%!call\t%X2", xops); ++ ++#if TARGET_MACHO ++ /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here. ++ This is what will be referenced by the Mach-O PIC subsystem. */ ++ if (machopic_should_output_picbase_label () || !label) ++ ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME); ++ ++ /* When we are restoring the pic base at the site of a nonlocal label, ++ and we decided to emit the pic base above, we will still output a ++ local label used for calculating the correction offset (even though ++ the offset will be 0 in that case). */ ++ if (label) ++ targetm.asm_out.internal_label (asm_out_file, "L", ++ CODE_LABEL_NUMBER (label)); ++#endif ++ } ++ else ++ { ++ if (TARGET_MACHO) ++ /* We don't need a pic base, we're not producing pic. */ ++ gcc_unreachable (); ++ ++ xops[2] = gen_rtx_LABEL_REF (Pmode, label ? 
label : gen_label_rtx ()); ++ output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops); ++ targetm.asm_out.internal_label (asm_out_file, "L", ++ CODE_LABEL_NUMBER (XEXP (xops[2], 0))); + } + +- return false; ++ if (!TARGET_MACHO) ++ output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops); ++ ++ return ""; + } + +-/* Return true when TYPE should be 128bit aligned for 32bit argument +- passing ABI. XXX: This function is obsolete and is only used for +- checking psABI compatibility with previous versions of GCC. */ ++/* Generate an "push" pattern for input ARG. */ + +-static bool +-ix86_compat_aligned_value_p (const_tree type) ++rtx ++gen_push (rtx arg) + { +- machine_mode mode = TYPE_MODE (type); +- if (((TARGET_SSE && SSE_REG_MODE_P (mode)) +- || mode == TDmode +- || mode == TFmode +- || mode == TCmode) +- && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128)) +- return true; +- if (TYPE_ALIGN (type) < 128) +- return false; ++ struct machine_function *m = cfun->machine; + +- if (AGGREGATE_TYPE_P (type)) +- { +- /* Walk the aggregates recursively. */ +- switch (TREE_CODE (type)) +- { +- case RECORD_TYPE: +- case UNION_TYPE: +- case QUAL_UNION_TYPE: +- { +- tree field; ++ if (m->fs.cfa_reg == stack_pointer_rtx) ++ m->fs.cfa_offset += UNITS_PER_WORD; ++ m->fs.sp_offset += UNITS_PER_WORD; + +- /* Walk all the structure fields. */ +- for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field)) +- { +- if (TREE_CODE (field) == FIELD_DECL +- && ix86_compat_aligned_value_p (TREE_TYPE (field))) +- return true; +- } +- break; +- } ++ if (REG_P (arg) && GET_MODE (arg) != word_mode) ++ arg = gen_rtx_REG (word_mode, REGNO (arg)); + +- case ARRAY_TYPE: +- /* Just for use if some languages passes arrays by value. */ +- if (ix86_compat_aligned_value_p (TREE_TYPE (type))) +- return true; +- break; ++ return gen_rtx_SET (gen_rtx_MEM (word_mode, ++ gen_rtx_PRE_DEC (Pmode, ++ stack_pointer_rtx)), ++ arg); ++} + +- default: +- gcc_unreachable (); +- } +- } +- return false; ++/* Generate an "pop" pattern for input ARG. */ ++ ++rtx ++gen_pop (rtx arg) ++{ ++ if (REG_P (arg) && GET_MODE (arg) != word_mode) ++ arg = gen_rtx_REG (word_mode, REGNO (arg)); ++ ++ return gen_rtx_SET (arg, ++ gen_rtx_MEM (word_mode, ++ gen_rtx_POST_INC (Pmode, ++ stack_pointer_rtx))); + } + +-/* Return the alignment boundary for MODE and TYPE with alignment ALIGN. +- XXX: This function is obsolete and is only used for checking psABI +- compatibility with previous versions of GCC. */ ++/* Return >= 0 if there is an unused call-clobbered register available ++ for the entire function. */ + + static unsigned int +-ix86_compat_function_arg_boundary (machine_mode mode, +- const_tree type, unsigned int align) ++ix86_select_alt_pic_regnum (void) + { +- /* In 32bit, only _Decimal128 and __float128 are aligned to their +- natural boundaries. */ +- if (!TARGET_64BIT && mode != TDmode && mode != TFmode) +- { +- /* i386 ABI defines all arguments to be 4 byte aligned. We have to +- make an exception for SSE modes since these require 128bit +- alignment. +- +- The handling here differs from field_alignment. ICC aligns MMX +- arguments to 4 byte boundaries, while structure fields are aligned +- to 8 byte boundaries. */ +- if (!type) +- { +- if (!(TARGET_SSE && SSE_REG_MODE_P (mode))) +- align = PARM_BOUNDARY; +- } ++ if (ix86_use_pseudo_pic_reg ()) ++ return INVALID_REGNUM; ++ ++ if (crtl->is_leaf ++ && !crtl->profile ++ && !ix86_current_function_calls_tls_descriptor) ++ { ++ int i, drap; ++ /* Can't use the same register for both PIC and DRAP. 
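For context on get_pc_thunk_name, the per-register thunks emitted in ix86_code_end, and output_set_got above: on 32-bit PIC the GOT pointer is materialized by calling a one-instruction thunk and then adding _GLOBAL_OFFSET_TABLE_ to the result. A small sketch of the naming, with the emitted sequence paraphrased from the asm templates in a comment (assumptions: USE_HIDDEN_LINKONCE naming, AT&T syntax, not literal compiler output):

    #include <stdio.h>

    /* Sketch of the 32-bit PIC helper naming from get_pc_thunk_name above.
       Per the output templates in ix86_code_end and output_set_got, the
       thunk body reduces to
           mov  (%esp), %REG
           ret
       and the call site that sets up the GOT pointer pairs it with
           call __x86.get_pc_thunk.REG
           add  $_GLOBAL_OFFSET_TABLE_, %REG                              */
    int
    main (void)
    {
      static const char *const regs[] = { "ax", "cx", "dx", "bx" };
      char name[32];

      for (unsigned int i = 0; i < sizeof regs / sizeof regs[0]; i++)
        {
          snprintf (name, sizeof name, "__x86.get_pc_thunk.%s", regs[i]);
          puts (name);
        }
      return 0;
    }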
*/ ++ if (crtl->drap_reg) ++ drap = REGNO (crtl->drap_reg); + else +- { +- if (!ix86_compat_aligned_value_p (type)) +- align = PARM_BOUNDARY; +- } ++ drap = -1; ++ for (i = 2; i >= 0; --i) ++ if (i != drap && !df_regs_ever_live_p (i)) ++ return i; + } +- if (align > BIGGEST_ALIGNMENT) +- align = BIGGEST_ALIGNMENT; +- return align; ++ ++ return INVALID_REGNUM; + } + +-/* Return true when TYPE should be 128bit aligned for 32bit argument +- passing ABI. */ ++/* Return true if REGNO is used by the epilogue. */ + +-static bool +-ix86_contains_aligned_value_p (const_tree type) ++bool ++ix86_epilogue_uses (int regno) + { +- machine_mode mode = TYPE_MODE (type); +- +- if (mode == XFmode || mode == XCmode) +- return false; +- +- if (TYPE_ALIGN (type) < 128) +- return false; +- +- if (AGGREGATE_TYPE_P (type)) +- { +- /* Walk the aggregates recursively. */ +- switch (TREE_CODE (type)) +- { +- case RECORD_TYPE: +- case UNION_TYPE: +- case QUAL_UNION_TYPE: +- { +- tree field; +- +- /* Walk all the structure fields. */ +- for (field = TYPE_FIELDS (type); +- field; +- field = DECL_CHAIN (field)) +- { +- if (TREE_CODE (field) == FIELD_DECL +- && ix86_contains_aligned_value_p (TREE_TYPE (field))) +- return true; +- } +- break; +- } +- +- case ARRAY_TYPE: +- /* Just for use if some languages passes arrays by value. */ +- if (ix86_contains_aligned_value_p (TREE_TYPE (type))) +- return true; +- break; ++ /* If there are no caller-saved registers, we preserve all registers, ++ except for MMX and x87 registers which aren't supported when saving ++ and restoring registers. Don't explicitly save SP register since ++ it is always preserved. */ ++ return (epilogue_completed ++ && cfun->machine->no_caller_saved_registers ++ && !fixed_regs[regno] ++ && !STACK_REGNO_P (regno) ++ && !MMX_REGNO_P (regno)); ++} + +- default: +- gcc_unreachable (); +- } +- } +- else +- return TYPE_ALIGN (type) >= 128; ++/* Return nonzero if register REGNO can be used as a scratch register ++ in peephole2. */ + +- return false; ++static bool ++ix86_hard_regno_scratch_ok (unsigned int regno) ++{ ++ /* If there are no caller-saved registers, we can't use any register ++ as a scratch register after epilogue and use REGNO as scratch ++ register only if it has been used before to avoid saving and ++ restoring it. */ ++ return (!cfun->machine->no_caller_saved_registers ++ || (!epilogue_completed ++ && df_regs_ever_live_p (regno))); + } + +-/* Gives the alignment boundary, in bits, of an argument with the +- specified mode and type. */ ++/* Return TRUE if we need to save REGNO. */ + +-static unsigned int +-ix86_function_arg_boundary (machine_mode mode, const_tree type) ++bool ++ix86_save_reg (unsigned int regno, bool maybe_eh_return, bool ignore_outlined) + { +- unsigned int align; +- if (type) ++ /* If there are no caller-saved registers, we preserve all registers, ++ except for MMX and x87 registers which aren't supported when saving ++ and restoring registers. Don't explicitly save SP register since ++ it is always preserved. */ ++ if (cfun->machine->no_caller_saved_registers) + { +- /* Since the main variant type is used for call, we convert it to +- the main variant type. */ +- type = TYPE_MAIN_VARIANT (type); +- align = TYPE_ALIGN (type); +- if (TYPE_EMPTY_P (type)) +- return PARM_BOUNDARY; ++ /* Don't preserve registers used for function return value. 
*/ ++ rtx reg = crtl->return_rtx; ++ if (reg) ++ { ++ unsigned int i = REGNO (reg); ++ unsigned int nregs = REG_NREGS (reg); ++ while (nregs-- > 0) ++ if ((i + nregs) == regno) ++ return false; ++ } ++ ++ return (df_regs_ever_live_p (regno) ++ && !fixed_regs[regno] ++ && !STACK_REGNO_P (regno) ++ && !MMX_REGNO_P (regno) ++ && (regno != HARD_FRAME_POINTER_REGNUM ++ || !frame_pointer_needed)); + } +- else +- align = GET_MODE_ALIGNMENT (mode); +- if (align < PARM_BOUNDARY) +- align = PARM_BOUNDARY; +- else +- { +- static bool warned; +- unsigned int saved_align = align; + +- if (!TARGET_64BIT) ++ if (regno == REAL_PIC_OFFSET_TABLE_REGNUM ++ && pic_offset_table_rtx) ++ { ++ if (ix86_use_pseudo_pic_reg ()) + { +- /* i386 ABI defines XFmode arguments to be 4 byte aligned. */ +- if (!type) +- { +- if (mode == XFmode || mode == XCmode) +- align = PARM_BOUNDARY; +- } +- else if (!ix86_contains_aligned_value_p (type)) +- align = PARM_BOUNDARY; +- +- if (align < 128) +- align = PARM_BOUNDARY; ++ /* REAL_PIC_OFFSET_TABLE_REGNUM used by call to ++ _mcount in prologue. */ ++ if (!TARGET_64BIT && flag_pic && crtl->profile) ++ return true; + } ++ else if (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM) ++ || crtl->profile ++ || crtl->calls_eh_return ++ || crtl->uses_const_pool ++ || cfun->has_nonlocal_label) ++ return ix86_select_alt_pic_regnum () == INVALID_REGNUM; ++ } + +- if (warn_psabi +- && !warned +- && align != ix86_compat_function_arg_boundary (mode, type, +- saved_align)) ++ if (crtl->calls_eh_return && maybe_eh_return) ++ { ++ unsigned i; ++ for (i = 0; ; i++) + { +- warned = true; +- inform (input_location, +- "the ABI for passing parameters with %d-byte" +- " alignment has changed in GCC 4.6", +- align / BITS_PER_UNIT); ++ unsigned test = EH_RETURN_DATA_REGNO (i); ++ if (test == INVALID_REGNUM) ++ break; ++ if (test == regno) ++ return true; + } + } + +- return align; +-} +- +-/* Return true if N is a possible register number of function value. */ +- +-static bool +-ix86_function_value_regno_p (const unsigned int regno) +-{ +- switch (regno) ++ if (ignore_outlined && cfun->machine->call_ms2sysv) + { +- case AX_REG: +- return true; +- case DX_REG: +- return (!TARGET_64BIT || ix86_cfun_abi () != MS_ABI); +- case DI_REG: +- case SI_REG: +- return TARGET_64BIT && ix86_cfun_abi () != MS_ABI; +- +- /* Complex values are returned in %st(0)/%st(1) pair. */ +- case ST0_REG: +- case ST1_REG: +- /* TODO: The function should depend on current function ABI but +- builtins.c would need updating then. Therefore we use the +- default ABI. */ +- if (TARGET_64BIT && ix86_cfun_abi () == MS_ABI) +- return false; +- return TARGET_FLOAT_RETURNS_IN_80387; +- +- /* Complex values are returned in %xmm0/%xmm1 pair. */ +- case XMM0_REG: +- case XMM1_REG: +- return TARGET_SSE; +- +- case MM0_REG: +- if (TARGET_MACHO || TARGET_64BIT) ++ unsigned count = cfun->machine->call_ms2sysv_extra_regs ++ + xlogue_layout::MIN_REGS; ++ if (xlogue_layout::is_stub_managed_reg (regno, count)) + return false; +- return TARGET_MMX; + } + +- return false; ++ if (crtl->drap_reg ++ && regno == REGNO (crtl->drap_reg) ++ && !cfun->machine->no_drap_save_restore) ++ return true; ++ ++ return (df_regs_ever_live_p (regno) ++ && !call_used_or_fixed_reg_p (regno) ++ && !fixed_regs[regno] ++ && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed)); + } + +-/* Define how to find the value returned by a function. +- VALTYPE is the data type of the value (as a tree). 
+- If the precise function being called is known, FUNC is its FUNCTION_DECL; +- otherwise, FUNC is 0. */ ++/* Return number of saved general prupose registers. */ + +-static rtx +-function_value_32 (machine_mode orig_mode, machine_mode mode, +- const_tree fntype, const_tree fn) ++static int ++ix86_nsaved_regs (void) + { +- unsigned int regno; ++ int nregs = 0; ++ int regno; + +- /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where +- we normally prevent this case when mmx is not available. However +- some ABIs may require the result to be returned like DImode. */ +- if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8) +- regno = FIRST_MMX_REG; +- +- /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where +- we prevent this case when sse is not available. However some ABIs +- may require the result to be returned like integer TImode. */ +- else if (mode == TImode +- || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16)) +- regno = FIRST_SSE_REG; +- +- /* 32-byte vector modes in %ymm0. */ +- else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32) +- regno = FIRST_SSE_REG; ++ for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) ++ if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true)) ++ nregs ++; ++ return nregs; ++} + +- /* 64-byte vector modes in %zmm0. */ +- else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64) +- regno = FIRST_SSE_REG; ++/* Return number of saved SSE registers. */ + +- /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */ +- else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387) +- regno = FIRST_FLOAT_REG; +- else +- /* Most things go in %eax. */ +- regno = AX_REG; ++static int ++ix86_nsaved_sseregs (void) ++{ ++ int nregs = 0; ++ int regno; + +- /* Override FP return register with %xmm0 for local functions when +- SSE math is enabled or for functions with sseregparm attribute. */ +- if ((fn || fntype) && (mode == SFmode || mode == DFmode)) +- { +- int sse_level = ix86_function_sseregparm (fntype, fn, false); +- if (sse_level == -1) +- { +- error ("calling %qD with SSE calling convention without " +- "SSE/SSE2 enabled", fn); +- sorry ("this is a GCC bug that can be worked around by adding " +- "attribute used to function called"); +- } +- else if ((sse_level >= 1 && mode == SFmode) +- || (sse_level == 2 && mode == DFmode)) +- regno = FIRST_SSE_REG; +- } ++ if (!TARGET_64BIT_MS_ABI) ++ return 0; ++ for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) ++ if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true)) ++ nregs ++; ++ return nregs; ++} + +- /* OImode shouldn't be used directly. */ +- gcc_assert (mode != OImode); ++/* Given FROM and TO register numbers, say whether this elimination is ++ allowed. If stack alignment is needed, we can only replace argument ++ pointer with hard frame pointer, or replace frame pointer with stack ++ pointer. Otherwise, frame pointer elimination is automatically ++ handled and all other eliminations are valid. */ + +- return gen_rtx_REG (orig_mode, regno); ++static bool ++ix86_can_eliminate (const int from, const int to) ++{ ++ if (stack_realign_fp) ++ return ((from == ARG_POINTER_REGNUM ++ && to == HARD_FRAME_POINTER_REGNUM) ++ || (from == FRAME_POINTER_REGNUM ++ && to == STACK_POINTER_REGNUM)); ++ else ++ return to == STACK_POINTER_REGNUM ? 
!frame_pointer_needed : true; + } + +-static rtx +-function_value_64 (machine_mode orig_mode, machine_mode mode, +- const_tree valtype) ++/* Return the offset between two registers, one to be eliminated, and the other ++ its replacement, at the start of a routine. */ ++ ++HOST_WIDE_INT ++ix86_initial_elimination_offset (int from, int to) + { +- rtx ret; ++ struct ix86_frame &frame = cfun->machine->frame; + +- /* Handle libcalls, which don't provide a type node. */ +- if (valtype == NULL) ++ if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM) ++ return frame.hard_frame_pointer_offset; ++ else if (from == FRAME_POINTER_REGNUM ++ && to == HARD_FRAME_POINTER_REGNUM) ++ return frame.hard_frame_pointer_offset - frame.frame_pointer_offset; ++ else + { +- unsigned int regno; ++ gcc_assert (to == STACK_POINTER_REGNUM); + +- switch (mode) +- { +- case E_SFmode: +- case E_SCmode: +- case E_DFmode: +- case E_DCmode: +- case E_TFmode: +- case E_SDmode: +- case E_DDmode: +- case E_TDmode: +- regno = FIRST_SSE_REG; +- break; +- case E_XFmode: +- case E_XCmode: +- regno = FIRST_FLOAT_REG; +- break; +- case E_TCmode: +- return NULL; +- default: +- regno = AX_REG; +- } ++ if (from == ARG_POINTER_REGNUM) ++ return frame.stack_pointer_offset; + +- return gen_rtx_REG (mode, regno); +- } +- else if (POINTER_TYPE_P (valtype)) +- { +- /* Pointers are always returned in word_mode. */ +- mode = word_mode; ++ gcc_assert (from == FRAME_POINTER_REGNUM); ++ return frame.stack_pointer_offset - frame.frame_pointer_offset; + } +- +- ret = construct_container (mode, orig_mode, valtype, 1, +- X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX, +- x86_64_int_return_registers, 0); +- +- /* For zero sized structures, construct_container returns NULL, but we +- need to keep rest of compiler happy by returning meaningful value. */ +- if (!ret) +- ret = gen_rtx_REG (orig_mode, AX_REG); +- +- return ret; + } + ++/* In a dynamically-aligned function, we can't know the offset from ++ stack pointer to frame pointer, so we must ensure that setjmp ++ eliminates fp against the hard fp (%ebp) rather than trying to ++ index from %esp up to the top of the frame across a gap that is ++ of unknown (at compile-time) size. */ + static rtx +-function_value_ms_32 (machine_mode orig_mode, machine_mode mode, +- const_tree fntype, const_tree fn, const_tree valtype) ++ix86_builtin_setjmp_frame_value (void) + { +- unsigned int regno; +- +- /* Floating point return values in %st(0) +- (unless -mno-fp-ret-in-387 or aggregate type of up to 8 bytes). */ +- if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387 +- && (GET_MODE_SIZE (mode) > 8 +- || valtype == NULL_TREE || !AGGREGATE_TYPE_P (valtype))) +- { +- regno = FIRST_FLOAT_REG; +- return gen_rtx_REG (orig_mode, regno); +- } +- else +- return function_value_32(orig_mode, mode, fntype,fn); ++ return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx; + } + +-static rtx +-function_value_ms_64 (machine_mode orig_mode, machine_mode mode, +- const_tree valtype) ++/* Emits a warning for unsupported msabi to sysv pro/epilogues. 
*/ ++void warn_once_call_ms2sysv_xlogues (const char *feature) + { +- unsigned int regno = AX_REG; +- +- if (TARGET_SSE) ++ static bool warned_once = false; ++ if (!warned_once) + { +- switch (GET_MODE_SIZE (mode)) +- { +- case 16: +- if (valtype != NULL_TREE +- && !VECTOR_INTEGER_TYPE_P (valtype) +- && !VECTOR_INTEGER_TYPE_P (valtype) +- && !INTEGRAL_TYPE_P (valtype) +- && !VECTOR_FLOAT_TYPE_P (valtype)) +- break; +- if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode)) +- && !COMPLEX_MODE_P (mode)) +- regno = FIRST_SSE_REG; +- break; +- case 8: +- case 4: +- if (valtype != NULL_TREE && AGGREGATE_TYPE_P (valtype)) +- break; +- if (mode == SFmode || mode == DFmode) +- regno = FIRST_SSE_REG; +- break; +- default: +- break; +- } ++ warning (0, "%<-mcall-ms2sysv-xlogues%> is not compatible with %s", ++ feature); ++ warned_once = true; + } +- return gen_rtx_REG (orig_mode, regno); + } + +-static rtx +-ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl, +- machine_mode orig_mode, machine_mode mode) +-{ +- const_tree fn, fntype; ++/* Return the probing interval for -fstack-clash-protection. */ + +- fn = NULL_TREE; +- if (fntype_or_decl && DECL_P (fntype_or_decl)) +- fn = fntype_or_decl; +- fntype = fn ? TREE_TYPE (fn) : fntype_or_decl; +- +- if (ix86_function_type_abi (fntype) == MS_ABI) +- { +- if (TARGET_64BIT) +- return function_value_ms_64 (orig_mode, mode, valtype); +- else +- return function_value_ms_32 (orig_mode, mode, fntype, fn, valtype); +- } +- else if (TARGET_64BIT) +- return function_value_64 (orig_mode, mode, valtype); ++static HOST_WIDE_INT ++get_probe_interval (void) ++{ ++ if (flag_stack_clash_protection) ++ return (HOST_WIDE_INT_1U ++ << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL)); + else +- return function_value_32 (orig_mode, mode, fntype, fn); ++ return (HOST_WIDE_INT_1U << STACK_CHECK_PROBE_INTERVAL_EXP); + } + +-static rtx +-ix86_function_value (const_tree valtype, const_tree fntype_or_decl, bool) +-{ +- machine_mode mode, orig_mode; ++/* When using -fsplit-stack, the allocation routines set a field in ++ the TCB to the bottom of the stack plus this much space, measured ++ in bytes. */ + +- orig_mode = TYPE_MODE (valtype); +- mode = type_natural_mode (valtype, NULL, true); +- return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode); +-} ++#define SPLIT_STACK_AVAILABLE 256 + +-/* Pointer function arguments and return values are promoted to +- word_mode for normal functions. */ ++/* Fill structure ix86_frame about frame of currently computed function. */ + +-static machine_mode +-ix86_promote_function_mode (const_tree type, machine_mode mode, +- int *punsignedp, const_tree fntype, +- int for_return) ++static void ++ix86_compute_frame_layout (void) + { +- if (cfun->machine->func_type == TYPE_NORMAL +- && type != NULL_TREE +- && POINTER_TYPE_P (type)) +- { +- *punsignedp = POINTERS_EXTEND_UNSIGNED; +- return word_mode; +- } +- return default_promote_function_mode (type, mode, punsignedp, fntype, +- for_return); +-} +- +-/* Return true if a structure, union or array with MODE containing FIELD +- should be accessed using BLKmode. */ +- +-static bool +-ix86_member_type_forces_blk (const_tree field, machine_mode mode) +-{ +- /* Union with XFmode must be in BLKmode. 
*/ +- return (mode == XFmode +- && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE +- || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE)); +-} +- +-rtx +-ix86_libcall_value (machine_mode mode) +-{ +- return ix86_function_value_1 (NULL, NULL, mode, mode); +-} +- +-/* Return true iff type is returned in memory. */ +- +-static bool +-ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED) +-{ +-#ifdef SUBTARGET_RETURN_IN_MEMORY +- return SUBTARGET_RETURN_IN_MEMORY (type, fntype); +-#else +- const machine_mode mode = type_natural_mode (type, NULL, true); +- HOST_WIDE_INT size; ++ struct ix86_frame *frame = &cfun->machine->frame; ++ struct machine_function *m = cfun->machine; ++ unsigned HOST_WIDE_INT stack_alignment_needed; ++ HOST_WIDE_INT offset; ++ unsigned HOST_WIDE_INT preferred_alignment; ++ HOST_WIDE_INT size = get_frame_size (); ++ HOST_WIDE_INT to_allocate; + +- if (TARGET_64BIT) ++ /* m->call_ms2sysv is initially enabled in ix86_expand_call for all 64-bit ++ * ms_abi functions that call a sysv function. We now need to prune away ++ * cases where it should be disabled. */ ++ if (TARGET_64BIT && m->call_ms2sysv) + { +- if (ix86_function_type_abi (fntype) == MS_ABI) +- { +- size = int_size_in_bytes (type); ++ gcc_assert (TARGET_64BIT_MS_ABI); ++ gcc_assert (TARGET_CALL_MS2SYSV_XLOGUES); ++ gcc_assert (!TARGET_SEH); ++ gcc_assert (TARGET_SSE); ++ gcc_assert (!ix86_using_red_zone ()); + +- /* __m128 is returned in xmm0. */ +- if ((!type || VECTOR_INTEGER_TYPE_P (type) +- || INTEGRAL_TYPE_P (type) +- || VECTOR_FLOAT_TYPE_P (type)) +- && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode)) +- && !COMPLEX_MODE_P (mode) +- && (GET_MODE_SIZE (mode) == 16 || size == 16)) +- return false; ++ if (crtl->calls_eh_return) ++ { ++ gcc_assert (!reload_completed); ++ m->call_ms2sysv = false; ++ warn_once_call_ms2sysv_xlogues ("__builtin_eh_return"); ++ } + +- /* Otherwise, the size must be exactly in [1248]. */ +- return size != 1 && size != 2 && size != 4 && size != 8; ++ else if (ix86_static_chain_on_stack) ++ { ++ gcc_assert (!reload_completed); ++ m->call_ms2sysv = false; ++ warn_once_call_ms2sysv_xlogues ("static call chains"); + } ++ ++ /* Finally, compute which registers the stub will manage. */ + else + { +- int needed_intregs, needed_sseregs; +- +- return examine_argument (mode, type, 1, +- &needed_intregs, &needed_sseregs); ++ unsigned count = xlogue_layout::count_stub_managed_regs (); ++ m->call_ms2sysv_extra_regs = count - xlogue_layout::MIN_REGS; ++ m->call_ms2sysv_pad_in = 0; + } + } +- else +- { +- size = int_size_in_bytes (type); +- +- /* Intel MCU psABI returns scalars and aggregates no larger than 8 +- bytes in registers. */ +- if (TARGET_IAMCU) +- return VECTOR_MODE_P (mode) || size < 0 || size > 8; +- +- if (mode == BLKmode) +- return true; +- +- if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8) +- return false; + +- if (VECTOR_MODE_P (mode) || mode == TImode) +- { +- /* User-created vectors small enough to fit in EAX. */ +- if (size < 8) +- return false; ++ frame->nregs = ix86_nsaved_regs (); ++ frame->nsseregs = ix86_nsaved_sseregs (); + +- /* Unless ABI prescibes otherwise, +- MMX/3dNow values are returned in MM0 if available. */ +- +- if (size == 8) +- return TARGET_VECT8_RETURNS || !TARGET_MMX; ++ /* 64-bit MS ABI seem to require stack alignment to be always 16, ++ except for function prologues, leaf functions and when the defult ++ incoming stack boundary is overriden at command line or via ++ force_align_arg_pointer attribute. 
+ +- /* SSE values are returned in XMM0 if available. */ +- if (size == 16) +- return !TARGET_SSE; ++ Darwin's ABI specifies 128b alignment for both 32 and 64 bit variants ++ at call sites, including profile function calls. ++ */ ++ if (((TARGET_64BIT_MS_ABI || TARGET_MACHO) ++ && crtl->preferred_stack_boundary < 128) ++ && (!crtl->is_leaf || cfun->calls_alloca != 0 ++ || ix86_current_function_calls_tls_descriptor ++ || (TARGET_MACHO && crtl->profile) ++ || ix86_incoming_stack_boundary < 128)) ++ { ++ crtl->preferred_stack_boundary = 128; ++ crtl->stack_alignment_needed = 128; ++ } + +- /* AVX values are returned in YMM0 if available. */ +- if (size == 32) +- return !TARGET_AVX; ++ stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT; ++ preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT; + +- /* AVX512F values are returned in ZMM0 if available. */ +- if (size == 64) +- return !TARGET_AVX512F; +- } ++ gcc_assert (!size || stack_alignment_needed); ++ gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT); ++ gcc_assert (preferred_alignment <= stack_alignment_needed); + +- if (mode == XFmode) +- return false; ++ /* The only ABI saving SSE regs should be 64-bit ms_abi. */ ++ gcc_assert (TARGET_64BIT || !frame->nsseregs); ++ if (TARGET_64BIT && m->call_ms2sysv) ++ { ++ gcc_assert (stack_alignment_needed >= 16); ++ gcc_assert (!frame->nsseregs); ++ } + +- if (size > 12) +- return true; ++ /* For SEH we have to limit the amount of code movement into the prologue. ++ At present we do this via a BLOCKAGE, at which point there's very little ++ scheduling that can be done, which means that there's very little point ++ in doing anything except PUSHs. */ ++ if (TARGET_SEH) ++ m->use_fast_prologue_epilogue = false; ++ else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR_FOR_FN (cfun))) ++ { ++ int count = frame->nregs; ++ struct cgraph_node *node = cgraph_node::get (current_function_decl); + +- /* OImode shouldn't be used directly. */ +- gcc_assert (mode != OImode); ++ /* The fast prologue uses move instead of push to save registers. This ++ is significantly longer, but also executes faster as modern hardware ++ can execute the moves in parallel, but can't do that for push/pop. + +- return false; ++ Be careful about choosing what prologue to emit: When function takes ++ many instructions to execute we may use slow version as well as in ++ case function is known to be outside hot spot (this is known with ++ feedback only). Weight the size of function by number of registers ++ to save as it is cheap to use one or two push instructions but very ++ slow to use many of them. */ ++ if (count) ++ count = (count - 1) * FAST_PROLOGUE_INSN_COUNT; ++ if (node->frequency < NODE_FREQUENCY_NORMAL ++ || (flag_branch_probabilities ++ && node->frequency < NODE_FREQUENCY_HOT)) ++ m->use_fast_prologue_epilogue = false; ++ else ++ m->use_fast_prologue_epilogue ++ = !expensive_function_p (count); + } +-#endif +-} + +- +-/* Create the va_list data type. */ ++ frame->save_regs_using_mov ++ = (TARGET_PROLOGUE_USING_MOVE && m->use_fast_prologue_epilogue ++ /* If static stack checking is enabled and done with probes, ++ the registers need to be saved before allocating the frame. */ ++ && flag_stack_check != STATIC_BUILTIN_STACK_CHECK); + +-static tree +-ix86_build_builtin_va_list_64 (void) +-{ +- tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl; ++ /* Skip return address and error code in exception handler. 
*/ ++ offset = INCOMING_FRAME_SP_OFFSET; + +- record = lang_hooks.types.make_type (RECORD_TYPE); +- type_decl = build_decl (BUILTINS_LOCATION, +- TYPE_DECL, get_identifier ("__va_list_tag"), record); ++ /* Skip pushed static chain. */ ++ if (ix86_static_chain_on_stack) ++ offset += UNITS_PER_WORD; + +- f_gpr = build_decl (BUILTINS_LOCATION, +- FIELD_DECL, get_identifier ("gp_offset"), +- unsigned_type_node); +- f_fpr = build_decl (BUILTINS_LOCATION, +- FIELD_DECL, get_identifier ("fp_offset"), +- unsigned_type_node); +- f_ovf = build_decl (BUILTINS_LOCATION, +- FIELD_DECL, get_identifier ("overflow_arg_area"), +- ptr_type_node); +- f_sav = build_decl (BUILTINS_LOCATION, +- FIELD_DECL, get_identifier ("reg_save_area"), +- ptr_type_node); ++ /* Skip saved base pointer. */ ++ if (frame_pointer_needed) ++ offset += UNITS_PER_WORD; ++ frame->hfp_save_offset = offset; + +- va_list_gpr_counter_field = f_gpr; +- va_list_fpr_counter_field = f_fpr; ++ /* The traditional frame pointer location is at the top of the frame. */ ++ frame->hard_frame_pointer_offset = offset; + +- DECL_FIELD_CONTEXT (f_gpr) = record; +- DECL_FIELD_CONTEXT (f_fpr) = record; +- DECL_FIELD_CONTEXT (f_ovf) = record; +- DECL_FIELD_CONTEXT (f_sav) = record; ++ /* Register save area */ ++ offset += frame->nregs * UNITS_PER_WORD; ++ frame->reg_save_offset = offset; + +- TYPE_STUB_DECL (record) = type_decl; +- TYPE_NAME (record) = type_decl; +- TYPE_FIELDS (record) = f_gpr; +- DECL_CHAIN (f_gpr) = f_fpr; +- DECL_CHAIN (f_fpr) = f_ovf; +- DECL_CHAIN (f_ovf) = f_sav; ++ /* On SEH target, registers are pushed just before the frame pointer ++ location. */ ++ if (TARGET_SEH) ++ frame->hard_frame_pointer_offset = offset; + +- layout_type (record); ++ /* Calculate the size of the va-arg area (not including padding, if any). */ ++ frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size; + +- TYPE_ATTRIBUTES (record) = tree_cons (get_identifier ("sysv_abi va_list"), +- NULL_TREE, TYPE_ATTRIBUTES (record)); ++ /* Also adjust stack_realign_offset for the largest alignment of ++ stack slot actually used. */ ++ if (stack_realign_fp ++ || (cfun->machine->max_used_stack_alignment != 0 ++ && (offset % cfun->machine->max_used_stack_alignment) != 0)) ++ { ++ /* We may need a 16-byte aligned stack for the remainder of the ++ register save area, but the stack frame for the local function ++ may require a greater alignment if using AVX/2/512. In order ++ to avoid wasting space, we first calculate the space needed for ++ the rest of the register saves, add that to the stack pointer, ++ and then realign the stack to the boundary of the start of the ++ frame for the local function. */ ++ HOST_WIDE_INT space_needed = 0; ++ HOST_WIDE_INT sse_reg_space_needed = 0; + +- /* The correct type is an array type of one element. */ +- return build_array_type (record, build_index_type (size_zero_node)); +-} ++ if (TARGET_64BIT) ++ { ++ if (m->call_ms2sysv) ++ { ++ m->call_ms2sysv_pad_in = 0; ++ space_needed = xlogue_layout::get_instance ().get_stack_space_used (); ++ } + +-/* Setup the builtin va_list data type and for 64-bit the additional +- calling convention specific va_list data types. */ ++ else if (frame->nsseregs) ++ /* The only ABI that has saved SSE registers (Win64) also has a ++ 16-byte aligned default stack. However, many programs violate ++ the ABI, and Wine64 forces stack realignment to compensate. 
*/ ++ space_needed = frame->nsseregs * 16; + +-static tree +-ix86_build_builtin_va_list (void) +-{ +- if (TARGET_64BIT) +- { +- /* Initialize ABI specific va_list builtin types. ++ sse_reg_space_needed = space_needed = ROUND_UP (space_needed, 16); + +- In lto1, we can encounter two va_list types: +- - one as a result of the type-merge across TUs, and +- - the one constructed here. +- These two types will not have the same TYPE_MAIN_VARIANT, and therefore +- a type identity check in canonical_va_list_type based on +- TYPE_MAIN_VARIANT (which we used to have) will not work. +- Instead, we tag each va_list_type_node with its unique attribute, and +- look for the attribute in the type identity check in +- canonical_va_list_type. ++ /* 64-bit frame->va_arg_size should always be a multiple of 16, but ++ rounding to be pedantic. */ ++ space_needed = ROUND_UP (space_needed + frame->va_arg_size, 16); ++ } ++ else ++ space_needed = frame->va_arg_size; + +- Tagging sysv_va_list_type_node directly with the attribute is +- problematic since it's a array of one record, which will degrade into a +- pointer to record when used as parameter (see build_va_arg comments for +- an example), dropping the attribute in the process. So we tag the +- record instead. */ ++ /* Record the allocation size required prior to the realignment AND. */ ++ frame->stack_realign_allocate = space_needed; + +- /* For SYSV_ABI we use an array of one record. */ +- sysv_va_list_type_node = ix86_build_builtin_va_list_64 (); +- +- /* For MS_ABI we use plain pointer to argument area. */ +- tree char_ptr_type = build_pointer_type (char_type_node); +- tree attr = tree_cons (get_identifier ("ms_abi va_list"), NULL_TREE, +- TYPE_ATTRIBUTES (char_ptr_type)); +- ms_va_list_type_node = build_type_attribute_variant (char_ptr_type, attr); ++ /* The re-aligned stack starts at frame->stack_realign_offset. Values ++ before this point are not directly comparable with values below ++ this point. Use sp_valid_at to determine if the stack pointer is ++ valid for a given offset, fp_valid_at for the frame pointer, or ++ choose_baseaddr to have a base register chosen for you. + +- return ((ix86_abi == MS_ABI) +- ? ms_va_list_type_node +- : sysv_va_list_type_node); ++ Note that the result of (frame->stack_realign_offset ++ & (stack_alignment_needed - 1)) may not equal zero. */ ++ offset = ROUND_UP (offset + space_needed, stack_alignment_needed); ++ frame->stack_realign_offset = offset - space_needed; ++ frame->sse_reg_save_offset = frame->stack_realign_offset ++ + sse_reg_space_needed; + } + else + { +- /* For i386 we use plain pointer to argument area. */ +- return build_pointer_type (char_type_node); ++ frame->stack_realign_offset = offset; ++ ++ if (TARGET_64BIT && m->call_ms2sysv) ++ { ++ m->call_ms2sysv_pad_in = !!(offset & UNITS_PER_WORD); ++ offset += xlogue_layout::get_instance ().get_stack_space_used (); ++ } ++ ++ /* Align and set SSE register save area. */ ++ else if (frame->nsseregs) ++ { ++ /* If the incoming stack boundary is at least 16 bytes, or DRAP is ++ required and the DRAP re-alignment boundary is at least 16 bytes, ++ then we want the SSE register save area properly aligned. */ ++ if (ix86_incoming_stack_boundary >= 128 ++ || (stack_realign_drap && stack_alignment_needed >= 16)) ++ offset = ROUND_UP (offset, 16); ++ offset += frame->nsseregs * 16; ++ } ++ frame->sse_reg_save_offset = offset; ++ offset += frame->va_arg_size; + } +-} + +-/* Worker function for TARGET_SETUP_INCOMING_VARARGS. 
*/ ++ /* Align start of frame for local function. When a function call ++ is removed, it may become a leaf function. But if argument may ++ be passed on stack, we need to align the stack when there is no ++ tail call. */ ++ if (m->call_ms2sysv ++ || frame->va_arg_size != 0 ++ || size != 0 ++ || !crtl->is_leaf ++ || (!crtl->tail_call_emit ++ && cfun->machine->outgoing_args_on_stack) ++ || cfun->calls_alloca ++ || ix86_current_function_calls_tls_descriptor) ++ offset = ROUND_UP (offset, stack_alignment_needed); + +-static void +-setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum) +-{ +- rtx save_area, mem; +- alias_set_type set; +- int i, max; ++ /* Frame pointer points here. */ ++ frame->frame_pointer_offset = offset; + +- /* GPR size of varargs save area. */ +- if (cfun->va_list_gpr_size) +- ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD; +- else +- ix86_varargs_gpr_size = 0; ++ offset += size; + +- /* FPR size of varargs save area. We don't need it if we don't pass +- anything in SSE registers. */ +- if (TARGET_SSE && cfun->va_list_fpr_size) +- ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16; ++ /* Add outgoing arguments area. Can be skipped if we eliminated ++ all the function calls as dead code. ++ Skipping is however impossible when function calls alloca. Alloca ++ expander assumes that last crtl->outgoing_args_size ++ of stack frame are unused. */ ++ if (ACCUMULATE_OUTGOING_ARGS ++ && (!crtl->is_leaf || cfun->calls_alloca ++ || ix86_current_function_calls_tls_descriptor)) ++ { ++ offset += crtl->outgoing_args_size; ++ frame->outgoing_arguments_size = crtl->outgoing_args_size; ++ } + else +- ix86_varargs_fpr_size = 0; ++ frame->outgoing_arguments_size = 0; + +- if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size) +- return; ++ /* Align stack boundary. Only needed if we're calling another function ++ or using alloca. */ ++ if (!crtl->is_leaf || cfun->calls_alloca ++ || ix86_current_function_calls_tls_descriptor) ++ offset = ROUND_UP (offset, preferred_alignment); + +- save_area = frame_pointer_rtx; +- set = get_varargs_alias_set (); ++ /* We've reached end of stack frame. */ ++ frame->stack_pointer_offset = offset; + +- max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD; +- if (max > X86_64_REGPARM_MAX) +- max = X86_64_REGPARM_MAX; ++ /* Size prologue needs to allocate. */ ++ to_allocate = offset - frame->sse_reg_save_offset; + +- for (i = cum->regno; i < max; i++) ++ if ((!to_allocate && frame->nregs <= 1) ++ || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000)) ++ /* If stack clash probing needs a loop, then it needs a ++ scratch register. But the returned register is only guaranteed ++ to be safe to use after register saves are complete. So if ++ stack clash protections are enabled and the allocated frame is ++ larger than the probe interval, then use pushes to save ++ callee saved registers. 
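Among the lines touched above is the classic SysV x86-64 varargs plumbing: ix86_build_builtin_va_list_64 lays out the four-field __va_list_tag record and setup_incoming_varargs_64 spills the unnamed-argument registers into the reg_save_area it points at (6 GPRs of 8 bytes plus 8 SSE registers of 16 bytes). A plain-C approximation of that record, as a sketch only (assumption: LP64 field sizes):

    #include <stdio.h>

    /* Approximation of the __va_list_tag record built by
       ix86_build_builtin_va_list_64 in the hunk above.  */
    struct va_list_tag_sketch
    {
      unsigned int gp_offset;     /* next unused GPR slot in reg_save_area   */
      unsigned int fp_offset;     /* next unused XMM slot in reg_save_area   */
      void *overflow_arg_area;    /* arguments that were passed on the stack */
      void *reg_save_area;        /* 6 * 8 + 8 * 16 = 176 bytes in this sketch */
    };

    int
    main (void)
    {
      printf ("va_list tag sketch: %zu bytes\n",
              sizeof (struct va_list_tag_sketch));   /* 24 on LP64 */
      return 0;
    }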
*/ ++ || (flag_stack_clash_protection && to_allocate > get_probe_interval ())) ++ frame->save_regs_using_mov = false; ++ ++ if (ix86_using_red_zone () ++ && crtl->sp_is_unchanging ++ && crtl->is_leaf ++ && !ix86_pc_thunk_call_expanded ++ && !ix86_current_function_calls_tls_descriptor) + { +- mem = gen_rtx_MEM (word_mode, +- plus_constant (Pmode, save_area, i * UNITS_PER_WORD)); +- MEM_NOTRAP_P (mem) = 1; +- set_mem_alias_set (mem, set); +- emit_move_insn (mem, +- gen_rtx_REG (word_mode, +- x86_64_int_parameter_registers[i])); ++ frame->red_zone_size = to_allocate; ++ if (frame->save_regs_using_mov) ++ frame->red_zone_size += frame->nregs * UNITS_PER_WORD; ++ if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE) ++ frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE; + } ++ else ++ frame->red_zone_size = 0; ++ frame->stack_pointer_offset -= frame->red_zone_size; + +- if (ix86_varargs_fpr_size) ++ /* The SEH frame pointer location is near the bottom of the frame. ++ This is enforced by the fact that the difference between the ++ stack pointer and the frame pointer is limited to 240 bytes in ++ the unwind data structure. */ ++ if (TARGET_SEH) + { +- machine_mode smode; +- rtx_code_label *label; +- rtx test; +- +- /* Now emit code to save SSE registers. The AX parameter contains number +- of SSE parameter registers used to call this function, though all we +- actually check here is the zero/non-zero status. */ +- +- label = gen_label_rtx (); +- test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx); +- emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1), +- label)); +- +- /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if +- we used movdqa (i.e. TImode) instead? Perhaps even better would +- be if we could determine the real mode of the data, via a hook +- into pass_stdarg. Ignore all that for now. */ +- smode = V4SFmode; +- if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode)) +- crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode); +- +- max = cum->sse_regno + cfun->va_list_fpr_size / 16; +- if (max > X86_64_SSE_REGPARM_MAX) +- max = X86_64_SSE_REGPARM_MAX; ++ HOST_WIDE_INT diff; + +- for (i = cum->sse_regno; i < max; ++i) ++ /* If we can leave the frame pointer where it is, do so. Also, returns ++ the establisher frame for __builtin_frame_address (0). */ ++ diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset; ++ if (diff <= SEH_MAX_FRAME_SIZE ++ && (diff > 240 || (diff & 15) != 0) ++ && !crtl->accesses_prior_frames) + { +- mem = plus_constant (Pmode, save_area, +- i * 16 + ix86_varargs_gpr_size); +- mem = gen_rtx_MEM (smode, mem); +- MEM_NOTRAP_P (mem) = 1; +- set_mem_alias_set (mem, set); +- set_mem_align (mem, GET_MODE_ALIGNMENT (smode)); +- +- emit_move_insn (mem, gen_rtx_REG (smode, GET_SSE_REGNO (i))); ++ /* Ideally we'd determine what portion of the local stack frame ++ (within the constraint of the lowest 240) is most heavily used. ++ But without that complication, simply bias the frame pointer ++ by 128 bytes so as to maximize the amount of the local stack ++ frame that is addressable with 8-bit offsets. 
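The frame-layout code above accumulates a single running offset from the CFA through the saved frame pointer, the GPR save area, the local frame, and the outgoing-argument area. A worked example with hypothetical values, as a sketch that simplifies away the SSE/va_arg areas, SEH, and the realignment path, and treats the preferred and needed alignments as both 16:

    #include <stdio.h>

    #define ROUND_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))

    /* Hypothetical 64-bit SysV function that makes further calls, mirroring
       the offset accumulation in ix86_compute_frame_layout above.  Offsets
       are measured down from the CFA.  */
    int
    main (void)
    {
      long offset = 8;        /* INCOMING_FRAME_SP_OFFSET: the return address */
      int frame_pointer_needed = 1;
      int nregs = 2;          /* call-saved GPRs to preserve, e.g. %rbx, %r12 */
      long size = 40;         /* get_frame_size (): local variables           */
      long align = 16;

      if (frame_pointer_needed)
        offset += 8;                          /* pushed %rbp                  */
      long hard_frame_pointer_offset = offset;

      offset += nregs * 8;                    /* GPR save area                */
      long reg_save_offset = offset;

      offset = ROUND_UP (offset, align);      /* align start of local frame   */
      long frame_pointer_offset = offset;

      offset += size;                         /* locals                       */
      offset = ROUND_UP (offset, align);      /* align outgoing stack boundary */
      long stack_pointer_offset = offset;

      printf ("hard FP saved at CFA-%ld, GPR save area ends at CFA-%ld,\n"
              "locals start at CFA-%ld, SP after prologue at CFA-%ld\n",
              hard_frame_pointer_offset, reg_save_offset,
              frame_pointer_offset, stack_pointer_offset);

      /* The values returned by ix86_initial_elimination_offset fall out of
         the same numbers, e.g. ARG_POINTER -> STACK_POINTER is
         stack_pointer_offset (80 here) and ARG_POINTER -> HARD_FRAME_POINTER
         is hard_frame_pointer_offset (16 here).  */
      return 0;
    }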
*/ ++ frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128; + } +- +- emit_label (label); + } + } + +-static void +-setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum) +-{ +- alias_set_type set = get_varargs_alias_set (); +- int i; ++/* This is semi-inlined memory_address_length, but simplified ++ since we know that we're always dealing with reg+offset, and ++ to avoid having to create and discard all that rtl. */ + +- /* Reset to zero, as there might be a sysv vaarg used +- before. */ +- ix86_varargs_gpr_size = 0; +- ix86_varargs_fpr_size = 0; ++static inline int ++choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset) ++{ ++ int len = 4; + +- for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++) ++ if (offset == 0) + { +- rtx reg, mem; ++ /* EBP and R13 cannot be encoded without an offset. */ ++ len = (regno == BP_REG || regno == R13_REG); ++ } ++ else if (IN_RANGE (offset, -128, 127)) ++ len = 1; + +- mem = gen_rtx_MEM (Pmode, +- plus_constant (Pmode, virtual_incoming_args_rtx, +- i * UNITS_PER_WORD)); +- MEM_NOTRAP_P (mem) = 1; +- set_mem_alias_set (mem, set); ++ /* ESP and R12 must be encoded with a SIB byte. */ ++ if (regno == SP_REG || regno == R12_REG) ++ len++; + +- reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]); +- emit_move_insn (mem, reg); ++ return len; ++} ++ ++/* Determine if the stack pointer is valid for accessing the CFA_OFFSET in ++ the frame save area. The register is saved at CFA - CFA_OFFSET. */ ++ ++static bool ++sp_valid_at (HOST_WIDE_INT cfa_offset) ++{ ++ const struct machine_frame_state &fs = cfun->machine->fs; ++ if (fs.sp_realigned && cfa_offset <= fs.sp_realigned_offset) ++ { ++ /* Validate that the cfa_offset isn't in a "no-man's land". */ ++ gcc_assert (cfa_offset <= fs.sp_realigned_fp_last); ++ return false; ++ } ++ return fs.sp_valid; ++} ++ ++/* Determine if the frame pointer is valid for accessing the CFA_OFFSET in ++ the frame save area. The register is saved at CFA - CFA_OFFSET. */ ++ ++static inline bool ++fp_valid_at (HOST_WIDE_INT cfa_offset) ++{ ++ const struct machine_frame_state &fs = cfun->machine->fs; ++ if (fs.sp_realigned && cfa_offset > fs.sp_realigned_fp_last) ++ { ++ /* Validate that the cfa_offset isn't in a "no-man's land". */ ++ gcc_assert (cfa_offset >= fs.sp_realigned_offset); ++ return false; + } ++ return fs.fp_valid; + } + ++/* Choose a base register based upon alignment requested, speed and/or ++ size. */ ++ + static void +-ix86_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode, +- tree type, int *, int no_rtl) ++choose_basereg (HOST_WIDE_INT cfa_offset, rtx &base_reg, ++ HOST_WIDE_INT &base_offset, ++ unsigned int align_reqested, unsigned int *align) + { +- CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); +- CUMULATIVE_ARGS next_cum; +- tree fntype; ++ const struct machine_function *m = cfun->machine; ++ unsigned int hfp_align; ++ unsigned int drap_align; ++ unsigned int sp_align; ++ bool hfp_ok = fp_valid_at (cfa_offset); ++ bool drap_ok = m->fs.drap_valid; ++ bool sp_ok = sp_valid_at (cfa_offset); + +- /* This argument doesn't appear to be used anymore. Which is good, +- because the old code here didn't suppress rtl generation. */ +- gcc_assert (!no_rtl); ++ hfp_align = drap_align = sp_align = INCOMING_STACK_BOUNDARY; + +- if (!TARGET_64BIT) +- return; ++ /* Filter out any registers that don't meet the requested alignment ++ criteria. 
*/ ++ if (align_reqested) ++ { ++ if (m->fs.realigned) ++ hfp_align = drap_align = sp_align = crtl->stack_alignment_needed; ++ /* SEH unwind code does do not currently support REG_CFA_EXPRESSION ++ notes (which we would need to use a realigned stack pointer), ++ so disable on SEH targets. */ ++ else if (m->fs.sp_realigned) ++ sp_align = crtl->stack_alignment_needed; + +- fntype = TREE_TYPE (current_function_decl); ++ hfp_ok = hfp_ok && hfp_align >= align_reqested; ++ drap_ok = drap_ok && drap_align >= align_reqested; ++ sp_ok = sp_ok && sp_align >= align_reqested; ++ } + +- /* For varargs, we do not want to skip the dummy va_dcl argument. +- For stdargs, we do want to skip the last named argument. */ +- next_cum = *cum; +- if (stdarg_p (fntype)) +- ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type, +- true); ++ if (m->use_fast_prologue_epilogue) ++ { ++ /* Choose the base register most likely to allow the most scheduling ++ opportunities. Generally FP is valid throughout the function, ++ while DRAP must be reloaded within the epilogue. But choose either ++ over the SP due to increased encoding size. */ + +- if (cum->call_abi == MS_ABI) +- setup_incoming_varargs_ms_64 (&next_cum); ++ if (hfp_ok) ++ { ++ base_reg = hard_frame_pointer_rtx; ++ base_offset = m->fs.fp_offset - cfa_offset; ++ } ++ else if (drap_ok) ++ { ++ base_reg = crtl->drap_reg; ++ base_offset = 0 - cfa_offset; ++ } ++ else if (sp_ok) ++ { ++ base_reg = stack_pointer_rtx; ++ base_offset = m->fs.sp_offset - cfa_offset; ++ } ++ } + else +- setup_incoming_varargs_64 (&next_cum); ++ { ++ HOST_WIDE_INT toffset; ++ int len = 16, tlen; ++ ++ /* Choose the base register with the smallest address encoding. ++ With a tie, choose FP > DRAP > SP. */ ++ if (sp_ok) ++ { ++ base_reg = stack_pointer_rtx; ++ base_offset = m->fs.sp_offset - cfa_offset; ++ len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset); ++ } ++ if (drap_ok) ++ { ++ toffset = 0 - cfa_offset; ++ tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset); ++ if (tlen <= len) ++ { ++ base_reg = crtl->drap_reg; ++ base_offset = toffset; ++ len = tlen; ++ } ++ } ++ if (hfp_ok) ++ { ++ toffset = m->fs.fp_offset - cfa_offset; ++ tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset); ++ if (tlen <= len) ++ { ++ base_reg = hard_frame_pointer_rtx; ++ base_offset = toffset; ++ len = tlen; ++ } ++ } ++ } ++ ++ /* Set the align return value. */ ++ if (align) ++ { ++ if (base_reg == stack_pointer_rtx) ++ *align = sp_align; ++ else if (base_reg == crtl->drap_reg) ++ *align = drap_align; ++ else if (base_reg == hard_frame_pointer_rtx) ++ *align = hfp_align; ++ } + } + +-static void +-ix86_setup_incoming_vararg_bounds (cumulative_args_t cum_v, +- machine_mode mode, +- tree type, +- int *pretend_size ATTRIBUTE_UNUSED, +- int no_rtl) ++/* Return an RTX that points to CFA_OFFSET within the stack frame and ++ the alignment of address. If ALIGN is non-null, it should point to ++ an alignment value (in bits) that is preferred or zero and will ++ recieve the alignment of the base register that was selected, ++ irrespective of rather or not CFA_OFFSET is a multiple of that ++ alignment value. If it is possible for the base register offset to be ++ non-immediate then SCRATCH_REGNO should specify a scratch register to ++ use. ++ ++ The valid base registers are taken from CFUN->MACHINE->FS. 
*/ ++ ++static rtx ++choose_baseaddr (HOST_WIDE_INT cfa_offset, unsigned int *align, ++ unsigned int scratch_regno = INVALID_REGNUM) + { +- CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); +- CUMULATIVE_ARGS next_cum; +- tree fntype; ++ rtx base_reg = NULL; ++ HOST_WIDE_INT base_offset = 0; + +- gcc_assert (!no_rtl); ++ /* If a specific alignment is requested, try to get a base register ++ with that alignment first. */ ++ if (align && *align) ++ choose_basereg (cfa_offset, base_reg, base_offset, *align, align); + +- /* Do nothing if we use plain pointer to argument area. */ +- if (!TARGET_64BIT || cum->call_abi == MS_ABI) +- return; ++ if (!base_reg) ++ choose_basereg (cfa_offset, base_reg, base_offset, 0, align); + +- fntype = TREE_TYPE (current_function_decl); ++ gcc_assert (base_reg != NULL); + +- /* For varargs, we do not want to skip the dummy va_dcl argument. +- For stdargs, we do want to skip the last named argument. */ +- next_cum = *cum; +- if (stdarg_p (fntype)) +- ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type, +- true); +-} ++ rtx base_offset_rtx = GEN_INT (base_offset); + ++ if (!x86_64_immediate_operand (base_offset_rtx, Pmode)) ++ { ++ gcc_assert (scratch_regno != INVALID_REGNUM); + +-/* Checks if TYPE is of kind va_list char *. */ ++ rtx scratch_reg = gen_rtx_REG (Pmode, scratch_regno); ++ emit_move_insn (scratch_reg, base_offset_rtx); + +-static bool +-is_va_list_char_pointer (tree type) +-{ +- tree canonic; ++ return gen_rtx_PLUS (Pmode, base_reg, scratch_reg); ++ } + +- /* For 32-bit it is always true. */ +- if (!TARGET_64BIT) +- return true; +- canonic = ix86_canonical_va_list_type (type); +- return (canonic == ms_va_list_type_node +- || (ix86_abi == MS_ABI && canonic == va_list_type_node)); ++ return plus_constant (Pmode, base_reg, base_offset); + } + +-/* Implement va_start. */ ++/* Emit code to save registers in the prologue. */ + + static void +-ix86_va_start (tree valist, rtx nextarg) ++ix86_emit_save_regs (void) + { +- HOST_WIDE_INT words, n_gpr, n_fpr; +- tree f_gpr, f_fpr, f_ovf, f_sav; +- tree gpr, fpr, ovf, sav, t; +- tree type; +- rtx ovf_rtx; +- +- if (flag_split_stack +- && cfun->machine->split_stack_varargs_pointer == NULL_RTX) +- { +- unsigned int scratch_regno; ++ unsigned int regno; ++ rtx_insn *insn; + +- /* When we are splitting the stack, we can't refer to the stack +- arguments using internal_arg_pointer, because they may be on +- the old stack. The split stack prologue will arrange to +- leave a pointer to the old stack arguments in a scratch +- register, which we here copy to a pseudo-register. The split +- stack prologue can't set the pseudo-register directly because +- it (the prologue) runs before any registers have been saved. */ ++ for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; ) ++ if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true)) ++ { ++ insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno))); ++ RTX_FRAME_RELATED_P (insn) = 1; ++ } ++} + +- scratch_regno = split_stack_prologue_scratch_regno (); +- if (scratch_regno != INVALID_REGNUM) +- { +- rtx reg; +- rtx_insn *seq; ++/* Emit a single register save at CFA - CFA_OFFSET. 
*/ + +- reg = gen_reg_rtx (Pmode); +- cfun->machine->split_stack_varargs_pointer = reg; ++static void ++ix86_emit_save_reg_using_mov (machine_mode mode, unsigned int regno, ++ HOST_WIDE_INT cfa_offset) ++{ ++ struct machine_function *m = cfun->machine; ++ rtx reg = gen_rtx_REG (mode, regno); ++ rtx mem, addr, base, insn; ++ unsigned int align = GET_MODE_ALIGNMENT (mode); + +- start_sequence (); +- emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno)); +- seq = get_insns (); +- end_sequence (); ++ addr = choose_baseaddr (cfa_offset, &align); ++ mem = gen_frame_mem (mode, addr); + +- push_topmost_sequence (); +- emit_insn_after (seq, entry_of_function ()); +- pop_topmost_sequence (); +- } +- } ++ /* The location aligment depends upon the base register. */ ++ align = MIN (GET_MODE_ALIGNMENT (mode), align); ++ gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1))); ++ set_mem_align (mem, align); + +- /* Only 64bit target needs something special. */ +- if (is_va_list_char_pointer (TREE_TYPE (valist))) ++ insn = emit_insn (gen_rtx_SET (mem, reg)); ++ RTX_FRAME_RELATED_P (insn) = 1; ++ ++ base = addr; ++ if (GET_CODE (base) == PLUS) ++ base = XEXP (base, 0); ++ gcc_checking_assert (REG_P (base)); ++ ++ /* When saving registers into a re-aligned local stack frame, avoid ++ any tricky guessing by dwarf2out. */ ++ if (m->fs.realigned) + { +- if (cfun->machine->split_stack_varargs_pointer == NULL_RTX) +- std_expand_builtin_va_start (valist, nextarg); ++ gcc_checking_assert (stack_realign_drap); ++ ++ if (regno == REGNO (crtl->drap_reg)) ++ { ++ /* A bit of a hack. We force the DRAP register to be saved in ++ the re-aligned stack frame, which provides us with a copy ++ of the CFA that will last past the prologue. Install it. */ ++ gcc_checking_assert (cfun->machine->fs.fp_valid); ++ addr = plus_constant (Pmode, hard_frame_pointer_rtx, ++ cfun->machine->fs.fp_offset - cfa_offset); ++ mem = gen_rtx_MEM (mode, addr); ++ add_reg_note (insn, REG_CFA_DEF_CFA, mem); ++ } + else + { +- rtx va_r, next; +- +- va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE); +- next = expand_binop (ptr_mode, add_optab, +- cfun->machine->split_stack_varargs_pointer, +- crtl->args.arg_offset_rtx, +- NULL_RTX, 0, OPTAB_LIB_WIDEN); +- convert_move (va_r, next, 0); ++ /* The frame pointer is a stable reference within the ++ aligned frame. Use it. */ ++ gcc_checking_assert (cfun->machine->fs.fp_valid); ++ addr = plus_constant (Pmode, hard_frame_pointer_rtx, ++ cfun->machine->fs.fp_offset - cfa_offset); ++ mem = gen_rtx_MEM (mode, addr); ++ add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg)); + } +- return; + } + +- f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node)); +- f_fpr = DECL_CHAIN (f_gpr); +- f_ovf = DECL_CHAIN (f_fpr); +- f_sav = DECL_CHAIN (f_ovf); +- +- valist = build_simple_mem_ref (valist); +- TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node); +- /* The following should be folded into the MEM_REF offset. */ +- gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist), +- f_gpr, NULL_TREE); +- fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist), +- f_fpr, NULL_TREE); +- ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist), +- f_ovf, NULL_TREE); +- sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist), +- f_sav, NULL_TREE); +- +- /* Count number of gp and fp argument registers used. 
*/ +- words = crtl->args.info.words; +- n_gpr = crtl->args.info.regno; +- n_fpr = crtl->args.info.sse_regno; +- +- if (cfun->va_list_gpr_size) ++ else if (base == stack_pointer_rtx && m->fs.sp_realigned ++ && cfa_offset >= m->fs.sp_realigned_offset) + { +- type = TREE_TYPE (gpr); +- t = build2 (MODIFY_EXPR, type, +- gpr, build_int_cst (type, n_gpr * 8)); +- TREE_SIDE_EFFECTS (t) = 1; +- expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); ++ gcc_checking_assert (stack_realign_fp); ++ add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg)); + } + +- if (TARGET_SSE && cfun->va_list_fpr_size) ++ /* The memory may not be relative to the current CFA register, ++ which means that we may need to generate a new pattern for ++ use by the unwind info. */ ++ else if (base != m->fs.cfa_reg) + { +- type = TREE_TYPE (fpr); +- t = build2 (MODIFY_EXPR, type, fpr, +- build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX)); +- TREE_SIDE_EFFECTS (t) = 1; +- expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); ++ addr = plus_constant (Pmode, m->fs.cfa_reg, ++ m->fs.cfa_offset - cfa_offset); ++ mem = gen_rtx_MEM (mode, addr); ++ add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (mem, reg)); + } ++} + +- /* Find the overflow area. */ +- type = TREE_TYPE (ovf); +- if (cfun->machine->split_stack_varargs_pointer == NULL_RTX) +- ovf_rtx = crtl->args.internal_arg_pointer; +- else +- ovf_rtx = cfun->machine->split_stack_varargs_pointer; +- t = make_tree (type, ovf_rtx); +- if (words != 0) +- t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD); ++/* Emit code to save registers using MOV insns. ++ First register is stored at CFA - CFA_OFFSET. */ ++static void ++ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset) ++{ ++ unsigned int regno; + +- t = build2 (MODIFY_EXPR, type, ovf, t); +- TREE_SIDE_EFFECTS (t) = 1; +- expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); ++ for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) ++ if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true)) ++ { ++ ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset); ++ cfa_offset -= UNITS_PER_WORD; ++ } ++} + +- if (ix86_varargs_gpr_size || ix86_varargs_fpr_size) +- { +- /* Find the register save area. +- Prologue of the function save it right above stack frame. */ +- type = TREE_TYPE (sav); +- t = make_tree (type, frame_pointer_rtx); +- if (!ix86_varargs_gpr_size) +- t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX); ++/* Emit code to save SSE registers using MOV insns. ++ First register is stored at CFA - CFA_OFFSET. */ ++static void ++ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset) ++{ ++ unsigned int regno; + +- t = build2 (MODIFY_EXPR, type, sav, t); +- TREE_SIDE_EFFECTS (t) = 1; +- expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); +- } ++ for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) ++ if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true)) ++ { ++ ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset); ++ cfa_offset -= GET_MODE_SIZE (V4SFmode); ++ } + } + +-/* Implement va_arg. */ ++static GTY(()) rtx queued_cfa_restores; + +-static tree +-ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p, +- gimple_seq *post_p) ++/* Add a REG_CFA_RESTORE REG note to INSN or queue them until next stack ++ manipulation insn. The value is on the stack at CFA - CFA_OFFSET. 
++ Don't add the note if the previously saved value will be left untouched ++ within stack red-zone till return, as unwinders can find the same value ++ in the register and on the stack. */ ++ ++static void ++ix86_add_cfa_restore_note (rtx_insn *insn, rtx reg, HOST_WIDE_INT cfa_offset) + { +- static const int intreg[6] = { 0, 1, 2, 3, 4, 5 }; +- tree f_gpr, f_fpr, f_ovf, f_sav; +- tree gpr, fpr, ovf, sav, t; +- int size, rsize; +- tree lab_false, lab_over = NULL_TREE; +- tree addr, t2; +- rtx container; +- int indirect_p = 0; +- tree ptrtype; +- machine_mode nat_mode; +- unsigned int arg_boundary; +- unsigned int type_align; ++ if (!crtl->shrink_wrapped ++ && cfa_offset <= cfun->machine->fs.red_zone_offset) ++ return; + +- /* Only 64bit target needs something special. */ +- if (is_va_list_char_pointer (TREE_TYPE (valist))) +- return std_gimplify_va_arg_expr (valist, type, pre_p, post_p); ++ if (insn) ++ { ++ add_reg_note (insn, REG_CFA_RESTORE, reg); ++ RTX_FRAME_RELATED_P (insn) = 1; ++ } ++ else ++ queued_cfa_restores ++ = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores); ++} + +- f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node)); +- f_fpr = DECL_CHAIN (f_gpr); +- f_ovf = DECL_CHAIN (f_fpr); +- f_sav = DECL_CHAIN (f_ovf); ++/* Add queued REG_CFA_RESTORE notes if any to INSN. */ + +- gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), +- valist, f_gpr, NULL_TREE); ++static void ++ix86_add_queued_cfa_restore_notes (rtx insn) ++{ ++ rtx last; ++ if (!queued_cfa_restores) ++ return; ++ for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1)) ++ ; ++ XEXP (last, 1) = REG_NOTES (insn); ++ REG_NOTES (insn) = queued_cfa_restores; ++ queued_cfa_restores = NULL_RTX; ++ RTX_FRAME_RELATED_P (insn) = 1; ++} + +- fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE); +- ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE); +- sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE); ++/* Expand prologue or epilogue stack adjustment. ++ The pattern exist to put a dependency on all ebp-based memory accesses. ++ STYLE should be negative if instructions should be marked as frame related, ++ zero if %r11 register is live and cannot be freely used and positive ++ otherwise. */ + +- indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false); +- if (indirect_p) +- type = build_pointer_type (type); +- size = arg_int_size_in_bytes (type); +- rsize = CEIL (size, UNITS_PER_WORD); ++static rtx ++pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset, ++ int style, bool set_cfa) ++{ ++ struct machine_function *m = cfun->machine; ++ rtx insn; ++ bool add_frame_related_expr = false; + +- nat_mode = type_natural_mode (type, NULL, false); +- switch (nat_mode) ++ if (Pmode == SImode) ++ insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset); ++ else if (x86_64_immediate_operand (offset, DImode)) ++ insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset); ++ else + { +- case E_V8SFmode: +- case E_V8SImode: +- case E_V32QImode: +- case E_V16HImode: +- case E_V4DFmode: +- case E_V4DImode: +- case E_V16SFmode: +- case E_V16SImode: +- case E_V64QImode: +- case E_V32HImode: +- case E_V8DFmode: +- case E_V8DImode: +- /* Unnamed 256 and 512bit vector mode parameters are passed on stack. */ +- if (!TARGET_64BIT_MS_ABI) ++ rtx tmp; ++ /* r11 is used by indirect sibcall return as well, set before the ++ epilogue and used after the epilogue. 
*/ ++ if (style) ++ tmp = gen_rtx_REG (DImode, R11_REG); ++ else + { +- container = NULL; +- break; ++ gcc_assert (src != hard_frame_pointer_rtx ++ && dest != hard_frame_pointer_rtx); ++ tmp = hard_frame_pointer_rtx; + } +- /* FALLTHRU */ ++ insn = emit_insn (gen_rtx_SET (tmp, offset)); ++ if (style < 0) ++ add_frame_related_expr = true; + +- default: +- container = construct_container (nat_mode, TYPE_MODE (type), +- type, 0, X86_64_REGPARM_MAX, +- X86_64_SSE_REGPARM_MAX, intreg, +- 0); +- break; ++ insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp); + } + +- /* Pull the value out of the saved registers. */ +- +- addr = create_tmp_var (ptr_type_node, "addr"); +- type_align = TYPE_ALIGN (type); ++ insn = emit_insn (insn); ++ if (style >= 0) ++ ix86_add_queued_cfa_restore_notes (insn); + +- if (container) ++ if (set_cfa) + { +- int needed_intregs, needed_sseregs; +- bool need_temp; +- tree int_addr, sse_addr; +- +- lab_false = create_artificial_label (UNKNOWN_LOCATION); +- lab_over = create_artificial_label (UNKNOWN_LOCATION); +- +- examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs); ++ rtx r; + +- need_temp = (!REG_P (container) +- && ((needed_intregs && TYPE_ALIGN (type) > 64) +- || TYPE_ALIGN (type) > 128)); ++ gcc_assert (m->fs.cfa_reg == src); ++ m->fs.cfa_offset += INTVAL (offset); ++ m->fs.cfa_reg = dest; + +- /* In case we are passing structure, verify that it is consecutive block +- on the register save area. If not we need to do moves. */ +- if (!need_temp && !REG_P (container)) ++ r = gen_rtx_PLUS (Pmode, src, offset); ++ r = gen_rtx_SET (dest, r); ++ add_reg_note (insn, REG_CFA_ADJUST_CFA, r); ++ RTX_FRAME_RELATED_P (insn) = 1; ++ } ++ else if (style < 0) ++ { ++ RTX_FRAME_RELATED_P (insn) = 1; ++ if (add_frame_related_expr) + { +- /* Verify that all registers are strictly consecutive */ +- if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0)))) +- { +- int i; ++ rtx r = gen_rtx_PLUS (Pmode, src, offset); ++ r = gen_rtx_SET (dest, r); ++ add_reg_note (insn, REG_FRAME_RELATED_EXPR, r); ++ } ++ } + +- for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++) +- { +- rtx slot = XVECEXP (container, 0, i); +- if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i +- || INTVAL (XEXP (slot, 1)) != i * 16) +- need_temp = true; +- } +- } +- else +- { +- int i; ++ if (dest == stack_pointer_rtx) ++ { ++ HOST_WIDE_INT ooffset = m->fs.sp_offset; ++ bool valid = m->fs.sp_valid; ++ bool realigned = m->fs.sp_realigned; + +- for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++) +- { +- rtx slot = XVECEXP (container, 0, i); +- if (REGNO (XEXP (slot, 0)) != (unsigned int) i +- || INTVAL (XEXP (slot, 1)) != i * 8) +- need_temp = true; +- } +- } +- } +- if (!need_temp) +- { +- int_addr = addr; +- sse_addr = addr; +- } +- else ++ if (src == hard_frame_pointer_rtx) + { +- int_addr = create_tmp_var (ptr_type_node, "int_addr"); +- sse_addr = create_tmp_var (ptr_type_node, "sse_addr"); ++ valid = m->fs.fp_valid; ++ realigned = false; ++ ooffset = m->fs.fp_offset; + } +- +- /* First ensure that we fit completely in registers. 
*/ +- if (needed_intregs) ++ else if (src == crtl->drap_reg) + { +- t = build_int_cst (TREE_TYPE (gpr), +- (X86_64_REGPARM_MAX - needed_intregs + 1) * 8); +- t = build2 (GE_EXPR, boolean_type_node, gpr, t); +- t2 = build1 (GOTO_EXPR, void_type_node, lab_false); +- t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE); +- gimplify_and_add (t, pre_p); ++ valid = m->fs.drap_valid; ++ realigned = false; ++ ooffset = 0; + } +- if (needed_sseregs) ++ else + { +- t = build_int_cst (TREE_TYPE (fpr), +- (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16 +- + X86_64_REGPARM_MAX * 8); +- t = build2 (GE_EXPR, boolean_type_node, fpr, t); +- t2 = build1 (GOTO_EXPR, void_type_node, lab_false); +- t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE); +- gimplify_and_add (t, pre_p); ++ /* Else there are two possibilities: SP itself, which we set ++ up as the default above. Or EH_RETURN_STACKADJ_RTX, which is ++ taken care of this by hand along the eh_return path. */ ++ gcc_checking_assert (src == stack_pointer_rtx ++ || offset == const0_rtx); + } + +- /* Compute index to start of area used for integer regs. */ +- if (needed_intregs) +- { +- /* int_addr = gpr + sav; */ +- t = fold_build_pointer_plus (sav, gpr); +- gimplify_assign (int_addr, t, pre_p); +- } +- if (needed_sseregs) +- { +- /* sse_addr = fpr + sav; */ +- t = fold_build_pointer_plus (sav, fpr); +- gimplify_assign (sse_addr, t, pre_p); +- } +- if (need_temp) +- { +- int i, prev_size = 0; +- tree temp = create_tmp_var (type, "va_arg_tmp"); ++ m->fs.sp_offset = ooffset - INTVAL (offset); ++ m->fs.sp_valid = valid; ++ m->fs.sp_realigned = realigned; ++ } ++ return insn; ++} + +- /* addr = &temp; */ +- t = build1 (ADDR_EXPR, build_pointer_type (type), temp); +- gimplify_assign (addr, t, pre_p); ++/* Find an available register to be used as dynamic realign argument ++ pointer regsiter. Such a register will be written in prologue and ++ used in begin of body, so it must not be ++ 1. parameter passing register. ++ 2. GOT pointer. ++ We reuse static-chain register if it is available. Otherwise, we ++ use DI for i386 and R13 for x86-64. We chose R13 since it has ++ shorter encoding. + +- for (i = 0; i < XVECLEN (container, 0); i++) +- { +- rtx slot = XVECEXP (container, 0, i); +- rtx reg = XEXP (slot, 0); +- machine_mode mode = GET_MODE (reg); +- tree piece_type; +- tree addr_type; +- tree daddr_type; +- tree src_addr, src; +- int src_offset; +- tree dest_addr, dest; +- int cur_size = GET_MODE_SIZE (mode); ++ Return: the regno of chosen register. 
*/ + +- gcc_assert (prev_size <= INTVAL (XEXP (slot, 1))); +- prev_size = INTVAL (XEXP (slot, 1)); +- if (prev_size + cur_size > size) +- { +- cur_size = size - prev_size; +- unsigned int nbits = cur_size * BITS_PER_UNIT; +- if (!int_mode_for_size (nbits, 1).exists (&mode)) +- mode = QImode; +- } +- piece_type = lang_hooks.types.type_for_mode (mode, 1); +- if (mode == GET_MODE (reg)) +- addr_type = build_pointer_type (piece_type); +- else +- addr_type = build_pointer_type_for_mode (piece_type, ptr_mode, +- true); +- daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode, +- true); ++static unsigned int ++find_drap_reg (void) ++{ ++ tree decl = cfun->decl; + +- if (SSE_REGNO_P (REGNO (reg))) +- { +- src_addr = sse_addr; +- src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16; +- } +- else +- { +- src_addr = int_addr; +- src_offset = REGNO (reg) * 8; +- } +- src_addr = fold_convert (addr_type, src_addr); +- src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset); ++ /* Always use callee-saved register if there are no caller-saved ++ registers. */ ++ if (TARGET_64BIT) ++ { ++ /* Use R13 for nested function or function need static chain. ++ Since function with tail call may use any caller-saved ++ registers in epilogue, DRAP must not use caller-saved ++ register in such case. */ ++ if (DECL_STATIC_CHAIN (decl) ++ || cfun->machine->no_caller_saved_registers ++ || crtl->tail_call_emit) ++ return R13_REG; + +- dest_addr = fold_convert (daddr_type, addr); +- dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size); +- if (cur_size == GET_MODE_SIZE (mode)) +- { +- src = build_va_arg_indirect_ref (src_addr); +- dest = build_va_arg_indirect_ref (dest_addr); ++ return R10_REG; ++ } ++ else ++ { ++ /* Use DI for nested function or function need static chain. ++ Since function with tail call may use any caller-saved ++ registers in epilogue, DRAP must not use caller-saved ++ register in such case. */ ++ if (DECL_STATIC_CHAIN (decl) ++ || cfun->machine->no_caller_saved_registers ++ || crtl->tail_call_emit) ++ return DI_REG; + +- gimplify_assign (dest, src, pre_p); +- } +- else +- { +- tree copy +- = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY), +- 3, dest_addr, src_addr, +- size_int (cur_size)); +- gimplify_and_add (copy, pre_p); +- } +- prev_size += cur_size; +- } +- } +- +- if (needed_intregs) +- { +- t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr, +- build_int_cst (TREE_TYPE (gpr), needed_intregs * 8)); +- gimplify_assign (gpr, t, pre_p); +- /* The GPR save area guarantees only 8-byte alignment. */ +- if (!need_temp) +- type_align = MIN (type_align, 64); +- } +- +- if (needed_sseregs) ++ /* Reuse static chain register if it isn't used for parameter ++ passing. */ ++ if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2) + { +- t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr, +- build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16)); +- gimplify_assign (unshare_expr (fpr), t, pre_p); ++ unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl)); ++ if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0) ++ return CX_REG; + } +- +- gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over)); +- +- gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false)); +- } +- +- /* ... otherwise out of the overflow area. */ +- +- /* When we align parameter on stack for caller, if the parameter +- alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will be +- aligned at MAX_SUPPORTED_STACK_ALIGNMENT. We will match callee +- here with caller. 
*/ +- arg_boundary = ix86_function_arg_boundary (VOIDmode, type); +- if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT) +- arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT; +- +- /* Care for on-stack alignment if needed. */ +- if (arg_boundary <= 64 || size == 0) +- t = ovf; +- else +- { +- HOST_WIDE_INT align = arg_boundary / 8; +- t = fold_build_pointer_plus_hwi (ovf, align - 1); +- t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t, +- build_int_cst (TREE_TYPE (t), -align)); ++ return DI_REG; + } ++} + +- gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue); +- gimplify_assign (addr, t, pre_p); ++/* Return minimum incoming stack alignment. */ + +- t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD); +- gimplify_assign (unshare_expr (ovf), t, pre_p); ++static unsigned int ++ix86_minimum_incoming_stack_boundary (bool sibcall) ++{ ++ unsigned int incoming_stack_boundary; + +- if (container) +- gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over)); ++ /* Stack of interrupt handler is aligned to 128 bits in 64bit mode. */ ++ if (cfun->machine->func_type != TYPE_NORMAL) ++ incoming_stack_boundary = TARGET_64BIT ? 128 : MIN_STACK_BOUNDARY; ++ /* Prefer the one specified at command line. */ ++ else if (ix86_user_incoming_stack_boundary) ++ incoming_stack_boundary = ix86_user_incoming_stack_boundary; ++ /* In 32bit, use MIN_STACK_BOUNDARY for incoming stack boundary ++ if -mstackrealign is used, it isn't used for sibcall check and ++ estimated stack alignment is 128bit. */ ++ else if (!sibcall ++ && ix86_force_align_arg_pointer ++ && crtl->stack_alignment_estimated == 128) ++ incoming_stack_boundary = MIN_STACK_BOUNDARY; ++ else ++ incoming_stack_boundary = ix86_default_incoming_stack_boundary; + +- type = build_aligned_type (type, type_align); +- ptrtype = build_pointer_type_for_mode (type, ptr_mode, true); +- addr = fold_convert (ptrtype, addr); ++ /* Incoming stack alignment can be changed on individual functions ++ via force_align_arg_pointer attribute. We use the smallest ++ incoming stack boundary. */ ++ if (incoming_stack_boundary > MIN_STACK_BOUNDARY ++ && lookup_attribute ("force_align_arg_pointer", ++ TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl)))) ++ incoming_stack_boundary = MIN_STACK_BOUNDARY; + +- if (indirect_p) +- addr = build_va_arg_indirect_ref (addr); +- return build_va_arg_indirect_ref (addr); +-} +- +-/* Return true if OPNUM's MEM should be matched +- in movabs* patterns. */ ++ /* The incoming stack frame has to be aligned at least at ++ parm_stack_boundary. */ ++ if (incoming_stack_boundary < crtl->parm_stack_boundary) ++ incoming_stack_boundary = crtl->parm_stack_boundary; + +-bool +-ix86_check_movabs (rtx insn, int opnum) +-{ +- rtx set, mem; ++ /* Stack at entrance of main is aligned by runtime. We use the ++ smallest incoming stack boundary. */ ++ if (incoming_stack_boundary > MAIN_STACK_BOUNDARY ++ && DECL_NAME (current_function_decl) ++ && MAIN_NAME_P (DECL_NAME (current_function_decl)) ++ && DECL_FILE_SCOPE_P (current_function_decl)) ++ incoming_stack_boundary = MAIN_STACK_BOUNDARY; + +- set = PATTERN (insn); +- if (GET_CODE (set) == PARALLEL) +- set = XVECEXP (set, 0, 0); +- gcc_assert (GET_CODE (set) == SET); +- mem = XEXP (set, opnum); +- while (SUBREG_P (mem)) +- mem = SUBREG_REG (mem); +- gcc_assert (MEM_P (mem)); +- return volatile_ok || !MEM_VOLATILE_P (mem); ++ return incoming_stack_boundary; + } + +-/* Return false if INSN contains a MEM with a non-default address space. 
*/ +-bool +-ix86_check_no_addr_space (rtx insn) +-{ +- subrtx_var_iterator::array_type array; +- FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (insn), ALL) +- { +- rtx x = *iter; +- if (MEM_P (x) && !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (x))) +- return false; +- } +- return true; +-} +- +-/* Initialize the table of extra 80387 mathematical constants. */ ++/* Update incoming stack boundary and estimated stack alignment. */ + + static void +-init_ext_80387_constants (void) ++ix86_update_stack_boundary (void) + { +- static const char * cst[5] = +- { +- "0.3010299956639811952256464283594894482", /* 0: fldlg2 */ +- "0.6931471805599453094286904741849753009", /* 1: fldln2 */ +- "1.4426950408889634073876517827983434472", /* 2: fldl2e */ +- "3.3219280948873623478083405569094566090", /* 3: fldl2t */ +- "3.1415926535897932385128089594061862044", /* 4: fldpi */ +- }; +- int i; ++ ix86_incoming_stack_boundary ++ = ix86_minimum_incoming_stack_boundary (false); + +- for (i = 0; i < 5; i++) +- { +- real_from_string (&ext_80387_constants_table[i], cst[i]); +- /* Ensure each constant is rounded to XFmode precision. */ +- real_convert (&ext_80387_constants_table[i], +- XFmode, &ext_80387_constants_table[i]); +- } ++ /* x86_64 vararg needs 16byte stack alignment for register save area. */ ++ if (TARGET_64BIT ++ && cfun->stdarg ++ && crtl->stack_alignment_estimated < 128) ++ crtl->stack_alignment_estimated = 128; + +- ext_80387_constants_init = 1; ++ /* __tls_get_addr needs to be called with 16-byte aligned stack. */ ++ if (ix86_tls_descriptor_calls_expanded_in_cfun ++ && crtl->preferred_stack_boundary < 128) ++ crtl->preferred_stack_boundary = 128; + } + +-/* Return non-zero if the constant is something that +- can be loaded with a special instruction. */ ++/* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is ++ needed or an rtx for DRAP otherwise. */ + +-int +-standard_80387_constant_p (rtx x) ++static rtx ++ix86_get_drap_rtx (void) + { +- machine_mode mode = GET_MODE (x); +- +- const REAL_VALUE_TYPE *r; +- +- if (!(CONST_DOUBLE_P (x) && X87_FLOAT_MODE_P (mode))) +- return -1; +- +- if (x == CONST0_RTX (mode)) +- return 1; +- if (x == CONST1_RTX (mode)) +- return 2; +- +- r = CONST_DOUBLE_REAL_VALUE (x); ++ /* We must use DRAP if there are outgoing arguments on stack and ++ ACCUMULATE_OUTGOING_ARGS is false. */ ++ if (ix86_force_drap ++ || (cfun->machine->outgoing_args_on_stack ++ && !ACCUMULATE_OUTGOING_ARGS)) ++ crtl->need_drap = true; + +- /* For XFmode constants, try to find a special 80387 instruction when +- optimizing for size or on those CPUs that benefit from them. */ +- if (mode == XFmode +- && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS)) ++ if (stack_realign_drap) + { +- int i; ++ /* Assign DRAP to vDRAP and returns vDRAP */ ++ unsigned int regno = find_drap_reg (); ++ rtx drap_vreg; ++ rtx arg_ptr; ++ rtx_insn *seq, *insn; + +- if (! 
ext_80387_constants_init) +- init_ext_80387_constants (); ++ arg_ptr = gen_rtx_REG (Pmode, regno); ++ crtl->drap_reg = arg_ptr; + +- for (i = 0; i < 5; i++) +- if (real_identical (r, &ext_80387_constants_table[i])) +- return i + 3; ++ start_sequence (); ++ drap_vreg = copy_to_reg (arg_ptr); ++ seq = get_insns (); ++ end_sequence (); ++ ++ insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ())); ++ if (!optimize) ++ { ++ add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg); ++ RTX_FRAME_RELATED_P (insn) = 1; ++ } ++ return drap_vreg; + } ++ else ++ return NULL; ++} + +- /* Load of the constant -0.0 or -1.0 will be split as +- fldz;fchs or fld1;fchs sequence. */ +- if (real_isnegzero (r)) +- return 8; +- if (real_identical (r, &dconstm1)) +- return 9; ++/* Handle the TARGET_INTERNAL_ARG_POINTER hook. */ + +- return 0; ++static rtx ++ix86_internal_arg_pointer (void) ++{ ++ return virtual_incoming_args_rtx; + } + +-/* Return the opcode of the special instruction to be used to load +- the constant X. */ ++struct scratch_reg { ++ rtx reg; ++ bool saved; ++}; + +-const char * +-standard_80387_constant_opcode (rtx x) ++/* Return a short-lived scratch register for use on function entry. ++ In 32-bit mode, it is valid only after the registers are saved ++ in the prologue. This register must be released by means of ++ release_scratch_register_on_entry once it is dead. */ ++ ++static void ++get_scratch_register_on_entry (struct scratch_reg *sr) + { +- switch (standard_80387_constant_p (x)) +- { +- case 1: +- return "fldz"; +- case 2: +- return "fld1"; +- case 3: +- return "fldlg2"; +- case 4: +- return "fldln2"; +- case 5: +- return "fldl2e"; +- case 6: +- return "fldl2t"; +- case 7: +- return "fldpi"; +- case 8: +- case 9: +- return "#"; +- default: +- gcc_unreachable (); +- } +-} +- +-/* Return the CONST_DOUBLE representing the 80387 constant that is +- loaded by the specified special instruction. The argument IDX +- matches the return value from standard_80387_constant_p. */ +- +-rtx +-standard_80387_constant_rtx (int idx) +-{ +- int i; ++ int regno; + +- if (! ext_80387_constants_init) +- init_ext_80387_constants (); ++ sr->saved = false; + +- switch (idx) ++ if (TARGET_64BIT) + { +- case 3: +- case 4: +- case 5: +- case 6: +- case 7: +- i = idx - 3; +- break; ++ /* We always use R11 in 64-bit mode. */ ++ regno = R11_REG; ++ } ++ else ++ { ++ tree decl = current_function_decl, fntype = TREE_TYPE (decl); ++ bool fastcall_p ++ = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE; ++ bool thiscall_p ++ = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE; ++ bool static_chain_p = DECL_STATIC_CHAIN (decl); ++ int regparm = ix86_function_regparm (fntype, decl); ++ int drap_regno ++ = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM; + +- default: +- gcc_unreachable (); ++ /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax ++ for the static chain register. */ ++ if ((regparm < 1 || (fastcall_p && !static_chain_p)) ++ && drap_regno != AX_REG) ++ regno = AX_REG; ++ /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx ++ for the static chain register. */ ++ else if (thiscall_p && !static_chain_p && drap_regno != AX_REG) ++ regno = AX_REG; ++ else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG) ++ regno = DX_REG; ++ /* ecx is the static chain register. 
*/ ++ else if (regparm < 3 && !fastcall_p && !thiscall_p ++ && !static_chain_p ++ && drap_regno != CX_REG) ++ regno = CX_REG; ++ else if (ix86_save_reg (BX_REG, true, false)) ++ regno = BX_REG; ++ /* esi is the static chain register. */ ++ else if (!(regparm == 3 && static_chain_p) ++ && ix86_save_reg (SI_REG, true, false)) ++ regno = SI_REG; ++ else if (ix86_save_reg (DI_REG, true, false)) ++ regno = DI_REG; ++ else ++ { ++ regno = (drap_regno == AX_REG ? DX_REG : AX_REG); ++ sr->saved = true; ++ } + } + +- return const_double_from_real_value (ext_80387_constants_table[i], +- XFmode); ++ sr->reg = gen_rtx_REG (Pmode, regno); ++ if (sr->saved) ++ { ++ rtx_insn *insn = emit_insn (gen_push (sr->reg)); ++ RTX_FRAME_RELATED_P (insn) = 1; ++ } + } + +-/* Return 1 if X is all bits 0 and 2 if X is all bits 1 +- in supported SSE/AVX vector mode. */ +- +-int +-standard_sse_constant_p (rtx x, machine_mode pred_mode) +-{ +- machine_mode mode; +- +- if (!TARGET_SSE) +- return 0; ++/* Release a scratch register obtained from the preceding function. + +- mode = GET_MODE (x); ++ If RELEASE_VIA_POP is true, we just pop the register off the stack ++ to release it. This is what non-Linux systems use with -fstack-check. + +- if (x == const0_rtx || const0_operand (x, mode)) +- return 1; ++ Otherwise we use OFFSET to locate the saved register and the ++ allocated stack space becomes part of the local frame and is ++ deallocated by the epilogue. */ + +- if (x == constm1_rtx || vector_all_ones_operand (x, mode)) ++static void ++release_scratch_register_on_entry (struct scratch_reg *sr, HOST_WIDE_INT offset, ++ bool release_via_pop) ++{ ++ if (sr->saved) + { +- /* VOIDmode integer constant, get mode from the predicate. */ +- if (mode == VOIDmode) +- mode = pred_mode; ++ if (release_via_pop) ++ { ++ struct machine_function *m = cfun->machine; ++ rtx x, insn = emit_insn (gen_pop (sr->reg)); + +- switch (GET_MODE_SIZE (mode)) ++ /* The RX FRAME_RELATED_P mechanism doesn't know about pop. */ ++ RTX_FRAME_RELATED_P (insn) = 1; ++ x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD)); ++ x = gen_rtx_SET (stack_pointer_rtx, x); ++ add_reg_note (insn, REG_FRAME_RELATED_EXPR, x); ++ m->fs.sp_offset -= UNITS_PER_WORD; ++ } ++ else + { +- case 64: +- if (TARGET_AVX512F) +- return 2; +- break; +- case 32: +- if (TARGET_AVX2) +- return 2; +- break; +- case 16: +- if (TARGET_SSE2) +- return 2; +- break; +- case 0: +- /* VOIDmode */ +- gcc_unreachable (); +- default: +- break; ++ rtx x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (offset)); ++ x = gen_rtx_SET (sr->reg, gen_rtx_MEM (word_mode, x)); ++ emit_insn (x); + } + } +- +- return 0; + } + +-/* Return the opcode of the special instruction to be used to load +- the constant operands[1] into operands[0]. */ ++/* Emit code to adjust the stack pointer by SIZE bytes while probing it. + +-const char * +-standard_sse_constant_opcode (rtx_insn *insn, rtx *operands) +-{ +- machine_mode mode; +- rtx x = operands[1]; ++ This differs from the next routine in that it tries hard to prevent ++ attacks that jump the stack guard. Thus it is never allowed to allocate ++ more than PROBE_INTERVAL bytes of stack space without a suitable ++ probe. + +- gcc_assert (TARGET_SSE); ++ INT_REGISTERS_SAVED is true if integer registers have already been ++ pushed on the stack. 
*/ + +- mode = GET_MODE (x); ++static void ++ix86_adjust_stack_and_probe_stack_clash (HOST_WIDE_INT size, ++ const bool int_registers_saved) ++{ ++ struct machine_function *m = cfun->machine; + +- if (x == const0_rtx || const0_operand (x, mode)) ++ /* If this function does not statically allocate stack space, then ++ no probes are needed. */ ++ if (!size) + { +- switch (get_attr_mode (insn)) +- { +- case MODE_TI: +- if (!EXT_REX_SSE_REG_P (operands[0])) +- return "%vpxor\t%0, %d0"; +- /* FALLTHRU */ +- case MODE_XI: +- case MODE_OI: +- if (EXT_REX_SSE_REG_P (operands[0])) +- return (TARGET_AVX512VL +- ? "vpxord\t%x0, %x0, %x0" +- : "vpxord\t%g0, %g0, %g0"); +- return "vpxor\t%x0, %x0, %x0"; ++ /* However, the allocation of space via pushes for register ++ saves could be viewed as allocating space, but without the ++ need to probe. */ ++ if (m->frame.nregs || m->frame.nsseregs || frame_pointer_needed) ++ dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true); ++ else ++ dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false); ++ return; ++ } + +- case MODE_V2DF: +- if (!EXT_REX_SSE_REG_P (operands[0])) +- return "%vxorpd\t%0, %d0"; +- /* FALLTHRU */ +- case MODE_V8DF: +- case MODE_V4DF: +- if (!EXT_REX_SSE_REG_P (operands[0])) +- return "vxorpd\t%x0, %x0, %x0"; +- else if (TARGET_AVX512DQ) +- return (TARGET_AVX512VL +- ? "vxorpd\t%x0, %x0, %x0" +- : "vxorpd\t%g0, %g0, %g0"); +- else +- return (TARGET_AVX512VL +- ? "vpxorq\t%x0, %x0, %x0" +- : "vpxorq\t%g0, %g0, %g0"); ++ /* If we are a noreturn function, then we have to consider the ++ possibility that we're called via a jump rather than a call. + +- case MODE_V4SF: +- if (!EXT_REX_SSE_REG_P (operands[0])) +- return "%vxorps\t%0, %d0"; +- /* FALLTHRU */ +- case MODE_V16SF: +- case MODE_V8SF: +- if (!EXT_REX_SSE_REG_P (operands[0])) +- return "vxorps\t%x0, %x0, %x0"; +- else if (TARGET_AVX512DQ) +- return (TARGET_AVX512VL +- ? "vxorps\t%x0, %x0, %x0" +- : "vxorps\t%g0, %g0, %g0"); +- else +- return (TARGET_AVX512VL +- ? "vpxord\t%x0, %x0, %x0" +- : "vpxord\t%g0, %g0, %g0"); ++ Thus we don't have the implicit probe generated by saving the ++ return address into the stack at the call. Thus, the stack ++ pointer could be anywhere in the guard page. The safe thing ++ to do is emit a probe now. + +- default: +- gcc_unreachable (); +- } +- } +- else if (x == constm1_rtx || vector_all_ones_operand (x, mode)) +- { +- enum attr_mode insn_mode = get_attr_mode (insn); +- +- switch (insn_mode) +- { +- case MODE_XI: +- case MODE_V8DF: +- case MODE_V16SF: +- gcc_assert (TARGET_AVX512F); +- return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}"; +- +- case MODE_OI: +- case MODE_V4DF: +- case MODE_V8SF: +- gcc_assert (TARGET_AVX2); +- /* FALLTHRU */ +- case MODE_TI: +- case MODE_V2DF: +- case MODE_V4SF: +- gcc_assert (TARGET_SSE2); +- if (!EXT_REX_SSE_REG_P (operands[0])) +- return (TARGET_AVX +- ? "vpcmpeqd\t%0, %0, %0" +- : "pcmpeqd\t%0, %0"); +- else if (TARGET_AVX512VL) +- return "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}"; +- else +- return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}"; ++ The probe can be avoided if we have already emitted any callee ++ register saves into the stack or have a frame pointer (which will ++ have been saved as well). Those saves will function as implicit ++ probes. + +- default: +- gcc_unreachable (); ++ ?!? This should be revamped to work like aarch64 and s390 where ++ we track the offset from the most recent probe. Normally that ++ offset would be zero. 
For a noreturn function we would reset ++ it to PROBE_INTERVAL - (STACK_BOUNDARY / BITS_PER_UNIT). Then ++ we just probe when we cross PROBE_INTERVAL. */ ++ if (TREE_THIS_VOLATILE (cfun->decl) ++ && !(m->frame.nregs || m->frame.nsseregs || frame_pointer_needed)) ++ { ++ /* We can safely use any register here since we're just going to push ++ its value and immediately pop it back. But we do try and avoid ++ argument passing registers so as not to introduce dependencies in ++ the pipeline. For 32 bit we use %esi and for 64 bit we use %rax. */ ++ rtx dummy_reg = gen_rtx_REG (word_mode, TARGET_64BIT ? AX_REG : SI_REG); ++ rtx_insn *insn_push = emit_insn (gen_push (dummy_reg)); ++ rtx_insn *insn_pop = emit_insn (gen_pop (dummy_reg)); ++ m->fs.sp_offset -= UNITS_PER_WORD; ++ if (m->fs.cfa_reg == stack_pointer_rtx) ++ { ++ m->fs.cfa_offset -= UNITS_PER_WORD; ++ rtx x = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD); ++ x = gen_rtx_SET (stack_pointer_rtx, x); ++ add_reg_note (insn_push, REG_CFA_ADJUST_CFA, x); ++ RTX_FRAME_RELATED_P (insn_push) = 1; ++ x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD); ++ x = gen_rtx_SET (stack_pointer_rtx, x); ++ add_reg_note (insn_pop, REG_CFA_ADJUST_CFA, x); ++ RTX_FRAME_RELATED_P (insn_pop) = 1; + } +- } +- +- gcc_unreachable (); +-} +- +-/* Returns true if INSN can be transformed from a memory load +- to a supported FP constant load. */ ++ emit_insn (gen_blockage ()); ++ } + +-bool +-ix86_standard_x87sse_constant_load_p (const rtx_insn *insn, rtx dst) +-{ +- rtx src = find_constant_src (insn); ++ /* If we allocate less than the size of the guard statically, ++ then no probing is necessary, but we do need to allocate ++ the stack. */ ++ if (size < (1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE))) ++ { ++ pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, ++ GEN_INT (-size), -1, ++ m->fs.cfa_reg == stack_pointer_rtx); ++ dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true); ++ return; ++ } + +- gcc_assert (REG_P (dst)); ++ /* We're allocating a large enough stack frame that we need to ++ emit probes. Either emit them inline or in a loop depending ++ on the size. */ ++ HOST_WIDE_INT probe_interval = get_probe_interval (); ++ if (size <= 4 * probe_interval) ++ { ++ HOST_WIDE_INT i; ++ for (i = probe_interval; i <= size; i += probe_interval) ++ { ++ /* Allocate PROBE_INTERVAL bytes. */ ++ rtx insn ++ = pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, ++ GEN_INT (-probe_interval), -1, ++ m->fs.cfa_reg == stack_pointer_rtx); ++ add_reg_note (insn, REG_STACK_CHECK, const0_rtx); + +- if (src == NULL +- || (SSE_REGNO_P (REGNO (dst)) +- && standard_sse_constant_p (src, GET_MODE (dst)) != 1) +- || (STACK_REGNO_P (REGNO (dst)) +- && standard_80387_constant_p (src) < 1)) +- return false; ++ /* And probe at *sp. */ ++ emit_stack_probe (stack_pointer_rtx); ++ emit_insn (gen_blockage ()); ++ } + +- return true; +-} ++ /* We need to allocate space for the residual, but we do not need ++ to probe the residual. */ ++ HOST_WIDE_INT residual = (i - probe_interval - size); ++ if (residual) ++ pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, ++ GEN_INT (residual), -1, ++ m->fs.cfa_reg == stack_pointer_rtx); ++ dump_stack_clash_frame_info (PROBE_INLINE, residual != 0); ++ } ++ else ++ { ++ /* We expect the GP registers to be saved when probes are used ++ as the probing sequences might need a scratch register and ++ the routine to allocate one assumes the integer registers ++ have already been saved. 
*/ ++ gcc_assert (int_registers_saved); + +-/* Returns true if OP contains a symbol reference */ ++ struct scratch_reg sr; ++ get_scratch_register_on_entry (&sr); + +-bool +-symbolic_reference_mentioned_p (rtx op) +-{ +- const char *fmt; +- int i; ++ /* If we needed to save a register, then account for any space ++ that was pushed (we are not going to pop the register when ++ we do the restore). */ ++ if (sr.saved) ++ size -= UNITS_PER_WORD; + +- if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF) +- return true; ++ /* Step 1: round SIZE down to a multiple of the interval. */ ++ HOST_WIDE_INT rounded_size = size & -probe_interval; + +- fmt = GET_RTX_FORMAT (GET_CODE (op)); +- for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--) +- { +- if (fmt[i] == 'E') ++ /* Step 2: compute final value of the loop counter. Use lea if ++ possible. */ ++ rtx addr = plus_constant (Pmode, stack_pointer_rtx, -rounded_size); ++ rtx insn; ++ if (address_no_seg_operand (addr, Pmode)) ++ insn = emit_insn (gen_rtx_SET (sr.reg, addr)); ++ else + { +- int j; +- +- for (j = XVECLEN (op, i) - 1; j >= 0; j--) +- if (symbolic_reference_mentioned_p (XVECEXP (op, i, j))) +- return true; ++ emit_move_insn (sr.reg, GEN_INT (-rounded_size)); ++ insn = emit_insn (gen_rtx_SET (sr.reg, ++ gen_rtx_PLUS (Pmode, sr.reg, ++ stack_pointer_rtx))); ++ } ++ if (m->fs.cfa_reg == stack_pointer_rtx) ++ { ++ add_reg_note (insn, REG_CFA_DEF_CFA, ++ plus_constant (Pmode, sr.reg, ++ m->fs.cfa_offset + rounded_size)); ++ RTX_FRAME_RELATED_P (insn) = 1; + } + +- else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i))) +- return true; +- } ++ /* Step 3: the loop. */ ++ rtx size_rtx = GEN_INT (rounded_size); ++ insn = emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, ++ size_rtx)); ++ if (m->fs.cfa_reg == stack_pointer_rtx) ++ { ++ m->fs.cfa_offset += rounded_size; ++ add_reg_note (insn, REG_CFA_DEF_CFA, ++ plus_constant (Pmode, stack_pointer_rtx, ++ m->fs.cfa_offset)); ++ RTX_FRAME_RELATED_P (insn) = 1; ++ } ++ m->fs.sp_offset += rounded_size; ++ emit_insn (gen_blockage ()); + +- return false; +-} ++ /* Step 4: adjust SP if we cannot assert at compile-time that SIZE ++ is equal to ROUNDED_SIZE. */ + +-/* Return true if it is appropriate to emit `ret' instructions in the +- body of a function. Do this only if the epilogue is simple, needing a +- couple of insns. Prior to reloading, we can't tell how many registers +- must be saved, so return false then. Return false if there is no frame +- marker to de-allocate. */ ++ if (size != rounded_size) ++ pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, ++ GEN_INT (rounded_size - size), -1, ++ m->fs.cfa_reg == stack_pointer_rtx); ++ dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size); + +-bool +-ix86_can_use_return_insn_p (void) +-{ +- if (ix86_function_naked (current_function_decl)) +- return false; ++ /* This does not deallocate the space reserved for the scratch ++ register. That will be deallocated in the epilogue. */ ++ release_scratch_register_on_entry (&sr, size, false); ++ } + +- /* Don't use `ret' instruction in interrupt handler. */ +- if (! reload_completed +- || frame_pointer_needed +- || cfun->machine->func_type != TYPE_NORMAL) +- return 0; ++ /* Make sure nothing is scheduled before we are done. */ ++ emit_insn (gen_blockage ()); ++} + +- /* Don't allow more than 32k pop, since that's all we can do +- with one instruction. 
*/ +- if (crtl->args.pops_args && crtl->args.size >= 32768) +- return 0; ++/* Emit code to adjust the stack pointer by SIZE bytes while probing it. + +- struct ix86_frame &frame = cfun->machine->frame; +- return (frame.stack_pointer_offset == UNITS_PER_WORD +- && (frame.nregs + frame.nsseregs) == 0); +-} +- +-/* Value should be nonzero if functions must have frame pointers. +- Zero means the frame pointer need not be set up (and parms may +- be accessed via the stack pointer) in functions that seem suitable. */ ++ INT_REGISTERS_SAVED is true if integer registers have already been ++ pushed on the stack. */ + +-static bool +-ix86_frame_pointer_required (void) ++static void ++ix86_adjust_stack_and_probe (HOST_WIDE_INT size, ++ const bool int_registers_saved) + { +- /* If we accessed previous frames, then the generated code expects +- to be able to access the saved ebp value in our frame. */ +- if (cfun->machine->accesses_prev_frame) +- return true; ++ /* We skip the probe for the first interval + a small dope of 4 words and ++ probe that many bytes past the specified size to maintain a protection ++ area at the botton of the stack. */ ++ const int dope = 4 * UNITS_PER_WORD; ++ rtx size_rtx = GEN_INT (size), last; + +- /* Several x86 os'es need a frame pointer for other reasons, +- usually pertaining to setjmp. */ +- if (SUBTARGET_FRAME_POINTER_REQUIRED) +- return true; ++ /* See if we have a constant small number of probes to generate. If so, ++ that's the easy case. The run-time loop is made up of 9 insns in the ++ generic case while the compile-time loop is made up of 3+2*(n-1) insns ++ for n # of intervals. */ ++ if (size <= 4 * get_probe_interval ()) ++ { ++ HOST_WIDE_INT i, adjust; ++ bool first_probe = true; + +- /* For older 32-bit runtimes setjmp requires valid frame-pointer. */ +- if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp) +- return true; ++ /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for ++ values of N from 1 until it exceeds SIZE. If only one probe is ++ needed, this will not generate any code. Then adjust and probe ++ to PROBE_INTERVAL + SIZE. */ ++ for (i = get_probe_interval (); i < size; i += get_probe_interval ()) ++ { ++ if (first_probe) ++ { ++ adjust = 2 * get_probe_interval () + dope; ++ first_probe = false; ++ } ++ else ++ adjust = get_probe_interval (); + +- /* Win64 SEH, very large frames need a frame-pointer as maximum stack +- allocation is 4GB. */ +- if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE) +- return true; ++ emit_insn (gen_rtx_SET (stack_pointer_rtx, ++ plus_constant (Pmode, stack_pointer_rtx, ++ -adjust))); ++ emit_stack_probe (stack_pointer_rtx); ++ } + +- /* SSE saves require frame-pointer when stack is misaligned. */ +- if (TARGET_64BIT_MS_ABI && ix86_incoming_stack_boundary < 128) +- return true; +- +- /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER +- turns off the frame pointer by default. Turn it back on now if +- we've not got a leaf function. */ +- if (TARGET_OMIT_LEAF_FRAME_POINTER +- && (!crtl->is_leaf +- || ix86_current_function_calls_tls_descriptor)) +- return true; ++ if (first_probe) ++ adjust = size + get_probe_interval () + dope; ++ else ++ adjust = size + get_probe_interval () - i; + +- if (crtl->profile && !flag_fentry) +- return true; ++ emit_insn (gen_rtx_SET (stack_pointer_rtx, ++ plus_constant (Pmode, stack_pointer_rtx, ++ -adjust))); ++ emit_stack_probe (stack_pointer_rtx); + +- return false; +-} ++ /* Adjust back to account for the additional first interval. 
*/ ++ last = emit_insn (gen_rtx_SET (stack_pointer_rtx, ++ plus_constant (Pmode, stack_pointer_rtx, ++ (get_probe_interval () ++ + dope)))); ++ } + +-/* Record that the current function accesses previous call frames. */ ++ /* Otherwise, do the same as above, but in a loop. Note that we must be ++ extra careful with variables wrapping around because we might be at ++ the very top (or the very bottom) of the address space and we have ++ to be able to handle this case properly; in particular, we use an ++ equality test for the loop condition. */ ++ else ++ { ++ /* We expect the GP registers to be saved when probes are used ++ as the probing sequences might need a scratch register and ++ the routine to allocate one assumes the integer registers ++ have already been saved. */ ++ gcc_assert (int_registers_saved); + +-void +-ix86_setup_frame_addresses (void) +-{ +- cfun->machine->accesses_prev_frame = 1; +-} +- +-#ifndef USE_HIDDEN_LINKONCE +-# if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0) +-# define USE_HIDDEN_LINKONCE 1 +-# else +-# define USE_HIDDEN_LINKONCE 0 +-# endif +-#endif ++ HOST_WIDE_INT rounded_size; ++ struct scratch_reg sr; + +-/* Label count for call and return thunks. It is used to make unique +- labels in call and return thunks. */ +-static int indirectlabelno; ++ get_scratch_register_on_entry (&sr); + +-/* True if call thunk function is needed. */ +-static bool indirect_thunk_needed = false; ++ /* If we needed to save a register, then account for any space ++ that was pushed (we are not going to pop the register when ++ we do the restore). */ ++ if (sr.saved) ++ size -= UNITS_PER_WORD; + +-/* Bit masks of integer registers, which contain branch target, used +- by call thunk functions. */ +-static int indirect_thunks_used; ++ /* Step 1: round SIZE to the previous multiple of the interval. */ + +-/* True if return thunk function is needed. */ +-static bool indirect_return_needed = false; ++ rounded_size = ROUND_DOWN (size, get_probe_interval ()); + +-/* True if return thunk function via CX is needed. */ +-static bool indirect_return_via_cx; + +-#ifndef INDIRECT_LABEL +-# define INDIRECT_LABEL "LIND" +-#endif ++ /* Step 2: compute initial and final value of the loop counter. */ + +-/* Indicate what prefix is needed for an indirect branch. */ +-enum indirect_thunk_prefix +-{ +- indirect_thunk_prefix_none, +- indirect_thunk_prefix_nt +-}; ++ /* SP = SP_0 + PROBE_INTERVAL. */ ++ emit_insn (gen_rtx_SET (stack_pointer_rtx, ++ plus_constant (Pmode, stack_pointer_rtx, ++ - (get_probe_interval () + dope)))); + +-/* Return the prefix needed for an indirect branch INSN. */ ++ /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */ ++ if (rounded_size <= (HOST_WIDE_INT_1 << 31)) ++ emit_insn (gen_rtx_SET (sr.reg, ++ plus_constant (Pmode, stack_pointer_rtx, ++ -rounded_size))); ++ else ++ { ++ emit_move_insn (sr.reg, GEN_INT (-rounded_size)); ++ emit_insn (gen_rtx_SET (sr.reg, ++ gen_rtx_PLUS (Pmode, sr.reg, ++ stack_pointer_rtx))); ++ } + +-enum indirect_thunk_prefix +-indirect_thunk_need_prefix (rtx_insn *insn) +-{ +- enum indirect_thunk_prefix need_prefix; +- if ((cfun->machine->indirect_branch_type +- == indirect_branch_thunk_extern) +- && ix86_notrack_prefixed_insn_p (insn)) +- { +- /* NOTRACK prefix is only used with external thunk so that it +- can be properly updated to support CET at run-time. 
*/ +- need_prefix = indirect_thunk_prefix_nt; +- } +- else +- need_prefix = indirect_thunk_prefix_none; +- return need_prefix; +-} + +-/* Fills in the label name that should be used for the indirect thunk. */ ++ /* Step 3: the loop + +-static void +-indirect_thunk_name (char name[32], unsigned int regno, +- enum indirect_thunk_prefix need_prefix, +- bool ret_p) +-{ +- if (regno != INVALID_REGNUM && regno != CX_REG && ret_p) +- gcc_unreachable (); ++ do ++ { ++ SP = SP + PROBE_INTERVAL ++ probe at SP ++ } ++ while (SP != LAST_ADDR) + +- if (USE_HIDDEN_LINKONCE) +- { +- const char *prefix; ++ adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for ++ values of N from 1 until it is equal to ROUNDED_SIZE. */ + +- if (need_prefix == indirect_thunk_prefix_nt +- && regno != INVALID_REGNUM) +- { +- /* NOTRACK prefix is only used with external thunk via +- register so that NOTRACK prefix can be added to indirect +- branch via register to support CET at run-time. */ +- prefix = "_nt"; +- } +- else +- prefix = ""; ++ emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx)); + +- const char *ret = ret_p ? "return" : "indirect"; + +- if (regno != INVALID_REGNUM) ++ /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot ++ assert at compile-time that SIZE is equal to ROUNDED_SIZE. */ ++ ++ if (size != rounded_size) + { +- const char *reg_prefix; +- if (LEGACY_INT_REGNO_P (regno)) +- reg_prefix = TARGET_64BIT ? "r" : "e"; +- else +- reg_prefix = ""; +- sprintf (name, "__x86_%s_thunk%s_%s%s", +- ret, prefix, reg_prefix, reg_names[regno]); ++ emit_insn (gen_rtx_SET (stack_pointer_rtx, ++ plus_constant (Pmode, stack_pointer_rtx, ++ rounded_size - size))); ++ emit_stack_probe (stack_pointer_rtx); + } +- else +- sprintf (name, "__x86_%s_thunk%s", ret, prefix); ++ ++ /* Adjust back to account for the additional first interval. */ ++ last = emit_insn (gen_rtx_SET (stack_pointer_rtx, ++ plus_constant (Pmode, stack_pointer_rtx, ++ (get_probe_interval () ++ + dope)))); ++ ++ /* This does not deallocate the space reserved for the scratch ++ register. That will be deallocated in the epilogue. */ ++ release_scratch_register_on_entry (&sr, size, false); + } +- else ++ ++ /* Even if the stack pointer isn't the CFA register, we need to correctly ++ describe the adjustments made to it, in particular differentiate the ++ frame-related ones from the frame-unrelated ones. */ ++ if (size > 0) + { +- if (regno != INVALID_REGNUM) +- ASM_GENERATE_INTERNAL_LABEL (name, "LITR", regno); +- else +- { +- if (ret_p) +- ASM_GENERATE_INTERNAL_LABEL (name, "LRT", 0); +- else +- ASM_GENERATE_INTERNAL_LABEL (name, "LIT", 0); +- } ++ rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2)); ++ XVECEXP (expr, 0, 0) ++ = gen_rtx_SET (stack_pointer_rtx, ++ plus_constant (Pmode, stack_pointer_rtx, -size)); ++ XVECEXP (expr, 0, 1) ++ = gen_rtx_SET (stack_pointer_rtx, ++ plus_constant (Pmode, stack_pointer_rtx, ++ get_probe_interval () + dope + size)); ++ add_reg_note (last, REG_FRAME_RELATED_EXPR, expr); ++ RTX_FRAME_RELATED_P (last) = 1; ++ ++ cfun->machine->fs.sp_offset += size; + } ++ ++ /* Make sure nothing is scheduled before we are done. */ ++ emit_insn (gen_blockage ()); + } + +-/* Output a call and return thunk for indirect branch. If REGNO != -1, +- the function address is in REGNO and the call and return thunk looks like: ++/* Adjust the stack pointer up to REG while probing it. 
*/ + +- call L2 +- L1: +- pause +- lfence +- jmp L1 +- L2: +- mov %REG, (%sp) +- ret ++const char * ++output_adjust_stack_and_probe (rtx reg) ++{ ++ static int labelno = 0; ++ char loop_lab[32]; ++ rtx xops[2]; + +- Otherwise, the function address is on the top of stack and the +- call and return thunk looks like: ++ ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++); + +- call L2 +- L1: +- pause +- lfence +- jmp L1 +- L2: +- lea WORD_SIZE(%sp), %sp +- ret +- */ ++ /* Loop. */ ++ ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab); + +-static void +-output_indirect_thunk (unsigned int regno) +-{ +- char indirectlabel1[32]; +- char indirectlabel2[32]; ++ /* SP = SP + PROBE_INTERVAL. */ ++ xops[0] = stack_pointer_rtx; ++ xops[1] = GEN_INT (get_probe_interval ()); ++ output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops); + +- ASM_GENERATE_INTERNAL_LABEL (indirectlabel1, INDIRECT_LABEL, +- indirectlabelno++); +- ASM_GENERATE_INTERNAL_LABEL (indirectlabel2, INDIRECT_LABEL, +- indirectlabelno++); ++ /* Probe at SP. */ ++ xops[1] = const0_rtx; ++ output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops); + +- /* Call */ +- fputs ("\tcall\t", asm_out_file); +- assemble_name_raw (asm_out_file, indirectlabel2); +- fputc ('\n', asm_out_file); ++ /* Test if SP == LAST_ADDR. */ ++ xops[0] = stack_pointer_rtx; ++ xops[1] = reg; ++ output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops); + +- ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel1); ++ /* Branch. */ ++ fputs ("\tjne\t", asm_out_file); ++ assemble_name_raw (asm_out_file, loop_lab); ++ fputc ('\n', asm_out_file); + +- /* AMD and Intel CPUs prefer each a different instruction as loop filler. +- Usage of both pause + lfence is compromise solution. */ +- fprintf (asm_out_file, "\tpause\n\tlfence\n"); ++ return ""; ++} + +- /* Jump. */ +- fputs ("\tjmp\t", asm_out_file); +- assemble_name_raw (asm_out_file, indirectlabel1); +- fputc ('\n', asm_out_file); ++/* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE, ++ inclusive. These are offsets from the current stack pointer. + +- ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel2); ++ INT_REGISTERS_SAVED is true if integer registers have already been ++ pushed on the stack. */ + +- /* The above call insn pushed a word to stack. Adjust CFI info. */ +- if (flag_asynchronous_unwind_tables && dwarf2out_do_frame ()) ++static void ++ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size, ++ const bool int_registers_saved) ++{ ++ /* See if we have a constant small number of probes to generate. If so, ++ that's the easy case. The run-time loop is made up of 6 insns in the ++ generic case while the compile-time loop is made up of n insns for n # ++ of intervals. */ ++ if (size <= 6 * get_probe_interval ()) + { +- if (! dwarf2out_do_cfi_asm ()) +- { +- dw_cfi_ref xcfi = ggc_cleared_alloc (); +- xcfi->dw_cfi_opc = DW_CFA_advance_loc4; +- xcfi->dw_cfi_oprnd1.dw_cfi_addr = ggc_strdup (indirectlabel2); +- vec_safe_push (cfun->fde->dw_fde_cfi, xcfi); +- } +- dw_cfi_ref xcfi = ggc_cleared_alloc (); +- xcfi->dw_cfi_opc = DW_CFA_def_cfa_offset; +- xcfi->dw_cfi_oprnd1.dw_cfi_offset = 2 * UNITS_PER_WORD; +- vec_safe_push (cfun->fde->dw_fde_cfi, xcfi); +- dwarf2out_emit_cfi (xcfi); +- } ++ HOST_WIDE_INT i; + +- if (regno != INVALID_REGNUM) +- { +- /* MOV. 
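Note the contrast with the function above: ix86_emit_probe_stack_range only touches pages in a window below the current stack pointer and never moves the pointer itself; in the constant-count case it emits one probe per interval past FIRST plus a final probe at FIRST + SIZE. A throwaway sketch of the offsets that produces is shown below, with values chosen for illustration rather than taken from the patch.

#include <stdio.h>

#define PROBE_INTERVAL 4096

/* List the SP-relative offsets probed by the constant-count case:
   one probe per interval past FIRST, then a final probe at FIRST + SIZE.  */
static void print_probe_offsets (long first, long size)
{
  long i;

  for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
    printf ("probe at sp - %ld\n", first + i);

  printf ("probe at sp - %ld\n", first + size);
}

int main (void)
{
  /* e.g. skip a 16 KiB protection area, then probe a 20 KiB range.  */
  print_probe_offsets (4 * PROBE_INTERVAL, 5 * PROBE_INTERVAL);
  return 0;
}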
*/ +- rtx xops[2]; +- xops[0] = gen_rtx_MEM (word_mode, stack_pointer_rtx); +- xops[1] = gen_rtx_REG (word_mode, regno); +- output_asm_insn ("mov\t{%1, %0|%0, %1}", xops); ++ /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until ++ it exceeds SIZE. If only one probe is needed, this will not ++ generate any code. Then probe at FIRST + SIZE. */ ++ for (i = get_probe_interval (); i < size; i += get_probe_interval ()) ++ emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx, ++ -(first + i))); ++ ++ emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx, ++ -(first + size))); + } ++ ++ /* Otherwise, do the same as above, but in a loop. Note that we must be ++ extra careful with variables wrapping around because we might be at ++ the very top (or the very bottom) of the address space and we have ++ to be able to handle this case properly; in particular, we use an ++ equality test for the loop condition. */ + else + { +- /* LEA. */ +- rtx xops[2]; +- xops[0] = stack_pointer_rtx; +- xops[1] = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD); +- output_asm_insn ("lea\t{%E1, %0|%0, %E1}", xops); +- } +- +- fputs ("\tret\n", asm_out_file); +-} ++ /* We expect the GP registers to be saved when probes are used ++ as the probing sequences might need a scratch register and ++ the routine to allocate one assumes the integer registers ++ have already been saved. */ ++ gcc_assert (int_registers_saved); + +-/* Output a funtion with a call and return thunk for indirect branch. +- If REGNO != INVALID_REGNUM, the function address is in REGNO. +- Otherwise, the function address is on the top of stack. Thunk is +- used for function return if RET_P is true. */ ++ HOST_WIDE_INT rounded_size, last; ++ struct scratch_reg sr; + +-static void +-output_indirect_thunk_function (enum indirect_thunk_prefix need_prefix, +- unsigned int regno, bool ret_p) +-{ +- char name[32]; +- tree decl; ++ get_scratch_register_on_entry (&sr); + +- /* Create __x86_indirect_thunk. */ +- indirect_thunk_name (name, regno, need_prefix, ret_p); +- decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL, +- get_identifier (name), +- build_function_type_list (void_type_node, NULL_TREE)); +- DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL, +- NULL_TREE, void_type_node); +- TREE_PUBLIC (decl) = 1; +- TREE_STATIC (decl) = 1; +- DECL_IGNORED_P (decl) = 1; + +-#if TARGET_MACHO +- if (TARGET_MACHO) +- { +- switch_to_section (darwin_sections[picbase_thunk_section]); +- fputs ("\t.weak_definition\t", asm_out_file); +- assemble_name (asm_out_file, name); +- fputs ("\n\t.private_extern\t", asm_out_file); +- assemble_name (asm_out_file, name); +- putc ('\n', asm_out_file); +- ASM_OUTPUT_LABEL (asm_out_file, name); +- DECL_WEAK (decl) = 1; +- } +- else +-#endif +- if (USE_HIDDEN_LINKONCE) +- { +- cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl)); ++ /* Step 1: round SIZE to the previous multiple of the interval. 
*/ + +- targetm.asm_out.unique_section (decl, 0); +- switch_to_section (get_named_section (decl, NULL, 0)); ++ rounded_size = ROUND_DOWN (size, get_probe_interval ()); + +- targetm.asm_out.globalize_label (asm_out_file, name); +- fputs ("\t.hidden\t", asm_out_file); +- assemble_name (asm_out_file, name); +- putc ('\n', asm_out_file); +- ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl); +- } +- else +- { +- switch_to_section (text_section); +- ASM_OUTPUT_LABEL (asm_out_file, name); +- } + +- DECL_INITIAL (decl) = make_node (BLOCK); +- current_function_decl = decl; +- allocate_struct_function (decl, false); +- init_function_start (decl); +- /* We're about to hide the function body from callees of final_* by +- emitting it directly; tell them we're a thunk, if they care. */ +- cfun->is_thunk = true; +- first_function_block_is_cold = false; +- /* Make sure unwind info is emitted for the thunk if needed. */ +- final_start_function (emit_barrier (), asm_out_file, 1); ++ /* Step 2: compute initial and final value of the loop counter. */ + +- output_indirect_thunk (regno); ++ /* TEST_OFFSET = FIRST. */ ++ emit_move_insn (sr.reg, GEN_INT (-first)); + +- final_end_function (); +- init_insn_lengths (); +- free_after_compilation (cfun); +- set_cfun (NULL); +- current_function_decl = NULL; +-} ++ /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */ ++ last = first + rounded_size; + +-static int pic_labels_used; + +-/* Fills in the label name that should be used for a pc thunk for +- the given register. */ ++ /* Step 3: the loop + +-static void +-get_pc_thunk_name (char name[32], unsigned int regno) +-{ +- gcc_assert (!TARGET_64BIT); ++ do ++ { ++ TEST_ADDR = TEST_ADDR + PROBE_INTERVAL ++ probe at TEST_ADDR ++ } ++ while (TEST_ADDR != LAST_ADDR) + +- if (USE_HIDDEN_LINKONCE) +- sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]); +- else +- ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno); +-} ++ probes at FIRST + N * PROBE_INTERVAL for values of N from 1 ++ until it is equal to ROUNDED_SIZE. */ + ++ emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last))); + +-/* This function generates code for -fpic that loads %ebx with +- the return address of the caller and then returns. */ + +-static void +-ix86_code_end (void) +-{ +- rtx xops[2]; +- unsigned int regno; ++ /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time ++ that SIZE is equal to ROUNDED_SIZE. */ + +- if (indirect_return_needed) +- output_indirect_thunk_function (indirect_thunk_prefix_none, +- INVALID_REGNUM, true); +- if (indirect_return_via_cx) +- output_indirect_thunk_function (indirect_thunk_prefix_none, +- CX_REG, true); +- if (indirect_thunk_needed) +- output_indirect_thunk_function (indirect_thunk_prefix_none, +- INVALID_REGNUM, false); ++ if (size != rounded_size) ++ emit_stack_probe (plus_constant (Pmode, ++ gen_rtx_PLUS (Pmode, ++ stack_pointer_rtx, ++ sr.reg), ++ rounded_size - size)); + +- for (regno = FIRST_REX_INT_REG; regno <= LAST_REX_INT_REG; regno++) +- { +- unsigned int i = regno - FIRST_REX_INT_REG + LAST_INT_REG + 1; +- if ((indirect_thunks_used & (1 << i))) +- output_indirect_thunk_function (indirect_thunk_prefix_none, +- regno, false); ++ release_scratch_register_on_entry (&sr, size, true); + } + +- for (regno = FIRST_INT_REG; regno <= LAST_INT_REG; regno++) +- { +- char name[32]; +- tree decl; ++ /* Make sure nothing is scheduled before we are done. 
*/ ++ emit_insn (gen_blockage ()); ++} + +- if ((indirect_thunks_used & (1 << regno))) +- output_indirect_thunk_function (indirect_thunk_prefix_none, +- regno, false); +- +- if (!(pic_labels_used & (1 << regno))) +- continue; +- +- get_pc_thunk_name (name, regno); +- +- decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL, +- get_identifier (name), +- build_function_type_list (void_type_node, NULL_TREE)); +- DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL, +- NULL_TREE, void_type_node); +- TREE_PUBLIC (decl) = 1; +- TREE_STATIC (decl) = 1; +- DECL_IGNORED_P (decl) = 1; +- +-#if TARGET_MACHO +- if (TARGET_MACHO) +- { +- switch_to_section (darwin_sections[picbase_thunk_section]); +- fputs ("\t.weak_definition\t", asm_out_file); +- assemble_name (asm_out_file, name); +- fputs ("\n\t.private_extern\t", asm_out_file); +- assemble_name (asm_out_file, name); +- putc ('\n', asm_out_file); +- ASM_OUTPUT_LABEL (asm_out_file, name); +- DECL_WEAK (decl) = 1; +- } +- else +-#endif +- if (USE_HIDDEN_LINKONCE) +- { +- cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl)); +- +- targetm.asm_out.unique_section (decl, 0); +- switch_to_section (get_named_section (decl, NULL, 0)); +- +- targetm.asm_out.globalize_label (asm_out_file, name); +- fputs ("\t.hidden\t", asm_out_file); +- assemble_name (asm_out_file, name); +- putc ('\n', asm_out_file); +- ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl); +- } +- else +- { +- switch_to_section (text_section); +- ASM_OUTPUT_LABEL (asm_out_file, name); +- } +- +- DECL_INITIAL (decl) = make_node (BLOCK); +- current_function_decl = decl; +- allocate_struct_function (decl, false); +- init_function_start (decl); +- /* We're about to hide the function body from callees of final_* by +- emitting it directly; tell them we're a thunk, if they care. */ +- cfun->is_thunk = true; +- first_function_block_is_cold = false; +- /* Make sure unwind info is emitted for the thunk if needed. */ +- final_start_function (emit_barrier (), asm_out_file, 1); +- +- /* Pad stack IP move with 4 instructions (two NOPs count +- as one instruction). */ +- if (TARGET_PAD_SHORT_FUNCTION) +- { +- int i = 8; +- +- while (i--) +- fputs ("\tnop\n", asm_out_file); +- } +- +- xops[0] = gen_rtx_REG (Pmode, regno); +- xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx); +- output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops); +- output_asm_insn ("%!ret", NULL); +- final_end_function (); +- init_insn_lengths (); +- free_after_compilation (cfun); +- set_cfun (NULL); +- current_function_decl = NULL; +- } +- +- if (flag_split_stack) +- file_end_indicate_split_stack (); +-} +- +-/* Emit code for the SET_GOT patterns. */ ++/* Probe a range of stack addresses from REG to END, inclusive. These are ++ offsets from the current stack pointer. */ + + const char * +-output_set_got (rtx dest, rtx label) ++output_probe_stack_range (rtx reg, rtx end) + { ++ static int labelno = 0; ++ char loop_lab[32]; + rtx xops[3]; + +- xops[0] = dest; +- +- if (TARGET_VXWORKS_RTP && flag_pic) +- { +- /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */ +- xops[2] = gen_rtx_MEM (Pmode, +- gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE)); +- output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops); +- +- /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register. +- Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as +- an unadorned address. 
*/ +- xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX); +- SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL; +- output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops); +- return ""; +- } +- +- xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME); +- +- if (flag_pic) +- { +- char name[32]; +- get_pc_thunk_name (name, REGNO (dest)); +- pic_labels_used |= 1 << REGNO (dest); ++ ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++); + +- xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name)); +- xops[2] = gen_rtx_MEM (QImode, xops[2]); +- output_asm_insn ("%!call\t%X2", xops); ++ /* Loop. */ ++ ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab); + +-#if TARGET_MACHO +- /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here. +- This is what will be referenced by the Mach-O PIC subsystem. */ +- if (machopic_should_output_picbase_label () || !label) +- ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME); ++ /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */ ++ xops[0] = reg; ++ xops[1] = GEN_INT (get_probe_interval ()); ++ output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops); + +- /* When we are restoring the pic base at the site of a nonlocal label, +- and we decided to emit the pic base above, we will still output a +- local label used for calculating the correction offset (even though +- the offset will be 0 in that case). */ +- if (label) +- targetm.asm_out.internal_label (asm_out_file, "L", +- CODE_LABEL_NUMBER (label)); +-#endif +- } +- else +- { +- if (TARGET_MACHO) +- /* We don't need a pic base, we're not producing pic. */ +- gcc_unreachable (); ++ /* Probe at TEST_ADDR. */ ++ xops[0] = stack_pointer_rtx; ++ xops[1] = reg; ++ xops[2] = const0_rtx; ++ output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops); + +- xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ()); +- output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops); +- targetm.asm_out.internal_label (asm_out_file, "L", +- CODE_LABEL_NUMBER (XEXP (xops[2], 0))); +- } ++ /* Test if TEST_ADDR == LAST_ADDR. */ ++ xops[0] = reg; ++ xops[1] = end; ++ output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops); + +- if (!TARGET_MACHO) +- output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops); ++ /* Branch. */ ++ fputs ("\tjne\t", asm_out_file); ++ assemble_name_raw (asm_out_file, loop_lab); ++ fputc ('\n', asm_out_file); + + return ""; + } + +-/* Generate an "push" pattern for input ARG. */ ++/* Return true if stack frame is required. Update STACK_ALIGNMENT ++ to the largest alignment, in bits, of stack slot used if stack ++ frame is required and CHECK_STACK_SLOT is true. */ + +-static rtx +-gen_push (rtx arg) ++static bool ++ix86_find_max_used_stack_alignment (unsigned int &stack_alignment, ++ bool check_stack_slot) + { +- struct machine_function *m = cfun->machine; ++ HARD_REG_SET set_up_by_prologue, prologue_used; ++ basic_block bb; + +- if (m->fs.cfa_reg == stack_pointer_rtx) +- m->fs.cfa_offset += UNITS_PER_WORD; +- m->fs.sp_offset += UNITS_PER_WORD; ++ CLEAR_HARD_REG_SET (prologue_used); ++ CLEAR_HARD_REG_SET (set_up_by_prologue); ++ add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM); ++ add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM); ++ add_to_hard_reg_set (&set_up_by_prologue, Pmode, ++ HARD_FRAME_POINTER_REGNUM); + +- if (REG_P (arg) && GET_MODE (arg) != word_mode) +- arg = gen_rtx_REG (word_mode, REGNO (arg)); ++ /* The preferred stack alignment is the minimum stack alignment. 
*/ ++ if (stack_alignment > crtl->preferred_stack_boundary) ++ stack_alignment = crtl->preferred_stack_boundary; + +- return gen_rtx_SET (gen_rtx_MEM (word_mode, +- gen_rtx_PRE_DEC (Pmode, +- stack_pointer_rtx)), +- arg); +-} ++ bool require_stack_frame = false; + +-/* Generate an "pop" pattern for input ARG. */ ++ FOR_EACH_BB_FN (bb, cfun) ++ { ++ rtx_insn *insn; ++ FOR_BB_INSNS (bb, insn) ++ if (NONDEBUG_INSN_P (insn) ++ && requires_stack_frame_p (insn, prologue_used, ++ set_up_by_prologue)) ++ { ++ require_stack_frame = true; + +-static rtx +-gen_pop (rtx arg) +-{ +- if (REG_P (arg) && GET_MODE (arg) != word_mode) +- arg = gen_rtx_REG (word_mode, REGNO (arg)); ++ if (check_stack_slot) ++ { ++ /* Find the maximum stack alignment. */ ++ subrtx_iterator::array_type array; ++ FOR_EACH_SUBRTX (iter, array, PATTERN (insn), ALL) ++ if (MEM_P (*iter) ++ && (reg_mentioned_p (stack_pointer_rtx, ++ *iter) ++ || reg_mentioned_p (frame_pointer_rtx, ++ *iter))) ++ { ++ unsigned int alignment = MEM_ALIGN (*iter); ++ if (alignment > stack_alignment) ++ stack_alignment = alignment; ++ } ++ } ++ } ++ } + +- return gen_rtx_SET (arg, +- gen_rtx_MEM (word_mode, +- gen_rtx_POST_INC (Pmode, +- stack_pointer_rtx))); ++ return require_stack_frame; + } + +-/* Return >= 0 if there is an unused call-clobbered register available +- for the entire function. */ ++/* Finalize stack_realign_needed and frame_pointer_needed flags, which ++ will guide prologue/epilogue to be generated in correct form. */ + +-static unsigned int +-ix86_select_alt_pic_regnum (void) ++static void ++ix86_finalize_stack_frame_flags (void) + { +- if (ix86_use_pseudo_pic_reg ()) +- return INVALID_REGNUM; ++ /* Check if stack realign is really needed after reload, and ++ stores result in cfun */ ++ unsigned int incoming_stack_boundary ++ = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary ++ ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary); ++ unsigned int stack_alignment ++ = (crtl->is_leaf && !ix86_current_function_calls_tls_descriptor ++ ? crtl->max_used_stack_slot_alignment ++ : crtl->stack_alignment_needed); ++ unsigned int stack_realign ++ = (incoming_stack_boundary < stack_alignment); ++ bool recompute_frame_layout_p = false; + +- if (crtl->is_leaf +- && !crtl->profile +- && !ix86_current_function_calls_tls_descriptor) ++ if (crtl->stack_realign_finalized) + { +- int i, drap; +- /* Can't use the same register for both PIC and DRAP. */ +- if (crtl->drap_reg) +- drap = REGNO (crtl->drap_reg); +- else +- drap = -1; +- for (i = 2; i >= 0; --i) +- if (i != drap && !df_regs_ever_live_p (i)) +- return i; ++ /* After stack_realign_needed is finalized, we can't no longer ++ change it. */ ++ gcc_assert (crtl->stack_realign_needed == stack_realign); ++ return; + } + +- return INVALID_REGNUM; +-} +- +-/* Return true if REGNO is used by the epilogue. */ ++ /* If the only reason for frame_pointer_needed is that we conservatively ++ assumed stack realignment might be needed or -fno-omit-frame-pointer ++ is used, but in the end nothing that needed the stack alignment had ++ been spilled nor stack access, clear frame_pointer_needed and say we ++ don't need stack realignment. */ ++ if ((stack_realign || (!flag_omit_frame_pointer && optimize)) ++ && frame_pointer_needed ++ && crtl->is_leaf ++ && crtl->sp_is_unchanging ++ && !ix86_current_function_calls_tls_descriptor ++ && !crtl->accesses_prior_frames ++ && !cfun->calls_alloca ++ && !crtl->calls_eh_return ++ /* See ira_setup_eliminable_regset for the rationale. 
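As a reading aid for the new helper above: ix86_find_max_used_stack_alignment reports whether any real insn forces a stack frame and, when asked, records the largest alignment of any stack- or frame-pointer-based memory access it encounters (the initial clamp to the preferred stack boundary happens once up front). Stripped of the RTL iterators, the core reduces to roughly the following; the mem_access records are a hypothetical stand-in, not a GCC data structure.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-in for one memory access found in an insn.  */
struct mem_access
{
  bool uses_sp_or_fp;        /* address mentions the stack/frame pointer */
  unsigned int align_bits;   /* known alignment of the access, in bits */
  bool needs_frame;          /* the insn containing it requires a frame */
};

/* Return whether a stack frame is required; if CHECK_SLOTS is true,
   raise *STACK_ALIGNMENT to the largest alignment actually used.  */
static bool
find_max_used_stack_alignment (const struct mem_access *accs, int n,
                               unsigned int *stack_alignment,
                               bool check_slots)
{
  bool require_frame = false;

  for (int i = 0; i < n; i++)
    if (accs[i].needs_frame)
      {
        require_frame = true;
        if (check_slots
            && accs[i].uses_sp_or_fp
            && accs[i].align_bits > *stack_alignment)
          *stack_alignment = accs[i].align_bits;
      }

  return require_frame;
}

int main (void)
{
  struct mem_access accs[] = {
    { true, 64, true }, { true, 256, true }, { false, 128, false },
  };
  unsigned int align = 64;
  bool frame = find_max_used_stack_alignment (accs, 3, &align, true);
  printf ("frame=%d max_align=%u bits\n", frame, align);
  return 0;
}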
*/ ++ && !(STACK_CHECK_MOVING_SP ++ && flag_stack_check ++ && flag_exceptions ++ && cfun->can_throw_non_call_exceptions) ++ && !ix86_frame_pointer_required () ++ && get_frame_size () == 0 ++ && ix86_nsaved_sseregs () == 0 ++ && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0) ++ { ++ if (ix86_find_max_used_stack_alignment (stack_alignment, ++ stack_realign)) ++ { ++ /* Stack frame is required. If stack alignment needed is less ++ than incoming stack boundary, don't realign stack. */ ++ stack_realign = incoming_stack_boundary < stack_alignment; ++ if (!stack_realign) ++ { ++ crtl->max_used_stack_slot_alignment ++ = incoming_stack_boundary; ++ crtl->stack_alignment_needed ++ = incoming_stack_boundary; ++ /* Also update preferred_stack_boundary for leaf ++ functions. */ ++ crtl->preferred_stack_boundary ++ = incoming_stack_boundary; ++ } ++ } ++ else ++ { ++ /* If drap has been set, but it actually isn't live at the ++ start of the function, there is no reason to set it up. */ ++ if (crtl->drap_reg) ++ { ++ basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb; ++ if (! REGNO_REG_SET_P (DF_LR_IN (bb), ++ REGNO (crtl->drap_reg))) ++ { ++ crtl->drap_reg = NULL_RTX; ++ crtl->need_drap = false; ++ } ++ } ++ else ++ cfun->machine->no_drap_save_restore = true; + +-bool +-ix86_epilogue_uses (int regno) +-{ +- /* If there are no caller-saved registers, we preserve all registers, +- except for MMX and x87 registers which aren't supported when saving +- and restoring registers. Don't explicitly save SP register since +- it is always preserved. */ +- return (epilogue_completed +- && cfun->machine->no_caller_saved_registers +- && !fixed_regs[regno] +- && !STACK_REGNO_P (regno) +- && !MMX_REGNO_P (regno)); +-} ++ frame_pointer_needed = false; ++ stack_realign = false; ++ crtl->max_used_stack_slot_alignment = incoming_stack_boundary; ++ crtl->stack_alignment_needed = incoming_stack_boundary; ++ crtl->stack_alignment_estimated = incoming_stack_boundary; ++ if (crtl->preferred_stack_boundary > incoming_stack_boundary) ++ crtl->preferred_stack_boundary = incoming_stack_boundary; ++ df_finish_pass (true); ++ df_scan_alloc (NULL); ++ df_scan_blocks (); ++ df_compute_regs_ever_live (true); ++ df_analyze (); + +-/* Return nonzero if register REGNO can be used as a scratch register +- in peephole2. */ ++ if (flag_var_tracking) ++ { ++ /* Since frame pointer is no longer available, replace it with ++ stack pointer - UNITS_PER_WORD in debug insns. */ ++ df_ref ref, next; ++ for (ref = DF_REG_USE_CHAIN (HARD_FRAME_POINTER_REGNUM); ++ ref; ref = next) ++ { ++ next = DF_REF_NEXT_REG (ref); ++ if (!DF_REF_INSN_INFO (ref)) ++ continue; + +-static bool +-ix86_hard_regno_scratch_ok (unsigned int regno) +-{ +- /* If there are no caller-saved registers, we can't use any register +- as a scratch register after epilogue and use REGNO as scratch +- register only if it has been used before to avoid saving and +- restoring it. */ +- return (!cfun->machine->no_caller_saved_registers +- || (!epilogue_completed +- && df_regs_ever_live_p (regno))); +-} ++ /* Make sure the next ref is for a different instruction, ++ so that we're not affected by the rescan. */ ++ rtx_insn *insn = DF_REF_INSN (ref); ++ while (next && DF_REF_INSN (next) == insn) ++ next = DF_REF_NEXT_REG (next); + +-/* Return TRUE if we need to save REGNO. 
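The long condition above gates the late attempt to drop a conservatively assumed frame pointer and stack realignment; only when every requirement holds is the body scanned and the dataflow information rebuilt as shown. Flattened to plain booleans, it looks roughly like the sketch below; this is a simplification in which the moving-SP stack-checking clause and the subsequent DRAP and df bookkeeping are omitted, and the field names are descriptive rather than GCC identifiers.

#include <stdbool.h>
#include <stdio.h>

/* Simplified inputs for the late frame-pointer-elimination gate.  */
struct frame_facts
{
  bool realign_assumed;         /* realignment or -fno-omit-frame-pointer */
  bool frame_pointer_needed;
  bool is_leaf;
  bool sp_is_unchanging;
  bool calls_tls_descriptor;
  bool accesses_prior_frames;
  bool calls_alloca;
  bool calls_eh_return;
  bool frame_pointer_required;  /* target says FP is mandatory */
  bool frame_size_is_zero;
  bool saves_sse_regs;
  bool has_varargs_save_area;
};

/* True when it is even worth scanning the body (with the helper above)
   to try to drop the frame pointer and stack realignment.  */
static bool
may_drop_frame_pointer (const struct frame_facts *f)
{
  return f->realign_assumed
         && f->frame_pointer_needed
         && f->is_leaf
         && f->sp_is_unchanging
         && !f->calls_tls_descriptor
         && !f->accesses_prior_frames
         && !f->calls_alloca
         && !f->calls_eh_return
         && !f->frame_pointer_required
         && f->frame_size_is_zero
         && !f->saves_sse_regs
         && !f->has_varargs_save_area;
}

int main (void)
{
  struct frame_facts f = { true, true, true, true,
                           false, false, false, false,
                           false, true, false, false };
  printf ("%s\n", may_drop_frame_pointer (&f) ? "drop" : "keep");
  return 0;
}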
*/ ++ if (DEBUG_INSN_P (insn)) ++ { ++ bool changed = false; ++ for (; ref != next; ref = DF_REF_NEXT_REG (ref)) ++ { ++ rtx *loc = DF_REF_LOC (ref); ++ if (*loc == hard_frame_pointer_rtx) ++ { ++ *loc = plus_constant (Pmode, ++ stack_pointer_rtx, ++ -UNITS_PER_WORD); ++ changed = true; ++ } ++ } ++ if (changed) ++ df_insn_rescan (insn); ++ } ++ } ++ } + +-static bool +-ix86_save_reg (unsigned int regno, bool maybe_eh_return, bool ignore_outlined) +-{ +- /* If there are no caller-saved registers, we preserve all registers, +- except for MMX and x87 registers which aren't supported when saving +- and restoring registers. Don't explicitly save SP register since +- it is always preserved. */ +- if (cfun->machine->no_caller_saved_registers) +- { +- /* Don't preserve registers used for function return value. */ +- rtx reg = crtl->return_rtx; +- if (reg) +- { +- unsigned int i = REGNO (reg); +- unsigned int nregs = REG_NREGS (reg); +- while (nregs-- > 0) +- if ((i + nregs) == regno) +- return false; ++ recompute_frame_layout_p = true; + } +- +- return (df_regs_ever_live_p (regno) +- && !fixed_regs[regno] +- && !STACK_REGNO_P (regno) +- && !MMX_REGNO_P (regno) +- && (regno != HARD_FRAME_POINTER_REGNUM +- || !frame_pointer_needed)); + } +- +- if (regno == REAL_PIC_OFFSET_TABLE_REGNUM +- && pic_offset_table_rtx) ++ else if (crtl->max_used_stack_slot_alignment >= 128) + { +- if (ix86_use_pseudo_pic_reg ()) +- { +- /* REAL_PIC_OFFSET_TABLE_REGNUM used by call to +- _mcount in prologue. */ +- if (!TARGET_64BIT && flag_pic && crtl->profile) +- return true; +- } +- else if (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM) +- || crtl->profile +- || crtl->calls_eh_return +- || crtl->uses_const_pool +- || cfun->has_nonlocal_label) +- return ix86_select_alt_pic_regnum () == INVALID_REGNUM; ++ /* We don't need to realign stack. max_used_stack_alignment is ++ used to decide how stack frame should be aligned. This is ++ independent of any psABIs nor 32-bit vs 64-bit. It is always ++ safe to compute max_used_stack_alignment. We compute it only ++ if 128-bit aligned load/store may be generated on misaligned ++ stack slot which will lead to segfault. */ ++ if (ix86_find_max_used_stack_alignment (stack_alignment, true)) ++ cfun->machine->max_used_stack_alignment ++ = stack_alignment / BITS_PER_UNIT; + } + +- if (crtl->calls_eh_return && maybe_eh_return) ++ if (crtl->stack_realign_needed != stack_realign) ++ recompute_frame_layout_p = true; ++ crtl->stack_realign_needed = stack_realign; ++ crtl->stack_realign_finalized = true; ++ if (recompute_frame_layout_p) ++ ix86_compute_frame_layout (); ++} ++ ++/* Delete SET_GOT right after entry block if it is allocated to reg. 
*/ ++ ++static void ++ix86_elim_entry_set_got (rtx reg) ++{ ++ basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb; ++ rtx_insn *c_insn = BB_HEAD (bb); ++ if (!NONDEBUG_INSN_P (c_insn)) ++ c_insn = next_nonnote_nondebug_insn (c_insn); ++ if (c_insn && NONJUMP_INSN_P (c_insn)) + { +- unsigned i; +- for (i = 0; ; i++) ++ rtx pat = PATTERN (c_insn); ++ if (GET_CODE (pat) == PARALLEL) + { +- unsigned test = EH_RETURN_DATA_REGNO (i); +- if (test == INVALID_REGNUM) +- break; +- if (test == regno) +- return true; ++ rtx vec = XVECEXP (pat, 0, 0); ++ if (GET_CODE (vec) == SET ++ && XINT (XEXP (vec, 1), 1) == UNSPEC_SET_GOT ++ && REGNO (XEXP (vec, 0)) == REGNO (reg)) ++ delete_insn (c_insn); + } + } +- +- if (ignore_outlined && cfun->machine->call_ms2sysv) +- { +- unsigned count = cfun->machine->call_ms2sysv_extra_regs +- + xlogue_layout::MIN_REGS; +- if (xlogue_layout::is_stub_managed_reg (regno, count)) +- return false; +- } +- +- if (crtl->drap_reg +- && regno == REGNO (crtl->drap_reg) +- && !cfun->machine->no_drap_save_restore) +- return true; +- +- return (df_regs_ever_live_p (regno) +- && !call_used_regs[regno] +- && !fixed_regs[regno] +- && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed)); + } + +-/* Return number of saved general prupose registers. */ +- +-static int +-ix86_nsaved_regs (void) ++static rtx ++gen_frame_set (rtx reg, rtx frame_reg, int offset, bool store) + { +- int nregs = 0; +- int regno; ++ rtx addr, mem; + +- for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) +- if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true)) +- nregs ++; +- return nregs; ++ if (offset) ++ addr = gen_rtx_PLUS (Pmode, frame_reg, GEN_INT (offset)); ++ mem = gen_frame_mem (GET_MODE (reg), offset ? addr : frame_reg); ++ return gen_rtx_SET (store ? mem : reg, store ? reg : mem); + } + +-/* Return number of saved SSE registers. */ +- +-static int +-ix86_nsaved_sseregs (void) ++static inline rtx ++gen_frame_load (rtx reg, rtx frame_reg, int offset) + { +- int nregs = 0; +- int regno; +- +- if (!TARGET_64BIT_MS_ABI) +- return 0; +- for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) +- if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true)) +- nregs ++; +- return nregs; ++ return gen_frame_set (reg, frame_reg, offset, false); + } + +-/* Given FROM and TO register numbers, say whether this elimination is +- allowed. If stack alignment is needed, we can only replace argument +- pointer with hard frame pointer, or replace frame pointer with stack +- pointer. Otherwise, frame pointer elimination is automatically +- handled and all other eliminations are valid. */ +- +-static bool +-ix86_can_eliminate (const int from, const int to) ++static inline rtx ++gen_frame_store (rtx reg, rtx frame_reg, int offset) + { +- if (stack_realign_fp) +- return ((from == ARG_POINTER_REGNUM +- && to == HARD_FRAME_POINTER_REGNUM) +- || (from == FRAME_POINTER_REGNUM +- && to == STACK_POINTER_REGNUM)); +- else +- return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true; ++ return gen_frame_set (reg, frame_reg, offset, true); + } + +-/* Return the offset between two registers, one to be eliminated, and the other +- its replacement, at the start of a routine. 
*/ +- +-HOST_WIDE_INT +-ix86_initial_elimination_offset (int from, int to) ++static void ++ix86_emit_outlined_ms2sysv_save (const struct ix86_frame &frame) + { +- struct ix86_frame &frame = cfun->machine->frame; ++ struct machine_function *m = cfun->machine; ++ const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS ++ + m->call_ms2sysv_extra_regs; ++ rtvec v = rtvec_alloc (ncregs + 1); ++ unsigned int align, i, vi = 0; ++ rtx_insn *insn; ++ rtx sym, addr; ++ rtx rax = gen_rtx_REG (word_mode, AX_REG); ++ const struct xlogue_layout &xlogue = xlogue_layout::get_instance (); + +- if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM) +- return frame.hard_frame_pointer_offset; +- else if (from == FRAME_POINTER_REGNUM +- && to == HARD_FRAME_POINTER_REGNUM) +- return frame.hard_frame_pointer_offset - frame.frame_pointer_offset; +- else +- { +- gcc_assert (to == STACK_POINTER_REGNUM); ++ /* AL should only be live with sysv_abi. */ ++ gcc_assert (!ix86_eax_live_at_start_p ()); ++ gcc_assert (m->fs.sp_offset >= frame.sse_reg_save_offset); + +- if (from == ARG_POINTER_REGNUM) +- return frame.stack_pointer_offset; ++ /* Setup RAX as the stub's base pointer. We use stack_realign_offset rather ++ we've actually realigned the stack or not. */ ++ align = GET_MODE_ALIGNMENT (V4SFmode); ++ addr = choose_baseaddr (frame.stack_realign_offset ++ + xlogue.get_stub_ptr_offset (), &align, AX_REG); ++ gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode)); + +- gcc_assert (from == FRAME_POINTER_REGNUM); +- return frame.stack_pointer_offset - frame.frame_pointer_offset; +- } +-} ++ emit_insn (gen_rtx_SET (rax, addr)); + +-/* In a dynamically-aligned function, we can't know the offset from +- stack pointer to frame pointer, so we must ensure that setjmp +- eliminates fp against the hard fp (%ebp) rather than trying to +- index from %esp up to the top of the frame across a gap that is +- of unknown (at compile-time) size. */ +-static rtx +-ix86_builtin_setjmp_frame_value (void) +-{ +- return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx; +-} ++ /* Get the stub symbol. */ ++ sym = xlogue.get_stub_rtx (frame_pointer_needed ? XLOGUE_STUB_SAVE_HFP ++ : XLOGUE_STUB_SAVE); ++ RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym); + +-/* Emits a warning for unsupported msabi to sysv pro/epilogues. */ +-static void warn_once_call_ms2sysv_xlogues (const char *feature) +-{ +- static bool warned_once = false; +- if (!warned_once) ++ for (i = 0; i < ncregs; ++i) + { +- warning (0, "%<-mcall-ms2sysv-xlogues%> is not compatible with %s", +- feature); +- warned_once = true; ++ const xlogue_layout::reginfo &r = xlogue.get_reginfo (i); ++ rtx reg = gen_rtx_REG ((SSE_REGNO_P (r.regno) ? V4SFmode : word_mode), ++ r.regno); ++ RTVEC_ELT (v, vi++) = gen_frame_store (reg, rax, -r.offset); + } +-} + +-/* Return the probing interval for -fstack-clash-protection. */ ++ gcc_assert (vi == (unsigned)GET_NUM_ELEM (v)); + +-static HOST_WIDE_INT +-get_probe_interval (void) +-{ +- if (flag_stack_clash_protection) +- return (HOST_WIDE_INT_1U +- << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL)); +- else +- return (HOST_WIDE_INT_1U << STACK_CHECK_PROBE_INTERVAL_EXP); ++ insn = emit_insn (gen_rtx_PARALLEL (VOIDmode, v)); ++ RTX_FRAME_RELATED_P (insn) = true; + } + +-/* When using -fsplit-stack, the allocation routines set a field in +- the TCB to the bottom of the stack plus this much space, measured +- in bytes. 
*/ +- +-#define SPLIT_STACK_AVAILABLE 256 +- +-/* Fill structure ix86_frame about frame of currently computed function. */ ++/* Expand the prologue into a bunch of separate insns. */ + +-static void +-ix86_compute_frame_layout (void) ++void ++ix86_expand_prologue (void) + { +- struct ix86_frame *frame = &cfun->machine->frame; + struct machine_function *m = cfun->machine; +- unsigned HOST_WIDE_INT stack_alignment_needed; +- HOST_WIDE_INT offset; +- unsigned HOST_WIDE_INT preferred_alignment; +- HOST_WIDE_INT size = get_frame_size (); +- HOST_WIDE_INT to_allocate; ++ rtx insn, t; ++ HOST_WIDE_INT allocate; ++ bool int_registers_saved; ++ bool sse_registers_saved; ++ bool save_stub_call_needed; ++ rtx static_chain = NULL_RTX; + +- /* m->call_ms2sysv is initially enabled in ix86_expand_call for all 64-bit +- * ms_abi functions that call a sysv function. We now need to prune away +- * cases where it should be disabled. */ +- if (TARGET_64BIT && m->call_ms2sysv) +- { +- gcc_assert (TARGET_64BIT_MS_ABI); +- gcc_assert (TARGET_CALL_MS2SYSV_XLOGUES); +- gcc_assert (!TARGET_SEH); +- gcc_assert (TARGET_SSE); +- gcc_assert (!ix86_using_red_zone ()); ++ if (ix86_function_naked (current_function_decl)) ++ return; + +- if (crtl->calls_eh_return) +- { +- gcc_assert (!reload_completed); +- m->call_ms2sysv = false; +- warn_once_call_ms2sysv_xlogues ("__builtin_eh_return"); +- } ++ ix86_finalize_stack_frame_flags (); + +- else if (ix86_static_chain_on_stack) +- { +- gcc_assert (!reload_completed); +- m->call_ms2sysv = false; +- warn_once_call_ms2sysv_xlogues ("static call chains"); +- } ++ /* DRAP should not coexist with stack_realign_fp */ ++ gcc_assert (!(crtl->drap_reg && stack_realign_fp)); + +- /* Finally, compute which registers the stub will manage. */ +- else +- { +- unsigned count = xlogue_layout::count_stub_managed_regs (); +- m->call_ms2sysv_extra_regs = count - xlogue_layout::MIN_REGS; +- m->call_ms2sysv_pad_in = 0; +- } +- } ++ memset (&m->fs, 0, sizeof (m->fs)); + +- frame->nregs = ix86_nsaved_regs (); +- frame->nsseregs = ix86_nsaved_sseregs (); ++ /* Initialize CFA state for before the prologue. */ ++ m->fs.cfa_reg = stack_pointer_rtx; ++ m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET; + +- /* 64-bit MS ABI seem to require stack alignment to be always 16, +- except for function prologues, leaf functions and when the defult +- incoming stack boundary is overriden at command line or via +- force_align_arg_pointer attribute. ++ /* Track SP offset to the CFA. We continue tracking this after we've ++ swapped the CFA register away from SP. In the case of re-alignment ++ this is fudged; we're interested to offsets within the local frame. */ ++ m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET; ++ m->fs.sp_valid = true; ++ m->fs.sp_realigned = false; + +- Darwin's ABI specifies 128b alignment for both 32 and 64 bit variants +- at call sites, including profile function calls. +- */ +- if (((TARGET_64BIT_MS_ABI || TARGET_MACHO) +- && crtl->preferred_stack_boundary < 128) +- && (!crtl->is_leaf || cfun->calls_alloca != 0 +- || ix86_current_function_calls_tls_descriptor +- || (TARGET_MACHO && crtl->profile) +- || ix86_incoming_stack_boundary < 128)) ++ const struct ix86_frame &frame = cfun->machine->frame; ++ ++ if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl)) + { +- crtl->preferred_stack_boundary = 128; +- crtl->stack_alignment_needed = 128; +- } ++ /* We should have already generated an error for any use of ++ ms_hook on a nested function. 
*/ ++ gcc_checking_assert (!ix86_static_chain_on_stack); + +- stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT; +- preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT; ++ /* Check if profiling is active and we shall use profiling before ++ prologue variant. If so sorry. */ ++ if (crtl->profile && flag_fentry != 0) ++ sorry ("% attribute is not compatible " ++ "with %<-mfentry%> for 32-bit"); + +- gcc_assert (!size || stack_alignment_needed); +- gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT); +- gcc_assert (preferred_alignment <= stack_alignment_needed); ++ /* In ix86_asm_output_function_label we emitted: ++ 8b ff movl.s %edi,%edi ++ 55 push %ebp ++ 8b ec movl.s %esp,%ebp + +- /* The only ABI saving SSE regs should be 64-bit ms_abi. */ +- gcc_assert (TARGET_64BIT || !frame->nsseregs); +- if (TARGET_64BIT && m->call_ms2sysv) +- { +- gcc_assert (stack_alignment_needed >= 16); +- gcc_assert (!frame->nsseregs); +- } ++ This matches the hookable function prologue in Win32 API ++ functions in Microsoft Windows XP Service Pack 2 and newer. ++ Wine uses this to enable Windows apps to hook the Win32 API ++ functions provided by Wine. + +- /* For SEH we have to limit the amount of code movement into the prologue. +- At present we do this via a BLOCKAGE, at which point there's very little +- scheduling that can be done, which means that there's very little point +- in doing anything except PUSHs. */ +- if (TARGET_SEH) +- m->use_fast_prologue_epilogue = false; +- else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR_FOR_FN (cfun))) +- { +- int count = frame->nregs; +- struct cgraph_node *node = cgraph_node::get (current_function_decl); ++ What that means is that we've already set up the frame pointer. */ + +- /* The fast prologue uses move instead of push to save registers. This +- is significantly longer, but also executes faster as modern hardware +- can execute the moves in parallel, but can't do that for push/pop. ++ if (frame_pointer_needed ++ && !(crtl->drap_reg && crtl->stack_realign_needed)) ++ { ++ rtx push, mov; + +- Be careful about choosing what prologue to emit: When function takes +- many instructions to execute we may use slow version as well as in +- case function is known to be outside hot spot (this is known with +- feedback only). Weight the size of function by number of registers +- to save as it is cheap to use one or two push instructions but very +- slow to use many of them. */ +- if (count) +- count = (count - 1) * FAST_PROLOGUE_INSN_COUNT; +- if (node->frequency < NODE_FREQUENCY_NORMAL +- || (flag_branch_probabilities +- && node->frequency < NODE_FREQUENCY_HOT)) +- m->use_fast_prologue_epilogue = false; +- else +- m->use_fast_prologue_epilogue +- = !expensive_function_p (count); +- } ++ /* We've decided to use the frame pointer already set up. ++ Describe this to the unwinder by pretending that both ++ push and mov insns happen right here. + +- frame->save_regs_using_mov +- = (TARGET_PROLOGUE_USING_MOVE && m->use_fast_prologue_epilogue +- /* If static stack checking is enabled and done with probes, +- the registers need to be saved before allocating the frame. */ +- && flag_stack_check != STATIC_BUILTIN_STACK_CHECK); ++ Putting the unwind info here at the end of the ms_hook ++ is done so that we can make absolutely certain we get ++ the required byte sequence at the start of the function, ++ rather than relying on an assembler that can produce ++ the exact encoding required. 
+ +- /* Skip return address and error code in exception handler. */ +- offset = INCOMING_FRAME_SP_OFFSET; ++ However it does mean (in the unpatched case) that we have ++ a 1 insn window where the asynchronous unwind info is ++ incorrect. However, if we placed the unwind info at ++ its correct location we would have incorrect unwind info ++ in the patched case. Which is probably all moot since ++ I don't expect Wine generates dwarf2 unwind info for the ++ system libraries that use this feature. */ + +- /* Skip pushed static chain. */ +- if (ix86_static_chain_on_stack) +- offset += UNITS_PER_WORD; ++ insn = emit_insn (gen_blockage ()); + +- /* Skip saved base pointer. */ +- if (frame_pointer_needed) +- offset += UNITS_PER_WORD; +- frame->hfp_save_offset = offset; ++ push = gen_push (hard_frame_pointer_rtx); ++ mov = gen_rtx_SET (hard_frame_pointer_rtx, ++ stack_pointer_rtx); ++ RTX_FRAME_RELATED_P (push) = 1; ++ RTX_FRAME_RELATED_P (mov) = 1; + +- /* The traditional frame pointer location is at the top of the frame. */ +- frame->hard_frame_pointer_offset = offset; ++ RTX_FRAME_RELATED_P (insn) = 1; ++ add_reg_note (insn, REG_FRAME_RELATED_EXPR, ++ gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov))); + +- /* Register save area */ +- offset += frame->nregs * UNITS_PER_WORD; +- frame->reg_save_offset = offset; ++ /* Note that gen_push incremented m->fs.cfa_offset, even ++ though we didn't emit the push insn here. */ ++ m->fs.cfa_reg = hard_frame_pointer_rtx; ++ m->fs.fp_offset = m->fs.cfa_offset; ++ m->fs.fp_valid = true; ++ } ++ else ++ { ++ /* The frame pointer is not needed so pop %ebp again. ++ This leaves us with a pristine state. */ ++ emit_insn (gen_pop (hard_frame_pointer_rtx)); ++ } ++ } + +- /* On SEH target, registers are pushed just before the frame pointer +- location. */ +- if (TARGET_SEH) +- frame->hard_frame_pointer_offset = offset; ++ /* The first insn of a function that accepts its static chain on the ++ stack is to push the register that would be filled in by a direct ++ call. This insn will be skipped by the trampoline. */ ++ else if (ix86_static_chain_on_stack) ++ { ++ static_chain = ix86_static_chain (cfun->decl, false); ++ insn = emit_insn (gen_push (static_chain)); ++ emit_insn (gen_blockage ()); + +- /* Calculate the size of the va-arg area (not including padding, if any). */ +- frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size; ++ /* We don't want to interpret this push insn as a register save, ++ only as a stack adjustment. The real copy of the register as ++ a save will be done later, if needed. */ ++ t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD); ++ t = gen_rtx_SET (stack_pointer_rtx, t); ++ add_reg_note (insn, REG_CFA_ADJUST_CFA, t); ++ RTX_FRAME_RELATED_P (insn) = 1; ++ } + +- /* Also adjust stack_realign_offset for the largest alignment of +- stack slot actually used. */ +- if (stack_realign_fp +- || (cfun->machine->max_used_stack_alignment != 0 +- && (offset % cfun->machine->max_used_stack_alignment) != 0)) ++ /* Emit prologue code to adjust stack alignment and setup DRAP, in case ++ of DRAP is needed and stack realignment is really needed after reload */ ++ if (stack_realign_drap) + { +- /* We may need a 16-byte aligned stack for the remainder of the +- register save area, but the stack frame for the local function +- may require a greater alignment if using AVX/2/512. 
In order +- to avoid wasting space, we first calculate the space needed for +- the rest of the register saves, add that to the stack pointer, +- and then realign the stack to the boundary of the start of the +- frame for the local function. */ +- HOST_WIDE_INT space_needed = 0; +- HOST_WIDE_INT sse_reg_space_needed = 0; ++ int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT; + +- if (TARGET_64BIT) +- { +- if (m->call_ms2sysv) +- { +- m->call_ms2sysv_pad_in = 0; +- space_needed = xlogue_layout::get_instance ().get_stack_space_used (); +- } ++ /* Can't use DRAP in interrupt function. */ ++ if (cfun->machine->func_type != TYPE_NORMAL) ++ sorry ("Dynamic Realign Argument Pointer (DRAP) not supported " ++ "in interrupt service routine. This may be worked " ++ "around by avoiding functions with aggregate return."); + +- else if (frame->nsseregs) +- /* The only ABI that has saved SSE registers (Win64) also has a +- 16-byte aligned default stack. However, many programs violate +- the ABI, and Wine64 forces stack realignment to compensate. */ +- space_needed = frame->nsseregs * 16; ++ /* Only need to push parameter pointer reg if it is caller saved. */ ++ if (!call_used_or_fixed_reg_p (REGNO (crtl->drap_reg))) ++ { ++ /* Push arg pointer reg */ ++ insn = emit_insn (gen_push (crtl->drap_reg)); ++ RTX_FRAME_RELATED_P (insn) = 1; ++ } + +- sse_reg_space_needed = space_needed = ROUND_UP (space_needed, 16); ++ /* Grab the argument pointer. */ ++ t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset); ++ insn = emit_insn (gen_rtx_SET (crtl->drap_reg, t)); ++ RTX_FRAME_RELATED_P (insn) = 1; ++ m->fs.cfa_reg = crtl->drap_reg; ++ m->fs.cfa_offset = 0; + +- /* 64-bit frame->va_arg_size should always be a multiple of 16, but +- rounding to be pedantic. */ +- space_needed = ROUND_UP (space_needed + frame->va_arg_size, 16); +- } +- else +- space_needed = frame->va_arg_size; ++ /* Align the stack. */ ++ insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx, ++ stack_pointer_rtx, ++ GEN_INT (-align_bytes))); ++ RTX_FRAME_RELATED_P (insn) = 1; + +- /* Record the allocation size required prior to the realignment AND. */ +- frame->stack_realign_allocate = space_needed; ++ /* Replicate the return address on the stack so that return ++ address can be reached via (argp - 1) slot. This is needed ++ to implement macro RETURN_ADDR_RTX and intrinsic function ++ expand_builtin_return_addr etc. */ ++ t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD); ++ t = gen_frame_mem (word_mode, t); ++ insn = emit_insn (gen_push (t)); ++ RTX_FRAME_RELATED_P (insn) = 1; + +- /* The re-aligned stack starts at frame->stack_realign_offset. Values +- before this point are not directly comparable with values below +- this point. Use sp_valid_at to determine if the stack pointer is +- valid for a given offset, fp_valid_at for the frame pointer, or +- choose_baseaddr to have a base register chosen for you. ++ /* For the purposes of frame and register save area addressing, ++ we've started over with a new frame. */ ++ m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET; ++ m->fs.realigned = true; + +- Note that the result of (frame->stack_realign_offset +- & (stack_alignment_needed - 1)) may not equal zero. 
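Both the DRAP path here and the stack_realign_fp path further down align the stack pointer by AND-ing it with the negated alignment, which rounds the address down to the requested power-of-two boundary. The same trick in ordinary C, purely for illustration (align must be a power of two):

#include <stdint.h>
#include <stdio.h>

/* Round ADDR down to a multiple of ALIGN (a power of two), the same
   effect as the "and sp, -align_bytes" emitted in the prologue.  */
static uintptr_t
align_down (uintptr_t addr, uintptr_t align)
{
  return addr & ~(align - 1);     /* equivalently: addr & -align */
}

int main (void)
{
  uintptr_t sp = 0x7ffde39c;
  printf ("%#lx -> %#lx\n", (unsigned long) sp,
          (unsigned long) align_down (sp, 32));
  return 0;
}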
*/ +- offset = ROUND_UP (offset + space_needed, stack_alignment_needed); +- frame->stack_realign_offset = offset - space_needed; +- frame->sse_reg_save_offset = frame->stack_realign_offset +- + sse_reg_space_needed; ++ if (static_chain) ++ { ++ /* Replicate static chain on the stack so that static chain ++ can be reached via (argp - 2) slot. This is needed for ++ nested function with stack realignment. */ ++ insn = emit_insn (gen_push (static_chain)); ++ RTX_FRAME_RELATED_P (insn) = 1; ++ } + } +- else ++ ++ int_registers_saved = (frame.nregs == 0); ++ sse_registers_saved = (frame.nsseregs == 0); ++ save_stub_call_needed = (m->call_ms2sysv); ++ gcc_assert (sse_registers_saved || !save_stub_call_needed); ++ ++ if (frame_pointer_needed && !m->fs.fp_valid) + { +- frame->stack_realign_offset = offset; ++ /* Note: AT&T enter does NOT have reversed args. Enter is probably ++ slower on all targets. Also sdb didn't like it. */ ++ insn = emit_insn (gen_push (hard_frame_pointer_rtx)); ++ RTX_FRAME_RELATED_P (insn) = 1; + +- if (TARGET_64BIT && m->call_ms2sysv) ++ /* Push registers now, before setting the frame pointer ++ on SEH target. */ ++ if (!int_registers_saved ++ && TARGET_SEH ++ && !frame.save_regs_using_mov) + { +- m->call_ms2sysv_pad_in = !!(offset & UNITS_PER_WORD); +- offset += xlogue_layout::get_instance ().get_stack_space_used (); ++ ix86_emit_save_regs (); ++ int_registers_saved = true; ++ gcc_assert (m->fs.sp_offset == frame.reg_save_offset); + } + +- /* Align and set SSE register save area. */ +- else if (frame->nsseregs) ++ if (m->fs.sp_offset == frame.hard_frame_pointer_offset) + { +- /* If the incoming stack boundary is at least 16 bytes, or DRAP is +- required and the DRAP re-alignment boundary is at least 16 bytes, +- then we want the SSE register save area properly aligned. */ +- if (ix86_incoming_stack_boundary >= 128 +- || (stack_realign_drap && stack_alignment_needed >= 16)) +- offset = ROUND_UP (offset, 16); +- offset += frame->nsseregs * 16; ++ insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx); ++ RTX_FRAME_RELATED_P (insn) = 1; ++ ++ if (m->fs.cfa_reg == stack_pointer_rtx) ++ m->fs.cfa_reg = hard_frame_pointer_rtx; ++ m->fs.fp_offset = m->fs.sp_offset; ++ m->fs.fp_valid = true; + } +- frame->sse_reg_save_offset = offset; +- offset += frame->va_arg_size; + } + +- /* Align start of frame for local function. When a function call +- is removed, it may become a leaf function. But if argument may +- be passed on stack, we need to align the stack when there is no +- tail call. */ +- if (m->call_ms2sysv +- || frame->va_arg_size != 0 +- || size != 0 +- || !crtl->is_leaf +- || (!crtl->tail_call_emit +- && cfun->machine->outgoing_args_on_stack) +- || cfun->calls_alloca +- || ix86_current_function_calls_tls_descriptor) +- offset = ROUND_UP (offset, stack_alignment_needed); +- +- /* Frame pointer points here. */ +- frame->frame_pointer_offset = offset; +- +- offset += size; +- +- /* Add outgoing arguments area. Can be skipped if we eliminated +- all the function calls as dead code. +- Skipping is however impossible when function calls alloca. Alloca +- expander assumes that last crtl->outgoing_args_size +- of stack frame are unused. 
*/ +- if (ACCUMULATE_OUTGOING_ARGS +- && (!crtl->is_leaf || cfun->calls_alloca +- || ix86_current_function_calls_tls_descriptor)) ++ if (!int_registers_saved) + { +- offset += crtl->outgoing_args_size; +- frame->outgoing_arguments_size = crtl->outgoing_args_size; +- } +- else +- frame->outgoing_arguments_size = 0; ++ /* If saving registers via PUSH, do so now. */ ++ if (!frame.save_regs_using_mov) ++ { ++ ix86_emit_save_regs (); ++ int_registers_saved = true; ++ gcc_assert (m->fs.sp_offset == frame.reg_save_offset); ++ } + +- /* Align stack boundary. Only needed if we're calling another function +- or using alloca. */ +- if (!crtl->is_leaf || cfun->calls_alloca +- || ix86_current_function_calls_tls_descriptor) +- offset = ROUND_UP (offset, preferred_alignment); ++ /* When using red zone we may start register saving before allocating ++ the stack frame saving one cycle of the prologue. However, avoid ++ doing this if we have to probe the stack; at least on x86_64 the ++ stack probe can turn into a call that clobbers a red zone location. */ ++ else if (ix86_using_red_zone () ++ && (! TARGET_STACK_PROBE ++ || frame.stack_pointer_offset < CHECK_STACK_LIMIT)) ++ { ++ ix86_emit_save_regs_using_mov (frame.reg_save_offset); ++ int_registers_saved = true; ++ } ++ } + +- /* We've reached end of stack frame. */ +- frame->stack_pointer_offset = offset; ++ if (stack_realign_fp) ++ { ++ int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT; ++ gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT); + +- /* Size prologue needs to allocate. */ +- to_allocate = offset - frame->sse_reg_save_offset; ++ /* Record last valid frame pointer offset. */ ++ m->fs.sp_realigned_fp_last = frame.reg_save_offset; + +- if ((!to_allocate && frame->nregs <= 1) +- || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000)) +- /* If stack clash probing needs a loop, then it needs a +- scratch register. But the returned register is only guaranteed +- to be safe to use after register saves are complete. So if +- stack clash protections are enabled and the allocated frame is +- larger than the probe interval, then use pushes to save +- callee saved registers. */ +- || (flag_stack_clash_protection && to_allocate > get_probe_interval ())) +- frame->save_regs_using_mov = false; ++ /* The computation of the size of the re-aligned stack frame means ++ that we must allocate the size of the register save area before ++ performing the actual alignment. Otherwise we cannot guarantee ++ that there's enough storage above the realignment point. */ ++ allocate = frame.reg_save_offset - m->fs.sp_offset ++ + frame.stack_realign_allocate; ++ if (allocate) ++ pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, ++ GEN_INT (-allocate), -1, false); + +- if (ix86_using_red_zone () +- && crtl->sp_is_unchanging +- && crtl->is_leaf +- && !ix86_pc_thunk_call_expanded +- && !ix86_current_function_calls_tls_descriptor) +- { +- frame->red_zone_size = to_allocate; +- if (frame->save_regs_using_mov) +- frame->red_zone_size += frame->nregs * UNITS_PER_WORD; +- if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE) +- frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE; +- } +- else +- frame->red_zone_size = 0; +- frame->stack_pointer_offset -= frame->red_zone_size; ++ /* Align the stack. 
*/ ++ insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx, ++ stack_pointer_rtx, ++ GEN_INT (-align_bytes))); ++ m->fs.sp_offset = ROUND_UP (m->fs.sp_offset, align_bytes); ++ m->fs.sp_realigned_offset = m->fs.sp_offset ++ - frame.stack_realign_allocate; ++ /* The stack pointer may no longer be equal to CFA - m->fs.sp_offset. ++ Beyond this point, stack access should be done via choose_baseaddr or ++ by using sp_valid_at and fp_valid_at to determine the correct base ++ register. Henceforth, any CFA offset should be thought of as logical ++ and not physical. */ ++ gcc_assert (m->fs.sp_realigned_offset >= m->fs.sp_realigned_fp_last); ++ gcc_assert (m->fs.sp_realigned_offset == frame.stack_realign_offset); ++ m->fs.sp_realigned = true; + +- /* The SEH frame pointer location is near the bottom of the frame. +- This is enforced by the fact that the difference between the +- stack pointer and the frame pointer is limited to 240 bytes in +- the unwind data structure. */ +- if (TARGET_SEH) +- { +- HOST_WIDE_INT diff; ++ /* SEH unwind emit doesn't currently support REG_CFA_EXPRESSION, which ++ is needed to describe where a register is saved using a realigned ++ stack pointer, so we need to invalidate the stack pointer for that ++ target. */ ++ if (TARGET_SEH) ++ m->fs.sp_valid = false; + +- /* If we can leave the frame pointer where it is, do so. Also, returns +- the establisher frame for __builtin_frame_address (0). */ +- diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset; +- if (diff <= SEH_MAX_FRAME_SIZE +- && (diff > 240 || (diff & 15) != 0) +- && !crtl->accesses_prior_frames) ++ /* If SP offset is non-immediate after allocation of the stack frame, ++ then emit SSE saves or stub call prior to allocating the rest of the ++ stack frame. This is less efficient for the out-of-line stub because ++ we can't combine allocations across the call barrier, but it's better ++ than using a scratch register. */ ++ else if (!x86_64_immediate_operand (GEN_INT (frame.stack_pointer_offset ++ - m->fs.sp_realigned_offset), ++ Pmode)) + { +- /* Ideally we'd determine what portion of the local stack frame +- (within the constraint of the lowest 240) is most heavily used. +- But without that complication, simply bias the frame pointer +- by 128 bytes so as to maximize the amount of the local stack +- frame that is addressable with 8-bit offsets. */ +- frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128; ++ if (!sse_registers_saved) ++ { ++ ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset); ++ sse_registers_saved = true; ++ } ++ else if (save_stub_call_needed) ++ { ++ ix86_emit_outlined_ms2sysv_save (frame); ++ save_stub_call_needed = false; ++ } + } + } +-} +- +-/* This is semi-inlined memory_address_length, but simplified +- since we know that we're always dealing with reg+offset, and +- to avoid having to create and discard all that rtl. */ + +-static inline int +-choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset) +-{ +- int len = 4; ++ allocate = frame.stack_pointer_offset - m->fs.sp_offset; + +- if (offset == 0) ++ if (flag_stack_usage_info) + { +- /* EBP and R13 cannot be encoded without an offset. */ +- len = (regno == BP_REG || regno == R13_REG); +- } +- else if (IN_RANGE (offset, -128, 127)) +- len = 1; ++ /* We start to count from ARG_POINTER. */ ++ HOST_WIDE_INT stack_size = frame.stack_pointer_offset; + +- /* ESP and R12 must be encoded with a SIB byte. 
*/ +- if (regno == SP_REG || regno == R12_REG) +- len++; ++ /* If it was realigned, take into account the fake frame. */ ++ if (stack_realign_drap) ++ { ++ if (ix86_static_chain_on_stack) ++ stack_size += UNITS_PER_WORD; + +- return len; +-} ++ if (!call_used_or_fixed_reg_p (REGNO (crtl->drap_reg))) ++ stack_size += UNITS_PER_WORD; + +-/* Determine if the stack pointer is valid for accessing the CFA_OFFSET in +- the frame save area. The register is saved at CFA - CFA_OFFSET. */ ++ /* This over-estimates by 1 minimal-stack-alignment-unit but ++ mitigates that by counting in the new return address slot. */ ++ current_function_dynamic_stack_size ++ += crtl->stack_alignment_needed / BITS_PER_UNIT; ++ } + +-static bool +-sp_valid_at (HOST_WIDE_INT cfa_offset) +-{ +- const struct machine_frame_state &fs = cfun->machine->fs; +- if (fs.sp_realigned && cfa_offset <= fs.sp_realigned_offset) +- { +- /* Validate that the cfa_offset isn't in a "no-man's land". */ +- gcc_assert (cfa_offset <= fs.sp_realigned_fp_last); +- return false; ++ current_function_static_stack_size = stack_size; + } +- return fs.sp_valid; +-} + +-/* Determine if the frame pointer is valid for accessing the CFA_OFFSET in +- the frame save area. The register is saved at CFA - CFA_OFFSET. */ +- +-static inline bool +-fp_valid_at (HOST_WIDE_INT cfa_offset) +-{ +- const struct machine_frame_state &fs = cfun->machine->fs; +- if (fs.sp_realigned && cfa_offset > fs.sp_realigned_fp_last) ++ /* On SEH target with very large frame size, allocate an area to save ++ SSE registers (as the very large allocation won't be described). */ ++ if (TARGET_SEH ++ && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE ++ && !sse_registers_saved) + { +- /* Validate that the cfa_offset isn't in a "no-man's land". */ +- gcc_assert (cfa_offset >= fs.sp_realigned_offset); +- return false; +- } +- return fs.fp_valid; +-} +- +-/* Choose a base register based upon alignment requested, speed and/or +- size. */ +- +-static void +-choose_basereg (HOST_WIDE_INT cfa_offset, rtx &base_reg, +- HOST_WIDE_INT &base_offset, +- unsigned int align_reqested, unsigned int *align) +-{ +- const struct machine_function *m = cfun->machine; +- unsigned int hfp_align; +- unsigned int drap_align; +- unsigned int sp_align; +- bool hfp_ok = fp_valid_at (cfa_offset); +- bool drap_ok = m->fs.drap_valid; +- bool sp_ok = sp_valid_at (cfa_offset); +- +- hfp_align = drap_align = sp_align = INCOMING_STACK_BOUNDARY; ++ HOST_WIDE_INT sse_size ++ = frame.sse_reg_save_offset - frame.reg_save_offset; + +- /* Filter out any registers that don't meet the requested alignment +- criteria. */ +- if (align_reqested) +- { +- if (m->fs.realigned) +- hfp_align = drap_align = sp_align = crtl->stack_alignment_needed; +- /* SEH unwind code does do not currently support REG_CFA_EXPRESSION +- notes (which we would need to use a realigned stack pointer), +- so disable on SEH targets. */ +- else if (m->fs.sp_realigned) +- sp_align = crtl->stack_alignment_needed; ++ gcc_assert (int_registers_saved); + +- hfp_ok = hfp_ok && hfp_align >= align_reqested; +- drap_ok = drap_ok && drap_align >= align_reqested; +- sp_ok = sp_ok && sp_align >= align_reqested; ++ /* No need to do stack checking as the area will be immediately ++ written. 
*/ ++ pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, ++ GEN_INT (-sse_size), -1, ++ m->fs.cfa_reg == stack_pointer_rtx); ++ allocate -= sse_size; ++ ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset); ++ sse_registers_saved = true; + } + +- if (m->use_fast_prologue_epilogue) ++ /* The stack has already been decremented by the instruction calling us ++ so probe if the size is non-negative to preserve the protection area. */ ++ if (allocate >= 0 ++ && (flag_stack_check == STATIC_BUILTIN_STACK_CHECK ++ || flag_stack_clash_protection)) + { +- /* Choose the base register most likely to allow the most scheduling +- opportunities. Generally FP is valid throughout the function, +- while DRAP must be reloaded within the epilogue. But choose either +- over the SP due to increased encoding size. */ +- +- if (hfp_ok) ++ if (flag_stack_clash_protection) + { +- base_reg = hard_frame_pointer_rtx; +- base_offset = m->fs.fp_offset - cfa_offset; ++ ix86_adjust_stack_and_probe_stack_clash (allocate, ++ int_registers_saved); ++ allocate = 0; + } +- else if (drap_ok) ++ else if (STACK_CHECK_MOVING_SP) + { +- base_reg = crtl->drap_reg; +- base_offset = 0 - cfa_offset; ++ if (!(crtl->is_leaf && !cfun->calls_alloca ++ && allocate <= get_probe_interval ())) ++ { ++ ix86_adjust_stack_and_probe (allocate, int_registers_saved); ++ allocate = 0; ++ } + } +- else if (sp_ok) ++ else + { +- base_reg = stack_pointer_rtx; +- base_offset = m->fs.sp_offset - cfa_offset; ++ HOST_WIDE_INT size = allocate; ++ ++ if (TARGET_64BIT && size >= HOST_WIDE_INT_C (0x80000000)) ++ size = 0x80000000 - get_stack_check_protect () - 1; ++ ++ if (TARGET_STACK_PROBE) ++ { ++ if (crtl->is_leaf && !cfun->calls_alloca) ++ { ++ if (size > get_probe_interval ()) ++ ix86_emit_probe_stack_range (0, size, int_registers_saved); ++ } ++ else ++ ix86_emit_probe_stack_range (0, ++ size + get_stack_check_protect (), ++ int_registers_saved); ++ } ++ else ++ { ++ if (crtl->is_leaf && !cfun->calls_alloca) ++ { ++ if (size > get_probe_interval () ++ && size > get_stack_check_protect ()) ++ ix86_emit_probe_stack_range (get_stack_check_protect (), ++ (size ++ - get_stack_check_protect ()), ++ int_registers_saved); ++ } ++ else ++ ix86_emit_probe_stack_range (get_stack_check_protect (), size, ++ int_registers_saved); ++ } + } + } ++ ++ if (allocate == 0) ++ ; ++ else if (!ix86_target_stack_probe () ++ || frame.stack_pointer_offset < CHECK_STACK_LIMIT) ++ { ++ pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, ++ GEN_INT (-allocate), -1, ++ m->fs.cfa_reg == stack_pointer_rtx); ++ } + else + { +- HOST_WIDE_INT toffset; +- int len = 16, tlen; ++ rtx eax = gen_rtx_REG (Pmode, AX_REG); ++ rtx r10 = NULL; ++ rtx (*adjust_stack_insn)(rtx, rtx, rtx); ++ const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx); ++ bool eax_live = ix86_eax_live_at_start_p (); ++ bool r10_live = false; + +- /* Choose the base register with the smallest address encoding. +- With a tie, choose FP > DRAP > SP. 
*/ +- if (sp_ok) +- { +- base_reg = stack_pointer_rtx; +- base_offset = m->fs.sp_offset - cfa_offset; +- len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset); +- } +- if (drap_ok) ++ if (TARGET_64BIT) ++ r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0); ++ ++ if (eax_live) + { +- toffset = 0 - cfa_offset; +- tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset); +- if (tlen <= len) ++ insn = emit_insn (gen_push (eax)); ++ allocate -= UNITS_PER_WORD; ++ /* Note that SEH directives need to continue tracking the stack ++ pointer even after the frame pointer has been set up. */ ++ if (sp_is_cfa_reg || TARGET_SEH) + { +- base_reg = crtl->drap_reg; +- base_offset = toffset; +- len = tlen; ++ if (sp_is_cfa_reg) ++ m->fs.cfa_offset += UNITS_PER_WORD; ++ RTX_FRAME_RELATED_P (insn) = 1; ++ add_reg_note (insn, REG_FRAME_RELATED_EXPR, ++ gen_rtx_SET (stack_pointer_rtx, ++ plus_constant (Pmode, stack_pointer_rtx, ++ -UNITS_PER_WORD))); + } + } +- if (hfp_ok) ++ ++ if (r10_live) + { +- toffset = m->fs.fp_offset - cfa_offset; +- tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset); +- if (tlen <= len) ++ r10 = gen_rtx_REG (Pmode, R10_REG); ++ insn = emit_insn (gen_push (r10)); ++ allocate -= UNITS_PER_WORD; ++ if (sp_is_cfa_reg || TARGET_SEH) + { +- base_reg = hard_frame_pointer_rtx; +- base_offset = toffset; ++ if (sp_is_cfa_reg) ++ m->fs.cfa_offset += UNITS_PER_WORD; ++ RTX_FRAME_RELATED_P (insn) = 1; ++ add_reg_note (insn, REG_FRAME_RELATED_EXPR, ++ gen_rtx_SET (stack_pointer_rtx, ++ plus_constant (Pmode, stack_pointer_rtx, ++ -UNITS_PER_WORD))); + } + } +- } + +- /* Set the align return value. */ +- if (align) +- { +- if (base_reg == stack_pointer_rtx) +- *align = sp_align; +- else if (base_reg == crtl->drap_reg) +- *align = drap_align; +- else if (base_reg == hard_frame_pointer_rtx) +- *align = hfp_align; +- } +-} ++ emit_move_insn (eax, GEN_INT (allocate)); ++ emit_insn (ix86_gen_allocate_stack_worker (eax, eax)); + +-/* Return an RTX that points to CFA_OFFSET within the stack frame and +- the alignment of address. If ALIGN is non-null, it should point to +- an alignment value (in bits) that is preferred or zero and will +- recieve the alignment of the base register that was selected, +- irrespective of rather or not CFA_OFFSET is a multiple of that +- alignment value. If it is possible for the base register offset to be +- non-immediate then SCRATCH_REGNO should specify a scratch register to +- use. ++ /* Use the fact that AX still contains ALLOCATE. */ ++ adjust_stack_insn = (Pmode == DImode ++ ? gen_pro_epilogue_adjust_stack_di_sub ++ : gen_pro_epilogue_adjust_stack_si_sub); + +- The valid base registers are taken from CFUN->MACHINE->FS. */ ++ insn = emit_insn (adjust_stack_insn (stack_pointer_rtx, ++ stack_pointer_rtx, eax)); + +-static rtx +-choose_baseaddr (HOST_WIDE_INT cfa_offset, unsigned int *align, +- unsigned int scratch_regno = INVALID_REGNUM) +-{ +- rtx base_reg = NULL; +- HOST_WIDE_INT base_offset = 0; ++ if (sp_is_cfa_reg || TARGET_SEH) ++ { ++ if (sp_is_cfa_reg) ++ m->fs.cfa_offset += allocate; ++ RTX_FRAME_RELATED_P (insn) = 1; ++ add_reg_note (insn, REG_FRAME_RELATED_EXPR, ++ gen_rtx_SET (stack_pointer_rtx, ++ plus_constant (Pmode, stack_pointer_rtx, ++ -allocate))); ++ } ++ m->fs.sp_offset += allocate; + +- /* If a specific alignment is requested, try to get a base register +- with that alignment first. 
*/ +- if (align && *align) +- choose_basereg (cfa_offset, base_reg, base_offset, *align, align); ++ /* Use stack_pointer_rtx for relative addressing so that code works for ++ realigned stack. But this means that we need a blockage to prevent ++ stores based on the frame pointer from being scheduled before. */ ++ if (r10_live && eax_live) ++ { ++ t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax); ++ emit_move_insn (gen_rtx_REG (word_mode, R10_REG), ++ gen_frame_mem (word_mode, t)); ++ t = plus_constant (Pmode, t, UNITS_PER_WORD); ++ emit_move_insn (gen_rtx_REG (word_mode, AX_REG), ++ gen_frame_mem (word_mode, t)); ++ emit_insn (gen_memory_blockage ()); ++ } ++ else if (eax_live || r10_live) ++ { ++ t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax); ++ emit_move_insn (gen_rtx_REG (word_mode, ++ (eax_live ? AX_REG : R10_REG)), ++ gen_frame_mem (word_mode, t)); ++ emit_insn (gen_memory_blockage ()); ++ } ++ } ++ gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset); + +- if (!base_reg) +- choose_basereg (cfa_offset, base_reg, base_offset, 0, align); ++ /* If we havn't already set up the frame pointer, do so now. */ ++ if (frame_pointer_needed && !m->fs.fp_valid) ++ { ++ insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx, ++ GEN_INT (frame.stack_pointer_offset ++ - frame.hard_frame_pointer_offset)); ++ insn = emit_insn (insn); ++ RTX_FRAME_RELATED_P (insn) = 1; ++ add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL); + +- gcc_assert (base_reg != NULL); ++ if (m->fs.cfa_reg == stack_pointer_rtx) ++ m->fs.cfa_reg = hard_frame_pointer_rtx; ++ m->fs.fp_offset = frame.hard_frame_pointer_offset; ++ m->fs.fp_valid = true; ++ } + +- rtx base_offset_rtx = GEN_INT (base_offset); ++ if (!int_registers_saved) ++ ix86_emit_save_regs_using_mov (frame.reg_save_offset); ++ if (!sse_registers_saved) ++ ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset); ++ else if (save_stub_call_needed) ++ ix86_emit_outlined_ms2sysv_save (frame); + +- if (!x86_64_immediate_operand (base_offset_rtx, Pmode)) ++ /* For the mcount profiling on 32 bit PIC mode we need to emit SET_GOT ++ in PROLOGUE. */ ++ if (!TARGET_64BIT && pic_offset_table_rtx && crtl->profile && !flag_fentry) + { +- gcc_assert (scratch_regno != INVALID_REGNUM); +- +- rtx scratch_reg = gen_rtx_REG (Pmode, scratch_regno); +- emit_move_insn (scratch_reg, base_offset_rtx); +- +- return gen_rtx_PLUS (Pmode, base_reg, scratch_reg); ++ rtx pic = gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM); ++ insn = emit_insn (gen_set_got (pic)); ++ RTX_FRAME_RELATED_P (insn) = 1; ++ add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX); ++ emit_insn (gen_prologue_use (pic)); ++ /* Deleting already emmitted SET_GOT if exist and allocated to ++ REAL_PIC_OFFSET_TABLE_REGNUM. */ ++ ix86_elim_entry_set_got (pic); + } + +- return plus_constant (Pmode, base_reg, base_offset); +-} +- +-/* Emit code to save registers in the prologue. */ ++ if (crtl->drap_reg && !crtl->stack_realign_needed) ++ { ++ /* vDRAP is setup but after reload it turns out stack realign ++ isn't necessary, here we will emit prologue to setup DRAP ++ without stack realign adjustment */ ++ t = choose_baseaddr (0, NULL); ++ emit_insn (gen_rtx_SET (crtl->drap_reg, t)); ++ } + +-static void +-ix86_emit_save_regs (void) +-{ +- unsigned int regno; +- rtx_insn *insn; ++ /* Prevent instructions from being scheduled into register save push ++ sequence when access to the redzone area is done through frame pointer. 
++ The offset between the frame pointer and the stack pointer is calculated ++ relative to the value of the stack pointer at the end of the function ++ prologue, and moving instructions that access redzone area via frame ++ pointer inside push sequence violates this assumption. */ ++ if (frame_pointer_needed && frame.red_zone_size) ++ emit_insn (gen_memory_blockage ()); + +- for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; ) +- if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true)) +- { +- insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno))); +- RTX_FRAME_RELATED_P (insn) = 1; +- } ++ /* SEH requires that the prologue end within 256 bytes of the start of ++ the function. Prevent instruction schedules that would extend that. ++ Further, prevent alloca modifications to the stack pointer from being ++ combined with prologue modifications. */ ++ if (TARGET_SEH) ++ emit_insn (gen_prologue_use (stack_pointer_rtx)); + } + +-/* Emit a single register save at CFA - CFA_OFFSET. */ ++/* Emit code to restore REG using a POP insn. */ + + static void +-ix86_emit_save_reg_using_mov (machine_mode mode, unsigned int regno, +- HOST_WIDE_INT cfa_offset) ++ix86_emit_restore_reg_using_pop (rtx reg) + { + struct machine_function *m = cfun->machine; +- rtx reg = gen_rtx_REG (mode, regno); +- rtx mem, addr, base, insn; +- unsigned int align = GET_MODE_ALIGNMENT (mode); +- +- addr = choose_baseaddr (cfa_offset, &align); +- mem = gen_frame_mem (mode, addr); ++ rtx_insn *insn = emit_insn (gen_pop (reg)); + +- /* The location aligment depends upon the base register. */ +- align = MIN (GET_MODE_ALIGNMENT (mode), align); +- gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1))); +- set_mem_align (mem, align); ++ ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset); ++ m->fs.sp_offset -= UNITS_PER_WORD; + +- insn = emit_insn (gen_rtx_SET (mem, reg)); +- RTX_FRAME_RELATED_P (insn) = 1; ++ if (m->fs.cfa_reg == crtl->drap_reg ++ && REGNO (reg) == REGNO (crtl->drap_reg)) ++ { ++ /* Previously we'd represented the CFA as an expression ++ like *(%ebp - 8). We've just popped that value from ++ the stack, which means we need to reset the CFA to ++ the drap register. This will remain until we restore ++ the stack pointer. */ ++ add_reg_note (insn, REG_CFA_DEF_CFA, reg); ++ RTX_FRAME_RELATED_P (insn) = 1; + +- base = addr; +- if (GET_CODE (base) == PLUS) +- base = XEXP (base, 0); +- gcc_checking_assert (REG_P (base)); ++ /* This means that the DRAP register is valid for addressing too. */ ++ m->fs.drap_valid = true; ++ return; ++ } + +- /* When saving registers into a re-aligned local stack frame, avoid +- any tricky guessing by dwarf2out. */ +- if (m->fs.realigned) ++ if (m->fs.cfa_reg == stack_pointer_rtx) + { +- gcc_checking_assert (stack_realign_drap); ++ rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD); ++ x = gen_rtx_SET (stack_pointer_rtx, x); ++ add_reg_note (insn, REG_CFA_ADJUST_CFA, x); ++ RTX_FRAME_RELATED_P (insn) = 1; + +- if (regno == REGNO (crtl->drap_reg)) +- { +- /* A bit of a hack. We force the DRAP register to be saved in +- the re-aligned stack frame, which provides us with a copy +- of the CFA that will last past the prologue. Install it. */ +- gcc_checking_assert (cfun->machine->fs.fp_valid); +- addr = plus_constant (Pmode, hard_frame_pointer_rtx, +- cfun->machine->fs.fp_offset - cfa_offset); +- mem = gen_rtx_MEM (mode, addr); +- add_reg_note (insn, REG_CFA_DEF_CFA, mem); +- } +- else +- { +- /* The frame pointer is a stable reference within the +- aligned frame. 
Use it. */ +- gcc_checking_assert (cfun->machine->fs.fp_valid); +- addr = plus_constant (Pmode, hard_frame_pointer_rtx, +- cfun->machine->fs.fp_offset - cfa_offset); +- mem = gen_rtx_MEM (mode, addr); +- add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg)); +- } ++ m->fs.cfa_offset -= UNITS_PER_WORD; + } + +- else if (base == stack_pointer_rtx && m->fs.sp_realigned +- && cfa_offset >= m->fs.sp_realigned_offset) ++ /* When the frame pointer is the CFA, and we pop it, we are ++ swapping back to the stack pointer as the CFA. This happens ++ for stack frames that don't allocate other data, so we assume ++ the stack pointer is now pointing at the return address, i.e. ++ the function entry state, which makes the offset be 1 word. */ ++ if (reg == hard_frame_pointer_rtx) + { +- gcc_checking_assert (stack_realign_fp); +- add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg)); ++ m->fs.fp_valid = false; ++ if (m->fs.cfa_reg == hard_frame_pointer_rtx) ++ { ++ m->fs.cfa_reg = stack_pointer_rtx; ++ m->fs.cfa_offset -= UNITS_PER_WORD; ++ ++ add_reg_note (insn, REG_CFA_DEF_CFA, ++ gen_rtx_PLUS (Pmode, stack_pointer_rtx, ++ GEN_INT (m->fs.cfa_offset))); ++ RTX_FRAME_RELATED_P (insn) = 1; ++ } + } ++} + +- /* The memory may not be relative to the current CFA register, +- which means that we may need to generate a new pattern for +- use by the unwind info. */ +- else if (base != m->fs.cfa_reg) ++/* Emit code to restore saved registers using POP insns. */ ++ ++static void ++ix86_emit_restore_regs_using_pop (void) ++{ ++ unsigned int regno; ++ ++ for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) ++ if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, false, true)) ++ ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno)); ++} ++ ++/* Emit code and notes for the LEAVE instruction. If insn is non-null, ++ omits the emit and only attaches the notes. */ ++ ++static void ++ix86_emit_leave (rtx_insn *insn) ++{ ++ struct machine_function *m = cfun->machine; ++ if (!insn) ++ insn = emit_insn (ix86_gen_leave ()); ++ ++ ix86_add_queued_cfa_restore_notes (insn); ++ ++ gcc_assert (m->fs.fp_valid); ++ m->fs.sp_valid = true; ++ m->fs.sp_realigned = false; ++ m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD; ++ m->fs.fp_valid = false; ++ ++ if (m->fs.cfa_reg == hard_frame_pointer_rtx) + { +- addr = plus_constant (Pmode, m->fs.cfa_reg, +- m->fs.cfa_offset - cfa_offset); +- mem = gen_rtx_MEM (mode, addr); +- add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (mem, reg)); ++ m->fs.cfa_reg = stack_pointer_rtx; ++ m->fs.cfa_offset = m->fs.sp_offset; ++ ++ add_reg_note (insn, REG_CFA_DEF_CFA, ++ plus_constant (Pmode, stack_pointer_rtx, ++ m->fs.sp_offset)); ++ RTX_FRAME_RELATED_P (insn) = 1; + } ++ ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx, ++ m->fs.fp_offset); + } + +-/* Emit code to save registers using MOV insns. +- First register is stored at CFA - CFA_OFFSET. */ ++/* Emit code to restore saved registers using MOV insns. ++ First register is restored from CFA - CFA_OFFSET. 
*/ + static void +-ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset) ++ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset, ++ bool maybe_eh_return) + { ++ struct machine_function *m = cfun->machine; + unsigned int regno; + + for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) +- if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true)) ++ if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true)) + { +- ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset); ++ rtx reg = gen_rtx_REG (word_mode, regno); ++ rtx mem; ++ rtx_insn *insn; ++ ++ mem = choose_baseaddr (cfa_offset, NULL); ++ mem = gen_frame_mem (word_mode, mem); ++ insn = emit_move_insn (reg, mem); ++ ++ if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg)) ++ { ++ /* Previously we'd represented the CFA as an expression ++ like *(%ebp - 8). We've just popped that value from ++ the stack, which means we need to reset the CFA to ++ the drap register. This will remain until we restore ++ the stack pointer. */ ++ add_reg_note (insn, REG_CFA_DEF_CFA, reg); ++ RTX_FRAME_RELATED_P (insn) = 1; ++ ++ /* This means that the DRAP register is valid for addressing. */ ++ m->fs.drap_valid = true; ++ } ++ else ++ ix86_add_cfa_restore_note (NULL, reg, cfa_offset); ++ + cfa_offset -= UNITS_PER_WORD; + } + } + +-/* Emit code to save SSE registers using MOV insns. +- First register is stored at CFA - CFA_OFFSET. */ ++/* Emit code to restore saved registers using MOV insns. ++ First register is restored from CFA - CFA_OFFSET. */ + static void +-ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset) ++ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset, ++ bool maybe_eh_return) + { + unsigned int regno; + + for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) +- if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true)) ++ if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true)) + { +- ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset); +- cfa_offset -= GET_MODE_SIZE (V4SFmode); +- } +-} ++ rtx reg = gen_rtx_REG (V4SFmode, regno); ++ rtx mem; ++ unsigned int align = GET_MODE_ALIGNMENT (V4SFmode); + +-static GTY(()) rtx queued_cfa_restores; ++ mem = choose_baseaddr (cfa_offset, &align); ++ mem = gen_rtx_MEM (V4SFmode, mem); + +-/* Add a REG_CFA_RESTORE REG note to INSN or queue them until next stack +- manipulation insn. The value is on the stack at CFA - CFA_OFFSET. +- Don't add the note if the previously saved value will be left untouched +- within stack red-zone till return, as unwinders can find the same value +- in the register and on the stack. */ ++ /* The location aligment depends upon the base register. */ ++ align = MIN (GET_MODE_ALIGNMENT (V4SFmode), align); ++ gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1))); ++ set_mem_align (mem, align); ++ emit_insn (gen_rtx_SET (reg, mem)); + +-static void +-ix86_add_cfa_restore_note (rtx_insn *insn, rtx reg, HOST_WIDE_INT cfa_offset) +-{ +- if (!crtl->shrink_wrapped +- && cfa_offset <= cfun->machine->fs.red_zone_offset) +- return; ++ ix86_add_cfa_restore_note (NULL, reg, cfa_offset); + +- if (insn) +- { +- add_reg_note (insn, REG_CFA_RESTORE, reg); +- RTX_FRAME_RELATED_P (insn) = 1; +- } +- else +- queued_cfa_restores +- = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores); ++ cfa_offset -= GET_MODE_SIZE (V4SFmode); ++ } + } + +-/* Add queued REG_CFA_RESTORE notes if any to INSN. 
*/ +- + static void +-ix86_add_queued_cfa_restore_notes (rtx insn) +-{ +- rtx last; +- if (!queued_cfa_restores) +- return; +- for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1)) +- ; +- XEXP (last, 1) = REG_NOTES (insn); +- REG_NOTES (insn) = queued_cfa_restores; +- queued_cfa_restores = NULL_RTX; +- RTX_FRAME_RELATED_P (insn) = 1; +-} +- +-/* Expand prologue or epilogue stack adjustment. +- The pattern exist to put a dependency on all ebp-based memory accesses. +- STYLE should be negative if instructions should be marked as frame related, +- zero if %r11 register is live and cannot be freely used and positive +- otherwise. */ +- +-static rtx +-pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset, +- int style, bool set_cfa) ++ix86_emit_outlined_ms2sysv_restore (const struct ix86_frame &frame, ++ bool use_call, int style) + { + struct machine_function *m = cfun->machine; +- rtx insn; +- bool add_frame_related_expr = false; ++ const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS ++ + m->call_ms2sysv_extra_regs; ++ rtvec v; ++ unsigned int elems_needed, align, i, vi = 0; ++ rtx_insn *insn; ++ rtx sym, tmp; ++ rtx rsi = gen_rtx_REG (word_mode, SI_REG); ++ rtx r10 = NULL_RTX; ++ const struct xlogue_layout &xlogue = xlogue_layout::get_instance (); ++ HOST_WIDE_INT stub_ptr_offset = xlogue.get_stub_ptr_offset (); ++ HOST_WIDE_INT rsi_offset = frame.stack_realign_offset + stub_ptr_offset; ++ rtx rsi_frame_load = NULL_RTX; ++ HOST_WIDE_INT rsi_restore_offset = (HOST_WIDE_INT)-1; ++ enum xlogue_stub stub; + +- if (Pmode == SImode) +- insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset); +- else if (x86_64_immediate_operand (offset, DImode)) +- insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset); +- else +- { +- rtx tmp; +- /* r11 is used by indirect sibcall return as well, set before the +- epilogue and used after the epilogue. */ +- if (style) +- tmp = gen_rtx_REG (DImode, R11_REG); +- else +- { +- gcc_assert (src != hard_frame_pointer_rtx +- && dest != hard_frame_pointer_rtx); +- tmp = hard_frame_pointer_rtx; +- } +- insn = emit_insn (gen_rtx_SET (tmp, offset)); +- if (style < 0) +- add_frame_related_expr = true; ++ gcc_assert (!m->fs.fp_valid || frame_pointer_needed); + +- insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp); +- } ++ /* If using a realigned stack, we should never start with padding. */ ++ gcc_assert (!stack_realign_fp || !xlogue.get_stack_align_off_in ()); + +- insn = emit_insn (insn); +- if (style >= 0) +- ix86_add_queued_cfa_restore_notes (insn); ++ /* Setup RSI as the stub's base pointer. */ ++ align = GET_MODE_ALIGNMENT (V4SFmode); ++ tmp = choose_baseaddr (rsi_offset, &align, SI_REG); ++ gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode)); + +- if (set_cfa) +- { +- rtx r; ++ emit_insn (gen_rtx_SET (rsi, tmp)); + +- gcc_assert (m->fs.cfa_reg == src); +- m->fs.cfa_offset += INTVAL (offset); +- m->fs.cfa_reg = dest; ++ /* Get a symbol for the stub. */ ++ if (frame_pointer_needed) ++ stub = use_call ? XLOGUE_STUB_RESTORE_HFP ++ : XLOGUE_STUB_RESTORE_HFP_TAIL; ++ else ++ stub = use_call ? XLOGUE_STUB_RESTORE ++ : XLOGUE_STUB_RESTORE_TAIL; ++ sym = xlogue.get_stub_rtx (stub); + +- r = gen_rtx_PLUS (Pmode, src, offset); +- r = gen_rtx_SET (dest, r); +- add_reg_note (insn, REG_CFA_ADJUST_CFA, r); +- RTX_FRAME_RELATED_P (insn) = 1; +- } +- else if (style < 0) ++ elems_needed = ncregs; ++ if (use_call) ++ elems_needed += 1; ++ else ++ elems_needed += frame_pointer_needed ? 
5 : 3; ++ v = rtvec_alloc (elems_needed); ++ ++ /* We call the epilogue stub when we need to pop incoming args or we are ++ doing a sibling call as the tail. Otherwise, we will emit a jmp to the ++ epilogue stub and it is the tail-call. */ ++ if (use_call) ++ RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym); ++ else + { +- RTX_FRAME_RELATED_P (insn) = 1; +- if (add_frame_related_expr) ++ RTVEC_ELT (v, vi++) = ret_rtx; ++ RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym); ++ if (frame_pointer_needed) + { +- rtx r = gen_rtx_PLUS (Pmode, src, offset); +- r = gen_rtx_SET (dest, r); +- add_reg_note (insn, REG_FRAME_RELATED_EXPR, r); ++ rtx rbp = gen_rtx_REG (DImode, BP_REG); ++ gcc_assert (m->fs.fp_valid); ++ gcc_assert (m->fs.cfa_reg == hard_frame_pointer_rtx); ++ ++ tmp = gen_rtx_PLUS (DImode, rbp, GEN_INT (8)); ++ RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, tmp); ++ RTVEC_ELT (v, vi++) = gen_rtx_SET (rbp, gen_rtx_MEM (DImode, rbp)); ++ tmp = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (VOIDmode)); ++ RTVEC_ELT (v, vi++) = gen_rtx_CLOBBER (VOIDmode, tmp); ++ } ++ else ++ { ++ /* If no hard frame pointer, we set R10 to the SP restore value. */ ++ gcc_assert (!m->fs.fp_valid); ++ gcc_assert (m->fs.cfa_reg == stack_pointer_rtx); ++ gcc_assert (m->fs.sp_valid); ++ ++ r10 = gen_rtx_REG (DImode, R10_REG); ++ tmp = gen_rtx_PLUS (Pmode, rsi, GEN_INT (stub_ptr_offset)); ++ emit_insn (gen_rtx_SET (r10, tmp)); ++ ++ RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, r10); + } + } + +- if (dest == stack_pointer_rtx) ++ /* Generate frame load insns and restore notes. */ ++ for (i = 0; i < ncregs; ++i) + { +- HOST_WIDE_INT ooffset = m->fs.sp_offset; +- bool valid = m->fs.sp_valid; +- bool realigned = m->fs.sp_realigned; ++ const xlogue_layout::reginfo &r = xlogue.get_reginfo (i); ++ machine_mode mode = SSE_REGNO_P (r.regno) ? V4SFmode : word_mode; ++ rtx reg, frame_load; + +- if (src == hard_frame_pointer_rtx) +- { +- valid = m->fs.fp_valid; +- realigned = false; +- ooffset = m->fs.fp_offset; +- } +- else if (src == crtl->drap_reg) ++ reg = gen_rtx_REG (mode, r.regno); ++ frame_load = gen_frame_load (reg, rsi, r.offset); ++ ++ /* Save RSI frame load insn & note to add last. */ ++ if (r.regno == SI_REG) + { +- valid = m->fs.drap_valid; +- realigned = false; +- ooffset = 0; ++ gcc_assert (!rsi_frame_load); ++ rsi_frame_load = frame_load; ++ rsi_restore_offset = r.offset; + } + else + { +- /* Else there are two possibilities: SP itself, which we set +- up as the default above. Or EH_RETURN_STACKADJ_RTX, which is +- taken care of this by hand along the eh_return path. */ +- gcc_checking_assert (src == stack_pointer_rtx +- || offset == const0_rtx); ++ RTVEC_ELT (v, vi++) = frame_load; ++ ix86_add_cfa_restore_note (NULL, reg, r.offset); + } +- +- m->fs.sp_offset = ooffset - INTVAL (offset); +- m->fs.sp_valid = valid; +- m->fs.sp_realigned = realigned; + } +- return insn; +-} +- +-/* Find an available register to be used as dynamic realign argument +- pointer regsiter. Such a register will be written in prologue and +- used in begin of body, so it must not be +- 1. parameter passing register. +- 2. GOT pointer. +- We reuse static-chain register if it is available. Otherwise, we +- use DI for i386 and R13 for x86-64. We chose R13 since it has +- shorter encoding. +- +- Return: the regno of chosen register. */ + +-static unsigned int +-find_drap_reg (void) +-{ +- tree decl = cfun->decl; ++ /* Add RSI frame load & restore note at the end. 
*/ ++ gcc_assert (rsi_frame_load); ++ gcc_assert (rsi_restore_offset != (HOST_WIDE_INT)-1); ++ RTVEC_ELT (v, vi++) = rsi_frame_load; ++ ix86_add_cfa_restore_note (NULL, gen_rtx_REG (DImode, SI_REG), ++ rsi_restore_offset); + +- /* Always use callee-saved register if there are no caller-saved +- registers. */ +- if (TARGET_64BIT) ++ /* Finally, for tail-call w/o a hard frame pointer, set SP to R10. */ ++ if (!use_call && !frame_pointer_needed) + { +- /* Use R13 for nested function or function need static chain. +- Since function with tail call may use any caller-saved +- registers in epilogue, DRAP must not use caller-saved +- register in such case. */ +- if (DECL_STATIC_CHAIN (decl) +- || cfun->machine->no_caller_saved_registers +- || crtl->tail_call_emit) +- return R13_REG; ++ gcc_assert (m->fs.sp_valid); ++ gcc_assert (!m->fs.sp_realigned); + +- return R10_REG; ++ /* At this point, R10 should point to frame.stack_realign_offset. */ ++ if (m->fs.cfa_reg == stack_pointer_rtx) ++ m->fs.cfa_offset += m->fs.sp_offset - frame.stack_realign_offset; ++ m->fs.sp_offset = frame.stack_realign_offset; + } ++ ++ gcc_assert (vi == (unsigned int)GET_NUM_ELEM (v)); ++ tmp = gen_rtx_PARALLEL (VOIDmode, v); ++ if (use_call) ++ insn = emit_insn (tmp); + else + { +- /* Use DI for nested function or function need static chain. +- Since function with tail call may use any caller-saved +- registers in epilogue, DRAP must not use caller-saved +- register in such case. */ +- if (DECL_STATIC_CHAIN (decl) +- || cfun->machine->no_caller_saved_registers +- || crtl->tail_call_emit) +- return DI_REG; ++ insn = emit_jump_insn (tmp); ++ JUMP_LABEL (insn) = ret_rtx; + +- /* Reuse static chain register if it isn't used for parameter +- passing. */ +- if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2) ++ if (frame_pointer_needed) ++ ix86_emit_leave (insn); ++ else + { +- unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl)); +- if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0) +- return CX_REG; ++ /* Need CFA adjust note. */ ++ tmp = gen_rtx_SET (stack_pointer_rtx, r10); ++ add_reg_note (insn, REG_CFA_ADJUST_CFA, tmp); + } +- return DI_REG; + } +-} + +-/* Handle a "force_align_arg_pointer" attribute. */ ++ RTX_FRAME_RELATED_P (insn) = true; ++ ix86_add_queued_cfa_restore_notes (insn); + +-static tree +-ix86_handle_force_align_arg_pointer_attribute (tree *node, tree name, +- tree, int, bool *no_add_attrs) +-{ +- if (TREE_CODE (*node) != FUNCTION_TYPE +- && TREE_CODE (*node) != METHOD_TYPE +- && TREE_CODE (*node) != FIELD_DECL +- && TREE_CODE (*node) != TYPE_DECL) ++ /* If we're not doing a tail-call, we need to adjust the stack. */ ++ if (use_call && m->fs.sp_valid) + { +- warning (OPT_Wattributes, "%qE attribute only applies to functions", +- name); +- *no_add_attrs = true; ++ HOST_WIDE_INT dealloc = m->fs.sp_offset - frame.stack_realign_offset; ++ pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, ++ GEN_INT (dealloc), style, ++ m->fs.cfa_reg == stack_pointer_rtx); + } +- +- return NULL_TREE; + } + +-/* Return minimum incoming stack alignment. */ ++/* Restore function stack, frame, and registers. */ + +-static unsigned int +-ix86_minimum_incoming_stack_boundary (bool sibcall) ++void ++ix86_expand_epilogue (int style) + { +- unsigned int incoming_stack_boundary; +- +- /* Stack of interrupt handler is aligned to 128 bits in 64bit mode. */ +- if (cfun->machine->func_type != TYPE_NORMAL) +- incoming_stack_boundary = TARGET_64BIT ? 
128 : MIN_STACK_BOUNDARY; +- /* Prefer the one specified at command line. */ +- else if (ix86_user_incoming_stack_boundary) +- incoming_stack_boundary = ix86_user_incoming_stack_boundary; +- /* In 32bit, use MIN_STACK_BOUNDARY for incoming stack boundary +- if -mstackrealign is used, it isn't used for sibcall check and +- estimated stack alignment is 128bit. */ +- else if (!sibcall +- && ix86_force_align_arg_pointer +- && crtl->stack_alignment_estimated == 128) +- incoming_stack_boundary = MIN_STACK_BOUNDARY; +- else +- incoming_stack_boundary = ix86_default_incoming_stack_boundary; ++ struct machine_function *m = cfun->machine; ++ struct machine_frame_state frame_state_save = m->fs; ++ bool restore_regs_via_mov; ++ bool using_drap; ++ bool restore_stub_is_tail = false; + +- /* Incoming stack alignment can be changed on individual functions +- via force_align_arg_pointer attribute. We use the smallest +- incoming stack boundary. */ +- if (incoming_stack_boundary > MIN_STACK_BOUNDARY +- && lookup_attribute (ix86_force_align_arg_pointer_string, +- TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl)))) +- incoming_stack_boundary = MIN_STACK_BOUNDARY; ++ if (ix86_function_naked (current_function_decl)) ++ { ++ /* The program should not reach this point. */ ++ emit_insn (gen_ud2 ()); ++ return; ++ } + +- /* The incoming stack frame has to be aligned at least at +- parm_stack_boundary. */ +- if (incoming_stack_boundary < crtl->parm_stack_boundary) +- incoming_stack_boundary = crtl->parm_stack_boundary; ++ ix86_finalize_stack_frame_flags (); ++ const struct ix86_frame &frame = cfun->machine->frame; + +- /* Stack at entrance of main is aligned by runtime. We use the +- smallest incoming stack boundary. */ +- if (incoming_stack_boundary > MAIN_STACK_BOUNDARY +- && DECL_NAME (current_function_decl) +- && MAIN_NAME_P (DECL_NAME (current_function_decl)) +- && DECL_FILE_SCOPE_P (current_function_decl)) +- incoming_stack_boundary = MAIN_STACK_BOUNDARY; ++ m->fs.sp_realigned = stack_realign_fp; ++ m->fs.sp_valid = stack_realign_fp ++ || !frame_pointer_needed ++ || crtl->sp_is_unchanging; ++ gcc_assert (!m->fs.sp_valid ++ || m->fs.sp_offset == frame.stack_pointer_offset); + +- return incoming_stack_boundary; +-} ++ /* The FP must be valid if the frame pointer is present. */ ++ gcc_assert (frame_pointer_needed == m->fs.fp_valid); ++ gcc_assert (!m->fs.fp_valid ++ || m->fs.fp_offset == frame.hard_frame_pointer_offset); + +-/* Update incoming stack boundary and estimated stack alignment. */ ++ /* We must have *some* valid pointer to the stack frame. */ ++ gcc_assert (m->fs.sp_valid || m->fs.fp_valid); + +-static void +-ix86_update_stack_boundary (void) +-{ +- ix86_incoming_stack_boundary +- = ix86_minimum_incoming_stack_boundary (false); ++ /* The DRAP is never valid at this point. */ ++ gcc_assert (!m->fs.drap_valid); + +- /* x86_64 vararg needs 16byte stack alignment for register save area. */ +- if (TARGET_64BIT +- && cfun->stdarg +- && crtl->stack_alignment_estimated < 128) +- crtl->stack_alignment_estimated = 128; ++ /* See the comment about red zone and frame ++ pointer usage in ix86_expand_prologue. */ ++ if (frame_pointer_needed && frame.red_zone_size) ++ emit_insn (gen_memory_blockage ()); + +- /* __tls_get_addr needs to be called with 16-byte aligned stack. 
*/ +- if (ix86_tls_descriptor_calls_expanded_in_cfun +- && crtl->preferred_stack_boundary < 128) +- crtl->preferred_stack_boundary = 128; +-} ++ using_drap = crtl->drap_reg && crtl->stack_realign_needed; ++ gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg); + +-/* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is +- needed or an rtx for DRAP otherwise. */ ++ /* Determine the CFA offset of the end of the red-zone. */ ++ m->fs.red_zone_offset = 0; ++ if (ix86_using_red_zone () && crtl->args.pops_args < 65536) ++ { ++ /* The red-zone begins below return address and error code in ++ exception handler. */ ++ m->fs.red_zone_offset = RED_ZONE_SIZE + INCOMING_FRAME_SP_OFFSET; + +-static rtx +-ix86_get_drap_rtx (void) +-{ +- /* We must use DRAP if there are outgoing arguments on stack and +- ACCUMULATE_OUTGOING_ARGS is false. */ +- if (ix86_force_drap +- || (cfun->machine->outgoing_args_on_stack +- && !ACCUMULATE_OUTGOING_ARGS)) +- crtl->need_drap = true; ++ /* When the register save area is in the aligned portion of ++ the stack, determine the maximum runtime displacement that ++ matches up with the aligned frame. */ ++ if (stack_realign_drap) ++ m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT ++ + UNITS_PER_WORD); ++ } + +- if (stack_realign_drap) +- { +- /* Assign DRAP to vDRAP and returns vDRAP */ +- unsigned int regno = find_drap_reg (); +- rtx drap_vreg; +- rtx arg_ptr; +- rtx_insn *seq, *insn; ++ HOST_WIDE_INT reg_save_offset = frame.reg_save_offset; + +- arg_ptr = gen_rtx_REG (Pmode, regno); +- crtl->drap_reg = arg_ptr; ++ /* Special care must be taken for the normal return case of a function ++ using eh_return: the eax and edx registers are marked as saved, but ++ not restored along this path. Adjust the save location to match. */ ++ if (crtl->calls_eh_return && style != 2) ++ reg_save_offset -= 2 * UNITS_PER_WORD; + +- start_sequence (); +- drap_vreg = copy_to_reg (arg_ptr); +- seq = get_insns (); +- end_sequence (); ++ /* EH_RETURN requires the use of moves to function properly. */ ++ if (crtl->calls_eh_return) ++ restore_regs_via_mov = true; ++ /* SEH requires the use of pops to identify the epilogue. */ ++ else if (TARGET_SEH) ++ restore_regs_via_mov = false; ++ /* If we're only restoring one register and sp cannot be used then ++ using a move instruction to restore the register since it's ++ less work than reloading sp and popping the register. */ ++ else if (!sp_valid_at (frame.hfp_save_offset) && frame.nregs <= 1) ++ restore_regs_via_mov = true; ++ else if (TARGET_EPILOGUE_USING_MOVE ++ && cfun->machine->use_fast_prologue_epilogue ++ && (frame.nregs > 1 ++ || m->fs.sp_offset != reg_save_offset)) ++ restore_regs_via_mov = true; ++ else if (frame_pointer_needed ++ && !frame.nregs ++ && m->fs.sp_offset != reg_save_offset) ++ restore_regs_via_mov = true; ++ else if (frame_pointer_needed ++ && TARGET_USE_LEAVE ++ && cfun->machine->use_fast_prologue_epilogue ++ && frame.nregs == 1) ++ restore_regs_via_mov = true; ++ else ++ restore_regs_via_mov = false; + +- insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ())); +- if (!optimize) ++ if (restore_regs_via_mov || frame.nsseregs) ++ { ++ /* Ensure that the entire register save area is addressable via ++ the stack pointer, if we will restore SSE regs via sp. 
*/ ++ if (TARGET_64BIT ++ && m->fs.sp_offset > 0x7fffffff ++ && sp_valid_at (frame.stack_realign_offset + 1) ++ && (frame.nsseregs + frame.nregs) != 0) + { +- add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg); +- RTX_FRAME_RELATED_P (insn) = 1; ++ pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, ++ GEN_INT (m->fs.sp_offset ++ - frame.sse_reg_save_offset), ++ style, ++ m->fs.cfa_reg == stack_pointer_rtx); + } +- return drap_vreg; + } +- else +- return NULL; +-} +- +-/* Handle the TARGET_INTERNAL_ARG_POINTER hook. */ + +-static rtx +-ix86_internal_arg_pointer (void) +-{ +- return virtual_incoming_args_rtx; +-} ++ /* If there are any SSE registers to restore, then we have to do it ++ via moves, since there's obviously no pop for SSE regs. */ ++ if (frame.nsseregs) ++ ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset, ++ style == 2); + +-struct scratch_reg { +- rtx reg; +- bool saved; +-}; ++ if (m->call_ms2sysv) ++ { ++ int pop_incoming_args = crtl->args.pops_args && crtl->args.size; + +-/* Return a short-lived scratch register for use on function entry. +- In 32-bit mode, it is valid only after the registers are saved +- in the prologue. This register must be released by means of +- release_scratch_register_on_entry once it is dead. */ ++ /* We cannot use a tail-call for the stub if: ++ 1. We have to pop incoming args, ++ 2. We have additional int regs to restore, or ++ 3. A sibling call will be the tail-call, or ++ 4. We are emitting an eh_return_internal epilogue. + +-static void +-get_scratch_register_on_entry (struct scratch_reg *sr) +-{ +- int regno; ++ TODO: Item 4 has not yet tested! + +- sr->saved = false; ++ If any of the above are true, we will call the stub rather than ++ jump to it. */ ++ restore_stub_is_tail = !(pop_incoming_args || frame.nregs || style != 1); ++ ix86_emit_outlined_ms2sysv_restore (frame, !restore_stub_is_tail, style); ++ } + +- if (TARGET_64BIT) ++ /* If using out-of-line stub that is a tail-call, then...*/ ++ if (m->call_ms2sysv && restore_stub_is_tail) + { +- /* We always use R11 in 64-bit mode. */ +- regno = R11_REG; ++ /* TODO: parinoid tests. (remove eventually) */ ++ gcc_assert (m->fs.sp_valid); ++ gcc_assert (!m->fs.sp_realigned); ++ gcc_assert (!m->fs.fp_valid); ++ gcc_assert (!m->fs.realigned); ++ gcc_assert (m->fs.sp_offset == UNITS_PER_WORD); ++ gcc_assert (!crtl->drap_reg); ++ gcc_assert (!frame.nregs); + } +- else ++ else if (restore_regs_via_mov) + { +- tree decl = current_function_decl, fntype = TREE_TYPE (decl); +- bool fastcall_p +- = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE; +- bool thiscall_p +- = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE; +- bool static_chain_p = DECL_STATIC_CHAIN (decl); +- int regparm = ix86_function_regparm (fntype, decl); +- int drap_regno +- = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM; ++ rtx t; + +- /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax +- for the static chain register. */ +- if ((regparm < 1 || (fastcall_p && !static_chain_p)) +- && drap_regno != AX_REG) +- regno = AX_REG; +- /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx +- for the static chain register. */ +- else if (thiscall_p && !static_chain_p && drap_regno != AX_REG) +- regno = AX_REG; +- else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG) +- regno = DX_REG; +- /* ecx is the static chain register. 
*/ +- else if (regparm < 3 && !fastcall_p && !thiscall_p +- && !static_chain_p +- && drap_regno != CX_REG) +- regno = CX_REG; +- else if (ix86_save_reg (BX_REG, true, false)) +- regno = BX_REG; +- /* esi is the static chain register. */ +- else if (!(regparm == 3 && static_chain_p) +- && ix86_save_reg (SI_REG, true, false)) +- regno = SI_REG; +- else if (ix86_save_reg (DI_REG, true, false)) +- regno = DI_REG; +- else ++ if (frame.nregs) ++ ix86_emit_restore_regs_using_mov (reg_save_offset, style == 2); ++ ++ /* eh_return epilogues need %ecx added to the stack pointer. */ ++ if (style == 2) + { +- regno = (drap_regno == AX_REG ? DX_REG : AX_REG); +- sr->saved = true; +- } +- } ++ rtx sa = EH_RETURN_STACKADJ_RTX; ++ rtx_insn *insn; + +- sr->reg = gen_rtx_REG (Pmode, regno); +- if (sr->saved) +- { +- rtx_insn *insn = emit_insn (gen_push (sr->reg)); +- RTX_FRAME_RELATED_P (insn) = 1; +- } +-} ++ /* %ecx can't be used for both DRAP register and eh_return. */ ++ if (crtl->drap_reg) ++ gcc_assert (REGNO (crtl->drap_reg) != CX_REG); + +-/* Release a scratch register obtained from the preceding function. ++ /* regparm nested functions don't work with eh_return. */ ++ gcc_assert (!ix86_static_chain_on_stack); + +- If RELEASE_VIA_POP is true, we just pop the register off the stack +- to release it. This is what non-Linux systems use with -fstack-check. ++ if (frame_pointer_needed) ++ { ++ t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa); ++ t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD); ++ emit_insn (gen_rtx_SET (sa, t)); + +- Otherwise we use OFFSET to locate the saved register and the +- allocated stack space becomes part of the local frame and is +- deallocated by the epilogue. */ ++ t = gen_frame_mem (Pmode, hard_frame_pointer_rtx); ++ insn = emit_move_insn (hard_frame_pointer_rtx, t); + +-static void +-release_scratch_register_on_entry (struct scratch_reg *sr, HOST_WIDE_INT offset, +- bool release_via_pop) +-{ +- if (sr->saved) ++ /* Note that we use SA as a temporary CFA, as the return ++ address is at the proper place relative to it. We ++ pretend this happens at the FP restore insn because ++ prior to this insn the FP would be stored at the wrong ++ offset relative to SA, and after this insn we have no ++ other reasonable register to use for the CFA. We don't ++ bother resetting the CFA to the SP for the duration of ++ the return insn, unless the control flow instrumentation ++ is done. In this case the SP is used later and we have ++ to reset CFA to SP. 
*/ ++ add_reg_note (insn, REG_CFA_DEF_CFA, ++ plus_constant (Pmode, sa, UNITS_PER_WORD)); ++ ix86_add_queued_cfa_restore_notes (insn); ++ add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx); ++ RTX_FRAME_RELATED_P (insn) = 1; ++ ++ m->fs.cfa_reg = sa; ++ m->fs.cfa_offset = UNITS_PER_WORD; ++ m->fs.fp_valid = false; ++ ++ pro_epilogue_adjust_stack (stack_pointer_rtx, sa, ++ const0_rtx, style, ++ flag_cf_protection); ++ } ++ else ++ { ++ t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa); ++ t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD); ++ insn = emit_insn (gen_rtx_SET (stack_pointer_rtx, t)); ++ ix86_add_queued_cfa_restore_notes (insn); ++ ++ gcc_assert (m->fs.cfa_reg == stack_pointer_rtx); ++ if (m->fs.cfa_offset != UNITS_PER_WORD) ++ { ++ m->fs.cfa_offset = UNITS_PER_WORD; ++ add_reg_note (insn, REG_CFA_DEF_CFA, ++ plus_constant (Pmode, stack_pointer_rtx, ++ UNITS_PER_WORD)); ++ RTX_FRAME_RELATED_P (insn) = 1; ++ } ++ } ++ m->fs.sp_offset = UNITS_PER_WORD; ++ m->fs.sp_valid = true; ++ m->fs.sp_realigned = false; ++ } ++ } ++ else + { +- if (release_via_pop) ++ /* SEH requires that the function end with (1) a stack adjustment ++ if necessary, (2) a sequence of pops, and (3) a return or ++ jump instruction. Prevent insns from the function body from ++ being scheduled into this sequence. */ ++ if (TARGET_SEH) + { +- struct machine_function *m = cfun->machine; +- rtx x, insn = emit_insn (gen_pop (sr->reg)); ++ /* Prevent a catch region from being adjacent to the standard ++ epilogue sequence. Unfortunately neither crtl->uses_eh_lsda ++ nor several other flags that would be interesting to test are ++ set up yet. */ ++ if (flag_non_call_exceptions) ++ emit_insn (gen_nops (const1_rtx)); ++ else ++ emit_insn (gen_blockage ()); ++ } + +- /* The RX FRAME_RELATED_P mechanism doesn't know about pop. */ +- RTX_FRAME_RELATED_P (insn) = 1; +- x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD)); +- x = gen_rtx_SET (stack_pointer_rtx, x); +- add_reg_note (insn, REG_FRAME_RELATED_EXPR, x); +- m->fs.sp_offset -= UNITS_PER_WORD; ++ /* First step is to deallocate the stack frame so that we can ++ pop the registers. If the stack pointer was realigned, it needs ++ to be restored now. Also do it on SEH target for very large ++ frame as the emitted instructions aren't allowed by the ABI ++ in epilogues. */ ++ if (!m->fs.sp_valid || m->fs.sp_realigned ++ || (TARGET_SEH ++ && (m->fs.sp_offset - reg_save_offset ++ >= SEH_MAX_FRAME_SIZE))) ++ { ++ pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx, ++ GEN_INT (m->fs.fp_offset ++ - reg_save_offset), ++ style, false); + } +- else ++ else if (m->fs.sp_offset != reg_save_offset) + { +- rtx x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (offset)); +- x = gen_rtx_SET (sr->reg, gen_rtx_MEM (word_mode, x)); +- emit_insn (x); ++ pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, ++ GEN_INT (m->fs.sp_offset ++ - reg_save_offset), ++ style, ++ m->fs.cfa_reg == stack_pointer_rtx); + } ++ ++ ix86_emit_restore_regs_using_pop (); + } +-} + +-/* Emit code to adjust the stack pointer by SIZE bytes while probing it. ++ /* If we used a stack pointer and haven't already got rid of it, ++ then do so now. */ ++ if (m->fs.fp_valid) ++ { ++ /* If the stack pointer is valid and pointing at the frame ++ pointer store address, then we only need a pop. 
*/ ++ if (sp_valid_at (frame.hfp_save_offset) ++ && m->fs.sp_offset == frame.hfp_save_offset) ++ ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx); ++ /* Leave results in shorter dependency chains on CPUs that are ++ able to grok it fast. */ ++ else if (TARGET_USE_LEAVE ++ || optimize_bb_for_size_p (EXIT_BLOCK_PTR_FOR_FN (cfun)) ++ || !cfun->machine->use_fast_prologue_epilogue) ++ ix86_emit_leave (NULL); ++ else ++ { ++ pro_epilogue_adjust_stack (stack_pointer_rtx, ++ hard_frame_pointer_rtx, ++ const0_rtx, style, !using_drap); ++ ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx); ++ } ++ } + +- This differs from the next routine in that it tries hard to prevent +- attacks that jump the stack guard. Thus it is never allowed to allocate +- more than PROBE_INTERVAL bytes of stack space without a suitable +- probe. ++ if (using_drap) ++ { ++ int param_ptr_offset = UNITS_PER_WORD; ++ rtx_insn *insn; + +- INT_REGISTERS_SAVED is true if integer registers have already been +- pushed on the stack. */ ++ gcc_assert (stack_realign_drap); + +-static void +-ix86_adjust_stack_and_probe_stack_clash (HOST_WIDE_INT size, +- const bool int_registers_saved) +-{ +- struct machine_function *m = cfun->machine; ++ if (ix86_static_chain_on_stack) ++ param_ptr_offset += UNITS_PER_WORD; ++ if (!call_used_or_fixed_reg_p (REGNO (crtl->drap_reg))) ++ param_ptr_offset += UNITS_PER_WORD; + +- /* If this function does not statically allocate stack space, then +- no probes are needed. */ +- if (!size) ++ insn = emit_insn (gen_rtx_SET ++ (stack_pointer_rtx, ++ gen_rtx_PLUS (Pmode, ++ crtl->drap_reg, ++ GEN_INT (-param_ptr_offset)))); ++ m->fs.cfa_reg = stack_pointer_rtx; ++ m->fs.cfa_offset = param_ptr_offset; ++ m->fs.sp_offset = param_ptr_offset; ++ m->fs.realigned = false; ++ ++ add_reg_note (insn, REG_CFA_DEF_CFA, ++ gen_rtx_PLUS (Pmode, stack_pointer_rtx, ++ GEN_INT (param_ptr_offset))); ++ RTX_FRAME_RELATED_P (insn) = 1; ++ ++ if (!call_used_or_fixed_reg_p (REGNO (crtl->drap_reg))) ++ ix86_emit_restore_reg_using_pop (crtl->drap_reg); ++ } ++ ++ /* At this point the stack pointer must be valid, and we must have ++ restored all of the registers. We may not have deallocated the ++ entire stack frame. We've delayed this until now because it may ++ be possible to merge the local stack deallocation with the ++ deallocation forced by ix86_static_chain_on_stack. */ ++ gcc_assert (m->fs.sp_valid); ++ gcc_assert (!m->fs.sp_realigned); ++ gcc_assert (!m->fs.fp_valid); ++ gcc_assert (!m->fs.realigned); ++ if (m->fs.sp_offset != UNITS_PER_WORD) + { +- /* However, the allocation of space via pushes for register +- saves could be viewed as allocating space, but without the +- need to probe. */ +- if (m->frame.nregs || m->frame.nsseregs || frame_pointer_needed) +- dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true); +- else +- dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false); +- return; ++ pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, ++ GEN_INT (m->fs.sp_offset - UNITS_PER_WORD), ++ style, true); + } ++ else ++ ix86_add_queued_cfa_restore_notes (get_last_insn ()); + +- /* If we are a noreturn function, then we have to consider the +- possibility that we're called via a jump rather than a call. ++ /* Sibcall epilogues don't want a return instruction. */ ++ if (style == 0) ++ { ++ m->fs = frame_state_save; ++ return; ++ } + +- Thus we don't have the implicit probe generated by saving the +- return address into the stack at the call. 
Thus, the stack +- pointer could be anywhere in the guard page. The safe thing +- to do is emit a probe now. ++ if (cfun->machine->func_type != TYPE_NORMAL) ++ emit_jump_insn (gen_interrupt_return ()); ++ else if (crtl->args.pops_args && crtl->args.size) ++ { ++ rtx popc = GEN_INT (crtl->args.pops_args); + +- The probe can be avoided if we have already emitted any callee +- register saves into the stack or have a frame pointer (which will +- have been saved as well). Those saves will function as implicit +- probes. ++ /* i386 can only pop 64K bytes. If asked to pop more, pop return ++ address, do explicit add, and jump indirectly to the caller. */ + +- ?!? This should be revamped to work like aarch64 and s390 where +- we track the offset from the most recent probe. Normally that +- offset would be zero. For a noreturn function we would reset +- it to PROBE_INTERVAL - (STACK_BOUNDARY / BITS_PER_UNIT). Then +- we just probe when we cross PROBE_INTERVAL. */ +- if (TREE_THIS_VOLATILE (cfun->decl) +- && !(m->frame.nregs || m->frame.nsseregs || frame_pointer_needed)) +- { +- /* We can safely use any register here since we're just going to push +- its value and immediately pop it back. But we do try and avoid +- argument passing registers so as not to introduce dependencies in +- the pipeline. For 32 bit we use %esi and for 64 bit we use %rax. */ +- rtx dummy_reg = gen_rtx_REG (word_mode, TARGET_64BIT ? AX_REG : SI_REG); +- rtx_insn *insn_push = emit_insn (gen_push (dummy_reg)); +- rtx_insn *insn_pop = emit_insn (gen_pop (dummy_reg)); +- m->fs.sp_offset -= UNITS_PER_WORD; +- if (m->fs.cfa_reg == stack_pointer_rtx) ++ if (crtl->args.pops_args >= 65536) + { +- m->fs.cfa_offset -= UNITS_PER_WORD; +- rtx x = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD); +- x = gen_rtx_SET (stack_pointer_rtx, x); +- add_reg_note (insn_push, REG_CFA_ADJUST_CFA, x); +- RTX_FRAME_RELATED_P (insn_push) = 1; +- x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD); +- x = gen_rtx_SET (stack_pointer_rtx, x); +- add_reg_note (insn_pop, REG_CFA_ADJUST_CFA, x); +- RTX_FRAME_RELATED_P (insn_pop) = 1; +- } +- emit_insn (gen_blockage ()); +- } ++ rtx ecx = gen_rtx_REG (SImode, CX_REG); ++ rtx_insn *insn; + +- /* If we allocate less than the size of the guard statically, +- then no probing is necessary, but we do need to allocate +- the stack. */ +- if (size < (1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE))) +- { +- pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, +- GEN_INT (-size), -1, +- m->fs.cfa_reg == stack_pointer_rtx); +- dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true); +- return; +- } ++ /* There is no "pascal" calling convention in any 64bit ABI. */ ++ gcc_assert (!TARGET_64BIT); + +- /* We're allocating a large enough stack frame that we need to +- emit probes. Either emit them inline or in a loop depending +- on the size. */ +- HOST_WIDE_INT probe_interval = get_probe_interval (); +- if (size <= 4 * probe_interval) +- { +- HOST_WIDE_INT i; +- for (i = probe_interval; i <= size; i += probe_interval) +- { +- /* Allocate PROBE_INTERVAL bytes. */ +- rtx insn +- = pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, +- GEN_INT (-probe_interval), -1, +- m->fs.cfa_reg == stack_pointer_rtx); +- add_reg_note (insn, REG_STACK_CHECK, const0_rtx); ++ insn = emit_insn (gen_pop (ecx)); ++ m->fs.cfa_offset -= UNITS_PER_WORD; ++ m->fs.sp_offset -= UNITS_PER_WORD; + +- /* And probe at *sp. 
*/ +- emit_stack_probe (stack_pointer_rtx); +- emit_insn (gen_blockage ()); +- } ++ rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD); ++ x = gen_rtx_SET (stack_pointer_rtx, x); ++ add_reg_note (insn, REG_CFA_ADJUST_CFA, x); ++ add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx)); ++ RTX_FRAME_RELATED_P (insn) = 1; + +- /* We need to allocate space for the residual, but we do not need +- to probe the residual. */ +- HOST_WIDE_INT residual = (i - probe_interval - size); +- if (residual) +- pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, +- GEN_INT (residual), -1, +- m->fs.cfa_reg == stack_pointer_rtx); +- dump_stack_clash_frame_info (PROBE_INLINE, residual != 0); ++ pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, ++ popc, -1, true); ++ emit_jump_insn (gen_simple_return_indirect_internal (ecx)); ++ } ++ else ++ emit_jump_insn (gen_simple_return_pop_internal (popc)); + } +- else ++ else if (!m->call_ms2sysv || !restore_stub_is_tail) + { +- /* We expect the GP registers to be saved when probes are used +- as the probing sequences might need a scratch register and +- the routine to allocate one assumes the integer registers +- have already been saved. */ +- gcc_assert (int_registers_saved); +- +- struct scratch_reg sr; +- get_scratch_register_on_entry (&sr); +- +- /* If we needed to save a register, then account for any space +- that was pushed (we are not going to pop the register when +- we do the restore). */ +- if (sr.saved) +- size -= UNITS_PER_WORD; ++ /* In case of return from EH a simple return cannot be used ++ as a return address will be compared with a shadow stack ++ return address. Use indirect jump instead. */ ++ if (style == 2 && flag_cf_protection) ++ { ++ /* Register used in indirect jump must be in word_mode. But ++ Pmode may not be the same as word_mode for x32. */ ++ rtx ecx = gen_rtx_REG (word_mode, CX_REG); ++ rtx_insn *insn; + +- /* Step 1: round SIZE down to a multiple of the interval. */ +- HOST_WIDE_INT rounded_size = size & -probe_interval; ++ insn = emit_insn (gen_pop (ecx)); ++ m->fs.cfa_offset -= UNITS_PER_WORD; ++ m->fs.sp_offset -= UNITS_PER_WORD; + +- /* Step 2: compute final value of the loop counter. Use lea if +- possible. */ +- rtx addr = plus_constant (Pmode, stack_pointer_rtx, -rounded_size); +- rtx insn; +- if (address_no_seg_operand (addr, Pmode)) +- insn = emit_insn (gen_rtx_SET (sr.reg, addr)); +- else +- { +- emit_move_insn (sr.reg, GEN_INT (-rounded_size)); +- insn = emit_insn (gen_rtx_SET (sr.reg, +- gen_rtx_PLUS (Pmode, sr.reg, +- stack_pointer_rtx))); +- } +- if (m->fs.cfa_reg == stack_pointer_rtx) +- { +- add_reg_note (insn, REG_CFA_DEF_CFA, +- plus_constant (Pmode, sr.reg, +- m->fs.cfa_offset + rounded_size)); ++ rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD); ++ x = gen_rtx_SET (stack_pointer_rtx, x); ++ add_reg_note (insn, REG_CFA_ADJUST_CFA, x); ++ add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx)); + RTX_FRAME_RELATED_P (insn) = 1; +- } + +- /* Step 3: the loop. 
*/ +- rtx size_rtx = GEN_INT (rounded_size); +- insn = emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, +- size_rtx)); +- if (m->fs.cfa_reg == stack_pointer_rtx) +- { +- m->fs.cfa_offset += rounded_size; +- add_reg_note (insn, REG_CFA_DEF_CFA, +- plus_constant (Pmode, stack_pointer_rtx, +- m->fs.cfa_offset)); +- RTX_FRAME_RELATED_P (insn) = 1; ++ emit_jump_insn (gen_simple_return_indirect_internal (ecx)); + } +- m->fs.sp_offset += rounded_size; +- emit_insn (gen_blockage ()); +- +- /* Step 4: adjust SP if we cannot assert at compile-time that SIZE +- is equal to ROUNDED_SIZE. */ +- +- if (size != rounded_size) +- pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, +- GEN_INT (rounded_size - size), -1, +- m->fs.cfa_reg == stack_pointer_rtx); +- dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size); +- +- /* This does not deallocate the space reserved for the scratch +- register. That will be deallocated in the epilogue. */ +- release_scratch_register_on_entry (&sr, size, false); ++ else ++ emit_jump_insn (gen_simple_return_internal ()); + } + +- /* Make sure nothing is scheduled before we are done. */ +- emit_insn (gen_blockage ()); ++ /* Restore the state back to the state from the prologue, ++ so that it's correct for the next epilogue. */ ++ m->fs = frame_state_save; + } + +-/* Emit code to adjust the stack pointer by SIZE bytes while probing it. +- +- INT_REGISTERS_SAVED is true if integer registers have already been +- pushed on the stack. */ ++/* Reset from the function's potential modifications. */ + + static void +-ix86_adjust_stack_and_probe (HOST_WIDE_INT size, +- const bool int_registers_saved) ++ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED) + { +- /* We skip the probe for the first interval + a small dope of 4 words and +- probe that many bytes past the specified size to maintain a protection +- area at the botton of the stack. */ +- const int dope = 4 * UNITS_PER_WORD; +- rtx size_rtx = GEN_INT (size), last; ++ if (pic_offset_table_rtx ++ && !ix86_use_pseudo_pic_reg ()) ++ SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM); + +- /* See if we have a constant small number of probes to generate. If so, +- that's the easy case. The run-time loop is made up of 9 insns in the +- generic case while the compile-time loop is made up of 3+2*(n-1) insns +- for n # of intervals. */ +- if (size <= 4 * get_probe_interval ()) ++ if (TARGET_MACHO) + { +- HOST_WIDE_INT i, adjust; +- bool first_probe = true; ++ rtx_insn *insn = get_last_insn (); ++ rtx_insn *deleted_debug_label = NULL; + +- /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for +- values of N from 1 until it exceeds SIZE. If only one probe is +- needed, this will not generate any code. Then adjust and probe +- to PROBE_INTERVAL + SIZE. */ +- for (i = get_probe_interval (); i < size; i += get_probe_interval ()) ++ /* Mach-O doesn't support labels at the end of objects, so if ++ it looks like we might want one, take special action. ++ First, collect any sequence of deleted debug labels. 
*/ ++ while (insn ++ && NOTE_P (insn) ++ && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL) + { +- if (first_probe) +- { +- adjust = 2 * get_probe_interval () + dope; +- first_probe = false; +- } +- else +- adjust = get_probe_interval (); +- +- emit_insn (gen_rtx_SET (stack_pointer_rtx, +- plus_constant (Pmode, stack_pointer_rtx, +- -adjust))); +- emit_stack_probe (stack_pointer_rtx); ++ /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL ++ notes only, instead set their CODE_LABEL_NUMBER to -1, ++ otherwise there would be code generation differences ++ in between -g and -g0. */ ++ if (NOTE_P (insn) && NOTE_KIND (insn) ++ == NOTE_INSN_DELETED_DEBUG_LABEL) ++ deleted_debug_label = insn; ++ insn = PREV_INSN (insn); + } + +- if (first_probe) +- adjust = size + get_probe_interval () + dope; +- else +- adjust = size + get_probe_interval () - i; +- +- emit_insn (gen_rtx_SET (stack_pointer_rtx, +- plus_constant (Pmode, stack_pointer_rtx, +- -adjust))); +- emit_stack_probe (stack_pointer_rtx); ++ /* If we have: ++ label: ++ barrier ++ then this needs to be detected, so skip past the barrier. */ + +- /* Adjust back to account for the additional first interval. */ +- last = emit_insn (gen_rtx_SET (stack_pointer_rtx, +- plus_constant (Pmode, stack_pointer_rtx, +- (get_probe_interval () +- + dope)))); +- } ++ if (insn && BARRIER_P (insn)) ++ insn = PREV_INSN (insn); + +- /* Otherwise, do the same as above, but in a loop. Note that we must be +- extra careful with variables wrapping around because we might be at +- the very top (or the very bottom) of the address space and we have +- to be able to handle this case properly; in particular, we use an +- equality test for the loop condition. */ +- else +- { +- /* We expect the GP registers to be saved when probes are used +- as the probing sequences might need a scratch register and +- the routine to allocate one assumes the integer registers +- have already been saved. */ +- gcc_assert (int_registers_saved); +- +- HOST_WIDE_INT rounded_size; +- struct scratch_reg sr; +- +- get_scratch_register_on_entry (&sr); +- +- /* If we needed to save a register, then account for any space +- that was pushed (we are not going to pop the register when +- we do the restore). */ +- if (sr.saved) +- size -= UNITS_PER_WORD; +- +- /* Step 1: round SIZE to the previous multiple of the interval. */ +- +- rounded_size = ROUND_DOWN (size, get_probe_interval ()); ++ /* Up to now we've only seen notes or barriers. */ ++ if (insn) ++ { ++ if (LABEL_P (insn) ++ || (NOTE_P (insn) ++ && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)) ++ /* Trailing label. */ ++ fputs ("\tnop\n", file); ++ else if (cfun && ! cfun->is_thunk) ++ { ++ /* See if we have a completely empty function body, skipping ++ the special case of the picbase thunk emitted as asm. */ ++ while (insn && ! INSN_P (insn)) ++ insn = PREV_INSN (insn); ++ /* If we don't find any insns, we've got an empty function body; ++ I.e. completely empty - without a return or branch. This is ++ taken as the case where a function body has been removed ++ because it contains an inline __builtin_unreachable(). GCC ++ declares that reaching __builtin_unreachable() means UB so ++ we're not obliged to do anything special; however, we want ++ non-zero-sized function bodies. To meet this, and help the ++ user out, let's trap the case. 
*/ ++ if (insn == NULL) ++ fputs ("\tud2\n", file); ++ } ++ } ++ else if (deleted_debug_label) ++ for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn)) ++ if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL) ++ CODE_LABEL_NUMBER (insn) = -1; ++ } ++} + ++/* Return a scratch register to use in the split stack prologue. The ++ split stack prologue is used for -fsplit-stack. It is the first ++ instructions in the function, even before the regular prologue. ++ The scratch register can be any caller-saved register which is not ++ used for parameters or for the static chain. */ + +- /* Step 2: compute initial and final value of the loop counter. */ ++static unsigned int ++split_stack_prologue_scratch_regno (void) ++{ ++ if (TARGET_64BIT) ++ return R11_REG; ++ else ++ { ++ bool is_fastcall, is_thiscall; ++ int regparm; + +- /* SP = SP_0 + PROBE_INTERVAL. */ +- emit_insn (gen_rtx_SET (stack_pointer_rtx, +- plus_constant (Pmode, stack_pointer_rtx, +- - (get_probe_interval () + dope)))); ++ is_fastcall = (lookup_attribute ("fastcall", ++ TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl))) ++ != NULL); ++ is_thiscall = (lookup_attribute ("thiscall", ++ TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl))) ++ != NULL); ++ regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl); + +- /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */ +- if (rounded_size <= (HOST_WIDE_INT_1 << 31)) +- emit_insn (gen_rtx_SET (sr.reg, +- plus_constant (Pmode, stack_pointer_rtx, +- -rounded_size))); ++ if (is_fastcall) ++ { ++ if (DECL_STATIC_CHAIN (cfun->decl)) ++ { ++ sorry ("%<-fsplit-stack%> does not support fastcall with " ++ "nested function"); ++ return INVALID_REGNUM; ++ } ++ return AX_REG; ++ } ++ else if (is_thiscall) ++ { ++ if (!DECL_STATIC_CHAIN (cfun->decl)) ++ return DX_REG; ++ return AX_REG; ++ } ++ else if (regparm < 3) ++ { ++ if (!DECL_STATIC_CHAIN (cfun->decl)) ++ return CX_REG; ++ else ++ { ++ if (regparm >= 2) ++ { ++ sorry ("%<-fsplit-stack%> does not support 2 register " ++ "parameters for a nested function"); ++ return INVALID_REGNUM; ++ } ++ return DX_REG; ++ } ++ } + else + { +- emit_move_insn (sr.reg, GEN_INT (-rounded_size)); +- emit_insn (gen_rtx_SET (sr.reg, +- gen_rtx_PLUS (Pmode, sr.reg, +- stack_pointer_rtx))); ++ /* FIXME: We could make this work by pushing a register ++ around the addition and comparison. */ ++ sorry ("%<-fsplit-stack%> does not support 3 register parameters"); ++ return INVALID_REGNUM; + } ++ } ++} + ++/* A SYMBOL_REF for the function which allocates new stackspace for ++ -fsplit-stack. */ + +- /* Step 3: the loop +- +- do +- { +- SP = SP + PROBE_INTERVAL +- probe at SP +- } +- while (SP != LAST_ADDR) +- +- adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for +- values of N from 1 until it is equal to ROUNDED_SIZE. */ +- +- emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx)); ++static GTY(()) rtx split_stack_fn; + ++/* A SYMBOL_REF for the more stack function when using the large ++ model. */ + +- /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot +- assert at compile-time that SIZE is equal to ROUNDED_SIZE. */ ++static GTY(()) rtx split_stack_fn_large; + +- if (size != rounded_size) +- { +- emit_insn (gen_rtx_SET (stack_pointer_rtx, +- plus_constant (Pmode, stack_pointer_rtx, +- rounded_size - size))); +- emit_stack_probe (stack_pointer_rtx); +- } ++/* Return location of the stack guard value in the TLS block. */ + +- /* Adjust back to account for the additional first interval. 
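The removed ix86_adjust_stack_and_probe above skips the probe for the first interval plus a four-word "dope", probes that far past SIZE, and then adjusts the stack pointer back, so the net allocation is exactly SIZE. A standalone check of that arithmetic (not part of the patch; the 4096 and 32 values merely stand in for get_probe_interval () and 4 * UNITS_PER_WORD):

    #include <assert.h>
    #include <stdio.h>

    /* Simulate the SP adjustments of the small-size path and return the net
       number of bytes allocated once the final "adjust back" has happened.  */
    static long net_allocation (long size, long interval, long dope)
    {
      long i, adjust, sp = 0;
      int first_probe = 1;

      for (i = interval; i < size; i += interval)
        {
          adjust = first_probe ? 2 * interval + dope : interval;
          first_probe = 0;
          sp -= adjust;                 /* allocate and probe at sp */
        }

      adjust = first_probe ? size + interval + dope : size + interval - i;
      sp -= adjust;                     /* final allocation and probe */

      sp += interval + dope;            /* adjust back for the skipped interval */
      return -sp;
    }

    int main (void)
    {
      assert (net_allocation (4000, 4096, 32) == 4000);
      assert (net_allocation (15000, 4096, 32) == 15000);
      printf ("net allocation equals the requested size\n");
      return 0;
    }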
*/ +- last = emit_insn (gen_rtx_SET (stack_pointer_rtx, +- plus_constant (Pmode, stack_pointer_rtx, +- (get_probe_interval () +- + dope)))); ++rtx ++ix86_split_stack_guard (void) ++{ ++ int offset; ++ addr_space_t as = DEFAULT_TLS_SEG_REG; ++ rtx r; + +- /* This does not deallocate the space reserved for the scratch +- register. That will be deallocated in the epilogue. */ +- release_scratch_register_on_entry (&sr, size, false); +- } ++ gcc_assert (flag_split_stack); + +- /* Even if the stack pointer isn't the CFA register, we need to correctly +- describe the adjustments made to it, in particular differentiate the +- frame-related ones from the frame-unrelated ones. */ +- if (size > 0) +- { +- rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2)); +- XVECEXP (expr, 0, 0) +- = gen_rtx_SET (stack_pointer_rtx, +- plus_constant (Pmode, stack_pointer_rtx, -size)); +- XVECEXP (expr, 0, 1) +- = gen_rtx_SET (stack_pointer_rtx, +- plus_constant (Pmode, stack_pointer_rtx, +- get_probe_interval () + dope + size)); +- add_reg_note (last, REG_FRAME_RELATED_EXPR, expr); +- RTX_FRAME_RELATED_P (last) = 1; ++#ifdef TARGET_THREAD_SPLIT_STACK_OFFSET ++ offset = TARGET_THREAD_SPLIT_STACK_OFFSET; ++#else ++ gcc_unreachable (); ++#endif + +- cfun->machine->fs.sp_offset += size; +- } ++ r = GEN_INT (offset); ++ r = gen_const_mem (Pmode, r); ++ set_mem_addr_space (r, as); + +- /* Make sure nothing is scheduled before we are done. */ +- emit_insn (gen_blockage ()); ++ return r; + } + +-/* Adjust the stack pointer up to REG while probing it. */ ++/* Handle -fsplit-stack. These are the first instructions in the ++ function, even before the regular prologue. */ + +-const char * +-output_adjust_stack_and_probe (rtx reg) ++void ++ix86_expand_split_stack_prologue (void) + { +- static int labelno = 0; +- char loop_lab[32]; +- rtx xops[2]; +- +- ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++); ++ HOST_WIDE_INT allocate; ++ unsigned HOST_WIDE_INT args_size; ++ rtx_code_label *label; ++ rtx limit, current, allocate_rtx, call_fusage; ++ rtx_insn *call_insn; ++ rtx scratch_reg = NULL_RTX; ++ rtx_code_label *varargs_label = NULL; ++ rtx fn; + +- /* Loop. */ +- ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab); ++ gcc_assert (flag_split_stack && reload_completed); + +- /* SP = SP + PROBE_INTERVAL. */ +- xops[0] = stack_pointer_rtx; +- xops[1] = GEN_INT (get_probe_interval ()); +- output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops); ++ ix86_finalize_stack_frame_flags (); ++ struct ix86_frame &frame = cfun->machine->frame; ++ allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET; + +- /* Probe at SP. */ +- xops[1] = const0_rtx; +- output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops); ++ /* This is the label we will branch to if we have enough stack ++ space. We expect the basic block reordering pass to reverse this ++ branch if optimizing, so that we branch in the unlikely case. */ ++ label = gen_label_rtx (); + +- /* Test if SP == LAST_ADDR. */ +- xops[0] = stack_pointer_rtx; +- xops[1] = reg; +- output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops); ++ /* We need to compare the stack pointer minus the frame size with ++ the stack boundary in the TCB. The stack boundary always gives ++ us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we ++ can compare directly. Otherwise we need to do an addition. */ + +- /* Branch. 
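output_adjust_stack_and_probe above emits a sub/probe/cmp loop that terminates on inequality rather than on an ordering test; as the comments in the surrounding hunks note, the equality test is what keeps the loop correct if the probed region wraps around the address space. A standalone sketch of that point (not part of the patch; the 16-bit type is only used to make the wrap easy to trigger):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    int main (void)
    {
      const uint16_t interval = 0x100;
      uint16_t sp = 0x0200;                 /* near the bottom of the space */
      uint16_t last = sp - 0x0800;          /* wraps around to 0xfa00 */
      unsigned probes = 0;

      do
        {
          sp -= interval;                   /* sub; probe at (sp) */
          probes++;
        }
      while (sp != last);                   /* cmp; jne */

      assert (probes == 0x0800 / interval); /* exactly rounded_size / interval */
      printf ("%u probes, final sp = 0x%04x\n", probes, sp);

      /* An ordering test such as "while (sp > last)" would already be false
         after the first iteration here, because LAST wrapped above SP.  */
      return 0;
    }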
*/ +- fputs ("\tjne\t", asm_out_file); +- assemble_name_raw (asm_out_file, loop_lab); +- fputc ('\n', asm_out_file); ++ limit = ix86_split_stack_guard (); + +- return ""; +-} ++ if (allocate < SPLIT_STACK_AVAILABLE) ++ current = stack_pointer_rtx; ++ else ++ { ++ unsigned int scratch_regno; ++ rtx offset; + +-/* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE, +- inclusive. These are offsets from the current stack pointer. +- +- INT_REGISTERS_SAVED is true if integer registers have already been +- pushed on the stack. */ ++ /* We need a scratch register to hold the stack pointer minus ++ the required frame size. Since this is the very start of the ++ function, the scratch register can be any caller-saved ++ register which is not used for parameters. */ ++ offset = GEN_INT (- allocate); ++ scratch_regno = split_stack_prologue_scratch_regno (); ++ if (scratch_regno == INVALID_REGNUM) ++ return; ++ scratch_reg = gen_rtx_REG (Pmode, scratch_regno); ++ if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode)) ++ { ++ /* We don't use ix86_gen_add3 in this case because it will ++ want to split to lea, but when not optimizing the insn ++ will not be split after this point. */ ++ emit_insn (gen_rtx_SET (scratch_reg, ++ gen_rtx_PLUS (Pmode, stack_pointer_rtx, ++ offset))); ++ } ++ else ++ { ++ emit_move_insn (scratch_reg, offset); ++ emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg, ++ stack_pointer_rtx)); ++ } ++ current = scratch_reg; ++ } + +-static void +-ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size, +- const bool int_registers_saved) +-{ +- /* See if we have a constant small number of probes to generate. If so, +- that's the easy case. The run-time loop is made up of 6 insns in the +- generic case while the compile-time loop is made up of n insns for n # +- of intervals. */ +- if (size <= 6 * get_probe_interval ()) +- { +- HOST_WIDE_INT i; ++ ix86_expand_branch (GEU, current, limit, label); ++ rtx_insn *jump_insn = get_last_insn (); ++ JUMP_LABEL (jump_insn) = label; + +- /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until +- it exceeds SIZE. If only one probe is needed, this will not +- generate any code. Then probe at FIRST + SIZE. */ +- for (i = get_probe_interval (); i < size; i += get_probe_interval ()) +- emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx, +- -(first + i))); ++ /* Mark the jump as very likely to be taken. */ ++ add_reg_br_prob_note (jump_insn, profile_probability::very_likely ()); + +- emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx, +- -(first + size))); ++ if (split_stack_fn == NULL_RTX) ++ { ++ split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack"); ++ SYMBOL_REF_FLAGS (split_stack_fn) |= SYMBOL_FLAG_LOCAL; + } ++ fn = split_stack_fn; + +- /* Otherwise, do the same as above, but in a loop. Note that we must be +- extra careful with variables wrapping around because we might be at +- the very top (or the very bottom) of the address space and we have +- to be able to handle this case properly; in particular, we use an +- equality test for the loop condition. */ +- else ++ /* Get more stack space. We pass in the desired stack space and the ++ size of the arguments to copy to the new stack. In 32-bit mode ++ we push the parameters; __morestack will return on a new stack ++ anyhow. In 64-bit mode we pass the parameters in r10 and ++ r11. */ ++ allocate_rtx = GEN_INT (allocate); ++ args_size = crtl->args.size >= 0 ? 
(HOST_WIDE_INT) crtl->args.size : 0; ++ call_fusage = NULL_RTX; ++ rtx pop = NULL_RTX; ++ if (TARGET_64BIT) + { +- /* We expect the GP registers to be saved when probes are used +- as the probing sequences might need a scratch register and +- the routine to allocate one assumes the integer registers +- have already been saved. */ +- gcc_assert (int_registers_saved); ++ rtx reg10, reg11; + +- HOST_WIDE_INT rounded_size, last; +- struct scratch_reg sr; ++ reg10 = gen_rtx_REG (Pmode, R10_REG); ++ reg11 = gen_rtx_REG (Pmode, R11_REG); + +- get_scratch_register_on_entry (&sr); ++ /* If this function uses a static chain, it will be in %r10. ++ Preserve it across the call to __morestack. */ ++ if (DECL_STATIC_CHAIN (cfun->decl)) ++ { ++ rtx rax; + ++ rax = gen_rtx_REG (word_mode, AX_REG); ++ emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG)); ++ use_reg (&call_fusage, rax); ++ } + +- /* Step 1: round SIZE to the previous multiple of the interval. */ ++ if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC) ++ && !TARGET_PECOFF) ++ { ++ HOST_WIDE_INT argval; + +- rounded_size = ROUND_DOWN (size, get_probe_interval ()); ++ gcc_assert (Pmode == DImode); ++ /* When using the large model we need to load the address ++ into a register, and we've run out of registers. So we ++ switch to a different calling convention, and we call a ++ different function: __morestack_large. We pass the ++ argument size in the upper 32 bits of r10 and pass the ++ frame size in the lower 32 bits. */ ++ gcc_assert ((allocate & HOST_WIDE_INT_C (0xffffffff)) == allocate); ++ gcc_assert ((args_size & 0xffffffff) == args_size); + ++ if (split_stack_fn_large == NULL_RTX) ++ { ++ split_stack_fn_large ++ = gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model"); ++ SYMBOL_REF_FLAGS (split_stack_fn_large) |= SYMBOL_FLAG_LOCAL; ++ } ++ if (ix86_cmodel == CM_LARGE_PIC) ++ { ++ rtx_code_label *label; ++ rtx x; + +- /* Step 2: compute initial and final value of the loop counter. */ ++ label = gen_label_rtx (); ++ emit_label (label); ++ LABEL_PRESERVE_P (label) = 1; ++ emit_insn (gen_set_rip_rex64 (reg10, label)); ++ emit_insn (gen_set_got_offset_rex64 (reg11, label)); ++ emit_insn (ix86_gen_add3 (reg10, reg10, reg11)); ++ x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large), ++ UNSPEC_GOT); ++ x = gen_rtx_CONST (Pmode, x); ++ emit_move_insn (reg11, x); ++ x = gen_rtx_PLUS (Pmode, reg10, reg11); ++ x = gen_const_mem (Pmode, x); ++ emit_move_insn (reg11, x); ++ } ++ else ++ emit_move_insn (reg11, split_stack_fn_large); + +- /* TEST_OFFSET = FIRST. */ +- emit_move_insn (sr.reg, GEN_INT (-first)); ++ fn = reg11; + +- /* LAST_OFFSET = FIRST + ROUNDED_SIZE. 
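When the large code model forces the __morestack_large_model convention described above, the argument size is passed in the upper 32 bits of r10 and the frame size in the lower 32 bits; the hunk packs them as ((args_size << 16) << 16) + allocate, which for a 64-bit operand is simply a 32-bit left shift. A standalone sketch of that packing and the matching unpacking (not part of the patch; the sample sizes are arbitrary):

    #include <assert.h>
    #include <stdint.h>

    int main (void)
    {
      uint64_t args_size = 48;       /* bytes of stack arguments to copy */
      uint64_t allocate  = 0x2000;   /* requested frame size */

      /* Both values must fit in 32 bits, as the hunk's gcc_asserts require.  */
      assert (args_size <= 0xffffffffu && allocate <= 0xffffffffu);

      uint64_t argval = ((args_size << 16) << 16) + allocate;

      assert (argval >> 32 == args_size);            /* upper half: arg size */
      assert ((argval & 0xffffffffu) == allocate);   /* lower half: frame size */
      return 0;
    }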
*/ +- last = first + rounded_size; ++ argval = ((args_size << 16) << 16) + allocate; ++ emit_move_insn (reg10, GEN_INT (argval)); ++ } ++ else ++ { ++ emit_move_insn (reg10, allocate_rtx); ++ emit_move_insn (reg11, GEN_INT (args_size)); ++ use_reg (&call_fusage, reg11); ++ } + ++ use_reg (&call_fusage, reg10); ++ } ++ else ++ { ++ rtx_insn *insn = emit_insn (gen_push (GEN_INT (args_size))); ++ add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (UNITS_PER_WORD)); ++ insn = emit_insn (gen_push (allocate_rtx)); ++ add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (2 * UNITS_PER_WORD)); ++ pop = GEN_INT (2 * UNITS_PER_WORD); ++ } ++ call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn), ++ GEN_INT (UNITS_PER_WORD), constm1_rtx, ++ pop, false); ++ add_function_usage_to (call_insn, call_fusage); ++ if (!TARGET_64BIT) ++ add_reg_note (call_insn, REG_ARGS_SIZE, GEN_INT (0)); ++ /* Indicate that this function can't jump to non-local gotos. */ ++ make_reg_eh_region_note_nothrow_nononlocal (call_insn); + +- /* Step 3: the loop ++ /* In order to make call/return prediction work right, we now need ++ to execute a return instruction. See ++ libgcc/config/i386/morestack.S for the details on how this works. + +- do +- { +- TEST_ADDR = TEST_ADDR + PROBE_INTERVAL +- probe at TEST_ADDR +- } +- while (TEST_ADDR != LAST_ADDR) ++ For flow purposes gcc must not see this as a return ++ instruction--we need control flow to continue at the subsequent ++ label. Therefore, we use an unspec. */ ++ gcc_assert (crtl->args.pops_args < 65536); ++ rtx_insn *ret_insn ++ = emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args))); + +- probes at FIRST + N * PROBE_INTERVAL for values of N from 1 +- until it is equal to ROUNDED_SIZE. */ ++ if ((flag_cf_protection & CF_BRANCH)) ++ { ++ /* Insert ENDBR since __morestack will jump back here via indirect ++ call. */ ++ rtx cet_eb = gen_nop_endbr (); ++ emit_insn_after (cet_eb, ret_insn); ++ } + +- emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last))); ++ /* If we are in 64-bit mode and this function uses a static chain, ++ we saved %r10 in %rax before calling _morestack. */ ++ if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl)) ++ emit_move_insn (gen_rtx_REG (word_mode, R10_REG), ++ gen_rtx_REG (word_mode, AX_REG)); + ++ /* If this function calls va_start, we need to store a pointer to ++ the arguments on the old stack, because they may not have been ++ all copied to the new stack. At this point the old stack can be ++ found at the frame pointer value used by __morestack, because ++ __morestack has set that up before calling back to us. Here we ++ store that pointer in a scratch register, and in ++ ix86_expand_prologue we store the scratch register in a stack ++ slot. */ ++ if (cfun->machine->split_stack_varargs_pointer != NULL_RTX) ++ { ++ unsigned int scratch_regno; ++ rtx frame_reg; ++ int words; + +- /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time +- that SIZE is equal to ROUNDED_SIZE. */ ++ scratch_regno = split_stack_prologue_scratch_regno (); ++ scratch_reg = gen_rtx_REG (Pmode, scratch_regno); ++ frame_reg = gen_rtx_REG (Pmode, BP_REG); + +- if (size != rounded_size) +- emit_stack_probe (plus_constant (Pmode, +- gen_rtx_PLUS (Pmode, +- stack_pointer_rtx, +- sr.reg), +- rounded_size - size)); ++ /* 64-bit: ++ fp -> old fp value ++ return address within this function ++ return address of caller of this function ++ stack arguments ++ So we add three words to get to the stack arguments. 
+ +- release_scratch_register_on_entry (&sr, size, true); +- } ++ 32-bit: ++ fp -> old fp value ++ return address within this function ++ first argument to __morestack ++ second argument to __morestack ++ return address of caller of this function ++ stack arguments ++ So we add five words to get to the stack arguments. ++ */ ++ words = TARGET_64BIT ? 3 : 5; ++ emit_insn (gen_rtx_SET (scratch_reg, ++ gen_rtx_PLUS (Pmode, frame_reg, ++ GEN_INT (words * UNITS_PER_WORD)))); + +- /* Make sure nothing is scheduled before we are done. */ +- emit_insn (gen_blockage ()); +-} ++ varargs_label = gen_label_rtx (); ++ emit_jump_insn (gen_jump (varargs_label)); ++ JUMP_LABEL (get_last_insn ()) = varargs_label; + +-/* Probe a range of stack addresses from REG to END, inclusive. These are +- offsets from the current stack pointer. */ ++ emit_barrier (); ++ } + +-const char * +-output_probe_stack_range (rtx reg, rtx end) +-{ +- static int labelno = 0; +- char loop_lab[32]; +- rtx xops[3]; +- +- ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++); +- +- /* Loop. */ +- ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab); +- +- /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */ +- xops[0] = reg; +- xops[1] = GEN_INT (get_probe_interval ()); +- output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops); +- +- /* Probe at TEST_ADDR. */ +- xops[0] = stack_pointer_rtx; +- xops[1] = reg; +- xops[2] = const0_rtx; +- output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops); +- +- /* Test if TEST_ADDR == LAST_ADDR. */ +- xops[0] = reg; +- xops[1] = end; +- output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops); ++ emit_label (label); ++ LABEL_NUSES (label) = 1; + +- /* Branch. */ +- fputs ("\tjne\t", asm_out_file); +- assemble_name_raw (asm_out_file, loop_lab); +- fputc ('\n', asm_out_file); ++ /* If this function calls va_start, we now have to set the scratch ++ register for the case where we do not call __morestack. In this ++ case we need to set it based on the stack pointer. */ ++ if (cfun->machine->split_stack_varargs_pointer != NULL_RTX) ++ { ++ emit_insn (gen_rtx_SET (scratch_reg, ++ gen_rtx_PLUS (Pmode, stack_pointer_rtx, ++ GEN_INT (UNITS_PER_WORD)))); + +- return ""; ++ emit_label (varargs_label); ++ LABEL_NUSES (varargs_label) = 1; ++ } + } + +-/* Return true if stack frame is required. Update STACK_ALIGNMENT +- to the largest alignment, in bits, of stack slot used if stack +- frame is required and CHECK_STACK_SLOT is true. */ ++/* We may have to tell the dataflow pass that the split stack prologue ++ is initializing a scratch register. */ + +-static bool +-ix86_find_max_used_stack_alignment (unsigned int &stack_alignment, +- bool check_stack_slot) ++static void ++ix86_live_on_entry (bitmap regs) + { +- HARD_REG_SET set_up_by_prologue, prologue_used; +- basic_block bb; +- +- CLEAR_HARD_REG_SET (prologue_used); +- CLEAR_HARD_REG_SET (set_up_by_prologue); +- add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM); +- add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM); +- add_to_hard_reg_set (&set_up_by_prologue, Pmode, +- HARD_FRAME_POINTER_REGNUM); +- +- /* The preferred stack alignment is the minimum stack alignment. 
*/ +- if (stack_alignment > crtl->preferred_stack_boundary) +- stack_alignment = crtl->preferred_stack_boundary; +- +- bool require_stack_frame = false; +- +- FOR_EACH_BB_FN (bb, cfun) ++ if (cfun->machine->split_stack_varargs_pointer != NULL_RTX) + { +- rtx_insn *insn; +- FOR_BB_INSNS (bb, insn) +- if (NONDEBUG_INSN_P (insn) +- && requires_stack_frame_p (insn, prologue_used, +- set_up_by_prologue)) +- { +- require_stack_frame = true; +- +- if (check_stack_slot) +- { +- /* Find the maximum stack alignment. */ +- subrtx_iterator::array_type array; +- FOR_EACH_SUBRTX (iter, array, PATTERN (insn), ALL) +- if (MEM_P (*iter) +- && (reg_mentioned_p (stack_pointer_rtx, +- *iter) +- || reg_mentioned_p (frame_pointer_rtx, +- *iter))) +- { +- unsigned int alignment = MEM_ALIGN (*iter); +- if (alignment > stack_alignment) +- stack_alignment = alignment; +- } +- } +- } ++ gcc_assert (flag_split_stack); ++ bitmap_set_bit (regs, split_stack_prologue_scratch_regno ()); + } +- +- return require_stack_frame; + } ++ ++/* Extract the parts of an RTL expression that is a valid memory address ++ for an instruction. Return 0 if the structure of the address is ++ grossly off. Return -1 if the address contains ASHIFT, so it is not ++ strictly valid, but still used for computing length of lea instruction. */ + +-/* Finalize stack_realign_needed and frame_pointer_needed flags, which +- will guide prologue/epilogue to be generated in correct form. */ +- +-static void +-ix86_finalize_stack_frame_flags (void) ++int ++ix86_decompose_address (rtx addr, struct ix86_address *out) + { +- /* Check if stack realign is really needed after reload, and +- stores result in cfun */ +- unsigned int incoming_stack_boundary +- = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary +- ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary); +- unsigned int stack_alignment +- = (crtl->is_leaf && !ix86_current_function_calls_tls_descriptor +- ? crtl->max_used_stack_slot_alignment +- : crtl->stack_alignment_needed); +- unsigned int stack_realign +- = (incoming_stack_boundary < stack_alignment); +- bool recompute_frame_layout_p = false; ++ rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX; ++ rtx base_reg, index_reg; ++ HOST_WIDE_INT scale = 1; ++ rtx scale_rtx = NULL_RTX; ++ rtx tmp; ++ int retval = 1; ++ addr_space_t seg = ADDR_SPACE_GENERIC; + +- if (crtl->stack_realign_finalized) ++ /* Allow zero-extended SImode addresses, ++ they will be emitted with addr32 prefix. */ ++ if (TARGET_64BIT && GET_MODE (addr) == DImode) + { +- /* After stack_realign_needed is finalized, we can't no longer +- change it. */ +- gcc_assert (crtl->stack_realign_needed == stack_realign); +- return; ++ if (GET_CODE (addr) == ZERO_EXTEND ++ && GET_MODE (XEXP (addr, 0)) == SImode) ++ { ++ addr = XEXP (addr, 0); ++ if (CONST_INT_P (addr)) ++ return 0; ++ } ++ else if (GET_CODE (addr) == AND ++ && const_32bit_mask (XEXP (addr, 1), DImode)) ++ { ++ addr = lowpart_subreg (SImode, XEXP (addr, 0), DImode); ++ if (addr == NULL_RTX) ++ return 0; ++ ++ if (CONST_INT_P (addr)) ++ return 0; ++ } + } + +- /* If the only reason for frame_pointer_needed is that we conservatively +- assumed stack realignment might be needed or -fno-omit-frame-pointer +- is used, but in the end nothing that needed the stack alignment had +- been spilled nor stack access, clear frame_pointer_needed and say we +- don't need stack realignment. 
*/ +- if ((stack_realign || (!flag_omit_frame_pointer && optimize)) +- && frame_pointer_needed +- && crtl->is_leaf +- && crtl->sp_is_unchanging +- && !ix86_current_function_calls_tls_descriptor +- && !crtl->accesses_prior_frames +- && !cfun->calls_alloca +- && !crtl->calls_eh_return +- /* See ira_setup_eliminable_regset for the rationale. */ +- && !(STACK_CHECK_MOVING_SP +- && flag_stack_check +- && flag_exceptions +- && cfun->can_throw_non_call_exceptions) +- && !ix86_frame_pointer_required () +- && get_frame_size () == 0 +- && ix86_nsaved_sseregs () == 0 +- && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0) ++ /* Allow SImode subregs of DImode addresses, ++ they will be emitted with addr32 prefix. */ ++ if (TARGET_64BIT && GET_MODE (addr) == SImode) + { +- if (ix86_find_max_used_stack_alignment (stack_alignment, +- stack_realign)) ++ if (SUBREG_P (addr) ++ && GET_MODE (SUBREG_REG (addr)) == DImode) + { +- /* Stack frame is required. If stack alignment needed is less +- than incoming stack boundary, don't realign stack. */ +- stack_realign = incoming_stack_boundary < stack_alignment; +- if (!stack_realign) +- { +- crtl->max_used_stack_slot_alignment +- = incoming_stack_boundary; +- crtl->stack_alignment_needed +- = incoming_stack_boundary; +- /* Also update preferred_stack_boundary for leaf +- functions. */ +- crtl->preferred_stack_boundary +- = incoming_stack_boundary; +- } ++ addr = SUBREG_REG (addr); ++ if (CONST_INT_P (addr)) ++ return 0; + } ++ } ++ ++ if (REG_P (addr)) ++ base = addr; ++ else if (SUBREG_P (addr)) ++ { ++ if (REG_P (SUBREG_REG (addr))) ++ base = addr; + else ++ return 0; ++ } ++ else if (GET_CODE (addr) == PLUS) ++ { ++ rtx addends[4], op; ++ int n = 0, i; ++ ++ op = addr; ++ do + { +- /* If drap has been set, but it actually isn't live at the +- start of the function, there is no reason to set it up. */ +- if (crtl->drap_reg) ++ if (n >= 4) ++ return 0; ++ addends[n++] = XEXP (op, 1); ++ op = XEXP (op, 0); ++ } ++ while (GET_CODE (op) == PLUS); ++ if (n >= 4) ++ return 0; ++ addends[n] = op; ++ ++ for (i = n; i >= 0; --i) ++ { ++ op = addends[i]; ++ switch (GET_CODE (op)) + { +- basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb; +- if (! REGNO_REG_SET_P (DF_LR_IN (bb), +- REGNO (crtl->drap_reg))) +- { +- crtl->drap_reg = NULL_RTX; +- crtl->need_drap = false; +- } +- } +- else +- cfun->machine->no_drap_save_restore = true; ++ case MULT: ++ if (index) ++ return 0; ++ index = XEXP (op, 0); ++ scale_rtx = XEXP (op, 1); ++ break; + +- frame_pointer_needed = false; +- stack_realign = false; +- crtl->max_used_stack_slot_alignment = incoming_stack_boundary; +- crtl->stack_alignment_needed = incoming_stack_boundary; +- crtl->stack_alignment_estimated = incoming_stack_boundary; +- if (crtl->preferred_stack_boundary > incoming_stack_boundary) +- crtl->preferred_stack_boundary = incoming_stack_boundary; +- df_finish_pass (true); +- df_scan_alloc (NULL); +- df_scan_blocks (); +- df_compute_regs_ever_live (true); +- df_analyze (); ++ case ASHIFT: ++ if (index) ++ return 0; ++ index = XEXP (op, 0); ++ tmp = XEXP (op, 1); ++ if (!CONST_INT_P (tmp)) ++ return 0; ++ scale = INTVAL (tmp); ++ if ((unsigned HOST_WIDE_INT) scale > 3) ++ return 0; ++ scale = 1 << scale; ++ break; + +- if (flag_var_tracking) +- { +- /* Since frame pointer is no longer available, replace it with +- stack pointer - UNITS_PER_WORD in debug insns. 
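In the ASHIFT case above, an address term of the form (reg << n) is only accepted when the shift count is 0..3, which maps to the hardware scales 1, 2, 4 and 8 via scale = 1 << n. A minimal standalone version of that check (not part of the patch; the function name is an illustrative stand-in):

    #include <stdio.h>

    /* Return the address scale encoded by shift COUNT, or 0 when the shift
       cannot be expressed in an x86 addressing mode.  */
    static int shift_to_scale (unsigned long count)
    {
      if (count > 3)
        return 0;
      return 1 << count;
    }

    int main (void)
    {
      for (unsigned long c = 0; c <= 4; c++)
        printf ("shift by %lu -> scale %d\n", c, shift_to_scale (c));
      return 0;
    }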
*/ +- df_ref ref, next; +- for (ref = DF_REG_USE_CHAIN (HARD_FRAME_POINTER_REGNUM); +- ref; ref = next) +- { +- next = DF_REF_NEXT_REG (ref); +- if (!DF_REF_INSN_INFO (ref)) +- continue; ++ case ZERO_EXTEND: ++ op = XEXP (op, 0); ++ if (GET_CODE (op) != UNSPEC) ++ return 0; ++ /* FALLTHRU */ + +- /* Make sure the next ref is for a different instruction, +- so that we're not affected by the rescan. */ +- rtx_insn *insn = DF_REF_INSN (ref); +- while (next && DF_REF_INSN (next) == insn) +- next = DF_REF_NEXT_REG (next); ++ case UNSPEC: ++ if (XINT (op, 1) == UNSPEC_TP ++ && TARGET_TLS_DIRECT_SEG_REFS ++ && seg == ADDR_SPACE_GENERIC) ++ seg = DEFAULT_TLS_SEG_REG; ++ else ++ return 0; ++ break; + +- if (DEBUG_INSN_P (insn)) +- { +- bool changed = false; +- for (; ref != next; ref = DF_REF_NEXT_REG (ref)) +- { +- rtx *loc = DF_REF_LOC (ref); +- if (*loc == hard_frame_pointer_rtx) +- { +- *loc = plus_constant (Pmode, +- stack_pointer_rtx, +- -UNITS_PER_WORD); +- changed = true; +- } +- } +- if (changed) +- df_insn_rescan (insn); +- } +- } +- } ++ case SUBREG: ++ if (!REG_P (SUBREG_REG (op))) ++ return 0; ++ /* FALLTHRU */ + +- recompute_frame_layout_p = true; ++ case REG: ++ if (!base) ++ base = op; ++ else if (!index) ++ index = op; ++ else ++ return 0; ++ break; ++ ++ case CONST: ++ case CONST_INT: ++ case SYMBOL_REF: ++ case LABEL_REF: ++ if (disp) ++ return 0; ++ disp = op; ++ break; ++ ++ default: ++ return 0; ++ } + } + } +- else if (crtl->max_used_stack_slot_alignment >= 128) ++ else if (GET_CODE (addr) == MULT) + { +- /* We don't need to realign stack. max_used_stack_alignment is +- used to decide how stack frame should be aligned. This is +- independent of any psABIs nor 32-bit vs 64-bit. It is always +- safe to compute max_used_stack_alignment. We compute it only +- if 128-bit aligned load/store may be generated on misaligned +- stack slot which will lead to segfault. */ +- if (ix86_find_max_used_stack_alignment (stack_alignment, true)) +- cfun->machine->max_used_stack_alignment +- = stack_alignment / BITS_PER_UNIT; ++ index = XEXP (addr, 0); /* index*scale */ ++ scale_rtx = XEXP (addr, 1); + } ++ else if (GET_CODE (addr) == ASHIFT) ++ { ++ /* We're called for lea too, which implements ashift on occasion. */ ++ index = XEXP (addr, 0); ++ tmp = XEXP (addr, 1); ++ if (!CONST_INT_P (tmp)) ++ return 0; ++ scale = INTVAL (tmp); ++ if ((unsigned HOST_WIDE_INT) scale > 3) ++ return 0; ++ scale = 1 << scale; ++ retval = -1; ++ } ++ else ++ disp = addr; /* displacement */ + +- if (crtl->stack_realign_needed != stack_realign) +- recompute_frame_layout_p = true; +- crtl->stack_realign_needed = stack_realign; +- crtl->stack_realign_finalized = true; +- if (recompute_frame_layout_p) +- ix86_compute_frame_layout (); +-} +- +-/* Delete SET_GOT right after entry block if it is allocated to reg. 
*/ +- +-static void +-ix86_elim_entry_set_got (rtx reg) +-{ +- basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb; +- rtx_insn *c_insn = BB_HEAD (bb); +- if (!NONDEBUG_INSN_P (c_insn)) +- c_insn = next_nonnote_nondebug_insn (c_insn); +- if (c_insn && NONJUMP_INSN_P (c_insn)) ++ if (index) + { +- rtx pat = PATTERN (c_insn); +- if (GET_CODE (pat) == PARALLEL) +- { +- rtx vec = XVECEXP (pat, 0, 0); +- if (GET_CODE (vec) == SET +- && XINT (XEXP (vec, 1), 1) == UNSPEC_SET_GOT +- && REGNO (XEXP (vec, 0)) == REGNO (reg)) +- delete_insn (c_insn); +- } ++ if (REG_P (index)) ++ ; ++ else if (SUBREG_P (index) ++ && REG_P (SUBREG_REG (index))) ++ ; ++ else ++ return 0; + } +-} + +-static rtx +-gen_frame_set (rtx reg, rtx frame_reg, int offset, bool store) +-{ +- rtx addr, mem; ++ /* Extract the integral value of scale. */ ++ if (scale_rtx) ++ { ++ if (!CONST_INT_P (scale_rtx)) ++ return 0; ++ scale = INTVAL (scale_rtx); ++ } + +- if (offset) +- addr = gen_rtx_PLUS (Pmode, frame_reg, GEN_INT (offset)); +- mem = gen_frame_mem (GET_MODE (reg), offset ? addr : frame_reg); +- return gen_rtx_SET (store ? mem : reg, store ? reg : mem); +-} ++ base_reg = base && SUBREG_P (base) ? SUBREG_REG (base) : base; ++ index_reg = index && SUBREG_P (index) ? SUBREG_REG (index) : index; + +-static inline rtx +-gen_frame_load (rtx reg, rtx frame_reg, int offset) +-{ +- return gen_frame_set (reg, frame_reg, offset, false); +-} ++ /* Avoid useless 0 displacement. */ ++ if (disp == const0_rtx && (base || index)) ++ disp = NULL_RTX; + +-static inline rtx +-gen_frame_store (rtx reg, rtx frame_reg, int offset) +-{ +- return gen_frame_set (reg, frame_reg, offset, true); +-} ++ /* Allow arg pointer and stack pointer as index if there is not scaling. */ ++ if (base_reg && index_reg && scale == 1 ++ && (REGNO (index_reg) == ARG_POINTER_REGNUM ++ || REGNO (index_reg) == FRAME_POINTER_REGNUM ++ || REGNO (index_reg) == SP_REG)) ++ { ++ std::swap (base, index); ++ std::swap (base_reg, index_reg); ++ } + +-static void +-ix86_emit_outlined_ms2sysv_save (const struct ix86_frame &frame) +-{ +- struct machine_function *m = cfun->machine; +- const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS +- + m->call_ms2sysv_extra_regs; +- rtvec v = rtvec_alloc (ncregs + 1); +- unsigned int align, i, vi = 0; +- rtx_insn *insn; +- rtx sym, addr; +- rtx rax = gen_rtx_REG (word_mode, AX_REG); +- const struct xlogue_layout &xlogue = xlogue_layout::get_instance (); ++ /* Special case: %ebp cannot be encoded as a base without a displacement. ++ Similarly %r13. */ ++ if (!disp && base_reg ++ && (REGNO (base_reg) == ARG_POINTER_REGNUM ++ || REGNO (base_reg) == FRAME_POINTER_REGNUM ++ || REGNO (base_reg) == BP_REG ++ || REGNO (base_reg) == R13_REG)) ++ disp = const0_rtx; + +- /* AL should only be live with sysv_abi. */ +- gcc_assert (!ix86_eax_live_at_start_p ()); +- gcc_assert (m->fs.sp_offset >= frame.sse_reg_save_offset); ++ /* Special case: on K6, [%esi] makes the instruction vector decoded. ++ Avoid this by transforming to [%esi+0]. ++ Reload calls address legitimization without cfun defined, so we need ++ to test cfun for being non-NULL. */ ++ if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun) ++ && base_reg && !index_reg && !disp ++ && REGNO (base_reg) == SI_REG) ++ disp = const0_rtx; + +- /* Setup RAX as the stub's base pointer. We use stack_realign_offset rather +- we've actually realigned the stack or not. 
*/ +- align = GET_MODE_ALIGNMENT (V4SFmode); +- addr = choose_baseaddr (frame.stack_realign_offset +- + xlogue.get_stub_ptr_offset (), &align, AX_REG); +- gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode)); ++ /* Special case: encode reg+reg instead of reg*2. */ ++ if (!base && index && scale == 2) ++ base = index, base_reg = index_reg, scale = 1; + +- emit_insn (gen_rtx_SET (rax, addr)); ++ /* Special case: scaling cannot be encoded without base or displacement. */ ++ if (!base && !disp && index && scale != 1) ++ disp = const0_rtx; + +- /* Get the stub symbol. */ +- sym = xlogue.get_stub_rtx (frame_pointer_needed ? XLOGUE_STUB_SAVE_HFP +- : XLOGUE_STUB_SAVE); +- RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym); ++ out->base = base; ++ out->index = index; ++ out->disp = disp; ++ out->scale = scale; ++ out->seg = seg; + +- for (i = 0; i < ncregs; ++i) +- { +- const xlogue_layout::reginfo &r = xlogue.get_reginfo (i); +- rtx reg = gen_rtx_REG ((SSE_REGNO_P (r.regno) ? V4SFmode : word_mode), +- r.regno); +- RTVEC_ELT (v, vi++) = gen_frame_store (reg, rax, -r.offset); +- } +- +- gcc_assert (vi == (unsigned)GET_NUM_ELEM (v)); +- +- insn = emit_insn (gen_rtx_PARALLEL (VOIDmode, v)); +- RTX_FRAME_RELATED_P (insn) = true; ++ return retval; + } ++ ++/* Return cost of the memory address x. ++ For i386, it is better to use a complex address than let gcc copy ++ the address into a reg and make a new pseudo. But not if the address ++ requires to two regs - that would mean more pseudos with longer ++ lifetimes. */ ++static int ++ix86_address_cost (rtx x, machine_mode, addr_space_t, bool) ++{ ++ struct ix86_address parts; ++ int cost = 1; ++ int ok = ix86_decompose_address (x, &parts); + +-/* Expand the prologue into a bunch of separate insns. */ ++ gcc_assert (ok); + +-void +-ix86_expand_prologue (void) +-{ +- struct machine_function *m = cfun->machine; +- rtx insn, t; +- HOST_WIDE_INT allocate; +- bool int_registers_saved; +- bool sse_registers_saved; +- bool save_stub_call_needed; +- rtx static_chain = NULL_RTX; ++ if (parts.base && SUBREG_P (parts.base)) ++ parts.base = SUBREG_REG (parts.base); ++ if (parts.index && SUBREG_P (parts.index)) ++ parts.index = SUBREG_REG (parts.index); + +- if (ix86_function_naked (current_function_decl)) +- return; ++ /* Attempt to minimize number of registers in the address by increasing ++ address cost for each used register. We don't increase address cost ++ for "pic_offset_table_rtx". When a memopt with "pic_offset_table_rtx" ++ is not invariant itself it most likely means that base or index is not ++ invariant. Therefore only "pic_offset_table_rtx" could be hoisted out, ++ which is not profitable for x86. */ ++ if (parts.base ++ && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER) ++ && (current_pass->type == GIMPLE_PASS ++ || !pic_offset_table_rtx ++ || !REG_P (parts.base) ++ || REGNO (pic_offset_table_rtx) != REGNO (parts.base))) ++ cost++; + +- ix86_finalize_stack_frame_flags (); ++ if (parts.index ++ && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER) ++ && (current_pass->type == GIMPLE_PASS ++ || !pic_offset_table_rtx ++ || !REG_P (parts.index) ++ || REGNO (pic_offset_table_rtx) != REGNO (parts.index))) ++ cost++; + +- /* DRAP should not coexist with stack_realign_fp */ +- gcc_assert (!(crtl->drap_reg && stack_realign_fp)); ++ /* AMD-K6 don't like addresses with ModR/M set to 00_xxx_100b, ++ since it's predecode logic can't detect the length of instructions ++ and it degenerates to vector decoded. 
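The tail of ix86_decompose_address above canonicalises the recovered parts so that they are actually encodable: with no scaling, the stack/frame/arg pointer is moved out of the index slot into the base slot, a lone reg*2 becomes reg+reg, and a scaled index with neither base nor displacement gets an explicit zero displacement. A standalone sketch of those three rewrites (not part of the patch; the struct, the register numbers and the single sp_or_frame_reg parameter are illustrative stand-ins for the rtx-based checks in the hunk):

    #include <stdbool.h>
    #include <stdio.h>

    struct addr_parts
    {
      int base;       /* register number, or -1 when absent */
      int index;      /* register number, or -1 when absent */
      int scale;      /* 1, 2, 4 or 8 */
      bool has_disp;  /* displacement present? */
    };

    static void canonicalize (struct addr_parts *p, int sp_or_frame_reg)
    {
      /* With no scaling, move the stack/frame/arg pointer out of the index
         slot and into the base slot.  */
      if (p->base >= 0 && p->index == sp_or_frame_reg && p->scale == 1)
        {
          int tmp = p->base;
          p->base = p->index;
          p->index = tmp;
        }

      /* reg*2 with no base is encoded more cheaply as reg+reg.  */
      if (p->base < 0 && p->index >= 0 && p->scale == 2)
        {
          p->base = p->index;
          p->scale = 1;
        }

      /* A scaled index cannot be encoded without a base or a displacement,
         so add an explicit zero displacement.  */
      if (p->base < 0 && !p->has_disp && p->index >= 0 && p->scale != 1)
        p->has_disp = true;
    }

    int main (void)
    {
      struct addr_parts p = { .base = -1, .index = 3, .scale = 2,
                              .has_disp = false };
      canonicalize (&p, 7);
      printf ("base=%d index=%d scale=%d disp=%d\n",
              p.base, p.index, p.scale, (int) p.has_disp);
      return 0;
    }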
Increase cost of such ++ addresses here. The penalty is minimally 2 cycles. It may be worthwhile ++ to split such addresses or even refuse such addresses at all. + +- memset (&m->fs, 0, sizeof (m->fs)); ++ Following addressing modes are affected: ++ [base+scale*index] ++ [scale*index+disp] ++ [base+index] + +- /* Initialize CFA state for before the prologue. */ +- m->fs.cfa_reg = stack_pointer_rtx; +- m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET; ++ The first and last case may be avoidable by explicitly coding the zero in ++ memory address, but I don't have AMD-K6 machine handy to check this ++ theory. */ + +- /* Track SP offset to the CFA. We continue tracking this after we've +- swapped the CFA register away from SP. In the case of re-alignment +- this is fudged; we're interested to offsets within the local frame. */ +- m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET; +- m->fs.sp_valid = true; +- m->fs.sp_realigned = false; ++ if (TARGET_K6 ++ && ((!parts.disp && parts.base && parts.index && parts.scale != 1) ++ || (parts.disp && !parts.base && parts.index && parts.scale != 1) ++ || (!parts.disp && parts.base && parts.index && parts.scale == 1))) ++ cost += 10; + +- const struct ix86_frame &frame = cfun->machine->frame; ++ return cost; ++} ++ ++/* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as ++ this is used for to form addresses to local data when -fPIC is in ++ use. */ + +- if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl)) +- { +- /* We should have already generated an error for any use of +- ms_hook on a nested function. */ +- gcc_checking_assert (!ix86_static_chain_on_stack); ++static bool ++darwin_local_data_pic (rtx disp) ++{ ++ return (GET_CODE (disp) == UNSPEC ++ && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET); ++} + +- /* Check if profiling is active and we shall use profiling before +- prologue variant. If so sorry. */ +- if (crtl->profile && flag_fentry != 0) +- sorry ("ms_hook_prologue attribute isn%'t compatible " +- "with %<-mfentry%> for 32-bit"); ++/* True if operand X should be loaded from GOT. */ + +- /* In ix86_asm_output_function_label we emitted: +- 8b ff movl.s %edi,%edi +- 55 push %ebp +- 8b ec movl.s %esp,%ebp ++bool ++ix86_force_load_from_GOT_p (rtx x) ++{ ++ return ((TARGET_64BIT || HAVE_AS_IX86_GOT32X) ++ && !TARGET_PECOFF && !TARGET_MACHO ++ && !flag_pic ++ && ix86_cmodel != CM_LARGE ++ && GET_CODE (x) == SYMBOL_REF ++ && SYMBOL_REF_FUNCTION_P (x) ++ && (!flag_plt ++ || (SYMBOL_REF_DECL (x) ++ && lookup_attribute ("noplt", ++ DECL_ATTRIBUTES (SYMBOL_REF_DECL (x))))) ++ && !SYMBOL_REF_LOCAL_P (x)); ++} + +- This matches the hookable function prologue in Win32 API +- functions in Microsoft Windows XP Service Pack 2 and newer. +- Wine uses this to enable Windows apps to hook the Win32 API +- functions provided by Wine. ++/* Determine if a given RTX is a valid constant. We already know this ++ satisfies CONSTANT_P. */ + +- What that means is that we've already set up the frame pointer. */ ++static bool ++ix86_legitimate_constant_p (machine_mode mode, rtx x) ++{ ++ switch (GET_CODE (x)) ++ { ++ case CONST: ++ x = XEXP (x, 0); + +- if (frame_pointer_needed +- && !(crtl->drap_reg && crtl->stack_realign_needed)) ++ if (GET_CODE (x) == PLUS) + { +- rtx push, mov; ++ if (!CONST_INT_P (XEXP (x, 1))) ++ return false; ++ x = XEXP (x, 0); ++ } + +- /* We've decided to use the frame pointer already set up. +- Describe this to the unwinder by pretending that both +- push and mov insns happen right here. 
++ if (TARGET_MACHO && darwin_local_data_pic (x)) ++ return true; + +- Putting the unwind info here at the end of the ms_hook +- is done so that we can make absolutely certain we get +- the required byte sequence at the start of the function, +- rather than relying on an assembler that can produce +- the exact encoding required. ++ /* Only some unspecs are valid as "constants". */ ++ if (GET_CODE (x) == UNSPEC) ++ switch (XINT (x, 1)) ++ { ++ case UNSPEC_GOT: ++ case UNSPEC_GOTOFF: ++ case UNSPEC_PLTOFF: ++ return TARGET_64BIT; ++ case UNSPEC_TPOFF: ++ case UNSPEC_NTPOFF: ++ x = XVECEXP (x, 0, 0); ++ return (GET_CODE (x) == SYMBOL_REF ++ && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC); ++ case UNSPEC_DTPOFF: ++ x = XVECEXP (x, 0, 0); ++ return (GET_CODE (x) == SYMBOL_REF ++ && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC); ++ default: ++ return false; ++ } + +- However it does mean (in the unpatched case) that we have +- a 1 insn window where the asynchronous unwind info is +- incorrect. However, if we placed the unwind info at +- its correct location we would have incorrect unwind info +- in the patched case. Which is probably all moot since +- I don't expect Wine generates dwarf2 unwind info for the +- system libraries that use this feature. */ ++ /* We must have drilled down to a symbol. */ ++ if (GET_CODE (x) == LABEL_REF) ++ return true; ++ if (GET_CODE (x) != SYMBOL_REF) ++ return false; ++ /* FALLTHRU */ + +- insn = emit_insn (gen_blockage ()); ++ case SYMBOL_REF: ++ /* TLS symbols are never valid. */ ++ if (SYMBOL_REF_TLS_MODEL (x)) ++ return false; + +- push = gen_push (hard_frame_pointer_rtx); +- mov = gen_rtx_SET (hard_frame_pointer_rtx, +- stack_pointer_rtx); +- RTX_FRAME_RELATED_P (push) = 1; +- RTX_FRAME_RELATED_P (mov) = 1; ++ /* DLLIMPORT symbols are never valid. */ ++ if (TARGET_DLLIMPORT_DECL_ATTRIBUTES ++ && SYMBOL_REF_DLLIMPORT_P (x)) ++ return false; + +- RTX_FRAME_RELATED_P (insn) = 1; +- add_reg_note (insn, REG_FRAME_RELATED_EXPR, +- gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov))); ++#if TARGET_MACHO ++ /* mdynamic-no-pic */ ++ if (MACHO_DYNAMIC_NO_PIC_P) ++ return machopic_symbol_defined_p (x); ++#endif + +- /* Note that gen_push incremented m->fs.cfa_offset, even +- though we didn't emit the push insn here. */ +- m->fs.cfa_reg = hard_frame_pointer_rtx; +- m->fs.fp_offset = m->fs.cfa_offset; +- m->fs.fp_valid = true; +- } +- else ++ /* External function address should be loaded ++ via the GOT slot to avoid PLT. */ ++ if (ix86_force_load_from_GOT_p (x)) ++ return false; ++ ++ break; ++ ++ CASE_CONST_SCALAR_INT: ++ switch (mode) + { +- /* The frame pointer is not needed so pop %ebp again. +- This leaves us with a pristine state. */ +- emit_insn (gen_pop (hard_frame_pointer_rtx)); ++ case E_TImode: ++ if (TARGET_64BIT) ++ return true; ++ /* FALLTHRU */ ++ case E_OImode: ++ case E_XImode: ++ if (!standard_sse_constant_p (x, mode)) ++ return false; ++ default: ++ break; + } ++ break; ++ ++ case CONST_VECTOR: ++ if (!standard_sse_constant_p (x, mode)) ++ return false; ++ ++ default: ++ break; + } + +- /* The first insn of a function that accepts its static chain on the +- stack is to push the register that would be filled in by a direct +- call. This insn will be skipped by the trampoline. */ +- else if (ix86_static_chain_on_stack) +- { +- static_chain = ix86_static_chain (cfun->decl, false); +- insn = emit_insn (gen_push (static_chain)); +- emit_insn (gen_blockage ()); ++ /* Otherwise we handle everything else in the move patterns. 
*/ ++ return true; ++} + +- /* We don't want to interpret this push insn as a register save, +- only as a stack adjustment. The real copy of the register as +- a save will be done later, if needed. */ +- t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD); +- t = gen_rtx_SET (stack_pointer_rtx, t); +- add_reg_note (insn, REG_CFA_ADJUST_CFA, t); +- RTX_FRAME_RELATED_P (insn) = 1; +- } ++/* Determine if it's legal to put X into the constant pool. This ++ is not possible for the address of thread-local symbols, which ++ is checked above. */ + +- /* Emit prologue code to adjust stack alignment and setup DRAP, in case +- of DRAP is needed and stack realignment is really needed after reload */ +- if (stack_realign_drap) ++static bool ++ix86_cannot_force_const_mem (machine_mode mode, rtx x) ++{ ++ /* We can put any immediate constant in memory. */ ++ switch (GET_CODE (x)) + { +- int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT; ++ CASE_CONST_ANY: ++ return false; + +- /* Can't use DRAP in interrupt function. */ +- if (cfun->machine->func_type != TYPE_NORMAL) +- sorry ("Dynamic Realign Argument Pointer (DRAP) not supported " +- "in interrupt service routine. This may be worked " +- "around by avoiding functions with aggregate return."); ++ default: ++ break; ++ } + +- /* Only need to push parameter pointer reg if it is caller saved. */ +- if (!call_used_regs[REGNO (crtl->drap_reg)]) +- { +- /* Push arg pointer reg */ +- insn = emit_insn (gen_push (crtl->drap_reg)); +- RTX_FRAME_RELATED_P (insn) = 1; +- } ++ return !ix86_legitimate_constant_p (mode, x); ++} + +- /* Grab the argument pointer. */ +- t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset); +- insn = emit_insn (gen_rtx_SET (crtl->drap_reg, t)); +- RTX_FRAME_RELATED_P (insn) = 1; +- m->fs.cfa_reg = crtl->drap_reg; +- m->fs.cfa_offset = 0; ++/* Nonzero if the symbol is marked as dllimport, or as stub-variable, ++ otherwise zero. */ + +- /* Align the stack. */ +- insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx, +- stack_pointer_rtx, +- GEN_INT (-align_bytes))); +- RTX_FRAME_RELATED_P (insn) = 1; ++static bool ++is_imported_p (rtx x) ++{ ++ if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES ++ || GET_CODE (x) != SYMBOL_REF) ++ return false; + +- /* Replicate the return address on the stack so that return +- address can be reached via (argp - 1) slot. This is needed +- to implement macro RETURN_ADDR_RTX and intrinsic function +- expand_builtin_return_addr etc. */ +- t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD); +- t = gen_frame_mem (word_mode, t); +- insn = emit_insn (gen_push (t)); +- RTX_FRAME_RELATED_P (insn) = 1; ++ return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x); ++} + +- /* For the purposes of frame and register save area addressing, +- we've started over with a new frame. */ +- m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET; +- m->fs.realigned = true; + +- if (static_chain) +- { +- /* Replicate static chain on the stack so that static chain +- can be reached via (argp - 2) slot. This is needed for +- nested function with stack realignment. */ +- insn = emit_insn (gen_push (static_chain)); +- RTX_FRAME_RELATED_P (insn) = 1; +- } +- } ++/* Nonzero if the constant value X is a legitimate general operand ++ when generating PIC code. It is given that flag_pic is on and ++ that X satisfies CONSTANT_P. 
*/ + +- int_registers_saved = (frame.nregs == 0); +- sse_registers_saved = (frame.nsseregs == 0); +- save_stub_call_needed = (m->call_ms2sysv); +- gcc_assert (sse_registers_saved || !save_stub_call_needed); ++bool ++legitimate_pic_operand_p (rtx x) ++{ ++ rtx inner; + +- if (frame_pointer_needed && !m->fs.fp_valid) ++ switch (GET_CODE (x)) + { +- /* Note: AT&T enter does NOT have reversed args. Enter is probably +- slower on all targets. Also sdb didn't like it. */ +- insn = emit_insn (gen_push (hard_frame_pointer_rtx)); +- RTX_FRAME_RELATED_P (insn) = 1; ++ case CONST: ++ inner = XEXP (x, 0); ++ if (GET_CODE (inner) == PLUS ++ && CONST_INT_P (XEXP (inner, 1))) ++ inner = XEXP (inner, 0); + +- /* Push registers now, before setting the frame pointer +- on SEH target. */ +- if (!int_registers_saved +- && TARGET_SEH +- && !frame.save_regs_using_mov) +- { +- ix86_emit_save_regs (); +- int_registers_saved = true; +- gcc_assert (m->fs.sp_offset == frame.reg_save_offset); +- } ++ /* Only some unspecs are valid as "constants". */ ++ if (GET_CODE (inner) == UNSPEC) ++ switch (XINT (inner, 1)) ++ { ++ case UNSPEC_GOT: ++ case UNSPEC_GOTOFF: ++ case UNSPEC_PLTOFF: ++ return TARGET_64BIT; ++ case UNSPEC_TPOFF: ++ x = XVECEXP (inner, 0, 0); ++ return (GET_CODE (x) == SYMBOL_REF ++ && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC); ++ case UNSPEC_MACHOPIC_OFFSET: ++ return legitimate_pic_address_disp_p (x); ++ default: ++ return false; ++ } ++ /* FALLTHRU */ + +- if (m->fs.sp_offset == frame.hard_frame_pointer_offset) +- { +- insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx); +- RTX_FRAME_RELATED_P (insn) = 1; ++ case SYMBOL_REF: ++ case LABEL_REF: ++ return legitimate_pic_address_disp_p (x); + +- if (m->fs.cfa_reg == stack_pointer_rtx) +- m->fs.cfa_reg = hard_frame_pointer_rtx; +- m->fs.fp_offset = m->fs.sp_offset; +- m->fs.fp_valid = true; +- } ++ default: ++ return true; + } ++} + +- if (!int_registers_saved) +- { +- /* If saving registers via PUSH, do so now. */ +- if (!frame.save_regs_using_mov) +- { +- ix86_emit_save_regs (); +- int_registers_saved = true; +- gcc_assert (m->fs.sp_offset == frame.reg_save_offset); +- } ++/* Determine if a given CONST RTX is a valid memory displacement ++ in PIC mode. */ + +- /* When using red zone we may start register saving before allocating +- the stack frame saving one cycle of the prologue. However, avoid +- doing this if we have to probe the stack; at least on x86_64 the +- stack probe can turn into a call that clobbers a red zone location. */ +- else if (ix86_using_red_zone () +- && (! TARGET_STACK_PROBE +- || frame.stack_pointer_offset < CHECK_STACK_LIMIT)) +- { +- ix86_emit_save_regs_using_mov (frame.reg_save_offset); +- int_registers_saved = true; +- } +- } ++bool ++legitimate_pic_address_disp_p (rtx disp) ++{ ++ bool saw_plus; + +- if (stack_realign_fp) ++ /* In 64bit mode we can allow direct addresses of symbols and labels ++ when they are not dynamic symbols. */ ++ if (TARGET_64BIT) + { +- int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT; +- gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT); ++ rtx op0 = disp, op1; + +- /* Record last valid frame pointer offset. */ +- m->fs.sp_realigned_fp_last = frame.reg_save_offset; ++ switch (GET_CODE (disp)) ++ { ++ case LABEL_REF: ++ return true; + +- /* The computation of the size of the re-aligned stack frame means +- that we must allocate the size of the register save area before +- performing the actual alignment. 
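For 64-bit PIC, the CONST handling in legitimate_pic_address_disp_p above only keeps considering a symbol-plus-constant displacement when the constant lies within +/-16 MB; it is one of several conditions, and an offset outside that window falls through and is rejected. A standalone version of just that range check (not part of the patch; the function name is an illustrative stand-in, and the bound is the 16*1024*1024 literal from the hunk):

    #include <stdbool.h>
    #include <stdio.h>

    /* True when OFFSET is small enough to be considered further as a
       symbol + constant displacement in 64-bit PIC code.  */
    static bool pic_disp_offset_in_range (long long offset)
    {
      return offset < 16 * 1024 * 1024 && offset >= -16 * 1024 * 1024;
    }

    int main (void)
    {
      printf ("%d %d %d\n",
              pic_disp_offset_in_range (0),
              pic_disp_offset_in_range (16 * 1024 * 1024 - 1),
              pic_disp_offset_in_range (16 * 1024 * 1024));  /* prints 1 1 0 */
      return 0;
    }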
Otherwise we cannot guarantee +- that there's enough storage above the realignment point. */ +- allocate = frame.reg_save_offset - m->fs.sp_offset +- + frame.stack_realign_allocate; +- if (allocate) +- pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, +- GEN_INT (-allocate), -1, false); ++ case CONST: ++ if (GET_CODE (XEXP (disp, 0)) != PLUS) ++ break; ++ op0 = XEXP (XEXP (disp, 0), 0); ++ op1 = XEXP (XEXP (disp, 0), 1); ++ if (!CONST_INT_P (op1)) ++ break; ++ if (GET_CODE (op0) == UNSPEC ++ && (XINT (op0, 1) == UNSPEC_DTPOFF ++ || XINT (op0, 1) == UNSPEC_NTPOFF) ++ && trunc_int_for_mode (INTVAL (op1), SImode) == INTVAL (op1)) ++ return true; ++ if (INTVAL (op1) >= 16*1024*1024 ++ || INTVAL (op1) < -16*1024*1024) ++ break; ++ if (GET_CODE (op0) == LABEL_REF) ++ return true; ++ if (GET_CODE (op0) == CONST ++ && GET_CODE (XEXP (op0, 0)) == UNSPEC ++ && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL) ++ return true; ++ if (GET_CODE (op0) == UNSPEC ++ && XINT (op0, 1) == UNSPEC_PCREL) ++ return true; ++ if (GET_CODE (op0) != SYMBOL_REF) ++ break; ++ /* FALLTHRU */ + +- /* Align the stack. */ +- insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx, +- stack_pointer_rtx, +- GEN_INT (-align_bytes))); +- m->fs.sp_offset = ROUND_UP (m->fs.sp_offset, align_bytes); +- m->fs.sp_realigned_offset = m->fs.sp_offset +- - frame.stack_realign_allocate; +- /* The stack pointer may no longer be equal to CFA - m->fs.sp_offset. +- Beyond this point, stack access should be done via choose_baseaddr or +- by using sp_valid_at and fp_valid_at to determine the correct base +- register. Henceforth, any CFA offset should be thought of as logical +- and not physical. */ +- gcc_assert (m->fs.sp_realigned_offset >= m->fs.sp_realigned_fp_last); +- gcc_assert (m->fs.sp_realigned_offset == frame.stack_realign_offset); +- m->fs.sp_realigned = true; ++ case SYMBOL_REF: ++ /* TLS references should always be enclosed in UNSPEC. ++ The dllimported symbol needs always to be resolved. */ ++ if (SYMBOL_REF_TLS_MODEL (op0) ++ || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0))) ++ return false; + +- /* SEH unwind emit doesn't currently support REG_CFA_EXPRESSION, which +- is needed to describe where a register is saved using a realigned +- stack pointer, so we need to invalidate the stack pointer for that +- target. */ +- if (TARGET_SEH) +- m->fs.sp_valid = false; +- +- /* If SP offset is non-immediate after allocation of the stack frame, +- then emit SSE saves or stub call prior to allocating the rest of the +- stack frame. This is less efficient for the out-of-line stub because +- we can't combine allocations across the call barrier, but it's better +- than using a scratch register. */ +- else if (!x86_64_immediate_operand (GEN_INT (frame.stack_pointer_offset +- - m->fs.sp_realigned_offset), +- Pmode)) +- { +- if (!sse_registers_saved) +- { +- ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset); +- sse_registers_saved = true; +- } +- else if (save_stub_call_needed) ++ if (TARGET_PECOFF) + { +- ix86_emit_outlined_ms2sysv_save (frame); +- save_stub_call_needed = false; ++ if (is_imported_p (op0)) ++ return true; ++ ++ if (SYMBOL_REF_FAR_ADDR_P (op0) ++ || !SYMBOL_REF_LOCAL_P (op0)) ++ break; ++ ++ /* Function-symbols need to be resolved only for ++ large-model. ++ For the small-model we don't need to resolve anything ++ here. 
*/ ++ if ((ix86_cmodel != CM_LARGE_PIC ++ && SYMBOL_REF_FUNCTION_P (op0)) ++ || ix86_cmodel == CM_SMALL_PIC) ++ return true; ++ /* Non-external symbols don't need to be resolved for ++ large, and medium-model. */ ++ if ((ix86_cmodel == CM_LARGE_PIC ++ || ix86_cmodel == CM_MEDIUM_PIC) ++ && !SYMBOL_REF_EXTERNAL_P (op0)) ++ return true; + } ++ else if (!SYMBOL_REF_FAR_ADDR_P (op0) ++ && (SYMBOL_REF_LOCAL_P (op0) ++ || (HAVE_LD_PIE_COPYRELOC ++ && flag_pie ++ && !SYMBOL_REF_WEAK (op0) ++ && !SYMBOL_REF_FUNCTION_P (op0))) ++ && ix86_cmodel != CM_LARGE_PIC) ++ return true; ++ break; ++ ++ default: ++ break; + } + } ++ if (GET_CODE (disp) != CONST) ++ return false; ++ disp = XEXP (disp, 0); + +- allocate = frame.stack_pointer_offset - m->fs.sp_offset; +- +- if (flag_stack_usage_info) ++ if (TARGET_64BIT) + { +- /* We start to count from ARG_POINTER. */ +- HOST_WIDE_INT stack_size = frame.stack_pointer_offset; ++ /* We are unsafe to allow PLUS expressions. This limit allowed distance ++ of GOT tables. We should not need these anyway. */ ++ if (GET_CODE (disp) != UNSPEC ++ || (XINT (disp, 1) != UNSPEC_GOTPCREL ++ && XINT (disp, 1) != UNSPEC_GOTOFF ++ && XINT (disp, 1) != UNSPEC_PCREL ++ && XINT (disp, 1) != UNSPEC_PLTOFF)) ++ return false; + +- /* If it was realigned, take into account the fake frame. */ +- if (stack_realign_drap) +- { +- if (ix86_static_chain_on_stack) +- stack_size += UNITS_PER_WORD; ++ if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF ++ && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF) ++ return false; ++ return true; ++ } + +- if (!call_used_regs[REGNO (crtl->drap_reg)]) +- stack_size += UNITS_PER_WORD; ++ saw_plus = false; ++ if (GET_CODE (disp) == PLUS) ++ { ++ if (!CONST_INT_P (XEXP (disp, 1))) ++ return false; ++ disp = XEXP (disp, 0); ++ saw_plus = true; ++ } + +- /* This over-estimates by 1 minimal-stack-alignment-unit but +- mitigates that by counting in the new return address slot. */ +- current_function_dynamic_stack_size +- += crtl->stack_alignment_needed / BITS_PER_UNIT; +- } ++ if (TARGET_MACHO && darwin_local_data_pic (disp)) ++ return true; + +- current_function_static_stack_size = stack_size; +- } ++ if (GET_CODE (disp) != UNSPEC) ++ return false; + +- /* On SEH target with very large frame size, allocate an area to save +- SSE registers (as the very large allocation won't be described). */ +- if (TARGET_SEH +- && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE +- && !sse_registers_saved) ++ switch (XINT (disp, 1)) + { +- HOST_WIDE_INT sse_size +- = frame.sse_reg_save_offset - frame.reg_save_offset; ++ case UNSPEC_GOT: ++ if (saw_plus) ++ return false; ++ /* We need to check for both symbols and labels because VxWorks loads ++ text labels with @GOT rather than @GOTOFF. See gotoff_operand for ++ details. */ ++ return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF ++ || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF); ++ case UNSPEC_GOTOFF: ++ /* Refuse GOTOFF in 64bit mode since it is always 64bit when used. ++ While ABI specify also 32bit relocation but we don't produce it in ++ small PIC model at all. 
*/ ++ if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF ++ || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF) ++ && !TARGET_64BIT) ++ return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode); ++ return false; ++ case UNSPEC_GOTTPOFF: ++ case UNSPEC_GOTNTPOFF: ++ case UNSPEC_INDNTPOFF: ++ if (saw_plus) ++ return false; ++ disp = XVECEXP (disp, 0, 0); ++ return (GET_CODE (disp) == SYMBOL_REF ++ && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC); ++ case UNSPEC_NTPOFF: ++ disp = XVECEXP (disp, 0, 0); ++ return (GET_CODE (disp) == SYMBOL_REF ++ && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC); ++ case UNSPEC_DTPOFF: ++ disp = XVECEXP (disp, 0, 0); ++ return (GET_CODE (disp) == SYMBOL_REF ++ && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC); ++ } + +- gcc_assert (int_registers_saved); ++ return false; ++} + +- /* No need to do stack checking as the area will be immediately +- written. */ +- pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, +- GEN_INT (-sse_size), -1, +- m->fs.cfa_reg == stack_pointer_rtx); +- allocate -= sse_size; +- ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset); +- sse_registers_saved = true; +- } ++/* Determine if op is suitable RTX for an address register. ++ Return naked register if a register or a register subreg is ++ found, otherwise return NULL_RTX. */ + +- /* The stack has already been decremented by the instruction calling us +- so probe if the size is non-negative to preserve the protection area. */ +- if (allocate >= 0 +- && (flag_stack_check == STATIC_BUILTIN_STACK_CHECK +- || flag_stack_clash_protection)) ++static rtx ++ix86_validate_address_register (rtx op) ++{ ++ machine_mode mode = GET_MODE (op); ++ ++ /* Only SImode or DImode registers can form the address. 
*/ ++ if (mode != SImode && mode != DImode) ++ return NULL_RTX; ++ ++ if (REG_P (op)) ++ return op; ++ else if (SUBREG_P (op)) + { +- if (flag_stack_clash_protection) +- { +- ix86_adjust_stack_and_probe_stack_clash (allocate, +- int_registers_saved); +- allocate = 0; +- } +- else if (STACK_CHECK_MOVING_SP) +- { +- if (!(crtl->is_leaf && !cfun->calls_alloca +- && allocate <= get_probe_interval ())) +- { +- ix86_adjust_stack_and_probe (allocate, int_registers_saved); +- allocate = 0; +- } +- } +- else +- { +- HOST_WIDE_INT size = allocate; ++ rtx reg = SUBREG_REG (op); + +- if (TARGET_64BIT && size >= HOST_WIDE_INT_C (0x80000000)) +- size = 0x80000000 - get_stack_check_protect () - 1; ++ if (!REG_P (reg)) ++ return NULL_RTX; + +- if (TARGET_STACK_PROBE) +- { +- if (crtl->is_leaf && !cfun->calls_alloca) +- { +- if (size > get_probe_interval ()) +- ix86_emit_probe_stack_range (0, size, int_registers_saved); +- } +- else +- ix86_emit_probe_stack_range (0, +- size + get_stack_check_protect (), +- int_registers_saved); +- } +- else +- { +- if (crtl->is_leaf && !cfun->calls_alloca) +- { +- if (size > get_probe_interval () +- && size > get_stack_check_protect ()) +- ix86_emit_probe_stack_range (get_stack_check_protect (), +- (size +- - get_stack_check_protect ()), +- int_registers_saved); +- } +- else +- ix86_emit_probe_stack_range (get_stack_check_protect (), size, +- int_registers_saved); +- } +- } +- } ++ mode = GET_MODE (reg); + +- if (allocate == 0) +- ; +- else if (!ix86_target_stack_probe () +- || frame.stack_pointer_offset < CHECK_STACK_LIMIT) +- { +- pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, +- GEN_INT (-allocate), -1, +- m->fs.cfa_reg == stack_pointer_rtx); +- } +- else +- { +- rtx eax = gen_rtx_REG (Pmode, AX_REG); +- rtx r10 = NULL; +- rtx (*adjust_stack_insn)(rtx, rtx, rtx); +- const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx); +- bool eax_live = ix86_eax_live_at_start_p (); +- bool r10_live = false; ++ /* Don't allow SUBREGs that span more than a word. It can ++ lead to spill failures when the register is one word out ++ of a two word structure. */ ++ if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) ++ return NULL_RTX; + +- if (TARGET_64BIT) +- r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0); ++ /* Allow only SUBREGs of non-eliminable hard registers. */ ++ if (register_no_elim_operand (reg, mode)) ++ return reg; ++ } + +- if (eax_live) +- { +- insn = emit_insn (gen_push (eax)); +- allocate -= UNITS_PER_WORD; +- /* Note that SEH directives need to continue tracking the stack +- pointer even after the frame pointer has been set up. */ +- if (sp_is_cfa_reg || TARGET_SEH) +- { +- if (sp_is_cfa_reg) +- m->fs.cfa_offset += UNITS_PER_WORD; +- RTX_FRAME_RELATED_P (insn) = 1; +- add_reg_note (insn, REG_FRAME_RELATED_EXPR, +- gen_rtx_SET (stack_pointer_rtx, +- plus_constant (Pmode, stack_pointer_rtx, +- -UNITS_PER_WORD))); +- } +- } +- +- if (r10_live) +- { +- r10 = gen_rtx_REG (Pmode, R10_REG); +- insn = emit_insn (gen_push (r10)); +- allocate -= UNITS_PER_WORD; +- if (sp_is_cfa_reg || TARGET_SEH) +- { +- if (sp_is_cfa_reg) +- m->fs.cfa_offset += UNITS_PER_WORD; +- RTX_FRAME_RELATED_P (insn) = 1; +- add_reg_note (insn, REG_FRAME_RELATED_EXPR, +- gen_rtx_SET (stack_pointer_rtx, +- plus_constant (Pmode, stack_pointer_rtx, +- -UNITS_PER_WORD))); +- } +- } ++ /* Op is not a register. 
*/ ++ return NULL_RTX; ++} + +- emit_move_insn (eax, GEN_INT (allocate)); +- emit_insn (ix86_gen_allocate_stack_worker (eax, eax)); ++/* Recognizes RTL expressions that are valid memory addresses for an ++ instruction. The MODE argument is the machine mode for the MEM ++ expression that wants to use this address. + +- /* Use the fact that AX still contains ALLOCATE. */ +- adjust_stack_insn = (Pmode == DImode +- ? gen_pro_epilogue_adjust_stack_di_sub +- : gen_pro_epilogue_adjust_stack_si_sub); ++ It only recognizes address in canonical form. LEGITIMIZE_ADDRESS should ++ convert common non-canonical forms to canonical form so that they will ++ be recognized. */ + +- insn = emit_insn (adjust_stack_insn (stack_pointer_rtx, +- stack_pointer_rtx, eax)); ++static bool ++ix86_legitimate_address_p (machine_mode, rtx addr, bool strict) ++{ ++ struct ix86_address parts; ++ rtx base, index, disp; ++ HOST_WIDE_INT scale; ++ addr_space_t seg; + +- if (sp_is_cfa_reg || TARGET_SEH) +- { +- if (sp_is_cfa_reg) +- m->fs.cfa_offset += allocate; +- RTX_FRAME_RELATED_P (insn) = 1; +- add_reg_note (insn, REG_FRAME_RELATED_EXPR, +- gen_rtx_SET (stack_pointer_rtx, +- plus_constant (Pmode, stack_pointer_rtx, +- -allocate))); +- } +- m->fs.sp_offset += allocate; ++ if (ix86_decompose_address (addr, &parts) <= 0) ++ /* Decomposition failed. */ ++ return false; + +- /* Use stack_pointer_rtx for relative addressing so that code works for +- realigned stack. But this means that we need a blockage to prevent +- stores based on the frame pointer from being scheduled before. */ +- if (r10_live && eax_live) +- { +- t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax); +- emit_move_insn (gen_rtx_REG (word_mode, R10_REG), +- gen_frame_mem (word_mode, t)); +- t = plus_constant (Pmode, t, UNITS_PER_WORD); +- emit_move_insn (gen_rtx_REG (word_mode, AX_REG), +- gen_frame_mem (word_mode, t)); +- emit_insn (gen_memory_blockage ()); +- } +- else if (eax_live || r10_live) +- { +- t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax); +- emit_move_insn (gen_rtx_REG (word_mode, +- (eax_live ? AX_REG : R10_REG)), +- gen_frame_mem (word_mode, t)); +- emit_insn (gen_memory_blockage ()); +- } +- } +- gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset); ++ base = parts.base; ++ index = parts.index; ++ disp = parts.disp; ++ scale = parts.scale; ++ seg = parts.seg; + +- /* If we havn't already set up the frame pointer, do so now. */ +- if (frame_pointer_needed && !m->fs.fp_valid) ++ /* Validate base register. */ ++ if (base) + { +- insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx, +- GEN_INT (frame.stack_pointer_offset +- - frame.hard_frame_pointer_offset)); +- insn = emit_insn (insn); +- RTX_FRAME_RELATED_P (insn) = 1; +- add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL); +- +- if (m->fs.cfa_reg == stack_pointer_rtx) +- m->fs.cfa_reg = hard_frame_pointer_rtx; +- m->fs.fp_offset = frame.hard_frame_pointer_offset; +- m->fs.fp_valid = true; +- } ++ rtx reg = ix86_validate_address_register (base); + +- if (!int_registers_saved) +- ix86_emit_save_regs_using_mov (frame.reg_save_offset); +- if (!sse_registers_saved) +- ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset); +- else if (save_stub_call_needed) +- ix86_emit_outlined_ms2sysv_save (frame); ++ if (reg == NULL_RTX) ++ return false; + +- /* For the mcount profiling on 32 bit PIC mode we need to emit SET_GOT +- in PROLOGUE. 
*/ +- if (!TARGET_64BIT && pic_offset_table_rtx && crtl->profile && !flag_fentry) +- { +- rtx pic = gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM); +- insn = emit_insn (gen_set_got (pic)); +- RTX_FRAME_RELATED_P (insn) = 1; +- add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX); +- emit_insn (gen_prologue_use (pic)); +- /* Deleting already emmitted SET_GOT if exist and allocated to +- REAL_PIC_OFFSET_TABLE_REGNUM. */ +- ix86_elim_entry_set_got (pic); ++ if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg)) ++ || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg))) ++ /* Base is not valid. */ ++ return false; + } + +- if (crtl->drap_reg && !crtl->stack_realign_needed) ++ /* Validate index register. */ ++ if (index) + { +- /* vDRAP is setup but after reload it turns out stack realign +- isn't necessary, here we will emit prologue to setup DRAP +- without stack realign adjustment */ +- t = choose_baseaddr (0, NULL); +- emit_insn (gen_rtx_SET (crtl->drap_reg, t)); +- } +- +- /* Prevent instructions from being scheduled into register save push +- sequence when access to the redzone area is done through frame pointer. +- The offset between the frame pointer and the stack pointer is calculated +- relative to the value of the stack pointer at the end of the function +- prologue, and moving instructions that access redzone area via frame +- pointer inside push sequence violates this assumption. */ +- if (frame_pointer_needed && frame.red_zone_size) +- emit_insn (gen_memory_blockage ()); ++ rtx reg = ix86_validate_address_register (index); + +- /* SEH requires that the prologue end within 256 bytes of the start of +- the function. Prevent instruction schedules that would extend that. +- Further, prevent alloca modifications to the stack pointer from being +- combined with prologue modifications. */ +- if (TARGET_SEH) +- emit_insn (gen_prologue_use (stack_pointer_rtx)); +-} ++ if (reg == NULL_RTX) ++ return false; + +-/* Emit code to restore REG using a POP insn. */ ++ if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg)) ++ || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg))) ++ /* Index is not valid. */ ++ return false; ++ } + +-static void +-ix86_emit_restore_reg_using_pop (rtx reg) +-{ +- struct machine_function *m = cfun->machine; +- rtx_insn *insn = emit_insn (gen_pop (reg)); ++ /* Index and base should have the same mode. */ ++ if (base && index ++ && GET_MODE (base) != GET_MODE (index)) ++ return false; + +- ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset); +- m->fs.sp_offset -= UNITS_PER_WORD; ++ /* Address override works only on the (%reg) part of %fs:(%reg). */ ++ if (seg != ADDR_SPACE_GENERIC ++ && ((base && GET_MODE (base) != word_mode) ++ || (index && GET_MODE (index) != word_mode))) ++ return false; + +- if (m->fs.cfa_reg == crtl->drap_reg +- && REGNO (reg) == REGNO (crtl->drap_reg)) ++ /* Validate scale factor. */ ++ if (scale != 1) + { +- /* Previously we'd represented the CFA as an expression +- like *(%ebp - 8). We've just popped that value from +- the stack, which means we need to reset the CFA to +- the drap register. This will remain until we restore +- the stack pointer. */ +- add_reg_note (insn, REG_CFA_DEF_CFA, reg); +- RTX_FRAME_RELATED_P (insn) = 1; ++ if (!index) ++ /* Scale without index. */ ++ return false; + +- /* This means that the DRAP register is valid for addressing too. */ +- m->fs.drap_valid = true; +- return; ++ if (scale != 2 && scale != 4 && scale != 8) ++ /* Scale is not a valid multiplier. 
*/ ++ return false; + } + +- if (m->fs.cfa_reg == stack_pointer_rtx) ++ /* Validate displacement. */ ++ if (disp) + { +- rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD); +- x = gen_rtx_SET (stack_pointer_rtx, x); +- add_reg_note (insn, REG_CFA_ADJUST_CFA, x); +- RTX_FRAME_RELATED_P (insn) = 1; +- +- m->fs.cfa_offset -= UNITS_PER_WORD; +- } ++ if (GET_CODE (disp) == CONST ++ && GET_CODE (XEXP (disp, 0)) == UNSPEC ++ && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET) ++ switch (XINT (XEXP (disp, 0), 1)) ++ { ++ /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit ++ when used. While ABI specify also 32bit relocations, we ++ don't produce them at all and use IP relative instead. ++ Allow GOT in 32bit mode for both PIC and non-PIC if symbol ++ should be loaded via GOT. */ ++ case UNSPEC_GOT: ++ if (!TARGET_64BIT ++ && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0))) ++ goto is_legitimate_pic; ++ /* FALLTHRU */ ++ case UNSPEC_GOTOFF: ++ gcc_assert (flag_pic); ++ if (!TARGET_64BIT) ++ goto is_legitimate_pic; + +- /* When the frame pointer is the CFA, and we pop it, we are +- swapping back to the stack pointer as the CFA. This happens +- for stack frames that don't allocate other data, so we assume +- the stack pointer is now pointing at the return address, i.e. +- the function entry state, which makes the offset be 1 word. */ +- if (reg == hard_frame_pointer_rtx) +- { +- m->fs.fp_valid = false; +- if (m->fs.cfa_reg == hard_frame_pointer_rtx) +- { +- m->fs.cfa_reg = stack_pointer_rtx; +- m->fs.cfa_offset -= UNITS_PER_WORD; ++ /* 64bit address unspec. */ ++ return false; + +- add_reg_note (insn, REG_CFA_DEF_CFA, +- gen_rtx_PLUS (Pmode, stack_pointer_rtx, +- GEN_INT (m->fs.cfa_offset))); +- RTX_FRAME_RELATED_P (insn) = 1; +- } +- } +-} +- +-/* Emit code to restore saved registers using POP insns. */ ++ case UNSPEC_GOTPCREL: ++ if (ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0))) ++ goto is_legitimate_pic; ++ /* FALLTHRU */ ++ case UNSPEC_PCREL: ++ gcc_assert (flag_pic); ++ goto is_legitimate_pic; + +-static void +-ix86_emit_restore_regs_using_pop (void) +-{ +- unsigned int regno; ++ case UNSPEC_GOTTPOFF: ++ case UNSPEC_GOTNTPOFF: ++ case UNSPEC_INDNTPOFF: ++ case UNSPEC_NTPOFF: ++ case UNSPEC_DTPOFF: ++ break; + +- for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) +- if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, false, true)) +- ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno)); +-} ++ default: ++ /* Invalid address unspec. */ ++ return false; ++ } + +-/* Emit code and notes for the LEAVE instruction. If insn is non-null, +- omits the emit and only attaches the notes. */ ++ else if (SYMBOLIC_CONST (disp) ++ && (flag_pic ++ || (TARGET_MACHO ++#if TARGET_MACHO ++ && MACHOPIC_INDIRECT ++ && !machopic_operand_p (disp) ++#endif ++ ))) ++ { + +-static void +-ix86_emit_leave (rtx_insn *insn) +-{ +- struct machine_function *m = cfun->machine; +- if (!insn) +- insn = emit_insn (ix86_gen_leave ()); ++ is_legitimate_pic: ++ if (TARGET_64BIT && (index || base)) ++ { ++ /* foo@dtpoff(%rX) is ok. */ ++ if (GET_CODE (disp) != CONST ++ || GET_CODE (XEXP (disp, 0)) != PLUS ++ || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC ++ || !CONST_INT_P (XEXP (XEXP (disp, 0), 1)) ++ || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF ++ && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF)) ++ /* Non-constant pic memory reference. */ ++ return false; ++ } ++ else if ((!TARGET_MACHO || flag_pic) ++ && ! 
legitimate_pic_address_disp_p (disp)) ++ /* Displacement is an invalid pic construct. */ ++ return false; ++#if TARGET_MACHO ++ else if (MACHO_DYNAMIC_NO_PIC_P ++ && !ix86_legitimate_constant_p (Pmode, disp)) ++ /* displacment must be referenced via non_lazy_pointer */ ++ return false; ++#endif + +- ix86_add_queued_cfa_restore_notes (insn); ++ /* This code used to verify that a symbolic pic displacement ++ includes the pic_offset_table_rtx register. + +- gcc_assert (m->fs.fp_valid); +- m->fs.sp_valid = true; +- m->fs.sp_realigned = false; +- m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD; +- m->fs.fp_valid = false; ++ While this is good idea, unfortunately these constructs may ++ be created by "adds using lea" optimization for incorrect ++ code like: + +- if (m->fs.cfa_reg == hard_frame_pointer_rtx) +- { +- m->fs.cfa_reg = stack_pointer_rtx; +- m->fs.cfa_offset = m->fs.sp_offset; ++ int a; ++ int foo(int i) ++ { ++ return *(&a+i); ++ } + +- add_reg_note (insn, REG_CFA_DEF_CFA, +- plus_constant (Pmode, stack_pointer_rtx, +- m->fs.sp_offset)); +- RTX_FRAME_RELATED_P (insn) = 1; ++ This code is nonsensical, but results in addressing ++ GOT table with pic_offset_table_rtx base. We can't ++ just refuse it easily, since it gets matched by ++ "addsi3" pattern, that later gets split to lea in the ++ case output register differs from input. While this ++ can be handled by separate addsi pattern for this case ++ that never results in lea, this seems to be easier and ++ correct fix for crash to disable this test. */ ++ } ++ else if (GET_CODE (disp) != LABEL_REF ++ && !CONST_INT_P (disp) ++ && (GET_CODE (disp) != CONST ++ || !ix86_legitimate_constant_p (Pmode, disp)) ++ && (GET_CODE (disp) != SYMBOL_REF ++ || !ix86_legitimate_constant_p (Pmode, disp))) ++ /* Displacement is not constant. */ ++ return false; ++ else if (TARGET_64BIT ++ && !x86_64_immediate_operand (disp, VOIDmode)) ++ /* Displacement is out of range. */ ++ return false; ++ /* In x32 mode, constant addresses are sign extended to 64bit, so ++ we have to prevent addresses from 0x80000000 to 0xffffffff. */ ++ else if (TARGET_X32 && !(index || base) ++ && CONST_INT_P (disp) ++ && val_signbit_known_set_p (SImode, INTVAL (disp))) ++ return false; + } +- ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx, +- m->fs.fp_offset); +-} +- +-/* Emit code to restore saved registers using MOV insns. +- First register is restored from CFA - CFA_OFFSET. */ +-static void +-ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset, +- bool maybe_eh_return) +-{ +- struct machine_function *m = cfun->machine; +- unsigned int regno; +- +- for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) +- if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true)) +- { +- rtx reg = gen_rtx_REG (word_mode, regno); +- rtx mem; +- rtx_insn *insn; +- +- mem = choose_baseaddr (cfa_offset, NULL); +- mem = gen_frame_mem (word_mode, mem); +- insn = emit_move_insn (reg, mem); + +- if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg)) +- { +- /* Previously we'd represented the CFA as an expression +- like *(%ebp - 8). We've just popped that value from +- the stack, which means we need to reset the CFA to +- the drap register. This will remain until we restore +- the stack pointer. */ +- add_reg_note (insn, REG_CFA_DEF_CFA, reg); +- RTX_FRAME_RELATED_P (insn) = 1; ++ /* Everything looks valid. */ ++ return true; ++} + +- /* This means that the DRAP register is valid for addressing. 
*/ +- m->fs.drap_valid = true; +- } +- else +- ix86_add_cfa_restore_note (NULL, reg, cfa_offset); ++/* Determine if a given RTX is a valid constant address. */ + +- cfa_offset -= UNITS_PER_WORD; +- } ++bool ++constant_address_p (rtx x) ++{ ++ return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1); + } ++ ++/* Return a unique alias set for the GOT. */ + +-/* Emit code to restore saved registers using MOV insns. +- First register is restored from CFA - CFA_OFFSET. */ +-static void +-ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset, +- bool maybe_eh_return) ++alias_set_type ++ix86_GOT_alias_set (void) + { +- unsigned int regno; ++ static alias_set_type set = -1; ++ if (set == -1) ++ set = new_alias_set (); ++ return set; ++} + +- for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) +- if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true)) +- { +- rtx reg = gen_rtx_REG (V4SFmode, regno); +- rtx mem; +- unsigned int align = GET_MODE_ALIGNMENT (V4SFmode); ++/* Return a legitimate reference for ORIG (an address) using the ++ register REG. If REG is 0, a new pseudo is generated. + +- mem = choose_baseaddr (cfa_offset, &align); +- mem = gen_rtx_MEM (V4SFmode, mem); ++ There are two types of references that must be handled: + +- /* The location aligment depends upon the base register. */ +- align = MIN (GET_MODE_ALIGNMENT (V4SFmode), align); +- gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1))); +- set_mem_align (mem, align); +- emit_insn (gen_rtx_SET (reg, mem)); ++ 1. Global data references must load the address from the GOT, via ++ the PIC reg. An insn is emitted to do this load, and the reg is ++ returned. + +- ix86_add_cfa_restore_note (NULL, reg, cfa_offset); ++ 2. Static data references, constant pool addresses, and code labels ++ compute the address as an offset from the GOT, whose base is in ++ the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to ++ differentiate them from global data objects. The returned ++ address is the PIC reg + an unspec constant. + +- cfa_offset -= GET_MODE_SIZE (V4SFmode); +- } +-} ++ TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC ++ reg also appears in the address. */ + +-static void +-ix86_emit_outlined_ms2sysv_restore (const struct ix86_frame &frame, +- bool use_call, int style) ++rtx ++legitimize_pic_address (rtx orig, rtx reg) + { +- struct machine_function *m = cfun->machine; +- const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS +- + m->call_ms2sysv_extra_regs; +- rtvec v; +- unsigned int elems_needed, align, i, vi = 0; +- rtx_insn *insn; +- rtx sym, tmp; +- rtx rsi = gen_rtx_REG (word_mode, SI_REG); +- rtx r10 = NULL_RTX; +- const struct xlogue_layout &xlogue = xlogue_layout::get_instance (); +- HOST_WIDE_INT stub_ptr_offset = xlogue.get_stub_ptr_offset (); +- HOST_WIDE_INT rsi_offset = frame.stack_realign_offset + stub_ptr_offset; +- rtx rsi_frame_load = NULL_RTX; +- HOST_WIDE_INT rsi_restore_offset = (HOST_WIDE_INT)-1; +- enum xlogue_stub stub; ++ rtx addr = orig; ++ rtx new_rtx = orig; + +- gcc_assert (!m->fs.fp_valid || frame_pointer_needed); ++#if TARGET_MACHO ++ if (TARGET_MACHO && !TARGET_64BIT) ++ { ++ if (reg == 0) ++ reg = gen_reg_rtx (Pmode); ++ /* Use the generic Mach-O PIC machinery. */ ++ return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg); ++ } ++#endif + +- /* If using a realigned stack, we should never start with padding. 
*/ +- gcc_assert (!stack_realign_fp || !xlogue.get_stack_align_off_in ()); ++ if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES) ++ { ++ rtx tmp = legitimize_pe_coff_symbol (addr, true); ++ if (tmp) ++ return tmp; ++ } + +- /* Setup RSI as the stub's base pointer. */ +- align = GET_MODE_ALIGNMENT (V4SFmode); +- tmp = choose_baseaddr (rsi_offset, &align, SI_REG); +- gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode)); ++ if (TARGET_64BIT && legitimate_pic_address_disp_p (addr)) ++ new_rtx = addr; ++ else if ((!TARGET_64BIT ++ || /* TARGET_64BIT && */ ix86_cmodel != CM_SMALL_PIC) ++ && !TARGET_PECOFF ++ && gotoff_operand (addr, Pmode)) ++ { ++ /* This symbol may be referenced via a displacement ++ from the PIC base address (@GOTOFF). */ ++ if (GET_CODE (addr) == CONST) ++ addr = XEXP (addr, 0); + +- emit_insn (gen_rtx_SET (rsi, tmp)); ++ if (GET_CODE (addr) == PLUS) ++ { ++ new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)), ++ UNSPEC_GOTOFF); ++ new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1)); ++ } ++ else ++ new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF); + +- /* Get a symbol for the stub. */ +- if (frame_pointer_needed) +- stub = use_call ? XLOGUE_STUB_RESTORE_HFP +- : XLOGUE_STUB_RESTORE_HFP_TAIL; +- else +- stub = use_call ? XLOGUE_STUB_RESTORE +- : XLOGUE_STUB_RESTORE_TAIL; +- sym = xlogue.get_stub_rtx (stub); ++ new_rtx = gen_rtx_CONST (Pmode, new_rtx); + +- elems_needed = ncregs; +- if (use_call) +- elems_needed += 1; +- else +- elems_needed += frame_pointer_needed ? 5 : 3; +- v = rtvec_alloc (elems_needed); ++ if (TARGET_64BIT) ++ new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode); + +- /* We call the epilogue stub when we need to pop incoming args or we are +- doing a sibling call as the tail. Otherwise, we will emit a jmp to the +- epilogue stub and it is the tail-call. */ +- if (use_call) +- RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym); +- else +- { +- RTVEC_ELT (v, vi++) = ret_rtx; +- RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym); +- if (frame_pointer_needed) ++ if (reg != 0) + { +- rtx rbp = gen_rtx_REG (DImode, BP_REG); +- gcc_assert (m->fs.fp_valid); +- gcc_assert (m->fs.cfa_reg == hard_frame_pointer_rtx); +- +- tmp = gen_rtx_PLUS (DImode, rbp, GEN_INT (8)); +- RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, tmp); +- RTVEC_ELT (v, vi++) = gen_rtx_SET (rbp, gen_rtx_MEM (DImode, rbp)); +- tmp = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (VOIDmode)); +- RTVEC_ELT (v, vi++) = gen_rtx_CLOBBER (VOIDmode, tmp); +- } ++ gcc_assert (REG_P (reg)); ++ new_rtx = expand_simple_binop (Pmode, PLUS, pic_offset_table_rtx, ++ new_rtx, reg, 1, OPTAB_DIRECT); ++ } + else +- { +- /* If no hard frame pointer, we set R10 to the SP restore value. */ +- gcc_assert (!m->fs.fp_valid); +- gcc_assert (m->fs.cfa_reg == stack_pointer_rtx); +- gcc_assert (m->fs.sp_valid); +- +- r10 = gen_rtx_REG (DImode, R10_REG); +- tmp = gen_rtx_PLUS (Pmode, rsi, GEN_INT (stub_ptr_offset)); +- emit_insn (gen_rtx_SET (r10, tmp)); +- +- RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, r10); +- } ++ new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx); + } +- +- /* Generate frame load insns and restore notes. */ +- for (i = 0; i < ncregs; ++i) ++ else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0) ++ /* We can't use @GOTOFF for text labels ++ on VxWorks, see gotoff_operand. 
*/ ++ || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF)) + { +- const xlogue_layout::reginfo &r = xlogue.get_reginfo (i); +- machine_mode mode = SSE_REGNO_P (r.regno) ? V4SFmode : word_mode; +- rtx reg, frame_load; +- +- reg = gen_rtx_REG (mode, r.regno); +- frame_load = gen_frame_load (reg, rsi, r.offset); ++ rtx tmp = legitimize_pe_coff_symbol (addr, true); ++ if (tmp) ++ return tmp; + +- /* Save RSI frame load insn & note to add last. */ +- if (r.regno == SI_REG) ++ /* For x64 PE-COFF there is no GOT table, ++ so we use address directly. */ ++ if (TARGET_64BIT && TARGET_PECOFF) + { +- gcc_assert (!rsi_frame_load); +- rsi_frame_load = frame_load; +- rsi_restore_offset = r.offset; ++ new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL); ++ new_rtx = gen_rtx_CONST (Pmode, new_rtx); ++ } ++ else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC) ++ { ++ new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), ++ UNSPEC_GOTPCREL); ++ new_rtx = gen_rtx_CONST (Pmode, new_rtx); ++ new_rtx = gen_const_mem (Pmode, new_rtx); ++ set_mem_alias_set (new_rtx, ix86_GOT_alias_set ()); + } + else + { +- RTVEC_ELT (v, vi++) = frame_load; +- ix86_add_cfa_restore_note (NULL, reg, r.offset); ++ /* This symbol must be referenced via a load ++ from the Global Offset Table (@GOT). */ ++ new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT); ++ new_rtx = gen_rtx_CONST (Pmode, new_rtx); ++ if (TARGET_64BIT) ++ new_rtx = force_reg (Pmode, new_rtx); ++ new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx); ++ new_rtx = gen_const_mem (Pmode, new_rtx); ++ set_mem_alias_set (new_rtx, ix86_GOT_alias_set ()); + } +- } +- +- /* Add RSI frame load & restore note at the end. */ +- gcc_assert (rsi_frame_load); +- gcc_assert (rsi_restore_offset != (HOST_WIDE_INT)-1); +- RTVEC_ELT (v, vi++) = rsi_frame_load; +- ix86_add_cfa_restore_note (NULL, gen_rtx_REG (DImode, SI_REG), +- rsi_restore_offset); +- +- /* Finally, for tail-call w/o a hard frame pointer, set SP to R10. */ +- if (!use_call && !frame_pointer_needed) +- { +- gcc_assert (m->fs.sp_valid); +- gcc_assert (!m->fs.sp_realigned); + +- /* At this point, R10 should point to frame.stack_realign_offset. */ +- if (m->fs.cfa_reg == stack_pointer_rtx) +- m->fs.cfa_offset += m->fs.sp_offset - frame.stack_realign_offset; +- m->fs.sp_offset = frame.stack_realign_offset; ++ new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode); + } +- +- gcc_assert (vi == (unsigned int)GET_NUM_ELEM (v)); +- tmp = gen_rtx_PARALLEL (VOIDmode, v); +- if (use_call) +- insn = emit_insn (tmp); + else + { +- insn = emit_jump_insn (tmp); +- JUMP_LABEL (insn) = ret_rtx; +- +- if (frame_pointer_needed) +- ix86_emit_leave (insn); +- else ++ if (CONST_INT_P (addr) ++ && !x86_64_immediate_operand (addr, VOIDmode)) ++ new_rtx = copy_to_suggested_reg (addr, reg, Pmode); ++ else if (GET_CODE (addr) == CONST) + { +- /* Need CFA adjust note. */ +- tmp = gen_rtx_SET (stack_pointer_rtx, r10); +- add_reg_note (insn, REG_CFA_ADJUST_CFA, tmp); +- } +- } ++ addr = XEXP (addr, 0); + +- RTX_FRAME_RELATED_P (insn) = true; +- ix86_add_queued_cfa_restore_notes (insn); ++ /* We must match stuff we generate before. Assume the only ++ unspecs that can get here are ours. Not that we could do ++ anything with them anyway.... */ ++ if (GET_CODE (addr) == UNSPEC ++ || (GET_CODE (addr) == PLUS ++ && GET_CODE (XEXP (addr, 0)) == UNSPEC)) ++ return orig; ++ gcc_assert (GET_CODE (addr) == PLUS); ++ } + +- /* If we're not doing a tail-call, we need to adjust the stack. 
*/ +- if (use_call && m->fs.sp_valid) +- { +- HOST_WIDE_INT dealloc = m->fs.sp_offset - frame.stack_realign_offset; +- pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, +- GEN_INT (dealloc), style, +- m->fs.cfa_reg == stack_pointer_rtx); +- } +-} ++ if (GET_CODE (addr) == PLUS) ++ { ++ rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1); + +-/* Restore function stack, frame, and registers. */ ++ /* Check first to see if this is a constant ++ offset from a @GOTOFF symbol reference. */ ++ if (!TARGET_PECOFF ++ && gotoff_operand (op0, Pmode) ++ && CONST_INT_P (op1)) ++ { ++ if (!TARGET_64BIT) ++ { ++ new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0), ++ UNSPEC_GOTOFF); ++ new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1); ++ new_rtx = gen_rtx_CONST (Pmode, new_rtx); + +-void +-ix86_expand_epilogue (int style) +-{ +- struct machine_function *m = cfun->machine; +- struct machine_frame_state frame_state_save = m->fs; +- bool restore_regs_via_mov; +- bool using_drap; +- bool restore_stub_is_tail = false; ++ if (reg != 0) ++ { ++ gcc_assert (REG_P (reg)); ++ new_rtx = expand_simple_binop (Pmode, PLUS, ++ pic_offset_table_rtx, ++ new_rtx, reg, 1, ++ OPTAB_DIRECT); ++ } ++ else ++ new_rtx ++ = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx); ++ } ++ else ++ { ++ if (INTVAL (op1) < -16*1024*1024 ++ || INTVAL (op1) >= 16*1024*1024) ++ { ++ if (!x86_64_immediate_operand (op1, Pmode)) ++ op1 = force_reg (Pmode, op1); + +- if (ix86_function_naked (current_function_decl)) +- { +- /* The program should not reach this point. */ +- emit_insn (gen_ud2 ()); +- return; +- } ++ new_rtx ++ = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1); ++ } ++ } ++ } ++ else ++ { ++ rtx base = legitimize_pic_address (op0, reg); ++ machine_mode mode = GET_MODE (base); ++ new_rtx ++ = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg); + +- ix86_finalize_stack_frame_flags (); +- const struct ix86_frame &frame = cfun->machine->frame; ++ if (CONST_INT_P (new_rtx)) ++ { ++ if (INTVAL (new_rtx) < -16*1024*1024 ++ || INTVAL (new_rtx) >= 16*1024*1024) ++ { ++ if (!x86_64_immediate_operand (new_rtx, mode)) ++ new_rtx = force_reg (mode, new_rtx); + +- m->fs.sp_realigned = stack_realign_fp; +- m->fs.sp_valid = stack_realign_fp +- || !frame_pointer_needed +- || crtl->sp_is_unchanging; +- gcc_assert (!m->fs.sp_valid +- || m->fs.sp_offset == frame.stack_pointer_offset); ++ new_rtx ++ = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx); ++ } ++ else ++ new_rtx = plus_constant (mode, base, INTVAL (new_rtx)); ++ } ++ else ++ { ++ /* For %rip addressing, we have to use ++ just disp32, not base nor index. */ ++ if (TARGET_64BIT ++ && (GET_CODE (base) == SYMBOL_REF ++ || GET_CODE (base) == LABEL_REF)) ++ base = force_reg (mode, base); ++ if (GET_CODE (new_rtx) == PLUS ++ && CONSTANT_P (XEXP (new_rtx, 1))) ++ { ++ base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0)); ++ new_rtx = XEXP (new_rtx, 1); ++ } ++ new_rtx = gen_rtx_PLUS (mode, base, new_rtx); ++ } ++ } ++ } ++ } ++ return new_rtx; ++} ++ ++/* Load the thread pointer. If TO_REG is true, force it into a register. */ + +- /* The FP must be valid if the frame pointer is present. */ +- gcc_assert (frame_pointer_needed == m->fs.fp_valid); +- gcc_assert (!m->fs.fp_valid +- || m->fs.fp_offset == frame.hard_frame_pointer_offset); ++static rtx ++get_thread_pointer (machine_mode tp_mode, bool to_reg) ++{ ++ rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP); + +- /* We must have *some* valid pointer to the stack frame. 
*/ +- gcc_assert (m->fs.sp_valid || m->fs.fp_valid); ++ if (GET_MODE (tp) != tp_mode) ++ { ++ gcc_assert (GET_MODE (tp) == SImode); ++ gcc_assert (tp_mode == DImode); + +- /* The DRAP is never valid at this point. */ +- gcc_assert (!m->fs.drap_valid); ++ tp = gen_rtx_ZERO_EXTEND (tp_mode, tp); ++ } + +- /* See the comment about red zone and frame +- pointer usage in ix86_expand_prologue. */ +- if (frame_pointer_needed && frame.red_zone_size) +- emit_insn (gen_memory_blockage ()); ++ if (to_reg) ++ tp = copy_to_mode_reg (tp_mode, tp); + +- using_drap = crtl->drap_reg && crtl->stack_realign_needed; +- gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg); ++ return tp; ++} + +- /* Determine the CFA offset of the end of the red-zone. */ +- m->fs.red_zone_offset = 0; +- if (ix86_using_red_zone () && crtl->args.pops_args < 65536) ++/* Construct the SYMBOL_REF for the tls_get_addr function. */ ++ ++static GTY(()) rtx ix86_tls_symbol; ++ ++static rtx ++ix86_tls_get_addr (void) ++{ ++ if (!ix86_tls_symbol) + { +- /* The red-zone begins below return address and error code in +- exception handler. */ +- m->fs.red_zone_offset = RED_ZONE_SIZE + INCOMING_FRAME_SP_OFFSET; ++ const char *sym ++ = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT) ++ ? "___tls_get_addr" : "__tls_get_addr"); + +- /* When the register save area is in the aligned portion of +- the stack, determine the maximum runtime displacement that +- matches up with the aligned frame. */ +- if (stack_realign_drap) +- m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT +- + UNITS_PER_WORD); ++ ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym); + } + +- HOST_WIDE_INT reg_save_offset = frame.reg_save_offset; ++ if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF) ++ { ++ rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol), ++ UNSPEC_PLTOFF); ++ return gen_rtx_PLUS (Pmode, pic_offset_table_rtx, ++ gen_rtx_CONST (Pmode, unspec)); ++ } + +- /* Special care must be taken for the normal return case of a function +- using eh_return: the eax and edx registers are marked as saved, but +- not restored along this path. Adjust the save location to match. */ +- if (crtl->calls_eh_return && style != 2) +- reg_save_offset -= 2 * UNITS_PER_WORD; ++ return ix86_tls_symbol; ++} + +- /* EH_RETURN requires the use of moves to function properly. */ +- if (crtl->calls_eh_return) +- restore_regs_via_mov = true; +- /* SEH requires the use of pops to identify the epilogue. */ +- else if (TARGET_SEH) +- restore_regs_via_mov = false; +- /* If we're only restoring one register and sp cannot be used then +- using a move instruction to restore the register since it's +- less work than reloading sp and popping the register. */ +- else if (!sp_valid_at (frame.hfp_save_offset) && frame.nregs <= 1) +- restore_regs_via_mov = true; +- else if (TARGET_EPILOGUE_USING_MOVE +- && cfun->machine->use_fast_prologue_epilogue +- && (frame.nregs > 1 +- || m->fs.sp_offset != reg_save_offset)) +- restore_regs_via_mov = true; +- else if (frame_pointer_needed +- && !frame.nregs +- && m->fs.sp_offset != reg_save_offset) +- restore_regs_via_mov = true; +- else if (frame_pointer_needed +- && TARGET_USE_LEAVE +- && cfun->machine->use_fast_prologue_epilogue +- && frame.nregs == 1) +- restore_regs_via_mov = true; +- else +- restore_regs_via_mov = false; ++/* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. 
*/ + +- if (restore_regs_via_mov || frame.nsseregs) ++static GTY(()) rtx ix86_tls_module_base_symbol; ++ ++rtx ++ix86_tls_module_base (void) ++{ ++ if (!ix86_tls_module_base_symbol) + { +- /* Ensure that the entire register save area is addressable via +- the stack pointer, if we will restore SSE regs via sp. */ +- if (TARGET_64BIT +- && m->fs.sp_offset > 0x7fffffff +- && sp_valid_at (frame.stack_realign_offset + 1) +- && (frame.nsseregs + frame.nregs) != 0) +- { +- pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, +- GEN_INT (m->fs.sp_offset +- - frame.sse_reg_save_offset), +- style, +- m->fs.cfa_reg == stack_pointer_rtx); +- } +- } ++ ix86_tls_module_base_symbol ++ = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_"); + +- /* If there are any SSE registers to restore, then we have to do it +- via moves, since there's obviously no pop for SSE regs. */ +- if (frame.nsseregs) +- ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset, +- style == 2); ++ SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol) ++ |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT; ++ } + +- if (m->call_ms2sysv) +- { +- int pop_incoming_args = crtl->args.pops_args && crtl->args.size; ++ return ix86_tls_module_base_symbol; ++} + +- /* We cannot use a tail-call for the stub if: +- 1. We have to pop incoming args, +- 2. We have additional int regs to restore, or +- 3. A sibling call will be the tail-call, or +- 4. We are emitting an eh_return_internal epilogue. ++/* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is ++ false if we expect this to be used for a memory address and true if ++ we expect to load the address into a register. */ + +- TODO: Item 4 has not yet tested! ++rtx ++legitimize_tls_address (rtx x, enum tls_model model, bool for_mov) ++{ ++ rtx dest, base, off; ++ rtx pic = NULL_RTX, tp = NULL_RTX; ++ machine_mode tp_mode = Pmode; ++ int type; + +- If any of the above are true, we will call the stub rather than +- jump to it. */ +- restore_stub_is_tail = !(pop_incoming_args || frame.nregs || style != 1); +- ix86_emit_outlined_ms2sysv_restore (frame, !restore_stub_is_tail, style); +- } ++ /* Fall back to global dynamic model if tool chain cannot support local ++ dynamic. */ ++ if (TARGET_SUN_TLS && !TARGET_64BIT ++ && !HAVE_AS_IX86_TLSLDMPLT && !HAVE_AS_IX86_TLSLDM ++ && model == TLS_MODEL_LOCAL_DYNAMIC) ++ model = TLS_MODEL_GLOBAL_DYNAMIC; + +- /* If using out-of-line stub that is a tail-call, then...*/ +- if (m->call_ms2sysv && restore_stub_is_tail) +- { +- /* TODO: parinoid tests. (remove eventually) */ +- gcc_assert (m->fs.sp_valid); +- gcc_assert (!m->fs.sp_realigned); +- gcc_assert (!m->fs.fp_valid); +- gcc_assert (!m->fs.realigned); +- gcc_assert (m->fs.sp_offset == UNITS_PER_WORD); +- gcc_assert (!crtl->drap_reg); +- gcc_assert (!frame.nregs); +- } +- else if (restore_regs_via_mov) ++ switch (model) + { +- rtx t; ++ case TLS_MODEL_GLOBAL_DYNAMIC: ++ dest = gen_reg_rtx (Pmode); + +- if (frame.nregs) +- ix86_emit_restore_regs_using_mov (reg_save_offset, style == 2); ++ if (!TARGET_64BIT) ++ { ++ if (flag_pic && !TARGET_PECOFF) ++ pic = pic_offset_table_rtx; ++ else ++ { ++ pic = gen_reg_rtx (Pmode); ++ emit_insn (gen_set_got (pic)); ++ } ++ } + +- /* eh_return epilogues need %ecx added to the stack pointer. 
*/ +- if (style == 2) ++ if (TARGET_GNU2_TLS) + { +- rtx sa = EH_RETURN_STACKADJ_RTX; +- rtx_insn *insn; ++ if (TARGET_64BIT) ++ emit_insn (gen_tls_dynamic_gnu2_64 (dest, x)); ++ else ++ emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic)); + +- /* %ecx can't be used for both DRAP register and eh_return. */ +- if (crtl->drap_reg) +- gcc_assert (REGNO (crtl->drap_reg) != CX_REG); ++ tp = get_thread_pointer (Pmode, true); ++ dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest)); + +- /* regparm nested functions don't work with eh_return. */ +- gcc_assert (!ix86_static_chain_on_stack); ++ if (GET_MODE (x) != Pmode) ++ x = gen_rtx_ZERO_EXTEND (Pmode, x); + +- if (frame_pointer_needed) ++ set_unique_reg_note (get_last_insn (), REG_EQUAL, x); ++ } ++ else ++ { ++ rtx caddr = ix86_tls_get_addr (); ++ ++ if (TARGET_64BIT) + { +- t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa); +- t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD); +- emit_insn (gen_rtx_SET (sa, t)); ++ rtx rax = gen_rtx_REG (Pmode, AX_REG); ++ rtx_insn *insns; + +- t = gen_frame_mem (Pmode, hard_frame_pointer_rtx); +- insn = emit_move_insn (hard_frame_pointer_rtx, t); ++ start_sequence (); ++ emit_call_insn ++ (ix86_gen_tls_global_dynamic_64 (rax, x, caddr)); ++ insns = get_insns (); ++ end_sequence (); + +- /* Note that we use SA as a temporary CFA, as the return +- address is at the proper place relative to it. We +- pretend this happens at the FP restore insn because +- prior to this insn the FP would be stored at the wrong +- offset relative to SA, and after this insn we have no +- other reasonable register to use for the CFA. We don't +- bother resetting the CFA to the SP for the duration of +- the return insn, unless the control flow instrumentation +- is done. In this case the SP is used later and we have +- to reset CFA to SP. */ +- add_reg_note (insn, REG_CFA_DEF_CFA, +- plus_constant (Pmode, sa, UNITS_PER_WORD)); +- ix86_add_queued_cfa_restore_notes (insn); +- add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx); +- RTX_FRAME_RELATED_P (insn) = 1; +- +- m->fs.cfa_reg = sa; +- m->fs.cfa_offset = UNITS_PER_WORD; +- m->fs.fp_valid = false; ++ if (GET_MODE (x) != Pmode) ++ x = gen_rtx_ZERO_EXTEND (Pmode, x); + +- pro_epilogue_adjust_stack (stack_pointer_rtx, sa, +- const0_rtx, style, +- flag_cf_protection); ++ RTL_CONST_CALL_P (insns) = 1; ++ emit_libcall_block (insns, dest, rax, x); + } + else +- { +- t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa); +- t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD); +- insn = emit_insn (gen_rtx_SET (stack_pointer_rtx, t)); +- ix86_add_queued_cfa_restore_notes (insn); +- +- gcc_assert (m->fs.cfa_reg == stack_pointer_rtx); +- if (m->fs.cfa_offset != UNITS_PER_WORD) +- { +- m->fs.cfa_offset = UNITS_PER_WORD; +- add_reg_note (insn, REG_CFA_DEF_CFA, +- plus_constant (Pmode, stack_pointer_rtx, +- UNITS_PER_WORD)); +- RTX_FRAME_RELATED_P (insn) = 1; +- } +- } +- m->fs.sp_offset = UNITS_PER_WORD; +- m->fs.sp_valid = true; +- m->fs.sp_realigned = false; ++ emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr)); + } +- } +- else +- { +- /* SEH requires that the function end with (1) a stack adjustment +- if necessary, (2) a sequence of pops, and (3) a return or +- jump instruction. Prevent insns from the function body from +- being scheduled into this sequence. 
*/ +- if (TARGET_SEH) ++ break; ++ ++ case TLS_MODEL_LOCAL_DYNAMIC: ++ base = gen_reg_rtx (Pmode); ++ ++ if (!TARGET_64BIT) + { +- /* Prevent a catch region from being adjacent to the standard +- epilogue sequence. Unfortunately neither crtl->uses_eh_lsda +- nor several other flags that would be interesting to test are +- set up yet. */ +- if (flag_non_call_exceptions) +- emit_insn (gen_nops (const1_rtx)); ++ if (flag_pic) ++ pic = pic_offset_table_rtx; + else +- emit_insn (gen_blockage ()); ++ { ++ pic = gen_reg_rtx (Pmode); ++ emit_insn (gen_set_got (pic)); ++ } + } + +- /* First step is to deallocate the stack frame so that we can +- pop the registers. If the stack pointer was realigned, it needs +- to be restored now. Also do it on SEH target for very large +- frame as the emitted instructions aren't allowed by the ABI +- in epilogues. */ +- if (!m->fs.sp_valid || m->fs.sp_realigned +- || (TARGET_SEH +- && (m->fs.sp_offset - reg_save_offset +- >= SEH_MAX_FRAME_SIZE))) +- { +- pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx, +- GEN_INT (m->fs.fp_offset +- - reg_save_offset), +- style, false); +- } +- else if (m->fs.sp_offset != reg_save_offset) ++ if (TARGET_GNU2_TLS) + { +- pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, +- GEN_INT (m->fs.sp_offset +- - reg_save_offset), +- style, +- m->fs.cfa_reg == stack_pointer_rtx); +- } ++ rtx tmp = ix86_tls_module_base (); + +- ix86_emit_restore_regs_using_pop (); +- } ++ if (TARGET_64BIT) ++ emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp)); ++ else ++ emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic)); + +- /* If we used a stack pointer and haven't already got rid of it, +- then do so now. */ +- if (m->fs.fp_valid) +- { +- /* If the stack pointer is valid and pointing at the frame +- pointer store address, then we only need a pop. */ +- if (sp_valid_at (frame.hfp_save_offset) +- && m->fs.sp_offset == frame.hfp_save_offset) +- ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx); +- /* Leave results in shorter dependency chains on CPUs that are +- able to grok it fast. 
*/ +- else if (TARGET_USE_LEAVE +- || optimize_bb_for_size_p (EXIT_BLOCK_PTR_FOR_FN (cfun)) +- || !cfun->machine->use_fast_prologue_epilogue) +- ix86_emit_leave (NULL); ++ tp = get_thread_pointer (Pmode, true); ++ set_unique_reg_note (get_last_insn (), REG_EQUAL, ++ gen_rtx_MINUS (Pmode, tmp, tp)); ++ } + else +- { +- pro_epilogue_adjust_stack (stack_pointer_rtx, +- hard_frame_pointer_rtx, +- const0_rtx, style, !using_drap); +- ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx); +- } +- } +- +- if (using_drap) +- { +- int param_ptr_offset = UNITS_PER_WORD; +- rtx_insn *insn; ++ { ++ rtx caddr = ix86_tls_get_addr (); + +- gcc_assert (stack_realign_drap); ++ if (TARGET_64BIT) ++ { ++ rtx rax = gen_rtx_REG (Pmode, AX_REG); ++ rtx_insn *insns; ++ rtx eqv; + +- if (ix86_static_chain_on_stack) +- param_ptr_offset += UNITS_PER_WORD; +- if (!call_used_regs[REGNO (crtl->drap_reg)]) +- param_ptr_offset += UNITS_PER_WORD; ++ start_sequence (); ++ emit_call_insn ++ (ix86_gen_tls_local_dynamic_base_64 (rax, caddr)); ++ insns = get_insns (); ++ end_sequence (); + +- insn = emit_insn (gen_rtx_SET +- (stack_pointer_rtx, +- gen_rtx_PLUS (Pmode, +- crtl->drap_reg, +- GEN_INT (-param_ptr_offset)))); +- m->fs.cfa_reg = stack_pointer_rtx; +- m->fs.cfa_offset = param_ptr_offset; +- m->fs.sp_offset = param_ptr_offset; +- m->fs.realigned = false; ++ /* Attach a unique REG_EQUAL, to allow the RTL optimizers to ++ share the LD_BASE result with other LD model accesses. */ ++ eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx), ++ UNSPEC_TLS_LD_BASE); + +- add_reg_note (insn, REG_CFA_DEF_CFA, +- gen_rtx_PLUS (Pmode, stack_pointer_rtx, +- GEN_INT (param_ptr_offset))); +- RTX_FRAME_RELATED_P (insn) = 1; ++ RTL_CONST_CALL_P (insns) = 1; ++ emit_libcall_block (insns, base, rax, eqv); ++ } ++ else ++ emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr)); ++ } + +- if (!call_used_regs[REGNO (crtl->drap_reg)]) +- ix86_emit_restore_reg_using_pop (crtl->drap_reg); +- } ++ off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF); ++ off = gen_rtx_CONST (Pmode, off); + +- /* At this point the stack pointer must be valid, and we must have +- restored all of the registers. We may not have deallocated the +- entire stack frame. We've delayed this until now because it may +- be possible to merge the local stack deallocation with the +- deallocation forced by ix86_static_chain_on_stack. */ +- gcc_assert (m->fs.sp_valid); +- gcc_assert (!m->fs.sp_realigned); +- gcc_assert (!m->fs.fp_valid); +- gcc_assert (!m->fs.realigned); +- if (m->fs.sp_offset != UNITS_PER_WORD) +- { +- pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, +- GEN_INT (m->fs.sp_offset - UNITS_PER_WORD), +- style, true); +- } +- else +- ix86_add_queued_cfa_restore_notes (get_last_insn ()); ++ dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off)); + +- /* Sibcall epilogues don't want a return instruction. */ +- if (style == 0) +- { +- m->fs = frame_state_save; +- return; +- } ++ if (TARGET_GNU2_TLS) ++ { ++ dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp)); + +- if (cfun->machine->func_type != TYPE_NORMAL) +- emit_jump_insn (gen_interrupt_return ()); +- else if (crtl->args.pops_args && crtl->args.size) +- { +- rtx popc = GEN_INT (crtl->args.pops_args); ++ if (GET_MODE (x) != Pmode) ++ x = gen_rtx_ZERO_EXTEND (Pmode, x); + +- /* i386 can only pop 64K bytes. If asked to pop more, pop return +- address, do explicit add, and jump indirectly to the caller. 
*/ ++ set_unique_reg_note (get_last_insn (), REG_EQUAL, x); ++ } ++ break; + +- if (crtl->args.pops_args >= 65536) ++ case TLS_MODEL_INITIAL_EXEC: ++ if (TARGET_64BIT) + { +- rtx ecx = gen_rtx_REG (SImode, CX_REG); +- rtx_insn *insn; +- +- /* There is no "pascal" calling convention in any 64bit ABI. */ +- gcc_assert (!TARGET_64BIT); +- +- insn = emit_insn (gen_pop (ecx)); +- m->fs.cfa_offset -= UNITS_PER_WORD; +- m->fs.sp_offset -= UNITS_PER_WORD; ++ if (TARGET_SUN_TLS && !TARGET_X32) ++ { ++ /* The Sun linker took the AMD64 TLS spec literally ++ and can only handle %rax as destination of the ++ initial executable code sequence. */ + +- rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD); +- x = gen_rtx_SET (stack_pointer_rtx, x); +- add_reg_note (insn, REG_CFA_ADJUST_CFA, x); +- add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx)); +- RTX_FRAME_RELATED_P (insn) = 1; ++ dest = gen_reg_rtx (DImode); ++ emit_insn (gen_tls_initial_exec_64_sun (dest, x)); ++ return dest; ++ } + +- pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, +- popc, -1, true); +- emit_jump_insn (gen_simple_return_indirect_internal (ecx)); ++ /* Generate DImode references to avoid %fs:(%reg32) ++ problems and linker IE->LE relaxation bug. */ ++ tp_mode = DImode; ++ pic = NULL; ++ type = UNSPEC_GOTNTPOFF; + } +- else +- emit_jump_insn (gen_simple_return_pop_internal (popc)); +- } +- else if (!m->call_ms2sysv || !restore_stub_is_tail) +- { +- /* In case of return from EH a simple return cannot be used +- as a return address will be compared with a shadow stack +- return address. Use indirect jump instead. */ +- if (style == 2 && flag_cf_protection) ++ else if (flag_pic) + { +- /* Register used in indirect jump must be in word_mode. But +- Pmode may not be the same as word_mode for x32. */ +- rtx ecx = gen_rtx_REG (word_mode, CX_REG); +- rtx_insn *insn; +- +- insn = emit_insn (gen_pop (ecx)); +- m->fs.cfa_offset -= UNITS_PER_WORD; +- m->fs.sp_offset -= UNITS_PER_WORD; +- +- rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD); +- x = gen_rtx_SET (stack_pointer_rtx, x); +- add_reg_note (insn, REG_CFA_ADJUST_CFA, x); +- add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx)); +- RTX_FRAME_RELATED_P (insn) = 1; +- +- emit_jump_insn (gen_simple_return_indirect_internal (ecx)); ++ pic = pic_offset_table_rtx; ++ type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF; ++ } ++ else if (!TARGET_ANY_GNU_TLS) ++ { ++ pic = gen_reg_rtx (Pmode); ++ emit_insn (gen_set_got (pic)); ++ type = UNSPEC_GOTTPOFF; + } + else +- emit_jump_insn (gen_simple_return_internal ()); +- } +- +- /* Restore the state back to the state from the prologue, +- so that it's correct for the next epilogue. */ +- m->fs = frame_state_save; +-} ++ { ++ pic = NULL; ++ type = UNSPEC_INDNTPOFF; ++ } + +-/* Reset from the function's potential modifications. 
*/ ++ off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type); ++ off = gen_rtx_CONST (tp_mode, off); ++ if (pic) ++ off = gen_rtx_PLUS (tp_mode, pic, off); ++ off = gen_const_mem (tp_mode, off); ++ set_mem_alias_set (off, ix86_GOT_alias_set ()); + +-static void +-ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED) +-{ +- if (pic_offset_table_rtx +- && !ix86_use_pseudo_pic_reg ()) +- SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM); ++ if (TARGET_64BIT || TARGET_ANY_GNU_TLS) ++ { ++ base = get_thread_pointer (tp_mode, ++ for_mov || !TARGET_TLS_DIRECT_SEG_REFS); ++ off = force_reg (tp_mode, off); ++ dest = gen_rtx_PLUS (tp_mode, base, off); ++ if (tp_mode != Pmode) ++ dest = convert_to_mode (Pmode, dest, 1); ++ } ++ else ++ { ++ base = get_thread_pointer (Pmode, true); ++ dest = gen_reg_rtx (Pmode); ++ emit_insn (ix86_gen_sub3 (dest, base, off)); ++ } ++ break; + +- if (TARGET_MACHO) +- { +- rtx_insn *insn = get_last_insn (); +- rtx_insn *deleted_debug_label = NULL; ++ case TLS_MODEL_LOCAL_EXEC: ++ off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), ++ (TARGET_64BIT || TARGET_ANY_GNU_TLS) ++ ? UNSPEC_NTPOFF : UNSPEC_TPOFF); ++ off = gen_rtx_CONST (Pmode, off); + +- /* Mach-O doesn't support labels at the end of objects, so if +- it looks like we might want one, take special action. +- First, collect any sequence of deleted debug labels. */ +- while (insn +- && NOTE_P (insn) +- && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL) ++ if (TARGET_64BIT || TARGET_ANY_GNU_TLS) + { +- /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL +- notes only, instead set their CODE_LABEL_NUMBER to -1, +- otherwise there would be code generation differences +- in between -g and -g0. */ +- if (NOTE_P (insn) && NOTE_KIND (insn) +- == NOTE_INSN_DELETED_DEBUG_LABEL) +- deleted_debug_label = insn; +- insn = PREV_INSN (insn); ++ base = get_thread_pointer (Pmode, ++ for_mov || !TARGET_TLS_DIRECT_SEG_REFS); ++ return gen_rtx_PLUS (Pmode, base, off); ++ } ++ else ++ { ++ base = get_thread_pointer (Pmode, true); ++ dest = gen_reg_rtx (Pmode); ++ emit_insn (ix86_gen_sub3 (dest, base, off)); + } ++ break; + +- /* If we have: +- label: +- barrier +- then this needs to be detected, so skip past the barrier. */ ++ default: ++ gcc_unreachable (); ++ } + +- if (insn && BARRIER_P (insn)) +- insn = PREV_INSN (insn); ++ return dest; ++} + +- /* Up to now we've only seen notes or barriers. */ +- if (insn) ++/* Return true if OP refers to a TLS address. */ ++bool ++ix86_tls_address_pattern_p (rtx op) ++{ ++ subrtx_var_iterator::array_type array; ++ FOR_EACH_SUBRTX_VAR (iter, array, op, ALL) ++ { ++ rtx op = *iter; ++ if (MEM_P (op)) + { +- if (LABEL_P (insn) +- || (NOTE_P (insn) +- && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)) +- /* Trailing label. */ +- fputs ("\tnop\n", file); +- else if (cfun && ! cfun->is_thunk) ++ rtx *x = &XEXP (op, 0); ++ while (GET_CODE (*x) == PLUS) + { +- /* See if we have a completely empty function body, skipping +- the special case of the picbase thunk emitted as asm. */ +- while (insn && ! INSN_P (insn)) +- insn = PREV_INSN (insn); +- /* If we don't find any insns, we've got an empty function body; +- I.e. completely empty - without a return or branch. This is +- taken as the case where a function body has been removed +- because it contains an inline __builtin_unreachable(). GCC +- declares that reaching __builtin_unreachable() means UB so +- we're not obliged to do anything special; however, we want +- non-zero-sized function bodies. 
To meet this, and help the +- user out, let's trap the case. */ +- if (insn == NULL) +- fputs ("\tud2\n", file); ++ int i; ++ for (i = 0; i < 2; i++) ++ { ++ rtx u = XEXP (*x, i); ++ if (GET_CODE (u) == ZERO_EXTEND) ++ u = XEXP (u, 0); ++ if (GET_CODE (u) == UNSPEC ++ && XINT (u, 1) == UNSPEC_TP) ++ return true; ++ } ++ x = &XEXP (*x, 0); + } ++ ++ iter.skip_subrtxes (); + } +- else if (deleted_debug_label) +- for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn)) +- if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL) +- CODE_LABEL_NUMBER (insn) = -1; + } +-} + +-/* Return a scratch register to use in the split stack prologue. The +- split stack prologue is used for -fsplit-stack. It is the first +- instructions in the function, even before the regular prologue. +- The scratch register can be any caller-saved register which is not +- used for parameters or for the static chain. */ ++ return false; ++} + +-static unsigned int +-split_stack_prologue_scratch_regno (void) ++/* Rewrite *LOC so that it refers to a default TLS address space. */ ++void ++ix86_rewrite_tls_address_1 (rtx *loc) + { +- if (TARGET_64BIT) +- return R11_REG; +- else ++ subrtx_ptr_iterator::array_type array; ++ FOR_EACH_SUBRTX_PTR (iter, array, loc, ALL) + { +- bool is_fastcall, is_thiscall; +- int regparm; +- +- is_fastcall = (lookup_attribute ("fastcall", +- TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl))) +- != NULL); +- is_thiscall = (lookup_attribute ("thiscall", +- TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl))) +- != NULL); +- regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl); +- +- if (is_fastcall) +- { +- if (DECL_STATIC_CHAIN (cfun->decl)) +- { +- sorry ("%<-fsplit-stack%> does not support fastcall with " +- "nested function"); +- return INVALID_REGNUM; +- } +- return AX_REG; +- } +- else if (is_thiscall) +- { +- if (!DECL_STATIC_CHAIN (cfun->decl)) +- return DX_REG; +- return AX_REG; +- } +- else if (regparm < 3) ++ rtx *loc = *iter; ++ if (MEM_P (*loc)) + { +- if (!DECL_STATIC_CHAIN (cfun->decl)) +- return CX_REG; +- else ++ rtx addr = XEXP (*loc, 0); ++ rtx *x = &addr; ++ while (GET_CODE (*x) == PLUS) + { +- if (regparm >= 2) ++ int i; ++ for (i = 0; i < 2; i++) + { +- sorry ("%<-fsplit-stack%> does not support 2 register " +- "parameters for a nested function"); +- return INVALID_REGNUM; ++ rtx u = XEXP (*x, i); ++ if (GET_CODE (u) == ZERO_EXTEND) ++ u = XEXP (u, 0); ++ if (GET_CODE (u) == UNSPEC ++ && XINT (u, 1) == UNSPEC_TP) ++ { ++ addr_space_t as = DEFAULT_TLS_SEG_REG; ++ ++ *x = XEXP (*x, 1 - i); ++ ++ *loc = replace_equiv_address_nv (*loc, addr, true); ++ set_mem_addr_space (*loc, as); ++ return; ++ } + } +- return DX_REG; ++ x = &XEXP (*x, 0); + } +- } +- else +- { +- /* FIXME: We could make this work by pushing a register +- around the addition and comparison. */ +- sorry ("%<-fsplit-stack%> does not support 3 register parameters"); +- return INVALID_REGNUM; ++ ++ iter.skip_subrtxes (); + } + } + } + +-/* A SYMBOL_REF for the function which allocates new stackspace for +- -fsplit-stack. */ +- +-static GTY(()) rtx split_stack_fn; +- +-/* A SYMBOL_REF for the more stack function when using the large +- model. */ +- +-static GTY(()) rtx split_stack_fn_large; ++/* Rewrite instruction pattern involvning TLS address ++ so that it refers to a default TLS address space. */ ++rtx ++ix86_rewrite_tls_address (rtx pattern) ++{ ++ pattern = copy_insn (pattern); ++ ix86_rewrite_tls_address_1 (&pattern); ++ return pattern; ++} + +-/* Return location of the stack guard value in the TLS block. 
*/ ++/* Create or return the unique __imp_DECL dllimport symbol corresponding ++ to symbol DECL if BEIMPORT is true. Otherwise create or return the ++ unique refptr-DECL symbol corresponding to symbol DECL. */ + +-rtx +-ix86_split_stack_guard (void) ++struct dllimport_hasher : ggc_cache_ptr_hash + { +- int offset; +- addr_space_t as = DEFAULT_TLS_SEG_REG; +- rtx r; ++ static inline hashval_t hash (tree_map *m) { return m->hash; } ++ static inline bool ++ equal (tree_map *a, tree_map *b) ++ { ++ return a->base.from == b->base.from; ++ } + +- gcc_assert (flag_split_stack); ++ static int ++ keep_cache_entry (tree_map *&m) ++ { ++ return ggc_marked_p (m->base.from); ++ } ++}; + +-#ifdef TARGET_THREAD_SPLIT_STACK_OFFSET +- offset = TARGET_THREAD_SPLIT_STACK_OFFSET; +-#else +- gcc_unreachable (); ++static GTY((cache)) hash_table *dllimport_map; ++ ++static tree ++get_dllimport_decl (tree decl, bool beimport) ++{ ++ struct tree_map *h, in; ++ const char *name; ++ const char *prefix; ++ size_t namelen, prefixlen; ++ char *imp_name; ++ tree to; ++ rtx rtl; ++ ++ if (!dllimport_map) ++ dllimport_map = hash_table::create_ggc (512); ++ ++ in.hash = htab_hash_pointer (decl); ++ in.base.from = decl; ++ tree_map **loc = dllimport_map->find_slot_with_hash (&in, in.hash, INSERT); ++ h = *loc; ++ if (h) ++ return h->to; ++ ++ *loc = h = ggc_alloc (); ++ h->hash = in.hash; ++ h->base.from = decl; ++ h->to = to = build_decl (DECL_SOURCE_LOCATION (decl), ++ VAR_DECL, NULL, ptr_type_node); ++ DECL_ARTIFICIAL (to) = 1; ++ DECL_IGNORED_P (to) = 1; ++ DECL_EXTERNAL (to) = 1; ++ TREE_READONLY (to) = 1; ++ ++ name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)); ++ name = targetm.strip_name_encoding (name); ++ if (beimport) ++ prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0 ++ ? "*__imp_" : "*__imp__"; ++ else ++ prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr."; ++ namelen = strlen (name); ++ prefixlen = strlen (prefix); ++ imp_name = (char *) alloca (namelen + prefixlen + 1); ++ memcpy (imp_name, prefix, prefixlen); ++ memcpy (imp_name + prefixlen, name, namelen + 1); ++ ++ name = ggc_alloc_string (imp_name, namelen + prefixlen); ++ rtl = gen_rtx_SYMBOL_REF (Pmode, name); ++ SET_SYMBOL_REF_DECL (rtl, to); ++ SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR; ++ if (!beimport) ++ { ++ SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL; ++#ifdef SUB_TARGET_RECORD_STUB ++ SUB_TARGET_RECORD_STUB (name); + #endif ++ } + +- r = GEN_INT (offset); +- r = gen_const_mem (Pmode, r); +- set_mem_addr_space (r, as); ++ rtl = gen_const_mem (Pmode, rtl); ++ set_mem_alias_set (rtl, ix86_GOT_alias_set ()); + +- return r; ++ SET_DECL_RTL (to, rtl); ++ SET_DECL_ASSEMBLER_NAME (to, get_identifier (name)); ++ ++ return to; + } + +-/* Handle -fsplit-stack. These are the first instructions in the +- function, even before the regular prologue. */ ++/* Expand SYMBOL into its corresponding far-address symbol. ++ WANT_REG is true if we require the result be a register. 
*/ + +-void +-ix86_expand_split_stack_prologue (void) ++static rtx ++legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg) + { +- HOST_WIDE_INT allocate; +- unsigned HOST_WIDE_INT args_size; +- rtx_code_label *label; +- rtx limit, current, allocate_rtx, call_fusage; +- rtx_insn *call_insn; +- rtx scratch_reg = NULL_RTX; +- rtx_code_label *varargs_label = NULL; +- rtx fn; ++ tree imp_decl; ++ rtx x; + +- gcc_assert (flag_split_stack && reload_completed); ++ gcc_assert (SYMBOL_REF_DECL (symbol)); ++ imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false); + +- ix86_finalize_stack_frame_flags (); +- struct ix86_frame &frame = cfun->machine->frame; +- allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET; ++ x = DECL_RTL (imp_decl); ++ if (want_reg) ++ x = force_reg (Pmode, x); ++ return x; ++} + +- /* This is the label we will branch to if we have enough stack +- space. We expect the basic block reordering pass to reverse this +- branch if optimizing, so that we branch in the unlikely case. */ +- label = gen_label_rtx (); ++/* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is ++ true if we require the result be a register. */ + +- /* We need to compare the stack pointer minus the frame size with +- the stack boundary in the TCB. The stack boundary always gives +- us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we +- can compare directly. Otherwise we need to do an addition. */ ++static rtx ++legitimize_dllimport_symbol (rtx symbol, bool want_reg) ++{ ++ tree imp_decl; ++ rtx x; + +- limit = ix86_split_stack_guard (); ++ gcc_assert (SYMBOL_REF_DECL (symbol)); ++ imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true); + +- if (allocate < SPLIT_STACK_AVAILABLE) +- current = stack_pointer_rtx; +- else +- { +- unsigned int scratch_regno; +- rtx offset; ++ x = DECL_RTL (imp_decl); ++ if (want_reg) ++ x = force_reg (Pmode, x); ++ return x; ++} + +- /* We need a scratch register to hold the stack pointer minus +- the required frame size. Since this is the very start of the +- function, the scratch register can be any caller-saved +- register which is not used for parameters. */ +- offset = GEN_INT (- allocate); +- scratch_regno = split_stack_prologue_scratch_regno (); +- if (scratch_regno == INVALID_REGNUM) +- return; +- scratch_reg = gen_rtx_REG (Pmode, scratch_regno); +- if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode)) +- { +- /* We don't use ix86_gen_add3 in this case because it will +- want to split to lea, but when not optimizing the insn +- will not be split after this point. */ +- emit_insn (gen_rtx_SET (scratch_reg, +- gen_rtx_PLUS (Pmode, stack_pointer_rtx, +- offset))); +- } +- else ++/* Expand SYMBOL into its corresponding dllimport or refptr symbol. WANT_REG ++ is true if we require the result be a register. 
*/ ++ ++rtx ++legitimize_pe_coff_symbol (rtx addr, bool inreg) ++{ ++ if (!TARGET_PECOFF) ++ return NULL_RTX; ++ ++ if (TARGET_DLLIMPORT_DECL_ATTRIBUTES) ++ { ++ if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr)) ++ return legitimize_dllimport_symbol (addr, inreg); ++ if (GET_CODE (addr) == CONST ++ && GET_CODE (XEXP (addr, 0)) == PLUS ++ && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF ++ && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0))) + { +- emit_move_insn (scratch_reg, offset); +- emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg, +- stack_pointer_rtx)); ++ rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg); ++ return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1)); + } +- current = scratch_reg; + } + +- ix86_expand_branch (GEU, current, limit, label); +- rtx_insn *jump_insn = get_last_insn (); +- JUMP_LABEL (jump_insn) = label; +- +- /* Mark the jump as very likely to be taken. */ +- add_reg_br_prob_note (jump_insn, profile_probability::very_likely ()); ++ if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC) ++ return NULL_RTX; ++ if (GET_CODE (addr) == SYMBOL_REF ++ && !is_imported_p (addr) ++ && SYMBOL_REF_EXTERNAL_P (addr) ++ && SYMBOL_REF_DECL (addr)) ++ return legitimize_pe_coff_extern_decl (addr, inreg); + +- if (split_stack_fn == NULL_RTX) ++ if (GET_CODE (addr) == CONST ++ && GET_CODE (XEXP (addr, 0)) == PLUS ++ && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF ++ && !is_imported_p (XEXP (XEXP (addr, 0), 0)) ++ && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0)) ++ && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0))) + { +- split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack"); +- SYMBOL_REF_FLAGS (split_stack_fn) |= SYMBOL_FLAG_LOCAL; ++ rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg); ++ return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1)); + } +- fn = split_stack_fn; ++ return NULL_RTX; ++} + +- /* Get more stack space. We pass in the desired stack space and the +- size of the arguments to copy to the new stack. In 32-bit mode +- we push the parameters; __morestack will return on a new stack +- anyhow. In 64-bit mode we pass the parameters in r10 and +- r11. */ +- allocate_rtx = GEN_INT (allocate); +- args_size = crtl->args.size >= 0 ? (HOST_WIDE_INT) crtl->args.size : 0; +- call_fusage = NULL_RTX; +- rtx pop = NULL_RTX; +- if (TARGET_64BIT) +- { +- rtx reg10, reg11; ++/* Try machine-dependent ways of modifying an illegitimate address ++ to be legitimate. If we find one, return the new, valid address. ++ This macro is used in only one place: `memory_address' in explow.c. + +- reg10 = gen_rtx_REG (Pmode, R10_REG); +- reg11 = gen_rtx_REG (Pmode, R11_REG); ++ OLDX is the address as it was before break_out_memory_refs was called. ++ In some cases it is useful to look at this to decide what needs to be done. + +- /* If this function uses a static chain, it will be in %r10. +- Preserve it across the call to __morestack. */ +- if (DECL_STATIC_CHAIN (cfun->decl)) +- { +- rtx rax; ++ It is always safe for this macro to do nothing. It exists to recognize ++ opportunities to optimize the output. + +- rax = gen_rtx_REG (word_mode, AX_REG); +- emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG)); +- use_reg (&call_fusage, rax); +- } +- +- if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC) +- && !TARGET_PECOFF) +- { +- HOST_WIDE_INT argval; +- +- gcc_assert (Pmode == DImode); +- /* When using the large model we need to load the address +- into a register, and we've run out of registers. 
So we +- switch to a different calling convention, and we call a +- different function: __morestack_large. We pass the +- argument size in the upper 32 bits of r10 and pass the +- frame size in the lower 32 bits. */ +- gcc_assert ((allocate & HOST_WIDE_INT_C (0xffffffff)) == allocate); +- gcc_assert ((args_size & 0xffffffff) == args_size); +- +- if (split_stack_fn_large == NULL_RTX) +- { +- split_stack_fn_large +- = gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model"); +- SYMBOL_REF_FLAGS (split_stack_fn_large) |= SYMBOL_FLAG_LOCAL; +- } +- if (ix86_cmodel == CM_LARGE_PIC) +- { +- rtx_code_label *label; +- rtx x; +- +- label = gen_label_rtx (); +- emit_label (label); +- LABEL_PRESERVE_P (label) = 1; +- emit_insn (gen_set_rip_rex64 (reg10, label)); +- emit_insn (gen_set_got_offset_rex64 (reg11, label)); +- emit_insn (ix86_gen_add3 (reg10, reg10, reg11)); +- x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large), +- UNSPEC_GOT); +- x = gen_rtx_CONST (Pmode, x); +- emit_move_insn (reg11, x); +- x = gen_rtx_PLUS (Pmode, reg10, reg11); +- x = gen_const_mem (Pmode, x); +- emit_move_insn (reg11, x); +- } +- else +- emit_move_insn (reg11, split_stack_fn_large); ++ For the 80386, we handle X+REG by loading X into a register R and ++ using R+REG. R will go in a general reg and indexing will be used. ++ However, if REG is a broken-out memory address or multiplication, ++ nothing needs to be done because REG can certainly go in a general reg. + +- fn = reg11; ++ When -fpic is used, special handling is needed for symbolic references. ++ See comments by legitimize_pic_address in i386.c for details. */ + +- argval = ((args_size << 16) << 16) + allocate; +- emit_move_insn (reg10, GEN_INT (argval)); +- } +- else +- { +- emit_move_insn (reg10, allocate_rtx); +- emit_move_insn (reg11, GEN_INT (args_size)); +- use_reg (&call_fusage, reg11); +- } ++static rtx ++ix86_legitimize_address (rtx x, rtx, machine_mode mode) ++{ ++ bool changed = false; ++ unsigned log; + +- use_reg (&call_fusage, reg10); ++ log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0; ++ if (log) ++ return legitimize_tls_address (x, (enum tls_model) log, false); ++ if (GET_CODE (x) == CONST ++ && GET_CODE (XEXP (x, 0)) == PLUS ++ && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF ++ && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0)))) ++ { ++ rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0), ++ (enum tls_model) log, false); ++ return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1)); + } +- else ++ ++ if (TARGET_DLLIMPORT_DECL_ATTRIBUTES) + { +- rtx_insn *insn = emit_insn (gen_push (GEN_INT (args_size))); +- add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (UNITS_PER_WORD)); +- insn = emit_insn (gen_push (allocate_rtx)); +- add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (2 * UNITS_PER_WORD)); +- pop = GEN_INT (2 * UNITS_PER_WORD); ++ rtx tmp = legitimize_pe_coff_symbol (x, true); ++ if (tmp) ++ return tmp; + } +- call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn), +- GEN_INT (UNITS_PER_WORD), constm1_rtx, +- pop, false); +- add_function_usage_to (call_insn, call_fusage); +- if (!TARGET_64BIT) +- add_reg_note (call_insn, REG_ARGS_SIZE, GEN_INT (0)); +- /* Indicate that this function can't jump to non-local gotos. */ +- make_reg_eh_region_note_nothrow_nononlocal (call_insn); + +- /* In order to make call/return prediction work right, we now need +- to execute a return instruction. See +- libgcc/config/i386/morestack.S for the details on how this works. 
++ if (flag_pic && SYMBOLIC_CONST (x)) ++ return legitimize_pic_address (x, 0); + +- For flow purposes gcc must not see this as a return +- instruction--we need control flow to continue at the subsequent +- label. Therefore, we use an unspec. */ +- gcc_assert (crtl->args.pops_args < 65536); +- rtx_insn *ret_insn +- = emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args))); ++#if TARGET_MACHO ++ if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x)) ++ return machopic_indirect_data_reference (x, 0); ++#endif + +- if ((flag_cf_protection & CF_BRANCH)) ++ /* Canonicalize shifts by 0, 1, 2, 3 into multiply */ ++ if (GET_CODE (x) == ASHIFT ++ && CONST_INT_P (XEXP (x, 1)) ++ && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4) + { +- /* Insert ENDBR since __morestack will jump back here via indirect +- call. */ +- rtx cet_eb = gen_nop_endbr (); +- emit_insn_after (cet_eb, ret_insn); ++ changed = true; ++ log = INTVAL (XEXP (x, 1)); ++ x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)), ++ GEN_INT (1 << log)); + } + +- /* If we are in 64-bit mode and this function uses a static chain, +- we saved %r10 in %rax before calling _morestack. */ +- if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl)) +- emit_move_insn (gen_rtx_REG (word_mode, R10_REG), +- gen_rtx_REG (word_mode, AX_REG)); +- +- /* If this function calls va_start, we need to store a pointer to +- the arguments on the old stack, because they may not have been +- all copied to the new stack. At this point the old stack can be +- found at the frame pointer value used by __morestack, because +- __morestack has set that up before calling back to us. Here we +- store that pointer in a scratch register, and in +- ix86_expand_prologue we store the scratch register in a stack +- slot. */ +- if (cfun->machine->split_stack_varargs_pointer != NULL_RTX) ++ if (GET_CODE (x) == PLUS) + { +- unsigned int scratch_regno; +- rtx frame_reg; +- int words; ++ /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */ + +- scratch_regno = split_stack_prologue_scratch_regno (); +- scratch_reg = gen_rtx_REG (Pmode, scratch_regno); +- frame_reg = gen_rtx_REG (Pmode, BP_REG); ++ if (GET_CODE (XEXP (x, 0)) == ASHIFT ++ && CONST_INT_P (XEXP (XEXP (x, 0), 1)) ++ && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4) ++ { ++ changed = true; ++ log = INTVAL (XEXP (XEXP (x, 0), 1)); ++ XEXP (x, 0) = gen_rtx_MULT (Pmode, ++ force_reg (Pmode, XEXP (XEXP (x, 0), 0)), ++ GEN_INT (1 << log)); ++ } + +- /* 64-bit: +- fp -> old fp value +- return address within this function +- return address of caller of this function +- stack arguments +- So we add three words to get to the stack arguments. ++ if (GET_CODE (XEXP (x, 1)) == ASHIFT ++ && CONST_INT_P (XEXP (XEXP (x, 1), 1)) ++ && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4) ++ { ++ changed = true; ++ log = INTVAL (XEXP (XEXP (x, 1), 1)); ++ XEXP (x, 1) = gen_rtx_MULT (Pmode, ++ force_reg (Pmode, XEXP (XEXP (x, 1), 0)), ++ GEN_INT (1 << log)); ++ } + +- 32-bit: +- fp -> old fp value +- return address within this function +- first argument to __morestack +- second argument to __morestack +- return address of caller of this function +- stack arguments +- So we add five words to get to the stack arguments. +- */ +- words = TARGET_64BIT ? 3 : 5; +- emit_insn (gen_rtx_SET (scratch_reg, +- gen_rtx_PLUS (Pmode, frame_reg, +- GEN_INT (words * UNITS_PER_WORD)))); ++ /* Put multiply first if it isn't already. 
*/ ++ if (GET_CODE (XEXP (x, 1)) == MULT) ++ { ++ std::swap (XEXP (x, 0), XEXP (x, 1)); ++ changed = true; ++ } + +- varargs_label = gen_label_rtx (); +- emit_jump_insn (gen_jump (varargs_label)); +- JUMP_LABEL (get_last_insn ()) = varargs_label; ++ /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const))) ++ into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be ++ created by virtual register instantiation, register elimination, and ++ similar optimizations. */ ++ if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS) ++ { ++ changed = true; ++ x = gen_rtx_PLUS (Pmode, ++ gen_rtx_PLUS (Pmode, XEXP (x, 0), ++ XEXP (XEXP (x, 1), 0)), ++ XEXP (XEXP (x, 1), 1)); ++ } + +- emit_barrier (); +- } ++ /* Canonicalize ++ (plus (plus (mult (reg) (const)) (plus (reg) (const))) const) ++ into (plus (plus (mult (reg) (const)) (reg)) (const)). */ ++ else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS ++ && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT ++ && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS ++ && CONSTANT_P (XEXP (x, 1))) ++ { ++ rtx constant; ++ rtx other = NULL_RTX; + +- emit_label (label); +- LABEL_NUSES (label) = 1; ++ if (CONST_INT_P (XEXP (x, 1))) ++ { ++ constant = XEXP (x, 1); ++ other = XEXP (XEXP (XEXP (x, 0), 1), 1); ++ } ++ else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1))) ++ { ++ constant = XEXP (XEXP (XEXP (x, 0), 1), 1); ++ other = XEXP (x, 1); ++ } ++ else ++ constant = 0; + +- /* If this function calls va_start, we now have to set the scratch +- register for the case where we do not call __morestack. In this +- case we need to set it based on the stack pointer. */ +- if (cfun->machine->split_stack_varargs_pointer != NULL_RTX) +- { +- emit_insn (gen_rtx_SET (scratch_reg, +- gen_rtx_PLUS (Pmode, stack_pointer_rtx, +- GEN_INT (UNITS_PER_WORD)))); ++ if (constant) ++ { ++ changed = true; ++ x = gen_rtx_PLUS (Pmode, ++ gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0), ++ XEXP (XEXP (XEXP (x, 0), 1), 0)), ++ plus_constant (Pmode, other, ++ INTVAL (constant))); ++ } ++ } + +- emit_label (varargs_label); +- LABEL_NUSES (varargs_label) = 1; +- } +-} +- +-/* We may have to tell the dataflow pass that the split stack prologue +- is initializing a scratch register. */ +- +-static void +-ix86_live_on_entry (bitmap regs) +-{ +- if (cfun->machine->split_stack_varargs_pointer != NULL_RTX) +- { +- gcc_assert (flag_split_stack); +- bitmap_set_bit (regs, split_stack_prologue_scratch_regno ()); +- } +-} +- +-/* Extract the parts of an RTL expression that is a valid memory address +- for an instruction. Return 0 if the structure of the address is +- grossly off. Return -1 if the address contains ASHIFT, so it is not +- strictly valid, but still used for computing length of lea instruction. */ +- +-int +-ix86_decompose_address (rtx addr, struct ix86_address *out) +-{ +- rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX; +- rtx base_reg, index_reg; +- HOST_WIDE_INT scale = 1; +- rtx scale_rtx = NULL_RTX; +- rtx tmp; +- int retval = 1; +- addr_space_t seg = ADDR_SPACE_GENERIC; ++ if (changed && ix86_legitimate_address_p (mode, x, false)) ++ return x; + +- /* Allow zero-extended SImode addresses, +- they will be emitted with addr32 prefix. 
*/ +- if (TARGET_64BIT && GET_MODE (addr) == DImode) +- { +- if (GET_CODE (addr) == ZERO_EXTEND +- && GET_MODE (XEXP (addr, 0)) == SImode) +- { +- addr = XEXP (addr, 0); +- if (CONST_INT_P (addr)) +- return 0; +- } +- else if (GET_CODE (addr) == AND +- && const_32bit_mask (XEXP (addr, 1), DImode)) ++ if (GET_CODE (XEXP (x, 0)) == MULT) + { +- addr = lowpart_subreg (SImode, XEXP (addr, 0), DImode); +- if (addr == NULL_RTX) +- return 0; +- +- if (CONST_INT_P (addr)) +- return 0; ++ changed = true; ++ XEXP (x, 0) = copy_addr_to_reg (XEXP (x, 0)); + } +- } + +- /* Allow SImode subregs of DImode addresses, +- they will be emitted with addr32 prefix. */ +- if (TARGET_64BIT && GET_MODE (addr) == SImode) +- { +- if (SUBREG_P (addr) +- && GET_MODE (SUBREG_REG (addr)) == DImode) ++ if (GET_CODE (XEXP (x, 1)) == MULT) + { +- addr = SUBREG_REG (addr); +- if (CONST_INT_P (addr)) +- return 0; ++ changed = true; ++ XEXP (x, 1) = copy_addr_to_reg (XEXP (x, 1)); + } +- } + +- if (REG_P (addr)) +- base = addr; +- else if (SUBREG_P (addr)) +- { +- if (REG_P (SUBREG_REG (addr))) +- base = addr; +- else +- return 0; +- } +- else if (GET_CODE (addr) == PLUS) +- { +- rtx addends[4], op; +- int n = 0, i; ++ if (changed ++ && REG_P (XEXP (x, 1)) ++ && REG_P (XEXP (x, 0))) ++ return x; + +- op = addr; +- do ++ if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1))) + { +- if (n >= 4) +- return 0; +- addends[n++] = XEXP (op, 1); +- op = XEXP (op, 0); ++ changed = true; ++ x = legitimize_pic_address (x, 0); + } +- while (GET_CODE (op) == PLUS); +- if (n >= 4) +- return 0; +- addends[n] = op; + +- for (i = n; i >= 0; --i) ++ if (changed && ix86_legitimate_address_p (mode, x, false)) ++ return x; ++ ++ if (REG_P (XEXP (x, 0))) + { +- op = addends[i]; +- switch (GET_CODE (op)) ++ rtx temp = gen_reg_rtx (Pmode); ++ rtx val = force_operand (XEXP (x, 1), temp); ++ if (val != temp) + { +- case MULT: +- if (index) +- return 0; +- index = XEXP (op, 0); +- scale_rtx = XEXP (op, 1); +- break; +- +- case ASHIFT: +- if (index) +- return 0; +- index = XEXP (op, 0); +- tmp = XEXP (op, 1); +- if (!CONST_INT_P (tmp)) +- return 0; +- scale = INTVAL (tmp); +- if ((unsigned HOST_WIDE_INT) scale > 3) +- return 0; +- scale = 1 << scale; +- break; +- +- case ZERO_EXTEND: +- op = XEXP (op, 0); +- if (GET_CODE (op) != UNSPEC) +- return 0; +- /* FALLTHRU */ +- +- case UNSPEC: +- if (XINT (op, 1) == UNSPEC_TP +- && TARGET_TLS_DIRECT_SEG_REFS +- && seg == ADDR_SPACE_GENERIC) +- seg = DEFAULT_TLS_SEG_REG; +- else +- return 0; +- break; +- +- case SUBREG: +- if (!REG_P (SUBREG_REG (op))) +- return 0; +- /* FALLTHRU */ +- +- case REG: +- if (!base) +- base = op; +- else if (!index) +- index = op; +- else +- return 0; +- break; ++ val = convert_to_mode (Pmode, val, 1); ++ emit_move_insn (temp, val); ++ } + +- case CONST: +- case CONST_INT: +- case SYMBOL_REF: +- case LABEL_REF: +- if (disp) +- return 0; +- disp = op; +- break; ++ XEXP (x, 1) = temp; ++ return x; ++ } + +- default: +- return 0; ++ else if (REG_P (XEXP (x, 1))) ++ { ++ rtx temp = gen_reg_rtx (Pmode); ++ rtx val = force_operand (XEXP (x, 0), temp); ++ if (val != temp) ++ { ++ val = convert_to_mode (Pmode, val, 1); ++ emit_move_insn (temp, val); + } ++ ++ XEXP (x, 0) = temp; ++ return x; + } + } +- else if (GET_CODE (addr) == MULT) +- { +- index = XEXP (addr, 0); /* index*scale */ +- scale_rtx = XEXP (addr, 1); +- } +- else if (GET_CODE (addr) == ASHIFT) +- { +- /* We're called for lea too, which implements ashift on occasion. 
*/ +- index = XEXP (addr, 0); +- tmp = XEXP (addr, 1); +- if (!CONST_INT_P (tmp)) +- return 0; +- scale = INTVAL (tmp); +- if ((unsigned HOST_WIDE_INT) scale > 3) +- return 0; +- scale = 1 << scale; +- retval = -1; +- } +- else +- disp = addr; /* displacement */ + +- if (index) +- { +- if (REG_P (index)) +- ; +- else if (SUBREG_P (index) +- && REG_P (SUBREG_REG (index))) +- ; +- else +- return 0; +- } ++ return x; ++} ++ ++/* Print an integer constant expression in assembler syntax. Addition ++ and subtraction are the only arithmetic that may appear in these ++ expressions. FILE is the stdio stream to write to, X is the rtx, and ++ CODE is the operand print code from the output string. */ + +- /* Extract the integral value of scale. */ +- if (scale_rtx) ++static void ++output_pic_addr_const (FILE *file, rtx x, int code) ++{ ++ char buf[256]; ++ ++ switch (GET_CODE (x)) + { +- if (!CONST_INT_P (scale_rtx)) +- return 0; +- scale = INTVAL (scale_rtx); +- } ++ case PC: ++ gcc_assert (flag_pic); ++ putc ('.', file); ++ break; + +- base_reg = base && SUBREG_P (base) ? SUBREG_REG (base) : base; +- index_reg = index && SUBREG_P (index) ? SUBREG_REG (index) : index; ++ case SYMBOL_REF: ++ if (TARGET_64BIT || ! TARGET_MACHO_SYMBOL_STUBS) ++ output_addr_const (file, x); ++ else ++ { ++ const char *name = XSTR (x, 0); + +- /* Avoid useless 0 displacement. */ +- if (disp == const0_rtx && (base || index)) +- disp = NULL_RTX; ++ /* Mark the decl as referenced so that cgraph will ++ output the function. */ ++ if (SYMBOL_REF_DECL (x)) ++ mark_decl_referenced (SYMBOL_REF_DECL (x)); + +- /* Allow arg pointer and stack pointer as index if there is not scaling. */ +- if (base_reg && index_reg && scale == 1 +- && (REGNO (index_reg) == ARG_POINTER_REGNUM +- || REGNO (index_reg) == FRAME_POINTER_REGNUM +- || REGNO (index_reg) == SP_REG)) +- { +- std::swap (base, index); +- std::swap (base_reg, index_reg); +- } +- +- /* Special case: %ebp cannot be encoded as a base without a displacement. +- Similarly %r13. */ +- if (!disp && base_reg +- && (REGNO (base_reg) == ARG_POINTER_REGNUM +- || REGNO (base_reg) == FRAME_POINTER_REGNUM +- || REGNO (base_reg) == BP_REG +- || REGNO (base_reg) == R13_REG)) +- disp = const0_rtx; +- +- /* Special case: on K6, [%esi] makes the instruction vector decoded. +- Avoid this by transforming to [%esi+0]. +- Reload calls address legitimization without cfun defined, so we need +- to test cfun for being non-NULL. */ +- if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun) +- && base_reg && !index_reg && !disp +- && REGNO (base_reg) == SI_REG) +- disp = const0_rtx; +- +- /* Special case: encode reg+reg instead of reg*2. */ +- if (!base && index && scale == 2) +- base = index, base_reg = index_reg, scale = 1; +- +- /* Special case: scaling cannot be encoded without base or displacement. */ +- if (!base && !disp && index && scale != 1) +- disp = const0_rtx; +- +- out->base = base; +- out->index = index; +- out->disp = disp; +- out->scale = scale; +- out->seg = seg; +- +- return retval; +-} +- +-/* Return cost of the memory address x. +- For i386, it is better to use a complex address than let gcc copy +- the address into a reg and make a new pseudo. But not if the address +- requires to two regs - that would mean more pseudos with longer +- lifetimes. 
*/ +-static int +-ix86_address_cost (rtx x, machine_mode, addr_space_t, bool) +-{ +- struct ix86_address parts; +- int cost = 1; +- int ok = ix86_decompose_address (x, &parts); +- +- gcc_assert (ok); ++#if TARGET_MACHO ++ if (MACHOPIC_INDIRECT ++ && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION) ++ name = machopic_indirection_name (x, /*stub_p=*/true); ++#endif ++ assemble_name (file, name); ++ } ++ if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF) ++ && code == 'P' && ! SYMBOL_REF_LOCAL_P (x)) ++ fputs ("@PLT", file); ++ break; + +- if (parts.base && SUBREG_P (parts.base)) +- parts.base = SUBREG_REG (parts.base); +- if (parts.index && SUBREG_P (parts.index)) +- parts.index = SUBREG_REG (parts.index); ++ case LABEL_REF: ++ x = XEXP (x, 0); ++ /* FALLTHRU */ ++ case CODE_LABEL: ++ ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x)); ++ assemble_name (asm_out_file, buf); ++ break; + +- /* Attempt to minimize number of registers in the address by increasing +- address cost for each used register. We don't increase address cost +- for "pic_offset_table_rtx". When a memopt with "pic_offset_table_rtx" +- is not invariant itself it most likely means that base or index is not +- invariant. Therefore only "pic_offset_table_rtx" could be hoisted out, +- which is not profitable for x86. */ +- if (parts.base +- && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER) +- && (current_pass->type == GIMPLE_PASS +- || !pic_offset_table_rtx +- || !REG_P (parts.base) +- || REGNO (pic_offset_table_rtx) != REGNO (parts.base))) +- cost++; ++ case CONST_INT: ++ fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x)); ++ break; + +- if (parts.index +- && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER) +- && (current_pass->type == GIMPLE_PASS +- || !pic_offset_table_rtx +- || !REG_P (parts.index) +- || REGNO (pic_offset_table_rtx) != REGNO (parts.index))) +- cost++; ++ case CONST: ++ /* This used to output parentheses around the expression, ++ but that does not work on the 386 (either ATT or BSD assembler). */ ++ output_pic_addr_const (file, XEXP (x, 0), code); ++ break; + +- /* AMD-K6 don't like addresses with ModR/M set to 00_xxx_100b, +- since it's predecode logic can't detect the length of instructions +- and it degenerates to vector decoded. Increase cost of such +- addresses here. The penalty is minimally 2 cycles. It may be worthwhile +- to split such addresses or even refuse such addresses at all. ++ case CONST_DOUBLE: ++ /* We can't handle floating point constants; ++ TARGET_PRINT_OPERAND must handle them. */ ++ output_operand_lossage ("floating constant misused"); ++ break; + +- Following addressing modes are affected: +- [base+scale*index] +- [scale*index+disp] +- [base+index] ++ case PLUS: ++ /* Some assemblers need integer constants to appear first. */ ++ if (CONST_INT_P (XEXP (x, 0))) ++ { ++ output_pic_addr_const (file, XEXP (x, 0), code); ++ putc ('+', file); ++ output_pic_addr_const (file, XEXP (x, 1), code); ++ } ++ else ++ { ++ gcc_assert (CONST_INT_P (XEXP (x, 1))); ++ output_pic_addr_const (file, XEXP (x, 1), code); ++ putc ('+', file); ++ output_pic_addr_const (file, XEXP (x, 0), code); ++ } ++ break; + +- The first and last case may be avoidable by explicitly coding the zero in +- memory address, but I don't have AMD-K6 machine handy to check this +- theory. */ ++ case MINUS: ++ if (!TARGET_MACHO) ++ putc (ASSEMBLER_DIALECT == ASM_INTEL ? 
'(' : '[', file); ++ output_pic_addr_const (file, XEXP (x, 0), code); ++ putc ('-', file); ++ output_pic_addr_const (file, XEXP (x, 1), code); ++ if (!TARGET_MACHO) ++ putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file); ++ break; + +- if (TARGET_K6 +- && ((!parts.disp && parts.base && parts.index && parts.scale != 1) +- || (parts.disp && !parts.base && parts.index && parts.scale != 1) +- || (!parts.disp && parts.base && parts.index && parts.scale == 1))) +- cost += 10; ++ case UNSPEC: ++ gcc_assert (XVECLEN (x, 0) == 1); ++ output_pic_addr_const (file, XVECEXP (x, 0, 0), code); ++ switch (XINT (x, 1)) ++ { ++ case UNSPEC_GOT: ++ fputs ("@GOT", file); ++ break; ++ case UNSPEC_GOTOFF: ++ fputs ("@GOTOFF", file); ++ break; ++ case UNSPEC_PLTOFF: ++ fputs ("@PLTOFF", file); ++ break; ++ case UNSPEC_PCREL: ++ fputs (ASSEMBLER_DIALECT == ASM_ATT ? ++ "(%rip)" : "[rip]", file); ++ break; ++ case UNSPEC_GOTPCREL: ++ fputs (ASSEMBLER_DIALECT == ASM_ATT ? ++ "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file); ++ break; ++ case UNSPEC_GOTTPOFF: ++ /* FIXME: This might be @TPOFF in Sun ld too. */ ++ fputs ("@gottpoff", file); ++ break; ++ case UNSPEC_TPOFF: ++ fputs ("@tpoff", file); ++ break; ++ case UNSPEC_NTPOFF: ++ if (TARGET_64BIT) ++ fputs ("@tpoff", file); ++ else ++ fputs ("@ntpoff", file); ++ break; ++ case UNSPEC_DTPOFF: ++ fputs ("@dtpoff", file); ++ break; ++ case UNSPEC_GOTNTPOFF: ++ if (TARGET_64BIT) ++ fputs (ASSEMBLER_DIALECT == ASM_ATT ? ++ "@gottpoff(%rip)": "@gottpoff[rip]", file); ++ else ++ fputs ("@gotntpoff", file); ++ break; ++ case UNSPEC_INDNTPOFF: ++ fputs ("@indntpoff", file); ++ break; ++#if TARGET_MACHO ++ case UNSPEC_MACHOPIC_OFFSET: ++ putc ('-', file); ++ machopic_output_function_base_name (file); ++ break; ++#endif ++ default: ++ output_operand_lossage ("invalid UNSPEC as operand"); ++ break; ++ } ++ break; + +- return cost; ++ default: ++ output_operand_lossage ("invalid expression as operand"); ++ } + } +- +-/* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as +- this is used for to form addresses to local data when -fPIC is in +- use. */ + +-static bool +-darwin_local_data_pic (rtx disp) ++/* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL. ++ We need to emit DTP-relative relocations. */ ++ ++static void ATTRIBUTE_UNUSED ++i386_output_dwarf_dtprel (FILE *file, int size, rtx x) + { +- return (GET_CODE (disp) == UNSPEC +- && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET); ++ fputs (ASM_LONG, file); ++ output_addr_const (file, x); ++ fputs ("@dtpoff", file); ++ switch (size) ++ { ++ case 4: ++ break; ++ case 8: ++ fputs (", 0", file); ++ break; ++ default: ++ gcc_unreachable (); ++ } + } + +-/* True if operand X should be loaded from GOT. */ ++/* Return true if X is a representation of the PIC register. This copes ++ with calls from ix86_find_base_term, where the register might have ++ been replaced by a cselib value. 
*/ + +-bool +-ix86_force_load_from_GOT_p (rtx x) ++static bool ++ix86_pic_register_p (rtx x) + { +- return ((TARGET_64BIT || HAVE_AS_IX86_GOT32X) +- && !TARGET_PECOFF && !TARGET_MACHO +- && !flag_pic +- && ix86_cmodel != CM_LARGE +- && GET_CODE (x) == SYMBOL_REF +- && SYMBOL_REF_FUNCTION_P (x) +- && (!flag_plt +- || (SYMBOL_REF_DECL (x) +- && lookup_attribute ("noplt", +- DECL_ATTRIBUTES (SYMBOL_REF_DECL (x))))) +- && !SYMBOL_REF_LOCAL_P (x)); ++ if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x)) ++ return (pic_offset_table_rtx ++ && rtx_equal_for_cselib_p (x, pic_offset_table_rtx)); ++ else if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SET_GOT) ++ return true; ++ else if (!REG_P (x)) ++ return false; ++ else if (pic_offset_table_rtx) ++ { ++ if (REGNO (x) == REGNO (pic_offset_table_rtx)) ++ return true; ++ if (HARD_REGISTER_P (x) ++ && !HARD_REGISTER_P (pic_offset_table_rtx) ++ && ORIGINAL_REGNO (x) == REGNO (pic_offset_table_rtx)) ++ return true; ++ return false; ++ } ++ else ++ return REGNO (x) == PIC_OFFSET_TABLE_REGNUM; + } + +-/* Determine if a given RTX is a valid constant. We already know this +- satisfies CONSTANT_P. */ ++/* Helper function for ix86_delegitimize_address. ++ Attempt to delegitimize TLS local-exec accesses. */ + +-static bool +-ix86_legitimate_constant_p (machine_mode mode, rtx x) ++static rtx ++ix86_delegitimize_tls_address (rtx orig_x) + { +- switch (GET_CODE (x)) +- { +- case CONST: +- x = XEXP (x, 0); +- +- if (GET_CODE (x) == PLUS) +- { +- if (!CONST_INT_P (XEXP (x, 1))) +- return false; +- x = XEXP (x, 0); +- } ++ rtx x = orig_x, unspec; ++ struct ix86_address addr; + +- if (TARGET_MACHO && darwin_local_data_pic (x)) +- return true; ++ if (!TARGET_TLS_DIRECT_SEG_REFS) ++ return orig_x; ++ if (MEM_P (x)) ++ x = XEXP (x, 0); ++ if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode) ++ return orig_x; ++ if (ix86_decompose_address (x, &addr) == 0 ++ || addr.seg != DEFAULT_TLS_SEG_REG ++ || addr.disp == NULL_RTX ++ || GET_CODE (addr.disp) != CONST) ++ return orig_x; ++ unspec = XEXP (addr.disp, 0); ++ if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1))) ++ unspec = XEXP (unspec, 0); ++ if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF) ++ return orig_x; ++ x = XVECEXP (unspec, 0, 0); ++ gcc_assert (GET_CODE (x) == SYMBOL_REF); ++ if (unspec != XEXP (addr.disp, 0)) ++ x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1)); ++ if (addr.index) ++ { ++ rtx idx = addr.index; ++ if (addr.scale != 1) ++ idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale)); ++ x = gen_rtx_PLUS (Pmode, idx, x); ++ } ++ if (addr.base) ++ x = gen_rtx_PLUS (Pmode, addr.base, x); ++ if (MEM_P (orig_x)) ++ x = replace_equiv_address_nv (orig_x, x); ++ return x; ++} + +- /* Only some unspecs are valid as "constants". */ +- if (GET_CODE (x) == UNSPEC) +- switch (XINT (x, 1)) +- { +- case UNSPEC_GOT: +- case UNSPEC_GOTOFF: +- case UNSPEC_PLTOFF: +- return TARGET_64BIT; +- case UNSPEC_TPOFF: +- case UNSPEC_NTPOFF: +- x = XVECEXP (x, 0, 0); +- return (GET_CODE (x) == SYMBOL_REF +- && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC); +- case UNSPEC_DTPOFF: +- x = XVECEXP (x, 0, 0); +- return (GET_CODE (x) == SYMBOL_REF +- && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC); +- default: +- return false; +- } ++/* In the name of slightly smaller debug output, and to cater to ++ general assembler lossage, recognize PIC+GOTOFF and turn it back ++ into a direct symbol reference. + +- /* We must have drilled down to a symbol. 
*/ +- if (GET_CODE (x) == LABEL_REF) +- return true; +- if (GET_CODE (x) != SYMBOL_REF) +- return false; +- /* FALLTHRU */ ++ On Darwin, this is necessary to avoid a crash, because Darwin ++ has a different PIC label for each routine but the DWARF debugging ++ information is not associated with any particular routine, so it's ++ necessary to remove references to the PIC label from RTL stored by ++ the DWARF output code. + +- case SYMBOL_REF: +- /* TLS symbols are never valid. */ +- if (SYMBOL_REF_TLS_MODEL (x)) +- return false; ++ This helper is used in the normal ix86_delegitimize_address ++ entrypoint (e.g. used in the target delegitimization hook) and ++ in ix86_find_base_term. As compile time memory optimization, we ++ avoid allocating rtxes that will not change anything on the outcome ++ of the callers (find_base_value and find_base_term). */ + +- /* DLLIMPORT symbols are never valid. */ +- if (TARGET_DLLIMPORT_DECL_ATTRIBUTES +- && SYMBOL_REF_DLLIMPORT_P (x)) +- return false; ++static inline rtx ++ix86_delegitimize_address_1 (rtx x, bool base_term_p) ++{ ++ rtx orig_x = delegitimize_mem_from_attrs (x); ++ /* addend is NULL or some rtx if x is something+GOTOFF where ++ something doesn't include the PIC register. */ ++ rtx addend = NULL_RTX; ++ /* reg_addend is NULL or a multiple of some register. */ ++ rtx reg_addend = NULL_RTX; ++ /* const_addend is NULL or a const_int. */ ++ rtx const_addend = NULL_RTX; ++ /* This is the result, or NULL. */ ++ rtx result = NULL_RTX; + +-#if TARGET_MACHO +- /* mdynamic-no-pic */ +- if (MACHO_DYNAMIC_NO_PIC_P) +- return machopic_symbol_defined_p (x); +-#endif ++ x = orig_x; + +- /* External function address should be loaded +- via the GOT slot to avoid PLT. */ +- if (ix86_force_load_from_GOT_p (x)) +- return false; ++ if (MEM_P (x)) ++ x = XEXP (x, 0); + +- break; ++ if (TARGET_64BIT) ++ { ++ if (GET_CODE (x) == CONST ++ && GET_CODE (XEXP (x, 0)) == PLUS ++ && GET_MODE (XEXP (x, 0)) == Pmode ++ && CONST_INT_P (XEXP (XEXP (x, 0), 1)) ++ && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC ++ && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL) ++ { ++ /* find_base_{value,term} only care about MEMs with arg_pointer_rtx ++ base. A CONST can't be arg_pointer_rtx based. */ ++ if (base_term_p && MEM_P (orig_x)) ++ return orig_x; ++ rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0); ++ x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2); ++ if (MEM_P (orig_x)) ++ x = replace_equiv_address_nv (orig_x, x); ++ return x; ++ } + +- CASE_CONST_SCALAR_INT: +- switch (mode) ++ if (GET_CODE (x) == CONST ++ && GET_CODE (XEXP (x, 0)) == UNSPEC ++ && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL ++ || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL) ++ && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)) + { +- case E_TImode: +- if (TARGET_64BIT) +- return true; +- /* FALLTHRU */ +- case E_OImode: +- case E_XImode: +- if (!standard_sse_constant_p (x, mode)) +- return false; +- default: +- break; ++ x = XVECEXP (XEXP (x, 0), 0, 0); ++ if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x)) ++ { ++ x = lowpart_subreg (GET_MODE (orig_x), x, GET_MODE (x)); ++ if (x == NULL_RTX) ++ return orig_x; ++ } ++ return x; + } +- break; + +- case CONST_VECTOR: +- if (!standard_sse_constant_p (x, mode)) +- return false; ++ if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC) ++ return ix86_delegitimize_tls_address (orig_x); + +- default: +- break; ++ /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic ++ and -mcmodel=medium -fpic. 
*/ + } + +- /* Otherwise we handle everything else in the move patterns. */ +- return true; +-} +- +-/* Determine if it's legal to put X into the constant pool. This +- is not possible for the address of thread-local symbols, which +- is checked above. */ ++ if (GET_CODE (x) != PLUS ++ || GET_CODE (XEXP (x, 1)) != CONST) ++ return ix86_delegitimize_tls_address (orig_x); + +-static bool +-ix86_cannot_force_const_mem (machine_mode mode, rtx x) +-{ +- /* We can put any immediate constant in memory. */ +- switch (GET_CODE (x)) ++ if (ix86_pic_register_p (XEXP (x, 0))) ++ /* %ebx + GOT/GOTOFF */ ++ ; ++ else if (GET_CODE (XEXP (x, 0)) == PLUS) + { +- CASE_CONST_ANY: +- return false; ++ /* %ebx + %reg * scale + GOT/GOTOFF */ ++ reg_addend = XEXP (x, 0); ++ if (ix86_pic_register_p (XEXP (reg_addend, 0))) ++ reg_addend = XEXP (reg_addend, 1); ++ else if (ix86_pic_register_p (XEXP (reg_addend, 1))) ++ reg_addend = XEXP (reg_addend, 0); ++ else ++ { ++ reg_addend = NULL_RTX; ++ addend = XEXP (x, 0); ++ } ++ } ++ else ++ addend = XEXP (x, 0); + +- default: +- break; ++ x = XEXP (XEXP (x, 1), 0); ++ if (GET_CODE (x) == PLUS ++ && CONST_INT_P (XEXP (x, 1))) ++ { ++ const_addend = XEXP (x, 1); ++ x = XEXP (x, 0); + } + +- return !ix86_legitimate_constant_p (mode, x); +-} ++ if (GET_CODE (x) == UNSPEC ++ && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend) ++ || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x)) ++ || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC ++ && !MEM_P (orig_x) && !addend))) ++ result = XVECEXP (x, 0, 0); + +-/* Nonzero if the symbol is marked as dllimport, or as stub-variable, +- otherwise zero. */ ++ if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x) ++ && !MEM_P (orig_x)) ++ result = XVECEXP (x, 0, 0); + +-static bool +-is_imported_p (rtx x) +-{ +- if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES +- || GET_CODE (x) != SYMBOL_REF) +- return false; ++ if (! result) ++ return ix86_delegitimize_tls_address (orig_x); + +- return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x); ++ /* For (PLUS something CONST_INT) both find_base_{value,term} just ++ recurse on the first operand. */ ++ if (const_addend && !base_term_p) ++ result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend)); ++ if (reg_addend) ++ result = gen_rtx_PLUS (Pmode, reg_addend, result); ++ if (addend) ++ { ++ /* If the rest of original X doesn't involve the PIC register, add ++ addend and subtract pic_offset_table_rtx. This can happen e.g. ++ for code like: ++ leal (%ebx, %ecx, 4), %ecx ++ ... ++ movl foo@GOTOFF(%ecx), %edx ++ in which case we return (%ecx - %ebx) + foo ++ or (%ecx - _GLOBAL_OFFSET_TABLE_) + foo if pseudo_pic_reg ++ and reload has completed. Don't do the latter for debug, ++ as _GLOBAL_OFFSET_TABLE_ can't be expressed in the assembly. 
*/ ++ if (pic_offset_table_rtx ++ && (!reload_completed || !ix86_use_pseudo_pic_reg ())) ++ result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend), ++ pic_offset_table_rtx), ++ result); ++ else if (base_term_p ++ && pic_offset_table_rtx ++ && !TARGET_MACHO ++ && !TARGET_VXWORKS_RTP) ++ { ++ rtx tmp = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME); ++ tmp = gen_rtx_MINUS (Pmode, copy_rtx (addend), tmp); ++ result = gen_rtx_PLUS (Pmode, tmp, result); ++ } ++ else ++ return orig_x; ++ } ++ if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x)) ++ { ++ result = lowpart_subreg (GET_MODE (orig_x), result, Pmode); ++ if (result == NULL_RTX) ++ return orig_x; ++ } ++ return result; + } + ++/* The normal instantiation of the above template. */ + +-/* Nonzero if the constant value X is a legitimate general operand +- when generating PIC code. It is given that flag_pic is on and +- that X satisfies CONSTANT_P. */ +- +-bool +-legitimate_pic_operand_p (rtx x) ++static rtx ++ix86_delegitimize_address (rtx x) + { +- rtx inner; +- +- switch (GET_CODE (x)) +- { +- case CONST: +- inner = XEXP (x, 0); +- if (GET_CODE (inner) == PLUS +- && CONST_INT_P (XEXP (inner, 1))) +- inner = XEXP (inner, 0); +- +- /* Only some unspecs are valid as "constants". */ +- if (GET_CODE (inner) == UNSPEC) +- switch (XINT (inner, 1)) +- { +- case UNSPEC_GOT: +- case UNSPEC_GOTOFF: +- case UNSPEC_PLTOFF: +- return TARGET_64BIT; +- case UNSPEC_TPOFF: +- x = XVECEXP (inner, 0, 0); +- return (GET_CODE (x) == SYMBOL_REF +- && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC); +- case UNSPEC_MACHOPIC_OFFSET: +- return legitimate_pic_address_disp_p (x); +- default: +- return false; +- } +- /* FALLTHRU */ +- +- case SYMBOL_REF: +- case LABEL_REF: +- return legitimate_pic_address_disp_p (x); +- +- default: +- return true; +- } ++ return ix86_delegitimize_address_1 (x, false); + } + +-/* Determine if a given CONST RTX is a valid memory displacement +- in PIC mode. */ ++/* If X is a machine specific address (i.e. a symbol or label being ++ referenced as a displacement from the GOT implemented using an ++ UNSPEC), then return the base term. Otherwise return X. */ + +-bool +-legitimate_pic_address_disp_p (rtx disp) ++rtx ++ix86_find_base_term (rtx x) + { +- bool saw_plus; ++ rtx term; + +- /* In 64bit mode we can allow direct addresses of symbols and labels +- when they are not dynamic symbols. 
*/ + if (TARGET_64BIT) + { +- rtx op0 = disp, op1; +- +- switch (GET_CODE (disp)) +- { +- case LABEL_REF: +- return true; ++ if (GET_CODE (x) != CONST) ++ return x; ++ term = XEXP (x, 0); ++ if (GET_CODE (term) == PLUS ++ && CONST_INT_P (XEXP (term, 1))) ++ term = XEXP (term, 0); ++ if (GET_CODE (term) != UNSPEC ++ || (XINT (term, 1) != UNSPEC_GOTPCREL ++ && XINT (term, 1) != UNSPEC_PCREL)) ++ return x; + +- case CONST: +- if (GET_CODE (XEXP (disp, 0)) != PLUS) +- break; +- op0 = XEXP (XEXP (disp, 0), 0); +- op1 = XEXP (XEXP (disp, 0), 1); +- if (!CONST_INT_P (op1)) +- break; +- if (GET_CODE (op0) == UNSPEC +- && (XINT (op0, 1) == UNSPEC_DTPOFF +- || XINT (op0, 1) == UNSPEC_NTPOFF) +- && trunc_int_for_mode (INTVAL (op1), SImode) == INTVAL (op1)) +- return true; +- if (INTVAL (op1) >= 16*1024*1024 +- || INTVAL (op1) < -16*1024*1024) +- break; +- if (GET_CODE (op0) == LABEL_REF) +- return true; +- if (GET_CODE (op0) == CONST +- && GET_CODE (XEXP (op0, 0)) == UNSPEC +- && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL) +- return true; +- if (GET_CODE (op0) == UNSPEC +- && XINT (op0, 1) == UNSPEC_PCREL) +- return true; +- if (GET_CODE (op0) != SYMBOL_REF) +- break; +- /* FALLTHRU */ ++ return XVECEXP (term, 0, 0); ++ } + +- case SYMBOL_REF: +- /* TLS references should always be enclosed in UNSPEC. +- The dllimported symbol needs always to be resolved. */ +- if (SYMBOL_REF_TLS_MODEL (op0) +- || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0))) +- return false; ++ return ix86_delegitimize_address_1 (x, true); ++} + +- if (TARGET_PECOFF) +- { +- if (is_imported_p (op0)) +- return true; ++/* Return true if X shouldn't be emitted into the debug info. ++ Disallow UNSPECs other than @gotoff - we can't emit _GLOBAL_OFFSET_TABLE_ ++ symbol easily into the .debug_info section, so we need not to ++ delegitimize, but instead assemble as @gotoff. ++ Disallow _GLOBAL_OFFSET_TABLE_ SYMBOL_REF - the assembler magically ++ assembles that as _GLOBAL_OFFSET_TABLE_-. expression. */ + +- if (SYMBOL_REF_FAR_ADDR_P (op0) +- || !SYMBOL_REF_LOCAL_P (op0)) +- break; ++static bool ++ix86_const_not_ok_for_debug_p (rtx x) ++{ ++ if (GET_CODE (x) == UNSPEC && XINT (x, 1) != UNSPEC_GOTOFF) ++ return true; + +- /* Function-symbols need to be resolved only for +- large-model. +- For the small-model we don't need to resolve anything +- here. */ +- if ((ix86_cmodel != CM_LARGE_PIC +- && SYMBOL_REF_FUNCTION_P (op0)) +- || ix86_cmodel == CM_SMALL_PIC) +- return true; +- /* Non-external symbols don't need to be resolved for +- large, and medium-model. */ +- if ((ix86_cmodel == CM_LARGE_PIC +- || ix86_cmodel == CM_MEDIUM_PIC) +- && !SYMBOL_REF_EXTERNAL_P (op0)) +- return true; +- } +- else if (!SYMBOL_REF_FAR_ADDR_P (op0) +- && (SYMBOL_REF_LOCAL_P (op0) +- || (HAVE_LD_PIE_COPYRELOC +- && flag_pie +- && !SYMBOL_REF_WEAK (op0) +- && !SYMBOL_REF_FUNCTION_P (op0))) +- && ix86_cmodel != CM_LARGE_PIC) +- return true; +- break; ++ if (SYMBOL_REF_P (x) && strcmp (XSTR (x, 0), GOT_SYMBOL_NAME) == 0) ++ return true; + +- default: +- break; +- } +- } +- if (GET_CODE (disp) != CONST) +- return false; +- disp = XEXP (disp, 0); ++ return false; ++} ++ ++static void ++put_condition_code (enum rtx_code code, machine_mode mode, bool reverse, ++ bool fp, FILE *file) ++{ ++ const char *suffix; + +- if (TARGET_64BIT) ++ if (mode == CCFPmode) + { +- /* We are unsafe to allow PLUS expressions. This limit allowed distance +- of GOT tables. We should not need these anyway. 
*/ +- if (GET_CODE (disp) != UNSPEC +- || (XINT (disp, 1) != UNSPEC_GOTPCREL +- && XINT (disp, 1) != UNSPEC_GOTOFF +- && XINT (disp, 1) != UNSPEC_PCREL +- && XINT (disp, 1) != UNSPEC_PLTOFF)) +- return false; +- +- if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF +- && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF) +- return false; +- return true; ++ code = ix86_fp_compare_code_to_integer (code); ++ mode = CCmode; + } ++ if (reverse) ++ code = reverse_condition (code); + +- saw_plus = false; +- if (GET_CODE (disp) == PLUS) ++ switch (code) + { +- if (!CONST_INT_P (XEXP (disp, 1))) +- return false; +- disp = XEXP (disp, 0); +- saw_plus = true; +- } +- +- if (TARGET_MACHO && darwin_local_data_pic (disp)) +- return true; ++ case EQ: ++ gcc_assert (mode != CCGZmode); ++ switch (mode) ++ { ++ case E_CCAmode: ++ suffix = "a"; ++ break; ++ case E_CCCmode: ++ suffix = "c"; ++ break; ++ case E_CCOmode: ++ suffix = "o"; ++ break; ++ case E_CCPmode: ++ suffix = "p"; ++ break; ++ case E_CCSmode: ++ suffix = "s"; ++ break; ++ default: ++ suffix = "e"; ++ break; ++ } ++ break; ++ case NE: ++ gcc_assert (mode != CCGZmode); ++ switch (mode) ++ { ++ case E_CCAmode: ++ suffix = "na"; ++ break; ++ case E_CCCmode: ++ suffix = "nc"; ++ break; ++ case E_CCOmode: ++ suffix = "no"; ++ break; ++ case E_CCPmode: ++ suffix = "np"; ++ break; ++ case E_CCSmode: ++ suffix = "ns"; ++ break; ++ default: ++ suffix = "ne"; ++ break; ++ } ++ break; ++ case GT: ++ gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode); ++ suffix = "g"; ++ break; ++ case GTU: ++ /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers. ++ Those same assemblers have the same but opposite lossage on cmov. */ ++ if (mode == CCmode) ++ suffix = fp ? "nbe" : "a"; ++ else ++ gcc_unreachable (); ++ break; ++ case LT: ++ switch (mode) ++ { ++ case E_CCNOmode: ++ case E_CCGOCmode: ++ suffix = "s"; ++ break; + +- if (GET_CODE (disp) != UNSPEC) +- return false; ++ case E_CCmode: ++ case E_CCGCmode: ++ case E_CCGZmode: ++ suffix = "l"; ++ break; + +- switch (XINT (disp, 1)) +- { +- case UNSPEC_GOT: +- if (saw_plus) +- return false; +- /* We need to check for both symbols and labels because VxWorks loads +- text labels with @GOT rather than @GOTOFF. See gotoff_operand for +- details. */ +- return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF +- || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF); +- case UNSPEC_GOTOFF: +- /* Refuse GOTOFF in 64bit mode since it is always 64bit when used. +- While ABI specify also 32bit relocation but we don't produce it in +- small PIC model at all. */ +- if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF +- || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF) +- && !TARGET_64BIT) +- return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode); +- return false; +- case UNSPEC_GOTTPOFF: +- case UNSPEC_GOTNTPOFF: +- case UNSPEC_INDNTPOFF: +- if (saw_plus) +- return false; +- disp = XVECEXP (disp, 0, 0); +- return (GET_CODE (disp) == SYMBOL_REF +- && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC); +- case UNSPEC_NTPOFF: +- disp = XVECEXP (disp, 0, 0); +- return (GET_CODE (disp) == SYMBOL_REF +- && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC); +- case UNSPEC_DTPOFF: +- disp = XVECEXP (disp, 0, 0); +- return (GET_CODE (disp) == SYMBOL_REF +- && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC); +- } ++ default: ++ gcc_unreachable (); ++ } ++ break; ++ case LTU: ++ if (mode == CCmode || mode == CCGZmode) ++ suffix = "b"; ++ else if (mode == CCCmode) ++ suffix = fp ? 
"b" : "c"; ++ else ++ gcc_unreachable (); ++ break; ++ case GE: ++ switch (mode) ++ { ++ case E_CCNOmode: ++ case E_CCGOCmode: ++ suffix = "ns"; ++ break; + +- return false; ++ case E_CCmode: ++ case E_CCGCmode: ++ case E_CCGZmode: ++ suffix = "ge"; ++ break; ++ ++ default: ++ gcc_unreachable (); ++ } ++ break; ++ case GEU: ++ if (mode == CCmode || mode == CCGZmode) ++ suffix = "nb"; ++ else if (mode == CCCmode) ++ suffix = fp ? "nb" : "nc"; ++ else ++ gcc_unreachable (); ++ break; ++ case LE: ++ gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode); ++ suffix = "le"; ++ break; ++ case LEU: ++ if (mode == CCmode) ++ suffix = "be"; ++ else ++ gcc_unreachable (); ++ break; ++ case UNORDERED: ++ suffix = fp ? "u" : "p"; ++ break; ++ case ORDERED: ++ suffix = fp ? "nu" : "np"; ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ fputs (suffix, file); + } + +-/* Determine if op is suitable RTX for an address register. +- Return naked register if a register or a register subreg is +- found, otherwise return NULL_RTX. */ ++/* Print the name of register X to FILE based on its machine mode and number. ++ If CODE is 'w', pretend the mode is HImode. ++ If CODE is 'b', pretend the mode is QImode. ++ If CODE is 'k', pretend the mode is SImode. ++ If CODE is 'q', pretend the mode is DImode. ++ If CODE is 'x', pretend the mode is V4SFmode. ++ If CODE is 't', pretend the mode is V8SFmode. ++ If CODE is 'g', pretend the mode is V16SFmode. ++ If CODE is 'h', pretend the reg is the 'high' byte register. ++ If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op. ++ If CODE is 'd', duplicate the operand for AVX instruction. ++ If CODE is 'V', print naked full integer register name without %. ++ */ + +-static rtx +-ix86_validate_address_register (rtx op) ++void ++print_reg (rtx x, int code, FILE *file) + { +- machine_mode mode = GET_MODE (op); ++ const char *reg; ++ int msize; ++ unsigned int regno; ++ bool duplicated; + +- /* Only SImode or DImode registers can form the address. */ +- if (mode != SImode && mode != DImode) +- return NULL_RTX; ++ if (ASSEMBLER_DIALECT == ASM_ATT && code != 'V') ++ putc ('%', file); + +- if (REG_P (op)) +- return op; +- else if (SUBREG_P (op)) ++ if (x == pc_rtx) + { +- rtx reg = SUBREG_REG (op); ++ gcc_assert (TARGET_64BIT); ++ fputs ("rip", file); ++ return; ++ } + +- if (!REG_P (reg)) +- return NULL_RTX; ++ if (code == 'y' && STACK_TOP_P (x)) ++ { ++ fputs ("st(0)", file); ++ return; ++ } + +- mode = GET_MODE (reg); ++ if (code == 'w') ++ msize = 2; ++ else if (code == 'b') ++ msize = 1; ++ else if (code == 'k') ++ msize = 4; ++ else if (code == 'q') ++ msize = 8; ++ else if (code == 'h') ++ msize = 0; ++ else if (code == 'x') ++ msize = 16; ++ else if (code == 't') ++ msize = 32; ++ else if (code == 'g') ++ msize = 64; ++ else ++ msize = GET_MODE_SIZE (GET_MODE (x)); + +- /* Don't allow SUBREGs that span more than a word. It can +- lead to spill failures when the register is one word out +- of a two word structure. */ +- if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) +- return NULL_RTX; ++ regno = REGNO (x); + +- /* Allow only SUBREGs of non-eliminable hard registers. 
*/ +- if (register_no_elim_operand (reg, mode)) +- return reg; ++ if (regno == ARG_POINTER_REGNUM ++ || regno == FRAME_POINTER_REGNUM ++ || regno == FPSR_REG) ++ { ++ output_operand_lossage ++ ("invalid use of register '%s'", reg_names[regno]); ++ return; ++ } ++ else if (regno == FLAGS_REG) ++ { ++ output_operand_lossage ("invalid use of asm flag output"); ++ return; + } + +- /* Op is not a register. */ +- return NULL_RTX; +-} +- +-/* Recognizes RTL expressions that are valid memory addresses for an +- instruction. The MODE argument is the machine mode for the MEM +- expression that wants to use this address. +- +- It only recognizes address in canonical form. LEGITIMIZE_ADDRESS should +- convert common non-canonical forms to canonical form so that they will +- be recognized. */ +- +-static bool +-ix86_legitimate_address_p (machine_mode, rtx addr, bool strict) +-{ +- struct ix86_address parts; +- rtx base, index, disp; +- HOST_WIDE_INT scale; +- addr_space_t seg; +- +- if (ix86_decompose_address (addr, &parts) <= 0) +- /* Decomposition failed. */ +- return false; ++ if (code == 'V') ++ { ++ if (GENERAL_REGNO_P (regno)) ++ msize = GET_MODE_SIZE (word_mode); ++ else ++ error ("% modifier on non-integer register"); ++ } + +- base = parts.base; +- index = parts.index; +- disp = parts.disp; +- scale = parts.scale; +- seg = parts.seg; ++ duplicated = code == 'd' && TARGET_AVX; + +- /* Validate base register. */ +- if (base) ++ switch (msize) + { +- rtx reg = ix86_validate_address_register (base); ++ case 16: ++ case 12: ++ case 8: ++ if (GENERAL_REGNO_P (regno) && msize > GET_MODE_SIZE (word_mode)) ++ warning (0, "unsupported size for integer register"); ++ /* FALLTHRU */ ++ case 4: ++ if (LEGACY_INT_REGNO_P (regno)) ++ putc (msize > 4 && TARGET_64BIT ? 'r' : 'e', file); ++ /* FALLTHRU */ ++ case 2: ++ normal: ++ reg = hi_reg_name[regno]; ++ break; ++ case 1: ++ if (regno >= ARRAY_SIZE (qi_reg_name)) ++ goto normal; ++ if (!ANY_QI_REGNO_P (regno)) ++ error ("unsupported size for integer register"); ++ reg = qi_reg_name[regno]; ++ break; ++ case 0: ++ if (regno >= ARRAY_SIZE (qi_high_reg_name)) ++ goto normal; ++ reg = qi_high_reg_name[regno]; ++ break; ++ case 32: ++ case 64: ++ if (SSE_REGNO_P (regno)) ++ { ++ gcc_assert (!duplicated); ++ putc (msize == 32 ? 'y' : 'z', file); ++ reg = hi_reg_name[regno] + 1; ++ break; ++ } ++ goto normal; ++ default: ++ gcc_unreachable (); ++ } + +- if (reg == NULL_RTX) +- return false; ++ fputs (reg, file); + +- if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg)) +- || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg))) +- /* Base is not valid. */ +- return false; ++ /* Irritatingly, AMD extended registers use ++ different naming convention: "r%d[bwd]" */ ++ if (REX_INT_REGNO_P (regno)) ++ { ++ gcc_assert (TARGET_64BIT); ++ switch (msize) ++ { ++ case 0: ++ error ("extended registers have no high halves"); ++ break; ++ case 1: ++ putc ('b', file); ++ break; ++ case 2: ++ putc ('w', file); ++ break; ++ case 4: ++ putc ('d', file); ++ break; ++ case 8: ++ /* no suffix */ ++ break; ++ default: ++ error ("unsupported operand size for extended register"); ++ break; ++ } ++ return; + } + +- /* Validate index register. */ +- if (index) ++ if (duplicated) + { +- rtx reg = ix86_validate_address_register (index); +- +- if (reg == NULL_RTX) +- return false; +- +- if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg)) +- || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg))) +- /* Index is not valid. 
*/ +- return false; ++ if (ASSEMBLER_DIALECT == ASM_ATT) ++ fprintf (file, ", %%%s", reg); ++ else ++ fprintf (file, ", %s", reg); + } ++} + +- /* Index and base should have the same mode. */ +- if (base && index +- && GET_MODE (base) != GET_MODE (index)) +- return false; +- +- /* Address override works only on the (%reg) part of %fs:(%reg). */ +- if (seg != ADDR_SPACE_GENERIC +- && ((base && GET_MODE (base) != word_mode) +- || (index && GET_MODE (index) != word_mode))) +- return false; ++/* Meaning of CODE: ++ L,W,B,Q,S,T -- print the opcode suffix for specified size of operand. ++ C -- print opcode suffix for set/cmov insn. ++ c -- like C, but print reversed condition ++ F,f -- likewise, but for floating-point. ++ O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.", ++ otherwise nothing ++ R -- print embedded rounding and sae. ++ r -- print only sae. ++ z -- print the opcode suffix for the size of the current operand. ++ Z -- likewise, with special suffixes for x87 instructions. ++ * -- print a star (in certain assembler syntax) ++ A -- print an absolute memory reference. ++ E -- print address with DImode register names if TARGET_64BIT. ++ w -- print the operand as if it's a "word" (HImode) even if it isn't. ++ s -- print a shift double count, followed by the assemblers argument ++ delimiter. ++ b -- print the QImode name of the register for the indicated operand. ++ %b0 would print %al if operands[0] is reg 0. ++ w -- likewise, print the HImode name of the register. ++ k -- likewise, print the SImode name of the register. ++ q -- likewise, print the DImode name of the register. ++ x -- likewise, print the V4SFmode name of the register. ++ t -- likewise, print the V8SFmode name of the register. ++ g -- likewise, print the V16SFmode name of the register. ++ h -- print the QImode name for a "high" register, either ah, bh, ch or dh. ++ y -- print "st(0)" instead of "st" as a register. ++ d -- print duplicated register operand for AVX instruction. ++ D -- print condition for SSE cmp instruction. ++ P -- if PIC, print an @PLT suffix. ++ p -- print raw symbol name. ++ X -- don't print any sort of PIC '@' suffix for a symbol. ++ & -- print some in-use local-dynamic symbol name. ++ H -- print a memory address offset by 8; used for sse high-parts ++ Y -- print condition for XOP pcom* instruction. ++ V -- print naked full integer register name without %. ++ + -- print a branch hint as 'cs' or 'ds' prefix ++ ; -- print a semicolon (after prefixes due to bug in older gas). ++ ~ -- print "i" if TARGET_AVX2, "f" otherwise. ++ ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode ++ M -- print addr32 prefix for TARGET_X32 with VSIB address. ++ ! -- print NOTRACK prefix for jxx/call/ret instructions if required. ++ */ + +- /* Validate scale factor. */ +- if (scale != 1) ++void ++ix86_print_operand (FILE *file, rtx x, int code) ++{ ++ if (code) + { +- if (!index) +- /* Scale without index. */ +- return false; ++ switch (code) ++ { ++ case 'A': ++ switch (ASSEMBLER_DIALECT) ++ { ++ case ASM_ATT: ++ putc ('*', file); ++ break; + +- if (scale != 2 && scale != 4 && scale != 8) +- /* Scale is not a valid multiplier. */ +- return false; +- } ++ case ASM_INTEL: ++ /* Intel syntax. For absolute addresses, registers should not ++ be surrounded by braces. */ ++ if (!REG_P (x)) ++ { ++ putc ('[', file); ++ ix86_print_operand (file, x, 0); ++ putc (']', file); ++ return; ++ } ++ break; + +- /* Validate displacement. 
*/ +- if (disp) +- { +- if (GET_CODE (disp) == CONST +- && GET_CODE (XEXP (disp, 0)) == UNSPEC +- && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET) +- switch (XINT (XEXP (disp, 0), 1)) +- { +- /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit +- when used. While ABI specify also 32bit relocations, we +- don't produce them at all and use IP relative instead. +- Allow GOT in 32bit mode for both PIC and non-PIC if symbol +- should be loaded via GOT. */ +- case UNSPEC_GOT: +- if (!TARGET_64BIT +- && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0))) +- goto is_legitimate_pic; +- /* FALLTHRU */ +- case UNSPEC_GOTOFF: +- gcc_assert (flag_pic); +- if (!TARGET_64BIT) +- goto is_legitimate_pic; ++ default: ++ gcc_unreachable (); ++ } + +- /* 64bit address unspec. */ +- return false; ++ ix86_print_operand (file, x, 0); ++ return; + +- case UNSPEC_GOTPCREL: +- if (ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0))) +- goto is_legitimate_pic; +- /* FALLTHRU */ +- case UNSPEC_PCREL: +- gcc_assert (flag_pic); +- goto is_legitimate_pic; ++ case 'E': ++ /* Wrap address in an UNSPEC to declare special handling. */ ++ if (TARGET_64BIT) ++ x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR); + +- case UNSPEC_GOTTPOFF: +- case UNSPEC_GOTNTPOFF: +- case UNSPEC_INDNTPOFF: +- case UNSPEC_NTPOFF: +- case UNSPEC_DTPOFF: +- break; +- +- default: +- /* Invalid address unspec. */ +- return false; +- } ++ output_address (VOIDmode, x); ++ return; + +- else if (SYMBOLIC_CONST (disp) +- && (flag_pic +- || (TARGET_MACHO +-#if TARGET_MACHO +- && MACHOPIC_INDIRECT +- && !machopic_operand_p (disp) +-#endif +- ))) +- { ++ case 'L': ++ if (ASSEMBLER_DIALECT == ASM_ATT) ++ putc ('l', file); ++ return; + +- is_legitimate_pic: +- if (TARGET_64BIT && (index || base)) +- { +- /* foo@dtpoff(%rX) is ok. */ +- if (GET_CODE (disp) != CONST +- || GET_CODE (XEXP (disp, 0)) != PLUS +- || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC +- || !CONST_INT_P (XEXP (XEXP (disp, 0), 1)) +- || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF +- && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF)) +- /* Non-constant pic memory reference. */ +- return false; +- } +- else if ((!TARGET_MACHO || flag_pic) +- && ! legitimate_pic_address_disp_p (disp)) +- /* Displacement is an invalid pic construct. */ +- return false; +-#if TARGET_MACHO +- else if (MACHO_DYNAMIC_NO_PIC_P +- && !ix86_legitimate_constant_p (Pmode, disp)) +- /* displacment must be referenced via non_lazy_pointer */ +- return false; +-#endif ++ case 'W': ++ if (ASSEMBLER_DIALECT == ASM_ATT) ++ putc ('w', file); ++ return; + +- /* This code used to verify that a symbolic pic displacement +- includes the pic_offset_table_rtx register. ++ case 'B': ++ if (ASSEMBLER_DIALECT == ASM_ATT) ++ putc ('b', file); ++ return; + +- While this is good idea, unfortunately these constructs may +- be created by "adds using lea" optimization for incorrect +- code like: ++ case 'Q': ++ if (ASSEMBLER_DIALECT == ASM_ATT) ++ putc ('l', file); ++ return; + +- int a; +- int foo(int i) +- { +- return *(&a+i); +- } ++ case 'S': ++ if (ASSEMBLER_DIALECT == ASM_ATT) ++ putc ('s', file); ++ return; + +- This code is nonsensical, but results in addressing +- GOT table with pic_offset_table_rtx base. We can't +- just refuse it easily, since it gets matched by +- "addsi3" pattern, that later gets split to lea in the +- case output register differs from input. 
While this +- can be handled by separate addsi pattern for this case +- that never results in lea, this seems to be easier and +- correct fix for crash to disable this test. */ +- } +- else if (GET_CODE (disp) != LABEL_REF +- && !CONST_INT_P (disp) +- && (GET_CODE (disp) != CONST +- || !ix86_legitimate_constant_p (Pmode, disp)) +- && (GET_CODE (disp) != SYMBOL_REF +- || !ix86_legitimate_constant_p (Pmode, disp))) +- /* Displacement is not constant. */ +- return false; +- else if (TARGET_64BIT +- && !x86_64_immediate_operand (disp, VOIDmode)) +- /* Displacement is out of range. */ +- return false; +- /* In x32 mode, constant addresses are sign extended to 64bit, so +- we have to prevent addresses from 0x80000000 to 0xffffffff. */ +- else if (TARGET_X32 && !(index || base) +- && CONST_INT_P (disp) +- && val_signbit_known_set_p (SImode, INTVAL (disp))) +- return false; +- } ++ case 'T': ++ if (ASSEMBLER_DIALECT == ASM_ATT) ++ putc ('t', file); ++ return; + +- /* Everything looks valid. */ +- return true; +-} ++ case 'O': ++#ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX ++ if (ASSEMBLER_DIALECT != ASM_ATT) ++ return; + +-/* Determine if a given RTX is a valid constant address. */ ++ switch (GET_MODE_SIZE (GET_MODE (x))) ++ { ++ case 2: ++ putc ('w', file); ++ break; ++ ++ case 4: ++ putc ('l', file); ++ break; + +-bool +-constant_address_p (rtx x) +-{ +- return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1); +-} +- +-/* Return a unique alias set for the GOT. */ ++ case 8: ++ putc ('q', file); ++ break; + +-static alias_set_type +-ix86_GOT_alias_set (void) +-{ +- static alias_set_type set = -1; +- if (set == -1) +- set = new_alias_set (); +- return set; +-} ++ default: ++ output_operand_lossage ("invalid operand size for operand " ++ "code 'O'"); ++ return; ++ } + +-/* Return a legitimate reference for ORIG (an address) using the +- register REG. If REG is 0, a new pseudo is generated. ++ putc ('.', file); ++#endif ++ return; + +- There are two types of references that must be handled: ++ case 'z': ++ if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT) ++ { ++ /* Opcodes don't get size suffixes if using Intel opcodes. */ ++ if (ASSEMBLER_DIALECT == ASM_INTEL) ++ return; + +- 1. Global data references must load the address from the GOT, via +- the PIC reg. An insn is emitted to do this load, and the reg is +- returned. ++ switch (GET_MODE_SIZE (GET_MODE (x))) ++ { ++ case 1: ++ putc ('b', file); ++ return; + +- 2. Static data references, constant pool addresses, and code labels +- compute the address as an offset from the GOT, whose base is in +- the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to +- differentiate them from global data objects. The returned +- address is the PIC reg + an unspec constant. ++ case 2: ++ putc ('w', file); ++ return; + +- TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC +- reg also appears in the address. */ ++ case 4: ++ putc ('l', file); ++ return; + +-static rtx +-legitimize_pic_address (rtx orig, rtx reg) +-{ +- rtx addr = orig; +- rtx new_rtx = orig; ++ case 8: ++ putc ('q', file); ++ return; + +-#if TARGET_MACHO +- if (TARGET_MACHO && !TARGET_64BIT) +- { +- if (reg == 0) +- reg = gen_reg_rtx (Pmode); +- /* Use the generic Mach-O PIC machinery. 
*/ +- return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg); +- } +-#endif ++ default: ++ output_operand_lossage ("invalid operand size for operand " ++ "code 'z'"); ++ return; ++ } ++ } + +- if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES) +- { +- rtx tmp = legitimize_pe_coff_symbol (addr, true); +- if (tmp) +- return tmp; +- } ++ if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT) ++ warning (0, "non-integer operand used with operand code %"); ++ /* FALLTHRU */ + +- if (TARGET_64BIT && legitimate_pic_address_disp_p (addr)) +- new_rtx = addr; +- else if ((!TARGET_64BIT +- || /* TARGET_64BIT && */ ix86_cmodel != CM_SMALL_PIC) +- && !TARGET_PECOFF +- && gotoff_operand (addr, Pmode)) +- { +- /* This symbol may be referenced via a displacement +- from the PIC base address (@GOTOFF). */ +- if (GET_CODE (addr) == CONST) +- addr = XEXP (addr, 0); ++ case 'Z': ++ /* 387 opcodes don't get size suffixes if using Intel opcodes. */ ++ if (ASSEMBLER_DIALECT == ASM_INTEL) ++ return; + +- if (GET_CODE (addr) == PLUS) +- { +- new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)), +- UNSPEC_GOTOFF); +- new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1)); +- } +- else +- new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF); ++ if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT) ++ { ++ switch (GET_MODE_SIZE (GET_MODE (x))) ++ { ++ case 2: ++#ifdef HAVE_AS_IX86_FILDS ++ putc ('s', file); ++#endif ++ return; + +- new_rtx = gen_rtx_CONST (Pmode, new_rtx); ++ case 4: ++ putc ('l', file); ++ return; + +- if (TARGET_64BIT) +- new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode); ++ case 8: ++#ifdef HAVE_AS_IX86_FILDQ ++ putc ('q', file); ++#else ++ fputs ("ll", file); ++#endif ++ return; + +- if (reg != 0) +- { +- gcc_assert (REG_P (reg)); +- new_rtx = expand_simple_binop (Pmode, PLUS, pic_offset_table_rtx, +- new_rtx, reg, 1, OPTAB_DIRECT); +- } +- else +- new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx); +- } +- else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0) +- /* We can't use @GOTOFF for text labels +- on VxWorks, see gotoff_operand. */ +- || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF)) +- { +- rtx tmp = legitimize_pe_coff_symbol (addr, true); +- if (tmp) +- return tmp; +- +- /* For x64 PE-COFF there is no GOT table, +- so we use address directly. */ +- if (TARGET_64BIT && TARGET_PECOFF) +- { +- new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL); +- new_rtx = gen_rtx_CONST (Pmode, new_rtx); +- } +- else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC) +- { +- new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), +- UNSPEC_GOTPCREL); +- new_rtx = gen_rtx_CONST (Pmode, new_rtx); +- new_rtx = gen_const_mem (Pmode, new_rtx); +- set_mem_alias_set (new_rtx, ix86_GOT_alias_set ()); +- } +- else +- { +- /* This symbol must be referenced via a load +- from the Global Offset Table (@GOT). 
*/ +- new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT); +- new_rtx = gen_rtx_CONST (Pmode, new_rtx); +- if (TARGET_64BIT) +- new_rtx = force_reg (Pmode, new_rtx); +- new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx); +- new_rtx = gen_const_mem (Pmode, new_rtx); +- set_mem_alias_set (new_rtx, ix86_GOT_alias_set ()); +- } +- +- new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode); +- } +- else +- { +- if (CONST_INT_P (addr) +- && !x86_64_immediate_operand (addr, VOIDmode)) +- new_rtx = copy_to_suggested_reg (addr, reg, Pmode); +- else if (GET_CODE (addr) == CONST) +- { +- addr = XEXP (addr, 0); +- +- /* We must match stuff we generate before. Assume the only +- unspecs that can get here are ours. Not that we could do +- anything with them anyway.... */ +- if (GET_CODE (addr) == UNSPEC +- || (GET_CODE (addr) == PLUS +- && GET_CODE (XEXP (addr, 0)) == UNSPEC)) +- return orig; +- gcc_assert (GET_CODE (addr) == PLUS); +- } +- +- if (GET_CODE (addr) == PLUS) +- { +- rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1); +- +- /* Check first to see if this is a constant +- offset from a @GOTOFF symbol reference. */ +- if (!TARGET_PECOFF +- && gotoff_operand (op0, Pmode) +- && CONST_INT_P (op1)) ++ default: ++ break; ++ } ++ } ++ else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT) + { +- if (!TARGET_64BIT) +- { +- new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0), +- UNSPEC_GOTOFF); +- new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1); +- new_rtx = gen_rtx_CONST (Pmode, new_rtx); ++ /* 387 opcodes don't get size suffixes ++ if the operands are registers. */ ++ if (STACK_REG_P (x)) ++ return; + +- if (reg != 0) +- { +- gcc_assert (REG_P (reg)); +- new_rtx = expand_simple_binop (Pmode, PLUS, +- pic_offset_table_rtx, +- new_rtx, reg, 1, +- OPTAB_DIRECT); +- } +- else +- new_rtx +- = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx); +- } +- else ++ switch (GET_MODE_SIZE (GET_MODE (x))) + { +- if (INTVAL (op1) < -16*1024*1024 +- || INTVAL (op1) >= 16*1024*1024) +- { +- if (!x86_64_immediate_operand (op1, Pmode)) +- op1 = force_reg (Pmode, op1); ++ case 4: ++ putc ('s', file); ++ return; + +- new_rtx +- = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1); +- } ++ case 8: ++ putc ('l', file); ++ return; ++ ++ case 12: ++ case 16: ++ putc ('t', file); ++ return; ++ ++ default: ++ break; + } + } + else + { +- rtx base = legitimize_pic_address (op0, reg); +- machine_mode mode = GET_MODE (base); +- new_rtx +- = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg); +- +- if (CONST_INT_P (new_rtx)) +- { +- if (INTVAL (new_rtx) < -16*1024*1024 +- || INTVAL (new_rtx) >= 16*1024*1024) +- { +- if (!x86_64_immediate_operand (new_rtx, mode)) +- new_rtx = force_reg (mode, new_rtx); +- +- new_rtx +- = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx); +- } +- else +- new_rtx = plus_constant (mode, base, INTVAL (new_rtx)); +- } +- else +- { +- /* For %rip addressing, we have to use +- just disp32, not base nor index. */ +- if (TARGET_64BIT +- && (GET_CODE (base) == SYMBOL_REF +- || GET_CODE (base) == LABEL_REF)) +- base = force_reg (mode, base); +- if (GET_CODE (new_rtx) == PLUS +- && CONSTANT_P (XEXP (new_rtx, 1))) +- { +- base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0)); +- new_rtx = XEXP (new_rtx, 1); +- } +- new_rtx = gen_rtx_PLUS (mode, base, new_rtx); +- } ++ output_operand_lossage ("invalid operand type used with " ++ "operand code 'Z'"); ++ return; + } +- } +- } +- return new_rtx; +-} +- +-/* Load the thread pointer. If TO_REG is true, force it into a register. 
*/ +- +-static rtx +-get_thread_pointer (machine_mode tp_mode, bool to_reg) +-{ +- rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP); +- +- if (GET_MODE (tp) != tp_mode) +- { +- gcc_assert (GET_MODE (tp) == SImode); +- gcc_assert (tp_mode == DImode); +- +- tp = gen_rtx_ZERO_EXTEND (tp_mode, tp); +- } +- +- if (to_reg) +- tp = copy_to_mode_reg (tp_mode, tp); +- +- return tp; +-} +- +-/* Construct the SYMBOL_REF for the tls_get_addr function. */ +- +-static GTY(()) rtx ix86_tls_symbol; +- +-static rtx +-ix86_tls_get_addr (void) +-{ +- if (!ix86_tls_symbol) +- { +- const char *sym +- = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT) +- ? "___tls_get_addr" : "__tls_get_addr"); +- +- ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym); +- } +- +- if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF) +- { +- rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol), +- UNSPEC_PLTOFF); +- return gen_rtx_PLUS (Pmode, pic_offset_table_rtx, +- gen_rtx_CONST (Pmode, unspec)); +- } +- +- return ix86_tls_symbol; +-} +- +-/* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */ + +-static GTY(()) rtx ix86_tls_module_base_symbol; +- +-rtx +-ix86_tls_module_base (void) +-{ +- if (!ix86_tls_module_base_symbol) +- { +- ix86_tls_module_base_symbol +- = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_"); ++ output_operand_lossage ("invalid operand size for operand code 'Z'"); ++ return; + +- SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol) +- |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT; +- } ++ case 'd': ++ case 'b': ++ case 'w': ++ case 'k': ++ case 'q': ++ case 'h': ++ case 't': ++ case 'g': ++ case 'y': ++ case 'x': ++ case 'X': ++ case 'P': ++ case 'p': ++ case 'V': ++ break; + +- return ix86_tls_module_base_symbol; +-} ++ case 's': ++ if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT) ++ { ++ ix86_print_operand (file, x, 0); ++ fputs (", ", file); ++ } ++ return; + +-/* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is +- false if we expect this to be used for a memory address and true if +- we expect to load the address into a register. */ +- +-static rtx +-legitimize_tls_address (rtx x, enum tls_model model, bool for_mov) +-{ +- rtx dest, base, off; +- rtx pic = NULL_RTX, tp = NULL_RTX; +- machine_mode tp_mode = Pmode; +- int type; ++ case 'Y': ++ switch (GET_CODE (x)) ++ { ++ case NE: ++ fputs ("neq", file); ++ break; ++ case EQ: ++ fputs ("eq", file); ++ break; ++ case GE: ++ case GEU: ++ fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file); ++ break; ++ case GT: ++ case GTU: ++ fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file); ++ break; ++ case LE: ++ case LEU: ++ fputs ("le", file); ++ break; ++ case LT: ++ case LTU: ++ fputs ("lt", file); ++ break; ++ case UNORDERED: ++ fputs ("unord", file); ++ break; ++ case ORDERED: ++ fputs ("ord", file); ++ break; ++ case UNEQ: ++ fputs ("ueq", file); ++ break; ++ case UNGE: ++ fputs ("nlt", file); ++ break; ++ case UNGT: ++ fputs ("nle", file); ++ break; ++ case UNLE: ++ fputs ("ule", file); ++ break; ++ case UNLT: ++ fputs ("ult", file); ++ break; ++ case LTGT: ++ fputs ("une", file); ++ break; ++ default: ++ output_operand_lossage ("operand is not a condition code, " ++ "invalid operand code 'Y'"); ++ return; ++ } ++ return; + +- /* Fall back to global dynamic model if tool chain cannot support local +- dynamic. 
*/ +- if (TARGET_SUN_TLS && !TARGET_64BIT +- && !HAVE_AS_IX86_TLSLDMPLT && !HAVE_AS_IX86_TLSLDM +- && model == TLS_MODEL_LOCAL_DYNAMIC) +- model = TLS_MODEL_GLOBAL_DYNAMIC; ++ case 'D': ++ /* Little bit of braindamage here. The SSE compare instructions ++ does use completely different names for the comparisons that the ++ fp conditional moves. */ ++ switch (GET_CODE (x)) ++ { ++ case UNEQ: ++ if (TARGET_AVX) ++ { ++ fputs ("eq_us", file); ++ break; ++ } ++ /* FALLTHRU */ ++ case EQ: ++ fputs ("eq", file); ++ break; ++ case UNLT: ++ if (TARGET_AVX) ++ { ++ fputs ("nge", file); ++ break; ++ } ++ /* FALLTHRU */ ++ case LT: ++ fputs ("lt", file); ++ break; ++ case UNLE: ++ if (TARGET_AVX) ++ { ++ fputs ("ngt", file); ++ break; ++ } ++ /* FALLTHRU */ ++ case LE: ++ fputs ("le", file); ++ break; ++ case UNORDERED: ++ fputs ("unord", file); ++ break; ++ case LTGT: ++ if (TARGET_AVX) ++ { ++ fputs ("neq_oq", file); ++ break; ++ } ++ /* FALLTHRU */ ++ case NE: ++ fputs ("neq", file); ++ break; ++ case GE: ++ if (TARGET_AVX) ++ { ++ fputs ("ge", file); ++ break; ++ } ++ /* FALLTHRU */ ++ case UNGE: ++ fputs ("nlt", file); ++ break; ++ case GT: ++ if (TARGET_AVX) ++ { ++ fputs ("gt", file); ++ break; ++ } ++ /* FALLTHRU */ ++ case UNGT: ++ fputs ("nle", file); ++ break; ++ case ORDERED: ++ fputs ("ord", file); ++ break; ++ default: ++ output_operand_lossage ("operand is not a condition code, " ++ "invalid operand code 'D'"); ++ return; ++ } ++ return; + +- switch (model) +- { +- case TLS_MODEL_GLOBAL_DYNAMIC: +- dest = gen_reg_rtx (Pmode); ++ case 'F': ++ case 'f': ++#ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX ++ if (ASSEMBLER_DIALECT == ASM_ATT) ++ putc ('.', file); ++ gcc_fallthrough (); ++#endif + +- if (!TARGET_64BIT) +- { +- if (flag_pic && !TARGET_PECOFF) +- pic = pic_offset_table_rtx; +- else ++ case 'C': ++ case 'c': ++ if (!COMPARISON_P (x)) + { +- pic = gen_reg_rtx (Pmode); +- emit_insn (gen_set_got (pic)); ++ output_operand_lossage ("operand is not a condition code, " ++ "invalid operand code '%c'", code); ++ return; + } +- } ++ put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), ++ code == 'c' || code == 'f', ++ code == 'F' || code == 'f', ++ file); ++ return; + +- if (TARGET_GNU2_TLS) +- { +- if (TARGET_64BIT) +- emit_insn (gen_tls_dynamic_gnu2_64 (dest, x)); +- else +- emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic)); ++ case 'H': ++ if (!offsettable_memref_p (x)) ++ { ++ output_operand_lossage ("operand is not an offsettable memory " ++ "reference, invalid operand code 'H'"); ++ return; ++ } ++ /* It doesn't actually matter what mode we use here, as we're ++ only going to use this for printing. */ ++ x = adjust_address_nv (x, DImode, 8); ++ /* Output 'qword ptr' for intel assembler dialect. */ ++ if (ASSEMBLER_DIALECT == ASM_INTEL) ++ code = 'q'; ++ break; + +- tp = get_thread_pointer (Pmode, true); +- dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest)); ++ case 'K': ++ if (!CONST_INT_P (x)) ++ { ++ output_operand_lossage ("operand is not an integer, invalid " ++ "operand code 'K'"); ++ return; ++ } + +- if (GET_MODE (x) != Pmode) +- x = gen_rtx_ZERO_EXTEND (Pmode, x); ++ if (INTVAL (x) & IX86_HLE_ACQUIRE) ++#ifdef HAVE_AS_IX86_HLE ++ fputs ("xacquire ", file); ++#else ++ fputs ("\n" ASM_BYTE "0xf2\n\t", file); ++#endif ++ else if (INTVAL (x) & IX86_HLE_RELEASE) ++#ifdef HAVE_AS_IX86_HLE ++ fputs ("xrelease ", file); ++#else ++ fputs ("\n" ASM_BYTE "0xf3\n\t", file); ++#endif ++ /* We do not want to print value of the operand. 
*/ ++ return; + +- set_unique_reg_note (get_last_insn (), REG_EQUAL, x); +- } +- else +- { +- rtx caddr = ix86_tls_get_addr (); ++ case 'N': ++ if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x))) ++ fputs ("{z}", file); ++ return; + +- if (TARGET_64BIT) ++ case 'r': ++ if (!CONST_INT_P (x) || INTVAL (x) != ROUND_SAE) + { +- rtx rax = gen_rtx_REG (Pmode, AX_REG); +- rtx_insn *insns; ++ output_operand_lossage ("operand is not a specific integer, " ++ "invalid operand code 'r'"); ++ return; ++ } + +- start_sequence (); +- emit_call_insn +- (ix86_gen_tls_global_dynamic_64 (rax, x, caddr)); +- insns = get_insns (); +- end_sequence (); ++ if (ASSEMBLER_DIALECT == ASM_INTEL) ++ fputs (", ", file); + +- if (GET_MODE (x) != Pmode) +- x = gen_rtx_ZERO_EXTEND (Pmode, x); ++ fputs ("{sae}", file); + +- RTL_CONST_CALL_P (insns) = 1; +- emit_libcall_block (insns, dest, rax, x); +- } +- else +- emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr)); +- } +- break; ++ if (ASSEMBLER_DIALECT == ASM_ATT) ++ fputs (", ", file); + +- case TLS_MODEL_LOCAL_DYNAMIC: +- base = gen_reg_rtx (Pmode); ++ return; + +- if (!TARGET_64BIT) +- { +- if (flag_pic) +- pic = pic_offset_table_rtx; +- else ++ case 'R': ++ if (!CONST_INT_P (x)) + { +- pic = gen_reg_rtx (Pmode); +- emit_insn (gen_set_got (pic)); ++ output_operand_lossage ("operand is not an integer, invalid " ++ "operand code 'R'"); ++ return; + } +- } + +- if (TARGET_GNU2_TLS) +- { +- rtx tmp = ix86_tls_module_base (); ++ if (ASSEMBLER_DIALECT == ASM_INTEL) ++ fputs (", ", file); + +- if (TARGET_64BIT) +- emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp)); +- else +- emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic)); ++ switch (INTVAL (x)) ++ { ++ case ROUND_NEAREST_INT | ROUND_SAE: ++ fputs ("{rn-sae}", file); ++ break; ++ case ROUND_NEG_INF | ROUND_SAE: ++ fputs ("{rd-sae}", file); ++ break; ++ case ROUND_POS_INF | ROUND_SAE: ++ fputs ("{ru-sae}", file); ++ break; ++ case ROUND_ZERO | ROUND_SAE: ++ fputs ("{rz-sae}", file); ++ break; ++ default: ++ output_operand_lossage ("operand is not a specific integer, " ++ "invalid operand code 'R'"); ++ } + +- tp = get_thread_pointer (Pmode, true); +- set_unique_reg_note (get_last_insn (), REG_EQUAL, +- gen_rtx_MINUS (Pmode, tmp, tp)); +- } +- else +- { +- rtx caddr = ix86_tls_get_addr (); ++ if (ASSEMBLER_DIALECT == ASM_ATT) ++ fputs (", ", file); + +- if (TARGET_64BIT) +- { +- rtx rax = gen_rtx_REG (Pmode, AX_REG); +- rtx_insn *insns; +- rtx eqv; ++ return; + +- start_sequence (); +- emit_call_insn +- (ix86_gen_tls_local_dynamic_base_64 (rax, caddr)); +- insns = get_insns (); +- end_sequence (); ++ case '*': ++ if (ASSEMBLER_DIALECT == ASM_ATT) ++ putc ('*', file); ++ return; + +- /* Attach a unique REG_EQUAL, to allow the RTL optimizers to +- share the LD_BASE result with other LD model accesses. 
*/ +- eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx), +- UNSPEC_TLS_LD_BASE); ++ case '&': ++ { ++ const char *name = get_some_local_dynamic_name (); ++ if (name == NULL) ++ output_operand_lossage ("'%%&' used without any " ++ "local dynamic TLS references"); ++ else ++ assemble_name (file, name); ++ return; ++ } + +- RTL_CONST_CALL_P (insns) = 1; +- emit_libcall_block (insns, base, rax, eqv); +- } +- else +- emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr)); +- } ++ case '+': ++ { ++ rtx x; + +- off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF); +- off = gen_rtx_CONST (Pmode, off); ++ if (!optimize ++ || optimize_function_for_size_p (cfun) ++ || !TARGET_BRANCH_PREDICTION_HINTS) ++ return; + +- dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off)); ++ x = find_reg_note (current_output_insn, REG_BR_PROB, 0); ++ if (x) ++ { ++ int pred_val = profile_probability::from_reg_br_prob_note ++ (XINT (x, 0)).to_reg_br_prob_base (); + +- if (TARGET_GNU2_TLS) +- { +- dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp)); ++ if (pred_val < REG_BR_PROB_BASE * 45 / 100 ++ || pred_val > REG_BR_PROB_BASE * 55 / 100) ++ { ++ bool taken = pred_val > REG_BR_PROB_BASE / 2; ++ bool cputaken ++ = final_forward_branch_p (current_output_insn) == 0; + +- if (GET_MODE (x) != Pmode) +- x = gen_rtx_ZERO_EXTEND (Pmode, x); ++ /* Emit hints only in the case default branch prediction ++ heuristics would fail. */ ++ if (taken != cputaken) ++ { ++ /* We use 3e (DS) prefix for taken branches and ++ 2e (CS) prefix for not taken branches. */ ++ if (taken) ++ fputs ("ds ; ", file); ++ else ++ fputs ("cs ; ", file); ++ } ++ } ++ } ++ return; ++ } + +- set_unique_reg_note (get_last_insn (), REG_EQUAL, x); +- } +- break; ++ case ';': ++#ifndef HAVE_AS_IX86_REP_LOCK_PREFIX ++ putc (';', file); ++#endif ++ return; + +- case TLS_MODEL_INITIAL_EXEC: +- if (TARGET_64BIT) +- { +- if (TARGET_SUN_TLS && !TARGET_X32) +- { +- /* The Sun linker took the AMD64 TLS spec literally +- and can only handle %rax as destination of the +- initial executable code sequence. */ ++ case '~': ++ putc (TARGET_AVX2 ? 'i' : 'f', file); ++ return; + +- dest = gen_reg_rtx (DImode); +- emit_insn (gen_tls_initial_exec_64_sun (dest, x)); +- return dest; ++ case 'M': ++ if (TARGET_X32) ++ { ++ /* NB: 32-bit indices in VSIB address are sign-extended ++ to 64 bits. In x32, if 32-bit address 0xf7fa3010 is ++ sign-extended to 0xfffffffff7fa3010 which is invalid ++ address. Add addr32 prefix if there is no base ++ register nor symbol. */ ++ bool ok; ++ struct ix86_address parts; ++ ok = ix86_decompose_address (x, &parts); ++ gcc_assert (ok && parts.index == NULL_RTX); ++ if (parts.base == NULL_RTX ++ && (parts.disp == NULL_RTX ++ || !symbolic_operand (parts.disp, ++ GET_MODE (parts.disp)))) ++ fputs ("addr32 ", file); + } ++ return; + +- /* Generate DImode references to avoid %fs:(%reg32) +- problems and linker IE->LE relaxation bug. */ +- tp_mode = DImode; +- pic = NULL; +- type = UNSPEC_GOTNTPOFF; +- } +- else if (flag_pic) +- { +- pic = pic_offset_table_rtx; +- type = TARGET_ANY_GNU_TLS ? 
UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF; +- } +- else if (!TARGET_ANY_GNU_TLS) +- { +- pic = gen_reg_rtx (Pmode); +- emit_insn (gen_set_got (pic)); +- type = UNSPEC_GOTTPOFF; +- } +- else +- { +- pic = NULL; +- type = UNSPEC_INDNTPOFF; +- } ++ case '^': ++ if (TARGET_64BIT && Pmode != word_mode) ++ fputs ("addr32 ", file); ++ return; + +- off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type); +- off = gen_rtx_CONST (tp_mode, off); +- if (pic) +- off = gen_rtx_PLUS (tp_mode, pic, off); +- off = gen_const_mem (tp_mode, off); +- set_mem_alias_set (off, ix86_GOT_alias_set ()); ++ case '!': ++ if (ix86_notrack_prefixed_insn_p (current_output_insn)) ++ fputs ("notrack ", file); ++ return; + +- if (TARGET_64BIT || TARGET_ANY_GNU_TLS) +- { +- base = get_thread_pointer (tp_mode, +- for_mov || !TARGET_TLS_DIRECT_SEG_REFS); +- off = force_reg (tp_mode, off); +- dest = gen_rtx_PLUS (tp_mode, base, off); +- if (tp_mode != Pmode) +- dest = convert_to_mode (Pmode, dest, 1); +- } +- else +- { +- base = get_thread_pointer (Pmode, true); +- dest = gen_reg_rtx (Pmode); +- emit_insn (ix86_gen_sub3 (dest, base, off)); ++ default: ++ output_operand_lossage ("invalid operand code '%c'", code); + } +- break; ++ } + +- case TLS_MODEL_LOCAL_EXEC: +- off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), +- (TARGET_64BIT || TARGET_ANY_GNU_TLS) +- ? UNSPEC_NTPOFF : UNSPEC_TPOFF); +- off = gen_rtx_CONST (Pmode, off); ++ if (REG_P (x)) ++ print_reg (x, code, file); + +- if (TARGET_64BIT || TARGET_ANY_GNU_TLS) +- { +- base = get_thread_pointer (Pmode, +- for_mov || !TARGET_TLS_DIRECT_SEG_REFS); +- return gen_rtx_PLUS (Pmode, base, off); +- } +- else ++ else if (MEM_P (x)) ++ { ++ rtx addr = XEXP (x, 0); ++ ++ /* No `byte ptr' prefix for call instructions ... */ ++ if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P') + { +- base = get_thread_pointer (Pmode, true); +- dest = gen_reg_rtx (Pmode); +- emit_insn (ix86_gen_sub3 (dest, base, off)); +- } +- break; ++ machine_mode mode = GET_MODE (x); ++ const char *size; + +- default: +- gcc_unreachable (); +- } ++ /* Check for explicit size override codes. */ ++ if (code == 'b') ++ size = "BYTE"; ++ else if (code == 'w') ++ size = "WORD"; ++ else if (code == 'k') ++ size = "DWORD"; ++ else if (code == 'q') ++ size = "QWORD"; ++ else if (code == 'x') ++ size = "XMMWORD"; ++ else if (code == 't') ++ size = "YMMWORD"; ++ else if (code == 'g') ++ size = "ZMMWORD"; ++ else if (mode == BLKmode) ++ /* ... or BLKmode operands, when not overridden. */ ++ size = NULL; ++ else ++ switch (GET_MODE_SIZE (mode)) ++ { ++ case 1: size = "BYTE"; break; ++ case 2: size = "WORD"; break; ++ case 4: size = "DWORD"; break; ++ case 8: size = "QWORD"; break; ++ case 12: size = "TBYTE"; break; ++ case 16: ++ if (mode == XFmode) ++ size = "TBYTE"; ++ else ++ size = "XMMWORD"; ++ break; ++ case 32: size = "YMMWORD"; break; ++ case 64: size = "ZMMWORD"; break; ++ default: ++ gcc_unreachable (); ++ } ++ if (size) ++ { ++ fputs (size, file); ++ fputs (" PTR ", file); ++ } ++ } + +- return dest; +-} ++ if (this_is_asm_operands && ! address_operand (addr, VOIDmode)) ++ output_operand_lossage ("invalid constraints for operand"); ++ else ++ ix86_print_operand_address_as ++ (file, addr, MEM_ADDR_SPACE (x), code == 'p' || code == 'P'); ++ } + +-/* Return true if OP refers to a TLS address. 
*/ +-bool +-ix86_tls_address_pattern_p (rtx op) +-{ +- subrtx_var_iterator::array_type array; +- FOR_EACH_SUBRTX_VAR (iter, array, op, ALL) ++ else if (CONST_DOUBLE_P (x) && GET_MODE (x) == SFmode) + { +- rtx op = *iter; +- if (MEM_P (op)) +- { +- rtx *x = &XEXP (op, 0); +- while (GET_CODE (*x) == PLUS) +- { +- int i; +- for (i = 0; i < 2; i++) +- { +- rtx u = XEXP (*x, i); +- if (GET_CODE (u) == ZERO_EXTEND) +- u = XEXP (u, 0); +- if (GET_CODE (u) == UNSPEC +- && XINT (u, 1) == UNSPEC_TP) +- return true; +- } +- x = &XEXP (*x, 0); +- } ++ long l; + +- iter.skip_subrtxes (); +- } ++ REAL_VALUE_TO_TARGET_SINGLE (*CONST_DOUBLE_REAL_VALUE (x), l); ++ ++ if (ASSEMBLER_DIALECT == ASM_ATT) ++ putc ('$', file); ++ /* Sign extend 32bit SFmode immediate to 8 bytes. */ ++ if (code == 'q') ++ fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x", ++ (unsigned long long) (int) l); ++ else ++ fprintf (file, "0x%08x", (unsigned int) l); + } + +- return false; +-} ++ else if (CONST_DOUBLE_P (x) && GET_MODE (x) == DFmode) ++ { ++ long l[2]; + +-/* Rewrite *LOC so that it refers to a default TLS address space. */ +-void +-ix86_rewrite_tls_address_1 (rtx *loc) +-{ +- subrtx_ptr_iterator::array_type array; +- FOR_EACH_SUBRTX_PTR (iter, array, loc, ALL) ++ REAL_VALUE_TO_TARGET_DOUBLE (*CONST_DOUBLE_REAL_VALUE (x), l); ++ ++ if (ASSEMBLER_DIALECT == ASM_ATT) ++ putc ('$', file); ++ fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff); ++ } ++ ++ /* These float cases don't actually occur as immediate operands. */ ++ else if (CONST_DOUBLE_P (x) && GET_MODE (x) == XFmode) + { +- rtx *loc = *iter; +- if (MEM_P (*loc)) +- { +- rtx addr = XEXP (*loc, 0); +- rtx *x = &addr; +- while (GET_CODE (*x) == PLUS) +- { +- int i; +- for (i = 0; i < 2; i++) +- { +- rtx u = XEXP (*x, i); +- if (GET_CODE (u) == ZERO_EXTEND) +- u = XEXP (u, 0); +- if (GET_CODE (u) == UNSPEC +- && XINT (u, 1) == UNSPEC_TP) +- { +- addr_space_t as = DEFAULT_TLS_SEG_REG; ++ char dstr[30]; + +- *x = XEXP (*x, 1 - i); ++ real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1); ++ fputs (dstr, file); ++ } + +- *loc = replace_equiv_address_nv (*loc, addr, true); +- set_mem_addr_space (*loc, as); +- return; +- } +- } +- x = &XEXP (*x, 0); +- } ++ else ++ { ++ /* We have patterns that allow zero sets of memory, for instance. ++ In 64-bit mode, we should probably support all 8-byte vectors, ++ since we can in fact encode that into an immediate. */ ++ if (GET_CODE (x) == CONST_VECTOR) ++ { ++ if (x != CONST0_RTX (GET_MODE (x))) ++ output_operand_lossage ("invalid vector immediate"); ++ x = const0_rtx; ++ } + +- iter.skip_subrtxes (); ++ if (code != 'P' && code != 'p') ++ { ++ if (CONST_INT_P (x)) ++ { ++ if (ASSEMBLER_DIALECT == ASM_ATT) ++ putc ('$', file); ++ } ++ else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF ++ || GET_CODE (x) == LABEL_REF) ++ { ++ if (ASSEMBLER_DIALECT == ASM_ATT) ++ putc ('$', file); ++ else ++ fputs ("OFFSET FLAT:", file); ++ } + } ++ if (CONST_INT_P (x)) ++ fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x)); ++ else if (flag_pic || MACHOPIC_INDIRECT) ++ output_pic_addr_const (file, x, code); ++ else ++ output_addr_const (file, x); + } + } + +-/* Rewrite instruction pattern involvning TLS address +- so that it refers to a default TLS address space. 
*/ +-rtx +-ix86_rewrite_tls_address (rtx pattern) ++static bool ++ix86_print_operand_punct_valid_p (unsigned char code) + { +- pattern = copy_insn (pattern); +- ix86_rewrite_tls_address_1 (&pattern); +- return pattern; ++ return (code == '*' || code == '+' || code == '&' || code == ';' ++ || code == '~' || code == '^' || code == '!'); + } ++ ++/* Print a memory operand whose address is ADDR. */ + +-/* Create or return the unique __imp_DECL dllimport symbol corresponding +- to symbol DECL if BEIMPORT is true. Otherwise create or return the +- unique refptr-DECL symbol corresponding to symbol DECL. */ +- +-struct dllimport_hasher : ggc_cache_ptr_hash ++static void ++ix86_print_operand_address_as (FILE *file, rtx addr, ++ addr_space_t as, bool no_rip) + { +- static inline hashval_t hash (tree_map *m) { return m->hash; } +- static inline bool +- equal (tree_map *a, tree_map *b) +- { +- return a->base.from == b->base.from; +- } +- +- static int +- keep_cache_entry (tree_map *&m) +- { +- return ggc_marked_p (m->base.from); +- } +-}; ++ struct ix86_address parts; ++ rtx base, index, disp; ++ int scale; ++ int ok; ++ bool vsib = false; ++ int code = 0; + +-static GTY((cache)) hash_table *dllimport_map; ++ if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR) ++ { ++ ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts); ++ gcc_assert (parts.index == NULL_RTX); ++ parts.index = XVECEXP (addr, 0, 1); ++ parts.scale = INTVAL (XVECEXP (addr, 0, 2)); ++ addr = XVECEXP (addr, 0, 0); ++ vsib = true; ++ } ++ else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR) ++ { ++ gcc_assert (TARGET_64BIT); ++ ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts); ++ code = 'q'; ++ } ++ else ++ ok = ix86_decompose_address (addr, &parts); + +-static tree +-get_dllimport_decl (tree decl, bool beimport) +-{ +- struct tree_map *h, in; +- const char *name; +- const char *prefix; +- size_t namelen, prefixlen; +- char *imp_name; +- tree to; +- rtx rtl; ++ gcc_assert (ok); + +- if (!dllimport_map) +- dllimport_map = hash_table::create_ggc (512); ++ base = parts.base; ++ index = parts.index; ++ disp = parts.disp; ++ scale = parts.scale; + +- in.hash = htab_hash_pointer (decl); +- in.base.from = decl; +- tree_map **loc = dllimport_map->find_slot_with_hash (&in, in.hash, INSERT); +- h = *loc; +- if (h) +- return h->to; ++ if (ADDR_SPACE_GENERIC_P (as)) ++ as = parts.seg; ++ else ++ gcc_assert (ADDR_SPACE_GENERIC_P (parts.seg)); + +- *loc = h = ggc_alloc (); +- h->hash = in.hash; +- h->base.from = decl; +- h->to = to = build_decl (DECL_SOURCE_LOCATION (decl), +- VAR_DECL, NULL, ptr_type_node); +- DECL_ARTIFICIAL (to) = 1; +- DECL_IGNORED_P (to) = 1; +- DECL_EXTERNAL (to) = 1; +- TREE_READONLY (to) = 1; ++ if (!ADDR_SPACE_GENERIC_P (as)) ++ { ++ if (ASSEMBLER_DIALECT == ASM_ATT) ++ putc ('%', file); + +- name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)); +- name = targetm.strip_name_encoding (name); +- if (beimport) +- prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0 +- ? "*__imp_" : "*__imp__"; +- else +- prefix = user_label_prefix[0] == 0 ? "*.refptr." 
: "*refptr."; +- namelen = strlen (name); +- prefixlen = strlen (prefix); +- imp_name = (char *) alloca (namelen + prefixlen + 1); +- memcpy (imp_name, prefix, prefixlen); +- memcpy (imp_name + prefixlen, name, namelen + 1); ++ switch (as) ++ { ++ case ADDR_SPACE_SEG_FS: ++ fputs ("fs:", file); ++ break; ++ case ADDR_SPACE_SEG_GS: ++ fputs ("gs:", file); ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ } + +- name = ggc_alloc_string (imp_name, namelen + prefixlen); +- rtl = gen_rtx_SYMBOL_REF (Pmode, name); +- SET_SYMBOL_REF_DECL (rtl, to); +- SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR; +- if (!beimport) ++ /* Use one byte shorter RIP relative addressing for 64bit mode. */ ++ if (TARGET_64BIT && !base && !index && !no_rip) + { +- SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL; +-#ifdef SUB_TARGET_RECORD_STUB +- SUB_TARGET_RECORD_STUB (name); +-#endif +- } ++ rtx symbol = disp; + +- rtl = gen_const_mem (Pmode, rtl); +- set_mem_alias_set (rtl, ix86_GOT_alias_set ()); ++ if (GET_CODE (disp) == CONST ++ && GET_CODE (XEXP (disp, 0)) == PLUS ++ && CONST_INT_P (XEXP (XEXP (disp, 0), 1))) ++ symbol = XEXP (XEXP (disp, 0), 0); + +- SET_DECL_RTL (to, rtl); +- SET_DECL_ASSEMBLER_NAME (to, get_identifier (name)); ++ if (GET_CODE (symbol) == LABEL_REF ++ || (GET_CODE (symbol) == SYMBOL_REF ++ && SYMBOL_REF_TLS_MODEL (symbol) == 0)) ++ base = pc_rtx; ++ } + +- return to; +-} ++ if (!base && !index) ++ { ++ /* Displacement only requires special attention. */ ++ if (CONST_INT_P (disp)) ++ { ++ if (ASSEMBLER_DIALECT == ASM_INTEL && ADDR_SPACE_GENERIC_P (as)) ++ fputs ("ds:", file); ++ fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp)); ++ } ++ /* Load the external function address via the GOT slot to avoid PLT. */ ++ else if (GET_CODE (disp) == CONST ++ && GET_CODE (XEXP (disp, 0)) == UNSPEC ++ && (XINT (XEXP (disp, 0), 1) == UNSPEC_GOTPCREL ++ || XINT (XEXP (disp, 0), 1) == UNSPEC_GOT) ++ && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0))) ++ output_pic_addr_const (file, disp, 0); ++ else if (flag_pic) ++ output_pic_addr_const (file, disp, 0); ++ else ++ output_addr_const (file, disp); ++ } ++ else ++ { ++ /* Print SImode register names to force addr32 prefix. */ ++ if (SImode_address_operand (addr, VOIDmode)) ++ { ++ if (flag_checking) ++ { ++ gcc_assert (TARGET_64BIT); ++ switch (GET_CODE (addr)) ++ { ++ case SUBREG: ++ gcc_assert (GET_MODE (addr) == SImode); ++ gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode); ++ break; ++ case ZERO_EXTEND: ++ case AND: ++ gcc_assert (GET_MODE (addr) == DImode); ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ } ++ gcc_assert (!code); ++ code = 'k'; ++ } ++ else if (code == 0 ++ && TARGET_X32 ++ && disp ++ && CONST_INT_P (disp) ++ && INTVAL (disp) < -16*1024*1024) ++ { ++ /* X32 runs in 64-bit mode, where displacement, DISP, in ++ address DISP(%r64), is encoded as 32-bit immediate sign- ++ extended from 32-bit to 64-bit. For -0x40000300(%r64), ++ address is %r64 + 0xffffffffbffffd00. When %r64 < ++ 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64, ++ which is invalid for x32. The correct address is %r64 ++ - 0x40000300 == 0xf7ffdd64. To properly encode ++ -0x40000300(%r64) for x32, we zero-extend negative ++ displacement by forcing addr32 prefix which truncates ++ 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should ++ zero-extend all negative displacements, including -1(%rsp). ++ However, for small negative displacements, sign-extension ++ won't cause overflow. 
We only zero-extend negative ++ displacements if they < -16*1024*1024, which is also used ++ to check legitimate address displacements for PIC. */ ++ code = 'k'; ++ } + +-/* Expand SYMBOL into its corresponding far-address symbol. +- WANT_REG is true if we require the result be a register. */ ++ /* Since the upper 32 bits of RSP are always zero for x32, ++ we can encode %esp as %rsp to avoid 0x67 prefix if ++ there is no index register. */ ++ if (TARGET_X32 && Pmode == SImode ++ && !index && base && REG_P (base) && REGNO (base) == SP_REG) ++ code = 'q'; + +-static rtx +-legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg) +-{ +- tree imp_decl; +- rtx x; ++ if (ASSEMBLER_DIALECT == ASM_ATT) ++ { ++ if (disp) ++ { ++ if (flag_pic) ++ output_pic_addr_const (file, disp, 0); ++ else if (GET_CODE (disp) == LABEL_REF) ++ output_asm_label (disp); ++ else ++ output_addr_const (file, disp); ++ } + +- gcc_assert (SYMBOL_REF_DECL (symbol)); +- imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false); ++ putc ('(', file); ++ if (base) ++ print_reg (base, code, file); ++ if (index) ++ { ++ putc (',', file); ++ print_reg (index, vsib ? 0 : code, file); ++ if (scale != 1 || vsib) ++ fprintf (file, ",%d", scale); ++ } ++ putc (')', file); ++ } ++ else ++ { ++ rtx offset = NULL_RTX; + +- x = DECL_RTL (imp_decl); +- if (want_reg) +- x = force_reg (Pmode, x); +- return x; +-} ++ if (disp) ++ { ++ /* Pull out the offset of a symbol; print any symbol itself. */ ++ if (GET_CODE (disp) == CONST ++ && GET_CODE (XEXP (disp, 0)) == PLUS ++ && CONST_INT_P (XEXP (XEXP (disp, 0), 1))) ++ { ++ offset = XEXP (XEXP (disp, 0), 1); ++ disp = gen_rtx_CONST (VOIDmode, ++ XEXP (XEXP (disp, 0), 0)); ++ } + +-/* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is +- true if we require the result be a register. */ ++ if (flag_pic) ++ output_pic_addr_const (file, disp, 0); ++ else if (GET_CODE (disp) == LABEL_REF) ++ output_asm_label (disp); ++ else if (CONST_INT_P (disp)) ++ offset = disp; ++ else ++ output_addr_const (file, disp); ++ } + +-static rtx +-legitimize_dllimport_symbol (rtx symbol, bool want_reg) +-{ +- tree imp_decl; +- rtx x; ++ putc ('[', file); ++ if (base) ++ { ++ print_reg (base, code, file); ++ if (offset) ++ { ++ if (INTVAL (offset) >= 0) ++ putc ('+', file); ++ fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset)); ++ } ++ } ++ else if (offset) ++ fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset)); ++ else ++ putc ('0', file); + +- gcc_assert (SYMBOL_REF_DECL (symbol)); +- imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true); ++ if (index) ++ { ++ putc ('+', file); ++ print_reg (index, vsib ? 0 : code, file); ++ if (scale != 1 || vsib) ++ fprintf (file, "*%d", scale); ++ } ++ putc (']', file); ++ } ++ } ++} + +- x = DECL_RTL (imp_decl); +- if (want_reg) +- x = force_reg (Pmode, x); +- return x; ++static void ++ix86_print_operand_address (FILE *file, machine_mode /*mode*/, rtx addr) ++{ ++ ix86_print_operand_address_as (file, addr, ADDR_SPACE_GENERIC, false); + } + +-/* Expand SYMBOL into its corresponding dllimport or refptr symbol. WANT_REG +- is true if we require the result be a register. */ ++/* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. 
*/ + +-static rtx +-legitimize_pe_coff_symbol (rtx addr, bool inreg) ++static bool ++i386_asm_output_addr_const_extra (FILE *file, rtx x) + { +- if (!TARGET_PECOFF) +- return NULL_RTX; ++ rtx op; + +- if (TARGET_DLLIMPORT_DECL_ATTRIBUTES) +- { +- if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr)) +- return legitimize_dllimport_symbol (addr, inreg); +- if (GET_CODE (addr) == CONST +- && GET_CODE (XEXP (addr, 0)) == PLUS +- && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF +- && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0))) +- { +- rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg); +- return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1)); +- } +- } +- +- if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC) +- return NULL_RTX; +- if (GET_CODE (addr) == SYMBOL_REF +- && !is_imported_p (addr) +- && SYMBOL_REF_EXTERNAL_P (addr) +- && SYMBOL_REF_DECL (addr)) +- return legitimize_pe_coff_extern_decl (addr, inreg); ++ if (GET_CODE (x) != UNSPEC) ++ return false; + +- if (GET_CODE (addr) == CONST +- && GET_CODE (XEXP (addr, 0)) == PLUS +- && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF +- && !is_imported_p (XEXP (XEXP (addr, 0), 0)) +- && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0)) +- && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0))) ++ op = XVECEXP (x, 0, 0); ++ switch (XINT (x, 1)) + { +- rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg); +- return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1)); ++ case UNSPEC_GOTOFF: ++ output_addr_const (file, op); ++ fputs ("@gotoff", file); ++ break; ++ case UNSPEC_GOTTPOFF: ++ output_addr_const (file, op); ++ /* FIXME: This might be @TPOFF in Sun ld. */ ++ fputs ("@gottpoff", file); ++ break; ++ case UNSPEC_TPOFF: ++ output_addr_const (file, op); ++ fputs ("@tpoff", file); ++ break; ++ case UNSPEC_NTPOFF: ++ output_addr_const (file, op); ++ if (TARGET_64BIT) ++ fputs ("@tpoff", file); ++ else ++ fputs ("@ntpoff", file); ++ break; ++ case UNSPEC_DTPOFF: ++ output_addr_const (file, op); ++ fputs ("@dtpoff", file); ++ break; ++ case UNSPEC_GOTNTPOFF: ++ output_addr_const (file, op); ++ if (TARGET_64BIT) ++ fputs (ASSEMBLER_DIALECT == ASM_ATT ? ++ "@gottpoff(%rip)" : "@gottpoff[rip]", file); ++ else ++ fputs ("@gotntpoff", file); ++ break; ++ case UNSPEC_INDNTPOFF: ++ output_addr_const (file, op); ++ fputs ("@indntpoff", file); ++ break; ++#if TARGET_MACHO ++ case UNSPEC_MACHOPIC_OFFSET: ++ output_addr_const (file, op); ++ putc ('-', file); ++ machopic_output_function_base_name (file); ++ break; ++#endif ++ ++ default: ++ return false; + } +- return NULL_RTX; +-} + +-/* Try machine-dependent ways of modifying an illegitimate address +- to be legitimate. If we find one, return the new, valid address. +- This macro is used in only one place: `memory_address' in explow.c. ++ return true; ++} ++ ++ ++/* Output code to perform a 387 binary operation in INSN, one of PLUS, ++ MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3] ++ is the expression of the binary operation. The output may either be ++ emitted here, or returned to the caller, like all output_* functions. + +- OLDX is the address as it was before break_out_memory_refs was called. +- In some cases it is useful to look at this to decide what needs to be done. ++ There is no guarantee that the operands are the same mode, as they ++ might be within FLOAT or FLOAT_EXTEND expressions. */ + +- It is always safe for this macro to do nothing. It exists to recognize +- opportunities to optimize the output. 
++#ifndef SYSV386_COMPAT ++/* Set to 1 for compatibility with brain-damaged assemblers. No-one ++ wants to fix the assemblers because that causes incompatibility ++ with gcc. No-one wants to fix gcc because that causes ++ incompatibility with assemblers... You can use the option of ++ -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */ ++#define SYSV386_COMPAT 1 ++#endif + +- For the 80386, we handle X+REG by loading X into a register R and +- using R+REG. R will go in a general reg and indexing will be used. +- However, if REG is a broken-out memory address or multiplication, +- nothing needs to be done because REG can certainly go in a general reg. ++const char * ++output_387_binary_op (rtx_insn *insn, rtx *operands) ++{ ++ static char buf[40]; ++ const char *p; ++ bool is_sse ++ = (SSE_REG_P (operands[0]) ++ || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2])); + +- When -fpic is used, special handling is needed for symbolic references. +- See comments by legitimize_pic_address in i386.c for details. */ ++ if (is_sse) ++ p = "%v"; ++ else if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT ++ || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT) ++ p = "fi"; ++ else ++ p = "f"; + +-static rtx +-ix86_legitimize_address (rtx x, rtx, machine_mode mode) +-{ +- bool changed = false; +- unsigned log; ++ strcpy (buf, p); + +- log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0; +- if (log) +- return legitimize_tls_address (x, (enum tls_model) log, false); +- if (GET_CODE (x) == CONST +- && GET_CODE (XEXP (x, 0)) == PLUS +- && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF +- && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0)))) ++ switch (GET_CODE (operands[3])) + { +- rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0), +- (enum tls_model) log, false); +- return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1)); ++ case PLUS: ++ p = "add"; break; ++ case MINUS: ++ p = "sub"; break; ++ case MULT: ++ p = "mul"; break; ++ case DIV: ++ p = "div"; break; ++ default: ++ gcc_unreachable (); + } + +- if (TARGET_DLLIMPORT_DECL_ATTRIBUTES) +- { +- rtx tmp = legitimize_pe_coff_symbol (x, true); +- if (tmp) +- return tmp; +- } ++ strcat (buf, p); + +- if (flag_pic && SYMBOLIC_CONST (x)) +- return legitimize_pic_address (x, 0); ++ if (is_sse) ++ { ++ p = (GET_MODE (operands[0]) == SFmode) ? "ss" : "sd"; ++ strcat (buf, p); + +-#if TARGET_MACHO +- if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x)) +- return machopic_indirect_data_reference (x, 0); +-#endif ++ if (TARGET_AVX) ++ p = "\t{%2, %1, %0|%0, %1, %2}"; ++ else ++ p = "\t{%2, %0|%0, %2}"; + +- /* Canonicalize shifts by 0, 1, 2, 3 into multiply */ +- if (GET_CODE (x) == ASHIFT +- && CONST_INT_P (XEXP (x, 1)) +- && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4) ++ strcat (buf, p); ++ return buf; ++ } ++ ++ /* Even if we do not want to check the inputs, this documents input ++ constraints. Which helps in understanding the following code. 
*/ ++ if (flag_checking) + { +- changed = true; +- log = INTVAL (XEXP (x, 1)); +- x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)), +- GEN_INT (1 << log)); ++ if (STACK_REG_P (operands[0]) ++ && ((REG_P (operands[1]) ++ && REGNO (operands[0]) == REGNO (operands[1]) ++ && (STACK_REG_P (operands[2]) || MEM_P (operands[2]))) ++ || (REG_P (operands[2]) ++ && REGNO (operands[0]) == REGNO (operands[2]) ++ && (STACK_REG_P (operands[1]) || MEM_P (operands[1])))) ++ && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2]))) ++ ; /* ok */ ++ else ++ gcc_unreachable (); + } + +- if (GET_CODE (x) == PLUS) ++ switch (GET_CODE (operands[3])) + { +- /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */ ++ case MULT: ++ case PLUS: ++ if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2])) ++ std::swap (operands[1], operands[2]); + +- if (GET_CODE (XEXP (x, 0)) == ASHIFT +- && CONST_INT_P (XEXP (XEXP (x, 0), 1)) +- && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4) ++ /* know operands[0] == operands[1]. */ ++ ++ if (MEM_P (operands[2])) + { +- changed = true; +- log = INTVAL (XEXP (XEXP (x, 0), 1)); +- XEXP (x, 0) = gen_rtx_MULT (Pmode, +- force_reg (Pmode, XEXP (XEXP (x, 0), 0)), +- GEN_INT (1 << log)); ++ p = "%Z2\t%2"; ++ break; + } + +- if (GET_CODE (XEXP (x, 1)) == ASHIFT +- && CONST_INT_P (XEXP (XEXP (x, 1), 1)) +- && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4) ++ if (find_regno_note (insn, REG_DEAD, REGNO (operands[2]))) + { +- changed = true; +- log = INTVAL (XEXP (XEXP (x, 1), 1)); +- XEXP (x, 1) = gen_rtx_MULT (Pmode, +- force_reg (Pmode, XEXP (XEXP (x, 1), 0)), +- GEN_INT (1 << log)); ++ if (STACK_TOP_P (operands[0])) ++ /* How is it that we are storing to a dead operand[2]? ++ Well, presumably operands[1] is dead too. We can't ++ store the result to st(0) as st(0) gets popped on this ++ instruction. Instead store to operands[2] (which I ++ think has to be st(1)). st(1) will be popped later. ++ gcc <= 2.8.1 didn't have this check and generated ++ assembly code that the Unixware assembler rejected. */ ++ p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */ ++ else ++ p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */ ++ break; + } + +- /* Put multiply first if it isn't already. */ +- if (GET_CODE (XEXP (x, 1)) == MULT) ++ if (STACK_TOP_P (operands[0])) ++ p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */ ++ else ++ p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */ ++ break; ++ ++ case MINUS: ++ case DIV: ++ if (MEM_P (operands[1])) + { +- std::swap (XEXP (x, 0), XEXP (x, 1)); +- changed = true; ++ p = "r%Z1\t%1"; ++ break; + } + +- /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const))) +- into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be +- created by virtual register instantiation, register elimination, and +- similar optimizations. */ +- if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS) ++ if (MEM_P (operands[2])) + { +- changed = true; +- x = gen_rtx_PLUS (Pmode, +- gen_rtx_PLUS (Pmode, XEXP (x, 0), +- XEXP (XEXP (x, 1), 0)), +- XEXP (XEXP (x, 1), 1)); ++ p = "%Z2\t%2"; ++ break; + } + +- /* Canonicalize +- (plus (plus (mult (reg) (const)) (plus (reg) (const))) const) +- into (plus (plus (mult (reg) (const)) (reg)) (const)). 
*/ +- else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS +- && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT +- && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS +- && CONSTANT_P (XEXP (x, 1))) ++ if (find_regno_note (insn, REG_DEAD, REGNO (operands[2]))) + { +- rtx constant; +- rtx other = NULL_RTX; +- +- if (CONST_INT_P (XEXP (x, 1))) +- { +- constant = XEXP (x, 1); +- other = XEXP (XEXP (XEXP (x, 0), 1), 1); +- } +- else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1))) +- { +- constant = XEXP (XEXP (XEXP (x, 0), 1), 1); +- other = XEXP (x, 1); +- } ++#if SYSV386_COMPAT ++ /* The SystemV/386 SVR3.2 assembler, and probably all AT&T ++ derived assemblers, confusingly reverse the direction of ++ the operation for fsub{r} and fdiv{r} when the ++ destination register is not st(0). The Intel assembler ++ doesn't have this brain damage. Read !SYSV386_COMPAT to ++ figure out what the hardware really does. */ ++ if (STACK_TOP_P (operands[0])) ++ p = "{p\t%0, %2|rp\t%2, %0}"; + else +- constant = 0; +- +- if (constant) +- { +- changed = true; +- x = gen_rtx_PLUS (Pmode, +- gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0), +- XEXP (XEXP (XEXP (x, 0), 1), 0)), +- plus_constant (Pmode, other, +- INTVAL (constant))); +- } ++ p = "{rp\t%2, %0|p\t%0, %2}"; ++#else ++ if (STACK_TOP_P (operands[0])) ++ /* As above for fmul/fadd, we can't store to st(0). */ ++ p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */ ++ else ++ p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */ ++#endif ++ break; + } + +- if (changed && ix86_legitimate_address_p (mode, x, false)) +- return x; +- +- if (GET_CODE (XEXP (x, 0)) == MULT) ++ if (find_regno_note (insn, REG_DEAD, REGNO (operands[1]))) + { +- changed = true; +- XEXP (x, 0) = copy_addr_to_reg (XEXP (x, 0)); ++#if SYSV386_COMPAT ++ if (STACK_TOP_P (operands[0])) ++ p = "{rp\t%0, %1|p\t%1, %0}"; ++ else ++ p = "{p\t%1, %0|rp\t%0, %1}"; ++#else ++ if (STACK_TOP_P (operands[0])) ++ p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */ ++ else ++ p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */ ++#endif ++ break; + } + +- if (GET_CODE (XEXP (x, 1)) == MULT) ++ if (STACK_TOP_P (operands[0])) + { +- changed = true; +- XEXP (x, 1) = copy_addr_to_reg (XEXP (x, 1)); ++ if (STACK_TOP_P (operands[1])) ++ p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */ ++ else ++ p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */ ++ break; + } +- +- if (changed +- && REG_P (XEXP (x, 1)) +- && REG_P (XEXP (x, 0))) +- return x; +- +- if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1))) ++ else if (STACK_TOP_P (operands[1])) + { +- changed = true; +- x = legitimize_pic_address (x, 0); ++#if SYSV386_COMPAT ++ p = "{\t%1, %0|r\t%0, %1}"; ++#else ++ p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */ ++#endif ++ } ++ else ++ { ++#if SYSV386_COMPAT ++ p = "{r\t%2, %0|\t%0, %2}"; ++#else ++ p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */ ++#endif + } ++ break; + +- if (changed && ix86_legitimate_address_p (mode, x, false)) +- return x; ++ default: ++ gcc_unreachable (); ++ } + +- if (REG_P (XEXP (x, 0))) +- { +- rtx temp = gen_reg_rtx (Pmode); +- rtx val = force_operand (XEXP (x, 1), temp); +- if (val != temp) +- { +- val = convert_to_mode (Pmode, val, 1); +- emit_move_insn (temp, val); +- } ++ strcat (buf, p); ++ return buf; ++} + +- XEXP (x, 1) = temp; +- return x; +- } ++/* Return needed mode for entity in optimize_mode_switching pass. 
*/ + +- else if (REG_P (XEXP (x, 1))) +- { +- rtx temp = gen_reg_rtx (Pmode); +- rtx val = force_operand (XEXP (x, 0), temp); +- if (val != temp) +- { +- val = convert_to_mode (Pmode, val, 1); +- emit_move_insn (temp, val); +- } ++static int ++ix86_dirflag_mode_needed (rtx_insn *insn) ++{ ++ if (CALL_P (insn)) ++ { ++ if (cfun->machine->func_type == TYPE_NORMAL) ++ return X86_DIRFLAG_ANY; ++ else ++ /* No need to emit CLD in interrupt handler for TARGET_CLD. */ ++ return TARGET_CLD ? X86_DIRFLAG_ANY : X86_DIRFLAG_RESET; ++ } + +- XEXP (x, 0) = temp; +- return x; +- } ++ if (recog_memoized (insn) < 0) ++ return X86_DIRFLAG_ANY; ++ ++ if (get_attr_type (insn) == TYPE_STR) ++ { ++ /* Emit cld instruction if stringops are used in the function. */ ++ if (cfun->machine->func_type == TYPE_NORMAL) ++ return TARGET_CLD ? X86_DIRFLAG_RESET : X86_DIRFLAG_ANY; ++ else ++ return X86_DIRFLAG_RESET; + } + +- return x; ++ return X86_DIRFLAG_ANY; + } +- +-/* Print an integer constant expression in assembler syntax. Addition +- and subtraction are the only arithmetic that may appear in these +- expressions. FILE is the stdio stream to write to, X is the rtx, and +- CODE is the operand print code from the output string. */ + +-static void +-output_pic_addr_const (FILE *file, rtx x, int code) ++/* Check if a 256bit or 512 bit AVX register is referenced inside of EXP. */ ++ ++static bool ++ix86_check_avx_upper_register (const_rtx exp) + { +- char buf[256]; ++ return SSE_REG_P (exp) && GET_MODE_BITSIZE (GET_MODE (exp)) > 128; ++} + +- switch (GET_CODE (x)) ++/* Return needed mode for entity in optimize_mode_switching pass. */ ++ ++static int ++ix86_avx_u128_mode_needed (rtx_insn *insn) ++{ ++ if (CALL_P (insn)) + { +- case PC: +- gcc_assert (flag_pic); +- putc ('.', file); +- break; ++ rtx link; + +- case SYMBOL_REF: +- if (TARGET_64BIT || ! TARGET_MACHO_SYMBOL_STUBS) +- output_addr_const (file, x); +- else ++ /* Needed mode is set to AVX_U128_CLEAN if there are ++ no 256bit or 512bit modes used in function arguments. */ ++ for (link = CALL_INSN_FUNCTION_USAGE (insn); ++ link; ++ link = XEXP (link, 1)) + { +- const char *name = XSTR (x, 0); +- +- /* Mark the decl as referenced so that cgraph will +- output the function. */ +- if (SYMBOL_REF_DECL (x)) +- mark_decl_referenced (SYMBOL_REF_DECL (x)); ++ if (GET_CODE (XEXP (link, 0)) == USE) ++ { ++ rtx arg = XEXP (XEXP (link, 0), 0); + +-#if TARGET_MACHO +- if (MACHOPIC_INDIRECT +- && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION) +- name = machopic_indirection_name (x, /*stub_p=*/true); +-#endif +- assemble_name (file, name); ++ if (ix86_check_avx_upper_register (arg)) ++ return AVX_U128_DIRTY; ++ } + } +- if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF) +- && code == 'P' && ! SYMBOL_REF_LOCAL_P (x)) +- fputs ("@PLT", file); +- break; + +- case LABEL_REF: +- x = XEXP (x, 0); +- /* FALLTHRU */ +- case CODE_LABEL: +- ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x)); +- assemble_name (asm_out_file, buf); +- break; ++ return AVX_U128_CLEAN; ++ } + +- case CONST_INT: +- fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x)); +- break; ++ /* Require DIRTY mode if a 256bit or 512bit AVX register is referenced. ++ Hardware changes state only when a 256bit register is written to, ++ but we need to prevent the compiler from moving optimal insertion ++ point above eventual read from 256bit or 512 bit register. 
*/ ++ subrtx_iterator::array_type array; ++ FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST) ++ if (ix86_check_avx_upper_register (*iter)) ++ return AVX_U128_DIRTY; + +- case CONST: +- /* This used to output parentheses around the expression, +- but that does not work on the 386 (either ATT or BSD assembler). */ +- output_pic_addr_const (file, XEXP (x, 0), code); +- break; ++ return AVX_U128_ANY; ++} + +- case CONST_DOUBLE: +- /* We can't handle floating point constants; +- TARGET_PRINT_OPERAND must handle them. */ +- output_operand_lossage ("floating constant misused"); +- break; ++/* Return mode that i387 must be switched into ++ prior to the execution of insn. */ + +- case PLUS: +- /* Some assemblers need integer constants to appear first. */ +- if (CONST_INT_P (XEXP (x, 0))) +- { +- output_pic_addr_const (file, XEXP (x, 0), code); +- putc ('+', file); +- output_pic_addr_const (file, XEXP (x, 1), code); +- } +- else +- { +- gcc_assert (CONST_INT_P (XEXP (x, 1))); +- output_pic_addr_const (file, XEXP (x, 1), code); +- putc ('+', file); +- output_pic_addr_const (file, XEXP (x, 0), code); +- } ++static int ++ix86_i387_mode_needed (int entity, rtx_insn *insn) ++{ ++ enum attr_i387_cw mode; ++ ++ /* The mode UNINITIALIZED is used to store control word after a ++ function call or ASM pattern. The mode ANY specify that function ++ has no requirements on the control word and make no changes in the ++ bits we are interested in. */ ++ ++ if (CALL_P (insn) ++ || (NONJUMP_INSN_P (insn) ++ && (asm_noperands (PATTERN (insn)) >= 0 ++ || GET_CODE (PATTERN (insn)) == ASM_INPUT))) ++ return I387_CW_UNINITIALIZED; ++ ++ if (recog_memoized (insn) < 0) ++ return I387_CW_ANY; ++ ++ mode = get_attr_i387_cw (insn); ++ ++ switch (entity) ++ { ++ case I387_TRUNC: ++ if (mode == I387_CW_TRUNC) ++ return mode; + break; + +- case MINUS: +- if (!TARGET_MACHO) +- putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file); +- output_pic_addr_const (file, XEXP (x, 0), code); +- putc ('-', file); +- output_pic_addr_const (file, XEXP (x, 1), code); +- if (!TARGET_MACHO) +- putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file); ++ case I387_FLOOR: ++ if (mode == I387_CW_FLOOR) ++ return mode; + break; + +- case UNSPEC: +- gcc_assert (XVECLEN (x, 0) == 1); +- output_pic_addr_const (file, XVECEXP (x, 0, 0), code); +- switch (XINT (x, 1)) +- { +- case UNSPEC_GOT: +- fputs ("@GOT", file); +- break; +- case UNSPEC_GOTOFF: +- fputs ("@GOTOFF", file); +- break; +- case UNSPEC_PLTOFF: +- fputs ("@PLTOFF", file); +- break; +- case UNSPEC_PCREL: +- fputs (ASSEMBLER_DIALECT == ASM_ATT ? +- "(%rip)" : "[rip]", file); +- break; +- case UNSPEC_GOTPCREL: +- fputs (ASSEMBLER_DIALECT == ASM_ATT ? +- "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file); +- break; +- case UNSPEC_GOTTPOFF: +- /* FIXME: This might be @TPOFF in Sun ld too. */ +- fputs ("@gottpoff", file); +- break; +- case UNSPEC_TPOFF: +- fputs ("@tpoff", file); +- break; +- case UNSPEC_NTPOFF: +- if (TARGET_64BIT) +- fputs ("@tpoff", file); +- else +- fputs ("@ntpoff", file); +- break; +- case UNSPEC_DTPOFF: +- fputs ("@dtpoff", file); +- break; +- case UNSPEC_GOTNTPOFF: +- if (TARGET_64BIT) +- fputs (ASSEMBLER_DIALECT == ASM_ATT ? 
+- "@gottpoff(%rip)": "@gottpoff[rip]", file); +- else +- fputs ("@gotntpoff", file); +- break; +- case UNSPEC_INDNTPOFF: +- fputs ("@indntpoff", file); +- break; +-#if TARGET_MACHO +- case UNSPEC_MACHOPIC_OFFSET: +- putc ('-', file); +- machopic_output_function_base_name (file); +- break; +-#endif +- default: +- output_operand_lossage ("invalid UNSPEC as operand"); +- break; +- } +- break; ++ case I387_CEIL: ++ if (mode == I387_CW_CEIL) ++ return mode; ++ break; + + default: +- output_operand_lossage ("invalid expression as operand"); ++ gcc_unreachable (); + } ++ ++ return I387_CW_ANY; + } + +-/* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL. +- We need to emit DTP-relative relocations. */ ++/* Return mode that entity must be switched into ++ prior to the execution of insn. */ + +-static void ATTRIBUTE_UNUSED +-i386_output_dwarf_dtprel (FILE *file, int size, rtx x) ++static int ++ix86_mode_needed (int entity, rtx_insn *insn) + { +- fputs (ASM_LONG, file); +- output_addr_const (file, x); +- fputs ("@dtpoff", file); +- switch (size) ++ switch (entity) + { +- case 4: +- break; +- case 8: +- fputs (", 0", file); +- break; ++ case X86_DIRFLAG: ++ return ix86_dirflag_mode_needed (insn); ++ case AVX_U128: ++ return ix86_avx_u128_mode_needed (insn); ++ case I387_TRUNC: ++ case I387_FLOOR: ++ case I387_CEIL: ++ return ix86_i387_mode_needed (entity, insn); + default: + gcc_unreachable (); +- } ++ } ++ return 0; + } + +-/* Return true if X is a representation of the PIC register. This copes +- with calls from ix86_find_base_term, where the register might have +- been replaced by a cselib value. */ +- +-static bool +-ix86_pic_register_p (rtx x) +-{ +- if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x)) +- return (pic_offset_table_rtx +- && rtx_equal_for_cselib_p (x, pic_offset_table_rtx)); +- else if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SET_GOT) +- return true; +- else if (!REG_P (x)) +- return false; +- else if (pic_offset_table_rtx) ++/* Check if a 256bit or 512bit AVX register is referenced in stores. */ ++ ++static void ++ix86_check_avx_upper_stores (rtx dest, const_rtx, void *data) ++ { ++ if (ix86_check_avx_upper_register (dest)) + { +- if (REGNO (x) == REGNO (pic_offset_table_rtx)) +- return true; +- if (HARD_REGISTER_P (x) +- && !HARD_REGISTER_P (pic_offset_table_rtx) +- && ORIGINAL_REGNO (x) == REGNO (pic_offset_table_rtx)) +- return true; +- return false; ++ bool *used = (bool *) data; ++ *used = true; + } +- else +- return REGNO (x) == PIC_OFFSET_TABLE_REGNUM; +-} ++ } + +-/* Helper function for ix86_delegitimize_address. +- Attempt to delegitimize TLS local-exec accesses. */ ++/* Calculate mode of upper 128bit AVX registers after the insn. 
*/ + +-static rtx +-ix86_delegitimize_tls_address (rtx orig_x) ++static int ++ix86_avx_u128_mode_after (int mode, rtx_insn *insn) + { +- rtx x = orig_x, unspec; +- struct ix86_address addr; ++ rtx pat = PATTERN (insn); + +- if (!TARGET_TLS_DIRECT_SEG_REFS) +- return orig_x; +- if (MEM_P (x)) +- x = XEXP (x, 0); +- if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode) +- return orig_x; +- if (ix86_decompose_address (x, &addr) == 0 +- || addr.seg != DEFAULT_TLS_SEG_REG +- || addr.disp == NULL_RTX +- || GET_CODE (addr.disp) != CONST) +- return orig_x; +- unspec = XEXP (addr.disp, 0); +- if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1))) +- unspec = XEXP (unspec, 0); +- if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF) +- return orig_x; +- x = XVECEXP (unspec, 0, 0); +- gcc_assert (GET_CODE (x) == SYMBOL_REF); +- if (unspec != XEXP (addr.disp, 0)) +- x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1)); +- if (addr.index) ++ if (vzeroupper_pattern (pat, VOIDmode) ++ || vzeroall_pattern (pat, VOIDmode)) ++ return AVX_U128_CLEAN; ++ ++ /* We know that state is clean after CALL insn if there are no ++ 256bit or 512bit registers used in the function return register. */ ++ if (CALL_P (insn)) + { +- rtx idx = addr.index; +- if (addr.scale != 1) +- idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale)); +- x = gen_rtx_PLUS (Pmode, idx, x); ++ bool avx_upper_reg_found = false; ++ note_stores (insn, ix86_check_avx_upper_stores, &avx_upper_reg_found); ++ ++ return avx_upper_reg_found ? AVX_U128_DIRTY : AVX_U128_CLEAN; + } +- if (addr.base) +- x = gen_rtx_PLUS (Pmode, addr.base, x); +- if (MEM_P (orig_x)) +- x = replace_equiv_address_nv (orig_x, x); +- return x; +-} + +-/* In the name of slightly smaller debug output, and to cater to +- general assembler lossage, recognize PIC+GOTOFF and turn it back +- into a direct symbol reference. ++ /* Otherwise, return current mode. Remember that if insn ++ references AVX 256bit or 512bit registers, the mode was already ++ changed to DIRTY from MODE_NEEDED. */ ++ return mode; ++} + +- On Darwin, this is necessary to avoid a crash, because Darwin +- has a different PIC label for each routine but the DWARF debugging +- information is not associated with any particular routine, so it's +- necessary to remove references to the PIC label from RTL stored by +- the DWARF output code. ++/* Return the mode that an insn results in. */ + +- This helper is used in the normal ix86_delegitimize_address +- entrypoint (e.g. used in the target delegitimization hook) and +- in ix86_find_base_term. As compile time memory optimization, we +- avoid allocating rtxes that will not change anything on the outcome +- of the callers (find_base_value and find_base_term). */ ++static int ++ix86_mode_after (int entity, int mode, rtx_insn *insn) ++{ ++ switch (entity) ++ { ++ case X86_DIRFLAG: ++ return mode; ++ case AVX_U128: ++ return ix86_avx_u128_mode_after (mode, insn); ++ case I387_TRUNC: ++ case I387_FLOOR: ++ case I387_CEIL: ++ return mode; ++ default: ++ gcc_unreachable (); ++ } ++} + +-static inline rtx +-ix86_delegitimize_address_1 (rtx x, bool base_term_p) ++static int ++ix86_dirflag_mode_entry (void) + { +- rtx orig_x = delegitimize_mem_from_attrs (x); +- /* addend is NULL or some rtx if x is something+GOTOFF where +- something doesn't include the PIC register. */ +- rtx addend = NULL_RTX; +- /* reg_addend is NULL or a multiple of some register. */ +- rtx reg_addend = NULL_RTX; +- /* const_addend is NULL or a const_int. 
*/ +- rtx const_addend = NULL_RTX; +- /* This is the result, or NULL. */ +- rtx result = NULL_RTX; ++ /* For TARGET_CLD or in the interrupt handler we can't assume ++ direction flag state at function entry. */ ++ if (TARGET_CLD ++ || cfun->machine->func_type != TYPE_NORMAL) ++ return X86_DIRFLAG_ANY; + +- x = orig_x; ++ return X86_DIRFLAG_RESET; ++} + +- if (MEM_P (x)) +- x = XEXP (x, 0); ++static int ++ix86_avx_u128_mode_entry (void) ++{ ++ tree arg; + +- if (TARGET_64BIT) ++ /* Entry mode is set to AVX_U128_DIRTY if there are ++ 256bit or 512bit modes used in function arguments. */ ++ for (arg = DECL_ARGUMENTS (current_function_decl); arg; ++ arg = TREE_CHAIN (arg)) + { +- if (GET_CODE (x) == CONST +- && GET_CODE (XEXP (x, 0)) == PLUS +- && GET_MODE (XEXP (x, 0)) == Pmode +- && CONST_INT_P (XEXP (XEXP (x, 0), 1)) +- && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC +- && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL) +- { +- /* find_base_{value,term} only care about MEMs with arg_pointer_rtx +- base. A CONST can't be arg_pointer_rtx based. */ +- if (base_term_p && MEM_P (orig_x)) +- return orig_x; +- rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0); +- x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2); +- if (MEM_P (orig_x)) +- x = replace_equiv_address_nv (orig_x, x); +- return x; +- } +- +- if (GET_CODE (x) == CONST +- && GET_CODE (XEXP (x, 0)) == UNSPEC +- && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL +- || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL) +- && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)) +- { +- x = XVECEXP (XEXP (x, 0), 0, 0); +- if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x)) +- { +- x = lowpart_subreg (GET_MODE (orig_x), x, GET_MODE (x)); +- if (x == NULL_RTX) +- return orig_x; +- } +- return x; +- } +- +- if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC) +- return ix86_delegitimize_tls_address (orig_x); ++ rtx incoming = DECL_INCOMING_RTL (arg); + +- /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic +- and -mcmodel=medium -fpic. */ ++ if (incoming && ix86_check_avx_upper_register (incoming)) ++ return AVX_U128_DIRTY; + } + +- if (GET_CODE (x) != PLUS +- || GET_CODE (XEXP (x, 1)) != CONST) +- return ix86_delegitimize_tls_address (orig_x); ++ return AVX_U128_CLEAN; ++} + +- if (ix86_pic_register_p (XEXP (x, 0))) +- /* %ebx + GOT/GOTOFF */ +- ; +- else if (GET_CODE (XEXP (x, 0)) == PLUS) +- { +- /* %ebx + %reg * scale + GOT/GOTOFF */ +- reg_addend = XEXP (x, 0); +- if (ix86_pic_register_p (XEXP (reg_addend, 0))) +- reg_addend = XEXP (reg_addend, 1); +- else if (ix86_pic_register_p (XEXP (reg_addend, 1))) +- reg_addend = XEXP (reg_addend, 0); +- else +- { +- reg_addend = NULL_RTX; +- addend = XEXP (x, 0); +- } +- } +- else +- addend = XEXP (x, 0); ++/* Return a mode that ENTITY is assumed to be ++ switched to at function entry. 
*/ + +- x = XEXP (XEXP (x, 1), 0); +- if (GET_CODE (x) == PLUS +- && CONST_INT_P (XEXP (x, 1))) ++static int ++ix86_mode_entry (int entity) ++{ ++ switch (entity) + { +- const_addend = XEXP (x, 1); +- x = XEXP (x, 0); ++ case X86_DIRFLAG: ++ return ix86_dirflag_mode_entry (); ++ case AVX_U128: ++ return ix86_avx_u128_mode_entry (); ++ case I387_TRUNC: ++ case I387_FLOOR: ++ case I387_CEIL: ++ return I387_CW_ANY; ++ default: ++ gcc_unreachable (); + } ++} + +- if (GET_CODE (x) == UNSPEC +- && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend) +- || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x)) +- || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC +- && !MEM_P (orig_x) && !addend))) +- result = XVECEXP (x, 0, 0); ++static int ++ix86_avx_u128_mode_exit (void) ++{ ++ rtx reg = crtl->return_rtx; + +- if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x) +- && !MEM_P (orig_x)) +- result = XVECEXP (x, 0, 0); ++ /* Exit mode is set to AVX_U128_DIRTY if there are 256bit ++ or 512 bit modes used in the function return register. */ ++ if (reg && ix86_check_avx_upper_register (reg)) ++ return AVX_U128_DIRTY; + +- if (! result) +- return ix86_delegitimize_tls_address (orig_x); ++ /* Exit mode is set to AVX_U128_DIRTY if there are 256bit or 512bit ++ modes used in function arguments, otherwise return AVX_U128_CLEAN. ++ */ ++ return ix86_avx_u128_mode_entry (); ++} + +- /* For (PLUS something CONST_INT) both find_base_{value,term} just +- recurse on the first operand. */ +- if (const_addend && !base_term_p) +- result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend)); +- if (reg_addend) +- result = gen_rtx_PLUS (Pmode, reg_addend, result); +- if (addend) +- { +- /* If the rest of original X doesn't involve the PIC register, add +- addend and subtract pic_offset_table_rtx. This can happen e.g. +- for code like: +- leal (%ebx, %ecx, 4), %ecx +- ... +- movl foo@GOTOFF(%ecx), %edx +- in which case we return (%ecx - %ebx) + foo +- or (%ecx - _GLOBAL_OFFSET_TABLE_) + foo if pseudo_pic_reg +- and reload has completed. Don't do the latter for debug, +- as _GLOBAL_OFFSET_TABLE_ can't be expressed in the assembly. */ +- if (pic_offset_table_rtx +- && (!reload_completed || !ix86_use_pseudo_pic_reg ())) +- result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend), +- pic_offset_table_rtx), +- result); +- else if (base_term_p +- && pic_offset_table_rtx +- && !TARGET_MACHO +- && !TARGET_VXWORKS_RTP) +- { +- rtx tmp = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME); +- tmp = gen_rtx_MINUS (Pmode, copy_rtx (addend), tmp); +- result = gen_rtx_PLUS (Pmode, tmp, result); +- } +- else +- return orig_x; +- } +- if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x)) ++/* Return a mode that ENTITY is assumed to be ++ switched to at function exit. */ ++ ++static int ++ix86_mode_exit (int entity) ++{ ++ switch (entity) + { +- result = lowpart_subreg (GET_MODE (orig_x), result, Pmode); +- if (result == NULL_RTX) +- return orig_x; ++ case X86_DIRFLAG: ++ return X86_DIRFLAG_ANY; ++ case AVX_U128: ++ return ix86_avx_u128_mode_exit (); ++ case I387_TRUNC: ++ case I387_FLOOR: ++ case I387_CEIL: ++ return I387_CW_ANY; ++ default: ++ gcc_unreachable (); + } +- return result; + } + +-/* The normal instantiation of the above template. */ +- +-static rtx +-ix86_delegitimize_address (rtx x) ++static int ++ix86_mode_priority (int, int n) + { +- return ix86_delegitimize_address_1 (x, false); ++ return n; + } + +-/* If X is a machine specific address (i.e. 
a symbol or label being +- referenced as a displacement from the GOT implemented using an +- UNSPEC), then return the base term. Otherwise return X. */ ++/* Output code to initialize control word copies used by trunc?f?i and ++ rounding patterns. CURRENT_MODE is set to current control word, ++ while NEW_MODE is set to new control word. */ + +-rtx +-ix86_find_base_term (rtx x) ++static void ++emit_i387_cw_initialization (int mode) + { +- rtx term; ++ rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED); ++ rtx new_mode; + +- if (TARGET_64BIT) +- { +- if (GET_CODE (x) != CONST) +- return x; +- term = XEXP (x, 0); +- if (GET_CODE (term) == PLUS +- && CONST_INT_P (XEXP (term, 1))) +- term = XEXP (term, 0); +- if (GET_CODE (term) != UNSPEC +- || (XINT (term, 1) != UNSPEC_GOTPCREL +- && XINT (term, 1) != UNSPEC_PCREL)) +- return x; ++ enum ix86_stack_slot slot; + +- return XVECEXP (term, 0, 0); +- } ++ rtx reg = gen_reg_rtx (HImode); + +- return ix86_delegitimize_address_1 (x, true); +-} +- +-/* Return true if X shouldn't be emitted into the debug info. +- Disallow UNSPECs other than @gotoff - we can't emit _GLOBAL_OFFSET_TABLE_ +- symbol easily into the .debug_info section, so we need not to +- delegitimize, but instead assemble as @gotoff. +- Disallow _GLOBAL_OFFSET_TABLE_ SYMBOL_REF - the assembler magically +- assembles that as _GLOBAL_OFFSET_TABLE_-. expression. */ +- +-static bool +-ix86_const_not_ok_for_debug_p (rtx x) +-{ +- if (GET_CODE (x) == UNSPEC && XINT (x, 1) != UNSPEC_GOTOFF) +- return true; +- +- if (SYMBOL_REF_P (x) && strcmp (XSTR (x, 0), GOT_SYMBOL_NAME) == 0) +- return true; +- +- return false; +-} +- +-static void +-put_condition_code (enum rtx_code code, machine_mode mode, bool reverse, +- bool fp, FILE *file) +-{ +- const char *suffix; +- +- if (mode == CCFPmode) +- { +- code = ix86_fp_compare_code_to_integer (code); +- mode = CCmode; +- } +- if (reverse) +- code = reverse_condition (code); ++ emit_insn (gen_x86_fnstcw_1 (stored_mode)); ++ emit_move_insn (reg, copy_rtx (stored_mode)); + +- switch (code) ++ switch (mode) + { +- case EQ: +- gcc_assert (mode != CCGZmode); +- switch (mode) +- { +- case E_CCAmode: +- suffix = "a"; +- break; +- case E_CCCmode: +- suffix = "c"; +- break; +- case E_CCOmode: +- suffix = "o"; +- break; +- case E_CCPmode: +- suffix = "p"; +- break; +- case E_CCSmode: +- suffix = "s"; +- break; +- default: +- suffix = "e"; +- break; +- } +- break; +- case NE: +- gcc_assert (mode != CCGZmode); +- switch (mode) +- { +- case E_CCAmode: +- suffix = "na"; +- break; +- case E_CCCmode: +- suffix = "nc"; +- break; +- case E_CCOmode: +- suffix = "no"; +- break; +- case E_CCPmode: +- suffix = "np"; +- break; +- case E_CCSmode: +- suffix = "ns"; +- break; +- default: +- suffix = "ne"; +- break; +- } ++ case I387_CW_TRUNC: ++ /* round toward zero (truncate) */ ++ emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00))); ++ slot = SLOT_CW_TRUNC; + break; +- case GT: +- gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode); +- suffix = "g"; ++ ++ case I387_CW_FLOOR: ++ /* round down toward -oo */ ++ emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00))); ++ emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400))); ++ slot = SLOT_CW_FLOOR; + break; +- case GTU: +- /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers. +- Those same assemblers have the same but opposite lossage on cmov. */ +- if (mode == CCmode) +- suffix = fp ? 
"nbe" : "a"; +- else +- gcc_unreachable (); ++ ++ case I387_CW_CEIL: ++ /* round up toward +oo */ ++ emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00))); ++ emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800))); ++ slot = SLOT_CW_CEIL; + break; +- case LT: +- switch (mode) +- { +- case E_CCNOmode: +- case E_CCGOCmode: +- suffix = "s"; +- break; + +- case E_CCmode: +- case E_CCGCmode: +- case E_CCGZmode: +- suffix = "l"; +- break; ++ default: ++ gcc_unreachable (); ++ } + +- default: +- gcc_unreachable (); +- } +- break; +- case LTU: +- if (mode == CCmode || mode == CCGZmode) +- suffix = "b"; +- else if (mode == CCCmode) +- suffix = fp ? "b" : "c"; +- else +- gcc_unreachable (); +- break; +- case GE: +- switch (mode) +- { +- case E_CCNOmode: +- case E_CCGOCmode: +- suffix = "ns"; +- break; ++ gcc_assert (slot < MAX_386_STACK_LOCALS); + +- case E_CCmode: +- case E_CCGCmode: +- case E_CCGZmode: +- suffix = "ge"; +- break; ++ new_mode = assign_386_stack_local (HImode, slot); ++ emit_move_insn (new_mode, reg); ++} + +- default: +- gcc_unreachable (); +- } +- break; +- case GEU: +- if (mode == CCmode || mode == CCGZmode) +- suffix = "nb"; +- else if (mode == CCCmode) +- suffix = fp ? "nb" : "nc"; +- else +- gcc_unreachable (); +- break; +- case LE: +- gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode); +- suffix = "le"; +- break; +- case LEU: +- if (mode == CCmode) +- suffix = "be"; +- else +- gcc_unreachable (); ++/* Generate one or more insns to set ENTITY to MODE. */ ++ ++static void ++ix86_emit_mode_set (int entity, int mode, int prev_mode ATTRIBUTE_UNUSED, ++ HARD_REG_SET regs_live ATTRIBUTE_UNUSED) ++{ ++ switch (entity) ++ { ++ case X86_DIRFLAG: ++ if (mode == X86_DIRFLAG_RESET) ++ emit_insn (gen_cld ()); + break; +- case UNORDERED: +- suffix = fp ? "u" : "p"; ++ case AVX_U128: ++ if (mode == AVX_U128_CLEAN) ++ emit_insn (gen_avx_vzeroupper ()); + break; +- case ORDERED: +- suffix = fp ? "nu" : "np"; ++ case I387_TRUNC: ++ case I387_FLOOR: ++ case I387_CEIL: ++ if (mode != I387_CW_ANY ++ && mode != I387_CW_UNINITIALIZED) ++ emit_i387_cw_initialization (mode); + break; + default: + gcc_unreachable (); + } +- fputs (suffix, file); + } + +-/* Print the name of register X to FILE based on its machine mode and number. +- If CODE is 'w', pretend the mode is HImode. +- If CODE is 'b', pretend the mode is QImode. +- If CODE is 'k', pretend the mode is SImode. +- If CODE is 'q', pretend the mode is DImode. +- If CODE is 'x', pretend the mode is V4SFmode. +- If CODE is 't', pretend the mode is V8SFmode. +- If CODE is 'g', pretend the mode is V16SFmode. +- If CODE is 'h', pretend the reg is the 'high' byte register. +- If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op. +- If CODE is 'd', duplicate the operand for AVX instruction. +- If CODE is 'V', print naked full integer register name without %. +- */ ++/* Output code for INSN to convert a float to a signed int. OPERANDS ++ are the insn operands. The output may be [HSD]Imode and the input ++ operand may be [SDX]Fmode. 
*/ + +-void +-print_reg (rtx x, int code, FILE *file) ++const char * ++output_fix_trunc (rtx_insn *insn, rtx *operands, bool fisttp) + { +- const char *reg; +- int msize; +- unsigned int regno; +- bool duplicated; ++ bool stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG); ++ bool dimode_p = GET_MODE (operands[0]) == DImode; ++ int round_mode = get_attr_i387_cw (insn); + +- if (ASSEMBLER_DIALECT == ASM_ATT && code != 'V') +- putc ('%', file); ++ static char buf[40]; ++ const char *p; + +- if (x == pc_rtx) +- { +- gcc_assert (TARGET_64BIT); +- fputs ("rip", file); +- return; +- } ++ /* Jump through a hoop or two for DImode, since the hardware has no ++ non-popping instruction. We used to do this a different way, but ++ that was somewhat fragile and broke with post-reload splitters. */ ++ if ((dimode_p || fisttp) && !stack_top_dies) ++ output_asm_insn ("fld\t%y1", operands); + +- if (code == 'y' && STACK_TOP_P (x)) +- { +- fputs ("st(0)", file); +- return; +- } ++ gcc_assert (STACK_TOP_P (operands[1])); ++ gcc_assert (MEM_P (operands[0])); ++ gcc_assert (GET_MODE (operands[1]) != TFmode); + +- if (code == 'w') +- msize = 2; +- else if (code == 'b') +- msize = 1; +- else if (code == 'k') +- msize = 4; +- else if (code == 'q') +- msize = 8; +- else if (code == 'h') +- msize = 0; +- else if (code == 'x') +- msize = 16; +- else if (code == 't') +- msize = 32; +- else if (code == 'g') +- msize = 64; +- else +- msize = GET_MODE_SIZE (GET_MODE (x)); +- +- regno = REGNO (x); +- +- if (regno == ARG_POINTER_REGNUM +- || regno == FRAME_POINTER_REGNUM +- || regno == FPSR_REG) +- { +- output_operand_lossage +- ("invalid use of register '%s'", reg_names[regno]); +- return; +- } +- else if (regno == FLAGS_REG) +- { +- output_operand_lossage ("invalid use of asm flag output"); +- return; +- } ++ if (fisttp) ++ return "fisttp%Z0\t%0"; + +- if (code == 'V') +- { +- if (GENERAL_REGNO_P (regno)) +- msize = GET_MODE_SIZE (word_mode); +- else +- error ("% modifier on non-integer register"); +- } ++ strcpy (buf, "fist"); + +- duplicated = code == 'd' && TARGET_AVX; ++ if (round_mode != I387_CW_ANY) ++ output_asm_insn ("fldcw\t%3", operands); + +- switch (msize) +- { +- case 16: +- case 12: +- case 8: +- if (GENERAL_REGNO_P (regno) && msize > GET_MODE_SIZE (word_mode)) +- warning (0, "unsupported size for integer register"); +- /* FALLTHRU */ +- case 4: +- if (LEGACY_INT_REGNO_P (regno)) +- putc (msize > 4 && TARGET_64BIT ? 'r' : 'e', file); +- /* FALLTHRU */ +- case 2: +- normal: +- reg = hi_reg_name[regno]; +- break; +- case 1: +- if (regno >= ARRAY_SIZE (qi_reg_name)) +- goto normal; +- if (!ANY_QI_REGNO_P (regno)) +- error ("unsupported size for integer register"); +- reg = qi_reg_name[regno]; +- break; +- case 0: +- if (regno >= ARRAY_SIZE (qi_high_reg_name)) +- goto normal; +- reg = qi_high_reg_name[regno]; +- break; +- case 32: +- case 64: +- if (SSE_REGNO_P (regno)) +- { +- gcc_assert (!duplicated); +- putc (msize == 32 ? 
'y' : 'z', file); +- reg = hi_reg_name[regno] + 1; +- break; +- } +- goto normal; +- default: +- gcc_unreachable (); +- } ++ p = "p%Z0\t%0"; ++ strcat (buf, p + !(stack_top_dies || dimode_p)); + +- fputs (reg, file); ++ output_asm_insn (buf, operands); + +- /* Irritatingly, AMD extended registers use +- different naming convention: "r%d[bwd]" */ +- if (REX_INT_REGNO_P (regno)) +- { +- gcc_assert (TARGET_64BIT); +- switch (msize) +- { +- case 0: +- error ("extended registers have no high halves"); +- break; +- case 1: +- putc ('b', file); +- break; +- case 2: +- putc ('w', file); +- break; +- case 4: +- putc ('d', file); +- break; +- case 8: +- /* no suffix */ +- break; +- default: +- error ("unsupported operand size for extended register"); +- break; +- } +- return; +- } ++ if (round_mode != I387_CW_ANY) ++ output_asm_insn ("fldcw\t%2", operands); + +- if (duplicated) +- { +- if (ASSEMBLER_DIALECT == ASM_ATT) +- fprintf (file, ", %%%s", reg); +- else +- fprintf (file, ", %s", reg); +- } ++ return ""; + } + +-/* Meaning of CODE: +- L,W,B,Q,S,T -- print the opcode suffix for specified size of operand. +- C -- print opcode suffix for set/cmov insn. +- c -- like C, but print reversed condition +- F,f -- likewise, but for floating-point. +- O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.", +- otherwise nothing +- R -- print embedded rounding and sae. +- r -- print only sae. +- z -- print the opcode suffix for the size of the current operand. +- Z -- likewise, with special suffixes for x87 instructions. +- * -- print a star (in certain assembler syntax) +- A -- print an absolute memory reference. +- E -- print address with DImode register names if TARGET_64BIT. +- w -- print the operand as if it's a "word" (HImode) even if it isn't. +- s -- print a shift double count, followed by the assemblers argument +- delimiter. +- b -- print the QImode name of the register for the indicated operand. +- %b0 would print %al if operands[0] is reg 0. +- w -- likewise, print the HImode name of the register. +- k -- likewise, print the SImode name of the register. +- q -- likewise, print the DImode name of the register. +- x -- likewise, print the V4SFmode name of the register. +- t -- likewise, print the V8SFmode name of the register. +- g -- likewise, print the V16SFmode name of the register. +- h -- print the QImode name for a "high" register, either ah, bh, ch or dh. +- y -- print "st(0)" instead of "st" as a register. +- d -- print duplicated register operand for AVX instruction. +- D -- print condition for SSE cmp instruction. +- P -- if PIC, print an @PLT suffix. +- p -- print raw symbol name. +- X -- don't print any sort of PIC '@' suffix for a symbol. +- & -- print some in-use local-dynamic symbol name. +- H -- print a memory address offset by 8; used for sse high-parts +- Y -- print condition for XOP pcom* instruction. +- V -- print naked full integer register name without %. +- + -- print a branch hint as 'cs' or 'ds' prefix +- ; -- print a semicolon (after prefixes due to bug in older gas). +- ~ -- print "i" if TARGET_AVX2, "f" otherwise. +- ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode +- M -- print addr32 prefix for TARGET_X32 with VSIB address. +- ! -- print NOTRACK prefix for jxx/call/ret instructions if required. +- */ ++/* Output code for x87 ffreep insn. The OPNO argument, which may only ++ have the values zero or one, indicates the ffreep insn's operand ++ from the OPERANDS array. 
*/ + +-void +-ix86_print_operand (FILE *file, rtx x, int code) ++static const char * ++output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno) + { +- if (code) ++ if (TARGET_USE_FFREEP) ++#ifdef HAVE_AS_IX86_FFREEP ++ return opno ? "ffreep\t%y1" : "ffreep\t%y0"; ++#else + { +- switch (code) +- { +- case 'A': +- switch (ASSEMBLER_DIALECT) +- { +- case ASM_ATT: +- putc ('*', file); +- break; +- +- case ASM_INTEL: +- /* Intel syntax. For absolute addresses, registers should not +- be surrounded by braces. */ +- if (!REG_P (x)) +- { +- putc ('[', file); +- ix86_print_operand (file, x, 0); +- putc (']', file); +- return; +- } +- break; ++ static char retval[32]; ++ int regno = REGNO (operands[opno]); + +- default: +- gcc_unreachable (); +- } ++ gcc_assert (STACK_REGNO_P (regno)); + +- ix86_print_operand (file, x, 0); +- return; ++ regno -= FIRST_STACK_REG; + +- case 'E': +- /* Wrap address in an UNSPEC to declare special handling. */ +- if (TARGET_64BIT) +- x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR); ++ snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno); ++ return retval; ++ } ++#endif + +- output_address (VOIDmode, x); +- return; ++ return opno ? "fstp\t%y1" : "fstp\t%y0"; ++} + +- case 'L': +- if (ASSEMBLER_DIALECT == ASM_ATT) +- putc ('l', file); +- return; + +- case 'W': +- if (ASSEMBLER_DIALECT == ASM_ATT) +- putc ('w', file); +- return; ++/* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi ++ should be used. UNORDERED_P is true when fucom should be used. */ + +- case 'B': +- if (ASSEMBLER_DIALECT == ASM_ATT) +- putc ('b', file); +- return; ++const char * ++output_fp_compare (rtx_insn *insn, rtx *operands, ++ bool eflags_p, bool unordered_p) ++{ ++ rtx *xops = eflags_p ? &operands[0] : &operands[1]; ++ bool stack_top_dies; + +- case 'Q': +- if (ASSEMBLER_DIALECT == ASM_ATT) +- putc ('l', file); +- return; ++ static char buf[40]; ++ const char *p; + +- case 'S': +- if (ASSEMBLER_DIALECT == ASM_ATT) +- putc ('s', file); +- return; ++ gcc_assert (STACK_TOP_P (xops[0])); + +- case 'T': +- if (ASSEMBLER_DIALECT == ASM_ATT) +- putc ('t', file); +- return; ++ stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG); + +- case 'O': +-#ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX +- if (ASSEMBLER_DIALECT != ASM_ATT) +- return; +- +- switch (GET_MODE_SIZE (GET_MODE (x))) +- { +- case 2: +- putc ('w', file); +- break; +- +- case 4: +- putc ('l', file); +- break; ++ if (eflags_p) ++ { ++ p = unordered_p ? "fucomi" : "fcomi"; ++ strcpy (buf, p); + +- case 8: +- putc ('q', file); +- break; ++ p = "p\t{%y1, %0|%0, %y1}"; ++ strcat (buf, p + !stack_top_dies); + +- default: +- output_operand_lossage ("invalid operand size for operand " +- "code 'O'"); +- return; +- } ++ return buf; ++ } + +- putc ('.', file); +-#endif +- return; ++ if (STACK_REG_P (xops[1]) ++ && stack_top_dies ++ && find_regno_note (insn, REG_DEAD, FIRST_STACK_REG + 1)) ++ { ++ gcc_assert (REGNO (xops[1]) == FIRST_STACK_REG + 1); + +- case 'z': +- if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT) +- { +- /* Opcodes don't get size suffixes if using Intel opcodes. */ +- if (ASSEMBLER_DIALECT == ASM_INTEL) +- return; ++ /* If both the top of the 387 stack die, and the other operand ++ is also a stack register that dies, then this must be a ++ `fcompp' float compare. */ ++ p = unordered_p ? 
"fucompp" : "fcompp"; ++ strcpy (buf, p); ++ } ++ else if (const0_operand (xops[1], VOIDmode)) ++ { ++ gcc_assert (!unordered_p); ++ strcpy (buf, "ftst"); ++ } ++ else ++ { ++ if (GET_MODE_CLASS (GET_MODE (xops[1])) == MODE_INT) ++ { ++ gcc_assert (!unordered_p); ++ p = "ficom"; ++ } ++ else ++ p = unordered_p ? "fucom" : "fcom"; + +- switch (GET_MODE_SIZE (GET_MODE (x))) +- { +- case 1: +- putc ('b', file); +- return; ++ strcpy (buf, p); + +- case 2: +- putc ('w', file); +- return; ++ p = "p%Z2\t%y2"; ++ strcat (buf, p + !stack_top_dies); ++ } + +- case 4: +- putc ('l', file); +- return; ++ output_asm_insn (buf, operands); ++ return "fnstsw\t%0"; ++} + +- case 8: +- putc ('q', file); +- return; ++void ++ix86_output_addr_vec_elt (FILE *file, int value) ++{ ++ const char *directive = ASM_LONG; + +- default: +- output_operand_lossage ("invalid operand size for operand " +- "code 'z'"); +- return; +- } +- } ++#ifdef ASM_QUAD ++ if (TARGET_LP64) ++ directive = ASM_QUAD; ++#else ++ gcc_assert (!TARGET_64BIT); ++#endif + +- if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT) +- warning (0, "non-integer operand used with operand code %"); +- /* FALLTHRU */ ++ fprintf (file, "%s%s%d\n", directive, LPREFIX, value); ++} + +- case 'Z': +- /* 387 opcodes don't get size suffixes if using Intel opcodes. */ +- if (ASSEMBLER_DIALECT == ASM_INTEL) +- return; ++void ++ix86_output_addr_diff_elt (FILE *file, int value, int rel) ++{ ++ const char *directive = ASM_LONG; + +- if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT) +- { +- switch (GET_MODE_SIZE (GET_MODE (x))) +- { +- case 2: +-#ifdef HAVE_AS_IX86_FILDS +- putc ('s', file); ++#ifdef ASM_QUAD ++ if (TARGET_64BIT && CASE_VECTOR_MODE == DImode) ++ directive = ASM_QUAD; ++#else ++ gcc_assert (!TARGET_64BIT); + #endif +- return; ++ /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */ ++ if (TARGET_64BIT || TARGET_VXWORKS_RTP) ++ fprintf (file, "%s%s%d-%s%d\n", ++ directive, LPREFIX, value, LPREFIX, rel); ++#if TARGET_MACHO ++ else if (TARGET_MACHO) ++ { ++ fprintf (file, ASM_LONG "%s%d-", LPREFIX, value); ++ machopic_output_function_base_name (file); ++ putc ('\n', file); ++ } ++#endif ++ else if (HAVE_AS_GOTOFF_IN_DATA) ++ fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value); ++ else ++ asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n", ++ GOT_SYMBOL_NAME, LPREFIX, value); ++} ++ ++#define LEA_MAX_STALL (3) ++#define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1) + +- case 4: +- putc ('l', file); +- return; ++/* Increase given DISTANCE in half-cycles according to ++ dependencies between PREV and NEXT instructions. ++ Add 1 half-cycle if there is no dependency and ++ go to next cycle if there is some dependecy. */ + +- case 8: +-#ifdef HAVE_AS_IX86_FILDQ +- putc ('q', file); +-#else +- fputs ("ll", file); +-#endif +- return; ++static unsigned int ++increase_distance (rtx_insn *prev, rtx_insn *next, unsigned int distance) ++{ ++ df_ref def, use; + +- default: +- break; +- } +- } +- else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT) +- { +- /* 387 opcodes don't get size suffixes +- if the operands are registers. 
*/ +- if (STACK_REG_P (x)) +- return; ++ if (!prev || !next) ++ return distance + (distance & 1) + 2; + +- switch (GET_MODE_SIZE (GET_MODE (x))) +- { +- case 4: +- putc ('s', file); +- return; ++ if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev)) ++ return distance + 1; + +- case 8: +- putc ('l', file); +- return; ++ FOR_EACH_INSN_USE (use, next) ++ FOR_EACH_INSN_DEF (def, prev) ++ if (!DF_REF_IS_ARTIFICIAL (def) ++ && DF_REF_REGNO (use) == DF_REF_REGNO (def)) ++ return distance + (distance & 1) + 2; + +- case 12: +- case 16: +- putc ('t', file); +- return; ++ return distance + 1; ++} + +- default: +- break; +- } +- } +- else +- { +- output_operand_lossage ("invalid operand type used with " +- "operand code 'Z'"); +- return; +- } ++/* Function checks if instruction INSN defines register number ++ REGNO1 or REGNO2. */ + +- output_operand_lossage ("invalid operand size for operand code 'Z'"); +- return; ++bool ++insn_defines_reg (unsigned int regno1, unsigned int regno2, ++ rtx_insn *insn) ++{ ++ df_ref def; + +- case 'd': +- case 'b': +- case 'w': +- case 'k': +- case 'q': +- case 'h': +- case 't': +- case 'g': +- case 'y': +- case 'x': +- case 'X': +- case 'P': +- case 'p': +- case 'V': +- break; ++ FOR_EACH_INSN_DEF (def, insn) ++ if (DF_REF_REG_DEF_P (def) ++ && !DF_REF_IS_ARTIFICIAL (def) ++ && (regno1 == DF_REF_REGNO (def) ++ || regno2 == DF_REF_REGNO (def))) ++ return true; + +- case 's': +- if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT) +- { +- ix86_print_operand (file, x, 0); +- fputs (", ", file); +- } +- return; ++ return false; ++} + +- case 'Y': +- switch (GET_CODE (x)) +- { +- case NE: +- fputs ("neq", file); +- break; +- case EQ: +- fputs ("eq", file); +- break; +- case GE: +- case GEU: +- fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file); +- break; +- case GT: +- case GTU: +- fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file); +- break; +- case LE: +- case LEU: +- fputs ("le", file); +- break; +- case LT: +- case LTU: +- fputs ("lt", file); +- break; +- case UNORDERED: +- fputs ("unord", file); +- break; +- case ORDERED: +- fputs ("ord", file); +- break; +- case UNEQ: +- fputs ("ueq", file); +- break; +- case UNGE: +- fputs ("nlt", file); +- break; +- case UNGT: +- fputs ("nle", file); +- break; +- case UNLE: +- fputs ("ule", file); +- break; +- case UNLT: +- fputs ("ult", file); +- break; +- case LTGT: +- fputs ("une", file); +- break; +- default: +- output_operand_lossage ("operand is not a condition code, " +- "invalid operand code 'Y'"); +- return; +- } +- return; ++/* Function checks if instruction INSN uses register number ++ REGNO as a part of address expression. */ + +- case 'D': +- /* Little bit of braindamage here. The SSE compare instructions +- does use completely different names for the comparisons that the +- fp conditional moves. 
*/ +- switch (GET_CODE (x)) +- { +- case UNEQ: +- if (TARGET_AVX) +- { +- fputs ("eq_us", file); +- break; +- } +- /* FALLTHRU */ +- case EQ: +- fputs ("eq", file); +- break; +- case UNLT: +- if (TARGET_AVX) +- { +- fputs ("nge", file); +- break; +- } +- /* FALLTHRU */ +- case LT: +- fputs ("lt", file); +- break; +- case UNLE: +- if (TARGET_AVX) +- { +- fputs ("ngt", file); +- break; +- } +- /* FALLTHRU */ +- case LE: +- fputs ("le", file); +- break; +- case UNORDERED: +- fputs ("unord", file); +- break; +- case LTGT: +- if (TARGET_AVX) +- { +- fputs ("neq_oq", file); +- break; +- } +- /* FALLTHRU */ +- case NE: +- fputs ("neq", file); +- break; +- case GE: +- if (TARGET_AVX) +- { +- fputs ("ge", file); +- break; +- } +- /* FALLTHRU */ +- case UNGE: +- fputs ("nlt", file); +- break; +- case GT: +- if (TARGET_AVX) +- { +- fputs ("gt", file); +- break; +- } +- /* FALLTHRU */ +- case UNGT: +- fputs ("nle", file); +- break; +- case ORDERED: +- fputs ("ord", file); +- break; +- default: +- output_operand_lossage ("operand is not a condition code, " +- "invalid operand code 'D'"); +- return; +- } +- return; ++static bool ++insn_uses_reg_mem (unsigned int regno, rtx insn) ++{ ++ df_ref use; + +- case 'F': +- case 'f': +-#ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX +- if (ASSEMBLER_DIALECT == ASM_ATT) +- putc ('.', file); +- gcc_fallthrough (); +-#endif ++ FOR_EACH_INSN_USE (use, insn) ++ if (DF_REF_REG_MEM_P (use) && regno == DF_REF_REGNO (use)) ++ return true; + +- case 'C': +- case 'c': +- if (!COMPARISON_P (x)) +- { +- output_operand_lossage ("operand is not a condition code, " +- "invalid operand code '%c'", code); +- return; +- } +- put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), +- code == 'c' || code == 'f', +- code == 'F' || code == 'f', +- file); +- return; ++ return false; ++} + +- case 'H': +- if (!offsettable_memref_p (x)) +- { +- output_operand_lossage ("operand is not an offsettable memory " +- "reference, invalid operand code 'H'"); +- return; +- } +- /* It doesn't actually matter what mode we use here, as we're +- only going to use this for printing. */ +- x = adjust_address_nv (x, DImode, 8); +- /* Output 'qword ptr' for intel assembler dialect. */ +- if (ASSEMBLER_DIALECT == ASM_INTEL) +- code = 'q'; +- break; ++/* Search backward for non-agu definition of register number REGNO1 ++ or register number REGNO2 in basic block starting from instruction ++ START up to head of basic block or instruction INSN. + +- case 'K': +- if (!CONST_INT_P (x)) +- { +- output_operand_lossage ("operand is not an integer, invalid " +- "operand code 'K'"); +- return; +- } ++ Function puts true value into *FOUND var if definition was found ++ and false otherwise. + +- if (INTVAL (x) & IX86_HLE_ACQUIRE) +-#ifdef HAVE_AS_IX86_HLE +- fputs ("xacquire ", file); +-#else +- fputs ("\n" ASM_BYTE "0xf2\n\t", file); +-#endif +- else if (INTVAL (x) & IX86_HLE_RELEASE) +-#ifdef HAVE_AS_IX86_HLE +- fputs ("xrelease ", file); +-#else +- fputs ("\n" ASM_BYTE "0xf3\n\t", file); +-#endif +- /* We do not want to print value of the operand. */ +- return; ++ Distance in half-cycles between START and found instruction or head ++ of BB is added to DISTANCE and returned. */ + +- case 'N': +- if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x))) +- fputs ("{z}", file); +- return; ++static int ++distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2, ++ rtx_insn *insn, int distance, ++ rtx_insn *start, bool *found) ++{ ++ basic_block bb = start ? 
BLOCK_FOR_INSN (start) : NULL; ++ rtx_insn *prev = start; ++ rtx_insn *next = NULL; + +- case 'r': +- if (!CONST_INT_P (x) || INTVAL (x) != ROUND_SAE) ++ *found = false; ++ ++ while (prev ++ && prev != insn ++ && distance < LEA_SEARCH_THRESHOLD) ++ { ++ if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev)) ++ { ++ distance = increase_distance (prev, next, distance); ++ if (insn_defines_reg (regno1, regno2, prev)) + { +- output_operand_lossage ("operand is not a specific integer, " +- "invalid operand code 'r'"); +- return; ++ if (recog_memoized (prev) < 0 ++ || get_attr_type (prev) != TYPE_LEA) ++ { ++ *found = true; ++ return distance; ++ } + } + +- if (ASSEMBLER_DIALECT == ASM_INTEL) +- fputs (", ", file); ++ next = prev; ++ } ++ if (prev == BB_HEAD (bb)) ++ break; + +- fputs ("{sae}", file); ++ prev = PREV_INSN (prev); ++ } + +- if (ASSEMBLER_DIALECT == ASM_ATT) +- fputs (", ", file); ++ return distance; ++} + +- return; ++/* Search backward for non-agu definition of register number REGNO1 ++ or register number REGNO2 in INSN's basic block until ++ 1. Pass LEA_SEARCH_THRESHOLD instructions, or ++ 2. Reach neighbor BBs boundary, or ++ 3. Reach agu definition. ++ Returns the distance between the non-agu definition point and INSN. ++ If no definition point, returns -1. */ + +- case 'R': +- if (!CONST_INT_P (x)) +- { +- output_operand_lossage ("operand is not an integer, invalid " +- "operand code 'R'"); +- return; +- } ++static int ++distance_non_agu_define (unsigned int regno1, unsigned int regno2, ++ rtx_insn *insn) ++{ ++ basic_block bb = BLOCK_FOR_INSN (insn); ++ int distance = 0; ++ bool found = false; + +- if (ASSEMBLER_DIALECT == ASM_INTEL) +- fputs (", ", file); ++ if (insn != BB_HEAD (bb)) ++ distance = distance_non_agu_define_in_bb (regno1, regno2, insn, ++ distance, PREV_INSN (insn), ++ &found); + +- switch (INTVAL (x)) ++ if (!found && distance < LEA_SEARCH_THRESHOLD) ++ { ++ edge e; ++ edge_iterator ei; ++ bool simple_loop = false; ++ ++ FOR_EACH_EDGE (e, ei, bb->preds) ++ if (e->src == bb) ++ { ++ simple_loop = true; ++ break; ++ } ++ ++ if (simple_loop) ++ distance = distance_non_agu_define_in_bb (regno1, regno2, ++ insn, distance, ++ BB_END (bb), &found); ++ else ++ { ++ int shortest_dist = -1; ++ bool found_in_bb = false; ++ ++ FOR_EACH_EDGE (e, ei, bb->preds) + { +- case ROUND_NEAREST_INT | ROUND_SAE: +- fputs ("{rn-sae}", file); +- break; +- case ROUND_NEG_INF | ROUND_SAE: +- fputs ("{rd-sae}", file); +- break; +- case ROUND_POS_INF | ROUND_SAE: +- fputs ("{ru-sae}", file); +- break; +- case ROUND_ZERO | ROUND_SAE: +- fputs ("{rz-sae}", file); +- break; +- default: +- output_operand_lossage ("operand is not a specific integer, " +- "invalid operand code 'R'"); +- } ++ int bb_dist ++ = distance_non_agu_define_in_bb (regno1, regno2, ++ insn, distance, ++ BB_END (e->src), ++ &found_in_bb); ++ if (found_in_bb) ++ { ++ if (shortest_dist < 0) ++ shortest_dist = bb_dist; ++ else if (bb_dist > 0) ++ shortest_dist = MIN (bb_dist, shortest_dist); + +- if (ASSEMBLER_DIALECT == ASM_ATT) +- fputs (", ", file); ++ found = true; ++ } ++ } + +- return; ++ distance = shortest_dist; ++ } ++ } + +- case '*': +- if (ASSEMBLER_DIALECT == ASM_ATT) +- putc ('*', file); +- return; ++ /* get_attr_type may modify recog data. We want to make sure ++ that recog data is valid for instruction INSN, on which ++ distance_non_agu_define is called. INSN is unchanged here. 
*/ ++ extract_insn_cached (insn); + +- case '&': +- { +- const char *name = get_some_local_dynamic_name (); +- if (name == NULL) +- output_operand_lossage ("'%%&' used without any " +- "local dynamic TLS references"); +- else +- assemble_name (file, name); +- return; +- } ++ if (!found) ++ return -1; + +- case '+': +- { +- rtx x; ++ return distance >> 1; ++} + +- if (!optimize +- || optimize_function_for_size_p (cfun) +- || !TARGET_BRANCH_PREDICTION_HINTS) +- return; ++/* Return the distance in half-cycles between INSN and the next ++ insn that uses register number REGNO in memory address added ++ to DISTANCE. Return -1 if REGNO0 is set. + +- x = find_reg_note (current_output_insn, REG_BR_PROB, 0); +- if (x) +- { +- int pred_val = profile_probability::from_reg_br_prob_note +- (XINT (x, 0)).to_reg_br_prob_base (); ++ Put true value into *FOUND if register usage was found and ++ false otherwise. ++ Put true value into *REDEFINED if register redefinition was ++ found and false otherwise. */ + +- if (pred_val < REG_BR_PROB_BASE * 45 / 100 +- || pred_val > REG_BR_PROB_BASE * 55 / 100) +- { +- bool taken = pred_val > REG_BR_PROB_BASE / 2; +- bool cputaken +- = final_forward_branch_p (current_output_insn) == 0; ++static int ++distance_agu_use_in_bb (unsigned int regno, ++ rtx_insn *insn, int distance, rtx_insn *start, ++ bool *found, bool *redefined) ++{ ++ basic_block bb = NULL; ++ rtx_insn *next = start; ++ rtx_insn *prev = NULL; + +- /* Emit hints only in the case default branch prediction +- heuristics would fail. */ +- if (taken != cputaken) +- { +- /* We use 3e (DS) prefix for taken branches and +- 2e (CS) prefix for not taken branches. */ +- if (taken) +- fputs ("ds ; ", file); +- else +- fputs ("cs ; ", file); +- } +- } +- } +- return; +- } ++ *found = false; ++ *redefined = false; + +- case ';': +-#ifndef HAVE_AS_IX86_REP_LOCK_PREFIX +- putc (';', file); +-#endif +- return; ++ if (start != NULL_RTX) ++ { ++ bb = BLOCK_FOR_INSN (start); ++ if (start != BB_HEAD (bb)) ++ /* If insn and start belong to the same bb, set prev to insn, ++ so the call to increase_distance will increase the distance ++ between insns by 1. */ ++ prev = insn; ++ } + +- case '~': +- putc (TARGET_AVX2 ? 'i' : 'f', file); +- return; ++ while (next ++ && next != insn ++ && distance < LEA_SEARCH_THRESHOLD) ++ { ++ if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next)) ++ { ++ distance = increase_distance(prev, next, distance); ++ if (insn_uses_reg_mem (regno, next)) ++ { ++ /* Return DISTANCE if OP0 is used in memory ++ address in NEXT. */ ++ *found = true; ++ return distance; ++ } + +- case 'M': +- if (TARGET_X32) ++ if (insn_defines_reg (regno, INVALID_REGNUM, next)) + { +- /* NB: 32-bit indices in VSIB address are sign-extended +- to 64 bits. In x32, if 32-bit address 0xf7fa3010 is +- sign-extended to 0xfffffffff7fa3010 which is invalid +- address. Add addr32 prefix if there is no base +- register nor symbol. */ +- bool ok; +- struct ix86_address parts; +- ok = ix86_decompose_address (x, &parts); +- gcc_assert (ok && parts.index == NULL_RTX); +- if (parts.base == NULL_RTX +- && (parts.disp == NULL_RTX +- || !symbolic_operand (parts.disp, +- GET_MODE (parts.disp)))) +- fputs ("addr32 ", file); ++ /* Return -1 if OP0 is set in NEXT. 
*/ ++ *redefined = true; ++ return -1; + } +- return; + +- case '^': +- if (TARGET_64BIT && Pmode != word_mode) +- fputs ("addr32 ", file); +- return; ++ prev = next; ++ } + +- case '!': +- if (ix86_notrack_prefixed_insn_p (current_output_insn)) +- fputs ("notrack ", file); +- return; ++ if (next == BB_END (bb)) ++ break; + +- default: +- output_operand_lossage ("invalid operand code '%c'", code); +- } ++ next = NEXT_INSN (next); + } + +- if (REG_P (x)) +- print_reg (x, code, file); ++ return distance; ++} + +- else if (MEM_P (x)) ++/* Return the distance between INSN and the next insn that uses ++ register number REGNO0 in memory address. Return -1 if no such ++ a use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */ ++ ++static int ++distance_agu_use (unsigned int regno0, rtx_insn *insn) ++{ ++ basic_block bb = BLOCK_FOR_INSN (insn); ++ int distance = 0; ++ bool found = false; ++ bool redefined = false; ++ ++ if (insn != BB_END (bb)) ++ distance = distance_agu_use_in_bb (regno0, insn, distance, ++ NEXT_INSN (insn), ++ &found, &redefined); ++ ++ if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD) + { +- rtx addr = XEXP (x, 0); ++ edge e; ++ edge_iterator ei; ++ bool simple_loop = false; + +- /* No `byte ptr' prefix for call instructions ... */ +- if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P') +- { +- machine_mode mode = GET_MODE (x); +- const char *size; ++ FOR_EACH_EDGE (e, ei, bb->succs) ++ if (e->dest == bb) ++ { ++ simple_loop = true; ++ break; ++ } + +- /* Check for explicit size override codes. */ +- if (code == 'b') +- size = "BYTE"; +- else if (code == 'w') +- size = "WORD"; +- else if (code == 'k') +- size = "DWORD"; +- else if (code == 'q') +- size = "QWORD"; +- else if (code == 'x') +- size = "XMMWORD"; +- else if (code == 't') +- size = "YMMWORD"; +- else if (code == 'g') +- size = "ZMMWORD"; +- else if (mode == BLKmode) +- /* ... or BLKmode operands, when not overridden. */ +- size = NULL; +- else +- switch (GET_MODE_SIZE (mode)) +- { +- case 1: size = "BYTE"; break; +- case 2: size = "WORD"; break; +- case 4: size = "DWORD"; break; +- case 8: size = "QWORD"; break; +- case 12: size = "TBYTE"; break; +- case 16: +- if (mode == XFmode) +- size = "TBYTE"; +- else +- size = "XMMWORD"; +- break; +- case 32: size = "YMMWORD"; break; +- case 64: size = "ZMMWORD"; break; +- default: +- gcc_unreachable (); +- } +- if (size) ++ if (simple_loop) ++ distance = distance_agu_use_in_bb (regno0, insn, ++ distance, BB_HEAD (bb), ++ &found, &redefined); ++ else ++ { ++ int shortest_dist = -1; ++ bool found_in_bb = false; ++ bool redefined_in_bb = false; ++ ++ FOR_EACH_EDGE (e, ei, bb->succs) + { +- fputs (size, file); +- fputs (" PTR ", file); ++ int bb_dist ++ = distance_agu_use_in_bb (regno0, insn, ++ distance, BB_HEAD (e->dest), ++ &found_in_bb, &redefined_in_bb); ++ if (found_in_bb) ++ { ++ if (shortest_dist < 0) ++ shortest_dist = bb_dist; ++ else if (bb_dist > 0) ++ shortest_dist = MIN (bb_dist, shortest_dist); ++ ++ found = true; ++ } + } +- } + +- if (this_is_asm_operands && ! 
address_operand (addr, VOIDmode)) +- output_operand_lossage ("invalid constraints for operand"); +- else +- ix86_print_operand_address_as +- (file, addr, MEM_ADDR_SPACE (x), code == 'p' || code == 'P'); ++ distance = shortest_dist; ++ } + } + +- else if (CONST_DOUBLE_P (x) && GET_MODE (x) == SFmode) ++ if (!found || redefined) ++ return -1; ++ ++ return distance >> 1; ++} ++ ++/* Define this macro to tune LEA priority vs ADD, it take effect when ++ there is a dilemma of choicing LEA or ADD ++ Negative value: ADD is more preferred than LEA ++ Zero: Netrual ++ Positive value: LEA is more preferred than ADD*/ ++#define IX86_LEA_PRIORITY 0 ++ ++/* Return true if usage of lea INSN has performance advantage ++ over a sequence of instructions. Instructions sequence has ++ SPLIT_COST cycles higher latency than lea latency. */ ++ ++static bool ++ix86_lea_outperforms (rtx_insn *insn, unsigned int regno0, unsigned int regno1, ++ unsigned int regno2, int split_cost, bool has_scale) ++{ ++ int dist_define, dist_use; ++ ++ /* For Silvermont if using a 2-source or 3-source LEA for ++ non-destructive destination purposes, or due to wanting ++ ability to use SCALE, the use of LEA is justified. */ ++ if (TARGET_SILVERMONT || TARGET_GOLDMONT || TARGET_GOLDMONT_PLUS ++ || TARGET_TREMONT || TARGET_INTEL) + { +- long l; ++ if (has_scale) ++ return true; ++ if (split_cost < 1) ++ return false; ++ if (regno0 == regno1 || regno0 == regno2) ++ return false; ++ return true; ++ } + +- REAL_VALUE_TO_TARGET_SINGLE (*CONST_DOUBLE_REAL_VALUE (x), l); ++ dist_define = distance_non_agu_define (regno1, regno2, insn); ++ dist_use = distance_agu_use (regno0, insn); + +- if (ASSEMBLER_DIALECT == ASM_ATT) +- putc ('$', file); +- /* Sign extend 32bit SFmode immediate to 8 bytes. */ +- if (code == 'q') +- fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x", +- (unsigned long long) (int) l); ++ if (dist_define < 0 || dist_define >= LEA_MAX_STALL) ++ { ++ /* If there is no non AGU operand definition, no AGU ++ operand usage and split cost is 0 then both lea ++ and non lea variants have same priority. Currently ++ we prefer lea for 64 bit code and non lea on 32 bit ++ code. */ ++ if (dist_use < 0 && split_cost == 0) ++ return TARGET_64BIT || IX86_LEA_PRIORITY; + else +- fprintf (file, "0x%08x", (unsigned int) l); ++ return true; + } + +- else if (CONST_DOUBLE_P (x) && GET_MODE (x) == DFmode) +- { +- long l[2]; ++ /* With longer definitions distance lea is more preferable. ++ Here we change it to take into account splitting cost and ++ lea priority. */ ++ dist_define += split_cost + IX86_LEA_PRIORITY; + +- REAL_VALUE_TO_TARGET_DOUBLE (*CONST_DOUBLE_REAL_VALUE (x), l); ++ /* If there is no use in memory addess then we just check ++ that split cost exceeds AGU stall. */ ++ if (dist_use < 0) ++ return dist_define > LEA_MAX_STALL; + +- if (ASSEMBLER_DIALECT == ASM_ATT) +- putc ('$', file); +- fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff); +- } ++ /* If this insn has both backward non-agu dependence and forward ++ agu dependence, the one with short distance takes effect. */ ++ return dist_define >= dist_use; ++} + +- /* These float cases don't actually occur as immediate operands. */ +- else if (CONST_DOUBLE_P (x) && GET_MODE (x) == XFmode) +- { +- char dstr[30]; ++/* Return true if it is legal to clobber flags by INSN and ++ false otherwise. 
*/ + +- real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1); +- fputs (dstr, file); +- } ++static bool ++ix86_ok_to_clobber_flags (rtx_insn *insn) ++{ ++ basic_block bb = BLOCK_FOR_INSN (insn); ++ df_ref use; ++ bitmap live; + +- else ++ while (insn) + { +- /* We have patterns that allow zero sets of memory, for instance. +- In 64-bit mode, we should probably support all 8-byte vectors, +- since we can in fact encode that into an immediate. */ +- if (GET_CODE (x) == CONST_VECTOR) ++ if (NONDEBUG_INSN_P (insn)) + { +- if (x != CONST0_RTX (GET_MODE (x))) +- output_operand_lossage ("invalid vector immediate"); +- x = const0_rtx; +- } ++ FOR_EACH_INSN_USE (use, insn) ++ if (DF_REF_REG_USE_P (use) && DF_REF_REGNO (use) == FLAGS_REG) ++ return false; + +- if (code != 'P' && code != 'p') +- { +- if (CONST_INT_P (x)) +- { +- if (ASSEMBLER_DIALECT == ASM_ATT) +- putc ('$', file); +- } +- else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF +- || GET_CODE (x) == LABEL_REF) +- { +- if (ASSEMBLER_DIALECT == ASM_ATT) +- putc ('$', file); +- else +- fputs ("OFFSET FLAT:", file); +- } ++ if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn)) ++ return true; + } +- if (CONST_INT_P (x)) +- fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x)); +- else if (flag_pic || MACHOPIC_INDIRECT) +- output_pic_addr_const (file, x, code); +- else +- output_addr_const (file, x); ++ ++ if (insn == BB_END (bb)) ++ break; ++ ++ insn = NEXT_INSN (insn); + } +-} + +-static bool +-ix86_print_operand_punct_valid_p (unsigned char code) +-{ +- return (code == '*' || code == '+' || code == '&' || code == ';' +- || code == '~' || code == '^' || code == '!'); ++ live = df_get_live_out(bb); ++ return !REGNO_REG_SET_P (live, FLAGS_REG); + } +- +-/* Print a memory operand whose address is ADDR. */ + +-static void +-ix86_print_operand_address_as (FILE *file, rtx addr, +- addr_space_t as, bool no_rip) ++/* Return true if we need to split op0 = op1 + op2 into a sequence of ++ move and add to avoid AGU stalls. */ ++ ++bool ++ix86_avoid_lea_for_add (rtx_insn *insn, rtx operands[]) + { +- struct ix86_address parts; +- rtx base, index, disp; +- int scale; +- int ok; +- bool vsib = false; +- int code = 0; ++ unsigned int regno0, regno1, regno2; + +- if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR) +- { +- ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts); +- gcc_assert (parts.index == NULL_RTX); +- parts.index = XVECEXP (addr, 0, 1); +- parts.scale = INTVAL (XVECEXP (addr, 0, 2)); +- addr = XVECEXP (addr, 0, 0); +- vsib = true; +- } +- else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR) +- { +- gcc_assert (TARGET_64BIT); +- ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts); +- code = 'q'; +- } +- else +- ok = ix86_decompose_address (addr, &parts); ++ /* Check if we need to optimize. */ ++ if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun)) ++ return false; + +- gcc_assert (ok); ++ /* Check it is correct to split here. */ ++ if (!ix86_ok_to_clobber_flags(insn)) ++ return false; + +- base = parts.base; +- index = parts.index; +- disp = parts.disp; +- scale = parts.scale; ++ regno0 = true_regnum (operands[0]); ++ regno1 = true_regnum (operands[1]); ++ regno2 = true_regnum (operands[2]); + +- if (ADDR_SPACE_GENERIC_P (as)) +- as = parts.seg; ++ /* We need to split only adds with non destructive ++ destination operand. 
*/ ++ if (regno0 == regno1 || regno0 == regno2) ++ return false; + else +- gcc_assert (ADDR_SPACE_GENERIC_P (parts.seg)); +- +- if (!ADDR_SPACE_GENERIC_P (as)) +- { +- if (ASSEMBLER_DIALECT == ASM_ATT) +- putc ('%', file); +- +- switch (as) +- { +- case ADDR_SPACE_SEG_FS: +- fputs ("fs:", file); +- break; +- case ADDR_SPACE_SEG_GS: +- fputs ("gs:", file); +- break; +- default: +- gcc_unreachable (); +- } +- } +- +- /* Use one byte shorter RIP relative addressing for 64bit mode. */ +- if (TARGET_64BIT && !base && !index && !no_rip) +- { +- rtx symbol = disp; ++ return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false); ++} + +- if (GET_CODE (disp) == CONST +- && GET_CODE (XEXP (disp, 0)) == PLUS +- && CONST_INT_P (XEXP (XEXP (disp, 0), 1))) +- symbol = XEXP (XEXP (disp, 0), 0); ++/* Return true if we should emit lea instruction instead of mov ++ instruction. */ + +- if (GET_CODE (symbol) == LABEL_REF +- || (GET_CODE (symbol) == SYMBOL_REF +- && SYMBOL_REF_TLS_MODEL (symbol) == 0)) +- base = pc_rtx; +- } ++bool ++ix86_use_lea_for_mov (rtx_insn *insn, rtx operands[]) ++{ ++ unsigned int regno0, regno1; + +- if (!base && !index) +- { +- /* Displacement only requires special attention. */ +- if (CONST_INT_P (disp)) +- { +- if (ASSEMBLER_DIALECT == ASM_INTEL && ADDR_SPACE_GENERIC_P (as)) +- fputs ("ds:", file); +- fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp)); +- } +- /* Load the external function address via the GOT slot to avoid PLT. */ +- else if (GET_CODE (disp) == CONST +- && GET_CODE (XEXP (disp, 0)) == UNSPEC +- && (XINT (XEXP (disp, 0), 1) == UNSPEC_GOTPCREL +- || XINT (XEXP (disp, 0), 1) == UNSPEC_GOT) +- && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0))) +- output_pic_addr_const (file, disp, 0); +- else if (flag_pic) +- output_pic_addr_const (file, disp, 0); +- else +- output_addr_const (file, disp); +- } +- else +- { +- /* Print SImode register names to force addr32 prefix. */ +- if (SImode_address_operand (addr, VOIDmode)) +- { +- if (flag_checking) +- { +- gcc_assert (TARGET_64BIT); +- switch (GET_CODE (addr)) +- { +- case SUBREG: +- gcc_assert (GET_MODE (addr) == SImode); +- gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode); +- break; +- case ZERO_EXTEND: +- case AND: +- gcc_assert (GET_MODE (addr) == DImode); +- break; +- default: +- gcc_unreachable (); +- } +- } +- gcc_assert (!code); +- code = 'k'; +- } +- else if (code == 0 +- && TARGET_X32 +- && disp +- && CONST_INT_P (disp) +- && INTVAL (disp) < -16*1024*1024) +- { +- /* X32 runs in 64-bit mode, where displacement, DISP, in +- address DISP(%r64), is encoded as 32-bit immediate sign- +- extended from 32-bit to 64-bit. For -0x40000300(%r64), +- address is %r64 + 0xffffffffbffffd00. When %r64 < +- 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64, +- which is invalid for x32. The correct address is %r64 +- - 0x40000300 == 0xf7ffdd64. To properly encode +- -0x40000300(%r64) for x32, we zero-extend negative +- displacement by forcing addr32 prefix which truncates +- 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should +- zero-extend all negative displacements, including -1(%rsp). +- However, for small negative displacements, sign-extension +- won't cause overflow. We only zero-extend negative +- displacements if they < -16*1024*1024, which is also used +- to check legitimate address displacements for PIC. */ +- code = 'k'; +- } ++ /* Check if we need to optimize. 
*/ ++ if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun)) ++ return false; + +- /* Since the upper 32 bits of RSP are always zero for x32, +- we can encode %esp as %rsp to avoid 0x67 prefix if +- there is no index register. */ +- if (TARGET_X32 && Pmode == SImode +- && !index && base && REG_P (base) && REGNO (base) == SP_REG) +- code = 'q'; ++ /* Use lea for reg to reg moves only. */ ++ if (!REG_P (operands[0]) || !REG_P (operands[1])) ++ return false; + +- if (ASSEMBLER_DIALECT == ASM_ATT) +- { +- if (disp) +- { +- if (flag_pic) +- output_pic_addr_const (file, disp, 0); +- else if (GET_CODE (disp) == LABEL_REF) +- output_asm_label (disp); +- else +- output_addr_const (file, disp); +- } ++ regno0 = true_regnum (operands[0]); ++ regno1 = true_regnum (operands[1]); + +- putc ('(', file); +- if (base) +- print_reg (base, code, file); +- if (index) +- { +- putc (',', file); +- print_reg (index, vsib ? 0 : code, file); +- if (scale != 1 || vsib) +- fprintf (file, ",%d", scale); +- } +- putc (')', file); +- } +- else +- { +- rtx offset = NULL_RTX; ++ return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false); ++} + +- if (disp) +- { +- /* Pull out the offset of a symbol; print any symbol itself. */ +- if (GET_CODE (disp) == CONST +- && GET_CODE (XEXP (disp, 0)) == PLUS +- && CONST_INT_P (XEXP (XEXP (disp, 0), 1))) +- { +- offset = XEXP (XEXP (disp, 0), 1); +- disp = gen_rtx_CONST (VOIDmode, +- XEXP (XEXP (disp, 0), 0)); +- } ++/* Return true if we need to split lea into a sequence of ++ instructions to avoid AGU stalls. */ + +- if (flag_pic) +- output_pic_addr_const (file, disp, 0); +- else if (GET_CODE (disp) == LABEL_REF) +- output_asm_label (disp); +- else if (CONST_INT_P (disp)) +- offset = disp; +- else +- output_addr_const (file, disp); +- } ++bool ++ix86_avoid_lea_for_addr (rtx_insn *insn, rtx operands[]) ++{ ++ unsigned int regno0, regno1, regno2; ++ int split_cost; ++ struct ix86_address parts; ++ int ok; + +- putc ('[', file); +- if (base) +- { +- print_reg (base, code, file); +- if (offset) +- { +- if (INTVAL (offset) >= 0) +- putc ('+', file); +- fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset)); +- } +- } +- else if (offset) +- fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset)); +- else +- putc ('0', file); ++ /* Check we need to optimize. */ ++ if (!TARGET_AVOID_LEA_FOR_ADDR || optimize_function_for_size_p (cfun)) ++ return false; + +- if (index) +- { +- putc ('+', file); +- print_reg (index, vsib ? 0 : code, file); +- if (scale != 1 || vsib) +- fprintf (file, "*%d", scale); +- } +- putc (']', file); +- } +- } +-} ++ /* The "at least two components" test below might not catch simple ++ move or zero extension insns if parts.base is non-NULL and parts.disp ++ is const0_rtx as the only components in the address, e.g. if the ++ register is %rbp or %r13. As this test is much cheaper and moves or ++ zero extensions are the common case, do this check first. */ ++ if (REG_P (operands[1]) ++ || (SImode_address_operand (operands[1], VOIDmode) ++ && REG_P (XEXP (operands[1], 0)))) ++ return false; + +-static void +-ix86_print_operand_address (FILE *file, machine_mode /*mode*/, rtx addr) +-{ +- ix86_print_operand_address_as (file, addr, ADDR_SPACE_GENERIC, false); +-} ++ /* Check if it is OK to split here. */ ++ if (!ix86_ok_to_clobber_flags (insn)) ++ return false; + +-/* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. 
*/ ++ ok = ix86_decompose_address (operands[1], &parts); ++ gcc_assert (ok); + +-static bool +-i386_asm_output_addr_const_extra (FILE *file, rtx x) +-{ +- rtx op; ++ /* There should be at least two components in the address. */ ++ if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX) ++ + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2) ++ return false; + +- if (GET_CODE (x) != UNSPEC) ++ /* We should not split into add if non legitimate pic ++ operand is used as displacement. */ ++ if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp)) + return false; + +- op = XVECEXP (x, 0, 0); +- switch (XINT (x, 1)) +- { +- case UNSPEC_GOTOFF: +- output_addr_const (file, op); +- fputs ("@gotoff", file); +- break; +- case UNSPEC_GOTTPOFF: +- output_addr_const (file, op); +- /* FIXME: This might be @TPOFF in Sun ld. */ +- fputs ("@gottpoff", file); +- break; +- case UNSPEC_TPOFF: +- output_addr_const (file, op); +- fputs ("@tpoff", file); +- break; +- case UNSPEC_NTPOFF: +- output_addr_const (file, op); +- if (TARGET_64BIT) +- fputs ("@tpoff", file); +- else +- fputs ("@ntpoff", file); +- break; +- case UNSPEC_DTPOFF: +- output_addr_const (file, op); +- fputs ("@dtpoff", file); +- break; +- case UNSPEC_GOTNTPOFF: +- output_addr_const (file, op); +- if (TARGET_64BIT) +- fputs (ASSEMBLER_DIALECT == ASM_ATT ? +- "@gottpoff(%rip)" : "@gottpoff[rip]", file); +- else +- fputs ("@gotntpoff", file); +- break; +- case UNSPEC_INDNTPOFF: +- output_addr_const (file, op); +- fputs ("@indntpoff", file); +- break; +-#if TARGET_MACHO +- case UNSPEC_MACHOPIC_OFFSET: +- output_addr_const (file, op); +- putc ('-', file); +- machopic_output_function_base_name (file); +- break; +-#endif +- +- default: +- return false; +- } ++ regno0 = true_regnum (operands[0]) ; ++ regno1 = INVALID_REGNUM; ++ regno2 = INVALID_REGNUM; + +- return true; +-} +- +-/* Split one or more double-mode RTL references into pairs of half-mode +- references. The RTL can be REG, offsettable MEM, integer constant, or +- CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to +- split and "num" is its length. lo_half and hi_half are output arrays +- that parallel "operands". */ ++ if (parts.base) ++ regno1 = true_regnum (parts.base); ++ if (parts.index) ++ regno2 = true_regnum (parts.index); + +-void +-split_double_mode (machine_mode mode, rtx operands[], +- int num, rtx lo_half[], rtx hi_half[]) +-{ +- machine_mode half_mode; +- unsigned int byte; ++ split_cost = 0; + +- switch (mode) ++ /* Compute how many cycles we will add to execution time ++ if split lea into a sequence of instructions. */ ++ if (parts.base || parts.index) + { +- case E_TImode: +- half_mode = DImode; +- break; +- case E_DImode: +- half_mode = SImode; +- break; +- default: +- gcc_unreachable (); +- } +- +- byte = GET_MODE_SIZE (half_mode); ++ /* Have to use mov instruction if non desctructive ++ destination form is used. */ ++ if (regno1 != regno0 && regno2 != regno0) ++ split_cost += 1; + +- while (num--) +- { +- rtx op = operands[num]; ++ /* Have to add index to base if both exist. */ ++ if (parts.base && parts.index) ++ split_cost += 1; + +- /* simplify_subreg refuse to split volatile memory addresses, +- but we still have to handle it. */ +- if (MEM_P (op)) +- { +- lo_half[num] = adjust_address (op, half_mode, 0); +- hi_half[num] = adjust_address (op, half_mode, byte); +- } +- else ++ /* Have to use shift and adds if scale is 2 or greater. 
*/ ++ if (parts.scale > 1) + { +- lo_half[num] = simplify_gen_subreg (half_mode, op, +- GET_MODE (op) == VOIDmode +- ? mode : GET_MODE (op), 0); +- hi_half[num] = simplify_gen_subreg (half_mode, op, +- GET_MODE (op) == VOIDmode +- ? mode : GET_MODE (op), byte); ++ if (regno0 != regno1) ++ split_cost += 1; ++ else if (regno2 == regno0) ++ split_cost += 4; ++ else ++ split_cost += parts.scale; + } +- } +-} +- +-/* Output code to perform a 387 binary operation in INSN, one of PLUS, +- MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3] +- is the expression of the binary operation. The output may either be +- emitted here, or returned to the caller, like all output_* functions. + +- There is no guarantee that the operands are the same mode, as they +- might be within FLOAT or FLOAT_EXTEND expressions. */ ++ /* Have to use add instruction with immediate if ++ disp is non zero. */ ++ if (parts.disp && parts.disp != const0_rtx) ++ split_cost += 1; + +-#ifndef SYSV386_COMPAT +-/* Set to 1 for compatibility with brain-damaged assemblers. No-one +- wants to fix the assemblers because that causes incompatibility +- with gcc. No-one wants to fix gcc because that causes +- incompatibility with assemblers... You can use the option of +- -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */ +-#define SYSV386_COMPAT 1 +-#endif ++ /* Subtract the price of lea. */ ++ split_cost -= 1; ++ } + +-const char * +-output_387_binary_op (rtx_insn *insn, rtx *operands) +-{ +- static char buf[40]; +- const char *p; +- bool is_sse +- = (SSE_REG_P (operands[0]) +- || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2])); ++ return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost, ++ parts.scale > 1); ++} + +- if (is_sse) +- p = "%v"; +- else if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT +- || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT) +- p = "fi"; +- else +- p = "f"; ++/* Return true if it is ok to optimize an ADD operation to LEA ++ operation to avoid flag register consumation. For most processors, ++ ADD is faster than LEA. For the processors like BONNELL, if the ++ destination register of LEA holds an actual address which will be ++ used soon, LEA is better and otherwise ADD is better. */ + +- strcpy (buf, p); ++bool ++ix86_lea_for_add_ok (rtx_insn *insn, rtx operands[]) ++{ ++ unsigned int regno0 = true_regnum (operands[0]); ++ unsigned int regno1 = true_regnum (operands[1]); ++ unsigned int regno2 = true_regnum (operands[2]); + +- switch (GET_CODE (operands[3])) +- { +- case PLUS: +- p = "add"; break; +- case MINUS: +- p = "sub"; break; +- case MULT: +- p = "mul"; break; +- case DIV: +- p = "div"; break; +- default: +- gcc_unreachable (); +- } ++ /* If a = b + c, (a!=b && a!=c), must use lea form. */ ++ if (regno0 != regno1 && regno0 != regno2) ++ return true; + +- strcat (buf, p); ++ if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun)) ++ return false; + +- if (is_sse) +- { +- p = (GET_MODE (operands[0]) == SFmode) ? "ss" : "sd"; +- strcat (buf, p); ++ return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false); ++} + +- if (TARGET_AVX) +- p = "\t{%2, %1, %0|%0, %1, %2}"; +- else +- p = "\t{%2, %0|%0, %2}"; ++/* Return true if destination reg of SET_BODY is shift count of ++ USE_BODY. 
*/ + +- strcat (buf, p); +- return buf; +- } ++static bool ++ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body) ++{ ++ rtx set_dest; ++ rtx shift_rtx; ++ int i; + +- /* Even if we do not want to check the inputs, this documents input +- constraints. Which helps in understanding the following code. */ +- if (flag_checking) ++ /* Retrieve destination of SET_BODY. */ ++ switch (GET_CODE (set_body)) + { +- if (STACK_REG_P (operands[0]) +- && ((REG_P (operands[1]) +- && REGNO (operands[0]) == REGNO (operands[1]) +- && (STACK_REG_P (operands[2]) || MEM_P (operands[2]))) +- || (REG_P (operands[2]) +- && REGNO (operands[0]) == REGNO (operands[2]) +- && (STACK_REG_P (operands[1]) || MEM_P (operands[1])))) +- && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2]))) +- ; /* ok */ +- else +- gcc_unreachable (); ++ case SET: ++ set_dest = SET_DEST (set_body); ++ if (!set_dest || !REG_P (set_dest)) ++ return false; ++ break; ++ case PARALLEL: ++ for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--) ++ if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i), ++ use_body)) ++ return true; ++ /* FALLTHROUGH */ ++ default: ++ return false; + } + +- switch (GET_CODE (operands[3])) ++ /* Retrieve shift count of USE_BODY. */ ++ switch (GET_CODE (use_body)) + { +- case MULT: +- case PLUS: +- if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2])) +- std::swap (operands[1], operands[2]); ++ case SET: ++ shift_rtx = XEXP (use_body, 1); ++ break; ++ case PARALLEL: ++ for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--) ++ if (ix86_dep_by_shift_count_body (set_body, ++ XVECEXP (use_body, 0, i))) ++ return true; ++ /* FALLTHROUGH */ ++ default: ++ return false; ++ } + +- /* know operands[0] == operands[1]. */ ++ if (shift_rtx ++ && (GET_CODE (shift_rtx) == ASHIFT ++ || GET_CODE (shift_rtx) == LSHIFTRT ++ || GET_CODE (shift_rtx) == ASHIFTRT ++ || GET_CODE (shift_rtx) == ROTATE ++ || GET_CODE (shift_rtx) == ROTATERT)) ++ { ++ rtx shift_count = XEXP (shift_rtx, 1); + +- if (MEM_P (operands[2])) ++ /* Return true if shift count is dest of SET_BODY. */ ++ if (REG_P (shift_count)) + { +- p = "%Z2\t%2"; +- break; ++ /* Add check since it can be invoked before register ++ allocation in pre-reload schedule. */ ++ if (reload_completed ++ && true_regnum (set_dest) == true_regnum (shift_count)) ++ return true; ++ else if (REGNO(set_dest) == REGNO(shift_count)) ++ return true; + } ++ } + +- if (find_regno_note (insn, REG_DEAD, REGNO (operands[2]))) +- { +- if (STACK_TOP_P (operands[0])) +- /* How is it that we are storing to a dead operand[2]? +- Well, presumably operands[1] is dead too. We can't +- store the result to st(0) as st(0) gets popped on this +- instruction. Instead store to operands[2] (which I +- think has to be st(1)). st(1) will be popped later. +- gcc <= 2.8.1 didn't have this check and generated +- assembly code that the Unixware assembler rejected. 
*/ +- p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */ +- else +- p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */ +- break; +- } +- +- if (STACK_TOP_P (operands[0])) +- p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */ +- else +- p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */ +- break; +- +- case MINUS: +- case DIV: +- if (MEM_P (operands[1])) +- { +- p = "r%Z1\t%1"; +- break; +- } +- +- if (MEM_P (operands[2])) +- { +- p = "%Z2\t%2"; +- break; +- } +- +- if (find_regno_note (insn, REG_DEAD, REGNO (operands[2]))) +- { +-#if SYSV386_COMPAT +- /* The SystemV/386 SVR3.2 assembler, and probably all AT&T +- derived assemblers, confusingly reverse the direction of +- the operation for fsub{r} and fdiv{r} when the +- destination register is not st(0). The Intel assembler +- doesn't have this brain damage. Read !SYSV386_COMPAT to +- figure out what the hardware really does. */ +- if (STACK_TOP_P (operands[0])) +- p = "{p\t%0, %2|rp\t%2, %0}"; +- else +- p = "{rp\t%2, %0|p\t%0, %2}"; +-#else +- if (STACK_TOP_P (operands[0])) +- /* As above for fmul/fadd, we can't store to st(0). */ +- p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */ +- else +- p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */ +-#endif +- break; +- } +- +- if (find_regno_note (insn, REG_DEAD, REGNO (operands[1]))) +- { +-#if SYSV386_COMPAT +- if (STACK_TOP_P (operands[0])) +- p = "{rp\t%0, %1|p\t%1, %0}"; +- else +- p = "{p\t%1, %0|rp\t%0, %1}"; +-#else +- if (STACK_TOP_P (operands[0])) +- p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */ +- else +- p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */ +-#endif +- break; +- } +- +- if (STACK_TOP_P (operands[0])) +- { +- if (STACK_TOP_P (operands[1])) +- p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */ +- else +- p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */ +- break; +- } +- else if (STACK_TOP_P (operands[1])) +- { +-#if SYSV386_COMPAT +- p = "{\t%1, %0|r\t%0, %1}"; +-#else +- p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */ +-#endif +- } +- else +- { +-#if SYSV386_COMPAT +- p = "{r\t%2, %0|\t%0, %2}"; +-#else +- p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */ +-#endif +- } +- break; +- +- default: +- gcc_unreachable (); +- } +- +- strcat (buf, p); +- return buf; ++ return false; + } + +-/* Return needed mode for entity in optimize_mode_switching pass. */ ++/* Return true if destination reg of SET_INSN is shift count of ++ USE_INSN. */ + +-static int +-ix86_dirflag_mode_needed (rtx_insn *insn) ++bool ++ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn) + { +- if (CALL_P (insn)) +- { +- if (cfun->machine->func_type == TYPE_NORMAL) +- return X86_DIRFLAG_ANY; +- else +- /* No need to emit CLD in interrupt handler for TARGET_CLD. */ +- return TARGET_CLD ? X86_DIRFLAG_ANY : X86_DIRFLAG_RESET; +- } +- +- if (recog_memoized (insn) < 0) +- return X86_DIRFLAG_ANY; ++ return ix86_dep_by_shift_count_body (PATTERN (set_insn), ++ PATTERN (use_insn)); ++} + +- if (get_attr_type (insn) == TYPE_STR) +- { +- /* Emit cld instruction if stringops are used in the function. */ +- if (cfun->machine->func_type == TYPE_NORMAL) +- return TARGET_CLD ? X86_DIRFLAG_RESET : X86_DIRFLAG_ANY; +- else +- return X86_DIRFLAG_RESET; +- } ++/* Return TRUE or FALSE depending on whether the unary operator meets the ++ appropriate constraints. 
*/ + +- return X86_DIRFLAG_ANY; ++bool ++ix86_unary_operator_ok (enum rtx_code, ++ machine_mode, ++ rtx operands[2]) ++{ ++ /* If one of operands is memory, source and destination must match. */ ++ if ((MEM_P (operands[0]) ++ || MEM_P (operands[1])) ++ && ! rtx_equal_p (operands[0], operands[1])) ++ return false; ++ return true; + } + +-/* Check if a 256bit or 512 bit AVX register is referenced inside of EXP. */ ++/* Return TRUE if the operands to a vec_interleave_{high,low}v2df ++ are ok, keeping in mind the possible movddup alternative. */ + +-static bool +-ix86_check_avx_upper_register (const_rtx exp) ++bool ++ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high) + { +- return SSE_REG_P (exp) && GET_MODE_BITSIZE (GET_MODE (exp)) > 128; ++ if (MEM_P (operands[0])) ++ return rtx_equal_p (operands[0], operands[1 + high]); ++ if (MEM_P (operands[1]) && MEM_P (operands[2])) ++ return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]); ++ return true; + } + +-/* Return needed mode for entity in optimize_mode_switching pass. */ ++/* A subroutine of ix86_build_signbit_mask. If VECT is true, ++ then replicate the value for all elements of the vector ++ register. */ + +-static int +-ix86_avx_u128_mode_needed (rtx_insn *insn) ++rtx ++ix86_build_const_vector (machine_mode mode, bool vect, rtx value) + { +- if (CALL_P (insn)) +- { +- rtx link; ++ int i, n_elt; ++ rtvec v; ++ machine_mode scalar_mode; + +- /* Needed mode is set to AVX_U128_CLEAN if there are +- no 256bit or 512bit modes used in function arguments. */ +- for (link = CALL_INSN_FUNCTION_USAGE (insn); +- link; +- link = XEXP (link, 1)) +- { +- if (GET_CODE (XEXP (link, 0)) == USE) +- { +- rtx arg = XEXP (XEXP (link, 0), 0); ++ switch (mode) ++ { ++ case E_V64QImode: ++ case E_V32QImode: ++ case E_V16QImode: ++ case E_V32HImode: ++ case E_V16HImode: ++ case E_V8HImode: ++ case E_V16SImode: ++ case E_V8SImode: ++ case E_V4SImode: ++ case E_V8DImode: ++ case E_V4DImode: ++ case E_V2DImode: ++ gcc_assert (vect); ++ /* FALLTHRU */ ++ case E_V16SFmode: ++ case E_V8SFmode: ++ case E_V4SFmode: ++ case E_V8DFmode: ++ case E_V4DFmode: ++ case E_V2DFmode: ++ n_elt = GET_MODE_NUNITS (mode); ++ v = rtvec_alloc (n_elt); ++ scalar_mode = GET_MODE_INNER (mode); + +- if (ix86_check_avx_upper_register (arg)) +- return AVX_U128_DIRTY; +- } +- } ++ RTVEC_ELT (v, 0) = value; + +- return AVX_U128_CLEAN; +- } ++ for (i = 1; i < n_elt; ++i) ++ RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode); + +- /* Require DIRTY mode if a 256bit or 512bit AVX register is referenced. +- Hardware changes state only when a 256bit register is written to, +- but we need to prevent the compiler from moving optimal insertion +- point above eventual read from 256bit or 512 bit register. */ +- subrtx_iterator::array_type array; +- FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST) +- if (ix86_check_avx_upper_register (*iter)) +- return AVX_U128_DIRTY; ++ return gen_rtx_CONST_VECTOR (mode, v); + +- return AVX_U128_ANY; ++ default: ++ gcc_unreachable (); ++ } + } + +-/* Return mode that i387 must be switched into +- prior to the execution of insn. */ ++/* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders ++ and ix86_expand_int_vcond. Create a mask for the sign bit in MODE ++ for an SSE register. If VECT is true, then replicate the mask for ++ all elements of the vector register. If INVERT is true, then create ++ a mask excluding the sign bit. 
*/ + +-static int +-ix86_i387_mode_needed (int entity, rtx_insn *insn) ++rtx ++ix86_build_signbit_mask (machine_mode mode, bool vect, bool invert) + { +- enum attr_i387_cw mode; +- +- /* The mode UNINITIALIZED is used to store control word after a +- function call or ASM pattern. The mode ANY specify that function +- has no requirements on the control word and make no changes in the +- bits we are interested in. */ ++ machine_mode vec_mode, imode; ++ wide_int w; ++ rtx mask, v; + +- if (CALL_P (insn) +- || (NONJUMP_INSN_P (insn) +- && (asm_noperands (PATTERN (insn)) >= 0 +- || GET_CODE (PATTERN (insn)) == ASM_INPUT))) +- return I387_CW_UNINITIALIZED; +- +- if (recog_memoized (insn) < 0) +- return I387_CW_ANY; +- +- mode = get_attr_i387_cw (insn); +- +- switch (entity) ++ switch (mode) + { +- case I387_TRUNC: +- if (mode == I387_CW_TRUNC) +- return mode; ++ case E_V16SImode: ++ case E_V16SFmode: ++ case E_V8SImode: ++ case E_V4SImode: ++ case E_V8SFmode: ++ case E_V4SFmode: ++ vec_mode = mode; ++ imode = SImode; + break; + +- case I387_FLOOR: +- if (mode == I387_CW_FLOOR) +- return mode; ++ case E_V8DImode: ++ case E_V4DImode: ++ case E_V2DImode: ++ case E_V8DFmode: ++ case E_V4DFmode: ++ case E_V2DFmode: ++ vec_mode = mode; ++ imode = DImode; + break; + +- case I387_CEIL: +- if (mode == I387_CW_CEIL) +- return mode; ++ case E_TImode: ++ case E_TFmode: ++ vec_mode = VOIDmode; ++ imode = TImode; + break; + + default: + gcc_unreachable (); + } + +- return I387_CW_ANY; +-} ++ machine_mode inner_mode = GET_MODE_INNER (mode); ++ w = wi::set_bit_in_zero (GET_MODE_BITSIZE (inner_mode) - 1, ++ GET_MODE_BITSIZE (inner_mode)); ++ if (invert) ++ w = wi::bit_not (w); + +-/* Return mode that entity must be switched into +- prior to the execution of insn. */ ++ /* Force this value into the low part of a fp vector constant. */ ++ mask = immed_wide_int_const (w, imode); ++ mask = gen_lowpart (inner_mode, mask); + +-static int +-ix86_mode_needed (int entity, rtx_insn *insn) +-{ +- switch (entity) +- { +- case X86_DIRFLAG: +- return ix86_dirflag_mode_needed (insn); +- case AVX_U128: +- return ix86_avx_u128_mode_needed (insn); +- case I387_TRUNC: +- case I387_FLOOR: +- case I387_CEIL: +- return ix86_i387_mode_needed (entity, insn); +- default: +- gcc_unreachable (); +- } +- return 0; +-} ++ if (vec_mode == VOIDmode) ++ return force_reg (inner_mode, mask); + +-/* Check if a 256bit or 512bit AVX register is referenced in stores. */ +- +-static void +-ix86_check_avx_upper_stores (rtx dest, const_rtx, void *data) +- { +- if (ix86_check_avx_upper_register (dest)) +- { +- bool *used = (bool *) data; +- *used = true; +- } +- } ++ v = ix86_build_const_vector (vec_mode, vect, mask); ++ return force_reg (vec_mode, v); ++} + +-/* Calculate mode of upper 128bit AVX registers after the insn. */ ++/* Return TRUE or FALSE depending on whether the first SET in INSN ++ has source and destination with matching CC modes, and that the ++ CC mode is at least as constrained as REQ_MODE. 
*/ + +-static int +-ix86_avx_u128_mode_after (int mode, rtx_insn *insn) ++bool ++ix86_match_ccmode (rtx insn, machine_mode req_mode) + { +- rtx pat = PATTERN (insn); ++ rtx set; ++ machine_mode set_mode; + +- if (vzeroupper_pattern (pat, VOIDmode) +- || vzeroall_pattern (pat, VOIDmode)) +- return AVX_U128_CLEAN; ++ set = PATTERN (insn); ++ if (GET_CODE (set) == PARALLEL) ++ set = XVECEXP (set, 0, 0); ++ gcc_assert (GET_CODE (set) == SET); ++ gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE); + +- /* We know that state is clean after CALL insn if there are no +- 256bit or 512bit registers used in the function return register. */ +- if (CALL_P (insn)) ++ set_mode = GET_MODE (SET_DEST (set)); ++ switch (set_mode) + { +- bool avx_upper_reg_found = false; +- note_stores (pat, ix86_check_avx_upper_stores, &avx_upper_reg_found); +- +- return avx_upper_reg_found ? AVX_U128_DIRTY : AVX_U128_CLEAN; +- } ++ case E_CCNOmode: ++ if (req_mode != CCNOmode ++ && (req_mode != CCmode ++ || XEXP (SET_SRC (set), 1) != const0_rtx)) ++ return false; ++ break; ++ case E_CCmode: ++ if (req_mode == CCGCmode) ++ return false; ++ /* FALLTHRU */ ++ case E_CCGCmode: ++ if (req_mode == CCGOCmode || req_mode == CCNOmode) ++ return false; ++ /* FALLTHRU */ ++ case E_CCGOCmode: ++ if (req_mode == CCZmode) ++ return false; ++ /* FALLTHRU */ ++ case E_CCZmode: ++ break; + +- /* Otherwise, return current mode. Remember that if insn +- references AVX 256bit or 512bit registers, the mode was already +- changed to DIRTY from MODE_NEEDED. */ +- return mode; +-} ++ case E_CCGZmode: + +-/* Return the mode that an insn results in. */ ++ case E_CCAmode: ++ case E_CCCmode: ++ case E_CCOmode: ++ case E_CCPmode: ++ case E_CCSmode: ++ if (set_mode != req_mode) ++ return false; ++ break; + +-static int +-ix86_mode_after (int entity, int mode, rtx_insn *insn) +-{ +- switch (entity) +- { +- case X86_DIRFLAG: +- return mode; +- case AVX_U128: +- return ix86_avx_u128_mode_after (mode, insn); +- case I387_TRUNC: +- case I387_FLOOR: +- case I387_CEIL: +- return mode; + default: + gcc_unreachable (); + } +-} +- +-static int +-ix86_dirflag_mode_entry (void) +-{ +- /* For TARGET_CLD or in the interrupt handler we can't assume +- direction flag state at function entry. */ +- if (TARGET_CLD +- || cfun->machine->func_type != TYPE_NORMAL) +- return X86_DIRFLAG_ANY; + +- return X86_DIRFLAG_RESET; ++ return GET_MODE (SET_SRC (set)) == set_mode; + } + +-static int +-ix86_avx_u128_mode_entry (void) ++machine_mode ++ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1) + { +- tree arg; ++ machine_mode mode = GET_MODE (op0); + +- /* Entry mode is set to AVX_U128_DIRTY if there are +- 256bit or 512bit modes used in function arguments. */ +- for (arg = DECL_ARGUMENTS (current_function_decl); arg; +- arg = TREE_CHAIN (arg)) ++ if (SCALAR_FLOAT_MODE_P (mode)) + { +- rtx incoming = DECL_INCOMING_RTL (arg); +- +- if (incoming && ix86_check_avx_upper_register (incoming)) +- return AVX_U128_DIRTY; ++ gcc_assert (!DECIMAL_FLOAT_MODE_P (mode)); ++ return CCFPmode; + } + +- return AVX_U128_CLEAN; +-} +- +-/* Return a mode that ENTITY is assumed to be +- switched to at function entry. */ +- +-static int +-ix86_mode_entry (int entity) +-{ +- switch (entity) ++ switch (code) + { +- case X86_DIRFLAG: +- return ix86_dirflag_mode_entry (); +- case AVX_U128: +- return ix86_avx_u128_mode_entry (); +- case I387_TRUNC: +- case I387_FLOOR: +- case I387_CEIL: +- return I387_CW_ANY; ++ /* Only zero flag is needed. 
*/ ++ case EQ: /* ZF=0 */ ++ case NE: /* ZF!=0 */ ++ return CCZmode; ++ /* Codes needing carry flag. */ ++ case GEU: /* CF=0 */ ++ case LTU: /* CF=1 */ ++ /* Detect overflow checks. They need just the carry flag. */ ++ if (GET_CODE (op0) == PLUS ++ && (rtx_equal_p (op1, XEXP (op0, 0)) ++ || rtx_equal_p (op1, XEXP (op0, 1)))) ++ return CCCmode; ++ else ++ return CCmode; ++ case GTU: /* CF=0 & ZF=0 */ ++ case LEU: /* CF=1 | ZF=1 */ ++ return CCmode; ++ /* Codes possibly doable only with sign flag when ++ comparing against zero. */ ++ case GE: /* SF=OF or SF=0 */ ++ case LT: /* SF<>OF or SF=1 */ ++ if (op1 == const0_rtx) ++ return CCGOCmode; ++ else ++ /* For other cases Carry flag is not required. */ ++ return CCGCmode; ++ /* Codes doable only with sign flag when comparing ++ against zero, but we miss jump instruction for it ++ so we need to use relational tests against overflow ++ that thus needs to be zero. */ ++ case GT: /* ZF=0 & SF=OF */ ++ case LE: /* ZF=1 | SF<>OF */ ++ if (op1 == const0_rtx) ++ return CCNOmode; ++ else ++ return CCGCmode; ++ /* strcmp pattern do (use flags) and combine may ask us for proper ++ mode. */ ++ case USE: ++ return CCmode; + default: + gcc_unreachable (); + } + } + +-static int +-ix86_avx_u128_mode_exit (void) +-{ +- rtx reg = crtl->return_rtx; ++/* Return the fixed registers used for condition codes. */ + +- /* Exit mode is set to AVX_U128_DIRTY if there are 256bit +- or 512 bit modes used in the function return register. */ +- if (reg && ix86_check_avx_upper_register (reg)) +- return AVX_U128_DIRTY; +- +- /* Exit mode is set to AVX_U128_DIRTY if there are 256bit or 512bit +- modes used in function arguments, otherwise return AVX_U128_CLEAN. +- */ +- return ix86_avx_u128_mode_entry (); +-} +- +-/* Return a mode that ENTITY is assumed to be +- switched to at function exit. */ +- +-static int +-ix86_mode_exit (int entity) +-{ +- switch (entity) +- { +- case X86_DIRFLAG: +- return X86_DIRFLAG_ANY; +- case AVX_U128: +- return ix86_avx_u128_mode_exit (); +- case I387_TRUNC: +- case I387_FLOOR: +- case I387_CEIL: +- return I387_CW_ANY; +- default: +- gcc_unreachable (); +- } +-} +- +-static int +-ix86_mode_priority (int, int n) ++static bool ++ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2) + { +- return n; ++ *p1 = FLAGS_REG; ++ *p2 = INVALID_REGNUM; ++ return true; + } + +-/* Output code to initialize control word copies used by trunc?f?i and +- rounding patterns. CURRENT_MODE is set to current control word, +- while NEW_MODE is set to new control word. */ ++/* If two condition code modes are compatible, return a condition code ++ mode which is compatible with both. Otherwise, return ++ VOIDmode. 
*/ + +-static void +-emit_i387_cw_initialization (int mode) ++static machine_mode ++ix86_cc_modes_compatible (machine_mode m1, machine_mode m2) + { +- rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED); +- rtx new_mode; ++ if (m1 == m2) ++ return m1; + +- enum ix86_stack_slot slot; ++ if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC) ++ return VOIDmode; + +- rtx reg = gen_reg_rtx (HImode); ++ if ((m1 == CCGCmode && m2 == CCGOCmode) ++ || (m1 == CCGOCmode && m2 == CCGCmode)) ++ return CCGCmode; + +- emit_insn (gen_x86_fnstcw_1 (stored_mode)); +- emit_move_insn (reg, copy_rtx (stored_mode)); ++ if ((m1 == CCNOmode && m2 == CCGOCmode) ++ || (m1 == CCGOCmode && m2 == CCNOmode)) ++ return CCNOmode; + +- switch (mode) ++ if (m1 == CCZmode ++ && (m2 == CCGCmode || m2 == CCGOCmode || m2 == CCNOmode)) ++ return m2; ++ else if (m2 == CCZmode ++ && (m1 == CCGCmode || m1 == CCGOCmode || m1 == CCNOmode)) ++ return m1; ++ ++ switch (m1) + { +- case I387_CW_TRUNC: +- /* round toward zero (truncate) */ +- emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00))); +- slot = SLOT_CW_TRUNC; +- break; ++ default: ++ gcc_unreachable (); + +- case I387_CW_FLOOR: +- /* round down toward -oo */ +- emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00))); +- emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400))); +- slot = SLOT_CW_FLOOR; +- break; ++ case E_CCmode: ++ case E_CCGCmode: ++ case E_CCGOCmode: ++ case E_CCNOmode: ++ case E_CCAmode: ++ case E_CCCmode: ++ case E_CCOmode: ++ case E_CCPmode: ++ case E_CCSmode: ++ case E_CCZmode: ++ switch (m2) ++ { ++ default: ++ return VOIDmode; + +- case I387_CW_CEIL: +- /* round up toward +oo */ +- emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00))); +- emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800))); +- slot = SLOT_CW_CEIL; +- break; ++ case E_CCmode: ++ case E_CCGCmode: ++ case E_CCGOCmode: ++ case E_CCNOmode: ++ case E_CCAmode: ++ case E_CCCmode: ++ case E_CCOmode: ++ case E_CCPmode: ++ case E_CCSmode: ++ case E_CCZmode: ++ return CCmode; ++ } + +- default: +- gcc_unreachable (); ++ case E_CCFPmode: ++ /* These are only compatible with themselves, which we already ++ checked above. */ ++ return VOIDmode; + } ++} + +- gcc_assert (slot < MAX_386_STACK_LOCALS); ++/* Return strategy to use for floating-point. We assume that fcomi is always ++ preferrable where available, since that is also true when looking at size ++ (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */ + +- new_mode = assign_386_stack_local (HImode, slot); +- emit_move_insn (new_mode, reg); ++enum ix86_fpcmp_strategy ++ix86_fp_comparison_strategy (enum rtx_code) ++{ ++ /* Do fcomi/sahf based test when profitable. */ ++ ++ if (TARGET_CMOVE) ++ return IX86_FPCMP_COMI; ++ ++ if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ())) ++ return IX86_FPCMP_SAHF; ++ ++ return IX86_FPCMP_ARITH; + } + +-/* Generate one or more insns to set ENTITY to MODE. */ ++/* Convert comparison codes we use to represent FP comparison to integer ++ code that will result in proper branch. Return UNKNOWN if no such code ++ is available. 
*/ + +-static void +-ix86_emit_mode_set (int entity, int mode, int prev_mode ATTRIBUTE_UNUSED, +- HARD_REG_SET regs_live ATTRIBUTE_UNUSED) ++enum rtx_code ++ix86_fp_compare_code_to_integer (enum rtx_code code) + { +- switch (entity) ++ switch (code) + { +- case X86_DIRFLAG: +- if (mode == X86_DIRFLAG_RESET) +- emit_insn (gen_cld ()); +- break; +- case AVX_U128: +- if (mode == AVX_U128_CLEAN) +- emit_insn (gen_avx_vzeroupper ()); +- break; +- case I387_TRUNC: +- case I387_FLOOR: +- case I387_CEIL: +- if (mode != I387_CW_ANY +- && mode != I387_CW_UNINITIALIZED) +- emit_i387_cw_initialization (mode); +- break; ++ case GT: ++ return GTU; ++ case GE: ++ return GEU; ++ case ORDERED: ++ case UNORDERED: ++ return code; ++ case UNEQ: ++ return EQ; ++ case UNLT: ++ return LTU; ++ case UNLE: ++ return LEU; ++ case LTGT: ++ return NE; + default: +- gcc_unreachable (); ++ return UNKNOWN; + } + } + +-/* Output code for INSN to convert a float to a signed int. OPERANDS +- are the insn operands. The output may be [HSD]Imode and the input +- operand may be [SDX]Fmode. */ +- +-const char * +-output_fix_trunc (rtx_insn *insn, rtx *operands, bool fisttp) ++/* Zero extend possibly SImode EXP to Pmode register. */ ++rtx ++ix86_zero_extend_to_Pmode (rtx exp) + { +- bool stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG); +- bool dimode_p = GET_MODE (operands[0]) == DImode; +- int round_mode = get_attr_i387_cw (insn); +- +- static char buf[40]; +- const char *p; +- +- /* Jump through a hoop or two for DImode, since the hardware has no +- non-popping instruction. We used to do this a different way, but +- that was somewhat fragile and broke with post-reload splitters. */ +- if ((dimode_p || fisttp) && !stack_top_dies) +- output_asm_insn ("fld\t%y1", operands); +- +- gcc_assert (STACK_TOP_P (operands[1])); +- gcc_assert (MEM_P (operands[0])); +- gcc_assert (GET_MODE (operands[1]) != TFmode); +- +- if (fisttp) +- return "fisttp%Z0\t%0"; +- +- strcpy (buf, "fist"); ++ return force_reg (Pmode, convert_to_mode (Pmode, exp, 1)); ++} + +- if (round_mode != I387_CW_ANY) +- output_asm_insn ("fldcw\t%3", operands); ++/* Return true if the function being called was marked with attribute ++ "noplt" or using -fno-plt and we are compiling for non-PIC. We need ++ to handle the non-PIC case in the backend because there is no easy ++ interface for the front-end to force non-PLT calls to use the GOT. ++ This is currently used only with 64-bit or 32-bit GOT32X ELF targets ++ to call the function marked "noplt" indirectly. */ + +- p = "p%Z0\t%0"; +- strcat (buf, p + !(stack_top_dies || dimode_p)); ++static bool ++ix86_nopic_noplt_attribute_p (rtx call_op) ++{ ++ if (flag_pic || ix86_cmodel == CM_LARGE ++ || !(TARGET_64BIT || HAVE_AS_IX86_GOT32X) ++ || TARGET_MACHO || TARGET_SEH || TARGET_PECOFF ++ || SYMBOL_REF_LOCAL_P (call_op)) ++ return false; + +- output_asm_insn (buf, operands); ++ tree symbol_decl = SYMBOL_REF_DECL (call_op); + +- if (round_mode != I387_CW_ANY) +- output_asm_insn ("fldcw\t%2", operands); ++ if (!flag_plt ++ || (symbol_decl != NULL_TREE ++ && lookup_attribute ("noplt", DECL_ATTRIBUTES (symbol_decl)))) ++ return true; + +- return ""; ++ return false; + } + +-/* Output code for x87 ffreep insn. The OPNO argument, which may only +- have the values zero or one, indicates the ffreep insn's operand +- from the OPERANDS array. */ +- +-static const char * +-output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno) ++/* Helper to output the jmp/call. 
*/ ++static void ++ix86_output_jmp_thunk_or_indirect (const char *thunk_name, const int regno) + { +- if (TARGET_USE_FFREEP) +-#ifdef HAVE_AS_IX86_FFREEP +- return opno ? "ffreep\t%y1" : "ffreep\t%y0"; +-#else ++ if (thunk_name != NULL) + { +- static char retval[32]; +- int regno = REGNO (operands[opno]); +- +- gcc_assert (STACK_REGNO_P (regno)); +- +- regno -= FIRST_STACK_REG; +- +- snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno); +- return retval; ++ fprintf (asm_out_file, "\tjmp\t"); ++ assemble_name (asm_out_file, thunk_name); ++ putc ('\n', asm_out_file); + } +-#endif +- +- return opno ? "fstp\t%y1" : "fstp\t%y0"; ++ else ++ output_indirect_thunk (regno); + } + ++/* Output indirect branch via a call and return thunk. CALL_OP is a ++ register which contains the branch target. XASM is the assembly ++ template for CALL_OP. Branch is a tail call if SIBCALL_P is true. ++ A normal call is converted to: + +-/* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi +- should be used. UNORDERED_P is true when fucom should be used. */ +- +-const char * +-output_fp_compare (rtx_insn *insn, rtx *operands, +- bool eflags_p, bool unordered_p) +-{ +- rtx *xops = eflags_p ? &operands[0] : &operands[1]; +- bool stack_top_dies; ++ call __x86_indirect_thunk_reg + +- static char buf[40]; +- const char *p; ++ and a tail call is converted to: + +- gcc_assert (STACK_TOP_P (xops[0])); ++ jmp __x86_indirect_thunk_reg ++ */ + +- stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG); ++static void ++ix86_output_indirect_branch_via_reg (rtx call_op, bool sibcall_p) ++{ ++ char thunk_name_buf[32]; ++ char *thunk_name; ++ enum indirect_thunk_prefix need_prefix ++ = indirect_thunk_need_prefix (current_output_insn); ++ int regno = REGNO (call_op); + +- if (eflags_p) ++ if (cfun->machine->indirect_branch_type ++ != indirect_branch_thunk_inline) + { +- p = unordered_p ? "fucomi" : "fcomi"; +- strcpy (buf, p); +- +- p = "p\t{%y1, %0|%0, %y1}"; +- strcat (buf, p + !stack_top_dies); +- +- return buf; ++ if (cfun->machine->indirect_branch_type == indirect_branch_thunk) ++ { ++ int i = regno; ++ if (i >= FIRST_REX_INT_REG) ++ i -= (FIRST_REX_INT_REG - LAST_INT_REG - 1); ++ indirect_thunks_used |= 1 << i; ++ } ++ indirect_thunk_name (thunk_name_buf, regno, need_prefix, false); ++ thunk_name = thunk_name_buf; + } ++ else ++ thunk_name = NULL; + +- if (STACK_REG_P (xops[1]) +- && stack_top_dies +- && find_regno_note (insn, REG_DEAD, FIRST_STACK_REG + 1)) +- { +- gcc_assert (REGNO (xops[1]) == FIRST_STACK_REG + 1); +- +- /* If both the top of the 387 stack die, and the other operand +- is also a stack register that dies, then this must be a +- `fcompp' float compare. */ +- p = unordered_p ? "fucompp" : "fcompp"; +- strcpy (buf, p); +- } +- else if (const0_operand (xops[1], VOIDmode)) +- { +- gcc_assert (!unordered_p); +- strcpy (buf, "ftst"); +- } ++ if (sibcall_p) ++ ix86_output_jmp_thunk_or_indirect (thunk_name, regno); + else + { +- if (GET_MODE_CLASS (GET_MODE (xops[1])) == MODE_INT) ++ if (thunk_name != NULL) + { +- gcc_assert (!unordered_p); +- p = "ficom"; ++ fprintf (asm_out_file, "\tcall\t"); ++ assemble_name (asm_out_file, thunk_name); ++ putc ('\n', asm_out_file); ++ return; + } +- else +- p = unordered_p ? 
"fucom" : "fcom"; +- +- strcpy (buf, p); + +- p = "p%Z2\t%y2"; +- strcat (buf, p + !stack_top_dies); +- } ++ char indirectlabel1[32]; ++ char indirectlabel2[32]; + +- output_asm_insn (buf, operands); +- return "fnstsw\t%0"; +-} ++ ASM_GENERATE_INTERNAL_LABEL (indirectlabel1, ++ INDIRECT_LABEL, ++ indirectlabelno++); ++ ASM_GENERATE_INTERNAL_LABEL (indirectlabel2, ++ INDIRECT_LABEL, ++ indirectlabelno++); + +-void +-ix86_output_addr_vec_elt (FILE *file, int value) +-{ +- const char *directive = ASM_LONG; ++ /* Jump. */ ++ fputs ("\tjmp\t", asm_out_file); ++ assemble_name_raw (asm_out_file, indirectlabel2); ++ fputc ('\n', asm_out_file); + +-#ifdef ASM_QUAD +- if (TARGET_LP64) +- directive = ASM_QUAD; +-#else +- gcc_assert (!TARGET_64BIT); +-#endif ++ ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel1); + +- fprintf (file, "%s%s%d\n", directive, LPREFIX, value); +-} ++ ix86_output_jmp_thunk_or_indirect (thunk_name, regno); + +-void +-ix86_output_addr_diff_elt (FILE *file, int value, int rel) +-{ +- const char *directive = ASM_LONG; ++ ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel2); + +-#ifdef ASM_QUAD +- if (TARGET_64BIT && CASE_VECTOR_MODE == DImode) +- directive = ASM_QUAD; +-#else +- gcc_assert (!TARGET_64BIT); +-#endif +- /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */ +- if (TARGET_64BIT || TARGET_VXWORKS_RTP) +- fprintf (file, "%s%s%d-%s%d\n", +- directive, LPREFIX, value, LPREFIX, rel); +-#if TARGET_MACHO +- else if (TARGET_MACHO) +- { +- fprintf (file, ASM_LONG "%s%d-", LPREFIX, value); +- machopic_output_function_base_name (file); +- putc ('\n', file); ++ /* Call. */ ++ fputs ("\tcall\t", asm_out_file); ++ assemble_name_raw (asm_out_file, indirectlabel1); ++ fputc ('\n', asm_out_file); + } +-#endif +- else if (HAVE_AS_GOTOFF_IN_DATA) +- fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value); +- else +- asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n", +- GOT_SYMBOL_NAME, LPREFIX, value); + } +- +-/* Generate either "mov $0, reg" or "xor reg, reg", as appropriate +- for the target. */ +- +-void +-ix86_expand_clear (rtx dest) +-{ +- rtx tmp; + +- /* We play register width games, which are only valid after reload. */ +- gcc_assert (reload_completed); ++/* Output indirect branch via a call and return thunk. CALL_OP is ++ the branch target. XASM is the assembly template for CALL_OP. ++ Branch is a tail call if SIBCALL_P is true. A normal call is ++ converted to: + +- /* Avoid HImode and its attendant prefix byte. 
*/ +- if (GET_MODE_SIZE (GET_MODE (dest)) < 4) +- dest = gen_rtx_REG (SImode, REGNO (dest)); +- tmp = gen_rtx_SET (dest, const0_rtx); ++ jmp L2 ++ L1: ++ push CALL_OP ++ jmp __x86_indirect_thunk ++ L2: ++ call L1 + +- if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ()) +- { +- rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); +- tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob)); +- } ++ and a tail call is converted to: + +- emit_insn (tmp); +-} ++ push CALL_OP ++ jmp __x86_indirect_thunk ++ */ + +-void +-ix86_expand_move (machine_mode mode, rtx operands[]) ++static void ++ix86_output_indirect_branch_via_push (rtx call_op, const char *xasm, ++ bool sibcall_p) + { +- rtx op0, op1; +- rtx tmp, addend = NULL_RTX; +- enum tls_model model; ++ char thunk_name_buf[32]; ++ char *thunk_name; ++ char push_buf[64]; ++ enum indirect_thunk_prefix need_prefix ++ = indirect_thunk_need_prefix (current_output_insn); ++ int regno = -1; ++ ++ if (cfun->machine->indirect_branch_type ++ != indirect_branch_thunk_inline) ++ { ++ if (cfun->machine->indirect_branch_type == indirect_branch_thunk) ++ indirect_thunk_needed = true; ++ indirect_thunk_name (thunk_name_buf, regno, need_prefix, false); ++ thunk_name = thunk_name_buf; ++ } ++ else ++ thunk_name = NULL; + +- op0 = operands[0]; +- op1 = operands[1]; ++ snprintf (push_buf, sizeof (push_buf), "push{%c}\t%s", ++ TARGET_64BIT ? 'q' : 'l', xasm); + +- switch (GET_CODE (op1)) ++ if (sibcall_p) + { +- case CONST: +- tmp = XEXP (op1, 0); ++ output_asm_insn (push_buf, &call_op); ++ ix86_output_jmp_thunk_or_indirect (thunk_name, regno); ++ } ++ else ++ { ++ char indirectlabel1[32]; ++ char indirectlabel2[32]; + +- if (GET_CODE (tmp) != PLUS +- || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF) +- break; ++ ASM_GENERATE_INTERNAL_LABEL (indirectlabel1, ++ INDIRECT_LABEL, ++ indirectlabelno++); ++ ASM_GENERATE_INTERNAL_LABEL (indirectlabel2, ++ INDIRECT_LABEL, ++ indirectlabelno++); + +- op1 = XEXP (tmp, 0); +- addend = XEXP (tmp, 1); +- /* FALLTHRU */ ++ /* Jump. */ ++ fputs ("\tjmp\t", asm_out_file); ++ assemble_name_raw (asm_out_file, indirectlabel2); ++ fputc ('\n', asm_out_file); + +- case SYMBOL_REF: +- model = SYMBOL_REF_TLS_MODEL (op1); ++ ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel1); + +- if (model) +- op1 = legitimize_tls_address (op1, model, true); +- else if (ix86_force_load_from_GOT_p (op1)) +- { +- /* Load the external function address via GOT slot to avoid PLT. */ +- op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1), +- (TARGET_64BIT +- ? UNSPEC_GOTPCREL +- : UNSPEC_GOT)); +- op1 = gen_rtx_CONST (Pmode, op1); +- op1 = gen_const_mem (Pmode, op1); +- set_mem_alias_set (op1, ix86_GOT_alias_set ()); +- } +- else ++ /* An external function may be called via GOT, instead of PLT. 
*/ ++ if (MEM_P (call_op)) + { +- tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX); +- if (tmp) +- { +- op1 = tmp; +- if (!addend) +- break; +- } +- else ++ struct ix86_address parts; ++ rtx addr = XEXP (call_op, 0); ++ if (ix86_decompose_address (addr, &parts) ++ && parts.base == stack_pointer_rtx) + { +- op1 = operands[1]; +- break; +- } +- } +- +- if (addend) +- { +- op1 = force_operand (op1, NULL_RTX); +- op1 = expand_simple_binop (Pmode, PLUS, op1, addend, +- op0, 1, OPTAB_DIRECT); +- } +- else +- op1 = force_operand (op1, op0); +- +- if (op1 == op0) +- return; +- +- op1 = convert_to_mode (mode, op1, 1); ++ /* Since call will adjust stack by -UNITS_PER_WORD, ++ we must convert "disp(stack, index, scale)" to ++ "disp+UNITS_PER_WORD(stack, index, scale)". */ ++ if (parts.index) ++ { ++ addr = gen_rtx_MULT (Pmode, parts.index, ++ GEN_INT (parts.scale)); ++ addr = gen_rtx_PLUS (Pmode, stack_pointer_rtx, ++ addr); ++ } ++ else ++ addr = stack_pointer_rtx; + +- default: +- break; +- } ++ rtx disp; ++ if (parts.disp != NULL_RTX) ++ disp = plus_constant (Pmode, parts.disp, ++ UNITS_PER_WORD); ++ else ++ disp = GEN_INT (UNITS_PER_WORD); + +- if ((flag_pic || MACHOPIC_INDIRECT) +- && symbolic_operand (op1, mode)) +- { +- if (TARGET_MACHO && !TARGET_64BIT) +- { +-#if TARGET_MACHO +- /* dynamic-no-pic */ +- if (MACHOPIC_INDIRECT) +- { +- rtx temp = (op0 && REG_P (op0) && mode == Pmode) +- ? op0 : gen_reg_rtx (Pmode); +- op1 = machopic_indirect_data_reference (op1, temp); +- if (MACHOPIC_PURE) +- op1 = machopic_legitimize_pic_address (op1, mode, +- temp == op1 ? 0 : temp); +- } +- if (op0 != op1 && GET_CODE (op0) != MEM) +- { +- rtx insn = gen_rtx_SET (op0, op1); +- emit_insn (insn); +- return; +- } +- if (GET_CODE (op0) == MEM) +- op1 = force_reg (Pmode, op1); +- else +- { +- rtx temp = op0; +- if (GET_CODE (temp) != REG) +- temp = gen_reg_rtx (Pmode); +- temp = legitimize_pic_address (op1, temp); +- if (temp == op0) +- return; +- op1 = temp; +- } +- /* dynamic-no-pic */ +-#endif +- } +- else +- { +- if (MEM_P (op0)) +- op1 = force_reg (mode, op1); +- else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode))) +- { +- rtx reg = can_create_pseudo_p () ? NULL_RTX : op0; +- op1 = legitimize_pic_address (op1, reg); +- if (op0 == op1) +- return; +- op1 = convert_to_mode (mode, op1, 1); +- } +- } +- } +- else +- { +- if (MEM_P (op0) +- && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode) +- || !push_operand (op0, mode)) +- && MEM_P (op1)) +- op1 = force_reg (mode, op1); +- +- if (push_operand (op0, mode) +- && ! general_no_elim_operand (op1, mode)) +- op1 = copy_to_mode_reg (mode, op1); +- +- /* Force large constants in 64bit compilation into register +- to get them CSEed. */ +- if (can_create_pseudo_p () +- && (mode == DImode) && TARGET_64BIT +- && immediate_operand (op1, mode) +- && !x86_64_zext_immediate_operand (op1, VOIDmode) +- && !register_operand (op0, mode) +- && optimize) +- op1 = copy_to_mode_reg (mode, op1); +- +- if (can_create_pseudo_p () +- && CONST_DOUBLE_P (op1)) +- { +- /* If we are loading a floating point constant to a register, +- force the value to memory now, since we'll get better code +- out the back end. 
*/ +- +- op1 = validize_mem (force_const_mem (mode, op1)); +- if (!register_operand (op0, mode)) +- { +- rtx temp = gen_reg_rtx (mode); +- emit_insn (gen_rtx_SET (temp, op1)); +- emit_move_insn (op0, temp); +- return; ++ addr = gen_rtx_PLUS (Pmode, addr, disp); ++ call_op = gen_rtx_MEM (GET_MODE (call_op), addr); + } + } +- } +- +- emit_insn (gen_rtx_SET (op0, op1)); +-} +- +-void +-ix86_expand_vector_move (machine_mode mode, rtx operands[]) +-{ +- rtx op0 = operands[0], op1 = operands[1]; +- /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU +- psABI since the biggest alignment is 4 byte for IA MCU psABI. */ +- unsigned int align = (TARGET_IAMCU +- ? GET_MODE_BITSIZE (mode) +- : GET_MODE_ALIGNMENT (mode)); +- +- if (push_operand (op0, VOIDmode)) +- op0 = emit_move_resolve_push (mode, op0); +- +- /* Force constants other than zero into memory. We do not know how +- the instructions used to build constants modify the upper 64 bits +- of the register, once we have that information we may be able +- to handle some of them more efficiently. */ +- if (can_create_pseudo_p () +- && (CONSTANT_P (op1) +- || (SUBREG_P (op1) +- && CONSTANT_P (SUBREG_REG (op1)))) +- && ((register_operand (op0, mode) +- && !standard_sse_constant_p (op1, mode)) +- /* ix86_expand_vector_move_misalign() does not like constants. */ +- || (SSE_REG_MODE_P (mode) +- && MEM_P (op0) +- && MEM_ALIGN (op0) < align))) +- { +- if (SUBREG_P (op1)) +- { +- machine_mode imode = GET_MODE (SUBREG_REG (op1)); +- rtx r = force_const_mem (imode, SUBREG_REG (op1)); +- if (r) +- r = validize_mem (r); +- else +- r = force_reg (imode, SUBREG_REG (op1)); +- op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1)); +- } +- else +- op1 = validize_mem (force_const_mem (mode, op1)); +- } + +- /* We need to check memory alignment for SSE mode since attribute +- can make operands unaligned. */ +- if (can_create_pseudo_p () +- && SSE_REG_MODE_P (mode) +- && ((MEM_P (op0) && (MEM_ALIGN (op0) < align)) +- || (MEM_P (op1) && (MEM_ALIGN (op1) < align)))) +- { +- rtx tmp[2]; ++ output_asm_insn (push_buf, &call_op); + +- /* ix86_expand_vector_move_misalign() does not like both +- arguments in memory. */ +- if (!register_operand (op0, mode) +- && !register_operand (op1, mode)) +- op1 = force_reg (mode, op1); ++ ix86_output_jmp_thunk_or_indirect (thunk_name, regno); + +- tmp[0] = op0; tmp[1] = op1; +- ix86_expand_vector_move_misalign (mode, tmp); +- return; +- } ++ ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel2); + +- /* Make operand1 a register if it isn't already. */ +- if (can_create_pseudo_p () +- && !register_operand (op0, mode) +- && !register_operand (op1, mode)) +- { +- emit_move_insn (op0, force_reg (GET_MODE (op0), op1)); +- return; ++ /* Call. */ ++ fputs ("\tcall\t", asm_out_file); ++ assemble_name_raw (asm_out_file, indirectlabel1); ++ fputc ('\n', asm_out_file); + } +- +- emit_insn (gen_rtx_SET (op0, op1)); + } + +-/* Split 32-byte AVX unaligned load and store if needed. */ ++/* Output indirect branch via a call and return thunk. CALL_OP is ++ the branch target. XASM is the assembly template for CALL_OP. ++ Branch is a tail call if SIBCALL_P is true. 
*/ + + static void +-ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1) ++ix86_output_indirect_branch (rtx call_op, const char *xasm, ++ bool sibcall_p) + { +- rtx m; +- rtx (*extract) (rtx, rtx, rtx); +- machine_mode mode; ++ if (REG_P (call_op)) ++ ix86_output_indirect_branch_via_reg (call_op, sibcall_p); ++ else ++ ix86_output_indirect_branch_via_push (call_op, xasm, sibcall_p); ++} ++ ++/* Output indirect jump. CALL_OP is the jump target. */ + +- if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD) +- || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE)) ++const char * ++ix86_output_indirect_jmp (rtx call_op) ++{ ++ if (cfun->machine->indirect_branch_type != indirect_branch_keep) + { +- emit_insn (gen_rtx_SET (op0, op1)); +- return; ++ /* We can't have red-zone since "call" in the indirect thunk ++ pushes the return address onto stack, destroying red-zone. */ ++ if (ix86_red_zone_size != 0) ++ gcc_unreachable (); ++ ++ ix86_output_indirect_branch (call_op, "%0", true); ++ return ""; + } ++ else ++ return "%!jmp\t%A0"; ++} + +- rtx orig_op0 = NULL_RTX; +- mode = GET_MODE (op0); +- switch (GET_MODE_CLASS (mode)) ++/* Output return instrumentation for current function if needed. */ ++ ++static void ++output_return_instrumentation (void) ++{ ++ if (ix86_instrument_return != instrument_return_none ++ && flag_fentry ++ && !DECL_NO_INSTRUMENT_FUNCTION_ENTRY_EXIT (cfun->decl)) + { +- case MODE_VECTOR_INT: +- case MODE_INT: +- if (mode != V32QImode) ++ if (ix86_flag_record_return) ++ fprintf (asm_out_file, "1:\n"); ++ switch (ix86_instrument_return) + { +- if (!MEM_P (op0)) +- { +- orig_op0 = op0; +- op0 = gen_reg_rtx (V32QImode); +- } +- else +- op0 = gen_lowpart (V32QImode, op0); +- op1 = gen_lowpart (V32QImode, op1); +- mode = V32QImode; ++ case instrument_return_call: ++ fprintf (asm_out_file, "\tcall\t__return__\n"); ++ break; ++ case instrument_return_nop5: ++ /* 5 byte nop: nopl 0(%[re]ax,%[re]ax,1) */ ++ fprintf (asm_out_file, ASM_BYTE "0x0f, 0x1f, 0x44, 0x00, 0x00\n"); ++ break; ++ case instrument_return_none: ++ break; + } +- break; +- case MODE_VECTOR_FLOAT: +- break; +- default: +- gcc_unreachable (); +- } +- +- switch (mode) +- { +- default: +- gcc_unreachable (); +- case E_V32QImode: +- extract = gen_avx_vextractf128v32qi; +- mode = V16QImode; +- break; +- case E_V8SFmode: +- extract = gen_avx_vextractf128v8sf; +- mode = V4SFmode; +- break; +- case E_V4DFmode: +- extract = gen_avx_vextractf128v4df; +- mode = V2DFmode; +- break; +- } + +- if (MEM_P (op1)) +- { +- rtx r = gen_reg_rtx (mode); +- m = adjust_address (op1, mode, 0); +- emit_move_insn (r, m); +- m = adjust_address (op1, mode, 16); +- r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m); +- emit_move_insn (op0, r); +- } +- else if (MEM_P (op0)) +- { +- m = adjust_address (op0, mode, 0); +- emit_insn (extract (m, op1, const0_rtx)); +- m = adjust_address (op0, mode, 16); +- emit_insn (extract (m, copy_rtx (op1), const1_rtx)); ++ if (ix86_flag_record_return) ++ { ++ fprintf (asm_out_file, "\t.section __return_loc, \"a\",@progbits\n"); ++ fprintf (asm_out_file, "\t.%s 1b\n", TARGET_64BIT ? "quad" : "long"); ++ fprintf (asm_out_file, "\t.previous\n"); ++ } + } +- else +- gcc_unreachable (); +- +- if (orig_op0) +- emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0)); + } + +-/* Implement the movmisalign patterns for SSE. Non-SSE modes go +- straight to ix86_expand_vector_move. 
*/ +-/* Code generation for scalar reg-reg moves of single and double precision data: +- if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true) +- movaps reg, reg +- else +- movss reg, reg +- if (x86_sse_partial_reg_dependency == true) +- movapd reg, reg +- else +- movsd reg, reg +- +- Code generation for scalar loads of double precision data: +- if (x86_sse_split_regs == true) +- movlpd mem, reg (gas syntax) +- else +- movsd mem, reg +- +- Code generation for unaligned packed loads of single precision data +- (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency): +- if (x86_sse_unaligned_move_optimal) +- movups mem, reg +- +- if (x86_sse_partial_reg_dependency == true) +- { +- xorps reg, reg +- movlps mem, reg +- movhps mem+8, reg +- } +- else +- { +- movlps mem, reg +- movhps mem+8, reg +- } +- +- Code generation for unaligned packed loads of double precision data +- (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs): +- if (x86_sse_unaligned_move_optimal) +- movupd mem, reg +- +- if (x86_sse_split_regs == true) +- { +- movlpd mem, reg +- movhpd mem+8, reg +- } +- else +- { +- movsd mem, reg +- movhpd mem+8, reg +- } +- */ ++/* Output function return. CALL_OP is the jump target. Add a REP ++ prefix to RET if LONG_P is true and function return is kept. */ + +-void +-ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[]) ++const char * ++ix86_output_function_return (bool long_p) + { +- rtx op0, op1, m; +- +- op0 = operands[0]; +- op1 = operands[1]; ++ output_return_instrumentation (); + +- /* Use unaligned load/store for AVX512 or when optimizing for size. */ +- if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ()) ++ if (cfun->machine->function_return_type != indirect_branch_keep) + { +- emit_insn (gen_rtx_SET (op0, op1)); +- return; +- } ++ char thunk_name[32]; ++ enum indirect_thunk_prefix need_prefix ++ = indirect_thunk_need_prefix (current_output_insn); + +- if (TARGET_AVX) +- { +- if (GET_MODE_SIZE (mode) == 32) +- ix86_avx256_split_vector_move_misalign (op0, op1); ++ if (cfun->machine->function_return_type ++ != indirect_branch_thunk_inline) ++ { ++ bool need_thunk = (cfun->machine->function_return_type ++ == indirect_branch_thunk); ++ indirect_thunk_name (thunk_name, INVALID_REGNUM, need_prefix, ++ true); ++ indirect_return_needed |= need_thunk; ++ fprintf (asm_out_file, "\tjmp\t"); ++ assemble_name (asm_out_file, thunk_name); ++ putc ('\n', asm_out_file); ++ } + else +- /* Always use 128-bit mov_internal pattern for AVX. */ +- emit_insn (gen_rtx_SET (op0, op1)); +- return; +- } +- +- if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL +- || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) +- { +- emit_insn (gen_rtx_SET (op0, op1)); +- return; +- } ++ output_indirect_thunk (INVALID_REGNUM); + +- /* ??? If we have typed data, then it would appear that using +- movdqu is the only way to get unaligned data loaded with +- integer type. */ +- if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT) +- { +- emit_insn (gen_rtx_SET (op0, op1)); +- return; ++ return ""; + } + +- if (MEM_P (op1)) +- { +- if (TARGET_SSE2 && mode == V2DFmode) +- { +- rtx zero; +- +- /* When SSE registers are split into halves, we can avoid +- writing to the top half twice. */ +- if (TARGET_SSE_SPLIT_REGS) +- { +- emit_clobber (op0); +- zero = op0; +- } +- else +- { +- /* ??? Not sure about the best option for the Intel chips. +- The following would seem to satisfy; the register is +- entirely cleared, breaking the dependency chain. 
We +- then store to the upper half, with a dependency depth +- of one. A rumor has it that Intel recommends two movsd +- followed by an unpacklpd, but this is unconfirmed. And +- given that the dependency depth of the unpacklpd would +- still be one, I'm not sure why this would be better. */ +- zero = CONST0_RTX (V2DFmode); +- } ++ if (!long_p) ++ return "%!ret"; + +- m = adjust_address (op1, DFmode, 0); +- emit_insn (gen_sse2_loadlpd (op0, zero, m)); +- m = adjust_address (op1, DFmode, 8); +- emit_insn (gen_sse2_loadhpd (op0, op0, m)); +- } +- else +- { +- rtx t; ++ return "rep%; ret"; ++} + +- if (mode != V4SFmode) +- t = gen_reg_rtx (V4SFmode); +- else +- t = op0; +- +- if (TARGET_SSE_PARTIAL_REG_DEPENDENCY) +- emit_move_insn (t, CONST0_RTX (V4SFmode)); +- else +- emit_clobber (t); ++/* Output indirect function return. RET_OP is the function return ++ target. */ + +- m = adjust_address (op1, V2SFmode, 0); +- emit_insn (gen_sse_loadlps (t, t, m)); +- m = adjust_address (op1, V2SFmode, 8); +- emit_insn (gen_sse_loadhps (t, t, m)); +- if (mode != V4SFmode) +- emit_move_insn (op0, gen_lowpart (mode, t)); +- } +- } +- else if (MEM_P (op0)) ++const char * ++ix86_output_indirect_function_return (rtx ret_op) ++{ ++ if (cfun->machine->function_return_type != indirect_branch_keep) + { +- if (TARGET_SSE2 && mode == V2DFmode) ++ char thunk_name[32]; ++ enum indirect_thunk_prefix need_prefix ++ = indirect_thunk_need_prefix (current_output_insn); ++ unsigned int regno = REGNO (ret_op); ++ gcc_assert (regno == CX_REG); ++ ++ if (cfun->machine->function_return_type ++ != indirect_branch_thunk_inline) + { +- m = adjust_address (op0, DFmode, 0); +- emit_insn (gen_sse2_storelpd (m, op1)); +- m = adjust_address (op0, DFmode, 8); +- emit_insn (gen_sse2_storehpd (m, op1)); ++ bool need_thunk = (cfun->machine->function_return_type ++ == indirect_branch_thunk); ++ indirect_thunk_name (thunk_name, regno, need_prefix, true); ++ ++ if (need_thunk) ++ { ++ indirect_return_via_cx = true; ++ indirect_thunks_used |= 1 << CX_REG; ++ } ++ fprintf (asm_out_file, "\tjmp\t"); ++ assemble_name (asm_out_file, thunk_name); ++ putc ('\n', asm_out_file); + } + else +- { +- if (mode != V4SFmode) +- op1 = gen_lowpart (V4SFmode, op1); ++ output_indirect_thunk (regno); + +- m = adjust_address (op0, V2SFmode, 0); +- emit_insn (gen_sse_storelps (m, op1)); +- m = adjust_address (op0, V2SFmode, 8); +- emit_insn (gen_sse_storehps (m, copy_rtx (op1))); +- } ++ return ""; + } + else +- gcc_unreachable (); ++ return "%!jmp\t%A0"; + } + +-/* Helper function of ix86_fixup_binary_operands to canonicalize +- operand order. Returns true if the operands should be swapped. */ ++/* Output the assembly for a call instruction. */ + +-static bool +-ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode, +- rtx operands[]) ++const char * ++ix86_output_call_insn (rtx_insn *insn, rtx call_op) + { +- rtx dst = operands[0]; +- rtx src1 = operands[1]; +- rtx src2 = operands[2]; +- +- /* If the operation is not commutative, we can't do anything. */ +- if (GET_RTX_CLASS (code) != RTX_COMM_ARITH +- && GET_RTX_CLASS (code) != RTX_COMM_COMPARE) +- return false; +- +- /* Highest priority is that src1 should match dst. */ +- if (rtx_equal_p (dst, src1)) +- return false; +- if (rtx_equal_p (dst, src2)) +- return true; +- +- /* Next highest priority is that immediate constants come second. 
*/ +- if (immediate_operand (src2, mode)) +- return false; +- if (immediate_operand (src1, mode)) +- return true; +- +- /* Lowest priority is that memory references should come second. */ +- if (MEM_P (src2)) +- return false; +- if (MEM_P (src1)) +- return true; ++ bool direct_p = constant_call_address_operand (call_op, VOIDmode); ++ bool output_indirect_p ++ = (!TARGET_SEH ++ && cfun->machine->indirect_branch_type != indirect_branch_keep); ++ bool seh_nop_p = false; ++ const char *xasm; + +- return false; +-} ++ if (SIBLING_CALL_P (insn)) ++ { ++ output_return_instrumentation (); ++ if (direct_p) ++ { ++ if (ix86_nopic_noplt_attribute_p (call_op)) ++ { ++ direct_p = false; ++ if (TARGET_64BIT) ++ { ++ if (output_indirect_p) ++ xasm = "{%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}"; ++ else ++ xasm = "%!jmp\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}"; ++ } ++ else ++ { ++ if (output_indirect_p) ++ xasm = "{%p0@GOT|[DWORD PTR %p0@GOT]}"; ++ else ++ xasm = "%!jmp\t{*%p0@GOT|[DWORD PTR %p0@GOT]}"; ++ } ++ } ++ else ++ xasm = "%!jmp\t%P0"; ++ } ++ /* SEH epilogue detection requires the indirect branch case ++ to include REX.W. */ ++ else if (TARGET_SEH) ++ xasm = "%!rex.W jmp\t%A0"; ++ else ++ { ++ if (output_indirect_p) ++ xasm = "%0"; ++ else ++ xasm = "%!jmp\t%A0"; ++ } + ++ if (output_indirect_p && !direct_p) ++ ix86_output_indirect_branch (call_op, xasm, true); ++ else ++ output_asm_insn (xasm, &call_op); ++ return ""; ++ } + +-/* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the +- destination to use for the operation. If different from the true +- destination in operands[0], a copy operation will be required. */ ++ /* SEH unwinding can require an extra nop to be emitted in several ++ circumstances. Determine if we have one of those. */ ++ if (TARGET_SEH) ++ { ++ rtx_insn *i; + +-rtx +-ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode, +- rtx operands[]) +-{ +- rtx dst = operands[0]; +- rtx src1 = operands[1]; +- rtx src2 = operands[2]; ++ for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i)) ++ { ++ /* Prevent a catch region from being adjacent to a jump that would ++ be interpreted as an epilogue sequence by the unwinder. */ ++ if (JUMP_P(i) && CROSSING_JUMP_P (i)) ++ { ++ seh_nop_p = true; ++ break; ++ } ++ ++ /* If we get to another real insn, we don't need the nop. */ ++ if (INSN_P (i)) ++ break; + +- /* Canonicalize operand order. */ +- if (ix86_swap_binary_operands_p (code, mode, operands)) +- { +- /* It is invalid to swap operands of different modes. */ +- gcc_assert (GET_MODE (src1) == GET_MODE (src2)); ++ /* If we get to the epilogue note, prevent a catch region from ++ being adjacent to the standard epilogue sequence. If non- ++ call-exceptions, we'll have done this during epilogue emission. */ ++ if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG ++ && !flag_non_call_exceptions ++ && !can_throw_internal (insn)) ++ { ++ seh_nop_p = true; ++ break; ++ } ++ } + +- std::swap (src1, src2); ++ /* If we didn't find a real insn following the call, prevent the ++ unwinder from looking into the next function. */ ++ if (i == NULL) ++ seh_nop_p = true; + } + +- /* Both source operands cannot be in memory. */ +- if (MEM_P (src1) && MEM_P (src2)) ++ if (direct_p) + { +- /* Optimization: Only read from memory once. 
*/ +- if (rtx_equal_p (src1, src2)) ++ if (ix86_nopic_noplt_attribute_p (call_op)) + { +- src2 = force_reg (mode, src2); +- src1 = src2; ++ direct_p = false; ++ if (TARGET_64BIT) ++ { ++ if (output_indirect_p) ++ xasm = "{%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}"; ++ else ++ xasm = "%!call\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}"; ++ } ++ else ++ { ++ if (output_indirect_p) ++ xasm = "{%p0@GOT|[DWORD PTR %p0@GOT]}"; ++ else ++ xasm = "%!call\t{*%p0@GOT|[DWORD PTR %p0@GOT]}"; ++ } + } +- else if (rtx_equal_p (dst, src1)) +- src2 = force_reg (mode, src2); + else +- src1 = force_reg (mode, src1); ++ xasm = "%!call\t%P0"; ++ } ++ else ++ { ++ if (output_indirect_p) ++ xasm = "%0"; ++ else ++ xasm = "%!call\t%A0"; + } + +- /* If the destination is memory, and we do not have matching source +- operands, do things in registers. */ +- if (MEM_P (dst) && !rtx_equal_p (dst, src1)) +- dst = gen_reg_rtx (mode); +- +- /* Source 1 cannot be a constant. */ +- if (CONSTANT_P (src1)) +- src1 = force_reg (mode, src1); +- +- /* Source 1 cannot be a non-matching memory. */ +- if (MEM_P (src1) && !rtx_equal_p (dst, src1)) +- src1 = force_reg (mode, src1); +- +- /* Improve address combine. */ +- if (code == PLUS +- && GET_MODE_CLASS (mode) == MODE_INT +- && MEM_P (src2)) +- src2 = force_reg (mode, src2); +- +- operands[1] = src1; +- operands[2] = src2; +- return dst; +-} ++ if (output_indirect_p && !direct_p) ++ ix86_output_indirect_branch (call_op, xasm, false); ++ else ++ output_asm_insn (xasm, &call_op); + +-/* Similarly, but assume that the destination has already been +- set up properly. */ ++ if (seh_nop_p) ++ return "nop"; + +-void +-ix86_fixup_binary_operands_no_copy (enum rtx_code code, +- machine_mode mode, rtx operands[]) +-{ +- rtx dst = ix86_fixup_binary_operands (code, mode, operands); +- gcc_assert (dst == operands[0]); ++ return ""; + } ++ ++/* Return a MEM corresponding to a stack slot with mode MODE. ++ Allocate a new slot if necessary. + +-/* Attempt to expand a binary operator. Make the expansion closer to the +- actual machine, then just general_operand, which will allow 3 separate +- memory references (one output, two input) in a single insn. */ ++ The RTL for a function can have several slots available: N is ++ which slot to use. */ + +-void +-ix86_expand_binary_operator (enum rtx_code code, machine_mode mode, +- rtx operands[]) ++rtx ++assign_386_stack_local (machine_mode mode, enum ix86_stack_slot n) + { +- rtx src1, src2, dst, op, clob; +- +- dst = ix86_fixup_binary_operands (code, mode, operands); +- src1 = operands[1]; +- src2 = operands[2]; ++ struct stack_local_entry *s; + +- /* Emit the instruction. */ ++ gcc_assert (n < MAX_386_STACK_LOCALS); + +- op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2)); ++ for (s = ix86_stack_locals; s; s = s->next) ++ if (s->mode == mode && s->n == n) ++ return validize_mem (copy_rtx (s->rtl)); + +- if (reload_completed +- && code == PLUS +- && !rtx_equal_p (dst, src1)) +- { +- /* This is going to be an LEA; avoid splitting it later. */ +- emit_insn (op); +- } +- else +- { +- clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); +- emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob))); +- } ++ s = ggc_alloc (); ++ s->n = n; ++ s->mode = mode; ++ s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0); + +- /* Fix up the destination if needed. 
*/ +- if (dst != operands[0]) +- emit_move_insn (operands[0], dst); ++ s->next = ix86_stack_locals; ++ ix86_stack_locals = s; ++ return validize_mem (copy_rtx (s->rtl)); + } + +-/* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with +- the given OPERANDS. */ ++static void ++ix86_instantiate_decls (void) ++{ ++ struct stack_local_entry *s; + +-void +-ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode, +- rtx operands[]) +-{ +- rtx op1 = NULL_RTX, op2 = NULL_RTX; +- if (SUBREG_P (operands[1])) +- { +- op1 = operands[1]; +- op2 = operands[2]; +- } +- else if (SUBREG_P (operands[2])) +- { +- op1 = operands[2]; +- op2 = operands[1]; +- } +- /* Optimize (__m128i) d | (__m128i) e and similar code +- when d and e are float vectors into float vector logical +- insn. In C/C++ without using intrinsics there is no other way +- to express vector logical operation on float vectors than +- to cast them temporarily to integer vectors. */ +- if (op1 +- && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL +- && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR) +- && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT +- && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode) +- && SUBREG_BYTE (op1) == 0 +- && (GET_CODE (op2) == CONST_VECTOR +- || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2)) +- && SUBREG_BYTE (op2) == 0)) +- && can_create_pseudo_p ()) +- { +- rtx dst; +- switch (GET_MODE (SUBREG_REG (op1))) +- { +- case E_V4SFmode: +- case E_V8SFmode: +- case E_V16SFmode: +- case E_V2DFmode: +- case E_V4DFmode: +- case E_V8DFmode: +- dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1))); +- if (GET_CODE (op2) == CONST_VECTOR) +- { +- op2 = gen_lowpart (GET_MODE (dst), op2); +- op2 = force_reg (GET_MODE (dst), op2); +- } +- else +- { +- op1 = operands[1]; +- op2 = SUBREG_REG (operands[2]); +- if (!vector_operand (op2, GET_MODE (dst))) +- op2 = force_reg (GET_MODE (dst), op2); +- } +- op1 = SUBREG_REG (op1); +- if (!vector_operand (op1, GET_MODE (dst))) +- op1 = force_reg (GET_MODE (dst), op1); +- emit_insn (gen_rtx_SET (dst, +- gen_rtx_fmt_ee (code, GET_MODE (dst), +- op1, op2))); +- emit_move_insn (operands[0], gen_lowpart (mode, dst)); +- return; +- default: +- break; +- } +- } +- if (!vector_operand (operands[1], mode)) +- operands[1] = force_reg (mode, operands[1]); +- if (!vector_operand (operands[2], mode)) +- operands[2] = force_reg (mode, operands[2]); +- ix86_fixup_binary_operands_no_copy (code, mode, operands); +- emit_insn (gen_rtx_SET (operands[0], +- gen_rtx_fmt_ee (code, mode, operands[1], +- operands[2]))); ++ for (s = ix86_stack_locals; s; s = s->next) ++ if (s->rtl != NULL_RTX) ++ instantiate_decl_rtl (s->rtl); + } +- +-/* Return TRUE or FALSE depending on whether the binary operator meets the +- appropriate constraints. */ ++ ++/* Check whether x86 address PARTS is a pc-relative address. */ + + bool +-ix86_binary_operator_ok (enum rtx_code code, machine_mode mode, +- rtx operands[3]) ++ix86_rip_relative_addr_p (struct ix86_address *parts) + { +- rtx dst = operands[0]; +- rtx src1 = operands[1]; +- rtx src2 = operands[2]; +- +- /* Both source operands cannot be in memory. */ +- if (MEM_P (src1) && MEM_P (src2)) +- return false; +- +- /* Canonicalize operand order for commutative operators. */ +- if (ix86_swap_binary_operands_p (code, mode, operands)) +- std::swap (src1, src2); ++ rtx base, index, disp; + +- /* If the destination is memory, we must have a matching source operand. 
*/ +- if (MEM_P (dst) && !rtx_equal_p (dst, src1)) +- return false; ++ base = parts->base; ++ index = parts->index; ++ disp = parts->disp; + +- /* Source 1 cannot be a constant. */ +- if (CONSTANT_P (src1)) +- return false; ++ if (disp && !base && !index) ++ { ++ if (TARGET_64BIT) ++ { ++ rtx symbol = disp; + +- /* Source 1 cannot be a non-matching memory. */ +- if (MEM_P (src1) && !rtx_equal_p (dst, src1)) +- /* Support "andhi/andsi/anddi" as a zero-extending move. */ +- return (code == AND +- && (mode == HImode +- || mode == SImode +- || (TARGET_64BIT && mode == DImode)) +- && satisfies_constraint_L (src2)); ++ if (GET_CODE (disp) == CONST) ++ symbol = XEXP (disp, 0); ++ if (GET_CODE (symbol) == PLUS ++ && CONST_INT_P (XEXP (symbol, 1))) ++ symbol = XEXP (symbol, 0); + +- return true; ++ if (GET_CODE (symbol) == LABEL_REF ++ || (GET_CODE (symbol) == SYMBOL_REF ++ && SYMBOL_REF_TLS_MODEL (symbol) == 0) ++ || (GET_CODE (symbol) == UNSPEC ++ && (XINT (symbol, 1) == UNSPEC_GOTPCREL ++ || XINT (symbol, 1) == UNSPEC_PCREL ++ || XINT (symbol, 1) == UNSPEC_GOTNTPOFF))) ++ return true; ++ } ++ } ++ return false; + } + +-/* Attempt to expand a unary operator. Make the expansion closer to the +- actual machine, then just general_operand, which will allow 2 separate +- memory references (one output, one input) in a single insn. */ ++/* Calculate the length of the memory address in the instruction encoding. ++ Includes addr32 prefix, does not include the one-byte modrm, opcode, ++ or other prefixes. We never generate addr32 prefix for LEA insn. */ + +-void +-ix86_expand_unary_operator (enum rtx_code code, machine_mode mode, +- rtx operands[]) ++int ++memory_address_length (rtx addr, bool lea) + { +- bool matching_memory = false; +- rtx src, dst, op, clob; +- +- dst = operands[0]; +- src = operands[1]; ++ struct ix86_address parts; ++ rtx base, index, disp; ++ int len; ++ int ok; + +- /* If the destination is memory, and we do not have matching source +- operands, do things in registers. */ +- if (MEM_P (dst)) +- { +- if (rtx_equal_p (dst, src)) +- matching_memory = true; +- else +- dst = gen_reg_rtx (mode); +- } ++ if (GET_CODE (addr) == PRE_DEC ++ || GET_CODE (addr) == POST_INC ++ || GET_CODE (addr) == PRE_MODIFY ++ || GET_CODE (addr) == POST_MODIFY) ++ return 0; + +- /* When source operand is memory, destination must match. */ +- if (MEM_P (src) && !matching_memory) +- src = force_reg (mode, src); ++ ok = ix86_decompose_address (addr, &parts); ++ gcc_assert (ok); + +- /* Emit the instruction. */ ++ len = (parts.seg == ADDR_SPACE_GENERIC) ? 0 : 1; + +- op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src)); ++ /* If this is not LEA instruction, add the length of addr32 prefix. */ ++ if (TARGET_64BIT && !lea ++ && (SImode_address_operand (addr, VOIDmode) ++ || (parts.base && GET_MODE (parts.base) == SImode) ++ || (parts.index && GET_MODE (parts.index) == SImode))) ++ len++; + +- if (code == NOT) +- emit_insn (op); +- else +- { +- clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); +- emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob))); +- } ++ base = parts.base; ++ index = parts.index; ++ disp = parts.disp; + +- /* Fix up the destination if needed. */ +- if (dst != operands[0]) +- emit_move_insn (operands[0], dst); +-} ++ if (base && SUBREG_P (base)) ++ base = SUBREG_REG (base); ++ if (index && SUBREG_P (index)) ++ index = SUBREG_REG (index); + +-/* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and +- divisor are within the range [0-255]. 
*/ ++ gcc_assert (base == NULL_RTX || REG_P (base)); ++ gcc_assert (index == NULL_RTX || REG_P (index)); + +-void +-ix86_split_idivmod (machine_mode mode, rtx operands[], +- bool signed_p) +-{ +- rtx_code_label *end_label, *qimode_label; +- rtx div, mod; +- rtx_insn *insn; +- rtx scratch, tmp0, tmp1, tmp2; +- rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx); +- rtx (*gen_zero_extend) (rtx, rtx); +- rtx (*gen_test_ccno_1) (rtx, rtx); ++ /* Rule of thumb: ++ - esp as the base always wants an index, ++ - ebp as the base always wants a displacement, ++ - r12 as the base always wants an index, ++ - r13 as the base always wants a displacement. */ + +- switch (mode) ++ /* Register Indirect. */ ++ if (base && !index && !disp) + { +- case E_SImode: +- if (GET_MODE (operands[0]) == SImode) +- { +- if (GET_MODE (operands[1]) == SImode) +- gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1; +- else +- gen_divmod4_1 +- = signed_p ? gen_divmodsi4_zext_2 : gen_udivmodsi4_zext_2; +- gen_zero_extend = gen_zero_extendqisi2; +- } +- else +- { +- gen_divmod4_1 +- = signed_p ? gen_divmodsi4_zext_1 : gen_udivmodsi4_zext_1; +- gen_zero_extend = gen_zero_extendqidi2; +- } +- gen_test_ccno_1 = gen_testsi_ccno_1; +- break; +- case E_DImode: +- gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1; +- gen_test_ccno_1 = gen_testdi_ccno_1; +- gen_zero_extend = gen_zero_extendqidi2; +- break; +- default: +- gcc_unreachable (); ++ /* esp (for its index) and ebp (for its displacement) need ++ the two-byte modrm form. Similarly for r12 and r13 in 64-bit ++ code. */ ++ if (base == arg_pointer_rtx ++ || base == frame_pointer_rtx ++ || REGNO (base) == SP_REG ++ || REGNO (base) == BP_REG ++ || REGNO (base) == R12_REG ++ || REGNO (base) == R13_REG) ++ len++; + } + +- end_label = gen_label_rtx (); +- qimode_label = gen_label_rtx (); +- +- scratch = gen_reg_rtx (mode); +- +- /* Use 8bit unsigned divimod if dividend and divisor are within +- the range [0-255]. */ +- emit_move_insn (scratch, operands[2]); +- scratch = expand_simple_binop (mode, IOR, scratch, operands[3], +- scratch, 1, OPTAB_DIRECT); +- emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100))); +- tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG); +- tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx); +- tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0, +- gen_rtx_LABEL_REF (VOIDmode, qimode_label), +- pc_rtx); +- insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0)); +- predict_jump (REG_BR_PROB_BASE * 50 / 100); +- JUMP_LABEL (insn) = qimode_label; +- +- /* Generate original signed/unsigned divimod. */ +- div = gen_divmod4_1 (operands[0], operands[1], +- operands[2], operands[3]); +- emit_insn (div); +- +- /* Branch to the end. */ +- emit_jump_insn (gen_jump (end_label)); +- emit_barrier (); +- +- /* Generate 8bit unsigned divide. */ +- emit_label (qimode_label); +- /* Don't use operands[0] for result of 8bit divide since not all +- registers support QImode ZERO_EXTRACT. */ +- tmp0 = lowpart_subreg (HImode, scratch, mode); +- tmp1 = lowpart_subreg (HImode, operands[2], mode); +- tmp2 = lowpart_subreg (QImode, operands[3], mode); +- emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2)); +- +- if (signed_p) ++ /* Direct Addressing. In 64-bit mode mod 00 r/m 5 ++ is not disp32, but disp32(%rip), so for disp32 ++ SIB byte is needed, unless print_operand_address ++ optimizes it into disp32(%rip) or (%rip) is implied ++ by UNSPEC. 
*/ ++ else if (disp && !base && !index) + { +- div = gen_rtx_DIV (mode, operands[2], operands[3]); +- mod = gen_rtx_MOD (mode, operands[2], operands[3]); ++ len += 4; ++ if (!ix86_rip_relative_addr_p (&parts)) ++ len++; + } + else + { +- div = gen_rtx_UDIV (mode, operands[2], operands[3]); +- mod = gen_rtx_UMOD (mode, operands[2], operands[3]); +- } +- if (mode == SImode) +- { +- if (GET_MODE (operands[0]) != SImode) +- div = gen_rtx_ZERO_EXTEND (DImode, div); +- if (GET_MODE (operands[1]) != SImode) +- mod = gen_rtx_ZERO_EXTEND (DImode, mod); +- } ++ /* Find the length of the displacement constant. */ ++ if (disp) ++ { ++ if (base && satisfies_constraint_K (disp)) ++ len += 1; ++ else ++ len += 4; ++ } ++ /* ebp always wants a displacement. Similarly r13. */ ++ else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG)) ++ len++; + +- /* Extract remainder from AH. */ +- tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]), +- tmp0, GEN_INT (8), GEN_INT (8)); +- if (REG_P (operands[1])) +- insn = emit_move_insn (operands[1], tmp1); +- else +- { +- /* Need a new scratch register since the old one has result +- of 8bit divide. */ +- scratch = gen_reg_rtx (GET_MODE (operands[1])); +- emit_move_insn (scratch, tmp1); +- insn = emit_move_insn (operands[1], scratch); ++ /* An index requires the two-byte modrm form.... */ ++ if (index ++ /* ...like esp (or r12), which always wants an index. */ ++ || base == arg_pointer_rtx ++ || base == frame_pointer_rtx ++ || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG))) ++ len++; + } +- set_unique_reg_note (insn, REG_EQUAL, mod); + +- /* Zero extend quotient from AL. */ +- tmp1 = gen_lowpart (QImode, tmp0); +- insn = emit_insn (gen_zero_extend (operands[0], tmp1)); +- set_unique_reg_note (insn, REG_EQUAL, div); +- +- emit_label (end_label); ++ return len; + } + +-#define LEA_MAX_STALL (3) +-#define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1) ++/* Compute default value for "length_immediate" attribute. When SHORTFORM ++ is set, expect that insn have 8bit immediate alternative. */ ++int ++ix86_attr_length_immediate_default (rtx_insn *insn, bool shortform) ++{ ++ int len = 0; ++ int i; ++ extract_insn_cached (insn); ++ for (i = recog_data.n_operands - 1; i >= 0; --i) ++ if (CONSTANT_P (recog_data.operand[i])) ++ { ++ enum attr_mode mode = get_attr_mode (insn); + +-/* Increase given DISTANCE in half-cycles according to +- dependencies between PREV and NEXT instructions. +- Add 1 half-cycle if there is no dependency and +- go to next cycle if there is some dependecy. */ ++ gcc_assert (!len); ++ if (shortform && CONST_INT_P (recog_data.operand[i])) ++ { ++ HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]); ++ switch (mode) ++ { ++ case MODE_QI: ++ len = 1; ++ continue; ++ case MODE_HI: ++ ival = trunc_int_for_mode (ival, HImode); ++ break; ++ case MODE_SI: ++ ival = trunc_int_for_mode (ival, SImode); ++ break; ++ default: ++ break; ++ } ++ if (IN_RANGE (ival, -128, 127)) ++ { ++ len = 1; ++ continue; ++ } ++ } ++ switch (mode) ++ { ++ case MODE_QI: ++ len = 1; ++ break; ++ case MODE_HI: ++ len = 2; ++ break; ++ case MODE_SI: ++ len = 4; ++ break; ++ /* Immediates for DImode instructions are encoded ++ as 32bit sign extended values. */ ++ case MODE_DI: ++ len = 4; ++ break; ++ default: ++ fatal_insn ("unknown insn mode", insn); ++ } ++ } ++ return len; ++} + +-static unsigned int +-increase_distance (rtx_insn *prev, rtx_insn *next, unsigned int distance) ++/* Compute default value for "length_address" attribute. 
*/ ++int ++ix86_attr_length_address_default (rtx_insn *insn) + { +- df_ref def, use; ++ int i; + +- if (!prev || !next) +- return distance + (distance & 1) + 2; ++ if (get_attr_type (insn) == TYPE_LEA) ++ { ++ rtx set = PATTERN (insn), addr; + +- if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev)) +- return distance + 1; ++ if (GET_CODE (set) == PARALLEL) ++ set = XVECEXP (set, 0, 0); + +- FOR_EACH_INSN_USE (use, next) +- FOR_EACH_INSN_DEF (def, prev) +- if (!DF_REF_IS_ARTIFICIAL (def) +- && DF_REF_REGNO (use) == DF_REF_REGNO (def)) +- return distance + (distance & 1) + 2; ++ gcc_assert (GET_CODE (set) == SET); + +- return distance + 1; +-} ++ addr = SET_SRC (set); + +-/* Function checks if instruction INSN defines register number +- REGNO1 or REGNO2. */ ++ return memory_address_length (addr, true); ++ } + +-static bool +-insn_defines_reg (unsigned int regno1, unsigned int regno2, +- rtx_insn *insn) +-{ +- df_ref def; ++ extract_insn_cached (insn); ++ for (i = recog_data.n_operands - 1; i >= 0; --i) ++ { ++ rtx op = recog_data.operand[i]; ++ if (MEM_P (op)) ++ { ++ constrain_operands_cached (insn, reload_completed); ++ if (which_alternative != -1) ++ { ++ const char *constraints = recog_data.constraints[i]; ++ int alt = which_alternative; + +- FOR_EACH_INSN_DEF (def, insn) +- if (DF_REF_REG_DEF_P (def) +- && !DF_REF_IS_ARTIFICIAL (def) +- && (regno1 == DF_REF_REGNO (def) +- || regno2 == DF_REF_REGNO (def))) +- return true; ++ while (*constraints == '=' || *constraints == '+') ++ constraints++; ++ while (alt-- > 0) ++ while (*constraints++ != ',') ++ ; ++ /* Skip ignored operands. */ ++ if (*constraints == 'X') ++ continue; ++ } + +- return false; ++ int len = memory_address_length (XEXP (op, 0), false); ++ ++ /* Account for segment prefix for non-default addr spaces. */ ++ if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (op))) ++ len++; ++ ++ return len; ++ } ++ } ++ return 0; + } + +-/* Function checks if instruction INSN uses register number +- REGNO as a part of address expression. */ ++/* Compute default value for "length_vex" attribute. It includes ++ 2 or 3 byte VEX prefix and 1 opcode byte. */ + +-static bool +-insn_uses_reg_mem (unsigned int regno, rtx insn) ++int ++ix86_attr_length_vex_default (rtx_insn *insn, bool has_0f_opcode, ++ bool has_vex_w) + { +- df_ref use; ++ int i; + +- FOR_EACH_INSN_USE (use, insn) +- if (DF_REF_REG_MEM_P (use) && regno == DF_REF_REGNO (use)) +- return true; ++ /* Only 0f opcode can use 2 byte VEX prefix and VEX W bit uses 3 ++ byte VEX prefix. */ ++ if (!has_0f_opcode || has_vex_w) ++ return 3 + 1; + +- return false; +-} ++ /* We can always use 2 byte VEX prefix in 32bit. */ ++ if (!TARGET_64BIT) ++ return 2 + 1; + +-/* Search backward for non-agu definition of register number REGNO1 +- or register number REGNO2 in basic block starting from instruction +- START up to head of basic block or instruction INSN. ++ extract_insn_cached (insn); + +- Function puts true value into *FOUND var if definition was found +- and false otherwise. ++ for (i = recog_data.n_operands - 1; i >= 0; --i) ++ if (REG_P (recog_data.operand[i])) ++ { ++ /* REX.W bit uses 3 byte VEX prefix. */ ++ if (GET_MODE (recog_data.operand[i]) == DImode ++ && GENERAL_REG_P (recog_data.operand[i])) ++ return 3 + 1; ++ } ++ else ++ { ++ /* REX.X or REX.B bits use 3 byte VEX prefix. 
*/ ++ if (MEM_P (recog_data.operand[i]) ++ && x86_extended_reg_mentioned_p (recog_data.operand[i])) ++ return 3 + 1; ++ } + +- Distance in half-cycles between START and found instruction or head +- of BB is added to DISTANCE and returned. */ ++ return 2 + 1; ++} ++ + +-static int +-distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2, +- rtx_insn *insn, int distance, +- rtx_insn *start, bool *found) +-{ +- basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL; +- rtx_insn *prev = start; +- rtx_insn *next = NULL; ++static bool ++ix86_class_likely_spilled_p (reg_class_t); + +- *found = false; ++/* Returns true if lhs of insn is HW function argument register and set up ++ is_spilled to true if it is likely spilled HW register. */ ++static bool ++insn_is_function_arg (rtx insn, bool* is_spilled) ++{ ++ rtx dst; + +- while (prev +- && prev != insn +- && distance < LEA_SEARCH_THRESHOLD) ++ if (!NONDEBUG_INSN_P (insn)) ++ return false; ++ /* Call instructions are not movable, ignore it. */ ++ if (CALL_P (insn)) ++ return false; ++ insn = PATTERN (insn); ++ if (GET_CODE (insn) == PARALLEL) ++ insn = XVECEXP (insn, 0, 0); ++ if (GET_CODE (insn) != SET) ++ return false; ++ dst = SET_DEST (insn); ++ if (REG_P (dst) && HARD_REGISTER_P (dst) ++ && ix86_function_arg_regno_p (REGNO (dst))) + { +- if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev)) +- { +- distance = increase_distance (prev, next, distance); +- if (insn_defines_reg (regno1, regno2, prev)) +- { +- if (recog_memoized (prev) < 0 +- || get_attr_type (prev) != TYPE_LEA) +- { +- *found = true; +- return distance; +- } +- } +- +- next = prev; +- } +- if (prev == BB_HEAD (bb)) +- break; +- +- prev = PREV_INSN (prev); ++ /* Is it likely spilled HW register? */ ++ if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst)) ++ && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst)))) ++ *is_spilled = true; ++ return true; + } +- +- return distance; ++ return false; + } + +-/* Search backward for non-agu definition of register number REGNO1 +- or register number REGNO2 in INSN's basic block until +- 1. Pass LEA_SEARCH_THRESHOLD instructions, or +- 2. Reach neighbor BBs boundary, or +- 3. Reach agu definition. +- Returns the distance between the non-agu definition point and INSN. +- If no definition point, returns -1. */ +- +-static int +-distance_non_agu_define (unsigned int regno1, unsigned int regno2, +- rtx_insn *insn) ++/* Add output dependencies for chain of function adjacent arguments if only ++ there is a move to likely spilled HW register. Return first argument ++ if at least one dependence was added or NULL otherwise. */ ++static rtx_insn * ++add_parameter_dependencies (rtx_insn *call, rtx_insn *head) + { +- basic_block bb = BLOCK_FOR_INSN (insn); +- int distance = 0; +- bool found = false; ++ rtx_insn *insn; ++ rtx_insn *last = call; ++ rtx_insn *first_arg = NULL; ++ bool is_spilled = false; + +- if (insn != BB_HEAD (bb)) +- distance = distance_non_agu_define_in_bb (regno1, regno2, insn, +- distance, PREV_INSN (insn), +- &found); ++ head = PREV_INSN (head); + +- if (!found && distance < LEA_SEARCH_THRESHOLD) ++ /* Find nearest to call argument passing instruction. 
*/ ++ while (true) + { +- edge e; +- edge_iterator ei; +- bool simple_loop = false; +- +- FOR_EACH_EDGE (e, ei, bb->preds) +- if (e->src == bb) +- { +- simple_loop = true; +- break; +- } ++ last = PREV_INSN (last); ++ if (last == head) ++ return NULL; ++ if (!NONDEBUG_INSN_P (last)) ++ continue; ++ if (insn_is_function_arg (last, &is_spilled)) ++ break; ++ return NULL; ++ } + +- if (simple_loop) +- distance = distance_non_agu_define_in_bb (regno1, regno2, +- insn, distance, +- BB_END (bb), &found); +- else ++ first_arg = last; ++ while (true) ++ { ++ insn = PREV_INSN (last); ++ if (!INSN_P (insn)) ++ break; ++ if (insn == head) ++ break; ++ if (!NONDEBUG_INSN_P (insn)) + { +- int shortest_dist = -1; +- bool found_in_bb = false; +- +- FOR_EACH_EDGE (e, ei, bb->preds) +- { +- int bb_dist +- = distance_non_agu_define_in_bb (regno1, regno2, +- insn, distance, +- BB_END (e->src), +- &found_in_bb); +- if (found_in_bb) +- { +- if (shortest_dist < 0) +- shortest_dist = bb_dist; +- else if (bb_dist > 0) +- shortest_dist = MIN (bb_dist, shortest_dist); +- +- found = true; +- } +- } +- +- distance = shortest_dist; ++ last = insn; ++ continue; ++ } ++ if (insn_is_function_arg (insn, &is_spilled)) ++ { ++ /* Add output depdendence between two function arguments if chain ++ of output arguments contains likely spilled HW registers. */ ++ if (is_spilled) ++ add_dependence (first_arg, insn, REG_DEP_OUTPUT); ++ first_arg = last = insn; + } ++ else ++ break; + } +- +- /* get_attr_type may modify recog data. We want to make sure +- that recog data is valid for instruction INSN, on which +- distance_non_agu_define is called. INSN is unchanged here. */ +- extract_insn_cached (insn); +- +- if (!found) +- return -1; +- +- return distance >> 1; ++ if (!is_spilled) ++ return NULL; ++ return first_arg; + } + +-/* Return the distance in half-cycles between INSN and the next +- insn that uses register number REGNO in memory address added +- to DISTANCE. Return -1 if REGNO0 is set. +- +- Put true value into *FOUND if register usage was found and +- false otherwise. +- Put true value into *REDEFINED if register redefinition was +- found and false otherwise. */ +- +-static int +-distance_agu_use_in_bb (unsigned int regno, +- rtx_insn *insn, int distance, rtx_insn *start, +- bool *found, bool *redefined) ++/* Add output or anti dependency from insn to first_arg to restrict its code ++ motion. */ ++static void ++avoid_func_arg_motion (rtx_insn *first_arg, rtx_insn *insn) + { +- basic_block bb = NULL; +- rtx_insn *next = start; +- rtx_insn *prev = NULL; +- +- *found = false; +- *redefined = false; ++ rtx set; ++ rtx tmp; + +- if (start != NULL_RTX) ++ set = single_set (insn); ++ if (!set) ++ return; ++ tmp = SET_DEST (set); ++ if (REG_P (tmp)) + { +- bb = BLOCK_FOR_INSN (start); +- if (start != BB_HEAD (bb)) +- /* If insn and start belong to the same bb, set prev to insn, +- so the call to increase_distance will increase the distance +- between insns by 1. */ +- prev = insn; ++ /* Add output dependency to the first function argument. */ ++ add_dependence (first_arg, insn, REG_DEP_OUTPUT); ++ return; + } ++ /* Add anti dependency. */ ++ add_dependence (first_arg, insn, REG_DEP_ANTI); ++} + +- while (next +- && next != insn +- && distance < LEA_SEARCH_THRESHOLD) ++/* Avoid cross block motion of function argument through adding dependency ++ from the first non-jump instruction in bb. 
*/ ++static void ++add_dependee_for_func_arg (rtx_insn *arg, basic_block bb) ++{ ++ rtx_insn *insn = BB_END (bb); ++ ++ while (insn) + { +- if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next)) ++ if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn)) + { +- distance = increase_distance(prev, next, distance); +- if (insn_uses_reg_mem (regno, next)) +- { +- /* Return DISTANCE if OP0 is used in memory +- address in NEXT. */ +- *found = true; +- return distance; +- } +- +- if (insn_defines_reg (regno, INVALID_REGNUM, next)) ++ rtx set = single_set (insn); ++ if (set) + { +- /* Return -1 if OP0 is set in NEXT. */ +- *redefined = true; +- return -1; ++ avoid_func_arg_motion (arg, insn); ++ return; + } +- +- prev = next; + } +- +- if (next == BB_END (bb)) +- break; +- +- next = NEXT_INSN (next); ++ if (insn == BB_HEAD (bb)) ++ return; ++ insn = PREV_INSN (insn); + } +- +- return distance; + } + +-/* Return the distance between INSN and the next insn that uses +- register number REGNO0 in memory address. Return -1 if no such +- a use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */ ++/* Hook for pre-reload schedule - avoid motion of function arguments ++ passed in likely spilled HW registers. */ ++static void ++ix86_dependencies_evaluation_hook (rtx_insn *head, rtx_insn *tail) ++{ ++ rtx_insn *insn; ++ rtx_insn *first_arg = NULL; ++ if (reload_completed) ++ return; ++ while (head != tail && DEBUG_INSN_P (head)) ++ head = NEXT_INSN (head); ++ for (insn = tail; insn != head; insn = PREV_INSN (insn)) ++ if (INSN_P (insn) && CALL_P (insn)) ++ { ++ first_arg = add_parameter_dependencies (insn, head); ++ if (first_arg) ++ { ++ /* Add dependee for first argument to predecessors if only ++ region contains more than one block. */ ++ basic_block bb = BLOCK_FOR_INSN (insn); ++ int rgn = CONTAINING_RGN (bb->index); ++ int nr_blks = RGN_NR_BLOCKS (rgn); ++ /* Skip trivial regions and region head blocks that can have ++ predecessors outside of region. */ ++ if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0) ++ { ++ edge e; ++ edge_iterator ei; ++ ++ /* Regions are SCCs with the exception of selective ++ scheduling with pipelining of outer blocks enabled. ++ So also check that immediate predecessors of a non-head ++ block are in the same region. */ ++ FOR_EACH_EDGE (e, ei, bb->preds) ++ { ++ /* Avoid creating of loop-carried dependencies through ++ using topological ordering in the region. */ ++ if (rgn == CONTAINING_RGN (e->src->index) ++ && BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index)) ++ add_dependee_for_func_arg (first_arg, e->src); ++ } ++ } ++ insn = first_arg; ++ if (insn == head) ++ break; ++ } ++ } ++ else if (first_arg) ++ avoid_func_arg_motion (first_arg, insn); ++} + ++/* Hook for pre-reload schedule - set priority of moves from likely spilled ++ HW registers to maximum, to schedule them at soon as possible. These are ++ moves from function argument registers at the top of the function entry ++ and moves from function return value registers after call. 
*/ + static int +-distance_agu_use (unsigned int regno0, rtx_insn *insn) ++ix86_adjust_priority (rtx_insn *insn, int priority) + { +- basic_block bb = BLOCK_FOR_INSN (insn); +- int distance = 0; +- bool found = false; +- bool redefined = false; ++ rtx set; + +- if (insn != BB_END (bb)) +- distance = distance_agu_use_in_bb (regno0, insn, distance, +- NEXT_INSN (insn), +- &found, &redefined); ++ if (reload_completed) ++ return priority; + +- if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD) ++ if (!NONDEBUG_INSN_P (insn)) ++ return priority; ++ ++ set = single_set (insn); ++ if (set) + { +- edge e; +- edge_iterator ei; +- bool simple_loop = false; ++ rtx tmp = SET_SRC (set); ++ if (REG_P (tmp) ++ && HARD_REGISTER_P (tmp) ++ && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp)) ++ && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp)))) ++ return current_sched_info->sched_max_insns_priority; ++ } + +- FOR_EACH_EDGE (e, ei, bb->succs) +- if (e->dest == bb) +- { +- simple_loop = true; +- break; +- } ++ return priority; ++} + +- if (simple_loop) +- distance = distance_agu_use_in_bb (regno0, insn, +- distance, BB_HEAD (bb), +- &found, &redefined); +- else ++/* Prepare for scheduling pass. */ ++static void ++ix86_sched_init_global (FILE *, int, int) ++{ ++ /* Install scheduling hooks for current CPU. Some of these hooks are used ++ in time-critical parts of the scheduler, so we only set them up when ++ they are actually used. */ ++ switch (ix86_tune) ++ { ++ case PROCESSOR_CORE2: ++ case PROCESSOR_NEHALEM: ++ case PROCESSOR_SANDYBRIDGE: ++ case PROCESSOR_HASWELL: ++ case PROCESSOR_GENERIC: ++ /* Do not perform multipass scheduling for pre-reload schedule ++ to save compile time. */ ++ if (reload_completed) + { +- int shortest_dist = -1; +- bool found_in_bb = false; +- bool redefined_in_bb = false; +- +- FOR_EACH_EDGE (e, ei, bb->succs) +- { +- int bb_dist +- = distance_agu_use_in_bb (regno0, insn, +- distance, BB_HEAD (e->dest), +- &found_in_bb, &redefined_in_bb); +- if (found_in_bb) +- { +- if (shortest_dist < 0) +- shortest_dist = bb_dist; +- else if (bb_dist > 0) +- shortest_dist = MIN (bb_dist, shortest_dist); +- +- found = true; +- } +- } +- +- distance = shortest_dist; ++ ix86_core2i7_init_hooks (); ++ break; + } ++ /* Fall through. */ ++ default: ++ targetm.sched.dfa_post_advance_cycle = NULL; ++ targetm.sched.first_cycle_multipass_init = NULL; ++ targetm.sched.first_cycle_multipass_begin = NULL; ++ targetm.sched.first_cycle_multipass_issue = NULL; ++ targetm.sched.first_cycle_multipass_backtrack = NULL; ++ targetm.sched.first_cycle_multipass_end = NULL; ++ targetm.sched.first_cycle_multipass_fini = NULL; ++ break; + } +- +- if (!found || redefined) +- return -1; +- +- return distance >> 1; + } + +-/* Define this macro to tune LEA priority vs ADD, it take effect when +- there is a dilemma of choicing LEA or ADD +- Negative value: ADD is more preferred than LEA +- Zero: Netrual +- Positive value: LEA is more preferred than ADD*/ +-#define IX86_LEA_PRIORITY 0 +- +-/* Return true if usage of lea INSN has performance advantage +- over a sequence of instructions. Instructions sequence has +- SPLIT_COST cycles higher latency than lea latency. */ ++ ++/* Implement TARGET_STATIC_RTX_ALIGNMENT. 
*/ + +-static bool +-ix86_lea_outperforms (rtx_insn *insn, unsigned int regno0, unsigned int regno1, +- unsigned int regno2, int split_cost, bool has_scale) ++static HOST_WIDE_INT ++ix86_static_rtx_alignment (machine_mode mode) + { +- int dist_define, dist_use; +- +- /* For Silvermont if using a 2-source or 3-source LEA for +- non-destructive destination purposes, or due to wanting +- ability to use SCALE, the use of LEA is justified. */ +- if (TARGET_SILVERMONT || TARGET_GOLDMONT || TARGET_GOLDMONT_PLUS +- || TARGET_TREMONT || TARGET_INTEL) +- { +- if (has_scale) +- return true; +- if (split_cost < 1) +- return false; +- if (regno0 == regno1 || regno0 == regno2) +- return false; +- return true; +- } ++ if (mode == DFmode) ++ return 64; ++ if (ALIGN_MODE_128 (mode)) ++ return MAX (128, GET_MODE_ALIGNMENT (mode)); ++ return GET_MODE_ALIGNMENT (mode); ++} + +- dist_define = distance_non_agu_define (regno1, regno2, insn); +- dist_use = distance_agu_use (regno0, insn); ++/* Implement TARGET_CONSTANT_ALIGNMENT. */ + +- if (dist_define < 0 || dist_define >= LEA_MAX_STALL) ++static HOST_WIDE_INT ++ix86_constant_alignment (const_tree exp, HOST_WIDE_INT align) ++{ ++ if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST ++ || TREE_CODE (exp) == INTEGER_CST) + { +- /* If there is no non AGU operand definition, no AGU +- operand usage and split cost is 0 then both lea +- and non lea variants have same priority. Currently +- we prefer lea for 64 bit code and non lea on 32 bit +- code. */ +- if (dist_use < 0 && split_cost == 0) +- return TARGET_64BIT || IX86_LEA_PRIORITY; +- else +- return true; ++ machine_mode mode = TYPE_MODE (TREE_TYPE (exp)); ++ HOST_WIDE_INT mode_align = ix86_static_rtx_alignment (mode); ++ return MAX (mode_align, align); + } ++ else if (!optimize_size && TREE_CODE (exp) == STRING_CST ++ && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD) ++ return BITS_PER_WORD; + +- /* With longer definitions distance lea is more preferable. +- Here we change it to take into account splitting cost and +- lea priority. */ +- dist_define += split_cost + IX86_LEA_PRIORITY; ++ return align; ++} + +- /* If there is no use in memory addess then we just check +- that split cost exceeds AGU stall. */ +- if (dist_use < 0) +- return dist_define > LEA_MAX_STALL; ++/* Implement TARGET_EMPTY_RECORD_P. */ + +- /* If this insn has both backward non-agu dependence and forward +- agu dependence, the one with short distance takes effect. */ +- return dist_define >= dist_use; ++static bool ++ix86_is_empty_record (const_tree type) ++{ ++ if (!TARGET_64BIT) ++ return false; ++ return default_is_empty_record (type); + } + +-/* Return true if it is legal to clobber flags by INSN and +- false otherwise. */ ++/* Implement TARGET_WARN_PARAMETER_PASSING_ABI. 
*/ + +-static bool +-ix86_ok_to_clobber_flags (rtx_insn *insn) ++static void ++ix86_warn_parameter_passing_abi (cumulative_args_t cum_v, tree type) + { +- basic_block bb = BLOCK_FOR_INSN (insn); +- df_ref use; +- bitmap live; ++ CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); + +- while (insn) +- { +- if (NONDEBUG_INSN_P (insn)) +- { +- FOR_EACH_INSN_USE (use, insn) +- if (DF_REF_REG_USE_P (use) && DF_REF_REGNO (use) == FLAGS_REG) +- return false; ++ if (!cum->warn_empty) ++ return; + +- if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn)) +- return true; +- } +- +- if (insn == BB_END (bb)) +- break; +- +- insn = NEXT_INSN (insn); +- } +- +- live = df_get_live_out(bb); +- return !REGNO_REG_SET_P (live, FLAGS_REG); +-} +- +-/* Return true if we need to split op0 = op1 + op2 into a sequence of +- move and add to avoid AGU stalls. */ ++ if (!TYPE_EMPTY_P (type)) ++ return; + +-bool +-ix86_avoid_lea_for_add (rtx_insn *insn, rtx operands[]) +-{ +- unsigned int regno0, regno1, regno2; ++ /* Don't warn if the function isn't visible outside of the TU. */ ++ if (cum->decl && !TREE_PUBLIC (cum->decl)) ++ return; + +- /* Check if we need to optimize. */ +- if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun)) +- return false; ++ const_tree ctx = get_ultimate_context (cum->decl); ++ if (ctx != NULL_TREE ++ && !TRANSLATION_UNIT_WARN_EMPTY_P (ctx)) ++ return; + +- /* Check it is correct to split here. */ +- if (!ix86_ok_to_clobber_flags(insn)) +- return false; ++ /* If the actual size of the type is zero, then there is no change ++ in how objects of this size are passed. */ ++ if (int_size_in_bytes (type) == 0) ++ return; + +- regno0 = true_regnum (operands[0]); +- regno1 = true_regnum (operands[1]); +- regno2 = true_regnum (operands[2]); ++ warning (OPT_Wabi, "empty class %qT parameter passing ABI " ++ "changes in %<-fabi-version=12%> (GCC 8)", type); + +- /* We need to split only adds with non destructive +- destination operand. */ +- if (regno0 == regno1 || regno0 == regno2) +- return false; +- else +- return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false); ++ /* Only warn once. */ ++ cum->warn_empty = false; + } + +-/* Return true if we should emit lea instruction instead of mov +- instruction. */ ++/* This hook returns name of multilib ABI. */ + +-bool +-ix86_use_lea_for_mov (rtx_insn *insn, rtx operands[]) ++static const char * ++ix86_get_multilib_abi_name (void) + { +- unsigned int regno0, regno1; +- +- /* Check if we need to optimize. */ +- if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun)) +- return false; +- +- /* Use lea for reg to reg moves only. */ +- if (!REG_P (operands[0]) || !REG_P (operands[1])) +- return false; +- +- regno0 = true_regnum (operands[0]); +- regno1 = true_regnum (operands[1]); +- +- return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false); ++ if (!(TARGET_64BIT_P (ix86_isa_flags))) ++ return "i386"; ++ else if (TARGET_X32_P (ix86_isa_flags)) ++ return "x32"; ++ else ++ return "x86_64"; + } + +-/* Return true if we need to split lea into a sequence of +- instructions to avoid AGU stalls. */ ++/* Compute the alignment for a variable for Intel MCU psABI. TYPE is ++ the data type, and ALIGN is the alignment that the object would ++ ordinarily have. */ + +-bool +-ix86_avoid_lea_for_addr (rtx_insn *insn, rtx operands[]) ++static int ++iamcu_alignment (tree type, int align) + { +- unsigned int regno0, regno1, regno2; +- int split_cost; +- struct ix86_address parts; +- int ok; +- +- /* Check we need to optimize. 
*/ +- if (!TARGET_AVOID_LEA_FOR_ADDR || optimize_function_for_size_p (cfun)) +- return false; +- +- /* The "at least two components" test below might not catch simple +- move or zero extension insns if parts.base is non-NULL and parts.disp +- is const0_rtx as the only components in the address, e.g. if the +- register is %rbp or %r13. As this test is much cheaper and moves or +- zero extensions are the common case, do this check first. */ +- if (REG_P (operands[1]) +- || (SImode_address_operand (operands[1], VOIDmode) +- && REG_P (XEXP (operands[1], 0)))) +- return false; +- +- /* Check if it is OK to split here. */ +- if (!ix86_ok_to_clobber_flags (insn)) +- return false; +- +- ok = ix86_decompose_address (operands[1], &parts); +- gcc_assert (ok); +- +- /* There should be at least two components in the address. */ +- if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX) +- + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2) +- return false; +- +- /* We should not split into add if non legitimate pic +- operand is used as displacement. */ +- if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp)) +- return false; +- +- regno0 = true_regnum (operands[0]) ; +- regno1 = INVALID_REGNUM; +- regno2 = INVALID_REGNUM; +- +- if (parts.base) +- regno1 = true_regnum (parts.base); +- if (parts.index) +- regno2 = true_regnum (parts.index); ++ machine_mode mode; + +- split_cost = 0; ++ if (align < 32 || TYPE_USER_ALIGN (type)) ++ return align; + +- /* Compute how many cycles we will add to execution time +- if split lea into a sequence of instructions. */ +- if (parts.base || parts.index) ++ /* Intel MCU psABI specifies scalar types > 4 bytes aligned to 4 ++ bytes. */ ++ mode = TYPE_MODE (strip_array_types (type)); ++ switch (GET_MODE_CLASS (mode)) + { +- /* Have to use mov instruction if non desctructive +- destination form is used. */ +- if (regno1 != regno0 && regno2 != regno0) +- split_cost += 1; +- +- /* Have to add index to base if both exist. */ +- if (parts.base && parts.index) +- split_cost += 1; +- +- /* Have to use shift and adds if scale is 2 or greater. */ +- if (parts.scale > 1) +- { +- if (regno0 != regno1) +- split_cost += 1; +- else if (regno2 == regno0) +- split_cost += 4; +- else +- split_cost += parts.scale; +- } +- +- /* Have to use add instruction with immediate if +- disp is non zero. */ +- if (parts.disp && parts.disp != const0_rtx) +- split_cost += 1; +- +- /* Subtract the price of lea. */ +- split_cost -= 1; ++ case MODE_INT: ++ case MODE_COMPLEX_INT: ++ case MODE_COMPLEX_FLOAT: ++ case MODE_FLOAT: ++ case MODE_DECIMAL_FLOAT: ++ return 32; ++ default: ++ return align; + } +- +- return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost, +- parts.scale > 1); + } + +-/* Emit x86 binary operand CODE in mode MODE, where the first operand +- matches destination. RTX includes clobber of FLAGS_REG. */ ++/* Compute the alignment for a static variable. ++ TYPE is the data type, and ALIGN is the alignment that ++ the object would ordinarily have. The value of this function is used ++ instead of that alignment to align the object. */ + +-static void +-ix86_emit_binop (enum rtx_code code, machine_mode mode, +- rtx dst, rtx src) ++int ++ix86_data_alignment (tree type, unsigned int align, bool opt) + { +- rtx op, clob; ++ /* GCC 4.8 and earlier used to incorrectly assume this alignment even ++ for symbols from other compilation units or symbols that don't need ++ to bind locally. 
In order to preserve some ABI compatibility with ++ those compilers, ensure we don't decrease alignment from what we ++ used to assume. */ + +- op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src)); +- clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); +- +- emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob))); +-} ++ unsigned int max_align_compat = MIN (256, MAX_OFILE_ALIGNMENT); + +-/* Return true if regno1 def is nearest to the insn. */ ++ /* A data structure, equal or greater than the size of a cache line ++ (64 bytes in the Pentium 4 and other recent Intel processors, including ++ processors based on Intel Core microarchitecture) should be aligned ++ so that its base address is a multiple of a cache line size. */ + +-static bool +-find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2) +-{ +- rtx_insn *prev = insn; +- rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn)); ++ unsigned int max_align ++ = MIN ((unsigned) ix86_tune_cost->prefetch_block * 8, MAX_OFILE_ALIGNMENT); + +- if (insn == start) +- return false; +- while (prev && prev != start) ++ if (max_align < BITS_PER_WORD) ++ max_align = BITS_PER_WORD; ++ ++ switch (ix86_align_data_type) + { +- if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev)) +- { +- prev = PREV_INSN (prev); +- continue; +- } +- if (insn_defines_reg (regno1, INVALID_REGNUM, prev)) +- return true; +- else if (insn_defines_reg (regno2, INVALID_REGNUM, prev)) +- return false; +- prev = PREV_INSN (prev); ++ case ix86_align_data_type_abi: opt = false; break; ++ case ix86_align_data_type_compat: max_align = BITS_PER_WORD; break; ++ case ix86_align_data_type_cacheline: break; + } + +- /* None of the regs is defined in the bb. */ +- return false; +-} +- +-/* Split lea instructions into a sequence of instructions +- which are executed on ALU to avoid AGU stalls. +- It is assumed that it is allowed to clobber flags register +- at lea position. */ +- +-void +-ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode) +-{ +- unsigned int regno0, regno1, regno2; +- struct ix86_address parts; +- rtx target, tmp; +- int ok, adds; +- +- ok = ix86_decompose_address (operands[1], &parts); +- gcc_assert (ok); +- +- target = gen_lowpart (mode, operands[0]); +- +- regno0 = true_regnum (target); +- regno1 = INVALID_REGNUM; +- regno2 = INVALID_REGNUM; ++ if (TARGET_IAMCU) ++ align = iamcu_alignment (type, align); + +- if (parts.base) ++ if (opt ++ && AGGREGATE_TYPE_P (type) ++ && TYPE_SIZE (type) ++ && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST) + { +- parts.base = gen_lowpart (mode, parts.base); +- regno1 = true_regnum (parts.base); ++ if (wi::geu_p (wi::to_wide (TYPE_SIZE (type)), max_align_compat) ++ && align < max_align_compat) ++ align = max_align_compat; ++ if (wi::geu_p (wi::to_wide (TYPE_SIZE (type)), max_align) ++ && align < max_align) ++ align = max_align; + } + +- if (parts.index) ++ /* x86-64 ABI requires arrays greater than 16 bytes to be aligned ++ to 16byte boundary. */ ++ if (TARGET_64BIT) + { +- parts.index = gen_lowpart (mode, parts.index); +- regno2 = true_regnum (parts.index); ++ if ((opt ? 
AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE) ++ && TYPE_SIZE (type) ++ && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST ++ && wi::geu_p (wi::to_wide (TYPE_SIZE (type)), 128) ++ && align < 128) ++ return 128; + } + +- if (parts.disp) +- parts.disp = gen_lowpart (mode, parts.disp); ++ if (!opt) ++ return align; + +- if (parts.scale > 1) ++ if (TREE_CODE (type) == ARRAY_TYPE) ++ { ++ if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64) ++ return 64; ++ if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128) ++ return 128; ++ } ++ else if (TREE_CODE (type) == COMPLEX_TYPE) + { +- /* Case r1 = r1 + ... */ +- if (regno1 == regno0) +- { +- /* If we have a case r1 = r1 + C * r2 then we +- should use multiplication which is very +- expensive. Assume cost model is wrong if we +- have such case here. */ +- gcc_assert (regno2 != regno0); +- +- for (adds = parts.scale; adds > 0; adds--) +- ix86_emit_binop (PLUS, mode, target, parts.index); +- } +- else +- { +- /* r1 = r2 + r3 * C case. Need to move r3 into r1. */ +- if (regno0 != regno2) +- emit_insn (gen_rtx_SET (target, parts.index)); +- +- /* Use shift for scaling. */ +- ix86_emit_binop (ASHIFT, mode, target, +- GEN_INT (exact_log2 (parts.scale))); +- +- if (parts.base) +- ix86_emit_binop (PLUS, mode, target, parts.base); + +- if (parts.disp && parts.disp != const0_rtx) +- ix86_emit_binop (PLUS, mode, target, parts.disp); +- } ++ if (TYPE_MODE (type) == DCmode && align < 64) ++ return 64; ++ if ((TYPE_MODE (type) == XCmode ++ || TYPE_MODE (type) == TCmode) && align < 128) ++ return 128; + } +- else if (!parts.base && !parts.index) ++ else if ((TREE_CODE (type) == RECORD_TYPE ++ || TREE_CODE (type) == UNION_TYPE ++ || TREE_CODE (type) == QUAL_UNION_TYPE) ++ && TYPE_FIELDS (type)) + { +- gcc_assert(parts.disp); +- emit_insn (gen_rtx_SET (target, parts.disp)); ++ if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64) ++ return 64; ++ if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128) ++ return 128; + } +- else ++ else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE ++ || TREE_CODE (type) == INTEGER_TYPE) + { +- if (!parts.base) +- { +- if (regno0 != regno2) +- emit_insn (gen_rtx_SET (target, parts.index)); +- } +- else if (!parts.index) +- { +- if (regno0 != regno1) +- emit_insn (gen_rtx_SET (target, parts.base)); +- } +- else +- { +- if (regno0 == regno1) +- tmp = parts.index; +- else if (regno0 == regno2) +- tmp = parts.base; +- else +- { +- rtx tmp1; ++ if (TYPE_MODE (type) == DFmode && align < 64) ++ return 64; ++ if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128) ++ return 128; ++ } + +- /* Find better operand for SET instruction, depending +- on which definition is farther from the insn. */ +- if (find_nearest_reg_def (insn, regno1, regno2)) +- tmp = parts.index, tmp1 = parts.base; +- else +- tmp = parts.base, tmp1 = parts.index; ++ return align; ++} + +- emit_insn (gen_rtx_SET (target, tmp)); ++/* Compute the alignment for a local variable or a stack slot. EXP is ++ the data type or decl itself, MODE is the widest mode available and ++ ALIGN is the alignment that the object would ordinarily have. The ++ value of this macro is used instead of that alignment to align the ++ object. 
*/ + +- if (parts.disp && parts.disp != const0_rtx) +- ix86_emit_binop (PLUS, mode, target, parts.disp); ++unsigned int ++ix86_local_alignment (tree exp, machine_mode mode, ++ unsigned int align) ++{ ++ tree type, decl; + +- ix86_emit_binop (PLUS, mode, target, tmp1); +- return; +- } ++ if (exp && DECL_P (exp)) ++ { ++ type = TREE_TYPE (exp); ++ decl = exp; ++ } ++ else ++ { ++ type = exp; ++ decl = NULL; ++ } + +- ix86_emit_binop (PLUS, mode, target, tmp); +- } ++ /* Don't do dynamic stack realignment for long long objects with ++ -mpreferred-stack-boundary=2. */ ++ if (!TARGET_64BIT ++ && align == 64 ++ && ix86_preferred_stack_boundary < 64 ++ && (mode == DImode || (type && TYPE_MODE (type) == DImode)) ++ && (!type || !TYPE_USER_ALIGN (type)) ++ && (!decl || !DECL_USER_ALIGN (decl))) ++ align = 32; + +- if (parts.disp && parts.disp != const0_rtx) +- ix86_emit_binop (PLUS, mode, target, parts.disp); ++ /* If TYPE is NULL, we are allocating a stack slot for caller-save ++ register in MODE. We will return the largest alignment of XF ++ and DF. */ ++ if (!type) ++ { ++ if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode)) ++ align = GET_MODE_ALIGNMENT (DFmode); ++ return align; + } +-} + +-/* Return true if it is ok to optimize an ADD operation to LEA +- operation to avoid flag register consumation. For most processors, +- ADD is faster than LEA. For the processors like BONNELL, if the +- destination register of LEA holds an actual address which will be +- used soon, LEA is better and otherwise ADD is better. */ ++ /* Don't increase alignment for Intel MCU psABI. */ ++ if (TARGET_IAMCU) ++ return align; + +-bool +-ix86_lea_for_add_ok (rtx_insn *insn, rtx operands[]) +-{ +- unsigned int regno0 = true_regnum (operands[0]); +- unsigned int regno1 = true_regnum (operands[1]); +- unsigned int regno2 = true_regnum (operands[2]); ++ /* x86-64 ABI requires arrays greater than 16 bytes to be aligned ++ to 16byte boundary. Exact wording is: + +- /* If a = b + c, (a!=b && a!=c), must use lea form. */ +- if (regno0 != regno1 && regno0 != regno2) +- return true; ++ An array uses the same alignment as its elements, except that a local or ++ global array variable of length at least 16 bytes or ++ a C99 variable-length array variable always has alignment of at least 16 bytes. + +- if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun)) +- return false; ++ This was added to allow use of aligned SSE instructions at arrays. This ++ rule is meant for static storage (where compiler cannot do the analysis ++ by itself). We follow it for automatic variables only when convenient. ++ We fully control everything in the function compiled and functions from ++ other unit cannot rely on the alignment. + +- return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false); ++ Exclude va_list type. It is the common case of local array where ++ we cannot benefit from the alignment. ++ ++ TODO: Probably one should optimize for size only when var is not escaping. 
*/ ++ if (TARGET_64BIT && optimize_function_for_speed_p (cfun) ++ && TARGET_SSE) ++ { ++ if (AGGREGATE_TYPE_P (type) ++ && (va_list_type_node == NULL_TREE ++ || (TYPE_MAIN_VARIANT (type) ++ != TYPE_MAIN_VARIANT (va_list_type_node))) ++ && TYPE_SIZE (type) ++ && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST ++ && wi::geu_p (wi::to_wide (TYPE_SIZE (type)), 128) ++ && align < 128) ++ return 128; ++ } ++ if (TREE_CODE (type) == ARRAY_TYPE) ++ { ++ if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64) ++ return 64; ++ if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128) ++ return 128; ++ } ++ else if (TREE_CODE (type) == COMPLEX_TYPE) ++ { ++ if (TYPE_MODE (type) == DCmode && align < 64) ++ return 64; ++ if ((TYPE_MODE (type) == XCmode ++ || TYPE_MODE (type) == TCmode) && align < 128) ++ return 128; ++ } ++ else if ((TREE_CODE (type) == RECORD_TYPE ++ || TREE_CODE (type) == UNION_TYPE ++ || TREE_CODE (type) == QUAL_UNION_TYPE) ++ && TYPE_FIELDS (type)) ++ { ++ if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64) ++ return 64; ++ if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128) ++ return 128; ++ } ++ else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE ++ || TREE_CODE (type) == INTEGER_TYPE) ++ { ++ ++ if (TYPE_MODE (type) == DFmode && align < 64) ++ return 64; ++ if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128) ++ return 128; ++ } ++ return align; + } + +-/* Return true if destination reg of SET_BODY is shift count of +- USE_BODY. */ ++/* Compute the minimum required alignment for dynamic stack realignment ++ purposes for a local variable, parameter or a stack slot. EXP is ++ the data type or decl itself, MODE is its mode and ALIGN is the ++ alignment that the object would ordinarily have. */ + +-static bool +-ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body) ++unsigned int ++ix86_minimum_alignment (tree exp, machine_mode mode, ++ unsigned int align) + { +- rtx set_dest; +- rtx shift_rtx; +- int i; ++ tree type, decl; + +- /* Retrieve destination of SET_BODY. */ +- switch (GET_CODE (set_body)) ++ if (exp && DECL_P (exp)) + { +- case SET: +- set_dest = SET_DEST (set_body); +- if (!set_dest || !REG_P (set_dest)) +- return false; +- break; +- case PARALLEL: +- for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--) +- if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i), +- use_body)) +- return true; +- /* FALLTHROUGH */ +- default: +- return false; ++ type = TREE_TYPE (exp); ++ decl = exp; + } +- +- /* Retrieve shift count of USE_BODY. */ +- switch (GET_CODE (use_body)) ++ else + { +- case SET: +- shift_rtx = XEXP (use_body, 1); +- break; +- case PARALLEL: +- for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--) +- if (ix86_dep_by_shift_count_body (set_body, +- XVECEXP (use_body, 0, i))) +- return true; +- /* FALLTHROUGH */ +- default: +- return false; ++ type = exp; ++ decl = NULL; + } + +- if (shift_rtx +- && (GET_CODE (shift_rtx) == ASHIFT +- || GET_CODE (shift_rtx) == LSHIFTRT +- || GET_CODE (shift_rtx) == ASHIFTRT +- || GET_CODE (shift_rtx) == ROTATE +- || GET_CODE (shift_rtx) == ROTATERT)) +- { +- rtx shift_count = XEXP (shift_rtx, 1); ++ if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64) ++ return align; + +- /* Return true if shift count is dest of SET_BODY. */ +- if (REG_P (shift_count)) +- { +- /* Add check since it can be invoked before register +- allocation in pre-reload schedule. 
*/ +- if (reload_completed +- && true_regnum (set_dest) == true_regnum (shift_count)) +- return true; +- else if (REGNO(set_dest) == REGNO(shift_count)) +- return true; +- } ++ /* Don't do dynamic stack realignment for long long objects with ++ -mpreferred-stack-boundary=2. */ ++ if ((mode == DImode || (type && TYPE_MODE (type) == DImode)) ++ && (!type || !TYPE_USER_ALIGN (type)) ++ && (!decl || !DECL_USER_ALIGN (decl))) ++ { ++ gcc_checking_assert (!TARGET_STV); ++ return 32; + } + +- return false; ++ return align; + } ++ ++/* Find a location for the static chain incoming to a nested function. ++ This is a register, unless all free registers are used by arguments. */ + +-/* Return true if destination reg of SET_INSN is shift count of +- USE_INSN. */ +- +-bool +-ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn) ++static rtx ++ix86_static_chain (const_tree fndecl_or_type, bool incoming_p) + { +- return ix86_dep_by_shift_count_body (PATTERN (set_insn), +- PATTERN (use_insn)); +-} ++ unsigned regno; + +-/* Return TRUE or FALSE depending on whether the unary operator meets the +- appropriate constraints. */ ++ if (TARGET_64BIT) ++ { ++ /* We always use R10 in 64-bit mode. */ ++ regno = R10_REG; ++ } ++ else ++ { ++ const_tree fntype, fndecl; ++ unsigned int ccvt; + +-bool +-ix86_unary_operator_ok (enum rtx_code, +- machine_mode, +- rtx operands[2]) +-{ +- /* If one of operands is memory, source and destination must match. */ +- if ((MEM_P (operands[0]) +- || MEM_P (operands[1])) +- && ! rtx_equal_p (operands[0], operands[1])) +- return false; +- return true; +-} ++ /* By default in 32-bit mode we use ECX to pass the static chain. */ ++ regno = CX_REG; + +-/* Return TRUE if the operands to a vec_interleave_{high,low}v2df +- are ok, keeping in mind the possible movddup alternative. */ ++ if (TREE_CODE (fndecl_or_type) == FUNCTION_DECL) ++ { ++ fntype = TREE_TYPE (fndecl_or_type); ++ fndecl = fndecl_or_type; ++ } ++ else ++ { ++ fntype = fndecl_or_type; ++ fndecl = NULL; ++ } + +-bool +-ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high) +-{ +- if (MEM_P (operands[0])) +- return rtx_equal_p (operands[0], operands[1 + high]); +- if (MEM_P (operands[1]) && MEM_P (operands[2])) +- return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]); +- return true; ++ ccvt = ix86_get_callcvt (fntype); ++ if ((ccvt & IX86_CALLCVT_FASTCALL) != 0) ++ { ++ /* Fastcall functions use ecx/edx for arguments, which leaves ++ us with EAX for the static chain. ++ Thiscall functions use ecx for arguments, which also ++ leaves us with EAX for the static chain. */ ++ regno = AX_REG; ++ } ++ else if ((ccvt & IX86_CALLCVT_THISCALL) != 0) ++ { ++ /* Thiscall functions use ecx for arguments, which leaves ++ us with EAX and EDX for the static chain. ++ We are using for abi-compatibility EAX. */ ++ regno = AX_REG; ++ } ++ else if (ix86_function_regparm (fntype, fndecl) == 3) ++ { ++ /* For regparm 3, we have no free call-clobbered registers in ++ which to store the static chain. In order to implement this, ++ we have the trampoline push the static chain to the stack. ++ However, we can't push a value below the return address when ++ we call the nested function directly, so we have to use an ++ alternate entry point. For this we use ESI, and have the ++ alternate entry point push ESI, so that things appear the ++ same once we're executing the nested function. 
*/ ++ if (incoming_p) ++ { ++ if (fndecl == current_function_decl ++ && !ix86_static_chain_on_stack) ++ { ++ gcc_assert (!reload_completed); ++ ix86_static_chain_on_stack = true; ++ } ++ return gen_frame_mem (SImode, ++ plus_constant (Pmode, ++ arg_pointer_rtx, -8)); ++ } ++ regno = SI_REG; ++ } ++ } ++ ++ return gen_rtx_REG (Pmode, regno); + } + +-/* Post-reload splitter for converting an SF or DFmode value in an +- SSE register into an unsigned SImode. */ ++/* Emit RTL insns to initialize the variable parts of a trampoline. ++ FNDECL is the decl of the target address; M_TRAMP is a MEM for ++ the trampoline, and CHAIN_VALUE is an RTX for the static chain ++ to be passed to the target function. */ + +-void +-ix86_split_convert_uns_si_sse (rtx operands[]) ++static void ++ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value) + { +- machine_mode vecmode; +- rtx value, large, zero_or_two31, input, two31, x; ++ rtx mem, fnaddr; ++ int opcode; ++ int offset = 0; ++ bool need_endbr = (flag_cf_protection & CF_BRANCH); + +- large = operands[1]; +- zero_or_two31 = operands[2]; +- input = operands[3]; +- two31 = operands[4]; +- vecmode = GET_MODE (large); +- value = gen_rtx_REG (vecmode, REGNO (operands[0])); ++ fnaddr = XEXP (DECL_RTL (fndecl), 0); + +- /* Load up the value into the low element. We must ensure that the other +- elements are valid floats -- zero is the easiest such value. */ +- if (MEM_P (input)) +- { +- if (vecmode == V4SFmode) +- emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input)); +- else +- emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input)); +- } +- else ++ if (TARGET_64BIT) + { +- input = gen_rtx_REG (vecmode, REGNO (input)); +- emit_move_insn (value, CONST0_RTX (vecmode)); +- if (vecmode == V4SFmode) +- emit_insn (gen_sse_movss (value, value, input)); +- else +- emit_insn (gen_sse2_movsd (value, value, input)); +- } +- +- emit_move_insn (large, two31); +- emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31); ++ int size; + +- x = gen_rtx_fmt_ee (LE, vecmode, large, value); +- emit_insn (gen_rtx_SET (large, x)); ++ if (need_endbr) ++ { ++ /* Insert ENDBR64. */ ++ mem = adjust_address (m_tramp, SImode, offset); ++ emit_move_insn (mem, gen_int_mode (0xfa1e0ff3, SImode)); ++ offset += 4; ++ } + +- x = gen_rtx_AND (vecmode, zero_or_two31, large); +- emit_insn (gen_rtx_SET (zero_or_two31, x)); ++ /* Load the function address to r11. Try to load address using ++ the shorter movl instead of movabs. We may want to support ++ movq for kernel mode, but kernel does not use trampolines at ++ the moment. FNADDR is a 32bit address and may not be in ++ DImode when ptr_mode == SImode. Always use movl in this ++ case. 
*/ ++ if (ptr_mode == SImode ++ || x86_64_zext_immediate_operand (fnaddr, VOIDmode)) ++ { ++ fnaddr = copy_addr_to_reg (fnaddr); + +- x = gen_rtx_MINUS (vecmode, value, zero_or_two31); +- emit_insn (gen_rtx_SET (value, x)); ++ mem = adjust_address (m_tramp, HImode, offset); ++ emit_move_insn (mem, gen_int_mode (0xbb41, HImode)); + +- large = gen_rtx_REG (V4SImode, REGNO (large)); +- emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31))); ++ mem = adjust_address (m_tramp, SImode, offset + 2); ++ emit_move_insn (mem, gen_lowpart (SImode, fnaddr)); ++ offset += 6; ++ } ++ else ++ { ++ mem = adjust_address (m_tramp, HImode, offset); ++ emit_move_insn (mem, gen_int_mode (0xbb49, HImode)); + +- x = gen_rtx_REG (V4SImode, REGNO (value)); +- if (vecmode == V4SFmode) +- emit_insn (gen_fix_truncv4sfv4si2 (x, value)); +- else +- emit_insn (gen_sse2_cvttpd2dq (x, value)); +- value = x; ++ mem = adjust_address (m_tramp, DImode, offset + 2); ++ emit_move_insn (mem, fnaddr); ++ offset += 10; ++ } + +- emit_insn (gen_xorv4si3 (value, value, large)); +-} ++ /* Load static chain using movabs to r10. Use the shorter movl ++ instead of movabs when ptr_mode == SImode. */ ++ if (ptr_mode == SImode) ++ { ++ opcode = 0xba41; ++ size = 6; ++ } ++ else ++ { ++ opcode = 0xba49; ++ size = 10; ++ } + +-/* Convert an unsigned DImode value into a DFmode, using only SSE. +- Expects the 64-bit DImode to be supplied in a pair of integral +- registers. Requires SSE2; will use SSE3 if available. For x86_32, +- -mfpmath=sse, !optimize_size only. */ ++ mem = adjust_address (m_tramp, HImode, offset); ++ emit_move_insn (mem, gen_int_mode (opcode, HImode)); + +-void +-ix86_expand_convert_uns_didf_sse (rtx target, rtx input) +-{ +- REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt; +- rtx int_xmm, fp_xmm; +- rtx biases, exponents; +- rtx x; ++ mem = adjust_address (m_tramp, ptr_mode, offset + 2); ++ emit_move_insn (mem, chain_value); ++ offset += size; + +- int_xmm = gen_reg_rtx (V4SImode); +- if (TARGET_INTER_UNIT_MOVES_TO_VEC) +- emit_insn (gen_movdi_to_sse (int_xmm, input)); +- else if (TARGET_SSE_SPLIT_REGS) +- { +- emit_clobber (int_xmm); +- emit_move_insn (gen_lowpart (DImode, int_xmm), input); ++ /* Jump to r11; the last (unused) byte is a nop, only there to ++ pad the write out to a single 32-bit store. */ ++ mem = adjust_address (m_tramp, SImode, offset); ++ emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode)); ++ offset += 4; + } + else + { +- x = gen_reg_rtx (V2DImode); +- ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0); +- emit_move_insn (int_xmm, gen_lowpart (V4SImode, x)); +- } +- +- x = gen_rtx_CONST_VECTOR (V4SImode, +- gen_rtvec (4, GEN_INT (0x43300000UL), +- GEN_INT (0x45300000UL), +- const0_rtx, const0_rtx)); +- exponents = validize_mem (force_const_mem (V4SImode, x)); +- +- /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */ +- emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents)); +- +- /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm) +- yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)). +- Similarly (0x45300000UL ## fp_value_hi_xmm) yields +- (0x1.0p84 + double(fp_value_hi_xmm)). +- Note these exponents differ by 32. */ +- +- fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm)); +- +- /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values +- in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. 
*/ +- real_ldexp (&bias_lo_rvt, &dconst1, 52); +- real_ldexp (&bias_hi_rvt, &dconst1, 84); +- biases = const_double_from_real_value (bias_lo_rvt, DFmode); +- x = const_double_from_real_value (bias_hi_rvt, DFmode); +- biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x)); +- biases = validize_mem (force_const_mem (V2DFmode, biases)); +- emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases)); +- +- /* Add the upper and lower DFmode values together. */ +- if (TARGET_SSE3) +- emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm)); +- else +- { +- x = copy_to_mode_reg (V2DFmode, fp_xmm); +- emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm)); +- emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x)); +- } ++ rtx disp, chain; + +- ix86_expand_vector_extract (false, target, fp_xmm, 0); +-} ++ /* Depending on the static chain location, either load a register ++ with a constant, or push the constant to the stack. All of the ++ instructions are the same size. */ ++ chain = ix86_static_chain (fndecl, true); ++ if (REG_P (chain)) ++ { ++ switch (REGNO (chain)) ++ { ++ case AX_REG: ++ opcode = 0xb8; break; ++ case CX_REG: ++ opcode = 0xb9; break; ++ default: ++ gcc_unreachable (); ++ } ++ } ++ else ++ opcode = 0x68; + +-/* Not used, but eases macroization of patterns. */ +-void +-ix86_expand_convert_uns_sixf_sse (rtx, rtx) +-{ +- gcc_unreachable (); +-} ++ if (need_endbr) ++ { ++ /* Insert ENDBR32. */ ++ mem = adjust_address (m_tramp, SImode, offset); ++ emit_move_insn (mem, gen_int_mode (0xfb1e0ff3, SImode)); ++ offset += 4; ++ } + +-/* Convert an unsigned SImode value into a DFmode. Only currently used +- for SSE, but applicable anywhere. */ ++ mem = adjust_address (m_tramp, QImode, offset); ++ emit_move_insn (mem, gen_int_mode (opcode, QImode)); + +-void +-ix86_expand_convert_uns_sidf_sse (rtx target, rtx input) +-{ +- REAL_VALUE_TYPE TWO31r; +- rtx x, fp; ++ mem = adjust_address (m_tramp, SImode, offset + 1); ++ emit_move_insn (mem, chain_value); ++ offset += 5; + +- x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1), +- NULL, 1, OPTAB_DIRECT); ++ mem = adjust_address (m_tramp, QImode, offset); ++ emit_move_insn (mem, gen_int_mode (0xe9, QImode)); ++ ++ mem = adjust_address (m_tramp, SImode, offset + 1); + +- fp = gen_reg_rtx (DFmode); +- emit_insn (gen_floatsidf2 (fp, x)); ++ /* Compute offset from the end of the jmp to the target function. ++ In the case in which the trampoline stores the static chain on ++ the stack, we need to skip the first insn which pushes the ++ (call-saved) register static chain; this push is 1 byte. */ ++ offset += 5; ++ int skip = MEM_P (chain) ? 1 : 0; ++ /* Skip ENDBR32 at the entry of the target function. 
*/ ++ if (need_endbr ++ && !cgraph_node::get (fndecl)->only_called_directly_p ()) ++ skip += 4; ++ disp = expand_binop (SImode, sub_optab, fnaddr, ++ plus_constant (Pmode, XEXP (m_tramp, 0), ++ offset - skip), ++ NULL_RTX, 1, OPTAB_DIRECT); ++ emit_move_insn (mem, disp); ++ } + +- real_ldexp (&TWO31r, &dconst1, 31); +- x = const_double_from_real_value (TWO31r, DFmode); ++ gcc_assert (offset <= TRAMPOLINE_SIZE); + +- x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT); +- if (x != target) +- emit_move_insn (target, x); ++#ifdef HAVE_ENABLE_EXECUTE_STACK ++#ifdef CHECK_EXECUTE_STACK_ENABLED ++ if (CHECK_EXECUTE_STACK_ENABLED) ++#endif ++ emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"), ++ LCT_NORMAL, VOIDmode, XEXP (m_tramp, 0), Pmode); ++#endif + } + +-/* Convert a signed DImode value into a DFmode. Only used for SSE in +- 32-bit mode; otherwise we have a direct convert instruction. */ +- +-void +-ix86_expand_convert_sign_didf_sse (rtx target, rtx input) ++static bool ++ix86_allocate_stack_slots_for_args (void) + { +- REAL_VALUE_TYPE TWO32r; +- rtx fp_lo, fp_hi, x; +- +- fp_lo = gen_reg_rtx (DFmode); +- fp_hi = gen_reg_rtx (DFmode); ++ /* Naked functions should not allocate stack slots for arguments. */ ++ return !ix86_function_naked (current_function_decl); ++} + +- emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input))); +- +- real_ldexp (&TWO32r, &dconst1, 32); +- x = const_double_from_real_value (TWO32r, DFmode); +- fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT); +- +- ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input)); +- +- x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target, +- 0, OPTAB_DIRECT); +- if (x != target) +- emit_move_insn (target, x); +-} +- +-/* Convert an unsigned SImode value into a SFmode, using only SSE. +- For x86_32, -mfpmath=sse, !optimize_size only. */ +-void +-ix86_expand_convert_uns_sisf_sse (rtx target, rtx input) +-{ +- REAL_VALUE_TYPE ONE16r; +- rtx fp_hi, fp_lo, int_hi, int_lo, x; +- +- real_ldexp (&ONE16r, &dconst1, 16); +- x = const_double_from_real_value (ONE16r, SFmode); +- int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff), +- NULL, 0, OPTAB_DIRECT); +- int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16), +- NULL, 0, OPTAB_DIRECT); +- fp_hi = gen_reg_rtx (SFmode); +- fp_lo = gen_reg_rtx (SFmode); +- emit_insn (gen_floatsisf2 (fp_hi, int_hi)); +- emit_insn (gen_floatsisf2 (fp_lo, int_lo)); +- fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi, +- 0, OPTAB_DIRECT); +- fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target, +- 0, OPTAB_DIRECT); +- if (!rtx_equal_p (target, fp_hi)) +- emit_move_insn (target, fp_hi); +-} +- +-/* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert +- a vector of unsigned ints VAL to vector of floats TARGET. 
*/ +- +-void +-ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val) +-{ +- rtx tmp[8]; +- REAL_VALUE_TYPE TWO16r; +- machine_mode intmode = GET_MODE (val); +- machine_mode fltmode = GET_MODE (target); +- rtx (*cvt) (rtx, rtx); +- +- if (intmode == V4SImode) +- cvt = gen_floatv4siv4sf2; +- else +- cvt = gen_floatv8siv8sf2; +- tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff)); +- tmp[0] = force_reg (intmode, tmp[0]); +- tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1, +- OPTAB_DIRECT); +- tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16), +- NULL_RTX, 1, OPTAB_DIRECT); +- tmp[3] = gen_reg_rtx (fltmode); +- emit_insn (cvt (tmp[3], tmp[1])); +- tmp[4] = gen_reg_rtx (fltmode); +- emit_insn (cvt (tmp[4], tmp[2])); +- real_ldexp (&TWO16r, &dconst1, 16); +- tmp[5] = const_double_from_real_value (TWO16r, SFmode); +- tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5])); +- tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1, +- OPTAB_DIRECT); +- tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1, +- OPTAB_DIRECT); +- if (tmp[7] != target) +- emit_move_insn (target, tmp[7]); +-} +- +-/* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc* +- pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*. +- This is done by doing just signed conversion if < 0x1p31, and otherwise by +- subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */ +- +-rtx +-ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp) +-{ +- REAL_VALUE_TYPE TWO31r; +- rtx two31r, tmp[4]; +- machine_mode mode = GET_MODE (val); +- machine_mode scalarmode = GET_MODE_INNER (mode); +- machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode; +- rtx (*cmp) (rtx, rtx, rtx, rtx); +- int i; +- +- for (i = 0; i < 3; i++) +- tmp[i] = gen_reg_rtx (mode); +- real_ldexp (&TWO31r, &dconst1, 31); +- two31r = const_double_from_real_value (TWO31r, scalarmode); +- two31r = ix86_build_const_vector (mode, 1, two31r); +- two31r = force_reg (mode, two31r); +- switch (mode) +- { +- case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break; +- case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break; +- case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break; +- case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break; +- default: gcc_unreachable (); +- } +- tmp[3] = gen_rtx_LE (mode, two31r, val); +- emit_insn (cmp (tmp[0], two31r, val, tmp[3])); +- tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1], +- 0, OPTAB_DIRECT); +- if (intmode == V4SImode || TARGET_AVX2) +- *xorp = expand_simple_binop (intmode, ASHIFT, +- gen_lowpart (intmode, tmp[0]), +- GEN_INT (31), NULL_RTX, 0, +- OPTAB_DIRECT); +- else +- { +- rtx two31 = GEN_INT (HOST_WIDE_INT_1U << 31); +- two31 = ix86_build_const_vector (intmode, 1, two31); +- *xorp = expand_simple_binop (intmode, AND, +- gen_lowpart (intmode, tmp[0]), +- two31, NULL_RTX, 0, +- OPTAB_DIRECT); +- } +- return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2], +- 0, OPTAB_DIRECT); +-} +- +-/* A subroutine of ix86_build_signbit_mask. If VECT is true, +- then replicate the value for all elements of the vector +- register. 
*/ +- +-rtx +-ix86_build_const_vector (machine_mode mode, bool vect, rtx value) ++static bool ++ix86_warn_func_return (tree decl) + { +- int i, n_elt; +- rtvec v; +- machine_mode scalar_mode; +- +- switch (mode) +- { +- case E_V64QImode: +- case E_V32QImode: +- case E_V16QImode: +- case E_V32HImode: +- case E_V16HImode: +- case E_V8HImode: +- case E_V16SImode: +- case E_V8SImode: +- case E_V4SImode: +- case E_V8DImode: +- case E_V4DImode: +- case E_V2DImode: +- gcc_assert (vect); +- /* FALLTHRU */ +- case E_V16SFmode: +- case E_V8SFmode: +- case E_V4SFmode: +- case E_V8DFmode: +- case E_V4DFmode: +- case E_V2DFmode: +- n_elt = GET_MODE_NUNITS (mode); +- v = rtvec_alloc (n_elt); +- scalar_mode = GET_MODE_INNER (mode); +- +- RTVEC_ELT (v, 0) = value; +- +- for (i = 1; i < n_elt; ++i) +- RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode); +- +- return gen_rtx_CONST_VECTOR (mode, v); +- +- default: +- gcc_unreachable (); +- } ++ /* Naked functions are implemented entirely in assembly, including the ++ return sequence, so suppress warnings about this. */ ++ return !ix86_function_naked (decl); + } +- +-/* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders +- and ix86_expand_int_vcond. Create a mask for the sign bit in MODE +- for an SSE register. If VECT is true, then replicate the mask for +- all elements of the vector register. If INVERT is true, then create +- a mask excluding the sign bit. */ +- +-rtx +-ix86_build_signbit_mask (machine_mode mode, bool vect, bool invert) ++ ++/* Return the shift count of a vector by scalar shift builtin second argument ++ ARG1. */ ++static tree ++ix86_vector_shift_count (tree arg1) + { +- machine_mode vec_mode, imode; +- wide_int w; +- rtx mask, v; +- +- switch (mode) ++ if (tree_fits_uhwi_p (arg1)) ++ return arg1; ++ else if (TREE_CODE (arg1) == VECTOR_CST && CHAR_BIT == 8) + { +- case E_V16SImode: +- case E_V16SFmode: +- case E_V8SImode: +- case E_V4SImode: +- case E_V8SFmode: +- case E_V4SFmode: +- vec_mode = mode; +- imode = SImode; +- break; +- +- case E_V8DImode: +- case E_V4DImode: +- case E_V2DImode: +- case E_V8DFmode: +- case E_V4DFmode: +- case E_V2DFmode: +- vec_mode = mode; +- imode = DImode; +- break; +- +- case E_TImode: +- case E_TFmode: +- vec_mode = VOIDmode; +- imode = TImode; +- break; +- +- default: +- gcc_unreachable (); ++ /* The count argument is weird, passed in as various 128-bit ++ (or 64-bit) vectors, the low 64 bits from it are the count. */ ++ unsigned char buf[16]; ++ int len = native_encode_expr (arg1, buf, 16); ++ if (len == 0) ++ return NULL_TREE; ++ tree t = native_interpret_expr (uint64_type_node, buf, len); ++ if (t && tree_fits_uhwi_p (t)) ++ return t; + } +- +- machine_mode inner_mode = GET_MODE_INNER (mode); +- w = wi::set_bit_in_zero (GET_MODE_BITSIZE (inner_mode) - 1, +- GET_MODE_BITSIZE (inner_mode)); +- if (invert) +- w = wi::bit_not (w); +- +- /* Force this value into the low part of a fp vector constant. */ +- mask = immed_wide_int_const (w, imode); +- mask = gen_lowpart (inner_mode, mask); +- +- if (vec_mode == VOIDmode) +- return force_reg (inner_mode, mask); +- +- v = ix86_build_const_vector (vec_mode, vect, mask); +- return force_reg (vec_mode, v); ++ return NULL_TREE; + } + +-/* Generate code for floating point ABS or NEG. 
*/ +- +-void +-ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode, +- rtx operands[]) ++static tree ++ix86_fold_builtin (tree fndecl, int n_args, ++ tree *args, bool ignore ATTRIBUTE_UNUSED) + { +- rtx mask, set, dst, src; +- bool use_sse = false; +- bool vector_mode = VECTOR_MODE_P (mode); +- machine_mode vmode = mode; +- +- if (vector_mode) +- use_sse = true; +- else if (mode == TFmode) +- use_sse = true; +- else if (TARGET_SSE_MATH) +- { +- use_sse = SSE_FLOAT_MODE_P (mode); +- if (mode == SFmode) +- vmode = V4SFmode; +- else if (mode == DFmode) +- vmode = V2DFmode; +- } +- +- /* NEG and ABS performed with SSE use bitwise mask operations. +- Create the appropriate mask now. */ +- if (use_sse) +- mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS); +- else +- mask = NULL_RTX; +- +- dst = operands[0]; +- src = operands[1]; +- +- set = gen_rtx_fmt_e (code, mode, src); +- set = gen_rtx_SET (dst, set); +- +- if (mask) ++ if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD) + { +- rtx use, clob; +- rtvec par; ++ enum ix86_builtins fn_code ++ = (enum ix86_builtins) DECL_MD_FUNCTION_CODE (fndecl); ++ enum rtx_code rcode; ++ bool is_vshift; ++ unsigned HOST_WIDE_INT mask; + +- use = gen_rtx_USE (VOIDmode, mask); +- if (vector_mode) +- par = gen_rtvec (2, set, use); +- else ++ switch (fn_code) + { +- clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); +- par = gen_rtvec (3, set, use, clob); +- } +- emit_insn (gen_rtx_PARALLEL (VOIDmode, par)); +- } +- else +- emit_insn (set); +-} +- +-/* Expand a copysign operation. Special case operand 0 being a constant. */ +- +-void +-ix86_expand_copysign (rtx operands[]) +-{ +- machine_mode mode, vmode; +- rtx dest, op0, op1, mask, nmask; +- +- dest = operands[0]; +- op0 = operands[1]; +- op1 = operands[2]; +- +- mode = GET_MODE (dest); ++ case IX86_BUILTIN_CPU_IS: ++ case IX86_BUILTIN_CPU_SUPPORTS: ++ gcc_assert (n_args == 1); ++ return fold_builtin_cpu (fndecl, args); + +- if (mode == SFmode) +- vmode = V4SFmode; +- else if (mode == DFmode) +- vmode = V2DFmode; +- else +- vmode = mode; ++ case IX86_BUILTIN_NANQ: ++ case IX86_BUILTIN_NANSQ: ++ { ++ tree type = TREE_TYPE (TREE_TYPE (fndecl)); ++ const char *str = c_getstr (*args); ++ int quiet = fn_code == IX86_BUILTIN_NANQ; ++ REAL_VALUE_TYPE real; + +- if (CONST_DOUBLE_P (op0)) +- { +- rtx (*copysign_insn)(rtx, rtx, rtx, rtx); ++ if (str && real_nan (&real, str, quiet, TYPE_MODE (type))) ++ return build_real (type, real); ++ return NULL_TREE; ++ } + +- if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0))) +- op0 = simplify_unary_operation (ABS, mode, op0, mode); ++ case IX86_BUILTIN_INFQ: ++ case IX86_BUILTIN_HUGE_VALQ: ++ { ++ tree type = TREE_TYPE (TREE_TYPE (fndecl)); ++ REAL_VALUE_TYPE inf; ++ real_inf (&inf); ++ return build_real (type, inf); ++ } + +- if (mode == SFmode || mode == DFmode) +- { +- if (op0 == CONST0_RTX (mode)) +- op0 = CONST0_RTX (vmode); +- else ++ case IX86_BUILTIN_TZCNT16: ++ case IX86_BUILTIN_CTZS: ++ case IX86_BUILTIN_TZCNT32: ++ case IX86_BUILTIN_TZCNT64: ++ gcc_assert (n_args == 1); ++ if (TREE_CODE (args[0]) == INTEGER_CST) + { +- rtx v = ix86_build_const_vector (vmode, false, op0); +- +- op0 = force_reg (vmode, v); ++ tree type = TREE_TYPE (TREE_TYPE (fndecl)); ++ tree arg = args[0]; ++ if (fn_code == IX86_BUILTIN_TZCNT16 ++ || fn_code == IX86_BUILTIN_CTZS) ++ arg = fold_convert (short_unsigned_type_node, arg); ++ if (integer_zerop (arg)) ++ return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg))); ++ else ++ return fold_const_call 
(CFN_CTZ, type, arg); + } +- } +- else if (op0 != CONST0_RTX (mode)) +- op0 = force_reg (mode, op0); +- +- mask = ix86_build_signbit_mask (vmode, 0, 0); +- +- if (mode == SFmode) +- copysign_insn = gen_copysignsf3_const; +- else if (mode == DFmode) +- copysign_insn = gen_copysigndf3_const; +- else +- copysign_insn = gen_copysigntf3_const; +- +- emit_insn (copysign_insn (dest, op0, op1, mask)); +- } +- else +- { +- rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx); +- +- nmask = ix86_build_signbit_mask (vmode, 0, 1); +- mask = ix86_build_signbit_mask (vmode, 0, 0); +- +- if (mode == SFmode) +- copysign_insn = gen_copysignsf3_var; +- else if (mode == DFmode) +- copysign_insn = gen_copysigndf3_var; +- else +- copysign_insn = gen_copysigntf3_var; +- +- emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask)); +- } +-} ++ break; + +-/* Deconstruct a copysign operation into bit masks. Operand 0 is known to +- be a constant, and so has already been expanded into a vector constant. */ ++ case IX86_BUILTIN_LZCNT16: ++ case IX86_BUILTIN_CLZS: ++ case IX86_BUILTIN_LZCNT32: ++ case IX86_BUILTIN_LZCNT64: ++ gcc_assert (n_args == 1); ++ if (TREE_CODE (args[0]) == INTEGER_CST) ++ { ++ tree type = TREE_TYPE (TREE_TYPE (fndecl)); ++ tree arg = args[0]; ++ if (fn_code == IX86_BUILTIN_LZCNT16 ++ || fn_code == IX86_BUILTIN_CLZS) ++ arg = fold_convert (short_unsigned_type_node, arg); ++ if (integer_zerop (arg)) ++ return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg))); ++ else ++ return fold_const_call (CFN_CLZ, type, arg); ++ } ++ break; + +-void +-ix86_split_copysign_const (rtx operands[]) +-{ +- machine_mode mode, vmode; +- rtx dest, op0, mask, x; +- +- dest = operands[0]; +- op0 = operands[1]; +- mask = operands[3]; +- +- mode = GET_MODE (dest); +- vmode = GET_MODE (mask); +- +- dest = lowpart_subreg (vmode, dest, mode); +- x = gen_rtx_AND (vmode, dest, mask); +- emit_insn (gen_rtx_SET (dest, x)); +- +- if (op0 != CONST0_RTX (vmode)) +- { +- x = gen_rtx_IOR (vmode, dest, op0); +- emit_insn (gen_rtx_SET (dest, x)); +- } +-} +- +-/* Deconstruct a copysign operation into bit masks. Operand 0 is variable, +- so we have to do two masks. */ +- +-void +-ix86_split_copysign_var (rtx operands[]) +-{ +- machine_mode mode, vmode; +- rtx dest, scratch, op0, op1, mask, nmask, x; +- +- dest = operands[0]; +- scratch = operands[1]; +- op0 = operands[2]; +- op1 = operands[3]; +- nmask = operands[4]; +- mask = operands[5]; +- +- mode = GET_MODE (dest); +- vmode = GET_MODE (mask); +- +- if (rtx_equal_p (op0, op1)) +- { +- /* Shouldn't happen often (it's useless, obviously), but when it does +- we'd generate incorrect code if we continue below. 
*/ +- emit_move_insn (dest, op0); +- return; +- } +- +- if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */ +- { +- gcc_assert (REGNO (op1) == REGNO (scratch)); +- +- x = gen_rtx_AND (vmode, scratch, mask); +- emit_insn (gen_rtx_SET (scratch, x)); +- +- dest = mask; +- op0 = lowpart_subreg (vmode, op0, mode); +- x = gen_rtx_NOT (vmode, dest); +- x = gen_rtx_AND (vmode, x, op0); +- emit_insn (gen_rtx_SET (dest, x)); +- } +- else +- { +- if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */ +- { +- x = gen_rtx_AND (vmode, scratch, mask); +- } +- else /* alternative 2,4 */ +- { +- gcc_assert (REGNO (mask) == REGNO (scratch)); +- op1 = lowpart_subreg (vmode, op1, mode); +- x = gen_rtx_AND (vmode, scratch, op1); +- } +- emit_insn (gen_rtx_SET (scratch, x)); +- +- if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */ +- { +- dest = lowpart_subreg (vmode, op0, mode); +- x = gen_rtx_AND (vmode, dest, nmask); +- } +- else /* alternative 3,4 */ +- { +- gcc_assert (REGNO (nmask) == REGNO (dest)); +- dest = nmask; +- op0 = lowpart_subreg (vmode, op0, mode); +- x = gen_rtx_AND (vmode, dest, op0); +- } +- emit_insn (gen_rtx_SET (dest, x)); +- } +- +- x = gen_rtx_IOR (vmode, dest, scratch); +- emit_insn (gen_rtx_SET (dest, x)); +-} +- +-/* Expand an xorsign operation. */ +- +-void +-ix86_expand_xorsign (rtx operands[]) +-{ +- rtx (*xorsign_insn)(rtx, rtx, rtx, rtx); +- machine_mode mode, vmode; +- rtx dest, op0, op1, mask; +- +- dest = operands[0]; +- op0 = operands[1]; +- op1 = operands[2]; +- +- mode = GET_MODE (dest); +- +- if (mode == SFmode) +- { +- xorsign_insn = gen_xorsignsf3_1; +- vmode = V4SFmode; +- } +- else if (mode == DFmode) +- { +- xorsign_insn = gen_xorsigndf3_1; +- vmode = V2DFmode; +- } +- else +- gcc_unreachable (); +- +- mask = ix86_build_signbit_mask (vmode, 0, 0); +- +- emit_insn (xorsign_insn (dest, op0, op1, mask)); +-} +- +-/* Deconstruct an xorsign operation into bit masks. */ +- +-void +-ix86_split_xorsign (rtx operands[]) +-{ +- machine_mode mode, vmode; +- rtx dest, op0, mask, x; +- +- dest = operands[0]; +- op0 = operands[1]; +- mask = operands[3]; +- +- mode = GET_MODE (dest); +- vmode = GET_MODE (mask); +- +- dest = lowpart_subreg (vmode, dest, mode); +- x = gen_rtx_AND (vmode, dest, mask); +- emit_insn (gen_rtx_SET (dest, x)); +- +- op0 = lowpart_subreg (vmode, op0, mode); +- x = gen_rtx_XOR (vmode, dest, op0); +- emit_insn (gen_rtx_SET (dest, x)); +-} +- +-/* Return TRUE or FALSE depending on whether the first SET in INSN +- has source and destination with matching CC modes, and that the +- CC mode is at least as constrained as REQ_MODE. 
*/ +- +-bool +-ix86_match_ccmode (rtx insn, machine_mode req_mode) +-{ +- rtx set; +- machine_mode set_mode; +- +- set = PATTERN (insn); +- if (GET_CODE (set) == PARALLEL) +- set = XVECEXP (set, 0, 0); +- gcc_assert (GET_CODE (set) == SET); +- gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE); +- +- set_mode = GET_MODE (SET_DEST (set)); +- switch (set_mode) +- { +- case E_CCNOmode: +- if (req_mode != CCNOmode +- && (req_mode != CCmode +- || XEXP (SET_SRC (set), 1) != const0_rtx)) +- return false; +- break; +- case E_CCmode: +- if (req_mode == CCGCmode) +- return false; +- /* FALLTHRU */ +- case E_CCGCmode: +- if (req_mode == CCGOCmode || req_mode == CCNOmode) +- return false; +- /* FALLTHRU */ +- case E_CCGOCmode: +- if (req_mode == CCZmode) +- return false; +- /* FALLTHRU */ +- case E_CCZmode: +- break; +- +- case E_CCGZmode: +- +- case E_CCAmode: +- case E_CCCmode: +- case E_CCOmode: +- case E_CCPmode: +- case E_CCSmode: +- if (set_mode != req_mode) +- return false; +- break; +- +- default: +- gcc_unreachable (); +- } +- +- return GET_MODE (SET_SRC (set)) == set_mode; +-} +- +-/* Generate insn patterns to do an integer compare of OPERANDS. */ +- +-static rtx +-ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1) +-{ +- machine_mode cmpmode; +- rtx tmp, flags; +- +- cmpmode = SELECT_CC_MODE (code, op0, op1); +- flags = gen_rtx_REG (cmpmode, FLAGS_REG); +- +- /* This is very simple, but making the interface the same as in the +- FP case makes the rest of the code easier. */ +- tmp = gen_rtx_COMPARE (cmpmode, op0, op1); +- emit_insn (gen_rtx_SET (flags, tmp)); +- +- /* Return the test that should be put into the flags user, i.e. +- the bcc, scc, or cmov instruction. */ +- return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx); +-} ++ case IX86_BUILTIN_BEXTR32: ++ case IX86_BUILTIN_BEXTR64: ++ case IX86_BUILTIN_BEXTRI32: ++ case IX86_BUILTIN_BEXTRI64: ++ gcc_assert (n_args == 2); ++ if (tree_fits_uhwi_p (args[1])) ++ { ++ unsigned HOST_WIDE_INT res = 0; ++ unsigned int prec = TYPE_PRECISION (TREE_TYPE (args[0])); ++ unsigned int start = tree_to_uhwi (args[1]); ++ unsigned int len = (start & 0xff00) >> 8; ++ start &= 0xff; ++ if (start >= prec || len == 0) ++ res = 0; ++ else if (!tree_fits_uhwi_p (args[0])) ++ break; ++ else ++ res = tree_to_uhwi (args[0]) >> start; ++ if (len > prec) ++ len = prec; ++ if (len < HOST_BITS_PER_WIDE_INT) ++ res &= (HOST_WIDE_INT_1U << len) - 1; ++ return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res); ++ } ++ break; + +-/* Figure out whether to use unordered fp comparisons. 
*/ ++ case IX86_BUILTIN_BZHI32: ++ case IX86_BUILTIN_BZHI64: ++ gcc_assert (n_args == 2); ++ if (tree_fits_uhwi_p (args[1])) ++ { ++ unsigned int idx = tree_to_uhwi (args[1]) & 0xff; ++ if (idx >= TYPE_PRECISION (TREE_TYPE (args[0]))) ++ return args[0]; ++ if (idx == 0) ++ return build_int_cst (TREE_TYPE (TREE_TYPE (fndecl)), 0); ++ if (!tree_fits_uhwi_p (args[0])) ++ break; ++ unsigned HOST_WIDE_INT res = tree_to_uhwi (args[0]); ++ res &= ~(HOST_WIDE_INT_M1U << idx); ++ return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res); ++ } ++ break; + +-static bool +-ix86_unordered_fp_compare (enum rtx_code code) +-{ +- if (!TARGET_IEEE_FP) +- return false; ++ case IX86_BUILTIN_PDEP32: ++ case IX86_BUILTIN_PDEP64: ++ gcc_assert (n_args == 2); ++ if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1])) ++ { ++ unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]); ++ unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]); ++ unsigned HOST_WIDE_INT res = 0; ++ unsigned HOST_WIDE_INT m, k = 1; ++ for (m = 1; m; m <<= 1) ++ if ((mask & m) != 0) ++ { ++ if ((src & k) != 0) ++ res |= m; ++ k <<= 1; ++ } ++ return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res); ++ } ++ break; + +- switch (code) +- { +- case GT: +- case GE: +- case LT: +- case LE: +- return false; ++ case IX86_BUILTIN_PEXT32: ++ case IX86_BUILTIN_PEXT64: ++ gcc_assert (n_args == 2); ++ if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1])) ++ { ++ unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]); ++ unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]); ++ unsigned HOST_WIDE_INT res = 0; ++ unsigned HOST_WIDE_INT m, k = 1; ++ for (m = 1; m; m <<= 1) ++ if ((mask & m) != 0) ++ { ++ if ((src & m) != 0) ++ res |= k; ++ k <<= 1; ++ } ++ return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res); ++ } ++ break; + +- case EQ: +- case NE: ++ case IX86_BUILTIN_MOVMSKPS: ++ case IX86_BUILTIN_PMOVMSKB: ++ case IX86_BUILTIN_MOVMSKPD: ++ case IX86_BUILTIN_PMOVMSKB128: ++ case IX86_BUILTIN_MOVMSKPD256: ++ case IX86_BUILTIN_MOVMSKPS256: ++ case IX86_BUILTIN_PMOVMSKB256: ++ gcc_assert (n_args == 1); ++ if (TREE_CODE (args[0]) == VECTOR_CST) ++ { ++ HOST_WIDE_INT res = 0; ++ for (unsigned i = 0; i < VECTOR_CST_NELTS (args[0]); ++i) ++ { ++ tree e = VECTOR_CST_ELT (args[0], i); ++ if (TREE_CODE (e) == INTEGER_CST && !TREE_OVERFLOW (e)) ++ { ++ if (wi::neg_p (wi::to_wide (e))) ++ res |= HOST_WIDE_INT_1 << i; ++ } ++ else if (TREE_CODE (e) == REAL_CST && !TREE_OVERFLOW (e)) ++ { ++ if (TREE_REAL_CST (e).sign) ++ res |= HOST_WIDE_INT_1 << i; ++ } ++ else ++ return NULL_TREE; ++ } ++ return build_int_cst (TREE_TYPE (TREE_TYPE (fndecl)), res); ++ } ++ break; + +- case LTGT: +- case UNORDERED: +- case ORDERED: +- case UNLT: +- case UNLE: +- case UNGT: +- case UNGE: +- case UNEQ: +- return true; +- +- default: +- gcc_unreachable (); +- } +-} +- +-machine_mode +-ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1) +-{ +- machine_mode mode = GET_MODE (op0); +- +- if (SCALAR_FLOAT_MODE_P (mode)) +- { +- gcc_assert (!DECIMAL_FLOAT_MODE_P (mode)); +- return CCFPmode; +- } +- +- switch (code) +- { +- /* Only zero flag is needed. */ +- case EQ: /* ZF=0 */ +- case NE: /* ZF!=0 */ +- return CCZmode; +- /* Codes needing carry flag. */ +- case GEU: /* CF=0 */ +- case LTU: /* CF=1 */ +- /* Detect overflow checks. They need just the carry flag. 
*/ +- if (GET_CODE (op0) == PLUS +- && (rtx_equal_p (op1, XEXP (op0, 0)) +- || rtx_equal_p (op1, XEXP (op0, 1)))) +- return CCCmode; +- else +- return CCmode; +- case GTU: /* CF=0 & ZF=0 */ +- case LEU: /* CF=1 | ZF=1 */ +- return CCmode; +- /* Codes possibly doable only with sign flag when +- comparing against zero. */ +- case GE: /* SF=OF or SF=0 */ +- case LT: /* SF<>OF or SF=1 */ +- if (op1 == const0_rtx) +- return CCGOCmode; +- else +- /* For other cases Carry flag is not required. */ +- return CCGCmode; +- /* Codes doable only with sign flag when comparing +- against zero, but we miss jump instruction for it +- so we need to use relational tests against overflow +- that thus needs to be zero. */ +- case GT: /* ZF=0 & SF=OF */ +- case LE: /* ZF=1 | SF<>OF */ +- if (op1 == const0_rtx) +- return CCNOmode; +- else +- return CCGCmode; +- /* strcmp pattern do (use flags) and combine may ask us for proper +- mode. */ +- case USE: +- return CCmode; +- default: +- gcc_unreachable (); +- } +-} +- +-/* Return the fixed registers used for condition codes. */ +- +-static bool +-ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2) +-{ +- *p1 = FLAGS_REG; +- *p2 = INVALID_REGNUM; +- return true; +-} +- +-/* If two condition code modes are compatible, return a condition code +- mode which is compatible with both. Otherwise, return +- VOIDmode. */ +- +-static machine_mode +-ix86_cc_modes_compatible (machine_mode m1, machine_mode m2) +-{ +- if (m1 == m2) +- return m1; +- +- if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC) +- return VOIDmode; +- +- if ((m1 == CCGCmode && m2 == CCGOCmode) +- || (m1 == CCGOCmode && m2 == CCGCmode)) +- return CCGCmode; +- +- if ((m1 == CCNOmode && m2 == CCGOCmode) +- || (m1 == CCGOCmode && m2 == CCNOmode)) +- return CCNOmode; +- +- if (m1 == CCZmode +- && (m2 == CCGCmode || m2 == CCGOCmode || m2 == CCNOmode)) +- return m2; +- else if (m2 == CCZmode +- && (m1 == CCGCmode || m1 == CCGOCmode || m1 == CCNOmode)) +- return m1; +- +- switch (m1) +- { +- default: +- gcc_unreachable (); +- +- case E_CCmode: +- case E_CCGCmode: +- case E_CCGOCmode: +- case E_CCNOmode: +- case E_CCAmode: +- case E_CCCmode: +- case E_CCOmode: +- case E_CCPmode: +- case E_CCSmode: +- case E_CCZmode: +- switch (m2) +- { +- default: +- return VOIDmode; +- +- case E_CCmode: +- case E_CCGCmode: +- case E_CCGOCmode: +- case E_CCNOmode: +- case E_CCAmode: +- case E_CCCmode: +- case E_CCOmode: +- case E_CCPmode: +- case E_CCSmode: +- case E_CCZmode: +- return CCmode; +- } +- +- case E_CCFPmode: +- /* These are only compatible with themselves, which we already +- checked above. */ +- return VOIDmode; +- } +-} +- +- +-/* Return a comparison we can do and that it is equivalent to +- swap_condition (code) apart possibly from orderedness. +- But, never change orderedness if TARGET_IEEE_FP, returning +- UNKNOWN in that case if necessary. */ +- +-static enum rtx_code +-ix86_fp_swap_condition (enum rtx_code code) +-{ +- switch (code) +- { +- case GT: /* GTU - CF=0 & ZF=0 */ +- return TARGET_IEEE_FP ? UNKNOWN : UNLT; +- case GE: /* GEU - CF=0 */ +- return TARGET_IEEE_FP ? UNKNOWN : UNLE; +- case UNLT: /* LTU - CF=1 */ +- return TARGET_IEEE_FP ? UNKNOWN : GT; +- case UNLE: /* LEU - CF=1 | ZF=1 */ +- return TARGET_IEEE_FP ? UNKNOWN : GE; +- default: +- return swap_condition (code); +- } +-} +- +-/* Return cost of comparison CODE using the best strategy for performance. +- All following functions do use number of instructions as a cost metrics. 
+- In future this should be tweaked to compute bytes for optimize_size and +- take into account performance of various instructions on various CPUs. */ +- +-static int +-ix86_fp_comparison_cost (enum rtx_code code) +-{ +- int arith_cost; +- +- /* The cost of code using bit-twiddling on %ah. */ +- switch (code) +- { +- case UNLE: +- case UNLT: +- case LTGT: +- case GT: +- case GE: +- case UNORDERED: +- case ORDERED: +- case UNEQ: +- arith_cost = 4; +- break; +- case LT: +- case NE: +- case EQ: +- case UNGE: +- arith_cost = TARGET_IEEE_FP ? 5 : 4; +- break; +- case LE: +- case UNGT: +- arith_cost = TARGET_IEEE_FP ? 6 : 4; +- break; +- default: +- gcc_unreachable (); +- } +- +- switch (ix86_fp_comparison_strategy (code)) +- { +- case IX86_FPCMP_COMI: +- return arith_cost > 4 ? 3 : 2; +- case IX86_FPCMP_SAHF: +- return arith_cost > 4 ? 4 : 3; +- default: +- return arith_cost; +- } +-} +- +-/* Return strategy to use for floating-point. We assume that fcomi is always +- preferrable where available, since that is also true when looking at size +- (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */ +- +-enum ix86_fpcmp_strategy +-ix86_fp_comparison_strategy (enum rtx_code) +-{ +- /* Do fcomi/sahf based test when profitable. */ +- +- if (TARGET_CMOVE) +- return IX86_FPCMP_COMI; +- +- if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ())) +- return IX86_FPCMP_SAHF; +- +- return IX86_FPCMP_ARITH; +-} +- +-/* Swap, force into registers, or otherwise massage the two operands +- to a fp comparison. The operands are updated in place; the new +- comparison code is returned. */ +- +-static enum rtx_code +-ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1) +-{ +- bool unordered_compare = ix86_unordered_fp_compare (code); +- rtx op0 = *pop0, op1 = *pop1; +- machine_mode op_mode = GET_MODE (op0); +- bool is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode); +- +- /* All of the unordered compare instructions only work on registers. +- The same is true of the fcomi compare instructions. The XFmode +- compare instructions require registers except when comparing +- against zero or when converting operand 1 from fixed point to +- floating point. */ +- +- if (!is_sse +- && (unordered_compare +- || (op_mode == XFmode +- && ! (standard_80387_constant_p (op0) == 1 +- || standard_80387_constant_p (op1) == 1) +- && GET_CODE (op1) != FLOAT) +- || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI)) +- { +- op0 = force_reg (op_mode, op0); +- op1 = force_reg (op_mode, op1); +- } +- else +- { +- /* %%% We only allow op1 in memory; op0 must be st(0). So swap +- things around if they appear profitable, otherwise force op0 +- into a register. */ +- +- if (standard_80387_constant_p (op0) == 0 +- || (MEM_P (op0) +- && ! (standard_80387_constant_p (op1) == 0 +- || MEM_P (op1)))) +- { +- enum rtx_code new_code = ix86_fp_swap_condition (code); +- if (new_code != UNKNOWN) +- { +- std::swap (op0, op1); +- code = new_code; +- } +- } +- +- if (!REG_P (op0)) +- op0 = force_reg (op_mode, op0); +- +- if (CONSTANT_P (op1)) +- { +- int tmp = standard_80387_constant_p (op1); +- if (tmp == 0) +- op1 = validize_mem (force_const_mem (op_mode, op1)); +- else if (tmp == 1) +- { +- if (TARGET_CMOVE) +- op1 = force_reg (op_mode, op1); +- } +- else +- op1 = force_reg (op_mode, op1); +- } +- } +- +- /* Try to rearrange the comparison to make it cheaper. 
*/ +- if (ix86_fp_comparison_cost (code) +- > ix86_fp_comparison_cost (swap_condition (code)) +- && (REG_P (op1) || can_create_pseudo_p ())) +- { +- std::swap (op0, op1); +- code = swap_condition (code); +- if (!REG_P (op0)) +- op0 = force_reg (op_mode, op0); +- } +- +- *pop0 = op0; +- *pop1 = op1; +- return code; +-} +- +-/* Convert comparison codes we use to represent FP comparison to integer +- code that will result in proper branch. Return UNKNOWN if no such code +- is available. */ +- +-enum rtx_code +-ix86_fp_compare_code_to_integer (enum rtx_code code) +-{ +- switch (code) +- { +- case GT: +- return GTU; +- case GE: +- return GEU; +- case ORDERED: +- case UNORDERED: +- return code; +- case UNEQ: +- return EQ; +- case UNLT: +- return LTU; +- case UNLE: +- return LEU; +- case LTGT: +- return NE; +- default: +- return UNKNOWN; +- } +-} +- +-/* Generate insn patterns to do a floating point compare of OPERANDS. */ +- +-static rtx +-ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1) +-{ +- bool unordered_compare = ix86_unordered_fp_compare (code); +- machine_mode cmp_mode; +- rtx tmp, scratch; +- +- code = ix86_prepare_fp_compare_args (code, &op0, &op1); +- +- tmp = gen_rtx_COMPARE (CCFPmode, op0, op1); +- if (unordered_compare) +- tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP); +- +- /* Do fcomi/sahf based test when profitable. */ +- switch (ix86_fp_comparison_strategy (code)) +- { +- case IX86_FPCMP_COMI: +- cmp_mode = CCFPmode; +- emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp)); +- break; +- +- case IX86_FPCMP_SAHF: +- cmp_mode = CCFPmode; +- tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW); +- scratch = gen_reg_rtx (HImode); +- emit_insn (gen_rtx_SET (scratch, tmp)); +- emit_insn (gen_x86_sahf_1 (scratch)); +- break; +- +- case IX86_FPCMP_ARITH: +- cmp_mode = CCNOmode; +- tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW); +- scratch = gen_reg_rtx (HImode); +- emit_insn (gen_rtx_SET (scratch, tmp)); +- +- /* In the unordered case, we have to check C2 for NaN's, which +- doesn't happen to work out to anything nice combination-wise. +- So do some bit twiddling on the value we've got in AH to come +- up with an appropriate set of condition codes. 
*/ +- +- switch (code) +- { +- case GT: +- case UNGT: +- if (code == GT || !TARGET_IEEE_FP) +- { +- emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45))); +- code = EQ; +- } +- else +- { +- emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45))); +- emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx)); +- emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44))); +- cmp_mode = CCmode; +- code = GEU; +- } +- break; +- case LT: +- case UNLT: +- if (code == LT && TARGET_IEEE_FP) +- { +- emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45))); +- emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx)); +- cmp_mode = CCmode; +- code = EQ; +- } +- else +- { +- emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx)); +- code = NE; +- } +- break; +- case GE: +- case UNGE: +- if (code == GE || !TARGET_IEEE_FP) +- { +- emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05))); +- code = EQ; +- } +- else +- { +- emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45))); +- emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx)); +- code = NE; +- } +- break; +- case LE: +- case UNLE: +- if (code == LE && TARGET_IEEE_FP) +- { +- emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45))); +- emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx)); +- emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40))); +- cmp_mode = CCmode; +- code = LTU; +- } +- else +- { +- emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45))); +- code = NE; +- } +- break; +- case EQ: +- case UNEQ: +- if (code == EQ && TARGET_IEEE_FP) +- { +- emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45))); +- emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40))); +- cmp_mode = CCmode; +- code = EQ; +- } +- else +- { +- emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40))); +- code = NE; +- } +- break; +- case NE: +- case LTGT: +- if (code == NE && TARGET_IEEE_FP) +- { +- emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45))); +- emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, +- GEN_INT (0x40))); +- code = NE; +- } +- else +- { +- emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40))); +- code = EQ; +- } +- break; +- +- case UNORDERED: +- emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04))); +- code = NE; +- break; +- case ORDERED: +- emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04))); +- code = EQ; +- break; +- +- default: +- gcc_unreachable (); +- } +- break; +- +- default: +- gcc_unreachable(); +- } +- +- /* Return the test that should be put into the flags user, i.e. +- the bcc, scc, or cmov instruction. */ +- return gen_rtx_fmt_ee (code, VOIDmode, +- gen_rtx_REG (cmp_mode, FLAGS_REG), +- const0_rtx); +-} +- +-static rtx +-ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1) +-{ +- rtx ret; +- +- if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC) +- ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1); +- +- else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0))) +- { +- gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0))); +- ret = ix86_expand_fp_compare (code, op0, op1); +- } +- else +- ret = ix86_expand_int_compare (code, op0, op1); +- +- return ret; +-} +- +-void +-ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label) +-{ +- machine_mode mode = GET_MODE (op0); +- rtx tmp; +- +- /* Handle special case - vector comparsion with boolean result, transform +- it using ptest instruction. */ +- if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) +- { +- rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG); +- machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? 
V4DImode : V2DImode; +- +- gcc_assert (code == EQ || code == NE); +- /* Generate XOR since we can't check that one operand is zero vector. */ +- tmp = gen_reg_rtx (mode); +- emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1))); +- tmp = gen_lowpart (p_mode, tmp); +- emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG), +- gen_rtx_UNSPEC (CCmode, +- gen_rtvec (2, tmp, tmp), +- UNSPEC_PTEST))); +- tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx); +- tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, +- gen_rtx_LABEL_REF (VOIDmode, label), +- pc_rtx); +- emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); +- return; +- } +- +- switch (mode) +- { +- case E_SFmode: +- case E_DFmode: +- case E_XFmode: +- case E_QImode: +- case E_HImode: +- case E_SImode: +- simple: +- tmp = ix86_expand_compare (code, op0, op1); +- tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, +- gen_rtx_LABEL_REF (VOIDmode, label), +- pc_rtx); +- emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); +- return; +- +- case E_DImode: +- if (TARGET_64BIT) +- goto simple; +- /* For 32-bit target DI comparison may be performed on +- SSE registers. To allow this we should avoid split +- to SI mode which is achieved by doing xor in DI mode +- and then comparing with zero (which is recognized by +- STV pass). We don't compare using xor when optimizing +- for size. */ +- if (!optimize_insn_for_size_p () +- && TARGET_STV +- && (code == EQ || code == NE)) +- { +- op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1)); +- op1 = const0_rtx; +- } +- /* FALLTHRU */ +- case E_TImode: +- /* Expand DImode branch into multiple compare+branch. */ +- { +- rtx lo[2], hi[2]; +- rtx_code_label *label2; +- enum rtx_code code1, code2, code3; +- machine_mode submode; +- +- if (CONSTANT_P (op0) && !CONSTANT_P (op1)) +- { +- std::swap (op0, op1); +- code = swap_condition (code); +- } +- +- split_double_mode (mode, &op0, 1, lo+0, hi+0); +- split_double_mode (mode, &op1, 1, lo+1, hi+1); +- +- submode = mode == DImode ? SImode : DImode; +- +- /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to +- avoid two branches. This costs one extra insn, so disable when +- optimizing for size. */ +- +- if ((code == EQ || code == NE) +- && (!optimize_insn_for_size_p () +- || hi[1] == const0_rtx || lo[1] == const0_rtx)) +- { +- rtx xor0, xor1; +- +- xor1 = hi[0]; +- if (hi[1] != const0_rtx) +- xor1 = expand_binop (submode, xor_optab, xor1, hi[1], +- NULL_RTX, 0, OPTAB_WIDEN); +- +- xor0 = lo[0]; +- if (lo[1] != const0_rtx) +- xor0 = expand_binop (submode, xor_optab, xor0, lo[1], +- NULL_RTX, 0, OPTAB_WIDEN); +- +- tmp = expand_binop (submode, ior_optab, xor1, xor0, +- NULL_RTX, 0, OPTAB_WIDEN); +- +- ix86_expand_branch (code, tmp, const0_rtx, label); +- return; +- } +- +- /* Otherwise, if we are doing less-than or greater-or-equal-than, +- op1 is a constant and the low word is zero, then we can just +- examine the high word. Similarly for low word -1 and +- less-or-equal-than or greater-than. */ +- +- if (CONST_INT_P (hi[1])) +- switch (code) +- { +- case LT: case LTU: case GE: case GEU: +- if (lo[1] == const0_rtx) +- { +- ix86_expand_branch (code, hi[0], hi[1], label); +- return; +- } +- break; +- case LE: case LEU: case GT: case GTU: +- if (lo[1] == constm1_rtx) +- { +- ix86_expand_branch (code, hi[0], hi[1], label); +- return; +- } +- break; +- default: +- break; +- } +- +- /* Emulate comparisons that do not depend on Zero flag with +- double-word subtraction. 
Note that only Overflow, Sign +- and Carry flags are valid, so swap arguments and condition +- of comparisons that would otherwise test Zero flag. */ +- +- switch (code) +- { +- case LE: case LEU: case GT: case GTU: +- std::swap (lo[0], lo[1]); +- std::swap (hi[0], hi[1]); +- code = swap_condition (code); +- /* FALLTHRU */ +- +- case LT: case LTU: case GE: case GEU: +- { +- rtx (*cmp_insn) (rtx, rtx); +- rtx (*sbb_insn) (rtx, rtx, rtx); +- bool uns = (code == LTU || code == GEU); +- +- if (TARGET_64BIT) +- { +- cmp_insn = gen_cmpdi_1; +- sbb_insn +- = uns ? gen_subdi3_carry_ccc : gen_subdi3_carry_ccgz; +- } +- else +- { +- cmp_insn = gen_cmpsi_1; +- sbb_insn +- = uns ? gen_subsi3_carry_ccc : gen_subsi3_carry_ccgz; +- } +- +- if (!nonimmediate_operand (lo[0], submode)) +- lo[0] = force_reg (submode, lo[0]); +- if (!x86_64_general_operand (lo[1], submode)) +- lo[1] = force_reg (submode, lo[1]); +- +- if (!register_operand (hi[0], submode)) +- hi[0] = force_reg (submode, hi[0]); +- if ((uns && !nonimmediate_operand (hi[1], submode)) +- || (!uns && !x86_64_general_operand (hi[1], submode))) +- hi[1] = force_reg (submode, hi[1]); +- +- emit_insn (cmp_insn (lo[0], lo[1])); +- emit_insn (sbb_insn (gen_rtx_SCRATCH (submode), hi[0], hi[1])); +- +- tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG); +- +- ix86_expand_branch (code, tmp, const0_rtx, label); +- return; +- } +- +- default: +- break; +- } +- +- /* Otherwise, we need two or three jumps. */ +- +- label2 = gen_label_rtx (); +- +- code1 = code; +- code2 = swap_condition (code); +- code3 = unsigned_condition (code); +- +- switch (code) +- { +- case LT: case GT: case LTU: case GTU: +- break; +- +- case LE: code1 = LT; code2 = GT; break; +- case GE: code1 = GT; code2 = LT; break; +- case LEU: code1 = LTU; code2 = GTU; break; +- case GEU: code1 = GTU; code2 = LTU; break; +- +- case EQ: code1 = UNKNOWN; code2 = NE; break; +- case NE: code2 = UNKNOWN; break; +- +- default: +- gcc_unreachable (); +- } +- +- /* +- * a < b => +- * if (hi(a) < hi(b)) goto true; +- * if (hi(a) > hi(b)) goto false; +- * if (lo(a) < lo(b)) goto true; +- * false: +- */ +- +- if (code1 != UNKNOWN) +- ix86_expand_branch (code1, hi[0], hi[1], label); +- if (code2 != UNKNOWN) +- ix86_expand_branch (code2, hi[0], hi[1], label2); +- +- ix86_expand_branch (code3, lo[0], lo[1], label); +- +- if (code2 != UNKNOWN) +- emit_label (label2); +- return; +- } +- +- default: +- gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC); +- goto simple; +- } +-} +- +-void +-ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1) +-{ +- rtx ret; +- +- gcc_assert (GET_MODE (dest) == QImode); +- +- ret = ix86_expand_compare (code, op0, op1); +- PUT_MODE (ret, QImode); +- emit_insn (gen_rtx_SET (dest, ret)); +-} +- +-/* Expand comparison setting or clearing carry flag. Return true when +- successful and set pop for the operation. */ +-static bool +-ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop) +-{ +- machine_mode mode +- = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1); +- +- /* Do not handle double-mode compares that go through special path. */ +- if (mode == (TARGET_64BIT ? TImode : DImode)) +- return false; +- +- if (SCALAR_FLOAT_MODE_P (mode)) +- { +- rtx compare_op; +- rtx_insn *compare_seq; +- +- gcc_assert (!DECIMAL_FLOAT_MODE_P (mode)); +- +- /* Shortcut: following common codes never translate +- into carry flag compares. 
*/ +- if (code == EQ || code == NE || code == UNEQ || code == LTGT +- || code == ORDERED || code == UNORDERED) +- return false; +- +- /* These comparisons require zero flag; swap operands so they won't. */ +- if ((code == GT || code == UNLE || code == LE || code == UNGT) +- && !TARGET_IEEE_FP) +- { +- std::swap (op0, op1); +- code = swap_condition (code); +- } +- +- /* Try to expand the comparison and verify that we end up with +- carry flag based comparison. This fails to be true only when +- we decide to expand comparison using arithmetic that is not +- too common scenario. */ +- start_sequence (); +- compare_op = ix86_expand_fp_compare (code, op0, op1); +- compare_seq = get_insns (); +- end_sequence (); +- +- if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode) +- code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op)); +- else +- code = GET_CODE (compare_op); +- +- if (code != LTU && code != GEU) +- return false; +- +- emit_insn (compare_seq); +- *pop = compare_op; +- return true; +- } +- +- if (!INTEGRAL_MODE_P (mode)) +- return false; +- +- switch (code) +- { +- case LTU: +- case GEU: +- break; +- +- /* Convert a==0 into (unsigned)a<1. */ +- case EQ: +- case NE: +- if (op1 != const0_rtx) +- return false; +- op1 = const1_rtx; +- code = (code == EQ ? LTU : GEU); +- break; +- +- /* Convert a>b into b=b-1. */ +- case GTU: +- case LEU: +- if (CONST_INT_P (op1)) +- { +- op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0)); +- /* Bail out on overflow. We still can swap operands but that +- would force loading of the constant into register. */ +- if (op1 == const0_rtx +- || !x86_64_immediate_operand (op1, GET_MODE (op1))) +- return false; +- code = (code == GTU ? GEU : LTU); +- } +- else +- { +- std::swap (op0, op1); +- code = (code == GTU ? LTU : GEU); +- } +- break; +- +- /* Convert a>=0 into (unsigned)a<0x80000000. */ +- case LT: +- case GE: +- if (mode == DImode || op1 != const0_rtx) +- return false; +- op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode); +- code = (code == LT ? GEU : LTU); +- break; +- case LE: +- case GT: +- if (mode == DImode || op1 != constm1_rtx) +- return false; +- op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode); +- code = (code == LE ? GEU : LTU); +- break; +- +- default: +- return false; +- } +- /* Swapping operands may cause constant to appear as first operand. */ +- if (!nonimmediate_operand (op0, VOIDmode)) +- { +- if (!can_create_pseudo_p ()) +- return false; +- op0 = force_reg (mode, op0); +- } +- *pop = ix86_expand_compare (code, op0, op1); +- gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU); +- return true; +-} +- +-bool +-ix86_expand_int_movcc (rtx operands[]) +-{ +- enum rtx_code code = GET_CODE (operands[1]), compare_code; +- rtx_insn *compare_seq; +- rtx compare_op; +- machine_mode mode = GET_MODE (operands[0]); +- bool sign_bit_compare_p = false; +- rtx op0 = XEXP (operands[1], 0); +- rtx op1 = XEXP (operands[1], 1); +- +- if (GET_MODE (op0) == TImode +- || (GET_MODE (op0) == DImode +- && !TARGET_64BIT)) +- return false; +- +- start_sequence (); +- compare_op = ix86_expand_compare (code, op0, op1); +- compare_seq = get_insns (); +- end_sequence (); +- +- compare_code = GET_CODE (compare_op); +- +- if ((op1 == const0_rtx && (code == GE || code == LT)) +- || (op1 == constm1_rtx && (code == GT || code == LE))) +- sign_bit_compare_p = true; +- +- /* Don't attempt mode expansion here -- if we had to expand 5 or 6 +- HImode insns, we'd be swallowed in word prefix ops. 
*/ +- +- if ((mode != HImode || TARGET_FAST_PREFIX) +- && (mode != (TARGET_64BIT ? TImode : DImode)) +- && CONST_INT_P (operands[2]) +- && CONST_INT_P (operands[3])) +- { +- rtx out = operands[0]; +- HOST_WIDE_INT ct = INTVAL (operands[2]); +- HOST_WIDE_INT cf = INTVAL (operands[3]); +- HOST_WIDE_INT diff; +- +- diff = ct - cf; +- /* Sign bit compares are better done using shifts than we do by using +- sbb. */ +- if (sign_bit_compare_p +- || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op)) +- { +- /* Detect overlap between destination and compare sources. */ +- rtx tmp = out; +- +- if (!sign_bit_compare_p) +- { +- rtx flags; +- bool fpcmp = false; +- +- compare_code = GET_CODE (compare_op); +- +- flags = XEXP (compare_op, 0); +- +- if (GET_MODE (flags) == CCFPmode) +- { +- fpcmp = true; +- compare_code +- = ix86_fp_compare_code_to_integer (compare_code); +- } +- +- /* To simplify rest of code, restrict to the GEU case. */ +- if (compare_code == LTU) +- { +- std::swap (ct, cf); +- compare_code = reverse_condition (compare_code); +- code = reverse_condition (code); +- } +- else +- { +- if (fpcmp) +- PUT_CODE (compare_op, +- reverse_condition_maybe_unordered +- (GET_CODE (compare_op))); +- else +- PUT_CODE (compare_op, +- reverse_condition (GET_CODE (compare_op))); +- } +- diff = ct - cf; +- +- if (reg_overlap_mentioned_p (out, op0) +- || reg_overlap_mentioned_p (out, op1)) +- tmp = gen_reg_rtx (mode); +- +- if (mode == DImode) +- emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op)); +- else +- emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp), +- flags, compare_op)); +- } +- else +- { +- if (code == GT || code == GE) +- code = reverse_condition (code); +- else +- { +- std::swap (ct, cf); +- diff = ct - cf; +- } +- tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1); +- } +- +- if (diff == 1) +- { +- /* +- * cmpl op0,op1 +- * sbbl dest,dest +- * [addl dest, ct] +- * +- * Size 5 - 8. +- */ +- if (ct) +- tmp = expand_simple_binop (mode, PLUS, +- tmp, GEN_INT (ct), +- copy_rtx (tmp), 1, OPTAB_DIRECT); +- } +- else if (cf == -1) +- { +- /* +- * cmpl op0,op1 +- * sbbl dest,dest +- * orl $ct, dest +- * +- * Size 8. +- */ +- tmp = expand_simple_binop (mode, IOR, +- tmp, GEN_INT (ct), +- copy_rtx (tmp), 1, OPTAB_DIRECT); +- } +- else if (diff == -1 && ct) +- { +- /* +- * cmpl op0,op1 +- * sbbl dest,dest +- * notl dest +- * [addl dest, cf] +- * +- * Size 8 - 11. +- */ +- tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1); +- if (cf) +- tmp = expand_simple_binop (mode, PLUS, +- copy_rtx (tmp), GEN_INT (cf), +- copy_rtx (tmp), 1, OPTAB_DIRECT); +- } +- else +- { +- /* +- * cmpl op0,op1 +- * sbbl dest,dest +- * [notl dest] +- * andl cf - ct, dest +- * [addl dest, ct] +- * +- * Size 8 - 11. 
+- */ +- +- if (cf == 0) +- { +- cf = ct; +- ct = 0; +- tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1); +- } +- +- tmp = expand_simple_binop (mode, AND, +- copy_rtx (tmp), +- gen_int_mode (cf - ct, mode), +- copy_rtx (tmp), 1, OPTAB_DIRECT); +- if (ct) +- tmp = expand_simple_binop (mode, PLUS, +- copy_rtx (tmp), GEN_INT (ct), +- copy_rtx (tmp), 1, OPTAB_DIRECT); +- } +- +- if (!rtx_equal_p (tmp, out)) +- emit_move_insn (copy_rtx (out), copy_rtx (tmp)); +- +- return true; +- } +- +- if (diff < 0) +- { +- machine_mode cmp_mode = GET_MODE (op0); +- enum rtx_code new_code; +- +- if (SCALAR_FLOAT_MODE_P (cmp_mode)) +- { +- gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode)); +- +- /* We may be reversing unordered compare to normal compare, that +- is not valid in general (we may convert non-trapping condition +- to trapping one), however on i386 we currently emit all +- comparisons unordered. */ +- new_code = reverse_condition_maybe_unordered (code); +- } +- else +- new_code = ix86_reverse_condition (code, cmp_mode); +- if (new_code != UNKNOWN) +- { +- std::swap (ct, cf); +- diff = -diff; +- code = new_code; +- } +- } +- +- compare_code = UNKNOWN; +- if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT +- && CONST_INT_P (op1)) +- { +- if (op1 == const0_rtx +- && (code == LT || code == GE)) +- compare_code = code; +- else if (op1 == constm1_rtx) +- { +- if (code == LE) +- compare_code = LT; +- else if (code == GT) +- compare_code = GE; +- } +- } +- +- /* Optimize dest = (op0 < 0) ? -1 : cf. */ +- if (compare_code != UNKNOWN +- && GET_MODE (op0) == GET_MODE (out) +- && (cf == -1 || ct == -1)) +- { +- /* If lea code below could be used, only optimize +- if it results in a 2 insn sequence. */ +- +- if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8 +- || diff == 3 || diff == 5 || diff == 9) +- || (compare_code == LT && ct == -1) +- || (compare_code == GE && cf == -1)) +- { +- /* +- * notl op1 (if necessary) +- * sarl $31, op1 +- * orl cf, op1 +- */ +- if (ct != -1) +- { +- cf = ct; +- ct = -1; +- code = reverse_condition (code); +- } +- +- out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1); +- +- out = expand_simple_binop (mode, IOR, +- out, GEN_INT (cf), +- out, 1, OPTAB_DIRECT); +- if (out != operands[0]) +- emit_move_insn (operands[0], out); +- +- return true; +- } +- } +- +- +- if ((diff == 1 || diff == 2 || diff == 4 || diff == 8 +- || diff == 3 || diff == 5 || diff == 9) +- && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL) +- && (mode != DImode +- || x86_64_immediate_operand (GEN_INT (cf), VOIDmode))) +- { +- /* +- * xorl dest,dest +- * cmpl op1,op2 +- * setcc dest +- * lea cf(dest*(ct-cf)),dest +- * +- * Size 14. +- * +- * This also catches the degenerate setcc-only case. +- */ +- +- rtx tmp; +- int nops; +- +- out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1); +- +- nops = 0; +- /* On x86_64 the lea instruction operates on Pmode, so we need +- to get arithmetics done in proper mode to match. 
*/ +- if (diff == 1) +- tmp = copy_rtx (out); +- else +- { +- rtx out1; +- out1 = copy_rtx (out); +- tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1)); +- nops++; +- if (diff & 1) +- { +- tmp = gen_rtx_PLUS (mode, tmp, out1); +- nops++; +- } +- } +- if (cf != 0) +- { +- tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf)); +- nops++; +- } +- if (!rtx_equal_p (tmp, out)) +- { +- if (nops == 1) +- out = force_operand (tmp, copy_rtx (out)); +- else +- emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp))); +- } +- if (!rtx_equal_p (out, operands[0])) +- emit_move_insn (operands[0], copy_rtx (out)); +- +- return true; +- } +- +- /* +- * General case: Jumpful: +- * xorl dest,dest cmpl op1, op2 +- * cmpl op1, op2 movl ct, dest +- * setcc dest jcc 1f +- * decl dest movl cf, dest +- * andl (cf-ct),dest 1: +- * addl ct,dest +- * +- * Size 20. Size 14. +- * +- * This is reasonably steep, but branch mispredict costs are +- * high on modern cpus, so consider failing only if optimizing +- * for space. +- */ +- +- if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL)) +- && BRANCH_COST (optimize_insn_for_speed_p (), +- false) >= 2) +- { +- if (cf == 0) +- { +- machine_mode cmp_mode = GET_MODE (op0); +- enum rtx_code new_code; +- +- if (SCALAR_FLOAT_MODE_P (cmp_mode)) +- { +- gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode)); +- +- /* We may be reversing unordered compare to normal compare, +- that is not valid in general (we may convert non-trapping +- condition to trapping one), however on i386 we currently +- emit all comparisons unordered. */ +- new_code = reverse_condition_maybe_unordered (code); +- } +- else +- { +- new_code = ix86_reverse_condition (code, cmp_mode); +- if (compare_code != UNKNOWN && new_code != UNKNOWN) +- compare_code = reverse_condition (compare_code); +- } +- +- if (new_code != UNKNOWN) +- { +- cf = ct; +- ct = 0; +- code = new_code; +- } +- } +- +- if (compare_code != UNKNOWN) +- { +- /* notl op1 (if needed) +- sarl $31, op1 +- andl (cf-ct), op1 +- addl ct, op1 +- +- For x < 0 (resp. x <= -1) there will be no notl, +- so if possible swap the constants to get rid of the +- complement. +- True/false will be -1/0 while code below (store flag +- followed by decrement) is 0/-1, so the constants need +- to be exchanged once more. */ +- +- if (compare_code == GE || !cf) +- { +- code = reverse_condition (code); +- compare_code = LT; +- } +- else +- std::swap (ct, cf); +- +- out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1); +- } +- else +- { +- out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1); +- +- out = expand_simple_binop (mode, PLUS, copy_rtx (out), +- constm1_rtx, +- copy_rtx (out), 1, OPTAB_DIRECT); +- } +- +- out = expand_simple_binop (mode, AND, copy_rtx (out), +- gen_int_mode (cf - ct, mode), +- copy_rtx (out), 1, OPTAB_DIRECT); +- if (ct) +- out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct), +- copy_rtx (out), 1, OPTAB_DIRECT); +- if (!rtx_equal_p (out, operands[0])) +- emit_move_insn (operands[0], copy_rtx (out)); +- +- return true; +- } +- } +- +- if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL)) +- { +- /* Try a few things more with specific constants and a variable. */ +- +- optab op; +- rtx var, orig_out, out, tmp; +- +- if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2) +- return false; +- +- /* If one of the two operands is an interesting constant, load a +- constant with the above and mask it in with a logical operation. 
*/ +- +- if (CONST_INT_P (operands[2])) +- { +- var = operands[3]; +- if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx) +- operands[3] = constm1_rtx, op = and_optab; +- else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx) +- operands[3] = const0_rtx, op = ior_optab; +- else +- return false; +- } +- else if (CONST_INT_P (operands[3])) +- { +- var = operands[2]; +- if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx) +- operands[2] = constm1_rtx, op = and_optab; +- else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx) +- operands[2] = const0_rtx, op = ior_optab; +- else +- return false; +- } +- else +- return false; +- +- orig_out = operands[0]; +- tmp = gen_reg_rtx (mode); +- operands[0] = tmp; +- +- /* Recurse to get the constant loaded. */ +- if (!ix86_expand_int_movcc (operands)) +- return false; +- +- /* Mask in the interesting variable. */ +- out = expand_binop (mode, op, var, tmp, orig_out, 0, +- OPTAB_WIDEN); +- if (!rtx_equal_p (out, orig_out)) +- emit_move_insn (copy_rtx (orig_out), copy_rtx (out)); +- +- return true; +- } +- +- /* +- * For comparison with above, +- * +- * movl cf,dest +- * movl ct,tmp +- * cmpl op1,op2 +- * cmovcc tmp,dest +- * +- * Size 15. +- */ +- +- if (! nonimmediate_operand (operands[2], mode)) +- operands[2] = force_reg (mode, operands[2]); +- if (! nonimmediate_operand (operands[3], mode)) +- operands[3] = force_reg (mode, operands[3]); +- +- if (! register_operand (operands[2], VOIDmode) +- && (mode == QImode +- || ! register_operand (operands[3], VOIDmode))) +- operands[2] = force_reg (mode, operands[2]); +- +- if (mode == QImode +- && ! register_operand (operands[3], VOIDmode)) +- operands[3] = force_reg (mode, operands[3]); +- +- emit_insn (compare_seq); +- emit_insn (gen_rtx_SET (operands[0], +- gen_rtx_IF_THEN_ELSE (mode, +- compare_op, operands[2], +- operands[3]))); +- return true; +-} +- +-/* Swap, force into registers, or otherwise massage the two operands +- to an sse comparison with a mask result. Thus we differ a bit from +- ix86_prepare_fp_compare_args which expects to produce a flags result. +- +- The DEST operand exists to help determine whether to commute commutative +- operators. The POP0/POP1 operands are updated in place. The new +- comparison code is returned, or UNKNOWN if not implementable. */ +- +-static enum rtx_code +-ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code, +- rtx *pop0, rtx *pop1) +-{ +- switch (code) +- { +- case LTGT: +- case UNEQ: +- /* AVX supports all the needed comparisons. */ +- if (TARGET_AVX) +- break; +- /* We have no LTGT as an operator. We could implement it with +- NE & ORDERED, but this requires an extra temporary. It's +- not clear that it's worth it. */ +- return UNKNOWN; +- +- case LT: +- case LE: +- case UNGT: +- case UNGE: +- /* These are supported directly. */ +- break; +- +- case EQ: +- case NE: +- case UNORDERED: +- case ORDERED: +- /* AVX has 3 operand comparisons, no need to swap anything. */ +- if (TARGET_AVX) +- break; +- /* For commutative operators, try to canonicalize the destination +- operand to be first in the comparison - this helps reload to +- avoid extra moves. */ +- if (!dest || !rtx_equal_p (dest, *pop1)) +- break; +- /* FALLTHRU */ +- +- case GE: +- case GT: +- case UNLE: +- case UNLT: +- /* These are not supported directly before AVX, and furthermore +- ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the +- comparison operands to transform into something that is +- supported. 
*/ +- std::swap (*pop0, *pop1); +- code = swap_condition (code); +- break; +- +- default: +- gcc_unreachable (); +- } +- +- return code; +-} +- +-/* Detect conditional moves that exactly match min/max operational +- semantics. Note that this is IEEE safe, as long as we don't +- interchange the operands. +- +- Returns FALSE if this conditional move doesn't match a MIN/MAX, +- and TRUE if the operation is successful and instructions are emitted. */ +- +-static bool +-ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0, +- rtx cmp_op1, rtx if_true, rtx if_false) +-{ +- machine_mode mode; +- bool is_min; +- rtx tmp; +- +- if (code == LT) +- ; +- else if (code == UNGE) +- std::swap (if_true, if_false); +- else +- return false; +- +- if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false)) +- is_min = true; +- else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false)) +- is_min = false; +- else +- return false; +- +- mode = GET_MODE (dest); +- +- /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here, +- but MODE may be a vector mode and thus not appropriate. */ +- if (!flag_finite_math_only || flag_signed_zeros) +- { +- int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX; +- rtvec v; +- +- if_true = force_reg (mode, if_true); +- v = gen_rtvec (2, if_true, if_false); +- tmp = gen_rtx_UNSPEC (mode, v, u); +- } +- else +- { +- code = is_min ? SMIN : SMAX; +- if (MEM_P (if_true) && MEM_P (if_false)) +- if_true = force_reg (mode, if_true); +- tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false); +- } +- +- emit_insn (gen_rtx_SET (dest, tmp)); +- return true; +-} +- +-/* Expand an SSE comparison. Return the register with the result. */ +- +-static rtx +-ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1, +- rtx op_true, rtx op_false) +-{ +- machine_mode mode = GET_MODE (dest); +- machine_mode cmp_ops_mode = GET_MODE (cmp_op0); +- +- /* In general case result of comparison can differ from operands' type. */ +- machine_mode cmp_mode; +- +- /* In AVX512F the result of comparison is an integer mask. */ +- bool maskcmp = false; +- rtx x; +- +- if (GET_MODE_SIZE (cmp_ops_mode) == 64) +- { +- unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode); +- cmp_mode = int_mode_for_size (nbits, 0).require (); +- maskcmp = true; +- } +- else +- cmp_mode = cmp_ops_mode; +- +- cmp_op0 = force_reg (cmp_ops_mode, cmp_op0); +- +- int (*op1_predicate)(rtx, machine_mode) +- = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand; +- +- if (!op1_predicate (cmp_op1, cmp_ops_mode)) +- cmp_op1 = force_reg (cmp_ops_mode, cmp_op1); +- +- if (optimize +- || (maskcmp && cmp_mode != mode) +- || (op_true && reg_overlap_mentioned_p (dest, op_true)) +- || (op_false && reg_overlap_mentioned_p (dest, op_false))) +- dest = gen_reg_rtx (maskcmp ? cmp_mode : mode); +- +- /* Compare patterns for int modes are unspec in AVX512F only. */ +- if (maskcmp && (code == GT || code == EQ)) +- { +- rtx (*gen)(rtx, rtx, rtx); +- +- switch (cmp_ops_mode) +- { +- case E_V64QImode: +- gcc_assert (TARGET_AVX512BW); +- gen = code == GT ? gen_avx512bw_gtv64qi3 : gen_avx512bw_eqv64qi3_1; +- break; +- case E_V32HImode: +- gcc_assert (TARGET_AVX512BW); +- gen = code == GT ? gen_avx512bw_gtv32hi3 : gen_avx512bw_eqv32hi3_1; +- break; +- case E_V16SImode: +- gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1; +- break; +- case E_V8DImode: +- gen = code == GT ? 
gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1; +- break; +- default: +- gen = NULL; +- } +- +- if (gen) +- { +- emit_insn (gen (dest, cmp_op0, cmp_op1)); +- return dest; +- } +- } +- x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1); +- +- if (cmp_mode != mode && !maskcmp) +- { +- x = force_reg (cmp_ops_mode, x); +- convert_move (dest, x, false); +- } +- else +- emit_insn (gen_rtx_SET (dest, x)); +- +- return dest; +-} +- +-/* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical +- operations. This is used for both scalar and vector conditional moves. */ +- +-void +-ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) +-{ +- machine_mode mode = GET_MODE (dest); +- machine_mode cmpmode = GET_MODE (cmp); +- +- /* In AVX512F the result of comparison is an integer mask. */ +- bool maskcmp = (mode != cmpmode && TARGET_AVX512F); +- +- rtx t2, t3, x; +- +- /* If we have an integer mask and FP value then we need +- to cast mask to FP mode. */ +- if (mode != cmpmode && VECTOR_MODE_P (cmpmode)) +- { +- cmp = force_reg (cmpmode, cmp); +- cmp = gen_rtx_SUBREG (mode, cmp, 0); +- } +- +- if (maskcmp) +- { +- rtx (*gen) (rtx, rtx) = NULL; +- if ((op_true == CONST0_RTX (mode) +- && vector_all_ones_operand (op_false, mode)) +- || (op_false == CONST0_RTX (mode) +- && vector_all_ones_operand (op_true, mode))) +- switch (mode) +- { +- case E_V64QImode: +- if (TARGET_AVX512BW) +- gen = gen_avx512bw_cvtmask2bv64qi; +- break; +- case E_V32QImode: +- if (TARGET_AVX512VL && TARGET_AVX512BW) +- gen = gen_avx512vl_cvtmask2bv32qi; +- break; +- case E_V16QImode: +- if (TARGET_AVX512VL && TARGET_AVX512BW) +- gen = gen_avx512vl_cvtmask2bv16qi; +- break; +- case E_V32HImode: +- if (TARGET_AVX512BW) +- gen = gen_avx512bw_cvtmask2wv32hi; +- break; +- case E_V16HImode: +- if (TARGET_AVX512VL && TARGET_AVX512BW) +- gen = gen_avx512vl_cvtmask2wv16hi; +- break; +- case E_V8HImode: +- if (TARGET_AVX512VL && TARGET_AVX512BW) +- gen = gen_avx512vl_cvtmask2wv8hi; +- break; +- case E_V16SImode: +- if (TARGET_AVX512DQ) +- gen = gen_avx512f_cvtmask2dv16si; +- break; +- case E_V8SImode: +- if (TARGET_AVX512VL && TARGET_AVX512DQ) +- gen = gen_avx512vl_cvtmask2dv8si; +- break; +- case E_V4SImode: +- if (TARGET_AVX512VL && TARGET_AVX512DQ) +- gen = gen_avx512vl_cvtmask2dv4si; +- break; +- case E_V8DImode: +- if (TARGET_AVX512DQ) +- gen = gen_avx512f_cvtmask2qv8di; +- break; +- case E_V4DImode: +- if (TARGET_AVX512VL && TARGET_AVX512DQ) +- gen = gen_avx512vl_cvtmask2qv4di; +- break; +- case E_V2DImode: +- if (TARGET_AVX512VL && TARGET_AVX512DQ) +- gen = gen_avx512vl_cvtmask2qv2di; +- break; +- default: +- break; +- } +- if (gen && SCALAR_INT_MODE_P (cmpmode)) +- { +- cmp = force_reg (cmpmode, cmp); +- if (op_true == CONST0_RTX (mode)) +- { +- rtx (*gen_not) (rtx, rtx); +- switch (cmpmode) +- { +- case E_QImode: gen_not = gen_knotqi; break; +- case E_HImode: gen_not = gen_knothi; break; +- case E_SImode: gen_not = gen_knotsi; break; +- case E_DImode: gen_not = gen_knotdi; break; +- default: gcc_unreachable (); +- } +- rtx n = gen_reg_rtx (cmpmode); +- emit_insn (gen_not (n, cmp)); +- cmp = n; +- } +- emit_insn (gen (dest, cmp)); +- return; +- } +- } +- else if (vector_all_ones_operand (op_true, mode) +- && op_false == CONST0_RTX (mode)) +- { +- emit_insn (gen_rtx_SET (dest, cmp)); +- return; +- } +- else if (op_false == CONST0_RTX (mode)) +- { +- op_true = force_reg (mode, op_true); +- x = gen_rtx_AND (mode, cmp, op_true); +- emit_insn (gen_rtx_SET (dest, x)); +- return; +- } +- else if (op_true == CONST0_RTX 
(mode)) +- { +- op_false = force_reg (mode, op_false); +- x = gen_rtx_NOT (mode, cmp); +- x = gen_rtx_AND (mode, x, op_false); +- emit_insn (gen_rtx_SET (dest, x)); +- return; +- } +- else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode)) +- { +- op_false = force_reg (mode, op_false); +- x = gen_rtx_IOR (mode, cmp, op_false); +- emit_insn (gen_rtx_SET (dest, x)); +- return; +- } +- else if (TARGET_XOP) +- { +- op_true = force_reg (mode, op_true); +- +- if (!nonimmediate_operand (op_false, mode)) +- op_false = force_reg (mode, op_false); +- +- emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp, +- op_true, +- op_false))); +- return; +- } +- +- rtx (*gen) (rtx, rtx, rtx, rtx) = NULL; +- rtx d = dest; +- +- if (!vector_operand (op_true, mode)) +- op_true = force_reg (mode, op_true); +- +- op_false = force_reg (mode, op_false); +- +- switch (mode) +- { +- case E_V4SFmode: +- if (TARGET_SSE4_1) +- gen = gen_sse4_1_blendvps; +- break; +- case E_V2DFmode: +- if (TARGET_SSE4_1) +- gen = gen_sse4_1_blendvpd; +- break; +- case E_SFmode: +- if (TARGET_SSE4_1) +- { +- gen = gen_sse4_1_blendvss; +- op_true = force_reg (mode, op_true); +- } +- break; +- case E_DFmode: +- if (TARGET_SSE4_1) +- { +- gen = gen_sse4_1_blendvsd; +- op_true = force_reg (mode, op_true); +- } +- break; +- case E_V16QImode: +- case E_V8HImode: +- case E_V4SImode: +- case E_V2DImode: +- if (TARGET_SSE4_1) +- { +- gen = gen_sse4_1_pblendvb; +- if (mode != V16QImode) +- d = gen_reg_rtx (V16QImode); +- op_false = gen_lowpart (V16QImode, op_false); +- op_true = gen_lowpart (V16QImode, op_true); +- cmp = gen_lowpart (V16QImode, cmp); +- } +- break; +- case E_V8SFmode: +- if (TARGET_AVX) +- gen = gen_avx_blendvps256; +- break; +- case E_V4DFmode: +- if (TARGET_AVX) +- gen = gen_avx_blendvpd256; +- break; +- case E_V32QImode: +- case E_V16HImode: +- case E_V8SImode: +- case E_V4DImode: +- if (TARGET_AVX2) +- { +- gen = gen_avx2_pblendvb; +- if (mode != V32QImode) +- d = gen_reg_rtx (V32QImode); +- op_false = gen_lowpart (V32QImode, op_false); +- op_true = gen_lowpart (V32QImode, op_true); +- cmp = gen_lowpart (V32QImode, cmp); +- } +- break; +- +- case E_V64QImode: +- gen = gen_avx512bw_blendmv64qi; +- break; +- case E_V32HImode: +- gen = gen_avx512bw_blendmv32hi; +- break; +- case E_V16SImode: +- gen = gen_avx512f_blendmv16si; +- break; +- case E_V8DImode: +- gen = gen_avx512f_blendmv8di; +- break; +- case E_V8DFmode: +- gen = gen_avx512f_blendmv8df; +- break; +- case E_V16SFmode: +- gen = gen_avx512f_blendmv16sf; +- break; +- +- default: +- break; +- } +- +- if (gen != NULL) +- { +- emit_insn (gen (d, op_false, op_true, cmp)); +- if (d != dest) +- emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d)); +- } +- else +- { +- op_true = force_reg (mode, op_true); +- +- t2 = gen_reg_rtx (mode); +- if (optimize) +- t3 = gen_reg_rtx (mode); +- else +- t3 = dest; +- +- x = gen_rtx_AND (mode, op_true, cmp); +- emit_insn (gen_rtx_SET (t2, x)); +- +- x = gen_rtx_NOT (mode, cmp); +- x = gen_rtx_AND (mode, x, op_false); +- emit_insn (gen_rtx_SET (t3, x)); +- +- x = gen_rtx_IOR (mode, t3, t2); +- emit_insn (gen_rtx_SET (dest, x)); +- } +-} +- +-/* Expand a floating-point conditional move. Return true if successful. 
*/ +- +-bool +-ix86_expand_fp_movcc (rtx operands[]) +-{ +- machine_mode mode = GET_MODE (operands[0]); +- enum rtx_code code = GET_CODE (operands[1]); +- rtx tmp, compare_op; +- rtx op0 = XEXP (operands[1], 0); +- rtx op1 = XEXP (operands[1], 1); +- +- if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode)) +- { +- machine_mode cmode; +- +- /* Since we've no cmove for sse registers, don't force bad register +- allocation just to gain access to it. Deny movcc when the +- comparison mode doesn't match the move mode. */ +- cmode = GET_MODE (op0); +- if (cmode == VOIDmode) +- cmode = GET_MODE (op1); +- if (cmode != mode) +- return false; +- +- code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1); +- if (code == UNKNOWN) +- return false; +- +- if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1, +- operands[2], operands[3])) +- return true; +- +- tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1, +- operands[2], operands[3]); +- ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]); +- return true; +- } +- +- if (GET_MODE (op0) == TImode +- || (GET_MODE (op0) == DImode +- && !TARGET_64BIT)) +- return false; +- +- /* The floating point conditional move instructions don't directly +- support conditions resulting from a signed integer comparison. */ +- +- compare_op = ix86_expand_compare (code, op0, op1); +- if (!fcmov_comparison_operator (compare_op, VOIDmode)) +- { +- tmp = gen_reg_rtx (QImode); +- ix86_expand_setcc (tmp, code, op0, op1); +- +- compare_op = ix86_expand_compare (NE, tmp, const0_rtx); +- } +- +- emit_insn (gen_rtx_SET (operands[0], +- gen_rtx_IF_THEN_ELSE (mode, compare_op, +- operands[2], operands[3]))); +- +- return true; +-} +- +-/* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */ +- +-static int +-ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code) +-{ +- switch (code) +- { +- case EQ: +- return 0; +- case LT: +- case LTU: +- return 1; +- case LE: +- case LEU: +- return 2; +- case NE: +- return 4; +- case GE: +- case GEU: +- return 5; +- case GT: +- case GTU: +- return 6; +- default: +- gcc_unreachable (); +- } +-} +- +-/* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */ +- +-static int +-ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code) +-{ +- switch (code) +- { +- case EQ: +- return 0x00; +- case NE: +- return 0x04; +- case GT: +- return 0x0e; +- case LE: +- return 0x02; +- case GE: +- return 0x0d; +- case LT: +- return 0x01; +- case UNLE: +- return 0x0a; +- case UNLT: +- return 0x09; +- case UNGE: +- return 0x05; +- case UNGT: +- return 0x06; +- case UNEQ: +- return 0x18; +- case LTGT: +- return 0x0c; +- case ORDERED: +- return 0x07; +- case UNORDERED: +- return 0x03; +- default: +- gcc_unreachable (); +- } +-} +- +-/* Return immediate value to be used in UNSPEC_PCMP +- for comparison CODE in MODE. */ +- +-static int +-ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode) +-{ +- if (FLOAT_MODE_P (mode)) +- return ix86_fp_cmp_code_to_pcmp_immediate (code); +- return ix86_int_cmp_code_to_pcmp_immediate (code); +-} +- +-/* Expand AVX-512 vector comparison. 
*/ +- +-bool +-ix86_expand_mask_vec_cmp (rtx operands[]) +-{ +- machine_mode mask_mode = GET_MODE (operands[0]); +- machine_mode cmp_mode = GET_MODE (operands[2]); +- enum rtx_code code = GET_CODE (operands[1]); +- rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode)); +- int unspec_code; +- rtx unspec; +- +- switch (code) +- { +- case LEU: +- case GTU: +- case GEU: +- case LTU: +- unspec_code = UNSPEC_UNSIGNED_PCMP; +- break; +- +- default: +- unspec_code = UNSPEC_PCMP; +- } +- +- unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2], +- operands[3], imm), +- unspec_code); +- emit_insn (gen_rtx_SET (operands[0], unspec)); +- +- return true; +-} +- +-/* Expand fp vector comparison. */ +- +-bool +-ix86_expand_fp_vec_cmp (rtx operands[]) +-{ +- enum rtx_code code = GET_CODE (operands[1]); +- rtx cmp; +- +- code = ix86_prepare_sse_fp_compare_args (operands[0], code, +- &operands[2], &operands[3]); +- if (code == UNKNOWN) +- { +- rtx temp; +- switch (GET_CODE (operands[1])) +- { +- case LTGT: +- temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2], +- operands[3], NULL, NULL); +- cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2], +- operands[3], NULL, NULL); +- code = AND; +- break; +- case UNEQ: +- temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2], +- operands[3], NULL, NULL); +- cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2], +- operands[3], NULL, NULL); +- code = IOR; +- break; +- default: +- gcc_unreachable (); +- } +- cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1, +- OPTAB_DIRECT); +- } +- else +- cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3], +- operands[1], operands[2]); +- +- if (operands[0] != cmp) +- emit_move_insn (operands[0], cmp); +- +- return true; +-} +- +-static rtx +-ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1, +- rtx op_true, rtx op_false, bool *negate) +-{ +- machine_mode data_mode = GET_MODE (dest); +- machine_mode mode = GET_MODE (cop0); +- rtx x; +- +- *negate = false; +- +- /* XOP supports all of the comparisons on all 128-bit vector int types. */ +- if (TARGET_XOP +- && (mode == V16QImode || mode == V8HImode +- || mode == V4SImode || mode == V2DImode)) +- ; +- else +- { +- /* Canonicalize the comparison to EQ, GT, GTU. */ +- switch (code) +- { +- case EQ: +- case GT: +- case GTU: +- break; +- +- case NE: +- case LE: +- case LEU: +- code = reverse_condition (code); +- *negate = true; +- break; +- +- case GE: +- case GEU: +- code = reverse_condition (code); +- *negate = true; +- /* FALLTHRU */ +- +- case LT: +- case LTU: +- std::swap (cop0, cop1); +- code = swap_condition (code); +- break; +- +- default: +- gcc_unreachable (); +- } +- +- /* Only SSE4.1/SSE4.2 supports V2DImode. */ +- if (mode == V2DImode) +- { +- switch (code) +- { +- case EQ: +- /* SSE4.1 supports EQ. */ +- if (!TARGET_SSE4_1) +- return NULL; +- break; +- +- case GT: +- case GTU: +- /* SSE4.2 supports GT/GTU. */ +- if (!TARGET_SSE4_2) +- return NULL; +- break; +- +- default: +- gcc_unreachable (); +- } +- } +- +- rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode); +- rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode); +- if (*negate) +- std::swap (optrue, opfalse); +- +- /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when +- not using integer masks into min (x, y) == x ? -1 : 0 (i.e. +- min (x, y) == x). While we add one instruction (the minimum), +- we remove the need for two instructions in the negation, as the +- result is done this way. 
+- When using masks, do it for SI/DImode element types, as it is shorter +- than the two subtractions. */ +- if ((code != EQ +- && GET_MODE_SIZE (mode) != 64 +- && vector_all_ones_operand (opfalse, data_mode) +- && optrue == CONST0_RTX (data_mode)) +- || (code == GTU +- && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4 +- /* Don't do it if not using integer masks and we'd end up with +- the right values in the registers though. */ +- && (GET_MODE_SIZE (mode) == 64 +- || !vector_all_ones_operand (optrue, data_mode) +- || opfalse != CONST0_RTX (data_mode)))) +- { +- rtx (*gen) (rtx, rtx, rtx) = NULL; +- +- switch (mode) +- { +- case E_V16SImode: +- gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3; +- break; +- case E_V8DImode: +- gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3; +- cop0 = force_reg (mode, cop0); +- cop1 = force_reg (mode, cop1); +- break; +- case E_V32QImode: +- if (TARGET_AVX2) +- gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3; +- break; +- case E_V16HImode: +- if (TARGET_AVX2) +- gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3; +- break; +- case E_V8SImode: +- if (TARGET_AVX2) +- gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3; +- break; +- case E_V4DImode: +- if (TARGET_AVX512VL) +- { +- gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3; +- cop0 = force_reg (mode, cop0); +- cop1 = force_reg (mode, cop1); +- } +- break; +- case E_V16QImode: +- if (code == GTU && TARGET_SSE2) +- gen = gen_uminv16qi3; +- else if (code == GT && TARGET_SSE4_1) +- gen = gen_sminv16qi3; +- break; +- case E_V8HImode: +- if (code == GTU && TARGET_SSE4_1) +- gen = gen_uminv8hi3; +- else if (code == GT && TARGET_SSE2) +- gen = gen_sminv8hi3; +- break; +- case E_V4SImode: +- if (TARGET_SSE4_1) +- gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3; +- break; +- case E_V2DImode: +- if (TARGET_AVX512VL) +- { +- gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3; +- cop0 = force_reg (mode, cop0); +- cop1 = force_reg (mode, cop1); +- } +- break; +- default: +- break; +- } +- +- if (gen) +- { +- rtx tem = gen_reg_rtx (mode); +- if (!vector_operand (cop0, mode)) +- cop0 = force_reg (mode, cop0); +- if (!vector_operand (cop1, mode)) +- cop1 = force_reg (mode, cop1); +- *negate = !*negate; +- emit_insn (gen (tem, cop0, cop1)); +- cop1 = tem; +- code = EQ; +- } +- } +- +- /* Unsigned parallel compare is not supported by the hardware. +- Play some tricks to turn this into a signed comparison +- against 0. */ +- if (code == GTU) +- { +- cop0 = force_reg (mode, cop0); +- +- switch (mode) +- { +- case E_V16SImode: +- case E_V8DImode: +- case E_V8SImode: +- case E_V4DImode: +- case E_V4SImode: +- case E_V2DImode: +- { +- rtx t1, t2, mask; +- rtx (*gen_sub3) (rtx, rtx, rtx); +- +- switch (mode) +- { +- case E_V16SImode: gen_sub3 = gen_subv16si3; break; +- case E_V8DImode: gen_sub3 = gen_subv8di3; break; +- case E_V8SImode: gen_sub3 = gen_subv8si3; break; +- case E_V4DImode: gen_sub3 = gen_subv4di3; break; +- case E_V4SImode: gen_sub3 = gen_subv4si3; break; +- case E_V2DImode: gen_sub3 = gen_subv2di3; break; +- default: +- gcc_unreachable (); +- } +- /* Subtract (-(INT MAX) - 1) from both operands to make +- them signed. 
*/ +- mask = ix86_build_signbit_mask (mode, true, false); +- t1 = gen_reg_rtx (mode); +- emit_insn (gen_sub3 (t1, cop0, mask)); +- +- t2 = gen_reg_rtx (mode); +- emit_insn (gen_sub3 (t2, cop1, mask)); +- +- cop0 = t1; +- cop1 = t2; +- code = GT; +- } +- break; +- +- case E_V64QImode: +- case E_V32HImode: +- case E_V32QImode: +- case E_V16HImode: +- case E_V16QImode: +- case E_V8HImode: +- /* Perform a parallel unsigned saturating subtraction. */ +- x = gen_reg_rtx (mode); +- emit_insn (gen_rtx_SET (x, gen_rtx_US_MINUS (mode, cop0, +- cop1))); +- +- cop0 = x; +- cop1 = CONST0_RTX (mode); +- code = EQ; +- *negate = !*negate; +- break; +- +- default: +- gcc_unreachable (); +- } +- } +- } +- +- if (*negate) +- std::swap (op_true, op_false); +- +- /* Allow the comparison to be done in one mode, but the movcc to +- happen in another mode. */ +- if (data_mode == mode) +- { +- x = ix86_expand_sse_cmp (dest, code, cop0, cop1, +- op_true, op_false); +- } +- else +- { +- gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode)); +- x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1, +- op_true, op_false); +- if (GET_MODE (x) == mode) +- x = gen_lowpart (data_mode, x); +- } +- +- return x; +-} +- +-/* Expand integer vector comparison. */ +- +-bool +-ix86_expand_int_vec_cmp (rtx operands[]) +-{ +- rtx_code code = GET_CODE (operands[1]); +- bool negate = false; +- rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2], +- operands[3], NULL, NULL, &negate); +- +- if (!cmp) +- return false; +- +- if (negate) +- cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp, +- CONST0_RTX (GET_MODE (cmp)), +- NULL, NULL, &negate); +- +- gcc_assert (!negate); +- +- if (operands[0] != cmp) +- emit_move_insn (operands[0], cmp); +- +- return true; +-} +- +-/* Expand a floating-point vector conditional move; a vcond operation +- rather than a movcc operation. */ +- +-bool +-ix86_expand_fp_vcond (rtx operands[]) +-{ +- enum rtx_code code = GET_CODE (operands[3]); +- rtx cmp; +- +- code = ix86_prepare_sse_fp_compare_args (operands[0], code, +- &operands[4], &operands[5]); +- if (code == UNKNOWN) +- { +- rtx temp; +- switch (GET_CODE (operands[3])) +- { +- case LTGT: +- temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4], +- operands[5], operands[0], operands[0]); +- cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4], +- operands[5], operands[1], operands[2]); +- code = AND; +- break; +- case UNEQ: +- temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4], +- operands[5], operands[0], operands[0]); +- cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4], +- operands[5], operands[1], operands[2]); +- code = IOR; +- break; +- default: +- gcc_unreachable (); +- } +- cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1, +- OPTAB_DIRECT); +- ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]); +- return true; +- } +- +- if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4], +- operands[5], operands[1], operands[2])) +- return true; +- +- cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5], +- operands[1], operands[2]); +- ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]); +- return true; +-} +- +-/* Expand a signed/unsigned integral vector conditional move. 
*/ +- +-bool +-ix86_expand_int_vcond (rtx operands[]) +-{ +- machine_mode data_mode = GET_MODE (operands[0]); +- machine_mode mode = GET_MODE (operands[4]); +- enum rtx_code code = GET_CODE (operands[3]); +- bool negate = false; +- rtx x, cop0, cop1; +- +- cop0 = operands[4]; +- cop1 = operands[5]; +- +- /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31 +- and x < 0 ? 1 : 0 into (unsigned) x >> 31. */ +- if ((code == LT || code == GE) +- && data_mode == mode +- && cop1 == CONST0_RTX (mode) +- && operands[1 + (code == LT)] == CONST0_RTX (data_mode) +- && GET_MODE_UNIT_SIZE (data_mode) > 1 +- && GET_MODE_UNIT_SIZE (data_mode) <= 8 +- && (GET_MODE_SIZE (data_mode) == 16 +- || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32))) +- { +- rtx negop = operands[2 - (code == LT)]; +- int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1; +- if (negop == CONST1_RTX (data_mode)) +- { +- rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift), +- operands[0], 1, OPTAB_DIRECT); +- if (res != operands[0]) +- emit_move_insn (operands[0], res); +- return true; +- } +- else if (GET_MODE_INNER (data_mode) != DImode +- && vector_all_ones_operand (negop, data_mode)) +- { +- rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift), +- operands[0], 0, OPTAB_DIRECT); +- if (res != operands[0]) +- emit_move_insn (operands[0], res); +- return true; +- } +- } +- +- if (!nonimmediate_operand (cop1, mode)) +- cop1 = force_reg (mode, cop1); +- if (!general_operand (operands[1], data_mode)) +- operands[1] = force_reg (data_mode, operands[1]); +- if (!general_operand (operands[2], data_mode)) +- operands[2] = force_reg (data_mode, operands[2]); +- +- x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1, +- operands[1], operands[2], &negate); +- +- if (!x) +- return false; +- +- ix86_expand_sse_movcc (operands[0], x, operands[1+negate], +- operands[2-negate]); +- return true; +-} +- +-/* AVX512F does support 64-byte integer vector operations, +- thus the longest vector we are faced with is V64QImode. */ +-#define MAX_VECT_LEN 64 +- +-struct expand_vec_perm_d +-{ +- rtx target, op0, op1; +- unsigned char perm[MAX_VECT_LEN]; +- machine_mode vmode; +- unsigned char nelt; +- bool one_operand_p; +- bool testing_p; +-}; +- +-static bool +-ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1, +- struct expand_vec_perm_d *d) +-{ +- /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const +- expander, so args are either in d, or in op0, op1 etc. */ +- machine_mode mode = GET_MODE (d ? 
d->op0 : op0); +- machine_mode maskmode = mode; +- rtx (*gen) (rtx, rtx, rtx, rtx) = NULL; +- +- switch (mode) +- { +- case E_V8HImode: +- if (TARGET_AVX512VL && TARGET_AVX512BW) +- gen = gen_avx512vl_vpermt2varv8hi3; +- break; +- case E_V16HImode: +- if (TARGET_AVX512VL && TARGET_AVX512BW) +- gen = gen_avx512vl_vpermt2varv16hi3; +- break; +- case E_V64QImode: +- if (TARGET_AVX512VBMI) +- gen = gen_avx512bw_vpermt2varv64qi3; +- break; +- case E_V32HImode: +- if (TARGET_AVX512BW) +- gen = gen_avx512bw_vpermt2varv32hi3; +- break; +- case E_V4SImode: +- if (TARGET_AVX512VL) +- gen = gen_avx512vl_vpermt2varv4si3; +- break; +- case E_V8SImode: +- if (TARGET_AVX512VL) +- gen = gen_avx512vl_vpermt2varv8si3; +- break; +- case E_V16SImode: +- if (TARGET_AVX512F) +- gen = gen_avx512f_vpermt2varv16si3; +- break; +- case E_V4SFmode: +- if (TARGET_AVX512VL) +- { +- gen = gen_avx512vl_vpermt2varv4sf3; +- maskmode = V4SImode; +- } +- break; +- case E_V8SFmode: +- if (TARGET_AVX512VL) +- { +- gen = gen_avx512vl_vpermt2varv8sf3; +- maskmode = V8SImode; +- } +- break; +- case E_V16SFmode: +- if (TARGET_AVX512F) +- { +- gen = gen_avx512f_vpermt2varv16sf3; +- maskmode = V16SImode; +- } +- break; +- case E_V2DImode: +- if (TARGET_AVX512VL) +- gen = gen_avx512vl_vpermt2varv2di3; +- break; +- case E_V4DImode: +- if (TARGET_AVX512VL) +- gen = gen_avx512vl_vpermt2varv4di3; +- break; +- case E_V8DImode: +- if (TARGET_AVX512F) +- gen = gen_avx512f_vpermt2varv8di3; +- break; +- case E_V2DFmode: +- if (TARGET_AVX512VL) +- { +- gen = gen_avx512vl_vpermt2varv2df3; +- maskmode = V2DImode; +- } +- break; +- case E_V4DFmode: +- if (TARGET_AVX512VL) +- { +- gen = gen_avx512vl_vpermt2varv4df3; +- maskmode = V4DImode; +- } +- break; +- case E_V8DFmode: +- if (TARGET_AVX512F) +- { +- gen = gen_avx512f_vpermt2varv8df3; +- maskmode = V8DImode; +- } +- break; +- default: +- break; +- } +- +- if (gen == NULL) +- return false; +- +- /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const +- expander, so args are either in d, or in op0, op1 etc. */ +- if (d) +- { +- rtx vec[64]; +- target = d->target; +- op0 = d->op0; +- op1 = d->op1; +- for (int i = 0; i < d->nelt; ++i) +- vec[i] = GEN_INT (d->perm[i]); +- mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec)); +- } +- +- emit_insn (gen (target, force_reg (maskmode, mask), op0, op1)); +- return true; +-} +- +-/* Expand a variable vector permutation. */ +- +-void +-ix86_expand_vec_perm (rtx operands[]) +-{ +- rtx target = operands[0]; +- rtx op0 = operands[1]; +- rtx op1 = operands[2]; +- rtx mask = operands[3]; +- rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32]; +- machine_mode mode = GET_MODE (op0); +- machine_mode maskmode = GET_MODE (mask); +- int w, e, i; +- bool one_operand_shuffle = rtx_equal_p (op0, op1); +- +- /* Number of elements in the vector. 
*/ +- w = GET_MODE_NUNITS (mode); +- e = GET_MODE_UNIT_SIZE (mode); +- gcc_assert (w <= 64); +- +- if (TARGET_AVX512F && one_operand_shuffle) +- { +- rtx (*gen) (rtx, rtx, rtx) = NULL; +- switch (mode) +- { +- case E_V16SImode: +- gen =gen_avx512f_permvarv16si; +- break; +- case E_V16SFmode: +- gen = gen_avx512f_permvarv16sf; +- break; +- case E_V8DImode: +- gen = gen_avx512f_permvarv8di; +- break; +- case E_V8DFmode: +- gen = gen_avx512f_permvarv8df; +- break; +- default: +- break; +- } +- if (gen != NULL) +- { +- emit_insn (gen (target, op0, mask)); +- return; +- } +- } +- +- if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL)) +- return; +- +- if (TARGET_AVX2) +- { +- if (mode == V4DImode || mode == V4DFmode || mode == V16HImode) +- { +- /* Unfortunately, the VPERMQ and VPERMPD instructions only support +- an constant shuffle operand. With a tiny bit of effort we can +- use VPERMD instead. A re-interpretation stall for V4DFmode is +- unfortunate but there's no avoiding it. +- Similarly for V16HImode we don't have instructions for variable +- shuffling, while for V32QImode we can use after preparing suitable +- masks vpshufb; vpshufb; vpermq; vpor. */ +- +- if (mode == V16HImode) +- { +- maskmode = mode = V32QImode; +- w = 32; +- e = 1; +- } +- else +- { +- maskmode = mode = V8SImode; +- w = 8; +- e = 4; +- } +- t1 = gen_reg_rtx (maskmode); +- +- /* Replicate the low bits of the V4DImode mask into V8SImode: +- mask = { A B C D } +- t1 = { A A B B C C D D }. */ +- for (i = 0; i < w / 2; ++i) +- vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2); +- vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec)); +- vt = force_reg (maskmode, vt); +- mask = gen_lowpart (maskmode, mask); +- if (maskmode == V8SImode) +- emit_insn (gen_avx2_permvarv8si (t1, mask, vt)); +- else +- emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt)); +- +- /* Multiply the shuffle indicies by two. */ +- t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1, +- OPTAB_DIRECT); +- +- /* Add one to the odd shuffle indicies: +- t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */ +- for (i = 0; i < w / 2; ++i) +- { +- vec[i * 2] = const0_rtx; +- vec[i * 2 + 1] = const1_rtx; +- } +- vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec)); +- vt = validize_mem (force_const_mem (maskmode, vt)); +- t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1, +- OPTAB_DIRECT); +- +- /* Continue as if V8SImode (resp. V32QImode) was used initially. */ +- operands[3] = mask = t1; +- target = gen_reg_rtx (mode); +- op0 = gen_lowpart (mode, op0); +- op1 = gen_lowpart (mode, op1); +- } +- +- switch (mode) +- { +- case E_V8SImode: +- /* The VPERMD and VPERMPS instructions already properly ignore +- the high bits of the shuffle elements. No need for us to +- perform an AND ourselves. 
*/ +- if (one_operand_shuffle) +- { +- emit_insn (gen_avx2_permvarv8si (target, op0, mask)); +- if (target != operands[0]) +- emit_move_insn (operands[0], +- gen_lowpart (GET_MODE (operands[0]), target)); +- } +- else +- { +- t1 = gen_reg_rtx (V8SImode); +- t2 = gen_reg_rtx (V8SImode); +- emit_insn (gen_avx2_permvarv8si (t1, op0, mask)); +- emit_insn (gen_avx2_permvarv8si (t2, op1, mask)); +- goto merge_two; +- } +- return; +- +- case E_V8SFmode: +- mask = gen_lowpart (V8SImode, mask); +- if (one_operand_shuffle) +- emit_insn (gen_avx2_permvarv8sf (target, op0, mask)); +- else +- { +- t1 = gen_reg_rtx (V8SFmode); +- t2 = gen_reg_rtx (V8SFmode); +- emit_insn (gen_avx2_permvarv8sf (t1, op0, mask)); +- emit_insn (gen_avx2_permvarv8sf (t2, op1, mask)); +- goto merge_two; +- } +- return; +- +- case E_V4SImode: +- /* By combining the two 128-bit input vectors into one 256-bit +- input vector, we can use VPERMD and VPERMPS for the full +- two-operand shuffle. */ +- t1 = gen_reg_rtx (V8SImode); +- t2 = gen_reg_rtx (V8SImode); +- emit_insn (gen_avx_vec_concatv8si (t1, op0, op1)); +- emit_insn (gen_avx_vec_concatv8si (t2, mask, mask)); +- emit_insn (gen_avx2_permvarv8si (t1, t1, t2)); +- emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx)); +- return; +- +- case E_V4SFmode: +- t1 = gen_reg_rtx (V8SFmode); +- t2 = gen_reg_rtx (V8SImode); +- mask = gen_lowpart (V4SImode, mask); +- emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1)); +- emit_insn (gen_avx_vec_concatv8si (t2, mask, mask)); +- emit_insn (gen_avx2_permvarv8sf (t1, t1, t2)); +- emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx)); +- return; +- +- case E_V32QImode: +- t1 = gen_reg_rtx (V32QImode); +- t2 = gen_reg_rtx (V32QImode); +- t3 = gen_reg_rtx (V32QImode); +- vt2 = GEN_INT (-128); +- vt = gen_const_vec_duplicate (V32QImode, vt2); +- vt = force_reg (V32QImode, vt); +- for (i = 0; i < 32; i++) +- vec[i] = i < 16 ? vt2 : const0_rtx; +- vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec)); +- vt2 = force_reg (V32QImode, vt2); +- /* From mask create two adjusted masks, which contain the same +- bits as mask in the low 7 bits of each vector element. +- The first mask will have the most significant bit clear +- if it requests element from the same 128-bit lane +- and MSB set if it requests element from the other 128-bit lane. +- The second mask will have the opposite values of the MSB, +- and additionally will have its 128-bit lanes swapped. +- E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have +- t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and +- t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ... +- stands for other 12 bytes. */ +- /* The bit whether element is from the same lane or the other +- lane is bit 4, so shift it up by 3 to the MSB position. */ +- t5 = gen_reg_rtx (V4DImode); +- emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask), +- GEN_INT (3))); +- /* Clear MSB bits from the mask just in case it had them set. */ +- emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask)); +- /* After this t1 will have MSB set for elements from other lane. */ +- emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2)); +- /* Clear bits other than MSB. */ +- emit_insn (gen_andv32qi3 (t1, t1, vt)); +- /* Or in the lower bits from mask into t3. */ +- emit_insn (gen_iorv32qi3 (t3, t1, t2)); +- /* And invert MSB bits in t1, so MSB is set for elements from the same +- lane. */ +- emit_insn (gen_xorv32qi3 (t1, t1, vt)); +- /* Swap 128-bit lanes in t3. 
*/ +- t6 = gen_reg_rtx (V4DImode); +- emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3), +- const2_rtx, GEN_INT (3), +- const0_rtx, const1_rtx)); +- /* And or in the lower bits from mask into t1. */ +- emit_insn (gen_iorv32qi3 (t1, t1, t2)); +- if (one_operand_shuffle) +- { +- /* Each of these shuffles will put 0s in places where +- element from the other 128-bit lane is needed, otherwise +- will shuffle in the requested value. */ +- emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, +- gen_lowpart (V32QImode, t6))); +- emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1)); +- /* For t3 the 128-bit lanes are swapped again. */ +- t7 = gen_reg_rtx (V4DImode); +- emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3), +- const2_rtx, GEN_INT (3), +- const0_rtx, const1_rtx)); +- /* And oring both together leads to the result. */ +- emit_insn (gen_iorv32qi3 (target, t1, +- gen_lowpart (V32QImode, t7))); +- if (target != operands[0]) +- emit_move_insn (operands[0], +- gen_lowpart (GET_MODE (operands[0]), target)); +- return; +- } +- +- t4 = gen_reg_rtx (V32QImode); +- /* Similarly to the above one_operand_shuffle code, +- just for repeated twice for each operand. merge_two: +- code will merge the two results together. */ +- emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, +- gen_lowpart (V32QImode, t6))); +- emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, +- gen_lowpart (V32QImode, t6))); +- emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1)); +- emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1)); +- t7 = gen_reg_rtx (V4DImode); +- emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4), +- const2_rtx, GEN_INT (3), +- const0_rtx, const1_rtx)); +- t8 = gen_reg_rtx (V4DImode); +- emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3), +- const2_rtx, GEN_INT (3), +- const0_rtx, const1_rtx)); +- emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7))); +- emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8))); +- t1 = t4; +- t2 = t3; +- goto merge_two; +- +- default: +- gcc_assert (GET_MODE_SIZE (mode) <= 16); +- break; +- } +- } +- +- if (TARGET_XOP) +- { +- /* The XOP VPPERM insn supports three inputs. By ignoring the +- one_operand_shuffle special case, we avoid creating another +- set of constant vectors in memory. */ +- one_operand_shuffle = false; +- +- /* mask = mask & {2*w-1, ...} */ +- vt = GEN_INT (2*w - 1); +- } +- else +- { +- /* mask = mask & {w-1, ...} */ +- vt = GEN_INT (w - 1); +- } +- +- vt = gen_const_vec_duplicate (maskmode, vt); +- mask = expand_simple_binop (maskmode, AND, mask, vt, +- NULL_RTX, 0, OPTAB_DIRECT); +- +- /* For non-QImode operations, convert the word permutation control +- into a byte permutation control. */ +- if (mode != V16QImode) +- { +- mask = expand_simple_binop (maskmode, ASHIFT, mask, +- GEN_INT (exact_log2 (e)), +- NULL_RTX, 0, OPTAB_DIRECT); +- +- /* Convert mask to vector of chars. */ +- mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask)); +- +- /* Replicate each of the input bytes into byte positions: +- (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8} +- (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12} +- (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. 
*/ +- for (i = 0; i < 16; ++i) +- vec[i] = GEN_INT (i/e * e); +- vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec)); +- vt = validize_mem (force_const_mem (V16QImode, vt)); +- if (TARGET_XOP) +- emit_insn (gen_xop_pperm (mask, mask, mask, vt)); +- else +- emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt)); +- +- /* Convert it into the byte positions by doing +- mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */ +- for (i = 0; i < 16; ++i) +- vec[i] = GEN_INT (i % e); +- vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec)); +- vt = validize_mem (force_const_mem (V16QImode, vt)); +- emit_insn (gen_addv16qi3 (mask, mask, vt)); +- } +- +- /* The actual shuffle operations all operate on V16QImode. */ +- op0 = gen_lowpart (V16QImode, op0); +- op1 = gen_lowpart (V16QImode, op1); +- +- if (TARGET_XOP) +- { +- if (GET_MODE (target) != V16QImode) +- target = gen_reg_rtx (V16QImode); +- emit_insn (gen_xop_pperm (target, op0, op1, mask)); +- if (target != operands[0]) +- emit_move_insn (operands[0], +- gen_lowpart (GET_MODE (operands[0]), target)); +- } +- else if (one_operand_shuffle) +- { +- if (GET_MODE (target) != V16QImode) +- target = gen_reg_rtx (V16QImode); +- emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask)); +- if (target != operands[0]) +- emit_move_insn (operands[0], +- gen_lowpart (GET_MODE (operands[0]), target)); +- } +- else +- { +- rtx xops[6]; +- bool ok; +- +- /* Shuffle the two input vectors independently. */ +- t1 = gen_reg_rtx (V16QImode); +- t2 = gen_reg_rtx (V16QImode); +- emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask)); +- emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask)); +- +- merge_two: +- /* Then merge them together. The key is whether any given control +- element contained a bit set that indicates the second word. */ +- mask = operands[3]; +- vt = GEN_INT (w); +- if (maskmode == V2DImode && !TARGET_SSE4_1) +- { +- /* Without SSE4.1, we don't have V2DImode EQ. Perform one +- more shuffle to convert the V2DI input mask into a V4SI +- input mask. At which point the masking that expand_int_vcond +- will work as desired. */ +- rtx t3 = gen_reg_rtx (V4SImode); +- emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask), +- const0_rtx, const0_rtx, +- const2_rtx, const2_rtx)); +- mask = t3; +- maskmode = V4SImode; +- e = w = 4; +- } +- +- vt = gen_const_vec_duplicate (maskmode, vt); +- vt = force_reg (maskmode, vt); +- mask = expand_simple_binop (maskmode, AND, mask, vt, +- NULL_RTX, 0, OPTAB_DIRECT); +- +- if (GET_MODE (target) != mode) +- target = gen_reg_rtx (mode); +- xops[0] = target; +- xops[1] = gen_lowpart (mode, t2); +- xops[2] = gen_lowpart (mode, t1); +- xops[3] = gen_rtx_EQ (maskmode, mask, vt); +- xops[4] = mask; +- xops[5] = vt; +- ok = ix86_expand_int_vcond (xops); +- gcc_assert (ok); +- if (target != operands[0]) +- emit_move_insn (operands[0], +- gen_lowpart (GET_MODE (operands[0]), target)); +- } +-} +- +-/* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is +- true if we should do zero extension, else sign extension. HIGH_P is +- true if we want the N/2 high elements, else the low elements. 
*/ +- +-void +-ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p) +-{ +- machine_mode imode = GET_MODE (src); +- rtx tmp; +- +- if (TARGET_SSE4_1) +- { +- rtx (*unpack)(rtx, rtx); +- rtx (*extract)(rtx, rtx) = NULL; +- machine_mode halfmode = BLKmode; +- +- switch (imode) +- { +- case E_V64QImode: +- if (unsigned_p) +- unpack = gen_avx512bw_zero_extendv32qiv32hi2; +- else +- unpack = gen_avx512bw_sign_extendv32qiv32hi2; +- halfmode = V32QImode; +- extract +- = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi; +- break; +- case E_V32QImode: +- if (unsigned_p) +- unpack = gen_avx2_zero_extendv16qiv16hi2; +- else +- unpack = gen_avx2_sign_extendv16qiv16hi2; +- halfmode = V16QImode; +- extract +- = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi; +- break; +- case E_V32HImode: +- if (unsigned_p) +- unpack = gen_avx512f_zero_extendv16hiv16si2; +- else +- unpack = gen_avx512f_sign_extendv16hiv16si2; +- halfmode = V16HImode; +- extract +- = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi; +- break; +- case E_V16HImode: +- if (unsigned_p) +- unpack = gen_avx2_zero_extendv8hiv8si2; +- else +- unpack = gen_avx2_sign_extendv8hiv8si2; +- halfmode = V8HImode; +- extract +- = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi; +- break; +- case E_V16SImode: +- if (unsigned_p) +- unpack = gen_avx512f_zero_extendv8siv8di2; +- else +- unpack = gen_avx512f_sign_extendv8siv8di2; +- halfmode = V8SImode; +- extract +- = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si; +- break; +- case E_V8SImode: +- if (unsigned_p) +- unpack = gen_avx2_zero_extendv4siv4di2; +- else +- unpack = gen_avx2_sign_extendv4siv4di2; +- halfmode = V4SImode; +- extract +- = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si; +- break; +- case E_V16QImode: +- if (unsigned_p) +- unpack = gen_sse4_1_zero_extendv8qiv8hi2; +- else +- unpack = gen_sse4_1_sign_extendv8qiv8hi2; +- break; +- case E_V8HImode: +- if (unsigned_p) +- unpack = gen_sse4_1_zero_extendv4hiv4si2; +- else +- unpack = gen_sse4_1_sign_extendv4hiv4si2; +- break; +- case E_V4SImode: +- if (unsigned_p) +- unpack = gen_sse4_1_zero_extendv2siv2di2; +- else +- unpack = gen_sse4_1_sign_extendv2siv2di2; +- break; +- default: +- gcc_unreachable (); +- } +- +- if (GET_MODE_SIZE (imode) >= 32) +- { +- tmp = gen_reg_rtx (halfmode); +- emit_insn (extract (tmp, src)); +- } +- else if (high_p) +- { +- /* Shift higher 8 bytes to lower 8 bytes. 
*/ +- tmp = gen_reg_rtx (V1TImode); +- emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src), +- GEN_INT (64))); +- tmp = gen_lowpart (imode, tmp); +- } +- else +- tmp = src; +- +- emit_insn (unpack (dest, tmp)); +- } +- else +- { +- rtx (*unpack)(rtx, rtx, rtx); +- +- switch (imode) +- { +- case E_V16QImode: +- if (high_p) +- unpack = gen_vec_interleave_highv16qi; +- else +- unpack = gen_vec_interleave_lowv16qi; +- break; +- case E_V8HImode: +- if (high_p) +- unpack = gen_vec_interleave_highv8hi; +- else +- unpack = gen_vec_interleave_lowv8hi; +- break; +- case E_V4SImode: +- if (high_p) +- unpack = gen_vec_interleave_highv4si; +- else +- unpack = gen_vec_interleave_lowv4si; +- break; +- default: +- gcc_unreachable (); +- } +- +- if (unsigned_p) +- tmp = force_reg (imode, CONST0_RTX (imode)); +- else +- tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode), +- src, pc_rtx, pc_rtx); +- +- rtx tmp2 = gen_reg_rtx (imode); +- emit_insn (unpack (tmp2, src, tmp)); +- emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2)); +- } +-} +- +-/* Expand conditional increment or decrement using adb/sbb instructions. +- The default case using setcc followed by the conditional move can be +- done by generic code. */ +-bool +-ix86_expand_int_addcc (rtx operands[]) +-{ +- enum rtx_code code = GET_CODE (operands[1]); +- rtx flags; +- rtx (*insn)(rtx, rtx, rtx, rtx, rtx); +- rtx compare_op; +- rtx val = const0_rtx; +- bool fpcmp = false; +- machine_mode mode; +- rtx op0 = XEXP (operands[1], 0); +- rtx op1 = XEXP (operands[1], 1); +- +- if (operands[3] != const1_rtx +- && operands[3] != constm1_rtx) +- return false; +- if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op)) +- return false; +- code = GET_CODE (compare_op); +- +- flags = XEXP (compare_op, 0); +- +- if (GET_MODE (flags) == CCFPmode) +- { +- fpcmp = true; +- code = ix86_fp_compare_code_to_integer (code); +- } +- +- if (code != LTU) +- { +- val = constm1_rtx; +- if (fpcmp) +- PUT_CODE (compare_op, +- reverse_condition_maybe_unordered +- (GET_CODE (compare_op))); +- else +- PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op))); +- } +- +- mode = GET_MODE (operands[0]); +- +- /* Construct either adc or sbb insn. */ +- if ((code == LTU) == (operands[3] == constm1_rtx)) +- { +- switch (mode) +- { +- case E_QImode: +- insn = gen_subqi3_carry; +- break; +- case E_HImode: +- insn = gen_subhi3_carry; +- break; +- case E_SImode: +- insn = gen_subsi3_carry; +- break; +- case E_DImode: +- insn = gen_subdi3_carry; +- break; +- default: +- gcc_unreachable (); +- } +- } +- else +- { +- switch (mode) +- { +- case E_QImode: +- insn = gen_addqi3_carry; +- break; +- case E_HImode: +- insn = gen_addhi3_carry; +- break; +- case E_SImode: +- insn = gen_addsi3_carry; +- break; +- case E_DImode: +- insn = gen_adddi3_carry; +- break; +- default: +- gcc_unreachable (); +- } +- } +- emit_insn (insn (operands[0], operands[2], val, flags, compare_op)); +- +- return true; +-} +- +- +-/* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode, +- but works for floating pointer parameters and nonoffsetable memories. +- For pushes, it returns just stack offsets; the values will be saved +- in the right order. Maximally three parts are generated. */ +- +-static int +-ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode) +-{ +- int size; +- +- if (!TARGET_64BIT) +- size = mode==XFmode ? 
3 : GET_MODE_SIZE (mode) / 4; +- else +- size = (GET_MODE_SIZE (mode) + 4) / 8; +- +- gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand))); +- gcc_assert (size >= 2 && size <= 4); +- +- /* Optimize constant pool reference to immediates. This is used by fp +- moves, that force all constants to memory to allow combining. */ +- if (MEM_P (operand) && MEM_READONLY_P (operand)) +- operand = avoid_constant_pool_reference (operand); +- +- if (MEM_P (operand) && !offsettable_memref_p (operand)) +- { +- /* The only non-offsetable memories we handle are pushes. */ +- int ok = push_operand (operand, VOIDmode); +- +- gcc_assert (ok); +- +- operand = copy_rtx (operand); +- PUT_MODE (operand, word_mode); +- parts[0] = parts[1] = parts[2] = parts[3] = operand; +- return size; +- } +- +- if (GET_CODE (operand) == CONST_VECTOR) +- { +- scalar_int_mode imode = int_mode_for_mode (mode).require (); +- /* Caution: if we looked through a constant pool memory above, +- the operand may actually have a different mode now. That's +- ok, since we want to pun this all the way back to an integer. */ +- operand = simplify_subreg (imode, operand, GET_MODE (operand), 0); +- gcc_assert (operand != NULL); +- mode = imode; +- } +- +- if (!TARGET_64BIT) +- { +- if (mode == DImode) +- split_double_mode (mode, &operand, 1, &parts[0], &parts[1]); +- else +- { +- int i; +- +- if (REG_P (operand)) +- { +- gcc_assert (reload_completed); +- for (i = 0; i < size; i++) +- parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i); +- } +- else if (offsettable_memref_p (operand)) +- { +- operand = adjust_address (operand, SImode, 0); +- parts[0] = operand; +- for (i = 1; i < size; i++) +- parts[i] = adjust_address (operand, SImode, 4 * i); +- } +- else if (CONST_DOUBLE_P (operand)) +- { +- const REAL_VALUE_TYPE *r; +- long l[4]; +- +- r = CONST_DOUBLE_REAL_VALUE (operand); +- switch (mode) +- { +- case E_TFmode: +- real_to_target (l, r, mode); +- parts[3] = gen_int_mode (l[3], SImode); +- parts[2] = gen_int_mode (l[2], SImode); +- break; +- case E_XFmode: +- /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since +- long double may not be 80-bit. */ +- real_to_target (l, r, mode); +- parts[2] = gen_int_mode (l[2], SImode); +- break; +- case E_DFmode: +- REAL_VALUE_TO_TARGET_DOUBLE (*r, l); +- break; +- default: +- gcc_unreachable (); +- } +- parts[1] = gen_int_mode (l[1], SImode); +- parts[0] = gen_int_mode (l[0], SImode); +- } +- else +- gcc_unreachable (); +- } +- } +- else +- { +- if (mode == TImode) +- split_double_mode (mode, &operand, 1, &parts[0], &parts[1]); +- if (mode == XFmode || mode == TFmode) +- { +- machine_mode upper_mode = mode==XFmode ? SImode : DImode; +- if (REG_P (operand)) +- { +- gcc_assert (reload_completed); +- parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0); +- parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1); +- } +- else if (offsettable_memref_p (operand)) +- { +- operand = adjust_address (operand, DImode, 0); +- parts[0] = operand; +- parts[1] = adjust_address (operand, upper_mode, 8); +- } +- else if (CONST_DOUBLE_P (operand)) +- { +- long l[4]; +- +- real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode); +- +- /* real_to_target puts 32-bit pieces in each long. 
*/ +- parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff)) +- | ((l[1] & HOST_WIDE_INT_C (0xffffffff)) +- << 32), DImode); +- +- if (upper_mode == SImode) +- parts[1] = gen_int_mode (l[2], SImode); +- else +- parts[1] +- = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff)) +- | ((l[3] & HOST_WIDE_INT_C (0xffffffff)) +- << 32), DImode); +- } +- else +- gcc_unreachable (); +- } +- } +- +- return size; +-} +- +-/* Emit insns to perform a move or push of DI, DF, XF, and TF values. +- Return false when normal moves are needed; true when all required +- insns have been emitted. Operands 2-4 contain the input values +- int the correct order; operands 5-7 contain the output values. */ +- +-void +-ix86_split_long_move (rtx operands[]) +-{ +- rtx part[2][4]; +- int nparts, i, j; +- int push = 0; +- int collisions = 0; +- machine_mode mode = GET_MODE (operands[0]); +- bool collisionparts[4]; +- +- /* The DFmode expanders may ask us to move double. +- For 64bit target this is single move. By hiding the fact +- here we simplify i386.md splitters. */ +- if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8) +- { +- /* Optimize constant pool reference to immediates. This is used by +- fp moves, that force all constants to memory to allow combining. */ +- +- if (MEM_P (operands[1]) +- && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF +- && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0))) +- operands[1] = get_pool_constant (XEXP (operands[1], 0)); +- if (push_operand (operands[0], VOIDmode)) +- { +- operands[0] = copy_rtx (operands[0]); +- PUT_MODE (operands[0], word_mode); +- } +- else +- operands[0] = gen_lowpart (DImode, operands[0]); +- operands[1] = gen_lowpart (DImode, operands[1]); +- emit_move_insn (operands[0], operands[1]); +- return; +- } +- +- /* The only non-offsettable memory we handle is push. */ +- if (push_operand (operands[0], VOIDmode)) +- push = 1; +- else +- gcc_assert (!MEM_P (operands[0]) +- || offsettable_memref_p (operands[0])); +- +- nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0])); +- ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0])); +- +- /* When emitting push, take care for source operands on the stack. */ +- if (push && MEM_P (operands[1]) +- && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1])) +- { +- rtx src_base = XEXP (part[1][nparts - 1], 0); +- +- /* Compensate for the stack decrement by 4. */ +- if (!TARGET_64BIT && nparts == 3 +- && mode == XFmode && TARGET_128BIT_LONG_DOUBLE) +- src_base = plus_constant (Pmode, src_base, 4); +- +- /* src_base refers to the stack pointer and is +- automatically decreased by emitted push. */ +- for (i = 0; i < nparts; i++) +- part[1][i] = change_address (part[1][i], +- GET_MODE (part[1][i]), src_base); +- } +- +- /* We need to do copy in the right order in case an address register +- of the source overlaps the destination. */ +- if (REG_P (part[0][0]) && MEM_P (part[1][0])) +- { +- rtx tmp; +- +- for (i = 0; i < nparts; i++) +- { +- collisionparts[i] +- = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0)); +- if (collisionparts[i]) +- collisions++; +- } +- +- /* Collision in the middle part can be handled by reordering. 
*/ +- if (collisions == 1 && nparts == 3 && collisionparts [1]) +- { +- std::swap (part[0][1], part[0][2]); +- std::swap (part[1][1], part[1][2]); +- } +- else if (collisions == 1 +- && nparts == 4 +- && (collisionparts [1] || collisionparts [2])) +- { +- if (collisionparts [1]) +- { +- std::swap (part[0][1], part[0][2]); +- std::swap (part[1][1], part[1][2]); +- } +- else +- { +- std::swap (part[0][2], part[0][3]); +- std::swap (part[1][2], part[1][3]); +- } +- } +- +- /* If there are more collisions, we can't handle it by reordering. +- Do an lea to the last part and use only one colliding move. */ +- else if (collisions > 1) +- { +- rtx base, addr; +- +- collisions = 1; +- +- base = part[0][nparts - 1]; +- +- /* Handle the case when the last part isn't valid for lea. +- Happens in 64-bit mode storing the 12-byte XFmode. */ +- if (GET_MODE (base) != Pmode) +- base = gen_rtx_REG (Pmode, REGNO (base)); +- +- addr = XEXP (part[1][0], 0); +- if (TARGET_TLS_DIRECT_SEG_REFS) +- { +- struct ix86_address parts; +- int ok = ix86_decompose_address (addr, &parts); +- gcc_assert (ok); +- /* It is not valid to use %gs: or %fs: in lea. */ +- gcc_assert (parts.seg == ADDR_SPACE_GENERIC); +- } +- emit_insn (gen_rtx_SET (base, addr)); +- part[1][0] = replace_equiv_address (part[1][0], base); +- for (i = 1; i < nparts; i++) +- { +- tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i); +- part[1][i] = replace_equiv_address (part[1][i], tmp); +- } +- } +- } +- +- if (push) +- { +- if (!TARGET_64BIT) +- { +- if (nparts == 3) +- { +- if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode) +- emit_insn (ix86_gen_add3 (stack_pointer_rtx, +- stack_pointer_rtx, GEN_INT (-4))); +- emit_move_insn (part[0][2], part[1][2]); +- } +- else if (nparts == 4) +- { +- emit_move_insn (part[0][3], part[1][3]); +- emit_move_insn (part[0][2], part[1][2]); +- } +- } +- else +- { +- /* In 64bit mode we don't have 32bit push available. In case this is +- register, it is OK - we will just use larger counterpart. We also +- retype memory - these comes from attempt to avoid REX prefix on +- moving of second half of TFmode value. */ +- if (GET_MODE (part[1][1]) == SImode) +- { +- switch (GET_CODE (part[1][1])) +- { +- case MEM: +- part[1][1] = adjust_address (part[1][1], DImode, 0); +- break; +- +- case REG: +- part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1])); +- break; +- +- default: +- gcc_unreachable (); +- } +- +- if (GET_MODE (part[1][0]) == SImode) +- part[1][0] = part[1][1]; +- } +- } +- emit_move_insn (part[0][1], part[1][1]); +- emit_move_insn (part[0][0], part[1][0]); +- return; +- } +- +- /* Choose correct order to not overwrite the source before it is copied. */ +- if ((REG_P (part[0][0]) +- && REG_P (part[1][1]) +- && (REGNO (part[0][0]) == REGNO (part[1][1]) +- || (nparts == 3 +- && REGNO (part[0][0]) == REGNO (part[1][2])) +- || (nparts == 4 +- && REGNO (part[0][0]) == REGNO (part[1][3])))) +- || (collisions > 0 +- && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0)))) +- { +- for (i = 0, j = nparts - 1; i < nparts; i++, j--) +- { +- operands[2 + i] = part[0][j]; +- operands[6 + i] = part[1][j]; +- } +- } +- else +- { +- for (i = 0; i < nparts; i++) +- { +- operands[2 + i] = part[0][i]; +- operands[6 + i] = part[1][i]; +- } +- } +- +- /* If optimizing for size, attempt to locally unCSE nonzero constants. 
*/ +- if (optimize_insn_for_size_p ()) +- { +- for (j = 0; j < nparts - 1; j++) +- if (CONST_INT_P (operands[6 + j]) +- && operands[6 + j] != const0_rtx +- && REG_P (operands[2 + j])) +- for (i = j; i < nparts - 1; i++) +- if (CONST_INT_P (operands[7 + i]) +- && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j])) +- operands[7 + i] = operands[2 + j]; +- } +- +- for (i = 0; i < nparts; i++) +- emit_move_insn (operands[2 + i], operands[6 + i]); +- +- return; +-} +- +-/* Helper function of ix86_split_ashl used to generate an SImode/DImode +- left shift by a constant, either using a single shift or +- a sequence of add instructions. */ +- +-static void +-ix86_expand_ashl_const (rtx operand, int count, machine_mode mode) +-{ +- rtx (*insn)(rtx, rtx, rtx); +- +- if (count == 1 +- || (count * ix86_cost->add <= ix86_cost->shift_const +- && !optimize_insn_for_size_p ())) +- { +- insn = mode == DImode ? gen_addsi3 : gen_adddi3; +- while (count-- > 0) +- emit_insn (insn (operand, operand, operand)); +- } +- else +- { +- insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3; +- emit_insn (insn (operand, operand, GEN_INT (count))); +- } +-} +- +-void +-ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode) +-{ +- rtx (*gen_ashl3)(rtx, rtx, rtx); +- rtx (*gen_shld)(rtx, rtx, rtx); +- int half_width = GET_MODE_BITSIZE (mode) >> 1; +- +- rtx low[2], high[2]; +- int count; +- +- if (CONST_INT_P (operands[2])) +- { +- split_double_mode (mode, operands, 2, low, high); +- count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1); +- +- if (count >= half_width) +- { +- emit_move_insn (high[0], low[1]); +- emit_move_insn (low[0], const0_rtx); +- +- if (count > half_width) +- ix86_expand_ashl_const (high[0], count - half_width, mode); +- } +- else +- { +- gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld; +- +- if (!rtx_equal_p (operands[0], operands[1])) +- emit_move_insn (operands[0], operands[1]); +- +- emit_insn (gen_shld (high[0], low[0], GEN_INT (count))); +- ix86_expand_ashl_const (low[0], count, mode); +- } +- return; +- } +- +- split_double_mode (mode, operands, 1, low, high); +- +- gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3; +- +- if (operands[1] == const1_rtx) +- { +- /* Assuming we've chosen a QImode capable registers, then 1 << N +- can be done with two 32/64-bit shifts, no branches, no cmoves. */ +- if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0])) +- { +- rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG); +- +- ix86_expand_clear (low[0]); +- ix86_expand_clear (high[0]); +- emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width))); +- +- d = gen_lowpart (QImode, low[0]); +- d = gen_rtx_STRICT_LOW_PART (VOIDmode, d); +- s = gen_rtx_EQ (QImode, flags, const0_rtx); +- emit_insn (gen_rtx_SET (d, s)); +- +- d = gen_lowpart (QImode, high[0]); +- d = gen_rtx_STRICT_LOW_PART (VOIDmode, d); +- s = gen_rtx_NE (QImode, flags, const0_rtx); +- emit_insn (gen_rtx_SET (d, s)); +- } +- +- /* Otherwise, we can get the same results by manually performing +- a bit extract operation on bit 5/6, and then performing the two +- shifts. The two methods of getting 0/1 into low/high are exactly +- the same size. Avoiding the shift in the bit extract case helps +- pentium4 a bit; no one else seems to care much either way. 
*/ +- else +- { +- machine_mode half_mode; +- rtx (*gen_lshr3)(rtx, rtx, rtx); +- rtx (*gen_and3)(rtx, rtx, rtx); +- rtx (*gen_xor3)(rtx, rtx, rtx); +- HOST_WIDE_INT bits; +- rtx x; +- +- if (mode == DImode) +- { +- half_mode = SImode; +- gen_lshr3 = gen_lshrsi3; +- gen_and3 = gen_andsi3; +- gen_xor3 = gen_xorsi3; +- bits = 5; +- } +- else +- { +- half_mode = DImode; +- gen_lshr3 = gen_lshrdi3; +- gen_and3 = gen_anddi3; +- gen_xor3 = gen_xordi3; +- bits = 6; +- } +- +- if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ()) +- x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]); +- else +- x = gen_lowpart (half_mode, operands[2]); +- emit_insn (gen_rtx_SET (high[0], x)); +- +- emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits))); +- emit_insn (gen_and3 (high[0], high[0], const1_rtx)); +- emit_move_insn (low[0], high[0]); +- emit_insn (gen_xor3 (low[0], low[0], const1_rtx)); +- } +- +- emit_insn (gen_ashl3 (low[0], low[0], operands[2])); +- emit_insn (gen_ashl3 (high[0], high[0], operands[2])); +- return; +- } +- +- if (operands[1] == constm1_rtx) +- { +- /* For -1 << N, we can avoid the shld instruction, because we +- know that we're shifting 0...31/63 ones into a -1. */ +- emit_move_insn (low[0], constm1_rtx); +- if (optimize_insn_for_size_p ()) +- emit_move_insn (high[0], low[0]); +- else +- emit_move_insn (high[0], constm1_rtx); +- } +- else +- { +- gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld; +- +- if (!rtx_equal_p (operands[0], operands[1])) +- emit_move_insn (operands[0], operands[1]); +- +- split_double_mode (mode, operands, 1, low, high); +- emit_insn (gen_shld (high[0], low[0], operands[2])); +- } +- +- emit_insn (gen_ashl3 (low[0], low[0], operands[2])); +- +- if (TARGET_CMOVE && scratch) +- { +- rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx) +- = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1; +- +- ix86_expand_clear (scratch); +- emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch)); +- } +- else +- { +- rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx) +- = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2; +- +- emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2])); +- } +-} +- +-void +-ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode) +-{ +- rtx (*gen_ashr3)(rtx, rtx, rtx) +- = mode == DImode ? gen_ashrsi3 : gen_ashrdi3; +- rtx (*gen_shrd)(rtx, rtx, rtx); +- int half_width = GET_MODE_BITSIZE (mode) >> 1; +- +- rtx low[2], high[2]; +- int count; +- +- if (CONST_INT_P (operands[2])) +- { +- split_double_mode (mode, operands, 2, low, high); +- count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1); +- +- if (count == GET_MODE_BITSIZE (mode) - 1) +- { +- emit_move_insn (high[0], high[1]); +- emit_insn (gen_ashr3 (high[0], high[0], +- GEN_INT (half_width - 1))); +- emit_move_insn (low[0], high[0]); +- +- } +- else if (count >= half_width) +- { +- emit_move_insn (low[0], high[1]); +- emit_move_insn (high[0], low[0]); +- emit_insn (gen_ashr3 (high[0], high[0], +- GEN_INT (half_width - 1))); +- +- if (count > half_width) +- emit_insn (gen_ashr3 (low[0], low[0], +- GEN_INT (count - half_width))); +- } +- else +- { +- gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd; +- +- if (!rtx_equal_p (operands[0], operands[1])) +- emit_move_insn (operands[0], operands[1]); +- +- emit_insn (gen_shrd (low[0], high[0], GEN_INT (count))); +- emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count))); +- } +- } +- else +- { +- gen_shrd = mode == DImode ? 
gen_x86_shrd : gen_x86_64_shrd; +- +- if (!rtx_equal_p (operands[0], operands[1])) +- emit_move_insn (operands[0], operands[1]); +- +- split_double_mode (mode, operands, 1, low, high); +- +- emit_insn (gen_shrd (low[0], high[0], operands[2])); +- emit_insn (gen_ashr3 (high[0], high[0], operands[2])); +- +- if (TARGET_CMOVE && scratch) +- { +- rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx) +- = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1; +- +- emit_move_insn (scratch, high[0]); +- emit_insn (gen_ashr3 (scratch, scratch, +- GEN_INT (half_width - 1))); +- emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2], +- scratch)); +- } +- else +- { +- rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx) +- = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3; +- +- emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2])); +- } +- } +-} +- +-void +-ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode) +-{ +- rtx (*gen_lshr3)(rtx, rtx, rtx) +- = mode == DImode ? gen_lshrsi3 : gen_lshrdi3; +- rtx (*gen_shrd)(rtx, rtx, rtx); +- int half_width = GET_MODE_BITSIZE (mode) >> 1; +- +- rtx low[2], high[2]; +- int count; +- +- if (CONST_INT_P (operands[2])) +- { +- split_double_mode (mode, operands, 2, low, high); +- count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1); +- +- if (count >= half_width) +- { +- emit_move_insn (low[0], high[1]); +- ix86_expand_clear (high[0]); +- +- if (count > half_width) +- emit_insn (gen_lshr3 (low[0], low[0], +- GEN_INT (count - half_width))); +- } +- else +- { +- gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd; +- +- if (!rtx_equal_p (operands[0], operands[1])) +- emit_move_insn (operands[0], operands[1]); +- +- emit_insn (gen_shrd (low[0], high[0], GEN_INT (count))); +- emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count))); +- } +- } +- else +- { +- gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd; +- +- if (!rtx_equal_p (operands[0], operands[1])) +- emit_move_insn (operands[0], operands[1]); +- +- split_double_mode (mode, operands, 1, low, high); +- +- emit_insn (gen_shrd (low[0], high[0], operands[2])); +- emit_insn (gen_lshr3 (high[0], high[0], operands[2])); +- +- if (TARGET_CMOVE && scratch) +- { +- rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx) +- = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1; +- +- ix86_expand_clear (scratch); +- emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2], +- scratch)); +- } +- else +- { +- rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx) +- = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2; +- +- emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2])); +- } +- } +-} +- +-/* Predict just emitted jump instruction to be taken with probability PROB. */ +-static void +-predict_jump (int prob) +-{ +- rtx_insn *insn = get_last_insn (); +- gcc_assert (JUMP_P (insn)); +- add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob)); +-} +- +-/* Helper function for the string operations below. Dest VARIABLE whether +- it is aligned to VALUE bytes. If true, jump to the label. 
*/ +-static rtx_code_label * +-ix86_expand_aligntest (rtx variable, int value, bool epilogue) +-{ +- rtx_code_label *label = gen_label_rtx (); +- rtx tmpcount = gen_reg_rtx (GET_MODE (variable)); +- if (GET_MODE (variable) == DImode) +- emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value))); +- else +- emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value))); +- emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable), +- 1, label); +- if (epilogue) +- predict_jump (REG_BR_PROB_BASE * 50 / 100); +- else +- predict_jump (REG_BR_PROB_BASE * 90 / 100); +- return label; +-} +- +-/* Adjust COUNTER by the VALUE. */ +-static void +-ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value) +-{ +- rtx (*gen_add)(rtx, rtx, rtx) +- = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3; +- +- emit_insn (gen_add (countreg, countreg, GEN_INT (-value))); +-} +- +-/* Zero extend possibly SImode EXP to Pmode register. */ +-rtx +-ix86_zero_extend_to_Pmode (rtx exp) +-{ +- return force_reg (Pmode, convert_to_mode (Pmode, exp, 1)); +-} +- +-/* Divide COUNTREG by SCALE. */ +-static rtx +-scale_counter (rtx countreg, int scale) +-{ +- rtx sc; +- +- if (scale == 1) +- return countreg; +- if (CONST_INT_P (countreg)) +- return GEN_INT (INTVAL (countreg) / scale); +- gcc_assert (REG_P (countreg)); +- +- sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg, +- GEN_INT (exact_log2 (scale)), +- NULL, 1, OPTAB_DIRECT); +- return sc; +-} +- +-/* Return mode for the memcpy/memset loop counter. Prefer SImode over +- DImode for constant loop counts. */ +- +-static machine_mode +-counter_mode (rtx count_exp) +-{ +- if (GET_MODE (count_exp) != VOIDmode) +- return GET_MODE (count_exp); +- if (!CONST_INT_P (count_exp)) +- return Pmode; +- if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff)) +- return DImode; +- return SImode; +-} +- +-/* Copy the address to a Pmode register. This is used for x32 to +- truncate DImode TLS address to a SImode register. */ +- +-static rtx +-ix86_copy_addr_to_reg (rtx addr) +-{ +- rtx reg; +- if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode) +- { +- reg = copy_addr_to_reg (addr); +- REG_POINTER (reg) = 1; +- return reg; +- } +- else +- { +- gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode); +- reg = copy_to_mode_reg (DImode, addr); +- REG_POINTER (reg) = 1; +- return gen_rtx_SUBREG (SImode, reg, 0); +- } +-} +- +-/* When ISSETMEM is FALSE, output simple loop to move memory pointer to SRCPTR +- to DESTPTR via chunks of MODE unrolled UNROLL times, overall size is COUNT +- specified in bytes. When ISSETMEM is TRUE, output the equivalent loop to set +- memory by VALUE (supposed to be in MODE). +- +- The size is rounded down to whole number of chunk size moved at once. +- SRCMEM and DESTMEM provide MEMrtx to feed proper aliasing info. 
*/ +- +- +-static void +-expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem, +- rtx destptr, rtx srcptr, rtx value, +- rtx count, machine_mode mode, int unroll, +- int expected_size, bool issetmem) +-{ +- rtx_code_label *out_label, *top_label; +- rtx iter, tmp; +- machine_mode iter_mode = counter_mode (count); +- int piece_size_n = GET_MODE_SIZE (mode) * unroll; +- rtx piece_size = GEN_INT (piece_size_n); +- rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1)); +- rtx size; +- int i; +- +- top_label = gen_label_rtx (); +- out_label = gen_label_rtx (); +- iter = gen_reg_rtx (iter_mode); +- +- size = expand_simple_binop (iter_mode, AND, count, piece_size_mask, +- NULL, 1, OPTAB_DIRECT); +- /* Those two should combine. */ +- if (piece_size == const1_rtx) +- { +- emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode, +- true, out_label); +- predict_jump (REG_BR_PROB_BASE * 10 / 100); +- } +- emit_move_insn (iter, const0_rtx); +- +- emit_label (top_label); +- +- tmp = convert_modes (Pmode, iter_mode, iter, true); +- +- /* This assert could be relaxed - in this case we'll need to compute +- smallest power of two, containing in PIECE_SIZE_N and pass it to +- offset_address. */ +- gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0); +- destmem = offset_address (destmem, tmp, piece_size_n); +- destmem = adjust_address (destmem, mode, 0); +- +- if (!issetmem) +- { +- srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n); +- srcmem = adjust_address (srcmem, mode, 0); +- +- /* When unrolling for chips that reorder memory reads and writes, +- we can save registers by using single temporary. +- Also using 4 temporaries is overkill in 32bit mode. */ +- if (!TARGET_64BIT && 0) +- { +- for (i = 0; i < unroll; i++) +- { +- if (i) +- { +- destmem = adjust_address (copy_rtx (destmem), mode, +- GET_MODE_SIZE (mode)); +- srcmem = adjust_address (copy_rtx (srcmem), mode, +- GET_MODE_SIZE (mode)); +- } +- emit_move_insn (destmem, srcmem); +- } +- } +- else +- { +- rtx tmpreg[4]; +- gcc_assert (unroll <= 4); +- for (i = 0; i < unroll; i++) +- { +- tmpreg[i] = gen_reg_rtx (mode); +- if (i) +- srcmem = adjust_address (copy_rtx (srcmem), mode, +- GET_MODE_SIZE (mode)); +- emit_move_insn (tmpreg[i], srcmem); +- } +- for (i = 0; i < unroll; i++) +- { +- if (i) +- destmem = adjust_address (copy_rtx (destmem), mode, +- GET_MODE_SIZE (mode)); +- emit_move_insn (destmem, tmpreg[i]); +- } +- } +- } +- else +- for (i = 0; i < unroll; i++) +- { +- if (i) +- destmem = adjust_address (copy_rtx (destmem), mode, +- GET_MODE_SIZE (mode)); +- emit_move_insn (destmem, value); +- } +- +- tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter, +- true, OPTAB_LIB_WIDEN); +- if (tmp != iter) +- emit_move_insn (iter, tmp); +- +- emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode, +- true, top_label); +- if (expected_size != -1) +- { +- expected_size /= GET_MODE_SIZE (mode) * unroll; +- if (expected_size == 0) +- predict_jump (0); +- else if (expected_size > REG_BR_PROB_BASE) +- predict_jump (REG_BR_PROB_BASE - 1); +- else +- predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) +- / expected_size); +- } +- else +- predict_jump (REG_BR_PROB_BASE * 80 / 100); +- iter = ix86_zero_extend_to_Pmode (iter); +- tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr, +- true, OPTAB_LIB_WIDEN); +- if (tmp != destptr) +- emit_move_insn (destptr, tmp); +- if (!issetmem) +- { +- tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr, +- true, 
OPTAB_LIB_WIDEN); +- if (tmp != srcptr) +- emit_move_insn (srcptr, tmp); +- } +- emit_label (out_label); +-} +- +-/* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument. +- When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored. +- When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored. +- For setmem case, VALUE is a promoted to a wider size ORIG_VALUE. +- ORIG_VALUE is the original value passed to memset to fill the memory with. +- Other arguments have same meaning as for previous function. */ +- +-static void +-expand_set_or_movmem_via_rep (rtx destmem, rtx srcmem, +- rtx destptr, rtx srcptr, rtx value, rtx orig_value, +- rtx count, +- machine_mode mode, bool issetmem) +-{ +- rtx destexp; +- rtx srcexp; +- rtx countreg; +- HOST_WIDE_INT rounded_count; +- +- /* If possible, it is shorter to use rep movs. +- TODO: Maybe it is better to move this logic to decide_alg. */ +- if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3) +- && (!issetmem || orig_value == const0_rtx)) +- mode = SImode; +- +- if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode) +- destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0); +- +- countreg = ix86_zero_extend_to_Pmode (scale_counter (count, +- GET_MODE_SIZE (mode))); +- if (mode != QImode) +- { +- destexp = gen_rtx_ASHIFT (Pmode, countreg, +- GEN_INT (exact_log2 (GET_MODE_SIZE (mode)))); +- destexp = gen_rtx_PLUS (Pmode, destexp, destptr); +- } +- else +- destexp = gen_rtx_PLUS (Pmode, destptr, countreg); +- if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count)) +- { +- rounded_count +- = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode)); +- destmem = shallow_copy_rtx (destmem); +- set_mem_size (destmem, rounded_count); +- } +- else if (MEM_SIZE_KNOWN_P (destmem)) +- clear_mem_size (destmem); +- +- if (issetmem) +- { +- value = force_reg (mode, gen_lowpart (mode, value)); +- emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp)); +- } +- else +- { +- if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode) +- srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0); +- if (mode != QImode) +- { +- srcexp = gen_rtx_ASHIFT (Pmode, countreg, +- GEN_INT (exact_log2 (GET_MODE_SIZE (mode)))); +- srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr); +- } +- else +- srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg); +- if (CONST_INT_P (count)) +- { +- rounded_count +- = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode)); +- srcmem = shallow_copy_rtx (srcmem); +- set_mem_size (srcmem, rounded_count); +- } +- else +- { +- if (MEM_SIZE_KNOWN_P (srcmem)) +- clear_mem_size (srcmem); +- } +- emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg, +- destexp, srcexp)); +- } +-} +- +-/* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to +- DESTMEM. +- SRC is passed by pointer to be updated on return. +- Return value is updated DST. */ +-static rtx +-emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr, +- HOST_WIDE_INT size_to_move) +-{ +- rtx dst = destmem, src = *srcmem, adjust, tempreg; +- enum insn_code code; +- machine_mode move_mode; +- int piece_size, i; +- +- /* Find the widest mode in which we could perform moves. +- Start with the biggest power of 2 less than SIZE_TO_MOVE and half +- it until move of such size is supported. 
*/ +- piece_size = 1 << floor_log2 (size_to_move); +- while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode) +- || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing) +- { +- gcc_assert (piece_size > 1); +- piece_size >>= 1; +- } +- +- /* Find the corresponding vector mode with the same size as MOVE_MODE. +- MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */ +- if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode)) +- { +- int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode); +- if (!mode_for_vector (word_mode, nunits).exists (&move_mode) +- || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing) +- { +- move_mode = word_mode; +- piece_size = GET_MODE_SIZE (move_mode); +- code = optab_handler (mov_optab, move_mode); +- } +- } +- gcc_assert (code != CODE_FOR_nothing); +- +- dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0); +- src = adjust_automodify_address_nv (src, move_mode, srcptr, 0); +- +- /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */ +- gcc_assert (size_to_move % piece_size == 0); +- adjust = GEN_INT (piece_size); +- for (i = 0; i < size_to_move; i += piece_size) +- { +- /* We move from memory to memory, so we'll need to do it via +- a temporary register. */ +- tempreg = gen_reg_rtx (move_mode); +- emit_insn (GEN_FCN (code) (tempreg, src)); +- emit_insn (GEN_FCN (code) (dst, tempreg)); +- +- emit_move_insn (destptr, +- gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust)); +- emit_move_insn (srcptr, +- gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust)); +- +- dst = adjust_automodify_address_nv (dst, move_mode, destptr, +- piece_size); +- src = adjust_automodify_address_nv (src, move_mode, srcptr, +- piece_size); +- } +- +- /* Update DST and SRC rtx. */ +- *srcmem = src; +- return dst; +-} +- +-/* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */ +-static void +-expand_movmem_epilogue (rtx destmem, rtx srcmem, +- rtx destptr, rtx srcptr, rtx count, int max_size) +-{ +- rtx src, dest; +- if (CONST_INT_P (count)) +- { +- HOST_WIDE_INT countval = INTVAL (count); +- HOST_WIDE_INT epilogue_size = countval % max_size; +- int i; +- +- /* For now MAX_SIZE should be a power of 2. This assert could be +- relaxed, but it'll require a bit more complicated epilogue +- expanding. */ +- gcc_assert ((max_size & (max_size - 1)) == 0); +- for (i = max_size; i >= 1; i >>= 1) +- { +- if (epilogue_size & i) +- destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i); +- } +- return; +- } +- if (max_size > 8) +- { +- count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1), +- count, 1, OPTAB_DIRECT); +- expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL, +- count, QImode, 1, 4, false); +- return; +- } +- +- /* When there are stringops, we can cheaply increase dest and src pointers. +- Otherwise we save code size by maintaining offset (zero is readily +- available from preceding rep operation) and using x86 addressing modes. 
+- */ +- if (TARGET_SINGLE_STRINGOP) +- { +- if (max_size > 4) +- { +- rtx_code_label *label = ix86_expand_aligntest (count, 4, true); +- src = change_address (srcmem, SImode, srcptr); +- dest = change_address (destmem, SImode, destptr); +- emit_insn (gen_strmov (destptr, dest, srcptr, src)); +- emit_label (label); +- LABEL_NUSES (label) = 1; +- } +- if (max_size > 2) +- { +- rtx_code_label *label = ix86_expand_aligntest (count, 2, true); +- src = change_address (srcmem, HImode, srcptr); +- dest = change_address (destmem, HImode, destptr); +- emit_insn (gen_strmov (destptr, dest, srcptr, src)); +- emit_label (label); +- LABEL_NUSES (label) = 1; +- } +- if (max_size > 1) +- { +- rtx_code_label *label = ix86_expand_aligntest (count, 1, true); +- src = change_address (srcmem, QImode, srcptr); +- dest = change_address (destmem, QImode, destptr); +- emit_insn (gen_strmov (destptr, dest, srcptr, src)); +- emit_label (label); +- LABEL_NUSES (label) = 1; +- } +- } +- else +- { +- rtx offset = force_reg (Pmode, const0_rtx); +- rtx tmp; +- +- if (max_size > 4) +- { +- rtx_code_label *label = ix86_expand_aligntest (count, 4, true); +- src = change_address (srcmem, SImode, srcptr); +- dest = change_address (destmem, SImode, destptr); +- emit_move_insn (dest, src); +- tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL, +- true, OPTAB_LIB_WIDEN); +- if (tmp != offset) +- emit_move_insn (offset, tmp); +- emit_label (label); +- LABEL_NUSES (label) = 1; +- } +- if (max_size > 2) +- { +- rtx_code_label *label = ix86_expand_aligntest (count, 2, true); +- tmp = gen_rtx_PLUS (Pmode, srcptr, offset); +- src = change_address (srcmem, HImode, tmp); +- tmp = gen_rtx_PLUS (Pmode, destptr, offset); +- dest = change_address (destmem, HImode, tmp); +- emit_move_insn (dest, src); +- tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp, +- true, OPTAB_LIB_WIDEN); +- if (tmp != offset) +- emit_move_insn (offset, tmp); +- emit_label (label); +- LABEL_NUSES (label) = 1; +- } +- if (max_size > 1) +- { +- rtx_code_label *label = ix86_expand_aligntest (count, 1, true); +- tmp = gen_rtx_PLUS (Pmode, srcptr, offset); +- src = change_address (srcmem, QImode, tmp); +- tmp = gen_rtx_PLUS (Pmode, destptr, offset); +- dest = change_address (destmem, QImode, tmp); +- emit_move_insn (dest, src); +- emit_label (label); +- LABEL_NUSES (label) = 1; +- } +- } +-} +- +-/* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM +- with value PROMOTED_VAL. +- SRC is passed by pointer to be updated on return. +- Return value is updated DST. */ +-static rtx +-emit_memset (rtx destmem, rtx destptr, rtx promoted_val, +- HOST_WIDE_INT size_to_move) +-{ +- rtx dst = destmem, adjust; +- enum insn_code code; +- machine_mode move_mode; +- int piece_size, i; +- +- /* Find the widest mode in which we could perform moves. +- Start with the biggest power of 2 less than SIZE_TO_MOVE and half +- it until move of such size is supported. */ +- move_mode = GET_MODE (promoted_val); +- if (move_mode == VOIDmode) +- move_mode = QImode; +- if (size_to_move < GET_MODE_SIZE (move_mode)) +- { +- unsigned int move_bits = size_to_move * BITS_PER_UNIT; +- move_mode = int_mode_for_size (move_bits, 0).require (); +- promoted_val = gen_lowpart (move_mode, promoted_val); +- } +- piece_size = GET_MODE_SIZE (move_mode); +- code = optab_handler (mov_optab, move_mode); +- gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX); +- +- dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0); +- +- /* Emit moves. 
We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */ +- gcc_assert (size_to_move % piece_size == 0); +- adjust = GEN_INT (piece_size); +- for (i = 0; i < size_to_move; i += piece_size) +- { +- if (piece_size <= GET_MODE_SIZE (word_mode)) +- { +- emit_insn (gen_strset (destptr, dst, promoted_val)); +- dst = adjust_automodify_address_nv (dst, move_mode, destptr, +- piece_size); +- continue; +- } +- +- emit_insn (GEN_FCN (code) (dst, promoted_val)); +- +- emit_move_insn (destptr, +- gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust)); +- +- dst = adjust_automodify_address_nv (dst, move_mode, destptr, +- piece_size); +- } +- +- /* Update DST rtx. */ +- return dst; +-} +-/* Output code to set at most count & (max_size - 1) bytes starting by DEST. */ +-static void +-expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value, +- rtx count, int max_size) +-{ +- count = expand_simple_binop (counter_mode (count), AND, count, +- GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT); +- expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL, +- gen_lowpart (QImode, value), count, QImode, +- 1, max_size / 2, true); +-} +- +-/* Output code to set at most count & (max_size - 1) bytes starting by DEST. */ +-static void +-expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value, +- rtx count, int max_size) +-{ +- rtx dest; +- +- if (CONST_INT_P (count)) +- { +- HOST_WIDE_INT countval = INTVAL (count); +- HOST_WIDE_INT epilogue_size = countval % max_size; +- int i; +- +- /* For now MAX_SIZE should be a power of 2. This assert could be +- relaxed, but it'll require a bit more complicated epilogue +- expanding. */ +- gcc_assert ((max_size & (max_size - 1)) == 0); +- for (i = max_size; i >= 1; i >>= 1) +- { +- if (epilogue_size & i) +- { +- if (vec_value && i > GET_MODE_SIZE (GET_MODE (value))) +- destmem = emit_memset (destmem, destptr, vec_value, i); +- else +- destmem = emit_memset (destmem, destptr, value, i); +- } +- } +- return; +- } +- if (max_size > 32) +- { +- expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size); +- return; +- } +- if (max_size > 16) +- { +- rtx_code_label *label = ix86_expand_aligntest (count, 16, true); +- if (TARGET_64BIT) +- { +- dest = change_address (destmem, DImode, destptr); +- emit_insn (gen_strset (destptr, dest, value)); +- dest = adjust_automodify_address_nv (dest, DImode, destptr, 8); +- emit_insn (gen_strset (destptr, dest, value)); +- } +- else +- { +- dest = change_address (destmem, SImode, destptr); +- emit_insn (gen_strset (destptr, dest, value)); +- dest = adjust_automodify_address_nv (dest, SImode, destptr, 4); +- emit_insn (gen_strset (destptr, dest, value)); +- dest = adjust_automodify_address_nv (dest, SImode, destptr, 8); +- emit_insn (gen_strset (destptr, dest, value)); +- dest = adjust_automodify_address_nv (dest, SImode, destptr, 12); +- emit_insn (gen_strset (destptr, dest, value)); +- } +- emit_label (label); +- LABEL_NUSES (label) = 1; +- } +- if (max_size > 8) +- { +- rtx_code_label *label = ix86_expand_aligntest (count, 8, true); +- if (TARGET_64BIT) +- { +- dest = change_address (destmem, DImode, destptr); +- emit_insn (gen_strset (destptr, dest, value)); +- } +- else +- { +- dest = change_address (destmem, SImode, destptr); +- emit_insn (gen_strset (destptr, dest, value)); +- dest = adjust_automodify_address_nv (dest, SImode, destptr, 4); +- emit_insn (gen_strset (destptr, dest, value)); +- } +- emit_label (label); +- LABEL_NUSES (label) = 1; +- } +- if (max_size > 4) +- { +- rtx_code_label *label = 
ix86_expand_aligntest (count, 4, true); +- dest = change_address (destmem, SImode, destptr); +- emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value))); +- emit_label (label); +- LABEL_NUSES (label) = 1; +- } +- if (max_size > 2) +- { +- rtx_code_label *label = ix86_expand_aligntest (count, 2, true); +- dest = change_address (destmem, HImode, destptr); +- emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value))); +- emit_label (label); +- LABEL_NUSES (label) = 1; +- } +- if (max_size > 1) +- { +- rtx_code_label *label = ix86_expand_aligntest (count, 1, true); +- dest = change_address (destmem, QImode, destptr); +- emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value))); +- emit_label (label); +- LABEL_NUSES (label) = 1; +- } +-} +- +-/* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to +- DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN. +- Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are +- ignored. +- Return value is updated DESTMEM. */ +-static rtx +-expand_set_or_movmem_prologue (rtx destmem, rtx srcmem, +- rtx destptr, rtx srcptr, rtx value, +- rtx vec_value, rtx count, int align, +- int desired_alignment, bool issetmem) +-{ +- int i; +- for (i = 1; i < desired_alignment; i <<= 1) +- { +- if (align <= i) +- { +- rtx_code_label *label = ix86_expand_aligntest (destptr, i, false); +- if (issetmem) +- { +- if (vec_value && i > GET_MODE_SIZE (GET_MODE (value))) +- destmem = emit_memset (destmem, destptr, vec_value, i); +- else +- destmem = emit_memset (destmem, destptr, value, i); +- } +- else +- destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i); +- ix86_adjust_counter (count, i); +- emit_label (label); +- LABEL_NUSES (label) = 1; +- set_mem_align (destmem, i * 2 * BITS_PER_UNIT); +- } +- } +- return destmem; +-} +- +-/* Test if COUNT&SIZE is nonzero and if so, expand movme +- or setmem sequence that is valid for SIZE..2*SIZE-1 bytes +- and jump to DONE_LABEL. */ +-static void +-expand_small_movmem_or_setmem (rtx destmem, rtx srcmem, +- rtx destptr, rtx srcptr, +- rtx value, rtx vec_value, +- rtx count, int size, +- rtx done_label, bool issetmem) +-{ +- rtx_code_label *label = ix86_expand_aligntest (count, size, false); +- machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk (); +- rtx modesize; +- int n; +- +- /* If we do not have vector value to copy, we must reduce size. */ +- if (issetmem) +- { +- if (!vec_value) +- { +- if (GET_MODE (value) == VOIDmode && size > 8) +- mode = Pmode; +- else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value))) +- mode = GET_MODE (value); +- } +- else +- mode = GET_MODE (vec_value), value = vec_value; +- } +- else +- { +- /* Choose appropriate vector mode. */ +- if (size >= 32) +- mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode; +- else if (size >= 16) +- mode = TARGET_SSE ? 
V16QImode : DImode; +- srcmem = change_address (srcmem, mode, srcptr); +- } +- destmem = change_address (destmem, mode, destptr); +- modesize = GEN_INT (GET_MODE_SIZE (mode)); +- gcc_assert (GET_MODE_SIZE (mode) <= size); +- for (n = 0; n * GET_MODE_SIZE (mode) < size; n++) +- { +- if (issetmem) +- emit_move_insn (destmem, gen_lowpart (mode, value)); +- else +- { +- emit_move_insn (destmem, srcmem); +- srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode)); +- } +- destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode)); +- } +- +- destmem = offset_address (destmem, count, 1); +- destmem = offset_address (destmem, GEN_INT (-2 * size), +- GET_MODE_SIZE (mode)); +- if (!issetmem) +- { +- srcmem = offset_address (srcmem, count, 1); +- srcmem = offset_address (srcmem, GEN_INT (-2 * size), +- GET_MODE_SIZE (mode)); +- } +- for (n = 0; n * GET_MODE_SIZE (mode) < size; n++) +- { +- if (issetmem) +- emit_move_insn (destmem, gen_lowpart (mode, value)); +- else +- { +- emit_move_insn (destmem, srcmem); +- srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode)); +- } +- destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode)); +- } +- emit_jump_insn (gen_jump (done_label)); +- emit_barrier (); +- +- emit_label (label); +- LABEL_NUSES (label) = 1; +-} +- +-/* Handle small memcpy (up to SIZE that is supposed to be small power of 2. +- and get ready for the main memcpy loop by copying iniital DESIRED_ALIGN-ALIGN +- bytes and last SIZE bytes adjusitng DESTPTR/SRCPTR/COUNT in a way we can +- proceed with an loop copying SIZE bytes at once. Do moves in MODE. +- DONE_LABEL is a label after the whole copying sequence. The label is created +- on demand if *DONE_LABEL is NULL. +- MIN_SIZE is minimal size of block copied. This value gets adjusted for new +- bounds after the initial copies. +- +- DESTMEM/SRCMEM are memory expressions pointing to the copies block, +- DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicate whether +- we will dispatch to a library call for large blocks. +- +- In pseudocode we do: +- +- if (COUNT < SIZE) +- { +- Assume that SIZE is 4. 
Bigger sizes are handled analogously +- if (COUNT & 4) +- { +- copy 4 bytes from SRCPTR to DESTPTR +- copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4 +- goto done_label +- } +- if (!COUNT) +- goto done_label; +- copy 1 byte from SRCPTR to DESTPTR +- if (COUNT & 2) +- { +- copy 2 bytes from SRCPTR to DESTPTR +- copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2 +- } +- } +- else +- { +- copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR +- copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE +- +- OLD_DESPTR = DESTPTR; +- Align DESTPTR up to DESIRED_ALIGN +- SRCPTR += DESTPTR - OLD_DESTPTR +- COUNT -= DEST_PTR - OLD_DESTPTR +- if (DYNAMIC_CHECK) +- Round COUNT down to multiple of SIZE +- << optional caller supplied zero size guard is here >> +- << optional caller supplied dynamic check is here >> +- << caller supplied main copy loop is here >> +- } +- done_label: +- */ +-static void +-expand_set_or_movmem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem, +- rtx *destptr, rtx *srcptr, +- machine_mode mode, +- rtx value, rtx vec_value, +- rtx *count, +- rtx_code_label **done_label, +- int size, +- int desired_align, +- int align, +- unsigned HOST_WIDE_INT *min_size, +- bool dynamic_check, +- bool issetmem) +-{ +- rtx_code_label *loop_label = NULL, *label; +- int n; +- rtx modesize; +- int prolog_size = 0; +- rtx mode_value; +- +- /* Chose proper value to copy. */ +- if (issetmem && VECTOR_MODE_P (mode)) +- mode_value = vec_value; +- else +- mode_value = value; +- gcc_assert (GET_MODE_SIZE (mode) <= size); +- +- /* See if block is big or small, handle small blocks. */ +- if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size) +- { +- int size2 = size; +- loop_label = gen_label_rtx (); +- +- if (!*done_label) +- *done_label = gen_label_rtx (); +- +- emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count), +- 1, loop_label); +- size2 >>= 1; +- +- /* Handle sizes > 3. */ +- for (;size2 > 2; size2 >>= 1) +- expand_small_movmem_or_setmem (destmem, srcmem, +- *destptr, *srcptr, +- value, vec_value, +- *count, +- size2, *done_label, issetmem); +- /* Nothing to copy? Jump to DONE_LABEL if so */ +- emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count), +- 1, *done_label); +- +- /* Do a byte copy. */ +- destmem = change_address (destmem, QImode, *destptr); +- if (issetmem) +- emit_move_insn (destmem, gen_lowpart (QImode, value)); +- else +- { +- srcmem = change_address (srcmem, QImode, *srcptr); +- emit_move_insn (destmem, srcmem); +- } +- +- /* Handle sizes 2 and 3. */ +- label = ix86_expand_aligntest (*count, 2, false); +- destmem = change_address (destmem, HImode, *destptr); +- destmem = offset_address (destmem, *count, 1); +- destmem = offset_address (destmem, GEN_INT (-2), 2); +- if (issetmem) +- emit_move_insn (destmem, gen_lowpart (HImode, value)); +- else +- { +- srcmem = change_address (srcmem, HImode, *srcptr); +- srcmem = offset_address (srcmem, *count, 1); +- srcmem = offset_address (srcmem, GEN_INT (-2), 2); +- emit_move_insn (destmem, srcmem); +- } +- +- emit_label (label); +- LABEL_NUSES (label) = 1; +- emit_jump_insn (gen_jump (*done_label)); +- emit_barrier (); +- } +- else +- gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size +- || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size); +- +- /* Start memcpy for COUNT >= SIZE. */ +- if (loop_label) +- { +- emit_label (loop_label); +- LABEL_NUSES (loop_label) = 1; +- } +- +- /* Copy first desired_align bytes. 
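The pseudocode above leans on a standard trick: a block whose length is known to lie in the range [SIZE, 2*SIZE-1] can be covered completely by two fixed-size moves, one starting at the first byte and one ending at the last byte, even though the two may overlap in the middle. A minimal standalone C sketch of the idea, with hypothetical names and plain memcpy standing in for the emitted moves:

    #include <stdio.h>
    #include <string.h>
    #include <stddef.h>

    /* Copy N bytes, where SIZE <= N <= 2*SIZE - 1, using exactly two
       moves of SIZE bytes each.  The destination ranges may overlap in
       the middle when N < 2*SIZE, but the overlapping bytes receive the
       same values from SRC either way, so the result is correct.  */
    static void copy_head_tail (char *dst, const char *src, size_t n, size_t size)
    {
      memcpy (dst, src, size);                        /* first SIZE bytes */
      memcpy (dst + n - size, src + n - size, size);  /* last SIZE bytes  */
    }

    int main (void)
    {
      char in[] = "0123456789", out[11] = { 0 };
      copy_head_tail (out, in, 7, 4);   /* moves at offsets 0 and 3 */
      puts (out);                       /* prints 0123456 */
      return 0;
    }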
*/ +- if (!issetmem) +- srcmem = change_address (srcmem, mode, *srcptr); +- destmem = change_address (destmem, mode, *destptr); +- modesize = GEN_INT (GET_MODE_SIZE (mode)); +- for (n = 0; prolog_size < desired_align - align; n++) +- { +- if (issetmem) +- emit_move_insn (destmem, mode_value); +- else +- { +- emit_move_insn (destmem, srcmem); +- srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode)); +- } +- destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode)); +- prolog_size += GET_MODE_SIZE (mode); +- } +- +- +- /* Copy last SIZE bytes. */ +- destmem = offset_address (destmem, *count, 1); +- destmem = offset_address (destmem, +- GEN_INT (-size - prolog_size), +- 1); +- if (issetmem) +- emit_move_insn (destmem, mode_value); +- else +- { +- srcmem = offset_address (srcmem, *count, 1); +- srcmem = offset_address (srcmem, +- GEN_INT (-size - prolog_size), +- 1); +- emit_move_insn (destmem, srcmem); +- } +- for (n = 1; n * GET_MODE_SIZE (mode) < size; n++) +- { +- destmem = offset_address (destmem, modesize, 1); +- if (issetmem) +- emit_move_insn (destmem, mode_value); +- else +- { +- srcmem = offset_address (srcmem, modesize, 1); +- emit_move_insn (destmem, srcmem); +- } +- } +- +- /* Align destination. */ +- if (desired_align > 1 && desired_align > align) +- { +- rtx saveddest = *destptr; +- +- gcc_assert (desired_align <= size); +- /* Align destptr up, place it to new register. */ +- *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr, +- GEN_INT (prolog_size), +- NULL_RTX, 1, OPTAB_DIRECT); +- if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest)) +- REG_POINTER (*destptr) = 1; +- *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr, +- GEN_INT (-desired_align), +- *destptr, 1, OPTAB_DIRECT); +- /* See how many bytes we skipped. */ +- saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest, +- *destptr, +- saveddest, 1, OPTAB_DIRECT); +- /* Adjust srcptr and count. */ +- if (!issetmem) +- *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr, +- saveddest, *srcptr, 1, OPTAB_DIRECT); +- *count = expand_simple_binop (GET_MODE (*count), PLUS, *count, +- saveddest, *count, 1, OPTAB_DIRECT); +- /* We copied at most size + prolog_size. */ +- if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size)) +- *min_size +- = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size); +- else +- *min_size = 0; +- +- /* Our loops always round down the block size, but for dispatch to +- library we need precise value. */ +- if (dynamic_check) +- *count = expand_simple_binop (GET_MODE (*count), AND, *count, +- GEN_INT (-size), *count, 1, OPTAB_DIRECT); +- } +- else +- { +- gcc_assert (prolog_size == 0); +- /* Decrease count, so we won't end up copying last word twice. */ +- if (!CONST_INT_P (*count)) +- *count = expand_simple_binop (GET_MODE (*count), PLUS, *count, +- constm1_rtx, *count, 1, OPTAB_DIRECT); +- else +- *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1, +- (unsigned HOST_WIDE_INT)size)); +- if (*min_size) +- *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size); +- } +-} +- +- +-/* This function is like the previous one, except here we know how many bytes +- need to be copied. That allows us to update alignment not only of DST, which +- is returned, but also of SRC, which is passed as a pointer for that +- reason. 
*/ +-static rtx +-expand_set_or_movmem_constant_prologue (rtx dst, rtx *srcp, rtx destreg, +- rtx srcreg, rtx value, rtx vec_value, +- int desired_align, int align_bytes, +- bool issetmem) +-{ +- rtx src = NULL; +- rtx orig_dst = dst; +- rtx orig_src = NULL; +- int piece_size = 1; +- int copied_bytes = 0; +- +- if (!issetmem) +- { +- gcc_assert (srcp != NULL); +- src = *srcp; +- orig_src = src; +- } +- +- for (piece_size = 1; +- piece_size <= desired_align && copied_bytes < align_bytes; +- piece_size <<= 1) +- { +- if (align_bytes & piece_size) +- { +- if (issetmem) +- { +- if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value))) +- dst = emit_memset (dst, destreg, vec_value, piece_size); +- else +- dst = emit_memset (dst, destreg, value, piece_size); +- } +- else +- dst = emit_memmov (dst, &src, destreg, srcreg, piece_size); +- copied_bytes += piece_size; +- } +- } +- if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT) +- set_mem_align (dst, desired_align * BITS_PER_UNIT); +- if (MEM_SIZE_KNOWN_P (orig_dst)) +- set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes); +- +- if (!issetmem) +- { +- int src_align_bytes = get_mem_align_offset (src, desired_align +- * BITS_PER_UNIT); +- if (src_align_bytes >= 0) +- src_align_bytes = desired_align - src_align_bytes; +- if (src_align_bytes >= 0) +- { +- unsigned int src_align; +- for (src_align = desired_align; src_align >= 2; src_align >>= 1) +- { +- if ((src_align_bytes & (src_align - 1)) +- == (align_bytes & (src_align - 1))) +- break; +- } +- if (src_align > (unsigned int) desired_align) +- src_align = desired_align; +- if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT) +- set_mem_align (src, src_align * BITS_PER_UNIT); +- } +- if (MEM_SIZE_KNOWN_P (orig_src)) +- set_mem_size (src, MEM_SIZE (orig_src) - align_bytes); +- *srcp = src; +- } +- +- return dst; +-} +- +-/* Return true if ALG can be used in current context. +- Assume we expand memset if MEMSET is true. */ +-static bool +-alg_usable_p (enum stringop_alg alg, bool memset, bool have_as) +-{ +- if (alg == no_stringop) +- return false; +- if (alg == vector_loop) +- return TARGET_SSE || TARGET_AVX; +- /* Algorithms using the rep prefix want at least edi and ecx; +- additionally, memset wants eax and memcpy wants esi. Don't +- consider such algorithms if the user has appropriated those +- registers for their own purposes, or if we have a non-default +- address space, since some string insns cannot override the segment. */ +- if (alg == rep_prefix_1_byte +- || alg == rep_prefix_4_byte +- || alg == rep_prefix_8_byte) +- { +- if (have_as) +- return false; +- if (fixed_regs[CX_REG] +- || fixed_regs[DI_REG] +- || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG])) +- return false; +- } +- return true; +-} +- +-/* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */ +-static enum stringop_alg +-decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, +- unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size, +- bool memset, bool zero_memset, bool have_as, +- int *dynamic_check, bool *noalign, bool recur) +-{ +- const struct stringop_algs *algs; +- bool optimize_for_speed; +- int max = 0; +- const struct processor_costs *cost; +- int i; +- bool any_alg_usable_p = false; +- +- *noalign = false; +- *dynamic_check = -1; +- +- /* Even if the string operation call is cold, we still might spend a lot +- of time processing large blocks. 
*/ +- if (optimize_function_for_size_p (cfun) +- || (optimize_insn_for_size_p () +- && (max_size < 256 +- || (expected_size != -1 && expected_size < 256)))) +- optimize_for_speed = false; +- else +- optimize_for_speed = true; +- +- cost = optimize_for_speed ? ix86_cost : &ix86_size_cost; +- if (memset) +- algs = &cost->memset[TARGET_64BIT != 0]; +- else +- algs = &cost->memcpy[TARGET_64BIT != 0]; +- +- /* See maximal size for user defined algorithm. */ +- for (i = 0; i < MAX_STRINGOP_ALGS; i++) +- { +- enum stringop_alg candidate = algs->size[i].alg; +- bool usable = alg_usable_p (candidate, memset, have_as); +- any_alg_usable_p |= usable; +- +- if (candidate != libcall && candidate && usable) +- max = algs->size[i].max; +- } +- +- /* If expected size is not known but max size is small enough +- so inline version is a win, set expected size into +- the range. */ +- if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1) +- && expected_size == -1) +- expected_size = min_size / 2 + max_size / 2; +- +- /* If user specified the algorithm, honor it if possible. */ +- if (ix86_stringop_alg != no_stringop +- && alg_usable_p (ix86_stringop_alg, memset, have_as)) +- return ix86_stringop_alg; +- /* rep; movq or rep; movl is the smallest variant. */ +- else if (!optimize_for_speed) +- { +- *noalign = true; +- if (!count || (count & 3) || (memset && !zero_memset)) +- return alg_usable_p (rep_prefix_1_byte, memset, have_as) +- ? rep_prefix_1_byte : loop_1_byte; +- else +- return alg_usable_p (rep_prefix_4_byte, memset, have_as) +- ? rep_prefix_4_byte : loop; +- } +- /* Very tiny blocks are best handled via the loop, REP is expensive to +- setup. */ +- else if (expected_size != -1 && expected_size < 4) +- return loop_1_byte; +- else if (expected_size != -1) +- { +- enum stringop_alg alg = libcall; +- bool alg_noalign = false; +- for (i = 0; i < MAX_STRINGOP_ALGS; i++) +- { +- /* We get here if the algorithms that were not libcall-based +- were rep-prefix based and we are unable to use rep prefixes +- based on global register usage. Break out of the loop and +- use the heuristic below. */ +- if (algs->size[i].max == 0) +- break; +- if (algs->size[i].max >= expected_size || algs->size[i].max == -1) +- { +- enum stringop_alg candidate = algs->size[i].alg; +- +- if (candidate != libcall +- && alg_usable_p (candidate, memset, have_as)) +- { +- alg = candidate; +- alg_noalign = algs->size[i].noalign; +- } +- /* Honor TARGET_INLINE_ALL_STRINGOPS by picking +- last non-libcall inline algorithm. */ +- if (TARGET_INLINE_ALL_STRINGOPS) +- { +- /* When the current size is best to be copied by a libcall, +- but we are still forced to inline, run the heuristic below +- that will pick code for medium sized blocks. */ +- if (alg != libcall) +- { +- *noalign = alg_noalign; +- return alg; +- } +- else if (!any_alg_usable_p) +- break; +- } +- else if (alg_usable_p (candidate, memset, have_as)) +- { +- *noalign = algs->size[i].noalign; +- return candidate; +- } +- } +- } +- } +- /* When asked to inline the call anyway, try to pick meaningful choice. +- We look for maximal size of block that is faster to copy by hand and +- take blocks of at most of that size guessing that average size will +- be roughly half of the block. +- +- If this turns out to be bad, we might simply specify the preferred +- choice in ix86_costs. 
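The loop above scans a per-size table taken from the tuned cost structures: entries are visited in order, the first usable entry whose upper bound covers the expected block size wins, a bound of -1 means "no limit", and a bound of 0 terminates the table. A simplified standalone C sketch of that lookup, with made-up strategy names and thresholds and without the usability or alignment checks:

    #include <stdio.h>

    enum strategy { BYTE_LOOP, UNROLLED_LOOP, REP_PREFIX, LIBCALL };
    struct size_entry { long max; enum strategy alg; };

    /* Return the first strategy whose MAX covers EXPECTED; -1 means
       unlimited, 0 ends the table.  */
    static enum strategy pick_strategy (const struct size_entry *tab, long expected)
    {
      for (int i = 0; tab[i].max != 0; i++)
        if (tab[i].max == -1 || tab[i].max >= expected)
          return tab[i].alg;
      return LIBCALL;
    }

    int main (void)
    {
      const struct size_entry tab[]
        = { { 16, BYTE_LOOP }, { 128, UNROLLED_LOOP },
            { -1, REP_PREFIX }, { 0, BYTE_LOOP } };
      printf ("%d %d %d\n", pick_strategy (tab, 8),
              pick_strategy (tab, 100), pick_strategy (tab, 100000));
      return 0;
    }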
*/ +- if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY) +- && (algs->unknown_size == libcall +- || !alg_usable_p (algs->unknown_size, memset, have_as))) +- { +- enum stringop_alg alg; +- HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2; +- +- /* If there aren't any usable algorithms or if recursing already, +- then recursing on smaller sizes or same size isn't going to +- find anything. Just return the simple byte-at-a-time copy loop. */ +- if (!any_alg_usable_p || recur) +- { +- /* Pick something reasonable. */ +- if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur) +- *dynamic_check = 128; +- return loop_1_byte; +- } +- alg = decide_alg (count, new_expected_size, min_size, max_size, memset, +- zero_memset, have_as, dynamic_check, noalign, true); +- gcc_assert (*dynamic_check == -1); +- if (TARGET_INLINE_STRINGOPS_DYNAMICALLY) +- *dynamic_check = max; +- else +- gcc_assert (alg != libcall); +- return alg; +- } +- return (alg_usable_p (algs->unknown_size, memset, have_as) +- ? algs->unknown_size : libcall); +-} +- +-/* Decide on alignment. We know that the operand is already aligned to ALIGN +- (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */ +-static int +-decide_alignment (int align, +- enum stringop_alg alg, +- int expected_size, +- machine_mode move_mode) +-{ +- int desired_align = 0; +- +- gcc_assert (alg != no_stringop); +- +- if (alg == libcall) +- return 0; +- if (move_mode == VOIDmode) +- return 0; +- +- desired_align = GET_MODE_SIZE (move_mode); +- /* PentiumPro has special logic triggering for 8 byte aligned blocks. +- copying whole cacheline at once. */ +- if (TARGET_PENTIUMPRO +- && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte)) +- desired_align = 8; +- +- if (optimize_size) +- desired_align = 1; +- if (desired_align < align) +- desired_align = align; +- if (expected_size != -1 && expected_size < 4) +- desired_align = align; +- +- return desired_align; +-} +- +- +-/* Helper function for memcpy. For QImode value 0xXY produce +- 0xXYXYXYXY of wide specified by MODE. This is essentially +- a * 0x10101010, but we can do slightly better than +- synth_mult by unwinding the sequence by hand on CPUs with +- slow multiply. */ +-static rtx +-promote_duplicated_reg (machine_mode mode, rtx val) +-{ +- machine_mode valmode = GET_MODE (val); +- rtx tmp; +- int nops = mode == DImode ? 3 : 2; +- +- gcc_assert (mode == SImode || mode == DImode || val == const0_rtx); +- if (val == const0_rtx) +- return copy_to_mode_reg (mode, CONST0_RTX (mode)); +- if (CONST_INT_P (val)) +- { +- HOST_WIDE_INT v = INTVAL (val) & 255; +- +- v |= v << 8; +- v |= v << 16; +- if (mode == DImode) +- v |= (v << 16) << 16; +- return copy_to_mode_reg (mode, gen_int_mode (v, mode)); +- } +- +- if (valmode == VOIDmode) +- valmode = QImode; +- if (valmode != QImode) +- val = gen_lowpart (QImode, val); +- if (mode == QImode) +- return val; +- if (!TARGET_PARTIAL_REG_STALL) +- nops--; +- if (ix86_cost->mult_init[mode == DImode ? 3 : 2] +- + ix86_cost->mult_bit * (mode == DImode ? 
8 : 4) +- <= (ix86_cost->shift_const + ix86_cost->add) * nops +- + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0))) +- { +- rtx reg = convert_modes (mode, QImode, val, true); +- tmp = promote_duplicated_reg (mode, const1_rtx); +- return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1, +- OPTAB_DIRECT); +- } +- else +- { +- rtx reg = convert_modes (mode, QImode, val, true); +- +- if (!TARGET_PARTIAL_REG_STALL) +- if (mode == SImode) +- emit_insn (gen_insvsi_1 (reg, reg)); +- else +- emit_insn (gen_insvdi_1 (reg, reg)); +- else +- { +- tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8), +- NULL, 1, OPTAB_DIRECT); +- reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, +- OPTAB_DIRECT); +- } +- tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16), +- NULL, 1, OPTAB_DIRECT); +- reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT); +- if (mode == SImode) +- return reg; +- tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32), +- NULL, 1, OPTAB_DIRECT); +- reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT); +- return reg; +- } +-} +- +-/* Duplicate value VAL using promote_duplicated_reg into maximal size that will +- be needed by main loop copying SIZE_NEEDED chunks and prologue getting +- alignment from ALIGN to DESIRED_ALIGN. */ +-static rtx +-promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, +- int align) +-{ +- rtx promoted_val; +- +- if (TARGET_64BIT +- && (size_needed > 4 || (desired_align > align && desired_align > 4))) +- promoted_val = promote_duplicated_reg (DImode, val); +- else if (size_needed > 2 || (desired_align > align && desired_align > 2)) +- promoted_val = promote_duplicated_reg (SImode, val); +- else if (size_needed > 1 || (desired_align > align && desired_align > 1)) +- promoted_val = promote_duplicated_reg (HImode, val); +- else +- promoted_val = val; +- +- return promoted_val; +-} +- +-/* Expand string move (memcpy) ot store (memset) operation. Use i386 string +- operations when profitable. The code depends upon architecture, block size +- and alignment, but always has one of the following overall structures: +- +- Aligned move sequence: +- +- 1) Prologue guard: Conditional that jumps up to epilogues for small +- blocks that can be handled by epilogue alone. This is faster +- but also needed for correctness, since prologue assume the block +- is larger than the desired alignment. +- +- Optional dynamic check for size and libcall for large +- blocks is emitted here too, with -minline-stringops-dynamically. +- +- 2) Prologue: copy first few bytes in order to get destination +- aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less +- than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be +- copied. We emit either a jump tree on power of two sized +- blocks, or a byte loop. +- +- 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks +- with specified algorithm. +- +- 4) Epilogue: code copying tail of the block that is too small to be +- handled by main body (or up to size guarded by prologue guard). 
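The promoted value built by promote_duplicated_reg above is simply the fill byte replicated into every byte of a register; the constant path assembles it with shifts and ORs rather than an actual multiply. In plain C, assuming a 64-bit value and hypothetical names, the same construction is:

    #include <stdint.h>
    #include <stdio.h>
    #include <inttypes.h>

    /* Replicate the low byte of VAL into every byte of a 64-bit word:
       0x00000000000000XY -> 0xXYXYXYXYXYXYXYXY.  Equivalent to multiplying
       the byte by 0x0101010101010101, spelled with shifts and ORs.  */
    static uint64_t splat_byte (uint64_t val)
    {
      uint64_t v = val & 0xff;
      v |= v << 8;
      v |= v << 16;
      v |= v << 32;
      return v;
    }

    int main (void)
    {
      printf ("0x%016" PRIx64 "\n", splat_byte (0xAB));  /* 0xabababababababab */
      return 0;
    }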
+- +- Misaligned move sequence +- +- 1) missaligned move prologue/epilogue containing: +- a) Prologue handling small memory blocks and jumping to done_label +- (skipped if blocks are known to be large enough) +- b) Signle move copying first DESIRED_ALIGN-ALIGN bytes if alignment is +- needed by single possibly misaligned move +- (skipped if alignment is not needed) +- c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves +- +- 2) Zero size guard dispatching to done_label, if needed +- +- 3) dispatch to library call, if needed, +- +- 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks +- with specified algorithm. */ +-bool +-ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp, +- rtx align_exp, rtx expected_align_exp, +- rtx expected_size_exp, rtx min_size_exp, +- rtx max_size_exp, rtx probable_max_size_exp, +- bool issetmem) +-{ +- rtx destreg; +- rtx srcreg = NULL; +- rtx_code_label *label = NULL; +- rtx tmp; +- rtx_code_label *jump_around_label = NULL; +- HOST_WIDE_INT align = 1; +- unsigned HOST_WIDE_INT count = 0; +- HOST_WIDE_INT expected_size = -1; +- int size_needed = 0, epilogue_size_needed; +- int desired_align = 0, align_bytes = 0; +- enum stringop_alg alg; +- rtx promoted_val = NULL; +- rtx vec_promoted_val = NULL; +- bool force_loopy_epilogue = false; +- int dynamic_check; +- bool need_zero_guard = false; +- bool noalign; +- machine_mode move_mode = VOIDmode; +- machine_mode wider_mode; +- int unroll_factor = 1; +- /* TODO: Once value ranges are available, fill in proper data. */ +- unsigned HOST_WIDE_INT min_size = 0; +- unsigned HOST_WIDE_INT max_size = -1; +- unsigned HOST_WIDE_INT probable_max_size = -1; +- bool misaligned_prologue_used = false; +- bool have_as; +- +- if (CONST_INT_P (align_exp)) +- align = INTVAL (align_exp); +- /* i386 can do misaligned access on reasonably increased cost. */ +- if (CONST_INT_P (expected_align_exp) +- && INTVAL (expected_align_exp) > align) +- align = INTVAL (expected_align_exp); +- /* ALIGN is the minimum of destination and source alignment, but we care here +- just about destination alignment. */ +- else if (!issetmem +- && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT) +- align = MEM_ALIGN (dst) / BITS_PER_UNIT; +- +- if (CONST_INT_P (count_exp)) +- { +- min_size = max_size = probable_max_size = count = expected_size +- = INTVAL (count_exp); +- /* When COUNT is 0, there is nothing to do. */ +- if (!count) +- return true; +- } +- else +- { +- if (min_size_exp) +- min_size = INTVAL (min_size_exp); +- if (max_size_exp) +- max_size = INTVAL (max_size_exp); +- if (probable_max_size_exp) +- probable_max_size = INTVAL (probable_max_size_exp); +- if (CONST_INT_P (expected_size_exp)) +- expected_size = INTVAL (expected_size_exp); +- } +- +- /* Make sure we don't need to care about overflow later on. */ +- if (count > (HOST_WIDE_INT_1U << 30)) +- return false; +- +- have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst)); +- if (!issetmem) +- have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src)); +- +- /* Step 0: Decide on preferred algorithm, desired alignment and +- size of chunks to be copied by main loop. 
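Steps 3 and 4 of the structure described above reduce to a main loop that moves SIZE_NEEDED bytes per iteration (the chunk size times the unroll factor) followed by an epilogue for the remaining bytes. A rough standalone C sketch of that overall shape, illustrative only since the expander emits RTL rather than library calls:

    #include <stdio.h>
    #include <string.h>
    #include <stddef.h>

    /* Main loop plus epilogue: whole chunks first, then the tail.  */
    static void chunked_copy (char *dst, const char *src,
                              size_t count, size_t size_needed)
    {
      size_t done = 0;
      for (; done + size_needed <= count; done += size_needed)
        memcpy (dst + done, src + done, size_needed);   /* main loop */
      memcpy (dst + done, src + done, count - done);    /* epilogue  */
    }

    int main (void)
    {
      char src[37], dst[37] = { 0 };
      for (int i = 0; i < 36; i++)
        src[i] = 'a' + i % 26;
      src[36] = 0;
      chunked_copy (dst, src, 36, 16);   /* two 16-byte iterations, 4-byte tail */
      puts (dst);
      return 0;
    }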
*/ +- alg = decide_alg (count, expected_size, min_size, probable_max_size, +- issetmem, +- issetmem && val_exp == const0_rtx, have_as, +- &dynamic_check, &noalign, false); +- +- if (dump_file) +- fprintf (dump_file, "Selected stringop expansion strategy: %s\n", +- stringop_alg_names[alg]); +- +- if (alg == libcall) +- return false; +- gcc_assert (alg != no_stringop); +- +- /* For now vector-version of memset is generated only for memory zeroing, as +- creating of promoted vector value is very cheap in this case. */ +- if (issetmem && alg == vector_loop && val_exp != const0_rtx) +- alg = unrolled_loop; +- +- if (!count) +- count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp); +- destreg = ix86_copy_addr_to_reg (XEXP (dst, 0)); +- if (!issetmem) +- srcreg = ix86_copy_addr_to_reg (XEXP (src, 0)); +- +- unroll_factor = 1; +- move_mode = word_mode; +- switch (alg) +- { +- case libcall: +- case no_stringop: +- case last_alg: +- gcc_unreachable (); +- case loop_1_byte: +- need_zero_guard = true; +- move_mode = QImode; +- break; +- case loop: +- need_zero_guard = true; +- break; +- case unrolled_loop: +- need_zero_guard = true; +- unroll_factor = (TARGET_64BIT ? 4 : 2); +- break; +- case vector_loop: +- need_zero_guard = true; +- unroll_factor = 4; +- /* Find the widest supported mode. */ +- move_mode = word_mode; +- while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode) +- && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing) +- move_mode = wider_mode; +- +- if (TARGET_AVX128_OPTIMAL && GET_MODE_BITSIZE (move_mode) > 128) +- move_mode = TImode; +- +- /* Find the corresponding vector mode with the same size as MOVE_MODE. +- MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */ +- if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode)) +- { +- int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode); +- if (!mode_for_vector (word_mode, nunits).exists (&move_mode) +- || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing) +- move_mode = word_mode; +- } +- gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing); +- break; +- case rep_prefix_8_byte: +- move_mode = DImode; +- break; +- case rep_prefix_4_byte: +- move_mode = SImode; +- break; +- case rep_prefix_1_byte: +- move_mode = QImode; +- break; +- } +- size_needed = GET_MODE_SIZE (move_mode) * unroll_factor; +- epilogue_size_needed = size_needed; +- +- /* If we are going to call any library calls conditionally, make sure any +- pending stack adjustment happen before the first conditional branch, +- otherwise they will be emitted before the library call only and won't +- happen from the other branches. */ +- if (dynamic_check != -1) +- do_pending_stack_adjust (); +- +- desired_align = decide_alignment (align, alg, expected_size, move_mode); +- if (!TARGET_ALIGN_STRINGOPS || noalign) +- align = desired_align; +- +- /* Step 1: Prologue guard. */ +- +- /* Alignment code needs count to be in register. */ +- if (CONST_INT_P (count_exp) && desired_align > align) +- { +- if (INTVAL (count_exp) > desired_align +- && INTVAL (count_exp) > size_needed) +- { +- align_bytes +- = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT); +- if (align_bytes <= 0) +- align_bytes = 0; +- else +- align_bytes = desired_align - align_bytes; +- } +- if (align_bytes == 0) +- count_exp = force_reg (counter_mode (count_exp), count_exp); +- } +- gcc_assert (desired_align >= 1 && align >= 1); +- +- /* Misaligned move sequences handle both prologue and epilogue at once. 
+- Default code generation results in a smaller code for large alignments +- and also avoids redundant job when sizes are known precisely. */ +- misaligned_prologue_used +- = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES +- && MAX (desired_align, epilogue_size_needed) <= 32 +- && desired_align <= epilogue_size_needed +- && ((desired_align > align && !align_bytes) +- || (!count && epilogue_size_needed > 1))); +- +- /* Do the cheap promotion to allow better CSE across the +- main loop and epilogue (ie one load of the big constant in the +- front of all code. +- For now the misaligned move sequences do not have fast path +- without broadcasting. */ +- if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used))) +- { +- if (alg == vector_loop) +- { +- gcc_assert (val_exp == const0_rtx); +- vec_promoted_val = promote_duplicated_reg (move_mode, val_exp); +- promoted_val = promote_duplicated_reg_to_size (val_exp, +- GET_MODE_SIZE (word_mode), +- desired_align, align); +- } +- else +- { +- promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed, +- desired_align, align); +- } +- } +- /* Misaligned move sequences handles both prologues and epilogues at once. +- Default code generation results in smaller code for large alignments and +- also avoids redundant job when sizes are known precisely. */ +- if (misaligned_prologue_used) +- { +- /* Misaligned move prologue handled small blocks by itself. */ +- expand_set_or_movmem_prologue_epilogue_by_misaligned_moves +- (dst, src, &destreg, &srcreg, +- move_mode, promoted_val, vec_promoted_val, +- &count_exp, +- &jump_around_label, +- desired_align < align +- ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed, +- desired_align, align, &min_size, dynamic_check, issetmem); +- if (!issetmem) +- src = change_address (src, BLKmode, srcreg); +- dst = change_address (dst, BLKmode, destreg); +- set_mem_align (dst, desired_align * BITS_PER_UNIT); +- epilogue_size_needed = 0; +- if (need_zero_guard +- && min_size < (unsigned HOST_WIDE_INT) size_needed) +- { +- /* It is possible that we copied enough so the main loop will not +- execute. */ +- gcc_assert (size_needed > 1); +- if (jump_around_label == NULL_RTX) +- jump_around_label = gen_label_rtx (); +- emit_cmp_and_jump_insns (count_exp, +- GEN_INT (size_needed), +- LTU, 0, counter_mode (count_exp), 1, jump_around_label); +- if (expected_size == -1 +- || expected_size < (desired_align - align) / 2 + size_needed) +- predict_jump (REG_BR_PROB_BASE * 20 / 100); +- else +- predict_jump (REG_BR_PROB_BASE * 60 / 100); +- } +- } +- /* Ensure that alignment prologue won't copy past end of block. */ +- else if (size_needed > 1 || (desired_align > 1 && desired_align > align)) +- { +- epilogue_size_needed = MAX (size_needed - 1, desired_align - align); +- /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes. +- Make sure it is power of 2. */ +- epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1); +- +- /* To improve performance of small blocks, we jump around the VAL +- promoting mode. This mean that if the promoted VAL is not constant, +- we might not use it in the epilogue and have to use byte +- loop variant. */ +- if (issetmem && epilogue_size_needed > 2 && !promoted_val) +- force_loopy_epilogue = true; +- if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed) +- || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed) +- { +- /* If main algorithm works on QImode, no epilogue is needed. +- For small sizes just don't align anything. 
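The expression 1 << (floor_log2 (epilogue_size_needed) + 1) just above rounds the epilogue bound up to the smallest power of two strictly greater than its argument, so the tail byte count can later be obtained with a simple mask. A tiny standalone C illustration, with floor_log2 reimplemented as a stand-in for GCC's helper of the same name:

    #include <stdio.h>

    /* Smallest power of two strictly greater than X, i.e.
       1 << (floor_log2 (x) + 1): 3 -> 4, 5 -> 8, 8 -> 16.  */
    static unsigned next_pow2_above (unsigned x)
    {
      int lg = -1;
      while (x)
        {
          x >>= 1;
          lg++;
        }
      return 1u << (lg + 1);
    }

    int main (void)
    {
      printf ("%u %u %u\n", next_pow2_above (3), next_pow2_above (5),
              next_pow2_above (8));   /* prints 4 8 16 */
      return 0;
    }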
*/ +- if (size_needed == 1) +- desired_align = align; +- else +- goto epilogue; +- } +- else if (!count +- && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed) +- { +- label = gen_label_rtx (); +- emit_cmp_and_jump_insns (count_exp, +- GEN_INT (epilogue_size_needed), +- LTU, 0, counter_mode (count_exp), 1, label); +- if (expected_size == -1 || expected_size < epilogue_size_needed) +- predict_jump (REG_BR_PROB_BASE * 60 / 100); +- else +- predict_jump (REG_BR_PROB_BASE * 20 / 100); +- } +- } +- +- /* Emit code to decide on runtime whether library call or inline should be +- used. */ +- if (dynamic_check != -1) +- { +- if (!issetmem && CONST_INT_P (count_exp)) +- { +- if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check) +- { +- emit_block_copy_via_libcall (dst, src, count_exp); +- count_exp = const0_rtx; +- goto epilogue; +- } +- } +- else +- { +- rtx_code_label *hot_label = gen_label_rtx (); +- if (jump_around_label == NULL_RTX) +- jump_around_label = gen_label_rtx (); +- emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1), +- LEU, 0, counter_mode (count_exp), +- 1, hot_label); +- predict_jump (REG_BR_PROB_BASE * 90 / 100); +- if (issetmem) +- set_storage_via_libcall (dst, count_exp, val_exp); +- else +- emit_block_copy_via_libcall (dst, src, count_exp); +- emit_jump (jump_around_label); +- emit_label (hot_label); +- } +- } +- +- /* Step 2: Alignment prologue. */ +- /* Do the expensive promotion once we branched off the small blocks. */ +- if (issetmem && !promoted_val) +- promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed, +- desired_align, align); +- +- if (desired_align > align && !misaligned_prologue_used) +- { +- if (align_bytes == 0) +- { +- /* Except for the first move in prologue, we no longer know +- constant offset in aliasing info. It don't seems to worth +- the pain to maintain it for the first move, so throw away +- the info early. */ +- dst = change_address (dst, BLKmode, destreg); +- if (!issetmem) +- src = change_address (src, BLKmode, srcreg); +- dst = expand_set_or_movmem_prologue (dst, src, destreg, srcreg, +- promoted_val, vec_promoted_val, +- count_exp, align, desired_align, +- issetmem); +- /* At most desired_align - align bytes are copied. */ +- if (min_size < (unsigned)(desired_align - align)) +- min_size = 0; +- else +- min_size -= desired_align - align; +- } +- else +- { +- /* If we know how many bytes need to be stored before dst is +- sufficiently aligned, maintain aliasing info accurately. */ +- dst = expand_set_or_movmem_constant_prologue (dst, &src, destreg, +- srcreg, +- promoted_val, +- vec_promoted_val, +- desired_align, +- align_bytes, +- issetmem); +- +- count_exp = plus_constant (counter_mode (count_exp), +- count_exp, -align_bytes); +- count -= align_bytes; +- min_size -= align_bytes; +- max_size -= align_bytes; +- } +- if (need_zero_guard +- && min_size < (unsigned HOST_WIDE_INT) size_needed +- && (count < (unsigned HOST_WIDE_INT) size_needed +- || (align_bytes == 0 +- && count < ((unsigned HOST_WIDE_INT) size_needed +- + desired_align - align)))) +- { +- /* It is possible that we copied enough so the main loop will not +- execute. 
*/ +- gcc_assert (size_needed > 1); +- if (label == NULL_RTX) +- label = gen_label_rtx (); +- emit_cmp_and_jump_insns (count_exp, +- GEN_INT (size_needed), +- LTU, 0, counter_mode (count_exp), 1, label); +- if (expected_size == -1 +- || expected_size < (desired_align - align) / 2 + size_needed) +- predict_jump (REG_BR_PROB_BASE * 20 / 100); +- else +- predict_jump (REG_BR_PROB_BASE * 60 / 100); +- } +- } +- if (label && size_needed == 1) +- { +- emit_label (label); +- LABEL_NUSES (label) = 1; +- label = NULL; +- epilogue_size_needed = 1; +- if (issetmem) +- promoted_val = val_exp; +- } +- else if (label == NULL_RTX && !misaligned_prologue_used) +- epilogue_size_needed = size_needed; +- +- /* Step 3: Main loop. */ +- +- switch (alg) +- { +- case libcall: +- case no_stringop: +- case last_alg: +- gcc_unreachable (); +- case loop_1_byte: +- case loop: +- case unrolled_loop: +- expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, promoted_val, +- count_exp, move_mode, unroll_factor, +- expected_size, issetmem); +- break; +- case vector_loop: +- expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, +- vec_promoted_val, count_exp, move_mode, +- unroll_factor, expected_size, issetmem); +- break; +- case rep_prefix_8_byte: +- case rep_prefix_4_byte: +- case rep_prefix_1_byte: +- expand_set_or_movmem_via_rep (dst, src, destreg, srcreg, promoted_val, +- val_exp, count_exp, move_mode, issetmem); +- break; +- } +- /* Adjust properly the offset of src and dest memory for aliasing. */ +- if (CONST_INT_P (count_exp)) +- { +- if (!issetmem) +- src = adjust_automodify_address_nv (src, BLKmode, srcreg, +- (count / size_needed) * size_needed); +- dst = adjust_automodify_address_nv (dst, BLKmode, destreg, +- (count / size_needed) * size_needed); +- } +- else +- { +- if (!issetmem) +- src = change_address (src, BLKmode, srcreg); +- dst = change_address (dst, BLKmode, destreg); +- } +- +- /* Step 4: Epilogue to copy the remaining bytes. */ +- epilogue: +- if (label) +- { +- /* When the main loop is done, COUNT_EXP might hold original count, +- while we want to copy only COUNT_EXP & SIZE_NEEDED bytes. +- Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED +- bytes. Compensate if needed. */ +- +- if (size_needed < epilogue_size_needed) +- { +- tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp, +- GEN_INT (size_needed - 1), count_exp, 1, +- OPTAB_DIRECT); +- if (tmp != count_exp) +- emit_move_insn (count_exp, tmp); +- } +- emit_label (label); +- LABEL_NUSES (label) = 1; +- } +- +- if (count_exp != const0_rtx && epilogue_size_needed > 1) +- { +- if (force_loopy_epilogue) +- expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp, +- epilogue_size_needed); +- else +- { +- if (issetmem) +- expand_setmem_epilogue (dst, destreg, promoted_val, +- vec_promoted_val, count_exp, +- epilogue_size_needed); +- else +- expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp, +- epilogue_size_needed); +- } +- } +- if (jump_around_label) +- emit_label (jump_around_label); +- return true; +-} +- +- +-/* Expand the appropriate insns for doing strlen if not just doing +- repnz; scasb +- +- out = result, initialized with the start address +- align_rtx = alignment of the address. +- scratch = scratch register, initialized with the startaddress when +- not aligned, otherwise undefined +- +- This is just the body. It needs the initializations mentioned above and +- some address computing at the end. These things are done in i386.md. 
*/ +- +-static void +-ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx) +-{ +- int align; +- rtx tmp; +- rtx_code_label *align_2_label = NULL; +- rtx_code_label *align_3_label = NULL; +- rtx_code_label *align_4_label = gen_label_rtx (); +- rtx_code_label *end_0_label = gen_label_rtx (); +- rtx mem; +- rtx tmpreg = gen_reg_rtx (SImode); +- rtx scratch = gen_reg_rtx (SImode); +- rtx cmp; +- +- align = 0; +- if (CONST_INT_P (align_rtx)) +- align = INTVAL (align_rtx); +- +- /* Loop to check 1..3 bytes for null to get an aligned pointer. */ +- +- /* Is there a known alignment and is it less than 4? */ +- if (align < 4) +- { +- rtx scratch1 = gen_reg_rtx (Pmode); +- emit_move_insn (scratch1, out); +- /* Is there a known alignment and is it not 2? */ +- if (align != 2) +- { +- align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */ +- align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */ +- +- /* Leave just the 3 lower bits. */ +- align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3), +- NULL_RTX, 0, OPTAB_WIDEN); +- +- emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL, +- Pmode, 1, align_4_label); +- emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL, +- Pmode, 1, align_2_label); +- emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL, +- Pmode, 1, align_3_label); +- } +- else +- { +- /* Since the alignment is 2, we have to check 2 or 0 bytes; +- check if is aligned to 4 - byte. */ +- +- align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx, +- NULL_RTX, 0, OPTAB_WIDEN); +- +- emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL, +- Pmode, 1, align_4_label); +- } +- +- mem = change_address (src, QImode, out); +- +- /* Now compare the bytes. */ +- +- /* Compare the first n unaligned byte on a byte per byte basis. */ +- emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, +- QImode, 1, end_0_label); +- +- /* Increment the address. */ +- emit_insn (ix86_gen_add3 (out, out, const1_rtx)); +- +- /* Not needed with an alignment of 2 */ +- if (align != 2) +- { +- emit_label (align_2_label); +- +- emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1, +- end_0_label); +- +- emit_insn (ix86_gen_add3 (out, out, const1_rtx)); +- +- emit_label (align_3_label); +- } +- +- emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1, +- end_0_label); +- +- emit_insn (ix86_gen_add3 (out, out, const1_rtx)); +- } +- +- /* Generate loop to check 4 bytes at a time. It is not a good idea to +- align this loop. It gives only huge programs, but does not help to +- speed up. */ +- emit_label (align_4_label); +- +- mem = change_address (src, SImode, out); +- emit_move_insn (scratch, mem); +- emit_insn (ix86_gen_add3 (out, out, GEN_INT (4))); +- +- /* This formula yields a nonzero result iff one of the bytes is zero. +- This saves three branches inside loop and many cycles. */ +- +- emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101))); +- emit_insn (gen_one_cmplsi2 (scratch, scratch)); +- emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch)); +- emit_insn (gen_andsi3 (tmpreg, tmpreg, +- gen_int_mode (0x80808080, SImode))); +- emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1, +- align_4_label); +- +- if (TARGET_CMOVE) +- { +- rtx reg = gen_reg_rtx (SImode); +- rtx reg2 = gen_reg_rtx (Pmode); +- emit_move_insn (reg, tmpreg); +- emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16))); +- +- /* If zero is not in the first two bytes, move two bytes forward. 
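The formula mentioned above is the classic zero-byte test: for a 32-bit word W, (W - 0x01010101) & ~W & 0x80808080 is nonzero exactly when at least one byte of W is zero, which is what lets the loop examine four bytes per iteration without per-byte branches. A standalone C version for reference, with hypothetical names:

    #include <stdint.h>
    #include <stdio.h>

    /* Nonzero iff some byte of W is zero: subtracting 1 from a zero byte
       borrows into its top bit, and the & ~W term discards bytes whose
       top bit was already set in W.  */
    static int has_zero_byte (uint32_t w)
    {
      return ((w - 0x01010101u) & ~w & 0x80808080u) != 0;
    }

    int main (void)
    {
      printf ("%d %d\n", has_zero_byte (0x41424344u),   /* no zero byte -> 0 */
              has_zero_byte (0x41420044u));             /* zero byte    -> 1 */
      return 0;
    }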
*/ +- emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080))); +- tmp = gen_rtx_REG (CCNOmode, FLAGS_REG); +- tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx); +- emit_insn (gen_rtx_SET (tmpreg, +- gen_rtx_IF_THEN_ELSE (SImode, tmp, +- reg, +- tmpreg))); +- /* Emit lea manually to avoid clobbering of flags. */ +- emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx))); +- +- tmp = gen_rtx_REG (CCNOmode, FLAGS_REG); +- tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx); +- emit_insn (gen_rtx_SET (out, +- gen_rtx_IF_THEN_ELSE (Pmode, tmp, +- reg2, +- out))); +- } +- else +- { +- rtx_code_label *end_2_label = gen_label_rtx (); +- /* Is zero in the first two bytes? */ +- +- emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080))); +- tmp = gen_rtx_REG (CCNOmode, FLAGS_REG); +- tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx); +- tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, +- gen_rtx_LABEL_REF (VOIDmode, end_2_label), +- pc_rtx); +- tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); +- JUMP_LABEL (tmp) = end_2_label; +- +- /* Not in the first two. Move two bytes forward. */ +- emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16))); +- emit_insn (ix86_gen_add3 (out, out, const2_rtx)); +- +- emit_label (end_2_label); +- +- } +- +- /* Avoid branch in fixing the byte. */ +- tmpreg = gen_lowpart (QImode, tmpreg); +- emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg)); +- tmp = gen_rtx_REG (CCmode, FLAGS_REG); +- cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx); +- emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp)); +- +- emit_label (end_0_label); +-} +- +-/* Expand strlen. */ +- +-bool +-ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align) +-{ +- rtx addr, scratch1, scratch2, scratch3, scratch4; +- +- /* The generic case of strlen expander is long. Avoid it's +- expanding unless TARGET_INLINE_ALL_STRINGOPS. */ +- +- if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1 +- && !TARGET_INLINE_ALL_STRINGOPS +- && !optimize_insn_for_size_p () +- && (!CONST_INT_P (align) || INTVAL (align) < 4)) +- return false; +- +- addr = force_reg (Pmode, XEXP (src, 0)); +- scratch1 = gen_reg_rtx (Pmode); +- +- if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1 +- && !optimize_insn_for_size_p ()) +- { +- /* Well it seems that some optimizer does not combine a call like +- foo(strlen(bar), strlen(bar)); +- when the move and the subtraction is done here. It does calculate +- the length just once when these instructions are done inside of +- output_strlen_unroll(). But I think since &bar[strlen(bar)] is +- often used and I use one fewer register for the lifetime of +- output_strlen_unroll() this is better. */ +- +- emit_move_insn (out, addr); +- +- ix86_expand_strlensi_unroll_1 (out, src, align); +- +- /* strlensi_unroll_1 returns the address of the zero at the end of +- the string, like memchr(), so compute the length by subtracting +- the start address. */ +- emit_insn (ix86_gen_sub3 (out, out, addr)); +- } +- else +- { +- rtx unspec; +- +- /* Can't use this if the user has appropriated eax, ecx, or edi. */ +- if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG]) +- return false; +- /* Can't use this for non-default address spaces. 
*/ +- if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src))) +- return false; +- +- scratch2 = gen_reg_rtx (Pmode); +- scratch3 = gen_reg_rtx (Pmode); +- scratch4 = force_reg (Pmode, constm1_rtx); +- +- emit_move_insn (scratch3, addr); +- eoschar = force_reg (QImode, eoschar); +- +- src = replace_equiv_address_nv (src, scratch3); +- +- /* If .md starts supporting :P, this can be done in .md. */ +- unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align, +- scratch4), UNSPEC_SCAS); +- emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec)); +- emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1)); +- emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx)); +- } +- return true; +-} +- +-/* For given symbol (function) construct code to compute address of it's PLT +- entry in large x86-64 PIC model. */ +-static rtx +-construct_plt_address (rtx symbol) +-{ +- rtx tmp, unspec; +- +- gcc_assert (GET_CODE (symbol) == SYMBOL_REF); +- gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF); +- gcc_assert (Pmode == DImode); +- +- tmp = gen_reg_rtx (Pmode); +- unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF); +- +- emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec)); +- emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx)); +- return tmp; +-} +- +-rtx_insn * +-ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1, +- rtx callarg2, +- rtx pop, bool sibcall) +-{ +- rtx vec[3]; +- rtx use = NULL, call; +- unsigned int vec_len = 0; +- tree fndecl; +- +- if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF) +- { +- fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0)); +- if (fndecl +- && (lookup_attribute ("interrupt", +- TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))) +- error ("interrupt service routine can%'t be called directly"); +- } +- else +- fndecl = NULL_TREE; +- +- if (pop == const0_rtx) +- pop = NULL; +- gcc_assert (!TARGET_64BIT || !pop); +- +- if (TARGET_MACHO && !TARGET_64BIT) +- { +-#if TARGET_MACHO +- if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF) +- fnaddr = machopic_indirect_call_target (fnaddr); +-#endif +- } +- else +- { +- /* Static functions and indirect calls don't need the pic register. Also, +- check if PLT was explicitly avoided via no-plt or "noplt" attribute, making +- it an indirect call. */ +- rtx addr = XEXP (fnaddr, 0); +- if (flag_pic +- && GET_CODE (addr) == SYMBOL_REF +- && !SYMBOL_REF_LOCAL_P (addr)) +- { +- if (flag_plt +- && (SYMBOL_REF_DECL (addr) == NULL_TREE +- || !lookup_attribute ("noplt", +- DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr))))) +- { +- if (!TARGET_64BIT +- || (ix86_cmodel == CM_LARGE_PIC +- && DEFAULT_ABI != MS_ABI)) +- { +- use_reg (&use, gen_rtx_REG (Pmode, +- REAL_PIC_OFFSET_TABLE_REGNUM)); +- if (ix86_use_pseudo_pic_reg ()) +- emit_move_insn (gen_rtx_REG (Pmode, +- REAL_PIC_OFFSET_TABLE_REGNUM), +- pic_offset_table_rtx); +- } +- } +- else if (!TARGET_PECOFF && !TARGET_MACHO) +- { +- if (TARGET_64BIT) +- { +- fnaddr = gen_rtx_UNSPEC (Pmode, +- gen_rtvec (1, addr), +- UNSPEC_GOTPCREL); +- fnaddr = gen_rtx_CONST (Pmode, fnaddr); +- } +- else +- { +- fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), +- UNSPEC_GOT); +- fnaddr = gen_rtx_CONST (Pmode, fnaddr); +- fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, +- fnaddr); +- } +- fnaddr = gen_const_mem (Pmode, fnaddr); +- /* Pmode may not be the same as word_mode for x32, which +- doesn't support indirect branch via 32-bit memory slot. +- Since x32 GOT slot is 64 bit with zero upper 32 bits, +- indirect branch via x32 GOT slot is OK. 
*/ +- if (GET_MODE (fnaddr) != word_mode) +- fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr); +- fnaddr = gen_rtx_MEM (QImode, fnaddr); +- } +- } +- } +- +- /* Skip setting up RAX register for -mskip-rax-setup when there are no +- parameters passed in vector registers. */ +- if (TARGET_64BIT +- && (INTVAL (callarg2) > 0 +- || (INTVAL (callarg2) == 0 +- && (TARGET_SSE || !flag_skip_rax_setup)))) +- { +- rtx al = gen_rtx_REG (QImode, AX_REG); +- emit_move_insn (al, callarg2); +- use_reg (&use, al); +- } +- +- if (ix86_cmodel == CM_LARGE_PIC +- && !TARGET_PECOFF +- && MEM_P (fnaddr) +- && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF +- && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode)) +- fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0))); +- /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect +- branch via x32 GOT slot is OK. */ +- else if (!(TARGET_X32 +- && MEM_P (fnaddr) +- && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND +- && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode)) +- && (sibcall +- ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode) +- : !call_insn_operand (XEXP (fnaddr, 0), word_mode))) +- { +- fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1); +- fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr)); +- } +- +- call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1); +- +- if (retval) +- call = gen_rtx_SET (retval, call); +- vec[vec_len++] = call; +- +- if (pop) +- { +- pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop); +- pop = gen_rtx_SET (stack_pointer_rtx, pop); +- vec[vec_len++] = pop; +- } +- +- if (cfun->machine->no_caller_saved_registers +- && (!fndecl +- || (!TREE_THIS_VOLATILE (fndecl) +- && !lookup_attribute ("no_caller_saved_registers", +- TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))) +- { +- static const char ix86_call_used_regs[] = CALL_USED_REGISTERS; +- bool is_64bit_ms_abi = (TARGET_64BIT +- && ix86_function_abi (fndecl) == MS_ABI); +- char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi); +- +- /* If there are no caller-saved registers, add all registers +- that are clobbered by the call which returns. */ +- for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++) +- if (!fixed_regs[i] +- && (ix86_call_used_regs[i] == 1 +- || (ix86_call_used_regs[i] & c_mask)) +- && !STACK_REGNO_P (i) +- && !MMX_REGNO_P (i)) +- clobber_reg (&use, +- gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i)); +- } +- else if (TARGET_64BIT_MS_ABI +- && (!callarg2 || INTVAL (callarg2) != -2)) +- { +- unsigned i; +- +- for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++) +- { +- int regno = x86_64_ms_sysv_extra_clobbered_registers[i]; +- machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode; +- +- clobber_reg (&use, gen_rtx_REG (mode, regno)); +- } +- +- /* Set here, but it may get cleared later. */ +- if (TARGET_CALL_MS2SYSV_XLOGUES) +- { +- if (!TARGET_SSE) +- ; +- +- /* Don't break hot-patched functions. */ +- else if (ix86_function_ms_hook_prologue (current_function_decl)) +- ; +- +- /* TODO: Cases not yet examined. 
*/ +- else if (flag_split_stack) +- warn_once_call_ms2sysv_xlogues ("-fsplit-stack"); +- +- else +- { +- gcc_assert (!reload_completed); +- cfun->machine->call_ms2sysv = true; +- } +- } +- } +- +- if (vec_len > 1) +- call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec)); +- rtx_insn *call_insn = emit_call_insn (call); +- if (use) +- CALL_INSN_FUNCTION_USAGE (call_insn) = use; +- +- return call_insn; +-} +- +-/* Return true if the function being called was marked with attribute +- "noplt" or using -fno-plt and we are compiling for non-PIC. We need +- to handle the non-PIC case in the backend because there is no easy +- interface for the front-end to force non-PLT calls to use the GOT. +- This is currently used only with 64-bit or 32-bit GOT32X ELF targets +- to call the function marked "noplt" indirectly. */ +- +-static bool +-ix86_nopic_noplt_attribute_p (rtx call_op) +-{ +- if (flag_pic || ix86_cmodel == CM_LARGE +- || !(TARGET_64BIT || HAVE_AS_IX86_GOT32X) +- || TARGET_MACHO || TARGET_SEH || TARGET_PECOFF +- || SYMBOL_REF_LOCAL_P (call_op)) +- return false; +- +- tree symbol_decl = SYMBOL_REF_DECL (call_op); +- +- if (!flag_plt +- || (symbol_decl != NULL_TREE +- && lookup_attribute ("noplt", DECL_ATTRIBUTES (symbol_decl)))) +- return true; +- +- return false; +-} +- +-/* Helper to output the jmp/call. */ +-static void +-ix86_output_jmp_thunk_or_indirect (const char *thunk_name, const int regno) +-{ +- if (thunk_name != NULL) +- { +- fprintf (asm_out_file, "\tjmp\t"); +- assemble_name (asm_out_file, thunk_name); +- putc ('\n', asm_out_file); +- } +- else +- output_indirect_thunk (regno); +-} +- +-/* Output indirect branch via a call and return thunk. CALL_OP is a +- register which contains the branch target. XASM is the assembly +- template for CALL_OP. Branch is a tail call if SIBCALL_P is true. +- A normal call is converted to: +- +- call __x86_indirect_thunk_reg +- +- and a tail call is converted to: +- +- jmp __x86_indirect_thunk_reg +- */ +- +-static void +-ix86_output_indirect_branch_via_reg (rtx call_op, bool sibcall_p) +-{ +- char thunk_name_buf[32]; +- char *thunk_name; +- enum indirect_thunk_prefix need_prefix +- = indirect_thunk_need_prefix (current_output_insn); +- int regno = REGNO (call_op); +- +- if (cfun->machine->indirect_branch_type +- != indirect_branch_thunk_inline) +- { +- if (cfun->machine->indirect_branch_type == indirect_branch_thunk) +- { +- int i = regno; +- if (i >= FIRST_REX_INT_REG) +- i -= (FIRST_REX_INT_REG - LAST_INT_REG - 1); +- indirect_thunks_used |= 1 << i; +- } +- indirect_thunk_name (thunk_name_buf, regno, need_prefix, false); +- thunk_name = thunk_name_buf; +- } +- else +- thunk_name = NULL; +- +- if (sibcall_p) +- ix86_output_jmp_thunk_or_indirect (thunk_name, regno); +- else +- { +- if (thunk_name != NULL) +- { +- fprintf (asm_out_file, "\tcall\t"); +- assemble_name (asm_out_file, thunk_name); +- putc ('\n', asm_out_file); +- return; +- } +- +- char indirectlabel1[32]; +- char indirectlabel2[32]; +- +- ASM_GENERATE_INTERNAL_LABEL (indirectlabel1, +- INDIRECT_LABEL, +- indirectlabelno++); +- ASM_GENERATE_INTERNAL_LABEL (indirectlabel2, +- INDIRECT_LABEL, +- indirectlabelno++); +- +- /* Jump. */ +- fputs ("\tjmp\t", asm_out_file); +- assemble_name_raw (asm_out_file, indirectlabel2); +- fputc ('\n', asm_out_file); +- +- ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel1); +- +- ix86_output_jmp_thunk_or_indirect (thunk_name, regno); +- +- ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel2); +- +- /* Call. 
*/ +- fputs ("\tcall\t", asm_out_file); +- assemble_name_raw (asm_out_file, indirectlabel1); +- fputc ('\n', asm_out_file); +- } +-} +- +-/* Output indirect branch via a call and return thunk. CALL_OP is +- the branch target. XASM is the assembly template for CALL_OP. +- Branch is a tail call if SIBCALL_P is true. A normal call is +- converted to: +- +- jmp L2 +- L1: +- push CALL_OP +- jmp __x86_indirect_thunk +- L2: +- call L1 +- +- and a tail call is converted to: +- +- push CALL_OP +- jmp __x86_indirect_thunk +- */ +- +-static void +-ix86_output_indirect_branch_via_push (rtx call_op, const char *xasm, +- bool sibcall_p) +-{ +- char thunk_name_buf[32]; +- char *thunk_name; +- char push_buf[64]; +- enum indirect_thunk_prefix need_prefix +- = indirect_thunk_need_prefix (current_output_insn); +- int regno = -1; +- +- if (cfun->machine->indirect_branch_type +- != indirect_branch_thunk_inline) +- { +- if (cfun->machine->indirect_branch_type == indirect_branch_thunk) +- indirect_thunk_needed = true; +- indirect_thunk_name (thunk_name_buf, regno, need_prefix, false); +- thunk_name = thunk_name_buf; +- } +- else +- thunk_name = NULL; +- +- snprintf (push_buf, sizeof (push_buf), "push{%c}\t%s", +- TARGET_64BIT ? 'q' : 'l', xasm); +- +- if (sibcall_p) +- { +- output_asm_insn (push_buf, &call_op); +- ix86_output_jmp_thunk_or_indirect (thunk_name, regno); +- } +- else +- { +- char indirectlabel1[32]; +- char indirectlabel2[32]; +- +- ASM_GENERATE_INTERNAL_LABEL (indirectlabel1, +- INDIRECT_LABEL, +- indirectlabelno++); +- ASM_GENERATE_INTERNAL_LABEL (indirectlabel2, +- INDIRECT_LABEL, +- indirectlabelno++); +- +- /* Jump. */ +- fputs ("\tjmp\t", asm_out_file); +- assemble_name_raw (asm_out_file, indirectlabel2); +- fputc ('\n', asm_out_file); +- +- ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel1); +- +- /* An external function may be called via GOT, instead of PLT. */ +- if (MEM_P (call_op)) +- { +- struct ix86_address parts; +- rtx addr = XEXP (call_op, 0); +- if (ix86_decompose_address (addr, &parts) +- && parts.base == stack_pointer_rtx) +- { +- /* Since call will adjust stack by -UNITS_PER_WORD, +- we must convert "disp(stack, index, scale)" to +- "disp+UNITS_PER_WORD(stack, index, scale)". */ +- if (parts.index) +- { +- addr = gen_rtx_MULT (Pmode, parts.index, +- GEN_INT (parts.scale)); +- addr = gen_rtx_PLUS (Pmode, stack_pointer_rtx, +- addr); +- } +- else +- addr = stack_pointer_rtx; +- +- rtx disp; +- if (parts.disp != NULL_RTX) +- disp = plus_constant (Pmode, parts.disp, +- UNITS_PER_WORD); +- else +- disp = GEN_INT (UNITS_PER_WORD); +- +- addr = gen_rtx_PLUS (Pmode, addr, disp); +- call_op = gen_rtx_MEM (GET_MODE (call_op), addr); +- } +- } +- +- output_asm_insn (push_buf, &call_op); +- +- ix86_output_jmp_thunk_or_indirect (thunk_name, regno); +- +- ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel2); +- +- /* Call. */ +- fputs ("\tcall\t", asm_out_file); +- assemble_name_raw (asm_out_file, indirectlabel1); +- fputc ('\n', asm_out_file); +- } +-} +- +-/* Output indirect branch via a call and return thunk. CALL_OP is +- the branch target. XASM is the assembly template for CALL_OP. +- Branch is a tail call if SIBCALL_P is true. */ +- +-static void +-ix86_output_indirect_branch (rtx call_op, const char *xasm, +- bool sibcall_p) +-{ +- if (REG_P (call_op)) +- ix86_output_indirect_branch_via_reg (call_op, sibcall_p); +- else +- ix86_output_indirect_branch_via_push (call_op, xasm, sibcall_p); +-} +- +-/* Output indirect jump. CALL_OP is the jump target. 
*/ +- +-const char * +-ix86_output_indirect_jmp (rtx call_op) +-{ +- if (cfun->machine->indirect_branch_type != indirect_branch_keep) +- { +- /* We can't have red-zone since "call" in the indirect thunk +- pushes the return address onto stack, destroying red-zone. */ +- if (ix86_red_zone_size != 0) +- gcc_unreachable (); +- +- ix86_output_indirect_branch (call_op, "%0", true); +- return ""; +- } +- else +- return "%!jmp\t%A0"; +-} +- +-/* Output return instrumentation for current function if needed. */ +- +-static void +-output_return_instrumentation (void) +-{ +- if (ix86_instrument_return != instrument_return_none +- && flag_fentry +- && !DECL_NO_INSTRUMENT_FUNCTION_ENTRY_EXIT (cfun->decl)) +- { +- if (ix86_flag_record_return) +- fprintf (asm_out_file, "1:\n"); +- switch (ix86_instrument_return) +- { +- case instrument_return_call: +- fprintf (asm_out_file, "\tcall\t__return__\n"); +- break; +- case instrument_return_nop5: +- /* 5 byte nop: nopl 0(%[re]ax,%[re]ax,1) */ +- fprintf (asm_out_file, ASM_BYTE "0x0f, 0x1f, 0x44, 0x00, 0x00\n"); +- break; +- case instrument_return_none: +- break; +- } +- +- if (ix86_flag_record_return) +- { +- fprintf (asm_out_file, "\t.section __return_loc, \"a\",@progbits\n"); +- fprintf (asm_out_file, "\t.%s 1b\n", TARGET_64BIT ? "quad" : "long"); +- fprintf (asm_out_file, "\t.previous\n"); +- } +- } +-} +- +-/* Output function return. CALL_OP is the jump target. Add a REP +- prefix to RET if LONG_P is true and function return is kept. */ +- +-const char * +-ix86_output_function_return (bool long_p) +-{ +- output_return_instrumentation (); +- +- if (cfun->machine->function_return_type != indirect_branch_keep) +- { +- char thunk_name[32]; +- enum indirect_thunk_prefix need_prefix +- = indirect_thunk_need_prefix (current_output_insn); +- +- if (cfun->machine->function_return_type +- != indirect_branch_thunk_inline) +- { +- bool need_thunk = (cfun->machine->function_return_type +- == indirect_branch_thunk); +- indirect_thunk_name (thunk_name, INVALID_REGNUM, need_prefix, +- true); +- indirect_return_needed |= need_thunk; +- fprintf (asm_out_file, "\tjmp\t"); +- assemble_name (asm_out_file, thunk_name); +- putc ('\n', asm_out_file); +- } +- else +- output_indirect_thunk (INVALID_REGNUM); +- +- return ""; +- } +- +- if (!long_p) +- return "%!ret"; +- +- return "rep%; ret"; +-} +- +-/* Output indirect function return. RET_OP is the function return +- target. */ +- +-const char * +-ix86_output_indirect_function_return (rtx ret_op) +-{ +- if (cfun->machine->function_return_type != indirect_branch_keep) +- { +- char thunk_name[32]; +- enum indirect_thunk_prefix need_prefix +- = indirect_thunk_need_prefix (current_output_insn); +- unsigned int regno = REGNO (ret_op); +- gcc_assert (regno == CX_REG); +- +- if (cfun->machine->function_return_type +- != indirect_branch_thunk_inline) +- { +- bool need_thunk = (cfun->machine->function_return_type +- == indirect_branch_thunk); +- indirect_thunk_name (thunk_name, regno, need_prefix, true); +- +- if (need_thunk) +- { +- indirect_return_via_cx = true; +- indirect_thunks_used |= 1 << CX_REG; +- } +- fprintf (asm_out_file, "\tjmp\t"); +- assemble_name (asm_out_file, thunk_name); +- putc ('\n', asm_out_file); +- } +- else +- output_indirect_thunk (regno); +- +- return ""; +- } +- else +- return "%!jmp\t%A0"; +-} +- +-/* Split simple return with popping POPC bytes from stack to indirect +- branch with stack adjustment . 
*/ +- +-void +-ix86_split_simple_return_pop_internal (rtx popc) +-{ +- struct machine_function *m = cfun->machine; +- rtx ecx = gen_rtx_REG (SImode, CX_REG); +- rtx_insn *insn; +- +- /* There is no "pascal" calling convention in any 64bit ABI. */ +- gcc_assert (!TARGET_64BIT); +- +- insn = emit_insn (gen_pop (ecx)); +- m->fs.cfa_offset -= UNITS_PER_WORD; +- m->fs.sp_offset -= UNITS_PER_WORD; +- +- rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD); +- x = gen_rtx_SET (stack_pointer_rtx, x); +- add_reg_note (insn, REG_CFA_ADJUST_CFA, x); +- add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx)); +- RTX_FRAME_RELATED_P (insn) = 1; +- +- x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc); +- x = gen_rtx_SET (stack_pointer_rtx, x); +- insn = emit_insn (x); +- add_reg_note (insn, REG_CFA_ADJUST_CFA, x); +- RTX_FRAME_RELATED_P (insn) = 1; +- +- /* Now return address is in ECX. */ +- emit_jump_insn (gen_simple_return_indirect_internal (ecx)); +-} +- +-/* Output the assembly for a call instruction. */ +- +-const char * +-ix86_output_call_insn (rtx_insn *insn, rtx call_op) +-{ +- bool direct_p = constant_call_address_operand (call_op, VOIDmode); +- bool output_indirect_p +- = (!TARGET_SEH +- && cfun->machine->indirect_branch_type != indirect_branch_keep); +- bool seh_nop_p = false; +- const char *xasm; +- +- if (SIBLING_CALL_P (insn)) +- { +- output_return_instrumentation (); +- if (direct_p) +- { +- if (ix86_nopic_noplt_attribute_p (call_op)) +- { +- direct_p = false; +- if (TARGET_64BIT) +- { +- if (output_indirect_p) +- xasm = "{%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}"; +- else +- xasm = "%!jmp\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}"; +- } +- else +- { +- if (output_indirect_p) +- xasm = "{%p0@GOT|[DWORD PTR %p0@GOT]}"; +- else +- xasm = "%!jmp\t{*%p0@GOT|[DWORD PTR %p0@GOT]}"; +- } +- } +- else +- xasm = "%!jmp\t%P0"; +- } +- /* SEH epilogue detection requires the indirect branch case +- to include REX.W. */ +- else if (TARGET_SEH) +- xasm = "%!rex.W jmp\t%A0"; +- else +- { +- if (output_indirect_p) +- xasm = "%0"; +- else +- xasm = "%!jmp\t%A0"; +- } +- +- if (output_indirect_p && !direct_p) +- ix86_output_indirect_branch (call_op, xasm, true); +- else +- output_asm_insn (xasm, &call_op); +- return ""; +- } +- +- /* SEH unwinding can require an extra nop to be emitted in several +- circumstances. Determine if we have one of those. */ +- if (TARGET_SEH) +- { +- rtx_insn *i; +- +- for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i)) +- { +- /* Prevent a catch region from being adjacent to a jump that would +- be interpreted as an epilogue sequence by the unwinder. */ +- if (JUMP_P(i) && CROSSING_JUMP_P (i)) +- { +- seh_nop_p = true; +- break; +- } +- +- /* If we get to another real insn, we don't need the nop. */ +- if (INSN_P (i)) +- break; +- +- /* If we get to the epilogue note, prevent a catch region from +- being adjacent to the standard epilogue sequence. If non- +- call-exceptions, we'll have done this during epilogue emission. */ +- if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG +- && !flag_non_call_exceptions +- && !can_throw_internal (insn)) +- { +- seh_nop_p = true; +- break; +- } +- } +- +- /* If we didn't find a real insn following the call, prevent the +- unwinder from looking into the next function. 
*/ +- if (i == NULL) +- seh_nop_p = true; +- } +- +- if (direct_p) +- { +- if (ix86_nopic_noplt_attribute_p (call_op)) +- { +- direct_p = false; +- if (TARGET_64BIT) +- { +- if (output_indirect_p) +- xasm = "{%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}"; +- else +- xasm = "%!call\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}"; +- } +- else +- { +- if (output_indirect_p) +- xasm = "{%p0@GOT|[DWORD PTR %p0@GOT]}"; +- else +- xasm = "%!call\t{*%p0@GOT|[DWORD PTR %p0@GOT]}"; +- } +- } +- else +- xasm = "%!call\t%P0"; +- } +- else +- { +- if (output_indirect_p) +- xasm = "%0"; +- else +- xasm = "%!call\t%A0"; +- } +- +- if (output_indirect_p && !direct_p) +- ix86_output_indirect_branch (call_op, xasm, false); +- else +- output_asm_insn (xasm, &call_op); +- +- if (seh_nop_p) +- return "nop"; +- +- return ""; +-} +- +-/* Clear stack slot assignments remembered from previous functions. +- This is called from INIT_EXPANDERS once before RTL is emitted for each +- function. */ +- +-static struct machine_function * +-ix86_init_machine_status (void) +-{ +- struct machine_function *f; +- +- f = ggc_cleared_alloc (); +- f->call_abi = ix86_abi; +- +- return f; +-} +- +-/* Return a MEM corresponding to a stack slot with mode MODE. +- Allocate a new slot if necessary. +- +- The RTL for a function can have several slots available: N is +- which slot to use. */ +- +-rtx +-assign_386_stack_local (machine_mode mode, enum ix86_stack_slot n) +-{ +- struct stack_local_entry *s; +- +- gcc_assert (n < MAX_386_STACK_LOCALS); +- +- for (s = ix86_stack_locals; s; s = s->next) +- if (s->mode == mode && s->n == n) +- return validize_mem (copy_rtx (s->rtl)); +- +- s = ggc_alloc (); +- s->n = n; +- s->mode = mode; +- s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0); +- +- s->next = ix86_stack_locals; +- ix86_stack_locals = s; +- return validize_mem (copy_rtx (s->rtl)); +-} +- +-static void +-ix86_instantiate_decls (void) +-{ +- struct stack_local_entry *s; +- +- for (s = ix86_stack_locals; s; s = s->next) +- if (s->rtl != NULL_RTX) +- instantiate_decl_rtl (s->rtl); +-} +- +-/* Check whether x86 address PARTS is a pc-relative address. */ +- +-bool +-ix86_rip_relative_addr_p (struct ix86_address *parts) +-{ +- rtx base, index, disp; +- +- base = parts->base; +- index = parts->index; +- disp = parts->disp; +- +- if (disp && !base && !index) +- { +- if (TARGET_64BIT) +- { +- rtx symbol = disp; +- +- if (GET_CODE (disp) == CONST) +- symbol = XEXP (disp, 0); +- if (GET_CODE (symbol) == PLUS +- && CONST_INT_P (XEXP (symbol, 1))) +- symbol = XEXP (symbol, 0); +- +- if (GET_CODE (symbol) == LABEL_REF +- || (GET_CODE (symbol) == SYMBOL_REF +- && SYMBOL_REF_TLS_MODEL (symbol) == 0) +- || (GET_CODE (symbol) == UNSPEC +- && (XINT (symbol, 1) == UNSPEC_GOTPCREL +- || XINT (symbol, 1) == UNSPEC_PCREL +- || XINT (symbol, 1) == UNSPEC_GOTNTPOFF))) +- return true; +- } +- } +- return false; +-} +- +-/* Calculate the length of the memory address in the instruction encoding. +- Includes addr32 prefix, does not include the one-byte modrm, opcode, +- or other prefixes. We never generate addr32 prefix for LEA insn. 
*/ +- +-int +-memory_address_length (rtx addr, bool lea) +-{ +- struct ix86_address parts; +- rtx base, index, disp; +- int len; +- int ok; +- +- if (GET_CODE (addr) == PRE_DEC +- || GET_CODE (addr) == POST_INC +- || GET_CODE (addr) == PRE_MODIFY +- || GET_CODE (addr) == POST_MODIFY) +- return 0; +- +- ok = ix86_decompose_address (addr, &parts); +- gcc_assert (ok); +- +- len = (parts.seg == ADDR_SPACE_GENERIC) ? 0 : 1; +- +- /* If this is not LEA instruction, add the length of addr32 prefix. */ +- if (TARGET_64BIT && !lea +- && (SImode_address_operand (addr, VOIDmode) +- || (parts.base && GET_MODE (parts.base) == SImode) +- || (parts.index && GET_MODE (parts.index) == SImode))) +- len++; +- +- base = parts.base; +- index = parts.index; +- disp = parts.disp; +- +- if (base && SUBREG_P (base)) +- base = SUBREG_REG (base); +- if (index && SUBREG_P (index)) +- index = SUBREG_REG (index); +- +- gcc_assert (base == NULL_RTX || REG_P (base)); +- gcc_assert (index == NULL_RTX || REG_P (index)); +- +- /* Rule of thumb: +- - esp as the base always wants an index, +- - ebp as the base always wants a displacement, +- - r12 as the base always wants an index, +- - r13 as the base always wants a displacement. */ +- +- /* Register Indirect. */ +- if (base && !index && !disp) +- { +- /* esp (for its index) and ebp (for its displacement) need +- the two-byte modrm form. Similarly for r12 and r13 in 64-bit +- code. */ +- if (base == arg_pointer_rtx +- || base == frame_pointer_rtx +- || REGNO (base) == SP_REG +- || REGNO (base) == BP_REG +- || REGNO (base) == R12_REG +- || REGNO (base) == R13_REG) +- len++; +- } +- +- /* Direct Addressing. In 64-bit mode mod 00 r/m 5 +- is not disp32, but disp32(%rip), so for disp32 +- SIB byte is needed, unless print_operand_address +- optimizes it into disp32(%rip) or (%rip) is implied +- by UNSPEC. */ +- else if (disp && !base && !index) +- { +- len += 4; +- if (!ix86_rip_relative_addr_p (&parts)) +- len++; +- } +- else +- { +- /* Find the length of the displacement constant. */ +- if (disp) +- { +- if (base && satisfies_constraint_K (disp)) +- len += 1; +- else +- len += 4; +- } +- /* ebp always wants a displacement. Similarly r13. */ +- else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG)) +- len++; +- +- /* An index requires the two-byte modrm form.... */ +- if (index +- /* ...like esp (or r12), which always wants an index. */ +- || base == arg_pointer_rtx +- || base == frame_pointer_rtx +- || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG))) +- len++; +- } +- +- return len; +-} +- +-/* Compute default value for "length_immediate" attribute. When SHORTFORM +- is set, expect that insn have 8bit immediate alternative. 
*/ +-int +-ix86_attr_length_immediate_default (rtx_insn *insn, bool shortform) +-{ +- int len = 0; +- int i; +- extract_insn_cached (insn); +- for (i = recog_data.n_operands - 1; i >= 0; --i) +- if (CONSTANT_P (recog_data.operand[i])) +- { +- enum attr_mode mode = get_attr_mode (insn); +- +- gcc_assert (!len); +- if (shortform && CONST_INT_P (recog_data.operand[i])) +- { +- HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]); +- switch (mode) +- { +- case MODE_QI: +- len = 1; +- continue; +- case MODE_HI: +- ival = trunc_int_for_mode (ival, HImode); +- break; +- case MODE_SI: +- ival = trunc_int_for_mode (ival, SImode); +- break; +- default: +- break; +- } +- if (IN_RANGE (ival, -128, 127)) +- { +- len = 1; +- continue; +- } +- } +- switch (mode) +- { +- case MODE_QI: +- len = 1; +- break; +- case MODE_HI: +- len = 2; +- break; +- case MODE_SI: +- len = 4; +- break; +- /* Immediates for DImode instructions are encoded +- as 32bit sign extended values. */ +- case MODE_DI: +- len = 4; +- break; +- default: +- fatal_insn ("unknown insn mode", insn); +- } +- } +- return len; +-} +- +-/* Compute default value for "length_address" attribute. */ +-int +-ix86_attr_length_address_default (rtx_insn *insn) +-{ +- int i; +- +- if (get_attr_type (insn) == TYPE_LEA) +- { +- rtx set = PATTERN (insn), addr; +- +- if (GET_CODE (set) == PARALLEL) +- set = XVECEXP (set, 0, 0); +- +- gcc_assert (GET_CODE (set) == SET); +- +- addr = SET_SRC (set); +- +- return memory_address_length (addr, true); +- } +- +- extract_insn_cached (insn); +- for (i = recog_data.n_operands - 1; i >= 0; --i) +- { +- rtx op = recog_data.operand[i]; +- if (MEM_P (op)) +- { +- constrain_operands_cached (insn, reload_completed); +- if (which_alternative != -1) +- { +- const char *constraints = recog_data.constraints[i]; +- int alt = which_alternative; +- +- while (*constraints == '=' || *constraints == '+') +- constraints++; +- while (alt-- > 0) +- while (*constraints++ != ',') +- ; +- /* Skip ignored operands. */ +- if (*constraints == 'X') +- continue; +- } +- +- int len = memory_address_length (XEXP (op, 0), false); +- +- /* Account for segment prefix for non-default addr spaces. */ +- if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (op))) +- len++; +- +- return len; +- } +- } +- return 0; +-} +- +-/* Compute default value for "length_vex" attribute. It includes +- 2 or 3 byte VEX prefix and 1 opcode byte. */ +- +-int +-ix86_attr_length_vex_default (rtx_insn *insn, bool has_0f_opcode, +- bool has_vex_w) +-{ +- int i; +- +- /* Only 0f opcode can use 2 byte VEX prefix and VEX W bit uses 3 +- byte VEX prefix. */ +- if (!has_0f_opcode || has_vex_w) +- return 3 + 1; +- +- /* We can always use 2 byte VEX prefix in 32bit. */ +- if (!TARGET_64BIT) +- return 2 + 1; +- +- extract_insn_cached (insn); +- +- for (i = recog_data.n_operands - 1; i >= 0; --i) +- if (REG_P (recog_data.operand[i])) +- { +- /* REX.W bit uses 3 byte VEX prefix. */ +- if (GET_MODE (recog_data.operand[i]) == DImode +- && GENERAL_REG_P (recog_data.operand[i])) +- return 3 + 1; +- } +- else +- { +- /* REX.X or REX.B bits use 3 byte VEX prefix. */ +- if (MEM_P (recog_data.operand[i]) +- && x86_extended_reg_mentioned_p (recog_data.operand[i])) +- return 3 + 1; +- } +- +- return 2 + 1; +-} +- +- +-static bool +-ix86_class_likely_spilled_p (reg_class_t); +- +-/* Returns true if lhs of insn is HW function argument register and set up +- is_spilled to true if it is likely spilled HW register. 
*/ +-static bool +-insn_is_function_arg (rtx insn, bool* is_spilled) +-{ +- rtx dst; +- +- if (!NONDEBUG_INSN_P (insn)) +- return false; +- /* Call instructions are not movable, ignore it. */ +- if (CALL_P (insn)) +- return false; +- insn = PATTERN (insn); +- if (GET_CODE (insn) == PARALLEL) +- insn = XVECEXP (insn, 0, 0); +- if (GET_CODE (insn) != SET) +- return false; +- dst = SET_DEST (insn); +- if (REG_P (dst) && HARD_REGISTER_P (dst) +- && ix86_function_arg_regno_p (REGNO (dst))) +- { +- /* Is it likely spilled HW register? */ +- if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst)) +- && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst)))) +- *is_spilled = true; +- return true; +- } +- return false; +-} +- +-/* Add output dependencies for chain of function adjacent arguments if only +- there is a move to likely spilled HW register. Return first argument +- if at least one dependence was added or NULL otherwise. */ +-static rtx_insn * +-add_parameter_dependencies (rtx_insn *call, rtx_insn *head) +-{ +- rtx_insn *insn; +- rtx_insn *last = call; +- rtx_insn *first_arg = NULL; +- bool is_spilled = false; +- +- head = PREV_INSN (head); +- +- /* Find nearest to call argument passing instruction. */ +- while (true) +- { +- last = PREV_INSN (last); +- if (last == head) +- return NULL; +- if (!NONDEBUG_INSN_P (last)) +- continue; +- if (insn_is_function_arg (last, &is_spilled)) +- break; +- return NULL; +- } +- +- first_arg = last; +- while (true) +- { +- insn = PREV_INSN (last); +- if (!INSN_P (insn)) +- break; +- if (insn == head) +- break; +- if (!NONDEBUG_INSN_P (insn)) +- { +- last = insn; +- continue; +- } +- if (insn_is_function_arg (insn, &is_spilled)) +- { +- /* Add output depdendence between two function arguments if chain +- of output arguments contains likely spilled HW registers. */ +- if (is_spilled) +- add_dependence (first_arg, insn, REG_DEP_OUTPUT); +- first_arg = last = insn; +- } +- else +- break; +- } +- if (!is_spilled) +- return NULL; +- return first_arg; +-} +- +-/* Add output or anti dependency from insn to first_arg to restrict its code +- motion. */ +-static void +-avoid_func_arg_motion (rtx_insn *first_arg, rtx_insn *insn) +-{ +- rtx set; +- rtx tmp; +- +- set = single_set (insn); +- if (!set) +- return; +- tmp = SET_DEST (set); +- if (REG_P (tmp)) +- { +- /* Add output dependency to the first function argument. */ +- add_dependence (first_arg, insn, REG_DEP_OUTPUT); +- return; +- } +- /* Add anti dependency. */ +- add_dependence (first_arg, insn, REG_DEP_ANTI); +-} +- +-/* Avoid cross block motion of function argument through adding dependency +- from the first non-jump instruction in bb. */ +-static void +-add_dependee_for_func_arg (rtx_insn *arg, basic_block bb) +-{ +- rtx_insn *insn = BB_END (bb); +- +- while (insn) +- { +- if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn)) +- { +- rtx set = single_set (insn); +- if (set) +- { +- avoid_func_arg_motion (arg, insn); +- return; +- } +- } +- if (insn == BB_HEAD (bb)) +- return; +- insn = PREV_INSN (insn); +- } +-} +- +-/* Hook for pre-reload schedule - avoid motion of function arguments +- passed in likely spilled HW registers. 
*/ +-static void +-ix86_dependencies_evaluation_hook (rtx_insn *head, rtx_insn *tail) +-{ +- rtx_insn *insn; +- rtx_insn *first_arg = NULL; +- if (reload_completed) +- return; +- while (head != tail && DEBUG_INSN_P (head)) +- head = NEXT_INSN (head); +- for (insn = tail; insn != head; insn = PREV_INSN (insn)) +- if (INSN_P (insn) && CALL_P (insn)) +- { +- first_arg = add_parameter_dependencies (insn, head); +- if (first_arg) +- { +- /* Add dependee for first argument to predecessors if only +- region contains more than one block. */ +- basic_block bb = BLOCK_FOR_INSN (insn); +- int rgn = CONTAINING_RGN (bb->index); +- int nr_blks = RGN_NR_BLOCKS (rgn); +- /* Skip trivial regions and region head blocks that can have +- predecessors outside of region. */ +- if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0) +- { +- edge e; +- edge_iterator ei; +- +- /* Regions are SCCs with the exception of selective +- scheduling with pipelining of outer blocks enabled. +- So also check that immediate predecessors of a non-head +- block are in the same region. */ +- FOR_EACH_EDGE (e, ei, bb->preds) +- { +- /* Avoid creating of loop-carried dependencies through +- using topological ordering in the region. */ +- if (rgn == CONTAINING_RGN (e->src->index) +- && BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index)) +- add_dependee_for_func_arg (first_arg, e->src); +- } +- } +- insn = first_arg; +- if (insn == head) +- break; +- } +- } +- else if (first_arg) +- avoid_func_arg_motion (first_arg, insn); +-} +- +-/* Hook for pre-reload schedule - set priority of moves from likely spilled +- HW registers to maximum, to schedule them at soon as possible. These are +- moves from function argument registers at the top of the function entry +- and moves from function return value registers after call. */ +-static int +-ix86_adjust_priority (rtx_insn *insn, int priority) +-{ +- rtx set; +- +- if (reload_completed) +- return priority; +- +- if (!NONDEBUG_INSN_P (insn)) +- return priority; +- +- set = single_set (insn); +- if (set) +- { +- rtx tmp = SET_SRC (set); +- if (REG_P (tmp) +- && HARD_REGISTER_P (tmp) +- && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp)) +- && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp)))) +- return current_sched_info->sched_max_insns_priority; +- } +- +- return priority; +-} +- +-/* Prepare for scheduling pass. */ +-static void +-ix86_sched_init_global (FILE *, int, int) +-{ +- /* Install scheduling hooks for current CPU. Some of these hooks are used +- in time-critical parts of the scheduler, so we only set them up when +- they are actually used. */ +- switch (ix86_tune) +- { +- case PROCESSOR_CORE2: +- case PROCESSOR_NEHALEM: +- case PROCESSOR_SANDYBRIDGE: +- case PROCESSOR_HASWELL: +- case PROCESSOR_GENERIC: +- /* Do not perform multipass scheduling for pre-reload schedule +- to save compile time. */ +- if (reload_completed) +- { +- ix86_core2i7_init_hooks (); +- break; +- } +- /* Fall through. */ +- default: +- targetm.sched.dfa_post_advance_cycle = NULL; +- targetm.sched.first_cycle_multipass_init = NULL; +- targetm.sched.first_cycle_multipass_begin = NULL; +- targetm.sched.first_cycle_multipass_issue = NULL; +- targetm.sched.first_cycle_multipass_backtrack = NULL; +- targetm.sched.first_cycle_multipass_end = NULL; +- targetm.sched.first_cycle_multipass_fini = NULL; +- break; +- } +-} +- +- +-/* Implement TARGET_STATIC_RTX_ALIGNMENT. 
*/ +- +-static HOST_WIDE_INT +-ix86_static_rtx_alignment (machine_mode mode) +-{ +- if (mode == DFmode) +- return 64; +- if (ALIGN_MODE_128 (mode)) +- return MAX (128, GET_MODE_ALIGNMENT (mode)); +- return GET_MODE_ALIGNMENT (mode); +-} +- +-/* Implement TARGET_CONSTANT_ALIGNMENT. */ +- +-static HOST_WIDE_INT +-ix86_constant_alignment (const_tree exp, HOST_WIDE_INT align) +-{ +- if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST +- || TREE_CODE (exp) == INTEGER_CST) +- { +- machine_mode mode = TYPE_MODE (TREE_TYPE (exp)); +- HOST_WIDE_INT mode_align = ix86_static_rtx_alignment (mode); +- return MAX (mode_align, align); +- } +- else if (!optimize_size && TREE_CODE (exp) == STRING_CST +- && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD) +- return BITS_PER_WORD; +- +- return align; +-} +- +-/* Implement TARGET_EMPTY_RECORD_P. */ +- +-static bool +-ix86_is_empty_record (const_tree type) +-{ +- if (!TARGET_64BIT) +- return false; +- return default_is_empty_record (type); +-} +- +-/* Implement TARGET_WARN_PARAMETER_PASSING_ABI. */ +- +-static void +-ix86_warn_parameter_passing_abi (cumulative_args_t cum_v, tree type) +-{ +- CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); +- +- if (!cum->warn_empty) +- return; +- +- if (!TYPE_EMPTY_P (type)) +- return; +- +- /* Don't warn if the function isn't visible outside of the TU. */ +- if (cum->decl && !TREE_PUBLIC (cum->decl)) +- return; +- +- const_tree ctx = get_ultimate_context (cum->decl); +- if (ctx != NULL_TREE +- && !TRANSLATION_UNIT_WARN_EMPTY_P (ctx)) +- return; +- +- /* If the actual size of the type is zero, then there is no change +- in how objects of this size are passed. */ +- if (int_size_in_bytes (type) == 0) +- return; +- +- warning (OPT_Wabi, "empty class %qT parameter passing ABI " +- "changes in %<-fabi-version=12%> (GCC 8)", type); +- +- /* Only warn once. */ +- cum->warn_empty = false; +-} +- +-/* This hook returns name of multilib ABI. */ +- +-static const char * +-ix86_get_multilib_abi_name (void) +-{ +- if (!(TARGET_64BIT_P (ix86_isa_flags))) +- return "i386"; +- else if (TARGET_X32_P (ix86_isa_flags)) +- return "x32"; +- else +- return "x86_64"; +-} +- +-/* Compute the alignment for a variable for Intel MCU psABI. TYPE is +- the data type, and ALIGN is the alignment that the object would +- ordinarily have. */ +- +-static int +-iamcu_alignment (tree type, int align) +-{ +- machine_mode mode; +- +- if (align < 32 || TYPE_USER_ALIGN (type)) +- return align; +- +- /* Intel MCU psABI specifies scalar types > 4 bytes aligned to 4 +- bytes. */ +- mode = TYPE_MODE (strip_array_types (type)); +- switch (GET_MODE_CLASS (mode)) +- { +- case MODE_INT: +- case MODE_COMPLEX_INT: +- case MODE_COMPLEX_FLOAT: +- case MODE_FLOAT: +- case MODE_DECIMAL_FLOAT: +- return 32; +- default: +- return align; +- } +-} +- +-/* Compute the alignment for a static variable. +- TYPE is the data type, and ALIGN is the alignment that +- the object would ordinarily have. The value of this function is used +- instead of that alignment to align the object. */ +- +-int +-ix86_data_alignment (tree type, int align, bool opt) +-{ +- /* GCC 4.8 and earlier used to incorrectly assume this alignment even +- for symbols from other compilation units or symbols that don't need +- to bind locally. In order to preserve some ABI compatibility with +- those compilers, ensure we don't decrease alignment from what we +- used to assume. 
*/ +- +- int max_align_compat = MIN (256, MAX_OFILE_ALIGNMENT); +- +- /* A data structure, equal or greater than the size of a cache line +- (64 bytes in the Pentium 4 and other recent Intel processors, including +- processors based on Intel Core microarchitecture) should be aligned +- so that its base address is a multiple of a cache line size. */ +- +- int max_align +- = MIN ((unsigned) ix86_tune_cost->prefetch_block * 8, MAX_OFILE_ALIGNMENT); +- +- if (max_align < BITS_PER_WORD) +- max_align = BITS_PER_WORD; +- +- switch (ix86_align_data_type) +- { +- case ix86_align_data_type_abi: opt = false; break; +- case ix86_align_data_type_compat: max_align = BITS_PER_WORD; break; +- case ix86_align_data_type_cacheline: break; +- } +- +- if (TARGET_IAMCU) +- align = iamcu_alignment (type, align); +- +- if (opt +- && AGGREGATE_TYPE_P (type) +- && TYPE_SIZE (type) +- && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST) +- { +- if (wi::geu_p (wi::to_wide (TYPE_SIZE (type)), max_align_compat) +- && align < max_align_compat) +- align = max_align_compat; +- if (wi::geu_p (wi::to_wide (TYPE_SIZE (type)), max_align) +- && align < max_align) +- align = max_align; +- } +- +- /* x86-64 ABI requires arrays greater than 16 bytes to be aligned +- to 16byte boundary. */ +- if (TARGET_64BIT) +- { +- if ((opt ? AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE) +- && TYPE_SIZE (type) +- && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST +- && wi::geu_p (wi::to_wide (TYPE_SIZE (type)), 128) +- && align < 128) +- return 128; +- } +- +- if (!opt) +- return align; +- +- if (TREE_CODE (type) == ARRAY_TYPE) +- { +- if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64) +- return 64; +- if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128) +- return 128; +- } +- else if (TREE_CODE (type) == COMPLEX_TYPE) +- { +- +- if (TYPE_MODE (type) == DCmode && align < 64) +- return 64; +- if ((TYPE_MODE (type) == XCmode +- || TYPE_MODE (type) == TCmode) && align < 128) +- return 128; +- } +- else if ((TREE_CODE (type) == RECORD_TYPE +- || TREE_CODE (type) == UNION_TYPE +- || TREE_CODE (type) == QUAL_UNION_TYPE) +- && TYPE_FIELDS (type)) +- { +- if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64) +- return 64; +- if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128) +- return 128; +- } +- else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE +- || TREE_CODE (type) == INTEGER_TYPE) +- { +- if (TYPE_MODE (type) == DFmode && align < 64) +- return 64; +- if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128) +- return 128; +- } +- +- return align; +-} +- +-/* Compute the alignment for a local variable or a stack slot. EXP is +- the data type or decl itself, MODE is the widest mode available and +- ALIGN is the alignment that the object would ordinarily have. The +- value of this macro is used instead of that alignment to align the +- object. */ +- +-unsigned int +-ix86_local_alignment (tree exp, machine_mode mode, +- unsigned int align) +-{ +- tree type, decl; +- +- if (exp && DECL_P (exp)) +- { +- type = TREE_TYPE (exp); +- decl = exp; +- } +- else +- { +- type = exp; +- decl = NULL; +- } +- +- /* Don't do dynamic stack realignment for long long objects with +- -mpreferred-stack-boundary=2. 
*/ +- if (!TARGET_64BIT +- && align == 64 +- && ix86_preferred_stack_boundary < 64 +- && (mode == DImode || (type && TYPE_MODE (type) == DImode)) +- && (!type || !TYPE_USER_ALIGN (type)) +- && (!decl || !DECL_USER_ALIGN (decl))) +- align = 32; +- +- /* If TYPE is NULL, we are allocating a stack slot for caller-save +- register in MODE. We will return the largest alignment of XF +- and DF. */ +- if (!type) +- { +- if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode)) +- align = GET_MODE_ALIGNMENT (DFmode); +- return align; +- } +- +- /* Don't increase alignment for Intel MCU psABI. */ +- if (TARGET_IAMCU) +- return align; +- +- /* x86-64 ABI requires arrays greater than 16 bytes to be aligned +- to 16byte boundary. Exact wording is: +- +- An array uses the same alignment as its elements, except that a local or +- global array variable of length at least 16 bytes or +- a C99 variable-length array variable always has alignment of at least 16 bytes. +- +- This was added to allow use of aligned SSE instructions at arrays. This +- rule is meant for static storage (where compiler cannot do the analysis +- by itself). We follow it for automatic variables only when convenient. +- We fully control everything in the function compiled and functions from +- other unit cannot rely on the alignment. +- +- Exclude va_list type. It is the common case of local array where +- we cannot benefit from the alignment. +- +- TODO: Probably one should optimize for size only when var is not escaping. */ +- if (TARGET_64BIT && optimize_function_for_speed_p (cfun) +- && TARGET_SSE) +- { +- if (AGGREGATE_TYPE_P (type) +- && (va_list_type_node == NULL_TREE +- || (TYPE_MAIN_VARIANT (type) +- != TYPE_MAIN_VARIANT (va_list_type_node))) +- && TYPE_SIZE (type) +- && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST +- && wi::geu_p (wi::to_wide (TYPE_SIZE (type)), 128) +- && align < 128) +- return 128; +- } +- if (TREE_CODE (type) == ARRAY_TYPE) +- { +- if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64) +- return 64; +- if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128) +- return 128; +- } +- else if (TREE_CODE (type) == COMPLEX_TYPE) +- { +- if (TYPE_MODE (type) == DCmode && align < 64) +- return 64; +- if ((TYPE_MODE (type) == XCmode +- || TYPE_MODE (type) == TCmode) && align < 128) +- return 128; +- } +- else if ((TREE_CODE (type) == RECORD_TYPE +- || TREE_CODE (type) == UNION_TYPE +- || TREE_CODE (type) == QUAL_UNION_TYPE) +- && TYPE_FIELDS (type)) +- { +- if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64) +- return 64; +- if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128) +- return 128; +- } +- else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE +- || TREE_CODE (type) == INTEGER_TYPE) +- { +- +- if (TYPE_MODE (type) == DFmode && align < 64) +- return 64; +- if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128) +- return 128; +- } +- return align; +-} +- +-/* Compute the minimum required alignment for dynamic stack realignment +- purposes for a local variable, parameter or a stack slot. EXP is +- the data type or decl itself, MODE is its mode and ALIGN is the +- alignment that the object would ordinarily have. 
*/ +- +-unsigned int +-ix86_minimum_alignment (tree exp, machine_mode mode, +- unsigned int align) +-{ +- tree type, decl; +- +- if (exp && DECL_P (exp)) +- { +- type = TREE_TYPE (exp); +- decl = exp; +- } +- else +- { +- type = exp; +- decl = NULL; +- } +- +- if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64) +- return align; +- +- /* Don't do dynamic stack realignment for long long objects with +- -mpreferred-stack-boundary=2. */ +- if ((mode == DImode || (type && TYPE_MODE (type) == DImode)) +- && (!type || !TYPE_USER_ALIGN (type)) +- && (!decl || !DECL_USER_ALIGN (decl))) +- { +- gcc_checking_assert (!TARGET_STV); +- return 32; +- } +- +- return align; +-} +- +-/* Find a location for the static chain incoming to a nested function. +- This is a register, unless all free registers are used by arguments. */ +- +-static rtx +-ix86_static_chain (const_tree fndecl_or_type, bool incoming_p) +-{ +- unsigned regno; +- +- if (TARGET_64BIT) +- { +- /* We always use R10 in 64-bit mode. */ +- regno = R10_REG; +- } +- else +- { +- const_tree fntype, fndecl; +- unsigned int ccvt; +- +- /* By default in 32-bit mode we use ECX to pass the static chain. */ +- regno = CX_REG; +- +- if (TREE_CODE (fndecl_or_type) == FUNCTION_DECL) +- { +- fntype = TREE_TYPE (fndecl_or_type); +- fndecl = fndecl_or_type; +- } +- else +- { +- fntype = fndecl_or_type; +- fndecl = NULL; +- } +- +- ccvt = ix86_get_callcvt (fntype); +- if ((ccvt & IX86_CALLCVT_FASTCALL) != 0) +- { +- /* Fastcall functions use ecx/edx for arguments, which leaves +- us with EAX for the static chain. +- Thiscall functions use ecx for arguments, which also +- leaves us with EAX for the static chain. */ +- regno = AX_REG; +- } +- else if ((ccvt & IX86_CALLCVT_THISCALL) != 0) +- { +- /* Thiscall functions use ecx for arguments, which leaves +- us with EAX and EDX for the static chain. +- We are using for abi-compatibility EAX. */ +- regno = AX_REG; +- } +- else if (ix86_function_regparm (fntype, fndecl) == 3) +- { +- /* For regparm 3, we have no free call-clobbered registers in +- which to store the static chain. In order to implement this, +- we have the trampoline push the static chain to the stack. +- However, we can't push a value below the return address when +- we call the nested function directly, so we have to use an +- alternate entry point. For this we use ESI, and have the +- alternate entry point push ESI, so that things appear the +- same once we're executing the nested function. */ +- if (incoming_p) +- { +- if (fndecl == current_function_decl +- && !ix86_static_chain_on_stack) +- { +- gcc_assert (!reload_completed); +- ix86_static_chain_on_stack = true; +- } +- return gen_frame_mem (SImode, +- plus_constant (Pmode, +- arg_pointer_rtx, -8)); +- } +- regno = SI_REG; +- } +- } +- +- return gen_rtx_REG (Pmode, regno); +-} +- +-/* Emit RTL insns to initialize the variable parts of a trampoline. +- FNDECL is the decl of the target address; M_TRAMP is a MEM for +- the trampoline, and CHAIN_VALUE is an RTX for the static chain +- to be passed to the target function. */ +- +-static void +-ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value) +-{ +- rtx mem, fnaddr; +- int opcode; +- int offset = 0; +- bool need_endbr = (flag_cf_protection & CF_BRANCH); +- +- fnaddr = XEXP (DECL_RTL (fndecl), 0); +- +- if (TARGET_64BIT) +- { +- int size; +- +- if (need_endbr) +- { +- /* Insert ENDBR64. 
*/ +- mem = adjust_address (m_tramp, SImode, offset); +- emit_move_insn (mem, gen_int_mode (0xfa1e0ff3, SImode)); +- offset += 4; +- } +- +- /* Load the function address to r11. Try to load address using +- the shorter movl instead of movabs. We may want to support +- movq for kernel mode, but kernel does not use trampolines at +- the moment. FNADDR is a 32bit address and may not be in +- DImode when ptr_mode == SImode. Always use movl in this +- case. */ +- if (ptr_mode == SImode +- || x86_64_zext_immediate_operand (fnaddr, VOIDmode)) +- { +- fnaddr = copy_addr_to_reg (fnaddr); +- +- mem = adjust_address (m_tramp, HImode, offset); +- emit_move_insn (mem, gen_int_mode (0xbb41, HImode)); +- +- mem = adjust_address (m_tramp, SImode, offset + 2); +- emit_move_insn (mem, gen_lowpart (SImode, fnaddr)); +- offset += 6; +- } +- else +- { +- mem = adjust_address (m_tramp, HImode, offset); +- emit_move_insn (mem, gen_int_mode (0xbb49, HImode)); +- +- mem = adjust_address (m_tramp, DImode, offset + 2); +- emit_move_insn (mem, fnaddr); +- offset += 10; +- } +- +- /* Load static chain using movabs to r10. Use the shorter movl +- instead of movabs when ptr_mode == SImode. */ +- if (ptr_mode == SImode) +- { +- opcode = 0xba41; +- size = 6; +- } +- else +- { +- opcode = 0xba49; +- size = 10; +- } +- +- mem = adjust_address (m_tramp, HImode, offset); +- emit_move_insn (mem, gen_int_mode (opcode, HImode)); +- +- mem = adjust_address (m_tramp, ptr_mode, offset + 2); +- emit_move_insn (mem, chain_value); +- offset += size; +- +- /* Jump to r11; the last (unused) byte is a nop, only there to +- pad the write out to a single 32-bit store. */ +- mem = adjust_address (m_tramp, SImode, offset); +- emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode)); +- offset += 4; +- } +- else +- { +- rtx disp, chain; +- +- /* Depending on the static chain location, either load a register +- with a constant, or push the constant to the stack. All of the +- instructions are the same size. */ +- chain = ix86_static_chain (fndecl, true); +- if (REG_P (chain)) +- { +- switch (REGNO (chain)) +- { +- case AX_REG: +- opcode = 0xb8; break; +- case CX_REG: +- opcode = 0xb9; break; +- default: +- gcc_unreachable (); +- } +- } +- else +- opcode = 0x68; +- +- if (need_endbr) +- { +- /* Insert ENDBR32. */ +- mem = adjust_address (m_tramp, SImode, offset); +- emit_move_insn (mem, gen_int_mode (0xfb1e0ff3, SImode)); +- offset += 4; +- } +- +- mem = adjust_address (m_tramp, QImode, offset); +- emit_move_insn (mem, gen_int_mode (opcode, QImode)); +- +- mem = adjust_address (m_tramp, SImode, offset + 1); +- emit_move_insn (mem, chain_value); +- offset += 5; +- +- mem = adjust_address (m_tramp, QImode, offset); +- emit_move_insn (mem, gen_int_mode (0xe9, QImode)); +- +- mem = adjust_address (m_tramp, SImode, offset + 1); +- +- /* Compute offset from the end of the jmp to the target function. +- In the case in which the trampoline stores the static chain on +- the stack, we need to skip the first insn which pushes the +- (call-saved) register static chain; this push is 1 byte. */ +- offset += 5; +- int skip = MEM_P (chain) ? 1 : 0; +- /* Skip ENDBR32 at the entry of the target function. 
*/ +- if (need_endbr +- && !cgraph_node::get (fndecl)->only_called_directly_p ()) +- skip += 4; +- disp = expand_binop (SImode, sub_optab, fnaddr, +- plus_constant (Pmode, XEXP (m_tramp, 0), +- offset - skip), +- NULL_RTX, 1, OPTAB_DIRECT); +- emit_move_insn (mem, disp); +- } +- +- gcc_assert (offset <= TRAMPOLINE_SIZE); +- +-#ifdef HAVE_ENABLE_EXECUTE_STACK +-#ifdef CHECK_EXECUTE_STACK_ENABLED +- if (CHECK_EXECUTE_STACK_ENABLED) +-#endif +- emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"), +- LCT_NORMAL, VOIDmode, XEXP (m_tramp, 0), Pmode); +-#endif +-} +- +-static bool +-ix86_allocate_stack_slots_for_args (void) +-{ +- /* Naked functions should not allocate stack slots for arguments. */ +- return !ix86_function_naked (current_function_decl); +-} +- +-static bool +-ix86_warn_func_return (tree decl) +-{ +- /* Naked functions are implemented entirely in assembly, including the +- return sequence, so suppress warnings about this. */ +- return !ix86_function_naked (decl); +-} +- +-/* The following file contains several enumerations and data structures +- built from the definitions in i386-builtin-types.def. */ +- +-#include "i386-builtin-types.inc" +- +-/* Table for the ix86 builtin non-function types. */ +-static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1]; +- +-/* Retrieve an element from the above table, building some of +- the types lazily. */ +- +-static tree +-ix86_get_builtin_type (enum ix86_builtin_type tcode) +-{ +- unsigned int index; +- tree type, itype; +- +- gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab)); +- +- type = ix86_builtin_type_tab[(int) tcode]; +- if (type != NULL) +- return type; +- +- gcc_assert (tcode > IX86_BT_LAST_PRIM); +- if (tcode <= IX86_BT_LAST_VECT) +- { +- machine_mode mode; +- +- index = tcode - IX86_BT_LAST_PRIM - 1; +- itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]); +- mode = ix86_builtin_type_vect_mode[index]; +- +- type = build_vector_type_for_mode (itype, mode); +- } +- else +- { +- int quals; +- +- index = tcode - IX86_BT_LAST_VECT - 1; +- if (tcode <= IX86_BT_LAST_PTR) +- quals = TYPE_UNQUALIFIED; +- else +- quals = TYPE_QUAL_CONST; +- +- itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]); +- if (quals != TYPE_UNQUALIFIED) +- itype = build_qualified_type (itype, quals); +- +- type = build_pointer_type (itype); +- } +- +- ix86_builtin_type_tab[(int) tcode] = type; +- return type; +-} +- +-/* Table for the ix86 builtin function types. */ +-static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1]; +- +-/* Retrieve an element from the above table, building some of +- the types lazily. 
*/ +- +-static tree +-ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode) +-{ +- tree type; +- +- gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab)); +- +- type = ix86_builtin_func_type_tab[(int) tcode]; +- if (type != NULL) +- return type; +- +- if (tcode <= IX86_BT_LAST_FUNC) +- { +- unsigned start = ix86_builtin_func_start[(int) tcode]; +- unsigned after = ix86_builtin_func_start[(int) tcode + 1]; +- tree rtype, atype, args = void_list_node; +- unsigned i; +- +- rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]); +- for (i = after - 1; i > start; --i) +- { +- atype = ix86_get_builtin_type (ix86_builtin_func_args[i]); +- args = tree_cons (NULL, atype, args); +- } +- +- type = build_function_type (rtype, args); +- } +- else +- { +- unsigned index = tcode - IX86_BT_LAST_FUNC - 1; +- enum ix86_builtin_func_type icode; +- +- icode = ix86_builtin_func_alias_base[index]; +- type = ix86_get_builtin_func_type (icode); +- } +- +- ix86_builtin_func_type_tab[(int) tcode] = type; +- return type; +-} +- +- +-/* Codes for all the SSE/MMX builtins. Builtins not mentioned in any +- bdesc_* arrays below should come first, then builtins for each bdesc_* +- array in ascending order, so that we can use direct array accesses. */ +-enum ix86_builtins +-{ +- IX86_BUILTIN_MASKMOVQ, +- IX86_BUILTIN_LDMXCSR, +- IX86_BUILTIN_STMXCSR, +- IX86_BUILTIN_MASKMOVDQU, +- IX86_BUILTIN_PSLLDQ128, +- IX86_BUILTIN_CLFLUSH, +- IX86_BUILTIN_MONITOR, +- IX86_BUILTIN_MWAIT, +- IX86_BUILTIN_UMONITOR, +- IX86_BUILTIN_UMWAIT, +- IX86_BUILTIN_TPAUSE, +- IX86_BUILTIN_CLZERO, +- IX86_BUILTIN_CLDEMOTE, +- IX86_BUILTIN_VEC_INIT_V2SI, +- IX86_BUILTIN_VEC_INIT_V4HI, +- IX86_BUILTIN_VEC_INIT_V8QI, +- IX86_BUILTIN_VEC_EXT_V2DF, +- IX86_BUILTIN_VEC_EXT_V2DI, +- IX86_BUILTIN_VEC_EXT_V4SF, +- IX86_BUILTIN_VEC_EXT_V4SI, +- IX86_BUILTIN_VEC_EXT_V8HI, +- IX86_BUILTIN_VEC_EXT_V2SI, +- IX86_BUILTIN_VEC_EXT_V4HI, +- IX86_BUILTIN_VEC_EXT_V16QI, +- IX86_BUILTIN_VEC_SET_V2DI, +- IX86_BUILTIN_VEC_SET_V4SF, +- IX86_BUILTIN_VEC_SET_V4SI, +- IX86_BUILTIN_VEC_SET_V8HI, +- IX86_BUILTIN_VEC_SET_V4HI, +- IX86_BUILTIN_VEC_SET_V16QI, +- IX86_BUILTIN_GATHERSIV2DF, +- IX86_BUILTIN_GATHERSIV4DF, +- IX86_BUILTIN_GATHERDIV2DF, +- IX86_BUILTIN_GATHERDIV4DF, +- IX86_BUILTIN_GATHERSIV4SF, +- IX86_BUILTIN_GATHERSIV8SF, +- IX86_BUILTIN_GATHERDIV4SF, +- IX86_BUILTIN_GATHERDIV8SF, +- IX86_BUILTIN_GATHERSIV2DI, +- IX86_BUILTIN_GATHERSIV4DI, +- IX86_BUILTIN_GATHERDIV2DI, +- IX86_BUILTIN_GATHERDIV4DI, +- IX86_BUILTIN_GATHERSIV4SI, +- IX86_BUILTIN_GATHERSIV8SI, +- IX86_BUILTIN_GATHERDIV4SI, +- IX86_BUILTIN_GATHERDIV8SI, +- IX86_BUILTIN_GATHER3SIV8SF, +- IX86_BUILTIN_GATHER3SIV4SF, +- IX86_BUILTIN_GATHER3SIV4DF, +- IX86_BUILTIN_GATHER3SIV2DF, +- IX86_BUILTIN_GATHER3DIV8SF, +- IX86_BUILTIN_GATHER3DIV4SF, +- IX86_BUILTIN_GATHER3DIV4DF, +- IX86_BUILTIN_GATHER3DIV2DF, +- IX86_BUILTIN_GATHER3SIV8SI, +- IX86_BUILTIN_GATHER3SIV4SI, +- IX86_BUILTIN_GATHER3SIV4DI, +- IX86_BUILTIN_GATHER3SIV2DI, +- IX86_BUILTIN_GATHER3DIV8SI, +- IX86_BUILTIN_GATHER3DIV4SI, +- IX86_BUILTIN_GATHER3DIV4DI, +- IX86_BUILTIN_GATHER3DIV2DI, +- IX86_BUILTIN_SCATTERSIV8SF, +- IX86_BUILTIN_SCATTERSIV4SF, +- IX86_BUILTIN_SCATTERSIV4DF, +- IX86_BUILTIN_SCATTERSIV2DF, +- IX86_BUILTIN_SCATTERDIV8SF, +- IX86_BUILTIN_SCATTERDIV4SF, +- IX86_BUILTIN_SCATTERDIV4DF, +- IX86_BUILTIN_SCATTERDIV2DF, +- IX86_BUILTIN_SCATTERSIV8SI, +- IX86_BUILTIN_SCATTERSIV4SI, +- IX86_BUILTIN_SCATTERSIV4DI, +- IX86_BUILTIN_SCATTERSIV2DI, +- IX86_BUILTIN_SCATTERDIV8SI, +- IX86_BUILTIN_SCATTERDIV4SI, +- 
IX86_BUILTIN_SCATTERDIV4DI, +- IX86_BUILTIN_SCATTERDIV2DI, +- /* Alternate 4 and 8 element gather/scatter for the vectorizer +- where all operands are 32-byte or 64-byte wide respectively. */ +- IX86_BUILTIN_GATHERALTSIV4DF, +- IX86_BUILTIN_GATHERALTDIV8SF, +- IX86_BUILTIN_GATHERALTSIV4DI, +- IX86_BUILTIN_GATHERALTDIV8SI, +- IX86_BUILTIN_GATHER3ALTDIV16SF, +- IX86_BUILTIN_GATHER3ALTDIV16SI, +- IX86_BUILTIN_GATHER3ALTSIV4DF, +- IX86_BUILTIN_GATHER3ALTDIV8SF, +- IX86_BUILTIN_GATHER3ALTSIV4DI, +- IX86_BUILTIN_GATHER3ALTDIV8SI, +- IX86_BUILTIN_GATHER3ALTSIV8DF, +- IX86_BUILTIN_GATHER3ALTSIV8DI, +- IX86_BUILTIN_GATHER3DIV16SF, +- IX86_BUILTIN_GATHER3DIV16SI, +- IX86_BUILTIN_GATHER3DIV8DF, +- IX86_BUILTIN_GATHER3DIV8DI, +- IX86_BUILTIN_GATHER3SIV16SF, +- IX86_BUILTIN_GATHER3SIV16SI, +- IX86_BUILTIN_GATHER3SIV8DF, +- IX86_BUILTIN_GATHER3SIV8DI, +- IX86_BUILTIN_SCATTERALTSIV8DF, +- IX86_BUILTIN_SCATTERALTDIV16SF, +- IX86_BUILTIN_SCATTERALTSIV8DI, +- IX86_BUILTIN_SCATTERALTDIV16SI, +- IX86_BUILTIN_SCATTERALTSIV4DF, +- IX86_BUILTIN_SCATTERALTDIV8SF, +- IX86_BUILTIN_SCATTERALTSIV4DI, +- IX86_BUILTIN_SCATTERALTDIV8SI, +- IX86_BUILTIN_SCATTERALTSIV2DF, +- IX86_BUILTIN_SCATTERALTDIV4SF, +- IX86_BUILTIN_SCATTERALTSIV2DI, +- IX86_BUILTIN_SCATTERALTDIV4SI, +- IX86_BUILTIN_SCATTERDIV16SF, +- IX86_BUILTIN_SCATTERDIV16SI, +- IX86_BUILTIN_SCATTERDIV8DF, +- IX86_BUILTIN_SCATTERDIV8DI, +- IX86_BUILTIN_SCATTERSIV16SF, +- IX86_BUILTIN_SCATTERSIV16SI, +- IX86_BUILTIN_SCATTERSIV8DF, +- IX86_BUILTIN_SCATTERSIV8DI, +- IX86_BUILTIN_GATHERPFQPD, +- IX86_BUILTIN_GATHERPFDPS, +- IX86_BUILTIN_GATHERPFDPD, +- IX86_BUILTIN_GATHERPFQPS, +- IX86_BUILTIN_SCATTERPFDPD, +- IX86_BUILTIN_SCATTERPFDPS, +- IX86_BUILTIN_SCATTERPFQPD, +- IX86_BUILTIN_SCATTERPFQPS, +- IX86_BUILTIN_CLWB, +- IX86_BUILTIN_CLFLUSHOPT, +- IX86_BUILTIN_INFQ, +- IX86_BUILTIN_HUGE_VALQ, +- IX86_BUILTIN_NANQ, +- IX86_BUILTIN_NANSQ, +- IX86_BUILTIN_XABORT, +- IX86_BUILTIN_ADDCARRYX32, +- IX86_BUILTIN_ADDCARRYX64, +- IX86_BUILTIN_SBB32, +- IX86_BUILTIN_SBB64, +- IX86_BUILTIN_RDRAND16_STEP, +- IX86_BUILTIN_RDRAND32_STEP, +- IX86_BUILTIN_RDRAND64_STEP, +- IX86_BUILTIN_RDSEED16_STEP, +- IX86_BUILTIN_RDSEED32_STEP, +- IX86_BUILTIN_RDSEED64_STEP, +- IX86_BUILTIN_MONITORX, +- IX86_BUILTIN_MWAITX, +- IX86_BUILTIN_CFSTRING, +- IX86_BUILTIN_CPU_INIT, +- IX86_BUILTIN_CPU_IS, +- IX86_BUILTIN_CPU_SUPPORTS, +- IX86_BUILTIN_READ_FLAGS, +- IX86_BUILTIN_WRITE_FLAGS, +- +- /* All the remaining builtins are tracked in bdesc_* arrays in +- i386-builtin.def. Don't add any IX86_BUILTIN_* enumerators after +- this point. */ +-#define BDESC(mask, mask2, icode, name, code, comparison, flag) \ +- code, +-#define BDESC_FIRST(kind, kindu, mask, mask2, icode, name, code, comparison, flag) \ +- code, \ +- IX86_BUILTIN__BDESC_##kindu##_FIRST = code, +-#define BDESC_END(kind, next_kind) +- +-#include "i386-builtin.def" +- +-#undef BDESC +-#undef BDESC_FIRST +-#undef BDESC_END +- +- IX86_BUILTIN_MAX, +- +- IX86_BUILTIN__BDESC_MAX_FIRST = IX86_BUILTIN_MAX, +- +- /* Now just the aliases for bdesc_* start/end. */ +-#define BDESC(mask, mask2, icode, name, code, comparison, flag) +-#define BDESC_FIRST(kind, kindu, mask, mask2, icode, name, code, comparison, flag) +-#define BDESC_END(kind, next_kind) \ +- IX86_BUILTIN__BDESC_##kind##_LAST \ +- = IX86_BUILTIN__BDESC_##next_kind##_FIRST - 1, +- +-#include "i386-builtin.def" +- +-#undef BDESC +-#undef BDESC_FIRST +-#undef BDESC_END +- +- /* Just to make sure there is no comma after the last enumerator. 
*/ +- IX86_BUILTIN__BDESC_MAX_LAST = IX86_BUILTIN__BDESC_MAX_FIRST +-}; +- +-/* Table for the ix86 builtin decls. */ +-static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX]; +- +-/* Table of all of the builtin functions that are possible with different ISA's +- but are waiting to be built until a function is declared to use that +- ISA. */ +-struct builtin_isa { +- HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */ +- HOST_WIDE_INT isa2; /* additional isa_flags this builtin is defined for */ +- const char *name; /* function name */ +- enum ix86_builtin_func_type tcode; /* type to use in the declaration */ +- unsigned char const_p:1; /* true if the declaration is constant */ +- unsigned char pure_p:1; /* true if the declaration has pure attribute */ +- bool set_and_not_built_p; +-}; +- +-static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX]; +- +-/* Bits that can still enable any inclusion of a builtin. */ +-static HOST_WIDE_INT deferred_isa_values = 0; +-static HOST_WIDE_INT deferred_isa_values2 = 0; +- +-/* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the +- MASK and MASK2 of which isa_flags and ix86_isa_flags2 to use in the +- ix86_builtins_isa array. Stores the function decl in the ix86_builtins +- array. Returns the function decl or NULL_TREE, if the builtin was not +- added. +- +- If the front end has a special hook for builtin functions, delay adding +- builtin functions that aren't in the current ISA until the ISA is changed +- with function specific optimization. Doing so, can save about 300K for the +- default compiler. When the builtin is expanded, check at that time whether +- it is valid. +- +- If the front end doesn't have a special hook, record all builtins, even if +- it isn't an instruction set in the current ISA in case the user uses +- function specific options for a different ISA, so that we don't get scope +- errors if a builtin is added in the middle of a function scope. */ +- +-static inline tree +-def_builtin (HOST_WIDE_INT mask, HOST_WIDE_INT mask2, +- const char *name, +- enum ix86_builtin_func_type tcode, +- enum ix86_builtins code) +-{ +- tree decl = NULL_TREE; +- +- /* An instruction may be 64bit only regardless of ISAs. */ +- if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT) +- { +- ix86_builtins_isa[(int) code].isa = mask; +- ix86_builtins_isa[(int) code].isa2 = mask2; +- +- mask &= ~OPTION_MASK_ISA_64BIT; +- +- /* Filter out the masks most often ored together with others. */ +- if ((mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512VL) +- && mask != OPTION_MASK_ISA_AVX512VL) +- mask &= ~OPTION_MASK_ISA_AVX512VL; +- if ((mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512BW) +- && mask != OPTION_MASK_ISA_AVX512BW) +- mask &= ~OPTION_MASK_ISA_AVX512BW; +- +- if (((mask2 == 0 || (mask2 & ix86_isa_flags2) != 0) +- && (mask == 0 || (mask & ix86_isa_flags) != 0)) +- || (lang_hooks.builtin_function +- == lang_hooks.builtin_function_ext_scope)) +- { +- tree type = ix86_get_builtin_func_type (tcode); +- decl = add_builtin_function (name, type, code, BUILT_IN_MD, +- NULL, NULL_TREE); +- ix86_builtins[(int) code] = decl; +- ix86_builtins_isa[(int) code].set_and_not_built_p = false; +- } +- else +- { +- /* Just MASK and MASK2 where set_and_not_built_p == true can potentially +- include a builtin. 
*/ +- deferred_isa_values |= mask; +- deferred_isa_values2 |= mask2; +- ix86_builtins[(int) code] = NULL_TREE; +- ix86_builtins_isa[(int) code].tcode = tcode; +- ix86_builtins_isa[(int) code].name = name; +- ix86_builtins_isa[(int) code].const_p = false; +- ix86_builtins_isa[(int) code].pure_p = false; +- ix86_builtins_isa[(int) code].set_and_not_built_p = true; +- } +- } +- +- return decl; +-} +- +-/* Like def_builtin, but also marks the function decl "const". */ +- +-static inline tree +-def_builtin_const (HOST_WIDE_INT mask, HOST_WIDE_INT mask2, const char *name, +- enum ix86_builtin_func_type tcode, enum ix86_builtins code) +-{ +- tree decl = def_builtin (mask, mask2, name, tcode, code); +- if (decl) +- TREE_READONLY (decl) = 1; +- else +- ix86_builtins_isa[(int) code].const_p = true; +- +- return decl; +-} +- +-/* Like def_builtin, but also marks the function decl "pure". */ +- +-static inline tree +-def_builtin_pure (HOST_WIDE_INT mask, HOST_WIDE_INT mask2, const char *name, +- enum ix86_builtin_func_type tcode, enum ix86_builtins code) +-{ +- tree decl = def_builtin (mask, mask2, name, tcode, code); +- if (decl) +- DECL_PURE_P (decl) = 1; +- else +- ix86_builtins_isa[(int) code].pure_p = true; +- +- return decl; +-} +- +-/* Add any new builtin functions for a given ISA that may not have been +- declared. This saves a bit of space compared to adding all of the +- declarations to the tree, even if we didn't use them. */ +- +-static void +-ix86_add_new_builtins (HOST_WIDE_INT isa, HOST_WIDE_INT isa2) +-{ +- isa &= ~OPTION_MASK_ISA_64BIT; +- +- if ((isa & deferred_isa_values) == 0 +- && (isa2 & deferred_isa_values2) == 0) +- return; +- +- /* Bits in ISA value can be removed from potential isa values. */ +- deferred_isa_values &= ~isa; +- deferred_isa_values2 &= ~isa2; +- +- int i; +- tree saved_current_target_pragma = current_target_pragma; +- current_target_pragma = NULL_TREE; +- +- for (i = 0; i < (int)IX86_BUILTIN_MAX; i++) +- { +- if (((ix86_builtins_isa[i].isa & isa) != 0 +- || (ix86_builtins_isa[i].isa2 & isa2) != 0) +- && ix86_builtins_isa[i].set_and_not_built_p) +- { +- tree decl, type; +- +- /* Don't define the builtin again. */ +- ix86_builtins_isa[i].set_and_not_built_p = false; +- +- type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode); +- decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name, +- type, i, BUILT_IN_MD, NULL, +- NULL_TREE); +- +- ix86_builtins[i] = decl; +- if (ix86_builtins_isa[i].const_p) +- TREE_READONLY (decl) = 1; +- } +- } +- +- current_target_pragma = saved_current_target_pragma; +-} +- +-/* Bits for builtin_description.flag. */ +- +-/* Set when we don't support the comparison natively, and should +- swap_comparison in order to support it. 
*/ +-#define BUILTIN_DESC_SWAP_OPERANDS 1 +- +-struct builtin_description +-{ +- const HOST_WIDE_INT mask; +- const HOST_WIDE_INT mask2; +- const enum insn_code icode; +- const char *const name; +- const enum ix86_builtins code; +- const enum rtx_code comparison; +- const int flag; +-}; +- +-#define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT +-#define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT +-#define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT +-#define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT +-#define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF +-#define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF +-#define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF +-#define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF +-#define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI +-#define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI +-#define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI +-#define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI +-#define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI +-#define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI +-#define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI +-#define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI +-#define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI +-#define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI +-#define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF +-#define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF +-#define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI +-#define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI +-#define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI +-#define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI +-#define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI +-#define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI +-#define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI +-#define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI +-#define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP +-#define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP +-#define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP +-#define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP +-#define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF +-#define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF +-#define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF +-#define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF +-#define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF +-#define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF +-#define MULTI_ARG_1_SF V4SF_FTYPE_V4SF +-#define MULTI_ARG_1_DF V2DF_FTYPE_V2DF +-#define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF +-#define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF +-#define MULTI_ARG_1_DI V2DI_FTYPE_V2DI +-#define MULTI_ARG_1_SI V4SI_FTYPE_V4SI +-#define MULTI_ARG_1_HI V8HI_FTYPE_V8HI +-#define MULTI_ARG_1_QI V16QI_FTYPE_V16QI +-#define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI +-#define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI +-#define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI +-#define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI +-#define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI +-#define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI +- +-#define BDESC(mask, mask2, icode, name, code, comparison, flag) \ +- { mask, mask2, icode, name, code, comparison, flag }, +-#define BDESC_FIRST(kind, kindu, mask, mask2, icode, name, code, comparison, flag) \ +-static const struct builtin_description bdesc_##kind[] = \ +-{ \ +- BDESC (mask, mask2, icode, name, code, comparison, flag) +-#define BDESC_END(kind, next_kind) \ +-}; +- +-#include "i386-builtin.def" +- +-#undef BDESC +-#undef BDESC_FIRST +-#undef BDESC_END +- +- +-/* TM vector builtins. */ +- +-/* Reuse the existing x86-specific `struct builtin_description' cause +- we're lazy. Add casts to make them fit. 
*/ +-static const struct builtin_description bdesc_tm[] = +-{ +- { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI }, +- { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI }, +- { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI }, +- { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI }, +- { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI }, +- { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI }, +- { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI }, +- +- { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF }, +- { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF }, +- { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF }, +- { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF }, +- { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF }, +- { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF }, +- { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF }, +- +- { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF }, +- { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF }, +- { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF }, +- { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF }, +- { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF }, +- { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF }, +- { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF }, +- +- { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID }, +- { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID }, +- { OPTION_MASK_ISA_AVX, 0, 
CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID }, +-}; +- +-/* Initialize the transactional memory vector load/store builtins. */ +- +-static void +-ix86_init_tm_builtins (void) +-{ +- enum ix86_builtin_func_type ftype; +- const struct builtin_description *d; +- size_t i; +- tree decl; +- tree attrs_load, attrs_type_load, attrs_store, attrs_type_store; +- tree attrs_log, attrs_type_log; +- +- if (!flag_tm) +- return; +- +- /* If there are no builtins defined, we must be compiling in a +- language without trans-mem support. */ +- if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1)) +- return; +- +- /* Use whatever attributes a normal TM load has. */ +- decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1); +- attrs_load = DECL_ATTRIBUTES (decl); +- attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl)); +- /* Use whatever attributes a normal TM store has. */ +- decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1); +- attrs_store = DECL_ATTRIBUTES (decl); +- attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl)); +- /* Use whatever attributes a normal TM log has. */ +- decl = builtin_decl_explicit (BUILT_IN_TM_LOG); +- attrs_log = DECL_ATTRIBUTES (decl); +- attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl)); +- +- for (i = 0, d = bdesc_tm; +- i < ARRAY_SIZE (bdesc_tm); +- i++, d++) +- { +- if ((d->mask & ix86_isa_flags) != 0 +- || (lang_hooks.builtin_function +- == lang_hooks.builtin_function_ext_scope)) +- { +- tree type, attrs, attrs_type; +- enum built_in_function code = (enum built_in_function) d->code; +- +- ftype = (enum ix86_builtin_func_type) d->flag; +- type = ix86_get_builtin_func_type (ftype); +- +- if (BUILTIN_TM_LOAD_P (code)) +- { +- attrs = attrs_load; +- attrs_type = attrs_type_load; +- } +- else if (BUILTIN_TM_STORE_P (code)) +- { +- attrs = attrs_store; +- attrs_type = attrs_type_store; +- } +- else +- { +- attrs = attrs_log; +- attrs_type = attrs_type_log; +- } +- decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL, +- /* The builtin without the prefix for +- calling it directly. */ +- d->name + strlen ("__builtin_"), +- attrs); +- /* add_builtin_function() will set the DECL_ATTRIBUTES, now +- set the TYPE_ATTRIBUTES. */ +- decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN); +- +- set_builtin_decl (code, decl, false); +- } +- } +-} +- +-/* Macros for verification of enum ix86_builtins order. 
*/ +-#define BDESC_VERIFY(x, y, z) \ +- gcc_checking_assert ((x) == (enum ix86_builtins) ((y) + (z))) +-#define BDESC_VERIFYS(x, y, z) \ +- STATIC_ASSERT ((x) == (enum ix86_builtins) ((y) + (z))) +- +-BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_FIRST, +- IX86_BUILTIN__BDESC_COMI_LAST, 1); +-BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_FIRST, +- IX86_BUILTIN__BDESC_PCMPESTR_LAST, 1); +-BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, +- IX86_BUILTIN__BDESC_PCMPISTR_LAST, 1); +-BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_FIRST, +- IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST, 1); +-BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, +- IX86_BUILTIN__BDESC_ARGS_LAST, 1); +-BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, +- IX86_BUILTIN__BDESC_ROUND_ARGS_LAST, 1); +-BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_FIRST, +- IX86_BUILTIN__BDESC_MULTI_ARG_LAST, 1); +-BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_FIRST, +- IX86_BUILTIN__BDESC_CET_LAST, 1); +-BDESC_VERIFYS (IX86_BUILTIN_MAX, +- IX86_BUILTIN__BDESC_CET_NORMAL_LAST, 1); +- +-/* Set up all the MMX/SSE builtins, even builtins for instructions that are not +- in the current target ISA to allow the user to compile particular modules +- with different target specific options that differ from the command line +- options. */ +-static void +-ix86_init_mmx_sse_builtins (void) +-{ +- const struct builtin_description * d; +- enum ix86_builtin_func_type ftype; +- size_t i; +- +- /* Add all special builtins with variable number of operands. */ +- for (i = 0, d = bdesc_special_args; +- i < ARRAY_SIZE (bdesc_special_args); +- i++, d++) +- { +- BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, i); +- if (d->name == 0) +- continue; +- +- ftype = (enum ix86_builtin_func_type) d->flag; +- def_builtin (d->mask, d->mask2, d->name, ftype, d->code); +- } +- BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST, +- IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, +- ARRAY_SIZE (bdesc_special_args) - 1); +- +- /* Add all builtins with variable number of operands. */ +- for (i = 0, d = bdesc_args; +- i < ARRAY_SIZE (bdesc_args); +- i++, d++) +- { +- BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS_FIRST, i); +- if (d->name == 0) +- continue; +- +- ftype = (enum ix86_builtin_func_type) d->flag; +- def_builtin_const (d->mask, d->mask2, d->name, ftype, d->code); +- } +- BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_LAST, +- IX86_BUILTIN__BDESC_ARGS_FIRST, +- ARRAY_SIZE (bdesc_args) - 1); +- +- /* Add all builtins with rounding. */ +- for (i = 0, d = bdesc_round_args; +- i < ARRAY_SIZE (bdesc_round_args); +- i++, d++) +- { +- BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, i); +- if (d->name == 0) +- continue; +- +- ftype = (enum ix86_builtin_func_type) d->flag; +- def_builtin_const (d->mask, d->mask2, d->name, ftype, d->code); +- } +- BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_LAST, +- IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, +- ARRAY_SIZE (bdesc_round_args) - 1); +- +- /* pcmpestr[im] insns. */ +- for (i = 0, d = bdesc_pcmpestr; +- i < ARRAY_SIZE (bdesc_pcmpestr); +- i++, d++) +- { +- BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPESTR_FIRST, i); +- if (d->code == IX86_BUILTIN_PCMPESTRM128) +- ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT; +- else +- ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT; +- def_builtin_const (d->mask, d->mask2, d->name, ftype, d->code); +- } +- BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_LAST, +- IX86_BUILTIN__BDESC_PCMPESTR_FIRST, +- ARRAY_SIZE (bdesc_pcmpestr) - 1); +- +- /* pcmpistr[im] insns. 
*/ +- for (i = 0, d = bdesc_pcmpistr; +- i < ARRAY_SIZE (bdesc_pcmpistr); +- i++, d++) +- { +- BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPISTR_FIRST, i); +- if (d->code == IX86_BUILTIN_PCMPISTRM128) +- ftype = V16QI_FTYPE_V16QI_V16QI_INT; +- else +- ftype = INT_FTYPE_V16QI_V16QI_INT; +- def_builtin_const (d->mask, d->mask2, d->name, ftype, d->code); +- } +- BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_LAST, +- IX86_BUILTIN__BDESC_PCMPISTR_FIRST, +- ARRAY_SIZE (bdesc_pcmpistr) - 1); +- +- /* comi/ucomi insns. */ +- for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++) +- { +- BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_COMI_FIRST, i); +- if (d->mask == OPTION_MASK_ISA_SSE2) +- ftype = INT_FTYPE_V2DF_V2DF; +- else +- ftype = INT_FTYPE_V4SF_V4SF; +- def_builtin_const (d->mask, d->mask2, d->name, ftype, d->code); +- } +- BDESC_VERIFYS (IX86_BUILTIN__BDESC_COMI_LAST, +- IX86_BUILTIN__BDESC_COMI_FIRST, +- ARRAY_SIZE (bdesc_comi) - 1); +- +- /* SSE */ +- def_builtin (OPTION_MASK_ISA_SSE, 0, "__builtin_ia32_ldmxcsr", +- VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR); +- def_builtin_pure (OPTION_MASK_ISA_SSE, 0, "__builtin_ia32_stmxcsr", +- UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR); +- +- /* SSE or 3DNow!A */ +- def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A +- /* As it uses V4HImode, we have to require -mmmx too. */ +- | OPTION_MASK_ISA_MMX, 0, +- "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR, +- IX86_BUILTIN_MASKMOVQ); +- +- /* SSE2 */ +- def_builtin (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_maskmovdqu", +- VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU); +- +- def_builtin (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_clflush", +- VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH); +- x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_mfence", +- VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE); +- +- /* SSE3. 
*/ +- def_builtin (OPTION_MASK_ISA_SSE3, 0, "__builtin_ia32_monitor", +- VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR); +- def_builtin (OPTION_MASK_ISA_SSE3, 0, "__builtin_ia32_mwait", +- VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT); +- +- /* AES */ +- def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0, +- "__builtin_ia32_aesenc128", +- V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128); +- def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0, +- "__builtin_ia32_aesenclast128", +- V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128); +- def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0, +- "__builtin_ia32_aesdec128", +- V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128); +- def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0, +- "__builtin_ia32_aesdeclast128", +- V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128); +- def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0, +- "__builtin_ia32_aesimc128", +- V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128); +- def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0, +- "__builtin_ia32_aeskeygenassist128", +- V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128); +- +- /* PCLMUL */ +- def_builtin_const (OPTION_MASK_ISA_PCLMUL | OPTION_MASK_ISA_SSE2, 0, +- "__builtin_ia32_pclmulqdq128", +- V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128); +- +- /* RDRND */ +- def_builtin (OPTION_MASK_ISA_RDRND, 0, "__builtin_ia32_rdrand16_step", +- INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP); +- def_builtin (OPTION_MASK_ISA_RDRND, 0, "__builtin_ia32_rdrand32_step", +- INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP); +- def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT, 0, +- "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG, +- IX86_BUILTIN_RDRAND64_STEP); +- +- /* AVX2 */ +- def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv2df", +- V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT, +- IX86_BUILTIN_GATHERSIV2DF); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv4df", +- V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT, +- IX86_BUILTIN_GATHERSIV4DF); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv2df", +- V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT, +- IX86_BUILTIN_GATHERDIV2DF); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv4df", +- V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT, +- IX86_BUILTIN_GATHERDIV4DF); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv4sf", +- V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT, +- IX86_BUILTIN_GATHERSIV4SF); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv8sf", +- V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT, +- IX86_BUILTIN_GATHERSIV8SF); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv4sf", +- V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT, +- IX86_BUILTIN_GATHERDIV4SF); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv4sf256", +- V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT, +- IX86_BUILTIN_GATHERDIV8SF); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv2di", +- V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT, +- IX86_BUILTIN_GATHERSIV2DI); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv4di", +- V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT, +- IX86_BUILTIN_GATHERSIV4DI); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv2di", +- V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT, +- IX86_BUILTIN_GATHERDIV2DI); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX2, 
0, "__builtin_ia32_gatherdiv4di", +- V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT, +- IX86_BUILTIN_GATHERDIV4DI); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv4si", +- V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT, +- IX86_BUILTIN_GATHERSIV4SI); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv8si", +- V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT, +- IX86_BUILTIN_GATHERSIV8SI); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv4si", +- V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT, +- IX86_BUILTIN_GATHERDIV4SI); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv4si256", +- V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT, +- IX86_BUILTIN_GATHERDIV8SI); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatheraltsiv4df ", +- V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT, +- IX86_BUILTIN_GATHERALTSIV4DF); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatheraltdiv8sf ", +- V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT, +- IX86_BUILTIN_GATHERALTDIV8SF); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatheraltsiv4di ", +- V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT, +- IX86_BUILTIN_GATHERALTSIV4DI); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatheraltdiv8si ", +- V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT, +- IX86_BUILTIN_GATHERALTDIV8SI); +- +- /* AVX512F */ +- def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gathersiv16sf", +- V16SF_FTYPE_V16SF_PCVOID_V16SI_HI_INT, +- IX86_BUILTIN_GATHER3SIV16SF); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gathersiv8df", +- V8DF_FTYPE_V8DF_PCVOID_V8SI_QI_INT, +- IX86_BUILTIN_GATHER3SIV8DF); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gatherdiv16sf", +- V8SF_FTYPE_V8SF_PCVOID_V8DI_QI_INT, +- IX86_BUILTIN_GATHER3DIV16SF); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gatherdiv8df", +- V8DF_FTYPE_V8DF_PCVOID_V8DI_QI_INT, +- IX86_BUILTIN_GATHER3DIV8DF); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gathersiv16si", +- V16SI_FTYPE_V16SI_PCVOID_V16SI_HI_INT, +- IX86_BUILTIN_GATHER3SIV16SI); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gathersiv8di", +- V8DI_FTYPE_V8DI_PCVOID_V8SI_QI_INT, +- IX86_BUILTIN_GATHER3SIV8DI); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gatherdiv16si", +- V8SI_FTYPE_V8SI_PCVOID_V8DI_QI_INT, +- IX86_BUILTIN_GATHER3DIV16SI); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gatherdiv8di", +- V8DI_FTYPE_V8DI_PCVOID_V8DI_QI_INT, +- IX86_BUILTIN_GATHER3DIV8DI); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gather3altsiv8df ", +- V8DF_FTYPE_V8DF_PCDOUBLE_V16SI_QI_INT, +- IX86_BUILTIN_GATHER3ALTSIV8DF); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gather3altdiv16sf ", +- V16SF_FTYPE_V16SF_PCFLOAT_V8DI_HI_INT, +- IX86_BUILTIN_GATHER3ALTDIV16SF); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gather3altsiv8di ", +- V8DI_FTYPE_V8DI_PCINT64_V16SI_QI_INT, +- IX86_BUILTIN_GATHER3ALTSIV8DI); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gather3altdiv16si ", +- V16SI_FTYPE_V16SI_PCINT_V8DI_HI_INT, +- IX86_BUILTIN_GATHER3ALTDIV16SI); +- +- def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scattersiv16sf", +- VOID_FTYPE_PVOID_HI_V16SI_V16SF_INT, +- IX86_BUILTIN_SCATTERSIV16SF); +- +- def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scattersiv8df", +- VOID_FTYPE_PVOID_QI_V8SI_V8DF_INT, +- 
IX86_BUILTIN_SCATTERSIV8DF); +- +- def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatterdiv16sf", +- VOID_FTYPE_PVOID_QI_V8DI_V8SF_INT, +- IX86_BUILTIN_SCATTERDIV16SF); +- +- def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatterdiv8df", +- VOID_FTYPE_PVOID_QI_V8DI_V8DF_INT, +- IX86_BUILTIN_SCATTERDIV8DF); +- +- def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scattersiv16si", +- VOID_FTYPE_PVOID_HI_V16SI_V16SI_INT, +- IX86_BUILTIN_SCATTERSIV16SI); +- +- def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scattersiv8di", +- VOID_FTYPE_PVOID_QI_V8SI_V8DI_INT, +- IX86_BUILTIN_SCATTERSIV8DI); +- +- def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatterdiv16si", +- VOID_FTYPE_PVOID_QI_V8DI_V8SI_INT, +- IX86_BUILTIN_SCATTERDIV16SI); +- +- def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatterdiv8di", +- VOID_FTYPE_PVOID_QI_V8DI_V8DI_INT, +- IX86_BUILTIN_SCATTERDIV8DI); +- +- /* AVX512VL */ +- def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv2df", +- V2DF_FTYPE_V2DF_PCVOID_V4SI_QI_INT, +- IX86_BUILTIN_GATHER3SIV2DF); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv4df", +- V4DF_FTYPE_V4DF_PCVOID_V4SI_QI_INT, +- IX86_BUILTIN_GATHER3SIV4DF); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div2df", +- V2DF_FTYPE_V2DF_PCVOID_V2DI_QI_INT, +- IX86_BUILTIN_GATHER3DIV2DF); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div4df", +- V4DF_FTYPE_V4DF_PCVOID_V4DI_QI_INT, +- IX86_BUILTIN_GATHER3DIV4DF); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv4sf", +- V4SF_FTYPE_V4SF_PCVOID_V4SI_QI_INT, +- IX86_BUILTIN_GATHER3SIV4SF); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv8sf", +- V8SF_FTYPE_V8SF_PCVOID_V8SI_QI_INT, +- IX86_BUILTIN_GATHER3SIV8SF); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div4sf", +- V4SF_FTYPE_V4SF_PCVOID_V2DI_QI_INT, +- IX86_BUILTIN_GATHER3DIV4SF); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div8sf", +- V4SF_FTYPE_V4SF_PCVOID_V4DI_QI_INT, +- IX86_BUILTIN_GATHER3DIV8SF); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv2di", +- V2DI_FTYPE_V2DI_PCVOID_V4SI_QI_INT, +- IX86_BUILTIN_GATHER3SIV2DI); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv4di", +- V4DI_FTYPE_V4DI_PCVOID_V4SI_QI_INT, +- IX86_BUILTIN_GATHER3SIV4DI); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div2di", +- V2DI_FTYPE_V2DI_PCVOID_V2DI_QI_INT, +- IX86_BUILTIN_GATHER3DIV2DI); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div4di", +- V4DI_FTYPE_V4DI_PCVOID_V4DI_QI_INT, +- IX86_BUILTIN_GATHER3DIV4DI); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv4si", +- V4SI_FTYPE_V4SI_PCVOID_V4SI_QI_INT, +- IX86_BUILTIN_GATHER3SIV4SI); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv8si", +- V8SI_FTYPE_V8SI_PCVOID_V8SI_QI_INT, +- IX86_BUILTIN_GATHER3SIV8SI); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div4si", +- V4SI_FTYPE_V4SI_PCVOID_V2DI_QI_INT, +- IX86_BUILTIN_GATHER3DIV4SI); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div8si", +- V4SI_FTYPE_V4SI_PCVOID_V4DI_QI_INT, +- IX86_BUILTIN_GATHER3DIV8SI); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3altsiv4df ", +- 
V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_QI_INT, +- IX86_BUILTIN_GATHER3ALTSIV4DF); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3altdiv8sf ", +- V8SF_FTYPE_V8SF_PCFLOAT_V4DI_QI_INT, +- IX86_BUILTIN_GATHER3ALTDIV8SF); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3altsiv4di ", +- V4DI_FTYPE_V4DI_PCINT64_V8SI_QI_INT, +- IX86_BUILTIN_GATHER3ALTSIV4DI); +- +- def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3altdiv8si ", +- V8SI_FTYPE_V8SI_PCINT_V4DI_QI_INT, +- IX86_BUILTIN_GATHER3ALTDIV8SI); +- +- def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv8sf", +- VOID_FTYPE_PVOID_QI_V8SI_V8SF_INT, +- IX86_BUILTIN_SCATTERSIV8SF); +- +- def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv4sf", +- VOID_FTYPE_PVOID_QI_V4SI_V4SF_INT, +- IX86_BUILTIN_SCATTERSIV4SF); +- +- def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv4df", +- VOID_FTYPE_PVOID_QI_V4SI_V4DF_INT, +- IX86_BUILTIN_SCATTERSIV4DF); +- +- def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv2df", +- VOID_FTYPE_PVOID_QI_V4SI_V2DF_INT, +- IX86_BUILTIN_SCATTERSIV2DF); +- +- def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv8sf", +- VOID_FTYPE_PVOID_QI_V4DI_V4SF_INT, +- IX86_BUILTIN_SCATTERDIV8SF); +- +- def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv4sf", +- VOID_FTYPE_PVOID_QI_V2DI_V4SF_INT, +- IX86_BUILTIN_SCATTERDIV4SF); +- +- def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv4df", +- VOID_FTYPE_PVOID_QI_V4DI_V4DF_INT, +- IX86_BUILTIN_SCATTERDIV4DF); +- +- def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv2df", +- VOID_FTYPE_PVOID_QI_V2DI_V2DF_INT, +- IX86_BUILTIN_SCATTERDIV2DF); +- +- def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv8si", +- VOID_FTYPE_PVOID_QI_V8SI_V8SI_INT, +- IX86_BUILTIN_SCATTERSIV8SI); +- +- def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv4si", +- VOID_FTYPE_PVOID_QI_V4SI_V4SI_INT, +- IX86_BUILTIN_SCATTERSIV4SI); +- +- def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv4di", +- VOID_FTYPE_PVOID_QI_V4SI_V4DI_INT, +- IX86_BUILTIN_SCATTERSIV4DI); +- +- def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv2di", +- VOID_FTYPE_PVOID_QI_V4SI_V2DI_INT, +- IX86_BUILTIN_SCATTERSIV2DI); +- +- def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv8si", +- VOID_FTYPE_PVOID_QI_V4DI_V4SI_INT, +- IX86_BUILTIN_SCATTERDIV8SI); +- +- def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv4si", +- VOID_FTYPE_PVOID_QI_V2DI_V4SI_INT, +- IX86_BUILTIN_SCATTERDIV4SI); +- +- def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv4di", +- VOID_FTYPE_PVOID_QI_V4DI_V4DI_INT, +- IX86_BUILTIN_SCATTERDIV4DI); +- +- def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv2di", +- VOID_FTYPE_PVOID_QI_V2DI_V2DI_INT, +- IX86_BUILTIN_SCATTERDIV2DI); +- +- def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatteraltsiv8df ", +- VOID_FTYPE_PDOUBLE_QI_V16SI_V8DF_INT, +- IX86_BUILTIN_SCATTERALTSIV8DF); +- +- def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatteraltdiv16sf ", +- VOID_FTYPE_PFLOAT_HI_V8DI_V16SF_INT, +- IX86_BUILTIN_SCATTERALTDIV16SF); +- +- def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatteraltsiv8di ", +- VOID_FTYPE_PLONGLONG_QI_V16SI_V8DI_INT, +- IX86_BUILTIN_SCATTERALTSIV8DI); +- +- def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatteraltdiv16si ", +- 
VOID_FTYPE_PINT_HI_V8DI_V16SI_INT, +- IX86_BUILTIN_SCATTERALTDIV16SI); +- +- def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltsiv4df ", +- VOID_FTYPE_PDOUBLE_QI_V8SI_V4DF_INT, +- IX86_BUILTIN_SCATTERALTSIV4DF); +- +- def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltdiv8sf ", +- VOID_FTYPE_PFLOAT_QI_V4DI_V8SF_INT, +- IX86_BUILTIN_SCATTERALTDIV8SF); +- +- def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltsiv4di ", +- VOID_FTYPE_PLONGLONG_QI_V8SI_V4DI_INT, +- IX86_BUILTIN_SCATTERALTSIV4DI); +- +- def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltdiv8si ", +- VOID_FTYPE_PINT_QI_V4DI_V8SI_INT, +- IX86_BUILTIN_SCATTERALTDIV8SI); +- +- def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltsiv2df ", +- VOID_FTYPE_PDOUBLE_QI_V4SI_V2DF_INT, +- IX86_BUILTIN_SCATTERALTSIV2DF); +- +- def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltdiv4sf ", +- VOID_FTYPE_PFLOAT_QI_V2DI_V4SF_INT, +- IX86_BUILTIN_SCATTERALTDIV4SF); +- +- def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltsiv2di ", +- VOID_FTYPE_PLONGLONG_QI_V4SI_V2DI_INT, +- IX86_BUILTIN_SCATTERALTSIV2DI); +- +- def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltdiv4si ", +- VOID_FTYPE_PINT_QI_V2DI_V4SI_INT, +- IX86_BUILTIN_SCATTERALTDIV4SI); +- +- /* AVX512PF */ +- def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_gatherpfdpd", +- VOID_FTYPE_QI_V8SI_PCVOID_INT_INT, +- IX86_BUILTIN_GATHERPFDPD); +- def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_gatherpfdps", +- VOID_FTYPE_HI_V16SI_PCVOID_INT_INT, +- IX86_BUILTIN_GATHERPFDPS); +- def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_gatherpfqpd", +- VOID_FTYPE_QI_V8DI_PCVOID_INT_INT, +- IX86_BUILTIN_GATHERPFQPD); +- def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_gatherpfqps", +- VOID_FTYPE_QI_V8DI_PCVOID_INT_INT, +- IX86_BUILTIN_GATHERPFQPS); +- def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_scatterpfdpd", +- VOID_FTYPE_QI_V8SI_PCVOID_INT_INT, +- IX86_BUILTIN_SCATTERPFDPD); +- def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_scatterpfdps", +- VOID_FTYPE_HI_V16SI_PCVOID_INT_INT, +- IX86_BUILTIN_SCATTERPFDPS); +- def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_scatterpfqpd", +- VOID_FTYPE_QI_V8DI_PCVOID_INT_INT, +- IX86_BUILTIN_SCATTERPFQPD); +- def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_scatterpfqps", +- VOID_FTYPE_QI_V8DI_PCVOID_INT_INT, +- IX86_BUILTIN_SCATTERPFQPS); +- +- /* SHA */ +- def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha1msg1", +- V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG1); +- def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha1msg2", +- V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG2); +- def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha1nexte", +- V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1NEXTE); +- def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha1rnds4", +- V4SI_FTYPE_V4SI_V4SI_INT, IX86_BUILTIN_SHA1RNDS4); +- def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha256msg1", +- V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG1); +- def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha256msg2", +- V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG2); +- def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha256rnds2", +- V4SI_FTYPE_V4SI_V4SI_V4SI, IX86_BUILTIN_SHA256RNDS2); +- +- /* RTM. 
*/ +- def_builtin (OPTION_MASK_ISA_RTM, 0, "__builtin_ia32_xabort", +- VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT); +- +- /* MMX access to the vec_init patterns. */ +- def_builtin_const (OPTION_MASK_ISA_MMX, 0, "__builtin_ia32_vec_init_v2si", +- V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI); +- +- def_builtin_const (OPTION_MASK_ISA_MMX, 0, "__builtin_ia32_vec_init_v4hi", +- V4HI_FTYPE_HI_HI_HI_HI, +- IX86_BUILTIN_VEC_INIT_V4HI); +- +- def_builtin_const (OPTION_MASK_ISA_MMX, 0, "__builtin_ia32_vec_init_v8qi", +- V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI, +- IX86_BUILTIN_VEC_INIT_V8QI); +- +- /* Access to the vec_extract patterns. */ +- def_builtin_const (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_vec_ext_v2df", +- DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF); +- def_builtin_const (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_vec_ext_v2di", +- DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI); +- def_builtin_const (OPTION_MASK_ISA_SSE, 0, "__builtin_ia32_vec_ext_v4sf", +- FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF); +- def_builtin_const (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_vec_ext_v4si", +- SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI); +- def_builtin_const (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_vec_ext_v8hi", +- HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI); +- +- def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A +- /* As it uses V4HImode, we have to require -mmmx too. */ +- | OPTION_MASK_ISA_MMX, 0, +- "__builtin_ia32_vec_ext_v4hi", +- HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI); +- +- def_builtin_const (OPTION_MASK_ISA_MMX, 0, "__builtin_ia32_vec_ext_v2si", +- SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI); +- +- def_builtin_const (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_vec_ext_v16qi", +- QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI); +- +- /* Access to the vec_set patterns. */ +- def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT, 0, +- "__builtin_ia32_vec_set_v2di", +- V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI); +- +- def_builtin_const (OPTION_MASK_ISA_SSE4_1, 0, "__builtin_ia32_vec_set_v4sf", +- V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF); +- +- def_builtin_const (OPTION_MASK_ISA_SSE4_1, 0, "__builtin_ia32_vec_set_v4si", +- V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI); +- +- def_builtin_const (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_vec_set_v8hi", +- V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI); +- +- def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A +- /* As it uses V4HImode, we have to require -mmmx too. 
*/ +- | OPTION_MASK_ISA_MMX, 0, +- "__builtin_ia32_vec_set_v4hi", +- V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI); +- +- def_builtin_const (OPTION_MASK_ISA_SSE4_1, 0, "__builtin_ia32_vec_set_v16qi", +- V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI); +- +- /* RDSEED */ +- def_builtin (OPTION_MASK_ISA_RDSEED, 0, "__builtin_ia32_rdseed_hi_step", +- INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP); +- def_builtin (OPTION_MASK_ISA_RDSEED, 0, "__builtin_ia32_rdseed_si_step", +- INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP); +- def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT, 0, +- "__builtin_ia32_rdseed_di_step", +- INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP); +- +- /* ADCX */ +- def_builtin (0, 0, "__builtin_ia32_addcarryx_u32", +- UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32); +- def_builtin (OPTION_MASK_ISA_64BIT, 0, +- "__builtin_ia32_addcarryx_u64", +- UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG, +- IX86_BUILTIN_ADDCARRYX64); +- +- /* SBB */ +- def_builtin (0, 0, "__builtin_ia32_sbb_u32", +- UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_SBB32); +- def_builtin (OPTION_MASK_ISA_64BIT, 0, +- "__builtin_ia32_sbb_u64", +- UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG, +- IX86_BUILTIN_SBB64); +- +- /* Read/write FLAGS. */ +- if (TARGET_64BIT) +- { +- def_builtin (OPTION_MASK_ISA_64BIT, 0, "__builtin_ia32_readeflags_u64", +- UINT64_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS); +- def_builtin (OPTION_MASK_ISA_64BIT, 0, "__builtin_ia32_writeeflags_u64", +- VOID_FTYPE_UINT64, IX86_BUILTIN_WRITE_FLAGS); +- } +- else +- { +- def_builtin (0, 0, "__builtin_ia32_readeflags_u32", +- UNSIGNED_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS); +- def_builtin (0, 0, "__builtin_ia32_writeeflags_u32", +- VOID_FTYPE_UNSIGNED, IX86_BUILTIN_WRITE_FLAGS); +- } +- +- /* CLFLUSHOPT. */ +- def_builtin (OPTION_MASK_ISA_CLFLUSHOPT, 0, "__builtin_ia32_clflushopt", +- VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSHOPT); +- +- /* CLWB. */ +- def_builtin (OPTION_MASK_ISA_CLWB, 0, "__builtin_ia32_clwb", +- VOID_FTYPE_PCVOID, IX86_BUILTIN_CLWB); +- +- /* MONITORX and MWAITX. */ +- def_builtin (0, OPTION_MASK_ISA_MWAITX, "__builtin_ia32_monitorx", +- VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITORX); +- def_builtin (0, OPTION_MASK_ISA_MWAITX, "__builtin_ia32_mwaitx", +- VOID_FTYPE_UNSIGNED_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAITX); +- +- /* CLZERO. */ +- def_builtin (0, OPTION_MASK_ISA_CLZERO, "__builtin_ia32_clzero", +- VOID_FTYPE_PCVOID, IX86_BUILTIN_CLZERO); +- +- /* WAITPKG. */ +- def_builtin (0, OPTION_MASK_ISA_WAITPKG, "__builtin_ia32_umonitor", +- VOID_FTYPE_PVOID, IX86_BUILTIN_UMONITOR); +- def_builtin (0, OPTION_MASK_ISA_WAITPKG, "__builtin_ia32_umwait", +- UINT8_FTYPE_UNSIGNED_UINT64, IX86_BUILTIN_UMWAIT); +- def_builtin (0, OPTION_MASK_ISA_WAITPKG, "__builtin_ia32_tpause", +- UINT8_FTYPE_UNSIGNED_UINT64, IX86_BUILTIN_TPAUSE); +- +- /* CLDEMOTE. 
*/ +- def_builtin (0, OPTION_MASK_ISA_CLDEMOTE, "__builtin_ia32_cldemote", +- VOID_FTYPE_PCVOID, IX86_BUILTIN_CLDEMOTE); +- +- /* Add FMA4 multi-arg argument instructions */ +- for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++) +- { +- BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, i); +- if (d->name == 0) +- continue; +- +- ftype = (enum ix86_builtin_func_type) d->flag; +- def_builtin_const (d->mask, d->mask2, d->name, ftype, d->code); +- } +- BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_LAST, +- IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, +- ARRAY_SIZE (bdesc_multi_arg) - 1); +- +- /* Add CET inrinsics. */ +- for (i = 0, d = bdesc_cet; i < ARRAY_SIZE (bdesc_cet); i++, d++) +- { +- BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_FIRST, i); +- if (d->name == 0) +- continue; +- +- ftype = (enum ix86_builtin_func_type) d->flag; +- def_builtin (d->mask, d->mask2, d->name, ftype, d->code); +- } +- BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_LAST, +- IX86_BUILTIN__BDESC_CET_FIRST, +- ARRAY_SIZE (bdesc_cet) - 1); +- +- for (i = 0, d = bdesc_cet_rdssp; +- i < ARRAY_SIZE (bdesc_cet_rdssp); +- i++, d++) +- { +- BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_NORMAL_FIRST, i); +- if (d->name == 0) +- continue; +- +- ftype = (enum ix86_builtin_func_type) d->flag; +- def_builtin (d->mask, d->mask2, d->name, ftype, d->code); +- } +- BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_LAST, +- IX86_BUILTIN__BDESC_CET_NORMAL_FIRST, +- ARRAY_SIZE (bdesc_cet_rdssp) - 1); +-} +- +-#undef BDESC_VERIFY +-#undef BDESC_VERIFYS +- +-/* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL +- to return a pointer to VERSION_DECL if the outcome of the expression +- formed by PREDICATE_CHAIN is true. This function will be called during +- version dispatch to decide which function version to execute. It returns +- the basic block at the end, to which more conditions can be added. 
*/ +- +-static basic_block +-add_condition_to_bb (tree function_decl, tree version_decl, +- tree predicate_chain, basic_block new_bb) +-{ +- gimple *return_stmt; +- tree convert_expr, result_var; +- gimple *convert_stmt; +- gimple *call_cond_stmt; +- gimple *if_else_stmt; +- +- basic_block bb1, bb2, bb3; +- edge e12, e23; +- +- tree cond_var, and_expr_var = NULL_TREE; +- gimple_seq gseq; +- +- tree predicate_decl, predicate_arg; +- +- push_cfun (DECL_STRUCT_FUNCTION (function_decl)); +- +- gcc_assert (new_bb != NULL); +- gseq = bb_seq (new_bb); +- +- +- convert_expr = build1 (CONVERT_EXPR, ptr_type_node, +- build_fold_addr_expr (version_decl)); +- result_var = create_tmp_var (ptr_type_node); +- convert_stmt = gimple_build_assign (result_var, convert_expr); +- return_stmt = gimple_build_return (result_var); +- +- if (predicate_chain == NULL_TREE) +- { +- gimple_seq_add_stmt (&gseq, convert_stmt); +- gimple_seq_add_stmt (&gseq, return_stmt); +- set_bb_seq (new_bb, gseq); +- gimple_set_bb (convert_stmt, new_bb); +- gimple_set_bb (return_stmt, new_bb); +- pop_cfun (); +- return new_bb; +- } +- +- while (predicate_chain != NULL) +- { +- cond_var = create_tmp_var (integer_type_node); +- predicate_decl = TREE_PURPOSE (predicate_chain); +- predicate_arg = TREE_VALUE (predicate_chain); +- call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg); +- gimple_call_set_lhs (call_cond_stmt, cond_var); +- +- gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl)); +- gimple_set_bb (call_cond_stmt, new_bb); +- gimple_seq_add_stmt (&gseq, call_cond_stmt); +- +- predicate_chain = TREE_CHAIN (predicate_chain); +- +- if (and_expr_var == NULL) +- and_expr_var = cond_var; +- else +- { +- gimple *assign_stmt; +- /* Use MIN_EXPR to check if any integer is zero?. +- and_expr_var = min_expr <cond_var, and_expr_var> */ +- assign_stmt = gimple_build_assign (and_expr_var, +- build2 (MIN_EXPR, integer_type_node, +- cond_var, and_expr_var)); +- +- gimple_set_block (assign_stmt, DECL_INITIAL (function_decl)); +- gimple_set_bb (assign_stmt, new_bb); +- gimple_seq_add_stmt (&gseq, assign_stmt); +- } +- } +- +- if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var, +- integer_zero_node, +- NULL_TREE, NULL_TREE); +- gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl)); +- gimple_set_bb (if_else_stmt, new_bb); +- gimple_seq_add_stmt (&gseq, if_else_stmt); +- +- gimple_seq_add_stmt (&gseq, convert_stmt); +- gimple_seq_add_stmt (&gseq, return_stmt); +- set_bb_seq (new_bb, gseq); +- +- bb1 = new_bb; +- e12 = split_block (bb1, if_else_stmt); +- bb2 = e12->dest; +- e12->flags &= ~EDGE_FALLTHRU; +- e12->flags |= EDGE_TRUE_VALUE; +- +- e23 = split_block (bb2, return_stmt); +- +- gimple_set_bb (convert_stmt, bb2); +- gimple_set_bb (return_stmt, bb2); +- +- bb3 = e23->dest; +- make_edge (bb1, bb3, EDGE_FALSE_VALUE); +- +- remove_edge (e23); +- make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0); +- +- pop_cfun (); +- +- return bb3; +-} +- +-/* Priority of i386 features, greater value is higher priority. This is +- used to decide the order in which function dispatch must happen. For +- instance, a version specialized for SSE4.2 should be checked for dispatch +- before a version for SSE3, as SSE4.2 implies SSE3.
*/ +-enum feature_priority +-{ +- P_ZERO = 0, +- P_MMX, +- P_SSE, +- P_SSE2, +- P_SSE3, +- P_SSSE3, +- P_PROC_SSSE3, +- P_SSE4_A, +- P_PROC_SSE4_A, +- P_SSE4_1, +- P_SSE4_2, +- P_PROC_SSE4_2, +- P_POPCNT, +- P_AES, +- P_PCLMUL, +- P_AVX, +- P_PROC_AVX, +- P_BMI, +- P_PROC_BMI, +- P_FMA4, +- P_XOP, +- P_PROC_XOP, +- P_FMA, +- P_PROC_FMA, +- P_BMI2, +- P_AVX2, +- P_PROC_AVX2, +- P_AVX512F, +- P_PROC_AVX512F +-}; +- +-/* This is the order of bit-fields in __processor_features in cpuinfo.c */ +-enum processor_features +-{ +- F_CMOV = 0, +- F_MMX, +- F_POPCNT, +- F_SSE, +- F_SSE2, +- F_SSE3, +- F_SSSE3, +- F_SSE4_1, +- F_SSE4_2, +- F_AVX, +- F_AVX2, +- F_SSE4_A, +- F_FMA4, +- F_XOP, +- F_FMA, +- F_AVX512F, +- F_BMI, +- F_BMI2, +- F_AES, +- F_PCLMUL, +- F_AVX512VL, +- F_AVX512BW, +- F_AVX512DQ, +- F_AVX512CD, +- F_AVX512ER, +- F_AVX512PF, +- F_AVX512VBMI, +- F_AVX512IFMA, +- F_AVX5124VNNIW, +- F_AVX5124FMAPS, +- F_AVX512VPOPCNTDQ, +- F_AVX512VBMI2, +- F_GFNI, +- F_VPCLMULQDQ, +- F_AVX512VNNI, +- F_AVX512BITALG, +- F_MAX +-}; +- +-/* These are the values for vendor types and cpu types and subtypes +- in cpuinfo.c. Cpu types and subtypes should be subtracted by +- the corresponding start value. */ +-enum processor_model +-{ +- M_INTEL = 1, +- M_AMD, +- M_CPU_TYPE_START, +- M_INTEL_BONNELL, +- M_INTEL_CORE2, +- M_INTEL_COREI7, +- M_AMDFAM10H, +- M_AMDFAM15H, +- M_INTEL_SILVERMONT, +- M_INTEL_KNL, +- M_AMD_BTVER1, +- M_AMD_BTVER2, +- M_AMDFAM17H, +- M_INTEL_KNM, +- M_INTEL_GOLDMONT, +- M_INTEL_GOLDMONT_PLUS, +- M_INTEL_TREMONT, +- M_CPU_SUBTYPE_START, +- M_INTEL_COREI7_NEHALEM, +- M_INTEL_COREI7_WESTMERE, +- M_INTEL_COREI7_SANDYBRIDGE, +- M_AMDFAM10H_BARCELONA, +- M_AMDFAM10H_SHANGHAI, +- M_AMDFAM10H_ISTANBUL, +- M_AMDFAM15H_BDVER1, +- M_AMDFAM15H_BDVER2, +- M_AMDFAM15H_BDVER3, +- M_AMDFAM15H_BDVER4, +- M_AMDFAM17H_ZNVER1, +- M_INTEL_COREI7_IVYBRIDGE, +- M_INTEL_COREI7_HASWELL, +- M_INTEL_COREI7_BROADWELL, +- M_INTEL_COREI7_SKYLAKE, +- M_INTEL_COREI7_SKYLAKE_AVX512, +- M_INTEL_COREI7_CANNONLAKE, +- M_INTEL_COREI7_ICELAKE_CLIENT, +- M_INTEL_COREI7_ICELAKE_SERVER, +- M_AMDFAM17H_ZNVER2, +- M_INTEL_COREI7_CASCADELAKE +-}; +- +-struct _arch_names_table +-{ +- const char *const name; +- const enum processor_model model; +-}; +- +-static const _arch_names_table arch_names_table[] = +-{ +- {"amd", M_AMD}, +- {"intel", M_INTEL}, +- {"atom", M_INTEL_BONNELL}, +- {"slm", M_INTEL_SILVERMONT}, +- {"core2", M_INTEL_CORE2}, +- {"corei7", M_INTEL_COREI7}, +- {"nehalem", M_INTEL_COREI7_NEHALEM}, +- {"westmere", M_INTEL_COREI7_WESTMERE}, +- {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE}, +- {"ivybridge", M_INTEL_COREI7_IVYBRIDGE}, +- {"haswell", M_INTEL_COREI7_HASWELL}, +- {"broadwell", M_INTEL_COREI7_BROADWELL}, +- {"skylake", M_INTEL_COREI7_SKYLAKE}, +- {"skylake-avx512", M_INTEL_COREI7_SKYLAKE_AVX512}, +- {"cannonlake", M_INTEL_COREI7_CANNONLAKE}, +- {"icelake-client", M_INTEL_COREI7_ICELAKE_CLIENT}, +- {"icelake-server", M_INTEL_COREI7_ICELAKE_SERVER}, +- {"cascadelake", M_INTEL_COREI7_CASCADELAKE}, +- {"bonnell", M_INTEL_BONNELL}, +- {"silvermont", M_INTEL_SILVERMONT}, +- {"goldmont", M_INTEL_GOLDMONT}, +- {"goldmont-plus", M_INTEL_GOLDMONT_PLUS}, +- {"tremont", M_INTEL_TREMONT}, +- {"knl", M_INTEL_KNL}, +- {"knm", M_INTEL_KNM}, +- {"amdfam10h", M_AMDFAM10H}, +- {"barcelona", M_AMDFAM10H_BARCELONA}, +- {"shanghai", M_AMDFAM10H_SHANGHAI}, +- {"istanbul", M_AMDFAM10H_ISTANBUL}, +- {"btver1", M_AMD_BTVER1}, +- {"amdfam15h", M_AMDFAM15H}, +- {"bdver1", M_AMDFAM15H_BDVER1}, +- {"bdver2", M_AMDFAM15H_BDVER2}, +- 
{"bdver3", M_AMDFAM15H_BDVER3}, +- {"bdver4", M_AMDFAM15H_BDVER4}, +- {"btver2", M_AMD_BTVER2}, +- {"amdfam17h", M_AMDFAM17H}, +- {"znver1", M_AMDFAM17H_ZNVER1}, +- {"znver2", M_AMDFAM17H_ZNVER2}, +-}; +- +-/* These are the target attribute strings for which a dispatcher is +- available, from fold_builtin_cpu. */ +-struct _isa_names_table +-{ +- const char *const name; +- const enum processor_features feature; +- const enum feature_priority priority; +-}; +- +-static const _isa_names_table isa_names_table[] = +-{ +- {"cmov", F_CMOV, P_ZERO}, +- {"mmx", F_MMX, P_MMX}, +- {"popcnt", F_POPCNT, P_POPCNT}, +- {"sse", F_SSE, P_SSE}, +- {"sse2", F_SSE2, P_SSE2}, +- {"sse3", F_SSE3, P_SSE3}, +- {"ssse3", F_SSSE3, P_SSSE3}, +- {"sse4a", F_SSE4_A, P_SSE4_A}, +- {"sse4.1", F_SSE4_1, P_SSE4_1}, +- {"sse4.2", F_SSE4_2, P_SSE4_2}, +- {"avx", F_AVX, P_AVX}, +- {"fma4", F_FMA4, P_FMA4}, +- {"xop", F_XOP, P_XOP}, +- {"fma", F_FMA, P_FMA}, +- {"avx2", F_AVX2, P_AVX2}, +- {"avx512f", F_AVX512F, P_AVX512F}, +- {"bmi", F_BMI, P_BMI}, +- {"bmi2", F_BMI2, P_BMI2}, +- {"aes", F_AES, P_AES}, +- {"pclmul", F_PCLMUL, P_PCLMUL}, +- {"avx512vl",F_AVX512VL, P_ZERO}, +- {"avx512bw",F_AVX512BW, P_ZERO}, +- {"avx512dq",F_AVX512DQ, P_ZERO}, +- {"avx512cd",F_AVX512CD, P_ZERO}, +- {"avx512er",F_AVX512ER, P_ZERO}, +- {"avx512pf",F_AVX512PF, P_ZERO}, +- {"avx512vbmi",F_AVX512VBMI, P_ZERO}, +- {"avx512ifma",F_AVX512IFMA, P_ZERO}, +- {"avx5124vnniw",F_AVX5124VNNIW, P_ZERO}, +- {"avx5124fmaps",F_AVX5124FMAPS, P_ZERO}, +- {"avx512vpopcntdq",F_AVX512VPOPCNTDQ, P_ZERO}, +- {"avx512vbmi2", F_AVX512VBMI2, P_ZERO}, +- {"gfni", F_GFNI, P_ZERO}, +- {"vpclmulqdq", F_VPCLMULQDQ, P_ZERO}, +- {"avx512vnni", F_AVX512VNNI, P_ZERO}, +- {"avx512bitalg", F_AVX512BITALG, P_ZERO} +-}; +- +-/* This parses the attribute arguments to target in DECL and determines +- the right builtin to use to match the platform specification. +- It returns the priority value for this version decl. If PREDICATE_LIST +- is not NULL, it stores the list of cpu features that need to be checked +- before dispatching this function. */ +- +-static unsigned int +-get_builtin_code_for_version (tree decl, tree *predicate_list) +-{ +- tree attrs; +- struct cl_target_option cur_target; +- tree target_node; +- struct cl_target_option *new_target; +- const char *arg_str = NULL; +- const char *attrs_str = NULL; +- char *tok_str = NULL; +- char *token; +- +- enum feature_priority priority = P_ZERO; +- +- static unsigned int NUM_FEATURES +- = sizeof (isa_names_table) / sizeof (_isa_names_table); +- +- unsigned int i; +- +- tree predicate_chain = NULL_TREE; +- tree predicate_decl, predicate_arg; +- +- attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl)); +- gcc_assert (attrs != NULL); +- +- attrs = TREE_VALUE (TREE_VALUE (attrs)); +- +- gcc_assert (TREE_CODE (attrs) == STRING_CST); +- attrs_str = TREE_STRING_POINTER (attrs); +- +- /* Return priority zero for default function. */ +- if (strcmp (attrs_str, "default") == 0) +- return 0; +- +- /* Handle arch= if specified. For priority, set it to be 1 more than +- the best instruction set the processor can handle. For instance, if +- there is a version for atom and a version for ssse3 (the highest ISA +- priority for atom), the atom version must be checked for dispatch +- before the ssse3 version. 
*/ +- if (strstr (attrs_str, "arch=") != NULL) +- { +- cl_target_option_save (&cur_target, &global_options); +- target_node = ix86_valid_target_attribute_tree (attrs, &global_options, +- &global_options_set); +- +- gcc_assert (target_node); +- if (target_node == error_mark_node) +- return 0; +- new_target = TREE_TARGET_OPTION (target_node); +- gcc_assert (new_target); +- +- if (new_target->arch_specified && new_target->arch > 0) +- { +- switch (new_target->arch) +- { +- case PROCESSOR_CORE2: +- arg_str = "core2"; +- priority = P_PROC_SSSE3; +- break; +- case PROCESSOR_NEHALEM: +- if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_PCLMUL) +- { +- arg_str = "westmere"; +- priority = P_PCLMUL; +- } +- else +- { +- /* We translate "arch=corei7" and "arch=nehalem" to +- "corei7" so that it will be mapped to M_INTEL_COREI7 +- as cpu type to cover all M_INTEL_COREI7_XXXs. */ +- arg_str = "corei7"; +- priority = P_PROC_SSE4_2; +- } +- break; +- case PROCESSOR_SANDYBRIDGE: +- if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_F16C) +- arg_str = "ivybridge"; +- else +- arg_str = "sandybridge"; +- priority = P_PROC_AVX; +- break; +- case PROCESSOR_HASWELL: +- if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_ADX) +- arg_str = "broadwell"; +- else +- arg_str = "haswell"; +- priority = P_PROC_AVX2; +- break; +- case PROCESSOR_SKYLAKE: +- arg_str = "skylake"; +- priority = P_PROC_AVX2; +- break; +- case PROCESSOR_SKYLAKE_AVX512: +- arg_str = "skylake-avx512"; +- priority = P_PROC_AVX512F; +- break; +- case PROCESSOR_CANNONLAKE: +- arg_str = "cannonlake"; +- priority = P_PROC_AVX512F; +- break; +- case PROCESSOR_ICELAKE_CLIENT: +- arg_str = "icelake-client"; +- priority = P_PROC_AVX512F; +- break; +- case PROCESSOR_ICELAKE_SERVER: +- arg_str = "icelake-server"; +- priority = P_PROC_AVX512F; +- break; +- case PROCESSOR_CASCADELAKE: +- arg_str = "cascadelake"; +- priority = P_PROC_AVX512F; +- break; +- case PROCESSOR_BONNELL: +- arg_str = "bonnell"; +- priority = P_PROC_SSSE3; +- break; +- case PROCESSOR_KNL: +- arg_str = "knl"; +- priority = P_PROC_AVX512F; +- break; +- case PROCESSOR_KNM: +- arg_str = "knm"; +- priority = P_PROC_AVX512F; +- break; +- case PROCESSOR_SILVERMONT: +- arg_str = "silvermont"; +- priority = P_PROC_SSE4_2; +- break; +- case PROCESSOR_GOLDMONT: +- arg_str = "goldmont"; +- priority = P_PROC_SSE4_2; +- break; +- case PROCESSOR_GOLDMONT_PLUS: +- arg_str = "goldmont-plus"; +- priority = P_PROC_SSE4_2; +- break; +- case PROCESSOR_TREMONT: +- arg_str = "tremont"; +- priority = P_PROC_SSE4_2; +- break; +- case PROCESSOR_AMDFAM10: +- arg_str = "amdfam10h"; +- priority = P_PROC_SSE4_A; +- break; +- case PROCESSOR_BTVER1: +- arg_str = "btver1"; +- priority = P_PROC_SSE4_A; +- break; +- case PROCESSOR_BTVER2: +- arg_str = "btver2"; +- priority = P_PROC_BMI; +- break; +- case PROCESSOR_BDVER1: +- arg_str = "bdver1"; +- priority = P_PROC_XOP; +- break; +- case PROCESSOR_BDVER2: +- arg_str = "bdver2"; +- priority = P_PROC_FMA; +- break; +- case PROCESSOR_BDVER3: +- arg_str = "bdver3"; +- priority = P_PROC_FMA; +- break; +- case PROCESSOR_BDVER4: +- arg_str = "bdver4"; +- priority = P_PROC_AVX2; +- break; +- case PROCESSOR_ZNVER1: +- arg_str = "znver1"; +- priority = P_PROC_AVX2; +- break; +- case PROCESSOR_ZNVER2: +- arg_str = "znver2"; +- priority = P_PROC_AVX2; +- break; +- } +- } +- +- cl_target_option_restore (&global_options, &cur_target); +- +- if (predicate_list && arg_str == NULL) +- { +- error_at (DECL_SOURCE_LOCATION (decl), +- "no dispatcher found for the versioning attributes"); 
+- return 0; +- } +- +- if (predicate_list) +- { +- predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS]; +- /* For a C string literal the length includes the trailing NULL. */ +- predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str); +- predicate_chain = tree_cons (predicate_decl, predicate_arg, +- predicate_chain); +- } +- } +- +- /* Process feature name. */ +- tok_str = (char *) xmalloc (strlen (attrs_str) + 1); +- strcpy (tok_str, attrs_str); +- token = strtok (tok_str, ","); +- predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS]; +- +- while (token != NULL) +- { +- /* Do not process "arch=" */ +- if (strncmp (token, "arch=", 5) == 0) +- { +- token = strtok (NULL, ","); +- continue; +- } +- for (i = 0; i < NUM_FEATURES; ++i) +- { +- if (strcmp (token, isa_names_table[i].name) == 0) +- { +- if (predicate_list) +- { +- predicate_arg = build_string_literal ( +- strlen (isa_names_table[i].name) + 1, +- isa_names_table[i].name); +- predicate_chain = tree_cons (predicate_decl, predicate_arg, +- predicate_chain); +- } +- /* Find the maximum priority feature. */ +- if (isa_names_table[i].priority > priority) +- priority = isa_names_table[i].priority; +- +- break; +- } +- } +- if (predicate_list && priority == P_ZERO) +- { +- error_at (DECL_SOURCE_LOCATION (decl), +- "ISA %qs is not supported in %<target%> attribute, " +- "use %<arch=%> syntax", token); +- return 0; +- } +- token = strtok (NULL, ","); +- } +- free (tok_str); +- +- if (predicate_list && predicate_chain == NULL_TREE) +- { +- error_at (DECL_SOURCE_LOCATION (decl), +- "no dispatcher found for the versioning attributes: %s", +- attrs_str); +- return 0; +- } +- else if (predicate_list) +- { +- predicate_chain = nreverse (predicate_chain); +- *predicate_list = predicate_chain; +- } +- +- return priority; +-} +- +-/* This compares the priority of target features in function DECL1 +- and DECL2. It returns positive value if DECL1 is higher priority, +- negative value if DECL2 is higher priority and 0 if they are the +- same. */ +- +-static int +-ix86_compare_version_priority (tree decl1, tree decl2) +-{ +- unsigned int priority1 = get_builtin_code_for_version (decl1, NULL); +- unsigned int priority2 = get_builtin_code_for_version (decl2, NULL); +- +- return (int)priority1 - (int)priority2; +-} +- +-/* V1 and V2 point to function versions with different priorities +- based on the target ISA. This function compares their priorities. */ +- +-static int +-feature_compare (const void *v1, const void *v2) +-{ +- typedef struct _function_version_info +- { +- tree version_decl; +- tree predicate_chain; +- unsigned int dispatch_priority; +- } function_version_info; +- +- const function_version_info c1 = *(const function_version_info *)v1; +- const function_version_info c2 = *(const function_version_info *)v2; +- return (c2.dispatch_priority - c1.dispatch_priority); +-} +- +-/* This function generates the dispatch function for +- multi-versioned functions. DISPATCH_DECL is the function which will +- contain the dispatch logic. FNDECLS are the function choices for +- dispatch, and is a tree chain. EMPTY_BB is the basic block pointer +- in DISPATCH_DECL in which the dispatch code is generated.
*/ +- +-static int +-dispatch_function_versions (tree dispatch_decl, +- void *fndecls_p, +- basic_block *empty_bb) +-{ +- tree default_decl; +- gimple *ifunc_cpu_init_stmt; +- gimple_seq gseq; +- int ix; +- tree ele; +- vec<tree> *fndecls; +- unsigned int num_versions = 0; +- unsigned int actual_versions = 0; +- unsigned int i; +- +- struct _function_version_info +- { +- tree version_decl; +- tree predicate_chain; +- unsigned int dispatch_priority; +- }*function_version_info; +- +- gcc_assert (dispatch_decl != NULL +- && fndecls_p != NULL +- && empty_bb != NULL); +- +- /*fndecls_p is actually a vector. */ +- fndecls = static_cast<vec<tree> *> (fndecls_p); +- +- /* At least one more version other than the default. */ +- num_versions = fndecls->length (); +- gcc_assert (num_versions >= 2); +- +- function_version_info = (struct _function_version_info *) +- XNEWVEC (struct _function_version_info, (num_versions - 1)); +- +- /* The first version in the vector is the default decl. */ +- default_decl = (*fndecls)[0]; +- +- push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl)); +- +- gseq = bb_seq (*empty_bb); +- /* Function version dispatch is via IFUNC. IFUNC resolvers fire before +- constructors, so explicity call __builtin_cpu_init here. */ +- ifunc_cpu_init_stmt = gimple_build_call_vec ( +- ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL); +- gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt); +- gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb); +- set_bb_seq (*empty_bb, gseq); +- +- pop_cfun (); +- +- +- for (ix = 1; fndecls->iterate (ix, &ele); ++ix) +- { +- tree version_decl = ele; +- tree predicate_chain = NULL_TREE; +- unsigned int priority; +- /* Get attribute string, parse it and find the right predicate decl. +- The predicate function could be a lengthy combination of many +- features, like arch-type and various isa-variants. */ +- priority = get_builtin_code_for_version (version_decl, +- &predicate_chain); +- +- if (predicate_chain == NULL_TREE) +- continue; +- +- function_version_info [actual_versions].version_decl = version_decl; +- function_version_info [actual_versions].predicate_chain +- = predicate_chain; +- function_version_info [actual_versions].dispatch_priority = priority; +- actual_versions++; +- } +- +- /* Sort the versions according to descending order of dispatch priority. The +- priority is based on the ISA. This is not a perfect solution. There +- could still be ambiguity. If more than one function version is suitable +- to execute, which one should be dispatched? In future, allow the user +- to specify a dispatch priority next to the version. */ +- qsort (function_version_info, actual_versions, +- sizeof (struct _function_version_info), feature_compare); +- +- for (i = 0; i < actual_versions; ++i) +- *empty_bb = add_condition_to_bb (dispatch_decl, +- function_version_info[i].version_decl, +- function_version_info[i].predicate_chain, +- *empty_bb); +- +- /* dispatch default version at the end. */ +- *empty_bb = add_condition_to_bb (dispatch_decl, default_decl, +- NULL, *empty_bb); +- +- free (function_version_info); +- return 0; +-} +- +-/* This function changes the assembler name for functions that are +- versions. If DECL is a function version and has a "target" +- attribute, it appends the attribute string to its assembler name.
*/ +- +-static tree +-ix86_mangle_function_version_assembler_name (tree decl, tree id) +-{ +- tree version_attr; +- const char *orig_name, *version_string; +- char *attr_str, *assembler_name; +- +- if (DECL_DECLARED_INLINE_P (decl) +- && lookup_attribute ("gnu_inline", +- DECL_ATTRIBUTES (decl))) +- error_at (DECL_SOURCE_LOCATION (decl), +- "function versions cannot be marked as gnu_inline," +- " bodies have to be generated"); +- +- if (DECL_VIRTUAL_P (decl) +- || DECL_VINDEX (decl)) +- sorry ("virtual function multiversioning not supported"); +- +- version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl)); +- +- /* target attribute string cannot be NULL. */ +- gcc_assert (version_attr != NULL_TREE); +- +- orig_name = IDENTIFIER_POINTER (id); +- version_string +- = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr))); +- +- if (strcmp (version_string, "default") == 0) +- return id; +- +- attr_str = sorted_attr_string (TREE_VALUE (version_attr)); +- assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2); +- +- sprintf (assembler_name, "%s.%s", orig_name, attr_str); +- +- /* Allow assembler name to be modified if already set. */ +- if (DECL_ASSEMBLER_NAME_SET_P (decl)) +- SET_DECL_RTL (decl, NULL); +- +- tree ret = get_identifier (assembler_name); +- XDELETEVEC (attr_str); +- XDELETEVEC (assembler_name); +- return ret; +-} +- +- +-static tree +-ix86_mangle_decl_assembler_name (tree decl, tree id) +-{ +- /* For function version, add the target suffix to the assembler name. */ +- if (TREE_CODE (decl) == FUNCTION_DECL +- && DECL_FUNCTION_VERSIONED (decl)) +- id = ix86_mangle_function_version_assembler_name (decl, id); +-#ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME +- id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id); +-#endif +- +- return id; +-} +- +-/* Make a dispatcher declaration for the multi-versioned function DECL. +- Calls to DECL function will be replaced with calls to the dispatcher +- by the front-end. Returns the decl of the dispatcher function. */ +- +-static tree +-ix86_get_function_versions_dispatcher (void *decl) +-{ +- tree fn = (tree) decl; +- struct cgraph_node *node = NULL; +- struct cgraph_node *default_node = NULL; +- struct cgraph_function_version_info *node_v = NULL; +- struct cgraph_function_version_info *first_v = NULL; +- +- tree dispatch_decl = NULL; +- +- struct cgraph_function_version_info *default_version_info = NULL; +- +- gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn)); +- +- node = cgraph_node::get (fn); +- gcc_assert (node != NULL); +- +- node_v = node->function_version (); +- gcc_assert (node_v != NULL); +- +- if (node_v->dispatcher_resolver != NULL) +- return node_v->dispatcher_resolver; +- +- /* Find the default version and make it the first node. */ +- first_v = node_v; +- /* Go to the beginning of the chain. */ +- while (first_v->prev != NULL) +- first_v = first_v->prev; +- default_version_info = first_v; +- while (default_version_info != NULL) +- { +- if (is_function_default_version +- (default_version_info->this_node->decl)) +- break; +- default_version_info = default_version_info->next; +- } +- +- /* If there is no default node, just return NULL. */ +- if (default_version_info == NULL) +- return NULL; +- +- /* Make default info the first node. 
*/ +- if (first_v != default_version_info) +- { +- default_version_info->prev->next = default_version_info->next; +- if (default_version_info->next) +- default_version_info->next->prev = default_version_info->prev; +- first_v->prev = default_version_info; +- default_version_info->next = first_v; +- default_version_info->prev = NULL; +- } +- +- default_node = default_version_info->this_node; +- +-#if defined (ASM_OUTPUT_TYPE_DIRECTIVE) +- if (targetm.has_ifunc_p ()) +- { +- struct cgraph_function_version_info *it_v = NULL; +- struct cgraph_node *dispatcher_node = NULL; +- struct cgraph_function_version_info *dispatcher_version_info = NULL; +- +- /* Right now, the dispatching is done via ifunc. */ +- dispatch_decl = make_dispatcher_decl (default_node->decl); +- +- dispatcher_node = cgraph_node::get_create (dispatch_decl); +- gcc_assert (dispatcher_node != NULL); +- dispatcher_node->dispatcher_function = 1; +- dispatcher_version_info +- = dispatcher_node->insert_new_function_version (); +- dispatcher_version_info->next = default_version_info; +- dispatcher_node->definition = 1; +- +- /* Set the dispatcher for all the versions. */ +- it_v = default_version_info; +- while (it_v != NULL) +- { +- it_v->dispatcher_resolver = dispatch_decl; +- it_v = it_v->next; +- } +- } +- else +-#endif +- { +- error_at (DECL_SOURCE_LOCATION (default_node->decl), +- "multiversioning needs ifunc which is not supported " +- "on this target"); +- } +- +- return dispatch_decl; +-} +- +-/* Make the resolver function decl to dispatch the versions of +- a multi-versioned function, DEFAULT_DECL. IFUNC_ALIAS_DECL is +- ifunc alias that will point to the created resolver. Create an +- empty basic block in the resolver and store the pointer in +- EMPTY_BB. Return the decl of the resolver function. */ +- +-static tree +-make_resolver_func (const tree default_decl, +- const tree ifunc_alias_decl, +- basic_block *empty_bb) +-{ +- char *resolver_name; +- tree decl, type, decl_name, t; +- +- /* IFUNC's have to be globally visible. So, if the default_decl is +- not, then the name of the IFUNC should be made unique. */ +- if (TREE_PUBLIC (default_decl) == 0) +- { +- char *ifunc_name = make_unique_name (default_decl, "ifunc", true); +- symtab->change_decl_assembler_name (ifunc_alias_decl, +- get_identifier (ifunc_name)); +- XDELETEVEC (ifunc_name); +- } +- +- resolver_name = make_unique_name (default_decl, "resolver", false); +- +- /* The resolver function should return a (void *). */ +- type = build_function_type_list (ptr_type_node, NULL_TREE); +- +- decl = build_fn_decl (resolver_name, type); +- decl_name = get_identifier (resolver_name); +- SET_DECL_ASSEMBLER_NAME (decl, decl_name); +- +- DECL_NAME (decl) = decl_name; +- TREE_USED (decl) = 1; +- DECL_ARTIFICIAL (decl) = 1; +- DECL_IGNORED_P (decl) = 1; +- TREE_PUBLIC (decl) = 0; +- DECL_UNINLINABLE (decl) = 1; +- +- /* Resolver is not external, body is generated. */ +- DECL_EXTERNAL (decl) = 0; +- DECL_EXTERNAL (ifunc_alias_decl) = 0; +- +- DECL_CONTEXT (decl) = NULL_TREE; +- DECL_INITIAL (decl) = make_node (BLOCK); +- DECL_STATIC_CONSTRUCTOR (decl) = 0; +- +- if (DECL_COMDAT_GROUP (default_decl) +- || TREE_PUBLIC (default_decl)) +- { +- /* In this case, each translation unit with a call to this +- versioned function will put out a resolver. Ensure it +- is comdat to keep just one copy. */ +- DECL_COMDAT (decl) = 1; +- make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl)); +- } +- /* Build result decl and add to function_decl. 
*/ +- t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node); +- DECL_CONTEXT (t) = decl; +- DECL_ARTIFICIAL (t) = 1; +- DECL_IGNORED_P (t) = 1; +- DECL_RESULT (decl) = t; +- +- gimplify_function_tree (decl); +- push_cfun (DECL_STRUCT_FUNCTION (decl)); +- *empty_bb = init_lowered_empty_function (decl, false, +- profile_count::uninitialized ()); +- +- cgraph_node::add_new_function (decl, true); +- symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl)); +- +- pop_cfun (); +- +- gcc_assert (ifunc_alias_decl != NULL); +- /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name. */ +- DECL_ATTRIBUTES (ifunc_alias_decl) +- = make_attribute ("ifunc", resolver_name, +- DECL_ATTRIBUTES (ifunc_alias_decl)); +- +- /* Create the alias for dispatch to resolver here. */ +- cgraph_node::create_same_body_alias (ifunc_alias_decl, decl); +- XDELETEVEC (resolver_name); +- return decl; +-} +- +-/* Generate the dispatching code body to dispatch multi-versioned function +- DECL. The target hook is called to process the "target" attributes and +- provide the code to dispatch the right function at run-time. NODE points +- to the dispatcher decl whose body will be created. */ +- +-static tree +-ix86_generate_version_dispatcher_body (void *node_p) +-{ +- tree resolver_decl; +- basic_block empty_bb; +- tree default_ver_decl; +- struct cgraph_node *versn; +- struct cgraph_node *node; +- +- struct cgraph_function_version_info *node_version_info = NULL; +- struct cgraph_function_version_info *versn_info = NULL; +- +- node = (cgraph_node *)node_p; +- +- node_version_info = node->function_version (); +- gcc_assert (node->dispatcher_function +- && node_version_info != NULL); +- +- if (node_version_info->dispatcher_resolver) +- return node_version_info->dispatcher_resolver; +- +- /* The first version in the chain corresponds to the default version. */ +- default_ver_decl = node_version_info->next->this_node->decl; +- +- /* node is going to be an alias, so remove the finalized bit. */ +- node->definition = false; +- +- resolver_decl = make_resolver_func (default_ver_decl, +- node->decl, &empty_bb); +- +- node_version_info->dispatcher_resolver = resolver_decl; +- +- push_cfun (DECL_STRUCT_FUNCTION (resolver_decl)); +- +- auto_vec fn_ver_vec; +- +- for (versn_info = node_version_info->next; versn_info; +- versn_info = versn_info->next) +- { +- versn = versn_info->this_node; +- /* Check for virtual functions here again, as by this time it should +- have been determined if this function needs a vtable index or +- not. This happens for methods in derived classes that override +- virtual methods in base classes but are not explicitly marked as +- virtual. */ +- if (DECL_VINDEX (versn->decl)) +- sorry ("virtual function multiversioning not supported"); +- +- fn_ver_vec.safe_push (versn->decl); +- } +- +- dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb); +- cgraph_edge::rebuild_edges (); +- pop_cfun (); +- return resolver_decl; +-} +-/* This builds the processor_model struct type defined in +- libgcc/config/i386/cpuinfo.c */ +- +-static tree +-build_processor_model_struct (void) +-{ +- const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype", +- "__cpu_features"}; +- tree field = NULL_TREE, field_chain = NULL_TREE; +- int i; +- tree type = make_node (RECORD_TYPE); +- +- /* The first 3 fields are unsigned int. 
*/ +- for (i = 0; i < 3; ++i) +- { +- field = build_decl (UNKNOWN_LOCATION, FIELD_DECL, +- get_identifier (field_name[i]), unsigned_type_node); +- if (field_chain != NULL_TREE) +- DECL_CHAIN (field) = field_chain; +- field_chain = field; +- } +- +- /* The last field is an array of unsigned integers of size one. */ +- field = build_decl (UNKNOWN_LOCATION, FIELD_DECL, +- get_identifier (field_name[3]), +- build_array_type (unsigned_type_node, +- build_index_type (size_one_node))); +- if (field_chain != NULL_TREE) +- DECL_CHAIN (field) = field_chain; +- field_chain = field; +- +- finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE); +- return type; +-} +- +-/* Returns a extern, comdat VAR_DECL of type TYPE and name NAME. */ +- +-static tree +-make_var_decl (tree type, const char *name) +-{ +- tree new_decl; +- +- new_decl = build_decl (UNKNOWN_LOCATION, +- VAR_DECL, +- get_identifier(name), +- type); +- +- DECL_EXTERNAL (new_decl) = 1; +- TREE_STATIC (new_decl) = 1; +- TREE_PUBLIC (new_decl) = 1; +- DECL_INITIAL (new_decl) = 0; +- DECL_ARTIFICIAL (new_decl) = 0; +- DECL_PRESERVE_P (new_decl) = 1; +- +- make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl)); +- assemble_variable (new_decl, 0, 0, 0); +- +- return new_decl; +-} +- +-/* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded +- into an integer defined in libgcc/config/i386/cpuinfo.c */ +- +-static tree +-fold_builtin_cpu (tree fndecl, tree *args) +-{ +- unsigned int i; +- enum ix86_builtins fn_code = (enum ix86_builtins) +- DECL_FUNCTION_CODE (fndecl); +- tree param_string_cst = NULL; +- +- tree __processor_model_type = build_processor_model_struct (); +- tree __cpu_model_var = make_var_decl (__processor_model_type, +- "__cpu_model"); +- +- +- varpool_node::add (__cpu_model_var); +- +- gcc_assert ((args != NULL) && (*args != NULL)); +- +- param_string_cst = *args; +- while (param_string_cst +- && TREE_CODE (param_string_cst) != STRING_CST) +- { +- /* *args must be a expr that can contain other EXPRS leading to a +- STRING_CST. */ +- if (!EXPR_P (param_string_cst)) +- { +- error ("parameter to builtin must be a string constant or literal"); +- return integer_zero_node; +- } +- param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0); +- } +- +- gcc_assert (param_string_cst); +- +- if (fn_code == IX86_BUILTIN_CPU_IS) +- { +- tree ref; +- tree field; +- tree final; +- +- unsigned int field_val = 0; +- unsigned int NUM_ARCH_NAMES +- = sizeof (arch_names_table) / sizeof (struct _arch_names_table); +- +- for (i = 0; i < NUM_ARCH_NAMES; i++) +- if (strcmp (arch_names_table[i].name, +- TREE_STRING_POINTER (param_string_cst)) == 0) +- break; +- +- if (i == NUM_ARCH_NAMES) +- { +- error ("parameter to builtin not valid: %s", +- TREE_STRING_POINTER (param_string_cst)); +- return integer_zero_node; +- } +- +- field = TYPE_FIELDS (__processor_model_type); +- field_val = arch_names_table[i].model; +- +- /* CPU types are stored in the next field. */ +- if (field_val > M_CPU_TYPE_START +- && field_val < M_CPU_SUBTYPE_START) +- { +- field = DECL_CHAIN (field); +- field_val -= M_CPU_TYPE_START; +- } +- +- /* CPU subtypes are stored in the next field. */ +- if (field_val > M_CPU_SUBTYPE_START) +- { +- field = DECL_CHAIN ( DECL_CHAIN (field)); +- field_val -= M_CPU_SUBTYPE_START; +- } +- +- /* Get the appropriate field in __cpu_model. */ +- ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var, +- field, NULL_TREE); +- +- /* Check the value. 
*/ +- final = build2 (EQ_EXPR, unsigned_type_node, ref, +- build_int_cstu (unsigned_type_node, field_val)); +- return build1 (CONVERT_EXPR, integer_type_node, final); +- } +- else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS) +- { +- tree ref; +- tree array_elt; +- tree field; +- tree final; +- +- unsigned int field_val = 0; +- unsigned int NUM_ISA_NAMES +- = sizeof (isa_names_table) / sizeof (struct _isa_names_table); +- +- for (i = 0; i < NUM_ISA_NAMES; i++) +- if (strcmp (isa_names_table[i].name, +- TREE_STRING_POINTER (param_string_cst)) == 0) +- break; +- +- if (i == NUM_ISA_NAMES) +- { +- error ("parameter to builtin not valid: %s", +- TREE_STRING_POINTER (param_string_cst)); +- return integer_zero_node; +- } +- +- if (isa_names_table[i].feature >= 32) +- { +- tree __cpu_features2_var = make_var_decl (unsigned_type_node, +- "__cpu_features2"); +- +- varpool_node::add (__cpu_features2_var); +- field_val = (1U << (isa_names_table[i].feature - 32)); +- /* Return __cpu_features2 & field_val */ +- final = build2 (BIT_AND_EXPR, unsigned_type_node, +- __cpu_features2_var, +- build_int_cstu (unsigned_type_node, field_val)); +- return build1 (CONVERT_EXPR, integer_type_node, final); +- } +- +- field = TYPE_FIELDS (__processor_model_type); +- /* Get the last field, which is __cpu_features. */ +- while (DECL_CHAIN (field)) +- field = DECL_CHAIN (field); +- +- /* Get the appropriate field: __cpu_model.__cpu_features */ +- ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var, +- field, NULL_TREE); +- +- /* Access the 0th element of __cpu_features array. */ +- array_elt = build4 (ARRAY_REF, unsigned_type_node, ref, +- integer_zero_node, NULL_TREE, NULL_TREE); +- +- field_val = (1U << isa_names_table[i].feature); +- /* Return __cpu_model.__cpu_features[0] & field_val */ +- final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt, +- build_int_cstu (unsigned_type_node, field_val)); +- return build1 (CONVERT_EXPR, integer_type_node, final); +- } +- gcc_unreachable (); +-} +- +-/* Return the shift count of a vector by scalar shift builtin second argument +- ARG1. */ +-static tree +-ix86_vector_shift_count (tree arg1) +-{ +- if (tree_fits_uhwi_p (arg1)) +- return arg1; +- else if (TREE_CODE (arg1) == VECTOR_CST && CHAR_BIT == 8) +- { +- /* The count argument is weird, passed in as various 128-bit +- (or 64-bit) vectors, the low 64 bits from it are the count. 
*/ +- unsigned char buf[16]; +- int len = native_encode_expr (arg1, buf, 16); +- if (len == 0) +- return NULL_TREE; +- tree t = native_interpret_expr (uint64_type_node, buf, len); +- if (t && tree_fits_uhwi_p (t)) +- return t; +- } +- return NULL_TREE; +-} +- +-static tree +-ix86_fold_builtin (tree fndecl, int n_args, +- tree *args, bool ignore ATTRIBUTE_UNUSED) +-{ +- if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD) +- { +- enum ix86_builtins fn_code = (enum ix86_builtins) +- DECL_FUNCTION_CODE (fndecl); +- enum rtx_code rcode; +- bool is_vshift; +- unsigned HOST_WIDE_INT mask; +- +- switch (fn_code) +- { +- case IX86_BUILTIN_CPU_IS: +- case IX86_BUILTIN_CPU_SUPPORTS: +- gcc_assert (n_args == 1); +- return fold_builtin_cpu (fndecl, args); +- +- case IX86_BUILTIN_NANQ: +- case IX86_BUILTIN_NANSQ: +- { +- tree type = TREE_TYPE (TREE_TYPE (fndecl)); +- const char *str = c_getstr (*args); +- int quiet = fn_code == IX86_BUILTIN_NANQ; +- REAL_VALUE_TYPE real; +- +- if (str && real_nan (&real, str, quiet, TYPE_MODE (type))) +- return build_real (type, real); +- return NULL_TREE; +- } +- +- case IX86_BUILTIN_INFQ: +- case IX86_BUILTIN_HUGE_VALQ: +- { +- tree type = TREE_TYPE (TREE_TYPE (fndecl)); +- REAL_VALUE_TYPE inf; +- real_inf (&inf); +- return build_real (type, inf); +- } +- +- case IX86_BUILTIN_TZCNT16: +- case IX86_BUILTIN_CTZS: +- case IX86_BUILTIN_TZCNT32: +- case IX86_BUILTIN_TZCNT64: +- gcc_assert (n_args == 1); +- if (TREE_CODE (args[0]) == INTEGER_CST) +- { +- tree type = TREE_TYPE (TREE_TYPE (fndecl)); +- tree arg = args[0]; +- if (fn_code == IX86_BUILTIN_TZCNT16 +- || fn_code == IX86_BUILTIN_CTZS) +- arg = fold_convert (short_unsigned_type_node, arg); +- if (integer_zerop (arg)) +- return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg))); +- else +- return fold_const_call (CFN_CTZ, type, arg); +- } +- break; +- +- case IX86_BUILTIN_LZCNT16: +- case IX86_BUILTIN_CLZS: +- case IX86_BUILTIN_LZCNT32: +- case IX86_BUILTIN_LZCNT64: +- gcc_assert (n_args == 1); +- if (TREE_CODE (args[0]) == INTEGER_CST) +- { +- tree type = TREE_TYPE (TREE_TYPE (fndecl)); +- tree arg = args[0]; +- if (fn_code == IX86_BUILTIN_LZCNT16 +- || fn_code == IX86_BUILTIN_CLZS) +- arg = fold_convert (short_unsigned_type_node, arg); +- if (integer_zerop (arg)) +- return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg))); +- else +- return fold_const_call (CFN_CLZ, type, arg); +- } +- break; +- +- case IX86_BUILTIN_BEXTR32: +- case IX86_BUILTIN_BEXTR64: +- case IX86_BUILTIN_BEXTRI32: +- case IX86_BUILTIN_BEXTRI64: +- gcc_assert (n_args == 2); +- if (tree_fits_uhwi_p (args[1])) +- { +- unsigned HOST_WIDE_INT res = 0; +- unsigned int prec = TYPE_PRECISION (TREE_TYPE (args[0])); +- unsigned int start = tree_to_uhwi (args[1]); +- unsigned int len = (start & 0xff00) >> 8; +- start &= 0xff; +- if (start >= prec || len == 0) +- res = 0; +- else if (!tree_fits_uhwi_p (args[0])) +- break; +- else +- res = tree_to_uhwi (args[0]) >> start; +- if (len > prec) +- len = prec; +- if (len < HOST_BITS_PER_WIDE_INT) +- res &= (HOST_WIDE_INT_1U << len) - 1; +- return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res); +- } +- break; +- +- case IX86_BUILTIN_BZHI32: +- case IX86_BUILTIN_BZHI64: +- gcc_assert (n_args == 2); +- if (tree_fits_uhwi_p (args[1])) +- { +- unsigned int idx = tree_to_uhwi (args[1]) & 0xff; +- if (idx >= TYPE_PRECISION (TREE_TYPE (args[0]))) +- return args[0]; +- if (idx == 0) +- return build_int_cst (TREE_TYPE (TREE_TYPE (fndecl)), 0); +- if (!tree_fits_uhwi_p (args[0])) +- break; +- unsigned 
HOST_WIDE_INT res = tree_to_uhwi (args[0]); +- res &= ~(HOST_WIDE_INT_M1U << idx); +- return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res); +- } +- break; +- +- case IX86_BUILTIN_PDEP32: +- case IX86_BUILTIN_PDEP64: +- gcc_assert (n_args == 2); +- if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1])) +- { +- unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]); +- unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]); +- unsigned HOST_WIDE_INT res = 0; +- unsigned HOST_WIDE_INT m, k = 1; +- for (m = 1; m; m <<= 1) +- if ((mask & m) != 0) +- { +- if ((src & k) != 0) +- res |= m; +- k <<= 1; +- } +- return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res); +- } +- break; +- +- case IX86_BUILTIN_PEXT32: +- case IX86_BUILTIN_PEXT64: +- gcc_assert (n_args == 2); +- if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1])) +- { +- unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]); +- unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]); +- unsigned HOST_WIDE_INT res = 0; +- unsigned HOST_WIDE_INT m, k = 1; +- for (m = 1; m; m <<= 1) +- if ((mask & m) != 0) +- { +- if ((src & m) != 0) +- res |= k; +- k <<= 1; +- } +- return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res); +- } +- break; +- +- case IX86_BUILTIN_MOVMSKPS: +- case IX86_BUILTIN_PMOVMSKB: +- case IX86_BUILTIN_MOVMSKPD: +- case IX86_BUILTIN_PMOVMSKB128: +- case IX86_BUILTIN_MOVMSKPD256: +- case IX86_BUILTIN_MOVMSKPS256: +- case IX86_BUILTIN_PMOVMSKB256: +- gcc_assert (n_args == 1); +- if (TREE_CODE (args[0]) == VECTOR_CST) +- { +- HOST_WIDE_INT res = 0; +- for (unsigned i = 0; i < VECTOR_CST_NELTS (args[0]); ++i) +- { +- tree e = VECTOR_CST_ELT (args[0], i); +- if (TREE_CODE (e) == INTEGER_CST && !TREE_OVERFLOW (e)) +- { +- if (wi::neg_p (wi::to_wide (e))) +- res |= HOST_WIDE_INT_1 << i; +- } +- else if (TREE_CODE (e) == REAL_CST && !TREE_OVERFLOW (e)) +- { +- if (TREE_REAL_CST (e).sign) +- res |= HOST_WIDE_INT_1 << i; +- } +- else +- return NULL_TREE; +- } +- return build_int_cst (TREE_TYPE (TREE_TYPE (fndecl)), res); +- } +- break; +- +- case IX86_BUILTIN_PSLLD: +- case IX86_BUILTIN_PSLLD128: +- case IX86_BUILTIN_PSLLD128_MASK: +- case IX86_BUILTIN_PSLLD256: +- case IX86_BUILTIN_PSLLD256_MASK: +- case IX86_BUILTIN_PSLLD512: +- case IX86_BUILTIN_PSLLDI: +- case IX86_BUILTIN_PSLLDI128: +- case IX86_BUILTIN_PSLLDI128_MASK: +- case IX86_BUILTIN_PSLLDI256: +- case IX86_BUILTIN_PSLLDI256_MASK: +- case IX86_BUILTIN_PSLLDI512: +- case IX86_BUILTIN_PSLLQ: +- case IX86_BUILTIN_PSLLQ128: +- case IX86_BUILTIN_PSLLQ128_MASK: +- case IX86_BUILTIN_PSLLQ256: +- case IX86_BUILTIN_PSLLQ256_MASK: +- case IX86_BUILTIN_PSLLQ512: +- case IX86_BUILTIN_PSLLQI: +- case IX86_BUILTIN_PSLLQI128: +- case IX86_BUILTIN_PSLLQI128_MASK: +- case IX86_BUILTIN_PSLLQI256: +- case IX86_BUILTIN_PSLLQI256_MASK: +- case IX86_BUILTIN_PSLLQI512: +- case IX86_BUILTIN_PSLLW: +- case IX86_BUILTIN_PSLLW128: +- case IX86_BUILTIN_PSLLW128_MASK: +- case IX86_BUILTIN_PSLLW256: +- case IX86_BUILTIN_PSLLW256_MASK: +- case IX86_BUILTIN_PSLLW512_MASK: +- case IX86_BUILTIN_PSLLWI: +- case IX86_BUILTIN_PSLLWI128: +- case IX86_BUILTIN_PSLLWI128_MASK: +- case IX86_BUILTIN_PSLLWI256: +- case IX86_BUILTIN_PSLLWI256_MASK: +- case IX86_BUILTIN_PSLLWI512_MASK: +- rcode = ASHIFT; +- is_vshift = false; +- goto do_shift; +- case IX86_BUILTIN_PSRAD: +- case IX86_BUILTIN_PSRAD128: +- case IX86_BUILTIN_PSRAD128_MASK: +- case IX86_BUILTIN_PSRAD256: +- case IX86_BUILTIN_PSRAD256_MASK: +- case IX86_BUILTIN_PSRAD512: +- case IX86_BUILTIN_PSRADI: +- case IX86_BUILTIN_PSRADI128: +- 
case IX86_BUILTIN_PSRADI128_MASK: +- case IX86_BUILTIN_PSRADI256: +- case IX86_BUILTIN_PSRADI256_MASK: +- case IX86_BUILTIN_PSRADI512: +- case IX86_BUILTIN_PSRAQ128_MASK: +- case IX86_BUILTIN_PSRAQ256_MASK: +- case IX86_BUILTIN_PSRAQ512: +- case IX86_BUILTIN_PSRAQI128_MASK: +- case IX86_BUILTIN_PSRAQI256_MASK: +- case IX86_BUILTIN_PSRAQI512: +- case IX86_BUILTIN_PSRAW: +- case IX86_BUILTIN_PSRAW128: +- case IX86_BUILTIN_PSRAW128_MASK: +- case IX86_BUILTIN_PSRAW256: +- case IX86_BUILTIN_PSRAW256_MASK: +- case IX86_BUILTIN_PSRAW512: +- case IX86_BUILTIN_PSRAWI: +- case IX86_BUILTIN_PSRAWI128: +- case IX86_BUILTIN_PSRAWI128_MASK: +- case IX86_BUILTIN_PSRAWI256: +- case IX86_BUILTIN_PSRAWI256_MASK: +- case IX86_BUILTIN_PSRAWI512: +- rcode = ASHIFTRT; +- is_vshift = false; +- goto do_shift; +- case IX86_BUILTIN_PSRLD: +- case IX86_BUILTIN_PSRLD128: +- case IX86_BUILTIN_PSRLD128_MASK: +- case IX86_BUILTIN_PSRLD256: +- case IX86_BUILTIN_PSRLD256_MASK: +- case IX86_BUILTIN_PSRLD512: +- case IX86_BUILTIN_PSRLDI: +- case IX86_BUILTIN_PSRLDI128: +- case IX86_BUILTIN_PSRLDI128_MASK: +- case IX86_BUILTIN_PSRLDI256: +- case IX86_BUILTIN_PSRLDI256_MASK: +- case IX86_BUILTIN_PSRLDI512: +- case IX86_BUILTIN_PSRLQ: +- case IX86_BUILTIN_PSRLQ128: +- case IX86_BUILTIN_PSRLQ128_MASK: +- case IX86_BUILTIN_PSRLQ256: +- case IX86_BUILTIN_PSRLQ256_MASK: +- case IX86_BUILTIN_PSRLQ512: +- case IX86_BUILTIN_PSRLQI: +- case IX86_BUILTIN_PSRLQI128: +- case IX86_BUILTIN_PSRLQI128_MASK: +- case IX86_BUILTIN_PSRLQI256: +- case IX86_BUILTIN_PSRLQI256_MASK: +- case IX86_BUILTIN_PSRLQI512: +- case IX86_BUILTIN_PSRLW: +- case IX86_BUILTIN_PSRLW128: +- case IX86_BUILTIN_PSRLW128_MASK: +- case IX86_BUILTIN_PSRLW256: +- case IX86_BUILTIN_PSRLW256_MASK: +- case IX86_BUILTIN_PSRLW512: +- case IX86_BUILTIN_PSRLWI: +- case IX86_BUILTIN_PSRLWI128: +- case IX86_BUILTIN_PSRLWI128_MASK: +- case IX86_BUILTIN_PSRLWI256: +- case IX86_BUILTIN_PSRLWI256_MASK: +- case IX86_BUILTIN_PSRLWI512: +- rcode = LSHIFTRT; +- is_vshift = false; +- goto do_shift; +- case IX86_BUILTIN_PSLLVV16HI: +- case IX86_BUILTIN_PSLLVV16SI: +- case IX86_BUILTIN_PSLLVV2DI: +- case IX86_BUILTIN_PSLLVV2DI_MASK: +- case IX86_BUILTIN_PSLLVV32HI: +- case IX86_BUILTIN_PSLLVV4DI: +- case IX86_BUILTIN_PSLLVV4DI_MASK: +- case IX86_BUILTIN_PSLLVV4SI: +- case IX86_BUILTIN_PSLLVV4SI_MASK: +- case IX86_BUILTIN_PSLLVV8DI: +- case IX86_BUILTIN_PSLLVV8HI: +- case IX86_BUILTIN_PSLLVV8SI: +- case IX86_BUILTIN_PSLLVV8SI_MASK: +- rcode = ASHIFT; +- is_vshift = true; +- goto do_shift; +- case IX86_BUILTIN_PSRAVQ128: +- case IX86_BUILTIN_PSRAVQ256: +- case IX86_BUILTIN_PSRAVV16HI: +- case IX86_BUILTIN_PSRAVV16SI: +- case IX86_BUILTIN_PSRAVV32HI: +- case IX86_BUILTIN_PSRAVV4SI: +- case IX86_BUILTIN_PSRAVV4SI_MASK: +- case IX86_BUILTIN_PSRAVV8DI: +- case IX86_BUILTIN_PSRAVV8HI: +- case IX86_BUILTIN_PSRAVV8SI: +- case IX86_BUILTIN_PSRAVV8SI_MASK: +- rcode = ASHIFTRT; +- is_vshift = true; +- goto do_shift; +- case IX86_BUILTIN_PSRLVV16HI: +- case IX86_BUILTIN_PSRLVV16SI: +- case IX86_BUILTIN_PSRLVV2DI: +- case IX86_BUILTIN_PSRLVV2DI_MASK: +- case IX86_BUILTIN_PSRLVV32HI: +- case IX86_BUILTIN_PSRLVV4DI: +- case IX86_BUILTIN_PSRLVV4DI_MASK: +- case IX86_BUILTIN_PSRLVV4SI: +- case IX86_BUILTIN_PSRLVV4SI_MASK: +- case IX86_BUILTIN_PSRLVV8DI: +- case IX86_BUILTIN_PSRLVV8HI: +- case IX86_BUILTIN_PSRLVV8SI: +- case IX86_BUILTIN_PSRLVV8SI_MASK: +- rcode = LSHIFTRT; +- is_vshift = true; +- goto do_shift; +- +- do_shift: +- gcc_assert (n_args >= 2); +- if (TREE_CODE (args[0]) != VECTOR_CST) +- break; +- 
mask = HOST_WIDE_INT_M1U; +- if (n_args > 2) +- { +- /* This is masked shift. */ +- if (!tree_fits_uhwi_p (args[n_args - 1]) +- || TREE_SIDE_EFFECTS (args[n_args - 2])) +- break; +- mask = tree_to_uhwi (args[n_args - 1]); +- unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (args[0])); +- mask |= HOST_WIDE_INT_M1U << elems; +- if (mask != HOST_WIDE_INT_M1U +- && TREE_CODE (args[n_args - 2]) != VECTOR_CST) +- break; +- if (mask == (HOST_WIDE_INT_M1U << elems)) +- return args[n_args - 2]; +- } +- if (is_vshift && TREE_CODE (args[1]) != VECTOR_CST) +- break; +- if (tree tem = (is_vshift ? integer_one_node +- : ix86_vector_shift_count (args[1]))) +- { +- unsigned HOST_WIDE_INT count = tree_to_uhwi (tem); +- unsigned HOST_WIDE_INT prec +- = TYPE_PRECISION (TREE_TYPE (TREE_TYPE (args[0]))); +- if (count == 0 && mask == HOST_WIDE_INT_M1U) +- return args[0]; +- if (count >= prec) +- { +- if (rcode == ASHIFTRT) +- count = prec - 1; +- else if (mask == HOST_WIDE_INT_M1U) +- return build_zero_cst (TREE_TYPE (args[0])); +- } +- tree countt = NULL_TREE; +- if (!is_vshift) +- { +- if (count >= prec) +- countt = integer_zero_node; +- else +- countt = build_int_cst (integer_type_node, count); +- } +- tree_vector_builder builder; +- if (mask != HOST_WIDE_INT_M1U || is_vshift) +- builder.new_vector (TREE_TYPE (args[0]), +- TYPE_VECTOR_SUBPARTS (TREE_TYPE (args[0])), +- 1); +- else +- builder.new_unary_operation (TREE_TYPE (args[0]), args[0], +- false); +- unsigned int cnt = builder.encoded_nelts (); +- for (unsigned int i = 0; i < cnt; ++i) +- { +- tree elt = VECTOR_CST_ELT (args[0], i); +- if (TREE_CODE (elt) != INTEGER_CST || TREE_OVERFLOW (elt)) +- return NULL_TREE; +- tree type = TREE_TYPE (elt); +- if (rcode == LSHIFTRT) +- elt = fold_convert (unsigned_type_for (type), elt); +- if (is_vshift) +- { +- countt = VECTOR_CST_ELT (args[1], i); +- if (TREE_CODE (countt) != INTEGER_CST +- || TREE_OVERFLOW (countt)) +- return NULL_TREE; +- if (wi::neg_p (wi::to_wide (countt)) +- || wi::to_widest (countt) >= prec) +- { +- if (rcode == ASHIFTRT) +- countt = build_int_cst (TREE_TYPE (countt), +- prec - 1); +- else +- { +- elt = build_zero_cst (TREE_TYPE (elt)); +- countt = build_zero_cst (TREE_TYPE (countt)); +- } +- } +- } +- else if (count >= prec) +- elt = build_zero_cst (TREE_TYPE (elt)); +- elt = const_binop (rcode == ASHIFT +- ? LSHIFT_EXPR : RSHIFT_EXPR, +- TREE_TYPE (elt), elt, countt); +- if (!elt || TREE_CODE (elt) != INTEGER_CST) +- return NULL_TREE; +- if (rcode == LSHIFTRT) +- elt = fold_convert (type, elt); +- if ((mask & (HOST_WIDE_INT_1U << i)) == 0) +- { +- elt = VECTOR_CST_ELT (args[n_args - 2], i); +- if (TREE_CODE (elt) != INTEGER_CST +- || TREE_OVERFLOW (elt)) +- return NULL_TREE; +- } +- builder.quick_push (elt); +- } +- return builder.build (); +- } +- break; +- +- default: +- break; +- } +- } +- +-#ifdef SUBTARGET_FOLD_BUILTIN +- return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore); +-#endif +- +- return NULL_TREE; +-} +- +-/* Fold a MD builtin (use ix86_fold_builtin for folding into +- constant) in GIMPLE. 
*/ +- +-bool +-ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi) +-{ +- gimple *stmt = gsi_stmt (*gsi); +- tree fndecl = gimple_call_fndecl (stmt); +- gcc_checking_assert (fndecl && fndecl_built_in_p (fndecl, BUILT_IN_MD)); +- int n_args = gimple_call_num_args (stmt); +- enum ix86_builtins fn_code = (enum ix86_builtins) DECL_FUNCTION_CODE (fndecl); +- tree decl = NULL_TREE; +- tree arg0, arg1; +- enum rtx_code rcode; +- unsigned HOST_WIDE_INT count; +- bool is_vshift; +- +- switch (fn_code) +- { +- case IX86_BUILTIN_TZCNT32: +- decl = builtin_decl_implicit (BUILT_IN_CTZ); +- goto fold_tzcnt_lzcnt; +- +- case IX86_BUILTIN_TZCNT64: +- decl = builtin_decl_implicit (BUILT_IN_CTZLL); +- goto fold_tzcnt_lzcnt; +- +- case IX86_BUILTIN_LZCNT32: +- decl = builtin_decl_implicit (BUILT_IN_CLZ); +- goto fold_tzcnt_lzcnt; +- +- case IX86_BUILTIN_LZCNT64: +- decl = builtin_decl_implicit (BUILT_IN_CLZLL); +- goto fold_tzcnt_lzcnt; +- +- fold_tzcnt_lzcnt: +- gcc_assert (n_args == 1); +- arg0 = gimple_call_arg (stmt, 0); +- if (TREE_CODE (arg0) == SSA_NAME && decl && gimple_call_lhs (stmt)) +- { +- int prec = TYPE_PRECISION (TREE_TYPE (arg0)); +- /* If arg0 is provably non-zero, optimize into generic +- __builtin_c[tl]z{,ll} function the middle-end handles +- better. */ +- if (!expr_not_equal_to (arg0, wi::zero (prec))) +- return false; +- +- location_t loc = gimple_location (stmt); +- gimple *g = gimple_build_call (decl, 1, arg0); +- gimple_set_location (g, loc); +- tree lhs = make_ssa_name (integer_type_node); +- gimple_call_set_lhs (g, lhs); +- gsi_insert_before (gsi, g, GSI_SAME_STMT); +- g = gimple_build_assign (gimple_call_lhs (stmt), NOP_EXPR, lhs); +- gimple_set_location (g, loc); +- gsi_replace (gsi, g, false); +- return true; +- } +- break; +- +- case IX86_BUILTIN_BZHI32: +- case IX86_BUILTIN_BZHI64: +- gcc_assert (n_args == 2); +- arg1 = gimple_call_arg (stmt, 1); +- if (tree_fits_uhwi_p (arg1) && gimple_call_lhs (stmt)) +- { +- unsigned int idx = tree_to_uhwi (arg1) & 0xff; +- arg0 = gimple_call_arg (stmt, 0); +- if (idx < TYPE_PRECISION (TREE_TYPE (arg0))) +- break; +- location_t loc = gimple_location (stmt); +- gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0); +- gimple_set_location (g, loc); +- gsi_replace (gsi, g, false); +- return true; +- } +- break; +- +- case IX86_BUILTIN_PDEP32: +- case IX86_BUILTIN_PDEP64: +- case IX86_BUILTIN_PEXT32: +- case IX86_BUILTIN_PEXT64: +- gcc_assert (n_args == 2); +- arg1 = gimple_call_arg (stmt, 1); +- if (integer_all_onesp (arg1) && gimple_call_lhs (stmt)) +- { +- location_t loc = gimple_location (stmt); +- arg0 = gimple_call_arg (stmt, 0); +- gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0); +- gimple_set_location (g, loc); +- gsi_replace (gsi, g, false); +- return true; +- } +- break; +- +- case IX86_BUILTIN_PSLLD: +- case IX86_BUILTIN_PSLLD128: +- case IX86_BUILTIN_PSLLD128_MASK: +- case IX86_BUILTIN_PSLLD256: +- case IX86_BUILTIN_PSLLD256_MASK: +- case IX86_BUILTIN_PSLLD512: +- case IX86_BUILTIN_PSLLDI: +- case IX86_BUILTIN_PSLLDI128: +- case IX86_BUILTIN_PSLLDI128_MASK: +- case IX86_BUILTIN_PSLLDI256: +- case IX86_BUILTIN_PSLLDI256_MASK: +- case IX86_BUILTIN_PSLLDI512: +- case IX86_BUILTIN_PSLLQ: +- case IX86_BUILTIN_PSLLQ128: +- case IX86_BUILTIN_PSLLQ128_MASK: +- case IX86_BUILTIN_PSLLQ256: +- case IX86_BUILTIN_PSLLQ256_MASK: +- case IX86_BUILTIN_PSLLQ512: +- case IX86_BUILTIN_PSLLQI: +- case IX86_BUILTIN_PSLLQI128: +- case IX86_BUILTIN_PSLLQI128_MASK: +- case IX86_BUILTIN_PSLLQI256: +- case 
IX86_BUILTIN_PSLLQI256_MASK: +- case IX86_BUILTIN_PSLLQI512: +- case IX86_BUILTIN_PSLLW: +- case IX86_BUILTIN_PSLLW128: +- case IX86_BUILTIN_PSLLW128_MASK: +- case IX86_BUILTIN_PSLLW256: +- case IX86_BUILTIN_PSLLW256_MASK: +- case IX86_BUILTIN_PSLLW512_MASK: +- case IX86_BUILTIN_PSLLWI: +- case IX86_BUILTIN_PSLLWI128: +- case IX86_BUILTIN_PSLLWI128_MASK: +- case IX86_BUILTIN_PSLLWI256: +- case IX86_BUILTIN_PSLLWI256_MASK: +- case IX86_BUILTIN_PSLLWI512_MASK: +- rcode = ASHIFT; +- is_vshift = false; +- goto do_shift; +- case IX86_BUILTIN_PSRAD: +- case IX86_BUILTIN_PSRAD128: +- case IX86_BUILTIN_PSRAD128_MASK: +- case IX86_BUILTIN_PSRAD256: +- case IX86_BUILTIN_PSRAD256_MASK: +- case IX86_BUILTIN_PSRAD512: +- case IX86_BUILTIN_PSRADI: +- case IX86_BUILTIN_PSRADI128: +- case IX86_BUILTIN_PSRADI128_MASK: +- case IX86_BUILTIN_PSRADI256: +- case IX86_BUILTIN_PSRADI256_MASK: +- case IX86_BUILTIN_PSRADI512: +- case IX86_BUILTIN_PSRAQ128_MASK: +- case IX86_BUILTIN_PSRAQ256_MASK: +- case IX86_BUILTIN_PSRAQ512: +- case IX86_BUILTIN_PSRAQI128_MASK: +- case IX86_BUILTIN_PSRAQI256_MASK: +- case IX86_BUILTIN_PSRAQI512: +- case IX86_BUILTIN_PSRAW: +- case IX86_BUILTIN_PSRAW128: +- case IX86_BUILTIN_PSRAW128_MASK: +- case IX86_BUILTIN_PSRAW256: +- case IX86_BUILTIN_PSRAW256_MASK: +- case IX86_BUILTIN_PSRAW512: +- case IX86_BUILTIN_PSRAWI: +- case IX86_BUILTIN_PSRAWI128: +- case IX86_BUILTIN_PSRAWI128_MASK: +- case IX86_BUILTIN_PSRAWI256: +- case IX86_BUILTIN_PSRAWI256_MASK: +- case IX86_BUILTIN_PSRAWI512: +- rcode = ASHIFTRT; +- is_vshift = false; +- goto do_shift; +- case IX86_BUILTIN_PSRLD: +- case IX86_BUILTIN_PSRLD128: +- case IX86_BUILTIN_PSRLD128_MASK: +- case IX86_BUILTIN_PSRLD256: +- case IX86_BUILTIN_PSRLD256_MASK: +- case IX86_BUILTIN_PSRLD512: +- case IX86_BUILTIN_PSRLDI: +- case IX86_BUILTIN_PSRLDI128: +- case IX86_BUILTIN_PSRLDI128_MASK: +- case IX86_BUILTIN_PSRLDI256: +- case IX86_BUILTIN_PSRLDI256_MASK: +- case IX86_BUILTIN_PSRLDI512: +- case IX86_BUILTIN_PSRLQ: +- case IX86_BUILTIN_PSRLQ128: +- case IX86_BUILTIN_PSRLQ128_MASK: +- case IX86_BUILTIN_PSRLQ256: +- case IX86_BUILTIN_PSRLQ256_MASK: +- case IX86_BUILTIN_PSRLQ512: +- case IX86_BUILTIN_PSRLQI: +- case IX86_BUILTIN_PSRLQI128: +- case IX86_BUILTIN_PSRLQI128_MASK: +- case IX86_BUILTIN_PSRLQI256: +- case IX86_BUILTIN_PSRLQI256_MASK: +- case IX86_BUILTIN_PSRLQI512: +- case IX86_BUILTIN_PSRLW: +- case IX86_BUILTIN_PSRLW128: +- case IX86_BUILTIN_PSRLW128_MASK: +- case IX86_BUILTIN_PSRLW256: +- case IX86_BUILTIN_PSRLW256_MASK: +- case IX86_BUILTIN_PSRLW512: +- case IX86_BUILTIN_PSRLWI: +- case IX86_BUILTIN_PSRLWI128: +- case IX86_BUILTIN_PSRLWI128_MASK: +- case IX86_BUILTIN_PSRLWI256: +- case IX86_BUILTIN_PSRLWI256_MASK: +- case IX86_BUILTIN_PSRLWI512: +- rcode = LSHIFTRT; +- is_vshift = false; +- goto do_shift; +- case IX86_BUILTIN_PSLLVV16HI: +- case IX86_BUILTIN_PSLLVV16SI: +- case IX86_BUILTIN_PSLLVV2DI: +- case IX86_BUILTIN_PSLLVV2DI_MASK: +- case IX86_BUILTIN_PSLLVV32HI: +- case IX86_BUILTIN_PSLLVV4DI: +- case IX86_BUILTIN_PSLLVV4DI_MASK: +- case IX86_BUILTIN_PSLLVV4SI: +- case IX86_BUILTIN_PSLLVV4SI_MASK: +- case IX86_BUILTIN_PSLLVV8DI: +- case IX86_BUILTIN_PSLLVV8HI: +- case IX86_BUILTIN_PSLLVV8SI: +- case IX86_BUILTIN_PSLLVV8SI_MASK: +- rcode = ASHIFT; +- is_vshift = true; +- goto do_shift; +- case IX86_BUILTIN_PSRAVQ128: +- case IX86_BUILTIN_PSRAVQ256: +- case IX86_BUILTIN_PSRAVV16HI: +- case IX86_BUILTIN_PSRAVV16SI: +- case IX86_BUILTIN_PSRAVV32HI: +- case IX86_BUILTIN_PSRAVV4SI: +- case IX86_BUILTIN_PSRAVV4SI_MASK: +- case 
IX86_BUILTIN_PSRAVV8DI: +- case IX86_BUILTIN_PSRAVV8HI: +- case IX86_BUILTIN_PSRAVV8SI: +- case IX86_BUILTIN_PSRAVV8SI_MASK: +- rcode = ASHIFTRT; +- is_vshift = true; +- goto do_shift; +- case IX86_BUILTIN_PSRLVV16HI: +- case IX86_BUILTIN_PSRLVV16SI: +- case IX86_BUILTIN_PSRLVV2DI: +- case IX86_BUILTIN_PSRLVV2DI_MASK: +- case IX86_BUILTIN_PSRLVV32HI: +- case IX86_BUILTIN_PSRLVV4DI: +- case IX86_BUILTIN_PSRLVV4DI_MASK: +- case IX86_BUILTIN_PSRLVV4SI: +- case IX86_BUILTIN_PSRLVV4SI_MASK: +- case IX86_BUILTIN_PSRLVV8DI: +- case IX86_BUILTIN_PSRLVV8HI: +- case IX86_BUILTIN_PSRLVV8SI: +- case IX86_BUILTIN_PSRLVV8SI_MASK: +- rcode = LSHIFTRT; +- is_vshift = true; +- goto do_shift; +- +- do_shift: +- gcc_assert (n_args >= 2); +- arg0 = gimple_call_arg (stmt, 0); +- arg1 = gimple_call_arg (stmt, 1); +- if (n_args > 2) +- { +- /* This is masked shift. Only optimize if the mask is all ones. */ +- tree argl = gimple_call_arg (stmt, n_args - 1); +- if (!tree_fits_uhwi_p (argl)) +- break; +- unsigned HOST_WIDE_INT mask = tree_to_uhwi (argl); +- unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); +- if ((mask | (HOST_WIDE_INT_M1U << elems)) != HOST_WIDE_INT_M1U) +- break; +- } +- if (is_vshift) +- { +- if (TREE_CODE (arg1) != VECTOR_CST) +- break; +- count = TYPE_PRECISION (TREE_TYPE (TREE_TYPE (arg0))); +- if (integer_zerop (arg1)) +- count = 0; +- else if (rcode == ASHIFTRT) +- break; +- else +- for (unsigned int i = 0; i < VECTOR_CST_NELTS (arg1); ++i) +- { +- tree elt = VECTOR_CST_ELT (arg1, i); +- if (!wi::neg_p (wi::to_wide (elt)) +- && wi::to_widest (elt) < count) +- return false; +- } +- } +- else +- { +- arg1 = ix86_vector_shift_count (arg1); +- if (!arg1) +- break; +- count = tree_to_uhwi (arg1); +- } +- if (count == 0) +- { +- /* Just return the first argument for shift by 0. */ +- location_t loc = gimple_location (stmt); +- gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0); +- gimple_set_location (g, loc); +- gsi_replace (gsi, g, false); +- return true; +- } +- if (rcode != ASHIFTRT +- && count >= TYPE_PRECISION (TREE_TYPE (TREE_TYPE (arg0)))) +- { +- /* For shift counts equal or greater than precision, except for +- arithmetic right shift the result is zero. */ +- location_t loc = gimple_location (stmt); +- gimple *g = gimple_build_assign (gimple_call_lhs (stmt), +- build_zero_cst (TREE_TYPE (arg0))); +- gimple_set_location (g, loc); +- gsi_replace (gsi, g, false); +- return true; +- } +- break; +- +- default: +- break; +- } +- +- return false; +-} +- +-/* Make builtins to detect cpu type and features supported. NAME is +- the builtin name, CODE is the builtin code, and FTYPE is the function +- type of the builtin. */ +- +-static void +-make_cpu_type_builtin (const char* name, int code, +- enum ix86_builtin_func_type ftype, bool is_const) +-{ +- tree decl; +- tree type; +- +- type = ix86_get_builtin_func_type (ftype); +- decl = add_builtin_function (name, type, code, BUILT_IN_MD, +- NULL, NULL_TREE); +- gcc_assert (decl != NULL_TREE); +- ix86_builtins[(int) code] = decl; +- TREE_READONLY (decl) = is_const; +-} +- +-/* Make builtins to get CPU type and features supported. 
The created +- builtins are : +- +- __builtin_cpu_init (), to detect cpu type and features, +- __builtin_cpu_is (""), to check if cpu is of type , +- __builtin_cpu_supports (""), to check if cpu supports +- */ +- +-static void +-ix86_init_platform_type_builtins (void) +-{ +- make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT, +- INT_FTYPE_VOID, false); +- make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS, +- INT_FTYPE_PCCHAR, true); +- make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS, +- INT_FTYPE_PCCHAR, true); +-} +- +-/* Internal method for ix86_init_builtins. */ +- +-static void +-ix86_init_builtins_va_builtins_abi (void) +-{ +- tree ms_va_ref, sysv_va_ref; +- tree fnvoid_va_end_ms, fnvoid_va_end_sysv; +- tree fnvoid_va_start_ms, fnvoid_va_start_sysv; +- tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv; +- tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE; +- +- if (!TARGET_64BIT) +- return; +- fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE); +- fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE); +- ms_va_ref = build_reference_type (ms_va_list_type_node); +- sysv_va_ref = build_pointer_type (TREE_TYPE (sysv_va_list_type_node)); +- +- fnvoid_va_end_ms = build_function_type_list (void_type_node, ms_va_ref, +- NULL_TREE); +- fnvoid_va_start_ms +- = build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE); +- fnvoid_va_end_sysv +- = build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE); +- fnvoid_va_start_sysv +- = build_varargs_function_type_list (void_type_node, sysv_va_ref, +- NULL_TREE); +- fnvoid_va_copy_ms +- = build_function_type_list (void_type_node, ms_va_ref, +- ms_va_list_type_node, NULL_TREE); +- fnvoid_va_copy_sysv +- = build_function_type_list (void_type_node, sysv_va_ref, +- sysv_va_ref, NULL_TREE); +- +- add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms, +- BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms); +- add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms, +- BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms); +- add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms, +- BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms); +- add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv, +- BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv); +- add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv, +- BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv); +- add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv, +- BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv); +-} +- +-static void +-ix86_init_builtin_types (void) +-{ +- tree float80_type_node, const_string_type_node; +- +- /* The __float80 type. */ +- float80_type_node = long_double_type_node; +- if (TYPE_MODE (float80_type_node) != XFmode) +- { +- if (float64x_type_node != NULL_TREE +- && TYPE_MODE (float64x_type_node) == XFmode) +- float80_type_node = float64x_type_node; +- else +- { +- /* The __float80 type. */ +- float80_type_node = make_node (REAL_TYPE); +- +- TYPE_PRECISION (float80_type_node) = 80; +- layout_type (float80_type_node); +- } +- } +- lang_hooks.types.register_builtin_type (float80_type_node, "__float80"); +- +- /* The __float128 type. The node has already been created as +- _Float128, so we only need to register the __float128 name for +- it. 
*/ +- lang_hooks.types.register_builtin_type (float128_type_node, "__float128"); +- +- const_string_type_node +- = build_pointer_type (build_qualified_type +- (char_type_node, TYPE_QUAL_CONST)); +- +- /* This macro is built by i386-builtin-types.awk. */ +- DEFINE_BUILTIN_PRIMITIVE_TYPES; +-} +- +-static void +-ix86_init_builtins (void) +-{ +- tree ftype, decl; +- +- ix86_init_builtin_types (); +- +- /* Builtins to get CPU type and features. */ +- ix86_init_platform_type_builtins (); +- +- /* TFmode support builtins. */ +- def_builtin_const (0, 0, "__builtin_infq", +- FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ); +- def_builtin_const (0, 0, "__builtin_huge_valq", +- FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ); +- +- ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_CONST_STRING); +- decl = add_builtin_function ("__builtin_nanq", ftype, IX86_BUILTIN_NANQ, +- BUILT_IN_MD, "nanq", NULL_TREE); +- TREE_READONLY (decl) = 1; +- ix86_builtins[(int) IX86_BUILTIN_NANQ] = decl; +- +- decl = add_builtin_function ("__builtin_nansq", ftype, IX86_BUILTIN_NANSQ, +- BUILT_IN_MD, "nansq", NULL_TREE); +- TREE_READONLY (decl) = 1; +- ix86_builtins[(int) IX86_BUILTIN_NANSQ] = decl; +- +- /* We will expand them to normal call if SSE isn't available since +- they are used by libgcc. */ +- ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128); +- decl = add_builtin_function ("__builtin_fabsq", ftype, IX86_BUILTIN_FABSQ, +- BUILT_IN_MD, "__fabstf2", NULL_TREE); +- TREE_READONLY (decl) = 1; +- ix86_builtins[(int) IX86_BUILTIN_FABSQ] = decl; +- +- ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128); +- decl = add_builtin_function ("__builtin_copysignq", ftype, +- IX86_BUILTIN_COPYSIGNQ, BUILT_IN_MD, +- "__copysigntf3", NULL_TREE); +- TREE_READONLY (decl) = 1; +- ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = decl; +- +- ix86_init_tm_builtins (); +- ix86_init_mmx_sse_builtins (); +- +- if (TARGET_LP64) +- ix86_init_builtins_va_builtins_abi (); +- +-#ifdef SUBTARGET_INIT_BUILTINS +- SUBTARGET_INIT_BUILTINS; +-#endif +-} +- +-/* Return the ix86 builtin for CODE. */ +- +-static tree +-ix86_builtin_decl (unsigned code, bool) +-{ +- if (code >= IX86_BUILTIN_MAX) +- return error_mark_node; +- +- return ix86_builtins[code]; +-} +- +-/* Errors in the source file can cause expand_expr to return const0_rtx +- where we expect a vector. To avoid crashing, use one of the vector +- clear instructions. */ +-static rtx +-safe_vector_operand (rtx x, machine_mode mode) +-{ +- if (x == const0_rtx) +- x = CONST0_RTX (mode); +- return x; +-} +- +-/* Fixup modeless constants to fit required mode. */ +-static rtx +-fixup_modeless_constant (rtx x, machine_mode mode) +-{ +- if (GET_MODE (x) == VOIDmode) +- x = convert_to_mode (mode, x, 1); +- return x; +-} +- +-/* Subroutine of ix86_expand_builtin to take care of binop insns. 
*/ +- +-static rtx +-ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target) +-{ +- rtx pat; +- tree arg0 = CALL_EXPR_ARG (exp, 0); +- tree arg1 = CALL_EXPR_ARG (exp, 1); +- rtx op0 = expand_normal (arg0); +- rtx op1 = expand_normal (arg1); +- machine_mode tmode = insn_data[icode].operand[0].mode; +- machine_mode mode0 = insn_data[icode].operand[1].mode; +- machine_mode mode1 = insn_data[icode].operand[2].mode; +- +- if (VECTOR_MODE_P (mode0)) +- op0 = safe_vector_operand (op0, mode0); +- if (VECTOR_MODE_P (mode1)) +- op1 = safe_vector_operand (op1, mode1); +- +- if (optimize || !target +- || GET_MODE (target) != tmode +- || !insn_data[icode].operand[0].predicate (target, tmode)) +- target = gen_reg_rtx (tmode); +- +- if (GET_MODE (op1) == SImode && mode1 == TImode) +- { +- rtx x = gen_reg_rtx (V4SImode); +- emit_insn (gen_sse2_loadd (x, op1)); +- op1 = gen_lowpart (TImode, x); +- } +- +- if (!insn_data[icode].operand[1].predicate (op0, mode0)) +- op0 = copy_to_mode_reg (mode0, op0); +- if (!insn_data[icode].operand[2].predicate (op1, mode1)) +- op1 = copy_to_mode_reg (mode1, op1); +- +- pat = GEN_FCN (icode) (target, op0, op1); +- if (! pat) +- return 0; +- +- emit_insn (pat); +- +- return target; +-} +- +-/* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */ +- +-static rtx +-ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target, +- enum ix86_builtin_func_type m_type, +- enum rtx_code sub_code) +-{ +- rtx pat; +- int i; +- int nargs; +- bool comparison_p = false; +- bool tf_p = false; +- bool last_arg_constant = false; +- int num_memory = 0; +- struct { +- rtx op; +- machine_mode mode; +- } args[4]; +- +- machine_mode tmode = insn_data[icode].operand[0].mode; +- +- switch (m_type) +- { +- case MULTI_ARG_4_DF2_DI_I: +- case MULTI_ARG_4_DF2_DI_I1: +- case MULTI_ARG_4_SF2_SI_I: +- case MULTI_ARG_4_SF2_SI_I1: +- nargs = 4; +- last_arg_constant = true; +- break; +- +- case MULTI_ARG_3_SF: +- case MULTI_ARG_3_DF: +- case MULTI_ARG_3_SF2: +- case MULTI_ARG_3_DF2: +- case MULTI_ARG_3_DI: +- case MULTI_ARG_3_SI: +- case MULTI_ARG_3_SI_DI: +- case MULTI_ARG_3_HI: +- case MULTI_ARG_3_HI_SI: +- case MULTI_ARG_3_QI: +- case MULTI_ARG_3_DI2: +- case MULTI_ARG_3_SI2: +- case MULTI_ARG_3_HI2: +- case MULTI_ARG_3_QI2: +- nargs = 3; +- break; +- +- case MULTI_ARG_2_SF: +- case MULTI_ARG_2_DF: +- case MULTI_ARG_2_DI: +- case MULTI_ARG_2_SI: +- case MULTI_ARG_2_HI: +- case MULTI_ARG_2_QI: +- nargs = 2; +- break; +- +- case MULTI_ARG_2_DI_IMM: +- case MULTI_ARG_2_SI_IMM: +- case MULTI_ARG_2_HI_IMM: +- case MULTI_ARG_2_QI_IMM: +- nargs = 2; +- last_arg_constant = true; +- break; +- +- case MULTI_ARG_1_SF: +- case MULTI_ARG_1_DF: +- case MULTI_ARG_1_SF2: +- case MULTI_ARG_1_DF2: +- case MULTI_ARG_1_DI: +- case MULTI_ARG_1_SI: +- case MULTI_ARG_1_HI: +- case MULTI_ARG_1_QI: +- case MULTI_ARG_1_SI_DI: +- case MULTI_ARG_1_HI_DI: +- case MULTI_ARG_1_HI_SI: +- case MULTI_ARG_1_QI_DI: +- case MULTI_ARG_1_QI_SI: +- case MULTI_ARG_1_QI_HI: +- nargs = 1; +- break; +- +- case MULTI_ARG_2_DI_CMP: +- case MULTI_ARG_2_SI_CMP: +- case MULTI_ARG_2_HI_CMP: +- case MULTI_ARG_2_QI_CMP: +- nargs = 2; +- comparison_p = true; +- break; +- +- case MULTI_ARG_2_SF_TF: +- case MULTI_ARG_2_DF_TF: +- case MULTI_ARG_2_DI_TF: +- case MULTI_ARG_2_SI_TF: +- case MULTI_ARG_2_HI_TF: +- case MULTI_ARG_2_QI_TF: +- nargs = 2; +- tf_p = true; +- break; +- +- default: +- gcc_unreachable (); +- } +- +- if (optimize || !target +- || GET_MODE (target) != tmode +- || 
!insn_data[icode].operand[0].predicate (target, tmode)) +- target = gen_reg_rtx (tmode); +- else if (memory_operand (target, tmode)) +- num_memory++; +- +- gcc_assert (nargs <= 4); +- +- for (i = 0; i < nargs; i++) +- { +- tree arg = CALL_EXPR_ARG (exp, i); +- rtx op = expand_normal (arg); +- int adjust = (comparison_p) ? 1 : 0; +- machine_mode mode = insn_data[icode].operand[i+adjust+1].mode; +- +- if (last_arg_constant && i == nargs - 1) +- { +- if (!insn_data[icode].operand[i + 1].predicate (op, mode)) +- { +- enum insn_code new_icode = icode; +- switch (icode) +- { +- case CODE_FOR_xop_vpermil2v2df3: +- case CODE_FOR_xop_vpermil2v4sf3: +- case CODE_FOR_xop_vpermil2v4df3: +- case CODE_FOR_xop_vpermil2v8sf3: +- error ("the last argument must be a 2-bit immediate"); +- return gen_reg_rtx (tmode); +- case CODE_FOR_xop_rotlv2di3: +- new_icode = CODE_FOR_rotlv2di3; +- goto xop_rotl; +- case CODE_FOR_xop_rotlv4si3: +- new_icode = CODE_FOR_rotlv4si3; +- goto xop_rotl; +- case CODE_FOR_xop_rotlv8hi3: +- new_icode = CODE_FOR_rotlv8hi3; +- goto xop_rotl; +- case CODE_FOR_xop_rotlv16qi3: +- new_icode = CODE_FOR_rotlv16qi3; +- xop_rotl: +- if (CONST_INT_P (op)) +- { +- int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1; +- op = GEN_INT (INTVAL (op) & mask); +- gcc_checking_assert +- (insn_data[icode].operand[i + 1].predicate (op, mode)); +- } +- else +- { +- gcc_checking_assert +- (nargs == 2 +- && insn_data[new_icode].operand[0].mode == tmode +- && insn_data[new_icode].operand[1].mode == tmode +- && insn_data[new_icode].operand[2].mode == mode +- && insn_data[new_icode].operand[0].predicate +- == insn_data[icode].operand[0].predicate +- && insn_data[new_icode].operand[1].predicate +- == insn_data[icode].operand[1].predicate); +- icode = new_icode; +- goto non_constant; +- } +- break; +- default: +- gcc_unreachable (); +- } +- } +- } +- else +- { +- non_constant: +- if (VECTOR_MODE_P (mode)) +- op = safe_vector_operand (op, mode); +- +- /* If we aren't optimizing, only allow one memory operand to be +- generated. */ +- if (memory_operand (op, mode)) +- num_memory++; +- +- gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode); +- +- if (optimize +- || !insn_data[icode].operand[i+adjust+1].predicate (op, mode) +- || num_memory > 1) +- op = force_reg (mode, op); +- } +- +- args[i].op = op; +- args[i].mode = mode; +- } +- +- switch (nargs) +- { +- case 1: +- pat = GEN_FCN (icode) (target, args[0].op); +- break; +- +- case 2: +- if (tf_p) +- pat = GEN_FCN (icode) (target, args[0].op, args[1].op, +- GEN_INT ((int)sub_code)); +- else if (! comparison_p) +- pat = GEN_FCN (icode) (target, args[0].op, args[1].op); +- else +- { +- rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target), +- args[0].op, +- args[1].op); +- +- pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op); +- } +- break; +- +- case 3: +- pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op); +- break; +- +- case 4: +- pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op); +- break; +- +- default: +- gcc_unreachable (); +- } +- +- if (! pat) +- return 0; +- +- emit_insn (pat); +- return target; +-} +- +-/* Subroutine of ix86_expand_args_builtin to take care of scalar unop +- insns with vec_merge. 
*/ +- +-static rtx +-ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp, +- rtx target) +-{ +- rtx pat; +- tree arg0 = CALL_EXPR_ARG (exp, 0); +- rtx op1, op0 = expand_normal (arg0); +- machine_mode tmode = insn_data[icode].operand[0].mode; +- machine_mode mode0 = insn_data[icode].operand[1].mode; +- +- if (optimize || !target +- || GET_MODE (target) != tmode +- || !insn_data[icode].operand[0].predicate (target, tmode)) +- target = gen_reg_rtx (tmode); +- +- if (VECTOR_MODE_P (mode0)) +- op0 = safe_vector_operand (op0, mode0); +- +- if ((optimize && !register_operand (op0, mode0)) +- || !insn_data[icode].operand[1].predicate (op0, mode0)) +- op0 = copy_to_mode_reg (mode0, op0); +- +- op1 = op0; +- if (!insn_data[icode].operand[2].predicate (op1, mode0)) +- op1 = copy_to_mode_reg (mode0, op1); +- +- pat = GEN_FCN (icode) (target, op0, op1); +- if (! pat) +- return 0; +- emit_insn (pat); +- return target; +-} +- +-/* Subroutine of ix86_expand_builtin to take care of comparison insns. */ +- +-static rtx +-ix86_expand_sse_compare (const struct builtin_description *d, +- tree exp, rtx target, bool swap) +-{ +- rtx pat; +- tree arg0 = CALL_EXPR_ARG (exp, 0); +- tree arg1 = CALL_EXPR_ARG (exp, 1); +- rtx op0 = expand_normal (arg0); +- rtx op1 = expand_normal (arg1); +- rtx op2; +- machine_mode tmode = insn_data[d->icode].operand[0].mode; +- machine_mode mode0 = insn_data[d->icode].operand[1].mode; +- machine_mode mode1 = insn_data[d->icode].operand[2].mode; +- enum rtx_code comparison = d->comparison; +- +- if (VECTOR_MODE_P (mode0)) +- op0 = safe_vector_operand (op0, mode0); +- if (VECTOR_MODE_P (mode1)) +- op1 = safe_vector_operand (op1, mode1); +- +- /* Swap operands if we have a comparison that isn't available in +- hardware. */ +- if (swap) +- std::swap (op0, op1); +- +- if (optimize || !target +- || GET_MODE (target) != tmode +- || !insn_data[d->icode].operand[0].predicate (target, tmode)) +- target = gen_reg_rtx (tmode); +- +- if ((optimize && !register_operand (op0, mode0)) +- || !insn_data[d->icode].operand[1].predicate (op0, mode0)) +- op0 = copy_to_mode_reg (mode0, op0); +- if ((optimize && !register_operand (op1, mode1)) +- || !insn_data[d->icode].operand[2].predicate (op1, mode1)) +- op1 = copy_to_mode_reg (mode1, op1); +- +- op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1); +- pat = GEN_FCN (d->icode) (target, op0, op1, op2); +- if (! pat) +- return 0; +- emit_insn (pat); +- return target; +-} +- +-/* Subroutine of ix86_expand_builtin to take care of comi insns. */ +- +-static rtx +-ix86_expand_sse_comi (const struct builtin_description *d, tree exp, +- rtx target) +-{ +- rtx pat; +- tree arg0 = CALL_EXPR_ARG (exp, 0); +- tree arg1 = CALL_EXPR_ARG (exp, 1); +- rtx op0 = expand_normal (arg0); +- rtx op1 = expand_normal (arg1); +- machine_mode mode0 = insn_data[d->icode].operand[0].mode; +- machine_mode mode1 = insn_data[d->icode].operand[1].mode; +- enum rtx_code comparison = d->comparison; +- +- if (VECTOR_MODE_P (mode0)) +- op0 = safe_vector_operand (op0, mode0); +- if (VECTOR_MODE_P (mode1)) +- op1 = safe_vector_operand (op1, mode1); +- +- /* Swap operands if we have a comparison that isn't available in +- hardware. 
*/ +- if (d->flag & BUILTIN_DESC_SWAP_OPERANDS) +- std::swap (op0, op1); +- +- target = gen_reg_rtx (SImode); +- emit_move_insn (target, const0_rtx); +- target = gen_rtx_SUBREG (QImode, target, 0); +- +- if ((optimize && !register_operand (op0, mode0)) +- || !insn_data[d->icode].operand[0].predicate (op0, mode0)) +- op0 = copy_to_mode_reg (mode0, op0); +- if ((optimize && !register_operand (op1, mode1)) +- || !insn_data[d->icode].operand[1].predicate (op1, mode1)) +- op1 = copy_to_mode_reg (mode1, op1); +- +- pat = GEN_FCN (d->icode) (op0, op1); +- if (! pat) +- return 0; +- emit_insn (pat); +- emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target), +- gen_rtx_fmt_ee (comparison, QImode, +- SET_DEST (pat), +- const0_rtx))); +- +- return SUBREG_REG (target); +-} +- +-/* Subroutines of ix86_expand_args_builtin to take care of round insns. */ +- +-static rtx +-ix86_expand_sse_round (const struct builtin_description *d, tree exp, +- rtx target) +-{ +- rtx pat; +- tree arg0 = CALL_EXPR_ARG (exp, 0); +- rtx op1, op0 = expand_normal (arg0); +- machine_mode tmode = insn_data[d->icode].operand[0].mode; +- machine_mode mode0 = insn_data[d->icode].operand[1].mode; +- +- if (optimize || target == 0 +- || GET_MODE (target) != tmode +- || !insn_data[d->icode].operand[0].predicate (target, tmode)) +- target = gen_reg_rtx (tmode); +- +- if (VECTOR_MODE_P (mode0)) +- op0 = safe_vector_operand (op0, mode0); +- +- if ((optimize && !register_operand (op0, mode0)) +- || !insn_data[d->icode].operand[0].predicate (op0, mode0)) +- op0 = copy_to_mode_reg (mode0, op0); +- +- op1 = GEN_INT (d->comparison); +- +- pat = GEN_FCN (d->icode) (target, op0, op1); +- if (! pat) +- return 0; +- emit_insn (pat); +- return target; +-} +- +-static rtx +-ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d, +- tree exp, rtx target) +-{ +- rtx pat; +- tree arg0 = CALL_EXPR_ARG (exp, 0); +- tree arg1 = CALL_EXPR_ARG (exp, 1); +- rtx op0 = expand_normal (arg0); +- rtx op1 = expand_normal (arg1); +- rtx op2; +- machine_mode tmode = insn_data[d->icode].operand[0].mode; +- machine_mode mode0 = insn_data[d->icode].operand[1].mode; +- machine_mode mode1 = insn_data[d->icode].operand[2].mode; +- +- if (optimize || target == 0 +- || GET_MODE (target) != tmode +- || !insn_data[d->icode].operand[0].predicate (target, tmode)) +- target = gen_reg_rtx (tmode); +- +- op0 = safe_vector_operand (op0, mode0); +- op1 = safe_vector_operand (op1, mode1); +- +- if ((optimize && !register_operand (op0, mode0)) +- || !insn_data[d->icode].operand[0].predicate (op0, mode0)) +- op0 = copy_to_mode_reg (mode0, op0); +- if ((optimize && !register_operand (op1, mode1)) +- || !insn_data[d->icode].operand[1].predicate (op1, mode1)) +- op1 = copy_to_mode_reg (mode1, op1); +- +- op2 = GEN_INT (d->comparison); +- +- pat = GEN_FCN (d->icode) (target, op0, op1, op2); +- if (! pat) +- return 0; +- emit_insn (pat); +- return target; +-} +- +-/* Subroutine of ix86_expand_builtin to take care of ptest insns. 
*/ +- +-static rtx +-ix86_expand_sse_ptest (const struct builtin_description *d, tree exp, +- rtx target) +-{ +- rtx pat; +- tree arg0 = CALL_EXPR_ARG (exp, 0); +- tree arg1 = CALL_EXPR_ARG (exp, 1); +- rtx op0 = expand_normal (arg0); +- rtx op1 = expand_normal (arg1); +- machine_mode mode0 = insn_data[d->icode].operand[0].mode; +- machine_mode mode1 = insn_data[d->icode].operand[1].mode; +- enum rtx_code comparison = d->comparison; +- +- if (VECTOR_MODE_P (mode0)) +- op0 = safe_vector_operand (op0, mode0); +- if (VECTOR_MODE_P (mode1)) +- op1 = safe_vector_operand (op1, mode1); +- +- target = gen_reg_rtx (SImode); +- emit_move_insn (target, const0_rtx); +- target = gen_rtx_SUBREG (QImode, target, 0); +- +- if ((optimize && !register_operand (op0, mode0)) +- || !insn_data[d->icode].operand[0].predicate (op0, mode0)) +- op0 = copy_to_mode_reg (mode0, op0); +- if ((optimize && !register_operand (op1, mode1)) +- || !insn_data[d->icode].operand[1].predicate (op1, mode1)) +- op1 = copy_to_mode_reg (mode1, op1); +- +- pat = GEN_FCN (d->icode) (op0, op1); +- if (! pat) +- return 0; +- emit_insn (pat); +- emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target), +- gen_rtx_fmt_ee (comparison, QImode, +- SET_DEST (pat), +- const0_rtx))); +- +- return SUBREG_REG (target); +-} +- +-/* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */ +- +-static rtx +-ix86_expand_sse_pcmpestr (const struct builtin_description *d, +- tree exp, rtx target) +-{ +- rtx pat; +- tree arg0 = CALL_EXPR_ARG (exp, 0); +- tree arg1 = CALL_EXPR_ARG (exp, 1); +- tree arg2 = CALL_EXPR_ARG (exp, 2); +- tree arg3 = CALL_EXPR_ARG (exp, 3); +- tree arg4 = CALL_EXPR_ARG (exp, 4); +- rtx scratch0, scratch1; +- rtx op0 = expand_normal (arg0); +- rtx op1 = expand_normal (arg1); +- rtx op2 = expand_normal (arg2); +- rtx op3 = expand_normal (arg3); +- rtx op4 = expand_normal (arg4); +- machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm; +- +- tmode0 = insn_data[d->icode].operand[0].mode; +- tmode1 = insn_data[d->icode].operand[1].mode; +- modev2 = insn_data[d->icode].operand[2].mode; +- modei3 = insn_data[d->icode].operand[3].mode; +- modev4 = insn_data[d->icode].operand[4].mode; +- modei5 = insn_data[d->icode].operand[5].mode; +- modeimm = insn_data[d->icode].operand[6].mode; +- +- if (VECTOR_MODE_P (modev2)) +- op0 = safe_vector_operand (op0, modev2); +- if (VECTOR_MODE_P (modev4)) +- op2 = safe_vector_operand (op2, modev4); +- +- if (!insn_data[d->icode].operand[2].predicate (op0, modev2)) +- op0 = copy_to_mode_reg (modev2, op0); +- if (!insn_data[d->icode].operand[3].predicate (op1, modei3)) +- op1 = copy_to_mode_reg (modei3, op1); +- if ((optimize && !register_operand (op2, modev4)) +- || !insn_data[d->icode].operand[4].predicate (op2, modev4)) +- op2 = copy_to_mode_reg (modev4, op2); +- if (!insn_data[d->icode].operand[5].predicate (op3, modei5)) +- op3 = copy_to_mode_reg (modei5, op3); +- +- if (!insn_data[d->icode].operand[6].predicate (op4, modeimm)) +- { +- error ("the fifth argument must be an 8-bit immediate"); +- return const0_rtx; +- } +- +- if (d->code == IX86_BUILTIN_PCMPESTRI128) +- { +- if (optimize || !target +- || GET_MODE (target) != tmode0 +- || !insn_data[d->icode].operand[0].predicate (target, tmode0)) +- target = gen_reg_rtx (tmode0); +- +- scratch1 = gen_reg_rtx (tmode1); +- +- pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4); +- } +- else if (d->code == IX86_BUILTIN_PCMPESTRM128) +- { +- if (optimize || !target +- || GET_MODE (target) != tmode1 
+- || !insn_data[d->icode].operand[1].predicate (target, tmode1)) +- target = gen_reg_rtx (tmode1); +- +- scratch0 = gen_reg_rtx (tmode0); +- +- pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4); +- } +- else +- { +- gcc_assert (d->flag); +- +- scratch0 = gen_reg_rtx (tmode0); +- scratch1 = gen_reg_rtx (tmode1); +- +- pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4); +- } +- +- if (! pat) +- return 0; +- +- emit_insn (pat); +- +- if (d->flag) +- { +- target = gen_reg_rtx (SImode); +- emit_move_insn (target, const0_rtx); +- target = gen_rtx_SUBREG (QImode, target, 0); +- +- emit_insn +- (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target), +- gen_rtx_fmt_ee (EQ, QImode, +- gen_rtx_REG ((machine_mode) d->flag, +- FLAGS_REG), +- const0_rtx))); +- return SUBREG_REG (target); +- } +- else +- return target; +-} +- +- +-/* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */ +- +-static rtx +-ix86_expand_sse_pcmpistr (const struct builtin_description *d, +- tree exp, rtx target) +-{ +- rtx pat; +- tree arg0 = CALL_EXPR_ARG (exp, 0); +- tree arg1 = CALL_EXPR_ARG (exp, 1); +- tree arg2 = CALL_EXPR_ARG (exp, 2); +- rtx scratch0, scratch1; +- rtx op0 = expand_normal (arg0); +- rtx op1 = expand_normal (arg1); +- rtx op2 = expand_normal (arg2); +- machine_mode tmode0, tmode1, modev2, modev3, modeimm; +- +- tmode0 = insn_data[d->icode].operand[0].mode; +- tmode1 = insn_data[d->icode].operand[1].mode; +- modev2 = insn_data[d->icode].operand[2].mode; +- modev3 = insn_data[d->icode].operand[3].mode; +- modeimm = insn_data[d->icode].operand[4].mode; +- +- if (VECTOR_MODE_P (modev2)) +- op0 = safe_vector_operand (op0, modev2); +- if (VECTOR_MODE_P (modev3)) +- op1 = safe_vector_operand (op1, modev3); +- +- if (!insn_data[d->icode].operand[2].predicate (op0, modev2)) +- op0 = copy_to_mode_reg (modev2, op0); +- if ((optimize && !register_operand (op1, modev3)) +- || !insn_data[d->icode].operand[3].predicate (op1, modev3)) +- op1 = copy_to_mode_reg (modev3, op1); +- +- if (!insn_data[d->icode].operand[4].predicate (op2, modeimm)) +- { +- error ("the third argument must be an 8-bit immediate"); +- return const0_rtx; +- } +- +- if (d->code == IX86_BUILTIN_PCMPISTRI128) +- { +- if (optimize || !target +- || GET_MODE (target) != tmode0 +- || !insn_data[d->icode].operand[0].predicate (target, tmode0)) +- target = gen_reg_rtx (tmode0); +- +- scratch1 = gen_reg_rtx (tmode1); +- +- pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2); +- } +- else if (d->code == IX86_BUILTIN_PCMPISTRM128) +- { +- if (optimize || !target +- || GET_MODE (target) != tmode1 +- || !insn_data[d->icode].operand[1].predicate (target, tmode1)) +- target = gen_reg_rtx (tmode1); +- +- scratch0 = gen_reg_rtx (tmode0); +- +- pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2); +- } +- else +- { +- gcc_assert (d->flag); +- +- scratch0 = gen_reg_rtx (tmode0); +- scratch1 = gen_reg_rtx (tmode1); +- +- pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2); +- } +- +- if (! 
pat) +- return 0; +- +- emit_insn (pat); +- +- if (d->flag) +- { +- target = gen_reg_rtx (SImode); +- emit_move_insn (target, const0_rtx); +- target = gen_rtx_SUBREG (QImode, target, 0); +- +- emit_insn +- (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target), +- gen_rtx_fmt_ee (EQ, QImode, +- gen_rtx_REG ((machine_mode) d->flag, +- FLAGS_REG), +- const0_rtx))); +- return SUBREG_REG (target); +- } +- else +- return target; +-} +- +-/* Subroutine of ix86_expand_builtin to take care of insns with +- variable number of operands. */ +- +-static rtx +-ix86_expand_args_builtin (const struct builtin_description *d, +- tree exp, rtx target) +-{ +- rtx pat, real_target; +- unsigned int i, nargs; +- unsigned int nargs_constant = 0; +- unsigned int mask_pos = 0; +- int num_memory = 0; +- struct +- { +- rtx op; +- machine_mode mode; +- } args[6]; +- bool second_arg_count = false; +- enum insn_code icode = d->icode; +- const struct insn_data_d *insn_p = &insn_data[icode]; +- machine_mode tmode = insn_p->operand[0].mode; +- machine_mode rmode = VOIDmode; +- bool swap = false; +- enum rtx_code comparison = d->comparison; +- +- switch ((enum ix86_builtin_func_type) d->flag) +- { +- case V2DF_FTYPE_V2DF_ROUND: +- case V4DF_FTYPE_V4DF_ROUND: +- case V8DF_FTYPE_V8DF_ROUND: +- case V4SF_FTYPE_V4SF_ROUND: +- case V8SF_FTYPE_V8SF_ROUND: +- case V16SF_FTYPE_V16SF_ROUND: +- case V4SI_FTYPE_V4SF_ROUND: +- case V8SI_FTYPE_V8SF_ROUND: +- case V16SI_FTYPE_V16SF_ROUND: +- return ix86_expand_sse_round (d, exp, target); +- case V4SI_FTYPE_V2DF_V2DF_ROUND: +- case V8SI_FTYPE_V4DF_V4DF_ROUND: +- case V16SI_FTYPE_V8DF_V8DF_ROUND: +- return ix86_expand_sse_round_vec_pack_sfix (d, exp, target); +- case INT_FTYPE_V8SF_V8SF_PTEST: +- case INT_FTYPE_V4DI_V4DI_PTEST: +- case INT_FTYPE_V4DF_V4DF_PTEST: +- case INT_FTYPE_V4SF_V4SF_PTEST: +- case INT_FTYPE_V2DI_V2DI_PTEST: +- case INT_FTYPE_V2DF_V2DF_PTEST: +- return ix86_expand_sse_ptest (d, exp, target); +- case FLOAT128_FTYPE_FLOAT128: +- case FLOAT_FTYPE_FLOAT: +- case INT_FTYPE_INT: +- case UINT_FTYPE_UINT: +- case UINT16_FTYPE_UINT16: +- case UINT64_FTYPE_INT: +- case UINT64_FTYPE_UINT64: +- case INT64_FTYPE_INT64: +- case INT64_FTYPE_V4SF: +- case INT64_FTYPE_V2DF: +- case INT_FTYPE_V16QI: +- case INT_FTYPE_V8QI: +- case INT_FTYPE_V8SF: +- case INT_FTYPE_V4DF: +- case INT_FTYPE_V4SF: +- case INT_FTYPE_V2DF: +- case INT_FTYPE_V32QI: +- case V16QI_FTYPE_V16QI: +- case V8SI_FTYPE_V8SF: +- case V8SI_FTYPE_V4SI: +- case V8HI_FTYPE_V8HI: +- case V8HI_FTYPE_V16QI: +- case V8QI_FTYPE_V8QI: +- case V8SF_FTYPE_V8SF: +- case V8SF_FTYPE_V8SI: +- case V8SF_FTYPE_V4SF: +- case V8SF_FTYPE_V8HI: +- case V4SI_FTYPE_V4SI: +- case V4SI_FTYPE_V16QI: +- case V4SI_FTYPE_V4SF: +- case V4SI_FTYPE_V8SI: +- case V4SI_FTYPE_V8HI: +- case V4SI_FTYPE_V4DF: +- case V4SI_FTYPE_V2DF: +- case V4HI_FTYPE_V4HI: +- case V4DF_FTYPE_V4DF: +- case V4DF_FTYPE_V4SI: +- case V4DF_FTYPE_V4SF: +- case V4DF_FTYPE_V2DF: +- case V4SF_FTYPE_V4SF: +- case V4SF_FTYPE_V4SI: +- case V4SF_FTYPE_V8SF: +- case V4SF_FTYPE_V4DF: +- case V4SF_FTYPE_V8HI: +- case V4SF_FTYPE_V2DF: +- case V2DI_FTYPE_V2DI: +- case V2DI_FTYPE_V16QI: +- case V2DI_FTYPE_V8HI: +- case V2DI_FTYPE_V4SI: +- case V2DF_FTYPE_V2DF: +- case V2DF_FTYPE_V4SI: +- case V2DF_FTYPE_V4DF: +- case V2DF_FTYPE_V4SF: +- case V2DF_FTYPE_V2SI: +- case V2SI_FTYPE_V2SI: +- case V2SI_FTYPE_V4SF: +- case V2SI_FTYPE_V2SF: +- case V2SI_FTYPE_V2DF: +- case V2SF_FTYPE_V2SF: +- case V2SF_FTYPE_V2SI: +- case V32QI_FTYPE_V32QI: +- case V32QI_FTYPE_V16QI: +- case V16HI_FTYPE_V16HI: 
+- case V16HI_FTYPE_V8HI: +- case V8SI_FTYPE_V8SI: +- case V16HI_FTYPE_V16QI: +- case V8SI_FTYPE_V16QI: +- case V4DI_FTYPE_V16QI: +- case V8SI_FTYPE_V8HI: +- case V4DI_FTYPE_V8HI: +- case V4DI_FTYPE_V4SI: +- case V4DI_FTYPE_V2DI: +- case UQI_FTYPE_UQI: +- case UHI_FTYPE_UHI: +- case USI_FTYPE_USI: +- case USI_FTYPE_UQI: +- case USI_FTYPE_UHI: +- case UDI_FTYPE_UDI: +- case UHI_FTYPE_V16QI: +- case USI_FTYPE_V32QI: +- case UDI_FTYPE_V64QI: +- case V16QI_FTYPE_UHI: +- case V32QI_FTYPE_USI: +- case V64QI_FTYPE_UDI: +- case V8HI_FTYPE_UQI: +- case V16HI_FTYPE_UHI: +- case V32HI_FTYPE_USI: +- case V4SI_FTYPE_UQI: +- case V8SI_FTYPE_UQI: +- case V4SI_FTYPE_UHI: +- case V8SI_FTYPE_UHI: +- case UQI_FTYPE_V8HI: +- case UHI_FTYPE_V16HI: +- case USI_FTYPE_V32HI: +- case UQI_FTYPE_V4SI: +- case UQI_FTYPE_V8SI: +- case UHI_FTYPE_V16SI: +- case UQI_FTYPE_V2DI: +- case UQI_FTYPE_V4DI: +- case UQI_FTYPE_V8DI: +- case V16SI_FTYPE_UHI: +- case V2DI_FTYPE_UQI: +- case V4DI_FTYPE_UQI: +- case V16SI_FTYPE_INT: +- case V16SF_FTYPE_V8SF: +- case V16SI_FTYPE_V8SI: +- case V16SF_FTYPE_V4SF: +- case V16SI_FTYPE_V4SI: +- case V16SI_FTYPE_V16SF: +- case V16SI_FTYPE_V16SI: +- case V64QI_FTYPE_V64QI: +- case V32HI_FTYPE_V32HI: +- case V16SF_FTYPE_V16SF: +- case V8DI_FTYPE_UQI: +- case V8DI_FTYPE_V8DI: +- case V8DF_FTYPE_V4DF: +- case V8DF_FTYPE_V2DF: +- case V8DF_FTYPE_V8DF: +- case V4DI_FTYPE_V4DI: +- nargs = 1; +- break; +- case V4SF_FTYPE_V4SF_VEC_MERGE: +- case V2DF_FTYPE_V2DF_VEC_MERGE: +- return ix86_expand_unop_vec_merge_builtin (icode, exp, target); +- case FLOAT128_FTYPE_FLOAT128_FLOAT128: +- case V16QI_FTYPE_V16QI_V16QI: +- case V16QI_FTYPE_V8HI_V8HI: +- case V16SF_FTYPE_V16SF_V16SF: +- case V8QI_FTYPE_V8QI_V8QI: +- case V8QI_FTYPE_V4HI_V4HI: +- case V8HI_FTYPE_V8HI_V8HI: +- case V8HI_FTYPE_V16QI_V16QI: +- case V8HI_FTYPE_V4SI_V4SI: +- case V8SF_FTYPE_V8SF_V8SF: +- case V8SF_FTYPE_V8SF_V8SI: +- case V8DF_FTYPE_V8DF_V8DF: +- case V4SI_FTYPE_V4SI_V4SI: +- case V4SI_FTYPE_V8HI_V8HI: +- case V4SI_FTYPE_V2DF_V2DF: +- case V4HI_FTYPE_V4HI_V4HI: +- case V4HI_FTYPE_V8QI_V8QI: +- case V4HI_FTYPE_V2SI_V2SI: +- case V4DF_FTYPE_V4DF_V4DF: +- case V4DF_FTYPE_V4DF_V4DI: +- case V4SF_FTYPE_V4SF_V4SF: +- case V4SF_FTYPE_V4SF_V4SI: +- case V4SF_FTYPE_V4SF_V2SI: +- case V4SF_FTYPE_V4SF_V2DF: +- case V4SF_FTYPE_V4SF_UINT: +- case V4SF_FTYPE_V4SF_DI: +- case V4SF_FTYPE_V4SF_SI: +- case V2DI_FTYPE_V2DI_V2DI: +- case V2DI_FTYPE_V16QI_V16QI: +- case V2DI_FTYPE_V4SI_V4SI: +- case V2DI_FTYPE_V2DI_V16QI: +- case V2SI_FTYPE_V2SI_V2SI: +- case V2SI_FTYPE_V4HI_V4HI: +- case V2SI_FTYPE_V2SF_V2SF: +- case V2DF_FTYPE_V2DF_V2DF: +- case V2DF_FTYPE_V2DF_V4SF: +- case V2DF_FTYPE_V2DF_V2DI: +- case V2DF_FTYPE_V2DF_DI: +- case V2DF_FTYPE_V2DF_SI: +- case V2DF_FTYPE_V2DF_UINT: +- case V2SF_FTYPE_V2SF_V2SF: +- case V1DI_FTYPE_V1DI_V1DI: +- case V1DI_FTYPE_V8QI_V8QI: +- case V1DI_FTYPE_V2SI_V2SI: +- case V32QI_FTYPE_V16HI_V16HI: +- case V16HI_FTYPE_V8SI_V8SI: +- case V64QI_FTYPE_V64QI_V64QI: +- case V32QI_FTYPE_V32QI_V32QI: +- case V16HI_FTYPE_V32QI_V32QI: +- case V16HI_FTYPE_V16HI_V16HI: +- case V8SI_FTYPE_V4DF_V4DF: +- case V8SI_FTYPE_V8SI_V8SI: +- case V8SI_FTYPE_V16HI_V16HI: +- case V4DI_FTYPE_V4DI_V4DI: +- case V4DI_FTYPE_V8SI_V8SI: +- case V8DI_FTYPE_V64QI_V64QI: +- if (comparison == UNKNOWN) +- return ix86_expand_binop_builtin (icode, exp, target); +- nargs = 2; +- break; +- case V4SF_FTYPE_V4SF_V4SF_SWAP: +- case V2DF_FTYPE_V2DF_V2DF_SWAP: +- gcc_assert (comparison != UNKNOWN); +- nargs = 2; +- swap = true; +- break; +- case 
V16HI_FTYPE_V16HI_V8HI_COUNT: +- case V16HI_FTYPE_V16HI_SI_COUNT: +- case V8SI_FTYPE_V8SI_V4SI_COUNT: +- case V8SI_FTYPE_V8SI_SI_COUNT: +- case V4DI_FTYPE_V4DI_V2DI_COUNT: +- case V4DI_FTYPE_V4DI_INT_COUNT: +- case V8HI_FTYPE_V8HI_V8HI_COUNT: +- case V8HI_FTYPE_V8HI_SI_COUNT: +- case V4SI_FTYPE_V4SI_V4SI_COUNT: +- case V4SI_FTYPE_V4SI_SI_COUNT: +- case V4HI_FTYPE_V4HI_V4HI_COUNT: +- case V4HI_FTYPE_V4HI_SI_COUNT: +- case V2DI_FTYPE_V2DI_V2DI_COUNT: +- case V2DI_FTYPE_V2DI_SI_COUNT: +- case V2SI_FTYPE_V2SI_V2SI_COUNT: +- case V2SI_FTYPE_V2SI_SI_COUNT: +- case V1DI_FTYPE_V1DI_V1DI_COUNT: +- case V1DI_FTYPE_V1DI_SI_COUNT: +- nargs = 2; +- second_arg_count = true; +- break; +- case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT: +- case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT: +- case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT: +- case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT: +- case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT: +- case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT: +- case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT: +- case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT: +- case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT: +- case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT: +- case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT: +- case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT: +- case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT: +- case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT: +- case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT: +- case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT: +- case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT: +- case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT: +- nargs = 4; +- second_arg_count = true; +- break; +- case UINT64_FTYPE_UINT64_UINT64: +- case UINT_FTYPE_UINT_UINT: +- case UINT_FTYPE_UINT_USHORT: +- case UINT_FTYPE_UINT_UCHAR: +- case UINT16_FTYPE_UINT16_INT: +- case UINT8_FTYPE_UINT8_INT: +- case UQI_FTYPE_UQI_UQI: +- case UHI_FTYPE_UHI_UHI: +- case USI_FTYPE_USI_USI: +- case UDI_FTYPE_UDI_UDI: +- case V16SI_FTYPE_V8DF_V8DF: +- nargs = 2; +- break; +- case V2DI_FTYPE_V2DI_INT_CONVERT: +- nargs = 2; +- rmode = V1TImode; +- nargs_constant = 1; +- break; +- case V4DI_FTYPE_V4DI_INT_CONVERT: +- nargs = 2; +- rmode = V2TImode; +- nargs_constant = 1; +- break; +- case V8DI_FTYPE_V8DI_INT_CONVERT: +- nargs = 2; +- rmode = V4TImode; +- nargs_constant = 1; +- break; +- case V8HI_FTYPE_V8HI_INT: +- case V8HI_FTYPE_V8SF_INT: +- case V16HI_FTYPE_V16SF_INT: +- case V8HI_FTYPE_V4SF_INT: +- case V8SF_FTYPE_V8SF_INT: +- case V4SF_FTYPE_V16SF_INT: +- case V16SF_FTYPE_V16SF_INT: +- case V4SI_FTYPE_V4SI_INT: +- case V4SI_FTYPE_V8SI_INT: +- case V4HI_FTYPE_V4HI_INT: +- case V4DF_FTYPE_V4DF_INT: +- case V4DF_FTYPE_V8DF_INT: +- case V4SF_FTYPE_V4SF_INT: +- case V4SF_FTYPE_V8SF_INT: +- case V2DI_FTYPE_V2DI_INT: +- case V2DF_FTYPE_V2DF_INT: +- case V2DF_FTYPE_V4DF_INT: +- case V16HI_FTYPE_V16HI_INT: +- case V8SI_FTYPE_V8SI_INT: +- case V16SI_FTYPE_V16SI_INT: +- case V4SI_FTYPE_V16SI_INT: +- case V4DI_FTYPE_V4DI_INT: +- case V2DI_FTYPE_V4DI_INT: +- case V4DI_FTYPE_V8DI_INT: +- case QI_FTYPE_V4SF_INT: +- case QI_FTYPE_V2DF_INT: +- case UQI_FTYPE_UQI_UQI_CONST: +- case UHI_FTYPE_UHI_UQI: +- case USI_FTYPE_USI_UQI: +- case UDI_FTYPE_UDI_UQI: +- nargs = 2; +- nargs_constant = 1; +- break; +- case V16QI_FTYPE_V16QI_V16QI_V16QI: +- case V8SF_FTYPE_V8SF_V8SF_V8SF: +- case V4DF_FTYPE_V4DF_V4DF_V4DF: +- case V4SF_FTYPE_V4SF_V4SF_V4SF: +- case V2DF_FTYPE_V2DF_V2DF_V2DF: +- case V32QI_FTYPE_V32QI_V32QI_V32QI: +- case UHI_FTYPE_V16SI_V16SI_UHI: +- case UQI_FTYPE_V8DI_V8DI_UQI: +- case V16HI_FTYPE_V16SI_V16HI_UHI: +- case V16QI_FTYPE_V16SI_V16QI_UHI: +- case V16QI_FTYPE_V8DI_V16QI_UQI: +- case 
V16SF_FTYPE_V16SF_V16SF_UHI: +- case V16SF_FTYPE_V4SF_V16SF_UHI: +- case V16SI_FTYPE_SI_V16SI_UHI: +- case V16SI_FTYPE_V16HI_V16SI_UHI: +- case V16SI_FTYPE_V16QI_V16SI_UHI: +- case V8SF_FTYPE_V4SF_V8SF_UQI: +- case V4DF_FTYPE_V2DF_V4DF_UQI: +- case V8SI_FTYPE_V4SI_V8SI_UQI: +- case V8SI_FTYPE_SI_V8SI_UQI: +- case V4SI_FTYPE_V4SI_V4SI_UQI: +- case V4SI_FTYPE_SI_V4SI_UQI: +- case V4DI_FTYPE_V2DI_V4DI_UQI: +- case V4DI_FTYPE_DI_V4DI_UQI: +- case V2DI_FTYPE_V2DI_V2DI_UQI: +- case V2DI_FTYPE_DI_V2DI_UQI: +- case V64QI_FTYPE_V64QI_V64QI_UDI: +- case V64QI_FTYPE_V16QI_V64QI_UDI: +- case V64QI_FTYPE_QI_V64QI_UDI: +- case V32QI_FTYPE_V32QI_V32QI_USI: +- case V32QI_FTYPE_V16QI_V32QI_USI: +- case V32QI_FTYPE_QI_V32QI_USI: +- case V16QI_FTYPE_V16QI_V16QI_UHI: +- case V16QI_FTYPE_QI_V16QI_UHI: +- case V32HI_FTYPE_V8HI_V32HI_USI: +- case V32HI_FTYPE_HI_V32HI_USI: +- case V16HI_FTYPE_V8HI_V16HI_UHI: +- case V16HI_FTYPE_HI_V16HI_UHI: +- case V8HI_FTYPE_V8HI_V8HI_UQI: +- case V8HI_FTYPE_HI_V8HI_UQI: +- case V8SF_FTYPE_V8HI_V8SF_UQI: +- case V4SF_FTYPE_V8HI_V4SF_UQI: +- case V8SI_FTYPE_V8SF_V8SI_UQI: +- case V4SI_FTYPE_V4SF_V4SI_UQI: +- case V4DI_FTYPE_V4SF_V4DI_UQI: +- case V2DI_FTYPE_V4SF_V2DI_UQI: +- case V4SF_FTYPE_V4DI_V4SF_UQI: +- case V4SF_FTYPE_V2DI_V4SF_UQI: +- case V4DF_FTYPE_V4DI_V4DF_UQI: +- case V2DF_FTYPE_V2DI_V2DF_UQI: +- case V16QI_FTYPE_V8HI_V16QI_UQI: +- case V16QI_FTYPE_V16HI_V16QI_UHI: +- case V16QI_FTYPE_V4SI_V16QI_UQI: +- case V16QI_FTYPE_V8SI_V16QI_UQI: +- case V8HI_FTYPE_V4SI_V8HI_UQI: +- case V8HI_FTYPE_V8SI_V8HI_UQI: +- case V16QI_FTYPE_V2DI_V16QI_UQI: +- case V16QI_FTYPE_V4DI_V16QI_UQI: +- case V8HI_FTYPE_V2DI_V8HI_UQI: +- case V8HI_FTYPE_V4DI_V8HI_UQI: +- case V4SI_FTYPE_V2DI_V4SI_UQI: +- case V4SI_FTYPE_V4DI_V4SI_UQI: +- case V32QI_FTYPE_V32HI_V32QI_USI: +- case UHI_FTYPE_V16QI_V16QI_UHI: +- case USI_FTYPE_V32QI_V32QI_USI: +- case UDI_FTYPE_V64QI_V64QI_UDI: +- case UQI_FTYPE_V8HI_V8HI_UQI: +- case UHI_FTYPE_V16HI_V16HI_UHI: +- case USI_FTYPE_V32HI_V32HI_USI: +- case UQI_FTYPE_V4SI_V4SI_UQI: +- case UQI_FTYPE_V8SI_V8SI_UQI: +- case UQI_FTYPE_V2DI_V2DI_UQI: +- case UQI_FTYPE_V4DI_V4DI_UQI: +- case V4SF_FTYPE_V2DF_V4SF_UQI: +- case V4SF_FTYPE_V4DF_V4SF_UQI: +- case V16SI_FTYPE_V16SI_V16SI_UHI: +- case V16SI_FTYPE_V4SI_V16SI_UHI: +- case V2DI_FTYPE_V4SI_V2DI_UQI: +- case V2DI_FTYPE_V8HI_V2DI_UQI: +- case V2DI_FTYPE_V16QI_V2DI_UQI: +- case V4DI_FTYPE_V4DI_V4DI_UQI: +- case V4DI_FTYPE_V4SI_V4DI_UQI: +- case V4DI_FTYPE_V8HI_V4DI_UQI: +- case V4DI_FTYPE_V16QI_V4DI_UQI: +- case V4DI_FTYPE_V4DF_V4DI_UQI: +- case V2DI_FTYPE_V2DF_V2DI_UQI: +- case V4SI_FTYPE_V4DF_V4SI_UQI: +- case V4SI_FTYPE_V2DF_V4SI_UQI: +- case V4SI_FTYPE_V8HI_V4SI_UQI: +- case V4SI_FTYPE_V16QI_V4SI_UQI: +- case V4DI_FTYPE_V4DI_V4DI_V4DI: +- case V8DF_FTYPE_V2DF_V8DF_UQI: +- case V8DF_FTYPE_V4DF_V8DF_UQI: +- case V8DF_FTYPE_V8DF_V8DF_UQI: +- case V8SF_FTYPE_V8SF_V8SF_UQI: +- case V8SF_FTYPE_V8SI_V8SF_UQI: +- case V4DF_FTYPE_V4DF_V4DF_UQI: +- case V4SF_FTYPE_V4SF_V4SF_UQI: +- case V2DF_FTYPE_V2DF_V2DF_UQI: +- case V2DF_FTYPE_V4SF_V2DF_UQI: +- case V2DF_FTYPE_V4SI_V2DF_UQI: +- case V4SF_FTYPE_V4SI_V4SF_UQI: +- case V4DF_FTYPE_V4SF_V4DF_UQI: +- case V4DF_FTYPE_V4SI_V4DF_UQI: +- case V8SI_FTYPE_V8SI_V8SI_UQI: +- case V8SI_FTYPE_V8HI_V8SI_UQI: +- case V8SI_FTYPE_V16QI_V8SI_UQI: +- case V8DF_FTYPE_V8SI_V8DF_UQI: +- case V8DI_FTYPE_DI_V8DI_UQI: +- case V16SF_FTYPE_V8SF_V16SF_UHI: +- case V16SI_FTYPE_V8SI_V16SI_UHI: +- case V16HI_FTYPE_V16HI_V16HI_UHI: +- case V8HI_FTYPE_V16QI_V8HI_UQI: +- case V16HI_FTYPE_V16QI_V16HI_UHI: +- 
case V32HI_FTYPE_V32HI_V32HI_USI: +- case V32HI_FTYPE_V32QI_V32HI_USI: +- case V8DI_FTYPE_V16QI_V8DI_UQI: +- case V8DI_FTYPE_V2DI_V8DI_UQI: +- case V8DI_FTYPE_V4DI_V8DI_UQI: +- case V8DI_FTYPE_V8DI_V8DI_UQI: +- case V8DI_FTYPE_V8HI_V8DI_UQI: +- case V8DI_FTYPE_V8SI_V8DI_UQI: +- case V8HI_FTYPE_V8DI_V8HI_UQI: +- case V8SI_FTYPE_V8DI_V8SI_UQI: +- case V4SI_FTYPE_V4SI_V4SI_V4SI: +- case V16SI_FTYPE_V16SI_V16SI_V16SI: +- case V8DI_FTYPE_V8DI_V8DI_V8DI: +- case V32HI_FTYPE_V32HI_V32HI_V32HI: +- case V2DI_FTYPE_V2DI_V2DI_V2DI: +- case V16HI_FTYPE_V16HI_V16HI_V16HI: +- case V8SI_FTYPE_V8SI_V8SI_V8SI: +- case V8HI_FTYPE_V8HI_V8HI_V8HI: +- nargs = 3; +- break; +- case V32QI_FTYPE_V32QI_V32QI_INT: +- case V16HI_FTYPE_V16HI_V16HI_INT: +- case V16QI_FTYPE_V16QI_V16QI_INT: +- case V4DI_FTYPE_V4DI_V4DI_INT: +- case V8HI_FTYPE_V8HI_V8HI_INT: +- case V8SI_FTYPE_V8SI_V8SI_INT: +- case V8SI_FTYPE_V8SI_V4SI_INT: +- case V8SF_FTYPE_V8SF_V8SF_INT: +- case V8SF_FTYPE_V8SF_V4SF_INT: +- case V4SI_FTYPE_V4SI_V4SI_INT: +- case V4DF_FTYPE_V4DF_V4DF_INT: +- case V16SF_FTYPE_V16SF_V16SF_INT: +- case V16SF_FTYPE_V16SF_V4SF_INT: +- case V16SI_FTYPE_V16SI_V4SI_INT: +- case V4DF_FTYPE_V4DF_V2DF_INT: +- case V4SF_FTYPE_V4SF_V4SF_INT: +- case V2DI_FTYPE_V2DI_V2DI_INT: +- case V4DI_FTYPE_V4DI_V2DI_INT: +- case V2DF_FTYPE_V2DF_V2DF_INT: +- case UQI_FTYPE_V8DI_V8UDI_INT: +- case UQI_FTYPE_V8DF_V8DF_INT: +- case UQI_FTYPE_V2DF_V2DF_INT: +- case UQI_FTYPE_V4SF_V4SF_INT: +- case UHI_FTYPE_V16SI_V16SI_INT: +- case UHI_FTYPE_V16SF_V16SF_INT: +- case V64QI_FTYPE_V64QI_V64QI_INT: +- case V32HI_FTYPE_V32HI_V32HI_INT: +- case V16SI_FTYPE_V16SI_V16SI_INT: +- case V8DI_FTYPE_V8DI_V8DI_INT: +- nargs = 3; +- nargs_constant = 1; +- break; +- case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT: +- nargs = 3; +- rmode = V4DImode; +- nargs_constant = 1; +- break; +- case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT: +- nargs = 3; +- rmode = V2DImode; +- nargs_constant = 1; +- break; +- case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT: +- nargs = 3; +- rmode = DImode; +- nargs_constant = 1; +- break; +- case V2DI_FTYPE_V2DI_UINT_UINT: +- nargs = 3; +- nargs_constant = 2; +- break; +- case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT: +- nargs = 3; +- rmode = V8DImode; +- nargs_constant = 1; +- break; +- case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT: +- nargs = 5; +- rmode = V8DImode; +- mask_pos = 2; +- nargs_constant = 1; +- break; +- case QI_FTYPE_V8DF_INT_UQI: +- case QI_FTYPE_V4DF_INT_UQI: +- case QI_FTYPE_V2DF_INT_UQI: +- case HI_FTYPE_V16SF_INT_UHI: +- case QI_FTYPE_V8SF_INT_UQI: +- case QI_FTYPE_V4SF_INT_UQI: +- case V4SI_FTYPE_V4SI_V4SI_UHI: +- case V8SI_FTYPE_V8SI_V8SI_UHI: +- nargs = 3; +- mask_pos = 1; +- nargs_constant = 1; +- break; +- case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT: +- nargs = 5; +- rmode = V4DImode; +- mask_pos = 2; +- nargs_constant = 1; +- break; +- case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT: +- nargs = 5; +- rmode = V2DImode; +- mask_pos = 2; +- nargs_constant = 1; +- break; +- case V32QI_FTYPE_V32QI_V32QI_V32QI_USI: +- case V32HI_FTYPE_V32HI_V32HI_V32HI_USI: +- case V32HI_FTYPE_V64QI_V64QI_V32HI_USI: +- case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI: +- case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI: +- case V32HI_FTYPE_V32HI_V8HI_V32HI_USI: +- case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI: +- case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI: +- case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI: +- case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI: +- case V32QI_FTYPE_V16HI_V16HI_V32QI_USI: +- case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI: +- case V32HI_FTYPE_V16SI_V16SI_V32HI_USI: +- case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI: +- 
case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI: +- case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI: +- case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI: +- case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI: +- case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI: +- case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI: +- case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI: +- case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI: +- case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI: +- case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI: +- case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI: +- case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI: +- case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI: +- case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI: +- case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI: +- case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI: +- case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI: +- case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI: +- case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI: +- case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI: +- case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI: +- case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI: +- case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI: +- case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI: +- case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI: +- case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI: +- case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI: +- case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI: +- case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI: +- case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI: +- case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI: +- case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI: +- case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI: +- case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI: +- case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI: +- case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI: +- case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI: +- nargs = 4; +- break; +- case V2DF_FTYPE_V2DF_V2DF_V2DI_INT: +- case V4DF_FTYPE_V4DF_V4DF_V4DI_INT: +- case V4SF_FTYPE_V4SF_V4SF_V4SI_INT: +- case V8SF_FTYPE_V8SF_V8SF_V8SI_INT: +- case V16SF_FTYPE_V16SF_V16SF_V16SI_INT: +- nargs = 4; +- nargs_constant = 1; +- break; +- case UQI_FTYPE_V4DI_V4DI_INT_UQI: +- case UQI_FTYPE_V8SI_V8SI_INT_UQI: +- case QI_FTYPE_V4DF_V4DF_INT_UQI: +- case QI_FTYPE_V8SF_V8SF_INT_UQI: +- case UQI_FTYPE_V2DI_V2DI_INT_UQI: +- case UQI_FTYPE_V4SI_V4SI_INT_UQI: +- case UQI_FTYPE_V2DF_V2DF_INT_UQI: +- case UQI_FTYPE_V4SF_V4SF_INT_UQI: +- case UDI_FTYPE_V64QI_V64QI_INT_UDI: +- case USI_FTYPE_V32QI_V32QI_INT_USI: +- case UHI_FTYPE_V16QI_V16QI_INT_UHI: +- case USI_FTYPE_V32HI_V32HI_INT_USI: +- case UHI_FTYPE_V16HI_V16HI_INT_UHI: +- case UQI_FTYPE_V8HI_V8HI_INT_UQI: +- case V32HI_FTYPE_V32HI_V32HI_V32HI_INT: +- case V16HI_FTYPE_V16HI_V16HI_V16HI_INT: +- case V8HI_FTYPE_V8HI_V8HI_V8HI_INT: +- case V8SI_FTYPE_V8SI_V8SI_V8SI_INT: +- case V4DI_FTYPE_V4DI_V4DI_V4DI_INT: +- case V8DI_FTYPE_V8DI_V8DI_V8DI_INT: +- case V16SI_FTYPE_V16SI_V16SI_V16SI_INT: +- case V2DI_FTYPE_V2DI_V2DI_V2DI_INT: +- case V4SI_FTYPE_V4SI_V4SI_V4SI_INT: +- nargs = 4; +- mask_pos = 1; +- nargs_constant = 1; +- break; +- case V2DI_FTYPE_V2DI_V2DI_UINT_UINT: +- nargs = 4; +- nargs_constant = 2; +- break; +- case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED: +- case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG: +- nargs = 4; +- break; +- case UQI_FTYPE_V8DI_V8DI_INT_UQI: +- case UHI_FTYPE_V16SI_V16SI_INT_UHI: +- mask_pos = 1; +- nargs = 4; +- nargs_constant = 1; +- break; +- case V8SF_FTYPE_V8SF_INT_V8SF_UQI: +- case V4SF_FTYPE_V4SF_INT_V4SF_UQI: +- case V2DF_FTYPE_V4DF_INT_V2DF_UQI: +- case V2DI_FTYPE_V4DI_INT_V2DI_UQI: +- case V8SF_FTYPE_V16SF_INT_V8SF_UQI: +- case V8SI_FTYPE_V16SI_INT_V8SI_UQI: +- case V2DF_FTYPE_V8DF_INT_V2DF_UQI: +- case V2DI_FTYPE_V8DI_INT_V2DI_UQI: +- case V4SF_FTYPE_V8SF_INT_V4SF_UQI: +- case V4SI_FTYPE_V8SI_INT_V4SI_UQI: +- case V8HI_FTYPE_V8SF_INT_V8HI_UQI: +- case V8HI_FTYPE_V4SF_INT_V8HI_UQI: +- case V32HI_FTYPE_V32HI_INT_V32HI_USI: 
+- case V16HI_FTYPE_V16HI_INT_V16HI_UHI: +- case V8HI_FTYPE_V8HI_INT_V8HI_UQI: +- case V4DI_FTYPE_V4DI_INT_V4DI_UQI: +- case V2DI_FTYPE_V2DI_INT_V2DI_UQI: +- case V8SI_FTYPE_V8SI_INT_V8SI_UQI: +- case V4SI_FTYPE_V4SI_INT_V4SI_UQI: +- case V4DF_FTYPE_V4DF_INT_V4DF_UQI: +- case V2DF_FTYPE_V2DF_INT_V2DF_UQI: +- case V8DF_FTYPE_V8DF_INT_V8DF_UQI: +- case V16SF_FTYPE_V16SF_INT_V16SF_UHI: +- case V16HI_FTYPE_V16SF_INT_V16HI_UHI: +- case V16SI_FTYPE_V16SI_INT_V16SI_UHI: +- case V4SI_FTYPE_V16SI_INT_V4SI_UQI: +- case V4DI_FTYPE_V8DI_INT_V4DI_UQI: +- case V4DF_FTYPE_V8DF_INT_V4DF_UQI: +- case V4SF_FTYPE_V16SF_INT_V4SF_UQI: +- case V8DI_FTYPE_V8DI_INT_V8DI_UQI: +- nargs = 4; +- mask_pos = 2; +- nargs_constant = 1; +- break; +- case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI: +- case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI: +- case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI: +- case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI: +- case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI: +- case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI: +- case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI: +- case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI: +- case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI: +- case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI: +- case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI: +- case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI: +- case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI: +- case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI: +- case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI: +- case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI: +- case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI: +- case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI: +- case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI: +- case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI: +- case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI: +- case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI: +- case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI: +- case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI: +- case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI: +- case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI: +- case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI: +- nargs = 5; +- mask_pos = 2; +- nargs_constant = 1; +- break; +- case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI: +- case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI: +- case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI: +- case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI: +- case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI: +- case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI: +- case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI: +- case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI: +- case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI: +- case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI: +- nargs = 5; +- mask_pos = 1; +- nargs_constant = 1; +- break; +- case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI: +- case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI: +- case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI: +- case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT: +- case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT: +- case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT: +- case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT: +- case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT: +- case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT: +- case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT: +- case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT: +- case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT: +- nargs = 5; +- mask_pos = 1; +- nargs_constant = 2; +- break; +- +- default: +- gcc_unreachable (); +- } +- +- gcc_assert (nargs <= ARRAY_SIZE (args)); +- +- if (comparison != UNKNOWN) +- { +- gcc_assert (nargs == 2); +- return ix86_expand_sse_compare (d, exp, target, swap); +- } +- +- if (rmode == VOIDmode || rmode == tmode) +- { +- if (optimize +- || target == 0 +- || GET_MODE (target) != tmode +- || !insn_p->operand[0].predicate (target, tmode)) +- target = gen_reg_rtx (tmode); +- else if (memory_operand (target, tmode)) +- 
num_memory++; +- real_target = target; +- } +- else +- { +- real_target = gen_reg_rtx (tmode); +- target = lowpart_subreg (rmode, real_target, tmode); +- } +- +- for (i = 0; i < nargs; i++) +- { +- tree arg = CALL_EXPR_ARG (exp, i); +- rtx op = expand_normal (arg); +- machine_mode mode = insn_p->operand[i + 1].mode; +- bool match = insn_p->operand[i + 1].predicate (op, mode); +- +- if (second_arg_count && i == 1) +- { +- /* SIMD shift insns take either an 8-bit immediate or +- register as count. But builtin functions take int as +- count. If count doesn't match, we put it in register. +- The instructions are using 64-bit count, if op is just +- 32-bit, zero-extend it, as negative shift counts +- are undefined behavior and zero-extension is more +- efficient. */ +- if (!match) +- { +- if (SCALAR_INT_MODE_P (GET_MODE (op))) +- op = convert_modes (mode, GET_MODE (op), op, 1); +- else +- op = lowpart_subreg (mode, op, GET_MODE (op)); +- if (!insn_p->operand[i + 1].predicate (op, mode)) +- op = copy_to_reg (op); +- } +- } +- else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) || +- (!mask_pos && (nargs - i) <= nargs_constant)) +- { +- if (!match) +- switch (icode) +- { +- case CODE_FOR_avx_vinsertf128v4di: +- case CODE_FOR_avx_vextractf128v4di: +- error ("the last argument must be an 1-bit immediate"); +- return const0_rtx; +- +- case CODE_FOR_avx512f_cmpv8di3_mask: +- case CODE_FOR_avx512f_cmpv16si3_mask: +- case CODE_FOR_avx512f_ucmpv8di3_mask: +- case CODE_FOR_avx512f_ucmpv16si3_mask: +- case CODE_FOR_avx512vl_cmpv4di3_mask: +- case CODE_FOR_avx512vl_cmpv8si3_mask: +- case CODE_FOR_avx512vl_ucmpv4di3_mask: +- case CODE_FOR_avx512vl_ucmpv8si3_mask: +- case CODE_FOR_avx512vl_cmpv2di3_mask: +- case CODE_FOR_avx512vl_cmpv4si3_mask: +- case CODE_FOR_avx512vl_ucmpv2di3_mask: +- case CODE_FOR_avx512vl_ucmpv4si3_mask: +- error ("the last argument must be a 3-bit immediate"); +- return const0_rtx; +- +- case CODE_FOR_sse4_1_roundsd: +- case CODE_FOR_sse4_1_roundss: +- +- case CODE_FOR_sse4_1_roundpd: +- case CODE_FOR_sse4_1_roundps: +- case CODE_FOR_avx_roundpd256: +- case CODE_FOR_avx_roundps256: +- +- case CODE_FOR_sse4_1_roundpd_vec_pack_sfix: +- case CODE_FOR_sse4_1_roundps_sfix: +- case CODE_FOR_avx_roundpd_vec_pack_sfix256: +- case CODE_FOR_avx_roundps_sfix256: +- +- case CODE_FOR_sse4_1_blendps: +- case CODE_FOR_avx_blendpd256: +- case CODE_FOR_avx_vpermilv4df: +- case CODE_FOR_avx_vpermilv4df_mask: +- case CODE_FOR_avx512f_getmantv8df_mask: +- case CODE_FOR_avx512f_getmantv16sf_mask: +- case CODE_FOR_avx512vl_getmantv8sf_mask: +- case CODE_FOR_avx512vl_getmantv4df_mask: +- case CODE_FOR_avx512vl_getmantv4sf_mask: +- case CODE_FOR_avx512vl_getmantv2df_mask: +- case CODE_FOR_avx512dq_rangepv8df_mask_round: +- case CODE_FOR_avx512dq_rangepv16sf_mask_round: +- case CODE_FOR_avx512dq_rangepv4df_mask: +- case CODE_FOR_avx512dq_rangepv8sf_mask: +- case CODE_FOR_avx512dq_rangepv2df_mask: +- case CODE_FOR_avx512dq_rangepv4sf_mask: +- case CODE_FOR_avx_shufpd256_mask: +- error ("the last argument must be a 4-bit immediate"); +- return const0_rtx; +- +- case CODE_FOR_sha1rnds4: +- case CODE_FOR_sse4_1_blendpd: +- case CODE_FOR_avx_vpermilv2df: +- case CODE_FOR_avx_vpermilv2df_mask: +- case CODE_FOR_xop_vpermil2v2df3: +- case CODE_FOR_xop_vpermil2v4sf3: +- case CODE_FOR_xop_vpermil2v4df3: +- case CODE_FOR_xop_vpermil2v8sf3: +- case CODE_FOR_avx512f_vinsertf32x4_mask: +- case CODE_FOR_avx512f_vinserti32x4_mask: +- case CODE_FOR_avx512f_vextractf32x4_mask: +- case 
CODE_FOR_avx512f_vextracti32x4_mask: +- case CODE_FOR_sse2_shufpd: +- case CODE_FOR_sse2_shufpd_mask: +- case CODE_FOR_avx512dq_shuf_f64x2_mask: +- case CODE_FOR_avx512dq_shuf_i64x2_mask: +- case CODE_FOR_avx512vl_shuf_i32x4_mask: +- case CODE_FOR_avx512vl_shuf_f32x4_mask: +- error ("the last argument must be a 2-bit immediate"); +- return const0_rtx; +- +- case CODE_FOR_avx_vextractf128v4df: +- case CODE_FOR_avx_vextractf128v8sf: +- case CODE_FOR_avx_vextractf128v8si: +- case CODE_FOR_avx_vinsertf128v4df: +- case CODE_FOR_avx_vinsertf128v8sf: +- case CODE_FOR_avx_vinsertf128v8si: +- case CODE_FOR_avx512f_vinsertf64x4_mask: +- case CODE_FOR_avx512f_vinserti64x4_mask: +- case CODE_FOR_avx512f_vextractf64x4_mask: +- case CODE_FOR_avx512f_vextracti64x4_mask: +- case CODE_FOR_avx512dq_vinsertf32x8_mask: +- case CODE_FOR_avx512dq_vinserti32x8_mask: +- case CODE_FOR_avx512vl_vinsertv4df: +- case CODE_FOR_avx512vl_vinsertv4di: +- case CODE_FOR_avx512vl_vinsertv8sf: +- case CODE_FOR_avx512vl_vinsertv8si: +- error ("the last argument must be a 1-bit immediate"); +- return const0_rtx; +- +- case CODE_FOR_avx_vmcmpv2df3: +- case CODE_FOR_avx_vmcmpv4sf3: +- case CODE_FOR_avx_cmpv2df3: +- case CODE_FOR_avx_cmpv4sf3: +- case CODE_FOR_avx_cmpv4df3: +- case CODE_FOR_avx_cmpv8sf3: +- case CODE_FOR_avx512f_cmpv8df3_mask: +- case CODE_FOR_avx512f_cmpv16sf3_mask: +- case CODE_FOR_avx512f_vmcmpv2df3_mask: +- case CODE_FOR_avx512f_vmcmpv4sf3_mask: +- error ("the last argument must be a 5-bit immediate"); +- return const0_rtx; +- +- default: +- switch (nargs_constant) +- { +- case 2: +- if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) || +- (!mask_pos && (nargs - i) == nargs_constant)) +- { +- error ("the next to last argument must be an 8-bit immediate"); +- break; +- } +- /* FALLTHRU */ +- case 1: +- error ("the last argument must be an 8-bit immediate"); +- break; +- default: +- gcc_unreachable (); +- } +- return const0_rtx; +- } +- } +- else +- { +- if (VECTOR_MODE_P (mode)) +- op = safe_vector_operand (op, mode); +- +- /* If we aren't optimizing, only allow one memory operand to +- be generated. */ +- if (memory_operand (op, mode)) +- num_memory++; +- +- op = fixup_modeless_constant (op, mode); +- +- if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode) +- { +- if (optimize || !match || num_memory > 1) +- op = copy_to_mode_reg (mode, op); +- } +- else +- { +- op = copy_to_reg (op); +- op = lowpart_subreg (mode, op, GET_MODE (op)); +- } +- } +- +- args[i].op = op; +- args[i].mode = mode; +- } +- +- switch (nargs) +- { +- case 1: +- pat = GEN_FCN (icode) (real_target, args[0].op); +- break; +- case 2: +- pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op); +- break; +- case 3: +- pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op, +- args[2].op); +- break; +- case 4: +- pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op, +- args[2].op, args[3].op); +- break; +- case 5: +- pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op, +- args[2].op, args[3].op, args[4].op); +- break; +- case 6: +- pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op, +- args[2].op, args[3].op, args[4].op, +- args[5].op); +- break; +- default: +- gcc_unreachable (); +- } +- +- if (! 
pat) +- return 0; +- +- emit_insn (pat); +- return target; +-} +- +-/* Transform pattern of following layout: +- (set A +- (unspec [B C] UNSPEC_EMBEDDED_ROUNDING)) +- ) +- into: +- (set (A B)) */ +- +-static rtx +-ix86_erase_embedded_rounding (rtx pat) +-{ +- if (GET_CODE (pat) == INSN) +- pat = PATTERN (pat); +- +- gcc_assert (GET_CODE (pat) == SET); +- rtx src = SET_SRC (pat); +- gcc_assert (XVECLEN (src, 0) == 2); +- rtx p0 = XVECEXP (src, 0, 0); +- gcc_assert (GET_CODE (src) == UNSPEC +- && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING); +- rtx res = gen_rtx_SET (SET_DEST (pat), p0); +- return res; +-} +- +-/* Subroutine of ix86_expand_round_builtin to take care of comi insns +- with rounding. */ +-static rtx +-ix86_expand_sse_comi_round (const struct builtin_description *d, +- tree exp, rtx target) +-{ +- rtx pat, set_dst; +- tree arg0 = CALL_EXPR_ARG (exp, 0); +- tree arg1 = CALL_EXPR_ARG (exp, 1); +- tree arg2 = CALL_EXPR_ARG (exp, 2); +- tree arg3 = CALL_EXPR_ARG (exp, 3); +- rtx op0 = expand_normal (arg0); +- rtx op1 = expand_normal (arg1); +- rtx op2 = expand_normal (arg2); +- rtx op3 = expand_normal (arg3); +- enum insn_code icode = d->icode; +- const struct insn_data_d *insn_p = &insn_data[icode]; +- machine_mode mode0 = insn_p->operand[0].mode; +- machine_mode mode1 = insn_p->operand[1].mode; +- enum rtx_code comparison = UNEQ; +- bool need_ucomi = false; +- +- /* See avxintrin.h for values. */ +- enum rtx_code comi_comparisons[32] = +- { +- UNEQ, GT, GE, UNORDERED, LTGT, UNLE, UNLT, ORDERED, UNEQ, UNLT, +- UNLE, LT, LTGT, GE, GT, LT, UNEQ, GT, GE, UNORDERED, LTGT, UNLE, +- UNLT, ORDERED, UNEQ, UNLT, UNLE, LT, LTGT, GE, GT, LT +- }; +- bool need_ucomi_values[32] = +- { +- true, false, false, true, true, false, false, true, +- true, false, false, true, true, false, false, true, +- false, true, true, false, false, true, true, false, +- false, true, true, false, false, true, true, false +- }; +- +- if (!CONST_INT_P (op2)) +- { +- error ("the third argument must be comparison constant"); +- return const0_rtx; +- } +- if (INTVAL (op2) < 0 || INTVAL (op2) >= 32) +- { +- error ("incorrect comparison mode"); +- return const0_rtx; +- } +- +- if (!insn_p->operand[2].predicate (op3, SImode)) +- { +- error ("incorrect rounding operand"); +- return const0_rtx; +- } +- +- comparison = comi_comparisons[INTVAL (op2)]; +- need_ucomi = need_ucomi_values[INTVAL (op2)]; +- +- if (VECTOR_MODE_P (mode0)) +- op0 = safe_vector_operand (op0, mode0); +- if (VECTOR_MODE_P (mode1)) +- op1 = safe_vector_operand (op1, mode1); +- +- target = gen_reg_rtx (SImode); +- emit_move_insn (target, const0_rtx); +- target = gen_rtx_SUBREG (QImode, target, 0); +- +- if ((optimize && !register_operand (op0, mode0)) +- || !insn_p->operand[0].predicate (op0, mode0)) +- op0 = copy_to_mode_reg (mode0, op0); +- if ((optimize && !register_operand (op1, mode1)) +- || !insn_p->operand[1].predicate (op1, mode1)) +- op1 = copy_to_mode_reg (mode1, op1); +- +- if (need_ucomi) +- icode = icode == CODE_FOR_sse_comi_round +- ? CODE_FOR_sse_ucomi_round +- : CODE_FOR_sse2_ucomi_round; +- +- pat = GEN_FCN (icode) (op0, op1, op3); +- if (! pat) +- return 0; +- +- /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */ +- if (INTVAL (op3) == NO_ROUND) +- { +- pat = ix86_erase_embedded_rounding (pat); +- if (! 
pat) +- return 0; +- +- set_dst = SET_DEST (pat); +- } +- else +- { +- gcc_assert (GET_CODE (pat) == SET); +- set_dst = SET_DEST (pat); +- } +- +- emit_insn (pat); +- emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target), +- gen_rtx_fmt_ee (comparison, QImode, +- set_dst, +- const0_rtx))); +- +- return SUBREG_REG (target); +-} +- +-static rtx +-ix86_expand_round_builtin (const struct builtin_description *d, +- tree exp, rtx target) +-{ +- rtx pat; +- unsigned int i, nargs; +- struct +- { +- rtx op; +- machine_mode mode; +- } args[6]; +- enum insn_code icode = d->icode; +- const struct insn_data_d *insn_p = &insn_data[icode]; +- machine_mode tmode = insn_p->operand[0].mode; +- unsigned int nargs_constant = 0; +- unsigned int redundant_embed_rnd = 0; +- +- switch ((enum ix86_builtin_func_type) d->flag) +- { +- case UINT64_FTYPE_V2DF_INT: +- case UINT64_FTYPE_V4SF_INT: +- case UINT_FTYPE_V2DF_INT: +- case UINT_FTYPE_V4SF_INT: +- case INT64_FTYPE_V2DF_INT: +- case INT64_FTYPE_V4SF_INT: +- case INT_FTYPE_V2DF_INT: +- case INT_FTYPE_V4SF_INT: +- nargs = 2; +- break; +- case V4SF_FTYPE_V4SF_UINT_INT: +- case V4SF_FTYPE_V4SF_UINT64_INT: +- case V2DF_FTYPE_V2DF_UINT64_INT: +- case V4SF_FTYPE_V4SF_INT_INT: +- case V4SF_FTYPE_V4SF_INT64_INT: +- case V2DF_FTYPE_V2DF_INT64_INT: +- case V4SF_FTYPE_V4SF_V4SF_INT: +- case V2DF_FTYPE_V2DF_V2DF_INT: +- case V4SF_FTYPE_V4SF_V2DF_INT: +- case V2DF_FTYPE_V2DF_V4SF_INT: +- nargs = 3; +- break; +- case V8SF_FTYPE_V8DF_V8SF_QI_INT: +- case V8DF_FTYPE_V8DF_V8DF_QI_INT: +- case V8SI_FTYPE_V8DF_V8SI_QI_INT: +- case V8DI_FTYPE_V8DF_V8DI_QI_INT: +- case V8SF_FTYPE_V8DI_V8SF_QI_INT: +- case V8DF_FTYPE_V8DI_V8DF_QI_INT: +- case V16SF_FTYPE_V16SF_V16SF_HI_INT: +- case V8DI_FTYPE_V8SF_V8DI_QI_INT: +- case V16SF_FTYPE_V16SI_V16SF_HI_INT: +- case V16SI_FTYPE_V16SF_V16SI_HI_INT: +- case V8DF_FTYPE_V8SF_V8DF_QI_INT: +- case V16SF_FTYPE_V16HI_V16SF_HI_INT: +- case V2DF_FTYPE_V2DF_V2DF_V2DF_INT: +- case V4SF_FTYPE_V4SF_V4SF_V4SF_INT: +- nargs = 4; +- break; +- case V4SF_FTYPE_V4SF_V4SF_INT_INT: +- case V2DF_FTYPE_V2DF_V2DF_INT_INT: +- nargs_constant = 2; +- nargs = 4; +- break; +- case INT_FTYPE_V4SF_V4SF_INT_INT: +- case INT_FTYPE_V2DF_V2DF_INT_INT: +- return ix86_expand_sse_comi_round (d, exp, target); +- case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT: +- case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT: +- case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT: +- case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT: +- case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT: +- case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT: +- case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT: +- case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT: +- nargs = 5; +- break; +- case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT: +- case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT: +- nargs_constant = 4; +- nargs = 5; +- break; +- case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT: +- case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT: +- case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT: +- case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT: +- nargs_constant = 3; +- nargs = 5; +- break; +- case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT: +- case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT: +- case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT: +- case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT: +- case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT: +- case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT: +- nargs = 6; +- nargs_constant = 4; +- break; +- case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT: +- case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT: +- case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT: +- case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT: +- nargs = 6; +- nargs_constant = 3; +- break; +- 
default: +- gcc_unreachable (); +- } +- gcc_assert (nargs <= ARRAY_SIZE (args)); +- +- if (optimize +- || target == 0 +- || GET_MODE (target) != tmode +- || !insn_p->operand[0].predicate (target, tmode)) +- target = gen_reg_rtx (tmode); +- +- for (i = 0; i < nargs; i++) +- { +- tree arg = CALL_EXPR_ARG (exp, i); +- rtx op = expand_normal (arg); +- machine_mode mode = insn_p->operand[i + 1].mode; +- bool match = insn_p->operand[i + 1].predicate (op, mode); +- +- if (i == nargs - nargs_constant) +- { +- if (!match) +- { +- switch (icode) +- { +- case CODE_FOR_avx512f_getmantv8df_mask_round: +- case CODE_FOR_avx512f_getmantv16sf_mask_round: +- case CODE_FOR_avx512f_vgetmantv2df_round: +- case CODE_FOR_avx512f_vgetmantv2df_mask_round: +- case CODE_FOR_avx512f_vgetmantv4sf_round: +- case CODE_FOR_avx512f_vgetmantv4sf_mask_round: +- error ("the immediate argument must be a 4-bit immediate"); +- return const0_rtx; +- case CODE_FOR_avx512f_cmpv8df3_mask_round: +- case CODE_FOR_avx512f_cmpv16sf3_mask_round: +- case CODE_FOR_avx512f_vmcmpv2df3_mask_round: +- case CODE_FOR_avx512f_vmcmpv4sf3_mask_round: +- error ("the immediate argument must be a 5-bit immediate"); +- return const0_rtx; +- default: +- error ("the immediate argument must be an 8-bit immediate"); +- return const0_rtx; +- } +- } +- } +- else if (i == nargs-1) +- { +- if (!insn_p->operand[nargs].predicate (op, SImode)) +- { +- error ("incorrect rounding operand"); +- return const0_rtx; +- } +- +- /* If there is no rounding use normal version of the pattern. */ +- if (INTVAL (op) == NO_ROUND) +- redundant_embed_rnd = 1; +- } +- else +- { +- if (VECTOR_MODE_P (mode)) +- op = safe_vector_operand (op, mode); +- +- op = fixup_modeless_constant (op, mode); +- +- if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode) +- { +- if (optimize || !match) +- op = copy_to_mode_reg (mode, op); +- } +- else +- { +- op = copy_to_reg (op); +- op = lowpart_subreg (mode, op, GET_MODE (op)); +- } +- } +- +- args[i].op = op; +- args[i].mode = mode; +- } +- +- switch (nargs) +- { +- case 1: +- pat = GEN_FCN (icode) (target, args[0].op); +- break; +- case 2: +- pat = GEN_FCN (icode) (target, args[0].op, args[1].op); +- break; +- case 3: +- pat = GEN_FCN (icode) (target, args[0].op, args[1].op, +- args[2].op); +- break; +- case 4: +- pat = GEN_FCN (icode) (target, args[0].op, args[1].op, +- args[2].op, args[3].op); +- break; +- case 5: +- pat = GEN_FCN (icode) (target, args[0].op, args[1].op, +- args[2].op, args[3].op, args[4].op); +- break; +- case 6: +- pat = GEN_FCN (icode) (target, args[0].op, args[1].op, +- args[2].op, args[3].op, args[4].op, +- args[5].op); +- break; +- default: +- gcc_unreachable (); +- } +- +- if (!pat) +- return 0; +- +- if (redundant_embed_rnd) +- pat = ix86_erase_embedded_rounding (pat); +- +- emit_insn (pat); +- return target; +-} +- +-/* Subroutine of ix86_expand_builtin to take care of special insns +- with variable number of operands. 
*/ +- +-static rtx +-ix86_expand_special_args_builtin (const struct builtin_description *d, +- tree exp, rtx target) +-{ +- tree arg; +- rtx pat, op; +- unsigned int i, nargs, arg_adjust, memory; +- bool aligned_mem = false; +- struct +- { +- rtx op; +- machine_mode mode; +- } args[3]; +- enum insn_code icode = d->icode; +- bool last_arg_constant = false; +- const struct insn_data_d *insn_p = &insn_data[icode]; +- machine_mode tmode = insn_p->operand[0].mode; +- enum { load, store } klass; +- +- switch ((enum ix86_builtin_func_type) d->flag) +- { +- case VOID_FTYPE_VOID: +- emit_insn (GEN_FCN (icode) (target)); +- return 0; +- case VOID_FTYPE_UINT64: +- case VOID_FTYPE_UNSIGNED: +- nargs = 0; +- klass = store; +- memory = 0; +- break; +- +- case INT_FTYPE_VOID: +- case USHORT_FTYPE_VOID: +- case UINT64_FTYPE_VOID: +- case UINT_FTYPE_VOID: +- case UNSIGNED_FTYPE_VOID: +- nargs = 0; +- klass = load; +- memory = 0; +- break; +- case UINT64_FTYPE_PUNSIGNED: +- case V2DI_FTYPE_PV2DI: +- case V4DI_FTYPE_PV4DI: +- case V32QI_FTYPE_PCCHAR: +- case V16QI_FTYPE_PCCHAR: +- case V8SF_FTYPE_PCV4SF: +- case V8SF_FTYPE_PCFLOAT: +- case V4SF_FTYPE_PCFLOAT: +- case V4DF_FTYPE_PCV2DF: +- case V4DF_FTYPE_PCDOUBLE: +- case V2DF_FTYPE_PCDOUBLE: +- case VOID_FTYPE_PVOID: +- case V8DI_FTYPE_PV8DI: +- nargs = 1; +- klass = load; +- memory = 0; +- switch (icode) +- { +- case CODE_FOR_sse4_1_movntdqa: +- case CODE_FOR_avx2_movntdqa: +- case CODE_FOR_avx512f_movntdqa: +- aligned_mem = true; +- break; +- default: +- break; +- } +- break; +- case VOID_FTYPE_PV2SF_V4SF: +- case VOID_FTYPE_PV8DI_V8DI: +- case VOID_FTYPE_PV4DI_V4DI: +- case VOID_FTYPE_PV2DI_V2DI: +- case VOID_FTYPE_PCHAR_V32QI: +- case VOID_FTYPE_PCHAR_V16QI: +- case VOID_FTYPE_PFLOAT_V16SF: +- case VOID_FTYPE_PFLOAT_V8SF: +- case VOID_FTYPE_PFLOAT_V4SF: +- case VOID_FTYPE_PDOUBLE_V8DF: +- case VOID_FTYPE_PDOUBLE_V4DF: +- case VOID_FTYPE_PDOUBLE_V2DF: +- case VOID_FTYPE_PLONGLONG_LONGLONG: +- case VOID_FTYPE_PULONGLONG_ULONGLONG: +- case VOID_FTYPE_PUNSIGNED_UNSIGNED: +- case VOID_FTYPE_PINT_INT: +- nargs = 1; +- klass = store; +- /* Reserve memory operand for target. */ +- memory = ARRAY_SIZE (args); +- switch (icode) +- { +- /* These builtins and instructions require the memory +- to be properly aligned. 
*/ +- case CODE_FOR_avx_movntv4di: +- case CODE_FOR_sse2_movntv2di: +- case CODE_FOR_avx_movntv8sf: +- case CODE_FOR_sse_movntv4sf: +- case CODE_FOR_sse4a_vmmovntv4sf: +- case CODE_FOR_avx_movntv4df: +- case CODE_FOR_sse2_movntv2df: +- case CODE_FOR_sse4a_vmmovntv2df: +- case CODE_FOR_sse2_movntidi: +- case CODE_FOR_sse_movntq: +- case CODE_FOR_sse2_movntisi: +- case CODE_FOR_avx512f_movntv16sf: +- case CODE_FOR_avx512f_movntv8df: +- case CODE_FOR_avx512f_movntv8di: +- aligned_mem = true; +- break; +- default: +- break; +- } +- break; +- case VOID_FTYPE_PVOID_PCVOID: +- nargs = 1; +- klass = store; +- memory = 0; +- +- break; +- case V4SF_FTYPE_V4SF_PCV2SF: +- case V2DF_FTYPE_V2DF_PCDOUBLE: +- nargs = 2; +- klass = load; +- memory = 1; +- break; +- case V8SF_FTYPE_PCV8SF_V8SI: +- case V4DF_FTYPE_PCV4DF_V4DI: +- case V4SF_FTYPE_PCV4SF_V4SI: +- case V2DF_FTYPE_PCV2DF_V2DI: +- case V8SI_FTYPE_PCV8SI_V8SI: +- case V4DI_FTYPE_PCV4DI_V4DI: +- case V4SI_FTYPE_PCV4SI_V4SI: +- case V2DI_FTYPE_PCV2DI_V2DI: +- case VOID_FTYPE_INT_INT64: +- nargs = 2; +- klass = load; +- memory = 0; +- break; +- case VOID_FTYPE_PV8DF_V8DF_UQI: +- case VOID_FTYPE_PV4DF_V4DF_UQI: +- case VOID_FTYPE_PV2DF_V2DF_UQI: +- case VOID_FTYPE_PV16SF_V16SF_UHI: +- case VOID_FTYPE_PV8SF_V8SF_UQI: +- case VOID_FTYPE_PV4SF_V4SF_UQI: +- case VOID_FTYPE_PV8DI_V8DI_UQI: +- case VOID_FTYPE_PV4DI_V4DI_UQI: +- case VOID_FTYPE_PV2DI_V2DI_UQI: +- case VOID_FTYPE_PV16SI_V16SI_UHI: +- case VOID_FTYPE_PV8SI_V8SI_UQI: +- case VOID_FTYPE_PV4SI_V4SI_UQI: +- case VOID_FTYPE_PV64QI_V64QI_UDI: +- case VOID_FTYPE_PV32HI_V32HI_USI: +- case VOID_FTYPE_PV32QI_V32QI_USI: +- case VOID_FTYPE_PV16QI_V16QI_UHI: +- case VOID_FTYPE_PV16HI_V16HI_UHI: +- case VOID_FTYPE_PV8HI_V8HI_UQI: +- switch (icode) +- { +- /* These builtins and instructions require the memory +- to be properly aligned. 
*/ +- case CODE_FOR_avx512f_storev16sf_mask: +- case CODE_FOR_avx512f_storev16si_mask: +- case CODE_FOR_avx512f_storev8df_mask: +- case CODE_FOR_avx512f_storev8di_mask: +- case CODE_FOR_avx512vl_storev8sf_mask: +- case CODE_FOR_avx512vl_storev8si_mask: +- case CODE_FOR_avx512vl_storev4df_mask: +- case CODE_FOR_avx512vl_storev4di_mask: +- case CODE_FOR_avx512vl_storev4sf_mask: +- case CODE_FOR_avx512vl_storev4si_mask: +- case CODE_FOR_avx512vl_storev2df_mask: +- case CODE_FOR_avx512vl_storev2di_mask: +- aligned_mem = true; +- break; +- default: +- break; +- } +- /* FALLTHRU */ +- case VOID_FTYPE_PV8SF_V8SI_V8SF: +- case VOID_FTYPE_PV4DF_V4DI_V4DF: +- case VOID_FTYPE_PV4SF_V4SI_V4SF: +- case VOID_FTYPE_PV2DF_V2DI_V2DF: +- case VOID_FTYPE_PV8SI_V8SI_V8SI: +- case VOID_FTYPE_PV4DI_V4DI_V4DI: +- case VOID_FTYPE_PV4SI_V4SI_V4SI: +- case VOID_FTYPE_PV2DI_V2DI_V2DI: +- case VOID_FTYPE_PV8SI_V8DI_UQI: +- case VOID_FTYPE_PV8HI_V8DI_UQI: +- case VOID_FTYPE_PV16HI_V16SI_UHI: +- case VOID_FTYPE_PV16QI_V8DI_UQI: +- case VOID_FTYPE_PV16QI_V16SI_UHI: +- case VOID_FTYPE_PV4SI_V4DI_UQI: +- case VOID_FTYPE_PV4SI_V2DI_UQI: +- case VOID_FTYPE_PV8HI_V4DI_UQI: +- case VOID_FTYPE_PV8HI_V2DI_UQI: +- case VOID_FTYPE_PV8HI_V8SI_UQI: +- case VOID_FTYPE_PV8HI_V4SI_UQI: +- case VOID_FTYPE_PV16QI_V4DI_UQI: +- case VOID_FTYPE_PV16QI_V2DI_UQI: +- case VOID_FTYPE_PV16QI_V8SI_UQI: +- case VOID_FTYPE_PV16QI_V4SI_UQI: +- case VOID_FTYPE_PCHAR_V64QI_UDI: +- case VOID_FTYPE_PCHAR_V32QI_USI: +- case VOID_FTYPE_PCHAR_V16QI_UHI: +- case VOID_FTYPE_PSHORT_V32HI_USI: +- case VOID_FTYPE_PSHORT_V16HI_UHI: +- case VOID_FTYPE_PSHORT_V8HI_UQI: +- case VOID_FTYPE_PINT_V16SI_UHI: +- case VOID_FTYPE_PINT_V8SI_UQI: +- case VOID_FTYPE_PINT_V4SI_UQI: +- case VOID_FTYPE_PINT64_V8DI_UQI: +- case VOID_FTYPE_PINT64_V4DI_UQI: +- case VOID_FTYPE_PINT64_V2DI_UQI: +- case VOID_FTYPE_PDOUBLE_V8DF_UQI: +- case VOID_FTYPE_PDOUBLE_V4DF_UQI: +- case VOID_FTYPE_PDOUBLE_V2DF_UQI: +- case VOID_FTYPE_PFLOAT_V16SF_UHI: +- case VOID_FTYPE_PFLOAT_V8SF_UQI: +- case VOID_FTYPE_PFLOAT_V4SF_UQI: +- case VOID_FTYPE_PV32QI_V32HI_USI: +- case VOID_FTYPE_PV16QI_V16HI_UHI: +- case VOID_FTYPE_PV8QI_V8HI_UQI: +- nargs = 2; +- klass = store; +- /* Reserve memory operand for target. */ +- memory = ARRAY_SIZE (args); +- break; +- case V4SF_FTYPE_PCV4SF_V4SF_UQI: +- case V8SF_FTYPE_PCV8SF_V8SF_UQI: +- case V16SF_FTYPE_PCV16SF_V16SF_UHI: +- case V4SI_FTYPE_PCV4SI_V4SI_UQI: +- case V8SI_FTYPE_PCV8SI_V8SI_UQI: +- case V16SI_FTYPE_PCV16SI_V16SI_UHI: +- case V2DF_FTYPE_PCV2DF_V2DF_UQI: +- case V4DF_FTYPE_PCV4DF_V4DF_UQI: +- case V8DF_FTYPE_PCV8DF_V8DF_UQI: +- case V2DI_FTYPE_PCV2DI_V2DI_UQI: +- case V4DI_FTYPE_PCV4DI_V4DI_UQI: +- case V8DI_FTYPE_PCV8DI_V8DI_UQI: +- case V64QI_FTYPE_PCV64QI_V64QI_UDI: +- case V32HI_FTYPE_PCV32HI_V32HI_USI: +- case V32QI_FTYPE_PCV32QI_V32QI_USI: +- case V16QI_FTYPE_PCV16QI_V16QI_UHI: +- case V16HI_FTYPE_PCV16HI_V16HI_UHI: +- case V8HI_FTYPE_PCV8HI_V8HI_UQI: +- switch (icode) +- { +- /* These builtins and instructions require the memory +- to be properly aligned. 
*/ +- case CODE_FOR_avx512f_loadv16sf_mask: +- case CODE_FOR_avx512f_loadv16si_mask: +- case CODE_FOR_avx512f_loadv8df_mask: +- case CODE_FOR_avx512f_loadv8di_mask: +- case CODE_FOR_avx512vl_loadv8sf_mask: +- case CODE_FOR_avx512vl_loadv8si_mask: +- case CODE_FOR_avx512vl_loadv4df_mask: +- case CODE_FOR_avx512vl_loadv4di_mask: +- case CODE_FOR_avx512vl_loadv4sf_mask: +- case CODE_FOR_avx512vl_loadv4si_mask: +- case CODE_FOR_avx512vl_loadv2df_mask: +- case CODE_FOR_avx512vl_loadv2di_mask: +- case CODE_FOR_avx512bw_loadv64qi_mask: +- case CODE_FOR_avx512vl_loadv32qi_mask: +- case CODE_FOR_avx512vl_loadv16qi_mask: +- case CODE_FOR_avx512bw_loadv32hi_mask: +- case CODE_FOR_avx512vl_loadv16hi_mask: +- case CODE_FOR_avx512vl_loadv8hi_mask: +- aligned_mem = true; +- break; +- default: +- break; +- } +- /* FALLTHRU */ +- case V64QI_FTYPE_PCCHAR_V64QI_UDI: +- case V32QI_FTYPE_PCCHAR_V32QI_USI: +- case V16QI_FTYPE_PCCHAR_V16QI_UHI: +- case V32HI_FTYPE_PCSHORT_V32HI_USI: +- case V16HI_FTYPE_PCSHORT_V16HI_UHI: +- case V8HI_FTYPE_PCSHORT_V8HI_UQI: +- case V16SI_FTYPE_PCINT_V16SI_UHI: +- case V8SI_FTYPE_PCINT_V8SI_UQI: +- case V4SI_FTYPE_PCINT_V4SI_UQI: +- case V8DI_FTYPE_PCINT64_V8DI_UQI: +- case V4DI_FTYPE_PCINT64_V4DI_UQI: +- case V2DI_FTYPE_PCINT64_V2DI_UQI: +- case V8DF_FTYPE_PCDOUBLE_V8DF_UQI: +- case V4DF_FTYPE_PCDOUBLE_V4DF_UQI: +- case V2DF_FTYPE_PCDOUBLE_V2DF_UQI: +- case V16SF_FTYPE_PCFLOAT_V16SF_UHI: +- case V8SF_FTYPE_PCFLOAT_V8SF_UQI: +- case V4SF_FTYPE_PCFLOAT_V4SF_UQI: +- nargs = 3; +- klass = load; +- memory = 0; +- break; +- case VOID_FTYPE_UINT_UINT_UINT: +- case VOID_FTYPE_UINT64_UINT_UINT: +- case UCHAR_FTYPE_UINT_UINT_UINT: +- case UCHAR_FTYPE_UINT64_UINT_UINT: +- nargs = 3; +- klass = load; +- memory = ARRAY_SIZE (args); +- last_arg_constant = true; +- break; +- default: +- gcc_unreachable (); +- } +- +- gcc_assert (nargs <= ARRAY_SIZE (args)); +- +- if (klass == store) +- { +- arg = CALL_EXPR_ARG (exp, 0); +- op = expand_normal (arg); +- gcc_assert (target == 0); +- if (memory) +- { +- op = ix86_zero_extend_to_Pmode (op); +- target = gen_rtx_MEM (tmode, op); +- /* target at this point has just BITS_PER_UNIT MEM_ALIGN +- on it. Try to improve it using get_pointer_alignment, +- and if the special builtin is one that requires strict +- mode alignment, also from it's GET_MODE_ALIGNMENT. +- Failure to do so could lead to ix86_legitimate_combined_insn +- rejecting all changes to such insns. 
*/ +- unsigned int align = get_pointer_alignment (arg); +- if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode)) +- align = GET_MODE_ALIGNMENT (tmode); +- if (MEM_ALIGN (target) < align) +- set_mem_align (target, align); +- } +- else +- target = force_reg (tmode, op); +- arg_adjust = 1; +- } +- else +- { +- arg_adjust = 0; +- if (optimize +- || target == 0 +- || !register_operand (target, tmode) +- || GET_MODE (target) != tmode) +- target = gen_reg_rtx (tmode); +- } +- +- for (i = 0; i < nargs; i++) +- { +- machine_mode mode = insn_p->operand[i + 1].mode; +- bool match; +- +- arg = CALL_EXPR_ARG (exp, i + arg_adjust); +- op = expand_normal (arg); +- match = insn_p->operand[i + 1].predicate (op, mode); +- +- if (last_arg_constant && (i + 1) == nargs) +- { +- if (!match) +- { +- if (icode == CODE_FOR_lwp_lwpvalsi3 +- || icode == CODE_FOR_lwp_lwpinssi3 +- || icode == CODE_FOR_lwp_lwpvaldi3 +- || icode == CODE_FOR_lwp_lwpinsdi3) +- error ("the last argument must be a 32-bit immediate"); +- else +- error ("the last argument must be an 8-bit immediate"); +- return const0_rtx; +- } +- } +- else +- { +- if (i == memory) +- { +- /* This must be the memory operand. */ +- op = ix86_zero_extend_to_Pmode (op); +- op = gen_rtx_MEM (mode, op); +- /* op at this point has just BITS_PER_UNIT MEM_ALIGN +- on it. Try to improve it using get_pointer_alignment, +- and if the special builtin is one that requires strict +- mode alignment, also from it's GET_MODE_ALIGNMENT. +- Failure to do so could lead to ix86_legitimate_combined_insn +- rejecting all changes to such insns. */ +- unsigned int align = get_pointer_alignment (arg); +- if (aligned_mem && align < GET_MODE_ALIGNMENT (mode)) +- align = GET_MODE_ALIGNMENT (mode); +- if (MEM_ALIGN (op) < align) +- set_mem_align (op, align); +- } +- else +- { +- /* This must be register. */ +- if (VECTOR_MODE_P (mode)) +- op = safe_vector_operand (op, mode); +- +- op = fixup_modeless_constant (op, mode); +- +- if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode) +- op = copy_to_mode_reg (mode, op); +- else +- { +- op = copy_to_reg (op); +- op = lowpart_subreg (mode, op, GET_MODE (op)); +- } +- } +- } +- +- args[i].op = op; +- args[i].mode = mode; +- } +- +- switch (nargs) +- { +- case 0: +- pat = GEN_FCN (icode) (target); +- break; +- case 1: +- pat = GEN_FCN (icode) (target, args[0].op); +- break; +- case 2: +- pat = GEN_FCN (icode) (target, args[0].op, args[1].op); +- break; +- case 3: +- pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op); +- break; +- default: +- gcc_unreachable (); +- } +- +- if (! pat) +- return 0; +- emit_insn (pat); +- return klass == store ? 0 : target; +-} +- +-/* Return the integer constant in ARG. Constrain it to be in the range +- of the subparts of VEC_TYPE; issue an error if not. */ +- +-static int +-get_element_number (tree vec_type, tree arg) +-{ +- unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1; +- +- if (!tree_fits_uhwi_p (arg) +- || (elt = tree_to_uhwi (arg), elt > max)) +- { +- error ("selector must be an integer constant in the range 0..%wi", max); +- return 0; +- } +- +- return elt; +-} +- +-/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around +- ix86_expand_vector_init. We DO have language-level syntax for this, in +- the form of (type){ init-list }. Except that since we can't place emms +- instructions from inside the compiler, we can't allow the use of MMX +- registers unless the user explicitly asks for it. 
So we do *not* define +- vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead +- we have builtins invoked by mmintrin.h that gives us license to emit +- these sorts of instructions. */ +- +-static rtx +-ix86_expand_vec_init_builtin (tree type, tree exp, rtx target) +-{ +- machine_mode tmode = TYPE_MODE (type); +- machine_mode inner_mode = GET_MODE_INNER (tmode); +- int i, n_elt = GET_MODE_NUNITS (tmode); +- rtvec v = rtvec_alloc (n_elt); +- +- gcc_assert (VECTOR_MODE_P (tmode)); +- gcc_assert (call_expr_nargs (exp) == n_elt); +- +- for (i = 0; i < n_elt; ++i) +- { +- rtx x = expand_normal (CALL_EXPR_ARG (exp, i)); +- RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x); +- } +- +- if (!target || !register_operand (target, tmode)) +- target = gen_reg_rtx (tmode); +- +- ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v)); +- return target; +-} +- +-/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around +- ix86_expand_vector_extract. They would be redundant (for non-MMX) if we +- had a language-level syntax for referencing vector elements. */ +- +-static rtx +-ix86_expand_vec_ext_builtin (tree exp, rtx target) +-{ +- machine_mode tmode, mode0; +- tree arg0, arg1; +- int elt; +- rtx op0; +- +- arg0 = CALL_EXPR_ARG (exp, 0); +- arg1 = CALL_EXPR_ARG (exp, 1); +- +- op0 = expand_normal (arg0); +- elt = get_element_number (TREE_TYPE (arg0), arg1); +- +- tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0))); +- mode0 = TYPE_MODE (TREE_TYPE (arg0)); +- gcc_assert (VECTOR_MODE_P (mode0)); +- +- op0 = force_reg (mode0, op0); +- +- if (optimize || !target || !register_operand (target, tmode)) +- target = gen_reg_rtx (tmode); +- +- ix86_expand_vector_extract (true, target, op0, elt); +- +- return target; +-} +- +-/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around +- ix86_expand_vector_set. They would be redundant (for non-MMX) if we had +- a language-level syntax for referencing vector elements. */ +- +-static rtx +-ix86_expand_vec_set_builtin (tree exp) +-{ +- machine_mode tmode, mode1; +- tree arg0, arg1, arg2; +- int elt; +- rtx op0, op1, target; +- +- arg0 = CALL_EXPR_ARG (exp, 0); +- arg1 = CALL_EXPR_ARG (exp, 1); +- arg2 = CALL_EXPR_ARG (exp, 2); +- +- tmode = TYPE_MODE (TREE_TYPE (arg0)); +- mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0))); +- gcc_assert (VECTOR_MODE_P (tmode)); +- +- op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL); +- op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL); +- elt = get_element_number (TREE_TYPE (arg0), arg2); +- +- if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode) +- op1 = convert_modes (mode1, GET_MODE (op1), op1, true); +- +- op0 = force_reg (tmode, op0); +- op1 = force_reg (mode1, op1); +- +- /* OP0 is the source of these builtin functions and shouldn't be +- modified. Create a copy, use it and return it as target. */ +- target = gen_reg_rtx (tmode); +- emit_move_insn (target, op0); +- ix86_expand_vector_set (true, target, op1, elt); +- +- return target; +-} +- +-/* Expand an expression EXP that calls a built-in function, +- with result going to TARGET if that's convenient +- (and in mode MODE if that's convenient). +- SUBTARGET may be used as the target for computing one of EXP's operands. +- IGNORE is nonzero if the value is to be ignored. 
*/ +- +-static rtx +-ix86_expand_builtin (tree exp, rtx target, rtx subtarget, +- machine_mode mode, int ignore) +-{ +- size_t i; +- enum insn_code icode, icode2; +- tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0); +- tree arg0, arg1, arg2, arg3, arg4; +- rtx op0, op1, op2, op3, op4, pat, pat2, insn; +- machine_mode mode0, mode1, mode2, mode3, mode4; +- unsigned int fcode = DECL_FUNCTION_CODE (fndecl); +- +- /* For CPU builtins that can be folded, fold first and expand the fold. */ +- switch (fcode) +- { +- case IX86_BUILTIN_CPU_INIT: +- { +- /* Make it call __cpu_indicator_init in libgcc. */ +- tree call_expr, fndecl, type; +- type = build_function_type_list (integer_type_node, NULL_TREE); +- fndecl = build_fn_decl ("__cpu_indicator_init", type); +- call_expr = build_call_expr (fndecl, 0); +- return expand_expr (call_expr, target, mode, EXPAND_NORMAL); +- } +- case IX86_BUILTIN_CPU_IS: +- case IX86_BUILTIN_CPU_SUPPORTS: +- { +- tree arg0 = CALL_EXPR_ARG (exp, 0); +- tree fold_expr = fold_builtin_cpu (fndecl, &arg0); +- gcc_assert (fold_expr != NULL_TREE); +- return expand_expr (fold_expr, target, mode, EXPAND_NORMAL); +- } +- } +- +- HOST_WIDE_INT isa = ix86_isa_flags; +- HOST_WIDE_INT isa2 = ix86_isa_flags2; +- HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa; +- HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2; +- /* The general case is we require all the ISAs specified in bisa{,2} +- to be enabled. +- The exceptions are: +- OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A +- OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 +- OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4 +- where for each this pair it is sufficient if either of the ISAs is +- enabled, plus if it is ored with other options also those others. */ +- if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) +- == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) +- && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0) +- isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A); +- if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) +- == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) +- && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0) +- isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32); +- if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) +- == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) +- && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0) +- isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4); +- if ((bisa & isa) != bisa || (bisa2 & isa2) != bisa2) +- { +- bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT; +- if (TARGET_ABI_X32) +- bisa |= OPTION_MASK_ABI_X32; +- else +- bisa |= OPTION_MASK_ABI_64; +- char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL, +- (enum fpmath_unit) 0, false, add_abi_p); +- if (!opts) +- error ("%qE needs unknown isa option", fndecl); +- else +- { +- gcc_assert (opts != NULL); +- error ("%qE needs isa option %s", fndecl, opts); +- free (opts); +- } +- return expand_call (exp, target, ignore); +- } +- +- switch (fcode) +- { +- case IX86_BUILTIN_MASKMOVQ: +- case IX86_BUILTIN_MASKMOVDQU: +- icode = (fcode == IX86_BUILTIN_MASKMOVQ +- ? CODE_FOR_mmx_maskmovq +- : CODE_FOR_sse2_maskmovdqu); +- /* Note the arg order is different from the operand order. 
*/ +- arg1 = CALL_EXPR_ARG (exp, 0); +- arg2 = CALL_EXPR_ARG (exp, 1); +- arg0 = CALL_EXPR_ARG (exp, 2); +- op0 = expand_normal (arg0); +- op1 = expand_normal (arg1); +- op2 = expand_normal (arg2); +- mode0 = insn_data[icode].operand[0].mode; +- mode1 = insn_data[icode].operand[1].mode; +- mode2 = insn_data[icode].operand[2].mode; +- +- op0 = ix86_zero_extend_to_Pmode (op0); +- op0 = gen_rtx_MEM (mode1, op0); +- +- if (!insn_data[icode].operand[0].predicate (op0, mode0)) +- op0 = copy_to_mode_reg (mode0, op0); +- if (!insn_data[icode].operand[1].predicate (op1, mode1)) +- op1 = copy_to_mode_reg (mode1, op1); +- if (!insn_data[icode].operand[2].predicate (op2, mode2)) +- op2 = copy_to_mode_reg (mode2, op2); +- pat = GEN_FCN (icode) (op0, op1, op2); +- if (! pat) +- return 0; +- emit_insn (pat); +- return 0; +- +- case IX86_BUILTIN_LDMXCSR: +- op0 = expand_normal (CALL_EXPR_ARG (exp, 0)); +- target = assign_386_stack_local (SImode, SLOT_TEMP); +- emit_move_insn (target, op0); +- emit_insn (gen_sse_ldmxcsr (target)); +- return 0; +- +- case IX86_BUILTIN_STMXCSR: +- target = assign_386_stack_local (SImode, SLOT_TEMP); +- emit_insn (gen_sse_stmxcsr (target)); +- return copy_to_mode_reg (SImode, target); +- +- case IX86_BUILTIN_CLFLUSH: +- arg0 = CALL_EXPR_ARG (exp, 0); +- op0 = expand_normal (arg0); +- icode = CODE_FOR_sse2_clflush; +- if (!insn_data[icode].operand[0].predicate (op0, Pmode)) +- op0 = ix86_zero_extend_to_Pmode (op0); +- +- emit_insn (gen_sse2_clflush (op0)); +- return 0; +- +- case IX86_BUILTIN_CLWB: +- arg0 = CALL_EXPR_ARG (exp, 0); +- op0 = expand_normal (arg0); +- icode = CODE_FOR_clwb; +- if (!insn_data[icode].operand[0].predicate (op0, Pmode)) +- op0 = ix86_zero_extend_to_Pmode (op0); +- +- emit_insn (gen_clwb (op0)); +- return 0; +- +- case IX86_BUILTIN_CLFLUSHOPT: +- arg0 = CALL_EXPR_ARG (exp, 0); +- op0 = expand_normal (arg0); +- icode = CODE_FOR_clflushopt; +- if (!insn_data[icode].operand[0].predicate (op0, Pmode)) +- op0 = ix86_zero_extend_to_Pmode (op0); +- +- emit_insn (gen_clflushopt (op0)); +- return 0; +- +- case IX86_BUILTIN_MONITOR: +- case IX86_BUILTIN_MONITORX: +- arg0 = CALL_EXPR_ARG (exp, 0); +- arg1 = CALL_EXPR_ARG (exp, 1); +- arg2 = CALL_EXPR_ARG (exp, 2); +- op0 = expand_normal (arg0); +- op1 = expand_normal (arg1); +- op2 = expand_normal (arg2); +- if (!REG_P (op0)) +- op0 = ix86_zero_extend_to_Pmode (op0); +- if (!REG_P (op1)) +- op1 = copy_to_mode_reg (SImode, op1); +- if (!REG_P (op2)) +- op2 = copy_to_mode_reg (SImode, op2); +- +- emit_insn (fcode == IX86_BUILTIN_MONITOR +- ? 
ix86_gen_monitor (op0, op1, op2) +- : ix86_gen_monitorx (op0, op1, op2)); +- return 0; +- +- case IX86_BUILTIN_MWAIT: +- arg0 = CALL_EXPR_ARG (exp, 0); +- arg1 = CALL_EXPR_ARG (exp, 1); +- op0 = expand_normal (arg0); +- op1 = expand_normal (arg1); +- if (!REG_P (op0)) +- op0 = copy_to_mode_reg (SImode, op0); +- if (!REG_P (op1)) +- op1 = copy_to_mode_reg (SImode, op1); +- emit_insn (gen_sse3_mwait (op0, op1)); +- return 0; +- +- case IX86_BUILTIN_MWAITX: +- arg0 = CALL_EXPR_ARG (exp, 0); +- arg1 = CALL_EXPR_ARG (exp, 1); +- arg2 = CALL_EXPR_ARG (exp, 2); +- op0 = expand_normal (arg0); +- op1 = expand_normal (arg1); +- op2 = expand_normal (arg2); +- if (!REG_P (op0)) +- op0 = copy_to_mode_reg (SImode, op0); +- if (!REG_P (op1)) +- op1 = copy_to_mode_reg (SImode, op1); +- if (!REG_P (op2)) +- op2 = copy_to_mode_reg (SImode, op2); +- emit_insn (gen_mwaitx (op0, op1, op2)); +- return 0; +- +- case IX86_BUILTIN_UMONITOR: +- arg0 = CALL_EXPR_ARG (exp, 0); +- op0 = expand_normal (arg0); +- +- op0 = ix86_zero_extend_to_Pmode (op0); +- +- insn = (TARGET_64BIT +- ? gen_umonitor_di (op0) +- : gen_umonitor_si (op0)); +- +- emit_insn (insn); +- return 0; +- +- case IX86_BUILTIN_UMWAIT: +- case IX86_BUILTIN_TPAUSE: +- arg0 = CALL_EXPR_ARG (exp, 0); +- arg1 = CALL_EXPR_ARG (exp, 1); +- op0 = expand_normal (arg0); +- op1 = expand_normal (arg1); +- +- if (!REG_P (op0)) +- op0 = copy_to_mode_reg (SImode, op0); +- +- op1 = force_reg (DImode, op1); +- +- if (TARGET_64BIT) +- { +- op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32), +- NULL, 1, OPTAB_DIRECT); +- switch (fcode) +- { +- case IX86_BUILTIN_UMWAIT: +- icode = CODE_FOR_umwait_rex64; +- break; +- case IX86_BUILTIN_TPAUSE: +- icode = CODE_FOR_tpause_rex64; +- break; +- default: +- gcc_unreachable (); +- } +- +- op2 = gen_lowpart (SImode, op2); +- op1 = gen_lowpart (SImode, op1); +- pat = GEN_FCN (icode) (op0, op1, op2); +- } +- else +- { +- switch (fcode) +- { +- case IX86_BUILTIN_UMWAIT: +- icode = CODE_FOR_umwait; +- break; +- case IX86_BUILTIN_TPAUSE: +- icode = CODE_FOR_tpause; +- break; +- default: +- gcc_unreachable (); +- } +- pat = GEN_FCN (icode) (op0, op1); +- } +- +- if (!pat) +- return 0; +- +- emit_insn (pat); +- +- if (target == 0 +- || !register_operand (target, QImode)) +- target = gen_reg_rtx (QImode); +- +- pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG), +- const0_rtx); +- emit_insn (gen_rtx_SET (target, pat)); +- +- return target; +- +- case IX86_BUILTIN_CLZERO: +- arg0 = CALL_EXPR_ARG (exp, 0); +- op0 = expand_normal (arg0); +- if (!REG_P (op0)) +- op0 = ix86_zero_extend_to_Pmode (op0); +- emit_insn (ix86_gen_clzero (op0)); +- return 0; +- +- case IX86_BUILTIN_CLDEMOTE: +- arg0 = CALL_EXPR_ARG (exp, 0); +- op0 = expand_normal (arg0); +- icode = CODE_FOR_cldemote; +- if (!insn_data[icode].operand[0].predicate (op0, Pmode)) +- op0 = ix86_zero_extend_to_Pmode (op0); +- +- emit_insn (gen_cldemote (op0)); +- return 0; +- +- case IX86_BUILTIN_VEC_INIT_V2SI: +- case IX86_BUILTIN_VEC_INIT_V4HI: +- case IX86_BUILTIN_VEC_INIT_V8QI: +- return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target); +- +- case IX86_BUILTIN_VEC_EXT_V2DF: +- case IX86_BUILTIN_VEC_EXT_V2DI: +- case IX86_BUILTIN_VEC_EXT_V4SF: +- case IX86_BUILTIN_VEC_EXT_V4SI: +- case IX86_BUILTIN_VEC_EXT_V8HI: +- case IX86_BUILTIN_VEC_EXT_V2SI: +- case IX86_BUILTIN_VEC_EXT_V4HI: +- case IX86_BUILTIN_VEC_EXT_V16QI: +- return ix86_expand_vec_ext_builtin (exp, target); +- +- case IX86_BUILTIN_VEC_SET_V2DI: +- case IX86_BUILTIN_VEC_SET_V4SF: +- case 
IX86_BUILTIN_VEC_SET_V4SI: +- case IX86_BUILTIN_VEC_SET_V8HI: +- case IX86_BUILTIN_VEC_SET_V4HI: +- case IX86_BUILTIN_VEC_SET_V16QI: +- return ix86_expand_vec_set_builtin (exp); +- +- case IX86_BUILTIN_NANQ: +- case IX86_BUILTIN_NANSQ: +- return expand_call (exp, target, ignore); +- +- case IX86_BUILTIN_RDPID: +- +- op0 = gen_reg_rtx (word_mode); +- +- if (TARGET_64BIT) +- { +- insn = gen_rdpid_rex64 (op0); +- op0 = convert_to_mode (SImode, op0, 1); +- } +- else +- insn = gen_rdpid (op0); +- +- emit_insn (insn); +- +- if (target == 0 +- || !register_operand (target, SImode)) +- target = gen_reg_rtx (SImode); +- +- emit_move_insn (target, op0); +- return target; +- +- case IX86_BUILTIN_RDPMC: +- case IX86_BUILTIN_RDTSC: +- case IX86_BUILTIN_RDTSCP: +- case IX86_BUILTIN_XGETBV: +- +- op0 = gen_reg_rtx (DImode); +- op1 = gen_reg_rtx (DImode); +- +- if (fcode == IX86_BUILTIN_RDPMC) +- { +- arg0 = CALL_EXPR_ARG (exp, 0); +- op2 = expand_normal (arg0); +- if (!register_operand (op2, SImode)) +- op2 = copy_to_mode_reg (SImode, op2); +- +- insn = (TARGET_64BIT +- ? gen_rdpmc_rex64 (op0, op1, op2) +- : gen_rdpmc (op0, op2)); +- emit_insn (insn); +- } +- else if (fcode == IX86_BUILTIN_XGETBV) +- { +- arg0 = CALL_EXPR_ARG (exp, 0); +- op2 = expand_normal (arg0); +- if (!register_operand (op2, SImode)) +- op2 = copy_to_mode_reg (SImode, op2); +- +- insn = (TARGET_64BIT +- ? gen_xgetbv_rex64 (op0, op1, op2) +- : gen_xgetbv (op0, op2)); +- emit_insn (insn); +- } +- else if (fcode == IX86_BUILTIN_RDTSC) +- { +- insn = (TARGET_64BIT +- ? gen_rdtsc_rex64 (op0, op1) +- : gen_rdtsc (op0)); +- emit_insn (insn); +- } +- else +- { +- op2 = gen_reg_rtx (SImode); +- +- insn = (TARGET_64BIT +- ? gen_rdtscp_rex64 (op0, op1, op2) +- : gen_rdtscp (op0, op2)); +- emit_insn (insn); +- +- arg0 = CALL_EXPR_ARG (exp, 0); +- op4 = expand_normal (arg0); +- if (!address_operand (op4, VOIDmode)) +- { +- op4 = convert_memory_address (Pmode, op4); +- op4 = copy_addr_to_reg (op4); +- } +- emit_move_insn (gen_rtx_MEM (SImode, op4), op2); +- } +- +- if (target == 0 +- || !register_operand (target, DImode)) +- target = gen_reg_rtx (DImode); +- +- if (TARGET_64BIT) +- { +- op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32), +- op1, 1, OPTAB_DIRECT); +- op0 = expand_simple_binop (DImode, IOR, op0, op1, +- op0, 1, OPTAB_DIRECT); +- } +- +- emit_move_insn (target, op0); +- return target; +- +- case IX86_BUILTIN_MOVDIR64B: +- +- arg0 = CALL_EXPR_ARG (exp, 0); +- arg1 = CALL_EXPR_ARG (exp, 1); +- op0 = expand_normal (arg0); +- op1 = expand_normal (arg1); +- +- op0 = ix86_zero_extend_to_Pmode (op0); +- if (!address_operand (op1, VOIDmode)) +- { +- op1 = convert_memory_address (Pmode, op1); +- op1 = copy_addr_to_reg (op1); +- } +- op1 = gen_rtx_MEM (XImode, op1); +- +- insn = (TARGET_64BIT +- ? 
gen_movdir64b_di (op0, op1) +- : gen_movdir64b_si (op0, op1)); +- emit_insn (insn); +- return 0; +- +- case IX86_BUILTIN_FXSAVE: +- case IX86_BUILTIN_FXRSTOR: +- case IX86_BUILTIN_FXSAVE64: +- case IX86_BUILTIN_FXRSTOR64: +- case IX86_BUILTIN_FNSTENV: +- case IX86_BUILTIN_FLDENV: +- mode0 = BLKmode; +- switch (fcode) +- { +- case IX86_BUILTIN_FXSAVE: +- icode = CODE_FOR_fxsave; +- break; +- case IX86_BUILTIN_FXRSTOR: +- icode = CODE_FOR_fxrstor; +- break; +- case IX86_BUILTIN_FXSAVE64: +- icode = CODE_FOR_fxsave64; +- break; +- case IX86_BUILTIN_FXRSTOR64: +- icode = CODE_FOR_fxrstor64; +- break; +- case IX86_BUILTIN_FNSTENV: +- icode = CODE_FOR_fnstenv; +- break; +- case IX86_BUILTIN_FLDENV: +- icode = CODE_FOR_fldenv; +- break; +- default: +- gcc_unreachable (); +- } +- +- arg0 = CALL_EXPR_ARG (exp, 0); +- op0 = expand_normal (arg0); +- +- if (!address_operand (op0, VOIDmode)) +- { +- op0 = convert_memory_address (Pmode, op0); +- op0 = copy_addr_to_reg (op0); +- } +- op0 = gen_rtx_MEM (mode0, op0); +- +- pat = GEN_FCN (icode) (op0); +- if (pat) +- emit_insn (pat); +- return 0; +- +- case IX86_BUILTIN_XSETBV: +- arg0 = CALL_EXPR_ARG (exp, 0); +- arg1 = CALL_EXPR_ARG (exp, 1); +- op0 = expand_normal (arg0); +- op1 = expand_normal (arg1); +- +- if (!REG_P (op0)) +- op0 = copy_to_mode_reg (SImode, op0); +- +- op1 = force_reg (DImode, op1); +- +- if (TARGET_64BIT) +- { +- op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32), +- NULL, 1, OPTAB_DIRECT); +- +- icode = CODE_FOR_xsetbv_rex64; +- +- op2 = gen_lowpart (SImode, op2); +- op1 = gen_lowpart (SImode, op1); +- pat = GEN_FCN (icode) (op0, op1, op2); +- } +- else +- { +- icode = CODE_FOR_xsetbv; +- +- pat = GEN_FCN (icode) (op0, op1); +- } +- if (pat) +- emit_insn (pat); +- return 0; +- +- case IX86_BUILTIN_XSAVE: +- case IX86_BUILTIN_XRSTOR: +- case IX86_BUILTIN_XSAVE64: +- case IX86_BUILTIN_XRSTOR64: +- case IX86_BUILTIN_XSAVEOPT: +- case IX86_BUILTIN_XSAVEOPT64: +- case IX86_BUILTIN_XSAVES: +- case IX86_BUILTIN_XRSTORS: +- case IX86_BUILTIN_XSAVES64: +- case IX86_BUILTIN_XRSTORS64: +- case IX86_BUILTIN_XSAVEC: +- case IX86_BUILTIN_XSAVEC64: +- arg0 = CALL_EXPR_ARG (exp, 0); +- arg1 = CALL_EXPR_ARG (exp, 1); +- op0 = expand_normal (arg0); +- op1 = expand_normal (arg1); +- +- if (!address_operand (op0, VOIDmode)) +- { +- op0 = convert_memory_address (Pmode, op0); +- op0 = copy_addr_to_reg (op0); +- } +- op0 = gen_rtx_MEM (BLKmode, op0); +- +- op1 = force_reg (DImode, op1); +- +- if (TARGET_64BIT) +- { +- op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32), +- NULL, 1, OPTAB_DIRECT); +- switch (fcode) +- { +- case IX86_BUILTIN_XSAVE: +- icode = CODE_FOR_xsave_rex64; +- break; +- case IX86_BUILTIN_XRSTOR: +- icode = CODE_FOR_xrstor_rex64; +- break; +- case IX86_BUILTIN_XSAVE64: +- icode = CODE_FOR_xsave64; +- break; +- case IX86_BUILTIN_XRSTOR64: +- icode = CODE_FOR_xrstor64; +- break; +- case IX86_BUILTIN_XSAVEOPT: +- icode = CODE_FOR_xsaveopt_rex64; +- break; +- case IX86_BUILTIN_XSAVEOPT64: +- icode = CODE_FOR_xsaveopt64; +- break; +- case IX86_BUILTIN_XSAVES: +- icode = CODE_FOR_xsaves_rex64; +- break; +- case IX86_BUILTIN_XRSTORS: +- icode = CODE_FOR_xrstors_rex64; +- break; +- case IX86_BUILTIN_XSAVES64: +- icode = CODE_FOR_xsaves64; +- break; +- case IX86_BUILTIN_XRSTORS64: +- icode = CODE_FOR_xrstors64; +- break; +- case IX86_BUILTIN_XSAVEC: +- icode = CODE_FOR_xsavec_rex64; +- break; +- case IX86_BUILTIN_XSAVEC64: +- icode = CODE_FOR_xsavec64; +- break; +- default: +- gcc_unreachable (); +- } +- +- op2 = 
gen_lowpart (SImode, op2); +- op1 = gen_lowpart (SImode, op1); +- pat = GEN_FCN (icode) (op0, op1, op2); +- } +- else +- { +- switch (fcode) +- { +- case IX86_BUILTIN_XSAVE: +- icode = CODE_FOR_xsave; +- break; +- case IX86_BUILTIN_XRSTOR: +- icode = CODE_FOR_xrstor; +- break; +- case IX86_BUILTIN_XSAVEOPT: +- icode = CODE_FOR_xsaveopt; +- break; +- case IX86_BUILTIN_XSAVES: +- icode = CODE_FOR_xsaves; +- break; +- case IX86_BUILTIN_XRSTORS: +- icode = CODE_FOR_xrstors; +- break; +- case IX86_BUILTIN_XSAVEC: +- icode = CODE_FOR_xsavec; +- break; +- default: +- gcc_unreachable (); +- } +- pat = GEN_FCN (icode) (op0, op1); +- } +- +- if (pat) +- emit_insn (pat); +- return 0; +- +- case IX86_BUILTIN_LLWPCB: +- arg0 = CALL_EXPR_ARG (exp, 0); +- op0 = expand_normal (arg0); +- icode = CODE_FOR_lwp_llwpcb; +- if (!insn_data[icode].operand[0].predicate (op0, Pmode)) +- op0 = ix86_zero_extend_to_Pmode (op0); +- emit_insn (gen_lwp_llwpcb (op0)); +- return 0; +- +- case IX86_BUILTIN_SLWPCB: +- icode = CODE_FOR_lwp_slwpcb; +- if (!target +- || !insn_data[icode].operand[0].predicate (target, Pmode)) +- target = gen_reg_rtx (Pmode); +- emit_insn (gen_lwp_slwpcb (target)); +- return target; +- +- case IX86_BUILTIN_BEXTRI32: +- case IX86_BUILTIN_BEXTRI64: +- arg0 = CALL_EXPR_ARG (exp, 0); +- arg1 = CALL_EXPR_ARG (exp, 1); +- op0 = expand_normal (arg0); +- op1 = expand_normal (arg1); +- icode = (fcode == IX86_BUILTIN_BEXTRI32 +- ? CODE_FOR_tbm_bextri_si +- : CODE_FOR_tbm_bextri_di); +- if (!CONST_INT_P (op1)) +- { +- error ("last argument must be an immediate"); +- return const0_rtx; +- } +- else +- { +- unsigned char length = (INTVAL (op1) >> 8) & 0xFF; +- unsigned char lsb_index = INTVAL (op1) & 0xFF; +- op1 = GEN_INT (length); +- op2 = GEN_INT (lsb_index); +- +- mode1 = insn_data[icode].operand[1].mode; +- if (!insn_data[icode].operand[1].predicate (op0, mode1)) +- op0 = copy_to_mode_reg (mode1, op0); +- +- mode0 = insn_data[icode].operand[0].mode; +- if (target == 0 +- || !register_operand (target, mode0)) +- target = gen_reg_rtx (mode0); +- +- pat = GEN_FCN (icode) (target, op0, op1, op2); +- if (pat) +- emit_insn (pat); +- return target; +- } +- +- case IX86_BUILTIN_RDRAND16_STEP: +- icode = CODE_FOR_rdrandhi_1; +- mode0 = HImode; +- goto rdrand_step; +- +- case IX86_BUILTIN_RDRAND32_STEP: +- icode = CODE_FOR_rdrandsi_1; +- mode0 = SImode; +- goto rdrand_step; +- +- case IX86_BUILTIN_RDRAND64_STEP: +- icode = CODE_FOR_rdranddi_1; +- mode0 = DImode; +- +-rdrand_step: +- arg0 = CALL_EXPR_ARG (exp, 0); +- op1 = expand_normal (arg0); +- if (!address_operand (op1, VOIDmode)) +- { +- op1 = convert_memory_address (Pmode, op1); +- op1 = copy_addr_to_reg (op1); +- } +- +- op0 = gen_reg_rtx (mode0); +- emit_insn (GEN_FCN (icode) (op0)); +- +- emit_move_insn (gen_rtx_MEM (mode0, op1), op0); +- +- op1 = gen_reg_rtx (SImode); +- emit_move_insn (op1, CONST1_RTX (SImode)); +- +- /* Emit SImode conditional move. 
*/ +- if (mode0 == HImode) +- { +- if (TARGET_ZERO_EXTEND_WITH_AND +- && optimize_function_for_speed_p (cfun)) +- { +- op2 = force_reg (SImode, const0_rtx); +- +- emit_insn (gen_movstricthi +- (gen_lowpart (HImode, op2), op0)); +- } +- else +- { +- op2 = gen_reg_rtx (SImode); +- +- emit_insn (gen_zero_extendhisi2 (op2, op0)); +- } +- } +- else if (mode0 == SImode) +- op2 = op0; +- else +- op2 = gen_rtx_SUBREG (SImode, op0, 0); +- +- if (target == 0 +- || !register_operand (target, SImode)) +- target = gen_reg_rtx (SImode); +- +- pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG), +- const0_rtx); +- emit_insn (gen_rtx_SET (target, +- gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1))); +- return target; +- +- case IX86_BUILTIN_RDSEED16_STEP: +- icode = CODE_FOR_rdseedhi_1; +- mode0 = HImode; +- goto rdseed_step; +- +- case IX86_BUILTIN_RDSEED32_STEP: +- icode = CODE_FOR_rdseedsi_1; +- mode0 = SImode; +- goto rdseed_step; +- +- case IX86_BUILTIN_RDSEED64_STEP: +- icode = CODE_FOR_rdseeddi_1; +- mode0 = DImode; +- +-rdseed_step: +- arg0 = CALL_EXPR_ARG (exp, 0); +- op1 = expand_normal (arg0); +- if (!address_operand (op1, VOIDmode)) +- { +- op1 = convert_memory_address (Pmode, op1); +- op1 = copy_addr_to_reg (op1); +- } +- +- op0 = gen_reg_rtx (mode0); +- emit_insn (GEN_FCN (icode) (op0)); +- +- emit_move_insn (gen_rtx_MEM (mode0, op1), op0); +- +- op2 = gen_reg_rtx (QImode); +- +- pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG), +- const0_rtx); +- emit_insn (gen_rtx_SET (op2, pat)); +- +- if (target == 0 +- || !register_operand (target, SImode)) +- target = gen_reg_rtx (SImode); +- +- emit_insn (gen_zero_extendqisi2 (target, op2)); +- return target; +- +- case IX86_BUILTIN_SBB32: +- icode = CODE_FOR_subborrowsi; +- icode2 = CODE_FOR_subborrowsi_0; +- mode0 = SImode; +- mode1 = DImode; +- mode2 = CCmode; +- goto handlecarry; +- +- case IX86_BUILTIN_SBB64: +- icode = CODE_FOR_subborrowdi; +- icode2 = CODE_FOR_subborrowdi_0; +- mode0 = DImode; +- mode1 = TImode; +- mode2 = CCmode; +- goto handlecarry; +- +- case IX86_BUILTIN_ADDCARRYX32: +- icode = CODE_FOR_addcarrysi; +- icode2 = CODE_FOR_addcarrysi_0; +- mode0 = SImode; +- mode1 = DImode; +- mode2 = CCCmode; +- goto handlecarry; +- +- case IX86_BUILTIN_ADDCARRYX64: +- icode = CODE_FOR_addcarrydi; +- icode2 = CODE_FOR_addcarrydi_0; +- mode0 = DImode; +- mode1 = TImode; +- mode2 = CCCmode; +- +- handlecarry: +- arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */ +- arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */ +- arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */ +- arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */ +- +- op1 = expand_normal (arg0); +- if (!integer_zerop (arg0)) +- op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1)); +- +- op2 = expand_normal (arg1); +- if (!register_operand (op2, mode0)) +- op2 = copy_to_mode_reg (mode0, op2); +- +- op3 = expand_normal (arg2); +- if (!register_operand (op3, mode0)) +- op3 = copy_to_mode_reg (mode0, op3); +- +- op4 = expand_normal (arg3); +- if (!address_operand (op4, VOIDmode)) +- { +- op4 = convert_memory_address (Pmode, op4); +- op4 = copy_addr_to_reg (op4); +- } +- +- op0 = gen_reg_rtx (mode0); +- if (integer_zerop (arg0)) +- { +- /* If arg0 is 0, optimize right away into add or sub +- instruction that sets CCCmode flags. */ +- op1 = gen_rtx_REG (mode2, FLAGS_REG); +- emit_insn (GEN_FCN (icode2) (op0, op2, op3)); +- } +- else +- { +- /* Generate CF from input operand. 
*/ +- emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx)); +- +- /* Generate instruction that consumes CF. */ +- op1 = gen_rtx_REG (CCCmode, FLAGS_REG); +- pat = gen_rtx_LTU (mode1, op1, const0_rtx); +- pat2 = gen_rtx_LTU (mode0, op1, const0_rtx); +- emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2)); +- } +- +- /* Return current CF value. */ +- if (target == 0) +- target = gen_reg_rtx (QImode); +- +- pat = gen_rtx_LTU (QImode, op1, const0_rtx); +- emit_insn (gen_rtx_SET (target, pat)); +- +- /* Store the result. */ +- emit_move_insn (gen_rtx_MEM (mode0, op4), op0); +- +- return target; +- +- case IX86_BUILTIN_READ_FLAGS: +- emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG))); +- +- if (optimize +- || target == NULL_RTX +- || !nonimmediate_operand (target, word_mode) +- || GET_MODE (target) != word_mode) +- target = gen_reg_rtx (word_mode); +- +- emit_insn (gen_pop (target)); +- return target; +- +- case IX86_BUILTIN_WRITE_FLAGS: +- +- arg0 = CALL_EXPR_ARG (exp, 0); +- op0 = expand_normal (arg0); +- if (!general_no_elim_operand (op0, word_mode)) +- op0 = copy_to_mode_reg (word_mode, op0); +- +- emit_insn (gen_push (op0)); +- emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG))); +- return 0; +- +- case IX86_BUILTIN_KTESTC8: +- icode = CODE_FOR_ktestqi; +- mode3 = CCCmode; +- goto kortest; +- +- case IX86_BUILTIN_KTESTZ8: +- icode = CODE_FOR_ktestqi; +- mode3 = CCZmode; +- goto kortest; +- +- case IX86_BUILTIN_KTESTC16: +- icode = CODE_FOR_ktesthi; +- mode3 = CCCmode; +- goto kortest; +- +- case IX86_BUILTIN_KTESTZ16: +- icode = CODE_FOR_ktesthi; +- mode3 = CCZmode; +- goto kortest; +- +- case IX86_BUILTIN_KTESTC32: +- icode = CODE_FOR_ktestsi; +- mode3 = CCCmode; +- goto kortest; +- +- case IX86_BUILTIN_KTESTZ32: +- icode = CODE_FOR_ktestsi; +- mode3 = CCZmode; +- goto kortest; +- +- case IX86_BUILTIN_KTESTC64: +- icode = CODE_FOR_ktestdi; +- mode3 = CCCmode; +- goto kortest; +- +- case IX86_BUILTIN_KTESTZ64: +- icode = CODE_FOR_ktestdi; +- mode3 = CCZmode; +- goto kortest; +- +- case IX86_BUILTIN_KORTESTC8: +- icode = CODE_FOR_kortestqi; +- mode3 = CCCmode; +- goto kortest; +- +- case IX86_BUILTIN_KORTESTZ8: +- icode = CODE_FOR_kortestqi; +- mode3 = CCZmode; +- goto kortest; +- +- case IX86_BUILTIN_KORTESTC16: +- icode = CODE_FOR_kortesthi; +- mode3 = CCCmode; +- goto kortest; +- +- case IX86_BUILTIN_KORTESTZ16: +- icode = CODE_FOR_kortesthi; +- mode3 = CCZmode; +- goto kortest; +- +- case IX86_BUILTIN_KORTESTC32: +- icode = CODE_FOR_kortestsi; +- mode3 = CCCmode; +- goto kortest; +- +- case IX86_BUILTIN_KORTESTZ32: +- icode = CODE_FOR_kortestsi; +- mode3 = CCZmode; +- goto kortest; +- +- case IX86_BUILTIN_KORTESTC64: +- icode = CODE_FOR_kortestdi; +- mode3 = CCCmode; +- goto kortest; +- +- case IX86_BUILTIN_KORTESTZ64: +- icode = CODE_FOR_kortestdi; +- mode3 = CCZmode; +- +- kortest: +- arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */ +- arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. 
*/ +- op0 = expand_normal (arg0); +- op1 = expand_normal (arg1); +- +- mode0 = insn_data[icode].operand[0].mode; +- mode1 = insn_data[icode].operand[1].mode; +- +- if (GET_MODE (op0) != VOIDmode) +- op0 = force_reg (GET_MODE (op0), op0); +- +- op0 = gen_lowpart (mode0, op0); +- +- if (!insn_data[icode].operand[0].predicate (op0, mode0)) +- op0 = copy_to_mode_reg (mode0, op0); +- +- if (GET_MODE (op1) != VOIDmode) +- op1 = force_reg (GET_MODE (op1), op1); +- +- op1 = gen_lowpart (mode1, op1); +- +- if (!insn_data[icode].operand[1].predicate (op1, mode1)) +- op1 = copy_to_mode_reg (mode1, op1); +- +- target = gen_reg_rtx (QImode); +- +- /* Emit kortest. */ +- emit_insn (GEN_FCN (icode) (op0, op1)); +- /* And use setcc to return result from flags. */ +- ix86_expand_setcc (target, EQ, +- gen_rtx_REG (mode3, FLAGS_REG), const0_rtx); +- return target; +- +- case IX86_BUILTIN_GATHERSIV2DF: +- icode = CODE_FOR_avx2_gathersiv2df; +- goto gather_gen; +- case IX86_BUILTIN_GATHERSIV4DF: +- icode = CODE_FOR_avx2_gathersiv4df; +- goto gather_gen; +- case IX86_BUILTIN_GATHERDIV2DF: +- icode = CODE_FOR_avx2_gatherdiv2df; +- goto gather_gen; +- case IX86_BUILTIN_GATHERDIV4DF: +- icode = CODE_FOR_avx2_gatherdiv4df; +- goto gather_gen; +- case IX86_BUILTIN_GATHERSIV4SF: +- icode = CODE_FOR_avx2_gathersiv4sf; +- goto gather_gen; +- case IX86_BUILTIN_GATHERSIV8SF: +- icode = CODE_FOR_avx2_gathersiv8sf; +- goto gather_gen; +- case IX86_BUILTIN_GATHERDIV4SF: +- icode = CODE_FOR_avx2_gatherdiv4sf; +- goto gather_gen; +- case IX86_BUILTIN_GATHERDIV8SF: +- icode = CODE_FOR_avx2_gatherdiv8sf; +- goto gather_gen; +- case IX86_BUILTIN_GATHERSIV2DI: +- icode = CODE_FOR_avx2_gathersiv2di; +- goto gather_gen; +- case IX86_BUILTIN_GATHERSIV4DI: +- icode = CODE_FOR_avx2_gathersiv4di; +- goto gather_gen; +- case IX86_BUILTIN_GATHERDIV2DI: +- icode = CODE_FOR_avx2_gatherdiv2di; +- goto gather_gen; +- case IX86_BUILTIN_GATHERDIV4DI: +- icode = CODE_FOR_avx2_gatherdiv4di; +- goto gather_gen; +- case IX86_BUILTIN_GATHERSIV4SI: +- icode = CODE_FOR_avx2_gathersiv4si; +- goto gather_gen; +- case IX86_BUILTIN_GATHERSIV8SI: +- icode = CODE_FOR_avx2_gathersiv8si; +- goto gather_gen; +- case IX86_BUILTIN_GATHERDIV4SI: +- icode = CODE_FOR_avx2_gatherdiv4si; +- goto gather_gen; +- case IX86_BUILTIN_GATHERDIV8SI: +- icode = CODE_FOR_avx2_gatherdiv8si; +- goto gather_gen; +- case IX86_BUILTIN_GATHERALTSIV4DF: +- icode = CODE_FOR_avx2_gathersiv4df; +- goto gather_gen; +- case IX86_BUILTIN_GATHERALTDIV8SF: +- icode = CODE_FOR_avx2_gatherdiv8sf; +- goto gather_gen; +- case IX86_BUILTIN_GATHERALTSIV4DI: +- icode = CODE_FOR_avx2_gathersiv4di; +- goto gather_gen; +- case IX86_BUILTIN_GATHERALTDIV8SI: +- icode = CODE_FOR_avx2_gatherdiv8si; +- goto gather_gen; +- case IX86_BUILTIN_GATHER3SIV16SF: +- icode = CODE_FOR_avx512f_gathersiv16sf; +- goto gather_gen; +- case IX86_BUILTIN_GATHER3SIV8DF: +- icode = CODE_FOR_avx512f_gathersiv8df; +- goto gather_gen; +- case IX86_BUILTIN_GATHER3DIV16SF: +- icode = CODE_FOR_avx512f_gatherdiv16sf; +- goto gather_gen; +- case IX86_BUILTIN_GATHER3DIV8DF: +- icode = CODE_FOR_avx512f_gatherdiv8df; +- goto gather_gen; +- case IX86_BUILTIN_GATHER3SIV16SI: +- icode = CODE_FOR_avx512f_gathersiv16si; +- goto gather_gen; +- case IX86_BUILTIN_GATHER3SIV8DI: +- icode = CODE_FOR_avx512f_gathersiv8di; +- goto gather_gen; +- case IX86_BUILTIN_GATHER3DIV16SI: +- icode = CODE_FOR_avx512f_gatherdiv16si; +- goto gather_gen; +- case IX86_BUILTIN_GATHER3DIV8DI: +- icode = CODE_FOR_avx512f_gatherdiv8di; +- goto gather_gen; +- case 
IX86_BUILTIN_GATHER3ALTSIV8DF: +- icode = CODE_FOR_avx512f_gathersiv8df; +- goto gather_gen; +- case IX86_BUILTIN_GATHER3ALTDIV16SF: +- icode = CODE_FOR_avx512f_gatherdiv16sf; +- goto gather_gen; +- case IX86_BUILTIN_GATHER3ALTSIV8DI: +- icode = CODE_FOR_avx512f_gathersiv8di; +- goto gather_gen; +- case IX86_BUILTIN_GATHER3ALTDIV16SI: +- icode = CODE_FOR_avx512f_gatherdiv16si; +- goto gather_gen; +- case IX86_BUILTIN_GATHER3SIV2DF: +- icode = CODE_FOR_avx512vl_gathersiv2df; +- goto gather_gen; +- case IX86_BUILTIN_GATHER3SIV4DF: +- icode = CODE_FOR_avx512vl_gathersiv4df; +- goto gather_gen; +- case IX86_BUILTIN_GATHER3DIV2DF: +- icode = CODE_FOR_avx512vl_gatherdiv2df; +- goto gather_gen; +- case IX86_BUILTIN_GATHER3DIV4DF: +- icode = CODE_FOR_avx512vl_gatherdiv4df; +- goto gather_gen; +- case IX86_BUILTIN_GATHER3SIV4SF: +- icode = CODE_FOR_avx512vl_gathersiv4sf; +- goto gather_gen; +- case IX86_BUILTIN_GATHER3SIV8SF: +- icode = CODE_FOR_avx512vl_gathersiv8sf; +- goto gather_gen; +- case IX86_BUILTIN_GATHER3DIV4SF: +- icode = CODE_FOR_avx512vl_gatherdiv4sf; +- goto gather_gen; +- case IX86_BUILTIN_GATHER3DIV8SF: +- icode = CODE_FOR_avx512vl_gatherdiv8sf; +- goto gather_gen; +- case IX86_BUILTIN_GATHER3SIV2DI: +- icode = CODE_FOR_avx512vl_gathersiv2di; +- goto gather_gen; +- case IX86_BUILTIN_GATHER3SIV4DI: +- icode = CODE_FOR_avx512vl_gathersiv4di; +- goto gather_gen; +- case IX86_BUILTIN_GATHER3DIV2DI: +- icode = CODE_FOR_avx512vl_gatherdiv2di; +- goto gather_gen; +- case IX86_BUILTIN_GATHER3DIV4DI: +- icode = CODE_FOR_avx512vl_gatherdiv4di; +- goto gather_gen; +- case IX86_BUILTIN_GATHER3SIV4SI: +- icode = CODE_FOR_avx512vl_gathersiv4si; +- goto gather_gen; +- case IX86_BUILTIN_GATHER3SIV8SI: +- icode = CODE_FOR_avx512vl_gathersiv8si; +- goto gather_gen; +- case IX86_BUILTIN_GATHER3DIV4SI: +- icode = CODE_FOR_avx512vl_gatherdiv4si; +- goto gather_gen; +- case IX86_BUILTIN_GATHER3DIV8SI: +- icode = CODE_FOR_avx512vl_gatherdiv8si; +- goto gather_gen; +- case IX86_BUILTIN_GATHER3ALTSIV4DF: +- icode = CODE_FOR_avx512vl_gathersiv4df; +- goto gather_gen; +- case IX86_BUILTIN_GATHER3ALTDIV8SF: +- icode = CODE_FOR_avx512vl_gatherdiv8sf; +- goto gather_gen; +- case IX86_BUILTIN_GATHER3ALTSIV4DI: +- icode = CODE_FOR_avx512vl_gathersiv4di; +- goto gather_gen; +- case IX86_BUILTIN_GATHER3ALTDIV8SI: +- icode = CODE_FOR_avx512vl_gatherdiv8si; +- goto gather_gen; +- case IX86_BUILTIN_SCATTERSIV16SF: +- icode = CODE_FOR_avx512f_scattersiv16sf; +- goto scatter_gen; +- case IX86_BUILTIN_SCATTERSIV8DF: +- icode = CODE_FOR_avx512f_scattersiv8df; +- goto scatter_gen; +- case IX86_BUILTIN_SCATTERDIV16SF: +- icode = CODE_FOR_avx512f_scatterdiv16sf; +- goto scatter_gen; +- case IX86_BUILTIN_SCATTERDIV8DF: +- icode = CODE_FOR_avx512f_scatterdiv8df; +- goto scatter_gen; +- case IX86_BUILTIN_SCATTERSIV16SI: +- icode = CODE_FOR_avx512f_scattersiv16si; +- goto scatter_gen; +- case IX86_BUILTIN_SCATTERSIV8DI: +- icode = CODE_FOR_avx512f_scattersiv8di; +- goto scatter_gen; +- case IX86_BUILTIN_SCATTERDIV16SI: +- icode = CODE_FOR_avx512f_scatterdiv16si; +- goto scatter_gen; +- case IX86_BUILTIN_SCATTERDIV8DI: +- icode = CODE_FOR_avx512f_scatterdiv8di; +- goto scatter_gen; +- case IX86_BUILTIN_SCATTERSIV8SF: +- icode = CODE_FOR_avx512vl_scattersiv8sf; +- goto scatter_gen; +- case IX86_BUILTIN_SCATTERSIV4SF: +- icode = CODE_FOR_avx512vl_scattersiv4sf; +- goto scatter_gen; +- case IX86_BUILTIN_SCATTERSIV4DF: +- icode = CODE_FOR_avx512vl_scattersiv4df; +- goto scatter_gen; +- case IX86_BUILTIN_SCATTERSIV2DF: +- icode = 
CODE_FOR_avx512vl_scattersiv2df; +- goto scatter_gen; +- case IX86_BUILTIN_SCATTERDIV8SF: +- icode = CODE_FOR_avx512vl_scatterdiv8sf; +- goto scatter_gen; +- case IX86_BUILTIN_SCATTERDIV4SF: +- icode = CODE_FOR_avx512vl_scatterdiv4sf; +- goto scatter_gen; +- case IX86_BUILTIN_SCATTERDIV4DF: +- icode = CODE_FOR_avx512vl_scatterdiv4df; +- goto scatter_gen; +- case IX86_BUILTIN_SCATTERDIV2DF: +- icode = CODE_FOR_avx512vl_scatterdiv2df; +- goto scatter_gen; +- case IX86_BUILTIN_SCATTERSIV8SI: +- icode = CODE_FOR_avx512vl_scattersiv8si; +- goto scatter_gen; +- case IX86_BUILTIN_SCATTERSIV4SI: +- icode = CODE_FOR_avx512vl_scattersiv4si; +- goto scatter_gen; +- case IX86_BUILTIN_SCATTERSIV4DI: +- icode = CODE_FOR_avx512vl_scattersiv4di; +- goto scatter_gen; +- case IX86_BUILTIN_SCATTERSIV2DI: +- icode = CODE_FOR_avx512vl_scattersiv2di; +- goto scatter_gen; +- case IX86_BUILTIN_SCATTERDIV8SI: +- icode = CODE_FOR_avx512vl_scatterdiv8si; +- goto scatter_gen; +- case IX86_BUILTIN_SCATTERDIV4SI: +- icode = CODE_FOR_avx512vl_scatterdiv4si; +- goto scatter_gen; +- case IX86_BUILTIN_SCATTERDIV4DI: +- icode = CODE_FOR_avx512vl_scatterdiv4di; +- goto scatter_gen; +- case IX86_BUILTIN_SCATTERDIV2DI: +- icode = CODE_FOR_avx512vl_scatterdiv2di; +- goto scatter_gen; +- case IX86_BUILTIN_GATHERPFDPD: +- icode = CODE_FOR_avx512pf_gatherpfv8sidf; +- goto vec_prefetch_gen; +- case IX86_BUILTIN_SCATTERALTSIV8DF: +- icode = CODE_FOR_avx512f_scattersiv8df; +- goto scatter_gen; +- case IX86_BUILTIN_SCATTERALTDIV16SF: +- icode = CODE_FOR_avx512f_scatterdiv16sf; +- goto scatter_gen; +- case IX86_BUILTIN_SCATTERALTSIV8DI: +- icode = CODE_FOR_avx512f_scattersiv8di; +- goto scatter_gen; +- case IX86_BUILTIN_SCATTERALTDIV16SI: +- icode = CODE_FOR_avx512f_scatterdiv16si; +- goto scatter_gen; +- case IX86_BUILTIN_SCATTERALTSIV4DF: +- icode = CODE_FOR_avx512vl_scattersiv4df; +- goto scatter_gen; +- case IX86_BUILTIN_SCATTERALTDIV8SF: +- icode = CODE_FOR_avx512vl_scatterdiv8sf; +- goto scatter_gen; +- case IX86_BUILTIN_SCATTERALTSIV4DI: +- icode = CODE_FOR_avx512vl_scattersiv4di; +- goto scatter_gen; +- case IX86_BUILTIN_SCATTERALTDIV8SI: +- icode = CODE_FOR_avx512vl_scatterdiv8si; +- goto scatter_gen; +- case IX86_BUILTIN_SCATTERALTSIV2DF: +- icode = CODE_FOR_avx512vl_scattersiv2df; +- goto scatter_gen; +- case IX86_BUILTIN_SCATTERALTDIV4SF: +- icode = CODE_FOR_avx512vl_scatterdiv4sf; +- goto scatter_gen; +- case IX86_BUILTIN_SCATTERALTSIV2DI: +- icode = CODE_FOR_avx512vl_scattersiv2di; +- goto scatter_gen; +- case IX86_BUILTIN_SCATTERALTDIV4SI: +- icode = CODE_FOR_avx512vl_scatterdiv4si; +- goto scatter_gen; +- case IX86_BUILTIN_GATHERPFDPS: +- icode = CODE_FOR_avx512pf_gatherpfv16sisf; +- goto vec_prefetch_gen; +- case IX86_BUILTIN_GATHERPFQPD: +- icode = CODE_FOR_avx512pf_gatherpfv8didf; +- goto vec_prefetch_gen; +- case IX86_BUILTIN_GATHERPFQPS: +- icode = CODE_FOR_avx512pf_gatherpfv8disf; +- goto vec_prefetch_gen; +- case IX86_BUILTIN_SCATTERPFDPD: +- icode = CODE_FOR_avx512pf_scatterpfv8sidf; +- goto vec_prefetch_gen; +- case IX86_BUILTIN_SCATTERPFDPS: +- icode = CODE_FOR_avx512pf_scatterpfv16sisf; +- goto vec_prefetch_gen; +- case IX86_BUILTIN_SCATTERPFQPD: +- icode = CODE_FOR_avx512pf_scatterpfv8didf; +- goto vec_prefetch_gen; +- case IX86_BUILTIN_SCATTERPFQPS: +- icode = CODE_FOR_avx512pf_scatterpfv8disf; +- goto vec_prefetch_gen; +- +- gather_gen: +- rtx half; +- rtx (*gen) (rtx, rtx); +- +- arg0 = CALL_EXPR_ARG (exp, 0); +- arg1 = CALL_EXPR_ARG (exp, 1); +- arg2 = CALL_EXPR_ARG (exp, 2); +- arg3 = CALL_EXPR_ARG 
(exp, 3); +- arg4 = CALL_EXPR_ARG (exp, 4); +- op0 = expand_normal (arg0); +- op1 = expand_normal (arg1); +- op2 = expand_normal (arg2); +- op3 = expand_normal (arg3); +- op4 = expand_normal (arg4); +- /* Note the arg order is different from the operand order. */ +- mode0 = insn_data[icode].operand[1].mode; +- mode2 = insn_data[icode].operand[3].mode; +- mode3 = insn_data[icode].operand[4].mode; +- mode4 = insn_data[icode].operand[5].mode; +- +- if (target == NULL_RTX +- || GET_MODE (target) != insn_data[icode].operand[0].mode +- || !insn_data[icode].operand[0].predicate (target, +- GET_MODE (target))) +- subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode); +- else +- subtarget = target; +- +- switch (fcode) +- { +- case IX86_BUILTIN_GATHER3ALTSIV8DF: +- case IX86_BUILTIN_GATHER3ALTSIV8DI: +- half = gen_reg_rtx (V8SImode); +- if (!nonimmediate_operand (op2, V16SImode)) +- op2 = copy_to_mode_reg (V16SImode, op2); +- emit_insn (gen_vec_extract_lo_v16si (half, op2)); +- op2 = half; +- break; +- case IX86_BUILTIN_GATHER3ALTSIV4DF: +- case IX86_BUILTIN_GATHER3ALTSIV4DI: +- case IX86_BUILTIN_GATHERALTSIV4DF: +- case IX86_BUILTIN_GATHERALTSIV4DI: +- half = gen_reg_rtx (V4SImode); +- if (!nonimmediate_operand (op2, V8SImode)) +- op2 = copy_to_mode_reg (V8SImode, op2); +- emit_insn (gen_vec_extract_lo_v8si (half, op2)); +- op2 = half; +- break; +- case IX86_BUILTIN_GATHER3ALTDIV16SF: +- case IX86_BUILTIN_GATHER3ALTDIV16SI: +- half = gen_reg_rtx (mode0); +- if (mode0 == V8SFmode) +- gen = gen_vec_extract_lo_v16sf; +- else +- gen = gen_vec_extract_lo_v16si; +- if (!nonimmediate_operand (op0, GET_MODE (op0))) +- op0 = copy_to_mode_reg (GET_MODE (op0), op0); +- emit_insn (gen (half, op0)); +- op0 = half; +- op3 = lowpart_subreg (QImode, op3, HImode); +- break; +- case IX86_BUILTIN_GATHER3ALTDIV8SF: +- case IX86_BUILTIN_GATHER3ALTDIV8SI: +- case IX86_BUILTIN_GATHERALTDIV8SF: +- case IX86_BUILTIN_GATHERALTDIV8SI: +- half = gen_reg_rtx (mode0); +- if (mode0 == V4SFmode) +- gen = gen_vec_extract_lo_v8sf; +- else +- gen = gen_vec_extract_lo_v8si; +- if (!nonimmediate_operand (op0, GET_MODE (op0))) +- op0 = copy_to_mode_reg (GET_MODE (op0), op0); +- emit_insn (gen (half, op0)); +- op0 = half; +- if (VECTOR_MODE_P (GET_MODE (op3))) +- { +- half = gen_reg_rtx (mode0); +- if (!nonimmediate_operand (op3, GET_MODE (op3))) +- op3 = copy_to_mode_reg (GET_MODE (op3), op3); +- emit_insn (gen (half, op3)); +- op3 = half; +- } +- break; +- default: +- break; +- } +- +- /* Force memory operand only with base register here. But we +- don't want to do it on memory operand for other builtin +- functions. */ +- op1 = ix86_zero_extend_to_Pmode (op1); +- +- if (!insn_data[icode].operand[1].predicate (op0, mode0)) +- op0 = copy_to_mode_reg (mode0, op0); +- if (!insn_data[icode].operand[2].predicate (op1, Pmode)) +- op1 = copy_to_mode_reg (Pmode, op1); +- if (!insn_data[icode].operand[3].predicate (op2, mode2)) +- op2 = copy_to_mode_reg (mode2, op2); +- +- op3 = fixup_modeless_constant (op3, mode3); +- +- if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode) +- { +- if (!insn_data[icode].operand[4].predicate (op3, mode3)) +- op3 = copy_to_mode_reg (mode3, op3); +- } +- else +- { +- op3 = copy_to_reg (op3); +- op3 = lowpart_subreg (mode3, op3, GET_MODE (op3)); +- } +- if (!insn_data[icode].operand[5].predicate (op4, mode4)) +- { +- error ("the last argument must be scale 1, 2, 4, 8"); +- return const0_rtx; +- } +- +- /* Optimize. 
If mask is known to have all high bits set, +- replace op0 with pc_rtx to signal that the instruction +- overwrites the whole destination and doesn't use its +- previous contents. */ +- if (optimize) +- { +- if (TREE_CODE (arg3) == INTEGER_CST) +- { +- if (integer_all_onesp (arg3)) +- op0 = pc_rtx; +- } +- else if (TREE_CODE (arg3) == VECTOR_CST) +- { +- unsigned int negative = 0; +- for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i) +- { +- tree cst = VECTOR_CST_ELT (arg3, i); +- if (TREE_CODE (cst) == INTEGER_CST +- && tree_int_cst_sign_bit (cst)) +- negative++; +- else if (TREE_CODE (cst) == REAL_CST +- && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst))) +- negative++; +- } +- if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3))) +- op0 = pc_rtx; +- } +- else if (TREE_CODE (arg3) == SSA_NAME +- && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE) +- { +- /* Recognize also when mask is like: +- __v2df src = _mm_setzero_pd (); +- __v2df mask = _mm_cmpeq_pd (src, src); +- or +- __v8sf src = _mm256_setzero_ps (); +- __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ); +- as that is a cheaper way to load all ones into +- a register than having to load a constant from +- memory. */ +- gimple *def_stmt = SSA_NAME_DEF_STMT (arg3); +- if (is_gimple_call (def_stmt)) +- { +- tree fndecl = gimple_call_fndecl (def_stmt); +- if (fndecl +- && fndecl_built_in_p (fndecl, BUILT_IN_MD)) +- switch ((unsigned int) DECL_FUNCTION_CODE (fndecl)) +- { +- case IX86_BUILTIN_CMPPD: +- case IX86_BUILTIN_CMPPS: +- case IX86_BUILTIN_CMPPD256: +- case IX86_BUILTIN_CMPPS256: +- if (!integer_zerop (gimple_call_arg (def_stmt, 2))) +- break; +- /* FALLTHRU */ +- case IX86_BUILTIN_CMPEQPD: +- case IX86_BUILTIN_CMPEQPS: +- if (initializer_zerop (gimple_call_arg (def_stmt, 0)) +- && initializer_zerop (gimple_call_arg (def_stmt, +- 1))) +- op0 = pc_rtx; +- break; +- default: +- break; +- } +- } +- } +- } +- +- pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4); +- if (! pat) +- return const0_rtx; +- emit_insn (pat); +- +- switch (fcode) +- { +- case IX86_BUILTIN_GATHER3DIV16SF: +- if (target == NULL_RTX) +- target = gen_reg_rtx (V8SFmode); +- emit_insn (gen_vec_extract_lo_v16sf (target, subtarget)); +- break; +- case IX86_BUILTIN_GATHER3DIV16SI: +- if (target == NULL_RTX) +- target = gen_reg_rtx (V8SImode); +- emit_insn (gen_vec_extract_lo_v16si (target, subtarget)); +- break; +- case IX86_BUILTIN_GATHER3DIV8SF: +- case IX86_BUILTIN_GATHERDIV8SF: +- if (target == NULL_RTX) +- target = gen_reg_rtx (V4SFmode); +- emit_insn (gen_vec_extract_lo_v8sf (target, subtarget)); +- break; +- case IX86_BUILTIN_GATHER3DIV8SI: +- case IX86_BUILTIN_GATHERDIV8SI: +- if (target == NULL_RTX) +- target = gen_reg_rtx (V4SImode); +- emit_insn (gen_vec_extract_lo_v8si (target, subtarget)); +- break; +- default: +- target = subtarget; +- break; +- } +- return target; +- +- scatter_gen: +- arg0 = CALL_EXPR_ARG (exp, 0); +- arg1 = CALL_EXPR_ARG (exp, 1); +- arg2 = CALL_EXPR_ARG (exp, 2); +- arg3 = CALL_EXPR_ARG (exp, 3); +- arg4 = CALL_EXPR_ARG (exp, 4); +- op0 = expand_normal (arg0); +- op1 = expand_normal (arg1); +- op2 = expand_normal (arg2); +- op3 = expand_normal (arg3); +- op4 = expand_normal (arg4); +- mode1 = insn_data[icode].operand[1].mode; +- mode2 = insn_data[icode].operand[2].mode; +- mode3 = insn_data[icode].operand[3].mode; +- mode4 = insn_data[icode].operand[4].mode; +- +- /* Scatter instruction stores operand op3 to memory with +- indices from op2 and scale from op4 under writemask op1. 
+- If index operand op2 has more elements then source operand +- op3 one need to use only its low half. And vice versa. */ +- switch (fcode) +- { +- case IX86_BUILTIN_SCATTERALTSIV8DF: +- case IX86_BUILTIN_SCATTERALTSIV8DI: +- half = gen_reg_rtx (V8SImode); +- if (!nonimmediate_operand (op2, V16SImode)) +- op2 = copy_to_mode_reg (V16SImode, op2); +- emit_insn (gen_vec_extract_lo_v16si (half, op2)); +- op2 = half; +- break; +- case IX86_BUILTIN_SCATTERALTDIV16SF: +- case IX86_BUILTIN_SCATTERALTDIV16SI: +- half = gen_reg_rtx (mode3); +- if (mode3 == V8SFmode) +- gen = gen_vec_extract_lo_v16sf; +- else +- gen = gen_vec_extract_lo_v16si; +- if (!nonimmediate_operand (op3, GET_MODE (op3))) +- op3 = copy_to_mode_reg (GET_MODE (op3), op3); +- emit_insn (gen (half, op3)); +- op3 = half; +- break; +- case IX86_BUILTIN_SCATTERALTSIV4DF: +- case IX86_BUILTIN_SCATTERALTSIV4DI: +- half = gen_reg_rtx (V4SImode); +- if (!nonimmediate_operand (op2, V8SImode)) +- op2 = copy_to_mode_reg (V8SImode, op2); +- emit_insn (gen_vec_extract_lo_v8si (half, op2)); +- op2 = half; +- break; +- case IX86_BUILTIN_SCATTERALTDIV8SF: +- case IX86_BUILTIN_SCATTERALTDIV8SI: +- half = gen_reg_rtx (mode3); +- if (mode3 == V4SFmode) +- gen = gen_vec_extract_lo_v8sf; +- else +- gen = gen_vec_extract_lo_v8si; +- if (!nonimmediate_operand (op3, GET_MODE (op3))) +- op3 = copy_to_mode_reg (GET_MODE (op3), op3); +- emit_insn (gen (half, op3)); +- op3 = half; +- break; +- case IX86_BUILTIN_SCATTERALTSIV2DF: +- case IX86_BUILTIN_SCATTERALTSIV2DI: +- if (!nonimmediate_operand (op2, V4SImode)) +- op2 = copy_to_mode_reg (V4SImode, op2); +- break; +- case IX86_BUILTIN_SCATTERALTDIV4SF: +- case IX86_BUILTIN_SCATTERALTDIV4SI: +- if (!nonimmediate_operand (op3, GET_MODE (op3))) +- op3 = copy_to_mode_reg (GET_MODE (op3), op3); +- break; +- default: +- break; +- } +- +- /* Force memory operand only with base register here. But we +- don't want to do it on memory operand for other builtin +- functions. */ +- op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1)); +- +- if (!insn_data[icode].operand[0].predicate (op0, Pmode)) +- op0 = copy_to_mode_reg (Pmode, op0); +- +- op1 = fixup_modeless_constant (op1, mode1); +- +- if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode) +- { +- if (!insn_data[icode].operand[1].predicate (op1, mode1)) +- op1 = copy_to_mode_reg (mode1, op1); +- } +- else +- { +- op1 = copy_to_reg (op1); +- op1 = lowpart_subreg (mode1, op1, GET_MODE (op1)); +- } +- +- if (!insn_data[icode].operand[2].predicate (op2, mode2)) +- op2 = copy_to_mode_reg (mode2, op2); +- +- if (!insn_data[icode].operand[3].predicate (op3, mode3)) +- op3 = copy_to_mode_reg (mode3, op3); +- +- if (!insn_data[icode].operand[4].predicate (op4, mode4)) +- { +- error ("the last argument must be scale 1, 2, 4, 8"); +- return const0_rtx; +- } +- +- pat = GEN_FCN (icode) (op0, op1, op2, op3, op4); +- if (! 
pat) +- return const0_rtx; +- +- emit_insn (pat); +- return 0; +- +- vec_prefetch_gen: +- arg0 = CALL_EXPR_ARG (exp, 0); +- arg1 = CALL_EXPR_ARG (exp, 1); +- arg2 = CALL_EXPR_ARG (exp, 2); +- arg3 = CALL_EXPR_ARG (exp, 3); +- arg4 = CALL_EXPR_ARG (exp, 4); +- op0 = expand_normal (arg0); +- op1 = expand_normal (arg1); +- op2 = expand_normal (arg2); +- op3 = expand_normal (arg3); +- op4 = expand_normal (arg4); +- mode0 = insn_data[icode].operand[0].mode; +- mode1 = insn_data[icode].operand[1].mode; +- mode3 = insn_data[icode].operand[3].mode; +- mode4 = insn_data[icode].operand[4].mode; +- +- op0 = fixup_modeless_constant (op0, mode0); +- +- if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode) +- { +- if (!insn_data[icode].operand[0].predicate (op0, mode0)) +- op0 = copy_to_mode_reg (mode0, op0); +- } +- else +- { +- op0 = copy_to_reg (op0); +- op0 = lowpart_subreg (mode0, op0, GET_MODE (op0)); +- } +- +- if (!insn_data[icode].operand[1].predicate (op1, mode1)) +- op1 = copy_to_mode_reg (mode1, op1); +- +- /* Force memory operand only with base register here. But we +- don't want to do it on memory operand for other builtin +- functions. */ +- op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1)); +- +- if (!insn_data[icode].operand[2].predicate (op2, Pmode)) +- op2 = copy_to_mode_reg (Pmode, op2); +- +- if (!insn_data[icode].operand[3].predicate (op3, mode3)) +- { +- error ("the forth argument must be scale 1, 2, 4, 8"); +- return const0_rtx; +- } +- +- if (!insn_data[icode].operand[4].predicate (op4, mode4)) +- { +- error ("incorrect hint operand"); +- return const0_rtx; +- } +- +- pat = GEN_FCN (icode) (op0, op1, op2, op3, op4); +- if (! pat) +- return const0_rtx; +- +- emit_insn (pat); +- +- return 0; +- +- case IX86_BUILTIN_XABORT: +- icode = CODE_FOR_xabort; +- arg0 = CALL_EXPR_ARG (exp, 0); +- op0 = expand_normal (arg0); +- mode0 = insn_data[icode].operand[0].mode; +- if (!insn_data[icode].operand[0].predicate (op0, mode0)) +- { +- error ("the argument to %<xabort%> intrinsic must " +- "be an 8-bit immediate"); +- return const0_rtx; +- } +- emit_insn (gen_xabort (op0)); +- return 0; +- +- case IX86_BUILTIN_RSTORSSP: +- case IX86_BUILTIN_CLRSSBSY: +- arg0 = CALL_EXPR_ARG (exp, 0); +- op0 = expand_normal (arg0); +- icode = (fcode == IX86_BUILTIN_RSTORSSP +- ?
CODE_FOR_rstorssp +- : CODE_FOR_clrssbsy); +- if (!address_operand (op0, VOIDmode)) +- { +- op1 = convert_memory_address (Pmode, op0); +- op0 = copy_addr_to_reg (op1); +- } +- emit_insn (GEN_FCN (icode) (gen_rtx_MEM (Pmode, op0))); +- return 0; +- +- case IX86_BUILTIN_WRSSD: +- case IX86_BUILTIN_WRSSQ: +- case IX86_BUILTIN_WRUSSD: +- case IX86_BUILTIN_WRUSSQ: +- arg0 = CALL_EXPR_ARG (exp, 0); +- op0 = expand_normal (arg0); +- arg1 = CALL_EXPR_ARG (exp, 1); +- op1 = expand_normal (arg1); +- switch (fcode) +- { +- case IX86_BUILTIN_WRSSD: +- icode = CODE_FOR_wrsssi; +- mode = SImode; +- break; +- case IX86_BUILTIN_WRSSQ: +- icode = CODE_FOR_wrssdi; +- mode = DImode; +- break; +- case IX86_BUILTIN_WRUSSD: +- icode = CODE_FOR_wrusssi; +- mode = SImode; +- break; +- case IX86_BUILTIN_WRUSSQ: +- icode = CODE_FOR_wrussdi; +- mode = DImode; +- break; +- } +- op0 = force_reg (mode, op0); +- if (!address_operand (op1, VOIDmode)) +- { +- op2 = convert_memory_address (Pmode, op1); +- op1 = copy_addr_to_reg (op2); +- } +- emit_insn (GEN_FCN (icode) (op0, gen_rtx_MEM (mode, op1))); +- return 0; +- +- default: +- break; +- } +- +- if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST +- && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST) +- { +- i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST; +- return ix86_expand_special_args_builtin (bdesc_special_args + i, exp, +- target); +- } +- +- if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST +- && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST) +- { +- i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST; +- rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL; +- rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx); +- rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx); +- int masked = 1; +- machine_mode mode, wide_mode, nar_mode; +- +- nar_mode = V4SFmode; +- mode = V16SFmode; +- wide_mode = V64SFmode; +- fcn_mask = gen_avx5124fmaddps_4fmaddps_mask; +- fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz; +- +- switch (fcode) +- { +- case IX86_BUILTIN_4FMAPS: +- fcn = gen_avx5124fmaddps_4fmaddps; +- masked = 0; +- goto v4fma_expand; +- +- case IX86_BUILTIN_4DPWSSD: +- nar_mode = V4SImode; +- mode = V16SImode; +- wide_mode = V64SImode; +- fcn = gen_avx5124vnniw_vp4dpwssd; +- masked = 0; +- goto v4fma_expand; +- +- case IX86_BUILTIN_4DPWSSDS: +- nar_mode = V4SImode; +- mode = V16SImode; +- wide_mode = V64SImode; +- fcn = gen_avx5124vnniw_vp4dpwssds; +- masked = 0; +- goto v4fma_expand; +- +- case IX86_BUILTIN_4FNMAPS: +- fcn = gen_avx5124fmaddps_4fnmaddps; +- masked = 0; +- goto v4fma_expand; +- +- case IX86_BUILTIN_4FNMAPS_MASK: +- fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask; +- fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz; +- goto v4fma_expand; +- +- case IX86_BUILTIN_4DPWSSD_MASK: +- nar_mode = V4SImode; +- mode = V16SImode; +- wide_mode = V64SImode; +- fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask; +- fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz; +- goto v4fma_expand; +- +- case IX86_BUILTIN_4DPWSSDS_MASK: +- nar_mode = V4SImode; +- mode = V16SImode; +- wide_mode = V64SImode; +- fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask; +- fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz; +- goto v4fma_expand; +- +- case IX86_BUILTIN_4FMAPS_MASK: +- { +- tree args[4]; +- rtx ops[4]; +- rtx wide_reg; +- rtx accum; +- rtx addr; +- rtx mem; +- +-v4fma_expand: +- wide_reg = gen_reg_rtx (wide_mode); +- for (i = 0; i < 4; i++) +- { +- args[i] = CALL_EXPR_ARG (exp, i); +- ops[i] = expand_normal (args[i]); +- +- emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64), +- ops[i]); +- } +- +- accum = expand_normal (CALL_EXPR_ARG (exp, 4)); 
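/* [Editorial aside, not part of the quoted patch.]  The hunk above picks
   between the _maskz and _mask variants of the AVX512_4FMAPS / 4VNNIW
   expanders by inspecting the merge operand: a constant-zero merge selects
   zero-masking, a merge identical to the accumulator argument selects
   in-place merge-masking, and anything else is first copied into a fresh
   register and merge-masked there.  Below is a minimal, self-contained C
   model of that per-lane masking policy; the names (pick_variant,
   apply_mask, merge_policy) are invented for illustration only and do not
   exist in GCC.  */
#include <stdint.h>
#include <stdio.h>

enum merge_policy { MASKZ, MASK_INPLACE, MASK_COPY };

/* Mirror of the const0_operand / same-argument checks in the hunk above:
   decide which masked variant the expander would emit.  */
static enum merge_policy
pick_variant (int merge_is_zero_constant, int merge_is_accumulator)
{
  if (merge_is_zero_constant)
    return MASKZ;            /* z-masked: inactive lanes become 0.  */
  if (merge_is_accumulator)
    return MASK_INPLACE;     /* merge-masked into the accumulator.  */
  return MASK_COPY;          /* merge-masked into a copy of merge.  */
}

/* Behaviour of a 16-lane write-mask: active lanes take the computed
   result, inactive lanes take zero (MASKZ) or the merge value.  */
static void
apply_mask (float dst[16], const float result[16], const float merge[16],
            uint16_t mask, enum merge_policy policy)
{
  for (int lane = 0; lane < 16; lane++)
    {
      if (mask & (1u << lane))
        dst[lane] = result[lane];
      else
        dst[lane] = (policy == MASKZ) ? 0.0f : merge[lane];
    }
}

int
main (void)
{
  float result[16], merge[16], dst[16];
  for (int i = 0; i < 16; i++)
    {
      result[i] = (float) i;
      merge[i] = -1.0f;
    }
  /* Neither zero nor the accumulator: the MASK_COPY path is modelled.  */
  apply_mask (dst, result, merge, 0x00ff, pick_variant (0, 0));
  printf ("lane 0 = %g, lane 15 = %g\n", dst[0], dst[15]);
  return 0;
}
/* [End of editorial aside; the quoted patch hunk continues below.]  */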
+- accum = force_reg (mode, accum); +- +- addr = expand_normal (CALL_EXPR_ARG (exp, 5)); +- addr = force_reg (Pmode, addr); +- +- mem = gen_rtx_MEM (nar_mode, addr); +- +- target = gen_reg_rtx (mode); +- +- emit_move_insn (target, accum); +- +- if (! masked) +- emit_insn (fcn (target, accum, wide_reg, mem)); +- else +- { +- rtx merge, mask; +- merge = expand_normal (CALL_EXPR_ARG (exp, 6)); +- +- mask = expand_normal (CALL_EXPR_ARG (exp, 7)); +- +- if (CONST_INT_P (mask)) +- mask = fixup_modeless_constant (mask, HImode); +- +- mask = force_reg (HImode, mask); +- +- if (GET_MODE (mask) != HImode) +- mask = gen_rtx_SUBREG (HImode, mask, 0); +- +- /* If merge is 0 then we're about to emit z-masked variant. */ +- if (const0_operand (merge, mode)) +- emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask)); +- /* If merge is the same as accum then emit merge-masked variant. */ +- else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4)) +- { +- merge = force_reg (mode, merge); +- emit_insn (fcn_mask (target, wide_reg, mem, merge, mask)); +- } +- /* Merge with something unknown might happen if we z-mask w/ -O0. */ +- else +- { +- target = gen_reg_rtx (mode); +- emit_move_insn (target, merge); +- emit_insn (fcn_mask (target, wide_reg, mem, target, mask)); +- } +- } +- return target; +- } +- +- case IX86_BUILTIN_4FNMASS: +- fcn = gen_avx5124fmaddps_4fnmaddss; +- masked = 0; +- goto s4fma_expand; +- +- case IX86_BUILTIN_4FMASS: +- fcn = gen_avx5124fmaddps_4fmaddss; +- masked = 0; +- goto s4fma_expand; +- +- case IX86_BUILTIN_4FNMASS_MASK: +- fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask; +- fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz; +- goto s4fma_expand; +- +- case IX86_BUILTIN_4FMASS_MASK: +- { +- tree args[4]; +- rtx ops[4]; +- rtx wide_reg; +- rtx accum; +- rtx addr; +- rtx mem; +- +- fcn_mask = gen_avx5124fmaddps_4fmaddss_mask; +- fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz; +- +-s4fma_expand: +- mode = V4SFmode; +- wide_reg = gen_reg_rtx (V64SFmode); +- for (i = 0; i < 4; i++) +- { +- rtx tmp; +- args[i] = CALL_EXPR_ARG (exp, i); +- ops[i] = expand_normal (args[i]); +- +- tmp = gen_reg_rtx (SFmode); +- emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0)); +- +- emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64), +- gen_rtx_SUBREG (V16SFmode, tmp, 0)); +- } +- +- accum = expand_normal (CALL_EXPR_ARG (exp, 4)); +- accum = force_reg (V4SFmode, accum); +- +- addr = expand_normal (CALL_EXPR_ARG (exp, 5)); +- addr = force_reg (Pmode, addr); +- +- mem = gen_rtx_MEM (V4SFmode, addr); +- +- target = gen_reg_rtx (V4SFmode); +- +- emit_move_insn (target, accum); +- +- if (! masked) +- emit_insn (fcn (target, accum, wide_reg, mem)); +- else +- { +- rtx merge, mask; +- merge = expand_normal (CALL_EXPR_ARG (exp, 6)); +- +- mask = expand_normal (CALL_EXPR_ARG (exp, 7)); +- +- if (CONST_INT_P (mask)) +- mask = fixup_modeless_constant (mask, QImode); +- +- mask = force_reg (QImode, mask); +- +- if (GET_MODE (mask) != QImode) +- mask = gen_rtx_SUBREG (QImode, mask, 0); +- +- /* If merge is 0 then we're about to emit z-masked variant. */ +- if (const0_operand (merge, mode)) +- emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask)); +- /* If merge is the same as accum then emit merge-masked +- variant. */ +- else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4)) +- { +- merge = force_reg (mode, merge); +- emit_insn (fcn_mask (target, wide_reg, mem, merge, mask)); +- } +- /* Merge with something unknown might happen if we z-mask +- w/ -O0. 
*/ +- else +- { +- target = gen_reg_rtx (mode); +- emit_move_insn (target, merge); +- emit_insn (fcn_mask (target, wide_reg, mem, target, mask)); +- } +- } +- return target; +- } +- case IX86_BUILTIN_RDPID: +- return ix86_expand_special_args_builtin (bdesc_args + i, exp, +- target); +- case IX86_BUILTIN_FABSQ: +- case IX86_BUILTIN_COPYSIGNQ: +- if (!TARGET_SSE) +- /* Emit a normal call if SSE isn't available. */ +- return expand_call (exp, target, ignore); +- /* FALLTHRU */ +- default: +- return ix86_expand_args_builtin (bdesc_args + i, exp, target); +- } +- } +- +- if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST +- && fcode <= IX86_BUILTIN__BDESC_COMI_LAST) +- { +- i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST; +- return ix86_expand_sse_comi (bdesc_comi + i, exp, target); +- } +- +- if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST +- && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST) +- { +- i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST; +- return ix86_expand_round_builtin (bdesc_round_args + i, exp, target); +- } +- +- if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST +- && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST) +- { +- i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST; +- return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target); +- } +- +- if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST +- && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST) +- { +- i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST; +- return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target); +- } +- +- if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST +- && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST) +- { +- i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST; +- const struct builtin_description *d = bdesc_multi_arg + i; +- return ix86_expand_multi_arg_builtin (d->icode, exp, target, +- (enum ix86_builtin_func_type) +- d->flag, d->comparison); +- } +- +- if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST +- && fcode <= IX86_BUILTIN__BDESC_CET_LAST) +- { +- i = fcode - IX86_BUILTIN__BDESC_CET_FIRST; +- return ix86_expand_special_args_builtin (bdesc_cet + i, exp, +- target); +- } +- +- if (fcode >= IX86_BUILTIN__BDESC_CET_NORMAL_FIRST +- && fcode <= IX86_BUILTIN__BDESC_CET_NORMAL_LAST) +- { +- i = fcode - IX86_BUILTIN__BDESC_CET_NORMAL_FIRST; +- return ix86_expand_special_args_builtin (bdesc_cet_rdssp + i, exp, +- target); +- } +- +- gcc_unreachable (); +-} +- +-/* This returns the target-specific builtin with code CODE if +- current_function_decl has visibility on this builtin, which is checked +- using isa flags. Returns NULL_TREE otherwise. */ +- +-static tree ix86_get_builtin (enum ix86_builtins code) +-{ +- struct cl_target_option *opts; +- tree target_tree = NULL_TREE; +- +- /* Determine the isa flags of current_function_decl. */ +- +- if (current_function_decl) +- target_tree = DECL_FUNCTION_SPECIFIC_TARGET (current_function_decl); +- +- if (target_tree == NULL) +- target_tree = target_option_default_node; +- +- opts = TREE_TARGET_OPTION (target_tree); +- +- if ((ix86_builtins_isa[(int) code].isa & opts->x_ix86_isa_flags) +- || (ix86_builtins_isa[(int) code].isa2 & opts->x_ix86_isa_flags2)) +- return ix86_builtin_decl (code, true); +- else +- return NULL_TREE; +-} +- +-/* Returns a function decl for a vectorized version of the combined function +- with combined_fn code FN and the result vector type TYPE, or NULL_TREE +- if it is not available. 
*/ +- +-static tree +-ix86_builtin_vectorized_function (unsigned int fn, tree type_out, +- tree type_in) +-{ +- machine_mode in_mode, out_mode; +- int in_n, out_n; +- +- if (TREE_CODE (type_out) != VECTOR_TYPE +- || TREE_CODE (type_in) != VECTOR_TYPE) +- return NULL_TREE; +- +- out_mode = TYPE_MODE (TREE_TYPE (type_out)); +- out_n = TYPE_VECTOR_SUBPARTS (type_out); +- in_mode = TYPE_MODE (TREE_TYPE (type_in)); +- in_n = TYPE_VECTOR_SUBPARTS (type_in); +- +- switch (fn) +- { +- CASE_CFN_EXP2: +- if (out_mode == SFmode && in_mode == SFmode) +- { +- if (out_n == 16 && in_n == 16) +- return ix86_get_builtin (IX86_BUILTIN_EXP2PS); +- } +- break; +- +- CASE_CFN_IFLOOR: +- CASE_CFN_LFLOOR: +- CASE_CFN_LLFLOOR: +- /* The round insn does not trap on denormals. */ +- if (flag_trapping_math || !TARGET_SSE4_1) +- break; +- +- if (out_mode == SImode && in_mode == DFmode) +- { +- if (out_n == 4 && in_n == 2) +- return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX); +- else if (out_n == 8 && in_n == 4) +- return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256); +- else if (out_n == 16 && in_n == 8) +- return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512); +- } +- if (out_mode == SImode && in_mode == SFmode) +- { +- if (out_n == 4 && in_n == 4) +- return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX); +- else if (out_n == 8 && in_n == 8) +- return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX256); +- else if (out_n == 16 && in_n == 16) +- return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX512); +- } +- break; +- +- CASE_CFN_ICEIL: +- CASE_CFN_LCEIL: +- CASE_CFN_LLCEIL: +- /* The round insn does not trap on denormals. */ +- if (flag_trapping_math || !TARGET_SSE4_1) +- break; +- +- if (out_mode == SImode && in_mode == DFmode) +- { +- if (out_n == 4 && in_n == 2) +- return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX); +- else if (out_n == 8 && in_n == 4) +- return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256); +- else if (out_n == 16 && in_n == 8) +- return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512); +- } +- if (out_mode == SImode && in_mode == SFmode) +- { +- if (out_n == 4 && in_n == 4) +- return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX); +- else if (out_n == 8 && in_n == 8) +- return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX256); +- else if (out_n == 16 && in_n == 16) +- return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX512); +- } +- break; +- +- CASE_CFN_IRINT: +- CASE_CFN_LRINT: +- CASE_CFN_LLRINT: +- if (out_mode == SImode && in_mode == DFmode) +- { +- if (out_n == 4 && in_n == 2) +- return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX); +- else if (out_n == 8 && in_n == 4) +- return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX256); +- else if (out_n == 16 && in_n == 8) +- return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX512); +- } +- if (out_mode == SImode && in_mode == SFmode) +- { +- if (out_n == 4 && in_n == 4) +- return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ); +- else if (out_n == 8 && in_n == 8) +- return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ256); +- else if (out_n == 16 && in_n == 16) +- return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ512); +- } +- break; +- +- CASE_CFN_IROUND: +- CASE_CFN_LROUND: +- CASE_CFN_LLROUND: +- /* The round insn does not trap on denormals. 
*/ +- if (flag_trapping_math || !TARGET_SSE4_1) +- break; +- +- if (out_mode == SImode && in_mode == DFmode) +- { +- if (out_n == 4 && in_n == 2) +- return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX); +- else if (out_n == 8 && in_n == 4) +- return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256); +- else if (out_n == 16 && in_n == 8) +- return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512); +- } +- if (out_mode == SImode && in_mode == SFmode) +- { +- if (out_n == 4 && in_n == 4) +- return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX); +- else if (out_n == 8 && in_n == 8) +- return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX256); +- else if (out_n == 16 && in_n == 16) +- return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX512); +- } +- break; +- +- CASE_CFN_FLOOR: +- /* The round insn does not trap on denormals. */ +- if (flag_trapping_math || !TARGET_SSE4_1) +- break; +- +- if (out_mode == DFmode && in_mode == DFmode) +- { +- if (out_n == 2 && in_n == 2) +- return ix86_get_builtin (IX86_BUILTIN_FLOORPD); +- else if (out_n == 4 && in_n == 4) +- return ix86_get_builtin (IX86_BUILTIN_FLOORPD256); +- else if (out_n == 8 && in_n == 8) +- return ix86_get_builtin (IX86_BUILTIN_FLOORPD512); +- } +- if (out_mode == SFmode && in_mode == SFmode) +- { +- if (out_n == 4 && in_n == 4) +- return ix86_get_builtin (IX86_BUILTIN_FLOORPS); +- else if (out_n == 8 && in_n == 8) +- return ix86_get_builtin (IX86_BUILTIN_FLOORPS256); +- else if (out_n == 16 && in_n == 16) +- return ix86_get_builtin (IX86_BUILTIN_FLOORPS512); +- } +- break; +- +- CASE_CFN_CEIL: +- /* The round insn does not trap on denormals. */ +- if (flag_trapping_math || !TARGET_SSE4_1) +- break; +- +- if (out_mode == DFmode && in_mode == DFmode) +- { +- if (out_n == 2 && in_n == 2) +- return ix86_get_builtin (IX86_BUILTIN_CEILPD); +- else if (out_n == 4 && in_n == 4) +- return ix86_get_builtin (IX86_BUILTIN_CEILPD256); +- else if (out_n == 8 && in_n == 8) +- return ix86_get_builtin (IX86_BUILTIN_CEILPD512); +- } +- if (out_mode == SFmode && in_mode == SFmode) +- { +- if (out_n == 4 && in_n == 4) +- return ix86_get_builtin (IX86_BUILTIN_CEILPS); +- else if (out_n == 8 && in_n == 8) +- return ix86_get_builtin (IX86_BUILTIN_CEILPS256); +- else if (out_n == 16 && in_n == 16) +- return ix86_get_builtin (IX86_BUILTIN_CEILPS512); +- } +- break; +- +- CASE_CFN_TRUNC: +- /* The round insn does not trap on denormals. */ +- if (flag_trapping_math || !TARGET_SSE4_1) +- break; +- +- if (out_mode == DFmode && in_mode == DFmode) +- { +- if (out_n == 2 && in_n == 2) +- return ix86_get_builtin (IX86_BUILTIN_TRUNCPD); +- else if (out_n == 4 && in_n == 4) +- return ix86_get_builtin (IX86_BUILTIN_TRUNCPD256); +- else if (out_n == 8 && in_n == 8) +- return ix86_get_builtin (IX86_BUILTIN_TRUNCPD512); +- } +- if (out_mode == SFmode && in_mode == SFmode) +- { +- if (out_n == 4 && in_n == 4) +- return ix86_get_builtin (IX86_BUILTIN_TRUNCPS); +- else if (out_n == 8 && in_n == 8) +- return ix86_get_builtin (IX86_BUILTIN_TRUNCPS256); +- else if (out_n == 16 && in_n == 16) +- return ix86_get_builtin (IX86_BUILTIN_TRUNCPS512); +- } +- break; +- +- CASE_CFN_RINT: +- /* The round insn does not trap on denormals. 
*/ +- if (flag_trapping_math || !TARGET_SSE4_1) +- break; +- +- if (out_mode == DFmode && in_mode == DFmode) +- { +- if (out_n == 2 && in_n == 2) +- return ix86_get_builtin (IX86_BUILTIN_RINTPD); +- else if (out_n == 4 && in_n == 4) +- return ix86_get_builtin (IX86_BUILTIN_RINTPD256); +- } +- if (out_mode == SFmode && in_mode == SFmode) +- { +- if (out_n == 4 && in_n == 4) +- return ix86_get_builtin (IX86_BUILTIN_RINTPS); +- else if (out_n == 8 && in_n == 8) +- return ix86_get_builtin (IX86_BUILTIN_RINTPS256); +- } +- break; +- +- CASE_CFN_FMA: +- if (out_mode == DFmode && in_mode == DFmode) +- { +- if (out_n == 2 && in_n == 2) +- return ix86_get_builtin (IX86_BUILTIN_VFMADDPD); +- if (out_n == 4 && in_n == 4) +- return ix86_get_builtin (IX86_BUILTIN_VFMADDPD256); +- } +- if (out_mode == SFmode && in_mode == SFmode) +- { +- if (out_n == 4 && in_n == 4) +- return ix86_get_builtin (IX86_BUILTIN_VFMADDPS); +- if (out_n == 8 && in_n == 8) +- return ix86_get_builtin (IX86_BUILTIN_VFMADDPS256); +- } +- break; +- +- default: +- break; +- } +- +- /* Dispatch to a handler for a vectorization library. */ +- if (ix86_veclib_handler) +- return ix86_veclib_handler (combined_fn (fn), type_out, type_in); +- +- return NULL_TREE; +-} +- +-/* Handler for an SVML-style interface to +- a library with vectorized intrinsics. */ +- +-static tree +-ix86_veclibabi_svml (combined_fn fn, tree type_out, tree type_in) +-{ +- char name[20]; +- tree fntype, new_fndecl, args; +- unsigned arity; +- const char *bname; +- machine_mode el_mode, in_mode; +- int n, in_n; +- +- /* The SVML is suitable for unsafe math only. */ +- if (!flag_unsafe_math_optimizations) +- return NULL_TREE; +- +- el_mode = TYPE_MODE (TREE_TYPE (type_out)); +- n = TYPE_VECTOR_SUBPARTS (type_out); +- in_mode = TYPE_MODE (TREE_TYPE (type_in)); +- in_n = TYPE_VECTOR_SUBPARTS (type_in); +- if (el_mode != in_mode +- || n != in_n) +- return NULL_TREE; +- +- switch (fn) +- { +- CASE_CFN_EXP: +- CASE_CFN_LOG: +- CASE_CFN_LOG10: +- CASE_CFN_POW: +- CASE_CFN_TANH: +- CASE_CFN_TAN: +- CASE_CFN_ATAN: +- CASE_CFN_ATAN2: +- CASE_CFN_ATANH: +- CASE_CFN_CBRT: +- CASE_CFN_SINH: +- CASE_CFN_SIN: +- CASE_CFN_ASINH: +- CASE_CFN_ASIN: +- CASE_CFN_COSH: +- CASE_CFN_COS: +- CASE_CFN_ACOSH: +- CASE_CFN_ACOS: +- if ((el_mode != DFmode || n != 2) +- && (el_mode != SFmode || n != 4)) +- return NULL_TREE; +- break; +- +- default: +- return NULL_TREE; +- } +- +- tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn); +- bname = IDENTIFIER_POINTER (DECL_NAME (fndecl)); +- +- if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOGF) +- strcpy (name, "vmlsLn4"); +- else if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOG) +- strcpy (name, "vmldLn2"); +- else if (n == 4) +- { +- sprintf (name, "vmls%s", bname+10); +- name[strlen (name)-1] = '4'; +- } +- else +- sprintf (name, "vmld%s2", bname+10); +- +- /* Convert to uppercase. */ +- name[4] &= ~0x20; +- +- arity = 0; +- for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args)) +- arity++; +- +- if (arity == 1) +- fntype = build_function_type_list (type_out, type_in, NULL); +- else +- fntype = build_function_type_list (type_out, type_in, type_in, NULL); +- +- /* Build a function declaration for the vectorized function. 
*/ +- new_fndecl = build_decl (BUILTINS_LOCATION, +- FUNCTION_DECL, get_identifier (name), fntype); +- TREE_PUBLIC (new_fndecl) = 1; +- DECL_EXTERNAL (new_fndecl) = 1; +- DECL_IS_NOVOPS (new_fndecl) = 1; +- TREE_READONLY (new_fndecl) = 1; +- +- return new_fndecl; +-} +- +-/* Handler for an ACML-style interface to +- a library with vectorized intrinsics. */ +- +-static tree +-ix86_veclibabi_acml (combined_fn fn, tree type_out, tree type_in) +-{ +- char name[20] = "__vr.._"; +- tree fntype, new_fndecl, args; +- unsigned arity; +- const char *bname; +- machine_mode el_mode, in_mode; +- int n, in_n; +- +- /* The ACML is 64bits only and suitable for unsafe math only as +- it does not correctly support parts of IEEE with the required +- precision such as denormals. */ +- if (!TARGET_64BIT +- || !flag_unsafe_math_optimizations) +- return NULL_TREE; +- +- el_mode = TYPE_MODE (TREE_TYPE (type_out)); +- n = TYPE_VECTOR_SUBPARTS (type_out); +- in_mode = TYPE_MODE (TREE_TYPE (type_in)); +- in_n = TYPE_VECTOR_SUBPARTS (type_in); +- if (el_mode != in_mode +- || n != in_n) +- return NULL_TREE; +- +- switch (fn) +- { +- CASE_CFN_SIN: +- CASE_CFN_COS: +- CASE_CFN_EXP: +- CASE_CFN_LOG: +- CASE_CFN_LOG2: +- CASE_CFN_LOG10: +- if (el_mode == DFmode && n == 2) +- { +- name[4] = 'd'; +- name[5] = '2'; +- } +- else if (el_mode == SFmode && n == 4) +- { +- name[4] = 's'; +- name[5] = '4'; +- } +- else +- return NULL_TREE; +- break; +- +- default: +- return NULL_TREE; +- } +- +- tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn); +- bname = IDENTIFIER_POINTER (DECL_NAME (fndecl)); +- sprintf (name + 7, "%s", bname+10); +- +- arity = 0; +- for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args)) +- arity++; +- +- if (arity == 1) +- fntype = build_function_type_list (type_out, type_in, NULL); +- else +- fntype = build_function_type_list (type_out, type_in, type_in, NULL); +- +- /* Build a function declaration for the vectorized function. */ +- new_fndecl = build_decl (BUILTINS_LOCATION, +- FUNCTION_DECL, get_identifier (name), fntype); +- TREE_PUBLIC (new_fndecl) = 1; +- DECL_EXTERNAL (new_fndecl) = 1; +- DECL_IS_NOVOPS (new_fndecl) = 1; +- TREE_READONLY (new_fndecl) = 1; +- +- return new_fndecl; +-} +- +-/* Returns a decl of a function that implements gather load with +- memory type MEM_VECTYPE and index type INDEX_VECTYPE and SCALE. +- Return NULL_TREE if it is not available. */ +- +-static tree +-ix86_vectorize_builtin_gather (const_tree mem_vectype, +- const_tree index_type, int scale) +-{ +- bool si; +- enum ix86_builtins code; +- +- if (! TARGET_AVX2 || !TARGET_USE_GATHER) +- return NULL_TREE; +- +- if ((TREE_CODE (index_type) != INTEGER_TYPE +- && !POINTER_TYPE_P (index_type)) +- || (TYPE_MODE (index_type) != SImode +- && TYPE_MODE (index_type) != DImode)) +- return NULL_TREE; +- +- if (TYPE_PRECISION (index_type) > POINTER_SIZE) +- return NULL_TREE; +- +- /* v*gather* insn sign extends index to pointer mode. */ +- if (TYPE_PRECISION (index_type) < POINTER_SIZE +- && TYPE_UNSIGNED (index_type)) +- return NULL_TREE; +- +- if (scale <= 0 +- || scale > 8 +- || (scale & (scale - 1)) != 0) +- return NULL_TREE; +- +- si = TYPE_MODE (index_type) == SImode; +- switch (TYPE_MODE (mem_vectype)) +- { +- case E_V2DFmode: +- if (TARGET_AVX512VL) +- code = si ? IX86_BUILTIN_GATHER3SIV2DF : IX86_BUILTIN_GATHER3DIV2DF; +- else +- code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF; +- break; +- case E_V4DFmode: +- if (TARGET_AVX512VL) +- code = si ? 
IX86_BUILTIN_GATHER3ALTSIV4DF : IX86_BUILTIN_GATHER3DIV4DF; +- else +- code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF; +- break; +- case E_V2DImode: +- if (TARGET_AVX512VL) +- code = si ? IX86_BUILTIN_GATHER3SIV2DI : IX86_BUILTIN_GATHER3DIV2DI; +- else +- code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI; +- break; +- case E_V4DImode: +- if (TARGET_AVX512VL) +- code = si ? IX86_BUILTIN_GATHER3ALTSIV4DI : IX86_BUILTIN_GATHER3DIV4DI; +- else +- code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI; +- break; +- case E_V4SFmode: +- if (TARGET_AVX512VL) +- code = si ? IX86_BUILTIN_GATHER3SIV4SF : IX86_BUILTIN_GATHER3DIV4SF; +- else +- code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF; +- break; +- case E_V8SFmode: +- if (TARGET_AVX512VL) +- code = si ? IX86_BUILTIN_GATHER3SIV8SF : IX86_BUILTIN_GATHER3ALTDIV8SF; +- else +- code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF; +- break; +- case E_V4SImode: +- if (TARGET_AVX512VL) +- code = si ? IX86_BUILTIN_GATHER3SIV4SI : IX86_BUILTIN_GATHER3DIV4SI; +- else +- code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI; +- break; +- case E_V8SImode: +- if (TARGET_AVX512VL) +- code = si ? IX86_BUILTIN_GATHER3SIV8SI : IX86_BUILTIN_GATHER3ALTDIV8SI; +- else +- code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI; +- break; +- case E_V8DFmode: +- if (TARGET_AVX512F) +- code = si ? IX86_BUILTIN_GATHER3ALTSIV8DF : IX86_BUILTIN_GATHER3DIV8DF; +- else +- return NULL_TREE; +- break; +- case E_V8DImode: +- if (TARGET_AVX512F) +- code = si ? IX86_BUILTIN_GATHER3ALTSIV8DI : IX86_BUILTIN_GATHER3DIV8DI; +- else +- return NULL_TREE; +- break; +- case E_V16SFmode: +- if (TARGET_AVX512F) +- code = si ? IX86_BUILTIN_GATHER3SIV16SF : IX86_BUILTIN_GATHER3ALTDIV16SF; +- else +- return NULL_TREE; +- break; +- case E_V16SImode: +- if (TARGET_AVX512F) +- code = si ? IX86_BUILTIN_GATHER3SIV16SI : IX86_BUILTIN_GATHER3ALTDIV16SI; +- else +- return NULL_TREE; +- break; +- default: +- return NULL_TREE; +- } +- +- return ix86_get_builtin (code); +-} +- +-/* Returns a decl of a function that implements scatter store with +- register type VECTYPE and index type INDEX_TYPE and SCALE. +- Return NULL_TREE if it is not available. */ +- +-static tree +-ix86_vectorize_builtin_scatter (const_tree vectype, +- const_tree index_type, int scale) +-{ +- bool si; +- enum ix86_builtins code; +- +- if (!TARGET_AVX512F) +- return NULL_TREE; +- +- if ((TREE_CODE (index_type) != INTEGER_TYPE +- && !POINTER_TYPE_P (index_type)) +- || (TYPE_MODE (index_type) != SImode +- && TYPE_MODE (index_type) != DImode)) +- return NULL_TREE; +- +- if (TYPE_PRECISION (index_type) > POINTER_SIZE) +- return NULL_TREE; +- +- /* v*scatter* insn sign extends index to pointer mode. */ +- if (TYPE_PRECISION (index_type) < POINTER_SIZE +- && TYPE_UNSIGNED (index_type)) +- return NULL_TREE; +- +- /* Scale can be 1, 2, 4 or 8. */ +- if (scale <= 0 +- || scale > 8 +- || (scale & (scale - 1)) != 0) +- return NULL_TREE; +- +- si = TYPE_MODE (index_type) == SImode; +- switch (TYPE_MODE (vectype)) +- { +- case E_V8DFmode: +- code = si ? IX86_BUILTIN_SCATTERALTSIV8DF : IX86_BUILTIN_SCATTERDIV8DF; +- break; +- case E_V8DImode: +- code = si ? IX86_BUILTIN_SCATTERALTSIV8DI : IX86_BUILTIN_SCATTERDIV8DI; +- break; +- case E_V16SFmode: +- code = si ? IX86_BUILTIN_SCATTERSIV16SF : IX86_BUILTIN_SCATTERALTDIV16SF; +- break; +- case E_V16SImode: +- code = si ? 
IX86_BUILTIN_SCATTERSIV16SI : IX86_BUILTIN_SCATTERALTDIV16SI; +- break; +- case E_V4DFmode: +- if (TARGET_AVX512VL) +- code = si ? IX86_BUILTIN_SCATTERALTSIV4DF : IX86_BUILTIN_SCATTERDIV4DF; +- else +- return NULL_TREE; +- break; +- case E_V4DImode: +- if (TARGET_AVX512VL) +- code = si ? IX86_BUILTIN_SCATTERALTSIV4DI : IX86_BUILTIN_SCATTERDIV4DI; +- else +- return NULL_TREE; +- break; +- case E_V8SFmode: +- if (TARGET_AVX512VL) +- code = si ? IX86_BUILTIN_SCATTERSIV8SF : IX86_BUILTIN_SCATTERALTDIV8SF; +- else +- return NULL_TREE; +- break; +- case E_V8SImode: +- if (TARGET_AVX512VL) +- code = si ? IX86_BUILTIN_SCATTERSIV8SI : IX86_BUILTIN_SCATTERALTDIV8SI; +- else +- return NULL_TREE; +- break; +- case E_V2DFmode: +- if (TARGET_AVX512VL) +- code = si ? IX86_BUILTIN_SCATTERALTSIV2DF : IX86_BUILTIN_SCATTERDIV2DF; +- else +- return NULL_TREE; +- break; +- case E_V2DImode: +- if (TARGET_AVX512VL) +- code = si ? IX86_BUILTIN_SCATTERALTSIV2DI : IX86_BUILTIN_SCATTERDIV2DI; +- else +- return NULL_TREE; +- break; +- case E_V4SFmode: +- if (TARGET_AVX512VL) +- code = si ? IX86_BUILTIN_SCATTERSIV4SF : IX86_BUILTIN_SCATTERALTDIV4SF; +- else +- return NULL_TREE; +- break; +- case E_V4SImode: +- if (TARGET_AVX512VL) +- code = si ? IX86_BUILTIN_SCATTERSIV4SI : IX86_BUILTIN_SCATTERALTDIV4SI; +- else +- return NULL_TREE; +- break; +- default: +- return NULL_TREE; +- } +- +- return ix86_builtins[code]; +-} +- +-/* Return true if it is safe to use the rsqrt optabs to optimize +- 1.0/sqrt. */ +- +-static bool +-use_rsqrt_p () +-{ +- return (TARGET_SSE && TARGET_SSE_MATH +- && flag_finite_math_only +- && !flag_trapping_math +- && flag_unsafe_math_optimizations); +-} +- +-/* Returns a code for a target-specific builtin that implements +- reciprocal of the function, or NULL_TREE if not available. */ +- +-static tree +-ix86_builtin_reciprocal (tree fndecl) +-{ +- switch (DECL_FUNCTION_CODE (fndecl)) +- { +- /* Vectorized version of sqrt to rsqrt conversion. */ +- case IX86_BUILTIN_SQRTPS_NR: +- return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR); +- +- case IX86_BUILTIN_SQRTPS_NR256: +- return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR256); +- +- default: +- return NULL_TREE; +- } +-} +- +-/* Helper for avx_vpermilps256_operand et al. This is also used by +- the expansion functions to turn the parallel back into a mask. +- The return value is 0 for no match and the imm8+1 for a match. */ +- +-int +-avx_vpermilp_parallel (rtx par, machine_mode mode) +-{ +- unsigned i, nelt = GET_MODE_NUNITS (mode); +- unsigned mask = 0; +- unsigned char ipar[16] = {}; /* Silence -Wuninitialized warning. */ +- +- if (XVECLEN (par, 0) != (int) nelt) +- return 0; +- +- /* Validate that all of the elements are constants, and not totally +- out of range. Copy the data into an integral array to make the +- subsequent checks easier. */ +- for (i = 0; i < nelt; ++i) +- { +- rtx er = XVECEXP (par, 0, i); +- unsigned HOST_WIDE_INT ei; +- +- if (!CONST_INT_P (er)) +- return 0; +- ei = INTVAL (er); +- if (ei >= nelt) +- return 0; +- ipar[i] = ei; +- } +- +- switch (mode) +- { +- case E_V8DFmode: +- /* In the 512-bit DFmode case, we can only move elements within +- a 128-bit lane. First fill the second part of the mask, +- then fallthru. 
*/ +- for (i = 4; i < 6; ++i) +- { +- if (ipar[i] < 4 || ipar[i] >= 6) +- return 0; +- mask |= (ipar[i] - 4) << i; +- } +- for (i = 6; i < 8; ++i) +- { +- if (ipar[i] < 6) +- return 0; +- mask |= (ipar[i] - 6) << i; +- } +- /* FALLTHRU */ +- +- case E_V4DFmode: +- /* In the 256-bit DFmode case, we can only move elements within +- a 128-bit lane. */ +- for (i = 0; i < 2; ++i) +- { +- if (ipar[i] >= 2) +- return 0; +- mask |= ipar[i] << i; +- } +- for (i = 2; i < 4; ++i) +- { +- if (ipar[i] < 2) +- return 0; +- mask |= (ipar[i] - 2) << i; +- } +- break; +- +- case E_V16SFmode: +- /* In 512 bit SFmode case, permutation in the upper 256 bits +- must mirror the permutation in the lower 256-bits. */ +- for (i = 0; i < 8; ++i) +- if (ipar[i] + 8 != ipar[i + 8]) +- return 0; +- /* FALLTHRU */ +- +- case E_V8SFmode: +- /* In 256 bit SFmode case, we have full freedom of +- movement within the low 128-bit lane, but the high 128-bit +- lane must mirror the exact same pattern. */ +- for (i = 0; i < 4; ++i) +- if (ipar[i] + 4 != ipar[i + 4]) +- return 0; +- nelt = 4; +- /* FALLTHRU */ +- +- case E_V2DFmode: +- case E_V4SFmode: +- /* In the 128-bit case, we've full freedom in the placement of +- the elements from the source operand. */ +- for (i = 0; i < nelt; ++i) +- mask |= ipar[i] << (i * (nelt / 2)); +- break; +- +- default: +- gcc_unreachable (); +- } +- +- /* Make sure success has a non-zero value by adding one. */ +- return mask + 1; +-} +- +-/* Helper for avx_vperm2f128_v4df_operand et al. This is also used by +- the expansion functions to turn the parallel back into a mask. +- The return value is 0 for no match and the imm8+1 for a match. */ +- +-int +-avx_vperm2f128_parallel (rtx par, machine_mode mode) +-{ +- unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2; +- unsigned mask = 0; +- unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */ +- +- if (XVECLEN (par, 0) != (int) nelt) +- return 0; +- +- /* Validate that all of the elements are constants, and not totally +- out of range. Copy the data into an integral array to make the +- subsequent checks easier. */ +- for (i = 0; i < nelt; ++i) +- { +- rtx er = XVECEXP (par, 0, i); +- unsigned HOST_WIDE_INT ei; +- +- if (!CONST_INT_P (er)) +- return 0; +- ei = INTVAL (er); +- if (ei >= 2 * nelt) +- return 0; +- ipar[i] = ei; +- } +- +- /* Validate that the halves of the permute are halves. */ +- for (i = 0; i < nelt2 - 1; ++i) +- if (ipar[i] + 1 != ipar[i + 1]) +- return 0; +- for (i = nelt2; i < nelt - 1; ++i) +- if (ipar[i] + 1 != ipar[i + 1]) +- return 0; +- +- /* Reconstruct the mask. */ +- for (i = 0; i < 2; ++i) +- { +- unsigned e = ipar[i * nelt2]; +- if (e % nelt2) +- return 0; +- e /= nelt2; +- mask |= e << (i * 4); +- } +- +- /* Make sure success has a non-zero value by adding one. */ +- return mask + 1; +-} +- +-/* Return a register priority for hard reg REGNO. */ +-static int +-ix86_register_priority (int hard_regno) +-{ +- /* ebp and r13 as the base always wants a displacement, r12 as the +- base always wants an index. So discourage their usage in an +- address. */ +- if (hard_regno == R12_REG || hard_regno == R13_REG) +- return 0; +- if (hard_regno == BP_REG) +- return 1; +- /* New x86-64 int registers result in bigger code size. Discourage +- them. */ +- if (IN_RANGE (hard_regno, FIRST_REX_INT_REG, LAST_REX_INT_REG)) +- return 2; +- /* New x86-64 SSE registers result in bigger code size. Discourage +- them. 
*/ +- if (IN_RANGE (hard_regno, FIRST_REX_SSE_REG, LAST_REX_SSE_REG)) +- return 2; +- if (IN_RANGE (hard_regno, FIRST_EXT_REX_SSE_REG, LAST_EXT_REX_SSE_REG)) +- return 1; +- /* Usage of AX register results in smaller code. Prefer it. */ +- if (hard_regno == AX_REG) +- return 4; +- return 3; +-} +- +-/* Implement TARGET_PREFERRED_RELOAD_CLASS. +- +- Put float CONST_DOUBLE in the constant pool instead of fp regs. +- QImode must go into class Q_REGS. +- Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and +- movdf to do mem-to-mem moves through integer regs. */ +- +-static reg_class_t +-ix86_preferred_reload_class (rtx x, reg_class_t regclass) +-{ +- machine_mode mode = GET_MODE (x); +- +- /* We're only allowed to return a subclass of CLASS. Many of the +- following checks fail for NO_REGS, so eliminate that early. */ +- if (regclass == NO_REGS) +- return NO_REGS; +- +- /* All classes can load zeros. */ +- if (x == CONST0_RTX (mode)) +- return regclass; +- +- /* Force constants into memory if we are loading a (nonzero) constant into +- an MMX, SSE or MASK register. This is because there are no MMX/SSE/MASK +- instructions to load from a constant. */ +- if (CONSTANT_P (x) +- && (MAYBE_MMX_CLASS_P (regclass) +- || MAYBE_SSE_CLASS_P (regclass) +- || MAYBE_MASK_CLASS_P (regclass))) +- return NO_REGS; +- +- /* Floating-point constants need more complex checks. */ +- if (CONST_DOUBLE_P (x)) +- { +- /* General regs can load everything. */ +- if (INTEGER_CLASS_P (regclass)) +- return regclass; +- +- /* Floats can load 0 and 1 plus some others. Note that we eliminated +- zero above. We only want to wind up preferring 80387 registers if +- we plan on doing computation with them. */ +- if (IS_STACK_MODE (mode) +- && standard_80387_constant_p (x) > 0) +- { +- /* Limit class to FP regs. */ +- if (FLOAT_CLASS_P (regclass)) +- return FLOAT_REGS; +- } +- +- return NO_REGS; +- } +- +- /* Prefer SSE regs only, if we can use them for math. */ +- if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) +- return SSE_CLASS_P (regclass) ? regclass : NO_REGS; +- +- /* Generally when we see PLUS here, it's the function invariant +- (plus soft-fp const_int). Which can only be computed into general +- regs. */ +- if (GET_CODE (x) == PLUS) +- return INTEGER_CLASS_P (regclass) ? regclass : NO_REGS; +- +- /* QImode constants are easy to load, but non-constant QImode data +- must go into Q_REGS. */ +- if (GET_MODE (x) == QImode && !CONSTANT_P (x)) +- { +- if (Q_CLASS_P (regclass)) +- return regclass; +- else if (reg_class_subset_p (Q_REGS, regclass)) +- return Q_REGS; +- else +- return NO_REGS; +- } +- +- return regclass; +-} +- +-/* Discourage putting floating-point values in SSE registers unless +- SSE math is being used, and likewise for the 387 registers. */ +-static reg_class_t +-ix86_preferred_output_reload_class (rtx x, reg_class_t regclass) +-{ +- /* Restrict the output reload class to the register bank that we are doing +- math on. If we would like not to return a subset of CLASS, reject this +- alternative: if reload cannot do this, it will still use its choice. */ +- machine_mode mode = GET_MODE (x); +- if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) +- return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS; +- +- if (IS_STACK_MODE (mode)) +- return FLOAT_CLASS_P (regclass) ? 
regclass : NO_REGS; +- +- return regclass; +-} +- +-static reg_class_t +-ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass, +- machine_mode mode, secondary_reload_info *sri) +-{ +- /* Double-word spills from general registers to non-offsettable memory +- references (zero-extended addresses) require special handling. */ +- if (TARGET_64BIT +- && MEM_P (x) +- && GET_MODE_SIZE (mode) > UNITS_PER_WORD +- && INTEGER_CLASS_P (rclass) +- && !offsettable_memref_p (x)) +- { +- sri->icode = (in_p +- ? CODE_FOR_reload_noff_load +- : CODE_FOR_reload_noff_store); +- /* Add the cost of moving address to a temporary. */ +- sri->extra_cost = 1; +- +- return NO_REGS; +- } +- +- /* QImode spills from non-QI registers require +- intermediate register on 32bit targets. */ +- if (mode == QImode +- && ((!TARGET_64BIT && !in_p +- && INTEGER_CLASS_P (rclass) +- && MAYBE_NON_Q_CLASS_P (rclass)) +- || (!TARGET_AVX512DQ +- && MAYBE_MASK_CLASS_P (rclass)))) +- { +- int regno = true_regnum (x); +- +- /* Return Q_REGS if the operand is in memory. */ +- if (regno == -1) +- return Q_REGS; +- +- return NO_REGS; +- } +- +- /* This condition handles corner case where an expression involving +- pointers gets vectorized. We're trying to use the address of a +- stack slot as a vector initializer. +- +- (set (reg:V2DI 74 [ vect_cst_.2 ]) +- (vec_duplicate:V2DI (reg/f:DI 20 frame))) +- +- Eventually frame gets turned into sp+offset like this: +- +- (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74]) +- (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp) +- (const_int 392 [0x188])))) +- +- That later gets turned into: +- +- (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74]) +- (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp) +- (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64])))) +- +- We'll have the following reload recorded: +- +- Reload 0: reload_in (DI) = +- (plus:DI (reg/f:DI 7 sp) +- (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64])) +- reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74]) +- SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine +- reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188])) +- reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74]) +- reload_reg_rtx: (reg:V2DI 22 xmm1) +- +- Which isn't going to work since SSE instructions can't handle scalar +- additions. Returning GENERAL_REGS forces the addition into integer +- register and reload can handle subsequent reloads without problems. */ +- +- if (in_p && GET_CODE (x) == PLUS +- && SSE_CLASS_P (rclass) +- && SCALAR_INT_MODE_P (mode)) +- return GENERAL_REGS; +- +- return NO_REGS; +-} +- +-/* Implement TARGET_CLASS_LIKELY_SPILLED_P. */ +- +-static bool +-ix86_class_likely_spilled_p (reg_class_t rclass) +-{ +- switch (rclass) +- { +- case AREG: +- case DREG: +- case CREG: +- case BREG: +- case AD_REGS: +- case SIREG: +- case DIREG: +- case SSE_FIRST_REG: +- case FP_TOP_REG: +- case FP_SECOND_REG: +- return true; +- +- default: +- break; +- } +- +- return false; +-} +- +-/* If we are copying between registers from different register sets +- (e.g. FP and integer), we may need a memory location. +- +- The function can't work reliably when one of the CLASSES is a class +- containing registers from multiple sets. We avoid this by never combining +- different sets in a single alternative in the machine description. +- Ensure that this constraint holds to avoid unexpected surprises. +- +- When STRICT is false, we are being called from REGISTER_MOVE_COST, +- so do not enforce these sanity checks. 
+- +- To optimize register_move_cost performance, define inline variant. */ +- +-static inline bool +-inline_secondary_memory_needed (machine_mode mode, reg_class_t class1, +- reg_class_t class2, int strict) +-{ +- if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS)) +- return false; +- +- if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1) +- || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2) +- || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1) +- || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2) +- || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1) +- || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2) +- || MAYBE_MASK_CLASS_P (class1) != MASK_CLASS_P (class1) +- || MAYBE_MASK_CLASS_P (class2) != MASK_CLASS_P (class2)) +- { +- gcc_assert (!strict || lra_in_progress); +- return true; +- } +- +- if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2)) +- return true; +- +- /* Between mask and general, we have moves no larger than word size. */ +- if ((MASK_CLASS_P (class1) != MASK_CLASS_P (class2)) +- && (GET_MODE_SIZE (mode) > UNITS_PER_WORD)) +- return true; +- +- /* ??? This is a lie. We do have moves between mmx/general, and for +- mmx/sse2. But by saying we need secondary memory we discourage the +- register allocator from using the mmx registers unless needed. */ +- if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)) +- return true; +- +- if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2)) +- { +- /* SSE1 doesn't have any direct moves from other classes. */ +- if (!TARGET_SSE2) +- return true; +- +- /* If the target says that inter-unit moves are more expensive +- than moving through memory, then don't generate them. */ +- if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC) +- || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC)) +- return true; +- +- /* Between SSE and general, we have moves no larger than word size. */ +- if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) +- return true; +- } +- +- return false; +-} +- +-/* Implement TARGET_SECONDARY_MEMORY_NEEDED. */ +- +-static bool +-ix86_secondary_memory_needed (machine_mode mode, reg_class_t class1, +- reg_class_t class2) +-{ +- return inline_secondary_memory_needed (mode, class1, class2, true); +-} +- +-/* Implement TARGET_SECONDARY_MEMORY_NEEDED_MODE. +- +- get_secondary_mem widens integral modes to BITS_PER_WORD. +- There is no need to emit full 64 bit move on 64 bit targets +- for integral modes that can be moved using 32 bit move. */ +- +-static machine_mode +-ix86_secondary_memory_needed_mode (machine_mode mode) +-{ +- if (GET_MODE_BITSIZE (mode) < 32 && INTEGRAL_MODE_P (mode)) +- return mode_for_size (32, GET_MODE_CLASS (mode), 0).require (); +- return mode; +-} +- +-/* Implement the TARGET_CLASS_MAX_NREGS hook. +- +- On the 80386, this is the size of MODE in words, +- except in the FP regs, where a single reg is always enough. */ +- +-static unsigned char +-ix86_class_max_nregs (reg_class_t rclass, machine_mode mode) +-{ +- if (MAYBE_INTEGER_CLASS_P (rclass)) +- { +- if (mode == XFmode) +- return (TARGET_64BIT ? 2 : 3); +- else if (mode == XCmode) +- return (TARGET_64BIT ? 4 : 6); +- else +- return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD); +- } +- else +- { +- if (COMPLEX_MODE_P (mode)) +- return 2; +- else +- return 1; +- } +-} +- +-/* Implement TARGET_CAN_CHANGE_MODE_CLASS. 
*/ +- +-static bool +-ix86_can_change_mode_class (machine_mode from, machine_mode to, +- reg_class_t regclass) +-{ +- if (from == to) +- return true; +- +- /* x87 registers can't do subreg at all, as all values are reformatted +- to extended precision. */ +- if (MAYBE_FLOAT_CLASS_P (regclass)) +- return false; +- +- if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass)) +- { +- /* Vector registers do not support QI or HImode loads. If we don't +- disallow a change to these modes, reload will assume it's ok to +- drop the subreg from (subreg:SI (reg:HI 100) 0). This affects +- the vec_dupv4hi pattern. */ +- if (GET_MODE_SIZE (from) < 4) +- return false; +- } +- +- return true; +-} +- +-/* Return index of MODE in the sse load/store tables. */ +- +-static inline int +-sse_store_index (machine_mode mode) +-{ +- switch (GET_MODE_SIZE (mode)) +- { +- case 4: +- return 0; +- case 8: +- return 1; +- case 16: +- return 2; +- case 32: +- return 3; +- case 64: +- return 4; +- default: +- return -1; +- } +-} +- +-/* Return the cost of moving data of mode M between a +- register and memory. A value of 2 is the default; this cost is +- relative to those in `REGISTER_MOVE_COST'. +- +- This function is used extensively by register_move_cost that is used to +- build tables at startup. Make it inline in this case. +- When IN is 2, return maximum of in and out move cost. +- +- If moving between registers and memory is more expensive than +- between two registers, you should define this macro to express the +- relative cost. +- +- Model also increased moving costs of QImode registers in non +- Q_REGS classes. +- */ +-static inline int +-inline_memory_move_cost (machine_mode mode, enum reg_class regclass, int in) +-{ +- int cost; +- if (FLOAT_CLASS_P (regclass)) +- { +- int index; +- switch (mode) +- { +- case E_SFmode: +- index = 0; +- break; +- case E_DFmode: +- index = 1; +- break; +- case E_XFmode: +- index = 2; +- break; +- default: +- return 100; +- } +- if (in == 2) +- return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]); +- return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index]; +- } +- if (SSE_CLASS_P (regclass)) +- { +- int index = sse_store_index (mode); +- if (index == -1) +- return 100; +- if (in == 2) +- return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]); +- return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index]; +- } +- if (MMX_CLASS_P (regclass)) +- { +- int index; +- switch (GET_MODE_SIZE (mode)) +- { +- case 4: +- index = 0; +- break; +- case 8: +- index = 1; +- break; +- default: +- return 100; +- } +- if (in == 2) +- return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]); +- return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index]; +- } +- switch (GET_MODE_SIZE (mode)) +- { +- case 1: +- if (Q_CLASS_P (regclass) || TARGET_64BIT) +- { +- if (!in) +- return ix86_cost->int_store[0]; +- if (TARGET_PARTIAL_REG_DEPENDENCY +- && optimize_function_for_speed_p (cfun)) +- cost = ix86_cost->movzbl_load; +- else +- cost = ix86_cost->int_load[0]; +- if (in == 2) +- return MAX (cost, ix86_cost->int_store[0]); +- return cost; +- } +- else +- { +- if (in == 2) +- return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4); +- if (in) +- return ix86_cost->movzbl_load; +- else +- return ix86_cost->int_store[0] + 4; +- } +- break; +- case 2: +- if (in == 2) +- return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]); +- return in ? 
ix86_cost->int_load[1] : ix86_cost->int_store[1]; +- default: +- if (in == 2) +- cost = MAX (ix86_cost->int_load[2], ix86_cost->int_store[2]); +- else if (in) +- cost = ix86_cost->int_load[2]; +- else +- cost = ix86_cost->int_store[2]; +- /* Multiply with the number of GPR moves needed. */ +- return cost * CEIL ((int) GET_MODE_SIZE (mode), UNITS_PER_WORD); +- } +-} +- +-static int +-ix86_memory_move_cost (machine_mode mode, reg_class_t regclass, bool in) +-{ +- return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0); +-} +- +- +-/* Return the cost of moving data from a register in class CLASS1 to +- one in class CLASS2. +- +- It is not required that the cost always equal 2 when FROM is the same as TO; +- on some machines it is expensive to move between registers if they are not +- general registers. */ +- +-static int +-ix86_register_move_cost (machine_mode mode, reg_class_t class1_i, +- reg_class_t class2_i) +-{ +- enum reg_class class1 = (enum reg_class) class1_i; +- enum reg_class class2 = (enum reg_class) class2_i; +- +- /* In case we require secondary memory, compute cost of the store followed +- by load. In order to avoid bad register allocation choices, we need +- for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */ +- +- if (inline_secondary_memory_needed (mode, class1, class2, false)) +- { +- int cost = 1; +- +- cost += inline_memory_move_cost (mode, class1, 2); +- cost += inline_memory_move_cost (mode, class2, 2); +- +- /* In case of copying from general_purpose_register we may emit multiple +- stores followed by single load causing memory size mismatch stall. +- Count this as arbitrarily high cost of 20. */ +- if (GET_MODE_BITSIZE (mode) > BITS_PER_WORD +- && TARGET_MEMORY_MISMATCH_STALL +- && targetm.class_max_nregs (class1, mode) +- > targetm.class_max_nregs (class2, mode)) +- cost += 20; +- +- /* In the case of FP/MMX moves, the registers actually overlap, and we +- have to switch modes in order to treat them differently. */ +- if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2)) +- || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1))) +- cost += 20; +- +- return cost; +- } +- +- /* Moves between SSE/MMX and integer unit are expensive. */ +- if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2) +- || SSE_CLASS_P (class1) != SSE_CLASS_P (class2)) +- +- /* ??? By keeping returned value relatively high, we limit the number +- of moves between integer and MMX/SSE registers for all targets. +- Additionally, high value prevents problem with x86_modes_tieable_p(), +- where integer modes in MMX/SSE registers are not tieable +- because of missing QImode and HImode moves to, from or between +- MMX/SSE registers. */ +- return MAX (8, MMX_CLASS_P (class1) || MMX_CLASS_P (class2) +- ? ix86_cost->mmxsse_to_integer : ix86_cost->ssemmx_to_integer); +- +- if (MAYBE_FLOAT_CLASS_P (class1)) +- return ix86_cost->fp_move; +- if (MAYBE_SSE_CLASS_P (class1)) +- { +- if (GET_MODE_BITSIZE (mode) <= 128) +- return ix86_cost->xmm_move; +- if (GET_MODE_BITSIZE (mode) <= 256) +- return ix86_cost->ymm_move; +- return ix86_cost->zmm_move; +- } +- if (MAYBE_MMX_CLASS_P (class1)) +- return ix86_cost->mmx_move; +- return 2; +-} +- +-/* Implement TARGET_HARD_REGNO_NREGS. This is ordinarily the length in +- words of a value of mode MODE but can be less for certain modes in +- special long registers. +- +- Actually there are no two word move instructions for consecutive +- registers. And only registers 0-3 may have mov byte instructions +- applied to them. 
*/ +- +-static unsigned int +-ix86_hard_regno_nregs (unsigned int regno, machine_mode mode) +-{ +- if (GENERAL_REGNO_P (regno)) +- { +- if (mode == XFmode) +- return TARGET_64BIT ? 2 : 3; +- if (mode == XCmode) +- return TARGET_64BIT ? 4 : 6; +- return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD); +- } +- if (COMPLEX_MODE_P (mode)) +- return 2; +- if (mode == V64SFmode || mode == V64SImode) +- return 4; +- return 1; +-} +- +-/* Implement TARGET_HARD_REGNO_MODE_OK. */ +- +-static bool +-ix86_hard_regno_mode_ok (unsigned int regno, machine_mode mode) +-{ +- /* Flags and only flags can only hold CCmode values. */ +- if (CC_REGNO_P (regno)) +- return GET_MODE_CLASS (mode) == MODE_CC; +- if (GET_MODE_CLASS (mode) == MODE_CC +- || GET_MODE_CLASS (mode) == MODE_RANDOM +- || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT) +- return false; +- if (STACK_REGNO_P (regno)) +- return VALID_FP_MODE_P (mode); +- if (MASK_REGNO_P (regno)) +- return (VALID_MASK_REG_MODE (mode) +- || (TARGET_AVX512BW +- && VALID_MASK_AVX512BW_MODE (mode))); +- if (SSE_REGNO_P (regno)) +- { +- /* We implement the move patterns for all vector modes into and +- out of SSE registers, even when no operation instructions +- are available. */ +- +- /* For AVX-512 we allow, regardless of regno: +- - XI mode +- - any of 512-bit wide vector mode +- - any scalar mode. */ +- if (TARGET_AVX512F +- && (mode == XImode +- || VALID_AVX512F_REG_MODE (mode) +- || VALID_AVX512F_SCALAR_MODE (mode))) +- return true; +- +- /* For AVX-5124FMAPS or AVX-5124VNNIW +- allow V64SF and V64SI modes for special regnos. */ +- if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW) +- && (mode == V64SFmode || mode == V64SImode) +- && MOD4_SSE_REGNO_P (regno)) +- return true; +- +- /* TODO check for QI/HI scalars. */ +- /* AVX512VL allows sse regs16+ for 128/256 bit modes. */ +- if (TARGET_AVX512VL +- && (mode == OImode +- || mode == TImode +- || VALID_AVX256_REG_MODE (mode) +- || VALID_AVX512VL_128_REG_MODE (mode))) +- return true; +- +- /* xmm16-xmm31 are only available for AVX-512. */ +- if (EXT_REX_SSE_REGNO_P (regno)) +- return false; +- +- /* OImode and AVX modes are available only when AVX is enabled. */ +- return ((TARGET_AVX +- && VALID_AVX256_REG_OR_OI_MODE (mode)) +- || VALID_SSE_REG_MODE (mode) +- || VALID_SSE2_REG_MODE (mode) +- || VALID_MMX_REG_MODE (mode) +- || VALID_MMX_REG_MODE_3DNOW (mode)); +- } +- if (MMX_REGNO_P (regno)) +- { +- /* We implement the move patterns for 3DNOW modes even in MMX mode, +- so if the register is available at all, then we can move data of +- the given mode into or out of it. */ +- return (VALID_MMX_REG_MODE (mode) +- || VALID_MMX_REG_MODE_3DNOW (mode)); +- } +- +- if (mode == QImode) +- { +- /* Take care for QImode values - they can be in non-QI regs, +- but then they do cause partial register stalls. */ +- if (ANY_QI_REGNO_P (regno)) +- return true; +- if (!TARGET_PARTIAL_REG_STALL) +- return true; +- /* LRA checks if the hard register is OK for the given mode. +- QImode values can live in non-QI regs, so we allow all +- registers here. */ +- if (lra_in_progress) +- return true; +- return !can_create_pseudo_p (); +- } +- /* We handle both integer and floats in the general purpose registers. */ +- else if (VALID_INT_MODE_P (mode)) +- return true; +- else if (VALID_FP_MODE_P (mode)) +- return true; +- else if (VALID_DFP_MODE_P (mode)) +- return true; +- /* Lots of MMX code casts 8 byte vector modes to DImode. 
If we then go +- on to use that value in smaller contexts, this can easily force a +- pseudo to be allocated to GENERAL_REGS. Since this is no worse than +- supporting DImode, allow it. */ +- else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode)) +- return true; +- +- return false; +-} +- +-/* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The only ABI that +- saves SSE registers across calls is Win64 (thus no need to check the +- current ABI here), and with AVX enabled Win64 only guarantees that +- the low 16 bytes are saved. */ +- +-static bool +-ix86_hard_regno_call_part_clobbered (rtx_insn *insn ATTRIBUTE_UNUSED, +- unsigned int regno, machine_mode mode) +-{ +- return SSE_REGNO_P (regno) && GET_MODE_SIZE (mode) > 16; +-} +- +-/* A subroutine of ix86_modes_tieable_p. Return true if MODE is a +- tieable integer mode. */ +- +-static bool +-ix86_tieable_integer_mode_p (machine_mode mode) +-{ +- switch (mode) +- { +- case E_HImode: +- case E_SImode: +- return true; +- +- case E_QImode: +- return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL; +- +- case E_DImode: +- return TARGET_64BIT; +- +- default: +- return false; +- } +-} +- +-/* Implement TARGET_MODES_TIEABLE_P. +- +- Return true if MODE1 is accessible in a register that can hold MODE2 +- without copying. That is, all register classes that can hold MODE2 +- can also hold MODE1. */ +- +-static bool +-ix86_modes_tieable_p (machine_mode mode1, machine_mode mode2) +-{ +- if (mode1 == mode2) +- return true; +- +- if (ix86_tieable_integer_mode_p (mode1) +- && ix86_tieable_integer_mode_p (mode2)) +- return true; +- +- /* MODE2 being XFmode implies fp stack or general regs, which means we +- can tie any smaller floating point modes to it. Note that we do not +- tie this with TFmode. */ +- if (mode2 == XFmode) +- return mode1 == SFmode || mode1 == DFmode; +- +- /* MODE2 being DFmode implies fp stack, general or sse regs, which means +- that we can tie it with SFmode. */ +- if (mode2 == DFmode) +- return mode1 == SFmode; +- +- /* If MODE2 is only appropriate for an SSE register, then tie with +- any other mode acceptable to SSE registers. */ +- if (GET_MODE_SIZE (mode2) == 64 +- && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2)) +- return (GET_MODE_SIZE (mode1) == 64 +- && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1)); +- if (GET_MODE_SIZE (mode2) == 32 +- && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2)) +- return (GET_MODE_SIZE (mode1) == 32 +- && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1)); +- if (GET_MODE_SIZE (mode2) == 16 +- && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2)) +- return (GET_MODE_SIZE (mode1) == 16 +- && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1)); +- +- /* If MODE2 is appropriate for an MMX register, then tie +- with any other mode acceptable to MMX registers. */ +- if (GET_MODE_SIZE (mode2) == 8 +- && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2)) +- return (GET_MODE_SIZE (mode1) == 8 +- && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1)); +- +- return false; +-} +- +-/* Return the cost of moving between two registers of mode MODE. 
*/ +- +-static int +-ix86_set_reg_reg_cost (machine_mode mode) +-{ +- unsigned int units = UNITS_PER_WORD; +- +- switch (GET_MODE_CLASS (mode)) +- { +- default: +- break; +- +- case MODE_CC: +- units = GET_MODE_SIZE (CCmode); +- break; +- +- case MODE_FLOAT: +- if ((TARGET_SSE && mode == TFmode) +- || (TARGET_80387 && mode == XFmode) +- || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode) +- || ((TARGET_80387 || TARGET_SSE) && mode == SFmode)) +- units = GET_MODE_SIZE (mode); +- break; +- +- case MODE_COMPLEX_FLOAT: +- if ((TARGET_SSE && mode == TCmode) +- || (TARGET_80387 && mode == XCmode) +- || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode) +- || ((TARGET_80387 || TARGET_SSE) && mode == SCmode)) +- units = GET_MODE_SIZE (mode); +- break; +- +- case MODE_VECTOR_INT: +- case MODE_VECTOR_FLOAT: +- if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode)) +- || (TARGET_AVX && VALID_AVX256_REG_MODE (mode)) +- || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode)) +- || (TARGET_SSE && VALID_SSE_REG_MODE (mode)) +- || (TARGET_MMX && VALID_MMX_REG_MODE (mode))) +- units = GET_MODE_SIZE (mode); +- } +- +- /* Return the cost of moving between two registers of mode MODE, +- assuming that the move will be in pieces of at most UNITS bytes. */ +- return COSTS_N_INSNS (CEIL (GET_MODE_SIZE (mode), units)); +-} +- +-/* Return cost of vector operation in MODE given that scalar version has +- COST. */ +- +-static int +-ix86_vec_cost (machine_mode mode, int cost) +-{ +- if (!VECTOR_MODE_P (mode)) +- return cost; +- +- if (GET_MODE_BITSIZE (mode) == 128 +- && TARGET_SSE_SPLIT_REGS) +- return cost * 2; +- if (GET_MODE_BITSIZE (mode) > 128 +- && TARGET_AVX128_OPTIMAL) +- return cost * GET_MODE_BITSIZE (mode) / 128; +- return cost; +-} +- +-/* Return cost of multiplication in MODE. */ +- +-static int +-ix86_multiplication_cost (const struct processor_costs *cost, +- enum machine_mode mode) +-{ +- machine_mode inner_mode = mode; +- if (VECTOR_MODE_P (mode)) +- inner_mode = GET_MODE_INNER (mode); +- +- if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) +- return inner_mode == DFmode ? cost->mulsd : cost->mulss; +- else if (X87_FLOAT_MODE_P (mode)) +- return cost->fmul; +- else if (FLOAT_MODE_P (mode)) +- return ix86_vec_cost (mode, +- inner_mode == DFmode ? cost->mulsd : cost->mulss); +- else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) +- { +- /* vpmullq is used in this case. No emulation is needed. */ +- if (TARGET_AVX512DQ) +- return ix86_vec_cost (mode, cost->mulss); +- +- /* V*QImode is emulated with 7-13 insns. */ +- if (mode == V16QImode || mode == V32QImode) +- { +- int extra = 11; +- if (TARGET_XOP && mode == V16QImode) +- extra = 5; +- else if (TARGET_SSSE3) +- extra = 6; +- return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * extra); +- } +- /* V*DImode is emulated with 5-8 insns. */ +- else if (mode == V2DImode || mode == V4DImode) +- { +- if (TARGET_XOP && mode == V2DImode) +- return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 3); +- else +- return ix86_vec_cost (mode, cost->mulss * 3 + cost->sse_op * 5); +- } +- /* Without sse4.1, we don't have PMULLD; it's emulated with 7 +- insns, including two PMULUDQ. */ +- else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX)) +- return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 5); +- else +- return ix86_vec_cost (mode, cost->mulss); +- } +- else +- return (cost->mult_init[MODE_INDEX (mode)] + cost->mult_bit * 7); +-} +- +-/* Return cost of multiplication in MODE. 
*/ +- +-static int +-ix86_division_cost (const struct processor_costs *cost, +- enum machine_mode mode) +-{ +- machine_mode inner_mode = mode; +- if (VECTOR_MODE_P (mode)) +- inner_mode = GET_MODE_INNER (mode); +- +- if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) +- return inner_mode == DFmode ? cost->divsd : cost->divss; +- else if (X87_FLOAT_MODE_P (mode)) +- return cost->fdiv; +- else if (FLOAT_MODE_P (mode)) +- return ix86_vec_cost (mode, +- inner_mode == DFmode ? cost->divsd : cost->divss); +- else +- return cost->divide[MODE_INDEX (mode)]; +-} +- +-/* Return cost of shift in MODE. +- If CONSTANT_OP1 is true, the op1 value is known and set in OP1_VAL. +- AND_IN_OP1 specify in op1 is result of and and SHIFT_AND_TRUNCATE +- if op1 is a result of subreg. +- +- SKIP_OP0/1 is set to true if cost of OP0/1 should be ignored. */ +- +-static int +-ix86_shift_rotate_cost (const struct processor_costs *cost, +- enum machine_mode mode, bool constant_op1, +- HOST_WIDE_INT op1_val, +- bool speed, +- bool and_in_op1, +- bool shift_and_truncate, +- bool *skip_op0, bool *skip_op1) +-{ +- if (skip_op0) +- *skip_op0 = *skip_op1 = false; +- if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) +- { +- /* V*QImode is emulated with 1-11 insns. */ +- if (mode == V16QImode || mode == V32QImode) +- { +- int count = 11; +- if (TARGET_XOP && mode == V16QImode) +- { +- /* For XOP we use vpshab, which requires a broadcast of the +- value to the variable shift insn. For constants this +- means a V16Q const in mem; even when we can perform the +- shift with one insn set the cost to prefer paddb. */ +- if (constant_op1) +- { +- if (skip_op1) +- *skip_op1 = true; +- return ix86_vec_cost (mode, +- cost->sse_op +- + (speed +- ? 2 +- : COSTS_N_BYTES +- (GET_MODE_UNIT_SIZE (mode)))); +- } +- count = 3; +- } +- else if (TARGET_SSSE3) +- count = 7; +- return ix86_vec_cost (mode, cost->sse_op * count); +- } +- else +- return ix86_vec_cost (mode, cost->sse_op); +- } +- if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) +- { +- if (constant_op1) +- { +- if (op1_val > 32) +- return cost->shift_const + COSTS_N_INSNS (2); +- else +- return cost->shift_const * 2; +- } +- else +- { +- if (and_in_op1) +- return cost->shift_var * 2; +- else +- return cost->shift_var * 6 + COSTS_N_INSNS (2); +- } +- } +- else +- { +- if (constant_op1) +- return cost->shift_const; +- else if (shift_and_truncate) +- { +- if (skip_op0) +- *skip_op0 = *skip_op1 = true; +- /* Return the cost after shift-and truncation. */ +- return cost->shift_var; +- } +- else +- return cost->shift_var; +- } +- return cost->shift_const; +-} +- +-/* Compute a (partial) cost for rtx X. Return true if the complete +- cost has been computed, and false if subexpressions should be +- scanned. In either case, *TOTAL contains the cost result. */ +- +-static bool +-ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, +- int *total, bool speed) +-{ +- rtx mask; +- enum rtx_code code = GET_CODE (x); +- enum rtx_code outer_code = (enum rtx_code) outer_code_i; +- const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost; +- int src_cost; +- +- switch (code) +- { +- case SET: +- if (register_operand (SET_DEST (x), VOIDmode) +- && register_operand (SET_SRC (x), VOIDmode)) +- { +- *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x))); +- return true; +- } +- +- if (register_operand (SET_SRC (x), VOIDmode)) +- /* Avoid potentially incorrect high cost from rtx_costs +- for non-tieable SUBREGs. 
*/ +- src_cost = 0; +- else +- { +- src_cost = rtx_cost (SET_SRC (x), mode, SET, 1, speed); +- +- if (CONSTANT_P (SET_SRC (x))) +- /* Constant costs assume a base value of COSTS_N_INSNS (1) and add +- a small value, possibly zero for cheap constants. */ +- src_cost += COSTS_N_INSNS (1); +- } +- +- *total = src_cost + rtx_cost (SET_DEST (x), mode, SET, 0, speed); +- return true; +- +- case CONST_INT: +- case CONST: +- case LABEL_REF: +- case SYMBOL_REF: +- if (x86_64_immediate_operand (x, VOIDmode)) +- *total = 0; +- else +- *total = 1; +- return true; +- +- case CONST_DOUBLE: +- if (IS_STACK_MODE (mode)) +- switch (standard_80387_constant_p (x)) +- { +- case -1: +- case 0: +- break; +- case 1: /* 0.0 */ +- *total = 1; +- return true; +- default: /* Other constants */ +- *total = 2; +- return true; +- } +- /* FALLTHRU */ +- +- case CONST_VECTOR: +- switch (standard_sse_constant_p (x, mode)) +- { +- case 0: +- break; +- case 1: /* 0: xor eliminates false dependency */ +- *total = 0; +- return true; +- default: /* -1: cmp contains false dependency */ +- *total = 1; +- return true; +- } +- /* FALLTHRU */ +- +- case CONST_WIDE_INT: +- /* Fall back to (MEM (SYMBOL_REF)), since that's where +- it'll probably end up. Add a penalty for size. */ +- *total = (COSTS_N_INSNS (1) +- + (!TARGET_64BIT && flag_pic) +- + (GET_MODE_SIZE (mode) <= 4 +- ? 0 : GET_MODE_SIZE (mode) <= 8 ? 1 : 2)); +- return true; +- +- case ZERO_EXTEND: +- /* The zero extensions is often completely free on x86_64, so make +- it as cheap as possible. */ +- if (TARGET_64BIT && mode == DImode +- && GET_MODE (XEXP (x, 0)) == SImode) +- *total = 1; +- else if (TARGET_ZERO_EXTEND_WITH_AND) +- *total = cost->add; +- else +- *total = cost->movzx; +- return false; +- +- case SIGN_EXTEND: +- *total = cost->movsx; +- return false; +- +- case ASHIFT: +- if (SCALAR_INT_MODE_P (mode) +- && GET_MODE_SIZE (mode) < UNITS_PER_WORD +- && CONST_INT_P (XEXP (x, 1))) +- { +- HOST_WIDE_INT value = INTVAL (XEXP (x, 1)); +- if (value == 1) +- { +- *total = cost->add; +- return false; +- } +- if ((value == 2 || value == 3) +- && cost->lea <= cost->shift_const) +- { +- *total = cost->lea; +- return false; +- } +- } +- /* FALLTHRU */ +- +- case ROTATE: +- case ASHIFTRT: +- case LSHIFTRT: +- case ROTATERT: +- bool skip_op0, skip_op1; +- *total = ix86_shift_rotate_cost (cost, mode, CONSTANT_P (XEXP (x, 1)), +- CONST_INT_P (XEXP (x, 1)) +- ? INTVAL (XEXP (x, 1)) : -1, +- speed, +- GET_CODE (XEXP (x, 1)) == AND, +- SUBREG_P (XEXP (x, 1)) +- && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND, +- &skip_op0, &skip_op1); +- if (skip_op0 || skip_op1) +- { +- if (!skip_op0) +- *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed); +- if (!skip_op1) +- *total += rtx_cost (XEXP (x, 1), mode, code, 0, speed); +- return true; +- } +- return false; +- +- case FMA: +- { +- rtx sub; +- +- gcc_assert (FLOAT_MODE_P (mode)); +- gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F); +- +- *total = ix86_vec_cost (mode, +- GET_MODE_INNER (mode) == SFmode +- ? cost->fmass : cost->fmasd); +- *total += rtx_cost (XEXP (x, 1), mode, FMA, 1, speed); +- +- /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. 
*/ +- sub = XEXP (x, 0); +- if (GET_CODE (sub) == NEG) +- sub = XEXP (sub, 0); +- *total += rtx_cost (sub, mode, FMA, 0, speed); +- +- sub = XEXP (x, 2); +- if (GET_CODE (sub) == NEG) +- sub = XEXP (sub, 0); +- *total += rtx_cost (sub, mode, FMA, 2, speed); +- return true; +- } +- +- case MULT: +- if (!FLOAT_MODE_P (mode) && !VECTOR_MODE_P (mode)) +- { +- rtx op0 = XEXP (x, 0); +- rtx op1 = XEXP (x, 1); +- int nbits; +- if (CONST_INT_P (XEXP (x, 1))) +- { +- unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1)); +- for (nbits = 0; value != 0; value &= value - 1) +- nbits++; +- } +- else +- /* This is arbitrary. */ +- nbits = 7; +- +- /* Compute costs correctly for widening multiplication. */ +- if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND) +- && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2 +- == GET_MODE_SIZE (mode)) +- { +- int is_mulwiden = 0; +- machine_mode inner_mode = GET_MODE (op0); +- +- if (GET_CODE (op0) == GET_CODE (op1)) +- is_mulwiden = 1, op1 = XEXP (op1, 0); +- else if (CONST_INT_P (op1)) +- { +- if (GET_CODE (op0) == SIGN_EXTEND) +- is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode) +- == INTVAL (op1); +- else +- is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode)); +- } +- +- if (is_mulwiden) +- op0 = XEXP (op0, 0), mode = GET_MODE (op0); +- } +- +- *total = (cost->mult_init[MODE_INDEX (mode)] +- + nbits * cost->mult_bit +- + rtx_cost (op0, mode, outer_code, opno, speed) +- + rtx_cost (op1, mode, outer_code, opno, speed)); +- +- return true; +- } +- *total = ix86_multiplication_cost (cost, mode); +- return false; +- +- case DIV: +- case UDIV: +- case MOD: +- case UMOD: +- *total = ix86_division_cost (cost, mode); +- return false; +- +- case PLUS: +- if (GET_MODE_CLASS (mode) == MODE_INT +- && GET_MODE_SIZE (mode) <= UNITS_PER_WORD) +- { +- if (GET_CODE (XEXP (x, 0)) == PLUS +- && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT +- && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1)) +- && CONSTANT_P (XEXP (x, 1))) +- { +- HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1)); +- if (val == 2 || val == 4 || val == 8) +- { +- *total = cost->lea; +- *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode, +- outer_code, opno, speed); +- *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0), mode, +- outer_code, opno, speed); +- *total += rtx_cost (XEXP (x, 1), mode, +- outer_code, opno, speed); +- return true; +- } +- } +- else if (GET_CODE (XEXP (x, 0)) == MULT +- && CONST_INT_P (XEXP (XEXP (x, 0), 1))) +- { +- HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1)); +- if (val == 2 || val == 4 || val == 8) +- { +- *total = cost->lea; +- *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode, +- outer_code, opno, speed); +- *total += rtx_cost (XEXP (x, 1), mode, +- outer_code, opno, speed); +- return true; +- } +- } +- else if (GET_CODE (XEXP (x, 0)) == PLUS) +- { +- /* Add with carry, ignore the cost of adding a carry flag. */ +- if (ix86_carry_flag_operator (XEXP (XEXP (x, 0), 0), mode)) +- *total = cost->add; +- else +- { +- *total = cost->lea; +- *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode, +- outer_code, opno, speed); +- } +- +- *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode, +- outer_code, opno, speed); +- *total += rtx_cost (XEXP (x, 1), mode, +- outer_code, opno, speed); +- return true; +- } +- } +- /* FALLTHRU */ +- +- case MINUS: +- /* Subtract with borrow, ignore the cost of subtracting a carry flag. 
*/ +- if (GET_MODE_CLASS (mode) == MODE_INT +- && GET_MODE_SIZE (mode) <= UNITS_PER_WORD +- && GET_CODE (XEXP (x, 0)) == MINUS +- && ix86_carry_flag_operator (XEXP (XEXP (x, 0), 1), mode)) +- { +- *total = cost->add; +- *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode, +- outer_code, opno, speed); +- *total += rtx_cost (XEXP (x, 1), mode, +- outer_code, opno, speed); +- return true; +- } +- +- if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) +- { +- *total = cost->addss; +- return false; +- } +- else if (X87_FLOAT_MODE_P (mode)) +- { +- *total = cost->fadd; +- return false; +- } +- else if (FLOAT_MODE_P (mode)) +- { +- *total = ix86_vec_cost (mode, cost->addss); +- return false; +- } +- /* FALLTHRU */ +- +- case AND: +- case IOR: +- case XOR: +- if (GET_MODE_CLASS (mode) == MODE_INT +- && GET_MODE_SIZE (mode) > UNITS_PER_WORD) +- { +- *total = (cost->add * 2 +- + (rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed) +- << (GET_MODE (XEXP (x, 0)) != DImode)) +- + (rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed) +- << (GET_MODE (XEXP (x, 1)) != DImode))); +- return true; +- } +- /* FALLTHRU */ +- +- case NEG: +- if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) +- { +- *total = cost->sse_op; +- return false; +- } +- else if (X87_FLOAT_MODE_P (mode)) +- { +- *total = cost->fchs; +- return false; +- } +- else if (FLOAT_MODE_P (mode)) +- { +- *total = ix86_vec_cost (mode, cost->sse_op); +- return false; +- } +- /* FALLTHRU */ +- +- case NOT: +- if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) +- *total = ix86_vec_cost (mode, cost->sse_op); +- else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) +- *total = cost->add * 2; +- else +- *total = cost->add; +- return false; +- +- case COMPARE: +- if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT +- && XEXP (XEXP (x, 0), 1) == const1_rtx +- && CONST_INT_P (XEXP (XEXP (x, 0), 2)) +- && XEXP (x, 1) == const0_rtx) +- { +- /* This kind of construct is implemented using test[bwl]. +- Treat it as if we had an AND. */ +- mode = GET_MODE (XEXP (XEXP (x, 0), 0)); +- *total = (cost->add +- + rtx_cost (XEXP (XEXP (x, 0), 0), mode, outer_code, +- opno, speed) +- + rtx_cost (const1_rtx, mode, outer_code, opno, speed)); +- return true; +- } +- +- /* The embedded comparison operand is completely free. */ +- if (!general_operand (XEXP (x, 0), GET_MODE (XEXP (x, 0))) +- && XEXP (x, 1) == const0_rtx) +- *total = 0; +- +- return false; +- +- case FLOAT_EXTEND: +- if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)) +- *total = 0; +- else +- *total = ix86_vec_cost (mode, cost->addss); +- return false; +- +- case FLOAT_TRUNCATE: +- if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)) +- *total = cost->fadd; +- else +- *total = ix86_vec_cost (mode, cost->addss); +- return false; +- +- case ABS: +- /* SSE requires memory load for the constant operand. It may make +- sense to account for this. Of course the constant operand may or +- may not be reused. */ +- if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) +- *total = cost->sse_op; +- else if (X87_FLOAT_MODE_P (mode)) +- *total = cost->fabs; +- else if (FLOAT_MODE_P (mode)) +- *total = ix86_vec_cost (mode, cost->sse_op); +- return false; +- +- case SQRT: +- if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) +- *total = mode == SFmode ? cost->sqrtss : cost->sqrtsd; +- else if (X87_FLOAT_MODE_P (mode)) +- *total = cost->fsqrt; +- else if (FLOAT_MODE_P (mode)) +- *total = ix86_vec_cost (mode, +- mode == SFmode ? 
cost->sqrtss : cost->sqrtsd); +- return false; +- +- case UNSPEC: +- if (XINT (x, 1) == UNSPEC_TP) +- *total = 0; +- return false; +- +- case VEC_SELECT: +- case VEC_CONCAT: +- case VEC_DUPLICATE: +- /* ??? Assume all of these vector manipulation patterns are +- recognizable. In which case they all pretty much have the +- same cost. */ +- *total = cost->sse_op; +- return true; +- case VEC_MERGE: +- mask = XEXP (x, 2); +- /* This is masked instruction, assume the same cost, +- as nonmasked variant. */ +- if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask))) +- *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed); +- else +- *total = cost->sse_op; +- return true; +- +- default: +- return false; +- } +-} +- +-#if TARGET_MACHO +- +-static int current_machopic_label_num; +- +-/* Given a symbol name and its associated stub, write out the +- definition of the stub. */ +- +-void +-machopic_output_stub (FILE *file, const char *symb, const char *stub) +-{ +- unsigned int length; +- char *binder_name, *symbol_name, lazy_ptr_name[32]; +- int label = ++current_machopic_label_num; +- +- /* For 64-bit we shouldn't get here. */ +- gcc_assert (!TARGET_64BIT); +- +- /* Lose our funky encoding stuff so it doesn't contaminate the stub. */ +- symb = targetm.strip_name_encoding (symb); +- +- length = strlen (stub); +- binder_name = XALLOCAVEC (char, length + 32); +- GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length); +- +- length = strlen (symb); +- symbol_name = XALLOCAVEC (char, length + 32); +- GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length); +- +- sprintf (lazy_ptr_name, "L%d$lz", label); +- +- if (MACHOPIC_ATT_STUB) +- switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]); +- else if (MACHOPIC_PURE) +- switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]); +- else +- switch_to_section (darwin_sections[machopic_symbol_stub_section]); +- +- fprintf (file, "%s:\n", stub); +- fprintf (file, "\t.indirect_symbol %s\n", symbol_name); +- +- if (MACHOPIC_ATT_STUB) +- { +- fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n"); +- } +- else if (MACHOPIC_PURE) +- { +- /* PIC stub. */ +- /* 25-byte PIC stub using "CALL get_pc_thunk". */ +- rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */); +- output_set_got (tmp, NULL_RTX); /* "CALL ___.get_pc_thunk.cx". */ +- fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n", +- label, lazy_ptr_name, label); +- fprintf (file, "\tjmp\t*%%ecx\n"); +- } +- else +- fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name); +- +- /* The AT&T-style ("self-modifying") stub is not lazily bound, thus +- it needs no stub-binding-helper. */ +- if (MACHOPIC_ATT_STUB) +- return; +- +- fprintf (file, "%s:\n", binder_name); +- +- if (MACHOPIC_PURE) +- { +- fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name); +- fprintf (file, "\tpushl\t%%ecx\n"); +- } +- else +- fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name); +- +- fputs ("\tjmp\tdyld_stub_binding_helper\n", file); +- +- /* N.B. Keep the correspondence of these +- 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the +- old-pic/new-pic/non-pic stubs; altering this will break +- compatibility with existing dylibs. */ +- if (MACHOPIC_PURE) +- { +- /* 25-byte PIC stub using "CALL get_pc_thunk". */ +- switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]); +- } +- else +- /* 16-byte -mdynamic-no-pic stub. 
*/ +- switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]); +- +- fprintf (file, "%s:\n", lazy_ptr_name); +- fprintf (file, "\t.indirect_symbol %s\n", symbol_name); +- fprintf (file, ASM_LONG "%s\n", binder_name); +-} +-#endif /* TARGET_MACHO */ +- +-/* Order the registers for register allocator. */ +- +-void +-x86_order_regs_for_local_alloc (void) +-{ +- int pos = 0; +- int i; +- +- /* First allocate the local general purpose registers. */ +- for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) +- if (GENERAL_REGNO_P (i) && call_used_regs[i]) +- reg_alloc_order [pos++] = i; +- +- /* Global general purpose registers. */ +- for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) +- if (GENERAL_REGNO_P (i) && !call_used_regs[i]) +- reg_alloc_order [pos++] = i; +- +- /* x87 registers come first in case we are doing FP math +- using them. */ +- if (!TARGET_SSE_MATH) +- for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++) +- reg_alloc_order [pos++] = i; +- +- /* SSE registers. */ +- for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++) +- reg_alloc_order [pos++] = i; +- for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++) +- reg_alloc_order [pos++] = i; +- +- /* Extended REX SSE registers. */ +- for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++) +- reg_alloc_order [pos++] = i; +- +- /* Mask register. */ +- for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++) +- reg_alloc_order [pos++] = i; +- +- /* x87 registers. */ +- if (TARGET_SSE_MATH) +- for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++) +- reg_alloc_order [pos++] = i; +- +- for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++) +- reg_alloc_order [pos++] = i; +- +- /* Initialize the rest of array as we do not allocate some registers +- at all. */ +- while (pos < FIRST_PSEUDO_REGISTER) +- reg_alloc_order [pos++] = 0; +-} +- +-/* Handle a "callee_pop_aggregate_return" attribute; arguments as +- in struct attribute_spec handler. */ +-static tree +-ix86_handle_callee_pop_aggregate_return (tree *node, tree name, tree args, int, +- bool *no_add_attrs) +-{ +- if (TREE_CODE (*node) != FUNCTION_TYPE +- && TREE_CODE (*node) != METHOD_TYPE +- && TREE_CODE (*node) != FIELD_DECL +- && TREE_CODE (*node) != TYPE_DECL) +- { +- warning (OPT_Wattributes, "%qE attribute only applies to functions", +- name); +- *no_add_attrs = true; +- return NULL_TREE; +- } +- if (TARGET_64BIT) +- { +- warning (OPT_Wattributes, "%qE attribute only available for 32-bit", +- name); +- *no_add_attrs = true; +- return NULL_TREE; +- } +- if (is_attribute_p ("callee_pop_aggregate_return", name)) +- { +- tree cst; +- +- cst = TREE_VALUE (args); +- if (TREE_CODE (cst) != INTEGER_CST) +- { +- warning (OPT_Wattributes, +- "%qE attribute requires an integer constant argument", +- name); +- *no_add_attrs = true; +- } +- else if (compare_tree_int (cst, 0) != 0 +- && compare_tree_int (cst, 1) != 0) +- { +- warning (OPT_Wattributes, +- "argument to %qE attribute is neither zero, nor one", +- name); +- *no_add_attrs = true; +- } +- +- return NULL_TREE; +- } +- +- return NULL_TREE; +-} +- +-/* Handle a "ms_abi" or "sysv" attribute; arguments as in +- struct attribute_spec.handler. 
*/ +-static tree +-ix86_handle_abi_attribute (tree *node, tree name, tree, int, +- bool *no_add_attrs) +-{ +- if (TREE_CODE (*node) != FUNCTION_TYPE +- && TREE_CODE (*node) != METHOD_TYPE +- && TREE_CODE (*node) != FIELD_DECL +- && TREE_CODE (*node) != TYPE_DECL) +- { +- warning (OPT_Wattributes, "%qE attribute only applies to functions", +- name); +- *no_add_attrs = true; +- return NULL_TREE; +- } +- +- /* Can combine regparm with all attributes but fastcall. */ +- if (is_attribute_p ("ms_abi", name)) +- { +- if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node))) +- { +- error ("ms_abi and sysv_abi attributes are not compatible"); +- } +- +- return NULL_TREE; +- } +- else if (is_attribute_p ("sysv_abi", name)) +- { +- if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node))) +- { +- error ("ms_abi and sysv_abi attributes are not compatible"); +- } +- +- return NULL_TREE; +- } +- +- return NULL_TREE; +-} +- +-/* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in +- struct attribute_spec.handler. */ +-static tree +-ix86_handle_struct_attribute (tree *node, tree name, tree, int, +- bool *no_add_attrs) +-{ +- tree *type = NULL; +- if (DECL_P (*node)) +- { +- if (TREE_CODE (*node) == TYPE_DECL) +- type = &TREE_TYPE (*node); +- } +- else +- type = node; +- +- if (!(type && RECORD_OR_UNION_TYPE_P (*type))) +- { +- warning (OPT_Wattributes, "%qE attribute ignored", +- name); +- *no_add_attrs = true; +- } +- +- else if ((is_attribute_p ("ms_struct", name) +- && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type))) +- || ((is_attribute_p ("gcc_struct", name) +- && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type))))) +- { +- warning (OPT_Wattributes, "%qE incompatible attribute ignored", +- name); +- *no_add_attrs = true; +- } +- +- return NULL_TREE; +-} +- +-static tree +-ix86_handle_fndecl_attribute (tree *node, tree name, tree args, int, +- bool *no_add_attrs) +-{ +- if (TREE_CODE (*node) != FUNCTION_DECL) +- { +- warning (OPT_Wattributes, "%qE attribute only applies to functions", +- name); +- *no_add_attrs = true; +- } +- +- if (is_attribute_p ("indirect_branch", name)) +- { +- tree cst = TREE_VALUE (args); +- if (TREE_CODE (cst) != STRING_CST) +- { +- warning (OPT_Wattributes, +- "%qE attribute requires a string constant argument", +- name); +- *no_add_attrs = true; +- } +- else if (strcmp (TREE_STRING_POINTER (cst), "keep") != 0 +- && strcmp (TREE_STRING_POINTER (cst), "thunk") != 0 +- && strcmp (TREE_STRING_POINTER (cst), "thunk-inline") != 0 +- && strcmp (TREE_STRING_POINTER (cst), "thunk-extern") != 0) +- { +- warning (OPT_Wattributes, +- "argument to %qE attribute is not " +- "(keep|thunk|thunk-inline|thunk-extern)", name); +- *no_add_attrs = true; +- } +- } +- +- if (is_attribute_p ("function_return", name)) +- { +- tree cst = TREE_VALUE (args); +- if (TREE_CODE (cst) != STRING_CST) +- { +- warning (OPT_Wattributes, +- "%qE attribute requires a string constant argument", +- name); +- *no_add_attrs = true; +- } +- else if (strcmp (TREE_STRING_POINTER (cst), "keep") != 0 +- && strcmp (TREE_STRING_POINTER (cst), "thunk") != 0 +- && strcmp (TREE_STRING_POINTER (cst), "thunk-inline") != 0 +- && strcmp (TREE_STRING_POINTER (cst), "thunk-extern") != 0) +- { +- warning (OPT_Wattributes, +- "argument to %qE attribute is not " +- "(keep|thunk|thunk-inline|thunk-extern)", name); +- *no_add_attrs = true; +- } +- } +- +- return NULL_TREE; +-} +- +-static tree +-ix86_handle_no_caller_saved_registers_attribute (tree *, tree, tree, +- int, bool *) +-{ +- return NULL_TREE; +-} 
+- +-static tree +-ix86_handle_interrupt_attribute (tree *node, tree, tree, int, bool *) +-{ +- /* DECL_RESULT and DECL_ARGUMENTS do not exist there yet, +- but the function type contains args and return type data. */ +- tree func_type = *node; +- tree return_type = TREE_TYPE (func_type); +- +- int nargs = 0; +- tree current_arg_type = TYPE_ARG_TYPES (func_type); +- while (current_arg_type +- && ! VOID_TYPE_P (TREE_VALUE (current_arg_type))) +- { +- if (nargs == 0) +- { +- if (! POINTER_TYPE_P (TREE_VALUE (current_arg_type))) +- error ("interrupt service routine should have a pointer " +- "as the first argument"); +- } +- else if (nargs == 1) +- { +- if (TREE_CODE (TREE_VALUE (current_arg_type)) != INTEGER_TYPE +- || TYPE_MODE (TREE_VALUE (current_arg_type)) != word_mode) +- error ("interrupt service routine should have %qs " +- "as the second argument", +- TARGET_64BIT +- ? (TARGET_X32 ? "unsigned long long int" +- : "unsigned long int") +- : "unsigned int"); +- } +- nargs++; +- current_arg_type = TREE_CHAIN (current_arg_type); +- } +- if (!nargs || nargs > 2) +- error ("interrupt service routine can only have a pointer argument " +- "and an optional integer argument"); +- if (! VOID_TYPE_P (return_type)) +- error ("interrupt service routine can%'t have non-void return value"); +- +- return NULL_TREE; +-} +- +-static bool +-ix86_ms_bitfield_layout_p (const_tree record_type) +-{ +- return ((TARGET_MS_BITFIELD_LAYOUT +- && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type))) +- || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type))); +-} +- +-/* Returns an expression indicating where the this parameter is +- located on entry to the FUNCTION. */ +- +-static rtx +-x86_this_parameter (tree function) +-{ +- tree type = TREE_TYPE (function); +- bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0; +- int nregs; +- +- if (TARGET_64BIT) +- { +- const int *parm_regs; +- +- if (ix86_function_type_abi (type) == MS_ABI) +- parm_regs = x86_64_ms_abi_int_parameter_registers; +- else +- parm_regs = x86_64_int_parameter_registers; +- return gen_rtx_REG (Pmode, parm_regs[aggr]); +- } +- +- nregs = ix86_function_regparm (type, function); +- +- if (nregs > 0 && !stdarg_p (type)) +- { +- int regno; +- unsigned int ccvt = ix86_get_callcvt (type); +- +- if ((ccvt & IX86_CALLCVT_FASTCALL) != 0) +- regno = aggr ? DX_REG : CX_REG; +- else if ((ccvt & IX86_CALLCVT_THISCALL) != 0) +- { +- regno = CX_REG; +- if (aggr) +- return gen_rtx_MEM (SImode, +- plus_constant (Pmode, stack_pointer_rtx, 4)); +- } +- else +- { +- regno = AX_REG; +- if (aggr) +- { +- regno = DX_REG; +- if (nregs == 1) +- return gen_rtx_MEM (SImode, +- plus_constant (Pmode, +- stack_pointer_rtx, 4)); +- } +- } +- return gen_rtx_REG (SImode, regno); +- } +- +- return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx, +- aggr ? 8 : 4)); +-} +- +-/* Determine whether x86_output_mi_thunk can succeed. */ +- +-static bool +-x86_can_output_mi_thunk (const_tree, HOST_WIDE_INT, HOST_WIDE_INT vcall_offset, +- const_tree function) +-{ +- /* 64-bit can handle anything. */ +- if (TARGET_64BIT) +- return true; +- +- /* For 32-bit, everything's fine if we have one free register. */ +- if (ix86_function_regparm (TREE_TYPE (function), function) < 3) +- return true; +- +- /* Need a free register for vcall_offset. */ +- if (vcall_offset) +- return false; +- +- /* Need a free register for GOT references. */ +- if (flag_pic && !targetm.binds_local_p (function)) +- return false; +- +- /* Otherwise ok. 
*/ +- return true; +-} +- +-/* Output the assembler code for a thunk function. THUNK_DECL is the +- declaration for the thunk function itself, FUNCTION is the decl for +- the target function. DELTA is an immediate constant offset to be +- added to THIS. If VCALL_OFFSET is nonzero, the word at +- *(*this + vcall_offset) should be added to THIS. */ +- +-static void +-x86_output_mi_thunk (FILE *file, tree, HOST_WIDE_INT delta, +- HOST_WIDE_INT vcall_offset, tree function) +-{ +- rtx this_param = x86_this_parameter (function); +- rtx this_reg, tmp, fnaddr; +- unsigned int tmp_regno; +- rtx_insn *insn; +- +- if (TARGET_64BIT) +- tmp_regno = R10_REG; +- else +- { +- unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function)); +- if ((ccvt & IX86_CALLCVT_FASTCALL) != 0) +- tmp_regno = AX_REG; +- else if ((ccvt & IX86_CALLCVT_THISCALL) != 0) +- tmp_regno = DX_REG; +- else +- tmp_regno = CX_REG; +- } +- +- emit_note (NOTE_INSN_PROLOGUE_END); +- +- /* CET is enabled, insert EB instruction. */ +- if ((flag_cf_protection & CF_BRANCH)) +- emit_insn (gen_nop_endbr ()); +- +- /* If VCALL_OFFSET, we'll need THIS in a register. Might as well +- pull it in now and let DELTA benefit. */ +- if (REG_P (this_param)) +- this_reg = this_param; +- else if (vcall_offset) +- { +- /* Put the this parameter into %eax. */ +- this_reg = gen_rtx_REG (Pmode, AX_REG); +- emit_move_insn (this_reg, this_param); +- } +- else +- this_reg = NULL_RTX; +- +- /* Adjust the this parameter by a fixed constant. */ +- if (delta) +- { +- rtx delta_rtx = GEN_INT (delta); +- rtx delta_dst = this_reg ? this_reg : this_param; +- +- if (TARGET_64BIT) +- { +- if (!x86_64_general_operand (delta_rtx, Pmode)) +- { +- tmp = gen_rtx_REG (Pmode, tmp_regno); +- emit_move_insn (tmp, delta_rtx); +- delta_rtx = tmp; +- } +- } +- +- ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx); +- } +- +- /* Adjust the this parameter by a value stored in the vtable. */ +- if (vcall_offset) +- { +- rtx vcall_addr, vcall_mem, this_mem; +- +- tmp = gen_rtx_REG (Pmode, tmp_regno); +- +- this_mem = gen_rtx_MEM (ptr_mode, this_reg); +- if (Pmode != ptr_mode) +- this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem); +- emit_move_insn (tmp, this_mem); +- +- /* Adjust the this parameter. */ +- vcall_addr = plus_constant (Pmode, tmp, vcall_offset); +- if (TARGET_64BIT +- && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true)) +- { +- rtx tmp2 = gen_rtx_REG (Pmode, R11_REG); +- emit_move_insn (tmp2, GEN_INT (vcall_offset)); +- vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2); +- } +- +- vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr); +- if (Pmode != ptr_mode) +- emit_insn (gen_addsi_1_zext (this_reg, +- gen_rtx_REG (ptr_mode, +- REGNO (this_reg)), +- vcall_mem)); +- else +- ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem); +- } +- +- /* If necessary, drop THIS back to its stack slot. 
*/ +- if (this_reg && this_reg != this_param) +- emit_move_insn (this_param, this_reg); +- +- fnaddr = XEXP (DECL_RTL (function), 0); +- if (TARGET_64BIT) +- { +- if (!flag_pic || targetm.binds_local_p (function) +- || TARGET_PECOFF) +- ; +- else +- { +- tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL); +- tmp = gen_rtx_CONST (Pmode, tmp); +- fnaddr = gen_const_mem (Pmode, tmp); +- } +- } +- else +- { +- if (!flag_pic || targetm.binds_local_p (function)) +- ; +-#if TARGET_MACHO +- else if (TARGET_MACHO) +- { +- fnaddr = machopic_indirect_call_target (DECL_RTL (function)); +- fnaddr = XEXP (fnaddr, 0); +- } +-#endif /* TARGET_MACHO */ +- else +- { +- tmp = gen_rtx_REG (Pmode, CX_REG); +- output_set_got (tmp, NULL_RTX); +- +- fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT); +- fnaddr = gen_rtx_CONST (Pmode, fnaddr); +- fnaddr = gen_rtx_PLUS (Pmode, tmp, fnaddr); +- fnaddr = gen_const_mem (Pmode, fnaddr); +- } +- } +- +- /* Our sibling call patterns do not allow memories, because we have no +- predicate that can distinguish between frame and non-frame memory. +- For our purposes here, we can get away with (ab)using a jump pattern, +- because we're going to do no optimization. */ +- if (MEM_P (fnaddr)) +- { +- if (sibcall_insn_operand (fnaddr, word_mode)) +- { +- fnaddr = XEXP (DECL_RTL (function), 0); +- tmp = gen_rtx_MEM (QImode, fnaddr); +- tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx); +- tmp = emit_call_insn (tmp); +- SIBLING_CALL_P (tmp) = 1; +- } +- else +- emit_jump_insn (gen_indirect_jump (fnaddr)); +- } +- else +- { +- if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr)) +- { +- // CM_LARGE_PIC always uses pseudo PIC register which is +- // uninitialized. Since FUNCTION is local and calling it +- // doesn't go through PLT, we use scratch register %r11 as +- // PIC register and initialize it here. +- pic_offset_table_rtx = gen_rtx_REG (Pmode, R11_REG); +- ix86_init_large_pic_reg (tmp_regno); +- fnaddr = legitimize_pic_address (fnaddr, +- gen_rtx_REG (Pmode, tmp_regno)); +- } +- +- if (!sibcall_insn_operand (fnaddr, word_mode)) +- { +- tmp = gen_rtx_REG (word_mode, tmp_regno); +- if (GET_MODE (fnaddr) != word_mode) +- fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr); +- emit_move_insn (tmp, fnaddr); +- fnaddr = tmp; +- } +- +- tmp = gen_rtx_MEM (QImode, fnaddr); +- tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx); +- tmp = emit_call_insn (tmp); +- SIBLING_CALL_P (tmp) = 1; +- } +- emit_barrier (); +- +- /* Emit just enough of rest_of_compilation to get the insns emitted. +- Note that use_thunk calls assemble_start_function et al. 
*/ +- insn = get_insns (); +- shorten_branches (insn); +- final_start_function (insn, file, 1); +- final (insn, file, 1); +- final_end_function (); +-} +- +-static void +-x86_file_start (void) +-{ +- default_file_start (); +- if (TARGET_16BIT) +- fputs ("\t.code16gcc\n", asm_out_file); +-#if TARGET_MACHO +- darwin_file_start (); +-#endif +- if (X86_FILE_START_VERSION_DIRECTIVE) +- fputs ("\t.version\t\"01.01\"\n", asm_out_file); +- if (X86_FILE_START_FLTUSED) +- fputs ("\t.global\t__fltused\n", asm_out_file); +- if (ix86_asm_dialect == ASM_INTEL) +- fputs ("\t.intel_syntax noprefix\n", asm_out_file); +-} +- +-int +-x86_field_alignment (tree type, int computed) +-{ +- machine_mode mode; +- +- if (TARGET_64BIT || TARGET_ALIGN_DOUBLE) +- return computed; +- if (TARGET_IAMCU) +- return iamcu_alignment (type, computed); +- mode = TYPE_MODE (strip_array_types (type)); +- if (mode == DFmode || mode == DCmode +- || GET_MODE_CLASS (mode) == MODE_INT +- || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT) +- return MIN (32, computed); +- return computed; +-} +- +-/* Print call to TARGET to FILE. */ +- +-static void +-x86_print_call_or_nop (FILE *file, const char *target) +-{ +- if (flag_nop_mcount || !strcmp (target, "nop")) +- /* 5 byte nop: nopl 0(%[re]ax,%[re]ax,1) */ +- fprintf (file, "1:" ASM_BYTE "0x0f, 0x1f, 0x44, 0x00, 0x00\n"); +- else +- fprintf (file, "1:\tcall\t%s\n", target); +-} +- +-static bool +-current_fentry_name (const char **name) +-{ +- tree attr = lookup_attribute ("fentry_name", +- DECL_ATTRIBUTES (current_function_decl)); +- if (!attr) +- return false; +- *name = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (attr))); +- return true; +-} +- +-static bool +-current_fentry_section (const char **name) +-{ +- tree attr = lookup_attribute ("fentry_section", +- DECL_ATTRIBUTES (current_function_decl)); +- if (!attr) +- return false; +- *name = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (attr))); +- return true; +-} +- +-/* Output assembler code to FILE to increment profiler label # LABELNO +- for profiling a function entry. */ +-void +-x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED) +-{ +- if (cfun->machine->endbr_queued_at_entrance) +- fprintf (file, "\t%s\n", TARGET_64BIT ? 
"endbr64" : "endbr32"); +- +- const char *mcount_name = MCOUNT_NAME; +- +- if (current_fentry_name (&mcount_name)) +- ; +- else if (fentry_name) +- mcount_name = fentry_name; +- else if (flag_fentry) +- mcount_name = MCOUNT_NAME_BEFORE_PROLOGUE; +- +- if (TARGET_64BIT) +- { +-#ifndef NO_PROFILE_COUNTERS +- fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno); +-#endif +- +- if (!TARGET_PECOFF && flag_pic) +- fprintf (file, "1:\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name); +- else +- x86_print_call_or_nop (file, mcount_name); +- } +- else if (flag_pic) +- { +-#ifndef NO_PROFILE_COUNTERS +- fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n", +- LPREFIX, labelno); +-#endif +- fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name); +- } +- else +- { +-#ifndef NO_PROFILE_COUNTERS +- fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n", +- LPREFIX, labelno); +-#endif +- x86_print_call_or_nop (file, mcount_name); +- } +- +- if (flag_record_mcount +- || lookup_attribute ("fentry_section", +- DECL_ATTRIBUTES (current_function_decl))) +- { +- const char *sname = "__mcount_loc"; +- +- if (current_fentry_section (&sname)) +- ; +- else if (fentry_section) +- sname = fentry_section; +- +- fprintf (file, "\t.section %s, \"a\",@progbits\n", sname); +- fprintf (file, "\t.%s 1b\n", TARGET_64BIT ? "quad" : "long"); +- fprintf (file, "\t.previous\n"); +- } +-} +- +-/* We don't have exact information about the insn sizes, but we may assume +- quite safely that we are informed about all 1 byte insns and memory +- address sizes. This is enough to eliminate unnecessary padding in +- 99% of cases. */ +- +-int +-ix86_min_insn_size (rtx_insn *insn) +-{ +- int l = 0, len; +- +- if (!INSN_P (insn) || !active_insn_p (insn)) +- return 0; +- +- /* Discard alignments we've emit and jump instructions. */ +- if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE +- && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN) +- return 0; +- +- /* Important case - calls are always 5 bytes. +- It is common to have many calls in the row. */ +- if (CALL_P (insn) +- && symbolic_reference_mentioned_p (PATTERN (insn)) +- && !SIBLING_CALL_P (insn)) +- return 5; +- len = get_attr_length (insn); +- if (len <= 1) +- return 1; +- +- /* For normal instructions we rely on get_attr_length being exact, +- with a few exceptions. */ +- if (!JUMP_P (insn)) +- { +- enum attr_type type = get_attr_type (insn); +- +- switch (type) +- { +- case TYPE_MULTI: +- if (GET_CODE (PATTERN (insn)) == ASM_INPUT +- || asm_noperands (PATTERN (insn)) >= 0) +- return 0; +- break; +- case TYPE_OTHER: +- case TYPE_FCMP: +- break; +- default: +- /* Otherwise trust get_attr_length. */ +- return len; +- } +- +- l = get_attr_length_address (insn); +- if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn))) +- l = 4; +- } +- if (l) +- return 1+l; +- else +- return 2; +-} +- +-#ifdef ASM_OUTPUT_MAX_SKIP_PAD +- +-/* AMD K8 core mispredicts jumps when there are more than 3 jumps in 16 byte +- window. */ +- +-static void +-ix86_avoid_jump_mispredicts (void) +-{ +- rtx_insn *insn, *start = get_insns (); +- int nbytes = 0, njumps = 0; +- bool isjump = false; +- +- /* Look for all minimal intervals of instructions containing 4 jumps. +- The intervals are bounded by START and INSN. NBYTES is the total +- size of instructions in the interval including INSN and not including +- START. When the NBYTES is smaller than 16 bytes, it is possible +- that the end of START and INSN ends up in the same 16byte page. 
+- +- The smallest offset in the page INSN can start is the case where START +- ends on the offset 0. Offset of INSN is then NBYTES - sizeof (INSN). +- We add p2align to 16byte window with maxskip 15 - NBYTES + sizeof (INSN). +- +- Don't consider asm goto as jump, while it can contain a jump, it doesn't +- have to, control transfer to label(s) can be performed through other +- means, and also we estimate minimum length of all asm stmts as 0. */ +- for (insn = start; insn; insn = NEXT_INSN (insn)) +- { +- int min_size; +- +- if (LABEL_P (insn)) +- { +- align_flags alignment = label_to_alignment (insn); +- int align = alignment.levels[0].log; +- int max_skip = alignment.levels[0].maxskip; +- +- if (max_skip > 15) +- max_skip = 15; +- /* If align > 3, only up to 16 - max_skip - 1 bytes can be +- already in the current 16 byte page, because otherwise +- ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer +- bytes to reach 16 byte boundary. */ +- if (align <= 0 +- || (align <= 3 && max_skip != (1 << align) - 1)) +- max_skip = 0; +- if (dump_file) +- fprintf (dump_file, "Label %i with max_skip %i\n", +- INSN_UID (insn), max_skip); +- if (max_skip) +- { +- while (nbytes + max_skip >= 16) +- { +- start = NEXT_INSN (start); +- if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0) +- || CALL_P (start)) +- njumps--, isjump = true; +- else +- isjump = false; +- nbytes -= ix86_min_insn_size (start); +- } +- } +- continue; +- } +- +- min_size = ix86_min_insn_size (insn); +- nbytes += min_size; +- if (dump_file) +- fprintf (dump_file, "Insn %i estimated to %i bytes\n", +- INSN_UID (insn), min_size); +- if ((JUMP_P (insn) && asm_noperands (PATTERN (insn)) < 0) +- || CALL_P (insn)) +- njumps++; +- else +- continue; +- +- while (njumps > 3) +- { +- start = NEXT_INSN (start); +- if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0) +- || CALL_P (start)) +- njumps--, isjump = true; +- else +- isjump = false; +- nbytes -= ix86_min_insn_size (start); +- } +- gcc_assert (njumps >= 0); +- if (dump_file) +- fprintf (dump_file, "Interval %i to %i has %i bytes\n", +- INSN_UID (start), INSN_UID (insn), nbytes); +- +- if (njumps == 3 && isjump && nbytes < 16) +- { +- int padsize = 15 - nbytes + ix86_min_insn_size (insn); +- +- if (dump_file) +- fprintf (dump_file, "Padding insn %i by %i bytes!\n", +- INSN_UID (insn), padsize); +- emit_insn_before (gen_pad (GEN_INT (padsize)), insn); +- } +- } +-} +-#endif +- +-/* AMD Athlon works faster +- when RET is not destination of conditional jump or directly preceded +- by other jump instruction. We avoid the penalty by inserting NOP just +- before the RET instructions in such cases. 
*/ +-static void +-ix86_pad_returns (void) +-{ +- edge e; +- edge_iterator ei; +- +- FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds) +- { +- basic_block bb = e->src; +- rtx_insn *ret = BB_END (bb); +- rtx_insn *prev; +- bool replace = false; +- +- if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret)) +- || optimize_bb_for_size_p (bb)) +- continue; +- for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev)) +- if (active_insn_p (prev) || LABEL_P (prev)) +- break; +- if (prev && LABEL_P (prev)) +- { +- edge e; +- edge_iterator ei; +- +- FOR_EACH_EDGE (e, ei, bb->preds) +- if (EDGE_FREQUENCY (e) && e->src->index >= 0 +- && !(e->flags & EDGE_FALLTHRU)) +- { +- replace = true; +- break; +- } +- } +- if (!replace) +- { +- prev = prev_active_insn (ret); +- if (prev +- && ((JUMP_P (prev) && any_condjump_p (prev)) +- || CALL_P (prev))) +- replace = true; +- /* Empty functions get branch mispredict even when +- the jump destination is not visible to us. */ +- if (!prev && !optimize_function_for_size_p (cfun)) +- replace = true; +- } +- if (replace) +- { +- emit_jump_insn_before (gen_simple_return_internal_long (), ret); +- delete_insn (ret); +- } +- } +-} +- +-/* Count the minimum number of instructions in BB. Return 4 if the +- number of instructions >= 4. */ +- +-static int +-ix86_count_insn_bb (basic_block bb) +-{ +- rtx_insn *insn; +- int insn_count = 0; +- +- /* Count number of instructions in this block. Return 4 if the number +- of instructions >= 4. */ +- FOR_BB_INSNS (bb, insn) +- { +- /* Only happen in exit blocks. */ +- if (JUMP_P (insn) +- && ANY_RETURN_P (PATTERN (insn))) +- break; +- +- if (NONDEBUG_INSN_P (insn) +- && GET_CODE (PATTERN (insn)) != USE +- && GET_CODE (PATTERN (insn)) != CLOBBER) +- { +- insn_count++; +- if (insn_count >= 4) +- return insn_count; +- } +- } +- +- return insn_count; +-} +- +- +-/* Count the minimum number of instructions in code path in BB. +- Return 4 if the number of instructions >= 4. */ +- +-static int +-ix86_count_insn (basic_block bb) +-{ +- edge e; +- edge_iterator ei; +- int min_prev_count; +- +- /* Only bother counting instructions along paths with no +- more than 2 basic blocks between entry and exit. Given +- that BB has an edge to exit, determine if a predecessor +- of BB has an edge from entry. If so, compute the number +- of instructions in the predecessor block. If there +- happen to be multiple such blocks, compute the minimum. */ +- min_prev_count = 4; +- FOR_EACH_EDGE (e, ei, bb->preds) +- { +- edge prev_e; +- edge_iterator prev_ei; +- +- if (e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun)) +- { +- min_prev_count = 0; +- break; +- } +- FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds) +- { +- if (prev_e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun)) +- { +- int count = ix86_count_insn_bb (e->src); +- if (count < min_prev_count) +- min_prev_count = count; +- break; +- } +- } +- } +- +- if (min_prev_count < 4) +- min_prev_count += ix86_count_insn_bb (bb); +- +- return min_prev_count; +-} +- +-/* Pad short function to 4 instructions. */ +- +-static void +-ix86_pad_short_function (void) +-{ +- edge e; +- edge_iterator ei; +- +- FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds) +- { +- rtx_insn *ret = BB_END (e->src); +- if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret))) +- { +- int insn_count = ix86_count_insn (e->src); +- +- /* Pad short function. */ +- if (insn_count < 4) +- { +- rtx_insn *insn = ret; +- +- /* Find epilogue. 
*/ +- while (insn +- && (!NOTE_P (insn) +- || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG)) +- insn = PREV_INSN (insn); +- +- if (!insn) +- insn = ret; +- +- /* Two NOPs count as one instruction. */ +- insn_count = 2 * (4 - insn_count); +- emit_insn_before (gen_nops (GEN_INT (insn_count)), insn); +- } +- } +- } +-} +- +-/* Fix up a Windows system unwinder issue. If an EH region falls through into +- the epilogue, the Windows system unwinder will apply epilogue logic and +- produce incorrect offsets. This can be avoided by adding a nop between +- the last insn that can throw and the first insn of the epilogue. */ +- +-static void +-ix86_seh_fixup_eh_fallthru (void) +-{ +- edge e; +- edge_iterator ei; +- +- FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds) +- { +- rtx_insn *insn, *next; +- +- /* Find the beginning of the epilogue. */ +- for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn)) +- if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG) +- break; +- if (insn == NULL) +- continue; +- +- /* We only care about preceding insns that can throw. */ +- insn = prev_active_insn (insn); +- if (insn == NULL || !can_throw_internal (insn)) +- continue; +- +- /* Do not separate calls from their debug information. */ +- for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next)) +- if (NOTE_P (next) && NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION) +- insn = next; +- else +- break; +- +- emit_insn_after (gen_nops (const1_rtx), insn); +- } +-} +- +-/* Implement machine specific optimizations. We implement padding of returns +- for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window. */ +-static void +-ix86_reorg (void) +-{ +- /* We are freeing block_for_insn in the toplev to keep compatibility +- with old MDEP_REORGS that are not CFG based. Recompute it now. */ +- compute_bb_for_insn (); +- +- if (TARGET_SEH && current_function_has_exception_handlers ()) +- ix86_seh_fixup_eh_fallthru (); +- +- if (optimize && optimize_function_for_speed_p (cfun)) +- { +- if (TARGET_PAD_SHORT_FUNCTION) +- ix86_pad_short_function (); +- else if (TARGET_PAD_RETURNS) +- ix86_pad_returns (); +-#ifdef ASM_OUTPUT_MAX_SKIP_PAD +- if (TARGET_FOUR_JUMP_LIMIT) +- ix86_avoid_jump_mispredicts (); +-#endif +- } +-} +- +-/* Return nonzero when QImode register that must be represented via REX prefix +- is used. */ +-bool +-x86_extended_QIreg_mentioned_p (rtx_insn *insn) +-{ +- int i; +- extract_insn_cached (insn); +- for (i = 0; i < recog_data.n_operands; i++) +- if (GENERAL_REG_P (recog_data.operand[i]) +- && !QI_REGNO_P (REGNO (recog_data.operand[i]))) +- return true; +- return false; +-} +- +-/* Return true when INSN mentions register that must be encoded using REX +- prefix. */ +-bool +-x86_extended_reg_mentioned_p (rtx insn) +-{ +- subrtx_iterator::array_type array; +- FOR_EACH_SUBRTX (iter, array, INSN_P (insn) ? PATTERN (insn) : insn, NONCONST) +- { +- const_rtx x = *iter; +- if (REG_P (x) +- && (REX_INT_REGNO_P (REGNO (x)) || REX_SSE_REGNO_P (REGNO (x)))) +- return true; +- } +- return false; +-} +- +-/* If profitable, negate (without causing overflow) integer constant +- of mode MODE at location LOC. Return true in this case. */ +-bool +-x86_maybe_negate_const_int (rtx *loc, machine_mode mode) +-{ +- HOST_WIDE_INT val; +- +- if (!CONST_INT_P (*loc)) +- return false; +- +- switch (mode) +- { +- case E_DImode: +- /* DImode x86_64 constants must fit in 32 bits. 
*/ +- gcc_assert (x86_64_immediate_operand (*loc, mode)); +- +- mode = SImode; +- break; +- +- case E_SImode: +- case E_HImode: +- case E_QImode: +- break; +- +- default: +- gcc_unreachable (); +- } +- +- /* Avoid overflows. */ +- if (mode_signbit_p (mode, *loc)) +- return false; +- +- val = INTVAL (*loc); +- +- /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'. +- Exceptions: -128 encodes smaller than 128, so swap sign and op. */ +- if ((val < 0 && val != -128) +- || val == 128) +- { +- *loc = GEN_INT (-val); +- return true; +- } +- +- return false; +-} +- +-/* Generate an unsigned DImode/SImode to FP conversion. This is the same code +- optabs would emit if we didn't have TFmode patterns. */ +- +-void +-x86_emit_floatuns (rtx operands[2]) +-{ +- rtx_code_label *neglab, *donelab; +- rtx i0, i1, f0, in, out; +- machine_mode mode, inmode; +- +- inmode = GET_MODE (operands[1]); +- gcc_assert (inmode == SImode || inmode == DImode); +- +- out = operands[0]; +- in = force_reg (inmode, operands[1]); +- mode = GET_MODE (out); +- neglab = gen_label_rtx (); +- donelab = gen_label_rtx (); +- f0 = gen_reg_rtx (mode); +- +- emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab); +- +- expand_float (out, in, 0); +- +- emit_jump_insn (gen_jump (donelab)); +- emit_barrier (); +- +- emit_label (neglab); +- +- i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL, +- 1, OPTAB_DIRECT); +- i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL, +- 1, OPTAB_DIRECT); +- i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT); +- +- expand_float (f0, i0, 0); +- +- emit_insn (gen_rtx_SET (out, gen_rtx_PLUS (mode, f0, f0))); +- +- emit_label (donelab); +-} +- +-static bool canonicalize_perm (struct expand_vec_perm_d *d); +-static bool expand_vec_perm_1 (struct expand_vec_perm_d *d); +-static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d); +-static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool); +- +-/* Get a vector mode of the same size as the original but with elements +- twice as wide. This is only guaranteed to apply to integral vectors. */ +- +-static inline machine_mode +-get_mode_wider_vector (machine_mode o) +-{ +- /* ??? Rely on the ordering that genmodes.c gives to vectors. */ +- machine_mode n = GET_MODE_WIDER_MODE (o).require (); +- gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2); +- gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n)); +- return n; +-} +- +-/* A subroutine of ix86_expand_vector_init_duplicate. Tries to +- fill target with val via vec_duplicate. */ +- +-static bool +-ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val) +-{ +- bool ok; +- rtx_insn *insn; +- rtx dup; +- +- /* First attempt to recognize VAL as-is. */ +- dup = gen_vec_duplicate (mode, val); +- insn = emit_insn (gen_rtx_SET (target, dup)); +- if (recog_memoized (insn) < 0) +- { +- rtx_insn *seq; +- machine_mode innermode = GET_MODE_INNER (mode); +- rtx reg; +- +- /* If that fails, force VAL into a register. */ +- +- start_sequence (); +- reg = force_reg (innermode, val); +- if (GET_MODE (reg) != innermode) +- reg = gen_lowpart (innermode, reg); +- SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg); +- seq = get_insns (); +- end_sequence (); +- if (seq) +- emit_insn_before (seq, insn); +- +- ok = recog_memoized (insn) >= 0; +- gcc_assert (ok); +- } +- return true; +-} +- +-/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector +- with all elements equal to VAR. 
Return true if successful. */ +- +-static bool +-ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode, +- rtx target, rtx val) +-{ +- bool ok; +- +- switch (mode) +- { +- case E_V2SImode: +- case E_V2SFmode: +- if (!mmx_ok) +- return false; +- /* FALLTHRU */ +- +- case E_V4DFmode: +- case E_V4DImode: +- case E_V8SFmode: +- case E_V8SImode: +- case E_V2DFmode: +- case E_V2DImode: +- case E_V4SFmode: +- case E_V4SImode: +- case E_V16SImode: +- case E_V8DImode: +- case E_V16SFmode: +- case E_V8DFmode: +- return ix86_vector_duplicate_value (mode, target, val); +- +- case E_V4HImode: +- if (!mmx_ok) +- return false; +- if (TARGET_SSE || TARGET_3DNOW_A) +- { +- rtx x; +- +- val = gen_lowpart (SImode, val); +- x = gen_rtx_TRUNCATE (HImode, val); +- x = gen_rtx_VEC_DUPLICATE (mode, x); +- emit_insn (gen_rtx_SET (target, x)); +- return true; +- } +- goto widen; +- +- case E_V8QImode: +- if (!mmx_ok) +- return false; +- goto widen; +- +- case E_V8HImode: +- if (TARGET_AVX2) +- return ix86_vector_duplicate_value (mode, target, val); +- +- if (TARGET_SSE2) +- { +- struct expand_vec_perm_d dperm; +- rtx tmp1, tmp2; +- +- permute: +- memset (&dperm, 0, sizeof (dperm)); +- dperm.target = target; +- dperm.vmode = mode; +- dperm.nelt = GET_MODE_NUNITS (mode); +- dperm.op0 = dperm.op1 = gen_reg_rtx (mode); +- dperm.one_operand_p = true; +- +- /* Extend to SImode using a paradoxical SUBREG. */ +- tmp1 = gen_reg_rtx (SImode); +- emit_move_insn (tmp1, gen_lowpart (SImode, val)); +- +- /* Insert the SImode value as low element of a V4SImode vector. */ +- tmp2 = gen_reg_rtx (V4SImode); +- emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1)); +- emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2)); +- +- ok = (expand_vec_perm_1 (&dperm) +- || expand_vec_perm_broadcast_1 (&dperm)); +- gcc_assert (ok); +- return ok; +- } +- goto widen; +- +- case E_V16QImode: +- if (TARGET_AVX2) +- return ix86_vector_duplicate_value (mode, target, val); +- +- if (TARGET_SSE2) +- goto permute; +- goto widen; +- +- widen: +- /* Replicate the value once into the next wider mode and recurse. */ +- { +- machine_mode smode, wsmode, wvmode; +- rtx x; +- +- smode = GET_MODE_INNER (mode); +- wvmode = get_mode_wider_vector (mode); +- wsmode = GET_MODE_INNER (wvmode); +- +- val = convert_modes (wsmode, smode, val, true); +- x = expand_simple_binop (wsmode, ASHIFT, val, +- GEN_INT (GET_MODE_BITSIZE (smode)), +- NULL_RTX, 1, OPTAB_LIB_WIDEN); +- val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN); +- +- x = gen_reg_rtx (wvmode); +- ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val); +- gcc_assert (ok); +- emit_move_insn (target, gen_lowpart (GET_MODE (target), x)); +- return ok; +- } +- +- case E_V16HImode: +- case E_V32QImode: +- if (TARGET_AVX2) +- return ix86_vector_duplicate_value (mode, target, val); +- else +- { +- machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode); +- rtx x = gen_reg_rtx (hvmode); +- +- ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val); +- gcc_assert (ok); +- +- x = gen_rtx_VEC_CONCAT (mode, x, x); +- emit_insn (gen_rtx_SET (target, x)); +- } +- return true; +- +- case E_V64QImode: +- case E_V32HImode: +- if (TARGET_AVX512BW) +- return ix86_vector_duplicate_value (mode, target, val); +- else +- { +- machine_mode hvmode = (mode == V32HImode ? 
V16HImode : V32QImode); +- rtx x = gen_reg_rtx (hvmode); +- +- ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val); +- gcc_assert (ok); +- +- x = gen_rtx_VEC_CONCAT (mode, x, x); +- emit_insn (gen_rtx_SET (target, x)); +- } +- return true; +- +- default: +- return false; +- } +-} +- +-/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector +- whose ONE_VAR element is VAR, and other elements are zero. Return true +- if successful. */ +- +-static bool +-ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode, +- rtx target, rtx var, int one_var) +-{ +- machine_mode vsimode; +- rtx new_target; +- rtx x, tmp; +- bool use_vector_set = false; +- rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL; +- +- switch (mode) +- { +- case E_V2DImode: +- /* For SSE4.1, we normally use vector set. But if the second +- element is zero and inter-unit moves are OK, we use movq +- instead. */ +- use_vector_set = (TARGET_64BIT && TARGET_SSE4_1 +- && !(TARGET_INTER_UNIT_MOVES_TO_VEC +- && one_var == 0)); +- break; +- case E_V16QImode: +- case E_V4SImode: +- case E_V4SFmode: +- use_vector_set = TARGET_SSE4_1; +- break; +- case E_V8HImode: +- use_vector_set = TARGET_SSE2; +- break; +- case E_V4HImode: +- use_vector_set = TARGET_SSE || TARGET_3DNOW_A; +- break; +- case E_V32QImode: +- case E_V16HImode: +- use_vector_set = TARGET_AVX; +- break; +- case E_V8SImode: +- use_vector_set = TARGET_AVX; +- gen_vec_set_0 = gen_vec_setv8si_0; +- break; +- case E_V8SFmode: +- use_vector_set = TARGET_AVX; +- gen_vec_set_0 = gen_vec_setv8sf_0; +- break; +- case E_V4DFmode: +- use_vector_set = TARGET_AVX; +- gen_vec_set_0 = gen_vec_setv4df_0; +- break; +- case E_V4DImode: +- /* Use ix86_expand_vector_set in 64bit mode only. */ +- use_vector_set = TARGET_AVX && TARGET_64BIT; +- gen_vec_set_0 = gen_vec_setv4di_0; +- break; +- case E_V16SImode: +- use_vector_set = TARGET_AVX512F && one_var == 0; +- gen_vec_set_0 = gen_vec_setv16si_0; +- break; +- case E_V16SFmode: +- use_vector_set = TARGET_AVX512F && one_var == 0; +- gen_vec_set_0 = gen_vec_setv16sf_0; +- break; +- case E_V8DFmode: +- use_vector_set = TARGET_AVX512F && one_var == 0; +- gen_vec_set_0 = gen_vec_setv8df_0; +- break; +- case E_V8DImode: +- /* Use ix86_expand_vector_set in 64bit mode only. 
*/ +- use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0; +- gen_vec_set_0 = gen_vec_setv8di_0; +- break; +- default: +- break; +- } +- +- if (use_vector_set) +- { +- if (gen_vec_set_0 && one_var == 0) +- { +- var = force_reg (GET_MODE_INNER (mode), var); +- emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var)); +- return true; +- } +- emit_insn (gen_rtx_SET (target, CONST0_RTX (mode))); +- var = force_reg (GET_MODE_INNER (mode), var); +- ix86_expand_vector_set (mmx_ok, target, var, one_var); +- return true; +- } +- +- switch (mode) +- { +- case E_V2SFmode: +- case E_V2SImode: +- if (!mmx_ok) +- return false; +- /* FALLTHRU */ +- +- case E_V2DFmode: +- case E_V2DImode: +- if (one_var != 0) +- return false; +- var = force_reg (GET_MODE_INNER (mode), var); +- x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode))); +- emit_insn (gen_rtx_SET (target, x)); +- return true; +- +- case E_V4SFmode: +- case E_V4SImode: +- if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER) +- new_target = gen_reg_rtx (mode); +- else +- new_target = target; +- var = force_reg (GET_MODE_INNER (mode), var); +- x = gen_rtx_VEC_DUPLICATE (mode, var); +- x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx); +- emit_insn (gen_rtx_SET (new_target, x)); +- if (one_var != 0) +- { +- /* We need to shuffle the value to the correct position, so +- create a new pseudo to store the intermediate result. */ +- +- /* With SSE2, we can use the integer shuffle insns. */ +- if (mode != V4SFmode && TARGET_SSE2) +- { +- emit_insn (gen_sse2_pshufd_1 (new_target, new_target, +- const1_rtx, +- GEN_INT (one_var == 1 ? 0 : 1), +- GEN_INT (one_var == 2 ? 0 : 1), +- GEN_INT (one_var == 3 ? 0 : 1))); +- if (target != new_target) +- emit_move_insn (target, new_target); +- return true; +- } +- +- /* Otherwise convert the intermediate result to V4SFmode and +- use the SSE1 shuffle instructions. */ +- if (mode != V4SFmode) +- { +- tmp = gen_reg_rtx (V4SFmode); +- emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target)); +- } +- else +- tmp = new_target; +- +- emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp, +- const1_rtx, +- GEN_INT (one_var == 1 ? 0 : 1), +- GEN_INT (one_var == 2 ? 0+4 : 1+4), +- GEN_INT (one_var == 3 ? 0+4 : 1+4))); +- +- if (mode != V4SFmode) +- emit_move_insn (target, gen_lowpart (V4SImode, tmp)); +- else if (tmp != target) +- emit_move_insn (target, tmp); +- } +- else if (target != new_target) +- emit_move_insn (target, new_target); +- return true; +- +- case E_V8HImode: +- case E_V16QImode: +- vsimode = V4SImode; +- goto widen; +- case E_V4HImode: +- case E_V8QImode: +- if (!mmx_ok) +- return false; +- vsimode = V2SImode; +- goto widen; +- widen: +- if (one_var != 0) +- return false; +- +- /* Zero extend the variable element to SImode and recurse. */ +- var = convert_modes (SImode, GET_MODE_INNER (mode), var, true); +- +- x = gen_reg_rtx (vsimode); +- if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x, +- var, one_var)) +- gcc_unreachable (); +- +- emit_move_insn (target, gen_lowpart (mode, x)); +- return true; +- +- default: +- return false; +- } +-} +- +-/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector +- consisting of the values in VALS. It is known that all elements +- except ONE_VAR are constants. Return true if successful. 
*/ +- +-static bool +-ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode, +- rtx target, rtx vals, int one_var) +-{ +- rtx var = XVECEXP (vals, 0, one_var); +- machine_mode wmode; +- rtx const_vec, x; +- +- const_vec = copy_rtx (vals); +- XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode)); +- const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0)); +- +- switch (mode) +- { +- case E_V2DFmode: +- case E_V2DImode: +- case E_V2SFmode: +- case E_V2SImode: +- /* For the two element vectors, it's just as easy to use +- the general case. */ +- return false; +- +- case E_V4DImode: +- /* Use ix86_expand_vector_set in 64bit mode only. */ +- if (!TARGET_64BIT) +- return false; +- /* FALLTHRU */ +- case E_V4DFmode: +- case E_V8SFmode: +- case E_V8SImode: +- case E_V16HImode: +- case E_V32QImode: +- case E_V4SFmode: +- case E_V4SImode: +- case E_V8HImode: +- case E_V4HImode: +- break; +- +- case E_V16QImode: +- if (TARGET_SSE4_1) +- break; +- wmode = V8HImode; +- goto widen; +- case E_V8QImode: +- wmode = V4HImode; +- goto widen; +- widen: +- /* There's no way to set one QImode entry easily. Combine +- the variable value with its adjacent constant value, and +- promote to an HImode set. */ +- x = XVECEXP (vals, 0, one_var ^ 1); +- if (one_var & 1) +- { +- var = convert_modes (HImode, QImode, var, true); +- var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8), +- NULL_RTX, 1, OPTAB_LIB_WIDEN); +- x = GEN_INT (INTVAL (x) & 0xff); +- } +- else +- { +- var = convert_modes (HImode, QImode, var, true); +- x = gen_int_mode (UINTVAL (x) << 8, HImode); +- } +- if (x != const0_rtx) +- var = expand_simple_binop (HImode, IOR, var, x, var, +- 1, OPTAB_LIB_WIDEN); +- +- x = gen_reg_rtx (wmode); +- emit_move_insn (x, gen_lowpart (wmode, const_vec)); +- ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1); +- +- emit_move_insn (target, gen_lowpart (mode, x)); +- return true; +- +- default: +- return false; +- } +- +- emit_move_insn (target, const_vec); +- ix86_expand_vector_set (mmx_ok, target, var, one_var); +- return true; +-} +- +-/* A subroutine of ix86_expand_vector_init_general. Use vector +- concatenate to handle the most general case: all values variable, +- and none identical. 
*/ +- +-static void +-ix86_expand_vector_init_concat (machine_mode mode, +- rtx target, rtx *ops, int n) +-{ +- machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode; +- rtx first[16], second[8], third[4]; +- rtvec v; +- int i, j; +- +- switch (n) +- { +- case 2: +- switch (mode) +- { +- case E_V16SImode: +- cmode = V8SImode; +- break; +- case E_V16SFmode: +- cmode = V8SFmode; +- break; +- case E_V8DImode: +- cmode = V4DImode; +- break; +- case E_V8DFmode: +- cmode = V4DFmode; +- break; +- case E_V8SImode: +- cmode = V4SImode; +- break; +- case E_V8SFmode: +- cmode = V4SFmode; +- break; +- case E_V4DImode: +- cmode = V2DImode; +- break; +- case E_V4DFmode: +- cmode = V2DFmode; +- break; +- case E_V4SImode: +- cmode = V2SImode; +- break; +- case E_V4SFmode: +- cmode = V2SFmode; +- break; +- case E_V2DImode: +- cmode = DImode; +- break; +- case E_V2SImode: +- cmode = SImode; +- break; +- case E_V2DFmode: +- cmode = DFmode; +- break; +- case E_V2SFmode: +- cmode = SFmode; +- break; +- default: +- gcc_unreachable (); +- } +- +- if (!register_operand (ops[1], cmode)) +- ops[1] = force_reg (cmode, ops[1]); +- if (!register_operand (ops[0], cmode)) +- ops[0] = force_reg (cmode, ops[0]); +- emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0], +- ops[1]))); +- break; +- +- case 4: +- switch (mode) +- { +- case E_V4DImode: +- cmode = V2DImode; +- break; +- case E_V4DFmode: +- cmode = V2DFmode; +- break; +- case E_V4SImode: +- cmode = V2SImode; +- break; +- case E_V4SFmode: +- cmode = V2SFmode; +- break; +- default: +- gcc_unreachable (); +- } +- goto half; +- +- case 8: +- switch (mode) +- { +- case E_V8DImode: +- cmode = V2DImode; +- hmode = V4DImode; +- break; +- case E_V8DFmode: +- cmode = V2DFmode; +- hmode = V4DFmode; +- break; +- case E_V8SImode: +- cmode = V2SImode; +- hmode = V4SImode; +- break; +- case E_V8SFmode: +- cmode = V2SFmode; +- hmode = V4SFmode; +- break; +- default: +- gcc_unreachable (); +- } +- goto half; +- +- case 16: +- switch (mode) +- { +- case E_V16SImode: +- cmode = V2SImode; +- hmode = V4SImode; +- gmode = V8SImode; +- break; +- case E_V16SFmode: +- cmode = V2SFmode; +- hmode = V4SFmode; +- gmode = V8SFmode; +- break; +- default: +- gcc_unreachable (); +- } +- goto half; +- +-half: +- /* FIXME: We process inputs backward to help RA. PR 36222. */ +- i = n - 1; +- j = (n >> 1) - 1; +- for (; i > 0; i -= 2, j--) +- { +- first[j] = gen_reg_rtx (cmode); +- v = gen_rtvec (2, ops[i - 1], ops[i]); +- ix86_expand_vector_init (false, first[j], +- gen_rtx_PARALLEL (cmode, v)); +- } +- +- n >>= 1; +- if (n > 4) +- { +- gcc_assert (hmode != VOIDmode); +- gcc_assert (gmode != VOIDmode); +- for (i = j = 0; i < n; i += 2, j++) +- { +- second[j] = gen_reg_rtx (hmode); +- ix86_expand_vector_init_concat (hmode, second [j], +- &first [i], 2); +- } +- n >>= 1; +- for (i = j = 0; i < n; i += 2, j++) +- { +- third[j] = gen_reg_rtx (gmode); +- ix86_expand_vector_init_concat (gmode, third[j], +- &second[i], 2); +- } +- n >>= 1; +- ix86_expand_vector_init_concat (mode, target, third, n); +- } +- else if (n > 2) +- { +- gcc_assert (hmode != VOIDmode); +- for (i = j = 0; i < n; i += 2, j++) +- { +- second[j] = gen_reg_rtx (hmode); +- ix86_expand_vector_init_concat (hmode, second [j], +- &first [i], 2); +- } +- n >>= 1; +- ix86_expand_vector_init_concat (mode, target, second, n); +- } +- else +- ix86_expand_vector_init_concat (mode, target, first, n); +- break; +- +- default: +- gcc_unreachable (); +- } +-} +- +-/* A subroutine of ix86_expand_vector_init_general. 
Use vector +- interleave to handle the most general case: all values variable, +- and none identical. */ +- +-static void +-ix86_expand_vector_init_interleave (machine_mode mode, +- rtx target, rtx *ops, int n) +-{ +- machine_mode first_imode, second_imode, third_imode, inner_mode; +- int i, j; +- rtx op0, op1; +- rtx (*gen_load_even) (rtx, rtx, rtx); +- rtx (*gen_interleave_first_low) (rtx, rtx, rtx); +- rtx (*gen_interleave_second_low) (rtx, rtx, rtx); +- +- switch (mode) +- { +- case E_V8HImode: +- gen_load_even = gen_vec_setv8hi; +- gen_interleave_first_low = gen_vec_interleave_lowv4si; +- gen_interleave_second_low = gen_vec_interleave_lowv2di; +- inner_mode = HImode; +- first_imode = V4SImode; +- second_imode = V2DImode; +- third_imode = VOIDmode; +- break; +- case E_V16QImode: +- gen_load_even = gen_vec_setv16qi; +- gen_interleave_first_low = gen_vec_interleave_lowv8hi; +- gen_interleave_second_low = gen_vec_interleave_lowv4si; +- inner_mode = QImode; +- first_imode = V8HImode; +- second_imode = V4SImode; +- third_imode = V2DImode; +- break; +- default: +- gcc_unreachable (); +- } +- +- for (i = 0; i < n; i++) +- { +- /* Extend the odd elment to SImode using a paradoxical SUBREG. */ +- op0 = gen_reg_rtx (SImode); +- emit_move_insn (op0, gen_lowpart (SImode, ops [i + i])); +- +- /* Insert the SImode value as low element of V4SImode vector. */ +- op1 = gen_reg_rtx (V4SImode); +- op0 = gen_rtx_VEC_MERGE (V4SImode, +- gen_rtx_VEC_DUPLICATE (V4SImode, +- op0), +- CONST0_RTX (V4SImode), +- const1_rtx); +- emit_insn (gen_rtx_SET (op1, op0)); +- +- /* Cast the V4SImode vector back to a vector in orignal mode. */ +- op0 = gen_reg_rtx (mode); +- emit_move_insn (op0, gen_lowpart (mode, op1)); +- +- /* Load even elements into the second position. */ +- emit_insn (gen_load_even (op0, +- force_reg (inner_mode, +- ops [i + i + 1]), +- const1_rtx)); +- +- /* Cast vector to FIRST_IMODE vector. */ +- ops[i] = gen_reg_rtx (first_imode); +- emit_move_insn (ops[i], gen_lowpart (first_imode, op0)); +- } +- +- /* Interleave low FIRST_IMODE vectors. */ +- for (i = j = 0; i < n; i += 2, j++) +- { +- op0 = gen_reg_rtx (first_imode); +- emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1])); +- +- /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */ +- ops[j] = gen_reg_rtx (second_imode); +- emit_move_insn (ops[j], gen_lowpart (second_imode, op0)); +- } +- +- /* Interleave low SECOND_IMODE vectors. */ +- switch (second_imode) +- { +- case E_V4SImode: +- for (i = j = 0; i < n / 2; i += 2, j++) +- { +- op0 = gen_reg_rtx (second_imode); +- emit_insn (gen_interleave_second_low (op0, ops[i], +- ops[i + 1])); +- +- /* Cast the SECOND_IMODE vector to the THIRD_IMODE +- vector. */ +- ops[j] = gen_reg_rtx (third_imode); +- emit_move_insn (ops[j], gen_lowpart (third_imode, op0)); +- } +- second_imode = V2DImode; +- gen_interleave_second_low = gen_vec_interleave_lowv2di; +- /* FALLTHRU */ +- +- case E_V2DImode: +- op0 = gen_reg_rtx (second_imode); +- emit_insn (gen_interleave_second_low (op0, ops[0], +- ops[1])); +- +- /* Cast the SECOND_IMODE vector back to a vector on original +- mode. */ +- emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0))); +- break; +- +- default: +- gcc_unreachable (); +- } +-} +- +-/* A subroutine of ix86_expand_vector_init. Handle the most general case: +- all values variable, and none identical. 
*/ +- +-static void +-ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode, +- rtx target, rtx vals) +-{ +- rtx ops[64], op0, op1, op2, op3, op4, op5; +- machine_mode half_mode = VOIDmode; +- machine_mode quarter_mode = VOIDmode; +- int n, i; +- +- switch (mode) +- { +- case E_V2SFmode: +- case E_V2SImode: +- if (!mmx_ok && !TARGET_SSE) +- break; +- /* FALLTHRU */ +- +- case E_V16SImode: +- case E_V16SFmode: +- case E_V8DFmode: +- case E_V8DImode: +- case E_V8SFmode: +- case E_V8SImode: +- case E_V4DFmode: +- case E_V4DImode: +- case E_V4SFmode: +- case E_V4SImode: +- case E_V2DFmode: +- case E_V2DImode: +- n = GET_MODE_NUNITS (mode); +- for (i = 0; i < n; i++) +- ops[i] = XVECEXP (vals, 0, i); +- ix86_expand_vector_init_concat (mode, target, ops, n); +- return; +- +- case E_V2TImode: +- for (i = 0; i < 2; i++) +- ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i)); +- op0 = gen_reg_rtx (V4DImode); +- ix86_expand_vector_init_concat (V4DImode, op0, ops, 2); +- emit_move_insn (target, gen_lowpart (GET_MODE (target), op0)); +- return; +- +- case E_V4TImode: +- for (i = 0; i < 4; i++) +- ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i)); +- ops[4] = gen_reg_rtx (V4DImode); +- ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2); +- ops[5] = gen_reg_rtx (V4DImode); +- ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2); +- op0 = gen_reg_rtx (V8DImode); +- ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2); +- emit_move_insn (target, gen_lowpart (GET_MODE (target), op0)); +- return; +- +- case E_V32QImode: +- half_mode = V16QImode; +- goto half; +- +- case E_V16HImode: +- half_mode = V8HImode; +- goto half; +- +-half: +- n = GET_MODE_NUNITS (mode); +- for (i = 0; i < n; i++) +- ops[i] = XVECEXP (vals, 0, i); +- op0 = gen_reg_rtx (half_mode); +- op1 = gen_reg_rtx (half_mode); +- ix86_expand_vector_init_interleave (half_mode, op0, ops, +- n >> 2); +- ix86_expand_vector_init_interleave (half_mode, op1, +- &ops [n >> 1], n >> 2); +- emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1))); +- return; +- +- case E_V64QImode: +- quarter_mode = V16QImode; +- half_mode = V32QImode; +- goto quarter; +- +- case E_V32HImode: +- quarter_mode = V8HImode; +- half_mode = V16HImode; +- goto quarter; +- +-quarter: +- n = GET_MODE_NUNITS (mode); +- for (i = 0; i < n; i++) +- ops[i] = XVECEXP (vals, 0, i); +- op0 = gen_reg_rtx (quarter_mode); +- op1 = gen_reg_rtx (quarter_mode); +- op2 = gen_reg_rtx (quarter_mode); +- op3 = gen_reg_rtx (quarter_mode); +- op4 = gen_reg_rtx (half_mode); +- op5 = gen_reg_rtx (half_mode); +- ix86_expand_vector_init_interleave (quarter_mode, op0, ops, +- n >> 3); +- ix86_expand_vector_init_interleave (quarter_mode, op1, +- &ops [n >> 2], n >> 3); +- ix86_expand_vector_init_interleave (quarter_mode, op2, +- &ops [n >> 1], n >> 3); +- ix86_expand_vector_init_interleave (quarter_mode, op3, +- &ops [(n >> 1) | (n >> 2)], n >> 3); +- emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1))); +- emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3))); +- emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5))); +- return; +- +- case E_V16QImode: +- if (!TARGET_SSE4_1) +- break; +- /* FALLTHRU */ +- +- case E_V8HImode: +- if (!TARGET_SSE2) +- break; +- +- /* Don't use ix86_expand_vector_init_interleave if we can't +- move from GPR to SSE register directly. 
*/ +- if (!TARGET_INTER_UNIT_MOVES_TO_VEC) +- break; +- +- n = GET_MODE_NUNITS (mode); +- for (i = 0; i < n; i++) +- ops[i] = XVECEXP (vals, 0, i); +- ix86_expand_vector_init_interleave (mode, target, ops, n >> 1); +- return; +- +- case E_V4HImode: +- case E_V8QImode: +- break; +- +- default: +- gcc_unreachable (); +- } +- +- { +- int i, j, n_elts, n_words, n_elt_per_word; +- machine_mode inner_mode; +- rtx words[4], shift; +- +- inner_mode = GET_MODE_INNER (mode); +- n_elts = GET_MODE_NUNITS (mode); +- n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD; +- n_elt_per_word = n_elts / n_words; +- shift = GEN_INT (GET_MODE_BITSIZE (inner_mode)); +- +- for (i = 0; i < n_words; ++i) +- { +- rtx word = NULL_RTX; +- +- for (j = 0; j < n_elt_per_word; ++j) +- { +- rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1); +- elt = convert_modes (word_mode, inner_mode, elt, true); +- +- if (j == 0) +- word = elt; +- else +- { +- word = expand_simple_binop (word_mode, ASHIFT, word, shift, +- word, 1, OPTAB_LIB_WIDEN); +- word = expand_simple_binop (word_mode, IOR, word, elt, +- word, 1, OPTAB_LIB_WIDEN); +- } +- } +- +- words[i] = word; +- } +- +- if (n_words == 1) +- emit_move_insn (target, gen_lowpart (mode, words[0])); +- else if (n_words == 2) +- { +- rtx tmp = gen_reg_rtx (mode); +- emit_clobber (tmp); +- emit_move_insn (gen_lowpart (word_mode, tmp), words[0]); +- emit_move_insn (gen_highpart (word_mode, tmp), words[1]); +- emit_move_insn (target, tmp); +- } +- else if (n_words == 4) +- { +- rtx tmp = gen_reg_rtx (V4SImode); +- gcc_assert (word_mode == SImode); +- vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words)); +- ix86_expand_vector_init_general (false, V4SImode, tmp, vals); +- emit_move_insn (target, gen_lowpart (mode, tmp)); +- } +- else +- gcc_unreachable (); +- } +-} +- +-/* Initialize vector TARGET via VALS. Suppress the use of MMX +- instructions unless MMX_OK is true. */ +- +-void +-ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals) +-{ +- machine_mode mode = GET_MODE (target); +- machine_mode inner_mode = GET_MODE_INNER (mode); +- int n_elts = GET_MODE_NUNITS (mode); +- int n_var = 0, one_var = -1; +- bool all_same = true, all_const_zero = true; +- int i; +- rtx x; +- +- /* Handle first initialization from vector elts. */ +- if (n_elts != XVECLEN (vals, 0)) +- { +- rtx subtarget = target; +- x = XVECEXP (vals, 0, 0); +- gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode); +- if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts) +- { +- rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) }; +- if (inner_mode == QImode || inner_mode == HImode) +- { +- unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode); +- mode = mode_for_vector (SImode, n_bits / 4).require (); +- inner_mode = mode_for_vector (SImode, n_bits / 8).require (); +- ops[0] = gen_lowpart (inner_mode, ops[0]); +- ops[1] = gen_lowpart (inner_mode, ops[1]); +- subtarget = gen_reg_rtx (mode); +- } +- ix86_expand_vector_init_concat (mode, subtarget, ops, 2); +- if (subtarget != target) +- emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget)); +- return; +- } +- gcc_unreachable (); +- } +- +- for (i = 0; i < n_elts; ++i) +- { +- x = XVECEXP (vals, 0, i); +- if (!(CONST_SCALAR_INT_P (x) +- || CONST_DOUBLE_P (x) +- || CONST_FIXED_P (x))) +- n_var++, one_var = i; +- else if (x != CONST0_RTX (inner_mode)) +- all_const_zero = false; +- if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0))) +- all_same = false; +- } +- +- /* Constants are best loaded from the constant pool. 
*/ +- if (n_var == 0) +- { +- emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0))); +- return; +- } +- +- /* If all values are identical, broadcast the value. */ +- if (all_same +- && ix86_expand_vector_init_duplicate (mmx_ok, mode, target, +- XVECEXP (vals, 0, 0))) +- return; +- +- /* Values where only one field is non-constant are best loaded from +- the pool and overwritten via move later. */ +- if (n_var == 1) +- { +- if (all_const_zero +- && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target, +- XVECEXP (vals, 0, one_var), +- one_var)) +- return; +- +- if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var)) +- return; +- } +- +- ix86_expand_vector_init_general (mmx_ok, mode, target, vals); +-} +- +-void +-ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt) +-{ +- machine_mode mode = GET_MODE (target); +- machine_mode inner_mode = GET_MODE_INNER (mode); +- machine_mode half_mode; +- bool use_vec_merge = false; +- rtx tmp; +- static rtx (*gen_extract[6][2]) (rtx, rtx) +- = { +- { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi }, +- { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi }, +- { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si }, +- { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di }, +- { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf }, +- { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df } +- }; +- static rtx (*gen_insert[6][2]) (rtx, rtx, rtx) +- = { +- { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi }, +- { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi }, +- { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si }, +- { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di }, +- { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf }, +- { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df } +- }; +- int i, j, n; +- machine_mode mmode = VOIDmode; +- rtx (*gen_blendm) (rtx, rtx, rtx, rtx); +- +- switch (mode) +- { +- case E_V2SFmode: +- case E_V2SImode: +- if (mmx_ok) +- { +- tmp = gen_reg_rtx (GET_MODE_INNER (mode)); +- ix86_expand_vector_extract (true, tmp, target, 1 - elt); +- if (elt == 0) +- tmp = gen_rtx_VEC_CONCAT (mode, val, tmp); +- else +- tmp = gen_rtx_VEC_CONCAT (mode, tmp, val); +- emit_insn (gen_rtx_SET (target, tmp)); +- return; +- } +- break; +- +- case E_V2DImode: +- use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT; +- if (use_vec_merge) +- break; +- +- tmp = gen_reg_rtx (GET_MODE_INNER (mode)); +- ix86_expand_vector_extract (false, tmp, target, 1 - elt); +- if (elt == 0) +- tmp = gen_rtx_VEC_CONCAT (mode, val, tmp); +- else +- tmp = gen_rtx_VEC_CONCAT (mode, tmp, val); +- emit_insn (gen_rtx_SET (target, tmp)); +- return; +- +- case E_V2DFmode: +- { +- rtx op0, op1; +- +- /* For the two element vectors, we implement a VEC_CONCAT with +- the extraction of the other element. 
*/ +- +- tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt))); +- tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp); +- +- if (elt == 0) +- op0 = val, op1 = tmp; +- else +- op0 = tmp, op1 = val; +- +- tmp = gen_rtx_VEC_CONCAT (mode, op0, op1); +- emit_insn (gen_rtx_SET (target, tmp)); +- } +- return; +- +- case E_V4SFmode: +- use_vec_merge = TARGET_SSE4_1; +- if (use_vec_merge) +- break; +- +- switch (elt) +- { +- case 0: +- use_vec_merge = true; +- break; +- +- case 1: +- /* tmp = target = A B C D */ +- tmp = copy_to_reg (target); +- /* target = A A B B */ +- emit_insn (gen_vec_interleave_lowv4sf (target, target, target)); +- /* target = X A B B */ +- ix86_expand_vector_set (false, target, val, 0); +- /* target = A X C D */ +- emit_insn (gen_sse_shufps_v4sf (target, target, tmp, +- const1_rtx, const0_rtx, +- GEN_INT (2+4), GEN_INT (3+4))); +- return; +- +- case 2: +- /* tmp = target = A B C D */ +- tmp = copy_to_reg (target); +- /* tmp = X B C D */ +- ix86_expand_vector_set (false, tmp, val, 0); +- /* target = A B X D */ +- emit_insn (gen_sse_shufps_v4sf (target, target, tmp, +- const0_rtx, const1_rtx, +- GEN_INT (0+4), GEN_INT (3+4))); +- return; +- +- case 3: +- /* tmp = target = A B C D */ +- tmp = copy_to_reg (target); +- /* tmp = X B C D */ +- ix86_expand_vector_set (false, tmp, val, 0); +- /* target = A B X D */ +- emit_insn (gen_sse_shufps_v4sf (target, target, tmp, +- const0_rtx, const1_rtx, +- GEN_INT (2+4), GEN_INT (0+4))); +- return; +- +- default: +- gcc_unreachable (); +- } +- break; +- +- case E_V4SImode: +- use_vec_merge = TARGET_SSE4_1; +- if (use_vec_merge) +- break; +- +- /* Element 0 handled by vec_merge below. */ +- if (elt == 0) +- { +- use_vec_merge = true; +- break; +- } +- +- if (TARGET_SSE2) +- { +- /* With SSE2, use integer shuffles to swap element 0 and ELT, +- store into element 0, then shuffle them back. */ +- +- rtx order[4]; +- +- order[0] = GEN_INT (elt); +- order[1] = const1_rtx; +- order[2] = const2_rtx; +- order[3] = GEN_INT (3); +- order[elt] = const0_rtx; +- +- emit_insn (gen_sse2_pshufd_1 (target, target, order[0], +- order[1], order[2], order[3])); +- +- ix86_expand_vector_set (false, target, val, 0); +- +- emit_insn (gen_sse2_pshufd_1 (target, target, order[0], +- order[1], order[2], order[3])); +- } +- else +- { +- /* For SSE1, we have to reuse the V4SF code. */ +- rtx t = gen_reg_rtx (V4SFmode); +- emit_move_insn (t, gen_lowpart (V4SFmode, target)); +- ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt); +- emit_move_insn (target, gen_lowpart (mode, t)); +- } +- return; +- +- case E_V8HImode: +- use_vec_merge = TARGET_SSE2; +- break; +- case E_V4HImode: +- use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A); +- break; +- +- case E_V16QImode: +- use_vec_merge = TARGET_SSE4_1; +- break; +- +- case E_V8QImode: +- break; +- +- case E_V32QImode: +- half_mode = V16QImode; +- j = 0; +- n = 16; +- goto half; +- +- case E_V16HImode: +- half_mode = V8HImode; +- j = 1; +- n = 8; +- goto half; +- +- case E_V8SImode: +- half_mode = V4SImode; +- j = 2; +- n = 4; +- goto half; +- +- case E_V4DImode: +- half_mode = V2DImode; +- j = 3; +- n = 2; +- goto half; +- +- case E_V8SFmode: +- half_mode = V4SFmode; +- j = 4; +- n = 4; +- goto half; +- +- case E_V4DFmode: +- half_mode = V2DFmode; +- j = 5; +- n = 2; +- goto half; +- +-half: +- /* Compute offset. */ +- i = elt / n; +- elt %= n; +- +- gcc_assert (i <= 1); +- +- /* Extract the half. 
*/ +- tmp = gen_reg_rtx (half_mode); +- emit_insn (gen_extract[j][i] (tmp, target)); +- +- /* Put val in tmp at elt. */ +- ix86_expand_vector_set (false, tmp, val, elt); +- +- /* Put it back. */ +- emit_insn (gen_insert[j][i] (target, target, tmp)); +- return; +- +- case E_V8DFmode: +- if (TARGET_AVX512F) +- { +- mmode = QImode; +- gen_blendm = gen_avx512f_blendmv8df; +- } +- break; +- +- case E_V8DImode: +- if (TARGET_AVX512F) +- { +- mmode = QImode; +- gen_blendm = gen_avx512f_blendmv8di; +- } +- break; +- +- case E_V16SFmode: +- if (TARGET_AVX512F) +- { +- mmode = HImode; +- gen_blendm = gen_avx512f_blendmv16sf; +- } +- break; +- +- case E_V16SImode: +- if (TARGET_AVX512F) +- { +- mmode = HImode; +- gen_blendm = gen_avx512f_blendmv16si; +- } +- break; +- +- case E_V32HImode: +- if (TARGET_AVX512BW) +- { +- mmode = SImode; +- gen_blendm = gen_avx512bw_blendmv32hi; +- } +- else if (TARGET_AVX512F) +- { +- half_mode = E_V8HImode; +- n = 8; +- goto quarter; +- } +- break; +- +- case E_V64QImode: +- if (TARGET_AVX512BW) +- { +- mmode = DImode; +- gen_blendm = gen_avx512bw_blendmv64qi; +- } +- else if (TARGET_AVX512F) +- { +- half_mode = E_V16QImode; +- n = 16; +- goto quarter; +- } +- break; +- +-quarter: +- /* Compute offset. */ +- i = elt / n; +- elt %= n; +- +- gcc_assert (i <= 3); +- +- { +- /* Extract the quarter. */ +- tmp = gen_reg_rtx (V4SImode); +- rtx tmp2 = gen_lowpart (V16SImode, target); +- rtx mask = gen_reg_rtx (QImode); +- +- emit_move_insn (mask, constm1_rtx); +- emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i), +- tmp, mask)); +- +- tmp2 = gen_reg_rtx (half_mode); +- emit_move_insn (tmp2, gen_lowpart (half_mode, tmp)); +- tmp = tmp2; +- +- /* Put val in tmp at elt. */ +- ix86_expand_vector_set (false, tmp, val, elt); +- +- /* Put it back. */ +- tmp2 = gen_reg_rtx (V16SImode); +- rtx tmp3 = gen_lowpart (V16SImode, target); +- mask = gen_reg_rtx (HImode); +- emit_move_insn (mask, constm1_rtx); +- tmp = gen_lowpart (V4SImode, tmp); +- emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i), +- tmp3, mask)); +- emit_move_insn (target, gen_lowpart (mode, tmp2)); +- } +- return; +- +- default: +- break; +- } +- +- if (mmode != VOIDmode) +- { +- tmp = gen_reg_rtx (mode); +- emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val))); +- /* The avx512*_blendm expanders have different operand order +- from VEC_MERGE. In VEC_MERGE, the first input operand is used for +- elements where the mask is set and second input operand otherwise, +- in {sse,avx}*_*blend* the first input operand is used for elements +- where the mask is clear and second input operand otherwise. 
*/ +- emit_insn (gen_blendm (target, target, tmp, +- force_reg (mmode, +- gen_int_mode (HOST_WIDE_INT_1U << elt, +- mmode)))); +- } +- else if (use_vec_merge) +- { +- tmp = gen_rtx_VEC_DUPLICATE (mode, val); +- tmp = gen_rtx_VEC_MERGE (mode, tmp, target, +- GEN_INT (HOST_WIDE_INT_1U << elt)); +- emit_insn (gen_rtx_SET (target, tmp)); +- } +- else +- { +- rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode)); +- +- emit_move_insn (mem, target); +- +- tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode)); +- emit_move_insn (tmp, val); +- +- emit_move_insn (target, mem); +- } +-} +- +-void +-ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt) +-{ +- machine_mode mode = GET_MODE (vec); +- machine_mode inner_mode = GET_MODE_INNER (mode); +- bool use_vec_extr = false; +- rtx tmp; +- +- switch (mode) +- { +- case E_V2SImode: +- case E_V2SFmode: +- if (!mmx_ok) +- break; +- /* FALLTHRU */ +- +- case E_V2DFmode: +- case E_V2DImode: +- case E_V2TImode: +- case E_V4TImode: +- use_vec_extr = true; +- break; +- +- case E_V4SFmode: +- use_vec_extr = TARGET_SSE4_1; +- if (use_vec_extr) +- break; +- +- switch (elt) +- { +- case 0: +- tmp = vec; +- break; +- +- case 1: +- case 3: +- tmp = gen_reg_rtx (mode); +- emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec, +- GEN_INT (elt), GEN_INT (elt), +- GEN_INT (elt+4), GEN_INT (elt+4))); +- break; +- +- case 2: +- tmp = gen_reg_rtx (mode); +- emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec)); +- break; +- +- default: +- gcc_unreachable (); +- } +- vec = tmp; +- use_vec_extr = true; +- elt = 0; +- break; +- +- case E_V4SImode: +- use_vec_extr = TARGET_SSE4_1; +- if (use_vec_extr) +- break; +- +- if (TARGET_SSE2) +- { +- switch (elt) +- { +- case 0: +- tmp = vec; +- break; +- +- case 1: +- case 3: +- tmp = gen_reg_rtx (mode); +- emit_insn (gen_sse2_pshufd_1 (tmp, vec, +- GEN_INT (elt), GEN_INT (elt), +- GEN_INT (elt), GEN_INT (elt))); +- break; +- +- case 2: +- tmp = gen_reg_rtx (mode); +- emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec)); +- break; +- +- default: +- gcc_unreachable (); +- } +- vec = tmp; +- use_vec_extr = true; +- elt = 0; +- } +- else +- { +- /* For SSE1, we have to reuse the V4SF code. 
*/ +- ix86_expand_vector_extract (false, gen_lowpart (SFmode, target), +- gen_lowpart (V4SFmode, vec), elt); +- return; +- } +- break; +- +- case E_V8HImode: +- use_vec_extr = TARGET_SSE2; +- break; +- case E_V4HImode: +- use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A); +- break; +- +- case E_V16QImode: +- use_vec_extr = TARGET_SSE4_1; +- break; +- +- case E_V8SFmode: +- if (TARGET_AVX) +- { +- tmp = gen_reg_rtx (V4SFmode); +- if (elt < 4) +- emit_insn (gen_vec_extract_lo_v8sf (tmp, vec)); +- else +- emit_insn (gen_vec_extract_hi_v8sf (tmp, vec)); +- ix86_expand_vector_extract (false, target, tmp, elt & 3); +- return; +- } +- break; +- +- case E_V4DFmode: +- if (TARGET_AVX) +- { +- tmp = gen_reg_rtx (V2DFmode); +- if (elt < 2) +- emit_insn (gen_vec_extract_lo_v4df (tmp, vec)); +- else +- emit_insn (gen_vec_extract_hi_v4df (tmp, vec)); +- ix86_expand_vector_extract (false, target, tmp, elt & 1); +- return; +- } +- break; +- +- case E_V32QImode: +- if (TARGET_AVX) +- { +- tmp = gen_reg_rtx (V16QImode); +- if (elt < 16) +- emit_insn (gen_vec_extract_lo_v32qi (tmp, vec)); +- else +- emit_insn (gen_vec_extract_hi_v32qi (tmp, vec)); +- ix86_expand_vector_extract (false, target, tmp, elt & 15); +- return; +- } +- break; +- +- case E_V16HImode: +- if (TARGET_AVX) +- { +- tmp = gen_reg_rtx (V8HImode); +- if (elt < 8) +- emit_insn (gen_vec_extract_lo_v16hi (tmp, vec)); +- else +- emit_insn (gen_vec_extract_hi_v16hi (tmp, vec)); +- ix86_expand_vector_extract (false, target, tmp, elt & 7); +- return; +- } +- break; +- +- case E_V8SImode: +- if (TARGET_AVX) +- { +- tmp = gen_reg_rtx (V4SImode); +- if (elt < 4) +- emit_insn (gen_vec_extract_lo_v8si (tmp, vec)); +- else +- emit_insn (gen_vec_extract_hi_v8si (tmp, vec)); +- ix86_expand_vector_extract (false, target, tmp, elt & 3); +- return; +- } +- break; +- +- case E_V4DImode: +- if (TARGET_AVX) +- { +- tmp = gen_reg_rtx (V2DImode); +- if (elt < 2) +- emit_insn (gen_vec_extract_lo_v4di (tmp, vec)); +- else +- emit_insn (gen_vec_extract_hi_v4di (tmp, vec)); +- ix86_expand_vector_extract (false, target, tmp, elt & 1); +- return; +- } +- break; +- +- case E_V32HImode: +- if (TARGET_AVX512BW) +- { +- tmp = gen_reg_rtx (V16HImode); +- if (elt < 16) +- emit_insn (gen_vec_extract_lo_v32hi (tmp, vec)); +- else +- emit_insn (gen_vec_extract_hi_v32hi (tmp, vec)); +- ix86_expand_vector_extract (false, target, tmp, elt & 15); +- return; +- } +- break; +- +- case E_V64QImode: +- if (TARGET_AVX512BW) +- { +- tmp = gen_reg_rtx (V32QImode); +- if (elt < 32) +- emit_insn (gen_vec_extract_lo_v64qi (tmp, vec)); +- else +- emit_insn (gen_vec_extract_hi_v64qi (tmp, vec)); +- ix86_expand_vector_extract (false, target, tmp, elt & 31); +- return; +- } +- break; +- +- case E_V16SFmode: +- tmp = gen_reg_rtx (V8SFmode); +- if (elt < 8) +- emit_insn (gen_vec_extract_lo_v16sf (tmp, vec)); +- else +- emit_insn (gen_vec_extract_hi_v16sf (tmp, vec)); +- ix86_expand_vector_extract (false, target, tmp, elt & 7); +- return; +- +- case E_V8DFmode: +- tmp = gen_reg_rtx (V4DFmode); +- if (elt < 4) +- emit_insn (gen_vec_extract_lo_v8df (tmp, vec)); +- else +- emit_insn (gen_vec_extract_hi_v8df (tmp, vec)); +- ix86_expand_vector_extract (false, target, tmp, elt & 3); +- return; +- +- case E_V16SImode: +- tmp = gen_reg_rtx (V8SImode); +- if (elt < 8) +- emit_insn (gen_vec_extract_lo_v16si (tmp, vec)); +- else +- emit_insn (gen_vec_extract_hi_v16si (tmp, vec)); +- ix86_expand_vector_extract (false, target, tmp, elt & 7); +- return; +- +- case E_V8DImode: +- tmp = gen_reg_rtx (V4DImode); 
+- if (elt < 4) +- emit_insn (gen_vec_extract_lo_v8di (tmp, vec)); +- else +- emit_insn (gen_vec_extract_hi_v8di (tmp, vec)); +- ix86_expand_vector_extract (false, target, tmp, elt & 3); +- return; +- +- case E_V8QImode: +- /* ??? Could extract the appropriate HImode element and shift. */ +- default: +- break; +- } +- +- if (use_vec_extr) +- { +- tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt))); +- tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp); +- +- /* Let the rtl optimizers know about the zero extension performed. */ +- if (inner_mode == QImode || inner_mode == HImode) +- { +- tmp = gen_rtx_ZERO_EXTEND (SImode, tmp); +- target = gen_lowpart (SImode, target); +- } +- +- emit_insn (gen_rtx_SET (target, tmp)); +- } +- else +- { +- rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode)); +- +- emit_move_insn (mem, vec); +- +- tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode)); +- emit_move_insn (target, tmp); +- } +-} +- +-/* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC +- to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode. +- The upper bits of DEST are undefined, though they shouldn't cause +- exceptions (some bits from src or all zeros are ok). */ +- +-static void +-emit_reduc_half (rtx dest, rtx src, int i) +-{ +- rtx tem, d = dest; +- switch (GET_MODE (src)) +- { +- case E_V4SFmode: +- if (i == 128) +- tem = gen_sse_movhlps (dest, src, src); +- else +- tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx, +- GEN_INT (1 + 4), GEN_INT (1 + 4)); +- break; +- case E_V2DFmode: +- tem = gen_vec_interleave_highv2df (dest, src, src); +- break; +- case E_V16QImode: +- case E_V8HImode: +- case E_V4SImode: +- case E_V2DImode: +- d = gen_reg_rtx (V1TImode); +- tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src), +- GEN_INT (i / 2)); +- break; +- case E_V8SFmode: +- if (i == 256) +- tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx); +- else +- tem = gen_avx_shufps256 (dest, src, src, +- GEN_INT (i == 128 ? 2 + (3 << 2) : 1)); +- break; +- case E_V4DFmode: +- if (i == 256) +- tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx); +- else +- tem = gen_avx_shufpd256 (dest, src, src, const1_rtx); +- break; +- case E_V32QImode: +- case E_V16HImode: +- case E_V8SImode: +- case E_V4DImode: +- if (i == 256) +- { +- if (GET_MODE (dest) != V4DImode) +- d = gen_reg_rtx (V4DImode); +- tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src), +- gen_lowpart (V4DImode, src), +- const1_rtx); +- } +- else +- { +- d = gen_reg_rtx (V2TImode); +- tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src), +- GEN_INT (i / 2)); +- } +- break; +- case E_V64QImode: +- case E_V32HImode: +- case E_V16SImode: +- case E_V16SFmode: +- case E_V8DImode: +- case E_V8DFmode: +- if (i > 128) +- tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest), +- gen_lowpart (V16SImode, src), +- gen_lowpart (V16SImode, src), +- GEN_INT (0x4 + (i == 512 ? 4 : 0)), +- GEN_INT (0x5 + (i == 512 ? 4 : 0)), +- GEN_INT (0x6 + (i == 512 ? 4 : 0)), +- GEN_INT (0x7 + (i == 512 ? 4 : 0)), +- GEN_INT (0xC), GEN_INT (0xD), +- GEN_INT (0xE), GEN_INT (0xF), +- GEN_INT (0x10), GEN_INT (0x11), +- GEN_INT (0x12), GEN_INT (0x13), +- GEN_INT (0x14), GEN_INT (0x15), +- GEN_INT (0x16), GEN_INT (0x17)); +- else +- tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest), +- gen_lowpart (V16SImode, src), +- GEN_INT (i == 128 ? 0x2 : 0x1), +- GEN_INT (0x3), +- GEN_INT (0x3), +- GEN_INT (0x3), +- GEN_INT (i == 128 ? 
0x6 : 0x5), +- GEN_INT (0x7), +- GEN_INT (0x7), +- GEN_INT (0x7), +- GEN_INT (i == 128 ? 0xA : 0x9), +- GEN_INT (0xB), +- GEN_INT (0xB), +- GEN_INT (0xB), +- GEN_INT (i == 128 ? 0xE : 0xD), +- GEN_INT (0xF), +- GEN_INT (0xF), +- GEN_INT (0xF)); +- break; +- default: +- gcc_unreachable (); +- } +- emit_insn (tem); +- if (d != dest) +- emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d)); +-} +- +-/* Expand a vector reduction. FN is the binary pattern to reduce; +- DEST is the destination; IN is the input vector. */ +- +-void +-ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in) +-{ +- rtx half, dst, vec = in; +- machine_mode mode = GET_MODE (in); +- int i; +- +- /* SSE4 has a special instruction for V8HImode UMIN reduction. */ +- if (TARGET_SSE4_1 +- && mode == V8HImode +- && fn == gen_uminv8hi3) +- { +- emit_insn (gen_sse4_1_phminposuw (dest, in)); +- return; +- } +- +- for (i = GET_MODE_BITSIZE (mode); +- i > GET_MODE_UNIT_BITSIZE (mode); +- i >>= 1) +- { +- half = gen_reg_rtx (mode); +- emit_reduc_half (half, vec, i); +- if (i == GET_MODE_UNIT_BITSIZE (mode) * 2) +- dst = dest; +- else +- dst = gen_reg_rtx (mode); +- emit_insn (fn (dst, half, vec)); +- vec = dst; +- } +-} +- +-/* Target hook for scalar_mode_supported_p. */ +-static bool +-ix86_scalar_mode_supported_p (scalar_mode mode) +-{ +- if (DECIMAL_FLOAT_MODE_P (mode)) +- return default_decimal_float_supported_p (); +- else if (mode == TFmode) +- return true; +- else +- return default_scalar_mode_supported_p (mode); +-} +- +-/* Implements target hook vector_mode_supported_p. */ +-static bool +-ix86_vector_mode_supported_p (machine_mode mode) +-{ +- if (TARGET_SSE && VALID_SSE_REG_MODE (mode)) +- return true; +- if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode)) +- return true; +- if (TARGET_AVX && VALID_AVX256_REG_MODE (mode)) +- return true; +- if (TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode)) +- return true; +- if (TARGET_MMX && VALID_MMX_REG_MODE (mode)) +- return true; +- if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode)) +- return true; +- return false; +-} +- +-/* Target hook for c_mode_for_suffix. */ +-static machine_mode +-ix86_c_mode_for_suffix (char suffix) +-{ +- if (suffix == 'q') +- return TFmode; +- if (suffix == 'w') +- return XFmode; +- +- return VOIDmode; +-} +- +-/* Worker function for TARGET_MD_ASM_ADJUST. +- +- We implement asm flag outputs, and maintain source compatibility +- with the old cc0-based compiler. 
*/ +- +-static rtx_insn * +-ix86_md_asm_adjust (vec &outputs, vec &/*inputs*/, +- vec &constraints, +- vec &clobbers, HARD_REG_SET &clobbered_regs) +-{ +- bool saw_asm_flag = false; +- +- start_sequence (); +- for (unsigned i = 0, n = outputs.length (); i < n; ++i) +- { +- const char *con = constraints[i]; +- if (strncmp (con, "=@cc", 4) != 0) +- continue; +- con += 4; +- if (strchr (con, ',') != NULL) +- { +- error ("alternatives not allowed in asm flag output"); +- continue; +- } +- +- bool invert = false; +- if (con[0] == 'n') +- invert = true, con++; +- +- machine_mode mode = CCmode; +- rtx_code code = UNKNOWN; +- +- switch (con[0]) +- { +- case 'a': +- if (con[1] == 0) +- mode = CCAmode, code = EQ; +- else if (con[1] == 'e' && con[2] == 0) +- mode = CCCmode, code = NE; +- break; +- case 'b': +- if (con[1] == 0) +- mode = CCCmode, code = EQ; +- else if (con[1] == 'e' && con[2] == 0) +- mode = CCAmode, code = NE; +- break; +- case 'c': +- if (con[1] == 0) +- mode = CCCmode, code = EQ; +- break; +- case 'e': +- if (con[1] == 0) +- mode = CCZmode, code = EQ; +- break; +- case 'g': +- if (con[1] == 0) +- mode = CCGCmode, code = GT; +- else if (con[1] == 'e' && con[2] == 0) +- mode = CCGCmode, code = GE; +- break; +- case 'l': +- if (con[1] == 0) +- mode = CCGCmode, code = LT; +- else if (con[1] == 'e' && con[2] == 0) +- mode = CCGCmode, code = LE; +- break; +- case 'o': +- if (con[1] == 0) +- mode = CCOmode, code = EQ; +- break; +- case 'p': +- if (con[1] == 0) +- mode = CCPmode, code = EQ; +- break; +- case 's': +- if (con[1] == 0) +- mode = CCSmode, code = EQ; +- break; +- case 'z': +- if (con[1] == 0) +- mode = CCZmode, code = EQ; +- break; +- } +- if (code == UNKNOWN) +- { +- error ("unknown asm flag output %qs", constraints[i]); +- continue; +- } +- if (invert) +- code = reverse_condition (code); +- +- rtx dest = outputs[i]; +- if (!saw_asm_flag) +- { +- /* This is the first asm flag output. Here we put the flags +- register in as the real output and adjust the condition to +- allow it. */ +- constraints[i] = "=Bf"; +- outputs[i] = gen_rtx_REG (CCmode, FLAGS_REG); +- saw_asm_flag = true; +- } +- else +- { +- /* We don't need the flags register as output twice. */ +- constraints[i] = "=X"; +- outputs[i] = gen_rtx_SCRATCH (SImode); +- } +- +- rtx x = gen_rtx_REG (mode, FLAGS_REG); +- x = gen_rtx_fmt_ee (code, QImode, x, const0_rtx); +- +- machine_mode dest_mode = GET_MODE (dest); +- if (!SCALAR_INT_MODE_P (dest_mode)) +- { +- error ("invalid type for asm flag output"); +- continue; +- } +- +- if (dest_mode == DImode && !TARGET_64BIT) +- dest_mode = SImode; +- +- if (dest_mode != QImode) +- { +- rtx destqi = gen_reg_rtx (QImode); +- emit_insn (gen_rtx_SET (destqi, x)); +- +- if (TARGET_ZERO_EXTEND_WITH_AND +- && optimize_function_for_speed_p (cfun)) +- { +- x = force_reg (dest_mode, const0_rtx); +- +- emit_insn (gen_movstrictqi (gen_lowpart (QImode, x), destqi)); +- } +- else +- { +- x = gen_rtx_ZERO_EXTEND (dest_mode, destqi); +- if (dest_mode == GET_MODE (dest) +- && !register_operand (dest, GET_MODE (dest))) +- x = force_reg (dest_mode, x); +- } +- } +- +- if (dest_mode != GET_MODE (dest)) +- { +- rtx tmp = gen_reg_rtx (SImode); +- +- emit_insn (gen_rtx_SET (tmp, x)); +- emit_insn (gen_zero_extendsidi2 (dest, tmp)); +- } +- else +- emit_insn (gen_rtx_SET (dest, x)); +- } +- rtx_insn *seq = get_insns (); +- end_sequence (); +- +- if (saw_asm_flag) +- return seq; +- else +- { +- /* If we had no asm flag outputs, clobber the flags. 
*/ +- clobbers.safe_push (gen_rtx_REG (CCmode, FLAGS_REG)); +- SET_HARD_REG_BIT (clobbered_regs, FLAGS_REG); +- return NULL; +- } +-} +- +-/* Implements target vector targetm.asm.encode_section_info. */ +- +-static void ATTRIBUTE_UNUSED +-ix86_encode_section_info (tree decl, rtx rtl, int first) +-{ +- default_encode_section_info (decl, rtl, first); +- +- if (ix86_in_large_data_p (decl)) +- SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR; +-} +- +-/* Worker function for REVERSE_CONDITION. */ +- +-enum rtx_code +-ix86_reverse_condition (enum rtx_code code, machine_mode mode) +-{ +- return (mode == CCFPmode +- ? reverse_condition_maybe_unordered (code) +- : reverse_condition (code)); +-} +- +-/* Output code to perform an x87 FP register move, from OPERANDS[1] +- to OPERANDS[0]. */ +- +-const char * +-output_387_reg_move (rtx_insn *insn, rtx *operands) +-{ +- if (REG_P (operands[0])) +- { +- if (REG_P (operands[1]) +- && find_regno_note (insn, REG_DEAD, REGNO (operands[1]))) +- { +- if (REGNO (operands[0]) == FIRST_STACK_REG) +- return output_387_ffreep (operands, 0); +- return "fstp\t%y0"; +- } +- if (STACK_TOP_P (operands[0])) +- return "fld%Z1\t%y1"; +- return "fst\t%y0"; +- } +- else if (MEM_P (operands[0])) +- { +- gcc_assert (REG_P (operands[1])); +- if (find_regno_note (insn, REG_DEAD, REGNO (operands[1]))) +- return "fstp%Z0\t%y0"; +- else +- { +- /* There is no non-popping store to memory for XFmode. +- So if we need one, follow the store with a load. */ +- if (GET_MODE (operands[0]) == XFmode) +- return "fstp%Z0\t%y0\n\tfld%Z0\t%y0"; +- else +- return "fst%Z0\t%y0"; +- } +- } +- else +- gcc_unreachable(); +-} +- +-/* Output code to perform a conditional jump to LABEL, if C2 flag in +- FP status register is set. */ +- +-void +-ix86_emit_fp_unordered_jump (rtx label) +-{ +- rtx reg = gen_reg_rtx (HImode); +- rtx_insn *insn; +- rtx temp; +- +- emit_insn (gen_x86_fnstsw_1 (reg)); +- +- if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ())) +- { +- emit_insn (gen_x86_sahf_1 (reg)); +- +- temp = gen_rtx_REG (CCmode, FLAGS_REG); +- temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx); +- } +- else +- { +- emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04))); +- +- temp = gen_rtx_REG (CCNOmode, FLAGS_REG); +- temp = gen_rtx_NE (VOIDmode, temp, const0_rtx); +- } +- +- temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp, +- gen_rtx_LABEL_REF (VOIDmode, label), +- pc_rtx); +- insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp)); +- predict_jump (REG_BR_PROB_BASE * 10 / 100); +- JUMP_LABEL (insn) = label; +-} +- +-/* Output code to perform an sinh XFmode calculation. 
*/ +- +-void ix86_emit_i387_sinh (rtx op0, rtx op1) +-{ +- rtx e1 = gen_reg_rtx (XFmode); +- rtx e2 = gen_reg_rtx (XFmode); +- rtx scratch = gen_reg_rtx (HImode); +- rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG); +- rtx half = const_double_from_real_value (dconsthalf, XFmode); +- rtx cst1, tmp; +- rtx_code_label *jump_label = gen_label_rtx (); +- rtx_insn *insn; +- +- /* scratch = fxam (op1) */ +- emit_insn (gen_fxamxf2_i387 (scratch, op1)); +- +- /* e1 = expm1 (|op1|) */ +- emit_insn (gen_absxf2 (e2, op1)); +- emit_insn (gen_expm1xf2 (e1, e2)); +- +- /* e2 = e1 / (e1 + 1.0) + e1 */ +- cst1 = force_reg (XFmode, CONST1_RTX (XFmode)); +- emit_insn (gen_addxf3 (e2, e1, cst1)); +- emit_insn (gen_divxf3 (e2, e1, e2)); +- emit_insn (gen_addxf3 (e2, e2, e1)); +- +- /* flags = signbit (op1) */ +- emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02))); +- +- /* if (flags) then e2 = -e2 */ +- tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, +- gen_rtx_EQ (VOIDmode, flags, const0_rtx), +- gen_rtx_LABEL_REF (VOIDmode, jump_label), +- pc_rtx); +- insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); +- predict_jump (REG_BR_PROB_BASE * 50 / 100); +- JUMP_LABEL (insn) = jump_label; +- +- emit_insn (gen_negxf2 (e2, e2)); +- +- emit_label (jump_label); +- LABEL_NUSES (jump_label) = 1; +- +- /* op0 = 0.5 * e2 */ +- half = force_reg (XFmode, half); +- emit_insn (gen_mulxf3 (op0, e2, half)); +-} +- +-/* Output code to perform an cosh XFmode calculation. */ +- +-void ix86_emit_i387_cosh (rtx op0, rtx op1) +-{ +- rtx e1 = gen_reg_rtx (XFmode); +- rtx e2 = gen_reg_rtx (XFmode); +- rtx half = const_double_from_real_value (dconsthalf, XFmode); +- rtx cst1; +- +- /* e1 = exp (op1) */ +- emit_insn (gen_expxf2 (e1, op1)); +- +- /* e2 = e1 + 1.0 / e1 */ +- cst1 = force_reg (XFmode, CONST1_RTX (XFmode)); +- emit_insn (gen_divxf3 (e2, cst1, e1)); +- emit_insn (gen_addxf3 (e2, e1, e2)); +- +- /* op0 = 0.5 * e2 */ +- half = force_reg (XFmode, half); +- emit_insn (gen_mulxf3 (op0, e2, half)); +-} +- +-/* Output code to perform an tanh XFmode calculation. */ +- +-void ix86_emit_i387_tanh (rtx op0, rtx op1) +-{ +- rtx e1 = gen_reg_rtx (XFmode); +- rtx e2 = gen_reg_rtx (XFmode); +- rtx scratch = gen_reg_rtx (HImode); +- rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG); +- rtx cst2, tmp; +- rtx_code_label *jump_label = gen_label_rtx (); +- rtx_insn *insn; +- +- /* scratch = fxam (op1) */ +- emit_insn (gen_fxamxf2_i387 (scratch, op1)); +- +- /* e1 = expm1 (-|2 * op1|) */ +- emit_insn (gen_addxf3 (e2, op1, op1)); +- emit_insn (gen_absxf2 (e2, e2)); +- emit_insn (gen_negxf2 (e2, e2)); +- emit_insn (gen_expm1xf2 (e1, e2)); +- +- /* e2 = e1 / (e1 + 2.0) */ +- cst2 = force_reg (XFmode, CONST2_RTX (XFmode)); +- emit_insn (gen_addxf3 (e2, e1, cst2)); +- emit_insn (gen_divxf3 (e2, e1, e2)); +- +- /* flags = signbit (op1) */ +- emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02))); +- +- /* if (!flags) then e2 = -e2 */ +- tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, +- gen_rtx_NE (VOIDmode, flags, const0_rtx), +- gen_rtx_LABEL_REF (VOIDmode, jump_label), +- pc_rtx); +- insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); +- predict_jump (REG_BR_PROB_BASE * 50 / 100); +- JUMP_LABEL (insn) = jump_label; +- +- emit_insn (gen_negxf2 (e2, e2)); +- +- emit_label (jump_label); +- LABEL_NUSES (jump_label) = 1; +- +- emit_move_insn (op0, e2); +-} +- +-/* Output code to perform an asinh XFmode calculation. 
*/ +- +-void ix86_emit_i387_asinh (rtx op0, rtx op1) +-{ +- rtx e1 = gen_reg_rtx (XFmode); +- rtx e2 = gen_reg_rtx (XFmode); +- rtx scratch = gen_reg_rtx (HImode); +- rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG); +- rtx cst1, tmp; +- rtx_code_label *jump_label = gen_label_rtx (); +- rtx_insn *insn; +- +- /* e2 = sqrt (op1^2 + 1.0) + 1.0 */ +- emit_insn (gen_mulxf3 (e1, op1, op1)); +- cst1 = force_reg (XFmode, CONST1_RTX (XFmode)); +- emit_insn (gen_addxf3 (e2, e1, cst1)); +- emit_insn (gen_sqrtxf2 (e2, e2)); +- emit_insn (gen_addxf3 (e2, e2, cst1)); +- +- /* e1 = e1 / e2 */ +- emit_insn (gen_divxf3 (e1, e1, e2)); +- +- /* scratch = fxam (op1) */ +- emit_insn (gen_fxamxf2_i387 (scratch, op1)); +- +- /* e1 = e1 + |op1| */ +- emit_insn (gen_absxf2 (e2, op1)); +- emit_insn (gen_addxf3 (e1, e1, e2)); +- +- /* e2 = log1p (e1) */ +- ix86_emit_i387_log1p (e2, e1); +- +- /* flags = signbit (op1) */ +- emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02))); +- +- /* if (flags) then e2 = -e2 */ +- tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, +- gen_rtx_EQ (VOIDmode, flags, const0_rtx), +- gen_rtx_LABEL_REF (VOIDmode, jump_label), +- pc_rtx); +- insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); +- predict_jump (REG_BR_PROB_BASE * 50 / 100); +- JUMP_LABEL (insn) = jump_label; +- +- emit_insn (gen_negxf2 (e2, e2)); +- +- emit_label (jump_label); +- LABEL_NUSES (jump_label) = 1; +- +- emit_move_insn (op0, e2); +-} +- +-/* Output code to perform an acosh XFmode calculation. */ +- +-void ix86_emit_i387_acosh (rtx op0, rtx op1) +-{ +- rtx e1 = gen_reg_rtx (XFmode); +- rtx e2 = gen_reg_rtx (XFmode); +- rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode)); +- +- /* e2 = sqrt (op1 + 1.0) */ +- emit_insn (gen_addxf3 (e2, op1, cst1)); +- emit_insn (gen_sqrtxf2 (e2, e2)); +- +- /* e1 = sqrt (op1 - 1.0) */ +- emit_insn (gen_subxf3 (e1, op1, cst1)); +- emit_insn (gen_sqrtxf2 (e1, e1)); +- +- /* e1 = e1 * e2 */ +- emit_insn (gen_mulxf3 (e1, e1, e2)); +- +- /* e1 = e1 + op1 */ +- emit_insn (gen_addxf3 (e1, e1, op1)); +- +- /* op0 = log (e1) */ +- emit_insn (gen_logxf2 (op0, e1)); +-} +- +-/* Output code to perform an atanh XFmode calculation. 
*/ +- +-void ix86_emit_i387_atanh (rtx op0, rtx op1) +-{ +- rtx e1 = gen_reg_rtx (XFmode); +- rtx e2 = gen_reg_rtx (XFmode); +- rtx scratch = gen_reg_rtx (HImode); +- rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG); +- rtx half = const_double_from_real_value (dconsthalf, XFmode); +- rtx cst1, tmp; +- rtx_code_label *jump_label = gen_label_rtx (); +- rtx_insn *insn; +- +- /* scratch = fxam (op1) */ +- emit_insn (gen_fxamxf2_i387 (scratch, op1)); +- +- /* e2 = |op1| */ +- emit_insn (gen_absxf2 (e2, op1)); +- +- /* e1 = -(e2 + e2) / (e2 + 1.0) */ +- cst1 = force_reg (XFmode, CONST1_RTX (XFmode)); +- emit_insn (gen_addxf3 (e1, e2, cst1)); +- emit_insn (gen_addxf3 (e2, e2, e2)); +- emit_insn (gen_negxf2 (e2, e2)); +- emit_insn (gen_divxf3 (e1, e2, e1)); +- +- /* e2 = log1p (e1) */ +- ix86_emit_i387_log1p (e2, e1); +- +- /* flags = signbit (op1) */ +- emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02))); +- +- /* if (!flags) then e2 = -e2 */ +- tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, +- gen_rtx_NE (VOIDmode, flags, const0_rtx), +- gen_rtx_LABEL_REF (VOIDmode, jump_label), +- pc_rtx); +- insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); +- predict_jump (REG_BR_PROB_BASE * 50 / 100); +- JUMP_LABEL (insn) = jump_label; +- +- emit_insn (gen_negxf2 (e2, e2)); +- +- emit_label (jump_label); +- LABEL_NUSES (jump_label) = 1; +- +- /* op0 = 0.5 * e2 */ +- half = force_reg (XFmode, half); +- emit_insn (gen_mulxf3 (op0, e2, half)); +-} +- +-/* Output code to perform a log1p XFmode calculation. */ +- +-void ix86_emit_i387_log1p (rtx op0, rtx op1) +-{ +- rtx_code_label *label1 = gen_label_rtx (); +- rtx_code_label *label2 = gen_label_rtx (); +- +- rtx tmp = gen_reg_rtx (XFmode); +- rtx res = gen_reg_rtx (XFmode); +- rtx cst, cstln2, cst1; +- rtx_insn *insn; +- +- cst = const_double_from_real_value +- (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode); +- cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */ +- +- emit_insn (gen_absxf2 (tmp, op1)); +- +- cst = force_reg (XFmode, cst); +- ix86_expand_branch (GE, tmp, cst, label1); +- predict_jump (REG_BR_PROB_BASE * 10 / 100); +- insn = get_last_insn (); +- JUMP_LABEL (insn) = label1; +- +- emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2)); +- emit_jump (label2); +- +- emit_label (label1); +- LABEL_NUSES (label1) = 1; +- +- cst1 = force_reg (XFmode, CONST1_RTX (XFmode)); +- emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1))); +- emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2)); +- +- emit_label (label2); +- LABEL_NUSES (label2) = 1; +- +- emit_move_insn (op0, res); +-} +- +-/* Emit code for round calculation. 
*/ +-void ix86_emit_i387_round (rtx op0, rtx op1) +-{ +- machine_mode inmode = GET_MODE (op1); +- machine_mode outmode = GET_MODE (op0); +- rtx e1 = gen_reg_rtx (XFmode); +- rtx e2 = gen_reg_rtx (XFmode); +- rtx scratch = gen_reg_rtx (HImode); +- rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG); +- rtx half = const_double_from_real_value (dconsthalf, XFmode); +- rtx res = gen_reg_rtx (outmode); +- rtx_code_label *jump_label = gen_label_rtx (); +- rtx (*floor_insn) (rtx, rtx); +- rtx (*neg_insn) (rtx, rtx); +- rtx_insn *insn; +- rtx tmp; +- +- switch (inmode) +- { +- case E_SFmode: +- case E_DFmode: +- tmp = gen_reg_rtx (XFmode); +- +- emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1))); +- op1 = tmp; +- break; +- case E_XFmode: +- break; +- default: +- gcc_unreachable (); +- } +- +- switch (outmode) +- { +- case E_SFmode: +- floor_insn = gen_frndintxf2_floor; +- neg_insn = gen_negsf2; +- break; +- case E_DFmode: +- floor_insn = gen_frndintxf2_floor; +- neg_insn = gen_negdf2; +- break; +- case E_XFmode: +- floor_insn = gen_frndintxf2_floor; +- neg_insn = gen_negxf2; +- break; +- case E_HImode: +- floor_insn = gen_lfloorxfhi2; +- neg_insn = gen_neghi2; +- break; +- case E_SImode: +- floor_insn = gen_lfloorxfsi2; +- neg_insn = gen_negsi2; +- break; +- case E_DImode: +- floor_insn = gen_lfloorxfdi2; +- neg_insn = gen_negdi2; +- break; +- default: +- gcc_unreachable (); +- } +- +- /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */ +- +- /* scratch = fxam(op1) */ +- emit_insn (gen_fxamxf2_i387 (scratch, op1)); +- +- /* e1 = fabs(op1) */ +- emit_insn (gen_absxf2 (e1, op1)); +- +- /* e2 = e1 + 0.5 */ +- half = force_reg (XFmode, half); +- emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half))); +- +- /* res = floor(e2) */ +- switch (outmode) +- { +- case E_SFmode: +- case E_DFmode: +- { +- tmp = gen_reg_rtx (XFmode); +- +- emit_insn (floor_insn (tmp, e2)); +- emit_insn (gen_rtx_SET (res, +- gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp), +- UNSPEC_TRUNC_NOOP))); +- } +- break; +- default: +- emit_insn (floor_insn (res, e2)); +- } +- +- /* flags = signbit(a) */ +- emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02))); +- +- /* if (flags) then res = -res */ +- tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, +- gen_rtx_EQ (VOIDmode, flags, const0_rtx), +- gen_rtx_LABEL_REF (VOIDmode, jump_label), +- pc_rtx); +- insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); +- predict_jump (REG_BR_PROB_BASE * 50 / 100); +- JUMP_LABEL (insn) = jump_label; +- +- emit_insn (neg_insn (res, res)); +- +- emit_label (jump_label); +- LABEL_NUSES (jump_label) = 1; +- +- emit_move_insn (op0, res); +-} +- +-/* Output code to perform a Newton-Rhapson approximation of a single precision +- floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. 
*/ +- +-void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode) +-{ +- rtx x0, x1, e0, e1; +- +- x0 = gen_reg_rtx (mode); +- e0 = gen_reg_rtx (mode); +- e1 = gen_reg_rtx (mode); +- x1 = gen_reg_rtx (mode); +- +- /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */ +- +- b = force_reg (mode, b); +- +- /* x0 = rcp(b) estimate */ +- if (mode == V16SFmode || mode == V8DFmode) +- { +- if (TARGET_AVX512ER) +- { +- emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b), +- UNSPEC_RCP28))); +- /* res = a * x0 */ +- emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0))); +- return; +- } +- else +- emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b), +- UNSPEC_RCP14))); +- } +- else +- emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b), +- UNSPEC_RCP))); +- +- /* e0 = x0 * b */ +- emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b))); +- +- /* e0 = x0 * e0 */ +- emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0))); +- +- /* e1 = x0 + x0 */ +- emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0))); +- +- /* x1 = e1 - e0 */ +- emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0))); +- +- /* res = a * x1 */ +- emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1))); +-} +- +-/* Output code to perform a Newton-Rhapson approximation of a +- single precision floating point [reciprocal] square root. */ +- +-void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip) +-{ +- rtx x0, e0, e1, e2, e3, mthree, mhalf; +- REAL_VALUE_TYPE r; +- int unspec; +- +- x0 = gen_reg_rtx (mode); +- e0 = gen_reg_rtx (mode); +- e1 = gen_reg_rtx (mode); +- e2 = gen_reg_rtx (mode); +- e3 = gen_reg_rtx (mode); +- +- if (TARGET_AVX512ER && mode == V16SFmode) +- { +- if (recip) +- /* res = rsqrt28(a) estimate */ +- emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a), +- UNSPEC_RSQRT28))); +- else +- { +- /* x0 = rsqrt28(a) estimate */ +- emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a), +- UNSPEC_RSQRT28))); +- /* res = rcp28(x0) estimate */ +- emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0), +- UNSPEC_RCP28))); +- } +- return; +- } +- +- real_from_integer (&r, VOIDmode, -3, SIGNED); +- mthree = const_double_from_real_value (r, SFmode); +- +- real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL); +- mhalf = const_double_from_real_value (r, SFmode); +- unspec = UNSPEC_RSQRT; +- +- if (VECTOR_MODE_P (mode)) +- { +- mthree = ix86_build_const_vector (mode, true, mthree); +- mhalf = ix86_build_const_vector (mode, true, mhalf); +- /* There is no 512-bit rsqrt. There is however rsqrt14. */ +- if (GET_MODE_SIZE (mode) == 64) +- unspec = UNSPEC_RSQRT14; +- } +- +- /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) +- rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */ +- +- a = force_reg (mode, a); +- +- /* x0 = rsqrt(a) estimate */ +- emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a), +- unspec))); +- +- /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0). */ +- if (!recip) +- { +- rtx zero = force_reg (mode, CONST0_RTX(mode)); +- rtx mask; +- +- /* Handle masked compare. */ +- if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64) +- { +- mask = gen_reg_rtx (HImode); +- /* Imm value 0x4 corresponds to not-equal comparison. 
*/ +- emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4))); +- emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask)); +- } +- else +- { +- mask = gen_reg_rtx (mode); +- emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a))); +- emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask))); +- } +- } +- +- /* e0 = x0 * a */ +- emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a))); +- /* e1 = e0 * x0 */ +- emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0))); +- +- /* e2 = e1 - 3. */ +- mthree = force_reg (mode, mthree); +- emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree))); +- +- mhalf = force_reg (mode, mhalf); +- if (recip) +- /* e3 = -.5 * x0 */ +- emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf))); +- else +- /* e3 = -.5 * e0 */ +- emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf))); +- /* ret = e2 * e3 */ +- emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3))); +-} +- +-#ifdef TARGET_SOLARIS +-/* Solaris implementation of TARGET_ASM_NAMED_SECTION. */ ++ case IX86_BUILTIN_PSLLD: ++ case IX86_BUILTIN_PSLLD128: ++ case IX86_BUILTIN_PSLLD128_MASK: ++ case IX86_BUILTIN_PSLLD256: ++ case IX86_BUILTIN_PSLLD256_MASK: ++ case IX86_BUILTIN_PSLLD512: ++ case IX86_BUILTIN_PSLLDI: ++ case IX86_BUILTIN_PSLLDI128: ++ case IX86_BUILTIN_PSLLDI128_MASK: ++ case IX86_BUILTIN_PSLLDI256: ++ case IX86_BUILTIN_PSLLDI256_MASK: ++ case IX86_BUILTIN_PSLLDI512: ++ case IX86_BUILTIN_PSLLQ: ++ case IX86_BUILTIN_PSLLQ128: ++ case IX86_BUILTIN_PSLLQ128_MASK: ++ case IX86_BUILTIN_PSLLQ256: ++ case IX86_BUILTIN_PSLLQ256_MASK: ++ case IX86_BUILTIN_PSLLQ512: ++ case IX86_BUILTIN_PSLLQI: ++ case IX86_BUILTIN_PSLLQI128: ++ case IX86_BUILTIN_PSLLQI128_MASK: ++ case IX86_BUILTIN_PSLLQI256: ++ case IX86_BUILTIN_PSLLQI256_MASK: ++ case IX86_BUILTIN_PSLLQI512: ++ case IX86_BUILTIN_PSLLW: ++ case IX86_BUILTIN_PSLLW128: ++ case IX86_BUILTIN_PSLLW128_MASK: ++ case IX86_BUILTIN_PSLLW256: ++ case IX86_BUILTIN_PSLLW256_MASK: ++ case IX86_BUILTIN_PSLLW512_MASK: ++ case IX86_BUILTIN_PSLLWI: ++ case IX86_BUILTIN_PSLLWI128: ++ case IX86_BUILTIN_PSLLWI128_MASK: ++ case IX86_BUILTIN_PSLLWI256: ++ case IX86_BUILTIN_PSLLWI256_MASK: ++ case IX86_BUILTIN_PSLLWI512_MASK: ++ rcode = ASHIFT; ++ is_vshift = false; ++ goto do_shift; ++ case IX86_BUILTIN_PSRAD: ++ case IX86_BUILTIN_PSRAD128: ++ case IX86_BUILTIN_PSRAD128_MASK: ++ case IX86_BUILTIN_PSRAD256: ++ case IX86_BUILTIN_PSRAD256_MASK: ++ case IX86_BUILTIN_PSRAD512: ++ case IX86_BUILTIN_PSRADI: ++ case IX86_BUILTIN_PSRADI128: ++ case IX86_BUILTIN_PSRADI128_MASK: ++ case IX86_BUILTIN_PSRADI256: ++ case IX86_BUILTIN_PSRADI256_MASK: ++ case IX86_BUILTIN_PSRADI512: ++ case IX86_BUILTIN_PSRAQ128_MASK: ++ case IX86_BUILTIN_PSRAQ256_MASK: ++ case IX86_BUILTIN_PSRAQ512: ++ case IX86_BUILTIN_PSRAQI128_MASK: ++ case IX86_BUILTIN_PSRAQI256_MASK: ++ case IX86_BUILTIN_PSRAQI512: ++ case IX86_BUILTIN_PSRAW: ++ case IX86_BUILTIN_PSRAW128: ++ case IX86_BUILTIN_PSRAW128_MASK: ++ case IX86_BUILTIN_PSRAW256: ++ case IX86_BUILTIN_PSRAW256_MASK: ++ case IX86_BUILTIN_PSRAW512: ++ case IX86_BUILTIN_PSRAWI: ++ case IX86_BUILTIN_PSRAWI128: ++ case IX86_BUILTIN_PSRAWI128_MASK: ++ case IX86_BUILTIN_PSRAWI256: ++ case IX86_BUILTIN_PSRAWI256_MASK: ++ case IX86_BUILTIN_PSRAWI512: ++ rcode = ASHIFTRT; ++ is_vshift = false; ++ goto do_shift; ++ case IX86_BUILTIN_PSRLD: ++ case IX86_BUILTIN_PSRLD128: ++ case IX86_BUILTIN_PSRLD128_MASK: ++ case IX86_BUILTIN_PSRLD256: ++ case IX86_BUILTIN_PSRLD256_MASK: ++ case IX86_BUILTIN_PSRLD512: ++ case IX86_BUILTIN_PSRLDI: ++ 
case IX86_BUILTIN_PSRLDI128: ++ case IX86_BUILTIN_PSRLDI128_MASK: ++ case IX86_BUILTIN_PSRLDI256: ++ case IX86_BUILTIN_PSRLDI256_MASK: ++ case IX86_BUILTIN_PSRLDI512: ++ case IX86_BUILTIN_PSRLQ: ++ case IX86_BUILTIN_PSRLQ128: ++ case IX86_BUILTIN_PSRLQ128_MASK: ++ case IX86_BUILTIN_PSRLQ256: ++ case IX86_BUILTIN_PSRLQ256_MASK: ++ case IX86_BUILTIN_PSRLQ512: ++ case IX86_BUILTIN_PSRLQI: ++ case IX86_BUILTIN_PSRLQI128: ++ case IX86_BUILTIN_PSRLQI128_MASK: ++ case IX86_BUILTIN_PSRLQI256: ++ case IX86_BUILTIN_PSRLQI256_MASK: ++ case IX86_BUILTIN_PSRLQI512: ++ case IX86_BUILTIN_PSRLW: ++ case IX86_BUILTIN_PSRLW128: ++ case IX86_BUILTIN_PSRLW128_MASK: ++ case IX86_BUILTIN_PSRLW256: ++ case IX86_BUILTIN_PSRLW256_MASK: ++ case IX86_BUILTIN_PSRLW512: ++ case IX86_BUILTIN_PSRLWI: ++ case IX86_BUILTIN_PSRLWI128: ++ case IX86_BUILTIN_PSRLWI128_MASK: ++ case IX86_BUILTIN_PSRLWI256: ++ case IX86_BUILTIN_PSRLWI256_MASK: ++ case IX86_BUILTIN_PSRLWI512: ++ rcode = LSHIFTRT; ++ is_vshift = false; ++ goto do_shift; ++ case IX86_BUILTIN_PSLLVV16HI: ++ case IX86_BUILTIN_PSLLVV16SI: ++ case IX86_BUILTIN_PSLLVV2DI: ++ case IX86_BUILTIN_PSLLVV2DI_MASK: ++ case IX86_BUILTIN_PSLLVV32HI: ++ case IX86_BUILTIN_PSLLVV4DI: ++ case IX86_BUILTIN_PSLLVV4DI_MASK: ++ case IX86_BUILTIN_PSLLVV4SI: ++ case IX86_BUILTIN_PSLLVV4SI_MASK: ++ case IX86_BUILTIN_PSLLVV8DI: ++ case IX86_BUILTIN_PSLLVV8HI: ++ case IX86_BUILTIN_PSLLVV8SI: ++ case IX86_BUILTIN_PSLLVV8SI_MASK: ++ rcode = ASHIFT; ++ is_vshift = true; ++ goto do_shift; ++ case IX86_BUILTIN_PSRAVQ128: ++ case IX86_BUILTIN_PSRAVQ256: ++ case IX86_BUILTIN_PSRAVV16HI: ++ case IX86_BUILTIN_PSRAVV16SI: ++ case IX86_BUILTIN_PSRAVV32HI: ++ case IX86_BUILTIN_PSRAVV4SI: ++ case IX86_BUILTIN_PSRAVV4SI_MASK: ++ case IX86_BUILTIN_PSRAVV8DI: ++ case IX86_BUILTIN_PSRAVV8HI: ++ case IX86_BUILTIN_PSRAVV8SI: ++ case IX86_BUILTIN_PSRAVV8SI_MASK: ++ rcode = ASHIFTRT; ++ is_vshift = true; ++ goto do_shift; ++ case IX86_BUILTIN_PSRLVV16HI: ++ case IX86_BUILTIN_PSRLVV16SI: ++ case IX86_BUILTIN_PSRLVV2DI: ++ case IX86_BUILTIN_PSRLVV2DI_MASK: ++ case IX86_BUILTIN_PSRLVV32HI: ++ case IX86_BUILTIN_PSRLVV4DI: ++ case IX86_BUILTIN_PSRLVV4DI_MASK: ++ case IX86_BUILTIN_PSRLVV4SI: ++ case IX86_BUILTIN_PSRLVV4SI_MASK: ++ case IX86_BUILTIN_PSRLVV8DI: ++ case IX86_BUILTIN_PSRLVV8HI: ++ case IX86_BUILTIN_PSRLVV8SI: ++ case IX86_BUILTIN_PSRLVV8SI_MASK: ++ rcode = LSHIFTRT; ++ is_vshift = true; ++ goto do_shift; + +-static void +-i386_solaris_elf_named_section (const char *name, unsigned int flags, +- tree decl) +-{ +- /* With Binutils 2.15, the "@unwind" marker must be specified on +- every occurrence of the ".eh_frame" section, not just the first +- one. */ +- if (TARGET_64BIT +- && strcmp (name, ".eh_frame") == 0) +- { +- fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name, +- flags & SECTION_WRITE ? "aw" : "a"); +- return; +- } ++ do_shift: ++ gcc_assert (n_args >= 2); ++ if (TREE_CODE (args[0]) != VECTOR_CST) ++ break; ++ mask = HOST_WIDE_INT_M1U; ++ if (n_args > 2) ++ { ++ /* This is masked shift. 
*/ ++ if (!tree_fits_uhwi_p (args[n_args - 1]) ++ || TREE_SIDE_EFFECTS (args[n_args - 2])) ++ break; ++ mask = tree_to_uhwi (args[n_args - 1]); ++ unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (args[0])); ++ mask |= HOST_WIDE_INT_M1U << elems; ++ if (mask != HOST_WIDE_INT_M1U ++ && TREE_CODE (args[n_args - 2]) != VECTOR_CST) ++ break; ++ if (mask == (HOST_WIDE_INT_M1U << elems)) ++ return args[n_args - 2]; ++ } ++ if (is_vshift && TREE_CODE (args[1]) != VECTOR_CST) ++ break; ++ if (tree tem = (is_vshift ? integer_one_node ++ : ix86_vector_shift_count (args[1]))) ++ { ++ unsigned HOST_WIDE_INT count = tree_to_uhwi (tem); ++ unsigned HOST_WIDE_INT prec ++ = TYPE_PRECISION (TREE_TYPE (TREE_TYPE (args[0]))); ++ if (count == 0 && mask == HOST_WIDE_INT_M1U) ++ return args[0]; ++ if (count >= prec) ++ { ++ if (rcode == ASHIFTRT) ++ count = prec - 1; ++ else if (mask == HOST_WIDE_INT_M1U) ++ return build_zero_cst (TREE_TYPE (args[0])); ++ } ++ tree countt = NULL_TREE; ++ if (!is_vshift) ++ { ++ if (count >= prec) ++ countt = integer_zero_node; ++ else ++ countt = build_int_cst (integer_type_node, count); ++ } ++ tree_vector_builder builder; ++ if (mask != HOST_WIDE_INT_M1U || is_vshift) ++ builder.new_vector (TREE_TYPE (args[0]), ++ TYPE_VECTOR_SUBPARTS (TREE_TYPE (args[0])), ++ 1); ++ else ++ builder.new_unary_operation (TREE_TYPE (args[0]), args[0], ++ false); ++ unsigned int cnt = builder.encoded_nelts (); ++ for (unsigned int i = 0; i < cnt; ++i) ++ { ++ tree elt = VECTOR_CST_ELT (args[0], i); ++ if (TREE_CODE (elt) != INTEGER_CST || TREE_OVERFLOW (elt)) ++ return NULL_TREE; ++ tree type = TREE_TYPE (elt); ++ if (rcode == LSHIFTRT) ++ elt = fold_convert (unsigned_type_for (type), elt); ++ if (is_vshift) ++ { ++ countt = VECTOR_CST_ELT (args[1], i); ++ if (TREE_CODE (countt) != INTEGER_CST ++ || TREE_OVERFLOW (countt)) ++ return NULL_TREE; ++ if (wi::neg_p (wi::to_wide (countt)) ++ || wi::to_widest (countt) >= prec) ++ { ++ if (rcode == ASHIFTRT) ++ countt = build_int_cst (TREE_TYPE (countt), ++ prec - 1); ++ else ++ { ++ elt = build_zero_cst (TREE_TYPE (elt)); ++ countt = build_zero_cst (TREE_TYPE (countt)); ++ } ++ } ++ } ++ else if (count >= prec) ++ elt = build_zero_cst (TREE_TYPE (elt)); ++ elt = const_binop (rcode == ASHIFT ++ ? LSHIFT_EXPR : RSHIFT_EXPR, ++ TREE_TYPE (elt), elt, countt); ++ if (!elt || TREE_CODE (elt) != INTEGER_CST) ++ return NULL_TREE; ++ if (rcode == LSHIFTRT) ++ elt = fold_convert (type, elt); ++ if ((mask & (HOST_WIDE_INT_1U << i)) == 0) ++ { ++ elt = VECTOR_CST_ELT (args[n_args - 2], i); ++ if (TREE_CODE (elt) != INTEGER_CST ++ || TREE_OVERFLOW (elt)) ++ return NULL_TREE; ++ } ++ builder.quick_push (elt); ++ } ++ return builder.build (); ++ } ++ break; + +-#ifndef USE_GAS +- if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE) +- { +- solaris_elf_asm_comdat_section (name, flags, decl); +- return; ++ default: ++ break; ++ } + } + +- /* Solaris/x86 as uses the same syntax for the SHF_EXCLUDE flags as the +- SPARC assembler. One cannot mix single-letter flags and #exclude, so +- only emit the latter here. */ +- if (flags & SECTION_EXCLUDE) +- { +- fprintf (asm_out_file, "\t.section\t%s,#exclude\n", name); +- return; +- } ++#ifdef SUBTARGET_FOLD_BUILTIN ++ return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore); + #endif + +- default_elf_asm_named_section (name, flags, decl); ++ return NULL_TREE; + } +-#endif /* TARGET_SOLARIS */ + +-/* Return the mangling of TYPE if it is an extended fundamental type. 
*/ ++/* Fold a MD builtin (use ix86_fold_builtin for folding into ++ constant) in GIMPLE. */ + +-static const char * +-ix86_mangle_type (const_tree type) ++bool ++ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi) + { +- type = TYPE_MAIN_VARIANT (type); +- +- if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE +- && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE) +- return NULL; ++ gimple *stmt = gsi_stmt (*gsi); ++ tree fndecl = gimple_call_fndecl (stmt); ++ gcc_checking_assert (fndecl && fndecl_built_in_p (fndecl, BUILT_IN_MD)); ++ int n_args = gimple_call_num_args (stmt); ++ enum ix86_builtins fn_code ++ = (enum ix86_builtins) DECL_MD_FUNCTION_CODE (fndecl); ++ tree decl = NULL_TREE; ++ tree arg0, arg1, arg2; ++ enum rtx_code rcode; ++ unsigned HOST_WIDE_INT count; ++ bool is_vshift; + +- switch (TYPE_MODE (type)) ++ switch (fn_code) + { +- case E_TFmode: +- /* __float128 is "g". */ +- return "g"; +- case E_XFmode: +- /* "long double" or __float80 is "e". */ +- return "e"; +- default: +- return NULL; +- } +-} ++ case IX86_BUILTIN_TZCNT32: ++ decl = builtin_decl_implicit (BUILT_IN_CTZ); ++ goto fold_tzcnt_lzcnt; + +-static GTY(()) tree ix86_tls_stack_chk_guard_decl; ++ case IX86_BUILTIN_TZCNT64: ++ decl = builtin_decl_implicit (BUILT_IN_CTZLL); ++ goto fold_tzcnt_lzcnt; + +-static tree +-ix86_stack_protect_guard (void) +-{ +- if (TARGET_SSP_TLS_GUARD) +- { +- tree type_node = lang_hooks.types.type_for_mode (ptr_mode, 1); +- int qual = ENCODE_QUAL_ADDR_SPACE (ix86_stack_protector_guard_reg); +- tree type = build_qualified_type (type_node, qual); +- tree t; ++ case IX86_BUILTIN_LZCNT32: ++ decl = builtin_decl_implicit (BUILT_IN_CLZ); ++ goto fold_tzcnt_lzcnt; ++ ++ case IX86_BUILTIN_LZCNT64: ++ decl = builtin_decl_implicit (BUILT_IN_CLZLL); ++ goto fold_tzcnt_lzcnt; ++ ++ fold_tzcnt_lzcnt: ++ gcc_assert (n_args == 1); ++ arg0 = gimple_call_arg (stmt, 0); ++ if (TREE_CODE (arg0) == SSA_NAME && decl && gimple_call_lhs (stmt)) ++ { ++ int prec = TYPE_PRECISION (TREE_TYPE (arg0)); ++ /* If arg0 is provably non-zero, optimize into generic ++ __builtin_c[tl]z{,ll} function the middle-end handles ++ better. 
*/ ++ if (!expr_not_equal_to (arg0, wi::zero (prec))) ++ return false; ++ ++ location_t loc = gimple_location (stmt); ++ gimple *g = gimple_build_call (decl, 1, arg0); ++ gimple_set_location (g, loc); ++ tree lhs = make_ssa_name (integer_type_node); ++ gimple_call_set_lhs (g, lhs); ++ gsi_insert_before (gsi, g, GSI_SAME_STMT); ++ g = gimple_build_assign (gimple_call_lhs (stmt), NOP_EXPR, lhs); ++ gimple_set_location (g, loc); ++ gsi_replace (gsi, g, false); ++ return true; ++ } ++ break; ++ ++ case IX86_BUILTIN_BZHI32: ++ case IX86_BUILTIN_BZHI64: ++ gcc_assert (n_args == 2); ++ arg1 = gimple_call_arg (stmt, 1); ++ if (tree_fits_uhwi_p (arg1) && gimple_call_lhs (stmt)) ++ { ++ unsigned int idx = tree_to_uhwi (arg1) & 0xff; ++ arg0 = gimple_call_arg (stmt, 0); ++ if (idx < TYPE_PRECISION (TREE_TYPE (arg0))) ++ break; ++ location_t loc = gimple_location (stmt); ++ gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0); ++ gimple_set_location (g, loc); ++ gsi_replace (gsi, g, false); ++ return true; ++ } ++ break; ++ ++ case IX86_BUILTIN_PDEP32: ++ case IX86_BUILTIN_PDEP64: ++ case IX86_BUILTIN_PEXT32: ++ case IX86_BUILTIN_PEXT64: ++ gcc_assert (n_args == 2); ++ arg1 = gimple_call_arg (stmt, 1); ++ if (integer_all_onesp (arg1) && gimple_call_lhs (stmt)) ++ { ++ location_t loc = gimple_location (stmt); ++ arg0 = gimple_call_arg (stmt, 0); ++ gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0); ++ gimple_set_location (g, loc); ++ gsi_replace (gsi, g, false); ++ return true; ++ } ++ break; ++ ++ case IX86_BUILTIN_PSLLD: ++ case IX86_BUILTIN_PSLLD128: ++ case IX86_BUILTIN_PSLLD128_MASK: ++ case IX86_BUILTIN_PSLLD256: ++ case IX86_BUILTIN_PSLLD256_MASK: ++ case IX86_BUILTIN_PSLLD512: ++ case IX86_BUILTIN_PSLLDI: ++ case IX86_BUILTIN_PSLLDI128: ++ case IX86_BUILTIN_PSLLDI128_MASK: ++ case IX86_BUILTIN_PSLLDI256: ++ case IX86_BUILTIN_PSLLDI256_MASK: ++ case IX86_BUILTIN_PSLLDI512: ++ case IX86_BUILTIN_PSLLQ: ++ case IX86_BUILTIN_PSLLQ128: ++ case IX86_BUILTIN_PSLLQ128_MASK: ++ case IX86_BUILTIN_PSLLQ256: ++ case IX86_BUILTIN_PSLLQ256_MASK: ++ case IX86_BUILTIN_PSLLQ512: ++ case IX86_BUILTIN_PSLLQI: ++ case IX86_BUILTIN_PSLLQI128: ++ case IX86_BUILTIN_PSLLQI128_MASK: ++ case IX86_BUILTIN_PSLLQI256: ++ case IX86_BUILTIN_PSLLQI256_MASK: ++ case IX86_BUILTIN_PSLLQI512: ++ case IX86_BUILTIN_PSLLW: ++ case IX86_BUILTIN_PSLLW128: ++ case IX86_BUILTIN_PSLLW128_MASK: ++ case IX86_BUILTIN_PSLLW256: ++ case IX86_BUILTIN_PSLLW256_MASK: ++ case IX86_BUILTIN_PSLLW512_MASK: ++ case IX86_BUILTIN_PSLLWI: ++ case IX86_BUILTIN_PSLLWI128: ++ case IX86_BUILTIN_PSLLWI128_MASK: ++ case IX86_BUILTIN_PSLLWI256: ++ case IX86_BUILTIN_PSLLWI256_MASK: ++ case IX86_BUILTIN_PSLLWI512_MASK: ++ rcode = ASHIFT; ++ is_vshift = false; ++ goto do_shift; ++ case IX86_BUILTIN_PSRAD: ++ case IX86_BUILTIN_PSRAD128: ++ case IX86_BUILTIN_PSRAD128_MASK: ++ case IX86_BUILTIN_PSRAD256: ++ case IX86_BUILTIN_PSRAD256_MASK: ++ case IX86_BUILTIN_PSRAD512: ++ case IX86_BUILTIN_PSRADI: ++ case IX86_BUILTIN_PSRADI128: ++ case IX86_BUILTIN_PSRADI128_MASK: ++ case IX86_BUILTIN_PSRADI256: ++ case IX86_BUILTIN_PSRADI256_MASK: ++ case IX86_BUILTIN_PSRADI512: ++ case IX86_BUILTIN_PSRAQ128_MASK: ++ case IX86_BUILTIN_PSRAQ256_MASK: ++ case IX86_BUILTIN_PSRAQ512: ++ case IX86_BUILTIN_PSRAQI128_MASK: ++ case IX86_BUILTIN_PSRAQI256_MASK: ++ case IX86_BUILTIN_PSRAQI512: ++ case IX86_BUILTIN_PSRAW: ++ case IX86_BUILTIN_PSRAW128: ++ case IX86_BUILTIN_PSRAW128_MASK: ++ case IX86_BUILTIN_PSRAW256: ++ case IX86_BUILTIN_PSRAW256_MASK: ++ case 
IX86_BUILTIN_PSRAW512: ++ case IX86_BUILTIN_PSRAWI: ++ case IX86_BUILTIN_PSRAWI128: ++ case IX86_BUILTIN_PSRAWI128_MASK: ++ case IX86_BUILTIN_PSRAWI256: ++ case IX86_BUILTIN_PSRAWI256_MASK: ++ case IX86_BUILTIN_PSRAWI512: ++ rcode = ASHIFTRT; ++ is_vshift = false; ++ goto do_shift; ++ case IX86_BUILTIN_PSRLD: ++ case IX86_BUILTIN_PSRLD128: ++ case IX86_BUILTIN_PSRLD128_MASK: ++ case IX86_BUILTIN_PSRLD256: ++ case IX86_BUILTIN_PSRLD256_MASK: ++ case IX86_BUILTIN_PSRLD512: ++ case IX86_BUILTIN_PSRLDI: ++ case IX86_BUILTIN_PSRLDI128: ++ case IX86_BUILTIN_PSRLDI128_MASK: ++ case IX86_BUILTIN_PSRLDI256: ++ case IX86_BUILTIN_PSRLDI256_MASK: ++ case IX86_BUILTIN_PSRLDI512: ++ case IX86_BUILTIN_PSRLQ: ++ case IX86_BUILTIN_PSRLQ128: ++ case IX86_BUILTIN_PSRLQ128_MASK: ++ case IX86_BUILTIN_PSRLQ256: ++ case IX86_BUILTIN_PSRLQ256_MASK: ++ case IX86_BUILTIN_PSRLQ512: ++ case IX86_BUILTIN_PSRLQI: ++ case IX86_BUILTIN_PSRLQI128: ++ case IX86_BUILTIN_PSRLQI128_MASK: ++ case IX86_BUILTIN_PSRLQI256: ++ case IX86_BUILTIN_PSRLQI256_MASK: ++ case IX86_BUILTIN_PSRLQI512: ++ case IX86_BUILTIN_PSRLW: ++ case IX86_BUILTIN_PSRLW128: ++ case IX86_BUILTIN_PSRLW128_MASK: ++ case IX86_BUILTIN_PSRLW256: ++ case IX86_BUILTIN_PSRLW256_MASK: ++ case IX86_BUILTIN_PSRLW512: ++ case IX86_BUILTIN_PSRLWI: ++ case IX86_BUILTIN_PSRLWI128: ++ case IX86_BUILTIN_PSRLWI128_MASK: ++ case IX86_BUILTIN_PSRLWI256: ++ case IX86_BUILTIN_PSRLWI256_MASK: ++ case IX86_BUILTIN_PSRLWI512: ++ rcode = LSHIFTRT; ++ is_vshift = false; ++ goto do_shift; ++ case IX86_BUILTIN_PSLLVV16HI: ++ case IX86_BUILTIN_PSLLVV16SI: ++ case IX86_BUILTIN_PSLLVV2DI: ++ case IX86_BUILTIN_PSLLVV2DI_MASK: ++ case IX86_BUILTIN_PSLLVV32HI: ++ case IX86_BUILTIN_PSLLVV4DI: ++ case IX86_BUILTIN_PSLLVV4DI_MASK: ++ case IX86_BUILTIN_PSLLVV4SI: ++ case IX86_BUILTIN_PSLLVV4SI_MASK: ++ case IX86_BUILTIN_PSLLVV8DI: ++ case IX86_BUILTIN_PSLLVV8HI: ++ case IX86_BUILTIN_PSLLVV8SI: ++ case IX86_BUILTIN_PSLLVV8SI_MASK: ++ rcode = ASHIFT; ++ is_vshift = true; ++ goto do_shift; ++ case IX86_BUILTIN_PSRAVQ128: ++ case IX86_BUILTIN_PSRAVQ256: ++ case IX86_BUILTIN_PSRAVV16HI: ++ case IX86_BUILTIN_PSRAVV16SI: ++ case IX86_BUILTIN_PSRAVV32HI: ++ case IX86_BUILTIN_PSRAVV4SI: ++ case IX86_BUILTIN_PSRAVV4SI_MASK: ++ case IX86_BUILTIN_PSRAVV8DI: ++ case IX86_BUILTIN_PSRAVV8HI: ++ case IX86_BUILTIN_PSRAVV8SI: ++ case IX86_BUILTIN_PSRAVV8SI_MASK: ++ rcode = ASHIFTRT; ++ is_vshift = true; ++ goto do_shift; ++ case IX86_BUILTIN_PSRLVV16HI: ++ case IX86_BUILTIN_PSRLVV16SI: ++ case IX86_BUILTIN_PSRLVV2DI: ++ case IX86_BUILTIN_PSRLVV2DI_MASK: ++ case IX86_BUILTIN_PSRLVV32HI: ++ case IX86_BUILTIN_PSRLVV4DI: ++ case IX86_BUILTIN_PSRLVV4DI_MASK: ++ case IX86_BUILTIN_PSRLVV4SI: ++ case IX86_BUILTIN_PSRLVV4SI_MASK: ++ case IX86_BUILTIN_PSRLVV8DI: ++ case IX86_BUILTIN_PSRLVV8HI: ++ case IX86_BUILTIN_PSRLVV8SI: ++ case IX86_BUILTIN_PSRLVV8SI_MASK: ++ rcode = LSHIFTRT; ++ is_vshift = true; ++ goto do_shift; + +- if (global_options_set.x_ix86_stack_protector_guard_symbol_str) ++ do_shift: ++ gcc_assert (n_args >= 2); ++ arg0 = gimple_call_arg (stmt, 0); ++ arg1 = gimple_call_arg (stmt, 1); ++ if (n_args > 2) + { +- t = ix86_tls_stack_chk_guard_decl; +- +- if (t == NULL) +- { +- rtx x; +- +- t = build_decl +- (UNKNOWN_LOCATION, VAR_DECL, +- get_identifier (ix86_stack_protector_guard_symbol_str), +- type); +- TREE_STATIC (t) = 1; +- TREE_PUBLIC (t) = 1; +- DECL_EXTERNAL (t) = 1; +- TREE_USED (t) = 1; +- TREE_THIS_VOLATILE (t) = 1; +- DECL_ARTIFICIAL (t) = 1; +- DECL_IGNORED_P (t) = 1; +- +- /* Do not share RTL 
as the declaration is visible outside of +- current function. */ +- x = DECL_RTL (t); +- RTX_FLAG (x, used) = 1; +- +- ix86_tls_stack_chk_guard_decl = t; +- } ++ /* This is masked shift. Only optimize if the mask is all ones. */ ++ tree argl = gimple_call_arg (stmt, n_args - 1); ++ if (!tree_fits_uhwi_p (argl)) ++ break; ++ unsigned HOST_WIDE_INT mask = tree_to_uhwi (argl); ++ unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); ++ if ((mask | (HOST_WIDE_INT_M1U << elems)) != HOST_WIDE_INT_M1U) ++ break; + } +- else ++ if (is_vshift) + { +- tree asptrtype = build_pointer_type (type); +- +- t = build_int_cst (asptrtype, ix86_stack_protector_guard_offset); +- t = build2 (MEM_REF, asptrtype, t, +- build_int_cst (asptrtype, 0)); +- TREE_THIS_VOLATILE (t) = 1; ++ if (TREE_CODE (arg1) != VECTOR_CST) ++ break; ++ count = TYPE_PRECISION (TREE_TYPE (TREE_TYPE (arg0))); ++ if (integer_zerop (arg1)) ++ count = 0; ++ else if (rcode == ASHIFTRT) ++ break; ++ else ++ for (unsigned int i = 0; i < VECTOR_CST_NELTS (arg1); ++i) ++ { ++ tree elt = VECTOR_CST_ELT (arg1, i); ++ if (!wi::neg_p (wi::to_wide (elt)) ++ && wi::to_widest (elt) < count) ++ return false; ++ } + } +- +- return t; +- } +- +- return default_stack_protect_guard (); +-} +- +-/* For 32-bit code we can save PIC register setup by using +- __stack_chk_fail_local hidden function instead of calling +- __stack_chk_fail directly. 64-bit code doesn't need to setup any PIC +- register, so it is better to call __stack_chk_fail directly. */ +- +-static tree ATTRIBUTE_UNUSED +-ix86_stack_protect_fail (void) +-{ +- return TARGET_64BIT +- ? default_external_stack_protect_fail () +- : default_hidden_stack_protect_fail (); +-} +- +-/* Select a format to encode pointers in exception handling data. CODE +- is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is +- true if the symbol may be affected by dynamic relocations. +- +- ??? All x86 object file formats are capable of representing this. +- After all, the relocation needed is the same as for the call insn. +- Whether or not a particular assembler allows us to enter such, I +- guess we'll have to see. */ +-int +-asm_preferred_eh_data_format (int code, int global) +-{ +- if (flag_pic) +- { +- int type = DW_EH_PE_sdata8; +- if (!TARGET_64BIT +- || ix86_cmodel == CM_SMALL_PIC +- || (ix86_cmodel == CM_MEDIUM_PIC && (global || code))) +- type = DW_EH_PE_sdata4; +- return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type; +- } +- if (ix86_cmodel == CM_SMALL +- || (ix86_cmodel == CM_MEDIUM && code)) +- return DW_EH_PE_udata4; +- return DW_EH_PE_absptr; +-} +- +-/* Expand copysign from SIGN to the positive value ABS_VALUE +- storing in RESULT. If MASK is non-null, it shall be a mask to mask out +- the sign-bit. */ +-static void +-ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask) +-{ +- machine_mode mode = GET_MODE (sign); +- rtx sgn = gen_reg_rtx (mode); +- if (mask == NULL_RTX) +- { +- machine_mode vmode; +- +- if (mode == SFmode) +- vmode = V4SFmode; +- else if (mode == DFmode) +- vmode = V2DFmode; + else +- vmode = mode; +- +- mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false); +- if (!VECTOR_MODE_P (mode)) + { +- /* We need to generate a scalar mode mask in this case. 
*/ +- rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx)); +- tmp = gen_rtx_VEC_SELECT (mode, mask, tmp); +- mask = gen_reg_rtx (mode); +- emit_insn (gen_rtx_SET (mask, tmp)); ++ arg1 = ix86_vector_shift_count (arg1); ++ if (!arg1) ++ break; ++ count = tree_to_uhwi (arg1); + } +- } +- else +- mask = gen_rtx_NOT (mode, mask); +- emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign))); +- emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn))); +-} ++ if (count == 0) ++ { ++ /* Just return the first argument for shift by 0. */ ++ location_t loc = gimple_location (stmt); ++ gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0); ++ gimple_set_location (g, loc); ++ gsi_replace (gsi, g, false); ++ return true; ++ } ++ if (rcode != ASHIFTRT ++ && count >= TYPE_PRECISION (TREE_TYPE (TREE_TYPE (arg0)))) ++ { ++ /* For shift counts equal or greater than precision, except for ++ arithmetic right shift the result is zero. */ ++ location_t loc = gimple_location (stmt); ++ gimple *g = gimple_build_assign (gimple_call_lhs (stmt), ++ build_zero_cst (TREE_TYPE (arg0))); ++ gimple_set_location (g, loc); ++ gsi_replace (gsi, g, false); ++ return true; ++ } ++ break; + +-/* Expand fabs (OP0) and return a new rtx that holds the result. The +- mask for masking out the sign-bit is stored in *SMASK, if that is +- non-null. */ +-static rtx +-ix86_expand_sse_fabs (rtx op0, rtx *smask) +-{ +- machine_mode vmode, mode = GET_MODE (op0); +- rtx xa, mask; ++ case IX86_BUILTIN_SHUFPD: ++ arg2 = gimple_call_arg (stmt, 2); ++ if (TREE_CODE (arg2) == INTEGER_CST) ++ { ++ location_t loc = gimple_location (stmt); ++ unsigned HOST_WIDE_INT imask = TREE_INT_CST_LOW (arg2); ++ arg0 = gimple_call_arg (stmt, 0); ++ arg1 = gimple_call_arg (stmt, 1); ++ tree itype = long_long_integer_type_node; ++ tree vtype = build_vector_type (itype, 2); /* V2DI */ ++ tree_vector_builder elts (vtype, 2, 1); ++ /* Ignore bits other than the lowest 2. */ ++ elts.quick_push (build_int_cst (itype, imask & 1)); ++ imask >>= 1; ++ elts.quick_push (build_int_cst (itype, 2 + (imask & 1))); ++ tree omask = elts.build (); ++ gimple *g = gimple_build_assign (gimple_call_lhs (stmt), ++ VEC_PERM_EXPR, ++ arg0, arg1, omask); ++ gimple_set_location (g, loc); ++ gsi_replace (gsi, g, false); ++ return true; ++ } ++ // Do not error yet, the constant could be propagated later? ++ break; + +- xa = gen_reg_rtx (mode); +- if (mode == SFmode) +- vmode = V4SFmode; +- else if (mode == DFmode) +- vmode = V2DFmode; +- else +- vmode = mode; +- mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true); +- if (!VECTOR_MODE_P (mode)) +- { +- /* We need to generate a scalar mode mask in this case. */ +- rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx)); +- tmp = gen_rtx_VEC_SELECT (mode, mask, tmp); +- mask = gen_reg_rtx (mode); +- emit_insn (gen_rtx_SET (mask, tmp)); ++ default: ++ break; + } +- emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask))); +- +- if (smask) +- *smask = mask; +- +- return xa; +-} +- +-/* Expands a comparison of OP0 with OP1 using comparison code CODE, +- swapping the operands if SWAP_OPERANDS is true. The expanded +- code is a forward jump to a newly created label in case the +- comparison is true. The generated label rtx is returned. 
*/ +-static rtx_code_label * +-ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1, +- bool swap_operands) +-{ +- bool unordered_compare = ix86_unordered_fp_compare (code); +- rtx_code_label *label; +- rtx tmp, reg; +- +- if (swap_operands) +- std::swap (op0, op1); +- +- label = gen_label_rtx (); +- tmp = gen_rtx_COMPARE (CCFPmode, op0, op1); +- if (unordered_compare) +- tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP); +- reg = gen_rtx_REG (CCFPmode, FLAGS_REG); +- emit_insn (gen_rtx_SET (reg, tmp)); +- tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx); +- tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, +- gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx); +- tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); +- JUMP_LABEL (tmp) = label; +- +- return label; +-} +- +-/* Expand a mask generating SSE comparison instruction comparing OP0 with OP1 +- using comparison code CODE. Operands are swapped for the comparison if +- SWAP_OPERANDS is true. Returns a rtx for the generated mask. */ +-static rtx +-ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1, +- bool swap_operands) +-{ +- rtx (*insn)(rtx, rtx, rtx, rtx); +- machine_mode mode = GET_MODE (op0); +- rtx mask = gen_reg_rtx (mode); +- +- if (swap_operands) +- std::swap (op0, op1); +- +- insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse; +- +- emit_insn (insn (mask, op0, op1, +- gen_rtx_fmt_ee (code, mode, op0, op1))); +- return mask; +-} +- +-/* Generate and return a rtx of mode MODE for 2**n where n is the number +- of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */ +-static rtx +-ix86_gen_TWO52 (machine_mode mode) +-{ +- REAL_VALUE_TYPE TWO52r; +- rtx TWO52; +- +- real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23); +- TWO52 = const_double_from_real_value (TWO52r, mode); +- TWO52 = force_reg (mode, TWO52); +- +- return TWO52; +-} +- +-/* Expand SSE sequence for computing lround from OP1 storing +- into OP0. */ +-void +-ix86_expand_lround (rtx op0, rtx op1) +-{ +- /* C code for the stuff we're doing below: +- tmp = op1 + copysign (nextafter (0.5, 0.0), op1) +- return (long)tmp; +- */ +- machine_mode mode = GET_MODE (op1); +- const struct real_format *fmt; +- REAL_VALUE_TYPE pred_half, half_minus_pred_half; +- rtx adj; +- +- /* load nextafter (0.5, 0.0) */ +- fmt = REAL_MODE_FORMAT (mode); +- real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode); +- real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half); + +- /* adj = copysign (0.5, op1) */ +- adj = force_reg (mode, const_double_from_real_value (pred_half, mode)); +- ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX); +- +- /* adj = op1 + adj */ +- adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT); +- +- /* op0 = (imode)adj */ +- expand_fix (op0, adj, 0); +-} +- +-/* Expand SSE2 sequence for computing lround from OPERAND1 storing +- into OPERAND0. */ +-void +-ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor) +-{ +- /* C code for the stuff we're doing below (for do_floor): +- xi = (long)op1; +- xi -= (double)xi > op1 ? 1 : 0; +- return xi; +- */ +- machine_mode fmode = GET_MODE (op1); +- machine_mode imode = GET_MODE (op0); +- rtx ireg, freg, tmp; +- rtx_code_label *label; +- +- /* reg = (long)op1 */ +- ireg = gen_reg_rtx (imode); +- expand_fix (ireg, op1, 0); +- +- /* freg = (double)reg */ +- freg = gen_reg_rtx (fmode); +- expand_float (freg, ireg, 0); +- +- /* ireg = (freg > op1) ? 
ireg - 1 : ireg */ +- label = ix86_expand_sse_compare_and_jump (UNLE, +- freg, op1, !do_floor); +- tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS, +- ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT); +- emit_move_insn (ireg, tmp); +- +- emit_label (label); +- LABEL_NUSES (label) = 1; +- +- emit_move_insn (op0, ireg); ++ return false; + } + +-/* Expand rint rounding OPERAND1 and storing the result in OPERAND0. */ +-void +-ix86_expand_rint (rtx operand0, rtx operand1) +-{ +- /* C code for the stuff we're doing below: +- xa = fabs (operand1); +- if (!isless (xa, 2**52)) +- return operand1; +- two52 = 2**52; +- if (flag_rounding_math) +- { +- two52 = copysign (two52, operand1); +- xa = operand1; +- } +- xa = xa + two52 - two52; +- return copysign (xa, operand1); +- */ +- machine_mode mode = GET_MODE (operand0); +- rtx res, xa, TWO52, two52, mask; +- rtx_code_label *label; +- +- res = gen_reg_rtx (mode); +- emit_move_insn (res, operand1); +- +- /* xa = abs (operand1) */ +- xa = ix86_expand_sse_fabs (res, &mask); +- +- /* if (!isless (xa, TWO52)) goto label; */ +- TWO52 = ix86_gen_TWO52 (mode); +- label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); +- +- two52 = TWO52; +- if (flag_rounding_math) +- { +- two52 = gen_reg_rtx (mode); +- ix86_sse_copysign_to_positive (two52, TWO52, res, mask); +- xa = res; +- } +- +- xa = expand_simple_binop (mode, PLUS, xa, two52, NULL_RTX, 0, OPTAB_DIRECT); +- xa = expand_simple_binop (mode, MINUS, xa, two52, xa, 0, OPTAB_DIRECT); +- +- ix86_sse_copysign_to_positive (res, xa, res, mask); +- +- emit_label (label); +- LABEL_NUSES (label) = 1; ++/* Handler for an SVML-style interface to ++ a library with vectorized intrinsics. */ + +- emit_move_insn (operand0, res); +-} ++tree ++ix86_veclibabi_svml (combined_fn fn, tree type_out, tree type_in) ++{ ++ char name[20]; ++ tree fntype, new_fndecl, args; ++ unsigned arity; ++ const char *bname; ++ machine_mode el_mode, in_mode; ++ int n, in_n; + +-/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing +- into OPERAND0. */ +-void +-ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor) +-{ +- /* C code for the stuff we expand below. +- double xa = fabs (x), x2; +- if (!isless (xa, TWO52)) +- return x; +- xa = xa + TWO52 - TWO52; +- x2 = copysign (xa, x); +- Compensate. Floor: +- if (x2 > x) +- x2 -= 1; +- Compensate. Ceil: +- if (x2 < x) +- x2 += 1; +- if (HONOR_SIGNED_ZEROS (mode)) +- x2 = copysign (x2, x); +- return x2; +- */ +- machine_mode mode = GET_MODE (operand0); +- rtx xa, TWO52, tmp, one, res, mask; +- rtx_code_label *label; ++ /* The SVML is suitable for unsafe math only. */ ++ if (!flag_unsafe_math_optimizations) ++ return NULL_TREE; + +- TWO52 = ix86_gen_TWO52 (mode); ++ el_mode = TYPE_MODE (TREE_TYPE (type_out)); ++ n = TYPE_VECTOR_SUBPARTS (type_out); ++ in_mode = TYPE_MODE (TREE_TYPE (type_in)); ++ in_n = TYPE_VECTOR_SUBPARTS (type_in); ++ if (el_mode != in_mode ++ || n != in_n) ++ return NULL_TREE; + +- /* Temporary for holding the result, initialized to the input +- operand to ease control flow. 
*/ +- res = gen_reg_rtx (mode); +- emit_move_insn (res, operand1); ++ switch (fn) ++ { ++ CASE_CFN_EXP: ++ CASE_CFN_LOG: ++ CASE_CFN_LOG10: ++ CASE_CFN_POW: ++ CASE_CFN_TANH: ++ CASE_CFN_TAN: ++ CASE_CFN_ATAN: ++ CASE_CFN_ATAN2: ++ CASE_CFN_ATANH: ++ CASE_CFN_CBRT: ++ CASE_CFN_SINH: ++ CASE_CFN_SIN: ++ CASE_CFN_ASINH: ++ CASE_CFN_ASIN: ++ CASE_CFN_COSH: ++ CASE_CFN_COS: ++ CASE_CFN_ACOSH: ++ CASE_CFN_ACOS: ++ if ((el_mode != DFmode || n != 2) ++ && (el_mode != SFmode || n != 4)) ++ return NULL_TREE; ++ break; + +- /* xa = abs (operand1) */ +- xa = ix86_expand_sse_fabs (res, &mask); ++ default: ++ return NULL_TREE; ++ } + +- /* if (!isless (xa, TWO52)) goto label; */ +- label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); ++ tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn); ++ bname = IDENTIFIER_POINTER (DECL_NAME (fndecl)); + +- /* xa = xa + TWO52 - TWO52; */ +- xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT); +- xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT); ++ if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOGF) ++ strcpy (name, "vmlsLn4"); ++ else if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOG) ++ strcpy (name, "vmldLn2"); ++ else if (n == 4) ++ { ++ sprintf (name, "vmls%s", bname+10); ++ name[strlen (name)-1] = '4'; ++ } ++ else ++ sprintf (name, "vmld%s2", bname+10); + +- /* xa = copysign (xa, operand1) */ +- ix86_sse_copysign_to_positive (xa, xa, res, mask); ++ /* Convert to uppercase. */ ++ name[4] &= ~0x20; + +- /* generate 1.0 */ +- one = force_reg (mode, const_double_from_real_value (dconst1, mode)); ++ arity = 0; ++ for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args)) ++ arity++; + +- /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */ +- tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor); +- emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp))); +- tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS, +- xa, tmp, NULL_RTX, 0, OPTAB_DIRECT); +- if (!do_floor && HONOR_SIGNED_ZEROS (mode)) +- ix86_sse_copysign_to_positive (tmp, tmp, res, mask); +- emit_move_insn (res, tmp); ++ if (arity == 1) ++ fntype = build_function_type_list (type_out, type_in, NULL); ++ else ++ fntype = build_function_type_list (type_out, type_in, type_in, NULL); + +- emit_label (label); +- LABEL_NUSES (label) = 1; ++ /* Build a function declaration for the vectorized function. */ ++ new_fndecl = build_decl (BUILTINS_LOCATION, ++ FUNCTION_DECL, get_identifier (name), fntype); ++ TREE_PUBLIC (new_fndecl) = 1; ++ DECL_EXTERNAL (new_fndecl) = 1; ++ DECL_IS_NOVOPS (new_fndecl) = 1; ++ TREE_READONLY (new_fndecl) = 1; + +- emit_move_insn (operand0, res); ++ return new_fndecl; + } + +-/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing +- into OPERAND0. */ +-void +-ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor) +-{ +- /* C code for the stuff we expand below. +- double xa = fabs (x), x2; +- if (!isless (xa, TWO52)) +- return x; +- x2 = (double)(long)x; +- Compensate. Floor: +- if (x2 > x) +- x2 -= 1; +- Compensate. Ceil: +- if (x2 < x) +- x2 += 1; +- if (HONOR_SIGNED_ZEROS (mode)) +- return copysign (x2, x); +- return x2; +- */ +- machine_mode mode = GET_MODE (operand0); +- rtx xa, xi, TWO52, tmp, one, res, mask; +- rtx_code_label *label; ++/* Handler for an ACML-style interface to ++ a library with vectorized intrinsics. 
*/ + +- TWO52 = ix86_gen_TWO52 (mode); ++tree ++ix86_veclibabi_acml (combined_fn fn, tree type_out, tree type_in) ++{ ++ char name[20] = "__vr.._"; ++ tree fntype, new_fndecl, args; ++ unsigned arity; ++ const char *bname; ++ machine_mode el_mode, in_mode; ++ int n, in_n; + +- /* Temporary for holding the result, initialized to the input +- operand to ease control flow. */ +- res = gen_reg_rtx (mode); +- emit_move_insn (res, operand1); ++ /* The ACML is 64bits only and suitable for unsafe math only as ++ it does not correctly support parts of IEEE with the required ++ precision such as denormals. */ ++ if (!TARGET_64BIT ++ || !flag_unsafe_math_optimizations) ++ return NULL_TREE; + +- /* xa = abs (operand1) */ +- xa = ix86_expand_sse_fabs (res, &mask); ++ el_mode = TYPE_MODE (TREE_TYPE (type_out)); ++ n = TYPE_VECTOR_SUBPARTS (type_out); ++ in_mode = TYPE_MODE (TREE_TYPE (type_in)); ++ in_n = TYPE_VECTOR_SUBPARTS (type_in); ++ if (el_mode != in_mode ++ || n != in_n) ++ return NULL_TREE; + +- /* if (!isless (xa, TWO52)) goto label; */ +- label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); ++ switch (fn) ++ { ++ CASE_CFN_SIN: ++ CASE_CFN_COS: ++ CASE_CFN_EXP: ++ CASE_CFN_LOG: ++ CASE_CFN_LOG2: ++ CASE_CFN_LOG10: ++ if (el_mode == DFmode && n == 2) ++ { ++ name[4] = 'd'; ++ name[5] = '2'; ++ } ++ else if (el_mode == SFmode && n == 4) ++ { ++ name[4] = 's'; ++ name[5] = '4'; ++ } ++ else ++ return NULL_TREE; ++ break; + +- /* xa = (double)(long)x */ +- xi = gen_reg_rtx (mode == DFmode ? DImode : SImode); +- expand_fix (xi, res, 0); +- expand_float (xa, xi, 0); ++ default: ++ return NULL_TREE; ++ } + +- /* generate 1.0 */ +- one = force_reg (mode, const_double_from_real_value (dconst1, mode)); ++ tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn); ++ bname = IDENTIFIER_POINTER (DECL_NAME (fndecl)); ++ sprintf (name + 7, "%s", bname+10); + +- /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */ +- tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor); +- emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp))); +- tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS, +- xa, tmp, NULL_RTX, 0, OPTAB_DIRECT); +- emit_move_insn (res, tmp); ++ arity = 0; ++ for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args)) ++ arity++; + +- if (HONOR_SIGNED_ZEROS (mode)) +- ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask); ++ if (arity == 1) ++ fntype = build_function_type_list (type_out, type_in, NULL); ++ else ++ fntype = build_function_type_list (type_out, type_in, type_in, NULL); + +- emit_label (label); +- LABEL_NUSES (label) = 1; ++ /* Build a function declaration for the vectorized function. */ ++ new_fndecl = build_decl (BUILTINS_LOCATION, ++ FUNCTION_DECL, get_identifier (name), fntype); ++ TREE_PUBLIC (new_fndecl) = 1; ++ DECL_EXTERNAL (new_fndecl) = 1; ++ DECL_IS_NOVOPS (new_fndecl) = 1; ++ TREE_READONLY (new_fndecl) = 1; + +- emit_move_insn (operand0, res); ++ return new_fndecl; + } + +-/* Expand SSE sequence for computing round from OPERAND1 storing +- into OPERAND0. Sequence that works without relying on DImode truncation +- via cvttsd2siq that is only available on 64bit targets. */ +-void +-ix86_expand_rounddf_32 (rtx operand0, rtx operand1) +-{ +- /* C code for the stuff we expand below. +- double xa = fabs (x), xa2, x2; +- if (!isless (xa, TWO52)) +- return x; +- Using the absolute value and copying back sign makes +- -0.0 -> -0.0 correct. +- xa2 = xa + TWO52 - TWO52; +- Compensate. 
+- dxa = xa2 - xa; +- if (dxa <= -0.5) +- xa2 += 1; +- else if (dxa > 0.5) +- xa2 -= 1; +- x2 = copysign (xa2, x); +- return x2; +- */ +- machine_mode mode = GET_MODE (operand0); +- rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask; +- rtx_code_label *label; +- +- TWO52 = ix86_gen_TWO52 (mode); +- +- /* Temporary for holding the result, initialized to the input +- operand to ease control flow. */ +- res = gen_reg_rtx (mode); +- emit_move_insn (res, operand1); +- +- /* xa = abs (operand1) */ +- xa = ix86_expand_sse_fabs (res, &mask); ++/* Returns a decl of a function that implements scatter store with ++ register type VECTYPE and index type INDEX_TYPE and SCALE. ++ Return NULL_TREE if it is not available. */ + +- /* if (!isless (xa, TWO52)) goto label; */ +- label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); ++static tree ++ix86_vectorize_builtin_scatter (const_tree vectype, ++ const_tree index_type, int scale) ++{ ++ bool si; ++ enum ix86_builtins code; + +- /* xa2 = xa + TWO52 - TWO52; */ +- xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT); +- xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT); ++ if (!TARGET_AVX512F) ++ return NULL_TREE; + +- /* dxa = xa2 - xa; */ +- dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT); ++ if ((TREE_CODE (index_type) != INTEGER_TYPE ++ && !POINTER_TYPE_P (index_type)) ++ || (TYPE_MODE (index_type) != SImode ++ && TYPE_MODE (index_type) != DImode)) ++ return NULL_TREE; + +- /* generate 0.5, 1.0 and -0.5 */ +- half = force_reg (mode, const_double_from_real_value (dconsthalf, mode)); +- one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT); +- mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX, +- 0, OPTAB_DIRECT); ++ if (TYPE_PRECISION (index_type) > POINTER_SIZE) ++ return NULL_TREE; + +- /* Compensate. */ +- /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */ +- tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false); +- emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one))); +- xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT); +- /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */ +- tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false); +- emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one))); +- xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT); ++ /* v*scatter* insn sign extends index to pointer mode. */ ++ if (TYPE_PRECISION (index_type) < POINTER_SIZE ++ && TYPE_UNSIGNED (index_type)) ++ return NULL_TREE; + +- /* res = copysign (xa2, operand1) */ +- ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask); ++ /* Scale can be 1, 2, 4 or 8. */ ++ if (scale <= 0 ++ || scale > 8 ++ || (scale & (scale - 1)) != 0) ++ return NULL_TREE; + +- emit_label (label); +- LABEL_NUSES (label) = 1; ++ si = TYPE_MODE (index_type) == SImode; ++ switch (TYPE_MODE (vectype)) ++ { ++ case E_V8DFmode: ++ code = si ? IX86_BUILTIN_SCATTERALTSIV8DF : IX86_BUILTIN_SCATTERDIV8DF; ++ break; ++ case E_V8DImode: ++ code = si ? IX86_BUILTIN_SCATTERALTSIV8DI : IX86_BUILTIN_SCATTERDIV8DI; ++ break; ++ case E_V16SFmode: ++ code = si ? IX86_BUILTIN_SCATTERSIV16SF : IX86_BUILTIN_SCATTERALTDIV16SF; ++ break; ++ case E_V16SImode: ++ code = si ? IX86_BUILTIN_SCATTERSIV16SI : IX86_BUILTIN_SCATTERALTDIV16SI; ++ break; ++ case E_V4DFmode: ++ if (TARGET_AVX512VL) ++ code = si ? 
IX86_BUILTIN_SCATTERALTSIV4DF : IX86_BUILTIN_SCATTERDIV4DF; ++ else ++ return NULL_TREE; ++ break; ++ case E_V4DImode: ++ if (TARGET_AVX512VL) ++ code = si ? IX86_BUILTIN_SCATTERALTSIV4DI : IX86_BUILTIN_SCATTERDIV4DI; ++ else ++ return NULL_TREE; ++ break; ++ case E_V8SFmode: ++ if (TARGET_AVX512VL) ++ code = si ? IX86_BUILTIN_SCATTERSIV8SF : IX86_BUILTIN_SCATTERALTDIV8SF; ++ else ++ return NULL_TREE; ++ break; ++ case E_V8SImode: ++ if (TARGET_AVX512VL) ++ code = si ? IX86_BUILTIN_SCATTERSIV8SI : IX86_BUILTIN_SCATTERALTDIV8SI; ++ else ++ return NULL_TREE; ++ break; ++ case E_V2DFmode: ++ if (TARGET_AVX512VL) ++ code = si ? IX86_BUILTIN_SCATTERALTSIV2DF : IX86_BUILTIN_SCATTERDIV2DF; ++ else ++ return NULL_TREE; ++ break; ++ case E_V2DImode: ++ if (TARGET_AVX512VL) ++ code = si ? IX86_BUILTIN_SCATTERALTSIV2DI : IX86_BUILTIN_SCATTERDIV2DI; ++ else ++ return NULL_TREE; ++ break; ++ case E_V4SFmode: ++ if (TARGET_AVX512VL) ++ code = si ? IX86_BUILTIN_SCATTERSIV4SF : IX86_BUILTIN_SCATTERALTDIV4SF; ++ else ++ return NULL_TREE; ++ break; ++ case E_V4SImode: ++ if (TARGET_AVX512VL) ++ code = si ? IX86_BUILTIN_SCATTERSIV4SI : IX86_BUILTIN_SCATTERALTDIV4SI; ++ else ++ return NULL_TREE; ++ break; ++ default: ++ return NULL_TREE; ++ } + +- emit_move_insn (operand0, res); ++ return get_ix86_builtin (code); + } + +-/* Expand SSE sequence for computing trunc from OPERAND1 storing +- into OPERAND0. */ +-void +-ix86_expand_trunc (rtx operand0, rtx operand1) +-{ +- /* C code for SSE variant we expand below. +- double xa = fabs (x), x2; +- if (!isless (xa, TWO52)) +- return x; +- x2 = (double)(long)x; +- if (HONOR_SIGNED_ZEROS (mode)) +- return copysign (x2, x); +- return x2; +- */ +- machine_mode mode = GET_MODE (operand0); +- rtx xa, xi, TWO52, res, mask; +- rtx_code_label *label; ++/* Return true if it is safe to use the rsqrt optabs to optimize ++ 1.0/sqrt. */ + +- TWO52 = ix86_gen_TWO52 (mode); ++static bool ++use_rsqrt_p () ++{ ++ return (TARGET_SSE && TARGET_SSE_MATH ++ && flag_finite_math_only ++ && !flag_trapping_math ++ && flag_unsafe_math_optimizations); ++} ++ ++/* Helper for avx_vpermilps256_operand et al. This is also used by ++ the expansion functions to turn the parallel back into a mask. ++ The return value is 0 for no match and the imm8+1 for a match. */ + +- /* Temporary for holding the result, initialized to the input +- operand to ease control flow. */ +- res = gen_reg_rtx (mode); +- emit_move_insn (res, operand1); ++int ++avx_vpermilp_parallel (rtx par, machine_mode mode) ++{ ++ unsigned i, nelt = GET_MODE_NUNITS (mode); ++ unsigned mask = 0; ++ unsigned char ipar[16] = {}; /* Silence -Wuninitialized warning. */ + +- /* xa = abs (operand1) */ +- xa = ix86_expand_sse_fabs (res, &mask); ++ if (XVECLEN (par, 0) != (int) nelt) ++ return 0; + +- /* if (!isless (xa, TWO52)) goto label; */ +- label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); ++ /* Validate that all of the elements are constants, and not totally ++ out of range. Copy the data into an integral array to make the ++ subsequent checks easier. */ ++ for (i = 0; i < nelt; ++i) ++ { ++ rtx er = XVECEXP (par, 0, i); ++ unsigned HOST_WIDE_INT ei; + +- /* x = (double)(long)x */ +- xi = gen_reg_rtx (mode == DFmode ? 
DImode : SImode); +- expand_fix (xi, res, 0); +- expand_float (res, xi, 0); ++ if (!CONST_INT_P (er)) ++ return 0; ++ ei = INTVAL (er); ++ if (ei >= nelt) ++ return 0; ++ ipar[i] = ei; ++ } + +- if (HONOR_SIGNED_ZEROS (mode)) +- ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask); ++ switch (mode) ++ { ++ case E_V8DFmode: ++ /* In the 512-bit DFmode case, we can only move elements within ++ a 128-bit lane. First fill the second part of the mask, ++ then fallthru. */ ++ for (i = 4; i < 6; ++i) ++ { ++ if (ipar[i] < 4 || ipar[i] >= 6) ++ return 0; ++ mask |= (ipar[i] - 4) << i; ++ } ++ for (i = 6; i < 8; ++i) ++ { ++ if (ipar[i] < 6) ++ return 0; ++ mask |= (ipar[i] - 6) << i; ++ } ++ /* FALLTHRU */ + +- emit_label (label); +- LABEL_NUSES (label) = 1; ++ case E_V4DFmode: ++ /* In the 256-bit DFmode case, we can only move elements within ++ a 128-bit lane. */ ++ for (i = 0; i < 2; ++i) ++ { ++ if (ipar[i] >= 2) ++ return 0; ++ mask |= ipar[i] << i; ++ } ++ for (i = 2; i < 4; ++i) ++ { ++ if (ipar[i] < 2) ++ return 0; ++ mask |= (ipar[i] - 2) << i; ++ } ++ break; + +- emit_move_insn (operand0, res); +-} ++ case E_V16SFmode: ++ /* In 512 bit SFmode case, permutation in the upper 256 bits ++ must mirror the permutation in the lower 256-bits. */ ++ for (i = 0; i < 8; ++i) ++ if (ipar[i] + 8 != ipar[i + 8]) ++ return 0; ++ /* FALLTHRU */ + +-/* Expand SSE sequence for computing trunc from OPERAND1 storing +- into OPERAND0. */ +-void +-ix86_expand_truncdf_32 (rtx operand0, rtx operand1) +-{ +- machine_mode mode = GET_MODE (operand0); +- rtx xa, mask, TWO52, one, res, smask, tmp; +- rtx_code_label *label; ++ case E_V8SFmode: ++ /* In 256 bit SFmode case, we have full freedom of ++ movement within the low 128-bit lane, but the high 128-bit ++ lane must mirror the exact same pattern. */ ++ for (i = 0; i < 4; ++i) ++ if (ipar[i] + 4 != ipar[i + 4]) ++ return 0; ++ nelt = 4; ++ /* FALLTHRU */ + +- /* C code for SSE variant we expand below. +- double xa = fabs (x), x2; +- if (!isless (xa, TWO52)) +- return x; +- xa2 = xa + TWO52 - TWO52; +- Compensate: +- if (xa2 > xa) +- xa2 -= 1.0; +- x2 = copysign (xa2, x); +- return x2; +- */ ++ case E_V2DFmode: ++ case E_V4SFmode: ++ /* In the 128-bit case, we've full freedom in the placement of ++ the elements from the source operand. */ ++ for (i = 0; i < nelt; ++i) ++ mask |= ipar[i] << (i * (nelt / 2)); ++ break; + +- TWO52 = ix86_gen_TWO52 (mode); ++ default: ++ gcc_unreachable (); ++ } + +- /* Temporary for holding the result, initialized to the input +- operand to ease control flow. */ +- res = gen_reg_rtx (mode); +- emit_move_insn (res, operand1); ++ /* Make sure success has a non-zero value by adding one. */ ++ return mask + 1; ++} + +- /* xa = abs (operand1) */ +- xa = ix86_expand_sse_fabs (res, &smask); ++/* Helper for avx_vperm2f128_v4df_operand et al. This is also used by ++ the expansion functions to turn the parallel back into a mask. ++ The return value is 0 for no match and the imm8+1 for a match. */ + +- /* if (!isless (xa, TWO52)) goto label; */ +- label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); ++int ++avx_vperm2f128_parallel (rtx par, machine_mode mode) ++{ ++ unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2; ++ unsigned mask = 0; ++ unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. 
*/ + +- /* res = xa + TWO52 - TWO52; */ +- tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT); +- tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT); +- emit_move_insn (res, tmp); ++ if (XVECLEN (par, 0) != (int) nelt) ++ return 0; + +- /* generate 1.0 */ +- one = force_reg (mode, const_double_from_real_value (dconst1, mode)); ++ /* Validate that all of the elements are constants, and not totally ++ out of range. Copy the data into an integral array to make the ++ subsequent checks easier. */ ++ for (i = 0; i < nelt; ++i) ++ { ++ rtx er = XVECEXP (par, 0, i); ++ unsigned HOST_WIDE_INT ei; + +- /* Compensate: res = xa2 - (res > xa ? 1 : 0) */ +- mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false); +- emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one))); +- tmp = expand_simple_binop (mode, MINUS, +- res, mask, NULL_RTX, 0, OPTAB_DIRECT); +- emit_move_insn (res, tmp); ++ if (!CONST_INT_P (er)) ++ return 0; ++ ei = INTVAL (er); ++ if (ei >= 2 * nelt) ++ return 0; ++ ipar[i] = ei; ++ } + +- /* res = copysign (res, operand1) */ +- ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask); ++ /* Validate that the halves of the permute are halves. */ ++ for (i = 0; i < nelt2 - 1; ++i) ++ if (ipar[i] + 1 != ipar[i + 1]) ++ return 0; ++ for (i = nelt2; i < nelt - 1; ++i) ++ if (ipar[i] + 1 != ipar[i + 1]) ++ return 0; + +- emit_label (label); +- LABEL_NUSES (label) = 1; ++ /* Reconstruct the mask. */ ++ for (i = 0; i < 2; ++i) ++ { ++ unsigned e = ipar[i * nelt2]; ++ if (e % nelt2) ++ return 0; ++ e /= nelt2; ++ mask |= e << (i * 4); ++ } + +- emit_move_insn (operand0, res); ++ /* Make sure success has a non-zero value by adding one. */ ++ return mask + 1; ++} ++ ++/* Return a register priority for hard reg REGNO. */ ++static int ++ix86_register_priority (int hard_regno) ++{ ++ /* ebp and r13 as the base always wants a displacement, r12 as the ++ base always wants an index. So discourage their usage in an ++ address. */ ++ if (hard_regno == R12_REG || hard_regno == R13_REG) ++ return 0; ++ if (hard_regno == BP_REG) ++ return 1; ++ /* New x86-64 int registers result in bigger code size. Discourage ++ them. */ ++ if (IN_RANGE (hard_regno, FIRST_REX_INT_REG, LAST_REX_INT_REG)) ++ return 2; ++ /* New x86-64 SSE registers result in bigger code size. Discourage ++ them. */ ++ if (IN_RANGE (hard_regno, FIRST_REX_SSE_REG, LAST_REX_SSE_REG)) ++ return 2; ++ if (IN_RANGE (hard_regno, FIRST_EXT_REX_SSE_REG, LAST_EXT_REX_SSE_REG)) ++ return 1; ++ /* Usage of AX register results in smaller code. Prefer it. */ ++ if (hard_regno == AX_REG) ++ return 4; ++ return 3; + } + +-/* Expand SSE sequence for computing round from OPERAND1 storing +- into OPERAND0. */ +-void +-ix86_expand_round (rtx operand0, rtx operand1) +-{ +- /* C code for the stuff we're doing below: +- double xa = fabs (x); +- if (!isless (xa, TWO52)) +- return x; +- xa = (double)(long)(xa + nextafter (0.5, 0.0)); +- return copysign (xa, x); +- */ +- machine_mode mode = GET_MODE (operand0); +- rtx res, TWO52, xa, xi, half, mask; +- rtx_code_label *label; +- const struct real_format *fmt; +- REAL_VALUE_TYPE pred_half, half_minus_pred_half; ++/* Implement TARGET_PREFERRED_RELOAD_CLASS. ++ ++ Put float CONST_DOUBLE in the constant pool instead of fp regs. ++ QImode must go into class Q_REGS. ++ Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and ++ movdf to do mem-to-mem moves through integer regs. 
*/ + +- /* Temporary for holding the result, initialized to the input +- operand to ease control flow. */ +- res = gen_reg_rtx (mode); +- emit_move_insn (res, operand1); ++static reg_class_t ++ix86_preferred_reload_class (rtx x, reg_class_t regclass) ++{ ++ machine_mode mode = GET_MODE (x); + +- TWO52 = ix86_gen_TWO52 (mode); +- xa = ix86_expand_sse_fabs (res, &mask); +- label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); ++ /* We're only allowed to return a subclass of CLASS. Many of the ++ following checks fail for NO_REGS, so eliminate that early. */ ++ if (regclass == NO_REGS) ++ return NO_REGS; + +- /* load nextafter (0.5, 0.0) */ +- fmt = REAL_MODE_FORMAT (mode); +- real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode); +- real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half); ++ /* All classes can load zeros. */ ++ if (x == CONST0_RTX (mode)) ++ return regclass; + +- /* xa = xa + 0.5 */ +- half = force_reg (mode, const_double_from_real_value (pred_half, mode)); +- xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT); ++ /* Force constants into memory if we are loading a (nonzero) constant into ++ an MMX, SSE or MASK register. This is because there are no MMX/SSE/MASK ++ instructions to load from a constant. */ ++ if (CONSTANT_P (x) ++ && (MAYBE_MMX_CLASS_P (regclass) ++ || MAYBE_SSE_CLASS_P (regclass) ++ || MAYBE_MASK_CLASS_P (regclass))) ++ return NO_REGS; + +- /* xa = (double)(int64_t)xa */ +- xi = gen_reg_rtx (mode == DFmode ? DImode : SImode); +- expand_fix (xi, xa, 0); +- expand_float (xa, xi, 0); ++ /* Floating-point constants need more complex checks. */ ++ if (CONST_DOUBLE_P (x)) ++ { ++ /* General regs can load everything. */ ++ if (INTEGER_CLASS_P (regclass)) ++ return regclass; + +- /* res = copysign (xa, operand1) */ +- ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask); ++ /* Floats can load 0 and 1 plus some others. Note that we eliminated ++ zero above. We only want to wind up preferring 80387 registers if ++ we plan on doing computation with them. */ ++ if (IS_STACK_MODE (mode) ++ && standard_80387_constant_p (x) > 0) ++ { ++ /* Limit class to FP regs. */ ++ if (FLOAT_CLASS_P (regclass)) ++ return FLOAT_REGS; ++ } + +- emit_label (label); +- LABEL_NUSES (label) = 1; ++ return NO_REGS; ++ } + +- emit_move_insn (operand0, res); +-} ++ /* Prefer SSE regs only, if we can use them for math. */ ++ if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) ++ return SSE_CLASS_P (regclass) ? regclass : NO_REGS; + +-/* Expand SSE sequence for computing round +- from OP1 storing into OP0 using sse4 round insn. */ +-void +-ix86_expand_round_sse4 (rtx op0, rtx op1) +-{ +- machine_mode mode = GET_MODE (op0); +- rtx e1, e2, res, half; +- const struct real_format *fmt; +- REAL_VALUE_TYPE pred_half, half_minus_pred_half; +- rtx (*gen_copysign) (rtx, rtx, rtx); +- rtx (*gen_round) (rtx, rtx, rtx); ++ /* Generally when we see PLUS here, it's the function invariant ++ (plus soft-fp const_int). Which can only be computed into general ++ regs. */ ++ if (GET_CODE (x) == PLUS) ++ return INTEGER_CLASS_P (regclass) ? regclass : NO_REGS; + +- switch (mode) ++ /* QImode constants are easy to load, but non-constant QImode data ++ must go into Q_REGS. 
*/ ++ if (GET_MODE (x) == QImode && !CONSTANT_P (x)) + { +- case E_SFmode: +- gen_copysign = gen_copysignsf3; +- gen_round = gen_sse4_1_roundsf2; +- break; +- case E_DFmode: +- gen_copysign = gen_copysigndf3; +- gen_round = gen_sse4_1_rounddf2; +- break; +- default: +- gcc_unreachable (); ++ if (Q_CLASS_P (regclass)) ++ return regclass; ++ else if (reg_class_subset_p (Q_REGS, regclass)) ++ return Q_REGS; ++ else ++ return NO_REGS; + } + +- /* round (a) = trunc (a + copysign (0.5, a)) */ +- +- /* load nextafter (0.5, 0.0) */ +- fmt = REAL_MODE_FORMAT (mode); +- real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode); +- real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half); +- half = const_double_from_real_value (pred_half, mode); ++ return regclass; ++} + +- /* e1 = copysign (0.5, op1) */ +- e1 = gen_reg_rtx (mode); +- emit_insn (gen_copysign (e1, half, op1)); ++/* Discourage putting floating-point values in SSE registers unless ++ SSE math is being used, and likewise for the 387 registers. */ ++static reg_class_t ++ix86_preferred_output_reload_class (rtx x, reg_class_t regclass) ++{ ++ machine_mode mode = GET_MODE (x); + +- /* e2 = op1 + e1 */ +- e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT); ++ /* Restrict the output reload class to the register bank that we are doing ++ math on. If we would like not to return a subset of CLASS, reject this ++ alternative: if reload cannot do this, it will still use its choice. */ ++ mode = GET_MODE (x); ++ if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) ++ return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS; + +- /* res = trunc (e2) */ +- res = gen_reg_rtx (mode); +- emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC))); ++ if (IS_STACK_MODE (mode)) ++ return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS; + +- emit_move_insn (op0, res); ++ return regclass; + } + +-/* Handle fentry_name / fentry_section attribute. */ +- +-static tree +-ix86_handle_fentry_name (tree *node, tree name, tree args, +- int, bool *no_add_attrs) ++static reg_class_t ++ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass, ++ machine_mode mode, secondary_reload_info *sri) + { +- if (TREE_CODE (*node) == FUNCTION_DECL +- && TREE_CODE (TREE_VALUE (args)) == STRING_CST) +- /* Do nothing else, just set the attribute. We'll get at +- it later with lookup_attribute. */ +- ; +- else ++ /* Double-word spills from general registers to non-offsettable memory ++ references (zero-extended addresses) require special handling. */ ++ if (TARGET_64BIT ++ && MEM_P (x) ++ && GET_MODE_SIZE (mode) > UNITS_PER_WORD ++ && INTEGER_CLASS_P (rclass) ++ && !offsettable_memref_p (x)) + { +- warning (OPT_Wattributes, "%qE attribute ignored", name); +- *no_add_attrs = true; +- } +- +- return NULL_TREE; +-} +- +- +-/* Table of valid machine attributes. */ +-static const struct attribute_spec ix86_attribute_table[] = +-{ +- /* { name, min_len, max_len, decl_req, type_req, fn_type_req, +- affects_type_identity, handler, exclude } */ +- /* Stdcall attribute says callee is responsible for popping arguments +- if they are not variable. */ +- { "stdcall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute, +- NULL }, +- /* Fastcall attribute says callee is responsible for popping arguments +- if they are not variable. */ +- { "fastcall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute, +- NULL }, +- /* Thiscall attribute says callee is responsible for popping arguments +- if they are not variable. 
*/ +- { "thiscall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute, +- NULL }, +- /* Cdecl attribute says the callee is a normal C declaration */ +- { "cdecl", 0, 0, false, true, true, true, ix86_handle_cconv_attribute, +- NULL }, +- /* Regparm attribute specifies how many integer arguments are to be +- passed in registers. */ +- { "regparm", 1, 1, false, true, true, true, ix86_handle_cconv_attribute, +- NULL }, +- /* Sseregparm attribute says we are using x86_64 calling conventions +- for FP arguments. */ +- { "sseregparm", 0, 0, false, true, true, true, ix86_handle_cconv_attribute, +- NULL }, +- /* The transactional memory builtins are implicitly regparm or fastcall +- depending on the ABI. Override the generic do-nothing attribute that +- these builtins were declared with. */ +- { "*tm regparm", 0, 0, false, true, true, true, +- ix86_handle_tm_regparm_attribute, NULL }, +- /* force_align_arg_pointer says this function realigns the stack at entry. */ +- { (const char *)&ix86_force_align_arg_pointer_string, 0, 0, +- false, true, true, false, ix86_handle_force_align_arg_pointer_attribute, +- NULL }, +-#if TARGET_DLLIMPORT_DECL_ATTRIBUTES +- { "dllimport", 0, 0, false, false, false, false, handle_dll_attribute, +- NULL }, +- { "dllexport", 0, 0, false, false, false, false, handle_dll_attribute, +- NULL }, +- { "shared", 0, 0, true, false, false, false, +- ix86_handle_shared_attribute, NULL }, +-#endif +- { "ms_struct", 0, 0, false, false, false, false, +- ix86_handle_struct_attribute, NULL }, +- { "gcc_struct", 0, 0, false, false, false, false, +- ix86_handle_struct_attribute, NULL }, +-#ifdef SUBTARGET_ATTRIBUTE_TABLE +- SUBTARGET_ATTRIBUTE_TABLE, +-#endif +- /* ms_abi and sysv_abi calling convention function attributes. */ +- { "ms_abi", 0, 0, false, true, true, true, ix86_handle_abi_attribute, NULL }, +- { "sysv_abi", 0, 0, false, true, true, true, ix86_handle_abi_attribute, +- NULL }, +- { "ms_abi va_list", 0, 0, false, false, false, false, NULL, NULL }, +- { "sysv_abi va_list", 0, 0, false, false, false, false, NULL, NULL }, +- { "ms_hook_prologue", 0, 0, true, false, false, false, +- ix86_handle_fndecl_attribute, NULL }, +- { "callee_pop_aggregate_return", 1, 1, false, true, true, true, +- ix86_handle_callee_pop_aggregate_return, NULL }, +- { "interrupt", 0, 0, false, true, true, false, +- ix86_handle_interrupt_attribute, NULL }, +- { "no_caller_saved_registers", 0, 0, false, true, true, false, +- ix86_handle_no_caller_saved_registers_attribute, NULL }, +- { "naked", 0, 0, true, false, false, false, +- ix86_handle_fndecl_attribute, NULL }, +- { "indirect_branch", 1, 1, true, false, false, false, +- ix86_handle_fndecl_attribute, NULL }, +- { "function_return", 1, 1, true, false, false, false, +- ix86_handle_fndecl_attribute, NULL }, +- { "indirect_return", 0, 0, false, true, true, false, +- NULL, NULL }, +- { "fentry_name", 1, 1, true, false, false, false, +- ix86_handle_fentry_name, NULL }, +- { "fentry_section", 1, 1, true, false, false, false, +- ix86_handle_fentry_name, NULL }, +- { "cf_check", 0, 0, true, false, false, false, +- ix86_handle_fndecl_attribute, NULL }, +- +- /* End element. */ +- { NULL, 0, 0, false, false, false, false, NULL, NULL } +-}; ++ sri->icode = (in_p ++ ? CODE_FOR_reload_noff_load ++ : CODE_FOR_reload_noff_store); ++ /* Add the cost of moving address to a temporary. */ ++ sri->extra_cost = 1; + +-/* Implement targetm.vectorize.builtin_vectorization_cost. 
*/ +-static int +-ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, +- tree vectype, int) +-{ +- bool fp = false; +- machine_mode mode = TImode; +- int index; +- if (vectype != NULL) +- { +- fp = FLOAT_TYPE_P (vectype); +- mode = TYPE_MODE (vectype); ++ return NO_REGS; + } + +- switch (type_of_cost) ++ /* QImode spills from non-QI registers require ++ intermediate register on 32bit targets. */ ++ if (mode == QImode ++ && ((!TARGET_64BIT && !in_p ++ && INTEGER_CLASS_P (rclass) ++ && MAYBE_NON_Q_CLASS_P (rclass)) ++ || (!TARGET_AVX512DQ ++ && MAYBE_MASK_CLASS_P (rclass)))) + { +- case scalar_stmt: +- return fp ? ix86_cost->addss : COSTS_N_INSNS (1); ++ int regno = true_regnum (x); + +- case scalar_load: +- /* load/store costs are relative to register move which is 2. Recompute +- it to COSTS_N_INSNS so everything have same base. */ +- return COSTS_N_INSNS (fp ? ix86_cost->sse_load[0] +- : ix86_cost->int_load [2]) / 2; ++ /* Return Q_REGS if the operand is in memory. */ ++ if (regno == -1) ++ return Q_REGS; + +- case scalar_store: +- return COSTS_N_INSNS (fp ? ix86_cost->sse_store[0] +- : ix86_cost->int_store [2]) / 2; ++ return NO_REGS; ++ } + +- case vector_stmt: +- return ix86_vec_cost (mode, +- fp ? ix86_cost->addss : ix86_cost->sse_op); ++ /* This condition handles corner case where an expression involving ++ pointers gets vectorized. We're trying to use the address of a ++ stack slot as a vector initializer. + +- case vector_load: +- index = sse_store_index (mode); +- /* See PR82713 - we may end up being called on non-vector type. */ +- if (index < 0) +- index = 2; +- return COSTS_N_INSNS (ix86_cost->sse_load[index]) / 2; ++ (set (reg:V2DI 74 [ vect_cst_.2 ]) ++ (vec_duplicate:V2DI (reg/f:DI 20 frame))) + +- case vector_store: +- index = sse_store_index (mode); +- /* See PR82713 - we may end up being called on non-vector type. */ +- if (index < 0) +- index = 2; +- return COSTS_N_INSNS (ix86_cost->sse_store[index]) / 2; ++ Eventually frame gets turned into sp+offset like this: + +- case vec_to_scalar: +- case scalar_to_vec: +- return ix86_vec_cost (mode, ix86_cost->sse_op); ++ (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74]) ++ (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp) ++ (const_int 392 [0x188])))) + +- /* We should have separate costs for unaligned loads and gather/scatter. +- Do that incrementally. */ +- case unaligned_load: +- index = sse_store_index (mode); +- /* See PR82713 - we may end up being called on non-vector type. */ +- if (index < 0) +- index = 2; +- return COSTS_N_INSNS (ix86_cost->sse_unaligned_load[index]) / 2; ++ That later gets turned into: + +- case unaligned_store: +- index = sse_store_index (mode); +- /* See PR82713 - we may end up being called on non-vector type. 
*/ +- if (index < 0) +- index = 2; +- return COSTS_N_INSNS (ix86_cost->sse_unaligned_store[index]) / 2; ++ (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74]) ++ (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp) ++ (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64])))) + +- case vector_gather_load: +- return ix86_vec_cost (mode, +- COSTS_N_INSNS +- (ix86_cost->gather_static +- + ix86_cost->gather_per_elt +- * TYPE_VECTOR_SUBPARTS (vectype)) / 2); ++ We'll have the following reload recorded: + +- case vector_scatter_store: +- return ix86_vec_cost (mode, +- COSTS_N_INSNS +- (ix86_cost->scatter_static +- + ix86_cost->scatter_per_elt +- * TYPE_VECTOR_SUBPARTS (vectype)) / 2); ++ Reload 0: reload_in (DI) = ++ (plus:DI (reg/f:DI 7 sp) ++ (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64])) ++ reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74]) ++ SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine ++ reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188])) ++ reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74]) ++ reload_reg_rtx: (reg:V2DI 22 xmm1) + +- case cond_branch_taken: +- return ix86_cost->cond_taken_branch_cost; ++ Which isn't going to work since SSE instructions can't handle scalar ++ additions. Returning GENERAL_REGS forces the addition into integer ++ register and reload can handle subsequent reloads without problems. */ + +- case cond_branch_not_taken: +- return ix86_cost->cond_not_taken_branch_cost; ++ if (in_p && GET_CODE (x) == PLUS ++ && SSE_CLASS_P (rclass) ++ && SCALAR_INT_MODE_P (mode)) ++ return GENERAL_REGS; + +- case vec_perm: +- case vec_promote_demote: +- return ix86_vec_cost (mode, ix86_cost->sse_op); ++ return NO_REGS; ++} + +- case vec_construct: +- { +- /* N element inserts into SSE vectors. */ +- int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op; +- /* One vinserti128 for combining two SSE vectors for AVX256. */ +- if (GET_MODE_BITSIZE (mode) == 256) +- cost += ix86_vec_cost (mode, ix86_cost->addss); +- /* One vinserti64x4 and two vinserti128 for combining SSE +- and AVX256 vectors to AVX512. */ +- else if (GET_MODE_BITSIZE (mode) == 512) +- cost += 3 * ix86_vec_cost (mode, ix86_cost->addss); +- return cost; +- } ++/* Implement TARGET_CLASS_LIKELY_SPILLED_P. */ ++ ++static bool ++ix86_class_likely_spilled_p (reg_class_t rclass) ++{ ++ switch (rclass) ++ { ++ case AREG: ++ case DREG: ++ case CREG: ++ case BREG: ++ case AD_REGS: ++ case SIREG: ++ case DIREG: ++ case SSE_FIRST_REG: ++ case FP_TOP_REG: ++ case FP_SECOND_REG: ++ return true; + + default: +- gcc_unreachable (); ++ break; + } ++ ++ return false; + } + +-/* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel []))) +- insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh +- insn every time. */ ++/* If we are copying between registers from different register sets ++ (e.g. FP and integer), we may need a memory location. ++ ++ The function can't work reliably when one of the CLASSES is a class ++ containing registers from multiple sets. We avoid this by never combining ++ different sets in a single alternative in the machine description. ++ Ensure that this constraint holds to avoid unexpected surprises. + +-static GTY(()) rtx_insn *vselect_insn; ++ When STRICT is false, we are being called from REGISTER_MOVE_COST, ++ so do not enforce these sanity checks. + +-/* Initialize vselect_insn. */ ++ To optimize register_move_cost performance, define inline variant. 
*/ + +-static void +-init_vselect_insn (void) ++static inline bool ++inline_secondary_memory_needed (machine_mode mode, reg_class_t class1, ++ reg_class_t class2, int strict) + { +- unsigned i; +- rtx x; ++ if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS)) ++ return false; + +- x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN)); +- for (i = 0; i < MAX_VECT_LEN; ++i) +- XVECEXP (x, 0, i) = const0_rtx; +- x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx, +- const0_rtx), x); +- x = gen_rtx_SET (const0_rtx, x); +- start_sequence (); +- vselect_insn = emit_insn (x); +- end_sequence (); +-} ++ if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1) ++ || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2) ++ || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1) ++ || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2) ++ || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1) ++ || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2) ++ || MAYBE_MASK_CLASS_P (class1) != MASK_CLASS_P (class1) ++ || MAYBE_MASK_CLASS_P (class2) != MASK_CLASS_P (class2)) ++ { ++ gcc_assert (!strict || lra_in_progress); ++ return true; ++ } + +-/* Construct (set target (vec_select op0 (parallel perm))) and +- return true if that's a valid instruction in the active ISA. */ ++ if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2)) ++ return true; + +-static bool +-expand_vselect (rtx target, rtx op0, const unsigned char *perm, +- unsigned nelt, bool testing_p) +-{ +- unsigned int i; +- rtx x, save_vconcat; +- int icode; ++ /* Between mask and general, we have moves no larger than word size. */ ++ if ((MASK_CLASS_P (class1) != MASK_CLASS_P (class2)) ++ && (GET_MODE_SIZE (mode) > UNITS_PER_WORD)) ++ return true; + +- if (vselect_insn == NULL_RTX) +- init_vselect_insn (); ++ /* ??? This is a lie. We do have moves between mmx/general, and for ++ mmx/sse2. But by saying we need secondary memory we discourage the ++ register allocator from using the mmx registers unless needed. */ ++ if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)) ++ return true; + +- x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1); +- PUT_NUM_ELEM (XVEC (x, 0), nelt); +- for (i = 0; i < nelt; ++i) +- XVECEXP (x, 0, i) = GEN_INT (perm[i]); +- save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0); +- XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0; +- PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target)); +- SET_DEST (PATTERN (vselect_insn)) = target; +- icode = recog_memoized (vselect_insn); ++ if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2)) ++ { ++ /* SSE1 doesn't have any direct moves from other classes. */ ++ if (!TARGET_SSE2) ++ return true; + +- if (icode >= 0 && !testing_p) +- emit_insn (copy_rtx (PATTERN (vselect_insn))); ++ /* If the target says that inter-unit moves are more expensive ++ than moving through memory, then don't generate them. */ ++ if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC) ++ || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC)) ++ return true; + +- SET_DEST (PATTERN (vselect_insn)) = const0_rtx; +- XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat; +- INSN_CODE (vselect_insn) = -1; ++ /* Between SSE and general, we have moves no larger than word size. */ ++ if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) ++ return true; ++ } + +- return icode >= 0; ++ return false; + } + +-/* Similar, but generate a vec_concat from op0 and op1 as well. */ ++/* Implement TARGET_SECONDARY_MEMORY_NEEDED. 
*/ + + static bool +-expand_vselect_vconcat (rtx target, rtx op0, rtx op1, +- const unsigned char *perm, unsigned nelt, +- bool testing_p) ++ix86_secondary_memory_needed (machine_mode mode, reg_class_t class1, ++ reg_class_t class2) + { +- machine_mode v2mode; +- rtx x; +- bool ok; +- +- if (vselect_insn == NULL_RTX) +- init_vselect_insn (); ++ return inline_secondary_memory_needed (mode, class1, class2, true); ++} + +- if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode)) +- return false; +- x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0); +- PUT_MODE (x, v2mode); +- XEXP (x, 0) = op0; +- XEXP (x, 1) = op1; +- ok = expand_vselect (target, x, perm, nelt, testing_p); +- XEXP (x, 0) = const0_rtx; +- XEXP (x, 1) = const0_rtx; +- return ok; +-} +- +-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D +- using movss or movsd. */ +-static bool +-expand_vec_perm_movs (struct expand_vec_perm_d *d) +-{ +- machine_mode vmode = d->vmode; +- unsigned i, nelt = d->nelt; +- rtx x; ++/* Implement TARGET_SECONDARY_MEMORY_NEEDED_MODE. + +- if (d->one_operand_p) +- return false; ++ get_secondary_mem widens integral modes to BITS_PER_WORD. ++ There is no need to emit full 64 bit move on 64 bit targets ++ for integral modes that can be moved using 32 bit move. */ + +- if (!(TARGET_SSE && vmode == V4SFmode) +- && !(TARGET_SSE2 && vmode == V2DFmode)) +- return false; ++static machine_mode ++ix86_secondary_memory_needed_mode (machine_mode mode) ++{ ++ if (GET_MODE_BITSIZE (mode) < 32 && INTEGRAL_MODE_P (mode)) ++ return mode_for_size (32, GET_MODE_CLASS (mode), 0).require (); ++ return mode; ++} + +- /* Only the first element is changed. */ +- if (d->perm[0] != nelt && d->perm[0] != 0) +- return false; +- for (i = 1; i < nelt; ++i) +- if (d->perm[i] != i + nelt - d->perm[0]) +- return false; ++/* Implement the TARGET_CLASS_MAX_NREGS hook. + +- if (d->testing_p) +- return true; ++ On the 80386, this is the size of MODE in words, ++ except in the FP regs, where a single reg is always enough. */ + +- if (d->perm[0] == nelt) +- x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1)); ++static unsigned char ++ix86_class_max_nregs (reg_class_t rclass, machine_mode mode) ++{ ++ if (MAYBE_INTEGER_CLASS_P (rclass)) ++ { ++ if (mode == XFmode) ++ return (TARGET_64BIT ? 2 : 3); ++ else if (mode == XCmode) ++ return (TARGET_64BIT ? 4 : 6); ++ else ++ return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD); ++ } + else +- x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1)); +- +- emit_insn (gen_rtx_SET (d->target, x)); +- +- return true; ++ { ++ if (COMPLEX_MODE_P (mode)) ++ return 2; ++ else ++ return 1; ++ } + } + +-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D +- in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */ ++/* Implement TARGET_CAN_CHANGE_MODE_CLASS. 
*/ + + static bool +-expand_vec_perm_blend (struct expand_vec_perm_d *d) ++ix86_can_change_mode_class (machine_mode from, machine_mode to, ++ reg_class_t regclass) + { +- machine_mode mmode, vmode = d->vmode; +- unsigned i, nelt = d->nelt; +- unsigned HOST_WIDE_INT mask; +- rtx target, op0, op1, maskop, x; +- rtx rperm[32], vperm; ++ if (from == to) ++ return true; + +- if (d->one_operand_p) +- return false; +- if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64 +- && (TARGET_AVX512BW +- || GET_MODE_UNIT_SIZE (vmode) >= 4)) +- ; +- else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32) +- ; +- else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode)) +- ; +- else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16) +- ; +- else ++ /* x87 registers can't do subreg at all, as all values are reformatted ++ to extended precision. */ ++ if (MAYBE_FLOAT_CLASS_P (regclass)) + return false; + +- /* This is a blend, not a permute. Elements must stay in their +- respective lanes. */ +- for (i = 0; i < nelt; ++i) ++ if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass)) + { +- unsigned e = d->perm[i]; +- if (!(e == i || e == i + nelt)) ++ /* Vector registers do not support QI or HImode loads. If we don't ++ disallow a change to these modes, reload will assume it's ok to ++ drop the subreg from (subreg:SI (reg:HI 100) 0). This affects ++ the vec_dupv4hi pattern. */ ++ if (GET_MODE_SIZE (from) < 4) + return false; + } + +- if (d->testing_p) +- return true; +- +- /* ??? Without SSE4.1, we could implement this with and/andn/or. This +- decision should be extracted elsewhere, so that we only try that +- sequence once all budget==3 options have been tried. */ +- target = d->target; +- op0 = d->op0; +- op1 = d->op1; +- mask = 0; +- +- switch (vmode) +- { +- case E_V8DFmode: +- case E_V16SFmode: +- case E_V4DFmode: +- case E_V8SFmode: +- case E_V2DFmode: +- case E_V4SFmode: +- case E_V8HImode: +- case E_V8SImode: +- case E_V32HImode: +- case E_V64QImode: +- case E_V16SImode: +- case E_V8DImode: +- for (i = 0; i < nelt; ++i) +- mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i; +- break; +- +- case E_V2DImode: +- for (i = 0; i < 2; ++i) +- mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4); +- vmode = V8HImode; +- goto do_subreg; +- +- case E_V4SImode: +- for (i = 0; i < 4; ++i) +- mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2); +- vmode = V8HImode; +- goto do_subreg; ++ return true; ++} + +- case E_V16QImode: +- /* See if bytes move in pairs so we can use pblendw with +- an immediate argument, rather than pblendvb with a vector +- argument. */ +- for (i = 0; i < 16; i += 2) +- if (d->perm[i] + 1 != d->perm[i + 1]) +- { +- use_pblendvb: +- for (i = 0; i < nelt; ++i) +- rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx); ++/* Return index of MODE in the sse load/store tables. 
*/ + +- finish_pblendvb: +- vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm)); +- vperm = force_reg (vmode, vperm); ++static inline int ++sse_store_index (machine_mode mode) ++{ ++ switch (GET_MODE_SIZE (mode)) ++ { ++ case 4: ++ return 0; ++ case 8: ++ return 1; ++ case 16: ++ return 2; ++ case 32: ++ return 3; ++ case 64: ++ return 4; ++ default: ++ return -1; ++ } ++} + +- if (GET_MODE_SIZE (vmode) == 16) +- emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm)); +- else +- emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm)); +- if (target != d->target) +- emit_move_insn (d->target, gen_lowpart (d->vmode, target)); +- return true; +- } ++/* Return the cost of moving data of mode M between a ++ register and memory. A value of 2 is the default; this cost is ++ relative to those in `REGISTER_MOVE_COST'. + +- for (i = 0; i < 8; ++i) +- mask |= (d->perm[i * 2] >= 16) << i; +- vmode = V8HImode; +- /* FALLTHRU */ ++ This function is used extensively by register_move_cost that is used to ++ build tables at startup. Make it inline in this case. ++ When IN is 2, return maximum of in and out move cost. + +- do_subreg: +- target = gen_reg_rtx (vmode); +- op0 = gen_lowpart (vmode, op0); +- op1 = gen_lowpart (vmode, op1); +- break; ++ If moving between registers and memory is more expensive than ++ between two registers, you should define this macro to express the ++ relative cost. + +- case E_V32QImode: +- /* See if bytes move in pairs. If not, vpblendvb must be used. */ +- for (i = 0; i < 32; i += 2) +- if (d->perm[i] + 1 != d->perm[i + 1]) +- goto use_pblendvb; +- /* See if bytes move in quadruplets. If yes, vpblendd +- with immediate can be used. */ +- for (i = 0; i < 32; i += 4) +- if (d->perm[i] + 2 != d->perm[i + 2]) +- break; +- if (i < 32) ++ Model also increased moving costs of QImode registers in non ++ Q_REGS classes. ++ */ ++static inline int ++inline_memory_move_cost (machine_mode mode, enum reg_class regclass, int in) ++{ ++ int cost; ++ if (FLOAT_CLASS_P (regclass)) ++ { ++ int index; ++ switch (mode) + { +- /* See if bytes move the same in both lanes. If yes, +- vpblendw with immediate can be used. */ +- for (i = 0; i < 16; i += 2) +- if (d->perm[i] + 16 != d->perm[i + 16]) +- goto use_pblendvb; +- +- /* Use vpblendw. */ +- for (i = 0; i < 16; ++i) +- mask |= (d->perm[i * 2] >= 32) << i; +- vmode = V16HImode; +- goto do_subreg; ++ case E_SFmode: ++ index = 0; ++ break; ++ case E_DFmode: ++ index = 1; ++ break; ++ case E_XFmode: ++ index = 2; ++ break; ++ default: ++ return 100; + } +- +- /* Use vpblendd. */ +- for (i = 0; i < 8; ++i) +- mask |= (d->perm[i * 4] >= 32) << i; +- vmode = V8SImode; +- goto do_subreg; +- +- case E_V16HImode: +- /* See if words move in pairs. If yes, vpblendd can be used. */ +- for (i = 0; i < 16; i += 2) +- if (d->perm[i] + 1 != d->perm[i + 1]) +- break; +- if (i < 16) ++ if (in == 2) ++ return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]); ++ return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index]; ++ } ++ if (SSE_CLASS_P (regclass)) ++ { ++ int index = sse_store_index (mode); ++ if (index == -1) ++ return 100; ++ if (in == 2) ++ return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]); ++ return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index]; ++ } ++ if (MMX_CLASS_P (regclass)) ++ { ++ int index; ++ switch (GET_MODE_SIZE (mode)) + { +- /* See if words move the same in both lanes. If not, +- vpblendvb must be used. 
*/ +- for (i = 0; i < 8; i++) +- if (d->perm[i] + 8 != d->perm[i + 8]) +- { +- /* Use vpblendvb. */ +- for (i = 0; i < 32; ++i) +- rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx); +- +- vmode = V32QImode; +- nelt = 32; +- target = gen_reg_rtx (vmode); +- op0 = gen_lowpart (vmode, op0); +- op1 = gen_lowpart (vmode, op1); +- goto finish_pblendvb; +- } +- +- /* Use vpblendw. */ +- for (i = 0; i < 16; ++i) +- mask |= (d->perm[i] >= 16) << i; +- break; ++ case 4: ++ index = 0; ++ break; ++ case 8: ++ index = 1; ++ break; ++ default: ++ return 100; + } +- +- /* Use vpblendd. */ +- for (i = 0; i < 8; ++i) +- mask |= (d->perm[i * 2] >= 16) << i; +- vmode = V8SImode; +- goto do_subreg; +- +- case E_V4DImode: +- /* Use vpblendd. */ +- for (i = 0; i < 4; ++i) +- mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2); +- vmode = V8SImode; +- goto do_subreg; +- +- default: +- gcc_unreachable (); ++ if (in == 2) ++ return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]); ++ return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index]; + } +- +- switch (vmode) ++ switch (GET_MODE_SIZE (mode)) + { +- case E_V8DFmode: +- case E_V8DImode: +- mmode = QImode; +- break; +- case E_V16SFmode: +- case E_V16SImode: +- mmode = HImode; +- break; +- case E_V32HImode: +- mmode = SImode; +- break; +- case E_V64QImode: +- mmode = DImode; +- break; +- default: +- mmode = VOIDmode; ++ case 1: ++ if (Q_CLASS_P (regclass) || TARGET_64BIT) ++ { ++ if (!in) ++ return ix86_cost->int_store[0]; ++ if (TARGET_PARTIAL_REG_DEPENDENCY ++ && optimize_function_for_speed_p (cfun)) ++ cost = ix86_cost->movzbl_load; ++ else ++ cost = ix86_cost->int_load[0]; ++ if (in == 2) ++ return MAX (cost, ix86_cost->int_store[0]); ++ return cost; ++ } ++ else ++ { ++ if (in == 2) ++ return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4); ++ if (in) ++ return ix86_cost->movzbl_load; ++ else ++ return ix86_cost->int_store[0] + 4; ++ } ++ break; ++ case 2: ++ if (in == 2) ++ return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]); ++ return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1]; ++ default: ++ if (in == 2) ++ cost = MAX (ix86_cost->int_load[2], ix86_cost->int_store[2]); ++ else if (in) ++ cost = ix86_cost->int_load[2]; ++ else ++ cost = ix86_cost->int_store[2]; ++ /* Multiply with the number of GPR moves needed. */ ++ return cost * CEIL ((int) GET_MODE_SIZE (mode), UNITS_PER_WORD); + } ++} + +- if (mmode != VOIDmode) +- maskop = force_reg (mmode, gen_int_mode (mask, mmode)); +- else +- maskop = GEN_INT (mask); +- +- /* This matches five different patterns with the different modes. */ +- x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop); +- x = gen_rtx_SET (target, x); +- emit_insn (x); +- if (target != d->target) +- emit_move_insn (d->target, gen_lowpart (d->vmode, target)); +- +- return true; ++static int ++ix86_memory_move_cost (machine_mode mode, reg_class_t regclass, bool in) ++{ ++ return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0); + } + +-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D +- in terms of the variable form of vpermilps. + +- Note that we will have already failed the immediate input vpermilps, +- which requires that the high and low part shuffle be identical; the +- variable form doesn't require that. */ ++/* Return the cost of moving data from a register in class CLASS1 to ++ one in class CLASS2. 
+ +-static bool +-expand_vec_perm_vpermil (struct expand_vec_perm_d *d) ++ It is not required that the cost always equal 2 when FROM is the same as TO; ++ on some machines it is expensive to move between registers if they are not ++ general registers. */ ++ ++static int ++ix86_register_move_cost (machine_mode mode, reg_class_t class1_i, ++ reg_class_t class2_i) + { +- rtx rperm[8], vperm; +- unsigned i; ++ enum reg_class class1 = (enum reg_class) class1_i; ++ enum reg_class class2 = (enum reg_class) class2_i; + +- if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p) +- return false; ++ /* In case we require secondary memory, compute cost of the store followed ++ by load. In order to avoid bad register allocation choices, we need ++ for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */ + +- /* We can only permute within the 128-bit lane. */ +- for (i = 0; i < 8; ++i) ++ if (inline_secondary_memory_needed (mode, class1, class2, false)) + { +- unsigned e = d->perm[i]; +- if (i < 4 ? e >= 4 : e < 4) +- return false; +- } ++ int cost = 1; + +- if (d->testing_p) +- return true; ++ cost += inline_memory_move_cost (mode, class1, 2); ++ cost += inline_memory_move_cost (mode, class2, 2); + +- for (i = 0; i < 8; ++i) +- { +- unsigned e = d->perm[i]; ++ /* In case of copying from general_purpose_register we may emit multiple ++ stores followed by single load causing memory size mismatch stall. ++ Count this as arbitrarily high cost of 20. */ ++ if (GET_MODE_BITSIZE (mode) > BITS_PER_WORD ++ && TARGET_MEMORY_MISMATCH_STALL ++ && targetm.class_max_nregs (class1, mode) ++ > targetm.class_max_nregs (class2, mode)) ++ cost += 20; + +- /* Within each 128-bit lane, the elements of op0 are numbered +- from 0 and the elements of op1 are numbered from 4. */ +- if (e >= 8 + 4) +- e -= 8; +- else if (e >= 4) +- e -= 4; ++ /* In the case of FP/MMX moves, the registers actually overlap, and we ++ have to switch modes in order to treat them differently. */ ++ if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2)) ++ || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1))) ++ cost += 20; + +- rperm[i] = GEN_INT (e); ++ return cost; + } + +- vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm)); +- vperm = force_reg (V8SImode, vperm); +- emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm)); +- +- return true; +-} +- +-/* Return true if permutation D can be performed as VMODE permutation +- instead. */ ++ /* Moves between SSE/MMX and integer unit are expensive. */ ++ if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2) ++ || SSE_CLASS_P (class1) != SSE_CLASS_P (class2)) + +-static bool +-valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d) +-{ +- unsigned int i, j, chunk; ++ /* ??? By keeping returned value relatively high, we limit the number ++ of moves between integer and MMX/SSE registers for all targets. ++ Additionally, high value prevents problem with x86_modes_tieable_p(), ++ where integer modes in MMX/SSE registers are not tieable ++ because of missing QImode and HImode moves to, from or between ++ MMX/SSE registers. */ ++ return MAX (8, MMX_CLASS_P (class1) || MMX_CLASS_P (class2) ++ ? 
ix86_cost->mmxsse_to_integer : ix86_cost->ssemmx_to_integer); + +- if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT +- || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT +- || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode)) +- return false; ++ if (MAYBE_FLOAT_CLASS_P (class1)) ++ return ix86_cost->fp_move; ++ if (MAYBE_SSE_CLASS_P (class1)) ++ { ++ if (GET_MODE_BITSIZE (mode) <= 128) ++ return ix86_cost->xmm_move; ++ if (GET_MODE_BITSIZE (mode) <= 256) ++ return ix86_cost->ymm_move; ++ return ix86_cost->zmm_move; ++ } ++ if (MAYBE_MMX_CLASS_P (class1)) ++ return ix86_cost->mmx_move; ++ return 2; ++} + +- if (GET_MODE_NUNITS (vmode) >= d->nelt) +- return true; ++/* Implement TARGET_HARD_REGNO_NREGS. This is ordinarily the length in ++ words of a value of mode MODE but can be less for certain modes in ++ special long registers. + +- chunk = d->nelt / GET_MODE_NUNITS (vmode); +- for (i = 0; i < d->nelt; i += chunk) +- if (d->perm[i] & (chunk - 1)) +- return false; +- else +- for (j = 1; j < chunk; ++j) +- if (d->perm[i] + j != d->perm[i + j]) +- return false; ++ Actually there are no two word move instructions for consecutive ++ registers. And only registers 0-3 may have mov byte instructions ++ applied to them. */ + +- return true; ++static unsigned int ++ix86_hard_regno_nregs (unsigned int regno, machine_mode mode) ++{ ++ if (GENERAL_REGNO_P (regno)) ++ { ++ if (mode == XFmode) ++ return TARGET_64BIT ? 2 : 3; ++ if (mode == XCmode) ++ return TARGET_64BIT ? 4 : 6; ++ return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD); ++ } ++ if (COMPLEX_MODE_P (mode)) ++ return 2; ++ if (mode == V64SFmode || mode == V64SImode) ++ return 4; ++ return 1; + } + +-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D +- in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */ ++/* Implement TARGET_HARD_REGNO_MODE_OK. */ + + static bool +-expand_vec_perm_pshufb (struct expand_vec_perm_d *d) ++ix86_hard_regno_mode_ok (unsigned int regno, machine_mode mode) + { +- unsigned i, nelt, eltsz, mask; +- unsigned char perm[64]; +- machine_mode vmode = V16QImode; +- rtx rperm[64], vperm, target, op0, op1; +- +- nelt = d->nelt; +- +- if (!d->one_operand_p) +- { +- if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16) +- { +- if (TARGET_AVX2 +- && valid_perm_using_mode_p (V2TImode, d)) +- { +- if (d->testing_p) +- return true; +- +- /* Use vperm2i128 insn. The pattern uses +- V4DImode instead of V2TImode. */ +- target = d->target; +- if (d->vmode != V4DImode) +- target = gen_reg_rtx (V4DImode); +- op0 = gen_lowpart (V4DImode, d->op0); +- op1 = gen_lowpart (V4DImode, d->op1); +- rperm[0] +- = GEN_INT ((d->perm[0] / (nelt / 2)) +- | ((d->perm[nelt / 2] / (nelt / 2)) * 16)); +- emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0])); +- if (target != d->target) +- emit_move_insn (d->target, gen_lowpart (d->vmode, target)); +- return true; +- } +- return false; +- } +- } +- else ++ /* Flags and only flags can only hold CCmode values. 
*/ ++ if (CC_REGNO_P (regno)) ++ return GET_MODE_CLASS (mode) == MODE_CC; ++ if (GET_MODE_CLASS (mode) == MODE_CC ++ || GET_MODE_CLASS (mode) == MODE_RANDOM ++ || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT) ++ return false; ++ if (STACK_REGNO_P (regno)) ++ return VALID_FP_MODE_P (mode); ++ if (MASK_REGNO_P (regno)) ++ return (VALID_MASK_REG_MODE (mode) ++ || (TARGET_AVX512BW ++ && VALID_MASK_AVX512BW_MODE (mode))); ++ if (SSE_REGNO_P (regno)) + { +- if (GET_MODE_SIZE (d->vmode) == 16) +- { +- if (!TARGET_SSSE3) +- return false; +- } +- else if (GET_MODE_SIZE (d->vmode) == 32) +- { +- if (!TARGET_AVX2) +- return false; +- +- /* V4DImode should be already handled through +- expand_vselect by vpermq instruction. */ +- gcc_assert (d->vmode != V4DImode); +- +- vmode = V32QImode; +- if (d->vmode == V8SImode +- || d->vmode == V16HImode +- || d->vmode == V32QImode) +- { +- /* First see if vpermq can be used for +- V8SImode/V16HImode/V32QImode. */ +- if (valid_perm_using_mode_p (V4DImode, d)) +- { +- for (i = 0; i < 4; i++) +- perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3; +- if (d->testing_p) +- return true; +- target = gen_reg_rtx (V4DImode); +- if (expand_vselect (target, gen_lowpart (V4DImode, d->op0), +- perm, 4, false)) +- { +- emit_move_insn (d->target, +- gen_lowpart (d->vmode, target)); +- return true; +- } +- return false; +- } +- +- /* Next see if vpermd can be used. */ +- if (valid_perm_using_mode_p (V8SImode, d)) +- vmode = V8SImode; +- } +- /* Or if vpermps can be used. */ +- else if (d->vmode == V8SFmode) +- vmode = V8SImode; ++ /* We implement the move patterns for all vector modes into and ++ out of SSE registers, even when no operation instructions ++ are available. */ + +- if (vmode == V32QImode) +- { +- /* vpshufb only works intra lanes, it is not +- possible to shuffle bytes in between the lanes. */ +- for (i = 0; i < nelt; ++i) +- if ((d->perm[i] ^ i) & (nelt / 2)) +- return false; +- } +- } +- else if (GET_MODE_SIZE (d->vmode) == 64) +- { +- if (!TARGET_AVX512BW) +- return false; ++ /* For AVX-512 we allow, regardless of regno: ++ - XI mode ++ - any of 512-bit wide vector mode ++ - any scalar mode. */ ++ if (TARGET_AVX512F ++ && (mode == XImode ++ || VALID_AVX512F_REG_MODE (mode) ++ || VALID_AVX512F_SCALAR_MODE (mode))) ++ return true; + +- /* If vpermq didn't work, vpshufb won't work either. */ +- if (d->vmode == V8DFmode || d->vmode == V8DImode) +- return false; ++ /* For AVX-5124FMAPS or AVX-5124VNNIW ++ allow V64SF and V64SI modes for special regnos. */ ++ if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW) ++ && (mode == V64SFmode || mode == V64SImode) ++ && MOD4_SSE_REGNO_P (regno)) ++ return true; + +- vmode = V64QImode; +- if (d->vmode == V16SImode +- || d->vmode == V32HImode +- || d->vmode == V64QImode) +- { +- /* First see if vpermq can be used for +- V16SImode/V32HImode/V64QImode. */ +- if (valid_perm_using_mode_p (V8DImode, d)) +- { +- for (i = 0; i < 8; i++) +- perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7; +- if (d->testing_p) +- return true; +- target = gen_reg_rtx (V8DImode); +- if (expand_vselect (target, gen_lowpart (V8DImode, d->op0), +- perm, 8, false)) +- { +- emit_move_insn (d->target, +- gen_lowpart (d->vmode, target)); +- return true; +- } +- return false; +- } ++ /* TODO check for QI/HI scalars. */ ++ /* AVX512VL allows sse regs16+ for 128/256 bit modes. 
*/ ++ if (TARGET_AVX512VL ++ && (mode == OImode ++ || mode == TImode ++ || VALID_AVX256_REG_MODE (mode) ++ || VALID_AVX512VL_128_REG_MODE (mode))) ++ return true; + +- /* Next see if vpermd can be used. */ +- if (valid_perm_using_mode_p (V16SImode, d)) +- vmode = V16SImode; +- } +- /* Or if vpermps can be used. */ +- else if (d->vmode == V16SFmode) +- vmode = V16SImode; +- if (vmode == V64QImode) +- { +- /* vpshufb only works intra lanes, it is not +- possible to shuffle bytes in between the lanes. */ +- for (i = 0; i < nelt; ++i) +- if ((d->perm[i] ^ i) & (nelt / 4)) +- return false; +- } +- } +- else ++ /* xmm16-xmm31 are only available for AVX-512. */ ++ if (EXT_REX_SSE_REGNO_P (regno)) + return false; +- } +- +- if (d->testing_p) +- return true; + +- if (vmode == V8SImode) +- for (i = 0; i < 8; ++i) +- rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7); +- else if (vmode == V16SImode) +- for (i = 0; i < 16; ++i) +- rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15); +- else ++ /* OImode and AVX modes are available only when AVX is enabled. */ ++ return ((TARGET_AVX ++ && VALID_AVX256_REG_OR_OI_MODE (mode)) ++ || VALID_SSE_REG_MODE (mode) ++ || VALID_SSE2_REG_MODE (mode) ++ || VALID_MMX_REG_MODE (mode) ++ || VALID_MMX_REG_MODE_3DNOW (mode)); ++ } ++ if (MMX_REGNO_P (regno)) + { +- eltsz = GET_MODE_UNIT_SIZE (d->vmode); +- if (!d->one_operand_p) +- mask = 2 * nelt - 1; +- else if (vmode == V16QImode) +- mask = nelt - 1; +- else if (vmode == V64QImode) +- mask = nelt / 4 - 1; +- else +- mask = nelt / 2 - 1; +- +- for (i = 0; i < nelt; ++i) +- { +- unsigned j, e = d->perm[i] & mask; +- for (j = 0; j < eltsz; ++j) +- rperm[i * eltsz + j] = GEN_INT (e * eltsz + j); +- } +- } +- +- vperm = gen_rtx_CONST_VECTOR (vmode, +- gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm)); +- vperm = force_reg (vmode, vperm); +- +- target = d->target; +- if (d->vmode != vmode) +- target = gen_reg_rtx (vmode); +- op0 = gen_lowpart (vmode, d->op0); +- if (d->one_operand_p) +- { +- if (vmode == V16QImode) +- emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm)); +- else if (vmode == V32QImode) +- emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm)); +- else if (vmode == V64QImode) +- emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm)); +- else if (vmode == V8SFmode) +- emit_insn (gen_avx2_permvarv8sf (target, op0, vperm)); +- else if (vmode == V8SImode) +- emit_insn (gen_avx2_permvarv8si (target, op0, vperm)); +- else if (vmode == V16SFmode) +- emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm)); +- else if (vmode == V16SImode) +- emit_insn (gen_avx512f_permvarv16si (target, op0, vperm)); +- else +- gcc_unreachable (); ++ /* We implement the move patterns for 3DNOW modes even in MMX mode, ++ so if the register is available at all, then we can move data of ++ the given mode into or out of it. */ ++ return (VALID_MMX_REG_MODE (mode) ++ || VALID_MMX_REG_MODE_3DNOW (mode)); + } +- else ++ ++ if (mode == QImode) + { +- op1 = gen_lowpart (vmode, d->op1); +- emit_insn (gen_xop_pperm (target, op0, op1, vperm)); ++ /* Take care for QImode values - they can be in non-QI regs, ++ but then they do cause partial register stalls. */ ++ if (ANY_QI_REGNO_P (regno)) ++ return true; ++ if (!TARGET_PARTIAL_REG_STALL) ++ return true; ++ /* LRA checks if the hard register is OK for the given mode. ++ QImode values can live in non-QI regs, so we allow all ++ registers here. 
*/ ++ if (lra_in_progress) ++ return true; ++ return !can_create_pseudo_p (); + } +- if (target != d->target) +- emit_move_insn (d->target, gen_lowpart (d->vmode, target)); ++ /* We handle both integer and floats in the general purpose registers. */ ++ else if (VALID_INT_MODE_P (mode)) ++ return true; ++ else if (VALID_FP_MODE_P (mode)) ++ return true; ++ else if (VALID_DFP_MODE_P (mode)) ++ return true; ++ /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go ++ on to use that value in smaller contexts, this can easily force a ++ pseudo to be allocated to GENERAL_REGS. Since this is no worse than ++ supporting DImode, allow it. */ ++ else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode)) ++ return true; + +- return true; ++ return false; + } + +-/* For V*[QHS]Imode permutations, check if the same permutation +- can't be performed in a 2x, 4x or 8x wider inner mode. */ ++/* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The only ABI that ++ saves SSE registers across calls is Win64 (thus no need to check the ++ current ABI here), and with AVX enabled Win64 only guarantees that ++ the low 16 bytes are saved. */ + + static bool +-canonicalize_vector_int_perm (const struct expand_vec_perm_d *d, +- struct expand_vec_perm_d *nd) ++ix86_hard_regno_call_part_clobbered (unsigned int, unsigned int regno, ++ machine_mode mode) + { +- int i; +- machine_mode mode = VOIDmode; +- +- switch (d->vmode) +- { +- case E_V16QImode: mode = V8HImode; break; +- case E_V32QImode: mode = V16HImode; break; +- case E_V64QImode: mode = V32HImode; break; +- case E_V8HImode: mode = V4SImode; break; +- case E_V16HImode: mode = V8SImode; break; +- case E_V32HImode: mode = V16SImode; break; +- case E_V4SImode: mode = V2DImode; break; +- case E_V8SImode: mode = V4DImode; break; +- case E_V16SImode: mode = V8DImode; break; +- default: return false; +- } +- for (i = 0; i < d->nelt; i += 2) +- if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1) +- return false; +- nd->vmode = mode; +- nd->nelt = d->nelt / 2; +- for (i = 0; i < nd->nelt; i++) +- nd->perm[i] = d->perm[2 * i] / 2; +- if (GET_MODE_INNER (mode) != DImode) +- canonicalize_vector_int_perm (nd, nd); +- if (nd != d) +- { +- nd->one_operand_p = d->one_operand_p; +- nd->testing_p = d->testing_p; +- if (d->op0 == d->op1) +- nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0); +- else +- { +- nd->op0 = gen_lowpart (nd->vmode, d->op0); +- nd->op1 = gen_lowpart (nd->vmode, d->op1); +- } +- if (d->testing_p) +- nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1); +- else +- nd->target = gen_reg_rtx (nd->vmode); +- } +- return true; ++ return SSE_REGNO_P (regno) && GET_MODE_SIZE (mode) > 16; + } + +-/* Try to expand one-operand permutation with constant mask. */ ++/* A subroutine of ix86_modes_tieable_p. Return true if MODE is a ++ tieable integer mode. 
*/ + + static bool +-ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d) ++ix86_tieable_integer_mode_p (machine_mode mode) + { +- machine_mode mode = GET_MODE (d->op0); +- machine_mode maskmode = mode; +- rtx (*gen) (rtx, rtx, rtx) = NULL; +- rtx target, op0, mask; +- rtx vec[64]; ++ switch (mode) ++ { ++ case E_HImode: ++ case E_SImode: ++ return true; + +- if (!rtx_equal_p (d->op0, d->op1)) +- return false; ++ case E_QImode: ++ return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL; + +- if (!TARGET_AVX512F) +- return false; ++ case E_DImode: ++ return TARGET_64BIT; + +- switch (mode) +- { +- case E_V16SImode: +- gen = gen_avx512f_permvarv16si; +- break; +- case E_V16SFmode: +- gen = gen_avx512f_permvarv16sf; +- maskmode = V16SImode; +- break; +- case E_V8DImode: +- gen = gen_avx512f_permvarv8di; +- break; +- case E_V8DFmode: +- gen = gen_avx512f_permvarv8df; +- maskmode = V8DImode; +- break; + default: + return false; + } +- +- target = d->target; +- op0 = d->op0; +- for (int i = 0; i < d->nelt; ++i) +- vec[i] = GEN_INT (d->perm[i]); +- mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec)); +- emit_insn (gen (target, op0, force_reg (maskmode, mask))); +- return true; + } + +-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D +- in a single instruction. */ ++/* Implement TARGET_MODES_TIEABLE_P. ++ ++ Return true if MODE1 is accessible in a register that can hold MODE2 ++ without copying. That is, all register classes that can hold MODE2 ++ can also hold MODE1. */ + + static bool +-expand_vec_perm_1 (struct expand_vec_perm_d *d) ++ix86_modes_tieable_p (machine_mode mode1, machine_mode mode2) + { +- unsigned i, nelt = d->nelt; +- struct expand_vec_perm_d nd; +- +- /* Check plain VEC_SELECT first, because AVX has instructions that could +- match both SEL and SEL+CONCAT, but the plain SEL will allow a memory +- input where SEL+CONCAT may not. */ +- if (d->one_operand_p) +- { +- int mask = nelt - 1; +- bool identity_perm = true; +- bool broadcast_perm = true; +- +- for (i = 0; i < nelt; i++) +- { +- nd.perm[i] = d->perm[i] & mask; +- if (nd.perm[i] != i) +- identity_perm = false; +- if (nd.perm[i]) +- broadcast_perm = false; +- } ++ if (mode1 == mode2) ++ return true; + +- if (identity_perm) +- { +- if (!d->testing_p) +- emit_move_insn (d->target, d->op0); +- return true; +- } +- else if (broadcast_perm && TARGET_AVX2) +- { +- /* Use vpbroadcast{b,w,d}. */ +- rtx (*gen) (rtx, rtx) = NULL; +- switch (d->vmode) +- { +- case E_V64QImode: +- if (TARGET_AVX512BW) +- gen = gen_avx512bw_vec_dupv64qi_1; +- break; +- case E_V32QImode: +- gen = gen_avx2_pbroadcastv32qi_1; +- break; +- case E_V32HImode: +- if (TARGET_AVX512BW) +- gen = gen_avx512bw_vec_dupv32hi_1; +- break; +- case E_V16HImode: +- gen = gen_avx2_pbroadcastv16hi_1; +- break; +- case E_V16SImode: +- if (TARGET_AVX512F) +- gen = gen_avx512f_vec_dupv16si_1; +- break; +- case E_V8SImode: +- gen = gen_avx2_pbroadcastv8si_1; +- break; +- case E_V16QImode: +- gen = gen_avx2_pbroadcastv16qi; +- break; +- case E_V8HImode: +- gen = gen_avx2_pbroadcastv8hi; +- break; +- case E_V16SFmode: +- if (TARGET_AVX512F) +- gen = gen_avx512f_vec_dupv16sf_1; +- break; +- case E_V8SFmode: +- gen = gen_avx2_vec_dupv8sf_1; +- break; +- case E_V8DFmode: +- if (TARGET_AVX512F) +- gen = gen_avx512f_vec_dupv8df_1; +- break; +- case E_V8DImode: +- if (TARGET_AVX512F) +- gen = gen_avx512f_vec_dupv8di_1; +- break; +- /* For other modes prefer other shuffles this function creates. 
*/ +- default: break; +- } +- if (gen != NULL) +- { +- if (!d->testing_p) +- emit_insn (gen (d->target, d->op0)); +- return true; +- } +- } ++ if (ix86_tieable_integer_mode_p (mode1) ++ && ix86_tieable_integer_mode_p (mode2)) ++ return true; + +- if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p)) +- return true; ++ /* MODE2 being XFmode implies fp stack or general regs, which means we ++ can tie any smaller floating point modes to it. Note that we do not ++ tie this with TFmode. */ ++ if (mode2 == XFmode) ++ return mode1 == SFmode || mode1 == DFmode; + +- /* There are plenty of patterns in sse.md that are written for +- SEL+CONCAT and are not replicated for a single op. Perhaps +- that should be changed, to avoid the nastiness here. */ ++ /* MODE2 being DFmode implies fp stack, general or sse regs, which means ++ that we can tie it with SFmode. */ ++ if (mode2 == DFmode) ++ return mode1 == SFmode; + +- /* Recognize interleave style patterns, which means incrementing +- every other permutation operand. */ +- for (i = 0; i < nelt; i += 2) +- { +- nd.perm[i] = d->perm[i] & mask; +- nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt; +- } +- if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt, +- d->testing_p)) +- return true; ++ /* If MODE2 is only appropriate for an SSE register, then tie with ++ any other mode acceptable to SSE registers. */ ++ if (GET_MODE_SIZE (mode2) == 64 ++ && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2)) ++ return (GET_MODE_SIZE (mode1) == 64 ++ && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1)); ++ if (GET_MODE_SIZE (mode2) == 32 ++ && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2)) ++ return (GET_MODE_SIZE (mode1) == 32 ++ && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1)); ++ if (GET_MODE_SIZE (mode2) == 16 ++ && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2)) ++ return (GET_MODE_SIZE (mode1) == 16 ++ && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1)); + +- /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */ +- if (nelt >= 4) +- { +- for (i = 0; i < nelt; i += 4) +- { +- nd.perm[i + 0] = d->perm[i + 0] & mask; +- nd.perm[i + 1] = d->perm[i + 1] & mask; +- nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt; +- nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt; +- } ++ /* If MODE2 is appropriate for an MMX register, then tie ++ with any other mode acceptable to MMX registers. */ ++ if (GET_MODE_SIZE (mode2) == 8 ++ && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2)) ++ return (GET_MODE_SIZE (mode1) == 8 ++ && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1)); + +- if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt, +- d->testing_p)) +- return true; +- } +- } ++ return false; ++} + +- /* Try movss/movsd instructions. */ +- if (expand_vec_perm_movs (d)) +- return true; ++/* Return the cost of moving between two registers of mode MODE. */ + +- /* Finally, try the fully general two operand permute. */ +- if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt, +- d->testing_p)) +- return true; ++static int ++ix86_set_reg_reg_cost (machine_mode mode) ++{ ++ unsigned int units = UNITS_PER_WORD; + +- /* Recognize interleave style patterns with reversed operands. 
*/ +- if (!d->one_operand_p) ++ switch (GET_MODE_CLASS (mode)) + { +- for (i = 0; i < nelt; ++i) +- { +- unsigned e = d->perm[i]; +- if (e >= nelt) +- e -= nelt; +- else +- e += nelt; +- nd.perm[i] = e; +- } ++ default: ++ break; + +- if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt, +- d->testing_p)) +- return true; +- } ++ case MODE_CC: ++ units = GET_MODE_SIZE (CCmode); ++ break; + +- /* Try the SSE4.1 blend variable merge instructions. */ +- if (expand_vec_perm_blend (d)) +- return true; ++ case MODE_FLOAT: ++ if ((TARGET_SSE && mode == TFmode) ++ || (TARGET_80387 && mode == XFmode) ++ || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode) ++ || ((TARGET_80387 || TARGET_SSE) && mode == SFmode)) ++ units = GET_MODE_SIZE (mode); ++ break; + +- /* Try one of the AVX vpermil variable permutations. */ +- if (expand_vec_perm_vpermil (d)) +- return true; ++ case MODE_COMPLEX_FLOAT: ++ if ((TARGET_SSE && mode == TCmode) ++ || (TARGET_80387 && mode == XCmode) ++ || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode) ++ || ((TARGET_80387 || TARGET_SSE) && mode == SCmode)) ++ units = GET_MODE_SIZE (mode); ++ break; + +- /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128, +- vpshufb, vpermd, vpermps or vpermq variable permutation. */ +- if (expand_vec_perm_pshufb (d)) +- return true; ++ case MODE_VECTOR_INT: ++ case MODE_VECTOR_FLOAT: ++ if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode)) ++ || (TARGET_AVX && VALID_AVX256_REG_MODE (mode)) ++ || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode)) ++ || (TARGET_SSE && VALID_SSE_REG_MODE (mode)) ++ || (TARGET_MMX && VALID_MMX_REG_MODE (mode))) ++ units = GET_MODE_SIZE (mode); ++ } + +- /* Try the AVX2 vpalignr instruction. */ +- if (expand_vec_perm_palignr (d, true)) +- return true; ++ /* Return the cost of moving between two registers of mode MODE, ++ assuming that the move will be in pieces of at most UNITS bytes. */ ++ return COSTS_N_INSNS (CEIL (GET_MODE_SIZE (mode), units)); ++} + +- /* Try the AVX512F vperm{s,d} instructions. */ +- if (ix86_expand_vec_one_operand_perm_avx512 (d)) +- return true; ++/* Return cost of vector operation in MODE given that scalar version has ++ COST. */ + +- /* Try the AVX512F vpermt2/vpermi2 instructions. */ +- if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d)) +- return true; ++static int ++ix86_vec_cost (machine_mode mode, int cost) ++{ ++ if (!VECTOR_MODE_P (mode)) ++ return cost; + +- /* See if we can get the same permutation in different vector integer +- mode. */ +- if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd)) +- { +- if (!d->testing_p) +- emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target)); +- return true; +- } +- return false; ++ if (GET_MODE_BITSIZE (mode) == 128 ++ && TARGET_SSE_SPLIT_REGS) ++ return cost * 2; ++ if (GET_MODE_BITSIZE (mode) > 128 ++ && TARGET_AVX128_OPTIMAL) ++ return cost * GET_MODE_BITSIZE (mode) / 128; ++ return cost; + } + +-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D +- in terms of a pair of pshuflw + pshufhw instructions. */ ++/* Return cost of multiplication in MODE. 
*/ + +-static bool +-expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d) ++static int ++ix86_multiplication_cost (const struct processor_costs *cost, ++ enum machine_mode mode) + { +- unsigned char perm2[MAX_VECT_LEN]; +- unsigned i; +- bool ok; +- +- if (d->vmode != V8HImode || !d->one_operand_p) +- return false; ++ machine_mode inner_mode = mode; ++ if (VECTOR_MODE_P (mode)) ++ inner_mode = GET_MODE_INNER (mode); + +- /* The two permutations only operate in 64-bit lanes. */ +- for (i = 0; i < 4; ++i) +- if (d->perm[i] >= 4) +- return false; +- for (i = 4; i < 8; ++i) +- if (d->perm[i] < 4) +- return false; ++ if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) ++ return inner_mode == DFmode ? cost->mulsd : cost->mulss; ++ else if (X87_FLOAT_MODE_P (mode)) ++ return cost->fmul; ++ else if (FLOAT_MODE_P (mode)) ++ return ix86_vec_cost (mode, ++ inner_mode == DFmode ? cost->mulsd : cost->mulss); ++ else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) ++ { ++ /* vpmullq is used in this case. No emulation is needed. */ ++ if (TARGET_AVX512DQ) ++ return ix86_vec_cost (mode, cost->mulss); + +- if (d->testing_p) +- return true; ++ /* V*QImode is emulated with 7-13 insns. */ ++ if (mode == V16QImode || mode == V32QImode) ++ { ++ int extra = 11; ++ if (TARGET_XOP && mode == V16QImode) ++ extra = 5; ++ else if (TARGET_SSSE3) ++ extra = 6; ++ return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * extra); ++ } ++ /* V*DImode is emulated with 5-8 insns. */ ++ else if (mode == V2DImode || mode == V4DImode) ++ { ++ if (TARGET_XOP && mode == V2DImode) ++ return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 3); ++ else ++ return ix86_vec_cost (mode, cost->mulss * 3 + cost->sse_op * 5); ++ } ++ /* Without sse4.1, we don't have PMULLD; it's emulated with 7 ++ insns, including two PMULUDQ. */ ++ else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX)) ++ return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 5); ++ else ++ return ix86_vec_cost (mode, cost->mulss); ++ } ++ else ++ return (cost->mult_init[MODE_INDEX (mode)] + cost->mult_bit * 7); ++} + +- /* Emit the pshuflw. */ +- memcpy (perm2, d->perm, 4); +- for (i = 4; i < 8; ++i) +- perm2[i] = i; +- ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p); +- gcc_assert (ok); ++/* Return cost of multiplication in MODE. */ + +- /* Emit the pshufhw. */ +- memcpy (perm2 + 4, d->perm + 4, 4); +- for (i = 0; i < 4; ++i) +- perm2[i] = i; +- ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p); +- gcc_assert (ok); ++static int ++ix86_division_cost (const struct processor_costs *cost, ++ enum machine_mode mode) ++{ ++ machine_mode inner_mode = mode; ++ if (VECTOR_MODE_P (mode)) ++ inner_mode = GET_MODE_INNER (mode); + +- return true; ++ if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) ++ return inner_mode == DFmode ? cost->divsd : cost->divss; ++ else if (X87_FLOAT_MODE_P (mode)) ++ return cost->fdiv; ++ else if (FLOAT_MODE_P (mode)) ++ return ix86_vec_cost (mode, ++ inner_mode == DFmode ? cost->divsd : cost->divss); ++ else ++ return cost->divide[MODE_INDEX (mode)]; + } + +-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify +- the permutation using the SSSE3 palignr instruction. This succeeds +- when all of the elements in PERM fit within one vector and we merely +- need to shift them down so that a single vector permutation has a +- chance to succeed. If SINGLE_INSN_ONLY_P, succeed if only +- the vpalignr instruction itself can perform the requested permutation. 
*/ +- +-static bool +-expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p) +-{ +- unsigned i, nelt = d->nelt; +- unsigned min, max, minswap, maxswap; +- bool in_order, ok, swap = false; +- rtx shift, target; +- struct expand_vec_perm_d dcopy; +- +- /* Even with AVX, palignr only operates on 128-bit vectors, +- in AVX2 palignr operates on both 128-bit lanes. */ +- if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16) +- && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32)) +- return false; +- +- min = 2 * nelt; +- max = 0; +- minswap = 2 * nelt; +- maxswap = 0; +- for (i = 0; i < nelt; ++i) +- { +- unsigned e = d->perm[i]; +- unsigned eswap = d->perm[i] ^ nelt; +- if (GET_MODE_SIZE (d->vmode) == 32) +- { +- e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1); +- eswap = e ^ (nelt / 2); +- } +- if (e < min) +- min = e; +- if (e > max) +- max = e; +- if (eswap < minswap) +- minswap = eswap; +- if (eswap > maxswap) +- maxswap = eswap; +- } +- if (min == 0 +- || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt)) +- { +- if (d->one_operand_p +- || minswap == 0 +- || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32 +- ? nelt / 2 : nelt)) +- return false; +- swap = true; +- min = minswap; +- max = maxswap; +- } ++#define COSTS_N_BYTES(N) ((N) * 2) + +- /* Given that we have SSSE3, we know we'll be able to implement the +- single operand permutation after the palignr with pshufb for +- 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed +- first. */ +- if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p) +- return true; ++/* Return cost of shift in MODE. ++ If CONSTANT_OP1 is true, the op1 value is known and set in OP1_VAL. ++ AND_IN_OP1 specify in op1 is result of and and SHIFT_AND_TRUNCATE ++ if op1 is a result of subreg. + +- dcopy = *d; +- if (swap) +- { +- dcopy.op0 = d->op1; +- dcopy.op1 = d->op0; +- for (i = 0; i < nelt; ++i) +- dcopy.perm[i] ^= nelt; +- } ++ SKIP_OP0/1 is set to true if cost of OP0/1 should be ignored. */ + +- in_order = true; +- for (i = 0; i < nelt; ++i) ++static int ++ix86_shift_rotate_cost (const struct processor_costs *cost, ++ enum machine_mode mode, bool constant_op1, ++ HOST_WIDE_INT op1_val, ++ bool speed, ++ bool and_in_op1, ++ bool shift_and_truncate, ++ bool *skip_op0, bool *skip_op1) ++{ ++ if (skip_op0) ++ *skip_op0 = *skip_op1 = false; ++ if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) + { +- unsigned e = dcopy.perm[i]; +- if (GET_MODE_SIZE (d->vmode) == 32 +- && e >= nelt +- && (e & (nelt / 2 - 1)) < min) +- e = e - min - (nelt / 2); ++ /* V*QImode is emulated with 1-11 insns. */ ++ if (mode == V16QImode || mode == V32QImode) ++ { ++ int count = 11; ++ if (TARGET_XOP && mode == V16QImode) ++ { ++ /* For XOP we use vpshab, which requires a broadcast of the ++ value to the variable shift insn. For constants this ++ means a V16Q const in mem; even when we can perform the ++ shift with one insn set the cost to prefer paddb. */ ++ if (constant_op1) ++ { ++ if (skip_op1) ++ *skip_op1 = true; ++ return ix86_vec_cost (mode, ++ cost->sse_op ++ + (speed ++ ? 2 ++ : COSTS_N_BYTES ++ (GET_MODE_UNIT_SIZE (mode)))); ++ } ++ count = 3; ++ } ++ else if (TARGET_SSSE3) ++ count = 7; ++ return ix86_vec_cost (mode, cost->sse_op * count); ++ } + else +- e = e - min; +- if (e != i) +- in_order = false; +- dcopy.perm[i] = e; +- } +- dcopy.one_operand_p = true; +- +- if (single_insn_only_p && !in_order) +- return false; +- +- /* For AVX2, test whether we can permute the result in one instruction. 
*/ +- if (d->testing_p) +- { +- if (in_order) +- return true; +- dcopy.op1 = dcopy.op0; +- return expand_vec_perm_1 (&dcopy); ++ return ix86_vec_cost (mode, cost->sse_op); + } +- +- shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode)); +- if (GET_MODE_SIZE (d->vmode) == 16) ++ if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) + { +- target = gen_reg_rtx (TImode); +- emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1), +- gen_lowpart (TImode, dcopy.op0), shift)); ++ if (constant_op1) ++ { ++ if (op1_val > 32) ++ return cost->shift_const + COSTS_N_INSNS (2); ++ else ++ return cost->shift_const * 2; ++ } ++ else ++ { ++ if (and_in_op1) ++ return cost->shift_var * 2; ++ else ++ return cost->shift_var * 6 + COSTS_N_INSNS (2); ++ } + } + else + { +- target = gen_reg_rtx (V2TImode); +- emit_insn (gen_avx2_palignrv2ti (target, +- gen_lowpart (V2TImode, dcopy.op1), +- gen_lowpart (V2TImode, dcopy.op0), +- shift)); +- } +- +- dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target); +- +- /* Test for the degenerate case where the alignment by itself +- produces the desired permutation. */ +- if (in_order) +- { +- emit_move_insn (d->target, dcopy.op0); +- return true; ++ if (constant_op1) ++ return cost->shift_const; ++ else if (shift_and_truncate) ++ { ++ if (skip_op0) ++ *skip_op0 = *skip_op1 = true; ++ /* Return the cost after shift-and truncation. */ ++ return cost->shift_var; ++ } ++ else ++ return cost->shift_var; + } +- +- ok = expand_vec_perm_1 (&dcopy); +- gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32); +- +- return ok; ++ return cost->shift_const; + } + +-/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify +- the permutation using the SSE4_1 pblendv instruction. Potentially +- reduces permutation from 2 pshufb and or to 1 pshufb and pblendv. */ ++/* Compute a (partial) cost for rtx X. Return true if the complete ++ cost has been computed, and false if subexpressions should be ++ scanned. In either case, *TOTAL contains the cost result. */ + + static bool +-expand_vec_perm_pblendv (struct expand_vec_perm_d *d) ++ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, ++ int *total, bool speed) + { +- unsigned i, which, nelt = d->nelt; +- struct expand_vec_perm_d dcopy, dcopy1; +- machine_mode vmode = d->vmode; +- bool ok; +- +- /* Use the same checks as in expand_vec_perm_blend. */ +- if (d->one_operand_p) +- return false; +- if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32) +- ; +- else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode)) +- ; +- else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16) +- ; +- else +- return false; +- +- /* Figure out where permutation elements stay not in their +- respective lanes. */ +- for (i = 0, which = 0; i < nelt; ++i) +- { +- unsigned e = d->perm[i]; +- if (e != i) +- which |= (e < nelt ? 1 : 2); +- } +- /* We can pblend the part where elements stay not in their +- respective lanes only when these elements are all in one +- half of a permutation. +- {0 1 8 3 4 5 9 7} is ok as 8, 9 are at not at their respective +- lanes, but both 8 and 9 >= 8 +- {0 1 8 3 4 5 2 7} is not ok as 2 and 8 are not at their +- respective lanes and 8 >= 8, but 2 not. */ +- if (which != 1 && which != 2) +- return false; +- if (d->testing_p && GET_MODE_SIZE (vmode) == 16) +- return true; +- +- /* First we apply one operand permutation to the part where +- elements stay not in their respective lanes. 
*/ +- dcopy = *d; +- if (which == 2) +- dcopy.op0 = dcopy.op1 = d->op1; +- else +- dcopy.op0 = dcopy.op1 = d->op0; +- if (!d->testing_p) +- dcopy.target = gen_reg_rtx (vmode); +- dcopy.one_operand_p = true; +- +- for (i = 0; i < nelt; ++i) +- dcopy.perm[i] = d->perm[i] & (nelt - 1); +- +- ok = expand_vec_perm_1 (&dcopy); +- if (GET_MODE_SIZE (vmode) != 16 && !ok) +- return false; +- else +- gcc_assert (ok); +- if (d->testing_p) +- return true; +- +- /* Next we put permuted elements into their positions. */ +- dcopy1 = *d; +- if (which == 2) +- dcopy1.op1 = dcopy.target; +- else +- dcopy1.op0 = dcopy.target; +- +- for (i = 0; i < nelt; ++i) +- dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i); ++ rtx mask; ++ enum rtx_code code = GET_CODE (x); ++ enum rtx_code outer_code = (enum rtx_code) outer_code_i; ++ const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost; ++ int src_cost; + +- ok = expand_vec_perm_blend (&dcopy1); +- gcc_assert (ok); ++ switch (code) ++ { ++ case SET: ++ if (register_operand (SET_DEST (x), VOIDmode) ++ && register_operand (SET_SRC (x), VOIDmode)) ++ { ++ *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x))); ++ return true; ++ } + +- return true; +-} ++ if (register_operand (SET_SRC (x), VOIDmode)) ++ /* Avoid potentially incorrect high cost from rtx_costs ++ for non-tieable SUBREGs. */ ++ src_cost = 0; ++ else ++ { ++ src_cost = rtx_cost (SET_SRC (x), mode, SET, 1, speed); + +-static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d); ++ if (CONSTANT_P (SET_SRC (x))) ++ /* Constant costs assume a base value of COSTS_N_INSNS (1) and add ++ a small value, possibly zero for cheap constants. */ ++ src_cost += COSTS_N_INSNS (1); ++ } + +-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify +- a two vector permutation into a single vector permutation by using +- an interleave operation to merge the vectors. */ ++ *total = src_cost + rtx_cost (SET_DEST (x), mode, SET, 0, speed); ++ return true; + +-static bool +-expand_vec_perm_interleave2 (struct expand_vec_perm_d *d) +-{ +- struct expand_vec_perm_d dremap, dfinal; +- unsigned i, nelt = d->nelt, nelt2 = nelt / 2; +- unsigned HOST_WIDE_INT contents; +- unsigned char remap[2 * MAX_VECT_LEN]; +- rtx_insn *seq; +- bool ok, same_halves = false; ++ case CONST_INT: ++ case CONST: ++ case LABEL_REF: ++ case SYMBOL_REF: ++ if (x86_64_immediate_operand (x, VOIDmode)) ++ *total = 0; ++ else ++ *total = 1; ++ return true; + +- if (GET_MODE_SIZE (d->vmode) == 16) +- { +- if (d->one_operand_p) +- return false; +- } +- else if (GET_MODE_SIZE (d->vmode) == 32) +- { +- if (!TARGET_AVX) +- return false; +- /* For 32-byte modes allow even d->one_operand_p. +- The lack of cross-lane shuffling in some instructions +- might prevent a single insn shuffle. */ +- dfinal = *d; +- dfinal.testing_p = true; +- /* If expand_vec_perm_interleave3 can expand this into +- a 3 insn sequence, give up and let it be expanded as +- 3 insn sequence. While that is one insn longer, +- it doesn't need a memory operand and in the common +- case that both interleave low and high permutations +- with the same operands are adjacent needs 4 insns +- for both after CSE. 
*/ +- if (expand_vec_perm_interleave3 (&dfinal)) +- return false; +- } +- else +- return false; ++ case CONST_DOUBLE: ++ if (IS_STACK_MODE (mode)) ++ switch (standard_80387_constant_p (x)) ++ { ++ case -1: ++ case 0: ++ break; ++ case 1: /* 0.0 */ ++ *total = 1; ++ return true; ++ default: /* Other constants */ ++ *total = 2; ++ return true; ++ } ++ /* FALLTHRU */ + +- /* Examine from whence the elements come. */ +- contents = 0; +- for (i = 0; i < nelt; ++i) +- contents |= HOST_WIDE_INT_1U << d->perm[i]; ++ case CONST_VECTOR: ++ switch (standard_sse_constant_p (x, mode)) ++ { ++ case 0: ++ break; ++ case 1: /* 0: xor eliminates false dependency */ ++ *total = 0; ++ return true; ++ default: /* -1: cmp contains false dependency */ ++ *total = 1; ++ return true; ++ } ++ /* FALLTHRU */ + +- memset (remap, 0xff, sizeof (remap)); +- dremap = *d; ++ case CONST_WIDE_INT: ++ /* Fall back to (MEM (SYMBOL_REF)), since that's where ++ it'll probably end up. Add a penalty for size. */ ++ *total = (COSTS_N_INSNS (1) ++ + (!TARGET_64BIT && flag_pic) ++ + (GET_MODE_SIZE (mode) <= 4 ++ ? 0 : GET_MODE_SIZE (mode) <= 8 ? 1 : 2)); ++ return true; + +- if (GET_MODE_SIZE (d->vmode) == 16) +- { +- unsigned HOST_WIDE_INT h1, h2, h3, h4; ++ case ZERO_EXTEND: ++ /* The zero extensions is often completely free on x86_64, so make ++ it as cheap as possible. */ ++ if (TARGET_64BIT && mode == DImode ++ && GET_MODE (XEXP (x, 0)) == SImode) ++ *total = 1; ++ else if (TARGET_ZERO_EXTEND_WITH_AND) ++ *total = cost->add; ++ else ++ *total = cost->movzx; ++ return false; + +- /* Split the two input vectors into 4 halves. */ +- h1 = (HOST_WIDE_INT_1U << nelt2) - 1; +- h2 = h1 << nelt2; +- h3 = h2 << nelt2; +- h4 = h3 << nelt2; ++ case SIGN_EXTEND: ++ *total = cost->movsx; ++ return false; + +- /* If the elements from the low halves use interleave low, and similarly +- for interleave high. If the elements are from mis-matched halves, we +- can use shufps for V4SF/V4SI or do a DImode shuffle. 
*/ +- if ((contents & (h1 | h3)) == contents) +- { +- /* punpckl* */ +- for (i = 0; i < nelt2; ++i) +- { +- remap[i] = i * 2; +- remap[i + nelt] = i * 2 + 1; +- dremap.perm[i * 2] = i; +- dremap.perm[i * 2 + 1] = i + nelt; +- } +- if (!TARGET_SSE2 && d->vmode == V4SImode) +- dremap.vmode = V4SFmode; +- } +- else if ((contents & (h2 | h4)) == contents) +- { +- /* punpckh* */ +- for (i = 0; i < nelt2; ++i) +- { +- remap[i + nelt2] = i * 2; +- remap[i + nelt + nelt2] = i * 2 + 1; +- dremap.perm[i * 2] = i + nelt2; +- dremap.perm[i * 2 + 1] = i + nelt + nelt2; +- } +- if (!TARGET_SSE2 && d->vmode == V4SImode) +- dremap.vmode = V4SFmode; +- } +- else if ((contents & (h1 | h4)) == contents) ++ case ASHIFT: ++ if (SCALAR_INT_MODE_P (mode) ++ && GET_MODE_SIZE (mode) < UNITS_PER_WORD ++ && CONST_INT_P (XEXP (x, 1))) + { +- /* shufps */ +- for (i = 0; i < nelt2; ++i) ++ HOST_WIDE_INT value = INTVAL (XEXP (x, 1)); ++ if (value == 1) + { +- remap[i] = i; +- remap[i + nelt + nelt2] = i + nelt2; +- dremap.perm[i] = i; +- dremap.perm[i + nelt2] = i + nelt + nelt2; ++ *total = cost->add; ++ return false; + } +- if (nelt != 4) ++ if ((value == 2 || value == 3) ++ && cost->lea <= cost->shift_const) + { +- /* shufpd */ +- dremap.vmode = V2DImode; +- dremap.nelt = 2; +- dremap.perm[0] = 0; +- dremap.perm[1] = 3; ++ *total = cost->lea; ++ return false; + } + } +- else if ((contents & (h2 | h3)) == contents) ++ /* FALLTHRU */ ++ ++ case ROTATE: ++ case ASHIFTRT: ++ case LSHIFTRT: ++ case ROTATERT: ++ bool skip_op0, skip_op1; ++ *total = ix86_shift_rotate_cost (cost, mode, CONSTANT_P (XEXP (x, 1)), ++ CONST_INT_P (XEXP (x, 1)) ++ ? INTVAL (XEXP (x, 1)) : -1, ++ speed, ++ GET_CODE (XEXP (x, 1)) == AND, ++ SUBREG_P (XEXP (x, 1)) ++ && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND, ++ &skip_op0, &skip_op1); ++ if (skip_op0 || skip_op1) + { +- /* shufps */ +- for (i = 0; i < nelt2; ++i) +- { +- remap[i + nelt2] = i; +- remap[i + nelt] = i + nelt2; +- dremap.perm[i] = i + nelt2; +- dremap.perm[i + nelt2] = i + nelt; +- } +- if (nelt != 4) +- { +- /* shufpd */ +- dremap.vmode = V2DImode; +- dremap.nelt = 2; +- dremap.perm[0] = 1; +- dremap.perm[1] = 2; +- } ++ if (!skip_op0) ++ *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed); ++ if (!skip_op1) ++ *total += rtx_cost (XEXP (x, 1), mode, code, 0, speed); ++ return true; + } +- else +- return false; +- } +- else +- { +- unsigned int nelt4 = nelt / 4, nzcnt = 0; +- unsigned HOST_WIDE_INT q[8]; +- unsigned int nonzero_halves[4]; ++ return false; + +- /* Split the two input vectors into 8 quarters. */ +- q[0] = (HOST_WIDE_INT_1U << nelt4) - 1; +- for (i = 1; i < 8; ++i) +- q[i] = q[0] << (nelt4 * i); +- for (i = 0; i < 4; ++i) +- if (((q[2 * i] | q[2 * i + 1]) & contents) != 0) +- { +- nonzero_halves[nzcnt] = i; +- ++nzcnt; +- } ++ case FMA: ++ { ++ rtx sub; + +- if (nzcnt == 1) +- { +- gcc_assert (d->one_operand_p); +- nonzero_halves[1] = nonzero_halves[0]; +- same_halves = true; +- } +- else if (d->one_operand_p) +- { +- gcc_assert (nonzero_halves[0] == 0); +- gcc_assert (nonzero_halves[1] == 1); +- } ++ gcc_assert (FLOAT_MODE_P (mode)); ++ gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F); ++ ++ *total = ix86_vec_cost (mode, ++ GET_MODE_INNER (mode) == SFmode ++ ? cost->fmass : cost->fmasd); ++ *total += rtx_cost (XEXP (x, 1), mode, FMA, 1, speed); ++ ++ /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. 
*/ ++ sub = XEXP (x, 0); ++ if (GET_CODE (sub) == NEG) ++ sub = XEXP (sub, 0); ++ *total += rtx_cost (sub, mode, FMA, 0, speed); ++ ++ sub = XEXP (x, 2); ++ if (GET_CODE (sub) == NEG) ++ sub = XEXP (sub, 0); ++ *total += rtx_cost (sub, mode, FMA, 2, speed); ++ return true; ++ } + +- if (nzcnt <= 2) ++ case MULT: ++ if (!FLOAT_MODE_P (mode) && !VECTOR_MODE_P (mode)) + { +- if (d->perm[0] / nelt2 == nonzero_halves[1]) ++ rtx op0 = XEXP (x, 0); ++ rtx op1 = XEXP (x, 1); ++ int nbits; ++ if (CONST_INT_P (XEXP (x, 1))) + { +- /* Attempt to increase the likelihood that dfinal +- shuffle will be intra-lane. */ +- std::swap (nonzero_halves[0], nonzero_halves[1]); ++ unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1)); ++ for (nbits = 0; value != 0; value &= value - 1) ++ nbits++; + } ++ else ++ /* This is arbitrary. */ ++ nbits = 7; + +- /* vperm2f128 or vperm2i128. */ +- for (i = 0; i < nelt2; ++i) ++ /* Compute costs correctly for widening multiplication. */ ++ if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND) ++ && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2 ++ == GET_MODE_SIZE (mode)) + { +- remap[i + nonzero_halves[1] * nelt2] = i + nelt2; +- remap[i + nonzero_halves[0] * nelt2] = i; +- dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2; +- dremap.perm[i] = i + nonzero_halves[0] * nelt2; ++ int is_mulwiden = 0; ++ machine_mode inner_mode = GET_MODE (op0); ++ ++ if (GET_CODE (op0) == GET_CODE (op1)) ++ is_mulwiden = 1, op1 = XEXP (op1, 0); ++ else if (CONST_INT_P (op1)) ++ { ++ if (GET_CODE (op0) == SIGN_EXTEND) ++ is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode) ++ == INTVAL (op1); ++ else ++ is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode)); ++ } ++ ++ if (is_mulwiden) ++ op0 = XEXP (op0, 0), mode = GET_MODE (op0); + } + +- if (d->vmode != V8SFmode +- && d->vmode != V4DFmode +- && d->vmode != V8SImode) ++ *total = (cost->mult_init[MODE_INDEX (mode)] ++ + nbits * cost->mult_bit ++ + rtx_cost (op0, mode, outer_code, opno, speed) ++ + rtx_cost (op1, mode, outer_code, opno, speed)); ++ ++ return true; ++ } ++ *total = ix86_multiplication_cost (cost, mode); ++ return false; ++ ++ case DIV: ++ case UDIV: ++ case MOD: ++ case UMOD: ++ *total = ix86_division_cost (cost, mode); ++ return false; ++ ++ case PLUS: ++ if (GET_MODE_CLASS (mode) == MODE_INT ++ && GET_MODE_SIZE (mode) <= UNITS_PER_WORD) ++ { ++ if (GET_CODE (XEXP (x, 0)) == PLUS ++ && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT ++ && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1)) ++ && CONSTANT_P (XEXP (x, 1))) + { +- dremap.vmode = V8SImode; +- dremap.nelt = 8; +- for (i = 0; i < 4; ++i) ++ HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1)); ++ if (val == 2 || val == 4 || val == 8) + { +- dremap.perm[i] = i + nonzero_halves[0] * 4; +- dremap.perm[i + 4] = i + nonzero_halves[1] * 4; ++ *total = cost->lea; ++ *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode, ++ outer_code, opno, speed); ++ *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0), mode, ++ outer_code, opno, speed); ++ *total += rtx_cost (XEXP (x, 1), mode, ++ outer_code, opno, speed); ++ return true; + } + } +- } +- else if (d->one_operand_p) +- return false; +- else if (TARGET_AVX2 +- && (contents & (q[0] | q[2] | q[4] | q[6])) == contents) +- { +- /* vpunpckl* */ +- for (i = 0; i < nelt4; ++i) ++ else if (GET_CODE (XEXP (x, 0)) == MULT ++ && CONST_INT_P (XEXP (XEXP (x, 0), 1))) + { +- remap[i] = i * 2; +- remap[i + nelt] = i * 2 + 1; +- remap[i + nelt2] = i * 2 + nelt2; +- remap[i + nelt + nelt2] = i * 2 + nelt2 + 1; +- 
dremap.perm[i * 2] = i; +- dremap.perm[i * 2 + 1] = i + nelt; +- dremap.perm[i * 2 + nelt2] = i + nelt2; +- dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2; ++ HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1)); ++ if (val == 2 || val == 4 || val == 8) ++ { ++ *total = cost->lea; ++ *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode, ++ outer_code, opno, speed); ++ *total += rtx_cost (XEXP (x, 1), mode, ++ outer_code, opno, speed); ++ return true; ++ } + } +- } +- else if (TARGET_AVX2 +- && (contents & (q[1] | q[3] | q[5] | q[7])) == contents) +- { +- /* vpunpckh* */ +- for (i = 0; i < nelt4; ++i) ++ else if (GET_CODE (XEXP (x, 0)) == PLUS) + { +- remap[i + nelt4] = i * 2; +- remap[i + nelt + nelt4] = i * 2 + 1; +- remap[i + nelt2 + nelt4] = i * 2 + nelt2; +- remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1; +- dremap.perm[i * 2] = i + nelt4; +- dremap.perm[i * 2 + 1] = i + nelt + nelt4; +- dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4; +- dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4; ++ /* Add with carry, ignore the cost of adding a carry flag. */ ++ if (ix86_carry_flag_operator (XEXP (XEXP (x, 0), 0), mode)) ++ *total = cost->add; ++ else ++ { ++ *total = cost->lea; ++ *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode, ++ outer_code, opno, speed); ++ } ++ ++ *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode, ++ outer_code, opno, speed); ++ *total += rtx_cost (XEXP (x, 1), mode, ++ outer_code, opno, speed); ++ return true; + } + } +- else +- return false; +- } ++ /* FALLTHRU */ + +- /* Use the remapping array set up above to move the elements from their +- swizzled locations into their final destinations. */ +- dfinal = *d; +- for (i = 0; i < nelt; ++i) +- { +- unsigned e = remap[d->perm[i]]; +- gcc_assert (e < nelt); +- /* If same_halves is true, both halves of the remapped vector are the +- same. Avoid cross-lane accesses if possible. */ +- if (same_halves && i >= nelt2) ++ case MINUS: ++ /* Subtract with borrow, ignore the cost of subtracting a carry flag. */ ++ if (GET_MODE_CLASS (mode) == MODE_INT ++ && GET_MODE_SIZE (mode) <= UNITS_PER_WORD ++ && GET_CODE (XEXP (x, 0)) == MINUS ++ && ix86_carry_flag_operator (XEXP (XEXP (x, 0), 1), mode)) + { +- gcc_assert (e < nelt2); +- dfinal.perm[i] = e + nelt2; ++ *total = cost->add; ++ *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode, ++ outer_code, opno, speed); ++ *total += rtx_cost (XEXP (x, 1), mode, ++ outer_code, opno, speed); ++ return true; + } +- else +- dfinal.perm[i] = e; +- } +- if (!d->testing_p) +- { +- dremap.target = gen_reg_rtx (dremap.vmode); +- dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target); +- } +- dfinal.op1 = dfinal.op0; +- dfinal.one_operand_p = true; + +- /* Test if the final remap can be done with a single insn. For V4SFmode or +- V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */ +- start_sequence (); +- ok = expand_vec_perm_1 (&dfinal); +- seq = get_insns (); +- end_sequence (); +- +- if (!ok) +- return false; +- +- if (d->testing_p) +- return true; +- +- if (dremap.vmode != dfinal.vmode) +- { +- dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0); +- dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1); +- } +- +- ok = expand_vec_perm_1 (&dremap); +- gcc_assert (ok); +- +- emit_insn (seq); +- return true; +-} +- +-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify +- a single vector cross-lane permutation into vpermq followed +- by any of the single insn permutations. 
*/ +- +-static bool +-expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d) +-{ +- struct expand_vec_perm_d dremap, dfinal; +- unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4; +- unsigned contents[2]; +- bool ok; +- +- if (!(TARGET_AVX2 +- && (d->vmode == V32QImode || d->vmode == V16HImode) +- && d->one_operand_p)) +- return false; +- +- contents[0] = 0; +- contents[1] = 0; +- for (i = 0; i < nelt2; ++i) +- { +- contents[0] |= 1u << (d->perm[i] / nelt4); +- contents[1] |= 1u << (d->perm[i + nelt2] / nelt4); +- } +- +- for (i = 0; i < 2; ++i) +- { +- unsigned int cnt = 0; +- for (j = 0; j < 4; ++j) +- if ((contents[i] & (1u << j)) != 0 && ++cnt > 2) ++ if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) ++ { ++ *total = cost->addss; + return false; +- } +- +- if (d->testing_p) +- return true; +- +- dremap = *d; +- dremap.vmode = V4DImode; +- dremap.nelt = 4; +- dremap.target = gen_reg_rtx (V4DImode); +- dremap.op0 = gen_lowpart (V4DImode, d->op0); +- dremap.op1 = dremap.op0; +- dremap.one_operand_p = true; +- for (i = 0; i < 2; ++i) +- { +- unsigned int cnt = 0; +- for (j = 0; j < 4; ++j) +- if ((contents[i] & (1u << j)) != 0) +- dremap.perm[2 * i + cnt++] = j; +- for (; cnt < 2; ++cnt) +- dremap.perm[2 * i + cnt] = 0; +- } +- +- dfinal = *d; +- dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target); +- dfinal.op1 = dfinal.op0; +- dfinal.one_operand_p = true; +- for (i = 0, j = 0; i < nelt; ++i) +- { +- if (i == nelt2) +- j = 2; +- dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0); +- if ((d->perm[i] / nelt4) == dremap.perm[j]) +- ; +- else if ((d->perm[i] / nelt4) == dremap.perm[j + 1]) +- dfinal.perm[i] |= nelt4; +- else +- gcc_unreachable (); +- } +- +- ok = expand_vec_perm_1 (&dremap); +- gcc_assert (ok); +- +- ok = expand_vec_perm_1 (&dfinal); +- gcc_assert (ok); +- +- return true; +-} +- +-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand +- a vector permutation using two instructions, vperm2f128 resp. +- vperm2i128 followed by any single in-lane permutation. */ +- +-static bool +-expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d) +-{ +- struct expand_vec_perm_d dfirst, dsecond; +- unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm; +- bool ok; ++ } ++ else if (X87_FLOAT_MODE_P (mode)) ++ { ++ *total = cost->fadd; ++ return false; ++ } ++ else if (FLOAT_MODE_P (mode)) ++ { ++ *total = ix86_vec_cost (mode, cost->addss); ++ return false; ++ } ++ /* FALLTHRU */ + +- if (!TARGET_AVX +- || GET_MODE_SIZE (d->vmode) != 32 +- || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2)) +- return false; ++ case AND: ++ case IOR: ++ case XOR: ++ if (GET_MODE_CLASS (mode) == MODE_INT ++ && GET_MODE_SIZE (mode) > UNITS_PER_WORD) ++ { ++ *total = (cost->add * 2 ++ + (rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed) ++ << (GET_MODE (XEXP (x, 0)) != DImode)) ++ + (rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed) ++ << (GET_MODE (XEXP (x, 1)) != DImode))); ++ return true; ++ } ++ /* FALLTHRU */ + +- dsecond = *d; +- dsecond.one_operand_p = false; +- dsecond.testing_p = true; +- +- /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128 +- immediate. For perm < 16 the second permutation uses +- d->op0 as first operand, for perm >= 16 it uses d->op1 +- as first operand. The second operand is the result of +- vperm2[fi]128. */ +- for (perm = 0; perm < 32; perm++) +- { +- /* Ignore permutations which do not move anything cross-lane. */ +- if (perm < 16) +- { +- /* The second shuffle for e.g. V4DFmode has +- 0123 and ABCD operands. 
+- Ignore AB23, as 23 is already in the second lane +- of the first operand. */ +- if ((perm & 0xc) == (1 << 2)) continue; +- /* And 01CD, as 01 is in the first lane of the first +- operand. */ +- if ((perm & 3) == 0) continue; +- /* And 4567, as then the vperm2[fi]128 doesn't change +- anything on the original 4567 second operand. */ +- if ((perm & 0xf) == ((3 << 2) | 2)) continue; ++ case NEG: ++ if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) ++ { ++ *total = cost->sse_op; ++ return false; + } +- else ++ else if (X87_FLOAT_MODE_P (mode)) + { +- /* The second shuffle for e.g. V4DFmode has +- 4567 and ABCD operands. +- Ignore AB67, as 67 is already in the second lane +- of the first operand. */ +- if ((perm & 0xc) == (3 << 2)) continue; +- /* And 45CD, as 45 is in the first lane of the first +- operand. */ +- if ((perm & 3) == 2) continue; +- /* And 0123, as then the vperm2[fi]128 doesn't change +- anything on the original 0123 first operand. */ +- if ((perm & 0xf) == (1 << 2)) continue; +- } +- +- for (i = 0; i < nelt; i++) +- { +- j = d->perm[i] / nelt2; +- if (j == ((perm >> (2 * (i >= nelt2))) & 3)) +- dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1)); +- else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16)) +- dsecond.perm[i] = d->perm[i] & (nelt - 1); +- else +- break; ++ *total = cost->fchs; ++ return false; + } +- +- if (i == nelt) ++ else if (FLOAT_MODE_P (mode)) + { +- start_sequence (); +- ok = expand_vec_perm_1 (&dsecond); +- end_sequence (); ++ *total = ix86_vec_cost (mode, cost->sse_op); ++ return false; + } ++ /* FALLTHRU */ ++ ++ case NOT: ++ if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) ++ *total = ix86_vec_cost (mode, cost->sse_op); ++ else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) ++ *total = cost->add * 2; + else +- ok = false; ++ *total = cost->add; ++ return false; + +- if (ok) ++ case COMPARE: ++ if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT ++ && XEXP (XEXP (x, 0), 1) == const1_rtx ++ && CONST_INT_P (XEXP (XEXP (x, 0), 2)) ++ && XEXP (x, 1) == const0_rtx) + { +- if (d->testing_p) +- return true; +- +- /* Found a usable second shuffle. dfirst will be +- vperm2f128 on d->op0 and d->op1. */ +- dsecond.testing_p = false; +- dfirst = *d; +- dfirst.target = gen_reg_rtx (d->vmode); +- for (i = 0; i < nelt; i++) +- dfirst.perm[i] = (i & (nelt2 - 1)) +- + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2; +- +- canonicalize_perm (&dfirst); +- ok = expand_vec_perm_1 (&dfirst); +- gcc_assert (ok); +- +- /* And dsecond is some single insn shuffle, taking +- d->op0 and result of vperm2f128 (if perm < 16) or +- d->op1 and result of vperm2f128 (otherwise). */ +- if (perm >= 16) +- dsecond.op0 = dsecond.op1; +- dsecond.op1 = dfirst.target; +- +- ok = expand_vec_perm_1 (&dsecond); +- gcc_assert (ok); +- ++ /* This kind of construct is implemented using test[bwl]. ++ Treat it as if we had an AND. */ ++ mode = GET_MODE (XEXP (XEXP (x, 0), 0)); ++ *total = (cost->add ++ + rtx_cost (XEXP (XEXP (x, 0), 0), mode, outer_code, ++ opno, speed) ++ + rtx_cost (const1_rtx, mode, outer_code, opno, speed)); + return true; + } + +- /* For one operand, the only useful vperm2f128 permutation is 0x01 +- aka lanes swap. */ +- if (d->one_operand_p) +- return false; +- } ++ /* The embedded comparison operand is completely free. */ ++ if (!general_operand (XEXP (x, 0), GET_MODE (XEXP (x, 0))) ++ && XEXP (x, 1) == const0_rtx) ++ *total = 0; + +- return false; +-} ++ return false; + +-/* A subroutine of ix86_expand_vec_perm_builtin_1. 
Try to simplify +- a two vector permutation using 2 intra-lane interleave insns +- and cross-lane shuffle for 32-byte vectors. */ ++ case FLOAT_EXTEND: ++ if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)) ++ *total = 0; ++ else ++ *total = ix86_vec_cost (mode, cost->addss); ++ return false; + +-static bool +-expand_vec_perm_interleave3 (struct expand_vec_perm_d *d) +-{ +- unsigned i, nelt; +- rtx (*gen) (rtx, rtx, rtx); ++ case FLOAT_TRUNCATE: ++ if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)) ++ *total = cost->fadd; ++ else ++ *total = ix86_vec_cost (mode, cost->addss); ++ return false; + +- if (d->one_operand_p) +- return false; +- if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32) +- ; +- else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode)) +- ; +- else +- return false; ++ case ABS: ++ /* SSE requires memory load for the constant operand. It may make ++ sense to account for this. Of course the constant operand may or ++ may not be reused. */ ++ if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) ++ *total = cost->sse_op; ++ else if (X87_FLOAT_MODE_P (mode)) ++ *total = cost->fabs; ++ else if (FLOAT_MODE_P (mode)) ++ *total = ix86_vec_cost (mode, cost->sse_op); ++ return false; + +- nelt = d->nelt; +- if (d->perm[0] != 0 && d->perm[0] != nelt / 2) +- return false; +- for (i = 0; i < nelt; i += 2) +- if (d->perm[i] != d->perm[0] + i / 2 +- || d->perm[i + 1] != d->perm[0] + i / 2 + nelt) ++ case SQRT: ++ if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) ++ *total = mode == SFmode ? cost->sqrtss : cost->sqrtsd; ++ else if (X87_FLOAT_MODE_P (mode)) ++ *total = cost->fsqrt; ++ else if (FLOAT_MODE_P (mode)) ++ *total = ix86_vec_cost (mode, ++ mode == SFmode ? cost->sqrtss : cost->sqrtsd); + return false; + +- if (d->testing_p) +- return true; ++ case UNSPEC: ++ if (XINT (x, 1) == UNSPEC_TP) ++ *total = 0; ++ return false; + +- switch (d->vmode) +- { +- case E_V32QImode: +- if (d->perm[0]) +- gen = gen_vec_interleave_highv32qi; +- else +- gen = gen_vec_interleave_lowv32qi; +- break; +- case E_V16HImode: +- if (d->perm[0]) +- gen = gen_vec_interleave_highv16hi; +- else +- gen = gen_vec_interleave_lowv16hi; +- break; +- case E_V8SImode: +- if (d->perm[0]) +- gen = gen_vec_interleave_highv8si; +- else +- gen = gen_vec_interleave_lowv8si; +- break; +- case E_V4DImode: +- if (d->perm[0]) +- gen = gen_vec_interleave_highv4di; +- else +- gen = gen_vec_interleave_lowv4di; +- break; +- case E_V8SFmode: +- if (d->perm[0]) +- gen = gen_vec_interleave_highv8sf; +- else +- gen = gen_vec_interleave_lowv8sf; +- break; +- case E_V4DFmode: +- if (d->perm[0]) +- gen = gen_vec_interleave_highv4df; ++ case VEC_SELECT: ++ case VEC_CONCAT: ++ case VEC_DUPLICATE: ++ /* ??? Assume all of these vector manipulation patterns are ++ recognizable. In which case they all pretty much have the ++ same cost. */ ++ *total = cost->sse_op; ++ return true; ++ case VEC_MERGE: ++ mask = XEXP (x, 2); ++ /* This is masked instruction, assume the same cost, ++ as nonmasked variant. */ ++ if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask))) ++ *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed); + else +- gen = gen_vec_interleave_lowv4df; +- break; ++ *total = cost->sse_op; ++ return true; ++ + default: +- gcc_unreachable (); ++ return false; + } +- +- emit_insn (gen (d->target, d->op0, d->op1)); +- return true; + } + +-/* A subroutine of ix86_expand_vec_perm_builtin_1. 
Try to implement +- a single vector permutation using a single intra-lane vector +- permutation, vperm2f128 swapping the lanes and vblend* insn blending +- the non-swapped and swapped vectors together. */ +- +-static bool +-expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d) +-{ +- struct expand_vec_perm_d dfirst, dsecond; +- unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2; +- rtx_insn *seq; +- bool ok; +- rtx (*blend) (rtx, rtx, rtx, rtx) = NULL; +- +- if (!TARGET_AVX +- || TARGET_AVX2 +- || (d->vmode != V8SFmode && d->vmode != V4DFmode) +- || !d->one_operand_p) +- return false; +- +- dfirst = *d; +- for (i = 0; i < nelt; i++) +- dfirst.perm[i] = 0xff; +- for (i = 0, msk = 0; i < nelt; i++) +- { +- j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2; +- if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i]) +- return false; +- dfirst.perm[j] = d->perm[i]; +- if (j != i) +- msk |= (1 << i); +- } +- for (i = 0; i < nelt; i++) +- if (dfirst.perm[i] == 0xff) +- dfirst.perm[i] = i; +- +- if (!d->testing_p) +- dfirst.target = gen_reg_rtx (dfirst.vmode); +- +- start_sequence (); +- ok = expand_vec_perm_1 (&dfirst); +- seq = get_insns (); +- end_sequence (); +- +- if (!ok) +- return false; +- +- if (d->testing_p) +- return true; +- +- emit_insn (seq); +- +- dsecond = *d; +- dsecond.op0 = dfirst.target; +- dsecond.op1 = dfirst.target; +- dsecond.one_operand_p = true; +- dsecond.target = gen_reg_rtx (dsecond.vmode); +- for (i = 0; i < nelt; i++) +- dsecond.perm[i] = i ^ nelt2; +- +- ok = expand_vec_perm_1 (&dsecond); +- gcc_assert (ok); ++#if TARGET_MACHO + +- blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256; +- emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk))); +- return true; +-} ++static int current_machopic_label_num; + +-/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF +- permutation using two vperm2f128, followed by a vshufpd insn blending +- the two vectors together. */ ++/* Given a symbol name and its associated stub, write out the ++ definition of the stub. */ + +-static bool +-expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d) ++void ++machopic_output_stub (FILE *file, const char *symb, const char *stub) + { +- struct expand_vec_perm_d dfirst, dsecond, dthird; +- bool ok; +- +- if (!TARGET_AVX || (d->vmode != V4DFmode)) +- return false; +- +- if (d->testing_p) +- return true; +- +- dfirst = *d; +- dsecond = *d; +- dthird = *d; +- +- dfirst.perm[0] = (d->perm[0] & ~1); +- dfirst.perm[1] = (d->perm[0] & ~1) + 1; +- dfirst.perm[2] = (d->perm[2] & ~1); +- dfirst.perm[3] = (d->perm[2] & ~1) + 1; +- dsecond.perm[0] = (d->perm[1] & ~1); +- dsecond.perm[1] = (d->perm[1] & ~1) + 1; +- dsecond.perm[2] = (d->perm[3] & ~1); +- dsecond.perm[3] = (d->perm[3] & ~1) + 1; +- dthird.perm[0] = (d->perm[0] % 2); +- dthird.perm[1] = (d->perm[1] % 2) + 4; +- dthird.perm[2] = (d->perm[2] % 2) + 2; +- dthird.perm[3] = (d->perm[3] % 2) + 6; +- +- dfirst.target = gen_reg_rtx (dfirst.vmode); +- dsecond.target = gen_reg_rtx (dsecond.vmode); +- dthird.op0 = dfirst.target; +- dthird.op1 = dsecond.target; +- dthird.one_operand_p = false; +- +- canonicalize_perm (&dfirst); +- canonicalize_perm (&dsecond); +- +- ok = expand_vec_perm_1 (&dfirst) +- && expand_vec_perm_1 (&dsecond) +- && expand_vec_perm_1 (&dthird); ++ unsigned int length; ++ char *binder_name, *symbol_name, lazy_ptr_name[32]; ++ int label = ++current_machopic_label_num; + +- gcc_assert (ok); ++ /* For 64-bit we shouldn't get here. 
*/ ++ gcc_assert (!TARGET_64BIT); + +- return true; +-} ++ /* Lose our funky encoding stuff so it doesn't contaminate the stub. */ ++ symb = targetm.strip_name_encoding (symb); + +-/* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word +- permutation with two pshufb insns and an ior. We should have already +- failed all two instruction sequences. */ ++ length = strlen (stub); ++ binder_name = XALLOCAVEC (char, length + 32); ++ GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length); + +-static bool +-expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d) +-{ +- rtx rperm[2][16], vperm, l, h, op, m128; +- unsigned int i, nelt, eltsz; ++ length = strlen (symb); ++ symbol_name = XALLOCAVEC (char, length + 32); ++ GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length); + +- if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16) +- return false; +- gcc_assert (!d->one_operand_p); ++ sprintf (lazy_ptr_name, "L%d$lz", label); + +- if (d->testing_p) +- return true; ++ if (MACHOPIC_ATT_STUB) ++ switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]); ++ else if (MACHOPIC_PURE) ++ switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]); ++ else ++ switch_to_section (darwin_sections[machopic_symbol_stub_section]); + +- nelt = d->nelt; +- eltsz = GET_MODE_UNIT_SIZE (d->vmode); ++ fprintf (file, "%s:\n", stub); ++ fprintf (file, "\t.indirect_symbol %s\n", symbol_name); + +- /* Generate two permutation masks. If the required element is within +- the given vector it is shuffled into the proper lane. If the required +- element is in the other vector, force a zero into the lane by setting +- bit 7 in the permutation mask. */ +- m128 = GEN_INT (-128); +- for (i = 0; i < nelt; ++i) ++ if (MACHOPIC_ATT_STUB) + { +- unsigned j, e = d->perm[i]; +- unsigned which = (e >= nelt); +- if (e >= nelt) +- e -= nelt; +- +- for (j = 0; j < eltsz; ++j) +- { +- rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j); +- rperm[1-which][i*eltsz + j] = m128; +- } ++ fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n"); + } +- +- vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0])); +- vperm = force_reg (V16QImode, vperm); +- +- l = gen_reg_rtx (V16QImode); +- op = gen_lowpart (V16QImode, d->op0); +- emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm)); +- +- vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1])); +- vperm = force_reg (V16QImode, vperm); +- +- h = gen_reg_rtx (V16QImode); +- op = gen_lowpart (V16QImode, d->op1); +- emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm)); +- +- op = d->target; +- if (d->vmode != V16QImode) +- op = gen_reg_rtx (V16QImode); +- emit_insn (gen_iorv16qi3 (op, l, h)); +- if (op != d->target) +- emit_move_insn (d->target, gen_lowpart (d->vmode, op)); +- +- return true; +-} +- +-/* Implement arbitrary permutation of one V32QImode and V16QImode operand +- with two vpshufb insns, vpermq and vpor. We should have already failed +- all two or three instruction sequences. */ +- +-static bool +-expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d) +-{ +- rtx rperm[2][32], vperm, l, h, hp, op, m128; +- unsigned int i, nelt, eltsz; +- +- if (!TARGET_AVX2 +- || !d->one_operand_p +- || (d->vmode != V32QImode && d->vmode != V16HImode)) +- return false; +- +- if (d->testing_p) +- return true; +- +- nelt = d->nelt; +- eltsz = GET_MODE_UNIT_SIZE (d->vmode); +- +- /* Generate two permutation masks. If the required element is within +- the same lane, it is shuffled in. 
If the required element from the +- other lane, force a zero by setting bit 7 in the permutation mask. +- In the other mask the mask has non-negative elements if element +- is requested from the other lane, but also moved to the other lane, +- so that the result of vpshufb can have the two V2TImode halves +- swapped. */ +- m128 = GEN_INT (-128); +- for (i = 0; i < nelt; ++i) ++ else if (MACHOPIC_PURE) + { +- unsigned j, e = d->perm[i] & (nelt / 2 - 1); +- unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz; +- +- for (j = 0; j < eltsz; ++j) +- { +- rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j); +- rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128; +- } ++ /* PIC stub. */ ++ /* 25-byte PIC stub using "CALL get_pc_thunk". */ ++ rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */); ++ output_set_got (tmp, NULL_RTX); /* "CALL ___.get_pc_thunk.cx". */ ++ fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n", ++ label, lazy_ptr_name, label); ++ fprintf (file, "\tjmp\t*%%ecx\n"); + } ++ else ++ fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name); + +- vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1])); +- vperm = force_reg (V32QImode, vperm); +- +- h = gen_reg_rtx (V32QImode); +- op = gen_lowpart (V32QImode, d->op0); +- emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm)); ++ /* The AT&T-style ("self-modifying") stub is not lazily bound, thus ++ it needs no stub-binding-helper. */ ++ if (MACHOPIC_ATT_STUB) ++ return; + +- /* Swap the 128-byte lanes of h into hp. */ +- hp = gen_reg_rtx (V4DImode); +- op = gen_lowpart (V4DImode, h); +- emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx, +- const1_rtx)); ++ fprintf (file, "%s:\n", binder_name); + +- vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0])); +- vperm = force_reg (V32QImode, vperm); ++ if (MACHOPIC_PURE) ++ { ++ fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name); ++ fprintf (file, "\tpushl\t%%ecx\n"); ++ } ++ else ++ fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name); + +- l = gen_reg_rtx (V32QImode); +- op = gen_lowpart (V32QImode, d->op0); +- emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm)); ++ fputs ("\tjmp\tdyld_stub_binding_helper\n", file); + +- op = d->target; +- if (d->vmode != V32QImode) +- op = gen_reg_rtx (V32QImode); +- emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp))); +- if (op != d->target) +- emit_move_insn (d->target, gen_lowpart (d->vmode, op)); ++ /* N.B. Keep the correspondence of these ++ 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the ++ old-pic/new-pic/non-pic stubs; altering this will break ++ compatibility with existing dylibs. */ ++ if (MACHOPIC_PURE) ++ { ++ /* 25-byte PIC stub using "CALL get_pc_thunk". */ ++ switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]); ++ } ++ else ++ /* 16-byte -mdynamic-no-pic stub. */ ++ switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]); + +- return true; ++ fprintf (file, "%s:\n", lazy_ptr_name); ++ fprintf (file, "\t.indirect_symbol %s\n", symbol_name); ++ fprintf (file, ASM_LONG "%s\n", binder_name); + } ++#endif /* TARGET_MACHO */ + +-/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even +- and extract-odd permutations of two V32QImode and V16QImode operand +- with two vpshufb insns, vpor and vpermq. We should have already +- failed all two or three instruction sequences. */ ++/* Order the registers for register allocator. 
*/ + +-static bool +-expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d) ++void ++x86_order_regs_for_local_alloc (void) + { +- rtx rperm[2][32], vperm, l, h, ior, op, m128; +- unsigned int i, nelt, eltsz; +- +- if (!TARGET_AVX2 +- || d->one_operand_p +- || (d->vmode != V32QImode && d->vmode != V16HImode)) +- return false; +- +- for (i = 0; i < d->nelt; ++i) +- if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2)) +- return false; +- +- if (d->testing_p) +- return true; ++ int pos = 0; ++ int i; + +- nelt = d->nelt; +- eltsz = GET_MODE_UNIT_SIZE (d->vmode); +- +- /* Generate two permutation masks. In the first permutation mask +- the first quarter will contain indexes for the first half +- of the op0, the second quarter will contain bit 7 set, third quarter +- will contain indexes for the second half of the op0 and the +- last quarter bit 7 set. In the second permutation mask +- the first quarter will contain bit 7 set, the second quarter +- indexes for the first half of the op1, the third quarter bit 7 set +- and last quarter indexes for the second half of the op1. +- I.e. the first mask e.g. for V32QImode extract even will be: +- 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128 +- (all values masked with 0xf except for -128) and second mask +- for extract even will be +- -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */ +- m128 = GEN_INT (-128); +- for (i = 0; i < nelt; ++i) +- { +- unsigned j, e = d->perm[i] & (nelt / 2 - 1); +- unsigned which = d->perm[i] >= nelt; +- unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0; ++ /* First allocate the local general purpose registers. */ ++ for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) ++ if (GENERAL_REGNO_P (i) && call_used_or_fixed_reg_p (i)) ++ reg_alloc_order [pos++] = i; + +- for (j = 0; j < eltsz; ++j) +- { +- rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j); +- rperm[1 - which][(i * eltsz + j) ^ xorv] = m128; +- } +- } ++ /* Global general purpose registers. */ ++ for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) ++ if (GENERAL_REGNO_P (i) && !call_used_or_fixed_reg_p (i)) ++ reg_alloc_order [pos++] = i; + +- vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0])); +- vperm = force_reg (V32QImode, vperm); ++ /* x87 registers come first in case we are doing FP math ++ using them. */ ++ if (!TARGET_SSE_MATH) ++ for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++) ++ reg_alloc_order [pos++] = i; + +- l = gen_reg_rtx (V32QImode); +- op = gen_lowpart (V32QImode, d->op0); +- emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm)); ++ /* SSE registers. */ ++ for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++) ++ reg_alloc_order [pos++] = i; ++ for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++) ++ reg_alloc_order [pos++] = i; + +- vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1])); +- vperm = force_reg (V32QImode, vperm); ++ /* Extended REX SSE registers. */ ++ for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++) ++ reg_alloc_order [pos++] = i; + +- h = gen_reg_rtx (V32QImode); +- op = gen_lowpart (V32QImode, d->op1); +- emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm)); ++ /* Mask register. */ ++ for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++) ++ reg_alloc_order [pos++] = i; + +- ior = gen_reg_rtx (V32QImode); +- emit_insn (gen_iorv32qi3 (ior, l, h)); ++ /* x87 registers. 
*/ ++ if (TARGET_SSE_MATH) ++ for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++) ++ reg_alloc_order [pos++] = i; + +- /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */ +- op = gen_reg_rtx (V4DImode); +- ior = gen_lowpart (V4DImode, ior); +- emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx, +- const1_rtx, GEN_INT (3))); +- emit_move_insn (d->target, gen_lowpart (d->vmode, op)); ++ for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++) ++ reg_alloc_order [pos++] = i; + +- return true; ++ /* Initialize the rest of array as we do not allocate some registers ++ at all. */ ++ while (pos < FIRST_PSEUDO_REGISTER) ++ reg_alloc_order [pos++] = 0; + } + +-/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even +- and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands +- with two "and" and "pack" or two "shift" and "pack" insns. We should +- have already failed all two instruction sequences. */ +- + static bool +-expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d) ++ix86_ms_bitfield_layout_p (const_tree record_type) + { +- rtx op, dop0, dop1, t; +- unsigned i, odd, c, s, nelt = d->nelt; +- bool end_perm = false; +- machine_mode half_mode; +- rtx (*gen_and) (rtx, rtx, rtx); +- rtx (*gen_pack) (rtx, rtx, rtx); +- rtx (*gen_shift) (rtx, rtx, rtx); ++ return ((TARGET_MS_BITFIELD_LAYOUT ++ && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type))) ++ || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type))); ++} + +- if (d->one_operand_p) +- return false; ++/* Returns an expression indicating where the this parameter is ++ located on entry to the FUNCTION. */ + +- switch (d->vmode) +- { +- case E_V8HImode: +- /* Required for "pack". */ +- if (!TARGET_SSE4_1) +- return false; +- c = 0xffff; +- s = 16; +- half_mode = V4SImode; +- gen_and = gen_andv4si3; +- gen_pack = gen_sse4_1_packusdw; +- gen_shift = gen_lshrv4si3; +- break; +- case E_V16QImode: +- /* No check as all instructions are SSE2. */ +- c = 0xff; +- s = 8; +- half_mode = V8HImode; +- gen_and = gen_andv8hi3; +- gen_pack = gen_sse2_packuswb; +- gen_shift = gen_lshrv8hi3; +- break; +- case E_V16HImode: +- if (!TARGET_AVX2) +- return false; +- c = 0xffff; +- s = 16; +- half_mode = V8SImode; +- gen_and = gen_andv8si3; +- gen_pack = gen_avx2_packusdw; +- gen_shift = gen_lshrv8si3; +- end_perm = true; +- break; +- case E_V32QImode: +- if (!TARGET_AVX2) +- return false; +- c = 0xff; +- s = 8; +- half_mode = V16HImode; +- gen_and = gen_andv16hi3; +- gen_pack = gen_avx2_packuswb; +- gen_shift = gen_lshrv16hi3; +- end_perm = true; +- break; +- default: +- /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than +- general shuffles. */ +- return false; +- } ++static rtx ++x86_this_parameter (tree function) ++{ ++ tree type = TREE_TYPE (function); ++ bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0; ++ int nregs; + +- /* Check that permutation is even or odd. 
*/ +- odd = d->perm[0]; +- if (odd > 1) +- return false; ++ if (TARGET_64BIT) ++ { ++ const int *parm_regs; + +- for (i = 1; i < nelt; ++i) +- if (d->perm[i] != 2 * i + odd) +- return false; ++ if (ix86_function_type_abi (type) == MS_ABI) ++ parm_regs = x86_64_ms_abi_int_parameter_registers; ++ else ++ parm_regs = x86_64_int_parameter_registers; ++ return gen_rtx_REG (Pmode, parm_regs[aggr]); ++ } + +- if (d->testing_p) +- return true; ++ nregs = ix86_function_regparm (type, function); + +- dop0 = gen_reg_rtx (half_mode); +- dop1 = gen_reg_rtx (half_mode); +- if (odd == 0) +- { +- t = gen_const_vec_duplicate (half_mode, GEN_INT (c)); +- t = force_reg (half_mode, t); +- emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0))); +- emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1))); +- } +- else ++ if (nregs > 0 && !stdarg_p (type)) + { +- emit_insn (gen_shift (dop0, +- gen_lowpart (half_mode, d->op0), +- GEN_INT (s))); +- emit_insn (gen_shift (dop1, +- gen_lowpart (half_mode, d->op1), +- GEN_INT (s))); +- } +- /* In AVX2 for 256 bit case we need to permute pack result. */ +- if (TARGET_AVX2 && end_perm) +- { +- op = gen_reg_rtx (d->vmode); +- t = gen_reg_rtx (V4DImode); +- emit_insn (gen_pack (op, dop0, dop1)); +- emit_insn (gen_avx2_permv4di_1 (t, +- gen_lowpart (V4DImode, op), +- const0_rtx, +- const2_rtx, +- const1_rtx, +- GEN_INT (3))); +- emit_move_insn (d->target, gen_lowpart (d->vmode, t)); ++ int regno; ++ unsigned int ccvt = ix86_get_callcvt (type); ++ ++ if ((ccvt & IX86_CALLCVT_FASTCALL) != 0) ++ regno = aggr ? DX_REG : CX_REG; ++ else if ((ccvt & IX86_CALLCVT_THISCALL) != 0) ++ { ++ regno = CX_REG; ++ if (aggr) ++ return gen_rtx_MEM (SImode, ++ plus_constant (Pmode, stack_pointer_rtx, 4)); ++ } ++ else ++ { ++ regno = AX_REG; ++ if (aggr) ++ { ++ regno = DX_REG; ++ if (nregs == 1) ++ return gen_rtx_MEM (SImode, ++ plus_constant (Pmode, ++ stack_pointer_rtx, 4)); ++ } ++ } ++ return gen_rtx_REG (SImode, regno); + } +- else +- emit_insn (gen_pack (d->target, dop0, dop1)); + +- return true; ++ return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx, ++ aggr ? 8 : 4)); + } + +-/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even +- and extract-odd permutations of two V64QI operands +- with two "shifts", two "truncs" and one "concat" insns for "odd" +- and two "truncs" and one concat insn for "even." +- Have already failed all two instruction sequences. */ ++/* Determine whether x86_output_mi_thunk can succeed. */ + + static bool +-expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d) ++x86_can_output_mi_thunk (const_tree, HOST_WIDE_INT, HOST_WIDE_INT vcall_offset, ++ const_tree function) + { +- rtx t1, t2, t3, t4; +- unsigned i, odd, nelt = d->nelt; +- +- if (!TARGET_AVX512BW +- || d->one_operand_p +- || d->vmode != V64QImode) +- return false; +- +- /* Check that permutation is even or odd. */ +- odd = d->perm[0]; +- if (odd > 1) +- return false; +- +- for (i = 1; i < nelt; ++i) +- if (d->perm[i] != 2 * i + odd) +- return false; +- +- if (d->testing_p) ++ /* 64-bit can handle anything. */ ++ if (TARGET_64BIT) + return true; + ++ /* For 32-bit, everything's fine if we have one free register. 
*/ ++ if (ix86_function_regparm (TREE_TYPE (function), function) < 3) ++ return true; + +- if (odd) +- { +- t1 = gen_reg_rtx (V32HImode); +- t2 = gen_reg_rtx (V32HImode); +- emit_insn (gen_lshrv32hi3 (t1, +- gen_lowpart (V32HImode, d->op0), +- GEN_INT (8))); +- emit_insn (gen_lshrv32hi3 (t2, +- gen_lowpart (V32HImode, d->op1), +- GEN_INT (8))); +- } +- else +- { +- t1 = gen_lowpart (V32HImode, d->op0); +- t2 = gen_lowpart (V32HImode, d->op1); +- } ++ /* Need a free register for vcall_offset. */ ++ if (vcall_offset) ++ return false; + +- t3 = gen_reg_rtx (V32QImode); +- t4 = gen_reg_rtx (V32QImode); +- emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1)); +- emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2)); +- emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4)); ++ /* Need a free register for GOT references. */ ++ if (flag_pic && !targetm.binds_local_p (function)) ++ return false; + ++ /* Otherwise ok. */ + return true; + } + +-/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even +- and extract-odd permutations. */ ++/* Output the assembler code for a thunk function. THUNK_DECL is the ++ declaration for the thunk function itself, FUNCTION is the decl for ++ the target function. DELTA is an immediate constant offset to be ++ added to THIS. If VCALL_OFFSET is nonzero, the word at ++ *(*this + vcall_offset) should be added to THIS. */ + +-static bool +-expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd) ++static void ++x86_output_mi_thunk (FILE *file, tree thunk_fndecl, HOST_WIDE_INT delta, ++ HOST_WIDE_INT vcall_offset, tree function) + { +- rtx t1, t2, t3, t4, t5; ++ const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk_fndecl)); ++ rtx this_param = x86_this_parameter (function); ++ rtx this_reg, tmp, fnaddr; ++ unsigned int tmp_regno; ++ rtx_insn *insn; + +- switch (d->vmode) ++ if (TARGET_64BIT) ++ tmp_regno = R10_REG; ++ else + { +- case E_V4DFmode: +- if (d->testing_p) +- break; +- t1 = gen_reg_rtx (V4DFmode); +- t2 = gen_reg_rtx (V4DFmode); +- +- /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */ +- emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20))); +- emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31))); +- +- /* Now an unpck[lh]pd will produce the result required. */ +- if (odd) +- t3 = gen_avx_unpckhpd256 (d->target, t1, t2); ++ unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function)); ++ if ((ccvt & IX86_CALLCVT_FASTCALL) != 0) ++ tmp_regno = AX_REG; ++ else if ((ccvt & IX86_CALLCVT_THISCALL) != 0) ++ tmp_regno = DX_REG; + else +- t3 = gen_avx_unpcklpd256 (d->target, t1, t2); +- emit_insn (t3); +- break; ++ tmp_regno = CX_REG; ++ } + +- case E_V8SFmode: +- { +- int mask = odd ? 0xdd : 0x88; ++ emit_note (NOTE_INSN_PROLOGUE_END); + +- if (d->testing_p) +- break; +- t1 = gen_reg_rtx (V8SFmode); +- t2 = gen_reg_rtx (V8SFmode); +- t3 = gen_reg_rtx (V8SFmode); +- +- /* Shuffle within the 128-bit lanes to produce: +- { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */ +- emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1, +- GEN_INT (mask))); +- +- /* Shuffle the lanes around to produce: +- { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */ +- emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1, +- GEN_INT (0x3))); +- +- /* Shuffle within the 128-bit lanes to produce: +- { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */ +- emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44))); +- +- /* Shuffle within the 128-bit lanes to produce: +- { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. 
*/ +- emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee))); +- +- /* Shuffle the lanes around to produce: +- { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */ +- emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2, +- GEN_INT (0x20))); +- } +- break; ++ /* CET is enabled, insert EB instruction. */ ++ if ((flag_cf_protection & CF_BRANCH)) ++ emit_insn (gen_nop_endbr ()); + +- case E_V2DFmode: +- case E_V4SFmode: +- case E_V2DImode: +- case E_V4SImode: +- /* These are always directly implementable by expand_vec_perm_1. */ +- gcc_unreachable (); ++ /* If VCALL_OFFSET, we'll need THIS in a register. Might as well ++ pull it in now and let DELTA benefit. */ ++ if (REG_P (this_param)) ++ this_reg = this_param; ++ else if (vcall_offset) ++ { ++ /* Put the this parameter into %eax. */ ++ this_reg = gen_rtx_REG (Pmode, AX_REG); ++ emit_move_insn (this_reg, this_param); ++ } ++ else ++ this_reg = NULL_RTX; + +- case E_V8HImode: +- if (TARGET_SSE4_1) +- return expand_vec_perm_even_odd_pack (d); +- else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB) +- return expand_vec_perm_pshufb2 (d); +- else ++ /* Adjust the this parameter by a fixed constant. */ ++ if (delta) ++ { ++ rtx delta_rtx = GEN_INT (delta); ++ rtx delta_dst = this_reg ? this_reg : this_param; ++ ++ if (TARGET_64BIT) + { +- if (d->testing_p) +- break; +- /* We need 2*log2(N)-1 operations to achieve odd/even +- with interleave. */ +- t1 = gen_reg_rtx (V8HImode); +- t2 = gen_reg_rtx (V8HImode); +- emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1)); +- emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1)); +- emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1)); +- emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1)); +- if (odd) +- t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2); +- else +- t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2); +- emit_insn (t3); ++ if (!x86_64_general_operand (delta_rtx, Pmode)) ++ { ++ tmp = gen_rtx_REG (Pmode, tmp_regno); ++ emit_move_insn (tmp, delta_rtx); ++ delta_rtx = tmp; ++ } + } +- break; + +- case E_V16QImode: +- return expand_vec_perm_even_odd_pack (d); ++ ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx); ++ } ++ ++ /* Adjust the this parameter by a value stored in the vtable. */ ++ if (vcall_offset) ++ { ++ rtx vcall_addr, vcall_mem, this_mem; + +- case E_V16HImode: +- case E_V32QImode: +- return expand_vec_perm_even_odd_pack (d); ++ tmp = gen_rtx_REG (Pmode, tmp_regno); + +- case E_V64QImode: +- return expand_vec_perm_even_odd_trunc (d); ++ this_mem = gen_rtx_MEM (ptr_mode, this_reg); ++ if (Pmode != ptr_mode) ++ this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem); ++ emit_move_insn (tmp, this_mem); + +- case E_V4DImode: +- if (!TARGET_AVX2) ++ /* Adjust the this parameter. 
*/ ++ vcall_addr = plus_constant (Pmode, tmp, vcall_offset); ++ if (TARGET_64BIT ++ && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true)) + { +- struct expand_vec_perm_d d_copy = *d; +- d_copy.vmode = V4DFmode; +- if (d->testing_p) +- d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1); +- else +- d_copy.target = gen_reg_rtx (V4DFmode); +- d_copy.op0 = gen_lowpart (V4DFmode, d->op0); +- d_copy.op1 = gen_lowpart (V4DFmode, d->op1); +- if (expand_vec_perm_even_odd_1 (&d_copy, odd)) +- { +- if (!d->testing_p) +- emit_move_insn (d->target, +- gen_lowpart (V4DImode, d_copy.target)); +- return true; +- } +- return false; ++ rtx tmp2 = gen_rtx_REG (Pmode, R11_REG); ++ emit_move_insn (tmp2, GEN_INT (vcall_offset)); ++ vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2); + } + +- if (d->testing_p) +- break; +- +- t1 = gen_reg_rtx (V4DImode); +- t2 = gen_reg_rtx (V4DImode); ++ vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr); ++ if (Pmode != ptr_mode) ++ emit_insn (gen_addsi_1_zext (this_reg, ++ gen_rtx_REG (ptr_mode, ++ REGNO (this_reg)), ++ vcall_mem)); ++ else ++ ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem); ++ } + +- /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */ +- emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20))); +- emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31))); ++ /* If necessary, drop THIS back to its stack slot. */ ++ if (this_reg && this_reg != this_param) ++ emit_move_insn (this_param, this_reg); + +- /* Now an vpunpck[lh]qdq will produce the result required. */ +- if (odd) +- t3 = gen_avx2_interleave_highv4di (d->target, t1, t2); ++ fnaddr = XEXP (DECL_RTL (function), 0); ++ if (TARGET_64BIT) ++ { ++ if (!flag_pic || targetm.binds_local_p (function) ++ || TARGET_PECOFF) ++ ; + else +- t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2); +- emit_insn (t3); +- break; +- +- case E_V8SImode: +- if (!TARGET_AVX2) + { +- struct expand_vec_perm_d d_copy = *d; +- d_copy.vmode = V8SFmode; +- if (d->testing_p) +- d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1); +- else +- d_copy.target = gen_reg_rtx (V8SFmode); +- d_copy.op0 = gen_lowpart (V8SFmode, d->op0); +- d_copy.op1 = gen_lowpart (V8SFmode, d->op1); +- if (expand_vec_perm_even_odd_1 (&d_copy, odd)) +- { +- if (!d->testing_p) +- emit_move_insn (d->target, +- gen_lowpart (V8SImode, d_copy.target)); +- return true; +- } +- return false; ++ tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL); ++ tmp = gen_rtx_CONST (Pmode, tmp); ++ fnaddr = gen_const_mem (Pmode, tmp); ++ } ++ } ++ else ++ { ++ if (!flag_pic || targetm.binds_local_p (function)) ++ ; ++#if TARGET_MACHO ++ else if (TARGET_MACHO) ++ { ++ fnaddr = machopic_indirect_call_target (DECL_RTL (function)); ++ fnaddr = XEXP (fnaddr, 0); + } ++#endif /* TARGET_MACHO */ ++ else ++ { ++ tmp = gen_rtx_REG (Pmode, CX_REG); ++ output_set_got (tmp, NULL_RTX); + +- if (d->testing_p) +- break; ++ fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT); ++ fnaddr = gen_rtx_CONST (Pmode, fnaddr); ++ fnaddr = gen_rtx_PLUS (Pmode, tmp, fnaddr); ++ fnaddr = gen_const_mem (Pmode, fnaddr); ++ } ++ } + +- t1 = gen_reg_rtx (V8SImode); +- t2 = gen_reg_rtx (V8SImode); +- t3 = gen_reg_rtx (V4DImode); +- t4 = gen_reg_rtx (V4DImode); +- t5 = gen_reg_rtx (V4DImode); +- +- /* Shuffle the lanes around into +- { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. 
*/ +- emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0), +- gen_lowpart (V4DImode, d->op1), +- GEN_INT (0x20))); +- emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0), +- gen_lowpart (V4DImode, d->op1), +- GEN_INT (0x31))); +- +- /* Swap the 2nd and 3rd position in each lane into +- { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */ +- emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3), +- GEN_INT (2 * 4 + 1 * 16 + 3 * 64))); +- emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4), +- GEN_INT (2 * 4 + 1 * 16 + 3 * 64))); +- +- /* Now an vpunpck[lh]qdq will produce +- { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */ +- if (odd) +- t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1), +- gen_lowpart (V4DImode, t2)); ++ /* Our sibling call patterns do not allow memories, because we have no ++ predicate that can distinguish between frame and non-frame memory. ++ For our purposes here, we can get away with (ab)using a jump pattern, ++ because we're going to do no optimization. */ ++ if (MEM_P (fnaddr)) ++ { ++ if (sibcall_insn_operand (fnaddr, word_mode)) ++ { ++ fnaddr = XEXP (DECL_RTL (function), 0); ++ tmp = gen_rtx_MEM (QImode, fnaddr); ++ tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx); ++ tmp = emit_call_insn (tmp); ++ SIBLING_CALL_P (tmp) = 1; ++ } + else +- t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1), +- gen_lowpart (V4DImode, t2)); +- emit_insn (t3); +- emit_move_insn (d->target, gen_lowpart (V8SImode, t5)); +- break; ++ emit_jump_insn (gen_indirect_jump (fnaddr)); ++ } ++ else ++ { ++ if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr)) ++ { ++ // CM_LARGE_PIC always uses pseudo PIC register which is ++ // uninitialized. Since FUNCTION is local and calling it ++ // doesn't go through PLT, we use scratch register %r11 as ++ // PIC register and initialize it here. ++ pic_offset_table_rtx = gen_rtx_REG (Pmode, R11_REG); ++ ix86_init_large_pic_reg (tmp_regno); ++ fnaddr = legitimize_pic_address (fnaddr, ++ gen_rtx_REG (Pmode, tmp_regno)); ++ } + +- default: +- gcc_unreachable (); ++ if (!sibcall_insn_operand (fnaddr, word_mode)) ++ { ++ tmp = gen_rtx_REG (word_mode, tmp_regno); ++ if (GET_MODE (fnaddr) != word_mode) ++ fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr); ++ emit_move_insn (tmp, fnaddr); ++ fnaddr = tmp; ++ } ++ ++ tmp = gen_rtx_MEM (QImode, fnaddr); ++ tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx); ++ tmp = emit_call_insn (tmp); ++ SIBLING_CALL_P (tmp) = 1; + } ++ emit_barrier (); + +- return true; ++ /* Emit just enough of rest_of_compilation to get the insns emitted. ++ Note that use_thunk calls assemble_start_function et al. */ ++ insn = get_insns (); ++ shorten_branches (insn); ++ assemble_start_function (thunk_fndecl, fnname); ++ final_start_function (insn, file, 1); ++ final (insn, file, 1); ++ final_end_function (); ++ assemble_end_function (thunk_fndecl, fnname); + } + +-/* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match +- extract-even and extract-odd permutations. 
*/ ++static void ++x86_file_start (void) ++{ ++ default_file_start (); ++ if (TARGET_16BIT) ++ fputs ("\t.code16gcc\n", asm_out_file); ++#if TARGET_MACHO ++ darwin_file_start (); ++#endif ++ if (X86_FILE_START_VERSION_DIRECTIVE) ++ fputs ("\t.version\t\"01.01\"\n", asm_out_file); ++ if (X86_FILE_START_FLTUSED) ++ fputs ("\t.global\t__fltused\n", asm_out_file); ++ if (ix86_asm_dialect == ASM_INTEL) ++ fputs ("\t.intel_syntax noprefix\n", asm_out_file); ++} + +-static bool +-expand_vec_perm_even_odd (struct expand_vec_perm_d *d) ++int ++x86_field_alignment (tree type, int computed) + { +- unsigned i, odd, nelt = d->nelt; ++ machine_mode mode; + +- odd = d->perm[0]; +- if (odd != 0 && odd != 1) +- return false; ++ if (TARGET_64BIT || TARGET_ALIGN_DOUBLE) ++ return computed; ++ if (TARGET_IAMCU) ++ return iamcu_alignment (type, computed); ++ mode = TYPE_MODE (strip_array_types (type)); ++ if (mode == DFmode || mode == DCmode ++ || GET_MODE_CLASS (mode) == MODE_INT ++ || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT) ++ return MIN (32, computed); ++ return computed; ++} + +- for (i = 1; i < nelt; ++i) +- if (d->perm[i] != 2 * i + odd) +- return false; ++/* Print call to TARGET to FILE. */ + +- return expand_vec_perm_even_odd_1 (d, odd); ++static void ++x86_print_call_or_nop (FILE *file, const char *target) ++{ ++ if (flag_nop_mcount || !strcmp (target, "nop")) ++ /* 5 byte nop: nopl 0(%[re]ax,%[re]ax,1) */ ++ fprintf (file, "1:" ASM_BYTE "0x0f, 0x1f, 0x44, 0x00, 0x00\n"); ++ else ++ fprintf (file, "1:\tcall\t%s\n", target); + } + +-/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast +- permutations. We assume that expand_vec_perm_1 has already failed. */ +- + static bool +-expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d) ++current_fentry_name (const char **name) + { +- unsigned elt = d->perm[0], nelt2 = d->nelt / 2; +- machine_mode vmode = d->vmode; +- unsigned char perm2[4]; +- rtx op0 = d->op0, dest; +- bool ok; +- +- switch (vmode) +- { +- case E_V4DFmode: +- case E_V8SFmode: +- /* These are special-cased in sse.md so that we can optionally +- use the vbroadcast instruction. They expand to two insns +- if the input happens to be in a register. */ +- gcc_unreachable (); +- +- case E_V2DFmode: +- case E_V2DImode: +- case E_V4SFmode: +- case E_V4SImode: +- /* These are always implementable using standard shuffle patterns. */ +- gcc_unreachable (); ++ tree attr = lookup_attribute ("fentry_name", ++ DECL_ATTRIBUTES (current_function_decl)); ++ if (!attr) ++ return false; ++ *name = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (attr))); ++ return true; ++} + +- case E_V8HImode: +- case E_V16QImode: +- /* These can be implemented via interleave. We save one insn by +- stopping once we have promoted to V4SImode and then use pshufd. */ +- if (d->testing_p) +- return true; +- do +- { +- rtx dest; +- rtx (*gen) (rtx, rtx, rtx) +- = vmode == V16QImode ? gen_vec_interleave_lowv16qi +- : gen_vec_interleave_lowv8hi; ++static bool ++current_fentry_section (const char **name) ++{ ++ tree attr = lookup_attribute ("fentry_section", ++ DECL_ATTRIBUTES (current_function_decl)); ++ if (!attr) ++ return false; ++ *name = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (attr))); ++ return true; ++} + +- if (elt >= nelt2) +- { +- gen = vmode == V16QImode ? gen_vec_interleave_highv16qi +- : gen_vec_interleave_highv8hi; +- elt -= nelt2; +- } +- nelt2 /= 2; ++/* Output assembler code to FILE to increment profiler label # LABELNO ++ for profiling a function entry. 
*/ ++void ++x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED) ++{ ++ if (cfun->machine->endbr_queued_at_entrance) ++ fprintf (file, "\t%s\n", TARGET_64BIT ? "endbr64" : "endbr32"); + +- dest = gen_reg_rtx (vmode); +- emit_insn (gen (dest, op0, op0)); +- vmode = get_mode_wider_vector (vmode); +- op0 = gen_lowpart (vmode, dest); +- } +- while (vmode != V4SImode); ++ const char *mcount_name = MCOUNT_NAME; + +- memset (perm2, elt, 4); +- dest = gen_reg_rtx (V4SImode); +- ok = expand_vselect (dest, op0, perm2, 4, d->testing_p); +- gcc_assert (ok); +- if (!d->testing_p) +- emit_move_insn (d->target, gen_lowpart (d->vmode, dest)); +- return true; ++ if (current_fentry_name (&mcount_name)) ++ ; ++ else if (fentry_name) ++ mcount_name = fentry_name; ++ else if (flag_fentry) ++ mcount_name = MCOUNT_NAME_BEFORE_PROLOGUE; + +- case E_V64QImode: +- case E_V32QImode: +- case E_V16HImode: +- case E_V8SImode: +- case E_V4DImode: +- /* For AVX2 broadcasts of the first element vpbroadcast* or +- vpermq should be used by expand_vec_perm_1. */ +- gcc_assert (!TARGET_AVX2 || d->perm[0]); +- return false; ++ if (TARGET_64BIT) ++ { ++#ifndef NO_PROFILE_COUNTERS ++ fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno); ++#endif + +- default: +- gcc_unreachable (); ++ if (!TARGET_PECOFF && flag_pic) ++ fprintf (file, "1:\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name); ++ else ++ x86_print_call_or_nop (file, mcount_name); ++ } ++ else if (flag_pic) ++ { ++#ifndef NO_PROFILE_COUNTERS ++ fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n", ++ LPREFIX, labelno); ++#endif ++ fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name); ++ } ++ else ++ { ++#ifndef NO_PROFILE_COUNTERS ++ fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n", ++ LPREFIX, labelno); ++#endif ++ x86_print_call_or_nop (file, mcount_name); + } +-} +- +-/* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match +- broadcast permutations. */ + +-static bool +-expand_vec_perm_broadcast (struct expand_vec_perm_d *d) +-{ +- unsigned i, elt, nelt = d->nelt; +- +- if (!d->one_operand_p) +- return false; ++ if (flag_record_mcount ++ || lookup_attribute ("fentry_section", ++ DECL_ATTRIBUTES (current_function_decl))) ++ { ++ const char *sname = "__mcount_loc"; + +- elt = d->perm[0]; +- for (i = 1; i < nelt; ++i) +- if (d->perm[i] != elt) +- return false; ++ if (current_fentry_section (&sname)) ++ ; ++ else if (fentry_section) ++ sname = fentry_section; + +- return expand_vec_perm_broadcast_1 (d); ++ fprintf (file, "\t.section %s, \"a\",@progbits\n", sname); ++ fprintf (file, "\t.%s 1b\n", TARGET_64BIT ? "quad" : "long"); ++ fprintf (file, "\t.previous\n"); ++ } + } + +-/* Implement arbitrary permutations of two V64QImode operands +- with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */ +-static bool +-expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d) ++/* We don't have exact information about the insn sizes, but we may assume ++ quite safely that we are informed about all 1 byte insns and memory ++ address sizes. This is enough to eliminate unnecessary padding in ++ 99% of cases. 
*/ ++ ++int ++ix86_min_insn_size (rtx_insn *insn) + { +- if (!TARGET_AVX512BW || !(d->vmode == V64QImode)) +- return false; ++ int l = 0, len; + +- if (d->testing_p) +- return true; ++ if (!INSN_P (insn) || !active_insn_p (insn)) ++ return 0; + +- struct expand_vec_perm_d ds[2]; +- rtx rperm[128], vperm, target0, target1; +- unsigned int i, nelt; +- machine_mode vmode; ++ /* Discard alignments we've emit and jump instructions. */ ++ if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE ++ && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN) ++ return 0; + +- nelt = d->nelt; +- vmode = V64QImode; ++ /* Important case - calls are always 5 bytes. ++ It is common to have many calls in the row. */ ++ if (CALL_P (insn) ++ && symbolic_reference_mentioned_p (PATTERN (insn)) ++ && !SIBLING_CALL_P (insn)) ++ return 5; ++ len = get_attr_length (insn); ++ if (len <= 1) ++ return 1; + +- for (i = 0; i < 2; i++) ++ /* For normal instructions we rely on get_attr_length being exact, ++ with a few exceptions. */ ++ if (!JUMP_P (insn)) + { +- ds[i] = *d; +- ds[i].vmode = V32HImode; +- ds[i].nelt = 32; +- ds[i].target = gen_reg_rtx (V32HImode); +- ds[i].op0 = gen_lowpart (V32HImode, d->op0); +- ds[i].op1 = gen_lowpart (V32HImode, d->op1); +- } +- +- /* Prepare permutations such that the first one takes care of +- putting the even bytes into the right positions or one higher +- positions (ds[0]) and the second one takes care of +- putting the odd bytes into the right positions or one below +- (ds[1]). */ ++ enum attr_type type = get_attr_type (insn); + +- for (i = 0; i < nelt; i++) +- { +- ds[i & 1].perm[i / 2] = d->perm[i] / 2; +- if (i & 1) +- { +- rperm[i] = constm1_rtx; +- rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1)); +- } +- else ++ switch (type) + { +- rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1)); +- rperm[i + 64] = constm1_rtx; ++ case TYPE_MULTI: ++ if (GET_CODE (PATTERN (insn)) == ASM_INPUT ++ || asm_noperands (PATTERN (insn)) >= 0) ++ return 0; ++ break; ++ case TYPE_OTHER: ++ case TYPE_FCMP: ++ break; ++ default: ++ /* Otherwise trust get_attr_length. */ ++ return len; + } +- } +- +- bool ok = expand_vec_perm_1 (&ds[0]); +- gcc_assert (ok); +- ds[0].target = gen_lowpart (V64QImode, ds[0].target); +- +- ok = expand_vec_perm_1 (&ds[1]); +- gcc_assert (ok); +- ds[1].target = gen_lowpart (V64QImode, ds[1].target); +- +- vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm)); +- vperm = force_reg (vmode, vperm); +- target0 = gen_reg_rtx (V64QImode); +- emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm)); + +- vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64)); +- vperm = force_reg (vmode, vperm); +- target1 = gen_reg_rtx (V64QImode); +- emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm)); +- +- emit_insn (gen_iorv64qi3 (d->target, target0, target1)); +- return true; ++ l = get_attr_length_address (insn); ++ if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn))) ++ l = 4; ++ } ++ if (l) ++ return 1+l; ++ else ++ return 2; + } + +-/* Implement arbitrary permutation of two V32QImode and V16QImode operands +- with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed +- all the shorter instruction sequences. */ ++#ifdef ASM_OUTPUT_MAX_SKIP_PAD + +-static bool +-expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d) ++/* AMD K8 core mispredicts jumps when there are more than 3 jumps in 16 byte ++ window. 
*/ ++ ++static void ++ix86_avoid_jump_mispredicts (void) + { +- rtx rperm[4][32], vperm, l[2], h[2], op, m128; +- unsigned int i, nelt, eltsz; +- bool used[4]; ++ rtx_insn *insn, *start = get_insns (); ++ int nbytes = 0, njumps = 0; ++ bool isjump = false; + +- if (!TARGET_AVX2 +- || d->one_operand_p +- || (d->vmode != V32QImode && d->vmode != V16HImode)) +- return false; ++ /* Look for all minimal intervals of instructions containing 4 jumps. ++ The intervals are bounded by START and INSN. NBYTES is the total ++ size of instructions in the interval including INSN and not including ++ START. When the NBYTES is smaller than 16 bytes, it is possible ++ that the end of START and INSN ends up in the same 16byte page. + +- if (d->testing_p) +- return true; ++ The smallest offset in the page INSN can start is the case where START ++ ends on the offset 0. Offset of INSN is then NBYTES - sizeof (INSN). ++ We add p2align to 16byte window with maxskip 15 - NBYTES + sizeof (INSN). + +- nelt = d->nelt; +- eltsz = GET_MODE_UNIT_SIZE (d->vmode); +- +- /* Generate 4 permutation masks. If the required element is within +- the same lane, it is shuffled in. If the required element from the +- other lane, force a zero by setting bit 7 in the permutation mask. +- In the other mask the mask has non-negative elements if element +- is requested from the other lane, but also moved to the other lane, +- so that the result of vpshufb can have the two V2TImode halves +- swapped. */ +- m128 = GEN_INT (-128); +- for (i = 0; i < 32; ++i) +- { +- rperm[0][i] = m128; +- rperm[1][i] = m128; +- rperm[2][i] = m128; +- rperm[3][i] = m128; +- } +- used[0] = false; +- used[1] = false; +- used[2] = false; +- used[3] = false; +- for (i = 0; i < nelt; ++i) ++ Don't consider asm goto as jump, while it can contain a jump, it doesn't ++ have to, control transfer to label(s) can be performed through other ++ means, and also we estimate minimum length of all asm stmts as 0. */ ++ for (insn = start; insn; insn = NEXT_INSN (insn)) + { +- unsigned j, e = d->perm[i] & (nelt / 2 - 1); +- unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz; +- unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0); +- +- for (j = 0; j < eltsz; ++j) +- rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j); +- used[which] = true; +- } ++ int min_size; + +- for (i = 0; i < 2; ++i) +- { +- if (!used[2 * i + 1]) ++ if (LABEL_P (insn)) + { +- h[i] = NULL_RTX; ++ align_flags alignment = label_to_alignment (insn); ++ int align = alignment.levels[0].log; ++ int max_skip = alignment.levels[0].maxskip; ++ ++ if (max_skip > 15) ++ max_skip = 15; ++ /* If align > 3, only up to 16 - max_skip - 1 bytes can be ++ already in the current 16 byte page, because otherwise ++ ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer ++ bytes to reach 16 byte boundary. */ ++ if (align <= 0 ++ || (align <= 3 && max_skip != (1 << align) - 1)) ++ max_skip = 0; ++ if (dump_file) ++ fprintf (dump_file, "Label %i with max_skip %i\n", ++ INSN_UID (insn), max_skip); ++ if (max_skip) ++ { ++ while (nbytes + max_skip >= 16) ++ { ++ start = NEXT_INSN (start); ++ if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0) ++ || CALL_P (start)) ++ njumps--, isjump = true; ++ else ++ isjump = false; ++ nbytes -= ix86_min_insn_size (start); ++ } ++ } + continue; + } +- vperm = gen_rtx_CONST_VECTOR (V32QImode, +- gen_rtvec_v (32, rperm[2 * i + 1])); +- vperm = force_reg (V32QImode, vperm); +- h[i] = gen_reg_rtx (V32QImode); +- op = gen_lowpart (V32QImode, i ? 
d->op1 : d->op0); +- emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm)); +- } + +- /* Swap the 128-byte lanes of h[X]. */ +- for (i = 0; i < 2; ++i) +- { +- if (h[i] == NULL_RTX) +- continue; +- op = gen_reg_rtx (V4DImode); +- emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]), +- const2_rtx, GEN_INT (3), const0_rtx, +- const1_rtx)); +- h[i] = gen_lowpart (V32QImode, op); +- } ++ min_size = ix86_min_insn_size (insn); ++ nbytes += min_size; ++ if (dump_file) ++ fprintf (dump_file, "Insn %i estimated to %i bytes\n", ++ INSN_UID (insn), min_size); ++ if ((JUMP_P (insn) && asm_noperands (PATTERN (insn)) < 0) ++ || CALL_P (insn)) ++ njumps++; ++ else ++ continue; + +- for (i = 0; i < 2; ++i) +- { +- if (!used[2 * i]) ++ while (njumps > 3) + { +- l[i] = NULL_RTX; +- continue; ++ start = NEXT_INSN (start); ++ if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0) ++ || CALL_P (start)) ++ njumps--, isjump = true; ++ else ++ isjump = false; ++ nbytes -= ix86_min_insn_size (start); + } +- vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i])); +- vperm = force_reg (V32QImode, vperm); +- l[i] = gen_reg_rtx (V32QImode); +- op = gen_lowpart (V32QImode, i ? d->op1 : d->op0); +- emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm)); +- } ++ gcc_assert (njumps >= 0); ++ if (dump_file) ++ fprintf (dump_file, "Interval %i to %i has %i bytes\n", ++ INSN_UID (start), INSN_UID (insn), nbytes); + +- for (i = 0; i < 2; ++i) +- { +- if (h[i] && l[i]) ++ if (njumps == 3 && isjump && nbytes < 16) + { +- op = gen_reg_rtx (V32QImode); +- emit_insn (gen_iorv32qi3 (op, l[i], h[i])); +- l[i] = op; ++ int padsize = 15 - nbytes + ix86_min_insn_size (insn); ++ ++ if (dump_file) ++ fprintf (dump_file, "Padding insn %i by %i bytes!\n", ++ INSN_UID (insn), padsize); ++ emit_insn_before (gen_pad (GEN_INT (padsize)), insn); + } +- else if (h[i]) +- l[i] = h[i]; + } +- +- gcc_assert (l[0] && l[1]); +- op = d->target; +- if (d->vmode != V32QImode) +- op = gen_reg_rtx (V32QImode); +- emit_insn (gen_iorv32qi3 (op, l[0], l[1])); +- if (op != d->target) +- emit_move_insn (d->target, gen_lowpart (d->vmode, op)); +- return true; + } ++#endif + +-/* The guts of ix86_vectorize_vec_perm_const. With all of the interface bits +- taken care of, perform the expansion in D and return true on success. */ +- +-static bool +-ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) ++/* AMD Athlon works faster ++ when RET is not destination of conditional jump or directly preceded ++ by other jump instruction. We avoid the penalty by inserting NOP just ++ before the RET instructions in such cases. */ ++static void ++ix86_pad_returns (void) + { +- /* Try a single instruction expansion. */ +- if (expand_vec_perm_1 (d)) +- return true; +- +- /* Try sequences of two instructions. */ +- +- if (expand_vec_perm_pshuflw_pshufhw (d)) +- return true; +- +- if (expand_vec_perm_palignr (d, false)) +- return true; +- +- if (expand_vec_perm_interleave2 (d)) +- return true; +- +- if (expand_vec_perm_broadcast (d)) +- return true; +- +- if (expand_vec_perm_vpermq_perm_1 (d)) +- return true; +- +- if (expand_vec_perm_vperm2f128 (d)) +- return true; +- +- if (expand_vec_perm_pblendv (d)) +- return true; +- +- /* Try sequences of three instructions. 
*/ +- +- if (expand_vec_perm_even_odd_pack (d)) +- return true; +- +- if (expand_vec_perm_2vperm2f128_vshuf (d)) +- return true; +- +- if (expand_vec_perm_pshufb2 (d)) +- return true; +- +- if (expand_vec_perm_interleave3 (d)) +- return true; +- +- if (expand_vec_perm_vperm2f128_vblend (d)) +- return true; +- +- /* Try sequences of four instructions. */ +- +- if (expand_vec_perm_even_odd_trunc (d)) +- return true; +- if (expand_vec_perm_vpshufb2_vpermq (d)) +- return true; +- +- if (expand_vec_perm_vpshufb2_vpermq_even_odd (d)) +- return true; +- +- if (expand_vec_perm_vpermt2_vpshub2 (d)) +- return true; ++ edge e; ++ edge_iterator ei; + +- /* ??? Look for narrow permutations whose element orderings would +- allow the promotion to a wider mode. */ ++ FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds) ++ { ++ basic_block bb = e->src; ++ rtx_insn *ret = BB_END (bb); ++ rtx_insn *prev; ++ bool replace = false; + +- /* ??? Look for sequences of interleave or a wider permute that place +- the data into the correct lanes for a half-vector shuffle like +- pshuf[lh]w or vpermilps. */ ++ if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret)) ++ || optimize_bb_for_size_p (bb)) ++ continue; ++ for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev)) ++ if (active_insn_p (prev) || LABEL_P (prev)) ++ break; ++ if (prev && LABEL_P (prev)) ++ { ++ edge e; ++ edge_iterator ei; + +- /* ??? Look for sequences of interleave that produce the desired results. +- The combinatorics of punpck[lh] get pretty ugly... */ ++ FOR_EACH_EDGE (e, ei, bb->preds) ++ if (EDGE_FREQUENCY (e) && e->src->index >= 0 ++ && !(e->flags & EDGE_FALLTHRU)) ++ { ++ replace = true; ++ break; ++ } ++ } ++ if (!replace) ++ { ++ prev = prev_active_insn (ret); ++ if (prev ++ && ((JUMP_P (prev) && any_condjump_p (prev)) ++ || CALL_P (prev))) ++ replace = true; ++ /* Empty functions get branch mispredict even when ++ the jump destination is not visible to us. */ ++ if (!prev && !optimize_function_for_size_p (cfun)) ++ replace = true; ++ } ++ if (replace) ++ { ++ emit_jump_insn_before (gen_simple_return_internal_long (), ret); ++ delete_insn (ret); ++ } ++ } ++} + +- if (expand_vec_perm_even_odd (d)) +- return true; ++/* Count the minimum number of instructions in BB. Return 4 if the ++ number of instructions >= 4. */ + +- /* Even longer sequences. */ +- if (expand_vec_perm_vpshufb4_vpermq2 (d)) +- return true; ++static int ++ix86_count_insn_bb (basic_block bb) ++{ ++ rtx_insn *insn; ++ int insn_count = 0; + +- /* See if we can get the same permutation in different vector integer +- mode. */ +- struct expand_vec_perm_d nd; +- if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd)) ++ /* Count number of instructions in this block. Return 4 if the number ++ of instructions >= 4. */ ++ FOR_BB_INSNS (bb, insn) + { +- if (!d->testing_p) +- emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target)); +- return true; ++ /* Only happen in exit blocks. */ ++ if (JUMP_P (insn) ++ && ANY_RETURN_P (PATTERN (insn))) ++ break; ++ ++ if (NONDEBUG_INSN_P (insn) ++ && GET_CODE (PATTERN (insn)) != USE ++ && GET_CODE (PATTERN (insn)) != CLOBBER) ++ { ++ insn_count++; ++ if (insn_count >= 4) ++ return insn_count; ++ } + } + +- return false; ++ return insn_count; + } + +-/* If a permutation only uses one operand, make it clear. Returns true +- if the permutation references both operands. 
*/ + +-static bool +-canonicalize_perm (struct expand_vec_perm_d *d) +-{ +- int i, which, nelt = d->nelt; ++/* Count the minimum number of instructions in code path in BB. ++ Return 4 if the number of instructions >= 4. */ + +- for (i = which = 0; i < nelt; ++i) +- which |= (d->perm[i] < nelt ? 1 : 2); ++static int ++ix86_count_insn (basic_block bb) ++{ ++ edge e; ++ edge_iterator ei; ++ int min_prev_count; + +- d->one_operand_p = true; +- switch (which) ++ /* Only bother counting instructions along paths with no ++ more than 2 basic blocks between entry and exit. Given ++ that BB has an edge to exit, determine if a predecessor ++ of BB has an edge from entry. If so, compute the number ++ of instructions in the predecessor block. If there ++ happen to be multiple such blocks, compute the minimum. */ ++ min_prev_count = 4; ++ FOR_EACH_EDGE (e, ei, bb->preds) + { +- default: +- gcc_unreachable(); ++ edge prev_e; ++ edge_iterator prev_ei; + +- case 3: +- if (!rtx_equal_p (d->op0, d->op1)) +- { +- d->one_operand_p = false; ++ if (e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun)) ++ { ++ min_prev_count = 0; + break; +- } +- /* The elements of PERM do not suggest that only the first operand +- is used, but both operands are identical. Allow easier matching +- of the permutation by folding the permutation into the single +- input vector. */ +- /* FALLTHRU */ +- +- case 2: +- for (i = 0; i < nelt; ++i) +- d->perm[i] &= nelt - 1; +- d->op0 = d->op1; +- break; +- +- case 1: +- d->op1 = d->op0; +- break; ++ } ++ FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds) ++ { ++ if (prev_e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun)) ++ { ++ int count = ix86_count_insn_bb (e->src); ++ if (count < min_prev_count) ++ min_prev_count = count; ++ break; ++ } ++ } + } + +- return (which == 3); ++ if (min_prev_count < 4) ++ min_prev_count += ix86_count_insn_bb (bb); ++ ++ return min_prev_count; + } + +-/* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */ ++/* Pad short function to 4 instructions. */ + +-static bool +-ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, +- rtx op1, const vec_perm_indices &sel) ++static void ++ix86_pad_short_function (void) + { +- struct expand_vec_perm_d d; +- unsigned char perm[MAX_VECT_LEN]; +- unsigned int i, nelt, which; +- bool two_args; ++ edge e; ++ edge_iterator ei; + +- d.target = target; +- d.op0 = op0; +- d.op1 = op1; ++ FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds) ++ { ++ rtx_insn *ret = BB_END (e->src); ++ if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret))) ++ { ++ int insn_count = ix86_count_insn (e->src); + +- d.vmode = vmode; +- gcc_assert (VECTOR_MODE_P (d.vmode)); +- d.nelt = nelt = GET_MODE_NUNITS (d.vmode); +- d.testing_p = !target; ++ /* Pad short function. */ ++ if (insn_count < 4) ++ { ++ rtx_insn *insn = ret; + +- gcc_assert (sel.length () == nelt); +- gcc_checking_assert (sizeof (d.perm) == sizeof (perm)); ++ /* Find epilogue. */ ++ while (insn ++ && (!NOTE_P (insn) ++ || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG)) ++ insn = PREV_INSN (insn); + +- /* Given sufficient ISA support we can just return true here +- for selected vector modes. */ +- switch (d.vmode) +- { +- case E_V16SFmode: +- case E_V16SImode: +- case E_V8DImode: +- case E_V8DFmode: +- if (!TARGET_AVX512F) +- return false; +- /* All implementable with a single vperm[it]2 insn. */ +- if (d.testing_p) +- return true; +- break; +- case E_V32HImode: +- if (!TARGET_AVX512BW) +- return false; +- if (d.testing_p) +- /* All implementable with a single vperm[it]2 insn. 
*/ +- return true; +- break; +- case E_V64QImode: +- if (!TARGET_AVX512BW) +- return false; +- if (d.testing_p) +- /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */ +- return true; +- break; +- case E_V8SImode: +- case E_V8SFmode: +- case E_V4DFmode: +- case E_V4DImode: +- if (!TARGET_AVX) +- return false; +- if (d.testing_p && TARGET_AVX512VL) +- /* All implementable with a single vperm[it]2 insn. */ +- return true; +- break; +- case E_V16HImode: +- if (!TARGET_SSE2) +- return false; +- if (d.testing_p && TARGET_AVX2) +- /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */ +- return true; +- break; +- case E_V32QImode: +- if (!TARGET_SSE2) +- return false; +- if (d.testing_p && TARGET_AVX2) +- /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */ +- return true; +- break; +- case E_V8HImode: +- case E_V16QImode: +- if (!TARGET_SSE2) +- return false; +- /* Fall through. */ +- case E_V4SImode: +- case E_V4SFmode: +- if (!TARGET_SSE) +- return false; +- /* All implementable with a single vpperm insn. */ +- if (d.testing_p && TARGET_XOP) +- return true; +- /* All implementable with 2 pshufb + 1 ior. */ +- if (d.testing_p && TARGET_SSSE3) +- return true; +- break; +- case E_V2DImode: +- case E_V2DFmode: +- if (!TARGET_SSE) +- return false; +- /* All implementable with shufpd or unpck[lh]pd. */ +- if (d.testing_p) +- return true; +- break; +- default: +- return false; +- } ++ if (!insn) ++ insn = ret; + +- for (i = which = 0; i < nelt; ++i) +- { +- unsigned char e = sel[i]; +- gcc_assert (e < 2 * nelt); +- d.perm[i] = e; +- perm[i] = e; +- which |= (e < nelt ? 1 : 2); ++ /* Two NOPs count as one instruction. */ ++ insn_count = 2 * (4 - insn_count); ++ emit_insn_before (gen_nops (GEN_INT (insn_count)), insn); ++ } ++ } + } ++} + +- if (d.testing_p) +- { +- /* For all elements from second vector, fold the elements to first. */ +- if (which == 2) +- for (i = 0; i < nelt; ++i) +- d.perm[i] -= nelt; +- +- /* Check whether the mask can be applied to the vector type. */ +- d.one_operand_p = (which != 3); +- +- /* Implementable with shufps or pshufd. */ +- if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode)) +- return true; ++/* Fix up a Windows system unwinder issue. If an EH region falls through into ++ the epilogue, the Windows system unwinder will apply epilogue logic and ++ produce incorrect offsets. This can be avoided by adding a nop between ++ the last insn that can throw and the first insn of the epilogue. */ + +- /* Otherwise we have to go through the motions and see if we can +- figure out how to generate the requested permutation. */ +- d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1); +- d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2); +- if (!d.one_operand_p) +- d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3); ++static void ++ix86_seh_fixup_eh_fallthru (void) ++{ ++ edge e; ++ edge_iterator ei; + +- start_sequence (); +- bool ret = ix86_expand_vec_perm_const_1 (&d); +- end_sequence (); ++ FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds) ++ { ++ rtx_insn *insn, *next; + +- return ret; +- } ++ /* Find the beginning of the epilogue. */ ++ for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn)) ++ if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG) ++ break; ++ if (insn == NULL) ++ continue; + +- two_args = canonicalize_perm (&d); ++ /* We only care about preceding insns that can throw. 
*/ ++ insn = prev_active_insn (insn); ++ if (insn == NULL || !can_throw_internal (insn)) ++ continue; + +- if (ix86_expand_vec_perm_const_1 (&d)) +- return true; ++ /* Do not separate calls from their debug information. */ ++ for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next)) ++ if (NOTE_P (next) && NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION) ++ insn = next; ++ else ++ break; + +- /* If the selector says both arguments are needed, but the operands are the +- same, the above tried to expand with one_operand_p and flattened selector. +- If that didn't work, retry without one_operand_p; we succeeded with that +- during testing. */ +- if (two_args && d.one_operand_p) +- { +- d.one_operand_p = false; +- memcpy (d.perm, perm, sizeof (perm)); +- return ix86_expand_vec_perm_const_1 (&d); ++ emit_insn_after (gen_nops (const1_rtx), insn); + } +- +- return false; + } + +-void +-ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd) ++/* Implement machine specific optimizations. We implement padding of returns ++ for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window. */ ++static void ++ix86_reorg (void) + { +- struct expand_vec_perm_d d; +- unsigned i, nelt; +- +- d.target = targ; +- d.op0 = op0; +- d.op1 = op1; +- d.vmode = GET_MODE (targ); +- d.nelt = nelt = GET_MODE_NUNITS (d.vmode); +- d.one_operand_p = false; +- d.testing_p = false; ++ /* We are freeing block_for_insn in the toplev to keep compatibility ++ with old MDEP_REORGS that are not CFG based. Recompute it now. */ ++ compute_bb_for_insn (); + +- for (i = 0; i < nelt; ++i) +- d.perm[i] = i * 2 + odd; ++ if (TARGET_SEH && current_function_has_exception_handlers ()) ++ ix86_seh_fixup_eh_fallthru (); + +- /* We'll either be able to implement the permutation directly... */ +- if (expand_vec_perm_1 (&d)) +- return; ++ if (optimize && optimize_function_for_speed_p (cfun)) ++ { ++ if (TARGET_PAD_SHORT_FUNCTION) ++ ix86_pad_short_function (); ++ else if (TARGET_PAD_RETURNS) ++ ix86_pad_returns (); ++#ifdef ASM_OUTPUT_MAX_SKIP_PAD ++ if (TARGET_FOUR_JUMP_LIMIT) ++ ix86_avoid_jump_mispredicts (); ++#endif ++ } ++} + +- /* ... or we use the special-case patterns. */ +- expand_vec_perm_even_odd_1 (&d, odd); ++/* Return nonzero when QImode register that must be represented via REX prefix ++ is used. */ ++bool ++x86_extended_QIreg_mentioned_p (rtx_insn *insn) ++{ ++ int i; ++ extract_insn_cached (insn); ++ for (i = 0; i < recog_data.n_operands; i++) ++ if (GENERAL_REG_P (recog_data.operand[i]) ++ && !QI_REGNO_P (REGNO (recog_data.operand[i]))) ++ return true; ++ return false; + } + +-static void +-ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p) ++/* Return true when INSN mentions register that must be encoded using REX ++ prefix. */ ++bool ++x86_extended_reg_mentioned_p (rtx insn) + { +- struct expand_vec_perm_d d; +- unsigned i, nelt, base; +- bool ok; +- +- d.target = targ; +- d.op0 = op0; +- d.op1 = op1; +- d.vmode = GET_MODE (targ); +- d.nelt = nelt = GET_MODE_NUNITS (d.vmode); +- d.one_operand_p = false; +- d.testing_p = false; +- +- base = high_p ? nelt / 2 : 0; +- for (i = 0; i < nelt / 2; ++i) ++ subrtx_iterator::array_type array; ++ FOR_EACH_SUBRTX (iter, array, INSN_P (insn) ? PATTERN (insn) : insn, NONCONST) + { +- d.perm[i * 2] = i + base; +- d.perm[i * 2 + 1] = i + base + nelt; ++ const_rtx x = *iter; ++ if (REG_P (x) ++ && (REX_INT_REGNO_P (REGNO (x)) || REX_SSE_REGNO_P (REGNO (x)))) ++ return true; + } +- +- /* Note that for AVX this isn't one instruction. 
*/ +- ok = ix86_expand_vec_perm_const_1 (&d); +- gcc_assert (ok); ++ return false; + } + ++/* If profitable, negate (without causing overflow) integer constant ++ of mode MODE at location LOC. Return true in this case. */ ++bool ++x86_maybe_negate_const_int (rtx *loc, machine_mode mode) ++{ ++ HOST_WIDE_INT val; + +-/* Expand a vector operation CODE for a V*QImode in terms of the +- same operation on V*HImode. */ +- +-void +-ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2) +-{ +- machine_mode qimode = GET_MODE (dest); +- machine_mode himode; +- rtx (*gen_il) (rtx, rtx, rtx); +- rtx (*gen_ih) (rtx, rtx, rtx); +- rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h; +- struct expand_vec_perm_d d; +- bool ok, full_interleave; +- bool uns_p = false; +- int i; ++ if (!CONST_INT_P (*loc)) ++ return false; + +- switch (qimode) ++ switch (mode) + { +- case E_V16QImode: +- himode = V8HImode; +- gen_il = gen_vec_interleave_lowv16qi; +- gen_ih = gen_vec_interleave_highv16qi; +- break; +- case E_V32QImode: +- himode = V16HImode; +- gen_il = gen_avx2_interleave_lowv32qi; +- gen_ih = gen_avx2_interleave_highv32qi; +- break; +- case E_V64QImode: +- himode = V32HImode; +- gen_il = gen_avx512bw_interleave_lowv64qi; +- gen_ih = gen_avx512bw_interleave_highv64qi; +- break; +- default: +- gcc_unreachable (); +- } ++ case E_DImode: ++ /* DImode x86_64 constants must fit in 32 bits. */ ++ gcc_assert (x86_64_immediate_operand (*loc, mode)); + +- op2_l = op2_h = op2; +- switch (code) +- { +- case MULT: +- /* Unpack data such that we've got a source byte in each low byte of +- each word. We don't care what goes into the high byte of each word. +- Rather than trying to get zero in there, most convenient is to let +- it be a copy of the low byte. */ +- op2_l = gen_reg_rtx (qimode); +- op2_h = gen_reg_rtx (qimode); +- emit_insn (gen_il (op2_l, op2, op2)); +- emit_insn (gen_ih (op2_h, op2, op2)); +- +- op1_l = gen_reg_rtx (qimode); +- op1_h = gen_reg_rtx (qimode); +- emit_insn (gen_il (op1_l, op1, op1)); +- emit_insn (gen_ih (op1_h, op1, op1)); +- full_interleave = qimode == V16QImode; ++ mode = SImode; + break; + +- case ASHIFT: +- case LSHIFTRT: +- uns_p = true; +- /* FALLTHRU */ +- case ASHIFTRT: +- op1_l = gen_reg_rtx (himode); +- op1_h = gen_reg_rtx (himode); +- ix86_expand_sse_unpack (op1_l, op1, uns_p, false); +- ix86_expand_sse_unpack (op1_h, op1, uns_p, true); +- full_interleave = true; ++ case E_SImode: ++ case E_HImode: ++ case E_QImode: + break; ++ + default: + gcc_unreachable (); + } + +- /* Perform the operation. */ +- res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX, +- 1, OPTAB_DIRECT); +- res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX, +- 1, OPTAB_DIRECT); +- gcc_assert (res_l && res_h); ++ /* Avoid overflows. */ ++ if (mode_signbit_p (mode, *loc)) ++ return false; + +- /* Merge the data back into the right place. */ +- d.target = dest; +- d.op0 = gen_lowpart (qimode, res_l); +- d.op1 = gen_lowpart (qimode, res_h); +- d.vmode = qimode; +- d.nelt = GET_MODE_NUNITS (qimode); +- d.one_operand_p = false; +- d.testing_p = false; ++ val = INTVAL (*loc); + +- if (full_interleave) ++ /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'. ++ Exceptions: -128 encodes smaller than 128, so swap sign and op. */ ++ if ((val < 0 && val != -128) ++ || val == 128) + { +- /* For SSE2, we used an full interleave, so the desired +- results are in the even elements. 
*/ +- for (i = 0; i < d.nelt; ++i) +- d.perm[i] = i * 2; ++ *loc = GEN_INT (-val); ++ return true; + } +- else +- { +- /* For AVX, the interleave used above was not cross-lane. So the +- extraction is evens but with the second and third quarter swapped. +- Happily, that is even one insn shorter than even extraction. +- For AVX512BW we have 4 lanes. We extract evens from within a lane, +- always first from the first and then from the second source operand, +- the index bits above the low 4 bits remains the same. +- Thus, for d.nelt == 32 we want permutation +- 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62 +- and for d.nelt == 64 we want permutation +- 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94, +- 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */ +- for (i = 0; i < d.nelt; ++i) +- d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15); +- } +- +- ok = ix86_expand_vec_perm_const_1 (&d); +- gcc_assert (ok); + +- set_unique_reg_note (get_last_insn (), REG_EQUAL, +- gen_rtx_fmt_ee (code, qimode, op1, op2)); ++ return false; + } + +-/* Helper function of ix86_expand_mul_widen_evenodd. Return true +- if op is CONST_VECTOR with all odd elements equal to their +- preceding element. */ +- +-static bool +-const_vector_equal_evenodd_p (rtx op) +-{ +- machine_mode mode = GET_MODE (op); +- int i, nunits = GET_MODE_NUNITS (mode); +- if (GET_CODE (op) != CONST_VECTOR +- || nunits != CONST_VECTOR_NUNITS (op)) +- return false; +- for (i = 0; i < nunits; i += 2) +- if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1)) +- return false; +- return true; +-} ++/* Generate an unsigned DImode/SImode to FP conversion. This is the same code ++ optabs would emit if we didn't have TFmode patterns. */ + + void +-ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2, +- bool uns_p, bool odd_p) ++x86_emit_floatuns (rtx operands[2]) + { +- machine_mode mode = GET_MODE (op1); +- machine_mode wmode = GET_MODE (dest); +- rtx x; +- rtx orig_op1 = op1, orig_op2 = op2; +- +- if (!nonimmediate_operand (op1, mode)) +- op1 = force_reg (mode, op1); +- if (!nonimmediate_operand (op2, mode)) +- op2 = force_reg (mode, op2); ++ rtx_code_label *neglab, *donelab; ++ rtx i0, i1, f0, in, out; ++ machine_mode mode, inmode; + +- /* We only play even/odd games with vectors of SImode. */ +- gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode); ++ inmode = GET_MODE (operands[1]); ++ gcc_assert (inmode == SImode || inmode == DImode); + +- /* If we're looking for the odd results, shift those members down to +- the even slots. For some cpus this is faster than a PSHUFD. */ +- if (odd_p) +- { +- /* For XOP use vpmacsdqh, but only for smult, as it is only +- signed. 
*/ +- if (TARGET_XOP && mode == V4SImode && !uns_p) +- { +- x = force_reg (wmode, CONST0_RTX (wmode)); +- emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x)); +- return; +- } ++ out = operands[0]; ++ in = force_reg (inmode, operands[1]); ++ mode = GET_MODE (out); ++ neglab = gen_label_rtx (); ++ donelab = gen_label_rtx (); ++ f0 = gen_reg_rtx (mode); + +- x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode)); +- if (!const_vector_equal_evenodd_p (orig_op1)) +- op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1), +- x, NULL, 1, OPTAB_DIRECT); +- if (!const_vector_equal_evenodd_p (orig_op2)) +- op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2), +- x, NULL, 1, OPTAB_DIRECT); +- op1 = gen_lowpart (mode, op1); +- op2 = gen_lowpart (mode, op2); +- } ++ emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab); + +- if (mode == V16SImode) +- { +- if (uns_p) +- x = gen_vec_widen_umult_even_v16si (dest, op1, op2); +- else +- x = gen_vec_widen_smult_even_v16si (dest, op1, op2); +- } +- else if (mode == V8SImode) +- { +- if (uns_p) +- x = gen_vec_widen_umult_even_v8si (dest, op1, op2); +- else +- x = gen_vec_widen_smult_even_v8si (dest, op1, op2); +- } +- else if (uns_p) +- x = gen_vec_widen_umult_even_v4si (dest, op1, op2); +- else if (TARGET_SSE4_1) +- x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2); +- else +- { +- rtx s1, s2, t0, t1, t2; ++ expand_float (out, in, 0); + +- /* The easiest way to implement this without PMULDQ is to go through +- the motions as if we are performing a full 64-bit multiply. With +- the exception that we need to do less shuffling of the elements. */ ++ emit_jump_insn (gen_jump (donelab)); ++ emit_barrier (); + +- /* Compute the sign-extension, aka highparts, of the two operands. */ +- s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode), +- op1, pc_rtx, pc_rtx); +- s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode), +- op2, pc_rtx, pc_rtx); ++ emit_label (neglab); + +- /* Multiply LO(A) * HI(B), and vice-versa. */ +- t1 = gen_reg_rtx (wmode); +- t2 = gen_reg_rtx (wmode); +- emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2)); +- emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1)); ++ i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL, ++ 1, OPTAB_DIRECT); ++ i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL, ++ 1, OPTAB_DIRECT); ++ i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT); + +- /* Multiply LO(A) * LO(B). */ +- t0 = gen_reg_rtx (wmode); +- emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2)); ++ expand_float (f0, i0, 0); + +- /* Combine and shift the highparts into place. */ +- t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT); +- t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1, +- 1, OPTAB_DIRECT); ++ emit_insn (gen_rtx_SET (out, gen_rtx_PLUS (mode, f0, f0))); + +- /* Combine high and low parts. */ +- force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT); +- return; +- } +- emit_insn (x); ++ emit_label (donelab); + } +- +-void +-ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2, +- bool uns_p, bool high_p) ++ ++/* Target hook for scalar_mode_supported_p. 
*/ ++static bool ++ix86_scalar_mode_supported_p (scalar_mode mode) + { +- machine_mode wmode = GET_MODE (dest); +- machine_mode mode = GET_MODE (op1); +- rtx t1, t2, t3, t4, mask; +- +- switch (mode) +- { +- case E_V4SImode: +- t1 = gen_reg_rtx (mode); +- t2 = gen_reg_rtx (mode); +- if (TARGET_XOP && !uns_p) +- { +- /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case, +- shuffle the elements once so that all elements are in the right +- place for immediate use: { A C B D }. */ +- emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx, +- const1_rtx, GEN_INT (3))); +- emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx, +- const1_rtx, GEN_INT (3))); +- } +- else +- { +- /* Put the elements into place for the multiply. */ +- ix86_expand_vec_interleave (t1, op1, op1, high_p); +- ix86_expand_vec_interleave (t2, op2, op2, high_p); +- high_p = false; +- } +- ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p); +- break; +- +- case E_V8SImode: +- /* Shuffle the elements between the lanes. After this we +- have { A B E F | C D G H } for each operand. */ +- t1 = gen_reg_rtx (V4DImode); +- t2 = gen_reg_rtx (V4DImode); +- emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1), +- const0_rtx, const2_rtx, +- const1_rtx, GEN_INT (3))); +- emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2), +- const0_rtx, const2_rtx, +- const1_rtx, GEN_INT (3))); +- +- /* Shuffle the elements within the lanes. After this we +- have { A A B B | C C D D } or { E E F F | G G H H }. */ +- t3 = gen_reg_rtx (V8SImode); +- t4 = gen_reg_rtx (V8SImode); +- mask = GEN_INT (high_p +- ? 2 + (2 << 2) + (3 << 4) + (3 << 6) +- : 0 + (0 << 2) + (1 << 4) + (1 << 6)); +- emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask)); +- emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask)); +- +- ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false); +- break; +- +- case E_V8HImode: +- case E_V16HImode: +- t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX, +- uns_p, OPTAB_DIRECT); +- t2 = expand_binop (mode, +- uns_p ? umul_highpart_optab : smul_highpart_optab, +- op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT); +- gcc_assert (t1 && t2); +- +- t3 = gen_reg_rtx (mode); +- ix86_expand_vec_interleave (t3, t1, t2, high_p); +- emit_move_insn (dest, gen_lowpart (wmode, t3)); +- break; +- +- case E_V16QImode: +- case E_V32QImode: +- case E_V32HImode: +- case E_V16SImode: +- case E_V64QImode: +- t1 = gen_reg_rtx (wmode); +- t2 = gen_reg_rtx (wmode); +- ix86_expand_sse_unpack (t1, op1, uns_p, high_p); +- ix86_expand_sse_unpack (t2, op2, uns_p, high_p); +- +- emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2))); +- break; +- +- default: +- gcc_unreachable (); +- } ++ if (DECIMAL_FLOAT_MODE_P (mode)) ++ return default_decimal_float_supported_p (); ++ else if (mode == TFmode) ++ return true; ++ else ++ return default_scalar_mode_supported_p (mode); + } + +-void +-ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2) ++/* Implements target hook vector_mode_supported_p. */ ++static bool ++ix86_vector_mode_supported_p (machine_mode mode) + { +- rtx res_1, res_2, res_3, res_4; +- +- res_1 = gen_reg_rtx (V4SImode); +- res_2 = gen_reg_rtx (V4SImode); +- res_3 = gen_reg_rtx (V2DImode); +- res_4 = gen_reg_rtx (V2DImode); +- ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false); +- ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true); +- +- /* Move the results in element 2 down to element 1; we don't care +- what goes in elements 2 and 3. 
Then we can merge the parts +- back together with an interleave. +- +- Note that two other sequences were tried: +- (1) Use interleaves at the start instead of psrldq, which allows +- us to use a single shufps to merge things back at the end. +- (2) Use shufps here to combine the two vectors, then pshufd to +- put the elements in the correct order. +- In both cases the cost of the reformatting stall was too high +- and the overall sequence slower. */ +- +- emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3), +- const0_rtx, const2_rtx, +- const0_rtx, const0_rtx)); +- emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4), +- const0_rtx, const2_rtx, +- const0_rtx, const0_rtx)); +- res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2)); +- +- set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2)); ++ if (TARGET_SSE && VALID_SSE_REG_MODE (mode)) ++ return true; ++ if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode)) ++ return true; ++ if (TARGET_AVX && VALID_AVX256_REG_MODE (mode)) ++ return true; ++ if (TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode)) ++ return true; ++ if (TARGET_MMX && VALID_MMX_REG_MODE (mode)) ++ return true; ++ if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode)) ++ return true; ++ return false; + } + +-void +-ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2) ++/* Target hook for c_mode_for_suffix. */ ++static machine_mode ++ix86_c_mode_for_suffix (char suffix) + { +- machine_mode mode = GET_MODE (op0); +- rtx t1, t2, t3, t4, t5, t6; ++ if (suffix == 'q') ++ return TFmode; ++ if (suffix == 'w') ++ return XFmode; + +- if (TARGET_AVX512DQ && mode == V8DImode) +- emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2)); +- else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode) +- emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2)); +- else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode) +- emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2)); +- else if (TARGET_XOP && mode == V2DImode) +- { +- /* op1: A,B,C,D, op2: E,F,G,H */ +- op1 = gen_lowpart (V4SImode, op1); +- op2 = gen_lowpart (V4SImode, op2); ++ return VOIDmode; ++} + +- t1 = gen_reg_rtx (V4SImode); +- t2 = gen_reg_rtx (V4SImode); +- t3 = gen_reg_rtx (V2DImode); +- t4 = gen_reg_rtx (V2DImode); ++/* Worker function for TARGET_MD_ASM_ADJUST. + +- /* t1: B,A,D,C */ +- emit_insn (gen_sse2_pshufd_1 (t1, op1, +- GEN_INT (1), +- GEN_INT (0), +- GEN_INT (3), +- GEN_INT (2))); ++ We implement asm flag outputs, and maintain source compatibility ++ with the old cc0-based compiler. 
*/ + +- /* t2: (B*E),(A*F),(D*G),(C*H) */ +- emit_insn (gen_mulv4si3 (t2, t1, op2)); ++static rtx_insn * ++ix86_md_asm_adjust (vec &outputs, vec &/*inputs*/, ++ vec &constraints, ++ vec &clobbers, HARD_REG_SET &clobbered_regs) ++{ ++ bool saw_asm_flag = false; + +- /* t3: (B*E)+(A*F), (D*G)+(C*H) */ +- emit_insn (gen_xop_phadddq (t3, t2)); ++ start_sequence (); ++ for (unsigned i = 0, n = outputs.length (); i < n; ++i) ++ { ++ const char *con = constraints[i]; ++ if (strncmp (con, "=@cc", 4) != 0) ++ continue; ++ con += 4; ++ if (strchr (con, ',') != NULL) ++ { ++ error ("alternatives not allowed in % flag output"); ++ continue; ++ } + +- /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */ +- emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32))); ++ bool invert = false; ++ if (con[0] == 'n') ++ invert = true, con++; + +- /* Multiply lower parts and add all */ +- t5 = gen_reg_rtx (V2DImode); +- emit_insn (gen_vec_widen_umult_even_v4si (t5, +- gen_lowpart (V4SImode, op1), +- gen_lowpart (V4SImode, op2))); +- op0 = expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT); ++ machine_mode mode = CCmode; ++ rtx_code code = UNKNOWN; + +- } +- else +- { +- machine_mode nmode; +- rtx (*umul) (rtx, rtx, rtx); ++ switch (con[0]) ++ { ++ case 'a': ++ if (con[1] == 0) ++ mode = CCAmode, code = EQ; ++ else if (con[1] == 'e' && con[2] == 0) ++ mode = CCCmode, code = NE; ++ break; ++ case 'b': ++ if (con[1] == 0) ++ mode = CCCmode, code = EQ; ++ else if (con[1] == 'e' && con[2] == 0) ++ mode = CCAmode, code = NE; ++ break; ++ case 'c': ++ if (con[1] == 0) ++ mode = CCCmode, code = EQ; ++ break; ++ case 'e': ++ if (con[1] == 0) ++ mode = CCZmode, code = EQ; ++ break; ++ case 'g': ++ if (con[1] == 0) ++ mode = CCGCmode, code = GT; ++ else if (con[1] == 'e' && con[2] == 0) ++ mode = CCGCmode, code = GE; ++ break; ++ case 'l': ++ if (con[1] == 0) ++ mode = CCGCmode, code = LT; ++ else if (con[1] == 'e' && con[2] == 0) ++ mode = CCGCmode, code = LE; ++ break; ++ case 'o': ++ if (con[1] == 0) ++ mode = CCOmode, code = EQ; ++ break; ++ case 'p': ++ if (con[1] == 0) ++ mode = CCPmode, code = EQ; ++ break; ++ case 's': ++ if (con[1] == 0) ++ mode = CCSmode, code = EQ; ++ break; ++ case 'z': ++ if (con[1] == 0) ++ mode = CCZmode, code = EQ; ++ break; ++ } ++ if (code == UNKNOWN) ++ { ++ error ("unknown % flag output %qs", constraints[i]); ++ continue; ++ } ++ if (invert) ++ code = reverse_condition (code); + +- if (mode == V2DImode) ++ rtx dest = outputs[i]; ++ if (!saw_asm_flag) + { +- umul = gen_vec_widen_umult_even_v4si; +- nmode = V4SImode; ++ /* This is the first asm flag output. Here we put the flags ++ register in as the real output and adjust the condition to ++ allow it. */ ++ constraints[i] = "=Bf"; ++ outputs[i] = gen_rtx_REG (CCmode, FLAGS_REG); ++ saw_asm_flag = true; + } +- else if (mode == V4DImode) ++ else + { +- umul = gen_vec_widen_umult_even_v8si; +- nmode = V8SImode; ++ /* We don't need the flags register as output twice. */ ++ constraints[i] = "=X"; ++ outputs[i] = gen_rtx_SCRATCH (SImode); + } +- else if (mode == V8DImode) ++ ++ rtx x = gen_rtx_REG (mode, FLAGS_REG); ++ x = gen_rtx_fmt_ee (code, QImode, x, const0_rtx); ++ ++ machine_mode dest_mode = GET_MODE (dest); ++ if (!SCALAR_INT_MODE_P (dest_mode)) + { +- umul = gen_vec_widen_umult_even_v16si; +- nmode = V16SImode; ++ error ("invalid type for % flag output"); ++ continue; + } +- else +- gcc_unreachable (); + ++ if (dest_mode == DImode && !TARGET_64BIT) ++ dest_mode = SImode; + +- /* Multiply low parts. 
*/ +- t1 = gen_reg_rtx (mode); +- emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2))); ++ if (dest_mode != QImode) ++ { ++ rtx destqi = gen_reg_rtx (QImode); ++ emit_insn (gen_rtx_SET (destqi, x)); + +- /* Shift input vectors right 32 bits so we can multiply high parts. */ +- t6 = GEN_INT (32); +- t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT); +- t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT); ++ if (TARGET_ZERO_EXTEND_WITH_AND ++ && optimize_function_for_speed_p (cfun)) ++ { ++ x = force_reg (dest_mode, const0_rtx); + +- /* Multiply high parts by low parts. */ +- t4 = gen_reg_rtx (mode); +- t5 = gen_reg_rtx (mode); +- emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2))); +- emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1))); ++ emit_insn (gen_movstrictqi (gen_lowpart (QImode, x), destqi)); ++ } ++ else ++ { ++ x = gen_rtx_ZERO_EXTEND (dest_mode, destqi); ++ if (dest_mode == GET_MODE (dest) ++ && !register_operand (dest, GET_MODE (dest))) ++ x = force_reg (dest_mode, x); ++ } ++ } + +- /* Combine and shift the highparts back. */ +- t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT); +- t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT); ++ if (dest_mode != GET_MODE (dest)) ++ { ++ rtx tmp = gen_reg_rtx (SImode); + +- /* Combine high and low parts. */ +- force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT); ++ emit_insn (gen_rtx_SET (tmp, x)); ++ emit_insn (gen_zero_extendsidi2 (dest, tmp)); ++ } ++ else ++ emit_insn (gen_rtx_SET (dest, x)); + } ++ rtx_insn *seq = get_insns (); ++ end_sequence (); + +- set_unique_reg_note (get_last_insn (), REG_EQUAL, +- gen_rtx_MULT (mode, op1, op2)); ++ if (saw_asm_flag) ++ return seq; ++ else ++ { ++ /* If we had no asm flag outputs, clobber the flags. */ ++ clobbers.safe_push (gen_rtx_REG (CCmode, FLAGS_REG)); ++ SET_HARD_REG_BIT (clobbered_regs, FLAGS_REG); ++ return NULL; ++ } + } + +-/* Return 1 if control tansfer instruction INSN +- should be encoded with notrack prefix. */ ++/* Implements target vector targetm.asm.encode_section_info. */ + +-static bool +-ix86_notrack_prefixed_insn_p (rtx insn) ++static void ATTRIBUTE_UNUSED ++ix86_encode_section_info (tree decl, rtx rtl, int first) + { +- if (!insn || !((flag_cf_protection & CF_BRANCH))) +- return false; +- +- if (CALL_P (insn)) +- { +- rtx call = get_call_rtx_from (insn); +- gcc_assert (call != NULL_RTX); +- rtx addr = XEXP (call, 0); ++ default_encode_section_info (decl, rtl, first); + +- /* Do not emit 'notrack' if it's not an indirect call. */ +- if (MEM_P (addr) +- && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF) +- return false; +- else +- return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0); +- } ++ if (ix86_in_large_data_p (decl)) ++ SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR; ++} + +- if (JUMP_P (insn) && !flag_cet_switch) +- { +- rtx target = JUMP_LABEL (insn); +- if (target == NULL_RTX || ANY_RETURN_P (target)) +- return false; ++/* Worker function for REVERSE_CONDITION. */ + +- /* Check the jump is a switch table. */ +- rtx_insn *label = as_a (target); +- rtx_insn *table = next_insn (label); +- if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table)) +- return false; +- else +- return true; +- } +- return false; ++enum rtx_code ++ix86_reverse_condition (enum rtx_code code, machine_mode mode) ++{ ++ return (mode == CCFPmode ++ ? 
reverse_condition_maybe_unordered (code) ++ : reverse_condition (code)); + } + +-/* Calculate integer abs() using only SSE2 instructions. */ ++/* Output code to perform an x87 FP register move, from OPERANDS[1] ++ to OPERANDS[0]. */ + +-void +-ix86_expand_sse2_abs (rtx target, rtx input) ++const char * ++output_387_reg_move (rtx_insn *insn, rtx *operands) + { +- machine_mode mode = GET_MODE (target); +- rtx tmp0, tmp1, x; +- +- switch (mode) ++ if (REG_P (operands[0])) + { +- case E_V2DImode: +- case E_V4DImode: +- /* For 64-bit signed integer X, with SSE4.2 use +- pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X. +- Otherwise handle it similarly to V4SImode, except use 64 as W instead of +- 32 and use logical instead of arithmetic right shift (which is +- unimplemented) and subtract. */ +- if (TARGET_SSE4_2) +- { +- tmp0 = gen_reg_rtx (mode); +- tmp1 = gen_reg_rtx (mode); +- emit_move_insn (tmp1, CONST0_RTX (mode)); +- if (mode == E_V2DImode) +- emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input)); +- else +- emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input)); ++ if (REG_P (operands[1]) ++ && find_regno_note (insn, REG_DEAD, REGNO (operands[1]))) ++ { ++ if (REGNO (operands[0]) == FIRST_STACK_REG) ++ return output_387_ffreep (operands, 0); ++ return "fstp\t%y0"; + } ++ if (STACK_TOP_P (operands[0])) ++ return "fld%Z1\t%y1"; ++ return "fst\t%y0"; ++ } ++ else if (MEM_P (operands[0])) ++ { ++ gcc_assert (REG_P (operands[1])); ++ if (find_regno_note (insn, REG_DEAD, REGNO (operands[1]))) ++ return "fstp%Z0\t%y0"; + else + { +- tmp0 = expand_simple_binop (mode, LSHIFTRT, input, +- GEN_INT (GET_MODE_UNIT_BITSIZE (mode) +- - 1), NULL, 0, OPTAB_DIRECT); +- tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false); ++ /* There is no non-popping store to memory for XFmode. ++ So if we need one, follow the store with a load. */ ++ if (GET_MODE (operands[0]) == XFmode) ++ return "fstp%Z0\t%y0\n\tfld%Z0\t%y0"; ++ else ++ return "fst%Z0\t%y0"; + } +- +- tmp1 = expand_simple_binop (mode, XOR, tmp0, input, +- NULL, 0, OPTAB_DIRECT); +- x = expand_simple_binop (mode, MINUS, tmp1, tmp0, +- target, 0, OPTAB_DIRECT); +- break; +- +- case E_V4SImode: +- /* For 32-bit signed integer X, the best way to calculate the absolute +- value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */ +- tmp0 = expand_simple_binop (mode, ASHIFTRT, input, +- GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1), +- NULL, 0, OPTAB_DIRECT); +- tmp1 = expand_simple_binop (mode, XOR, tmp0, input, +- NULL, 0, OPTAB_DIRECT); +- x = expand_simple_binop (mode, MINUS, tmp1, tmp0, +- target, 0, OPTAB_DIRECT); +- break; +- +- case E_V8HImode: +- /* For 16-bit signed integer X, the best way to calculate the absolute +- value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */ +- tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0); +- +- x = expand_simple_binop (mode, SMAX, tmp0, input, +- target, 0, OPTAB_DIRECT); +- break; +- +- case E_V16QImode: +- /* For 8-bit signed integer X, the best way to calculate the absolute +- value of X is min ((unsigned char) X, (unsigned char) (-X)), +- as SSE2 provides the PMINUB insn. */ +- tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0); +- +- x = expand_simple_binop (V16QImode, UMIN, tmp0, input, +- target, 0, OPTAB_DIRECT); +- break; +- +- default: +- gcc_unreachable (); + } +- +- if (x != target) +- emit_move_insn (target, x); ++ else ++ gcc_unreachable(); + } ++#ifdef TARGET_SOLARIS ++/* Solaris implementation of TARGET_ASM_NAMED_SECTION. 
*/ + +-/* Expand an extract from a vector register through pextr insn. +- Return true if successful. */ +- +-bool +-ix86_expand_pextr (rtx *operands) ++static void ++i386_solaris_elf_named_section (const char *name, unsigned int flags, ++ tree decl) + { +- rtx dst = operands[0]; +- rtx src = operands[1]; +- +- unsigned int size = INTVAL (operands[2]); +- unsigned int pos = INTVAL (operands[3]); +- +- if (SUBREG_P (dst)) ++ /* With Binutils 2.15, the "@unwind" marker must be specified on ++ every occurrence of the ".eh_frame" section, not just the first ++ one. */ ++ if (TARGET_64BIT ++ && strcmp (name, ".eh_frame") == 0) + { +- /* Reject non-lowpart subregs. */ +- if (SUBREG_BYTE (dst) > 0) +- return false; +- dst = SUBREG_REG (dst); ++ fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name, ++ flags & SECTION_WRITE ? "aw" : "a"); ++ return; + } +- +- if (SUBREG_P (src)) ++ ++#ifndef USE_GAS ++ if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE) + { +- pos += SUBREG_BYTE (src) * BITS_PER_UNIT; +- src = SUBREG_REG (src); ++ solaris_elf_asm_comdat_section (name, flags, decl); ++ return; + } + +- switch (GET_MODE (src)) ++ /* Solaris/x86 as uses the same syntax for the SHF_EXCLUDE flags as the ++ SPARC assembler. One cannot mix single-letter flags and #exclude, so ++ only emit the latter here. */ ++ if (flags & SECTION_EXCLUDE) + { +- case E_V16QImode: +- case E_V8HImode: +- case E_V4SImode: +- case E_V2DImode: +- case E_V1TImode: +- case E_TImode: +- { +- machine_mode srcmode, dstmode; +- rtx d, pat; ++ fprintf (asm_out_file, "\t.section\t%s,#exclude\n", name); ++ return; ++ } ++#endif + +- if (!int_mode_for_size (size, 0).exists (&dstmode)) +- return false; ++ default_elf_asm_named_section (name, flags, decl); ++} ++#endif /* TARGET_SOLARIS */ + +- switch (dstmode) +- { +- case E_QImode: +- if (!TARGET_SSE4_1) +- return false; +- srcmode = V16QImode; +- break; ++/* Return the mangling of TYPE if it is an extended fundamental type. */ + +- case E_HImode: +- if (!TARGET_SSE2) +- return false; +- srcmode = V8HImode; +- break; ++static const char * ++ix86_mangle_type (const_tree type) ++{ ++ type = TYPE_MAIN_VARIANT (type); + +- case E_SImode: +- if (!TARGET_SSE4_1) +- return false; +- srcmode = V4SImode; +- break; ++ if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE ++ && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE) ++ return NULL; + +- case E_DImode: +- gcc_assert (TARGET_64BIT); +- if (!TARGET_SSE4_1) +- return false; +- srcmode = V2DImode; +- break; ++ switch (TYPE_MODE (type)) ++ { ++ case E_TFmode: ++ /* __float128 is "g". */ ++ return "g"; ++ case E_XFmode: ++ /* "long double" or __float80 is "e". */ ++ return "e"; ++ default: ++ return NULL; ++ } ++} + +- default: +- return false; +- } ++static GTY(()) tree ix86_tls_stack_chk_guard_decl; ++ ++static tree ++ix86_stack_protect_guard (void) ++{ ++ if (TARGET_SSP_TLS_GUARD) ++ { ++ tree type_node = lang_hooks.types.type_for_mode (ptr_mode, 1); ++ int qual = ENCODE_QUAL_ADDR_SPACE (ix86_stack_protector_guard_reg); ++ tree type = build_qualified_type (type_node, qual); ++ tree t; + +- /* Reject extractions from misaligned positions. */ +- if (pos & (size-1)) +- return false; ++ if (global_options_set.x_ix86_stack_protector_guard_symbol_str) ++ { ++ t = ix86_tls_stack_chk_guard_decl; + +- if (GET_MODE (dst) == dstmode) +- d = dst; +- else +- d = gen_reg_rtx (dstmode); ++ if (t == NULL) ++ { ++ rtx x; + +- /* Construct insn pattern. 
*/ +- pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size))); +- pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat); ++ t = build_decl ++ (UNKNOWN_LOCATION, VAR_DECL, ++ get_identifier (ix86_stack_protector_guard_symbol_str), ++ type); ++ TREE_STATIC (t) = 1; ++ TREE_PUBLIC (t) = 1; ++ DECL_EXTERNAL (t) = 1; ++ TREE_USED (t) = 1; ++ TREE_THIS_VOLATILE (t) = 1; ++ DECL_ARTIFICIAL (t) = 1; ++ DECL_IGNORED_P (t) = 1; + +- /* Let the rtl optimizers know about the zero extension performed. */ +- if (dstmode == QImode || dstmode == HImode) +- { +- pat = gen_rtx_ZERO_EXTEND (SImode, pat); +- d = gen_lowpart (SImode, d); +- } ++ /* Do not share RTL as the declaration is visible outside of ++ current function. */ ++ x = DECL_RTL (t); ++ RTX_FLAG (x, used) = 1; + +- emit_insn (gen_rtx_SET (d, pat)); ++ ix86_tls_stack_chk_guard_decl = t; ++ } ++ } ++ else ++ { ++ tree asptrtype = build_pointer_type (type); + +- if (d != dst) +- emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d)); +- return true; +- } ++ t = build_int_cst (asptrtype, ix86_stack_protector_guard_offset); ++ t = build2 (MEM_REF, asptrtype, t, ++ build_int_cst (asptrtype, 0)); ++ TREE_THIS_VOLATILE (t) = 1; ++ } + +- default: +- return false; ++ return t; + } ++ ++ return default_stack_protect_guard (); + } + +-/* Expand an insert into a vector register through pinsr insn. +- Return true if successful. */ ++/* For 32-bit code we can save PIC register setup by using ++ __stack_chk_fail_local hidden function instead of calling ++ __stack_chk_fail directly. 64-bit code doesn't need to setup any PIC ++ register, so it is better to call __stack_chk_fail directly. */ + +-bool +-ix86_expand_pinsr (rtx *operands) ++static tree ATTRIBUTE_UNUSED ++ix86_stack_protect_fail (void) + { +- rtx dst = operands[0]; +- rtx src = operands[3]; ++ return TARGET_64BIT ++ ? default_external_stack_protect_fail () ++ : default_hidden_stack_protect_fail (); ++} + +- unsigned int size = INTVAL (operands[1]); +- unsigned int pos = INTVAL (operands[2]); ++/* Select a format to encode pointers in exception handling data. CODE ++ is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is ++ true if the symbol may be affected by dynamic relocations. + +- if (SUBREG_P (dst)) ++ ??? All x86 object file formats are capable of representing this. ++ After all, the relocation needed is the same as for the call insn. ++ Whether or not a particular assembler allows us to enter such, I ++ guess we'll have to see. */ ++int ++asm_preferred_eh_data_format (int code, int global) ++{ ++ if (flag_pic) ++ { ++ int type = DW_EH_PE_sdata8; ++ if (!TARGET_64BIT ++ || ix86_cmodel == CM_SMALL_PIC ++ || (ix86_cmodel == CM_MEDIUM_PIC && (global || code))) ++ type = DW_EH_PE_sdata4; ++ return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type; ++ } ++ if (ix86_cmodel == CM_SMALL ++ || (ix86_cmodel == CM_MEDIUM && code)) ++ return DW_EH_PE_udata4; ++ return DW_EH_PE_absptr; ++} ++ ++/* Implement targetm.vectorize.builtin_vectorization_cost. 
*/ ++static int ++ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, ++ tree vectype, int) ++{ ++ bool fp = false; ++ machine_mode mode = TImode; ++ int index; ++ if (vectype != NULL) + { +- pos += SUBREG_BYTE (dst) * BITS_PER_UNIT; +- dst = SUBREG_REG (dst); ++ fp = FLOAT_TYPE_P (vectype); ++ mode = TYPE_MODE (vectype); + } + +- switch (GET_MODE (dst)) ++ switch (type_of_cost) + { +- case E_V16QImode: +- case E_V8HImode: +- case E_V4SImode: +- case E_V2DImode: +- case E_V1TImode: +- case E_TImode: +- { +- machine_mode srcmode, dstmode; +- rtx (*pinsr)(rtx, rtx, rtx, rtx); +- rtx d; ++ case scalar_stmt: ++ return fp ? ix86_cost->addss : COSTS_N_INSNS (1); + +- if (!int_mode_for_size (size, 0).exists (&srcmode)) +- return false; ++ case scalar_load: ++ /* load/store costs are relative to register move which is 2. Recompute ++ it to COSTS_N_INSNS so everything have same base. */ ++ return COSTS_N_INSNS (fp ? ix86_cost->sse_load[0] ++ : ix86_cost->int_load [2]) / 2; + +- switch (srcmode) +- { +- case E_QImode: +- if (!TARGET_SSE4_1) +- return false; +- dstmode = V16QImode; +- pinsr = gen_sse4_1_pinsrb; +- break; ++ case scalar_store: ++ return COSTS_N_INSNS (fp ? ix86_cost->sse_store[0] ++ : ix86_cost->int_store [2]) / 2; + +- case E_HImode: +- if (!TARGET_SSE2) +- return false; +- dstmode = V8HImode; +- pinsr = gen_sse2_pinsrw; +- break; ++ case vector_stmt: ++ return ix86_vec_cost (mode, ++ fp ? ix86_cost->addss : ix86_cost->sse_op); + +- case E_SImode: +- if (!TARGET_SSE4_1) +- return false; +- dstmode = V4SImode; +- pinsr = gen_sse4_1_pinsrd; +- break; ++ case vector_load: ++ index = sse_store_index (mode); ++ /* See PR82713 - we may end up being called on non-vector type. */ ++ if (index < 0) ++ index = 2; ++ return COSTS_N_INSNS (ix86_cost->sse_load[index]) / 2; + +- case E_DImode: +- gcc_assert (TARGET_64BIT); +- if (!TARGET_SSE4_1) +- return false; +- dstmode = V2DImode; +- pinsr = gen_sse4_1_pinsrq; +- break; ++ case vector_store: ++ index = sse_store_index (mode); ++ /* See PR82713 - we may end up being called on non-vector type. */ ++ if (index < 0) ++ index = 2; ++ return COSTS_N_INSNS (ix86_cost->sse_store[index]) / 2; + +- default: +- return false; +- } ++ case vec_to_scalar: ++ case scalar_to_vec: ++ return ix86_vec_cost (mode, ix86_cost->sse_op); + +- /* Reject insertions to misaligned positions. */ +- if (pos & (size-1)) +- return false; ++ /* We should have separate costs for unaligned loads and gather/scatter. ++ Do that incrementally. */ ++ case unaligned_load: ++ index = sse_store_index (mode); ++ /* See PR82713 - we may end up being called on non-vector type. */ ++ if (index < 0) ++ index = 2; ++ return COSTS_N_INSNS (ix86_cost->sse_unaligned_load[index]) / 2; + +- if (SUBREG_P (src)) +- { +- unsigned int srcpos = SUBREG_BYTE (src); ++ case unaligned_store: ++ index = sse_store_index (mode); ++ /* See PR82713 - we may end up being called on non-vector type. 
*/ ++ if (index < 0) ++ index = 2; ++ return COSTS_N_INSNS (ix86_cost->sse_unaligned_store[index]) / 2; + +- if (srcpos > 0) +- { +- rtx extr_ops[4]; ++ case vector_gather_load: ++ return ix86_vec_cost (mode, ++ COSTS_N_INSNS ++ (ix86_cost->gather_static ++ + ix86_cost->gather_per_elt ++ * TYPE_VECTOR_SUBPARTS (vectype)) / 2); + +- extr_ops[0] = gen_reg_rtx (srcmode); +- extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src)); +- extr_ops[2] = GEN_INT (size); +- extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT); ++ case vector_scatter_store: ++ return ix86_vec_cost (mode, ++ COSTS_N_INSNS ++ (ix86_cost->scatter_static ++ + ix86_cost->scatter_per_elt ++ * TYPE_VECTOR_SUBPARTS (vectype)) / 2); + +- if (!ix86_expand_pextr (extr_ops)) +- return false; ++ case cond_branch_taken: ++ return ix86_cost->cond_taken_branch_cost; + +- src = extr_ops[0]; +- } +- else +- src = gen_lowpart (srcmode, SUBREG_REG (src)); +- } ++ case cond_branch_not_taken: ++ return ix86_cost->cond_not_taken_branch_cost; + +- if (GET_MODE (dst) == dstmode) +- d = dst; +- else +- d = gen_reg_rtx (dstmode); ++ case vec_perm: ++ case vec_promote_demote: ++ return ix86_vec_cost (mode, ix86_cost->sse_op); + +- emit_insn (pinsr (d, gen_lowpart (dstmode, dst), +- gen_lowpart (srcmode, src), +- GEN_INT (1 << (pos / size)))); +- if (d != dst) +- emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d)); +- return true; +- } ++ case vec_construct: ++ { ++ /* N element inserts into SSE vectors. */ ++ int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op; ++ /* One vinserti128 for combining two SSE vectors for AVX256. */ ++ if (GET_MODE_BITSIZE (mode) == 256) ++ cost += ix86_vec_cost (mode, ix86_cost->addss); ++ /* One vinserti64x4 and two vinserti128 for combining SSE ++ and AVX256 vectors to AVX512. */ ++ else if (GET_MODE_BITSIZE (mode) == 512) ++ cost += 3 * ix86_vec_cost (mode, ix86_cost->addss); ++ return cost; ++ } + +- default: +- return false; ++ default: ++ gcc_unreachable (); + } + } ++ + + /* This function returns the calling abi specific va_list type node. + It returns the FNDECL specific va_list type. */ +@@ -50192,39 +21332,6 @@ ix86_preferred_simd_mode (scalar_mode mode) + } + } + +-/* All CPUs prefer to avoid cross-lane operations so perform reductions +- upper against lower halves up to SSE reg size. */ +- +-static machine_mode +-ix86_split_reduction (machine_mode mode) +-{ +- /* Reduce lowpart against highpart until we reach SSE reg width to +- avoid cross-lane operations. */ +- switch (mode) +- { +- case E_V8DImode: +- case E_V4DImode: +- return V2DImode; +- case E_V16SImode: +- case E_V8SImode: +- return V4SImode; +- case E_V32HImode: +- case E_V16HImode: +- return V8HImode; +- case E_V64QImode: +- case E_V32QImode: +- return V16QImode; +- case E_V16SFmode: +- case E_V8SFmode: +- return V4SFmode; +- case E_V8DFmode: +- case E_V4DFmode: +- return V2DFmode; +- default: +- return mode; +- } +-} +- + /* If AVX is enabled then try vectorizing with both 256bit and 128bit + vectors. If AVX512F is enabled then try vectorizing with 512bit, + 256bit and 128bit vectors. 
*/ +@@ -50596,13 +21703,15 @@ ix86_memmodel_check (unsigned HOST_WIDE_INT val) + if (val & IX86_HLE_ACQUIRE && !(is_mm_acquire (model) || strong)) + { + warning (OPT_Winvalid_memory_model, +- "HLE_ACQUIRE not used with ACQUIRE or stronger memory model"); ++ "% not used with % or stronger " ++ "memory model"); + return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE; + } + if (val & IX86_HLE_RELEASE && !(is_mm_release (model) || strong)) + { + warning (OPT_Winvalid_memory_model, +- "HLE_RELEASE not used with RELEASE or stronger memory model"); ++ "% not used with % or stronger " ++ "memory model"); + return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE; + } + return val; +@@ -50760,50 +21869,6 @@ ix86_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node, + return ret; + } + +-/* Add target attribute to SIMD clone NODE if needed. */ +- +-static void +-ix86_simd_clone_adjust (struct cgraph_node *node) +-{ +- const char *str = NULL; +- +- /* Attributes need to be adjusted for definitions, not declarations. */ +- if (!node->definition) +- return; +- +- gcc_assert (node->decl == cfun->decl); +- switch (node->simdclone->vecsize_mangle) +- { +- case 'b': +- if (!TARGET_SSE2) +- str = "sse2"; +- break; +- case 'c': +- if (!TARGET_AVX) +- str = "avx"; +- break; +- case 'd': +- if (!TARGET_AVX2) +- str = "avx2"; +- break; +- case 'e': +- if (!TARGET_AVX512F) +- str = "avx512f"; +- break; +- default: +- gcc_unreachable (); +- } +- if (str == NULL) +- return; +- push_cfun (NULL); +- tree args = build_tree_list (NULL_TREE, build_string (strlen (str), str)); +- bool ok = ix86_valid_target_attribute_p (node->decl, NULL, args, 0); +- gcc_assert (ok); +- pop_cfun (); +- ix86_reset_previous_fndecl (); +- ix86_set_current_function (node->decl); +-} +- + /* If SIMD clone NODE can't be used in a vectorized loop + in current function, return -1, otherwise return a badness of using it + (0 if it is most desirable from vecsize_mangle point of view, 1 +@@ -50912,10 +21977,10 @@ ix86_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update) + tree fenv_ptr = build_pointer_type (fenv_type); + tree fenv_addr = build1 (ADDR_EXPR, fenv_ptr, fenv_var); + fenv_addr = fold_convert (ptr_type_node, fenv_addr); +- tree fnstenv = ix86_builtins[IX86_BUILTIN_FNSTENV]; +- tree fldenv = ix86_builtins[IX86_BUILTIN_FLDENV]; +- tree fnstsw = ix86_builtins[IX86_BUILTIN_FNSTSW]; +- tree fnclex = ix86_builtins[IX86_BUILTIN_FNCLEX]; ++ tree fnstenv = get_ix86_builtin (IX86_BUILTIN_FNSTENV); ++ tree fldenv = get_ix86_builtin (IX86_BUILTIN_FLDENV); ++ tree fnstsw = get_ix86_builtin (IX86_BUILTIN_FNSTSW); ++ tree fnclex = get_ix86_builtin (IX86_BUILTIN_FNCLEX); + tree hold_fnstenv = build_call_expr (fnstenv, 1, fenv_addr); + tree hold_fnclex = build_call_expr (fnclex, 0); + fenv_var = build4 (TARGET_EXPR, fenv_type, fenv_var, hold_fnstenv, +@@ -50939,8 +22004,8 @@ ix86_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update) + { + tree mxcsr_orig_var = create_tmp_var_raw (unsigned_type_node); + tree mxcsr_mod_var = create_tmp_var_raw (unsigned_type_node); +- tree stmxcsr = ix86_builtins[IX86_BUILTIN_STMXCSR]; +- tree ldmxcsr = ix86_builtins[IX86_BUILTIN_LDMXCSR]; ++ tree stmxcsr = get_ix86_builtin (IX86_BUILTIN_STMXCSR); ++ tree ldmxcsr = get_ix86_builtin (IX86_BUILTIN_LDMXCSR); + tree stmxcsr_hold_call = build_call_expr (stmxcsr, 0); + tree hold_assign_orig = build2 (MODIFY_EXPR, unsigned_type_node, + mxcsr_orig_var, stmxcsr_hold_call); +@@ -51183,22 +22248,6 @@ ix86_init_libfuncs (void) + #endif + } + +-/* Generate call to 
__divmoddi4. */ +- +-static void +-ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode, +- rtx op0, rtx op1, +- rtx *quot_p, rtx *rem_p) +-{ +- rtx rem = assign_386_stack_local (mode, SLOT_TEMP); +- +- rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL, +- mode, op0, mode, op1, mode, +- XEXP (rem, 0), Pmode); +- *quot_p = quot; +- *rem_p = rem; +-} +- + /* Set the value of FLT_EVAL_METHOD in float.h. When using only the + FPU, assume that the fpcw is set to extended precision; when using + only SSE, rounding is correct; when using both SSE and the FPU, +@@ -51970,9 +23019,6 @@ ix86_run_selftests (void) + #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS + #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true + +-#undef TARGET_SETUP_INCOMING_VARARG_BOUNDS +-#define TARGET_SETUP_INCOMING_VARARG_BOUNDS ix86_setup_incoming_vararg_bounds +- + #undef TARGET_OFFLOAD_OPTIONS + #define TARGET_OFFLOAD_OPTIONS \ + ix86_offload_options +diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h +index 14e5a392f..187e52a5b 100644 +--- a/gcc/config/i386/i386.h ++++ b/gcc/config/i386/i386.h +@@ -1891,7 +1891,7 @@ typedef struct ix86_args { + ? GET_MODE_SIZE (TImode) : UNITS_PER_WORD) + + /* If a memory-to-memory move would take MOVE_RATIO or more simple +- move-instruction pairs, we will do a movmem or libcall instead. ++ move-instruction pairs, we will do a cpymem or libcall instead. + Increasing the value will always make code faster, but eventually + incurs high cost in increased code size. + +@@ -2784,6 +2784,9 @@ struct GTY(()) machine_function { + /* During SEH output, this is non-null. */ + struct seh_frame_state * GTY((skip(""))) seh; + }; ++ ++extern GTY(()) tree sysv_va_list_type_node; ++extern GTY(()) tree ms_va_list_type_node; + #endif + + #define ix86_stack_locals (cfun->machine->stack_locals) +@@ -2881,6 +2884,12 @@ extern void debug_dispatch_window (int); + + #define TARGET_SUPPORTS_WIDE_INT 1 + ++#if !defined(GENERATOR_FILE) && !defined(IN_LIBGCC2) ++extern enum attr_cpu ix86_schedule; ++ ++#define NUM_X86_64_MS_CLOBBERED_REGS 12 ++#endif ++ + /* + Local variables: + version-control: t +diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md +index 698c31a0a..861248899 100644 +--- a/gcc/config/i386/i386.md ++++ b/gcc/config/i386/i386.md +@@ -16731,7 +16731,7 @@ + (set_attr "length_immediate" "0") + (set_attr "modrm" "0")]) + +-(define_expand "movmem" ++(define_expand "cpymem" + [(use (match_operand:BLK 0 "memory_operand")) + (use (match_operand:BLK 1 "memory_operand")) + (use (match_operand:SWI48 2 "nonmemory_operand")) +@@ -16743,7 +16743,7 @@ + (use (match_operand:SI 8 ""))] + "" + { +- if (ix86_expand_set_or_movmem (operands[0], operands[1], ++ if (ix86_expand_set_or_cpymem (operands[0], operands[1], + operands[2], NULL, operands[3], + operands[4], operands[5], + operands[6], operands[7], +@@ -16958,7 +16958,7 @@ + (use (match_operand:SI 8 ""))] + "" + { +- if (ix86_expand_set_or_movmem (operands[0], NULL, ++ if (ix86_expand_set_or_cpymem (operands[0], NULL, + operands[1], operands[2], + operands[3], operands[4], + operands[5], operands[6], +diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md +index 865947deb..4135159ac 100644 +--- a/gcc/config/i386/predicates.md ++++ b/gcc/config/i386/predicates.md +@@ -683,7 +683,7 @@ + if (GET_CODE (op) == PLUS && REG_P (XEXP (op, 0))) + { + int regno = REGNO (XEXP (op, 0)); +- if (!HARD_REGISTER_NUM_P (regno) || call_used_regs[regno]) ++ if (!HARD_REGISTER_NUM_P (regno) || 
call_used_or_fixed_reg_p (regno)) + { + op = XEXP (op, 1); + if (GOT32_symbol_operand (op, VOIDmode)) +diff --git a/gcc/config/i386/t-i386 b/gcc/config/i386/t-i386 +index 0dac80fbc..50caf2c69 100644 +--- a/gcc/config/i386/t-i386 ++++ b/gcc/config/i386/t-i386 +@@ -44,6 +44,22 @@ i386-d.o: $(srcdir)/config/i386/i386-d.c + $(COMPILE) $< + $(POSTCOMPILE) + ++i386-options.o: $(srcdir)/config/i386/i386-options.c ++ $(COMPILE) $< ++ $(POSTCOMPILE) ++ ++i386-builtins.o: $(srcdir)/config/i386/i386-builtins.c ++ $(COMPILE) $< ++ $(POSTCOMPILE) ++ ++i386-expand.o: $(srcdir)/config/i386/i386-expand.c ++ $(COMPILE) $< ++ $(POSTCOMPILE) ++ ++i386-features.o: $(srcdir)/config/i386/i386-features.c ++ $(COMPILE) $< ++ $(POSTCOMPILE) ++ + i386.o: i386-builtin-types.inc + + i386-builtin-types.inc: s-i386-bt ; @true +diff --git a/gcc/config/ia64/ia64.c b/gcc/config/ia64/ia64.c +index e8d905e22..d09e49637 100644 +--- a/gcc/config/ia64/ia64.c ++++ b/gcc/config/ia64/ia64.c +@@ -5147,7 +5147,7 @@ ia64_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p, + gimple_seq *post_p) + { + /* Variable sized types are passed by reference. */ +- if (pass_by_reference (NULL, TYPE_MODE (type), type, false)) ++ if (pass_va_arg_by_reference (type)) + { + tree ptrtype = build_pointer_type (type); + tree addr = std_gimplify_va_arg_expr (valist, ptrtype, pre_p, post_p); +diff --git a/gcc/config/lm32/lm32.md b/gcc/config/lm32/lm32.md +index c09052c62..91a5fe1e0 100644 +--- a/gcc/config/lm32/lm32.md ++++ b/gcc/config/lm32/lm32.md +@@ -216,7 +216,7 @@ + } + }") + +-(define_expand "movmemsi" ++(define_expand "cpymemsi" + [(parallel [(set (match_operand:BLK 0 "general_operand" "") + (match_operand:BLK 1 "general_operand" "")) + (use (match_operand:SI 2 "" "")) +diff --git a/gcc/config/m32c/blkmov.md b/gcc/config/m32c/blkmov.md +index d7da439c2..e5cdc801f 100644 +--- a/gcc/config/m32c/blkmov.md ++++ b/gcc/config/m32c/blkmov.md +@@ -40,14 +40,14 @@ + ;; 1 = source (mem:BLK ...) 
+ ;; 2 = count + ;; 3 = alignment +-(define_expand "movmemhi" ++(define_expand "cpymemhi" + [(match_operand 0 "ap_operand" "") + (match_operand 1 "ap_operand" "") + (match_operand 2 "m32c_r3_operand" "") + (match_operand 3 "" "") + ] + "" +- "if (m32c_expand_movmemhi(operands)) DONE; FAIL;" ++ "if (m32c_expand_cpymemhi(operands)) DONE; FAIL;" + ) + + ;; We can't use mode iterators for these because M16C uses r1h to extend +@@ -60,7 +60,7 @@ + ;; 3 = dest (in) + ;; 4 = src (in) + ;; 5 = count (in) +-(define_insn "movmemhi_bhi_op" ++(define_insn "cpymemhi_bhi_op" + [(set (mem:QI (match_operand:HI 3 "ap_operand" "0")) + (mem:QI (match_operand:HI 4 "ap_operand" "1"))) + (set (match_operand:HI 2 "m32c_r3_operand" "=R3w") +@@ -75,7 +75,7 @@ + "TARGET_A16" + "mov.b:q\t#0,r1h\n\tsmovf.b\t; %0[0..%2-1]=r1h%1[]" + ) +-(define_insn "movmemhi_bpsi_op" ++(define_insn "cpymemhi_bpsi_op" + [(set (mem:QI (match_operand:PSI 3 "ap_operand" "0")) + (mem:QI (match_operand:PSI 4 "ap_operand" "1"))) + (set (match_operand:HI 2 "m32c_r3_operand" "=R3w") +@@ -89,7 +89,7 @@ + "TARGET_A24" + "smovf.b\t; %0[0..%2-1]=%1[]" + ) +-(define_insn "movmemhi_whi_op" ++(define_insn "cpymemhi_whi_op" + [(set (mem:HI (match_operand:HI 3 "ap_operand" "0")) + (mem:HI (match_operand:HI 4 "ap_operand" "1"))) + (set (match_operand:HI 2 "m32c_r3_operand" "=R3w") +@@ -104,7 +104,7 @@ + "TARGET_A16" + "mov.b:q\t#0,r1h\n\tsmovf.w\t; %0[0..%2-1]=r1h%1[]" + ) +-(define_insn "movmemhi_wpsi_op" ++(define_insn "cpymemhi_wpsi_op" + [(set (mem:HI (match_operand:PSI 3 "ap_operand" "0")) + (mem:HI (match_operand:PSI 4 "ap_operand" "1"))) + (set (match_operand:HI 2 "m32c_r3_operand" "=R3w") +diff --git a/gcc/config/m32c/m32c-protos.h b/gcc/config/m32c/m32c-protos.h +index 7d4d478fd..fe926fd50 100644 +--- a/gcc/config/m32c/m32c-protos.h ++++ b/gcc/config/m32c/m32c-protos.h +@@ -43,7 +43,7 @@ void m32c_emit_eh_epilogue (rtx); + int m32c_expand_cmpstr (rtx *); + int m32c_expand_insv (rtx *); + int m32c_expand_movcc (rtx *); +-int m32c_expand_movmemhi (rtx *); ++int m32c_expand_cpymemhi (rtx *); + int m32c_expand_movstr (rtx *); + void m32c_expand_neg_mulpsi3 (rtx *); + int m32c_expand_setmemhi (rtx *); +diff --git a/gcc/config/m32c/m32c.c b/gcc/config/m32c/m32c.c +index 1a0d0c681..d0d24bb5f 100644 +--- a/gcc/config/m32c/m32c.c ++++ b/gcc/config/m32c/m32c.c +@@ -3592,7 +3592,7 @@ m32c_expand_setmemhi(rtx *operands) + addresses, not [mem] syntax. $0 is the destination (MEM:BLK), $1 + is the source (MEM:BLK), and $2 the count (HI). 
*/ + int +-m32c_expand_movmemhi(rtx *operands) ++m32c_expand_cpymemhi(rtx *operands) + { + rtx desta, srca, count; + rtx desto, srco, counto; +@@ -3620,9 +3620,9 @@ m32c_expand_movmemhi(rtx *operands) + { + count = copy_to_mode_reg (HImode, GEN_INT (INTVAL (count) / 2)); + if (TARGET_A16) +- emit_insn (gen_movmemhi_whi_op (desto, srco, counto, desta, srca, count)); ++ emit_insn (gen_cpymemhi_whi_op (desto, srco, counto, desta, srca, count)); + else +- emit_insn (gen_movmemhi_wpsi_op (desto, srco, counto, desta, srca, count)); ++ emit_insn (gen_cpymemhi_wpsi_op (desto, srco, counto, desta, srca, count)); + return 1; + } + +@@ -3632,9 +3632,9 @@ m32c_expand_movmemhi(rtx *operands) + count = copy_to_mode_reg (HImode, count); + + if (TARGET_A16) +- emit_insn (gen_movmemhi_bhi_op (desto, srco, counto, desta, srca, count)); ++ emit_insn (gen_cpymemhi_bhi_op (desto, srco, counto, desta, srca, count)); + else +- emit_insn (gen_movmemhi_bpsi_op (desto, srco, counto, desta, srca, count)); ++ emit_insn (gen_cpymemhi_bpsi_op (desto, srco, counto, desta, srca, count)); + + return 1; + } +diff --git a/gcc/config/m32r/m32r.c b/gcc/config/m32r/m32r.c +index 6e79b2aec..ac18aa286 100644 +--- a/gcc/config/m32r/m32r.c ++++ b/gcc/config/m32r/m32r.c +@@ -2598,7 +2598,7 @@ m32r_expand_block_move (rtx operands[]) + to the word after the end of the source block, and dst_reg to point + to the last word of the destination block, provided that the block + is MAX_MOVE_BYTES long. */ +- emit_insn (gen_movmemsi_internal (dst_reg, src_reg, at_a_time, ++ emit_insn (gen_cpymemsi_internal (dst_reg, src_reg, at_a_time, + new_dst_reg, new_src_reg)); + emit_move_insn (dst_reg, new_dst_reg); + emit_move_insn (src_reg, new_src_reg); +@@ -2612,7 +2612,7 @@ m32r_expand_block_move (rtx operands[]) + } + + if (leftover) +- emit_insn (gen_movmemsi_internal (dst_reg, src_reg, GEN_INT (leftover), ++ emit_insn (gen_cpymemsi_internal (dst_reg, src_reg, GEN_INT (leftover), + gen_reg_rtx (SImode), + gen_reg_rtx (SImode))); + return 1; +diff --git a/gcc/config/m32r/m32r.md b/gcc/config/m32r/m32r.md +index be5739763..e944363fd 100644 +--- a/gcc/config/m32r/m32r.md ++++ b/gcc/config/m32r/m32r.md +@@ -2195,7 +2195,7 @@ + ;; Argument 2 is the length + ;; Argument 3 is the alignment + +-(define_expand "movmemsi" ++(define_expand "cpymemsi" + [(parallel [(set (match_operand:BLK 0 "general_operand" "") + (match_operand:BLK 1 "general_operand" "")) + (use (match_operand:SI 2 "immediate_operand" "")) +@@ -2214,7 +2214,7 @@ + + ;; Insn generated by block moves + +-(define_insn "movmemsi_internal" ++(define_insn "cpymemsi_internal" + [(set (mem:BLK (match_operand:SI 0 "register_operand" "r")) ;; destination + (mem:BLK (match_operand:SI 1 "register_operand" "r"))) ;; source + (use (match_operand:SI 2 "m32r_block_immediate_operand" "J"));; # bytes to move +diff --git a/gcc/config/mcore/mcore.md b/gcc/config/mcore/mcore.md +index cc84e342b..c6893518d 100644 +--- a/gcc/config/mcore/mcore.md ++++ b/gcc/config/mcore/mcore.md +@@ -2552,7 +2552,7 @@ + ;; Block move - adapted from m88k.md + ;; ------------------------------------------------------------------------ + +-(define_expand "movmemsi" ++(define_expand "cpymemsi" + [(parallel [(set (mem:BLK (match_operand:BLK 0 "" "")) + (mem:BLK (match_operand:BLK 1 "" ""))) + (use (match_operand:SI 2 "general_operand" "")) +diff --git a/gcc/config/microblaze/microblaze.c b/gcc/config/microblaze/microblaze.c +index 55c1becf9..07dd0bc6f 100644 +--- a/gcc/config/microblaze/microblaze.c ++++ 
b/gcc/config/microblaze/microblaze.c +@@ -1250,7 +1250,7 @@ microblaze_block_move_loop (rtx dest, rtx src, HOST_WIDE_INT length) + microblaze_block_move_straight (dest, src, leftover); + } + +-/* Expand a movmemsi instruction. */ ++/* Expand a cpymemsi instruction. */ + + bool + microblaze_expand_block_move (rtx dest, rtx src, rtx length, rtx align_rtx) +diff --git a/gcc/config/microblaze/microblaze.md b/gcc/config/microblaze/microblaze.md +index 183afff37..1509e4318 100644 +--- a/gcc/config/microblaze/microblaze.md ++++ b/gcc/config/microblaze/microblaze.md +@@ -1144,7 +1144,7 @@ + ;; Argument 2 is the length + ;; Argument 3 is the alignment + +-(define_expand "movmemsi" ++(define_expand "cpymemsi" + [(parallel [(set (match_operand:BLK 0 "general_operand") + (match_operand:BLK 1 "general_operand")) + (use (match_operand:SI 2 "")) +diff --git a/gcc/config/mips/mips.c b/gcc/config/mips/mips.c +index 100894720..3c95636bf 100644 +--- a/gcc/config/mips/mips.c ++++ b/gcc/config/mips/mips.c +@@ -6780,7 +6780,7 @@ mips_std_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p, + unsigned HOST_WIDE_INT align, boundary; + bool indirect; + +- indirect = pass_by_reference (NULL, TYPE_MODE (type), type, false); ++ indirect = pass_va_arg_by_reference (type); + if (indirect) + type = build_pointer_type (type); + +@@ -6867,7 +6867,7 @@ mips_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p, + tree addr; + bool indirect_p; + +- indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, 0); ++ indirect_p = pass_va_arg_by_reference (type); + if (indirect_p) + type = build_pointer_type (type); + +@@ -7938,15 +7938,15 @@ mips_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size, + { + if (op == STORE_BY_PIECES) + return mips_store_by_pieces_p (size, align); +- if (op == MOVE_BY_PIECES && HAVE_movmemsi) ++ if (op == MOVE_BY_PIECES && HAVE_cpymemsi) + { +- /* movmemsi is meant to generate code that is at least as good as +- move_by_pieces. However, movmemsi effectively uses a by-pieces ++ /* cpymemsi is meant to generate code that is at least as good as ++ move_by_pieces. However, cpymemsi effectively uses a by-pieces + implementation both for moves smaller than a word and for + word-aligned moves of no more than MIPS_MAX_MOVE_BYTES_STRAIGHT + bytes. We should allow the tree-level optimisers to do such + moves by pieces, as it often exposes other optimization +- opportunities. We might as well continue to use movmemsi at ++ opportunities. We might as well continue to use cpymemsi at + the rtl level though, as it produces better code when + scheduling is disabled (such as at -O). */ + if (currently_expanding_to_rtl) +@@ -8165,7 +8165,7 @@ mips_block_move_loop (rtx dest, rtx src, HOST_WIDE_INT length, + emit_insn (gen_nop ()); + } + +-/* Expand a movmemsi instruction, which copies LENGTH bytes from ++/* Expand a cpymemsi instruction, which copies LENGTH bytes from + memory reference SRC to memory reference DEST. */ + + bool +diff --git a/gcc/config/mips/mips.h b/gcc/config/mips/mips.h +index 953d82e85..a5be7fa39 100644 +--- a/gcc/config/mips/mips.h ++++ b/gcc/config/mips/mips.h +@@ -3099,12 +3099,12 @@ while (0) + #define MIPS_MIN_MOVE_MEM_ALIGN 16 + + /* The maximum number of bytes that can be copied by one iteration of +- a movmemsi loop; see mips_block_move_loop. */ ++ a cpymemsi loop; see mips_block_move_loop. 
*/ + #define MIPS_MAX_MOVE_BYTES_PER_LOOP_ITER \ + (UNITS_PER_WORD * 4) + + /* The maximum number of bytes that can be copied by a straight-line +- implementation of movmemsi; see mips_block_move_straight. We want ++ implementation of cpymemsi; see mips_block_move_straight. We want + to make sure that any loop-based implementation will iterate at + least twice. */ + #define MIPS_MAX_MOVE_BYTES_STRAIGHT \ +@@ -3119,11 +3119,11 @@ while (0) + + #define MIPS_CALL_RATIO 8 + +-/* Any loop-based implementation of movmemsi will have at least ++/* Any loop-based implementation of cpymemsi will have at least + MIPS_MAX_MOVE_BYTES_STRAIGHT / UNITS_PER_WORD memory-to-memory + moves, so allow individual copies of fewer elements. + +- When movmemsi is not available, use a value approximating ++ When cpymemsi is not available, use a value approximating + the length of a memcpy call sequence, so that move_by_pieces + will generate inline code if it is shorter than a function call. + Since move_by_pieces_ninsns counts memory-to-memory moves, but +@@ -3131,7 +3131,7 @@ while (0) + value of MIPS_CALL_RATIO to take that into account. */ + + #define MOVE_RATIO(speed) \ +- (HAVE_movmemsi \ ++ (HAVE_cpymemsi \ + ? MIPS_MAX_MOVE_BYTES_STRAIGHT / MOVE_MAX \ + : MIPS_CALL_RATIO / 2) + +diff --git a/gcc/config/mips/mips.md b/gcc/config/mips/mips.md +index 3cfb1a751..a9abb6fdd 100644 +--- a/gcc/config/mips/mips.md ++++ b/gcc/config/mips/mips.md +@@ -5638,7 +5638,7 @@ + ;; Argument 2 is the length + ;; Argument 3 is the alignment + +-(define_expand "movmemsi" ++(define_expand "cpymemsi" + [(parallel [(set (match_operand:BLK 0 "general_operand") + (match_operand:BLK 1 "general_operand")) + (use (match_operand:SI 2 "")) +diff --git a/gcc/config/msp430/msp430.c b/gcc/config/msp430/msp430.c +index 020e980b8..3ce649648 100644 +--- a/gcc/config/msp430/msp430.c ++++ b/gcc/config/msp430/msp430.c +@@ -1457,7 +1457,7 @@ msp430_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p, + unsigned HOST_WIDE_INT align, boundary; + bool indirect; + +- indirect = pass_by_reference (NULL, TYPE_MODE (type), type, false); ++ indirect = pass_va_arg_by_reference (type); + if (indirect) + type = build_pointer_type (type); + +diff --git a/gcc/config/nds32/nds32-memory-manipulation.c b/gcc/config/nds32/nds32-memory-manipulation.c +index 71b75dca5..b3f2cd698 100644 +--- a/gcc/config/nds32/nds32-memory-manipulation.c ++++ b/gcc/config/nds32/nds32-memory-manipulation.c +@@ -1,4 +1,4 @@ +-/* Auxiliary functions for expand movmem, setmem, cmpmem, load_multiple ++/* Auxiliary functions for expand cpymem, setmem, cmpmem, load_multiple + and store_multiple pattern of Andes NDS32 cpu for GNU compiler + Copyright (C) 2012-2019 Free Software Foundation, Inc. + Contributed by Andes Technology Corporation. +@@ -120,14 +120,14 @@ nds32_emit_mem_move_block (int base_regno, int count, + + /* ------------------------------------------------------------------------ */ + +-/* Auxiliary function for expand movmem pattern. */ ++/* Auxiliary function for expand cpymem pattern. */ + + static bool +-nds32_expand_movmemsi_loop_unknown_size (rtx dstmem, rtx srcmem, ++nds32_expand_cpymemsi_loop_unknown_size (rtx dstmem, rtx srcmem, + rtx size, + rtx alignment) + { +- /* Emit loop version of movmem. ++ /* Emit loop version of cpymem. 
+ + andi $size_least_3_bit, $size, #~7 + add $dst_end, $dst, $size +@@ -254,7 +254,7 @@ nds32_expand_movmemsi_loop_unknown_size (rtx dstmem, rtx srcmem, + } + + static bool +-nds32_expand_movmemsi_loop_known_size (rtx dstmem, rtx srcmem, ++nds32_expand_cpymemsi_loop_known_size (rtx dstmem, rtx srcmem, + rtx size, rtx alignment) + { + rtx dst_base_reg, src_base_reg; +@@ -288,7 +288,7 @@ nds32_expand_movmemsi_loop_known_size (rtx dstmem, rtx srcmem, + + if (total_bytes < 8) + { +- /* Emit total_bytes less than 8 loop version of movmem. ++ /* Emit total_bytes less than 8 loop version of cpymem. + add $dst_end, $dst, $size + move $dst_itr, $dst + .Lbyte_mode_loop: +@@ -321,7 +321,7 @@ nds32_expand_movmemsi_loop_known_size (rtx dstmem, rtx srcmem, + } + else if (total_bytes % 8 == 0) + { +- /* Emit multiple of 8 loop version of movmem. ++ /* Emit multiple of 8 loop version of cpymem. + + add $dst_end, $dst, $size + move $dst_itr, $dst +@@ -370,7 +370,7 @@ nds32_expand_movmemsi_loop_known_size (rtx dstmem, rtx srcmem, + else + { + /* Handle size greater than 8, and not a multiple of 8. */ +- return nds32_expand_movmemsi_loop_unknown_size (dstmem, srcmem, ++ return nds32_expand_cpymemsi_loop_unknown_size (dstmem, srcmem, + size, alignment); + } + +@@ -378,19 +378,19 @@ nds32_expand_movmemsi_loop_known_size (rtx dstmem, rtx srcmem, + } + + static bool +-nds32_expand_movmemsi_loop (rtx dstmem, rtx srcmem, ++nds32_expand_cpymemsi_loop (rtx dstmem, rtx srcmem, + rtx size, rtx alignment) + { + if (CONST_INT_P (size)) +- return nds32_expand_movmemsi_loop_known_size (dstmem, srcmem, ++ return nds32_expand_cpymemsi_loop_known_size (dstmem, srcmem, + size, alignment); + else +- return nds32_expand_movmemsi_loop_unknown_size (dstmem, srcmem, ++ return nds32_expand_cpymemsi_loop_unknown_size (dstmem, srcmem, + size, alignment); + } + + static bool +-nds32_expand_movmemsi_unroll (rtx dstmem, rtx srcmem, ++nds32_expand_cpymemsi_unroll (rtx dstmem, rtx srcmem, + rtx total_bytes, rtx alignment) + { + rtx dst_base_reg, src_base_reg; +@@ -533,13 +533,13 @@ nds32_expand_movmemsi_unroll (rtx dstmem, rtx srcmem, + This is auxiliary extern function to help create rtx template. + Check nds32-multiple.md file for the patterns. */ + bool +-nds32_expand_movmemsi (rtx dstmem, rtx srcmem, rtx total_bytes, rtx alignment) ++nds32_expand_cpymemsi (rtx dstmem, rtx srcmem, rtx total_bytes, rtx alignment) + { +- if (nds32_expand_movmemsi_unroll (dstmem, srcmem, total_bytes, alignment)) ++ if (nds32_expand_cpymemsi_unroll (dstmem, srcmem, total_bytes, alignment)) + return true; + + if (!optimize_size && optimize > 2) +- return nds32_expand_movmemsi_loop (dstmem, srcmem, total_bytes, alignment); ++ return nds32_expand_cpymemsi_loop (dstmem, srcmem, total_bytes, alignment); + + return false; + } +diff --git a/gcc/config/nds32/nds32-multiple.md b/gcc/config/nds32/nds32-multiple.md +index a1e10c055..98d9508c0 100644 +--- a/gcc/config/nds32/nds32-multiple.md ++++ b/gcc/config/nds32/nds32-multiple.md +@@ -3751,14 +3751,14 @@ + ;; operands[3] is the known shared alignment. 
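The mechanical movmem-to-cpymem renames running through these hunks mirror the optab split in upstream GCC: a cpymem expansion is only used for non-overlapping, memcpy-style copies, which leaves the movmem name free for possibly-overlapping moves. A minimal stand-alone C sketch of that contract (illustrative only, independent of any backend touched by this patch):

#include <stddef.h>
#include <stdio.h>

/* Forward byte copy: valid only when dst and src do not overlap,
   which is exactly what a cpymem-style expansion may assume.  */
static void copy_no_overlap (unsigned char *dst, const unsigned char *src,
                             size_t n)
{
  for (size_t i = 0; i < n; i++)
    dst[i] = src[i];
}

/* Overlap-safe move: must copy backwards when dst lies inside [src, src+n).  */
static void move_maybe_overlap (unsigned char *dst, const unsigned char *src,
                                size_t n)
{
  if (dst > src && dst < src + n)
    for (size_t i = n; i-- > 0; )
      dst[i] = src[i];
  else
    copy_no_overlap (dst, src, n);
}

int main (void)
{
  unsigned char buf[8] = "abcdefg";
  move_maybe_overlap (buf + 2, buf, 5);   /* overlapping: takes the backward path */
  printf ("%.7s\n", (const char *) buf);  /* prints "ababcde", as memmove would */
  return 0;
}

Because the existing expanders already assumed non-overlapping operands, the hunks only rename patterns and helpers; none of the copy logic itself changes.
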
+ + +-(define_expand "movmemsi" ++(define_expand "cpymemsi" + [(match_operand:BLK 0 "general_operand" "") + (match_operand:BLK 1 "general_operand" "") + (match_operand:SI 2 "nds32_reg_constant_operand" "") + (match_operand:SI 3 "const_int_operand" "")] + "" + { +- if (nds32_expand_movmemsi (operands[0], ++ if (nds32_expand_cpymemsi (operands[0], + operands[1], + operands[2], + operands[3])) +diff --git a/gcc/config/nds32/nds32-protos.h b/gcc/config/nds32/nds32-protos.h +index aaa65d6f0..7ae1954d0 100644 +--- a/gcc/config/nds32/nds32-protos.h ++++ b/gcc/config/nds32/nds32-protos.h +@@ -78,7 +78,7 @@ extern rtx nds32_di_low_part_subreg(rtx); + + extern rtx nds32_expand_load_multiple (int, int, rtx, rtx, bool, rtx *); + extern rtx nds32_expand_store_multiple (int, int, rtx, rtx, bool, rtx *); +-extern bool nds32_expand_movmemsi (rtx, rtx, rtx, rtx); ++extern bool nds32_expand_cpymemsi (rtx, rtx, rtx, rtx); + extern bool nds32_expand_setmem (rtx, rtx, rtx, rtx, rtx, rtx); + extern bool nds32_expand_strlen (rtx, rtx, rtx, rtx); + +diff --git a/gcc/config/pa/pa.c b/gcc/config/pa/pa.c +index 84a8cae22..73109c6f9 100644 +--- a/gcc/config/pa/pa.c ++++ b/gcc/config/pa/pa.c +@@ -107,7 +107,7 @@ static int pa_can_combine_p (rtx_insn *, rtx_insn *, rtx_insn *, int, rtx, + static bool forward_branch_p (rtx_insn *); + static void compute_zdepwi_operands (unsigned HOST_WIDE_INT, unsigned *); + static void compute_zdepdi_operands (unsigned HOST_WIDE_INT, unsigned *); +-static int compute_movmem_length (rtx_insn *); ++static int compute_cpymem_length (rtx_insn *); + static int compute_clrmem_length (rtx_insn *); + static bool pa_assemble_integer (rtx, unsigned int, int); + static void remove_useless_addtr_insns (int); +@@ -2986,7 +2986,7 @@ pa_output_block_move (rtx *operands, int size_is_constant ATTRIBUTE_UNUSED) + count insns rather than emit them. */ + + static int +-compute_movmem_length (rtx_insn *insn) ++compute_cpymem_length (rtx_insn *insn) + { + rtx pat = PATTERN (insn); + unsigned int align = INTVAL (XEXP (XVECEXP (pat, 0, 7), 0)); +@@ -5061,7 +5061,7 @@ pa_adjust_insn_length (rtx_insn *insn, int length) + && GET_CODE (XEXP (XVECEXP (pat, 0, 0), 1)) == MEM + && GET_MODE (XEXP (XVECEXP (pat, 0, 0), 0)) == BLKmode + && GET_MODE (XEXP (XVECEXP (pat, 0, 0), 1)) == BLKmode) +- length += compute_movmem_length (insn) - 4; ++ length += compute_cpymem_length (insn) - 4; + /* Block clear pattern. */ + else if (NONJUMP_INSN_P (insn) + && GET_CODE (pat) == PARALLEL +@@ -6378,7 +6378,7 @@ hppa_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p, + unsigned int size, ofs; + bool indirect; + +- indirect = pass_by_reference (NULL, TYPE_MODE (type), type, 0); ++ indirect = pass_va_arg_by_reference (type); + if (indirect) + { + type = ptr; +diff --git a/gcc/config/pa/pa.md b/gcc/config/pa/pa.md +index 18f8e127d..a37989032 100644 +--- a/gcc/config/pa/pa.md ++++ b/gcc/config/pa/pa.md +@@ -3162,9 +3162,9 @@ + + ;; The definition of this insn does not really explain what it does, + ;; but it should suffice that anything generated as this insn will be +-;; recognized as a movmemsi operation, and that it will not successfully ++;; recognized as a cpymemsi operation, and that it will not successfully + ;; combine with anything. +-(define_expand "movmemsi" ++(define_expand "cpymemsi" + [(parallel [(set (match_operand:BLK 0 "" "") + (match_operand:BLK 1 "" "")) + (clobber (match_dup 4)) +@@ -3244,7 +3244,7 @@ + ;; operands 0 and 1 are both equivalent to symbolic MEMs. 
Thus, we are + ;; forced to internally copy operands 0 and 1 to operands 7 and 8, + ;; respectively. We then split or peephole optimize after reload. +-(define_insn "movmemsi_prereload" ++(define_insn "cpymemsi_prereload" + [(set (mem:BLK (match_operand:SI 0 "register_operand" "r,r")) + (mem:BLK (match_operand:SI 1 "register_operand" "r,r"))) + (clobber (match_operand:SI 2 "register_operand" "=&r,&r")) ;loop cnt/tmp +@@ -3337,7 +3337,7 @@ + } + }") + +-(define_insn "movmemsi_postreload" ++(define_insn "cpymemsi_postreload" + [(set (mem:BLK (match_operand:SI 0 "register_operand" "+r,r")) + (mem:BLK (match_operand:SI 1 "register_operand" "+r,r"))) + (clobber (match_operand:SI 2 "register_operand" "=&r,&r")) ;loop cnt/tmp +@@ -3352,7 +3352,7 @@ + "* return pa_output_block_move (operands, !which_alternative);" + [(set_attr "type" "multi,multi")]) + +-(define_expand "movmemdi" ++(define_expand "cpymemdi" + [(parallel [(set (match_operand:BLK 0 "" "") + (match_operand:BLK 1 "" "")) + (clobber (match_dup 4)) +@@ -3432,7 +3432,7 @@ + ;; operands 0 and 1 are both equivalent to symbolic MEMs. Thus, we are + ;; forced to internally copy operands 0 and 1 to operands 7 and 8, + ;; respectively. We then split or peephole optimize after reload. +-(define_insn "movmemdi_prereload" ++(define_insn "cpymemdi_prereload" + [(set (mem:BLK (match_operand:DI 0 "register_operand" "r,r")) + (mem:BLK (match_operand:DI 1 "register_operand" "r,r"))) + (clobber (match_operand:DI 2 "register_operand" "=&r,&r")) ;loop cnt/tmp +@@ -3525,7 +3525,7 @@ + } + }") + +-(define_insn "movmemdi_postreload" ++(define_insn "cpymemdi_postreload" + [(set (mem:BLK (match_operand:DI 0 "register_operand" "+r,r")) + (mem:BLK (match_operand:DI 1 "register_operand" "+r,r"))) + (clobber (match_operand:DI 2 "register_operand" "=&r,&r")) ;loop cnt/tmp +diff --git a/gcc/config/pdp11/pdp11.md b/gcc/config/pdp11/pdp11.md +index ce781db06..be5ddc4c3 100644 +--- a/gcc/config/pdp11/pdp11.md ++++ b/gcc/config/pdp11/pdp11.md +@@ -26,7 +26,7 @@ + UNSPECV_BLOCKAGE + UNSPECV_SETD + UNSPECV_SETI +- UNSPECV_MOVMEM ++ UNSPECV_CPYMEM + ]) + + (define_constants +@@ -664,8 +664,8 @@ + [(set_attr "length" "2,2,4,4,2")]) + + ;; Expand a block move. We turn this into a move loop. +-(define_expand "movmemhi" +- [(parallel [(unspec_volatile [(const_int 0)] UNSPECV_MOVMEM) ++(define_expand "cpymemhi" ++ [(parallel [(unspec_volatile [(const_int 0)] UNSPECV_CPYMEM) + (match_operand:BLK 0 "general_operand" "=g") + (match_operand:BLK 1 "general_operand" "g") + (match_operand:HI 2 "immediate_operand" "i") +@@ -694,8 +694,8 @@ + }") + + ;; Expand a block move. We turn this into a move loop. 
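Several of the loop-based expanders touched in the surrounding hunks (m32r, mips and nds32 above; riscv and the pdp11 expander immediately below) share the same overall shape: copy a fixed number of word-sized chunks per loop iteration, then finish the leftover bytes with one short straight-line copy. A stand-alone C model of that shape (the chunk size and the helper name are illustrative only, not taken from any backend):

#include <stddef.h>
#include <string.h>

/* Illustrative chunk size.  The per-target constants play this role in
   the hunks above, e.g. MIPS_MAX_MOVE_BYTES_PER_LOOP_ITER and
   RISCV_MAX_MOVE_BYTES_PER_LOOP_ITER (both four words).  */
#define BYTES_PER_ITER (4 * sizeof (long))

/* Loop-based block copy: whole chunks in a loop, then one short
   straight-line copy for whatever is left over.  */
static void block_copy_model (unsigned char *dst, const unsigned char *src,
                              size_t length)
{
  while (length >= BYTES_PER_ITER)
    {
      memcpy (dst, src, BYTES_PER_ITER);  /* one loop iteration's worth */
      dst += BYTES_PER_ITER;
      src += BYTES_PER_ITER;
      length -= BYTES_PER_ITER;
    }
  if (length != 0)
    memcpy (dst, src, length);            /* leftover, smaller than one chunk */
}

The m32r hunk above emits the same structure at the RTL level: one cpymemsi_internal per full chunk, then one more for the leftover.
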
+-(define_insn_and_split "movmemhi1" +- [(unspec_volatile [(const_int 0)] UNSPECV_MOVMEM) ++(define_insn_and_split "cpymemhi1" ++ [(unspec_volatile [(const_int 0)] UNSPECV_CPYMEM) + (match_operand:HI 0 "register_operand" "+r") + (match_operand:HI 1 "register_operand" "+r") + (match_operand:HI 2 "register_operand" "+r") +@@ -707,7 +707,7 @@ + "" + "#" + "reload_completed" +- [(parallel [(unspec_volatile [(const_int 0)] UNSPECV_MOVMEM) ++ [(parallel [(unspec_volatile [(const_int 0)] UNSPECV_CPYMEM) + (match_dup 0) + (match_dup 1) + (match_dup 2) +@@ -719,8 +719,8 @@ + (clobber (reg:CC CC_REGNUM))])] + "") + +-(define_insn "movmemhi_nocc" +- [(unspec_volatile [(const_int 0)] UNSPECV_MOVMEM) ++(define_insn "cpymemhi_nocc" ++ [(unspec_volatile [(const_int 0)] UNSPECV_CPYMEM) + (match_operand:HI 0 "register_operand" "+r") + (match_operand:HI 1 "register_operand" "+r") + (match_operand:HI 2 "register_operand" "+r") +diff --git a/gcc/config/riscv/riscv.c b/gcc/config/riscv/riscv.c +index b3297a381..49383d857 100644 +--- a/gcc/config/riscv/riscv.c ++++ b/gcc/config/riscv/riscv.c +@@ -3024,7 +3024,7 @@ riscv_block_move_loop (rtx dest, rtx src, HOST_WIDE_INT length, + emit_insn(gen_nop ()); + } + +-/* Expand a movmemsi instruction, which copies LENGTH bytes from ++/* Expand a cpymemsi instruction, which copies LENGTH bytes from + memory reference SRC to memory reference DEST. */ + + bool +diff --git a/gcc/config/riscv/riscv.h b/gcc/config/riscv/riscv.h +index 5130dc826..7e3612641 100644 +--- a/gcc/config/riscv/riscv.h ++++ b/gcc/config/riscv/riscv.h +@@ -829,20 +829,20 @@ while (0) + #undef PTRDIFF_TYPE + #define PTRDIFF_TYPE (POINTER_SIZE == 64 ? "long int" : "int") + +-/* The maximum number of bytes copied by one iteration of a movmemsi loop. */ ++/* The maximum number of bytes copied by one iteration of a cpymemsi loop. */ + + #define RISCV_MAX_MOVE_BYTES_PER_LOOP_ITER (UNITS_PER_WORD * 4) + + /* The maximum number of bytes that can be copied by a straight-line +- movmemsi implementation. */ ++ cpymemsi implementation. */ + + #define RISCV_MAX_MOVE_BYTES_STRAIGHT (RISCV_MAX_MOVE_BYTES_PER_LOOP_ITER * 3) + + /* If a memory-to-memory move would take MOVE_RATIO or more simple +- move-instruction pairs, we will do a movmem or libcall instead. ++ move-instruction pairs, we will do a cpymem or libcall instead. + Do not use move_by_pieces at all when strict alignment is not + in effect but the target has slow unaligned accesses; in this +- case, movmem or libcall is more efficient. */ ++ case, cpymem or libcall is more efficient. */ + + #define MOVE_RATIO(speed) \ + (!STRICT_ALIGNMENT && riscv_slow_unaligned_access_p ? 1 : \ +diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md +index e40535c9e..cfb5fdd6a 100644 +--- a/gcc/config/riscv/riscv.md ++++ b/gcc/config/riscv/riscv.md +@@ -1503,7 +1503,7 @@ + DONE; + }) + +-(define_expand "movmemsi" ++(define_expand "cpymemsi" + [(parallel [(set (match_operand:BLK 0 "general_operand") + (match_operand:BLK 1 "general_operand")) + (use (match_operand:SI 2 "")) +diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c +index 8f046de42..ee07aa9df 100644 +--- a/gcc/config/rs6000/rs6000.c ++++ b/gcc/config/rs6000/rs6000.c +@@ -33472,7 +33472,7 @@ get_prev_label (tree function_name) + return NULL_TREE; + } + +-/* Generate PIC and indirect symbol stubs. */ ++/* Generate external symbol indirection stubs (PIC and non-PIC). 
*/ + + void + machopic_output_stub (FILE *file, const char *symb, const char *stub) +@@ -38392,7 +38392,8 @@ rs6000_call_darwin_1 (rtx value, rtx func_desc, rtx tlsarg, + if ((cookie_val & CALL_LONG) != 0 + && GET_CODE (func_desc) == SYMBOL_REF) + { +- /* FIXME: the longcall opt should not hang off picsymbol stubs. */ ++ /* FIXME: the longcall opt should not hang off this flag, it is most ++ likely incorrect for kernel-mode code-generation. */ + if (darwin_symbol_stubs && TARGET_32BIT) + make_island = true; /* Do nothing yet, retain the CALL_LONG flag. */ + else +diff --git a/gcc/config/rx/rx.md b/gcc/config/rx/rx.md +index 2790882c9..9df73e6ef 100644 +--- a/gcc/config/rx/rx.md ++++ b/gcc/config/rx/rx.md +@@ -46,7 +46,7 @@ + (UNSPEC_CONST 13) + + (UNSPEC_MOVSTR 20) +- (UNSPEC_MOVMEM 21) ++ (UNSPEC_CPYMEM 21) + (UNSPEC_SETMEM 22) + (UNSPEC_STRLEN 23) + (UNSPEC_CMPSTRN 24) +@@ -2449,13 +2449,13 @@ + (set_attr "timings" "1111")] ;; The timing is a guesstimate. + ) + +-(define_expand "movmemsi" ++(define_expand "cpymemsi" + [(parallel + [(set (match_operand:BLK 0 "memory_operand") ;; Dest + (match_operand:BLK 1 "memory_operand")) ;; Source + (use (match_operand:SI 2 "register_operand")) ;; Length in bytes + (match_operand 3 "immediate_operand") ;; Align +- (unspec_volatile:BLK [(reg:SI 1) (reg:SI 2) (reg:SI 3)] UNSPEC_MOVMEM)] ++ (unspec_volatile:BLK [(reg:SI 1) (reg:SI 2) (reg:SI 3)] UNSPEC_CPYMEM)] + )] + "rx_allow_string_insns" + { +@@ -2486,16 +2486,16 @@ + emit_move_insn (len, force_operand (operands[2], NULL_RTX)); + operands[0] = replace_equiv_address_nv (operands[0], addr1); + operands[1] = replace_equiv_address_nv (operands[1], addr2); +- emit_insn (gen_rx_movmem ()); ++ emit_insn (gen_rx_cpymem ()); + DONE; + } + ) + +-(define_insn "rx_movmem" ++(define_insn "rx_cpymem" + [(set (mem:BLK (reg:SI 1)) + (mem:BLK (reg:SI 2))) + (use (reg:SI 3)) +- (unspec_volatile:BLK [(reg:SI 1) (reg:SI 2) (reg:SI 3)] UNSPEC_MOVMEM) ++ (unspec_volatile:BLK [(reg:SI 1) (reg:SI 2) (reg:SI 3)] UNSPEC_CPYMEM) + (clobber (reg:SI 1)) + (clobber (reg:SI 2)) + (clobber (reg:SI 3))] +diff --git a/gcc/config/s390/s390-protos.h b/gcc/config/s390/s390-protos.h +index aa04479ec..b162b26b3 100644 +--- a/gcc/config/s390/s390-protos.h ++++ b/gcc/config/s390/s390-protos.h +@@ -104,7 +104,7 @@ extern void s390_reload_symref_address (rtx , rtx , rtx , bool); + extern void s390_expand_plus_operand (rtx, rtx, rtx); + extern void emit_symbolic_move (rtx *); + extern void s390_load_address (rtx, rtx); +-extern bool s390_expand_movmem (rtx, rtx, rtx); ++extern bool s390_expand_cpymem (rtx, rtx, rtx); + extern void s390_expand_setmem (rtx, rtx, rtx); + extern bool s390_expand_cmpmem (rtx, rtx, rtx, rtx); + extern void s390_expand_vec_strlen (rtx, rtx, rtx); +diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c +index c35666dec..2959f6423 100644 +--- a/gcc/config/s390/s390.c ++++ b/gcc/config/s390/s390.c +@@ -5400,7 +5400,7 @@ legitimize_reload_address (rtx ad, machine_mode mode ATTRIBUTE_UNUSED, + /* Emit code to move LEN bytes from DST to SRC. */ + + bool +-s390_expand_movmem (rtx dst, rtx src, rtx len) ++s390_expand_cpymem (rtx dst, rtx src, rtx len) + { + /* When tuning for z10 or higher we rely on the Glibc functions to + do the right thing. 
Only for constant lengths below 64k we will +@@ -5425,14 +5425,14 @@ s390_expand_movmem (rtx dst, rtx src, rtx len) + { + rtx newdst = adjust_address (dst, BLKmode, o); + rtx newsrc = adjust_address (src, BLKmode, o); +- emit_insn (gen_movmem_short (newdst, newsrc, ++ emit_insn (gen_cpymem_short (newdst, newsrc, + GEN_INT (l > 256 ? 255 : l - 1))); + } + } + + else if (TARGET_MVCLE) + { +- emit_insn (gen_movmem_long (dst, src, convert_to_mode (Pmode, len, 1))); ++ emit_insn (gen_cpymem_long (dst, src, convert_to_mode (Pmode, len, 1))); + } + + else +@@ -5494,7 +5494,7 @@ s390_expand_movmem (rtx dst, rtx src, rtx len) + emit_insn (prefetch); + } + +- emit_insn (gen_movmem_short (dst, src, GEN_INT (255))); ++ emit_insn (gen_cpymem_short (dst, src, GEN_INT (255))); + s390_load_address (dst_addr, + gen_rtx_PLUS (Pmode, dst_addr, GEN_INT (256))); + s390_load_address (src_addr, +@@ -5511,7 +5511,7 @@ s390_expand_movmem (rtx dst, rtx src, rtx len) + emit_jump (loop_start_label); + emit_label (loop_end_label); + +- emit_insn (gen_movmem_short (dst, src, ++ emit_insn (gen_cpymem_short (dst, src, + convert_to_mode (Pmode, count, 1))); + emit_label (end_label); + } +@@ -5563,7 +5563,7 @@ s390_expand_setmem (rtx dst, rtx len, rtx val) + if (l > 1) + { + rtx newdstp1 = adjust_address (dst, BLKmode, o + 1); +- emit_insn (gen_movmem_short (newdstp1, newdst, ++ emit_insn (gen_cpymem_short (newdstp1, newdst, + GEN_INT (l > 257 ? 255 : l - 2))); + } + } +@@ -5670,7 +5670,7 @@ s390_expand_setmem (rtx dst, rtx len, rtx val) + /* Set the first byte in the block to the value and use an + overlapping mvc for the block. */ + emit_move_insn (adjust_address (dst, QImode, 0), val); +- emit_insn (gen_movmem_short (dstp1, dst, GEN_INT (254))); ++ emit_insn (gen_cpymem_short (dstp1, dst, GEN_INT (254))); + } + s390_load_address (dst_addr, + gen_rtx_PLUS (Pmode, dst_addr, GEN_INT (256))); +@@ -5694,7 +5694,7 @@ s390_expand_setmem (rtx dst, rtx len, rtx val) + emit_move_insn (adjust_address (dst, QImode, 0), val); + /* execute only uses the lowest 8 bits of count that's + exactly what we need here. */ +- emit_insn (gen_movmem_short (dstp1, dst, ++ emit_insn (gen_cpymem_short (dstp1, dst, + convert_to_mode (Pmode, count, 1))); + } + +@@ -6336,7 +6336,7 @@ s390_expand_insv (rtx dest, rtx op1, rtx op2, rtx src) + + dest = adjust_address (dest, BLKmode, 0); + set_mem_size (dest, size); +- s390_expand_movmem (dest, src_mem, GEN_INT (size)); ++ s390_expand_cpymem (dest, src_mem, GEN_INT (size)); + return true; + } + +@@ -12408,7 +12408,7 @@ s390_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p, + + s390_check_type_for_vector_abi (type, true, false); + +- if (pass_by_reference (NULL, TYPE_MODE (type), type, false)) ++ if (pass_va_arg_by_reference (type)) + { + if (TARGET_DEBUG_ARG) + { +diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md +index 5a3496ac9..8dc3c12df 100644 +--- a/gcc/config/s390/s390.md ++++ b/gcc/config/s390/s390.md +@@ -3196,17 +3196,17 @@ + + + ; +-; movmemM instruction pattern(s). ++; cpymemM instruction pattern(s). 
+ ; + +-(define_expand "movmem" ++(define_expand "cpymem" + [(set (match_operand:BLK 0 "memory_operand" "") ; destination + (match_operand:BLK 1 "memory_operand" "")) ; source + (use (match_operand:GPR 2 "general_operand" "")) ; count + (match_operand 3 "" "")] + "" + { +- if (s390_expand_movmem (operands[0], operands[1], operands[2])) ++ if (s390_expand_cpymem (operands[0], operands[1], operands[2])) + DONE; + else + FAIL; +@@ -3215,7 +3215,7 @@ + ; Move a block that is up to 256 bytes in length. + ; The block length is taken as (operands[2] % 256) + 1. + +-(define_expand "movmem_short" ++(define_expand "cpymem_short" + [(parallel + [(set (match_operand:BLK 0 "memory_operand" "") + (match_operand:BLK 1 "memory_operand" "")) +@@ -3225,7 +3225,7 @@ + "" + "operands[3] = gen_rtx_SCRATCH (Pmode);") + +-(define_insn "*movmem_short" ++(define_insn "*cpymem_short" + [(set (match_operand:BLK 0 "memory_operand" "=Q,Q,Q,Q") + (match_operand:BLK 1 "memory_operand" "Q,Q,Q,Q")) + (use (match_operand 2 "nonmemory_operand" "n,a,a,a")) +@@ -3293,7 +3293,7 @@ + + ; Move a block of arbitrary length. + +-(define_expand "movmem_long" ++(define_expand "cpymem_long" + [(parallel + [(clobber (match_dup 2)) + (clobber (match_dup 3)) +@@ -3327,7 +3327,7 @@ + operands[3] = reg1; + }) + +-(define_insn "*movmem_long" ++(define_insn "*cpymem_long" + [(clobber (match_operand: 0 "register_operand" "=d")) + (clobber (match_operand: 1 "register_operand" "=d")) + (set (mem:BLK (subreg:P (match_operand: 2 "register_operand" "0") 0)) +@@ -3340,7 +3340,7 @@ + [(set_attr "length" "8") + (set_attr "type" "vs")]) + +-(define_insn "*movmem_long_31z" ++(define_insn "*cpymem_long_31z" + [(clobber (match_operand:TI 0 "register_operand" "=d")) + (clobber (match_operand:TI 1 "register_operand" "=d")) + (set (mem:BLK (subreg:SI (match_operand:TI 2 "register_operand" "0") 4)) +diff --git a/gcc/config/sh/sh.md b/gcc/config/sh/sh.md +index fdb80d5d9..e687cf22a 100644 +--- a/gcc/config/sh/sh.md ++++ b/gcc/config/sh/sh.md +@@ -8906,7 +8906,7 @@ + + ;; String/block move insn. + +-(define_expand "movmemsi" ++(define_expand "cpymemsi" + [(parallel [(set (mem:BLK (match_operand:BLK 0)) + (mem:BLK (match_operand:BLK 1))) + (use (match_operand:SI 2 "nonmemory_operand")) +diff --git a/gcc/config/sparc/sparc.c b/gcc/config/sparc/sparc.c +index a993aab76..02966fd03 100644 +--- a/gcc/config/sparc/sparc.c ++++ b/gcc/config/sparc/sparc.c +@@ -7965,7 +7965,7 @@ sparc_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p, + bool indirect; + tree ptrtype = build_pointer_type (type); + +- if (pass_by_reference (NULL, TYPE_MODE (type), type, false)) ++ if (pass_va_arg_by_reference (type)) + { + indirect = true; + size = rsize = UNITS_PER_WORD; +diff --git a/gcc/config/sparc/sparc.h b/gcc/config/sparc/sparc.h +index 4b09fc86b..8807a56f4 100644 +--- a/gcc/config/sparc/sparc.h ++++ b/gcc/config/sparc/sparc.h +@@ -1419,7 +1419,7 @@ do { \ + #define MOVE_MAX 8 + + /* If a memory-to-memory move would take MOVE_RATIO or more simple +- move-instruction pairs, we will do a movmem or libcall instead. */ ++ move-instruction pairs, we will do a cpymem or libcall instead. */ + + #define MOVE_RATIO(speed) ((speed) ? 8 : 3) + +diff --git a/gcc/config/spu/spu.c b/gcc/config/spu/spu.c +index 8d7439e69..ecc767bfa 100644 +--- a/gcc/config/spu/spu.c ++++ b/gcc/config/spu/spu.c +@@ -4053,8 +4053,7 @@ spu_gimplify_va_arg_expr (tree valist, tree type, gimple_seq * pre_p, + + /* if an object is dynamically sized, a pointer to it is passed + instead of the object itself. 
*/ +- pass_by_reference_p = pass_by_reference (NULL, TYPE_MODE (type), type, +- false); ++ pass_by_reference_p = pass_va_arg_by_reference (type); + if (pass_by_reference_p) + type = build_pointer_type (type); + size = int_size_in_bytes (type); +diff --git a/gcc/config/tilegx/tilegx.c b/gcc/config/tilegx/tilegx.c +index 82226da3a..d12f1a99d 100644 +--- a/gcc/config/tilegx/tilegx.c ++++ b/gcc/config/tilegx/tilegx.c +@@ -471,8 +471,7 @@ tilegx_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p, + + /* If an object is dynamically sized, a pointer to it is passed + instead of the object itself. */ +- pass_by_reference_p = pass_by_reference (NULL, TYPE_MODE (type), type, +- false); ++ pass_by_reference_p = pass_va_arg_by_reference (type); + + if (pass_by_reference_p) + type = build_pointer_type (type); +diff --git a/gcc/config/tilepro/tilepro.c b/gcc/config/tilepro/tilepro.c +index c8d69d32f..f1a0df0ad 100644 +--- a/gcc/config/tilepro/tilepro.c ++++ b/gcc/config/tilepro/tilepro.c +@@ -419,8 +419,7 @@ tilepro_gimplify_va_arg_expr (tree valist, tree type, gimple_seq * pre_p, + + /* if an object is dynamically sized, a pointer to it is passed + instead of the object itself. */ +- pass_by_reference_p = pass_by_reference (NULL, TYPE_MODE (type), type, +- false); ++ pass_by_reference_p = pass_va_arg_by_reference (type); + + if (pass_by_reference_p) + type = build_pointer_type (type); +diff --git a/gcc/config/vax/vax-protos.h b/gcc/config/vax/vax-protos.h +index a76cf0239..a85cf3611 100644 +--- a/gcc/config/vax/vax-protos.h ++++ b/gcc/config/vax/vax-protos.h +@@ -31,7 +31,6 @@ extern void vax_expand_addsub_di_operands (rtx *, enum rtx_code); + extern const char * vax_output_int_move (rtx, rtx *, machine_mode); + extern const char * vax_output_int_add (rtx_insn *, rtx *, machine_mode); + extern const char * vax_output_int_subtract (rtx_insn *, rtx *, machine_mode); +-extern const char * vax_output_movmemsi (rtx, rtx *); + #endif /* RTX_CODE */ + + #ifdef REAL_VALUE_TYPE +diff --git a/gcc/config/vax/vax.h b/gcc/config/vax/vax.h +index a6a8227f7..e7137dc09 100644 +--- a/gcc/config/vax/vax.h ++++ b/gcc/config/vax/vax.h +@@ -430,7 +430,7 @@ enum reg_class { NO_REGS, ALL_REGS, LIM_REG_CLASSES }; + #define MOVE_MAX 8 + + /* If a memory-to-memory move would take MOVE_RATIO or more simple +- move-instruction pairs, we will do a movmem or libcall instead. */ ++ move-instruction pairs, we will do a cpymem or libcall instead. */ + #define MOVE_RATIO(speed) ((speed) ? 6 : 3) + #define CLEAR_RATIO(speed) ((speed) ? 6 : 2) + +diff --git a/gcc/config/vax/vax.md b/gcc/config/vax/vax.md +index bfeae7f80..298f3393d 100644 +--- a/gcc/config/vax/vax.md ++++ b/gcc/config/vax/vax.md +@@ -206,8 +206,8 @@ + }") + + ;; This is here to accept 4 arguments and pass the first 3 along +-;; to the movmemhi1 pattern that really does the work. +-(define_expand "movmemhi" ++;; to the cpymemhi1 pattern that really does the work. ++(define_expand "cpymemhi" + [(set (match_operand:BLK 0 "general_operand" "=g") + (match_operand:BLK 1 "general_operand" "g")) + (use (match_operand:HI 2 "general_operand" "g")) +@@ -215,7 +215,7 @@ + "" + " + { +- emit_insn (gen_movmemhi1 (operands[0], operands[1], operands[2])); ++ emit_insn (gen_cpymemhi1 (operands[0], operands[1], operands[2])); + DONE; + }") + +@@ -224,7 +224,7 @@ + ;; that anything generated as this insn will be recognized as one + ;; and that it won't successfully combine with anything. 
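A second theme in these hunks, separate from the block-copy renames, is the repeated one-line change in the va_arg gimplifiers (pa, mips, msp430, s390, sparc, spu, tilegx and tilepro above; visium and xtensa below): the open-coded call pass_by_reference (NULL, TYPE_MODE (type), type, false) becomes pass_va_arg_by_reference (type). The helper's upstream definition is not part of this patch; behaviorally it stands in for the old four-argument expression, roughly as in the sketch below (GCC-internal types, hypothetical _equiv name, not compilable on its own):

/* Sketch only: models the substitution visible at every call site in
   these hunks.  'tree' and pass_by_reference are GCC internals; the
   _equiv name is made up for illustration, and the body simply restates
   the expression being replaced rather than the upstream implementation.  */
static inline bool
pass_va_arg_by_reference_equiv (tree type)
{
  return pass_by_reference (NULL, TYPE_MODE (type), type, /*named=*/false);
}
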
+ +-(define_insn "movmemhi1" ++(define_insn "cpymemhi1" + [(set (match_operand:BLK 0 "memory_operand" "=o") + (match_operand:BLK 1 "memory_operand" "o")) + (use (match_operand:HI 2 "general_operand" "g")) +diff --git a/gcc/config/visium/visium.c b/gcc/config/visium/visium.c +index 431f64cfc..4ff331362 100644 +--- a/gcc/config/visium/visium.c ++++ b/gcc/config/visium/visium.c +@@ -1637,8 +1637,7 @@ visium_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p, + tree f_ovfl, f_gbase, f_fbase, f_gbytes, f_fbytes; + tree ovfl, base, bytes; + HOST_WIDE_INT size, rsize; +- const bool by_reference_p +- = pass_by_reference (NULL, TYPE_MODE (type), type, false); ++ const bool by_reference_p = pass_va_arg_by_reference (type); + const bool float_reg_arg_p + = (TARGET_FPU && !by_reference_p + && ((GET_MODE_CLASS (TYPE_MODE (type)) == MODE_FLOAT +diff --git a/gcc/config/visium/visium.h b/gcc/config/visium/visium.h +index 817e7dc70..c9376b28f 100644 +--- a/gcc/config/visium/visium.h ++++ b/gcc/config/visium/visium.h +@@ -1138,8 +1138,8 @@ do \ + always make code faster, but eventually incurs high cost in + increased code size. + +- Since we have a movmemsi pattern, the default MOVE_RATIO is 2, which +- is too low given that movmemsi will invoke a libcall. */ ++ Since we have a cpymemsi pattern, the default MOVE_RATIO is 2, which ++ is too low given that cpymemsi will invoke a libcall. */ + #define MOVE_RATIO(speed) ((speed) ? 9 : 3) + + /* `CLEAR_RATIO (SPEED)` +diff --git a/gcc/config/visium/visium.md b/gcc/config/visium/visium.md +index f53544134..e146b89d1 100644 +--- a/gcc/config/visium/visium.md ++++ b/gcc/config/visium/visium.md +@@ -3006,7 +3006,7 @@ + ;; Argument 2 is the length + ;; Argument 3 is the alignment + +-(define_expand "movmemsi" ++(define_expand "cpymemsi" + [(parallel [(set (match_operand:BLK 0 "memory_operand" "") + (match_operand:BLK 1 "memory_operand" "")) + (use (match_operand:SI 2 "general_operand" "")) +diff --git a/gcc/config/xtensa/xtensa.c b/gcc/config/xtensa/xtensa.c +index ee5612441..b275deafa 100644 +--- a/gcc/config/xtensa/xtensa.c ++++ b/gcc/config/xtensa/xtensa.c +@@ -3252,7 +3252,7 @@ xtensa_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p, + tree lab_false, lab_over, lab_false2; + bool indirect; + +- indirect = pass_by_reference (NULL, TYPE_MODE (type), type, false); ++ indirect = pass_va_arg_by_reference (type); + if (indirect) + type = build_pointer_type (type); + +diff --git a/gcc/config/xtensa/xtensa.md b/gcc/config/xtensa/xtensa.md +index 362e5ff3c..d1448a02f 100644 +--- a/gcc/config/xtensa/xtensa.md ++++ b/gcc/config/xtensa/xtensa.md +@@ -1026,7 +1026,7 @@ + + ;; Block moves + +-(define_expand "movmemsi" ++(define_expand "cpymemsi" + [(parallel [(set (match_operand:BLK 0 "" "") + (match_operand:BLK 1 "" "")) + (use (match_operand:SI 2 "arith_operand" "")) +diff --git a/gcc/coretypes.h b/gcc/coretypes.h +index 2f6b8599d..88fe8a3f9 100644 +--- a/gcc/coretypes.h ++++ b/gcc/coretypes.h +@@ -153,6 +153,14 @@ struct cl_option_handlers; + struct diagnostic_context; + struct pretty_printer; + ++template struct array_traits; ++ ++/* Provides a read-only bitmap view of a single integer bitmask or an ++ array of integer bitmasks, or of a wrapper around such bitmasks. */ ++template, ++ bool has_constant_size = Traits::has_constant_size> ++struct bitmap_view; ++ + /* Address space number for named address space support. 
*/ + typedef unsigned char addr_space_t; + +@@ -332,6 +340,7 @@ namespace gcc { + } + + typedef std::pair tree_pair; ++typedef std::pair string_int_pair; + + /* Define a name->value mapping. */ + template +diff --git a/gcc/coverage.c b/gcc/coverage.c +index 1ffefd5f4..a63cb94e9 100644 +--- a/gcc/coverage.c ++++ b/gcc/coverage.c +@@ -643,7 +643,7 @@ coverage_begin_function (unsigned lineno_checksum, unsigned cfg_checksum) + (DECL_ASSEMBLER_NAME (current_function_decl))); + gcov_write_unsigned (DECL_ARTIFICIAL (current_function_decl) + && !DECL_FUNCTION_VERSIONED (current_function_decl) +- && !DECL_LAMBDA_FUNCTION (current_function_decl)); ++ && !DECL_LAMBDA_FUNCTION_P (current_function_decl)); + gcov_write_filename (xloc.file); + gcov_write_unsigned (xloc.line); + gcov_write_unsigned (xloc.column); +diff --git a/gcc/cp/call.c b/gcc/cp/call.c +index 23a54f3c3..3a821de7a 100644 +--- a/gcc/cp/call.c ++++ b/gcc/cp/call.c +@@ -9166,12 +9166,14 @@ maybe_warn_class_memaccess (location_t loc, tree fndecl, + } + + /* Build and return a call to FN, using NARGS arguments in ARGARRAY. ++ If FN is the result of resolving an overloaded target built-in, ++ ORIG_FNDECL is the original function decl, otherwise it is null. + This function performs no overload resolution, conversion, or other + high-level operations. */ + + tree + build_cxx_call (tree fn, int nargs, tree *argarray, +- tsubst_flags_t complain) ++ tsubst_flags_t complain, tree orig_fndecl) + { + tree fndecl; + +@@ -9181,11 +9183,13 @@ build_cxx_call (tree fn, int nargs, tree *argarray, + SET_EXPR_LOCATION (fn, loc); + + fndecl = get_callee_fndecl (fn); ++ if (!orig_fndecl) ++ orig_fndecl = fndecl; + + /* Check that arguments to builtin functions match the expectations. */ + if (fndecl + && !processing_template_decl +- && fndecl_built_in_p (fndecl, BUILT_IN_NORMAL)) ++ && fndecl_built_in_p (fndecl)) + { + int i; + +@@ -9195,7 +9199,7 @@ build_cxx_call (tree fn, int nargs, tree *argarray, + argarray[i] = maybe_constant_value (argarray[i]); + + if (!check_builtin_function_arguments (EXPR_LOCATION (fn), vNULL, fndecl, +- nargs, argarray)) ++ orig_fndecl, nargs, argarray)) + return error_mark_node; + } + +diff --git a/gcc/cp/cp-objcp-common.h b/gcc/cp/cp-objcp-common.h +index 89a889a7d..e5d34f180 100644 +--- a/gcc/cp/cp-objcp-common.h ++++ b/gcc/cp/cp-objcp-common.h +@@ -35,6 +35,8 @@ extern tree cp_get_global_decls (); + extern tree cp_pushdecl (tree); + extern void cp_register_dumps (gcc::dump_manager *); + extern tree cxx_make_type_hook (tree_code); ++extern tree cxx_simulate_enum_decl (location_t, const char *, ++ vec); + + /* Lang hooks that are shared between C++ and ObjC++ are defined here. 
Hooks + specific to C++ or ObjC++ go in cp/cp-lang.c and objcp/objcp-lang.c, +@@ -100,6 +102,9 @@ extern tree cxx_make_type_hook (tree_code); + #define LANG_HOOKS_BUILTIN_FUNCTION cxx_builtin_function + #undef LANG_HOOKS_BUILTIN_FUNCTION_EXT_SCOPE + #define LANG_HOOKS_BUILTIN_FUNCTION_EXT_SCOPE cxx_builtin_function_ext_scope ++#undef LANG_HOOKS_SIMULATE_BUILTIN_FUNCTION_DECL ++#define LANG_HOOKS_SIMULATE_BUILTIN_FUNCTION_DECL \ ++ cxx_simulate_builtin_function_decl + #undef LANG_HOOKS_TYPE_HASH_EQ + #define LANG_HOOKS_TYPE_HASH_EQ cxx_type_hash_eq + #undef LANG_HOOKS_COPY_LANG_QUALIFIERS +@@ -128,6 +133,8 @@ extern tree cxx_make_type_hook (tree_code); + + #undef LANG_HOOKS_MAKE_TYPE + #define LANG_HOOKS_MAKE_TYPE cxx_make_type_hook ++#undef LANG_HOOKS_SIMULATE_ENUM_DECL ++#define LANG_HOOKS_SIMULATE_ENUM_DECL cxx_simulate_enum_decl + #undef LANG_HOOKS_TYPE_FOR_MODE + #define LANG_HOOKS_TYPE_FOR_MODE c_common_type_for_mode + #undef LANG_HOOKS_TYPE_FOR_SIZE +diff --git a/gcc/cp/cp-tree.h b/gcc/cp/cp-tree.h +index f7c3eea4c..4bba1887f 100644 +--- a/gcc/cp/cp-tree.h ++++ b/gcc/cp/cp-tree.h +@@ -6245,7 +6245,8 @@ extern tree perform_direct_initialization_if_possible (tree, tree, bool, + tsubst_flags_t); + extern tree in_charge_arg_for_name (tree); + extern tree build_cxx_call (tree, int, tree *, +- tsubst_flags_t); ++ tsubst_flags_t, ++ tree = NULL_TREE); + extern bool is_std_init_list (tree); + extern bool is_list_ctor (tree); + extern void validate_conversion_obstack (void); +@@ -6451,6 +6452,7 @@ extern tmpl_spec_kind current_tmpl_spec_kind (int); + extern tree cp_fname_init (const char *, tree *); + extern tree cxx_builtin_function (tree decl); + extern tree cxx_builtin_function_ext_scope (tree decl); ++extern tree cxx_simulate_builtin_function_decl (tree); + extern tree check_elaborated_type_specifier (enum tag_types, tree, bool); + extern void warn_extern_redeclared_static (tree, tree); + extern tree cxx_comdat_group (tree); +@@ -7386,7 +7388,8 @@ extern tree get_member_function_from_ptrfunc (tree *, tree, tsubst_flags_t); + extern tree cp_build_function_call_nary (tree, tsubst_flags_t, ...) + ATTRIBUTE_SENTINEL; + extern tree cp_build_function_call_vec (tree, vec **, +- tsubst_flags_t); ++ tsubst_flags_t, ++ tree = NULL_TREE); + extern tree build_x_binary_op (const op_location_t &, + enum tree_code, tree, + enum tree_code, tree, +diff --git a/gcc/cp/decl.c b/gcc/cp/decl.c +index 5c82c2272..928ac3f21 100644 +--- a/gcc/cp/decl.c ++++ b/gcc/cp/decl.c +@@ -2273,7 +2273,8 @@ next_arg:; + DECL_NO_INSTRUMENT_FUNCTION_ENTRY_EXIT (newdecl) + |= DECL_NO_INSTRUMENT_FUNCTION_ENTRY_EXIT (olddecl); + DECL_NO_LIMIT_STACK (newdecl) |= DECL_NO_LIMIT_STACK (olddecl); +- DECL_IS_OPERATOR_NEW (newdecl) |= DECL_IS_OPERATOR_NEW (olddecl); ++ if (DECL_IS_OPERATOR_NEW_P (olddecl)) ++ DECL_SET_IS_OPERATOR_NEW (newdecl, true); + DECL_LOOPING_CONST_OR_PURE_P (newdecl) + |= DECL_LOOPING_CONST_OR_PURE_P (olddecl); + +@@ -2520,8 +2521,7 @@ next_arg:; + if (fndecl_built_in_p (olddecl) + && (new_defines_function ? GNU_INLINE_P (newdecl) : types_match)) + { +- DECL_BUILT_IN_CLASS (newdecl) = DECL_BUILT_IN_CLASS (olddecl); +- DECL_FUNCTION_CODE (newdecl) = DECL_FUNCTION_CODE (olddecl); ++ copy_decl_built_in_function (newdecl, olddecl); + /* If we're keeping the built-in definition, keep the rtl, + regardless of declaration matches. 
*/ + COPY_DECL_RTL (olddecl, newdecl); +@@ -4335,10 +4335,10 @@ cxx_init_decl_processing (void) + deltype = build_exception_variant (deltype, empty_except_spec); + tree opnew = push_cp_library_fn (NEW_EXPR, newtype, 0); + DECL_IS_MALLOC (opnew) = 1; +- DECL_IS_OPERATOR_NEW (opnew) = 1; ++ DECL_SET_IS_OPERATOR_NEW (opnew, true); + opnew = push_cp_library_fn (VEC_NEW_EXPR, newtype, 0); + DECL_IS_MALLOC (opnew) = 1; +- DECL_IS_OPERATOR_NEW (opnew) = 1; ++ DECL_SET_IS_OPERATOR_NEW (opnew, true); + push_cp_library_fn (DELETE_EXPR, deltype, ECF_NOTHROW); + push_cp_library_fn (VEC_DELETE_EXPR, deltype, ECF_NOTHROW); + if (flag_sized_deallocation) +@@ -4371,10 +4371,10 @@ cxx_init_decl_processing (void) + newtype = build_exception_variant (newtype, new_eh_spec); + opnew = push_cp_library_fn (NEW_EXPR, newtype, 0); + DECL_IS_MALLOC (opnew) = 1; +- DECL_IS_OPERATOR_NEW (opnew) = 1; ++ DECL_SET_IS_OPERATOR_NEW (opnew, true); + opnew = push_cp_library_fn (VEC_NEW_EXPR, newtype, 0); + DECL_IS_MALLOC (opnew) = 1; +- DECL_IS_OPERATOR_NEW (opnew) = 1; ++ DECL_SET_IS_OPERATOR_NEW (opnew, true); + + /* operator delete (void *, align_val_t); */ + deltype = build_function_type_list (void_type_node, ptr_type_node, +@@ -4614,6 +4614,19 @@ cxx_builtin_function_ext_scope (tree decl) + return builtin_function_1 (decl, NULL_TREE, true); + } + ++/* Implement LANG_HOOKS_SIMULATE_BUILTIN_FUNCTION_DECL. */ ++ ++tree ++cxx_simulate_builtin_function_decl (tree decl) ++{ ++ retrofit_lang_decl (decl); ++ ++ DECL_ARTIFICIAL (decl) = 1; ++ SET_DECL_LANGUAGE (decl, lang_cplusplus); ++ DECL_CONTEXT (decl) = FROB_CONTEXT (current_namespace); ++ return pushdecl (decl); ++} ++ + /* Generate a FUNCTION_DECL with the typical flags for a runtime library + function. Not called directly. */ + +@@ -13570,7 +13583,7 @@ grok_op_properties (tree decl, bool complain) + coerce_delete_type (decl, loc); + else + { +- DECL_IS_OPERATOR_NEW (decl) = 1; ++ DECL_SET_IS_OPERATOR_NEW (decl, true); + TREE_TYPE (decl) = coerce_new_type (TREE_TYPE (decl), loc); + } + +@@ -15119,6 +15132,40 @@ lookup_enumerator (tree enumtype, tree name) + return e? TREE_VALUE (e) : NULL_TREE; + } + ++/* Implement LANG_HOOKS_SIMULATE_ENUM_DECL. */ ++ ++tree ++cxx_simulate_enum_decl (location_t loc, const char *name, ++ vec values) ++{ ++ location_t saved_loc = input_location; ++ input_location = loc; ++ ++ tree enumtype = start_enum (get_identifier (name), NULL_TREE, NULL_TREE, ++ NULL_TREE, false, NULL); ++ if (!OPAQUE_ENUM_P (enumtype)) ++ { ++ error_at (loc, "multiple definition of %q#T", enumtype); ++ inform (DECL_SOURCE_LOCATION (TYPE_MAIN_DECL (enumtype)), ++ "previous definition here"); ++ return enumtype; ++ } ++ SET_OPAQUE_ENUM_P (enumtype, false); ++ DECL_SOURCE_LOCATION (TYPE_NAME (enumtype)) = loc; ++ ++ string_int_pair *value; ++ unsigned int i; ++ FOR_EACH_VEC_ELT (values, i, value) ++ build_enumerator (get_identifier (value->first), ++ build_int_cst (integer_type_node, value->second), ++ enumtype, NULL_TREE, loc); ++ ++ finish_enum_value_list (enumtype); ++ finish_enum (enumtype); ++ ++ input_location = saved_loc; ++ return enumtype; ++} + + /* We're defining DECL. Make sure that its type is OK. */ + +diff --git a/gcc/cp/parser.c b/gcc/cp/parser.c +index 60fe58e03..6fc6ed4e3 100644 +--- a/gcc/cp/parser.c ++++ b/gcc/cp/parser.c +@@ -10977,7 +10977,7 @@ cp_parser_lambda_declarator_opt (cp_parser* parser, tree lambda_expr) + DECL_ARTIFICIAL (fco) = 1; + /* Give the object parameter a different name. 
*/ + DECL_NAME (DECL_ARGUMENTS (fco)) = closure_identifier; +- DECL_LAMBDA_FUNCTION (fco) = 1; ++ DECL_SET_LAMBDA_FUNCTION (fco, true); + } + if (template_param_list) + { +diff --git a/gcc/cp/pt.c b/gcc/cp/pt.c +index ff7921533..bd6df79a4 100644 +--- a/gcc/cp/pt.c ++++ b/gcc/cp/pt.c +@@ -28431,9 +28431,8 @@ declare_integer_pack (void) + NULL_TREE), + NULL_TREE, ECF_CONST); + DECL_DECLARED_CONSTEXPR_P (ipfn) = true; +- DECL_BUILT_IN_CLASS (ipfn) = BUILT_IN_FRONTEND; +- DECL_FUNCTION_CODE (ipfn) +- = (enum built_in_function) (int) CP_BUILT_IN_INTEGER_PACK; ++ set_decl_built_in_function (ipfn, BUILT_IN_FRONTEND, ++ CP_BUILT_IN_INTEGER_PACK); + } + + /* Set up the hash tables for template instantiations. */ +diff --git a/gcc/cp/typeck.c b/gcc/cp/typeck.c +index c42fd731c..82f7bb0bd 100644 +--- a/gcc/cp/typeck.c ++++ b/gcc/cp/typeck.c +@@ -3738,11 +3738,11 @@ build_function_call (location_t /*loc*/, + tree + build_function_call_vec (location_t /*loc*/, vec /*arg_loc*/, + tree function, vec *params, +- vec * /*origtypes*/) ++ vec * /*origtypes*/, tree orig_function) + { + vec *orig_params = params; + tree ret = cp_build_function_call_vec (function, ¶ms, +- tf_warning_or_error); ++ tf_warning_or_error, orig_function); + + /* cp_build_function_call_vec can reallocate PARAMS by adding + default arguments. That should never happen here. Verify +@@ -3787,13 +3787,15 @@ cp_build_function_call_nary (tree function, tsubst_flags_t complain, ...) + return ret; + } + +-/* Build a function call using a vector of arguments. PARAMS may be +- NULL if there are no parameters. This changes the contents of +- PARAMS. */ ++/* Build a function call using a vector of arguments. ++ If FUNCTION is the result of resolving an overloaded target built-in, ++ ORIG_FNDECL is the original function decl, otherwise it is null. ++ PARAMS may be NULL if there are no parameters. This changes the ++ contents of PARAMS. */ + + tree + cp_build_function_call_vec (tree function, vec **params, +- tsubst_flags_t complain) ++ tsubst_flags_t complain, tree orig_fndecl) + { + tree fntype, fndecl; + int is_method; +@@ -3918,7 +3920,7 @@ cp_build_function_call_vec (tree function, vec **params, + bool warned_p = check_function_arguments (input_location, fndecl, fntype, + nargs, argarray, NULL); + +- ret = build_cxx_call (function, nargs, argarray, complain); ++ ret = build_cxx_call (function, nargs, argarray, complain, orig_fndecl); + + if (warned_p) + { +diff --git a/gcc/cse.c b/gcc/cse.c +index 6c9cda16a..18eb8dfbb 100644 +--- a/gcc/cse.c ++++ b/gcc/cse.c +@@ -559,7 +559,6 @@ static struct table_elt *insert_with_costs (rtx, struct table_elt *, unsigned, + static struct table_elt *insert (rtx, struct table_elt *, unsigned, + machine_mode); + static void merge_equiv_classes (struct table_elt *, struct table_elt *); +-static void invalidate_reg (rtx, bool); + static void invalidate (rtx, machine_mode); + static void remove_invalid_refs (unsigned int); + static void remove_invalid_subreg_refs (unsigned int, poly_uint64, +@@ -1821,12 +1820,10 @@ check_dependence (const_rtx x, rtx exp, machine_mode mode, rtx addr) + } + + /* Remove from the hash table, or mark as invalid, all expressions whose +- values could be altered by storing in register X. +- +- CLOBBER_HIGH is set if X was part of a CLOBBER_HIGH expression. */ ++ values could be altered by storing in register X. 
*/ + + static void +-invalidate_reg (rtx x, bool clobber_high) ++invalidate_reg (rtx x) + { + gcc_assert (GET_CODE (x) == REG); + +@@ -1851,10 +1848,7 @@ invalidate_reg (rtx x, bool clobber_high) + SUBREG_TICKED (regno) = -1; + + if (regno >= FIRST_PSEUDO_REGISTER) +- { +- gcc_assert (!clobber_high); +- remove_pseudo_from_table (x, hash); +- } ++ remove_pseudo_from_table (x, hash); + else + { + HOST_WIDE_INT in_table = TEST_HARD_REG_BIT (hard_regs_in_table, regno); +@@ -1882,18 +1876,10 @@ invalidate_reg (rtx x, bool clobber_high) + if (!REG_P (p->exp) || REGNO (p->exp) >= FIRST_PSEUDO_REGISTER) + continue; + +- if (clobber_high) +- { +- if (reg_is_clobbered_by_clobber_high (p->exp, x)) +- remove_from_table (p, hash); +- } +- else +- { +- unsigned int tregno = REGNO (p->exp); +- unsigned int tendregno = END_REGNO (p->exp); +- if (tendregno > regno && tregno < endregno) +- remove_from_table (p, hash); +- } ++ unsigned int tregno = REGNO (p->exp); ++ unsigned int tendregno = END_REGNO (p->exp); ++ if (tendregno > regno && tregno < endregno) ++ remove_from_table (p, hash); + } + } + } +@@ -1920,7 +1906,7 @@ invalidate (rtx x, machine_mode full_mode) + switch (GET_CODE (x)) + { + case REG: +- invalidate_reg (x, false); ++ invalidate_reg (x); + return; + + case SUBREG: +@@ -4420,8 +4406,6 @@ canonicalize_insn (rtx_insn *insn, struct set **psets, int n_sets) + if (MEM_P (XEXP (x, 0))) + canon_reg (XEXP (x, 0), insn); + } +- else if (GET_CODE (x) == CLOBBER_HIGH) +- gcc_assert (REG_P (XEXP (x, 0))); + else if (GET_CODE (x) == USE + && ! (REG_P (XEXP (x, 0)) + && REGNO (XEXP (x, 0)) < FIRST_PSEUDO_REGISTER)) +@@ -4453,8 +4437,6 @@ canonicalize_insn (rtx_insn *insn, struct set **psets, int n_sets) + if (MEM_P (XEXP (y, 0))) + canon_reg (XEXP (y, 0), insn); + } +- else if (GET_CODE (y) == CLOBBER_HIGH) +- gcc_assert (REG_P (XEXP (y, 0))); + else if (GET_CODE (y) == USE + && ! 
(REG_P (XEXP (y, 0)) + && REGNO (XEXP (y, 0)) < FIRST_PSEUDO_REGISTER)) +@@ -6155,12 +6137,6 @@ invalidate_from_clobbers (rtx_insn *insn) + invalidate (XEXP (ref, 0), GET_MODE (ref)); + } + } +- if (GET_CODE (x) == CLOBBER_HIGH) +- { +- rtx ref = XEXP (x, 0); +- gcc_assert (REG_P (ref)); +- invalidate_reg (ref, true); +- } + else if (GET_CODE (x) == PARALLEL) + { + int i; +@@ -6177,12 +6153,6 @@ invalidate_from_clobbers (rtx_insn *insn) + || GET_CODE (ref) == ZERO_EXTRACT) + invalidate (XEXP (ref, 0), GET_MODE (ref)); + } +- else if (GET_CODE (y) == CLOBBER_HIGH) +- { +- rtx ref = XEXP (y, 0); +- gcc_assert (REG_P (ref)); +- invalidate_reg (ref, true); +- } + } + } + } +@@ -6204,12 +6174,6 @@ invalidate_from_sets_and_clobbers (rtx_insn *insn) + rtx temx = XEXP (tem, 0); + if (GET_CODE (temx) == CLOBBER) + invalidate (SET_DEST (temx), VOIDmode); +- else if (GET_CODE (temx) == CLOBBER_HIGH) +- { +- rtx temref = XEXP (temx, 0); +- gcc_assert (REG_P (temref)); +- invalidate_reg (temref, true); +- } + } + } + +@@ -6237,12 +6201,6 @@ invalidate_from_sets_and_clobbers (rtx_insn *insn) + || GET_CODE (clobbered) == ZERO_EXTRACT) + invalidate (XEXP (clobbered, 0), GET_MODE (clobbered)); + } +- else if (GET_CODE (y) == CLOBBER_HIGH) +- { +- rtx ref = XEXP (y, 0); +- gcc_assert (REG_P (ref)); +- invalidate_reg (ref, true); +- } + else if (GET_CODE (y) == SET && GET_CODE (SET_SRC (y)) == CALL) + invalidate (SET_DEST (y), VOIDmode); + } +@@ -6902,10 +6860,6 @@ count_reg_usage (rtx x, int *counts, rtx dest, int incr) + count_reg_usage (XEXP (XEXP (x, 0), 0), counts, NULL_RTX, incr); + return; + +- case CLOBBER_HIGH: +- gcc_assert (REG_P ((XEXP (x, 0)))); +- return; +- + case SET: + /* Unless we are setting a REG, count everything in SET_DEST. */ + if (!REG_P (SET_DEST (x))) +@@ -6958,8 +6912,7 @@ count_reg_usage (rtx x, int *counts, rtx dest, int incr) + || (REG_NOTE_KIND (x) != REG_NONNEG && GET_CODE (XEXP (x,0)) == USE) + /* FUNCTION_USAGE expression lists may include (CLOBBER (mem /u)), + involving registers in the address. */ +- || GET_CODE (XEXP (x, 0)) == CLOBBER +- || GET_CODE (XEXP (x, 0)) == CLOBBER_HIGH) ++ || GET_CODE (XEXP (x, 0)) == CLOBBER) + count_reg_usage (XEXP (x, 0), counts, NULL_RTX, incr); + + count_reg_usage (XEXP (x, 1), counts, NULL_RTX, incr); +@@ -7043,9 +6996,7 @@ insn_live_p (rtx_insn *insn, int *counts) + if (set_live_p (elt, insn, counts)) + return true; + } +- else if (GET_CODE (elt) != CLOBBER +- && GET_CODE (elt) != CLOBBER_HIGH +- && GET_CODE (elt) != USE) ++ else if (GET_CODE (elt) != CLOBBER && GET_CODE (elt) != USE) + return true; + } + return false; +@@ -7158,7 +7109,7 @@ delete_trivially_dead_insns (rtx_insn *insns, int nreg) + else if (INSN_P (insn)) + { + count_reg_usage (insn, counts, NULL_RTX, 1); +- note_stores (PATTERN (insn), count_stores, counts + nreg * 2); ++ note_stores (insn, count_stores, counts + nreg * 2); + } + /* If there can be debug insns, COUNTS are 3 consecutive arrays. + First one counts how many times each pseudo is used outside +diff --git a/gcc/cselib.c b/gcc/cselib.c +index 108b2588c..e3408bb38 100644 +--- a/gcc/cselib.c ++++ b/gcc/cselib.c +@@ -32,6 +32,7 @@ along with GCC; see the file COPYING3. If not see + #include "dumpfile.h" + #include "cselib.h" + #include "params.h" ++#include "function-abi.h" + + /* A list of cselib_val structures. 
*/ + struct elt_list +@@ -54,8 +55,7 @@ static unsigned int cselib_hash_rtx (rtx, int, machine_mode); + static cselib_val *new_cselib_val (unsigned int, machine_mode, rtx); + static void add_mem_for_addr (cselib_val *, cselib_val *, rtx); + static cselib_val *cselib_lookup_mem (rtx, int); +-static void cselib_invalidate_regno (unsigned int, machine_mode, +- const_rtx = NULL); ++static void cselib_invalidate_regno (unsigned int, machine_mode); + static void cselib_invalidate_mem (rtx); + static void cselib_record_set (rtx, cselib_val *, cselib_val *); + static void cselib_record_sets (rtx_insn *); +@@ -1662,7 +1662,6 @@ cselib_expand_value_rtx_1 (rtx orig, struct expand_value_data *evd, + /* SCRATCH must be shared because they represent distinct values. */ + return orig; + case CLOBBER: +- case CLOBBER_HIGH: + if (REG_P (XEXP (orig, 0)) && HARD_REGISTER_NUM_P (REGNO (XEXP (orig, 0)))) + return orig; + break; +@@ -2165,8 +2164,7 @@ cselib_lookup (rtx x, machine_mode mode, + invalidating call clobbered registers across a call. */ + + static void +-cselib_invalidate_regno (unsigned int regno, machine_mode mode, +- const_rtx setter) ++cselib_invalidate_regno (unsigned int regno, machine_mode mode) + { + unsigned int endregno; + unsigned int i; +@@ -2189,9 +2187,6 @@ cselib_invalidate_regno (unsigned int regno, machine_mode mode, + i = regno - max_value_regs; + + endregno = end_hard_regno (mode, regno); +- +- if (setter && GET_CODE (setter) == CLOBBER_HIGH) +- gcc_assert (endregno == regno + 1); + } + else + { +@@ -2224,19 +2219,6 @@ cselib_invalidate_regno (unsigned int regno, machine_mode mode, + continue; + } + +- /* Ignore if clobber high and the register isn't clobbered. */ +- if (setter && GET_CODE (setter) == CLOBBER_HIGH) +- { +- gcc_assert (endregno == regno + 1); +- const_rtx x = XEXP (setter, 0); +- if (!reg_is_clobbered_by_clobber_high (i, GET_MODE (v->val_rtx), +- x)) +- { +- l = &(*l)->next; +- continue; +- } +- } +- + /* We have an overlap. */ + if (*l == REG_VALUES (i)) + { +@@ -2371,10 +2353,10 @@ cselib_invalidate_mem (rtx mem_rtx) + *vp = &dummy_val; + } + +-/* Invalidate DEST, which is being assigned to or clobbered by SETTER. */ ++/* Invalidate DEST. */ + + void +-cselib_invalidate_rtx (rtx dest, const_rtx setter) ++cselib_invalidate_rtx (rtx dest) + { + while (GET_CODE (dest) == SUBREG + || GET_CODE (dest) == ZERO_EXTRACT +@@ -2382,7 +2364,7 @@ cselib_invalidate_rtx (rtx dest, const_rtx setter) + dest = XEXP (dest, 0); + + if (REG_P (dest)) +- cselib_invalidate_regno (REGNO (dest), GET_MODE (dest), setter); ++ cselib_invalidate_regno (REGNO (dest), GET_MODE (dest)); + else if (MEM_P (dest)) + cselib_invalidate_mem (dest); + } +@@ -2390,10 +2372,10 @@ cselib_invalidate_rtx (rtx dest, const_rtx setter) + /* A wrapper for cselib_invalidate_rtx to be called via note_stores. */ + + static void +-cselib_invalidate_rtx_note_stores (rtx dest, const_rtx setter, ++cselib_invalidate_rtx_note_stores (rtx dest, const_rtx, + void *data ATTRIBUTE_UNUSED) + { +- cselib_invalidate_rtx (dest, setter); ++ cselib_invalidate_rtx (dest); + } + + /* Record the result of a SET instruction. DEST is being set; the source +@@ -2659,7 +2641,7 @@ cselib_record_sets (rtx_insn *insn) + /* Invalidate all locations written by this insn. Note that the elts we + looked up in the previous loop aren't affected, just some of their + locations may go away. 
*/ +- note_stores (body, cselib_invalidate_rtx_note_stores, NULL); ++ note_pattern_stores (body, cselib_invalidate_rtx_note_stores, NULL); + + for (i = n_sets_before_autoinc; i < n_sets; i++) + cselib_invalidate_rtx (sets[i].dest); +@@ -2765,11 +2747,13 @@ cselib_process_insn (rtx_insn *insn) + memory. */ + if (CALL_P (insn)) + { ++ function_abi callee_abi = insn_callee_abi (insn); + for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) +- if (call_used_regs[i] ++ if (call_used_or_fixed_reg_p (i) + || (REG_VALUES (i) && REG_VALUES (i)->elt + && (targetm.hard_regno_call_part_clobbered +- (insn, i, GET_MODE (REG_VALUES (i)->elt->val_rtx))))) ++ (callee_abi.id (), i, ++ GET_MODE (REG_VALUES (i)->elt->val_rtx))))) + cselib_invalidate_regno (i, reg_raw_mode[i]); + + /* Since it is not clear how cselib is going to be used, be +@@ -2794,11 +2778,9 @@ cselib_process_insn (rtx_insn *insn) + if (CALL_P (insn)) + { + for (x = CALL_INSN_FUNCTION_USAGE (insn); x; x = XEXP (x, 1)) +- { +- gcc_assert (GET_CODE (XEXP (x, 0)) != CLOBBER_HIGH); +- if (GET_CODE (XEXP (x, 0)) == CLOBBER) +- cselib_invalidate_rtx (XEXP (XEXP (x, 0), 0)); +- } ++ if (GET_CODE (XEXP (x, 0)) == CLOBBER) ++ cselib_invalidate_rtx (XEXP (XEXP (x, 0), 0)); ++ + /* Flush everything on setjmp. */ + if (cselib_preserve_constants + && find_reg_note (insn, REG_SETJMP, NULL)) +diff --git a/gcc/cselib.h b/gcc/cselib.h +index 8b8d3e8d5..b5854aedc 100644 +--- a/gcc/cselib.h ++++ b/gcc/cselib.h +@@ -92,7 +92,7 @@ extern bool cselib_dummy_expand_value_rtx_cb (rtx, bitmap, int, + cselib_expand_callback, void *); + extern rtx cselib_subst_to_values (rtx, machine_mode); + extern rtx cselib_subst_to_values_from_insn (rtx, machine_mode, rtx_insn *); +-extern void cselib_invalidate_rtx (rtx, const_rtx = NULL); ++extern void cselib_invalidate_rtx (rtx); + + extern void cselib_reset_table (unsigned int); + extern unsigned int cselib_get_next_uid (void); +diff --git a/gcc/d/intrinsics.cc b/gcc/d/intrinsics.cc +index 4bd321b2d..56eab522e 100644 +--- a/gcc/d/intrinsics.cc ++++ b/gcc/d/intrinsics.cc +@@ -134,10 +134,7 @@ maybe_set_intrinsic (FuncDeclaration *decl) + /* If there is no function body, then the implementation is always + provided by the compiler. */ + if (!decl->fbody) +- { +- DECL_BUILT_IN_CLASS (decl->csym) = BUILT_IN_FRONTEND; +- DECL_FUNCTION_CODE (decl->csym) = (built_in_function) code; +- } ++ set_decl_built_in_function (decl->csym, BUILT_IN_FRONTEND, code); + + /* Infer whether the intrinsic can be used for CTFE, let the + front-end know that it can be evaluated at compile-time. */ +diff --git a/gcc/dce.c b/gcc/dce.c +index 68d3713b0..2894fa57b 100644 +--- a/gcc/dce.c ++++ b/gcc/dce.c +@@ -174,7 +174,6 @@ deletable_insn_p (rtx_insn *insn, bool fast, bitmap arg_stores) + return false; + + case CLOBBER: +- case CLOBBER_HIGH: + if (fast) + { + /* A CLOBBER of a dead pseudo register serves no purpose. 
+@@ -244,10 +243,7 @@ static void + mark_nonreg_stores_1 (rtx dest, const_rtx pattern, void *data) + { + if (GET_CODE (pattern) != CLOBBER && !REG_P (dest)) +- { +- gcc_checking_assert (GET_CODE (pattern) != CLOBBER_HIGH); +- mark_insn ((rtx_insn *) data, true); +- } ++ mark_insn ((rtx_insn *) data, true); + } + + +@@ -258,22 +254,19 @@ static void + mark_nonreg_stores_2 (rtx dest, const_rtx pattern, void *data) + { + if (GET_CODE (pattern) != CLOBBER && !REG_P (dest)) +- { +- gcc_checking_assert (GET_CODE (pattern) != CLOBBER_HIGH); +- mark_insn ((rtx_insn *) data, false); +- } ++ mark_insn ((rtx_insn *) data, false); + } + + +-/* Mark INSN if BODY stores to a non-register destination. */ ++/* Mark INSN if it stores to a non-register destination. */ + + static void +-mark_nonreg_stores (rtx body, rtx_insn *insn, bool fast) ++mark_nonreg_stores (rtx_insn *insn, bool fast) + { + if (fast) +- note_stores (body, mark_nonreg_stores_1, insn); ++ note_stores (insn, mark_nonreg_stores_1, insn); + else +- note_stores (body, mark_nonreg_stores_2, insn); ++ note_stores (insn, mark_nonreg_stores_2, insn); + } + + +@@ -691,7 +684,7 @@ prescan_insns_for_dce (bool fast) + if (arg_stores && bitmap_bit_p (arg_stores, INSN_UID (insn))) + continue; + if (deletable_insn_p (insn, fast, arg_stores)) +- mark_nonreg_stores (PATTERN (insn), insn, fast); ++ mark_nonreg_stores (insn, fast); + else + mark_insn (insn, fast); + } +diff --git a/gcc/ddg.c b/gcc/ddg.c +index 82554ed96..47a50d8ea 100644 +--- a/gcc/ddg.c ++++ b/gcc/ddg.c +@@ -84,7 +84,7 @@ static bool + mem_write_insn_p (rtx_insn *insn) + { + mem_ref_p = false; +- note_stores (PATTERN (insn), mark_mem_store, NULL); ++ note_stores (insn, mark_mem_store, NULL); + return mem_ref_p; + } + +diff --git a/gcc/defaults.h b/gcc/defaults.h +index b75342561..72d4fba11 100644 +--- a/gcc/defaults.h ++++ b/gcc/defaults.h +@@ -1318,10 +1318,10 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + #endif + + /* If a memory-to-memory move would take MOVE_RATIO or more simple +- move-instruction sequences, we will do a movmem or libcall instead. */ ++ move-instruction sequences, we will do a cpymem or libcall instead. */ + + #ifndef MOVE_RATIO +-#if defined (HAVE_movmemqi) || defined (HAVE_movmemhi) || defined (HAVE_movmemsi) || defined (HAVE_movmemdi) || defined (HAVE_movmemti) ++#if defined (HAVE_cpymemqi) || defined (HAVE_cpymemhi) || defined (HAVE_cpymemsi) || defined (HAVE_cpymemdi) || defined (HAVE_cpymemti) + #define MOVE_RATIO(speed) 2 + #else + /* If we are optimizing for space (-Os), cut down the default move ratio. */ +@@ -1342,7 +1342,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + #endif + + /* If a memory set (to value other than zero) operation would take +- SET_RATIO or more simple move-instruction sequences, we will do a movmem ++ SET_RATIO or more simple move-instruction sequences, we will do a setmem + or libcall instead. */ + #ifndef SET_RATIO + #define SET_RATIO(speed) MOVE_RATIO (speed) +@@ -1459,4 +1459,18 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + #define DWARF_GNAT_ENCODINGS_DEFAULT DWARF_GNAT_ENCODINGS_GDB + #endif + ++#ifndef USED_FOR_TARGET ++/* Done this way to keep gengtype happy. */ ++#if BITS_PER_UNIT == 8 ++#define TARGET_UNIT uint8_t ++#elif BITS_PER_UNIT == 16 ++#define TARGET_UNIT uint16_t ++#elif BITS_PER_UNIT == 32 ++#define TARGET_UNIT uint32_t ++#else ++#error Unknown BITS_PER_UNIT ++#endif ++typedef TARGET_UNIT target_unit; ++#endif ++ + #endif /* ! 
GCC_DEFAULTS_H */ +diff --git a/gcc/df-core.c b/gcc/df-core.c +index b19ba289d..2181ff131 100644 +--- a/gcc/df-core.c ++++ b/gcc/df-core.c +@@ -2052,7 +2052,7 @@ debug_regset (regset r) + This is part of making a debugging dump. */ + + void +-df_print_regset (FILE *file, bitmap r) ++df_print_regset (FILE *file, const_bitmap r) + { + unsigned int i; + bitmap_iterator bi; +@@ -2077,7 +2077,7 @@ df_print_regset (FILE *file, bitmap r) + debugging dump. */ + + void +-df_print_word_regset (FILE *file, bitmap r) ++df_print_word_regset (FILE *file, const_bitmap r) + { + unsigned int max_reg = max_reg_num (); + +diff --git a/gcc/df-problems.c b/gcc/df-problems.c +index a9dfa6203..3c7aeceb2 100644 +--- a/gcc/df-problems.c ++++ b/gcc/df-problems.c +@@ -388,7 +388,6 @@ df_rd_local_compute (bitmap all_blocks) + { + unsigned int bb_index; + bitmap_iterator bi; +- unsigned int regno; + struct df_rd_problem_data *problem_data + = (struct df_rd_problem_data *) df_rd->problem_data; + bitmap sparse_invalidated = &problem_data->sparse_invalidated_by_call; +@@ -405,10 +404,9 @@ df_rd_local_compute (bitmap all_blocks) + } + + /* Set up the knockout bit vectors to be applied across EH_EDGES. */ +- EXECUTE_IF_SET_IN_BITMAP (regs_invalidated_by_call_regset, 0, regno, bi) +- { +- if (! HARD_REGISTER_NUM_P (regno) +- || !(df->changeable_flags & DF_NO_HARD_REGS)) ++ if (!(df->changeable_flags & DF_NO_HARD_REGS)) ++ for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; ++regno) ++ if (TEST_HARD_REG_BIT (regs_invalidated_by_call, regno)) + { + if (DF_DEFS_COUNT (regno) > DF_SPARSE_THRESHOLD) + bitmap_set_bit (sparse_invalidated, regno); +@@ -417,7 +415,6 @@ df_rd_local_compute (bitmap all_blocks) + DF_DEFS_BEGIN (regno), + DF_DEFS_COUNT (regno)); + } +- } + + bitmap_release (&seen_in_block); + bitmap_release (&seen_in_insn); +@@ -982,7 +979,10 @@ df_lr_confluence_n (edge e) + /* ??? Abnormal call edges ignored for the moment, as this gets + confused by sibling call edges, which crashes reg-stack. */ + if (e->flags & EDGE_EH) +- changed = bitmap_ior_and_compl_into (op1, op2, regs_invalidated_by_call_regset); ++ { ++ bitmap_view eh_kills (regs_invalidated_by_call); ++ changed = bitmap_ior_and_compl_into (op1, op2, eh_kills); ++ } + else + changed = bitmap_ior_into (op1, op2); + +@@ -4093,8 +4093,7 @@ can_move_insns_across (rtx_insn *from, rtx_insn *to, + if (volatile_insn_p (PATTERN (insn))) + return false; + memrefs_in_across |= find_memory (insn); +- note_stores (PATTERN (insn), find_memory_stores, +- &mem_sets_in_across); ++ note_stores (insn, find_memory_stores, &mem_sets_in_across); + /* This is used just to find sets of the stack pointer. */ + memrefs_in_across |= mem_sets_in_across; + trapping_insns_in_across |= may_trap_p (PATTERN (insn)); +@@ -4173,7 +4172,7 @@ can_move_insns_across (rtx_insn *from, rtx_insn *to, + { + int mem_ref_flags = 0; + int mem_set_flags = 0; +- note_stores (PATTERN (insn), find_memory_stores, &mem_set_flags); ++ note_stores (insn, find_memory_stores, &mem_set_flags); + mem_ref_flags = find_memory (insn); + /* Catch sets of the stack pointer. 
*/ + mem_ref_flags |= mem_set_flags; +@@ -4635,8 +4634,10 @@ df_md_confluence_n (edge e) + return false; + + if (e->flags & EDGE_EH) +- return bitmap_ior_and_compl_into (op1, op2, +- regs_invalidated_by_call_regset); ++ { ++ bitmap_view eh_kills (regs_invalidated_by_call); ++ return bitmap_ior_and_compl_into (op1, op2, eh_kills); ++ } + else + return bitmap_ior_into (op1, op2); + } +diff --git a/gcc/df-scan.c b/gcc/df-scan.c +index 84c2e54c8..ea149c6cc 100644 +--- a/gcc/df-scan.c ++++ b/gcc/df-scan.c +@@ -35,7 +35,7 @@ along with GCC; see the file COPYING3. If not see + #include "emit-rtl.h" /* FIXME: Can go away once crtl is moved to rtl.h. */ + #include "dumpfile.h" + #include "calls.h" +- ++#include "function-abi.h" + + /* The set of hard registers in eliminables[i].from. */ + +@@ -312,7 +312,7 @@ df_scan_start_dump (FILE *file ATTRIBUTE_UNUSED) + rtx_insn *insn; + + fprintf (file, ";; invalidated by call \t"); +- df_print_regset (file, regs_invalidated_by_call_regset); ++ df_print_regset (file, bitmap_view (regs_invalidated_by_call)); + fprintf (file, ";; hardware regs used \t"); + df_print_regset (file, &df->hardware_regs_used); + fprintf (file, ";; regular block artificial uses \t"); +@@ -2773,7 +2773,6 @@ df_find_hard_reg_defs (rtx x, HARD_REG_SET *defs) + break; + + case CLOBBER: +- case CLOBBER_HIGH: + df_find_hard_reg_defs_1 (XEXP (x, 0), defs); + break; + +@@ -2833,10 +2832,6 @@ df_uses_record (struct df_collection_rec *collection_rec, + /* If we're clobbering a REG then we have a def so ignore. */ + return; + +- case CLOBBER_HIGH: +- gcc_assert (REG_P (XEXP (x, 0))); +- return; +- + case MEM: + df_uses_record (collection_rec, + &XEXP (x, 0), DF_REF_REG_MEM_LOAD, +@@ -3087,13 +3082,11 @@ df_get_call_refs (struct df_collection_rec *collection_rec, + bool is_sibling_call; + unsigned int i; + HARD_REG_SET defs_generated; +- HARD_REG_SET fn_reg_set_usage; + + CLEAR_HARD_REG_SET (defs_generated); + df_find_hard_reg_defs (PATTERN (insn_info->insn), &defs_generated); + is_sibling_call = SIBLING_CALL_P (insn_info->insn); +- get_call_reg_set_usage (insn_info->insn, &fn_reg_set_usage, +- regs_invalidated_by_call); ++ function_abi callee_abi = insn_callee_abi (insn_info->insn); + + for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) + { +@@ -3117,7 +3110,7 @@ df_get_call_refs (struct df_collection_rec *collection_rec, + NULL, bb, insn_info, DF_REF_REG_DEF, flags); + } + } +- else if (TEST_HARD_REG_BIT (fn_reg_set_usage, i) ++ else if (callee_abi.clobbers_full_reg_p (i) + /* no clobbers for regs that are the result of the call */ + && !TEST_HARD_REG_BIT (defs_generated, i) + && (!is_sibling_call +@@ -3133,7 +3126,6 @@ df_get_call_refs (struct df_collection_rec *collection_rec, + for (note = CALL_INSN_FUNCTION_USAGE (insn_info->insn); note; + note = XEXP (note, 1)) + { +- gcc_assert (GET_CODE (XEXP (note, 0)) != CLOBBER_HIGH); + if (GET_CODE (XEXP (note, 0)) == USE) + df_uses_record (collection_rec, &XEXP (XEXP (note, 0), 0), + DF_REF_REG_USE, bb, insn_info, flags); +@@ -3499,7 +3491,9 @@ df_get_entry_block_def_set (bitmap entry_block_defs) + /* Defs for the callee saved registers are inserted so that the + pushes have some defining location. 
*/ + for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) +- if ((call_used_regs[i] == 0) && (df_regs_ever_live_p (i))) ++ if (!crtl->abi->clobbers_full_reg_p (i) ++ && !fixed_regs[i] ++ && df_regs_ever_live_p (i)) + bitmap_set_bit (entry_block_defs, i); + } + +@@ -3682,8 +3676,9 @@ df_get_exit_block_use_set (bitmap exit_block_uses) + { + /* Mark all call-saved registers that we actually used. */ + for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) +- if (df_regs_ever_live_p (i) && !LOCAL_REGNO (i) +- && !TEST_HARD_REG_BIT (regs_invalidated_by_call, i)) ++ if (df_regs_ever_live_p (i) ++ && !LOCAL_REGNO (i) ++ && !crtl->abi->clobbers_full_reg_p (i)) + bitmap_set_bit (exit_block_uses, i); + } + +diff --git a/gcc/df.h b/gcc/df.h +index d76d31baa..241812235 100644 +--- a/gcc/df.h ++++ b/gcc/df.h +@@ -984,8 +984,8 @@ extern bool df_reg_defined (rtx_insn *, rtx); + extern df_ref df_find_use (rtx_insn *, rtx); + extern bool df_reg_used (rtx_insn *, rtx); + extern void df_worklist_dataflow (struct dataflow *,bitmap, int *, int); +-extern void df_print_regset (FILE *file, bitmap r); +-extern void df_print_word_regset (FILE *file, bitmap r); ++extern void df_print_regset (FILE *file, const_bitmap r); ++extern void df_print_word_regset (FILE *file, const_bitmap r); + extern void df_dump (FILE *); + extern void df_dump_region (FILE *); + extern void df_dump_start (FILE *); +diff --git a/gcc/diagnostic-color.c b/gcc/diagnostic-color.c +index 69e759ff6..abc919f63 100644 +--- a/gcc/diagnostic-color.c ++++ b/gcc/diagnostic-color.c +@@ -19,6 +19,7 @@ + #include "config.h" + #include "system.h" + #include "diagnostic-color.h" ++#include "diagnostic-url.h" + + #ifdef __MINGW32__ + # include +@@ -236,3 +237,22 @@ colorize_init (diagnostic_color_rule_t rule) + gcc_unreachable (); + } + } ++ ++/* Determine if URLs should be enabled, based on RULE. ++ This reuses the logic for colorization. */ ++ ++bool ++diagnostic_urls_enabled_p (diagnostic_url_rule_t rule) ++{ ++ switch (rule) ++ { ++ case DIAGNOSTICS_URL_NO: ++ return false; ++ case DIAGNOSTICS_URL_YES: ++ return true; ++ case DIAGNOSTICS_URL_AUTO: ++ return should_colorize (); ++ default: ++ gcc_unreachable (); ++ } ++} +diff --git a/gcc/diagnostic-url.h b/gcc/diagnostic-url.h +new file mode 100644 +index 000000000..ce0de459f +--- /dev/null ++++ b/gcc/diagnostic-url.h +@@ -0,0 +1,36 @@ ++/* Copyright (C) 2019 Free Software Foundation, Inc. ++ Contributed by David Malcolm . ++ ++This file is part of GCC. ++ ++GCC is free software; you can redistribute it and/or modify it under ++the terms of the GNU General Public License as published by the Free ++Software Foundation; either version 3, or (at your option) any later ++version. ++ ++GCC is distributed in the hope that it will be useful, but WITHOUT ANY ++WARRANTY; without even the implied warranty of MERCHANTABILITY or ++FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received a copy of the GNU General Public License ++along with GCC; see the file COPYING3. If not see ++. */ ++ ++#ifndef GCC_DIAGNOSTIC_URL_H ++#define GCC_DIAGNOSTIC_URL_H ++ ++/* Whether to add URLs to diagnostics: ++ - DIAGNOSTICS_URL_NO: never ++ - DIAGNOSTICS_URL_YES: always ++ - DIAGNOSTICS_URL_AUTO: depending on the output stream. */ ++typedef enum ++{ ++ DIAGNOSTICS_URL_NO = 0, ++ DIAGNOSTICS_URL_YES = 1, ++ DIAGNOSTICS_URL_AUTO = 2 ++} diagnostic_url_rule_t; ++ ++extern bool diagnostic_urls_enabled_p (diagnostic_url_rule_t); ++ ++#endif /* ! 
GCC_DIAGNOSTIC_URL_H */ +diff --git a/gcc/diagnostic.c b/gcc/diagnostic.c +index be6b65722..a9acda7cc 100644 +--- a/gcc/diagnostic.c ++++ b/gcc/diagnostic.c +@@ -31,6 +31,7 @@ along with GCC; see the file COPYING3. If not see + #include "backtrace.h" + #include "diagnostic.h" + #include "diagnostic-color.h" ++#include "diagnostic-url.h" + #include "edit-context.h" + #include "selftest.h" + #include "selftest-diagnostic.h" +@@ -238,6 +239,18 @@ diagnostic_color_init (diagnostic_context *context, int value /*= -1 */) + = colorize_init ((diagnostic_color_rule_t) value); + } + ++/* Initialize URL support within CONTEXT based on VALUE, handling "auto". */ ++ ++void ++diagnostic_urls_init (diagnostic_context *context, int value /*= -1 */) ++{ ++ if (value < 0) ++ value = DIAGNOSTICS_COLOR_DEFAULT; ++ ++ context->printer->show_urls ++ = diagnostic_urls_enabled_p ((diagnostic_url_rule_t) value); ++} ++ + /* Do any cleaning up required after the last diagnostic is emitted. */ + + void +diff --git a/gcc/diagnostic.h b/gcc/diagnostic.h +index 46c3b50a5..5daf4f288 100644 +--- a/gcc/diagnostic.h ++++ b/gcc/diagnostic.h +@@ -328,6 +328,7 @@ diagnostic_override_option_index (diagnostic_info *info, int optidx) + /* Diagnostic related functions. */ + extern void diagnostic_initialize (diagnostic_context *, int); + extern void diagnostic_color_init (diagnostic_context *, int value = -1); ++extern void diagnostic_urls_init (diagnostic_context *, int value = -1); + extern void diagnostic_finish (diagnostic_context *); + extern void diagnostic_report_current_module (diagnostic_context *, location_t); + extern void diagnostic_show_locus (diagnostic_context *, +diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi +index 9c87792ff..e366ab923 100644 +--- a/gcc/doc/invoke.texi ++++ b/gcc/doc/invoke.texi +@@ -271,6 +271,7 @@ Objective-C and Objective-C++ Dialects}. + @gccoptlist{-fmessage-length=@var{n} @gol + -fdiagnostics-show-location=@r{[}once@r{|}every-line@r{]} @gol + -fdiagnostics-color=@r{[}auto@r{|}never@r{|}always@r{]} @gol ++-fdiagnostics-urls=@r{[}auto@r{|}never@r{|}always@r{]} @gol + -fdiagnostics-format=@r{[}text@r{|}json@r{]} @gol + -fno-diagnostics-show-option -fno-diagnostics-show-caret @gol + -fno-diagnostics-show-labels -fno-diagnostics-show-line-numbers @gol +@@ -403,8 +404,7 @@ Objective-C and Objective-C++ Dialects}. + -fallow-store-data-races @gol + -fassociative-math -fauto-profile -fauto-profile[=@var{path}] @gol + -fauto-inc-dec -fbranch-probabilities @gol +--fbranch-target-load-optimize -fbranch-target-load-optimize2 @gol +--fbtr-bb-exclusive -fcaller-saves @gol ++-fcaller-saves @gol + -fcombine-stack-adjustments -fconserve-stack @gol + -fcompare-elim -fcprop-registers -fcrossjumping @gol + -fcse-follow-jumps -fcse-skip-blocks -fcx-fortran-rules @gol +@@ -636,11 +636,13 @@ Objective-C and Objective-C++ Dialects}. 
+ -mlow-precision-recip-sqrt -mlow-precision-sqrt -mlow-precision-div @gol + -mpc-relative-literal-loads @gol + -msign-return-address=@var{scope} @gol +--mbranch-protection=@var{none}|@var{standard}|@var{pac-ret}[+@var{leaf}]|@var{bti} @gol ++-mbranch-protection=@var{none}|@var{standard}|@var{pac-ret}[+@var{leaf} +++@var{b-key}]|@var{bti} @gol + -march=@var{name} -mcpu=@var{name} -mtune=@var{name} @gol + -moverride=@var{string} -mverbose-cost-dump @gol + -mstack-protector-guard=@var{guard} -mstack-protector-guard-reg=@var{sysreg} @gol +--mstack-protector-guard-offset=@var{offset} -mtrack-speculation } ++-mstack-protector-guard-offset=@var{offset} -mtrack-speculation @gol ++-moutline-atomics } + + @emph{Adapteva Epiphany Options} + @gccoptlist{-mhalf-reg-file -mprefer-short-insn-regs @gol +@@ -3885,6 +3887,18 @@ SGR substring for highlighting mismatching types within template + arguments in the C++ frontend. + @end table + ++@item -fdiagnostics-urls[=@var{WHEN}] ++@opindex fdiagnostics-urls ++@cindex urls ++Use escape sequences to embed URLs in diagnostics. For example, when ++@option{-fdiagnostics-show-option} emits text showing the command-line ++option controlling a diagnostic, embed a URL for documentation of that ++option. ++ ++@var{WHEN} is @samp{never}, @samp{always}, or @samp{auto}. ++The default is @samp{auto}, which means to use URL escape sequences only ++when the standard error is a terminal. ++ + @item -fno-diagnostics-show-option + @opindex fno-diagnostics-show-option + @opindex fdiagnostics-show-option +@@ -8295,6 +8309,7 @@ also turns on the following optimization flags: + -ffinite-loops @gol + -fgcse -fgcse-lm @gol + -fhoist-adjacent-loads @gol ++-finline-functions @gol + -finline-small-functions @gol + -findirect-inlining @gol + -fipa-bit-cp -fipa-cp -fipa-icf @gol +@@ -8328,7 +8343,6 @@ by @option{-O2} and also turns on the following optimization flags: + + @c Please keep the following list alphabetized! + @gccoptlist{-fgcse-after-reload @gol +--finline-functions @gol + -fipa-cp-clone + -floop-interchange @gol + -floop-unroll-and-jam @gol +@@ -8386,10 +8400,10 @@ no effect. Otherwise @option{-Og} enables all @option{-O1} + optimization flags except for those that may interfere with debugging: + + @gccoptlist{-fbranch-count-reg -fdelayed-branch @gol +--fif-conversion -fif-conversion2 @gol ++-fdse -fif-conversion -fif-conversion2 @gol + -finline-functions-called-once @gol + -fmove-loop-invariants -fssa-phiopt @gol +--ftree-bit-ccp -ftree-pta -ftree-sra} ++-ftree-bit-ccp -ftree-dse -ftree-pta -ftree-sra} + + @end table + +@@ -8508,7 +8522,7 @@ If all calls to a given function are integrated, and the function is + declared @code{static}, then the function is normally not output as + assembler code in its own right. + +-Enabled at levels @option{-O3}, @option{-Os}. Also enabled ++Enabled at levels @option{-O2}, @option{-O3}, @option{-Os}. Also enabled + by @option{-fprofile-use} and @option{-fauto-profile}. + + @item -finline-functions-called-once +@@ -10986,24 +11000,6 @@ locations inside a translation unit since the locations are unknown until + link time. An example of such an optimization is relaxing calls to short call + instructions. + +-@item -fbranch-target-load-optimize +-@opindex fbranch-target-load-optimize +-Perform branch target register load optimization before prologue / epilogue +-threading. 
+-The use of target registers can typically be exposed only during reload, +-thus hoisting loads out of loops and doing inter-block scheduling needs +-a separate optimization pass. +- +-@item -fbranch-target-load-optimize2 +-@opindex fbranch-target-load-optimize2 +-Perform branch target register load optimization after prologue / epilogue +-threading. +- +-@item -fbtr-bb-exclusive +-@opindex fbtr-bb-exclusive +-When performing branch target register load optimization, don't reuse +-branch target registers within any basic block. +- + @item -fstdarg-opt + @opindex fstdarg-opt + Optimize the prologue of variadic argument functions with respect to usage of +@@ -11154,19 +11150,30 @@ when modulo scheduling a loop. Larger values can exponentially increase + compilation time. + + @item max-inline-insns-single +-Several parameters control the tree inliner used in GCC@. +-This number sets the maximum number of instructions (counted in GCC's +-internal representation) in a single function that the tree inliner +-considers for inlining. This only affects functions declared +-inline and methods implemented in a class declaration (C++). ++@item max-inline-insns-single-O2 ++Several parameters control the tree inliner used in GCC@. This number sets the ++maximum number of instructions (counted in GCC's internal representation) in a ++single function that the tree inliner considers for inlining. This only ++affects functions declared inline and methods implemented in a class ++declaration (C++). ++ ++For functions compiled with optimization levels ++@option{-O3} and @option{-Ofast} parameter @option{max-inline-insns-single} is ++applied. In other cases @option{max-inline-insns-single-O2} is applied. ++ + + @item max-inline-insns-auto ++@item max-inline-insns-auto-O2 + When you use @option{-finline-functions} (included in @option{-O3}), + a lot of functions that would otherwise not be considered for inlining + by the compiler are investigated. To those functions, a different + (more restrictive) limit compared to functions declared inline can + be applied. + ++For functions compiled with optimization levels ++@option{-O3} and @option{-Ofast} parameter @option{max-inline-insns-auto} is ++applied. In other cases @option{max-inline-insns-auto-O2} is applied. ++ + @item max-inline-insns-small + This is bound applied to calls which are considered relevant with + @option{-finline-small-functions}. +@@ -11189,11 +11196,16 @@ Same as @option{--param uninlined-function-insns} and + @option{--param uninlined-function-time} but applied to function thunks + + @item inline-min-speedup ++@item inline-min-speedup-O2 + When estimated performance improvement of caller + callee runtime exceeds this + threshold (in percent), the function can be inlined regardless of the limit on + @option{--param max-inline-insns-single} and @option{--param + max-inline-insns-auto}. + ++For functions compiled with optimization levels ++@option{-O3} and @option{-Ofast} parameter @option{inline-min-speedup} is ++applied. In other cases @option{inline-min-speedup-O2} is applied. ++ + @item large-function-insns + The limit specifying really large functions. For functions larger than this + limit after inlining, inlining is constrained by +@@ -11271,9 +11283,14 @@ via a given call expression. This parameter limits inlining only to call + expressions whose probability exceeds the given threshold (in percents). + + @item early-inlining-insns ++@item early-inlining-insns-O2 + Specify growth that the early inliner can make. 
In effect it increases + the amount of inlining for code having a large abstraction penalty. + ++For functions compiled with optimization levels ++@option{-O3} and @option{-Ofast} parameter @option{early-inlining-insns} is ++applied. In other cases @option{early-inlining-insns-O2} is applied. ++ + @item max-early-inliner-iterations + Limit of iterations of the early inliner. This basically bounds + the number of nested indirect calls the early inliner can resolve. +@@ -15816,31 +15833,38 @@ be used by the compiler when expanding calls to + @code{__builtin_speculation_safe_copy} to permit a more efficient code + sequence to be generated. + ++@item -moutline-atomics ++@itemx -mno-outline-atomics ++Enable or disable calls to out-of-line helpers to implement atomic operations. ++These helpers will, at runtime, determine if the LSE instructions from ++ARMv8.1-A can be used; if not, they will use the load/store-exclusive ++instructions that are present in the base ARMv8.0 ISA. ++ ++This option is only applicable when compiling for the base ARMv8.0 ++instruction set. If using a later revision, e.g. @option{-march=armv8.1-a} ++or @option{-march=armv8-a+lse}, the ARMv8.1-Atomics instructions will be ++used directly. The same applies when using @option{-mcpu=} when the ++selected cpu supports the @samp{lse} feature. ++ + @item -march=@var{name} + @opindex march + Specify the name of the target architecture and, optionally, one or + more feature modifiers. This option has the form + @option{-march=@var{arch}@r{@{}+@r{[}no@r{]}@var{feature}@r{@}*}}. + +-The permissible values for @var{arch} are @samp{armv8-a}, +-@samp{armv8.1-a}, @samp{armv8.2-a}, @samp{armv8.3-a}, @samp{armv8.4-a}, +-@samp{armv8.5-a} or @var{native}. +- +-The value @samp{armv8.5-a} implies @samp{armv8.4-a} and enables compiler +-support for the ARMv8.5-A architecture extensions. +- +-The value @samp{armv8.4-a} implies @samp{armv8.3-a} and enables compiler +-support for the ARMv8.4-A architecture extensions. +- +-The value @samp{armv8.3-a} implies @samp{armv8.2-a} and enables compiler +-support for the ARMv8.3-A architecture extensions. +- +-The value @samp{armv8.2-a} implies @samp{armv8.1-a} and enables compiler +-support for the ARMv8.2-A architecture extensions. +- +-The value @samp{armv8.1-a} implies @samp{armv8-a} and enables compiler +-support for the ARMv8.1-A architecture extension. In particular, it +-enables the @samp{+crc}, @samp{+lse}, and @samp{+rdma} features. ++The table below summarizes the permissible values for @var{arch} ++and the features that they enable by default: ++ ++@multitable @columnfractions 0.20 0.20 0.60 ++@headitem @var{arch} value @tab Architecture @tab Includes by default ++@item @samp{armv8-a} @tab Armv8-A @tab @samp{+fp}, @samp{+simd} ++@item @samp{armv8.1-a} @tab Armv8.1-A @tab @samp{armv8-a}, @samp{+crc}, @samp{+lse}, @samp{+rdma} ++@item @samp{armv8.2-a} @tab Armv8.2-A @tab @samp{armv8.1-a} ++@item @samp{armv8.3-a} @tab Armv8.3-A @tab @samp{armv8.2-a} ++@item @samp{armv8.4-a} @tab Armv8.4-A @tab @samp{armv8.3-a}, @samp{+fp16fml}, @samp{+dotprod} ++@item @samp{armv8.5-a} @tab Armv8.5-A @tab @samp{armv8.4-a}, @samp{+sb}, @samp{+ssbs}, @samp{+predres} ++@item @samp{armv8.6-a} @tab Armv8.6-A @tab @samp{armv8.5-a}, @samp{+bf16}, @samp{+i8mm} ++@end multitable + + The value @samp{native} is available on native AArch64 GNU/Linux and + causes the compiler to pick the architecture of the host system. 
This +@@ -15864,7 +15888,9 @@ Specify the name of the target processor for which GCC should tune the + performance of the code. Permissible values for this option are: + @samp{generic}, @samp{cortex-a35}, @samp{cortex-a53}, @samp{cortex-a55}, + @samp{cortex-a57}, @samp{cortex-a72}, @samp{cortex-a73}, @samp{cortex-a75}, +-@samp{cortex-a76}, @samp{ares}, @samp{exynos-m1}, @samp{emag}, @samp{falkor}, ++@samp{cortex-a76}, @samp{cortex-a76ae}, @samp{cortex-a77}, ++@samp{cortex-a65}, @samp{cortex-a65ae}, @samp{cortex-a34}, ++@samp{ares}, @samp{exynos-m1}, @samp{emag}, @samp{falkor}, + @samp{neoverse-e1},@samp{neoverse-n1},@samp{qdf24xx}, @samp{saphira}, + @samp{phecda}, @samp{xgene1}, @samp{vulcan}, @samp{octeontx}, + @samp{octeontx81}, @samp{octeontx83}, @samp{thunderx}, @samp{thunderxt88}, +@@ -15941,7 +15967,7 @@ functions, and @samp{all}, which enables pointer signing for all functions. The + default value is @samp{none}. This option has been deprecated by + -mbranch-protection. + +-@item -mbranch-protection=@var{none}|@var{standard}|@var{pac-ret}[+@var{leaf}]|@var{bti} ++@item -mbranch-protection=@var{none}|@var{standard}|@var{pac-ret}[+@var{leaf}+@var{b-key}]|@var{bti} + @opindex mbranch-protection + Select the branch protection features to use. + @samp{none} is the default and turns off all types of branch protection. +@@ -15952,7 +15978,8 @@ level. + level: signing functions that save the return address to memory (non-leaf + functions will practically always do this) using the a-key. The optional + argument @samp{leaf} can be used to extend the signing to include leaf +-functions. ++functions. The optional argument @samp{b-key} can be used to sign the functions ++with the B-key instead of the A-key. + @samp{bti} turns on branch target identification mechanism. + + @item -msve-vector-bits=@var{bits} +@@ -16054,6 +16081,37 @@ Enable the Armv8-a Execution and Data Prediction Restriction instructions. + This option is only to enable the extension at the assembler level and does + not affect code generation. This option is enabled by default for + @option{-march=armv8.5-a}. ++@item sve2 ++Enable the Armv8-a Scalable Vector Extension 2. This also enables SVE ++instructions. ++@item sve2-bitperm ++Enable SVE2 bitperm instructions. This also enables SVE2 instructions. ++@item sve2-sm4 ++Enable SVE2 sm4 instructions. This also enables SVE2 instructions. ++@item sve2-aes ++Enable SVE2 aes instructions. This also enables SVE2 instructions. ++@item sve2-sha3 ++Enable SVE2 sha3 instructions. This also enables SVE2 instructions. ++@item tme ++Enable the Transactional Memory Extension. ++@item i8mm ++Enable 8-bit Integer Matrix Multiply instructions. This also enables ++Advanced SIMD and floating-point instructions. This option is enabled by ++default for @option{-march=armv8.6-a}. Use of this option with architectures ++prior to Armv8.2-A is not supported. ++@item f32mm ++Enable 32-bit Floating point Matrix Multiply instructions. This also enables ++SVE instructions. Use of this option with architectures prior to Armv8.2-A is ++not supported. ++@item f64mm ++Enable 64-bit Floating point Matrix Multiply instructions. This also enables ++SVE instructions. Use of this option with architectures prior to Armv8.2-A is ++not supported. ++@item bf16 ++Enable brain half-precision floating-point instructions. This also enables ++Advanced SIMD and floating-point instructions. This option is enabled by ++default for @option{-march=armv8.6-a}. 
Use of this option with architectures ++prior to Armv8.2-A is not supported. + + @end table + +@@ -28567,8 +28625,9 @@ By default GCC inlines string operations only when the destination is + known to be aligned to least a 4-byte boundary. + This enables more inlining and increases code + size, but may improve performance of code that depends on fast +-@code{memcpy}, @code{strlen}, +-and @code{memset} for short lengths. ++@code{memcpy} and @code{memset} for short lengths. ++The option enables inline expansion of @code{strlen} for all ++pointer alignments. + + @item -minline-stringops-dynamically + @opindex minline-stringops-dynamically +diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi +index 50e13124b..75482d7a2 100644 +--- a/gcc/doc/md.texi ++++ b/gcc/doc/md.texi +@@ -1748,6 +1748,12 @@ The stack pointer register (@code{SP}) + @item w + Floating point register, Advanced SIMD vector register or SVE vector register + ++@item x ++Like @code{w}, but restricted to registers 0 to 15 inclusive. ++ ++@item y ++Like @code{w}, but restricted to registers 0 to 7 inclusive. ++ + @item Upl + One of the low eight SVE predicate registers (@code{P0} to @code{P7}) + +@@ -5470,6 +5476,11 @@ mode @var{m} and the scalars have the mode appropriate for one + element of @var{m}. The operation is strictly in-order: there is + no reassociation. + ++@cindex @code{mask_fold_left_plus_@var{m}} instruction pattern ++@item @code{mask_fold_left_plus_@var{m}} ++Like @samp{fold_left_plus_@var{m}}, but takes an additional mask operand ++(operand 3) that specifies which elements of the source vector should be added. ++ + @cindex @code{sdot_prod@var{m}} instruction pattern + @item @samp{sdot_prod@var{m}} + @cindex @code{udot_prod@var{m}} instruction pattern +@@ -5499,6 +5510,44 @@ operand 1. Add operand 1 to operand 2 and place the widened result in + operand 0. (This is used express accumulation of elements into an accumulator + of a wider mode.) + ++@cindex @code{smulhs@var{m3}} instruction pattern ++@item @samp{smulhs@var{m3}} ++@cindex @code{umulhs@var{m3}} instruction pattern ++@itemx @samp{umulhs@var{m3}} ++Signed/unsigned multiply high with scale. This is equivalent to the C code: ++@smallexample ++narrow op0, op1, op2; ++@dots{} ++op0 = (narrow) (((wide) op1 * (wide) op2) >> (N / 2 - 1)); ++@end smallexample ++where the sign of @samp{narrow} determines whether this is a signed ++or unsigned operation, and @var{N} is the size of @samp{wide} in bits. ++ ++@cindex @code{smulhrs@var{m3}} instruction pattern ++@item @samp{smulhrs@var{m3}} ++@cindex @code{umulhrs@var{m3}} instruction pattern ++@itemx @samp{umulhrs@var{m3}} ++Signed/unsigned multiply high with round and scale. This is ++equivalent to the C code: ++@smallexample ++narrow op0, op1, op2; ++@dots{} ++op0 = (narrow) (((((wide) op1 * (wide) op2) >> (N / 2 - 2)) + 1) >> 1); ++@end smallexample ++where the sign of @samp{narrow} determines whether this is a signed ++or unsigned operation, and @var{N} is the size of @samp{wide} in bits. ++ ++@cindex @code{sdiv_pow2@var{m3}} instruction pattern ++@item @samp{sdiv_pow2@var{m3}} ++@cindex @code{sdiv_pow2@var{m3}} instruction pattern ++@itemx @samp{sdiv_pow2@var{m3}} ++Signed division by power-of-2 immediate. 
Equivalent to: ++@smallexample ++signed op0, op1; ++@dots{} ++op0 = op1 / (1 << imm); ++@end smallexample ++ + @cindex @code{vec_shl_insert_@var{m}} instruction pattern + @item @samp{vec_shl_insert_@var{m}} + Shift the elements in vector input operand 1 left one element (i.e.@: +@@ -6240,13 +6289,13 @@ This pattern is not allowed to @code{FAIL}. + @item @samp{one_cmpl@var{m}2} + Store the bitwise-complement of operand 1 into operand 0. + +-@cindex @code{movmem@var{m}} instruction pattern +-@item @samp{movmem@var{m}} +-Block move instruction. The destination and source blocks of memory ++@cindex @code{cpymem@var{m}} instruction pattern ++@item @samp{cpymem@var{m}} ++Block copy instruction. The destination and source blocks of memory + are the first two operands, and both are @code{mem:BLK}s with an + address in mode @code{Pmode}. + +-The number of bytes to move is the third operand, in mode @var{m}. ++The number of bytes to copy is the third operand, in mode @var{m}. + Usually, you specify @code{Pmode} for @var{m}. However, if you can + generate better code knowing the range of valid lengths is smaller than + those representable in a full Pmode pointer, you should provide +@@ -6266,14 +6315,16 @@ in a way that the blocks are not required to be aligned according to it in + all cases. This expected alignment is also in bytes, just like operand 4. + Expected size, when unknown, is set to @code{(const_int -1)}. + +-Descriptions of multiple @code{movmem@var{m}} patterns can only be ++Descriptions of multiple @code{cpymem@var{m}} patterns can only be + beneficial if the patterns for smaller modes have fewer restrictions + on their first, second and fourth operands. Note that the mode @var{m} +-in @code{movmem@var{m}} does not impose any restriction on the mode of +-individually moved data units in the block. ++in @code{cpymem@var{m}} does not impose any restriction on the mode of ++individually copied data units in the block. + +-These patterns need not give special consideration to the possibility +-that the source and destination strings might overlap. ++The @code{cpymem@var{m}} patterns need not give special consideration ++to the possibility that the source and destination strings might ++overlap. These patterns are used to do inline expansion of ++@code{__builtin_memcpy}. + + @cindex @code{movstr} instruction pattern + @item @samp{movstr} +@@ -6294,7 +6345,7 @@ given as a @code{mem:BLK} whose address is in mode @code{Pmode}. The + number of bytes to set is the second operand, in mode @var{m}. The value to + initialize the memory with is the third operand. Targets that only support the + clearing of memory should reject any value that is not the constant 0. See +-@samp{movmem@var{m}} for a discussion of the choice of mode. ++@samp{cpymem@var{m}} for a discussion of the choice of mode. + + The fourth operand is the known alignment of the destination, in the form + of a @code{const_int} rtx. Thus, if the compiler knows that the +@@ -6312,13 +6363,13 @@ Operand 9 is the probable maximal size (i.e.@: we cannot rely on it for + correctness, but it can be used for choosing proper code sequence for a + given size). + +-The use for multiple @code{setmem@var{m}} is as for @code{movmem@var{m}}. ++The use for multiple @code{setmem@var{m}} is as for @code{cpymem@var{m}}. + + @cindex @code{cmpstrn@var{m}} instruction pattern + @item @samp{cmpstrn@var{m}} + String compare instruction, with five operands. Operand 0 is the output; + it has mode @var{m}. 
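To make the @code{smulhs}, @code{smulhrs} and @code{sdiv_pow2} semantics above concrete, here is a scalar C model of a single 16-bit lane (illustrative only, not part of the patch; the function names are invented). With a 32-bit @samp{wide} type, @var{N} is 32, so the plain high multiply shifts by 15 while the rounding form shifts by 14 and then rounds:

    #include <stdint.h>

    /* One lane of smulhs for 16-bit elements: shift the widened product
       right by N/2 - 1 = 15 (assumes arithmetic right shift of negative
       values, as GCC provides).  */
    static int16_t
    smulhs16 (int16_t a, int16_t b)
    {
      return (int16_t) (((int32_t) a * (int32_t) b) >> 15);
    }

    /* One lane of smulhrs: shift right by N/2 - 2 = 14, add 1, then shift
       right once more, giving round-to-nearest behaviour.  */
    static int16_t
    smulhrs16 (int16_t a, int16_t b)
    {
      int32_t prod = (int32_t) a * (int32_t) b;
      return (int16_t) (((prod >> 14) + 1) >> 1);
    }

    /* One lane of sdiv_pow2 with an immediate of 3: ordinary C signed
       division by 8, which rounds towards zero.  */
    static int32_t
    sdiv_by_8 (int32_t a)
    {
      return a / (1 << 3);
    }
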
The remaining four operands are like the operands +-of @samp{movmem@var{m}}. The two memory blocks specified are compared ++of @samp{cpymem@var{m}}. The two memory blocks specified are compared + byte by byte in lexicographic order starting at the beginning of each + string. The instruction is not allowed to prefetch more than one byte + at a time since either string may end in the first byte and reading past +@@ -8537,6 +8588,119 @@ functionality as two separate @code{define_insn} and @code{define_split} + patterns. It exists for compactness, and as a maintenance tool to prevent + having to ensure the two patterns' templates match. + ++@findex define_insn_and_rewrite ++It is sometimes useful to have a @code{define_insn_and_split} ++that replaces specific operands of an instruction but leaves the ++rest of the instruction pattern unchanged. You can do this directly ++with a @code{define_insn_and_split}, but it requires a ++@var{new-insn-pattern-1} that repeats most of the original @var{insn-pattern}. ++There is also the complication that an implicit @code{parallel} in ++@var{insn-pattern} must become an explicit @code{parallel} in ++@var{new-insn-pattern-1}, which is easy to overlook. ++A simpler alternative is to use @code{define_insn_and_rewrite}, which ++is a form of @code{define_insn_and_split} that automatically generates ++@var{new-insn-pattern-1} by replacing each @code{match_operand} ++in @var{insn-pattern} with a corresponding @code{match_dup}, and each ++@code{match_operator} in the pattern with a corresponding @code{match_op_dup}. ++The arguments are otherwise identical to @code{define_insn_and_split}: ++ ++@smallexample ++(define_insn_and_rewrite ++ [@var{insn-pattern}] ++ "@var{condition}" ++ "@var{output-template}" ++ "@var{split-condition}" ++ "@var{preparation-statements}" ++ [@var{insn-attributes}]) ++@end smallexample ++ ++The @code{match_dup}s and @code{match_op_dup}s in the new ++instruction pattern use any new operand values that the ++@var{preparation-statements} store in the @code{operands} array, ++as for a normal @code{define_insn_and_split}. @var{preparation-statements} ++can also emit additional instructions before the new instruction. ++They can even emit an entirely different sequence of instructions and ++use @code{DONE} to avoid emitting a new form of the original ++instruction. ++ ++The split in a @code{define_insn_and_rewrite} is only intended ++to apply to existing instructions that match @var{insn-pattern}. ++@var{split-condition} must therefore start with @code{&&}, ++so that the split condition applies on top of @var{condition}. ++ ++Here is an example from the AArch64 SVE port, in which operand 1 is ++known to be equivalent to an all-true constant and isn't used by the ++output template: ++ ++@smallexample ++(define_insn_and_rewrite "*while_ult_cc" ++ [(set (reg:CC CC_REGNUM) ++ (compare:CC ++ (unspec:SI [(match_operand:PRED_ALL 1) ++ (unspec:PRED_ALL ++ [(match_operand:GPI 2 "aarch64_reg_or_zero" "rZ") ++ (match_operand:GPI 3 "aarch64_reg_or_zero" "rZ")] ++ UNSPEC_WHILE_LO)] ++ UNSPEC_PTEST_PTRUE) ++ (const_int 0))) ++ (set (match_operand:PRED_ALL 0 "register_operand" "=Upa") ++ (unspec:PRED_ALL [(match_dup 2) ++ (match_dup 3)] ++ UNSPEC_WHILE_LO))] ++ "TARGET_SVE" ++ "whilelo\t%0., %2, %3" ++ ;; Force the compiler to drop the unused predicate operand, so that we ++ ;; don't have an unnecessary PTRUE. 
++ "&& !CONSTANT_P (operands[1])" ++ @{ ++ operands[1] = CONSTM1_RTX (mode); ++ @} ++) ++@end smallexample ++ ++The splitter in this case simply replaces operand 1 with the constant ++value that it is known to have. The equivalent @code{define_insn_and_split} ++would be: ++ ++@smallexample ++(define_insn_and_split "*while_ult_cc" ++ [(set (reg:CC CC_REGNUM) ++ (compare:CC ++ (unspec:SI [(match_operand:PRED_ALL 1) ++ (unspec:PRED_ALL ++ [(match_operand:GPI 2 "aarch64_reg_or_zero" "rZ") ++ (match_operand:GPI 3 "aarch64_reg_or_zero" "rZ")] ++ UNSPEC_WHILE_LO)] ++ UNSPEC_PTEST_PTRUE) ++ (const_int 0))) ++ (set (match_operand:PRED_ALL 0 "register_operand" "=Upa") ++ (unspec:PRED_ALL [(match_dup 2) ++ (match_dup 3)] ++ UNSPEC_WHILE_LO))] ++ "TARGET_SVE" ++ "whilelo\t%0., %2, %3" ++ ;; Force the compiler to drop the unused predicate operand, so that we ++ ;; don't have an unnecessary PTRUE. ++ "&& !CONSTANT_P (operands[1])" ++ [(parallel ++ [(set (reg:CC CC_REGNUM) ++ (compare:CC ++ (unspec:SI [(match_dup 1) ++ (unspec:PRED_ALL [(match_dup 2) ++ (match_dup 3)] ++ UNSPEC_WHILE_LO)] ++ UNSPEC_PTEST_PTRUE) ++ (const_int 0))) ++ (set (match_dup 0) ++ (unspec:PRED_ALL [(match_dup 2) ++ (match_dup 3)] ++ UNSPEC_WHILE_LO))])] ++ @{ ++ operands[1] = CONSTM1_RTX (mode); ++ @} ++) ++@end smallexample ++ + @end ifset + @ifset INTERNALS + @node Including Patterns +@@ -10979,6 +11143,27 @@ Other attributes are defined using: + (define_code_attr @var{name} [(@var{code1} "@var{value1}") @dots{} (@var{coden} "@var{valuen}")]) + @end smallexample + ++Instruction patterns can use code attributes as rtx codes, which can be ++useful if two sets of codes act in tandem. For example, the following ++@code{define_insn} defines two patterns, one calculating a signed absolute ++difference and another calculating an unsigned absolute difference: ++ ++@smallexample ++(define_code_iterator any_max [smax umax]) ++(define_code_attr paired_min [(smax "smin") (umax "umin")]) ++(define_insn @dots{} ++ [(set (match_operand:SI 0 @dots{}) ++ (minus:SI (any_max:SI (match_operand:SI 1 @dots{}) ++ (match_operand:SI 2 @dots{})) ++ (:SI (match_dup 1) (match_dup 2))))] ++ @dots{}) ++@end smallexample ++ ++The signed version of the instruction uses @code{smax} and @code{smin} ++while the unsigned version uses @code{umax} and @code{umin}. There ++are no versions that pair @code{smax} with @code{umin} or @code{umax} ++with @code{smin}. ++ + Here's an example of code iterators in action, taken from the MIPS port: + + @smallexample +@@ -11249,4 +11434,13 @@ name and same types of iterator. For example: + would produce a single set of functions that handles both + @code{INTEGER_MODES} and @code{FLOAT_MODES}. + ++It is also possible for these @samp{@@} patterns to have different ++numbers of operands from each other. For example, patterns with ++a binary rtl code might take three operands (one output and two inputs) ++while patterns with a ternary rtl code might take four operands (one ++output and three inputs). This combination would produce separate ++@samp{maybe_gen_@var{name}} and @samp{gen_@var{name}} functions for ++each operand count, but it would still produce a single ++@samp{maybe_code_for_@var{name}} and a single @samp{code_for_@var{name}}. ++ + @end ifset +diff --git a/gcc/doc/rtl.texi b/gcc/doc/rtl.texi +index f5f2de756..3df798216 100644 +--- a/gcc/doc/rtl.texi ++++ b/gcc/doc/rtl.texi +@@ -3295,18 +3295,6 @@ There is one other known use for clobbering a pseudo register in a + clobbered by the insn. 
In this case, using the same pseudo register in + the clobber and elsewhere in the insn produces the expected results. + +-@findex clobber_high +-@item (clobber_high @var{x}) +-Represents the storing or possible storing of an unpredictable, +-undescribed value into the upper parts of @var{x}. The mode of the expression +-represents the lower parts of the register which will not be overwritten. +-@code{reg} must be a reg expression. +- +-One place this is used is when calling into functions where the registers are +-preserved, but only up to a given number of bits. For example when using +-Aarch64 SVE, calling a TLS descriptor will cause only the lower 128 bits of +-each of the vector registers to be preserved. +- + @findex use + @item (use @var{x}) + Represents the use of the value of @var{x}. It indicates that the +@@ -3341,7 +3329,7 @@ that the register is live. You should think twice before adding + instead. The @code{use} RTX is most commonly useful to describe that + a fixed register is implicitly used in an insn. It is also safe to use + in patterns where the compiler knows for other reasons that the result +-of the whole pattern is variable, such as @samp{movmem@var{m}} or ++of the whole pattern is variable, such as @samp{cpymem@var{m}} or + @samp{call} patterns. + + During the reload phase, an insn that has a @code{use} as pattern +@@ -3360,8 +3348,7 @@ Represents several side effects performed in parallel. The square + brackets stand for a vector; the operand of @code{parallel} is a + vector of expressions. @var{x0}, @var{x1} and so on are individual + side effect expressions---expressions of code @code{set}, @code{call}, +-@code{return}, @code{simple_return}, @code{clobber} @code{use} or +-@code{clobber_high}. ++@code{return}, @code{simple_return}, @code{clobber} or @code{use}. + + ``In parallel'' means that first all the values used in the individual + side-effects are computed, and second all the actual side-effects are +diff --git a/gcc/doc/sourcebuild.texi b/gcc/doc/sourcebuild.texi +index 546af7f72..62245c2b3 100644 +--- a/gcc/doc/sourcebuild.texi ++++ b/gcc/doc/sourcebuild.texi +@@ -1439,6 +1439,14 @@ vector alignment. + Target supports both signed and unsigned averaging operations on vectors + of bytes. + ++@item vect_mulhrs_hi ++Target supports both signed and unsigned multiply-high-with-round-and-scale ++operations on vectors of half-words. ++ ++@item vect_sdiv_pow2_si ++Target supports signed division by constant power-of-2 operations ++on vectors of 4-byte integers. ++ + @item vect_condition + Target supports vector conditional operations. + +@@ -1854,6 +1862,16 @@ ARM target supports extensions to generate the @code{VFMAL} and @code{VFMLS} + half-precision floating-point instructions available from ARMv8.2-A and + onwards. Some multilibs may be incompatible with these options. + ++@item arm_v8_2a_bf16_neon_ok ++ARM target supports options to generate instructions from ARMv8.2-A with ++the BFloat16 extension (bf16). Some multilibs may be incompatible with these ++options. ++ ++@item arm_v8_2a_i8mm_ok ++ARM target supports options to generate instructions from ARMv8.2-A with ++the 8-Bit Integer Matrix Multiply extension (i8mm). Some multilibs may be ++incompatible with these options. ++ + @item arm_prefer_ldrd_strd + ARM target prefers @code{LDRD} and @code{STRD} instructions over + @code{LDM} and @code{STM} instructions. +@@ -2663,6 +2681,91 @@ assembly output. 
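The new @code{vect_mulhrs_hi} and @code{vect_sdiv_pow2_si} effective-target keywords above are intended for use in DejaGnu selectors. A hypothetical test, sketched here for illustration only (the function name, options and dump string are not taken from the patch), could look like:

    /* { dg-do compile } */
    /* { dg-additional-options "-O3 -fdump-tree-vect-details" } */

    void
    halve_all (int *x, int n)
    {
      for (int i = 0; i < n; i++)
        x[i] /= 16;            /* signed division by a power of 2 */
    }

    /* Only expect the loop to be vectorized where the target supports
       vector signed division by a power of 2.  */
    /* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" { target vect_sdiv_pow2_si } } } */
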
+ @item scan-not-hidden @var{symbol} [@{ target/xfail @var{selector} @}] + Passes if @var{symbol} is not defined as a hidden symbol in the test's + assembly output. ++ ++@item check-function-bodies @var{prefix} @var{terminator} [@var{option} [@{ target/xfail @var{selector} @}]] ++Looks through the source file for comments that give the expected assembly ++output for selected functions. Each line of expected output starts with the ++prefix string @var{prefix} and the expected output for a function as a whole ++is followed by a line that starts with the string @var{terminator}. ++Specifying an empty terminator is equivalent to specifying @samp{"*/"}. ++ ++If @var{option} is specified, the test only applies to command lines ++that contain @var{option}. This can be useful if a source file is compiled ++both with and without optimization, since it is rarely useful to check the ++assembly output for unoptimized code. ++ ++The first line of the expected output for a function @var{fn} has the form: ++ ++@smallexample ++@var{prefix} @var{fn}: [@{ target/xfail @var{selector} @}] ++@end smallexample ++ ++Subsequent lines of the expected output also start with @var{prefix}. ++In both cases, whitespace after @var{prefix} is not significant. ++ ++The test discards assembly directives such as @code{.cfi_startproc} ++and local label definitions such as @code{.LFB0} from the compiler's ++assembly output. It then matches the result against the expected ++output for a function as a single regular expression. This means that ++later lines can use backslashes to refer back to @samp{(@dots{})} ++captures on earlier lines. For example: ++ ++@smallexample ++/* @{ dg-final @{ check-function-bodies "**" "" "-DCHECK_ASM" @} @} */ ++@dots{} ++/* ++** add_w0_s8_m: ++** mov (z[0-9]+\.b), w0 ++** add z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++svint8_t add_w0_s8_m (@dots{}) @{ @dots{} @} ++@dots{} ++/* ++** add_b0_s8_m: ++** mov (z[0-9]+\.b), b0 ++** add z1\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++svint8_t add_b0_s8_m (@dots{}) @{ @dots{} @} ++@end smallexample ++ ++checks whether the implementations of @code{add_w0_s8_m} and ++@code{add_b0_s8_m} match the regular expressions given. The test only ++runs when @samp{-DCHECK_ASM} is passed on the command line. ++ ++It is possible to create non-capturing multi-line regular expression ++groups of the form @samp{(@var{a}|@var{b}|@dots{})} by putting the ++@samp{(}, @samp{|} and @samp{)} on separate lines (each still using ++@var{prefix}). For example: ++ ++@smallexample ++/* ++** cmple_f16_tied: ++** ( ++** fcmge p0\.h, p0/z, z1\.h, z0\.h ++** | ++** fcmle p0\.h, p0/z, z0\.h, z1\.h ++** ) ++** ret ++*/ ++svbool_t cmple_f16_tied (@dots{}) @{ @dots{} @} ++@end smallexample ++ ++checks whether @code{cmple_f16_tied} is implemented by the ++@code{fcmge} instruction followed by @code{ret} or by the ++@code{fcmle} instruction followed by @code{ret}. The test is ++still a single regular rexpression. ++ ++A line containing just: ++ ++@smallexample ++@var{prefix} ... ++@end smallexample ++ ++stands for zero or more unmatched lines; the whitespace after ++@var{prefix} is again not significant. ++ + @end table + + @subsubsection Scan optimization dump files +diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi +index 73db70867..3f22bb1f6 100644 +--- a/gcc/doc/tm.texi ++++ b/gcc/doc/tm.texi +@@ -1878,6 +1878,9 @@ function calls. 
+ If a register has 0 in @code{CALL_USED_REGISTERS}, the compiler + automatically saves it on function entry and restores it on function + exit, if the register is used within the function. ++ ++Exactly one of @code{CALL_USED_REGISTERS} and @code{CALL_REALLY_USED_REGISTERS} ++must be defined. Modern ports should define @code{CALL_REALLY_USED_REGISTERS}. + @end defmac + + @defmac CALL_REALLY_USED_REGISTERS +@@ -1887,48 +1890,55 @@ exit, if the register is used within the function. + Like @code{CALL_USED_REGISTERS} except this macro doesn't require + that the entire set of @code{FIXED_REGISTERS} be included. + (@code{CALL_USED_REGISTERS} must be a superset of @code{FIXED_REGISTERS}). +-This macro is optional. If not specified, it defaults to the value +-of @code{CALL_USED_REGISTERS}. ++ ++Exactly one of @code{CALL_USED_REGISTERS} and @code{CALL_REALLY_USED_REGISTERS} ++must be defined. Modern ports should define @code{CALL_REALLY_USED_REGISTERS}. + @end defmac + + @cindex call-used register + @cindex call-clobbered register + @cindex call-saved register +-@deftypefn {Target Hook} bool TARGET_HARD_REGNO_CALL_PART_CLOBBERED (rtx_insn *@var{insn}, unsigned int @var{regno}, machine_mode @var{mode}) +-This hook should return true if @var{regno} is partly call-saved and +-partly call-clobbered, and if a value of mode @var{mode} would be partly +-clobbered by call instruction @var{insn}. If @var{insn} is NULL then it +-should return true if any call could partly clobber the register. +-For example, if the low 32 bits of @var{regno} are preserved across a call +-but higher bits are clobbered, this hook should return true for a 64-bit +-mode but false for a 32-bit mode. +- +-The default implementation returns false, which is correct +-for targets that don't have partly call-clobbered registers. ++@deftypefn {Target Hook} {const predefined_function_abi &} TARGET_FNTYPE_ABI (const_tree @var{type}) ++Return the ABI used by a function with type @var{type}; see the ++definition of @code{predefined_function_abi} for details of the ABI ++descriptor. Targets only need to define this hook if they support ++interoperability between several ABIs in the same translation unit. + @end deftypefn + +-@deftypefn {Target Hook} void TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS (rtx_insn *@var{insn}, HARD_REG_SET *@var{used_regs}) +-This hook removes registers from the set of call-clobbered registers +- in @var{used_regs} if, contrary to the default rules, something guarantees +- that @samp{insn} preserves those registers. For example, some targets +- support variant ABIs in which functions preserve more registers than +- normal functions would. Removing those extra registers from @var{used_regs} +- can lead to better register allocation. +- +- The default implementation does nothing, which is always safe. +- Defining the hook is purely an optimization. ++@deftypefn {Target Hook} {const predefined_function_abi &} TARGET_INSN_CALLEE_ABI (const rtx_insn *@var{insn}) ++This hook returns a description of the ABI used by the target of ++call instruction @var{insn}; see the definition of ++@code{predefined_function_abi} for details of the ABI descriptor. ++Only the global function @code{insn_callee_abi} should call this hook ++directly. ++ ++Targets only need to define this hook if they support ++interoperability between several ABIs in the same translation unit. 
+ @end deftypefn + +-@deftypefn {Target Hook} {rtx_insn *} TARGET_RETURN_CALL_WITH_MAX_CLOBBERS (rtx_insn *@var{call_1}, rtx_insn *@var{call_2}) +-This hook returns a pointer to the call that partially clobbers the +-most registers. If a platform supports multiple ABIs where the registers +-that are partially clobbered may vary, this function compares two +-calls and returns a pointer to the one that clobbers the most registers. +-If both calls clobber the same registers, @var{call_1} must be returned. ++@cindex call-used register ++@cindex call-clobbered register ++@cindex call-saved register ++@deftypefn {Target Hook} bool TARGET_HARD_REGNO_CALL_PART_CLOBBERED (unsigned int @var{abi_id}, unsigned int @var{regno}, machine_mode @var{mode}) ++ABIs usually specify that calls must preserve the full contents ++of a particular register, or that calls can alter any part of a ++particular register. This information is captured by the target macro ++@code{CALL_REALLY_USED_REGISTERS}. However, some ABIs specify that calls ++must preserve certain bits of a particular register but can alter others. ++This hook should return true if this applies to at least one of the ++registers in @samp{(reg:@var{mode} @var{regno})}, and if as a result the ++call would alter part of the @var{mode} value. For example, if a call ++preserves the low 32 bits of a 64-bit hard register @var{regno} but can ++clobber the upper 32 bits, this hook should return true for a 64-bit mode ++but false for a 32-bit mode. ++ ++The value of @var{abi_id} comes from the @code{predefined_function_abi} ++structure that describes the ABI of the call; see the definition of the ++structure for more details. If (as is usual) the target uses the same ABI ++for all functions in a translation unit, @var{abi_id} is always 0. + +-The registers clobbered in different ABIs must be a proper subset or +-superset of all other ABIs. @var{call_1} must always be a call insn, +-call_2 may be NULL or a call insn. ++The default implementation returns false, which is correct ++for targets that don't have partly call-clobbered registers. + @end deftypefn + + @deftypefn {Target Hook} {const char *} TARGET_GET_MULTILIB_ABI_NAME (void) +@@ -3961,18 +3971,10 @@ This section describes the macros which let you control how various + types of arguments are passed in registers or how they are arranged in + the stack. + +-@deftypefn {Target Hook} rtx TARGET_FUNCTION_ARG (cumulative_args_t @var{ca}, machine_mode @var{mode}, const_tree @var{type}, bool @var{named}) +-Return an RTX indicating whether a function argument is passed in a +-register and if so, which register. +- +-The arguments are @var{ca}, which summarizes all the previous +-arguments; @var{mode}, the machine mode of the argument; @var{type}, +-the data type of the argument as a tree node or 0 if that is not known +-(which happens for C support library functions); and @var{named}, +-which is @code{true} for an ordinary argument and @code{false} for +-nameless arguments that correspond to @samp{@dots{}} in the called +-function's prototype. @var{type} can be an incomplete type if a +-syntax error has previously occurred. ++@deftypefn {Target Hook} rtx TARGET_FUNCTION_ARG (cumulative_args_t @var{ca}, const function_arg_info @var{&arg}) ++Return an RTX indicating whether function argument @var{arg} is passed ++in a register and if so, which register. Argument @var{ca} summarizes all ++the previous arguments. 
+ + The return value is usually either a @code{reg} RTX for the hard + register in which to pass the argument, or zero to pass the argument +@@ -4020,14 +4022,14 @@ defined, the argument will be computed in the stack and then loaded into + a register. + @end deftypefn + +-@deftypefn {Target Hook} bool TARGET_MUST_PASS_IN_STACK (machine_mode @var{mode}, const_tree @var{type}) +-This target hook should return @code{true} if we should not pass @var{type} ++@deftypefn {Target Hook} bool TARGET_MUST_PASS_IN_STACK (const function_arg_info @var{&arg}) ++This target hook should return @code{true} if we should not pass @var{arg} + solely in registers. The file @file{expr.h} defines a + definition that is usually appropriate, refer to @file{expr.h} for additional + documentation. + @end deftypefn + +-@deftypefn {Target Hook} rtx TARGET_FUNCTION_INCOMING_ARG (cumulative_args_t @var{ca}, machine_mode @var{mode}, const_tree @var{type}, bool @var{named}) ++@deftypefn {Target Hook} rtx TARGET_FUNCTION_INCOMING_ARG (cumulative_args_t @var{ca}, const function_arg_info @var{&arg}) + Define this hook if the caller and callee on the target have different + views of where arguments are passed. Also define this hook if there are + functions that are never directly called, but are invoked by the hardware +@@ -4057,7 +4059,7 @@ Perform a target dependent initialization of pic_offset_table_rtx. + This hook is called at the start of register allocation. + @end deftypefn + +-@deftypefn {Target Hook} int TARGET_ARG_PARTIAL_BYTES (cumulative_args_t @var{cum}, machine_mode @var{mode}, tree @var{type}, bool @var{named}) ++@deftypefn {Target Hook} int TARGET_ARG_PARTIAL_BYTES (cumulative_args_t @var{cum}, const function_arg_info @var{&arg}) + This target hook returns the number of bytes at the beginning of an + argument that must be put in registers. The value must be zero for + arguments that are passed entirely in registers or that are entirely +@@ -4076,11 +4078,11 @@ register to be used by the caller for this argument; likewise + @code{TARGET_FUNCTION_INCOMING_ARG}, for the called function. + @end deftypefn + +-@deftypefn {Target Hook} bool TARGET_PASS_BY_REFERENCE (cumulative_args_t @var{cum}, machine_mode @var{mode}, const_tree @var{type}, bool @var{named}) +-This target hook should return @code{true} if an argument at the ++@deftypefn {Target Hook} bool TARGET_PASS_BY_REFERENCE (cumulative_args_t @var{cum}, const function_arg_info @var{&arg}) ++This target hook should return @code{true} if argument @var{arg} at the + position indicated by @var{cum} should be passed by reference. This + predicate is queried after target independent reasons for being +-passed by reference, such as @code{TREE_ADDRESSABLE (type)}. ++passed by reference, such as @code{TREE_ADDRESSABLE (@var{arg}.type)}. + + If the hook returns true, a copy of that argument is made in memory and a + pointer to the argument is passed instead of the argument itself. +@@ -4088,7 +4090,7 @@ The pointer is passed in whatever way is appropriate for passing a pointer + to that type. + @end deftypefn + +-@deftypefn {Target Hook} bool TARGET_CALLEE_COPIES (cumulative_args_t @var{cum}, machine_mode @var{mode}, const_tree @var{type}, bool @var{named}) ++@deftypefn {Target Hook} bool TARGET_CALLEE_COPIES (cumulative_args_t @var{cum}, const function_arg_info @var{&arg}) + The function argument described by the parameters to this hook is + known to be passed by reference. 
The hook should return true if the + function argument should be copied by the callee instead of copied +@@ -4167,10 +4169,9 @@ argument @var{libname} exists for symmetry with + @c --mew 5feb93 i switched the order of the sentences. --mew 10feb93 + @end defmac + +-@deftypefn {Target Hook} void TARGET_FUNCTION_ARG_ADVANCE (cumulative_args_t @var{ca}, machine_mode @var{mode}, const_tree @var{type}, bool @var{named}) ++@deftypefn {Target Hook} void TARGET_FUNCTION_ARG_ADVANCE (cumulative_args_t @var{ca}, const function_arg_info @var{&arg}) + This hook updates the summarizer variable pointed to by @var{ca} to +-advance past an argument in the argument list. The values @var{mode}, +-@var{type} and @var{named} describe that argument. Once this is done, ++advance past argument @var{arg} in the argument list. Once this is done, + the variable @var{cum} is suitable for analyzing the @emph{following} + argument with @code{TARGET_FUNCTION_ARG}, etc. + +@@ -4331,6 +4332,27 @@ insns involving vector mode @var{mode}. At the very least, it + must have move patterns for this mode. + @end deftypefn + ++@deftypefn {Target Hook} bool TARGET_COMPATIBLE_VECTOR_TYPES_P (const_tree @var{type1}, const_tree @var{type2}) ++Return true if there is no target-specific reason for treating ++vector types @var{type1} and @var{type2} as distinct types. The caller ++has already checked for target-independent reasons, meaning that the ++types are known to have the same mode, to have the same number of elements, ++and to have what the caller considers to be compatible element types. ++ ++The main reason for defining this hook is to reject pairs of types ++that are handled differently by the target's calling convention. ++For example, when a new @var{N}-bit vector architecture is added ++to a target, the target may want to handle normal @var{N}-bit ++@code{VECTOR_TYPE} arguments and return values in the same way as ++before, to maintain backwards compatibility. However, it may also ++provide new, architecture-specific @code{VECTOR_TYPE}s that are passed ++and returned in a more efficient way. It is then important to maintain ++a distinction between the ``normal'' @code{VECTOR_TYPE}s and the new ++architecture-specific ones. ++ ++The default implementation returns true, which is correct for most targets. ++@end deftypefn ++ + @deftypefn {Target Hook} opt_machine_mode TARGET_ARRAY_MODE (machine_mode @var{mode}, unsigned HOST_WIDE_INT @var{nelems}) + Return the mode that GCC should use for an array that has + @var{nelems} elements, with each element having mode @var{mode}. +@@ -5202,7 +5224,7 @@ return value of this function should be an RTX that contains the value + to use as the return of @code{__builtin_saveregs}. + @end deftypefn + +-@deftypefn {Target Hook} void TARGET_SETUP_INCOMING_VARARGS (cumulative_args_t @var{args_so_far}, machine_mode @var{mode}, tree @var{type}, int *@var{pretend_args_size}, int @var{second_time}) ++@deftypefn {Target Hook} void TARGET_SETUP_INCOMING_VARARGS (cumulative_args_t @var{args_so_far}, const function_arg_info @var{&arg}, int *@var{pretend_args_size}, int @var{second_time}) + This target hook offers an alternative to using + @code{__builtin_saveregs} and defining the hook + @code{TARGET_EXPAND_BUILTIN_SAVEREGS}. Use it to store the anonymous +@@ -5213,8 +5235,8 @@ pass all their arguments on the stack. + + The argument @var{args_so_far} points to the @code{CUMULATIVE_ARGS} data + structure, containing the values that are obtained after processing the +-named arguments. 
The arguments @var{mode} and @var{type} describe the +-last named argument---its machine mode and its data type as a tree node. ++named arguments. The argument @var{arg} describes the last of these named ++arguments. + + The target hook should do two things: first, push onto the stack all the + argument registers @emph{not} used for the named arguments, and second, +@@ -5314,12 +5336,6 @@ This hook is used by expand pass to emit insn to store @var{bounds} + returned by function call into @var{slot}. + @end deftypefn + +-@deftypefn {Target Hook} void TARGET_SETUP_INCOMING_VARARG_BOUNDS (cumulative_args_t @var{args_so_far}, machine_mode @var{mode}, tree @var{type}, int *@var{pretend_args_size}, int @var{second_time}) +-Use it to store bounds for anonymous register arguments stored +-into the stack. Arguments meaning is similar to +-@code{TARGET_SETUP_INCOMING_VARARGS}. +-@end deftypefn +- + @node Trampolines + @section Support for Nested Functions + @cindex support for nested functions +@@ -5967,18 +5983,6 @@ instruction pattern. There is no need for the hook to handle these two + implementation approaches itself. + @end deftypefn + +-@deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_CONVERSION (unsigned @var{code}, tree @var{dest_type}, tree @var{src_type}) +-This hook should return the DECL of a function that implements conversion of the +-input vector of type @var{src_type} to type @var{dest_type}. +-The value of @var{code} is one of the enumerators in @code{enum tree_code} and +-specifies how the conversion is to be applied +-(truncation, rounding, etc.). +- +-If this hook is defined, the autovectorizer will use the +-@code{TARGET_VECTORIZE_BUILTIN_CONVERSION} target hook when vectorizing +-conversion. Otherwise, it will return @code{NULL_TREE}. +-@end deftypefn +- + @deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION (unsigned @var{code}, tree @var{vec_type_out}, tree @var{vec_type_in}) + This hook should return the decl of a function that implements the + vectorized variant of the function with the @code{combined_fn} code +@@ -6698,7 +6702,7 @@ two areas of memory, or to set, clear or store to memory, for example + when copying a @code{struct}. The @code{by_pieces} infrastructure + implements such memory operations as a sequence of load, store or move + insns. Alternate strategies are to expand the +-@code{movmem} or @code{setmem} optabs, to emit a library call, or to emit ++@code{cpymem} or @code{setmem} optabs, to emit a library call, or to emit + unit-by-unit, loop-based operations. + + This target hook should return true if, for a memory operation with a +@@ -6717,7 +6721,7 @@ optimized for speed rather than size. + + Returning true for higher values of @var{size} can improve code generation + for speed if the target does not provide an implementation of the +-@code{movmem} or @code{setmem} standard names, if the @code{movmem} or ++@code{cpymem} or @code{setmem} standard names, if the @code{cpymem} or + @code{setmem} implementation would be more expensive than a sequence of + insns, or if the overhead of a library call would dominate that of + the body of the memory operation. +@@ -11607,6 +11611,21 @@ another @code{CALL_EXPR}. 
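As a user-level illustration of the @code{by_pieces} versus @code{cpymem} discussion above (not part of the patch; the struct and function names are invented), a small fixed-size copy such as the one below is a typical candidate for inline expansion rather than a call to the @code{memcpy} library routine. Which mechanism is chosen depends on the size, the alignment and the target's cost hooks:

    #include <string.h>

    struct packet
    {
      unsigned char header[8];
      unsigned int payload;
    };

    void
    copy_packet (struct packet *dst, const struct packet *src)
    {
      /* A short, known-size copy: GCC may expand this through the
         by_pieces infrastructure or the target's cpymem pattern instead
         of emitting a call to memcpy.  */
      memcpy (dst, src, sizeof *dst);
    }
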
+ @var{arglist} really has type @samp{VEC(tree,gc)*} + @end deftypefn + ++@deftypefn {Target Hook} bool TARGET_CHECK_BUILTIN_CALL (location_t @var{loc}, vec @var{arg_loc}, tree @var{fndecl}, tree @var{orig_fndecl}, unsigned int @var{nargs}, tree *@var{args}) ++Perform semantic checking on a call to a machine-specific built-in ++function after its arguments have been constrained to the function ++signature. Return true if the call is valid, otherwise report an error ++and return false. ++ ++This hook is called after @code{TARGET_RESOLVE_OVERLOADED_BUILTIN}. ++The call was originally to built-in function @var{orig_fndecl}, ++but after the optional @code{TARGET_RESOLVE_OVERLOADED_BUILTIN} ++step is now to built-in function @var{fndecl}. @var{loc} is the ++location of the call and @var{args} is an array of function arguments, ++of which there are @var{nargs}. @var{arg_loc} specifies the location ++of each argument. ++@end deftypefn ++ + @deftypefn {Target Hook} tree TARGET_FOLD_BUILTIN (tree @var{fndecl}, int @var{n_args}, tree *@var{argp}, bool @var{ignore}) + Fold a call to a machine specific built-in function that was set up by + @samp{TARGET_INIT_BUILTINS}. @var{fndecl} is the declaration of the +@@ -11791,28 +11810,6 @@ cannot_modify_jumps_past_reload_p () + @end smallexample + @end deftypefn + +-@deftypefn {Target Hook} reg_class_t TARGET_BRANCH_TARGET_REGISTER_CLASS (void) +-This target hook returns a register class for which branch target register +-optimizations should be applied. All registers in this class should be +-usable interchangeably. After reload, registers in this class will be +-re-allocated and loads will be hoisted out of loops and be subjected +-to inter-block scheduling. +-@end deftypefn +- +-@deftypefn {Target Hook} bool TARGET_BRANCH_TARGET_REGISTER_CALLEE_SAVED (bool @var{after_prologue_epilogue_gen}) +-Branch target register optimization will by default exclude callee-saved +-registers +-that are not already live during the current function; if this target hook +-returns true, they will be included. The target code must than make sure +-that all target registers in the class returned by +-@samp{TARGET_BRANCH_TARGET_REGISTER_CLASS} that might need saving are +-saved. @var{after_prologue_epilogue_gen} indicates if prologues and +-epilogues have already been generated. Note, even if you only return +-true when @var{after_prologue_epilogue_gen} is false, you still are likely +-to have to make special provisions in @code{INITIAL_ELIMINATION_OFFSET} +-to reserve space for caller-saved target registers. +-@end deftypefn +- + @deftypefn {Target Hook} bool TARGET_HAVE_CONDITIONAL_EXECUTION (void) + This target hook returns true if the target supports conditional execution. + This target hook is required only when the target has several different +diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in +index bc362dca0..89cfb5253 100644 +--- a/gcc/doc/tm.texi.in ++++ b/gcc/doc/tm.texi.in +@@ -1689,6 +1689,9 @@ function calls. + If a register has 0 in @code{CALL_USED_REGISTERS}, the compiler + automatically saves it on function entry and restores it on function + exit, if the register is used within the function. ++ ++Exactly one of @code{CALL_USED_REGISTERS} and @code{CALL_REALLY_USED_REGISTERS} ++must be defined. Modern ports should define @code{CALL_REALLY_USED_REGISTERS}. + @end defmac + + @defmac CALL_REALLY_USED_REGISTERS +@@ -1698,18 +1701,22 @@ exit, if the register is used within the function. 
+ Like @code{CALL_USED_REGISTERS} except this macro doesn't require + that the entire set of @code{FIXED_REGISTERS} be included. + (@code{CALL_USED_REGISTERS} must be a superset of @code{FIXED_REGISTERS}). +-This macro is optional. If not specified, it defaults to the value +-of @code{CALL_USED_REGISTERS}. ++ ++Exactly one of @code{CALL_USED_REGISTERS} and @code{CALL_REALLY_USED_REGISTERS} ++must be defined. Modern ports should define @code{CALL_REALLY_USED_REGISTERS}. + @end defmac + + @cindex call-used register + @cindex call-clobbered register + @cindex call-saved register +-@hook TARGET_HARD_REGNO_CALL_PART_CLOBBERED ++@hook TARGET_FNTYPE_ABI + +-@hook TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS ++@hook TARGET_INSN_CALLEE_ABI + +-@hook TARGET_RETURN_CALL_WITH_MAX_CLOBBERS ++@cindex call-used register ++@cindex call-clobbered register ++@cindex call-saved register ++@hook TARGET_HARD_REGNO_CALL_PART_CLOBBERED + + @hook TARGET_GET_MULTILIB_ABI_NAME + +@@ -3362,6 +3369,8 @@ stack. + + @hook TARGET_VECTOR_MODE_SUPPORTED_P + ++@hook TARGET_COMPATIBLE_VECTOR_TYPES_P ++ + @hook TARGET_ARRAY_MODE + + @hook TARGET_ARRAY_MODE_SUPPORTED_P +@@ -3785,8 +3794,6 @@ These machine description macros help implement varargs: + + @hook TARGET_STORE_RETURNED_BOUNDS + +-@hook TARGET_SETUP_INCOMING_VARARG_BOUNDS +- + @node Trampolines + @section Support for Nested Functions + @cindex support for nested functions +@@ -4160,8 +4167,6 @@ address; but often a machine-dependent strategy can generate better code. + + @hook TARGET_VECTORIZE_VEC_PERM_CONST + +-@hook TARGET_VECTORIZE_BUILTIN_CONVERSION +- + @hook TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION + + @hook TARGET_VECTORIZE_BUILTIN_MD_VECTORIZED_FUNCTION +@@ -7934,6 +7939,8 @@ to by @var{ce_info}. + + @hook TARGET_RESOLVE_OVERLOADED_BUILTIN + ++@hook TARGET_CHECK_BUILTIN_CALL ++ + @hook TARGET_FOLD_BUILTIN + + @hook TARGET_GIMPLE_FOLD_BUILTIN +@@ -7999,10 +8006,6 @@ build_type_attribute_variant (@var{mdecl}, + + @hook TARGET_CANNOT_MODIFY_JUMPS_P + +-@hook TARGET_BRANCH_TARGET_REGISTER_CLASS +- +-@hook TARGET_BRANCH_TARGET_REGISTER_CALLEE_SAVED +- + @hook TARGET_HAVE_CONDITIONAL_EXECUTION + + @hook TARGET_GEN_CCMP_FIRST +diff --git a/gcc/dse.c b/gcc/dse.c +index 4becdcf1c..874ff507c 100644 +--- a/gcc/dse.c ++++ b/gcc/dse.c +@@ -50,6 +50,7 @@ along with GCC; see the file COPYING3. If not see + #include "params.h" + #include "rtl-iter.h" + #include "cfgcleanup.h" ++#include "calls.h" + + /* This file contains three techniques for performing Dead Store + Elimination (dse). 
+@@ -819,7 +820,7 @@ emit_inc_dec_insn_before (rtx mem ATTRIBUTE_UNUSED, + for (cur = new_insn; cur; cur = NEXT_INSN (cur)) + { + info.current = cur; +- note_stores (PATTERN (cur), note_add_store, &info); ++ note_stores (cur, note_add_store, &info); + } + + /* If a failure was flagged above, return 1 so that for_each_inc_dec will +@@ -1976,7 +1977,7 @@ replace_read (store_info *store_info, insn_info_t store_insn, + bitmap regs_set = BITMAP_ALLOC (®_obstack); + + for (this_insn = insns; this_insn != NULL_RTX; this_insn = NEXT_INSN (this_insn)) +- note_stores (PATTERN (this_insn), look_for_hardregs, regs_set); ++ note_stores (this_insn, look_for_hardregs, regs_set); + + bitmap_and_into (regs_set, regs_live); + if (!bitmap_empty_p (regs_set)) +@@ -2341,7 +2342,8 @@ get_call_args (rtx call_insn, tree fn, rtx *args, int nargs) + if (!is_int_mode (TYPE_MODE (TREE_VALUE (arg)), &mode)) + return false; + +- reg = targetm.calls.function_arg (args_so_far, mode, NULL_TREE, true); ++ function_arg_info arg (mode, /*named=*/true); ++ reg = targetm.calls.function_arg (args_so_far, arg); + if (!reg || !REG_P (reg) || GET_MODE (reg) != mode) + return false; + +@@ -2373,7 +2375,7 @@ get_call_args (rtx call_insn, tree fn, rtx *args, int nargs) + if (tmp) + args[idx] = tmp; + +- targetm.calls.function_arg_advance (args_so_far, mode, NULL_TREE, true); ++ targetm.calls.function_arg_advance (args_so_far, arg); + } + if (arg != void_list_node || idx != nargs) + return false; +@@ -2388,7 +2390,7 @@ copy_fixed_regs (const_bitmap in) + bitmap ret; + + ret = ALLOC_REG_SET (NULL); +- bitmap_and (ret, in, fixed_reg_set_regset); ++ bitmap_and (ret, in, bitmap_view (fixed_reg_set)); + return ret; + } + +diff --git a/gcc/dwarf2out.c b/gcc/dwarf2out.c +index 30c4c7007..a219d7fc3 100644 +--- a/gcc/dwarf2out.c ++++ b/gcc/dwarf2out.c +@@ -16428,7 +16428,6 @@ mem_loc_descriptor (rtx rtl, machine_mode mode, + case CONST_FIXED: + case CLRSB: + case CLOBBER: +- case CLOBBER_HIGH: + break; + + case CONST_STRING: +@@ -18566,6 +18565,24 @@ loc_list_from_tree_1 (tree loc, int want_address, + } + break; + ++ case POLY_INT_CST: ++ { ++ if (want_address) ++ { ++ expansion_failed (loc, NULL_RTX, ++ "constant address with a runtime component"); ++ return 0; ++ } ++ poly_int64 value; ++ if (!poly_int_tree_p (loc, &value)) ++ { ++ expansion_failed (loc, NULL_RTX, "constant too big"); ++ return 0; ++ } ++ ret = int_loc_descriptor (value); ++ } ++ break; ++ + case CONSTRUCTOR: + case REAL_CST: + case STRING_CST: +@@ -19682,6 +19699,7 @@ add_const_value_attribute (dw_die_ref die, rtx rtl) + case MINUS: + case SIGN_EXTEND: + case ZERO_EXTEND: ++ case CONST_POLY_INT: + return false; + + case MEM: +diff --git a/gcc/emit-rtl.c b/gcc/emit-rtl.c +index 78104603c..d6636ccb0 100644 +--- a/gcc/emit-rtl.c ++++ b/gcc/emit-rtl.c +@@ -2865,7 +2865,6 @@ verify_rtx_sharing (rtx orig, rtx insn) + /* SCRATCH must be shared because they represent distinct values. */ + return; + case CLOBBER: +- case CLOBBER_HIGH: + /* Share clobbers of hard registers (like cc0), but do not share pseudo reg + clobbers or clobbers of hard registers that originated as pseudos. + This is needed to allow safe register renaming. */ +@@ -3119,7 +3118,6 @@ repeat: + /* SCRATCH must be shared because they represent distinct values. */ + return; + case CLOBBER: +- case CLOBBER_HIGH: + /* Share clobbers of hard registers (like cc0), but do not share pseudo reg + clobbers or clobbers of hard registers that originated as pseudos. + This is needed to allow safe register renaming. 
*/ +@@ -5693,7 +5691,6 @@ copy_insn_1 (rtx orig) + case SIMPLE_RETURN: + return orig; + case CLOBBER: +- case CLOBBER_HIGH: + /* Share clobbers of hard registers (like cc0), but do not share pseudo reg + clobbers or clobbers of hard registers that originated as pseudos. + This is needed to allow safe register renaming. */ +@@ -6505,21 +6502,6 @@ gen_hard_reg_clobber (machine_mode mode, unsigned int regno) + gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (mode, regno))); + } + +-static GTY((deletable)) rtx +-hard_reg_clobbers_high[NUM_MACHINE_MODES][FIRST_PSEUDO_REGISTER]; +- +-/* Return a CLOBBER_HIGH expression for register REGNO that clobbers MODE, +- caching into HARD_REG_CLOBBERS_HIGH. */ +-rtx +-gen_hard_reg_clobber_high (machine_mode mode, unsigned int regno) +-{ +- if (hard_reg_clobbers_high[mode][regno]) +- return hard_reg_clobbers_high[mode][regno]; +- else +- return (hard_reg_clobbers_high[mode][regno] +- = gen_rtx_CLOBBER_HIGH (VOIDmode, gen_rtx_REG (mode, regno))); +-} +- + location_t prologue_location; + location_t epilogue_location; + +diff --git a/gcc/emit-rtl.h b/gcc/emit-rtl.h +index 7b1cecd3c..573140e84 100644 +--- a/gcc/emit-rtl.h ++++ b/gcc/emit-rtl.h +@@ -22,6 +22,7 @@ along with GCC; see the file COPYING3. If not see + + struct temp_slot; + typedef struct temp_slot *temp_slot_p; ++struct predefined_function_abi; + + /* Information mainlined about RTL representation of incoming arguments. */ + struct GTY(()) incoming_args { +@@ -64,6 +65,14 @@ struct GTY(()) rtl_data { + struct function_subsections subsections; + struct rtl_eh eh; + ++ /* The ABI of the function, i.e. the interface it presents to its callers. ++ This is the ABI that should be queried to see which registers the ++ function needs to save before it uses them. ++ ++ Other functions (including those called by this function) might use ++ different ABIs. */ ++ const predefined_function_abi *GTY((skip)) abi; ++ + /* For function.c */ + + /* # of bytes of outgoing arguments. If ACCUMULATE_OUTGOING_ARGS is +diff --git a/gcc/expr.c b/gcc/expr.c +index 650be8dad..b77f0409e 100644 +--- a/gcc/expr.c ++++ b/gcc/expr.c +@@ -73,7 +73,7 @@ along with GCC; see the file COPYING3. 
If not see + int cse_not_expected; + + static bool block_move_libcall_safe_for_call_parm (void); +-static bool emit_block_move_via_movmem (rtx, rtx, rtx, unsigned, unsigned, HOST_WIDE_INT, ++static bool emit_block_move_via_cpymem (rtx, rtx, rtx, unsigned, unsigned, HOST_WIDE_INT, + unsigned HOST_WIDE_INT, unsigned HOST_WIDE_INT, + unsigned HOST_WIDE_INT); + static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned); +@@ -1645,7 +1645,7 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method, + + if (CONST_INT_P (size) && can_move_by_pieces (INTVAL (size), align)) + move_by_pieces (x, y, INTVAL (size), align, RETURN_BEGIN); +- else if (emit_block_move_via_movmem (x, y, size, align, ++ else if (emit_block_move_via_cpymem (x, y, size, align, + expected_align, expected_size, + min_size, max_size, probable_max_size)) + ; +@@ -1723,31 +1723,28 @@ block_move_libcall_safe_for_call_parm (void) + for ( ; arg != void_list_node ; arg = TREE_CHAIN (arg)) + { + machine_mode mode = TYPE_MODE (TREE_VALUE (arg)); +- rtx tmp = targetm.calls.function_arg (args_so_far, mode, +- NULL_TREE, true); ++ function_arg_info arg_info (mode, /*named=*/true); ++ rtx tmp = targetm.calls.function_arg (args_so_far, arg_info); + if (!tmp || !REG_P (tmp)) + return false; +- if (targetm.calls.arg_partial_bytes (args_so_far, mode, NULL, 1)) ++ if (targetm.calls.arg_partial_bytes (args_so_far, arg_info)) + return false; +- targetm.calls.function_arg_advance (args_so_far, mode, +- NULL_TREE, true); ++ targetm.calls.function_arg_advance (args_so_far, arg_info); + } + } + return true; + } + +-/* A subroutine of emit_block_move. Expand a movmem pattern; ++/* A subroutine of emit_block_move. Expand a cpymem pattern; + return true if successful. */ + + static bool +-emit_block_move_via_movmem (rtx x, rtx y, rtx size, unsigned int align, ++emit_block_move_via_cpymem (rtx x, rtx y, rtx size, unsigned int align, + unsigned int expected_align, HOST_WIDE_INT expected_size, + unsigned HOST_WIDE_INT min_size, + unsigned HOST_WIDE_INT max_size, + unsigned HOST_WIDE_INT probable_max_size) + { +- int save_volatile_ok = volatile_ok; +- + if (expected_align < align) + expected_align = align; + if (expected_size != -1) +@@ -1759,7 +1756,7 @@ emit_block_move_via_movmem (rtx x, rtx y, rtx size, unsigned int align, + } + + /* Since this is a move insn, we don't care about volatility. 
*/ +- volatile_ok = 1; ++ temporary_volatile_ok v (true); + + /* Try the most limited insn first, because there's no point + including more than one in the machine description unless +@@ -1769,7 +1766,7 @@ emit_block_move_via_movmem (rtx x, rtx y, rtx size, unsigned int align, + FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT) + { + scalar_int_mode mode = mode_iter.require (); +- enum insn_code code = direct_optab_handler (movmem_optab, mode); ++ enum insn_code code = direct_optab_handler (cpymem_optab, mode); + + if (code != CODE_FOR_nothing + /* We don't need MODE to be narrower than BITS_PER_HOST_WIDE_INT +@@ -1823,14 +1820,10 @@ emit_block_move_via_movmem (rtx x, rtx y, rtx size, unsigned int align, + create_fixed_operand (&ops[8], NULL); + } + if (maybe_expand_insn (code, nops, ops)) +- { +- volatile_ok = save_volatile_ok; +- return true; +- } ++ return true; + } + } + +- volatile_ok = save_volatile_ok; + return false; + } + +@@ -5841,7 +5834,8 @@ store_expr (tree exp, rtx target, int call_param_p, + copy_blkmode_from_reg (target, temp, TREE_TYPE (exp)); + else + store_bit_field (target, +- INTVAL (expr_size (exp)) * BITS_PER_UNIT, ++ rtx_to_poly_int64 (expr_size (exp)) ++ * BITS_PER_UNIT, + 0, 0, 0, GET_MODE (temp), temp, reverse); + } + else +diff --git a/gcc/final.c b/gcc/final.c +index fefc4874b..7cf9ef1ef 100644 +--- a/gcc/final.c ++++ b/gcc/final.c +@@ -81,6 +81,7 @@ along with GCC; see the file COPYING3. If not see + #include "asan.h" + #include "rtl-iter.h" + #include "print-rtl.h" ++#include "function-abi.h" + + #ifdef XCOFF_DEBUGGING_INFO + #include "xcoffout.h" /* Needed for external data declarations. */ +@@ -230,7 +231,6 @@ static int alter_cond (rtx); + #endif + static int align_fuzz (rtx, rtx, int, unsigned); + static void collect_fn_hard_reg_usage (void); +-static tree get_call_fndecl (rtx_insn *); + + /* Initialize data in final at the beginning of a compilation. */ + +@@ -4994,7 +4994,16 @@ collect_fn_hard_reg_usage (void) + if (!targetm.call_fusage_contains_non_callee_clobbers) + return; + +- CLEAR_HARD_REG_SET (function_used_regs); ++ /* Be conservative - mark fixed and global registers as used. */ ++ function_used_regs = fixed_reg_set; ++ ++#ifdef STACK_REGS ++ /* Handle STACK_REGS conservatively, since the df-framework does not ++ provide accurate information for them. */ ++ ++ for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++) ++ SET_HARD_REG_BIT (function_used_regs, i); ++#endif + + for (insn = get_insns (); insn != NULL_RTX; insn = next_insn (insn)) + { +@@ -5005,97 +5014,23 @@ collect_fn_hard_reg_usage (void) + + if (CALL_P (insn) + && !self_recursive_call_p (insn)) +- { +- if (!get_call_reg_set_usage (insn, &insn_used_regs, +- call_used_reg_set)) +- return; +- +- IOR_HARD_REG_SET (function_used_regs, insn_used_regs); +- } ++ function_used_regs ++ |= insn_callee_abi (insn).full_and_partial_reg_clobbers (); + + find_all_hard_reg_sets (insn, &insn_used_regs, false); +- IOR_HARD_REG_SET (function_used_regs, insn_used_regs); +- } ++ function_used_regs |= insn_used_regs; + +- /* Be conservative - mark fixed and global registers as used. */ +- IOR_HARD_REG_SET (function_used_regs, fixed_reg_set); +- +-#ifdef STACK_REGS +- /* Handle STACK_REGS conservatively, since the df-framework does not +- provide accurate information for them. 
*/ +- +- for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++) +- SET_HARD_REG_BIT (function_used_regs, i); +-#endif ++ if (hard_reg_set_subset_p (crtl->abi->full_and_partial_reg_clobbers (), ++ function_used_regs)) ++ return; ++ } + +- /* The information we have gathered is only interesting if it exposes a +- register from the call_used_regs that is not used in this function. */ +- if (hard_reg_set_subset_p (call_used_reg_set, function_used_regs)) +- return; ++ /* Mask out fully-saved registers, so that they don't affect equality ++ comparisons between function_abis. */ ++ function_used_regs &= crtl->abi->full_and_partial_reg_clobbers (); + + node = cgraph_node::rtl_info (current_function_decl); + gcc_assert (node != NULL); + +- COPY_HARD_REG_SET (node->function_used_regs, function_used_regs); +- node->function_used_regs_valid = 1; +-} +- +-/* Get the declaration of the function called by INSN. */ +- +-static tree +-get_call_fndecl (rtx_insn *insn) +-{ +- rtx note, datum; +- +- note = find_reg_note (insn, REG_CALL_DECL, NULL_RTX); +- if (note == NULL_RTX) +- return NULL_TREE; +- +- datum = XEXP (note, 0); +- if (datum != NULL_RTX) +- return SYMBOL_REF_DECL (datum); +- +- return NULL_TREE; +-} +- +-/* Return the cgraph_rtl_info of the function called by INSN. Returns NULL for +- call targets that can be overwritten. */ +- +-static struct cgraph_rtl_info * +-get_call_cgraph_rtl_info (rtx_insn *insn) +-{ +- tree fndecl; +- +- if (insn == NULL_RTX) +- return NULL; +- +- fndecl = get_call_fndecl (insn); +- if (fndecl == NULL_TREE +- || !decl_binds_to_current_def_p (fndecl)) +- return NULL; +- +- return cgraph_node::rtl_info (fndecl); +-} +- +-/* Find hard registers used by function call instruction INSN, and return them +- in REG_SET. Return DEFAULT_SET in REG_SET if not found. */ +- +-bool +-get_call_reg_set_usage (rtx_insn *insn, HARD_REG_SET *reg_set, +- HARD_REG_SET default_set) +-{ +- if (flag_ipa_ra) +- { +- struct cgraph_rtl_info *node = get_call_cgraph_rtl_info (insn); +- if (node != NULL +- && node->function_used_regs_valid) +- { +- COPY_HARD_REG_SET (*reg_set, node->function_used_regs); +- AND_HARD_REG_SET (*reg_set, default_set); +- return true; +- } +- } +- COPY_HARD_REG_SET (*reg_set, default_set); +- targetm.remove_extra_call_preserved_regs (insn, reg_set); +- return false; ++ node->function_used_regs = function_used_regs; + } +diff --git a/gcc/fold-const-call.c b/gcc/fold-const-call.c +index 702c8b405..e21d8e110 100644 +--- a/gcc/fold-const-call.c ++++ b/gcc/fold-const-call.c +@@ -689,6 +689,36 @@ fold_const_vec_convert (tree ret_type, tree arg) + return elts.build (); + } + ++/* Try to evaluate: ++ ++ IFN_WHILE_ULT (ARG0, ARG1, (TYPE) { ... }) ++ ++ Return the value on success and null on failure. 
*/ ++ ++static tree ++fold_while_ult (tree type, poly_uint64 arg0, poly_uint64 arg1) ++{ ++ if (known_ge (arg0, arg1)) ++ return build_zero_cst (type); ++ ++ if (maybe_ge (arg0, arg1)) ++ return NULL_TREE; ++ ++ poly_uint64 diff = arg1 - arg0; ++ poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (type); ++ if (known_ge (diff, nelts)) ++ return build_all_ones_cst (type); ++ ++ unsigned HOST_WIDE_INT const_diff; ++ if (known_le (diff, nelts) && diff.is_constant (&const_diff)) ++ { ++ tree minus_one = build_minus_one_cst (TREE_TYPE (type)); ++ tree zero = build_zero_cst (TREE_TYPE (type)); ++ return build_vector_a_then_b (type, const_diff, minus_one, zero); ++ } ++ return NULL_TREE; ++} ++ + /* Try to evaluate: + + *RESULT = FN (*ARG) +@@ -1782,6 +1812,14 @@ fold_const_call (combined_fn fn, tree type, tree arg0, tree arg1, tree arg2) + } + return NULL_TREE; + ++ case CFN_WHILE_ULT: ++ { ++ poly_uint64 parg0, parg1; ++ if (poly_int_tree_p (arg0, &parg0) && poly_int_tree_p (arg1, &parg1)) ++ return fold_while_ult (type, parg0, parg1); ++ return NULL_TREE; ++ } ++ + default: + return fold_const_call_1 (fn, type, arg0, arg1, arg2); + } +diff --git a/gcc/fold-const.c b/gcc/fold-const.c +index c717f2450..ffc2669a7 100644 +--- a/gcc/fold-const.c ++++ b/gcc/fold-const.c +@@ -3477,7 +3477,8 @@ operand_equal_p (const_tree arg0, const_tree arg1, unsigned int flags) + return (TREE_CODE (arg0) == FUNCTION_DECL + && fndecl_built_in_p (arg0) && fndecl_built_in_p (arg1) + && DECL_BUILT_IN_CLASS (arg0) == DECL_BUILT_IN_CLASS (arg1) +- && DECL_FUNCTION_CODE (arg0) == DECL_FUNCTION_CODE (arg1)); ++ && (DECL_UNCHECKED_FUNCTION_CODE (arg0) ++ == DECL_UNCHECKED_FUNCTION_CODE (arg1))); + + case tcc_exceptional: + if (TREE_CODE (arg0) == CONSTRUCTOR) +@@ -7380,22 +7381,18 @@ native_encode_complex (const_tree expr, unsigned char *ptr, int len, int off) + return rsize + isize; + } + +- +-/* Subroutine of native_encode_expr. Encode the VECTOR_CST +- specified by EXPR into the buffer PTR of length LEN bytes. +- Return the number of bytes placed in the buffer, or zero +- upon failure. */ ++/* Like native_encode_vector, but only encode the first COUNT elements. ++ The other arguments are as for native_encode_vector. */ + + static int +-native_encode_vector (const_tree expr, unsigned char *ptr, int len, int off) ++native_encode_vector_part (const_tree expr, unsigned char *ptr, int len, ++ int off, unsigned HOST_WIDE_INT count) + { +- unsigned HOST_WIDE_INT i, count; ++ unsigned HOST_WIDE_INT i; + int size, offset; + tree itype, elem; + + offset = 0; +- if (!VECTOR_CST_NELTS (expr).is_constant (&count)) +- return 0; + itype = TREE_TYPE (TREE_TYPE (expr)); + size = GET_MODE_SIZE (SCALAR_TYPE_MODE (itype)); + for (i = 0; i < count; i++) +@@ -7419,6 +7416,20 @@ native_encode_vector (const_tree expr, unsigned char *ptr, int len, int off) + return offset; + } + ++/* Subroutine of native_encode_expr. Encode the VECTOR_CST ++ specified by EXPR into the buffer PTR of length LEN bytes. ++ Return the number of bytes placed in the buffer, or zero ++ upon failure. */ ++ ++static int ++native_encode_vector (const_tree expr, unsigned char *ptr, int len, int off) ++{ ++ unsigned HOST_WIDE_INT count; ++ if (!VECTOR_CST_NELTS (expr).is_constant (&count)) ++ return 0; ++ return native_encode_vector_part (expr, ptr, len, off, count); ++} ++ + + /* Subroutine of native_encode_expr. Encode the STRING_CST + specified by EXPR into the buffer PTR of length LEN bytes. 
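As a quick illustration of what the new CFN_WHILE_ULT folding above computes, here is a minimal scalar sketch (written for this note, not part of the patch): lane I of the result is all-ones exactly when ARG0 + I < ARG1, so for constant operands the result is a run of active low lanes.

static unsigned int
while_ult_lane_mask (unsigned int arg0, unsigned int arg1,
                     unsigned int nelts)
{
  /* Model fold_while_ult for fully constant inputs: the first
     MIN (arg1 - arg0, nelts) lanes are active, the rest are zero;
     if arg0 >= arg1 no lane is active at all.  */
  unsigned int mask = 0;
  for (unsigned int i = 0; i < nelts; ++i)
    if (arg0 + i < arg1)
      mask |= 1u << i;
  return mask;
}

For example, while_ult_lane_mask (2, 7, 8) is 0x1f, matching the { -1, -1, -1, -1, -1, 0, 0, 0 } vector that fold_while_ult builds with build_vector_a_then_b.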
+@@ -7714,6 +7725,113 @@ can_native_interpret_type_p (tree type) + } + } + ++/* Read a vector of type TYPE from the target memory image given by BYTES, ++ starting at byte FIRST_BYTE. The vector is known to be encodable using ++ NPATTERNS interleaved patterns with NELTS_PER_PATTERN elements each, ++ and BYTES is known to have enough bytes to supply NPATTERNS * ++ NELTS_PER_PATTERN vector elements. Each element of BYTES contains ++ BITS_PER_UNIT bits and the bytes are in target memory order. ++ ++ Return the vector on success, otherwise return null. */ ++ ++static tree ++native_decode_vector_tree (tree type, vec bytes, ++ unsigned int first_byte, unsigned int npatterns, ++ unsigned int nelts_per_pattern) ++{ ++ tree_vector_builder builder (type, npatterns, nelts_per_pattern); ++ tree elt_type = TREE_TYPE (type); ++ unsigned int elt_bits = tree_to_uhwi (TYPE_SIZE (elt_type)); ++ if (VECTOR_BOOLEAN_TYPE_P (type) && elt_bits <= BITS_PER_UNIT) ++ { ++ /* This is the only case in which elements can be smaller than a byte. ++ Element 0 is always in the lsb of the containing byte. */ ++ elt_bits = TYPE_PRECISION (elt_type); ++ for (unsigned int i = 0; i < builder.encoded_nelts (); ++i) ++ { ++ unsigned int bit_index = first_byte * BITS_PER_UNIT + i * elt_bits; ++ unsigned int byte_index = bit_index / BITS_PER_UNIT; ++ unsigned int lsb = bit_index % BITS_PER_UNIT; ++ builder.quick_push (bytes[byte_index] & (1 << lsb) ++ ? build_all_ones_cst (elt_type) ++ : build_zero_cst (elt_type)); ++ } ++ } ++ else ++ { ++ unsigned int elt_bytes = elt_bits / BITS_PER_UNIT; ++ for (unsigned int i = 0; i < builder.encoded_nelts (); ++i) ++ { ++ tree elt = native_interpret_expr (elt_type, &bytes[first_byte], ++ elt_bytes); ++ if (!elt) ++ return NULL_TREE; ++ builder.quick_push (elt); ++ first_byte += elt_bytes; ++ } ++ } ++ return builder.build (); ++} ++ ++/* Try to view-convert VECTOR_CST EXPR to VECTOR_TYPE TYPE by operating ++ directly on the VECTOR_CST encoding, in a way that works for variable- ++ length vectors. Return the resulting VECTOR_CST on success or null ++ on failure. */ ++ ++static tree ++fold_view_convert_vector_encoding (tree type, tree expr) ++{ ++ tree expr_type = TREE_TYPE (expr); ++ poly_uint64 type_bits, expr_bits; ++ if (!poly_int_tree_p (TYPE_SIZE (type), &type_bits) ++ || !poly_int_tree_p (TYPE_SIZE (expr_type), &expr_bits)) ++ return NULL_TREE; ++ ++ poly_uint64 type_units = TYPE_VECTOR_SUBPARTS (type); ++ poly_uint64 expr_units = TYPE_VECTOR_SUBPARTS (expr_type); ++ unsigned int type_elt_bits = vector_element_size (type_bits, type_units); ++ unsigned int expr_elt_bits = vector_element_size (expr_bits, expr_units); ++ ++ /* We can only preserve the semantics of a stepped pattern if the new ++ vector element is an integer of the same size. */ ++ if (VECTOR_CST_STEPPED_P (expr) ++ && (!INTEGRAL_TYPE_P (type) || type_elt_bits != expr_elt_bits)) ++ return NULL_TREE; ++ ++ /* The number of bits needed to encode one element from every pattern ++ of the original vector. */ ++ unsigned int expr_sequence_bits ++ = VECTOR_CST_NPATTERNS (expr) * expr_elt_bits; ++ ++ /* The number of bits needed to encode one element from every pattern ++ of the result. */ ++ unsigned int type_sequence_bits ++ = least_common_multiple (expr_sequence_bits, type_elt_bits); ++ ++ /* Don't try to read more bytes than are available, which can happen ++ for constant-sized vectors if TYPE has larger elements than EXPR_TYPE. 
++ The general VIEW_CONVERT handling can cope with that case, so there's ++ no point complicating things here. */ ++ unsigned int nelts_per_pattern = VECTOR_CST_NELTS_PER_PATTERN (expr); ++ unsigned int buffer_bytes = CEIL (nelts_per_pattern * type_sequence_bits, ++ BITS_PER_UNIT); ++ unsigned int buffer_bits = buffer_bytes * BITS_PER_UNIT; ++ if (known_gt (buffer_bits, expr_bits)) ++ return NULL_TREE; ++ ++ /* Get enough bytes of EXPR to form the new encoding. */ ++ auto_vec buffer (buffer_bytes); ++ buffer.quick_grow (buffer_bytes); ++ if (native_encode_vector_part (expr, buffer.address (), buffer_bytes, 0, ++ buffer_bits / expr_elt_bits) ++ != (int) buffer_bytes) ++ return NULL_TREE; ++ ++ /* Reencode the bytes as TYPE. */ ++ unsigned int type_npatterns = type_sequence_bits / type_elt_bits; ++ return native_decode_vector_tree (type, buffer, 0, type_npatterns, ++ nelts_per_pattern); ++} + + /* Fold a VIEW_CONVERT_EXPR of a constant expression EXPR to type + TYPE at compile-time. If we're unable to perform the conversion +@@ -7730,6 +7848,10 @@ fold_view_convert_expr (tree type, tree expr) + if (CHAR_BIT != 8 || BITS_PER_UNIT != 8) + return NULL_TREE; + ++ if (VECTOR_TYPE_P (type) && TREE_CODE (expr) == VECTOR_CST) ++ if (tree res = fold_view_convert_vector_encoding (type, expr)) ++ return res; ++ + len = native_encode_expr (expr, buffer, sizeof (buffer)); + if (len == 0) + return NULL_TREE; +@@ -9030,7 +9152,7 @@ vec_cst_ctor_to_array (tree arg, unsigned int nelts, tree *elts) + selector. Return the folded VECTOR_CST or CONSTRUCTOR if successful, + NULL_TREE otherwise. */ + +-static tree ++tree + fold_vec_perm (tree type, tree arg0, tree arg1, const vec_perm_indices &sel) + { + unsigned int i; +@@ -9254,7 +9376,7 @@ tree_expr_nonzero_warnv_p (tree t, bool *strict_overflow_p) + tree fndecl = get_callee_fndecl (t); + if (!fndecl) return false; + if (flag_delete_null_pointer_checks && !flag_check_new +- && DECL_IS_OPERATOR_NEW (fndecl) ++ && DECL_IS_OPERATOR_NEW_P (fndecl) + && !TREE_NOTHROW (fndecl)) + return true; + if (flag_delete_null_pointer_checks +@@ -11778,7 +11900,10 @@ fold_ternary_loc (location_t loc, enum tree_code code, tree type, + return NULL_TREE; + + case VEC_PERM_EXPR: +- if (TREE_CODE (arg2) == VECTOR_CST) ++ /* Perform constant folding of BIT_INSERT_EXPR. */ ++ if (TREE_CODE (arg2) == VECTOR_CST ++ && TREE_CODE (op0) == VECTOR_CST ++ && TREE_CODE (op1) == VECTOR_CST) + { + /* Build a vector of integers from the tree mask. */ + vec_perm_builder builder; +@@ -11789,61 +11914,7 @@ fold_ternary_loc (location_t loc, enum tree_code code, tree type, + poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (type); + bool single_arg = (op0 == op1); + vec_perm_indices sel (builder, single_arg ? 1 : 2, nelts); +- +- /* Check for cases that fold to OP0 or OP1 in their original +- element order. */ +- if (sel.series_p (0, 1, 0, 1)) +- return op0; +- if (sel.series_p (0, 1, nelts, 1)) +- return op1; +- +- if (!single_arg) +- { +- if (sel.all_from_input_p (0)) +- op1 = op0; +- else if (sel.all_from_input_p (1)) +- { +- op0 = op1; +- sel.rotate_inputs (1); +- } +- } +- +- if ((TREE_CODE (op0) == VECTOR_CST +- || TREE_CODE (op0) == CONSTRUCTOR) +- && (TREE_CODE (op1) == VECTOR_CST +- || TREE_CODE (op1) == CONSTRUCTOR)) +- { +- tree t = fold_vec_perm (type, op0, op1, sel); +- if (t != NULL_TREE) +- return t; +- } +- +- bool changed = (op0 == op1 && !single_arg); +- +- /* Generate a canonical form of the selector. 
*/ +- if (arg2 == op2 && sel.encoding () != builder) +- { +- /* Some targets are deficient and fail to expand a single +- argument permutation while still allowing an equivalent +- 2-argument version. */ +- if (sel.ninputs () == 2 +- || can_vec_perm_const_p (TYPE_MODE (type), sel, false)) +- op2 = vec_perm_indices_to_tree (TREE_TYPE (arg2), sel); +- else +- { +- vec_perm_indices sel2 (builder, 2, nelts); +- if (can_vec_perm_const_p (TYPE_MODE (type), sel2, false)) +- op2 = vec_perm_indices_to_tree (TREE_TYPE (arg2), sel2); +- else +- /* Not directly supported with either encoding, +- so use the preferred form. */ +- op2 = vec_perm_indices_to_tree (TREE_TYPE (arg2), sel); +- } +- changed = true; +- } +- +- if (changed) +- return build3_loc (loc, VEC_PERM_EXPR, type, op0, op1, op2); ++ return fold_vec_perm (type, op0, op1, sel); + } + return NULL_TREE; + +diff --git a/gcc/fold-const.h b/gcc/fold-const.h +index e2e662463..1d94e2894 100644 +--- a/gcc/fold-const.h ++++ b/gcc/fold-const.h +@@ -100,6 +100,9 @@ extern tree fold_bit_and_mask (tree, tree, enum tree_code, + tree, enum tree_code, tree, tree, + tree, enum tree_code, tree, tree, tree *); + extern tree fold_read_from_constant_string (tree); ++#if GCC_VEC_PERN_INDICES_H ++extern tree fold_vec_perm (tree, tree, tree, const vec_perm_indices &); ++#endif + extern bool wide_int_binop (wide_int &res, enum tree_code, + const wide_int &arg1, const wide_int &arg2, + signop, wi::overflow_type *); +diff --git a/gcc/function-abi.cc b/gcc/function-abi.cc +new file mode 100644 +index 000000000..b4a183963 +--- /dev/null ++++ b/gcc/function-abi.cc +@@ -0,0 +1,260 @@ ++/* Information about fuunction binary interfaces. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ ++This file is part of GCC ++ ++GCC is free software; you can redistribute it and/or modify it under ++the terms of the GNU General Public License as published by the Free ++Software Foundation; either version 3, or (at your option) any later ++version. ++ ++GCC is distributed in the hope that it will be useful, but WITHOUT ANY ++WARRANTY; without even the implied warranty of MERCHANTABILITY or ++FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received a copy of the GNU General Public License ++along with GCC; see the file COPYING3. If not see ++. */ ++ ++#include "config.h" ++#include "system.h" ++#include "coretypes.h" ++#include "backend.h" ++#include "target.h" ++#include "rtl.h" ++#include "tree.h" ++#include "regs.h" ++#include "function-abi.h" ++#include "varasm.h" ++#include "cgraph.h" ++ ++target_function_abi_info default_target_function_abi_info; ++#if SWITCHABLE_TARGET ++target_function_abi_info *this_target_function_abi_info ++ = &default_target_function_abi_info; ++#endif ++ ++/* Initialize a predefined function ABI with the given values of ++ ID and FULL_REG_CLOBBERS. */ ++ ++void ++predefined_function_abi::initialize (unsigned int id, ++ const_hard_reg_set full_reg_clobbers) ++{ ++ m_id = id; ++ m_initialized = true; ++ m_full_reg_clobbers = full_reg_clobbers; ++ ++ /* Set up the value of m_full_and_partial_reg_clobbers. ++ ++ If the ABI specifies that part of a hard register R is call-clobbered, ++ we should be able to find a single-register mode M for which ++ targetm.hard_regno_call_part_clobbered (m_id, R, M) is true. ++ In other words, it shouldn't be the case that R can hold all ++ single-register modes across a call, but can't hold part of ++ a multi-register mode. 
++ ++ If that assumption doesn't hold for a future target, we would need ++ to change the interface of TARGET_HARD_REGNO_CALL_PART_CLOBBERED so ++ that it tells us which registers in a multi-register value are ++ actually clobbered. */ ++ m_full_and_partial_reg_clobbers = full_reg_clobbers; ++ for (unsigned int i = 0; i < NUM_MACHINE_MODES; ++i) ++ { ++ machine_mode mode = (machine_mode) i; ++ for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; ++regno) ++ if (targetm.hard_regno_mode_ok (regno, mode) ++ && hard_regno_nregs (regno, mode) == 1 ++ && targetm.hard_regno_call_part_clobbered (m_id, regno, mode)) ++ SET_HARD_REG_BIT (m_full_and_partial_reg_clobbers, regno); ++ } ++ ++ /* For each mode MODE, work out which registers are unable to hold ++ any part of a MODE value across a call, i.e. those for which no ++ overlapping call-preserved (reg:MODE REGNO) exists. ++ ++ We assume that this can be flipped around to say that a call ++ preserves (reg:MODE REGNO) unless the register overlaps this set. ++ The usual reason for this being true is that if (reg:MODE REGNO) ++ contains a part-clobbered register, that register would be ++ part-clobbered regardless of which part of MODE it holds. ++ For example, if (reg:M 2) occupies two registers and if the ++ register 3 portion of it is part-clobbered, (reg:M 3) is usually ++ either invalid or also part-clobbered. */ ++ for (unsigned int i = 0; i < NUM_MACHINE_MODES; ++i) ++ { ++ machine_mode mode = (machine_mode) i; ++ m_mode_clobbers[i] = m_full_and_partial_reg_clobbers; ++ for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; ++regno) ++ if (targetm.hard_regno_mode_ok (regno, mode) ++ && !overlaps_hard_reg_set_p (m_full_reg_clobbers, mode, regno) ++ && !targetm.hard_regno_call_part_clobbered (m_id, regno, mode)) ++ remove_from_hard_reg_set (&m_mode_clobbers[i], mode, regno); ++ } ++ ++ /* Check that the assumptions above actually hold, i.e. that testing ++ for single-register modes makes sense, and that overlap tests for ++ mode_clobbers work as expected. */ ++ if (flag_checking) ++ for (unsigned int i = 0; i < NUM_MACHINE_MODES; ++i) ++ { ++ machine_mode mode = (machine_mode) i; ++ const_hard_reg_set all_clobbers = m_full_and_partial_reg_clobbers; ++ for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; ++regno) ++ if (targetm.hard_regno_mode_ok (regno, mode) ++ && !overlaps_hard_reg_set_p (m_full_reg_clobbers, mode, regno) ++ && targetm.hard_regno_call_part_clobbered (m_id, regno, mode)) ++ gcc_assert (overlaps_hard_reg_set_p (all_clobbers, mode, regno) ++ && overlaps_hard_reg_set_p (m_mode_clobbers[i], ++ mode, regno)); ++ } ++} ++ ++/* If the ABI has been initialized, add REGNO to the set of registers ++ that can be completely altered by a call. */ ++ ++void ++predefined_function_abi::add_full_reg_clobber (unsigned int regno) ++{ ++ if (!m_initialized) ++ return; ++ ++ SET_HARD_REG_BIT (m_full_reg_clobbers, regno); ++ SET_HARD_REG_BIT (m_full_and_partial_reg_clobbers, regno); ++ for (unsigned int i = 0; i < NUM_MACHINE_MODES; ++i) ++ SET_HARD_REG_BIT (m_mode_clobbers[i], regno); ++} ++ ++/* Return the set of registers that the caller of the recorded functions must ++ save in order to honor the requirements of CALLER_ABI. 
*/ ++ ++HARD_REG_SET ++function_abi_aggregator:: ++caller_save_regs (const function_abi &caller_abi) const ++{ ++ HARD_REG_SET result; ++ CLEAR_HARD_REG_SET (result); ++ for (unsigned int abi_id = 0; abi_id < NUM_ABI_IDS; ++abi_id) ++ { ++ const predefined_function_abi &callee_abi = function_abis[abi_id]; ++ ++ /* Skip cases that clearly aren't problematic. */ ++ if (abi_id == caller_abi.id () ++ || hard_reg_set_empty_p (m_abi_clobbers[abi_id])) ++ continue; ++ ++ /* Collect the set of registers that can be "more clobbered" by ++ CALLEE_ABI than by CALLER_ABI. */ ++ HARD_REG_SET extra_clobbers; ++ CLEAR_HARD_REG_SET (extra_clobbers); ++ for (unsigned int i = 0; i < NUM_MACHINE_MODES; ++i) ++ { ++ machine_mode mode = (machine_mode) i; ++ extra_clobbers |= (callee_abi.mode_clobbers (mode) ++ & ~caller_abi.mode_clobbers (mode)); ++ } ++ ++ /* Restrict it to the set of registers that we actually saw ++ clobbers for (e.g. taking -fipa-ra into account). */ ++ result |= (extra_clobbers & m_abi_clobbers[abi_id]); ++ } ++ return result; ++} ++ ++/* Return the set of registers that cannot be used to hold a value of ++ mode MODE across the calls in a region described by ABIS and MASK, where: ++ ++ * Bit ID of ABIS is set if the region contains a call with ++ function_abi identifier ID. ++ ++ * MASK contains all the registers that are fully or partially ++ clobbered by calls in the region. ++ ++ This is not quite as accurate as testing each individual call, ++ but it's a close and conservatively-correct approximation. ++ It's much better for some targets than just using MASK. */ ++ ++HARD_REG_SET ++call_clobbers_in_region (unsigned int abis, const_hard_reg_set mask, ++ machine_mode mode) ++{ ++ HARD_REG_SET result; ++ CLEAR_HARD_REG_SET (result); ++ for (unsigned int id = 0; abis; abis >>= 1, ++id) ++ if (abis & 1) ++ result |= function_abis[id].mode_clobbers (mode); ++ return result & mask; ++} ++ ++/* Return the predefined ABI used by functions with type TYPE. */ ++ ++const predefined_function_abi & ++fntype_abi (const_tree type) ++{ ++ gcc_assert (FUNC_OR_METHOD_TYPE_P (type)); ++ if (targetm.calls.fntype_abi) ++ return targetm.calls.fntype_abi (type); ++ return default_function_abi; ++} ++ ++/* Return the ABI of function decl FNDECL. */ ++ ++function_abi ++fndecl_abi (const_tree fndecl) ++{ ++ gcc_assert (TREE_CODE (fndecl) == FUNCTION_DECL); ++ const predefined_function_abi &base_abi = fntype_abi (TREE_TYPE (fndecl)); ++ ++ if (flag_ipa_ra && decl_binds_to_current_def_p (fndecl)) ++ if (cgraph_rtl_info *info = cgraph_node::rtl_info (fndecl)) ++ return function_abi (base_abi, info->function_used_regs); ++ ++ return base_abi; ++} ++ ++/* Return the ABI of the function called by INSN. */ ++ ++function_abi ++insn_callee_abi (const rtx_insn *insn) ++{ ++ gcc_assert (insn && CALL_P (insn)); ++ ++ if (flag_ipa_ra) ++ if (tree fndecl = get_call_fndecl (insn)) ++ return fndecl_abi (fndecl); ++ ++ if (targetm.calls.insn_callee_abi) ++ return targetm.calls.insn_callee_abi (insn); ++ ++ return default_function_abi; ++} ++ ++/* Return the ABI of the function called by CALL_EXPR EXP. Return the ++ default ABI for erroneous calls. 
*/ ++ ++function_abi ++expr_callee_abi (const_tree exp) ++{ ++ gcc_assert (TREE_CODE (exp) == CALL_EXPR); ++ ++ if (tree fndecl = get_callee_fndecl (exp)) ++ return fndecl_abi (fndecl); ++ ++ tree callee = CALL_EXPR_FN (exp); ++ if (callee == error_mark_node) ++ return default_function_abi; ++ ++ tree type = TREE_TYPE (callee); ++ if (type == error_mark_node) ++ return default_function_abi; ++ ++ if (POINTER_TYPE_P (type)) ++ { ++ type = TREE_TYPE (type); ++ if (type == error_mark_node) ++ return default_function_abi; ++ } ++ ++ return fntype_abi (type); ++} +diff --git a/gcc/function-abi.h b/gcc/function-abi.h +new file mode 100644 +index 000000000..96a49dfbe +--- /dev/null ++++ b/gcc/function-abi.h +@@ -0,0 +1,320 @@ ++/* Information about fuunction binary interfaces. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ ++This file is part of GCC ++ ++GCC is free software; you can redistribute it and/or modify it under ++the terms of the GNU General Public License as published by the Free ++Software Foundation; either version 3, or (at your option) any later ++version. ++ ++GCC is distributed in the hope that it will be useful, but WITHOUT ANY ++WARRANTY; without even the implied warranty of MERCHANTABILITY or ++FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received a copy of the GNU General Public License ++along with GCC; see the file COPYING3. If not see ++. */ ++ ++#ifndef GCC_FUNCTION_ABI_H ++#define GCC_FUNCTION_ABI_H ++ ++/* Most targets use the same ABI for all functions in a translation ++ unit, but some targets support interoperability between several ABIs. ++ Each such ABI has a unique 0-based identifier, with 0 always being ++ the default choice of ABI. ++ ++ NUM_ABI_IDS is the maximum number of such ABIs that GCC can handle at once. ++ A bitfield with this number of bits can represent any combinaion of the ++ supported ABIs. */ ++const size_t NUM_ABI_IDS = 8; ++ ++/* Information about one of the target's predefined ABIs. */ ++class predefined_function_abi ++{ ++public: ++ /* A target-specific identifier for this ABI. The value must be in ++ the range [0, NUM_ABI_IDS - 1]. */ ++ unsigned int id () const { return m_id; } ++ ++ /* True if this ABI has been initialized. */ ++ bool initialized_p () const { return m_initialized; } ++ ++ /* Return true if a function call is allowed to alter every bit of ++ register REGNO, so that the register contains an arbitrary value ++ on return. If so, the register cannot hold any part of a value ++ that is live across a call. */ ++ bool ++ clobbers_full_reg_p (unsigned int regno) const ++ { ++ return TEST_HARD_REG_BIT (m_full_reg_clobbers, regno); ++ } ++ ++ /* Return true if a function call is allowed to alter some or all bits ++ of register REGNO. ++ ++ This is true whenever clobbers_full_reg_p (REGNO) is true. It is ++ also true if, for example, the ABI says that a call must preserve the ++ low 32 or 64 bits of REGNO, but can clobber the upper bits of REGNO. ++ In the latter case, it is possible for REGNO to hold values that ++ are live across a call, provided that the value occupies only the ++ call-preserved part of the register. */ ++ bool ++ clobbers_at_least_part_of_reg_p (unsigned int regno) const ++ { ++ return TEST_HARD_REG_BIT (m_full_and_partial_reg_clobbers, regno); ++ } ++ ++ /* Return true if a function call is allowed to clobber at least part ++ of (reg:MODE REGNO). If so, it is not possible for the register ++ as a whole to be live across a call. 
*/ ++ bool ++ clobbers_reg_p (machine_mode mode, unsigned int regno) const ++ { ++ return overlaps_hard_reg_set_p (m_mode_clobbers[mode], mode, regno); ++ } ++ ++ /* Return the set of registers that a function call is allowed to ++ alter completely, so that the registers contain arbitrary values ++ on return. This doesn't include registers that a call can only ++ partly clobber (as per TARGET_HARD_REGNO_CALL_PART_CLOBBERED). ++ ++ These registers cannot hold any part of a value that is live across ++ a call. */ ++ HARD_REG_SET full_reg_clobbers () const { return m_full_reg_clobbers; } ++ ++ /* Return the set of registers that a function call is allowed to alter ++ to some degree. For example, if an ABI says that a call must preserve ++ the low 32 or 64 bits of a register R, but can clobber the upper bits ++ of R, R would be in this set but not in full_reg_clobbers (). ++ ++ This set is a superset of full_reg_clobbers (). It is possible for a ++ register in full_and_partial_reg_clobbers () & ~full_reg_clobbers () ++ to contain values that are live across a call, provided that the live ++ value only occupies the call-preserved part of the register. */ ++ HARD_REG_SET ++ full_and_partial_reg_clobbers () const ++ { ++ return m_full_and_partial_reg_clobbers; ++ } ++ ++ /* Return the set of registers that cannot be used to hold a value of ++ mode MODE across a function call. That is: ++ ++ (reg:REGNO MODE) ++ ++ might be clobbered by a call whenever: ++ ++ overlaps_hard_reg_set (mode_clobbers (MODE), MODE, REGNO) ++ ++ In allocation terms, the registers in the returned set conflict ++ with any value of mode MODE that is live across a call. */ ++ HARD_REG_SET ++ mode_clobbers (machine_mode mode) const ++ { ++ return m_mode_clobbers[mode]; ++ } ++ ++ void initialize (unsigned int, const_hard_reg_set); ++ void add_full_reg_clobber (unsigned int); ++ ++private: ++ unsigned int m_id : NUM_ABI_IDS; ++ unsigned int m_initialized : 1; ++ HARD_REG_SET m_full_reg_clobbers; ++ HARD_REG_SET m_full_and_partial_reg_clobbers; ++ HARD_REG_SET m_mode_clobbers[NUM_MACHINE_MODES]; ++}; ++ ++/* Describes either a predefined ABI or the ABI of a particular function. ++ In the latter case, the ABI might make use of extra function-specific ++ information, such as for -fipa-ra. */ ++class function_abi ++{ ++public: ++ /* Initialize the structure for a general function with the given ABI. */ ++ function_abi (const predefined_function_abi &base_abi) ++ : m_base_abi (&base_abi), ++ m_mask (base_abi.full_and_partial_reg_clobbers ()) {} ++ ++ /* Initialize the structure for a function that has the given ABI and ++ that is known not to clobber registers outside MASK. */ ++ function_abi (const predefined_function_abi &base_abi, ++ const_hard_reg_set mask) ++ : m_base_abi (&base_abi), m_mask (mask) {} ++ ++ /* The predefined ABI from which this ABI is derived. */ ++ const predefined_function_abi &base_abi () const { return *m_base_abi; } ++ ++ /* The target-specific identifier of the predefined ABI. */ ++ unsigned int id () const { return m_base_abi->id (); } ++ ++ /* See the corresponding predefined_function_abi functions for ++ details about the following functions. 
*/ ++ ++ HARD_REG_SET ++ full_reg_clobbers () const ++ { ++ return m_mask & m_base_abi->full_reg_clobbers (); ++ } ++ ++ HARD_REG_SET ++ full_and_partial_reg_clobbers () const ++ { ++ return m_mask & m_base_abi->full_and_partial_reg_clobbers (); ++ } ++ ++ HARD_REG_SET ++ mode_clobbers (machine_mode mode) const ++ { ++ return m_mask & m_base_abi->mode_clobbers (mode); ++ } ++ ++ bool ++ clobbers_full_reg_p (unsigned int regno) const ++ { ++ return (TEST_HARD_REG_BIT (m_mask, regno) ++ & m_base_abi->clobbers_full_reg_p (regno)); ++ } ++ ++ bool ++ clobbers_at_least_part_of_reg_p (unsigned int regno) const ++ { ++ return (TEST_HARD_REG_BIT (m_mask, regno) ++ & m_base_abi->clobbers_at_least_part_of_reg_p (regno)); ++ } ++ ++ bool ++ clobbers_reg_p (machine_mode mode, unsigned int regno) const ++ { ++ return overlaps_hard_reg_set_p (mode_clobbers (mode), mode, regno); ++ } ++ ++ bool ++ operator== (const function_abi &other) const ++ { ++ return m_base_abi == other.m_base_abi && m_mask == other.m_mask; ++ } ++ ++ bool ++ operator!= (const function_abi &other) const ++ { ++ return !operator== (other); ++ } ++ ++protected: ++ const predefined_function_abi *m_base_abi; ++ HARD_REG_SET m_mask; ++}; ++ ++/* This class collects information about the ABIs of functions that are ++ called in a particular region of code. It is mostly intended to be ++ used as a local variable during an IR walk. */ ++class function_abi_aggregator ++{ ++public: ++ function_abi_aggregator () : m_abi_clobbers () {} ++ ++ /* Record that the code region calls a function with the given ABI. */ ++ void ++ note_callee_abi (const function_abi &abi) ++ { ++ m_abi_clobbers[abi.id ()] |= abi.full_and_partial_reg_clobbers (); ++ } ++ ++ HARD_REG_SET caller_save_regs (const function_abi &) const; ++ ++private: ++ HARD_REG_SET m_abi_clobbers[NUM_ABI_IDS]; ++}; ++ ++struct target_function_abi_info ++{ ++ /* An array of all the target ABIs that are available in this ++ translation unit. Not all entries are used for all targets, ++ but the structures are relatively small, and using a fixed-size ++ array avoids extra indirection. ++ ++ There are various ways of getting an ABI descriptor: ++ ++ * fndecl_abi (FNDECL) is the ABI of function FNDECL. ++ ++ * fntype_abi (FNTYPE) is the ABI of a function with type FNTYPE. ++ ++ * crtl->abi is the ABI of the function that we are currently ++ compiling to rtl. ++ ++ * insn_callee_abi (INSN) is the ABI used by the target of call insn INSN. ++ ++ * eh_edge_abi is the "ABI" used when taking an EH edge from an ++ exception-throwing statement to an exception handler. Catching ++ exceptions from calls can be treated as an abnormal return from ++ those calls, and this ABI therefore describes the ABI of functions ++ on such an abnormal return. Statements that throw non-call ++ exceptions can be treated as being implicitly wrapped in a call ++ that has such an abnormal return. ++ ++ At present, no target needs to support more than one EH ABI. ++ ++ * function_abis[N] is the ABI with identifier N. This can be useful ++ when referring back to ABIs that have been collected by number in ++ a bitmask, such as after walking function calls in a particular ++ region of code. ++ ++ * default_function_abi refers specifically to the target's default ++ choice of ABI, regardless of which (if any) functions actually ++ use it. This ABI and data derived from it do *not* provide ++ globally conservatively-correct information, so it is only ++ useful in very specific circumstances. 
*/ ++ predefined_function_abi x_function_abis[NUM_ABI_IDS]; ++}; ++ ++extern target_function_abi_info default_target_function_abi_info; ++#if SWITCHABLE_TARGET ++extern target_function_abi_info *this_target_function_abi_info; ++#else ++#define this_target_function_abi_info (&default_target_function_abi_info) ++#endif ++ ++/* See the comment above x_function_abis for when these macros should be used. ++ At present, eh_edge_abi is always the default ABI, but that could change ++ in future if a target needs it to. */ ++#define function_abis \ ++ (this_target_function_abi_info->x_function_abis) ++#define default_function_abi \ ++ (this_target_function_abi_info->x_function_abis[0]) ++#define eh_edge_abi default_function_abi ++ ++extern HARD_REG_SET call_clobbers_in_region (unsigned int, const_hard_reg_set, ++ machine_mode mode); ++ ++/* Return true if (reg:MODE REGNO) might be clobbered by one of the ++ calls in a region described by ABIS and MASK, where: ++ ++ * Bit ID of ABIS is set if the region contains a call with ++ function_abi identifier ID. ++ ++ * MASK contains all the registers that are fully or partially ++ clobbered by calls in the region. ++ ++ This is not quite as accurate as testing each individual call, ++ but it's a close and conservatively-correct approximation. ++ It's much better for some targets than: ++ ++ overlaps_hard_reg_set_p (MASK, MODE, REGNO). */ ++ ++inline bool ++call_clobbered_in_region_p (unsigned int abis, const_hard_reg_set mask, ++ machine_mode mode, unsigned int regno) ++{ ++ HARD_REG_SET clobbers = call_clobbers_in_region (abis, mask, mode); ++ return overlaps_hard_reg_set_p (clobbers, mode, regno); ++} ++ ++extern const predefined_function_abi &fntype_abi (const_tree); ++extern function_abi fndecl_abi (const_tree); ++extern function_abi insn_callee_abi (const rtx_insn *); ++extern function_abi expr_callee_abi (const_tree); ++ ++#endif +diff --git a/gcc/function.c b/gcc/function.c +index acf9f9e60..6d5574244 100644 +--- a/gcc/function.c ++++ b/gcc/function.c +@@ -79,6 +79,7 @@ along with GCC; see the file COPYING3. If not see + #include "attribs.h" + #include "gimple.h" + #include "options.h" ++#include "function-abi.h" + + /* So we can assign to cfun in this file. */ + #undef cfun +@@ -2121,7 +2122,7 @@ aggregate_value_p (const_tree exp, const_tree fntype) + regno = REGNO (reg); + nregs = hard_regno_nregs (regno, TYPE_MODE (type)); + for (i = 0; i < nregs; i++) +- if (! call_used_regs[regno + i]) ++ if (! call_used_or_fixed_reg_p (regno + i)) + return 1; + + return 0; +@@ -2454,13 +2455,15 @@ assign_parm_find_data_types (struct assign_parm_data_all *all, tree parm, + passed_type = TREE_TYPE (first_field (passed_type)); + + /* See if this arg was passed by invisible reference. */ +- if (pass_by_reference (&all->args_so_far_v, passed_mode, +- passed_type, data->named_arg)) +- { +- passed_type = nominal_type = build_pointer_type (passed_type); +- data->passed_pointer = true; +- passed_mode = nominal_mode = TYPE_MODE (nominal_type); +- } ++ { ++ function_arg_info arg (passed_type, passed_mode, data->named_arg); ++ if (apply_pass_by_reference_rules (&all->args_so_far_v, arg)) ++ { ++ passed_type = nominal_type = arg.type; ++ data->passed_pointer = true; ++ passed_mode = nominal_mode = arg.mode; ++ } ++ } + + /* Find mode as it is passed by the ABI. 
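For orientation, the sketch below (invented for this note, not taken from the patch) shows how a pass would consult the interface declared in the new function-abi.h instead of the old call_used_regs / get_call_reg_set_usage combination; the helper name is hypothetical, but each call matches a declaration in the header above.

static bool
value_survives_call_p (rtx_insn *call, machine_mode mode, unsigned int regno)
{
  /* Ask the callee's ABI directly; insn_callee_abi folds in -fipa-ra
     information via fndecl_abi when the callee binds locally.  */
  gcc_assert (CALL_P (call));
  function_abi abi = insn_callee_abi (call);

  /* (reg:MODE REGNO) can stay live across CALL only if no part of it
     is clobbered under that ABI.  */
  return !abi.clobbers_reg_p (mode, regno);
}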
*/ + unsignedp = TYPE_UNSIGNED (passed_type); +@@ -2483,9 +2486,9 @@ assign_parms_setup_varargs (struct assign_parm_data_all *all, + { + int varargs_pretend_bytes = 0; + +- targetm.calls.setup_incoming_varargs (all->args_so_far, +- data->promoted_mode, +- data->passed_type, ++ function_arg_info last_named_arg (data->passed_type, data->promoted_mode, ++ /*named=*/true); ++ targetm.calls.setup_incoming_varargs (all->args_so_far, last_named_arg, + &varargs_pretend_bytes, no_rtl); + + /* If the back-end has requested extra stack space, record how much is +@@ -2515,11 +2518,9 @@ assign_parm_find_entry_rtl (struct assign_parm_data_all *all, + targetm.calls.warn_parameter_passing_abi (all->args_so_far, + data->passed_type); + +- entry_parm = targetm.calls.function_incoming_arg (all->args_so_far, +- data->promoted_mode, +- data->passed_type, +- data->named_arg); +- ++ function_arg_info arg (data->passed_type, data->promoted_mode, ++ data->named_arg); ++ entry_parm = targetm.calls.function_incoming_arg (all->args_so_far, arg); + if (entry_parm == 0) + data->promoted_mode = data->passed_mode; + +@@ -2542,27 +2543,26 @@ assign_parm_find_entry_rtl (struct assign_parm_data_all *all, + if (targetm.calls.pretend_outgoing_varargs_named (all->args_so_far)) + { + rtx tem; ++ function_arg_info named_arg (data->passed_type, data->promoted_mode, ++ /*named=*/true); + tem = targetm.calls.function_incoming_arg (all->args_so_far, +- data->promoted_mode, +- data->passed_type, true); ++ named_arg); + in_regs = tem != NULL; + } + } + + /* If this parameter was passed both in registers and in the stack, use + the copy on the stack. */ +- if (targetm.calls.must_pass_in_stack (data->promoted_mode, +- data->passed_type)) ++ if (targetm.calls.must_pass_in_stack (arg)) + entry_parm = 0; + + if (entry_parm) + { + int partial; + +- partial = targetm.calls.arg_partial_bytes (all->args_so_far, +- data->promoted_mode, +- data->passed_type, +- data->named_arg); ++ function_arg_info arg (data->passed_type, data->promoted_mode, ++ data->named_arg); ++ partial = targetm.calls.arg_partial_bytes (all->args_so_far, arg); + data->partial = partial; + + /* The caller might already have allocated stack space for the +@@ -3226,8 +3226,7 @@ assign_parm_setup_reg (struct assign_parm_data_all *all, tree parm, + for (insn = insns; insn && moved; insn = NEXT_INSN (insn)) + { + if (INSN_P (insn)) +- note_stores (PATTERN (insn), record_hard_reg_sets, +- &hardregs); ++ note_stores (insn, record_hard_reg_sets, &hardregs); + if (!hard_reg_set_empty_p (hardregs)) + moved = false; + } +@@ -3647,8 +3646,9 @@ assign_parms (tree fndecl) + assign_parms_setup_varargs (&all, &data, false); + + /* Update info on where next arg arrives in registers. */ +- targetm.calls.function_arg_advance (all.args_so_far, data.promoted_mode, +- data.passed_type, data.named_arg); ++ function_arg_info arg (data.passed_type, data.promoted_mode, ++ data.named_arg); ++ targetm.calls.function_arg_advance (all.args_so_far, arg); + } + + if (targetm.calls.split_complex_arg) +@@ -3835,8 +3835,9 @@ gimplify_parameters (gimple_seq *cleanup) + continue; + + /* Update info on where next arg arrives in registers. */ +- targetm.calls.function_arg_advance (all.args_so_far, data.promoted_mode, +- data.passed_type, data.named_arg); ++ function_arg_info arg (data.passed_type, data.promoted_mode, ++ data.named_arg); ++ targetm.calls.function_arg_advance (all.args_so_far, arg); + + /* ??? 
Once upon a time variable_size stuffed parameter list + SAVE_EXPRs (amongst others) onto a pending sizes list. This +@@ -3854,8 +3855,8 @@ gimplify_parameters (gimple_seq *cleanup) + if (data.passed_pointer) + { + tree type = TREE_TYPE (data.passed_type); +- if (reference_callee_copied (&all.args_so_far_v, TYPE_MODE (type), +- type, data.named_arg)) ++ function_arg_info orig_arg (type, data.named_arg); ++ if (reference_callee_copied (&all.args_so_far_v, orig_arg)) + { + tree local, t; + +@@ -4823,6 +4824,12 @@ static void + prepare_function_start (void) + { + gcc_assert (!get_last_insn ()); ++ ++ if (in_dummy_function) ++ crtl->abi = &default_function_abi; ++ else ++ crtl->abi = &fndecl_abi (cfun->decl).base_abi (); ++ + init_temp_slots (); + init_emit (); + init_varasm_status (); +diff --git a/gcc/fwprop.c b/gcc/fwprop.c +index f2966fada..e6f375271 100644 +--- a/gcc/fwprop.c ++++ b/gcc/fwprop.c +@@ -740,7 +740,7 @@ propagate_rtx (rtx x, machine_mode mode, rtx old_rtx, rtx new_rtx, + || CONSTANT_P (new_rtx) + || (GET_CODE (new_rtx) == SUBREG + && REG_P (SUBREG_REG (new_rtx)) +- && !paradoxical_subreg_p (mode, GET_MODE (SUBREG_REG (new_rtx))))) ++ && !paradoxical_subreg_p (new_rtx))) + flags |= PR_CAN_APPEAR; + if (!varying_mem_p (new_rtx)) + flags |= PR_HANDLE_MEM; +diff --git a/gcc/gcc.c b/gcc/gcc.c +index 4f57765b0..1a5ad7db3 100644 +--- a/gcc/gcc.c ++++ b/gcc/gcc.c +@@ -4041,6 +4041,10 @@ driver_handle_option (struct gcc_options *opts, + diagnostic_color_init (dc, value); + break; + ++ case OPT_fdiagnostics_urls_: ++ diagnostic_urls_init (dc, value); ++ break; ++ + case OPT_fdiagnostics_format_: + diagnostic_output_format_init (dc, + (enum diagnostics_output_format)value); +@@ -7438,6 +7442,7 @@ driver::global_initializations () + + diagnostic_initialize (global_dc, 0); + diagnostic_color_init (global_dc); ++ diagnostic_urls_init (global_dc); + + #ifdef GCC_DRIVER_HOST_INITIALIZATION + /* Perform host dependent initialization when needed. */ +diff --git a/gcc/gcse-common.c b/gcc/gcse-common.c +index e6e4b642b..55148623f 100644 +--- a/gcc/gcse-common.c ++++ b/gcc/gcse-common.c +@@ -89,7 +89,7 @@ record_last_mem_set_info_common (rtx_insn *insn, + struct gcse_note_stores_info data; + data.insn = insn; + data.canon_mem_list = canon_modify_mem_list; +- note_stores (PATTERN (insn), canon_list_insert, (void*) &data); ++ note_stores (insn, canon_list_insert, (void*) &data); + } + } + +diff --git a/gcc/gcse.c b/gcc/gcse.c +index 7fbdd6750..373ba7a16 100644 +--- a/gcc/gcse.c ++++ b/gcc/gcse.c +@@ -1049,7 +1049,7 @@ load_killed_in_block_p (const_basic_block bb, int uid_limit, const_rtx x, + note_stores to examine each hunk of memory that is modified. */ + mci.mem = x; + mci.conflict = false; +- note_stores (PATTERN (setter), mems_conflict_for_gcse_p, &mci); ++ note_stores (setter, mems_conflict_for_gcse_p, &mci); + if (mci.conflict) + return 1; + } +@@ -1537,7 +1537,7 @@ compute_hash_table_work (struct gcse_hash_table_d *table) + record_last_mem_set_info (insn); + } + +- note_stores (PATTERN (insn), record_last_set_info, insn); ++ note_stores (insn, record_last_set_info, insn); + } + + /* The next pass builds the hash table. */ +@@ -2415,7 +2415,7 @@ single_set_gcse (rtx_insn *insn) + + s.insn = insn; + s.nsets = 0; +- note_stores (pattern, record_set_data, &s); ++ note_pattern_stores (pattern, record_set_data, &s); + + /* Considered invariant insns have exactly one set. 
*/ + gcc_assert (s.nsets == 1); +diff --git a/gcc/genconfig.c b/gcc/genconfig.c +index 194fe950d..6f914b1e4 100644 +--- a/gcc/genconfig.c ++++ b/gcc/genconfig.c +@@ -72,7 +72,6 @@ walk_insn_part (rtx part, int recog_p, int non_pc_set_src) + switch (code) + { + case CLOBBER: +- case CLOBBER_HIGH: + clobbers_seen_this_insn++; + break; + +diff --git a/gcc/genemit.c b/gcc/genemit.c +index 83f86a35c..e03af01f2 100644 +--- a/gcc/genemit.c ++++ b/gcc/genemit.c +@@ -169,15 +169,6 @@ gen_exp (rtx x, enum rtx_code subroutine_type, char *used, md_rtx_info *info) + return; + } + break; +- case CLOBBER_HIGH: +- if (!REG_P (XEXP (x, 0))) +- error ("CLOBBER_HIGH argument is not a register expr, at %s:%d", +- info->loc.filename, info->loc.lineno); +- printf ("gen_hard_reg_clobber_high (%smode, %i)", +- GET_MODE_NAME (GET_MODE (XEXP (x, 0))), +- REGNO (XEXP (x, 0))); +- return; +- break; + case CC0: + printf ("cc0_rtx"); + return; +@@ -343,8 +334,7 @@ gen_insn (md_rtx_info *info) + + for (i = XVECLEN (insn, 1) - 1; i > 0; i--) + { +- if (GET_CODE (XVECEXP (insn, 1, i)) != CLOBBER +- && GET_CODE (XVECEXP (insn, 1, i)) != CLOBBER_HIGH) ++ if (GET_CODE (XVECEXP (insn, 1, i)) != CLOBBER) + break; + + if (REG_P (XEXP (XVECEXP (insn, 1, i), 0))) +@@ -811,42 +801,45 @@ handle_overloaded_code_for (overloaded_name *oname) + static void + handle_overloaded_gen (overloaded_name *oname) + { ++ unsigned HOST_WIDE_INT seen = 0; + /* All patterns must have the same number of operands. */ +- pattern_stats stats; +- get_pattern_stats (&stats, XVEC (oname->first_instance->insn, 1)); + for (overloaded_instance *instance = oname->first_instance->next; + instance; instance = instance->next) + { +- pattern_stats stats2; +- get_pattern_stats (&stats2, XVEC (instance->insn, 1)); +- if (stats.num_generator_args != stats2.num_generator_args) +- fatal_at (get_file_location (instance->insn), +- "inconsistent number of operands for '%s'; " +- "this instance has %d, but previous instances had %d", +- oname->name, stats2.num_generator_args, +- stats.num_generator_args); ++ pattern_stats stats; ++ get_pattern_stats (&stats, XVEC (instance->insn, 1)); ++ unsigned HOST_WIDE_INT mask ++ = HOST_WIDE_INT_1U << stats.num_generator_args; ++ if (seen & mask) ++ continue; ++ ++ seen |= mask; ++ ++ /* Print the function prototype. */ ++ printf ("\nrtx\nmaybe_gen_%s (", oname->name); ++ print_overload_arguments (oname); ++ for (int i = 0; i < stats.num_generator_args; ++i) ++ printf (", rtx x%d", i); ++ printf (")\n{\n"); ++ ++ /* Use maybe_code_for_*, instead of duplicating the selection ++ logic here. */ ++ printf (" insn_code code = maybe_code_for_%s (", oname->name); ++ for (unsigned int i = 0; i < oname->arg_types.length (); ++i) ++ printf ("%sarg%d", i == 0 ? "" : ", ", i); ++ printf (");\n" ++ " if (code != CODE_FOR_nothing)\n" ++ " {\n" ++ " gcc_assert (insn_data[code].n_generator_args == %d);\n" ++ " return GEN_FCN (code) (", stats.num_generator_args); ++ for (int i = 0; i < stats.num_generator_args; ++i) ++ printf ("%sx%d", i == 0 ? "" : ", ", i); ++ printf (");\n" ++ " }\n" ++ " else\n" ++ " return NULL_RTX;\n" ++ "}\n"); + } +- +- /* Print the function prototype. */ +- printf ("\nrtx\nmaybe_gen_%s (", oname->name); +- print_overload_arguments (oname); +- for (int i = 0; i < stats.num_generator_args; ++i) +- printf (", rtx x%d", i); +- printf (")\n{\n"); +- +- /* Use maybe_code_for_*, instead of duplicating the selection logic here. 
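To make the genemit.c change above concrete: for a hypothetical overloaded pattern `foo' whose instances take one mode argument and three operands, the printf calls in the new loop emit a wrapper of roughly the following shape (reconstructed from the format strings; the name and operand count are illustrative only).

rtx
maybe_gen_foo (machine_mode arg0, rtx x0, rtx x1, rtx x2)
{
  insn_code code = maybe_code_for_foo (arg0);
  if (code != CODE_FOR_nothing)
    {
      /* New with this patch: check that the selected instance really
         takes the operand count this overload was generated for.  */
      gcc_assert (insn_data[code].n_generator_args == 3);
      return GEN_FCN (code) (x0, x1, x2);
    }
  else
    return NULL_RTX;
}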
*/ +- printf (" insn_code code = maybe_code_for_%s (", oname->name); +- for (unsigned int i = 0; i < oname->arg_types.length (); ++i) +- printf ("%sarg%d", i == 0 ? "" : ", ", i); +- printf (");\n" +- " if (code != CODE_FOR_nothing)\n" +- " return GEN_FCN (code) ("); +- for (int i = 0; i < stats.num_generator_args; ++i) +- printf ("%sx%d", i == 0 ? "" : ", ", i); +- printf (");\n" +- " else\n" +- " return NULL_RTX;\n" +- "}\n"); + } + + int +diff --git a/gcc/generic-match-head.c b/gcc/generic-match-head.c +index 3478cf59f..e9ef343c9 100644 +--- a/gcc/generic-match-head.c ++++ b/gcc/generic-match-head.c +@@ -27,6 +27,7 @@ along with GCC; see the file COPYING3. If not see + #include "gimple.h" + #include "ssa.h" + #include "cgraph.h" ++#include "vec-perm-indices.h" + #include "fold-const.h" + #include "fold-const-call.h" + #include "stor-layout.h" +diff --git a/gcc/genmodes.c b/gcc/genmodes.c +index f33eefa24..95522d6b5 100644 +--- a/gcc/genmodes.c ++++ b/gcc/genmodes.c +@@ -53,6 +53,7 @@ struct mode_data + + const char *name; /* printable mode name -- SI, not SImode */ + enum mode_class cl; /* this mode class */ ++ unsigned int order; /* top-level sorting order */ + unsigned int precision; /* size in bits, equiv to TYPE_PRECISION */ + unsigned int bytesize; /* storage size in addressable units */ + unsigned int ncomponents; /* number of subunits */ +@@ -85,7 +86,7 @@ static struct mode_data *void_mode; + + static const struct mode_data blank_mode = { + 0, "", MAX_MODE_CLASS, +- -1U, -1U, -1U, -1U, ++ 0, -1U, -1U, -1U, -1U, + 0, 0, 0, 0, 0, 0, + "", 0, 0, 0, 0, false, false, 0 + }; +@@ -484,14 +485,15 @@ make_complex_modes (enum mode_class cl, + } + } + +-/* For all modes in class CL, construct vector modes of width +- WIDTH, having as many components as necessary. */ +-#define VECTOR_MODES_WITH_PREFIX(PREFIX, C, W) \ +- make_vector_modes (MODE_##C, #PREFIX, W, __FILE__, __LINE__) +-#define VECTOR_MODES(C, W) VECTOR_MODES_WITH_PREFIX (V, C, W) ++/* For all modes in class CL, construct vector modes of width WIDTH, ++ having as many components as necessary. ORDER is the sorting order ++ of the mode, with smaller numbers indicating a higher priority. 
*/ ++#define VECTOR_MODES_WITH_PREFIX(PREFIX, C, W, ORDER) \ ++ make_vector_modes (MODE_##C, #PREFIX, W, ORDER, __FILE__, __LINE__) ++#define VECTOR_MODES(C, W) VECTOR_MODES_WITH_PREFIX (V, C, W, 0) + static void ATTRIBUTE_UNUSED + make_vector_modes (enum mode_class cl, const char *prefix, unsigned int width, +- const char *file, unsigned int line) ++ unsigned int order, const char *file, unsigned int line) + { + struct mode_data *m; + struct mode_data *v; +@@ -530,6 +532,7 @@ make_vector_modes (enum mode_class cl, const char *prefix, unsigned int width, + } + + v = new_mode (vclass, xstrdup (buf), file, line); ++ v->order = order; + v->component = m; + v->ncomponents = ncomponents; + } +@@ -832,6 +835,11 @@ cmp_modes (const void *a, const void *b) + const struct mode_data *const m = *(const struct mode_data *const*)a; + const struct mode_data *const n = *(const struct mode_data *const*)b; + ++ if (m->order > n->order) ++ return 1; ++ else if (m->order < n->order) ++ return -1; ++ + if (m->bytesize > n->bytesize) + return 1; + else if (m->bytesize < n->bytesize) +diff --git a/gcc/genopinit.c b/gcc/genopinit.c +index ea4c3ce01..1dd1d82d0 100644 +--- a/gcc/genopinit.c ++++ b/gcc/genopinit.c +@@ -134,31 +134,43 @@ handle_overloaded_code_for (FILE *file, overloaded_name *oname) + static void + handle_overloaded_gen (FILE *file, overloaded_name *oname) + { +- pattern_stats stats; +- get_pattern_stats (&stats, XVEC (oname->first_instance->insn, 1)); +- +- fprintf (file, "\nextern rtx maybe_gen_%s (", oname->name); +- for (unsigned int i = 0; i < oname->arg_types.length (); ++i) +- fprintf (file, "%s%s", i == 0 ? "" : ", ", oname->arg_types[i]); +- for (int i = 0; i < stats.num_generator_args; ++i) +- fprintf (file, ", rtx"); +- fprintf (file, ");\n"); +- +- fprintf (file, "inline rtx\ngen_%s (", oname->name); +- for (unsigned int i = 0; i < oname->arg_types.length (); ++i) +- fprintf (file, "%s%s arg%d", i == 0 ? "" : ", ", oname->arg_types[i], i); +- for (int i = 0; i < stats.num_generator_args; ++i) +- fprintf (file, ", rtx x%d", i); +- fprintf (file, ")\n{\n rtx res = maybe_gen_%s (", oname->name); +- for (unsigned int i = 0; i < oname->arg_types.length (); ++i) +- fprintf (file, "%sarg%d", i == 0 ? "" : ", ", i); +- for (int i = 0; i < stats.num_generator_args; ++i) +- fprintf (file, ", x%d", i); +- fprintf (file, +- ");\n" +- " gcc_assert (res);\n" +- " return res;\n" +- "}\n"); ++ unsigned HOST_WIDE_INT seen = 0; ++ for (overloaded_instance *instance = oname->first_instance->next; ++ instance; instance = instance->next) ++ { ++ pattern_stats stats; ++ get_pattern_stats (&stats, XVEC (instance->insn, 1)); ++ unsigned HOST_WIDE_INT mask ++ = HOST_WIDE_INT_1U << stats.num_generator_args; ++ if (seen & mask) ++ continue; ++ ++ seen |= mask; ++ ++ fprintf (file, "\nextern rtx maybe_gen_%s (", oname->name); ++ for (unsigned int i = 0; i < oname->arg_types.length (); ++i) ++ fprintf (file, "%s%s", i == 0 ? "" : ", ", oname->arg_types[i]); ++ for (int i = 0; i < stats.num_generator_args; ++i) ++ fprintf (file, ", rtx"); ++ fprintf (file, ");\n"); ++ ++ fprintf (file, "inline rtx\ngen_%s (", oname->name); ++ for (unsigned int i = 0; i < oname->arg_types.length (); ++i) ++ fprintf (file, "%s%s arg%d", i == 0 ? 
"" : ", ", ++ oname->arg_types[i], i); ++ for (int i = 0; i < stats.num_generator_args; ++i) ++ fprintf (file, ", rtx x%d", i); ++ fprintf (file, ")\n{\n rtx res = maybe_gen_%s (", oname->name); ++ for (unsigned int i = 0; i < oname->arg_types.length (); ++i) ++ fprintf (file, "%sarg%d", i == 0 ? "" : ", ", i); ++ for (int i = 0; i < stats.num_generator_args; ++i) ++ fprintf (file, ", x%d", i); ++ fprintf (file, ++ ");\n" ++ " gcc_assert (res);\n" ++ " return res;\n" ++ "}\n"); ++ } + } + + int +diff --git a/gcc/genrecog.c b/gcc/genrecog.c +index 90e2508fa..ec921702a 100644 +--- a/gcc/genrecog.c ++++ b/gcc/genrecog.c +@@ -718,7 +718,6 @@ validate_pattern (rtx pattern, md_rtx_info *info, rtx set, int set_code) + } + + case CLOBBER: +- case CLOBBER_HIGH: + validate_pattern (SET_DEST (pattern), info, pattern, '='); + return; + +@@ -5295,7 +5294,7 @@ remove_clobbers (acceptance_type *acceptance_ptr, rtx *pattern_ptr) + for (i = XVECLEN (pattern, 0); i > 0; i--) + { + rtx x = XVECEXP (pattern, 0, i - 1); +- if ((GET_CODE (x) != CLOBBER && GET_CODE (x) != CLOBBER_HIGH) ++ if (GET_CODE (x) != CLOBBER + || (!REG_P (XEXP (x, 0)) + && GET_CODE (XEXP (x, 0)) != MATCH_SCRATCH)) + break; +diff --git a/gcc/gensupport.c b/gcc/gensupport.c +index 31a67d5ad..ab6a523dd 100644 +--- a/gcc/gensupport.c ++++ b/gcc/gensupport.c +@@ -70,8 +70,8 @@ struct queue_elem + rtx data; + file_location loc; + struct queue_elem *next; +- /* In a DEFINE_INSN that came from a DEFINE_INSN_AND_SPLIT, SPLIT +- points to the generated DEFINE_SPLIT. */ ++ /* In a DEFINE_INSN that came from a DEFINE_INSN_AND_SPLIT or ++ DEFINE_INSN_AND_REWRITE, SPLIT points to the generated DEFINE_SPLIT. */ + struct queue_elem *split; + }; + +@@ -485,6 +485,65 @@ remove_constraints (rtx part) + } + } + ++/* Recursively replace MATCH_OPERANDs with MATCH_DUPs and MATCH_OPERATORs ++ with MATCH_OP_DUPs in X. */ ++ ++static rtx ++replace_operands_with_dups (rtx x) ++{ ++ if (x == 0) ++ return x; ++ ++ rtx newx; ++ if (GET_CODE (x) == MATCH_OPERAND) ++ { ++ newx = rtx_alloc (MATCH_DUP); ++ XINT (newx, 0) = XINT (x, 0); ++ x = newx; ++ } ++ else if (GET_CODE (x) == MATCH_OPERATOR) ++ { ++ newx = rtx_alloc (MATCH_OP_DUP); ++ XINT (newx, 0) = XINT (x, 0); ++ XVEC (newx, 1) = XVEC (x, 2); ++ x = newx; ++ } ++ else ++ newx = shallow_copy_rtx (x); ++ ++ const char *format_ptr = GET_RTX_FORMAT (GET_CODE (x)); ++ for (int i = 0; i < GET_RTX_LENGTH (GET_CODE (x)); i++) ++ switch (*format_ptr++) ++ { ++ case 'e': ++ case 'u': ++ XEXP (newx, i) = replace_operands_with_dups (XEXP (x, i)); ++ break; ++ case 'E': ++ if (XVEC (x, i) != NULL) ++ { ++ XVEC (newx, i) = rtvec_alloc (XVECLEN (x, i)); ++ for (int j = 0; j < XVECLEN (x, i); j++) ++ XVECEXP (newx, i, j) ++ = replace_operands_with_dups (XVECEXP (x, i, j)); ++ } ++ break; ++ } ++ return newx; ++} ++ ++/* Convert matching pattern VEC from a DEFINE_INSN_AND_REWRITE into ++ a sequence that should be generated by the splitter. */ ++ ++static rtvec ++gen_rewrite_sequence (rtvec vec) ++{ ++ rtvec new_vec = rtvec_alloc (1); ++ rtx x = add_implicit_parallel (vec); ++ RTVEC_ELT (new_vec, 0) = replace_operands_with_dups (x); ++ return new_vec; ++} ++ + /* Process a top level rtx in some way, queuing as appropriate. 
*/ + + static void +@@ -527,6 +586,7 @@ process_rtx (rtx desc, file_location loc) + break; + + case DEFINE_INSN_AND_SPLIT: ++ case DEFINE_INSN_AND_REWRITE: + { + const char *split_cond; + rtx split; +@@ -534,6 +594,7 @@ process_rtx (rtx desc, file_location loc) + int i; + struct queue_elem *insn_elem; + struct queue_elem *split_elem; ++ int split_code = (GET_CODE (desc) == DEFINE_INSN_AND_REWRITE ? 5 : 6); + + /* Create a split with values from the insn_and_split. */ + split = rtx_alloc (DEFINE_SPLIT); +@@ -555,12 +616,17 @@ process_rtx (rtx desc, file_location loc) + split_cond = rtx_reader_ptr->join_c_conditions (XSTR (desc, 2), + split_cond + 2); + } ++ else if (GET_CODE (desc) == DEFINE_INSN_AND_REWRITE) ++ error_at (loc, "the rewrite condition must start with `&&'"); + XSTR (split, 1) = split_cond; +- XVEC (split, 2) = XVEC (desc, 5); +- XSTR (split, 3) = XSTR (desc, 6); ++ if (GET_CODE (desc) == DEFINE_INSN_AND_REWRITE) ++ XVEC (split, 2) = gen_rewrite_sequence (XVEC (desc, 1)); ++ else ++ XVEC (split, 2) = XVEC (desc, 5); ++ XSTR (split, 3) = XSTR (desc, split_code); + + /* Fix up the DEFINE_INSN. */ +- attr = XVEC (desc, 7); ++ attr = XVEC (desc, split_code + 1); + PUT_CODE (desc, DEFINE_INSN); + XVEC (desc, 4) = attr; + +diff --git a/gcc/gimple-expr.c b/gcc/gimple-expr.c +index b0c9f9b67..4ba194ff4 100644 +--- a/gcc/gimple-expr.c ++++ b/gcc/gimple-expr.c +@@ -37,6 +37,7 @@ along with GCC; see the file COPYING3. If not see + #include "tree-pass.h" + #include "stringpool.h" + #include "attribs.h" ++#include "target.h" + + /* ----- Type related ----- */ + +@@ -147,10 +148,12 @@ useless_type_conversion_p (tree outer_type, tree inner_type) + + /* Recurse for vector types with the same number of subparts. */ + else if (TREE_CODE (inner_type) == VECTOR_TYPE +- && TREE_CODE (outer_type) == VECTOR_TYPE +- && TYPE_PRECISION (inner_type) == TYPE_PRECISION (outer_type)) +- return useless_type_conversion_p (TREE_TYPE (outer_type), +- TREE_TYPE (inner_type)); ++ && TREE_CODE (outer_type) == VECTOR_TYPE) ++ return (known_eq (TYPE_VECTOR_SUBPARTS (inner_type), ++ TYPE_VECTOR_SUBPARTS (outer_type)) ++ && useless_type_conversion_p (TREE_TYPE (outer_type), ++ TREE_TYPE (inner_type)) ++ && targetm.compatible_vector_types_p (inner_type, outer_type)); + + else if (TREE_CODE (inner_type) == ARRAY_TYPE + && TREE_CODE (outer_type) == ARRAY_TYPE) +diff --git a/gcc/gimple-fold.c b/gcc/gimple-fold.c +index d33d93242..bbee8eb46 100644 +--- a/gcc/gimple-fold.c ++++ b/gcc/gimple-fold.c +@@ -631,14 +631,7 @@ replace_call_with_call_and_fold (gimple_stmt_iterator *gsi, gimple *repl) + gimple *stmt = gsi_stmt (*gsi); + gimple_call_set_lhs (repl, gimple_call_lhs (stmt)); + gimple_set_location (repl, gimple_location (stmt)); +- if (gimple_vdef (stmt) +- && TREE_CODE (gimple_vdef (stmt)) == SSA_NAME) +- { +- gimple_set_vdef (repl, gimple_vdef (stmt)); +- SSA_NAME_DEF_STMT (gimple_vdef (repl)) = repl; +- } +- if (gimple_vuse (stmt)) +- gimple_set_vuse (repl, gimple_vuse (stmt)); ++ gimple_move_vops (repl, stmt); + gsi_replace (gsi, repl, false); + fold_stmt (gsi); + } +@@ -822,11 +815,7 @@ gimple_fold_builtin_memory_op (gimple_stmt_iterator *gsi, + = gimple_build_assign (fold_build2 (MEM_REF, desttype, + dest, off0), + srcmem); +- gimple_set_vuse (new_stmt, gimple_vuse (stmt)); +- gimple_set_vdef (new_stmt, gimple_vdef (stmt)); +- if (gimple_vdef (new_stmt) +- && TREE_CODE (gimple_vdef (new_stmt)) == SSA_NAME) +- SSA_NAME_DEF_STMT (gimple_vdef (new_stmt)) = new_stmt; ++ gimple_move_vops (new_stmt, stmt); + if (!lhs) + 
{ + gsi_replace (gsi, new_stmt, false); +@@ -1087,11 +1076,7 @@ gimple_fold_builtin_memory_op (gimple_stmt_iterator *gsi, + = gimple_build_assign (fold_build2 (MEM_REF, desttype, dest, off0), + fold_build2 (MEM_REF, srctype, src, off0)); + set_vop_and_replace: +- gimple_set_vuse (new_stmt, gimple_vuse (stmt)); +- gimple_set_vdef (new_stmt, gimple_vdef (stmt)); +- if (gimple_vdef (new_stmt) +- && TREE_CODE (gimple_vdef (new_stmt)) == SSA_NAME) +- SSA_NAME_DEF_STMT (gimple_vdef (new_stmt)) = new_stmt; ++ gimple_move_vops (new_stmt, stmt); + if (!lhs) + { + gsi_replace (gsi, new_stmt, false); +@@ -1264,13 +1249,7 @@ gimple_fold_builtin_memset (gimple_stmt_iterator *gsi, tree c, tree len) + + var = fold_build2 (MEM_REF, etype, dest, build_int_cst (ptr_type_node, 0)); + gimple *store = gimple_build_assign (var, build_int_cst_type (etype, cval)); +- gimple_set_vuse (store, gimple_vuse (stmt)); +- tree vdef = gimple_vdef (stmt); +- if (vdef && TREE_CODE (vdef) == SSA_NAME) +- { +- gimple_set_vdef (store, gimple_vdef (stmt)); +- SSA_NAME_DEF_STMT (gimple_vdef (stmt)) = store; +- } ++ gimple_move_vops (store, stmt); + gsi_insert_before (gsi, store, GSI_SAME_STMT); + if (gimple_call_lhs (stmt)) + { +@@ -2979,11 +2958,7 @@ gimple_fold_builtin_stpcpy (gimple_stmt_iterator *gsi) + tem, build_int_cst (size_type_node, 1)); + gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); + gcall *repl = gimple_build_call (fn, 3, dest, src, lenp1); +- gimple_set_vuse (repl, gimple_vuse (stmt)); +- gimple_set_vdef (repl, gimple_vdef (stmt)); +- if (gimple_vdef (repl) +- && TREE_CODE (gimple_vdef (repl)) == SSA_NAME) +- SSA_NAME_DEF_STMT (gimple_vdef (repl)) = repl; ++ gimple_move_vops (repl, stmt); + gsi_insert_before (gsi, repl, GSI_SAME_STMT); + /* Replace the result with dest + len. */ + stmts = NULL; +@@ -4135,9 +4110,7 @@ fold_builtin_atomic_compare_exchange (gimple_stmt_iterator *gsi) + gimple_call_arg (stmt, 5)); + tree lhs = make_ssa_name (ctype); + gimple_call_set_lhs (g, lhs); +- gimple_set_vdef (g, gimple_vdef (stmt)); +- gimple_set_vuse (g, gimple_vuse (stmt)); +- SSA_NAME_DEF_STMT (gimple_vdef (g)) = g; ++ gimple_move_vops (g, stmt); + tree oldlhs = gimple_call_lhs (stmt); + if (stmt_can_throw_internal (cfun, stmt)) + { +@@ -4316,8 +4289,7 @@ gimple_fold_call (gimple_stmt_iterator *gsi, bool inplace) + SSA_NAME_DEF_STMT (lhs) = gimple_build_nop (); + set_ssa_default_def (cfun, var, lhs); + } +- gimple_set_vuse (new_stmt, gimple_vuse (stmt)); +- gimple_set_vdef (new_stmt, gimple_vdef (stmt)); ++ gimple_move_vops (new_stmt, stmt); + gsi_replace (gsi, new_stmt, false); + return true; + } +diff --git a/gcc/gimple-match-head.c b/gcc/gimple-match-head.c +index bbbc0f2c2..f83f22561 100644 +--- a/gcc/gimple-match-head.c ++++ b/gcc/gimple-match-head.c +@@ -27,6 +27,7 @@ along with GCC; see the file COPYING3. 
If not see + #include "gimple.h" + #include "ssa.h" + #include "cgraph.h" ++#include "vec-perm-indices.h" + #include "fold-const.h" + #include "fold-const-call.h" + #include "stor-layout.h" +diff --git a/gcc/gimple.c b/gcc/gimple.c +index bf362dbe5..763c8e7e1 100644 +--- a/gcc/gimple.c ++++ b/gcc/gimple.c +@@ -1564,7 +1564,7 @@ gimple_call_nonnull_result_p (gcall *call) + if (!fndecl) + return false; + if (flag_delete_null_pointer_checks && !flag_check_new +- && DECL_IS_OPERATOR_NEW (fndecl) ++ && DECL_IS_OPERATOR_NEW_P (fndecl) + && !TREE_NOTHROW (fndecl)) + return true; + +@@ -2034,6 +2034,18 @@ gimple_copy (gimple *stmt) + return copy; + } + ++/* Move OLD_STMT's vuse and vdef operands to NEW_STMT, on the assumption ++ that OLD_STMT is about to be removed. */ ++ ++void ++gimple_move_vops (gimple *new_stmt, gimple *old_stmt) ++{ ++ tree vdef = gimple_vdef (old_stmt); ++ gimple_set_vuse (new_stmt, gimple_vuse (old_stmt)); ++ gimple_set_vdef (new_stmt, vdef); ++ if (vdef && TREE_CODE (vdef) == SSA_NAME) ++ SSA_NAME_DEF_STMT (vdef) = new_stmt; ++} + + /* Return true if statement S has side-effects. We consider a + statement to have side effects if: +diff --git a/gcc/gimple.h b/gcc/gimple.h +index 8b5c9e219..f91c6db4d 100644 +--- a/gcc/gimple.h ++++ b/gcc/gimple.h +@@ -1509,6 +1509,7 @@ void gimple_assign_set_rhs_with_ops (gimple_stmt_iterator *, enum tree_code, + tree gimple_get_lhs (const gimple *); + void gimple_set_lhs (gimple *, tree); + gimple *gimple_copy (gimple *); ++void gimple_move_vops (gimple *, gimple *); + bool gimple_has_side_effects (const gimple *); + bool gimple_could_trap_p_1 (gimple *, bool, bool); + bool gimple_could_trap_p (gimple *); +diff --git a/gcc/gimplify.c b/gcc/gimplify.c +index bd8bd6d7e..b23680f96 100644 +--- a/gcc/gimplify.c ++++ b/gcc/gimplify.c +@@ -1699,11 +1699,12 @@ gimplify_decl_expr (tree *stmt_p, gimple_seq *seq_p) + tree init = DECL_INITIAL (decl); + bool is_vla = false; + +- if (TREE_CODE (DECL_SIZE_UNIT (decl)) != INTEGER_CST ++ poly_uint64 size; ++ if (!poly_int_tree_p (DECL_SIZE_UNIT (decl), &size) + || (!TREE_STATIC (decl) + && flag_stack_check == GENERIC_STACK_CHECK +- && compare_tree_int (DECL_SIZE_UNIT (decl), +- STACK_CHECK_MAX_VAR_SIZE) > 0)) ++ && maybe_gt (size, ++ (unsigned HOST_WIDE_INT) STACK_CHECK_MAX_VAR_SIZE))) + { + gimplify_vla_decl (decl, seq_p); + is_vla = true; +diff --git a/gcc/haifa-sched.c b/gcc/haifa-sched.c +index 5025aae42..33a77542a 100644 +--- a/gcc/haifa-sched.c ++++ b/gcc/haifa-sched.c +@@ -529,9 +529,6 @@ haifa_classify_rtx (const_rtx x) + /* Test if it is a 'store'. */ + tmp_class = may_trap_exp (XEXP (x, 0), 1); + break; +- case CLOBBER_HIGH: +- gcc_assert (REG_P (XEXP (x, 0))); +- break; + case SET: + /* Test if it is a store. */ + tmp_class = may_trap_exp (SET_DEST (x), 1); +@@ -7207,7 +7204,7 @@ alloc_global_sched_pressure_data (void) + fixed_regs_num[cl] = 0; + + for (int i = 0; i < ira_class_hard_regs_num[cl]; ++i) +- if (!call_used_regs[ira_class_hard_regs[cl][i]]) ++ if (!call_used_or_fixed_reg_p (ira_class_hard_regs[cl][i])) + ++call_saved_regs_num[cl]; + else if (fixed_regs[ira_class_hard_regs[cl][i]]) + ++fixed_regs_num[cl]; +diff --git a/gcc/hard-reg-set.h b/gcc/hard-reg-set.h +index a72819662..51c9e72bb 100644 +--- a/gcc/hard-reg-set.h ++++ b/gcc/hard-reg-set.h +@@ -20,6 +20,8 @@ along with GCC; see the file COPYING3. If not see + #ifndef GCC_HARD_REG_SET_H + #define GCC_HARD_REG_SET_H + ++#include "array-traits.h" ++ + /* Define the type of a set of hard registers. 
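
The gimple_move_vops hunk above replaces a four-step sequence that several gimple-fold.c and IPA callers used to open-code: copy the vuse, copy the vdef, then re-point the vdef's defining statement at the replacement. The sketch below models only that bookkeeping with stand-in types; stmt, ssa_def_stmt and move_vops are hypothetical names for illustration, not GCC internals.

// Minimal model of the gimple_move_vops refactoring (stand-in types only).
#include <cassert>
#include <map>
#include <string>

struct stmt
{
  std::string vuse;   // virtual use operand, empty if none
  std::string vdef;   // virtual definition operand, empty if none
};

// Plays the role of SSA_NAME_DEF_STMT: maps each virtual SSA name to the
// statement that defines it.
static std::map<std::string, stmt *> ssa_def_stmt;

// Move OLD_STMT's virtual operands to NEW_STMT and keep the def-stmt table
// consistent, the job the new helper performs for every converted caller.
static void
move_vops (stmt *new_stmt, stmt *old_stmt)
{
  new_stmt->vuse = old_stmt->vuse;
  new_stmt->vdef = old_stmt->vdef;
  if (!old_stmt->vdef.empty ())
    ssa_def_stmt[old_stmt->vdef] = new_stmt;
}

int
main ()
{
  stmt old_call { ".MEM_1", ".MEM_2" };
  stmt repl;
  ssa_def_stmt[".MEM_2"] = &old_call;
  move_vops (&repl, &old_call);
  assert (ssa_def_stmt[".MEM_2"] == &repl);
  return 0;
}
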
*/ + + /* HARD_REG_ELT_TYPE is a typedef of the unsigned integral type which +@@ -42,14 +44,88 @@ typedef unsigned HOST_WIDEST_FAST_INT HARD_REG_ELT_TYPE; + + #if FIRST_PSEUDO_REGISTER <= HOST_BITS_PER_WIDEST_FAST_INT + +-#define HARD_REG_SET HARD_REG_ELT_TYPE ++typedef HARD_REG_ELT_TYPE HARD_REG_SET; ++typedef const HARD_REG_SET const_hard_reg_set; + + #else + + #define HARD_REG_SET_LONGS \ + ((FIRST_PSEUDO_REGISTER + HOST_BITS_PER_WIDEST_FAST_INT - 1) \ + / HOST_BITS_PER_WIDEST_FAST_INT) +-typedef HARD_REG_ELT_TYPE HARD_REG_SET[HARD_REG_SET_LONGS]; ++ ++struct HARD_REG_SET ++{ ++ HARD_REG_SET ++ operator~ () const ++ { ++ HARD_REG_SET res; ++ for (unsigned int i = 0; i < ARRAY_SIZE (elts); ++i) ++ res.elts[i] = ~elts[i]; ++ return res; ++ } ++ ++ HARD_REG_SET ++ operator& (const HARD_REG_SET &other) const ++ { ++ HARD_REG_SET res; ++ for (unsigned int i = 0; i < ARRAY_SIZE (elts); ++i) ++ res.elts[i] = elts[i] & other.elts[i]; ++ return res; ++ } ++ ++ HARD_REG_SET & ++ operator&= (const HARD_REG_SET &other) ++ { ++ for (unsigned int i = 0; i < ARRAY_SIZE (elts); ++i) ++ elts[i] &= other.elts[i]; ++ return *this; ++ } ++ ++ HARD_REG_SET ++ operator| (const HARD_REG_SET &other) const ++ { ++ HARD_REG_SET res; ++ for (unsigned int i = 0; i < ARRAY_SIZE (elts); ++i) ++ res.elts[i] = elts[i] | other.elts[i]; ++ return res; ++ } ++ ++ HARD_REG_SET & ++ operator|= (const HARD_REG_SET &other) ++ { ++ for (unsigned int i = 0; i < ARRAY_SIZE (elts); ++i) ++ elts[i] |= other.elts[i]; ++ return *this; ++ } ++ ++ bool ++ operator== (const HARD_REG_SET &other) const ++ { ++ HARD_REG_ELT_TYPE bad = 0; ++ for (unsigned int i = 0; i < ARRAY_SIZE (elts); ++i) ++ bad |= (elts[i] ^ other.elts[i]); ++ return bad == 0; ++ } ++ ++ bool ++ operator!= (const HARD_REG_SET &other) const ++ { ++ return !operator== (other); ++ } ++ ++ HARD_REG_ELT_TYPE elts[HARD_REG_SET_LONGS]; ++}; ++typedef const HARD_REG_SET &const_hard_reg_set; ++ ++template<> ++struct array_traits ++{ ++ typedef HARD_REG_ELT_TYPE element_type; ++ static const bool has_constant_size = true; ++ static const size_t constant_size = HARD_REG_SET_LONGS; ++ static const element_type *base (const HARD_REG_SET &x) { return x.elts; } ++ static size_t size (const HARD_REG_SET &) { return HARD_REG_SET_LONGS; } ++}; + + #endif + +@@ -77,28 +153,15 @@ struct hard_reg_set_container + CLEAR_HARD_REG_SET and SET_HARD_REG_SET. + These take just one argument. + +- Also define macros for copying hard reg sets: +- COPY_HARD_REG_SET and COMPL_HARD_REG_SET. +- These take two arguments TO and FROM; they read from FROM +- and store into TO. COMPL_HARD_REG_SET complements each bit. +- +- Also define macros for combining hard reg sets: +- IOR_HARD_REG_SET and AND_HARD_REG_SET. +- These take two arguments TO and FROM; they read from FROM +- and combine bitwise into TO. Define also two variants +- IOR_COMPL_HARD_REG_SET and AND_COMPL_HARD_REG_SET +- which use the complement of the set FROM. +- + Also define: + + hard_reg_set_subset_p (X, Y), which returns true if X is a subset of Y. +- hard_reg_set_equal_p (X, Y), which returns true if X and Y are equal. + hard_reg_set_intersect_p (X, Y), which returns true if X and Y intersect. + hard_reg_set_empty_p (X), which returns true if X is empty. 
*/ + + #define UHOST_BITS_PER_WIDE_INT ((unsigned) HOST_BITS_PER_WIDEST_FAST_INT) + +-#ifdef HARD_REG_SET ++#if FIRST_PSEUDO_REGISTER <= HOST_BITS_PER_WIDEST_FAST_INT + + #define SET_HARD_REG_BIT(SET, BIT) \ + ((SET) |= HARD_CONST (1) << (BIT)) +@@ -110,404 +173,87 @@ struct hard_reg_set_container + #define CLEAR_HARD_REG_SET(TO) ((TO) = HARD_CONST (0)) + #define SET_HARD_REG_SET(TO) ((TO) = ~ HARD_CONST (0)) + +-#define COPY_HARD_REG_SET(TO, FROM) ((TO) = (FROM)) +-#define COMPL_HARD_REG_SET(TO, FROM) ((TO) = ~(FROM)) +- +-#define IOR_HARD_REG_SET(TO, FROM) ((TO) |= (FROM)) +-#define IOR_COMPL_HARD_REG_SET(TO, FROM) ((TO) |= ~ (FROM)) +-#define AND_HARD_REG_SET(TO, FROM) ((TO) &= (FROM)) +-#define AND_COMPL_HARD_REG_SET(TO, FROM) ((TO) &= ~ (FROM)) +- + static inline bool +-hard_reg_set_subset_p (const HARD_REG_SET x, const HARD_REG_SET y) ++hard_reg_set_subset_p (const_hard_reg_set x, const_hard_reg_set y) + { + return (x & ~y) == HARD_CONST (0); + } + + static inline bool +-hard_reg_set_equal_p (const HARD_REG_SET x, const HARD_REG_SET y) +-{ +- return x == y; +-} +- +-static inline bool +-hard_reg_set_intersect_p (const HARD_REG_SET x, const HARD_REG_SET y) ++hard_reg_set_intersect_p (const_hard_reg_set x, const_hard_reg_set y) + { + return (x & y) != HARD_CONST (0); + } + + static inline bool +-hard_reg_set_empty_p (const HARD_REG_SET x) ++hard_reg_set_empty_p (const_hard_reg_set x) + { + return x == HARD_CONST (0); + } + + #else + +-#define SET_HARD_REG_BIT(SET, BIT) \ +- ((SET)[(BIT) / UHOST_BITS_PER_WIDE_INT] \ +- |= HARD_CONST (1) << ((BIT) % UHOST_BITS_PER_WIDE_INT)) +- +-#define CLEAR_HARD_REG_BIT(SET, BIT) \ +- ((SET)[(BIT) / UHOST_BITS_PER_WIDE_INT] \ +- &= ~(HARD_CONST (1) << ((BIT) % UHOST_BITS_PER_WIDE_INT))) +- +-#define TEST_HARD_REG_BIT(SET, BIT) \ +- (!!((SET)[(BIT) / UHOST_BITS_PER_WIDE_INT] \ +- & (HARD_CONST (1) << ((BIT) % UHOST_BITS_PER_WIDE_INT)))) +- +-#if FIRST_PSEUDO_REGISTER <= 2*HOST_BITS_PER_WIDEST_FAST_INT +-#define CLEAR_HARD_REG_SET(TO) \ +-do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ +- scan_tp_[0] = 0; \ +- scan_tp_[1] = 0; } while (0) +- +-#define SET_HARD_REG_SET(TO) \ +-do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ +- scan_tp_[0] = -1; \ +- scan_tp_[1] = -1; } while (0) +- +-#define COPY_HARD_REG_SET(TO, FROM) \ +-do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ +- const HARD_REG_ELT_TYPE *scan_fp_ = (FROM); \ +- scan_tp_[0] = scan_fp_[0]; \ +- scan_tp_[1] = scan_fp_[1]; } while (0) +- +-#define COMPL_HARD_REG_SET(TO, FROM) \ +-do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ +- const HARD_REG_ELT_TYPE *scan_fp_ = (FROM); \ +- scan_tp_[0] = ~ scan_fp_[0]; \ +- scan_tp_[1] = ~ scan_fp_[1]; } while (0) +- +-#define AND_HARD_REG_SET(TO, FROM) \ +-do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ +- const HARD_REG_ELT_TYPE *scan_fp_ = (FROM); \ +- scan_tp_[0] &= scan_fp_[0]; \ +- scan_tp_[1] &= scan_fp_[1]; } while (0) +- +-#define AND_COMPL_HARD_REG_SET(TO, FROM) \ +-do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ +- const HARD_REG_ELT_TYPE *scan_fp_ = (FROM); \ +- scan_tp_[0] &= ~ scan_fp_[0]; \ +- scan_tp_[1] &= ~ scan_fp_[1]; } while (0) +- +-#define IOR_HARD_REG_SET(TO, FROM) \ +-do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ +- const HARD_REG_ELT_TYPE *scan_fp_ = (FROM); \ +- scan_tp_[0] |= scan_fp_[0]; \ +- scan_tp_[1] |= scan_fp_[1]; } while (0) +- +-#define IOR_COMPL_HARD_REG_SET(TO, FROM) \ +-do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ +- const HARD_REG_ELT_TYPE *scan_fp_ = (FROM); \ +- scan_tp_[0] |= ~ scan_fp_[0]; \ +- scan_tp_[1] |= ~ scan_fp_[1]; } while (0) +- +-static 
inline bool +-hard_reg_set_subset_p (const HARD_REG_SET x, const HARD_REG_SET y) +-{ +- return (x[0] & ~y[0]) == 0 && (x[1] & ~y[1]) == 0; +-} +- +-static inline bool +-hard_reg_set_equal_p (const HARD_REG_SET x, const HARD_REG_SET y) ++inline void ++SET_HARD_REG_BIT (HARD_REG_SET &set, unsigned int bit) + { +- return x[0] == y[0] && x[1] == y[1]; ++ set.elts[bit / UHOST_BITS_PER_WIDE_INT] ++ |= HARD_CONST (1) << (bit % UHOST_BITS_PER_WIDE_INT); + } + +-static inline bool +-hard_reg_set_intersect_p (const HARD_REG_SET x, const HARD_REG_SET y) ++inline void ++CLEAR_HARD_REG_BIT (HARD_REG_SET &set, unsigned int bit) + { +- return (x[0] & y[0]) != 0 || (x[1] & y[1]) != 0; ++ set.elts[bit / UHOST_BITS_PER_WIDE_INT] ++ &= ~(HARD_CONST (1) << (bit % UHOST_BITS_PER_WIDE_INT)); + } + +-static inline bool +-hard_reg_set_empty_p (const HARD_REG_SET x) ++inline bool ++TEST_HARD_REG_BIT (const_hard_reg_set set, unsigned int bit) + { +- return x[0] == 0 && x[1] == 0; ++ return (set.elts[bit / UHOST_BITS_PER_WIDE_INT] ++ & (HARD_CONST (1) << (bit % UHOST_BITS_PER_WIDE_INT))); + } + +-#else +-#if FIRST_PSEUDO_REGISTER <= 3*HOST_BITS_PER_WIDEST_FAST_INT +-#define CLEAR_HARD_REG_SET(TO) \ +-do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ +- scan_tp_[0] = 0; \ +- scan_tp_[1] = 0; \ +- scan_tp_[2] = 0; } while (0) +- +-#define SET_HARD_REG_SET(TO) \ +-do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ +- scan_tp_[0] = -1; \ +- scan_tp_[1] = -1; \ +- scan_tp_[2] = -1; } while (0) +- +-#define COPY_HARD_REG_SET(TO, FROM) \ +-do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ +- const HARD_REG_ELT_TYPE *scan_fp_ = (FROM); \ +- scan_tp_[0] = scan_fp_[0]; \ +- scan_tp_[1] = scan_fp_[1]; \ +- scan_tp_[2] = scan_fp_[2]; } while (0) +- +-#define COMPL_HARD_REG_SET(TO, FROM) \ +-do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ +- const HARD_REG_ELT_TYPE *scan_fp_ = (FROM); \ +- scan_tp_[0] = ~ scan_fp_[0]; \ +- scan_tp_[1] = ~ scan_fp_[1]; \ +- scan_tp_[2] = ~ scan_fp_[2]; } while (0) +- +-#define AND_HARD_REG_SET(TO, FROM) \ +-do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ +- const HARD_REG_ELT_TYPE *scan_fp_ = (FROM); \ +- scan_tp_[0] &= scan_fp_[0]; \ +- scan_tp_[1] &= scan_fp_[1]; \ +- scan_tp_[2] &= scan_fp_[2]; } while (0) +- +-#define AND_COMPL_HARD_REG_SET(TO, FROM) \ +-do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ +- const HARD_REG_ELT_TYPE *scan_fp_ = (FROM); \ +- scan_tp_[0] &= ~ scan_fp_[0]; \ +- scan_tp_[1] &= ~ scan_fp_[1]; \ +- scan_tp_[2] &= ~ scan_fp_[2]; } while (0) +- +-#define IOR_HARD_REG_SET(TO, FROM) \ +-do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ +- const HARD_REG_ELT_TYPE *scan_fp_ = (FROM); \ +- scan_tp_[0] |= scan_fp_[0]; \ +- scan_tp_[1] |= scan_fp_[1]; \ +- scan_tp_[2] |= scan_fp_[2]; } while (0) +- +-#define IOR_COMPL_HARD_REG_SET(TO, FROM) \ +-do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ +- const HARD_REG_ELT_TYPE *scan_fp_ = (FROM); \ +- scan_tp_[0] |= ~ scan_fp_[0]; \ +- scan_tp_[1] |= ~ scan_fp_[1]; \ +- scan_tp_[2] |= ~ scan_fp_[2]; } while (0) +- +-static inline bool +-hard_reg_set_subset_p (const HARD_REG_SET x, const HARD_REG_SET y) ++inline void ++CLEAR_HARD_REG_SET (HARD_REG_SET &set) + { +- return ((x[0] & ~y[0]) == 0 +- && (x[1] & ~y[1]) == 0 +- && (x[2] & ~y[2]) == 0); ++ for (unsigned int i = 0; i < ARRAY_SIZE (set.elts); ++i) ++ set.elts[i] = 0; + } + +-static inline bool +-hard_reg_set_equal_p (const HARD_REG_SET x, const HARD_REG_SET y) ++inline void ++SET_HARD_REG_SET (HARD_REG_SET &set) + { +- return x[0] == y[0] && x[1] == y[1] && x[2] == y[2]; ++ for (unsigned int i = 0; i < ARRAY_SIZE (set.elts); ++i) 
++ set.elts[i] = -1; + } + + static inline bool +-hard_reg_set_intersect_p (const HARD_REG_SET x, const HARD_REG_SET y) +-{ +- return ((x[0] & y[0]) != 0 +- || (x[1] & y[1]) != 0 +- || (x[2] & y[2]) != 0); +-} +- +-static inline bool +-hard_reg_set_empty_p (const HARD_REG_SET x) +-{ +- return x[0] == 0 && x[1] == 0 && x[2] == 0; +-} +- +-#else +-#if FIRST_PSEUDO_REGISTER <= 4*HOST_BITS_PER_WIDEST_FAST_INT +-#define CLEAR_HARD_REG_SET(TO) \ +-do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ +- scan_tp_[0] = 0; \ +- scan_tp_[1] = 0; \ +- scan_tp_[2] = 0; \ +- scan_tp_[3] = 0; } while (0) +- +-#define SET_HARD_REG_SET(TO) \ +-do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ +- scan_tp_[0] = -1; \ +- scan_tp_[1] = -1; \ +- scan_tp_[2] = -1; \ +- scan_tp_[3] = -1; } while (0) +- +-#define COPY_HARD_REG_SET(TO, FROM) \ +-do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ +- const HARD_REG_ELT_TYPE *scan_fp_ = (FROM); \ +- scan_tp_[0] = scan_fp_[0]; \ +- scan_tp_[1] = scan_fp_[1]; \ +- scan_tp_[2] = scan_fp_[2]; \ +- scan_tp_[3] = scan_fp_[3]; } while (0) +- +-#define COMPL_HARD_REG_SET(TO, FROM) \ +-do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ +- const HARD_REG_ELT_TYPE *scan_fp_ = (FROM); \ +- scan_tp_[0] = ~ scan_fp_[0]; \ +- scan_tp_[1] = ~ scan_fp_[1]; \ +- scan_tp_[2] = ~ scan_fp_[2]; \ +- scan_tp_[3] = ~ scan_fp_[3]; } while (0) +- +-#define AND_HARD_REG_SET(TO, FROM) \ +-do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ +- const HARD_REG_ELT_TYPE *scan_fp_ = (FROM); \ +- scan_tp_[0] &= scan_fp_[0]; \ +- scan_tp_[1] &= scan_fp_[1]; \ +- scan_tp_[2] &= scan_fp_[2]; \ +- scan_tp_[3] &= scan_fp_[3]; } while (0) +- +-#define AND_COMPL_HARD_REG_SET(TO, FROM) \ +-do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ +- const HARD_REG_ELT_TYPE *scan_fp_ = (FROM); \ +- scan_tp_[0] &= ~ scan_fp_[0]; \ +- scan_tp_[1] &= ~ scan_fp_[1]; \ +- scan_tp_[2] &= ~ scan_fp_[2]; \ +- scan_tp_[3] &= ~ scan_fp_[3]; } while (0) +- +-#define IOR_HARD_REG_SET(TO, FROM) \ +-do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ +- const HARD_REG_ELT_TYPE *scan_fp_ = (FROM); \ +- scan_tp_[0] |= scan_fp_[0]; \ +- scan_tp_[1] |= scan_fp_[1]; \ +- scan_tp_[2] |= scan_fp_[2]; \ +- scan_tp_[3] |= scan_fp_[3]; } while (0) +- +-#define IOR_COMPL_HARD_REG_SET(TO, FROM) \ +-do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ +- const HARD_REG_ELT_TYPE *scan_fp_ = (FROM); \ +- scan_tp_[0] |= ~ scan_fp_[0]; \ +- scan_tp_[1] |= ~ scan_fp_[1]; \ +- scan_tp_[2] |= ~ scan_fp_[2]; \ +- scan_tp_[3] |= ~ scan_fp_[3]; } while (0) +- +-static inline bool +-hard_reg_set_subset_p (const HARD_REG_SET x, const HARD_REG_SET y) ++hard_reg_set_subset_p (const_hard_reg_set x, const_hard_reg_set y) + { +- return ((x[0] & ~y[0]) == 0 +- && (x[1] & ~y[1]) == 0 +- && (x[2] & ~y[2]) == 0 +- && (x[3] & ~y[3]) == 0); ++ HARD_REG_ELT_TYPE bad = 0; ++ for (unsigned int i = 0; i < ARRAY_SIZE (x.elts); ++i) ++ bad |= (x.elts[i] & ~y.elts[i]); ++ return bad == 0; + } + + static inline bool +-hard_reg_set_equal_p (const HARD_REG_SET x, const HARD_REG_SET y) ++hard_reg_set_intersect_p (const_hard_reg_set x, const_hard_reg_set y) + { +- return x[0] == y[0] && x[1] == y[1] && x[2] == y[2] && x[3] == y[3]; ++ HARD_REG_ELT_TYPE good = 0; ++ for (unsigned int i = 0; i < ARRAY_SIZE (x.elts); ++i) ++ good |= (x.elts[i] & y.elts[i]); ++ return good != 0; + } + + static inline bool +-hard_reg_set_intersect_p (const HARD_REG_SET x, const HARD_REG_SET y) ++hard_reg_set_empty_p (const_hard_reg_set x) + { +- return ((x[0] & y[0]) != 0 +- || (x[1] & y[1]) != 0 +- || (x[2] & y[2]) != 0 +- || (x[3] & y[3]) != 0); ++ 
HARD_REG_ELT_TYPE bad = 0; ++ for (unsigned int i = 0; i < ARRAY_SIZE (x.elts); ++i) ++ bad |= x.elts[i]; ++ return bad == 0; + } +- +-static inline bool +-hard_reg_set_empty_p (const HARD_REG_SET x) +-{ +- return x[0] == 0 && x[1] == 0 && x[2] == 0 && x[3] == 0; +-} +- +-#else /* FIRST_PSEUDO_REGISTER > 4*HOST_BITS_PER_WIDEST_FAST_INT */ +- +-#define CLEAR_HARD_REG_SET(TO) \ +-do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ +- int i; \ +- for (i = 0; i < HARD_REG_SET_LONGS; i++) \ +- *scan_tp_++ = 0; } while (0) +- +-#define SET_HARD_REG_SET(TO) \ +-do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ +- int i; \ +- for (i = 0; i < HARD_REG_SET_LONGS; i++) \ +- *scan_tp_++ = -1; } while (0) +- +-#define COPY_HARD_REG_SET(TO, FROM) \ +-do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ +- const HARD_REG_ELT_TYPE *scan_fp_ = (FROM); \ +- int i; \ +- for (i = 0; i < HARD_REG_SET_LONGS; i++) \ +- *scan_tp_++ = *scan_fp_++; } while (0) +- +-#define COMPL_HARD_REG_SET(TO, FROM) \ +-do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ +- const HARD_REG_ELT_TYPE *scan_fp_ = (FROM); \ +- int i; \ +- for (i = 0; i < HARD_REG_SET_LONGS; i++) \ +- *scan_tp_++ = ~ *scan_fp_++; } while (0) +- +-#define AND_HARD_REG_SET(TO, FROM) \ +-do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ +- const HARD_REG_ELT_TYPE *scan_fp_ = (FROM); \ +- int i; \ +- for (i = 0; i < HARD_REG_SET_LONGS; i++) \ +- *scan_tp_++ &= *scan_fp_++; } while (0) +- +-#define AND_COMPL_HARD_REG_SET(TO, FROM) \ +-do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ +- const HARD_REG_ELT_TYPE *scan_fp_ = (FROM); \ +- int i; \ +- for (i = 0; i < HARD_REG_SET_LONGS; i++) \ +- *scan_tp_++ &= ~ *scan_fp_++; } while (0) +- +-#define IOR_HARD_REG_SET(TO, FROM) \ +-do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ +- const HARD_REG_ELT_TYPE *scan_fp_ = (FROM); \ +- int i; \ +- for (i = 0; i < HARD_REG_SET_LONGS; i++) \ +- *scan_tp_++ |= *scan_fp_++; } while (0) +- +-#define IOR_COMPL_HARD_REG_SET(TO, FROM) \ +-do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ +- const HARD_REG_ELT_TYPE *scan_fp_ = (FROM); \ +- int i; \ +- for (i = 0; i < HARD_REG_SET_LONGS; i++) \ +- *scan_tp_++ |= ~ *scan_fp_++; } while (0) +- +-static inline bool +-hard_reg_set_subset_p (const HARD_REG_SET x, const HARD_REG_SET y) +-{ +- int i; +- +- for (i = 0; i < HARD_REG_SET_LONGS; i++) +- if ((x[i] & ~y[i]) != 0) +- return false; +- return true; +-} +- +-static inline bool +-hard_reg_set_equal_p (const HARD_REG_SET x, const HARD_REG_SET y) +-{ +- int i; +- +- for (i = 0; i < HARD_REG_SET_LONGS; i++) +- if (x[i] != y[i]) +- return false; +- return true; +-} +- +-static inline bool +-hard_reg_set_intersect_p (const HARD_REG_SET x, const HARD_REG_SET y) +-{ +- int i; +- +- for (i = 0; i < HARD_REG_SET_LONGS; i++) +- if ((x[i] & y[i]) != 0) +- return true; +- return false; +-} +- +-static inline bool +-hard_reg_set_empty_p (const HARD_REG_SET x) +-{ +- int i; +- +- for (i = 0; i < HARD_REG_SET_LONGS; i++) +- if (x[i] != 0) +- return false; +- return true; +-} +- +-#endif +-#endif +-#endif + #endif + + /* Iterator for hard register sets. */ +@@ -515,7 +261,7 @@ hard_reg_set_empty_p (const HARD_REG_SET x) + struct hard_reg_set_iterator + { + /* Pointer to the current element. */ +- HARD_REG_ELT_TYPE *pelt; ++ const HARD_REG_ELT_TYPE *pelt; + + /* The length of the set. */ + unsigned short length; +@@ -534,11 +280,11 @@ struct hard_reg_set_iterator + /* The implementation of the iterator functions is fully analogous to + the bitmap iterators. 
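
The hard-reg-set.h rewrite above drops the per-width COPY/AND/IOR/COMPL macro families in favour of a struct whose bitwise operators loop over however many words FIRST_PSEUDO_REGISTER needs, so callers can simply write a |= b or a & ~b. The self-contained sketch below mirrors that design for an assumed two-word set; reg_set and subset_p are hypothetical stand-ins for HARD_REG_SET and hard_reg_set_subset_p, not the real definitions.

// Miniature operator-based register set, two 64-bit words by assumption.
#include <cassert>
#include <cstddef>
#include <cstdint>

struct reg_set
{
  uint64_t elts[2];

  reg_set operator~ () const
  {
    reg_set r;
    for (size_t i = 0; i < 2; ++i)
      r.elts[i] = ~elts[i];
    return r;
  }

  reg_set operator& (const reg_set &o) const
  {
    reg_set r;
    for (size_t i = 0; i < 2; ++i)
      r.elts[i] = elts[i] & o.elts[i];
    return r;
  }

  reg_set &operator|= (const reg_set &o)
  {
    for (size_t i = 0; i < 2; ++i)
      elts[i] |= o.elts[i];
    return *this;
  }

  bool operator== (const reg_set &o) const
  {
    uint64_t bad = 0;
    for (size_t i = 0; i < 2; ++i)
      bad |= elts[i] ^ o.elts[i];
    return bad == 0;
  }
};

// True if every register in X is also in Y, accumulated without early exits
// in the same style as the patch's hard_reg_set_subset_p.
static bool
subset_p (const reg_set &x, const reg_set &y)
{
  uint64_t bad = 0;
  for (size_t i = 0; i < 2; ++i)
    bad |= x.elts[i] & ~y.elts[i];
  return bad == 0;
}

int
main ()
{
  reg_set a = {{0x5, 0x0}};
  reg_set b = {{0xf, 0x1}};
  assert (subset_p (a, b));
  a |= b;               // where IOR_HARD_REG_SET (a, b) was used before
  assert (a == b);
  reg_set c = a & ~b;   // where COPY + AND_COMPL_HARD_REG_SET were used
  assert (subset_p (c, a));
  return 0;
}
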
*/ + static inline void +-hard_reg_set_iter_init (hard_reg_set_iterator *iter, HARD_REG_SET set, ++hard_reg_set_iter_init (hard_reg_set_iterator *iter, const_hard_reg_set set, + unsigned min, unsigned *regno) + { + #ifdef HARD_REG_SET_LONGS +- iter->pelt = set; ++ iter->pelt = set.elts; + iter->length = HARD_REG_SET_LONGS; + #else + iter->pelt = &set; +@@ -649,16 +395,15 @@ struct target_hard_regs { + a pseudo reg whose life crosses calls. */ + char x_call_used_regs[FIRST_PSEUDO_REGISTER]; + +- char x_call_really_used_regs[FIRST_PSEUDO_REGISTER]; +- +- /* The same info as a HARD_REG_SET. */ +- HARD_REG_SET x_call_used_reg_set; ++ /* For targets that use reload rather than LRA, this is the set ++ of registers that we are able to save and restore around calls ++ (i.e. those for which we know a suitable mode and set of ++ load/store instructions exist). For LRA targets it contains ++ all registers. + +- /* Contains registers that are fixed use -- i.e. in fixed_reg_set -- or +- a function value return register or TARGET_STRUCT_VALUE_RTX or +- STATIC_CHAIN_REGNUM. These are the registers that cannot hold quantities +- across calls even if we are willing to save and restore them. */ +- HARD_REG_SET x_call_fixed_reg_set; ++ This is legacy information and should be removed if all targets ++ switch to LRA. */ ++ HARD_REG_SET x_savable_regs; + + /* Contains registers that are fixed use -- i.e. in fixed_reg_set -- but + only if they are not merely part of that set because they are global +@@ -674,10 +419,6 @@ struct target_hard_regs { + with the local stack frame are safe, but scant others. */ + HARD_REG_SET x_regs_invalidated_by_call; + +- /* Call used hard registers which cannot be saved because there is no +- insn for this. */ +- HARD_REG_SET x_no_caller_save_reg_set; +- + /* Table of register numbers in the order in which to try to use them. */ + int x_reg_alloc_order[FIRST_PSEUDO_REGISTER]; + +@@ -730,18 +471,16 @@ extern struct target_hard_regs *this_target_hard_regs; + (this_target_hard_regs->x_fixed_reg_set) + #define fixed_nonglobal_reg_set \ + (this_target_hard_regs->x_fixed_nonglobal_reg_set) ++#ifdef IN_TARGET_CODE + #define call_used_regs \ + (this_target_hard_regs->x_call_used_regs) +-#define call_really_used_regs \ +- (this_target_hard_regs->x_call_really_used_regs) +-#define call_used_reg_set \ +- (this_target_hard_regs->x_call_used_reg_set) +-#define call_fixed_reg_set \ +- (this_target_hard_regs->x_call_fixed_reg_set) ++#endif ++#define savable_regs \ ++ (this_target_hard_regs->x_savable_regs) + #define regs_invalidated_by_call \ + (this_target_hard_regs->x_regs_invalidated_by_call) +-#define no_caller_save_reg_set \ +- (this_target_hard_regs->x_no_caller_save_reg_set) ++#define call_used_or_fixed_regs \ ++ (regs_invalidated_by_call | fixed_reg_set) + #define reg_alloc_order \ + (this_target_hard_regs->x_reg_alloc_order) + #define inv_reg_alloc_order \ +@@ -770,4 +509,13 @@ extern const char * reg_class_names[]; + #define REG_CAN_CHANGE_MODE_P(REGN, FROM, TO) \ + (targetm.can_change_mode_class (FROM, TO, REGNO_REG_CLASS (REGN))) + ++/* Return true if register REGNO is either fixed or call-used ++ (aka call-clobbered). */ ++ ++inline bool ++call_used_or_fixed_reg_p (unsigned int regno) ++{ ++ return fixed_regs[regno] || this_target_hard_regs->x_call_used_regs[regno]; ++} ++ + #endif /* ! 
GCC_HARD_REG_SET_H */ +diff --git a/gcc/hooks.c b/gcc/hooks.c +index f95659b38..98038860e 100644 +--- a/gcc/hooks.c ++++ b/gcc/hooks.c +@@ -140,9 +140,8 @@ hook_bool_puint64_puint64_true (poly_uint64, poly_uint64) + return true; + } + +-/* Generic hook that takes (unsigned int, machine_mode) and returns false. */ + bool +-hook_bool_insn_uint_mode_false (rtx_insn *, unsigned int, machine_mode) ++hook_bool_uint_uint_mode_false (unsigned int, unsigned int, machine_mode) + { + return false; + } +@@ -313,6 +312,12 @@ hook_bool_const_tree_false (const_tree) + return false; + } + ++bool ++hook_bool_const_tree_const_tree_true (const_tree, const_tree) ++{ ++ return true; ++} ++ + bool + hook_bool_tree_true (tree) + { +diff --git a/gcc/hooks.h b/gcc/hooks.h +index 0bc8117c2..b398d13ce 100644 +--- a/gcc/hooks.h ++++ b/gcc/hooks.h +@@ -40,11 +40,12 @@ extern bool hook_bool_const_rtx_insn_const_rtx_insn_true (const rtx_insn *, + extern bool hook_bool_mode_uhwi_false (machine_mode, + unsigned HOST_WIDE_INT); + extern bool hook_bool_puint64_puint64_true (poly_uint64, poly_uint64); +-extern bool hook_bool_insn_uint_mode_false (rtx_insn *, unsigned int, ++extern bool hook_bool_uint_uint_mode_false (unsigned int, unsigned int, + machine_mode); + extern bool hook_bool_uint_mode_true (unsigned int, machine_mode); + extern bool hook_bool_tree_false (tree); + extern bool hook_bool_const_tree_false (const_tree); ++extern bool hook_bool_const_tree_const_tree_true (const_tree, const_tree); + extern bool hook_bool_tree_true (tree); + extern bool hook_bool_const_tree_true (const_tree); + extern bool hook_bool_gsiptr_false (gimple_stmt_iterator *); +diff --git a/gcc/hw-doloop.c b/gcc/hw-doloop.c +index 2decece62..3ee0b4098 100644 +--- a/gcc/hw-doloop.c ++++ b/gcc/hw-doloop.c +@@ -141,7 +141,7 @@ scan_loop (hwloop_info loop) + CLEAR_HARD_REG_BIT (set_this_insn, REGNO (loop->iter_reg)); + else if (reg_mentioned_p (loop->iter_reg, PATTERN (insn))) + loop->iter_reg_used = true; +- IOR_HARD_REG_SET (loop->regs_set_in_loop, set_this_insn); ++ loop->regs_set_in_loop |= set_this_insn; + } + } + } +@@ -581,7 +581,7 @@ optimize_loop (hwloop_info loop, struct hw_doloop_hooks *hooks) + inner_depth = inner->depth; + /* The set of registers may be changed while optimizing the inner + loop. */ +- IOR_HARD_REG_SET (loop->regs_set_in_loop, inner->regs_set_in_loop); ++ loop->regs_set_in_loop |= inner->regs_set_in_loop; + } + + loop->depth = inner_depth + 1; +diff --git a/gcc/int-vector-builder.h b/gcc/int-vector-builder.h +index adf0904c5..dc9651021 100644 +--- a/gcc/int-vector-builder.h ++++ b/gcc/int-vector-builder.h +@@ -26,10 +26,11 @@ along with GCC; see the file COPYING3. If not see + encoding as tree and rtx constants. See vector_builder for more + details. */ + template +-class int_vector_builder : public vector_builder > ++class int_vector_builder : public vector_builder > + { +- typedef vector_builder parent; +- friend class vector_builder; ++ typedef vector_builder parent; ++ friend class vector_builder; + + public: + int_vector_builder () {} +@@ -45,6 +46,8 @@ private: + T apply_step (T, unsigned int, T) const; + bool can_elide_p (T) const { return true; } + void note_representative (T *, T) {} ++ ++ static poly_uint64 shape_nelts (poly_uint64 x) { return x; } + }; + + /* Create a new builder for a vector with FULL_NELTS elements. 
+diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c +index 21ecd5667..9753a12f3 100644 +--- a/gcc/internal-fn.c ++++ b/gcc/internal-fn.c +@@ -117,6 +117,7 @@ init_internal_fns () + #define while_direct { 0, 2, false } + #define fold_extract_direct { 2, 2, false } + #define fold_left_direct { 1, 1, false } ++#define mask_fold_left_direct { 1, 1, false } + + const direct_internal_fn_info direct_internal_fn_array[IFN_LAST + 1] = { + #define DEF_INTERNAL_FN(CODE, FLAGS, FNSPEC) not_direct, +@@ -3005,6 +3006,9 @@ expand_while_optab_fn (internal_fn, gcall *stmt, convert_optab optab) + #define expand_fold_left_optab_fn(FN, STMT, OPTAB) \ + expand_direct_optab_fn (FN, STMT, OPTAB, 2) + ++#define expand_mask_fold_left_optab_fn(FN, STMT, OPTAB) \ ++ expand_direct_optab_fn (FN, STMT, OPTAB, 3) ++ + /* RETURN_TYPE and ARGS are a return type and argument list that are + in principle compatible with FN (which satisfies direct_internal_fn_p). + Return the types that should be used to determine whether the +@@ -3093,6 +3097,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types, + #define direct_while_optab_supported_p convert_optab_supported_p + #define direct_fold_extract_optab_supported_p direct_optab_supported_p + #define direct_fold_left_optab_supported_p direct_optab_supported_p ++#define direct_mask_fold_left_optab_supported_p direct_optab_supported_p + + /* Return the optab used by internal function FN. */ + +@@ -3210,6 +3215,8 @@ first_commutative_argument (internal_fn fn) + case IFN_FNMS: + case IFN_AVG_FLOOR: + case IFN_AVG_CEIL: ++ case IFN_MULHS: ++ case IFN_MULHRS: + case IFN_FMIN: + case IFN_FMAX: + return 0; +@@ -3286,7 +3293,9 @@ static void (*const internal_fn_expanders[]) (internal_fn, gcall *) = { + T (MAX_EXPR, IFN_COND_MAX) \ + T (BIT_AND_EXPR, IFN_COND_AND) \ + T (BIT_IOR_EXPR, IFN_COND_IOR) \ +- T (BIT_XOR_EXPR, IFN_COND_XOR) ++ T (BIT_XOR_EXPR, IFN_COND_XOR) \ ++ T (LSHIFT_EXPR, IFN_COND_SHL) \ ++ T (RSHIFT_EXPR, IFN_COND_SHR) + + /* Return a function that only performs CODE when a certain condition is met + and that uses a given fallback value otherwise. 
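
The COND_SHL and COND_SHR entries added here follow the contract the comment above describes for every IFN_COND_* function: apply the operation lanewise where the mask is set and pass a fallback value through elsewhere, with COND_SHR expanding to an arithmetic or logical shift according to signedness. A scalar-loop sketch of that contract, using the hypothetical name cond_shl rather than GCC's vector representation, looks like this:

// Lanewise model of the conditional-shift contract:
// result[i] = mask[i] ? a[i] << shift[i] : fallback[i].
#include <cassert>
#include <cstdint>
#include <vector>

static std::vector<uint32_t>
cond_shl (const std::vector<bool> &mask,
          const std::vector<uint32_t> &a,
          const std::vector<uint32_t> &shift,
          const std::vector<uint32_t> &fallback)
{
  std::vector<uint32_t> r (a.size ());
  for (size_t i = 0; i < a.size (); ++i)
    r[i] = mask[i] ? a[i] << shift[i] : fallback[i];
  return r;
}

int
main ()
{
  std::vector<bool> m = {true, false, true};
  std::vector<uint32_t> a = {1, 2, 3}, s = {4, 5, 6}, f = {9, 9, 9};
  std::vector<uint32_t> r = cond_shl (m, a, s, f);
  assert (r[0] == 16 && r[1] == 9 && r[2] == 192);
  return 0;
}
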
For example, if CODE is +diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def +index e370eaa84..ae32fc7bd 100644 +--- a/gcc/internal-fn.def ++++ b/gcc/internal-fn.def +@@ -140,6 +140,8 @@ DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while) + DEF_INTERNAL_OPTAB_FN (VEC_SHL_INSERT, ECF_CONST | ECF_NOTHROW, + vec_shl_insert, binary) + ++DEF_INTERNAL_OPTAB_FN (DIV_POW2, ECF_CONST | ECF_NOTHROW, sdiv_pow2, binary) ++ + DEF_INTERNAL_OPTAB_FN (FMS, ECF_CONST, fms, ternary) + DEF_INTERNAL_OPTAB_FN (FNMA, ECF_CONST, fnma, ternary) + DEF_INTERNAL_OPTAB_FN (FNMS, ECF_CONST, fnms, ternary) +@@ -149,6 +151,11 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (AVG_FLOOR, ECF_CONST | ECF_NOTHROW, first, + DEF_INTERNAL_SIGNED_OPTAB_FN (AVG_CEIL, ECF_CONST | ECF_NOTHROW, first, + savg_ceil, uavg_ceil, binary) + ++DEF_INTERNAL_SIGNED_OPTAB_FN (MULHS, ECF_CONST | ECF_NOTHROW, first, ++ smulhs, umulhs, binary) ++DEF_INTERNAL_SIGNED_OPTAB_FN (MULHRS, ECF_CONST | ECF_NOTHROW, first, ++ smulhrs, umulhrs, binary) ++ + DEF_INTERNAL_OPTAB_FN (COND_ADD, ECF_CONST, cond_add, cond_binary) + DEF_INTERNAL_OPTAB_FN (COND_SUB, ECF_CONST, cond_sub, cond_binary) + DEF_INTERNAL_OPTAB_FN (COND_MUL, ECF_CONST, cond_smul, cond_binary) +@@ -167,6 +174,10 @@ DEF_INTERNAL_OPTAB_FN (COND_IOR, ECF_CONST | ECF_NOTHROW, + cond_ior, cond_binary) + DEF_INTERNAL_OPTAB_FN (COND_XOR, ECF_CONST | ECF_NOTHROW, + cond_xor, cond_binary) ++DEF_INTERNAL_OPTAB_FN (COND_SHL, ECF_CONST | ECF_NOTHROW, ++ cond_ashl, cond_binary) ++DEF_INTERNAL_SIGNED_OPTAB_FN (COND_SHR, ECF_CONST | ECF_NOTHROW, first, ++ cond_ashr, cond_lshr, cond_binary) + + DEF_INTERNAL_OPTAB_FN (COND_FMA, ECF_CONST, cond_fma, cond_ternary) + DEF_INTERNAL_OPTAB_FN (COND_FMS, ECF_CONST, cond_fms, cond_ternary) +@@ -199,6 +210,9 @@ DEF_INTERNAL_OPTAB_FN (FOLD_EXTRACT_LAST, ECF_CONST | ECF_NOTHROW, + DEF_INTERNAL_OPTAB_FN (FOLD_LEFT_PLUS, ECF_CONST | ECF_NOTHROW, + fold_left_plus, fold_left) + ++DEF_INTERNAL_OPTAB_FN (MASK_FOLD_LEFT_PLUS, ECF_CONST | ECF_NOTHROW, ++ mask_fold_left_plus, mask_fold_left) ++ + /* Unary math functions. 
*/ + DEF_INTERNAL_FLT_FN (ACOS, ECF_CONST, acos, unary) + DEF_INTERNAL_FLT_FN (ACOSH, ECF_CONST, acosh, unary) +@@ -217,6 +231,7 @@ DEF_INTERNAL_FLT_FN (LOG10, ECF_CONST, log10, unary) + DEF_INTERNAL_FLT_FN (LOG1P, ECF_CONST, log1p, unary) + DEF_INTERNAL_FLT_FN (LOG2, ECF_CONST, log2, unary) + DEF_INTERNAL_FLT_FN (LOGB, ECF_CONST, logb, unary) ++DEF_INTERNAL_FLT_FN (SIGNBIT, ECF_CONST, signbit, unary) + DEF_INTERNAL_FLT_FN (SIGNIFICAND, ECF_CONST, significand, unary) + DEF_INTERNAL_FLT_FN (SIN, ECF_CONST, sin, unary) + DEF_INTERNAL_FLT_FN (SINH, ECF_CONST, sinh, unary) +diff --git a/gcc/ipa-cp.c b/gcc/ipa-cp.c +index 8988a4e49..b9e2ef450 100644 +--- a/gcc/ipa-cp.c ++++ b/gcc/ipa-cp.c +@@ -2862,8 +2862,7 @@ ipa_get_indirect_edge_target_1 (struct cgraph_edge *ie, + if (can_refer) + { + if (!target +- || (TREE_CODE (TREE_TYPE (target)) == FUNCTION_TYPE +- && DECL_FUNCTION_CODE (target) == BUILT_IN_UNREACHABLE) ++ || fndecl_built_in_p (target, BUILT_IN_UNREACHABLE) + || !possible_polymorphic_call_target_p + (ie, cgraph_node::get (target))) + { +diff --git a/gcc/ipa-devirt.c b/gcc/ipa-devirt.c +index 2d8a0b383..df1ea21b4 100644 +--- a/gcc/ipa-devirt.c ++++ b/gcc/ipa-devirt.c +@@ -3576,12 +3576,10 @@ possible_polymorphic_call_target_p (tree otr_type, + { + vec targets; + unsigned int i; +- enum built_in_function fcode; + bool final; + +- if (TREE_CODE (TREE_TYPE (n->decl)) == FUNCTION_TYPE +- && ((fcode = DECL_FUNCTION_CODE (n->decl)) == BUILT_IN_UNREACHABLE +- || fcode == BUILT_IN_TRAP)) ++ if (fndecl_built_in_p (n->decl, BUILT_IN_UNREACHABLE) ++ || fndecl_built_in_p (n->decl, BUILT_IN_TRAP)) + return true; + + if (is_cxa_pure_virtual_p (n->decl)) +diff --git a/gcc/ipa-icf.c b/gcc/ipa-icf.c +index 568c6a452..8b6961486 100644 +--- a/gcc/ipa-icf.c ++++ b/gcc/ipa-icf.c +@@ -351,8 +351,8 @@ sem_item::compare_referenced_symbol_properties (symtab_node *used_by, + return return_false_with_msg ("inline attributes are different"); + } + +- if (DECL_IS_OPERATOR_NEW (n1->decl) +- != DECL_IS_OPERATOR_NEW (n2->decl)) ++ if (DECL_IS_OPERATOR_NEW_P (n1->decl) ++ != DECL_IS_OPERATOR_NEW_P (n2->decl)) + return return_false_with_msg ("operator new flags are different"); + } + +@@ -416,7 +416,7 @@ sem_item::hash_referenced_symbol_properties (symtab_node *ref, + hstate.add_flag (DECL_DISREGARD_INLINE_LIMITS (ref->decl)); + hstate.add_flag (DECL_DECLARED_INLINE_P (ref->decl)); + } +- hstate.add_flag (DECL_IS_OPERATOR_NEW (ref->decl)); ++ hstate.add_flag (DECL_IS_OPERATOR_NEW_P (ref->decl)); + } + else if (is_a (ref)) + { +diff --git a/gcc/ipa-inline.c b/gcc/ipa-inline.c +index a2fb20320..7c627eff8 100644 +--- a/gcc/ipa-inline.c ++++ b/gcc/ipa-inline.c +@@ -390,6 +390,28 @@ can_inline_edge_p (struct cgraph_edge *e, bool report, + return inlinable; + } + ++/* Return inlining_insns_single limit for function N */ ++ ++static int ++inline_insns_single (cgraph_node *n) ++{ ++ if (opt_for_fn (n->decl, optimize >= 3)) ++ return PARAM_VALUE (PARAM_MAX_INLINE_INSNS_SINGLE); ++ else ++ return PARAM_VALUE (PARAM_MAX_INLINE_INSNS_SINGLE_O2); ++} ++ ++/* Return inlining_insns_auto limit for function N */ ++ ++static int ++inline_insns_auto (cgraph_node *n) ++{ ++ if (opt_for_fn (n->decl, optimize >= 3)) ++ return PARAM_VALUE (PARAM_MAX_INLINE_INSNS_AUTO); ++ else ++ return PARAM_VALUE (PARAM_MAX_INLINE_INSNS_AUTO_O2); ++} ++ + /* Decide if we can inline the edge and possibly update + inline_failed reason. 
+ We check whether inlining is possible at all and whether +@@ -532,8 +554,8 @@ can_inline_edge_by_limits_p (struct cgraph_edge *e, bool report, + int growth = estimate_edge_growth (e); + if (growth > PARAM_VALUE (PARAM_MAX_INLINE_INSNS_SIZE) + && (!DECL_DECLARED_INLINE_P (callee->decl) +- && growth >= MAX (MAX_INLINE_INSNS_SINGLE, +- MAX_INLINE_INSNS_AUTO))) ++ && growth >= MAX (inline_insns_single (caller), ++ inline_insns_auto (caller)))) + { + e->inline_failed = CIF_OPTIMIZATION_MISMATCH; + inlinable = false; +@@ -641,6 +663,10 @@ want_early_inline_function_p (struct cgraph_edge *e) + { + int growth = estimate_edge_growth (e); + int n; ++ int early_inlining_insns = opt_for_fn (e->caller->decl, optimize) >= 3 ++ ? PARAM_VALUE (PARAM_EARLY_INLINING_INSNS) ++ : PARAM_VALUE (PARAM_EARLY_INLINING_INSNS_O2); ++ + + if (growth <= PARAM_VALUE (PARAM_MAX_INLINE_INSNS_SIZE)) + ; +@@ -654,26 +680,28 @@ want_early_inline_function_p (struct cgraph_edge *e) + growth); + want_inline = false; + } +- else if (growth > PARAM_VALUE (PARAM_EARLY_INLINING_INSNS)) ++ else if (growth > early_inlining_insns) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, e->call_stmt, + " will not early inline: %C->%C, " +- "growth %i exceeds --param early-inlining-insns\n", +- e->caller, callee, +- growth); ++ "growth %i exceeds --param early-inlining-insns%s\n", ++ e->caller, callee, growth, ++ opt_for_fn (e->caller->decl, optimize) >= 3 ++ ? "" : "-O2"); + want_inline = false; + } + else if ((n = num_calls (callee)) != 0 +- && growth * (n + 1) > PARAM_VALUE (PARAM_EARLY_INLINING_INSNS)) ++ && growth * (n + 1) > early_inlining_insns) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, e->call_stmt, + " will not early inline: %C->%C, " +- "growth %i exceeds --param early-inlining-insns " ++ "growth %i exceeds --param early-inlining-insns%s " + "divided by number of calls\n", +- e->caller, callee, +- growth); ++ e->caller, callee, growth, ++ opt_for_fn (e->caller->decl, optimize) >= 3 ++ ? "" : "-O2"); + want_inline = false; + } + } +@@ -739,9 +767,14 @@ big_speedup_p (struct cgraph_edge *e) + sreal spec_time = estimate_edge_time (e, &unspec_time); + sreal time = compute_uninlined_call_time (e, unspec_time); + sreal inlined_time = compute_inlined_call_time (e, spec_time); ++ cgraph_node *caller = (e->caller->inlined_to ++ ? e->caller->inlined_to ++ : e->caller); ++ int limit = opt_for_fn (caller->decl, optimize) >= 3 ++ ? 
PARAM_VALUE (PARAM_INLINE_MIN_SPEEDUP) ++ : PARAM_VALUE (PARAM_INLINE_MIN_SPEEDUP_O2); + +- if ((time - inlined_time) * 100 +- > (sreal) (time * PARAM_VALUE (PARAM_INLINE_MIN_SPEEDUP))) ++ if ((time - inlined_time) * 100 > time * limit) + return true; + return false; + } +@@ -775,20 +808,29 @@ want_inline_small_function_p (struct cgraph_edge *e, bool report) + && (!e->count.ipa ().initialized_p () || !e->maybe_hot_p ())) + && ipa_fn_summaries->get (callee)->min_size + - ipa_call_summaries->get (e)->call_stmt_size +- > MAX (MAX_INLINE_INSNS_SINGLE, MAX_INLINE_INSNS_AUTO)) ++ > MAX (inline_insns_single (e->caller), ++ inline_insns_auto (e->caller))) + { +- e->inline_failed = CIF_MAX_INLINE_INSNS_AUTO_LIMIT; ++ if (opt_for_fn (e->caller->decl, optimize) >= 3) ++ e->inline_failed = CIF_MAX_INLINE_INSNS_AUTO_LIMIT; ++ else ++ e->inline_failed = CIF_MAX_INLINE_INSNS_AUTO_O2_LIMIT; + want_inline = false; + } + else if ((DECL_DECLARED_INLINE_P (callee->decl) + || e->count.ipa ().nonzero_p ()) + && ipa_fn_summaries->get (callee)->min_size + - ipa_call_summaries->get (e)->call_stmt_size +- > 16 * MAX_INLINE_INSNS_SINGLE) ++ > 16 * inline_insns_single (e->caller)) + { +- e->inline_failed = (DECL_DECLARED_INLINE_P (callee->decl) +- ? CIF_MAX_INLINE_INSNS_SINGLE_LIMIT +- : CIF_MAX_INLINE_INSNS_AUTO_LIMIT); ++ if (opt_for_fn (e->caller->decl, optimize) >= 3) ++ e->inline_failed = (DECL_DECLARED_INLINE_P (callee->decl) ++ ? CIF_MAX_INLINE_INSNS_SINGLE_LIMIT ++ : CIF_MAX_INLINE_INSNS_AUTO_LIMIT); ++ else ++ e->inline_failed = (DECL_DECLARED_INLINE_P (callee->decl) ++ ? CIF_MAX_INLINE_INSNS_SINGLE_O2_LIMIT ++ : CIF_MAX_INLINE_INSNS_AUTO_O2_LIMIT); + want_inline = false; + } + else +@@ -802,15 +844,18 @@ want_inline_small_function_p (struct cgraph_edge *e, bool report) + /* Apply MAX_INLINE_INSNS_SINGLE limit. Do not do so when + hints suggests that inlining given function is very profitable. */ + else if (DECL_DECLARED_INLINE_P (callee->decl) +- && growth >= MAX_INLINE_INSNS_SINGLE +- && (growth >= MAX_INLINE_INSNS_SINGLE * 16 ++ && growth >= inline_insns_single (e->caller) ++ && (growth >= inline_insns_single (e->caller) * 16 + || (!(hints & (INLINE_HINT_indirect_call + | INLINE_HINT_known_hot + | INLINE_HINT_loop_iterations + | INLINE_HINT_loop_stride)) + && !(big_speedup = big_speedup_p (e))))) + { +- e->inline_failed = CIF_MAX_INLINE_INSNS_SINGLE_LIMIT; ++ if (opt_for_fn (e->caller->decl, optimize) >= 3) ++ e->inline_failed = CIF_MAX_INLINE_INSNS_SINGLE_LIMIT; ++ else ++ e->inline_failed = CIF_MAX_INLINE_INSNS_SINGLE_O2_LIMIT; + want_inline = false; + } + else if (!DECL_DECLARED_INLINE_P (callee->decl) +@@ -818,7 +863,7 @@ want_inline_small_function_p (struct cgraph_edge *e, bool report) + && growth >= PARAM_VALUE (PARAM_MAX_INLINE_INSNS_SMALL)) + { + /* growth_likely_positive is expensive, always test it last. */ +- if (growth >= MAX_INLINE_INSNS_SINGLE ++ if (growth >= inline_insns_single (e->caller) + || growth_likely_positive (callee, growth)) + { + e->inline_failed = CIF_NOT_DECLARED_INLINED; +@@ -833,22 +878,25 @@ want_inline_small_function_p (struct cgraph_edge *e, bool report) + && growth >= ((hints & (INLINE_HINT_indirect_call + | INLINE_HINT_loop_iterations + | INLINE_HINT_loop_stride)) +- ? MAX (MAX_INLINE_INSNS_AUTO, +- MAX_INLINE_INSNS_SINGLE) +- : MAX_INLINE_INSNS_AUTO) ++ ? MAX (inline_insns_auto (e->caller), ++ inline_insns_single (e->caller)) ++ : inline_insns_auto (e->caller)) + && !(big_speedup == -1 ? 
big_speedup_p (e) : big_speedup)) + { + /* growth_likely_positive is expensive, always test it last. */ +- if (growth >= MAX_INLINE_INSNS_SINGLE ++ if (growth >= inline_insns_single (e->caller) + || growth_likely_positive (callee, growth)) + { +- e->inline_failed = CIF_MAX_INLINE_INSNS_AUTO_LIMIT; ++ if (opt_for_fn (e->caller->decl, optimize) >= 3) ++ e->inline_failed = CIF_MAX_INLINE_INSNS_AUTO_LIMIT; ++ else ++ e->inline_failed = CIF_MAX_INLINE_INSNS_AUTO_O2_LIMIT; + want_inline = false; + } + } + /* If call is cold, do not inline when function body would grow. */ + else if (!e->maybe_hot_p () +- && (growth >= MAX_INLINE_INSNS_SINGLE ++ && (growth >= inline_insns_single (e->caller) + || growth_likely_positive (callee, growth))) + { + e->inline_failed = CIF_UNLIKELY_CALL; +@@ -1157,7 +1205,7 @@ edge_badness (struct cgraph_edge *edge, bool dump) + && caller_info->inlinable + && ipa_size_summaries->get (caller)->size + < (DECL_DECLARED_INLINE_P (caller->decl) +- ? MAX_INLINE_INSNS_SINGLE : MAX_INLINE_INSNS_AUTO)) ++ ? inline_insns_single (caller) : inline_insns_auto (caller))) + { + if (dump) + fprintf (dump_file, +diff --git a/gcc/ipa-param-manipulation.c b/gcc/ipa-param-manipulation.c +index 037253a87..1af6d050c 100644 +--- a/gcc/ipa-param-manipulation.c ++++ b/gcc/ipa-param-manipulation.c +@@ -219,10 +219,7 @@ ipa_modify_formal_parameters (tree fndecl, ipa_parm_adjustment_vec adjustments) + + /* When signature changes, we need to clear builtin info. */ + if (fndecl_built_in_p (fndecl)) +- { +- DECL_BUILT_IN_CLASS (fndecl) = NOT_BUILT_IN; +- DECL_FUNCTION_CODE (fndecl) = (enum built_in_function) 0; +- } ++ set_decl_built_in_function (fndecl, NOT_BUILT_IN, 0); + + TREE_TYPE (fndecl) = new_type; + DECL_VIRTUAL_P (fndecl) = 0; +@@ -452,14 +449,7 @@ ipa_modify_call_arguments (struct cgraph_edge *cs, gcall *stmt, + gimple_call_set_chain (new_stmt, gimple_call_chain (stmt)); + gimple_call_copy_flags (new_stmt, stmt); + if (gimple_in_ssa_p (cfun)) +- { +- gimple_set_vuse (new_stmt, gimple_vuse (stmt)); +- if (gimple_vdef (stmt)) +- { +- gimple_set_vdef (new_stmt, gimple_vdef (stmt)); +- SSA_NAME_DEF_STMT (gimple_vdef (new_stmt)) = new_stmt; +- } +- } ++ gimple_move_vops (new_stmt, stmt); + + if (dump_file && (dump_flags & TDF_DETAILS)) + { +diff --git a/gcc/ipa-prop.c b/gcc/ipa-prop.c +index 0439ce0c5..a70319505 100644 +--- a/gcc/ipa-prop.c ++++ b/gcc/ipa-prop.c +@@ -3685,8 +3685,7 @@ try_make_edge_direct_virtual_call (struct cgraph_edge *ie, + if (can_refer) + { + if (!t +- || (TREE_CODE (TREE_TYPE (t)) == FUNCTION_TYPE +- && DECL_FUNCTION_CODE (t) == BUILT_IN_UNREACHABLE) ++ || fndecl_built_in_p (t, BUILT_IN_UNREACHABLE) + || !possible_polymorphic_call_target_p + (ie, cgraph_node::get (t))) + { +diff --git a/gcc/ipa-split.c b/gcc/ipa-split.c +index 5eaf8257f..aef2fa53c 100644 +--- a/gcc/ipa-split.c ++++ b/gcc/ipa-split.c +@@ -1348,10 +1348,7 @@ split_function (basic_block return_bb, struct split_point *split_point, + changes. For partial inlining we however cannot expect the part + of builtin implementation to have same semantic as the whole. */ + if (fndecl_built_in_p (node->decl)) +- { +- DECL_BUILT_IN_CLASS (node->decl) = NOT_BUILT_IN; +- DECL_FUNCTION_CODE (node->decl) = (enum built_in_function) 0; +- } ++ set_decl_built_in_function (node->decl, NOT_BUILT_IN, 0); + + /* If return_bb contains any clobbers that refer to SSA_NAMEs + set in the split part, remove them. 
Also reset debug stmts that +diff --git a/gcc/ira-build.c b/gcc/ira-build.c +index 83caa3a8e..55c552679 100644 +--- a/gcc/ira-build.c ++++ b/gcc/ira-build.c +@@ -456,12 +456,10 @@ ira_create_object (ira_allocno_t a, int subword) + OBJECT_CONFLICT_VEC_P (obj) = false; + OBJECT_CONFLICT_ARRAY (obj) = NULL; + OBJECT_NUM_CONFLICTS (obj) = 0; +- COPY_HARD_REG_SET (OBJECT_CONFLICT_HARD_REGS (obj), ira_no_alloc_regs); +- COPY_HARD_REG_SET (OBJECT_TOTAL_CONFLICT_HARD_REGS (obj), ira_no_alloc_regs); +- IOR_COMPL_HARD_REG_SET (OBJECT_CONFLICT_HARD_REGS (obj), +- reg_class_contents[aclass]); +- IOR_COMPL_HARD_REG_SET (OBJECT_TOTAL_CONFLICT_HARD_REGS (obj), +- reg_class_contents[aclass]); ++ OBJECT_CONFLICT_HARD_REGS (obj) = ira_no_alloc_regs; ++ OBJECT_TOTAL_CONFLICT_HARD_REGS (obj) = ira_no_alloc_regs; ++ OBJECT_CONFLICT_HARD_REGS (obj) |= ~reg_class_contents[aclass]; ++ OBJECT_TOTAL_CONFLICT_HARD_REGS (obj) |= ~reg_class_contents[aclass]; + OBJECT_MIN (obj) = INT_MAX; + OBJECT_MAX (obj) = -1; + OBJECT_LIVE_RANGES (obj) = NULL; +@@ -549,10 +547,8 @@ ira_set_allocno_class (ira_allocno_t a, enum reg_class aclass) + ALLOCNO_CLASS (a) = aclass; + FOR_EACH_ALLOCNO_OBJECT (a, obj, oi) + { +- IOR_COMPL_HARD_REG_SET (OBJECT_CONFLICT_HARD_REGS (obj), +- reg_class_contents[aclass]); +- IOR_COMPL_HARD_REG_SET (OBJECT_TOTAL_CONFLICT_HARD_REGS (obj), +- reg_class_contents[aclass]); ++ OBJECT_CONFLICT_HARD_REGS (obj) |= ~reg_class_contents[aclass]; ++ OBJECT_TOTAL_CONFLICT_HARD_REGS (obj) |= ~reg_class_contents[aclass]; + } + } + +@@ -602,10 +598,10 @@ merge_hard_reg_conflicts (ira_allocno_t from, ira_allocno_t to, + ira_object_t to_obj = ALLOCNO_OBJECT (to, i); + + if (!total_only) +- IOR_HARD_REG_SET (OBJECT_CONFLICT_HARD_REGS (to_obj), +- OBJECT_CONFLICT_HARD_REGS (from_obj)); +- IOR_HARD_REG_SET (OBJECT_TOTAL_CONFLICT_HARD_REGS (to_obj), +- OBJECT_TOTAL_CONFLICT_HARD_REGS (from_obj)); ++ OBJECT_CONFLICT_HARD_REGS (to_obj) ++ |= OBJECT_CONFLICT_HARD_REGS (from_obj); ++ OBJECT_TOTAL_CONFLICT_HARD_REGS (to_obj) ++ |= OBJECT_TOTAL_CONFLICT_HARD_REGS (from_obj); + } + #ifdef STACK_REGS + if (!total_only && ALLOCNO_NO_STACK_REG_P (from)) +@@ -618,15 +614,15 @@ merge_hard_reg_conflicts (ira_allocno_t from, ira_allocno_t to, + /* Update hard register conflict information for all objects associated with + A to include the regs in SET. 
*/ + void +-ior_hard_reg_conflicts (ira_allocno_t a, HARD_REG_SET *set) ++ior_hard_reg_conflicts (ira_allocno_t a, const_hard_reg_set set) + { + ira_allocno_object_iterator i; + ira_object_t obj; + + FOR_EACH_ALLOCNO_OBJECT (a, obj, i) + { +- IOR_HARD_REG_SET (OBJECT_CONFLICT_HARD_REGS (obj), *set); +- IOR_HARD_REG_SET (OBJECT_TOTAL_CONFLICT_HARD_REGS (obj), *set); ++ OBJECT_CONFLICT_HARD_REGS (obj) |= set; ++ OBJECT_TOTAL_CONFLICT_HARD_REGS (obj) |= set; + } + } + +@@ -907,8 +903,9 @@ create_cap_allocno (ira_allocno_t a) + + ALLOCNO_CALLS_CROSSED_NUM (cap) = ALLOCNO_CALLS_CROSSED_NUM (a); + ALLOCNO_CHEAP_CALLS_CROSSED_NUM (cap) = ALLOCNO_CHEAP_CALLS_CROSSED_NUM (a); +- IOR_HARD_REG_SET (ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (cap), +- ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (a)); ++ ALLOCNO_CROSSED_CALLS_ABIS (cap) = ALLOCNO_CROSSED_CALLS_ABIS (a); ++ ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (cap) ++ = ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (a); + if (internal_flag_ira_verbose > 2 && ira_dump_file != NULL) + { + fprintf (ira_dump_file, " Creating cap "); +@@ -1876,11 +1873,6 @@ create_insn_allocnos (rtx x, rtx outer, bool output_p) + create_insn_allocnos (XEXP (x, 0), NULL, true); + return; + } +- else if (code == CLOBBER_HIGH) +- { +- gcc_assert (REG_P (XEXP (x, 0)) && HARD_REGISTER_P (XEXP (x, 0))); +- return; +- } + else if (code == MEM) + { + create_insn_allocnos (XEXP (x, 0), NULL, false); +@@ -2036,8 +2028,10 @@ propagate_allocno_info (void) + += ALLOCNO_CALLS_CROSSED_NUM (a); + ALLOCNO_CHEAP_CALLS_CROSSED_NUM (parent_a) + += ALLOCNO_CHEAP_CALLS_CROSSED_NUM (a); +- IOR_HARD_REG_SET (ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (parent_a), +- ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (a)); ++ ALLOCNO_CROSSED_CALLS_ABIS (parent_a) ++ |= ALLOCNO_CROSSED_CALLS_ABIS (a); ++ ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (parent_a) ++ |= ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (a); + ALLOCNO_EXCESS_PRESSURE_POINTS_NUM (parent_a) + += ALLOCNO_EXCESS_PRESSURE_POINTS_NUM (a); + aclass = ALLOCNO_CLASS (a); +@@ -2419,8 +2413,9 @@ propagate_some_info_from_allocno (ira_allocno_t a, ira_allocno_t from_a) + ALLOCNO_CALLS_CROSSED_NUM (a) += ALLOCNO_CALLS_CROSSED_NUM (from_a); + ALLOCNO_CHEAP_CALLS_CROSSED_NUM (a) + += ALLOCNO_CHEAP_CALLS_CROSSED_NUM (from_a); +- IOR_HARD_REG_SET (ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (a), +- ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (from_a)); ++ ALLOCNO_CROSSED_CALLS_ABIS (a) |= ALLOCNO_CROSSED_CALLS_ABIS (from_a); ++ ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (a) ++ |= ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (from_a); + + ALLOCNO_EXCESS_PRESSURE_POINTS_NUM (a) + += ALLOCNO_EXCESS_PRESSURE_POINTS_NUM (from_a); +@@ -2569,8 +2564,8 @@ remove_low_level_allocnos (void) + ALLOCNO_NEXT_REGNO_ALLOCNO (a) = NULL; + ALLOCNO_CAP_MEMBER (a) = NULL; + FOR_EACH_ALLOCNO_OBJECT (a, obj, oi) +- COPY_HARD_REG_SET (OBJECT_CONFLICT_HARD_REGS (obj), +- OBJECT_TOTAL_CONFLICT_HARD_REGS (obj)); ++ OBJECT_CONFLICT_HARD_REGS (obj) ++ = OBJECT_TOTAL_CONFLICT_HARD_REGS (obj); + #ifdef STACK_REGS + if (ALLOCNO_TOTAL_NO_STACK_REG_P (a)) + ALLOCNO_NO_STACK_REG_P (a) = true; +@@ -3060,8 +3055,10 @@ copy_info_to_removed_store_destinations (int regno) + += ALLOCNO_CALLS_CROSSED_NUM (a); + ALLOCNO_CHEAP_CALLS_CROSSED_NUM (parent_a) + += ALLOCNO_CHEAP_CALLS_CROSSED_NUM (a); +- IOR_HARD_REG_SET (ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (parent_a), +- ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (a)); ++ ALLOCNO_CROSSED_CALLS_ABIS (parent_a) ++ |= ALLOCNO_CROSSED_CALLS_ABIS (a); ++ ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (parent_a) ++ |= 
ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (a); + ALLOCNO_EXCESS_PRESSURE_POINTS_NUM (parent_a) + += ALLOCNO_EXCESS_PRESSURE_POINTS_NUM (a); + merged_p = true; +@@ -3108,8 +3105,8 @@ ira_flattening (int max_regno_before_emit, int ira_max_point_before_emit) + flattening. */ + continue; + FOR_EACH_ALLOCNO_OBJECT (a, obj, oi) +- COPY_HARD_REG_SET (OBJECT_TOTAL_CONFLICT_HARD_REGS (obj), +- OBJECT_CONFLICT_HARD_REGS (obj)); ++ OBJECT_TOTAL_CONFLICT_HARD_REGS (obj) ++ = OBJECT_CONFLICT_HARD_REGS (obj); + #ifdef STACK_REGS + ALLOCNO_TOTAL_NO_STACK_REG_P (a) = ALLOCNO_NO_STACK_REG_P (a); + #endif +@@ -3159,6 +3156,9 @@ ira_flattening (int max_regno_before_emit, int ira_max_point_before_emit) + -= ALLOCNO_CALLS_CROSSED_NUM (a); + ALLOCNO_CHEAP_CALLS_CROSSED_NUM (parent_a) + -= ALLOCNO_CHEAP_CALLS_CROSSED_NUM (a); ++ /* Assume that ALLOCNO_CROSSED_CALLS_ABIS and ++ ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS stay the same. ++ We'd need to rebuild the IR to do better. */ + ALLOCNO_EXCESS_PRESSURE_POINTS_NUM (parent_a) + -= ALLOCNO_EXCESS_PRESSURE_POINTS_NUM (a); + ira_assert (ALLOCNO_CALLS_CROSSED_NUM (parent_a) >= 0 +@@ -3466,7 +3466,7 @@ ira_build (void) + allocno crossing calls. */ + FOR_EACH_ALLOCNO (a, ai) + if (ALLOCNO_CALLS_CROSSED_NUM (a) != 0) +- ior_hard_reg_conflicts (a, &call_used_reg_set); ++ ior_hard_reg_conflicts (a, ira_need_caller_save_regs (a)); + } + if (internal_flag_ira_verbose > 2 && ira_dump_file != NULL) + print_copies (ira_dump_file); +diff --git a/gcc/ira-color.c b/gcc/ira-color.c +index 8a90ae1b4..62499be91 100644 +--- a/gcc/ira-color.c ++++ b/gcc/ira-color.c +@@ -218,7 +218,7 @@ inline bool + allocno_hard_regs_hasher::equal (const allocno_hard_regs *hv1, + const allocno_hard_regs *hv2) + { +- return hard_reg_set_equal_p (hv1->set, hv2->set); ++ return hv1->set == hv2->set; + } + + /* Hash table of unique allocno hard registers. */ +@@ -261,14 +261,14 @@ add_allocno_hard_regs (HARD_REG_SET set, int64_t cost) + allocno_hard_regs_t hv; + + gcc_assert (! 
hard_reg_set_empty_p (set)); +- COPY_HARD_REG_SET (temp.set, set); ++ temp.set = set; + if ((hv = find_hard_regs (&temp)) != NULL) + hv->cost += cost; + else + { + hv = ((struct allocno_hard_regs *) + ira_allocate (sizeof (struct allocno_hard_regs))); +- COPY_HARD_REG_SET (hv->set, set); ++ hv->set = set; + hv->cost = cost; + allocno_hard_regs_vec.safe_push (hv); + insert_hard_regs (hv); +@@ -371,7 +371,7 @@ add_allocno_hard_regs_to_forest (allocno_hard_regs_node_t *roots, + start = hard_regs_node_vec.length (); + for (node = *roots; node != NULL; node = node->next) + { +- if (hard_reg_set_equal_p (hv->set, node->hard_regs->set)) ++ if (hv->set == node->hard_regs->set) + return; + if (hard_reg_set_subset_p (hv->set, node->hard_regs->set)) + { +@@ -382,8 +382,7 @@ add_allocno_hard_regs_to_forest (allocno_hard_regs_node_t *roots, + hard_regs_node_vec.safe_push (node); + else if (hard_reg_set_intersect_p (hv->set, node->hard_regs->set)) + { +- COPY_HARD_REG_SET (temp_set, hv->set); +- AND_HARD_REG_SET (temp_set, node->hard_regs->set); ++ temp_set = hv->set & node->hard_regs->set; + hv2 = add_allocno_hard_regs (temp_set, hv->cost); + add_allocno_hard_regs_to_forest (&node->first, hv2); + } +@@ -398,7 +397,7 @@ add_allocno_hard_regs_to_forest (allocno_hard_regs_node_t *roots, + i++) + { + node = hard_regs_node_vec[i]; +- IOR_HARD_REG_SET (temp_set, node->hard_regs->set); ++ temp_set |= node->hard_regs->set; + } + hv = add_allocno_hard_regs (temp_set, hv->cost); + new_node = create_new_allocno_hard_regs_node (hv); +@@ -717,8 +716,7 @@ form_allocno_hard_regs_nodes_forest (void) + (allocno_data->profitable_hard_regs, + ALLOCNO_MEMORY_COST (a) - ALLOCNO_CLASS_COST (a))); + } +- SET_HARD_REG_SET (temp); +- AND_COMPL_HARD_REG_SET (temp, ira_no_alloc_regs); ++ temp = ~ira_no_alloc_regs; + add_allocno_hard_regs (temp, 0); + qsort (allocno_hard_regs_vec.address () + start, + allocno_hard_regs_vec.length () - start, +@@ -833,10 +831,10 @@ setup_left_conflict_sizes_p (ira_allocno_t a) + nobj = ALLOCNO_NUM_OBJECTS (a); + data = ALLOCNO_COLOR_DATA (a); + subnodes = allocno_hard_regs_subnodes + data->hard_regs_subnodes_start; +- COPY_HARD_REG_SET (profitable_hard_regs, data->profitable_hard_regs); ++ profitable_hard_regs = data->profitable_hard_regs; + node = data->hard_regs_node; + node_preorder_num = node->preorder_num; +- COPY_HARD_REG_SET (node_set, node->hard_regs->set); ++ node_set = node->hard_regs->set; + node_check_tick++; + for (k = 0; k < nobj; k++) + { +@@ -859,7 +857,7 @@ setup_left_conflict_sizes_p (ira_allocno_t a) + ->profitable_hard_regs)) + continue; + conflict_node = conflict_data->hard_regs_node; +- COPY_HARD_REG_SET (conflict_node_set, conflict_node->hard_regs->set); ++ conflict_node_set = conflict_node->hard_regs->set; + if (hard_reg_set_subset_p (node_set, conflict_node_set)) + temp_node = node; + else +@@ -897,8 +895,7 @@ setup_left_conflict_sizes_p (ira_allocno_t a) + int j, n, hard_regno; + enum reg_class aclass; + +- COPY_HARD_REG_SET (temp_set, temp_node->hard_regs->set); +- AND_HARD_REG_SET (temp_set, profitable_hard_regs); ++ temp_set = temp_node->hard_regs->set & profitable_hard_regs; + aclass = ALLOCNO_CLASS (a); + for (n = 0, j = ira_class_hard_regs_num[aclass] - 1; j >= 0; j--) + { +@@ -1042,15 +1039,15 @@ setup_profitable_hard_regs (void) + else + { + mode = ALLOCNO_MODE (a); +- COPY_HARD_REG_SET (data->profitable_hard_regs, +- ira_useful_class_mode_regs[aclass][mode]); ++ data->profitable_hard_regs ++ = ira_useful_class_mode_regs[aclass][mode]; + nobj = ALLOCNO_NUM_OBJECTS 
(a); + for (k = 0; k < nobj; k++) + { + ira_object_t obj = ALLOCNO_OBJECT (a, k); + +- AND_COMPL_HARD_REG_SET (data->profitable_hard_regs, +- OBJECT_TOTAL_CONFLICT_HARD_REGS (obj)); ++ data->profitable_hard_regs ++ &= ~OBJECT_TOTAL_CONFLICT_HARD_REGS (obj); + } + } + } +@@ -1091,9 +1088,8 @@ setup_profitable_hard_regs (void) + hard_regno + num); + } + else +- AND_COMPL_HARD_REG_SET +- (ALLOCNO_COLOR_DATA (conflict_a)->profitable_hard_regs, +- ira_reg_mode_hard_regset[hard_regno][mode]); ++ ALLOCNO_COLOR_DATA (conflict_a)->profitable_hard_regs ++ &= ~ira_reg_mode_hard_regset[hard_regno][mode]; + } + } + } +@@ -1589,20 +1585,15 @@ get_conflict_and_start_profitable_regs (ira_allocno_t a, bool retry_p, + for (i = 0; i < nwords; i++) + { + obj = ALLOCNO_OBJECT (a, i); +- COPY_HARD_REG_SET (conflict_regs[i], +- OBJECT_TOTAL_CONFLICT_HARD_REGS (obj)); ++ conflict_regs[i] = OBJECT_TOTAL_CONFLICT_HARD_REGS (obj); + } + if (retry_p) +- { +- COPY_HARD_REG_SET (*start_profitable_regs, +- reg_class_contents[ALLOCNO_CLASS (a)]); +- AND_COMPL_HARD_REG_SET (*start_profitable_regs, +- ira_prohibited_class_mode_regs +- [ALLOCNO_CLASS (a)][ALLOCNO_MODE (a)]); +- } ++ *start_profitable_regs ++ = (reg_class_contents[ALLOCNO_CLASS (a)] ++ &~ (ira_prohibited_class_mode_regs ++ [ALLOCNO_CLASS (a)][ALLOCNO_MODE (a)])); + else +- COPY_HARD_REG_SET (*start_profitable_regs, +- ALLOCNO_COLOR_DATA (a)->profitable_hard_regs); ++ *start_profitable_regs = ALLOCNO_COLOR_DATA (a)->profitable_hard_regs; + } + + /* Return true if HARD_REGNO is ok for assigning to allocno A with +@@ -1659,7 +1650,7 @@ calculate_saved_nregs (int hard_regno, machine_mode mode) + ira_assert (hard_regno >= 0); + for (i = hard_regno_nregs (hard_regno, mode) - 1; i >= 0; i--) + if (!allocated_hardreg_p[hard_regno + i] +- && !TEST_HARD_REG_BIT (call_used_reg_set, hard_regno + i) ++ && !crtl->abi->clobbers_full_reg_p (hard_regno + i) + && !LOCAL_REGNO (hard_regno + i)) + nregs++; + return nregs; +@@ -1803,9 +1794,8 @@ assign_hard_reg (ira_allocno_t a, bool retry_p) + hard_regno + num); + } + else +- IOR_HARD_REG_SET +- (conflicting_regs[word], +- ira_reg_mode_hard_regset[hard_regno][mode]); ++ conflicting_regs[word] ++ |= ira_reg_mode_hard_regset[hard_regno][mode]; + if (hard_reg_set_subset_p (profitable_hard_regs, + conflicting_regs[word])) + goto fail; +@@ -2698,8 +2688,7 @@ setup_allocno_available_regs_num (ira_allocno_t a) + reg_class_names[aclass], ira_class_hard_regs_num[aclass], n); + print_hard_reg_set (ira_dump_file, data->profitable_hard_regs, false); + fprintf (ira_dump_file, ", %snode: ", +- hard_reg_set_equal_p (data->profitable_hard_regs, +- data->hard_regs_node->hard_regs->set) ++ data->profitable_hard_regs == data->hard_regs_node->hard_regs->set + ? "" : "^"); + print_hard_reg_set (ira_dump_file, + data->hard_regs_node->hard_regs->set, false); +@@ -4387,11 +4376,10 @@ allocno_reload_assign (ira_allocno_t a, HARD_REG_SET forbidden_regs) + for (i = 0; i < n; i++) + { + ira_object_t obj = ALLOCNO_OBJECT (a, i); +- COPY_HARD_REG_SET (saved[i], OBJECT_TOTAL_CONFLICT_HARD_REGS (obj)); +- IOR_HARD_REG_SET (OBJECT_TOTAL_CONFLICT_HARD_REGS (obj), forbidden_regs); ++ saved[i] = OBJECT_TOTAL_CONFLICT_HARD_REGS (obj); ++ OBJECT_TOTAL_CONFLICT_HARD_REGS (obj) |= forbidden_regs; + if (! 
flag_caller_saves && ALLOCNO_CALLS_CROSSED_NUM (a) != 0) +- IOR_HARD_REG_SET (OBJECT_TOTAL_CONFLICT_HARD_REGS (obj), +- call_used_reg_set); ++ OBJECT_TOTAL_CONFLICT_HARD_REGS (obj) |= ira_need_caller_save_regs (a); + } + ALLOCNO_ASSIGNED_P (a) = false; + aclass = ALLOCNO_CLASS (a); +@@ -4410,9 +4398,7 @@ allocno_reload_assign (ira_allocno_t a, HARD_REG_SET forbidden_regs) + ? ALLOCNO_CLASS_COST (a) + : ALLOCNO_HARD_REG_COSTS (a)[ira_class_hard_reg_index + [aclass][hard_regno]])); +- if (ALLOCNO_CALLS_CROSSED_NUM (a) != 0 +- && ira_hard_reg_set_intersection_p (hard_regno, ALLOCNO_MODE (a), +- call_used_reg_set)) ++ if (ira_need_caller_save_p (a, regno)) + { + ira_assert (flag_caller_saves); + caller_save_needed = 1; +@@ -4434,7 +4420,7 @@ allocno_reload_assign (ira_allocno_t a, HARD_REG_SET forbidden_regs) + for (i = 0; i < n; i++) + { + ira_object_t obj = ALLOCNO_OBJECT (a, i); +- COPY_HARD_REG_SET (OBJECT_TOTAL_CONFLICT_HARD_REGS (obj), saved[i]); ++ OBJECT_TOTAL_CONFLICT_HARD_REGS (obj) = saved[i]; + } + return reg_renumber[regno] >= 0; + } +@@ -4519,9 +4505,9 @@ ira_reassign_pseudos (int *spilled_pseudo_regs, int num, + for (i = 0; i < num; i++) + { + regno = spilled_pseudo_regs[i]; +- COPY_HARD_REG_SET (forbidden_regs, bad_spill_regs); +- IOR_HARD_REG_SET (forbidden_regs, pseudo_forbidden_regs[regno]); +- IOR_HARD_REG_SET (forbidden_regs, pseudo_previous_regs[regno]); ++ forbidden_regs = (bad_spill_regs ++ | pseudo_forbidden_regs[regno] ++ | pseudo_previous_regs[regno]); + gcc_assert (reg_renumber[regno] < 0); + a = ira_regno_allocno_map[regno]; + ira_mark_allocation_change (regno); +@@ -4699,16 +4685,16 @@ ira_mark_new_stack_slot (rtx x, int regno, poly_uint64 total_size) + given IN and OUT for INSN. Return also number points (through + EXCESS_PRESSURE_LIVE_LENGTH) where the pseudo-register lives and + the register pressure is high, number of references of the +- pseudo-registers (through NREFS), number of callee-clobbered +- hard-registers occupied by the pseudo-registers (through +- CALL_USED_COUNT), and the first hard regno occupied by the ++ pseudo-registers (through NREFS), the number of psuedo registers ++ whose allocated register wouldn't need saving in the prologue ++ (through CALL_USED_COUNT), and the first hard regno occupied by the + pseudo-registers (through FIRST_HARD_REGNO). */ + static int + calculate_spill_cost (int *regnos, rtx in, rtx out, rtx_insn *insn, + int *excess_pressure_live_length, + int *nrefs, int *call_used_count, int *first_hard_regno) + { +- int i, cost, regno, hard_regno, j, count, saved_cost, nregs; ++ int i, cost, regno, hard_regno, count, saved_cost; + bool in_p, out_p; + int length; + ira_allocno_t a; +@@ -4725,11 +4711,8 @@ calculate_spill_cost (int *regnos, rtx in, rtx out, rtx_insn *insn, + a = ira_regno_allocno_map[regno]; + length += ALLOCNO_EXCESS_PRESSURE_POINTS_NUM (a) / ALLOCNO_NUM_OBJECTS (a); + cost += ALLOCNO_MEMORY_COST (a) - ALLOCNO_CLASS_COST (a); +- nregs = hard_regno_nregs (hard_regno, ALLOCNO_MODE (a)); +- for (j = 0; j < nregs; j++) +- if (! 
TEST_HARD_REG_BIT (call_used_reg_set, hard_regno + j)) +- break; +- if (j == nregs) ++ if (in_hard_reg_set_p (crtl->abi->full_reg_clobbers (), ++ ALLOCNO_MODE (a), hard_regno)) + count++; + in_p = in && REG_P (in) && (int) REGNO (in) == hard_regno; + out_p = out && REG_P (out) && (int) REGNO (out) == hard_regno; +@@ -4886,11 +4869,10 @@ fast_allocation (void) + for (l = 0; l < nr; l++) + { + ira_object_t obj = ALLOCNO_OBJECT (a, l); +- IOR_HARD_REG_SET (conflict_hard_regs, +- OBJECT_CONFLICT_HARD_REGS (obj)); ++ conflict_hard_regs |= OBJECT_CONFLICT_HARD_REGS (obj); + for (r = OBJECT_LIVE_RANGES (obj); r != NULL; r = r->next) + for (j = r->start; j <= r->finish; j++) +- IOR_HARD_REG_SET (conflict_hard_regs, used_hard_regs[j]); ++ conflict_hard_regs |= used_hard_regs[j]; + } + aclass = ALLOCNO_CLASS (a); + ALLOCNO_ASSIGNED_P (a) = true; +@@ -4938,8 +4920,7 @@ fast_allocation (void) + ira_object_t obj = ALLOCNO_OBJECT (a, l); + for (r = OBJECT_LIVE_RANGES (obj); r != NULL; r = r->next) + for (k = r->start; k <= r->finish; k++) +- IOR_HARD_REG_SET (used_hard_regs[k], +- ira_reg_mode_hard_regset[hard_regno][mode]); ++ used_hard_regs[k] |= ira_reg_mode_hard_regset[hard_regno][mode]; + } + } + ira_free (sorted_allocnos); +diff --git a/gcc/ira-conflicts.c b/gcc/ira-conflicts.c +index 9a3e3811d..a0aefaa05 100644 +--- a/gcc/ira-conflicts.c ++++ b/gcc/ira-conflicts.c +@@ -325,12 +325,37 @@ process_regs_for_copy (rtx reg1, rtx reg2, bool constraint_p, + return true; + } + +-/* Process all of the output registers of the current insn which are +- not bound (BOUND_P) and the input register REG (its operand number ++/* Return true if output operand OUTPUT and input operand INPUT of ++ INSN can use the same register class for at least one alternative. ++ INSN is already described in recog_data and recog_op_alt. */ ++static bool ++can_use_same_reg_p (rtx_insn *insn, int output, int input) ++{ ++ alternative_mask preferred = get_preferred_alternatives (insn); ++ for (int nalt = 0; nalt < recog_data.n_alternatives; nalt++) ++ { ++ if (!TEST_BIT (preferred, nalt)) ++ continue; ++ ++ const operand_alternative *op_alt ++ = &recog_op_alt[nalt * recog_data.n_operands]; ++ if (op_alt[input].matches == output) ++ return true; ++ ++ if (ira_reg_class_intersect[op_alt[input].cl][op_alt[output].cl] ++ != NO_REGS) ++ return true; ++ } ++ return false; ++} ++ ++/* Process all of the output registers of the current insn (INSN) which ++ are not bound (BOUND_P) and the input register REG (its operand number + OP_NUM) which dies in the insn as if there were a move insn between + them with frequency FREQ. 
*/ + static void +-process_reg_shuffles (rtx reg, int op_num, int freq, bool *bound_p) ++process_reg_shuffles (rtx_insn *insn, rtx reg, int op_num, int freq, ++ bool *bound_p) + { + int i; + rtx another_reg; +@@ -342,7 +367,13 @@ process_reg_shuffles (rtx reg, int op_num, int freq, bool *bound_p) + + if (!REG_SUBREG_P (another_reg) || op_num == i + || recog_data.operand_type[i] != OP_OUT +- || bound_p[i]) ++ || bound_p[i] ++ || (!can_use_same_reg_p (insn, i, op_num) ++ && (recog_data.constraints[op_num][0] != '%' ++ || !can_use_same_reg_p (insn, i, op_num + 1)) ++ && (op_num == 0 ++ || recog_data.constraints[op_num - 1][0] != '%' ++ || !can_use_same_reg_p (insn, i, op_num - 1)))) + continue; + + process_regs_for_copy (reg, another_reg, false, NULL, freq); +@@ -358,7 +389,7 @@ add_insn_allocno_copies (rtx_insn *insn) + rtx set, operand, dup; + bool bound_p[MAX_RECOG_OPERANDS]; + int i, n, freq; +- HARD_REG_SET alts; ++ alternative_mask alts; + + freq = REG_FREQ_FROM_BB (BLOCK_FOR_INSN (insn)); + if (freq == 0) +@@ -379,7 +410,7 @@ add_insn_allocno_copies (rtx_insn *insn) + there are no dead registers, there will be no such copies. */ + if (! find_reg_note (insn, REG_DEAD, NULL_RTX)) + return; +- ira_setup_alts (insn, alts); ++ alts = ira_setup_alts (insn); + for (i = 0; i < recog_data.n_operands; i++) + bound_p[i] = false; + for (i = 0; i < recog_data.n_operands; i++) +@@ -412,7 +443,8 @@ add_insn_allocno_copies (rtx_insn *insn) + the corresponding allocno copies. The cost will not + correspond to a real move insn cost, so make the frequency + smaller. */ +- process_reg_shuffles (operand, i, freq < 8 ? 1 : freq / 8, bound_p); ++ process_reg_shuffles (insn, operand, i, freq < 8 ? 1 : freq / 8, ++ bound_p); + } + } + +@@ -660,17 +692,15 @@ print_allocno_conflicts (FILE * file, bool reg_p, ira_allocno_t a) + putc (')', file); + } + } +- COPY_HARD_REG_SET (conflicting_hard_regs, OBJECT_TOTAL_CONFLICT_HARD_REGS (obj)); +- AND_COMPL_HARD_REG_SET (conflicting_hard_regs, ira_no_alloc_regs); +- AND_HARD_REG_SET (conflicting_hard_regs, +- reg_class_contents[ALLOCNO_CLASS (a)]); ++ conflicting_hard_regs = (OBJECT_TOTAL_CONFLICT_HARD_REGS (obj) ++ & ~ira_no_alloc_regs ++ & reg_class_contents[ALLOCNO_CLASS (a)]); + print_hard_reg_set (file, "\n;; total conflict hard regs:", + conflicting_hard_regs); + +- COPY_HARD_REG_SET (conflicting_hard_regs, OBJECT_CONFLICT_HARD_REGS (obj)); +- AND_COMPL_HARD_REG_SET (conflicting_hard_regs, ira_no_alloc_regs); +- AND_HARD_REG_SET (conflicting_hard_regs, +- reg_class_contents[ALLOCNO_CLASS (a)]); ++ conflicting_hard_regs = (OBJECT_CONFLICT_HARD_REGS (obj) ++ & ~ira_no_alloc_regs ++ & reg_class_contents[ALLOCNO_CLASS (a)]); + print_hard_reg_set (file, ";; conflict hard regs:", + conflicting_hard_regs); + putc ('\n', file); +@@ -740,11 +770,7 @@ ira_build_conflicts (void) + if (! targetm.class_likely_spilled_p (base)) + CLEAR_HARD_REG_SET (temp_hard_reg_set); + else +- { +- COPY_HARD_REG_SET (temp_hard_reg_set, reg_class_contents[base]); +- AND_COMPL_HARD_REG_SET (temp_hard_reg_set, ira_no_alloc_regs); +- AND_HARD_REG_SET (temp_hard_reg_set, call_used_reg_set); +- } ++ temp_hard_reg_set = reg_class_contents[base] & ~ira_no_alloc_regs; + FOR_EACH_ALLOCNO (a, ai) + { + int i, n = ALLOCNO_NUM_OBJECTS (a); +@@ -752,33 +778,28 @@ ira_build_conflicts (void) + for (i = 0; i < n; i++) + { + ira_object_t obj = ALLOCNO_OBJECT (a, i); +- machine_mode obj_mode = obj->allocno->mode; + rtx allocno_reg = regno_reg_rtx [ALLOCNO_REGNO (a)]; + +- if ((! 
flag_caller_saves && ALLOCNO_CALLS_CROSSED_NUM (a) != 0) +- /* For debugging purposes don't put user defined variables in +- callee-clobbered registers. However, do allow parameters +- in callee-clobbered registers to improve debugging. This +- is a bit of a fragile hack. */ +- || (optimize == 0 +- && REG_USERVAR_P (allocno_reg) +- && ! reg_is_parm_p (allocno_reg))) ++ /* For debugging purposes don't put user defined variables in ++ callee-clobbered registers. However, do allow parameters ++ in callee-clobbered registers to improve debugging. This ++ is a bit of a fragile hack. */ ++ if (optimize == 0 ++ && REG_USERVAR_P (allocno_reg) ++ && ! reg_is_parm_p (allocno_reg)) + { +- IOR_HARD_REG_SET (OBJECT_TOTAL_CONFLICT_HARD_REGS (obj), +- call_used_reg_set); +- IOR_HARD_REG_SET (OBJECT_CONFLICT_HARD_REGS (obj), +- call_used_reg_set); ++ HARD_REG_SET new_conflict_regs = crtl->abi->full_reg_clobbers (); ++ OBJECT_TOTAL_CONFLICT_HARD_REGS (obj) |= new_conflict_regs; ++ OBJECT_CONFLICT_HARD_REGS (obj) |= new_conflict_regs; + } +- else if (ALLOCNO_CALLS_CROSSED_NUM (a) != 0) ++ ++ if (ALLOCNO_CALLS_CROSSED_NUM (a) != 0) + { +- IOR_HARD_REG_SET (OBJECT_TOTAL_CONFLICT_HARD_REGS (obj), +- no_caller_save_reg_set); +- IOR_HARD_REG_SET (OBJECT_TOTAL_CONFLICT_HARD_REGS (obj), +- temp_hard_reg_set); +- IOR_HARD_REG_SET (OBJECT_CONFLICT_HARD_REGS (obj), +- no_caller_save_reg_set); +- IOR_HARD_REG_SET (OBJECT_CONFLICT_HARD_REGS (obj), +- temp_hard_reg_set); ++ HARD_REG_SET new_conflict_regs = ira_need_caller_save_regs (a); ++ if (flag_caller_saves) ++ new_conflict_regs &= (~savable_regs | temp_hard_reg_set); ++ OBJECT_TOTAL_CONFLICT_HARD_REGS (obj) |= new_conflict_regs; ++ OBJECT_CONFLICT_HARD_REGS (obj) |= new_conflict_regs; + } + + /* Now we deal with paradoxical subreg cases where certain registers +@@ -805,23 +826,6 @@ ira_build_conflicts (void) + } + } + } +- +- if (ALLOCNO_CALLS_CROSSED_NUM (a) != 0) +- { +- int regno; +- +- /* Allocnos bigger than the saved part of call saved +- regs must conflict with them. */ +- for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) +- if (!TEST_HARD_REG_BIT (call_used_reg_set, regno) +- && targetm.hard_regno_call_part_clobbered (NULL, regno, +- obj_mode)) +- { +- SET_HARD_REG_BIT (OBJECT_CONFLICT_HARD_REGS (obj), regno); +- SET_HARD_REG_BIT (OBJECT_TOTAL_CONFLICT_HARD_REGS (obj), +- regno); +- } +- } + } + } + if (optimize && ira_conflicts_p +diff --git a/gcc/ira-costs.c b/gcc/ira-costs.c +index c7feaba37..baf7261dd 100644 +--- a/gcc/ira-costs.c ++++ b/gcc/ira-costs.c +@@ -237,7 +237,7 @@ setup_cost_classes (cost_classes_t from) + allocated. */ + static cost_classes_t + restrict_cost_classes (cost_classes_t full, machine_mode mode, +- const HARD_REG_SET ®s) ++ const_hard_reg_set regs) + { + static struct cost_classes narrow; + int map[N_REG_CLASSES]; +@@ -254,12 +254,9 @@ restrict_cost_classes (cost_classes_t full, machine_mode mode, + + /* Calculate the set of registers in CL that belong to REGS and + are valid for MODE. 
*/ +- HARD_REG_SET valid_for_cl; +- COPY_HARD_REG_SET (valid_for_cl, reg_class_contents[cl]); +- AND_HARD_REG_SET (valid_for_cl, regs); +- AND_COMPL_HARD_REG_SET (valid_for_cl, +- ira_prohibited_class_mode_regs[cl][mode]); +- AND_COMPL_HARD_REG_SET (valid_for_cl, ira_no_alloc_regs); ++ HARD_REG_SET valid_for_cl = reg_class_contents[cl] & regs; ++ valid_for_cl &= ~(ira_prohibited_class_mode_regs[cl][mode] ++ | ira_no_alloc_regs); + if (hard_reg_set_empty_p (valid_for_cl)) + continue; + +@@ -343,8 +340,7 @@ setup_regno_cost_classes_by_aclass (int regno, enum reg_class aclass) + + if ((classes_ptr = cost_classes_aclass_cache[aclass]) == NULL) + { +- COPY_HARD_REG_SET (temp, reg_class_contents[aclass]); +- AND_COMPL_HARD_REG_SET (temp, ira_no_alloc_regs); ++ temp = reg_class_contents[aclass] & ~ira_no_alloc_regs; + /* We exclude classes from consideration which are subsets of + ACLASS only if ACLASS is an uniform class. */ + exclude_p = ira_uniform_class_p[aclass]; +@@ -356,8 +352,7 @@ setup_regno_cost_classes_by_aclass (int regno, enum reg_class aclass) + { + /* Exclude non-uniform classes which are subsets of + ACLASS. */ +- COPY_HARD_REG_SET (temp2, reg_class_contents[cl]); +- AND_COMPL_HARD_REG_SET (temp2, ira_no_alloc_regs); ++ temp2 = reg_class_contents[cl] & ~ira_no_alloc_regs; + if (hard_reg_set_subset_p (temp2, temp) && cl != aclass) + continue; + } +@@ -1482,13 +1477,6 @@ scan_one_insn (rtx_insn *insn) + return insn; + } + +- if (pat_code == CLOBBER_HIGH) +- { +- gcc_assert (REG_P (XEXP (PATTERN (insn), 0)) +- && HARD_REGISTER_P (XEXP (PATTERN (insn), 0))); +- return insn; +- } +- + counted_mem = false; + set = single_set (insn); + extract_insn (insn); +@@ -2345,7 +2333,6 @@ ira_tune_allocno_costs (void) + ira_allocno_object_iterator oi; + ira_object_t obj; + bool skip_p; +- HARD_REG_SET *crossed_calls_clobber_regs; + + FOR_EACH_ALLOCNO (a, ai) + { +@@ -2380,14 +2367,7 @@ ira_tune_allocno_costs (void) + continue; + rclass = REGNO_REG_CLASS (regno); + cost = 0; +- crossed_calls_clobber_regs +- = &(ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (a)); +- if (ira_hard_reg_set_intersection_p (regno, mode, +- *crossed_calls_clobber_regs) +- && (ira_hard_reg_set_intersection_p (regno, mode, +- call_used_reg_set) +- || targetm.hard_regno_call_part_clobbered (NULL, regno, +- mode))) ++ if (ira_need_caller_save_p (a, regno)) + cost += (ALLOCNO_CALL_FREQ (a) + * (ira_memory_move_cost[mode][rclass][0] + + ira_memory_move_cost[mode][rclass][1])); +diff --git a/gcc/ira-emit.c b/gcc/ira-emit.c +index 51bf9c8bc..f44a0d199 100644 +--- a/gcc/ira-emit.c ++++ b/gcc/ira-emit.c +@@ -1115,8 +1115,8 @@ add_range_and_copies_from_move_list (move_t list, ira_loop_tree_node_t node, + ira_allocate_object_conflicts (to_obj, n); + } + } +- ior_hard_reg_conflicts (from, &hard_regs_live); +- ior_hard_reg_conflicts (to, &hard_regs_live); ++ ior_hard_reg_conflicts (from, hard_regs_live); ++ ior_hard_reg_conflicts (to, hard_regs_live); + + update_costs (from, true, freq); + update_costs (to, false, freq); +diff --git a/gcc/ira-int.h b/gcc/ira-int.h +index 3c7fe4e64..a2529ff81 100644 +--- a/gcc/ira-int.h ++++ b/gcc/ira-int.h +@@ -22,6 +22,7 @@ along with GCC; see the file COPYING3. If not see + #define GCC_IRA_INT_H + + #include "recog.h" ++#include "function-abi.h" + + /* To provide consistency in naming, all IRA external variables, + functions, common typedefs start with prefix ira_. */ +@@ -287,6 +288,9 @@ struct ira_allocno + /* Register class which should be used for allocation for given + allocno. 
NO_REGS means that we should use memory. */ + ENUM_BITFIELD (reg_class) aclass : 16; ++ /* A bitmask of the ABIs used by calls that occur while the allocno ++ is live. */ ++ unsigned int crossed_calls_abis : NUM_ABI_IDS; + /* During the reload, value TRUE means that we should not reassign a + hard register to the allocno got memory earlier. It is set up + when we removed memory-memory move insn before each iteration of +@@ -423,6 +427,7 @@ struct ira_allocno + #define ALLOCNO_CALL_FREQ(A) ((A)->call_freq) + #define ALLOCNO_CALLS_CROSSED_NUM(A) ((A)->calls_crossed_num) + #define ALLOCNO_CHEAP_CALLS_CROSSED_NUM(A) ((A)->cheap_calls_crossed_num) ++#define ALLOCNO_CROSSED_CALLS_ABIS(A) ((A)->crossed_calls_abis) + #define ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS(A) \ + ((A)->crossed_calls_clobbered_regs) + #define ALLOCNO_MEM_OPTIMIZED_DEST(A) ((A)->mem_optimized_dest) +@@ -963,8 +968,8 @@ extern void ira_print_disposition (FILE *); + extern void ira_debug_disposition (void); + extern void ira_debug_allocno_classes (void); + extern void ira_init_register_move_cost (machine_mode); +-extern void ira_setup_alts (rtx_insn *insn, HARD_REG_SET &alts); +-extern int ira_get_dup_out_num (int op_num, HARD_REG_SET &alts); ++extern alternative_mask ira_setup_alts (rtx_insn *); ++extern int ira_get_dup_out_num (int, alternative_mask); + + /* ira-build.c */ + +@@ -996,7 +1001,7 @@ extern void ira_set_allocno_class (ira_allocno_t, enum reg_class); + extern bool ira_conflict_vector_profitable_p (ira_object_t, int); + extern void ira_allocate_conflict_vec (ira_object_t, int); + extern void ira_allocate_object_conflicts (ira_object_t, int); +-extern void ior_hard_reg_conflicts (ira_allocno_t, HARD_REG_SET *); ++extern void ior_hard_reg_conflicts (ira_allocno_t, const_hard_reg_set); + extern void ira_print_expanded_allocno (ira_allocno_t); + extern void ira_add_live_range_to_object (ira_object_t, int, int); + extern live_range_t ira_create_live_range (ira_object_t, int, int, +@@ -1508,4 +1513,28 @@ ira_allocate_and_set_or_copy_costs (int **vec, enum reg_class aclass, + extern rtx ira_create_new_reg (rtx); + extern int first_moveable_pseudo, last_moveable_pseudo; + ++/* Return the set of registers that would need a caller save if allocno A ++ overlapped them. */ ++ ++inline HARD_REG_SET ++ira_need_caller_save_regs (ira_allocno_t a) ++{ ++ return call_clobbers_in_region (ALLOCNO_CROSSED_CALLS_ABIS (a), ++ ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (a), ++ ALLOCNO_MODE (a)); ++} ++ ++/* Return true if we would need to save allocno A around a call if we ++ assigned hard register REGNO. */ ++ ++inline bool ++ira_need_caller_save_p (ira_allocno_t a, unsigned int regno) ++{ ++ if (ALLOCNO_CALLS_CROSSED_NUM (a) == 0) ++ return false; ++ return call_clobbered_in_region_p (ALLOCNO_CROSSED_CALLS_ABIS (a), ++ ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (a), ++ ALLOCNO_MODE (a), regno); ++} ++ + #endif /* GCC_IRA_INT_H */ +diff --git a/gcc/ira-lives.c b/gcc/ira-lives.c +index faadf08b0..b933dff16 100644 +--- a/gcc/ira-lives.c ++++ b/gcc/ira-lives.c +@@ -33,6 +33,7 @@ along with GCC; see the file COPYING3. If not see + #include "ira.h" + #include "ira-int.h" + #include "sparseset.h" ++#include "function-abi.h" + + /* The code in this file is similar to one in global but the code + works on the allocno basis and creates live ranges instead of +@@ -80,8 +81,9 @@ static int last_call_num; + /* The number of last call at which given allocno was saved. 
*/ + static int *allocno_saved_at_call; + +-/* The value of get_preferred_alternatives for the current instruction, +- supplemental to recog_data. */ ++/* The value returned by ira_setup_alts for the current instruction; ++ i.e. the set of alternatives that we should consider to be likely ++ candidates during reloading. */ + static alternative_mask preferred_alternatives; + + /* If non-NULL, the source operand of a register to register copy for which +@@ -187,8 +189,8 @@ make_object_dead (ira_object_t obj) + } + } + +- IOR_HARD_REG_SET (OBJECT_CONFLICT_HARD_REGS (obj), hard_regs_live); +- IOR_HARD_REG_SET (OBJECT_TOTAL_CONFLICT_HARD_REGS (obj), hard_regs_live); ++ OBJECT_CONFLICT_HARD_REGS (obj) |= hard_regs_live; ++ OBJECT_TOTAL_CONFLICT_HARD_REGS (obj) |= hard_regs_live; + + /* If IGNORE_REG_FOR_CONFLICTS did not already conflict with OBJ, make + sure it still doesn't. */ +@@ -989,10 +991,8 @@ process_single_reg_class_operands (bool in_p, int freq) + /* We could increase costs of A instead of making it + conflicting with the hard register. But it works worse + because it will be spilled in reload in anyway. */ +- IOR_HARD_REG_SET (OBJECT_CONFLICT_HARD_REGS (obj), +- reg_class_contents[cl]); +- IOR_HARD_REG_SET (OBJECT_TOTAL_CONFLICT_HARD_REGS (obj), +- reg_class_contents[cl]); ++ OBJECT_CONFLICT_HARD_REGS (obj) |= reg_class_contents[cl]; ++ OBJECT_TOTAL_CONFLICT_HARD_REGS (obj) |= reg_class_contents[cl]; + } + } + } +@@ -1130,8 +1130,7 @@ process_bb_node_lives (ira_loop_tree_node_t loop_tree_node) + reg_live_out = df_get_live_out (bb); + sparseset_clear (objects_live); + REG_SET_TO_HARD_REG_SET (hard_regs_live, reg_live_out); +- AND_COMPL_HARD_REG_SET (hard_regs_live, eliminable_regset); +- AND_COMPL_HARD_REG_SET (hard_regs_live, ira_no_alloc_regs); ++ hard_regs_live &= ~(eliminable_regset | ira_no_alloc_regs); + for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) + if (TEST_HARD_REG_BIT (hard_regs_live, i)) + { +@@ -1236,9 +1235,7 @@ process_bb_node_lives (ira_loop_tree_node_t loop_tree_node) + } + } + +- extract_insn (insn); +- preferred_alternatives = get_preferred_alternatives (insn); +- preprocess_constraints (insn); ++ preferred_alternatives = ira_setup_alts (insn); + process_single_reg_class_operands (false, freq); + + /* See which defined values die here. */ +@@ -1263,10 +1260,7 @@ process_bb_node_lives (ira_loop_tree_node_t loop_tree_node) + ira_object_t obj = ira_object_id_map[i]; + a = OBJECT_ALLOCNO (obj); + int num = ALLOCNO_NUM (a); +- HARD_REG_SET this_call_used_reg_set; +- +- get_call_reg_set_usage (insn, &this_call_used_reg_set, +- call_used_reg_set); ++ function_abi callee_abi = insn_callee_abi (insn); + + /* Don't allocate allocnos that cross setjmps or any + call, if this function receives a nonlocal +@@ -1281,10 +1275,10 @@ process_bb_node_lives (ira_loop_tree_node_t loop_tree_node) + } + if (can_throw_internal (insn)) + { +- IOR_HARD_REG_SET (OBJECT_CONFLICT_HARD_REGS (obj), +- this_call_used_reg_set); +- IOR_HARD_REG_SET (OBJECT_TOTAL_CONFLICT_HARD_REGS (obj), +- this_call_used_reg_set); ++ OBJECT_CONFLICT_HARD_REGS (obj) ++ |= callee_abi.mode_clobbers (ALLOCNO_MODE (a)); ++ OBJECT_TOTAL_CONFLICT_HARD_REGS (obj) ++ |= callee_abi.mode_clobbers (ALLOCNO_MODE (a)); + } + + if (sparseset_bit_p (allocnos_processed, num)) +@@ -1301,8 +1295,9 @@ process_bb_node_lives (ira_loop_tree_node_t loop_tree_node) + /* Mark it as saved at the next call. 
*/ + allocno_saved_at_call[num] = last_call_num + 1; + ALLOCNO_CALLS_CROSSED_NUM (a)++; +- IOR_HARD_REG_SET (ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (a), +- this_call_used_reg_set); ++ ALLOCNO_CROSSED_CALLS_ABIS (a) |= 1 << callee_abi.id (); ++ ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (a) ++ |= callee_abi.full_and_partial_reg_clobbers (); + if (cheap_reg != NULL_RTX + && ALLOCNO_REGNO (a) == (int) REGNO (cheap_reg)) + ALLOCNO_CHEAP_CALLS_CROSSED_NUM (a)++; +@@ -1355,10 +1350,11 @@ process_bb_node_lives (ira_loop_tree_node_t loop_tree_node) + } + + /* Allocnos can't go in stack regs at the start of a basic block +- that is reached by an abnormal edge. Likewise for call +- clobbered regs, because caller-save, fixup_abnormal_edges and +- possibly the table driven EH machinery are not quite ready to +- handle such allocnos live across such edges. */ ++ that is reached by an abnormal edge. Likewise for registers ++ that are at least partly call clobbered, because caller-save, ++ fixup_abnormal_edges and possibly the table driven EH machinery ++ are not quite ready to handle such allocnos live across such ++ edges. */ + if (bb_has_abnormal_pred (bb)) + { + #ifdef STACK_REGS +@@ -1378,7 +1374,7 @@ process_bb_node_lives (ira_loop_tree_node_t loop_tree_node) + if (!cfun->has_nonlocal_label + && has_abnormal_call_or_eh_pred_edge_p (bb)) + for (px = 0; px < FIRST_PSEUDO_REGISTER; px++) +- if (call_used_regs[px] ++ if (eh_edge_abi.clobbers_at_least_part_of_reg_p (px) + #ifdef REAL_PIC_OFFSET_TABLE_REGNUM + /* We should create a conflict of PIC pseudo with + PIC hard reg as PIC hard reg can have a wrong +diff --git a/gcc/ira.c b/gcc/ira.c +index 4262e5cf3..a985dddaf 100644 +--- a/gcc/ira.c ++++ b/gcc/ira.c +@@ -471,8 +471,7 @@ setup_class_hard_regs (void) + ira_assert (SHRT_MAX >= FIRST_PSEUDO_REGISTER); + for (cl = (int) N_REG_CLASSES - 1; cl >= 0; cl--) + { +- COPY_HARD_REG_SET (temp_hard_regset, reg_class_contents[cl]); +- AND_COMPL_HARD_REG_SET (temp_hard_regset, no_unit_alloc_regs); ++ temp_hard_regset = reg_class_contents[cl] & ~no_unit_alloc_regs; + CLEAR_HARD_REG_SET (processed_hard_reg_set); + for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) + { +@@ -514,7 +513,7 @@ setup_alloc_regs (bool use_hard_frame_p) + #ifdef ADJUST_REG_ALLOC_ORDER + ADJUST_REG_ALLOC_ORDER; + #endif +- COPY_HARD_REG_SET (no_unit_alloc_regs, fixed_nonglobal_reg_set); ++ no_unit_alloc_regs = fixed_nonglobal_reg_set; + if (! use_hard_frame_p) + SET_HARD_REG_BIT (no_unit_alloc_regs, HARD_FRAME_POINTER_REGNUM); + setup_class_hard_regs (); +@@ -541,8 +540,7 @@ setup_reg_subclasses (void) + if (i == (int) NO_REGS) + continue; + +- COPY_HARD_REG_SET (temp_hard_regset, reg_class_contents[i]); +- AND_COMPL_HARD_REG_SET (temp_hard_regset, no_unit_alloc_regs); ++ temp_hard_regset = reg_class_contents[i] & ~no_unit_alloc_regs; + if (hard_reg_set_empty_p (temp_hard_regset)) + continue; + for (j = 0; j < N_REG_CLASSES; j++) +@@ -550,8 +548,7 @@ setup_reg_subclasses (void) + { + enum reg_class *p; + +- COPY_HARD_REG_SET (temp_hard_regset2, reg_class_contents[j]); +- AND_COMPL_HARD_REG_SET (temp_hard_regset2, no_unit_alloc_regs); ++ temp_hard_regset2 = reg_class_contents[j] & ~no_unit_alloc_regs; + if (! 
hard_reg_set_subset_p (temp_hard_regset, + temp_hard_regset2)) + continue; +@@ -605,10 +602,8 @@ setup_class_subset_and_memory_move_costs (void) + for (cl = (int) N_REG_CLASSES - 1; cl >= 0; cl--) + for (cl2 = (int) N_REG_CLASSES - 1; cl2 >= 0; cl2--) + { +- COPY_HARD_REG_SET (temp_hard_regset, reg_class_contents[cl]); +- AND_COMPL_HARD_REG_SET (temp_hard_regset, no_unit_alloc_regs); +- COPY_HARD_REG_SET (temp_hard_regset2, reg_class_contents[cl2]); +- AND_COMPL_HARD_REG_SET (temp_hard_regset2, no_unit_alloc_regs); ++ temp_hard_regset = reg_class_contents[cl] & ~no_unit_alloc_regs; ++ temp_hard_regset2 = reg_class_contents[cl2] & ~no_unit_alloc_regs; + ira_class_subset_p[cl][cl2] + = hard_reg_set_subset_p (temp_hard_regset, temp_hard_regset2); + if (! hard_reg_set_empty_p (temp_hard_regset2) +@@ -757,8 +752,7 @@ setup_stack_reg_pressure_class (void) + for (i = 0; i < ira_pressure_classes_num; i++) + { + cl = ira_pressure_classes[i]; +- COPY_HARD_REG_SET (temp_hard_regset2, temp_hard_regset); +- AND_HARD_REG_SET (temp_hard_regset2, reg_class_contents[cl]); ++ temp_hard_regset2 = temp_hard_regset & reg_class_contents[cl]; + size = hard_reg_set_size (temp_hard_regset2); + if (best < size) + { +@@ -816,10 +810,10 @@ setup_pressure_classes (void) + register pressure class. */ + for (m = 0; m < NUM_MACHINE_MODES; m++) + { +- COPY_HARD_REG_SET (temp_hard_regset, reg_class_contents[cl]); +- AND_COMPL_HARD_REG_SET (temp_hard_regset, no_unit_alloc_regs); +- AND_COMPL_HARD_REG_SET (temp_hard_regset, +- ira_prohibited_class_mode_regs[cl][m]); ++ temp_hard_regset ++ = (reg_class_contents[cl] ++ & ~(no_unit_alloc_regs ++ | ira_prohibited_class_mode_regs[cl][m])); + if (hard_reg_set_empty_p (temp_hard_regset)) + continue; + ira_init_register_move_cost_if_necessary ((machine_mode) m); +@@ -833,8 +827,7 @@ setup_pressure_classes (void) + } + curr = 0; + insert_p = true; +- COPY_HARD_REG_SET (temp_hard_regset, reg_class_contents[cl]); +- AND_COMPL_HARD_REG_SET (temp_hard_regset, no_unit_alloc_regs); ++ temp_hard_regset = reg_class_contents[cl] & ~no_unit_alloc_regs; + /* Remove so far added pressure classes which are subset of the + current candidate class. Prefer GENERAL_REGS as a pressure + register class to another class containing the same +@@ -845,11 +838,10 @@ setup_pressure_classes (void) + for (i = 0; i < n; i++) + { + cl2 = pressure_classes[i]; +- COPY_HARD_REG_SET (temp_hard_regset2, reg_class_contents[cl2]); +- AND_COMPL_HARD_REG_SET (temp_hard_regset2, no_unit_alloc_regs); ++ temp_hard_regset2 = (reg_class_contents[cl2] ++ & ~no_unit_alloc_regs); + if (hard_reg_set_subset_p (temp_hard_regset, temp_hard_regset2) +- && (! hard_reg_set_equal_p (temp_hard_regset, +- temp_hard_regset2) ++ && (temp_hard_regset != temp_hard_regset2 + || cl2 == (int) GENERAL_REGS)) + { + pressure_classes[curr++] = (enum reg_class) cl2; +@@ -857,11 +849,10 @@ setup_pressure_classes (void) + continue; + } + if (hard_reg_set_subset_p (temp_hard_regset2, temp_hard_regset) +- && (! hard_reg_set_equal_p (temp_hard_regset2, +- temp_hard_regset) ++ && (temp_hard_regset2 != temp_hard_regset + || cl == (int) GENERAL_REGS)) + continue; +- if (hard_reg_set_equal_p (temp_hard_regset2, temp_hard_regset)) ++ if (temp_hard_regset2 == temp_hard_regset) + insert_p = false; + pressure_classes[curr++] = (enum reg_class) cl2; + } +@@ -882,7 +873,7 @@ setup_pressure_classes (void) + registers available for the allocation. 
*/ + CLEAR_HARD_REG_SET (temp_hard_regset); + CLEAR_HARD_REG_SET (temp_hard_regset2); +- COPY_HARD_REG_SET (ignore_hard_regs, no_unit_alloc_regs); ++ ignore_hard_regs = no_unit_alloc_regs; + for (cl = 0; cl < LIM_REG_CLASSES; cl++) + { + /* For some targets (like MIPS with MD_REGS), there are some +@@ -893,23 +884,23 @@ setup_pressure_classes (void) + break; + if (m >= NUM_MACHINE_MODES) + { +- IOR_HARD_REG_SET (ignore_hard_regs, reg_class_contents[cl]); ++ ignore_hard_regs |= reg_class_contents[cl]; + continue; + } + for (i = 0; i < n; i++) + if ((int) pressure_classes[i] == cl) + break; +- IOR_HARD_REG_SET (temp_hard_regset2, reg_class_contents[cl]); ++ temp_hard_regset2 |= reg_class_contents[cl]; + if (i < n) +- IOR_HARD_REG_SET (temp_hard_regset, reg_class_contents[cl]); ++ temp_hard_regset |= reg_class_contents[cl]; + } + for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) + /* Some targets (like SPARC with ICC reg) have allocatable regs + for which no reg class is defined. */ + if (REGNO_REG_CLASS (i) == NO_REGS) + SET_HARD_REG_BIT (ignore_hard_regs, i); +- AND_COMPL_HARD_REG_SET (temp_hard_regset, ignore_hard_regs); +- AND_COMPL_HARD_REG_SET (temp_hard_regset2, ignore_hard_regs); ++ temp_hard_regset &= ~ignore_hard_regs; ++ temp_hard_regset2 &= ~ignore_hard_regs; + ira_assert (hard_reg_set_subset_p (temp_hard_regset2, temp_hard_regset)); + } + #endif +@@ -1001,16 +992,12 @@ setup_allocno_and_important_classes (void) + same set of hard registers. */ + for (i = 0; i < LIM_REG_CLASSES; i++) + { +- COPY_HARD_REG_SET (temp_hard_regset, reg_class_contents[i]); +- AND_COMPL_HARD_REG_SET (temp_hard_regset, no_unit_alloc_regs); ++ temp_hard_regset = reg_class_contents[i] & ~no_unit_alloc_regs; + for (j = 0; j < n; j++) + { + cl = classes[j]; +- COPY_HARD_REG_SET (temp_hard_regset2, reg_class_contents[cl]); +- AND_COMPL_HARD_REG_SET (temp_hard_regset2, +- no_unit_alloc_regs); +- if (hard_reg_set_equal_p (temp_hard_regset, +- temp_hard_regset2)) ++ temp_hard_regset2 = reg_class_contents[cl] & ~no_unit_alloc_regs; ++ if (temp_hard_regset == temp_hard_regset2) + break; + } + if (j >= n || targetm.additional_allocno_class_p (i)) +@@ -1037,14 +1024,12 @@ setup_allocno_and_important_classes (void) + for (cl = 0; cl < N_REG_CLASSES; cl++) + if (ira_class_hard_regs_num[cl] > 0) + { +- COPY_HARD_REG_SET (temp_hard_regset, reg_class_contents[cl]); +- AND_COMPL_HARD_REG_SET (temp_hard_regset, no_unit_alloc_regs); ++ temp_hard_regset = reg_class_contents[cl] & ~no_unit_alloc_regs; + set_p = false; + for (j = 0; j < ira_allocno_classes_num; j++) + { +- COPY_HARD_REG_SET (temp_hard_regset2, +- reg_class_contents[ira_allocno_classes[j]]); +- AND_COMPL_HARD_REG_SET (temp_hard_regset2, no_unit_alloc_regs); ++ temp_hard_regset2 = (reg_class_contents[ira_allocno_classes[j]] ++ & ~no_unit_alloc_regs); + if ((enum reg_class) cl == ira_allocno_classes[j]) + break; + else if (hard_reg_set_subset_p (temp_hard_regset, +@@ -1118,10 +1103,9 @@ setup_class_translate_array (enum reg_class *class_translate, + for (i = 0; i < classes_num; i++) + { + aclass = classes[i]; +- COPY_HARD_REG_SET (temp_hard_regset, +- reg_class_contents[aclass]); +- AND_HARD_REG_SET (temp_hard_regset, reg_class_contents[cl]); +- AND_COMPL_HARD_REG_SET (temp_hard_regset, no_unit_alloc_regs); ++ temp_hard_regset = (reg_class_contents[aclass] ++ & reg_class_contents[cl] ++ & ~no_unit_alloc_regs); + if (! 
hard_reg_set_empty_p (temp_hard_regset)) + { + min_cost = INT_MAX; +@@ -1223,10 +1207,8 @@ setup_reg_class_relations (void) + ira_reg_classes_intersect_p[cl1][cl2] = false; + ira_reg_class_intersect[cl1][cl2] = NO_REGS; + ira_reg_class_subset[cl1][cl2] = NO_REGS; +- COPY_HARD_REG_SET (temp_hard_regset, reg_class_contents[cl1]); +- AND_COMPL_HARD_REG_SET (temp_hard_regset, no_unit_alloc_regs); +- COPY_HARD_REG_SET (temp_set2, reg_class_contents[cl2]); +- AND_COMPL_HARD_REG_SET (temp_set2, no_unit_alloc_regs); ++ temp_hard_regset = reg_class_contents[cl1] & ~no_unit_alloc_regs; ++ temp_set2 = reg_class_contents[cl2] & ~no_unit_alloc_regs; + if (hard_reg_set_empty_p (temp_hard_regset) + && hard_reg_set_empty_p (temp_set2)) + { +@@ -1264,16 +1246,14 @@ setup_reg_class_relations (void) + } + ira_reg_class_subunion[cl1][cl2] = NO_REGS; + ira_reg_class_superunion[cl1][cl2] = NO_REGS; +- COPY_HARD_REG_SET (intersection_set, reg_class_contents[cl1]); +- AND_HARD_REG_SET (intersection_set, reg_class_contents[cl2]); +- AND_COMPL_HARD_REG_SET (intersection_set, no_unit_alloc_regs); +- COPY_HARD_REG_SET (union_set, reg_class_contents[cl1]); +- IOR_HARD_REG_SET (union_set, reg_class_contents[cl2]); +- AND_COMPL_HARD_REG_SET (union_set, no_unit_alloc_regs); ++ intersection_set = (reg_class_contents[cl1] ++ & reg_class_contents[cl2] ++ & ~no_unit_alloc_regs); ++ union_set = ((reg_class_contents[cl1] | reg_class_contents[cl2]) ++ & ~no_unit_alloc_regs); + for (cl3 = 0; cl3 < N_REG_CLASSES; cl3++) + { +- COPY_HARD_REG_SET (temp_hard_regset, reg_class_contents[cl3]); +- AND_COMPL_HARD_REG_SET (temp_hard_regset, no_unit_alloc_regs); ++ temp_hard_regset = reg_class_contents[cl3] & ~no_unit_alloc_regs; + if (hard_reg_set_subset_p (temp_hard_regset, intersection_set)) + { + /* CL3 allocatable hard register set is inside of +@@ -1281,17 +1261,16 @@ setup_reg_class_relations (void) + of CL1 and CL2. */ + if (important_class_p[cl3]) + { +- COPY_HARD_REG_SET +- (temp_set2, +- reg_class_contents +- [(int) ira_reg_class_intersect[cl1][cl2]]); +- AND_COMPL_HARD_REG_SET (temp_set2, no_unit_alloc_regs); ++ temp_set2 ++ = (reg_class_contents ++ [ira_reg_class_intersect[cl1][cl2]]); ++ temp_set2 &= ~no_unit_alloc_regs; + if (! hard_reg_set_subset_p (temp_hard_regset, temp_set2) + /* If the allocatable hard register sets are + the same, prefer GENERAL_REGS or the + smallest class for debugging + purposes. */ +- || (hard_reg_set_equal_p (temp_hard_regset, temp_set2) ++ || (temp_hard_regset == temp_set2 + && (cl3 == GENERAL_REGS + || ((ira_reg_class_intersect[cl1][cl2] + != GENERAL_REGS) +@@ -1302,14 +1281,13 @@ setup_reg_class_relations (void) + ira_reg_class_intersect[cl1][cl2]]))))) + ira_reg_class_intersect[cl1][cl2] = (enum reg_class) cl3; + } +- COPY_HARD_REG_SET +- (temp_set2, +- reg_class_contents[(int) ira_reg_class_subset[cl1][cl2]]); +- AND_COMPL_HARD_REG_SET (temp_set2, no_unit_alloc_regs); ++ temp_set2 ++ = (reg_class_contents[ira_reg_class_subset[cl1][cl2]] ++ & ~no_unit_alloc_regs); + if (! hard_reg_set_subset_p (temp_hard_regset, temp_set2) + /* Ignore unavailable hard registers and prefer + smallest class for debugging purposes. */ +- || (hard_reg_set_equal_p (temp_hard_regset, temp_set2) ++ || (temp_hard_regset == temp_set2 + && hard_reg_set_subset_p + (reg_class_contents[cl3], + reg_class_contents +@@ -1322,15 +1300,13 @@ setup_reg_class_relations (void) + /* CL3 allocatable hard register set is inside of + union of allocatable hard register sets of CL1 + and CL2. 
*/ +- COPY_HARD_REG_SET +- (temp_set2, +- reg_class_contents[(int) ira_reg_class_subunion[cl1][cl2]]); +- AND_COMPL_HARD_REG_SET (temp_set2, no_unit_alloc_regs); ++ temp_set2 ++ = (reg_class_contents[ira_reg_class_subunion[cl1][cl2]] ++ & ~no_unit_alloc_regs); + if (ira_reg_class_subunion[cl1][cl2] == NO_REGS + || (hard_reg_set_subset_p (temp_set2, temp_hard_regset) + +- && (! hard_reg_set_equal_p (temp_set2, +- temp_hard_regset) ++ && (temp_set2 != temp_hard_regset + || cl3 == GENERAL_REGS + /* If the allocatable hard register sets are the + same, prefer GENERAL_REGS or the smallest +@@ -1347,15 +1323,13 @@ setup_reg_class_relations (void) + /* CL3 allocatable hard register set contains union + of allocatable hard register sets of CL1 and + CL2. */ +- COPY_HARD_REG_SET +- (temp_set2, +- reg_class_contents[(int) ira_reg_class_superunion[cl1][cl2]]); +- AND_COMPL_HARD_REG_SET (temp_set2, no_unit_alloc_regs); ++ temp_set2 ++ = (reg_class_contents[ira_reg_class_superunion[cl1][cl2]] ++ & ~no_unit_alloc_regs); + if (ira_reg_class_superunion[cl1][cl2] == NO_REGS + || (hard_reg_set_subset_p (temp_hard_regset, temp_set2) + +- && (! hard_reg_set_equal_p (temp_set2, +- temp_hard_regset) ++ && (temp_set2 != temp_hard_regset + || cl3 == GENERAL_REGS + /* If the allocatable hard register sets are the + same, prefer GENERAL_REGS or the smallest +@@ -1499,8 +1473,7 @@ setup_prohibited_class_mode_regs (void) + + for (cl = (int) N_REG_CLASSES - 1; cl >= 0; cl--) + { +- COPY_HARD_REG_SET (temp_hard_regset, reg_class_contents[cl]); +- AND_COMPL_HARD_REG_SET (temp_hard_regset, no_unit_alloc_regs); ++ temp_hard_regset = reg_class_contents[cl] & ~no_unit_alloc_regs; + for (j = 0; j < NUM_MACHINE_MODES; j++) + { + count = 0; +@@ -1784,68 +1757,59 @@ setup_prohibited_mode_move_regs (void) + + + +-/* Setup possible alternatives in ALTS for INSN. */ +-void +-ira_setup_alts (rtx_insn *insn, HARD_REG_SET &alts) ++/* Extract INSN and return the set of alternatives that we should consider. ++ This excludes any alternatives whose constraints are obviously impossible ++ to meet (e.g. because the constraint requires a constant and the operand ++ is nonconstant). It also excludes alternatives that are bound to need ++ a spill or reload, as long as we have other alternatives that match ++ exactly. */ ++alternative_mask ++ira_setup_alts (rtx_insn *insn) + { +- /* MAP nalt * nop -> start of constraints for given operand and +- alternative. */ +- static vec insn_constraints; + int nop, nalt; + bool curr_swapped; + const char *p; + int commutative = -1; + + extract_insn (insn); ++ preprocess_constraints (insn); + alternative_mask preferred = get_preferred_alternatives (insn); +- CLEAR_HARD_REG_SET (alts); +- insn_constraints.release (); +- insn_constraints.safe_grow_cleared (recog_data.n_operands +- * recog_data.n_alternatives + 1); ++ alternative_mask alts = 0; ++ alternative_mask exact_alts = 0; + /* Check that the hard reg set is enough for holding all + alternatives. It is hard to imagine the situation when the + assertion is wrong. */ + ira_assert (recog_data.n_alternatives + <= (int) MAX (sizeof (HARD_REG_ELT_TYPE) * CHAR_BIT, + FIRST_PSEUDO_REGISTER)); ++ for (nop = 0; nop < recog_data.n_operands; nop++) ++ if (recog_data.constraints[nop][0] == '%') ++ { ++ commutative = nop; ++ break; ++ } + for (curr_swapped = false;; curr_swapped = true) + { +- /* Calculate some data common for all alternatives to speed up the +- function. 
*/ +- for (nop = 0; nop < recog_data.n_operands; nop++) +- { +- for (nalt = 0, p = recog_data.constraints[nop]; +- nalt < recog_data.n_alternatives; +- nalt++) +- { +- insn_constraints[nop * recog_data.n_alternatives + nalt] = p; +- while (*p && *p != ',') +- { +- /* We only support one commutative marker, the first +- one. We already set commutative above. */ +- if (*p == '%' && commutative < 0) +- commutative = nop; +- p++; +- } +- if (*p) +- p++; +- } +- } + for (nalt = 0; nalt < recog_data.n_alternatives; nalt++) + { +- if (!TEST_BIT (preferred, nalt) +- || TEST_HARD_REG_BIT (alts, nalt)) ++ if (!TEST_BIT (preferred, nalt) || TEST_BIT (exact_alts, nalt)) + continue; + ++ const operand_alternative *op_alt ++ = &recog_op_alt[nalt * recog_data.n_operands]; ++ int this_reject = 0; + for (nop = 0; nop < recog_data.n_operands; nop++) + { + int c, len; + ++ this_reject += op_alt[nop].reject; ++ + rtx op = recog_data.operand[nop]; +- p = insn_constraints[nop * recog_data.n_alternatives + nalt]; ++ p = op_alt[nop].constraint; + if (*p == 0 || *p == ',') + continue; +- ++ ++ bool win_p = false; + do + switch (c = *p, len = CONSTRAINT_LEN (c, p), c) + { +@@ -1863,7 +1827,14 @@ ira_setup_alts (rtx_insn *insn, HARD_REG_SET &alts) + + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': +- goto op_success; ++ { ++ rtx other = recog_data.operand[c - '0']; ++ if (MEM_P (other) ++ ? rtx_equal_p (other, op) ++ : REG_P (op) || SUBREG_P (op)) ++ goto op_success; ++ win_p = true; ++ } + break; + + case 'g': +@@ -1877,7 +1848,11 @@ ira_setup_alts (rtx_insn *insn, HARD_REG_SET &alts) + { + case CT_REGISTER: + if (reg_class_for_constraint (cn) != NO_REGS) +- goto op_success; ++ { ++ if (REG_P (op) || SUBREG_P (op)) ++ goto op_success; ++ win_p = true; ++ } + break; + + case CT_CONST_INT: +@@ -1888,9 +1863,14 @@ ira_setup_alts (rtx_insn *insn, HARD_REG_SET &alts) + break; + + case CT_ADDRESS: ++ goto op_success; ++ + case CT_MEMORY: + case CT_SPECIAL_MEMORY: +- goto op_success; ++ if (MEM_P (op)) ++ goto op_success; ++ win_p = true; ++ break; + + case CT_FIXED_FORM: + if (constraint_satisfied_p (op, cn)) +@@ -1901,12 +1881,22 @@ ira_setup_alts (rtx_insn *insn, HARD_REG_SET &alts) + } + } + while (p += len, c); +- break; ++ if (!win_p) ++ break; ++ /* We can make the alternative match by spilling a register ++ to memory or loading something into a register. Count a ++ cost of one reload (the equivalent of the '?' constraint). */ ++ this_reject += 6; + op_success: + ; + } ++ + if (nop >= recog_data.n_operands) +- SET_HARD_REG_BIT (alts, nalt); ++ { ++ alts |= ALTERNATIVE_BIT (nalt); ++ if (this_reject == 0) ++ exact_alts |= ALTERNATIVE_BIT (nalt); ++ } + } + if (commutative < 0) + break; +@@ -1916,14 +1906,15 @@ ira_setup_alts (rtx_insn *insn, HARD_REG_SET &alts) + if (curr_swapped) + break; + } ++ return exact_alts ? exact_alts : alts; + } + + /* Return the number of the output non-early clobber operand which + should be the same in any case as operand with number OP_NUM (or +- negative value if there is no such operand). The function takes +- only really possible alternatives into consideration. */ ++ negative value if there is no such operand). ALTS is the mask ++ of alternatives that we should consider. 
*/ + int +-ira_get_dup_out_num (int op_num, HARD_REG_SET &alts) ++ira_get_dup_out_num (int op_num, alternative_mask alts) + { + int curr_alt, c, original, dup; + bool ignore_p, use_commut_op_p; +@@ -1940,7 +1931,7 @@ ira_get_dup_out_num (int op_num, HARD_REG_SET &alts) + { + rtx op = recog_data.operand[op_num]; + +- for (curr_alt = 0, ignore_p = !TEST_HARD_REG_BIT (alts, curr_alt), ++ for (curr_alt = 0, ignore_p = !TEST_BIT (alts, curr_alt), + original = -1;;) + { + c = *str; +@@ -1951,7 +1942,7 @@ ira_get_dup_out_num (int op_num, HARD_REG_SET &alts) + else if (c == ',') + { + curr_alt++; +- ignore_p = !TEST_HARD_REG_BIT (alts, curr_alt); ++ ignore_p = !TEST_BIT (alts, curr_alt); + } + else if (! ignore_p) + switch (c) +@@ -1981,26 +1972,8 @@ ira_get_dup_out_num (int op_num, HARD_REG_SET &alts) + } + if (original == -1) + goto fail; +- dup = -1; +- for (ignore_p = false, str = recog_data.constraints[original - '0']; +- *str != 0; +- str++) +- if (ignore_p) +- { +- if (*str == ',') +- ignore_p = false; +- } +- else if (*str == '#') +- ignore_p = true; +- else if (! ignore_p) +- { +- if (*str == '=') +- dup = original - '0'; +- /* It is better ignore an alternative with early clobber. */ +- else if (*str == '&') +- goto fail; +- } +- if (dup >= 0) ++ dup = original - '0'; ++ if (recog_data.operand_type[dup] == OP_OUT) + return dup; + fail: + if (use_commut_op_p) +@@ -2305,7 +2278,7 @@ ira_setup_eliminable_regset (void) + if (frame_pointer_needed) + df_set_regs_ever_live (HARD_FRAME_POINTER_REGNUM, true); + +- COPY_HARD_REG_SET (ira_no_alloc_regs, no_unit_alloc_regs); ++ ira_no_alloc_regs = no_unit_alloc_regs; + CLEAR_HARD_REG_SET (eliminable_regset); + + compute_regs_asm_clobbered (); +@@ -2326,7 +2299,7 @@ ira_setup_eliminable_regset (void) + SET_HARD_REG_BIT (ira_no_alloc_regs, eliminables[i].from); + } + else if (cannot_elim) +- error ("%s cannot be used in asm here", ++ error ("%s cannot be used in % here", + reg_names[eliminables[i].from]); + else + df_set_regs_ever_live (eliminables[i].from, true); +@@ -2340,7 +2313,7 @@ ira_setup_eliminable_regset (void) + SET_HARD_REG_BIT (ira_no_alloc_regs, HARD_FRAME_POINTER_REGNUM); + } + else if (frame_pointer_needed) +- error ("%s cannot be used in asm here", ++ error ("%s cannot be used in % here", + reg_names[HARD_FRAME_POINTER_REGNUM]); + else + df_set_regs_ever_live (HARD_FRAME_POINTER_REGNUM, true); +@@ -2392,12 +2365,10 @@ setup_reg_renumber (void) + for (i = 0; i < nwords; i++) + { + obj = ALLOCNO_OBJECT (a, i); +- IOR_COMPL_HARD_REG_SET (OBJECT_TOTAL_CONFLICT_HARD_REGS (obj), +- reg_class_contents[pclass]); ++ OBJECT_TOTAL_CONFLICT_HARD_REGS (obj) ++ |= ~reg_class_contents[pclass]; + } +- if (ALLOCNO_CALLS_CROSSED_NUM (a) != 0 +- && ira_hard_reg_set_intersection_p (hard_regno, ALLOCNO_MODE (a), +- call_used_reg_set)) ++ if (ira_need_caller_save_p (a, hard_regno)) + { + ira_assert (!optimize || flag_caller_saves + || (ALLOCNO_CALLS_CROSSED_NUM (a) +@@ -3004,7 +2975,7 @@ validate_equiv_mem (rtx_insn *start, rtx reg, rtx memref) + return valid_none; + } + +- note_stores (PATTERN (insn), validate_equiv_mem_from_store, &info); ++ note_stores (insn, validate_equiv_mem_from_store, &info); + if (info.equiv_mem_modified) + return valid_none; + +@@ -3092,7 +3063,6 @@ equiv_init_movable_p (rtx x, int regno) + + case CC0: + case CLOBBER: +- case CLOBBER_HIGH: + return 0; + + case PRE_INC: +@@ -3199,7 +3169,6 @@ memref_referenced_p (rtx memref, rtx x, bool read_p) + return memref_referenced_p (memref, SET_SRC (x), true); + + case CLOBBER: +- case 
CLOBBER_HIGH: + if (process_set_for_memref_referenced_p (memref, XEXP (x, 0))) + return true; + +@@ -3391,6 +3360,37 @@ def_dominates_uses (int regno) + return true; + } + ++/* Scan the instructions before update_equiv_regs. Record which registers ++ are referenced as paradoxical subregs. Also check for cases in which ++ the current function needs to save a register that one of its call ++ instructions clobbers. ++ ++ These things are logically unrelated, but it's more efficient to do ++ them together. */ ++ ++static void ++update_equiv_regs_prescan (void) ++{ ++ basic_block bb; ++ rtx_insn *insn; ++ function_abi_aggregator callee_abis; ++ ++ FOR_EACH_BB_FN (bb, cfun) ++ FOR_BB_INSNS (bb, insn) ++ if (NONDEBUG_INSN_P (insn)) ++ { ++ set_paradoxical_subreg (insn); ++ if (CALL_P (insn)) ++ callee_abis.note_callee_abi (insn_callee_abi (insn)); ++ } ++ ++ HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi); ++ if (!hard_reg_set_empty_p (extra_caller_saves)) ++ for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; ++regno) ++ if (TEST_HARD_REG_BIT (extra_caller_saves, regno)) ++ df_set_regs_ever_live (regno, true); ++} ++ + /* Find registers that are equivalent to a single value throughout the + compilation (either because they can be referenced in memory or are + set once from a single constant). Lower their priority for a +@@ -3407,15 +3407,6 @@ update_equiv_regs (void) + rtx_insn *insn; + basic_block bb; + +- /* Scan insns and set pdx_subregs if the reg is used in a +- paradoxical subreg. Don't set such reg equivalent to a mem, +- because lra will not substitute such equiv memory in order to +- prevent access beyond allocated memory for paradoxical memory subreg. */ +- FOR_EACH_BB_FN (bb, cfun) +- FOR_BB_INSNS (bb, insn) +- if (NONDEBUG_INSN_P (insn)) +- set_paradoxical_subreg (insn); +- + /* Scan the insns and find which registers have equivalences. Do this + in a separate scan of the insns because (due to -fcse-follow-jumps) + a register can be set below its use. */ +@@ -3447,7 +3438,7 @@ update_equiv_regs (void) + if (set == NULL_RTX + || side_effects_p (SET_SRC (set))) + { +- note_stores (PATTERN (insn), no_equiv, NULL); ++ note_pattern_stores (PATTERN (insn), no_equiv, NULL); + continue; + } + else if (GET_CODE (PATTERN (insn)) == PARALLEL) +@@ -3458,7 +3449,7 @@ update_equiv_regs (void) + { + rtx part = XVECEXP (PATTERN (insn), 0, i); + if (part != set) +- note_stores (part, no_equiv, NULL); ++ note_pattern_stores (part, no_equiv, NULL); + } + } + +@@ -3516,7 +3507,7 @@ update_equiv_regs (void) + { + /* This might be setting a SUBREG of a pseudo, a pseudo that is + also set somewhere else to a constant. */ +- note_stores (set, no_equiv, NULL); ++ note_pattern_stores (set, no_equiv, NULL); + continue; + } + +@@ -3524,7 +3515,7 @@ update_equiv_regs (void) + equivalent to a mem. 
*/ + if (MEM_P (src) && reg_equiv[regno].pdx_subregs) + { +- note_stores (set, no_equiv, NULL); ++ note_pattern_stores (set, no_equiv, NULL); + continue; + } + +@@ -4458,7 +4449,6 @@ rtx_moveable_p (rtx *loc, enum op_type type) + && rtx_moveable_p (&XEXP (x, 2), OP_IN)); + + case CLOBBER: +- case CLOBBER_HIGH: + return rtx_moveable_p (&SET_DEST (x), OP_OUT); + + case UNSPEC_VOLATILE: +@@ -4911,9 +4901,7 @@ interesting_dest_for_shprep (rtx_insn *insn, basic_block call_dom) + for (int i = 0; i < XVECLEN (pat, 0); i++) + { + rtx sub = XVECEXP (pat, 0, i); +- if (GET_CODE (sub) == USE +- || GET_CODE (sub) == CLOBBER +- || GET_CODE (sub) == CLOBBER_HIGH) ++ if (GET_CODE (sub) == USE || GET_CODE (sub) == CLOBBER) + continue; + if (GET_CODE (sub) != SET + || side_effects_p (sub)) +@@ -5305,6 +5293,7 @@ ira (FILE *f) + init_alias_analysis (); + loop_optimizer_init (AVOID_CFG_MODIFICATIONS); + reg_equiv = XCNEWVEC (struct equivalence, max_reg_num ()); ++ update_equiv_regs_prescan (); + update_equiv_regs (); + + /* Don't move insns if live range shrinkage or register +@@ -5616,7 +5605,9 @@ do_reload (void) + poly_int64 size = get_frame_size () + STACK_CHECK_FIXED_FRAME_SIZE; + + for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++) +- if (df_regs_ever_live_p (i) && !fixed_regs[i] && call_used_regs[i]) ++ if (df_regs_ever_live_p (i) ++ && !fixed_regs[i] ++ && !crtl->abi->clobbers_full_reg_p (i)) + size += UNITS_PER_WORD; + + if (constant_lower_bound (size) > STACK_CHECK_MAX_FRAME_SIZE) +diff --git a/gcc/jit/jit-playback.c b/gcc/jit/jit-playback.c +index b74495c58..8b16e81d5 100644 +--- a/gcc/jit/jit-playback.c ++++ b/gcc/jit/jit-playback.c +@@ -399,12 +399,11 @@ new_function (location *loc, + + if (builtin_id) + { +- DECL_FUNCTION_CODE (fndecl) = builtin_id; + gcc_assert (loc == NULL); + DECL_SOURCE_LOCATION (fndecl) = BUILTINS_LOCATION; + +- DECL_BUILT_IN_CLASS (fndecl) = +- builtins_manager::get_class (builtin_id); ++ built_in_class fclass = builtins_manager::get_class (builtin_id); ++ set_decl_built_in_function (fndecl, fclass, builtin_id); + set_builtin_decl (builtin_id, fndecl, + builtins_manager::implicit_p (builtin_id)); + +diff --git a/gcc/jump.c b/gcc/jump.c +index ce5cee523..17642a95b 100644 +--- a/gcc/jump.c ++++ b/gcc/jump.c +@@ -1094,7 +1094,6 @@ mark_jump_label_1 (rtx x, rtx_insn *insn, bool in_mem, bool is_target) + case CC0: + case REG: + case CLOBBER: +- case CLOBBER_HIGH: + case CALL: + return; + +diff --git a/gcc/langhooks-def.h b/gcc/langhooks-def.h +index a059841b3..842f6a502 100644 +--- a/gcc/langhooks-def.h ++++ b/gcc/langhooks-def.h +@@ -122,6 +122,7 @@ extern int lhd_type_dwarf_attribute (const_tree, int); + #define LANG_HOOKS_TYPES_COMPATIBLE_P lhd_types_compatible_p + #define LANG_HOOKS_BUILTIN_FUNCTION lhd_builtin_function + #define LANG_HOOKS_BUILTIN_FUNCTION_EXT_SCOPE LANG_HOOKS_BUILTIN_FUNCTION ++#define LANG_HOOKS_SIMULATE_BUILTIN_FUNCTION_DECL LANG_HOOKS_BUILTIN_FUNCTION + #define LANG_HOOKS_EXPR_TO_DECL lhd_expr_to_decl + #define LANG_HOOKS_TO_TARGET_CHARSET lhd_to_target_charset + #define LANG_HOOKS_INIT_TS lhd_do_nothing +@@ -170,6 +171,7 @@ extern tree lhd_make_node (enum tree_code); + extern tree lhd_unit_size_without_reusable_padding (tree); + + #define LANG_HOOKS_MAKE_TYPE lhd_make_node ++#define LANG_HOOKS_SIMULATE_ENUM_DECL NULL + #define LANG_HOOKS_CLASSIFY_RECORD NULL + #define LANG_HOOKS_TYPE_FOR_SIZE lhd_type_for_size + #define LANG_HOOKS_INCOMPLETE_TYPE_ERROR lhd_incomplete_type_error +@@ -203,6 +205,7 @@ extern tree lhd_unit_size_without_reusable_padding 
(tree); + + #define LANG_HOOKS_FOR_TYPES_INITIALIZER { \ + LANG_HOOKS_MAKE_TYPE, \ ++ LANG_HOOKS_SIMULATE_ENUM_DECL, \ + LANG_HOOKS_CLASSIFY_RECORD, \ + LANG_HOOKS_TYPE_FOR_MODE, \ + LANG_HOOKS_TYPE_FOR_SIZE, \ +@@ -338,6 +341,7 @@ extern void lhd_end_section (void); + LANG_HOOKS_GIMPLIFY_EXPR, \ + LANG_HOOKS_BUILTIN_FUNCTION, \ + LANG_HOOKS_BUILTIN_FUNCTION_EXT_SCOPE, \ ++ LANG_HOOKS_SIMULATE_BUILTIN_FUNCTION_DECL, \ + LANG_HOOKS_INIT_TS, \ + LANG_HOOKS_EXPR_TO_DECL, \ + LANG_HOOKS_EH_PERSONALITY, \ +diff --git a/gcc/langhooks.c b/gcc/langhooks.c +index 2df97f2b6..fd8f43312 100644 +--- a/gcc/langhooks.c ++++ b/gcc/langhooks.c +@@ -599,28 +599,21 @@ lhd_omp_mappable_type (tree type) + return true; + } + +-/* Common function for add_builtin_function and +- add_builtin_function_ext_scope. */ ++/* Common function for add_builtin_function, add_builtin_function_ext_scope ++ and simulate_builtin_function_decl. */ ++ + static tree +-add_builtin_function_common (const char *name, +- tree type, +- int function_code, +- enum built_in_class cl, +- const char *library_name, +- tree attrs, +- tree (*hook) (tree)) ++build_builtin_function (location_t location, const char *name, tree type, ++ int function_code, enum built_in_class cl, ++ const char *library_name, tree attrs) + { + tree id = get_identifier (name); +- tree decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL, id, type); ++ tree decl = build_decl (location, FUNCTION_DECL, id, type); + + TREE_PUBLIC (decl) = 1; + DECL_EXTERNAL (decl) = 1; +- DECL_BUILT_IN_CLASS (decl) = cl; +- +- DECL_FUNCTION_CODE (decl) = (enum built_in_function) function_code; + +- /* DECL_FUNCTION_CODE is a bitfield; verify that the value fits. */ +- gcc_assert (DECL_FUNCTION_CODE (decl) == function_code); ++ set_decl_built_in_function (decl, cl, function_code); + + if (library_name) + { +@@ -636,8 +629,7 @@ add_builtin_function_common (const char *name, + else + decl_attributes (&decl, NULL_TREE, 0); + +- return hook (decl); +- ++ return decl; + } + + /* Create a builtin function. */ +@@ -650,9 +642,9 @@ add_builtin_function (const char *name, + const char *library_name, + tree attrs) + { +- return add_builtin_function_common (name, type, function_code, cl, +- library_name, attrs, +- lang_hooks.builtin_function); ++ tree decl = build_builtin_function (BUILTINS_LOCATION, name, type, ++ function_code, cl, library_name, attrs); ++ return lang_hooks.builtin_function (decl); + } + + /* Like add_builtin_function, but make sure the scope is the external scope. +@@ -670,9 +662,40 @@ add_builtin_function_ext_scope (const char *name, + const char *library_name, + tree attrs) + { +- return add_builtin_function_common (name, type, function_code, cl, +- library_name, attrs, +- lang_hooks.builtin_function_ext_scope); ++ tree decl = build_builtin_function (BUILTINS_LOCATION, name, type, ++ function_code, cl, library_name, attrs); ++ return lang_hooks.builtin_function_ext_scope (decl); ++} ++ ++/* Simulate a declaration of a target-specific built-in function at ++ location LOCATION, as though it had been declared directly in the ++ source language. NAME is the name of the function, TYPE is its function ++ type, FUNCTION_CODE is the target-specific function code, LIBRARY_NAME ++ is the name of the underlying library function (NULL if none) and ++ ATTRS is a list of function attributes. ++ ++ Return the decl of the declared function. 
*/ ++ ++tree ++simulate_builtin_function_decl (location_t location, const char *name, ++ tree type, int function_code, ++ const char *library_name, tree attrs) ++{ ++ tree decl = build_builtin_function (location, name, type, ++ function_code, BUILT_IN_MD, ++ library_name, attrs); ++ tree new_decl = lang_hooks.simulate_builtin_function_decl (decl); ++ ++ /* Give the front end a chance to create a new decl if necessary, ++ but if the front end discards the decl in favour of a conflicting ++ (erroneous) previous definition, return the decl that we tried but ++ failed to add. This allows the caller to process the returned decl ++ normally, even though the source code won't be able to use it. */ ++ if (TREE_CODE (new_decl) == FUNCTION_DECL ++ && fndecl_built_in_p (new_decl, function_code, BUILT_IN_MD)) ++ return new_decl; ++ ++ return decl; + } + + tree +diff --git a/gcc/langhooks.h b/gcc/langhooks.h +index a45579b33..b8cee93f5 100644 +--- a/gcc/langhooks.h ++++ b/gcc/langhooks.h +@@ -64,6 +64,10 @@ struct lang_hooks_for_types + language-specific processing is required. */ + tree (*make_type) (enum tree_code); + ++ /* Make an enum type with the given name and values, associating ++ them all with the given source location. */ ++ tree (*simulate_enum_decl) (location_t, const char *, vec); ++ + /* Return what kind of RECORD_TYPE this is, mainly for purposes of + debug information. If not defined, record types are assumed to + be structures. */ +@@ -494,6 +498,15 @@ struct lang_hooks + backend must add all of the builtins at program initialization time. */ + tree (*builtin_function_ext_scope) (tree decl); + ++ /* Do language-specific processing for target-specific built-in ++ function DECL, so that it is defined in the global scope (only) ++ and is available without needing to be explicitly declared. ++ ++ This is intended for targets that want to inject declarations of ++ built-in functions into the source language (such as in response ++ to a pragma) rather than providing them in the source language itself. */ ++ tree (*simulate_builtin_function_decl) (tree decl); ++ + /* Used to set up the tree_contains_structure array for a frontend. */ + void (*init_ts) (void); + +@@ -562,6 +575,8 @@ extern tree add_builtin_function_ext_scope (const char *name, tree type, + enum built_in_class cl, + const char *library_name, + tree attrs); ++extern tree simulate_builtin_function_decl (location_t, const char *, tree, ++ int, const char *, tree); + extern tree add_builtin_type (const char *name, tree type); + + /* Language helper functions. */ +diff --git a/gcc/loop-doloop.c b/gcc/loop-doloop.c +index 89714be76..732687dba 100644 +--- a/gcc/loop-doloop.c ++++ b/gcc/loop-doloop.c +@@ -731,7 +731,7 @@ doloop_optimize (struct loop *loop) + bitmap modified = BITMAP_ALLOC (NULL); + + for (rtx_insn *i = doloop_seq; i != NULL; i = NEXT_INSN (i)) +- note_stores (PATTERN (i), record_reg_sets, modified); ++ note_stores (i, record_reg_sets, modified); + + basic_block loop_end = desc->out_edge->src; + bool fail = bitmap_intersect_p (df_get_live_out (loop_end), modified); +diff --git a/gcc/loop-invariant.c b/gcc/loop-invariant.c +index b880ead3d..1af88876c 100644 +--- a/gcc/loop-invariant.c ++++ b/gcc/loop-invariant.c +@@ -2170,7 +2170,7 @@ calculate_loop_reg_pressure (void) + + mark_ref_regs (PATTERN (insn)); + n_regs_set = 0; +- note_stores (PATTERN (insn), mark_reg_clobber, NULL); ++ note_stores (insn, mark_reg_clobber, NULL); + + /* Mark any registers dead after INSN as dead now. 
*/ + +@@ -2183,7 +2183,7 @@ calculate_loop_reg_pressure (void) + Clobbers are processed again, so they conflict with + the registers that are set. */ + +- note_stores (PATTERN (insn), mark_reg_store, NULL); ++ note_stores (insn, mark_reg_store, NULL); + + if (AUTO_INC_DEC) + for (link = REG_NOTES (insn); link; link = XEXP (link, 1)) +diff --git a/gcc/loop-iv.c b/gcc/loop-iv.c +index 340045ce8..1dc3bc74d 100644 +--- a/gcc/loop-iv.c ++++ b/gcc/loop-iv.c +@@ -1967,16 +1967,10 @@ simplify_using_initial_values (struct loop *loop, enum rtx_code op, rtx *expr) + continue; + + CLEAR_REG_SET (this_altered); +- note_stores (PATTERN (insn), mark_altered, this_altered); ++ note_stores (insn, mark_altered, this_altered); + if (CALL_P (insn)) +- { +- /* Kill all call clobbered registers. */ +- unsigned int i; +- hard_reg_set_iterator hrsi; +- EXECUTE_IF_SET_IN_HARD_REG_SET (regs_invalidated_by_call, +- 0, i, hrsi) +- SET_REGNO_REG_SET (this_altered, i); +- } ++ /* Kill all call clobbered registers. */ ++ IOR_REG_SET_HRS (this_altered, regs_invalidated_by_call); + + if (suitable_set_for_replacement (insn, &dest, &src)) + { +diff --git a/gcc/lra-assigns.c b/gcc/lra-assigns.c +index 5c5c73293..a35fc41ac 100644 +--- a/gcc/lra-assigns.c ++++ b/gcc/lra-assigns.c +@@ -94,6 +94,7 @@ along with GCC; see the file COPYING3. If not see + #include "params.h" + #include "lra.h" + #include "lra-int.h" ++#include "function-abi.h" + + /* Current iteration number of the pass and current iteration number + of the pass after the latest spill pass when any former reload +@@ -493,18 +494,15 @@ find_hard_regno_for_1 (int regno, int *cost, int try_only_hard_regno, + HARD_REG_SET impossible_start_hard_regs, available_regs; + + if (hard_reg_set_empty_p (regno_set)) +- COPY_HARD_REG_SET (conflict_set, lra_no_alloc_regs); ++ conflict_set = lra_no_alloc_regs; + else +- { +- COMPL_HARD_REG_SET (conflict_set, regno_set); +- IOR_HARD_REG_SET (conflict_set, lra_no_alloc_regs); +- } ++ conflict_set = ~regno_set | lra_no_alloc_regs; + rclass = regno_allocno_class_array[regno]; + rclass_intersect_p = ira_reg_classes_intersect_p[rclass]; + curr_hard_regno_costs_check++; + sparseset_clear (conflict_reload_and_inheritance_pseudos); + sparseset_clear (live_range_hard_reg_pseudos); +- IOR_HARD_REG_SET (conflict_set, lra_reg_info[regno].conflict_hard_regs); ++ conflict_set |= lra_reg_info[regno].conflict_hard_regs; + biggest_mode = lra_reg_info[regno].biggest_mode; + for (r = lra_reg_info[regno].live_ranges; r != NULL; r = r->next) + { +@@ -614,7 +612,7 @@ find_hard_regno_for_1 (int regno, int *cost, int try_only_hard_regno, + } + /* Make sure that all registers in a multi-word pseudo belong to the + required class. 
*/ +- IOR_COMPL_HARD_REG_SET (conflict_set, reg_class_contents[rclass]); ++ conflict_set |= ~reg_class_contents[rclass]; + lra_assert (rclass != NO_REGS); + rclass_size = ira_class_hard_regs_num[rclass]; + best_hard_regno = -1; +@@ -622,8 +620,7 @@ find_hard_regno_for_1 (int regno, int *cost, int try_only_hard_regno, + biggest_nregs = hard_regno_nregs (hard_regno, biggest_mode); + nregs_diff = (biggest_nregs + - hard_regno_nregs (hard_regno, PSEUDO_REGNO_MODE (regno))); +- COPY_HARD_REG_SET (available_regs, reg_class_contents[rclass]); +- AND_COMPL_HARD_REG_SET (available_regs, lra_no_alloc_regs); ++ available_regs = reg_class_contents[rclass] & ~lra_no_alloc_regs; + for (i = 0; i < rclass_size; i++) + { + if (try_only_hard_regno >= 0) +@@ -658,7 +655,7 @@ find_hard_regno_for_1 (int regno, int *cost, int try_only_hard_regno, + for (j = 0; + j < hard_regno_nregs (hard_regno, PSEUDO_REGNO_MODE (regno)); + j++) +- if (! TEST_HARD_REG_BIT (call_used_reg_set, hard_regno + j) ++ if (! crtl->abi->clobbers_full_reg_p (hard_regno + j) + && ! df_regs_ever_live_p (hard_regno + j)) + /* It needs save restore. */ + hard_regno_costs[hard_regno] +@@ -1219,8 +1216,8 @@ setup_live_pseudos_and_spill_after_risky_transforms (bitmap + sparseset_set_bit (live_range_hard_reg_pseudos, r2->regno); + } + } +- COPY_HARD_REG_SET (conflict_set, lra_no_alloc_regs); +- IOR_HARD_REG_SET (conflict_set, lra_reg_info[regno].conflict_hard_regs); ++ conflict_set = lra_no_alloc_regs; ++ conflict_set |= lra_reg_info[regno].conflict_hard_regs; + val = lra_reg_info[regno].val; + offset = lra_reg_info[regno].offset; + EXECUTE_IF_SET_IN_SPARSESET (live_range_hard_reg_pseudos, conflict_regno) +@@ -1640,14 +1637,14 @@ lra_assign (bool &fails_p) + bitmap_initialize (&all_spilled_pseudos, ®_obstack); + create_live_range_start_chains (); + setup_live_pseudos_and_spill_after_risky_transforms (&all_spilled_pseudos); +- if (! lra_asm_error_p && flag_checking && !flag_ipa_ra) ++ if (! lra_asm_error_p && flag_checking) + /* Check correctness of allocation for call-crossed pseudos but + only when there are no asm errors as in the case of errors the + asm is removed and it can result in incorrect allocation. */ + for (i = FIRST_PSEUDO_REGISTER; i < max_regno; i++) +- if (lra_reg_info[i].nrefs != 0 && reg_renumber[i] >= 0 +- && lra_reg_info[i].call_insn +- && overlaps_hard_reg_set_p (call_used_reg_set, ++ if (lra_reg_info[i].nrefs != 0 ++ && reg_renumber[i] >= 0 ++ && overlaps_hard_reg_set_p (lra_reg_info[i].conflict_hard_regs, + PSEUDO_REGNO_MODE (i), reg_renumber[i])) + gcc_unreachable (); + /* Setup insns to process on the next constraint pass. */ +diff --git a/gcc/lra-constraints.c b/gcc/lra-constraints.c +index f0a2f0491..b34aec227 100644 +--- a/gcc/lra-constraints.c ++++ b/gcc/lra-constraints.c +@@ -131,6 +131,7 @@ + #include "lra.h" + #include "lra-int.h" + #include "print-rtl.h" ++#include "function-abi.h" + + /* Value of LRA_CURR_RELOAD_NUM at the beginning of BB of the current + insn. Remember that LRA_CURR_RELOAD_NUM is the number of emitted +@@ -394,11 +395,24 @@ address_eliminator::~address_eliminator () + *m_index_loc = m_index_reg; + } + +-/* Return true if the eliminated form of AD is a legitimate target address. */ ++/* Return true if the eliminated form of AD is a legitimate target address. ++ If OP is a MEM, AD is the address within OP, otherwise OP should be ++ ignored. CONSTRAINT is one constraint that the operand may need ++ to meet. 
*/ + static bool +-valid_address_p (struct address_info *ad) ++valid_address_p (rtx op, struct address_info *ad, ++ enum constraint_num constraint) + { + address_eliminator eliminator (ad); ++ ++ /* Allow a memory OP if it matches CONSTRAINT, even if CONSTRAINT is more ++ forgiving than "m". */ ++ if (MEM_P (op) ++ && (insn_extra_memory_constraint (constraint) ++ || insn_extra_special_memory_constraint (constraint)) ++ && constraint_satisfied_p (op, constraint)) ++ return true; ++ + return valid_address_p (ad->mode, *ad->outer, ad->as); + } + +@@ -1888,8 +1902,7 @@ prohibited_class_reg_set_mode_p (enum reg_class rclass, + HARD_REG_SET temp; + + lra_assert (hard_reg_set_subset_p (reg_class_contents[rclass], set)); +- COPY_HARD_REG_SET (temp, set); +- AND_COMPL_HARD_REG_SET (temp, lra_no_alloc_regs); ++ temp = set & ~lra_no_alloc_regs; + return (hard_reg_set_subset_p + (temp, ira_prohibited_class_mode_regs[rclass][mode])); + } +@@ -1900,11 +1913,12 @@ prohibited_class_reg_set_mode_p (enum reg_class rclass, + alternative. */ + static unsigned int curr_small_class_check = 0; + +-/* Update number of used inputs of class OP_CLASS for operand NOP. +- Return true if we have more such class operands than the number of +- available regs. */ ++/* Update number of used inputs of class OP_CLASS for operand NOP ++ of alternative NALT. Return true if we have more such class operands ++ than the number of available regs. */ + static bool +-update_and_check_small_class_inputs (int nop, enum reg_class op_class) ++update_and_check_small_class_inputs (int nop, int nalt, ++ enum reg_class op_class) + { + static unsigned int small_class_check[LIM_REG_CLASSES]; + static int small_class_input_nums[LIM_REG_CLASSES]; +@@ -1915,7 +1929,7 @@ update_and_check_small_class_inputs (int nop, enum reg_class op_class) + && hard_reg_set_intersect_p (reg_class_contents[op_class], + ira_no_alloc_regs) + && (curr_static_id->operand[nop].type != OP_OUT +- || curr_static_id->operand[nop].early_clobber)) ++ || TEST_BIT (curr_static_id->operand[nop].early_clobber_alts, nalt))) + { + if (small_class_check[op_class] == curr_small_class_check) + small_class_input_nums[op_class]++; +@@ -2184,7 +2198,8 @@ process_alt_operands (int only_alternative) + /* We should reject matching of an early + clobber operand if the matching operand is + not dying in the insn. */ +- if (! curr_static_id->operand[m].early_clobber ++ if (!TEST_BIT (curr_static_id->operand[m] ++ .early_clobber_alts, nalt) + || operand_reg[nop] == NULL_RTX + || (find_regno_note (curr_insn, REG_DEAD, + REGNO (op)) +@@ -2251,7 +2266,8 @@ process_alt_operands (int only_alternative) + it results in less hard regs required for + the insn than a non-matching earlyclobber + alternative. */ +- if (curr_static_id->operand[m].early_clobber) ++ if (TEST_BIT (curr_static_id->operand[m] ++ .early_clobber_alts, nalt)) + { + if (lra_dump_file != NULL) + fprintf +@@ -2302,7 +2318,7 @@ process_alt_operands (int only_alternative) + reloads. 
*/ + badop = false; + this_alternative = curr_alt[m]; +- COPY_HARD_REG_SET (this_alternative_set, curr_alt_set[m]); ++ this_alternative_set = curr_alt_set[m]; + winreg = this_alternative != NO_REGS; + break; + } +@@ -2387,14 +2403,12 @@ process_alt_operands (int only_alternative) + if (mode == BLKmode) + break; + this_alternative = reg_class_subunion[this_alternative][cl]; +- IOR_HARD_REG_SET (this_alternative_set, +- reg_class_contents[cl]); ++ this_alternative_set |= reg_class_contents[cl]; + if (costly_p) + { + this_costly_alternative + = reg_class_subunion[this_costly_alternative][cl]; +- IOR_HARD_REG_SET (this_costly_alternative_set, +- reg_class_contents[cl]); ++ this_costly_alternative_set |= reg_class_contents[cl]; + } + winreg = true; + if (REG_P (op)) +@@ -2529,14 +2543,11 @@ process_alt_operands (int only_alternative) + + if (this_alternative != NO_REGS) + { +- HARD_REG_SET available_regs; +- +- COPY_HARD_REG_SET (available_regs, +- reg_class_contents[this_alternative]); +- AND_COMPL_HARD_REG_SET +- (available_regs, +- ira_prohibited_class_mode_regs[this_alternative][mode]); +- AND_COMPL_HARD_REG_SET (available_regs, lra_no_alloc_regs); ++ HARD_REG_SET available_regs ++ = (reg_class_contents[this_alternative] ++ & ~((ira_prohibited_class_mode_regs ++ [this_alternative][mode]) ++ | lra_no_alloc_regs)); + if (hard_reg_set_empty_p (available_regs)) + { + /* There are no hard regs holding a value of given +@@ -2892,7 +2903,8 @@ process_alt_operands (int only_alternative) + goto fail; + } + +- if (update_and_check_small_class_inputs (nop, this_alternative)) ++ if (update_and_check_small_class_inputs (nop, nalt, ++ this_alternative)) + { + if (lra_dump_file != NULL) + fprintf (lra_dump_file, +@@ -2901,7 +2913,7 @@ process_alt_operands (int only_alternative) + goto fail; + } + curr_alt[nop] = this_alternative; +- COPY_HARD_REG_SET (curr_alt_set[nop], this_alternative_set); ++ curr_alt_set[nop] = this_alternative_set; + curr_alt_win[nop] = this_alternative_win; + curr_alt_match_win[nop] = this_alternative_match_win; + curr_alt_offmemok[nop] = this_alternative_offmemok; +@@ -3416,7 +3428,7 @@ process_address_1 (int nop, bool check_only_p, + + All these cases involve a non-autoinc address, so there is no + point revalidating other types. */ +- if (ad.autoinc_p || valid_address_p (&ad)) ++ if (ad.autoinc_p || valid_address_p (op, &ad, cn)) + return change_p; + + /* Any index existed before LRA started, so we can assume that the +@@ -3445,7 +3457,7 @@ process_address_1 (int nop, bool check_only_p, + if (code >= 0) + { + *ad.inner = gen_rtx_LO_SUM (Pmode, new_reg, addr); +- if (! valid_address_p (ad.mode, *ad.outer, ad.as)) ++ if (!valid_address_p (op, &ad, cn)) + { + /* Try to put lo_sum into register. */ + insn = emit_insn (gen_rtx_SET +@@ -3455,7 +3467,7 @@ process_address_1 (int nop, bool check_only_p, + if (code >= 0) + { + *ad.inner = new_reg; +- if (! 
valid_address_p (ad.mode, *ad.outer, ad.as)) ++ if (!valid_address_p (op, &ad, cn)) + { + *ad.inner = addr; + code = -1; +@@ -3550,7 +3562,7 @@ process_address_1 (int nop, bool check_only_p, + && CONSTANT_P (XEXP (SET_SRC (set), 1))) + { + *ad.inner = SET_SRC (set); +- if (valid_address_p (ad.mode, *ad.outer, ad.as)) ++ if (valid_address_p (op, &ad, cn)) + { + *ad.base_term = XEXP (SET_SRC (set), 0); + *ad.disp_term = XEXP (SET_SRC (set), 1); +@@ -4573,7 +4585,7 @@ contains_reg_p (rtx x, bool hard_reg_p, bool spilled_p) + regno = lra_get_regno_hard_regno (regno); + if (regno < 0) + return false; +- COMPL_HARD_REG_SET (alloc_regs, lra_no_alloc_regs); ++ alloc_regs = ~lra_no_alloc_regs; + return overlaps_hard_reg_set_p (alloc_regs, GET_MODE (x), regno); + } + else +@@ -5165,6 +5177,14 @@ static int reloads_num; + /* Number of calls passed so far in current EBB. */ + static int calls_num; + ++/* Index ID is the CALLS_NUM associated the last call we saw with ++ ABI identifier ID. */ ++static int last_call_for_abi[NUM_ABI_IDS]; ++ ++/* Which registers have been fully or partially clobbered by a call ++ since they were last used. */ ++static HARD_REG_SET full_and_partial_call_clobbers; ++ + /* Current reload pseudo check for validity of elements in + USAGE_INSNS. */ + static int curr_usage_insns_check; +@@ -5208,6 +5228,10 @@ setup_next_usage_insn (int regno, rtx insn, int reloads_num, bool after_p) + usage_insns[regno].reloads_num = reloads_num; + usage_insns[regno].calls_num = calls_num; + usage_insns[regno].after_p = after_p; ++ if (regno >= FIRST_PSEUDO_REGISTER && reg_renumber[regno] >= 0) ++ remove_from_hard_reg_set (&full_and_partial_call_clobbers, ++ PSEUDO_REGNO_MODE (regno), ++ reg_renumber[regno]); + } + + /* The function is used to form list REGNO usages which consists of +@@ -5453,16 +5477,19 @@ static inline bool + need_for_call_save_p (int regno) + { + lra_assert (regno >= FIRST_PSEUDO_REGISTER && reg_renumber[regno] >= 0); +- return (usage_insns[regno].calls_num < calls_num +- && (overlaps_hard_reg_set_p +- ((flag_ipa_ra && +- ! hard_reg_set_empty_p (lra_reg_info[regno].actual_call_used_reg_set)) +- ? lra_reg_info[regno].actual_call_used_reg_set +- : call_used_reg_set, +- PSEUDO_REGNO_MODE (regno), reg_renumber[regno]) +- || (targetm.hard_regno_call_part_clobbered +- (lra_reg_info[regno].call_insn, +- reg_renumber[regno], PSEUDO_REGNO_MODE (regno))))); ++ if (usage_insns[regno].calls_num < calls_num) ++ { ++ unsigned int abis = 0; ++ for (unsigned int i = 0; i < NUM_ABI_IDS; ++i) ++ if (last_call_for_abi[i] > usage_insns[regno].calls_num) ++ abis |= 1 << i; ++ gcc_assert (abis); ++ if (call_clobbered_in_region_p (abis, full_and_partial_call_clobbers, ++ PSEUDO_REGNO_MODE (regno), ++ reg_renumber[regno])) ++ return true; ++ } ++ return false; + } + + /* Global registers occurring in the current EBB. */ +@@ -5502,8 +5529,7 @@ need_for_split_p (HARD_REG_SET potential_reload_hard_regs, int regno) + true) the assign pass assumes that all pseudos living + through calls are assigned to call saved hard regs. */ + && (regno >= FIRST_PSEUDO_REGISTER +- || ! TEST_HARD_REG_BIT (call_used_reg_set, regno) +- || usage_insns[regno].calls_num == calls_num) ++ || !TEST_HARD_REG_BIT (full_and_partial_call_clobbers, regno)) + /* We need at least 2 reloads to make pseudo splitting + profitable. 
We should provide hard regno splitting in + any case to solve 1st insn scheduling problem when +@@ -6255,12 +6281,14 @@ inherit_in_ebb (rtx_insn *head, rtx_insn *tail) + curr_usage_insns_check++; + clear_invariants (); + reloads_num = calls_num = 0; ++ for (unsigned int i = 0; i < NUM_ABI_IDS; ++i) ++ last_call_for_abi[i] = 0; ++ CLEAR_HARD_REG_SET (full_and_partial_call_clobbers); + bitmap_clear (&check_only_regs); + bitmap_clear (&invalid_invariant_regs); + last_processed_bb = NULL; + CLEAR_HARD_REG_SET (potential_reload_hard_regs); +- COPY_HARD_REG_SET (live_hard_regs, eliminable_regset); +- IOR_HARD_REG_SET (live_hard_regs, lra_no_alloc_regs); ++ live_hard_regs = eliminable_regset | lra_no_alloc_regs; + /* We don't process new insns generated in the loop. */ + for (curr_insn = tail; curr_insn != PREV_INSN (head); curr_insn = prev_insn) + { +@@ -6330,8 +6358,7 @@ inherit_in_ebb (rtx_insn *head, rtx_insn *tail) + else + setup_next_usage_insn (src_regno, curr_insn, reloads_num, false); + if (hard_reg_set_subset_p (reg_class_contents[cl], live_hard_regs)) +- IOR_HARD_REG_SET (potential_reload_hard_regs, +- reg_class_contents[cl]); ++ potential_reload_hard_regs |= reg_class_contents[cl]; + } + else if (src_regno < 0 + && dst_regno >= lra_constraint_new_regno_start +@@ -6348,8 +6375,7 @@ inherit_in_ebb (rtx_insn *head, rtx_insn *tail) + if (process_invariant_for_inheritance (SET_DEST (curr_set), SET_SRC (curr_set))) + change_p = true; + if (hard_reg_set_subset_p (reg_class_contents[cl], live_hard_regs)) +- IOR_HARD_REG_SET (potential_reload_hard_regs, +- reg_class_contents[cl]); ++ potential_reload_hard_regs |= reg_class_contents[cl]; + } + else if (src_regno >= lra_constraint_new_regno_start + && dst_regno < lra_constraint_new_regno_start +@@ -6371,8 +6397,7 @@ inherit_in_ebb (rtx_insn *head, rtx_insn *tail) + /* Invalidate. */ + usage_insns[dst_regno].check = 0; + if (hard_reg_set_subset_p (reg_class_contents[cl], live_hard_regs)) +- IOR_HARD_REG_SET (potential_reload_hard_regs, +- reg_class_contents[cl]); ++ potential_reload_hard_regs |= reg_class_contents[cl]; + } + else if (INSN_P (curr_insn)) + { +@@ -6427,8 +6452,8 @@ inherit_in_ebb (rtx_insn *head, rtx_insn *tail) + else + add_to_hard_reg_set (&s, PSEUDO_REGNO_MODE (dst_regno), + reg_renumber[dst_regno]); +- AND_COMPL_HARD_REG_SET (live_hard_regs, s); +- AND_COMPL_HARD_REG_SET (potential_reload_hard_regs, s); ++ live_hard_regs &= ~s; ++ potential_reload_hard_regs &= ~s; + } + /* We should invalidate potential inheritance or + splitting for the current insn usages to the next +@@ -6472,6 +6497,10 @@ inherit_in_ebb (rtx_insn *head, rtx_insn *tail) + int regno, hard_regno; + + calls_num++; ++ function_abi callee_abi = insn_callee_abi (curr_insn); ++ last_call_for_abi[callee_abi.id ()] = calls_num; ++ full_and_partial_call_clobbers ++ |= callee_abi.full_and_partial_reg_clobbers (); + if ((cheap = find_reg_note (curr_insn, + REG_RETURNED, NULL_RTX)) != NULL_RTX + && ((cheap = XEXP (cheap, 0)), true) +@@ -6481,7 +6510,7 @@ inherit_in_ebb (rtx_insn *head, rtx_insn *tail) + /* If there are pending saves/restores, the + optimization is not worth. 
*/ + && usage_insns[regno].calls_num == calls_num - 1 +- && TEST_HARD_REG_BIT (call_used_reg_set, hard_regno)) ++ && callee_abi.clobbers_reg_p (GET_MODE (cheap), hard_regno)) + { + /* Restore the pseudo from the call result as + REG_RETURNED note says that the pseudo value is +@@ -6504,6 +6533,9 @@ inherit_in_ebb (rtx_insn *head, rtx_insn *tail) + /* We don't need to save/restore of the pseudo from + this call. */ + usage_insns[regno].calls_num = calls_num; ++ remove_from_hard_reg_set ++ (&full_and_partial_call_clobbers, ++ GET_MODE (cheap), hard_regno); + bitmap_set_bit (&check_only_regs, regno); + } + } +@@ -6607,8 +6639,7 @@ inherit_in_ebb (rtx_insn *head, rtx_insn *tail) + if (ira_class_hard_regs_num[cl] <= max_small_class_regs_num) + reloads_num++; + if (hard_reg_set_subset_p (reg_class_contents[cl], live_hard_regs)) +- IOR_HARD_REG_SET (potential_reload_hard_regs, +- reg_class_contents[cl]); ++ potential_reload_hard_regs |= reg_class_contents[cl]; + } + } + if (NONDEBUG_INSN_P (curr_insn)) +diff --git a/gcc/lra-eliminations.c b/gcc/lra-eliminations.c +index 7a345a52a..9568c13cb 100644 +--- a/gcc/lra-eliminations.c ++++ b/gcc/lra-eliminations.c +@@ -654,7 +654,6 @@ lra_eliminate_regs_1 (rtx_insn *insn, rtx x, machine_mode mem_mode, + return x; + + case CLOBBER: +- case CLOBBER_HIGH: + case SET: + gcc_unreachable (); + +@@ -807,16 +806,6 @@ mark_not_eliminable (rtx x, machine_mode mem_mode) + setup_can_eliminate (ep, false); + return; + +- case CLOBBER_HIGH: +- gcc_assert (REG_P (XEXP (x, 0))); +- gcc_assert (REGNO (XEXP (x, 0)) < FIRST_PSEUDO_REGISTER); +- for (ep = reg_eliminate; +- ep < ®_eliminate[NUM_ELIMINABLE_REGS]; +- ep++) +- if (reg_is_clobbered_by_clobber_high (ep->to_rtx, XEXP (x, 0))) +- setup_can_eliminate (ep, false); +- return; +- + case SET: + if (SET_DEST (x) == stack_pointer_rtx + && GET_CODE (SET_SRC (x)) == PLUS +@@ -1180,7 +1169,7 @@ spill_pseudos (HARD_REG_SET set) + reg_renumber[i] = -1; + bitmap_ior_into (&to_process, &lra_reg_info[i].insn_bitmap); + } +- IOR_HARD_REG_SET (lra_no_alloc_regs, set); ++ lra_no_alloc_regs |= set; + for (insn = get_insns (); insn != NULL_RTX; insn = NEXT_INSN (insn)) + if (bitmap_bit_p (&to_process, INSN_UID (insn))) + { +@@ -1293,8 +1282,8 @@ update_reg_eliminate (bitmap insns_with_changed_offsets) + result = true; + } + } +- IOR_HARD_REG_SET (lra_no_alloc_regs, temp_hard_reg_set); +- AND_COMPL_HARD_REG_SET (eliminable_regset, temp_hard_reg_set); ++ lra_no_alloc_regs |= temp_hard_reg_set; ++ eliminable_regset &= ~temp_hard_reg_set; + spill_pseudos (temp_hard_reg_set); + return result; + } +diff --git a/gcc/lra-int.h b/gcc/lra-int.h +index 253ae1e6c..5671e2e65 100644 +--- a/gcc/lra-int.h ++++ b/gcc/lra-int.h +@@ -72,10 +72,6 @@ struct lra_reg + /* The following fields are defined only for pseudos. */ + /* Hard registers with which the pseudo conflicts. */ + HARD_REG_SET conflict_hard_regs; +- /* Call used registers with which the pseudo conflicts, taking into account +- the registers used by functions called from calls which cross the +- pseudo. */ +- HARD_REG_SET actual_call_used_reg_set; + /* We assign hard registers to reload pseudos which can occur in few + places. So two hard register preferences are enough for them. + The following fields define the preferred hard registers. If +@@ -103,8 +99,6 @@ struct lra_reg + int val; + /* Offset from relative eliminate register to pesudo reg. */ + poly_int64 offset; +- /* Call instruction, if any, that may affect this psuedo reg. 
*/ +- rtx_insn *call_insn; + /* These members are set up in lra-lives.c and updated in + lra-coalesce.c. */ + /* The biggest size mode in which each pseudo reg is referred in +@@ -141,10 +135,6 @@ struct lra_operand_data + unsigned int strict_low : 1; + /* True if the operand is an operator. */ + unsigned int is_operator : 1; +- /* True if there is an early clobber alternative for this operand. +- This field is set up every time when corresponding +- operand_alternative in lra_static_insn_data is set up. */ +- unsigned int early_clobber : 1; + /* True if the operand is an address. */ + unsigned int is_address : 1; + }; +@@ -163,11 +153,6 @@ struct lra_insn_reg + /* True if the reg is accessed through a subreg and the subreg is + just a part of the register. */ + unsigned int subreg_p : 1; +- /* True if there is an early clobber alternative for this +- operand. */ +- unsigned int early_clobber : 1; +- /* True if the reg is clobber highed by the operand. */ +- unsigned int clobber_high : 1; + /* The corresponding regno of the register. */ + int regno; + /* Next reg info of the same insn. */ +diff --git a/gcc/lra-lives.c b/gcc/lra-lives.c +index 55b2adc2a..bce123d73 100644 +--- a/gcc/lra-lives.c ++++ b/gcc/lra-lives.c +@@ -43,6 +43,7 @@ along with GCC; see the file COPYING3. If not see + #include "sparseset.h" + #include "lra-int.h" + #include "target.h" ++#include "function-abi.h" + + /* Program points are enumerated by numbers from range + 0..LRA_LIVE_MAX_POINT-1. There are approximately two times more +@@ -327,7 +328,7 @@ static void + mark_pseudo_dead (int regno) + { + lra_assert (!HARD_REGISTER_NUM_P (regno)); +- IOR_HARD_REG_SET (lra_reg_info[regno].conflict_hard_regs, hard_regs_live); ++ lra_reg_info[regno].conflict_hard_regs |= hard_regs_live; + if (!sparseset_bit_p (pseudos_live, regno)) + return; + +@@ -574,41 +575,21 @@ lra_setup_reload_pseudo_preferenced_hard_reg (int regno, + } + } + +-/* Check that REGNO living through calls and setjumps, set up conflict +- regs using LAST_CALL_USED_REG_SET, and clear corresponding bits in +- PSEUDOS_LIVE_THROUGH_CALLS and PSEUDOS_LIVE_THROUGH_SETJUMPS. +- CALL_INSN is a call that is representative of all calls in the region +- described by the PSEUDOS_LIVE_THROUGH_* sets, in terms of the registers +- that it preserves and clobbers. */ ++/* Check whether REGNO lives through calls and setjmps and clear ++ the corresponding bits in PSEUDOS_LIVE_THROUGH_CALLS and ++ PSEUDOS_LIVE_THROUGH_SETJUMPS. All calls in the region described ++ by PSEUDOS_LIVE_THROUGH_CALLS have the given ABI. */ + + static inline void +-check_pseudos_live_through_calls (int regno, +- HARD_REG_SET last_call_used_reg_set, +- rtx_insn *call_insn) ++check_pseudos_live_through_calls (int regno, const function_abi &abi) + { +- int hr; +- rtx_insn *old_call_insn; +- + if (! 
sparseset_bit_p (pseudos_live_through_calls, regno)) + return; + +- gcc_assert (call_insn && CALL_P (call_insn)); +- old_call_insn = lra_reg_info[regno].call_insn; +- if (!old_call_insn +- || (targetm.return_call_with_max_clobbers +- && targetm.return_call_with_max_clobbers (old_call_insn, call_insn) +- == call_insn)) +- lra_reg_info[regno].call_insn = call_insn; ++ machine_mode mode = PSEUDO_REGNO_MODE (regno); + + sparseset_clear_bit (pseudos_live_through_calls, regno); +- IOR_HARD_REG_SET (lra_reg_info[regno].conflict_hard_regs, +- last_call_used_reg_set); +- +- for (hr = 0; HARD_REGISTER_NUM_P (hr); hr++) +- if (targetm.hard_regno_call_part_clobbered (call_insn, hr, +- PSEUDO_REGNO_MODE (regno))) +- add_to_hard_reg_set (&lra_reg_info[regno].conflict_hard_regs, +- PSEUDO_REGNO_MODE (regno), hr); ++ lra_reg_info[regno].conflict_hard_regs |= abi.mode_clobbers (mode); + if (! sparseset_bit_p (pseudos_live_through_setjumps, regno)) + return; + sparseset_clear_bit (pseudos_live_through_setjumps, regno); +@@ -623,23 +604,10 @@ check_pseudos_live_through_calls (int regno, + static inline bool + reg_early_clobber_p (const struct lra_insn_reg *reg, int n_alt) + { +- return (reg->early_clobber +- && (n_alt == LRA_UNKNOWN_ALT +- || (n_alt != LRA_NON_CLOBBERED_ALT +- && TEST_BIT (reg->early_clobber_alts, n_alt)))); +-} +- +-/* Return true if call instructions CALL1 and CALL2 use ABIs that +- preserve the same set of registers. */ +- +-static bool +-calls_have_same_clobbers_p (rtx_insn *call1, rtx_insn *call2) +-{ +- if (!targetm.return_call_with_max_clobbers) +- return false; +- +- return (targetm.return_call_with_max_clobbers (call1, call2) == call1 +- && targetm.return_call_with_max_clobbers (call2, call1) == call2); ++ return (n_alt == LRA_UNKNOWN_ALT ++ ? reg->early_clobber_alts != 0 ++ : (n_alt != LRA_NON_CLOBBERED_ALT ++ && TEST_BIT (reg->early_clobber_alts, n_alt))); + } + + /* Process insns of the basic block BB to update pseudo live ranges, +@@ -661,17 +629,15 @@ process_bb_lives (basic_block bb, int &curr_point, bool dead_insn_p) + rtx_insn *next; + rtx link, *link_loc; + bool need_curr_point_incr; +- HARD_REG_SET last_call_used_reg_set; +- rtx_insn *call_insn = NULL; +- rtx_insn *last_call_insn = NULL; ++ /* Only has a meaningful value once we've seen a call. 
*/ ++ function_abi last_call_abi = default_function_abi; + + reg_live_out = df_get_live_out (bb); + sparseset_clear (pseudos_live); + sparseset_clear (pseudos_live_through_calls); + sparseset_clear (pseudos_live_through_setjumps); +- CLEAR_HARD_REG_SET (last_call_used_reg_set); + REG_SET_TO_HARD_REG_SET (hard_regs_live, reg_live_out); +- AND_COMPL_HARD_REG_SET (hard_regs_live, eliminable_regset); ++ hard_regs_live &= ~eliminable_regset; + EXECUTE_IF_SET_IN_BITMAP (reg_live_out, FIRST_PSEUDO_REGISTER, j, bi) + { + update_pseudo_point (j, curr_point, USE_POINT); +@@ -701,7 +667,7 @@ process_bb_lives (basic_block bb, int &curr_point, bool dead_insn_p) + bool call_p; + int n_alt, dst_regno, src_regno; + rtx set; +- struct lra_insn_reg *reg, *hr; ++ struct lra_insn_reg *reg; + + if (!NONDEBUG_INSN_P (curr_insn)) + continue; +@@ -733,7 +699,7 @@ process_bb_lives (basic_block bb, int &curr_point, bool dead_insn_p) + break; + } + for (reg = curr_static_id->hard_regs; reg != NULL; reg = reg->next) +- if (reg->type != OP_IN && !reg->clobber_high) ++ if (reg->type != OP_IN) + { + remove_p = false; + break; +@@ -870,24 +836,13 @@ process_bb_lives (basic_block bb, int &curr_point, bool dead_insn_p) + unused values because they still conflict with quantities + that are live at the time of the definition. */ + for (reg = curr_id->regs; reg != NULL; reg = reg->next) +- { +- if (reg->type != OP_IN) +- { +- update_pseudo_point (reg->regno, curr_point, USE_POINT); +- mark_regno_live (reg->regno, reg->biggest_mode); +- check_pseudos_live_through_calls (reg->regno, +- last_call_used_reg_set, +- call_insn); +- } +- +- if (!HARD_REGISTER_NUM_P (reg->regno)) +- for (hr = curr_static_id->hard_regs; hr != NULL; hr = hr->next) +- if (hr->clobber_high +- && maybe_gt (GET_MODE_SIZE (PSEUDO_REGNO_MODE (reg->regno)), +- GET_MODE_SIZE (hr->biggest_mode))) +- SET_HARD_REG_BIT (lra_reg_info[reg->regno].conflict_hard_regs, +- hr->regno); +- } ++ if (reg->type != OP_IN) ++ { ++ update_pseudo_point (reg->regno, curr_point, USE_POINT); ++ mark_regno_live (reg->regno, reg->biggest_mode); ++ /* ??? Should be a no-op for unused registers. */ ++ check_pseudos_live_through_calls (reg->regno, last_call_abi); ++ } + + for (reg = curr_static_id->hard_regs; reg != NULL; reg = reg->next) + if (reg->type != OP_IN) +@@ -926,35 +881,13 @@ process_bb_lives (basic_block bb, int &curr_point, bool dead_insn_p) + + if (call_p) + { +- call_insn = curr_insn; +- if (! flag_ipa_ra && ! targetm.return_call_with_max_clobbers) +- COPY_HARD_REG_SET(last_call_used_reg_set, call_used_reg_set); +- else +- { +- HARD_REG_SET this_call_used_reg_set; +- get_call_reg_set_usage (curr_insn, &this_call_used_reg_set, +- call_used_reg_set); +- +- bool flush = (! hard_reg_set_empty_p (last_call_used_reg_set) +- && ( ! hard_reg_set_equal_p (last_call_used_reg_set, +- this_call_used_reg_set))) +- || (last_call_insn && ! 
calls_have_same_clobbers_p +- (call_insn, +- last_call_insn)); +- +- EXECUTE_IF_SET_IN_SPARSESET (pseudos_live, j) +- { +- IOR_HARD_REG_SET (lra_reg_info[j].actual_call_used_reg_set, +- this_call_used_reg_set); ++ function_abi call_abi = insn_callee_abi (curr_insn); + +- if (flush) +- check_pseudos_live_through_calls (j, +- last_call_used_reg_set, +- last_call_insn); +- } +- COPY_HARD_REG_SET(last_call_used_reg_set, this_call_used_reg_set); +- last_call_insn = call_insn; +- } ++ if (last_call_abi != call_abi) ++ EXECUTE_IF_SET_IN_SPARSESET (pseudos_live, j) ++ check_pseudos_live_through_calls (j, last_call_abi); ++ ++ last_call_abi = call_abi; + + sparseset_ior (pseudos_live_through_calls, + pseudos_live_through_calls, pseudos_live); +@@ -992,9 +925,7 @@ process_bb_lives (basic_block bb, int &curr_point, bool dead_insn_p) + if (reg->type == OP_IN) + update_pseudo_point (reg->regno, curr_point, USE_POINT); + mark_regno_live (reg->regno, reg->biggest_mode); +- check_pseudos_live_through_calls (reg->regno, +- last_call_used_reg_set, +- call_insn); ++ check_pseudos_live_through_calls (reg->regno, last_call_abi); + } + + for (reg = curr_static_id->hard_regs; reg != NULL; reg = reg->next) +@@ -1088,10 +1019,10 @@ process_bb_lives (basic_block bb, int &curr_point, bool dead_insn_p) + } + + /* Pseudos can't go in stack regs at the start of a basic block that +- is reached by an abnormal edge. Likewise for call clobbered regs, +- because caller-save, fixup_abnormal_edges and possibly the table +- driven EH machinery are not quite ready to handle such pseudos +- live across such edges. */ ++ is reached by an abnormal edge. Likewise for registers that are at ++ least partly call clobbered, because caller-save, fixup_abnormal_edges ++ and possibly the table driven EH machinery are not quite ready to ++ handle such pseudos live across such edges. */ + if (bb_has_abnormal_pred (bb)) + { + #ifdef STACK_REGS +@@ -1106,7 +1037,7 @@ process_bb_lives (basic_block bb, int &curr_point, bool dead_insn_p) + if (!cfun->has_nonlocal_label + && has_abnormal_call_or_eh_pred_edge_p (bb)) + for (px = 0; HARD_REGISTER_NUM_P (px); px++) +- if (call_used_regs[px] ++ if (eh_edge_abi.clobbers_at_least_part_of_reg_p (px) + #ifdef REAL_PIC_OFFSET_TABLE_REGNUM + /* We should create a conflict of PIC pseudo with PIC + hard reg as PIC hard reg can have a wrong value after +@@ -1163,7 +1094,7 @@ process_bb_lives (basic_block bb, int &curr_point, bool dead_insn_p) + if (sparseset_cardinality (pseudos_live_through_calls) == 0) + break; + if (sparseset_bit_p (pseudos_live_through_calls, j)) +- check_pseudos_live_through_calls (j, last_call_used_reg_set, call_insn); ++ check_pseudos_live_through_calls (j, last_call_abi); + } + + for (i = 0; HARD_REGISTER_NUM_P (i); ++i) +@@ -1397,7 +1328,6 @@ lra_create_live_ranges_1 (bool all_p, bool dead_insn_p) + lra_reg_info[i].biggest_mode = GET_MODE (regno_reg_rtx[i]); + else + lra_reg_info[i].biggest_mode = VOIDmode; +- lra_reg_info[i].call_insn = NULL; + if (!HARD_REGISTER_NUM_P (i) + && lra_reg_info[i].nrefs != 0) + { +diff --git a/gcc/lra-remat.c b/gcc/lra-remat.c +index 69209b2a1..914f5e2ce 100644 +--- a/gcc/lra-remat.c ++++ b/gcc/lra-remat.c +@@ -65,16 +65,11 @@ along with GCC; see the file COPYING3. If not see + #include "recog.h" + #include "lra.h" + #include "lra-int.h" ++#include "function-abi.h" + + /* Number of candidates for rematerialization. 
*/ + static unsigned int cands_num; + +-/* The following is used for representation of call_used_reg_set in +- form array whose elements are hard register numbers with nonzero bit +- in CALL_USED_REG_SET. */ +-static int call_used_regs_arr_len; +-static int call_used_regs_arr[FIRST_PSEUDO_REGISTER]; +- + /* Bitmap used for different calculations. */ + static bitmap_head temp_bitmap; + +@@ -632,9 +627,12 @@ set_bb_regs (basic_block bb, rtx_insn *insn) + bitmap_set_bit (&subreg_regs, regno); + } + if (CALL_P (insn)) +- for (int i = 0; i < call_used_regs_arr_len; i++) +- bitmap_set_bit (&get_remat_bb_data (bb)->dead_regs, +- call_used_regs_arr[i]); ++ { ++ /* Partially-clobbered registers might still be live. */ ++ HARD_REG_SET clobbers = insn_callee_abi (insn).full_reg_clobbers (); ++ bitmap_ior_into (&get_remat_bb_data (bb)->dead_regs, ++ bitmap_view (clobbers)); ++ } + } + + /* Calculate changed_regs and dead_regs for each BB. */ +@@ -697,7 +695,7 @@ reg_overlap_for_remat_p (lra_insn_reg *reg, rtx_insn *insn) + + /* Return true if a call used register is an input operand of INSN. */ + static bool +-call_used_input_regno_present_p (rtx_insn *insn) ++call_used_input_regno_present_p (const function_abi &abi, rtx_insn *insn) + { + int iter; + lra_insn_recog_data_t id = lra_get_insn_recog_data (insn); +@@ -708,8 +706,9 @@ call_used_input_regno_present_p (rtx_insn *insn) + for (reg = (iter == 0 ? id->regs : static_id->hard_regs); + reg != NULL; + reg = reg->next) +- if (reg->type == OP_IN && reg->regno < FIRST_PSEUDO_REGISTER +- && TEST_HARD_REG_BIT (call_used_reg_set, reg->regno)) ++ if (reg->type == OP_IN ++ && reg->regno < FIRST_PSEUDO_REGISTER ++ && abi.clobbers_reg_p (reg->biggest_mode, reg->regno)) + return true; + return false; + } +@@ -798,18 +797,21 @@ calculate_gen_cands (void) + } + + if (CALL_P (insn)) +- EXECUTE_IF_SET_IN_BITMAP (gen_insns, 0, uid, bi) +- { +- rtx_insn *insn2 = lra_insn_recog_data[uid]->insn; ++ { ++ function_abi callee_abi = insn_callee_abi (insn); ++ EXECUTE_IF_SET_IN_BITMAP (gen_insns, 0, uid, bi) ++ { ++ rtx_insn *insn2 = lra_insn_recog_data[uid]->insn; + +- cand = insn_to_cand[INSN_UID (insn2)]; +- gcc_assert (cand != NULL); +- if (call_used_input_regno_present_p (insn2)) +- { +- bitmap_clear_bit (gen_cands, cand->index); +- bitmap_set_bit (&temp_bitmap, uid); +- } +- } ++ cand = insn_to_cand[INSN_UID (insn2)]; ++ gcc_assert (cand != NULL); ++ if (call_used_input_regno_present_p (callee_abi, insn2)) ++ { ++ bitmap_clear_bit (gen_cands, cand->index); ++ bitmap_set_bit (&temp_bitmap, uid); ++ } ++ } ++ } + bitmap_and_compl_into (gen_insns, &temp_bitmap); + + cand = insn_to_cand[INSN_UID (insn)]; +@@ -1204,13 +1206,16 @@ do_remat (void) + } + + if (CALL_P (insn)) +- EXECUTE_IF_SET_IN_BITMAP (avail_cands, 0, cid, bi) +- { +- cand = all_cands[cid]; ++ { ++ function_abi callee_abi = insn_callee_abi (insn); ++ EXECUTE_IF_SET_IN_BITMAP (avail_cands, 0, cid, bi) ++ { ++ cand = all_cands[cid]; + +- if (call_used_input_regno_present_p (cand->insn)) +- bitmap_set_bit (&temp_bitmap, cand->index); +- } ++ if (call_used_input_regno_present_p (callee_abi, cand->insn)) ++ bitmap_set_bit (&temp_bitmap, cand->index); ++ } ++ } + + bitmap_and_compl_into (avail_cands, &temp_bitmap); + +@@ -1306,10 +1311,6 @@ lra_remat (void) + insn_to_cand_activation = XCNEWVEC (cand_t, get_max_uid ()); + regno_cands = XCNEWVEC (cand_t, max_regno); + all_cands.create (8000); +- call_used_regs_arr_len = 0; +- for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++) +- if (call_used_regs[i]) +- 
call_used_regs_arr[call_used_regs_arr_len++] = i; + initiate_cand_table (); + create_remat_bb_data (); + bitmap_initialize (&temp_bitmap, ®_obstack); +diff --git a/gcc/lra-spills.c b/gcc/lra-spills.c +index c0f61c119..d4163eb75 100644 +--- a/gcc/lra-spills.c ++++ b/gcc/lra-spills.c +@@ -242,7 +242,7 @@ assign_spill_hard_regs (int *pseudo_regnos, int n) + /* Set up reserved hard regs for every program point. */ + reserved_hard_regs = XNEWVEC (HARD_REG_SET, lra_live_max_point); + for (p = 0; p < lra_live_max_point; p++) +- COPY_HARD_REG_SET (reserved_hard_regs[p], lra_no_alloc_regs); ++ reserved_hard_regs[p] = lra_no_alloc_regs; + for (i = FIRST_PSEUDO_REGISTER; i < regs_num; i++) + if (lra_reg_info[i].nrefs != 0 + && (hard_regno = lra_get_regno_hard_regno (i)) >= 0) +@@ -273,11 +273,10 @@ assign_spill_hard_regs (int *pseudo_regnos, int n) + continue; + } + lra_assert (spill_class != NO_REGS); +- COPY_HARD_REG_SET (conflict_hard_regs, +- lra_reg_info[regno].conflict_hard_regs); ++ conflict_hard_regs = lra_reg_info[regno].conflict_hard_regs; + for (r = lra_reg_info[regno].live_ranges; r != NULL; r = r->next) + for (p = r->start; p <= r->finish; p++) +- IOR_HARD_REG_SET (conflict_hard_regs, reserved_hard_regs[p]); ++ conflict_hard_regs |= reserved_hard_regs[p]; + spill_class_size = ira_class_hard_regs_num[spill_class]; + mode = lra_reg_info[regno].biggest_mode; + for (k = 0; k < spill_class_size; k++) +diff --git a/gcc/lra.c b/gcc/lra.c +index 10b85340f..db2f82fb1 100644 +--- a/gcc/lra.c ++++ b/gcc/lra.c +@@ -121,6 +121,7 @@ along with GCC; see the file COPYING3. If not see + #include "lra.h" + #include "lra-int.h" + #include "print-rtl.h" ++#include "function-abi.h" + + /* Dump bitmap SET with TITLE and BB INDEX. */ + void +@@ -536,18 +537,15 @@ object_allocator lra_insn_reg_pool ("insn regs"); + + /* Create LRA insn related info about a reference to REGNO in INSN + with TYPE (in/out/inout), biggest reference mode MODE, flag that it +- is reference through subreg (SUBREG_P), flag that is early +- clobbered in the insn (EARLY_CLOBBER), and reference to the next ++ is reference through subreg (SUBREG_P), and reference to the next + insn reg info (NEXT). If REGNO can be early clobbered, + alternatives in which it can be early clobbered are given by +- EARLY_CLOBBER_ALTS. CLOBBER_HIGH marks if reference is a clobber +- high. */ ++ EARLY_CLOBBER_ALTS. */ + static struct lra_insn_reg * + new_insn_reg (rtx_insn *insn, int regno, enum op_type type, +- machine_mode mode, +- bool subreg_p, bool early_clobber, ++ machine_mode mode, bool subreg_p, + alternative_mask early_clobber_alts, +- struct lra_insn_reg *next, bool clobber_high) ++ struct lra_insn_reg *next) + { + lra_insn_reg *ir = lra_insn_reg_pool.allocate (); + ir->type = type; +@@ -556,9 +554,7 @@ new_insn_reg (rtx_insn *insn, int regno, enum op_type type, + && partial_subreg_p (lra_reg_info[regno].biggest_mode, mode)) + lra_reg_info[regno].biggest_mode = mode; + ir->subreg_p = subreg_p; +- ir->early_clobber = early_clobber; + ir->early_clobber_alts = early_clobber_alts; +- ir->clobber_high = clobber_high; + ir->regno = regno; + ir->next = next; + return ir; +@@ -605,7 +601,7 @@ static struct lra_operand_data debug_operand_data = + 0, /* early_clobber_alts */ + E_VOIDmode, /* We are not interesting in the operand mode. 
*/ + OP_IN, +- 0, 0, 0, 0 ++ 0, 0, 0 + }; + + /* The following data are used as static insn data for all debug +@@ -801,7 +797,6 @@ setup_operand_alternative (lra_insn_recog_data_t data, + for (i = 0; i < nop; i++) + { + static_data->operand[i].early_clobber_alts = 0; +- static_data->operand[i].early_clobber = false; + static_data->operand[i].is_address = false; + if (static_data->operand[i].constraint[0] == '%') + { +@@ -817,7 +812,6 @@ setup_operand_alternative (lra_insn_recog_data_t data, + for (j = 0; j < nalt; j++) + for (i = 0; i < nop; i++, op_alt++) + { +- static_data->operand[i].early_clobber |= op_alt->earlyclobber; + if (op_alt->earlyclobber) + static_data->operand[i].early_clobber_alts |= (alternative_mask) 1 << j; + static_data->operand[i].is_address |= op_alt->is_address; +@@ -828,13 +822,12 @@ setup_operand_alternative (lra_insn_recog_data_t data, + not the insn operands, in X with TYPE (in/out/inout) and flag that + it is early clobbered in the insn (EARLY_CLOBBER) and add the info + to LIST. X is a part of insn given by DATA. Return the result +- list. CLOBBER_HIGH marks if X is a clobber high. */ ++ list. */ + static struct lra_insn_reg * + collect_non_operand_hard_regs (rtx_insn *insn, rtx *x, + lra_insn_recog_data_t data, + struct lra_insn_reg *list, +- enum op_type type, bool early_clobber, +- bool clobber_high) ++ enum op_type type, bool early_clobber) + { + int i, j, regno, last; + bool subreg_p; +@@ -878,10 +871,7 @@ collect_non_operand_hard_regs (rtx_insn *insn, rtx *x, + if (curr->type != type) + curr->type = OP_INOUT; + if (early_clobber) +- { +- curr->early_clobber = true; +- curr->early_clobber_alts = ALL_ALTERNATIVES; +- } ++ curr->early_clobber_alts = ALL_ALTERNATIVES; + break; + } + if (curr == NULL) +@@ -897,9 +887,7 @@ collect_non_operand_hard_regs (rtx_insn *insn, rtx *x, + && regno <= LAST_STACK_REG)); + #endif + list = new_insn_reg (data->insn, regno, type, mode, subreg_p, +- early_clobber, +- early_clobber ? ALL_ALTERNATIVES : 0, list, +- clobber_high); ++ early_clobber ? ALL_ALTERNATIVES : 0, list); + } + } + return list; +@@ -908,31 +896,24 @@ collect_non_operand_hard_regs (rtx_insn *insn, rtx *x, + { + case SET: + list = collect_non_operand_hard_regs (insn, &SET_DEST (op), data, +- list, OP_OUT, false, false); ++ list, OP_OUT, false); + list = collect_non_operand_hard_regs (insn, &SET_SRC (op), data, +- list, OP_IN, false, false); ++ list, OP_IN, false); + break; + case CLOBBER: + /* We treat clobber of non-operand hard registers as early clobber. */ + list = collect_non_operand_hard_regs (insn, &XEXP (op, 0), data, +- list, OP_OUT, true, false); +- break; +- case CLOBBER_HIGH: +- /* Clobber high should always span exactly one register. */ +- gcc_assert (REG_NREGS (XEXP (op, 0)) == 1); +- /* We treat clobber of non-operand hard registers as early clobber. 
*/ +- list = collect_non_operand_hard_regs (insn, &XEXP (op, 0), data, +- list, OP_OUT, true, true); ++ list, OP_OUT, true); + break; + case PRE_INC: case PRE_DEC: case POST_INC: case POST_DEC: + list = collect_non_operand_hard_regs (insn, &XEXP (op, 0), data, +- list, OP_INOUT, false, false); ++ list, OP_INOUT, false); + break; + case PRE_MODIFY: case POST_MODIFY: + list = collect_non_operand_hard_regs (insn, &XEXP (op, 0), data, +- list, OP_INOUT, false, false); ++ list, OP_INOUT, false); + list = collect_non_operand_hard_regs (insn, &XEXP (op, 1), data, +- list, OP_IN, false, false); ++ list, OP_IN, false); + break; + default: + fmt = GET_RTX_FORMAT (code); +@@ -940,12 +921,11 @@ collect_non_operand_hard_regs (rtx_insn *insn, rtx *x, + { + if (fmt[i] == 'e') + list = collect_non_operand_hard_regs (insn, &XEXP (op, i), data, +- list, OP_IN, false, false); ++ list, OP_IN, false); + else if (fmt[i] == 'E') + for (j = XVECLEN (op, i) - 1; j >= 0; j--) + list = collect_non_operand_hard_regs (insn, &XVECEXP (op, i, j), +- data, list, OP_IN, false, +- false); ++ data, list, OP_IN, false); + } + } + return list; +@@ -1094,7 +1074,7 @@ lra_set_insn_recog_data (rtx_insn *insn) + else + insn_static_data->hard_regs + = collect_non_operand_hard_regs (insn, &PATTERN (insn), data, +- NULL, OP_IN, false, false); ++ NULL, OP_IN, false); + data->arg_hard_regs = NULL; + if (CALL_P (insn)) + { +@@ -1120,10 +1100,6 @@ lra_set_insn_recog_data (rtx_insn *insn) + arg_hard_regs[n_hard_regs++] + = regno + i + (use_p ? 0 : FIRST_PSEUDO_REGISTER); + } +- else if (GET_CODE (XEXP (link, 0)) == CLOBBER_HIGH) +- /* We could support CLOBBER_HIGH and treat it in the same way as +- HARD_REGNO_CALL_PART_CLOBBERED, but no port needs that yet. */ +- gcc_unreachable (); + + if (n_hard_regs != 0) + { +@@ -1332,7 +1308,6 @@ initialize_lra_reg_info_element (int i) + lra_reg_info[i].no_stack_p = false; + #endif + CLEAR_HARD_REG_SET (lra_reg_info[i].conflict_hard_regs); +- CLEAR_HARD_REG_SET (lra_reg_info[i].actual_call_used_reg_set); + lra_reg_info[i].preferred_hard_regno1 = -1; + lra_reg_info[i].preferred_hard_regno2 = -1; + lra_reg_info[i].preferred_hard_regno_profit1 = 0; +@@ -1345,7 +1320,6 @@ initialize_lra_reg_info_element (int i) + lra_reg_info[i].val = get_new_reg_value (); + lra_reg_info[i].offset = 0; + lra_reg_info[i].copies = NULL; +- lra_reg_info[i].call_insn = NULL; + } + + /* Initialize common reg info and copies. */ +@@ -1449,15 +1423,13 @@ lra_get_copy (int n) + /* This page contains code dealing with info about registers in + insns. */ + +-/* Process X of INSN recursively and add info (operand type is +- given by TYPE, flag of that it is early clobber is EARLY_CLOBBER) +- about registers in X to the insn DATA. If X can be early clobbered, +- alternatives in which it can be early clobbered are given by +- EARLY_CLOBBER_ALTS. */ ++/* Process X of INSN recursively and add info (operand type is given ++ by TYPE) about registers in X to the insn DATA. If X can be early ++ clobbered, alternatives in which it can be early clobbered are given ++ by EARLY_CLOBBER_ALTS. 
*/ + static void + add_regs_to_insn_regno_info (lra_insn_recog_data_t data, rtx x, +- rtx_insn *insn, +- enum op_type type, bool early_clobber, ++ rtx_insn *insn, enum op_type type, + alternative_mask early_clobber_alts) + { + int i, j, regno; +@@ -1487,8 +1459,7 @@ add_regs_to_insn_regno_info (lra_insn_recog_data_t data, rtx x, + if (bitmap_set_bit (&lra_reg_info[regno].insn_bitmap, INSN_UID (insn))) + { + data->regs = new_insn_reg (data->insn, regno, type, mode, subreg_p, +- early_clobber, early_clobber_alts, +- data->regs, false); ++ early_clobber_alts, data->regs); + return; + } + else +@@ -1500,15 +1471,12 @@ add_regs_to_insn_regno_info (lra_insn_recog_data_t data, rtx x, + /* The info cannot be integrated into the found + structure. */ + data->regs = new_insn_reg (data->insn, regno, type, mode, +- subreg_p, early_clobber, +- early_clobber_alts, data->regs, +- false); ++ subreg_p, early_clobber_alts, ++ data->regs); + else + { + if (curr->type != type) + curr->type = OP_INOUT; +- if (curr->early_clobber != early_clobber) +- curr->early_clobber = true; + curr->early_clobber_alts |= early_clobber_alts; + } + return; +@@ -1520,23 +1488,21 @@ add_regs_to_insn_regno_info (lra_insn_recog_data_t data, rtx x, + switch (code) + { + case SET: +- add_regs_to_insn_regno_info (data, SET_DEST (x), insn, OP_OUT, false, 0); +- add_regs_to_insn_regno_info (data, SET_SRC (x), insn, OP_IN, false, 0); ++ add_regs_to_insn_regno_info (data, SET_DEST (x), insn, OP_OUT, 0); ++ add_regs_to_insn_regno_info (data, SET_SRC (x), insn, OP_IN, 0); + break; + case CLOBBER: + /* We treat clobber of non-operand hard registers as early + clobber. */ + add_regs_to_insn_regno_info (data, XEXP (x, 0), insn, OP_OUT, +- true, ALL_ALTERNATIVES); ++ ALL_ALTERNATIVES); + break; +- case CLOBBER_HIGH: +- gcc_unreachable (); + case PRE_INC: case PRE_DEC: case POST_INC: case POST_DEC: +- add_regs_to_insn_regno_info (data, XEXP (x, 0), insn, OP_INOUT, false, 0); ++ add_regs_to_insn_regno_info (data, XEXP (x, 0), insn, OP_INOUT, 0); + break; + case PRE_MODIFY: case POST_MODIFY: +- add_regs_to_insn_regno_info (data, XEXP (x, 0), insn, OP_INOUT, false, 0); +- add_regs_to_insn_regno_info (data, XEXP (x, 1), insn, OP_IN, false, 0); ++ add_regs_to_insn_regno_info (data, XEXP (x, 0), insn, OP_INOUT, 0); ++ add_regs_to_insn_regno_info (data, XEXP (x, 1), insn, OP_IN, 0); + break; + default: + if ((code != PARALLEL && code != EXPR_LIST) || type != OP_OUT) +@@ -1557,12 +1523,12 @@ add_regs_to_insn_regno_info (lra_insn_recog_data_t data, rtx x, + for (i = GET_RTX_LENGTH (code) - 1; i >= 0; i--) + { + if (fmt[i] == 'e') +- add_regs_to_insn_regno_info (data, XEXP (x, i), insn, type, false, 0); ++ add_regs_to_insn_regno_info (data, XEXP (x, i), insn, type, 0); + else if (fmt[i] == 'E') + { + for (j = XVECLEN (x, i) - 1; j >= 0; j--) + add_regs_to_insn_regno_info (data, XVECEXP (x, i, j), insn, +- type, false, 0); ++ type, 0); + } + } + } +@@ -1652,11 +1618,10 @@ lra_update_insn_regno_info (rtx_insn *insn) + for (i = static_data->n_operands - 1; i >= 0; i--) + add_regs_to_insn_regno_info (data, *data->operand_loc[i], insn, + static_data->operand[i].type, +- static_data->operand[i].early_clobber, + static_data->operand[i].early_clobber_alts); + if ((code = GET_CODE (PATTERN (insn))) == CLOBBER || code == USE) + add_regs_to_insn_regno_info (data, XEXP (PATTERN (insn), 0), insn, +- code == USE ? OP_IN : OP_OUT, false, 0); ++ code == USE ? 
OP_IN : OP_OUT, 0); + if (CALL_P (insn)) + /* On some targets call insns can refer to pseudos in memory in + CALL_INSN_FUNCTION_USAGE list. Process them in order to +@@ -1667,13 +1632,10 @@ lra_update_insn_regno_info (rtx_insn *insn) + link = XEXP (link, 1)) + { + code = GET_CODE (XEXP (link, 0)); +- /* We could support CLOBBER_HIGH and treat it in the same way as +- HARD_REGNO_CALL_PART_CLOBBERED, but no port needs that yet. */ +- gcc_assert (code != CLOBBER_HIGH); + if ((code == USE || code == CLOBBER) + && MEM_P (XEXP (XEXP (link, 0), 0))) + add_regs_to_insn_regno_info (data, XEXP (XEXP (link, 0), 0), insn, +- code == USE ? OP_IN : OP_OUT, false, 0); ++ code == USE ? OP_IN : OP_OUT, 0); + } + if (NONDEBUG_INSN_P (insn)) + setup_insn_reg_info (data, freq); +@@ -2400,7 +2362,7 @@ lra (FILE *f) + need it. */ + emit_note (NOTE_INSN_DELETED); + +- COPY_HARD_REG_SET (lra_no_alloc_regs, ira_no_alloc_regs); ++ lra_no_alloc_regs = ira_no_alloc_regs; + + init_reg_info (); + expand_reg_info (); +@@ -2436,7 +2398,9 @@ lra (FILE *f) + + if (crtl->saves_all_registers) + for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) +- if (! call_used_regs[i] && ! fixed_regs[i] && ! LOCAL_REGNO (i)) ++ if (!crtl->abi->clobbers_full_reg_p (i) ++ && !fixed_regs[i] ++ && !LOCAL_REGNO (i)) + df_set_regs_ever_live (i, true); + + /* We don't DF from now and avoid its using because it is to +@@ -2494,19 +2458,7 @@ lra (FILE *f) + } + /* Do inheritance only for regular algorithms. */ + if (! lra_simple_p) +- { +- if (flag_ipa_ra) +- { +- if (live_p) +- lra_clear_live_ranges (); +- /* As a side-effect of lra_create_live_ranges, we calculate +- actual_call_used_reg_set, which is needed during +- lra_inheritance. */ +- lra_create_live_ranges (true, true); +- live_p = true; +- } +- lra_inheritance (); +- } ++ lra_inheritance (); + if (live_p) + lra_clear_live_ranges (); + bool fails_p; +diff --git a/gcc/lto-streamer-out.c b/gcc/lto-streamer-out.c +index c1b160237..f47ac5b76 100644 +--- a/gcc/lto-streamer-out.c ++++ b/gcc/lto-streamer-out.c +@@ -1122,12 +1122,12 @@ hash_tree (struct streamer_tree_cache_d *cache, hash_map *map, + hstate.add_int (DECL_BUILT_IN_CLASS (t)); + hstate.add_flag (DECL_STATIC_CONSTRUCTOR (t)); + hstate.add_flag (DECL_STATIC_DESTRUCTOR (t)); ++ hstate.add_flag (FUNCTION_DECL_DECL_TYPE (t)); + hstate.add_flag (DECL_UNINLINABLE (t)); + hstate.add_flag (DECL_POSSIBLY_INLINED (t)); + hstate.add_flag (DECL_IS_NOVOPS (t)); + hstate.add_flag (DECL_IS_RETURNS_TWICE (t)); + hstate.add_flag (DECL_IS_MALLOC (t)); +- hstate.add_flag (DECL_IS_OPERATOR_NEW (t)); + hstate.add_flag (DECL_DECLARED_INLINE_P (t)); + hstate.add_flag (DECL_STATIC_CHAIN (t)); + hstate.add_flag (DECL_NO_INLINE_WARNING_P (t)); +@@ -1138,7 +1138,7 @@ hash_tree (struct streamer_tree_cache_d *cache, hash_map *map, + hstate.add_flag (DECL_LOOPING_CONST_OR_PURE_P (t)); + hstate.commit_flag (); + if (DECL_BUILT_IN_CLASS (t) != NOT_BUILT_IN) +- hstate.add_int (DECL_FUNCTION_CODE (t)); ++ hstate.add_int (DECL_UNCHECKED_FUNCTION_CODE (t)); + } + + if (CODE_CONTAINS_STRUCT (code, TS_TYPE_COMMON)) +diff --git a/gcc/lto/Make-lang.in b/gcc/lto/Make-lang.in +index 1b856d6d4..b7ed96eac 100644 +--- a/gcc/lto/Make-lang.in ++++ b/gcc/lto/Make-lang.in +@@ -22,7 +22,7 @@ + # The name of the LTO compiler. + LTO_EXE = lto1$(exeext) + # The LTO-specific object files inclued in $(LTO_EXE). 
+-LTO_OBJS = lto/lto-lang.o lto/lto.o lto/lto-object.o attribs.o lto/lto-partition.o lto/lto-symtab.o ++LTO_OBJS = lto/lto-lang.o lto/lto.o lto/lto-object.o attribs.o lto/lto-partition.o lto/lto-symtab.o lto/lto-common.o + lto_OBJS = $(LTO_OBJS) + + # this is only useful in a LTO bootstrap, but this does not work right +diff --git a/gcc/lto/config-lang.in b/gcc/lto/config-lang.in +index de9712504..07214365f 100644 +--- a/gcc/lto/config-lang.in ++++ b/gcc/lto/config-lang.in +@@ -20,7 +20,7 @@ + language="lto" + compilers="lto1\$(exeext)" + +-gtfiles="\$(srcdir)/lto/lto-tree.h \$(srcdir)/lto/lto-lang.c \$(srcdir)/lto/lto.c \$(srcdir)/lto/lto.h" ++gtfiles="\$(srcdir)/lto/lto-tree.h \$(srcdir)/lto/lto-lang.c \$(srcdir)/lto/lto.c \$(srcdir)/lto/lto.h \$(srcdir)/lto/lto-common.h \$(srcdir)/lto/lto-common.c" + + # LTO is a special front end. From a user's perspective it is not + # really a language, but a middle end feature. However, the GIMPLE +diff --git a/gcc/lto/lto-common.c b/gcc/lto/lto-common.c +new file mode 100644 +index 000000000..daf7f7b47 +--- /dev/null ++++ b/gcc/lto/lto-common.c +@@ -0,0 +1,2837 @@ ++/* Top-level LTO routines. ++ Copyright (C) 2009-2018 Free Software Foundation, Inc. ++ Contributed by CodeSourcery, Inc. ++ ++This file is part of GCC. ++ ++GCC is free software; you can redistribute it and/or modify it under ++the terms of the GNU General Public License as published by the Free ++Software Foundation; either version 3, or (at your option) any later ++version. ++ ++GCC is distributed in the hope that it will be useful, but WITHOUT ANY ++WARRANTY; without even the implied warranty of MERCHANTABILITY or ++FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received a copy of the GNU General Public License ++along with GCC; see the file COPYING3. If not see ++. */ ++ ++#include "config.h" ++#include "system.h" ++#include "coretypes.h" ++#include "tm.h" ++#include "function.h" ++#include "bitmap.h" ++#include "basic-block.h" ++#include "tree.h" ++#include "gimple.h" ++#include "cfghooks.h" ++#include "alloc-pool.h" ++#include "tree-pass.h" ++#include "tree-streamer.h" ++#include "cgraph.h" ++#include "opts.h" ++#include "toplev.h" ++#include "stor-layout.h" ++#include "symbol-summary.h" ++#include "tree-vrp.h" ++#include "ipa-prop.h" ++#include "common.h" ++#include "debug.h" ++#include "lto.h" ++#include "lto-section-names.h" ++#include "splay-tree.h" ++#include "lto-partition.h" ++#include "context.h" ++#include "pass_manager.h" ++#include "ipa-fnsummary.h" ++#include "params.h" ++#include "ipa-utils.h" ++#include "gomp-constants.h" ++#include "lto-symtab.h" ++#include "stringpool.h" ++#include "fold-const.h" ++#include "attribs.h" ++#include "builtins.h" ++#include "lto-common.h" ++ ++GTY(()) tree first_personality_decl; ++ ++GTY(()) const unsigned char *lto_mode_identity_table; ++ ++/* Returns a hash code for P. */ ++ ++static hashval_t ++hash_name (const void *p) ++{ ++ const struct lto_section_slot *ds = (const struct lto_section_slot *) p; ++ return (hashval_t) htab_hash_string (ds->name); ++} ++ ++ ++/* Returns nonzero if P1 and P2 are equal. 
*/ ++ ++static int ++eq_name (const void *p1, const void *p2) ++{ ++ const struct lto_section_slot *s1 = ++ (const struct lto_section_slot *) p1; ++ const struct lto_section_slot *s2 = ++ (const struct lto_section_slot *) p2; ++ ++ return strcmp (s1->name, s2->name) == 0; ++} ++ ++/* Free lto_section_slot */ ++ ++static void ++free_with_string (void *arg) ++{ ++ struct lto_section_slot *s = (struct lto_section_slot *)arg; ++ ++ free (CONST_CAST (char *, s->name)); ++ free (arg); ++} ++ ++/* Create section hash table */ ++ ++htab_t ++lto_obj_create_section_hash_table (void) ++{ ++ return htab_create (37, hash_name, eq_name, free_with_string); ++} ++ ++/* Delete an allocated integer KEY in the splay tree. */ ++ ++static void ++lto_splay_tree_delete_id (splay_tree_key key) ++{ ++ free ((void *) key); ++} ++ ++/* Compare splay tree node ids A and B. */ ++ ++static int ++lto_splay_tree_compare_ids (splay_tree_key a, splay_tree_key b) ++{ ++ unsigned HOST_WIDE_INT ai; ++ unsigned HOST_WIDE_INT bi; ++ ++ ai = *(unsigned HOST_WIDE_INT *) a; ++ bi = *(unsigned HOST_WIDE_INT *) b; ++ ++ if (ai < bi) ++ return -1; ++ else if (ai > bi) ++ return 1; ++ return 0; ++} ++ ++/* Look up splay tree node by ID in splay tree T. */ ++ ++static splay_tree_node ++lto_splay_tree_lookup (splay_tree t, unsigned HOST_WIDE_INT id) ++{ ++ return splay_tree_lookup (t, (splay_tree_key) &id); ++} ++ ++/* Check if KEY has ID. */ ++ ++static bool ++lto_splay_tree_id_equal_p (splay_tree_key key, unsigned HOST_WIDE_INT id) ++{ ++ return *(unsigned HOST_WIDE_INT *) key == id; ++} ++ ++/* Insert a splay tree node into tree T with ID as key and FILE_DATA as value. ++ The ID is allocated separately because we need HOST_WIDE_INTs which may ++ be wider than a splay_tree_key. */ ++ ++static void ++lto_splay_tree_insert (splay_tree t, unsigned HOST_WIDE_INT id, ++ struct lto_file_decl_data *file_data) ++{ ++ unsigned HOST_WIDE_INT *idp = XCNEW (unsigned HOST_WIDE_INT); ++ *idp = id; ++ splay_tree_insert (t, (splay_tree_key) idp, (splay_tree_value) file_data); ++} ++ ++/* Create a splay tree. */ ++ ++static splay_tree ++lto_splay_tree_new (void) ++{ ++ return splay_tree_new (lto_splay_tree_compare_ids, ++ lto_splay_tree_delete_id, ++ NULL); ++} ++ ++/* Decode the content of memory pointed to by DATA in the in decl ++ state object STATE. DATA_IN points to a data_in structure for ++ decoding. Return the address after the decoded object in the ++ input. */ ++ ++static const uint32_t * ++lto_read_in_decl_state (struct data_in *data_in, const uint32_t *data, ++ struct lto_in_decl_state *state) ++{ ++ uint32_t ix; ++ tree decl; ++ uint32_t i, j; ++ ++ ix = *data++; ++ state->compressed = ix & 1; ++ ix /= 2; ++ decl = streamer_tree_cache_get_tree (data_in->reader_cache, ix); ++ if (!VAR_OR_FUNCTION_DECL_P (decl)) ++ { ++ gcc_assert (decl == void_type_node); ++ decl = NULL_TREE; ++ } ++ state->fn_decl = decl; ++ ++ for (i = 0; i < LTO_N_DECL_STREAMS; i++) ++ { ++ uint32_t size = *data++; ++ vec *decls = NULL; ++ vec_alloc (decls, size); ++ ++ for (j = 0; j < size; j++) ++ vec_safe_push (decls, ++ streamer_tree_cache_get_tree (data_in->reader_cache, ++ data[j])); ++ ++ state->streams[i] = decls; ++ data += size; ++ } ++ ++ return data; ++} ++ ++ ++/* Global canonical type table. 
*/ ++static htab_t gimple_canonical_types; ++static hash_map *canonical_type_hash_cache; ++static unsigned long num_canonical_type_hash_entries; ++static unsigned long num_canonical_type_hash_queries; ++ ++static void iterative_hash_canonical_type (tree type, inchash::hash &hstate); ++static hashval_t gimple_canonical_type_hash (const void *p); ++static void gimple_register_canonical_type_1 (tree t, hashval_t hash); ++ ++/* Returning a hash value for gimple type TYPE. ++ ++ The hash value returned is equal for types considered compatible ++ by gimple_canonical_types_compatible_p. */ ++ ++static hashval_t ++hash_canonical_type (tree type) ++{ ++ inchash::hash hstate; ++ enum tree_code code; ++ ++ /* We compute alias sets only for types that needs them. ++ Be sure we do not recurse to something else as we cannot hash incomplete ++ types in a way they would have same hash value as compatible complete ++ types. */ ++ gcc_checking_assert (type_with_alias_set_p (type)); ++ ++ /* Combine a few common features of types so that types are grouped into ++ smaller sets; when searching for existing matching types to merge, ++ only existing types having the same features as the new type will be ++ checked. */ ++ code = tree_code_for_canonical_type_merging (TREE_CODE (type)); ++ hstate.add_int (code); ++ hstate.add_int (TYPE_MODE (type)); ++ ++ /* Incorporate common features of numerical types. */ ++ if (INTEGRAL_TYPE_P (type) ++ || SCALAR_FLOAT_TYPE_P (type) ++ || FIXED_POINT_TYPE_P (type) ++ || TREE_CODE (type) == OFFSET_TYPE ++ || POINTER_TYPE_P (type)) ++ { ++ hstate.add_int (TYPE_PRECISION (type)); ++ if (!type_with_interoperable_signedness (type)) ++ hstate.add_int (TYPE_UNSIGNED (type)); ++ } ++ ++ if (VECTOR_TYPE_P (type)) ++ { ++ hstate.add_poly_int (TYPE_VECTOR_SUBPARTS (type)); ++ hstate.add_int (TYPE_UNSIGNED (type)); ++ } ++ ++ if (TREE_CODE (type) == COMPLEX_TYPE) ++ hstate.add_int (TYPE_UNSIGNED (type)); ++ ++ /* Fortran's C_SIGNED_CHAR is !TYPE_STRING_FLAG but needs to be ++ interoperable with "signed char". Unless all frontends are revisited to ++ agree on these types, we must ignore the flag completely. */ ++ ++ /* Fortran standard define C_PTR type that is compatible with every ++ C pointer. For this reason we need to glob all pointers into one. ++ Still pointers in different address spaces are not compatible. */ ++ if (POINTER_TYPE_P (type)) ++ hstate.add_int (TYPE_ADDR_SPACE (TREE_TYPE (type))); ++ ++ /* For array types hash the domain bounds and the string flag. */ ++ if (TREE_CODE (type) == ARRAY_TYPE && TYPE_DOMAIN (type)) ++ { ++ hstate.add_int (TYPE_STRING_FLAG (type)); ++ /* OMP lowering can introduce error_mark_node in place of ++ random local decls in types. */ ++ if (TYPE_MIN_VALUE (TYPE_DOMAIN (type)) != error_mark_node) ++ inchash::add_expr (TYPE_MIN_VALUE (TYPE_DOMAIN (type)), hstate); ++ if (TYPE_MAX_VALUE (TYPE_DOMAIN (type)) != error_mark_node) ++ inchash::add_expr (TYPE_MAX_VALUE (TYPE_DOMAIN (type)), hstate); ++ } ++ ++ /* Recurse for aggregates with a single element type. */ ++ if (TREE_CODE (type) == ARRAY_TYPE ++ || TREE_CODE (type) == COMPLEX_TYPE ++ || TREE_CODE (type) == VECTOR_TYPE) ++ iterative_hash_canonical_type (TREE_TYPE (type), hstate); ++ ++ /* Incorporate function return and argument types. 
*/ ++ if (TREE_CODE (type) == FUNCTION_TYPE || TREE_CODE (type) == METHOD_TYPE) ++ { ++ unsigned na; ++ tree p; ++ ++ iterative_hash_canonical_type (TREE_TYPE (type), hstate); ++ ++ for (p = TYPE_ARG_TYPES (type), na = 0; p; p = TREE_CHAIN (p)) ++ { ++ iterative_hash_canonical_type (TREE_VALUE (p), hstate); ++ na++; ++ } ++ ++ hstate.add_int (na); ++ } ++ ++ if (RECORD_OR_UNION_TYPE_P (type)) ++ { ++ unsigned nf; ++ tree f; ++ ++ for (f = TYPE_FIELDS (type), nf = 0; f; f = TREE_CHAIN (f)) ++ if (TREE_CODE (f) == FIELD_DECL ++ && (! DECL_SIZE (f) ++ || ! integer_zerop (DECL_SIZE (f)))) ++ { ++ iterative_hash_canonical_type (TREE_TYPE (f), hstate); ++ nf++; ++ } ++ ++ hstate.add_int (nf); ++ } ++ ++ return hstate.end(); ++} ++ ++/* Returning a hash value for gimple type TYPE combined with VAL. */ ++ ++static void ++iterative_hash_canonical_type (tree type, inchash::hash &hstate) ++{ ++ hashval_t v; ++ ++ /* All type variants have same TYPE_CANONICAL. */ ++ type = TYPE_MAIN_VARIANT (type); ++ ++ if (!canonical_type_used_p (type)) ++ v = hash_canonical_type (type); ++ /* An already processed type. */ ++ else if (TYPE_CANONICAL (type)) ++ { ++ type = TYPE_CANONICAL (type); ++ v = gimple_canonical_type_hash (type); ++ } ++ else ++ { ++ /* Canonical types should not be able to form SCCs by design, this ++ recursion is just because we do not register canonical types in ++ optimal order. To avoid quadratic behavior also register the ++ type here. */ ++ v = hash_canonical_type (type); ++ gimple_register_canonical_type_1 (type, v); ++ } ++ hstate.add_int (v); ++} ++ ++/* Returns the hash for a canonical type P. */ ++ ++static hashval_t ++gimple_canonical_type_hash (const void *p) ++{ ++ num_canonical_type_hash_queries++; ++ hashval_t *slot = canonical_type_hash_cache->get ((const_tree) p); ++ gcc_assert (slot != NULL); ++ return *slot; ++} ++ ++ ++ ++/* Returns nonzero if P1 and P2 are equal. */ ++ ++static int ++gimple_canonical_type_eq (const void *p1, const void *p2) ++{ ++ const_tree t1 = (const_tree) p1; ++ const_tree t2 = (const_tree) p2; ++ return gimple_canonical_types_compatible_p (CONST_CAST_TREE (t1), ++ CONST_CAST_TREE (t2)); ++} ++ ++/* Main worker for gimple_register_canonical_type. */ ++ ++static void ++gimple_register_canonical_type_1 (tree t, hashval_t hash) ++{ ++ void **slot; ++ ++ gcc_checking_assert (TYPE_P (t) && !TYPE_CANONICAL (t) ++ && type_with_alias_set_p (t) ++ && canonical_type_used_p (t)); ++ ++ slot = htab_find_slot_with_hash (gimple_canonical_types, t, hash, INSERT); ++ if (*slot) ++ { ++ tree new_type = (tree)(*slot); ++ gcc_checking_assert (new_type != t); ++ TYPE_CANONICAL (t) = new_type; ++ } ++ else ++ { ++ TYPE_CANONICAL (t) = t; ++ *slot = (void *) t; ++ /* Cache the just computed hash value. */ ++ num_canonical_type_hash_entries++; ++ bool existed_p = canonical_type_hash_cache->put (t, hash); ++ gcc_assert (!existed_p); ++ } ++} ++ ++/* Register type T in the global type table gimple_types and set ++ TYPE_CANONICAL of T accordingly. ++ This is used by LTO to merge structurally equivalent types for ++ type-based aliasing purposes across different TUs and languages. ++ ++ ??? This merging does not exactly match how the tree.c middle-end ++ functions will assign TYPE_CANONICAL when new types are created ++ during optimization (which at least happens for pointer and array ++ types). 
*/ ++ ++static void ++gimple_register_canonical_type (tree t) ++{ ++ if (TYPE_CANONICAL (t) || !type_with_alias_set_p (t) ++ || !canonical_type_used_p (t)) ++ return; ++ ++ /* Canonical types are same among all complete variants. */ ++ if (TYPE_CANONICAL (TYPE_MAIN_VARIANT (t))) ++ TYPE_CANONICAL (t) = TYPE_CANONICAL (TYPE_MAIN_VARIANT (t)); ++ else ++ { ++ gimple_register_canonical_type_1 (TYPE_MAIN_VARIANT (t), ++ hash_canonical_type (TYPE_MAIN_VARIANT (t))); ++ TYPE_CANONICAL (t) = TYPE_CANONICAL (TYPE_MAIN_VARIANT (t)); ++ } ++} ++ ++/* Re-compute TYPE_CANONICAL for NODE and related types. */ ++ ++static void ++lto_register_canonical_types (tree node, bool first_p) ++{ ++ if (!node ++ || !TYPE_P (node)) ++ return; ++ ++ if (first_p) ++ TYPE_CANONICAL (node) = NULL_TREE; ++ ++ if (POINTER_TYPE_P (node) ++ || TREE_CODE (node) == COMPLEX_TYPE ++ || TREE_CODE (node) == ARRAY_TYPE) ++ lto_register_canonical_types (TREE_TYPE (node), first_p); ++ ++ if (!first_p) ++ gimple_register_canonical_type (node); ++} ++ ++ ++/* Remember trees that contains references to declarations. */ ++vec *tree_with_vars; ++ ++#define CHECK_VAR(tt) \ ++ do \ ++ { \ ++ if ((tt) && VAR_OR_FUNCTION_DECL_P (tt) \ ++ && (TREE_PUBLIC (tt) || DECL_EXTERNAL (tt))) \ ++ return true; \ ++ } while (0) ++ ++#define CHECK_NO_VAR(tt) \ ++ gcc_checking_assert (!(tt) || !VAR_OR_FUNCTION_DECL_P (tt)) ++ ++/* Check presence of pointers to decls in fields of a tree_typed T. */ ++ ++static inline bool ++mentions_vars_p_typed (tree t) ++{ ++ CHECK_NO_VAR (TREE_TYPE (t)); ++ return false; ++} ++ ++/* Check presence of pointers to decls in fields of a tree_common T. */ ++ ++static inline bool ++mentions_vars_p_common (tree t) ++{ ++ if (mentions_vars_p_typed (t)) ++ return true; ++ CHECK_NO_VAR (TREE_CHAIN (t)); ++ return false; ++} ++ ++/* Check presence of pointers to decls in fields of a decl_minimal T. */ ++ ++static inline bool ++mentions_vars_p_decl_minimal (tree t) ++{ ++ if (mentions_vars_p_common (t)) ++ return true; ++ CHECK_NO_VAR (DECL_NAME (t)); ++ CHECK_VAR (DECL_CONTEXT (t)); ++ return false; ++} ++ ++/* Check presence of pointers to decls in fields of a decl_common T. */ ++ ++static inline bool ++mentions_vars_p_decl_common (tree t) ++{ ++ if (mentions_vars_p_decl_minimal (t)) ++ return true; ++ CHECK_VAR (DECL_SIZE (t)); ++ CHECK_VAR (DECL_SIZE_UNIT (t)); ++ CHECK_VAR (DECL_INITIAL (t)); ++ CHECK_NO_VAR (DECL_ATTRIBUTES (t)); ++ CHECK_VAR (DECL_ABSTRACT_ORIGIN (t)); ++ return false; ++} ++ ++/* Check presence of pointers to decls in fields of a decl_with_vis T. */ ++ ++static inline bool ++mentions_vars_p_decl_with_vis (tree t) ++{ ++ if (mentions_vars_p_decl_common (t)) ++ return true; ++ ++ /* Accessor macro has side-effects, use field-name here. */ ++ CHECK_NO_VAR (DECL_ASSEMBLER_NAME_RAW (t)); ++ return false; ++} ++ ++/* Check presence of pointers to decls in fields of a decl_non_common T. */ ++ ++static inline bool ++mentions_vars_p_decl_non_common (tree t) ++{ ++ if (mentions_vars_p_decl_with_vis (t)) ++ return true; ++ CHECK_NO_VAR (DECL_RESULT_FLD (t)); ++ return false; ++} ++ ++/* Check presence of pointers to decls in fields of a decl_non_common T. */ ++ ++static bool ++mentions_vars_p_function (tree t) ++{ ++ if (mentions_vars_p_decl_non_common (t)) ++ return true; ++ CHECK_NO_VAR (DECL_ARGUMENTS (t)); ++ CHECK_NO_VAR (DECL_VINDEX (t)); ++ CHECK_VAR (DECL_FUNCTION_PERSONALITY (t)); ++ return false; ++} ++ ++/* Check presence of pointers to decls in fields of a field_decl T. 
*/ ++ ++static bool ++mentions_vars_p_field_decl (tree t) ++{ ++ if (mentions_vars_p_decl_common (t)) ++ return true; ++ CHECK_VAR (DECL_FIELD_OFFSET (t)); ++ CHECK_NO_VAR (DECL_BIT_FIELD_TYPE (t)); ++ CHECK_NO_VAR (DECL_QUALIFIER (t)); ++ CHECK_NO_VAR (DECL_FIELD_BIT_OFFSET (t)); ++ CHECK_NO_VAR (DECL_FCONTEXT (t)); ++ return false; ++} ++ ++/* Check presence of pointers to decls in fields of a type T. */ ++ ++static bool ++mentions_vars_p_type (tree t) ++{ ++ if (mentions_vars_p_common (t)) ++ return true; ++ CHECK_NO_VAR (TYPE_CACHED_VALUES (t)); ++ CHECK_VAR (TYPE_SIZE (t)); ++ CHECK_VAR (TYPE_SIZE_UNIT (t)); ++ CHECK_NO_VAR (TYPE_ATTRIBUTES (t)); ++ CHECK_NO_VAR (TYPE_NAME (t)); ++ ++ CHECK_VAR (TYPE_MIN_VALUE_RAW (t)); ++ CHECK_VAR (TYPE_MAX_VALUE_RAW (t)); ++ ++ /* Accessor is for derived node types only. */ ++ CHECK_NO_VAR (TYPE_LANG_SLOT_1 (t)); ++ ++ CHECK_VAR (TYPE_CONTEXT (t)); ++ CHECK_NO_VAR (TYPE_CANONICAL (t)); ++ CHECK_NO_VAR (TYPE_MAIN_VARIANT (t)); ++ CHECK_NO_VAR (TYPE_NEXT_VARIANT (t)); ++ return false; ++} ++ ++/* Check presence of pointers to decls in fields of a BINFO T. */ ++ ++static bool ++mentions_vars_p_binfo (tree t) ++{ ++ unsigned HOST_WIDE_INT i, n; ++ ++ if (mentions_vars_p_common (t)) ++ return true; ++ CHECK_VAR (BINFO_VTABLE (t)); ++ CHECK_NO_VAR (BINFO_OFFSET (t)); ++ CHECK_NO_VAR (BINFO_VIRTUALS (t)); ++ CHECK_NO_VAR (BINFO_VPTR_FIELD (t)); ++ n = vec_safe_length (BINFO_BASE_ACCESSES (t)); ++ for (i = 0; i < n; i++) ++ CHECK_NO_VAR (BINFO_BASE_ACCESS (t, i)); ++ /* Do not walk BINFO_INHERITANCE_CHAIN, BINFO_SUBVTT_INDEX ++ and BINFO_VPTR_INDEX; these are used by C++ FE only. */ ++ n = BINFO_N_BASE_BINFOS (t); ++ for (i = 0; i < n; i++) ++ CHECK_NO_VAR (BINFO_BASE_BINFO (t, i)); ++ return false; ++} ++ ++/* Check presence of pointers to decls in fields of a CONSTRUCTOR T. */ ++ ++static bool ++mentions_vars_p_constructor (tree t) ++{ ++ unsigned HOST_WIDE_INT idx; ++ constructor_elt *ce; ++ ++ if (mentions_vars_p_typed (t)) ++ return true; ++ ++ for (idx = 0; vec_safe_iterate (CONSTRUCTOR_ELTS (t), idx, &ce); idx++) ++ { ++ CHECK_NO_VAR (ce->index); ++ CHECK_VAR (ce->value); ++ } ++ return false; ++} ++ ++/* Check presence of pointers to decls in fields of an expression tree T. */ ++ ++static bool ++mentions_vars_p_expr (tree t) ++{ ++ int i; ++ if (mentions_vars_p_typed (t)) ++ return true; ++ for (i = TREE_OPERAND_LENGTH (t) - 1; i >= 0; --i) ++ CHECK_VAR (TREE_OPERAND (t, i)); ++ return false; ++} ++ ++/* Check presence of pointers to decls in fields of an OMP_CLAUSE T. */ ++ ++static bool ++mentions_vars_p_omp_clause (tree t) ++{ ++ int i; ++ if (mentions_vars_p_common (t)) ++ return true; ++ for (i = omp_clause_num_ops[OMP_CLAUSE_CODE (t)] - 1; i >= 0; --i) ++ CHECK_VAR (OMP_CLAUSE_OPERAND (t, i)); ++ return false; ++} ++ ++/* Check presence of pointers to decls that needs later fixup in T. 
*/ ++ ++static bool ++mentions_vars_p (tree t) ++{ ++ switch (TREE_CODE (t)) ++ { ++ case IDENTIFIER_NODE: ++ break; ++ ++ case TREE_LIST: ++ CHECK_VAR (TREE_VALUE (t)); ++ CHECK_VAR (TREE_PURPOSE (t)); ++ CHECK_NO_VAR (TREE_CHAIN (t)); ++ break; ++ ++ case FIELD_DECL: ++ return mentions_vars_p_field_decl (t); ++ ++ case LABEL_DECL: ++ case CONST_DECL: ++ case PARM_DECL: ++ case RESULT_DECL: ++ case IMPORTED_DECL: ++ case NAMESPACE_DECL: ++ case NAMELIST_DECL: ++ return mentions_vars_p_decl_common (t); ++ ++ case VAR_DECL: ++ return mentions_vars_p_decl_with_vis (t); ++ ++ case TYPE_DECL: ++ return mentions_vars_p_decl_non_common (t); ++ ++ case FUNCTION_DECL: ++ return mentions_vars_p_function (t); ++ ++ case TREE_BINFO: ++ return mentions_vars_p_binfo (t); ++ ++ case PLACEHOLDER_EXPR: ++ return mentions_vars_p_common (t); ++ ++ case BLOCK: ++ case TRANSLATION_UNIT_DECL: ++ case OPTIMIZATION_NODE: ++ case TARGET_OPTION_NODE: ++ break; ++ ++ case CONSTRUCTOR: ++ return mentions_vars_p_constructor (t); ++ ++ case OMP_CLAUSE: ++ return mentions_vars_p_omp_clause (t); ++ ++ default: ++ if (TYPE_P (t)) ++ { ++ if (mentions_vars_p_type (t)) ++ return true; ++ } ++ else if (EXPR_P (t)) ++ { ++ if (mentions_vars_p_expr (t)) ++ return true; ++ } ++ else if (CONSTANT_CLASS_P (t)) ++ CHECK_NO_VAR (TREE_TYPE (t)); ++ else ++ gcc_unreachable (); ++ } ++ return false; ++} ++ ++ ++/* Return the resolution for the decl with index INDEX from DATA_IN. */ ++ ++static enum ld_plugin_symbol_resolution ++get_resolution (struct data_in *data_in, unsigned index) ++{ ++ if (data_in->globals_resolution.exists ()) ++ { ++ ld_plugin_symbol_resolution_t ret; ++ /* We can have references to not emitted functions in ++ DECL_FUNCTION_PERSONALITY at least. So we can and have ++ to indeed return LDPR_UNKNOWN in some cases. */ ++ if (data_in->globals_resolution.length () <= index) ++ return LDPR_UNKNOWN; ++ ret = data_in->globals_resolution[index]; ++ return ret; ++ } ++ else ++ /* Delay resolution finding until decl merging. */ ++ return LDPR_UNKNOWN; ++} ++ ++/* We need to record resolutions until symbol table is read. */ ++static void ++register_resolution (struct lto_file_decl_data *file_data, tree decl, ++ enum ld_plugin_symbol_resolution resolution) ++{ ++ bool existed; ++ if (resolution == LDPR_UNKNOWN) ++ return; ++ if (!file_data->resolution_map) ++ file_data->resolution_map ++ = new hash_map; ++ ld_plugin_symbol_resolution_t &res ++ = file_data->resolution_map->get_or_insert (decl, &existed); ++ if (!existed ++ || resolution == LDPR_PREVAILING_DEF_IRONLY ++ || resolution == LDPR_PREVAILING_DEF ++ || resolution == LDPR_PREVAILING_DEF_IRONLY_EXP) ++ res = resolution; ++} ++ ++/* Register DECL with the global symbol table and change its ++ name if necessary to avoid name clashes for static globals across ++ different files. */ ++ ++static void ++lto_register_var_decl_in_symtab (struct data_in *data_in, tree decl, ++ unsigned ix) ++{ ++ tree context; ++ ++ /* Variable has file scope, not local. */ ++ if (!TREE_PUBLIC (decl) ++ && !((context = decl_function_context (decl)) ++ && auto_var_in_fn_p (decl, context))) ++ rest_of_decl_compilation (decl, 1, 0); ++ ++ /* If this variable has already been declared, queue the ++ declaration for merging. */ ++ if (TREE_PUBLIC (decl)) ++ register_resolution (data_in->file_data, ++ decl, get_resolution (data_in, ix)); ++} ++ ++ ++/* Register DECL with the global symbol table and change its ++ name if necessary to avoid name clashes for static globals across ++ different files. 
DATA_IN contains descriptors and tables for the ++ file being read. */ ++ ++static void ++lto_register_function_decl_in_symtab (struct data_in *data_in, tree decl, ++ unsigned ix) ++{ ++ /* If this variable has already been declared, queue the ++ declaration for merging. */ ++ if (TREE_PUBLIC (decl) && !DECL_ABSTRACT_P (decl)) ++ register_resolution (data_in->file_data, ++ decl, get_resolution (data_in, ix)); ++} ++ ++/* Check if T is a decl and needs register its resolution info. */ ++ ++static void ++lto_maybe_register_decl (struct data_in *data_in, tree t, unsigned ix) ++{ ++ if (TREE_CODE (t) == VAR_DECL) ++ lto_register_var_decl_in_symtab (data_in, t, ix); ++ else if (TREE_CODE (t) == FUNCTION_DECL ++ && !fndecl_built_in_p (t)) ++ lto_register_function_decl_in_symtab (data_in, t, ix); ++} ++ ++ ++/* For the type T re-materialize it in the type variant list and ++ the pointer/reference-to chains. */ ++ ++static void ++lto_fixup_prevailing_type (tree t) ++{ ++ /* The following re-creates proper variant lists while fixing up ++ the variant leaders. We do not stream TYPE_NEXT_VARIANT so the ++ variant list state before fixup is broken. */ ++ ++ /* If we are not our own variant leader link us into our new leaders ++ variant list. */ ++ if (TYPE_MAIN_VARIANT (t) != t) ++ { ++ tree mv = TYPE_MAIN_VARIANT (t); ++ TYPE_NEXT_VARIANT (t) = TYPE_NEXT_VARIANT (mv); ++ TYPE_NEXT_VARIANT (mv) = t; ++ } ++ ++ /* The following reconstructs the pointer chains ++ of the new pointed-to type if we are a main variant. We do ++ not stream those so they are broken before fixup. */ ++ if (TREE_CODE (t) == POINTER_TYPE ++ && TYPE_MAIN_VARIANT (t) == t) ++ { ++ TYPE_NEXT_PTR_TO (t) = TYPE_POINTER_TO (TREE_TYPE (t)); ++ TYPE_POINTER_TO (TREE_TYPE (t)) = t; ++ } ++ else if (TREE_CODE (t) == REFERENCE_TYPE ++ && TYPE_MAIN_VARIANT (t) == t) ++ { ++ TYPE_NEXT_REF_TO (t) = TYPE_REFERENCE_TO (TREE_TYPE (t)); ++ TYPE_REFERENCE_TO (TREE_TYPE (t)) = t; ++ } ++} ++ ++ ++/* We keep prevailing tree SCCs in a hashtable with manual collision ++ handling (in case all hashes compare the same) and keep the colliding ++ entries in the tree_scc->next chain. */ ++ ++struct tree_scc ++{ ++ tree_scc *next; ++ /* Hash of the whole SCC. */ ++ hashval_t hash; ++ /* Number of trees in the SCC. */ ++ unsigned len; ++ /* Number of possible entries into the SCC (tree nodes [0..entry_len-1] ++ which share the same individual tree hash). */ ++ unsigned entry_len; ++ /* The members of the SCC. ++ We only need to remember the first entry node candidate for prevailing ++ SCCs (but of course have access to all entries for SCCs we are ++ processing). ++ ??? For prevailing SCCs we really only need hash and the first ++ entry candidate, but that's too awkward to implement. 
*/ ++ tree entries[1]; ++}; ++ ++struct tree_scc_hasher : nofree_ptr_hash ++{ ++ static inline hashval_t hash (const tree_scc *); ++ static inline bool equal (const tree_scc *, const tree_scc *); ++}; ++ ++hashval_t ++tree_scc_hasher::hash (const tree_scc *scc) ++{ ++ return scc->hash; ++} ++ ++bool ++tree_scc_hasher::equal (const tree_scc *scc1, const tree_scc *scc2) ++{ ++ if (scc1->hash != scc2->hash ++ || scc1->len != scc2->len ++ || scc1->entry_len != scc2->entry_len) ++ return false; ++ return true; ++} ++ ++static hash_table *tree_scc_hash; ++static struct obstack tree_scc_hash_obstack; ++ ++static unsigned long num_merged_types; ++static unsigned long num_prevailing_types; ++static unsigned long num_type_scc_trees; ++static unsigned long total_scc_size; ++static unsigned long num_sccs_read; ++static unsigned long total_scc_size_merged; ++static unsigned long num_sccs_merged; ++static unsigned long num_scc_compares; ++static unsigned long num_scc_compare_collisions; ++ ++ ++/* Compare the two entries T1 and T2 of two SCCs that are possibly equal, ++ recursing through in-SCC tree edges. Returns true if the SCCs entered ++ through T1 and T2 are equal and fills in *MAP with the pairs of ++ SCC entries we visited, starting with (*MAP)[0] = T1 and (*MAP)[1] = T2. */ ++ ++static bool ++compare_tree_sccs_1 (tree t1, tree t2, tree **map) ++{ ++ enum tree_code code; ++ ++ /* Mark already visited nodes. */ ++ TREE_ASM_WRITTEN (t2) = 1; ++ ++ /* Push the pair onto map. */ ++ (*map)[0] = t1; ++ (*map)[1] = t2; ++ *map = *map + 2; ++ ++ /* Compare value-fields. */ ++#define compare_values(X) \ ++ do { \ ++ if (X(t1) != X(t2)) \ ++ return false; \ ++ } while (0) ++ ++ compare_values (TREE_CODE); ++ code = TREE_CODE (t1); ++ ++ if (!TYPE_P (t1)) ++ { ++ compare_values (TREE_SIDE_EFFECTS); ++ compare_values (TREE_CONSTANT); ++ compare_values (TREE_READONLY); ++ compare_values (TREE_PUBLIC); ++ } ++ compare_values (TREE_ADDRESSABLE); ++ compare_values (TREE_THIS_VOLATILE); ++ if (DECL_P (t1)) ++ compare_values (DECL_UNSIGNED); ++ else if (TYPE_P (t1)) ++ compare_values (TYPE_UNSIGNED); ++ if (TYPE_P (t1)) ++ compare_values (TYPE_ARTIFICIAL); ++ else ++ compare_values (TREE_NO_WARNING); ++ compare_values (TREE_NOTHROW); ++ compare_values (TREE_STATIC); ++ if (code != TREE_BINFO) ++ compare_values (TREE_PRIVATE); ++ compare_values (TREE_PROTECTED); ++ compare_values (TREE_DEPRECATED); ++ if (TYPE_P (t1)) ++ { ++ if (AGGREGATE_TYPE_P (t1)) ++ compare_values (TYPE_REVERSE_STORAGE_ORDER); ++ else ++ compare_values (TYPE_SATURATING); ++ compare_values (TYPE_ADDR_SPACE); ++ } ++ else if (code == SSA_NAME) ++ compare_values (SSA_NAME_IS_DEFAULT_DEF); ++ ++ if (CODE_CONTAINS_STRUCT (code, TS_INT_CST)) ++ { ++ if (wi::to_wide (t1) != wi::to_wide (t2)) ++ return false; ++ } ++ ++ if (CODE_CONTAINS_STRUCT (code, TS_REAL_CST)) ++ { ++ /* ??? No suitable compare routine available. 
*/ ++ REAL_VALUE_TYPE r1 = TREE_REAL_CST (t1); ++ REAL_VALUE_TYPE r2 = TREE_REAL_CST (t2); ++ if (r1.cl != r2.cl ++ || r1.decimal != r2.decimal ++ || r1.sign != r2.sign ++ || r1.signalling != r2.signalling ++ || r1.canonical != r2.canonical ++ || r1.uexp != r2.uexp) ++ return false; ++ for (unsigned i = 0; i < SIGSZ; ++i) ++ if (r1.sig[i] != r2.sig[i]) ++ return false; ++ } ++ ++ if (CODE_CONTAINS_STRUCT (code, TS_FIXED_CST)) ++ if (!fixed_compare (EQ_EXPR, ++ TREE_FIXED_CST_PTR (t1), TREE_FIXED_CST_PTR (t2))) ++ return false; ++ ++ if (CODE_CONTAINS_STRUCT (code, TS_VECTOR)) ++ { ++ compare_values (VECTOR_CST_LOG2_NPATTERNS); ++ compare_values (VECTOR_CST_NELTS_PER_PATTERN); ++ } ++ ++ if (CODE_CONTAINS_STRUCT (code, TS_DECL_COMMON)) ++ { ++ compare_values (DECL_MODE); ++ compare_values (DECL_NONLOCAL); ++ compare_values (DECL_VIRTUAL_P); ++ compare_values (DECL_IGNORED_P); ++ compare_values (DECL_ABSTRACT_P); ++ compare_values (DECL_ARTIFICIAL); ++ compare_values (DECL_USER_ALIGN); ++ compare_values (DECL_PRESERVE_P); ++ compare_values (DECL_EXTERNAL); ++ compare_values (DECL_GIMPLE_REG_P); ++ compare_values (DECL_ALIGN); ++ if (code == LABEL_DECL) ++ { ++ compare_values (EH_LANDING_PAD_NR); ++ compare_values (LABEL_DECL_UID); ++ } ++ else if (code == FIELD_DECL) ++ { ++ compare_values (DECL_PACKED); ++ compare_values (DECL_NONADDRESSABLE_P); ++ compare_values (DECL_PADDING_P); ++ compare_values (DECL_OFFSET_ALIGN); ++ } ++ else if (code == VAR_DECL) ++ { ++ compare_values (DECL_HAS_DEBUG_EXPR_P); ++ compare_values (DECL_NONLOCAL_FRAME); ++ } ++ if (code == RESULT_DECL ++ || code == PARM_DECL ++ || code == VAR_DECL) ++ { ++ compare_values (DECL_BY_REFERENCE); ++ if (code == VAR_DECL ++ || code == PARM_DECL) ++ compare_values (DECL_HAS_VALUE_EXPR_P); ++ } ++ } ++ ++ if (CODE_CONTAINS_STRUCT (code, TS_DECL_WRTL)) ++ compare_values (DECL_REGISTER); ++ ++ if (CODE_CONTAINS_STRUCT (code, TS_DECL_WITH_VIS)) ++ { ++ compare_values (DECL_COMMON); ++ compare_values (DECL_DLLIMPORT_P); ++ compare_values (DECL_WEAK); ++ compare_values (DECL_SEEN_IN_BIND_EXPR_P); ++ compare_values (DECL_COMDAT); ++ compare_values (DECL_VISIBILITY); ++ compare_values (DECL_VISIBILITY_SPECIFIED); ++ if (code == VAR_DECL) ++ { ++ compare_values (DECL_HARD_REGISTER); ++ /* DECL_IN_TEXT_SECTION is set during final asm output only. 
*/ ++ compare_values (DECL_IN_CONSTANT_POOL); ++ } ++ } ++ ++ if (CODE_CONTAINS_STRUCT (code, TS_FUNCTION_DECL)) ++ { ++ compare_values (DECL_BUILT_IN_CLASS); ++ compare_values (DECL_STATIC_CONSTRUCTOR); ++ compare_values (DECL_STATIC_DESTRUCTOR); ++ compare_values (DECL_UNINLINABLE); ++ compare_values (DECL_POSSIBLY_INLINED); ++ compare_values (DECL_IS_NOVOPS); ++ compare_values (DECL_IS_RETURNS_TWICE); ++ compare_values (DECL_IS_MALLOC); ++ compare_values (DECL_IS_OPERATOR_NEW_P); ++ compare_values (DECL_DECLARED_INLINE_P); ++ compare_values (DECL_STATIC_CHAIN); ++ compare_values (DECL_NO_INLINE_WARNING_P); ++ compare_values (DECL_NO_INSTRUMENT_FUNCTION_ENTRY_EXIT); ++ compare_values (DECL_NO_LIMIT_STACK); ++ compare_values (DECL_DISREGARD_INLINE_LIMITS); ++ compare_values (DECL_PURE_P); ++ compare_values (DECL_LOOPING_CONST_OR_PURE_P); ++ compare_values (DECL_FINAL_P); ++ compare_values (DECL_CXX_CONSTRUCTOR_P); ++ compare_values (DECL_CXX_DESTRUCTOR_P); ++ if (DECL_BUILT_IN_CLASS (t1) != NOT_BUILT_IN) ++ compare_values (DECL_UNCHECKED_FUNCTION_CODE); ++ } ++ ++ if (CODE_CONTAINS_STRUCT (code, TS_TYPE_COMMON)) ++ { ++ compare_values (TYPE_MODE); ++ compare_values (TYPE_STRING_FLAG); ++ compare_values (TYPE_NEEDS_CONSTRUCTING); ++ if (RECORD_OR_UNION_TYPE_P (t1)) ++ { ++ compare_values (TYPE_TRANSPARENT_AGGR); ++ compare_values (TYPE_FINAL_P); ++ } ++ else if (code == ARRAY_TYPE) ++ compare_values (TYPE_NONALIASED_COMPONENT); ++ if (AGGREGATE_TYPE_P (t1)) ++ compare_values (TYPE_TYPELESS_STORAGE); ++ compare_values (TYPE_EMPTY_P); ++ compare_values (TYPE_PACKED); ++ compare_values (TYPE_RESTRICT); ++ compare_values (TYPE_USER_ALIGN); ++ compare_values (TYPE_READONLY); ++ compare_values (TYPE_PRECISION); ++ compare_values (TYPE_ALIGN); ++ /* Do not compare TYPE_ALIAS_SET. Doing so introduce ordering issues ++ with calls to get_alias_set which may initialize it for streamed ++ in types. */ ++ } ++ ++ /* We don't want to compare locations, so there is nothing do compare ++ for TS_EXP. */ ++ ++ /* BLOCKs are function local and we don't merge anything there, so ++ simply refuse to merge. 
*/ ++ if (CODE_CONTAINS_STRUCT (code, TS_BLOCK)) ++ return false; ++ ++ if (CODE_CONTAINS_STRUCT (code, TS_TRANSLATION_UNIT_DECL)) ++ if (strcmp (TRANSLATION_UNIT_LANGUAGE (t1), ++ TRANSLATION_UNIT_LANGUAGE (t2)) != 0) ++ return false; ++ ++ if (CODE_CONTAINS_STRUCT (code, TS_TARGET_OPTION)) ++ if (!cl_target_option_eq (TREE_TARGET_OPTION (t1), TREE_TARGET_OPTION (t2))) ++ return false; ++ ++ if (CODE_CONTAINS_STRUCT (code, TS_OPTIMIZATION)) ++ if (!cl_optimization_option_eq (TREE_OPTIMIZATION (t1), ++ TREE_OPTIMIZATION (t2))) ++ return false; ++ ++ if (CODE_CONTAINS_STRUCT (code, TS_BINFO)) ++ if (vec_safe_length (BINFO_BASE_ACCESSES (t1)) ++ != vec_safe_length (BINFO_BASE_ACCESSES (t2))) ++ return false; ++ ++ if (CODE_CONTAINS_STRUCT (code, TS_CONSTRUCTOR)) ++ compare_values (CONSTRUCTOR_NELTS); ++ ++ if (CODE_CONTAINS_STRUCT (code, TS_IDENTIFIER)) ++ if (IDENTIFIER_LENGTH (t1) != IDENTIFIER_LENGTH (t2) ++ || memcmp (IDENTIFIER_POINTER (t1), IDENTIFIER_POINTER (t2), ++ IDENTIFIER_LENGTH (t1)) != 0) ++ return false; ++ ++ if (CODE_CONTAINS_STRUCT (code, TS_STRING)) ++ if (TREE_STRING_LENGTH (t1) != TREE_STRING_LENGTH (t2) ++ || memcmp (TREE_STRING_POINTER (t1), TREE_STRING_POINTER (t2), ++ TREE_STRING_LENGTH (t1)) != 0) ++ return false; ++ ++ if (code == OMP_CLAUSE) ++ { ++ compare_values (OMP_CLAUSE_CODE); ++ switch (OMP_CLAUSE_CODE (t1)) ++ { ++ case OMP_CLAUSE_DEFAULT: ++ compare_values (OMP_CLAUSE_DEFAULT_KIND); ++ break; ++ case OMP_CLAUSE_SCHEDULE: ++ compare_values (OMP_CLAUSE_SCHEDULE_KIND); ++ break; ++ case OMP_CLAUSE_DEPEND: ++ compare_values (OMP_CLAUSE_DEPEND_KIND); ++ break; ++ case OMP_CLAUSE_MAP: ++ compare_values (OMP_CLAUSE_MAP_KIND); ++ break; ++ case OMP_CLAUSE_PROC_BIND: ++ compare_values (OMP_CLAUSE_PROC_BIND_KIND); ++ break; ++ case OMP_CLAUSE_REDUCTION: ++ compare_values (OMP_CLAUSE_REDUCTION_CODE); ++ compare_values (OMP_CLAUSE_REDUCTION_GIMPLE_INIT); ++ compare_values (OMP_CLAUSE_REDUCTION_GIMPLE_MERGE); ++ break; ++ default: ++ break; ++ } ++ } ++ ++#undef compare_values ++ ++ ++ /* Compare pointer fields. */ ++ ++ /* Recurse. Search & Replaced from DFS_write_tree_body. ++ Folding the early checks into the compare_tree_edges recursion ++ macro makes debugging way quicker as you are able to break on ++ compare_tree_sccs_1 and simply finish until a call returns false ++ to spot the SCC members with the difference. */ ++#define compare_tree_edges(E1, E2) \ ++ do { \ ++ tree t1_ = (E1), t2_ = (E2); \ ++ if (t1_ != t2_ \ ++ && (!t1_ || !t2_ \ ++ || !TREE_VISITED (t2_) \ ++ || (!TREE_ASM_WRITTEN (t2_) \ ++ && !compare_tree_sccs_1 (t1_, t2_, map)))) \ ++ return false; \ ++ /* Only non-NULL trees outside of the SCC may compare equal. */ \ ++ gcc_checking_assert (t1_ != t2_ || (!t2_ || !TREE_VISITED (t2_))); \ ++ } while (0) ++ ++ if (CODE_CONTAINS_STRUCT (code, TS_TYPED)) ++ { ++ if (code != IDENTIFIER_NODE) ++ compare_tree_edges (TREE_TYPE (t1), TREE_TYPE (t2)); ++ } ++ ++ if (CODE_CONTAINS_STRUCT (code, TS_VECTOR)) ++ { ++ /* Note that the number of elements for EXPR has already been emitted ++ in EXPR's header (see streamer_write_tree_header). 
*/ ++ unsigned int count = vector_cst_encoded_nelts (t1); ++ for (unsigned int i = 0; i < count; ++i) ++ compare_tree_edges (VECTOR_CST_ENCODED_ELT (t1, i), ++ VECTOR_CST_ENCODED_ELT (t2, i)); ++ } ++ ++ if (CODE_CONTAINS_STRUCT (code, TS_COMPLEX)) ++ { ++ compare_tree_edges (TREE_REALPART (t1), TREE_REALPART (t2)); ++ compare_tree_edges (TREE_IMAGPART (t1), TREE_IMAGPART (t2)); ++ } ++ ++ if (CODE_CONTAINS_STRUCT (code, TS_DECL_MINIMAL)) ++ { ++ compare_tree_edges (DECL_NAME (t1), DECL_NAME (t2)); ++ /* ??? Global decls from different TUs have non-matching ++ TRANSLATION_UNIT_DECLs. Only consider a small set of ++ decls equivalent, we should not end up merging others. */ ++ if ((code == TYPE_DECL ++ || code == NAMESPACE_DECL ++ || code == IMPORTED_DECL ++ || code == CONST_DECL ++ || (VAR_OR_FUNCTION_DECL_P (t1) ++ && (TREE_PUBLIC (t1) || DECL_EXTERNAL (t1)))) ++ && DECL_FILE_SCOPE_P (t1) && DECL_FILE_SCOPE_P (t2)) ++ ; ++ else ++ compare_tree_edges (DECL_CONTEXT (t1), DECL_CONTEXT (t2)); ++ } ++ ++ if (CODE_CONTAINS_STRUCT (code, TS_DECL_COMMON)) ++ { ++ compare_tree_edges (DECL_SIZE (t1), DECL_SIZE (t2)); ++ compare_tree_edges (DECL_SIZE_UNIT (t1), DECL_SIZE_UNIT (t2)); ++ compare_tree_edges (DECL_ATTRIBUTES (t1), DECL_ATTRIBUTES (t2)); ++ compare_tree_edges (DECL_ABSTRACT_ORIGIN (t1), DECL_ABSTRACT_ORIGIN (t2)); ++ if ((code == VAR_DECL ++ || code == PARM_DECL) ++ && DECL_HAS_VALUE_EXPR_P (t1)) ++ compare_tree_edges (DECL_VALUE_EXPR (t1), DECL_VALUE_EXPR (t2)); ++ if (code == VAR_DECL ++ && DECL_HAS_DEBUG_EXPR_P (t1)) ++ compare_tree_edges (DECL_DEBUG_EXPR (t1), DECL_DEBUG_EXPR (t2)); ++ /* LTO specific edges. */ ++ if (code != FUNCTION_DECL ++ && code != TRANSLATION_UNIT_DECL) ++ compare_tree_edges (DECL_INITIAL (t1), DECL_INITIAL (t2)); ++ } ++ ++ if (CODE_CONTAINS_STRUCT (code, TS_DECL_NON_COMMON)) ++ { ++ if (code == FUNCTION_DECL) ++ { ++ tree a1, a2; ++ for (a1 = DECL_ARGUMENTS (t1), a2 = DECL_ARGUMENTS (t2); ++ a1 || a2; ++ a1 = TREE_CHAIN (a1), a2 = TREE_CHAIN (a2)) ++ compare_tree_edges (a1, a2); ++ compare_tree_edges (DECL_RESULT (t1), DECL_RESULT (t2)); ++ } ++ else if (code == TYPE_DECL) ++ compare_tree_edges (DECL_ORIGINAL_TYPE (t1), DECL_ORIGINAL_TYPE (t2)); ++ } ++ ++ if (CODE_CONTAINS_STRUCT (code, TS_DECL_WITH_VIS)) ++ { ++ /* Make sure we don't inadvertently set the assembler name. 
*/ ++ if (DECL_ASSEMBLER_NAME_SET_P (t1)) ++ compare_tree_edges (DECL_ASSEMBLER_NAME (t1), ++ DECL_ASSEMBLER_NAME (t2)); ++ } ++ ++ if (CODE_CONTAINS_STRUCT (code, TS_FIELD_DECL)) ++ { ++ compare_tree_edges (DECL_FIELD_OFFSET (t1), DECL_FIELD_OFFSET (t2)); ++ compare_tree_edges (DECL_BIT_FIELD_TYPE (t1), DECL_BIT_FIELD_TYPE (t2)); ++ compare_tree_edges (DECL_BIT_FIELD_REPRESENTATIVE (t1), ++ DECL_BIT_FIELD_REPRESENTATIVE (t2)); ++ compare_tree_edges (DECL_FIELD_BIT_OFFSET (t1), ++ DECL_FIELD_BIT_OFFSET (t2)); ++ compare_tree_edges (DECL_FCONTEXT (t1), DECL_FCONTEXT (t2)); ++ } ++ ++ if (CODE_CONTAINS_STRUCT (code, TS_FUNCTION_DECL)) ++ { ++ compare_tree_edges (DECL_FUNCTION_PERSONALITY (t1), ++ DECL_FUNCTION_PERSONALITY (t2)); ++ compare_tree_edges (DECL_VINDEX (t1), DECL_VINDEX (t2)); ++ compare_tree_edges (DECL_FUNCTION_SPECIFIC_TARGET (t1), ++ DECL_FUNCTION_SPECIFIC_TARGET (t2)); ++ compare_tree_edges (DECL_FUNCTION_SPECIFIC_OPTIMIZATION (t1), ++ DECL_FUNCTION_SPECIFIC_OPTIMIZATION (t2)); ++ } ++ ++ if (CODE_CONTAINS_STRUCT (code, TS_TYPE_COMMON)) ++ { ++ compare_tree_edges (TYPE_SIZE (t1), TYPE_SIZE (t2)); ++ compare_tree_edges (TYPE_SIZE_UNIT (t1), TYPE_SIZE_UNIT (t2)); ++ compare_tree_edges (TYPE_ATTRIBUTES (t1), TYPE_ATTRIBUTES (t2)); ++ compare_tree_edges (TYPE_NAME (t1), TYPE_NAME (t2)); ++ /* Do not compare TYPE_POINTER_TO or TYPE_REFERENCE_TO. They will be ++ reconstructed during fixup. */ ++ /* Do not compare TYPE_NEXT_VARIANT, we reconstruct the variant lists ++ during fixup. */ ++ compare_tree_edges (TYPE_MAIN_VARIANT (t1), TYPE_MAIN_VARIANT (t2)); ++ /* ??? Global types from different TUs have non-matching ++ TRANSLATION_UNIT_DECLs. Still merge them if they are otherwise ++ equal. */ ++ if (TYPE_FILE_SCOPE_P (t1) && TYPE_FILE_SCOPE_P (t2)) ++ ; ++ else ++ compare_tree_edges (TYPE_CONTEXT (t1), TYPE_CONTEXT (t2)); ++ /* TYPE_CANONICAL is re-computed during type merging, so do not ++ compare it here. */ ++ compare_tree_edges (TYPE_STUB_DECL (t1), TYPE_STUB_DECL (t2)); ++ } ++ ++ if (CODE_CONTAINS_STRUCT (code, TS_TYPE_NON_COMMON)) ++ { ++ if (code == ENUMERAL_TYPE) ++ compare_tree_edges (TYPE_VALUES (t1), TYPE_VALUES (t2)); ++ else if (code == ARRAY_TYPE) ++ compare_tree_edges (TYPE_DOMAIN (t1), TYPE_DOMAIN (t2)); ++ else if (RECORD_OR_UNION_TYPE_P (t1)) ++ { ++ tree f1, f2; ++ for (f1 = TYPE_FIELDS (t1), f2 = TYPE_FIELDS (t2); ++ f1 || f2; ++ f1 = TREE_CHAIN (f1), f2 = TREE_CHAIN (f2)) ++ compare_tree_edges (f1, f2); ++ } ++ else if (code == FUNCTION_TYPE ++ || code == METHOD_TYPE) ++ compare_tree_edges (TYPE_ARG_TYPES (t1), TYPE_ARG_TYPES (t2)); ++ ++ if (!POINTER_TYPE_P (t1)) ++ compare_tree_edges (TYPE_MIN_VALUE_RAW (t1), TYPE_MIN_VALUE_RAW (t2)); ++ compare_tree_edges (TYPE_MAX_VALUE_RAW (t1), TYPE_MAX_VALUE_RAW (t2)); ++ } ++ ++ if (CODE_CONTAINS_STRUCT (code, TS_LIST)) ++ { ++ compare_tree_edges (TREE_PURPOSE (t1), TREE_PURPOSE (t2)); ++ compare_tree_edges (TREE_VALUE (t1), TREE_VALUE (t2)); ++ compare_tree_edges (TREE_CHAIN (t1), TREE_CHAIN (t2)); ++ } ++ ++ if (CODE_CONTAINS_STRUCT (code, TS_VEC)) ++ for (int i = 0; i < TREE_VEC_LENGTH (t1); i++) ++ compare_tree_edges (TREE_VEC_ELT (t1, i), TREE_VEC_ELT (t2, i)); ++ ++ if (CODE_CONTAINS_STRUCT (code, TS_EXP)) ++ { ++ for (int i = 0; i < TREE_OPERAND_LENGTH (t1); i++) ++ compare_tree_edges (TREE_OPERAND (t1, i), ++ TREE_OPERAND (t2, i)); ++ ++ /* BLOCKs are function local and we don't merge anything there. 
*/ ++ if (TREE_BLOCK (t1) || TREE_BLOCK (t2)) ++ return false; ++ } ++ ++ if (CODE_CONTAINS_STRUCT (code, TS_BINFO)) ++ { ++ unsigned i; ++ tree t; ++ /* Lengths have already been compared above. */ ++ FOR_EACH_VEC_ELT (*BINFO_BASE_BINFOS (t1), i, t) ++ compare_tree_edges (t, BINFO_BASE_BINFO (t2, i)); ++ FOR_EACH_VEC_SAFE_ELT (BINFO_BASE_ACCESSES (t1), i, t) ++ compare_tree_edges (t, BINFO_BASE_ACCESS (t2, i)); ++ compare_tree_edges (BINFO_OFFSET (t1), BINFO_OFFSET (t2)); ++ compare_tree_edges (BINFO_VTABLE (t1), BINFO_VTABLE (t2)); ++ compare_tree_edges (BINFO_VPTR_FIELD (t1), BINFO_VPTR_FIELD (t2)); ++ /* Do not walk BINFO_INHERITANCE_CHAIN, BINFO_SUBVTT_INDEX ++ and BINFO_VPTR_INDEX; these are used by C++ FE only. */ ++ } ++ ++ if (CODE_CONTAINS_STRUCT (code, TS_CONSTRUCTOR)) ++ { ++ unsigned i; ++ tree index, value; ++ /* Lengths have already been compared above. */ ++ FOR_EACH_CONSTRUCTOR_ELT (CONSTRUCTOR_ELTS (t1), i, index, value) ++ { ++ compare_tree_edges (index, CONSTRUCTOR_ELT (t2, i)->index); ++ compare_tree_edges (value, CONSTRUCTOR_ELT (t2, i)->value); ++ } ++ } ++ ++ if (code == OMP_CLAUSE) ++ { ++ int i; ++ ++ for (i = 0; i < omp_clause_num_ops[OMP_CLAUSE_CODE (t1)]; i++) ++ compare_tree_edges (OMP_CLAUSE_OPERAND (t1, i), ++ OMP_CLAUSE_OPERAND (t2, i)); ++ compare_tree_edges (OMP_CLAUSE_CHAIN (t1), OMP_CLAUSE_CHAIN (t2)); ++ } ++ ++#undef compare_tree_edges ++ ++ return true; ++} ++ ++/* Compare the tree scc SCC to the prevailing candidate PSCC, filling ++ out MAP if they are equal. */ ++ ++static bool ++compare_tree_sccs (tree_scc *pscc, tree_scc *scc, ++ tree *map) ++{ ++ /* Assume SCC entry hashes are sorted after their cardinality. Which ++ means we can simply take the first n-tuple of equal hashes ++ (which is recorded as entry_len) and do n SCC entry candidate ++ comparisons. */ ++ for (unsigned i = 0; i < pscc->entry_len; ++i) ++ { ++ tree *mapp = map; ++ num_scc_compare_collisions++; ++ if (compare_tree_sccs_1 (pscc->entries[0], scc->entries[i], &mapp)) ++ { ++ /* Equal - no need to reset TREE_VISITED or TREE_ASM_WRITTEN ++ on the scc as all trees will be freed. */ ++ return true; ++ } ++ /* Reset TREE_ASM_WRITTEN on scc for the next compare or in case ++ the SCC prevails. */ ++ for (unsigned j = 0; j < scc->len; ++j) ++ TREE_ASM_WRITTEN (scc->entries[j]) = 0; ++ } ++ ++ return false; ++} ++ ++/* QSort sort function to sort a map of two pointers after the 2nd ++ pointer. */ ++ ++static int ++cmp_tree (const void *p1_, const void *p2_) ++{ ++ tree *p1 = (tree *)(const_cast(p1_)); ++ tree *p2 = (tree *)(const_cast(p2_)); ++ if (p1[1] == p2[1]) ++ return 0; ++ return ((uintptr_t)p1[1] < (uintptr_t)p2[1]) ? -1 : 1; ++} ++ ++/* Try to unify the SCC with nodes FROM to FROM + LEN in CACHE and ++ hash value SCC_HASH with an already recorded SCC. Return true if ++ that was successful, otherwise return false. */ ++ ++static bool ++unify_scc (struct data_in *data_in, unsigned from, ++ unsigned len, unsigned scc_entry_len, hashval_t scc_hash) ++{ ++ bool unified_p = false; ++ struct streamer_tree_cache_d *cache = data_in->reader_cache; ++ tree_scc *scc ++ = (tree_scc *) alloca (sizeof (tree_scc) + (len - 1) * sizeof (tree)); ++ scc->next = NULL; ++ scc->hash = scc_hash; ++ scc->len = len; ++ scc->entry_len = scc_entry_len; ++ for (unsigned i = 0; i < len; ++i) ++ { ++ tree t = streamer_tree_cache_get_tree (cache, from + i); ++ scc->entries[i] = t; ++ /* Do not merge SCCs with local entities inside them. Also do ++ not merge TRANSLATION_UNIT_DECLs. 
*/ ++ if (TREE_CODE (t) == TRANSLATION_UNIT_DECL ++ || (VAR_OR_FUNCTION_DECL_P (t) ++ && !(TREE_PUBLIC (t) || DECL_EXTERNAL (t))) ++ || TREE_CODE (t) == LABEL_DECL) ++ { ++ /* Avoid doing any work for these cases and do not worry to ++ record the SCCs for further merging. */ ++ return false; ++ } ++ } ++ ++ /* Look for the list of candidate SCCs to compare against. */ ++ tree_scc **slot; ++ slot = tree_scc_hash->find_slot_with_hash (scc, scc_hash, INSERT); ++ if (*slot) ++ { ++ /* Try unifying against each candidate. */ ++ num_scc_compares++; ++ ++ /* Set TREE_VISITED on the scc so we can easily identify tree nodes ++ outside of the scc when following tree edges. Make sure ++ that TREE_ASM_WRITTEN is unset so we can use it as 2nd bit ++ to track whether we visited the SCC member during the compare. ++ We cannot use TREE_VISITED on the pscc members as the extended ++ scc and pscc can overlap. */ ++ for (unsigned i = 0; i < scc->len; ++i) ++ { ++ TREE_VISITED (scc->entries[i]) = 1; ++ gcc_checking_assert (!TREE_ASM_WRITTEN (scc->entries[i])); ++ } ++ ++ tree *map = XALLOCAVEC (tree, 2 * len); ++ for (tree_scc *pscc = *slot; pscc; pscc = pscc->next) ++ { ++ if (!compare_tree_sccs (pscc, scc, map)) ++ continue; ++ ++ /* Found an equal SCC. */ ++ unified_p = true; ++ num_scc_compare_collisions--; ++ num_sccs_merged++; ++ total_scc_size_merged += len; ++ ++ if (flag_checking) ++ for (unsigned i = 0; i < len; ++i) ++ { ++ tree t = map[2*i+1]; ++ enum tree_code code = TREE_CODE (t); ++ /* IDENTIFIER_NODEs should be singletons and are merged by the ++ streamer. The others should be singletons, too, and we ++ should not merge them in any way. */ ++ gcc_assert (code != TRANSLATION_UNIT_DECL ++ && code != IDENTIFIER_NODE); ++ } ++ ++ /* Fixup the streamer cache with the prevailing nodes according ++ to the tree node mapping computed by compare_tree_sccs. */ ++ if (len == 1) ++ { ++ /* If we got a debug reference queued, see if the prevailing ++ tree has a debug reference and if not, register the one ++ for the tree we are about to throw away. */ ++ if (dref_queue.length () == 1) ++ { ++ dref_entry e = dref_queue.pop (); ++ gcc_assert (e.decl ++ == streamer_tree_cache_get_tree (cache, from)); ++ const char *sym; ++ unsigned HOST_WIDE_INT off; ++ if (!debug_hooks->die_ref_for_decl (pscc->entries[0], &sym, ++ &off)) ++ debug_hooks->register_external_die (pscc->entries[0], ++ e.sym, e.off); ++ } ++ lto_maybe_register_decl (data_in, pscc->entries[0], from); ++ streamer_tree_cache_replace_tree (cache, pscc->entries[0], from); ++ } ++ else ++ { ++ tree *map2 = XALLOCAVEC (tree, 2 * len); ++ for (unsigned i = 0; i < len; ++i) ++ { ++ map2[i*2] = (tree)(uintptr_t)(from + i); ++ map2[i*2+1] = scc->entries[i]; ++ } ++ qsort (map2, len, 2 * sizeof (tree), cmp_tree); ++ qsort (map, len, 2 * sizeof (tree), cmp_tree); ++ for (unsigned i = 0; i < len; ++i) ++ { ++ lto_maybe_register_decl (data_in, map[2*i], ++ (uintptr_t)map2[2*i]); ++ streamer_tree_cache_replace_tree (cache, map[2*i], ++ (uintptr_t)map2[2*i]); ++ } ++ } ++ ++ /* Free the tree nodes from the read SCC. */ ++ data_in->location_cache.revert_location_cache (); ++ for (unsigned i = 0; i < len; ++i) ++ { ++ if (TYPE_P (scc->entries[i])) ++ num_merged_types++; ++ free_node (scc->entries[i]); ++ } ++ ++ /* Drop DIE references. ++ ??? Do as in the size-one SCC case which involves sorting ++ the queue. */ ++ dref_queue.truncate (0); ++ ++ break; ++ } ++ ++ /* Reset TREE_VISITED if we didn't unify the SCC with another. 
*/ ++ if (!unified_p) ++ for (unsigned i = 0; i < scc->len; ++i) ++ TREE_VISITED (scc->entries[i]) = 0; ++ } ++ ++ /* If we didn't unify it to any candidate duplicate the relevant ++ pieces to permanent storage and link it into the chain. */ ++ if (!unified_p) ++ { ++ tree_scc *pscc ++ = XOBNEWVAR (&tree_scc_hash_obstack, tree_scc, sizeof (tree_scc)); ++ memcpy (pscc, scc, sizeof (tree_scc)); ++ pscc->next = (*slot); ++ *slot = pscc; ++ } ++ return unified_p; ++} ++ ++ ++/* Read all the symbols from buffer DATA, using descriptors in DECL_DATA. ++ RESOLUTIONS is the set of symbols picked by the linker (read from the ++ resolution file when the linker plugin is being used). */ ++ ++static void ++lto_read_decls (struct lto_file_decl_data *decl_data, const void *data, ++ vec resolutions) ++{ ++ const struct lto_decl_header *header = (const struct lto_decl_header *) data; ++ const int decl_offset = sizeof (struct lto_decl_header); ++ const int main_offset = decl_offset + header->decl_state_size; ++ const int string_offset = main_offset + header->main_size; ++ struct data_in *data_in; ++ unsigned int i; ++ const uint32_t *data_ptr, *data_end; ++ uint32_t num_decl_states; ++ ++ lto_input_block ib_main ((const char *) data + main_offset, ++ header->main_size, decl_data->mode_table); ++ ++ data_in = lto_data_in_create (decl_data, (const char *) data + string_offset, ++ header->string_size, resolutions); ++ ++ /* We do not uniquify the pre-loaded cache entries, those are middle-end ++ internal types that should not be merged. */ ++ ++ /* Read the global declarations and types. */ ++ while (ib_main.p < ib_main.len) ++ { ++ tree t; ++ unsigned from = data_in->reader_cache->nodes.length (); ++ /* Read and uniquify SCCs as in the input stream. */ ++ enum LTO_tags tag = streamer_read_record_start (&ib_main); ++ if (tag == LTO_tree_scc) ++ { ++ unsigned len_; ++ unsigned scc_entry_len; ++ hashval_t scc_hash = lto_input_scc (&ib_main, data_in, &len_, ++ &scc_entry_len); ++ unsigned len = data_in->reader_cache->nodes.length () - from; ++ gcc_assert (len == len_); ++ ++ total_scc_size += len; ++ num_sccs_read++; ++ ++ /* We have the special case of size-1 SCCs that are pre-merged ++ by means of identifier and string sharing for example. ++ ??? Maybe we should avoid streaming those as SCCs. */ ++ tree first = streamer_tree_cache_get_tree (data_in->reader_cache, ++ from); ++ if (len == 1 ++ && (TREE_CODE (first) == IDENTIFIER_NODE ++ || (TREE_CODE (first) == INTEGER_CST ++ && !TREE_OVERFLOW (first)))) ++ continue; ++ ++ /* Try to unify the SCC with already existing ones. */ ++ if (!flag_ltrans ++ && unify_scc (data_in, from, ++ len, scc_entry_len, scc_hash)) ++ continue; ++ ++ /* Tree merging failed, mark entries in location cache as ++ permanent. */ ++ data_in->location_cache.accept_location_cache (); ++ ++ bool seen_type = false; ++ for (unsigned i = 0; i < len; ++i) ++ { ++ tree t = streamer_tree_cache_get_tree (data_in->reader_cache, ++ from + i); ++ /* Reconstruct the type variant and pointer-to/reference-to ++ chains. */ ++ if (TYPE_P (t)) ++ { ++ seen_type = true; ++ num_prevailing_types++; ++ lto_fixup_prevailing_type (t); ++ ++ /* Compute the canonical type of all types. ++ Because SCC components are streamed in random (hash) order ++ we may have encountered the type before while registering ++ type canonical of a derived type in the same SCC. 
*/ ++ if (!TYPE_CANONICAL (t)) ++ gimple_register_canonical_type (t); ++ if (TYPE_MAIN_VARIANT (t) == t && odr_type_p (t)) ++ register_odr_type (t); ++ } ++ /* Link shared INTEGER_CSTs into TYPE_CACHED_VALUEs of its ++ type which is also member of this SCC. */ ++ if (TREE_CODE (t) == INTEGER_CST ++ && !TREE_OVERFLOW (t)) ++ cache_integer_cst (t); ++ if (!flag_ltrans) ++ { ++ lto_maybe_register_decl (data_in, t, from + i); ++ /* Scan the tree for references to global functions or ++ variables and record those for later fixup. */ ++ if (mentions_vars_p (t)) ++ vec_safe_push (tree_with_vars, t); ++ } ++ } ++ ++ /* Register DECLs with the debuginfo machinery. */ ++ while (!dref_queue.is_empty ()) ++ { ++ dref_entry e = dref_queue.pop (); ++ debug_hooks->register_external_die (e.decl, e.sym, e.off); ++ } ++ ++ if (seen_type) ++ num_type_scc_trees += len; ++ } ++ else ++ { ++ /* Pickle stray references. */ ++ t = lto_input_tree_1 (&ib_main, data_in, tag, 0); ++ gcc_assert (t && data_in->reader_cache->nodes.length () == from); ++ } ++ } ++ data_in->location_cache.apply_location_cache (); ++ ++ /* Read in lto_in_decl_state objects. */ ++ data_ptr = (const uint32_t *) ((const char*) data + decl_offset); ++ data_end = ++ (const uint32_t *) ((const char*) data_ptr + header->decl_state_size); ++ num_decl_states = *data_ptr++; ++ ++ gcc_assert (num_decl_states > 0); ++ decl_data->global_decl_state = lto_new_in_decl_state (); ++ data_ptr = lto_read_in_decl_state (data_in, data_ptr, ++ decl_data->global_decl_state); ++ ++ /* Read in per-function decl states and enter them in hash table. */ ++ decl_data->function_decl_states = ++ hash_table::create_ggc (37); ++ ++ for (i = 1; i < num_decl_states; i++) ++ { ++ struct lto_in_decl_state *state = lto_new_in_decl_state (); ++ ++ data_ptr = lto_read_in_decl_state (data_in, data_ptr, state); ++ lto_in_decl_state **slot ++ = decl_data->function_decl_states->find_slot (state, INSERT); ++ gcc_assert (*slot == NULL); ++ *slot = state; ++ } ++ ++ if (data_ptr != data_end) ++ internal_error ("bytecode stream: garbage at the end of symbols section"); ++ ++ /* Set the current decl state to be the global state. */ ++ decl_data->current_decl_state = decl_data->global_decl_state; ++ ++ lto_data_in_delete (data_in); ++} ++ ++/* Custom version of strtoll, which is not portable. */ ++ ++static int64_t ++lto_parse_hex (const char *p) ++{ ++ int64_t ret = 0; ++ ++ for (; *p != '\0'; ++p) ++ { ++ char c = *p; ++ unsigned char part; ++ ret <<= 4; ++ if (c >= '0' && c <= '9') ++ part = c - '0'; ++ else if (c >= 'a' && c <= 'f') ++ part = c - 'a' + 10; ++ else if (c >= 'A' && c <= 'F') ++ part = c - 'A' + 10; ++ else ++ internal_error ("could not parse hex number"); ++ ret |= part; ++ } ++ ++ return ret; ++} ++ ++/* Read resolution for file named FILE_NAME. The resolution is read from ++ RESOLUTION. */ ++ ++static void ++lto_resolution_read (splay_tree file_ids, FILE *resolution, lto_file *file) ++{ ++ /* We require that objects in the resolution file are in the same ++ order as the lto1 command line. */ ++ unsigned int name_len; ++ char *obj_name; ++ unsigned int num_symbols; ++ unsigned int i; ++ struct lto_file_decl_data *file_data; ++ splay_tree_node nd = NULL; ++ ++ if (!resolution) ++ return; ++ ++ name_len = strlen (file->filename); ++ obj_name = XNEWVEC (char, name_len + 1); ++ fscanf (resolution, " "); /* Read white space. 
*/ ++ ++ fread (obj_name, sizeof (char), name_len, resolution); ++ obj_name[name_len] = '\0'; ++ if (filename_cmp (obj_name, file->filename) != 0) ++ internal_error ("unexpected file name %s in linker resolution file. " ++ "Expected %s", obj_name, file->filename); ++ if (file->offset != 0) ++ { ++ int t; ++ char offset_p[17]; ++ int64_t offset; ++ t = fscanf (resolution, "@0x%16s", offset_p); ++ if (t != 1) ++ internal_error ("could not parse file offset"); ++ offset = lto_parse_hex (offset_p); ++ if (offset != file->offset) ++ internal_error ("unexpected offset"); ++ } ++ ++ free (obj_name); ++ ++ fscanf (resolution, "%u", &num_symbols); ++ ++ for (i = 0; i < num_symbols; i++) ++ { ++ int t; ++ unsigned index; ++ unsigned HOST_WIDE_INT id; ++ char r_str[27]; ++ enum ld_plugin_symbol_resolution r = (enum ld_plugin_symbol_resolution) 0; ++ unsigned int j; ++ unsigned int lto_resolution_str_len = ++ sizeof (lto_resolution_str) / sizeof (char *); ++ res_pair rp; ++ ++ t = fscanf (resolution, "%u " HOST_WIDE_INT_PRINT_HEX_PURE " %26s %*[^\n]\n", ++ &index, &id, r_str); ++ if (t != 3) ++ internal_error ("invalid line in the resolution file"); ++ ++ for (j = 0; j < lto_resolution_str_len; j++) ++ { ++ if (strcmp (lto_resolution_str[j], r_str) == 0) ++ { ++ r = (enum ld_plugin_symbol_resolution) j; ++ break; ++ } ++ } ++ if (j == lto_resolution_str_len) ++ internal_error ("invalid resolution in the resolution file"); ++ ++ if (!(nd && lto_splay_tree_id_equal_p (nd->key, id))) ++ { ++ nd = lto_splay_tree_lookup (file_ids, id); ++ if (nd == NULL) ++ internal_error ("resolution sub id %wx not in object file", id); ++ } ++ ++ file_data = (struct lto_file_decl_data *)nd->value; ++ /* The indexes are very sparse. To save memory save them in a compact ++ format that is only unpacked later when the subfile is processed. */ ++ rp.res = r; ++ rp.index = index; ++ file_data->respairs.safe_push (rp); ++ if (file_data->max_index < index) ++ file_data->max_index = index; ++ } ++} ++ ++/* List of file_decl_datas */ ++struct file_data_list ++ { ++ struct lto_file_decl_data *first, *last; ++ }; ++ ++/* Is the name for a id'ed LTO section? */ ++ ++static int ++lto_section_with_id (const char *name, unsigned HOST_WIDE_INT *id) ++{ ++ const char *s; ++ ++ if (strncmp (name, section_name_prefix, strlen (section_name_prefix))) ++ return 0; ++ s = strrchr (name, '.'); ++ if (!s) ++ return 0; ++ /* If the section is not suffixed with an ID return. */ ++ if ((size_t)(s - name) == strlen (section_name_prefix)) ++ return 0; ++ return sscanf (s, "." 
HOST_WIDE_INT_PRINT_HEX_PURE, id) == 1; ++} ++ ++/* Create file_data of each sub file id */ ++ ++static int ++create_subid_section_table (struct lto_section_slot *ls, splay_tree file_ids, ++ struct file_data_list *list) ++{ ++ struct lto_section_slot s_slot, *new_slot; ++ unsigned HOST_WIDE_INT id; ++ splay_tree_node nd; ++ void **hash_slot; ++ char *new_name; ++ struct lto_file_decl_data *file_data; ++ ++ if (!lto_section_with_id (ls->name, &id)) ++ return 1; ++ ++ /* Find hash table of sub module id */ ++ nd = lto_splay_tree_lookup (file_ids, id); ++ if (nd != NULL) ++ { ++ file_data = (struct lto_file_decl_data *)nd->value; ++ } ++ else ++ { ++ file_data = ggc_alloc (); ++ memset(file_data, 0, sizeof (struct lto_file_decl_data)); ++ file_data->id = id; ++ file_data->section_hash_table = lto_obj_create_section_hash_table (); ++ lto_splay_tree_insert (file_ids, id, file_data); ++ ++ /* Maintain list in linker order */ ++ if (!list->first) ++ list->first = file_data; ++ if (list->last) ++ list->last->next = file_data; ++ list->last = file_data; ++ } ++ ++ /* Copy section into sub module hash table */ ++ new_name = XDUPVEC (char, ls->name, strlen (ls->name) + 1); ++ s_slot.name = new_name; ++ hash_slot = htab_find_slot (file_data->section_hash_table, &s_slot, INSERT); ++ gcc_assert (*hash_slot == NULL); ++ ++ new_slot = XDUP (struct lto_section_slot, ls); ++ new_slot->name = new_name; ++ *hash_slot = new_slot; ++ return 1; ++} ++ ++/* Read declarations and other initializations for a FILE_DATA. */ ++ ++static void ++lto_file_finalize (struct lto_file_decl_data *file_data, lto_file *file) ++{ ++ const char *data; ++ size_t len; ++ vec ++ resolutions = vNULL; ++ int i; ++ res_pair *rp; ++ ++ /* Create vector for fast access of resolution. We do this lazily ++ to save memory. */ ++ resolutions.safe_grow_cleared (file_data->max_index + 1); ++ for (i = 0; file_data->respairs.iterate (i, &rp); i++) ++ resolutions[rp->index] = rp->res; ++ file_data->respairs.release (); ++ ++ file_data->renaming_hash_table = lto_create_renaming_table (); ++ file_data->file_name = file->filename; ++#ifdef ACCEL_COMPILER ++ lto_input_mode_table (file_data); ++#else ++ file_data->mode_table = lto_mode_identity_table; ++#endif ++ data = lto_get_section_data (file_data, LTO_section_decls, NULL, &len); ++ if (data == NULL) ++ { ++ internal_error ("cannot read LTO decls from %s", file_data->file_name); ++ return; ++ } ++ /* Frees resolutions */ ++ lto_read_decls (file_data, data, resolutions); ++ lto_free_section_data (file_data, LTO_section_decls, NULL, data, len); ++} ++ ++/* Finalize FILE_DATA in FILE and increase COUNT. */ ++ ++static int ++lto_create_files_from_ids (lto_file *file, struct lto_file_decl_data *file_data, ++ int *count) ++{ ++ lto_file_finalize (file_data, file); ++ if (symtab->dump_file) ++ fprintf (symtab->dump_file, ++ "Creating file %s with sub id " HOST_WIDE_INT_PRINT_HEX "\n", ++ file_data->file_name, file_data->id); ++ (*count)++; ++ return 0; ++} ++ ++/* Generate a TREE representation for all types and external decls ++ entities in FILE. ++ ++ Read all of the globals out of the file. Then read the cgraph ++ and process the .o index into the cgraph nodes so that it can open ++ the .o file to load the functions and ipa information. 
*/ ++ ++static struct lto_file_decl_data * ++lto_file_read (lto_file *file, FILE *resolution_file, int *count) ++{ ++ struct lto_file_decl_data *file_data = NULL; ++ splay_tree file_ids; ++ htab_t section_hash_table; ++ struct lto_section_slot *section; ++ struct file_data_list file_list; ++ struct lto_section_list section_list; ++ ++ memset (§ion_list, 0, sizeof (struct lto_section_list)); ++ section_hash_table = lto_obj_build_section_table (file, §ion_list); ++ ++ /* Find all sub modules in the object and put their sections into new hash ++ tables in a splay tree. */ ++ file_ids = lto_splay_tree_new (); ++ memset (&file_list, 0, sizeof (struct file_data_list)); ++ for (section = section_list.first; section != NULL; section = section->next) ++ create_subid_section_table (section, file_ids, &file_list); ++ ++ /* Add resolutions to file ids */ ++ lto_resolution_read (file_ids, resolution_file, file); ++ ++ /* Finalize each lto file for each submodule in the merged object */ ++ for (file_data = file_list.first; file_data != NULL; file_data = file_data->next) ++ lto_create_files_from_ids (file, file_data, count); ++ ++ splay_tree_delete (file_ids); ++ htab_delete (section_hash_table); ++ ++ return file_list.first; ++} ++ ++#if HAVE_MMAP_FILE && HAVE_SYSCONF && defined _SC_PAGE_SIZE ++#define LTO_MMAP_IO 1 ++#endif ++ ++#if LTO_MMAP_IO ++/* Page size of machine is used for mmap and munmap calls. */ ++static size_t page_mask; ++#endif ++ ++/* Get the section data of length LEN from FILENAME starting at ++ OFFSET. The data segment must be freed by the caller when the ++ caller is finished. Returns NULL if all was not well. */ ++ ++static char * ++lto_read_section_data (struct lto_file_decl_data *file_data, ++ intptr_t offset, size_t len) ++{ ++ char *result; ++ static int fd = -1; ++ static char *fd_name; ++#if LTO_MMAP_IO ++ intptr_t computed_len; ++ intptr_t computed_offset; ++ intptr_t diff; ++#endif ++ ++ /* Keep a single-entry file-descriptor cache. The last file we ++ touched will get closed at exit. ++ ??? Eventually we want to add a more sophisticated larger cache ++ or rather fix function body streaming to not stream them in ++ practically random order. */ ++ if (fd != -1 ++ && filename_cmp (fd_name, file_data->file_name) != 0) ++ { ++ free (fd_name); ++ close (fd); ++ fd = -1; ++ } ++ if (fd == -1) ++ { ++ fd = open (file_data->file_name, O_RDONLY|O_BINARY); ++ if (fd == -1) ++ { ++ fatal_error (input_location, "Cannot open %s", file_data->file_name); ++ return NULL; ++ } ++ fd_name = xstrdup (file_data->file_name); ++ } ++ ++#if LTO_MMAP_IO ++ if (!page_mask) ++ { ++ size_t page_size = sysconf (_SC_PAGE_SIZE); ++ page_mask = ~(page_size - 1); ++ } ++ ++ computed_offset = offset & page_mask; ++ diff = offset - computed_offset; ++ computed_len = len + diff; ++ ++ result = (char *) mmap (NULL, computed_len, PROT_READ, MAP_PRIVATE, ++ fd, computed_offset); ++ if (result == MAP_FAILED) ++ { ++ fatal_error (input_location, "Cannot map %s", file_data->file_name); ++ return NULL; ++ } ++ ++ return result + diff; ++#else ++ result = (char *) xmalloc (len); ++ if (lseek (fd, offset, SEEK_SET) != offset ++ || read (fd, result, len) != (ssize_t) len) ++ { ++ free (result); ++ fatal_error (input_location, "Cannot read %s", file_data->file_name); ++ result = NULL; ++ } ++#ifdef __MINGW32__ ++ /* Native windows doesn't supports delayed unlink on opened file. So ++ we close file here again. This produces higher I/O load, but at least ++ it prevents to have dangling file handles preventing unlink. 
*/ ++ free (fd_name); ++ fd_name = NULL; ++ close (fd); ++ fd = -1; ++#endif ++ return result; ++#endif ++} ++ ++ ++/* Get the section data from FILE_DATA of SECTION_TYPE with NAME. ++ NAME will be NULL unless the section type is for a function ++ body. */ ++ ++static const char * ++get_section_data (struct lto_file_decl_data *file_data, ++ enum lto_section_type section_type, ++ const char *name, ++ size_t *len) ++{ ++ htab_t section_hash_table = file_data->section_hash_table; ++ struct lto_section_slot *f_slot; ++ struct lto_section_slot s_slot; ++ const char *section_name = lto_get_section_name (section_type, name, file_data); ++ char *data = NULL; ++ ++ *len = 0; ++ s_slot.name = section_name; ++ f_slot = (struct lto_section_slot *) htab_find (section_hash_table, &s_slot); ++ if (f_slot) ++ { ++ data = lto_read_section_data (file_data, f_slot->start, f_slot->len); ++ *len = f_slot->len; ++ } ++ ++ free (CONST_CAST (char *, section_name)); ++ return data; ++} ++ ++ ++/* Free the section data from FILE_DATA of SECTION_TYPE with NAME that ++ starts at OFFSET and has LEN bytes. */ ++ ++static void ++free_section_data (struct lto_file_decl_data *file_data ATTRIBUTE_UNUSED, ++ enum lto_section_type section_type ATTRIBUTE_UNUSED, ++ const char *name ATTRIBUTE_UNUSED, ++ const char *offset, size_t len ATTRIBUTE_UNUSED) ++{ ++#if LTO_MMAP_IO ++ intptr_t computed_len; ++ intptr_t computed_offset; ++ intptr_t diff; ++#endif ++ ++#if LTO_MMAP_IO ++ computed_offset = ((intptr_t) offset) & page_mask; ++ diff = (intptr_t) offset - computed_offset; ++ computed_len = len + diff; ++ ++ munmap ((caddr_t) computed_offset, computed_len); ++#else ++ free (CONST_CAST(char *, offset)); ++#endif ++} ++ ++static lto_file *current_lto_file; ++ ++/* If TT is a variable or function decl replace it with its ++ prevailing variant. */ ++#define LTO_SET_PREVAIL(tt) \ ++ do {\ ++ if ((tt) && VAR_OR_FUNCTION_DECL_P (tt) \ ++ && (TREE_PUBLIC (tt) || DECL_EXTERNAL (tt))) \ ++ { \ ++ tt = lto_symtab_prevailing_decl (tt); \ ++ fixed = true; \ ++ } \ ++ } while (0) ++ ++/* Ensure that TT isn't a replacable var of function decl. */ ++#define LTO_NO_PREVAIL(tt) \ ++ gcc_checking_assert (!(tt) || !VAR_OR_FUNCTION_DECL_P (tt)) ++ ++/* Given a tree T replace all fields referring to variables or functions ++ with their prevailing variant. */ ++static void ++lto_fixup_prevailing_decls (tree t) ++{ ++ enum tree_code code = TREE_CODE (t); ++ bool fixed = false; ++ ++ gcc_checking_assert (code != TREE_BINFO); ++ LTO_NO_PREVAIL (TREE_TYPE (t)); ++ if (CODE_CONTAINS_STRUCT (code, TS_COMMON) ++ /* lto_symtab_prevail_decl use TREE_CHAIN to link to the prevailing decl. ++ in the case T is a prevailed declaration we would ICE here. 
*/ ++ && !VAR_OR_FUNCTION_DECL_P (t)) ++ LTO_NO_PREVAIL (TREE_CHAIN (t)); ++ if (DECL_P (t)) ++ { ++ LTO_NO_PREVAIL (DECL_NAME (t)); ++ LTO_SET_PREVAIL (DECL_CONTEXT (t)); ++ if (CODE_CONTAINS_STRUCT (code, TS_DECL_COMMON)) ++ { ++ LTO_SET_PREVAIL (DECL_SIZE (t)); ++ LTO_SET_PREVAIL (DECL_SIZE_UNIT (t)); ++ LTO_SET_PREVAIL (DECL_INITIAL (t)); ++ LTO_NO_PREVAIL (DECL_ATTRIBUTES (t)); ++ LTO_SET_PREVAIL (DECL_ABSTRACT_ORIGIN (t)); ++ } ++ if (CODE_CONTAINS_STRUCT (code, TS_DECL_WITH_VIS)) ++ { ++ LTO_NO_PREVAIL (DECL_ASSEMBLER_NAME_RAW (t)); ++ } ++ if (CODE_CONTAINS_STRUCT (code, TS_DECL_NON_COMMON)) ++ { ++ LTO_NO_PREVAIL (DECL_RESULT_FLD (t)); ++ } ++ if (CODE_CONTAINS_STRUCT (code, TS_FUNCTION_DECL)) ++ { ++ LTO_NO_PREVAIL (DECL_ARGUMENTS (t)); ++ LTO_SET_PREVAIL (DECL_FUNCTION_PERSONALITY (t)); ++ LTO_NO_PREVAIL (DECL_VINDEX (t)); ++ } ++ if (CODE_CONTAINS_STRUCT (code, TS_FIELD_DECL)) ++ { ++ LTO_SET_PREVAIL (DECL_FIELD_OFFSET (t)); ++ LTO_NO_PREVAIL (DECL_BIT_FIELD_TYPE (t)); ++ LTO_NO_PREVAIL (DECL_QUALIFIER (t)); ++ LTO_NO_PREVAIL (DECL_FIELD_BIT_OFFSET (t)); ++ LTO_NO_PREVAIL (DECL_FCONTEXT (t)); ++ } ++ } ++ else if (TYPE_P (t)) ++ { ++ LTO_NO_PREVAIL (TYPE_CACHED_VALUES (t)); ++ LTO_SET_PREVAIL (TYPE_SIZE (t)); ++ LTO_SET_PREVAIL (TYPE_SIZE_UNIT (t)); ++ LTO_NO_PREVAIL (TYPE_ATTRIBUTES (t)); ++ LTO_NO_PREVAIL (TYPE_NAME (t)); ++ ++ LTO_SET_PREVAIL (TYPE_MIN_VALUE_RAW (t)); ++ LTO_SET_PREVAIL (TYPE_MAX_VALUE_RAW (t)); ++ LTO_NO_PREVAIL (TYPE_LANG_SLOT_1 (t)); ++ ++ LTO_SET_PREVAIL (TYPE_CONTEXT (t)); ++ ++ LTO_NO_PREVAIL (TYPE_CANONICAL (t)); ++ LTO_NO_PREVAIL (TYPE_MAIN_VARIANT (t)); ++ LTO_NO_PREVAIL (TYPE_NEXT_VARIANT (t)); ++ } ++ else if (EXPR_P (t)) ++ { ++ int i; ++ for (i = TREE_OPERAND_LENGTH (t) - 1; i >= 0; --i) ++ LTO_SET_PREVAIL (TREE_OPERAND (t, i)); ++ } ++ else if (TREE_CODE (t) == CONSTRUCTOR) ++ { ++ unsigned i; ++ tree val; ++ FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (t), i, val) ++ LTO_SET_PREVAIL (val); ++ } ++ else ++ { ++ switch (code) ++ { ++ case TREE_LIST: ++ LTO_SET_PREVAIL (TREE_VALUE (t)); ++ LTO_SET_PREVAIL (TREE_PURPOSE (t)); ++ LTO_NO_PREVAIL (TREE_PURPOSE (t)); ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ } ++ /* If we fixed nothing, then we missed something seen by ++ mentions_vars_p. */ ++ gcc_checking_assert (fixed); ++} ++#undef LTO_SET_PREVAIL ++#undef LTO_NO_PREVAIL ++ ++/* Helper function of lto_fixup_decls. Walks the var and fn streams in STATE, ++ replaces var and function decls with the corresponding prevailing def. */ ++ ++static void ++lto_fixup_state (struct lto_in_decl_state *state) ++{ ++ unsigned i, si; ++ ++ /* Although we only want to replace FUNCTION_DECLs and VAR_DECLs, ++ we still need to walk from all DECLs to find the reachable ++ FUNCTION_DECLs and VAR_DECLs. */ ++ for (si = 0; si < LTO_N_DECL_STREAMS; si++) ++ { ++ vec *trees = state->streams[si]; ++ for (i = 0; i < vec_safe_length (trees); i++) ++ { ++ tree t = (*trees)[i]; ++ if (flag_checking && TYPE_P (t)) ++ verify_type (t); ++ if (VAR_OR_FUNCTION_DECL_P (t) ++ && (TREE_PUBLIC (t) || DECL_EXTERNAL (t))) ++ (*trees)[i] = lto_symtab_prevailing_decl (t); ++ } ++ } ++} ++ ++/* Fix the decls from all FILES. Replaces each decl with the corresponding ++ prevailing one. 
*/ ++ ++static void ++lto_fixup_decls (struct lto_file_decl_data **files) ++{ ++ unsigned int i; ++ tree t; ++ ++ if (tree_with_vars) ++ FOR_EACH_VEC_ELT ((*tree_with_vars), i, t) ++ lto_fixup_prevailing_decls (t); ++ ++ for (i = 0; files[i]; i++) ++ { ++ struct lto_file_decl_data *file = files[i]; ++ struct lto_in_decl_state *state = file->global_decl_state; ++ lto_fixup_state (state); ++ ++ hash_table::iterator iter; ++ lto_in_decl_state *elt; ++ FOR_EACH_HASH_TABLE_ELEMENT (*file->function_decl_states, elt, ++ lto_in_decl_state *, iter) ++ lto_fixup_state (elt); ++ } ++} ++ ++static GTY((length ("lto_stats.num_input_files + 1"))) struct lto_file_decl_data **all_file_decl_data; ++ ++/* Turn file datas for sub files into a single array, so that they look ++ like separate files for further passes. */ ++ ++static void ++lto_flatten_files (struct lto_file_decl_data **orig, int count, int last_file_ix) ++{ ++ struct lto_file_decl_data *n, *next; ++ int i, k; ++ ++ lto_stats.num_input_files = count; ++ all_file_decl_data ++ = ggc_cleared_vec_alloc (count + 1); ++ /* Set the hooks so that all of the ipa passes can read in their data. */ ++ lto_set_in_hooks (all_file_decl_data, get_section_data, free_section_data); ++ for (i = 0, k = 0; i < last_file_ix; i++) ++ { ++ for (n = orig[i]; n != NULL; n = next) ++ { ++ all_file_decl_data[k++] = n; ++ next = n->next; ++ n->next = NULL; ++ } ++ } ++ all_file_decl_data[k] = NULL; ++ gcc_assert (k == count); ++} ++ ++/* Input file data before flattening (i.e. splitting them to subfiles to support ++ incremental linking. */ ++static int real_file_count; ++static GTY((length ("real_file_count + 1"))) struct lto_file_decl_data **real_file_decl_data; ++ ++/* Read all the symbols from the input files FNAMES. NFILES is the ++ number of files requested in the command line. Instantiate a ++ global call graph by aggregating all the sub-graphs found in each ++ file. */ ++ ++void ++read_cgraph_and_symbols (unsigned nfiles, const char **fnames) ++{ ++ unsigned int i, last_file_ix; ++ FILE *resolution; ++ int count = 0; ++ struct lto_file_decl_data **decl_data; ++ symtab_node *snode; ++ ++ symtab->initialize (); ++ ++ timevar_push (TV_IPA_LTO_DECL_IN); ++ ++#ifdef ACCEL_COMPILER ++ section_name_prefix = OFFLOAD_SECTION_NAME_PREFIX; ++ lto_stream_offload_p = true; ++#endif ++ ++ real_file_decl_data ++ = decl_data = ggc_cleared_vec_alloc (nfiles + 1); ++ real_file_count = nfiles; ++ ++ /* Read the resolution file. */ ++ resolution = NULL; ++ if (resolution_file_name) ++ { ++ int t; ++ unsigned num_objects; ++ ++ resolution = fopen (resolution_file_name, "r"); ++ if (resolution == NULL) ++ fatal_error (input_location, ++ "could not open symbol resolution file: %m"); ++ ++ t = fscanf (resolution, "%u", &num_objects); ++ gcc_assert (t == 1); ++ ++ /* True, since the plugin splits the archives. */ ++ gcc_assert (num_objects == nfiles); ++ } ++ symtab->state = LTO_STREAMING; ++ ++ canonical_type_hash_cache = new hash_map (251); ++ gimple_canonical_types = htab_create (16381, gimple_canonical_type_hash, ++ gimple_canonical_type_eq, NULL); ++ gcc_obstack_init (&tree_scc_hash_obstack); ++ tree_scc_hash = new hash_table (4096); ++ ++ /* Register the common node types with the canonical type machinery so ++ we properly share alias-sets across languages and TUs. Do not ++ expose the common nodes as type merge target - those that should be ++ are already exposed so by pre-loading the LTO streamer caches. ++ Do two passes - first clear TYPE_CANONICAL and then re-compute it. 
*/ ++ for (i = 0; i < itk_none; ++i) ++ lto_register_canonical_types (integer_types[i], true); ++ for (i = 0; i < stk_type_kind_last; ++i) ++ lto_register_canonical_types (sizetype_tab[i], true); ++ for (i = 0; i < TI_MAX; ++i) ++ lto_register_canonical_types (global_trees[i], true); ++ for (i = 0; i < itk_none; ++i) ++ lto_register_canonical_types (integer_types[i], false); ++ for (i = 0; i < stk_type_kind_last; ++i) ++ lto_register_canonical_types (sizetype_tab[i], false); ++ for (i = 0; i < TI_MAX; ++i) ++ lto_register_canonical_types (global_trees[i], false); ++ ++ if (!quiet_flag) ++ fprintf (stderr, "Reading object files:"); ++ ++ /* Read all of the object files specified on the command line. */ ++ for (i = 0, last_file_ix = 0; i < nfiles; ++i) ++ { ++ struct lto_file_decl_data *file_data = NULL; ++ if (!quiet_flag) ++ { ++ fprintf (stderr, " %s", fnames[i]); ++ fflush (stderr); ++ } ++ ++ current_lto_file = lto_obj_file_open (fnames[i], false); ++ if (!current_lto_file) ++ break; ++ ++ file_data = lto_file_read (current_lto_file, resolution, &count); ++ if (!file_data) ++ { ++ lto_obj_file_close (current_lto_file); ++ free (current_lto_file); ++ current_lto_file = NULL; ++ break; ++ } ++ ++ decl_data[last_file_ix++] = file_data; ++ ++ lto_obj_file_close (current_lto_file); ++ free (current_lto_file); ++ current_lto_file = NULL; ++ } ++ ++ lto_flatten_files (decl_data, count, last_file_ix); ++ lto_stats.num_input_files = count; ++ ggc_free(decl_data); ++ real_file_decl_data = NULL; ++ ++ if (resolution_file_name) ++ fclose (resolution); ++ ++ /* Show the LTO report before launching LTRANS. */ ++ if (flag_lto_report || (flag_wpa && flag_lto_report_wpa)) ++ print_lto_report_1 (); ++ ++ /* Free gimple type merging datastructures. */ ++ delete tree_scc_hash; ++ tree_scc_hash = NULL; ++ obstack_free (&tree_scc_hash_obstack, NULL); ++ htab_delete (gimple_canonical_types); ++ gimple_canonical_types = NULL; ++ delete canonical_type_hash_cache; ++ canonical_type_hash_cache = NULL; ++ ++ /* At this stage we know that majority of GGC memory is reachable. ++ Growing the limits prevents unnecesary invocation of GGC. */ ++ ggc_grow (); ++ ggc_collect (); ++ ++ /* Set the hooks so that all of the ipa passes can read in their data. */ ++ lto_set_in_hooks (all_file_decl_data, get_section_data, free_section_data); ++ ++ timevar_pop (TV_IPA_LTO_DECL_IN); ++ ++ if (!quiet_flag) ++ fprintf (stderr, "\nReading the callgraph\n"); ++ ++ timevar_push (TV_IPA_LTO_CGRAPH_IO); ++ /* Read the symtab. */ ++ input_symtab (); ++ ++ input_offload_tables (!flag_ltrans); ++ ++ /* Store resolutions into the symbol table. 
*/ ++ ++ FOR_EACH_SYMBOL (snode) ++ if (snode->externally_visible && snode->real_symbol_p () ++ && snode->lto_file_data && snode->lto_file_data->resolution_map ++ && !(TREE_CODE (snode->decl) == FUNCTION_DECL ++ && fndecl_built_in_p (snode->decl)) ++ && !(VAR_P (snode->decl) && DECL_HARD_REGISTER (snode->decl))) ++ { ++ ld_plugin_symbol_resolution_t *res; ++ ++ res = snode->lto_file_data->resolution_map->get (snode->decl); ++ if (!res || *res == LDPR_UNKNOWN) ++ { ++ if (snode->output_to_lto_symbol_table_p ()) ++ fatal_error (input_location, "missing resolution data for %s", ++ IDENTIFIER_POINTER ++ (DECL_ASSEMBLER_NAME (snode->decl))); ++ } ++ else ++ snode->resolution = *res; ++ } ++ for (i = 0; all_file_decl_data[i]; i++) ++ if (all_file_decl_data[i]->resolution_map) ++ { ++ delete all_file_decl_data[i]->resolution_map; ++ all_file_decl_data[i]->resolution_map = NULL; ++ } ++ ++ timevar_pop (TV_IPA_LTO_CGRAPH_IO); ++ ++ if (!quiet_flag) ++ fprintf (stderr, "Merging declarations\n"); ++ ++ timevar_push (TV_IPA_LTO_DECL_MERGE); ++ /* Merge global decls. In ltrans mode we read merged cgraph, we do not ++ need to care about resolving symbols again, we only need to replace ++ duplicated declarations read from the callgraph and from function ++ sections. */ ++ if (!flag_ltrans) ++ { ++ lto_symtab_merge_decls (); ++ ++ /* If there were errors during symbol merging bail out, we have no ++ good way to recover here. */ ++ if (seen_error ()) ++ fatal_error (input_location, ++ "errors during merging of translation units"); ++ ++ /* Fixup all decls. */ ++ lto_fixup_decls (all_file_decl_data); ++ } ++ if (tree_with_vars) ++ ggc_free (tree_with_vars); ++ tree_with_vars = NULL; ++ ggc_collect (); ++ ++ timevar_pop (TV_IPA_LTO_DECL_MERGE); ++ /* Each pass will set the appropriate timer. */ ++ ++ if (!quiet_flag) ++ fprintf (stderr, "Reading summaries\n"); ++ ++ /* Read the IPA summary data. */ ++ if (flag_ltrans) ++ ipa_read_optimization_summaries (); ++ else ++ ipa_read_summaries (); ++ ++ for (i = 0; all_file_decl_data[i]; i++) ++ { ++ gcc_assert (all_file_decl_data[i]->symtab_node_encoder); ++ lto_symtab_encoder_delete (all_file_decl_data[i]->symtab_node_encoder); ++ all_file_decl_data[i]->symtab_node_encoder = NULL; ++ lto_free_function_in_decl_state (all_file_decl_data[i]->global_decl_state); ++ all_file_decl_data[i]->global_decl_state = NULL; ++ all_file_decl_data[i]->current_decl_state = NULL; ++ } ++ ++ if (!flag_ltrans) ++ { ++ /* Finally merge the cgraph according to the decl merging decisions. */ ++ timevar_push (TV_IPA_LTO_CGRAPH_MERGE); ++ ++ gcc_assert (!dump_file); ++ dump_file = dump_begin (lto_link_dump_id, NULL); ++ ++ if (dump_file) ++ { ++ fprintf (dump_file, "Before merging:\n"); ++ symtab->dump (dump_file); ++ } ++ lto_symtab_merge_symbols (); ++ /* Removal of unreachable symbols is needed to make verify_symtab to pass; ++ we are still having duplicated comdat groups containing local statics. ++ We could also just remove them while merging. */ ++ symtab->remove_unreachable_nodes (dump_file); ++ ggc_collect (); ++ ++ if (dump_file) ++ dump_end (lto_link_dump_id, dump_file); ++ dump_file = NULL; ++ timevar_pop (TV_IPA_LTO_CGRAPH_MERGE); ++ } ++ symtab->state = IPA_SSA; ++ /* All node removals happening here are useless, because ++ WPA should not stream them. 
Still always perform remove_unreachable_nodes ++ because we may reshape clone tree, get rid of dead masters of inline ++ clones and remove symbol entries for read-only variables we keep around ++ only to be able to constant fold them. */ ++ if (flag_ltrans) ++ { ++ if (symtab->dump_file) ++ symtab->dump (symtab->dump_file); ++ symtab->remove_unreachable_nodes (symtab->dump_file); ++ } ++ ++ /* Indicate that the cgraph is built and ready. */ ++ symtab->function_flags_ready = true; ++ ++ ggc_free (all_file_decl_data); ++ all_file_decl_data = NULL; ++} ++ ++ ++ ++/* Show various memory usage statistics related to LTO. */ ++void ++print_lto_report_1 (void) ++{ ++ const char *pfx = (flag_lto) ? "LTO" : (flag_wpa) ? "WPA" : "LTRANS"; ++ fprintf (stderr, "%s statistics\n", pfx); ++ ++ fprintf (stderr, "[%s] read %lu SCCs of average size %f\n", ++ pfx, num_sccs_read, total_scc_size / (double)num_sccs_read); ++ fprintf (stderr, "[%s] %lu tree bodies read in total\n", pfx, total_scc_size); ++ if (flag_wpa && tree_scc_hash) ++ { ++ fprintf (stderr, "[%s] tree SCC table: size %ld, %ld elements, " ++ "collision ratio: %f\n", pfx, ++ (long) tree_scc_hash->size (), ++ (long) tree_scc_hash->elements (), ++ tree_scc_hash->collisions ()); ++ hash_table::iterator hiter; ++ tree_scc *scc, *max_scc = NULL; ++ unsigned max_length = 0; ++ FOR_EACH_HASH_TABLE_ELEMENT (*tree_scc_hash, scc, x, hiter) ++ { ++ unsigned length = 0; ++ tree_scc *s = scc; ++ for (; s; s = s->next) ++ length++; ++ if (length > max_length) ++ { ++ max_length = length; ++ max_scc = scc; ++ } ++ } ++ fprintf (stderr, "[%s] tree SCC max chain length %u (size %u)\n", ++ pfx, max_length, max_scc->len); ++ fprintf (stderr, "[%s] Compared %lu SCCs, %lu collisions (%f)\n", pfx, ++ num_scc_compares, num_scc_compare_collisions, ++ num_scc_compare_collisions / (double) num_scc_compares); ++ fprintf (stderr, "[%s] Merged %lu SCCs\n", pfx, num_sccs_merged); ++ fprintf (stderr, "[%s] Merged %lu tree bodies\n", pfx, ++ total_scc_size_merged); ++ fprintf (stderr, "[%s] Merged %lu types\n", pfx, num_merged_types); ++ fprintf (stderr, "[%s] %lu types prevailed (%lu associated trees)\n", ++ pfx, num_prevailing_types, num_type_scc_trees); ++ fprintf (stderr, "[%s] GIMPLE canonical type table: size %ld, " ++ "%ld elements, %ld searches, %ld collisions (ratio: %f)\n", pfx, ++ (long) htab_size (gimple_canonical_types), ++ (long) htab_elements (gimple_canonical_types), ++ (long) gimple_canonical_types->searches, ++ (long) gimple_canonical_types->collisions, ++ htab_collisions (gimple_canonical_types)); ++ fprintf (stderr, "[%s] GIMPLE canonical type pointer-map: " ++ "%lu elements, %ld searches\n", pfx, ++ num_canonical_type_hash_entries, ++ num_canonical_type_hash_queries); ++ } ++ ++ print_lto_report (pfx); ++} ++ ++GTY(()) tree lto_eh_personality_decl; ++ ++/* Return the LTO personality function decl. */ ++ ++tree ++lto_eh_personality (void) ++{ ++ if (!lto_eh_personality_decl) ++ { ++ /* Use the first personality DECL for our personality if we don't ++ support multiple ones. This ensures that we don't artificially ++ create the need for them in a single-language program. */ ++ if (first_personality_decl && !dwarf2out_do_cfi_asm ()) ++ lto_eh_personality_decl = first_personality_decl; ++ else ++ lto_eh_personality_decl = lhd_gcc_personality (); ++ } ++ ++ return lto_eh_personality_decl; ++} ++ ++/* Set the process name based on the LTO mode. 
*/
++
++static void
++lto_process_name (void)
++{
++  if (flag_lto)
++    setproctitle (flag_incremental_link == INCREMENTAL_LINK_LTO
++                  ? "lto1-inclink" : "lto1-lto");
++  if (flag_wpa)
++    setproctitle ("lto1-wpa");
++  if (flag_ltrans)
++    setproctitle ("lto1-ltrans");
++}
++
++
++/* Initialize the LTO front end. */
++
++void
++lto_fe_init (void)
++{
++  lto_process_name ();
++  lto_streamer_hooks_init ();
++  lto_reader_init ();
++  lto_set_in_hooks (NULL, get_section_data, free_section_data);
++  memset (&lto_stats, 0, sizeof (lto_stats));
++  bitmap_obstack_initialize (NULL);
++  gimple_register_cfg_hooks ();
++#ifndef ACCEL_COMPILER
++  unsigned char *table
++    = ggc_vec_alloc<unsigned char> (MAX_MACHINE_MODE);
++  for (int m = 0; m < MAX_MACHINE_MODE; m++)
++    table[m] = m;
++  lto_mode_identity_table = table;
++#endif
++}
++
++#include "gt-lto-lto-common.h"
+diff --git a/gcc/lto/lto-common.h b/gcc/lto/lto-common.h
+new file mode 100644
+index 000000000..b1209a3a3
+--- /dev/null
++++ b/gcc/lto/lto-common.h
+@@ -0,0 +1,33 @@
++/* LTO common functions between lto.c and lto-dump.c header file.
++   Copyright (C) 2018 Free Software Foundation, Inc.
++
++This file is part of GCC.
++
++GCC is free software; you can redistribute it and/or modify it under
++the terms of the GNU General Public License as published by the Free
++Software Foundation; either version 3, or (at your option) any later
++version.
++
++GCC is distributed in the hope that it will be useful, but WITHOUT ANY
++WARRANTY; without even the implied warranty of MERCHANTABILITY or
++FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
++for more details.
++
++You should have received a copy of the GNU General Public License
++along with GCC; see the file COPYING3. If not see
++<http://www.gnu.org/licenses/>. */
++
++#ifndef LTO_COMMON_H
++#define LTO_COMMON_H
++
++void lto_fe_init (void);
++void read_cgraph_and_symbols (unsigned, const char **);
++void print_lto_report_1 (void);
++
++extern tree lto_eh_personality_decl;
++extern GTY(()) vec<tree, va_gc> *tree_with_vars;
++extern const unsigned char *lto_mode_identity_table;
++extern tree first_personality_decl;
++
++#endif
++
+diff --git a/gcc/lto/lto-lang.c b/gcc/lto/lto-lang.c
+index 4ef228fcb..1d35db11e 100644
+--- a/gcc/lto/lto-lang.c
++++ b/gcc/lto/lto-lang.c
+@@ -34,6 +34,7 @@ along with GCC; see the file COPYING3. If not see
+ #include "debug.h"
+ #include "lto-tree.h"
+ #include "lto.h"
++#include "lto-common.h"
+ #include "stringpool.h"
+ #include "attribs.h"
+
+diff --git a/gcc/lto/lto-symtab.c b/gcc/lto/lto-symtab.c
+index 63a633302..2fd5b1e8f 100644
+--- a/gcc/lto/lto-symtab.c
++++ b/gcc/lto/lto-symtab.c
+@@ -556,7 +556,8 @@ lto_symtab_merge_p (tree prevailing, tree decl)
+     }
+   if (fndecl_built_in_p (prevailing)
+       && (DECL_BUILT_IN_CLASS (prevailing) != DECL_BUILT_IN_CLASS (decl)
+-          || DECL_FUNCTION_CODE (prevailing) != DECL_FUNCTION_CODE (decl)))
++          || (DECL_UNCHECKED_FUNCTION_CODE (prevailing)
++              != DECL_UNCHECKED_FUNCTION_CODE (decl))))
+     {
+       if (dump_file)
+         fprintf (dump_file, "Not merging decls; "
+diff --git a/gcc/lto/lto.c b/gcc/lto/lto.c
+index 4db156fdf..c44e034a2 100644
+--- a/gcc/lto/lto.c
++++ b/gcc/lto/lto.c
+@@ -38,7 +38,6 @@ along with GCC; see the file COPYING3. If not see
+ #include "symbol-summary.h"
+ #include "tree-vrp.h"
+ #include "ipa-prop.h"
+-#include "common.h"
+ #include "debug.h"
+ #include "lto.h"
+ #include "lto-section-names.h"
+@@ -55,122 +54,12 @@ along with GCC; see the file COPYING3.
If not see + #include "fold-const.h" + #include "attribs.h" + #include "builtins.h" ++#include "lto-common.h" + + + /* Number of parallel tasks to run, -1 if we want to use GNU Make jobserver. */ + static int lto_parallelism; + +-static GTY(()) tree first_personality_decl; +- +-static GTY(()) const unsigned char *lto_mode_identity_table; +- +-/* Returns a hash code for P. */ +- +-static hashval_t +-hash_name (const void *p) +-{ +- const struct lto_section_slot *ds = (const struct lto_section_slot *) p; +- return (hashval_t) htab_hash_string (ds->name); +-} +- +- +-/* Returns nonzero if P1 and P2 are equal. */ +- +-static int +-eq_name (const void *p1, const void *p2) +-{ +- const struct lto_section_slot *s1 = +- (const struct lto_section_slot *) p1; +- const struct lto_section_slot *s2 = +- (const struct lto_section_slot *) p2; +- +- return strcmp (s1->name, s2->name) == 0; +-} +- +-/* Free lto_section_slot */ +- +-static void +-free_with_string (void *arg) +-{ +- struct lto_section_slot *s = (struct lto_section_slot *)arg; +- +- free (CONST_CAST (char *, s->name)); +- free (arg); +-} +- +-/* Create section hash table */ +- +-htab_t +-lto_obj_create_section_hash_table (void) +-{ +- return htab_create (37, hash_name, eq_name, free_with_string); +-} +- +-/* Delete an allocated integer KEY in the splay tree. */ +- +-static void +-lto_splay_tree_delete_id (splay_tree_key key) +-{ +- free ((void *) key); +-} +- +-/* Compare splay tree node ids A and B. */ +- +-static int +-lto_splay_tree_compare_ids (splay_tree_key a, splay_tree_key b) +-{ +- unsigned HOST_WIDE_INT ai; +- unsigned HOST_WIDE_INT bi; +- +- ai = *(unsigned HOST_WIDE_INT *) a; +- bi = *(unsigned HOST_WIDE_INT *) b; +- +- if (ai < bi) +- return -1; +- else if (ai > bi) +- return 1; +- return 0; +-} +- +-/* Look up splay tree node by ID in splay tree T. */ +- +-static splay_tree_node +-lto_splay_tree_lookup (splay_tree t, unsigned HOST_WIDE_INT id) +-{ +- return splay_tree_lookup (t, (splay_tree_key) &id); +-} +- +-/* Check if KEY has ID. */ +- +-static bool +-lto_splay_tree_id_equal_p (splay_tree_key key, unsigned HOST_WIDE_INT id) +-{ +- return *(unsigned HOST_WIDE_INT *) key == id; +-} +- +-/* Insert a splay tree node into tree T with ID as key and FILE_DATA as value. +- The ID is allocated separately because we need HOST_WIDE_INTs which may +- be wider than a splay_tree_key. */ +- +-static void +-lto_splay_tree_insert (splay_tree t, unsigned HOST_WIDE_INT id, +- struct lto_file_decl_data *file_data) +-{ +- unsigned HOST_WIDE_INT *idp = XCNEW (unsigned HOST_WIDE_INT); +- *idp = id; +- splay_tree_insert (t, (splay_tree_key) idp, (splay_tree_value) file_data); +-} +- +-/* Create a splay tree. */ +- +-static splay_tree +-lto_splay_tree_new (void) +-{ +- return splay_tree_new (lto_splay_tree_compare_ids, +- lto_splay_tree_delete_id, +- NULL); +-} +- + /* Return true when NODE has a clone that is analyzed (i.e. we need + to load its body even if the node itself is not needed). */ + +@@ -224,2083 +113,45 @@ lto_materialize_function (struct cgraph_node *node) + rest_of_decl_compilation (decl, 1, 0); + } + +- +-/* Decode the content of memory pointed to by DATA in the in decl +- state object STATE. DATA_IN points to a data_in structure for +- decoding. Return the address after the decoded object in the +- input. 
*/ +- +-static const uint32_t * +-lto_read_in_decl_state (struct data_in *data_in, const uint32_t *data, +- struct lto_in_decl_state *state) +-{ +- uint32_t ix; +- tree decl; +- uint32_t i, j; +- +- ix = *data++; +- state->compressed = ix & 1; +- ix /= 2; +- decl = streamer_tree_cache_get_tree (data_in->reader_cache, ix); +- if (!VAR_OR_FUNCTION_DECL_P (decl)) +- { +- gcc_assert (decl == void_type_node); +- decl = NULL_TREE; +- } +- state->fn_decl = decl; +- +- for (i = 0; i < LTO_N_DECL_STREAMS; i++) +- { +- uint32_t size = *data++; +- vec *decls = NULL; +- vec_alloc (decls, size); +- +- for (j = 0; j < size; j++) +- vec_safe_push (decls, +- streamer_tree_cache_get_tree (data_in->reader_cache, +- data[j])); +- +- state->streams[i] = decls; +- data += size; +- } +- +- return data; +-} +- +- +-/* Global canonical type table. */ +-static htab_t gimple_canonical_types; +-static hash_map *canonical_type_hash_cache; +-static unsigned long num_canonical_type_hash_entries; +-static unsigned long num_canonical_type_hash_queries; +- +-static void iterative_hash_canonical_type (tree type, inchash::hash &hstate); +-static hashval_t gimple_canonical_type_hash (const void *p); +-static void gimple_register_canonical_type_1 (tree t, hashval_t hash); +- +-/* Returning a hash value for gimple type TYPE. +- +- The hash value returned is equal for types considered compatible +- by gimple_canonical_types_compatible_p. */ +- +-static hashval_t +-hash_canonical_type (tree type) +-{ +- inchash::hash hstate; +- enum tree_code code; +- +- /* We compute alias sets only for types that needs them. +- Be sure we do not recurse to something else as we cannot hash incomplete +- types in a way they would have same hash value as compatible complete +- types. */ +- gcc_checking_assert (type_with_alias_set_p (type)); +- +- /* Combine a few common features of types so that types are grouped into +- smaller sets; when searching for existing matching types to merge, +- only existing types having the same features as the new type will be +- checked. */ +- code = tree_code_for_canonical_type_merging (TREE_CODE (type)); +- hstate.add_int (code); +- hstate.add_int (TYPE_MODE (type)); +- +- /* Incorporate common features of numerical types. */ +- if (INTEGRAL_TYPE_P (type) +- || SCALAR_FLOAT_TYPE_P (type) +- || FIXED_POINT_TYPE_P (type) +- || TREE_CODE (type) == OFFSET_TYPE +- || POINTER_TYPE_P (type)) +- { +- hstate.add_int (TYPE_PRECISION (type)); +- if (!type_with_interoperable_signedness (type)) +- hstate.add_int (TYPE_UNSIGNED (type)); +- } +- +- if (VECTOR_TYPE_P (type)) +- { +- hstate.add_poly_int (TYPE_VECTOR_SUBPARTS (type)); +- hstate.add_int (TYPE_UNSIGNED (type)); +- } +- +- if (TREE_CODE (type) == COMPLEX_TYPE) +- hstate.add_int (TYPE_UNSIGNED (type)); +- +- /* Fortran's C_SIGNED_CHAR is !TYPE_STRING_FLAG but needs to be +- interoperable with "signed char". Unless all frontends are revisited to +- agree on these types, we must ignore the flag completely. */ +- +- /* Fortran standard define C_PTR type that is compatible with every +- C pointer. For this reason we need to glob all pointers into one. +- Still pointers in different address spaces are not compatible. */ +- if (POINTER_TYPE_P (type)) +- hstate.add_int (TYPE_ADDR_SPACE (TREE_TYPE (type))); +- +- /* For array types hash the domain bounds and the string flag. 
*/ +- if (TREE_CODE (type) == ARRAY_TYPE && TYPE_DOMAIN (type)) +- { +- hstate.add_int (TYPE_STRING_FLAG (type)); +- /* OMP lowering can introduce error_mark_node in place of +- random local decls in types. */ +- if (TYPE_MIN_VALUE (TYPE_DOMAIN (type)) != error_mark_node) +- inchash::add_expr (TYPE_MIN_VALUE (TYPE_DOMAIN (type)), hstate); +- if (TYPE_MAX_VALUE (TYPE_DOMAIN (type)) != error_mark_node) +- inchash::add_expr (TYPE_MAX_VALUE (TYPE_DOMAIN (type)), hstate); +- } +- +- /* Recurse for aggregates with a single element type. */ +- if (TREE_CODE (type) == ARRAY_TYPE +- || TREE_CODE (type) == COMPLEX_TYPE +- || TREE_CODE (type) == VECTOR_TYPE) +- iterative_hash_canonical_type (TREE_TYPE (type), hstate); +- +- /* Incorporate function return and argument types. */ +- if (TREE_CODE (type) == FUNCTION_TYPE || TREE_CODE (type) == METHOD_TYPE) +- { +- unsigned na; +- tree p; +- +- iterative_hash_canonical_type (TREE_TYPE (type), hstate); +- +- for (p = TYPE_ARG_TYPES (type), na = 0; p; p = TREE_CHAIN (p)) +- { +- iterative_hash_canonical_type (TREE_VALUE (p), hstate); +- na++; +- } +- +- hstate.add_int (na); +- } +- +- if (RECORD_OR_UNION_TYPE_P (type)) +- { +- unsigned nf; +- tree f; +- +- for (f = TYPE_FIELDS (type), nf = 0; f; f = TREE_CHAIN (f)) +- if (TREE_CODE (f) == FIELD_DECL +- && (! DECL_SIZE (f) +- || ! integer_zerop (DECL_SIZE (f)))) +- { +- iterative_hash_canonical_type (TREE_TYPE (f), hstate); +- nf++; +- } +- +- hstate.add_int (nf); +- } +- +- return hstate.end(); +-} +- +-/* Returning a hash value for gimple type TYPE combined with VAL. */ +- +-static void +-iterative_hash_canonical_type (tree type, inchash::hash &hstate) +-{ +- hashval_t v; +- +- /* All type variants have same TYPE_CANONICAL. */ +- type = TYPE_MAIN_VARIANT (type); +- +- if (!canonical_type_used_p (type)) +- v = hash_canonical_type (type); +- /* An already processed type. */ +- else if (TYPE_CANONICAL (type)) +- { +- type = TYPE_CANONICAL (type); +- v = gimple_canonical_type_hash (type); +- } +- else +- { +- /* Canonical types should not be able to form SCCs by design, this +- recursion is just because we do not register canonical types in +- optimal order. To avoid quadratic behavior also register the +- type here. */ +- v = hash_canonical_type (type); +- gimple_register_canonical_type_1 (type, v); +- } +- hstate.add_int (v); +-} +- +-/* Returns the hash for a canonical type P. */ +- +-static hashval_t +-gimple_canonical_type_hash (const void *p) +-{ +- num_canonical_type_hash_queries++; +- hashval_t *slot = canonical_type_hash_cache->get ((const_tree) p); +- gcc_assert (slot != NULL); +- return *slot; +-} +- +- +- +-/* Returns nonzero if P1 and P2 are equal. */ +- +-static int +-gimple_canonical_type_eq (const void *p1, const void *p2) +-{ +- const_tree t1 = (const_tree) p1; +- const_tree t2 = (const_tree) p2; +- return gimple_canonical_types_compatible_p (CONST_CAST_TREE (t1), +- CONST_CAST_TREE (t2)); +-} +- +-/* Main worker for gimple_register_canonical_type. */ +- +-static void +-gimple_register_canonical_type_1 (tree t, hashval_t hash) +-{ +- void **slot; +- +- gcc_checking_assert (TYPE_P (t) && !TYPE_CANONICAL (t) +- && type_with_alias_set_p (t) +- && canonical_type_used_p (t)); +- +- slot = htab_find_slot_with_hash (gimple_canonical_types, t, hash, INSERT); +- if (*slot) +- { +- tree new_type = (tree)(*slot); +- gcc_checking_assert (new_type != t); +- TYPE_CANONICAL (t) = new_type; +- } +- else +- { +- TYPE_CANONICAL (t) = t; +- *slot = (void *) t; +- /* Cache the just computed hash value. 
*/ +- num_canonical_type_hash_entries++; +- bool existed_p = canonical_type_hash_cache->put (t, hash); +- gcc_assert (!existed_p); +- } +-} +- +-/* Register type T in the global type table gimple_types and set +- TYPE_CANONICAL of T accordingly. +- This is used by LTO to merge structurally equivalent types for +- type-based aliasing purposes across different TUs and languages. +- +- ??? This merging does not exactly match how the tree.c middle-end +- functions will assign TYPE_CANONICAL when new types are created +- during optimization (which at least happens for pointer and array +- types). */ +- +-static void +-gimple_register_canonical_type (tree t) +-{ +- if (TYPE_CANONICAL (t) || !type_with_alias_set_p (t) +- || !canonical_type_used_p (t)) +- return; +- +- /* Canonical types are same among all complete variants. */ +- if (TYPE_CANONICAL (TYPE_MAIN_VARIANT (t))) +- TYPE_CANONICAL (t) = TYPE_CANONICAL (TYPE_MAIN_VARIANT (t)); +- else +- { +- gimple_register_canonical_type_1 (TYPE_MAIN_VARIANT (t), +- hash_canonical_type (TYPE_MAIN_VARIANT (t))); +- TYPE_CANONICAL (t) = TYPE_CANONICAL (TYPE_MAIN_VARIANT (t)); +- } +-} +- +-/* Re-compute TYPE_CANONICAL for NODE and related types. */ ++/* Materialize all the bodies for all the nodes in the callgraph. */ + + static void +-lto_register_canonical_types (tree node, bool first_p) +-{ +- if (!node +- || !TYPE_P (node)) +- return; +- +- if (first_p) +- TYPE_CANONICAL (node) = NULL_TREE; +- +- if (POINTER_TYPE_P (node) +- || TREE_CODE (node) == COMPLEX_TYPE +- || TREE_CODE (node) == ARRAY_TYPE) +- lto_register_canonical_types (TREE_TYPE (node), first_p); +- +- if (!first_p) +- gimple_register_canonical_type (node); +-} +- +- +-/* Remember trees that contains references to declarations. */ +-static GTY(()) vec *tree_with_vars; +- +-#define CHECK_VAR(tt) \ +- do \ +- { \ +- if ((tt) && VAR_OR_FUNCTION_DECL_P (tt) \ +- && (TREE_PUBLIC (tt) || DECL_EXTERNAL (tt))) \ +- return true; \ +- } while (0) +- +-#define CHECK_NO_VAR(tt) \ +- gcc_checking_assert (!(tt) || !VAR_OR_FUNCTION_DECL_P (tt)) +- +-/* Check presence of pointers to decls in fields of a tree_typed T. */ +- +-static inline bool +-mentions_vars_p_typed (tree t) +-{ +- CHECK_NO_VAR (TREE_TYPE (t)); +- return false; +-} +- +-/* Check presence of pointers to decls in fields of a tree_common T. */ +- +-static inline bool +-mentions_vars_p_common (tree t) +-{ +- if (mentions_vars_p_typed (t)) +- return true; +- CHECK_NO_VAR (TREE_CHAIN (t)); +- return false; +-} +- +-/* Check presence of pointers to decls in fields of a decl_minimal T. */ +- +-static inline bool +-mentions_vars_p_decl_minimal (tree t) +-{ +- if (mentions_vars_p_common (t)) +- return true; +- CHECK_NO_VAR (DECL_NAME (t)); +- CHECK_VAR (DECL_CONTEXT (t)); +- return false; +-} +- +-/* Check presence of pointers to decls in fields of a decl_common T. */ +- +-static inline bool +-mentions_vars_p_decl_common (tree t) +-{ +- if (mentions_vars_p_decl_minimal (t)) +- return true; +- CHECK_VAR (DECL_SIZE (t)); +- CHECK_VAR (DECL_SIZE_UNIT (t)); +- CHECK_VAR (DECL_INITIAL (t)); +- CHECK_NO_VAR (DECL_ATTRIBUTES (t)); +- CHECK_VAR (DECL_ABSTRACT_ORIGIN (t)); +- return false; +-} +- +-/* Check presence of pointers to decls in fields of a decl_with_vis T. */ +- +-static inline bool +-mentions_vars_p_decl_with_vis (tree t) +-{ +- if (mentions_vars_p_decl_common (t)) +- return true; +- +- /* Accessor macro has side-effects, use field-name here. 
*/ +- CHECK_NO_VAR (DECL_ASSEMBLER_NAME_RAW (t)); +- return false; +-} +- +-/* Check presence of pointers to decls in fields of a decl_non_common T. */ +- +-static inline bool +-mentions_vars_p_decl_non_common (tree t) +-{ +- if (mentions_vars_p_decl_with_vis (t)) +- return true; +- CHECK_NO_VAR (DECL_RESULT_FLD (t)); +- return false; +-} +- +-/* Check presence of pointers to decls in fields of a decl_non_common T. */ +- +-static bool +-mentions_vars_p_function (tree t) +-{ +- if (mentions_vars_p_decl_non_common (t)) +- return true; +- CHECK_NO_VAR (DECL_ARGUMENTS (t)); +- CHECK_NO_VAR (DECL_VINDEX (t)); +- CHECK_VAR (DECL_FUNCTION_PERSONALITY (t)); +- return false; +-} +- +-/* Check presence of pointers to decls in fields of a field_decl T. */ +- +-static bool +-mentions_vars_p_field_decl (tree t) +-{ +- if (mentions_vars_p_decl_common (t)) +- return true; +- CHECK_VAR (DECL_FIELD_OFFSET (t)); +- CHECK_NO_VAR (DECL_BIT_FIELD_TYPE (t)); +- CHECK_NO_VAR (DECL_QUALIFIER (t)); +- CHECK_NO_VAR (DECL_FIELD_BIT_OFFSET (t)); +- CHECK_NO_VAR (DECL_FCONTEXT (t)); +- return false; +-} +- +-/* Check presence of pointers to decls in fields of a type T. */ +- +-static bool +-mentions_vars_p_type (tree t) +-{ +- if (mentions_vars_p_common (t)) +- return true; +- CHECK_NO_VAR (TYPE_CACHED_VALUES (t)); +- CHECK_VAR (TYPE_SIZE (t)); +- CHECK_VAR (TYPE_SIZE_UNIT (t)); +- CHECK_NO_VAR (TYPE_ATTRIBUTES (t)); +- CHECK_NO_VAR (TYPE_NAME (t)); +- +- CHECK_VAR (TYPE_MIN_VALUE_RAW (t)); +- CHECK_VAR (TYPE_MAX_VALUE_RAW (t)); +- +- /* Accessor is for derived node types only. */ +- CHECK_NO_VAR (TYPE_LANG_SLOT_1 (t)); +- +- CHECK_VAR (TYPE_CONTEXT (t)); +- CHECK_NO_VAR (TYPE_CANONICAL (t)); +- CHECK_NO_VAR (TYPE_MAIN_VARIANT (t)); +- CHECK_NO_VAR (TYPE_NEXT_VARIANT (t)); +- return false; +-} +- +-/* Check presence of pointers to decls in fields of a BINFO T. */ +- +-static bool +-mentions_vars_p_binfo (tree t) +-{ +- unsigned HOST_WIDE_INT i, n; +- +- if (mentions_vars_p_common (t)) +- return true; +- CHECK_VAR (BINFO_VTABLE (t)); +- CHECK_NO_VAR (BINFO_OFFSET (t)); +- CHECK_NO_VAR (BINFO_VIRTUALS (t)); +- CHECK_NO_VAR (BINFO_VPTR_FIELD (t)); +- n = vec_safe_length (BINFO_BASE_ACCESSES (t)); +- for (i = 0; i < n; i++) +- CHECK_NO_VAR (BINFO_BASE_ACCESS (t, i)); +- /* Do not walk BINFO_INHERITANCE_CHAIN, BINFO_SUBVTT_INDEX +- and BINFO_VPTR_INDEX; these are used by C++ FE only. */ +- n = BINFO_N_BASE_BINFOS (t); +- for (i = 0; i < n; i++) +- CHECK_NO_VAR (BINFO_BASE_BINFO (t, i)); +- return false; +-} +- +-/* Check presence of pointers to decls in fields of a CONSTRUCTOR T. */ +- +-static bool +-mentions_vars_p_constructor (tree t) +-{ +- unsigned HOST_WIDE_INT idx; +- constructor_elt *ce; +- +- if (mentions_vars_p_typed (t)) +- return true; +- +- for (idx = 0; vec_safe_iterate (CONSTRUCTOR_ELTS (t), idx, &ce); idx++) +- { +- CHECK_NO_VAR (ce->index); +- CHECK_VAR (ce->value); +- } +- return false; +-} +- +-/* Check presence of pointers to decls in fields of an expression tree T. */ +- +-static bool +-mentions_vars_p_expr (tree t) +-{ +- int i; +- if (mentions_vars_p_typed (t)) +- return true; +- for (i = TREE_OPERAND_LENGTH (t) - 1; i >= 0; --i) +- CHECK_VAR (TREE_OPERAND (t, i)); +- return false; +-} +- +-/* Check presence of pointers to decls in fields of an OMP_CLAUSE T. 
*/ +- +-static bool +-mentions_vars_p_omp_clause (tree t) +-{ +- int i; +- if (mentions_vars_p_common (t)) +- return true; +- for (i = omp_clause_num_ops[OMP_CLAUSE_CODE (t)] - 1; i >= 0; --i) +- CHECK_VAR (OMP_CLAUSE_OPERAND (t, i)); +- return false; +-} +- +-/* Check presence of pointers to decls that needs later fixup in T. */ +- +-static bool +-mentions_vars_p (tree t) ++materialize_cgraph (void) + { +- switch (TREE_CODE (t)) +- { +- case IDENTIFIER_NODE: +- break; +- +- case TREE_LIST: +- CHECK_VAR (TREE_VALUE (t)); +- CHECK_VAR (TREE_PURPOSE (t)); +- CHECK_NO_VAR (TREE_CHAIN (t)); +- break; +- +- case FIELD_DECL: +- return mentions_vars_p_field_decl (t); +- +- case LABEL_DECL: +- case CONST_DECL: +- case PARM_DECL: +- case RESULT_DECL: +- case IMPORTED_DECL: +- case NAMESPACE_DECL: +- case NAMELIST_DECL: +- return mentions_vars_p_decl_common (t); +- +- case VAR_DECL: +- return mentions_vars_p_decl_with_vis (t); +- +- case TYPE_DECL: +- return mentions_vars_p_decl_non_common (t); +- +- case FUNCTION_DECL: +- return mentions_vars_p_function (t); +- +- case TREE_BINFO: +- return mentions_vars_p_binfo (t); +- +- case PLACEHOLDER_EXPR: +- return mentions_vars_p_common (t); +- +- case BLOCK: +- case TRANSLATION_UNIT_DECL: +- case OPTIMIZATION_NODE: +- case TARGET_OPTION_NODE: +- break; +- +- case CONSTRUCTOR: +- return mentions_vars_p_constructor (t); +- +- case OMP_CLAUSE: +- return mentions_vars_p_omp_clause (t); +- +- default: +- if (TYPE_P (t)) +- { +- if (mentions_vars_p_type (t)) +- return true; +- } +- else if (EXPR_P (t)) +- { +- if (mentions_vars_p_expr (t)) +- return true; +- } +- else if (CONSTANT_CLASS_P (t)) +- CHECK_NO_VAR (TREE_TYPE (t)); +- else +- gcc_unreachable (); +- } +- return false; +-} +- +- +-/* Return the resolution for the decl with index INDEX from DATA_IN. */ +- +-static enum ld_plugin_symbol_resolution +-get_resolution (struct data_in *data_in, unsigned index) +-{ +- if (data_in->globals_resolution.exists ()) +- { +- ld_plugin_symbol_resolution_t ret; +- /* We can have references to not emitted functions in +- DECL_FUNCTION_PERSONALITY at least. So we can and have +- to indeed return LDPR_UNKNOWN in some cases. */ +- if (data_in->globals_resolution.length () <= index) +- return LDPR_UNKNOWN; +- ret = data_in->globals_resolution[index]; +- return ret; +- } +- else +- /* Delay resolution finding until decl merging. */ +- return LDPR_UNKNOWN; +-} +- +-/* We need to record resolutions until symbol table is read. */ +-static void +-register_resolution (struct lto_file_decl_data *file_data, tree decl, +- enum ld_plugin_symbol_resolution resolution) +-{ +- bool existed; +- if (resolution == LDPR_UNKNOWN) +- return; +- if (!file_data->resolution_map) +- file_data->resolution_map +- = new hash_map; +- ld_plugin_symbol_resolution_t &res +- = file_data->resolution_map->get_or_insert (decl, &existed); +- if (!existed +- || resolution == LDPR_PREVAILING_DEF_IRONLY +- || resolution == LDPR_PREVAILING_DEF +- || resolution == LDPR_PREVAILING_DEF_IRONLY_EXP) +- res = resolution; +-} +- +-/* Register DECL with the global symbol table and change its +- name if necessary to avoid name clashes for static globals across +- different files. */ +- +-static void +-lto_register_var_decl_in_symtab (struct data_in *data_in, tree decl, +- unsigned ix) +-{ +- tree context; +- +- /* Variable has file scope, not local. 
*/ +- if (!TREE_PUBLIC (decl) +- && !((context = decl_function_context (decl)) +- && auto_var_in_fn_p (decl, context))) +- rest_of_decl_compilation (decl, 1, 0); +- +- /* If this variable has already been declared, queue the +- declaration for merging. */ +- if (TREE_PUBLIC (decl)) +- register_resolution (data_in->file_data, +- decl, get_resolution (data_in, ix)); +-} +- +- +-/* Register DECL with the global symbol table and change its +- name if necessary to avoid name clashes for static globals across +- different files. DATA_IN contains descriptors and tables for the +- file being read. */ +- +-static void +-lto_register_function_decl_in_symtab (struct data_in *data_in, tree decl, +- unsigned ix) +-{ +- /* If this variable has already been declared, queue the +- declaration for merging. */ +- if (TREE_PUBLIC (decl) && !DECL_ABSTRACT_P (decl)) +- register_resolution (data_in->file_data, +- decl, get_resolution (data_in, ix)); +-} +- +-/* Check if T is a decl and needs register its resolution info. */ +- +-static void +-lto_maybe_register_decl (struct data_in *data_in, tree t, unsigned ix) +-{ +- if (TREE_CODE (t) == VAR_DECL) +- lto_register_var_decl_in_symtab (data_in, t, ix); +- else if (TREE_CODE (t) == FUNCTION_DECL +- && !fndecl_built_in_p (t)) +- lto_register_function_decl_in_symtab (data_in, t, ix); +-} +- +- +-/* For the type T re-materialize it in the type variant list and +- the pointer/reference-to chains. */ +- +-static void +-lto_fixup_prevailing_type (tree t) +-{ +- /* The following re-creates proper variant lists while fixing up +- the variant leaders. We do not stream TYPE_NEXT_VARIANT so the +- variant list state before fixup is broken. */ +- +- /* If we are not our own variant leader link us into our new leaders +- variant list. */ +- if (TYPE_MAIN_VARIANT (t) != t) +- { +- tree mv = TYPE_MAIN_VARIANT (t); +- TYPE_NEXT_VARIANT (t) = TYPE_NEXT_VARIANT (mv); +- TYPE_NEXT_VARIANT (mv) = t; +- } +- +- /* The following reconstructs the pointer chains +- of the new pointed-to type if we are a main variant. We do +- not stream those so they are broken before fixup. */ +- if (TREE_CODE (t) == POINTER_TYPE +- && TYPE_MAIN_VARIANT (t) == t) +- { +- TYPE_NEXT_PTR_TO (t) = TYPE_POINTER_TO (TREE_TYPE (t)); +- TYPE_POINTER_TO (TREE_TYPE (t)) = t; +- } +- else if (TREE_CODE (t) == REFERENCE_TYPE +- && TYPE_MAIN_VARIANT (t) == t) +- { +- TYPE_NEXT_REF_TO (t) = TYPE_REFERENCE_TO (TREE_TYPE (t)); +- TYPE_REFERENCE_TO (TREE_TYPE (t)) = t; +- } +-} +- +- +-/* We keep prevailing tree SCCs in a hashtable with manual collision +- handling (in case all hashes compare the same) and keep the colliding +- entries in the tree_scc->next chain. */ +- +-struct tree_scc +-{ +- tree_scc *next; +- /* Hash of the whole SCC. */ +- hashval_t hash; +- /* Number of trees in the SCC. */ +- unsigned len; +- /* Number of possible entries into the SCC (tree nodes [0..entry_len-1] +- which share the same individual tree hash). */ +- unsigned entry_len; +- /* The members of the SCC. +- We only need to remember the first entry node candidate for prevailing +- SCCs (but of course have access to all entries for SCCs we are +- processing). +- ??? For prevailing SCCs we really only need hash and the first +- entry candidate, but that's too awkward to implement. 
*/ +- tree entries[1]; +-}; +- +-struct tree_scc_hasher : nofree_ptr_hash +-{ +- static inline hashval_t hash (const tree_scc *); +- static inline bool equal (const tree_scc *, const tree_scc *); +-}; +- +-hashval_t +-tree_scc_hasher::hash (const tree_scc *scc) +-{ +- return scc->hash; +-} +- +-bool +-tree_scc_hasher::equal (const tree_scc *scc1, const tree_scc *scc2) +-{ +- if (scc1->hash != scc2->hash +- || scc1->len != scc2->len +- || scc1->entry_len != scc2->entry_len) +- return false; +- return true; +-} +- +-static hash_table *tree_scc_hash; +-static struct obstack tree_scc_hash_obstack; +- +-static unsigned long num_merged_types; +-static unsigned long num_prevailing_types; +-static unsigned long num_type_scc_trees; +-static unsigned long total_scc_size; +-static unsigned long num_sccs_read; +-static unsigned long total_scc_size_merged; +-static unsigned long num_sccs_merged; +-static unsigned long num_scc_compares; +-static unsigned long num_scc_compare_collisions; +- +- +-/* Compare the two entries T1 and T2 of two SCCs that are possibly equal, +- recursing through in-SCC tree edges. Returns true if the SCCs entered +- through T1 and T2 are equal and fills in *MAP with the pairs of +- SCC entries we visited, starting with (*MAP)[0] = T1 and (*MAP)[1] = T2. */ +- +-static bool +-compare_tree_sccs_1 (tree t1, tree t2, tree **map) +-{ +- enum tree_code code; +- +- /* Mark already visited nodes. */ +- TREE_ASM_WRITTEN (t2) = 1; +- +- /* Push the pair onto map. */ +- (*map)[0] = t1; +- (*map)[1] = t2; +- *map = *map + 2; +- +- /* Compare value-fields. */ +-#define compare_values(X) \ +- do { \ +- if (X(t1) != X(t2)) \ +- return false; \ +- } while (0) +- +- compare_values (TREE_CODE); +- code = TREE_CODE (t1); +- +- if (!TYPE_P (t1)) +- { +- compare_values (TREE_SIDE_EFFECTS); +- compare_values (TREE_CONSTANT); +- compare_values (TREE_READONLY); +- compare_values (TREE_PUBLIC); +- } +- compare_values (TREE_ADDRESSABLE); +- compare_values (TREE_THIS_VOLATILE); +- if (DECL_P (t1)) +- compare_values (DECL_UNSIGNED); +- else if (TYPE_P (t1)) +- compare_values (TYPE_UNSIGNED); +- if (TYPE_P (t1)) +- compare_values (TYPE_ARTIFICIAL); +- else +- compare_values (TREE_NO_WARNING); +- compare_values (TREE_NOTHROW); +- compare_values (TREE_STATIC); +- if (code != TREE_BINFO) +- compare_values (TREE_PRIVATE); +- compare_values (TREE_PROTECTED); +- compare_values (TREE_DEPRECATED); +- if (TYPE_P (t1)) +- { +- if (AGGREGATE_TYPE_P (t1)) +- compare_values (TYPE_REVERSE_STORAGE_ORDER); +- else +- compare_values (TYPE_SATURATING); +- compare_values (TYPE_ADDR_SPACE); +- } +- else if (code == SSA_NAME) +- compare_values (SSA_NAME_IS_DEFAULT_DEF); +- +- if (CODE_CONTAINS_STRUCT (code, TS_INT_CST)) +- { +- if (wi::to_wide (t1) != wi::to_wide (t2)) +- return false; +- } +- +- if (CODE_CONTAINS_STRUCT (code, TS_REAL_CST)) +- { +- /* ??? No suitable compare routine available. 
*/ +- REAL_VALUE_TYPE r1 = TREE_REAL_CST (t1); +- REAL_VALUE_TYPE r2 = TREE_REAL_CST (t2); +- if (r1.cl != r2.cl +- || r1.decimal != r2.decimal +- || r1.sign != r2.sign +- || r1.signalling != r2.signalling +- || r1.canonical != r2.canonical +- || r1.uexp != r2.uexp) +- return false; +- for (unsigned i = 0; i < SIGSZ; ++i) +- if (r1.sig[i] != r2.sig[i]) +- return false; +- } +- +- if (CODE_CONTAINS_STRUCT (code, TS_FIXED_CST)) +- if (!fixed_compare (EQ_EXPR, +- TREE_FIXED_CST_PTR (t1), TREE_FIXED_CST_PTR (t2))) +- return false; +- +- if (CODE_CONTAINS_STRUCT (code, TS_VECTOR)) +- { +- compare_values (VECTOR_CST_LOG2_NPATTERNS); +- compare_values (VECTOR_CST_NELTS_PER_PATTERN); +- } +- +- if (CODE_CONTAINS_STRUCT (code, TS_DECL_COMMON)) +- { +- compare_values (DECL_MODE); +- compare_values (DECL_NONLOCAL); +- compare_values (DECL_VIRTUAL_P); +- compare_values (DECL_IGNORED_P); +- compare_values (DECL_ABSTRACT_P); +- compare_values (DECL_ARTIFICIAL); +- compare_values (DECL_USER_ALIGN); +- compare_values (DECL_PRESERVE_P); +- compare_values (DECL_EXTERNAL); +- compare_values (DECL_GIMPLE_REG_P); +- compare_values (DECL_ALIGN); +- if (code == LABEL_DECL) +- { +- compare_values (EH_LANDING_PAD_NR); +- compare_values (LABEL_DECL_UID); +- } +- else if (code == FIELD_DECL) +- { +- compare_values (DECL_PACKED); +- compare_values (DECL_NONADDRESSABLE_P); +- compare_values (DECL_PADDING_P); +- compare_values (DECL_OFFSET_ALIGN); +- } +- else if (code == VAR_DECL) +- { +- compare_values (DECL_HAS_DEBUG_EXPR_P); +- compare_values (DECL_NONLOCAL_FRAME); +- } +- if (code == RESULT_DECL +- || code == PARM_DECL +- || code == VAR_DECL) +- { +- compare_values (DECL_BY_REFERENCE); +- if (code == VAR_DECL +- || code == PARM_DECL) +- compare_values (DECL_HAS_VALUE_EXPR_P); +- } +- } +- +- if (CODE_CONTAINS_STRUCT (code, TS_DECL_WRTL)) +- compare_values (DECL_REGISTER); +- +- if (CODE_CONTAINS_STRUCT (code, TS_DECL_WITH_VIS)) +- { +- compare_values (DECL_COMMON); +- compare_values (DECL_DLLIMPORT_P); +- compare_values (DECL_WEAK); +- compare_values (DECL_SEEN_IN_BIND_EXPR_P); +- compare_values (DECL_COMDAT); +- compare_values (DECL_VISIBILITY); +- compare_values (DECL_VISIBILITY_SPECIFIED); +- if (code == VAR_DECL) +- { +- compare_values (DECL_HARD_REGISTER); +- /* DECL_IN_TEXT_SECTION is set during final asm output only. 
*/ +- compare_values (DECL_IN_CONSTANT_POOL); +- } +- } +- +- if (CODE_CONTAINS_STRUCT (code, TS_FUNCTION_DECL)) +- { +- compare_values (DECL_BUILT_IN_CLASS); +- compare_values (DECL_STATIC_CONSTRUCTOR); +- compare_values (DECL_STATIC_DESTRUCTOR); +- compare_values (DECL_UNINLINABLE); +- compare_values (DECL_POSSIBLY_INLINED); +- compare_values (DECL_IS_NOVOPS); +- compare_values (DECL_IS_RETURNS_TWICE); +- compare_values (DECL_IS_MALLOC); +- compare_values (DECL_IS_OPERATOR_NEW); +- compare_values (DECL_DECLARED_INLINE_P); +- compare_values (DECL_STATIC_CHAIN); +- compare_values (DECL_NO_INLINE_WARNING_P); +- compare_values (DECL_NO_INSTRUMENT_FUNCTION_ENTRY_EXIT); +- compare_values (DECL_NO_LIMIT_STACK); +- compare_values (DECL_DISREGARD_INLINE_LIMITS); +- compare_values (DECL_PURE_P); +- compare_values (DECL_LOOPING_CONST_OR_PURE_P); +- compare_values (DECL_FINAL_P); +- compare_values (DECL_CXX_CONSTRUCTOR_P); +- compare_values (DECL_CXX_DESTRUCTOR_P); +- if (DECL_BUILT_IN_CLASS (t1) != NOT_BUILT_IN) +- compare_values (DECL_FUNCTION_CODE); +- } +- +- if (CODE_CONTAINS_STRUCT (code, TS_TYPE_COMMON)) +- { +- compare_values (TYPE_MODE); +- compare_values (TYPE_STRING_FLAG); +- compare_values (TYPE_NEEDS_CONSTRUCTING); +- if (RECORD_OR_UNION_TYPE_P (t1)) +- { +- compare_values (TYPE_TRANSPARENT_AGGR); +- compare_values (TYPE_FINAL_P); +- } +- else if (code == ARRAY_TYPE) +- compare_values (TYPE_NONALIASED_COMPONENT); +- if (AGGREGATE_TYPE_P (t1)) +- compare_values (TYPE_TYPELESS_STORAGE); +- compare_values (TYPE_EMPTY_P); +- compare_values (TYPE_PACKED); +- compare_values (TYPE_RESTRICT); +- compare_values (TYPE_USER_ALIGN); +- compare_values (TYPE_READONLY); +- compare_values (TYPE_PRECISION); +- compare_values (TYPE_ALIGN); +- /* Do not compare TYPE_ALIAS_SET. Doing so introduce ordering issues +- with calls to get_alias_set which may initialize it for streamed +- in types. */ +- } +- +- /* We don't want to compare locations, so there is nothing do compare +- for TS_EXP. */ +- +- /* BLOCKs are function local and we don't merge anything there, so +- simply refuse to merge. 
*/ +- if (CODE_CONTAINS_STRUCT (code, TS_BLOCK)) +- return false; +- +- if (CODE_CONTAINS_STRUCT (code, TS_TRANSLATION_UNIT_DECL)) +- if (strcmp (TRANSLATION_UNIT_LANGUAGE (t1), +- TRANSLATION_UNIT_LANGUAGE (t2)) != 0) +- return false; +- +- if (CODE_CONTAINS_STRUCT (code, TS_TARGET_OPTION)) +- if (!cl_target_option_eq (TREE_TARGET_OPTION (t1), TREE_TARGET_OPTION (t2))) +- return false; +- +- if (CODE_CONTAINS_STRUCT (code, TS_OPTIMIZATION)) +- if (!cl_optimization_option_eq (TREE_OPTIMIZATION (t1), +- TREE_OPTIMIZATION (t2))) +- return false; +- +- if (CODE_CONTAINS_STRUCT (code, TS_BINFO)) +- if (vec_safe_length (BINFO_BASE_ACCESSES (t1)) +- != vec_safe_length (BINFO_BASE_ACCESSES (t2))) +- return false; +- +- if (CODE_CONTAINS_STRUCT (code, TS_CONSTRUCTOR)) +- compare_values (CONSTRUCTOR_NELTS); +- +- if (CODE_CONTAINS_STRUCT (code, TS_IDENTIFIER)) +- if (IDENTIFIER_LENGTH (t1) != IDENTIFIER_LENGTH (t2) +- || memcmp (IDENTIFIER_POINTER (t1), IDENTIFIER_POINTER (t2), +- IDENTIFIER_LENGTH (t1)) != 0) +- return false; +- +- if (CODE_CONTAINS_STRUCT (code, TS_STRING)) +- if (TREE_STRING_LENGTH (t1) != TREE_STRING_LENGTH (t2) +- || memcmp (TREE_STRING_POINTER (t1), TREE_STRING_POINTER (t2), +- TREE_STRING_LENGTH (t1)) != 0) +- return false; +- +- if (code == OMP_CLAUSE) +- { +- compare_values (OMP_CLAUSE_CODE); +- switch (OMP_CLAUSE_CODE (t1)) +- { +- case OMP_CLAUSE_DEFAULT: +- compare_values (OMP_CLAUSE_DEFAULT_KIND); +- break; +- case OMP_CLAUSE_SCHEDULE: +- compare_values (OMP_CLAUSE_SCHEDULE_KIND); +- break; +- case OMP_CLAUSE_DEPEND: +- compare_values (OMP_CLAUSE_DEPEND_KIND); +- break; +- case OMP_CLAUSE_MAP: +- compare_values (OMP_CLAUSE_MAP_KIND); +- break; +- case OMP_CLAUSE_PROC_BIND: +- compare_values (OMP_CLAUSE_PROC_BIND_KIND); +- break; +- case OMP_CLAUSE_REDUCTION: +- compare_values (OMP_CLAUSE_REDUCTION_CODE); +- compare_values (OMP_CLAUSE_REDUCTION_GIMPLE_INIT); +- compare_values (OMP_CLAUSE_REDUCTION_GIMPLE_MERGE); +- break; +- default: +- break; +- } +- } +- +-#undef compare_values +- +- +- /* Compare pointer fields. */ +- +- /* Recurse. Search & Replaced from DFS_write_tree_body. +- Folding the early checks into the compare_tree_edges recursion +- macro makes debugging way quicker as you are able to break on +- compare_tree_sccs_1 and simply finish until a call returns false +- to spot the SCC members with the difference. */ +-#define compare_tree_edges(E1, E2) \ +- do { \ +- tree t1_ = (E1), t2_ = (E2); \ +- if (t1_ != t2_ \ +- && (!t1_ || !t2_ \ +- || !TREE_VISITED (t2_) \ +- || (!TREE_ASM_WRITTEN (t2_) \ +- && !compare_tree_sccs_1 (t1_, t2_, map)))) \ +- return false; \ +- /* Only non-NULL trees outside of the SCC may compare equal. */ \ +- gcc_checking_assert (t1_ != t2_ || (!t2_ || !TREE_VISITED (t2_))); \ +- } while (0) +- +- if (CODE_CONTAINS_STRUCT (code, TS_TYPED)) +- { +- if (code != IDENTIFIER_NODE) +- compare_tree_edges (TREE_TYPE (t1), TREE_TYPE (t2)); +- } +- +- if (CODE_CONTAINS_STRUCT (code, TS_VECTOR)) +- { +- /* Note that the number of elements for EXPR has already been emitted +- in EXPR's header (see streamer_write_tree_header). 
*/ +- unsigned int count = vector_cst_encoded_nelts (t1); +- for (unsigned int i = 0; i < count; ++i) +- compare_tree_edges (VECTOR_CST_ENCODED_ELT (t1, i), +- VECTOR_CST_ENCODED_ELT (t2, i)); +- } +- +- if (CODE_CONTAINS_STRUCT (code, TS_COMPLEX)) +- { +- compare_tree_edges (TREE_REALPART (t1), TREE_REALPART (t2)); +- compare_tree_edges (TREE_IMAGPART (t1), TREE_IMAGPART (t2)); +- } +- +- if (CODE_CONTAINS_STRUCT (code, TS_DECL_MINIMAL)) +- { +- compare_tree_edges (DECL_NAME (t1), DECL_NAME (t2)); +- /* ??? Global decls from different TUs have non-matching +- TRANSLATION_UNIT_DECLs. Only consider a small set of +- decls equivalent, we should not end up merging others. */ +- if ((code == TYPE_DECL +- || code == NAMESPACE_DECL +- || code == IMPORTED_DECL +- || code == CONST_DECL +- || (VAR_OR_FUNCTION_DECL_P (t1) +- && (TREE_PUBLIC (t1) || DECL_EXTERNAL (t1)))) +- && DECL_FILE_SCOPE_P (t1) && DECL_FILE_SCOPE_P (t2)) +- ; +- else +- compare_tree_edges (DECL_CONTEXT (t1), DECL_CONTEXT (t2)); +- } +- +- if (CODE_CONTAINS_STRUCT (code, TS_DECL_COMMON)) +- { +- compare_tree_edges (DECL_SIZE (t1), DECL_SIZE (t2)); +- compare_tree_edges (DECL_SIZE_UNIT (t1), DECL_SIZE_UNIT (t2)); +- compare_tree_edges (DECL_ATTRIBUTES (t1), DECL_ATTRIBUTES (t2)); +- compare_tree_edges (DECL_ABSTRACT_ORIGIN (t1), DECL_ABSTRACT_ORIGIN (t2)); +- if ((code == VAR_DECL +- || code == PARM_DECL) +- && DECL_HAS_VALUE_EXPR_P (t1)) +- compare_tree_edges (DECL_VALUE_EXPR (t1), DECL_VALUE_EXPR (t2)); +- if (code == VAR_DECL +- && DECL_HAS_DEBUG_EXPR_P (t1)) +- compare_tree_edges (DECL_DEBUG_EXPR (t1), DECL_DEBUG_EXPR (t2)); +- /* LTO specific edges. */ +- if (code != FUNCTION_DECL +- && code != TRANSLATION_UNIT_DECL) +- compare_tree_edges (DECL_INITIAL (t1), DECL_INITIAL (t2)); +- } +- +- if (CODE_CONTAINS_STRUCT (code, TS_DECL_NON_COMMON)) +- { +- if (code == FUNCTION_DECL) +- { +- tree a1, a2; +- for (a1 = DECL_ARGUMENTS (t1), a2 = DECL_ARGUMENTS (t2); +- a1 || a2; +- a1 = TREE_CHAIN (a1), a2 = TREE_CHAIN (a2)) +- compare_tree_edges (a1, a2); +- compare_tree_edges (DECL_RESULT (t1), DECL_RESULT (t2)); +- } +- else if (code == TYPE_DECL) +- compare_tree_edges (DECL_ORIGINAL_TYPE (t1), DECL_ORIGINAL_TYPE (t2)); +- } +- +- if (CODE_CONTAINS_STRUCT (code, TS_DECL_WITH_VIS)) +- { +- /* Make sure we don't inadvertently set the assembler name. 
*/ +- if (DECL_ASSEMBLER_NAME_SET_P (t1)) +- compare_tree_edges (DECL_ASSEMBLER_NAME (t1), +- DECL_ASSEMBLER_NAME (t2)); +- } +- +- if (CODE_CONTAINS_STRUCT (code, TS_FIELD_DECL)) +- { +- compare_tree_edges (DECL_FIELD_OFFSET (t1), DECL_FIELD_OFFSET (t2)); +- compare_tree_edges (DECL_BIT_FIELD_TYPE (t1), DECL_BIT_FIELD_TYPE (t2)); +- compare_tree_edges (DECL_BIT_FIELD_REPRESENTATIVE (t1), +- DECL_BIT_FIELD_REPRESENTATIVE (t2)); +- compare_tree_edges (DECL_FIELD_BIT_OFFSET (t1), +- DECL_FIELD_BIT_OFFSET (t2)); +- compare_tree_edges (DECL_FCONTEXT (t1), DECL_FCONTEXT (t2)); +- } +- +- if (CODE_CONTAINS_STRUCT (code, TS_FUNCTION_DECL)) +- { +- compare_tree_edges (DECL_FUNCTION_PERSONALITY (t1), +- DECL_FUNCTION_PERSONALITY (t2)); +- compare_tree_edges (DECL_VINDEX (t1), DECL_VINDEX (t2)); +- compare_tree_edges (DECL_FUNCTION_SPECIFIC_TARGET (t1), +- DECL_FUNCTION_SPECIFIC_TARGET (t2)); +- compare_tree_edges (DECL_FUNCTION_SPECIFIC_OPTIMIZATION (t1), +- DECL_FUNCTION_SPECIFIC_OPTIMIZATION (t2)); +- } +- +- if (CODE_CONTAINS_STRUCT (code, TS_TYPE_COMMON)) +- { +- compare_tree_edges (TYPE_SIZE (t1), TYPE_SIZE (t2)); +- compare_tree_edges (TYPE_SIZE_UNIT (t1), TYPE_SIZE_UNIT (t2)); +- compare_tree_edges (TYPE_ATTRIBUTES (t1), TYPE_ATTRIBUTES (t2)); +- compare_tree_edges (TYPE_NAME (t1), TYPE_NAME (t2)); +- /* Do not compare TYPE_POINTER_TO or TYPE_REFERENCE_TO. They will be +- reconstructed during fixup. */ +- /* Do not compare TYPE_NEXT_VARIANT, we reconstruct the variant lists +- during fixup. */ +- compare_tree_edges (TYPE_MAIN_VARIANT (t1), TYPE_MAIN_VARIANT (t2)); +- /* ??? Global types from different TUs have non-matching +- TRANSLATION_UNIT_DECLs. Still merge them if they are otherwise +- equal. */ +- if (TYPE_FILE_SCOPE_P (t1) && TYPE_FILE_SCOPE_P (t2)) +- ; +- else +- compare_tree_edges (TYPE_CONTEXT (t1), TYPE_CONTEXT (t2)); +- /* TYPE_CANONICAL is re-computed during type merging, so do not +- compare it here. */ +- compare_tree_edges (TYPE_STUB_DECL (t1), TYPE_STUB_DECL (t2)); +- } +- +- if (CODE_CONTAINS_STRUCT (code, TS_TYPE_NON_COMMON)) +- { +- if (code == ENUMERAL_TYPE) +- compare_tree_edges (TYPE_VALUES (t1), TYPE_VALUES (t2)); +- else if (code == ARRAY_TYPE) +- compare_tree_edges (TYPE_DOMAIN (t1), TYPE_DOMAIN (t2)); +- else if (RECORD_OR_UNION_TYPE_P (t1)) +- { +- tree f1, f2; +- for (f1 = TYPE_FIELDS (t1), f2 = TYPE_FIELDS (t2); +- f1 || f2; +- f1 = TREE_CHAIN (f1), f2 = TREE_CHAIN (f2)) +- compare_tree_edges (f1, f2); +- } +- else if (code == FUNCTION_TYPE +- || code == METHOD_TYPE) +- compare_tree_edges (TYPE_ARG_TYPES (t1), TYPE_ARG_TYPES (t2)); +- +- if (!POINTER_TYPE_P (t1)) +- compare_tree_edges (TYPE_MIN_VALUE_RAW (t1), TYPE_MIN_VALUE_RAW (t2)); +- compare_tree_edges (TYPE_MAX_VALUE_RAW (t1), TYPE_MAX_VALUE_RAW (t2)); +- } +- +- if (CODE_CONTAINS_STRUCT (code, TS_LIST)) +- { +- compare_tree_edges (TREE_PURPOSE (t1), TREE_PURPOSE (t2)); +- compare_tree_edges (TREE_VALUE (t1), TREE_VALUE (t2)); +- compare_tree_edges (TREE_CHAIN (t1), TREE_CHAIN (t2)); +- } +- +- if (CODE_CONTAINS_STRUCT (code, TS_VEC)) +- for (int i = 0; i < TREE_VEC_LENGTH (t1); i++) +- compare_tree_edges (TREE_VEC_ELT (t1, i), TREE_VEC_ELT (t2, i)); +- +- if (CODE_CONTAINS_STRUCT (code, TS_EXP)) +- { +- for (int i = 0; i < TREE_OPERAND_LENGTH (t1); i++) +- compare_tree_edges (TREE_OPERAND (t1, i), +- TREE_OPERAND (t2, i)); +- +- /* BLOCKs are function local and we don't merge anything there. 
*/ +- if (TREE_BLOCK (t1) || TREE_BLOCK (t2)) +- return false; +- } +- +- if (CODE_CONTAINS_STRUCT (code, TS_BINFO)) +- { +- unsigned i; +- tree t; +- /* Lengths have already been compared above. */ +- FOR_EACH_VEC_ELT (*BINFO_BASE_BINFOS (t1), i, t) +- compare_tree_edges (t, BINFO_BASE_BINFO (t2, i)); +- FOR_EACH_VEC_SAFE_ELT (BINFO_BASE_ACCESSES (t1), i, t) +- compare_tree_edges (t, BINFO_BASE_ACCESS (t2, i)); +- compare_tree_edges (BINFO_OFFSET (t1), BINFO_OFFSET (t2)); +- compare_tree_edges (BINFO_VTABLE (t1), BINFO_VTABLE (t2)); +- compare_tree_edges (BINFO_VPTR_FIELD (t1), BINFO_VPTR_FIELD (t2)); +- /* Do not walk BINFO_INHERITANCE_CHAIN, BINFO_SUBVTT_INDEX +- and BINFO_VPTR_INDEX; these are used by C++ FE only. */ +- } +- +- if (CODE_CONTAINS_STRUCT (code, TS_CONSTRUCTOR)) +- { +- unsigned i; +- tree index, value; +- /* Lengths have already been compared above. */ +- FOR_EACH_CONSTRUCTOR_ELT (CONSTRUCTOR_ELTS (t1), i, index, value) +- { +- compare_tree_edges (index, CONSTRUCTOR_ELT (t2, i)->index); +- compare_tree_edges (value, CONSTRUCTOR_ELT (t2, i)->value); +- } +- } +- +- if (code == OMP_CLAUSE) +- { +- int i; +- +- for (i = 0; i < omp_clause_num_ops[OMP_CLAUSE_CODE (t1)]; i++) +- compare_tree_edges (OMP_CLAUSE_OPERAND (t1, i), +- OMP_CLAUSE_OPERAND (t2, i)); +- compare_tree_edges (OMP_CLAUSE_CHAIN (t1), OMP_CLAUSE_CHAIN (t2)); +- } +- +-#undef compare_tree_edges +- +- return true; +-} +- +-/* Compare the tree scc SCC to the prevailing candidate PSCC, filling +- out MAP if they are equal. */ +- +-static bool +-compare_tree_sccs (tree_scc *pscc, tree_scc *scc, +- tree *map) +-{ +- /* Assume SCC entry hashes are sorted after their cardinality. Which +- means we can simply take the first n-tuple of equal hashes +- (which is recorded as entry_len) and do n SCC entry candidate +- comparisons. */ +- for (unsigned i = 0; i < pscc->entry_len; ++i) +- { +- tree *mapp = map; +- num_scc_compare_collisions++; +- if (compare_tree_sccs_1 (pscc->entries[0], scc->entries[i], &mapp)) +- { +- /* Equal - no need to reset TREE_VISITED or TREE_ASM_WRITTEN +- on the scc as all trees will be freed. */ +- return true; +- } +- /* Reset TREE_ASM_WRITTEN on scc for the next compare or in case +- the SCC prevails. */ +- for (unsigned j = 0; j < scc->len; ++j) +- TREE_ASM_WRITTEN (scc->entries[j]) = 0; +- } +- +- return false; +-} +- +-/* QSort sort function to sort a map of two pointers after the 2nd +- pointer. */ +- +-static int +-cmp_tree (const void *p1_, const void *p2_) +-{ +- tree *p1 = (tree *)(const_cast(p1_)); +- tree *p2 = (tree *)(const_cast(p2_)); +- if (p1[1] == p2[1]) +- return 0; +- return ((uintptr_t)p1[1] < (uintptr_t)p2[1]) ? -1 : 1; +-} +- +-/* Try to unify the SCC with nodes FROM to FROM + LEN in CACHE and +- hash value SCC_HASH with an already recorded SCC. Return true if +- that was successful, otherwise return false. */ +- +-static bool +-unify_scc (struct data_in *data_in, unsigned from, +- unsigned len, unsigned scc_entry_len, hashval_t scc_hash) +-{ +- bool unified_p = false; +- struct streamer_tree_cache_d *cache = data_in->reader_cache; +- tree_scc *scc +- = (tree_scc *) alloca (sizeof (tree_scc) + (len - 1) * sizeof (tree)); +- scc->next = NULL; +- scc->hash = scc_hash; +- scc->len = len; +- scc->entry_len = scc_entry_len; +- for (unsigned i = 0; i < len; ++i) +- { +- tree t = streamer_tree_cache_get_tree (cache, from + i); +- scc->entries[i] = t; +- /* Do not merge SCCs with local entities inside them. Also do +- not merge TRANSLATION_UNIT_DECLs. 
*/ +- if (TREE_CODE (t) == TRANSLATION_UNIT_DECL +- || (VAR_OR_FUNCTION_DECL_P (t) +- && !(TREE_PUBLIC (t) || DECL_EXTERNAL (t))) +- || TREE_CODE (t) == LABEL_DECL) +- { +- /* Avoid doing any work for these cases and do not worry to +- record the SCCs for further merging. */ +- return false; +- } +- } +- +- /* Look for the list of candidate SCCs to compare against. */ +- tree_scc **slot; +- slot = tree_scc_hash->find_slot_with_hash (scc, scc_hash, INSERT); +- if (*slot) +- { +- /* Try unifying against each candidate. */ +- num_scc_compares++; +- +- /* Set TREE_VISITED on the scc so we can easily identify tree nodes +- outside of the scc when following tree edges. Make sure +- that TREE_ASM_WRITTEN is unset so we can use it as 2nd bit +- to track whether we visited the SCC member during the compare. +- We cannot use TREE_VISITED on the pscc members as the extended +- scc and pscc can overlap. */ +- for (unsigned i = 0; i < scc->len; ++i) +- { +- TREE_VISITED (scc->entries[i]) = 1; +- gcc_checking_assert (!TREE_ASM_WRITTEN (scc->entries[i])); +- } +- +- tree *map = XALLOCAVEC (tree, 2 * len); +- for (tree_scc *pscc = *slot; pscc; pscc = pscc->next) +- { +- if (!compare_tree_sccs (pscc, scc, map)) +- continue; +- +- /* Found an equal SCC. */ +- unified_p = true; +- num_scc_compare_collisions--; +- num_sccs_merged++; +- total_scc_size_merged += len; +- +- if (flag_checking) +- for (unsigned i = 0; i < len; ++i) +- { +- tree t = map[2*i+1]; +- enum tree_code code = TREE_CODE (t); +- /* IDENTIFIER_NODEs should be singletons and are merged by the +- streamer. The others should be singletons, too, and we +- should not merge them in any way. */ +- gcc_assert (code != TRANSLATION_UNIT_DECL +- && code != IDENTIFIER_NODE); +- } +- +- /* Fixup the streamer cache with the prevailing nodes according +- to the tree node mapping computed by compare_tree_sccs. */ +- if (len == 1) +- { +- /* If we got a debug reference queued, see if the prevailing +- tree has a debug reference and if not, register the one +- for the tree we are about to throw away. */ +- if (dref_queue.length () == 1) +- { +- dref_entry e = dref_queue.pop (); +- gcc_assert (e.decl +- == streamer_tree_cache_get_tree (cache, from)); +- const char *sym; +- unsigned HOST_WIDE_INT off; +- if (!debug_hooks->die_ref_for_decl (pscc->entries[0], &sym, +- &off)) +- debug_hooks->register_external_die (pscc->entries[0], +- e.sym, e.off); +- } +- lto_maybe_register_decl (data_in, pscc->entries[0], from); +- streamer_tree_cache_replace_tree (cache, pscc->entries[0], from); +- } +- else +- { +- tree *map2 = XALLOCAVEC (tree, 2 * len); +- for (unsigned i = 0; i < len; ++i) +- { +- map2[i*2] = (tree)(uintptr_t)(from + i); +- map2[i*2+1] = scc->entries[i]; +- } +- qsort (map2, len, 2 * sizeof (tree), cmp_tree); +- qsort (map, len, 2 * sizeof (tree), cmp_tree); +- for (unsigned i = 0; i < len; ++i) +- { +- lto_maybe_register_decl (data_in, map[2*i], +- (uintptr_t)map2[2*i]); +- streamer_tree_cache_replace_tree (cache, map[2*i], +- (uintptr_t)map2[2*i]); +- } +- } +- +- /* Free the tree nodes from the read SCC. */ +- data_in->location_cache.revert_location_cache (); +- for (unsigned i = 0; i < len; ++i) +- { +- if (TYPE_P (scc->entries[i])) +- num_merged_types++; +- free_node (scc->entries[i]); +- } +- +- /* Drop DIE references. +- ??? Do as in the size-one SCC case which involves sorting +- the queue. */ +- dref_queue.truncate (0); +- +- break; +- } +- +- /* Reset TREE_VISITED if we didn't unify the SCC with another. 
*/ +- if (!unified_p) +- for (unsigned i = 0; i < scc->len; ++i) +- TREE_VISITED (scc->entries[i]) = 0; +- } +- +- /* If we didn't unify it to any candidate duplicate the relevant +- pieces to permanent storage and link it into the chain. */ +- if (!unified_p) +- { +- tree_scc *pscc +- = XOBNEWVAR (&tree_scc_hash_obstack, tree_scc, sizeof (tree_scc)); +- memcpy (pscc, scc, sizeof (tree_scc)); +- pscc->next = (*slot); +- *slot = pscc; +- } +- return unified_p; +-} +- +- +-/* Read all the symbols from buffer DATA, using descriptors in DECL_DATA. +- RESOLUTIONS is the set of symbols picked by the linker (read from the +- resolution file when the linker plugin is being used). */ +- +-static void +-lto_read_decls (struct lto_file_decl_data *decl_data, const void *data, +- vec resolutions) +-{ +- const struct lto_decl_header *header = (const struct lto_decl_header *) data; +- const int decl_offset = sizeof (struct lto_decl_header); +- const int main_offset = decl_offset + header->decl_state_size; +- const int string_offset = main_offset + header->main_size; +- struct data_in *data_in; +- unsigned int i; +- const uint32_t *data_ptr, *data_end; +- uint32_t num_decl_states; +- +- lto_input_block ib_main ((const char *) data + main_offset, +- header->main_size, decl_data->mode_table); +- +- data_in = lto_data_in_create (decl_data, (const char *) data + string_offset, +- header->string_size, resolutions); +- +- /* We do not uniquify the pre-loaded cache entries, those are middle-end +- internal types that should not be merged. */ +- +- /* Read the global declarations and types. */ +- while (ib_main.p < ib_main.len) +- { +- tree t; +- unsigned from = data_in->reader_cache->nodes.length (); +- /* Read and uniquify SCCs as in the input stream. */ +- enum LTO_tags tag = streamer_read_record_start (&ib_main); +- if (tag == LTO_tree_scc) +- { +- unsigned len_; +- unsigned scc_entry_len; +- hashval_t scc_hash = lto_input_scc (&ib_main, data_in, &len_, +- &scc_entry_len); +- unsigned len = data_in->reader_cache->nodes.length () - from; +- gcc_assert (len == len_); +- +- total_scc_size += len; +- num_sccs_read++; +- +- /* We have the special case of size-1 SCCs that are pre-merged +- by means of identifier and string sharing for example. +- ??? Maybe we should avoid streaming those as SCCs. */ +- tree first = streamer_tree_cache_get_tree (data_in->reader_cache, +- from); +- if (len == 1 +- && (TREE_CODE (first) == IDENTIFIER_NODE +- || (TREE_CODE (first) == INTEGER_CST +- && !TREE_OVERFLOW (first)))) +- continue; +- +- /* Try to unify the SCC with already existing ones. */ +- if (!flag_ltrans +- && unify_scc (data_in, from, +- len, scc_entry_len, scc_hash)) +- continue; +- +- /* Tree merging failed, mark entries in location cache as +- permanent. */ +- data_in->location_cache.accept_location_cache (); +- +- bool seen_type = false; +- for (unsigned i = 0; i < len; ++i) +- { +- tree t = streamer_tree_cache_get_tree (data_in->reader_cache, +- from + i); +- /* Reconstruct the type variant and pointer-to/reference-to +- chains. */ +- if (TYPE_P (t)) +- { +- seen_type = true; +- num_prevailing_types++; +- lto_fixup_prevailing_type (t); +- +- /* Compute the canonical type of all types. +- Because SCC components are streamed in random (hash) order +- we may have encountered the type before while registering +- type canonical of a derived type in the same SCC. 
*/ +- if (!TYPE_CANONICAL (t)) +- gimple_register_canonical_type (t); +- if (TYPE_MAIN_VARIANT (t) == t && odr_type_p (t)) +- register_odr_type (t); +- } +- /* Link shared INTEGER_CSTs into TYPE_CACHED_VALUEs of its +- type which is also member of this SCC. */ +- if (TREE_CODE (t) == INTEGER_CST +- && !TREE_OVERFLOW (t)) +- cache_integer_cst (t); +- if (!flag_ltrans) +- { +- lto_maybe_register_decl (data_in, t, from + i); +- /* Scan the tree for references to global functions or +- variables and record those for later fixup. */ +- if (mentions_vars_p (t)) +- vec_safe_push (tree_with_vars, t); +- } +- } +- +- /* Register DECLs with the debuginfo machinery. */ +- while (!dref_queue.is_empty ()) +- { +- dref_entry e = dref_queue.pop (); +- debug_hooks->register_external_die (e.decl, e.sym, e.off); +- } +- +- if (seen_type) +- num_type_scc_trees += len; +- } +- else +- { +- /* Pickle stray references. */ +- t = lto_input_tree_1 (&ib_main, data_in, tag, 0); +- gcc_assert (t && data_in->reader_cache->nodes.length () == from); +- } +- } +- data_in->location_cache.apply_location_cache (); +- +- /* Read in lto_in_decl_state objects. */ +- data_ptr = (const uint32_t *) ((const char*) data + decl_offset); +- data_end = +- (const uint32_t *) ((const char*) data_ptr + header->decl_state_size); +- num_decl_states = *data_ptr++; +- +- gcc_assert (num_decl_states > 0); +- decl_data->global_decl_state = lto_new_in_decl_state (); +- data_ptr = lto_read_in_decl_state (data_in, data_ptr, +- decl_data->global_decl_state); +- +- /* Read in per-function decl states and enter them in hash table. */ +- decl_data->function_decl_states = +- hash_table::create_ggc (37); +- +- for (i = 1; i < num_decl_states; i++) +- { +- struct lto_in_decl_state *state = lto_new_in_decl_state (); +- +- data_ptr = lto_read_in_decl_state (data_in, data_ptr, state); +- lto_in_decl_state **slot +- = decl_data->function_decl_states->find_slot (state, INSERT); +- gcc_assert (*slot == NULL); +- *slot = state; +- } +- +- if (data_ptr != data_end) +- internal_error ("bytecode stream: garbage at the end of symbols section"); +- +- /* Set the current decl state to be the global state. */ +- decl_data->current_decl_state = decl_data->global_decl_state; +- +- lto_data_in_delete (data_in); +-} +- +-/* Custom version of strtoll, which is not portable. */ +- +-static int64_t +-lto_parse_hex (const char *p) +-{ +- int64_t ret = 0; +- +- for (; *p != '\0'; ++p) +- { +- char c = *p; +- unsigned char part; +- ret <<= 4; +- if (c >= '0' && c <= '9') +- part = c - '0'; +- else if (c >= 'a' && c <= 'f') +- part = c - 'a' + 10; +- else if (c >= 'A' && c <= 'F') +- part = c - 'A' + 10; +- else +- internal_error ("could not parse hex number"); +- ret |= part; +- } +- +- return ret; +-} +- +-/* Read resolution for file named FILE_NAME. The resolution is read from +- RESOLUTION. */ +- +-static void +-lto_resolution_read (splay_tree file_ids, FILE *resolution, lto_file *file) +-{ +- /* We require that objects in the resolution file are in the same +- order as the lto1 command line. */ +- unsigned int name_len; +- char *obj_name; +- unsigned int num_symbols; +- unsigned int i; +- struct lto_file_decl_data *file_data; +- splay_tree_node nd = NULL; +- +- if (!resolution) +- return; +- +- name_len = strlen (file->filename); +- obj_name = XNEWVEC (char, name_len + 1); +- fscanf (resolution, " "); /* Read white space. 
*/ +- +- fread (obj_name, sizeof (char), name_len, resolution); +- obj_name[name_len] = '\0'; +- if (filename_cmp (obj_name, file->filename) != 0) +- internal_error ("unexpected file name %s in linker resolution file. " +- "Expected %s", obj_name, file->filename); +- if (file->offset != 0) +- { +- int t; +- char offset_p[17]; +- int64_t offset; +- t = fscanf (resolution, "@0x%16s", offset_p); +- if (t != 1) +- internal_error ("could not parse file offset"); +- offset = lto_parse_hex (offset_p); +- if (offset != file->offset) +- internal_error ("unexpected offset"); +- } +- +- free (obj_name); +- +- fscanf (resolution, "%u", &num_symbols); +- +- for (i = 0; i < num_symbols; i++) +- { +- int t; +- unsigned index; +- unsigned HOST_WIDE_INT id; +- char r_str[27]; +- enum ld_plugin_symbol_resolution r = (enum ld_plugin_symbol_resolution) 0; +- unsigned int j; +- unsigned int lto_resolution_str_len = +- sizeof (lto_resolution_str) / sizeof (char *); +- res_pair rp; +- +- t = fscanf (resolution, "%u " HOST_WIDE_INT_PRINT_HEX_PURE " %26s %*[^\n]\n", +- &index, &id, r_str); +- if (t != 3) +- internal_error ("invalid line in the resolution file"); +- +- for (j = 0; j < lto_resolution_str_len; j++) +- { +- if (strcmp (lto_resolution_str[j], r_str) == 0) +- { +- r = (enum ld_plugin_symbol_resolution) j; +- break; +- } +- } +- if (j == lto_resolution_str_len) +- internal_error ("invalid resolution in the resolution file"); +- +- if (!(nd && lto_splay_tree_id_equal_p (nd->key, id))) +- { +- nd = lto_splay_tree_lookup (file_ids, id); +- if (nd == NULL) +- internal_error ("resolution sub id %wx not in object file", id); +- } +- +- file_data = (struct lto_file_decl_data *)nd->value; +- /* The indexes are very sparse. To save memory save them in a compact +- format that is only unpacked later when the subfile is processed. */ +- rp.res = r; +- rp.index = index; +- file_data->respairs.safe_push (rp); +- if (file_data->max_index < index) +- file_data->max_index = index; +- } +-} +- +-/* List of file_decl_datas */ +-struct file_data_list +- { +- struct lto_file_decl_data *first, *last; +- }; +- +-/* Is the name for a id'ed LTO section? */ +- +-static int +-lto_section_with_id (const char *name, unsigned HOST_WIDE_INT *id) +-{ +- const char *s; +- +- if (strncmp (name, section_name_prefix, strlen (section_name_prefix))) +- return 0; +- s = strrchr (name, '.'); +- if (!s) +- return 0; +- /* If the section is not suffixed with an ID return. */ +- if ((size_t)(s - name) == strlen (section_name_prefix)) +- return 0; +- return sscanf (s, "." 
HOST_WIDE_INT_PRINT_HEX_PURE, id) == 1; +-} +- +-/* Create file_data of each sub file id */ +- +-static int +-create_subid_section_table (struct lto_section_slot *ls, splay_tree file_ids, +- struct file_data_list *list) +-{ +- struct lto_section_slot s_slot, *new_slot; +- unsigned HOST_WIDE_INT id; +- splay_tree_node nd; +- void **hash_slot; +- char *new_name; +- struct lto_file_decl_data *file_data; +- +- if (!lto_section_with_id (ls->name, &id)) +- return 1; +- +- /* Find hash table of sub module id */ +- nd = lto_splay_tree_lookup (file_ids, id); +- if (nd != NULL) +- { +- file_data = (struct lto_file_decl_data *)nd->value; +- } +- else +- { +- file_data = ggc_alloc (); +- memset(file_data, 0, sizeof (struct lto_file_decl_data)); +- file_data->id = id; +- file_data->section_hash_table = lto_obj_create_section_hash_table (); +- lto_splay_tree_insert (file_ids, id, file_data); +- +- /* Maintain list in linker order */ +- if (!list->first) +- list->first = file_data; +- if (list->last) +- list->last->next = file_data; +- list->last = file_data; +- } +- +- /* Copy section into sub module hash table */ +- new_name = XDUPVEC (char, ls->name, strlen (ls->name) + 1); +- s_slot.name = new_name; +- hash_slot = htab_find_slot (file_data->section_hash_table, &s_slot, INSERT); +- gcc_assert (*hash_slot == NULL); +- +- new_slot = XDUP (struct lto_section_slot, ls); +- new_slot->name = new_name; +- *hash_slot = new_slot; +- return 1; +-} +- +-/* Read declarations and other initializations for a FILE_DATA. */ +- +-static void +-lto_file_finalize (struct lto_file_decl_data *file_data, lto_file *file) +-{ +- const char *data; +- size_t len; +- vec +- resolutions = vNULL; +- int i; +- res_pair *rp; +- +- /* Create vector for fast access of resolution. We do this lazily +- to save memory. */ +- resolutions.safe_grow_cleared (file_data->max_index + 1); +- for (i = 0; file_data->respairs.iterate (i, &rp); i++) +- resolutions[rp->index] = rp->res; +- file_data->respairs.release (); +- +- file_data->renaming_hash_table = lto_create_renaming_table (); +- file_data->file_name = file->filename; +-#ifdef ACCEL_COMPILER +- lto_input_mode_table (file_data); +-#else +- file_data->mode_table = lto_mode_identity_table; +-#endif +- data = lto_get_section_data (file_data, LTO_section_decls, NULL, &len); +- if (data == NULL) +- { +- internal_error ("cannot read LTO decls from %s", file_data->file_name); +- return; +- } +- /* Frees resolutions */ +- lto_read_decls (file_data, data, resolutions); +- lto_free_section_data (file_data, LTO_section_decls, NULL, data, len); +-} +- +-/* Finalize FILE_DATA in FILE and increase COUNT. */ +- +-static int +-lto_create_files_from_ids (lto_file *file, struct lto_file_decl_data *file_data, +- int *count) +-{ +- lto_file_finalize (file_data, file); +- if (symtab->dump_file) +- fprintf (symtab->dump_file, +- "Creating file %s with sub id " HOST_WIDE_INT_PRINT_HEX "\n", +- file_data->file_name, file_data->id); +- (*count)++; +- return 0; +-} +- +-/* Generate a TREE representation for all types and external decls +- entities in FILE. +- +- Read all of the globals out of the file. Then read the cgraph +- and process the .o index into the cgraph nodes so that it can open +- the .o file to load the functions and ipa information. 
*/ +- +-static struct lto_file_decl_data * +-lto_file_read (lto_file *file, FILE *resolution_file, int *count) +-{ +- struct lto_file_decl_data *file_data = NULL; +- splay_tree file_ids; +- htab_t section_hash_table; +- struct lto_section_slot *section; +- struct file_data_list file_list; +- struct lto_section_list section_list; +- +- memset (&section_list, 0, sizeof (struct lto_section_list)); +- section_hash_table = lto_obj_build_section_table (file, &section_list); +- +- /* Find all sub modules in the object and put their sections into new hash +- tables in a splay tree. */ +- file_ids = lto_splay_tree_new (); +- memset (&file_list, 0, sizeof (struct file_data_list)); +- for (section = section_list.first; section != NULL; section = section->next) +- create_subid_section_table (section, file_ids, &file_list); +- +- /* Add resolutions to file ids */ +- lto_resolution_read (file_ids, resolution_file, file); +- +- /* Finalize each lto file for each submodule in the merged object */ +- for (file_data = file_list.first; file_data != NULL; file_data = file_data->next) +- lto_create_files_from_ids (file, file_data, count); +- +- splay_tree_delete (file_ids); +- htab_delete (section_hash_table); +- +- return file_list.first; +-} +- +-#if HAVE_MMAP_FILE && HAVE_SYSCONF && defined _SC_PAGE_SIZE +-#define LTO_MMAP_IO 1 +-#endif +- +-#if LTO_MMAP_IO +-/* Page size of machine is used for mmap and munmap calls. */ +-static size_t page_mask; +-#endif +- +-/* Get the section data of length LEN from FILENAME starting at +- OFFSET. The data segment must be freed by the caller when the +- caller is finished. Returns NULL if all was not well. */ +- +-static char * +-lto_read_section_data (struct lto_file_decl_data *file_data, +- intptr_t offset, size_t len) +-{ +- char *result; +- static int fd = -1; +- static char *fd_name; +-#if LTO_MMAP_IO +- intptr_t computed_len; +- intptr_t computed_offset; +- intptr_t diff; +-#endif +- +- /* Keep a single-entry file-descriptor cache. The last file we +- touched will get closed at exit. +- ??? Eventually we want to add a more sophisticated larger cache +- or rather fix function body streaming to not stream them in +- practically random order. */ +- if (fd != -1 +- && filename_cmp (fd_name, file_data->file_name) != 0) +- { +- free (fd_name); +- close (fd); +- fd = -1; +- } +- if (fd == -1) +- { +- fd = open (file_data->file_name, O_RDONLY|O_BINARY); +- if (fd == -1) +- { +- fatal_error (input_location, "Cannot open %s", file_data->file_name); +- return NULL; +- } +- fd_name = xstrdup (file_data->file_name); +- } +- +-#if LTO_MMAP_IO +- if (!page_mask) +- { +- size_t page_size = sysconf (_SC_PAGE_SIZE); +- page_mask = ~(page_size - 1); +- } +- +- computed_offset = offset & page_mask; +- diff = offset - computed_offset; +- computed_len = len + diff; +- +- result = (char *) mmap (NULL, computed_len, PROT_READ, MAP_PRIVATE, +- fd, computed_offset); +- if (result == MAP_FAILED) +- { +- fatal_error (input_location, "Cannot map %s", file_data->file_name); +- return NULL; +- } +- +- return result + diff; +-#else +- result = (char *) xmalloc (len); +- if (lseek (fd, offset, SEEK_SET) != offset +- || read (fd, result, len) != (ssize_t) len) +- { +- free (result); +- fatal_error (input_location, "Cannot read %s", file_data->file_name); +- result = NULL; +- } +-#ifdef __MINGW32__ +- /* Native windows doesn't supports delayed unlink on opened file. So +- we close file here again. This produces higher I/O load, but at least +- it prevents to have dangling file handles preventing unlink. 
*/ +- free (fd_name); +- fd_name = NULL; +- close (fd); +- fd = -1; +-#endif +- return result; +-#endif +-} ++ struct cgraph_node *node; ++ timevar_id_t lto_timer; + ++ if (!quiet_flag) ++ fprintf (stderr, ++ flag_wpa ? "Materializing decls:" : "Reading function bodies:"); + +-/* Get the section data from FILE_DATA of SECTION_TYPE with NAME. +- NAME will be NULL unless the section type is for a function +- body. */ + +-static const char * +-get_section_data (struct lto_file_decl_data *file_data, +- enum lto_section_type section_type, +- const char *name, +- size_t *len) +-{ +- htab_t section_hash_table = file_data->section_hash_table; +- struct lto_section_slot *f_slot; +- struct lto_section_slot s_slot; +- const char *section_name = lto_get_section_name (section_type, name, file_data); +- char *data = NULL; +- +- *len = 0; +- s_slot.name = section_name; +- f_slot = (struct lto_section_slot *) htab_find (section_hash_table, &s_slot); +- if (f_slot) ++ FOR_EACH_FUNCTION (node) + { +- data = lto_read_section_data (file_data, f_slot->start, f_slot->len); +- *len = f_slot->len; ++ if (node->lto_file_data) ++ { ++ lto_materialize_function (node); ++ lto_stats.num_input_cgraph_nodes++; ++ } + } + +- free (CONST_CAST (char *, section_name)); +- return data; +-} +- + +-/* Free the section data from FILE_DATA of SECTION_TYPE with NAME that +- starts at OFFSET and has LEN bytes. */ ++ /* Start the appropriate timer depending on the mode that we are ++ operating in. */ ++ lto_timer = (flag_wpa) ? TV_WHOPR_WPA ++ : (flag_ltrans) ? TV_WHOPR_LTRANS ++ : TV_LTO; ++ timevar_push (lto_timer); + +-static void +-free_section_data (struct lto_file_decl_data *file_data ATTRIBUTE_UNUSED, +- enum lto_section_type section_type ATTRIBUTE_UNUSED, +- const char *name ATTRIBUTE_UNUSED, +- const char *offset, size_t len ATTRIBUTE_UNUSED) +-{ +-#if LTO_MMAP_IO +- intptr_t computed_len; +- intptr_t computed_offset; +- intptr_t diff; +-#endif ++ current_function_decl = NULL; ++ set_cfun (NULL); + +-#if LTO_MMAP_IO +- computed_offset = ((intptr_t) offset) & page_mask; +- diff = (intptr_t) offset - computed_offset; +- computed_len = len + diff; ++ if (!quiet_flag) ++ fprintf (stderr, "\n"); + +- munmap ((caddr_t) computed_offset, computed_len); +-#else +- free (CONST_CAST(char *, offset)); +-#endif ++ timevar_pop (lto_timer); + } + +-static lto_file *current_lto_file; +- + /* Actually stream out ENCODER into TEMP_FILENAME. */ + + static void +@@ -2560,581 +411,6 @@ lto_wpa_write_files (void) + timevar_pop (TV_WHOPR_WPA_IO); + } + +- +-/* If TT is a variable or function decl replace it with its +- prevailing variant. */ +-#define LTO_SET_PREVAIL(tt) \ +- do {\ +- if ((tt) && VAR_OR_FUNCTION_DECL_P (tt) \ +- && (TREE_PUBLIC (tt) || DECL_EXTERNAL (tt))) \ +- { \ +- tt = lto_symtab_prevailing_decl (tt); \ +- fixed = true; \ +- } \ +- } while (0) +- +-/* Ensure that TT isn't a replacable var of function decl. */ +-#define LTO_NO_PREVAIL(tt) \ +- gcc_checking_assert (!(tt) || !VAR_OR_FUNCTION_DECL_P (tt)) +- +-/* Given a tree T replace all fields referring to variables or functions +- with their prevailing variant. */ +-static void +-lto_fixup_prevailing_decls (tree t) +-{ +- enum tree_code code = TREE_CODE (t); +- bool fixed = false; +- +- gcc_checking_assert (code != TREE_BINFO); +- LTO_NO_PREVAIL (TREE_TYPE (t)); +- if (CODE_CONTAINS_STRUCT (code, TS_COMMON) +- /* lto_symtab_prevail_decl use TREE_CHAIN to link to the prevailing decl. +- in the case T is a prevailed declaration we would ICE here. 
*/ +- && !VAR_OR_FUNCTION_DECL_P (t)) +- LTO_NO_PREVAIL (TREE_CHAIN (t)); +- if (DECL_P (t)) +- { +- LTO_NO_PREVAIL (DECL_NAME (t)); +- LTO_SET_PREVAIL (DECL_CONTEXT (t)); +- if (CODE_CONTAINS_STRUCT (code, TS_DECL_COMMON)) +- { +- LTO_SET_PREVAIL (DECL_SIZE (t)); +- LTO_SET_PREVAIL (DECL_SIZE_UNIT (t)); +- LTO_SET_PREVAIL (DECL_INITIAL (t)); +- LTO_NO_PREVAIL (DECL_ATTRIBUTES (t)); +- LTO_SET_PREVAIL (DECL_ABSTRACT_ORIGIN (t)); +- } +- if (CODE_CONTAINS_STRUCT (code, TS_DECL_WITH_VIS)) +- { +- LTO_NO_PREVAIL (DECL_ASSEMBLER_NAME_RAW (t)); +- } +- if (CODE_CONTAINS_STRUCT (code, TS_DECL_NON_COMMON)) +- { +- LTO_NO_PREVAIL (DECL_RESULT_FLD (t)); +- } +- if (CODE_CONTAINS_STRUCT (code, TS_FUNCTION_DECL)) +- { +- LTO_NO_PREVAIL (DECL_ARGUMENTS (t)); +- LTO_SET_PREVAIL (DECL_FUNCTION_PERSONALITY (t)); +- LTO_NO_PREVAIL (DECL_VINDEX (t)); +- } +- if (CODE_CONTAINS_STRUCT (code, TS_FIELD_DECL)) +- { +- LTO_SET_PREVAIL (DECL_FIELD_OFFSET (t)); +- LTO_NO_PREVAIL (DECL_BIT_FIELD_TYPE (t)); +- LTO_NO_PREVAIL (DECL_QUALIFIER (t)); +- LTO_NO_PREVAIL (DECL_FIELD_BIT_OFFSET (t)); +- LTO_NO_PREVAIL (DECL_FCONTEXT (t)); +- } +- } +- else if (TYPE_P (t)) +- { +- LTO_NO_PREVAIL (TYPE_CACHED_VALUES (t)); +- LTO_SET_PREVAIL (TYPE_SIZE (t)); +- LTO_SET_PREVAIL (TYPE_SIZE_UNIT (t)); +- LTO_NO_PREVAIL (TYPE_ATTRIBUTES (t)); +- LTO_NO_PREVAIL (TYPE_NAME (t)); +- +- LTO_SET_PREVAIL (TYPE_MIN_VALUE_RAW (t)); +- LTO_SET_PREVAIL (TYPE_MAX_VALUE_RAW (t)); +- LTO_NO_PREVAIL (TYPE_LANG_SLOT_1 (t)); +- +- LTO_SET_PREVAIL (TYPE_CONTEXT (t)); +- +- LTO_NO_PREVAIL (TYPE_CANONICAL (t)); +- LTO_NO_PREVAIL (TYPE_MAIN_VARIANT (t)); +- LTO_NO_PREVAIL (TYPE_NEXT_VARIANT (t)); +- } +- else if (EXPR_P (t)) +- { +- int i; +- for (i = TREE_OPERAND_LENGTH (t) - 1; i >= 0; --i) +- LTO_SET_PREVAIL (TREE_OPERAND (t, i)); +- } +- else if (TREE_CODE (t) == CONSTRUCTOR) +- { +- unsigned i; +- tree val; +- FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (t), i, val) +- LTO_SET_PREVAIL (val); +- } +- else +- { +- switch (code) +- { +- case TREE_LIST: +- LTO_SET_PREVAIL (TREE_VALUE (t)); +- LTO_SET_PREVAIL (TREE_PURPOSE (t)); +- LTO_NO_PREVAIL (TREE_PURPOSE (t)); +- break; +- default: +- gcc_unreachable (); +- } +- } +- /* If we fixed nothing, then we missed something seen by +- mentions_vars_p. */ +- gcc_checking_assert (fixed); +-} +-#undef LTO_SET_PREVAIL +-#undef LTO_NO_PREVAIL +- +-/* Helper function of lto_fixup_decls. Walks the var and fn streams in STATE, +- replaces var and function decls with the corresponding prevailing def. */ +- +-static void +-lto_fixup_state (struct lto_in_decl_state *state) +-{ +- unsigned i, si; +- +- /* Although we only want to replace FUNCTION_DECLs and VAR_DECLs, +- we still need to walk from all DECLs to find the reachable +- FUNCTION_DECLs and VAR_DECLs. */ +- for (si = 0; si < LTO_N_DECL_STREAMS; si++) +- { +- vec *trees = state->streams[si]; +- for (i = 0; i < vec_safe_length (trees); i++) +- { +- tree t = (*trees)[i]; +- if (flag_checking && TYPE_P (t)) +- verify_type (t); +- if (VAR_OR_FUNCTION_DECL_P (t) +- && (TREE_PUBLIC (t) || DECL_EXTERNAL (t))) +- (*trees)[i] = lto_symtab_prevailing_decl (t); +- } +- } +-} +- +-/* Fix the decls from all FILES. Replaces each decl with the corresponding +- prevailing one. 
*/ +- +-static void +-lto_fixup_decls (struct lto_file_decl_data **files) +-{ +- unsigned int i; +- tree t; +- +- if (tree_with_vars) +- FOR_EACH_VEC_ELT ((*tree_with_vars), i, t) +- lto_fixup_prevailing_decls (t); +- +- for (i = 0; files[i]; i++) +- { +- struct lto_file_decl_data *file = files[i]; +- struct lto_in_decl_state *state = file->global_decl_state; +- lto_fixup_state (state); +- +- hash_table::iterator iter; +- lto_in_decl_state *elt; +- FOR_EACH_HASH_TABLE_ELEMENT (*file->function_decl_states, elt, +- lto_in_decl_state *, iter) +- lto_fixup_state (elt); +- } +-} +- +-static GTY((length ("lto_stats.num_input_files + 1"))) struct lto_file_decl_data **all_file_decl_data; +- +-/* Turn file datas for sub files into a single array, so that they look +- like separate files for further passes. */ +- +-static void +-lto_flatten_files (struct lto_file_decl_data **orig, int count, int last_file_ix) +-{ +- struct lto_file_decl_data *n, *next; +- int i, k; +- +- lto_stats.num_input_files = count; +- all_file_decl_data +- = ggc_cleared_vec_alloc (count + 1); +- /* Set the hooks so that all of the ipa passes can read in their data. */ +- lto_set_in_hooks (all_file_decl_data, get_section_data, free_section_data); +- for (i = 0, k = 0; i < last_file_ix; i++) +- { +- for (n = orig[i]; n != NULL; n = next) +- { +- all_file_decl_data[k++] = n; +- next = n->next; +- n->next = NULL; +- } +- } +- all_file_decl_data[k] = NULL; +- gcc_assert (k == count); +-} +- +-/* Input file data before flattening (i.e. splitting them to subfiles to support +- incremental linking. */ +-static int real_file_count; +-static GTY((length ("real_file_count + 1"))) struct lto_file_decl_data **real_file_decl_data; +- +-static void print_lto_report_1 (void); +- +-/* Read all the symbols from the input files FNAMES. NFILES is the +- number of files requested in the command line. Instantiate a +- global call graph by aggregating all the sub-graphs found in each +- file. */ +- +-static void +-read_cgraph_and_symbols (unsigned nfiles, const char **fnames) +-{ +- unsigned int i, last_file_ix; +- FILE *resolution; +- int count = 0; +- struct lto_file_decl_data **decl_data; +- symtab_node *snode; +- +- symtab->initialize (); +- +- timevar_push (TV_IPA_LTO_DECL_IN); +- +-#ifdef ACCEL_COMPILER +- section_name_prefix = OFFLOAD_SECTION_NAME_PREFIX; +- lto_stream_offload_p = true; +-#endif +- +- real_file_decl_data +- = decl_data = ggc_cleared_vec_alloc (nfiles + 1); +- real_file_count = nfiles; +- +- /* Read the resolution file. */ +- resolution = NULL; +- if (resolution_file_name) +- { +- int t; +- unsigned num_objects; +- +- resolution = fopen (resolution_file_name, "r"); +- if (resolution == NULL) +- fatal_error (input_location, +- "could not open symbol resolution file: %m"); +- +- t = fscanf (resolution, "%u", &num_objects); +- gcc_assert (t == 1); +- +- /* True, since the plugin splits the archives. */ +- gcc_assert (num_objects == nfiles); +- } +- symtab->state = LTO_STREAMING; +- +- canonical_type_hash_cache = new hash_map (251); +- gimple_canonical_types = htab_create (16381, gimple_canonical_type_hash, +- gimple_canonical_type_eq, NULL); +- gcc_obstack_init (&tree_scc_hash_obstack); +- tree_scc_hash = new hash_table (4096); +- +- /* Register the common node types with the canonical type machinery so +- we properly share alias-sets across languages and TUs. Do not +- expose the common nodes as type merge target - those that should be +- are already exposed so by pre-loading the LTO streamer caches. 
+- Do two passes - first clear TYPE_CANONICAL and then re-compute it. */ +- for (i = 0; i < itk_none; ++i) +- lto_register_canonical_types (integer_types[i], true); +- for (i = 0; i < stk_type_kind_last; ++i) +- lto_register_canonical_types (sizetype_tab[i], true); +- for (i = 0; i < TI_MAX; ++i) +- lto_register_canonical_types (global_trees[i], true); +- for (i = 0; i < itk_none; ++i) +- lto_register_canonical_types (integer_types[i], false); +- for (i = 0; i < stk_type_kind_last; ++i) +- lto_register_canonical_types (sizetype_tab[i], false); +- for (i = 0; i < TI_MAX; ++i) +- lto_register_canonical_types (global_trees[i], false); +- +- if (!quiet_flag) +- fprintf (stderr, "Reading object files:"); +- +- /* Read all of the object files specified on the command line. */ +- for (i = 0, last_file_ix = 0; i < nfiles; ++i) +- { +- struct lto_file_decl_data *file_data = NULL; +- if (!quiet_flag) +- { +- fprintf (stderr, " %s", fnames[i]); +- fflush (stderr); +- } +- +- current_lto_file = lto_obj_file_open (fnames[i], false); +- if (!current_lto_file) +- break; +- +- file_data = lto_file_read (current_lto_file, resolution, &count); +- if (!file_data) +- { +- lto_obj_file_close (current_lto_file); +- free (current_lto_file); +- current_lto_file = NULL; +- break; +- } +- +- decl_data[last_file_ix++] = file_data; +- +- lto_obj_file_close (current_lto_file); +- free (current_lto_file); +- current_lto_file = NULL; +- } +- +- lto_flatten_files (decl_data, count, last_file_ix); +- lto_stats.num_input_files = count; +- ggc_free(decl_data); +- real_file_decl_data = NULL; +- +- if (resolution_file_name) +- fclose (resolution); +- +- /* Show the LTO report before launching LTRANS. */ +- if (flag_lto_report || (flag_wpa && flag_lto_report_wpa)) +- print_lto_report_1 (); +- +- /* Free gimple type merging datastructures. */ +- delete tree_scc_hash; +- tree_scc_hash = NULL; +- obstack_free (&tree_scc_hash_obstack, NULL); +- htab_delete (gimple_canonical_types); +- gimple_canonical_types = NULL; +- delete canonical_type_hash_cache; +- canonical_type_hash_cache = NULL; +- +- /* At this stage we know that majority of GGC memory is reachable. +- Growing the limits prevents unnecesary invocation of GGC. */ +- ggc_grow (); +- ggc_collect (); +- +- /* Set the hooks so that all of the ipa passes can read in their data. */ +- lto_set_in_hooks (all_file_decl_data, get_section_data, free_section_data); +- +- timevar_pop (TV_IPA_LTO_DECL_IN); +- +- if (!quiet_flag) +- fprintf (stderr, "\nReading the callgraph\n"); +- +- timevar_push (TV_IPA_LTO_CGRAPH_IO); +- /* Read the symtab. */ +- input_symtab (); +- +- input_offload_tables (!flag_ltrans); +- +- /* Store resolutions into the symbol table. 
*/ +- +- FOR_EACH_SYMBOL (snode) +- if (snode->externally_visible && snode->real_symbol_p () +- && snode->lto_file_data && snode->lto_file_data->resolution_map +- && !(TREE_CODE (snode->decl) == FUNCTION_DECL +- && fndecl_built_in_p (snode->decl)) +- && !(VAR_P (snode->decl) && DECL_HARD_REGISTER (snode->decl))) +- { +- ld_plugin_symbol_resolution_t *res; +- +- res = snode->lto_file_data->resolution_map->get (snode->decl); +- if (!res || *res == LDPR_UNKNOWN) +- { +- if (snode->output_to_lto_symbol_table_p ()) +- fatal_error (input_location, "missing resolution data for %s", +- IDENTIFIER_POINTER +- (DECL_ASSEMBLER_NAME (snode->decl))); +- } +- else +- snode->resolution = *res; +- } +- for (i = 0; all_file_decl_data[i]; i++) +- if (all_file_decl_data[i]->resolution_map) +- { +- delete all_file_decl_data[i]->resolution_map; +- all_file_decl_data[i]->resolution_map = NULL; +- } +- +- timevar_pop (TV_IPA_LTO_CGRAPH_IO); +- +- if (!quiet_flag) +- fprintf (stderr, "Merging declarations\n"); +- +- timevar_push (TV_IPA_LTO_DECL_MERGE); +- /* Merge global decls. In ltrans mode we read merged cgraph, we do not +- need to care about resolving symbols again, we only need to replace +- duplicated declarations read from the callgraph and from function +- sections. */ +- if (!flag_ltrans) +- { +- lto_symtab_merge_decls (); +- +- /* If there were errors during symbol merging bail out, we have no +- good way to recover here. */ +- if (seen_error ()) +- fatal_error (input_location, +- "errors during merging of translation units"); +- +- /* Fixup all decls. */ +- lto_fixup_decls (all_file_decl_data); +- } +- if (tree_with_vars) +- ggc_free (tree_with_vars); +- tree_with_vars = NULL; +- ggc_collect (); +- +- timevar_pop (TV_IPA_LTO_DECL_MERGE); +- /* Each pass will set the appropriate timer. */ +- +- if (!quiet_flag) +- fprintf (stderr, "Reading summaries\n"); +- +- /* Read the IPA summary data. */ +- if (flag_ltrans) +- ipa_read_optimization_summaries (); +- else +- ipa_read_summaries (); +- +- for (i = 0; all_file_decl_data[i]; i++) +- { +- gcc_assert (all_file_decl_data[i]->symtab_node_encoder); +- lto_symtab_encoder_delete (all_file_decl_data[i]->symtab_node_encoder); +- all_file_decl_data[i]->symtab_node_encoder = NULL; +- lto_free_function_in_decl_state (all_file_decl_data[i]->global_decl_state); +- all_file_decl_data[i]->global_decl_state = NULL; +- all_file_decl_data[i]->current_decl_state = NULL; +- } +- +- if (!flag_ltrans) +- { +- /* Finally merge the cgraph according to the decl merging decisions. */ +- timevar_push (TV_IPA_LTO_CGRAPH_MERGE); +- +- gcc_assert (!dump_file); +- dump_file = dump_begin (lto_link_dump_id, NULL); +- +- if (dump_file) +- { +- fprintf (dump_file, "Before merging:\n"); +- symtab->dump (dump_file); +- } +- lto_symtab_merge_symbols (); +- /* Removal of unreachable symbols is needed to make verify_symtab to pass; +- we are still having duplicated comdat groups containing local statics. +- We could also just remove them while merging. */ +- symtab->remove_unreachable_nodes (dump_file); +- ggc_collect (); +- +- if (dump_file) +- dump_end (lto_link_dump_id, dump_file); +- dump_file = NULL; +- timevar_pop (TV_IPA_LTO_CGRAPH_MERGE); +- } +- symtab->state = IPA_SSA; +- /* All node removals happening here are useless, because +- WPA should not stream them. 
Still always perform remove_unreachable_nodes +- because we may reshape clone tree, get rid of dead masters of inline +- clones and remove symbol entries for read-only variables we keep around +- only to be able to constant fold them. */ +- if (flag_ltrans) +- { +- if (symtab->dump_file) +- symtab->dump (symtab->dump_file); +- symtab->remove_unreachable_nodes (symtab->dump_file); +- } +- +- /* Indicate that the cgraph is built and ready. */ +- symtab->function_flags_ready = true; +- +- ggc_free (all_file_decl_data); +- all_file_decl_data = NULL; +-} +- +- +-/* Materialize all the bodies for all the nodes in the callgraph. */ +- +-static void +-materialize_cgraph (void) +-{ +- struct cgraph_node *node; +- timevar_id_t lto_timer; +- +- if (!quiet_flag) +- fprintf (stderr, +- flag_wpa ? "Materializing decls:" : "Reading function bodies:"); +- +- +- FOR_EACH_FUNCTION (node) +- { +- if (node->lto_file_data) +- { +- lto_materialize_function (node); +- lto_stats.num_input_cgraph_nodes++; +- } +- } +- +- +- /* Start the appropriate timer depending on the mode that we are +- operating in. */ +- lto_timer = (flag_wpa) ? TV_WHOPR_WPA +- : (flag_ltrans) ? TV_WHOPR_LTRANS +- : TV_LTO; +- timevar_push (lto_timer); +- +- current_function_decl = NULL; +- set_cfun (NULL); +- +- if (!quiet_flag) +- fprintf (stderr, "\n"); +- +- timevar_pop (lto_timer); +-} +- +- +-/* Show various memory usage statistics related to LTO. */ +-static void +-print_lto_report_1 (void) +-{ +- const char *pfx = (flag_lto) ? "LTO" : (flag_wpa) ? "WPA" : "LTRANS"; +- fprintf (stderr, "%s statistics\n", pfx); +- +- fprintf (stderr, "[%s] read %lu SCCs of average size %f\n", +- pfx, num_sccs_read, total_scc_size / (double)num_sccs_read); +- fprintf (stderr, "[%s] %lu tree bodies read in total\n", pfx, total_scc_size); +- if (flag_wpa && tree_scc_hash) +- { +- fprintf (stderr, "[%s] tree SCC table: size %ld, %ld elements, " +- "collision ratio: %f\n", pfx, +- (long) tree_scc_hash->size (), +- (long) tree_scc_hash->elements (), +- tree_scc_hash->collisions ()); +- hash_table::iterator hiter; +- tree_scc *scc, *max_scc = NULL; +- unsigned max_length = 0; +- FOR_EACH_HASH_TABLE_ELEMENT (*tree_scc_hash, scc, x, hiter) +- { +- unsigned length = 0; +- tree_scc *s = scc; +- for (; s; s = s->next) +- length++; +- if (length > max_length) +- { +- max_length = length; +- max_scc = scc; +- } +- } +- fprintf (stderr, "[%s] tree SCC max chain length %u (size %u)\n", +- pfx, max_length, max_scc->len); +- fprintf (stderr, "[%s] Compared %lu SCCs, %lu collisions (%f)\n", pfx, +- num_scc_compares, num_scc_compare_collisions, +- num_scc_compare_collisions / (double) num_scc_compares); +- fprintf (stderr, "[%s] Merged %lu SCCs\n", pfx, num_sccs_merged); +- fprintf (stderr, "[%s] Merged %lu tree bodies\n", pfx, +- total_scc_size_merged); +- fprintf (stderr, "[%s] Merged %lu types\n", pfx, num_merged_types); +- fprintf (stderr, "[%s] %lu types prevailed (%lu associated trees)\n", +- pfx, num_prevailing_types, num_type_scc_trees); +- fprintf (stderr, "[%s] GIMPLE canonical type table: size %ld, " +- "%ld elements, %ld searches, %ld collisions (ratio: %f)\n", pfx, +- (long) htab_size (gimple_canonical_types), +- (long) htab_elements (gimple_canonical_types), +- (long) gimple_canonical_types->searches, +- (long) gimple_canonical_types->collisions, +- htab_collisions (gimple_canonical_types)); +- fprintf (stderr, "[%s] GIMPLE canonical type pointer-map: " +- "%lu elements, %ld searches\n", pfx, +- num_canonical_type_hash_entries, +- 
num_canonical_type_hash_queries); +- } +- +- print_lto_report (pfx); +-} +- + /* Perform whole program analysis (WPA) on the callgraph and write out the + optimization plan. */ + +@@ -3262,64 +538,6 @@ do_whole_program_analysis (void) + dump_memory_report (true); + } + +- +-static GTY(()) tree lto_eh_personality_decl; +- +-/* Return the LTO personality function decl. */ +- +-tree +-lto_eh_personality (void) +-{ +- if (!lto_eh_personality_decl) +- { +- /* Use the first personality DECL for our personality if we don't +- support multiple ones. This ensures that we don't artificially +- create the need for them in a single-language program. */ +- if (first_personality_decl && !dwarf2out_do_cfi_asm ()) +- lto_eh_personality_decl = first_personality_decl; +- else +- lto_eh_personality_decl = lhd_gcc_personality (); +- } +- +- return lto_eh_personality_decl; +-} +- +-/* Set the process name based on the LTO mode. */ +- +-static void +-lto_process_name (void) +-{ +- if (flag_lto) +- setproctitle (flag_incremental_link == INCREMENTAL_LINK_LTO +- ? "lto1-inclink" : "lto1-lto"); +- if (flag_wpa) +- setproctitle ("lto1-wpa"); +- if (flag_ltrans) +- setproctitle ("lto1-ltrans"); +-} +- +- +-/* Initialize the LTO front end. */ +- +-static void +-lto_init (void) +-{ +- lto_process_name (); +- lto_streamer_hooks_init (); +- lto_reader_init (); +- lto_set_in_hooks (NULL, get_section_data, free_section_data); +- memset (&lto_stats, 0, sizeof (lto_stats)); +- bitmap_obstack_initialize (NULL); +- gimple_register_cfg_hooks (); +-#ifndef ACCEL_COMPILER +- unsigned char *table +- = ggc_vec_alloc (MAX_MACHINE_MODE); +- for (int m = 0; m < MAX_MACHINE_MODE; m++) +- table[m] = m; +- lto_mode_identity_table = table; +-#endif +-} +- + /* Create artificial pointers for "omp declare target link" vars. */ + + static void +@@ -3351,7 +569,6 @@ offload_handle_link_vars (void) + #endif + } + +- + /* Main entry point for the GIMPLE front end. This front end has + three main personalities: + +@@ -3386,7 +603,7 @@ lto_main (void) + timevar_start (TV_PHASE_SETUP); + + /* Initialize the LTO front end. */ +- lto_init (); ++ lto_fe_init (); + + timevar_stop (TV_PHASE_SETUP); + timevar_start (TV_PHASE_STREAM_IN); +@@ -3439,5 +656,3 @@ lto_main (void) + timevar_start (TV_PHASE_PARSING); + timevar_push (TV_PARSE_GLOBAL); + } +- +-#include "gt-lto-lto.h" +diff --git a/gcc/machmode.h b/gcc/machmode.h +index d564f9c64..a507ed66c 100644 +--- a/gcc/machmode.h ++++ b/gcc/machmode.h +@@ -244,14 +244,15 @@ class opt_mode + public: + enum from_int { dummy = MAX_MACHINE_MODE }; + +- ALWAYS_INLINE opt_mode () : m_mode (E_VOIDmode) {} +- ALWAYS_INLINE opt_mode (const T &m) : m_mode (m) {} ++ ALWAYS_INLINE CONSTEXPR opt_mode () : m_mode (E_VOIDmode) {} ++ ALWAYS_INLINE CONSTEXPR opt_mode (const T &m) : m_mode (m) {} + template +- ALWAYS_INLINE opt_mode (const U &m) : m_mode (T (m)) {} +- ALWAYS_INLINE opt_mode (from_int m) : m_mode (machine_mode (m)) {} ++ ALWAYS_INLINE CONSTEXPR opt_mode (const U &m) : m_mode (T (m)) {} ++ ALWAYS_INLINE CONSTEXPR opt_mode (from_int m) : m_mode (machine_mode (m)) {} + + machine_mode else_void () const; +- machine_mode else_blk () const; ++ machine_mode else_blk () const { return else_mode (BLKmode); } ++ machine_mode else_mode (machine_mode) const; + T require () const; + + bool exists () const; +@@ -274,13 +275,13 @@ opt_mode::else_void () const + return m_mode; + } + +-/* If the T exists, return its enum value, otherwise return E_BLKmode. 
*/ ++/* If the T exists, return its enum value, otherwise return FALLBACK. */ + + template + inline machine_mode +-opt_mode::else_blk () const ++opt_mode::else_mode (machine_mode fallback) const + { +- return m_mode == E_VOIDmode ? E_BLKmode : m_mode; ++ return m_mode == E_VOIDmode ? fallback : m_mode; + } + + /* Assert that the object contains a T and return it. */ +@@ -326,8 +327,12 @@ struct pod_mode + typedef typename T::measurement_type measurement_type; + + machine_mode m_mode; +- ALWAYS_INLINE operator machine_mode () const { return m_mode; } +- ALWAYS_INLINE operator T () const { return from_int (m_mode); } ++ ALWAYS_INLINE CONSTEXPR ++ operator machine_mode () const { return m_mode; } ++ ++ ALWAYS_INLINE CONSTEXPR ++ operator T () const { return from_int (m_mode); } ++ + ALWAYS_INLINE pod_mode &operator = (const T &m) { m_mode = m; return *this; } + }; + +@@ -405,8 +410,11 @@ public: + typedef unsigned short measurement_type; + + ALWAYS_INLINE scalar_int_mode () {} +- ALWAYS_INLINE scalar_int_mode (from_int m) : m_mode (machine_mode (m)) {} +- ALWAYS_INLINE operator machine_mode () const { return m_mode; } ++ ++ ALWAYS_INLINE CONSTEXPR ++ scalar_int_mode (from_int m) : m_mode (machine_mode (m)) {} ++ ++ ALWAYS_INLINE CONSTEXPR operator machine_mode () const { return m_mode; } + + static bool includes_p (machine_mode); + +@@ -430,8 +438,11 @@ public: + typedef unsigned short measurement_type; + + ALWAYS_INLINE scalar_float_mode () {} +- ALWAYS_INLINE scalar_float_mode (from_int m) : m_mode (machine_mode (m)) {} +- ALWAYS_INLINE operator machine_mode () const { return m_mode; } ++ ++ ALWAYS_INLINE CONSTEXPR ++ scalar_float_mode (from_int m) : m_mode (machine_mode (m)) {} ++ ++ ALWAYS_INLINE CONSTEXPR operator machine_mode () const { return m_mode; } + + static bool includes_p (machine_mode); + +@@ -455,11 +466,20 @@ public: + typedef unsigned short measurement_type; + + ALWAYS_INLINE scalar_mode () {} +- ALWAYS_INLINE scalar_mode (from_int m) : m_mode (machine_mode (m)) {} +- ALWAYS_INLINE scalar_mode (const scalar_int_mode &m) : m_mode (m) {} +- ALWAYS_INLINE scalar_mode (const scalar_float_mode &m) : m_mode (m) {} +- ALWAYS_INLINE scalar_mode (const scalar_int_mode_pod &m) : m_mode (m) {} +- ALWAYS_INLINE operator machine_mode () const { return m_mode; } ++ ++ ALWAYS_INLINE CONSTEXPR ++ scalar_mode (from_int m) : m_mode (machine_mode (m)) {} ++ ++ ALWAYS_INLINE CONSTEXPR ++ scalar_mode (const scalar_int_mode &m) : m_mode (m) {} ++ ++ ALWAYS_INLINE CONSTEXPR ++ scalar_mode (const scalar_float_mode &m) : m_mode (m) {} ++ ++ ALWAYS_INLINE CONSTEXPR ++ scalar_mode (const scalar_int_mode_pod &m) : m_mode (m) {} ++ ++ ALWAYS_INLINE CONSTEXPR operator machine_mode () const { return m_mode; } + + static bool includes_p (machine_mode); + +@@ -496,8 +516,11 @@ public: + typedef unsigned short measurement_type; + + ALWAYS_INLINE complex_mode () {} +- ALWAYS_INLINE complex_mode (from_int m) : m_mode (machine_mode (m)) {} +- ALWAYS_INLINE operator machine_mode () const { return m_mode; } ++ ++ ALWAYS_INLINE CONSTEXPR ++ complex_mode (from_int m) : m_mode (machine_mode (m)) {} ++ ++ ALWAYS_INLINE CONSTEXPR operator machine_mode () const { return m_mode; } + + static bool includes_p (machine_mode); + +@@ -766,14 +789,29 @@ public: + typedef unsigned short measurement_type; + + ALWAYS_INLINE fixed_size_mode () {} +- ALWAYS_INLINE fixed_size_mode (from_int m) : m_mode (machine_mode (m)) {} +- ALWAYS_INLINE fixed_size_mode (const scalar_mode &m) : m_mode (m) {} +- ALWAYS_INLINE fixed_size_mode 
(const scalar_int_mode &m) : m_mode (m) {} +- ALWAYS_INLINE fixed_size_mode (const scalar_float_mode &m) : m_mode (m) {} +- ALWAYS_INLINE fixed_size_mode (const scalar_mode_pod &m) : m_mode (m) {} +- ALWAYS_INLINE fixed_size_mode (const scalar_int_mode_pod &m) : m_mode (m) {} +- ALWAYS_INLINE fixed_size_mode (const complex_mode &m) : m_mode (m) {} +- ALWAYS_INLINE operator machine_mode () const { return m_mode; } ++ ++ ALWAYS_INLINE CONSTEXPR ++ fixed_size_mode (from_int m) : m_mode (machine_mode (m)) {} ++ ++ ALWAYS_INLINE CONSTEXPR ++ fixed_size_mode (const scalar_mode &m) : m_mode (m) {} ++ ++ ALWAYS_INLINE CONSTEXPR ++ fixed_size_mode (const scalar_int_mode &m) : m_mode (m) {} ++ ++ ALWAYS_INLINE CONSTEXPR ++ fixed_size_mode (const scalar_float_mode &m) : m_mode (m) {} ++ ++ ALWAYS_INLINE CONSTEXPR ++ fixed_size_mode (const scalar_mode_pod &m) : m_mode (m) {} ++ ++ ALWAYS_INLINE CONSTEXPR ++ fixed_size_mode (const scalar_int_mode_pod &m) : m_mode (m) {} ++ ++ ALWAYS_INLINE CONSTEXPR ++ fixed_size_mode (const complex_mode &m) : m_mode (m) {} ++ ++ ALWAYS_INLINE CONSTEXPR operator machine_mode () const { return m_mode; } + + static bool includes_p (machine_mode); + +diff --git a/gcc/match.pd b/gcc/match.pd +index f7e192d9b..facc43387 100644 +--- a/gcc/match.pd ++++ b/gcc/match.pd +@@ -82,12 +82,14 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) + plus minus + mult trunc_div trunc_mod rdiv + min max +- bit_and bit_ior bit_xor) ++ bit_and bit_ior bit_xor ++ lshift rshift) + (define_operator_list COND_BINARY + IFN_COND_ADD IFN_COND_SUB + IFN_COND_MUL IFN_COND_DIV IFN_COND_MOD IFN_COND_RDIV + IFN_COND_MIN IFN_COND_MAX +- IFN_COND_AND IFN_COND_IOR IFN_COND_XOR) ++ IFN_COND_AND IFN_COND_IOR IFN_COND_XOR ++ IFN_COND_SHL IFN_COND_SHR) + + /* Same for ternary operations. */ + (define_operator_list UNCOND_TERNARY +@@ -5378,3 +5380,86 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) + (bit_and:elt_type + (BIT_FIELD_REF:elt_type @0 { size; } { pos; }) + { elt; }))))))) ++ ++(simplify ++ (vec_perm @0 @1 VECTOR_CST@2) ++ (with ++ { ++ tree op0 = @0, op1 = @1, op2 = @2; ++ ++ /* Build a vector of integers from the tree mask. */ ++ vec_perm_builder builder; ++ if (!tree_to_vec_perm_builder (&builder, op2)) ++ return NULL_TREE; ++ ++ /* Create a vec_perm_indices for the integer vector. */ ++ poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (type); ++ bool single_arg = (op0 == op1); ++ vec_perm_indices sel (builder, single_arg ? 1 : 2, nelts); ++ } ++ (if (sel.series_p (0, 1, 0, 1)) ++ { op0; } ++ (if (sel.series_p (0, 1, nelts, 1)) ++ { op1; } ++ (with ++ { ++ if (!single_arg) ++ { ++ if (sel.all_from_input_p (0)) ++ op1 = op0; ++ else if (sel.all_from_input_p (1)) ++ { ++ op0 = op1; ++ sel.rotate_inputs (1); ++ } ++ } ++ gassign *def; ++ tree cop0 = op0, cop1 = op1; ++ if (TREE_CODE (op0) == SSA_NAME ++ && (def = dyn_cast (SSA_NAME_DEF_STMT (op0))) ++ && gimple_assign_rhs_code (def) == CONSTRUCTOR) ++ cop0 = gimple_assign_rhs1 (def); ++ if (TREE_CODE (op1) == SSA_NAME ++ && (def = dyn_cast (SSA_NAME_DEF_STMT (op1))) ++ && gimple_assign_rhs_code (def) == CONSTRUCTOR) ++ cop1 = gimple_assign_rhs1 (def); ++ ++ tree t; ++ } ++ (if ((TREE_CODE (cop0) == VECTOR_CST ++ || TREE_CODE (cop0) == CONSTRUCTOR) ++ && (TREE_CODE (cop1) == VECTOR_CST ++ || TREE_CODE (cop1) == CONSTRUCTOR) ++ && (t = fold_vec_perm (type, cop0, cop1, sel))) ++ { t; } ++ (with ++ { ++ bool changed = (op0 == op1 && !single_arg); ++ ++ /* Generate a canonical form of the selector. 
*/ ++ if (sel.encoding () != builder) ++ { ++ /* Some targets are deficient and fail to expand a single ++ argument permutation while still allowing an equivalent ++ 2-argument version. */ ++ tree oldop2 = op2; ++ if (sel.ninputs () == 2 ++ || can_vec_perm_const_p (TYPE_MODE (type), sel, false)) ++ op2 = vec_perm_indices_to_tree (TREE_TYPE (op2), sel); ++ else ++ { ++ vec_perm_indices sel2 (builder, 2, nelts); ++ if (can_vec_perm_const_p (TYPE_MODE (type), sel2, false)) ++ op2 = vec_perm_indices_to_tree (TREE_TYPE (op2), sel2); ++ else ++ /* Not directly supported with either encoding, ++ so use the preferred form. */ ++ op2 = vec_perm_indices_to_tree (TREE_TYPE (op2), sel); ++ } ++ /* Differences in the encoder do not necessarily mean ++ differences in the resulting vector. */ ++ changed = !operand_equal_p (op2, oldop2, 0); ++ } ++ } ++ (if (changed) ++ (vec_perm { op0; } { op1; } { op2; }))))))))) +diff --git a/gcc/mode-switching.c b/gcc/mode-switching.c +index 2ff21a400..4a34d4a2b 100644 +--- a/gcc/mode-switching.c ++++ b/gcc/mode-switching.c +@@ -165,7 +165,7 @@ new_seginfo (int mode, rtx_insn *insn, int bb, HARD_REG_SET regs_live) + ptr->insn_ptr = insn; + ptr->bbnum = bb; + ptr->next = NULL; +- COPY_HARD_REG_SET (ptr->regs_live, regs_live); ++ ptr->regs_live = regs_live; + return ptr; + } + +@@ -637,7 +637,7 @@ optimize_mode_switching (void) + if (REG_NOTE_KIND (link) == REG_DEAD) + reg_dies (XEXP (link, 0), &live_now); + +- note_stores (PATTERN (insn), reg_becomes_live, &live_now); ++ note_stores (insn, reg_becomes_live, &live_now); + for (link = REG_NOTES (insn); link; link = XEXP (link, 1)) + if (REG_NOTE_KIND (link) == REG_UNUSED) + reg_dies (XEXP (link, 0), &live_now); +diff --git a/gcc/omp-simd-clone.c b/gcc/omp-simd-clone.c +index 10490f34f..d884514cc 100644 +--- a/gcc/omp-simd-clone.c ++++ b/gcc/omp-simd-clone.c +@@ -461,8 +461,7 @@ simd_clone_create (struct cgraph_node *old_node) + if (new_node == NULL) + return new_node; + +- DECL_BUILT_IN_CLASS (new_node->decl) = NOT_BUILT_IN; +- DECL_FUNCTION_CODE (new_node->decl) = (enum built_in_function) 0; ++ set_decl_built_in_function (new_node->decl, NOT_BUILT_IN, 0); + TREE_PUBLIC (new_node->decl) = TREE_PUBLIC (old_node->decl); + DECL_COMDAT (new_node->decl) = DECL_COMDAT (old_node->decl); + DECL_WEAK (new_node->decl) = DECL_WEAK (old_node->decl); +diff --git a/gcc/opt-suggestions.c b/gcc/opt-suggestions.c +index a820c78ff..1ec94203c 100644 +--- a/gcc/opt-suggestions.c ++++ b/gcc/opt-suggestions.c +@@ -307,7 +307,6 @@ test_completion_valid_options (option_proposer &proposer) + "-Wassign-intercept", + "-Wno-format-security", + "-fno-sched-stalled-insns", +- "-fbtr-bb-exclusive", + "-fno-tree-tail-merge", + "-Wlong-long", + "-Wno-unused-but-set-parameter", +diff --git a/gcc/optabs-tree.c b/gcc/optabs-tree.c +index 341e02bd5..7bad9c87b 100644 +--- a/gcc/optabs-tree.c ++++ b/gcc/optabs-tree.c +@@ -267,20 +267,16 @@ optab_for_tree_code (enum tree_code code, const_tree type, + + Convert operations we currently support directly are FIX_TRUNC and FLOAT. + This function checks if these operations are supported +- by the target platform either directly (via vector tree-codes), or via +- target builtins. ++ by the target platform directly (via vector tree-codes). + + Output: + - CODE1 is code of vector operation to be used when +- vectorizing the operation, if available. +- - DECL is decl of target builtin functions to be used +- when vectorizing the operation, if available. In this case, +- CODE1 is CALL_EXPR. 
*/ ++ vectorizing the operation, if available. */ + + bool + supportable_convert_operation (enum tree_code code, + tree vectype_out, tree vectype_in, +- tree *decl, enum tree_code *code1) ++ enum tree_code *code1) + { + machine_mode m1,m2; + bool truncp; +@@ -314,15 +310,6 @@ supportable_convert_operation (enum tree_code code, + return true; + } + +- /* Now check for builtin. */ +- if (targetm.vectorize.builtin_conversion +- && targetm.vectorize.builtin_conversion (code, vectype_out, vectype_in)) +- { +- *code1 = CALL_EXPR; +- *decl = targetm.vectorize.builtin_conversion (code, vectype_out, +- vectype_in); +- return true; +- } + return false; + } + +diff --git a/gcc/optabs-tree.h b/gcc/optabs-tree.h +index 5e4848997..dac350142 100644 +--- a/gcc/optabs-tree.h ++++ b/gcc/optabs-tree.h +@@ -36,7 +36,7 @@ enum optab_subtype + the second argument. The third argument distinguishes between the types of + vector shifts and rotates. */ + optab optab_for_tree_code (enum tree_code, const_tree, enum optab_subtype); +-bool supportable_convert_operation (enum tree_code, tree, tree, tree *, ++bool supportable_convert_operation (enum tree_code, tree, tree, + enum tree_code *); + bool expand_vec_cmp_expr_p (tree, tree, enum tree_code); + bool expand_vec_cond_expr_p (tree, tree, enum tree_code); +diff --git a/gcc/optabs.c b/gcc/optabs.c +index c2c1274eb..d9788d248 100644 +--- a/gcc/optabs.c ++++ b/gcc/optabs.c +@@ -3727,7 +3727,7 @@ emit_libcall_block_1 (rtx_insn *insns, rtx target, rtx result, rtx equiv, + data.first = insns; + data.insn = insn; + data.must_stay = 0; +- note_stores (PATTERN (insn), no_conflict_move_test, &data); ++ note_stores (insn, no_conflict_move_test, &data); + if (! data.must_stay) + { + if (PREV_INSN (insn)) +@@ -6428,7 +6428,7 @@ expand_atomic_compare_and_swap (rtx *ptarget_bool, rtx *ptarget_oval, + /* Otherwise, work out if the compare-and-swap succeeded. */ + cc_reg = NULL_RTX; + if (have_insn_for (COMPARE, CCmode)) +- note_stores (PATTERN (get_last_insn ()), find_cc_set, &cc_reg); ++ note_stores (get_last_insn (), find_cc_set, &cc_reg); + if (cc_reg) + { + target_bool = emit_store_flag_force (target_bool, EQ, cc_reg, +@@ -7181,18 +7181,16 @@ static bool + maybe_legitimize_operand (enum insn_code icode, unsigned int opno, + struct expand_operand *op) + { +- machine_mode mode, imode; +- bool old_volatile_ok, result; ++ machine_mode mode, imode, tmode; + + mode = op->mode; + switch (op->type) + { + case EXPAND_FIXED: +- old_volatile_ok = volatile_ok; +- volatile_ok = true; +- result = maybe_legitimize_operand_same_code (icode, opno, op); +- volatile_ok = old_volatile_ok; +- return result; ++ { ++ temporary_volatile_ok v (true); ++ return maybe_legitimize_operand_same_code (icode, opno, op); ++ } + + case EXPAND_OUTPUT: + gcc_assert (mode != VOIDmode); +@@ -7230,9 +7228,17 @@ maybe_legitimize_operand (enum insn_code icode, unsigned int opno, + gcc_assert (mode != VOIDmode); + + imode = insn_data[(int) icode].operand[opno].mode; ++ tmode = (VECTOR_MODE_P (imode) && !VECTOR_MODE_P (mode) ++ ? 
GET_MODE_INNER (imode) : imode); ++ if (tmode != VOIDmode && tmode != mode) ++ { ++ op->value = convert_modes (tmode, mode, op->value, op->unsigned_p); ++ mode = tmode; ++ } + if (imode != VOIDmode && imode != mode) + { +- op->value = convert_modes (imode, mode, op->value, op->unsigned_p); ++ gcc_assert (VECTOR_MODE_P (imode) && !VECTOR_MODE_P (mode)); ++ op->value = expand_vector_broadcast (imode, op->value); + mode = imode; + } + goto input; +diff --git a/gcc/optabs.def b/gcc/optabs.def +index 8af3a2f43..912766656 100644 +--- a/gcc/optabs.def ++++ b/gcc/optabs.def +@@ -230,6 +230,9 @@ OPTAB_D (cond_umod_optab, "cond_umod$a") + OPTAB_D (cond_and_optab, "cond_and$a") + OPTAB_D (cond_ior_optab, "cond_ior$a") + OPTAB_D (cond_xor_optab, "cond_xor$a") ++OPTAB_D (cond_ashl_optab, "cond_ashl$a") ++OPTAB_D (cond_ashr_optab, "cond_ashr$a") ++OPTAB_D (cond_lshr_optab, "cond_lshr$a") + OPTAB_D (cond_smin_optab, "cond_smin$a") + OPTAB_D (cond_smax_optab, "cond_smax$a") + OPTAB_D (cond_umin_optab, "cond_umin$a") +@@ -256,7 +259,7 @@ OPTAB_D (umul_highpart_optab, "umul$a3_highpart") + OPTAB_D (cmpmem_optab, "cmpmem$a") + OPTAB_D (cmpstr_optab, "cmpstr$a") + OPTAB_D (cmpstrn_optab, "cmpstrn$a") +-OPTAB_D (movmem_optab, "movmem$a") ++OPTAB_D (cpymem_optab, "cpymem$a") + OPTAB_D (setmem_optab, "setmem$a") + OPTAB_D (strlen_optab, "strlen$a") + +@@ -323,6 +326,7 @@ OPTAB_D (reduc_and_scal_optab, "reduc_and_scal_$a") + OPTAB_D (reduc_ior_scal_optab, "reduc_ior_scal_$a") + OPTAB_D (reduc_xor_scal_optab, "reduc_xor_scal_$a") + OPTAB_D (fold_left_plus_optab, "fold_left_plus_$a") ++OPTAB_D (mask_fold_left_plus_optab, "mask_fold_left_plus_$a") + + OPTAB_D (extract_last_optab, "extract_last_$a") + OPTAB_D (fold_extract_last_optab, "fold_extract_last_$a") +@@ -337,6 +341,11 @@ OPTAB_D (udot_prod_optab, "udot_prod$I$a") + OPTAB_D (usum_widen_optab, "widen_usum$I$a3") + OPTAB_D (usad_optab, "usad$I$a") + OPTAB_D (ssad_optab, "ssad$I$a") ++OPTAB_D (smulhs_optab, "smulhs$a3") ++OPTAB_D (smulhrs_optab, "smulhrs$a3") ++OPTAB_D (umulhs_optab, "umulhs$a3") ++OPTAB_D (umulhrs_optab, "umulhrs$a3") ++OPTAB_D (sdiv_pow2_optab, "sdiv_pow2$a3") + OPTAB_D (vec_pack_sfix_trunc_optab, "vec_pack_sfix_trunc_$a") + OPTAB_D (vec_pack_ssat_optab, "vec_pack_ssat_$a") + OPTAB_D (vec_pack_trunc_optab, "vec_pack_trunc_$a") +diff --git a/gcc/optabs.h b/gcc/optabs.h +index 17b5dfb67..18dec50f5 100644 +--- a/gcc/optabs.h ++++ b/gcc/optabs.h +@@ -128,7 +128,11 @@ create_convert_operand_to (struct expand_operand *op, rtx value, + /* Make OP describe an input operand that should have the same value + as VALUE, after any mode conversion that the backend might request. + If VALUE is a CONST_INT, it should be treated as having mode MODE. +- UNSIGNED_P says whether VALUE is unsigned. */ ++ UNSIGNED_P says whether VALUE is unsigned. ++ ++ The conversion of VALUE can include a combination of numerical ++ conversion (as for convert_modes) and duplicating a scalar to fill ++ a vector (if VALUE is a scalar but the operand is a vector). */ + + static inline void + create_convert_operand_from (struct expand_operand *op, rtx value, +diff --git a/gcc/opts-global.c b/gcc/opts-global.c +index 4f8aac7e9..6e4f2d528 100644 +--- a/gcc/opts-global.c ++++ b/gcc/opts-global.c +@@ -255,6 +255,7 @@ init_options_once (void) + construct their pretty-printers means that all previous settings + are overriden. 
*/ + diagnostic_color_init (global_dc); ++ diagnostic_urls_init (global_dc); + } + + /* Decode command-line options to an array, like +diff --git a/gcc/opts.c b/gcc/opts.c +index 494be7a9f..a8db491b5 100644 +--- a/gcc/opts.c ++++ b/gcc/opts.c +@@ -465,7 +465,6 @@ static const struct default_options default_options_table[] = + { OPT_LEVELS_1_PLUS, OPT_ftree_copy_prop, NULL, 1 }, + { OPT_LEVELS_1_PLUS, OPT_ftree_dce, NULL, 1 }, + { OPT_LEVELS_1_PLUS, OPT_ftree_dominator_opts, NULL, 1 }, +- { OPT_LEVELS_1_PLUS, OPT_ftree_dse, NULL, 1 }, + { OPT_LEVELS_1_PLUS, OPT_ftree_fre, NULL, 1 }, + { OPT_LEVELS_1_PLUS, OPT_ftree_sink, NULL, 1 }, + { OPT_LEVELS_1_PLUS, OPT_ftree_slsr, NULL, 1 }, +@@ -476,14 +475,16 @@ static const struct default_options default_options_table[] = + #if DELAY_SLOTS + { OPT_LEVELS_1_PLUS_NOT_DEBUG, OPT_fdelayed_branch, NULL, 1 }, + #endif ++ { OPT_LEVELS_1_PLUS_NOT_DEBUG, OPT_fdse, NULL, 1 }, + { OPT_LEVELS_1_PLUS_NOT_DEBUG, OPT_fif_conversion, NULL, 1 }, + { OPT_LEVELS_1_PLUS_NOT_DEBUG, OPT_fif_conversion2, NULL, 1 }, + { OPT_LEVELS_1_PLUS_NOT_DEBUG, OPT_finline_functions_called_once, NULL, 1 }, + { OPT_LEVELS_1_PLUS_NOT_DEBUG, OPT_fmove_loop_invariants, NULL, 1 }, + { OPT_LEVELS_1_PLUS_NOT_DEBUG, OPT_fssa_phiopt, NULL, 1 }, + { OPT_LEVELS_1_PLUS_NOT_DEBUG, OPT_ftree_bit_ccp, NULL, 1 }, +- { OPT_LEVELS_1_PLUS_NOT_DEBUG, OPT_ftree_sra, NULL, 1 }, ++ { OPT_LEVELS_1_PLUS_NOT_DEBUG, OPT_ftree_dse, NULL, 1 }, + { OPT_LEVELS_1_PLUS_NOT_DEBUG, OPT_ftree_pta, NULL, 1 }, ++ { OPT_LEVELS_1_PLUS_NOT_DEBUG, OPT_ftree_sra, NULL, 1 }, + + /* -O2 and -Os optimizations. */ + { OPT_LEVELS_2_PLUS, OPT_fcaller_saves, NULL, 1 }, +@@ -521,6 +522,7 @@ static const struct default_options default_options_table[] = + { OPT_LEVELS_2_PLUS, OPT_ftree_tail_merge, NULL, 1 }, + { OPT_LEVELS_2_PLUS, OPT_ftree_vrp, NULL, 1 }, + { OPT_LEVELS_2_PLUS, OPT_fvect_cost_model_, NULL, VECT_COST_MODEL_CHEAP }, ++ { OPT_LEVELS_2_PLUS, OPT_finline_functions, NULL, 1 }, + + /* -O2 and -Os optimizations. */ + { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_falign_functions, NULL, 1 }, +@@ -536,9 +538,6 @@ static const struct default_options default_options_table[] = + #endif + + /* -O3 and -Os optimizations. */ +- /* Inlining of functions reducing size is a good idea with -Os +- regardless of them being declared inline. */ +- { OPT_LEVELS_3_PLUS_AND_SIZE, OPT_finline_functions, NULL, 1 }, + + /* -O3 optimizations. */ + { OPT_LEVELS_3_PLUS, OPT_fgcse_after_reload, NULL, 1 }, +@@ -2400,6 +2399,10 @@ common_handle_option (struct gcc_options *opts, + diagnostic_color_init (dc, value); + break; + ++ case OPT_fdiagnostics_urls_: ++ diagnostic_urls_init (dc, value); ++ break; ++ + case OPT_fdiagnostics_format_: + diagnostic_output_format_init (dc, + (enum diagnostics_output_format)value); +diff --git a/gcc/params.def b/gcc/params.def +index 08c709636..0ef092214 100644 +--- a/gcc/params.def ++++ b/gcc/params.def +@@ -61,8 +61,13 @@ DEFPARAM (PARAM_PREDICTABLE_BRANCH_OUTCOME, + + DEFPARAM (PARAM_INLINE_MIN_SPEEDUP, + "inline-min-speedup", ++ "The minimal estimated speedup allowing inliner to ignore inline-insns-single and inline-insns-auto with -O3 and -Ofast.", ++ 15, 0, 100) ++ ++DEFPARAM (PARAM_INLINE_MIN_SPEEDUP_O2, ++ "inline-min-speedup-O2", + "The minimal estimated speedup allowing inliner to ignore inline-insns-single and inline-insns-auto.", +- 15, 0, 0) ++ 30, 0, 100) + + /* The single function inlining limit. 
This is the maximum size + of a function counted in internal gcc instructions (not in +@@ -77,9 +82,14 @@ DEFPARAM (PARAM_INLINE_MIN_SPEEDUP, + gets decreased. */ + DEFPARAM (PARAM_MAX_INLINE_INSNS_SINGLE, + "max-inline-insns-single", +- "The maximum number of instructions in a single function eligible for inlining.", ++ "The maximum number of instructions in a single function eligible for inlining with -O3 and -Ofast.", + 200, 0, 0) + ++DEFPARAM (PARAM_MAX_INLINE_INSNS_SINGLE_O2, ++ "max-inline-insns-single-O2", ++ "The maximum number of instructions in a single function eligible for inlining.", ++ 30, 0, 0) ++ + /* The single function inlining limit for functions that are + inlined by virtue of -finline-functions (-O3). + This limit should be chosen to be below or equal to the limit +@@ -89,9 +99,14 @@ DEFPARAM (PARAM_MAX_INLINE_INSNS_SINGLE, + The default value is 30. */ + DEFPARAM (PARAM_MAX_INLINE_INSNS_AUTO, + "max-inline-insns-auto", +- "The maximum number of instructions when automatically inlining.", ++ "The maximum number of instructions when automatically inlining with -O3 and -Ofast.", + 30, 0, 0) + ++DEFPARAM (PARAM_MAX_INLINE_INSNS_AUTO_O2, ++ "max-inline-insns-auto-O2", ++ "The maximum number of instructions when automatically inlining.", ++ 15, 0, 0) ++ + DEFPARAM (PARAM_MAX_INLINE_INSNS_SMALL, + "max-inline-insns-small", + "The maximum number of instructions when automatically inlining small functions.", +@@ -243,8 +258,12 @@ DEFPARAM(PARAM_IPCP_UNIT_GROWTH, + 10, 0, 0) + DEFPARAM(PARAM_EARLY_INLINING_INSNS, + "early-inlining-insns", +- "Maximal estimated growth of function body caused by early inlining of single call.", ++ "Maximal estimated growth of function body caused by early inlining of single call with -O3 and -Ofast.", + 14, 0, 0) ++DEFPARAM(PARAM_EARLY_INLINING_INSNS_O2, ++ "early-inlining-insns-O2", ++ "Maximal estimated growth of function body caused by early inlining of single call with -O1 and -O2.", ++ 6, 0, 0) + DEFPARAM(PARAM_LARGE_STACK_FRAME, + "large-stack-frame", + "The size of stack frame to be considered large.", +diff --git a/gcc/passes.def b/gcc/passes.def +index 901dbef93..a03685500 100644 +--- a/gcc/passes.def ++++ b/gcc/passes.def +@@ -459,7 +459,6 @@ along with GCC; see the file COPYING3. If not see + NEXT_PASS (pass_split_after_reload); + NEXT_PASS (pass_ree); + NEXT_PASS (pass_compare_elim_after_reload); +- NEXT_PASS (pass_branch_target_load_optimize1); + NEXT_PASS (pass_thread_prologue_and_epilogue); + NEXT_PASS (pass_rtl_dse2); + NEXT_PASS (pass_stack_adjustments); +@@ -472,7 +471,6 @@ along with GCC; see the file COPYING3. If not see + NEXT_PASS (pass_cprop_hardreg); + NEXT_PASS (pass_fast_rtl_dce); + NEXT_PASS (pass_reorder_blocks); +- NEXT_PASS (pass_branch_target_load_optimize2); + NEXT_PASS (pass_leaf_regs); + NEXT_PASS (pass_split_before_sched2); + NEXT_PASS (pass_sched2); +diff --git a/gcc/postreload-gcse.c b/gcc/postreload-gcse.c +index a165351ca..bc2e8fc91 100644 +--- a/gcc/postreload-gcse.c ++++ b/gcc/postreload-gcse.c +@@ -672,7 +672,7 @@ load_killed_in_block_p (int uid_limit, rtx x, bool after_insn) + It will set mems_conflict_p to nonzero if there may be a + conflict between X and SETTER. */ + mems_conflict_p = 0; +- note_stores (PATTERN (setter), find_mem_conflicts, x); ++ note_stores (setter, find_mem_conflicts, x); + if (mems_conflict_p) + return 1; + +@@ -774,7 +774,7 @@ record_opr_changes (rtx_insn *insn) + rtx note; + + /* Find all stores and record them. 
*/ +- note_stores (PATTERN (insn), record_last_set_info, insn); ++ note_stores (insn, record_last_set_info, insn); + + /* Also record autoincremented REGs for this insn as changed. */ + for (note = REG_NOTES (insn); note; note = XEXP (note, 1)) +@@ -785,25 +785,10 @@ record_opr_changes (rtx_insn *insn) + if (CALL_P (insn)) + { + unsigned int regno; +- rtx link, x; + hard_reg_set_iterator hrsi; + EXECUTE_IF_SET_IN_HARD_REG_SET (regs_invalidated_by_call, 0, regno, hrsi) + record_last_reg_set_info_regno (insn, regno); + +- for (link = CALL_INSN_FUNCTION_USAGE (insn); link; link = XEXP (link, 1)) +- { +- gcc_assert (GET_CODE (XEXP (link, 0)) != CLOBBER_HIGH); +- if (GET_CODE (XEXP (link, 0)) == CLOBBER) +- { +- x = XEXP (XEXP (link, 0), 0); +- if (REG_P (x)) +- { +- gcc_assert (HARD_REGISTER_P (x)); +- record_last_reg_set_info (insn, x); +- } +- } +- } +- + if (! RTL_CONST_OR_PURE_CALL_P (insn)) + record_last_mem_set_info (insn); + } +diff --git a/gcc/postreload.c b/gcc/postreload.c +index b76c7b0b7..ee0dc6ae8 100644 +--- a/gcc/postreload.c ++++ b/gcc/postreload.c +@@ -40,6 +40,7 @@ along with GCC; see the file COPYING3. If not see + #include "cselib.h" + #include "tree-pass.h" + #include "dbgcnt.h" ++#include "function-abi.h" + + static int reload_cse_noop_set_p (rtx); + static bool reload_cse_simplify (rtx_insn *, rtx); +@@ -133,8 +134,6 @@ reload_cse_simplify (rtx_insn *insn, rtx testreg) + for (i = XVECLEN (body, 0) - 1; i >= 0; --i) + { + rtx part = XVECEXP (body, 0, i); +- /* asms can only have full clobbers, not clobber_highs. */ +- gcc_assert (GET_CODE (part) != CLOBBER_HIGH); + if (GET_CODE (part) == CLOBBER && REG_P (XEXP (part, 0))) + cselib_invalidate_rtx (XEXP (part, 0)); + } +@@ -157,9 +156,7 @@ reload_cse_simplify (rtx_insn *insn, rtx testreg) + value = SET_DEST (part); + } + } +- else if (GET_CODE (part) != CLOBBER +- && GET_CODE (part) != CLOBBER_HIGH +- && GET_CODE (part) != USE) ++ else if (GET_CODE (part) != CLOBBER && GET_CODE (part) != USE) + break; + } + +@@ -1139,7 +1136,7 @@ reload_combine_recognize_pattern (rtx_insn *insn) + if (TEST_HARD_REG_BIT (reg_class_contents[INDEX_REG_CLASS], i) + && reg_state[i].use_index == RELOAD_COMBINE_MAX_USES + && reg_state[i].store_ruid <= reg_state[regno].use_ruid +- && (call_used_regs[i] || df_regs_ever_live_p (i)) ++ && (call_used_or_fixed_reg_p (i) || df_regs_ever_live_p (i)) + && (!frame_pointer_needed || i != HARD_FRAME_POINTER_REGNUM) + && !fixed_regs[i] && !global_regs[i] + && hard_regno_nregs (i, GET_MODE (reg)) == 1 +@@ -1271,8 +1268,8 @@ reload_combine (void) + + REG_SET_TO_HARD_REG_SET (live, live_in); + compute_use_by_pseudos (&live, live_in); +- COPY_HARD_REG_SET (LABEL_LIVE (insn), live); +- IOR_HARD_REG_SET (ever_live_at_start, live); ++ LABEL_LIVE (insn) = live; ++ ever_live_at_start |= live; + } + } + +@@ -1329,14 +1326,15 @@ reload_combine (void) + || reload_combine_recognize_pattern (insn)) + continue; + +- note_stores (PATTERN (insn), reload_combine_note_store, NULL); ++ note_stores (insn, reload_combine_note_store, NULL); + + if (CALL_P (insn)) + { + rtx link; +- HARD_REG_SET used_regs; +- +- get_call_reg_set_usage (insn, &used_regs, call_used_reg_set); ++ HARD_REG_SET used_regs = insn_callee_abi (insn).full_reg_clobbers (); ++ /* ??? This preserves traditional behavior; it might not be ++ needed. 
*/ ++ used_regs |= fixed_reg_set; + + for (r = 0; r < FIRST_PSEUDO_REGISTER; r++) + if (TEST_HARD_REG_BIT (used_regs, r)) +@@ -1350,22 +1348,12 @@ reload_combine (void) + { + rtx setuse = XEXP (link, 0); + rtx usage_rtx = XEXP (setuse, 0); +- /* We could support CLOBBER_HIGH and treat it in the same way as +- HARD_REGNO_CALL_PART_CLOBBERED, but no port needs that yet. */ +- gcc_assert (GET_CODE (setuse) != CLOBBER_HIGH); + +- if ((GET_CODE (setuse) == USE || GET_CODE (setuse) == CLOBBER) +- && REG_P (usage_rtx)) ++ if (GET_CODE (setuse) == USE && REG_P (usage_rtx)) + { + unsigned int end_regno = END_REGNO (usage_rtx); + for (unsigned int i = REGNO (usage_rtx); i < end_regno; ++i) +- if (GET_CODE (XEXP (link, 0)) == CLOBBER) +- { +- reg_state[i].use_index = RELOAD_COMBINE_MAX_USES; +- reg_state[i].store_ruid = reload_combine_ruid; +- } +- else +- reg_state[i].use_index = -1; ++ reg_state[i].use_index = -1; + } + } + } +@@ -1529,10 +1517,6 @@ reload_combine_note_use (rtx *xp, rtx_insn *insn, int ruid, rtx containing_mem) + } + break; + +- case CLOBBER_HIGH: +- gcc_assert (REG_P (SET_DEST (x))); +- return; +- + case PLUS: + /* We are interested in (plus (reg) (const_int)) . */ + if (!REG_P (XEXP (x, 0)) +@@ -2108,7 +2092,7 @@ reload_cse_move2add (rtx_insn *first) + } + } + } +- note_stores (PATTERN (insn), move2add_note_store, insn); ++ note_stores (insn, move2add_note_store, insn); + + /* If INSN is a conditional branch, we try to extract an + implicit set out of it. */ +@@ -2138,32 +2122,12 @@ reload_cse_move2add (rtx_insn *first) + unknown values. */ + if (CALL_P (insn)) + { +- rtx link; +- + for (i = FIRST_PSEUDO_REGISTER - 1; i >= 0; i--) + { +- if (call_used_regs[i]) ++ if (call_used_or_fixed_reg_p (i)) + /* Reset the information about this register. */ + reg_mode[i] = VOIDmode; + } +- +- for (link = CALL_INSN_FUNCTION_USAGE (insn); link; +- link = XEXP (link, 1)) +- { +- rtx setuse = XEXP (link, 0); +- rtx usage_rtx = XEXP (setuse, 0); +- /* CALL_INSN_FUNCTION_USAGEs can only have full clobbers, not +- clobber_highs. */ +- gcc_assert (GET_CODE (setuse) != CLOBBER_HIGH); +- if (GET_CODE (setuse) == CLOBBER +- && REG_P (usage_rtx)) +- { +- unsigned int end_regno = END_REGNO (usage_rtx); +- for (unsigned int r = REGNO (usage_rtx); r < end_regno; ++r) +- /* Reset the information about this register. */ +- reg_mode[r] = VOIDmode; +- } +- } + } + } + return changed; +@@ -2317,13 +2281,6 @@ move2add_note_store (rtx dst, const_rtx set, void *data) + + move2add_record_mode (dst); + } +- else if (GET_CODE (set) == CLOBBER_HIGH) +- { +- /* Only invalidate if actually clobbered. 
*/ +- if (reg_mode[regno] == BLKmode +- || reg_is_clobbered_by_clobber_high (regno, reg_mode[regno], dst)) +- goto invalidate; +- } + else + { + invalidate: +diff --git a/gcc/predict.c b/gcc/predict.c +index eaab47f99..03dd4ddfa 100644 +--- a/gcc/predict.c ++++ b/gcc/predict.c +@@ -2450,7 +2450,7 @@ expr_expected_value_1 (tree type, tree op0, enum tree_code code, + return NULL; + } + +- if (DECL_IS_MALLOC (decl) || DECL_IS_OPERATOR_NEW (decl)) ++ if (DECL_IS_MALLOC (decl) || DECL_IS_OPERATOR_NEW_P (decl)) + { + if (predictor) + *predictor = PRED_MALLOC_NONNULL; +diff --git a/gcc/pretty-print.c b/gcc/pretty-print.c +index 6948971ce..5af7ca764 100644 +--- a/gcc/pretty-print.c ++++ b/gcc/pretty-print.c +@@ -1579,7 +1579,8 @@ pretty_printer::pretty_printer (int maximum_length) + emitted_prefix (), + need_newline (), + translate_identifiers (true), +- show_color () ++ show_color (), ++ show_urls (false) + { + pp_line_cutoff (this) = maximum_length; + /* By default, we emit prefixes once per message. */ +@@ -2028,6 +2029,41 @@ identifier_to_locale (const char *ident) + } + } + ++/* Support for encoding URLs. ++ See egmontkob/Hyperlinks_in_Terminal_Emulators.md ++ ( https://gist.github.com/egmontkob/eb114294efbcd5adb1944c9f3cb5feda ). ++ ++ > A hyperlink is opened upon encountering an OSC 8 escape sequence with ++ > the target URI. The syntax is ++ > ++ > OSC 8 ; params ; URI ST ++ > ++ > A hyperlink is closed with the same escape sequence, omitting the ++ > parameters and the URI but keeping the separators: ++ > ++ > OSC 8 ; ; ST ++ > ++ > OSC (operating system command) is typically ESC ]. */ ++ ++/* If URL-printing is enabled, write an "open URL" escape sequence to PP ++ for the given URL. */ ++ ++void ++pp_begin_url (pretty_printer *pp, const char *url) ++{ ++ if (pp->show_urls) ++ pp_printf (pp, "\33]8;;%s\33\\", url); ++} ++ ++/* If URL-printing is enabled, write a "close URL" escape sequence to PP. */ ++ ++void ++pp_end_url (pretty_printer *pp) ++{ ++ if (pp->show_urls) ++ pp_string (pp, "\33]8;;\33\\"); ++} ++ + #if CHECKING_P + + namespace selftest { +@@ -2312,6 +2348,32 @@ test_prefixes_and_wrapping () + + } + ++/* Verify that URL-printing works as expected. */ ++ ++void ++test_urls () ++{ ++ { ++ pretty_printer pp; ++ pp.show_urls = false; ++ pp_begin_url (&pp, "http://example.com"); ++ pp_string (&pp, "This is a link"); ++ pp_end_url (&pp); ++ ASSERT_STREQ ("This is a link", ++ pp_formatted_text (&pp)); ++ } ++ ++ { ++ pretty_printer pp; ++ pp.show_urls = true; ++ pp_begin_url (&pp, "http://example.com"); ++ pp_string (&pp, "This is a link"); ++ pp_end_url (&pp); ++ ASSERT_STREQ ("\33]8;;http://example.com\33\\This is a link\33]8;;\33\\", ++ pp_formatted_text (&pp)); ++ } ++} ++ + /* Run all of the selftests within this file. */ + + void +@@ -2320,6 +2382,7 @@ pretty_print_c_tests () + test_basic_printing (); + test_pp_format (); + test_prefixes_and_wrapping (); ++ test_urls (); + } + + } // namespace selftest +diff --git a/gcc/pretty-print.h b/gcc/pretty-print.h +index e4df65907..07cd39176 100644 +--- a/gcc/pretty-print.h ++++ b/gcc/pretty-print.h +@@ -271,6 +271,9 @@ struct pretty_printer + + /* Nonzero means that text should be colorized. */ + bool show_color; ++ ++ /* Nonzero means that URLs should be emitted. 
*/ ++ bool show_urls; + }; + + static inline const char * +@@ -391,6 +394,9 @@ extern void pp_maybe_space (pretty_printer *); + extern void pp_begin_quote (pretty_printer *, bool); + extern void pp_end_quote (pretty_printer *, bool); + ++extern void pp_begin_url (pretty_printer *pp, const char *url); ++extern void pp_end_url (pretty_printer *pp); ++ + /* Switch into verbatim mode and return the old mode. */ + static inline pp_wrapping_mode_t + pp_set_verbatim_wrapping_ (pretty_printer *pp) +diff --git a/gcc/print-rtl.c b/gcc/print-rtl.c +index fbb108568..01f281604 100644 +--- a/gcc/print-rtl.c ++++ b/gcc/print-rtl.c +@@ -1756,7 +1756,6 @@ print_pattern (pretty_printer *pp, const_rtx x, int verbose) + print_exp (pp, x, verbose); + break; + case CLOBBER: +- case CLOBBER_HIGH: + case USE: + pp_printf (pp, "%s ", GET_RTX_NAME (GET_CODE (x))); + print_value (pp, XEXP (x, 0), verbose); +diff --git a/gcc/print-tree.c b/gcc/print-tree.c +index 81b66a189..7c0d05548 100644 +--- a/gcc/print-tree.c ++++ b/gcc/print-tree.c +@@ -517,7 +517,11 @@ print_node (FILE *file, const char *prefix, tree node, int indent, + if (code == FUNCTION_DECL && fndecl_built_in_p (node)) + { + if (DECL_BUILT_IN_CLASS (node) == BUILT_IN_MD) +- fprintf (file, " built-in: BUILT_IN_MD:%d", DECL_FUNCTION_CODE (node)); ++ fprintf (file, " built-in: BUILT_IN_MD:%d", ++ DECL_MD_FUNCTION_CODE (node)); ++ else if (DECL_BUILT_IN_CLASS (node) == BUILT_IN_FRONTEND) ++ fprintf (file, " built-in: BUILT_IN_FRONTEND:%d", ++ DECL_FE_FUNCTION_CODE (node)); + else + fprintf (file, " built-in: %s:%s", + built_in_class_names[(int) DECL_BUILT_IN_CLASS (node)], +diff --git a/gcc/read-md.h b/gcc/read-md.h +index 18426f71d..327f378ea 100644 +--- a/gcc/read-md.h ++++ b/gcc/read-md.h +@@ -337,6 +337,7 @@ class rtx_reader : public md_reader + ~rtx_reader (); + + bool read_rtx (const char *rtx_name, vec *rtxen); ++ rtx rtx_alloc_for_name (const char *); + rtx read_rtx_code (const char *code_name); + virtual rtx read_rtx_operand (rtx return_rtx, int idx); + rtx read_nested_rtx (); +diff --git a/gcc/read-rtl-function.c b/gcc/read-rtl-function.c +index 53f7a94c1..ded407737 100644 +--- a/gcc/read-rtl-function.c ++++ b/gcc/read-rtl-function.c +@@ -41,6 +41,8 @@ along with GCC; see the file COPYING3. If not see + #include "read-rtl-function.h" + #include "selftest.h" + #include "selftest-rtl.h" ++#include "regs.h" ++#include "function-abi.h" + + /* Forward decls. */ + class function_reader; +@@ -1610,6 +1612,7 @@ bool + read_rtl_function_body (const char *path) + { + initialize_rtl (); ++ crtl->abi = &default_function_abi; + init_emit (); + init_varasm_status (); + +@@ -1643,6 +1646,7 @@ read_rtl_function_body_from_file_range (location_t start_loc, + } + + initialize_rtl (); ++ crtl->abi = &fndecl_abi (cfun->decl).base_abi (); + init_emit (); + init_varasm_status (); + +diff --git a/gcc/read-rtl.c b/gcc/read-rtl.c +index 1af51f686..6b1b811cb 100644 +--- a/gcc/read-rtl.c ++++ b/gcc/read-rtl.c +@@ -194,22 +194,31 @@ static const compact_insn_name compact_insn_names[] = { + { NOTE, "cnote" } + }; + +-/* Implementations of the iterator_group callbacks for codes. */ ++/* Return the rtx code for NAME, or UNKNOWN if NAME isn't a valid rtx code. 
*/ + +-static int +-find_code (const char *name) ++static rtx_code ++maybe_find_code (const char *name) + { +- int i; +- +- for (i = 0; i < NUM_RTX_CODE; i++) ++ for (int i = 0; i < NUM_RTX_CODE; i++) + if (strcmp (GET_RTX_NAME (i), name) == 0) +- return i; ++ return (rtx_code) i; + +- for (i = 0; i < (signed)ARRAY_SIZE (compact_insn_names); i++) ++ for (int i = 0; i < (signed)ARRAY_SIZE (compact_insn_names); i++) + if (strcmp (compact_insn_names[i].name, name) == 0) + return compact_insn_names[i].code; + +- fatal_with_file_and_line ("unknown rtx code `%s'", name); ++ return UNKNOWN; ++} ++ ++/* Implementations of the iterator_group callbacks for codes. */ ++ ++static int ++find_code (const char *name) ++{ ++ rtx_code code = maybe_find_code (name); ++ if (code == UNKNOWN) ++ fatal_with_file_and_line ("unknown rtx code `%s'", name); ++ return code; + } + + static void +@@ -277,9 +286,11 @@ apply_subst_iterator (rtx rt, unsigned int, int value) + return; + gcc_assert (GET_CODE (rt) == DEFINE_INSN + || GET_CODE (rt) == DEFINE_INSN_AND_SPLIT ++ || GET_CODE (rt) == DEFINE_INSN_AND_REWRITE + || GET_CODE (rt) == DEFINE_EXPAND); + +- int attrs = GET_CODE (rt) == DEFINE_INSN_AND_SPLIT ? 7 : 4; ++ int attrs = (GET_CODE (rt) == DEFINE_INSN_AND_SPLIT ? 7 ++ : GET_CODE (rt) == DEFINE_INSN_AND_REWRITE ? 6 : 4); + attrs_vec = XVEC (rt, attrs); + + /* If we've already added attribute 'current_iterator_name', then we +@@ -540,6 +551,7 @@ add_condition_to_rtx (rtx x, const char *extra) + break; + + case DEFINE_INSN_AND_SPLIT: ++ case DEFINE_INSN_AND_REWRITE: + XSTR (x, 2) = add_condition_to_string (XSTR (x, 2), extra); + XSTR (x, 4) = add_condition_to_string (XSTR (x, 4), extra); + break; +@@ -623,6 +635,7 @@ named_rtx_p (rtx x) + case DEFINE_EXPAND: + case DEFINE_INSN: + case DEFINE_INSN_AND_SPLIT: ++ case DEFINE_INSN_AND_REWRITE: + return true; + + default: +@@ -1306,7 +1319,37 @@ check_code_iterator (struct mapping *iterator) + for (v = iterator->values->next; v != 0; v = v->next) + if (strcmp (GET_RTX_FORMAT (bellwether), GET_RTX_FORMAT (v->number)) != 0) + fatal_with_file_and_line ("code iterator `%s' combines " +- "different rtx formats", iterator->name); ++ "`%s' and `%s', which have different " ++ "rtx formats", iterator->name, ++ GET_RTX_NAME (bellwether), ++ GET_RTX_NAME (v->number)); ++} ++ ++/* Check that all values of attribute ATTR are rtx codes that have a ++ consistent format. Return a representative code. */ ++ ++static rtx_code ++check_code_attribute (mapping *attr) ++{ ++ rtx_code bellwether = UNKNOWN; ++ for (map_value *v = attr->values; v != 0; v = v->next) ++ { ++ rtx_code code = maybe_find_code (v->string); ++ if (code == UNKNOWN) ++ fatal_with_file_and_line ("code attribute `%s' contains " ++ "unrecognized rtx code `%s'", ++ attr->name, v->string); ++ if (bellwether == UNKNOWN) ++ bellwether = code; ++ else if (strcmp (GET_RTX_FORMAT (bellwether), ++ GET_RTX_FORMAT (code)) != 0) ++ fatal_with_file_and_line ("code attribute `%s' combines " ++ "`%s' and `%s', which have different " ++ "rtx formats", attr->name, ++ GET_RTX_NAME (bellwether), ++ GET_RTX_NAME (code)); ++ } ++ return bellwether; + } + + /* Read an rtx-related declaration from the MD file, given that it +@@ -1467,6 +1510,54 @@ parse_reg_note_name (const char *string) + fatal_with_file_and_line ("unrecognized REG_NOTE name: `%s'", string); + } + ++/* Allocate an rtx for code NAME. If NAME is a code iterator or code ++ attribute, record its use for later and use one of its possible ++ values as an interim rtx code. 
*/ ++ ++rtx ++rtx_reader::rtx_alloc_for_name (const char *name) ++{ ++#ifdef GENERATOR_FILE ++ size_t len = strlen (name); ++ if (name[0] == '<' && name[len - 1] == '>') ++ { ++ /* Copy the attribute string into permanent storage, without the ++ angle brackets around it. */ ++ obstack *strings = get_string_obstack (); ++ obstack_grow0 (strings, name + 1, len - 2); ++ char *deferred_name = XOBFINISH (strings, char *); ++ ++ /* Find the name of the attribute. */ ++ const char *attr = strchr (deferred_name, ':'); ++ if (!attr) ++ attr = deferred_name; ++ ++ /* Find the attribute itself. */ ++ mapping *m = (mapping *) htab_find (codes.attrs, &attr); ++ if (!m) ++ fatal_with_file_and_line ("unknown code attribute `%s'", attr); ++ ++ /* Pick the first possible code for now, and record the attribute ++ use for later. */ ++ rtx x = rtx_alloc (check_code_attribute (m)); ++ record_attribute_use (&codes, x, 0, deferred_name); ++ return x; ++ } ++ ++ mapping *iterator = (mapping *) htab_find (codes.iterators, &name); ++ if (iterator != 0) ++ { ++ /* Pick the first possible code for now, and record the iterator ++ use for later. */ ++ rtx x = rtx_alloc (rtx_code (iterator->values->number)); ++ record_iterator_use (iterator, x, 0); ++ return x; ++ } ++#endif ++ ++ return rtx_alloc (rtx_code (codes.find_builtin (name))); ++} ++ + /* Subroutine of read_rtx and read_nested_rtx. CODE_NAME is the name of + either an rtx code or a code iterator. Parse the rest of the rtx and + return it. */ +@@ -1475,7 +1566,6 @@ rtx + rtx_reader::read_rtx_code (const char *code_name) + { + RTX_CODE code; +- struct mapping *iterator = NULL; + const char *format_ptr; + struct md_name name; + rtx return_rtx; +@@ -1509,20 +1599,9 @@ rtx_reader::read_rtx_code (const char *code_name) + return return_rtx; + } + +- /* If this code is an iterator, build the rtx using the iterator's +- first value. */ +-#ifdef GENERATOR_FILE +- iterator = (struct mapping *) htab_find (codes.iterators, &code_name); +- if (iterator != 0) +- code = (enum rtx_code) iterator->values->number; +- else +- code = (enum rtx_code) codes.find_builtin (code_name); +-#else +- code = (enum rtx_code) codes.find_builtin (code_name); +-#endif +- + /* If we end up with an insn expression then we free this space below. */ +- return_rtx = rtx_alloc (code); ++ return_rtx = rtx_alloc_for_name (code_name); ++ code = GET_CODE (return_rtx); + format_ptr = GET_RTX_FORMAT (code); + memset (return_rtx, 0, RTX_CODE_SIZE (code)); + PUT_CODE (return_rtx, code); +@@ -1534,9 +1613,6 @@ rtx_reader::read_rtx_code (const char *code_name) + m_reuse_rtx_by_id[reuse_id] = return_rtx; + } + +- if (iterator) +- record_iterator_use (iterator, return_rtx, 0); +- + /* Check for flags. */ + read_flags (return_rtx); + +@@ -1765,8 +1841,8 @@ rtx_reader::read_rtx_operand (rtx return_rtx, int idx) + break; + } + +- /* The output template slot of a DEFINE_INSN, +- DEFINE_INSN_AND_SPLIT, or DEFINE_PEEPHOLE automatically ++ /* The output template slot of a DEFINE_INSN, DEFINE_INSN_AND_SPLIT, ++ DEFINE_INSN_AND_REWRITE or DEFINE_PEEPHOLE automatically + gets a star inserted as its first character, if it is + written with a brace block instead of a string constant. 
*/ + star_if_braced = (format_ptr[idx] == 'T'); +@@ -1783,7 +1859,8 @@ rtx_reader::read_rtx_operand (rtx return_rtx, int idx) + if (*stringbuf == '\0' + && idx == 0 + && (GET_CODE (return_rtx) == DEFINE_INSN +- || GET_CODE (return_rtx) == DEFINE_INSN_AND_SPLIT)) ++ || GET_CODE (return_rtx) == DEFINE_INSN_AND_SPLIT ++ || GET_CODE (return_rtx) == DEFINE_INSN_AND_REWRITE)) + { + struct obstack *string_obstack = get_string_obstack (); + char line_name[20]; +diff --git a/gcc/real.c b/gcc/real.c +index 0164f097a..a2bd37a9e 100644 +--- a/gcc/real.c ++++ b/gcc/real.c +@@ -4799,6 +4799,116 @@ decode_ieee_half (const struct real_format *fmt, REAL_VALUE_TYPE *r, + } + } + ++/* Encode arm_bfloat types. */ ++static void ++encode_arm_bfloat_half (const struct real_format *fmt, long *buf, ++ const REAL_VALUE_TYPE *r) ++{ ++ unsigned long image, sig, exp; ++ unsigned long sign = r->sign; ++ bool denormal = (r->sig[SIGSZ-1] & SIG_MSB) == 0; ++ ++ image = sign << 15; ++ sig = (r->sig[SIGSZ-1] >> (HOST_BITS_PER_LONG - 8)) & 0x7f; ++ ++ switch (r->cl) ++ { ++ case rvc_zero: ++ break; ++ ++ case rvc_inf: ++ if (fmt->has_inf) ++ image |= 255 << 7; ++ else ++ image |= 0x7fff; ++ break; ++ ++ case rvc_nan: ++ if (fmt->has_nans) ++ { ++ if (r->canonical) ++ sig = (fmt->canonical_nan_lsbs_set ? (1 << 6) - 1 : 0); ++ if (r->signalling == fmt->qnan_msb_set) ++ sig &= ~(1 << 6); ++ else ++ sig |= 1 << 6; ++ if (sig == 0) ++ sig = 1 << 5; ++ ++ image |= 255 << 7; ++ image |= sig; ++ } ++ else ++ image |= 0x7fff; ++ break; ++ ++ case rvc_normal: ++ if (denormal) ++ exp = 0; ++ else ++ exp = REAL_EXP (r) + 127 - 1; ++ image |= exp << 7; ++ image |= sig; ++ break; ++ ++ default: ++ gcc_unreachable (); ++ } ++ ++ buf[0] = image; ++} ++ ++/* Decode arm_bfloat types. */ ++static void ++decode_arm_bfloat_half (const struct real_format *fmt, REAL_VALUE_TYPE *r, ++ const long *buf) ++{ ++ unsigned long image = buf[0] & 0xffff; ++ bool sign = (image >> 15) & 1; ++ int exp = (image >> 7) & 0xff; ++ ++ memset (r, 0, sizeof (*r)); ++ image <<= HOST_BITS_PER_LONG - 8; ++ image &= ~SIG_MSB; ++ ++ if (exp == 0) ++ { ++ if (image && fmt->has_denorm) ++ { ++ r->cl = rvc_normal; ++ r->sign = sign; ++ SET_REAL_EXP (r, -126); ++ r->sig[SIGSZ-1] = image << 1; ++ normalize (r); ++ } ++ else if (fmt->has_signed_zero) ++ r->sign = sign; ++ } ++ else if (exp == 255 && (fmt->has_nans || fmt->has_inf)) ++ { ++ if (image) ++ { ++ r->cl = rvc_nan; ++ r->sign = sign; ++ r->signalling = (((image >> (HOST_BITS_PER_LONG - 2)) & 1) ++ ^ fmt->qnan_msb_set); ++ r->sig[SIGSZ-1] = image; ++ } ++ else ++ { ++ r->cl = rvc_inf; ++ r->sign = sign; ++ } ++ } ++ else ++ { ++ r->cl = rvc_normal; ++ r->sign = sign; ++ SET_REAL_EXP (r, exp - 127 + 1); ++ r->sig[SIGSZ-1] = image | SIG_MSB; ++ } ++} ++ + /* Half-precision format, as specified in IEEE 754R. */ + const struct real_format ieee_half_format = + { +@@ -4848,6 +4958,33 @@ const struct real_format arm_half_format = + false, + "arm_half" + }; ++ ++/* ARM Bfloat half-precision format. This format resembles a truncated ++ (16-bit) version of the 32-bit IEEE 754 single-precision floating-point ++ format. */ ++const struct real_format arm_bfloat_half_format = ++ { ++ encode_arm_bfloat_half, ++ decode_arm_bfloat_half, ++ 2, ++ 8, ++ 8, ++ -125, ++ 128, ++ 15, ++ 15, ++ 0, ++ false, ++ true, ++ true, ++ true, ++ true, ++ true, ++ true, ++ false, ++ "arm_bfloat_half" ++ }; ++ + + /* A synthetic "format" for internal arithmetic. It's the size of the + internal significand minus the two bits needed for proper rounding. 
+diff --git a/gcc/real.h b/gcc/real.h +index 95b9db83d..d1b79f804 100644 +--- a/gcc/real.h ++++ b/gcc/real.h +@@ -361,6 +361,7 @@ extern const struct real_format decimal_double_format; + extern const struct real_format decimal_quad_format; + extern const struct real_format ieee_half_format; + extern const struct real_format arm_half_format; ++extern const struct real_format arm_bfloat_half_format; + + + /* ====================================================================== */ +diff --git a/gcc/recog.c b/gcc/recog.c +index a9f584bc0..b12eba33a 100644 +--- a/gcc/recog.c ++++ b/gcc/recog.c +@@ -3227,7 +3227,8 @@ peep2_find_free_register (int from, int to, const char *class_str, + break; + } + /* And that we don't create an extra save/restore. */ +- if (! call_used_regs[regno + j] && ! df_regs_ever_live_p (regno + j)) ++ if (! call_used_or_fixed_reg_p (regno + j) ++ && ! df_regs_ever_live_p (regno + j)) + { + success = 0; + break; +@@ -3724,8 +3725,7 @@ store_data_bypass_p_1 (rtx_insn *out_insn, rtx in_set) + { + rtx out_exp = XVECEXP (out_pat, 0, i); + +- if (GET_CODE (out_exp) == CLOBBER || GET_CODE (out_exp) == USE +- || GET_CODE (out_exp) == CLOBBER_HIGH) ++ if (GET_CODE (out_exp) == CLOBBER || GET_CODE (out_exp) == USE) + continue; + + gcc_assert (GET_CODE (out_exp) == SET); +@@ -3756,8 +3756,7 @@ store_data_bypass_p (rtx_insn *out_insn, rtx_insn *in_insn) + { + rtx in_exp = XVECEXP (in_pat, 0, i); + +- if (GET_CODE (in_exp) == CLOBBER || GET_CODE (in_exp) == USE +- || GET_CODE (in_exp) == CLOBBER_HIGH) ++ if (GET_CODE (in_exp) == CLOBBER || GET_CODE (in_exp) == USE) + continue; + + gcc_assert (GET_CODE (in_exp) == SET); +@@ -3809,7 +3808,7 @@ if_test_bypass_p (rtx_insn *out_insn, rtx_insn *in_insn) + { + rtx exp = XVECEXP (out_pat, 0, i); + +- if (GET_CODE (exp) == CLOBBER || GET_CODE (exp) == CLOBBER_HIGH) ++ if (GET_CODE (exp) == CLOBBER) + continue; + + gcc_assert (GET_CODE (exp) == SET); +diff --git a/gcc/recog.h b/gcc/recog.h +index 75cbbdc10..71d88e3e3 100644 +--- a/gcc/recog.h ++++ b/gcc/recog.h +@@ -142,7 +142,7 @@ extern void preprocess_constraints (rtx_insn *); + extern rtx_insn *peep2_next_insn (int); + extern int peep2_regno_dead_p (int, int); + extern int peep2_reg_dead_p (int, rtx); +-#ifdef CLEAR_HARD_REG_SET ++#ifdef HARD_CONST + extern rtx peep2_find_free_register (int, int, const char *, + machine_mode, HARD_REG_SET *); + #endif +@@ -186,6 +186,23 @@ skip_alternative (const char *p) + /* Nonzero means volatile operands are recognized. */ + extern int volatile_ok; + ++/* RAII class for temporarily setting volatile_ok. */ ++ ++class temporary_volatile_ok ++{ ++public: ++ temporary_volatile_ok (int value) : save_volatile_ok (volatile_ok) ++ { ++ volatile_ok = value; ++ } ++ ++ ~temporary_volatile_ok () { volatile_ok = save_volatile_ok; } ++ ++private: ++ temporary_volatile_ok (const temporary_volatile_ok &); ++ int save_volatile_ok; ++}; ++ + /* Set by constrain_operands to the number of the alternative that + matched. 
*/ + extern int which_alternative; +diff --git a/gcc/reg-stack.c b/gcc/reg-stack.c +index 033c978a1..b464f493f 100644 +--- a/gcc/reg-stack.c ++++ b/gcc/reg-stack.c +@@ -368,7 +368,7 @@ straighten_stack (rtx_insn *insn, stack_ptr regstack) + if (regstack->top <= 0) + return; + +- COPY_HARD_REG_SET (temp_stack.reg_set, regstack->reg_set); ++ temp_stack.reg_set = regstack->reg_set; + + for (top = temp_stack.top = regstack->top; top >= 0; top--) + temp_stack.reg[top] = FIRST_STACK_REG + temp_stack.top - top; +@@ -568,7 +568,7 @@ check_asm_stack_operands (rtx_insn *insn) + + if (i != LAST_STACK_REG + 1) + { +- error_for_asm (insn, "output regs must be grouped at top of stack"); ++ error_for_asm (insn, "output registers must be grouped at top of stack"); + malformed_asm = 1; + } + +@@ -625,7 +625,8 @@ check_asm_stack_operands (rtx_insn *insn) + if (i != LAST_STACK_REG + 1) + { + error_for_asm (insn, +- "explicitly used regs must be grouped at top of stack"); ++ "explicitly used registers must be grouped " ++ "at top of stack"); + malformed_asm = 1; + } + +@@ -2640,7 +2641,7 @@ change_stack (rtx_insn *insn, stack_ptr old, stack_ptr new_stack, + /* By now, the only difference should be the order of the stack, + not their depth or liveliness. */ + +- gcc_assert (hard_reg_set_equal_p (old->reg_set, new_stack->reg_set)); ++ gcc_assert (old->reg_set == new_stack->reg_set); + gcc_assert (old->top == new_stack->top); + + /* If the stack is not empty (new_stack->top != -1), loop here emitting +@@ -3154,8 +3155,7 @@ convert_regs_1 (basic_block block) + asms, we zapped the instruction itself, but that didn't produce the + same pattern of register kills as before. */ + +- gcc_assert (hard_reg_set_equal_p (regstack.reg_set, bi->out_reg_set) +- || any_malformed_asm); ++ gcc_assert (regstack.reg_set == bi->out_reg_set || any_malformed_asm); + bi->stack_out = regstack; + bi->done = true; + +diff --git a/gcc/regcprop.c b/gcc/regcprop.c +index 4842ce922..675111db8 100644 +--- a/gcc/regcprop.c ++++ b/gcc/regcprop.c +@@ -35,6 +35,7 @@ + #include "rtl-iter.h" + #include "cfgrtl.h" + #include "target.h" ++#include "function-abi.h" + + /* The following code does forward propagation of hard register copies. 
+ The object is to eliminate as many dependencies as possible, so that +@@ -237,11 +238,8 @@ static void + kill_clobbered_value (rtx x, const_rtx set, void *data) + { + struct value_data *const vd = (struct value_data *) data; +- gcc_assert (GET_CODE (set) != CLOBBER_HIGH || REG_P (x)); + +- if (GET_CODE (set) == CLOBBER +- || (GET_CODE (set) == CLOBBER_HIGH +- && reg_is_clobbered_by_clobber_high (x, XEXP (set, 0)))) ++ if (GET_CODE (set) == CLOBBER) + kill_value (x, vd); + } + +@@ -262,8 +260,7 @@ kill_set_value (rtx x, const_rtx set, void *data) + if (rtx_equal_p (x, ksvd->ignore_set_reg)) + return; + +- gcc_assert (GET_CODE (set) != CLOBBER_HIGH || REG_P (x)); +- if (GET_CODE (set) != CLOBBER && GET_CODE (set) != CLOBBER_HIGH) ++ if (GET_CODE (set) != CLOBBER) + { + kill_value (x, ksvd->vd); + if (REG_P (x)) +@@ -728,19 +725,7 @@ cprop_find_used_regs (rtx *loc, void *data) + static void + kill_clobbered_values (rtx_insn *insn, struct value_data *vd) + { +- note_stores (PATTERN (insn), kill_clobbered_value, vd); +- +- if (CALL_P (insn)) +- { +- rtx exp; +- +- for (exp = CALL_INSN_FUNCTION_USAGE (insn); exp; exp = XEXP (exp, 1)) +- { +- rtx x = XEXP (exp, 0); +- if (GET_CODE (x) == CLOBBER) +- kill_value (SET_DEST (x), vd); +- } +- } ++ note_stores (insn, kill_clobbered_value, vd); + } + + /* Perform the forward copy propagation on basic block BB. */ +@@ -1047,7 +1032,6 @@ copyprop_hardreg_forward_1 (basic_block bb, struct value_data *vd) + unsigned int set_nregs = 0; + unsigned int regno; + rtx exp; +- HARD_REG_SET regs_invalidated_by_this_call; + + for (exp = CALL_INSN_FUNCTION_USAGE (insn); exp; exp = XEXP (exp, 1)) + { +@@ -1065,13 +1049,11 @@ copyprop_hardreg_forward_1 (basic_block bb, struct value_data *vd) + } + } + +- get_call_reg_set_usage (insn, +- ®s_invalidated_by_this_call, +- regs_invalidated_by_call); ++ function_abi callee_abi = insn_callee_abi (insn); + for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) +- if ((TEST_HARD_REG_BIT (regs_invalidated_by_this_call, regno) ++ if ((callee_abi.clobbers_full_reg_p (regno) + || (targetm.hard_regno_call_part_clobbered +- (insn, regno, vd->e[regno].mode))) ++ (callee_abi.id (), regno, vd->e[regno].mode))) + && (regno < set_regno || regno >= set_regno + set_nregs)) + kill_value_regno (regno, 1, vd); + +@@ -1109,7 +1091,7 @@ copyprop_hardreg_forward_1 (basic_block bb, struct value_data *vd) + if (!noop_p) + { + /* Notice stores. */ +- note_stores (PATTERN (insn), kill_set_value, &ksvd); ++ note_stores (insn, kill_set_value, &ksvd); + + /* Notice copies. */ + if (copy_p) +diff --git a/gcc/reginfo.c b/gcc/reginfo.c +index 315c5ecab..4f07e968e 100644 +--- a/gcc/reginfo.c ++++ b/gcc/reginfo.c +@@ -43,6 +43,7 @@ along with GCC; see the file COPYING3. If not see + #include "reload.h" + #include "output.h" + #include "tree-pass.h" ++#include "function-abi.h" + + /* Maximum register number used in this function, plus one. */ + +@@ -65,21 +66,20 @@ struct target_hard_regs *this_target_hard_regs = &default_target_hard_regs; + struct target_regs *this_target_regs = &default_target_regs; + #endif + ++#define call_used_regs \ ++ (this_target_hard_regs->x_call_used_regs) ++ + /* Data for initializing fixed_regs. */ + static const char initial_fixed_regs[] = FIXED_REGISTERS; + + /* Data for initializing call_used_regs. */ +-static const char initial_call_used_regs[] = CALL_USED_REGISTERS; +- + #ifdef CALL_REALLY_USED_REGISTERS +-/* Data for initializing call_really_used_regs. 
*/ +-static const char initial_call_really_used_regs[] = CALL_REALLY_USED_REGISTERS; ++#ifdef CALL_USED_REGISTERS ++#error CALL_USED_REGISTERS and CALL_REALLY_USED_REGISTERS are both defined + #endif +- +-#ifdef CALL_REALLY_USED_REGISTERS +-#define CALL_REALLY_USED_REGNO_P(X) call_really_used_regs[X] ++static const char initial_call_used_regs[] = CALL_REALLY_USED_REGISTERS; + #else +-#define CALL_REALLY_USED_REGNO_P(X) call_used_regs[X] ++static const char initial_call_used_regs[] = CALL_USED_REGISTERS; + #endif + + /* Indexed by hard register number, contains 1 for registers +@@ -91,17 +91,6 @@ char global_regs[FIRST_PSEUDO_REGISTER]; + /* Declaration for the global register. */ + tree global_regs_decl[FIRST_PSEUDO_REGISTER]; + +-/* Same information as REGS_INVALIDATED_BY_CALL but in regset form to be used +- in dataflow more conveniently. */ +-regset regs_invalidated_by_call_regset; +- +-/* Same information as FIXED_REG_SET but in regset form. */ +-regset fixed_reg_set_regset; +- +-/* The bitmap_obstack is used to hold some static variables that +- should not be reset after each function is compiled. */ +-static bitmap_obstack persistent_obstack; +- + /* Used to initialize reg_alloc_order. */ + #ifdef REG_ALLOC_ORDER + static int initial_reg_alloc_order[FIRST_PSEUDO_REGISTER] = REG_ALLOC_ORDER; +@@ -171,10 +160,6 @@ init_reg_sets (void) + CALL_USED_REGISTERS had the right number of initializers. */ + gcc_assert (sizeof fixed_regs == sizeof initial_fixed_regs); + gcc_assert (sizeof call_used_regs == sizeof initial_call_used_regs); +-#ifdef CALL_REALLY_USED_REGISTERS +- gcc_assert (sizeof call_really_used_regs +- == sizeof initial_call_really_used_regs); +-#endif + #ifdef REG_ALLOC_ORDER + gcc_assert (sizeof reg_alloc_order == sizeof initial_reg_alloc_order); + #endif +@@ -182,10 +167,6 @@ init_reg_sets (void) + + memcpy (fixed_regs, initial_fixed_regs, sizeof fixed_regs); + memcpy (call_used_regs, initial_call_used_regs, sizeof call_used_regs); +-#ifdef CALL_REALLY_USED_REGISTERS +- memcpy (call_really_used_regs, initial_call_really_used_regs, +- sizeof call_really_used_regs); +-#endif + #ifdef REG_ALLOC_ORDER + memcpy (reg_alloc_order, initial_reg_alloc_order, sizeof reg_alloc_order); + #endif +@@ -200,9 +181,6 @@ init_reg_sets (void) + subsequent back-end reinitialization. */ + static char saved_fixed_regs[FIRST_PSEUDO_REGISTER]; + static char saved_call_used_regs[FIRST_PSEUDO_REGISTER]; +-#ifdef CALL_REALLY_USED_REGISTERS +-static char saved_call_really_used_regs[FIRST_PSEUDO_REGISTER]; +-#endif + static const char *saved_reg_names[FIRST_PSEUDO_REGISTER]; + static HARD_REG_SET saved_accessible_reg_set; + static HARD_REG_SET saved_operand_reg_set; +@@ -218,19 +196,11 @@ save_register_info (void) + memcpy (saved_fixed_regs, fixed_regs, sizeof fixed_regs); + memcpy (saved_call_used_regs, call_used_regs, sizeof call_used_regs); + +- /* Likewise for call_really_used_regs. */ +-#ifdef CALL_REALLY_USED_REGISTERS +- gcc_assert (sizeof call_really_used_regs +- == sizeof saved_call_really_used_regs); +- memcpy (saved_call_really_used_regs, call_really_used_regs, +- sizeof call_really_used_regs); +-#endif +- + /* And similarly for reg_names. 
*/ + gcc_assert (sizeof reg_names == sizeof saved_reg_names); + memcpy (saved_reg_names, reg_names, sizeof reg_names); +- COPY_HARD_REG_SET (saved_accessible_reg_set, accessible_reg_set); +- COPY_HARD_REG_SET (saved_operand_reg_set, operand_reg_set); ++ saved_accessible_reg_set = accessible_reg_set; ++ saved_operand_reg_set = operand_reg_set; + } + + /* Restore the register information. */ +@@ -240,14 +210,9 @@ restore_register_info (void) + memcpy (fixed_regs, saved_fixed_regs, sizeof fixed_regs); + memcpy (call_used_regs, saved_call_used_regs, sizeof call_used_regs); + +-#ifdef CALL_REALLY_USED_REGISTERS +- memcpy (call_really_used_regs, saved_call_really_used_regs, +- sizeof call_really_used_regs); +-#endif +- + memcpy (reg_names, saved_reg_names, sizeof reg_names); +- COPY_HARD_REG_SET (accessible_reg_set, saved_accessible_reg_set); +- COPY_HARD_REG_SET (operand_reg_set, saved_operand_reg_set); ++ accessible_reg_set = saved_accessible_reg_set; ++ operand_reg_set = saved_operand_reg_set; + } + + /* After switches have been processed, which perhaps alter +@@ -297,8 +262,7 @@ init_reg_sets_1 (void) + HARD_REG_SET c; + int k; + +- COPY_HARD_REG_SET (c, reg_class_contents[i]); +- IOR_HARD_REG_SET (c, reg_class_contents[j]); ++ c = reg_class_contents[i] | reg_class_contents[j]; + for (k = 0; k < N_REG_CLASSES; k++) + if (hard_reg_set_subset_p (reg_class_contents[k], c) + && !hard_reg_set_subset_p (reg_class_contents[k], +@@ -320,8 +284,7 @@ init_reg_sets_1 (void) + HARD_REG_SET c; + int k; + +- COPY_HARD_REG_SET (c, reg_class_contents[i]); +- IOR_HARD_REG_SET (c, reg_class_contents[j]); ++ c = reg_class_contents[i] | reg_class_contents[j]; + for (k = 0; k < N_REG_CLASSES; k++) + if (hard_reg_set_subset_p (c, reg_class_contents[k])) + break; +@@ -362,22 +325,9 @@ init_reg_sets_1 (void) + /* Initialize "constant" tables. */ + + CLEAR_HARD_REG_SET (fixed_reg_set); +- CLEAR_HARD_REG_SET (call_used_reg_set); +- CLEAR_HARD_REG_SET (call_fixed_reg_set); + CLEAR_HARD_REG_SET (regs_invalidated_by_call); +- if (!regs_invalidated_by_call_regset) +- { +- bitmap_obstack_initialize (&persistent_obstack); +- regs_invalidated_by_call_regset = ALLOC_REG_SET (&persistent_obstack); +- } +- else +- CLEAR_REG_SET (regs_invalidated_by_call_regset); +- if (!fixed_reg_set_regset) +- fixed_reg_set_regset = ALLOC_REG_SET (&persistent_obstack); +- else +- CLEAR_REG_SET (fixed_reg_set_regset); + +- AND_HARD_REG_SET (operand_reg_set, accessible_reg_set); ++ operand_reg_set &= accessible_reg_set; + for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) + { + /* As a special exception, registers whose class is NO_REGS are +@@ -393,26 +343,10 @@ init_reg_sets_1 (void) + /* If a register is too limited to be treated as a register operand, + then it should never be allocated to a pseudo. */ + if (!TEST_HARD_REG_BIT (operand_reg_set, i)) +- { +- fixed_regs[i] = 1; +- call_used_regs[i] = 1; +- } +- +- /* call_used_regs must include fixed_regs. */ +- gcc_assert (!fixed_regs[i] || call_used_regs[i]); +-#ifdef CALL_REALLY_USED_REGISTERS +- /* call_used_regs must include call_really_used_regs. 
*/ +- gcc_assert (!call_really_used_regs[i] || call_used_regs[i]); +-#endif ++ fixed_regs[i] = 1; + + if (fixed_regs[i]) +- { +- SET_HARD_REG_BIT (fixed_reg_set, i); +- SET_REGNO_REG_SET (fixed_reg_set_regset, i); +- } +- +- if (call_used_regs[i]) +- SET_HARD_REG_BIT (call_used_reg_set, i); ++ SET_HARD_REG_BIT (fixed_reg_set, i); + + /* There are a couple of fixed registers that we know are safe to + exclude from being clobbered by calls: +@@ -427,10 +361,7 @@ init_reg_sets_1 (void) + if (i == STACK_POINTER_REGNUM) + ; + else if (global_regs[i]) +- { +- SET_HARD_REG_BIT (regs_invalidated_by_call, i); +- SET_REGNO_REG_SET (regs_invalidated_by_call_regset, i); +- } ++ SET_HARD_REG_BIT (regs_invalidated_by_call, i); + else if (i == FRAME_POINTER_REGNUM) + ; + else if (!HARD_FRAME_POINTER_IS_FRAME_POINTER +@@ -442,15 +373,12 @@ init_reg_sets_1 (void) + else if (!PIC_OFFSET_TABLE_REG_CALL_CLOBBERED + && i == (unsigned) PIC_OFFSET_TABLE_REGNUM && fixed_regs[i]) + ; +- else if (CALL_REALLY_USED_REGNO_P (i)) +- { +- SET_HARD_REG_BIT (regs_invalidated_by_call, i); +- SET_REGNO_REG_SET (regs_invalidated_by_call_regset, i); +- } ++ else if (call_used_regs[i]) ++ SET_HARD_REG_BIT (regs_invalidated_by_call, i); + } + +- COPY_HARD_REG_SET (call_fixed_reg_set, fixed_reg_set); +- COPY_HARD_REG_SET (fixed_nonglobal_reg_set, fixed_reg_set); ++ SET_HARD_REG_SET (savable_regs); ++ fixed_nonglobal_reg_set = fixed_reg_set; + + /* Preserve global registers if called more than once. */ + for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) +@@ -459,8 +387,6 @@ init_reg_sets_1 (void) + { + fixed_regs[i] = call_used_regs[i] = 1; + SET_HARD_REG_BIT (fixed_reg_set, i); +- SET_HARD_REG_BIT (call_used_reg_set, i); +- SET_HARD_REG_BIT (call_fixed_reg_set, i); + } + } + +@@ -493,6 +419,8 @@ init_reg_sets_1 (void) + } + } + } ++ ++ default_function_abi.initialize (0, regs_invalidated_by_call); + } + + /* Compute the table of register modes. 
+@@ -639,7 +567,7 @@ choose_hard_reg_mode (unsigned int regno ATTRIBUTE_UNUSED, + if (hard_regno_nregs (regno, mode) == nregs + && targetm.hard_regno_mode_ok (regno, mode) + && (!call_saved +- || !targetm.hard_regno_call_part_clobbered (NULL, regno, mode)) ++ || !targetm.hard_regno_call_part_clobbered (0, regno, mode)) + && maybe_gt (GET_MODE_SIZE (mode), GET_MODE_SIZE (found_mode))) + found_mode = mode; + +@@ -647,7 +575,7 @@ choose_hard_reg_mode (unsigned int regno ATTRIBUTE_UNUSED, + if (hard_regno_nregs (regno, mode) == nregs + && targetm.hard_regno_mode_ok (regno, mode) + && (!call_saved +- || !targetm.hard_regno_call_part_clobbered (NULL, regno, mode)) ++ || !targetm.hard_regno_call_part_clobbered (0, regno, mode)) + && maybe_gt (GET_MODE_SIZE (mode), GET_MODE_SIZE (found_mode))) + found_mode = mode; + +@@ -655,7 +583,7 @@ choose_hard_reg_mode (unsigned int regno ATTRIBUTE_UNUSED, + if (hard_regno_nregs (regno, mode) == nregs + && targetm.hard_regno_mode_ok (regno, mode) + && (!call_saved +- || !targetm.hard_regno_call_part_clobbered (NULL, regno, mode)) ++ || !targetm.hard_regno_call_part_clobbered (0, regno, mode)) + && maybe_gt (GET_MODE_SIZE (mode), GET_MODE_SIZE (found_mode))) + found_mode = mode; + +@@ -663,7 +591,7 @@ choose_hard_reg_mode (unsigned int regno ATTRIBUTE_UNUSED, + if (hard_regno_nregs (regno, mode) == nregs + && targetm.hard_regno_mode_ok (regno, mode) + && (!call_saved +- || !targetm.hard_regno_call_part_clobbered (NULL, regno, mode)) ++ || !targetm.hard_regno_call_part_clobbered (0, regno, mode)) + && maybe_gt (GET_MODE_SIZE (mode), GET_MODE_SIZE (found_mode))) + found_mode = mode; + +@@ -677,7 +605,7 @@ choose_hard_reg_mode (unsigned int regno ATTRIBUTE_UNUSED, + if (hard_regno_nregs (regno, mode) == nregs + && targetm.hard_regno_mode_ok (regno, mode) + && (!call_saved +- || !targetm.hard_regno_call_part_clobbered (NULL, regno, mode))) ++ || !targetm.hard_regno_call_part_clobbered (0, regno, mode))) + return mode; + } + +@@ -749,10 +677,11 @@ fix_register (const char *name, int fixed, int call_used) + else + { + fixed_regs[i] = fixed; +- call_used_regs[i] = call_used; + #ifdef CALL_REALLY_USED_REGISTERS + if (fixed == 0) +- call_really_used_regs[i] = call_used; ++ call_used_regs[i] = call_used; ++#else ++ call_used_regs[i] = call_used; + #endif + } + } +@@ -803,7 +732,8 @@ globalize_reg (tree decl, int i) + if (i != STACK_POINTER_REGNUM) + { + SET_HARD_REG_BIT (regs_invalidated_by_call, i); +- SET_REGNO_REG_SET (regs_invalidated_by_call_regset, i); ++ for (unsigned int i = 0; i < NUM_ABI_IDS; ++i) ++ function_abis[i].add_full_reg_clobber (i); + } + + /* If already fixed, nothing else to do. */ +@@ -811,13 +741,8 @@ globalize_reg (tree decl, int i) + return; + + fixed_regs[i] = call_used_regs[i] = 1; +-#ifdef CALL_REALLY_USED_REGISTERS +- call_really_used_regs[i] = 1; +-#endif + + SET_HARD_REG_BIT (fixed_reg_set, i); +- SET_HARD_REG_BIT (call_used_reg_set, i); +- SET_HARD_REG_BIT (call_fixed_reg_set, i); + + reinit_regs (); + } +@@ -1101,10 +1026,6 @@ reg_scan_mark_refs (rtx x, rtx_insn *insn) + reg_scan_mark_refs (XEXP (XEXP (x, 0), 0), insn); + break; + +- case CLOBBER_HIGH: +- gcc_assert (!(MEM_P (XEXP (x, 0)))); +- break; +- + case SET: + /* Count a set of the destination if it is a register. 
*/ + for (dest = SET_DEST (x); +@@ -1316,14 +1237,12 @@ record_subregs_of_mode (rtx subreg, bool partial_def) + } + + if (valid_mode_changes[regno]) +- AND_HARD_REG_SET (*valid_mode_changes[regno], +- simplifiable_subregs (shape)); ++ *valid_mode_changes[regno] &= simplifiable_subregs (shape); + else + { + valid_mode_changes[regno] + = XOBNEW (&valid_mode_changes_obstack, HARD_REG_SET); +- COPY_HARD_REG_SET (*valid_mode_changes[regno], +- simplifiable_subregs (shape)); ++ *valid_mode_changes[regno] = simplifiable_subregs (shape); + } + } + +diff --git a/gcc/regrename.c b/gcc/regrename.c +index 5259d565e..6f7fe0a6d 100644 +--- a/gcc/regrename.c ++++ b/gcc/regrename.c +@@ -33,6 +33,7 @@ + #include "addresses.h" + #include "cfganal.h" + #include "tree-pass.h" ++#include "function-abi.h" + #include "regrename.h" + + /* This file implements the RTL register renaming pass of the compiler. It is +@@ -253,7 +254,7 @@ create_new_chain (unsigned this_regno, unsigned this_nregs, rtx *loc, + CLEAR_HARD_REG_BIT (live_hard_regs, head->regno + nregs); + } + +- COPY_HARD_REG_SET (head->hard_conflicts, live_hard_regs); ++ head->hard_conflicts = live_hard_regs; + bitmap_set_bit (&open_chains_set, head->id); + + open_chains = head; +@@ -292,7 +293,7 @@ merge_overlapping_regs (HARD_REG_SET *pset, struct du_head *head) + { + bitmap_iterator bi; + unsigned i; +- IOR_HARD_REG_SET (*pset, head->hard_conflicts); ++ *pset |= head->hard_conflicts; + EXECUTE_IF_SET_IN_BITMAP (&head->conflicts, 0, i, bi) + { + du_head_p other = regrename_chain_from_id (i); +@@ -303,6 +304,18 @@ merge_overlapping_regs (HARD_REG_SET *pset, struct du_head *head) + } + } + ++/* Return true if (reg:MODE REGNO) would be clobbered by a call covered ++ by THIS_HEAD. */ ++ ++static bool ++call_clobbered_in_chain_p (du_head *this_head, machine_mode mode, ++ unsigned int regno) ++{ ++ return call_clobbered_in_region_p (this_head->call_abis, ++ this_head->call_clobber_mask, ++ mode, regno); ++} ++ + /* Check if NEW_REG can be the candidate register to rename for + REG in THIS_HEAD chain. THIS_UNAVAILABLE is a set of unavailable hard + registers. */ +@@ -322,7 +335,7 @@ check_new_reg_p (int reg ATTRIBUTE_UNUSED, int new_reg, + || global_regs[new_reg + i] + /* Can't use regs which aren't saved by the prologue. */ + || (! df_regs_ever_live_p (new_reg + i) +- && ! call_used_regs[new_reg + i]) ++ && ! crtl->abi->clobbers_full_reg_p (new_reg + i)) + #ifdef LEAF_REGISTERS + /* We can't use a non-leaf register if we're in a + leaf function. */ +@@ -337,11 +350,8 @@ check_new_reg_p (int reg ATTRIBUTE_UNUSED, int new_reg, + for (tmp = this_head->first; tmp; tmp = tmp->next_use) + if ((!targetm.hard_regno_mode_ok (new_reg, GET_MODE (*tmp->loc)) + && ! DEBUG_INSN_P (tmp->insn)) +- || (this_head->need_caller_save_reg +- && ! (targetm.hard_regno_call_part_clobbered +- (NULL, reg, GET_MODE (*tmp->loc))) +- && (targetm.hard_regno_call_part_clobbered +- (NULL, new_reg, GET_MODE (*tmp->loc))))) ++ || call_clobbered_in_chain_p (this_head, GET_MODE (*tmp->loc), ++ new_reg)) + return false; + + return true; +@@ -363,12 +373,6 @@ find_rename_reg (du_head_p this_head, enum reg_class super_class, + int pass; + int best_new_reg = old_reg; + +- /* Further narrow the set of registers we can use for renaming. +- If the chain needs a call-saved register, mark the call-used +- registers as unavailable. */ +- if (this_head->need_caller_save_reg) +- IOR_HARD_REG_SET (*unavailable, call_used_reg_set); +- + /* Mark registers that overlap this chain's lifetime as unavailable. 
*/ + merge_overlapping_regs (unavailable, this_head); + +@@ -441,8 +445,7 @@ regrename_find_superclass (du_head_p head, int *pn_uses, + if (DEBUG_INSN_P (tmp->insn)) + continue; + n_uses++; +- IOR_COMPL_HARD_REG_SET (*punavailable, +- reg_class_contents[tmp->cl]); ++ *punavailable |= ~reg_class_contents[tmp->cl]; + super_class + = reg_class_superunion[(int) super_class][(int) tmp->cl]; + } +@@ -486,7 +489,7 @@ rename_chains (void) + && reg == FRAME_POINTER_REGNUM)) + continue; + +- COPY_HARD_REG_SET (this_unavailable, unavailable); ++ this_unavailable = unavailable; + + reg_class super_class = regrename_find_superclass (this_head, &n_uses, + &this_unavailable); +@@ -500,7 +503,7 @@ rename_chains (void) + { + fprintf (dump_file, "Register %s in insn %d", + reg_names[reg], INSN_UID (this_head->first->insn)); +- if (this_head->need_caller_save_reg) ++ if (this_head->call_abis) + fprintf (dump_file, " crosses a call"); + } + +@@ -677,10 +680,11 @@ merge_chains (du_head_p c1, du_head_p c2) + c2->first = c2->last = NULL; + c2->id = c1->id; + +- IOR_HARD_REG_SET (c1->hard_conflicts, c2->hard_conflicts); ++ c1->hard_conflicts |= c2->hard_conflicts; + bitmap_ior_into (&c1->conflicts, &c2->conflicts); + +- c1->need_caller_save_reg |= c2->need_caller_save_reg; ++ c1->call_clobber_mask |= c2->call_clobber_mask; ++ c1->call_abis |= c2->call_abis; + c1->cannot_rename |= c2->cannot_rename; + } + +@@ -1740,7 +1744,7 @@ build_def_use (basic_block bb) + outside an operand, as live. */ + hide_operands (n_ops, old_operands, old_dups, untracked_operands, + false); +- note_stores (PATTERN (insn), note_sets_clobbers, &clobber_code); ++ note_stores (insn, note_sets_clobbers, &clobber_code); + restore_operands (insn, n_ops, old_operands, old_dups); + + /* Step 1b: Begin new chains for earlyclobbered writes inside +@@ -1834,9 +1838,15 @@ build_def_use (basic_block bb) + requires a caller-saved reg. */ + if (CALL_P (insn)) + { ++ function_abi callee_abi = insn_callee_abi (insn); + struct du_head *p; + for (p = open_chains; p; p = p->next_chain) +- p->need_caller_save_reg = 1; ++ { ++ p->call_abis |= (1 << callee_abi.id ()); ++ p->call_clobber_mask ++ |= callee_abi.full_and_partial_reg_clobbers (); ++ p->hard_conflicts |= callee_abi.full_reg_clobbers (); ++ } + } + + /* Step 5: Close open chains that overlap writes. Similar to +@@ -1856,7 +1866,7 @@ build_def_use (basic_block bb) + outside an operand, as live. */ + hide_operands (n_ops, old_operands, old_dups, untracked_operands, + false); +- note_stores (PATTERN (insn), note_sets_clobbers, &set_code); ++ note_stores (insn, note_sets_clobbers, &set_code); + restore_operands (insn, n_ops, old_operands, old_dups); + + /* Step 6b: Begin new chains for writes inside operands. */ +diff --git a/gcc/regrename.h b/gcc/regrename.h +index 37f5e398d..1bbf78fda 100644 +--- a/gcc/regrename.h ++++ b/gcc/regrename.h +@@ -40,9 +40,12 @@ struct du_head + bitmap_head conflicts; + /* Conflicts with untracked hard registers. */ + HARD_REG_SET hard_conflicts; ++ /* Which registers are fully or partially clobbered by the calls that ++ the chain crosses. */ ++ HARD_REG_SET call_clobber_mask; + +- /* Nonzero if the chain crosses a call. */ +- unsigned int need_caller_save_reg:1; ++ /* A bitmask of ABIs used by the calls that the chain crosses. */ ++ unsigned int call_abis : NUM_ABI_IDS; + /* Nonzero if the register is used in a way that prevents renaming, + such as the SET_DEST of a CALL_INSN or an asm operand that used + to be a hard register. 
*/ +diff --git a/gcc/regs.h b/gcc/regs.h +index 48b2e7081..821979ec6 100644 +--- a/gcc/regs.h ++++ b/gcc/regs.h +@@ -298,7 +298,7 @@ remove_from_hard_reg_set (HARD_REG_SET *regs, machine_mode mode, + /* Return true if REGS contains the whole of (reg:MODE REGNO). */ + + static inline bool +-in_hard_reg_set_p (const HARD_REG_SET regs, machine_mode mode, ++in_hard_reg_set_p (const_hard_reg_set regs, machine_mode mode, + unsigned int regno) + { + unsigned int end_regno; +@@ -323,7 +323,7 @@ in_hard_reg_set_p (const HARD_REG_SET regs, machine_mode mode, + /* Return true if (reg:MODE REGNO) includes an element of REGS. */ + + static inline bool +-overlaps_hard_reg_set_p (const HARD_REG_SET regs, machine_mode mode, ++overlaps_hard_reg_set_p (const_hard_reg_set regs, machine_mode mode, + unsigned int regno) + { + unsigned int end_regno; +@@ -363,7 +363,7 @@ remove_range_from_hard_reg_set (HARD_REG_SET *regs, unsigned int regno, + /* Like overlaps_hard_reg_set_p, but use a REGNO/NREGS range instead of + REGNO and MODE. */ + static inline bool +-range_overlaps_hard_reg_set_p (const HARD_REG_SET set, unsigned regno, ++range_overlaps_hard_reg_set_p (const_hard_reg_set set, unsigned regno, + int nregs) + { + while (nregs-- > 0) +@@ -375,7 +375,7 @@ range_overlaps_hard_reg_set_p (const HARD_REG_SET set, unsigned regno, + /* Like in_hard_reg_set_p, but use a REGNO/NREGS range instead of + REGNO and MODE. */ + static inline bool +-range_in_hard_reg_set_p (const HARD_REG_SET set, unsigned regno, int nregs) ++range_in_hard_reg_set_p (const_hard_reg_set set, unsigned regno, int nregs) + { + while (nregs-- > 0) + if (!TEST_HARD_REG_BIT (set, regno + nregs)) +@@ -383,8 +383,4 @@ range_in_hard_reg_set_p (const HARD_REG_SET set, unsigned regno, int nregs) + return true; + } + +-/* Get registers used by given function call instruction. */ +-extern bool get_call_reg_set_usage (rtx_insn *insn, HARD_REG_SET *reg_set, +- HARD_REG_SET default_set); +- + #endif /* GCC_REGS_H */ +diff --git a/gcc/regset.h b/gcc/regset.h +index 34a9eb457..72ff45891 100644 +--- a/gcc/regset.h ++++ b/gcc/regset.h +@@ -64,6 +64,10 @@ typedef bitmap regset; + /* Inclusive or a register set with a second register set. */ + #define IOR_REG_SET(TO, FROM) bitmap_ior_into (TO, FROM) + ++/* Same, but with FROM being a HARD_REG_SET. */ ++#define IOR_REG_SET_HRS(TO, FROM) \ ++ bitmap_ior_into (TO, bitmap_view (FROM)) ++ + /* Exclusive or a register set with a second register set. */ + #define XOR_REG_SET(TO, FROM) bitmap_xor_into (TO, FROM) + +@@ -107,14 +111,6 @@ typedef bitmap_iterator reg_set_iterator; + #define EXECUTE_IF_AND_IN_REG_SET(REGSET1, REGSET2, MIN, REGNUM, RSI) \ + EXECUTE_IF_AND_IN_BITMAP (REGSET1, REGSET2, MIN, REGNUM, RSI) \ + +-/* Same information as REGS_INVALIDATED_BY_CALL but in regset form to be used +- in dataflow more conveniently. */ +- +-extern regset regs_invalidated_by_call_regset; +- +-/* Same information as FIXED_REG_SET but in regset form. */ +-extern regset fixed_reg_set_regset; +- + /* An obstack for regsets. 
*/ + extern bitmap_obstack reg_obstack; + +diff --git a/gcc/reload.c b/gcc/reload.c +index 72cc38a0e..b7601307f 100644 +--- a/gcc/reload.c ++++ b/gcc/reload.c +@@ -6911,15 +6911,15 @@ find_equiv_reg (rtx goal, rtx_insn *insn, enum reg_class rclass, int other, + + if (regno >= 0 && regno < FIRST_PSEUDO_REGISTER) + for (i = 0; i < nregs; ++i) +- if (call_used_regs[regno + i] +- || targetm.hard_regno_call_part_clobbered (NULL, regno + i, ++ if (call_used_or_fixed_reg_p (regno + i) ++ || targetm.hard_regno_call_part_clobbered (0, regno + i, + mode)) + return 0; + + if (valueno >= 0 && valueno < FIRST_PSEUDO_REGISTER) + for (i = 0; i < valuenregs; ++i) +- if (call_used_regs[valueno + i] +- || targetm.hard_regno_call_part_clobbered (NULL, valueno + i, ++ if (call_used_or_fixed_reg_p (valueno + i) ++ || targetm.hard_regno_call_part_clobbered (0, valueno + i, + mode)) + return 0; + } +diff --git a/gcc/reload.h b/gcc/reload.h +index 813075b6f..fef6aa9da 100644 +--- a/gcc/reload.h ++++ b/gcc/reload.h +@@ -274,7 +274,7 @@ extern int reload_first_uid; + + extern int num_not_at_initial_offset; + +-#if defined SET_HARD_REG_BIT && defined CLEAR_REG_SET ++#if defined HARD_CONST && defined CLEAR_REG_SET + /* This structure describes instructions which are relevant for reload. + Apart from all regular insns, this also includes CODE_LABELs, since they + must be examined for register elimination. */ +@@ -325,7 +325,7 @@ extern struct insn_chain *reload_insn_chain; + extern struct insn_chain *new_insn_chain (void); + #endif + +-#if defined SET_HARD_REG_BIT ++#if defined HARD_CONST + extern void compute_use_by_pseudos (HARD_REG_SET *, bitmap); + #endif + +diff --git a/gcc/reload1.c b/gcc/reload1.c +index bb112d817..d36ebec60 100644 +--- a/gcc/reload1.c ++++ b/gcc/reload1.c +@@ -795,7 +795,9 @@ reload (rtx_insn *first, int global) + + if (crtl->saves_all_registers) + for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) +- if (! call_used_regs[i] && ! fixed_regs[i] && ! LOCAL_REGNO (i)) ++ if (! call_used_or_fixed_reg_p (i) ++ && ! fixed_regs[i] ++ && ! LOCAL_REGNO (i)) + df_set_regs_ever_live (i, true); + + /* Find all the pseudo registers that didn't get hard regs +@@ -843,7 +845,7 @@ reload (rtx_insn *first, int global) + cannot be done. */ + for (insn = first; insn && num_eliminable; insn = NEXT_INSN (insn)) + if (INSN_P (insn)) +- note_stores (PATTERN (insn), mark_not_eliminable, NULL); ++ note_pattern_stores (PATTERN (insn), mark_not_eliminable, NULL); + + maybe_fix_stack_asms (); + +@@ -1339,8 +1341,6 @@ maybe_fix_stack_asms (void) + rtx t = XVECEXP (pat, 0, i); + if (GET_CODE (t) == CLOBBER && STACK_REG_P (XEXP (t, 0))) + SET_HARD_REG_BIT (clobbered, REGNO (XEXP (t, 0))); +- /* CLOBBER_HIGH is only supported for LRA. */ +- gcc_assert (GET_CODE (t) != CLOBBER_HIGH); + } + + /* Get the operand values and constraints out of the insn. */ +@@ -1364,7 +1364,7 @@ maybe_fix_stack_asms (void) + { + /* End of one alternative - mark the regs in the current + class, and reset the class. */ +- IOR_HARD_REG_SET (allowed, reg_class_contents[cls]); ++ allowed |= reg_class_contents[cls]; + cls = NO_REGS; + p++; + if (c == '#') +@@ -1399,7 +1399,7 @@ maybe_fix_stack_asms (void) + /* Those of the registers which are clobbered, but allowed by the + constraints, must be usable as reload registers. So clear them + out of the life information. 
*/ +- AND_HARD_REG_SET (allowed, clobbered); ++ allowed &= clobbered; + for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) + if (TEST_HARD_REG_BIT (allowed, i)) + { +@@ -1732,7 +1732,7 @@ order_regs_for_reload (struct insn_chain *chain) + HARD_REG_SET used_by_pseudos2; + reg_set_iterator rsi; + +- COPY_HARD_REG_SET (bad_spill_regs, fixed_reg_set); ++ bad_spill_regs = fixed_reg_set; + + memset (spill_cost, 0, sizeof spill_cost); + memset (spill_add_cost, 0, sizeof spill_add_cost); +@@ -1745,8 +1745,8 @@ order_regs_for_reload (struct insn_chain *chain) + + REG_SET_TO_HARD_REG_SET (used_by_pseudos, &chain->live_throughout); + REG_SET_TO_HARD_REG_SET (used_by_pseudos2, &chain->dead_or_set); +- IOR_HARD_REG_SET (bad_spill_regs, used_by_pseudos); +- IOR_HARD_REG_SET (bad_spill_regs, used_by_pseudos2); ++ bad_spill_regs |= used_by_pseudos; ++ bad_spill_regs |= used_by_pseudos2; + + /* Now find out which pseudos are allocated to it, and update + hard_reg_n_uses. */ +@@ -1823,9 +1823,9 @@ find_reg (struct insn_chain *chain, int order) + static int regno_pseudo_regs[FIRST_PSEUDO_REGISTER]; + static int best_regno_pseudo_regs[FIRST_PSEUDO_REGISTER]; + +- COPY_HARD_REG_SET (not_usable, bad_spill_regs); +- IOR_HARD_REG_SET (not_usable, bad_spill_regs_global); +- IOR_COMPL_HARD_REG_SET (not_usable, reg_class_contents[rl->rclass]); ++ not_usable = (bad_spill_regs ++ | bad_spill_regs_global ++ | ~reg_class_contents[rl->rclass]); + + CLEAR_HARD_REG_SET (used_by_other_reload); + for (k = 0; k < order; k++) +@@ -1906,8 +1906,8 @@ find_reg (struct insn_chain *chain, int order) + && (inv_reg_alloc_order[regno] + < inv_reg_alloc_order[best_reg]) + #else +- && call_used_regs[regno] +- && ! call_used_regs[best_reg] ++ && call_used_or_fixed_reg_p (regno) ++ && ! call_used_or_fixed_reg_p (best_reg) + #endif + )) + { +@@ -2007,8 +2007,8 @@ find_reload_regs (struct insn_chain *chain) + } + } + +- COPY_HARD_REG_SET (chain->used_spill_regs, used_spill_regs_local); +- IOR_HARD_REG_SET (used_spill_regs, used_spill_regs_local); ++ chain->used_spill_regs = used_spill_regs_local; ++ used_spill_regs |= used_spill_regs_local; + + memcpy (chain->rld, rld, n_reloads * sizeof (struct reload)); + } +@@ -2881,7 +2881,6 @@ eliminate_regs_1 (rtx x, machine_mode mem_mode, rtx insn, + return x; + + case CLOBBER: +- case CLOBBER_HIGH: + case ASM_OPERANDS: + gcc_assert (insn && DEBUG_INSN_P (insn)); + break; +@@ -3092,10 +3091,6 @@ elimination_effects (rtx x, machine_mode mem_mode) + elimination_effects (XEXP (x, 0), mem_mode); + return; + +- case CLOBBER_HIGH: +- /* CLOBBER_HIGH is only supported for LRA. */ +- return; +- + case SET: + /* Check for setting a register that we know about. */ + if (REG_P (SET_DEST (x))) +@@ -3817,9 +3812,6 @@ mark_not_eliminable (rtx dest, const_rtx x, void *data ATTRIBUTE_UNUSED) + if (dest == hard_frame_pointer_rtx) + return; + +- /* CLOBBER_HIGH is only supported for LRA. 
*/ +- gcc_assert (GET_CODE (x) != CLOBBER_HIGH); +- + for (i = 0; i < NUM_ELIMINABLE_REGS; i++) + if (reg_eliminate[i].can_eliminate && dest == reg_eliminate[i].to_rtx + && (GET_CODE (x) != SET +@@ -4020,7 +4012,7 @@ update_eliminables_and_spill (void) + HARD_REG_SET to_spill; + CLEAR_HARD_REG_SET (to_spill); + update_eliminables (&to_spill); +- AND_COMPL_HARD_REG_SET (used_spill_regs, to_spill); ++ used_spill_regs &= ~to_spill; + + for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) + if (TEST_HARD_REG_BIT (to_spill, i)) +@@ -4346,14 +4338,12 @@ finish_spills (int global) + EXECUTE_IF_SET_IN_REG_SET + (&chain->live_throughout, FIRST_PSEUDO_REGISTER, i, rsi) + { +- IOR_HARD_REG_SET (pseudo_forbidden_regs[i], +- chain->used_spill_regs); ++ pseudo_forbidden_regs[i] |= chain->used_spill_regs; + } + EXECUTE_IF_SET_IN_REG_SET + (&chain->dead_or_set, FIRST_PSEUDO_REGISTER, i, rsi) + { +- IOR_HARD_REG_SET (pseudo_forbidden_regs[i], +- chain->used_spill_regs); ++ pseudo_forbidden_regs[i] |= chain->used_spill_regs; + } + } + +@@ -4397,7 +4387,7 @@ finish_spills (int global) + { + REG_SET_TO_HARD_REG_SET (used_by_pseudos, &chain->live_throughout); + REG_SET_TO_HARD_REG_SET (used_by_pseudos2, &chain->dead_or_set); +- IOR_HARD_REG_SET (used_by_pseudos, used_by_pseudos2); ++ used_by_pseudos |= used_by_pseudos2; + + compute_use_by_pseudos (&used_by_pseudos, &chain->live_throughout); + compute_use_by_pseudos (&used_by_pseudos, &chain->dead_or_set); +@@ -4405,8 +4395,7 @@ finish_spills (int global) + may be not included in the value calculated here because + of possible removing caller-saves insns (see function + delete_caller_save_insns. */ +- COMPL_HARD_REG_SET (chain->used_spill_regs, used_by_pseudos); +- AND_HARD_REG_SET (chain->used_spill_regs, used_spill_regs); ++ chain->used_spill_regs = ~used_by_pseudos & used_spill_regs; + } + } + +@@ -4455,7 +4444,6 @@ scan_paradoxical_subregs (rtx x) + case PC: + case USE: + case CLOBBER: +- case CLOBBER_HIGH: + return; + + case SUBREG: +@@ -4589,7 +4577,7 @@ reload_as_needed (int live_known) + { + regset_head regs_to_forget; + INIT_REG_SET (®s_to_forget); +- note_stores (PATTERN (insn), forget_old_reloads_1, ®s_to_forget); ++ note_stores (insn, forget_old_reloads_1, ®s_to_forget); + + /* If this is a USE and CLOBBER of a MEM, ensure that any + references to eliminable registers have been removed. */ +@@ -4716,7 +4704,7 @@ reload_as_needed (int live_known) + between INSN and NEXT and use them to forget old reloads. */ + for (rtx_insn *x = NEXT_INSN (insn); x != old_next; x = NEXT_INSN (x)) + if (NONJUMP_INSN_P (x) && GET_CODE (PATTERN (x)) == CLOBBER) +- note_stores (PATTERN (x), forget_old_reloads_1, NULL); ++ note_stores (x, forget_old_reloads_1, NULL); + + #if AUTO_INC_DEC + /* Likewise for regs altered by auto-increment in this insn. +@@ -4882,8 +4870,8 @@ reload_as_needed (int live_known) + be partially clobbered by the call. */ + else if (CALL_P (insn)) + { +- AND_COMPL_HARD_REG_SET (reg_reloaded_valid, call_used_reg_set); +- AND_COMPL_HARD_REG_SET (reg_reloaded_valid, reg_reloaded_call_part_clobbered); ++ reg_reloaded_valid &= ~(call_used_or_fixed_regs ++ | reg_reloaded_call_part_clobbered); + + /* If this is a call to a setjmp-type function, we must not + reuse any reload reg contents across the call; that will +@@ -4910,8 +4898,7 @@ reload_as_needed (int live_known) + to be forgotten later. 
*/ + + static void +-forget_old_reloads_1 (rtx x, const_rtx setter, +- void *data) ++forget_old_reloads_1 (rtx x, const_rtx, void *data) + { + unsigned int regno; + unsigned int nr; +@@ -4930,9 +4917,6 @@ forget_old_reloads_1 (rtx x, const_rtx setter, + if (!REG_P (x)) + return; + +- /* CLOBBER_HIGH is only supported for LRA. */ +- gcc_assert (setter == NULL_RTX || GET_CODE (setter) != CLOBBER_HIGH); +- + regno = REGNO (x); + + if (regno >= FIRST_PSEUDO_REGISTER) +@@ -6335,9 +6319,9 @@ choose_reload_regs_init (struct insn_chain *chain, rtx *save_reload_reg_rtx) + { + HARD_REG_SET tmp; + REG_SET_TO_HARD_REG_SET (tmp, &chain->live_throughout); +- IOR_HARD_REG_SET (reg_used_in_insn, tmp); ++ reg_used_in_insn |= tmp; + REG_SET_TO_HARD_REG_SET (tmp, &chain->dead_or_set); +- IOR_HARD_REG_SET (reg_used_in_insn, tmp); ++ reg_used_in_insn |= tmp; + compute_use_by_pseudos (®_used_in_insn, &chain->live_throughout); + compute_use_by_pseudos (®_used_in_insn, &chain->dead_or_set); + } +@@ -6352,7 +6336,7 @@ choose_reload_regs_init (struct insn_chain *chain, rtx *save_reload_reg_rtx) + CLEAR_HARD_REG_SET (reload_reg_used_in_outaddr_addr[i]); + } + +- COMPL_HARD_REG_SET (reload_reg_unavailable, chain->used_spill_regs); ++ reload_reg_unavailable = ~chain->used_spill_regs; + + CLEAR_HARD_REG_SET (reload_reg_used_for_inherit); + +@@ -7797,7 +7781,7 @@ emit_output_reload_insns (struct insn_chain *chain, struct reload *rl, + clear any memory of reloaded copies of the pseudo reg. + If this output reload comes from a spill reg, + reg_has_output_reload will make this do nothing. */ +- note_stores (pat, forget_old_reloads_1, NULL); ++ note_stores (p, forget_old_reloads_1, NULL); + + if (reg_mentioned_p (rl_reg_rtx, pat)) + { +@@ -8289,8 +8273,7 @@ emit_reload_insns (struct insn_chain *chain) + : out_regno + k); + reg_reloaded_insn[regno + k] = insn; + SET_HARD_REG_BIT (reg_reloaded_valid, regno + k); +- if (targetm.hard_regno_call_part_clobbered (NULL, +- regno + k, ++ if (targetm.hard_regno_call_part_clobbered (0, regno + k, + mode)) + SET_HARD_REG_BIT (reg_reloaded_call_part_clobbered, + regno + k); +@@ -8370,8 +8353,7 @@ emit_reload_insns (struct insn_chain *chain) + : in_regno + k); + reg_reloaded_insn[regno + k] = insn; + SET_HARD_REG_BIT (reg_reloaded_valid, regno + k); +- if (targetm.hard_regno_call_part_clobbered (NULL, +- regno + k, ++ if (targetm.hard_regno_call_part_clobbered (0, regno + k, + mode)) + SET_HARD_REG_BIT (reg_reloaded_call_part_clobbered, + regno + k); +@@ -8487,7 +8469,7 @@ emit_reload_insns (struct insn_chain *chain) + CLEAR_HARD_REG_BIT (reg_reloaded_dead, src_regno + k); + SET_HARD_REG_BIT (reg_reloaded_valid, src_regno + k); + if (targetm.hard_regno_call_part_clobbered +- (NULL, src_regno + k, mode)) ++ (0, src_regno + k, mode)) + SET_HARD_REG_BIT (reg_reloaded_call_part_clobbered, + src_regno + k); + else +@@ -8516,7 +8498,7 @@ emit_reload_insns (struct insn_chain *chain) + } + } + } +- IOR_HARD_REG_SET (reg_reloaded_dead, reg_reloaded_died); ++ reg_reloaded_dead |= reg_reloaded_died; + } + + /* Go through the motions to emit INSN and test if it is strictly valid. 
+diff --git a/gcc/reorg.c b/gcc/reorg.c +index bdfcf8851..cba183e9c 100644 +--- a/gcc/reorg.c ++++ b/gcc/reorg.c +@@ -410,8 +410,7 @@ find_end_label (rtx kind) + while (NOTE_P (insn) + || (NONJUMP_INSN_P (insn) + && (GET_CODE (PATTERN (insn)) == USE +- || GET_CODE (PATTERN (insn)) == CLOBBER +- || GET_CODE (PATTERN (insn)) == CLOBBER_HIGH))) ++ || GET_CODE (PATTERN (insn)) == CLOBBER))) + insn = PREV_INSN (insn); + + /* When a target threads its epilogue we might already have a +@@ -1311,8 +1310,7 @@ try_merge_delay_insns (rtx_insn *insn, rtx_insn *thread) + + /* TRIAL must be a CALL_INSN or INSN. Skip USE and CLOBBER. */ + if (NONJUMP_INSN_P (trial) +- && (GET_CODE (pat) == USE || GET_CODE (pat) == CLOBBER +- || GET_CODE (pat) == CLOBBER_HIGH)) ++ && (GET_CODE (pat) == USE || GET_CODE (pat) == CLOBBER)) + continue; + + if (GET_CODE (next_to_match) == GET_CODE (trial) +@@ -1506,8 +1504,7 @@ redundant_insn (rtx insn, rtx_insn *target, const vec &delay_list) + --insns_to_search; + + pat = PATTERN (trial); +- if (GET_CODE (pat) == USE || GET_CODE (pat) == CLOBBER +- || GET_CODE (pat) == CLOBBER_HIGH) ++ if (GET_CODE (pat) == USE || GET_CODE (pat) == CLOBBER) + continue; + + if (GET_CODE (trial) == DEBUG_INSN) +@@ -1575,7 +1572,7 @@ redundant_insn (rtx insn, rtx_insn *target, const vec &delay_list) + /* Insns we pass may not set either NEEDED or SET, so merge them for + simpler tests. */ + needed.memory |= set.memory; +- IOR_HARD_REG_SET (needed.regs, set.regs); ++ needed.regs |= set.regs; + + /* This insn isn't redundant if it conflicts with an insn that either is + or will be in a delay slot of TARGET. */ +@@ -1605,8 +1602,7 @@ redundant_insn (rtx insn, rtx_insn *target, const vec &delay_list) + --insns_to_search; + + pat = PATTERN (trial); +- if (GET_CODE (pat) == USE || GET_CODE (pat) == CLOBBER +- || GET_CODE (pat) == CLOBBER_HIGH) ++ if (GET_CODE (pat) == USE || GET_CODE (pat) == CLOBBER) + continue; + + if (GET_CODE (trial) == DEBUG_INSN) +@@ -1718,8 +1714,7 @@ own_thread_p (rtx thread, rtx label, int allow_fallthrough) + || LABEL_P (insn) + || (NONJUMP_INSN_P (insn) + && GET_CODE (PATTERN (insn)) != USE +- && GET_CODE (PATTERN (insn)) != CLOBBER +- && GET_CODE (PATTERN (insn)) != CLOBBER_HIGH)) ++ && GET_CODE (PATTERN (insn)) != CLOBBER)) + return 0; + + return 1; +@@ -2042,8 +2037,7 @@ fill_simple_delay_slots (int non_jumps_p) + pat = PATTERN (trial); + + /* Stand-alone USE and CLOBBER are just for flow. */ +- if (GET_CODE (pat) == USE || GET_CODE (pat) == CLOBBER +- || GET_CODE (pat) == CLOBBER_HIGH) ++ if (GET_CODE (pat) == USE || GET_CODE (pat) == CLOBBER) + continue; + + /* And DEBUG_INSNs never go into delay slots. */ +@@ -2169,8 +2163,7 @@ fill_simple_delay_slots (int non_jumps_p) + pat = PATTERN (trial); + + /* Stand-alone USE and CLOBBER are just for flow. */ +- if (GET_CODE (pat) == USE || GET_CODE (pat) == CLOBBER +- || GET_CODE (pat) == CLOBBER_HIGH) ++ if (GET_CODE (pat) == USE || GET_CODE (pat) == CLOBBER) + continue; + + /* And DEBUG_INSNs do not go in delay slots. */ +@@ -2438,8 +2431,7 @@ fill_slots_from_thread (rtx_jump_insn *insn, rtx condition, + } + + pat = PATTERN (trial); +- if (GET_CODE (pat) == USE || GET_CODE (pat) == CLOBBER +- || GET_CODE (pat) == CLOBBER_HIGH) ++ if (GET_CODE (pat) == USE || GET_CODE (pat) == CLOBBER) + continue; + + if (GET_CODE (trial) == DEBUG_INSN) +@@ -3833,8 +3825,7 @@ dbr_schedule (rtx_insn *first) + if (! 
insn->deleted () + && NONJUMP_INSN_P (insn) + && GET_CODE (PATTERN (insn)) != USE +- && GET_CODE (PATTERN (insn)) != CLOBBER +- && GET_CODE (PATTERN (insn)) != CLOBBER_HIGH) ++ && GET_CODE (PATTERN (insn)) != CLOBBER) + { + if (GET_CODE (PATTERN (insn)) == SEQUENCE) + { +diff --git a/gcc/resource.c b/gcc/resource.c +index c4bcfd7dc..bf2d6beaf 100644 +--- a/gcc/resource.c ++++ b/gcc/resource.c +@@ -30,6 +30,7 @@ along with GCC; see the file COPYING3. If not see + #include "resource.h" + #include "insn-attr.h" + #include "params.h" ++#include "function-abi.h" + + /* This structure is used to record liveness information at the targets or + fallthrough insns of branches. We will most likely need the information +@@ -108,11 +109,6 @@ update_live_status (rtx dest, const_rtx x, void *data ATTRIBUTE_UNUSED) + if (GET_CODE (x) == CLOBBER) + for (i = first_regno; i < last_regno; i++) + CLEAR_HARD_REG_BIT (current_live_regs, i); +- else if (GET_CODE (x) == CLOBBER_HIGH) +- /* No current target supports both branch delay slots and CLOBBER_HIGH. +- We'd need more elaborate liveness tracking to handle that +- combination. */ +- gcc_unreachable (); + else + for (i = first_regno; i < last_regno; i++) + { +@@ -298,7 +294,6 @@ mark_referenced_resources (rtx x, struct resources *res, + return; + + case CLOBBER: +- case CLOBBER_HIGH: + return; + + case CALL_INSN: +@@ -450,8 +445,8 @@ find_dead_or_set_registers (rtx_insn *target, struct resources *res, + case CODE_LABEL: + /* After a label, any pending dead registers that weren't yet + used can be made dead. */ +- AND_COMPL_HARD_REG_SET (pending_dead_regs, needed.regs); +- AND_COMPL_HARD_REG_SET (res->regs, pending_dead_regs); ++ pending_dead_regs &= ~needed.regs; ++ res->regs &= ~pending_dead_regs; + CLEAR_HARD_REG_SET (pending_dead_regs); + + continue; +@@ -565,14 +560,12 @@ find_dead_or_set_registers (rtx_insn *target, struct resources *res, + } + + target_res = *res; +- COPY_HARD_REG_SET (scratch, target_set.regs); +- AND_COMPL_HARD_REG_SET (scratch, needed.regs); +- AND_COMPL_HARD_REG_SET (target_res.regs, scratch); ++ scratch = target_set.regs & ~needed.regs; ++ target_res.regs &= ~scratch; + + fallthrough_res = *res; +- COPY_HARD_REG_SET (scratch, set.regs); +- AND_COMPL_HARD_REG_SET (scratch, needed.regs); +- AND_COMPL_HARD_REG_SET (fallthrough_res.regs, scratch); ++ scratch = set.regs & ~needed.regs; ++ fallthrough_res.regs &= ~scratch; + + if (!ANY_RETURN_P (this_jump_insn->jump_label ())) + find_dead_or_set_registers +@@ -581,8 +574,8 @@ find_dead_or_set_registers (rtx_insn *target, struct resources *res, + find_dead_or_set_registers (next_insn, + &fallthrough_res, 0, jump_count, + set, needed); +- IOR_HARD_REG_SET (fallthrough_res.regs, target_res.regs); +- AND_HARD_REG_SET (res->regs, fallthrough_res.regs); ++ fallthrough_res.regs |= target_res.regs; ++ res->regs &= fallthrough_res.regs; + break; + } + else +@@ -601,9 +594,8 @@ find_dead_or_set_registers (rtx_insn *target, struct resources *res, + mark_referenced_resources (insn, &needed, true); + mark_set_resources (insn, &set, 0, MARK_SRC_DEST_CALL); + +- COPY_HARD_REG_SET (scratch, set.regs); +- AND_COMPL_HARD_REG_SET (scratch, needed.regs); +- AND_COMPL_HARD_REG_SET (res->regs, scratch); ++ scratch = set.regs & ~needed.regs; ++ res->regs &= ~scratch; + } + + return jump_insn; +@@ -665,24 +657,16 @@ mark_set_resources (rtx x, struct resources *res, int in_dest, + { + rtx_call_insn *call_insn = as_a (x); + rtx link; +- HARD_REG_SET regs; + + res->cc = res->memory = 1; + +- get_call_reg_set_usage 
(call_insn, ®s, regs_invalidated_by_call); +- IOR_HARD_REG_SET (res->regs, regs); ++ res->regs |= insn_callee_abi (call_insn).full_reg_clobbers (); + + for (link = CALL_INSN_FUNCTION_USAGE (call_insn); + link; link = XEXP (link, 1)) +- { +- /* We could support CLOBBER_HIGH and treat it in the same way as +- HARD_REGNO_CALL_PART_CLOBBERED, but no port needs that +- yet. */ +- gcc_assert (GET_CODE (XEXP (link, 0)) != CLOBBER_HIGH); +- if (GET_CODE (XEXP (link, 0)) == CLOBBER) +- mark_set_resources (SET_DEST (XEXP (link, 0)), res, 1, +- MARK_SRC_DEST); +- } ++ if (GET_CODE (XEXP (link, 0)) == CLOBBER) ++ mark_set_resources (SET_DEST (XEXP (link, 0)), res, 1, ++ MARK_SRC_DEST); + + /* Check for a REG_SETJMP. If it exists, then we must + assume that this call can clobber any register. */ +@@ -725,12 +709,6 @@ mark_set_resources (rtx x, struct resources *res, int in_dest, + mark_set_resources (XEXP (x, 0), res, 1, MARK_SRC_DEST); + return; + +- case CLOBBER_HIGH: +- /* No current target supports both branch delay slots and CLOBBER_HIGH. +- We'd need more elaborate liveness tracking to handle that +- combination. */ +- gcc_unreachable (); +- + case SEQUENCE: + { + rtx_sequence *seq = as_a (x); +@@ -960,7 +938,7 @@ mark_target_live_regs (rtx_insn *insns, rtx target_maybe_return, struct resource + update it below. */ + if (b == tinfo->block && b != -1 && tinfo->bb_tick == bb_ticks[b]) + { +- COPY_HARD_REG_SET (res->regs, tinfo->live_regs); ++ res->regs = tinfo->live_regs; + return; + } + } +@@ -1041,15 +1019,12 @@ mark_target_live_regs (rtx_insn *insns, rtx target_maybe_return, struct resource + predicated instruction, or if the CALL is NORETURN. */ + if (GET_CODE (PATTERN (real_insn)) != COND_EXEC) + { +- HARD_REG_SET regs_invalidated_by_this_call; +- get_call_reg_set_usage (real_insn, +- ®s_invalidated_by_this_call, +- regs_invalidated_by_call); ++ HARD_REG_SET regs_invalidated_by_this_call ++ = insn_callee_abi (real_insn).full_reg_clobbers (); + /* CALL clobbers all call-used regs that aren't fixed except + sp, ap, and fp. Do this before setting the result of the + call live. */ +- AND_COMPL_HARD_REG_SET (current_live_regs, +- regs_invalidated_by_this_call); ++ current_live_regs &= ~regs_invalidated_by_this_call; + } + + /* A CALL_INSN sets any global register live, since it may +@@ -1078,7 +1053,7 @@ mark_target_live_regs (rtx_insn *insns, rtx target_maybe_return, struct resource + GET_MODE (XEXP (link, 0)), + REGNO (XEXP (link, 0))); + +- note_stores (PATTERN (real_insn), update_live_status, NULL); ++ note_stores (real_insn, update_live_status, NULL); + + /* If any registers were unused after this insn, kill them. + These notes will always be accurate. */ +@@ -1097,7 +1072,7 @@ mark_target_live_regs (rtx_insn *insns, rtx target_maybe_return, struct resource + + /* A label clobbers the pending dead registers since neither + reload nor jump will propagate a value across a label. 
*/ +- AND_COMPL_HARD_REG_SET (current_live_regs, pending_dead_regs); ++ current_live_regs &= ~pending_dead_regs; + CLEAR_HARD_REG_SET (pending_dead_regs); + + /* We must conservatively assume that all registers that used +@@ -1109,7 +1084,7 @@ mark_target_live_regs (rtx_insn *insns, rtx target_maybe_return, struct resource + HARD_REG_SET extra_live; + + REG_SET_TO_HARD_REG_SET (extra_live, DF_LR_IN (bb)); +- IOR_HARD_REG_SET (current_live_regs, extra_live); ++ current_live_regs |= extra_live; + } + } + +@@ -1118,10 +1093,10 @@ mark_target_live_regs (rtx_insn *insns, rtx target_maybe_return, struct resource + are implicitly required at that point. */ + else if (NOTE_P (real_insn) + && NOTE_KIND (real_insn) == NOTE_INSN_EPILOGUE_BEG) +- IOR_HARD_REG_SET (current_live_regs, start_of_epilogue_needs.regs); ++ current_live_regs |= start_of_epilogue_needs.regs; + } + +- COPY_HARD_REG_SET (res->regs, current_live_regs); ++ res->regs = current_live_regs; + if (tinfo != NULL) + { + tinfo->block = b; +@@ -1160,20 +1135,17 @@ mark_target_live_regs (rtx_insn *insns, rtx target_maybe_return, struct resource + { + mark_referenced_resources (insn, &needed, true); + +- COPY_HARD_REG_SET (scratch, needed.regs); +- AND_COMPL_HARD_REG_SET (scratch, set.regs); +- IOR_HARD_REG_SET (new_resources.regs, scratch); ++ scratch = needed.regs & ~set.regs; ++ new_resources.regs |= scratch; + + mark_set_resources (insn, &set, 0, MARK_SRC_DEST_CALL); + } + +- IOR_HARD_REG_SET (res->regs, new_resources.regs); ++ res->regs |= new_resources.regs; + } + + if (tinfo != NULL) +- { +- COPY_HARD_REG_SET (tinfo->live_regs, res->regs); +- } ++ tinfo->live_regs = res->regs; + } + + /* Initialize the resources required by mark_target_live_regs (). +diff --git a/gcc/rtl.c b/gcc/rtl.c +index d7b8e9877..ec65fbb37 100644 +--- a/gcc/rtl.c ++++ b/gcc/rtl.c +@@ -315,10 +315,6 @@ copy_rtx (rtx orig) + return orig; + break; + +- case CLOBBER_HIGH: +- gcc_assert (REG_P (XEXP (orig, 0))); +- return orig; +- + case CONST: + if (shared_const_p (orig)) + return orig; +diff --git a/gcc/rtl.def b/gcc/rtl.def +index f4c9d946c..edb34c5ac 100644 +--- a/gcc/rtl.def ++++ b/gcc/rtl.def +@@ -312,16 +312,6 @@ DEF_RTL_EXPR(USE, "use", "e", RTX_EXTRA) + is considered undeletable before reload. */ + DEF_RTL_EXPR(CLOBBER, "clobber", "e", RTX_EXTRA) + +-/* Indicate that the upper parts of something are clobbered in a way that we +- don't want to explain. The MODE references the lower bits that will be +- preserved. Anything above that size will be clobbered. +- +- CLOBBER_HIGH only occurs as the operand of a PARALLEL rtx. It cannot appear +- in other contexts, and unlike CLOBBER, it cannot appear on its own. +- CLOBBER_HIGH can only be used with fixed register rtxes. */ +- +-DEF_RTL_EXPR(CLOBBER_HIGH, "clobber_high", "e", RTX_EXTRA) +- + /* Call a subroutine. + Operand 1 is the address to call. + Operand 2 is the number of arguments. */ +@@ -936,6 +926,12 @@ DEF_RTL_EXPR(DEFINE_SPLIT, "define_split", "EsES", RTX_EXTRA) + 7: optionally, a vector of attributes for this insn. */ + DEF_RTL_EXPR(DEFINE_INSN_AND_SPLIT, "define_insn_and_split", "sEsTsESV", RTX_EXTRA) + ++/* A form of define_insn_and_split in which the split insn pattern (operand 5) ++ is determined automatically by replacing match_operands with match_dups ++ and match_operators with match_op_dups. The operands are the same as ++ define_insn_and_split but with operand 5 removed. 
*/ ++DEF_RTL_EXPR(DEFINE_INSN_AND_REWRITE, "define_insn_and_rewrite", "sEsTsSV", RTX_EXTRA) ++ + /* Definition of an RTL peephole operation. + Follows the same arguments as define_split. */ + DEF_RTL_EXPR(DEFINE_PEEPHOLE2, "define_peephole2", "EsES", RTX_EXTRA) +diff --git a/gcc/rtl.h b/gcc/rtl.h +index b4a906f91..6093d42c0 100644 +--- a/gcc/rtl.h ++++ b/gcc/rtl.h +@@ -1623,11 +1623,17 @@ extern const char * const reg_note_name[]; + #define GET_REG_NOTE_NAME(MODE) (reg_note_name[(int) (MODE)]) + + /* This field is only present on CALL_INSNs. It holds a chain of EXPR_LIST of +- USE and CLOBBER expressions. ++ USE, CLOBBER and SET expressions. + USE expressions list the registers filled with arguments that + are passed to the function. + CLOBBER expressions document the registers explicitly clobbered + by this CALL_INSN. ++ SET expressions say that the return value of the call (the SET_DEST) ++ is equivalent to a value available before the call (the SET_SRC). ++ This kind of SET is used when the return value is predictable in ++ advance. It is purely an optimisation hint; unlike USEs and CLOBBERs, ++ it does not affect register liveness. ++ + Pseudo registers cannot be mentioned in this list. */ + #define CALL_INSN_FUNCTION_USAGE(INSN) XEXP(INSN, 7) + +@@ -2392,12 +2398,30 @@ extern int rtx_cost (rtx, machine_mode, enum rtx_code, int, bool); + extern int address_cost (rtx, machine_mode, addr_space_t, bool); + extern void get_full_rtx_cost (rtx, machine_mode, enum rtx_code, int, + struct full_rtx_costs *); ++extern bool native_encode_rtx (machine_mode, rtx, vec &, ++ unsigned int, unsigned int); ++extern rtx native_decode_rtx (machine_mode, vec, ++ unsigned int); ++extern rtx native_decode_vector_rtx (machine_mode, vec, ++ unsigned int, unsigned int, unsigned int); + extern poly_uint64 subreg_lsb (const_rtx); +-extern poly_uint64 subreg_lsb_1 (machine_mode, machine_mode, poly_uint64); ++extern poly_uint64 subreg_size_lsb (poly_uint64, poly_uint64, poly_uint64); + extern poly_uint64 subreg_size_offset_from_lsb (poly_uint64, poly_uint64, + poly_uint64); + extern bool read_modify_subreg_p (const_rtx); + ++/* Given a subreg's OUTER_MODE, INNER_MODE, and SUBREG_BYTE, return the ++ bit offset at which the subreg begins (counting from the least significant ++ bit of the operand). */ ++ ++inline poly_uint64 ++subreg_lsb_1 (machine_mode outer_mode, machine_mode inner_mode, ++ poly_uint64 subreg_byte) ++{ ++ return subreg_size_lsb (GET_MODE_SIZE (outer_mode), ++ GET_MODE_SIZE (inner_mode), subreg_byte); ++} ++ + /* Return the subreg byte offset for a subreg whose outer mode is + OUTER_MODE, whose inner mode is INNER_MODE, and where there are + LSB_SHIFT *bits* between the lsb of the outer value and the lsb of +@@ -2645,7 +2669,7 @@ do { \ + + /* For a SET rtx, SET_DEST is the place that is set + and SET_SRC is the value it is set to. 
*/ +-#define SET_DEST(RTX) XC3EXP (RTX, 0, SET, CLOBBER, CLOBBER_HIGH) ++#define SET_DEST(RTX) XC2EXP (RTX, 0, SET, CLOBBER) + #define SET_SRC(RTX) XCEXP (RTX, 1, SET) + #define SET_IS_RETURN_P(RTX) \ + (RTL_FLAG_CHECK1 ("SET_IS_RETURN_P", (RTX), SET)->jump) +@@ -3369,8 +3393,7 @@ extern bool val_signbit_known_clear_p (machine_mode, + unsigned HOST_WIDE_INT); + + /* In reginfo.c */ +-extern machine_mode choose_hard_reg_mode (unsigned int, unsigned int, +- bool); ++extern machine_mode choose_hard_reg_mode (unsigned int, unsigned int, bool); + extern const HARD_REG_SET &simplifiable_subregs (const subreg_shape &); + + /* In emit-rtl.c */ +@@ -3407,6 +3430,7 @@ extern int rtx_unstable_p (const_rtx); + extern bool rtx_varies_p (const_rtx, bool); + extern bool rtx_addr_varies_p (const_rtx, bool); + extern rtx get_call_rtx_from (rtx); ++extern tree get_call_fndecl (const rtx_insn *); + extern HOST_WIDE_INT get_integer_term (const_rtx); + extern rtx get_related_value (const_rtx); + extern bool offset_within_block_p (const_rtx, HOST_WIDE_INT); +@@ -3435,7 +3459,10 @@ extern void record_hard_reg_sets (rtx, const_rtx, void *); + extern void record_hard_reg_uses (rtx *, void *); + extern void find_all_hard_regs (const_rtx, HARD_REG_SET *); + extern void find_all_hard_reg_sets (const rtx_insn *, HARD_REG_SET *, bool); +-extern void note_stores (const_rtx, void (*) (rtx, const_rtx, void *), void *); ++extern void note_pattern_stores (const_rtx, ++ void (*) (rtx, const_rtx, void *), void *); ++extern void note_stores (const rtx_insn *, ++ void (*) (rtx, const_rtx, void *), void *); + extern void note_uses (rtx *, void (*) (rtx *, void *), void *); + extern int dead_or_set_p (const rtx_insn *, const_rtx); + extern int dead_or_set_regno_p (const rtx_insn *, unsigned int); +@@ -3476,16 +3503,6 @@ extern bool tablejump_p (const rtx_insn *, rtx_insn **, rtx_jump_table_data **); + extern int computed_jump_p (const rtx_insn *); + extern bool tls_referenced_p (const_rtx); + extern bool contains_mem_rtx_p (rtx x); +-extern bool reg_is_clobbered_by_clobber_high (unsigned int, machine_mode, +- const_rtx); +- +-/* Convenient wrapper for reg_is_clobbered_by_clobber_high. */ +-inline bool +-reg_is_clobbered_by_clobber_high (const_rtx x, const_rtx clobber_high_op) +-{ +- return reg_is_clobbered_by_clobber_high (REGNO (x), GET_MODE (x), +- clobber_high_op); +-} + + /* Overload for refers_to_regno_p for checking a single register. */ + inline bool +@@ -4279,7 +4296,6 @@ extern void vt_equate_reg_base_value (const_rtx, const_rtx); + extern bool memory_modified_in_insn_p (const_rtx, const_rtx); + extern bool may_be_sp_based_p (rtx); + extern rtx gen_hard_reg_clobber (machine_mode, unsigned int); +-extern rtx gen_hard_reg_clobber_high (machine_mode, unsigned int); + extern rtx get_reg_known_value (unsigned int); + extern bool get_reg_known_equiv_p (unsigned int); + extern rtx get_reg_base_value (unsigned int); +@@ -4353,14 +4369,11 @@ extern tree GTY(()) global_regs_decl[FIRST_PSEUDO_REGISTER]; + Available only for functions that has been already assembled. */ + + struct GTY(()) cgraph_rtl_info { +- unsigned int preferred_incoming_stack_boundary; ++ unsigned int preferred_incoming_stack_boundary; + +- /* Call unsaved hard registers really used by the corresponding +- function (including ones used by functions called by the +- function). */ ++ /* Which registers the function clobbers, either directly or by ++ calling another function. */ + HARD_REG_SET function_used_regs; +- /* Set if function_used_regs is valid. 
*/ +- unsigned function_used_regs_valid: 1; + }; + + /* If loads from memories of mode MODE always sign or zero extend, +diff --git a/gcc/rtlanal.c b/gcc/rtlanal.c +index 01af063a2..553d71c1c 100644 +--- a/gcc/rtlanal.c ++++ b/gcc/rtlanal.c +@@ -823,6 +823,24 @@ get_call_rtx_from (rtx x) + return x; + return NULL_RTX; + } ++ ++/* Get the declaration of the function called by INSN. */ ++ ++tree ++get_call_fndecl (const rtx_insn *insn) ++{ ++ rtx note, datum; ++ ++ note = find_reg_note (insn, REG_CALL_DECL, NULL_RTX); ++ if (note == NULL_RTX) ++ return NULL_TREE; ++ ++ datum = XEXP (note, 0); ++ if (datum != NULL_RTX) ++ return SYMBOL_REF_DECL (datum); ++ ++ return NULL_TREE; ++} + + /* Return the value of the integer term in X, if one is apparent; + otherwise return 0. +@@ -1198,10 +1216,6 @@ reg_referenced_p (const_rtx x, const_rtx body) + return 1; + return 0; + +- case CLOBBER_HIGH: +- gcc_assert (REG_P (XEXP (body, 0))); +- return 0; +- + case COND_EXEC: + if (reg_overlap_mentioned_p (x, COND_EXEC_TEST (body))) + return 1; +@@ -1424,11 +1438,7 @@ set_of_1 (rtx x, const_rtx pat, void *data1) + { + struct set_of_data *const data = (struct set_of_data *) (data1); + if (rtx_equal_p (x, data->pat) +- || (GET_CODE (pat) == CLOBBER_HIGH +- && REGNO(data->pat) == REGNO(XEXP (pat, 0)) +- && reg_is_clobbered_by_clobber_high (data->pat, XEXP (pat, 0))) +- || (GET_CODE (pat) != CLOBBER_HIGH && !MEM_P (x) +- && reg_overlap_mentioned_p (data->pat, x))) ++ || (!MEM_P (x) && reg_overlap_mentioned_p (data->pat, x))) + data->found = pat; + } + +@@ -1440,7 +1450,7 @@ set_of (const_rtx pat, const_rtx insn) + struct set_of_data data; + data.found = NULL_RTX; + data.pat = pat; +- note_stores (INSN_P (insn) ? PATTERN (insn) : insn, set_of_1, &data); ++ note_pattern_stores (INSN_P (insn) ? PATTERN (insn) : insn, set_of_1, &data); + return data.found; + } + +@@ -1476,15 +1486,9 @@ find_all_hard_reg_sets (const rtx_insn *insn, HARD_REG_SET *pset, bool implicit) + rtx link; + + CLEAR_HARD_REG_SET (*pset); +- note_stores (PATTERN (insn), record_hard_reg_sets, pset); +- if (CALL_P (insn)) +- { +- if (implicit) +- IOR_HARD_REG_SET (*pset, call_used_reg_set); +- +- for (link = CALL_INSN_FUNCTION_USAGE (insn); link; link = XEXP (link, 1)) +- record_hard_reg_sets (XEXP (link, 0), NULL, pset); +- } ++ note_stores (insn, record_hard_reg_sets, pset); ++ if (CALL_P (insn) && implicit) ++ *pset |= call_used_or_fixed_regs; + for (link = REG_NOTES (insn); link; link = XEXP (link, 1)) + if (REG_NOTE_KIND (link) == REG_INC) + record_hard_reg_sets (XEXP (link, 0), NULL, pset); +@@ -1517,7 +1521,6 @@ single_set_2 (const rtx_insn *insn, const_rtx pat) + { + case USE: + case CLOBBER: +- case CLOBBER_HIGH: + break; + + case SET: +@@ -1671,9 +1674,7 @@ noop_move_p (const rtx_insn *insn) + { + rtx tem = XVECEXP (pat, 0, i); + +- if (GET_CODE (tem) == USE +- || GET_CODE (tem) == CLOBBER +- || GET_CODE (tem) == CLOBBER_HIGH) ++ if (GET_CODE (tem) == USE || GET_CODE (tem) == CLOBBER) + continue; + + if (GET_CODE (tem) != SET || ! set_noop_p (tem)) +@@ -1899,16 +1900,15 @@ reg_overlap_mentioned_p (const_rtx x, const_rtx in) + the SUBREG will be passed. 
*/ + + void +-note_stores (const_rtx x, void (*fun) (rtx, const_rtx, void *), void *data) ++note_pattern_stores (const_rtx x, ++ void (*fun) (rtx, const_rtx, void *), void *data) + { + int i; + + if (GET_CODE (x) == COND_EXEC) + x = COND_EXEC_CODE (x); + +- if (GET_CODE (x) == SET +- || GET_CODE (x) == CLOBBER +- || GET_CODE (x) == CLOBBER_HIGH) ++ if (GET_CODE (x) == SET || GET_CODE (x) == CLOBBER) + { + rtx dest = SET_DEST (x); + +@@ -1933,7 +1933,22 @@ note_stores (const_rtx x, void (*fun) (rtx, const_rtx, void *), void *data) + + else if (GET_CODE (x) == PARALLEL) + for (i = XVECLEN (x, 0) - 1; i >= 0; i--) +- note_stores (XVECEXP (x, 0, i), fun, data); ++ note_pattern_stores (XVECEXP (x, 0, i), fun, data); ++} ++ ++/* Same, but for an instruction. If the instruction is a call, include ++ any CLOBBERs in its CALL_INSN_FUNCTION_USAGE. */ ++ ++void ++note_stores (const rtx_insn *insn, ++ void (*fun) (rtx, const_rtx, void *), void *data) ++{ ++ if (CALL_P (insn)) ++ for (rtx link = CALL_INSN_FUNCTION_USAGE (insn); ++ link; link = XEXP (link, 1)) ++ if (GET_CODE (XEXP (link, 0)) == CLOBBER) ++ note_pattern_stores (XEXP (link, 0), fun, data); ++ note_pattern_stores (PATTERN (insn), fun, data); + } + + /* Like notes_stores, but call FUN for each expression that is being +@@ -3611,23 +3626,31 @@ loc_mentioned_in_p (rtx *loc, const_rtx in) + return 0; + } + +-/* Helper function for subreg_lsb. Given a subreg's OUTER_MODE, INNER_MODE, +- and SUBREG_BYTE, return the bit offset where the subreg begins +- (counting from the least significant bit of the operand). */ ++/* Reinterpret a subreg as a bit extraction from an integer and return ++ the position of the least significant bit of the extracted value. ++ In other words, if the extraction were performed as a shift right ++ and mask, return the number of bits to shift right. ++ ++ The outer value of the subreg has OUTER_BYTES bytes and starts at ++ byte offset SUBREG_BYTE within an inner value of INNER_BYTES bytes. */ + + poly_uint64 +-subreg_lsb_1 (machine_mode outer_mode, +- machine_mode inner_mode, +- poly_uint64 subreg_byte) ++subreg_size_lsb (poly_uint64 outer_bytes, ++ poly_uint64 inner_bytes, ++ poly_uint64 subreg_byte) + { + poly_uint64 subreg_end, trailing_bytes, byte_pos; + + /* A paradoxical subreg begins at bit position 0. */ +- if (paradoxical_subreg_p (outer_mode, inner_mode)) +- return 0; ++ gcc_checking_assert (ordered_p (outer_bytes, inner_bytes)); ++ if (maybe_gt (outer_bytes, inner_bytes)) ++ { ++ gcc_checking_assert (known_eq (subreg_byte, 0U)); ++ return 0; ++ } + +- subreg_end = subreg_byte + GET_MODE_SIZE (outer_mode); +- trailing_bytes = GET_MODE_SIZE (inner_mode) - subreg_end; ++ subreg_end = subreg_byte + outer_bytes; ++ trailing_bytes = inner_bytes - subreg_end; + if (WORDS_BIG_ENDIAN && BYTES_BIG_ENDIAN) + byte_pos = trailing_bytes; + else if (!WORDS_BIG_ENDIAN && !BYTES_BIG_ENDIAN) +@@ -4123,7 +4146,7 @@ find_first_parameter_load (rtx_insn *call_insn, rtx_insn *boundary) + if (INSN_P (before)) + { + int nregs_old = parm.nregs; +- note_stores (PATTERN (before), parms_set, &parm); ++ note_stores (before, parms_set, &parm); + /* If we found something that did not set a parameter reg, + we're done. Do not keep going, as that might result + in hoisting an insn before the setting of a pseudo +@@ -6601,32 +6624,3 @@ tls_referenced_p (const_rtx x) + return true; + return false; + } +- +-/* Return true if reg REGNO with mode REG_MODE would be clobbered by the +- clobber_high operand in CLOBBER_HIGH_OP. 
*/ +- +-bool +-reg_is_clobbered_by_clobber_high (unsigned int regno, machine_mode reg_mode, +- const_rtx clobber_high_op) +-{ +- unsigned int clobber_regno = REGNO (clobber_high_op); +- machine_mode clobber_mode = GET_MODE (clobber_high_op); +- unsigned char regno_nregs = hard_regno_nregs (regno, reg_mode); +- +- /* Clobber high should always span exactly one register. */ +- gcc_assert (REG_NREGS (clobber_high_op) == 1); +- +- /* Clobber high needs to match with one of the registers in X. */ +- if (clobber_regno < regno || clobber_regno >= regno + regno_nregs) +- return false; +- +- gcc_assert (reg_mode != BLKmode && clobber_mode != BLKmode); +- +- if (reg_mode == VOIDmode) +- return clobber_mode != VOIDmode; +- +- /* Clobber high will clobber if its size might be greater than the size of +- register regno. */ +- return maybe_gt (exact_div (GET_MODE_SIZE (reg_mode), regno_nregs), +- GET_MODE_SIZE (clobber_mode)); +-} +diff --git a/gcc/rtx-vector-builder.h b/gcc/rtx-vector-builder.h +index d5950e2b8..08b55dd36 100644 +--- a/gcc/rtx-vector-builder.h ++++ b/gcc/rtx-vector-builder.h +@@ -24,10 +24,11 @@ along with GCC; see the file COPYING3. If not see + + /* This class is used to build VECTOR_CSTs from a sequence of elements. + See vector_builder for more details. */ +-class rtx_vector_builder : public vector_builder ++class rtx_vector_builder : public vector_builder + { +- typedef vector_builder parent; +- friend class vector_builder; ++ typedef vector_builder parent; ++ friend class vector_builder; + + public: + rtx_vector_builder () : m_mode (VOIDmode) {} +@@ -48,6 +49,15 @@ private: + bool can_elide_p (rtx) const { return true; } + void note_representative (rtx *, rtx) {} + ++ static poly_uint64 shape_nelts (machine_mode mode) ++ { return GET_MODE_NUNITS (mode); } ++ static poly_uint64 nelts_of (const_rtx x) ++ { return CONST_VECTOR_NUNITS (x); } ++ static unsigned int npatterns_of (const_rtx x) ++ { return CONST_VECTOR_NPATTERNS (x); } ++ static unsigned int nelts_per_pattern_of (const_rtx x) ++ { return CONST_VECTOR_NELTS_PER_PATTERN (x); } ++ + rtx find_cached_value (); + + machine_mode m_mode; +diff --git a/gcc/sched-deps.c b/gcc/sched-deps.c +index 28b9d38ab..fe447d16a 100644 +--- a/gcc/sched-deps.c ++++ b/gcc/sched-deps.c +@@ -38,6 +38,7 @@ along with GCC; see the file COPYING3. If not see + #include "sched-int.h" + #include "params.h" + #include "cselib.h" ++#include "function-abi.h" + + #ifdef INSN_SCHEDULING + +@@ -2203,9 +2204,9 @@ init_insn_reg_pressure_info (rtx_insn *insn) + reg_pressure_info[cl].change = 0; + } + +- note_stores (PATTERN (insn), mark_insn_reg_clobber, insn); ++ note_stores (insn, mark_insn_reg_clobber, insn); + +- note_stores (PATTERN (insn), mark_insn_reg_store, insn); ++ note_stores (insn, mark_insn_reg_store, insn); + + if (AUTO_INC_DEC) + for (link = REG_NOTES (insn); link; link = XEXP (link, 1)) +@@ -2319,13 +2320,6 @@ sched_analyze_reg (struct deps_desc *deps, int regno, machine_mode mode, + while (--i >= 0) + note_reg_use (regno + i); + } +- else if (ref == CLOBBER_HIGH) +- { +- gcc_assert (i == 1); +- /* We don't know the current state of the register, so have to treat +- the clobber high as a full clobber. */ +- note_reg_clobber (regno); +- } + else + { + while (--i >= 0) +@@ -2349,8 +2343,6 @@ sched_analyze_reg (struct deps_desc *deps, int regno, machine_mode mode, + else if (ref == USE) + note_reg_use (regno); + else +- /* For CLOBBER_HIGH, we don't know the current state of the register, +- so have to treat it as a full clobber. 
*/ + note_reg_clobber (regno); + + /* Pseudos that are REG_EQUIV to something may be replaced +@@ -2885,7 +2877,7 @@ get_implicit_reg_pending_clobbers (HARD_REG_SET *temp, rtx_insn *insn) + preprocess_constraints (insn); + alternative_mask preferred = get_preferred_alternatives (insn); + ira_implicitly_set_insn_hard_regs (temp, preferred); +- AND_COMPL_HARD_REG_SET (*temp, ira_no_alloc_regs); ++ *temp &= ~ira_no_alloc_regs; + } + + /* Analyze an INSN with pattern X to find all dependencies. */ +@@ -2901,7 +2893,7 @@ sched_analyze_insn (struct deps_desc *deps, rtx x, rtx_insn *insn) + { + HARD_REG_SET temp; + get_implicit_reg_pending_clobbers (&temp, insn); +- IOR_HARD_REG_SET (implicit_reg_pending_clobbers, temp); ++ implicit_reg_pending_clobbers |= temp; + } + + can_start_lhs_rhs_p = (NONJUMP_INSN_P (insn) +@@ -2973,7 +2965,7 @@ sched_analyze_insn (struct deps_desc *deps, rtx x, rtx_insn *insn) + sub = COND_EXEC_CODE (sub); + code = GET_CODE (sub); + } +- else if (code == SET || code == CLOBBER || code == CLOBBER_HIGH) ++ else if (code == SET || code == CLOBBER) + sched_analyze_1 (deps, sub, insn); + else + sched_analyze_2 (deps, sub, insn); +@@ -2989,10 +2981,6 @@ sched_analyze_insn (struct deps_desc *deps, rtx x, rtx_insn *insn) + { + if (GET_CODE (XEXP (link, 0)) == CLOBBER) + sched_analyze_1 (deps, XEXP (link, 0), insn); +- else if (GET_CODE (XEXP (link, 0)) == CLOBBER_HIGH) +- /* We could support CLOBBER_HIGH and treat it in the same way as +- HARD_REGNO_CALL_PART_CLOBBERED, but no port needs that yet. */ +- gcc_unreachable (); + else if (GET_CODE (XEXP (link, 0)) != SET) + sched_analyze_2 (deps, XEXP (link, 0), insn); + } +@@ -3332,10 +3320,9 @@ sched_analyze_insn (struct deps_desc *deps, rtx x, rtx_insn *insn) + IOR_REG_SET (&deps->reg_last_in_use, reg_pending_uses); + IOR_REG_SET (&deps->reg_last_in_use, reg_pending_clobbers); + IOR_REG_SET (&deps->reg_last_in_use, reg_pending_sets); +- for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) +- if (TEST_HARD_REG_BIT (implicit_reg_pending_uses, i) +- || TEST_HARD_REG_BIT (implicit_reg_pending_clobbers, i)) +- SET_REGNO_REG_SET (&deps->reg_last_in_use, i); ++ IOR_REG_SET_HRS (&deps->reg_last_in_use, ++ implicit_reg_pending_uses ++ | implicit_reg_pending_clobbers); + + /* Set up the pending barrier found. */ + deps->last_reg_pending_barrier = reg_pending_barrier; +@@ -3724,6 +3711,7 @@ deps_analyze_insn (struct deps_desc *deps, rtx_insn *insn) + } + else + { ++ function_abi callee_abi = insn_callee_abi (insn); + for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) + /* A call may read and modify global register variables. */ + if (global_regs[i]) +@@ -3735,8 +3723,8 @@ deps_analyze_insn (struct deps_desc *deps, rtx_insn *insn) + Since we only have a choice between 'might be clobbered' + and 'definitely not clobbered', we must include all + partly call-clobbered registers here. 
*/ +- else if (targetm.hard_regno_call_part_clobbered (insn, i, +- reg_raw_mode[i]) ++ else if (targetm.hard_regno_call_part_clobbered ++ (callee_abi.id (), i, reg_raw_mode[i]) + || TEST_HARD_REG_BIT (regs_invalidated_by_call, i)) + SET_REGNO_REG_SET (reg_pending_clobbers, i); + /* We don't know what set of fixed registers might be used +diff --git a/gcc/sched-rgn.c b/gcc/sched-rgn.c +index 83688b3c9..c5ee33bf5 100644 +--- a/gcc/sched-rgn.c ++++ b/gcc/sched-rgn.c +@@ -2409,7 +2409,7 @@ static bool + sets_likely_spilled (rtx pat) + { + bool ret = false; +- note_stores (pat, sets_likely_spilled_1, &ret); ++ note_pattern_stores (pat, sets_likely_spilled_1, &ret); + return ret; + } + +diff --git a/gcc/sel-sched-ir.c b/gcc/sel-sched-ir.c +index 6dec1beaa..f8f1d8238 100644 +--- a/gcc/sel-sched-ir.c ++++ b/gcc/sel-sched-ir.c +@@ -2661,12 +2661,9 @@ setup_id_implicit_regs (idata_t id, insn_t insn) + return; + + HARD_REG_SET temp; +- unsigned regno; +- hard_reg_set_iterator hrsi; + + get_implicit_reg_pending_clobbers (&temp, insn); +- EXECUTE_IF_SET_IN_HARD_REG_SET (temp, 0, regno, hrsi) +- SET_REGNO_REG_SET (IDATA_REG_SETS (id), regno); ++ IOR_REG_SET_HRS (IDATA_REG_SETS (id), temp); + } + + /* Setup register sets describing INSN in ID. */ +diff --git a/gcc/sel-sched.c b/gcc/sel-sched.c +index f127ff745..bf370b5a5 100644 +--- a/gcc/sel-sched.c ++++ b/gcc/sel-sched.c +@@ -1102,7 +1102,7 @@ init_regs_for_mode (machine_mode mode) + if (i >= 0) + continue; + +- if (targetm.hard_regno_call_part_clobbered (NULL, cur_reg, mode)) ++ if (targetm.hard_regno_call_part_clobbered (0, cur_reg, mode)) + SET_HARD_REG_BIT (sel_hrd.regs_for_call_clobbered[mode], + cur_reg); + +@@ -1123,7 +1123,7 @@ init_hard_regs_data (void) + + CLEAR_HARD_REG_SET (sel_hrd.regs_ever_used); + for (cur_reg = 0; cur_reg < FIRST_PSEUDO_REGISTER; cur_reg++) +- if (df_regs_ever_live_p (cur_reg) || call_used_regs[cur_reg]) ++ if (df_regs_ever_live_p (cur_reg) || call_used_or_fixed_reg_p (cur_reg)) + SET_HARD_REG_BIT (sel_hrd.regs_ever_used, cur_reg); + + /* Initialize registers that are valid based on mode when this is +@@ -1221,15 +1221,13 @@ mark_unavailable_hard_regs (def_t def, struct reg_rename *reg_rename_p, + The HARD_REGNO_RENAME_OK covers other cases in condition below. */ + if (IN_RANGE (REGNO (orig_dest), FIRST_STACK_REG, LAST_STACK_REG) + && REGNO_REG_SET_P (used_regs, FIRST_STACK_REG)) +- IOR_HARD_REG_SET (reg_rename_p->unavailable_hard_regs, +- sel_hrd.stack_regs); ++ reg_rename_p->unavailable_hard_regs |= sel_hrd.stack_regs; + #endif + +- /* If there's a call on this path, make regs from call_used_reg_set ++ /* If there's a call on this path, make regs from call_used_or_fixed_regs + unavailable. */ + if (def->crosses_call) +- IOR_HARD_REG_SET (reg_rename_p->unavailable_hard_regs, +- call_used_reg_set); ++ reg_rename_p->unavailable_hard_regs |= call_used_or_fixed_regs; + + /* Stop here before reload: we need FRAME_REGS, STACK_REGS, and crosses_call, + but not register classes. */ +@@ -1238,22 +1236,20 @@ mark_unavailable_hard_regs (def_t def, struct reg_rename *reg_rename_p, + + /* Leave regs as 'available' only from the current + register class. */ +- COPY_HARD_REG_SET (reg_rename_p->available_for_renaming, +- reg_class_contents[cl]); ++ reg_rename_p->available_for_renaming = reg_class_contents[cl]; + + mode = GET_MODE (orig_dest); + + /* Leave only registers available for this mode. 
*/ + if (!sel_hrd.regs_for_mode_ok[mode]) + init_regs_for_mode (mode); +- AND_HARD_REG_SET (reg_rename_p->available_for_renaming, +- sel_hrd.regs_for_mode[mode]); ++ reg_rename_p->available_for_renaming &= sel_hrd.regs_for_mode[mode]; + + /* Exclude registers that are partially call clobbered. */ + if (def->crosses_call +- && !targetm.hard_regno_call_part_clobbered (NULL, regno, mode)) +- AND_COMPL_HARD_REG_SET (reg_rename_p->available_for_renaming, +- sel_hrd.regs_for_call_clobbered[mode]); ++ && !targetm.hard_regno_call_part_clobbered (0, regno, mode)) ++ reg_rename_p->available_for_renaming ++ &= ~sel_hrd.regs_for_call_clobbered[mode]; + + /* Leave only those that are ok to rename. */ + EXECUTE_IF_SET_IN_HARD_REG_SET (reg_rename_p->available_for_renaming, +@@ -1274,8 +1270,7 @@ mark_unavailable_hard_regs (def_t def, struct reg_rename *reg_rename_p, + cur_reg); + } + +- AND_COMPL_HARD_REG_SET (reg_rename_p->available_for_renaming, +- reg_rename_p->unavailable_hard_regs); ++ reg_rename_p->available_for_renaming &= ~reg_rename_p->unavailable_hard_regs; + + /* Regno is always ok from the renaming part of view, but it really + could be in *unavailable_hard_regs already, so set it here instead +@@ -1686,8 +1681,7 @@ find_best_reg_for_expr (expr_t expr, blist_t bnds, bool *is_orig_reg_p) + + /* Join hard registers unavailable due to register class + restrictions and live range intersection. */ +- IOR_HARD_REG_SET (hard_regs_used, +- reg_rename_data.unavailable_hard_regs); ++ hard_regs_used |= reg_rename_data.unavailable_hard_regs; + + best_reg = choose_best_reg (hard_regs_used, ®_rename_data, + original_insns, is_orig_reg_p); +@@ -2110,7 +2104,7 @@ implicit_clobber_conflict_p (insn_t through_insn, expr_t expr) + preprocess_constraints (insn); + alternative_mask prefrred = get_preferred_alternatives (insn); + ira_implicitly_set_insn_hard_regs (&temp, prefrred); +- AND_COMPL_HARD_REG_SET (temp, ira_no_alloc_regs); ++ temp &= ~ira_no_alloc_regs; + + /* If any implicit clobber registers intersect with regular ones in + through_insn, we have a dependency and thus bail out. 
*/ +diff --git a/gcc/shrink-wrap.c b/gcc/shrink-wrap.c +index 57124db92..018696637 100644 +--- a/gcc/shrink-wrap.c ++++ b/gcc/shrink-wrap.c +@@ -76,7 +76,7 @@ requires_stack_frame_p (rtx_insn *insn, HARD_REG_SET prologue_used, + } + if (hard_reg_set_intersect_p (hardregs, prologue_used)) + return true; +- AND_COMPL_HARD_REG_SET (hardregs, call_used_reg_set); ++ hardregs &= ~call_used_or_fixed_regs; + for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) + if (TEST_HARD_REG_BIT (hardregs, regno) + && df_regs_ever_live_p (regno)) +@@ -151,8 +151,8 @@ live_edge_for_reg (basic_block bb, int regno, int end_regno) + + static bool + move_insn_for_shrink_wrap (basic_block bb, rtx_insn *insn, +- const HARD_REG_SET uses, +- const HARD_REG_SET defs, ++ const_hard_reg_set uses, ++ const_hard_reg_set defs, + bool *split_p, + struct dead_debug_local *debug) + { +@@ -687,9 +687,9 @@ try_shrink_wrapping (edge *entry_edge, rtx_insn *prologue_seq) + HARD_REG_SET this_used; + CLEAR_HARD_REG_SET (this_used); + note_uses (&PATTERN (insn), record_hard_reg_uses, &this_used); +- AND_COMPL_HARD_REG_SET (this_used, prologue_clobbered); +- IOR_HARD_REG_SET (prologue_used, this_used); +- note_stores (PATTERN (insn), record_hard_reg_sets, &prologue_clobbered); ++ this_used &= ~prologue_clobbered; ++ prologue_used |= this_used; ++ note_stores (insn, record_hard_reg_sets, &prologue_clobbered); + } + CLEAR_HARD_REG_BIT (prologue_clobbered, STACK_POINTER_REGNUM); + if (frame_pointer_needed) +diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c +index bdbd1b98e..612d21b72 100644 +--- a/gcc/simplify-rtx.c ++++ b/gcc/simplify-rtx.c +@@ -35,6 +35,7 @@ along with GCC; see the file COPYING3. If not see + #include "flags.h" + #include "selftest.h" + #include "selftest-rtl.h" ++#include "rtx-vector-builder.h" + + /* Simplification and canonicalization of RTL. */ + +@@ -45,7 +46,6 @@ along with GCC; see the file COPYING3. If not see + #define HWI_SIGN_EXTEND(low) \ + ((((HOST_WIDE_INT) low) < 0) ? HOST_WIDE_INT_M1 : HOST_WIDE_INT_0) + +-static rtx neg_const_int (machine_mode, const_rtx); + static bool plus_minus_operand_p (const_rtx); + static rtx simplify_plus_minus (enum rtx_code, machine_mode, rtx, rtx); + static rtx simplify_associative_operation (enum rtx_code, machine_mode, +@@ -56,17 +56,12 @@ static rtx simplify_unary_operation_1 (enum rtx_code, machine_mode, rtx); + static rtx simplify_binary_operation_1 (enum rtx_code, machine_mode, + rtx, rtx, rtx, rtx); + +-/* Negate a CONST_INT rtx. */ ++/* Negate I, which satisfies poly_int_rtx_p. MODE is the mode of I. 
*/ ++ + static rtx +-neg_const_int (machine_mode mode, const_rtx i) ++neg_poly_int_rtx (machine_mode mode, const_rtx i) + { +- unsigned HOST_WIDE_INT val = -UINTVAL (i); +- +- if (!HWI_COMPUTABLE_MODE_P (mode) +- && val == UINTVAL (i)) +- return simplify_const_unary_operation (NEG, mode, CONST_CAST_RTX (i), +- mode); +- return gen_int_mode (val, mode); ++ return immed_wide_int_const (-wi::to_poly_wide (i, mode), mode); + } + + /* Test whether expression, X, is an immediate constant that represents +@@ -1504,12 +1499,12 @@ simplify_unary_operation_1 (enum rtx_code code, machine_mode mode, rtx op) + && CONST_INT_P (XEXP (op, 1)) + && XEXP (XEXP (op, 0), 1) == XEXP (op, 1) + && (op_mode = as_a (GET_MODE (op)), +- GET_MODE_BITSIZE (op_mode) > INTVAL (XEXP (op, 1)))) ++ GET_MODE_PRECISION (op_mode) > INTVAL (XEXP (op, 1)))) + { + scalar_int_mode tmode; +- gcc_assert (GET_MODE_BITSIZE (int_mode) +- > GET_MODE_BITSIZE (op_mode)); +- if (int_mode_for_size (GET_MODE_BITSIZE (op_mode) ++ gcc_assert (GET_MODE_PRECISION (int_mode) ++ > GET_MODE_PRECISION (op_mode)); ++ if (int_mode_for_size (GET_MODE_PRECISION (op_mode) + - INTVAL (XEXP (op, 1)), 1).exists (&tmode)) + { + rtx inner = +@@ -1735,45 +1730,42 @@ simplify_const_unary_operation (enum rtx_code code, machine_mode mode, + } + if (CONST_SCALAR_INT_P (op) || CONST_DOUBLE_AS_FLOAT_P (op)) + return gen_const_vec_duplicate (mode, op); +- unsigned int n_elts; + if (GET_CODE (op) == CONST_VECTOR +- && GET_MODE_NUNITS (mode).is_constant (&n_elts)) +- { +- /* This must be constant if we're duplicating it to a constant +- number of elements. */ +- unsigned int in_n_elts = CONST_VECTOR_NUNITS (op).to_constant (); +- gcc_assert (in_n_elts < n_elts); +- gcc_assert ((n_elts % in_n_elts) == 0); +- rtvec v = rtvec_alloc (n_elts); +- for (unsigned i = 0; i < n_elts; i++) +- RTVEC_ELT (v, i) = CONST_VECTOR_ELT (op, i % in_n_elts); +- return gen_rtx_CONST_VECTOR (mode, v); ++ && (CONST_VECTOR_DUPLICATE_P (op) ++ || CONST_VECTOR_NUNITS (op).is_constant ())) ++ { ++ unsigned int npatterns = (CONST_VECTOR_DUPLICATE_P (op) ++ ? 
CONST_VECTOR_NPATTERNS (op) ++ : CONST_VECTOR_NUNITS (op).to_constant ()); ++ gcc_assert (multiple_p (GET_MODE_NUNITS (mode), npatterns)); ++ rtx_vector_builder builder (mode, npatterns, 1); ++ for (unsigned i = 0; i < npatterns; i++) ++ builder.quick_push (CONST_VECTOR_ELT (op, i)); ++ return builder.build (); + } + } + +- if (VECTOR_MODE_P (mode) && GET_CODE (op) == CONST_VECTOR) ++ if (VECTOR_MODE_P (mode) ++ && GET_CODE (op) == CONST_VECTOR ++ && known_eq (GET_MODE_NUNITS (mode), CONST_VECTOR_NUNITS (op))) + { +- unsigned int n_elts; +- if (!CONST_VECTOR_NUNITS (op).is_constant (&n_elts)) +- return NULL_RTX; ++ gcc_assert (GET_MODE (op) == op_mode); + +- machine_mode opmode = GET_MODE (op); +- gcc_assert (known_eq (GET_MODE_NUNITS (mode), n_elts)); +- gcc_assert (known_eq (GET_MODE_NUNITS (opmode), n_elts)); +- +- rtvec v = rtvec_alloc (n_elts); +- unsigned int i; ++ rtx_vector_builder builder; ++ if (!builder.new_unary_operation (mode, op, false)) ++ return 0; + +- for (i = 0; i < n_elts; i++) ++ unsigned int count = builder.encoded_nelts (); ++ for (unsigned int i = 0; i < count; i++) + { + rtx x = simplify_unary_operation (code, GET_MODE_INNER (mode), + CONST_VECTOR_ELT (op, i), +- GET_MODE_INNER (opmode)); ++ GET_MODE_INNER (op_mode)); + if (!x || !valid_for_const_vector_p (mode, x)) + return 0; +- RTVEC_ELT (v, i) = x; ++ builder.quick_push (x); + } +- return gen_rtx_CONST_VECTOR (mode, v); ++ return builder.build (); + } + + /* The order of these tests is critical so that, for example, we don't +@@ -2549,10 +2541,10 @@ simplify_binary_operation_1 (enum rtx_code code, machine_mode mode, + return plus_constant (mode, op0, trunc_int_for_mode (-offset, mode)); + + /* Don't let a relocatable value get a negative coeff. */ +- if (CONST_INT_P (op1) && GET_MODE (op0) != VOIDmode) ++ if (poly_int_rtx_p (op1) && GET_MODE (op0) != VOIDmode) + return simplify_gen_binary (PLUS, mode, + op0, +- neg_const_int (mode, op1)); ++ neg_poly_int_rtx (mode, op1)); + + /* (x - (x & y)) -> (x & ~y) */ + if (INTEGRAL_MODE_P (mode) && GET_CODE (op1) == AND) +@@ -4071,6 +4063,27 @@ simplify_binary_operation_1 (enum rtx_code code, machine_mode mode, + return 0; + } + ++/* Return true if binary operation OP distributes over addition in operand ++ OPNO, with the other operand being held constant. OPNO counts from 1. */ ++ ++static bool ++distributes_over_addition_p (rtx_code op, int opno) ++{ ++ switch (op) ++ { ++ case PLUS: ++ case MINUS: ++ case MULT: ++ return true; ++ ++ case ASHIFT: ++ return opno == 1; ++ ++ default: ++ return false; ++ } ++} ++ + rtx + simplify_const_binary_operation (enum rtx_code code, machine_mode mode, + rtx op0, rtx op1) +@@ -4080,26 +4093,45 @@ simplify_const_binary_operation (enum rtx_code code, machine_mode mode, + && GET_CODE (op0) == CONST_VECTOR + && GET_CODE (op1) == CONST_VECTOR) + { +- unsigned int n_elts; +- if (!CONST_VECTOR_NUNITS (op0).is_constant (&n_elts)) +- return NULL_RTX; +- +- gcc_assert (known_eq (n_elts, CONST_VECTOR_NUNITS (op1))); +- gcc_assert (known_eq (n_elts, GET_MODE_NUNITS (mode))); +- rtvec v = rtvec_alloc (n_elts); +- unsigned int i; ++ bool step_ok_p; ++ if (CONST_VECTOR_STEPPED_P (op0) ++ && CONST_VECTOR_STEPPED_P (op1)) ++ /* We can operate directly on the encoding if: ++ ++ a3 - a2 == a2 - a1 && b3 - b2 == b2 - b1 ++ implies ++ (a3 op b3) - (a2 op b2) == (a2 op b2) - (a1 op b1) ++ ++ Addition and subtraction are the supported operators ++ for which this is true. 
*/ ++ step_ok_p = (code == PLUS || code == MINUS); ++ else if (CONST_VECTOR_STEPPED_P (op0)) ++ /* We can operate directly on stepped encodings if: ++ ++ a3 - a2 == a2 - a1 ++ implies: ++ (a3 op c) - (a2 op c) == (a2 op c) - (a1 op c) ++ ++ which is true if (x -> x op c) distributes over addition. */ ++ step_ok_p = distributes_over_addition_p (code, 1); ++ else ++ /* Similarly in reverse. */ ++ step_ok_p = distributes_over_addition_p (code, 2); ++ rtx_vector_builder builder; ++ if (!builder.new_binary_operation (mode, op0, op1, step_ok_p)) ++ return 0; + +- for (i = 0; i < n_elts; i++) ++ unsigned int count = builder.encoded_nelts (); ++ for (unsigned int i = 0; i < count; i++) + { + rtx x = simplify_binary_operation (code, GET_MODE_INNER (mode), + CONST_VECTOR_ELT (op0, i), + CONST_VECTOR_ELT (op1, i)); + if (!x || !valid_for_const_vector_p (mode, x)) + return 0; +- RTVEC_ELT (v, i) = x; ++ builder.quick_push (x); + } +- +- return gen_rtx_CONST_VECTOR (mode, v); ++ return builder.build (); + } + + if (VECTOR_MODE_P (mode) +@@ -4593,11 +4625,12 @@ simplify_plus_minus (enum rtx_code code, machine_mode mode, rtx op0, + } + break; + +- case CONST_INT: ++ CASE_CONST_SCALAR_INT: ++ case CONST_POLY_INT: + n_constants++; + if (this_neg) + { +- ops[i].op = neg_const_int (mode, this_op); ++ ops[i].op = neg_poly_int_rtx (mode, this_op); + ops[i].neg = 0; + changed = 1; + canonicalized = 1; +@@ -4722,8 +4755,8 @@ simplify_plus_minus (enum rtx_code code, machine_mode mode, rtx op0, + lneg &= rneg; + if (GET_CODE (tem) == NEG) + tem = XEXP (tem, 0), lneg = !lneg; +- if (CONST_INT_P (tem) && lneg) +- tem = neg_const_int (mode, tem), lneg = 0; ++ if (poly_int_rtx_p (tem) && lneg) ++ tem = neg_poly_int_rtx (mode, tem), lneg = 0; + + ops[i].op = tem; + ops[i].neg = lneg; +@@ -4782,12 +4815,12 @@ simplify_plus_minus (enum rtx_code code, machine_mode mode, rtx op0, + in the array and that any other constant will be next-to-last. */ + + if (n_ops > 1 +- && CONST_INT_P (ops[n_ops - 1].op) ++ && poly_int_rtx_p (ops[n_ops - 1].op) + && CONSTANT_P (ops[n_ops - 2].op)) + { + rtx value = ops[n_ops - 1].op; + if (ops[n_ops - 1].neg ^ ops[n_ops - 2].neg) +- value = neg_const_int (mode, value); ++ value = neg_poly_int_rtx (mode, value); + if (CONST_INT_P (value)) + { + ops[n_ops - 2].op = plus_constant (mode, ops[n_ops - 2].op, +@@ -6104,342 +6137,466 @@ simplify_ternary_operation (enum rtx_code code, machine_mode mode, + return 0; + } + +-/* Evaluate a SUBREG of a CONST_INT or CONST_WIDE_INT or CONST_DOUBLE +- or CONST_FIXED or CONST_VECTOR, returning another CONST_INT or +- CONST_WIDE_INT or CONST_DOUBLE or CONST_FIXED or CONST_VECTOR. ++/* Try to calculate NUM_BYTES bytes of the target memory image of X, ++ starting at byte FIRST_BYTE. Return true on success and add the ++ bytes to BYTES, such that each byte has BITS_PER_UNIT bits and such ++ that the bytes follow target memory order. Leave BYTES unmodified ++ on failure. + +- Works by unpacking INNER_BYTES bytes of OP into a collection of 8-bit values +- represented as a little-endian array of 'unsigned char', selecting by BYTE, +- and then repacking them again for OUTERMODE. If OP is a CONST_VECTOR, +- FIRST_ELEM is the number of the first element to extract, otherwise +- FIRST_ELEM is ignored. */ ++ MODE is the mode of X. The caller must reserve NUM_BYTES bytes in ++ BYTES before calling this function. 
*/ + +-static rtx +-simplify_immed_subreg (fixed_size_mode outermode, rtx op, +- machine_mode innermode, unsigned int byte, +- unsigned int first_elem, unsigned int inner_bytes) ++bool ++native_encode_rtx (machine_mode mode, rtx x, vec &bytes, ++ unsigned int first_byte, unsigned int num_bytes) + { +- enum { +- value_bit = 8, +- value_mask = (1 << value_bit) - 1 +- }; +- unsigned char value[MAX_BITSIZE_MODE_ANY_MODE / value_bit]; +- int value_start; +- int i; +- int elem; +- +- int num_elem; +- rtx * elems; +- int elem_bitsize; +- rtx result_s = NULL; +- rtvec result_v = NULL; +- enum mode_class outer_class; +- scalar_mode outer_submode; +- int max_bitsize; ++ /* Check the mode is sensible. */ ++ gcc_assert (GET_MODE (x) == VOIDmode ++ ? is_a (mode) ++ : mode == GET_MODE (x)); + +- /* Some ports misuse CCmode. */ +- if (GET_MODE_CLASS (outermode) == MODE_CC && CONST_INT_P (op)) +- return op; ++ if (GET_CODE (x) == CONST_VECTOR) ++ { ++ /* CONST_VECTOR_ELT follows target memory order, so no shuffling ++ is necessary. The only complication is that MODE_VECTOR_BOOL ++ vectors can have several elements per byte. */ ++ unsigned int elt_bits = vector_element_size (GET_MODE_BITSIZE (mode), ++ GET_MODE_NUNITS (mode)); ++ unsigned int elt = first_byte * BITS_PER_UNIT / elt_bits; ++ if (elt_bits < BITS_PER_UNIT) ++ { ++ /* This is the only case in which elements can be smaller than ++ a byte. */ ++ gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL); ++ for (unsigned int i = 0; i < num_bytes; ++i) ++ { ++ target_unit value = 0; ++ for (unsigned int j = 0; j < BITS_PER_UNIT; j += elt_bits) ++ { ++ value |= (INTVAL (CONST_VECTOR_ELT (x, elt)) & 1) << j; ++ elt += 1; ++ } ++ bytes.quick_push (value); ++ } ++ return true; ++ } + +- /* We have no way to represent a complex constant at the rtl level. */ +- if (COMPLEX_MODE_P (outermode)) +- return NULL_RTX; ++ unsigned int start = bytes.length (); ++ unsigned int elt_bytes = GET_MODE_UNIT_SIZE (mode); ++ /* Make FIRST_BYTE relative to ELT. */ ++ first_byte %= elt_bytes; ++ while (num_bytes > 0) ++ { ++ /* Work out how many bytes we want from element ELT. */ ++ unsigned int chunk_bytes = MIN (num_bytes, elt_bytes - first_byte); ++ if (!native_encode_rtx (GET_MODE_INNER (mode), ++ CONST_VECTOR_ELT (x, elt), bytes, ++ first_byte, chunk_bytes)) ++ { ++ bytes.truncate (start); ++ return false; ++ } ++ elt += 1; ++ first_byte = 0; ++ num_bytes -= chunk_bytes; ++ } ++ return true; ++ } + +- /* We support any size mode. */ +- max_bitsize = MAX (GET_MODE_BITSIZE (outermode), +- inner_bytes * BITS_PER_UNIT); ++ /* All subsequent cases are limited to scalars. */ ++ scalar_mode smode; ++ if (!is_a (mode, &smode)) ++ return false; + +- /* Unpack the value. */ ++ /* Make sure that the region is in range. */ ++ unsigned int end_byte = first_byte + num_bytes; ++ unsigned int mode_bytes = GET_MODE_SIZE (smode); ++ gcc_assert (end_byte <= mode_bytes); + +- if (GET_CODE (op) == CONST_VECTOR) ++ if (CONST_SCALAR_INT_P (x)) + { +- num_elem = CEIL (inner_bytes, GET_MODE_UNIT_SIZE (innermode)); +- elem_bitsize = GET_MODE_UNIT_BITSIZE (innermode); ++ /* The target memory layout is affected by both BYTES_BIG_ENDIAN ++ and WORDS_BIG_ENDIAN. Use the subreg machinery to get the lsb ++ position of each byte. */ ++ rtx_mode_t value (x, smode); ++ wide_int_ref value_wi (value); ++ for (unsigned int byte = first_byte; byte < end_byte; ++byte) ++ { ++ /* Always constant because the inputs are. 
*/ ++ unsigned int lsb ++ = subreg_size_lsb (1, mode_bytes, byte).to_constant (); ++ /* Operate directly on the encoding rather than using ++ wi::extract_uhwi, so that we preserve the sign or zero ++ extension for modes that are not a whole number of bits in ++ size. (Zero extension is only used for the combination of ++ innermode == BImode && STORE_FLAG_VALUE == 1). */ ++ unsigned int elt = lsb / HOST_BITS_PER_WIDE_INT; ++ unsigned int shift = lsb % HOST_BITS_PER_WIDE_INT; ++ unsigned HOST_WIDE_INT uhwi = value_wi.elt (elt); ++ bytes.quick_push (uhwi >> shift); ++ } ++ return true; + } +- else ++ ++ if (CONST_DOUBLE_P (x)) + { +- num_elem = 1; +- elem_bitsize = max_bitsize; ++ /* real_to_target produces an array of integers in target memory order. ++ All integers before the last one have 32 bits; the last one may ++ have 32 bits or fewer, depending on whether the mode bitsize ++ is divisible by 32. Each of these integers is then laid out ++ in target memory as any other integer would be. */ ++ long el32[MAX_BITSIZE_MODE_ANY_MODE / 32]; ++ real_to_target (el32, CONST_DOUBLE_REAL_VALUE (x), smode); ++ ++ /* The (maximum) number of target bytes per element of el32. */ ++ unsigned int bytes_per_el32 = 32 / BITS_PER_UNIT; ++ gcc_assert (bytes_per_el32 != 0); ++ ++ /* Build up the integers in a similar way to the CONST_SCALAR_INT_P ++ handling above. */ ++ for (unsigned int byte = first_byte; byte < end_byte; ++byte) ++ { ++ unsigned int index = byte / bytes_per_el32; ++ unsigned int subbyte = byte % bytes_per_el32; ++ unsigned int int_bytes = MIN (bytes_per_el32, ++ mode_bytes - index * bytes_per_el32); ++ /* Always constant because the inputs are. */ ++ unsigned int lsb ++ = subreg_size_lsb (1, int_bytes, subbyte).to_constant (); ++ bytes.quick_push ((unsigned long) el32[index] >> lsb); ++ } ++ return true; + } +- /* If this asserts, it is too complicated; reducing value_bit may help. */ +- gcc_assert (BITS_PER_UNIT % value_bit == 0); +- /* I don't know how to handle endianness of sub-units. */ +- gcc_assert (elem_bitsize % BITS_PER_UNIT == 0); + +- for (elem = 0; elem < num_elem; elem++) ++ if (GET_CODE (x) == CONST_FIXED) + { +- unsigned char * vp; +- rtx el = (GET_CODE (op) == CONST_VECTOR +- ? CONST_VECTOR_ELT (op, first_elem + elem) +- : op); ++ for (unsigned int byte = first_byte; byte < end_byte; ++byte) ++ { ++ /* Always constant because the inputs are. */ ++ unsigned int lsb ++ = subreg_size_lsb (1, mode_bytes, byte).to_constant (); ++ unsigned HOST_WIDE_INT piece = CONST_FIXED_VALUE_LOW (x); ++ if (lsb >= HOST_BITS_PER_WIDE_INT) ++ { ++ lsb -= HOST_BITS_PER_WIDE_INT; ++ piece = CONST_FIXED_VALUE_HIGH (x); ++ } ++ bytes.quick_push (piece >> lsb); ++ } ++ return true; ++ } + +- /* Vectors are kept in target memory order. (This is probably +- a mistake.) */ +- { +- unsigned byte = (elem * elem_bitsize) / BITS_PER_UNIT; +- unsigned ibyte = (((num_elem - 1 - elem) * elem_bitsize) +- / BITS_PER_UNIT); +- unsigned word_byte = WORDS_BIG_ENDIAN ? ibyte : byte; +- unsigned subword_byte = BYTES_BIG_ENDIAN ? ibyte : byte; +- unsigned bytele = (subword_byte % UNITS_PER_WORD +- + (word_byte / UNITS_PER_WORD) * UNITS_PER_WORD); +- vp = value + (bytele * BITS_PER_UNIT) / value_bit; +- } ++ return false; ++} + +- switch (GET_CODE (el)) +- { +- case CONST_INT: +- for (i = 0; +- i < HOST_BITS_PER_WIDE_INT && i < elem_bitsize; +- i += value_bit) +- *vp++ = INTVAL (el) >> i; +- /* CONST_INTs are always logically sign-extended. */ +- for (; i < elem_bitsize; i += value_bit) +- *vp++ = INTVAL (el) < 0 ? 
-1 : 0; +- break; ++/* Read a vector of mode MODE from the target memory image given by BYTES, ++ starting at byte FIRST_BYTE. The vector is known to be encodable using ++ NPATTERNS interleaved patterns with NELTS_PER_PATTERN elements each, ++ and BYTES is known to have enough bytes to supply NPATTERNS * ++ NELTS_PER_PATTERN vector elements. Each element of BYTES contains ++ BITS_PER_UNIT bits and the bytes are in target memory order. + +- case CONST_WIDE_INT: +- { +- rtx_mode_t val = rtx_mode_t (el, GET_MODE_INNER (innermode)); +- unsigned char extend = wi::sign_mask (val); +- int prec = wi::get_precision (val); +- +- for (i = 0; i < prec && i < elem_bitsize; i += value_bit) +- *vp++ = wi::extract_uhwi (val, i, value_bit); +- for (; i < elem_bitsize; i += value_bit) +- *vp++ = extend; +- } +- break; ++ Return the vector on success, otherwise return NULL_RTX. */ + +- case CONST_DOUBLE: +- if (TARGET_SUPPORTS_WIDE_INT == 0 && GET_MODE (el) == VOIDmode) +- { +- unsigned char extend = 0; +- /* If this triggers, someone should have generated a +- CONST_INT instead. */ +- gcc_assert (elem_bitsize > HOST_BITS_PER_WIDE_INT); +- +- for (i = 0; i < HOST_BITS_PER_WIDE_INT; i += value_bit) +- *vp++ = CONST_DOUBLE_LOW (el) >> i; +- while (i < HOST_BITS_PER_DOUBLE_INT && i < elem_bitsize) +- { +- *vp++ +- = CONST_DOUBLE_HIGH (el) >> (i - HOST_BITS_PER_WIDE_INT); +- i += value_bit; +- } ++rtx ++native_decode_vector_rtx (machine_mode mode, vec bytes, ++ unsigned int first_byte, unsigned int npatterns, ++ unsigned int nelts_per_pattern) ++{ ++ rtx_vector_builder builder (mode, npatterns, nelts_per_pattern); + +- if (CONST_DOUBLE_HIGH (el) >> (HOST_BITS_PER_WIDE_INT - 1)) +- extend = -1; +- for (; i < elem_bitsize; i += value_bit) +- *vp++ = extend; +- } +- else +- { +- /* This is big enough for anything on the platform. */ +- long tmp[MAX_BITSIZE_MODE_ANY_MODE / 32]; +- scalar_float_mode el_mode; ++ unsigned int elt_bits = vector_element_size (GET_MODE_BITSIZE (mode), ++ GET_MODE_NUNITS (mode)); ++ if (elt_bits < BITS_PER_UNIT) ++ { ++ /* This is the only case in which elements can be smaller than a byte. ++ Element 0 is always in the lsb of the containing byte. */ ++ gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL); ++ for (unsigned int i = 0; i < builder.encoded_nelts (); ++i) ++ { ++ unsigned int bit_index = first_byte * BITS_PER_UNIT + i * elt_bits; ++ unsigned int byte_index = bit_index / BITS_PER_UNIT; ++ unsigned int lsb = bit_index % BITS_PER_UNIT; ++ builder.quick_push (bytes[byte_index] & (1 << lsb) ++ ? CONST1_RTX (BImode) ++ : CONST0_RTX (BImode)); ++ } ++ } ++ else ++ { ++ for (unsigned int i = 0; i < builder.encoded_nelts (); ++i) ++ { ++ rtx x = native_decode_rtx (GET_MODE_INNER (mode), bytes, first_byte); ++ if (!x) ++ return NULL_RTX; ++ builder.quick_push (x); ++ first_byte += elt_bits / BITS_PER_UNIT; ++ } ++ } ++ return builder.build (); ++} + +- el_mode = as_a (GET_MODE (el)); +- int bitsize = GET_MODE_BITSIZE (el_mode); ++/* Read an rtx of mode MODE from the target memory image given by BYTES, ++ starting at byte FIRST_BYTE. Each element of BYTES contains BITS_PER_UNIT ++ bits and the bytes are in target memory order. The image has enough ++ values to specify all bytes of MODE. + +- gcc_assert (bitsize <= elem_bitsize); +- gcc_assert (bitsize % value_bit == 0); ++ Return the rtx on success, otherwise return NULL_RTX. 
*/ + +- real_to_target (tmp, CONST_DOUBLE_REAL_VALUE (el), +- GET_MODE (el)); ++rtx ++native_decode_rtx (machine_mode mode, vec bytes, ++ unsigned int first_byte) ++{ ++ if (VECTOR_MODE_P (mode)) ++ { ++ /* If we know at compile time how many elements there are, ++ pull each element directly from BYTES. */ ++ unsigned int nelts; ++ if (GET_MODE_NUNITS (mode).is_constant (&nelts)) ++ return native_decode_vector_rtx (mode, bytes, first_byte, nelts, 1); ++ return NULL_RTX; ++ } + +- /* real_to_target produces its result in words affected by +- FLOAT_WORDS_BIG_ENDIAN. However, we ignore this, +- and use WORDS_BIG_ENDIAN instead; see the documentation +- of SUBREG in rtl.texi. */ +- for (i = 0; i < bitsize; i += value_bit) +- { +- int ibase; +- if (WORDS_BIG_ENDIAN) +- ibase = bitsize - 1 - i; +- else +- ibase = i; +- *vp++ = tmp[ibase / 32] >> i % 32; +- } ++ scalar_int_mode imode; ++ if (is_a (mode, &imode) ++ && GET_MODE_PRECISION (imode) <= MAX_BITSIZE_MODE_ANY_INT) ++ { ++ /* Pull the bytes msb first, so that we can use simple ++ shift-and-insert wide_int operations. */ ++ unsigned int size = GET_MODE_SIZE (imode); ++ wide_int result (wi::zero (GET_MODE_PRECISION (imode))); ++ for (unsigned int i = 0; i < size; ++i) ++ { ++ unsigned int lsb = (size - i - 1) * BITS_PER_UNIT; ++ /* Always constant because the inputs are. */ ++ unsigned int subbyte ++ = subreg_size_offset_from_lsb (1, size, lsb).to_constant (); ++ result <<= BITS_PER_UNIT; ++ result |= bytes[first_byte + subbyte]; ++ } ++ return immed_wide_int_const (result, imode); ++ } + +- /* It shouldn't matter what's done here, so fill it with +- zero. */ +- for (; i < elem_bitsize; i += value_bit) +- *vp++ = 0; +- } +- break; ++ scalar_float_mode fmode; ++ if (is_a (mode, &fmode)) ++ { ++ /* We need to build an array of integers in target memory order. ++ All integers before the last one have 32 bits; the last one may ++ have 32 bits or fewer, depending on whether the mode bitsize ++ is divisible by 32. */ ++ long el32[MAX_BITSIZE_MODE_ANY_MODE / 32]; ++ unsigned int num_el32 = CEIL (GET_MODE_BITSIZE (fmode), 32); ++ memset (el32, 0, num_el32 * sizeof (long)); ++ ++ /* The (maximum) number of target bytes per element of el32. */ ++ unsigned int bytes_per_el32 = 32 / BITS_PER_UNIT; ++ gcc_assert (bytes_per_el32 != 0); ++ ++ unsigned int mode_bytes = GET_MODE_SIZE (fmode); ++ for (unsigned int byte = 0; byte < mode_bytes; ++byte) ++ { ++ unsigned int index = byte / bytes_per_el32; ++ unsigned int subbyte = byte % bytes_per_el32; ++ unsigned int int_bytes = MIN (bytes_per_el32, ++ mode_bytes - index * bytes_per_el32); ++ /* Always constant because the inputs are. */ ++ unsigned int lsb ++ = subreg_size_lsb (1, int_bytes, subbyte).to_constant (); ++ el32[index] |= (unsigned long) bytes[first_byte + byte] << lsb; ++ } ++ REAL_VALUE_TYPE r; ++ real_from_target (&r, el32, fmode); ++ return const_double_from_real_value (r, fmode); ++ } + +- case CONST_FIXED: +- if (elem_bitsize <= HOST_BITS_PER_WIDE_INT) +- { +- for (i = 0; i < elem_bitsize; i += value_bit) +- *vp++ = CONST_FIXED_VALUE_LOW (el) >> i; +- } ++ if (ALL_SCALAR_FIXED_POINT_MODE_P (mode)) ++ { ++ scalar_mode smode = as_a (mode); ++ FIXED_VALUE_TYPE f; ++ f.data.low = 0; ++ f.data.high = 0; ++ f.mode = smode; ++ ++ unsigned int mode_bytes = GET_MODE_SIZE (smode); ++ for (unsigned int byte = 0; byte < mode_bytes; ++byte) ++ { ++ /* Always constant because the inputs are. 
*/ ++ unsigned int lsb ++ = subreg_size_lsb (1, mode_bytes, byte).to_constant (); ++ unsigned HOST_WIDE_INT unit = bytes[first_byte + byte]; ++ if (lsb >= HOST_BITS_PER_WIDE_INT) ++ f.data.high |= unit << (lsb - HOST_BITS_PER_WIDE_INT); + else +- { +- for (i = 0; i < HOST_BITS_PER_WIDE_INT; i += value_bit) +- *vp++ = CONST_FIXED_VALUE_LOW (el) >> i; +- for (; i < HOST_BITS_PER_DOUBLE_INT && i < elem_bitsize; +- i += value_bit) +- *vp++ = CONST_FIXED_VALUE_HIGH (el) +- >> (i - HOST_BITS_PER_WIDE_INT); +- for (; i < elem_bitsize; i += value_bit) +- *vp++ = 0; +- } +- break; +- +- default: +- gcc_unreachable (); ++ f.data.low |= unit << lsb; + } ++ return CONST_FIXED_FROM_FIXED_VALUE (f, mode); + } + +- /* Now, pick the right byte to start with. */ +- /* Renumber BYTE so that the least-significant byte is byte 0. A special +- case is paradoxical SUBREGs, which shouldn't be adjusted since they +- will already have offset 0. */ +- if (inner_bytes >= GET_MODE_SIZE (outermode)) ++ return NULL_RTX; ++} ++ ++/* Simplify a byte offset BYTE into CONST_VECTOR X. The main purpose ++ is to convert a runtime BYTE value into a constant one. */ ++ ++static poly_uint64 ++simplify_const_vector_byte_offset (rtx x, poly_uint64 byte) ++{ ++ /* Cope with MODE_VECTOR_BOOL by operating on bits rather than bytes. */ ++ machine_mode mode = GET_MODE (x); ++ unsigned int elt_bits = vector_element_size (GET_MODE_BITSIZE (mode), ++ GET_MODE_NUNITS (mode)); ++ /* The number of bits needed to encode one element from each pattern. */ ++ unsigned int sequence_bits = CONST_VECTOR_NPATTERNS (x) * elt_bits; ++ ++ /* Identify the start point in terms of a sequence number and a byte offset ++ within that sequence. */ ++ poly_uint64 first_sequence; ++ unsigned HOST_WIDE_INT subbit; ++ if (can_div_trunc_p (byte * BITS_PER_UNIT, sequence_bits, ++ &first_sequence, &subbit)) + { +- unsigned ibyte = inner_bytes - GET_MODE_SIZE (outermode) - byte; +- unsigned word_byte = WORDS_BIG_ENDIAN ? ibyte : byte; +- unsigned subword_byte = BYTES_BIG_ENDIAN ? ibyte : byte; +- byte = (subword_byte % UNITS_PER_WORD +- + (word_byte / UNITS_PER_WORD) * UNITS_PER_WORD); ++ unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x); ++ if (nelts_per_pattern == 1) ++ /* This is a duplicated vector, so the value of FIRST_SEQUENCE ++ doesn't matter. */ ++ byte = subbit / BITS_PER_UNIT; ++ else if (nelts_per_pattern == 2 && known_gt (first_sequence, 0U)) ++ { ++ /* The subreg drops the first element from each pattern and ++ only uses the second element. Find the first sequence ++ that starts on a byte boundary. */ ++ subbit += least_common_multiple (sequence_bits, BITS_PER_UNIT); ++ byte = subbit / BITS_PER_UNIT; ++ } + } ++ return byte; ++} ++ ++/* Subroutine of simplify_subreg in which: + +- /* BYTE should still be inside OP. (Note that BYTE is unsigned, +- so if it's become negative it will instead be very large.) */ +- gcc_assert (byte < inner_bytes); ++ - X is known to be a CONST_VECTOR ++ - OUTERMODE is known to be a vector mode + +- /* Convert from bytes to chunks of size value_bit. */ +- value_start = byte * (BITS_PER_UNIT / value_bit); ++ Try to handle the subreg by operating on the CONST_VECTOR encoding ++ rather than on each individual element of the CONST_VECTOR. + +- /* Re-pack the value. */ +- num_elem = GET_MODE_NUNITS (outermode); ++ Return the simplified subreg on success, otherwise return NULL_RTX. 
*/ + +- if (VECTOR_MODE_P (outermode)) ++static rtx ++simplify_const_vector_subreg (machine_mode outermode, rtx x, ++ machine_mode innermode, unsigned int first_byte) ++{ ++ /* Paradoxical subregs of vectors have dubious semantics. */ ++ if (paradoxical_subreg_p (outermode, innermode)) ++ return NULL_RTX; ++ ++ /* We can only preserve the semantics of a stepped pattern if the new ++ vector element is the same as the original one. */ ++ if (CONST_VECTOR_STEPPED_P (x) ++ && GET_MODE_INNER (outermode) != GET_MODE_INNER (innermode)) ++ return NULL_RTX; ++ ++ /* Cope with MODE_VECTOR_BOOL by operating on bits rather than bytes. */ ++ unsigned int x_elt_bits ++ = vector_element_size (GET_MODE_BITSIZE (innermode), ++ GET_MODE_NUNITS (innermode)); ++ unsigned int out_elt_bits ++ = vector_element_size (GET_MODE_BITSIZE (outermode), ++ GET_MODE_NUNITS (outermode)); ++ ++ /* The number of bits needed to encode one element from every pattern ++ of the original vector. */ ++ unsigned int x_sequence_bits = CONST_VECTOR_NPATTERNS (x) * x_elt_bits; ++ ++ /* The number of bits needed to encode one element from every pattern ++ of the result. */ ++ unsigned int out_sequence_bits ++ = least_common_multiple (x_sequence_bits, out_elt_bits); ++ ++ /* Work out the number of interleaved patterns in the output vector ++ and the number of encoded elements per pattern. */ ++ unsigned int out_npatterns = out_sequence_bits / out_elt_bits; ++ unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x); ++ ++ /* The encoding scheme requires the number of elements to be a multiple ++ of the number of patterns, so that each pattern appears at least once ++ and so that the same number of elements appear from each pattern. */ ++ bool ok_p = multiple_p (GET_MODE_NUNITS (outermode), out_npatterns); ++ unsigned int const_nunits; ++ if (GET_MODE_NUNITS (outermode).is_constant (&const_nunits) ++ && (!ok_p || out_npatterns * nelts_per_pattern > const_nunits)) + { +- result_v = rtvec_alloc (num_elem); +- elems = &RTVEC_ELT (result_v, 0); ++ /* Either the encoding is invalid, or applying it would give us ++ more elements than we need. Just encode each element directly. */ ++ out_npatterns = const_nunits; ++ nelts_per_pattern = 1; + } +- else +- elems = &result_s; ++ else if (!ok_p) ++ return NULL_RTX; + +- outer_submode = GET_MODE_INNER (outermode); +- outer_class = GET_MODE_CLASS (outer_submode); +- elem_bitsize = GET_MODE_BITSIZE (outer_submode); ++ /* Get enough bytes of X to form the new encoding. */ ++ unsigned int buffer_bits = out_npatterns * nelts_per_pattern * out_elt_bits; ++ unsigned int buffer_bytes = CEIL (buffer_bits, BITS_PER_UNIT); ++ auto_vec buffer (buffer_bytes); ++ if (!native_encode_rtx (innermode, x, buffer, first_byte, buffer_bytes)) ++ return NULL_RTX; + +- gcc_assert (elem_bitsize % value_bit == 0); +- gcc_assert (elem_bitsize + value_start * value_bit <= max_bitsize); ++ /* Reencode the bytes as OUTERMODE. */ ++ return native_decode_vector_rtx (outermode, buffer, 0, out_npatterns, ++ nelts_per_pattern); ++} + +- for (elem = 0; elem < num_elem; elem++) +- { +- unsigned char *vp; ++/* Try to simplify a subreg of a constant by encoding the subreg region ++ as a sequence of target bytes and reading them back in the new mode. ++ Return the new value on success, otherwise return null. + +- /* Vectors are stored in target memory order. (This is probably +- a mistake.) 
*/ +- { +- unsigned byte = (elem * elem_bitsize) / BITS_PER_UNIT; +- unsigned ibyte = (((num_elem - 1 - elem) * elem_bitsize) +- / BITS_PER_UNIT); +- unsigned word_byte = WORDS_BIG_ENDIAN ? ibyte : byte; +- unsigned subword_byte = BYTES_BIG_ENDIAN ? ibyte : byte; +- unsigned bytele = (subword_byte % UNITS_PER_WORD +- + (word_byte / UNITS_PER_WORD) * UNITS_PER_WORD); +- vp = value + value_start + (bytele * BITS_PER_UNIT) / value_bit; +- } ++ The subreg has outer mode OUTERMODE, inner mode INNERMODE, inner value X ++ and byte offset FIRST_BYTE. */ + +- switch (outer_class) +- { +- case MODE_INT: +- case MODE_PARTIAL_INT: +- { +- int u; +- int base = 0; +- int units +- = (GET_MODE_BITSIZE (outer_submode) + HOST_BITS_PER_WIDE_INT - 1) +- / HOST_BITS_PER_WIDE_INT; +- HOST_WIDE_INT tmp[MAX_BITSIZE_MODE_ANY_INT / HOST_BITS_PER_WIDE_INT]; +- wide_int r; +- +- if (GET_MODE_PRECISION (outer_submode) > MAX_BITSIZE_MODE_ANY_INT) +- return NULL_RTX; +- for (u = 0; u < units; u++) +- { +- unsigned HOST_WIDE_INT buf = 0; +- for (i = 0; +- i < HOST_BITS_PER_WIDE_INT && base + i < elem_bitsize; +- i += value_bit) +- buf |= (unsigned HOST_WIDE_INT)(*vp++ & value_mask) << i; +- +- tmp[u] = buf; +- base += HOST_BITS_PER_WIDE_INT; +- } +- r = wide_int::from_array (tmp, units, +- GET_MODE_PRECISION (outer_submode)); +-#if TARGET_SUPPORTS_WIDE_INT == 0 +- /* Make sure r will fit into CONST_INT or CONST_DOUBLE. */ +- if (wi::min_precision (r, SIGNED) > HOST_BITS_PER_DOUBLE_INT) +- return NULL_RTX; +-#endif +- elems[elem] = immed_wide_int_const (r, outer_submode); +- } +- break; ++static rtx ++simplify_immed_subreg (fixed_size_mode outermode, rtx x, ++ machine_mode innermode, unsigned int first_byte) ++{ ++ unsigned int buffer_bytes = GET_MODE_SIZE (outermode); ++ auto_vec buffer (buffer_bytes); + +- case MODE_FLOAT: +- case MODE_DECIMAL_FLOAT: +- { +- REAL_VALUE_TYPE r; +- long tmp[MAX_BITSIZE_MODE_ANY_MODE / 32] = { 0 }; +- +- /* real_from_target wants its input in words affected by +- FLOAT_WORDS_BIG_ENDIAN. However, we ignore this, +- and use WORDS_BIG_ENDIAN instead; see the documentation +- of SUBREG in rtl.texi. */ +- for (i = 0; i < elem_bitsize; i += value_bit) +- { +- int ibase; +- if (WORDS_BIG_ENDIAN) +- ibase = elem_bitsize - 1 - i; +- else +- ibase = i; +- tmp[ibase / 32] |= (*vp++ & value_mask) << i % 32; +- } ++ /* Some ports misuse CCmode. */ ++ if (GET_MODE_CLASS (outermode) == MODE_CC && CONST_INT_P (x)) ++ return x; + +- real_from_target (&r, tmp, outer_submode); +- elems[elem] = const_double_from_real_value (r, outer_submode); +- } +- break; ++ /* Paradoxical subregs read undefined values for bytes outside of the ++ inner value. However, we have traditionally always sign-extended ++ integer constants and zero-extended others. 
*/ ++ unsigned int inner_bytes = buffer_bytes; ++ if (paradoxical_subreg_p (outermode, innermode)) ++ { ++ if (!GET_MODE_SIZE (innermode).is_constant (&inner_bytes)) ++ return NULL_RTX; + +- case MODE_FRACT: +- case MODE_UFRACT: +- case MODE_ACCUM: +- case MODE_UACCUM: +- { +- FIXED_VALUE_TYPE f; +- f.data.low = 0; +- f.data.high = 0; +- f.mode = outer_submode; +- +- for (i = 0; +- i < HOST_BITS_PER_WIDE_INT && i < elem_bitsize; +- i += value_bit) +- f.data.low |= (unsigned HOST_WIDE_INT)(*vp++ & value_mask) << i; +- for (; i < elem_bitsize; i += value_bit) +- f.data.high |= ((unsigned HOST_WIDE_INT)(*vp++ & value_mask) +- << (i - HOST_BITS_PER_WIDE_INT)); +- +- elems[elem] = CONST_FIXED_FROM_FIXED_VALUE (f, outer_submode); +- } +- break; ++ target_unit filler = 0; ++ if (CONST_SCALAR_INT_P (x) && wi::neg_p (rtx_mode_t (x, innermode))) ++ filler = -1; + +- default: +- gcc_unreachable (); +- } ++ /* Add any leading bytes due to big-endian layout. The number of ++ bytes must be constant because both modes have constant size. */ ++ unsigned int leading_bytes ++ = -byte_lowpart_offset (outermode, innermode).to_constant (); ++ for (unsigned int i = 0; i < leading_bytes; ++i) ++ buffer.quick_push (filler); ++ ++ if (!native_encode_rtx (innermode, x, buffer, first_byte, inner_bytes)) ++ return NULL_RTX; ++ ++ /* Add any trailing bytes due to little-endian layout. */ ++ while (buffer.length () < buffer_bytes) ++ buffer.quick_push (filler); + } +- if (VECTOR_MODE_P (outermode)) +- return gen_rtx_CONST_VECTOR (outermode, result_v); + else +- return result_s; ++ { ++ if (!native_encode_rtx (innermode, x, buffer, first_byte, inner_bytes)) ++ return NULL_RTX; ++ } ++ return native_decode_rtx (outermode, buffer, 0); + } + + /* Simplify SUBREG:OUTERMODE(OP:INNERMODE, BYTE) +@@ -6468,6 +6625,9 @@ simplify_subreg (machine_mode outermode, rtx op, + if (outermode == innermode && known_eq (byte, 0U)) + return op; + ++ if (GET_CODE (op) == CONST_VECTOR) ++ byte = simplify_const_vector_byte_offset (op, byte); ++ + if (multiple_p (byte, GET_MODE_UNIT_SIZE (innermode))) + { + rtx elt; +@@ -6487,30 +6647,21 @@ simplify_subreg (machine_mode outermode, rtx op, + || CONST_FIXED_P (op) + || GET_CODE (op) == CONST_VECTOR) + { +- /* simplify_immed_subreg deconstructs OP into bytes and constructs +- the result from bytes, so it only works if the sizes of the modes +- and the value of the offset are known at compile time. Cases that +- that apply to general modes and offsets should be handled here +- before calling simplify_immed_subreg. */ +- fixed_size_mode fs_outermode, fs_innermode; + unsigned HOST_WIDE_INT cbyte; +- if (is_a (outermode, &fs_outermode) +- && is_a (innermode, &fs_innermode) +- && byte.is_constant (&cbyte)) +- return simplify_immed_subreg (fs_outermode, op, fs_innermode, cbyte, +- 0, GET_MODE_SIZE (fs_innermode)); +- +- /* Handle constant-sized outer modes and variable-sized inner modes. 
*/ +- unsigned HOST_WIDE_INT first_elem; +- if (GET_CODE (op) == CONST_VECTOR +- && is_a (outermode, &fs_outermode) +- && constant_multiple_p (byte, GET_MODE_UNIT_SIZE (innermode), +- &first_elem)) +- return simplify_immed_subreg (fs_outermode, op, innermode, 0, +- first_elem, +- GET_MODE_SIZE (fs_outermode)); ++ if (byte.is_constant (&cbyte)) ++ { ++ if (GET_CODE (op) == CONST_VECTOR && VECTOR_MODE_P (outermode)) ++ { ++ rtx tmp = simplify_const_vector_subreg (outermode, op, ++ innermode, cbyte); ++ if (tmp) ++ return tmp; ++ } + +- return NULL_RTX; ++ fixed_size_mode fs_outermode; ++ if (is_a (outermode, &fs_outermode)) ++ return simplify_immed_subreg (fs_outermode, op, innermode, cbyte); ++ } + } + + /* Changing mode twice with SUBREG => just change it once, +@@ -6952,6 +7103,18 @@ test_vector_ops_duplicate (machine_mode mode, rtx scalar_reg) + && mode_for_vector (inner_mode, 2).exists (&narrower_mode) + && VECTOR_MODE_P (narrower_mode)) + { ++ /* Test VEC_DUPLICATE of a vector. */ ++ rtx_vector_builder nbuilder (narrower_mode, 2, 1); ++ nbuilder.quick_push (const0_rtx); ++ nbuilder.quick_push (const1_rtx); ++ rtx_vector_builder builder (mode, 2, 1); ++ builder.quick_push (const0_rtx); ++ builder.quick_push (const1_rtx); ++ ASSERT_RTX_EQ (builder.build (), ++ simplify_unary_operation (VEC_DUPLICATE, mode, ++ nbuilder.build (), ++ narrower_mode)); ++ + /* Test VEC_SELECT of a vector. */ + rtx vec_par + = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, const1_rtx, const0_rtx)); +@@ -7024,6 +7187,58 @@ test_vector_ops_series (machine_mode mode, rtx scalar_reg) + ASSERT_RTX_EQ (series_0_m1, + simplify_binary_operation (VEC_SERIES, mode, const0_rtx, + constm1_rtx)); ++ ++ /* Test NEG on constant vector series. */ ++ ASSERT_RTX_EQ (series_0_m1, ++ simplify_unary_operation (NEG, mode, series_0_1, mode)); ++ ASSERT_RTX_EQ (series_0_1, ++ simplify_unary_operation (NEG, mode, series_0_m1, mode)); ++ ++ /* Test PLUS and MINUS on constant vector series. */ ++ rtx scalar2 = gen_int_mode (2, inner_mode); ++ rtx scalar3 = gen_int_mode (3, inner_mode); ++ rtx series_1_1 = gen_const_vec_series (mode, const1_rtx, const1_rtx); ++ rtx series_0_2 = gen_const_vec_series (mode, const0_rtx, scalar2); ++ rtx series_1_3 = gen_const_vec_series (mode, const1_rtx, scalar3); ++ ASSERT_RTX_EQ (series_1_1, ++ simplify_binary_operation (PLUS, mode, series_0_1, ++ CONST1_RTX (mode))); ++ ASSERT_RTX_EQ (series_0_m1, ++ simplify_binary_operation (PLUS, mode, CONST0_RTX (mode), ++ series_0_m1)); ++ ASSERT_RTX_EQ (series_1_3, ++ simplify_binary_operation (PLUS, mode, series_1_1, ++ series_0_2)); ++ ASSERT_RTX_EQ (series_0_1, ++ simplify_binary_operation (MINUS, mode, series_1_1, ++ CONST1_RTX (mode))); ++ ASSERT_RTX_EQ (series_1_1, ++ simplify_binary_operation (MINUS, mode, CONST1_RTX (mode), ++ series_0_m1)); ++ ASSERT_RTX_EQ (series_1_1, ++ simplify_binary_operation (MINUS, mode, series_1_3, ++ series_0_2)); ++ ++ /* Test MULT between constant vectors. 
*/ ++ rtx vec2 = gen_const_vec_duplicate (mode, scalar2); ++ rtx vec3 = gen_const_vec_duplicate (mode, scalar3); ++ rtx scalar9 = gen_int_mode (9, inner_mode); ++ rtx series_3_9 = gen_const_vec_series (mode, scalar3, scalar9); ++ ASSERT_RTX_EQ (series_0_2, ++ simplify_binary_operation (MULT, mode, series_0_1, vec2)); ++ ASSERT_RTX_EQ (series_3_9, ++ simplify_binary_operation (MULT, mode, vec3, series_1_3)); ++ if (!GET_MODE_NUNITS (mode).is_constant ()) ++ ASSERT_FALSE (simplify_binary_operation (MULT, mode, series_0_1, ++ series_0_1)); ++ ++ /* Test ASHIFT between constant vectors. */ ++ ASSERT_RTX_EQ (series_0_2, ++ simplify_binary_operation (ASHIFT, mode, series_0_1, ++ CONST1_RTX (mode))); ++ if (!GET_MODE_NUNITS (mode).is_constant ()) ++ ASSERT_FALSE (simplify_binary_operation (ASHIFT, mode, CONST1_RTX (mode), ++ series_0_1)); + } + + /* Verify simplify_merge_mask works correctly. */ +@@ -7089,6 +7304,165 @@ test_vec_merge (machine_mode mode) + simplify_rtx (nvm)); + } + ++/* Test subregs of integer vector constant X, trying elements in ++ the range [ELT_BIAS, ELT_BIAS + constant_lower_bound (NELTS)), ++ where NELTS is the number of elements in X. Subregs involving ++ elements [ELT_BIAS, ELT_BIAS + FIRST_VALID) are expected to fail. */ ++ ++static void ++test_vector_subregs_modes (rtx x, poly_uint64 elt_bias = 0, ++ unsigned int first_valid = 0) ++{ ++ machine_mode inner_mode = GET_MODE (x); ++ scalar_mode int_mode = GET_MODE_INNER (inner_mode); ++ ++ for (unsigned int modei = 0; modei < NUM_MACHINE_MODES; ++modei) ++ { ++ machine_mode outer_mode = (machine_mode) modei; ++ if (!VECTOR_MODE_P (outer_mode)) ++ continue; ++ ++ unsigned int outer_nunits; ++ if (GET_MODE_INNER (outer_mode) == int_mode ++ && GET_MODE_NUNITS (outer_mode).is_constant (&outer_nunits) ++ && multiple_p (GET_MODE_NUNITS (inner_mode), outer_nunits)) ++ { ++ /* Test subregs in which the outer mode is a smaller, ++ constant-sized vector of the same element type. */ ++ unsigned int limit ++ = constant_lower_bound (GET_MODE_NUNITS (inner_mode)); ++ for (unsigned int elt = 0; elt < limit; elt += outer_nunits) ++ { ++ rtx expected = NULL_RTX; ++ if (elt >= first_valid) ++ { ++ rtx_vector_builder builder (outer_mode, outer_nunits, 1); ++ for (unsigned int i = 0; i < outer_nunits; ++i) ++ builder.quick_push (CONST_VECTOR_ELT (x, elt + i)); ++ expected = builder.build (); ++ } ++ poly_uint64 byte = (elt_bias + elt) * GET_MODE_SIZE (int_mode); ++ ASSERT_RTX_EQ (expected, ++ simplify_subreg (outer_mode, x, ++ inner_mode, byte)); ++ } ++ } ++ else if (known_eq (GET_MODE_SIZE (outer_mode), ++ GET_MODE_SIZE (inner_mode)) ++ && known_eq (elt_bias, 0U) ++ && (GET_MODE_CLASS (outer_mode) != MODE_VECTOR_BOOL ++ || known_eq (GET_MODE_BITSIZE (outer_mode), ++ GET_MODE_NUNITS (outer_mode))) ++ && (!FLOAT_MODE_P (outer_mode) ++ || (FLOAT_MODE_FORMAT (outer_mode)->ieee_bits ++ == GET_MODE_UNIT_PRECISION (outer_mode))) ++ && (GET_MODE_SIZE (inner_mode).is_constant () ++ || !CONST_VECTOR_STEPPED_P (x))) ++ { ++ /* Try converting to OUTER_MODE and back. */ ++ rtx outer_x = simplify_subreg (outer_mode, x, inner_mode, 0); ++ ASSERT_TRUE (outer_x != NULL_RTX); ++ ASSERT_RTX_EQ (x, simplify_subreg (inner_mode, outer_x, ++ outer_mode, 0)); ++ } ++ } ++ ++ if (BYTES_BIG_ENDIAN == WORDS_BIG_ENDIAN) ++ { ++ /* Test each byte in the element range. 
*/ ++ unsigned int limit ++ = constant_lower_bound (GET_MODE_SIZE (inner_mode)); ++ for (unsigned int i = 0; i < limit; ++i) ++ { ++ unsigned int elt = i / GET_MODE_SIZE (int_mode); ++ rtx expected = NULL_RTX; ++ if (elt >= first_valid) ++ { ++ unsigned int byte_shift = i % GET_MODE_SIZE (int_mode); ++ if (BYTES_BIG_ENDIAN) ++ byte_shift = GET_MODE_SIZE (int_mode) - byte_shift - 1; ++ rtx_mode_t vec_elt (CONST_VECTOR_ELT (x, elt), int_mode); ++ wide_int shifted_elt ++ = wi::lrshift (vec_elt, byte_shift * BITS_PER_UNIT); ++ expected = immed_wide_int_const (shifted_elt, QImode); ++ } ++ poly_uint64 byte = elt_bias * GET_MODE_SIZE (int_mode) + i; ++ ASSERT_RTX_EQ (expected, ++ simplify_subreg (QImode, x, inner_mode, byte)); ++ } ++ } ++} ++ ++/* Test constant subregs of integer vector mode INNER_MODE, using 1 ++ element per pattern. */ ++ ++static void ++test_vector_subregs_repeating (machine_mode inner_mode) ++{ ++ poly_uint64 nunits = GET_MODE_NUNITS (inner_mode); ++ unsigned int min_nunits = constant_lower_bound (nunits); ++ scalar_mode int_mode = GET_MODE_INNER (inner_mode); ++ unsigned int count = gcd (min_nunits, 8); ++ ++ rtx_vector_builder builder (inner_mode, count, 1); ++ for (unsigned int i = 0; i < count; ++i) ++ builder.quick_push (gen_int_mode (8 - i, int_mode)); ++ rtx x = builder.build (); ++ ++ test_vector_subregs_modes (x); ++ if (!nunits.is_constant ()) ++ test_vector_subregs_modes (x, nunits - min_nunits); ++} ++ ++/* Test constant subregs of integer vector mode INNER_MODE, using 2 ++ elements per pattern. */ ++ ++static void ++test_vector_subregs_fore_back (machine_mode inner_mode) ++{ ++ poly_uint64 nunits = GET_MODE_NUNITS (inner_mode); ++ unsigned int min_nunits = constant_lower_bound (nunits); ++ scalar_mode int_mode = GET_MODE_INNER (inner_mode); ++ unsigned int count = gcd (min_nunits, 4); ++ ++ rtx_vector_builder builder (inner_mode, count, 2); ++ for (unsigned int i = 0; i < count; ++i) ++ builder.quick_push (gen_int_mode (i, int_mode)); ++ for (unsigned int i = 0; i < count; ++i) ++ builder.quick_push (gen_int_mode (-(int) i, int_mode)); ++ rtx x = builder.build (); ++ ++ test_vector_subregs_modes (x); ++ if (!nunits.is_constant ()) ++ test_vector_subregs_modes (x, nunits - min_nunits, count); ++} ++ ++/* Test constant subregs of integer vector mode INNER_MODE, using 3 ++ elements per pattern. */ ++ ++static void ++test_vector_subregs_stepped (machine_mode inner_mode) ++{ ++ /* Build { 0, 1, 2, 3, ... }. */ ++ scalar_mode int_mode = GET_MODE_INNER (inner_mode); ++ rtx_vector_builder builder (inner_mode, 1, 3); ++ for (unsigned int i = 0; i < 3; ++i) ++ builder.quick_push (gen_int_mode (i, int_mode)); ++ rtx x = builder.build (); ++ ++ test_vector_subregs_modes (x); ++} ++ ++/* Test constant subregs of integer vector mode INNER_MODE. */ ++ ++static void ++test_vector_subregs (machine_mode inner_mode) ++{ ++ test_vector_subregs_repeating (inner_mode); ++ test_vector_subregs_fore_back (inner_mode); ++ test_vector_subregs_stepped (inner_mode); ++} ++ + /* Verify some simplifications involving vectors. 
*/ + + static void +@@ -7103,7 +7477,10 @@ test_vector_ops () + test_vector_ops_duplicate (mode, scalar_reg); + if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT + && maybe_gt (GET_MODE_NUNITS (mode), 2)) +- test_vector_ops_series (mode, scalar_reg); ++ { ++ test_vector_ops_series (mode, scalar_reg); ++ test_vector_subregs (mode); ++ } + test_vec_merge (mode); + } + } +diff --git a/gcc/stack-ptr-mod.c b/gcc/stack-ptr-mod.c +index a10d59b61..5cb95e712 100644 +--- a/gcc/stack-ptr-mod.c ++++ b/gcc/stack-ptr-mod.c +@@ -91,9 +91,7 @@ pass_stack_ptr_mod::execute (function *fun) + if (INSN_P (insn)) + { + /* Check if insn modifies the stack pointer. */ +- note_stores (PATTERN (insn), +- notice_stack_pointer_modification_1, +- NULL); ++ note_stores (insn, notice_stack_pointer_modification_1, NULL); + if (! crtl->sp_is_unchanging) + return 0; + } +diff --git a/gcc/stor-layout.c b/gcc/stor-layout.c +index a054b7887..7d1917f82 100644 +--- a/gcc/stor-layout.c ++++ b/gcc/stor-layout.c +@@ -42,6 +42,7 @@ along with GCC; see the file COPYING3. If not see + #include "gimplify.h" + #include "attribs.h" + #include "debug.h" ++#include "calls.h" + + /* Data type for the expressions representing sizes of data types. + It is the first integer type laid out. */ +@@ -1835,7 +1836,8 @@ compute_record_mode (tree type) + line. */ + SET_TYPE_MODE (type, BLKmode); + +- if (! tree_fits_uhwi_p (TYPE_SIZE (type))) ++ poly_uint64 type_size; ++ if (!poly_int_tree_p (TYPE_SIZE (type), &type_size)) + return; + + /* A record which has any BLKmode members must itself be +@@ -1846,20 +1848,21 @@ compute_record_mode (tree type) + if (TREE_CODE (field) != FIELD_DECL) + continue; + ++ poly_uint64 field_size; + if (TREE_CODE (TREE_TYPE (field)) == ERROR_MARK + || (TYPE_MODE (TREE_TYPE (field)) == BLKmode + && ! TYPE_NO_FORCE_BLK (TREE_TYPE (field)) + && !(TYPE_SIZE (TREE_TYPE (field)) != 0 + && integer_zerop (TYPE_SIZE (TREE_TYPE (field))))) +- || ! tree_fits_uhwi_p (bit_position (field)) ++ || !tree_fits_poly_uint64_p (bit_position (field)) + || DECL_SIZE (field) == 0 +- || ! tree_fits_uhwi_p (DECL_SIZE (field))) ++ || !poly_int_tree_p (DECL_SIZE (field), &field_size)) + return; + + /* If this field is the whole struct, remember its mode so + that, say, we can put a double in a class into a DF + register instead of forcing it to live in the stack. */ +- if (simple_cst_equal (TYPE_SIZE (type), DECL_SIZE (field)) ++ if (known_eq (field_size, type_size) + /* Partial int types (e.g. __int20) may have TYPE_SIZE equal to + wider types (e.g. int32), despite precision being less. Ensure + that the TYPE_MODE of the struct does not get set to the partial +@@ -1879,15 +1882,14 @@ compute_record_mode (tree type) + For UNION_TYPE, if the widest field is MODE_INT then use that mode. + If the widest field is MODE_PARTIAL_INT, and the union will be passed + by reference, then use that mode. 
*/ +- poly_uint64 type_size; + if ((TREE_CODE (type) == RECORD_TYPE + || (TREE_CODE (type) == UNION_TYPE + && (GET_MODE_CLASS (mode) == MODE_INT + || (GET_MODE_CLASS (mode) == MODE_PARTIAL_INT +- && targetm.calls.pass_by_reference (pack_cumulative_args (0), +- mode, type, 0))))) ++ && (targetm.calls.pass_by_reference ++ (pack_cumulative_args (0), ++ function_arg_info (type, mode, /*named=*/false))))))) + && mode != VOIDmode +- && poly_int_tree_p (TYPE_SIZE (type), &type_size) + && known_eq (GET_MODE_BITSIZE (mode), type_size)) + ; + else +diff --git a/gcc/target-globals.c b/gcc/target-globals.c +index 94a465c91..00bbda69c 100644 +--- a/gcc/target-globals.c ++++ b/gcc/target-globals.c +@@ -40,6 +40,7 @@ along with GCC; see the file COPYING3. If not see + #include "gcse.h" + #include "bb-reorder.h" + #include "lower-subreg.h" ++#include "function-abi.h" + + #if SWITCHABLE_TARGET + struct target_globals default_target_globals = { +@@ -48,6 +49,7 @@ struct target_globals default_target_globals = { + &default_target_rtl, + &default_target_recog, + &default_target_hard_regs, ++ &default_target_function_abi_info, + &default_target_reload, + &default_target_expmed, + &default_target_optabs, +@@ -70,6 +72,7 @@ save_target_globals (void) + g->rtl = ggc_cleared_alloc (); + g->recog = XCNEW (struct target_recog); + g->hard_regs = XCNEW (struct target_hard_regs); ++ g->function_abi_info = XCNEW (struct target_function_abi_info); + g->reload = XCNEW (struct target_reload); + g->expmed = XCNEW (struct target_expmed); + g->optabs = XCNEW (struct target_optabs); +@@ -127,6 +130,7 @@ target_globals::~target_globals () + XDELETE (regs); + XDELETE (recog); + XDELETE (hard_regs); ++ XDELETE (function_abi_info); + XDELETE (reload); + XDELETE (expmed); + XDELETE (optabs); +diff --git a/gcc/target-globals.h b/gcc/target-globals.h +index 5af846c9f..f21580be6 100644 +--- a/gcc/target-globals.h ++++ b/gcc/target-globals.h +@@ -26,6 +26,7 @@ extern struct target_regs *this_target_regs; + extern struct target_rtl *this_target_rtl; + extern struct target_recog *this_target_recog; + extern struct target_hard_regs *this_target_hard_regs; ++extern struct target_function_abi_info *this_target_function_abi_info; + extern struct target_reload *this_target_reload; + extern struct target_expmed *this_target_expmed; + extern struct target_optabs *this_target_optabs; +@@ -47,6 +48,7 @@ struct GTY(()) target_globals { + struct target_rtl *rtl; + struct target_recog *GTY((skip)) recog; + struct target_hard_regs *GTY((skip)) hard_regs; ++ struct target_function_abi_info *GTY((skip)) function_abi_info; + struct target_reload *GTY((skip)) reload; + struct target_expmed *GTY((skip)) expmed; + struct target_optabs *GTY((skip)) optabs; +@@ -74,6 +76,7 @@ restore_target_globals (struct target_globals *g) + this_target_rtl = g->rtl; + this_target_recog = g->recog; + this_target_hard_regs = g->hard_regs; ++ this_target_function_abi_info = g->function_abi_info; + this_target_reload = g->reload; + this_target_expmed = g->expmed; + this_target_optabs = g->optabs; +diff --git a/gcc/target.def b/gcc/target.def +index f998470ff..05389cdd1 100644 +--- a/gcc/target.def ++++ b/gcc/target.def +@@ -1782,22 +1782,6 @@ return type of the vectorized function shall be of vector type\n\ + tree, (tree fndecl, tree vec_type_out, tree vec_type_in), + default_builtin_md_vectorized_function) + +-/* Returns a function declaration for a builtin that realizes the +- vector conversion, or NULL_TREE if not available. 
*/ +-DEFHOOK +-(builtin_conversion, +- "This hook should return the DECL of a function that implements conversion of the\n\ +-input vector of type @var{src_type} to type @var{dest_type}.\n\ +-The value of @var{code} is one of the enumerators in @code{enum tree_code} and\n\ +-specifies how the conversion is to be applied\n\ +-(truncation, rounding, etc.).\n\ +-\n\ +-If this hook is defined, the autovectorizer will use the\n\ +-@code{TARGET_VECTORIZE_BUILTIN_CONVERSION} target hook when vectorizing\n\ +-conversion. Otherwise, it will return @code{NULL_TREE}.", +- tree, (unsigned code, tree dest_type, tree src_type), +- default_builtin_vectorized_conversion) +- + /* Cost of different vector/scalar statements in vectorization cost + model. In case of misaligned vector loads and stores the cost depends + on the data type and misalignment value. */ +@@ -2431,6 +2415,24 @@ another @code{CALL_EXPR}.\n\ + @var{arglist} really has type @samp{VEC(tree,gc)*}", + tree, (unsigned int /*location_t*/ loc, tree fndecl, void *arglist), NULL) + ++DEFHOOK ++(check_builtin_call, ++ "Perform semantic checking on a call to a machine-specific built-in\n\ ++function after its arguments have been constrained to the function\n\ ++signature. Return true if the call is valid, otherwise report an error\n\ ++and return false.\n\ ++\n\ ++This hook is called after @code{TARGET_RESOLVE_OVERLOADED_BUILTIN}.\n\ ++The call was originally to built-in function @var{orig_fndecl},\n\ ++but after the optional @code{TARGET_RESOLVE_OVERLOADED_BUILTIN}\n\ ++step is now to built-in function @var{fndecl}. @var{loc} is the\n\ ++location of the call and @var{args} is an array of function arguments,\n\ ++of which there are @var{nargs}. @var{arg_loc} specifies the location\n\ ++of each argument.", ++ bool, (location_t loc, vec arg_loc, tree fndecl, ++ tree orig_fndecl, unsigned int nargs, tree *args), ++ NULL) ++ + /* Fold a target-specific builtin to a tree valid for both GIMPLE + and GENERIC. */ + DEFHOOK +@@ -2624,38 +2626,6 @@ DEFHOOK + bool, (const rtx_insn *follower, const rtx_insn *followee), + hook_bool_const_rtx_insn_const_rtx_insn_true) + +-/* Return a register class for which branch target register +- optimizations should be applied. */ +-DEFHOOK +-(branch_target_register_class, +- "This target hook returns a register class for which branch target register\n\ +-optimizations should be applied. All registers in this class should be\n\ +-usable interchangeably. After reload, registers in this class will be\n\ +-re-allocated and loads will be hoisted out of loops and be subjected\n\ +-to inter-block scheduling.", +- reg_class_t, (void), +- default_branch_target_register_class) +- +-/* Return true if branch target register optimizations should include +- callee-saved registers that are not already live during the current +- function. AFTER_PE_GEN is true if prologues and epilogues have +- already been generated. */ +-DEFHOOK +-(branch_target_register_callee_saved, +- "Branch target register optimization will by default exclude callee-saved\n\ +-registers\n\ +-that are not already live during the current function; if this target hook\n\ +-returns true, they will be included. The target code must than make sure\n\ +-that all target registers in the class returned by\n\ +-@samp{TARGET_BRANCH_TARGET_REGISTER_CLASS} that might need saving are\n\ +-saved. @var{after_prologue_epilogue_gen} indicates if prologues and\n\ +-epilogues have already been generated. 
Note, even if you only return\n\ +-true when @var{after_prologue_epilogue_gen} is false, you still are likely\n\ +-to have to make special provisions in @code{INITIAL_ELIMINATION_OFFSET}\n\ +-to reserve space for caller-saved target registers.", +- bool, (bool after_prologue_epilogue_gen), +- hook_bool_bool_false) +- + /* Return true if the target supports conditional execution. */ + DEFHOOK + (have_conditional_execution, +@@ -3407,6 +3377,29 @@ must have move patterns for this mode.", + bool, (machine_mode mode), + hook_bool_mode_false) + ++DEFHOOK ++(compatible_vector_types_p, ++ "Return true if there is no target-specific reason for treating\n\ ++vector types @var{type1} and @var{type2} as distinct types. The caller\n\ ++has already checked for target-independent reasons, meaning that the\n\ ++types are known to have the same mode, to have the same number of elements,\n\ ++and to have what the caller considers to be compatible element types.\n\ ++\n\ ++The main reason for defining this hook is to reject pairs of types\n\ ++that are handled differently by the target's calling convention.\n\ ++For example, when a new @var{N}-bit vector architecture is added\n\ ++to a target, the target may want to handle normal @var{N}-bit\n\ ++@code{VECTOR_TYPE} arguments and return values in the same way as\n\ ++before, to maintain backwards compatibility. However, it may also\n\ ++provide new, architecture-specific @code{VECTOR_TYPE}s that are passed\n\ ++and returned in a more efficient way. It is then important to maintain\n\ ++a distinction between the ``normal'' @code{VECTOR_TYPE}s and the new\n\ ++architecture-specific ones.\n\ ++\n\ ++The default implementation returns true, which is correct for most targets.", ++ bool, (const_tree type1, const_tree type2), ++ hook_bool_const_tree_const_tree_true) ++ + DEFHOOK + (vector_alignment, + "This hook can be used to define the alignment for a vector of type\n\ +@@ -3569,7 +3562,7 @@ two areas of memory, or to set, clear or store to memory, for example\n\ + when copying a @code{struct}. The @code{by_pieces} infrastructure\n\ + implements such memory operations as a sequence of load, store or move\n\ + insns. Alternate strategies are to expand the\n\ +-@code{movmem} or @code{setmem} optabs, to emit a library call, or to emit\n\ ++@code{cpymem} or @code{setmem} optabs, to emit a library call, or to emit\n\ + unit-by-unit, loop-based operations.\n\ + \n\ + This target hook should return true if, for a memory operation with a\n\ +@@ -3588,7 +3581,7 @@ optimized for speed rather than size.\n\ + \n\ + Returning true for higher values of @var{size} can improve code generation\n\ + for speed if the target does not provide an implementation of the\n\ +-@code{movmem} or @code{setmem} standard names, if the @code{movmem} or\n\ ++@code{cpymem} or @code{setmem} standard names, if the @code{cpymem} or\n\ + @code{setmem} implementation would be more expensive than a sequence of\n\ + insns, or if the overhead of a library call would dominate that of\n\ + the body of the memory operation.\n\ +@@ -4479,18 +4472,18 @@ or 3-byte structure is returned at the most significant end of a\n\ + from __builtin_va_arg. */ + DEFHOOK + (pass_by_reference, +- "This target hook should return @code{true} if an argument at the\n\ ++ "This target hook should return @code{true} if argument @var{arg} at the\n\ + position indicated by @var{cum} should be passed by reference. 
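The TARGET_COMPATIBLE_VECTOR_TYPES_P documentation above suggests implementations of roughly the following shape; example_abi_vector_type_p is a hypothetical predicate that recognises the target's own ABI-specific vector types, so this is only a sketch of the intended pattern, not a definition this patch supplies.

  /* Hypothetical helper, assumed to exist elsewhere in the port: true if
     TYPE is one of the target's own ABI-specific vector types.  */
  extern bool example_abi_vector_type_p (const_tree type);

  static bool
  example_compatible_vector_types_p (const_tree type1, const_tree type2)
  {
    /* Two vector types are interchangeable only if they are either both
       ABI-specific or both generic GNU vector types; same-mode pairs that
       straddle the two groups stay distinct.  */
    return (example_abi_vector_type_p (type1)
            == example_abi_vector_type_p (type2));
  }

  #define TARGET_COMPATIBLE_VECTOR_TYPES_P example_compatible_vector_types_p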
This\n\ + predicate is queried after target independent reasons for being\n\ +-passed by reference, such as @code{TREE_ADDRESSABLE (type)}.\n\ ++passed by reference, such as @code{TREE_ADDRESSABLE (@var{arg}.type)}.\n\ + \n\ + If the hook returns true, a copy of that argument is made in memory and a\n\ + pointer to the argument is passed instead of the argument itself.\n\ + The pointer is passed in whatever way is appropriate for passing a pointer\n\ + to that type.", + bool, +- (cumulative_args_t cum, machine_mode mode, const_tree type, bool named), +- hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false) ++ (cumulative_args_t cum, const function_arg_info &arg), ++ hook_bool_CUMULATIVE_ARGS_arg_info_false) + + DEFHOOK + (expand_builtin_saveregs, +@@ -4515,8 +4508,8 @@ pass all their arguments on the stack.\n\ + \n\ + The argument @var{args_so_far} points to the @code{CUMULATIVE_ARGS} data\n\ + structure, containing the values that are obtained after processing the\n\ +-named arguments. The arguments @var{mode} and @var{type} describe the\n\ +-last named argument---its machine mode and its data type as a tree node.\n\ ++named arguments. The argument @var{arg} describes the last of these named\n\ ++arguments.\n\ + \n\ + The target hook should do two things: first, push onto the stack all the\n\ + argument registers @emph{not} used for the named arguments, and second,\n\ +@@ -4536,7 +4529,7 @@ arguments of the function are being analyzed for the second time. This\n\ + happens for an inline function, which is not actually compiled until the\n\ + end of the source file. The hook @code{TARGET_SETUP_INCOMING_VARARGS} should\n\ + not generate any instructions in this case.", +- void, (cumulative_args_t args_so_far, machine_mode mode, tree type, ++ void, (cumulative_args_t args_so_far, const function_arg_info &arg, + int *pretend_args_size, int second_time), + default_setup_incoming_varargs) + +@@ -4579,15 +4572,6 @@ returned by function call into @var{slot}.", + void, (rtx slot, rtx bounds), + default_store_returned_bounds) + +-DEFHOOK +-(setup_incoming_vararg_bounds, +- "Use it to store bounds for anonymous register arguments stored\n\ +-into the stack. Arguments meaning is similar to\n\ +-@code{TARGET_SETUP_INCOMING_VARARGS}.", +- void, (cumulative_args_t args_so_far, machine_mode mode, tree type, +- int *pretend_args_size, int second_time), +- default_setup_incoming_vararg_bounds) +- + DEFHOOK + (call_args, + "While generating RTL for a function call, this target hook is invoked once\n\ +@@ -4668,11 +4652,11 @@ false.", + Need audit to verify that this is the case. */ + DEFHOOK + (must_pass_in_stack, +- "This target hook should return @code{true} if we should not pass @var{type}\n\ ++ "This target hook should return @code{true} if we should not pass @var{arg}\n\ + solely in registers. 
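With the converted TARGET_PASS_BY_REFERENCE interface above, a port-side implementation now reads the mode, type and named flag from the function_arg_info object instead of separate parameters. A minimal sketch, assuming an invented rule of passing variable-sized and larger-than-16-byte aggregates by reference:

  static bool
  example_pass_by_reference (cumulative_args_t,
                             const function_arg_info &arg)
  {
    /* Hypothetical rule: pass variable-sized and large types by
       reference; everything else is passed by value.  */
    if (arg.type == NULL_TREE)
      return false;
    HOST_WIDE_INT size = int_size_in_bytes (arg.type);
    return size == -1 || size > 16;
  }

  #define TARGET_PASS_BY_REFERENCE example_pass_by_reference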
The file @file{expr.h} defines a\n\ + definition that is usually appropriate, refer to @file{expr.h} for additional\n\ + documentation.", +- bool, (machine_mode mode, const_tree type), ++ bool, (const function_arg_info &arg), + must_pass_in_stack_var_size_or_pad) + + /* Return true if type TYPE, mode MODE, which is passed by reference, +@@ -4691,8 +4675,8 @@ not be generated.\n\ + \n\ + The default version of this hook always returns false.", + bool, +- (cumulative_args_t cum, machine_mode mode, const_tree type, bool named), +- hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false) ++ (cumulative_args_t cum, const function_arg_info &arg), ++ hook_bool_CUMULATIVE_ARGS_arg_info_false) + + /* Return zero for arguments passed entirely on the stack or entirely + in registers. If passed in both, return the number of bytes passed +@@ -4715,8 +4699,8 @@ compiler when this occurs, and how many bytes should go in registers.\n\ + @code{TARGET_FUNCTION_ARG} for these arguments should return the first\n\ + register to be used by the caller for this argument; likewise\n\ + @code{TARGET_FUNCTION_INCOMING_ARG}, for the called function.", +- int, (cumulative_args_t cum, machine_mode mode, tree type, bool named), +- hook_int_CUMULATIVE_ARGS_mode_tree_bool_0) ++ int, (cumulative_args_t cum, const function_arg_info &arg), ++ hook_int_CUMULATIVE_ARGS_arg_info_0) + + /* Update the state in CA to advance past an argument in the + argument list. The values MODE, TYPE, and NAMED describe that +@@ -4724,8 +4708,7 @@ register to be used by the caller for this argument; likewise\n\ + DEFHOOK + (function_arg_advance, + "This hook updates the summarizer variable pointed to by @var{ca} to\n\ +-advance past an argument in the argument list. The values @var{mode},\n\ +-@var{type} and @var{named} describe that argument. Once this is done,\n\ ++advance past argument @var{arg} in the argument list. Once this is done,\n\ + the variable @var{cum} is suitable for analyzing the @emph{following}\n\ + argument with @code{TARGET_FUNCTION_ARG}, etc.\n\ + \n\ +@@ -4733,7 +4716,7 @@ This hook need not do anything if the argument in question was passed\n\ + on the stack. The compiler knows how to track the amount of stack space\n\ + used for arguments without any special help.", + void, +- (cumulative_args_t ca, machine_mode mode, const_tree type, bool named), ++ (cumulative_args_t ca, const function_arg_info &arg), + default_function_arg_advance) + + DEFHOOK +@@ -4770,17 +4753,9 @@ constant size shorter than an @code{int}, and upward otherwise.", + argument. */ + DEFHOOK + (function_arg, +- "Return an RTX indicating whether a function argument is passed in a\n\ +-register and if so, which register.\n\ +-\n\ +-The arguments are @var{ca}, which summarizes all the previous\n\ +-arguments; @var{mode}, the machine mode of the argument; @var{type},\n\ +-the data type of the argument as a tree node or 0 if that is not known\n\ +-(which happens for C support library functions); and @var{named},\n\ +-which is @code{true} for an ordinary argument and @code{false} for\n\ +-nameless arguments that correspond to @samp{@dots{}} in the called\n\ +-function's prototype. @var{type} can be an incomplete type if a\n\ +-syntax error has previously occurred.\n\ ++ "Return an RTX indicating whether function argument @var{arg} is passed\n\ ++in a register and if so, which register. 
Argument @var{ca} summarizes all\n\ ++the previous arguments.\n\ + \n\ + The return value is usually either a @code{reg} RTX for the hard\n\ + register in which to pass the argument, or zero to pass the argument\n\ +@@ -4826,8 +4801,7 @@ is not defined and @code{TARGET_FUNCTION_ARG} returns nonzero for such an\n\ + argument, the compiler will abort. If @code{REG_PARM_STACK_SPACE} is\n\ + defined, the argument will be computed in the stack and then loaded into\n\ + a register.", +- rtx, (cumulative_args_t ca, machine_mode mode, const_tree type, +- bool named), ++ rtx, (cumulative_args_t ca, const function_arg_info &arg), + default_function_arg) + + DEFHOOK +@@ -4849,8 +4823,7 @@ so that it can be used to pass special arguments.\n\ + \n\ + If @code{TARGET_FUNCTION_INCOMING_ARG} is not defined,\n\ + @code{TARGET_FUNCTION_ARG} serves both purposes.", +- rtx, (cumulative_args_t ca, machine_mode mode, const_tree type, +- bool named), ++ rtx, (cumulative_args_t ca, const function_arg_info &arg), + default_function_incoming_arg) + + DEFHOOK +@@ -4962,6 +4935,28 @@ If this hook is not defined, then FUNCTION_VALUE_REGNO_P will be used.", + bool, (const unsigned int regno), + default_function_value_regno_p) + ++DEFHOOK ++(fntype_abi, ++ "Return the ABI used by a function with type @var{type}; see the\n\ ++definition of @code{predefined_function_abi} for details of the ABI\n\ ++descriptor. Targets only need to define this hook if they support\n\ ++interoperability between several ABIs in the same translation unit.", ++ const predefined_function_abi &, (const_tree type), ++ NULL) ++ ++DEFHOOK ++(insn_callee_abi, ++ "This hook returns a description of the ABI used by the target of\n\ ++call instruction @var{insn}; see the definition of\n\ ++@code{predefined_function_abi} for details of the ABI descriptor.\n\ ++Only the global function @code{insn_callee_abi} should call this hook\n\ ++directly.\n\ ++\n\ ++Targets only need to define this hook if they support\n\ ++interoperability between several ABIs in the same translation unit.", ++ const predefined_function_abi &, (const rtx_insn *insn), ++ NULL) ++ + /* ??? Documenting this hook requires a GFDL license grant. */ + DEFHOOK_UNDOC + (internal_arg_pointer, +@@ -5811,32 +5806,27 @@ The default version of this hook always returns @code{true}.", + + DEFHOOK + (hard_regno_call_part_clobbered, +- "This hook should return true if @var{regno} is partly call-saved and\n\ +-partly call-clobbered, and if a value of mode @var{mode} would be partly\n\ +-clobbered by call instruction @var{insn}. If @var{insn} is NULL then it\n\ +-should return true if any call could partly clobber the register.\n\ +-For example, if the low 32 bits of @var{regno} are preserved across a call\n\ +-but higher bits are clobbered, this hook should return true for a 64-bit\n\ +-mode but false for a 32-bit mode.\n\ ++ "ABIs usually specify that calls must preserve the full contents\n\ ++of a particular register, or that calls can alter any part of a\n\ ++particular register. This information is captured by the target macro\n\ ++@code{CALL_REALLY_USED_REGISTERS}. However, some ABIs specify that calls\n\ ++must preserve certain bits of a particular register but can alter others.\n\ ++This hook should return true if this applies to at least one of the\n\ ++registers in @samp{(reg:@var{mode} @var{regno})}, and if as a result the\n\ ++call would alter part of the @var{mode} value. 
For example, if a call\n\ ++preserves the low 32 bits of a 64-bit hard register @var{regno} but can\n\ ++clobber the upper 32 bits, this hook should return true for a 64-bit mode\n\ ++but false for a 32-bit mode.\n\ ++\n\ ++The value of @var{abi_id} comes from the @code{predefined_function_abi}\n\ ++structure that describes the ABI of the call; see the definition of the\n\ ++structure for more details. If (as is usual) the target uses the same ABI\n\ ++for all functions in a translation unit, @var{abi_id} is always 0.\n\ + \n\ + The default implementation returns false, which is correct\n\ + for targets that don't have partly call-clobbered registers.", +- bool, (rtx_insn *insn, unsigned int regno, machine_mode mode), +- hook_bool_insn_uint_mode_false) +- +-DEFHOOK +-(return_call_with_max_clobbers, +- "This hook returns a pointer to the call that partially clobbers the\n\ +-most registers. If a platform supports multiple ABIs where the registers\n\ +-that are partially clobbered may vary, this function compares two\n\ +-calls and returns a pointer to the one that clobbers the most registers.\n\ +-If both calls clobber the same registers, @var{call_1} must be returned.\n\ +-\n\ +-The registers clobbered in different ABIs must be a proper subset or\n\ +-superset of all other ABIs. @var{call_1} must always be a call insn,\n\ +-call_2 may be NULL or a call insn.", +- rtx_insn *, (rtx_insn *call_1, rtx_insn *call_2), +- NULL) ++ bool, (unsigned int abi_id, unsigned int regno, machine_mode mode), ++ hook_bool_uint_uint_mode_false) + + DEFHOOK + (get_multilib_abi_name, +@@ -5844,20 +5834,6 @@ DEFHOOK + const char *, (void), + hook_constcharptr_void_null) + +-DEFHOOK +-(remove_extra_call_preserved_regs, +- "This hook removes registers from the set of call-clobbered registers\n\ +- in @var{used_regs} if, contrary to the default rules, something guarantees\n\ +- that @samp{insn} preserves those registers. For example, some targets\n\ +- support variant ABIs in which functions preserve more registers than\n\ +- normal functions would. Removing those extra registers from @var{used_regs}\n\ +- can lead to better register allocation.\n\ +- \n\ +- The default implementation does nothing, which is always safe.\n\ +- Defining the hook is purely an optimization.", +- void, (rtx_insn *insn, HARD_REG_SET *used_regs), +- default_remove_extra_call_preserved_regs) +- + /* Return the smallest number of different values for which it is best to + use a jump-table instead of a tree of conditional branches. */ + DEFHOOK +diff --git a/gcc/target.h b/gcc/target.h +index 057e6ae87..964629669 100644 +--- a/gcc/target.h ++++ b/gcc/target.h +@@ -149,6 +149,12 @@ struct ao_ref; + /* This is defined in tree-vectorizer.h. */ + struct _stmt_vec_info; + ++/* This is defined in calls.h. */ ++struct function_arg_info; ++ ++/* This is defined in function-abi.h. */ ++struct predefined_function_abi; ++ + /* These are defined in tree-vect-stmts.c. 
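Continuing the same hypothetical target, the abi_id parameter of TARGET_HARD_REGNO_CALL_PART_CLOBBERED lets the hook distinguish the two conventions. In this sketch the base ABI preserves only the low 8 bytes of each vector register, while the variant ABI preserves them in full; EXAMPLE_VREG_P and EXAMPLE_VECTOR_ABI_ID are invented names.

  static bool
  example_hard_regno_call_part_clobbered (unsigned int abi_id,
                                          unsigned int regno,
                                          machine_mode mode)
  {
    /* Under the base ABI only the low 8 bytes of a vector register
       survive a call, so wider modes are partially clobbered.  The
       hypothetical variant ABI preserves the whole register.  */
    if (EXAMPLE_VREG_P (regno) && abi_id != EXAMPLE_VECTOR_ABI_ID)
      return maybe_gt (GET_MODE_SIZE (mode), 8);
    return false;
  }

  #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
    example_hard_regno_call_part_clobbered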
*/ + extern tree stmt_vectype (struct _stmt_vec_info *); + extern bool stmt_in_inner_loop_p (struct _stmt_vec_info *); +diff --git a/gcc/targhooks.c b/gcc/targhooks.c +index 6396f6f4b..6f54de0d5 100644 +--- a/gcc/targhooks.c ++++ b/gcc/targhooks.c +@@ -193,11 +193,8 @@ default_expand_builtin_saveregs (void) + } + + void +-default_setup_incoming_varargs (cumulative_args_t ca ATTRIBUTE_UNUSED, +- machine_mode mode ATTRIBUTE_UNUSED, +- tree type ATTRIBUTE_UNUSED, +- int *pretend_arg_size ATTRIBUTE_UNUSED, +- int second_time ATTRIBUTE_UNUSED) ++default_setup_incoming_varargs (cumulative_args_t, ++ const function_arg_info &, int *, int) + { + } + +@@ -323,22 +320,19 @@ default_cxx_get_cookie_size (tree type) + of the TARGET_PASS_BY_REFERENCE hook uses just MUST_PASS_IN_STACK. */ + + bool +-hook_pass_by_reference_must_pass_in_stack (cumulative_args_t c ATTRIBUTE_UNUSED, +- machine_mode mode ATTRIBUTE_UNUSED, const_tree type ATTRIBUTE_UNUSED, +- bool named_arg ATTRIBUTE_UNUSED) ++hook_pass_by_reference_must_pass_in_stack (cumulative_args_t, ++ const function_arg_info &arg) + { +- return targetm.calls.must_pass_in_stack (mode, type); ++ return targetm.calls.must_pass_in_stack (arg); + } + + /* Return true if a parameter follows callee copies conventions. This + version of the hook is true for all named arguments. */ + + bool +-hook_callee_copies_named (cumulative_args_t ca ATTRIBUTE_UNUSED, +- machine_mode mode ATTRIBUTE_UNUSED, +- const_tree type ATTRIBUTE_UNUSED, bool named) ++hook_callee_copies_named (cumulative_args_t, const function_arg_info &arg) + { +- return named; ++ return arg.named; + } + + /* Emit to STREAM the assembler syntax for insn operand X. */ +@@ -681,16 +675,6 @@ default_builtin_md_vectorized_function (tree, tree, tree) + return NULL_TREE; + } + +-/* Vectorized conversion. */ +- +-tree +-default_builtin_vectorized_conversion (unsigned int code ATTRIBUTE_UNUSED, +- tree dest_type ATTRIBUTE_UNUSED, +- tree src_type ATTRIBUTE_UNUSED) +-{ +- return NULL_TREE; +-} +- + /* Default vectorizer cost model values. 
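The default hooks below show the target-facing side of the function_arg_info conversion; the caller-facing side bundles type, mode and named-ness into one object before querying the hooks. A sketch of that usage, with the constructor choice taken as an assumption about calls.h rather than something these hunks show directly:

  /* Return true if an argument of type TYPE (named if NAMED) cannot be
     passed purely in registers; CUM summarizes the preceding arguments.  */
  static bool
  example_arg_needs_stack_p (cumulative_args_t cum, tree type, bool named)
  {
    function_arg_info arg (type, named);

    /* If the target wants the argument by reference, what is actually
       passed is a pointer, so query the stack requirement for that.  */
    if (targetm.calls.pass_by_reference (cum, arg))
      arg = function_arg_info (build_pointer_type (type), named);

    return targetm.calls.must_pass_in_stack (arg);
  }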
*/ + + int +@@ -737,28 +721,22 @@ default_builtin_reciprocal (tree) + } + + bool +-hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false ( +- cumulative_args_t ca ATTRIBUTE_UNUSED, +- machine_mode mode ATTRIBUTE_UNUSED, +- const_tree type ATTRIBUTE_UNUSED, bool named ATTRIBUTE_UNUSED) ++hook_bool_CUMULATIVE_ARGS_arg_info_false (cumulative_args_t, ++ const function_arg_info &) + { + return false; + } + + bool +-hook_bool_CUMULATIVE_ARGS_mode_tree_bool_true ( +- cumulative_args_t ca ATTRIBUTE_UNUSED, +- machine_mode mode ATTRIBUTE_UNUSED, +- const_tree type ATTRIBUTE_UNUSED, bool named ATTRIBUTE_UNUSED) ++hook_bool_CUMULATIVE_ARGS_arg_info_true (cumulative_args_t, ++ const function_arg_info &) + { + return true; + } + + int +-hook_int_CUMULATIVE_ARGS_mode_tree_bool_0 ( +- cumulative_args_t ca ATTRIBUTE_UNUSED, +- machine_mode mode ATTRIBUTE_UNUSED, +- tree type ATTRIBUTE_UNUSED, bool named ATTRIBUTE_UNUSED) ++hook_int_CUMULATIVE_ARGS_arg_info_0 (cumulative_args_t, ++ const function_arg_info &) + { + return 0; + } +@@ -770,10 +748,7 @@ hook_void_CUMULATIVE_ARGS_tree (cumulative_args_t ca ATTRIBUTE_UNUSED, + } + + void +-default_function_arg_advance (cumulative_args_t ca ATTRIBUTE_UNUSED, +- machine_mode mode ATTRIBUTE_UNUSED, +- const_tree type ATTRIBUTE_UNUSED, +- bool named ATTRIBUTE_UNUSED) ++default_function_arg_advance (cumulative_args_t, const function_arg_info &) + { + gcc_unreachable (); + } +@@ -814,19 +789,13 @@ default_function_arg_padding (machine_mode mode, const_tree type) + } + + rtx +-default_function_arg (cumulative_args_t ca ATTRIBUTE_UNUSED, +- machine_mode mode ATTRIBUTE_UNUSED, +- const_tree type ATTRIBUTE_UNUSED, +- bool named ATTRIBUTE_UNUSED) ++default_function_arg (cumulative_args_t, const function_arg_info &) + { + gcc_unreachable (); + } + + rtx +-default_function_incoming_arg (cumulative_args_t ca ATTRIBUTE_UNUSED, +- machine_mode mode ATTRIBUTE_UNUSED, +- const_tree type ATTRIBUTE_UNUSED, +- bool named ATTRIBUTE_UNUSED) ++default_function_incoming_arg (cumulative_args_t, const function_arg_info &) + { + gcc_unreachable (); + } +@@ -1061,12 +1030,6 @@ default_return_pops_args (tree, tree, poly_int64) + return 0; + } + +-reg_class_t +-default_branch_target_register_class (void) +-{ +- return NO_REGS; +-} +- + reg_class_t + default_ira_change_pseudo_allocno_class (int regno ATTRIBUTE_UNUSED, + reg_class_t cl, +@@ -1732,9 +1695,9 @@ get_move_ratio (bool speed_p ATTRIBUTE_UNUSED) + #ifdef MOVE_RATIO + move_ratio = (unsigned int) MOVE_RATIO (speed_p); + #else +-#if defined (HAVE_movmemqi) || defined (HAVE_movmemhi) || defined (HAVE_movmemsi) || defined (HAVE_movmemdi) || defined (HAVE_movmemti) ++#if defined (HAVE_cpymemqi) || defined (HAVE_cpymemhi) || defined (HAVE_cpymemsi) || defined (HAVE_cpymemdi) || defined (HAVE_cpymemti) + move_ratio = 2; +-#else /* No movmem patterns, pick a default. */ ++#else /* No cpymem patterns, pick a default. */ + move_ratio = ((speed_p) ? 15 : 3); + #endif + #endif +@@ -1742,7 +1705,7 @@ get_move_ratio (bool speed_p ATTRIBUTE_UNUSED) + } + + /* Return TRUE if the move_by_pieces/set_by_pieces infrastructure should be +- used; return FALSE if the movmem/setmem optab should be expanded, or ++ used; return FALSE if the cpymem/setmem optab should be expanded, or + a call to memcpy emitted. 
*/ + + bool +@@ -1941,7 +1904,7 @@ default_dwarf_frame_reg_mode (int regno) + { + machine_mode save_mode = reg_raw_mode[regno]; + +- if (targetm.hard_regno_call_part_clobbered (NULL, regno, save_mode)) ++ if (targetm.hard_regno_call_part_clobbered (0, regno, save_mode)) + save_mode = choose_hard_reg_mode (regno, 1, true); + return save_mode; + } +@@ -2163,7 +2126,7 @@ std_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p, + if (ARGS_GROW_DOWNWARD) + gcc_unreachable (); + +- indirect = pass_by_reference (NULL, TYPE_MODE (type), type, false); ++ indirect = pass_va_arg_by_reference (type); + if (indirect) + type = build_pointer_type (type); + +@@ -2260,15 +2223,6 @@ std_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p, + return build_va_arg_indirect_ref (addr); + } + +-void +-default_setup_incoming_vararg_bounds (cumulative_args_t ca ATTRIBUTE_UNUSED, +- machine_mode mode ATTRIBUTE_UNUSED, +- tree type ATTRIBUTE_UNUSED, +- int *pretend_arg_size ATTRIBUTE_UNUSED, +- int second_time ATTRIBUTE_UNUSED) +-{ +-} +- + /* An implementation of TARGET_CAN_USE_DOLOOP_P for targets that do + not support nested low-overhead loops. */ + +@@ -2385,9 +2339,4 @@ default_speculation_safe_value (machine_mode mode ATTRIBUTE_UNUSED, + return result; + } + +-void +-default_remove_extra_call_preserved_regs (rtx_insn *, HARD_REG_SET *) +-{ +-} +- + #include "gt-targhooks.h" +diff --git a/gcc/targhooks.h b/gcc/targhooks.h +index 2d5991908..e5e803c33 100644 +--- a/gcc/targhooks.h ++++ b/gcc/targhooks.h +@@ -40,7 +40,9 @@ extern machine_mode default_cc_modes_compatible (machine_mode, + extern bool default_return_in_memory (const_tree, const_tree); + + extern rtx default_expand_builtin_saveregs (void); +-extern void default_setup_incoming_varargs (cumulative_args_t, machine_mode, tree, int *, int); ++extern void default_setup_incoming_varargs (cumulative_args_t, ++ const function_arg_info &, ++ int *, int); + extern rtx default_builtin_setjmp_frame_value (void); + extern bool default_pretend_outgoing_varargs_named (cumulative_args_t); + +@@ -63,9 +65,9 @@ extern tree default_cxx_guard_type (void); + extern tree default_cxx_get_cookie_size (tree); + + extern bool hook_pass_by_reference_must_pass_in_stack +- (cumulative_args_t, machine_mode mode, const_tree, bool); ++ (cumulative_args_t, const function_arg_info &); + extern bool hook_callee_copies_named +- (cumulative_args_t ca, machine_mode, const_tree, bool); ++ (cumulative_args_t ca, const function_arg_info &); + + extern void default_print_operand (FILE *, rtx, int); + extern void default_print_operand_address (FILE *, machine_mode, rtx); +@@ -90,8 +92,6 @@ extern const char * default_invalid_within_doloop (const rtx_insn *); + extern tree default_builtin_vectorized_function (unsigned int, tree, tree); + extern tree default_builtin_md_vectorized_function (tree, tree, tree); + +-extern tree default_builtin_vectorized_conversion (unsigned int, tree, tree); +- + extern int default_builtin_vectorization_cost (enum vect_cost_for_stmt, tree, int); + + extern tree default_builtin_reciprocal (tree); +@@ -135,24 +135,23 @@ extern void default_goacc_reduction (gcall *); + extern bool hook_bool_CUMULATIVE_ARGS_false (cumulative_args_t); + extern bool hook_bool_CUMULATIVE_ARGS_true (cumulative_args_t); + +-extern bool hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false +- (cumulative_args_t, machine_mode, const_tree, bool); +-extern bool hook_bool_CUMULATIVE_ARGS_mode_tree_bool_true +- (cumulative_args_t, machine_mode, const_tree, bool); +-extern int 
hook_int_CUMULATIVE_ARGS_mode_tree_bool_0 +- (cumulative_args_t, machine_mode, tree, bool); ++extern bool hook_bool_CUMULATIVE_ARGS_arg_info_false ++ (cumulative_args_t, const function_arg_info &); ++extern bool hook_bool_CUMULATIVE_ARGS_arg_info_true ++ (cumulative_args_t, const function_arg_info &); ++extern int hook_int_CUMULATIVE_ARGS_arg_info_0 ++ (cumulative_args_t, const function_arg_info &); + extern void hook_void_CUMULATIVE_ARGS_tree + (cumulative_args_t, tree); + extern const char *hook_invalid_arg_for_unprototyped_fn + (const_tree, const_tree, const_tree); + extern void default_function_arg_advance +- (cumulative_args_t, machine_mode, const_tree, bool); ++ (cumulative_args_t, const function_arg_info &); + extern HOST_WIDE_INT default_function_arg_offset (machine_mode, const_tree); + extern pad_direction default_function_arg_padding (machine_mode, const_tree); +-extern rtx default_function_arg +- (cumulative_args_t, machine_mode, const_tree, bool); +-extern rtx default_function_incoming_arg +- (cumulative_args_t, machine_mode, const_tree, bool); ++extern rtx default_function_arg (cumulative_args_t, const function_arg_info &); ++extern rtx default_function_incoming_arg (cumulative_args_t, ++ const function_arg_info &); + extern unsigned int default_function_arg_boundary (machine_mode, + const_tree); + extern unsigned int default_function_arg_round_boundary (machine_mode, +@@ -165,7 +164,6 @@ extern rtx default_internal_arg_pointer (void); + extern rtx default_static_chain (const_tree, bool); + extern void default_trampoline_init (rtx, tree, rtx); + extern poly_int64 default_return_pops_args (tree, tree, poly_int64); +-extern reg_class_t default_branch_target_register_class (void); + extern reg_class_t default_ira_change_pseudo_allocno_class (int, reg_class_t, + reg_class_t); + extern bool default_lra_p (void); +@@ -266,11 +264,6 @@ extern rtx default_load_bounds_for_arg (rtx, rtx, rtx); + extern void default_store_bounds_for_arg (rtx, rtx, rtx, rtx); + extern rtx default_load_returned_bounds (rtx); + extern void default_store_returned_bounds (rtx,rtx); +-extern void default_setup_incoming_vararg_bounds (cumulative_args_t ca ATTRIBUTE_UNUSED, +- machine_mode mode ATTRIBUTE_UNUSED, +- tree type ATTRIBUTE_UNUSED, +- int *pretend_arg_size ATTRIBUTE_UNUSED, +- int second_time ATTRIBUTE_UNUSED); + extern bool default_optab_supported_p (int, machine_mode, machine_mode, + optimization_type); + extern unsigned int default_max_noce_ifcvt_seq_cost (edge); +@@ -287,7 +280,5 @@ extern tree default_preferred_else_value (unsigned, tree, unsigned, tree *); + extern bool default_have_speculation_safe_value (bool); + extern bool speculation_safe_value_not_needed (bool); + extern rtx default_speculation_safe_value (machine_mode, rtx, rtx, rtx); +-extern void default_remove_extra_call_preserved_regs (rtx_insn *, +- HARD_REG_SET *); + + #endif /* GCC_TARGHOOKS_H */ +diff --git a/gcc/testsuite/c-c++-common/guality/Og-dce-1.c b/gcc/testsuite/c-c++-common/guality/Og-dce-1.c +new file mode 100644 +index 000000000..a859e3252 +--- /dev/null ++++ b/gcc/testsuite/c-c++-common/guality/Og-dce-1.c +@@ -0,0 +1,14 @@ ++/* { dg-do run } */ ++/* { dg-options "-g" } */ ++ ++int *__attribute__((noipa)) consume (int *ptr) { return ptr; } ++ ++int ++main (void) ++{ ++ int x; ++ int *volatile ptr = consume (&x); ++ x = 0; ++ x = 1; /* { dg-final { gdb-test . "*ptr" "0" } } */ ++ return 0; /* { dg-final { gdb-test . 
"*ptr" "1" } } */ ++} +diff --git a/gcc/testsuite/c-c++-common/guality/Og-dce-2.c b/gcc/testsuite/c-c++-common/guality/Og-dce-2.c +new file mode 100644 +index 000000000..3df2c7921 +--- /dev/null ++++ b/gcc/testsuite/c-c++-common/guality/Og-dce-2.c +@@ -0,0 +1,19 @@ ++/* { dg-do run } */ ++/* { dg-options "-g" } */ ++ ++struct s { int a, b, c, d; }; ++ ++struct s gs1 = { 1, 2, 3, 4 }; ++struct s gs2 = { 5, 6, 7, 8 }; ++ ++struct s *__attribute__((noipa)) consume (struct s *ptr) { return ptr; } ++ ++int ++main (void) ++{ ++ struct s x; ++ struct s *volatile ptr = consume (&x); ++ x = gs1; ++ x = gs2; /* { dg-final { gdb-test . "ptr->a" "1" } } */ ++ return 0; /* { dg-final { gdb-test . "ptr->a" "5" } } */ ++} +diff --git a/gcc/testsuite/c-c++-common/guality/Og-dce-3.c b/gcc/testsuite/c-c++-common/guality/Og-dce-3.c +new file mode 100644 +index 000000000..fa6186a73 +--- /dev/null ++++ b/gcc/testsuite/c-c++-common/guality/Og-dce-3.c +@@ -0,0 +1,29 @@ ++/* { dg-do run } */ ++/* { dg-options "-g" } */ ++ ++volatile int amount = 10; ++ ++void __attribute__((noipa)) ++do_something (int *ptr) ++{ ++ *ptr += 10; ++} ++ ++int __attribute__((noipa)) ++foo (int count) ++{ ++ int x = 1; ++ for (int i = 0; i < count; ++i) ++ do_something (&x); /* { dg-final { gdb-test . "x" "1" } } */ ++ int res = x; /* { dg-final { gdb-test . "x" "101" } } */ ++ x = res + 1; ++ return res; /* { dg-final { gdb-test . "x" "102" } } */ ++ ++} ++ ++int ++main (void) ++{ ++ foo (10); ++ return 0; ++} +diff --git a/gcc/testsuite/c-c++-common/guality/Og-global-dse-1.c b/gcc/testsuite/c-c++-common/guality/Og-global-dse-1.c +new file mode 100644 +index 000000000..3d4b4e60e +--- /dev/null ++++ b/gcc/testsuite/c-c++-common/guality/Og-global-dse-1.c +@@ -0,0 +1,17 @@ ++/* { dg-do run } */ ++/* { dg-options "-g" } */ ++ ++struct s { int i, j; }; ++struct s gs1, gs2 = { 3, 4 }; ++ ++void __attribute__((noipa)) consume (void) {}; ++ ++int ++main (void) ++{ ++ gs1.i = 1; ++ gs1.j = 2; /* { dg-final { gdb-test . "gs1.i" "1" } } */ ++ gs1 = gs2; /* { dg-final { gdb-test . "gs1.j" "2" } } */ ++ consume (); /* { dg-final { gdb-test . "gs1.i" "3" } } */ ++ return 0; /* { dg-final { gdb-test . "gs1.j" "4" } } */ ++} +diff --git a/gcc/testsuite/c-c++-common/guality/Og-static-wo-1.c b/gcc/testsuite/c-c++-common/guality/Og-static-wo-1.c +new file mode 100644 +index 000000000..a4c7f3067 +--- /dev/null ++++ b/gcc/testsuite/c-c++-common/guality/Og-static-wo-1.c +@@ -0,0 +1,15 @@ ++/* { dg-do run } */ ++/* { dg-options "-g" } */ ++ ++#include "../../gcc.dg/nop.h" ++ ++static int x = 0; ++ ++int ++main (void) ++{ ++ asm volatile (NOP); /* { dg-final { gdb-test . "x" "0" } } */ ++ x = 1; ++ asm volatile (NOP); /* { dg-final { gdb-test . 
"x" "1" } } */ ++ return 0; ++} +diff --git a/gcc/testsuite/g++.dg/abi/mangle-neon-aarch64.C b/gcc/testsuite/g++.dg/abi/mangle-neon-aarch64.C +index 5740c0281..50c1452ed 100644 +--- a/gcc/testsuite/g++.dg/abi/mangle-neon-aarch64.C ++++ b/gcc/testsuite/g++.dg/abi/mangle-neon-aarch64.C +@@ -14,6 +14,7 @@ void f4 (uint16x4_t a) {} + void f5 (uint32x2_t a) {} + void f23 (uint64x1_t a) {} + void f61 (float16x4_t a) {} ++void f62 (bfloat16x4_t a) {} + void f6 (float32x2_t a) {} + void f7 (poly8x8_t a) {} + void f8 (poly16x4_t a) {} +@@ -27,6 +28,7 @@ void f14 (uint16x8_t a) {} + void f15 (uint32x4_t a) {} + void f16 (uint64x2_t a) {} + void f171 (float16x8_t a) {} ++void f172 (bfloat16x8_t a) {} + void f17 (float32x4_t a) {} + void f18 (float64x2_t a) {} + void f19 (poly8x16_t a) {} +@@ -45,6 +47,7 @@ void g1 (int8x16_t, int8x16_t) {} + // { dg-final { scan-assembler "_Z2f512__Uint32x2_t:" } } + // { dg-final { scan-assembler "_Z3f2312__Uint64x1_t:" } } + // { dg-final { scan-assembler "_Z3f6113__Float16x4_t:" } } ++// { dg-final { scan-assembler "_Z3f6214__Bfloat16x4_t:" } } + // { dg-final { scan-assembler "_Z2f613__Float32x2_t:" } } + // { dg-final { scan-assembler "_Z2f711__Poly8x8_t:" } } + // { dg-final { scan-assembler "_Z2f812__Poly16x4_t:" } } +@@ -57,6 +60,7 @@ void g1 (int8x16_t, int8x16_t) {} + // { dg-final { scan-assembler "_Z3f1512__Uint32x4_t:" } } + // { dg-final { scan-assembler "_Z3f1612__Uint64x2_t:" } } + // { dg-final { scan-assembler "_Z4f17113__Float16x8_t:" } } ++// { dg-final { scan-assembler "_Z4f17214__Bfloat16x8_t:" } } + // { dg-final { scan-assembler "_Z3f1713__Float32x4_t:" } } + // { dg-final { scan-assembler "_Z3f1813__Float64x2_t:" } } + // { dg-final { scan-assembler "_Z3f1912__Poly8x16_t:" } } +diff --git a/gcc/testsuite/g++.dg/diagnostic/aka4.C b/gcc/testsuite/g++.dg/diagnostic/aka4.C +new file mode 100644 +index 000000000..da8c57964 +--- /dev/null ++++ b/gcc/testsuite/g++.dg/diagnostic/aka4.C +@@ -0,0 +1,9 @@ ++typedef unsigned int myvec __attribute__((vector_size (16))); ++ ++void f (float x) ++{ ++ myvec y = x; // { dg-error {cannot convert 'float' to 'myvec' {aka '__vector\([48]\) unsigned int'} in initialization} } ++ myvec *ptr = &x; // { dg-error {cannot convert 'float\*' to 'myvec\*' {aka '__vector\([48]\) unsigned int\*'} in initialization} } ++ const myvec *const_ptr = &x; // { dg-error {cannot convert 'float\*' to 'const myvec\*' {aka 'const __vector\([48]\) unsigned int\*'} in initialization} } ++ volatile myvec *volatile_ptr = &x; // { dg-error {cannot convert 'float\*' to 'volatile myvec\*' {aka 'volatile __vector\([48]\) unsigned int\*'} in initialization} } ++} +diff --git a/gcc/testsuite/g++.dg/ext/arm-bf16/bf16-mangle-aarch64-1.C b/gcc/testsuite/g++.dg/ext/arm-bf16/bf16-mangle-aarch64-1.C +new file mode 100644 +index 000000000..5426a1814 +--- /dev/null ++++ b/gcc/testsuite/g++.dg/ext/arm-bf16/bf16-mangle-aarch64-1.C +@@ -0,0 +1,13 @@ ++/* { dg-do compile { target aarch64*-*-* } } */ ++ ++/* Test mangling */ ++ ++/* { dg-final { scan-assembler "\t.global\t_Z1fPu6__bf16" } } */ ++void f (__bf16 *x) { } ++ ++/* { dg-final { scan-assembler "\t.global\t_Z1gPu6__bf16S_" } } */ ++void g (__bf16 *x, __bf16 *y) { } ++ ++/* { dg-final { scan-assembler "\t.global\t_ZN1SIu6__bf16u6__bf16E1iE" } } */ ++template struct S { static int i; }; ++template <> int S<__bf16, __bf16>::i = 3; +diff --git a/gcc/testsuite/g++.dg/guality/guality.exp b/gcc/testsuite/g++.dg/guality/guality.exp +index 757b20b61..33571f1f2 100644 +--- 
a/gcc/testsuite/g++.dg/guality/guality.exp ++++ b/gcc/testsuite/g++.dg/guality/guality.exp +@@ -65,8 +65,22 @@ if {[check_guality " + return 0; + } + "]} { +- gcc-dg-runtest [lsort [glob $srcdir/$subdir/*.C]] "" "" +- gcc-dg-runtest [lsort [glob $srcdir/c-c++-common/guality/*.c]] "" "" ++ set general [list] ++ set Og [list] ++ foreach file [lsort [glob $srcdir/c-c++-common/guality/*.c]] { ++ switch -glob -- [file tail $file] { ++ Og-* { lappend Og $file } ++ * { lappend general $file } ++ } ++ } ++ ++ gcc-dg-runtest [lsort [glob $srcdir/$subdir/*.C]] "" "" ++ gcc-dg-runtest $general "" "" ++ set-torture-options \ ++ [list "-O0" "-Og"] \ ++ [list {}] \ ++ [list "-Og -flto"] ++ gcc-dg-runtest $Og "" "" + } + + if [info exists guality_gdb_name] { +diff --git a/gcc/testsuite/g++.dg/ipa/pr93763.C b/gcc/testsuite/g++.dg/ipa/pr93763.C +index 61117108e..13ab2d57f 100644 +--- a/gcc/testsuite/g++.dg/ipa/pr93763.C ++++ b/gcc/testsuite/g++.dg/ipa/pr93763.C +@@ -1,4 +1,4 @@ +-/* { dg-do compile } */ ++/* { dg-do compile { target c++11 } } */ + /* { dg-options "-O3" } */ + + struct search_param { +diff --git a/gcc/testsuite/g++.dg/tree-ssa/pr53844.C b/gcc/testsuite/g++.dg/tree-ssa/pr53844.C +index 954cc71b4..ab9879f6a 100644 +--- a/gcc/testsuite/g++.dg/tree-ssa/pr53844.C ++++ b/gcc/testsuite/g++.dg/tree-ssa/pr53844.C +@@ -1,5 +1,5 @@ + // { dg-do compile } +-// { dg-options "-O2 -fdump-tree-optimized-vops" } ++// { dg-options "-O2 -fdump-tree-optimized-vops -fno-inline-functions --param max-inline-insns-single-O2=200" } + + struct VBase; + +diff --git a/gcc/testsuite/g++.dg/tree-ssa/pr61034.C b/gcc/testsuite/g++.dg/tree-ssa/pr61034.C +index 870b23721..2e3dfecac 100644 +--- a/gcc/testsuite/g++.dg/tree-ssa/pr61034.C ++++ b/gcc/testsuite/g++.dg/tree-ssa/pr61034.C +@@ -1,5 +1,5 @@ + // { dg-do compile } +-// { dg-options "-O2 -fdump-tree-fre3 -fdump-tree-optimized -fdelete-null-pointer-checks" } ++// { dg-options "-O2 -fdump-tree-fre3 -fdump-tree-optimized -fdelete-null-pointer-checks --param early-inlining-insns-O2=14" } + + #define assume(x) if(!(x))__builtin_unreachable() + +diff --git a/gcc/testsuite/g++.dg/tree-ssa/pr8781.C b/gcc/testsuite/g++.dg/tree-ssa/pr8781.C +index 1f115b2b2..5bc1ef035 100644 +--- a/gcc/testsuite/g++.dg/tree-ssa/pr8781.C ++++ b/gcc/testsuite/g++.dg/tree-ssa/pr8781.C +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-O -fno-tree-sra -fdump-tree-fre1" } */ ++/* { dg-options "-O -fno-tree-sra -fdump-tree-fre1 --param early-inlining-insns-O2=14" } */ + + int f(); + +diff --git a/gcc/testsuite/g++.dg/warn/Wstringop-truncation-1.C b/gcc/testsuite/g++.dg/warn/Wstringop-truncation-1.C +index 830660197..49dde0a65 100644 +--- a/gcc/testsuite/g++.dg/warn/Wstringop-truncation-1.C ++++ b/gcc/testsuite/g++.dg/warn/Wstringop-truncation-1.C +@@ -1,7 +1,7 @@ + /* PR/tree-optimization/84480 - bogus -Wstringop-truncation despite + assignment with an inlined string literal + { dg-do compile } +- { dg-options "-O2 -Wstringop-truncation" } */ ++ { dg-options "-O2 -Wstringop-truncation --param early-inlining-insns-O2=14" } */ + + #include + +diff --git a/gcc/testsuite/g++.target/aarch64/bfloat_cpp_typecheck.C b/gcc/testsuite/g++.target/aarch64/bfloat_cpp_typecheck.C +new file mode 100644 +index 000000000..9203d91f8 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/bfloat_cpp_typecheck.C +@@ -0,0 +1,14 @@ ++/* { dg-do assemble { target { aarch64*-*-* } } } */ ++/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ ++/* { dg-add-options arm_v8_2a_bf16_neon } */ ++/* { 
dg-additional-options "-O3 --save-temps" } */ ++ ++#include ++ ++void foo (void) ++{ ++ bfloat16_t (); /* { dg-bogus {invalid conversion to type 'bfloat16_t'} "" { xfail *-*-* } } */ ++ bfloat16_t a = bfloat16_t(); /* { dg-bogus {invalid conversion to type 'bfloat16_t'} "" { xfail *-*-* } } */ ++ bfloat16_t (0x1234); /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ bfloat16_t (0.1); /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++} +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/aarch64-sve-acle-asm.exp b/gcc/testsuite/g++.target/aarch64/sve/acle/aarch64-sve-acle-asm.exp +new file mode 100644 +index 000000000..e9d624ff8 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/aarch64-sve-acle-asm.exp +@@ -0,0 +1,83 @@ ++# Assembly-based regression-test driver for the SVE ACLE ++# Copyright (C) 2009-2019 Free Software Foundation, Inc. ++# ++# This file is part of GCC. ++# ++# GCC is free software; you can redistribute it and/or modify it ++# under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 3, or (at your option) ++# any later version. ++# ++# GCC is distributed in the hope that it will be useful, but ++# WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++# General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with GCC; see the file COPYING3. If not see ++# . */ ++ ++# GCC testsuite that uses the `dg.exp' driver. ++ ++# Exit immediately if this isn't an AArch64 target. ++if { ![istarget aarch64*-*-*] } { ++ return ++} ++ ++# Load support procs. ++load_lib g++-dg.exp ++ ++# Initialize `dg'. ++dg-init ++ ++# Force SVE if we're not testing it already. ++if { [check_effective_target_aarch64_sve] } { ++ set sve_flags "" ++} else { ++ set sve_flags "-march=armv8.2-a+sve" ++} ++ ++global gcc_runtest_parallelize_limit_minor ++if { [info exists gcc_runtest_parallelize_limit_minor] } { ++ set old_limit_minor $gcc_runtest_parallelize_limit_minor ++ set gcc_runtest_parallelize_limit_minor 1 ++} ++ ++torture-init ++set-torture-options { ++ "-std=c++98 -O0 -g" ++ "-std=c++98 -O1 -g" ++ "-std=c++11 -O2 -g" ++ "-std=c++14 -O3 -g" ++ "-std=c++17 -Og -g" ++ "-std=c++2a -Os -g" ++ "-std=gnu++98 -O2 -fno-schedule-insns -DCHECK_ASM --save-temps" ++ "-std=gnu++11 -Ofast -g" ++ "-std=gnu++17 -O3 -g" ++ "-std=gnu++2a -O0 -g" ++} { ++ "-DTEST_FULL" ++ "-DTEST_OVERLOADS" ++} ++ ++# Main loop. ++set gcc_subdir [string replace $subdir 0 2 gcc] ++set files [glob -nocomplain $srcdir/$gcc_subdir/asm/*.c] ++set save-dg-do-what-default ${dg-do-what-default} ++if { [check_effective_target_aarch64_asm_sve_ok] ++ && [check_effective_target_aarch64_variant_pcs] } { ++ set dg-do-what-default assemble ++} else { ++ set dg-do-what-default compile ++} ++gcc-dg-runtest [lsort $files] "" "$sve_flags -fno-ipa-icf" ++set dg-do-what-default ${save-dg-do-what-default} ++ ++torture-finish ++ ++if { [info exists gcc_runtest_parallelize_limit_minor] } { ++ set gcc_runtest_parallelize_limit_minor $old_limit_minor ++} ++ ++# All done. ++dg-finish +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/aarch64-sve-acle.exp b/gcc/testsuite/g++.target/aarch64/sve/acle/aarch64-sve-acle.exp +new file mode 100644 +index 000000000..54c43a3ac +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/aarch64-sve-acle.exp +@@ -0,0 +1,55 @@ ++# Specific regression driver for AArch64 SVE. 
++# Copyright (C) 2009-2019 Free Software Foundation, Inc. ++# Contributed by ARM Ltd. ++# ++# This file is part of GCC. ++# ++# GCC is free software; you can redistribute it and/or modify it ++# under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 3, or (at your option) ++# any later version. ++# ++# GCC is distributed in the hope that it will be useful, but ++# WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++# General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with GCC; see the file COPYING3. If not see ++# . */ ++ ++# GCC testsuite that uses the `dg.exp' driver. ++ ++# Exit immediately if this isn't an AArch64 target. ++if {![istarget aarch64*-*-*] } { ++ return ++} ++ ++# Load support procs. ++load_lib g++-dg.exp ++ ++# If a testcase doesn't have special options, use these. ++global DEFAULT_CXXFLAGS ++if ![info exists DEFAULT_CXXFLAGS] then { ++ set DEFAULT_CXXFLAGS " -pedantic-errors -Wno-long-long" ++} ++ ++# Initialize `dg'. ++dg-init ++ ++# Force SVE if we're not testing it already. ++if { [check_effective_target_aarch64_sve] } { ++ set sve_flags "" ++} else { ++ set sve_flags "-march=armv8.2-a+sve" ++} ++ ++# Main loop. ++set gcc_subdir [string replace $subdir 0 2 gcc] ++set files [glob -nocomplain \ ++ "$srcdir/$gcc_subdir/general/*.c" \ ++ "$srcdir/$subdir/general-c++/*.\[cC\]"] ++dg-runtest [lsort $files] "$sve_flags" $DEFAULT_CXXFLAGS ++ ++# All done. ++dg-finish +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/add_1.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/add_1.C +new file mode 100644 +index 000000000..44aa10e20 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/add_1.C +@@ -0,0 +1,9 @@ ++/* { dg-do compile } */ ++ ++#include "add_1.h" ++ ++svuint8_t ++f1 (svbool_t pg, svuint8_t x, svint8_t y) ++{ ++ return svadd_u8_x (pg, x, y); /* { dg-error "cannot convert 'svint8_t' to 'svuint8_t'" } */ ++} +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/add_1.h b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/add_1.h +new file mode 100644 +index 000000000..d441328a3 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/add_1.h +@@ -0,0 +1,2 @@ ++#pragma GCC system_header ++#pragma GCC aarch64 "arm_sve.h" /* { dg-message "initializing argument 3" } */ +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/add_2.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/add_2.C +new file mode 100644 +index 000000000..fcfb0f489 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/add_2.C +@@ -0,0 +1,14 @@ ++/* { dg-do compile } */ ++ ++#include "add_2.h" ++ ++void ++f1 (svbool_t pg, svuint8_t x, svint8_t y) ++{ ++ svadd_x (pg, x); /* { dg-error {no matching function for call to 'svadd_x\(svbool_t&, svuint8_t&\)'} } */ ++ svadd_x (pg, x, x, x); /* { dg-error {no matching function for call to 'svadd_x\(svbool_t&, svuint8_t&, svuint8_t&, svuint8_t&\)'} } */ ++ svadd_x (x, x, x); /* { dg-error {no matching function for call to 'svadd_x\(svuint8_t&, svuint8_t&, svuint8_t&\)'} } */ ++ svadd_x (pg, pg, pg); /* { dg-error {no matching function for call to 'svadd_x\(svbool_t&, svbool_t&, svbool_t&\)'} } */ ++ svadd_x (pg, 1, x); /* { dg-error {no matching function for call to 'svadd_x\(svbool_t&, int, svuint8_t&\)'} } */ ++ svadd_x (pg, x, 
y); /* { dg-error {no matching function for call to 'svadd_x\(svbool_t&, svuint8_t&, svint8_t&\)'} } */ ++} +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/add_2.h b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/add_2.h +new file mode 100644 +index 000000000..2b3a520d3 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/add_2.h +@@ -0,0 +1,9 @@ ++#pragma GCC system_header ++#pragma GCC aarch64 "arm_sve.h" ++/* { dg-message {note: candidate: 'svfloat16_t svadd_x\(svbool_t, svfloat16_t, svfloat16_t\)'} "" { target *-*-* } 3 } */ ++/* { dg-message {note: *candidate expects 3 arguments, 2 provided} "" { target *-*-* } 3 } */ ++/* { dg-message {note: *candidate expects 3 arguments, 4 provided} "" { target *-*-* } 3 } */ ++/* { dg-message {note: *no known conversion for argument 1 from 'svuint8_t' to 'svbool_t'} "" { target *-*-* } 3 } */ ++/* { dg-message {note: *no known conversion for argument 2 from 'svbool_t' to 'svfloat16_t'} "" { target *-*-* } 3 } */ ++/* { dg-message {note: *no known conversion for argument 2 from 'int' to 'svfloat16_t'} "" { target *-*-* } 3 } */ ++/* { dg-message {note: *no known conversion for argument 2 from 'svuint8_t' to 'svfloat16_t'} "" { target *-*-* } 3 } */ +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/add_3.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/add_3.C +new file mode 100644 +index 000000000..1d811fc76 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/add_3.C +@@ -0,0 +1,20 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -fdump-tree-optimized -fnon-call-exceptions" } */ ++ ++#include ++ ++svint8_t ++foo (svbool_t pg, svint8_t a, svint8_t b) ++{ ++ try ++ { ++ a = svadd_m (pg, a, b); ++ } ++ catch (...) 
++ { ++ a = b; ++ } ++ return a; ++} ++ ++/* { dg-final { scan-tree-dump-not {__cxa_begin_catch} "optimized" } } */ +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/asrd_1.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/asrd_1.C +new file mode 100644 +index 000000000..a73934f56 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/asrd_1.C +@@ -0,0 +1,39 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-std=c++11 -Wall -Wextra" } */ ++ ++#include ++ ++constexpr uint64_t const_add (uint64_t a, uint64_t b) { return a + b; } ++uint64_t add (uint64_t a, uint64_t b) { return a + b; } ++ ++void ++f1 (svbool_t pg, svuint8_t u8, svint8_t s8, svint16_t s16, ++ svint32_t s32, svint64_t s64, int x) ++{ ++ const int one = 1; ++ u8 = svasrd_x (pg, u8, 1); /* { dg-error {no matching function for call to 'svasrd_x\(svbool_t&, svuint8_t&, [^)]*\)'} } */ ++ s8 = svasrd_x (pg, s8, x); /* { dg-error "argument 3 of 'svasrd_x' must be an integer constant expression" } */ ++ s8 = svasrd_x (pg, s8, one); ++ s8 = svasrd_x (pg, s8, 0.4); /* { dg-error {passing 0 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 8\]} } */ ++ s8 = svasrd_x (pg, s8, 1.0); ++ s8 = svasrd_x (pg, s8, 0); /* { dg-error {passing 0 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 8\]} } */ ++ s8 = svasrd_x (pg, s8, 1); ++ s8 = svasrd_x (pg, s8, 1 + 1); ++ s8 = svasrd_x (pg, s8, const_add (1, 1)); ++ s8 = svasrd_x (pg, s8, add (1, 1)); /* { dg-error "argument 3 of 'svasrd_x' must be an integer constant expression" } */ ++ s8 = svasrd_x (pg, s8, 8); ++ s8 = svasrd_x (pg, s8, 9); /* { dg-error {passing 9 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 8\]} } */ ++ s8 = svasrd_x (pg, s8, (uint64_t (1) << 62) + 1); /* { dg-error {passing [^ ]* to argument 3 of 'svasrd_x', which expects a value in the range \[1, 8\]} } */ ++ s16 = svasrd_x (pg, s16, 0); /* { dg-error {passing 0 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 16\]} } */ ++ s16 = svasrd_x (pg, s16, 1); ++ s16 = svasrd_x (pg, s16, 16); ++ s16 = svasrd_x (pg, s16, 17); /* { dg-error {passing 17 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 16\]} } */ ++ s32 = svasrd_x (pg, s32, 0); /* { dg-error {passing 0 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 32\]} } */ ++ s32 = svasrd_x (pg, s32, 1); ++ s32 = svasrd_x (pg, s32, 32); ++ s32 = svasrd_x (pg, s32, 33); /* { dg-error {passing 33 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 32\]} } */ ++ s64 = svasrd_x (pg, s64, 0); /* { dg-error {passing 0 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 64\]} } */ ++ s64 = svasrd_x (pg, s64, 1); ++ s64 = svasrd_x (pg, s64, 64); ++ s64 = svasrd_x (pg, s64, 65); /* { dg-error {passing 65 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 64\]} } */ ++} +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/asrd_2.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/asrd_2.C +new file mode 100644 +index 000000000..bbe7ba72b +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/asrd_2.C +@@ -0,0 +1,38 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-std=c++11 -Wall -Wextra" } */ ++ ++#include ++ ++constexpr uint64_t const_add (uint64_t a, uint64_t b) { return a + b; } ++uint64_t add (uint64_t a, uint64_t b) { return a + b; } ++ ++void ++f1 (svbool_t pg, svint8_t s8, svint16_t s16, svint32_t s32, 
svint64_t s64, ++ int x) ++{ ++ const int one = 1; ++ s8 = svasrd_n_s8_x (pg, s8, x); /* { dg-error "argument 3 of 'svasrd_n_s8_x' must be an integer constant expression" } */ ++ s8 = svasrd_n_s8_x (pg, s8, one); ++ s8 = svasrd_n_s8_x (pg, s8, 0.4); /* { dg-error {passing 0 to argument 3 of 'svasrd_n_s8_x', which expects a value in the range \[1, 8\]} } */ ++ s8 = svasrd_n_s8_x (pg, s8, 1.0); ++ s8 = svasrd_n_s8_x (pg, s8, 0); /* { dg-error {passing 0 to argument 3 of 'svasrd_n_s8_x', which expects a value in the range \[1, 8\]} } */ ++ s8 = svasrd_n_s8_x (pg, s8, 1); ++ s8 = svasrd_n_s8_x (pg, s8, 1 + 1); ++ s8 = svasrd_n_s8_x (pg, s8, const_add (1, 1)); ++ s8 = svasrd_n_s8_x (pg, s8, add (1, 1)); /* { dg-error "argument 3 of 'svasrd_n_s8_x' must be an integer constant expression" } */ ++ s8 = svasrd_n_s8_x (pg, s8, 8); ++ s8 = svasrd_n_s8_x (pg, s8, 9); /* { dg-error {passing 9 to argument 3 of 'svasrd_n_s8_x', which expects a value in the range \[1, 8\]} } */ ++ s8 = svasrd_n_s8_x (pg, s8, (uint64_t (1) << 62) + 1); /* { dg-error {passing [^ ]* to argument 3 of 'svasrd_n_s8_x', which expects a value in the range \[1, 8\]} } */ ++ s16 = svasrd_n_s16_x (pg, s16, 0); /* { dg-error {passing 0 to argument 3 of 'svasrd_n_s16_x', which expects a value in the range \[1, 16\]} } */ ++ s16 = svasrd_n_s16_x (pg, s16, 1); ++ s16 = svasrd_n_s16_x (pg, s16, 16); ++ s16 = svasrd_n_s16_x (pg, s16, 17); /* { dg-error {passing 17 to argument 3 of 'svasrd_n_s16_x', which expects a value in the range \[1, 16\]} } */ ++ s32 = svasrd_n_s32_x (pg, s32, 0); /* { dg-error {passing 0 to argument 3 of 'svasrd_n_s32_x', which expects a value in the range \[1, 32\]} } */ ++ s32 = svasrd_n_s32_x (pg, s32, 1); ++ s32 = svasrd_n_s32_x (pg, s32, 32); ++ s32 = svasrd_n_s32_x (pg, s32, 33); /* { dg-error {passing 33 to argument 3 of 'svasrd_n_s32_x', which expects a value in the range \[1, 32\]} } */ ++ s64 = svasrd_n_s64_x (pg, s64, 0); /* { dg-error {passing 0 to argument 3 of 'svasrd_n_s64_x', which expects a value in the range \[1, 64\]} } */ ++ s64 = svasrd_n_s64_x (pg, s64, 1); ++ s64 = svasrd_n_s64_x (pg, s64, 64); ++ s64 = svasrd_n_s64_x (pg, s64, 65); /* { dg-error {passing 65 to argument 3 of 'svasrd_n_s64_x', which expects a value in the range \[1, 64\]} } */ ++} +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/asrd_3.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/asrd_3.C +new file mode 100644 +index 000000000..5ebd770b2 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/asrd_3.C +@@ -0,0 +1,51 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-std=c++11 -Wall -Wextra" } */ ++ ++#include ++ ++constexpr uint64_t const_add (uint64_t a, uint64_t b) { return a + b; } ++uint64_t add (uint64_t a, uint64_t b) { return a + b; } ++ ++template ++T shift (svbool_t pg, T v) { return svasrd_x (pg, v, N); } ++/* { dg-error {no matching function for call to 'svasrd_x\(svbool_t&,} "" { target *-*-* } .-1 } */ ++/* { dg-error {passing 0 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 8\]} "" { target *-*-* } .-2 } */ ++/* { dg-error {passing 9 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 8\]} "" { target *-*-* } .-3 } */ ++/* { dg-error {passing 0 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 16\]} "" { target *-*-* } .-4 } */ ++/* { dg-error {passing 17 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 16\]} "" { target *-*-* } .-5 } */ ++/* { dg-error {passing 0 to argument 3 
of 'svasrd_x', which expects a value in the range \[1, 32\]} "" { target *-*-* } .-6 } */ ++/* { dg-error {passing 33 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 32\]} "" { target *-*-* } .-7 } */ ++/* { dg-error {passing 0 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 64\]} "" { target *-*-* } .-8 } */ ++/* { dg-error {passing 65 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 64\]} "" { target *-*-* } .-9 } */ ++ ++template ++T shift1 (svbool_t pg, T v, uint64_t n) { return svasrd_x (pg, v, n); } ++ ++template ++T shift2 (svbool_t pg, T v, uint64_t n) { return svasrd_x (pg, v, n); } ++/* { dg-error {argument 3 of 'svasrd_x' must be an integer constant expression} "" { target *-*-* } .-1 } */ ++ ++void ++f1 (svbool_t pg, svuint8_t u8, svint8_t s8, svint16_t s16, ++ svint32_t s32, svint64_t s64) ++{ ++ u8 = shift <1> (pg, u8); ++ s8 = shift <0> (pg, s8); ++ s8 = shift <1> (pg, s8); ++ s8 = shift <8> (pg, s8); ++ s8 = shift <9> (pg, s8); ++ s16 = shift <0> (pg, s16); ++ s16 = shift <1> (pg, s16); ++ s16 = shift <16> (pg, s16); ++ s16 = shift <17> (pg, s16); ++ s32 = shift <0> (pg, s32); ++ s32 = shift <1> (pg, s32); ++ s32 = shift <32> (pg, s32); ++ s32 = shift <33> (pg, s32); ++ s64 = shift <0> (pg, s64); ++ s64 = shift <1> (pg, s64); ++ s64 = shift <64> (pg, s64); ++ s64 = shift <65> (pg, s64); ++ ++ s8 = shift2 (pg, s8, 1); ++} +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/cntb_pat.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/cntb_pat.c +new file mode 100644 +index 000000000..bbc9f9010 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/cntb_pat.c +@@ -0,0 +1,45 @@ ++/* { dg-do compile } */ ++ ++#include ++ ++void ++test (svpattern pat, int i) ++{ ++ svcntb_pat (pat); /* { dg-error "argument 1 of 'svcntb_pat' must be an integer constant expression" } */ ++ svcntb_pat (i); /* { dg-error "invalid conversion from 'int' to 'svpattern'" } */ ++ /* { dg-error "argument 1 of 'svcntb_pat' must be an integer constant expression" "" { target *-*-* } .-1 } */ ++ svcntb_pat ((svpattern) -1); /* { dg-error "passing 4294967295 to argument 1 of 'svcntb_pat', which expects a valid 'svpattern' value" } */ ++ svcntb_pat ((svpattern) 0); ++ svcntb_pat ((svpattern) 1); ++ svcntb_pat ((svpattern) 2); ++ svcntb_pat ((svpattern) 3); ++ svcntb_pat ((svpattern) 4); ++ svcntb_pat ((svpattern) 5); ++ svcntb_pat ((svpattern) 6); ++ svcntb_pat ((svpattern) 7); ++ svcntb_pat ((svpattern) 8); ++ svcntb_pat ((svpattern) 9); ++ svcntb_pat ((svpattern) 10); ++ svcntb_pat ((svpattern) 11); ++ svcntb_pat ((svpattern) 12); ++ svcntb_pat ((svpattern) 13); ++ svcntb_pat ((svpattern) 14); /* { dg-error "passing 14 to argument 1 of 'svcntb_pat', which expects a valid 'svpattern' value" } */ ++ svcntb_pat ((svpattern) 15); /* { dg-error "passing 15 to argument 1 of 'svcntb_pat', which expects a valid 'svpattern' value" } */ ++ svcntb_pat ((svpattern) 16); /* { dg-error "passing 16 to argument 1 of 'svcntb_pat', which expects a valid 'svpattern' value" } */ ++ svcntb_pat ((svpattern) 17); /* { dg-error "passing 17 to argument 1 of 'svcntb_pat', which expects a valid 'svpattern' value" } */ ++ svcntb_pat ((svpattern) 18); /* { dg-error "passing 18 to argument 1 of 'svcntb_pat', which expects a valid 'svpattern' value" } */ ++ svcntb_pat ((svpattern) 19); /* { dg-error "passing 19 to argument 1 of 'svcntb_pat', which expects a valid 'svpattern' value" } */ ++ svcntb_pat ((svpattern) 20); /* { dg-error 
"passing 20 to argument 1 of 'svcntb_pat', which expects a valid 'svpattern' value" } */ ++ svcntb_pat ((svpattern) 21); /* { dg-error "passing 21 to argument 1 of 'svcntb_pat', which expects a valid 'svpattern' value" } */ ++ svcntb_pat ((svpattern) 22); /* { dg-error "passing 22 to argument 1 of 'svcntb_pat', which expects a valid 'svpattern' value" } */ ++ svcntb_pat ((svpattern) 23); /* { dg-error "passing 23 to argument 1 of 'svcntb_pat', which expects a valid 'svpattern' value" } */ ++ svcntb_pat ((svpattern) 24); /* { dg-error "passing 24 to argument 1 of 'svcntb_pat', which expects a valid 'svpattern' value" } */ ++ svcntb_pat ((svpattern) 25); /* { dg-error "passing 25 to argument 1 of 'svcntb_pat', which expects a valid 'svpattern' value" } */ ++ svcntb_pat ((svpattern) 26); /* { dg-error "passing 26 to argument 1 of 'svcntb_pat', which expects a valid 'svpattern' value" } */ ++ svcntb_pat ((svpattern) 27); /* { dg-error "passing 27 to argument 1 of 'svcntb_pat', which expects a valid 'svpattern' value" } */ ++ svcntb_pat ((svpattern) 28); /* { dg-error "passing 28 to argument 1 of 'svcntb_pat', which expects a valid 'svpattern' value" } */ ++ svcntb_pat ((svpattern) 29); ++ svcntb_pat ((svpattern) 30); ++ svcntb_pat ((svpattern) 31); ++ svcntb_pat ((svpattern) 32); /* { dg-error "passing 32 to argument 1 of 'svcntb_pat', which expects a valid 'svpattern' value" } */ ++} +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/conversion_1.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/conversion_1.C +new file mode 100644 +index 000000000..1b939cdf7 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/conversion_1.C +@@ -0,0 +1,20 @@ ++/* { dg-do compile } */ ++ ++#include ++ ++template ++struct S ++{ ++ S(T); ++ operator T() const; ++ void *base; ++}; ++ ++void f(svbool_t pg, const S &u8a, const S &u8b, ++ const S &s8a) ++{ ++ svadd_x(pg, u8a, u8b); ++ svadd_x(pg, u8a, 1); ++ svadd_x(pg, s8a, u8b); // { dg-error "no matching function for call" } ++ svadd_x(pg, s8a, 1); ++} +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/create2_1.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/create2_1.C +new file mode 100644 +index 000000000..247fd85ec +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/create2_1.C +@@ -0,0 +1,17 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-Wall -Wextra" } */ ++ ++#include ++ ++void ++f1 (svuint8x2_t *ptr, svbool_t pg, svuint8_t u8, svfloat64_t f64, ++ svuint8x2_t u8x2) ++{ ++ *ptr = svcreate2 (u8); /* { dg-error {no matching function for call to 'svcreate2\(svuint8_t\&\)'} } */ ++ *ptr = svcreate2 (u8, u8, u8); /* { dg-error {no matching function for call to 'svcreate2\(svuint8_t\&, svuint8_t\&, svuint8_t\&\)'} } */ ++ *ptr = svcreate2 (u8x2, u8x2); /* { dg-error {no matching function for call to 'svcreate2\(svuint8x2_t\&, svuint8x2_t\&\)'} } */ ++ *ptr = svcreate2 (u8, f64); /* { dg-error {no matching function for call to 'svcreate2\(svuint8_t\&, svfloat64_t\&\)'} } */ ++ *ptr = svcreate2 (u8, pg); /* { dg-error {no matching function for call to 'svcreate2\(svuint8_t\&, svbool_t\&\)'} } */ ++ *ptr = svcreate2 (u8, u8); ++ *ptr = svcreate2 (f64, f64); /* { dg-error {cannot convert 'svfloat64x2_t' to 'svuint8x2_t' in assignment} } */ ++} +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/create2_2.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/create2_2.C +new file mode 100644 +index 000000000..10f3231fa +--- /dev/null 
++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/create2_2.C +@@ -0,0 +1,17 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-Wall -Wextra" } */ ++ ++#include ++ ++void ++f1 (svuint8x2_t *ptr, svbool_t pg, svuint8_t u8, svfloat64_t f64, ++ svuint8x2_t u8x2) ++{ ++ *ptr = svcreate2_u8 (u8); /* { dg-error {too few arguments to function '[^']*'} } */ ++ *ptr = svcreate2_u8 (u8, u8, u8); /* { dg-error {too many arguments to function '[^']*'} } */ ++ *ptr = svcreate2_u8 (u8x2, u8x2); /* { dg-error {cannot convert 'svuint8x2_t' to 'svuint8_t'} } */ ++ *ptr = svcreate2_u8 (u8, f64); /* { dg-error {cannot convert 'svfloat64_t' to 'svuint8_t'} } */ ++ *ptr = svcreate2_u8 (pg, u8); /* { dg-error {cannot convert 'svbool_t' to 'svuint8_t'} } */ ++ *ptr = svcreate2_u8 (u8, u8); ++ *ptr = svcreate2_f64 (f64, f64); /* { dg-error {cannot convert 'svfloat64x2_t' to 'svuint8x2_t' in assignment} } */ ++} +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/create3_1.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/create3_1.C +new file mode 100644 +index 000000000..ff013634d +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/create3_1.C +@@ -0,0 +1,18 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-Wall -Wextra" } */ ++ ++#include ++ ++void ++f1 (svfloat16x3_t *ptr, svbool_t pg, svfloat16_t f16, svfloat64_t f64, ++ svfloat16x3_t f16x3) ++{ ++ *ptr = svcreate3 (f16); /* { dg-error {no matching function for call to 'svcreate3\(svfloat16_t\&\)'} } */ ++ *ptr = svcreate3 (f16, f16); /* { dg-error {no matching function for call to 'svcreate3\(svfloat16_t\&, svfloat16_t\&\)'} } */ ++ *ptr = svcreate3 (f16, f16, f16, f16); /* { dg-error {no matching function for call to 'svcreate3\(svfloat16_t\&, svfloat16_t\&, svfloat16_t\&, svfloat16_t\&\)'} } */ ++ *ptr = svcreate3 (f16x3, f16x3, f16x3); /* { dg-error {no matching function for call to 'svcreate3\(svfloat16x3_t\&, svfloat16x3_t\&, svfloat16x3_t\&\)'} } */ ++ *ptr = svcreate3 (f16, f16, f64); /* { dg-error {no matching function for call to 'svcreate3\(svfloat16_t\&, svfloat16_t\&, svfloat64_t\&\)'} } */ ++ *ptr = svcreate3 (f16, pg, f16); /* { dg-error {no matching function for call to 'svcreate3\(svfloat16_t\&, svbool_t\&, svfloat16_t\&\)'} } */ ++ *ptr = svcreate3 (f16, f16, f16); ++ *ptr = svcreate3 (f64, f64, f64); /* { dg-error {cannot convert 'svfloat64x3_t' to 'svfloat16x3_t' in assignment} } */ ++} +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/create3_2.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/create3_2.C +new file mode 100644 +index 000000000..07a72b1e2 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/create3_2.C +@@ -0,0 +1,18 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-Wall -Wextra" } */ ++ ++#include ++ ++void ++f1 (svfloat16x3_t *ptr, svbool_t pg, svfloat16_t f16, svfloat64_t f64, ++ svfloat16x3_t f16x3) ++{ ++ *ptr = svcreate3_f16 (f16); /* { dg-error {too few arguments to function '[^']*'} } */ ++ *ptr = svcreate3_f16 (f16, f16); /* { dg-error {too few arguments to function '[^']*'} } */ ++ *ptr = svcreate3_f16 (f16, f16, f16, f16); /* { dg-error {too many arguments to function '[^']*'} } */ ++ *ptr = svcreate3_f16 (f16x3, f16x3, f16x3); /* { dg-error {cannot convert 'svfloat16x3_t' to 'svfloat16_t'} } */ ++ *ptr = svcreate3_f16 (f16, f16, f64); /* { dg-error {cannot convert 'svfloat64_t' to 'svfloat16_t'} } */ ++ *ptr = svcreate3_f16 (f16, pg, f16); /* { dg-error {cannot convert 
'svbool_t' to 'svfloat16_t'} } */ ++ *ptr = svcreate3_f16 (f16, f16, f16); ++ *ptr = svcreate3_f64 (f64, f64, f64); /* { dg-error {cannot convert 'svfloat64x3_t' to 'svfloat16x3_t' in assignment} } */ ++} +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/create4_1.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/create4_1.C +new file mode 100644 +index 000000000..2785d9011 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/create4_1.C +@@ -0,0 +1,19 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-Wall -Wextra" } */ ++ ++#include ++ ++void ++f1 (svint32x4_t *ptr, svbool_t pg, svint32_t s32, svfloat64_t f64, ++ svint32x4_t s32x4) ++{ ++ *ptr = svcreate4 (s32); /* { dg-error {no matching function for call to 'svcreate4\(svint32_t\&\)'} } */ ++ *ptr = svcreate4 (s32, s32); /* { dg-error {no matching function for call to 'svcreate4\(svint32_t\&, svint32_t\&\)'} } */ ++ *ptr = svcreate4 (s32, s32, s32); /* { dg-error {no matching function for call to 'svcreate4\(svint32_t\&, svint32_t\&, svint32_t\&\)'} } */ ++ *ptr = svcreate4 (s32, s32, s32, s32, s32); /* { dg-error {no matching function for call to 'svcreate4\(svint32_t\&, svint32_t\&, svint32_t\&, svint32_t\&, svint32_t\&\)'} } */ ++ *ptr = svcreate4 (s32x4, s32x4, s32x4, s32x4); /* { dg-error {no matching function for call to 'svcreate4\(svint32x4_t\&, svint32x4_t\&, svint32x4_t\&, svint32x4_t\&\)'} } */ ++ *ptr = svcreate4 (s32, s32, s32, f64); /* { dg-error {no matching function for call to 'svcreate4\(svint32_t\&, svint32_t\&, svint32_t\&, svfloat64_t\&\)'} } */ ++ *ptr = svcreate4 (s32, pg, s32, s32); /* { dg-error {no matching function for call to 'svcreate4\(svint32_t\&, svbool_t\&, svint32_t\&, svint32_t\&\)'} } */ ++ *ptr = svcreate4 (s32, s32, s32, s32); ++ *ptr = svcreate4 (f64, f64, f64, f64); /* { dg-error {cannot convert 'svfloat64x4_t' to 'svint32x4_t' in assignment} } */ ++} +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/create4_2.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/create4_2.C +new file mode 100644 +index 000000000..68f21a1d4 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/create4_2.C +@@ -0,0 +1,19 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-Wall -Wextra" } */ ++ ++#include ++ ++void ++f1 (svint32x4_t *ptr, svbool_t pg, svint32_t s32, svfloat64_t f64, ++ svint32x4_t s32x4) ++{ ++ *ptr = svcreate4_s32 (s32); /* { dg-error {too few arguments to function '[^']*'} } */ ++ *ptr = svcreate4_s32 (s32, s32); /* { dg-error {too few arguments to function '[^']*'} } */ ++ *ptr = svcreate4_s32 (s32, s32, s32); /* { dg-error {too few arguments to function '[^']*'} } */ ++ *ptr = svcreate4_s32 (s32, s32, s32, s32, s32); /* { dg-error {too many arguments to function '[^']*'} } */ ++ *ptr = svcreate4_s32 (s32x4, s32x4, s32x4, s32x4); /* { dg-error {cannot convert 'svint32x4_t' to 'svint32_t'} } */ ++ *ptr = svcreate4_s32 (s32, s32, s32, f64); /* { dg-error {cannot convert 'svfloat64_t' to 'svint32_t'} } */ ++ *ptr = svcreate4_s32 (s32, pg, s32, s32); /* { dg-error {cannot convert 'svbool_t' to 'svint32_t'} } */ ++ *ptr = svcreate4_s32 (s32, s32, s32, s32); ++ *ptr = svcreate4_f64 (f64, f64, f64, f64); /* { dg-error {cannot convert 'svfloat64x4_t' to 'svint32x4_t' in assignment} } */ ++} +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/dot_1.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/dot_1.C +new file mode 100644 +index 000000000..93397c82f +--- /dev/null 
++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/dot_1.C +@@ -0,0 +1,9 @@ ++/* { dg-do compile } */ ++ ++#include "dot_1.h" ++ ++svuint32_t ++f1 (svuint32_t x, svint8_t y, svuint8_t z) ++{ ++ return svdot_u32 (x, y, z); /* { dg-error "cannot convert 'svint8_t' to 'svuint8_t'" } */ ++} +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/dot_1.h b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/dot_1.h +new file mode 100644 +index 000000000..aef02f20b +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/dot_1.h +@@ -0,0 +1,2 @@ ++#pragma GCC system_header ++#pragma GCC aarch64 "arm_sve.h" /* { dg-message "initializing argument 2" } */ +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/dot_2.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/dot_2.C +new file mode 100644 +index 000000000..2084ed828 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/dot_2.C +@@ -0,0 +1,12 @@ ++/* { dg-do compile } */ ++ ++#include "dot_2.h" ++ ++void ++f1 (svuint32_t x, svint8_t y, svuint8_t z) ++{ ++ svdot (x, y); /* { dg-error {no matching function for call to 'svdot\(svuint32_t&, svint8_t&\)'} } */ ++ svdot (x, x, x); /* { dg-error {no matching function for call to 'svdot\(svuint32_t&, svuint32_t&, svuint32_t&\)'} } */ ++ svdot (1, z, z); /* { dg-error {no matching function for call to 'svdot\(int, svuint8_t&, svuint8_t&\)'} } */ ++ svdot (x, y, z); /* { dg-error {no matching function for call to 'svdot\(svuint32_t&, svint8_t&, svuint8_t&\)'} } */ ++} +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/dot_2.h b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/dot_2.h +new file mode 100644 +index 000000000..3e4a9c794 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/dot_2.h +@@ -0,0 +1,7 @@ ++#pragma GCC system_header ++#pragma GCC aarch64 "arm_sve.h" ++/* { dg-message {note: candidate: 'svuint32_t svdot\(svuint32_t, svuint8_t, svuint8_t\)'} "" { target *-*-* } 3 } */ ++/* { dg-message {note: *candidate expects 3 arguments, 2 provided} "" { target *-*-* } 3 } */ ++/* { dg-message {note: *no known conversion for argument 2 from 'svuint32_t' to 'svuint8_t'} "" { target *-*-* } 3 } */ ++/* { dg-message {note: *no known conversion for argument 1 from 'int' to 'svuint32_t'} "" { target *-*-* } 3 } */ ++/* { dg-message {note: *no known conversion for argument 2 from 'svint8_t' to 'svuint8_t'} "" { target *-*-* } 3 } */ +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/func_redef_1.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/func_redef_1.c +new file mode 100644 +index 000000000..8f18810c0 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/func_redef_1.c +@@ -0,0 +1,5 @@ ++/* { dg-do compile } */ ++ ++int svadd_n_u8_x; /* { dg-message "note: previous declaration 'int svadd_n_u8_x'" } */ ++ ++#pragma GCC aarch64 "arm_sve.h" /* { dg-error {'svuint8_t svadd_n_u8_x\(svbool_t, svuint8_t, [^)\n]*\)' redeclared as different kind of entity} } */ +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/func_redef_2.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/func_redef_2.c +new file mode 100644 +index 000000000..a67f9f756 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/func_redef_2.c +@@ -0,0 +1,5 @@ ++/* { dg-do compile } */ ++ ++int svadd_n_u8_x = 1; /* { dg-message "note: previous declaration 'int svadd_n_u8_x'" } */ ++ ++#pragma GCC aarch64 "arm_sve.h" /* { 
dg-error {'svuint8_t svadd_n_u8_x\(svbool_t, svuint8_t, [^)\n]*\)' redeclared as different kind of entity} } */ +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/func_redef_3.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/func_redef_3.c +new file mode 100644 +index 000000000..74b820fe6 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/func_redef_3.c +@@ -0,0 +1,7 @@ ++/* { dg-do compile } */ ++ ++/* Although not supported, there's nothing to stop the user overloading ++ the sv* functions. */ ++extern __SVInt8_t svadd_u8_x (__SVBool_t, __SVInt8_t, __SVInt8_t); ++ ++#pragma GCC aarch64 "arm_sve.h" +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/func_redef_4.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/func_redef_4.c +new file mode 100644 +index 000000000..9591e3d01 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/func_redef_4.c +@@ -0,0 +1,9 @@ ++/* { dg-do compile } */ ++ ++/* Although somewhat suspect, this isn't actively wrong, and doesn't need ++ to be diagnosed. Any attempt to call the function before including ++ arm_sve.h will lead to a link failure. (Same for taking its address, ++ etc.) */ ++extern __SVUint8_t svadd_u8_x (__SVBool_t, __SVUint8_t, __SVUint8_t); ++ ++#pragma GCC aarch64 "arm_sve.h" +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/func_redef_5.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/func_redef_5.c +new file mode 100644 +index 000000000..f87201984 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/func_redef_5.c +@@ -0,0 +1,15 @@ ++/* { dg-do compile } */ ++ ++__SVUint8_t ++svadd_u8_x (__SVBool_t pg, __SVUint8_t x, __SVUint8_t y) ++{ ++ return x; ++} ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++svuint8_t ++f (svbool_t pg, svuint8_t x, svuint8_t y) ++{ ++ return svadd_u8_x (pg, x, y); ++} +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/func_redef_6.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/func_redef_6.c +new file mode 100644 +index 000000000..a65e0d65c +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/func_redef_6.c +@@ -0,0 +1,5 @@ ++/* { dg-do compile } */ ++ ++typedef int svadd_u8_x; /* { dg-message "note: previous declaration 'typedef int svadd_u8_x'" } */ ++ ++#pragma GCC aarch64 "arm_sve.h" /* { dg-error {'svuint8_t svadd_u8_x\(svbool_t, svuint8_t, svuint8_t\)' redeclared as different kind of entity} } */ +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/func_redef_7.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/func_redef_7.c +new file mode 100644 +index 000000000..1f2e4bf66 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/func_redef_7.c +@@ -0,0 +1,15 @@ ++/* { dg-do compile } */ ++ ++__SVUint8_t ++svadd_x (__SVBool_t pg, __SVUint8_t x, __SVUint8_t y) ++{ ++ return x; ++} ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++svuint8_t ++f (svbool_t pg, svuint8_t x, svuint8_t y) ++{ ++ return svadd_x (pg, x, y); ++} +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/get2_1.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/get2_1.C +new file mode 100644 +index 000000000..8d6bb2307 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/get2_1.C +@@ -0,0 +1,39 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-std=c++11 -Wall -Wextra" } */ ++ ++#include ++ ++constexpr uint64_t const_sub (uint64_t a, uint64_t b) { return a - b; } 
++uint64_t add (uint64_t a, uint64_t b) { return a + b; } ++ ++svfloat64_t ++f1 (svbool_t pg, svuint8_t u8, svuint8x2_t u8x2, svuint8x3_t u8x3, int x) ++{ ++ const int one = 1; ++ svfloat64_t f64; ++ ++ u8 = svget2 (u8x2); /* { dg-error {no matching function for call to 'svget2\(svuint8x2_t\&\)'} } */ ++ u8 = svget2 (u8x2, 1, 2); /* { dg-error {no matching function for call to 'svget2\(svuint8x2_t\&, int, int\)'} } */ ++ u8 = svget2 (u8, 0); /* { dg-error {no matching function for call to 'svget2\(svuint8_t\&, int\)'} } */ ++ u8 = svget2 (u8x3, 0); /* { dg-error {no matching function for call to 'svget2\(svuint8x3_t\&, int\)'} } */ ++ u8 = svget2 (pg, 0); /* { dg-error {no matching function for call to 'svget2\(svbool_t\&, int\)'} } */ ++ u8 = svget2 (u8x2, x); /* { dg-error "argument 2 of 'svget2' must be an integer constant expression" } */ ++ u8 = svget2 (u8x2, 0); ++ f64 = svget2 (u8x2, 0); /* { dg-error "cannot convert 'svuint8_t' to 'svfloat64_t' in assignment" } */ ++ u8 = svget2 (u8x2, 1); ++ u8 = svget2 (u8x2, 2); /* { dg-error {passing 2 to argument 2 of 'svget2', which expects a value in the range \[0, 1\]} } */ ++ u8 = svget2 (u8x2, 3); /* { dg-error {passing 3 to argument 2 of 'svget2', which expects a value in the range \[0, 1\]} } */ ++ u8 = svget2 (u8x2, 4); /* { dg-error {passing 4 to argument 2 of 'svget2', which expects a value in the range \[0, 1\]} } */ ++ u8 = svget2 (u8x2, 5); /* { dg-error {passing 5 to argument 2 of 'svget2', which expects a value in the range \[0, 1\]} } */ ++ u8 = svget2 (u8x2, ~0U); /* { dg-error {passing [^ ]* to argument 2 of 'svget2', which expects a value in the range \[0, 1\]} } */ ++ u8 = svget2 (u8x2, one); ++ u8 = svget2 (u8x2, 3 - 2); ++ u8 = svget2 (u8x2, 1.0); ++ u8 = svget2 (u8x2, const_sub (5, 4)); ++ u8 = svget2 (u8x2, const_sub (6, 4)); /* { dg-error {passing 2 to argument 2 of 'svget2', which expects a value in the range \[0, 1\]} } */ ++ u8 = svget2 (u8x2, const_sub (7, 4)); /* { dg-error {passing 3 to argument 2 of 'svget2', which expects a value in the range \[0, 1\]} } */ ++ u8 = svget2 (u8x2, const_sub (8, 4)); /* { dg-error {passing 4 to argument 2 of 'svget2', which expects a value in the range \[0, 1\]} } */ ++ u8 = svget2 (u8x2, add (0, 0)); /* { dg-error "argument 2 of 'svget2' must be an integer constant expression" } */ ++ ++ return f64; ++} +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/get2_2.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/get2_2.C +new file mode 100644 +index 000000000..9c7674be1 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/get2_2.C +@@ -0,0 +1,39 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-std=c++11 -Wall -Wextra" } */ ++ ++#include ++ ++constexpr uint64_t const_sub (uint64_t a, uint64_t b) { return a - b; } ++uint64_t add (uint64_t a, uint64_t b) { return a + b; } ++ ++svfloat64_t ++f1 (svbool_t pg, svuint8_t u8, svuint8x2_t u8x2, svuint8x3_t u8x3, int x) ++{ ++ const int one = 1; ++ svfloat64_t f64; ++ ++ u8 = svget2_u8 (u8x2); /* { dg-error {too few arguments to function '[^']*'} } */ ++ u8 = svget2_u8 (u8x2, 1, 2); /* { dg-error {too many arguments to function '[^']*'} } */ ++ u8 = svget2_u8 (u8, 0); /* { dg-error {cannot convert 'svuint8_t' to 'svuint8x2_t'} } */ ++ u8 = svget2_u8 (u8x3, 0); /* { dg-error {cannot convert 'svuint8x3_t' to 'svuint8x2_t'} } */ ++ u8 = svget2_u8 (pg, 0); /* { dg-error {cannot convert 'svbool_t' to 'svuint8x2_t'} } */ ++ u8 = svget2_u8 (u8x2, x); /* { dg-error "argument 2 of 'svget2_u8' 
must be an integer constant expression" } */ ++ u8 = svget2_u8 (u8x2, 0); ++ f64 = svget2_u8 (u8x2, 0); /* { dg-error "cannot convert 'svuint8_t' to 'svfloat64_t' in assignment" } */ ++ u8 = svget2_u8 (u8x2, 1); ++ u8 = svget2_u8 (u8x2, 2); /* { dg-error {passing 2 to argument 2 of 'svget2_u8', which expects a value in the range \[0, 1\]} } */ ++ u8 = svget2_u8 (u8x2, 3); /* { dg-error {passing 3 to argument 2 of 'svget2_u8', which expects a value in the range \[0, 1\]} } */ ++ u8 = svget2_u8 (u8x2, 4); /* { dg-error {passing 4 to argument 2 of 'svget2_u8', which expects a value in the range \[0, 1\]} } */ ++ u8 = svget2_u8 (u8x2, 5); /* { dg-error {passing 5 to argument 2 of 'svget2_u8', which expects a value in the range \[0, 1\]} } */ ++ u8 = svget2_u8 (u8x2, ~0U); /* { dg-error {passing [^ ]* to argument 2 of 'svget2_u8', which expects a value in the range \[0, 1\]} } */ ++ u8 = svget2_u8 (u8x2, one); ++ u8 = svget2_u8 (u8x2, 3 - 2); ++ u8 = svget2_u8 (u8x2, 1.0); ++ u8 = svget2_u8 (u8x2, const_sub (5, 4)); ++ u8 = svget2_u8 (u8x2, const_sub (6, 4)); /* { dg-error {passing 2 to argument 2 of 'svget2_u8', which expects a value in the range \[0, 1\]} } */ ++ u8 = svget2_u8 (u8x2, const_sub (7, 4)); /* { dg-error {passing 3 to argument 2 of 'svget2_u8', which expects a value in the range \[0, 1\]} } */ ++ u8 = svget2_u8 (u8x2, const_sub (8, 4)); /* { dg-error {passing 4 to argument 2 of 'svget2_u8', which expects a value in the range \[0, 1\]} } */ ++ u8 = svget2_u8 (u8x2, add (0, 0)); /* { dg-error "argument 2 of 'svget2_u8' must be an integer constant expression" } */ ++ ++ return f64; ++} +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/get3_1.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/get3_1.C +new file mode 100644 +index 000000000..bd8808a8b +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/get3_1.C +@@ -0,0 +1,40 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-std=c++11 -Wall -Wextra" } */ ++ ++#include ++ ++constexpr uint64_t const_sub (uint64_t a, uint64_t b) { return a - b; } ++uint64_t add (uint64_t a, uint64_t b) { return a + b; } ++ ++svfloat64_t ++f1 (svbool_t pg, svfloat16_t f16, svfloat16x3_t f16x3, svfloat16x4_t f16x4, ++ int x) ++{ ++ const int one = 1; ++ svfloat64_t f64; ++ ++ f16 = svget3 (f16x3); /* { dg-error {no matching function for call to 'svget3\(svfloat16x3_t\&\)'} } */ ++ f16 = svget3 (f16x3, 1, 2); /* { dg-error {no matching function for call to 'svget3\(svfloat16x3_t\&, int, int\)'} } */ ++ f16 = svget3 (f16, 0); /* { dg-error {no matching function for call to 'svget3\(svfloat16_t\&, int\)'} } */ ++ f16 = svget3 (f16x4, 0); /* { dg-error {no matching function for call to 'svget3\(svfloat16x4_t\&, int\)'} } */ ++ f16 = svget3 (pg, 0); /* { dg-error {no matching function for call to 'svget3\(svbool_t\&, int\)'} } */ ++ f16 = svget3 (f16x3, x); /* { dg-error "argument 2 of 'svget3' must be an integer constant expression" } */ ++ f16 = svget3 (f16x3, 0); ++ f64 = svget3 (f16x3, 0); /* { dg-error "cannot convert 'svfloat16_t' to 'svfloat64_t' in assignment" } */ ++ f16 = svget3 (f16x3, 1); ++ f16 = svget3 (f16x3, 2); ++ f16 = svget3 (f16x3, 3); /* { dg-error {passing 3 to argument 2 of 'svget3', which expects a value in the range \[0, 2\]} } */ ++ f16 = svget3 (f16x3, 4); /* { dg-error {passing 4 to argument 2 of 'svget3', which expects a value in the range \[0, 2\]} } */ ++ f16 = svget3 (f16x3, 5); /* { dg-error {passing 5 to argument 2 of 'svget3', which expects a value in the range \[0, 
2\]} } */ ++ f16 = svget3 (f16x3, ~0U); /* { dg-error {passing [^ ]* to argument 2 of 'svget3', which expects a value in the range \[0, 2\]} } */ ++ f16 = svget3 (f16x3, one); ++ f16 = svget3 (f16x3, 3 - 2); ++ f16 = svget3 (f16x3, 1.0); ++ f16 = svget3 (f16x3, const_sub (5, 4)); ++ f16 = svget3 (f16x3, const_sub (6, 4)); ++ f16 = svget3 (f16x3, const_sub (7, 4)); /* { dg-error {passing 3 to argument 2 of 'svget3', which expects a value in the range \[0, 2\]} } */ ++ f16 = svget3 (f16x3, const_sub (8, 4)); /* { dg-error {passing 4 to argument 2 of 'svget3', which expects a value in the range \[0, 2\]} } */ ++ f16 = svget3 (f16x3, add (0, 0)); /* { dg-error "argument 2 of 'svget3' must be an integer constant expression" } */ ++ ++ return f64; ++} +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/get3_2.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/get3_2.C +new file mode 100644 +index 000000000..d526947d1 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/get3_2.C +@@ -0,0 +1,40 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-std=c++11 -Wall -Wextra" } */ ++ ++#include ++ ++constexpr uint64_t const_sub (uint64_t a, uint64_t b) { return a - b; } ++uint64_t add (uint64_t a, uint64_t b) { return a + b; } ++ ++svfloat64_t ++f1 (svbool_t pg, svfloat16_t f16, svfloat16x3_t f16x3, svfloat16x4_t f16x4, ++ int x) ++{ ++ const int one = 1; ++ svfloat64_t f64; ++ ++ f16 = svget3_f16 (f16x3); /* { dg-error {too few arguments to function '[^']*'} } */ ++ f16 = svget3_f16 (f16x3, 1, 2); /* { dg-error {too many arguments to function '[^']*'} } */ ++ f16 = svget3_f16 (f16, 0); /* { dg-error {cannot convert 'svfloat16_t' to 'svfloat16x3_t'} } */ ++ f16 = svget3_f16 (f16x4, 0); /* { dg-error {cannot convert 'svfloat16x4_t' to 'svfloat16x3_t'} } */ ++ f16 = svget3_f16 (pg, 0); /* { dg-error {cannot convert 'svbool_t' to 'svfloat16x3_t'} } */ ++ f16 = svget3_f16 (f16x3, x); /* { dg-error "argument 2 of 'svget3_f16' must be an integer constant expression" } */ ++ f16 = svget3_f16 (f16x3, 0); ++ f64 = svget3_f16 (f16x3, 0); /* { dg-error "cannot convert 'svfloat16_t' to 'svfloat64_t' in assignment" } */ ++ f16 = svget3_f16 (f16x3, 1); ++ f16 = svget3_f16 (f16x3, 2); ++ f16 = svget3_f16 (f16x3, 3); /* { dg-error {passing 3 to argument 2 of 'svget3_f16', which expects a value in the range \[0, 2\]} } */ ++ f16 = svget3_f16 (f16x3, 4); /* { dg-error {passing 4 to argument 2 of 'svget3_f16', which expects a value in the range \[0, 2\]} } */ ++ f16 = svget3_f16 (f16x3, 5); /* { dg-error {passing 5 to argument 2 of 'svget3_f16', which expects a value in the range \[0, 2\]} } */ ++ f16 = svget3_f16 (f16x3, ~0U); /* { dg-error {passing [^ ]* to argument 2 of 'svget3_f16', which expects a value in the range \[0, 2\]} } */ ++ f16 = svget3_f16 (f16x3, one); ++ f16 = svget3_f16 (f16x3, 3 - 2); ++ f16 = svget3_f16 (f16x3, 1.0); ++ f16 = svget3_f16 (f16x3, const_sub (5, 4)); ++ f16 = svget3_f16 (f16x3, const_sub (6, 4)); ++ f16 = svget3_f16 (f16x3, const_sub (7, 4)); /* { dg-error {passing 3 to argument 2 of 'svget3_f16', which expects a value in the range \[0, 2\]} } */ ++ f16 = svget3_f16 (f16x3, const_sub (8, 4)); /* { dg-error {passing 4 to argument 2 of 'svget3_f16', which expects a value in the range \[0, 2\]} } */ ++ f16 = svget3_f16 (f16x3, add (0, 0)); /* { dg-error "argument 2 of 'svget3_f16' must be an integer constant expression" } */ ++ ++ return f64; ++} +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/get4_1.C 
b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/get4_1.C +new file mode 100644 +index 000000000..19853dece +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/get4_1.C +@@ -0,0 +1,39 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-std=c++11 -Wall -Wextra" } */ ++ ++#include ++ ++constexpr uint64_t const_sub (uint64_t a, uint64_t b) { return a - b; } ++uint64_t add (uint64_t a, uint64_t b) { return a + b; } ++ ++svfloat64_t ++f1 (svbool_t pg, svint32_t s32, svint32x4_t s32x4, svint32x2_t s32x2, int x) ++{ ++ const int one = 1; ++ svfloat64_t f64; ++ ++ s32 = svget4 (s32x4); /* { dg-error {no matching function for call to 'svget4\(svint32x4_t\&\)'} } */ ++ s32 = svget4 (s32x4, 1, 2); /* { dg-error {no matching function for call to 'svget4\(svint32x4_t\&, int, int\)'} } */ ++ s32 = svget4 (s32, 0); /* { dg-error {no matching function for call to 'svget4\(svint32_t\&, int\)'} } */ ++ s32 = svget4 (s32x2, 0); /* { dg-error {no matching function for call to 'svget4\(svint32x2_t\&, int\)'} } */ ++ s32 = svget4 (pg, 0); /* { dg-error {no matching function for call to 'svget4\(svbool_t\&, int\)'} } */ ++ s32 = svget4 (s32x4, x); /* { dg-error "argument 2 of 'svget4' must be an integer constant expression" } */ ++ s32 = svget4 (s32x4, 0); ++ f64 = svget4 (s32x4, 0); /* { dg-error "cannot convert 'svint32_t' to 'svfloat64_t' in assignment" } */ ++ s32 = svget4 (s32x4, 1); ++ s32 = svget4 (s32x4, 2); ++ s32 = svget4 (s32x4, 3); ++ s32 = svget4 (s32x4, 4); /* { dg-error {passing 4 to argument 2 of 'svget4', which expects a value in the range \[0, 3\]} } */ ++ s32 = svget4 (s32x4, 5); /* { dg-error {passing 5 to argument 2 of 'svget4', which expects a value in the range \[0, 3\]} } */ ++ s32 = svget4 (s32x4, ~0U); /* { dg-error {passing [^ ]* to argument 2 of 'svget4', which expects a value in the range \[0, 3\]} } */ ++ s32 = svget4 (s32x4, one); ++ s32 = svget4 (s32x4, 3 - 2); ++ s32 = svget4 (s32x4, 1.0); ++ s32 = svget4 (s32x4, const_sub (5, 4)); ++ s32 = svget4 (s32x4, const_sub (6, 4)); ++ s32 = svget4 (s32x4, const_sub (7, 4)); ++ s32 = svget4 (s32x4, const_sub (8, 4)); /* { dg-error {passing 4 to argument 2 of 'svget4', which expects a value in the range \[0, 3\]} } */ ++ s32 = svget4 (s32x4, add (0, 0)); /* { dg-error "argument 2 of 'svget4' must be an integer constant expression" } */ ++ ++ return f64; ++} +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/get4_2.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/get4_2.C +new file mode 100644 +index 000000000..7a0979225 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/get4_2.C +@@ -0,0 +1,39 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-std=c++11 -Wall -Wextra" } */ ++ ++#include ++ ++constexpr uint64_t const_sub (uint64_t a, uint64_t b) { return a - b; } ++uint64_t add (uint64_t a, uint64_t b) { return a + b; } ++ ++svfloat64_t ++f1 (svbool_t pg, svint32_t s32, svint32x4_t s32x4, svint32x2_t s32x2, int x) ++{ ++ const int one = 1; ++ svfloat64_t f64; ++ ++ s32 = svget4_s32 (s32x4); /* { dg-error {too few arguments to function '[^']*'} } */ ++ s32 = svget4_s32 (s32x4, 1, 2); /* { dg-error {too many arguments to function '[^']*'} } */ ++ s32 = svget4_s32 (s32, 0); /* { dg-error {cannot convert 'svint32_t' to 'svint32x4_t'} } */ ++ s32 = svget4_s32 (s32x2, 0); /* { dg-error {cannot convert 'svint32x2_t' to 'svint32x4_t'} } */ ++ s32 = svget4_s32 (pg, 0); /* { dg-error {cannot convert 'svbool_t' to 'svint32x4_t'} } */ ++ s32 = svget4_s32 
(s32x4, x); /* { dg-error "argument 2 of 'svget4_s32' must be an integer constant expression" } */ ++ s32 = svget4_s32 (s32x4, 0); ++ f64 = svget4_s32 (s32x4, 0); /* { dg-error "cannot convert 'svint32_t' to 'svfloat64_t' in assignment" } */ ++ s32 = svget4_s32 (s32x4, 1); ++ s32 = svget4_s32 (s32x4, 2); ++ s32 = svget4_s32 (s32x4, 3); ++ s32 = svget4_s32 (s32x4, 4); /* { dg-error {passing 4 to argument 2 of 'svget4_s32', which expects a value in the range \[0, 3\]} } */ ++ s32 = svget4_s32 (s32x4, 5); /* { dg-error {passing 5 to argument 2 of 'svget4_s32', which expects a value in the range \[0, 3\]} } */ ++ s32 = svget4_s32 (s32x4, ~0U); /* { dg-error {passing [^ ]* to argument 2 of 'svget4_s32', which expects a value in the range \[0, 3\]} } */ ++ s32 = svget4_s32 (s32x4, one); ++ s32 = svget4_s32 (s32x4, 3 - 2); ++ s32 = svget4_s32 (s32x4, 1.0); ++ s32 = svget4_s32 (s32x4, const_sub (5, 4)); ++ s32 = svget4_s32 (s32x4, const_sub (6, 4)); ++ s32 = svget4_s32 (s32x4, const_sub (7, 4)); ++ s32 = svget4_s32 (s32x4, const_sub (8, 4)); /* { dg-error {passing 4 to argument 2 of 'svget4_s32', which expects a value in the range \[0, 3\]} } */ ++ s32 = svget4_s32 (s32x4, add (0, 0)); /* { dg-error "argument 2 of 'svget4_s32' must be an integer constant expression" } */ ++ ++ return f64; ++} +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/lsl_wide_1.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/lsl_wide_1.C +new file mode 100644 +index 000000000..fb31e947d +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/lsl_wide_1.C +@@ -0,0 +1,12 @@ ++/* { dg-do compile } */ ++ ++#include ++ ++svuint8_t ++f1 (svbool_t pg, svuint8_t x, svint8_t w, svuint64_t y) ++{ ++ svlsl_wide_u8_x (pg, x, x); /* { dg-error "cannot convert 'svuint8_t' to 'svuint64_t'" } */ ++ svlsl_wide_u8_x (pg, x); /* { dg-error {too few arguments to function 'svuint8_t svlsl_wide_u8_x\(svbool_t, svuint8_t, svuint64_t\)'} } */ ++ svlsl_wide_u8_x (pg, x, y, x); /* { dg-error {too many arguments to function 'svuint8_t svlsl_wide_u8_x\(svbool_t, svuint8_t, svuint64_t\)'} } */ ++ return svlsl_wide_s8_x (pg, w, y); /* { dg-error {cannot convert 'svint8_t' to 'svuint8_t' in return} } */ ++} +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/lsl_wide_2.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/lsl_wide_2.C +new file mode 100644 +index 000000000..95d341dc5 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/lsl_wide_2.C +@@ -0,0 +1,14 @@ ++/* { dg-do compile } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++void ++f1 (svbool_t pg, svuint8_t x, svuint64_t y) ++{ ++ svlsl_wide_x (pg, x); /* { dg-error {no matching function for call to 'svlsl_wide_x\(svbool_t&, svuint8_t&\)'} } */ ++ svlsl_wide_x (pg, x, x, x, x); /* { dg-error {no matching function for call to 'svlsl_wide_x\(svbool_t&, svuint8_t&, svuint8_t&, svuint8_t&, svuint8_t&\)'} } */ ++ svlsl_wide_x (x, x, y); /* { dg-error {no matching function for call to 'svlsl_wide_x\(svuint8_t&, svuint8_t&, svuint64_t&\)'} } */ ++ svlsl_wide_x (pg, 1, y); /* { dg-error {no matching function for call to 'svlsl_wide_x\(svbool_t&, int, svuint64_t&\)'} } */ ++ svlsl_wide_x (pg, x, x); /* { dg-error {no matching function for call to 'svlsl_wide_x\(svbool_t&, svuint8_t&, svuint8_t&\)'} } */ ++ svlsl_wide_x (pg, y, y); /* { dg-error {no matching function for call to 'svlsl_wide_x\(svbool_t&, svuint64_t&, svuint64_t&\)'} } */ ++} +diff --git 
a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/mangle_1.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/mangle_1.C +new file mode 100644 +index 000000000..1a1712485 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/mangle_1.C +@@ -0,0 +1,31 @@ ++/* { dg-do compile } */ ++ ++#include ++ ++void f1(svbool_t) {} ++void f2(svint8_t) {} ++void f3(svint16_t) {} ++void f4(svint32_t) {} ++void f5(svint64_t) {} ++void f6(svuint8_t) {} ++void f7(svuint16_t) {} ++void f8(svuint32_t) {} ++void f9(svuint64_t) {} ++void f10(svfloat16_t) {} ++void f11(svfloat32_t) {} ++void f12(svfloat64_t) {} ++void f13(svbfloat16_t) {} ++ ++/* { dg-final { scan-assembler "_Z2f110__SVBool_t:" } } */ ++/* { dg-final { scan-assembler "_Z2f210__SVInt8_t:" } } */ ++/* { dg-final { scan-assembler "_Z2f311__SVInt16_t:" } } */ ++/* { dg-final { scan-assembler "_Z2f411__SVInt32_t:" } } */ ++/* { dg-final { scan-assembler "_Z2f511__SVInt64_t:" } } */ ++/* { dg-final { scan-assembler "_Z2f611__SVUint8_t:" } } */ ++/* { dg-final { scan-assembler "_Z2f712__SVUint16_t:" } } */ ++/* { dg-final { scan-assembler "_Z2f812__SVUint32_t:" } } */ ++/* { dg-final { scan-assembler "_Z2f912__SVUint64_t:" } } */ ++/* { dg-final { scan-assembler "_Z3f1013__SVFloat16_t:" } } */ ++/* { dg-final { scan-assembler "_Z3f1113__SVFloat32_t:" } } */ ++/* { dg-final { scan-assembler "_Z3f1213__SVFloat64_t:" } } */ ++/* { dg-final { scan-assembler "_Z3f1314__SVBfloat16_t:" } } */ +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/mangle_2.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/mangle_2.C +new file mode 100644 +index 000000000..6792b8a31 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/mangle_2.C +@@ -0,0 +1,29 @@ ++/* { dg-do compile } */ ++ ++void f1(__SVBool_t) {} ++void f2(__SVInt8_t) {} ++void f3(__SVInt16_t) {} ++void f4(__SVInt32_t) {} ++void f5(__SVInt64_t) {} ++void f6(__SVUint8_t) {} ++void f7(__SVUint16_t) {} ++void f8(__SVUint32_t) {} ++void f9(__SVUint64_t) {} ++void f10(__SVFloat16_t) {} ++void f11(__SVFloat32_t) {} ++void f12(__SVFloat64_t) {} ++void f13(__SVBfloat16_t) {} ++ ++/* { dg-final { scan-assembler "_Z2f110__SVBool_t:" } } */ ++/* { dg-final { scan-assembler "_Z2f210__SVInt8_t:" } } */ ++/* { dg-final { scan-assembler "_Z2f311__SVInt16_t:" } } */ ++/* { dg-final { scan-assembler "_Z2f411__SVInt32_t:" } } */ ++/* { dg-final { scan-assembler "_Z2f511__SVInt64_t:" } } */ ++/* { dg-final { scan-assembler "_Z2f611__SVUint8_t:" } } */ ++/* { dg-final { scan-assembler "_Z2f712__SVUint16_t:" } } */ ++/* { dg-final { scan-assembler "_Z2f812__SVUint32_t:" } } */ ++/* { dg-final { scan-assembler "_Z2f912__SVUint64_t:" } } */ ++/* { dg-final { scan-assembler "_Z3f1013__SVFloat16_t:" } } */ ++/* { dg-final { scan-assembler "_Z3f1113__SVFloat32_t:" } } */ ++/* { dg-final { scan-assembler "_Z3f1213__SVFloat64_t:" } } */ ++/* { dg-final { scan-assembler "_Z3f1314__SVBfloat16_t:" } } */ +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/mangle_3.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/mangle_3.C +new file mode 100644 +index 000000000..8f64f7c2e +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/mangle_3.C +@@ -0,0 +1,18 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-msve-vector-bits=256" } */ ++ ++#include ++ ++typedef __SVInt8_t t1; ++typedef svint8_t t2; ++/* Distinct from svint8_t, but compatible with it. 
*/ ++typedef int8_t t3 __attribute__((vector_size(32))); ++ ++void f1(t1) {} ++void f2(t2) {} ++void f3(t3) {} ++void f4(t1 &a, t2 &b, t3 &c) { a = b = c; } ++ ++/* { dg-final { scan-assembler "_Z2f110__SVInt8_t:" } } */ ++/* { dg-final { scan-assembler "_Z2f210__SVInt8_t:" } } */ ++/* { dg-final { scan-assembler "_Z2f3Dv32_a:" } } */ +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/mangle_4.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/mangle_4.C +new file mode 100644 +index 000000000..7cdc6cb0c +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/mangle_4.C +@@ -0,0 +1,75 @@ ++/* { dg-do compile } */ ++ ++#include ++ ++void f1(svint8x2_t) {} ++void f2(svint16x2_t) {} ++void f3(svint32x2_t) {} ++void f4(svint64x2_t) {} ++void f5(svuint8x2_t) {} ++void f6(svuint16x2_t) {} ++void f7(svuint32x2_t) {} ++void f8(svuint64x2_t) {} ++void f9(svfloat16x2_t) {} ++void f10(svfloat32x2_t) {} ++void f11(svfloat64x2_t) {} ++ ++void g1(svint8x3_t) {} ++void g2(svint16x3_t) {} ++void g3(svint32x3_t) {} ++void g4(svint64x3_t) {} ++void g5(svuint8x3_t) {} ++void g6(svuint16x3_t) {} ++void g7(svuint32x3_t) {} ++void g8(svuint64x3_t) {} ++void g9(svfloat16x3_t) {} ++void g10(svfloat32x3_t) {} ++void g11(svfloat64x3_t) {} ++ ++void h1(svint8x4_t) {} ++void h2(svint16x4_t) {} ++void h3(svint32x4_t) {} ++void h4(svint64x4_t) {} ++void h5(svuint8x4_t) {} ++void h6(svuint16x4_t) {} ++void h7(svuint32x4_t) {} ++void h8(svuint64x4_t) {} ++void h9(svfloat16x4_t) {} ++void h10(svfloat32x4_t) {} ++void h11(svfloat64x4_t) {} ++ ++/* { dg-final { scan-assembler "_Z2f110svint8x2_t:" } } */ ++/* { dg-final { scan-assembler "_Z2f211svint16x2_t:" } } */ ++/* { dg-final { scan-assembler "_Z2f311svint32x2_t:" } } */ ++/* { dg-final { scan-assembler "_Z2f411svint64x2_t:" } } */ ++/* { dg-final { scan-assembler "_Z2f511svuint8x2_t:" } } */ ++/* { dg-final { scan-assembler "_Z2f612svuint16x2_t:" } } */ ++/* { dg-final { scan-assembler "_Z2f712svuint32x2_t:" } } */ ++/* { dg-final { scan-assembler "_Z2f812svuint64x2_t:" } } */ ++/* { dg-final { scan-assembler "_Z2f913svfloat16x2_t:" } } */ ++/* { dg-final { scan-assembler "_Z3f1013svfloat32x2_t:" } } */ ++/* { dg-final { scan-assembler "_Z3f1113svfloat64x2_t:" } } */ ++ ++/* { dg-final { scan-assembler "_Z2g110svint8x3_t:" } } */ ++/* { dg-final { scan-assembler "_Z2g211svint16x3_t:" } } */ ++/* { dg-final { scan-assembler "_Z2g311svint32x3_t:" } } */ ++/* { dg-final { scan-assembler "_Z2g411svint64x3_t:" } } */ ++/* { dg-final { scan-assembler "_Z2g511svuint8x3_t:" } } */ ++/* { dg-final { scan-assembler "_Z2g612svuint16x3_t:" } } */ ++/* { dg-final { scan-assembler "_Z2g712svuint32x3_t:" } } */ ++/* { dg-final { scan-assembler "_Z2g812svuint64x3_t:" } } */ ++/* { dg-final { scan-assembler "_Z2g913svfloat16x3_t:" } } */ ++/* { dg-final { scan-assembler "_Z3g1013svfloat32x3_t:" } } */ ++/* { dg-final { scan-assembler "_Z3g1113svfloat64x3_t:" } } */ ++ ++/* { dg-final { scan-assembler "_Z2h110svint8x4_t:" } } */ ++/* { dg-final { scan-assembler "_Z2h211svint16x4_t:" } } */ ++/* { dg-final { scan-assembler "_Z2h311svint32x4_t:" } } */ ++/* { dg-final { scan-assembler "_Z2h411svint64x4_t:" } } */ ++/* { dg-final { scan-assembler "_Z2h511svuint8x4_t:" } } */ ++/* { dg-final { scan-assembler "_Z2h612svuint16x4_t:" } } */ ++/* { dg-final { scan-assembler "_Z2h712svuint32x4_t:" } } */ ++/* { dg-final { scan-assembler "_Z2h812svuint64x4_t:" } } */ ++/* { dg-final { scan-assembler "_Z2h913svfloat16x4_t:" } } */ ++/* { dg-final { 
scan-assembler "_Z3h1013svfloat32x4_t:" } } */ ++/* { dg-final { scan-assembler "_Z3h1113svfloat64x4_t:" } } */ +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/set2_1.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/set2_1.C +new file mode 100644 +index 000000000..80c3ad74f +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/set2_1.C +@@ -0,0 +1,45 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-std=c++11 -Wall -Wextra" } */ ++ ++#include ++ ++constexpr uint64_t const_sub (uint64_t a, uint64_t b) { return a - b; } ++uint64_t add (uint64_t a, uint64_t b) { return a + b; } ++ ++svfloat64_t ++f1 (svbool_t pg, svuint8_t u8, svuint8x2_t u8x2, svint8x2_t s8x2, ++ svuint8x3_t u8x3, int x) ++{ ++ const int one = 1; ++ svfloat64_t f64; ++ ++ u8x2 = svset2 (u8x2); /* { dg-error {no matching function for call to 'svset2\(svuint8x2_t\&\)'} } */ ++ u8x2 = svset2 (u8x2, 1); /* { dg-error {no matching function for call to 'svset2\(svuint8x2_t\&, int\)'} } */ ++ u8x2 = svset2 (u8x2, 1, u8, 2); /* { dg-error {no matching function for call to 'svset2\(svuint8x2_t\&, int, svuint8_t\&, int\)'} } */ ++ u8x2 = svset2 (u8, 0, u8); /* { dg-error {no matching function for call to 'svset2\(svuint8_t\&, int, svuint8_t\&\)'} } */ ++ u8x2 = svset2 (s8x2, 0, u8); /* { dg-error {no matching function for call to 'svset2\(svint8x2_t\&, int, svuint8_t\&\)'} } */ ++ u8x2 = svset2 (u8x3, 0, u8); /* { dg-error {no matching function for call to 'svset2\(svuint8x3_t\&, int, svuint8_t\&\)'} } */ ++ u8x2 = svset2 (pg, 0, u8); /* { dg-error {no matching function for call to 'svset2\(svbool_t\&, int, svuint8_t\&\)'} } */ ++ u8x2 = svset2 (u8x2, 0, f64); /* { dg-error {no matching function for call to 'svset2\(svuint8x2_t\&, int, svfloat64_t\&\)'} } */ ++ u8x2 = svset2 (u8x2, 0, u8x2); /* { dg-error {no matching function for call to 'svset2\(svuint8x2_t\&, int, svuint8x2_t\&\)'} } */ ++ u8x2 = svset2 (u8x2, 0, pg); /* { dg-error {no matching function for call to 'svset2\(svuint8x2_t\&, int, svbool_t\&\)'} } */ ++ u8x2 = svset2 (u8x2, x, u8); /* { dg-error "argument 2 of 'svset2' must be an integer constant expression" } */ ++ u8x2 = svset2 (u8x2, 0, u8); ++ s8x2 = svset2 (u8x2, 0, u8); /* { dg-error {cannot convert 'svuint8x2_t' to 'svint8x2_t' in assignment} } */ ++ u8x2 = svset2 (u8x2, 1, u8); ++ u8x2 = svset2 (u8x2, 2, u8); /* { dg-error {passing 2 to argument 2 of 'svset2', which expects a value in the range \[0, 1\]} } */ ++ u8x2 = svset2 (u8x2, 3, u8); /* { dg-error {passing 3 to argument 2 of 'svset2', which expects a value in the range \[0, 1\]} } */ ++ u8x2 = svset2 (u8x2, 4, u8); /* { dg-error {passing 4 to argument 2 of 'svset2', which expects a value in the range \[0, 1\]} } */ ++ u8x2 = svset2 (u8x2, 5, u8); /* { dg-error {passing 5 to argument 2 of 'svset2', which expects a value in the range \[0, 1\]} } */ ++ u8x2 = svset2 (u8x2, ~0U, u8); /* { dg-error {passing [^ ]* to argument 2 of 'svset2', which expects a value in the range \[0, 1\]} } */ ++ u8x2 = svset2 (u8x2, one, u8); ++ u8x2 = svset2 (u8x2, 3 - 2, u8); ++ u8x2 = svset2 (u8x2, 1.0, u8); ++ u8x2 = svset2 (u8x2, const_sub (5, 4), u8); ++ u8x2 = svset2 (u8x2, const_sub (6, 4), u8); /* { dg-error {passing 2 to argument 2 of 'svset2', which expects a value in the range \[0, 1\]} } */ ++ u8x2 = svset2 (u8x2, const_sub (7, 4), u8); /* { dg-error {passing 3 to argument 2 of 'svset2', which expects a value in the range \[0, 1\]} } */ ++ u8x2 = svset2 (u8x2, const_sub (8, 4), u8); /* { dg-error {passing 4 
to argument 2 of 'svset2', which expects a value in the range \[0, 1\]} } */ ++ u8x2 = svset2 (u8x2, add (0, 0), u8); /* { dg-error "argument 2 of 'svset2' must be an integer constant expression" } */ ++ ++ return f64; ++} +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/set2_2.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/set2_2.C +new file mode 100644 +index 000000000..1433b78ba +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/set2_2.C +@@ -0,0 +1,45 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-std=c++11 -Wall -Wextra" } */ ++ ++#include ++ ++constexpr uint64_t const_sub (uint64_t a, uint64_t b) { return a - b; } ++uint64_t add (uint64_t a, uint64_t b) { return a + b; } ++ ++svfloat64_t ++f1 (svbool_t pg, svuint8_t u8, svuint8x2_t u8x2, svint8x2_t s8x2, ++ svuint8x3_t u8x3, int x) ++{ ++ const int one = 1; ++ svfloat64_t f64; ++ ++ u8x2 = svset2_u8 (u8x2); /* { dg-error {too few arguments to function '[^']*'} } */ ++ u8x2 = svset2_u8 (u8x2, 1); /* { dg-error {too few arguments to function '[^']*'} } */ ++ u8x2 = svset2_u8 (u8x2, 1, u8, 2); /* { dg-error {too many arguments to function '[^']*'} } */ ++ u8x2 = svset2_u8 (u8, 0, u8); /* { dg-error {cannot convert 'svuint8_t' to 'svuint8x2_t'} } */ ++ u8x2 = svset2_u8 (s8x2, 0, u8); /* { dg-error {cannot convert 'svint8x2_t' to 'svuint8x2_t'} } */ ++ u8x2 = svset2_u8 (u8x3, 0, u8); /* { dg-error {cannot convert 'svuint8x3_t' to 'svuint8x2_t'} } */ ++ u8x2 = svset2_u8 (pg, 0, u8); /* { dg-error {cannot convert 'svbool_t' to 'svuint8x2_t'} } */ ++ u8x2 = svset2_u8 (u8x2, 0, f64); /* { dg-error {cannot convert 'svfloat64_t' to 'svuint8_t'} } */ ++ u8x2 = svset2_u8 (u8x2, 0, u8x2); /* { dg-error {cannot convert 'svuint8x2_t' to 'svuint8_t'} } */ ++ u8x2 = svset2_u8 (u8x2, 0, pg); /* { dg-error {cannot convert 'svbool_t' to 'svuint8_t'} } */ ++ u8x2 = svset2_u8 (u8x2, x, u8); /* { dg-error "argument 2 of 'svset2_u8' must be an integer constant expression" } */ ++ u8x2 = svset2_u8 (u8x2, 0, u8); ++ s8x2 = svset2_u8 (u8x2, 0, u8); /* { dg-error {cannot convert 'svuint8x2_t' to 'svint8x2_t' in assignment} } */ ++ u8x2 = svset2_u8 (u8x2, 1, u8); ++ u8x2 = svset2_u8 (u8x2, 2, u8); /* { dg-error {passing 2 to argument 2 of 'svset2_u8', which expects a value in the range \[0, 1\]} } */ ++ u8x2 = svset2_u8 (u8x2, 3, u8); /* { dg-error {passing 3 to argument 2 of 'svset2_u8', which expects a value in the range \[0, 1\]} } */ ++ u8x2 = svset2_u8 (u8x2, 4, u8); /* { dg-error {passing 4 to argument 2 of 'svset2_u8', which expects a value in the range \[0, 1\]} } */ ++ u8x2 = svset2_u8 (u8x2, 5, u8); /* { dg-error {passing 5 to argument 2 of 'svset2_u8', which expects a value in the range \[0, 1\]} } */ ++ u8x2 = svset2_u8 (u8x2, ~0U, u8); /* { dg-error {passing [^ ]* to argument 2 of 'svset2_u8', which expects a value in the range \[0, 1\]} } */ ++ u8x2 = svset2_u8 (u8x2, one, u8); ++ u8x2 = svset2_u8 (u8x2, 3 - 2, u8); ++ u8x2 = svset2_u8 (u8x2, 1.0, u8); ++ u8x2 = svset2_u8 (u8x2, const_sub (5, 4), u8); ++ u8x2 = svset2_u8 (u8x2, const_sub (6, 4), u8); /* { dg-error {passing 2 to argument 2 of 'svset2_u8', which expects a value in the range \[0, 1\]} } */ ++ u8x2 = svset2_u8 (u8x2, const_sub (7, 4), u8); /* { dg-error {passing 3 to argument 2 of 'svset2_u8', which expects a value in the range \[0, 1\]} } */ ++ u8x2 = svset2_u8 (u8x2, const_sub (8, 4), u8); /* { dg-error {passing 4 to argument 2 of 'svset2_u8', which expects a value in the range \[0, 1\]} } */ ++ u8x2 = svset2_u8 
(u8x2, add (0, 0), u8); /* { dg-error "argument 2 of 'svset2_u8' must be an integer constant expression" } */ ++ ++ return f64; ++} +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/set3_1.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/set3_1.C +new file mode 100644 +index 000000000..9bb4f7a04 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/set3_1.C +@@ -0,0 +1,45 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-std=c++11 -Wall -Wextra" } */ ++ ++#include ++ ++constexpr uint64_t const_sub (uint64_t a, uint64_t b) { return a - b; } ++uint64_t add (uint64_t a, uint64_t b) { return a + b; } ++ ++svfloat64_t ++f1 (svbool_t pg, svfloat16_t f16, svfloat16x3_t f16x3, svuint16x3_t u16x3, ++ svfloat16x4_t f16x4, int x) ++{ ++ const int one = 1; ++ svfloat64_t f64; ++ ++ f16x3 = svset3 (f16x3); /* { dg-error {no matching function for call to 'svset3\(svfloat16x3_t\&\)'} } */ ++ f16x3 = svset3 (f16x3, 1); /* { dg-error {no matching function for call to 'svset3\(svfloat16x3_t\&, int\)'} } */ ++ f16x3 = svset3 (f16x3, 1, f16, 2); /* { dg-error {no matching function for call to 'svset3\(svfloat16x3_t\&, int, svfloat16_t\&, int\)'} } */ ++ f16x3 = svset3 (f16, 0, f16); /* { dg-error {no matching function for call to 'svset3\(svfloat16_t\&, int, svfloat16_t\&\)'} } */ ++ f16x3 = svset3 (u16x3, 0, f16); /* { dg-error {no matching function for call to 'svset3\(svuint16x3_t\&, int, svfloat16_t\&\)'} } */ ++ f16x3 = svset3 (f16x4, 0, f16); /* { dg-error {no matching function for call to 'svset3\(svfloat16x4_t\&, int, svfloat16_t\&\)'} } */ ++ f16x3 = svset3 (pg, 0, f16); /* { dg-error {no matching function for call to 'svset3\(svbool_t\&, int, svfloat16_t\&\)'} } */ ++ f16x3 = svset3 (f16x3, 0, f64); /* { dg-error {no matching function for call to 'svset3\(svfloat16x3_t\&, int, svfloat64_t\&\)'} } */ ++ f16x3 = svset3 (f16x3, 0, f16x3); /* { dg-error {no matching function for call to 'svset3\(svfloat16x3_t\&, int, svfloat16x3_t\&\)'} } */ ++ f16x3 = svset3 (f16x3, 0, pg); /* { dg-error {no matching function for call to 'svset3\(svfloat16x3_t\&, int, svbool_t\&\)'} } */ ++ f16x3 = svset3 (f16x3, x, f16); /* { dg-error "argument 2 of 'svset3' must be an integer constant expression" } */ ++ f16x3 = svset3 (f16x3, 0, f16); ++ u16x3 = svset3 (f16x3, 0, f16); /* { dg-error {cannot convert 'svfloat16x3_t' to 'svuint16x3_t' in assignment} } */ ++ f16x3 = svset3 (f16x3, 1, f16); ++ f16x3 = svset3 (f16x3, 2, f16); ++ f16x3 = svset3 (f16x3, 3, f16); /* { dg-error {passing 3 to argument 2 of 'svset3', which expects a value in the range \[0, 2\]} } */ ++ f16x3 = svset3 (f16x3, 4, f16); /* { dg-error {passing 4 to argument 2 of 'svset3', which expects a value in the range \[0, 2\]} } */ ++ f16x3 = svset3 (f16x3, 5, f16); /* { dg-error {passing 5 to argument 2 of 'svset3', which expects a value in the range \[0, 2\]} } */ ++ f16x3 = svset3 (f16x3, ~0U, f16); /* { dg-error {passing [^ ]* to argument 2 of 'svset3', which expects a value in the range \[0, 2\]} } */ ++ f16x3 = svset3 (f16x3, one, f16); ++ f16x3 = svset3 (f16x3, 3 - 2, f16); ++ f16x3 = svset3 (f16x3, 1.0, f16); ++ f16x3 = svset3 (f16x3, const_sub (5, 4), f16); ++ f16x3 = svset3 (f16x3, const_sub (6, 4), f16); ++ f16x3 = svset3 (f16x3, const_sub (7, 4), f16); /* { dg-error {passing 3 to argument 2 of 'svset3', which expects a value in the range \[0, 2\]} } */ ++ f16x3 = svset3 (f16x3, const_sub (8, 4), f16); /* { dg-error {passing 4 to argument 2 of 'svset3', which expects a value in the range 
\[0, 2\]} } */ ++ f16x3 = svset3 (f16x3, add (0, 0), f16); /* { dg-error "argument 2 of 'svset3' must be an integer constant expression" } */ ++ ++ return f64; ++} +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/set3_2.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/set3_2.C +new file mode 100644 +index 000000000..0bb604924 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/set3_2.C +@@ -0,0 +1,45 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-std=c++11 -Wall -Wextra" } */ ++ ++#include ++ ++constexpr uint64_t const_sub (uint64_t a, uint64_t b) { return a - b; } ++uint64_t add (uint64_t a, uint64_t b) { return a + b; } ++ ++svfloat64_t ++f1 (svbool_t pg, svfloat16_t f16, svfloat16x3_t f16x3, svuint16x3_t u16x3, ++ svfloat16x4_t f16x4, int x) ++{ ++ const int one = 1; ++ svfloat64_t f64; ++ ++ f16x3 = svset3_f16 (f16x3); /* { dg-error {too few arguments to function '[^']*'} } */ ++ f16x3 = svset3_f16 (f16x3, 1); /* { dg-error {too few arguments to function '[^']*'} } */ ++ f16x3 = svset3_f16 (f16x3, 1, f16, 2); /* { dg-error {too many arguments to function '[^']*'} } */ ++ f16x3 = svset3_f16 (f16, 0, f16); /* { dg-error {cannot convert 'svfloat16_t' to 'svfloat16x3_t'} } */ ++ f16x3 = svset3_f16 (u16x3, 0, f16); /* { dg-error {cannot convert 'svuint16x3_t' to 'svfloat16x3_t'} } */ ++ f16x3 = svset3_f16 (f16x4, 0, f16); /* { dg-error {cannot convert 'svfloat16x4_t' to 'svfloat16x3_t'} } */ ++ f16x3 = svset3_f16 (pg, 0, f16); /* { dg-error {cannot convert 'svbool_t' to 'svfloat16x3_t'} } */ ++ f16x3 = svset3_f16 (f16x3, 0, f64); /* { dg-error {cannot convert 'svfloat64_t' to 'svfloat16_t'} } */ ++ f16x3 = svset3_f16 (f16x3, 0, f16x3); /* { dg-error {cannot convert 'svfloat16x3_t' to 'svfloat16_t'} } */ ++ f16x3 = svset3_f16 (f16x3, 0, pg); /* { dg-error {cannot convert 'svbool_t' to 'svfloat16_t'} } */ ++ f16x3 = svset3_f16 (f16x3, x, f16); /* { dg-error "argument 2 of 'svset3_f16' must be an integer constant expression" } */ ++ f16x3 = svset3_f16 (f16x3, 0, f16); ++ u16x3 = svset3_f16 (f16x3, 0, f16); /* { dg-error {cannot convert 'svfloat16x3_t' to 'svuint16x3_t' in assignment} } */ ++ f16x3 = svset3_f16 (f16x3, 1, f16); ++ f16x3 = svset3_f16 (f16x3, 2, f16); ++ f16x3 = svset3_f16 (f16x3, 3, f16); /* { dg-error {passing 3 to argument 2 of 'svset3_f16', which expects a value in the range \[0, 2\]} } */ ++ f16x3 = svset3_f16 (f16x3, 4, f16); /* { dg-error {passing 4 to argument 2 of 'svset3_f16', which expects a value in the range \[0, 2\]} } */ ++ f16x3 = svset3_f16 (f16x3, 5, f16); /* { dg-error {passing 5 to argument 2 of 'svset3_f16', which expects a value in the range \[0, 2\]} } */ ++ f16x3 = svset3_f16 (f16x3, ~0U, f16); /* { dg-error {passing [^ ]* to argument 2 of 'svset3_f16', which expects a value in the range \[0, 2\]} } */ ++ f16x3 = svset3_f16 (f16x3, one, f16); ++ f16x3 = svset3_f16 (f16x3, 3 - 2, f16); ++ f16x3 = svset3_f16 (f16x3, 1.0, f16); ++ f16x3 = svset3_f16 (f16x3, const_sub (5, 4), f16); ++ f16x3 = svset3_f16 (f16x3, const_sub (6, 4), f16); ++ f16x3 = svset3_f16 (f16x3, const_sub (7, 4), f16); /* { dg-error {passing 3 to argument 2 of 'svset3_f16', which expects a value in the range \[0, 2\]} } */ ++ f16x3 = svset3_f16 (f16x3, const_sub (8, 4), f16); /* { dg-error {passing 4 to argument 2 of 'svset3_f16', which expects a value in the range \[0, 2\]} } */ ++ f16x3 = svset3_f16 (f16x3, add (0, 0), f16); /* { dg-error "argument 2 of 'svset3_f16' must be an integer constant expression" } */ ++ ++ return 
f64; ++} +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/set4_1.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/set4_1.C +new file mode 100644 +index 000000000..dc5dae872 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/set4_1.C +@@ -0,0 +1,45 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-std=c++11 -Wall -Wextra" } */ ++ ++#include ++ ++constexpr uint64_t const_sub (uint64_t a, uint64_t b) { return a - b; } ++uint64_t add (uint64_t a, uint64_t b) { return a + b; } ++ ++svfloat64_t ++f1 (svbool_t pg, svint32_t s32, svint32x4_t s32x4, svfloat32x4_t f32x4, ++ svint32x2_t s32x2, int x) ++{ ++ const int one = 1; ++ svfloat64_t f64; ++ ++ s32x4 = svset4 (s32x4); /* { dg-error {no matching function for call to 'svset4\(svint32x4_t\&\)'} } */ ++ s32x4 = svset4 (s32x4, 1); /* { dg-error {no matching function for call to 'svset4\(svint32x4_t\&, int\)'} } */ ++ s32x4 = svset4 (s32x4, 1, s32, 2); /* { dg-error {no matching function for call to 'svset4\(svint32x4_t\&, int, svint32_t\&, int\)'} } */ ++ s32x4 = svset4 (s32, 0, s32); /* { dg-error {no matching function for call to 'svset4\(svint32_t\&, int, svint32_t\&\)'} } */ ++ s32x4 = svset4 (f32x4, 0, s32); /* { dg-error {no matching function for call to 'svset4\(svfloat32x4_t\&, int, svint32_t\&\)'} } */ ++ s32x4 = svset4 (s32x2, 0, s32); /* { dg-error {no matching function for call to 'svset4\(svint32x2_t\&, int, svint32_t\&\)'} } */ ++ s32x4 = svset4 (pg, 0, s32); /* { dg-error {no matching function for call to 'svset4\(svbool_t\&, int, svint32_t\&\)'} } */ ++ s32x4 = svset4 (s32x4, 0, f64); /* { dg-error {no matching function for call to 'svset4\(svint32x4_t\&, int, svfloat64_t\&\)'} } */ ++ s32x4 = svset4 (s32x4, 0, s32x4); /* { dg-error {no matching function for call to 'svset4\(svint32x4_t\&, int, svint32x4_t\&\)'} } */ ++ s32x4 = svset4 (s32x4, 0, pg); /* { dg-error {no matching function for call to 'svset4\(svint32x4_t\&, int, svbool_t\&\)'} } */ ++ s32x4 = svset4 (s32x4, x, s32); /* { dg-error "argument 2 of 'svset4' must be an integer constant expression" } */ ++ s32x4 = svset4 (s32x4, 0, s32); ++ f32x4 = svset4 (s32x4, 0, s32); /* { dg-error {cannot convert 'svint32x4_t' to 'svfloat32x4_t' in assignment} } */ ++ s32x4 = svset4 (s32x4, 1, s32); ++ s32x4 = svset4 (s32x4, 2, s32); ++ s32x4 = svset4 (s32x4, 3, s32); ++ s32x4 = svset4 (s32x4, 4, s32); /* { dg-error {passing 4 to argument 2 of 'svset4', which expects a value in the range \[0, 3\]} } */ ++ s32x4 = svset4 (s32x4, 5, s32); /* { dg-error {passing 5 to argument 2 of 'svset4', which expects a value in the range \[0, 3\]} } */ ++ s32x4 = svset4 (s32x4, ~0U, s32); /* { dg-error {passing [^ ]* to argument 2 of 'svset4', which expects a value in the range \[0, 3\]} } */ ++ s32x4 = svset4 (s32x4, one, s32); ++ s32x4 = svset4 (s32x4, 3 - 2, s32); ++ s32x4 = svset4 (s32x4, 1.0, s32); ++ s32x4 = svset4 (s32x4, const_sub (5, 4), s32); ++ s32x4 = svset4 (s32x4, const_sub (6, 4), s32); ++ s32x4 = svset4 (s32x4, const_sub (7, 4), s32); ++ s32x4 = svset4 (s32x4, const_sub (8, 4), s32); /* { dg-error {passing 4 to argument 2 of 'svset4', which expects a value in the range \[0, 3\]} } */ ++ s32x4 = svset4 (s32x4, add (0, 0), s32); /* { dg-error "argument 2 of 'svset4' must be an integer constant expression" } */ ++ ++ return f64; ++} +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/set4_2.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/set4_2.C +new file mode 100644 +index 000000000..762a6db74 +--- 
/dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/set4_2.C +@@ -0,0 +1,45 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-std=c++11 -Wall -Wextra" } */ ++ ++#include <arm_sve.h> ++ ++constexpr uint64_t const_sub (uint64_t a, uint64_t b) { return a - b; } ++uint64_t add (uint64_t a, uint64_t b) { return a + b; } ++ ++svfloat64_t ++f1 (svbool_t pg, svint32_t s32, svint32x4_t s32x4, svfloat32x4_t f32x4, ++ svint32x2_t s32x2, int x) ++{ ++ const int one = 1; ++ svfloat64_t f64; ++ ++ s32x4 = svset4_s32 (s32x4); /* { dg-error {too few arguments to function '[^']*'} } */ ++ s32x4 = svset4_s32 (s32x4, 1); /* { dg-error {too few arguments to function '[^']*'} } */ ++ s32x4 = svset4_s32 (s32x4, 1, s32, 2); /* { dg-error {too many arguments to function '[^']*'} } */ ++ s32x4 = svset4_s32 (s32, 0, s32); /* { dg-error {cannot convert 'svint32_t' to 'svint32x4_t'} } */ ++ s32x4 = svset4_s32 (f32x4, 0, s32); /* { dg-error {cannot convert 'svfloat32x4_t' to 'svint32x4_t'} } */ ++ s32x4 = svset4_s32 (s32x2, 0, s32); /* { dg-error {cannot convert 'svint32x2_t' to 'svint32x4_t'} } */ ++ s32x4 = svset4_s32 (pg, 0, s32); /* { dg-error {cannot convert 'svbool_t' to 'svint32x4_t'} } */ ++ s32x4 = svset4_s32 (s32x4, 0, f64); /* { dg-error {cannot convert 'svfloat64_t' to 'svint32_t'} } */ ++ s32x4 = svset4_s32 (s32x4, 0, s32x4); /* { dg-error {cannot convert 'svint32x4_t' to 'svint32_t'} } */ ++ s32x4 = svset4_s32 (s32x4, 0, pg); /* { dg-error {cannot convert 'svbool_t' to 'svint32_t'} } */ ++ s32x4 = svset4_s32 (s32x4, x, s32); /* { dg-error "argument 2 of 'svset4_s32' must be an integer constant expression" } */ ++ s32x4 = svset4_s32 (s32x4, 0, s32); ++ f32x4 = svset4_s32 (s32x4, 0, s32); /* { dg-error {cannot convert 'svint32x4_t' to 'svfloat32x4_t' in assignment} } */ ++ s32x4 = svset4_s32 (s32x4, 1, s32); ++ s32x4 = svset4_s32 (s32x4, 2, s32); ++ s32x4 = svset4_s32 (s32x4, 3, s32); ++ s32x4 = svset4_s32 (s32x4, 4, s32); /* { dg-error {passing 4 to argument 2 of 'svset4_s32', which expects a value in the range \[0, 3\]} } */ ++ s32x4 = svset4_s32 (s32x4, 5, s32); /* { dg-error {passing 5 to argument 2 of 'svset4_s32', which expects a value in the range \[0, 3\]} } */ ++ s32x4 = svset4_s32 (s32x4, ~0U, s32); /* { dg-error {passing [^ ]* to argument 2 of 'svset4_s32', which expects a value in the range \[0, 3\]} } */ ++ s32x4 = svset4_s32 (s32x4, one, s32); ++ s32x4 = svset4_s32 (s32x4, 3 - 2, s32); ++ s32x4 = svset4_s32 (s32x4, 1.0, s32); ++ s32x4 = svset4_s32 (s32x4, const_sub (5, 4), s32); ++ s32x4 = svset4_s32 (s32x4, const_sub (6, 4), s32); ++ s32x4 = svset4_s32 (s32x4, const_sub (7, 4), s32); ++ s32x4 = svset4_s32 (s32x4, const_sub (8, 4), s32); /* { dg-error {passing 4 to argument 2 of 'svset4_s32', which expects a value in the range \[0, 3\]} } */ ++ s32x4 = svset4_s32 (s32x4, add (0, 0), s32); /* { dg-error "argument 2 of 'svset4_s32' must be an integer constant expression" } */ ++ ++ return f64; ++} +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_1.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_1.c +new file mode 100644 +index 000000000..ff2590032 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_1.c +@@ -0,0 +1,5 @@ ++/* { dg-do compile } */ ++ ++int svbool_t; /* { dg-message "note: previous declaration 'int svbool_t'" } */ ++ ++#pragma GCC aarch64 "arm_sve.h" /* { dg-error {'typedef [^'\n]* svbool_t' redeclared as different kind of entity} } */ +diff --git 
a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_10.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_10.c +new file mode 100644 +index 000000000..86d87fa37 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_10.c +@@ -0,0 +1,5 @@ ++/* { dg-do compile } */ ++ ++typedef int svint8x2_t; ++ ++#pragma GCC aarch64 "arm_sve.h" /* { dg-error {conflicting declaration 'typedef struct svint8x2_t svint8x2_t'} } */ +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_11.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_11.c +new file mode 100644 +index 000000000..741d10eaf +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_11.c +@@ -0,0 +1,7 @@ ++/* { dg-do compile } */ ++ ++struct svint8x2_t; ++ ++#pragma GCC aarch64 "arm_sve.h" /* { dg-error {conflicting declaration 'typedef struct svint8x2_t svint8x2_t'} } */ ++ ++svint8_t f (svint8x2_t x) { return x.__val[0]; } /* { dg-error {'x' has incomplete type} } */ +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_12.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_12.c +new file mode 100644 +index 000000000..fc6a07ac6 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_12.c +@@ -0,0 +1,7 @@ ++/* { dg-do compile } */ ++ ++typedef struct svint8x2_t svint8x2_t; ++ ++#pragma GCC aarch64 "arm_sve.h" /* { dg-error {conflicting declaration 'typedef struct svint8x2_t svint8x2_t'} } */ ++ ++svint8_t f (svint8x2_t x) { return x.__val[0]; } /* { dg-error {'x' has incomplete type} } */ +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_13.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_13.c +new file mode 100644 +index 000000000..161aacb7b +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_13.c +@@ -0,0 +1,5 @@ ++/* { dg-do compile } */ ++ ++struct svint8x2_t {}; ++ ++#pragma GCC aarch64 "arm_sve.h" /* { dg-error {conflicting declaration 'typedef struct svint8x2_t svint8x2_t'} } */ +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_14.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_14.c +new file mode 100644 +index 000000000..83191118f +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_14.c +@@ -0,0 +1,5 @@ ++/* { dg-do compile } */ ++ ++enum svpattern { FOO }; /* { dg-message "note: previous definition here" } */ ++ ++#pragma GCC aarch64 "arm_sve.h" /* { dg-error "multiple definition of 'enum svpattern'" } */ +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_15.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_15.c +new file mode 100644 +index 000000000..71e35a4eb +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_15.c +@@ -0,0 +1,8 @@ ++/* { dg-do compile } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++enum svpattern { FOO }; /* { dg-error "multiple definition of 'enum svpattern'" } */ ++enum foo { SV_ALL }; /* { dg-error "'SV_ALL' conflicts with a previous declaration" } */ ++typedef int SV_POW2; /* { dg-error "'typedef int SV_POW2' redeclared as different kind of entity" } */ ++int SV_VL3; /* { dg-error "'int SV_VL3' redeclared as different kind of entity" } */ +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_16.c 
b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_16.c +new file mode 100644 +index 000000000..277064d31 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_16.c +@@ -0,0 +1,5 @@ ++/* { dg-do compile } */ ++ ++struct svpattern { int x; }; ++ ++#pragma GCC aarch64 "arm_sve.h" /* { dg-error "'svpattern' referred to as enum" } */ +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_17.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_17.c +new file mode 100644 +index 000000000..e4bcda6fb +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_17.c +@@ -0,0 +1,5 @@ ++/* { dg-do compile } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++struct svpattern { int x; }; /* { dg-error "'svpattern' referred to as 'struct'" } */ +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_18.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_18.c +new file mode 100644 +index 000000000..b6706150b +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_18.c +@@ -0,0 +1,5 @@ ++/* { dg-do compile } */ ++ ++int svpattern; /* OK in C. */ ++ ++#pragma GCC aarch64 "arm_sve.h" +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_19.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_19.c +new file mode 100644 +index 000000000..c6379f762 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_19.c +@@ -0,0 +1,5 @@ ++/* { dg-do compile } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++int svpattern; /* OK in C. */ +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_2.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_2.c +new file mode 100644 +index 000000000..5baf59932 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_2.c +@@ -0,0 +1,5 @@ ++/* { dg-do compile } */ ++ ++int svint8_t; /* { dg-message "note: previous declaration 'int svint8_t" } */ ++ ++#pragma GCC aarch64 "arm_sve.h" /* { dg-error {'typedef [^'\n]* svint8_t' redeclared as different kind of entity} } */ +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_20.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_20.c +new file mode 100644 +index 000000000..3ba19f596 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_20.c +@@ -0,0 +1,9 @@ ++/* { dg-do compile } */ ++ ++enum foo { SV_VL4 }; ++typedef int SV_POW2; ++int SV_ALL; ++ ++#pragma GCC aarch64 "arm_sve.h" /* { dg-error "'SV_VL4' conflicts with a previous declaration" } */ ++/* { dg-error "'SV_POW2' redeclared as different kind of entity" "" { target *-*-* } .-1 } */ ++/* { dg-error "'SV_ALL' redeclared as different kind of entity" "" { target *-*-* } .-2 } */ +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_3.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_3.c +new file mode 100644 +index 000000000..a8d7bdcc7 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_3.c +@@ -0,0 +1,5 @@ ++/* { dg-do compile } */ ++ ++int svuint16_t; /* { dg-message "note: previous declaration 'int svuint16_t'" } */ ++ ++#pragma GCC aarch64 "arm_sve.h" /* { dg-error {'typedef [^'\n]* svuint16_t' redeclared as different kind of entity} } */ +diff --git 
a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_4.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_4.c +new file mode 100644 +index 000000000..c0563d0ee +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_4.c +@@ -0,0 +1,5 @@ ++/* { dg-do compile } */ ++ ++int svfloat32_t; /* { dg-message "note: previous declaration 'int svfloat32_t'" } */ ++ ++#pragma GCC aarch64 "arm_sve.h" /* { dg-error {'typedef [^'\n]* svfloat32_t' redeclared as different kind of entity} } */ +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_5.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_5.c +new file mode 100644 +index 000000000..ee28e9527 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_5.c +@@ -0,0 +1,5 @@ ++/* { dg-do compile } */ ++ ++typedef int svbool_t; /* { dg-message "note: previous declaration as 'typedef int svbool_t'" } */ ++ ++#pragma GCC aarch64 "arm_sve.h" /* { dg-error {conflicting declaration '[^'\n]* svbool_t'} } */ +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_6.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_6.c +new file mode 100644 +index 000000000..85c17eab6 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_6.c +@@ -0,0 +1,6 @@ ++/* { dg-do compile } */ ++ ++typedef __SVBool_t svbool_t; ++ ++#pragma GCC aarch64 "arm_sve.h" ++ +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_7.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_7.c +new file mode 100644 +index 000000000..3a0dfb1c0 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_7.c +@@ -0,0 +1,8 @@ ++/* { dg-do compile } */ ++ ++int svint8x2_t; ++ ++#pragma GCC aarch64 "arm_sve.h" /* { dg-error {'typedef struct svint8x2_t svint8x2_t' redeclared as different kind of entity} } */ ++ ++void f (struct svint8x2_t) {} /* { dg-error {incomplete type} } */ ++void g () { int &x = svint8x2_t; } +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_8.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_8.c +new file mode 100644 +index 000000000..9b0df9137 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_8.c +@@ -0,0 +1,7 @@ ++/* { dg-do compile } */ ++ ++struct svint8x2_t; ++ ++#pragma GCC aarch64 "arm_sve.h" /* { dg-error {conflicting declaration 'typedef struct svint8x2_t svint8x2_t'} } */ ++ ++void f (svint8x2_t) {} /* { dg-error {incomplete type} } */ +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_9.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_9.c +new file mode 100644 +index 000000000..43068da78 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_9.c +@@ -0,0 +1,8 @@ ++/* { dg-do compile } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++int svint8x2_t; /* { dg-error {'int svint8x2_t' redeclared as different kind of entity} } */ ++ ++void f (struct svint8x2_t) {} /* { dg-error {using typedef-name 'svint8x2_t' after 'struct'} } */ ++void g () { int &x = svint8x2_t; } /* { dg-error {expected primary-expression before ';' token} } */ +diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/whilele_1.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/whilele_1.C +new file mode 100644 +index 000000000..9571e668b +--- /dev/null ++++ 
b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/whilele_1.C +@@ -0,0 +1,81 @@ ++// { dg-do compile } ++ ++#include ++ ++enum foo { A, B }; ++ ++void ++test (int8_t s8, int16_t s16, int32_t s32, int64_t s64, ++ uint8_t u8, uint16_t u16, uint32_t u32, uint64_t u64, ++ bool b, foo e, int *ptr, float f32, svbool_t pg, ++ svint32_t vec) ++{ ++ svwhilele_b8 (s32); // { dg-error {no matching function for call to 'svwhilele_b8\(int32_t&\)'} } ++ svwhilele_b8 (s32, s32, s32); // { dg-error {no matching function for call to 'svwhilele_b8\(int32_t&, int32_t&, int32_t&\)'} } ++ ++ svwhilele_b8 (b, b); ++ svwhilele_b8 (e, e); ++ svwhilele_b8 (s8, s8); ++ svwhilele_b8 (u8, u8); ++ svwhilele_b8 (s16, s16); ++ svwhilele_b8 (u16, u16); ++ svwhilele_b8 (ptr, ptr); // { dg-error {no matching function for call to 'svwhilele_b8\(int\*&, int\*&\)'} } ++ // { dg-error {invalid conversion from 'int\*' to '[^']*'} "" { target *-*-* } .-1 } ++ svwhilele_b8 (f32, f32); // { dg-error {call of overloaded 'svwhilele_b8\(float&, float&\)' is ambiguous} } ++ svwhilele_b8 (pg, pg); // { dg-error {no matching function for call to 'svwhilele_b8\(svbool_t&, svbool_t&\)'} } ++ svwhilele_b8 (vec, vec); // { dg-error {no matching function for call to 'svwhilele_b8\(svint32_t&, svint32_t&\)'} } ++ ++ svwhilele_b8 (s32, b); ++ svwhilele_b8 (s32, e); ++ svwhilele_b8 (s32, s8); ++ svwhilele_b8 (s32, u8); ++ svwhilele_b8 (s32, s16); ++ svwhilele_b8 (s32, u16); ++ ++ svwhilele_b8 (u32, b); // { dg-error {call of overloaded 'svwhilele_b8\(uint32_t&, bool&\)' is ambiguous} } ++ svwhilele_b8 (u32, e); // { dg-error {call of overloaded 'svwhilele_b8\(uint32_t&, foo&\)' is ambiguous} } ++ svwhilele_b8 (u32, s8); // { dg-error {call of overloaded 'svwhilele_b8\(uint32_t&, int8_t&\)' is ambiguous} } ++ svwhilele_b8 (u32, u8); // { dg-error {call of overloaded 'svwhilele_b8\(uint32_t&, uint8_t&\)' is ambiguous} } ++ svwhilele_b8 (u32, s16); // { dg-error {call of overloaded 'svwhilele_b8\(uint32_t&, int16_t&\)' is ambiguous} } ++ svwhilele_b8 (u32, u16); // { dg-error {call of overloaded 'svwhilele_b8\(uint32_t&, uint16_t&\)' is ambiguous} } ++ ++ svwhilele_b8 (s32, s32); ++ svwhilele_b8 (s32, u32); // { dg-error {call of overloaded 'svwhilele_b8\(int32_t&, uint32_t&\)' is ambiguous} } ++ svwhilele_b8 (s32, s64); // { dg-error {call of overloaded 'svwhilele_b8\(int32_t&, int64_t&\)' is ambiguous} } ++ svwhilele_b8 (s32, u64); // { dg-error {call of overloaded 'svwhilele_b8\(int32_t&, uint64_t&\)' is ambiguous} } ++ ++ svwhilele_b8 (u32, s32); // { dg-error {call of overloaded 'svwhilele_b8\(uint32_t&, int32_t&\)' is ambiguous} } ++ svwhilele_b8 (u32, u32); ++ svwhilele_b8 (u32, s64); // { dg-error {call of overloaded 'svwhilele_b8\(uint32_t&, int64_t&\)' is ambiguous} } ++ svwhilele_b8 (u32, u64); // { dg-error {call of overloaded 'svwhilele_b8\(uint32_t&, uint64_t&\)' is ambiguous} } ++ ++ svwhilele_b8 (s64, s32); // { dg-error {call of overloaded 'svwhilele_b8\(int64_t&, int32_t&\)' is ambiguous} } ++ svwhilele_b8 (s64, u32); // { dg-error {call of overloaded 'svwhilele_b8\(int64_t&, uint32_t&\)' is ambiguous} } ++ svwhilele_b8 (s64, s64); ++ svwhilele_b8 (s64, u64); // { dg-error {call of overloaded 'svwhilele_b8\(int64_t&, uint64_t&\)' is ambiguous} } ++ ++ svwhilele_b8 (u64, s32); // { dg-error {call of overloaded 'svwhilele_b8\(uint64_t&, int32_t&\)' is ambiguous} } ++ svwhilele_b8 (u64, u32); // { dg-error {call of overloaded 'svwhilele_b8\(uint64_t&, uint32_t&\)' is ambiguous} } ++ svwhilele_b8 (u64, s64); // { dg-error {call 
of overloaded 'svwhilele_b8\(uint64_t&, int64_t&\)' is ambiguous} } ++ svwhilele_b8 (u64, u64); ++ ++ svwhilele_b8 (0, s32); ++ svwhilele_b8 (0, u32); // { dg-error {call of overloaded 'svwhilele_b8\(int, uint32_t&\)' is ambiguous} } ++ svwhilele_b8 (0, s64); // { dg-error {call of overloaded 'svwhilele_b8\(int, int64_t&\)' is ambiguous} } ++ svwhilele_b8 (0, u64); // { dg-error {call of overloaded 'svwhilele_b8\(int, uint64_t&\)' is ambiguous} } ++ ++ svwhilele_b8 (s32, 0); ++ svwhilele_b8 (u32, 0); // { dg-error {call of overloaded 'svwhilele_b8\(uint32_t&, int\)' is ambiguous} } ++ svwhilele_b8 (s64, 0); // { dg-error {call of overloaded 'svwhilele_b8\(int64_t&, int\)' is ambiguous} } ++ svwhilele_b8 (u64, 0); // { dg-error {call of overloaded 'svwhilele_b8\(uint64_t&, int\)' is ambiguous} } ++ ++ svwhilele_b8 (0U, s32); // { dg-error {call of overloaded 'svwhilele_b8\(unsigned int, int32_t&\)' is ambiguous} } ++ svwhilele_b8 (0U, u32); ++ svwhilele_b8 (0U, s64); // { dg-error {call of overloaded 'svwhilele_b8\(unsigned int, int64_t&\)' is ambiguous} } ++ svwhilele_b8 (0U, u64); // { dg-error {call of overloaded 'svwhilele_b8\(unsigned int, uint64_t&\)' is ambiguous} } ++ ++ svwhilele_b8 (s32, 0U); // { dg-error {call of overloaded 'svwhilele_b8\(int32_t&, unsigned int\)' is ambiguous} } ++ svwhilele_b8 (u32, 0U); ++ svwhilele_b8 (s64, 0U); // { dg-error {call of overloaded 'svwhilele_b8\(int64_t&, unsigned int\)' is ambiguous} } ++ svwhilele_b8 (u64, 0U); // { dg-error {call of overloaded 'svwhilele_b8\(uint64_t&, unsigned int\)' is ambiguous} } ++} +diff --git a/gcc/testsuite/g++.target/aarch64/sve/catch_7.C b/gcc/testsuite/g++.target/aarch64/sve/catch_7.C +new file mode 100644 +index 000000000..ac10b6984 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/catch_7.C +@@ -0,0 +1,38 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O" } */ ++ ++#include ++ ++void __attribute__ ((noipa)) ++f1 (void) ++{ ++ throw 1; ++} ++ ++void __attribute__ ((noipa)) ++f2 (svbool_t) ++{ ++ register svint8_t z8 asm ("z8") = svindex_s8 (11, 1); ++ asm volatile ("" :: "w" (z8)); ++ f1 (); ++} ++ ++void __attribute__ ((noipa)) ++f3 (int n) ++{ ++ register double d8 asm ("v8") = 42.0; ++ for (int i = 0; i < n; ++i) ++ { ++ asm volatile ("" : "=w" (d8) : "w" (d8)); ++ try { f2 (svptrue_b8 ()); } catch (int) { break; } ++ } ++ if (d8 != 42.0) ++ __builtin_abort (); ++} ++ ++int ++main (void) ++{ ++ f3 (100); ++ return 0; ++} +diff --git a/gcc/testsuite/g++.target/aarch64/sve/dup_sel_1.C b/gcc/testsuite/g++.target/aarch64/sve/dup_sel_1.C +new file mode 100644 +index 000000000..a59862cf9 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/dup_sel_1.C +@@ -0,0 +1,21 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -msve-vector-bits=256" } */ ++ ++#include ++ ++typedef int32_t vnx4si __attribute__((vector_size(32))); ++ ++void ++foo (int32_t val) ++{ ++ register vnx4si x asm ("z0"); ++ register vnx4si y asm ("z0"); ++ asm volatile ("" : "=w" (y)); ++ val += 1; ++ vnx4si z = { val, val, val, val, val, val, val, val }; ++ x = (vnx4si) { -1, 0, 0, -1, 0, -1, 0, -1 } ? 
z : y; ++ asm volatile ("" :: "w" (x)); ++} ++ ++/* { dg-final { scan-assembler {\tmov\tz0\.s, p[0-7]/m, w[0-9]+\n} } } */ ++/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ +diff --git a/gcc/testsuite/g++.target/aarch64/sve/dup_sel_2.C b/gcc/testsuite/g++.target/aarch64/sve/dup_sel_2.C +new file mode 100644 +index 000000000..47aad2d58 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/dup_sel_2.C +@@ -0,0 +1,20 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -msve-vector-bits=256" } */ ++ ++#include ++ ++typedef int32_t vnx4si __attribute__((vector_size(32))); ++ ++void ++foo (int32_t val) ++{ ++ register vnx4si x asm ("z0"); ++ register vnx4si y asm ("z1"); ++ asm volatile ("" : "=w" (y)); ++ val += 1; ++ vnx4si z = { val, val, val, val, val, val, val, val }; ++ x = (vnx4si) { -1, 0, 0, -1, 0, -1, 0, -1 } ? z : y; ++ asm volatile ("" :: "w" (x)); ++} ++ ++/* { dg-final { scan-assembler {\tmovprfx\tz0, z1\n\tmov\tz0\.s, p[0-7]/m, w[0-9]+\n} } } */ +diff --git a/gcc/testsuite/g++.target/aarch64/sve/dup_sel_3.C b/gcc/testsuite/g++.target/aarch64/sve/dup_sel_3.C +new file mode 100644 +index 000000000..e8ec6f8b4 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/dup_sel_3.C +@@ -0,0 +1,21 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -msve-vector-bits=256" } */ ++ ++#include ++ ++typedef int32_t vnx4si __attribute__((vector_size(32))); ++typedef float vnx4sf __attribute__((vector_size(32))); ++ ++void ++foo (float val) ++{ ++ register vnx4sf x asm ("z0"); ++ register vnx4sf y asm ("z0"); ++ asm volatile ("" : "=w" (y)); ++ vnx4sf z = { val, val, val, val, val, val, val, val }; ++ x = (vnx4si) { -1, 0, 0, -1, 0, -1, 0, -1 } ? z : y; ++ asm volatile ("" :: "w" (x)); ++} ++ ++/* { dg-final { scan-assembler {\tmov\tz0\.s, p[0-7]/m, s[0-9]+\n} } } */ ++/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ +diff --git a/gcc/testsuite/g++.target/aarch64/sve/dup_sel_4.C b/gcc/testsuite/g++.target/aarch64/sve/dup_sel_4.C +new file mode 100644 +index 000000000..32ca59439 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/dup_sel_4.C +@@ -0,0 +1,20 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -msve-vector-bits=256" } */ ++ ++#include ++ ++typedef int32_t vnx4si __attribute__((vector_size(32))); ++typedef float vnx4sf __attribute__((vector_size(32))); ++ ++void ++foo (float val) ++{ ++ register vnx4sf x asm ("z0"); ++ register vnx4sf y asm ("z1"); ++ asm volatile ("" : "=w" (y)); ++ vnx4sf z = { val, val, val, val, val, val, val, val }; ++ x = (vnx4si) { -1, 0, 0, -1, 0, -1, 0, -1 } ? z : y; ++ asm volatile ("" :: "w" (x)); ++} ++ ++/* { dg-final { scan-assembler {\tmovprfx\tz0, z1\n\tmov\tz0\.s, p[0-7]/m, s[0-9]+\n} } } */ +diff --git a/gcc/testsuite/g++.target/aarch64/sve/dup_sel_5.C b/gcc/testsuite/g++.target/aarch64/sve/dup_sel_5.C +new file mode 100644 +index 000000000..2fb903a91 +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/dup_sel_5.C +@@ -0,0 +1,18 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -msve-vector-bits=256" } */ ++ ++#include ++ ++typedef int32_t vnx4si __attribute__((vector_size(32))); ++ ++void ++foo (int32_t val) ++{ ++ register vnx4si x asm ("z0"); ++ val += 1; ++ vnx4si y = { val, val, val, val, val, val, val, val }; ++ x = (vnx4si) { -1, 0, 0, -1, 0, -1, 0, -1 } ? 
y : (vnx4si) { 0 }; ++ asm volatile ("" :: "w" (x)); ++} ++ ++/* { dg-final { scan-assembler {\tmovprfx\tz0\.s, p[0-7]/z, z0\.s\n\tmov\tz0\.s, p[0-7]/m, w[0-9]+\n} } } */ +diff --git a/gcc/testsuite/g++.target/aarch64/sve/dup_sel_6.C b/gcc/testsuite/g++.target/aarch64/sve/dup_sel_6.C +new file mode 100644 +index 000000000..f2b0181bb +--- /dev/null ++++ b/gcc/testsuite/g++.target/aarch64/sve/dup_sel_6.C +@@ -0,0 +1,18 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -msve-vector-bits=256" } */ ++ ++#include ++ ++typedef int32_t vnx4si __attribute__((vector_size(32))); ++typedef float vnx4sf __attribute__((vector_size(32))); ++ ++void ++foo (float val) ++{ ++ register vnx4sf x asm ("z0"); ++ vnx4sf y = { val, val, val, val, val, val, val, val }; ++ x = (vnx4si) { -1, 0, 0, -1, 0, -1, 0, -1 } ? y : (vnx4sf) { 0 }; ++ asm volatile ("" :: "w" (x)); ++} ++ ++/* { dg-final { scan-assembler {\tmovprfx\tz0\.s, p[0-7]/z, z0\.s\n\tmov\tz0\.s, p[0-7]/m, s[0-9]+\n} } } */ +diff --git a/gcc/testsuite/gcc.c-torture/execute/builtins/builtins.exp b/gcc/testsuite/gcc.c-torture/execute/builtins/builtins.exp +index acb9eacb4..3560a1ff2 100644 +--- a/gcc/testsuite/gcc.c-torture/execute/builtins/builtins.exp ++++ b/gcc/testsuite/gcc.c-torture/execute/builtins/builtins.exp +@@ -37,7 +37,7 @@ load_lib c-torture.exp + torture-init + set-torture-options $C_TORTURE_OPTIONS {{}} $LTO_TORTURE_OPTIONS + +-set additional_flags "-fno-tree-loop-distribute-patterns -fno-tracer -fno-ipa-ra" ++set additional_flags "-fno-tree-loop-distribute-patterns -fno-tracer -fno-ipa-ra -fno-inline-functions" + if [istarget "powerpc-*-darwin*"] { + lappend additional_flags "-Wl,-multiply_defined,suppress" + } +diff --git a/gcc/testsuite/gcc.dg/diag-aka-3.c b/gcc/testsuite/gcc.dg/diag-aka-3.c +new file mode 100644 +index 000000000..a3778ed7d +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/diag-aka-3.c +@@ -0,0 +1,9 @@ ++typedef unsigned int myvec __attribute__((vector_size (16))); ++ ++void f (float x) ++{ ++ myvec y = x; /* { dg-error {incompatible types when initializing type 'myvec' {aka '__vector\([48]\) unsigned int'} using type 'float'} } */ ++ myvec *ptr = &x; /* { dg-error {initialization of 'myvec \*' {aka '__vector\([48]\) unsigned int \*'} from incompatible pointer type 'float \*'} } */ ++ const myvec *const_ptr = &x; /* { dg-error {initialization of 'const myvec \*' {aka 'const __vector\([48]\) unsigned int \*'} from incompatible pointer type 'float \*'} } */ ++ volatile myvec *volatile_ptr = &x; /* { dg-error {initialization of 'volatile myvec \*' {aka 'volatile __vector\([48]\) unsigned int \*'} from incompatible pointer type 'float \*'} } */ ++} +diff --git a/gcc/testsuite/gcc.dg/enum-redef-1.c b/gcc/testsuite/gcc.dg/enum-redef-1.c +new file mode 100644 +index 000000000..b3fa6cbf8 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/enum-redef-1.c +@@ -0,0 +1,29 @@ ++enum a { A }; ++enum a { B }; /* { dg-bogus "nested redefinition" } */ ++/* { dg-error "redeclaration of 'enum a'" "" { target *-*-* } .-1 } */ ++ ++enum empty {}; /* { dg-error "empty enum is invalid" } */ ++enum empty {}; /* { dg-bogus "nested redefinition" } */ ++/* { dg-error "empty enum is invalid" "" { target *-*-* } .-1 } */ ++ ++enum nested_first { ++ C1 = sizeof(enum nested_first { C1a }), /* { dg-error "nested redefinition of 'enum nested_first" } */ ++ C2 = sizeof(enum nested_first { C2a }) /* { dg-error "redeclaration of 'enum nested_first'" "" } */ ++}; ++ ++enum nested_second { ++ D1, ++ D2 = sizeof(enum nested_second { D2a }), /* { dg-error "nested 
redefinition of 'enum nested_second" } */ ++ D3 = sizeof(enum nested_second { D3a }) /* { dg-error "redeclaration of 'enum nested_second'" "" } */ ++}; ++ ++enum nested_repeat { E }; ++enum nested_repeat { /* { dg-error "redeclaration of 'enum nested_repeat'" "" } */ ++ F = sizeof(enum nested_repeat { Fa }) /* { dg-error "nested redefinition of 'enum nested_repeat" } */ ++}; ++ ++enum nested_empty { ++ G1 = sizeof(enum nested_empty {}), /* { dg-error "nested redefinition of 'enum nested_empty" } */ ++ /* { dg-error "empty enum is invalid" "" { target *-*-* } .-1 } */ ++ G2 = sizeof(enum nested_empty { G2a }) ++}; +diff --git a/gcc/testsuite/gcc.dg/graphite/interchange-1.c b/gcc/testsuite/gcc.dg/graphite/interchange-1.c +index b65d4861e..65a569e71 100644 +--- a/gcc/testsuite/gcc.dg/graphite/interchange-1.c ++++ b/gcc/testsuite/gcc.dg/graphite/interchange-1.c +@@ -48,10 +48,3 @@ main (void) + + return 0; + } +- +-/*FIXME: Between isl 0.12 and isl 0.15 the schedule optimizer needs to print +-something canonical so that it can be checked in the test. The final code +-generated by both are same in this case but the messaged printed are +-not consistent. */ +- +-/* { dg-final { scan-tree-dump "tiled" "graphite" } } */ +diff --git a/gcc/testsuite/gcc.dg/graphite/interchange-10.c b/gcc/testsuite/gcc.dg/graphite/interchange-10.c +index a955644de..45c248db8 100644 +--- a/gcc/testsuite/gcc.dg/graphite/interchange-10.c ++++ b/gcc/testsuite/gcc.dg/graphite/interchange-10.c +@@ -45,5 +45,3 @@ main (void) + + return 0; + } +- +-/* { dg-final { scan-tree-dump "tiled" "graphite" } } */ +diff --git a/gcc/testsuite/gcc.dg/graphite/interchange-11.c b/gcc/testsuite/gcc.dg/graphite/interchange-11.c +index 61028225f..6ba6907a5 100644 +--- a/gcc/testsuite/gcc.dg/graphite/interchange-11.c ++++ b/gcc/testsuite/gcc.dg/graphite/interchange-11.c +@@ -45,5 +45,3 @@ main (void) + + return 0; + } +- +-/* { dg-final { scan-tree-dump "tiled" "graphite" } } */ +diff --git a/gcc/testsuite/gcc.dg/graphite/interchange-3.c b/gcc/testsuite/gcc.dg/graphite/interchange-3.c +index 4aec82418..e8539e2d3 100644 +--- a/gcc/testsuite/gcc.dg/graphite/interchange-3.c ++++ b/gcc/testsuite/gcc.dg/graphite/interchange-3.c +@@ -46,5 +46,3 @@ main (void) + + return 0; + } +- +-/* { dg-final { scan-tree-dump "tiled" "graphite" } } */ +diff --git a/gcc/testsuite/gcc.dg/graphite/interchange-4.c b/gcc/testsuite/gcc.dg/graphite/interchange-4.c +index 463ecb5a6..1370d5f9d 100644 +--- a/gcc/testsuite/gcc.dg/graphite/interchange-4.c ++++ b/gcc/testsuite/gcc.dg/graphite/interchange-4.c +@@ -45,5 +45,3 @@ main (void) + + return 0; + } +- +-/* { dg-final { scan-tree-dump "tiled" "graphite" } } */ +diff --git a/gcc/testsuite/gcc.dg/graphite/interchange-7.c b/gcc/testsuite/gcc.dg/graphite/interchange-7.c +index 50f7dd7f8..b2696dbec 100644 +--- a/gcc/testsuite/gcc.dg/graphite/interchange-7.c ++++ b/gcc/testsuite/gcc.dg/graphite/interchange-7.c +@@ -46,5 +46,3 @@ main (void) + + return 0; + } +- +-/* { dg-final { scan-tree-dump "tiled" "graphite" } } */ +diff --git a/gcc/testsuite/gcc.dg/graphite/interchange-9.c b/gcc/testsuite/gcc.dg/graphite/interchange-9.c +index 88a357893..506b5001f 100644 +--- a/gcc/testsuite/gcc.dg/graphite/interchange-9.c ++++ b/gcc/testsuite/gcc.dg/graphite/interchange-9.c +@@ -43,5 +43,3 @@ main (void) + + return 0; + } +- +-/* { dg-final { scan-tree-dump "tiled" "graphite" } } */ +diff --git a/gcc/testsuite/gcc.dg/graphite/uns-interchange-9.c b/gcc/testsuite/gcc.dg/graphite/uns-interchange-9.c +index cc108c2bb..a89578032 100644 
+--- a/gcc/testsuite/gcc.dg/graphite/uns-interchange-9.c ++++ b/gcc/testsuite/gcc.dg/graphite/uns-interchange-9.c +@@ -44,5 +44,3 @@ main (void) + + return 0; + } +- +-/* { dg-final { scan-tree-dump "tiled" "graphite" } } */ +diff --git a/gcc/testsuite/gcc.dg/guality/guality.exp b/gcc/testsuite/gcc.dg/guality/guality.exp +index ca77a446f..89cd896d0 100644 +--- a/gcc/testsuite/gcc.dg/guality/guality.exp ++++ b/gcc/testsuite/gcc.dg/guality/guality.exp +@@ -80,8 +80,22 @@ if {[check_guality " + return 0; + } + "]} { +- gcc-dg-runtest [lsort [glob $srcdir/$subdir/*.c]] "" "" +- gcc-dg-runtest [lsort [glob $srcdir/c-c++-common/guality/*.c]] "" "-Wc++-compat" ++ set general [list] ++ set Og [list] ++ foreach file [lsort [glob $srcdir/c-c++-common/guality/*.c]] { ++ switch -glob -- [file tail $file] { ++ Og-* { lappend Og $file } ++ * { lappend general $file } ++ } ++ } ++ ++ gcc-dg-runtest [lsort [glob $srcdir/$subdir/*.c]] "" "" ++ gcc-dg-runtest $general "" "-Wc++-compat" ++ set-torture-options \ ++ [list "-O0" "-Og"] \ ++ [list {}] \ ++ [list "-Og -flto"] ++ gcc-dg-runtest $Og "" "-Wc++-compat" + } + + if [info exists guality_gdb_name] { +diff --git a/gcc/testsuite/gcc.dg/guality/pr59776.c b/gcc/testsuite/gcc.dg/guality/pr59776.c +index 382abb622..6c1c8165b 100644 +--- a/gcc/testsuite/gcc.dg/guality/pr59776.c ++++ b/gcc/testsuite/gcc.dg/guality/pr59776.c +@@ -12,11 +12,11 @@ foo (struct S *p) + struct S s1, s2; /* { dg-final { gdb-test pr59776.c:17 "s1.f" "5.0" } } */ + s1 = *p; /* { dg-final { gdb-test pr59776.c:17 "s1.g" "6.0" } } */ + s2 = s1; /* { dg-final { gdb-test pr59776.c:17 "s2.f" "0.0" } } */ +- *(int *) &s2.f = 0; /* { dg-final { gdb-test pr59776.c:17 "s2.g" "6.0" } } */ ++ *(int *) &s2.f = 0; /* { dg-final { gdb-test pr59776.c:17 "s2.g" "6.0" { xfail *-*-* } } } */ + asm volatile (NOP : : : "memory"); /* { dg-final { gdb-test pr59776.c:20 "s1.f" "5.0" } } */ + asm volatile (NOP : : : "memory"); /* { dg-final { gdb-test pr59776.c:20 "s1.g" "6.0" } } */ + s2 = s1; /* { dg-final { gdb-test pr59776.c:20 "s2.f" "5.0" } } */ +- asm volatile (NOP : : : "memory"); /* { dg-final { gdb-test pr59776.c:20 "s2.g" "6.0" } } */ ++ asm volatile (NOP : : : "memory"); /* { dg-final { gdb-test pr59776.c:20 "s2.g" "6.0" { xfail *-*-* } } } */ + asm volatile (NOP : : : "memory"); + } + +diff --git a/gcc/testsuite/gcc.dg/ipa/inline-7.c b/gcc/testsuite/gcc.dg/ipa/inline-7.c +index 7dabb14f6..7c6491141 100644 +--- a/gcc/testsuite/gcc.dg/ipa/inline-7.c ++++ b/gcc/testsuite/gcc.dg/ipa/inline-7.c +@@ -1,6 +1,6 @@ + /* Check that early inliner works out that a is empty of parameter 0. 
*/ + /* { dg-do compile } */ +-/* { dg-options "-O2 -fdump-tree-einline-optimized -fopt-info-inline -fno-partial-inlining" } */ ++/* { dg-options "-O2 -fdump-tree-einline-optimized -fopt-info-inline -fno-partial-inlining -fno-inline-functions" } */ + void t(void); + int a (int b) + { +diff --git a/gcc/testsuite/gcc.dg/ipa/pr63416.c b/gcc/testsuite/gcc.dg/ipa/pr63416.c +index b5374c51f..5873954fb 100644 +--- a/gcc/testsuite/gcc.dg/ipa/pr63416.c ++++ b/gcc/testsuite/gcc.dg/ipa/pr63416.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-O2 -fdump-tree-optimized" } */ ++/* { dg-options "-O2 -fdump-tree-optimized --param early-inlining-insns-O2=14" } */ + #define _UNUSED_ __attribute__((__unused__)) + + typedef int TEST_F30 (int *v); +diff --git a/gcc/testsuite/gcc.dg/ipa/pr93763.c b/gcc/testsuite/gcc.dg/ipa/pr93763.c +index d11705932..aa2e60c5f 100644 +--- a/gcc/testsuite/gcc.dg/ipa/pr93763.c ++++ b/gcc/testsuite/gcc.dg/ipa/pr93763.c +@@ -3,44 +3,48 @@ + + typedef struct a a; + struct a { +- a *b ++ a *b; + } d; +-e, k, ah, al; +-f(aa) { ++int e, k, ah, al; ++void h(void); ++void ++f(aa) int aa; { + if (aa & 1) + goto g; + f(aa | 2); + g: + h(); + } ++void i(); ++void + l() { +- { + f(072); + i(e, d, 92); +- } + } ++void + ag() { +- { i(e, d, 36); } ++ i(e, d, 36); + } ++void j(); ++void + ai(a *m, a *n, unsigned aa) { + f(aa); + j(k, l, ah, 1); + } ++void + j(int c, a m, int aj, int aa) { + int ak = aa; +- { i(e, d, ak); } ++ i(e, d, ak); + } ++void + i(int c, a *m, unsigned aa) { +- { +- { i(c, (*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*( ++ i(c, (*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*( + *(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*( + *(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*m).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b) + .b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b) + .b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b) + .b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b) + .b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b, 0); +- } +- } + int am = aa; +- ai(ag, al, am); ++ ai((a *) (void *) ag, (a *) (__INTPTR_TYPE__) al, am); + } +diff --git a/gcc/testsuite/gcc.dg/optimize-bswapsi-5.c b/gcc/testsuite/gcc.dg/optimize-bswapsi-5.c +index 5819fd719..b4d8b9a8d 100644 +--- a/gcc/testsuite/gcc.dg/optimize-bswapsi-5.c ++++ b/gcc/testsuite/gcc.dg/optimize-bswapsi-5.c +@@ -1,6 +1,6 @@ + /* { dg-do compile } */ + /* { dg-require-effective-target bswap } */ +-/* { dg-options "-O2 -fdump-tree-bswap" } */ ++/* { dg-options "-O2 -fdump-tree-bswap 
-fno-inline-functions" } */ + /* { dg-additional-options "-march=z900" { target s390-*-* } } */ + + struct L { unsigned int l[2]; }; +diff --git a/gcc/testsuite/gcc.dg/pr79983.c b/gcc/testsuite/gcc.dg/pr79983.c +index 84aae6913..1e292d421 100644 +--- a/gcc/testsuite/gcc.dg/pr79983.c ++++ b/gcc/testsuite/gcc.dg/pr79983.c +@@ -8,7 +8,7 @@ struct S { int i, j; }; /* { dg-error "redefinition of 'struct S'" } */ + + enum E; + enum E { A, B, C }; /* { dg-message "originally defined here" } */ +-enum E { D, F }; /* { dg-error "nested redefinition of 'enum E'|redeclaration of 'enum E'" } */ ++enum E { D, F }; /* { dg-error "redeclaration of 'enum E'" } */ + + union U; + union U { int i; }; /* { dg-message "originally defined here" } */ +diff --git a/gcc/testsuite/gcc.dg/struct-ret-1.c b/gcc/testsuite/gcc.dg/struct-ret-1.c +index 23c9e9813..330c76ab8 100644 +--- a/gcc/testsuite/gcc.dg/struct-ret-1.c ++++ b/gcc/testsuite/gcc.dg/struct-ret-1.c +@@ -1,5 +1,5 @@ +-/* { dg-do run { target hppa*-*-* } } */ +-/* { dg-options { -O2 } { target hppa*-*-* } } */ ++/* { dg-do run } */ ++/* { dg-options { -O2 } } */ + extern void abort (void); + extern void exit (int); + typedef struct { +diff --git a/gcc/testsuite/gcc.dg/torture/pr90395.c b/gcc/testsuite/gcc.dg/torture/pr90395.c +new file mode 100644 +index 000000000..eba8750ef +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/torture/pr90395.c +@@ -0,0 +1,12 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-fexceptions -fnon-call-exceptions" } */ ++ ++typedef int v16si __attribute__ ((__vector_size__ (64))); ++ ++void ++rl (int uq) ++{ ++ v16si qw[1]; ++ ++ qw[uq] = (v16si) { uq }; ++} +diff --git a/gcc/testsuite/gcc.dg/torture/pr92690.c b/gcc/testsuite/gcc.dg/torture/pr92690.c +new file mode 100644 +index 000000000..b49f184fc +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/torture/pr92690.c +@@ -0,0 +1,38 @@ ++/* { dg-do run { target *-*-*gnu* } } */ ++/* { dg-additional-options "-D_GNU_SOURCE" } */ ++/* { dg-require-effective-target fenv_exceptions } */ ++ ++#include ++ ++typedef int v4si __attribute__((vector_size(16))); ++typedef float v4sf __attribute__((vector_size(16))); ++ ++void __attribute__((noipa)) ++foo (v4si *dstp, v4sf *srcp) ++{ ++ v4sf src = *srcp; ++ *dstp = (v4si) { src[0], src[1], 3, 4 }; ++} ++ ++void __attribute__((noipa)) ++bar (v4sf *dstp, v4si *srcp) ++{ ++ v4si src = *srcp; ++ *dstp = (v4sf) { src[0], src[1], 3.5, 4.5 }; ++} ++ ++int ++main() ++{ ++ feenableexcept (FE_INVALID|FE_INEXACT); ++ v4sf x = (v4sf) { 1, 2, __builtin_nanf (""), 3.5 }; ++ v4si y; ++ foo (&y, &x); ++ if (y[0] != 1 || y[1] != 2 || y[2] != 3 || y[3] != 4) ++ __builtin_abort (); ++ y = (v4si) { 0, 1, __INT_MAX__, -__INT_MAX__ }; ++ bar (&x, &y); ++ if (x[0] != 0 || x[1] != 1 || x[2] != 3.5 || x[3] != 4.5) ++ __builtin_abort (); ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.dg/torture/pr92715.c b/gcc/testsuite/gcc.dg/torture/pr92715.c +new file mode 100644 +index 000000000..170179c20 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/torture/pr92715.c +@@ -0,0 +1,17 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-mavx2" { target x86_64-*-* i?86-*-* } } */ ++ ++typedef double v4si __attribute__((vector_size(32))); ++typedef double v2si __attribute__((vector_size(16))); ++ ++void foo (v4si *dstp, v2si *srcp) ++{ ++ v2si src = *srcp; ++ *dstp = (v4si) { src[0], src[1], src[0], src[1] }; ++} ++ ++void bar (v4si *dstp, v2si *srcp) ++{ ++ v2si src = *srcp; ++ *dstp = (v4si) { src[0], src[0], src[0], src[0] }; ++} +diff --git 
a/gcc/testsuite/gcc.dg/tree-ssa/alias-access-path-1.c b/gcc/testsuite/gcc.dg/tree-ssa/alias-access-path-1.c +new file mode 100644 +index 000000000..ba90b56fe +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/tree-ssa/alias-access-path-1.c +@@ -0,0 +1,21 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -fdump-tree-fre3" } */ ++struct foo ++{ ++ int val; ++} *fooptr; ++struct bar ++{ ++ struct foo foo; ++ int val2; ++} *barptr; ++int ++test () ++{ ++ struct foo foo = { 0 }; ++ barptr->val2 = 123; ++ *fooptr = foo; ++ return barptr->val2; ++} ++ ++/* { dg-final { scan-tree-dump-times "return 123" 1 "fre3"} } */ +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/forwprop-35.c b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-35.c +index d55197bce..24e633869 100644 +--- a/gcc/testsuite/gcc.dg/tree-ssa/forwprop-35.c ++++ b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-35.c +@@ -16,4 +16,5 @@ v4sf vec_cast_perm(v4si f) + } + + /* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 1 "cddce1" { target { i?86-*-* x86_64-*-* } } } } */ +-/* { dg-final { scan-tree-dump-times "\\\(v4sf\\\) " 2 "cddce1" { target { i?86-*-* x86_64-*-* } } } } */ ++/* Catch (v4sf) and (vector(4) float). */ ++/* { dg-final { scan-tree-dump-times " = \\\(v" 2 "cddce1" { target { i?86-*-* x86_64-*-* } } } } */ +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr92706-2.c b/gcc/testsuite/gcc.dg/tree-ssa/pr92706-2.c +new file mode 100644 +index 000000000..37ab9765d +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/tree-ssa/pr92706-2.c +@@ -0,0 +1,19 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -fdump-tree-esra" } */ ++ ++typedef __UINT64_TYPE__ uint64_t; ++typedef __UINT32_TYPE__ uint32_t; ++struct S { uint32_t i[2]; } __attribute__((aligned(__alignof__(uint64_t)))); ++typedef uint64_t my_int64 __attribute__((may_alias)); ++uint64_t load (void *p) ++{ ++ struct S u, v, w; ++ uint64_t tem; ++ tem = *(my_int64 *)p; ++ *(my_int64 *)&v = tem; ++ u = v; ++ w = u; ++ return *(my_int64 *)&w; ++} ++ ++/* { dg-final { scan-tree-dump "Created a replacement for v" "esra" } } */ +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-26.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-26.c +index 32d63899b..836a8092a 100644 +--- a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-26.c ++++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-26.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-O2 -fdump-tree-dse1-details -fno-short-enums" } */ ++/* { dg-options "-O2 -fdump-tree-dse1-details -fno-short-enums -fno-tree-fre" } */ + /* { dg-skip-if "temporary variable for constraint_expr is never used" { msp430-*-* } } */ + + enum constraint_expr_type +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-31.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-31.c +index 6402c81e6..3d429ab15 100644 +--- a/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-31.c ++++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-31.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-O -fdump-tree-fre1-details" } */ ++/* { dg-options "-O -fdump-tree-fre1-details -fno-tree-forwprop" } */ + /* { dg-additional-options "-fno-common" { target hppa*-*-hpux* } } */ + + typedef double d128 __attribute__((vector_size(16))); +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-thread-12.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-thread-12.c +index 67526762f..fff731e8c 100644 +--- a/gcc/testsuite/gcc.dg/tree-ssa/ssa-thread-12.c ++++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-thread-12.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-O2 -fdump-tree-thread2-details -fdump-tree-thread3-details -fdump-tree-thread4-details -fno-finite-loops" } */ ++/* 
{ dg-options "-O2 -fdump-tree-thread2-details -fdump-tree-thread3-details -fdump-tree-thread4-details -fno-finite-loops --param early-inlining-insns-O2=14 -fno-inline-functions" } */ + /* { dg-final { scan-tree-dump "FSM" "thread2" } } */ + /* { dg-final { scan-tree-dump "FSM" "thread3" } } */ + /* { dg-final { scan-tree-dump "FSM" "thread4" { xfail *-*-* } } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/pr66142.c b/gcc/testsuite/gcc.dg/vect/pr66142.c +index 8c79f2907..a0316f1f0 100644 +--- a/gcc/testsuite/gcc.dg/vect/pr66142.c ++++ b/gcc/testsuite/gcc.dg/vect/pr66142.c +@@ -1,6 +1,6 @@ + /* PR middle-end/66142 */ + /* { dg-do compile } */ +-/* { dg-additional-options "-ffast-math -fopenmp-simd" } */ ++/* { dg-additional-options "-ffast-math -fopenmp-simd --param early-inlining-insns-O2=14" } */ + /* { dg-additional-options "-mavx" { target avx_runtime } } */ + + struct A { float x, y; }; +diff --git a/gcc/testsuite/gcc.dg/vect/vect-cond-arith-7.c b/gcc/testsuite/gcc.dg/vect/vect-cond-arith-7.c +new file mode 100644 +index 000000000..739b98f59 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/vect-cond-arith-7.c +@@ -0,0 +1,60 @@ ++/* { dg-require-effective-target scalar_all_fma } */ ++/* { dg-additional-options "-fdump-tree-optimized -ffp-contract=fast" } */ ++ ++#include "tree-vect.h" ++ ++#define N (VECTOR_BITS * 11 / 64 + 3) ++ ++#define DEF(INV) \ ++ void __attribute__ ((noipa)) \ ++ f_##INV (double *restrict a, double *restrict b, \ ++ double *restrict c, double *restrict d) \ ++ { \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ double mb = (INV & 1 ? -b[i] : b[i]); \ ++ double mc = c[i]; \ ++ double md = (INV & 2 ? -d[i] : d[i]); \ ++ a[i] = b[i] < 10 ? mb * mc + md : 10.0; \ ++ } \ ++ } ++ ++#define TEST(INV) \ ++ { \ ++ f_##INV (a, b, c, d); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ double mb = (INV & 1 ? -b[i] : b[i]); \ ++ double mc = c[i]; \ ++ double md = (INV & 2 ? -d[i] : d[i]); \ ++ double fma = __builtin_fma (mb, mc, md); \ ++ if (a[i] != (i % 17 < 10 ? 
fma : 10.0)) \ ++ __builtin_abort (); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ } ++ ++#define FOR_EACH_INV(T) \ ++ T (0) T (1) T (2) T (3) ++ ++FOR_EACH_INV (DEF) ++ ++int ++main (void) ++{ ++ double a[N], b[N], c[N], d[N]; ++ for (int i = 0; i < N; ++i) ++ { ++ b[i] = i % 17; ++ c[i] = i % 9 + 11; ++ d[i] = i % 13 + 14; ++ asm volatile ("" ::: "memory"); ++ } ++ FOR_EACH_INV (TEST) ++ return 0; ++} ++ ++/* { dg-final { scan-tree-dump-times { = \.COND_FMA } 1 "optimized" { target vect_double_cond_arith } } } */ ++/* { dg-final { scan-tree-dump-times { = \.COND_FMS } 1 "optimized" { target vect_double_cond_arith } } } */ ++/* { dg-final { scan-tree-dump-times { = \.COND_FNMA } 1 "optimized" { target vect_double_cond_arith } } } */ ++/* { dg-final { scan-tree-dump-times { = \.COND_FNMS } 1 "optimized" { target vect_double_cond_arith } } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/vect-mulhrs-1.c b/gcc/testsuite/gcc.dg/vect/vect-mulhrs-1.c +new file mode 100644 +index 000000000..8e46ff6b0 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/vect-mulhrs-1.c +@@ -0,0 +1,49 @@ ++/* { dg-require-effective-target vect_int } */ ++ ++#include "tree-vect.h" ++#ifndef SIGNEDNESS ++#define SIGNEDNESS signed ++#endif ++#ifndef BIAS ++#define BIAS 0 ++#endif ++ ++#define HRS(x) ((((x) >> (15 - BIAS)) + BIAS) >> BIAS) ++ ++void __attribute__ ((noipa)) ++f (SIGNEDNESS short *restrict a, SIGNEDNESS short *restrict b, ++ SIGNEDNESS short *restrict c, __INTPTR_TYPE__ n) ++{ ++ for (__INTPTR_TYPE__ i = 0; i < n; ++i) ++ a[i] = HRS((SIGNEDNESS int) b[i] * (SIGNEDNESS int) c[i]); ++} ++ ++#define N 50 ++#define BASE1 ((SIGNEDNESS int) -1 < 0 ? -126 : 4) ++#define BASE2 ((SIGNEDNESS int) -1 < 0 ? -101 : 26) ++#define CONST1 0x01AB ++#define CONST2 0x01CD ++ ++int ++main (void) ++{ ++ check_vect (); ++ ++ SIGNEDNESS short a[N], b[N], c[N]; ++ for (int i = 0; i < N; ++i) ++ { ++ b[i] = BASE1 + i * CONST1; ++ c[i] = BASE2 + i * CONST2; ++ asm volatile ("" ::: "memory"); ++ } ++ f (a, b, c, N); ++ for (int i = 0; i < N; ++i) ++ if (a[i] != HRS(BASE1 * BASE2 + i * i * (CONST1 * CONST2) ++ + i * (BASE1 * CONST2 + BASE2 * CONST1))) ++ __builtin_abort (); ++ return 0; ++} ++ ++/* { dg-final { scan-tree-dump "vect_recog_mulhs_pattern: detected" "vect" } } */ ++/* { dg-final { scan-tree-dump {\.MULHS} "vect" { target vect_mulhrs_hi } } } */ ++/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_mulhrs_hi } } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/vect-mulhrs-2.c b/gcc/testsuite/gcc.dg/vect/vect-mulhrs-2.c +new file mode 100644 +index 000000000..a16e71c6a +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/vect-mulhrs-2.c +@@ -0,0 +1,9 @@ ++/* { dg-require-effective-target vect_int } */ ++ ++#define SIGNEDNESS unsigned ++ ++#include "vect-mulhrs-1.c" ++ ++/* { dg-final { scan-tree-dump "vect_recog_mulhs_pattern: detected" "vect" } } */ ++/* { dg-final { scan-tree-dump {\.MULHS} "vect" { target vect_mulhrs_hi } } } */ ++/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_mulhrs_hi } } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/vect-mulhrs-3.c b/gcc/testsuite/gcc.dg/vect/vect-mulhrs-3.c +new file mode 100644 +index 000000000..e7d44d75d +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/vect-mulhrs-3.c +@@ -0,0 +1,9 @@ ++/* { dg-require-effective-target vect_int } */ ++ ++#define BIAS 1 ++ ++#include "vect-mulhrs-1.c" ++ ++/* { dg-final { scan-tree-dump "vect_recog_mulhs_pattern: detected" "vect" } } */ ++/* { dg-final { scan-tree-dump {\.MULHRS} "vect" { target 
vect_mulhrs_hi } } } */ ++/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_mulhrs_hi } } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/vect-mulhrs-4.c b/gcc/testsuite/gcc.dg/vect/vect-mulhrs-4.c +new file mode 100644 +index 000000000..e12176335 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/vect-mulhrs-4.c +@@ -0,0 +1,10 @@ ++/* { dg-require-effective-target vect_int } */ ++ ++#define SIGNEDNESS unsigned ++#define BIAS 1 ++ ++#include "vect-mulhrs-1.c" ++ ++/* { dg-final { scan-tree-dump "vect_recog_mulhs_pattern: detected" "vect" } } */ ++/* { dg-final { scan-tree-dump {\.MULHRS} "vect" { target vect_mulhrs_hi } } } */ ++/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_mulhrs_hi } } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/vect-sdiv-pow2-1.c b/gcc/testsuite/gcc.dg/vect/vect-sdiv-pow2-1.c +new file mode 100644 +index 000000000..be70bc6c4 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/vect-sdiv-pow2-1.c +@@ -0,0 +1,79 @@ ++/* { dg-require-effective-target vect_int } */ ++ ++#include "tree-vect.h" ++ ++#define DIV(x,y) ((x)/(y)) ++#define MOD(x,y) ((x)%(y)) ++ ++#define TEMPLATE(PO2,OP) \ ++void __attribute__ ((noipa)) \ ++f_##PO2##_##OP (int *restrict a, int *restrict b, __INTPTR_TYPE__ n) \ ++{ \ ++ for (__INTPTR_TYPE__ i = 0; i < n; ++i) \ ++ a[i] = OP (b[i], (1 << PO2)); \ ++} ++#define TEMPLATES(PO2) \ ++TEMPLATE (PO2,DIV); \ ++TEMPLATE (PO2,MOD); ++ ++TEMPLATES (1); ++TEMPLATES (2); ++TEMPLATES (3); ++TEMPLATES (7); ++TEMPLATES (8); ++TEMPLATES (10); ++TEMPLATES (15); ++TEMPLATES (16); ++TEMPLATES (20); ++ ++typedef void (*func_t) (int *, int *, __INTPTR_TYPE__); ++typedef struct { ++ int po2; ++ func_t div; ++ func_t mod; ++} fn_t; ++const fn_t fns[] = { ++#define FN_PAIR(PO2) { PO2, f_##PO2##_DIV, f_##PO2##_MOD } ++ FN_PAIR (1), ++ FN_PAIR (2), ++ FN_PAIR (3), ++ FN_PAIR (7), ++ FN_PAIR (8), ++ FN_PAIR (10), ++ FN_PAIR (15), ++ FN_PAIR (16), ++ FN_PAIR (20), ++}; ++ ++int __attribute__ ((noipa, noinline)) ++power2 (int x) ++{ ++ return 1 << x; ++} ++ ++#define N 50 ++ ++int ++main (void) ++{ ++ int a[N], b[N], c[N]; ++ ++ for (int i = 0; i < (sizeof(fns)/sizeof(fns[0])); i++) ++ { ++ int p = power2 (fns[i].po2); ++ for (int j = 0; j < N; j++) ++ a[j] = ((p << 4) * j) / (N - 1) - (p << 5); ++ ++ fns[i].div (b, a, N); ++ fns[i].mod (c, a, N); ++ ++ for (int j = 0; j < N; j++) ++ if (a[j] != (b[j] * p + c[j])) ++ __builtin_abort (); ++ } ++ ++ return 0; ++} ++ ++/* { dg-final { scan-tree-dump {\.DIV_POW2} "vect" { target vect_sdiv_pow2_si } } } */ ++/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 18 "vect" { target vect_sdiv_pow2_si } } } */ +diff --git a/gcc/testsuite/gcc.dg/winline-3.c b/gcc/testsuite/gcc.dg/winline-3.c +index 7b7c8c5b9..7043a2760 100644 +--- a/gcc/testsuite/gcc.dg/winline-3.c ++++ b/gcc/testsuite/gcc.dg/winline-3.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-Winline -O2 --param max-inline-insns-single=1 --param inline-min-speedup=100 -fgnu89-inline" } */ ++/* { dg-options "-Winline -O2 --param max-inline-insns-single-O2=1 --param inline-min-speedup-O2=100 -fgnu89-inline" } */ + + void big (void); + inline int q(void) /* { dg-warning "max-inline-insns-single" } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/acle/jcvt_1.c b/gcc/testsuite/gcc.target/aarch64/acle/jcvt_1.c +new file mode 100644 +index 000000000..0c900b1b5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/acle/jcvt_1.c +@@ -0,0 +1,15 @@ ++/* Test the __jcvt ACLE intrinsic. 
*/ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -march=armv8.3-a" } */ ++ ++#include <arm_acle.h> ++ ++#ifdef __ARM_FEATURE_JCVT ++int32_t ++test_jcvt (double a) ++{ ++ return __jcvt (a); ++} ++#endif ++ ++/* { dg-final { scan-assembler-times "fjcvtzs\tw\[0-9\]+, d\[0-9\]+\n" 1 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/acle/rintnzx_1.c b/gcc/testsuite/gcc.target/aarch64/acle/rintnzx_1.c +new file mode 100644 +index 000000000..125720848 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/acle/rintnzx_1.c +@@ -0,0 +1,73 @@ ++/* Test the __rint[32,64][z,x] intrinsics. */ ++ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -march=armv8.5-a" } */ ++ ++#include <arm_acle.h> ++ ++#ifdef __ARM_FEATURE_FRINT ++float ++foo_32z_f32_scal (float a) ++{ ++ return __rint32zf (a); ++} ++ ++/* { dg-final { scan-assembler-times "frint32z\ts\[0-9\]+, s\[0-9\]+\n" 1 } } */ ++ ++double ++foo_32z_f64_scal (double a) ++{ ++ return __rint32z (a); ++} ++ ++/* { dg-final { scan-assembler-times "frint32z\td\[0-9\]+, d\[0-9\]+\n" 1 } } */ ++ ++float ++foo_32x_f32_scal (float a) ++{ ++ return __rint32xf (a); ++} ++ ++/* { dg-final { scan-assembler-times "frint32x\ts\[0-9\]+, s\[0-9\]+\n" 1 } } */ ++ ++double ++foo_32x_f64_scal (double a) ++{ ++ return __rint32x (a); ++} ++ ++/* { dg-final { scan-assembler-times "frint32x\td\[0-9\]+, d\[0-9\]+\n" 1 } } */ ++ ++float ++foo_64z_f32_scal (float a) ++{ ++ return __rint64zf (a); ++} ++ ++/* { dg-final { scan-assembler-times "frint64z\ts\[0-9\]+, s\[0-9\]+\n" 1 } } */ ++ ++double ++foo_64z_f64_scal (double a) ++{ ++ return __rint64z (a); ++} ++ ++/* { dg-final { scan-assembler-times "frint64z\td\[0-9\]+, d\[0-9\]+\n" 1 } } */ ++ ++float ++foo_64x_f32_scal (float a) ++{ ++ return __rint64xf (a); ++} ++ ++/* { dg-final { scan-assembler-times "frint64x\ts\[0-9\]+, s\[0-9\]+\n" 1 } } */ ++ ++double ++foo_64x_f64_scal (double a) ++{ ++ return __rint64x (a); ++} ++ ++/* { dg-final { scan-assembler-times "frint64x\td\[0-9\]+, d\[0-9\]+\n" 1 } } */ ++ ++#endif +diff --git a/gcc/testsuite/gcc.target/aarch64/acle/rng_1.c b/gcc/testsuite/gcc.target/aarch64/acle/rng_1.c +new file mode 100644 +index 000000000..1fbdb6276 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/acle/rng_1.c +@@ -0,0 +1,53 @@ ++/* Test the __rndr ACLE intrinsic. */ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -march=armv8.5-a+rng" } */ ++ ++#include <arm_acle.h> ++ ++#ifdef __ARM_FEATURE_RNG ++/* Check that instruction is generated when status result is unused. */ ++uint64_t ++test_rndr_no_stat (void) ++{ ++ uint64_t res; ++ __rndr (&res); ++ return res; ++} ++ ++/* Check that instruction is generated when random number result ++ is unused. */ ++int ++test_rndr_error_check (void) ++{ ++ uint64_t res; ++ int fail = __rndr (&res); ++ if (fail) ++ return 0; ++ return -1; ++} ++ ++/* { dg-final { scan-assembler-times "mrs\tx..?, RNDR\n" 2 } } */ ++ ++/* Check that instruction is generated when status result is unused. */ ++uint64_t ++test_rndrrs_no_stat (void) ++{ ++ uint64_t res; ++ __rndrrs (&res); ++ return res; ++} ++ ++/* Check that instruction is generated when random number result ++ is unused. 
*/ ++int ++test_rndrrs_error_check (void) ++{ ++ uint64_t res; ++ int fail = __rndrrs (&res); ++ if (fail) ++ return 0; ++ return -1; ++} ++ ++/* { dg-final { scan-assembler-times "mrs\tx..?, RNDRRS\n" 2 } } */ ++#endif +diff --git a/gcc/testsuite/gcc.target/aarch64/acle/tme.c b/gcc/testsuite/gcc.target/aarch64/acle/tme.c +new file mode 100644 +index 000000000..5df93b1dc +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/acle/tme.c +@@ -0,0 +1,34 @@ ++/* Test the TME intrinsics. */ ++ ++/* { dg-do compile } */ ++/* { dg-options "-save-temps -O2 -march=armv8-a+tme" } */ ++ ++#include "arm_acle.h" ++ ++#define tcancel_reason 0x234 ++ ++unsigned ++check_tme (void) ++{ ++ unsigned status = __tstart (); ++ if (status == 0) ++ { ++ if (__ttest () == 2) ++ { ++ __tcancel (tcancel_reason & _TMFAILURE_REASON); ++ return tcancel_reason; ++ } ++ ++ __tcommit (); ++ return 0; ++ } ++ else if (status & _TMFAILURE_NEST) ++ return _TMFAILURE_NEST; ++ else if (status & _TMFAILURE_TRIVIAL) ++ return _TMFAILURE_TRIVIAL; ++} ++ ++/* { dg-final { scan-assembler "tstart\tx..?\n" } } */ ++/* { dg-final { scan-assembler "tcancel\t#564\n" } } */ ++/* { dg-final { scan-assembler "ttest\tx..?\n" } } */ ++/* { dg-final { scan-assembler "tcommit\n" } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_dup.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_dup.c +new file mode 100644 +index 000000000..c42c7acbb +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_dup.c +@@ -0,0 +1,85 @@ ++/* { dg-do assemble { target { aarch64*-*-* } } } */ ++/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ ++/* { dg-options "-O2" } */ ++/* { dg-add-options arm_v8_2a_bf16_neon } */ ++/* { dg-additional-options "-save-temps" } */ ++ ++#include ++ ++float32x2_t test_vcreate (float32x2_t r, uint64_t a, uint64_t b) ++{ ++ bfloat16x4_t _a = vcreate_bf16(a); ++ bfloat16x4_t _b = vcreate_bf16(b); ++ ++ return vbfdot_f32 (r, _a, _b); ++} ++/* { dg-final { scan-assembler {bfdot\tv[0-9]+.2s, v[0-9]+.4h, v[0-9]+.4h} } } */ ++ ++bfloat16x4_t test_vset_lane_bf16 (bfloat16_t a, bfloat16x4_t b) ++{ ++ return vset_lane_bf16 (a, b, 3); ++} ++ ++bfloat16x8_t test_vsetq_lane_bf16 (bfloat16_t a, bfloat16x8_t b) ++{ ++ return vsetq_lane_bf16 (a, b, 7); ++} ++/* { dg-final { scan-assembler-times "ins\\t" 2 } } */ ++ ++bfloat16x4_t vdup_test (bfloat16_t a) ++{ ++ return vdup_n_bf16 (a); ++} ++/* { dg-final { scan-assembler "dup\\tv\[0-9\]+\.4h, v\[0-9\]+.h\\\[0\\\]" } } */ ++ ++bfloat16x8_t vdupq_test (bfloat16_t a) ++{ ++ return vdupq_n_bf16 (a); ++} ++ ++bfloat16x8_t test_vdupq_lane_bf16 (bfloat16x4_t a) ++{ ++ return vdupq_lane_bf16 (a, 1); ++} ++/* { dg-final { scan-assembler-times "dup\\tv\[0-9\]+\.8h, v\[0-9\]+.h\\\[0\\\]" 2 } } */ ++ ++bfloat16_t test_vget_lane_bf16 (bfloat16x4_t a) ++{ ++ return vget_lane_bf16 (a, 1); ++} ++/* { dg-final { scan-assembler-times "dup\\th\[0-9\]+, v\[0-9\]+\.h\\\[1\\\]" 2 } } */ ++ ++bfloat16x4_t test_vdup_lane_bf16 (bfloat16x4_t a) ++{ ++ return vdup_lane_bf16 (a, 1); ++} ++/* { dg-final { scan-assembler "dup\\tv\[0-9\]+\.4h, v\[0-9\]+\.h\\\[1\\\]" } } */ ++ ++bfloat16x4_t test_vdup_laneq_bf16 (bfloat16x8_t a) ++{ ++ return vdup_laneq_bf16 (a, 7); ++} ++/* { dg-final { scan-assembler "dup\\tv\[0-9\]+\.8h, v\[0-9\]+\.h\\\[7\\\]" } } */ ++ ++bfloat16x8_t test_vdupq_laneq_bf16 (bfloat16x8_t a) ++{ ++ return vdupq_laneq_bf16 (a, 5); ++} ++/* { dg-final { scan-assembler "dup\\tv\[0-9\]+\.8h, v\[0-9\]+\.h\\\[5\\\]" } } */ ++ ++bfloat16_t 
test_vduph_lane_bf16 (bfloat16x4_t a) ++{ ++ return vduph_lane_bf16 (a, 3); ++} ++/* { dg-final { scan-assembler "dup\\th\[0-9\]+, v\[0-9\]+\.h\\\[3\\\]" } } */ ++ ++bfloat16_t test_vgetq_lane_bf16 (bfloat16x8_t a) ++{ ++ return vgetq_lane_bf16 (a, 7); ++} ++ ++bfloat16_t test_vduph_laneq_bf16 (bfloat16x8_t a) ++{ ++ return vduph_laneq_bf16 (a, 7); ++} ++/* { dg-final { scan-assembler-times "dup\\th\[0-9\]+, v\[0-9\]+\.h\\\[7\\\]" 2 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_reinterpret.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_reinterpret.c +new file mode 100644 +index 000000000..f5adf40c6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_reinterpret.c +@@ -0,0 +1,466 @@ ++/* { dg-do assemble { target { aarch64*-*-* } } } */ ++/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ ++/* { dg-add-options arm_v8_2a_bf16_neon } */ ++/* { dg-additional-options "-save-temps" } */ ++ ++#include ++ ++float32x2_t ++test_vbfdot_f32_s8 (float32x2_t r, int8x8_t a, int8x8_t b) ++{ ++ bfloat16x4_t _a = vreinterpret_bf16_s8(a); ++ bfloat16x4_t _b = vreinterpret_bf16_s8(b); ++ ++ return vbfdot_f32 (r, _a, _b); ++} ++ ++float32x2_t ++test_vbfdot_f32_s16 (float32x2_t r, int16x4_t a, int16x4_t b) ++{ ++ bfloat16x4_t _a = vreinterpret_bf16_s16(a); ++ bfloat16x4_t _b = vreinterpret_bf16_s16(b); ++ ++ return vbfdot_f32 (r, _a, _b); ++} ++ ++float32x2_t ++test_vbfdot_f32_s32 (float32x2_t r, int32x2_t a, int32x2_t b) ++{ ++ bfloat16x4_t _a = vreinterpret_bf16_s32(a); ++ bfloat16x4_t _b = vreinterpret_bf16_s32(b); ++ ++ return vbfdot_f32 (r, _a, _b); ++} ++ ++float32x2_t ++test_vbfdot_f32_s64 (float32x2_t r, int64x1_t a, int64x1_t b) ++{ ++ bfloat16x4_t _a = vreinterpret_bf16_s64(a); ++ bfloat16x4_t _b = vreinterpret_bf16_s64(b); ++ ++ return vbfdot_f32 (r, _a, _b); ++} ++ ++float32x2_t ++test_vbfdot_f32_u8 (float32x2_t r, uint8x8_t a, uint8x8_t b) ++{ ++ bfloat16x4_t _a = vreinterpret_bf16_u8(a); ++ bfloat16x4_t _b = vreinterpret_bf16_u8(b); ++ ++ return vbfdot_f32 (r, _a, _b); ++} ++ ++float32x2_t ++test_vbfdot_f32_u16 (float32x2_t r, uint16x4_t a, uint16x4_t b) ++{ ++ bfloat16x4_t _a = vreinterpret_bf16_u16(a); ++ bfloat16x4_t _b = vreinterpret_bf16_u16(b); ++ ++ return vbfdot_f32 (r, _a, _b); ++} ++ ++float32x2_t ++test_vbfdot_f32_u32 (float32x2_t r, uint32x2_t a, uint32x2_t b) ++{ ++ bfloat16x4_t _a = vreinterpret_bf16_u32(a); ++ bfloat16x4_t _b = vreinterpret_bf16_u32(b); ++ ++ return vbfdot_f32 (r, _a, _b); ++} ++ ++float32x2_t ++test_vbfdot_f32_u64 (float32x2_t r, uint64x1_t a, uint64x1_t b) ++{ ++ bfloat16x4_t _a = vreinterpret_bf16_u64(a); ++ bfloat16x4_t _b = vreinterpret_bf16_u64(b); ++ ++ return vbfdot_f32 (r, _a, _b); ++} ++ ++float32x2_t ++test_vbfdot_f32_p8 (float32x2_t r, poly8x8_t a, poly8x8_t b) ++{ ++ bfloat16x4_t _a = vreinterpret_bf16_p8(a); ++ bfloat16x4_t _b = vreinterpret_bf16_p8(b); ++ ++ return vbfdot_f32 (r, _a, _b); ++} ++ ++float32x2_t ++test_vbfdot_f32_p16 (float32x2_t r, poly16x4_t a, poly16x4_t b) ++{ ++ bfloat16x4_t _a = vreinterpret_bf16_p16(a); ++ bfloat16x4_t _b = vreinterpret_bf16_p16(b); ++ ++ return vbfdot_f32 (r, _a, _b); ++} ++ ++float32x2_t ++test_vbfdot_f32_p64 (float32x2_t r, poly64x1_t a, poly64x1_t b) ++{ ++ bfloat16x4_t _a = vreinterpret_bf16_p64(a); ++ bfloat16x4_t _b = vreinterpret_bf16_p64(b); ++ ++ return vbfdot_f32 (r, _a, _b); ++} ++ ++float32x2_t ++test_vbfdot_f32_f16 (float32x2_t r, float16x4_t a, float16x4_t b) ++{ ++ bfloat16x4_t _a = vreinterpret_bf16_f16(a); ++ 
bfloat16x4_t _b = vreinterpret_bf16_f16(b); ++ ++ return vbfdot_f32 (r, _a, _b); ++} ++ ++float32x2_t ++test_vbfdot_f32_f32 (float32x2_t r, float32x2_t a, float32x2_t b) ++{ ++ bfloat16x4_t _a = vreinterpret_bf16_f32(a); ++ bfloat16x4_t _b = vreinterpret_bf16_f32(b); ++ ++ return vbfdot_f32 (r, _a, _b); ++} ++ ++float32x2_t ++test_vbfdot_f32_f64 (float32x2_t r, float64x1_t a, float64x1_t b) ++{ ++ bfloat16x4_t _a = vreinterpret_bf16_f64(a); ++ bfloat16x4_t _b = vreinterpret_bf16_f64(b); ++ ++ return vbfdot_f32 (r, _a, _b); ++} ++ ++float32x4_t ++test_vbfdotq_f32_s8 (float32x4_t r, int8x16_t a, int8x16_t b) ++{ ++ bfloat16x8_t _a = vreinterpretq_bf16_s8(a); ++ bfloat16x8_t _b = vreinterpretq_bf16_s8(b); ++ ++ return vbfdotq_f32 (r, _a, _b); ++} ++ ++float32x4_t ++test_vbfdotq_f32_s16 (float32x4_t r, int16x8_t a, int16x8_t b) ++{ ++ bfloat16x8_t _a = vreinterpretq_bf16_s16(a); ++ bfloat16x8_t _b = vreinterpretq_bf16_s16(b); ++ ++ return vbfdotq_f32 (r, _a, _b); ++} ++ ++float32x4_t ++test_vbfdotq_f32_s32 (float32x4_t r, int32x4_t a, int32x4_t b) ++{ ++ bfloat16x8_t _a = vreinterpretq_bf16_s32(a); ++ bfloat16x8_t _b = vreinterpretq_bf16_s32(b); ++ ++ return vbfdotq_f32 (r, _a, _b); ++} ++ ++float32x4_t ++test_vbfdotq_f32_s64 (float32x4_t r, int64x2_t a, int64x2_t b) ++{ ++ bfloat16x8_t _a = vreinterpretq_bf16_s64(a); ++ bfloat16x8_t _b = vreinterpretq_bf16_s64(b); ++ ++ return vbfdotq_f32 (r, _a, _b); ++} ++ ++float32x4_t ++test_vbfdotq_f32_u8 (float32x4_t r, uint8x16_t a, uint8x16_t b) ++{ ++ bfloat16x8_t _a = vreinterpretq_bf16_u8(a); ++ bfloat16x8_t _b = vreinterpretq_bf16_u8(b); ++ ++ return vbfdotq_f32 (r, _a, _b); ++} ++ ++float32x4_t ++test_vbfdotq_f32_u16 (float32x4_t r, uint16x8_t a, uint16x8_t b) ++{ ++ bfloat16x8_t _a = vreinterpretq_bf16_u16(a); ++ bfloat16x8_t _b = vreinterpretq_bf16_u16(b); ++ ++ return vbfdotq_f32 (r, _a, _b); ++} ++ ++float32x4_t ++test_vbfdotq_f32_u32 (float32x4_t r, uint32x4_t a, uint32x4_t b) ++{ ++ bfloat16x8_t _a = vreinterpretq_bf16_u32(a); ++ bfloat16x8_t _b = vreinterpretq_bf16_u32(b); ++ ++ return vbfdotq_f32 (r, _a, _b); ++} ++ ++float32x4_t ++test_vbfdotq_f32_u64 (float32x4_t r, uint64x2_t a, uint64x2_t b) ++{ ++ bfloat16x8_t _a = vreinterpretq_bf16_u64(a); ++ bfloat16x8_t _b = vreinterpretq_bf16_u64(b); ++ ++ return vbfdotq_f32 (r, _a, _b); ++} ++ ++float32x4_t ++test_vbfdotq_f32_p8 (float32x4_t r, poly8x16_t a, poly8x16_t b) ++{ ++ bfloat16x8_t _a = vreinterpretq_bf16_p8(a); ++ bfloat16x8_t _b = vreinterpretq_bf16_p8(b); ++ ++ return vbfdotq_f32 (r, _a, _b); ++} ++ ++float32x4_t ++test_vbfdotq_f32_p16 (float32x4_t r, poly16x8_t a, poly16x8_t b) ++{ ++ bfloat16x8_t _a = vreinterpretq_bf16_p16(a); ++ bfloat16x8_t _b = vreinterpretq_bf16_p16(b); ++ ++ return vbfdotq_f32 (r, _a, _b); ++} ++ ++float32x4_t ++test_vbfdotq_f32_p64 (float32x4_t r, poly64x2_t a, poly64x2_t b) ++{ ++ bfloat16x8_t _a = vreinterpretq_bf16_p64(a); ++ bfloat16x8_t _b = vreinterpretq_bf16_p64(b); ++ ++ return vbfdotq_f32 (r, _a, _b); ++} ++ ++float32x4_t ++test_vbfdotq_f32_p128 (float32x4_t r, poly128_t a, poly128_t b) ++{ ++ bfloat16x8_t _a = vreinterpretq_bf16_p128(a); ++ bfloat16x8_t _b = vreinterpretq_bf16_p128(b); ++ ++ return vbfdotq_f32 (r, _a, _b); ++} ++ ++float32x4_t ++test_vbfdotq_f32_f16 (float32x4_t r, float16x8_t a, float16x8_t b) ++{ ++ bfloat16x8_t _a = vreinterpretq_bf16_f16(a); ++ bfloat16x8_t _b = vreinterpretq_bf16_f16(b); ++ ++ return vbfdotq_f32 (r, _a, _b); ++} ++ ++float32x4_t ++test_vbfdotq_f32_f32 (float32x4_t r, float32x4_t a, float32x4_t b) ++{ ++ 
bfloat16x8_t _a = vreinterpretq_bf16_f32(a); ++ bfloat16x8_t _b = vreinterpretq_bf16_f32(b); ++ ++ return vbfdotq_f32 (r, _a, _b); ++} ++ ++float32x4_t ++test_vbfdotq_f32_f64 (float32x4_t r, float64x2_t a, float64x2_t b) ++{ ++ bfloat16x8_t _a = vreinterpretq_bf16_f64(a); ++ bfloat16x8_t _b = vreinterpretq_bf16_f64(b); ++ ++ return vbfdotq_f32 (r, _a, _b); ++} ++ ++/* { dg-final { scan-assembler-times {bfdot\tv[0-9]+.2s, v[0-9]+.4h, v[0-9]+.4h} 14 } } */ ++/* { dg-final { scan-assembler-times {bfdot\tv[0-9]+.4s, v[0-9]+.8h, v[0-9]+.8h} 15 } } */ ++ ++int8x8_t test_vreinterpret_s8_bf16 (bfloat16x4_t a, int8x8_t b) ++{ ++ int8x8_t _a = vreinterpret_s8_bf16 (a); ++ return vadd_s8 (_a, b); ++} ++ ++int16x4_t test_vreinterpret_s16_bf16 (bfloat16x4_t a, int16x4_t b) ++{ ++ int16x4_t _a = vreinterpret_s16_bf16 (a); ++ return vadd_s16 (_a, b); ++} ++ ++int32x2_t test_vreinterpret_s32_bf16 (bfloat16x4_t a, int32x2_t b) ++{ ++ int32x2_t _a = vreinterpret_s32_bf16 (a); ++ return vadd_s32 (_a, b); ++} ++ ++int64x1_t test_vreinterpret_s64_bf16 (bfloat16x4_t a, int64x1_t b) ++{ ++ int64x1_t _a = vreinterpret_s64_bf16 (a); ++ return vrshl_s64 (_a, b); ++} ++ ++uint8x8_t test_vreinterpret_u8_bf16 (bfloat16x4_t a, uint8x8_t b) ++{ ++ uint8x8_t _a = vreinterpret_u8_bf16 (a); ++ return vadd_u8 (_a, b); ++} ++ ++uint16x4_t test_vreinterpret_u16_bf16 (bfloat16x4_t a, uint16x4_t b) ++{ ++ uint16x4_t _a = vreinterpret_u16_bf16 (a); ++ return vadd_u16 (_a, b); ++} ++ ++uint32x2_t test_vreinterpret_u32_bf16 (bfloat16x4_t a, uint32x2_t b) ++{ ++ uint32x2_t _a = vreinterpret_u32_bf16 (a); ++ return vadd_u32 (_a, b); ++} ++ ++uint64x1_t test_vreinterpret_u64_bf16 (bfloat16x4_t a, int64x1_t b) ++{ ++ uint64x1_t _a = vreinterpret_u64_bf16 (a); ++ return vrshl_u64 (_a, b); ++} ++ ++poly8x8_t test_vreinterpret_p8_bf16 (bfloat16x4_t a, poly8x8_t b) ++{ ++ poly8x8_t _a = vreinterpret_p8_bf16 (a); ++ return vzip1_p8 (_a, b); ++} ++ ++poly16x4_t test_vreinterpret_p16_bf16 (bfloat16x4_t a, poly16x4_t b) ++{ ++ poly16x4_t _a = vreinterpret_p16_bf16 (a); ++ return vzip1_p16 (_a, b); ++} ++ ++poly64x1_t test_vreinterpret_p64_bf16 (bfloat16x4_t a, poly64x1_t b) ++{ ++ poly64x1_t _a = vreinterpret_p64_bf16 (a); ++ return vsli_n_p64 (_a, b, 3); ++} ++ ++float32x2_t test_vreinterpret_f32_bf16 (bfloat16x4_t a, float32x2_t b) ++{ ++ float32x2_t _a = vreinterpret_f32_bf16 (a); ++ return vsub_f32 (_a, b); ++} ++ ++float64x1_t test_vreinterpret_f64_bf16 (bfloat16x4_t a, float64x1_t b) ++{ ++ float64x1_t _a = vreinterpret_f64_bf16 (a); ++ return vsub_f64 (_a, b); ++} ++ ++int8x16_t test_vreinterpretq_s8_bf16 (bfloat16x8_t a, int8x16_t b) ++{ ++ int8x16_t _a = vreinterpretq_s8_bf16 (a); ++ return vaddq_s8 (_a, b); ++} ++ ++int16x8_t test_vreinterpretq_s16_bf16 (bfloat16x8_t a, int16x8_t b) ++{ ++ int16x8_t _a = vreinterpretq_s16_bf16 (a); ++ return vaddq_s16 (_a, b); ++} ++ ++int32x4_t test_vreinterpretq_s32_bf16 (bfloat16x8_t a, int32x4_t b) ++{ ++ int32x4_t _a = vreinterpretq_s32_bf16 (a); ++ return vaddq_s32 (_a, b); ++} ++ ++int64x2_t test_vreinterpretq_s64_bf16 (bfloat16x8_t a, int64x2_t b) ++{ ++ int64x2_t _a = vreinterpretq_s64_bf16 (a); ++ return vaddq_s64 (_a, b); ++} ++ ++uint8x16_t test_vreinterpretq_u8_bf16 (bfloat16x8_t a, uint8x16_t b) ++{ ++ uint8x16_t _a = vreinterpretq_u8_bf16 (a); ++ return vaddq_u8 (_a, b); ++} ++ ++uint16x8_t test_vreinterpretq_u16_bf16 (bfloat16x8_t a, uint16x8_t b) ++{ ++ uint16x8_t _a = vreinterpretq_u16_bf16 (a); ++ return vaddq_u16 (_a, b); ++} ++ ++uint32x4_t test_vreinterpretq_u32_bf16 
(bfloat16x8_t a, uint32x4_t b) ++{ ++ uint32x4_t _a = vreinterpretq_u32_bf16 (a); ++ return vaddq_u32 (_a, b); ++} ++ ++uint64x2_t test_vreinterpretq_u64_bf16 (bfloat16x8_t a, uint64x2_t b) ++{ ++ uint64x2_t _a = vreinterpretq_u64_bf16 (a); ++ return vaddq_u64 (_a, b); ++} ++ ++poly8x16_t test_vreinterpretq_p8_bf16 (bfloat16x8_t a, poly8x16_t b) ++{ ++ poly8x16_t _a = vreinterpretq_p8_bf16 (a); ++ return vzip1q_p8 (_a, b); ++} ++ ++poly16x8_t test_vreinterpretq_p16_bf16 (bfloat16x8_t a, poly16x8_t b) ++{ ++ poly16x8_t _a = vreinterpretq_p16_bf16 (a); ++ return vzip1q_p16 (_a, b); ++} ++ ++poly64x2_t test_vreinterpretq_p64_bf16 (bfloat16x8_t a, poly64x2_t b) ++{ ++ poly64x2_t _a = vreinterpretq_p64_bf16 (a); ++ return vsliq_n_p64 (_a, b, 3); ++} ++ ++poly128_t test_vreinterpretq_p128_bf16 (bfloat16x8_t a, poly16x8_t b) ++{ ++ poly128_t _a = vreinterpretq_p128_bf16 (a); ++ return _a; ++} ++ ++float32x4_t test_vreinterpretq_f32_bf16 (bfloat16x8_t a, float32x4_t b) ++{ ++ float32x4_t _a = vreinterpretq_f32_bf16 (a); ++ return vsubq_f32 (_a, b); ++} ++ ++float64x2_t test_vreinterpretq_f64_bf16 (bfloat16x8_t a, float64x2_t b) ++{ ++ float64x2_t _a = vreinterpretq_f64_bf16 (a); ++ return vsubq_f64 (_a, b); ++} ++ ++float16x4_t test_vreinterpret_f16_bf16 (bfloat16x4_t a) ++{ ++ return vreinterpret_f16_bf16 (a); ++} ++ ++float16x8_t test_vreinterpretq_f16_bf16 (bfloat16x8_t a) ++{ ++ return vreinterpretq_f16_bf16 (a); ++} ++ ++/* { dg-final { scan-assembler-times {add\tv[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s} 2 } } */ ++/* { dg-final { scan-assembler-times {add\tv[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h} 2 } } */ ++/* { dg-final { scan-assembler-times {add\tv[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b} 2 } } */ ++ ++/* { dg-final { scan-assembler-times {add\tv[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s} 2 } } */ ++/* { dg-final { scan-assembler-times {add\tv[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h} 2 } } */ ++/* { dg-final { scan-assembler-times {add\tv[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b} 2 } } */ ++ ++/* { dg-final { scan-assembler {fsub\tv[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s} } } */ ++/* { dg-final { scan-assembler {fsub\tv[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s} } } */ ++/* { dg-final { scan-assembler {fsub\tv[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d} } } */ ++/* { dg-final { scan-assembler {fsub\td[0-9]+, d[0-9]+, d[0-9]+} } } */ ++ ++/* { dg-final { scan-assembler {zip1\tv[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b} } } */ ++/* { dg-final { scan-assembler {zip1\tv[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b} } } */ ++/* { dg-final { scan-assembler {zip1\tv[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h} } } */ ++/* { dg-final { scan-assembler {zip1\tv[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h} } } */ ++ ++/* { dg-final { scan-assembler {sli\tv[0-9]+.2d, v[0-9]+.2d, 3} } } */ ++/* { dg-final { scan-assembler {sli\td[0-9]+, d[0-9]+, 3} } } */ ++ ++/* { dg-final { scan-assembler {urshl\td[0-9]+, d[0-9]+, d[0-9]+} } } */ ++/* { dg-final { scan-assembler {srshl\td[0-9]+, d[0-9]+, d[0-9]+} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vldn.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vldn.c +new file mode 100644 +index 000000000..cf245091a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vldn.c +@@ -0,0 +1,150 @@ ++/* { dg-do assemble { target { aarch64*-*-* } } } */ ++/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ ++/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ ++/* { dg-add-options arm_v8_2a_bf16_neon } */ ++ ++#include ++ ++bfloat16x4_t ++test_vld1_dup_bf16 (bfloat16_t * ptr) ++{ ++ 
return vld1_dup_bf16 (ptr); ++} ++ ++bfloat16x8_t ++test_vld1q_dup_bf16 (bfloat16_t * ptr) ++{ ++ return vld1q_dup_bf16 (ptr); ++} ++ ++bfloat16x4_t ++test_vld1_lane_bf16 (bfloat16_t * ptr, bfloat16x4_t src) ++{ ++ return vld1_lane_bf16 (ptr, src, 3); ++} ++ ++bfloat16x8_t ++test_vld1q_lane_bf16 (bfloat16_t * ptr, bfloat16x8_t src) ++{ ++ return vld1q_lane_bf16 (ptr, src, 7); ++} ++ ++bfloat16x4_t ++test_vld1_bf16 (bfloat16_t * ptr) ++{ ++ return vld1_bf16 (ptr); ++} ++ ++bfloat16x8_t ++test_vld1q_bf16 (bfloat16_t * ptr) ++{ ++ return vld1q_bf16 (ptr); ++} ++ ++bfloat16x4x2_t ++test_vld1_bf16_x2 (bfloat16_t * ptr) ++{ ++ return vld1_bf16_x2 (ptr); ++} ++ ++bfloat16x8x2_t ++test_vld1q_bf16_x2 (bfloat16_t * ptr) ++{ ++ return vld1q_bf16_x2 (ptr); ++} ++ ++bfloat16x4x3_t ++test_vld1_bf16_x3 (bfloat16_t * ptr) ++{ ++ return vld1_bf16_x3 (ptr); ++} ++ ++bfloat16x8x3_t ++test_vld1q_bf16_x3 (bfloat16_t * ptr) ++{ ++ return vld1q_bf16_x3 (ptr); ++} ++ ++bfloat16x4x4_t ++test_vld1_bf16_x4 (bfloat16_t * ptr) ++{ ++ return vld1_bf16_x4 (ptr); ++} ++ ++bfloat16x8x4_t ++test_vld1q_bf16_x4 (bfloat16_t * ptr) ++{ ++ return vld1q_bf16_x4 (ptr); ++} ++ ++bfloat16x4x2_t ++test_vld2_bf16 (bfloat16_t * ptr) ++{ ++ return vld2_bf16 (ptr); ++} ++ ++bfloat16x8x2_t ++test_vld2q_bf16 (bfloat16_t * ptr) ++{ ++ return vld2q_bf16 (ptr); ++} ++ ++bfloat16x4x2_t ++test_vld2_dup_bf16 (bfloat16_t * ptr) ++{ ++ return vld2_dup_bf16 (ptr); ++} ++ ++bfloat16x8x2_t ++test_vld2q_dup_bf16 (bfloat16_t * ptr) ++{ ++ return vld2q_dup_bf16 (ptr); ++} ++ ++bfloat16x4x3_t ++test_vld3_bf16 (bfloat16_t * ptr) ++{ ++ return vld3_bf16 (ptr); ++} ++ ++bfloat16x8x3_t ++test_vld3q_bf16 (bfloat16_t * ptr) ++{ ++ return vld3q_bf16 (ptr); ++} ++ ++bfloat16x4x3_t ++test_vld3_dup_bf16 (bfloat16_t * ptr) ++{ ++ return vld3_dup_bf16 (ptr); ++} ++ ++bfloat16x8x3_t ++test_vld3q_dup_bf16 (bfloat16_t * ptr) ++{ ++ return vld3q_dup_bf16 (ptr); ++} ++ ++bfloat16x4x4_t ++test_vld4_bf16 (bfloat16_t * ptr) ++{ ++ return vld4_bf16 (ptr); ++} ++ ++bfloat16x8x4_t ++test_vld4q_bf16 (bfloat16_t * ptr) ++{ ++ return vld4q_bf16 (ptr); ++} ++ ++bfloat16x4x4_t ++test_vld4_dup_bf16 (bfloat16_t * ptr) ++{ ++ return vld4_dup_bf16 (ptr); ++} ++ ++bfloat16x8x4_t ++test_vld4q_dup_bf16 (bfloat16_t * ptr) ++{ ++ return vld4q_dup_bf16 (ptr); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vstn.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vstn.c +new file mode 100644 +index 000000000..162b3ee36 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vstn.c +@@ -0,0 +1,107 @@ ++/* { dg-do assemble { target { aarch64*-*-* } } } */ ++/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ ++/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ ++/* { dg-add-options arm_v8_2a_bf16_neon } */ ++ ++#include ++ ++void ++test_vst1_bf16_x2 (bfloat16_t *ptr, bfloat16x4x2_t val) ++{ ++ vst1_bf16_x2 (ptr, val); ++} ++ ++void ++test_vst1q_bf16_x2 (bfloat16_t *ptr, bfloat16x8x2_t val) ++{ ++ vst1q_bf16_x2 (ptr, val); ++} ++ ++void ++test_vst1_bf16_x3 (bfloat16_t *ptr, bfloat16x4x3_t val) ++{ ++ vst1_bf16_x3 (ptr, val); ++} ++ ++void ++test_vst1q_bf16_x3 (bfloat16_t *ptr, bfloat16x8x3_t val) ++{ ++ vst1q_bf16_x3 (ptr, val); ++} ++ ++void ++test_vst1_bf16_x4 (bfloat16_t *ptr, bfloat16x4x4_t val) ++{ ++ vst1_bf16_x4 (ptr, val); ++} ++ ++void ++test_vst1q_bf16_x4 (bfloat16_t *ptr, bfloat16x8x4_t val) ++{ ++ vst1q_bf16_x4 (ptr, val); ++} ++ ++void ++test_vst1_lane_bf16 (bfloat16_t *ptr, bfloat16x4_t val) ++{ ++ 
vst1_lane_bf16 (ptr, val, 3); ++} ++ ++void ++test_vst1q_lane_bf16 (bfloat16_t *ptr, bfloat16x8_t val) ++{ ++ vst1q_lane_bf16 (ptr, val, 7); ++} ++ ++void ++test_vst1_bf16 (bfloat16_t *ptr, bfloat16x4_t val) ++{ ++ vst1_bf16 (ptr, val); ++} ++ ++void ++test_vst1q_bf16 (bfloat16_t *ptr, bfloat16x8_t val) ++{ ++ vst1q_bf16 (ptr, val); ++} ++ ++void ++test_vst2_bf16 (bfloat16_t *ptr, bfloat16x4x2_t val) ++{ ++ vst2_bf16 (ptr, val); ++} ++ ++void ++test_vst2q_bf16 (bfloat16_t *ptr, bfloat16x8x2_t val) ++{ ++ vst2q_bf16 (ptr, val); ++} ++ ++void ++test_vst3_bf16 (bfloat16_t *ptr, bfloat16x4x3_t val) ++{ ++ vst3_bf16 (ptr, val); ++} ++ ++void ++test_vst3q_bf16 (bfloat16_t *ptr, bfloat16x8x3_t val) ++{ ++ vst3q_bf16 (ptr, val); ++} ++ ++void ++test_vst4_bf16 (bfloat16_t *ptr, bfloat16x4x4_t val) ++{ ++ vst4_bf16 (ptr, val); ++} ++ ++void ++test_vst4q_bf16 (bfloat16_t *ptr, bfloat16x8x4_t val) ++{ ++ vst4q_bf16 (ptr, val); ++} ++ ++int main() ++{ ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvt-compile.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvt-compile.c +new file mode 100644 +index 000000000..bbea630b1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvt-compile.c +@@ -0,0 +1,48 @@ ++/* { dg-do assemble { target { aarch64*-*-* } } } */ ++/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ ++/* { dg-add-options arm_v8_2a_bf16_neon } */ ++/* { dg-additional-options "-save-temps" } */ ++/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */ ++/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ ++ ++#include ++ ++/* ++**test_bfcvtn: ++** bfcvtn v0.4h, v0.4s ++** ret ++*/ ++bfloat16x4_t test_bfcvtn (float32x4_t a) ++{ ++ return vcvt_bf16_f32 (a); ++} ++ ++/* ++**test_bfcvtnq: ++** bfcvtn v0.4h, v0.4s ++** ret ++*/ ++bfloat16x8_t test_bfcvtnq (float32x4_t a) ++{ ++ return vcvtq_low_bf16_f32 (a); ++} ++ ++/* ++**test_bfcvtnq2: ++** bfcvtn2 v0.8h, v1.4s ++** ret ++*/ ++bfloat16x8_t test_bfcvtnq2 (bfloat16x8_t inactive, float32x4_t a) ++{ ++ return vcvtq_high_bf16_f32 (inactive, a); ++} ++ ++/* ++**test_bfcvt: ++** bfcvt h0, s0 ++** ret ++*/ ++bfloat16_t test_bfcvt (float32_t a) ++{ ++ return vcvth_bf16_f32 (a); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvt-nobf16.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvt-nobf16.c +new file mode 100644 +index 000000000..9904d65f9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvt-nobf16.c +@@ -0,0 +1,10 @@ ++/* { dg-do assemble { target { aarch64*-*-* } } } */ ++/* { dg-additional-options "-march=armv8.2-a+nobf16" } */ ++ ++#include ++ ++bfloat16_t test_bfcvt (float32_t a) ++{ ++ /* { dg-error "inlining failed .* 'vcvth_bf16_f32" "" { target *-*-* } 0 } */ ++ return vcvth_bf16_f32 (a); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvt-nosimd.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvt-nosimd.c +new file mode 100644 +index 000000000..a91468093 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvt-nosimd.c +@@ -0,0 +1,17 @@ ++/* { dg-do assemble { target { aarch64*-*-* } } } */ ++/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ ++/* { dg-require-effective-target aarch64_asm_bf16_ok } */ ++/* { dg-additional-options "-save-temps -march=armv8.2-a+bf16+nosimd" } */ ++/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */ ++ ++#include ++ ++/* ++**test_bfcvt: ++** bfcvt h0, s0 ++** ret ++*/ ++bfloat16_t 
test_bfcvt (float32_t a) ++{ ++ return vcvth_bf16_f32 (a); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvtn-nobf16.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvtn-nobf16.c +new file mode 100644 +index 000000000..b3b6db123 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvtn-nobf16.c +@@ -0,0 +1,10 @@ ++/* { dg-do assemble { target { aarch64*-*-* } } } */ ++/* { dg-additional-options "-march=armv8.2-a+nobf16" } */ ++ ++#include ++ ++bfloat16x4_t test_bfcvtn (float32x4_t a) ++{ ++ /* { dg-error "inlining failed .* 'vcvt_bf16_f32" "" { target *-*-* } 0 } */ ++ return vcvt_bf16_f32 (a); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvtnq2-untied.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvtnq2-untied.c +new file mode 100644 +index 000000000..4b730e39d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvtnq2-untied.c +@@ -0,0 +1,20 @@ ++/* { dg-do assemble { target { aarch64*-*-* } } } */ ++/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ ++/* { dg-add-options arm_v8_2a_bf16_neon } */ ++/* { dg-additional-options "-save-temps" } */ ++/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */ ++/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ ++ ++#include ++ ++/* ++**test_bfcvtnq2_untied: ++** mov v0.16b, v1.16b ++** bfcvtn2 v0.8h, v2.4s ++** ret ++*/ ++bfloat16x8_t test_bfcvtnq2_untied (bfloat16x8_t unused, bfloat16x8_t inactive, ++ float32x4_t a) ++{ ++ return vcvtq_high_bf16_f32 (inactive, a); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-1.c +new file mode 100755 +index 000000000..ad5150773 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-1.c +@@ -0,0 +1,91 @@ ++/* { dg-do assemble { target { aarch64*-*-* } } } */ ++/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ ++/* { dg-add-options arm_v8_2a_bf16_neon } */ ++/* { dg-additional-options "-save-temps" } */ ++/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */ ++/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ ++ ++#include ++ ++/* ++**ufoo: ++** bfdot v0.2s, (v1.4h, v2.4h|v2.4h, v1.4h) ++** ret ++*/ ++float32x2_t ufoo(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) ++{ ++ return vbfdot_f32 (r, x, y); ++} ++ ++/* ++**ufooq: ++** bfdot v0.4s, (v1.8h, v2.8h|v2.8h, v1.8h) ++** ret ++*/ ++float32x4_t ufooq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) ++{ ++ return vbfdotq_f32 (r, x, y); ++} ++ ++/* ++**ufoo_lane: ++** bfdot v0.2s, v1.4h, v2.2h\[0\] ++** ret ++*/ ++float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) ++{ ++ return vbfdot_lane_f32 (r, x, y, 0); ++} ++ ++/* ++**ufooq_laneq: ++** bfdot v0.4s, v1.8h, v2.2h\[2\] ++** ret ++*/ ++float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) ++{ ++ return vbfdotq_laneq_f32 (r, x, y, 2); ++} ++ ++/* ++**ufoo_laneq: ++** bfdot v0.2s, v1.4h, v2.2h\[3\] ++** ret ++*/ ++float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y) ++{ ++ return vbfdot_laneq_f32 (r, x, y, 3); ++} ++ ++/* ++**ufooq_lane: ++** bfdot v0.4s, v1.8h, v2.2h\[1\] ++** ret ++*/ ++float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y) ++{ ++ return vbfdotq_lane_f32 (r, x, y, 1); ++} ++ ++/* ++**ufoo_untied: ++** mov v0.8b, v1.8b ++** bfdot v0.2s, (v2.4h, v3.4h|v3.4h, v2.4h) ++** ret ++*/ ++float32x2_t ufoo_untied(float32x4_t unused, float32x2_t r, 
bfloat16x4_t x, bfloat16x4_t y) ++{ ++ return vbfdot_f32 (r, x, y); ++} ++ ++/* ++**ufooq_lane_untied: ++** mov v0.16b, v1.16b ++** bfdot v0.4s, v2.8h, v3.2h\[1\] ++** ret ++*/ ++float32x4_t ufooq_lane_untied(float32x4_t unused, float32x4_t r, bfloat16x8_t x, bfloat16x4_t y) ++{ ++ return vbfdotq_lane_f32 (r, x, y, 1); ++} ++ +diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-2.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-2.c +new file mode 100755 +index 000000000..58bdee5ac +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-2.c +@@ -0,0 +1,91 @@ ++/* { dg-do assemble { target { aarch64*-*-* } } } */ ++/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ ++/* { dg-add-options arm_v8_2a_bf16_neon } */ ++/* { dg-additional-options "-mbig-endian --save-temps" } */ ++/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */ ++/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ ++ ++#include ++ ++/* ++**ufoo: ++** bfdot v0.2s, (v1.4h, v2.4h|v2.4h, v1.4h) ++** ret ++*/ ++float32x2_t ufoo(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) ++{ ++ return vbfdot_f32 (r, x, y); ++} ++ ++/* ++**ufooq: ++** bfdot v0.4s, (v1.8h, v2.8h|v2.8h, v1.8h) ++** ret ++*/ ++float32x4_t ufooq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) ++{ ++ return vbfdotq_f32 (r, x, y); ++} ++ ++/* ++**ufoo_lane: ++** bfdot v0.2s, v1.4h, v2.2h\[0\] ++** ret ++*/ ++float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) ++{ ++ return vbfdot_lane_f32 (r, x, y, 0); ++} ++ ++/* ++**ufooq_laneq: ++** bfdot v0.4s, v1.8h, v2.2h\[2\] ++** ret ++*/ ++float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) ++{ ++ return vbfdotq_laneq_f32 (r, x, y, 2); ++} ++ ++/* ++**ufoo_laneq: ++** bfdot v0.2s, v1.4h, v2.2h\[3\] ++** ret ++*/ ++float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y) ++{ ++ return vbfdot_laneq_f32 (r, x, y, 3); ++} ++ ++/* ++**ufooq_lane: ++** bfdot v0.4s, v1.8h, v2.2h\[1\] ++** ret ++*/ ++float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y) ++{ ++ return vbfdotq_lane_f32 (r, x, y, 1); ++} ++ ++/* ++**ufoo_untied: ++** mov v0.8b, v1.8b ++** bfdot v0.2s, (v2.4h, v3.4h|v3.4h, v2.4h) ++** ret ++*/ ++float32x2_t ufoo_untied(float32x4_t unused, float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) ++{ ++ return vbfdot_f32 (r, x, y); ++} ++ ++/* ++**ufooq_lane_untied: ++** mov v0.16b, v1.16b ++** bfdot v0.4s, v2.8h, v3.2h\[1\] ++** ret ++*/ ++float32x4_t ufooq_lane_untied(float32x4_t unused, float32x4_t r, bfloat16x8_t x, bfloat16x4_t y) ++{ ++ return vbfdotq_lane_f32 (r, x, y, 1); ++} ++ +diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-3.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-3.c +new file mode 100755 +index 000000000..607126203 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-3.c +@@ -0,0 +1,28 @@ ++/* { dg-do assemble { target { aarch64*-*-* } } } */ ++/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ ++/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ ++/* { dg-add-options arm_v8_2a_bf16_neon } */ ++/* { dg-additional-options "--save-temps" } */ ++ ++#include ++ ++float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) ++{ ++ return vbfdot_lane_f32 (r, x, y, -1); /* { dg-error {lane -1 out of range 0 - 1} "" { target *-*-* } 0 } */ ++} ++ ++float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) ++{ ++ return vbfdotq_laneq_f32 (r, x, y, -1); /* { dg-error {lane 
-1 out of range 0 - 3} "" { target *-*-* } 0 } */ ++} ++ ++float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y) ++{ ++ return vbfdot_laneq_f32 (r, x, y, 4); /* { dg-error {lane 4 out of range 0 - 3} "" { target *-*-* } 0 } */ ++} ++ ++float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y) ++{ ++ return vbfdotq_lane_f32 (r, x, y, 2); /* { dg-error {lane 2 out of range 0 - 1} "" { target *-*-* } 0 } */ ++} ++ +diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfmlalbt-compile.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfmlalbt-compile.c +new file mode 100644 +index 000000000..9810e4ba3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfmlalbt-compile.c +@@ -0,0 +1,67 @@ ++/* { dg-do assemble { target { aarch64*-*-* } } } */ ++/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ ++/* { dg-add-options arm_v8_2a_bf16_neon } */ ++/* { dg-additional-options "-save-temps" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include ++ ++/* ++**test_bfmlalb: ++** bfmlalb v0.4s, v1.8h, v2.8h ++** ret ++*/ ++float32x4_t test_bfmlalb (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) ++{ ++ return vbfmlalbq_f32 (r, a, b); ++} ++ ++/* ++**test_bfmlalt: ++** bfmlalt v0.4s, v1.8h, v2.8h ++** ret ++*/ ++float32x4_t test_bfmlalt (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) ++{ ++ return vbfmlaltq_f32 (r, a, b); ++} ++ ++/* ++**test_bfmlalb_lane: ++** bfmlalb v0.4s, v1.8h, v2.h[0] ++** ret ++*/ ++float32x4_t test_bfmlalb_lane (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) ++{ ++ return vbfmlalbq_lane_f32 (r, a, b, 0); ++} ++ ++/* ++**test_bfmlalt_lane: ++** bfmlalt v0.4s, v1.8h, v2.h[2] ++** ret ++*/ ++float32x4_t test_bfmlalt_lane (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) ++{ ++ return vbfmlaltq_lane_f32 (r, a, b, 2); ++} ++ ++/* ++**test_bfmlalb_laneq: ++** bfmlalb v0.4s, v1.8h, v2.h[4] ++** ret ++*/ ++float32x4_t test_bfmlalb_laneq (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) ++{ ++ return vbfmlalbq_laneq_f32 (r, a, b, 4); ++} ++ ++/* ++**test_bfmlalt_laneq: ++** bfmlalt v0.4s, v1.8h, v2.h[7] ++** ret ++*/ ++float32x4_t test_bfmlalt_laneq (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) ++{ ++ return vbfmlaltq_laneq_f32 (r, a, b, 7); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfmmla-compile.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfmmla-compile.c +new file mode 100644 +index 000000000..0aaa69f00 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfmmla-compile.c +@@ -0,0 +1,18 @@ ++/* { dg-do assemble { target { aarch64*-*-* } } } */ ++/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ ++/* { dg-add-options arm_v8_2a_bf16_neon } */ ++/* { dg-additional-options "-save-temps" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include ++ ++ ++/* ++**test_bfmmla: ++** bfmmla v0.4s, v1.8h, v2.8h ++** ret ++*/ ++float32x4_t test_bfmmla (float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) ++{ ++ return vbfmmlaq_f32 (r, x, y); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vbfmlalbt_lane_f32_indices_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vbfmlalbt_lane_f32_indices_1.c +new file mode 100644 +index 000000000..4d50ba3a3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vbfmlalbt_lane_f32_indices_1.c +@@ -0,0 +1,46 @@ ++/* { dg-do compile { target { aarch64*-*-* } } } */ ++/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" 
} } */ ++/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ ++/* { dg-add-options arm_v8_2a_bf16_neon } */ ++ ++#include ++ ++void ++f_vbfmlaltq_lane_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) ++{ ++ /* { dg-error "lane -1 out of range 0 - 3" "" { target *-*-* } 0 } */ ++ vbfmlaltq_lane_f32 (r, a, b, -1); ++ /* { dg-error "lane 4 out of range 0 - 3" "" { target *-*-* } 0 } */ ++ vbfmlaltq_lane_f32 (r, a, b, 4); ++ return; ++} ++ ++void ++f_vbfmlaltq_laneq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) ++{ ++ /* { dg-error "lane -1 out of range 0 - 7" "" { target *-*-* } 0 } */ ++ vbfmlaltq_laneq_f32 (r, a, b, -1); ++ /* { dg-error "lane 8 out of range 0 - 7" "" { target *-*-* } 0 } */ ++ vbfmlaltq_laneq_f32 (r, a, b, 8); ++ return; ++} ++ ++void ++f_vbfmlalbq_lane_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) ++{ ++ /* { dg-error "lane -2 out of range 0 - 3" "" { target *-*-* } 0 } */ ++ vbfmlalbq_lane_f32 (r, a, b, -2); ++ /* { dg-error "lane 5 out of range 0 - 3" "" { target *-*-* } 0 } */ ++ vbfmlalbq_lane_f32 (r, a, b, 5); ++ return; ++} ++ ++void ++f_vbfmlalbq_laneq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) ++{ ++ /* { dg-error "lane -2 out of range 0 - 7" "" { target *-*-* } 0 } */ ++ vbfmlalbq_laneq_f32 (r, a, b, -2); ++ /* { dg-error "lane 9 out of range 0 - 7" "" { target *-*-* } 0 } */ ++ vbfmlalbq_laneq_f32 (r, a, b, 9); ++ return; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-3-1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-3-1.c +new file mode 100755 +index 000000000..ac4f821e7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-3-1.c +@@ -0,0 +1,136 @@ ++/* { dg-do assemble { target { aarch64*-*-* } } } */ ++/* { dg-require-effective-target arm_v8_2a_i8mm_ok } */ ++/* { dg-add-options arm_v8_2a_i8mm } */ ++/* { dg-additional-options "-save-temps" } */ ++/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */ ++/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ ++ ++#include ++ ++/* Unsigned-Signed Dot Product instructions. */ ++ ++/* ++**ufoo: ++** usdot v0\.2s, v1\.8b, v2\.8b ++** ret ++*/ ++int32x2_t ufoo (int32x2_t r, uint8x8_t x, int8x8_t y) ++{ ++ return vusdot_s32 (r, x, y); ++} ++ ++/* ++**ufooq: ++** usdot v0\.4s, v1\.16b, v2\.16b ++** ret ++*/ ++int32x4_t ufooq (int32x4_t r, uint8x16_t x, int8x16_t y) ++{ ++ return vusdotq_s32 (r, x, y); ++} ++ ++/* ++**ufoo_lane: ++** usdot v0\.2s, v1\.8b, v2\.4b\[0\] ++** ret ++*/ ++int32x2_t ufoo_lane (int32x2_t r, uint8x8_t x, int8x8_t y) ++{ ++ return vusdot_lane_s32 (r, x, y, 0); ++} ++ ++/* ++**ufoo_laneq: ++** usdot v0\.2s, v1\.8b, v2\.4b\[2\] ++** ret ++*/ ++int32x2_t ufoo_laneq (int32x2_t r, uint8x8_t x, int8x16_t y) ++{ ++ return vusdot_laneq_s32 (r, x, y, 2); ++} ++ ++/* ++**ufooq_lane: ++** usdot v0\.4s, v1\.16b, v2\.4b\[1\] ++** ret ++*/ ++int32x4_t ufooq_lane (int32x4_t r, uint8x16_t x, int8x8_t y) ++{ ++ return vusdotq_lane_s32 (r, x, y, 1); ++} ++ ++/* ++**ufooq_laneq: ++** usdot v0\.4s, v1\.16b, v2\.4b\[3\] ++** ret ++*/ ++int32x4_t ufooq_laneq (int32x4_t r, uint8x16_t x, int8x16_t y) ++{ ++ return vusdotq_laneq_s32 (r, x, y, 3); ++} ++ ++ ++/* Signed-Unsigned Dot Product instructions. 
*/ ++ ++/* ++**sfoo_lane: ++** sudot v0\.2s, v1\.8b, v2\.4b\[0\] ++** ret ++*/ ++int32x2_t sfoo_lane (int32x2_t r, int8x8_t x, uint8x8_t y) ++{ ++ return vsudot_lane_s32 (r, x, y, 0); ++} ++ ++/* ++**sfoo_laneq: ++** sudot v0\.2s, v1\.8b, v2\.4b\[2\] ++** ret ++*/ ++int32x2_t sfoo_laneq (int32x2_t r, int8x8_t x, uint8x16_t y) ++{ ++ return vsudot_laneq_s32 (r, x, y, 2); ++} ++ ++/* ++**sfooq_lane: ++** sudot v0\.4s, v1\.16b, v2\.4b\[1\] ++** ret ++*/ ++int32x4_t sfooq_lane (int32x4_t r, int8x16_t x, uint8x8_t y) ++{ ++ return vsudotq_lane_s32 (r, x, y, 1); ++} ++ ++/* ++**sfooq_laneq: ++** sudot v0\.4s, v1\.16b, v2\.4b\[3\] ++** ret ++*/ ++int32x4_t sfooq_laneq (int32x4_t r, int8x16_t x, uint8x16_t y) ++{ ++ return vsudotq_laneq_s32 (r, x, y, 3); ++} ++ ++/* ++**ufoo_untied: ++** mov v0\.8b, v1\.8b ++** usdot v0\.2s, v2\.8b, v3\.8b ++** ret ++*/ ++int32x2_t ufoo_untied (int32x2_t unused, int32x2_t r, uint8x8_t x, int8x8_t y) ++{ ++ return vusdot_s32 (r, x, y); ++} ++ ++/* ++**ufooq_laneq_untied: ++** mov v0\.16b, v1\.16b ++** usdot v0\.4s, v2\.16b, v3\.4b\[3\] ++** ret ++*/ ++int32x4_t ufooq_laneq_untied (int32x2_t unused, int32x4_t r, uint8x16_t x, int8x16_t y) ++{ ++ return vusdotq_laneq_s32 (r, x, y, 3); ++} ++ +diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-3-2.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-3-2.c +new file mode 100755 +index 000000000..96bca2356 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-3-2.c +@@ -0,0 +1,137 @@ ++/* { dg-do assemble { target { aarch64*-*-* } } } */ ++/* { dg-require-effective-target arm_v8_2a_i8mm_ok } */ ++/* { dg-add-options arm_v8_2a_i8mm } */ ++/* { dg-additional-options "-mbig-endian -save-temps" } */ ++/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */ ++/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ ++ ++#include ++ ++/* Unsigned-Signed Dot Product instructions. */ ++ ++/* ++**ufoo: ++** usdot v0\.2s, v1\.8b, v2\.8b ++** ret ++*/ ++int32x2_t ufoo (int32x2_t r, uint8x8_t x, int8x8_t y) ++{ ++ return vusdot_s32 (r, x, y); ++} ++ ++/* ++**ufooq: ++** usdot v0\.4s, v1\.16b, v2\.16b ++** ret ++*/ ++int32x4_t ufooq (int32x4_t r, uint8x16_t x, int8x16_t y) ++{ ++ return vusdotq_s32 (r, x, y); ++} ++ ++/* ++**ufoo_lane: ++** usdot v0\.2s, v1\.8b, v2\.4b\[0\] ++** ret ++*/ ++int32x2_t ufoo_lane (int32x2_t r, uint8x8_t x, int8x8_t y) ++{ ++ return vusdot_lane_s32 (r, x, y, 0); ++} ++ ++/* ++**ufoo_laneq: ++** usdot v0\.2s, v1\.8b, v2\.4b\[2\] ++** ret ++*/ ++int32x2_t ufoo_laneq (int32x2_t r, uint8x8_t x, int8x16_t y) ++{ ++ return vusdot_laneq_s32 (r, x, y, 2); ++} ++ ++/* ++**ufooq_lane: ++** usdot v0\.4s, v1\.16b, v2\.4b\[1\] ++** ret ++*/ ++int32x4_t ufooq_lane (int32x4_t r, uint8x16_t x, int8x8_t y) ++{ ++ return vusdotq_lane_s32 (r, x, y, 1); ++} ++ ++/* ++**ufooq_laneq: ++** usdot v0\.4s, v1\.16b, v2\.4b\[3\] ++** ret ++*/ ++int32x4_t ufooq_laneq (int32x4_t r, uint8x16_t x, int8x16_t y) ++{ ++ return vusdotq_laneq_s32 (r, x, y, 3); ++} ++ ++ ++/* Signed-Unsigned Dot Product instructions. 
*/ ++ ++/* ++**sfoo_lane: ++** sudot v0\.2s, v1\.8b, v2\.4b\[0\] ++** ret ++*/ ++int32x2_t sfoo_lane (int32x2_t r, int8x8_t x, uint8x8_t y) ++{ ++ return vsudot_lane_s32 (r, x, y, 0); ++} ++ ++/* ++**sfoo_laneq: ++** sudot v0\.2s, v1\.8b, v2\.4b\[2\] ++** ret ++*/ ++int32x2_t sfoo_laneq (int32x2_t r, int8x8_t x, uint8x16_t y) ++{ ++ return vsudot_laneq_s32 (r, x, y, 2); ++} ++ ++/* ++**sfooq_lane: ++** sudot v0\.4s, v1\.16b, v2\.4b\[1\] ++** ret ++*/ ++int32x4_t sfooq_lane (int32x4_t r, int8x16_t x, uint8x8_t y) ++{ ++ return vsudotq_lane_s32 (r, x, y, 1); ++} ++ ++/* ++**sfooq_laneq: ++** sudot v0\.4s, v1\.16b, v2\.4b\[3\] ++** ret ++*/ ++int32x4_t sfooq_laneq (int32x4_t r, int8x16_t x, uint8x16_t y) ++{ ++ return vsudotq_laneq_s32 (r, x, y, 3); ++} ++ ++/* ++**ufoo_untied: ++** mov v0\.8b, v1\.8b ++** usdot v0\.2s, v2\.8b, v3\.8b ++** ret ++*/ ++int32x2_t ufoo_untied (int32x2_t unused, int32x2_t r, uint8x8_t x, int8x8_t y) ++{ ++ return vusdot_s32 (r, x, y); ++} ++ ++/* ++**ufooq_laneq_untied: ++** mov v0\.16b, v1\.16b ++** usdot v0\.4s, v2\.16b, v3\.4b\[3\] ++** ret ++*/ ++int32x4_t ufooq_laneq_untied (int32x2_t unused, int32x4_t r, uint8x16_t x, int8x16_t y) ++{ ++ return vusdotq_laneq_s32 (r, x, y, 3); ++} ++ ++ +diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-3-3.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-3-3.c +new file mode 100755 +index 000000000..18ecabef8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-3-3.c +@@ -0,0 +1,31 @@ ++/* { dg-do assemble { target { aarch64*-*-* } } } */ ++/* { dg-require-effective-target arm_v8_2a_i8mm_ok } */ ++/* { dg-add-options arm_v8_2a_i8mm } */ ++/* { dg-additional-options "--save-temps" } */ ++/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ ++ ++#include ++ ++int32x2_t ufoo_lane (int32x2_t r, uint8x8_t x, int8x8_t y) ++{ ++ /* { dg-error "lane -1 out of range 0 - 1" "" { target *-*-* } 0 } */ ++ return vusdot_lane_s32 (r, x, y, -1); ++} ++ ++int32x2_t ufoo_laneq (int32x2_t r, uint8x8_t x, int8x16_t y) ++{ ++ /* { dg-error "lane -1 out of range 0 - 3" "" { target *-*-* } 0 } */ ++ return vusdot_laneq_s32 (r, x, y, -1); ++} ++ ++int32x4_t ufooq_lane (int32x4_t r, uint8x16_t x, int8x8_t y) ++{ ++ /* { dg-error "lane 2 out of range 0 - 1" "" { target *-*-* } 0 } */ ++ return vusdotq_lane_s32 (r, x, y, 2); ++} ++ ++int32x4_t ufooq_laneq (int32x4_t r, uint8x16_t x, int8x16_t y) ++{ ++ /* { dg-error "lane 4 out of range 0 - 3" "" { target *-*-* } 0 } */ ++ return vusdotq_laneq_s32 (r, x, y, 4); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-3-4.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-3-4.c +new file mode 100755 +index 000000000..66c87d486 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-3-4.c +@@ -0,0 +1,31 @@ ++/* { dg-do assemble { target { aarch64*-*-* } } } */ ++/* { dg-require-effective-target arm_v8_2a_i8mm_ok } */ ++/* { dg-add-options arm_v8_2a_i8mm } */ ++/* { dg-additional-options "--save-temps" } */ ++/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ ++ ++#include ++ ++int32x2_t sfoo_lane (int32x2_t r, int8x8_t x, uint8x8_t y) ++{ ++ /* { dg-error "lane -1 out of range 0 - 1" "" { target *-*-* } 0 } */ ++ return vsudot_lane_s32 (r, x, y, -1); ++} ++ ++int32x2_t sfoo_laneq (int32x2_t r, int8x8_t x, uint8x16_t y) ++{ ++ /* { dg-error "lane -1 out of range 0 - 3" "" { target *-*-* } 0 } */ ++ return vsudot_laneq_s32 (r, x, y, -1); ++} ++ ++int32x4_t sfooq_lane (int32x4_t 
r, int8x16_t x, uint8x8_t y) ++{ ++ /* { dg-error "lane 2 out of range 0 - 1" "" { target *-*-* } 0 } */ ++ return vsudotq_lane_s32 (r, x, y, 2); ++} ++ ++int32x4_t sfooq_laneq (int32x4_t r, int8x16_t x, uint8x16_t y) ++{ ++ /* { dg-error "lane 4 out of range 0 - 3" "" { target *-*-* } 0 } */ ++ return vsudotq_laneq_s32 (r, x, y, 4); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x4.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x4.c +new file mode 100644 +index 000000000..451a0afc6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x4.c +@@ -0,0 +1,83 @@ ++/* We haven't implemented these intrinsics for arm yet. */ ++/* { dg-xfail-if "" { arm*-*-* } } */ ++/* { dg-do run } */ ++/* { dg-options "-O3" } */ ++ ++#include ++#include "arm-neon-ref.h" ++ ++extern void abort (void); ++ ++#define TESTMETH(BASE, ELTS, SUFFIX) \ ++int __attribute__ ((noinline)) \ ++test_vld1##SUFFIX##_x4 () \ ++{ \ ++ BASE##_t data[ELTS * 4]; \ ++ BASE##_t temp[ELTS * 4]; \ ++ BASE##x##ELTS##x##4##_t vectors; \ ++ int i,j; \ ++ for (i = 0; i < ELTS * 4; i++) \ ++ data [i] = (BASE##_t) 4*i; \ ++ asm volatile ("" : : : "memory"); \ ++ vectors = vld1##SUFFIX##_x4 (data); \ ++ vst1##SUFFIX (temp, vectors.val[0]); \ ++ vst1##SUFFIX (&temp[ELTS], vectors.val[1]); \ ++ vst1##SUFFIX (&temp[ELTS * 2], vectors.val[2]); \ ++ vst1##SUFFIX (&temp[ELTS * 3], vectors.val[3]); \ ++ asm volatile ("" : : : "memory"); \ ++ for (j = 0; j < ELTS * 4; j++) \ ++ if (temp[j] != data[j]) \ ++ return 1; \ ++ return 0; \ ++} ++ ++#define VARIANTS_1(VARIANT) \ ++VARIANT (uint8, 8, _u8) \ ++VARIANT (uint16, 4, _u16) \ ++VARIANT (uint32, 2, _u32) \ ++VARIANT (uint64, 1, _u64) \ ++VARIANT (int8, 8, _s8) \ ++VARIANT (int16, 4, _s16) \ ++VARIANT (int32, 2, _s32) \ ++VARIANT (int64, 1, _s64) \ ++VARIANT (poly8, 8, _p8) \ ++VARIANT (poly16, 4, _p16) \ ++VARIANT (poly64, 1, _p64) \ ++VARIANT (float16, 4, _f16) \ ++VARIANT (float32, 2, _f32) \ ++VARIANT (uint8, 16, q_u8) \ ++VARIANT (uint16, 8, q_u16) \ ++VARIANT (uint32, 4, q_u32) \ ++VARIANT (uint64, 2, q_u64) \ ++VARIANT (int8, 16, q_s8) \ ++VARIANT (int16, 8, q_s16) \ ++VARIANT (int32, 4, q_s32) \ ++VARIANT (int64, 2, q_s64) \ ++VARIANT (poly8, 16, q_p8) \ ++VARIANT (poly16, 8, q_p16) \ ++VARIANT (poly64, 2, q_p64) \ ++VARIANT (float16, 8, q_f16) \ ++VARIANT (float32, 4, q_f32) ++ ++#ifdef __aarch64__ ++#define VARIANTS(VARIANT) VARIANTS_1(VARIANT) \ ++VARIANT (float64, 1, _f64) \ ++VARIANT (float64, 2, q_f64) ++#else ++#define VARIANTS(VARIANT) VARIANTS_1(VARIANT) ++#endif ++ ++/* Tests of vld1_x4 and vld1q_x4. */ ++VARIANTS (TESTMETH) ++ ++#define CHECKS(BASE, ELTS, SUFFIX) \ ++ if (test_vld1##SUFFIX##_x4 () != 0) \ ++ fprintf (stderr, "test_vld1##SUFFIX##_x4"); ++ ++int ++main (int argc, char **argv) ++{ ++ VARIANTS (CHECKS) ++ ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x4.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x4.c +new file mode 100644 +index 000000000..1f17b5342 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x4.c +@@ -0,0 +1,83 @@ ++/* We haven't implemented these intrinsics for arm yet. 
*/ ++/* { dg-xfail-if "" { arm*-*-* } } */ ++/* { dg-do run } */ ++/* { dg-options "-O3" } */ ++ ++#include ++#include "arm-neon-ref.h" ++ ++extern void abort (void); ++ ++#define TESTMETH(BASE, ELTS, SUFFIX) \ ++int __attribute__ ((noinline)) \ ++test_vst1##SUFFIX##_x4 () \ ++{ \ ++ BASE##_t data[ELTS * 4]; \ ++ BASE##_t temp[ELTS * 4]; \ ++ BASE##x##ELTS##x##4##_t vectors; \ ++ int i,j; \ ++ for (i = 0; i < ELTS * 4; i++) \ ++ data [i] = (BASE##_t) 4*i; \ ++ asm volatile ("" : : : "memory"); \ ++ vectors.val[0] = vld1##SUFFIX (data); \ ++ vectors.val[1] = vld1##SUFFIX (&data[ELTS]); \ ++ vectors.val[2] = vld1##SUFFIX (&data[ELTS * 2]); \ ++ vectors.val[3] = vld1##SUFFIX (&data[ELTS * 3]); \ ++ vst1##SUFFIX##_x4 (temp, vectors); \ ++ asm volatile ("" : : : "memory"); \ ++ for (j = 0; j < ELTS * 4; j++) \ ++ if (temp[j] != data[j]) \ ++ return 1; \ ++ return 0; \ ++} ++ ++#define VARIANTS_1(VARIANT) \ ++VARIANT (uint8, 8, _u8) \ ++VARIANT (uint16, 4, _u16) \ ++VARIANT (uint32, 2, _u32) \ ++VARIANT (uint64, 1, _u64) \ ++VARIANT (int8, 8, _s8) \ ++VARIANT (int16, 4, _s16) \ ++VARIANT (int32, 2, _s32) \ ++VARIANT (int64, 1, _s64) \ ++VARIANT (poly8, 8, _p8) \ ++VARIANT (poly16, 4, _p16) \ ++VARIANT (poly64, 1, _p64) \ ++VARIANT (float16, 4, _f16) \ ++VARIANT (float32, 2, _f32) \ ++VARIANT (uint8, 16, q_u8) \ ++VARIANT (uint16, 8, q_u16) \ ++VARIANT (uint32, 4, q_u32) \ ++VARIANT (uint64, 2, q_u64) \ ++VARIANT (int8, 16, q_s8) \ ++VARIANT (int16, 8, q_s16) \ ++VARIANT (int32, 4, q_s32) \ ++VARIANT (int64, 2, q_s64) \ ++VARIANT (poly8, 16, q_p8) \ ++VARIANT (poly16, 8, q_p16) \ ++VARIANT (poly64, 2, q_p64) \ ++VARIANT (float16, 8, q_f16) \ ++VARIANT (float32, 4, q_f32) ++ ++#ifdef __aarch64__ ++#define VARIANTS(VARIANT) VARIANTS_1(VARIANT) \ ++VARIANT (float64, 1, _f64) \ ++VARIANT (float64, 2, q_f64) ++#else ++#define VARIANTS(VARIANT) VARIANTS_1(VARIANT) ++#endif ++ ++/* Tests of vst1_x4 and vst1q_x4. 
*/ ++VARIANTS (TESTMETH) ++ ++#define CHECKS(BASE, ELTS, SUFFIX) \ ++ if (test_vst1##SUFFIX##_x4 () != 0) \ ++ fprintf (stderr, "test_vst1##SUFFIX##_x4"); ++ ++int ++main (int argc, char **argv) ++{ ++ VARIANTS (CHECKS) ++ ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/crypto-fuse-1.c b/gcc/testsuite/gcc.target/aarch64/aes-fuse-1.c +similarity index 51% +rename from gcc/testsuite/gcc.target/aarch64/crypto-fuse-1.c +rename to gcc/testsuite/gcc.target/aarch64/aes-fuse-1.c +index d8adc8946..d7b4f8991 100644 +--- a/gcc/testsuite/gcc.target/aarch64/crypto-fuse-1.c ++++ b/gcc/testsuite/gcc.target/aarch64/aes-fuse-1.c +@@ -1,45 +1,66 @@ + /* { dg-do compile } */ + /* { dg-options "-O3 -mcpu=cortex-a72+crypto -dp" } */ ++/* { dg-additional-options "-march=armv8-a+crypto" { target { aarch64*-*-* } } }*/ + + #include + + #define AESE(r, v, key) (r = vaeseq_u8 ((v), (key))); + #define AESMC(r, i) (r = vaesmcq_u8 (i)) + ++const uint8x16_t zero = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; ++ + uint8x16_t dummy; + uint8x16_t a; + uint8x16_t b; + uint8x16_t c; + uint8x16_t d; +-uint8x16_t e; ++uint8x16_t x; ++uint8x16_t y; ++uint8x16_t k; ++ ++void foo (void) + +-void +-foo (void) + { +- AESE (a, a, e); ++ AESE (a, a, k); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); +- AESE (b, b, e); ++ AESE (b, b, k); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); +- AESE (c, c, e); ++ AESE (c, c, k); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); +- AESE (d, d, e); ++ AESE (d, d, k); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + +- AESMC (a, a); ++ x = x ^ k; ++ AESE (x, x, zero); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); +- AESMC (b, b); ++ y = y ^ k; ++ AESE (y, y, zero); ++ dummy = vaddq_u8 (dummy, dummy); ++ dummy = vaddq_u8 (dummy, dummy); ++ ++ AESMC (d, d); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + AESMC (c, c); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); +- AESMC (d, d); +-} ++ AESMC (b, b); ++ dummy = vaddq_u8 (dummy, dummy); ++ dummy = vaddq_u8 (dummy, dummy); ++ AESMC (a, a); ++ dummy = vaddq_u8 (dummy, dummy); ++ dummy = vaddq_u8 (dummy, dummy); + +-/* { dg-final { scan-assembler-times "crypto_aese_fused" 4 } } */ ++ AESMC (y, y); ++ dummy = vaddq_u8 (dummy, dummy); ++ dummy = vaddq_u8 (dummy, dummy); ++ AESMC (x, x); ++} + ++/* { dg-final { scan-assembler-times "crypto_aese_fused" 6 } } */ ++/* { dg-final { scan-assembler-not "veor" } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/aes-fuse-2.c b/gcc/testsuite/gcc.target/aarch64/aes-fuse-2.c +new file mode 100644 +index 000000000..dfe01b03a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/aes-fuse-2.c +@@ -0,0 +1,65 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O3 -mcpu=cortex-a72+crypto -dp" } */ ++/* { dg-additional-options "-march=armv8-a+crypto" { target { aarch64*-*-* } } }*/ ++ ++#include ++ ++#define AESD(r, v, key) (r = vaesdq_u8 ((v), (key))); ++#define AESIMC(r, i) (r = vaesimcq_u8 (i)) ++ ++const uint8x16_t zero = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; ++ ++uint8x16_t dummy; ++uint8x16_t a; ++uint8x16_t b; ++uint8x16_t c; ++uint8x16_t d; ++uint8x16_t x; ++uint8x16_t y; ++uint8x16_t k; ++ ++void foo (void) ++{ ++ AESD (a, a, k); ++ dummy = vaddq_u8 (dummy, dummy); ++ dummy = vaddq_u8 (dummy, dummy); ++ AESD (b, b, k); ++ dummy = vaddq_u8 (dummy, dummy); ++ dummy = vaddq_u8 (dummy, dummy); ++ AESD (c, c, k); ++ dummy = 
vaddq_u8 (dummy, dummy); ++ dummy = vaddq_u8 (dummy, dummy); ++ AESD (d, d, k); ++ dummy = vaddq_u8 (dummy, dummy); ++ dummy = vaddq_u8 (dummy, dummy); ++ ++ x = x ^ k; ++ AESD (x, x, zero); ++ dummy = vaddq_u8 (dummy, dummy); ++ dummy = vaddq_u8 (dummy, dummy); ++ y = y ^ k; ++ AESD (y, y, zero); ++ dummy = vaddq_u8 (dummy, dummy); ++ dummy = vaddq_u8 (dummy, dummy); ++ ++ AESIMC (d, d); ++ dummy = vaddq_u8 (dummy, dummy); ++ dummy = vaddq_u8 (dummy, dummy); ++ AESIMC (c, c); ++ dummy = vaddq_u8 (dummy, dummy); ++ dummy = vaddq_u8 (dummy, dummy); ++ AESIMC (b, b); ++ dummy = vaddq_u8 (dummy, dummy); ++ dummy = vaddq_u8 (dummy, dummy); ++ AESIMC (a, a); ++ dummy = vaddq_u8 (dummy, dummy); ++ dummy = vaddq_u8 (dummy, dummy); ++ ++ AESIMC (y, y); ++ dummy = vaddq_u8 (dummy, dummy); ++ dummy = vaddq_u8 (dummy, dummy); ++ AESIMC (x, x); ++} ++ ++/* { dg-final { scan-assembler-times "crypto_aesd_fused" 6 } } */ ++/* { dg-final { scan-assembler-not "veor" } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/asm-x-constraint-1.c b/gcc/testsuite/gcc.target/aarch64/asm-x-constraint-1.c +new file mode 100644 +index 000000000..a71043be5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/asm-x-constraint-1.c +@@ -0,0 +1,34 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O" } */ ++ ++void ++f (void) ++{ ++ register float s0 asm ("s0"); ++ register float s7 asm ("s7"); ++ register float s8 asm ("s8"); ++ register float s15 asm ("s15"); ++ register float s16 asm ("s16"); ++ register float s31 asm ("s31"); ++ asm volatile ("// s0 out: %s0" : "=w" (s0)); ++ asm volatile ("// s0 in: %s0" :: "x" (s0)); ++ asm volatile ("// s7 out: %s0" : "=w" (s7)); ++ asm volatile ("// s7 in: %s0" :: "x" (s7)); ++ asm volatile ("// s8 out: %s0" : "=w" (s8)); ++ asm volatile ("// s8 in: %s0" :: "x" (s8)); ++ asm volatile ("// s15 out: %s0" : "=w" (s15)); ++ asm volatile ("// s15 in: %s0" :: "x" (s15)); ++ asm volatile ("// s16 out: %s0" : "=w" (s16)); ++ asm volatile ("// s16 in: %s0" :: "x" (s16)); ++ asm volatile ("// s31 out: %s0" : "=w" (s31)); ++ asm volatile ("// s31 in: %s0" :: "x" (s31)); ++} ++ ++/* { dg-final { scan-assembler {\t// s0 out: s0\n.*[/]/ s0 in: s0\n} } } */ ++/* { dg-final { scan-assembler {\t// s7 out: s7\n.*[/]/ s7 in: s7\n} } } */ ++/* { dg-final { scan-assembler {\t// s8 out: s8\n.*[/]/ s8 in: s8\n} } } */ ++/* { dg-final { scan-assembler {\t// s15 out: s15\n.*[/]/ s15 in: s15\n} } } */ ++/* { dg-final { scan-assembler {\t// s16 out: s16\n.*\tfmov\t(s[0-7]), s16\n.*[/]/ s16 in: \1\n} } } */ ++/* { dg-final { scan-assembler {\t// s31 out: s31\n.*\tfmov\t(s[0-7]), s31\n.*[/]/ s31 in: \1\n} } } */ ++/* { dg-final { scan-assembler-not {\t// s16 in: s16\n} } } */ ++/* { dg-final { scan-assembler-not {\t// s31 in: s31\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/asm-y-constraint-1.c b/gcc/testsuite/gcc.target/aarch64/asm-y-constraint-1.c +new file mode 100644 +index 000000000..4a3fcac56 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/asm-y-constraint-1.c +@@ -0,0 +1,36 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O" } */ ++ ++void ++f (void) ++{ ++ register float s0 asm ("s0"); ++ register float s7 asm ("s7"); ++ register float s8 asm ("s8"); ++ register float s15 asm ("s15"); ++ register float s16 asm ("s16"); ++ register float s31 asm ("s31"); ++ asm volatile ("// s0 out: %s0" : "=w" (s0)); ++ asm volatile ("// s0 in: %s0" :: "y" (s0)); ++ asm volatile ("// s7 out: %s0" : "=w" (s7)); ++ asm volatile ("// s7 in: %s0" :: "y" (s7)); ++ asm volatile ("// s8 out: %s0" : "=w" 
(s8)); ++ asm volatile ("// s8 in: %s0" :: "y" (s8)); ++ asm volatile ("// s15 out: %s0" : "=w" (s15)); ++ asm volatile ("// s15 in: %s0" :: "y" (s15)); ++ asm volatile ("// s16 out: %s0" : "=w" (s16)); ++ asm volatile ("// s16 in: %s0" :: "y" (s16)); ++ asm volatile ("// s31 out: %s0" : "=w" (s31)); ++ asm volatile ("// s31 in: %s0" :: "y" (s31)); ++} ++ ++/* { dg-final { scan-assembler {\t// s0 out: s0\n.*[/]/ s0 in: s0\n} } } */ ++/* { dg-final { scan-assembler {\t// s7 out: s7\n.*[/]/ s7 in: s7\n} } } */ ++/* { dg-final { scan-assembler {\t// s8 out: s8\n.*\tfmov\t(s[0-7]), s8\n.*[/]/ s8 in: \1\n} } } */ ++/* { dg-final { scan-assembler {\t// s15 out: s15\n.*\tfmov\t(s[0-7]), s15\n.*[/]/ s15 in: \1\n} } } */ ++/* { dg-final { scan-assembler {\t// s16 out: s16\n.*\tfmov\t(s[0-7]), s16\n.*[/]/ s16 in: \1\n} } } */ ++/* { dg-final { scan-assembler {\t// s31 out: s31\n.*\tfmov\t(s[0-7]), s31\n.*[/]/ s31 in: \1\n} } } */ ++/* { dg-final { scan-assembler-not {\t// s8 in: s8\n} } } */ ++/* { dg-final { scan-assembler-not {\t// s15 in: s15\n} } } */ ++/* { dg-final { scan-assembler-not {\t// s16 in: s16\n} } } */ ++/* { dg-final { scan-assembler-not {\t// s31 in: s31\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-comp-swap-release-acquire.c b/gcc/testsuite/gcc.target/aarch64/atomic-comp-swap-release-acquire.c +index 49ca5d0d0..a828a72aa 100644 +--- a/gcc/testsuite/gcc.target/aarch64/atomic-comp-swap-release-acquire.c ++++ b/gcc/testsuite/gcc.target/aarch64/atomic-comp-swap-release-acquire.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-march=armv8-a+nolse -O2 -fno-ipa-icf" } */ ++/* { dg-options "-march=armv8-a+nolse -O2 -fno-ipa-icf -mno-outline-atomics" } */ + + #include "atomic-comp-swap-release-acquire.x" + +diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-op-acq_rel.c b/gcc/testsuite/gcc.target/aarch64/atomic-op-acq_rel.c +index 74f26348e..6823ce381 100644 +--- a/gcc/testsuite/gcc.target/aarch64/atomic-op-acq_rel.c ++++ b/gcc/testsuite/gcc.target/aarch64/atomic-op-acq_rel.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-march=armv8-a+nolse -O2" } */ ++/* { dg-options "-march=armv8-a+nolse -O2 -mno-outline-atomics" } */ + + #include "atomic-op-acq_rel.x" + +diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-op-acquire.c b/gcc/testsuite/gcc.target/aarch64/atomic-op-acquire.c +index 66c1b1efe..87937de37 100644 +--- a/gcc/testsuite/gcc.target/aarch64/atomic-op-acquire.c ++++ b/gcc/testsuite/gcc.target/aarch64/atomic-op-acquire.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-march=armv8-a+nolse -O2" } */ ++/* { dg-options "-march=armv8-a+nolse -O2 -mno-outline-atomics" } */ + + #include "atomic-op-acquire.x" + +diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-op-char.c b/gcc/testsuite/gcc.target/aarch64/atomic-op-char.c +index c09d0434e..60955e57d 100644 +--- a/gcc/testsuite/gcc.target/aarch64/atomic-op-char.c ++++ b/gcc/testsuite/gcc.target/aarch64/atomic-op-char.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-march=armv8-a+nolse -O2" } */ ++/* { dg-options "-march=armv8-a+nolse -O2 -mno-outline-atomics" } */ + + #include "atomic-op-char.x" + +diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-op-consume.c b/gcc/testsuite/gcc.target/aarch64/atomic-op-consume.c +index 5783ab84f..16cb11aee 100644 +--- a/gcc/testsuite/gcc.target/aarch64/atomic-op-consume.c ++++ b/gcc/testsuite/gcc.target/aarch64/atomic-op-consume.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-march=armv8-a+nolse -O2" 
} */ ++/* { dg-options "-march=armv8-a+nolse -O2 -mno-outline-atomics" } */ + + #include "atomic-op-consume.x" + +diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-op-imm.c b/gcc/testsuite/gcc.target/aarch64/atomic-op-imm.c +index 18b8f0b04..bcab4e481 100644 +--- a/gcc/testsuite/gcc.target/aarch64/atomic-op-imm.c ++++ b/gcc/testsuite/gcc.target/aarch64/atomic-op-imm.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-march=armv8-a+nolse -O2" } */ ++/* { dg-options "-march=armv8-a+nolse -O2 -mno-outline-atomics" } */ + + int v = 0; + +diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-op-int.c b/gcc/testsuite/gcc.target/aarch64/atomic-op-int.c +index 8520f0839..040e4a8d1 100644 +--- a/gcc/testsuite/gcc.target/aarch64/atomic-op-int.c ++++ b/gcc/testsuite/gcc.target/aarch64/atomic-op-int.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-march=armv8-a+nolse -O2" } */ ++/* { dg-options "-march=armv8-a+nolse -O2 -mno-outline-atomics" } */ + + #include "atomic-op-int.x" + +diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-op-long.c b/gcc/testsuite/gcc.target/aarch64/atomic-op-long.c +index d011f8c5c..fc88b92cd 100644 +--- a/gcc/testsuite/gcc.target/aarch64/atomic-op-long.c ++++ b/gcc/testsuite/gcc.target/aarch64/atomic-op-long.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-march=armv8-a+nolse -O2" } */ ++/* { dg-options "-march=armv8-a+nolse -O2 -mno-outline-atomics" } */ + + long v = 0; + +diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-op-relaxed.c b/gcc/testsuite/gcc.target/aarch64/atomic-op-relaxed.c +index ed96bfdb9..503d62b02 100644 +--- a/gcc/testsuite/gcc.target/aarch64/atomic-op-relaxed.c ++++ b/gcc/testsuite/gcc.target/aarch64/atomic-op-relaxed.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-march=armv8-a+nolse -O2" } */ ++/* { dg-options "-march=armv8-a+nolse -O2 -mno-outline-atomics" } */ + + #include "atomic-op-relaxed.x" + +diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-op-release.c b/gcc/testsuite/gcc.target/aarch64/atomic-op-release.c +index fc4be17de..efe14aea7 100644 +--- a/gcc/testsuite/gcc.target/aarch64/atomic-op-release.c ++++ b/gcc/testsuite/gcc.target/aarch64/atomic-op-release.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-march=armv8-a+nolse -O2" } */ ++/* { dg-options "-march=armv8-a+nolse -O2 -mno-outline-atomics" } */ + + #include "atomic-op-release.x" + +diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-op-seq_cst.c b/gcc/testsuite/gcc.target/aarch64/atomic-op-seq_cst.c +index 613000fe4..09973bf82 100644 +--- a/gcc/testsuite/gcc.target/aarch64/atomic-op-seq_cst.c ++++ b/gcc/testsuite/gcc.target/aarch64/atomic-op-seq_cst.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-march=armv8-a+nolse -O2" } */ ++/* { dg-options "-march=armv8-a+nolse -O2 -mno-outline-atomics" } */ + + #include "atomic-op-seq_cst.x" + +diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-op-short.c b/gcc/testsuite/gcc.target/aarch64/atomic-op-short.c +index e82c8118e..e1dcebb0f 100644 +--- a/gcc/testsuite/gcc.target/aarch64/atomic-op-short.c ++++ b/gcc/testsuite/gcc.target/aarch64/atomic-op-short.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-march=armv8-a+nolse -O2" } */ ++/* { dg-options "-march=armv8-a+nolse -O2 -mno-outline-atomics" } */ + + #include "atomic-op-short.x" + +diff --git a/gcc/testsuite/gcc.target/aarch64/atomic_cmp_exchange_zero_reg_1.c b/gcc/testsuite/gcc.target/aarch64/atomic_cmp_exchange_zero_reg_1.c +index f2a21ddf2..29246979b 100644 +--- 
a/gcc/testsuite/gcc.target/aarch64/atomic_cmp_exchange_zero_reg_1.c ++++ b/gcc/testsuite/gcc.target/aarch64/atomic_cmp_exchange_zero_reg_1.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-O2 -march=armv8-a+nolse" } */ ++/* { dg-options "-O2 -march=armv8-a+nolse -mno-outline-atomics" } */ + /* { dg-skip-if "" { *-*-* } { "-mcpu=*" } { "" } } */ + + int +diff --git a/gcc/testsuite/gcc.target/aarch64/atomic_cmp_exchange_zero_strong_1.c b/gcc/testsuite/gcc.target/aarch64/atomic_cmp_exchange_zero_strong_1.c +index 8d2ae67df..6daf9b08f 100644 +--- a/gcc/testsuite/gcc.target/aarch64/atomic_cmp_exchange_zero_strong_1.c ++++ b/gcc/testsuite/gcc.target/aarch64/atomic_cmp_exchange_zero_strong_1.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-O2 -march=armv8-a+nolse" } */ ++/* { dg-options "-O2 -march=armv8-a+nolse -mno-outline-atomics" } */ + /* { dg-skip-if "" { *-*-* } { "-mcpu=*" } { "" } } */ + + int +diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_1.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_1.c +new file mode 100644 +index 000000000..ef4376649 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_1.c +@@ -0,0 +1,102 @@ ++/* { dg-do assemble { target { aarch64*-*-* } } } */ ++/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ ++/* { dg-add-options arm_v8_2a_bf16_neon } */ ++/* { dg-additional-options "-O3 --save-temps -std=gnu90" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#include ++ ++/* ++**stacktest1: ++** sub sp, sp, #16 ++** str h0, \[sp, 14\] ++** ldr h0, \[sp, 14\] ++** add sp, sp, 16 ++** ret ++*/ ++bfloat16_t stacktest1 (bfloat16_t __a) ++{ ++ volatile bfloat16_t b = __a; ++ return b; ++} ++ ++/* ++**bfloat_mov_ww: ++** mov v1.h\[0\], v2.h\[0\] ++** ret ++*/ ++void bfloat_mov_ww (void) ++{ ++ register bfloat16_t x asm ("h2"); ++ register bfloat16_t y asm ("h1"); ++ asm volatile ("" : "=w" (x)); ++ y = x; ++ asm volatile ("" :: "w" (y)); ++} ++ ++/* ++**bfloat_mov_rw: ++** dup v1.4h, w1 ++** ret ++*/ ++void bfloat_mov_rw (void) ++{ ++ register bfloat16_t x asm ("w1"); ++ register bfloat16_t y asm ("h1"); ++ asm volatile ("" : "=r" (x)); ++ y = x; ++ asm volatile ("" :: "w" (y)); ++} ++ ++/* ++**bfloat_mov_wr: ++** umov w1, v1.h\[0\] ++** ret ++*/ ++void bfloat_mov_wr (void) ++{ ++ register bfloat16_t x asm ("h1"); ++ register bfloat16_t y asm ("w1"); ++ asm volatile ("" : "=w" (x)); ++ y = x; ++ asm volatile ("" :: "r" (y)); ++} ++ ++/* ++**bfloat_mov_rr: ++** mov w1, w2 ++** ret ++*/ ++void bfloat_mov_rr (void) ++{ ++ register bfloat16_t x asm ("w2"); ++ register bfloat16_t y asm ("w1"); ++ asm volatile ("" : "=r" (x)); ++ y = x; ++ asm volatile ("" :: "r" (y)); ++} ++ ++/* ++**bfloat_mov_rm: ++** strh w2, \[x0\] ++** ret ++*/ ++void bfloat_mov_rm (bfloat16_t *ptr) ++{ ++ register bfloat16_t x asm ("w2"); ++ asm volatile ("" : "=r" (x)); ++ *ptr = x; ++} ++ ++/* ++**bfloat_mov_mr: ++** ldrh w2, \[x0\] ++** ret ++*/ ++void bfloat_mov_mr (bfloat16_t *ptr) ++{ ++ register bfloat16_t y asm ("w2"); ++ y = *ptr; ++ asm volatile ("" :: "r" (y)); ++} ++ +diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_2.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_2.c +new file mode 100644 +index 000000000..df8e7518c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_2.c +@@ -0,0 +1,106 @@ ++/* { dg-do assemble { target { aarch64*-*-* } } } */ ++/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ ++/* { dg-additional-options "-march=armv8.2-a -O3 
--save-temps -std=gnu90" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#include ++ ++#pragma GCC push_options ++#pragma GCC target ("+bf16") ++ ++/* ++**stacktest1: ++** sub sp, sp, #16 ++** str h0, \[sp, 14\] ++** ldr h0, \[sp, 14\] ++** add sp, sp, 16 ++** ret ++*/ ++bfloat16_t stacktest1 (bfloat16_t __a) ++{ ++ volatile bfloat16_t b = __a; ++ return b; ++} ++ ++/* ++**bfloat_mov_ww: ++** mov v1.h\[0\], v2.h\[0\] ++** ret ++*/ ++void bfloat_mov_ww (void) ++{ ++ register bfloat16_t x asm ("h2"); ++ register bfloat16_t y asm ("h1"); ++ asm volatile ("" : "=w" (x)); ++ y = x; ++ asm volatile ("" :: "w" (y)); ++} ++ ++/* ++**bfloat_mov_rw: ++** dup v1.4h, w1 ++** ret ++*/ ++void bfloat_mov_rw (void) ++{ ++ register bfloat16_t x asm ("w1"); ++ register bfloat16_t y asm ("h1"); ++ asm volatile ("" : "=r" (x)); ++ y = x; ++ asm volatile ("" :: "w" (y)); ++} ++ ++/* ++**bfloat_mov_wr: ++** umov w1, v1.h\[0\] ++** ret ++*/ ++void bfloat_mov_wr (void) ++{ ++ register bfloat16_t x asm ("h1"); ++ register bfloat16_t y asm ("w1"); ++ asm volatile ("" : "=w" (x)); ++ y = x; ++ asm volatile ("" :: "r" (y)); ++} ++ ++/* ++**bfloat_mov_rr: ++** mov w1, w2 ++** ret ++*/ ++void bfloat_mov_rr (void) ++{ ++ register bfloat16_t x asm ("w2"); ++ register bfloat16_t y asm ("w1"); ++ asm volatile ("" : "=r" (x)); ++ y = x; ++ asm volatile ("" :: "r" (y)); ++} ++ ++/* ++**bfloat_mov_rm: ++** strh w2, \[x0\] ++** ret ++*/ ++void bfloat_mov_rm (bfloat16_t *ptr) ++{ ++ register bfloat16_t x asm ("w2"); ++ asm volatile ("" : "=r" (x)); ++ *ptr = x; ++} ++ ++/* ++**bfloat_mov_mr: ++** ldrh w2, \[x0\] ++** ret ++*/ ++void bfloat_mov_mr (bfloat16_t *ptr) ++{ ++ register bfloat16_t y asm ("w2"); ++ y = *ptr; ++ asm volatile ("" :: "r" (y)); ++} ++ ++#pragma GCC pop_options ++ +diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_3.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_3.c +new file mode 100644 +index 000000000..5d7a4317c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_3.c +@@ -0,0 +1,101 @@ ++/* { dg-do assemble { target { aarch64*-*-* } } } */ ++/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ ++/* { dg-additional-options "-march=armv8.2-a -O3 --save-temps -std=gnu90" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#include ++ ++/* ++**stacktest1: ++** sub sp, sp, #16 ++** str h0, \[sp, 14\] ++** ldr h0, \[sp, 14\] ++** add sp, sp, 16 ++** ret ++*/ ++bfloat16_t stacktest1 (bfloat16_t __a) ++{ ++ volatile bfloat16_t b = __a; ++ return b; ++} ++ ++/* ++**bfloat_mov_ww: ++** mov v1.h\[0\], v2.h\[0\] ++** ret ++*/ ++void bfloat_mov_ww (void) ++{ ++ register bfloat16_t x asm ("h2"); ++ register bfloat16_t y asm ("h1"); ++ asm volatile ("" : "=w" (x)); ++ y = x; ++ asm volatile ("" :: "w" (y)); ++} ++ ++/* ++**bfloat_mov_rw: ++** dup v1.4h, w1 ++** ret ++*/ ++void bfloat_mov_rw (void) ++{ ++ register bfloat16_t x asm ("w1"); ++ register bfloat16_t y asm ("h1"); ++ asm volatile ("" : "=r" (x)); ++ y = x; ++ asm volatile ("" :: "w" (y)); ++} ++ ++/* ++**bfloat_mov_wr: ++** umov w1, v1.h\[0\] ++** ret ++*/ ++void bfloat_mov_wr (void) ++{ ++ register bfloat16_t x asm ("h1"); ++ register bfloat16_t y asm ("w1"); ++ asm volatile ("" : "=w" (x)); ++ y = x; ++ asm volatile ("" :: "r" (y)); ++} ++ ++/* ++**bfloat_mov_rr: ++** mov w1, w2 ++** ret ++*/ ++void bfloat_mov_rr (void) ++{ ++ register bfloat16_t x asm ("w2"); ++ register bfloat16_t y asm ("w1"); ++ asm volatile ("" : "=r" (x)); ++ y = x; ++ asm volatile ("" :: "r" (y)); ++} ++ ++/* 
++**bfloat_mov_rm: ++** strh w2, \[x0\] ++** ret ++*/ ++void bfloat_mov_rm (bfloat16_t *ptr) ++{ ++ register bfloat16_t x asm ("w2"); ++ asm volatile ("" : "=r" (x)); ++ *ptr = x; ++} ++ ++/* ++**bfloat_mov_mr: ++** ldrh w2, \[x0\] ++** ret ++*/ ++void bfloat_mov_mr (bfloat16_t *ptr) ++{ ++ register bfloat16_t y asm ("w2"); ++ y = *ptr; ++ asm volatile ("" :: "r" (y)); ++} ++ +diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_4.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_4.c +new file mode 100644 +index 000000000..b812011c2 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_4.c +@@ -0,0 +1,16 @@ ++/* { dg-do assemble { target { aarch64*-*-* } } } */ ++/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ ++/* { dg-add-options arm_v8_2a_bf16_neon } */ ++/* { dg-additional-options "-std=c99 -pedantic-errors -O3 --save-temps" } */ ++ ++#include ++ ++_Complex bfloat16_t stacktest1 (_Complex bfloat16_t __a) ++{ ++ volatile _Complex bfloat16_t b = __a; ++ return b; ++} ++ ++/* { dg-error {ISO C does not support plain 'complex' meaning 'double complex'} "" { target *-*-* } 8 } */ ++/* { dg-error {expected '=', ',', ';', 'asm' or '__attribute__' before 'stacktest1'} "" { target *-*-* } 8 } */ ++ +diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_typecheck.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_typecheck.c +new file mode 100644 +index 000000000..7c9188cf2 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_typecheck.c +@@ -0,0 +1,219 @@ ++/* { dg-do assemble { target { aarch64*-*-* } } } */ ++/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ ++/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ ++/* { dg-add-options arm_v8_2a_bf16_neon } */ ++/* { dg-additional-options "-Wno-pedantic -O3 --save-temps" } */ ++ ++#include ++ ++bfloat16_t glob_bfloat; ++ ++int is_an_int; ++short is_a_short_int; ++float is_a_float; ++float is_a_float16; ++double is_a_double; ++ ++float *float_ptr; ++ ++bfloat16_t foo1 (void) { return (bfloat16_t) 0x1234; } /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++bfloat16_t foo2 (void) { return (bfloat16_t) (short) 0x1234; } /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ ++bfloat16_t footest (bfloat16_t scalar0) ++{ ++ ++ /* Initialisation */ ++ ++ bfloat16_t scalar1_1; ++ bfloat16_t scalar1_2 = glob_bfloat; ++ bfloat16_t scalar1_3 = 0; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ bfloat16_t scalar1_4 = 0.1; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ bfloat16_t scalar1_5 = is_a_float; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ bfloat16_t scalar1_6 = is_an_int; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ bfloat16_t scalar1_7 = is_a_float16; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ bfloat16_t scalar1_8 = is_a_double; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ bfloat16_t scalar1_9 = is_a_short_int; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ ++ int initi_1_1 = glob_bfloat; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ ++ float initi_1_2 = glob_bfloat; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ ++ float16_t initi_1_3 = glob_bfloat; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ ++ short initi_1_4 = glob_bfloat; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ ++ double initi_1_5 = glob_bfloat; /* { dg-error {invalid conversion from 
type 'bfloat16_t'} } */ ++ ++ bfloat16_t scalar2_1 = {}; /* { dg-error {empty scalar initializer} } */ ++ bfloat16_t scalar2_2 = { glob_bfloat }; ++ bfloat16_t scalar2_3 = { 0 }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ bfloat16_t scalar2_4 = { 0.1 }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ bfloat16_t scalar2_5 = { is_a_float }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ bfloat16_t scalar2_6 = { is_an_int }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ bfloat16_t scalar2_7 = { is_a_float16 }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ bfloat16_t scalar2_8 = { is_a_double }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ bfloat16_t scalar2_9 = { is_a_short_int }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ ++ int initi_2_1 = { glob_bfloat }; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ ++ float initi_2_2 = { glob_bfloat }; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ ++ float16_t initi_2_3 = { glob_bfloat }; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ ++ short initi_2_4 = { glob_bfloat }; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ ++ double initi_2_5 = { glob_bfloat }; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ ++ ++ /* Assignments. */ ++ ++ glob_bfloat = glob_bfloat; ++ glob_bfloat = 0; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ glob_bfloat = 0.1; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ glob_bfloat = is_a_float; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ glob_bfloat = is_an_int; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ glob_bfloat = is_a_float16; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ glob_bfloat = is_a_double; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ glob_bfloat = is_a_short_int; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ ++ is_an_int = glob_bfloat; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ ++ is_a_float = glob_bfloat; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ ++ is_a_float16 = glob_bfloat; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ ++ is_a_double = glob_bfloat; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ ++ is_a_short_int = glob_bfloat; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ ++ ++ /* Casting. */ ++ ++ (void) glob_bfloat; ++ (bfloat16_t) glob_bfloat; ++ ++ (int) glob_bfloat; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ ++ (float) glob_bfloat; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ ++ (float16_t) glob_bfloat; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ ++ (double) glob_bfloat; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ ++ (short) glob_bfloat; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ ++ ++ (bfloat16_t) is_an_int; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ (bfloat16_t) is_a_float; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ (bfloat16_t) is_a_float16; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ (bfloat16_t) is_a_double; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ (bfloat16_t) is_a_short_int; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ ++ /* Compound literals. 
*/ ++ ++ (bfloat16_t) {}; /* { dg-error {empty scalar initializer} } */ ++ (bfloat16_t) { glob_bfloat }; ++ (bfloat16_t) { 0 }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ (bfloat16_t) { 0.1 }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ (bfloat16_t) { is_a_float }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ (bfloat16_t) { is_an_int }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ (bfloat16_t) { is_a_float16 }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ (bfloat16_t) { is_a_double }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ (bfloat16_t) { is_a_short_int }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ ++ (int) { glob_bfloat }; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ ++ (float) { glob_bfloat }; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ ++ (float16_t) { glob_bfloat }; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ ++ (double) { glob_bfloat }; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ ++ (short) { glob_bfloat }; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ ++ ++ /* Arrays and Structs. */ ++ ++ typedef bfloat16_t array_type[2]; ++ extern bfloat16_t extern_array[]; ++ ++ bfloat16_t array[2]; ++ bfloat16_t zero_length_array[0]; ++ bfloat16_t empty_init_array[] = {}; ++ typedef bfloat16_t some_other_type[is_an_int]; ++ ++ struct struct1 { ++ bfloat16_t a; ++ }; ++ ++ union union1 { ++ bfloat16_t a; ++ }; ++ ++ /* Addressing and dereferencing. */ ++ ++ bfloat16_t *bfloat_ptr = &scalar0; ++ scalar0 = *bfloat_ptr; ++ ++ /* Pointer assignment. */ ++ ++ bfloat16_t *bfloat_ptr2 = bfloat_ptr; ++ bfloat16_t *bfloat_ptr3 = array; ++ ++ /* Pointer arithmetic. */ ++ ++ ++bfloat_ptr; ++ --bfloat_ptr; ++ bfloat_ptr++; ++ bfloat_ptr--; ++ bfloat_ptr += 1; ++ bfloat_ptr -= 1; ++ bfloat_ptr - bfloat_ptr2; ++ bfloat_ptr = &bfloat_ptr3[0]; ++ bfloat_ptr = &bfloat_ptr3[1]; ++ ++ /* Simple comparison. */ ++ scalar0 > glob_bfloat; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ glob_bfloat == scalar0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ scalar0 > is_a_float; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ is_a_float == scalar0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ scalar0 > 0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ 0 == scalar0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ scalar0 > 0.1; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ 0.1 == scalar0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ scalar0 > is_an_int; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ is_an_int == scalar0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ ++ /* Pointer comparison. */ ++ ++ bfloat_ptr == &scalar0; ++ bfloat_ptr != &scalar0; ++ bfloat_ptr < &scalar0; ++ bfloat_ptr <= &scalar0; ++ bfloat_ptr > &scalar0; ++ bfloat_ptr >= &scalar0; ++ bfloat_ptr == bfloat_ptr2; ++ bfloat_ptr != bfloat_ptr2; ++ bfloat_ptr < bfloat_ptr2; ++ bfloat_ptr <= bfloat_ptr2; ++ bfloat_ptr > bfloat_ptr2; ++ bfloat_ptr >= bfloat_ptr2; ++ ++ /* Conditional expressions. */ ++ ++ 0 ? scalar0 : scalar0; ++ 0 ? scalar0 : is_a_float; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ ++ 0 ? 
is_a_float : scalar0; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ ++ 0 ? scalar0 : 0; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ 0 ? 0 : scalar0; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ 0 ? 0.1 : scalar0; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ ++ 0 ? scalar0 : 0.1; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ ++ 0 ? bfloat_ptr : bfloat_ptr2; ++ 0 ? bfloat_ptr : float_ptr; /* { dg-error {pointer type mismatch in conditional expression} } */ ++ 0 ? float_ptr : bfloat_ptr; /* { dg-error {pointer type mismatch in conditional expression} } */ ++ ++ scalar0 ? scalar0 : scalar0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ scalar0 ? is_a_float : scalar0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ scalar0 ? scalar0 : is_a_float; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ scalar0 ? is_a_float : is_a_float; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ ++ /* Unary operators. */ ++ ++ +scalar0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ -scalar0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ ~scalar0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ !scalar0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ *scalar0; /* { dg-error {invalid type argument of unary '\*'} } */ ++ __real scalar0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ __imag scalar0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ ++scalar0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ --scalar0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ scalar0++; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ scalar0--; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ ++ /* Binary arithmetic operations. 
*/ ++ ++ scalar0 = glob_bfloat + *bfloat_ptr; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ scalar0 = glob_bfloat + 0.1; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ scalar0 = glob_bfloat + 0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ scalar0 = glob_bfloat + is_a_float; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ ++ return scalar0; ++} ++ +diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_1.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_1.c +new file mode 100644 +index 000000000..6cad557eb +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_1.c +@@ -0,0 +1,93 @@ ++/* { dg-do assemble { target { aarch64*-*-* } } } */ ++/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ ++/* { dg-add-options arm_v8_2a_bf16_neon } */ ++/* { dg-additional-options "-O3 --save-temps" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#include ++ ++/* ++**stacktest1: ++** sub sp, sp, #16 ++** str h0, \[sp, 14\] ++** ldr h0, \[sp, 14\] ++** add sp, sp, 16 ++** ret ++*/ ++bfloat16_t stacktest1 (bfloat16_t __a) ++{ ++ volatile bfloat16_t b = __a; ++ return b; ++} ++ ++/* ++**stacktest2: ++** sub sp, sp, #16 ++** str d0, \[sp, 8\] ++** ldr d0, \[sp, 8\] ++** add sp, sp, 16 ++** ret ++*/ ++bfloat16x4_t stacktest2 (bfloat16x4_t __a) ++{ ++ volatile bfloat16x4_t b = __a; ++ return b; ++} ++ ++/* ++**stacktest3: ++** sub sp, sp, #16 ++** str q0, \[sp\] ++** ldr q0, \[sp\] ++** add sp, sp, 16 ++** ret ++*/ ++bfloat16x8_t stacktest3 (bfloat16x8_t __a) ++{ ++ volatile bfloat16x8_t b = __a; ++ return b; ++} ++ ++/* Test compilation of __attribute__ vectors of 8, 16, 32, etc. BFloats. */ ++typedef bfloat16_t v8bf __attribute__((vector_size(16))); ++typedef bfloat16_t v16bf __attribute__((vector_size(32))); ++typedef bfloat16_t v32bf __attribute__((vector_size(64))); ++typedef bfloat16_t v64bf __attribute__((vector_size(128))); ++typedef bfloat16_t v128bf __attribute__((vector_size(256))); ++ ++v8bf stacktest4 (v8bf __a) ++{ ++ volatile v8bf b = __a; ++ return b; ++} ++ ++v16bf stacktest5 (v16bf __a) ++{ ++ volatile v16bf b = __a; ++ return b; ++} ++ ++v32bf stacktest6 (v32bf __a) ++{ ++ volatile v32bf b = __a; ++ return b; ++} ++ ++v64bf stacktest7 (v64bf __a) ++{ ++ volatile v64bf b = __a; ++ return b; ++} ++ ++v128bf stacktest8 (v128bf __a) ++{ ++ volatile v128bf b = __a; ++ return b; ++} ++ ++/* Test use of constant values to assign values to vectors. 
*/ ++ ++typedef bfloat16_t v2bf __attribute__((vector_size(4))); ++v2bf c2 (void) { return (v2bf) 0x12345678; } ++ ++bfloat16x4_t c3 (void) { return (bfloat16x4_t) 0x1234567812345678; } +diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_2.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_2.c +new file mode 100644 +index 000000000..3891dcfc9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_2.c +@@ -0,0 +1,97 @@ ++/* { dg-do assemble { target { aarch64*-*-* } } } */ ++/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ ++/* { dg-additional-options "-march=armv8.2-a -O3 --save-temps" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#include ++ ++#pragma GCC push_options ++#pragma GCC target ("+bf16") ++ ++/* ++**stacktest1: ++** sub sp, sp, #16 ++** str h0, \[sp, 14\] ++** ldr h0, \[sp, 14\] ++** add sp, sp, 16 ++** ret ++*/ ++bfloat16_t stacktest1 (bfloat16_t __a) ++{ ++ volatile bfloat16_t b = __a; ++ return b; ++} ++ ++/* ++**stacktest2: ++** sub sp, sp, #16 ++** str d0, \[sp, 8\] ++** ldr d0, \[sp, 8\] ++** add sp, sp, 16 ++** ret ++*/ ++bfloat16x4_t stacktest2 (bfloat16x4_t __a) ++{ ++ volatile bfloat16x4_t b = __a; ++ return b; ++} ++ ++/* ++**stacktest3: ++** sub sp, sp, #16 ++** str q0, \[sp\] ++** ldr q0, \[sp\] ++** add sp, sp, 16 ++** ret ++*/ ++bfloat16x8_t stacktest3 (bfloat16x8_t __a) ++{ ++ volatile bfloat16x8_t b = __a; ++ return b; ++} ++ ++/* Test compilation of __attribute__ vectors of 8, 16, 32, etc. BFloats. */ ++typedef bfloat16_t v8bf __attribute__((vector_size(16))); ++typedef bfloat16_t v16bf __attribute__((vector_size(32))); ++typedef bfloat16_t v32bf __attribute__((vector_size(64))); ++typedef bfloat16_t v64bf __attribute__((vector_size(128))); ++typedef bfloat16_t v128bf __attribute__((vector_size(256))); ++ ++v8bf stacktest4 (v8bf __a) ++{ ++ volatile v8bf b = __a; ++ return b; ++} ++ ++v16bf stacktest5 (v16bf __a) ++{ ++ volatile v16bf b = __a; ++ return b; ++} ++ ++v32bf stacktest6 (v32bf __a) ++{ ++ volatile v32bf b = __a; ++ return b; ++} ++ ++v64bf stacktest7 (v64bf __a) ++{ ++ volatile v64bf b = __a; ++ return b; ++} ++ ++v128bf stacktest8 (v128bf __a) ++{ ++ volatile v128bf b = __a; ++ return b; ++} ++ ++/* Test use of constant values to assign values to vectors. 
*/ ++ ++typedef bfloat16_t v2bf __attribute__((vector_size(4))); ++v2bf c2 (void) { return (v2bf) 0x12345678; } ++ ++bfloat16x4_t c3 (void) { return (bfloat16x4_t) 0x1234567812345678; } ++ ++#pragma GCC pop_options +diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_3.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_3.c +new file mode 100644 +index 000000000..b35f5e527 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_3.c +@@ -0,0 +1,92 @@ ++/* { dg-do assemble { target { aarch64*-*-* } } } */ ++/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ ++/* { dg-additional-options "-march=armv8.2-a -O3 --save-temps" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#include ++ ++/* ++**stacktest1: ++** sub sp, sp, #16 ++** str h0, \[sp, 14\] ++** ldr h0, \[sp, 14\] ++** add sp, sp, 16 ++** ret ++*/ ++bfloat16_t stacktest1 (bfloat16_t __a) ++{ ++ volatile bfloat16_t b = __a; ++ return b; ++} ++ ++/* ++**stacktest2: ++** sub sp, sp, #16 ++** str d0, \[sp, 8\] ++** ldr d0, \[sp, 8\] ++** add sp, sp, 16 ++** ret ++*/ ++bfloat16x4_t stacktest2 (bfloat16x4_t __a) ++{ ++ volatile bfloat16x4_t b = __a; ++ return b; ++} ++ ++/* ++**stacktest3: ++** sub sp, sp, #16 ++** str q0, \[sp\] ++** ldr q0, \[sp\] ++** add sp, sp, 16 ++** ret ++*/ ++bfloat16x8_t stacktest3 (bfloat16x8_t __a) ++{ ++ volatile bfloat16x8_t b = __a; ++ return b; ++} ++ ++/* Test compilation of __attribute__ vectors of 8, 16, 32, etc. BFloats. */ ++typedef bfloat16_t v8bf __attribute__((vector_size(16))); ++typedef bfloat16_t v16bf __attribute__((vector_size(32))); ++typedef bfloat16_t v32bf __attribute__((vector_size(64))); ++typedef bfloat16_t v64bf __attribute__((vector_size(128))); ++typedef bfloat16_t v128bf __attribute__((vector_size(256))); ++ ++v8bf stacktest4 (v8bf __a) ++{ ++ volatile v8bf b = __a; ++ return b; ++} ++ ++v16bf stacktest5 (v16bf __a) ++{ ++ volatile v16bf b = __a; ++ return b; ++} ++ ++v32bf stacktest6 (v32bf __a) ++{ ++ volatile v32bf b = __a; ++ return b; ++} ++ ++v64bf stacktest7 (v64bf __a) ++{ ++ volatile v64bf b = __a; ++ return b; ++} ++ ++v128bf stacktest8 (v128bf __a) ++{ ++ volatile v128bf b = __a; ++ return b; ++} ++ ++/* Test use of constant values to assign values to vectors. */ ++ ++typedef bfloat16_t v2bf __attribute__((vector_size(4))); ++v2bf c2 (void) { return (v2bf) 0x12345678; } ++ ++bfloat16x4_t c3 (void) { return (bfloat16x4_t) 0x1234567812345678; } +diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_vector_typecheck_1.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_vector_typecheck_1.c +new file mode 100644 +index 000000000..4af3d295f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_vector_typecheck_1.c +@@ -0,0 +1,262 @@ ++/* { dg-do assemble { target { aarch64*-*-* } } } */ ++/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ ++/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ ++/* { dg-add-options arm_v8_2a_bf16_neon } */ ++/* { dg-additional-options "-O3 --save-temps -Wno-pedantic" } */ ++#include ++ ++bfloat16_t glob_bfloat; ++bfloat16x4_t glob_bfloat_vec; ++ ++float32x4_t is_a_float_vec; ++float32x2_t is_a_float_pair; ++ ++float16x4_t *float_ptr; ++float16x4_t is_a_float16_vec; ++ ++int32x4_t is_an_int_vec; ++int32x2_t is_an_int_pair; ++int16x4_t is_a_short_vec; ++ ++int is_an_int; ++short is_a_short_int; ++float is_a_float; ++float is_a_float16; ++double is_a_double; ++ ++/* Create a vector of 2 bfloat16_t. 
*/ ++typedef bfloat16_t v2bf __attribute__((vector_size(4))); ++v2bf foo1 (void) { return (v2bf) 0x12345678; } ++bfloat16x4_t foo2 (void) { return (bfloat16x4_t) 0x1234567812345678; } ++ ++bfloat16x4_t footest (bfloat16x4_t vector0) ++{ ++ /* Initialisation */ ++ ++ bfloat16x4_t vector1_1; ++ bfloat16x4_t vector1_2 = glob_bfloat_vec; ++ bfloat16x4_t vector1_3 = is_a_float_vec; /* { dg-error {incompatible types when initializing type 'bfloat16x4_t' using type 'float32x4_t'} } */ ++ bfloat16x4_t vector1_4 = is_an_int_vec; /* { dg-error {incompatible types when initializing type 'bfloat16x4_t' using type 'int32x4_t'} } */ ++ bfloat16x4_t vector1_5 = is_a_float16_vec; /* { dg-error {incompatible types when initializing type 'bfloat16x4_t' using type 'float16x4_t'} } */ ++ bfloat16x4_t vector1_6 = is_a_float_pair; /* { dg-error {incompatible types when initializing type 'bfloat16x4_t' using type 'float32x2_t'} } */ ++ bfloat16x4_t vector1_7 = is_an_int_pair; /* { dg-error {incompatible types when initializing type 'bfloat16x4_t' using type 'int32x2_t'} } */ ++ bfloat16x4_t vector1_8 = is_a_short_vec; /* { dg-error {incompatible types when initializing type 'bfloat16x4_t' using type 'int16x4_t'} } */ ++ ++ int32x4_t initi_1_1 = glob_bfloat_vec; /* { dg-error {incompatible types when initializing type 'int32x4_t' using type 'bfloat16x4_t'} } */ ++ float32x4_t initi_1_2 = glob_bfloat_vec; /* { dg-error {incompatible types when initializing type 'float32x4_t' using type 'bfloat16x4_t'} } */ ++ float16x4_t initi_1_3 = glob_bfloat_vec; /* { dg-error {incompatible types when initializing type 'float16x4_t' using type 'bfloat16x4_t'} } */ ++ float32x2_t initi_1_4 = glob_bfloat_vec; /* { dg-error {incompatible types when initializing type 'float32x2_t' using type 'bfloat16x4_t'} } */ ++ int32x2_t initi_1_5 = glob_bfloat_vec; /* { dg-error {incompatible types when initializing type 'int32x2_t' using type 'bfloat16x4_t'} } */ ++ int16x4_t initi_1_6 = glob_bfloat_vec; /* { dg-error {incompatible types when initializing type 'int16x4_t' using type 'bfloat16x4_t'} } */ ++ ++ bfloat16x4_t vector2_1 = {}; ++ bfloat16x4_t vector2_2 = { glob_bfloat }; ++ bfloat16x4_t vector2_3 = { glob_bfloat, glob_bfloat, glob_bfloat, glob_bfloat }; ++ bfloat16x4_t vector2_4 = { 0 }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ bfloat16x4_t vector2_5 = { 0.1 }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ bfloat16x4_t vector2_6 = { is_a_float16 }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ bfloat16x4_t vector2_7 = { is_a_float }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ bfloat16x4_t vector2_8 = { is_an_int }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ bfloat16x4_t vector2_9 = { is_a_short_int }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ bfloat16x4_t vector2_10 = { 0.0, 0, is_a_short_int, is_a_float }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ ++ int32x4_t initi_2_1 = { glob_bfloat }; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ ++ float32x4_t initi_2_2 = { glob_bfloat }; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ ++ float16x4_t initi_2_3 = { glob_bfloat }; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ ++ float32x2_t initi_2_4 = { glob_bfloat }; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ ++ int32x2_t initi_2_5 = { glob_bfloat }; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ ++ int16x4_t 
initi_2_6 = { glob_bfloat }; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ ++ ++ /* Assignments to/from vectors. */ ++ ++ glob_bfloat_vec = glob_bfloat_vec; ++ glob_bfloat_vec = 0; /* { dg-error {incompatible types when assigning to type 'bfloat16x4_t' from type 'int'} } */ ++ glob_bfloat_vec = 0.1; /* { dg-error {incompatible types when assigning to type 'bfloat16x4_t' from type 'double'} } */ ++ glob_bfloat_vec = is_a_float_vec; /* { dg-error {incompatible types when assigning to type 'bfloat16x4_t' from type 'float32x4_t'} } */ ++ glob_bfloat_vec = is_an_int_vec; /* { dg-error {incompatible types when assigning to type 'bfloat16x4_t' from type 'int32x4_t'} } */ ++ glob_bfloat_vec = is_a_float16_vec; /* { dg-error {incompatible types when assigning to type 'bfloat16x4_t' from type 'float16x4_t'} } */ ++ glob_bfloat_vec = is_a_float_pair; /* { dg-error {incompatible types when assigning to type 'bfloat16x4_t' from type 'float32x2_t'} } */ ++ glob_bfloat_vec = is_an_int_pair; /* { dg-error {incompatible types when assigning to type 'bfloat16x4_t' from type 'int32x2_t'} } */ ++ glob_bfloat_vec = is_a_short_vec; /* { dg-error {incompatible types when assigning to type 'bfloat16x4_t' from type 'int16x4_t'} } */ ++ ++ is_an_int_vec = glob_bfloat_vec; /* { dg-error {incompatible types when assigning to type 'int32x4_t' from type 'bfloat16x4_t'} } */ ++ is_a_float_vec = glob_bfloat_vec; /* { dg-error {incompatible types when assigning to type 'float32x4_t' from type 'bfloat16x4_t'} } */ ++ is_a_float16_vec = glob_bfloat_vec; /* { dg-error {incompatible types when assigning to type 'float16x4_t' from type 'bfloat16x4_t'} } */ ++ is_a_float_pair = glob_bfloat_vec; /* { dg-error {incompatible types when assigning to type 'float32x2_t' from type 'bfloat16x4_t'} } */ ++ is_an_int_pair = glob_bfloat_vec; /* { dg-error {incompatible types when assigning to type 'int32x2_t' from type 'bfloat16x4_t'} } */ ++ is_a_short_vec = glob_bfloat_vec;/* { dg-error {incompatible types when assigning to type 'int16x4_t' from type 'bfloat16x4_t'} } */ ++ ++ /* Assignments to/from elements. */ ++ ++ vector2_3[0] = glob_bfloat; ++ vector2_3[0] = is_an_int; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ vector2_3[0] = is_a_short_int; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ vector2_3[0] = is_a_float; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ vector2_3[0] = is_a_float16; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ vector2_3[0] = 0; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ vector2_3[0] = 0.1; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ ++ glob_bfloat = vector2_3[0]; ++ is_an_int = vector2_3[0]; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ ++ is_a_short_int = vector2_3[0]; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ ++ is_a_float = vector2_3[0]; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ ++ is_a_float16 = vector2_3[0]; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ ++ ++ /* Compound literals. 
*/ ++ ++ (bfloat16x4_t) {}; ++ ++ (bfloat16x4_t) { 0 }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ (bfloat16x4_t) { 0.1 }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ (bfloat16x4_t) { is_a_float_vec }; /* { dg-error {incompatible types when initializing type '__bf16' using type 'float32x4_t'} } */ ++ (bfloat16x4_t) { is_an_int_vec }; /* { dg-error {incompatible types when initializing type '__bf16' using type 'int32x4_t'} } */ ++ (bfloat16x4_t) { is_a_float_pair }; /* { dg-error {incompatible types when initializing type '__bf16' using type 'float32x2_t'} } */ ++ (bfloat16x4_t) { is_an_int_pair }; /* { dg-error {incompatible types when initializing type '__bf16' using type 'int32x2_t'} } */ ++ (bfloat16x4_t) { is_a_float16_vec }; /* { dg-error {incompatible types when initializing type '__bf16' using type 'float16x4_t'} } */ ++ (bfloat16x4_t) { is_a_short_vec }; /* { dg-error {incompatible types when initializing type '__bf16' using type 'int16x4_t'} } */ ++ ++ (bfloat16x4_t) { glob_bfloat_vec }; /* { dg-error {incompatible types when initializing type '__bf16' using type 'bfloat16x4_t'} } */ ++ (int32x4_t) { glob_bfloat_vec }; /* { dg-error {incompatible types when initializing type 'int' using type 'bfloat16x4_t'} } */ ++ (float32x4_t) { glob_bfloat_vec }; /* { dg-error {incompatible types when initializing type 'float' using type 'bfloat16x4_t'} } */ ++ (int32x2_t) { glob_bfloat_vec }; /* { dg-error {incompatible types when initializing type 'int' using type 'bfloat16x4_t'} } */ ++ (float16x4_t) { glob_bfloat_vec }; /* { dg-error {incompatible types when initializing type '__fp16' using type 'bfloat16x4_t'} } */ ++ (int16x4_t) { glob_bfloat_vec }; /* { dg-error {incompatible types when initializing type 'short int' using type 'bfloat16x4_t'} } */ ++ ++ /* Casting. 
*/ ++ ++ (void) glob_bfloat_vec; ++ (bfloat16x4_t) glob_bfloat_vec; ++ ++ (bfloat16_t) glob_bfloat_vec; /* { dg-error {aggregate value used where a float was expected} } */ ++ (short) glob_bfloat_vec; /* { dg-error {can't convert a vector of type 'bfloat16x4_t' to type 'short int' which has different size} } */ ++ (int) glob_bfloat_vec; /* { dg-error {can't convert a vector of type 'bfloat16x4_t' to type 'int' which has different size} } */ ++ (float16_t) glob_bfloat_vec; /* { dg-error {aggregate value used where a float was expected} } */ ++ (float) glob_bfloat_vec; /* { dg-error {aggregate value used where a float was expected} } */ ++ (double) glob_bfloat_vec; /* { dg-error {aggregate value used where a float was expected} } */ ++ ++ (int32x4_t) glob_bfloat_vec; /* { dg-error {can't convert a value of type 'bfloat16x4_t' to vector type '__Int32x4_t' which has different size} } */ ++ (float32x4_t) glob_bfloat_vec; /* { dg-error {can't convert a value of type 'bfloat16x4_t' to vector type '__Float32x4_t' which has different size} } */ ++ (float16x4_t) glob_bfloat_vec; ++ (int32x2_t) glob_bfloat_vec; ++ (float32x2_t) glob_bfloat_vec; ++ (int16x4_t) glob_bfloat_vec; ++ ++ (bfloat16x4_t) is_an_int_vec; /* { dg-error {can't convert a value of type 'int32x4_t' to vector type '__Bfloat16x4_t' which has different size} } */ ++ (bfloat16x4_t) is_a_float_vec; /* { dg-error {can't convert a value of type 'float32x4_t' to vector type '__Bfloat16x4_t' which has different size} } */ ++ (bfloat16x4_t) is_a_float16_vec; ++ (bfloat16x4_t) is_an_int_pair; ++ (bfloat16x4_t) is_a_float_pair; ++ (bfloat16x4_t) is_a_short_vec; ++ (bfloat16x4_t) is_a_double; /* { dg-error {can't convert value to a vector} } */ ++ ++ /* Arrays and Structs. */ ++ ++ typedef bfloat16x4_t array_type[2]; ++ extern bfloat16x4_t extern_array[]; ++ ++ bfloat16x4_t array[2]; ++ bfloat16x4_t zero_length_array[0]; ++ bfloat16x4_t empty_init_array[] = {}; ++ typedef bfloat16x4_t some_other_type[is_an_int]; ++ ++ struct struct1 { ++ bfloat16x4_t a; ++ }; ++ ++ union union1 { ++ bfloat16x4_t a; ++ }; ++ ++ /* Addressing and dereferencing. */ ++ ++ bfloat16x4_t *bfloat_ptr = &vector0; ++ vector0 = *bfloat_ptr; ++ ++ /* Pointer assignment. */ ++ ++ bfloat16x4_t *bfloat_ptr2 = bfloat_ptr; ++ bfloat16x4_t *bfloat_ptr3 = array; ++ ++ /* Pointer arithmetic. */ ++ ++ ++bfloat_ptr; ++ --bfloat_ptr; ++ bfloat_ptr++; ++ bfloat_ptr--; ++ bfloat_ptr += 1; ++ bfloat_ptr -= 1; ++ bfloat_ptr - bfloat_ptr2; ++ bfloat_ptr = &bfloat_ptr3[0]; ++ bfloat_ptr = &bfloat_ptr3[1]; ++ ++ /* Simple comparison. */ ++ vector0 > glob_bfloat_vec; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ glob_bfloat_vec == vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ vector0 > is_a_float_vec; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ is_a_float_vec == vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ vector0 > 0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ 0 == vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ vector0 > 0.1; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ 0.1 == vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ vector0 > is_an_int_vec; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ is_an_int_vec == vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ ++ /* Pointer comparison. 
*/ ++ ++ bfloat_ptr == &vector0; ++ bfloat_ptr != &vector0; ++ bfloat_ptr < &vector0; ++ bfloat_ptr <= &vector0; ++ bfloat_ptr > &vector0; ++ bfloat_ptr >= &vector0; ++ bfloat_ptr == bfloat_ptr2; ++ bfloat_ptr != bfloat_ptr2; ++ bfloat_ptr < bfloat_ptr2; ++ bfloat_ptr <= bfloat_ptr2; ++ bfloat_ptr > bfloat_ptr2; ++ bfloat_ptr >= bfloat_ptr2; ++ ++ /* Conditional expressions. */ ++ ++ 0 ? vector0 : vector0; ++ 0 ? vector0 : is_a_float_vec; /* { dg-error {type mismatch in conditional expression} } */ ++ 0 ? is_a_float_vec : vector0; /* { dg-error {type mismatch in conditional expression} } */ ++ 0 ? vector0 : is_a_float16_vec; /* { dg-error {type mismatch in conditional expression} } */ ++ 0 ? is_a_float16_vec : vector0; /* { dg-error {type mismatch in conditional expression} } */ ++ 0 ? vector0 : 0; /* { dg-error {type mismatch in conditional expression} } */ ++ 0 ? 0 : vector0; /* { dg-error {type mismatch in conditional expression} } */ ++ 0 ? 0.1 : vector0; /* { dg-error {type mismatch in conditional expression} } */ ++ 0 ? vector0 : 0.1; /* { dg-error {type mismatch in conditional expression} } */ ++ 0 ? bfloat_ptr : bfloat_ptr2; ++ 0 ? bfloat_ptr : float_ptr; /* { dg-error {pointer type mismatch in conditional expression} } */ ++ 0 ? float_ptr : bfloat_ptr; /* { dg-error {pointer type mismatch in conditional expression} } */ ++ ++ vector0 ? vector0 : vector0; /* { dg-error {used vector type where scalar is required} } */ ++ vector0 ? is_a_float16_vec : vector0; /* { dg-error {used vector type where scalar is required} } */ ++ vector0 ? vector0 : is_a_float16_vec; /* { dg-error {used vector type where scalar is required} } */ ++ vector0 ? is_a_float16_vec : is_a_float16_vec; /* { dg-error {used vector type where scalar is required} } */ ++ ++ /* Unary operators. */ ++ ++ +vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ -vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ ~vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ !vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ *vector0; /* { dg-error {invalid type argument of unary '\*'} } */ ++ __real vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ __imag vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ ++vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ --vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ vector0++; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ vector0--; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ ++ /* Binary arithmetic operations. 
*/ ++ ++ vector0 = glob_bfloat_vec + *bfloat_ptr; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ vector0 = glob_bfloat_vec + 0.1; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ vector0 = glob_bfloat_vec + 0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ vector0 = glob_bfloat_vec + is_a_float_vec; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ ++ return vector0; ++} ++ +diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_vector_typecheck_2.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_vector_typecheck_2.c +new file mode 100644 +index 000000000..99c499ce8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_vector_typecheck_2.c +@@ -0,0 +1,260 @@ ++/* { dg-do assemble { target { aarch64*-*-* } } } */ ++/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ ++/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ ++/* { dg-add-options arm_v8_2a_bf16_neon } */ ++/* { dg-additional-options "-O3 --save-temps -Wno-pedantic" } */ ++#include ++ ++bfloat16_t glob_bfloat; ++bfloat16x8_t glob_bfloat_vec; ++ ++float32x4_t is_a_float_vec; ++float64x2_t is_a_double_pair; ++ ++float16x8_t *float_ptr; ++float16x8_t is_a_float16_vec; ++ ++int32x4_t is_an_int_vec; ++int64x2_t is_a_long_int_pair; ++int16x8_t is_a_short_vec; ++ ++int is_an_int; ++short is_a_short_int; ++float is_a_float; ++float is_a_float16; ++double is_a_double; ++ ++bfloat16x8_t foo3 (void) { return (bfloat16x8_t) 0x12345678123456781234567812345678; } ++ /* { dg-error {integer constant is too large for its type} "" {target *-*-*} 27 } */ ++ /* { dg-error {can't convert a value of type 'long int' to vector type '__Bfloat16x8_t' which has different size} "" {target *-*-*} 27 } */ ++ ++bfloat16x8_t footest (bfloat16x8_t vector0) ++{ ++ /* Initialisation */ ++ ++ bfloat16x8_t vector1_1; ++ bfloat16x8_t vector1_2 = glob_bfloat_vec; ++ bfloat16x8_t vector1_3 = is_a_float_vec; /* { dg-error {incompatible types when initializing type 'bfloat16x8_t' using type 'float32x4_t'} } */ ++ bfloat16x8_t vector1_4 = is_an_int_vec; /* { dg-error {incompatible types when initializing type 'bfloat16x8_t' using type 'int32x4_t'} } */ ++ bfloat16x8_t vector1_5 = is_a_float16_vec; /* { dg-error {incompatible types when initializing type 'bfloat16x8_t' using type 'float16x8_t'} } */ ++ bfloat16x8_t vector1_6 = is_a_double_pair; /* { dg-error {incompatible types when initializing type 'bfloat16x8_t' using type 'float64x2_t'} } */ ++ bfloat16x8_t vector1_7 = is_a_long_int_pair; /* { dg-error {incompatible types when initializing type 'bfloat16x8_t' using type 'int64x2_t'} } */ ++ bfloat16x8_t vector1_8 = is_a_short_vec; /* { dg-error {incompatible types when initializing type 'bfloat16x8_t' using type 'int16x8_t'} } */ ++ ++ int32x4_t initi_1_1 = glob_bfloat_vec; /* { dg-error {incompatible types when initializing type 'int32x4_t' using type 'bfloat16x8_t'} } */ ++ float32x4_t initi_1_2 = glob_bfloat_vec; /* { dg-error {incompatible types when initializing type 'float32x4_t' using type 'bfloat16x8_t'} } */ ++ float16x8_t initi_1_3 = glob_bfloat_vec; /* { dg-error {incompatible types when initializing type 'float16x8_t' using type 'bfloat16x8_t'} } */ ++ float64x2_t initi_1_4 = glob_bfloat_vec; /* { dg-error {incompatible types when initializing type 'float64x2_t' using type 'bfloat16x8_t'} } */ ++ int64x2_t initi_1_5 = glob_bfloat_vec; /* { dg-error {incompatible types when initializing type 'int64x2_t' using type 'bfloat16x8_t'} } */ ++ int16x8_t 
initi_1_6 = glob_bfloat_vec; /* { dg-error {incompatible types when initializing type 'int16x8_t' using type 'bfloat16x8_t'} } */ ++ ++ bfloat16x8_t vector2_1 = {}; ++ bfloat16x8_t vector2_2 = { glob_bfloat }; ++ bfloat16x8_t vector2_3 = { glob_bfloat, glob_bfloat, glob_bfloat, glob_bfloat }; ++ bfloat16x8_t vector2_4 = { 0 }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ bfloat16x8_t vector2_5 = { 0.1 }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ bfloat16x8_t vector2_6 = { is_a_float16 }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ bfloat16x8_t vector2_7 = { is_a_float }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ bfloat16x8_t vector2_8 = { is_an_int }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ bfloat16x8_t vector2_9 = { is_a_short_int }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ bfloat16x8_t vector2_10 = { 0.0, 0, is_a_short_int, is_a_float }; /* { dg-error "invalid conversion to type 'bfloat16_t'" } */ ++ ++ int32x4_t initi_2_1 = { glob_bfloat }; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ ++ float32x4_t initi_2_2 = { glob_bfloat }; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ ++ float16x8_t initi_2_3 = { glob_bfloat }; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ ++ float64x2_t initi_2_4 = { glob_bfloat }; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ ++ int64x2_t initi_2_5 = { glob_bfloat }; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ ++ int16x8_t initi_2_6 = { glob_bfloat }; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ ++ ++ /* Assignments to/from vectors. */ ++ ++ glob_bfloat_vec = glob_bfloat_vec; ++ glob_bfloat_vec = 0; /* { dg-error {incompatible types when assigning to type 'bfloat16x8_t' from type 'int'} } */ ++ glob_bfloat_vec = 0.1; /* { dg-error {incompatible types when assigning to type 'bfloat16x8_t' from type 'double'} } */ ++ glob_bfloat_vec = is_a_float_vec; /* { dg-error {incompatible types when assigning to type 'bfloat16x8_t' from type 'float32x4_t'} } */ ++ glob_bfloat_vec = is_an_int_vec; /* { dg-error {incompatible types when assigning to type 'bfloat16x8_t' from type 'int32x4_t'} } */ ++ glob_bfloat_vec = is_a_float16_vec; /* { dg-error {incompatible types when assigning to type 'bfloat16x8_t' from type 'float16x8_t'} } */ ++ glob_bfloat_vec = is_a_double_pair; /* { dg-error {incompatible types when assigning to type 'bfloat16x8_t' from type 'float64x2_t'} } */ ++ glob_bfloat_vec = is_a_long_int_pair; /* { dg-error {incompatible types when assigning to type 'bfloat16x8_t' from type 'int64x2_t'} } */ ++ glob_bfloat_vec = is_a_short_vec; /* { dg-error {incompatible types when assigning to type 'bfloat16x8_t' from type 'int16x8_t'} } */ ++ ++ is_an_int_vec = glob_bfloat_vec; /* { dg-error {incompatible types when assigning to type 'int32x4_t' from type 'bfloat16x8_t'} } */ ++ is_a_float_vec = glob_bfloat_vec; /* { dg-error {incompatible types when assigning to type 'float32x4_t' from type 'bfloat16x8_t'} } */ ++ is_a_float16_vec = glob_bfloat_vec; /* { dg-error {incompatible types when assigning to type 'float16x8_t' from type 'bfloat16x8_t'} } */ ++ is_a_double_pair = glob_bfloat_vec; /* { dg-error {incompatible types when assigning to type 'float64x2_t' from type 'bfloat16x8_t'} } */ ++ is_a_long_int_pair = glob_bfloat_vec; /* { dg-error {incompatible types when assigning to type 'int64x2_t' from type 'bfloat16x8_t'} } */ ++ 
is_a_short_vec = glob_bfloat_vec;/* { dg-error {incompatible types when assigning to type 'int16x8_t' from type 'bfloat16x8_t'} } */ ++ ++ /* Assignments to/from elements. */ ++ ++ vector2_3[0] = glob_bfloat; ++ vector2_3[0] = is_an_int; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ vector2_3[0] = is_a_short_int; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ vector2_3[0] = is_a_float; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ vector2_3[0] = is_a_float16; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ vector2_3[0] = 0; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ vector2_3[0] = 0.1; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ ++ glob_bfloat = vector2_3[0]; ++ is_an_int = vector2_3[0]; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ ++ is_a_short_int = vector2_3[0]; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ ++ is_a_float = vector2_3[0]; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ ++ is_a_float16 = vector2_3[0]; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ ++ ++ /* Compound literals. */ ++ ++ (bfloat16x8_t) {}; ++ ++ (bfloat16x8_t) { 0 }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ (bfloat16x8_t) { 0.1 }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ (bfloat16x8_t) { is_a_float_vec }; /* { dg-error {incompatible types when initializing type '__bf16' using type 'float32x4_t'} } */ ++ (bfloat16x8_t) { is_an_int_vec }; /* { dg-error {incompatible types when initializing type '__bf16' using type 'int32x4_t'} } */ ++ (bfloat16x8_t) { is_a_double_pair }; /* { dg-error {incompatible types when initializing type '__bf16' using type 'float64x2_t'} } */ ++ (bfloat16x8_t) { is_a_long_int_pair }; /* { dg-error {incompatible types when initializing type '__bf16' using type 'int64x2_t'} } */ ++ (bfloat16x8_t) { is_a_float16_vec }; /* { dg-error {incompatible types when initializing type '__bf16' using type 'float16x8_t'} } */ ++ (bfloat16x8_t) { is_a_short_vec }; /* { dg-error {incompatible types when initializing type '__bf16' using type 'int16x8_t'} } */ ++ ++ (bfloat16x8_t) { glob_bfloat_vec }; /* { dg-error {incompatible types when initializing type '__bf16' using type 'bfloat16x8_t'} } */ ++ (int32x4_t) { glob_bfloat_vec }; /* { dg-error {incompatible types when initializing type 'int' using type 'bfloat16x8_t'} } */ ++ (float32x4_t) { glob_bfloat_vec }; /* { dg-error {incompatible types when initializing type 'float' using type 'bfloat16x8_t'} } */ ++ (int64x2_t) { glob_bfloat_vec }; /* { dg-error {incompatible types when initializing type 'long int' using type 'bfloat16x8_t'} } */ ++ (float16x8_t) { glob_bfloat_vec }; /* { dg-error {incompatible types when initializing type '__fp16' using type 'bfloat16x8_t'} } */ ++ (int16x8_t) { glob_bfloat_vec }; /* { dg-error {incompatible types when initializing type 'short int' using type 'bfloat16x8_t'} } */ ++ ++ /* Casting. 
*/ ++ ++ (void) glob_bfloat_vec; ++ (bfloat16x8_t) glob_bfloat_vec; ++ ++ (bfloat16_t) glob_bfloat_vec; /* { dg-error {aggregate value used where a float was expected} } */ ++ (short) glob_bfloat_vec; /* { dg-error {can't convert a vector of type 'bfloat16x8_t' to type 'short int' which has different size} } */ ++ (int) glob_bfloat_vec; /* { dg-error {can't convert a vector of type 'bfloat16x8_t' to type 'int' which has different size} } */ ++ (float16_t) glob_bfloat_vec; /* { dg-error {aggregate value used where a float was expected} } */ ++ (float) glob_bfloat_vec; /* { dg-error {aggregate value used where a float was expected} } */ ++ (double) glob_bfloat_vec; /* { dg-error {aggregate value used where a float was expected} } */ ++ ++ (int32x4_t) glob_bfloat_vec; ++ (float32x4_t) glob_bfloat_vec; ++ (float16x8_t) glob_bfloat_vec; ++ (int64x2_t) glob_bfloat_vec; ++ (float64x2_t) glob_bfloat_vec; ++ (int16x8_t) glob_bfloat_vec; ++ ++ (bfloat16x8_t) is_an_int_vec; ++ (bfloat16x8_t) is_a_float_vec; ++ (bfloat16x8_t) is_a_float16_vec; ++ (bfloat16x8_t) is_a_long_int_pair; ++ (bfloat16x8_t) is_a_double_pair; ++ (bfloat16x8_t) is_a_short_vec; ++ ++ /* Arrays and Structs. */ ++ ++ typedef bfloat16x8_t array_type[2]; ++ extern bfloat16x8_t extern_array[]; ++ ++ bfloat16x8_t array[2]; ++ bfloat16x8_t zero_length_array[0]; ++ bfloat16x8_t empty_init_array[] = {}; ++ typedef bfloat16x8_t some_other_type[is_an_int]; ++ ++ struct struct1 { ++ bfloat16x8_t a; ++ }; ++ ++ union union1 { ++ bfloat16x8_t a; ++ }; ++ ++ /* Addressing and dereferencing. */ ++ ++ bfloat16x8_t *bfloat_ptr = &vector0; ++ vector0 = *bfloat_ptr; ++ ++ /* Pointer assignment. */ ++ ++ bfloat16x8_t *bfloat_ptr2 = bfloat_ptr; ++ bfloat16x8_t *bfloat_ptr3 = array; ++ ++ /* Pointer arithmetic. */ ++ ++ ++bfloat_ptr; ++ --bfloat_ptr; ++ bfloat_ptr++; ++ bfloat_ptr--; ++ bfloat_ptr += 1; ++ bfloat_ptr -= 1; ++ bfloat_ptr - bfloat_ptr2; ++ bfloat_ptr = &bfloat_ptr3[0]; ++ bfloat_ptr = &bfloat_ptr3[1]; ++ ++ /* Simple comparison. */ ++ vector0 > glob_bfloat_vec; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ glob_bfloat_vec == vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ vector0 > is_a_float_vec; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ is_a_float_vec == vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ vector0 > 0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ 0 == vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ vector0 > 0.1; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ 0.1 == vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ vector0 > is_an_int_vec; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ is_an_int_vec == vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ ++ /* Pointer comparison. */ ++ ++ bfloat_ptr == &vector0; ++ bfloat_ptr != &vector0; ++ bfloat_ptr < &vector0; ++ bfloat_ptr <= &vector0; ++ bfloat_ptr > &vector0; ++ bfloat_ptr >= &vector0; ++ bfloat_ptr == bfloat_ptr2; ++ bfloat_ptr != bfloat_ptr2; ++ bfloat_ptr < bfloat_ptr2; ++ bfloat_ptr <= bfloat_ptr2; ++ bfloat_ptr > bfloat_ptr2; ++ bfloat_ptr >= bfloat_ptr2; ++ ++ /* Conditional expressions. */ ++ ++ 0 ? vector0 : vector0; ++ 0 ? vector0 : is_a_float_vec; /* { dg-error {type mismatch in conditional expression} } */ ++ 0 ? 
is_a_float_vec : vector0; /* { dg-error {type mismatch in conditional expression} } */ ++ 0 ? vector0 : is_a_float16_vec; /* { dg-error {type mismatch in conditional expression} } */ ++ 0 ? is_a_float16_vec : vector0; /* { dg-error {type mismatch in conditional expression} } */ ++ 0 ? vector0 : 0; /* { dg-error {type mismatch in conditional expression} } */ ++ 0 ? 0 : vector0; /* { dg-error {type mismatch in conditional expression} } */ ++ 0 ? 0.1 : vector0; /* { dg-error {type mismatch in conditional expression} } */ ++ 0 ? vector0 : 0.1; /* { dg-error {type mismatch in conditional expression} } */ ++ 0 ? bfloat_ptr : bfloat_ptr2; ++ 0 ? bfloat_ptr : float_ptr; /* { dg-error {pointer type mismatch in conditional expression} } */ ++ 0 ? float_ptr : bfloat_ptr; /* { dg-error {pointer type mismatch in conditional expression} } */ ++ ++ vector0 ? vector0 : vector0; /* { dg-error {used vector type where scalar is required} } */ ++ vector0 ? is_a_float16_vec : vector0; /* { dg-error {used vector type where scalar is required} } */ ++ vector0 ? vector0 : is_a_float16_vec; /* { dg-error {used vector type where scalar is required} } */ ++ vector0 ? is_a_float16_vec : is_a_float16_vec; /* { dg-error {used vector type where scalar is required} } */ ++ ++ /* Unary operators. */ ++ ++ +vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ -vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ ~vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ !vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ *vector0; /* { dg-error {invalid type argument of unary '\*'} } */ ++ __real vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ __imag vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ ++vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ --vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ vector0++; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ vector0--; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ ++ /* Binary arithmetic operations. 
*/ ++ ++ vector0 = glob_bfloat_vec + *bfloat_ptr; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ vector0 = glob_bfloat_vec + 0.1; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ vector0 = glob_bfloat_vec + 0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ vector0 = glob_bfloat_vec + is_a_float_vec; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ ++ ++ return vector0; ++} ++ +diff --git a/gcc/testsuite/gcc.target/aarch64/crypto-fuse-2.c b/gcc/testsuite/gcc.target/aarch64/crypto-fuse-2.c +deleted file mode 100644 +index b12df2d3e..000000000 +--- a/gcc/testsuite/gcc.target/aarch64/crypto-fuse-2.c ++++ /dev/null +@@ -1,45 +0,0 @@ +-/* { dg-do compile } */ +-/* { dg-options "-O3 -mcpu=cortex-a72+crypto -dp" } */ +- +-#include +- +-#define AESE(r, v, key) (r = vaesdq_u8 ((v), (key))); +-#define AESMC(r, i) (r = vaesimcq_u8 (i)) +- +-uint8x16_t dummy; +-uint8x16_t a; +-uint8x16_t b; +-uint8x16_t c; +-uint8x16_t d; +-uint8x16_t e; +- +-void +-foo (void) +-{ +- AESE (a, a, e); +- dummy = vaddq_u8 (dummy, dummy); +- dummy = vaddq_u8 (dummy, dummy); +- AESE (b, b, e); +- dummy = vaddq_u8 (dummy, dummy); +- dummy = vaddq_u8 (dummy, dummy); +- AESE (c, c, e); +- dummy = vaddq_u8 (dummy, dummy); +- dummy = vaddq_u8 (dummy, dummy); +- AESE (d, d, e); +- dummy = vaddq_u8 (dummy, dummy); +- dummy = vaddq_u8 (dummy, dummy); +- +- AESMC (a, a); +- dummy = vaddq_u8 (dummy, dummy); +- dummy = vaddq_u8 (dummy, dummy); +- AESMC (b, b); +- dummy = vaddq_u8 (dummy, dummy); +- dummy = vaddq_u8 (dummy, dummy); +- AESMC (c, c); +- dummy = vaddq_u8 (dummy, dummy); +- dummy = vaddq_u8 (dummy, dummy); +- AESMC (d, d); +-} +- +-/* { dg-final { scan-assembler-times "crypto_aesd_fused" 4 } } */ +- +diff --git a/gcc/testsuite/gcc.target/aarch64/diag_aka_1.c b/gcc/testsuite/gcc.target/aarch64/diag_aka_1.c +new file mode 100644 +index 000000000..59e24f48b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/diag_aka_1.c +@@ -0,0 +1,14 @@ ++#include ++ ++typedef int16x4_t myvec; ++ ++void f (float x) ++{ ++ __Int8x8_t y1 = x; /* { dg-error {incompatible types when initializing type '__Int8x8_t' using type 'float'} } */ ++ __Int8x8_t *ptr1 = &x; /* { dg-error {initialization of '__Int8x8_t \*' from incompatible pointer type 'float \*'} } */ ++ int8x8_t y2 = x; /* { dg-error {incompatible types when initializing type 'int8x8_t' using type 'float'} } */ ++ int8x8_t *ptr2 = &x; /* { dg-error {initialization of 'int8x8_t \*' from incompatible pointer type 'float \*'} } */ ++ /* ??? For these it would be better to print an aka for 'int16x4_t'. 
*/ ++ myvec y3 = x; /* { dg-error {incompatible types when initializing type 'myvec' using type 'float'} } */ ++ myvec *ptr3 = &x; /* { dg-error {initialization of 'myvec \*' from incompatible pointer type 'float \*'} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/fmul_scvtf_1.c b/gcc/testsuite/gcc.target/aarch64/fmul_scvtf_1.c +new file mode 100644 +index 000000000..8bfe06ac3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/fmul_scvtf_1.c +@@ -0,0 +1,140 @@ ++/* { dg-do run } */ ++/* { dg-options "-save-temps -O2 -fno-inline" } */ ++ ++#define FUNC_DEFS(__a) \ ++float \ ++fsfoo##__a (int x) \ ++{ \ ++ return ((float) x)/(1lu << __a); \ ++} \ ++float \ ++fusfoo##__a (unsigned int x) \ ++{ \ ++ return ((float) x)/(1lu << __a); \ ++} \ ++float \ ++fslfoo##__a (long long x) \ ++{ \ ++ return ((float) x)/(1lu << __a); \ ++} \ ++float \ ++fulfoo##__a (unsigned long long x) \ ++{ \ ++ return ((float) x)/(1lu << __a); \ ++} \ ++ ++#define FUNC_DEFD(__a) \ ++double \ ++dsfoo##__a (int x) \ ++{ \ ++ return ((double) x)/(1lu << __a); \ ++} \ ++double \ ++dusfoo##__a (unsigned int x) \ ++{ \ ++ return ((double) x)/(1lu << __a); \ ++} \ ++double \ ++dslfoo##__a (long long x) \ ++{ \ ++ return ((double) x)/(1lu << __a); \ ++} \ ++double \ ++dulfoo##__a (unsigned long long x) \ ++{ \ ++ return ((double) x)/(1lu << __a); \ ++} ++ ++FUNC_DEFS (4) ++ /* { dg-final { scan-assembler-times "scvtf\ts\[0-9\], w\[0-9\]*.*#4" 1 } } */ ++ /* { dg-final { scan-assembler-times "ucvtf\ts\[0-9\], w\[0-9\]*.*#4" 1 } } */ ++ /* { dg-final { scan-assembler-times "scvtf\ts\[0-9\], x\[0-9\]*.*#4" 1 } } */ ++ /* { dg-final { scan-assembler-times "ucvtf\ts\[0-9\], x\[0-9\]*.*#4" 1 } } */ ++ ++FUNC_DEFD (4) ++ /* { dg-final { scan-assembler-times "scvtf\td\[0-9\], w\[0-9\]*.*#4" 1 } } */ ++ /* { dg-final { scan-assembler-times "ucvtf\td\[0-9\], w\[0-9\]*.*#4" 1 } } */ ++ /* { dg-final { scan-assembler-times "scvtf\td\[0-9\], x\[0-9\]*.*#4" 1 } } */ ++ /* { dg-final { scan-assembler-times "ucvtf\td\[0-9\], x\[0-9\]*.*#4" 1 } } */ ++ ++FUNC_DEFS (8) ++ /* { dg-final { scan-assembler-times "scvtf\ts\[0-9\], w\[0-9\]*.*#8" 1 } } */ ++ /* { dg-final { scan-assembler-times "ucvtf\ts\[0-9\], w\[0-9\]*.*#8" 1 } } */ ++ /* { dg-final { scan-assembler-times "scvtf\ts\[0-9\], x\[0-9\]*.*#8" 1 } } */ ++ /* { dg-final { scan-assembler-times "ucvtf\ts\[0-9\], x\[0-9\]*.*#8" 1 } } */ ++ ++FUNC_DEFD (8) ++ /* { dg-final { scan-assembler-times "scvtf\td\[0-9\], w\[0-9\]*.*#8" 1 } } */ ++ /* { dg-final { scan-assembler-times "ucvtf\td\[0-9\], w\[0-9\]*.*#8" 1 } } */ ++ /* { dg-final { scan-assembler-times "scvtf\td\[0-9\], x\[0-9\]*.*#8" 1 } } */ ++ /* { dg-final { scan-assembler-times "ucvtf\td\[0-9\], x\[0-9\]*.*#8" 1 } } */ ++ ++FUNC_DEFS (16) ++ /* { dg-final { scan-assembler-times "scvtf\ts\[0-9\], w\[0-9\]*.*#16" 1 } } */ ++ /* { dg-final { scan-assembler-times "ucvtf\ts\[0-9\], w\[0-9\]*.*#16" 1 } } */ ++ /* { dg-final { scan-assembler-times "scvtf\ts\[0-9\], x\[0-9\]*.*#16" 1 } } */ ++ /* { dg-final { scan-assembler-times "ucvtf\ts\[0-9\], x\[0-9\]*.*#16" 1 } } */ ++ ++FUNC_DEFD (16) ++ /* { dg-final { scan-assembler-times "scvtf\td\[0-9\], w\[0-9\]*.*#16" 1 } } */ ++ /* { dg-final { scan-assembler-times "ucvtf\td\[0-9\], w\[0-9\]*.*#16" 1 } } */ ++ /* { dg-final { scan-assembler-times "scvtf\td\[0-9\], x\[0-9\]*.*#16" 1 } } */ ++ /* { dg-final { scan-assembler-times "ucvtf\td\[0-9\], x\[0-9\]*.*#16" 1 } } */ ++ ++FUNC_DEFS (32) ++ /* { dg-final { scan-assembler-times "scvtf\ts\[0-9\], w\[0-9\]*.*#32" 1 } } */ ++ /* { 
dg-final { scan-assembler-times "ucvtf\ts\[0-9\], w\[0-9\]*.*#32" 1 } } */ ++ /* { dg-final { scan-assembler-times "scvtf\ts\[0-9\], x\[0-9\]*.*#32" 1 } } */ ++ /* { dg-final { scan-assembler-times "ucvtf\ts\[0-9\], x\[0-9\]*.*#32" 1 } } */ ++ ++FUNC_DEFD (32) ++ /* { dg-final { scan-assembler-times "scvtf\td\[0-9\], w\[0-9\]*.*#32" 1 } } */ ++ /* { dg-final { scan-assembler-times "ucvtf\td\[0-9\], w\[0-9\]*.*#32" 1 } } */ ++ /* { dg-final { scan-assembler-times "scvtf\td\[0-9\], x\[0-9\]*.*#32" 1 } } */ ++ /* { dg-final { scan-assembler-times "ucvtf\td\[0-9\], x\[0-9\]*.*#32" 1 } } */ ++ ++#define FUNC_TESTS(__a, __b) \ ++do \ ++{ \ ++ if (fsfoo##__a (__b) != ((int) i) * (1.0f/(1lu << __a)) ) \ ++ __builtin_abort (); \ ++ if (fusfoo##__a (__b) != ((int) i) * (1.0f/(1lu << __a)) ) \ ++ __builtin_abort (); \ ++ if (fslfoo##__a (__b) != ((int) i) * (1.0f/(1lu << __a)) ) \ ++ __builtin_abort (); \ ++ if (fulfoo##__a (__b) != ((int) i) * (1.0f/(1lu << __a)) ) \ ++ __builtin_abort (); \ ++} while (0) ++ ++#define FUNC_TESTD(__a, __b) \ ++do \ ++{ \ ++ if (dsfoo##__a (__b) != ((int) i) * (1.0d/(1lu << __a)) ) \ ++ __builtin_abort (); \ ++ if (dusfoo##__a (__b) != ((int) i) * (1.0d/(1lu << __a)) ) \ ++ __builtin_abort (); \ ++ if (dslfoo##__a (__b) != ((int) i) * (1.0d/(1lu << __a)) ) \ ++ __builtin_abort (); \ ++ if (dulfoo##__a (__b) != ((int) i) * (1.0d/(1lu << __a)) ) \ ++ __builtin_abort (); \ ++} while (0) ++ ++int ++main (void) ++{ ++ int i; ++ ++ for (i = 0; i < 32; i ++) ++ { ++ FUNC_TESTS (4, i); ++ FUNC_TESTS (8, i); ++ FUNC_TESTS (16, i); ++ FUNC_TESTS (32, i); ++ ++ FUNC_TESTD (4, i); ++ FUNC_TESTD (8, i); ++ FUNC_TESTD (16, i); ++ FUNC_TESTD (32, i); ++ } ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/pr88834.c b/gcc/testsuite/gcc.target/aarch64/pr88834.c +new file mode 100644 +index 000000000..ea00967ef +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/pr88834.c +@@ -0,0 +1,15 @@ ++/* { dg-do compile } */ ++/* { dg-options "-S -O3 -march=armv8.2-a+sve" } */ ++ ++void ++f (int *restrict x, int *restrict y, int *restrict z, int n) ++{ ++ for (int i = 0; i < n; i += 2) ++ { ++ x[i] = y[i] + z[i]; ++ x[i + 1] = y[i + 1] - z[i + 1]; ++ } ++} ++ ++/* { dg-final { scan-assembler-times {\tld2w\t{z[0-9]+.s - z[0-9]+.s}, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 2\]\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tst2w\t{z[0-9]+.s - z[0-9]+.s}, p[0-7], \[x[0-9]+, x[0-9]+, lsl 2\]\n} 1 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_2.c b/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_2.c +new file mode 100644 +index 000000000..fa2267598 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_2.c +@@ -0,0 +1,215 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2" } */ ++ ++#pragma GCC push_options ++#pragma GCC target ("arch=armv8-a") ++ ++#pragma GCC push_options ++#pragma GCC target ("arch=armv8-a+tme") ++#ifndef __ARM_FEATURE_TME ++#error "__ARM_FEATURE_TME is not defined but should be!" ++#endif ++ ++#pragma GCC pop_options ++ ++#ifdef __ARM_FEATURE_TME ++#error "__ARM_FEATURE_TME is defined but should not be!" ++#endif ++ ++/* Test Armv8.6-A features. */ ++ ++#ifdef __ARM_FEATURE_MATMUL_INT8 ++#error "__ARM_FEATURE_MATMUL_INT8 is defined but should not be!" ++#endif ++#ifdef __ARM_FEATURE_SVE_MATMUL_INT8 ++#error "__ARM_FEATURE_SVE_MATMUL_INT8 is defined but should not be!" ++#endif ++#ifdef __ARM_FEATURE_SVE_MATMUL_FP32 ++#error "__ARM_FEATURE_SVE_MATMUL_FP32 is defined but should not be!" 
++#endif ++#ifdef __ARM_FEATURE_SVE_MATMUL_FP64 ++#error "__ARM_FEATURE_SVE_MATMUL_FP64 is defined but should not be!" ++#endif ++ ++#pragma GCC push_options ++#pragma GCC target ("arch=armv8.6-a") ++#ifndef __ARM_FEATURE_MATMUL_INT8 ++#error "__ARM_FEATURE_MATMUL_INT8 is not defined but should be!" ++#endif ++#ifdef __ARM_FEATURE_SVE ++#error "__ARM_FEATURE_SVE is defined but should not be!" ++#endif ++#ifdef __ARM_FEATURE_SVE_MATMUL_INT8 ++#error "__ARM_FEATURE_SVE_MATMUL_INT8 is defined but should not be!" ++#endif ++#ifdef __ARM_FEATURE_SVE_MATMUL_FP32 ++#error "__ARM_FEATURE_SVE_MATMUL_FP32 is defined but should not be!" ++#endif ++#ifdef __ARM_FEATURE_SVE_MATMUL_FP64 ++#error "__ARM_FEATURE_SVE_MATMUL_FP64 is defined but should not be!" ++#endif ++#pragma GCC pop_options ++ ++#pragma GCC push_options ++#pragma GCC target ("arch=armv8.6-a+sve") ++#ifndef __ARM_FEATURE_MATMUL_INT8 ++#error "__ARM_FEATURE_MATMUL_INT8 is not defined but should be!" ++#endif ++#ifndef __ARM_FEATURE_SVE ++#error "__ARM_FEATURE_SVE is not defined but should be!" ++#endif ++#ifndef __ARM_FEATURE_SVE_MATMUL_INT8 ++#error "__ARM_FEATURE_SVE_MATMUL_INT8 is not defined but should be!" ++#endif ++#ifdef __ARM_FEATURE_SVE_MATMUL_FP32 ++#error "__ARM_FEATURE_SVE_MATMUL_FP32 is defined but should not be!" ++#endif ++#ifdef __ARM_FEATURE_SVE_MATMUL_FP64 ++#error "__ARM_FEATURE_SVE_MATMUL_FP64 is defined but should not be!" ++#endif ++#pragma GCC pop_pragma ++ ++#pragma GCC push_options ++#pragma GCC target ("arch=armv8.2-a+i8mm") ++#ifndef __ARM_FEATURE_MATMUL_INT8 ++#error "__ARM_FEATURE_MATMUL_INT8 is not defined but should be!" ++#endif ++#ifdef __ARM_FEATURE_SVE ++#error "__ARM_FEATURE_SVE is defined but should not be!" ++#endif ++#ifdef __ARM_FEATURE_SVE_MATMUL_INT8 ++#error "__ARM_FEATURE_SVE_MATMUL_INT8 is defined but should not be!" ++#endif ++#pragma GCC pop_options ++ ++#pragma GCC push_options ++#pragma GCC target ("arch=armv8.2-a+i8mm+sve") ++#ifndef __ARM_FEATURE_MATMUL_INT8 ++#error "__ARM_FEATURE_MATMUL_INT8 is not defined but should be!" ++#endif ++#ifndef __ARM_FEATURE_SVE ++#error "__ARM_FEATURE_SVE is not defined but should be!" ++#endif ++#ifndef __ARM_FEATURE_SVE_MATMUL_INT8 ++#error "__ARM_FEATURE_SVE_MATMUL_INT8 is not defined but should be!" ++#endif ++#ifdef __ARM_FEATURE_SVE_MATMUL_FP32 ++#error "__ARM_FEATURE_SVE_MATMUL_FP32 is defined but should not be!" ++#endif ++#ifdef __ARM_FEATURE_SVE_MATMUL_FP64 ++#error "__ARM_FEATURE_SVE_MATMUL_FP64 is defined but should not be!" ++#endif ++#pragma GCC pop_options ++ ++#pragma GCC push_options ++#pragma GCC target ("arch=armv8.2-a+f32mm") ++#ifndef __ARM_FEATURE_SVE ++#error "__ARM_FEATURE_SVE is not defined but should be!" ++#endif ++#ifdef __ARM_FEATURE_SVE_MATMUL_INT8 ++#error "__ARM_FEATURE_SVE_MATMUL_INT8 is defined but should not be!" ++#endif ++#ifndef __ARM_FEATURE_SVE_MATMUL_FP32 ++#error "__ARM_FEATURE_SVE_MATMUL_FP32 is not defined but should be!" ++#endif ++#ifdef __ARM_FEATURE_SVE_MATMUL_FP64 ++#error "__ARM_FEATURE_SVE_MATMUL_FP64 is defined but should not be!" ++#endif ++#pragma GCC pop_pragma ++ ++#pragma GCC push_options ++#pragma GCC target ("arch=armv8.2-a+f64mm") ++#ifndef __ARM_FEATURE_SVE ++#error "__ARM_FEATURE_SVE is not defined but should be!" ++#endif ++#ifdef __ARM_FEATURE_SVE_MATMUL_INT8 ++#error "__ARM_FEATURE_SVE_MATMUL_INT8 is defined but should not be!" ++#endif ++#ifdef __ARM_FEATURE_SVE_MATMUL_FP32 ++#error "__ARM_FEATURE_SVE_MATMUL_FP32 is defined but should not be!" 
++#endif ++#ifndef __ARM_FEATURE_SVE_MATMUL_FP64 ++#error "__ARM_FEATURE_SVE_MATMUL_FP64 is not defined but should be!" ++#endif ++#pragma GCC pop_options ++ ++#pragma GCC push_options ++#pragma GCC target ("arch=armv8.6-a+nosimd") ++#ifdef __ARM_FEATURE_MATMUL_INT8 ++#error "__ARM_FEATURE_MATMUL_INT8 is defined but should not be!" ++#endif ++#ifdef __ARM_FEATURE_SVE_MATMUL_FP32 ++#error "__ARM_FEATURE_SVE_MATMUL_FP32 is defined but should not be!" ++#endif ++#ifdef __ARM_FEATURE_SVE_MATMUL_FP64 ++#error "__ARM_FEATURE_SVE_MATMUL_FP64 is defined but should not be!" ++#endif ++#pragma GCC pop_options ++ ++#pragma GCC push_options ++#pragma GCC target ("arch=armv8.6-a+nofp") ++#ifdef __ARM_FEATURE_MATMUL_INT8 ++#error "__ARM_FEATURE_MATMUL_INT8 is defined but should not be!" ++#endif ++#ifdef __ARM_FEATURE_SVE_MATMUL_FP32 ++#error "__ARM_FEATURE_SVE_MATMUL_FP32 is defined but should not be!" ++#endif ++#ifdef __ARM_FEATURE_SVE_MATMUL_FP64 ++#error "__ARM_FEATURE_SVE_MATMUL_FP64 is defined but should not be!" ++#endif ++#pragma GCC pop_options ++ ++#ifdef __ARM_FEATURE_BF16_SCALAR_ARITHMETIC ++#error "__ARM_FEATURE_BF16_SCALAR_ARITHMETIC is defined but should not be!" ++#endif ++#ifdef __ARM_FEATURE_BF16_VECTOR_ARITHMETIC ++#error "__ARM_FEATURE_BF16_VECTOR_ARITHMETIC is defined but should not be!" ++#endif ++ ++#pragma GCC push_options ++#pragma GCC target ("arch=armv8.6-a") ++#ifndef __ARM_FEATURE_BF16_SCALAR_ARITHMETIC ++#error "__ARM_FEATURE_BF16_SCALAR_ARITHMETIC is not defined but should be!" ++#endif ++#ifndef __ARM_FEATURE_BF16_VECTOR_ARITHMETIC ++#error "__ARM_FEATURE_BF16_VECTOR_ARITHMETIC is not defined but should be!" ++#endif ++#pragma GCC pop_options ++ ++#pragma GCC push_options ++#pragma GCC target ("arch=armv8.2-a+bf16") ++#ifndef __ARM_FEATURE_BF16_SCALAR_ARITHMETIC ++#error "__ARM_FEATURE_BF16_SCALAR_ARITHMETIC is not defined but should be!" ++#endif ++#ifndef __ARM_FEATURE_BF16_VECTOR_ARITHMETIC ++#error "__ARM_FEATURE_BF16_VECTOR_ARITHMETIC is not defined but should be!" ++#endif ++#pragma GCC pop_options ++ ++#pragma GCC push_options ++#pragma GCC target ("arch=armv8.2-a+bf16+nosimd") ++#ifndef __ARM_FEATURE_BF16_SCALAR_ARITHMETIC ++#error "__ARM_FEATURE_BF16_SCALAR_ARITHMETIC is not defined but should be!" ++#endif ++#ifdef __ARM_FEATURE_BF16_VECTOR_ARITHMETIC ++#error "__ARM_FEATURE_BF16_VECTOR_ARITHMETIC is defined but should not be!" ++#endif ++#pragma GCC pop_options ++ ++#pragma GCC push_options ++#pragma GCC target ("arch=armv8.6-a+nofp") ++#ifdef __ARM_FEATURE_BF16_SCALAR_ARITHMETIC ++#error "__ARM_FEATURE_BF16_SCALAR_ARITHMETIC is defined but should not be!" ++#endif ++#ifdef __ARM_FEATURE_BF16_VECTOR_ARITHMETIC ++#error "__ARM_FEATURE_BF16_VECTOR_ARITHMETIC is defined but should not be!" 
++#endif ++#pragma GCC pop_options ++ ++#pragma GCC pop_options ++ ++int ++foo (int a) ++{ ++ return a; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/signbitv2sf.c b/gcc/testsuite/gcc.target/aarch64/signbitv2sf.c +new file mode 100644 +index 000000000..2587bfedd +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/signbitv2sf.c +@@ -0,0 +1,40 @@ ++/* { dg-do run } */ ++/* { dg-additional-options "-O3 --save-temps" } */ ++ ++extern void abort (); ++ ++#define N 8 ++float in[N] = {1.0, -1.0, -2.0, 3.0, -5.0, -8.0, 13.0, 21.0}; ++int out[N]; ++ ++void ++foo (int *i, float *f) ++{ ++ i[0] = __builtin_signbit (f[0]); ++ i[1] = __builtin_signbit (f[1]); ++} ++ ++/* { dg-final { scan-assembler-not {-2147483648} } } */ ++/* { dg-final { scan-assembler {\tushr\tv[0-9]+.2s, v[0-9]+.2s, 31} } } */ ++ ++int ++main () ++{ ++ int i; ++ ++ foo (out, in); ++ foo (out + 2, in + 2); ++ foo (out + 4, in + 4); ++ foo (out + 6, in + 6); ++ ++ for (i = 0; i < N; i++) ++ { ++ if (in[i] >= 0.0 && out[i]) ++ abort (); ++ if (in[i] < 0.0 && !out[i]) ++ abort (); ++ } ++ ++ return 0; ++} ++ +diff --git a/gcc/testsuite/gcc.target/aarch64/signbitv4sf.c b/gcc/testsuite/gcc.target/aarch64/signbitv4sf.c +new file mode 100644 +index 000000000..18cffdc7d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/signbitv4sf.c +@@ -0,0 +1,38 @@ ++/* { dg-do run } */ ++/* { dg-additional-options "-O3 --save-temps" } */ ++ ++extern void abort (); ++ ++#define N 1024 ++float in[N] = {1.0, -1.0, -2.0, 3.0, -5.0, -8.0, 13.0, 21.0}; ++int out[N]; ++ ++void ++foo () ++{ ++ int i; ++ for (i = 0; i < N; i++) ++ out[i] = __builtin_signbit (in[i]); ++} ++ ++/* { dg-final { scan-assembler-not {-2147483648} } } */ ++/* { dg-final { scan-assembler {\tushr\tv[0-9]+.4s, v[0-9]+.4s, 31} } } */ ++ ++int ++main () ++{ ++ int i; ++ ++ foo (); ++ ++ for (i = 0; i < N; i++) ++ { ++ if (in[i] >= 0.0 && out[i]) ++ abort (); ++ if (in[i] < 0.0 && !out[i]) ++ abort (); ++ } ++ ++ return 0; ++} ++ +diff --git a/gcc/testsuite/gcc.target/aarch64/simd/ssra.c b/gcc/testsuite/gcc.target/aarch64/simd/ssra.c +new file mode 100644 +index 000000000..e9c2e04c0 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/simd/ssra.c +@@ -0,0 +1,36 @@ ++/* { dg-do compile { target aarch64*-*-* } } */ ++/* { dg-options "-O3" } */ ++/* { dg-skip-if "" { *-*-* } {"*sve*"} {""} } */ ++ ++#include ++ ++#define SSRA(func, vtype, n) \ ++ void func () \ ++ { \ ++ int i; \ ++ for (i = 0; i < n; i++) \ ++ { \ ++ s1##vtype[i] += s2##vtype[i] >> 2; \ ++ } \ ++ } ++ ++#define TEST_VDQ_I_MODES(FUNC) \ ++ FUNC (test_v8qi_v16qi, _char, 16) \ ++ FUNC (test_v4hi_v8h1, _short, 8) \ ++ FUNC (test_v2si_v4si, _int, 4) \ ++ FUNC (test_v2di, _ll, 2) \ ++ ++int8_t s1_char[16], s2_char[16]; ++int16_t s1_short[8], s2_short[8]; ++int32_t s1_int[4], s2_int[4]; ++int64_t s1_ll[2], s2_ll[2]; ++ ++TEST_VDQ_I_MODES(SSRA) ++ ++/* { dg-final { scan-assembler "ssra" } } */ ++/* { dg-final { scan-assembler-not "sshr" } } */ ++ ++/* { dg-final { scan-assembler-times {ssra\tv[0-9]+\.16b, v[0-9]+\.16b, [0-9]+} 1 } } */ ++/* { dg-final { scan-assembler-times {ssra\tv[0-9]+\.8h, v[0-9]+\.8h, [0-9]+} 1 } } */ ++/* { dg-final { scan-assembler-times {ssra\tv[0-9]+\.4s, v[0-9]+\.4s, [0-9]+} 1 } } */ ++/* { dg-final { scan-assembler-times {ssra\tv[0-9]+\.2d, v[0-9]+\.2d, [0-9]+} 1 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/simd/usra.c b/gcc/testsuite/gcc.target/aarch64/simd/usra.c +new file mode 100644 +index 000000000..4e7446dfa +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/simd/usra.c +@@ 
-0,0 +1,36 @@ ++/* { dg-do compile { target aarch64*-*-* } } */ ++/* { dg-options "-O3" } */ ++/* { dg-skip-if "" { *-*-* } {"*sve*"} {""} } */ ++ ++#include ++ ++#define USRA(func, vtype, n) \ ++ void func () \ ++ { \ ++ int i; \ ++ for (i = 0; i < n; i++) \ ++ { \ ++ u1##vtype[i] += u2##vtype[i] >> 2; \ ++ } \ ++ } ++ ++#define TEST_VDQ_I_MODES(FUNC) \ ++ FUNC (test_v8qi_v16qi, _char, 16) \ ++ FUNC (test_v4hi_v8h1, _short, 8) \ ++ FUNC (test_v2si_v4si, _int, 4) \ ++ FUNC (test_v2di, _ll, 2) \ ++ ++uint8_t u1_char[16], u2_char[16]; ++uint16_t u1_short[8], u2_short[8]; ++uint32_t u1_int[4], u2_int[4]; ++uint64_t u1_ll[2], u2_ll[2]; ++ ++TEST_VDQ_I_MODES(USRA) ++ ++/* { dg-final { scan-assembler "usra" } } */ ++/* { dg-final { scan-assembler-not "ushr" } } */ ++ ++/* { dg-final { scan-assembler-times {usra\tv[0-9]+\.16b, v[0-9]+\.16b, [0-9]+} 1 } } */ ++/* { dg-final { scan-assembler-times {usra\tv[0-9]+\.8h, v[0-9]+\.8h, [0-9]+} 1 } } */ ++/* { dg-final { scan-assembler-times {usra\tv[0-9]+\.4s, v[0-9]+\.4s, [0-9]+} 1 } } */ ++/* { dg-final { scan-assembler-times {usra\tv[0-9]+\.2d, v[0-9]+\.2d, [0-9]+} 1 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vmmla.c b/gcc/testsuite/gcc.target/aarch64/simd/vmmla.c +new file mode 100644 +index 000000000..5eec2b5cf +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/simd/vmmla.c +@@ -0,0 +1,27 @@ ++/* { dg-do assemble} */ ++/* { dg-require-effective-target arm_v8_2a_i8mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+i8mm" } */ ++ ++#include "arm_neon.h" ++ ++int32x4_t ++test_vmmlaq_s32 (int32x4_t r, int8x16_t a, int8x16_t b) ++{ ++ return vmmlaq_s32 (r, a, b); ++} ++ ++uint32x4_t ++test_vmmlaq_u32 (uint32x4_t r, uint8x16_t a, uint8x16_t b) ++{ ++ return vmmlaq_u32 (r, a, b); ++} ++ ++int32x4_t ++test_vusmmlaq_s32 (int32x4_t r, uint8x16_t a, int8x16_t b) ++{ ++ return vusmmlaq_s32 (r, a, b); ++} ++ ++/* { dg-final { scan-assembler-times {\tsmmla\tv[0-9]+.4s, v[0-9]+.16b, v[0-9]+.16b} 1 } } */ ++/* { dg-final { scan-assembler-times {\tummla\tv[0-9]+.4s, v[0-9]+.16b, v[0-9]+.16b} 1 } } */ ++/* { dg-final { scan-assembler-times {\tusmmla\tv[0-9]+.4s, v[0-9]+.16b, v[0-9]+.16b} 1 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vrndnzx_1.c b/gcc/testsuite/gcc.target/aarch64/simd/vrndnzx_1.c +new file mode 100644 +index 000000000..0399b838d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/simd/vrndnzx_1.c +@@ -0,0 +1,137 @@ ++/* Test the vrnd[32,64][z,x] intrinsics. 
*/ ++ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -march=armv8.5-a" } */ ++ ++#include "arm_neon.h" ++ ++#ifdef __ARM_FEATURE_FRINT ++ ++float32x2_t ++foo_32z (float32x2_t a) ++{ ++ return vrnd32z_f32 (a); ++} ++ ++/* { dg-final { scan-assembler-times "frint32z\tv\[0-9\]+\.2s, v\[0-9\]+\.2s\n" 1 } } */ ++ ++float32x4_t ++foo_32z_q (float32x4_t a) ++{ ++ return vrnd32zq_f32 (a); ++} ++ ++/* { dg-final { scan-assembler-times "frint32z\tv\[0-9\]+\.4s, v\[0-9\]+\.4s\n" 1 } } */ ++ ++float64x1_t ++foo_32z_f64 (float64x1_t a) ++{ ++ return vrnd32z_f64 (a); ++} ++ ++/* { dg-final { scan-assembler-times "frint32z\td\[0-9\]+, d\[0-9\]+\n" 1 } } */ ++ ++float64x2_t ++foo_32z_q_f64 (float64x2_t a) ++{ ++ return vrnd32zq_f64 (a); ++} ++ ++/* { dg-final { scan-assembler-times "frint32z\tv\[0-9\]+\.2d, v\[0-9\]+\.2d\n" 1 } } */ ++ ++float32x2_t ++foo_32x (float32x2_t a) ++{ ++ return vrnd32x_f32 (a); ++} ++ ++/* { dg-final { scan-assembler-times "frint32x\tv\[0-9\]+\.2s, v\[0-9\]+\.2s\n" 1 } } */ ++ ++float32x4_t ++foo_32x_q (float32x4_t a) ++{ ++ return vrnd32xq_f32 (a); ++} ++ ++/* { dg-final { scan-assembler-times "frint32x\tv\[0-9\]+\.4s, v\[0-9\]+\.4s\n" 1 } } */ ++ ++float64x1_t ++foo_32x_f64 (float64x1_t a) ++{ ++ return vrnd32x_f64 (a); ++} ++ ++/* { dg-final { scan-assembler-times "frint32x\td\[0-9\]+, d\[0-9\]+\n" 1 } } */ ++ ++float64x2_t ++foo_32x_q_f64 (float64x2_t a) ++{ ++ return vrnd32xq_f64 (a); ++} ++ ++/* { dg-final { scan-assembler-times "frint32x\tv\[0-9\]+\.2d, v\[0-9\]+\.2d\n" 1 } } */ ++ ++float32x2_t ++foo_64z (float32x2_t a) ++{ ++ return vrnd64z_f32 (a); ++} ++ ++/* { dg-final { scan-assembler-times "frint64z\tv\[0-9\]+\.2s, v\[0-9\]+\.2s\n" 1 } } */ ++ ++float32x4_t ++foo_64z_q (float32x4_t a) ++{ ++ return vrnd64zq_f32 (a); ++} ++ ++/* { dg-final { scan-assembler-times "frint64z\tv\[0-9\]+\.4s, v\[0-9\]+\.4s\n" 1 } } */ ++ ++float64x1_t ++foo_64z_f64 (float64x1_t a) ++{ ++ return vrnd64z_f64 (a); ++} ++ ++/* { dg-final { scan-assembler-times "frint64z\td\[0-9\]+, d\[0-9\]+\n" 1 } } */ ++ ++float64x2_t ++foo_64z_q_f64 (float64x2_t a) ++{ ++ return vrnd64zq_f64 (a); ++} ++ ++/* { dg-final { scan-assembler-times "frint64z\tv\[0-9\]+\.2d, v\[0-9\]+\.2d\n" 1 } } */ ++ ++float32x2_t ++foo_64x (float32x2_t a) ++{ ++ return vrnd64x_f32 (a); ++} ++ ++/* { dg-final { scan-assembler-times "frint64x\tv\[0-9\]+\.2s, v\[0-9\]+\.2s\n" 1 } } */ ++ ++float32x4_t ++foo_64x_q (float32x4_t a) ++{ ++ return vrnd64xq_f32 (a); ++} ++ ++/* { dg-final { scan-assembler-times "frint64x\tv\[0-9\]+\.4s, v\[0-9\]+\.4s\n" 1 } } */ ++ ++float64x1_t ++foo_64x_f64 (float64x1_t a) ++{ ++ return vrnd64x_f64 (a); ++} ++ ++/* { dg-final { scan-assembler-times "frint64x\td\[0-9\]+, d\[0-9\]+\n" 1 } } */ ++ ++float64x2_t ++foo_64x_q_f64 (float64x2_t a) ++{ ++ return vrnd64xq_f64 (a); ++} ++ ++/* { dg-final { scan-assembler-times "frint64x\tv\[0-9\]+\.2d, v\[0-9\]+\.2d\n" 1 } } */ ++#endif +diff --git a/gcc/testsuite/gcc.target/aarch64/ssadv16qi-dotprod.c b/gcc/testsuite/gcc.target/aarch64/ssadv16qi-dotprod.c +new file mode 100644 +index 000000000..08b6831cf +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/ssadv16qi-dotprod.c +@@ -0,0 +1,31 @@ ++/* { dg-do compile } */ ++/* { dg-require-effective-target arm_v8_2a_dotprod_neon_ok } */ ++/* { dg-add-options arm_v8_2a_dotprod_neon } */ ++/* { dg-additional-options "-O3" } */ ++ ++#pragma GCC target "+nosve" ++ ++#define N 1024 ++ ++signed char pix1[N], pix2[N]; ++ ++int foo (void) ++{ ++ int i_sum = 0; ++ int i; ++ ++ for (i = 0; i < N; i++) ++ i_sum += 
__builtin_abs (pix1[i] - pix2[i]); ++ ++ return i_sum; ++} ++ ++/* { dg-final { scan-assembler-not {\tsshll\t} } } */ ++/* { dg-final { scan-assembler-not {\tsshll2\t} } } */ ++/* { dg-final { scan-assembler-not {\tssubl\t} } } */ ++/* { dg-final { scan-assembler-not {\tssubl2\t} } } */ ++/* { dg-final { scan-assembler-not {\tabs\t} } } */ ++ ++/* { dg-final { scan-assembler {\tsabd\t} } } */ ++/* { dg-final { scan-assembler {\tudot\t} } } */ ++ +diff --git a/gcc/testsuite/gcc.target/aarch64/ssadv16qi.c b/gcc/testsuite/gcc.target/aarch64/ssadv16qi.c +index 40b288436..85a867a11 100644 +--- a/gcc/testsuite/gcc.target/aarch64/ssadv16qi.c ++++ b/gcc/testsuite/gcc.target/aarch64/ssadv16qi.c +@@ -1,7 +1,7 @@ + /* { dg-do compile } */ + /* { dg-options "-O3" } */ + +-#pragma GCC target "+nosve" ++#pragma GCC target "+nosve+nodotprod" + + #define N 1024 + +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/aarch64-sve-acle-asm.exp b/gcc/testsuite/gcc.target/aarch64/sve/acle/aarch64-sve-acle-asm.exp +new file mode 100644 +index 000000000..7ce85a414 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/aarch64-sve-acle-asm.exp +@@ -0,0 +1,79 @@ ++# Assembly-based regression-test driver for the SVE ACLE ++# Copyright (C) 2009-2019 Free Software Foundation, Inc. ++# ++# This file is part of GCC. ++# ++# GCC is free software; you can redistribute it and/or modify it ++# under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 3, or (at your option) ++# any later version. ++# ++# GCC is distributed in the hope that it will be useful, but ++# WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++# General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with GCC; see the file COPYING3. If not see ++# . */ ++ ++# GCC testsuite that uses the `dg.exp' driver. ++ ++# Exit immediately if this isn't an AArch64 target. ++if {![istarget aarch64*-*-*] } { ++ return ++} ++ ++# Load support procs. ++load_lib gcc-dg.exp ++ ++# Initialize `dg'. ++dg-init ++ ++# Force SVE if we're not testing it already. ++if { [check_effective_target_aarch64_sve] } { ++ set sve_flags "" ++} else { ++ set sve_flags "-march=armv8.2-a+sve" ++} ++ ++global gcc_runtest_parallelize_limit_minor ++if { [info exists gcc_runtest_parallelize_limit_minor] } { ++ set old_limit_minor $gcc_runtest_parallelize_limit_minor ++ set gcc_runtest_parallelize_limit_minor 1 ++} ++ ++torture-init ++set-torture-options { ++ "-std=c90 -O0 -g" ++ "-std=c90 -O1 -g" ++ "-std=c99 -O2 -g" ++ "-std=c11 -O3 -g" ++ "-std=gnu90 -O2 -fno-schedule-insns -DCHECK_ASM --save-temps" ++ "-std=gnu99 -Ofast -g" ++ "-std=gnu11 -Os -g" ++} { ++ "-DTEST_FULL" ++ "-DTEST_OVERLOADS" ++} ++ ++# Main loop. ++set files [glob -nocomplain $srcdir/$subdir/asm/*.c] ++set save-dg-do-what-default ${dg-do-what-default} ++if { [check_effective_target_aarch64_asm_sve_ok] ++ && [check_effective_target_aarch64_variant_pcs] } { ++ set dg-do-what-default assemble ++} else { ++ set dg-do-what-default compile ++} ++gcc-dg-runtest [lsort $files] "" "$sve_flags -fno-ipa-icf" ++set dg-do-what-default ${save-dg-do-what-default} ++ ++torture-finish ++ ++if { [info exists gcc_runtest_parallelize_limit_minor] } { ++ set gcc_runtest_parallelize_limit_minor $old_limit_minor ++} ++ ++# All done. 
++dg-finish +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/aarch64-sve-acle.exp b/gcc/testsuite/gcc.target/aarch64/sve/acle/aarch64-sve-acle.exp +new file mode 100644 +index 000000000..34d9dfd43 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/aarch64-sve-acle.exp +@@ -0,0 +1,54 @@ ++# Specific regression driver for AArch64 SVE. ++# Copyright (C) 2009-2019 Free Software Foundation, Inc. ++# Contributed by ARM Ltd. ++# ++# This file is part of GCC. ++# ++# GCC is free software; you can redistribute it and/or modify it ++# under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 3, or (at your option) ++# any later version. ++# ++# GCC is distributed in the hope that it will be useful, but ++# WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++# General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with GCC; see the file COPYING3. If not see ++# . */ ++ ++# GCC testsuite that uses the `dg.exp' driver. ++ ++# Exit immediately if this isn't an AArch64 target. ++if {![istarget aarch64*-*-*] } { ++ return ++} ++ ++# Load support procs. ++load_lib gcc-dg.exp ++ ++# If a testcase doesn't have special options, use these. ++global DEFAULT_CFLAGS ++if ![info exists DEFAULT_CFLAGS] then { ++ set DEFAULT_CFLAGS " -ansi -pedantic-errors" ++} ++ ++# Initialize `dg'. ++dg-init ++ ++# Force SVE if we're not testing it already. ++if { [check_effective_target_aarch64_sve] } { ++ set sve_flags "" ++} else { ++ set sve_flags "-march=armv8.2-a+sve" ++} ++ ++# Main loop. ++# FIXME: This should include general/*.c too, but leave that until the ++# C frontend allows initialization of SVE vectors. ++set files [glob -nocomplain $srcdir/$subdir/general-c/*.c] ++dg-runtest [lsort $files] "$sve_flags" $DEFAULT_CFLAGS ++ ++# All done. 
++dg-finish +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_f16.c +new file mode 100644 +index 000000000..c019f248d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_f16.c +@@ -0,0 +1,552 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** abd_f16_m_tied1: ++** fabd z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (abd_f16_m_tied1, svfloat16_t, ++ z0 = svabd_f16_m (p0, z0, z1), ++ z0 = svabd_m (p0, z0, z1)) ++ ++/* ++** abd_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fabd z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (abd_f16_m_tied2, svfloat16_t, ++ z0 = svabd_f16_m (p0, z1, z0), ++ z0 = svabd_m (p0, z1, z0)) ++ ++/* ++** abd_f16_m_untied: ++** movprfx z0, z1 ++** fabd z0\.h, p0/m, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (abd_f16_m_untied, svfloat16_t, ++ z0 = svabd_f16_m (p0, z1, z2), ++ z0 = svabd_m (p0, z1, z2)) ++ ++/* ++** abd_h4_f16_m_tied1: ++** mov (z[0-9]+\.h), h4 ++** fabd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (abd_h4_f16_m_tied1, svfloat16_t, __fp16, ++ z0 = svabd_n_f16_m (p0, z0, d4), ++ z0 = svabd_m (p0, z0, d4)) ++ ++/* ++** abd_h4_f16_m_untied: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0, z1 ++** fabd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (abd_h4_f16_m_untied, svfloat16_t, __fp16, ++ z0 = svabd_n_f16_m (p0, z1, d4), ++ z0 = svabd_m (p0, z1, d4)) ++ ++/* ++** abd_1_f16_m_tied1: ++** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? ++** fabd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_f16_m_tied1, svfloat16_t, ++ z0 = svabd_n_f16_m (p0, z0, 1), ++ z0 = svabd_m (p0, z0, 1)) ++ ++/* ++** abd_1_f16_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? ++** movprfx z0, z1 ++** fabd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_f16_m_untied, svfloat16_t, ++ z0 = svabd_n_f16_m (p0, z1, 1), ++ z0 = svabd_m (p0, z1, 1)) ++ ++/* ++** abd_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fabd z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (abd_f16_z_tied1, svfloat16_t, ++ z0 = svabd_f16_z (p0, z0, z1), ++ z0 = svabd_z (p0, z0, z1)) ++ ++/* ++** abd_f16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** fabd z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (abd_f16_z_tied2, svfloat16_t, ++ z0 = svabd_f16_z (p0, z1, z0), ++ z0 = svabd_z (p0, z1, z0)) ++ ++/* ++** abd_f16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fabd z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** fabd z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (abd_f16_z_untied, svfloat16_t, ++ z0 = svabd_f16_z (p0, z1, z2), ++ z0 = svabd_z (p0, z1, z2)) ++ ++/* ++** abd_h4_f16_z_tied1: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0\.h, p0/z, z0\.h ++** fabd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (abd_h4_f16_z_tied1, svfloat16_t, __fp16, ++ z0 = svabd_n_f16_z (p0, z0, d4), ++ z0 = svabd_z (p0, z0, d4)) ++ ++/* ++** abd_h4_f16_z_untied: ++** mov (z[0-9]+\.h), h4 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fabd z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** fabd z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (abd_h4_f16_z_untied, svfloat16_t, __fp16, ++ z0 = svabd_n_f16_z (p0, z1, d4), ++ z0 = svabd_z (p0, z1, d4)) ++ ++/* ++** abd_1_f16_z_tied1: ++** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? 
++** movprfx z0\.h, p0/z, z0\.h ++** fabd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_f16_z_tied1, svfloat16_t, ++ z0 = svabd_n_f16_z (p0, z0, 1), ++ z0 = svabd_z (p0, z0, 1)) ++ ++/* ++** abd_1_f16_z_untied: ++** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fabd z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** fabd z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_f16_z_untied, svfloat16_t, ++ z0 = svabd_n_f16_z (p0, z1, 1), ++ z0 = svabd_z (p0, z1, 1)) ++ ++/* ++** abd_0p5_f16_z_tied1: ++** fmov (z[0-9]+\.h), #(?:0\.5|5\.0e-1) ++** movprfx z0\.h, p0/z, z0\.h ++** fabd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_0p5_f16_z_tied1, svfloat16_t, ++ z0 = svabd_n_f16_z (p0, z0, 0.5), ++ z0 = svabd_z (p0, z0, 0.5)) ++ ++/* ++** abd_0p5_f16_z_untied: ++** fmov (z[0-9]+\.h), #(?:0\.5|5\.0e-1) ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fabd z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** fabd z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (abd_0p5_f16_z_untied, svfloat16_t, ++ z0 = svabd_n_f16_z (p0, z1, 0.5), ++ z0 = svabd_z (p0, z1, 0.5)) ++ ++/* ++** abd_m1_f16_z_tied1: ++** fmov (z[0-9]+\.h), #-1\.0(?:e\+0)? ++** movprfx z0\.h, p0/z, z0\.h ++** fabd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_m1_f16_z_tied1, svfloat16_t, ++ z0 = svabd_n_f16_z (p0, z0, -1), ++ z0 = svabd_z (p0, z0, -1)) ++ ++/* ++** abd_m1_f16_z_untied: ++** fmov (z[0-9]+\.h), #-1\.0(?:e\+0)? ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fabd z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** fabd z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (abd_m1_f16_z_untied, svfloat16_t, ++ z0 = svabd_n_f16_z (p0, z1, -1), ++ z0 = svabd_z (p0, z1, -1)) ++ ++/* ++** abd_m0p5_f16_z_tied1: ++** fmov (z[0-9]+\.h), #-(?:0\.5|5\.0e-1) ++** movprfx z0\.h, p0/z, z0\.h ++** fabd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_m0p5_f16_z_tied1, svfloat16_t, ++ z0 = svabd_n_f16_z (p0, z0, -0.5), ++ z0 = svabd_z (p0, z0, -0.5)) ++ ++/* ++** abd_m0p5_f16_z_untied: ++** fmov (z[0-9]+\.h), #-(?:0\.5|5\.0e-1) ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fabd z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** fabd z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (abd_m0p5_f16_z_untied, svfloat16_t, ++ z0 = svabd_n_f16_z (p0, z1, -0.5), ++ z0 = svabd_z (p0, z1, -0.5)) ++ ++/* ++** abd_m2_f16_z: ++** fmov (z[0-9]+\.h), #-2\.0(?:e\+0)? 
++** movprfx z0\.h, p0/z, z0\.h ++** fabd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_m2_f16_z, svfloat16_t, ++ z0 = svabd_n_f16_z (p0, z0, -2), ++ z0 = svabd_z (p0, z0, -2)) ++ ++/* ++** abd_f16_x_tied1: ++** fabd z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (abd_f16_x_tied1, svfloat16_t, ++ z0 = svabd_f16_x (p0, z0, z1), ++ z0 = svabd_x (p0, z0, z1)) ++ ++/* ++** abd_f16_x_tied2: ++** fabd z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (abd_f16_x_tied2, svfloat16_t, ++ z0 = svabd_f16_x (p0, z1, z0), ++ z0 = svabd_x (p0, z1, z0)) ++ ++/* ++** abd_f16_x_untied: ++** ( ++** movprfx z0, z1 ++** fabd z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0, z2 ++** fabd z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (abd_f16_x_untied, svfloat16_t, ++ z0 = svabd_f16_x (p0, z1, z2), ++ z0 = svabd_x (p0, z1, z2)) ++ ++/* ++** abd_h4_f16_x_tied1: ++** mov (z[0-9]+\.h), h4 ++** fabd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (abd_h4_f16_x_tied1, svfloat16_t, __fp16, ++ z0 = svabd_n_f16_x (p0, z0, d4), ++ z0 = svabd_x (p0, z0, d4)) ++ ++/* ++** abd_h4_f16_x_untied: ++** mov z0\.h, h4 ++** fabd z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZD (abd_h4_f16_x_untied, svfloat16_t, __fp16, ++ z0 = svabd_n_f16_x (p0, z1, d4), ++ z0 = svabd_x (p0, z1, d4)) ++ ++/* ++** abd_1_f16_x_tied1: ++** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? ++** fabd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_f16_x_tied1, svfloat16_t, ++ z0 = svabd_n_f16_x (p0, z0, 1), ++ z0 = svabd_x (p0, z0, 1)) ++ ++/* ++** abd_1_f16_x_untied: ++** fmov z0\.h, #1\.0(?:e\+0)? ++** fabd z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_f16_x_untied, svfloat16_t, ++ z0 = svabd_n_f16_x (p0, z1, 1), ++ z0 = svabd_x (p0, z1, 1)) ++ ++/* ++** abd_0p5_f16_x_tied1: ++** fmov (z[0-9]+\.h), #(?:0\.5|5\.0e-1) ++** fabd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_0p5_f16_x_tied1, svfloat16_t, ++ z0 = svabd_n_f16_x (p0, z0, 0.5), ++ z0 = svabd_x (p0, z0, 0.5)) ++ ++/* ++** abd_0p5_f16_x_untied: ++** fmov z0\.h, #(?:0\.5|5\.0e-1) ++** fabd z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (abd_0p5_f16_x_untied, svfloat16_t, ++ z0 = svabd_n_f16_x (p0, z1, 0.5), ++ z0 = svabd_x (p0, z1, 0.5)) ++ ++/* ++** abd_m1_f16_x_tied1: ++** fmov (z[0-9]+\.h), #-1\.0(?:e\+0)? ++** fabd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_m1_f16_x_tied1, svfloat16_t, ++ z0 = svabd_n_f16_x (p0, z0, -1), ++ z0 = svabd_x (p0, z0, -1)) ++ ++/* ++** abd_m1_f16_x_untied: ++** fmov z0\.h, #-1\.0(?:e\+0)? ++** fabd z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (abd_m1_f16_x_untied, svfloat16_t, ++ z0 = svabd_n_f16_x (p0, z1, -1), ++ z0 = svabd_x (p0, z1, -1)) ++ ++/* ++** abd_m0p5_f16_x_tied1: ++** fmov (z[0-9]+\.h), #-(?:0\.5|5\.0e-1) ++** fabd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_m0p5_f16_x_tied1, svfloat16_t, ++ z0 = svabd_n_f16_x (p0, z0, -0.5), ++ z0 = svabd_x (p0, z0, -0.5)) ++ ++/* ++** abd_m0p5_f16_x_untied: ++** fmov z0\.h, #-(?:0\.5|5\.0e-1) ++** fabd z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (abd_m0p5_f16_x_untied, svfloat16_t, ++ z0 = svabd_n_f16_x (p0, z1, -0.5), ++ z0 = svabd_x (p0, z1, -0.5)) ++ ++/* ++** abd_2_f16_x_tied1: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** fabd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_2_f16_x_tied1, svfloat16_t, ++ z0 = svabd_n_f16_x (p0, z0, 2), ++ z0 = svabd_x (p0, z0, 2)) ++ ++/* ++** abd_2_f16_x_untied: ++** fmov z0\.h, #2\.0(?:e\+0)? 
++** fabd z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (abd_2_f16_x_untied, svfloat16_t, ++ z0 = svabd_n_f16_x (p0, z1, 2), ++ z0 = svabd_x (p0, z1, 2)) ++ ++/* ++** ptrue_abd_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_abd_f16_x_tied1, svfloat16_t, ++ z0 = svabd_f16_x (svptrue_b16 (), z0, z1), ++ z0 = svabd_x (svptrue_b16 (), z0, z1)) ++ ++/* ++** ptrue_abd_f16_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_abd_f16_x_tied2, svfloat16_t, ++ z0 = svabd_f16_x (svptrue_b16 (), z1, z0), ++ z0 = svabd_x (svptrue_b16 (), z1, z0)) ++ ++/* ++** ptrue_abd_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_abd_f16_x_untied, svfloat16_t, ++ z0 = svabd_f16_x (svptrue_b16 (), z1, z2), ++ z0 = svabd_x (svptrue_b16 (), z1, z2)) ++ ++/* ++** ptrue_abd_1_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_abd_1_f16_x_tied1, svfloat16_t, ++ z0 = svabd_n_f16_x (svptrue_b16 (), z0, 1), ++ z0 = svabd_x (svptrue_b16 (), z0, 1)) ++ ++/* ++** ptrue_abd_1_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_abd_1_f16_x_untied, svfloat16_t, ++ z0 = svabd_n_f16_x (svptrue_b16 (), z1, 1), ++ z0 = svabd_x (svptrue_b16 (), z1, 1)) ++ ++/* ++** ptrue_abd_0p5_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_abd_0p5_f16_x_tied1, svfloat16_t, ++ z0 = svabd_n_f16_x (svptrue_b16 (), z0, 0.5), ++ z0 = svabd_x (svptrue_b16 (), z0, 0.5)) ++ ++/* ++** ptrue_abd_0p5_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_abd_0p5_f16_x_untied, svfloat16_t, ++ z0 = svabd_n_f16_x (svptrue_b16 (), z1, 0.5), ++ z0 = svabd_x (svptrue_b16 (), z1, 0.5)) ++ ++/* ++** ptrue_abd_m1_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_abd_m1_f16_x_tied1, svfloat16_t, ++ z0 = svabd_n_f16_x (svptrue_b16 (), z0, -1), ++ z0 = svabd_x (svptrue_b16 (), z0, -1)) ++ ++/* ++** ptrue_abd_m1_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_abd_m1_f16_x_untied, svfloat16_t, ++ z0 = svabd_n_f16_x (svptrue_b16 (), z1, -1), ++ z0 = svabd_x (svptrue_b16 (), z1, -1)) ++ ++/* ++** ptrue_abd_m0p5_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_abd_m0p5_f16_x_tied1, svfloat16_t, ++ z0 = svabd_n_f16_x (svptrue_b16 (), z0, -0.5), ++ z0 = svabd_x (svptrue_b16 (), z0, -0.5)) ++ ++/* ++** ptrue_abd_m0p5_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_abd_m0p5_f16_x_untied, svfloat16_t, ++ z0 = svabd_n_f16_x (svptrue_b16 (), z1, -0.5), ++ z0 = svabd_x (svptrue_b16 (), z1, -0.5)) ++ ++/* ++** ptrue_abd_2_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_abd_2_f16_x_tied1, svfloat16_t, ++ z0 = svabd_n_f16_x (svptrue_b16 (), z0, 2), ++ z0 = svabd_x (svptrue_b16 (), z0, 2)) ++ ++/* ++** ptrue_abd_2_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_abd_2_f16_x_untied, svfloat16_t, ++ z0 = svabd_n_f16_x (svptrue_b16 (), z1, 2), ++ z0 = svabd_x (svptrue_b16 (), z1, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_f32.c +new file mode 100644 +index 000000000..bff37580c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_f32.c +@@ -0,0 +1,552 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** abd_f32_m_tied1: ++** fabd z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (abd_f32_m_tied1, svfloat32_t, ++ z0 = svabd_f32_m (p0, z0, z1), ++ z0 = svabd_m (p0, z0, z1)) ++ ++/* ++** abd_f32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fabd z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (abd_f32_m_tied2, svfloat32_t, ++ z0 = svabd_f32_m (p0, z1, z0), ++ z0 = svabd_m (p0, z1, z0)) ++ ++/* ++** abd_f32_m_untied: ++** movprfx z0, z1 ++** fabd z0\.s, p0/m, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (abd_f32_m_untied, svfloat32_t, ++ z0 = svabd_f32_m (p0, z1, z2), ++ z0 = svabd_m (p0, z1, z2)) ++ ++/* ++** abd_s4_f32_m_tied1: ++** mov (z[0-9]+\.s), s4 ++** fabd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (abd_s4_f32_m_tied1, svfloat32_t, float, ++ z0 = svabd_n_f32_m (p0, z0, d4), ++ z0 = svabd_m (p0, z0, d4)) ++ ++/* ++** abd_s4_f32_m_untied: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0, z1 ++** fabd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (abd_s4_f32_m_untied, svfloat32_t, float, ++ z0 = svabd_n_f32_m (p0, z1, d4), ++ z0 = svabd_m (p0, z1, d4)) ++ ++/* ++** abd_1_f32_m_tied1: ++** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? ++** fabd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_f32_m_tied1, svfloat32_t, ++ z0 = svabd_n_f32_m (p0, z0, 1), ++ z0 = svabd_m (p0, z0, 1)) ++ ++/* ++** abd_1_f32_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? 
++** movprfx z0, z1 ++** fabd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_f32_m_untied, svfloat32_t, ++ z0 = svabd_n_f32_m (p0, z1, 1), ++ z0 = svabd_m (p0, z1, 1)) ++ ++/* ++** abd_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fabd z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (abd_f32_z_tied1, svfloat32_t, ++ z0 = svabd_f32_z (p0, z0, z1), ++ z0 = svabd_z (p0, z0, z1)) ++ ++/* ++** abd_f32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** fabd z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (abd_f32_z_tied2, svfloat32_t, ++ z0 = svabd_f32_z (p0, z1, z0), ++ z0 = svabd_z (p0, z1, z0)) ++ ++/* ++** abd_f32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fabd z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** fabd z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (abd_f32_z_untied, svfloat32_t, ++ z0 = svabd_f32_z (p0, z1, z2), ++ z0 = svabd_z (p0, z1, z2)) ++ ++/* ++** abd_s4_f32_z_tied1: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0\.s, p0/z, z0\.s ++** fabd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (abd_s4_f32_z_tied1, svfloat32_t, float, ++ z0 = svabd_n_f32_z (p0, z0, d4), ++ z0 = svabd_z (p0, z0, d4)) ++ ++/* ++** abd_s4_f32_z_untied: ++** mov (z[0-9]+\.s), s4 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fabd z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** fabd z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (abd_s4_f32_z_untied, svfloat32_t, float, ++ z0 = svabd_n_f32_z (p0, z1, d4), ++ z0 = svabd_z (p0, z1, d4)) ++ ++/* ++** abd_1_f32_z_tied1: ++** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? ++** movprfx z0\.s, p0/z, z0\.s ++** fabd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_f32_z_tied1, svfloat32_t, ++ z0 = svabd_n_f32_z (p0, z0, 1), ++ z0 = svabd_z (p0, z0, 1)) ++ ++/* ++** abd_1_f32_z_untied: ++** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fabd z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** fabd z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_f32_z_untied, svfloat32_t, ++ z0 = svabd_n_f32_z (p0, z1, 1), ++ z0 = svabd_z (p0, z1, 1)) ++ ++/* ++** abd_0p5_f32_z_tied1: ++** fmov (z[0-9]+\.s), #(?:0\.5|5\.0e-1) ++** movprfx z0\.s, p0/z, z0\.s ++** fabd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_0p5_f32_z_tied1, svfloat32_t, ++ z0 = svabd_n_f32_z (p0, z0, 0.5), ++ z0 = svabd_z (p0, z0, 0.5)) ++ ++/* ++** abd_0p5_f32_z_untied: ++** fmov (z[0-9]+\.s), #(?:0\.5|5\.0e-1) ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fabd z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** fabd z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (abd_0p5_f32_z_untied, svfloat32_t, ++ z0 = svabd_n_f32_z (p0, z1, 0.5), ++ z0 = svabd_z (p0, z1, 0.5)) ++ ++/* ++** abd_m1_f32_z_tied1: ++** fmov (z[0-9]+\.s), #-1\.0(?:e\+0)? ++** movprfx z0\.s, p0/z, z0\.s ++** fabd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_m1_f32_z_tied1, svfloat32_t, ++ z0 = svabd_n_f32_z (p0, z0, -1), ++ z0 = svabd_z (p0, z0, -1)) ++ ++/* ++** abd_m1_f32_z_untied: ++** fmov (z[0-9]+\.s), #-1\.0(?:e\+0)? 
++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fabd z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** fabd z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (abd_m1_f32_z_untied, svfloat32_t, ++ z0 = svabd_n_f32_z (p0, z1, -1), ++ z0 = svabd_z (p0, z1, -1)) ++ ++/* ++** abd_m0p5_f32_z_tied1: ++** fmov (z[0-9]+\.s), #-(?:0\.5|5\.0e-1) ++** movprfx z0\.s, p0/z, z0\.s ++** fabd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_m0p5_f32_z_tied1, svfloat32_t, ++ z0 = svabd_n_f32_z (p0, z0, -0.5), ++ z0 = svabd_z (p0, z0, -0.5)) ++ ++/* ++** abd_m0p5_f32_z_untied: ++** fmov (z[0-9]+\.s), #-(?:0\.5|5\.0e-1) ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fabd z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** fabd z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (abd_m0p5_f32_z_untied, svfloat32_t, ++ z0 = svabd_n_f32_z (p0, z1, -0.5), ++ z0 = svabd_z (p0, z1, -0.5)) ++ ++/* ++** abd_m2_f32_z: ++** fmov (z[0-9]+\.s), #-2\.0(?:e\+0)? ++** movprfx z0\.s, p0/z, z0\.s ++** fabd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_m2_f32_z, svfloat32_t, ++ z0 = svabd_n_f32_z (p0, z0, -2), ++ z0 = svabd_z (p0, z0, -2)) ++ ++/* ++** abd_f32_x_tied1: ++** fabd z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (abd_f32_x_tied1, svfloat32_t, ++ z0 = svabd_f32_x (p0, z0, z1), ++ z0 = svabd_x (p0, z0, z1)) ++ ++/* ++** abd_f32_x_tied2: ++** fabd z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (abd_f32_x_tied2, svfloat32_t, ++ z0 = svabd_f32_x (p0, z1, z0), ++ z0 = svabd_x (p0, z1, z0)) ++ ++/* ++** abd_f32_x_untied: ++** ( ++** movprfx z0, z1 ++** fabd z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0, z2 ++** fabd z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (abd_f32_x_untied, svfloat32_t, ++ z0 = svabd_f32_x (p0, z1, z2), ++ z0 = svabd_x (p0, z1, z2)) ++ ++/* ++** abd_s4_f32_x_tied1: ++** mov (z[0-9]+\.s), s4 ++** fabd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (abd_s4_f32_x_tied1, svfloat32_t, float, ++ z0 = svabd_n_f32_x (p0, z0, d4), ++ z0 = svabd_x (p0, z0, d4)) ++ ++/* ++** abd_s4_f32_x_untied: ++** mov z0\.s, s4 ++** fabd z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZD (abd_s4_f32_x_untied, svfloat32_t, float, ++ z0 = svabd_n_f32_x (p0, z1, d4), ++ z0 = svabd_x (p0, z1, d4)) ++ ++/* ++** abd_1_f32_x_tied1: ++** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? ++** fabd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_f32_x_tied1, svfloat32_t, ++ z0 = svabd_n_f32_x (p0, z0, 1), ++ z0 = svabd_x (p0, z0, 1)) ++ ++/* ++** abd_1_f32_x_untied: ++** fmov z0\.s, #1\.0(?:e\+0)? ++** fabd z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_f32_x_untied, svfloat32_t, ++ z0 = svabd_n_f32_x (p0, z1, 1), ++ z0 = svabd_x (p0, z1, 1)) ++ ++/* ++** abd_0p5_f32_x_tied1: ++** fmov (z[0-9]+\.s), #(?:0\.5|5\.0e-1) ++** fabd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_0p5_f32_x_tied1, svfloat32_t, ++ z0 = svabd_n_f32_x (p0, z0, 0.5), ++ z0 = svabd_x (p0, z0, 0.5)) ++ ++/* ++** abd_0p5_f32_x_untied: ++** fmov z0\.s, #(?:0\.5|5\.0e-1) ++** fabd z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (abd_0p5_f32_x_untied, svfloat32_t, ++ z0 = svabd_n_f32_x (p0, z1, 0.5), ++ z0 = svabd_x (p0, z1, 0.5)) ++ ++/* ++** abd_m1_f32_x_tied1: ++** fmov (z[0-9]+\.s), #-1\.0(?:e\+0)? 
++** fabd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_m1_f32_x_tied1, svfloat32_t, ++ z0 = svabd_n_f32_x (p0, z0, -1), ++ z0 = svabd_x (p0, z0, -1)) ++ ++/* ++** abd_m1_f32_x_untied: ++** fmov z0\.s, #-1\.0(?:e\+0)? ++** fabd z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (abd_m1_f32_x_untied, svfloat32_t, ++ z0 = svabd_n_f32_x (p0, z1, -1), ++ z0 = svabd_x (p0, z1, -1)) ++ ++/* ++** abd_m0p5_f32_x_tied1: ++** fmov (z[0-9]+\.s), #-(?:0\.5|5\.0e-1) ++** fabd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_m0p5_f32_x_tied1, svfloat32_t, ++ z0 = svabd_n_f32_x (p0, z0, -0.5), ++ z0 = svabd_x (p0, z0, -0.5)) ++ ++/* ++** abd_m0p5_f32_x_untied: ++** fmov z0\.s, #-(?:0\.5|5\.0e-1) ++** fabd z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (abd_m0p5_f32_x_untied, svfloat32_t, ++ z0 = svabd_n_f32_x (p0, z1, -0.5), ++ z0 = svabd_x (p0, z1, -0.5)) ++ ++/* ++** abd_2_f32_x_tied1: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** fabd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_2_f32_x_tied1, svfloat32_t, ++ z0 = svabd_n_f32_x (p0, z0, 2), ++ z0 = svabd_x (p0, z0, 2)) ++ ++/* ++** abd_2_f32_x_untied: ++** fmov z0\.s, #2\.0(?:e\+0)? ++** fabd z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (abd_2_f32_x_untied, svfloat32_t, ++ z0 = svabd_n_f32_x (p0, z1, 2), ++ z0 = svabd_x (p0, z1, 2)) ++ ++/* ++** ptrue_abd_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_abd_f32_x_tied1, svfloat32_t, ++ z0 = svabd_f32_x (svptrue_b32 (), z0, z1), ++ z0 = svabd_x (svptrue_b32 (), z0, z1)) ++ ++/* ++** ptrue_abd_f32_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_abd_f32_x_tied2, svfloat32_t, ++ z0 = svabd_f32_x (svptrue_b32 (), z1, z0), ++ z0 = svabd_x (svptrue_b32 (), z1, z0)) ++ ++/* ++** ptrue_abd_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_abd_f32_x_untied, svfloat32_t, ++ z0 = svabd_f32_x (svptrue_b32 (), z1, z2), ++ z0 = svabd_x (svptrue_b32 (), z1, z2)) ++ ++/* ++** ptrue_abd_1_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_abd_1_f32_x_tied1, svfloat32_t, ++ z0 = svabd_n_f32_x (svptrue_b32 (), z0, 1), ++ z0 = svabd_x (svptrue_b32 (), z0, 1)) ++ ++/* ++** ptrue_abd_1_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_abd_1_f32_x_untied, svfloat32_t, ++ z0 = svabd_n_f32_x (svptrue_b32 (), z1, 1), ++ z0 = svabd_x (svptrue_b32 (), z1, 1)) ++ ++/* ++** ptrue_abd_0p5_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_abd_0p5_f32_x_tied1, svfloat32_t, ++ z0 = svabd_n_f32_x (svptrue_b32 (), z0, 0.5), ++ z0 = svabd_x (svptrue_b32 (), z0, 0.5)) ++ ++/* ++** ptrue_abd_0p5_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_abd_0p5_f32_x_untied, svfloat32_t, ++ z0 = svabd_n_f32_x (svptrue_b32 (), z1, 0.5), ++ z0 = svabd_x (svptrue_b32 (), z1, 0.5)) ++ ++/* ++** ptrue_abd_m1_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_abd_m1_f32_x_tied1, svfloat32_t, ++ z0 = svabd_n_f32_x (svptrue_b32 (), z0, -1), ++ z0 = svabd_x (svptrue_b32 (), z0, -1)) ++ ++/* ++** ptrue_abd_m1_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_abd_m1_f32_x_untied, svfloat32_t, ++ z0 = svabd_n_f32_x (svptrue_b32 (), z1, -1), ++ z0 = svabd_x (svptrue_b32 (), z1, -1)) ++ ++/* ++** ptrue_abd_m0p5_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_abd_m0p5_f32_x_tied1, svfloat32_t, ++ z0 = svabd_n_f32_x (svptrue_b32 (), z0, -0.5), ++ z0 = svabd_x (svptrue_b32 (), z0, -0.5)) ++ ++/* ++** ptrue_abd_m0p5_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_abd_m0p5_f32_x_untied, svfloat32_t, ++ z0 = svabd_n_f32_x (svptrue_b32 (), z1, -0.5), ++ z0 = svabd_x (svptrue_b32 (), z1, -0.5)) ++ ++/* ++** ptrue_abd_2_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_abd_2_f32_x_tied1, svfloat32_t, ++ z0 = svabd_n_f32_x (svptrue_b32 (), z0, 2), ++ z0 = svabd_x (svptrue_b32 (), z0, 2)) ++ ++/* ++** ptrue_abd_2_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_abd_2_f32_x_untied, svfloat32_t, ++ z0 = svabd_n_f32_x (svptrue_b32 (), z1, 2), ++ z0 = svabd_x (svptrue_b32 (), z1, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_f64.c +new file mode 100644 +index 000000000..c1e5f14e6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_f64.c +@@ -0,0 +1,552 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** abd_f64_m_tied1: ++** fabd z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (abd_f64_m_tied1, svfloat64_t, ++ z0 = svabd_f64_m (p0, z0, z1), ++ z0 = svabd_m (p0, z0, z1)) ++ ++/* ++** abd_f64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fabd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_f64_m_tied2, svfloat64_t, ++ z0 = svabd_f64_m (p0, z1, z0), ++ z0 = svabd_m (p0, z1, z0)) ++ ++/* ++** abd_f64_m_untied: ++** movprfx z0, z1 ++** fabd z0\.d, p0/m, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (abd_f64_m_untied, svfloat64_t, ++ z0 = svabd_f64_m (p0, z1, z2), ++ z0 = svabd_m (p0, z1, z2)) ++ ++/* ++** abd_d4_f64_m_tied1: ++** mov (z[0-9]+\.d), d4 ++** fabd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (abd_d4_f64_m_tied1, svfloat64_t, double, ++ z0 = svabd_n_f64_m (p0, z0, d4), ++ z0 = svabd_m (p0, z0, d4)) ++ ++/* ++** abd_d4_f64_m_untied: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0, z1 ++** fabd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (abd_d4_f64_m_untied, svfloat64_t, double, ++ z0 = svabd_n_f64_m (p0, z1, d4), ++ z0 = svabd_m (p0, z1, d4)) ++ ++/* ++** abd_1_f64_m_tied1: ++** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? ++** fabd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_f64_m_tied1, svfloat64_t, ++ z0 = svabd_n_f64_m (p0, z0, 1), ++ z0 = svabd_m (p0, z0, 1)) ++ ++/* ++** abd_1_f64_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? 
++** movprfx z0, z1 ++** fabd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_f64_m_untied, svfloat64_t, ++ z0 = svabd_n_f64_m (p0, z1, 1), ++ z0 = svabd_m (p0, z1, 1)) ++ ++/* ++** abd_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fabd z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (abd_f64_z_tied1, svfloat64_t, ++ z0 = svabd_f64_z (p0, z0, z1), ++ z0 = svabd_z (p0, z0, z1)) ++ ++/* ++** abd_f64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** fabd z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (abd_f64_z_tied2, svfloat64_t, ++ z0 = svabd_f64_z (p0, z1, z0), ++ z0 = svabd_z (p0, z1, z0)) ++ ++/* ++** abd_f64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fabd z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** fabd z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (abd_f64_z_untied, svfloat64_t, ++ z0 = svabd_f64_z (p0, z1, z2), ++ z0 = svabd_z (p0, z1, z2)) ++ ++/* ++** abd_d4_f64_z_tied1: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0\.d, p0/z, z0\.d ++** fabd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (abd_d4_f64_z_tied1, svfloat64_t, double, ++ z0 = svabd_n_f64_z (p0, z0, d4), ++ z0 = svabd_z (p0, z0, d4)) ++ ++/* ++** abd_d4_f64_z_untied: ++** mov (z[0-9]+\.d), d4 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fabd z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** fabd z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (abd_d4_f64_z_untied, svfloat64_t, double, ++ z0 = svabd_n_f64_z (p0, z1, d4), ++ z0 = svabd_z (p0, z1, d4)) ++ ++/* ++** abd_1_f64_z_tied1: ++** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? ++** movprfx z0\.d, p0/z, z0\.d ++** fabd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_f64_z_tied1, svfloat64_t, ++ z0 = svabd_n_f64_z (p0, z0, 1), ++ z0 = svabd_z (p0, z0, 1)) ++ ++/* ++** abd_1_f64_z_untied: ++** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fabd z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** fabd z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_f64_z_untied, svfloat64_t, ++ z0 = svabd_n_f64_z (p0, z1, 1), ++ z0 = svabd_z (p0, z1, 1)) ++ ++/* ++** abd_0p5_f64_z_tied1: ++** fmov (z[0-9]+\.d), #(?:0\.5|5\.0e-1) ++** movprfx z0\.d, p0/z, z0\.d ++** fabd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_0p5_f64_z_tied1, svfloat64_t, ++ z0 = svabd_n_f64_z (p0, z0, 0.5), ++ z0 = svabd_z (p0, z0, 0.5)) ++ ++/* ++** abd_0p5_f64_z_untied: ++** fmov (z[0-9]+\.d), #(?:0\.5|5\.0e-1) ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fabd z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** fabd z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (abd_0p5_f64_z_untied, svfloat64_t, ++ z0 = svabd_n_f64_z (p0, z1, 0.5), ++ z0 = svabd_z (p0, z1, 0.5)) ++ ++/* ++** abd_m1_f64_z_tied1: ++** fmov (z[0-9]+\.d), #-1\.0(?:e\+0)? ++** movprfx z0\.d, p0/z, z0\.d ++** fabd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_m1_f64_z_tied1, svfloat64_t, ++ z0 = svabd_n_f64_z (p0, z0, -1), ++ z0 = svabd_z (p0, z0, -1)) ++ ++/* ++** abd_m1_f64_z_untied: ++** fmov (z[0-9]+\.d), #-1\.0(?:e\+0)? 
++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fabd z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** fabd z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (abd_m1_f64_z_untied, svfloat64_t, ++ z0 = svabd_n_f64_z (p0, z1, -1), ++ z0 = svabd_z (p0, z1, -1)) ++ ++/* ++** abd_m0p5_f64_z_tied1: ++** fmov (z[0-9]+\.d), #-(?:0\.5|5\.0e-1) ++** movprfx z0\.d, p0/z, z0\.d ++** fabd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_m0p5_f64_z_tied1, svfloat64_t, ++ z0 = svabd_n_f64_z (p0, z0, -0.5), ++ z0 = svabd_z (p0, z0, -0.5)) ++ ++/* ++** abd_m0p5_f64_z_untied: ++** fmov (z[0-9]+\.d), #-(?:0\.5|5\.0e-1) ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fabd z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** fabd z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (abd_m0p5_f64_z_untied, svfloat64_t, ++ z0 = svabd_n_f64_z (p0, z1, -0.5), ++ z0 = svabd_z (p0, z1, -0.5)) ++ ++/* ++** abd_m2_f64_z: ++** fmov (z[0-9]+\.d), #-2\.0(?:e\+0)? ++** movprfx z0\.d, p0/z, z0\.d ++** fabd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_m2_f64_z, svfloat64_t, ++ z0 = svabd_n_f64_z (p0, z0, -2), ++ z0 = svabd_z (p0, z0, -2)) ++ ++/* ++** abd_f64_x_tied1: ++** fabd z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (abd_f64_x_tied1, svfloat64_t, ++ z0 = svabd_f64_x (p0, z0, z1), ++ z0 = svabd_x (p0, z0, z1)) ++ ++/* ++** abd_f64_x_tied2: ++** fabd z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (abd_f64_x_tied2, svfloat64_t, ++ z0 = svabd_f64_x (p0, z1, z0), ++ z0 = svabd_x (p0, z1, z0)) ++ ++/* ++** abd_f64_x_untied: ++** ( ++** movprfx z0, z1 ++** fabd z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0, z2 ++** fabd z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (abd_f64_x_untied, svfloat64_t, ++ z0 = svabd_f64_x (p0, z1, z2), ++ z0 = svabd_x (p0, z1, z2)) ++ ++/* ++** abd_d4_f64_x_tied1: ++** mov (z[0-9]+\.d), d4 ++** fabd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (abd_d4_f64_x_tied1, svfloat64_t, double, ++ z0 = svabd_n_f64_x (p0, z0, d4), ++ z0 = svabd_x (p0, z0, d4)) ++ ++/* ++** abd_d4_f64_x_untied: ++** mov z0\.d, d4 ++** fabd z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZD (abd_d4_f64_x_untied, svfloat64_t, double, ++ z0 = svabd_n_f64_x (p0, z1, d4), ++ z0 = svabd_x (p0, z1, d4)) ++ ++/* ++** abd_1_f64_x_tied1: ++** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? ++** fabd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_f64_x_tied1, svfloat64_t, ++ z0 = svabd_n_f64_x (p0, z0, 1), ++ z0 = svabd_x (p0, z0, 1)) ++ ++/* ++** abd_1_f64_x_untied: ++** fmov z0\.d, #1\.0(?:e\+0)? ++** fabd z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_f64_x_untied, svfloat64_t, ++ z0 = svabd_n_f64_x (p0, z1, 1), ++ z0 = svabd_x (p0, z1, 1)) ++ ++/* ++** abd_0p5_f64_x_tied1: ++** fmov (z[0-9]+\.d), #(?:0\.5|5\.0e-1) ++** fabd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_0p5_f64_x_tied1, svfloat64_t, ++ z0 = svabd_n_f64_x (p0, z0, 0.5), ++ z0 = svabd_x (p0, z0, 0.5)) ++ ++/* ++** abd_0p5_f64_x_untied: ++** fmov z0\.d, #(?:0\.5|5\.0e-1) ++** fabd z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (abd_0p5_f64_x_untied, svfloat64_t, ++ z0 = svabd_n_f64_x (p0, z1, 0.5), ++ z0 = svabd_x (p0, z1, 0.5)) ++ ++/* ++** abd_m1_f64_x_tied1: ++** fmov (z[0-9]+\.d), #-1\.0(?:e\+0)? 
++** fabd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_m1_f64_x_tied1, svfloat64_t, ++ z0 = svabd_n_f64_x (p0, z0, -1), ++ z0 = svabd_x (p0, z0, -1)) ++ ++/* ++** abd_m1_f64_x_untied: ++** fmov z0\.d, #-1\.0(?:e\+0)? ++** fabd z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (abd_m1_f64_x_untied, svfloat64_t, ++ z0 = svabd_n_f64_x (p0, z1, -1), ++ z0 = svabd_x (p0, z1, -1)) ++ ++/* ++** abd_m0p5_f64_x_tied1: ++** fmov (z[0-9]+\.d), #-(?:0\.5|5\.0e-1) ++** fabd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_m0p5_f64_x_tied1, svfloat64_t, ++ z0 = svabd_n_f64_x (p0, z0, -0.5), ++ z0 = svabd_x (p0, z0, -0.5)) ++ ++/* ++** abd_m0p5_f64_x_untied: ++** fmov z0\.d, #-(?:0\.5|5\.0e-1) ++** fabd z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (abd_m0p5_f64_x_untied, svfloat64_t, ++ z0 = svabd_n_f64_x (p0, z1, -0.5), ++ z0 = svabd_x (p0, z1, -0.5)) ++ ++/* ++** abd_2_f64_x_tied1: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** fabd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_2_f64_x_tied1, svfloat64_t, ++ z0 = svabd_n_f64_x (p0, z0, 2), ++ z0 = svabd_x (p0, z0, 2)) ++ ++/* ++** abd_2_f64_x_untied: ++** fmov z0\.d, #2\.0(?:e\+0)? ++** fabd z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (abd_2_f64_x_untied, svfloat64_t, ++ z0 = svabd_n_f64_x (p0, z1, 2), ++ z0 = svabd_x (p0, z1, 2)) ++ ++/* ++** ptrue_abd_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_abd_f64_x_tied1, svfloat64_t, ++ z0 = svabd_f64_x (svptrue_b64 (), z0, z1), ++ z0 = svabd_x (svptrue_b64 (), z0, z1)) ++ ++/* ++** ptrue_abd_f64_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_abd_f64_x_tied2, svfloat64_t, ++ z0 = svabd_f64_x (svptrue_b64 (), z1, z0), ++ z0 = svabd_x (svptrue_b64 (), z1, z0)) ++ ++/* ++** ptrue_abd_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_abd_f64_x_untied, svfloat64_t, ++ z0 = svabd_f64_x (svptrue_b64 (), z1, z2), ++ z0 = svabd_x (svptrue_b64 (), z1, z2)) ++ ++/* ++** ptrue_abd_1_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_abd_1_f64_x_tied1, svfloat64_t, ++ z0 = svabd_n_f64_x (svptrue_b64 (), z0, 1), ++ z0 = svabd_x (svptrue_b64 (), z0, 1)) ++ ++/* ++** ptrue_abd_1_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_abd_1_f64_x_untied, svfloat64_t, ++ z0 = svabd_n_f64_x (svptrue_b64 (), z1, 1), ++ z0 = svabd_x (svptrue_b64 (), z1, 1)) ++ ++/* ++** ptrue_abd_0p5_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_abd_0p5_f64_x_tied1, svfloat64_t, ++ z0 = svabd_n_f64_x (svptrue_b64 (), z0, 0.5), ++ z0 = svabd_x (svptrue_b64 (), z0, 0.5)) ++ ++/* ++** ptrue_abd_0p5_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_abd_0p5_f64_x_untied, svfloat64_t, ++ z0 = svabd_n_f64_x (svptrue_b64 (), z1, 0.5), ++ z0 = svabd_x (svptrue_b64 (), z1, 0.5)) ++ ++/* ++** ptrue_abd_m1_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_abd_m1_f64_x_tied1, svfloat64_t, ++ z0 = svabd_n_f64_x (svptrue_b64 (), z0, -1), ++ z0 = svabd_x (svptrue_b64 (), z0, -1)) ++ ++/* ++** ptrue_abd_m1_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_abd_m1_f64_x_untied, svfloat64_t, ++ z0 = svabd_n_f64_x (svptrue_b64 (), z1, -1), ++ z0 = svabd_x (svptrue_b64 (), z1, -1)) ++ ++/* ++** ptrue_abd_m0p5_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_abd_m0p5_f64_x_tied1, svfloat64_t, ++ z0 = svabd_n_f64_x (svptrue_b64 (), z0, -0.5), ++ z0 = svabd_x (svptrue_b64 (), z0, -0.5)) ++ ++/* ++** ptrue_abd_m0p5_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_abd_m0p5_f64_x_untied, svfloat64_t, ++ z0 = svabd_n_f64_x (svptrue_b64 (), z1, -0.5), ++ z0 = svabd_x (svptrue_b64 (), z1, -0.5)) ++ ++/* ++** ptrue_abd_2_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_abd_2_f64_x_tied1, svfloat64_t, ++ z0 = svabd_n_f64_x (svptrue_b64 (), z0, 2), ++ z0 = svabd_x (svptrue_b64 (), z0, 2)) ++ ++/* ++** ptrue_abd_2_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_abd_2_f64_x_untied, svfloat64_t, ++ z0 = svabd_n_f64_x (svptrue_b64 (), z1, 2), ++ z0 = svabd_x (svptrue_b64 (), z1, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_s16.c +new file mode 100644 +index 000000000..e2d0c0fb7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_s16.c +@@ -0,0 +1,237 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** abd_s16_m_tied1: ++** sabd z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (abd_s16_m_tied1, svint16_t, ++ z0 = svabd_s16_m (p0, z0, z1), ++ z0 = svabd_m (p0, z0, z1)) ++ ++/* ++** abd_s16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** sabd z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (abd_s16_m_tied2, svint16_t, ++ z0 = svabd_s16_m (p0, z1, z0), ++ z0 = svabd_m (p0, z1, z0)) ++ ++/* ++** abd_s16_m_untied: ++** movprfx z0, z1 ++** sabd z0\.h, p0/m, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (abd_s16_m_untied, svint16_t, ++ z0 = svabd_s16_m (p0, z1, z2), ++ z0 = svabd_m (p0, z1, z2)) ++ ++/* ++** abd_w0_s16_m_tied1: ++** mov (z[0-9]+\.h), w0 ++** sabd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (abd_w0_s16_m_tied1, svint16_t, int16_t, ++ z0 = svabd_n_s16_m (p0, z0, x0), ++ z0 = svabd_m (p0, z0, x0)) ++ ++/* ++** abd_w0_s16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), w0 ++** movprfx z0, z1 ++** sabd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (abd_w0_s16_m_untied, svint16_t, int16_t, ++ z0 = svabd_n_s16_m (p0, z1, x0), ++ z0 = svabd_m (p0, z1, x0)) ++ ++/* ++** abd_1_s16_m_tied1: ++** mov (z[0-9]+\.h), #1 ++** sabd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_s16_m_tied1, svint16_t, ++ z0 = svabd_n_s16_m (p0, z0, 1), ++ z0 = svabd_m (p0, z0, 1)) ++ ++/* ++** abd_1_s16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), #1 ++** movprfx z0, z1 ++** sabd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_s16_m_untied, svint16_t, ++ z0 = svabd_n_s16_m (p0, z1, 1), ++ z0 = svabd_m (p0, z1, 1)) ++ ++/* ++** abd_s16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** sabd z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (abd_s16_z_tied1, svint16_t, ++ z0 = svabd_s16_z (p0, z0, z1), ++ z0 = svabd_z (p0, z0, z1)) ++ ++/* ++** abd_s16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** sabd z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (abd_s16_z_tied2, svint16_t, ++ z0 = svabd_s16_z (p0, 
z1, z0), ++ z0 = svabd_z (p0, z1, z0)) ++ ++/* ++** abd_s16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** sabd z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** sabd z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (abd_s16_z_untied, svint16_t, ++ z0 = svabd_s16_z (p0, z1, z2), ++ z0 = svabd_z (p0, z1, z2)) ++ ++/* ++** abd_w0_s16_z_tied1: ++** mov (z[0-9]+\.h), w0 ++** movprfx z0\.h, p0/z, z0\.h ++** sabd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (abd_w0_s16_z_tied1, svint16_t, int16_t, ++ z0 = svabd_n_s16_z (p0, z0, x0), ++ z0 = svabd_z (p0, z0, x0)) ++ ++/* ++** abd_w0_s16_z_untied: ++** mov (z[0-9]+\.h), w0 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** sabd z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** sabd z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (abd_w0_s16_z_untied, svint16_t, int16_t, ++ z0 = svabd_n_s16_z (p0, z1, x0), ++ z0 = svabd_z (p0, z1, x0)) ++ ++/* ++** abd_1_s16_z_tied1: ++** mov (z[0-9]+\.h), #1 ++** movprfx z0\.h, p0/z, z0\.h ++** sabd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_s16_z_tied1, svint16_t, ++ z0 = svabd_n_s16_z (p0, z0, 1), ++ z0 = svabd_z (p0, z0, 1)) ++ ++/* ++** abd_1_s16_z_untied: ++** mov (z[0-9]+\.h), #1 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** sabd z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** sabd z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_s16_z_untied, svint16_t, ++ z0 = svabd_n_s16_z (p0, z1, 1), ++ z0 = svabd_z (p0, z1, 1)) ++ ++/* ++** abd_s16_x_tied1: ++** sabd z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (abd_s16_x_tied1, svint16_t, ++ z0 = svabd_s16_x (p0, z0, z1), ++ z0 = svabd_x (p0, z0, z1)) ++ ++/* ++** abd_s16_x_tied2: ++** sabd z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (abd_s16_x_tied2, svint16_t, ++ z0 = svabd_s16_x (p0, z1, z0), ++ z0 = svabd_x (p0, z1, z0)) ++ ++/* ++** abd_s16_x_untied: ++** ( ++** movprfx z0, z1 ++** sabd z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0, z2 ++** sabd z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (abd_s16_x_untied, svint16_t, ++ z0 = svabd_s16_x (p0, z1, z2), ++ z0 = svabd_x (p0, z1, z2)) ++ ++/* ++** abd_w0_s16_x_tied1: ++** mov (z[0-9]+\.h), w0 ++** sabd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (abd_w0_s16_x_tied1, svint16_t, int16_t, ++ z0 = svabd_n_s16_x (p0, z0, x0), ++ z0 = svabd_x (p0, z0, x0)) ++ ++/* ++** abd_w0_s16_x_untied: ++** mov z0\.h, w0 ++** sabd z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZX (abd_w0_s16_x_untied, svint16_t, int16_t, ++ z0 = svabd_n_s16_x (p0, z1, x0), ++ z0 = svabd_x (p0, z1, x0)) ++ ++/* ++** abd_1_s16_x_tied1: ++** mov (z[0-9]+\.h), #1 ++** sabd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_s16_x_tied1, svint16_t, ++ z0 = svabd_n_s16_x (p0, z0, 1), ++ z0 = svabd_x (p0, z0, 1)) ++ ++/* ++** abd_1_s16_x_untied: ++** mov z0\.h, #1 ++** sabd z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_s16_x_untied, svint16_t, ++ z0 = svabd_n_s16_x (p0, z1, 1), ++ z0 = svabd_x (p0, z1, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_s32.c +new file mode 100644 +index 000000000..5c95ec04d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_s32.c +@@ -0,0 +1,237 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** abd_s32_m_tied1: ++** sabd z0\.s, p0/m, z0\.s, z1\.s 
++** ret ++*/ ++TEST_UNIFORM_Z (abd_s32_m_tied1, svint32_t, ++ z0 = svabd_s32_m (p0, z0, z1), ++ z0 = svabd_m (p0, z0, z1)) ++ ++/* ++** abd_s32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** sabd z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (abd_s32_m_tied2, svint32_t, ++ z0 = svabd_s32_m (p0, z1, z0), ++ z0 = svabd_m (p0, z1, z0)) ++ ++/* ++** abd_s32_m_untied: ++** movprfx z0, z1 ++** sabd z0\.s, p0/m, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (abd_s32_m_untied, svint32_t, ++ z0 = svabd_s32_m (p0, z1, z2), ++ z0 = svabd_m (p0, z1, z2)) ++ ++/* ++** abd_w0_s32_m_tied1: ++** mov (z[0-9]+\.s), w0 ++** sabd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (abd_w0_s32_m_tied1, svint32_t, int32_t, ++ z0 = svabd_n_s32_m (p0, z0, x0), ++ z0 = svabd_m (p0, z0, x0)) ++ ++/* ++** abd_w0_s32_m_untied: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0, z1 ++** sabd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (abd_w0_s32_m_untied, svint32_t, int32_t, ++ z0 = svabd_n_s32_m (p0, z1, x0), ++ z0 = svabd_m (p0, z1, x0)) ++ ++/* ++** abd_1_s32_m_tied1: ++** mov (z[0-9]+\.s), #1 ++** sabd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_s32_m_tied1, svint32_t, ++ z0 = svabd_n_s32_m (p0, z0, 1), ++ z0 = svabd_m (p0, z0, 1)) ++ ++/* ++** abd_1_s32_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.s), #1 ++** movprfx z0, z1 ++** sabd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_s32_m_untied, svint32_t, ++ z0 = svabd_n_s32_m (p0, z1, 1), ++ z0 = svabd_m (p0, z1, 1)) ++ ++/* ++** abd_s32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** sabd z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (abd_s32_z_tied1, svint32_t, ++ z0 = svabd_s32_z (p0, z0, z1), ++ z0 = svabd_z (p0, z0, z1)) ++ ++/* ++** abd_s32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** sabd z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (abd_s32_z_tied2, svint32_t, ++ z0 = svabd_s32_z (p0, z1, z0), ++ z0 = svabd_z (p0, z1, z0)) ++ ++/* ++** abd_s32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** sabd z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** sabd z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (abd_s32_z_untied, svint32_t, ++ z0 = svabd_s32_z (p0, z1, z2), ++ z0 = svabd_z (p0, z1, z2)) ++ ++/* ++** abd_w0_s32_z_tied1: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0\.s, p0/z, z0\.s ++** sabd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (abd_w0_s32_z_tied1, svint32_t, int32_t, ++ z0 = svabd_n_s32_z (p0, z0, x0), ++ z0 = svabd_z (p0, z0, x0)) ++ ++/* ++** abd_w0_s32_z_untied: ++** mov (z[0-9]+\.s), w0 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** sabd z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** sabd z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (abd_w0_s32_z_untied, svint32_t, int32_t, ++ z0 = svabd_n_s32_z (p0, z1, x0), ++ z0 = svabd_z (p0, z1, x0)) ++ ++/* ++** abd_1_s32_z_tied1: ++** mov (z[0-9]+\.s), #1 ++** movprfx z0\.s, p0/z, z0\.s ++** sabd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_s32_z_tied1, svint32_t, ++ z0 = svabd_n_s32_z (p0, z0, 1), ++ z0 = svabd_z (p0, z0, 1)) ++ ++/* ++** abd_1_s32_z_untied: ++** mov (z[0-9]+\.s), #1 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** sabd z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** sabd z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_s32_z_untied, svint32_t, ++ z0 = svabd_n_s32_z (p0, z1, 1), ++ z0 = svabd_z (p0, z1, 1)) ++ ++/* ++** abd_s32_x_tied1: ++** sabd z0\.s, p0/m, z0\.s, z1\.s 
++** ret ++*/ ++TEST_UNIFORM_Z (abd_s32_x_tied1, svint32_t, ++ z0 = svabd_s32_x (p0, z0, z1), ++ z0 = svabd_x (p0, z0, z1)) ++ ++/* ++** abd_s32_x_tied2: ++** sabd z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (abd_s32_x_tied2, svint32_t, ++ z0 = svabd_s32_x (p0, z1, z0), ++ z0 = svabd_x (p0, z1, z0)) ++ ++/* ++** abd_s32_x_untied: ++** ( ++** movprfx z0, z1 ++** sabd z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0, z2 ++** sabd z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (abd_s32_x_untied, svint32_t, ++ z0 = svabd_s32_x (p0, z1, z2), ++ z0 = svabd_x (p0, z1, z2)) ++ ++/* ++** abd_w0_s32_x_tied1: ++** mov (z[0-9]+\.s), w0 ++** sabd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (abd_w0_s32_x_tied1, svint32_t, int32_t, ++ z0 = svabd_n_s32_x (p0, z0, x0), ++ z0 = svabd_x (p0, z0, x0)) ++ ++/* ++** abd_w0_s32_x_untied: ++** mov z0\.s, w0 ++** sabd z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZX (abd_w0_s32_x_untied, svint32_t, int32_t, ++ z0 = svabd_n_s32_x (p0, z1, x0), ++ z0 = svabd_x (p0, z1, x0)) ++ ++/* ++** abd_1_s32_x_tied1: ++** mov (z[0-9]+\.s), #1 ++** sabd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_s32_x_tied1, svint32_t, ++ z0 = svabd_n_s32_x (p0, z0, 1), ++ z0 = svabd_x (p0, z0, 1)) ++ ++/* ++** abd_1_s32_x_untied: ++** mov z0\.s, #1 ++** sabd z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_s32_x_untied, svint32_t, ++ z0 = svabd_n_s32_x (p0, z1, 1), ++ z0 = svabd_x (p0, z1, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_s64.c +new file mode 100644 +index 000000000..2402ecf29 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_s64.c +@@ -0,0 +1,237 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** abd_s64_m_tied1: ++** sabd z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (abd_s64_m_tied1, svint64_t, ++ z0 = svabd_s64_m (p0, z0, z1), ++ z0 = svabd_m (p0, z0, z1)) ++ ++/* ++** abd_s64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** sabd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_s64_m_tied2, svint64_t, ++ z0 = svabd_s64_m (p0, z1, z0), ++ z0 = svabd_m (p0, z1, z0)) ++ ++/* ++** abd_s64_m_untied: ++** movprfx z0, z1 ++** sabd z0\.d, p0/m, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (abd_s64_m_untied, svint64_t, ++ z0 = svabd_s64_m (p0, z1, z2), ++ z0 = svabd_m (p0, z1, z2)) ++ ++/* ++** abd_x0_s64_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** sabd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (abd_x0_s64_m_tied1, svint64_t, int64_t, ++ z0 = svabd_n_s64_m (p0, z0, x0), ++ z0 = svabd_m (p0, z0, x0)) ++ ++/* ++** abd_x0_s64_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** sabd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (abd_x0_s64_m_untied, svint64_t, int64_t, ++ z0 = svabd_n_s64_m (p0, z1, x0), ++ z0 = svabd_m (p0, z1, x0)) ++ ++/* ++** abd_1_s64_m_tied1: ++** mov (z[0-9]+\.d), #1 ++** sabd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_s64_m_tied1, svint64_t, ++ z0 = svabd_n_s64_m (p0, z0, 1), ++ z0 = svabd_m (p0, z0, 1)) ++ ++/* ++** abd_1_s64_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #1 ++** movprfx z0, z1 ++** sabd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_s64_m_untied, svint64_t, ++ z0 = svabd_n_s64_m (p0, z1, 1), ++ z0 = svabd_m (p0, z1, 1)) ++ ++/* ++** abd_s64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** sabd z0\.d, p0/m, 
z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (abd_s64_z_tied1, svint64_t, ++ z0 = svabd_s64_z (p0, z0, z1), ++ z0 = svabd_z (p0, z0, z1)) ++ ++/* ++** abd_s64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** sabd z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (abd_s64_z_tied2, svint64_t, ++ z0 = svabd_s64_z (p0, z1, z0), ++ z0 = svabd_z (p0, z1, z0)) ++ ++/* ++** abd_s64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** sabd z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** sabd z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (abd_s64_z_untied, svint64_t, ++ z0 = svabd_s64_z (p0, z1, z2), ++ z0 = svabd_z (p0, z1, z2)) ++ ++/* ++** abd_x0_s64_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z0\.d ++** sabd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (abd_x0_s64_z_tied1, svint64_t, int64_t, ++ z0 = svabd_n_s64_z (p0, z0, x0), ++ z0 = svabd_z (p0, z0, x0)) ++ ++/* ++** abd_x0_s64_z_untied: ++** mov (z[0-9]+\.d), x0 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** sabd z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** sabd z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (abd_x0_s64_z_untied, svint64_t, int64_t, ++ z0 = svabd_n_s64_z (p0, z1, x0), ++ z0 = svabd_z (p0, z1, x0)) ++ ++/* ++** abd_1_s64_z_tied1: ++** mov (z[0-9]+\.d), #1 ++** movprfx z0\.d, p0/z, z0\.d ++** sabd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_s64_z_tied1, svint64_t, ++ z0 = svabd_n_s64_z (p0, z0, 1), ++ z0 = svabd_z (p0, z0, 1)) ++ ++/* ++** abd_1_s64_z_untied: ++** mov (z[0-9]+\.d), #1 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** sabd z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** sabd z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_s64_z_untied, svint64_t, ++ z0 = svabd_n_s64_z (p0, z1, 1), ++ z0 = svabd_z (p0, z1, 1)) ++ ++/* ++** abd_s64_x_tied1: ++** sabd z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (abd_s64_x_tied1, svint64_t, ++ z0 = svabd_s64_x (p0, z0, z1), ++ z0 = svabd_x (p0, z0, z1)) ++ ++/* ++** abd_s64_x_tied2: ++** sabd z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (abd_s64_x_tied2, svint64_t, ++ z0 = svabd_s64_x (p0, z1, z0), ++ z0 = svabd_x (p0, z1, z0)) ++ ++/* ++** abd_s64_x_untied: ++** ( ++** movprfx z0, z1 ++** sabd z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0, z2 ++** sabd z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (abd_s64_x_untied, svint64_t, ++ z0 = svabd_s64_x (p0, z1, z2), ++ z0 = svabd_x (p0, z1, z2)) ++ ++/* ++** abd_x0_s64_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** sabd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (abd_x0_s64_x_tied1, svint64_t, int64_t, ++ z0 = svabd_n_s64_x (p0, z0, x0), ++ z0 = svabd_x (p0, z0, x0)) ++ ++/* ++** abd_x0_s64_x_untied: ++** mov z0\.d, x0 ++** sabd z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (abd_x0_s64_x_untied, svint64_t, int64_t, ++ z0 = svabd_n_s64_x (p0, z1, x0), ++ z0 = svabd_x (p0, z1, x0)) ++ ++/* ++** abd_1_s64_x_tied1: ++** mov (z[0-9]+\.d), #1 ++** sabd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_s64_x_tied1, svint64_t, ++ z0 = svabd_n_s64_x (p0, z0, 1), ++ z0 = svabd_x (p0, z0, 1)) ++ ++/* ++** abd_1_s64_x_untied: ++** mov z0\.d, #1 ++** sabd z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_s64_x_untied, svint64_t, ++ z0 = svabd_n_s64_x (p0, z1, 1), ++ z0 = svabd_x (p0, z1, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_s8.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_s8.c +new file mode 100644 +index 000000000..49a2cc388 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_s8.c +@@ -0,0 +1,237 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** abd_s8_m_tied1: ++** sabd z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (abd_s8_m_tied1, svint8_t, ++ z0 = svabd_s8_m (p0, z0, z1), ++ z0 = svabd_m (p0, z0, z1)) ++ ++/* ++** abd_s8_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** sabd z0\.b, p0/m, z0\.b, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (abd_s8_m_tied2, svint8_t, ++ z0 = svabd_s8_m (p0, z1, z0), ++ z0 = svabd_m (p0, z1, z0)) ++ ++/* ++** abd_s8_m_untied: ++** movprfx z0, z1 ++** sabd z0\.b, p0/m, z0\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (abd_s8_m_untied, svint8_t, ++ z0 = svabd_s8_m (p0, z1, z2), ++ z0 = svabd_m (p0, z1, z2)) ++ ++/* ++** abd_w0_s8_m_tied1: ++** mov (z[0-9]+\.b), w0 ++** sabd z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (abd_w0_s8_m_tied1, svint8_t, int8_t, ++ z0 = svabd_n_s8_m (p0, z0, x0), ++ z0 = svabd_m (p0, z0, x0)) ++ ++/* ++** abd_w0_s8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), w0 ++** movprfx z0, z1 ++** sabd z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (abd_w0_s8_m_untied, svint8_t, int8_t, ++ z0 = svabd_n_s8_m (p0, z1, x0), ++ z0 = svabd_m (p0, z1, x0)) ++ ++/* ++** abd_1_s8_m_tied1: ++** mov (z[0-9]+\.b), #1 ++** sabd z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_s8_m_tied1, svint8_t, ++ z0 = svabd_n_s8_m (p0, z0, 1), ++ z0 = svabd_m (p0, z0, 1)) ++ ++/* ++** abd_1_s8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), #1 ++** movprfx z0, z1 ++** sabd z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_s8_m_untied, svint8_t, ++ z0 = svabd_n_s8_m (p0, z1, 1), ++ z0 = svabd_m (p0, z1, 1)) ++ ++/* ++** abd_s8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** sabd z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (abd_s8_z_tied1, svint8_t, ++ z0 = svabd_s8_z (p0, z0, z1), ++ z0 = svabd_z (p0, z0, z1)) ++ ++/* ++** abd_s8_z_tied2: ++** movprfx z0\.b, p0/z, z0\.b ++** sabd z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (abd_s8_z_tied2, svint8_t, ++ z0 = svabd_s8_z (p0, z1, z0), ++ z0 = svabd_z (p0, z1, z0)) ++ ++/* ++** abd_s8_z_untied: ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** sabd z0\.b, p0/m, z0\.b, z2\.b ++** | ++** movprfx z0\.b, p0/z, z2\.b ++** sabd z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (abd_s8_z_untied, svint8_t, ++ z0 = svabd_s8_z (p0, z1, z2), ++ z0 = svabd_z (p0, z1, z2)) ++ ++/* ++** abd_w0_s8_z_tied1: ++** mov (z[0-9]+\.b), w0 ++** movprfx z0\.b, p0/z, z0\.b ++** sabd z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (abd_w0_s8_z_tied1, svint8_t, int8_t, ++ z0 = svabd_n_s8_z (p0, z0, x0), ++ z0 = svabd_z (p0, z0, x0)) ++ ++/* ++** abd_w0_s8_z_untied: ++** mov (z[0-9]+\.b), w0 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** sabd z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** sabd z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (abd_w0_s8_z_untied, svint8_t, int8_t, ++ z0 = svabd_n_s8_z (p0, z1, x0), ++ z0 = svabd_z (p0, z1, x0)) ++ ++/* ++** abd_1_s8_z_tied1: ++** mov (z[0-9]+\.b), #1 ++** movprfx z0\.b, p0/z, z0\.b ++** sabd z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_s8_z_tied1, svint8_t, ++ z0 = svabd_n_s8_z (p0, z0, 1), ++ z0 = svabd_z (p0, z0, 1)) ++ ++/* ++** abd_1_s8_z_untied: ++** mov (z[0-9]+\.b), #1 ++** ( ++** 
movprfx z0\.b, p0/z, z1\.b ++** sabd z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** sabd z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_s8_z_untied, svint8_t, ++ z0 = svabd_n_s8_z (p0, z1, 1), ++ z0 = svabd_z (p0, z1, 1)) ++ ++/* ++** abd_s8_x_tied1: ++** sabd z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (abd_s8_x_tied1, svint8_t, ++ z0 = svabd_s8_x (p0, z0, z1), ++ z0 = svabd_x (p0, z0, z1)) ++ ++/* ++** abd_s8_x_tied2: ++** sabd z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (abd_s8_x_tied2, svint8_t, ++ z0 = svabd_s8_x (p0, z1, z0), ++ z0 = svabd_x (p0, z1, z0)) ++ ++/* ++** abd_s8_x_untied: ++** ( ++** movprfx z0, z1 ++** sabd z0\.b, p0/m, z0\.b, z2\.b ++** | ++** movprfx z0, z2 ++** sabd z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (abd_s8_x_untied, svint8_t, ++ z0 = svabd_s8_x (p0, z1, z2), ++ z0 = svabd_x (p0, z1, z2)) ++ ++/* ++** abd_w0_s8_x_tied1: ++** mov (z[0-9]+\.b), w0 ++** sabd z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (abd_w0_s8_x_tied1, svint8_t, int8_t, ++ z0 = svabd_n_s8_x (p0, z0, x0), ++ z0 = svabd_x (p0, z0, x0)) ++ ++/* ++** abd_w0_s8_x_untied: ++** mov z0\.b, w0 ++** sabd z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_ZX (abd_w0_s8_x_untied, svint8_t, int8_t, ++ z0 = svabd_n_s8_x (p0, z1, x0), ++ z0 = svabd_x (p0, z1, x0)) ++ ++/* ++** abd_1_s8_x_tied1: ++** mov (z[0-9]+\.b), #1 ++** sabd z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_s8_x_tied1, svint8_t, ++ z0 = svabd_n_s8_x (p0, z0, 1), ++ z0 = svabd_x (p0, z0, 1)) ++ ++/* ++** abd_1_s8_x_untied: ++** mov z0\.b, #1 ++** sabd z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_s8_x_untied, svint8_t, ++ z0 = svabd_n_s8_x (p0, z1, 1), ++ z0 = svabd_x (p0, z1, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_u16.c +new file mode 100644 +index 000000000..60aa9429e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_u16.c +@@ -0,0 +1,237 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** abd_u16_m_tied1: ++** uabd z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (abd_u16_m_tied1, svuint16_t, ++ z0 = svabd_u16_m (p0, z0, z1), ++ z0 = svabd_m (p0, z0, z1)) ++ ++/* ++** abd_u16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** uabd z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (abd_u16_m_tied2, svuint16_t, ++ z0 = svabd_u16_m (p0, z1, z0), ++ z0 = svabd_m (p0, z1, z0)) ++ ++/* ++** abd_u16_m_untied: ++** movprfx z0, z1 ++** uabd z0\.h, p0/m, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (abd_u16_m_untied, svuint16_t, ++ z0 = svabd_u16_m (p0, z1, z2), ++ z0 = svabd_m (p0, z1, z2)) ++ ++/* ++** abd_w0_u16_m_tied1: ++** mov (z[0-9]+\.h), w0 ++** uabd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (abd_w0_u16_m_tied1, svuint16_t, uint16_t, ++ z0 = svabd_n_u16_m (p0, z0, x0), ++ z0 = svabd_m (p0, z0, x0)) ++ ++/* ++** abd_w0_u16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), w0 ++** movprfx z0, z1 ++** uabd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (abd_w0_u16_m_untied, svuint16_t, uint16_t, ++ z0 = svabd_n_u16_m (p0, z1, x0), ++ z0 = svabd_m (p0, z1, x0)) ++ ++/* ++** abd_1_u16_m_tied1: ++** mov (z[0-9]+\.h), #1 ++** uabd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_u16_m_tied1, svuint16_t, ++ z0 = svabd_n_u16_m (p0, z0, 1), ++ z0 = svabd_m (p0, z0, 1)) ++ ++/* ++** 
abd_1_u16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), #1 ++** movprfx z0, z1 ++** uabd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_u16_m_untied, svuint16_t, ++ z0 = svabd_n_u16_m (p0, z1, 1), ++ z0 = svabd_m (p0, z1, 1)) ++ ++/* ++** abd_u16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** uabd z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (abd_u16_z_tied1, svuint16_t, ++ z0 = svabd_u16_z (p0, z0, z1), ++ z0 = svabd_z (p0, z0, z1)) ++ ++/* ++** abd_u16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** uabd z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (abd_u16_z_tied2, svuint16_t, ++ z0 = svabd_u16_z (p0, z1, z0), ++ z0 = svabd_z (p0, z1, z0)) ++ ++/* ++** abd_u16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** uabd z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** uabd z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (abd_u16_z_untied, svuint16_t, ++ z0 = svabd_u16_z (p0, z1, z2), ++ z0 = svabd_z (p0, z1, z2)) ++ ++/* ++** abd_w0_u16_z_tied1: ++** mov (z[0-9]+\.h), w0 ++** movprfx z0\.h, p0/z, z0\.h ++** uabd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (abd_w0_u16_z_tied1, svuint16_t, uint16_t, ++ z0 = svabd_n_u16_z (p0, z0, x0), ++ z0 = svabd_z (p0, z0, x0)) ++ ++/* ++** abd_w0_u16_z_untied: ++** mov (z[0-9]+\.h), w0 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** uabd z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** uabd z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (abd_w0_u16_z_untied, svuint16_t, uint16_t, ++ z0 = svabd_n_u16_z (p0, z1, x0), ++ z0 = svabd_z (p0, z1, x0)) ++ ++/* ++** abd_1_u16_z_tied1: ++** mov (z[0-9]+\.h), #1 ++** movprfx z0\.h, p0/z, z0\.h ++** uabd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_u16_z_tied1, svuint16_t, ++ z0 = svabd_n_u16_z (p0, z0, 1), ++ z0 = svabd_z (p0, z0, 1)) ++ ++/* ++** abd_1_u16_z_untied: ++** mov (z[0-9]+\.h), #1 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** uabd z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** uabd z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_u16_z_untied, svuint16_t, ++ z0 = svabd_n_u16_z (p0, z1, 1), ++ z0 = svabd_z (p0, z1, 1)) ++ ++/* ++** abd_u16_x_tied1: ++** uabd z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (abd_u16_x_tied1, svuint16_t, ++ z0 = svabd_u16_x (p0, z0, z1), ++ z0 = svabd_x (p0, z0, z1)) ++ ++/* ++** abd_u16_x_tied2: ++** uabd z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (abd_u16_x_tied2, svuint16_t, ++ z0 = svabd_u16_x (p0, z1, z0), ++ z0 = svabd_x (p0, z1, z0)) ++ ++/* ++** abd_u16_x_untied: ++** ( ++** movprfx z0, z1 ++** uabd z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0, z2 ++** uabd z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (abd_u16_x_untied, svuint16_t, ++ z0 = svabd_u16_x (p0, z1, z2), ++ z0 = svabd_x (p0, z1, z2)) ++ ++/* ++** abd_w0_u16_x_tied1: ++** mov (z[0-9]+\.h), w0 ++** uabd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (abd_w0_u16_x_tied1, svuint16_t, uint16_t, ++ z0 = svabd_n_u16_x (p0, z0, x0), ++ z0 = svabd_x (p0, z0, x0)) ++ ++/* ++** abd_w0_u16_x_untied: ++** mov z0\.h, w0 ++** uabd z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZX (abd_w0_u16_x_untied, svuint16_t, uint16_t, ++ z0 = svabd_n_u16_x (p0, z1, x0), ++ z0 = svabd_x (p0, z1, x0)) ++ ++/* ++** abd_1_u16_x_tied1: ++** mov (z[0-9]+\.h), #1 ++** uabd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_u16_x_tied1, svuint16_t, ++ z0 = svabd_n_u16_x (p0, z0, 1), ++ z0 = svabd_x (p0, 
z0, 1)) ++ ++/* ++** abd_1_u16_x_untied: ++** mov z0\.h, #1 ++** uabd z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_u16_x_untied, svuint16_t, ++ z0 = svabd_n_u16_x (p0, z1, 1), ++ z0 = svabd_x (p0, z1, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_u32.c +new file mode 100644 +index 000000000..bc2410783 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_u32.c +@@ -0,0 +1,237 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** abd_u32_m_tied1: ++** uabd z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (abd_u32_m_tied1, svuint32_t, ++ z0 = svabd_u32_m (p0, z0, z1), ++ z0 = svabd_m (p0, z0, z1)) ++ ++/* ++** abd_u32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** uabd z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (abd_u32_m_tied2, svuint32_t, ++ z0 = svabd_u32_m (p0, z1, z0), ++ z0 = svabd_m (p0, z1, z0)) ++ ++/* ++** abd_u32_m_untied: ++** movprfx z0, z1 ++** uabd z0\.s, p0/m, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (abd_u32_m_untied, svuint32_t, ++ z0 = svabd_u32_m (p0, z1, z2), ++ z0 = svabd_m (p0, z1, z2)) ++ ++/* ++** abd_w0_u32_m_tied1: ++** mov (z[0-9]+\.s), w0 ++** uabd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (abd_w0_u32_m_tied1, svuint32_t, uint32_t, ++ z0 = svabd_n_u32_m (p0, z0, x0), ++ z0 = svabd_m (p0, z0, x0)) ++ ++/* ++** abd_w0_u32_m_untied: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0, z1 ++** uabd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (abd_w0_u32_m_untied, svuint32_t, uint32_t, ++ z0 = svabd_n_u32_m (p0, z1, x0), ++ z0 = svabd_m (p0, z1, x0)) ++ ++/* ++** abd_1_u32_m_tied1: ++** mov (z[0-9]+\.s), #1 ++** uabd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_u32_m_tied1, svuint32_t, ++ z0 = svabd_n_u32_m (p0, z0, 1), ++ z0 = svabd_m (p0, z0, 1)) ++ ++/* ++** abd_1_u32_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.s), #1 ++** movprfx z0, z1 ++** uabd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_u32_m_untied, svuint32_t, ++ z0 = svabd_n_u32_m (p0, z1, 1), ++ z0 = svabd_m (p0, z1, 1)) ++ ++/* ++** abd_u32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** uabd z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (abd_u32_z_tied1, svuint32_t, ++ z0 = svabd_u32_z (p0, z0, z1), ++ z0 = svabd_z (p0, z0, z1)) ++ ++/* ++** abd_u32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** uabd z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (abd_u32_z_tied2, svuint32_t, ++ z0 = svabd_u32_z (p0, z1, z0), ++ z0 = svabd_z (p0, z1, z0)) ++ ++/* ++** abd_u32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** uabd z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** uabd z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (abd_u32_z_untied, svuint32_t, ++ z0 = svabd_u32_z (p0, z1, z2), ++ z0 = svabd_z (p0, z1, z2)) ++ ++/* ++** abd_w0_u32_z_tied1: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0\.s, p0/z, z0\.s ++** uabd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (abd_w0_u32_z_tied1, svuint32_t, uint32_t, ++ z0 = svabd_n_u32_z (p0, z0, x0), ++ z0 = svabd_z (p0, z0, x0)) ++ ++/* ++** abd_w0_u32_z_untied: ++** mov (z[0-9]+\.s), w0 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** uabd z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** uabd z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (abd_w0_u32_z_untied, svuint32_t, uint32_t, ++ z0 = svabd_n_u32_z (p0, z1, x0), 
++ z0 = svabd_z (p0, z1, x0)) ++ ++/* ++** abd_1_u32_z_tied1: ++** mov (z[0-9]+\.s), #1 ++** movprfx z0\.s, p0/z, z0\.s ++** uabd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_u32_z_tied1, svuint32_t, ++ z0 = svabd_n_u32_z (p0, z0, 1), ++ z0 = svabd_z (p0, z0, 1)) ++ ++/* ++** abd_1_u32_z_untied: ++** mov (z[0-9]+\.s), #1 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** uabd z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** uabd z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_u32_z_untied, svuint32_t, ++ z0 = svabd_n_u32_z (p0, z1, 1), ++ z0 = svabd_z (p0, z1, 1)) ++ ++/* ++** abd_u32_x_tied1: ++** uabd z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (abd_u32_x_tied1, svuint32_t, ++ z0 = svabd_u32_x (p0, z0, z1), ++ z0 = svabd_x (p0, z0, z1)) ++ ++/* ++** abd_u32_x_tied2: ++** uabd z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (abd_u32_x_tied2, svuint32_t, ++ z0 = svabd_u32_x (p0, z1, z0), ++ z0 = svabd_x (p0, z1, z0)) ++ ++/* ++** abd_u32_x_untied: ++** ( ++** movprfx z0, z1 ++** uabd z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0, z2 ++** uabd z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (abd_u32_x_untied, svuint32_t, ++ z0 = svabd_u32_x (p0, z1, z2), ++ z0 = svabd_x (p0, z1, z2)) ++ ++/* ++** abd_w0_u32_x_tied1: ++** mov (z[0-9]+\.s), w0 ++** uabd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (abd_w0_u32_x_tied1, svuint32_t, uint32_t, ++ z0 = svabd_n_u32_x (p0, z0, x0), ++ z0 = svabd_x (p0, z0, x0)) ++ ++/* ++** abd_w0_u32_x_untied: ++** mov z0\.s, w0 ++** uabd z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZX (abd_w0_u32_x_untied, svuint32_t, uint32_t, ++ z0 = svabd_n_u32_x (p0, z1, x0), ++ z0 = svabd_x (p0, z1, x0)) ++ ++/* ++** abd_1_u32_x_tied1: ++** mov (z[0-9]+\.s), #1 ++** uabd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_u32_x_tied1, svuint32_t, ++ z0 = svabd_n_u32_x (p0, z0, 1), ++ z0 = svabd_x (p0, z0, 1)) ++ ++/* ++** abd_1_u32_x_untied: ++** mov z0\.s, #1 ++** uabd z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_u32_x_untied, svuint32_t, ++ z0 = svabd_n_u32_x (p0, z1, 1), ++ z0 = svabd_x (p0, z1, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_u64.c +new file mode 100644 +index 000000000..d2cdaa06a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_u64.c +@@ -0,0 +1,237 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** abd_u64_m_tied1: ++** uabd z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (abd_u64_m_tied1, svuint64_t, ++ z0 = svabd_u64_m (p0, z0, z1), ++ z0 = svabd_m (p0, z0, z1)) ++ ++/* ++** abd_u64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** uabd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_u64_m_tied2, svuint64_t, ++ z0 = svabd_u64_m (p0, z1, z0), ++ z0 = svabd_m (p0, z1, z0)) ++ ++/* ++** abd_u64_m_untied: ++** movprfx z0, z1 ++** uabd z0\.d, p0/m, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (abd_u64_m_untied, svuint64_t, ++ z0 = svabd_u64_m (p0, z1, z2), ++ z0 = svabd_m (p0, z1, z2)) ++ ++/* ++** abd_x0_u64_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** uabd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (abd_x0_u64_m_tied1, svuint64_t, uint64_t, ++ z0 = svabd_n_u64_m (p0, z0, x0), ++ z0 = svabd_m (p0, z0, x0)) ++ ++/* ++** abd_x0_u64_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** uabd z0\.d, p0/m, z0\.d, \1 
++** ret ++*/ ++TEST_UNIFORM_ZX (abd_x0_u64_m_untied, svuint64_t, uint64_t, ++ z0 = svabd_n_u64_m (p0, z1, x0), ++ z0 = svabd_m (p0, z1, x0)) ++ ++/* ++** abd_1_u64_m_tied1: ++** mov (z[0-9]+\.d), #1 ++** uabd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_u64_m_tied1, svuint64_t, ++ z0 = svabd_n_u64_m (p0, z0, 1), ++ z0 = svabd_m (p0, z0, 1)) ++ ++/* ++** abd_1_u64_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #1 ++** movprfx z0, z1 ++** uabd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_u64_m_untied, svuint64_t, ++ z0 = svabd_n_u64_m (p0, z1, 1), ++ z0 = svabd_m (p0, z1, 1)) ++ ++/* ++** abd_u64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** uabd z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (abd_u64_z_tied1, svuint64_t, ++ z0 = svabd_u64_z (p0, z0, z1), ++ z0 = svabd_z (p0, z0, z1)) ++ ++/* ++** abd_u64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** uabd z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (abd_u64_z_tied2, svuint64_t, ++ z0 = svabd_u64_z (p0, z1, z0), ++ z0 = svabd_z (p0, z1, z0)) ++ ++/* ++** abd_u64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** uabd z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** uabd z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (abd_u64_z_untied, svuint64_t, ++ z0 = svabd_u64_z (p0, z1, z2), ++ z0 = svabd_z (p0, z1, z2)) ++ ++/* ++** abd_x0_u64_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z0\.d ++** uabd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (abd_x0_u64_z_tied1, svuint64_t, uint64_t, ++ z0 = svabd_n_u64_z (p0, z0, x0), ++ z0 = svabd_z (p0, z0, x0)) ++ ++/* ++** abd_x0_u64_z_untied: ++** mov (z[0-9]+\.d), x0 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** uabd z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** uabd z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (abd_x0_u64_z_untied, svuint64_t, uint64_t, ++ z0 = svabd_n_u64_z (p0, z1, x0), ++ z0 = svabd_z (p0, z1, x0)) ++ ++/* ++** abd_1_u64_z_tied1: ++** mov (z[0-9]+\.d), #1 ++** movprfx z0\.d, p0/z, z0\.d ++** uabd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_u64_z_tied1, svuint64_t, ++ z0 = svabd_n_u64_z (p0, z0, 1), ++ z0 = svabd_z (p0, z0, 1)) ++ ++/* ++** abd_1_u64_z_untied: ++** mov (z[0-9]+\.d), #1 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** uabd z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** uabd z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_u64_z_untied, svuint64_t, ++ z0 = svabd_n_u64_z (p0, z1, 1), ++ z0 = svabd_z (p0, z1, 1)) ++ ++/* ++** abd_u64_x_tied1: ++** uabd z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (abd_u64_x_tied1, svuint64_t, ++ z0 = svabd_u64_x (p0, z0, z1), ++ z0 = svabd_x (p0, z0, z1)) ++ ++/* ++** abd_u64_x_tied2: ++** uabd z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (abd_u64_x_tied2, svuint64_t, ++ z0 = svabd_u64_x (p0, z1, z0), ++ z0 = svabd_x (p0, z1, z0)) ++ ++/* ++** abd_u64_x_untied: ++** ( ++** movprfx z0, z1 ++** uabd z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0, z2 ++** uabd z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (abd_u64_x_untied, svuint64_t, ++ z0 = svabd_u64_x (p0, z1, z2), ++ z0 = svabd_x (p0, z1, z2)) ++ ++/* ++** abd_x0_u64_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** uabd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (abd_x0_u64_x_tied1, svuint64_t, uint64_t, ++ z0 = svabd_n_u64_x (p0, z0, x0), ++ z0 = svabd_x (p0, z0, x0)) ++ ++/* ++** abd_x0_u64_x_untied: ++** mov z0\.d, x0 ++** uabd 
z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (abd_x0_u64_x_untied, svuint64_t, uint64_t, ++ z0 = svabd_n_u64_x (p0, z1, x0), ++ z0 = svabd_x (p0, z1, x0)) ++ ++/* ++** abd_1_u64_x_tied1: ++** mov (z[0-9]+\.d), #1 ++** uabd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_u64_x_tied1, svuint64_t, ++ z0 = svabd_n_u64_x (p0, z0, 1), ++ z0 = svabd_x (p0, z0, 1)) ++ ++/* ++** abd_1_u64_x_untied: ++** mov z0\.d, #1 ++** uabd z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_u64_x_untied, svuint64_t, ++ z0 = svabd_n_u64_x (p0, z1, 1), ++ z0 = svabd_x (p0, z1, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_u8.c +new file mode 100644 +index 000000000..454ef153c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_u8.c +@@ -0,0 +1,237 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** abd_u8_m_tied1: ++** uabd z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (abd_u8_m_tied1, svuint8_t, ++ z0 = svabd_u8_m (p0, z0, z1), ++ z0 = svabd_m (p0, z0, z1)) ++ ++/* ++** abd_u8_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** uabd z0\.b, p0/m, z0\.b, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (abd_u8_m_tied2, svuint8_t, ++ z0 = svabd_u8_m (p0, z1, z0), ++ z0 = svabd_m (p0, z1, z0)) ++ ++/* ++** abd_u8_m_untied: ++** movprfx z0, z1 ++** uabd z0\.b, p0/m, z0\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (abd_u8_m_untied, svuint8_t, ++ z0 = svabd_u8_m (p0, z1, z2), ++ z0 = svabd_m (p0, z1, z2)) ++ ++/* ++** abd_w0_u8_m_tied1: ++** mov (z[0-9]+\.b), w0 ++** uabd z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (abd_w0_u8_m_tied1, svuint8_t, uint8_t, ++ z0 = svabd_n_u8_m (p0, z0, x0), ++ z0 = svabd_m (p0, z0, x0)) ++ ++/* ++** abd_w0_u8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), w0 ++** movprfx z0, z1 ++** uabd z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (abd_w0_u8_m_untied, svuint8_t, uint8_t, ++ z0 = svabd_n_u8_m (p0, z1, x0), ++ z0 = svabd_m (p0, z1, x0)) ++ ++/* ++** abd_1_u8_m_tied1: ++** mov (z[0-9]+\.b), #1 ++** uabd z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_u8_m_tied1, svuint8_t, ++ z0 = svabd_n_u8_m (p0, z0, 1), ++ z0 = svabd_m (p0, z0, 1)) ++ ++/* ++** abd_1_u8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), #1 ++** movprfx z0, z1 ++** uabd z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_u8_m_untied, svuint8_t, ++ z0 = svabd_n_u8_m (p0, z1, 1), ++ z0 = svabd_m (p0, z1, 1)) ++ ++/* ++** abd_u8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** uabd z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (abd_u8_z_tied1, svuint8_t, ++ z0 = svabd_u8_z (p0, z0, z1), ++ z0 = svabd_z (p0, z0, z1)) ++ ++/* ++** abd_u8_z_tied2: ++** movprfx z0\.b, p0/z, z0\.b ++** uabd z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (abd_u8_z_tied2, svuint8_t, ++ z0 = svabd_u8_z (p0, z1, z0), ++ z0 = svabd_z (p0, z1, z0)) ++ ++/* ++** abd_u8_z_untied: ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** uabd z0\.b, p0/m, z0\.b, z2\.b ++** | ++** movprfx z0\.b, p0/z, z2\.b ++** uabd z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (abd_u8_z_untied, svuint8_t, ++ z0 = svabd_u8_z (p0, z1, z2), ++ z0 = svabd_z (p0, z1, z2)) ++ ++/* ++** abd_w0_u8_z_tied1: ++** mov (z[0-9]+\.b), w0 ++** movprfx z0\.b, p0/z, z0\.b ++** uabd z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (abd_w0_u8_z_tied1, svuint8_t, uint8_t, ++ z0 = svabd_n_u8_z (p0, z0, x0), ++ z0 = 
svabd_z (p0, z0, x0)) ++ ++/* ++** abd_w0_u8_z_untied: ++** mov (z[0-9]+\.b), w0 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** uabd z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** uabd z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (abd_w0_u8_z_untied, svuint8_t, uint8_t, ++ z0 = svabd_n_u8_z (p0, z1, x0), ++ z0 = svabd_z (p0, z1, x0)) ++ ++/* ++** abd_1_u8_z_tied1: ++** mov (z[0-9]+\.b), #1 ++** movprfx z0\.b, p0/z, z0\.b ++** uabd z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_u8_z_tied1, svuint8_t, ++ z0 = svabd_n_u8_z (p0, z0, 1), ++ z0 = svabd_z (p0, z0, 1)) ++ ++/* ++** abd_1_u8_z_untied: ++** mov (z[0-9]+\.b), #1 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** uabd z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** uabd z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_u8_z_untied, svuint8_t, ++ z0 = svabd_n_u8_z (p0, z1, 1), ++ z0 = svabd_z (p0, z1, 1)) ++ ++/* ++** abd_u8_x_tied1: ++** uabd z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (abd_u8_x_tied1, svuint8_t, ++ z0 = svabd_u8_x (p0, z0, z1), ++ z0 = svabd_x (p0, z0, z1)) ++ ++/* ++** abd_u8_x_tied2: ++** uabd z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (abd_u8_x_tied2, svuint8_t, ++ z0 = svabd_u8_x (p0, z1, z0), ++ z0 = svabd_x (p0, z1, z0)) ++ ++/* ++** abd_u8_x_untied: ++** ( ++** movprfx z0, z1 ++** uabd z0\.b, p0/m, z0\.b, z2\.b ++** | ++** movprfx z0, z2 ++** uabd z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (abd_u8_x_untied, svuint8_t, ++ z0 = svabd_u8_x (p0, z1, z2), ++ z0 = svabd_x (p0, z1, z2)) ++ ++/* ++** abd_w0_u8_x_tied1: ++** mov (z[0-9]+\.b), w0 ++** uabd z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (abd_w0_u8_x_tied1, svuint8_t, uint8_t, ++ z0 = svabd_n_u8_x (p0, z0, x0), ++ z0 = svabd_x (p0, z0, x0)) ++ ++/* ++** abd_w0_u8_x_untied: ++** mov z0\.b, w0 ++** uabd z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_ZX (abd_w0_u8_x_untied, svuint8_t, uint8_t, ++ z0 = svabd_n_u8_x (p0, z1, x0), ++ z0 = svabd_x (p0, z1, x0)) ++ ++/* ++** abd_1_u8_x_tied1: ++** mov (z[0-9]+\.b), #1 ++** uabd z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_u8_x_tied1, svuint8_t, ++ z0 = svabd_n_u8_x (p0, z0, 1), ++ z0 = svabd_x (p0, z0, 1)) ++ ++/* ++** abd_1_u8_x_untied: ++** mov z0\.b, #1 ++** uabd z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (abd_1_u8_x_untied, svuint8_t, ++ z0 = svabd_n_u8_x (p0, z1, 1), ++ z0 = svabd_x (p0, z1, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abs_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abs_f16.c +new file mode 100644 +index 000000000..2aa8736e6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abs_f16.c +@@ -0,0 +1,103 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** abs_f16_m_tied12: ++** fabs z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (abs_f16_m_tied12, svfloat16_t, ++ z0 = svabs_f16_m (z0, p0, z0), ++ z0 = svabs_m (z0, p0, z0)) ++ ++/* ++** abs_f16_m_tied1: ++** fabs z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (abs_f16_m_tied1, svfloat16_t, ++ z0 = svabs_f16_m (z0, p0, z1), ++ z0 = svabs_m (z0, p0, z1)) ++ ++/* ++** abs_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fabs z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (abs_f16_m_tied2, svfloat16_t, ++ z0 = svabs_f16_m (z1, p0, z0), ++ z0 = svabs_m (z1, p0, z0)) ++ ++/* ++** abs_f16_m_untied: ++** movprfx z0, z2 ++** fabs z0\.h, p0/m, z1\.h 
++** ret ++*/ ++TEST_UNIFORM_Z (abs_f16_m_untied, svfloat16_t, ++ z0 = svabs_f16_m (z2, p0, z1), ++ z0 = svabs_m (z2, p0, z1)) ++ ++/* ++** abs_f16_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.h, p0/z, \1\.h ++** fabs z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (abs_f16_z_tied1, svfloat16_t, ++ z0 = svabs_f16_z (p0, z0), ++ z0 = svabs_z (p0, z0)) ++ ++/* ++** abs_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** fabs z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (abs_f16_z_untied, svfloat16_t, ++ z0 = svabs_f16_z (p0, z1), ++ z0 = svabs_z (p0, z1)) ++ ++/* ++** abs_f16_x_tied1: ++** fabs z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (abs_f16_x_tied1, svfloat16_t, ++ z0 = svabs_f16_x (p0, z0), ++ z0 = svabs_x (p0, z0)) ++ ++/* ++** abs_f16_x_untied: ++** fabs z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (abs_f16_x_untied, svfloat16_t, ++ z0 = svabs_f16_x (p0, z1), ++ z0 = svabs_x (p0, z1)) ++ ++/* ++** ptrue_abs_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_abs_f16_x_tied1, svfloat16_t, ++ z0 = svabs_f16_x (svptrue_b16 (), z0), ++ z0 = svabs_x (svptrue_b16 (), z0)) ++ ++/* ++** ptrue_abs_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_abs_f16_x_untied, svfloat16_t, ++ z0 = svabs_f16_x (svptrue_b16 (), z1), ++ z0 = svabs_x (svptrue_b16 (), z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abs_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abs_f32.c +new file mode 100644 +index 000000000..30286afc7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abs_f32.c +@@ -0,0 +1,103 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** abs_f32_m_tied12: ++** fabs z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (abs_f32_m_tied12, svfloat32_t, ++ z0 = svabs_f32_m (z0, p0, z0), ++ z0 = svabs_m (z0, p0, z0)) ++ ++/* ++** abs_f32_m_tied1: ++** fabs z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (abs_f32_m_tied1, svfloat32_t, ++ z0 = svabs_f32_m (z0, p0, z1), ++ z0 = svabs_m (z0, p0, z1)) ++ ++/* ++** abs_f32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fabs z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (abs_f32_m_tied2, svfloat32_t, ++ z0 = svabs_f32_m (z1, p0, z0), ++ z0 = svabs_m (z1, p0, z0)) ++ ++/* ++** abs_f32_m_untied: ++** movprfx z0, z2 ++** fabs z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (abs_f32_m_untied, svfloat32_t, ++ z0 = svabs_f32_m (z2, p0, z1), ++ z0 = svabs_m (z2, p0, z1)) ++ ++/* ++** abs_f32_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, \1\.s ++** fabs z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (abs_f32_z_tied1, svfloat32_t, ++ z0 = svabs_f32_z (p0, z0), ++ z0 = svabs_z (p0, z0)) ++ ++/* ++** abs_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** fabs z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (abs_f32_z_untied, svfloat32_t, ++ z0 = svabs_f32_z (p0, z1), ++ z0 = svabs_z (p0, z1)) ++ ++/* ++** abs_f32_x_tied1: ++** fabs z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (abs_f32_x_tied1, svfloat32_t, ++ z0 = svabs_f32_x (p0, z0), ++ z0 = svabs_x (p0, z0)) ++ ++/* ++** abs_f32_x_untied: ++** fabs z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (abs_f32_x_untied, svfloat32_t, ++ z0 = svabs_f32_x (p0, z1), ++ z0 = svabs_x (p0, z1)) ++ ++/* ++** ptrue_abs_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_abs_f32_x_tied1, svfloat32_t, ++ z0 = svabs_f32_x (svptrue_b32 (), z0), ++ z0 = svabs_x (svptrue_b32 (), z0)) ++ ++/* ++** ptrue_abs_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_abs_f32_x_untied, svfloat32_t, ++ z0 = svabs_f32_x (svptrue_b32 (), z1), ++ z0 = svabs_x (svptrue_b32 (), z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abs_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abs_f64.c +new file mode 100644 +index 000000000..28ef9fbba +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abs_f64.c +@@ -0,0 +1,103 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** abs_f64_m_tied12: ++** fabs z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (abs_f64_m_tied12, svfloat64_t, ++ z0 = svabs_f64_m (z0, p0, z0), ++ z0 = svabs_m (z0, p0, z0)) ++ ++/* ++** abs_f64_m_tied1: ++** fabs z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (abs_f64_m_tied1, svfloat64_t, ++ z0 = svabs_f64_m (z0, p0, z1), ++ z0 = svabs_m (z0, p0, z1)) ++ ++/* ++** abs_f64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fabs z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abs_f64_m_tied2, svfloat64_t, ++ z0 = svabs_f64_m (z1, p0, z0), ++ z0 = svabs_m (z1, p0, z0)) ++ ++/* ++** abs_f64_m_untied: ++** movprfx z0, z2 ++** fabs z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (abs_f64_m_untied, svfloat64_t, ++ z0 = svabs_f64_m (z2, p0, z1), ++ z0 = svabs_m (z2, p0, z1)) ++ ++/* ++** abs_f64_z_tied1: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** fabs z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abs_f64_z_tied1, svfloat64_t, ++ z0 = svabs_f64_z (p0, z0), ++ z0 = svabs_z (p0, z0)) ++ ++/* ++** abs_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** fabs z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (abs_f64_z_untied, svfloat64_t, ++ z0 = svabs_f64_z (p0, z1), ++ z0 = svabs_z (p0, z1)) ++ ++/* ++** abs_f64_x_tied1: ++** fabs z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (abs_f64_x_tied1, svfloat64_t, ++ z0 = svabs_f64_x (p0, z0), ++ z0 = svabs_x (p0, z0)) ++ ++/* ++** abs_f64_x_untied: ++** fabs z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (abs_f64_x_untied, svfloat64_t, ++ z0 = svabs_f64_x (p0, z1), ++ z0 = svabs_x (p0, z1)) ++ ++/* ++** ptrue_abs_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_abs_f64_x_tied1, svfloat64_t, ++ z0 = svabs_f64_x (svptrue_b64 (), z0), ++ z0 = svabs_x (svptrue_b64 (), z0)) ++ ++/* ++** ptrue_abs_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_abs_f64_x_untied, svfloat64_t, ++ z0 = svabs_f64_x (svptrue_b64 (), z1), ++ z0 = svabs_x (svptrue_b64 (), z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abs_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abs_s16.c +new file mode 100644 +index 000000000..3b16a9c4f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abs_s16.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** abs_s16_m_tied12: ++** abs z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (abs_s16_m_tied12, svint16_t, ++ z0 = svabs_s16_m (z0, p0, z0), ++ z0 = svabs_m (z0, p0, z0)) ++ ++/* ++** abs_s16_m_tied1: ++** abs z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (abs_s16_m_tied1, svint16_t, ++ z0 = svabs_s16_m (z0, p0, z1), ++ z0 = svabs_m (z0, p0, z1)) ++ ++/* ++** abs_s16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** abs z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (abs_s16_m_tied2, svint16_t, ++ z0 = svabs_s16_m (z1, p0, z0), ++ z0 = svabs_m (z1, p0, z0)) ++ ++/* ++** abs_s16_m_untied: ++** movprfx z0, z2 ++** abs z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (abs_s16_m_untied, svint16_t, ++ z0 = svabs_s16_m (z2, p0, z1), ++ z0 = svabs_m (z2, p0, z1)) ++ ++/* ++** abs_s16_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.h, p0/z, \1\.h ++** abs z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (abs_s16_z_tied1, svint16_t, ++ z0 = svabs_s16_z (p0, z0), ++ z0 = svabs_z (p0, z0)) ++ ++/* ++** abs_s16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** abs z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (abs_s16_z_untied, svint16_t, ++ z0 = svabs_s16_z (p0, z1), ++ z0 = svabs_z (p0, z1)) ++ ++/* ++** abs_s16_x_tied1: ++** abs z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (abs_s16_x_tied1, svint16_t, ++ z0 = svabs_s16_x (p0, z0), ++ z0 = svabs_x (p0, z0)) ++ ++/* ++** abs_s16_x_untied: ++** abs z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (abs_s16_x_untied, svint16_t, ++ z0 = svabs_s16_x (p0, z1), ++ z0 = svabs_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abs_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abs_s32.c +new file mode 100644 +index 000000000..14bcbd50c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abs_s32.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** abs_s32_m_tied12: ++** abs z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (abs_s32_m_tied12, svint32_t, ++ z0 = svabs_s32_m (z0, p0, z0), ++ z0 = svabs_m (z0, p0, z0)) ++ ++/* ++** abs_s32_m_tied1: ++** abs z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (abs_s32_m_tied1, svint32_t, ++ z0 = svabs_s32_m (z0, p0, z1), ++ z0 = svabs_m (z0, p0, z1)) ++ ++/* ++** abs_s32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** abs z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (abs_s32_m_tied2, svint32_t, ++ z0 = svabs_s32_m (z1, p0, z0), ++ z0 = svabs_m (z1, p0, z0)) ++ ++/* ++** abs_s32_m_untied: ++** movprfx z0, z2 ++** abs z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (abs_s32_m_untied, svint32_t, ++ z0 = svabs_s32_m (z2, p0, z1), ++ z0 = svabs_m (z2, p0, z1)) ++ ++/* ++** abs_s32_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, \1\.s ++** abs z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (abs_s32_z_tied1, svint32_t, ++ z0 = svabs_s32_z (p0, z0), ++ z0 = svabs_z (p0, z0)) ++ ++/* ++** 
abs_s32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** abs z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (abs_s32_z_untied, svint32_t, ++ z0 = svabs_s32_z (p0, z1), ++ z0 = svabs_z (p0, z1)) ++ ++/* ++** abs_s32_x_tied1: ++** abs z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (abs_s32_x_tied1, svint32_t, ++ z0 = svabs_s32_x (p0, z0), ++ z0 = svabs_x (p0, z0)) ++ ++/* ++** abs_s32_x_untied: ++** abs z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (abs_s32_x_untied, svint32_t, ++ z0 = svabs_s32_x (p0, z1), ++ z0 = svabs_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abs_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abs_s64.c +new file mode 100644 +index 000000000..c7b60ff48 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abs_s64.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** abs_s64_m_tied12: ++** abs z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (abs_s64_m_tied12, svint64_t, ++ z0 = svabs_s64_m (z0, p0, z0), ++ z0 = svabs_m (z0, p0, z0)) ++ ++/* ++** abs_s64_m_tied1: ++** abs z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (abs_s64_m_tied1, svint64_t, ++ z0 = svabs_s64_m (z0, p0, z1), ++ z0 = svabs_m (z0, p0, z1)) ++ ++/* ++** abs_s64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** abs z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abs_s64_m_tied2, svint64_t, ++ z0 = svabs_s64_m (z1, p0, z0), ++ z0 = svabs_m (z1, p0, z0)) ++ ++/* ++** abs_s64_m_untied: ++** movprfx z0, z2 ++** abs z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (abs_s64_m_untied, svint64_t, ++ z0 = svabs_s64_m (z2, p0, z1), ++ z0 = svabs_m (z2, p0, z1)) ++ ++/* ++** abs_s64_z_tied1: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** abs z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (abs_s64_z_tied1, svint64_t, ++ z0 = svabs_s64_z (p0, z0), ++ z0 = svabs_z (p0, z0)) ++ ++/* ++** abs_s64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** abs z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (abs_s64_z_untied, svint64_t, ++ z0 = svabs_s64_z (p0, z1), ++ z0 = svabs_z (p0, z1)) ++ ++/* ++** abs_s64_x_tied1: ++** abs z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (abs_s64_x_tied1, svint64_t, ++ z0 = svabs_s64_x (p0, z0), ++ z0 = svabs_x (p0, z0)) ++ ++/* ++** abs_s64_x_untied: ++** abs z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (abs_s64_x_untied, svint64_t, ++ z0 = svabs_s64_x (p0, z1), ++ z0 = svabs_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abs_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abs_s8.c +new file mode 100644 +index 000000000..0bc64c078 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abs_s8.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** abs_s8_m_tied12: ++** abs z0\.b, p0/m, z0\.b ++** ret ++*/ ++TEST_UNIFORM_Z (abs_s8_m_tied12, svint8_t, ++ z0 = svabs_s8_m (z0, p0, z0), ++ z0 = svabs_m (z0, p0, z0)) ++ ++/* ++** abs_s8_m_tied1: ++** abs z0\.b, p0/m, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (abs_s8_m_tied1, svint8_t, ++ z0 = svabs_s8_m (z0, p0, z1), ++ z0 = svabs_m (z0, p0, z1)) ++ ++/* ++** abs_s8_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** abs z0\.b, p0/m, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (abs_s8_m_tied2, svint8_t, ++ z0 = svabs_s8_m (z1, p0, z0), ++ z0 = svabs_m (z1, p0, z0)) ++ ++/* ++** abs_s8_m_untied: ++** movprfx z0, z2 ++** abs z0\.b, p0/m, z1\.b ++** ret ++*/ 
++TEST_UNIFORM_Z (abs_s8_m_untied, svint8_t, ++ z0 = svabs_s8_m (z2, p0, z1), ++ z0 = svabs_m (z2, p0, z1)) ++ ++/* ++** abs_s8_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.b, p0/z, \1\.b ++** abs z0\.b, p0/m, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (abs_s8_z_tied1, svint8_t, ++ z0 = svabs_s8_z (p0, z0), ++ z0 = svabs_z (p0, z0)) ++ ++/* ++** abs_s8_z_untied: ++** movprfx z0\.b, p0/z, z1\.b ++** abs z0\.b, p0/m, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (abs_s8_z_untied, svint8_t, ++ z0 = svabs_s8_z (p0, z1), ++ z0 = svabs_z (p0, z1)) ++ ++/* ++** abs_s8_x_tied1: ++** abs z0\.b, p0/m, z0\.b ++** ret ++*/ ++TEST_UNIFORM_Z (abs_s8_x_tied1, svint8_t, ++ z0 = svabs_s8_x (p0, z0), ++ z0 = svabs_x (p0, z0)) ++ ++/* ++** abs_s8_x_untied: ++** abs z0\.b, p0/m, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (abs_s8_x_untied, svint8_t, ++ z0 = svabs_s8_x (p0, z1), ++ z0 = svabs_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acge_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acge_f16.c +new file mode 100644 +index 000000000..acef17309 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acge_f16.c +@@ -0,0 +1,71 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** acge_f16_tied: ++** ( ++** facge p0\.h, p0/z, z0\.h, z1\.h ++** | ++** facle p0\.h, p0/z, z1\.h, z0\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (acge_f16_tied, svfloat16_t, ++ p0 = svacge_f16 (p0, z0, z1), ++ p0 = svacge (p0, z0, z1)) ++ ++/* ++** acge_f16_untied: ++** ( ++** facge p0\.h, p1/z, z0\.h, z1\.h ++** | ++** facle p0\.h, p1/z, z1\.h, z0\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (acge_f16_untied, svfloat16_t, ++ p0 = svacge_f16 (p1, z0, z1), ++ p0 = svacge (p1, z0, z1)) ++ ++/* ++** acge_h4_f16: ++** mov (z[0-9]+\.h), h4 ++** ( ++** facge p0\.h, p1/z, z0\.h, \1 ++** | ++** facle p0\.h, p1/z, \1, z0\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_ZD (acge_h4_f16, svfloat16_t, float16_t, ++ p0 = svacge_n_f16 (p1, z0, d4), ++ p0 = svacge (p1, z0, d4)) ++ ++/* ++** acge_0_f16: ++** mov (z[0-9]+\.h), #0 ++** ( ++** facge p0\.h, p1/z, z0\.h, \1 ++** | ++** facle p0\.h, p1/z, \1, z0\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (acge_0_f16, svfloat16_t, ++ p0 = svacge_n_f16 (p1, z0, 0), ++ p0 = svacge (p1, z0, 0)) ++ ++/* ++** acge_1_f16: ++** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? 
++** ( ++** facge p0\.h, p1/z, z0\.h, \1 ++** | ++** facle p0\.h, p1/z, \1, z0\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (acge_1_f16, svfloat16_t, ++ p0 = svacge_n_f16 (p1, z0, 1), ++ p0 = svacge (p1, z0, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acge_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acge_f32.c +new file mode 100644 +index 000000000..c3d195ab8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acge_f32.c +@@ -0,0 +1,71 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** acge_f32_tied: ++** ( ++** facge p0\.s, p0/z, z0\.s, z1\.s ++** | ++** facle p0\.s, p0/z, z1\.s, z0\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (acge_f32_tied, svfloat32_t, ++ p0 = svacge_f32 (p0, z0, z1), ++ p0 = svacge (p0, z0, z1)) ++ ++/* ++** acge_f32_untied: ++** ( ++** facge p0\.s, p1/z, z0\.s, z1\.s ++** | ++** facle p0\.s, p1/z, z1\.s, z0\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (acge_f32_untied, svfloat32_t, ++ p0 = svacge_f32 (p1, z0, z1), ++ p0 = svacge (p1, z0, z1)) ++ ++/* ++** acge_s4_f32: ++** mov (z[0-9]+\.s), s4 ++** ( ++** facge p0\.s, p1/z, z0\.s, \1 ++** | ++** facle p0\.s, p1/z, \1, z0\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_ZD (acge_s4_f32, svfloat32_t, float32_t, ++ p0 = svacge_n_f32 (p1, z0, d4), ++ p0 = svacge (p1, z0, d4)) ++ ++/* ++** acge_0_f32: ++** mov (z[0-9]+\.s), #0 ++** ( ++** facge p0\.s, p1/z, z0\.s, \1 ++** | ++** facle p0\.s, p1/z, \1, z0\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (acge_0_f32, svfloat32_t, ++ p0 = svacge_n_f32 (p1, z0, 0), ++ p0 = svacge (p1, z0, 0)) ++ ++/* ++** acge_1_f32: ++** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? ++** ( ++** facge p0\.s, p1/z, z0\.s, \1 ++** | ++** facle p0\.s, p1/z, \1, z0\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (acge_1_f32, svfloat32_t, ++ p0 = svacge_n_f32 (p1, z0, 1), ++ p0 = svacge (p1, z0, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acge_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acge_f64.c +new file mode 100644 +index 000000000..207ce93a2 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acge_f64.c +@@ -0,0 +1,71 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** acge_f64_tied: ++** ( ++** facge p0\.d, p0/z, z0\.d, z1\.d ++** | ++** facle p0\.d, p0/z, z1\.d, z0\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (acge_f64_tied, svfloat64_t, ++ p0 = svacge_f64 (p0, z0, z1), ++ p0 = svacge (p0, z0, z1)) ++ ++/* ++** acge_f64_untied: ++** ( ++** facge p0\.d, p1/z, z0\.d, z1\.d ++** | ++** facle p0\.d, p1/z, z1\.d, z0\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (acge_f64_untied, svfloat64_t, ++ p0 = svacge_f64 (p1, z0, z1), ++ p0 = svacge (p1, z0, z1)) ++ ++/* ++** acge_d4_f64: ++** mov (z[0-9]+\.d), d4 ++** ( ++** facge p0\.d, p1/z, z0\.d, \1 ++** | ++** facle p0\.d, p1/z, \1, z0\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_ZD (acge_d4_f64, svfloat64_t, float64_t, ++ p0 = svacge_n_f64 (p1, z0, d4), ++ p0 = svacge (p1, z0, d4)) ++ ++/* ++** acge_0_f64: ++** mov (z[0-9]+\.d), #0 ++** ( ++** facge p0\.d, p1/z, z0\.d, \1 ++** | ++** facle p0\.d, p1/z, \1, z0\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (acge_0_f64, svfloat64_t, ++ p0 = svacge_n_f64 (p1, z0, 0), ++ p0 = svacge (p1, z0, 0)) ++ ++/* ++** acge_1_f64: ++** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? 
++** ( ++** facge p0\.d, p1/z, z0\.d, \1 ++** | ++** facle p0\.d, p1/z, \1, z0\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (acge_1_f64, svfloat64_t, ++ p0 = svacge_n_f64 (p1, z0, 1), ++ p0 = svacge (p1, z0, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acgt_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acgt_f16.c +new file mode 100644 +index 000000000..53c63351c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acgt_f16.c +@@ -0,0 +1,71 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** acgt_f16_tied: ++** ( ++** facgt p0\.h, p0/z, z0\.h, z1\.h ++** | ++** faclt p0\.h, p0/z, z1\.h, z0\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (acgt_f16_tied, svfloat16_t, ++ p0 = svacgt_f16 (p0, z0, z1), ++ p0 = svacgt (p0, z0, z1)) ++ ++/* ++** acgt_f16_untied: ++** ( ++** facgt p0\.h, p1/z, z0\.h, z1\.h ++** | ++** faclt p0\.h, p1/z, z1\.h, z0\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (acgt_f16_untied, svfloat16_t, ++ p0 = svacgt_f16 (p1, z0, z1), ++ p0 = svacgt (p1, z0, z1)) ++ ++/* ++** acgt_h4_f16: ++** mov (z[0-9]+\.h), h4 ++** ( ++** facgt p0\.h, p1/z, z0\.h, \1 ++** | ++** faclt p0\.h, p1/z, \1, z0\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_ZD (acgt_h4_f16, svfloat16_t, float16_t, ++ p0 = svacgt_n_f16 (p1, z0, d4), ++ p0 = svacgt (p1, z0, d4)) ++ ++/* ++** acgt_0_f16: ++** mov (z[0-9]+\.h), #0 ++** ( ++** facgt p0\.h, p1/z, z0\.h, \1 ++** | ++** faclt p0\.h, p1/z, \1, z0\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (acgt_0_f16, svfloat16_t, ++ p0 = svacgt_n_f16 (p1, z0, 0), ++ p0 = svacgt (p1, z0, 0)) ++ ++/* ++** acgt_1_f16: ++** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? ++** ( ++** facgt p0\.h, p1/z, z0\.h, \1 ++** | ++** faclt p0\.h, p1/z, \1, z0\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (acgt_1_f16, svfloat16_t, ++ p0 = svacgt_n_f16 (p1, z0, 1), ++ p0 = svacgt (p1, z0, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acgt_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acgt_f32.c +new file mode 100644 +index 000000000..d71c84ea6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acgt_f32.c +@@ -0,0 +1,71 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** acgt_f32_tied: ++** ( ++** facgt p0\.s, p0/z, z0\.s, z1\.s ++** | ++** faclt p0\.s, p0/z, z1\.s, z0\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (acgt_f32_tied, svfloat32_t, ++ p0 = svacgt_f32 (p0, z0, z1), ++ p0 = svacgt (p0, z0, z1)) ++ ++/* ++** acgt_f32_untied: ++** ( ++** facgt p0\.s, p1/z, z0\.s, z1\.s ++** | ++** faclt p0\.s, p1/z, z1\.s, z0\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (acgt_f32_untied, svfloat32_t, ++ p0 = svacgt_f32 (p1, z0, z1), ++ p0 = svacgt (p1, z0, z1)) ++ ++/* ++** acgt_s4_f32: ++** mov (z[0-9]+\.s), s4 ++** ( ++** facgt p0\.s, p1/z, z0\.s, \1 ++** | ++** faclt p0\.s, p1/z, \1, z0\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_ZD (acgt_s4_f32, svfloat32_t, float32_t, ++ p0 = svacgt_n_f32 (p1, z0, d4), ++ p0 = svacgt (p1, z0, d4)) ++ ++/* ++** acgt_0_f32: ++** mov (z[0-9]+\.s), #0 ++** ( ++** facgt p0\.s, p1/z, z0\.s, \1 ++** | ++** faclt p0\.s, p1/z, \1, z0\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (acgt_0_f32, svfloat32_t, ++ p0 = svacgt_n_f32 (p1, z0, 0), ++ p0 = svacgt (p1, z0, 0)) ++ ++/* ++** acgt_1_f32: ++** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? 
++** ( ++** facgt p0\.s, p1/z, z0\.s, \1 ++** | ++** faclt p0\.s, p1/z, \1, z0\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (acgt_1_f32, svfloat32_t, ++ p0 = svacgt_n_f32 (p1, z0, 1), ++ p0 = svacgt (p1, z0, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acgt_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acgt_f64.c +new file mode 100644 +index 000000000..15d549e18 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acgt_f64.c +@@ -0,0 +1,71 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** acgt_f64_tied: ++** ( ++** facgt p0\.d, p0/z, z0\.d, z1\.d ++** | ++** faclt p0\.d, p0/z, z1\.d, z0\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (acgt_f64_tied, svfloat64_t, ++ p0 = svacgt_f64 (p0, z0, z1), ++ p0 = svacgt (p0, z0, z1)) ++ ++/* ++** acgt_f64_untied: ++** ( ++** facgt p0\.d, p1/z, z0\.d, z1\.d ++** | ++** faclt p0\.d, p1/z, z1\.d, z0\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (acgt_f64_untied, svfloat64_t, ++ p0 = svacgt_f64 (p1, z0, z1), ++ p0 = svacgt (p1, z0, z1)) ++ ++/* ++** acgt_d4_f64: ++** mov (z[0-9]+\.d), d4 ++** ( ++** facgt p0\.d, p1/z, z0\.d, \1 ++** | ++** faclt p0\.d, p1/z, \1, z0\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_ZD (acgt_d4_f64, svfloat64_t, float64_t, ++ p0 = svacgt_n_f64 (p1, z0, d4), ++ p0 = svacgt (p1, z0, d4)) ++ ++/* ++** acgt_0_f64: ++** mov (z[0-9]+\.d), #0 ++** ( ++** facgt p0\.d, p1/z, z0\.d, \1 ++** | ++** faclt p0\.d, p1/z, \1, z0\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (acgt_0_f64, svfloat64_t, ++ p0 = svacgt_n_f64 (p1, z0, 0), ++ p0 = svacgt (p1, z0, 0)) ++ ++/* ++** acgt_1_f64: ++** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? ++** ( ++** facgt p0\.d, p1/z, z0\.d, \1 ++** | ++** faclt p0\.d, p1/z, \1, z0\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (acgt_1_f64, svfloat64_t, ++ p0 = svacgt_n_f64 (p1, z0, 1), ++ p0 = svacgt (p1, z0, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acle_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acle_f16.c +new file mode 100644 +index 000000000..ed6721d57 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acle_f16.c +@@ -0,0 +1,71 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** acle_f16_tied: ++** ( ++** facge p0\.h, p0/z, z1\.h, z0\.h ++** | ++** facle p0\.h, p0/z, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (acle_f16_tied, svfloat16_t, ++ p0 = svacle_f16 (p0, z0, z1), ++ p0 = svacle (p0, z0, z1)) ++ ++/* ++** acle_f16_untied: ++** ( ++** facge p0\.h, p1/z, z1\.h, z0\.h ++** | ++** facle p0\.h, p1/z, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (acle_f16_untied, svfloat16_t, ++ p0 = svacle_f16 (p1, z0, z1), ++ p0 = svacle (p1, z0, z1)) ++ ++/* ++** acle_h4_f16: ++** mov (z[0-9]+\.h), h4 ++** ( ++** facge p0\.h, p1/z, \1, z0\.h ++** | ++** facle p0\.h, p1/z, z0\.h, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_ZD (acle_h4_f16, svfloat16_t, float16_t, ++ p0 = svacle_n_f16 (p1, z0, d4), ++ p0 = svacle (p1, z0, d4)) ++ ++/* ++** acle_0_f16: ++** mov (z[0-9]+\.h), #0 ++** ( ++** facge p0\.h, p1/z, \1, z0\.h ++** | ++** facle p0\.h, p1/z, z0\.h, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (acle_0_f16, svfloat16_t, ++ p0 = svacle_n_f16 (p1, z0, 0), ++ p0 = svacle (p1, z0, 0)) ++ ++/* ++** acle_1_f16: ++** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? 
++** ( ++** facge p0\.h, p1/z, \1, z0\.h ++** | ++** facle p0\.h, p1/z, z0\.h, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (acle_1_f16, svfloat16_t, ++ p0 = svacle_n_f16 (p1, z0, 1), ++ p0 = svacle (p1, z0, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acle_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acle_f32.c +new file mode 100644 +index 000000000..7fc9da701 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acle_f32.c +@@ -0,0 +1,71 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** acle_f32_tied: ++** ( ++** facge p0\.s, p0/z, z1\.s, z0\.s ++** | ++** facle p0\.s, p0/z, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (acle_f32_tied, svfloat32_t, ++ p0 = svacle_f32 (p0, z0, z1), ++ p0 = svacle (p0, z0, z1)) ++ ++/* ++** acle_f32_untied: ++** ( ++** facge p0\.s, p1/z, z1\.s, z0\.s ++** | ++** facle p0\.s, p1/z, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (acle_f32_untied, svfloat32_t, ++ p0 = svacle_f32 (p1, z0, z1), ++ p0 = svacle (p1, z0, z1)) ++ ++/* ++** acle_s4_f32: ++** mov (z[0-9]+\.s), s4 ++** ( ++** facge p0\.s, p1/z, \1, z0\.s ++** | ++** facle p0\.s, p1/z, z0\.s, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_ZD (acle_s4_f32, svfloat32_t, float32_t, ++ p0 = svacle_n_f32 (p1, z0, d4), ++ p0 = svacle (p1, z0, d4)) ++ ++/* ++** acle_0_f32: ++** mov (z[0-9]+\.s), #0 ++** ( ++** facge p0\.s, p1/z, \1, z0\.s ++** | ++** facle p0\.s, p1/z, z0\.s, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (acle_0_f32, svfloat32_t, ++ p0 = svacle_n_f32 (p1, z0, 0), ++ p0 = svacle (p1, z0, 0)) ++ ++/* ++** acle_1_f32: ++** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? ++** ( ++** facge p0\.s, p1/z, \1, z0\.s ++** | ++** facle p0\.s, p1/z, z0\.s, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (acle_1_f32, svfloat32_t, ++ p0 = svacle_n_f32 (p1, z0, 1), ++ p0 = svacle (p1, z0, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acle_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acle_f64.c +new file mode 100644 +index 000000000..ecbb8e500 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acle_f64.c +@@ -0,0 +1,71 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** acle_f64_tied: ++** ( ++** facge p0\.d, p0/z, z1\.d, z0\.d ++** | ++** facle p0\.d, p0/z, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (acle_f64_tied, svfloat64_t, ++ p0 = svacle_f64 (p0, z0, z1), ++ p0 = svacle (p0, z0, z1)) ++ ++/* ++** acle_f64_untied: ++** ( ++** facge p0\.d, p1/z, z1\.d, z0\.d ++** | ++** facle p0\.d, p1/z, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (acle_f64_untied, svfloat64_t, ++ p0 = svacle_f64 (p1, z0, z1), ++ p0 = svacle (p1, z0, z1)) ++ ++/* ++** acle_d4_f64: ++** mov (z[0-9]+\.d), d4 ++** ( ++** facge p0\.d, p1/z, \1, z0\.d ++** | ++** facle p0\.d, p1/z, z0\.d, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_ZD (acle_d4_f64, svfloat64_t, float64_t, ++ p0 = svacle_n_f64 (p1, z0, d4), ++ p0 = svacle (p1, z0, d4)) ++ ++/* ++** acle_0_f64: ++** mov (z[0-9]+\.d), #0 ++** ( ++** facge p0\.d, p1/z, \1, z0\.d ++** | ++** facle p0\.d, p1/z, z0\.d, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (acle_0_f64, svfloat64_t, ++ p0 = svacle_n_f64 (p1, z0, 0), ++ p0 = svacle (p1, z0, 0)) ++ ++/* ++** acle_1_f64: ++** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? 
++** ( ++** facge p0\.d, p1/z, \1, z0\.d ++** | ++** facle p0\.d, p1/z, z0\.d, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (acle_1_f64, svfloat64_t, ++ p0 = svacle_n_f64 (p1, z0, 1), ++ p0 = svacle (p1, z0, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/aclt_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/aclt_f16.c +new file mode 100644 +index 000000000..e5f5040c7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/aclt_f16.c +@@ -0,0 +1,71 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** aclt_f16_tied: ++** ( ++** facgt p0\.h, p0/z, z1\.h, z0\.h ++** | ++** faclt p0\.h, p0/z, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (aclt_f16_tied, svfloat16_t, ++ p0 = svaclt_f16 (p0, z0, z1), ++ p0 = svaclt (p0, z0, z1)) ++ ++/* ++** aclt_f16_untied: ++** ( ++** facgt p0\.h, p1/z, z1\.h, z0\.h ++** | ++** faclt p0\.h, p1/z, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (aclt_f16_untied, svfloat16_t, ++ p0 = svaclt_f16 (p1, z0, z1), ++ p0 = svaclt (p1, z0, z1)) ++ ++/* ++** aclt_h4_f16: ++** mov (z[0-9]+\.h), h4 ++** ( ++** facgt p0\.h, p1/z, \1, z0\.h ++** | ++** faclt p0\.h, p1/z, z0\.h, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_ZD (aclt_h4_f16, svfloat16_t, float16_t, ++ p0 = svaclt_n_f16 (p1, z0, d4), ++ p0 = svaclt (p1, z0, d4)) ++ ++/* ++** aclt_0_f16: ++** mov (z[0-9]+\.h), #0 ++** ( ++** facgt p0\.h, p1/z, \1, z0\.h ++** | ++** faclt p0\.h, p1/z, z0\.h, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (aclt_0_f16, svfloat16_t, ++ p0 = svaclt_n_f16 (p1, z0, 0), ++ p0 = svaclt (p1, z0, 0)) ++ ++/* ++** aclt_1_f16: ++** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? ++** ( ++** facgt p0\.h, p1/z, \1, z0\.h ++** | ++** faclt p0\.h, p1/z, z0\.h, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (aclt_1_f16, svfloat16_t, ++ p0 = svaclt_n_f16 (p1, z0, 1), ++ p0 = svaclt (p1, z0, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/aclt_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/aclt_f32.c +new file mode 100644 +index 000000000..f40826445 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/aclt_f32.c +@@ -0,0 +1,71 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** aclt_f32_tied: ++** ( ++** facgt p0\.s, p0/z, z1\.s, z0\.s ++** | ++** faclt p0\.s, p0/z, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (aclt_f32_tied, svfloat32_t, ++ p0 = svaclt_f32 (p0, z0, z1), ++ p0 = svaclt (p0, z0, z1)) ++ ++/* ++** aclt_f32_untied: ++** ( ++** facgt p0\.s, p1/z, z1\.s, z0\.s ++** | ++** faclt p0\.s, p1/z, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (aclt_f32_untied, svfloat32_t, ++ p0 = svaclt_f32 (p1, z0, z1), ++ p0 = svaclt (p1, z0, z1)) ++ ++/* ++** aclt_s4_f32: ++** mov (z[0-9]+\.s), s4 ++** ( ++** facgt p0\.s, p1/z, \1, z0\.s ++** | ++** faclt p0\.s, p1/z, z0\.s, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_ZD (aclt_s4_f32, svfloat32_t, float32_t, ++ p0 = svaclt_n_f32 (p1, z0, d4), ++ p0 = svaclt (p1, z0, d4)) ++ ++/* ++** aclt_0_f32: ++** mov (z[0-9]+\.s), #0 ++** ( ++** facgt p0\.s, p1/z, \1, z0\.s ++** | ++** faclt p0\.s, p1/z, z0\.s, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (aclt_0_f32, svfloat32_t, ++ p0 = svaclt_n_f32 (p1, z0, 0), ++ p0 = svaclt (p1, z0, 0)) ++ ++/* ++** aclt_1_f32: ++** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? 
++** ( ++** facgt p0\.s, p1/z, \1, z0\.s ++** | ++** faclt p0\.s, p1/z, z0\.s, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (aclt_1_f32, svfloat32_t, ++ p0 = svaclt_n_f32 (p1, z0, 1), ++ p0 = svaclt (p1, z0, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/aclt_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/aclt_f64.c +new file mode 100644 +index 000000000..0170b3307 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/aclt_f64.c +@@ -0,0 +1,71 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** aclt_f64_tied: ++** ( ++** facgt p0\.d, p0/z, z1\.d, z0\.d ++** | ++** faclt p0\.d, p0/z, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (aclt_f64_tied, svfloat64_t, ++ p0 = svaclt_f64 (p0, z0, z1), ++ p0 = svaclt (p0, z0, z1)) ++ ++/* ++** aclt_f64_untied: ++** ( ++** facgt p0\.d, p1/z, z1\.d, z0\.d ++** | ++** faclt p0\.d, p1/z, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (aclt_f64_untied, svfloat64_t, ++ p0 = svaclt_f64 (p1, z0, z1), ++ p0 = svaclt (p1, z0, z1)) ++ ++/* ++** aclt_d4_f64: ++** mov (z[0-9]+\.d), d4 ++** ( ++** facgt p0\.d, p1/z, \1, z0\.d ++** | ++** faclt p0\.d, p1/z, z0\.d, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_ZD (aclt_d4_f64, svfloat64_t, float64_t, ++ p0 = svaclt_n_f64 (p1, z0, d4), ++ p0 = svaclt (p1, z0, d4)) ++ ++/* ++** aclt_0_f64: ++** mov (z[0-9]+\.d), #0 ++** ( ++** facgt p0\.d, p1/z, \1, z0\.d ++** | ++** faclt p0\.d, p1/z, z0\.d, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (aclt_0_f64, svfloat64_t, ++ p0 = svaclt_n_f64 (p1, z0, 0), ++ p0 = svaclt (p1, z0, 0)) ++ ++/* ++** aclt_1_f64: ++** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? ++** ( ++** facgt p0\.d, p1/z, \1, z0\.d ++** | ++** faclt p0\.d, p1/z, z0\.d, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (aclt_1_f64, svfloat64_t, ++ p0 = svaclt_n_f64 (p1, z0, 1), ++ p0 = svaclt (p1, z0, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_f16.c +new file mode 100644 +index 000000000..7228e5dd5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_f16.c +@@ -0,0 +1,577 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** add_f16_m_tied1: ++** fadd z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (add_f16_m_tied1, svfloat16_t, ++ z0 = svadd_f16_m (p0, z0, z1), ++ z0 = svadd_m (p0, z0, z1)) ++ ++/* ++** add_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fadd z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (add_f16_m_tied2, svfloat16_t, ++ z0 = svadd_f16_m (p0, z1, z0), ++ z0 = svadd_m (p0, z1, z0)) ++ ++/* ++** add_f16_m_untied: ++** movprfx z0, z1 ++** fadd z0\.h, p0/m, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (add_f16_m_untied, svfloat16_t, ++ z0 = svadd_f16_m (p0, z1, z2), ++ z0 = svadd_m (p0, z1, z2)) ++ ++/* ++** add_h4_f16_m_tied1: ++** mov (z[0-9]+\.h), h4 ++** fadd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (add_h4_f16_m_tied1, svfloat16_t, __fp16, ++ z0 = svadd_n_f16_m (p0, z0, d4), ++ z0 = svadd_m (p0, z0, d4)) ++ ++/* ++** add_h4_f16_m_untied: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0, z1 ++** fadd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (add_h4_f16_m_untied, svfloat16_t, __fp16, ++ z0 = svadd_n_f16_m (p0, z1, d4), ++ z0 = svadd_m (p0, z1, d4)) ++ ++/* ++** add_1_f16_m_tied1: ++** fadd z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_f16_m_tied1, svfloat16_t, ++ z0 = svadd_n_f16_m (p0, 
z0, 1), ++ z0 = svadd_m (p0, z0, 1)) ++ ++/* ++** add_1_f16_m_untied: ++** movprfx z0, z1 ++** fadd z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_f16_m_untied, svfloat16_t, ++ z0 = svadd_n_f16_m (p0, z1, 1), ++ z0 = svadd_m (p0, z1, 1)) ++ ++/* ++** add_0p5_f16_m_tied1: ++** fadd z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_0p5_f16_m_tied1, svfloat16_t, ++ z0 = svadd_n_f16_m (p0, z0, 0.5), ++ z0 = svadd_m (p0, z0, 0.5)) ++ ++/* ++** add_0p5_f16_m_untied: ++** movprfx z0, z1 ++** fadd z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_0p5_f16_m_untied, svfloat16_t, ++ z0 = svadd_n_f16_m (p0, z1, 0.5), ++ z0 = svadd_m (p0, z1, 0.5)) ++ ++/* ++** add_m1_f16_m_tied1: ++** fsub z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m1_f16_m_tied1, svfloat16_t, ++ z0 = svadd_n_f16_m (p0, z0, -1), ++ z0 = svadd_m (p0, z0, -1)) ++ ++/* ++** add_m1_f16_m_untied: ++** movprfx z0, z1 ++** fsub z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m1_f16_m_untied, svfloat16_t, ++ z0 = svadd_n_f16_m (p0, z1, -1), ++ z0 = svadd_m (p0, z1, -1)) ++ ++/* ++** add_m0p5_f16_m_tied1: ++** fsub z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m0p5_f16_m_tied1, svfloat16_t, ++ z0 = svadd_n_f16_m (p0, z0, -0.5), ++ z0 = svadd_m (p0, z0, -0.5)) ++ ++/* ++** add_m0p5_f16_m_untied: ++** movprfx z0, z1 ++** fsub z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m0p5_f16_m_untied, svfloat16_t, ++ z0 = svadd_n_f16_m (p0, z1, -0.5), ++ z0 = svadd_m (p0, z1, -0.5)) ++ ++/* ++** add_m2_f16_m: ++** fmov (z[0-9]+\.h), #-2\.0(?:e\+0)? ++** fadd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m2_f16_m, svfloat16_t, ++ z0 = svadd_n_f16_m (p0, z0, -2), ++ z0 = svadd_m (p0, z0, -2)) ++ ++/* ++** add_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fadd z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (add_f16_z_tied1, svfloat16_t, ++ z0 = svadd_f16_z (p0, z0, z1), ++ z0 = svadd_z (p0, z0, z1)) ++ ++/* ++** add_f16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** fadd z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (add_f16_z_tied2, svfloat16_t, ++ z0 = svadd_f16_z (p0, z1, z0), ++ z0 = svadd_z (p0, z1, z0)) ++ ++/* ++** add_f16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fadd z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** fadd z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (add_f16_z_untied, svfloat16_t, ++ z0 = svadd_f16_z (p0, z1, z2), ++ z0 = svadd_z (p0, z1, z2)) ++ ++/* ++** add_h4_f16_z_tied1: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0\.h, p0/z, z0\.h ++** fadd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (add_h4_f16_z_tied1, svfloat16_t, __fp16, ++ z0 = svadd_n_f16_z (p0, z0, d4), ++ z0 = svadd_z (p0, z0, d4)) ++ ++/* ++** add_h4_f16_z_untied: ++** mov (z[0-9]+\.h), h4 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fadd z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** fadd z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (add_h4_f16_z_untied, svfloat16_t, __fp16, ++ z0 = svadd_n_f16_z (p0, z1, d4), ++ z0 = svadd_z (p0, z1, d4)) ++ ++/* ++** add_1_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fadd z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_f16_z_tied1, svfloat16_t, ++ z0 = svadd_n_f16_z (p0, z0, 1), ++ z0 = svadd_z (p0, z0, 1)) ++ ++/* ++** add_1_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** fadd z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_f16_z_untied, svfloat16_t, ++ z0 = 
svadd_n_f16_z (p0, z1, 1), ++ z0 = svadd_z (p0, z1, 1)) ++ ++/* ++** add_0p5_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fadd z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_0p5_f16_z_tied1, svfloat16_t, ++ z0 = svadd_n_f16_z (p0, z0, 0.5), ++ z0 = svadd_z (p0, z0, 0.5)) ++ ++/* ++** add_0p5_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** fadd z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_0p5_f16_z_untied, svfloat16_t, ++ z0 = svadd_n_f16_z (p0, z1, 0.5), ++ z0 = svadd_z (p0, z1, 0.5)) ++ ++/* ++** add_m1_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fsub z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m1_f16_z_tied1, svfloat16_t, ++ z0 = svadd_n_f16_z (p0, z0, -1), ++ z0 = svadd_z (p0, z0, -1)) ++ ++/* ++** add_m1_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** fsub z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m1_f16_z_untied, svfloat16_t, ++ z0 = svadd_n_f16_z (p0, z1, -1), ++ z0 = svadd_z (p0, z1, -1)) ++ ++/* ++** add_m0p5_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fsub z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m0p5_f16_z_tied1, svfloat16_t, ++ z0 = svadd_n_f16_z (p0, z0, -0.5), ++ z0 = svadd_z (p0, z0, -0.5)) ++ ++/* ++** add_m0p5_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** fsub z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m0p5_f16_z_untied, svfloat16_t, ++ z0 = svadd_n_f16_z (p0, z1, -0.5), ++ z0 = svadd_z (p0, z1, -0.5)) ++ ++/* ++** add_m2_f16_z: ++** fmov (z[0-9]+\.h), #-2\.0(?:e\+0)? ++** movprfx z0\.h, p0/z, z0\.h ++** fadd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m2_f16_z, svfloat16_t, ++ z0 = svadd_n_f16_z (p0, z0, -2), ++ z0 = svadd_z (p0, z0, -2)) ++ ++/* ++** add_f16_x_tied1: ++** fadd z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (add_f16_x_tied1, svfloat16_t, ++ z0 = svadd_f16_x (p0, z0, z1), ++ z0 = svadd_x (p0, z0, z1)) ++ ++/* ++** add_f16_x_tied2: ++** fadd z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (add_f16_x_tied2, svfloat16_t, ++ z0 = svadd_f16_x (p0, z1, z0), ++ z0 = svadd_x (p0, z1, z0)) ++ ++/* ++** add_f16_x_untied: ++** ( ++** movprfx z0, z1 ++** fadd z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0, z2 ++** fadd z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (add_f16_x_untied, svfloat16_t, ++ z0 = svadd_f16_x (p0, z1, z2), ++ z0 = svadd_x (p0, z1, z2)) ++ ++/* ++** add_h4_f16_x_tied1: ++** mov (z[0-9]+\.h), h4 ++** fadd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (add_h4_f16_x_tied1, svfloat16_t, __fp16, ++ z0 = svadd_n_f16_x (p0, z0, d4), ++ z0 = svadd_x (p0, z0, d4)) ++ ++/* ++** add_h4_f16_x_untied: ++** mov z0\.h, h4 ++** fadd z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZD (add_h4_f16_x_untied, svfloat16_t, __fp16, ++ z0 = svadd_n_f16_x (p0, z1, d4), ++ z0 = svadd_x (p0, z1, d4)) ++ ++/* ++** add_1_f16_x_tied1: ++** fadd z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_f16_x_tied1, svfloat16_t, ++ z0 = svadd_n_f16_x (p0, z0, 1), ++ z0 = svadd_x (p0, z0, 1)) ++ ++/* ++** add_1_f16_x_untied: ++** movprfx z0, z1 ++** fadd z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_f16_x_untied, svfloat16_t, ++ z0 = svadd_n_f16_x (p0, z1, 1), ++ z0 = svadd_x (p0, z1, 1)) ++ ++/* ++** add_0p5_f16_x_tied1: ++** fadd z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_0p5_f16_x_tied1, svfloat16_t, ++ z0 = svadd_n_f16_x (p0, z0, 0.5), ++ z0 = svadd_x (p0, z0, 0.5)) ++ ++/* ++** add_0p5_f16_x_untied: ++** movprfx z0, z1 ++** fadd 
z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_0p5_f16_x_untied, svfloat16_t, ++ z0 = svadd_n_f16_x (p0, z1, 0.5), ++ z0 = svadd_x (p0, z1, 0.5)) ++ ++/* ++** add_m1_f16_x_tied1: ++** fsub z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m1_f16_x_tied1, svfloat16_t, ++ z0 = svadd_n_f16_x (p0, z0, -1), ++ z0 = svadd_x (p0, z0, -1)) ++ ++/* ++** add_m1_f16_x_untied: ++** movprfx z0, z1 ++** fsub z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m1_f16_x_untied, svfloat16_t, ++ z0 = svadd_n_f16_x (p0, z1, -1), ++ z0 = svadd_x (p0, z1, -1)) ++ ++/* ++** add_m0p5_f16_x_tied1: ++** fsub z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m0p5_f16_x_tied1, svfloat16_t, ++ z0 = svadd_n_f16_x (p0, z0, -0.5), ++ z0 = svadd_x (p0, z0, -0.5)) ++ ++/* ++** add_m0p5_f16_x_untied: ++** movprfx z0, z1 ++** fsub z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m0p5_f16_x_untied, svfloat16_t, ++ z0 = svadd_n_f16_x (p0, z1, -0.5), ++ z0 = svadd_x (p0, z1, -0.5)) ++ ++/* ++** add_2_f16_x_tied1: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** fadd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_2_f16_x_tied1, svfloat16_t, ++ z0 = svadd_n_f16_x (p0, z0, 2), ++ z0 = svadd_x (p0, z0, 2)) ++ ++/* ++** add_2_f16_x_untied: ++** fmov z0\.h, #2\.0(?:e\+0)? ++** fadd z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (add_2_f16_x_untied, svfloat16_t, ++ z0 = svadd_n_f16_x (p0, z1, 2), ++ z0 = svadd_x (p0, z1, 2)) ++ ++/* ++** ptrue_add_f16_x_tied1: ++** fadd z0\.h, (z0\.h, z1\.h|z1\.h, z0\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_f16_x_tied1, svfloat16_t, ++ z0 = svadd_f16_x (svptrue_b16 (), z0, z1), ++ z0 = svadd_x (svptrue_b16 (), z0, z1)) ++ ++/* ++** ptrue_add_f16_x_tied2: ++** fadd z0\.h, (z0\.h, z1\.h|z1\.h, z0\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_f16_x_tied2, svfloat16_t, ++ z0 = svadd_f16_x (svptrue_b16 (), z1, z0), ++ z0 = svadd_x (svptrue_b16 (), z1, z0)) ++ ++/* ++** ptrue_add_f16_x_untied: ++** fadd z0\.h, (z1\.h, z2\.h|z2\.h, z1\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_f16_x_untied, svfloat16_t, ++ z0 = svadd_f16_x (svptrue_b16 (), z1, z2), ++ z0 = svadd_x (svptrue_b16 (), z1, z2)) ++ ++/* ++** ptrue_add_1_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_1_f16_x_tied1, svfloat16_t, ++ z0 = svadd_n_f16_x (svptrue_b16 (), z0, 1), ++ z0 = svadd_x (svptrue_b16 (), z0, 1)) ++ ++/* ++** ptrue_add_1_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_1_f16_x_untied, svfloat16_t, ++ z0 = svadd_n_f16_x (svptrue_b16 (), z1, 1), ++ z0 = svadd_x (svptrue_b16 (), z1, 1)) ++ ++/* ++** ptrue_add_0p5_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_0p5_f16_x_tied1, svfloat16_t, ++ z0 = svadd_n_f16_x (svptrue_b16 (), z0, 0.5), ++ z0 = svadd_x (svptrue_b16 (), z0, 0.5)) ++ ++/* ++** ptrue_add_0p5_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_0p5_f16_x_untied, svfloat16_t, ++ z0 = svadd_n_f16_x (svptrue_b16 (), z1, 0.5), ++ z0 = svadd_x (svptrue_b16 (), z1, 0.5)) ++ ++/* ++** ptrue_add_m1_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_m1_f16_x_tied1, svfloat16_t, ++ z0 = svadd_n_f16_x (svptrue_b16 (), z0, -1), ++ z0 = svadd_x (svptrue_b16 (), z0, -1)) ++ ++/* ++** ptrue_add_m1_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_m1_f16_x_untied, svfloat16_t, ++ z0 = svadd_n_f16_x (svptrue_b16 (), z1, -1), ++ z0 = svadd_x (svptrue_b16 (), z1, -1)) ++ ++/* ++** ptrue_add_m0p5_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_m0p5_f16_x_tied1, svfloat16_t, ++ z0 = svadd_n_f16_x (svptrue_b16 (), z0, -0.5), ++ z0 = svadd_x (svptrue_b16 (), z0, -0.5)) ++ ++/* ++** ptrue_add_m0p5_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_m0p5_f16_x_untied, svfloat16_t, ++ z0 = svadd_n_f16_x (svptrue_b16 (), z1, -0.5), ++ z0 = svadd_x (svptrue_b16 (), z1, -0.5)) ++ ++/* ++** ptrue_add_2_f16_x_tied1: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** fadd z0\.h, (z0\.h, \1|\1, z0\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_2_f16_x_tied1, svfloat16_t, ++ z0 = svadd_n_f16_x (svptrue_b16 (), z0, 2), ++ z0 = svadd_x (svptrue_b16 (), z0, 2)) ++ ++/* ++** ptrue_add_2_f16_x_untied: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** fadd z0\.h, (z1\.h, \1|\1, z1\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_2_f16_x_untied, svfloat16_t, ++ z0 = svadd_n_f16_x (svptrue_b16 (), z1, 2), ++ z0 = svadd_x (svptrue_b16 (), z1, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_f16_notrap.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_f16_notrap.c +new file mode 100644 +index 000000000..f6330acee +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_f16_notrap.c +@@ -0,0 +1,572 @@ ++/* { dg-additional-options "-fno-trapping-math" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** add_f16_m_tied1: ++** fadd z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (add_f16_m_tied1, svfloat16_t, ++ z0 = svadd_f16_m (p0, z0, z1), ++ z0 = svadd_m (p0, z0, z1)) ++ ++/* ++** add_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fadd z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (add_f16_m_tied2, svfloat16_t, ++ z0 = svadd_f16_m (p0, z1, z0), ++ z0 = svadd_m (p0, z1, z0)) ++ ++/* ++** add_f16_m_untied: ++** movprfx z0, z1 ++** fadd z0\.h, p0/m, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (add_f16_m_untied, svfloat16_t, ++ z0 = svadd_f16_m (p0, z1, z2), ++ z0 = svadd_m (p0, z1, z2)) ++ ++/* ++** add_h4_f16_m_tied1: ++** mov (z[0-9]+\.h), h4 ++** fadd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (add_h4_f16_m_tied1, svfloat16_t, __fp16, ++ z0 = svadd_n_f16_m (p0, z0, d4), ++ z0 = svadd_m (p0, z0, d4)) ++ ++/* ++** add_h4_f16_m_untied: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0, z1 ++** fadd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (add_h4_f16_m_untied, svfloat16_t, __fp16, ++ z0 = svadd_n_f16_m (p0, z1, d4), ++ z0 = svadd_m (p0, z1, d4)) ++ ++/* ++** add_1_f16_m_tied1: ++** fadd z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_f16_m_tied1, svfloat16_t, ++ z0 = svadd_n_f16_m (p0, z0, 1), ++ z0 = svadd_m (p0, z0, 1)) ++ ++/* ++** add_1_f16_m_untied: ++** movprfx z0, z1 ++** fadd z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_f16_m_untied, svfloat16_t, ++ z0 = svadd_n_f16_m (p0, z1, 1), ++ z0 = svadd_m (p0, z1, 1)) ++ ++/* ++** add_0p5_f16_m_tied1: ++** fadd z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_0p5_f16_m_tied1, svfloat16_t, ++ z0 = svadd_n_f16_m (p0, z0, 0.5), ++ z0 = svadd_m (p0, z0, 0.5)) ++ ++/* ++** add_0p5_f16_m_untied: ++** movprfx z0, z1 ++** fadd z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z 
(add_0p5_f16_m_untied, svfloat16_t, ++ z0 = svadd_n_f16_m (p0, z1, 0.5), ++ z0 = svadd_m (p0, z1, 0.5)) ++ ++/* ++** add_m1_f16_m_tied1: ++** fsub z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m1_f16_m_tied1, svfloat16_t, ++ z0 = svadd_n_f16_m (p0, z0, -1), ++ z0 = svadd_m (p0, z0, -1)) ++ ++/* ++** add_m1_f16_m_untied: ++** movprfx z0, z1 ++** fsub z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m1_f16_m_untied, svfloat16_t, ++ z0 = svadd_n_f16_m (p0, z1, -1), ++ z0 = svadd_m (p0, z1, -1)) ++ ++/* ++** add_m0p5_f16_m_tied1: ++** fsub z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m0p5_f16_m_tied1, svfloat16_t, ++ z0 = svadd_n_f16_m (p0, z0, -0.5), ++ z0 = svadd_m (p0, z0, -0.5)) ++ ++/* ++** add_m0p5_f16_m_untied: ++** movprfx z0, z1 ++** fsub z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m0p5_f16_m_untied, svfloat16_t, ++ z0 = svadd_n_f16_m (p0, z1, -0.5), ++ z0 = svadd_m (p0, z1, -0.5)) ++ ++/* ++** add_m2_f16_m: ++** fmov (z[0-9]+\.h), #-2\.0(?:e\+0)? ++** fadd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m2_f16_m, svfloat16_t, ++ z0 = svadd_n_f16_m (p0, z0, -2), ++ z0 = svadd_m (p0, z0, -2)) ++ ++/* ++** add_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fadd z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (add_f16_z_tied1, svfloat16_t, ++ z0 = svadd_f16_z (p0, z0, z1), ++ z0 = svadd_z (p0, z0, z1)) ++ ++/* ++** add_f16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** fadd z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (add_f16_z_tied2, svfloat16_t, ++ z0 = svadd_f16_z (p0, z1, z0), ++ z0 = svadd_z (p0, z1, z0)) ++ ++/* ++** add_f16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fadd z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** fadd z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (add_f16_z_untied, svfloat16_t, ++ z0 = svadd_f16_z (p0, z1, z2), ++ z0 = svadd_z (p0, z1, z2)) ++ ++/* ++** add_h4_f16_z_tied1: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0\.h, p0/z, z0\.h ++** fadd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (add_h4_f16_z_tied1, svfloat16_t, __fp16, ++ z0 = svadd_n_f16_z (p0, z0, d4), ++ z0 = svadd_z (p0, z0, d4)) ++ ++/* ++** add_h4_f16_z_untied: ++** mov (z[0-9]+\.h), h4 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fadd z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** fadd z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (add_h4_f16_z_untied, svfloat16_t, __fp16, ++ z0 = svadd_n_f16_z (p0, z1, d4), ++ z0 = svadd_z (p0, z1, d4)) ++ ++/* ++** add_1_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fadd z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_f16_z_tied1, svfloat16_t, ++ z0 = svadd_n_f16_z (p0, z0, 1), ++ z0 = svadd_z (p0, z0, 1)) ++ ++/* ++** add_1_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** fadd z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_f16_z_untied, svfloat16_t, ++ z0 = svadd_n_f16_z (p0, z1, 1), ++ z0 = svadd_z (p0, z1, 1)) ++ ++/* ++** add_0p5_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fadd z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_0p5_f16_z_tied1, svfloat16_t, ++ z0 = svadd_n_f16_z (p0, z0, 0.5), ++ z0 = svadd_z (p0, z0, 0.5)) ++ ++/* ++** add_0p5_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** fadd z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_0p5_f16_z_untied, svfloat16_t, ++ z0 = svadd_n_f16_z (p0, z1, 0.5), ++ z0 = svadd_z (p0, z1, 0.5)) ++ ++/* ++** add_m1_f16_z_tied1: ++** movprfx z0\.h, 
p0/z, z0\.h ++** fsub z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m1_f16_z_tied1, svfloat16_t, ++ z0 = svadd_n_f16_z (p0, z0, -1), ++ z0 = svadd_z (p0, z0, -1)) ++ ++/* ++** add_m1_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** fsub z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m1_f16_z_untied, svfloat16_t, ++ z0 = svadd_n_f16_z (p0, z1, -1), ++ z0 = svadd_z (p0, z1, -1)) ++ ++/* ++** add_m0p5_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fsub z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m0p5_f16_z_tied1, svfloat16_t, ++ z0 = svadd_n_f16_z (p0, z0, -0.5), ++ z0 = svadd_z (p0, z0, -0.5)) ++ ++/* ++** add_m0p5_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** fsub z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m0p5_f16_z_untied, svfloat16_t, ++ z0 = svadd_n_f16_z (p0, z1, -0.5), ++ z0 = svadd_z (p0, z1, -0.5)) ++ ++/* ++** add_m2_f16_z: ++** fmov (z[0-9]+\.h), #-2\.0(?:e\+0)? ++** movprfx z0\.h, p0/z, z0\.h ++** fadd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m2_f16_z, svfloat16_t, ++ z0 = svadd_n_f16_z (p0, z0, -2), ++ z0 = svadd_z (p0, z0, -2)) ++ ++/* ++** add_f16_x_tied1: ++** fadd z0\.h, (z0\.h, z1\.h|z1\.h, z0\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (add_f16_x_tied1, svfloat16_t, ++ z0 = svadd_f16_x (p0, z0, z1), ++ z0 = svadd_x (p0, z0, z1)) ++ ++/* ++** add_f16_x_tied2: ++** fadd z0\.h, (z0\.h, z1\.h|z1\.h, z0\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (add_f16_x_tied2, svfloat16_t, ++ z0 = svadd_f16_x (p0, z1, z0), ++ z0 = svadd_x (p0, z1, z0)) ++ ++/* ++** add_f16_x_untied: ++** fadd z0\.h, (z1\.h, z2\.h|z2\.h, z1\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (add_f16_x_untied, svfloat16_t, ++ z0 = svadd_f16_x (p0, z1, z2), ++ z0 = svadd_x (p0, z1, z2)) ++ ++/* ++** add_h4_f16_x_tied1: ++** mov (z[0-9]+\.h), h4 ++** fadd z0\.h, (z0\.h, \1|\1, z0\.h) ++** ret ++*/ ++TEST_UNIFORM_ZD (add_h4_f16_x_tied1, svfloat16_t, __fp16, ++ z0 = svadd_n_f16_x (p0, z0, d4), ++ z0 = svadd_x (p0, z0, d4)) ++ ++/* ++** add_h4_f16_x_untied: ++** mov (z[0-9]+\.h), h4 ++** fadd z0\.h, (z1\.h, \1|\1, z1\.h) ++** ret ++*/ ++TEST_UNIFORM_ZD (add_h4_f16_x_untied, svfloat16_t, __fp16, ++ z0 = svadd_n_f16_x (p0, z1, d4), ++ z0 = svadd_x (p0, z1, d4)) ++ ++/* ++** add_1_f16_x_tied1: ++** fadd z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_f16_x_tied1, svfloat16_t, ++ z0 = svadd_n_f16_x (p0, z0, 1), ++ z0 = svadd_x (p0, z0, 1)) ++ ++/* ++** add_1_f16_x_untied: ++** movprfx z0, z1 ++** fadd z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_f16_x_untied, svfloat16_t, ++ z0 = svadd_n_f16_x (p0, z1, 1), ++ z0 = svadd_x (p0, z1, 1)) ++ ++/* ++** add_0p5_f16_x_tied1: ++** fadd z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_0p5_f16_x_tied1, svfloat16_t, ++ z0 = svadd_n_f16_x (p0, z0, 0.5), ++ z0 = svadd_x (p0, z0, 0.5)) ++ ++/* ++** add_0p5_f16_x_untied: ++** movprfx z0, z1 ++** fadd z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_0p5_f16_x_untied, svfloat16_t, ++ z0 = svadd_n_f16_x (p0, z1, 0.5), ++ z0 = svadd_x (p0, z1, 0.5)) ++ ++/* ++** add_m1_f16_x_tied1: ++** fsub z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m1_f16_x_tied1, svfloat16_t, ++ z0 = svadd_n_f16_x (p0, z0, -1), ++ z0 = svadd_x (p0, z0, -1)) ++ ++/* ++** add_m1_f16_x_untied: ++** movprfx z0, z1 ++** fsub z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m1_f16_x_untied, svfloat16_t, ++ z0 = svadd_n_f16_x (p0, z1, -1), ++ z0 = svadd_x (p0, z1, -1)) ++ ++/* ++** add_m0p5_f16_x_tied1: ++** fsub z0\.h, 
p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m0p5_f16_x_tied1, svfloat16_t, ++ z0 = svadd_n_f16_x (p0, z0, -0.5), ++ z0 = svadd_x (p0, z0, -0.5)) ++ ++/* ++** add_m0p5_f16_x_untied: ++** movprfx z0, z1 ++** fsub z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m0p5_f16_x_untied, svfloat16_t, ++ z0 = svadd_n_f16_x (p0, z1, -0.5), ++ z0 = svadd_x (p0, z1, -0.5)) ++ ++/* ++** add_2_f16_x_tied1: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** fadd z0\.h, (z0\.h, \1|\1, z0\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (add_2_f16_x_tied1, svfloat16_t, ++ z0 = svadd_n_f16_x (p0, z0, 2), ++ z0 = svadd_x (p0, z0, 2)) ++ ++/* ++** add_2_f16_x_untied: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** fadd z0\.h, (z1\.h, \1|\1, z1\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (add_2_f16_x_untied, svfloat16_t, ++ z0 = svadd_n_f16_x (p0, z1, 2), ++ z0 = svadd_x (p0, z1, 2)) ++ ++/* ++** ptrue_add_f16_x_tied1: ++** fadd z0\.h, (z0\.h, z1\.h|z1\.h, z0\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_f16_x_tied1, svfloat16_t, ++ z0 = svadd_f16_x (svptrue_b16 (), z0, z1), ++ z0 = svadd_x (svptrue_b16 (), z0, z1)) ++ ++/* ++** ptrue_add_f16_x_tied2: ++** fadd z0\.h, (z0\.h, z1\.h|z1\.h, z0\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_f16_x_tied2, svfloat16_t, ++ z0 = svadd_f16_x (svptrue_b16 (), z1, z0), ++ z0 = svadd_x (svptrue_b16 (), z1, z0)) ++ ++/* ++** ptrue_add_f16_x_untied: ++** fadd z0\.h, (z1\.h, z2\.h|z2\.h, z1\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_f16_x_untied, svfloat16_t, ++ z0 = svadd_f16_x (svptrue_b16 (), z1, z2), ++ z0 = svadd_x (svptrue_b16 (), z1, z2)) ++ ++/* ++** ptrue_add_1_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_1_f16_x_tied1, svfloat16_t, ++ z0 = svadd_n_f16_x (svptrue_b16 (), z0, 1), ++ z0 = svadd_x (svptrue_b16 (), z0, 1)) ++ ++/* ++** ptrue_add_1_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_1_f16_x_untied, svfloat16_t, ++ z0 = svadd_n_f16_x (svptrue_b16 (), z1, 1), ++ z0 = svadd_x (svptrue_b16 (), z1, 1)) ++ ++/* ++** ptrue_add_0p5_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_0p5_f16_x_tied1, svfloat16_t, ++ z0 = svadd_n_f16_x (svptrue_b16 (), z0, 0.5), ++ z0 = svadd_x (svptrue_b16 (), z0, 0.5)) ++ ++/* ++** ptrue_add_0p5_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_0p5_f16_x_untied, svfloat16_t, ++ z0 = svadd_n_f16_x (svptrue_b16 (), z1, 0.5), ++ z0 = svadd_x (svptrue_b16 (), z1, 0.5)) ++ ++/* ++** ptrue_add_m1_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_m1_f16_x_tied1, svfloat16_t, ++ z0 = svadd_n_f16_x (svptrue_b16 (), z0, -1), ++ z0 = svadd_x (svptrue_b16 (), z0, -1)) ++ ++/* ++** ptrue_add_m1_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_m1_f16_x_untied, svfloat16_t, ++ z0 = svadd_n_f16_x (svptrue_b16 (), z1, -1), ++ z0 = svadd_x (svptrue_b16 (), z1, -1)) ++ ++/* ++** ptrue_add_m0p5_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_m0p5_f16_x_tied1, svfloat16_t, ++ z0 = svadd_n_f16_x (svptrue_b16 (), z0, -0.5), ++ z0 = svadd_x (svptrue_b16 (), z0, -0.5)) ++ ++/* ++** ptrue_add_m0p5_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_m0p5_f16_x_untied, svfloat16_t, ++ z0 = svadd_n_f16_x (svptrue_b16 (), z1, -0.5), ++ z0 = svadd_x (svptrue_b16 (), z1, -0.5)) ++ ++/* ++** ptrue_add_2_f16_x_tied1: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** fadd z0\.h, (z0\.h, \1|\1, z0\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_2_f16_x_tied1, svfloat16_t, ++ z0 = svadd_n_f16_x (svptrue_b16 (), z0, 2), ++ z0 = svadd_x (svptrue_b16 (), z0, 2)) ++ ++/* ++** ptrue_add_2_f16_x_untied: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** fadd z0\.h, (z1\.h, \1|\1, z1\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_2_f16_x_untied, svfloat16_t, ++ z0 = svadd_n_f16_x (svptrue_b16 (), z1, 2), ++ z0 = svadd_x (svptrue_b16 (), z1, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_f32.c +new file mode 100644 +index 000000000..b5f4e9623 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_f32.c +@@ -0,0 +1,577 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** add_f32_m_tied1: ++** fadd z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (add_f32_m_tied1, svfloat32_t, ++ z0 = svadd_f32_m (p0, z0, z1), ++ z0 = svadd_m (p0, z0, z1)) ++ ++/* ++** add_f32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fadd z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (add_f32_m_tied2, svfloat32_t, ++ z0 = svadd_f32_m (p0, z1, z0), ++ z0 = svadd_m (p0, z1, z0)) ++ ++/* ++** add_f32_m_untied: ++** movprfx z0, z1 ++** fadd z0\.s, p0/m, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (add_f32_m_untied, svfloat32_t, ++ z0 = svadd_f32_m (p0, z1, z2), ++ z0 = svadd_m (p0, z1, z2)) ++ ++/* ++** add_s4_f32_m_tied1: ++** mov (z[0-9]+\.s), s4 ++** fadd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (add_s4_f32_m_tied1, svfloat32_t, float, ++ z0 = svadd_n_f32_m (p0, z0, d4), ++ z0 = svadd_m (p0, z0, d4)) ++ ++/* ++** add_s4_f32_m_untied: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0, z1 ++** fadd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (add_s4_f32_m_untied, svfloat32_t, float, ++ z0 = svadd_n_f32_m (p0, z1, d4), ++ z0 = svadd_m (p0, z1, d4)) ++ ++/* ++** add_1_f32_m_tied1: ++** fadd z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_f32_m_tied1, svfloat32_t, ++ z0 = svadd_n_f32_m (p0, z0, 1), ++ z0 = svadd_m (p0, z0, 1)) ++ ++/* ++** add_1_f32_m_untied: ++** movprfx z0, z1 ++** fadd z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_f32_m_untied, svfloat32_t, ++ z0 = svadd_n_f32_m (p0, z1, 1), ++ z0 = svadd_m (p0, z1, 1)) ++ ++/* ++** add_0p5_f32_m_tied1: ++** fadd z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_0p5_f32_m_tied1, svfloat32_t, ++ z0 = svadd_n_f32_m (p0, z0, 0.5), ++ z0 = svadd_m (p0, z0, 0.5)) ++ ++/* ++** add_0p5_f32_m_untied: ++** movprfx z0, z1 ++** fadd z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_0p5_f32_m_untied, svfloat32_t, ++ z0 = svadd_n_f32_m (p0, z1, 0.5), ++ z0 = svadd_m (p0, z1, 0.5)) ++ ++/* ++** add_m1_f32_m_tied1: ++** fsub z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m1_f32_m_tied1, svfloat32_t, ++ z0 = svadd_n_f32_m (p0, z0, -1), ++ z0 = svadd_m (p0, z0, -1)) ++ ++/* ++** add_m1_f32_m_untied: ++** movprfx z0, z1 ++** fsub z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m1_f32_m_untied, svfloat32_t, ++ z0 = svadd_n_f32_m (p0, z1, -1), ++ z0 = svadd_m (p0, z1, -1)) ++ ++/* ++** add_m0p5_f32_m_tied1: ++** fsub z0\.s, p0/m, 
z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m0p5_f32_m_tied1, svfloat32_t, ++ z0 = svadd_n_f32_m (p0, z0, -0.5), ++ z0 = svadd_m (p0, z0, -0.5)) ++ ++/* ++** add_m0p5_f32_m_untied: ++** movprfx z0, z1 ++** fsub z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m0p5_f32_m_untied, svfloat32_t, ++ z0 = svadd_n_f32_m (p0, z1, -0.5), ++ z0 = svadd_m (p0, z1, -0.5)) ++ ++/* ++** add_m2_f32_m: ++** fmov (z[0-9]+\.s), #-2\.0(?:e\+0)? ++** fadd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m2_f32_m, svfloat32_t, ++ z0 = svadd_n_f32_m (p0, z0, -2), ++ z0 = svadd_m (p0, z0, -2)) ++ ++/* ++** add_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fadd z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (add_f32_z_tied1, svfloat32_t, ++ z0 = svadd_f32_z (p0, z0, z1), ++ z0 = svadd_z (p0, z0, z1)) ++ ++/* ++** add_f32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** fadd z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (add_f32_z_tied2, svfloat32_t, ++ z0 = svadd_f32_z (p0, z1, z0), ++ z0 = svadd_z (p0, z1, z0)) ++ ++/* ++** add_f32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fadd z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** fadd z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (add_f32_z_untied, svfloat32_t, ++ z0 = svadd_f32_z (p0, z1, z2), ++ z0 = svadd_z (p0, z1, z2)) ++ ++/* ++** add_s4_f32_z_tied1: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0\.s, p0/z, z0\.s ++** fadd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (add_s4_f32_z_tied1, svfloat32_t, float, ++ z0 = svadd_n_f32_z (p0, z0, d4), ++ z0 = svadd_z (p0, z0, d4)) ++ ++/* ++** add_s4_f32_z_untied: ++** mov (z[0-9]+\.s), s4 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fadd z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** fadd z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (add_s4_f32_z_untied, svfloat32_t, float, ++ z0 = svadd_n_f32_z (p0, z1, d4), ++ z0 = svadd_z (p0, z1, d4)) ++ ++/* ++** add_1_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fadd z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_f32_z_tied1, svfloat32_t, ++ z0 = svadd_n_f32_z (p0, z0, 1), ++ z0 = svadd_z (p0, z0, 1)) ++ ++/* ++** add_1_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** fadd z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_f32_z_untied, svfloat32_t, ++ z0 = svadd_n_f32_z (p0, z1, 1), ++ z0 = svadd_z (p0, z1, 1)) ++ ++/* ++** add_0p5_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fadd z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_0p5_f32_z_tied1, svfloat32_t, ++ z0 = svadd_n_f32_z (p0, z0, 0.5), ++ z0 = svadd_z (p0, z0, 0.5)) ++ ++/* ++** add_0p5_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** fadd z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_0p5_f32_z_untied, svfloat32_t, ++ z0 = svadd_n_f32_z (p0, z1, 0.5), ++ z0 = svadd_z (p0, z1, 0.5)) ++ ++/* ++** add_m1_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fsub z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m1_f32_z_tied1, svfloat32_t, ++ z0 = svadd_n_f32_z (p0, z0, -1), ++ z0 = svadd_z (p0, z0, -1)) ++ ++/* ++** add_m1_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** fsub z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m1_f32_z_untied, svfloat32_t, ++ z0 = svadd_n_f32_z (p0, z1, -1), ++ z0 = svadd_z (p0, z1, -1)) ++ ++/* ++** add_m0p5_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fsub z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m0p5_f32_z_tied1, svfloat32_t, ++ 
z0 = svadd_n_f32_z (p0, z0, -0.5), ++ z0 = svadd_z (p0, z0, -0.5)) ++ ++/* ++** add_m0p5_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** fsub z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m0p5_f32_z_untied, svfloat32_t, ++ z0 = svadd_n_f32_z (p0, z1, -0.5), ++ z0 = svadd_z (p0, z1, -0.5)) ++ ++/* ++** add_m2_f32_z: ++** fmov (z[0-9]+\.s), #-2\.0(?:e\+0)? ++** movprfx z0\.s, p0/z, z0\.s ++** fadd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m2_f32_z, svfloat32_t, ++ z0 = svadd_n_f32_z (p0, z0, -2), ++ z0 = svadd_z (p0, z0, -2)) ++ ++/* ++** add_f32_x_tied1: ++** fadd z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (add_f32_x_tied1, svfloat32_t, ++ z0 = svadd_f32_x (p0, z0, z1), ++ z0 = svadd_x (p0, z0, z1)) ++ ++/* ++** add_f32_x_tied2: ++** fadd z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (add_f32_x_tied2, svfloat32_t, ++ z0 = svadd_f32_x (p0, z1, z0), ++ z0 = svadd_x (p0, z1, z0)) ++ ++/* ++** add_f32_x_untied: ++** ( ++** movprfx z0, z1 ++** fadd z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0, z2 ++** fadd z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (add_f32_x_untied, svfloat32_t, ++ z0 = svadd_f32_x (p0, z1, z2), ++ z0 = svadd_x (p0, z1, z2)) ++ ++/* ++** add_s4_f32_x_tied1: ++** mov (z[0-9]+\.s), s4 ++** fadd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (add_s4_f32_x_tied1, svfloat32_t, float, ++ z0 = svadd_n_f32_x (p0, z0, d4), ++ z0 = svadd_x (p0, z0, d4)) ++ ++/* ++** add_s4_f32_x_untied: ++** mov z0\.s, s4 ++** fadd z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZD (add_s4_f32_x_untied, svfloat32_t, float, ++ z0 = svadd_n_f32_x (p0, z1, d4), ++ z0 = svadd_x (p0, z1, d4)) ++ ++/* ++** add_1_f32_x_tied1: ++** fadd z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_f32_x_tied1, svfloat32_t, ++ z0 = svadd_n_f32_x (p0, z0, 1), ++ z0 = svadd_x (p0, z0, 1)) ++ ++/* ++** add_1_f32_x_untied: ++** movprfx z0, z1 ++** fadd z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_f32_x_untied, svfloat32_t, ++ z0 = svadd_n_f32_x (p0, z1, 1), ++ z0 = svadd_x (p0, z1, 1)) ++ ++/* ++** add_0p5_f32_x_tied1: ++** fadd z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_0p5_f32_x_tied1, svfloat32_t, ++ z0 = svadd_n_f32_x (p0, z0, 0.5), ++ z0 = svadd_x (p0, z0, 0.5)) ++ ++/* ++** add_0p5_f32_x_untied: ++** movprfx z0, z1 ++** fadd z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_0p5_f32_x_untied, svfloat32_t, ++ z0 = svadd_n_f32_x (p0, z1, 0.5), ++ z0 = svadd_x (p0, z1, 0.5)) ++ ++/* ++** add_m1_f32_x_tied1: ++** fsub z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m1_f32_x_tied1, svfloat32_t, ++ z0 = svadd_n_f32_x (p0, z0, -1), ++ z0 = svadd_x (p0, z0, -1)) ++ ++/* ++** add_m1_f32_x_untied: ++** movprfx z0, z1 ++** fsub z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m1_f32_x_untied, svfloat32_t, ++ z0 = svadd_n_f32_x (p0, z1, -1), ++ z0 = svadd_x (p0, z1, -1)) ++ ++/* ++** add_m0p5_f32_x_tied1: ++** fsub z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m0p5_f32_x_tied1, svfloat32_t, ++ z0 = svadd_n_f32_x (p0, z0, -0.5), ++ z0 = svadd_x (p0, z0, -0.5)) ++ ++/* ++** add_m0p5_f32_x_untied: ++** movprfx z0, z1 ++** fsub z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m0p5_f32_x_untied, svfloat32_t, ++ z0 = svadd_n_f32_x (p0, z1, -0.5), ++ z0 = svadd_x (p0, z1, -0.5)) ++ ++/* ++** add_2_f32_x_tied1: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
++** fadd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_2_f32_x_tied1, svfloat32_t, ++ z0 = svadd_n_f32_x (p0, z0, 2), ++ z0 = svadd_x (p0, z0, 2)) ++ ++/* ++** add_2_f32_x_untied: ++** fmov z0\.s, #2\.0(?:e\+0)? ++** fadd z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (add_2_f32_x_untied, svfloat32_t, ++ z0 = svadd_n_f32_x (p0, z1, 2), ++ z0 = svadd_x (p0, z1, 2)) ++ ++/* ++** ptrue_add_f32_x_tied1: ++** fadd z0\.s, (z0\.s, z1\.s|z1\.s, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_f32_x_tied1, svfloat32_t, ++ z0 = svadd_f32_x (svptrue_b32 (), z0, z1), ++ z0 = svadd_x (svptrue_b32 (), z0, z1)) ++ ++/* ++** ptrue_add_f32_x_tied2: ++** fadd z0\.s, (z0\.s, z1\.s|z1\.s, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_f32_x_tied2, svfloat32_t, ++ z0 = svadd_f32_x (svptrue_b32 (), z1, z0), ++ z0 = svadd_x (svptrue_b32 (), z1, z0)) ++ ++/* ++** ptrue_add_f32_x_untied: ++** fadd z0\.s, (z1\.s, z2\.s|z2\.s, z1\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_f32_x_untied, svfloat32_t, ++ z0 = svadd_f32_x (svptrue_b32 (), z1, z2), ++ z0 = svadd_x (svptrue_b32 (), z1, z2)) ++ ++/* ++** ptrue_add_1_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_1_f32_x_tied1, svfloat32_t, ++ z0 = svadd_n_f32_x (svptrue_b32 (), z0, 1), ++ z0 = svadd_x (svptrue_b32 (), z0, 1)) ++ ++/* ++** ptrue_add_1_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_1_f32_x_untied, svfloat32_t, ++ z0 = svadd_n_f32_x (svptrue_b32 (), z1, 1), ++ z0 = svadd_x (svptrue_b32 (), z1, 1)) ++ ++/* ++** ptrue_add_0p5_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_0p5_f32_x_tied1, svfloat32_t, ++ z0 = svadd_n_f32_x (svptrue_b32 (), z0, 0.5), ++ z0 = svadd_x (svptrue_b32 (), z0, 0.5)) ++ ++/* ++** ptrue_add_0p5_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_0p5_f32_x_untied, svfloat32_t, ++ z0 = svadd_n_f32_x (svptrue_b32 (), z1, 0.5), ++ z0 = svadd_x (svptrue_b32 (), z1, 0.5)) ++ ++/* ++** ptrue_add_m1_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_m1_f32_x_tied1, svfloat32_t, ++ z0 = svadd_n_f32_x (svptrue_b32 (), z0, -1), ++ z0 = svadd_x (svptrue_b32 (), z0, -1)) ++ ++/* ++** ptrue_add_m1_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_m1_f32_x_untied, svfloat32_t, ++ z0 = svadd_n_f32_x (svptrue_b32 (), z1, -1), ++ z0 = svadd_x (svptrue_b32 (), z1, -1)) ++ ++/* ++** ptrue_add_m0p5_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_m0p5_f32_x_tied1, svfloat32_t, ++ z0 = svadd_n_f32_x (svptrue_b32 (), z0, -0.5), ++ z0 = svadd_x (svptrue_b32 (), z0, -0.5)) ++ ++/* ++** ptrue_add_m0p5_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_m0p5_f32_x_untied, svfloat32_t, ++ z0 = svadd_n_f32_x (svptrue_b32 (), z1, -0.5), ++ z0 = svadd_x (svptrue_b32 (), z1, -0.5)) ++ ++/* ++** ptrue_add_2_f32_x_tied1: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** fadd z0\.s, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_2_f32_x_tied1, svfloat32_t, ++ z0 = svadd_n_f32_x (svptrue_b32 (), z0, 2), ++ z0 = svadd_x (svptrue_b32 (), z0, 2)) ++ ++/* ++** ptrue_add_2_f32_x_untied: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
++** fadd z0\.s, (z1\.s, \1|\1, z1\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_2_f32_x_untied, svfloat32_t, ++ z0 = svadd_n_f32_x (svptrue_b32 (), z1, 2), ++ z0 = svadd_x (svptrue_b32 (), z1, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_f32_notrap.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_f32_notrap.c +new file mode 100644 +index 000000000..062e5fd67 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_f32_notrap.c +@@ -0,0 +1,572 @@ ++/* { dg-additional-options "-fno-trapping-math" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** add_f32_m_tied1: ++** fadd z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (add_f32_m_tied1, svfloat32_t, ++ z0 = svadd_f32_m (p0, z0, z1), ++ z0 = svadd_m (p0, z0, z1)) ++ ++/* ++** add_f32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fadd z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (add_f32_m_tied2, svfloat32_t, ++ z0 = svadd_f32_m (p0, z1, z0), ++ z0 = svadd_m (p0, z1, z0)) ++ ++/* ++** add_f32_m_untied: ++** movprfx z0, z1 ++** fadd z0\.s, p0/m, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (add_f32_m_untied, svfloat32_t, ++ z0 = svadd_f32_m (p0, z1, z2), ++ z0 = svadd_m (p0, z1, z2)) ++ ++/* ++** add_s4_f32_m_tied1: ++** mov (z[0-9]+\.s), s4 ++** fadd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (add_s4_f32_m_tied1, svfloat32_t, float, ++ z0 = svadd_n_f32_m (p0, z0, d4), ++ z0 = svadd_m (p0, z0, d4)) ++ ++/* ++** add_s4_f32_m_untied: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0, z1 ++** fadd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (add_s4_f32_m_untied, svfloat32_t, float, ++ z0 = svadd_n_f32_m (p0, z1, d4), ++ z0 = svadd_m (p0, z1, d4)) ++ ++/* ++** add_1_f32_m_tied1: ++** fadd z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_f32_m_tied1, svfloat32_t, ++ z0 = svadd_n_f32_m (p0, z0, 1), ++ z0 = svadd_m (p0, z0, 1)) ++ ++/* ++** add_1_f32_m_untied: ++** movprfx z0, z1 ++** fadd z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_f32_m_untied, svfloat32_t, ++ z0 = svadd_n_f32_m (p0, z1, 1), ++ z0 = svadd_m (p0, z1, 1)) ++ ++/* ++** add_0p5_f32_m_tied1: ++** fadd z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_0p5_f32_m_tied1, svfloat32_t, ++ z0 = svadd_n_f32_m (p0, z0, 0.5), ++ z0 = svadd_m (p0, z0, 0.5)) ++ ++/* ++** add_0p5_f32_m_untied: ++** movprfx z0, z1 ++** fadd z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_0p5_f32_m_untied, svfloat32_t, ++ z0 = svadd_n_f32_m (p0, z1, 0.5), ++ z0 = svadd_m (p0, z1, 0.5)) ++ ++/* ++** add_m1_f32_m_tied1: ++** fsub z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m1_f32_m_tied1, svfloat32_t, ++ z0 = svadd_n_f32_m (p0, z0, -1), ++ z0 = svadd_m (p0, z0, -1)) ++ ++/* ++** add_m1_f32_m_untied: ++** movprfx z0, z1 ++** fsub z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m1_f32_m_untied, svfloat32_t, ++ z0 = svadd_n_f32_m (p0, z1, -1), ++ z0 = svadd_m (p0, z1, -1)) ++ ++/* ++** add_m0p5_f32_m_tied1: ++** fsub z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m0p5_f32_m_tied1, svfloat32_t, ++ z0 = svadd_n_f32_m (p0, z0, -0.5), ++ z0 = svadd_m (p0, z0, -0.5)) ++ ++/* ++** add_m0p5_f32_m_untied: ++** movprfx z0, z1 ++** fsub z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m0p5_f32_m_untied, svfloat32_t, ++ z0 = svadd_n_f32_m (p0, z1, -0.5), ++ z0 = svadd_m (p0, z1, -0.5)) ++ ++/* ++** add_m2_f32_m: ++** fmov (z[0-9]+\.s), #-2\.0(?:e\+0)? 
++** fadd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m2_f32_m, svfloat32_t, ++ z0 = svadd_n_f32_m (p0, z0, -2), ++ z0 = svadd_m (p0, z0, -2)) ++ ++/* ++** add_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fadd z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (add_f32_z_tied1, svfloat32_t, ++ z0 = svadd_f32_z (p0, z0, z1), ++ z0 = svadd_z (p0, z0, z1)) ++ ++/* ++** add_f32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** fadd z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (add_f32_z_tied2, svfloat32_t, ++ z0 = svadd_f32_z (p0, z1, z0), ++ z0 = svadd_z (p0, z1, z0)) ++ ++/* ++** add_f32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fadd z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** fadd z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (add_f32_z_untied, svfloat32_t, ++ z0 = svadd_f32_z (p0, z1, z2), ++ z0 = svadd_z (p0, z1, z2)) ++ ++/* ++** add_s4_f32_z_tied1: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0\.s, p0/z, z0\.s ++** fadd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (add_s4_f32_z_tied1, svfloat32_t, float, ++ z0 = svadd_n_f32_z (p0, z0, d4), ++ z0 = svadd_z (p0, z0, d4)) ++ ++/* ++** add_s4_f32_z_untied: ++** mov (z[0-9]+\.s), s4 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fadd z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** fadd z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (add_s4_f32_z_untied, svfloat32_t, float, ++ z0 = svadd_n_f32_z (p0, z1, d4), ++ z0 = svadd_z (p0, z1, d4)) ++ ++/* ++** add_1_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fadd z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_f32_z_tied1, svfloat32_t, ++ z0 = svadd_n_f32_z (p0, z0, 1), ++ z0 = svadd_z (p0, z0, 1)) ++ ++/* ++** add_1_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** fadd z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_f32_z_untied, svfloat32_t, ++ z0 = svadd_n_f32_z (p0, z1, 1), ++ z0 = svadd_z (p0, z1, 1)) ++ ++/* ++** add_0p5_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fadd z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_0p5_f32_z_tied1, svfloat32_t, ++ z0 = svadd_n_f32_z (p0, z0, 0.5), ++ z0 = svadd_z (p0, z0, 0.5)) ++ ++/* ++** add_0p5_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** fadd z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_0p5_f32_z_untied, svfloat32_t, ++ z0 = svadd_n_f32_z (p0, z1, 0.5), ++ z0 = svadd_z (p0, z1, 0.5)) ++ ++/* ++** add_m1_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fsub z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m1_f32_z_tied1, svfloat32_t, ++ z0 = svadd_n_f32_z (p0, z0, -1), ++ z0 = svadd_z (p0, z0, -1)) ++ ++/* ++** add_m1_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** fsub z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m1_f32_z_untied, svfloat32_t, ++ z0 = svadd_n_f32_z (p0, z1, -1), ++ z0 = svadd_z (p0, z1, -1)) ++ ++/* ++** add_m0p5_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fsub z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m0p5_f32_z_tied1, svfloat32_t, ++ z0 = svadd_n_f32_z (p0, z0, -0.5), ++ z0 = svadd_z (p0, z0, -0.5)) ++ ++/* ++** add_m0p5_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** fsub z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m0p5_f32_z_untied, svfloat32_t, ++ z0 = svadd_n_f32_z (p0, z1, -0.5), ++ z0 = svadd_z (p0, z1, -0.5)) ++ ++/* ++** add_m2_f32_z: ++** fmov (z[0-9]+\.s), #-2\.0(?:e\+0)? 
++** movprfx z0\.s, p0/z, z0\.s ++** fadd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m2_f32_z, svfloat32_t, ++ z0 = svadd_n_f32_z (p0, z0, -2), ++ z0 = svadd_z (p0, z0, -2)) ++ ++/* ++** add_f32_x_tied1: ++** fadd z0\.s, (z0\.s, z1\.s|z1\.s, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (add_f32_x_tied1, svfloat32_t, ++ z0 = svadd_f32_x (p0, z0, z1), ++ z0 = svadd_x (p0, z0, z1)) ++ ++/* ++** add_f32_x_tied2: ++** fadd z0\.s, (z0\.s, z1\.s|z1\.s, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (add_f32_x_tied2, svfloat32_t, ++ z0 = svadd_f32_x (p0, z1, z0), ++ z0 = svadd_x (p0, z1, z0)) ++ ++/* ++** add_f32_x_untied: ++** fadd z0\.s, (z1\.s, z2\.s|z2\.s, z1\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (add_f32_x_untied, svfloat32_t, ++ z0 = svadd_f32_x (p0, z1, z2), ++ z0 = svadd_x (p0, z1, z2)) ++ ++/* ++** add_s4_f32_x_tied1: ++** mov (z[0-9]+\.s), s4 ++** fadd z0\.s, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_ZD (add_s4_f32_x_tied1, svfloat32_t, float, ++ z0 = svadd_n_f32_x (p0, z0, d4), ++ z0 = svadd_x (p0, z0, d4)) ++ ++/* ++** add_s4_f32_x_untied: ++** mov (z[0-9]+\.s), s4 ++** fadd z0\.s, (z1\.s, \1|\1, z1\.s) ++** ret ++*/ ++TEST_UNIFORM_ZD (add_s4_f32_x_untied, svfloat32_t, float, ++ z0 = svadd_n_f32_x (p0, z1, d4), ++ z0 = svadd_x (p0, z1, d4)) ++ ++/* ++** add_1_f32_x_tied1: ++** fadd z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_f32_x_tied1, svfloat32_t, ++ z0 = svadd_n_f32_x (p0, z0, 1), ++ z0 = svadd_x (p0, z0, 1)) ++ ++/* ++** add_1_f32_x_untied: ++** movprfx z0, z1 ++** fadd z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_f32_x_untied, svfloat32_t, ++ z0 = svadd_n_f32_x (p0, z1, 1), ++ z0 = svadd_x (p0, z1, 1)) ++ ++/* ++** add_0p5_f32_x_tied1: ++** fadd z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_0p5_f32_x_tied1, svfloat32_t, ++ z0 = svadd_n_f32_x (p0, z0, 0.5), ++ z0 = svadd_x (p0, z0, 0.5)) ++ ++/* ++** add_0p5_f32_x_untied: ++** movprfx z0, z1 ++** fadd z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_0p5_f32_x_untied, svfloat32_t, ++ z0 = svadd_n_f32_x (p0, z1, 0.5), ++ z0 = svadd_x (p0, z1, 0.5)) ++ ++/* ++** add_m1_f32_x_tied1: ++** fsub z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m1_f32_x_tied1, svfloat32_t, ++ z0 = svadd_n_f32_x (p0, z0, -1), ++ z0 = svadd_x (p0, z0, -1)) ++ ++/* ++** add_m1_f32_x_untied: ++** movprfx z0, z1 ++** fsub z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m1_f32_x_untied, svfloat32_t, ++ z0 = svadd_n_f32_x (p0, z1, -1), ++ z0 = svadd_x (p0, z1, -1)) ++ ++/* ++** add_m0p5_f32_x_tied1: ++** fsub z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m0p5_f32_x_tied1, svfloat32_t, ++ z0 = svadd_n_f32_x (p0, z0, -0.5), ++ z0 = svadd_x (p0, z0, -0.5)) ++ ++/* ++** add_m0p5_f32_x_untied: ++** movprfx z0, z1 ++** fsub z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m0p5_f32_x_untied, svfloat32_t, ++ z0 = svadd_n_f32_x (p0, z1, -0.5), ++ z0 = svadd_x (p0, z1, -0.5)) ++ ++/* ++** add_2_f32_x_tied1: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** fadd z0\.s, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (add_2_f32_x_tied1, svfloat32_t, ++ z0 = svadd_n_f32_x (p0, z0, 2), ++ z0 = svadd_x (p0, z0, 2)) ++ ++/* ++** add_2_f32_x_untied: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
++** fadd z0\.s, (z1\.s, \1|\1, z1\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (add_2_f32_x_untied, svfloat32_t, ++ z0 = svadd_n_f32_x (p0, z1, 2), ++ z0 = svadd_x (p0, z1, 2)) ++ ++/* ++** ptrue_add_f32_x_tied1: ++** fadd z0\.s, (z0\.s, z1\.s|z1\.s, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_f32_x_tied1, svfloat32_t, ++ z0 = svadd_f32_x (svptrue_b32 (), z0, z1), ++ z0 = svadd_x (svptrue_b32 (), z0, z1)) ++ ++/* ++** ptrue_add_f32_x_tied2: ++** fadd z0\.s, (z0\.s, z1\.s|z1\.s, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_f32_x_tied2, svfloat32_t, ++ z0 = svadd_f32_x (svptrue_b32 (), z1, z0), ++ z0 = svadd_x (svptrue_b32 (), z1, z0)) ++ ++/* ++** ptrue_add_f32_x_untied: ++** fadd z0\.s, (z1\.s, z2\.s|z2\.s, z1\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_f32_x_untied, svfloat32_t, ++ z0 = svadd_f32_x (svptrue_b32 (), z1, z2), ++ z0 = svadd_x (svptrue_b32 (), z1, z2)) ++ ++/* ++** ptrue_add_1_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_1_f32_x_tied1, svfloat32_t, ++ z0 = svadd_n_f32_x (svptrue_b32 (), z0, 1), ++ z0 = svadd_x (svptrue_b32 (), z0, 1)) ++ ++/* ++** ptrue_add_1_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_1_f32_x_untied, svfloat32_t, ++ z0 = svadd_n_f32_x (svptrue_b32 (), z1, 1), ++ z0 = svadd_x (svptrue_b32 (), z1, 1)) ++ ++/* ++** ptrue_add_0p5_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_0p5_f32_x_tied1, svfloat32_t, ++ z0 = svadd_n_f32_x (svptrue_b32 (), z0, 0.5), ++ z0 = svadd_x (svptrue_b32 (), z0, 0.5)) ++ ++/* ++** ptrue_add_0p5_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_0p5_f32_x_untied, svfloat32_t, ++ z0 = svadd_n_f32_x (svptrue_b32 (), z1, 0.5), ++ z0 = svadd_x (svptrue_b32 (), z1, 0.5)) ++ ++/* ++** ptrue_add_m1_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_m1_f32_x_tied1, svfloat32_t, ++ z0 = svadd_n_f32_x (svptrue_b32 (), z0, -1), ++ z0 = svadd_x (svptrue_b32 (), z0, -1)) ++ ++/* ++** ptrue_add_m1_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_m1_f32_x_untied, svfloat32_t, ++ z0 = svadd_n_f32_x (svptrue_b32 (), z1, -1), ++ z0 = svadd_x (svptrue_b32 (), z1, -1)) ++ ++/* ++** ptrue_add_m0p5_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_m0p5_f32_x_tied1, svfloat32_t, ++ z0 = svadd_n_f32_x (svptrue_b32 (), z0, -0.5), ++ z0 = svadd_x (svptrue_b32 (), z0, -0.5)) ++ ++/* ++** ptrue_add_m0p5_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_m0p5_f32_x_untied, svfloat32_t, ++ z0 = svadd_n_f32_x (svptrue_b32 (), z1, -0.5), ++ z0 = svadd_x (svptrue_b32 (), z1, -0.5)) ++ ++/* ++** ptrue_add_2_f32_x_tied1: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** fadd z0\.s, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_2_f32_x_tied1, svfloat32_t, ++ z0 = svadd_n_f32_x (svptrue_b32 (), z0, 2), ++ z0 = svadd_x (svptrue_b32 (), z0, 2)) ++ ++/* ++** ptrue_add_2_f32_x_untied: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
++** fadd z0\.s, (z1\.s, \1|\1, z1\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_2_f32_x_untied, svfloat32_t, ++ z0 = svadd_n_f32_x (svptrue_b32 (), z1, 2), ++ z0 = svadd_x (svptrue_b32 (), z1, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_f64.c +new file mode 100644 +index 000000000..7185f3acf +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_f64.c +@@ -0,0 +1,577 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** add_f64_m_tied1: ++** fadd z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (add_f64_m_tied1, svfloat64_t, ++ z0 = svadd_f64_m (p0, z0, z1), ++ z0 = svadd_m (p0, z0, z1)) ++ ++/* ++** add_f64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fadd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_f64_m_tied2, svfloat64_t, ++ z0 = svadd_f64_m (p0, z1, z0), ++ z0 = svadd_m (p0, z1, z0)) ++ ++/* ++** add_f64_m_untied: ++** movprfx z0, z1 ++** fadd z0\.d, p0/m, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (add_f64_m_untied, svfloat64_t, ++ z0 = svadd_f64_m (p0, z1, z2), ++ z0 = svadd_m (p0, z1, z2)) ++ ++/* ++** add_d4_f64_m_tied1: ++** mov (z[0-9]+\.d), d4 ++** fadd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (add_d4_f64_m_tied1, svfloat64_t, double, ++ z0 = svadd_n_f64_m (p0, z0, d4), ++ z0 = svadd_m (p0, z0, d4)) ++ ++/* ++** add_d4_f64_m_untied: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0, z1 ++** fadd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (add_d4_f64_m_untied, svfloat64_t, double, ++ z0 = svadd_n_f64_m (p0, z1, d4), ++ z0 = svadd_m (p0, z1, d4)) ++ ++/* ++** add_1_f64_m_tied1: ++** fadd z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_f64_m_tied1, svfloat64_t, ++ z0 = svadd_n_f64_m (p0, z0, 1), ++ z0 = svadd_m (p0, z0, 1)) ++ ++/* ++** add_1_f64_m_untied: ++** movprfx z0, z1 ++** fadd z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_f64_m_untied, svfloat64_t, ++ z0 = svadd_n_f64_m (p0, z1, 1), ++ z0 = svadd_m (p0, z1, 1)) ++ ++/* ++** add_0p5_f64_m_tied1: ++** fadd z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_0p5_f64_m_tied1, svfloat64_t, ++ z0 = svadd_n_f64_m (p0, z0, 0.5), ++ z0 = svadd_m (p0, z0, 0.5)) ++ ++/* ++** add_0p5_f64_m_untied: ++** movprfx z0, z1 ++** fadd z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_0p5_f64_m_untied, svfloat64_t, ++ z0 = svadd_n_f64_m (p0, z1, 0.5), ++ z0 = svadd_m (p0, z1, 0.5)) ++ ++/* ++** add_m1_f64_m_tied1: ++** fsub z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m1_f64_m_tied1, svfloat64_t, ++ z0 = svadd_n_f64_m (p0, z0, -1), ++ z0 = svadd_m (p0, z0, -1)) ++ ++/* ++** add_m1_f64_m_untied: ++** movprfx z0, z1 ++** fsub z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m1_f64_m_untied, svfloat64_t, ++ z0 = svadd_n_f64_m (p0, z1, -1), ++ z0 = svadd_m (p0, z1, -1)) ++ ++/* ++** add_m0p5_f64_m_tied1: ++** fsub z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m0p5_f64_m_tied1, svfloat64_t, ++ z0 = svadd_n_f64_m (p0, z0, -0.5), ++ z0 = svadd_m (p0, z0, -0.5)) ++ ++/* ++** add_m0p5_f64_m_untied: ++** movprfx z0, z1 ++** fsub z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m0p5_f64_m_untied, svfloat64_t, ++ z0 = svadd_n_f64_m (p0, z1, -0.5), ++ z0 = svadd_m (p0, z1, -0.5)) ++ ++/* ++** add_m2_f64_m: ++** fmov (z[0-9]+\.d), #-2\.0(?:e\+0)? 
++** fadd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m2_f64_m, svfloat64_t, ++ z0 = svadd_n_f64_m (p0, z0, -2), ++ z0 = svadd_m (p0, z0, -2)) ++ ++/* ++** add_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fadd z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (add_f64_z_tied1, svfloat64_t, ++ z0 = svadd_f64_z (p0, z0, z1), ++ z0 = svadd_z (p0, z0, z1)) ++ ++/* ++** add_f64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** fadd z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (add_f64_z_tied2, svfloat64_t, ++ z0 = svadd_f64_z (p0, z1, z0), ++ z0 = svadd_z (p0, z1, z0)) ++ ++/* ++** add_f64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fadd z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** fadd z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (add_f64_z_untied, svfloat64_t, ++ z0 = svadd_f64_z (p0, z1, z2), ++ z0 = svadd_z (p0, z1, z2)) ++ ++/* ++** add_d4_f64_z_tied1: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0\.d, p0/z, z0\.d ++** fadd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (add_d4_f64_z_tied1, svfloat64_t, double, ++ z0 = svadd_n_f64_z (p0, z0, d4), ++ z0 = svadd_z (p0, z0, d4)) ++ ++/* ++** add_d4_f64_z_untied: ++** mov (z[0-9]+\.d), d4 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fadd z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** fadd z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (add_d4_f64_z_untied, svfloat64_t, double, ++ z0 = svadd_n_f64_z (p0, z1, d4), ++ z0 = svadd_z (p0, z1, d4)) ++ ++/* ++** add_1_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fadd z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_f64_z_tied1, svfloat64_t, ++ z0 = svadd_n_f64_z (p0, z0, 1), ++ z0 = svadd_z (p0, z0, 1)) ++ ++/* ++** add_1_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** fadd z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_f64_z_untied, svfloat64_t, ++ z0 = svadd_n_f64_z (p0, z1, 1), ++ z0 = svadd_z (p0, z1, 1)) ++ ++/* ++** add_0p5_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fadd z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_0p5_f64_z_tied1, svfloat64_t, ++ z0 = svadd_n_f64_z (p0, z0, 0.5), ++ z0 = svadd_z (p0, z0, 0.5)) ++ ++/* ++** add_0p5_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** fadd z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_0p5_f64_z_untied, svfloat64_t, ++ z0 = svadd_n_f64_z (p0, z1, 0.5), ++ z0 = svadd_z (p0, z1, 0.5)) ++ ++/* ++** add_m1_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fsub z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m1_f64_z_tied1, svfloat64_t, ++ z0 = svadd_n_f64_z (p0, z0, -1), ++ z0 = svadd_z (p0, z0, -1)) ++ ++/* ++** add_m1_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** fsub z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m1_f64_z_untied, svfloat64_t, ++ z0 = svadd_n_f64_z (p0, z1, -1), ++ z0 = svadd_z (p0, z1, -1)) ++ ++/* ++** add_m0p5_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fsub z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m0p5_f64_z_tied1, svfloat64_t, ++ z0 = svadd_n_f64_z (p0, z0, -0.5), ++ z0 = svadd_z (p0, z0, -0.5)) ++ ++/* ++** add_m0p5_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** fsub z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m0p5_f64_z_untied, svfloat64_t, ++ z0 = svadd_n_f64_z (p0, z1, -0.5), ++ z0 = svadd_z (p0, z1, -0.5)) ++ ++/* ++** add_m2_f64_z: ++** fmov (z[0-9]+\.d), #-2\.0(?:e\+0)? 
++** movprfx z0\.d, p0/z, z0\.d ++** fadd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m2_f64_z, svfloat64_t, ++ z0 = svadd_n_f64_z (p0, z0, -2), ++ z0 = svadd_z (p0, z0, -2)) ++ ++/* ++** add_f64_x_tied1: ++** fadd z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (add_f64_x_tied1, svfloat64_t, ++ z0 = svadd_f64_x (p0, z0, z1), ++ z0 = svadd_x (p0, z0, z1)) ++ ++/* ++** add_f64_x_tied2: ++** fadd z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (add_f64_x_tied2, svfloat64_t, ++ z0 = svadd_f64_x (p0, z1, z0), ++ z0 = svadd_x (p0, z1, z0)) ++ ++/* ++** add_f64_x_untied: ++** ( ++** movprfx z0, z1 ++** fadd z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0, z2 ++** fadd z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (add_f64_x_untied, svfloat64_t, ++ z0 = svadd_f64_x (p0, z1, z2), ++ z0 = svadd_x (p0, z1, z2)) ++ ++/* ++** add_d4_f64_x_tied1: ++** mov (z[0-9]+\.d), d4 ++** fadd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (add_d4_f64_x_tied1, svfloat64_t, double, ++ z0 = svadd_n_f64_x (p0, z0, d4), ++ z0 = svadd_x (p0, z0, d4)) ++ ++/* ++** add_d4_f64_x_untied: ++** mov z0\.d, d4 ++** fadd z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZD (add_d4_f64_x_untied, svfloat64_t, double, ++ z0 = svadd_n_f64_x (p0, z1, d4), ++ z0 = svadd_x (p0, z1, d4)) ++ ++/* ++** add_1_f64_x_tied1: ++** fadd z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_f64_x_tied1, svfloat64_t, ++ z0 = svadd_n_f64_x (p0, z0, 1), ++ z0 = svadd_x (p0, z0, 1)) ++ ++/* ++** add_1_f64_x_untied: ++** movprfx z0, z1 ++** fadd z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_f64_x_untied, svfloat64_t, ++ z0 = svadd_n_f64_x (p0, z1, 1), ++ z0 = svadd_x (p0, z1, 1)) ++ ++/* ++** add_0p5_f64_x_tied1: ++** fadd z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_0p5_f64_x_tied1, svfloat64_t, ++ z0 = svadd_n_f64_x (p0, z0, 0.5), ++ z0 = svadd_x (p0, z0, 0.5)) ++ ++/* ++** add_0p5_f64_x_untied: ++** movprfx z0, z1 ++** fadd z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_0p5_f64_x_untied, svfloat64_t, ++ z0 = svadd_n_f64_x (p0, z1, 0.5), ++ z0 = svadd_x (p0, z1, 0.5)) ++ ++/* ++** add_m1_f64_x_tied1: ++** fsub z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m1_f64_x_tied1, svfloat64_t, ++ z0 = svadd_n_f64_x (p0, z0, -1), ++ z0 = svadd_x (p0, z0, -1)) ++ ++/* ++** add_m1_f64_x_untied: ++** movprfx z0, z1 ++** fsub z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m1_f64_x_untied, svfloat64_t, ++ z0 = svadd_n_f64_x (p0, z1, -1), ++ z0 = svadd_x (p0, z1, -1)) ++ ++/* ++** add_m0p5_f64_x_tied1: ++** fsub z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m0p5_f64_x_tied1, svfloat64_t, ++ z0 = svadd_n_f64_x (p0, z0, -0.5), ++ z0 = svadd_x (p0, z0, -0.5)) ++ ++/* ++** add_m0p5_f64_x_untied: ++** movprfx z0, z1 ++** fsub z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m0p5_f64_x_untied, svfloat64_t, ++ z0 = svadd_n_f64_x (p0, z1, -0.5), ++ z0 = svadd_x (p0, z1, -0.5)) ++ ++/* ++** add_2_f64_x_tied1: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** fadd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_2_f64_x_tied1, svfloat64_t, ++ z0 = svadd_n_f64_x (p0, z0, 2), ++ z0 = svadd_x (p0, z0, 2)) ++ ++/* ++** add_2_f64_x_untied: ++** fmov z0\.d, #2\.0(?:e\+0)? 
++** fadd z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (add_2_f64_x_untied, svfloat64_t, ++ z0 = svadd_n_f64_x (p0, z1, 2), ++ z0 = svadd_x (p0, z1, 2)) ++ ++/* ++** ptrue_add_f64_x_tied1: ++** fadd z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_f64_x_tied1, svfloat64_t, ++ z0 = svadd_f64_x (svptrue_b64 (), z0, z1), ++ z0 = svadd_x (svptrue_b64 (), z0, z1)) ++ ++/* ++** ptrue_add_f64_x_tied2: ++** fadd z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_f64_x_tied2, svfloat64_t, ++ z0 = svadd_f64_x (svptrue_b64 (), z1, z0), ++ z0 = svadd_x (svptrue_b64 (), z1, z0)) ++ ++/* ++** ptrue_add_f64_x_untied: ++** fadd z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_f64_x_untied, svfloat64_t, ++ z0 = svadd_f64_x (svptrue_b64 (), z1, z2), ++ z0 = svadd_x (svptrue_b64 (), z1, z2)) ++ ++/* ++** ptrue_add_1_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_1_f64_x_tied1, svfloat64_t, ++ z0 = svadd_n_f64_x (svptrue_b64 (), z0, 1), ++ z0 = svadd_x (svptrue_b64 (), z0, 1)) ++ ++/* ++** ptrue_add_1_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_1_f64_x_untied, svfloat64_t, ++ z0 = svadd_n_f64_x (svptrue_b64 (), z1, 1), ++ z0 = svadd_x (svptrue_b64 (), z1, 1)) ++ ++/* ++** ptrue_add_0p5_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_0p5_f64_x_tied1, svfloat64_t, ++ z0 = svadd_n_f64_x (svptrue_b64 (), z0, 0.5), ++ z0 = svadd_x (svptrue_b64 (), z0, 0.5)) ++ ++/* ++** ptrue_add_0p5_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_0p5_f64_x_untied, svfloat64_t, ++ z0 = svadd_n_f64_x (svptrue_b64 (), z1, 0.5), ++ z0 = svadd_x (svptrue_b64 (), z1, 0.5)) ++ ++/* ++** ptrue_add_m1_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_m1_f64_x_tied1, svfloat64_t, ++ z0 = svadd_n_f64_x (svptrue_b64 (), z0, -1), ++ z0 = svadd_x (svptrue_b64 (), z0, -1)) ++ ++/* ++** ptrue_add_m1_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_m1_f64_x_untied, svfloat64_t, ++ z0 = svadd_n_f64_x (svptrue_b64 (), z1, -1), ++ z0 = svadd_x (svptrue_b64 (), z1, -1)) ++ ++/* ++** ptrue_add_m0p5_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_m0p5_f64_x_tied1, svfloat64_t, ++ z0 = svadd_n_f64_x (svptrue_b64 (), z0, -0.5), ++ z0 = svadd_x (svptrue_b64 (), z0, -0.5)) ++ ++/* ++** ptrue_add_m0p5_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_m0p5_f64_x_untied, svfloat64_t, ++ z0 = svadd_n_f64_x (svptrue_b64 (), z1, -0.5), ++ z0 = svadd_x (svptrue_b64 (), z1, -0.5)) ++ ++/* ++** ptrue_add_2_f64_x_tied1: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** fadd z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_2_f64_x_tied1, svfloat64_t, ++ z0 = svadd_n_f64_x (svptrue_b64 (), z0, 2), ++ z0 = svadd_x (svptrue_b64 (), z0, 2)) ++ ++/* ++** ptrue_add_2_f64_x_untied: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
++** fadd z0\.d, (z1\.d, \1|\1, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_2_f64_x_untied, svfloat64_t, ++ z0 = svadd_n_f64_x (svptrue_b64 (), z1, 2), ++ z0 = svadd_x (svptrue_b64 (), z1, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_f64_notrap.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_f64_notrap.c +new file mode 100644 +index 000000000..6d095b507 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_f64_notrap.c +@@ -0,0 +1,572 @@ ++/* { dg-additional-options "-fno-trapping-math" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** add_f64_m_tied1: ++** fadd z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (add_f64_m_tied1, svfloat64_t, ++ z0 = svadd_f64_m (p0, z0, z1), ++ z0 = svadd_m (p0, z0, z1)) ++ ++/* ++** add_f64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fadd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_f64_m_tied2, svfloat64_t, ++ z0 = svadd_f64_m (p0, z1, z0), ++ z0 = svadd_m (p0, z1, z0)) ++ ++/* ++** add_f64_m_untied: ++** movprfx z0, z1 ++** fadd z0\.d, p0/m, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (add_f64_m_untied, svfloat64_t, ++ z0 = svadd_f64_m (p0, z1, z2), ++ z0 = svadd_m (p0, z1, z2)) ++ ++/* ++** add_d4_f64_m_tied1: ++** mov (z[0-9]+\.d), d4 ++** fadd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (add_d4_f64_m_tied1, svfloat64_t, double, ++ z0 = svadd_n_f64_m (p0, z0, d4), ++ z0 = svadd_m (p0, z0, d4)) ++ ++/* ++** add_d4_f64_m_untied: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0, z1 ++** fadd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (add_d4_f64_m_untied, svfloat64_t, double, ++ z0 = svadd_n_f64_m (p0, z1, d4), ++ z0 = svadd_m (p0, z1, d4)) ++ ++/* ++** add_1_f64_m_tied1: ++** fadd z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_f64_m_tied1, svfloat64_t, ++ z0 = svadd_n_f64_m (p0, z0, 1), ++ z0 = svadd_m (p0, z0, 1)) ++ ++/* ++** add_1_f64_m_untied: ++** movprfx z0, z1 ++** fadd z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_f64_m_untied, svfloat64_t, ++ z0 = svadd_n_f64_m (p0, z1, 1), ++ z0 = svadd_m (p0, z1, 1)) ++ ++/* ++** add_0p5_f64_m_tied1: ++** fadd z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_0p5_f64_m_tied1, svfloat64_t, ++ z0 = svadd_n_f64_m (p0, z0, 0.5), ++ z0 = svadd_m (p0, z0, 0.5)) ++ ++/* ++** add_0p5_f64_m_untied: ++** movprfx z0, z1 ++** fadd z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_0p5_f64_m_untied, svfloat64_t, ++ z0 = svadd_n_f64_m (p0, z1, 0.5), ++ z0 = svadd_m (p0, z1, 0.5)) ++ ++/* ++** add_m1_f64_m_tied1: ++** fsub z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m1_f64_m_tied1, svfloat64_t, ++ z0 = svadd_n_f64_m (p0, z0, -1), ++ z0 = svadd_m (p0, z0, -1)) ++ ++/* ++** add_m1_f64_m_untied: ++** movprfx z0, z1 ++** fsub z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m1_f64_m_untied, svfloat64_t, ++ z0 = svadd_n_f64_m (p0, z1, -1), ++ z0 = svadd_m (p0, z1, -1)) ++ ++/* ++** add_m0p5_f64_m_tied1: ++** fsub z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m0p5_f64_m_tied1, svfloat64_t, ++ z0 = svadd_n_f64_m (p0, z0, -0.5), ++ z0 = svadd_m (p0, z0, -0.5)) ++ ++/* ++** add_m0p5_f64_m_untied: ++** movprfx z0, z1 ++** fsub z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m0p5_f64_m_untied, svfloat64_t, ++ z0 = svadd_n_f64_m (p0, z1, -0.5), ++ z0 = svadd_m (p0, z1, -0.5)) ++ ++/* ++** add_m2_f64_m: ++** fmov (z[0-9]+\.d), #-2\.0(?:e\+0)? 
++** fadd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m2_f64_m, svfloat64_t, ++ z0 = svadd_n_f64_m (p0, z0, -2), ++ z0 = svadd_m (p0, z0, -2)) ++ ++/* ++** add_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fadd z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (add_f64_z_tied1, svfloat64_t, ++ z0 = svadd_f64_z (p0, z0, z1), ++ z0 = svadd_z (p0, z0, z1)) ++ ++/* ++** add_f64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** fadd z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (add_f64_z_tied2, svfloat64_t, ++ z0 = svadd_f64_z (p0, z1, z0), ++ z0 = svadd_z (p0, z1, z0)) ++ ++/* ++** add_f64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fadd z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** fadd z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (add_f64_z_untied, svfloat64_t, ++ z0 = svadd_f64_z (p0, z1, z2), ++ z0 = svadd_z (p0, z1, z2)) ++ ++/* ++** add_d4_f64_z_tied1: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0\.d, p0/z, z0\.d ++** fadd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (add_d4_f64_z_tied1, svfloat64_t, double, ++ z0 = svadd_n_f64_z (p0, z0, d4), ++ z0 = svadd_z (p0, z0, d4)) ++ ++/* ++** add_d4_f64_z_untied: ++** mov (z[0-9]+\.d), d4 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fadd z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** fadd z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (add_d4_f64_z_untied, svfloat64_t, double, ++ z0 = svadd_n_f64_z (p0, z1, d4), ++ z0 = svadd_z (p0, z1, d4)) ++ ++/* ++** add_1_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fadd z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_f64_z_tied1, svfloat64_t, ++ z0 = svadd_n_f64_z (p0, z0, 1), ++ z0 = svadd_z (p0, z0, 1)) ++ ++/* ++** add_1_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** fadd z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_f64_z_untied, svfloat64_t, ++ z0 = svadd_n_f64_z (p0, z1, 1), ++ z0 = svadd_z (p0, z1, 1)) ++ ++/* ++** add_0p5_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fadd z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_0p5_f64_z_tied1, svfloat64_t, ++ z0 = svadd_n_f64_z (p0, z0, 0.5), ++ z0 = svadd_z (p0, z0, 0.5)) ++ ++/* ++** add_0p5_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** fadd z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_0p5_f64_z_untied, svfloat64_t, ++ z0 = svadd_n_f64_z (p0, z1, 0.5), ++ z0 = svadd_z (p0, z1, 0.5)) ++ ++/* ++** add_m1_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fsub z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m1_f64_z_tied1, svfloat64_t, ++ z0 = svadd_n_f64_z (p0, z0, -1), ++ z0 = svadd_z (p0, z0, -1)) ++ ++/* ++** add_m1_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** fsub z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m1_f64_z_untied, svfloat64_t, ++ z0 = svadd_n_f64_z (p0, z1, -1), ++ z0 = svadd_z (p0, z1, -1)) ++ ++/* ++** add_m0p5_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fsub z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m0p5_f64_z_tied1, svfloat64_t, ++ z0 = svadd_n_f64_z (p0, z0, -0.5), ++ z0 = svadd_z (p0, z0, -0.5)) ++ ++/* ++** add_m0p5_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** fsub z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m0p5_f64_z_untied, svfloat64_t, ++ z0 = svadd_n_f64_z (p0, z1, -0.5), ++ z0 = svadd_z (p0, z1, -0.5)) ++ ++/* ++** add_m2_f64_z: ++** fmov (z[0-9]+\.d), #-2\.0(?:e\+0)? 
++** movprfx z0\.d, p0/z, z0\.d ++** fadd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m2_f64_z, svfloat64_t, ++ z0 = svadd_n_f64_z (p0, z0, -2), ++ z0 = svadd_z (p0, z0, -2)) ++ ++/* ++** add_f64_x_tied1: ++** fadd z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (add_f64_x_tied1, svfloat64_t, ++ z0 = svadd_f64_x (p0, z0, z1), ++ z0 = svadd_x (p0, z0, z1)) ++ ++/* ++** add_f64_x_tied2: ++** fadd z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (add_f64_x_tied2, svfloat64_t, ++ z0 = svadd_f64_x (p0, z1, z0), ++ z0 = svadd_x (p0, z1, z0)) ++ ++/* ++** add_f64_x_untied: ++** fadd z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (add_f64_x_untied, svfloat64_t, ++ z0 = svadd_f64_x (p0, z1, z2), ++ z0 = svadd_x (p0, z1, z2)) ++ ++/* ++** add_d4_f64_x_tied1: ++** mov (z[0-9]+\.d), d4 ++** fadd z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_ZD (add_d4_f64_x_tied1, svfloat64_t, double, ++ z0 = svadd_n_f64_x (p0, z0, d4), ++ z0 = svadd_x (p0, z0, d4)) ++ ++/* ++** add_d4_f64_x_untied: ++** mov (z[0-9]+\.d), d4 ++** fadd z0\.d, (z1\.d, \1|\1, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_ZD (add_d4_f64_x_untied, svfloat64_t, double, ++ z0 = svadd_n_f64_x (p0, z1, d4), ++ z0 = svadd_x (p0, z1, d4)) ++ ++/* ++** add_1_f64_x_tied1: ++** fadd z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_f64_x_tied1, svfloat64_t, ++ z0 = svadd_n_f64_x (p0, z0, 1), ++ z0 = svadd_x (p0, z0, 1)) ++ ++/* ++** add_1_f64_x_untied: ++** movprfx z0, z1 ++** fadd z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_f64_x_untied, svfloat64_t, ++ z0 = svadd_n_f64_x (p0, z1, 1), ++ z0 = svadd_x (p0, z1, 1)) ++ ++/* ++** add_0p5_f64_x_tied1: ++** fadd z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_0p5_f64_x_tied1, svfloat64_t, ++ z0 = svadd_n_f64_x (p0, z0, 0.5), ++ z0 = svadd_x (p0, z0, 0.5)) ++ ++/* ++** add_0p5_f64_x_untied: ++** movprfx z0, z1 ++** fadd z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_0p5_f64_x_untied, svfloat64_t, ++ z0 = svadd_n_f64_x (p0, z1, 0.5), ++ z0 = svadd_x (p0, z1, 0.5)) ++ ++/* ++** add_m1_f64_x_tied1: ++** fsub z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m1_f64_x_tied1, svfloat64_t, ++ z0 = svadd_n_f64_x (p0, z0, -1), ++ z0 = svadd_x (p0, z0, -1)) ++ ++/* ++** add_m1_f64_x_untied: ++** movprfx z0, z1 ++** fsub z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m1_f64_x_untied, svfloat64_t, ++ z0 = svadd_n_f64_x (p0, z1, -1), ++ z0 = svadd_x (p0, z1, -1)) ++ ++/* ++** add_m0p5_f64_x_tied1: ++** fsub z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m0p5_f64_x_tied1, svfloat64_t, ++ z0 = svadd_n_f64_x (p0, z0, -0.5), ++ z0 = svadd_x (p0, z0, -0.5)) ++ ++/* ++** add_m0p5_f64_x_untied: ++** movprfx z0, z1 ++** fsub z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m0p5_f64_x_untied, svfloat64_t, ++ z0 = svadd_n_f64_x (p0, z1, -0.5), ++ z0 = svadd_x (p0, z1, -0.5)) ++ ++/* ++** add_2_f64_x_tied1: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** fadd z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (add_2_f64_x_tied1, svfloat64_t, ++ z0 = svadd_n_f64_x (p0, z0, 2), ++ z0 = svadd_x (p0, z0, 2)) ++ ++/* ++** add_2_f64_x_untied: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
++** fadd z0\.d, (z1\.d, \1|\1, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (add_2_f64_x_untied, svfloat64_t, ++ z0 = svadd_n_f64_x (p0, z1, 2), ++ z0 = svadd_x (p0, z1, 2)) ++ ++/* ++** ptrue_add_f64_x_tied1: ++** fadd z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_f64_x_tied1, svfloat64_t, ++ z0 = svadd_f64_x (svptrue_b64 (), z0, z1), ++ z0 = svadd_x (svptrue_b64 (), z0, z1)) ++ ++/* ++** ptrue_add_f64_x_tied2: ++** fadd z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_f64_x_tied2, svfloat64_t, ++ z0 = svadd_f64_x (svptrue_b64 (), z1, z0), ++ z0 = svadd_x (svptrue_b64 (), z1, z0)) ++ ++/* ++** ptrue_add_f64_x_untied: ++** fadd z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_f64_x_untied, svfloat64_t, ++ z0 = svadd_f64_x (svptrue_b64 (), z1, z2), ++ z0 = svadd_x (svptrue_b64 (), z1, z2)) ++ ++/* ++** ptrue_add_1_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_1_f64_x_tied1, svfloat64_t, ++ z0 = svadd_n_f64_x (svptrue_b64 (), z0, 1), ++ z0 = svadd_x (svptrue_b64 (), z0, 1)) ++ ++/* ++** ptrue_add_1_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_1_f64_x_untied, svfloat64_t, ++ z0 = svadd_n_f64_x (svptrue_b64 (), z1, 1), ++ z0 = svadd_x (svptrue_b64 (), z1, 1)) ++ ++/* ++** ptrue_add_0p5_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_0p5_f64_x_tied1, svfloat64_t, ++ z0 = svadd_n_f64_x (svptrue_b64 (), z0, 0.5), ++ z0 = svadd_x (svptrue_b64 (), z0, 0.5)) ++ ++/* ++** ptrue_add_0p5_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_0p5_f64_x_untied, svfloat64_t, ++ z0 = svadd_n_f64_x (svptrue_b64 (), z1, 0.5), ++ z0 = svadd_x (svptrue_b64 (), z1, 0.5)) ++ ++/* ++** ptrue_add_m1_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_m1_f64_x_tied1, svfloat64_t, ++ z0 = svadd_n_f64_x (svptrue_b64 (), z0, -1), ++ z0 = svadd_x (svptrue_b64 (), z0, -1)) ++ ++/* ++** ptrue_add_m1_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_m1_f64_x_untied, svfloat64_t, ++ z0 = svadd_n_f64_x (svptrue_b64 (), z1, -1), ++ z0 = svadd_x (svptrue_b64 (), z1, -1)) ++ ++/* ++** ptrue_add_m0p5_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_m0p5_f64_x_tied1, svfloat64_t, ++ z0 = svadd_n_f64_x (svptrue_b64 (), z0, -0.5), ++ z0 = svadd_x (svptrue_b64 (), z0, -0.5)) ++ ++/* ++** ptrue_add_m0p5_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_m0p5_f64_x_untied, svfloat64_t, ++ z0 = svadd_n_f64_x (svptrue_b64 (), z1, -0.5), ++ z0 = svadd_x (svptrue_b64 (), z1, -0.5)) ++ ++/* ++** ptrue_add_2_f64_x_tied1: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** fadd z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_2_f64_x_tied1, svfloat64_t, ++ z0 = svadd_n_f64_x (svptrue_b64 (), z0, 2), ++ z0 = svadd_x (svptrue_b64 (), z0, 2)) ++ ++/* ++** ptrue_add_2_f64_x_untied: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
++** fadd z0\.d, (z1\.d, \1|\1, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_add_2_f64_x_untied, svfloat64_t, ++ z0 = svadd_n_f64_x (svptrue_b64 (), z1, 2), ++ z0 = svadd_x (svptrue_b64 (), z1, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_s16.c +new file mode 100644 +index 000000000..c0883edf9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_s16.c +@@ -0,0 +1,377 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** add_s16_m_tied1: ++** add z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (add_s16_m_tied1, svint16_t, ++ z0 = svadd_s16_m (p0, z0, z1), ++ z0 = svadd_m (p0, z0, z1)) ++ ++/* ++** add_s16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** add z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (add_s16_m_tied2, svint16_t, ++ z0 = svadd_s16_m (p0, z1, z0), ++ z0 = svadd_m (p0, z1, z0)) ++ ++/* ++** add_s16_m_untied: ++** movprfx z0, z1 ++** add z0\.h, p0/m, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (add_s16_m_untied, svint16_t, ++ z0 = svadd_s16_m (p0, z1, z2), ++ z0 = svadd_m (p0, z1, z2)) ++ ++/* ++** add_w0_s16_m_tied1: ++** mov (z[0-9]+\.h), w0 ++** add z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (add_w0_s16_m_tied1, svint16_t, int16_t, ++ z0 = svadd_n_s16_m (p0, z0, x0), ++ z0 = svadd_m (p0, z0, x0)) ++ ++/* ++** add_w0_s16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), w0 ++** movprfx z0, z1 ++** add z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (add_w0_s16_m_untied, svint16_t, int16_t, ++ z0 = svadd_n_s16_m (p0, z1, x0), ++ z0 = svadd_m (p0, z1, x0)) ++ ++/* ++** add_1_s16_m_tied1: ++** mov (z[0-9]+\.h), #1 ++** add z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_s16_m_tied1, svint16_t, ++ z0 = svadd_n_s16_m (p0, z0, 1), ++ z0 = svadd_m (p0, z0, 1)) ++ ++/* ++** add_1_s16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), #1 ++** movprfx z0, z1 ++** add z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_s16_m_untied, svint16_t, ++ z0 = svadd_n_s16_m (p0, z1, 1), ++ z0 = svadd_m (p0, z1, 1)) ++ ++/* ++** add_m2_s16_m: ++** mov (z[0-9]+\.h), #-2 ++** add z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m2_s16_m, svint16_t, ++ z0 = svadd_n_s16_m (p0, z0, -2), ++ z0 = svadd_m (p0, z0, -2)) ++ ++/* ++** add_s16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** add z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (add_s16_z_tied1, svint16_t, ++ z0 = svadd_s16_z (p0, z0, z1), ++ z0 = svadd_z (p0, z0, z1)) ++ ++/* ++** add_s16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** add z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (add_s16_z_tied2, svint16_t, ++ z0 = svadd_s16_z (p0, z1, z0), ++ z0 = svadd_z (p0, z1, z0)) ++ ++/* ++** add_s16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** add z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** add z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (add_s16_z_untied, svint16_t, ++ z0 = svadd_s16_z (p0, z1, z2), ++ z0 = svadd_z (p0, z1, z2)) ++ ++/* ++** add_w0_s16_z_tied1: ++** mov (z[0-9]+\.h), w0 ++** movprfx z0\.h, p0/z, z0\.h ++** add z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (add_w0_s16_z_tied1, svint16_t, int16_t, ++ z0 = svadd_n_s16_z (p0, z0, x0), ++ z0 = svadd_z (p0, z0, x0)) ++ ++/* ++** add_w0_s16_z_untied: ++** mov (z[0-9]+\.h), w0 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** add z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx 
z0\.h, p0/z, \1 ++** add z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (add_w0_s16_z_untied, svint16_t, int16_t, ++ z0 = svadd_n_s16_z (p0, z1, x0), ++ z0 = svadd_z (p0, z1, x0)) ++ ++/* ++** add_1_s16_z_tied1: ++** mov (z[0-9]+\.h), #1 ++** movprfx z0\.h, p0/z, z0\.h ++** add z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_s16_z_tied1, svint16_t, ++ z0 = svadd_n_s16_z (p0, z0, 1), ++ z0 = svadd_z (p0, z0, 1)) ++ ++/* ++** add_1_s16_z_untied: ++** mov (z[0-9]+\.h), #1 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** add z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** add z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_s16_z_untied, svint16_t, ++ z0 = svadd_n_s16_z (p0, z1, 1), ++ z0 = svadd_z (p0, z1, 1)) ++ ++/* ++** add_s16_x_tied1: ++** add z0\.h, (z0\.h, z1\.h|z1\.h, z0\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (add_s16_x_tied1, svint16_t, ++ z0 = svadd_s16_x (p0, z0, z1), ++ z0 = svadd_x (p0, z0, z1)) ++ ++/* ++** add_s16_x_tied2: ++** add z0\.h, (z0\.h, z1\.h|z1\.h, z0\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (add_s16_x_tied2, svint16_t, ++ z0 = svadd_s16_x (p0, z1, z0), ++ z0 = svadd_x (p0, z1, z0)) ++ ++/* ++** add_s16_x_untied: ++** add z0\.h, (z1\.h, z2\.h|z2\.h, z1\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (add_s16_x_untied, svint16_t, ++ z0 = svadd_s16_x (p0, z1, z2), ++ z0 = svadd_x (p0, z1, z2)) ++ ++/* ++** add_w0_s16_x_tied1: ++** mov (z[0-9]+\.h), w0 ++** add z0\.h, (z0\.h, \1|\1, z0\.h) ++** ret ++*/ ++TEST_UNIFORM_ZX (add_w0_s16_x_tied1, svint16_t, int16_t, ++ z0 = svadd_n_s16_x (p0, z0, x0), ++ z0 = svadd_x (p0, z0, x0)) ++ ++/* ++** add_w0_s16_x_untied: ++** mov (z[0-9]+\.h), w0 ++** add z0\.h, (z1\.h, \1|\1, z1\.h) ++** ret ++*/ ++TEST_UNIFORM_ZX (add_w0_s16_x_untied, svint16_t, int16_t, ++ z0 = svadd_n_s16_x (p0, z1, x0), ++ z0 = svadd_x (p0, z1, x0)) ++ ++/* ++** add_1_s16_x_tied1: ++** add z0\.h, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_s16_x_tied1, svint16_t, ++ z0 = svadd_n_s16_x (p0, z0, 1), ++ z0 = svadd_x (p0, z0, 1)) ++ ++/* ++** add_1_s16_x_untied: ++** movprfx z0, z1 ++** add z0\.h, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_s16_x_untied, svint16_t, ++ z0 = svadd_n_s16_x (p0, z1, 1), ++ z0 = svadd_x (p0, z1, 1)) ++ ++/* ++** add_127_s16_x: ++** add z0\.h, z0\.h, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (add_127_s16_x, svint16_t, ++ z0 = svadd_n_s16_x (p0, z0, 127), ++ z0 = svadd_x (p0, z0, 127)) ++ ++/* ++** add_128_s16_x: ++** add z0\.h, z0\.h, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (add_128_s16_x, svint16_t, ++ z0 = svadd_n_s16_x (p0, z0, 128), ++ z0 = svadd_x (p0, z0, 128)) ++ ++/* ++** add_255_s16_x: ++** add z0\.h, z0\.h, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (add_255_s16_x, svint16_t, ++ z0 = svadd_n_s16_x (p0, z0, 255), ++ z0 = svadd_x (p0, z0, 255)) ++ ++/* ++** add_256_s16_x: ++** add z0\.h, z0\.h, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (add_256_s16_x, svint16_t, ++ z0 = svadd_n_s16_x (p0, z0, 256), ++ z0 = svadd_x (p0, z0, 256)) ++ ++/* ++** add_257_s16_x: ++** mov (z[0-9]+)\.b, #1 ++** add z0\.h, (z0\.h, \1\.h|\1\.h, z0\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (add_257_s16_x, svint16_t, ++ z0 = svadd_n_s16_x (p0, z0, 257), ++ z0 = svadd_x (p0, z0, 257)) ++ ++/* ++** add_512_s16_x: ++** add z0\.h, z0\.h, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (add_512_s16_x, svint16_t, ++ z0 = svadd_n_s16_x (p0, z0, 512), ++ z0 = svadd_x (p0, z0, 512)) ++ ++/* ++** add_65280_s16_x: ++** add z0\.h, z0\.h, #65280 ++** ret ++*/ ++TEST_UNIFORM_Z (add_65280_s16_x, svint16_t, ++ z0 = svadd_n_s16_x (p0, z0, 0xff00), ++ z0 = svadd_x 
(p0, z0, 0xff00)) ++ ++/* ++** add_m1_s16_x: ++** sub z0\.h, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m1_s16_x, svint16_t, ++ z0 = svadd_n_s16_x (p0, z0, -1), ++ z0 = svadd_x (p0, z0, -1)) ++ ++/* ++** add_m127_s16_x: ++** sub z0\.h, z0\.h, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m127_s16_x, svint16_t, ++ z0 = svadd_n_s16_x (p0, z0, -127), ++ z0 = svadd_x (p0, z0, -127)) ++ ++/* ++** add_m128_s16_x: ++** sub z0\.h, z0\.h, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m128_s16_x, svint16_t, ++ z0 = svadd_n_s16_x (p0, z0, -128), ++ z0 = svadd_x (p0, z0, -128)) ++ ++/* ++** add_m255_s16_x: ++** sub z0\.h, z0\.h, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m255_s16_x, svint16_t, ++ z0 = svadd_n_s16_x (p0, z0, -255), ++ z0 = svadd_x (p0, z0, -255)) ++ ++/* ++** add_m256_s16_x: ++** add z0\.h, z0\.h, #65280 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m256_s16_x, svint16_t, ++ z0 = svadd_n_s16_x (p0, z0, -256), ++ z0 = svadd_x (p0, z0, -256)) ++ ++/* ++** add_m257_s16_x: ++** mov (z[0-9]+\.h), #-257 ++** add z0\.h, (z0\.h, \1|\1, z0\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (add_m257_s16_x, svint16_t, ++ z0 = svadd_n_s16_x (p0, z0, -257), ++ z0 = svadd_x (p0, z0, -257)) ++ ++/* ++** add_m512_s16_x: ++** add z0\.h, z0\.h, #65024 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m512_s16_x, svint16_t, ++ z0 = svadd_n_s16_x (p0, z0, -512), ++ z0 = svadd_x (p0, z0, -512)) ++ ++/* ++** add_m32768_s16_x: ++** add z0\.h, z0\.h, #32768 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m32768_s16_x, svint16_t, ++ z0 = svadd_n_s16_x (p0, z0, -0x8000), ++ z0 = svadd_x (p0, z0, -0x8000)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_s32.c +new file mode 100644 +index 000000000..887038ba3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_s32.c +@@ -0,0 +1,426 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** add_s32_m_tied1: ++** add z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (add_s32_m_tied1, svint32_t, ++ z0 = svadd_s32_m (p0, z0, z1), ++ z0 = svadd_m (p0, z0, z1)) ++ ++/* ++** add_s32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** add z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (add_s32_m_tied2, svint32_t, ++ z0 = svadd_s32_m (p0, z1, z0), ++ z0 = svadd_m (p0, z1, z0)) ++ ++/* ++** add_s32_m_untied: ++** movprfx z0, z1 ++** add z0\.s, p0/m, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (add_s32_m_untied, svint32_t, ++ z0 = svadd_s32_m (p0, z1, z2), ++ z0 = svadd_m (p0, z1, z2)) ++ ++/* ++** add_w0_s32_m_tied1: ++** mov (z[0-9]+\.s), w0 ++** add z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (add_w0_s32_m_tied1, svint32_t, int32_t, ++ z0 = svadd_n_s32_m (p0, z0, x0), ++ z0 = svadd_m (p0, z0, x0)) ++ ++/* ++** add_w0_s32_m_untied: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0, z1 ++** add z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (add_w0_s32_m_untied, svint32_t, int32_t, ++ z0 = svadd_n_s32_m (p0, z1, x0), ++ z0 = svadd_m (p0, z1, x0)) ++ ++/* ++** add_1_s32_m_tied1: ++** mov (z[0-9]+\.s), #1 ++** add z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_s32_m_tied1, svint32_t, ++ z0 = svadd_n_s32_m (p0, z0, 1), ++ z0 = svadd_m (p0, z0, 1)) ++ ++/* ++** add_1_s32_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.s), #1 ++** movprfx z0, z1 ++** add z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_s32_m_untied, svint32_t, ++ z0 = svadd_n_s32_m (p0, z1, 1), ++ z0 = svadd_m (p0, z1, 1)) ++ ++/* ++** add_m2_s32_m: ++** mov 
(z[0-9]+\.s), #-2 ++** add z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m2_s32_m, svint32_t, ++ z0 = svadd_n_s32_m (p0, z0, -2), ++ z0 = svadd_m (p0, z0, -2)) ++ ++/* ++** add_s32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** add z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (add_s32_z_tied1, svint32_t, ++ z0 = svadd_s32_z (p0, z0, z1), ++ z0 = svadd_z (p0, z0, z1)) ++ ++/* ++** add_s32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** add z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (add_s32_z_tied2, svint32_t, ++ z0 = svadd_s32_z (p0, z1, z0), ++ z0 = svadd_z (p0, z1, z0)) ++ ++/* ++** add_s32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** add z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** add z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (add_s32_z_untied, svint32_t, ++ z0 = svadd_s32_z (p0, z1, z2), ++ z0 = svadd_z (p0, z1, z2)) ++ ++/* ++** add_w0_s32_z_tied1: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0\.s, p0/z, z0\.s ++** add z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (add_w0_s32_z_tied1, svint32_t, int32_t, ++ z0 = svadd_n_s32_z (p0, z0, x0), ++ z0 = svadd_z (p0, z0, x0)) ++ ++/* ++** add_w0_s32_z_untied: ++** mov (z[0-9]+\.s), w0 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** add z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** add z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (add_w0_s32_z_untied, svint32_t, int32_t, ++ z0 = svadd_n_s32_z (p0, z1, x0), ++ z0 = svadd_z (p0, z1, x0)) ++ ++/* ++** add_1_s32_z_tied1: ++** mov (z[0-9]+\.s), #1 ++** movprfx z0\.s, p0/z, z0\.s ++** add z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_s32_z_tied1, svint32_t, ++ z0 = svadd_n_s32_z (p0, z0, 1), ++ z0 = svadd_z (p0, z0, 1)) ++ ++/* ++** add_1_s32_z_untied: ++** mov (z[0-9]+\.s), #1 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** add z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** add z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_s32_z_untied, svint32_t, ++ z0 = svadd_n_s32_z (p0, z1, 1), ++ z0 = svadd_z (p0, z1, 1)) ++ ++/* ++** add_s32_x_tied1: ++** add z0\.s, (z0\.s, z1\.s|z1\.s, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (add_s32_x_tied1, svint32_t, ++ z0 = svadd_s32_x (p0, z0, z1), ++ z0 = svadd_x (p0, z0, z1)) ++ ++/* ++** add_s32_x_tied2: ++** add z0\.s, (z0\.s, z1\.s|z1\.s, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (add_s32_x_tied2, svint32_t, ++ z0 = svadd_s32_x (p0, z1, z0), ++ z0 = svadd_x (p0, z1, z0)) ++ ++/* ++** add_s32_x_untied: ++** add z0\.s, (z1\.s, z2\.s|z2\.s, z1\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (add_s32_x_untied, svint32_t, ++ z0 = svadd_s32_x (p0, z1, z2), ++ z0 = svadd_x (p0, z1, z2)) ++ ++/* ++** add_w0_s32_x_tied1: ++** mov (z[0-9]+\.s), w0 ++** add z0\.s, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_ZX (add_w0_s32_x_tied1, svint32_t, int32_t, ++ z0 = svadd_n_s32_x (p0, z0, x0), ++ z0 = svadd_x (p0, z0, x0)) ++ ++/* ++** add_w0_s32_x_untied: ++** mov (z[0-9]+\.s), w0 ++** add z0\.s, (z1\.s, \1|\1, z1\.s) ++** ret ++*/ ++TEST_UNIFORM_ZX (add_w0_s32_x_untied, svint32_t, int32_t, ++ z0 = svadd_n_s32_x (p0, z1, x0), ++ z0 = svadd_x (p0, z1, x0)) ++ ++/* ++** add_1_s32_x_tied1: ++** add z0\.s, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_s32_x_tied1, svint32_t, ++ z0 = svadd_n_s32_x (p0, z0, 1), ++ z0 = svadd_x (p0, z0, 1)) ++ ++/* ++** add_1_s32_x_untied: ++** movprfx z0, z1 ++** add z0\.s, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_s32_x_untied, svint32_t, ++ z0 = svadd_n_s32_x (p0, z1, 1), ++ 
z0 = svadd_x (p0, z1, 1)) ++ ++/* ++** add_127_s32_x: ++** add z0\.s, z0\.s, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (add_127_s32_x, svint32_t, ++ z0 = svadd_n_s32_x (p0, z0, 127), ++ z0 = svadd_x (p0, z0, 127)) ++ ++/* ++** add_128_s32_x: ++** add z0\.s, z0\.s, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (add_128_s32_x, svint32_t, ++ z0 = svadd_n_s32_x (p0, z0, 128), ++ z0 = svadd_x (p0, z0, 128)) ++ ++/* ++** add_255_s32_x: ++** add z0\.s, z0\.s, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (add_255_s32_x, svint32_t, ++ z0 = svadd_n_s32_x (p0, z0, 255), ++ z0 = svadd_x (p0, z0, 255)) ++ ++/* ++** add_256_s32_x: ++** add z0\.s, z0\.s, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (add_256_s32_x, svint32_t, ++ z0 = svadd_n_s32_x (p0, z0, 256), ++ z0 = svadd_x (p0, z0, 256)) ++ ++/* ++** add_511_s32_x: ++** mov (z[0-9]+\.s), #511 ++** add z0\.s, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (add_511_s32_x, svint32_t, ++ z0 = svadd_n_s32_x (p0, z0, 511), ++ z0 = svadd_x (p0, z0, 511)) ++ ++/* ++** add_512_s32_x: ++** add z0\.s, z0\.s, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (add_512_s32_x, svint32_t, ++ z0 = svadd_n_s32_x (p0, z0, 512), ++ z0 = svadd_x (p0, z0, 512)) ++ ++/* ++** add_65280_s32_x: ++** add z0\.s, z0\.s, #65280 ++** ret ++*/ ++TEST_UNIFORM_Z (add_65280_s32_x, svint32_t, ++ z0 = svadd_n_s32_x (p0, z0, 0xff00), ++ z0 = svadd_x (p0, z0, 0xff00)) ++ ++/* ++** add_65535_s32_x: ++** mov (z[0-9]+\.s), #65535 ++** add z0\.s, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (add_65535_s32_x, svint32_t, ++ z0 = svadd_n_s32_x (p0, z0, 65535), ++ z0 = svadd_x (p0, z0, 65535)) ++ ++/* ++** add_65536_s32_x: ++** mov (z[0-9]+\.s), #65536 ++** add z0\.s, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (add_65536_s32_x, svint32_t, ++ z0 = svadd_n_s32_x (p0, z0, 65536), ++ z0 = svadd_x (p0, z0, 65536)) ++ ++/* ++** add_m1_s32_x: ++** sub z0\.s, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m1_s32_x, svint32_t, ++ z0 = svadd_n_s32_x (p0, z0, -1), ++ z0 = svadd_x (p0, z0, -1)) ++ ++/* ++** add_m127_s32_x: ++** sub z0\.s, z0\.s, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m127_s32_x, svint32_t, ++ z0 = svadd_n_s32_x (p0, z0, -127), ++ z0 = svadd_x (p0, z0, -127)) ++ ++/* ++** add_m128_s32_x: ++** sub z0\.s, z0\.s, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m128_s32_x, svint32_t, ++ z0 = svadd_n_s32_x (p0, z0, -128), ++ z0 = svadd_x (p0, z0, -128)) ++ ++/* ++** add_m255_s32_x: ++** sub z0\.s, z0\.s, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m255_s32_x, svint32_t, ++ z0 = svadd_n_s32_x (p0, z0, -255), ++ z0 = svadd_x (p0, z0, -255)) ++ ++/* ++** add_m256_s32_x: ++** sub z0\.s, z0\.s, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m256_s32_x, svint32_t, ++ z0 = svadd_n_s32_x (p0, z0, -256), ++ z0 = svadd_x (p0, z0, -256)) ++ ++/* ++** add_m511_s32_x: ++** mov (z[0-9]+\.s), #-511 ++** add z0\.s, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (add_m511_s32_x, svint32_t, ++ z0 = svadd_n_s32_x (p0, z0, -511), ++ z0 = svadd_x (p0, z0, -511)) ++ ++/* ++** add_m512_s32_x: ++** sub z0\.s, z0\.s, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m512_s32_x, svint32_t, ++ z0 = svadd_n_s32_x (p0, z0, -512), ++ z0 = svadd_x (p0, z0, -512)) ++ ++/* ++** add_m32768_s32_x: ++** sub z0\.s, z0\.s, #32768 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m32768_s32_x, svint32_t, ++ z0 = svadd_n_s32_x (p0, z0, -0x8000), ++ z0 = svadd_x (p0, z0, -0x8000)) ++ ++/* ++** add_m65280_s32_x: ++** sub z0\.s, z0\.s, #65280 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m65280_s32_x, svint32_t, ++ z0 = svadd_n_s32_x (p0, z0, -0xff00), ++ z0 = svadd_x (p0, z0, -0xff00)) ++ ++/* ++** 
add_m65535_s32_x: ++** mov (z[0-9]+\.s), #-65535 ++** add z0\.s, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (add_m65535_s32_x, svint32_t, ++ z0 = svadd_n_s32_x (p0, z0, -65535), ++ z0 = svadd_x (p0, z0, -65535)) ++ ++/* ++** add_m65536_s32_x: ++** mov (z[0-9]+\.s), #-65536 ++** add z0\.s, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (add_m65536_s32_x, svint32_t, ++ z0 = svadd_n_s32_x (p0, z0, -65536), ++ z0 = svadd_x (p0, z0, -65536)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_s64.c +new file mode 100644 +index 000000000..aab63ef62 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_s64.c +@@ -0,0 +1,426 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** add_s64_m_tied1: ++** add z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (add_s64_m_tied1, svint64_t, ++ z0 = svadd_s64_m (p0, z0, z1), ++ z0 = svadd_m (p0, z0, z1)) ++ ++/* ++** add_s64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** add z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_s64_m_tied2, svint64_t, ++ z0 = svadd_s64_m (p0, z1, z0), ++ z0 = svadd_m (p0, z1, z0)) ++ ++/* ++** add_s64_m_untied: ++** movprfx z0, z1 ++** add z0\.d, p0/m, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (add_s64_m_untied, svint64_t, ++ z0 = svadd_s64_m (p0, z1, z2), ++ z0 = svadd_m (p0, z1, z2)) ++ ++/* ++** add_x0_s64_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** add z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (add_x0_s64_m_tied1, svint64_t, int64_t, ++ z0 = svadd_n_s64_m (p0, z0, x0), ++ z0 = svadd_m (p0, z0, x0)) ++ ++/* ++** add_x0_s64_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** add z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (add_x0_s64_m_untied, svint64_t, int64_t, ++ z0 = svadd_n_s64_m (p0, z1, x0), ++ z0 = svadd_m (p0, z1, x0)) ++ ++/* ++** add_1_s64_m_tied1: ++** mov (z[0-9]+\.d), #1 ++** add z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_s64_m_tied1, svint64_t, ++ z0 = svadd_n_s64_m (p0, z0, 1), ++ z0 = svadd_m (p0, z0, 1)) ++ ++/* ++** add_1_s64_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #1 ++** movprfx z0, z1 ++** add z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_s64_m_untied, svint64_t, ++ z0 = svadd_n_s64_m (p0, z1, 1), ++ z0 = svadd_m (p0, z1, 1)) ++ ++/* ++** add_m2_s64_m: ++** mov (z[0-9]+\.d), #-2 ++** add z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m2_s64_m, svint64_t, ++ z0 = svadd_n_s64_m (p0, z0, -2), ++ z0 = svadd_m (p0, z0, -2)) ++ ++/* ++** add_s64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** add z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (add_s64_z_tied1, svint64_t, ++ z0 = svadd_s64_z (p0, z0, z1), ++ z0 = svadd_z (p0, z0, z1)) ++ ++/* ++** add_s64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** add z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (add_s64_z_tied2, svint64_t, ++ z0 = svadd_s64_z (p0, z1, z0), ++ z0 = svadd_z (p0, z1, z0)) ++ ++/* ++** add_s64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** add z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** add z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (add_s64_z_untied, svint64_t, ++ z0 = svadd_s64_z (p0, z1, z2), ++ z0 = svadd_z (p0, z1, z2)) ++ ++/* ++** add_x0_s64_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z0\.d ++** add z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX 
(add_x0_s64_z_tied1, svint64_t, int64_t, ++ z0 = svadd_n_s64_z (p0, z0, x0), ++ z0 = svadd_z (p0, z0, x0)) ++ ++/* ++** add_x0_s64_z_untied: ++** mov (z[0-9]+\.d), x0 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** add z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** add z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (add_x0_s64_z_untied, svint64_t, int64_t, ++ z0 = svadd_n_s64_z (p0, z1, x0), ++ z0 = svadd_z (p0, z1, x0)) ++ ++/* ++** add_1_s64_z_tied1: ++** mov (z[0-9]+\.d), #1 ++** movprfx z0\.d, p0/z, z0\.d ++** add z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_s64_z_tied1, svint64_t, ++ z0 = svadd_n_s64_z (p0, z0, 1), ++ z0 = svadd_z (p0, z0, 1)) ++ ++/* ++** add_1_s64_z_untied: ++** mov (z[0-9]+\.d), #1 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** add z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** add z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_s64_z_untied, svint64_t, ++ z0 = svadd_n_s64_z (p0, z1, 1), ++ z0 = svadd_z (p0, z1, 1)) ++ ++/* ++** add_s64_x_tied1: ++** add z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (add_s64_x_tied1, svint64_t, ++ z0 = svadd_s64_x (p0, z0, z1), ++ z0 = svadd_x (p0, z0, z1)) ++ ++/* ++** add_s64_x_tied2: ++** add z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (add_s64_x_tied2, svint64_t, ++ z0 = svadd_s64_x (p0, z1, z0), ++ z0 = svadd_x (p0, z1, z0)) ++ ++/* ++** add_s64_x_untied: ++** add z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (add_s64_x_untied, svint64_t, ++ z0 = svadd_s64_x (p0, z1, z2), ++ z0 = svadd_x (p0, z1, z2)) ++ ++/* ++** add_x0_s64_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** add z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (add_x0_s64_x_tied1, svint64_t, int64_t, ++ z0 = svadd_n_s64_x (p0, z0, x0), ++ z0 = svadd_x (p0, z0, x0)) ++ ++/* ++** add_x0_s64_x_untied: ++** mov (z[0-9]+\.d), x0 ++** add z0\.d, (z1\.d, \1|\1, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (add_x0_s64_x_untied, svint64_t, int64_t, ++ z0 = svadd_n_s64_x (p0, z1, x0), ++ z0 = svadd_x (p0, z1, x0)) ++ ++/* ++** add_1_s64_x_tied1: ++** add z0\.d, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_s64_x_tied1, svint64_t, ++ z0 = svadd_n_s64_x (p0, z0, 1), ++ z0 = svadd_x (p0, z0, 1)) ++ ++/* ++** add_1_s64_x_untied: ++** movprfx z0, z1 ++** add z0\.d, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_s64_x_untied, svint64_t, ++ z0 = svadd_n_s64_x (p0, z1, 1), ++ z0 = svadd_x (p0, z1, 1)) ++ ++/* ++** add_127_s64_x: ++** add z0\.d, z0\.d, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (add_127_s64_x, svint64_t, ++ z0 = svadd_n_s64_x (p0, z0, 127), ++ z0 = svadd_x (p0, z0, 127)) ++ ++/* ++** add_128_s64_x: ++** add z0\.d, z0\.d, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (add_128_s64_x, svint64_t, ++ z0 = svadd_n_s64_x (p0, z0, 128), ++ z0 = svadd_x (p0, z0, 128)) ++ ++/* ++** add_255_s64_x: ++** add z0\.d, z0\.d, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (add_255_s64_x, svint64_t, ++ z0 = svadd_n_s64_x (p0, z0, 255), ++ z0 = svadd_x (p0, z0, 255)) ++ ++/* ++** add_256_s64_x: ++** add z0\.d, z0\.d, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (add_256_s64_x, svint64_t, ++ z0 = svadd_n_s64_x (p0, z0, 256), ++ z0 = svadd_x (p0, z0, 256)) ++ ++/* ++** add_511_s64_x: ++** mov (z[0-9]+\.d), #511 ++** add z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (add_511_s64_x, svint64_t, ++ z0 = svadd_n_s64_x (p0, z0, 511), ++ z0 = svadd_x (p0, z0, 511)) ++ ++/* ++** add_512_s64_x: ++** add z0\.d, z0\.d, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (add_512_s64_x, 
svint64_t, ++ z0 = svadd_n_s64_x (p0, z0, 512), ++ z0 = svadd_x (p0, z0, 512)) ++ ++/* ++** add_65280_s64_x: ++** add z0\.d, z0\.d, #65280 ++** ret ++*/ ++TEST_UNIFORM_Z (add_65280_s64_x, svint64_t, ++ z0 = svadd_n_s64_x (p0, z0, 0xff00), ++ z0 = svadd_x (p0, z0, 0xff00)) ++ ++/* ++** add_65535_s64_x: ++** mov (z[0-9]+\.d), #65535 ++** add z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (add_65535_s64_x, svint64_t, ++ z0 = svadd_n_s64_x (p0, z0, 65535), ++ z0 = svadd_x (p0, z0, 65535)) ++ ++/* ++** add_65536_s64_x: ++** mov (z[0-9]+\.d), #65536 ++** add z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (add_65536_s64_x, svint64_t, ++ z0 = svadd_n_s64_x (p0, z0, 65536), ++ z0 = svadd_x (p0, z0, 65536)) ++ ++/* ++** add_m1_s64_x: ++** sub z0\.d, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m1_s64_x, svint64_t, ++ z0 = svadd_n_s64_x (p0, z0, -1), ++ z0 = svadd_x (p0, z0, -1)) ++ ++/* ++** add_m127_s64_x: ++** sub z0\.d, z0\.d, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m127_s64_x, svint64_t, ++ z0 = svadd_n_s64_x (p0, z0, -127), ++ z0 = svadd_x (p0, z0, -127)) ++ ++/* ++** add_m128_s64_x: ++** sub z0\.d, z0\.d, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m128_s64_x, svint64_t, ++ z0 = svadd_n_s64_x (p0, z0, -128), ++ z0 = svadd_x (p0, z0, -128)) ++ ++/* ++** add_m255_s64_x: ++** sub z0\.d, z0\.d, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m255_s64_x, svint64_t, ++ z0 = svadd_n_s64_x (p0, z0, -255), ++ z0 = svadd_x (p0, z0, -255)) ++ ++/* ++** add_m256_s64_x: ++** sub z0\.d, z0\.d, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m256_s64_x, svint64_t, ++ z0 = svadd_n_s64_x (p0, z0, -256), ++ z0 = svadd_x (p0, z0, -256)) ++ ++/* ++** add_m511_s64_x: ++** mov (z[0-9]+\.d), #-511 ++** add z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (add_m511_s64_x, svint64_t, ++ z0 = svadd_n_s64_x (p0, z0, -511), ++ z0 = svadd_x (p0, z0, -511)) ++ ++/* ++** add_m512_s64_x: ++** sub z0\.d, z0\.d, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m512_s64_x, svint64_t, ++ z0 = svadd_n_s64_x (p0, z0, -512), ++ z0 = svadd_x (p0, z0, -512)) ++ ++/* ++** add_m32768_s64_x: ++** sub z0\.d, z0\.d, #32768 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m32768_s64_x, svint64_t, ++ z0 = svadd_n_s64_x (p0, z0, -0x8000), ++ z0 = svadd_x (p0, z0, -0x8000)) ++ ++/* ++** add_m65280_s64_x: ++** sub z0\.d, z0\.d, #65280 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m65280_s64_x, svint64_t, ++ z0 = svadd_n_s64_x (p0, z0, -0xff00), ++ z0 = svadd_x (p0, z0, -0xff00)) ++ ++/* ++** add_m65535_s64_x: ++** mov (z[0-9]+\.d), #-65535 ++** add z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (add_m65535_s64_x, svint64_t, ++ z0 = svadd_n_s64_x (p0, z0, -65535), ++ z0 = svadd_x (p0, z0, -65535)) ++ ++/* ++** add_m65536_s64_x: ++** mov (z[0-9]+\.d), #-65536 ++** add z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (add_m65536_s64_x, svint64_t, ++ z0 = svadd_n_s64_x (p0, z0, -65536), ++ z0 = svadd_x (p0, z0, -65536)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_s8.c +new file mode 100644 +index 000000000..0889c189d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_s8.c +@@ -0,0 +1,294 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** add_s8_m_tied1: ++** add z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (add_s8_m_tied1, svint8_t, ++ z0 = svadd_s8_m (p0, z0, z1), ++ z0 = svadd_m (p0, z0, z1)) ++ ++/* ++** add_s8_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** 
movprfx z0, z1 ++** add z0\.b, p0/m, z0\.b, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (add_s8_m_tied2, svint8_t, ++ z0 = svadd_s8_m (p0, z1, z0), ++ z0 = svadd_m (p0, z1, z0)) ++ ++/* ++** add_s8_m_untied: ++** movprfx z0, z1 ++** add z0\.b, p0/m, z0\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (add_s8_m_untied, svint8_t, ++ z0 = svadd_s8_m (p0, z1, z2), ++ z0 = svadd_m (p0, z1, z2)) ++ ++/* ++** add_w0_s8_m_tied1: ++** mov (z[0-9]+\.b), w0 ++** add z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (add_w0_s8_m_tied1, svint8_t, int8_t, ++ z0 = svadd_n_s8_m (p0, z0, x0), ++ z0 = svadd_m (p0, z0, x0)) ++ ++/* ++** add_w0_s8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), w0 ++** movprfx z0, z1 ++** add z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (add_w0_s8_m_untied, svint8_t, int8_t, ++ z0 = svadd_n_s8_m (p0, z1, x0), ++ z0 = svadd_m (p0, z1, x0)) ++ ++/* ++** add_1_s8_m_tied1: ++** mov (z[0-9]+\.b), #1 ++** add z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_s8_m_tied1, svint8_t, ++ z0 = svadd_n_s8_m (p0, z0, 1), ++ z0 = svadd_m (p0, z0, 1)) ++ ++/* ++** add_1_s8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), #1 ++** movprfx z0, z1 ++** add z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_s8_m_untied, svint8_t, ++ z0 = svadd_n_s8_m (p0, z1, 1), ++ z0 = svadd_m (p0, z1, 1)) ++ ++/* ++** add_m1_s8_m: ++** mov (z[0-9]+\.b), #-1 ++** add z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m1_s8_m, svint8_t, ++ z0 = svadd_n_s8_m (p0, z0, -1), ++ z0 = svadd_m (p0, z0, -1)) ++ ++/* ++** add_s8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** add z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (add_s8_z_tied1, svint8_t, ++ z0 = svadd_s8_z (p0, z0, z1), ++ z0 = svadd_z (p0, z0, z1)) ++ ++/* ++** add_s8_z_tied2: ++** movprfx z0\.b, p0/z, z0\.b ++** add z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (add_s8_z_tied2, svint8_t, ++ z0 = svadd_s8_z (p0, z1, z0), ++ z0 = svadd_z (p0, z1, z0)) ++ ++/* ++** add_s8_z_untied: ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** add z0\.b, p0/m, z0\.b, z2\.b ++** | ++** movprfx z0\.b, p0/z, z2\.b ++** add z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (add_s8_z_untied, svint8_t, ++ z0 = svadd_s8_z (p0, z1, z2), ++ z0 = svadd_z (p0, z1, z2)) ++ ++/* ++** add_w0_s8_z_tied1: ++** mov (z[0-9]+\.b), w0 ++** movprfx z0\.b, p0/z, z0\.b ++** add z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (add_w0_s8_z_tied1, svint8_t, int8_t, ++ z0 = svadd_n_s8_z (p0, z0, x0), ++ z0 = svadd_z (p0, z0, x0)) ++ ++/* ++** add_w0_s8_z_untied: ++** mov (z[0-9]+\.b), w0 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** add z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** add z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (add_w0_s8_z_untied, svint8_t, int8_t, ++ z0 = svadd_n_s8_z (p0, z1, x0), ++ z0 = svadd_z (p0, z1, x0)) ++ ++/* ++** add_1_s8_z_tied1: ++** mov (z[0-9]+\.b), #1 ++** movprfx z0\.b, p0/z, z0\.b ++** add z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_s8_z_tied1, svint8_t, ++ z0 = svadd_n_s8_z (p0, z0, 1), ++ z0 = svadd_z (p0, z0, 1)) ++ ++/* ++** add_1_s8_z_untied: ++** mov (z[0-9]+\.b), #1 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** add z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** add z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_s8_z_untied, svint8_t, ++ z0 = svadd_n_s8_z (p0, z1, 1), ++ z0 = svadd_z (p0, z1, 1)) ++ ++/* ++** add_s8_x_tied1: ++** add z0\.b, (z0\.b, z1\.b|z1\.b, z0\.b) ++** ret ++*/ ++TEST_UNIFORM_Z 
(add_s8_x_tied1, svint8_t, ++ z0 = svadd_s8_x (p0, z0, z1), ++ z0 = svadd_x (p0, z0, z1)) ++ ++/* ++** add_s8_x_tied2: ++** add z0\.b, (z0\.b, z1\.b|z1\.b, z0\.b) ++** ret ++*/ ++TEST_UNIFORM_Z (add_s8_x_tied2, svint8_t, ++ z0 = svadd_s8_x (p0, z1, z0), ++ z0 = svadd_x (p0, z1, z0)) ++ ++/* ++** add_s8_x_untied: ++** add z0\.b, (z1\.b, z2\.b|z2\.b, z1\.b) ++** ret ++*/ ++TEST_UNIFORM_Z (add_s8_x_untied, svint8_t, ++ z0 = svadd_s8_x (p0, z1, z2), ++ z0 = svadd_x (p0, z1, z2)) ++ ++/* ++** add_w0_s8_x_tied1: ++** mov (z[0-9]+\.b), w0 ++** add z0\.b, (z0\.b, \1|\1, z0\.b) ++** ret ++*/ ++TEST_UNIFORM_ZX (add_w0_s8_x_tied1, svint8_t, int8_t, ++ z0 = svadd_n_s8_x (p0, z0, x0), ++ z0 = svadd_x (p0, z0, x0)) ++ ++/* ++** add_w0_s8_x_untied: ++** mov (z[0-9]+\.b), w0 ++** add z0\.b, (z1\.b, \1|\1, z1\.b) ++** ret ++*/ ++TEST_UNIFORM_ZX (add_w0_s8_x_untied, svint8_t, int8_t, ++ z0 = svadd_n_s8_x (p0, z1, x0), ++ z0 = svadd_x (p0, z1, x0)) ++ ++/* ++** add_1_s8_x_tied1: ++** add z0\.b, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_s8_x_tied1, svint8_t, ++ z0 = svadd_n_s8_x (p0, z0, 1), ++ z0 = svadd_x (p0, z0, 1)) ++ ++/* ++** add_1_s8_x_untied: ++** movprfx z0, z1 ++** add z0\.b, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_s8_x_untied, svint8_t, ++ z0 = svadd_n_s8_x (p0, z1, 1), ++ z0 = svadd_x (p0, z1, 1)) ++ ++/* ++** add_127_s8_x: ++** add z0\.b, z0\.b, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (add_127_s8_x, svint8_t, ++ z0 = svadd_n_s8_x (p0, z0, 127), ++ z0 = svadd_x (p0, z0, 127)) ++ ++/* ++** add_128_s8_x: ++** add z0\.b, z0\.b, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (add_128_s8_x, svint8_t, ++ z0 = svadd_n_s8_x (p0, z0, 128), ++ z0 = svadd_x (p0, z0, 128)) ++ ++/* ++** add_255_s8_x: ++** add z0\.b, z0\.b, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (add_255_s8_x, svint8_t, ++ z0 = svadd_n_s8_x (p0, z0, 255), ++ z0 = svadd_x (p0, z0, 255)) ++ ++/* ++** add_m1_s8_x: ++** add z0\.b, z0\.b, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m1_s8_x, svint8_t, ++ z0 = svadd_n_s8_x (p0, z0, -1), ++ z0 = svadd_x (p0, z0, -1)) ++ ++/* ++** add_m127_s8_x: ++** add z0\.b, z0\.b, #129 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m127_s8_x, svint8_t, ++ z0 = svadd_n_s8_x (p0, z0, -127), ++ z0 = svadd_x (p0, z0, -127)) ++ ++/* ++** add_m128_s8_x: ++** add z0\.b, z0\.b, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m128_s8_x, svint8_t, ++ z0 = svadd_n_s8_x (p0, z0, -128), ++ z0 = svadd_x (p0, z0, -128)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_u16.c +new file mode 100644 +index 000000000..25cb90353 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_u16.c +@@ -0,0 +1,377 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** add_u16_m_tied1: ++** add z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (add_u16_m_tied1, svuint16_t, ++ z0 = svadd_u16_m (p0, z0, z1), ++ z0 = svadd_m (p0, z0, z1)) ++ ++/* ++** add_u16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** add z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (add_u16_m_tied2, svuint16_t, ++ z0 = svadd_u16_m (p0, z1, z0), ++ z0 = svadd_m (p0, z1, z0)) ++ ++/* ++** add_u16_m_untied: ++** movprfx z0, z1 ++** add z0\.h, p0/m, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (add_u16_m_untied, svuint16_t, ++ z0 = svadd_u16_m (p0, z1, z2), ++ z0 = svadd_m (p0, z1, z2)) ++ ++/* ++** add_w0_u16_m_tied1: ++** mov (z[0-9]+\.h), w0 ++** add z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX 
(add_w0_u16_m_tied1, svuint16_t, uint16_t, ++ z0 = svadd_n_u16_m (p0, z0, x0), ++ z0 = svadd_m (p0, z0, x0)) ++ ++/* ++** add_w0_u16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), w0 ++** movprfx z0, z1 ++** add z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (add_w0_u16_m_untied, svuint16_t, uint16_t, ++ z0 = svadd_n_u16_m (p0, z1, x0), ++ z0 = svadd_m (p0, z1, x0)) ++ ++/* ++** add_1_u16_m_tied1: ++** mov (z[0-9]+\.h), #1 ++** add z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_u16_m_tied1, svuint16_t, ++ z0 = svadd_n_u16_m (p0, z0, 1), ++ z0 = svadd_m (p0, z0, 1)) ++ ++/* ++** add_1_u16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), #1 ++** movprfx z0, z1 ++** add z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_u16_m_untied, svuint16_t, ++ z0 = svadd_n_u16_m (p0, z1, 1), ++ z0 = svadd_m (p0, z1, 1)) ++ ++/* ++** add_m2_u16_m: ++** mov (z[0-9]+\.h), #-2 ++** add z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m2_u16_m, svuint16_t, ++ z0 = svadd_n_u16_m (p0, z0, -2), ++ z0 = svadd_m (p0, z0, -2)) ++ ++/* ++** add_u16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** add z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (add_u16_z_tied1, svuint16_t, ++ z0 = svadd_u16_z (p0, z0, z1), ++ z0 = svadd_z (p0, z0, z1)) ++ ++/* ++** add_u16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** add z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (add_u16_z_tied2, svuint16_t, ++ z0 = svadd_u16_z (p0, z1, z0), ++ z0 = svadd_z (p0, z1, z0)) ++ ++/* ++** add_u16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** add z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** add z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (add_u16_z_untied, svuint16_t, ++ z0 = svadd_u16_z (p0, z1, z2), ++ z0 = svadd_z (p0, z1, z2)) ++ ++/* ++** add_w0_u16_z_tied1: ++** mov (z[0-9]+\.h), w0 ++** movprfx z0\.h, p0/z, z0\.h ++** add z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (add_w0_u16_z_tied1, svuint16_t, uint16_t, ++ z0 = svadd_n_u16_z (p0, z0, x0), ++ z0 = svadd_z (p0, z0, x0)) ++ ++/* ++** add_w0_u16_z_untied: ++** mov (z[0-9]+\.h), w0 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** add z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** add z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (add_w0_u16_z_untied, svuint16_t, uint16_t, ++ z0 = svadd_n_u16_z (p0, z1, x0), ++ z0 = svadd_z (p0, z1, x0)) ++ ++/* ++** add_1_u16_z_tied1: ++** mov (z[0-9]+\.h), #1 ++** movprfx z0\.h, p0/z, z0\.h ++** add z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_u16_z_tied1, svuint16_t, ++ z0 = svadd_n_u16_z (p0, z0, 1), ++ z0 = svadd_z (p0, z0, 1)) ++ ++/* ++** add_1_u16_z_untied: ++** mov (z[0-9]+\.h), #1 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** add z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** add z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_u16_z_untied, svuint16_t, ++ z0 = svadd_n_u16_z (p0, z1, 1), ++ z0 = svadd_z (p0, z1, 1)) ++ ++/* ++** add_u16_x_tied1: ++** add z0\.h, (z0\.h, z1\.h|z1\.h, z0\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (add_u16_x_tied1, svuint16_t, ++ z0 = svadd_u16_x (p0, z0, z1), ++ z0 = svadd_x (p0, z0, z1)) ++ ++/* ++** add_u16_x_tied2: ++** add z0\.h, (z0\.h, z1\.h|z1\.h, z0\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (add_u16_x_tied2, svuint16_t, ++ z0 = svadd_u16_x (p0, z1, z0), ++ z0 = svadd_x (p0, z1, z0)) ++ ++/* ++** add_u16_x_untied: ++** add z0\.h, (z1\.h, z2\.h|z2\.h, z1\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (add_u16_x_untied, svuint16_t, ++ 
z0 = svadd_u16_x (p0, z1, z2), ++ z0 = svadd_x (p0, z1, z2)) ++ ++/* ++** add_w0_u16_x_tied1: ++** mov (z[0-9]+\.h), w0 ++** add z0\.h, (z0\.h, \1|\1, z0\.h) ++** ret ++*/ ++TEST_UNIFORM_ZX (add_w0_u16_x_tied1, svuint16_t, uint16_t, ++ z0 = svadd_n_u16_x (p0, z0, x0), ++ z0 = svadd_x (p0, z0, x0)) ++ ++/* ++** add_w0_u16_x_untied: ++** mov (z[0-9]+\.h), w0 ++** add z0\.h, (z1\.h, \1|\1, z1\.h) ++** ret ++*/ ++TEST_UNIFORM_ZX (add_w0_u16_x_untied, svuint16_t, uint16_t, ++ z0 = svadd_n_u16_x (p0, z1, x0), ++ z0 = svadd_x (p0, z1, x0)) ++ ++/* ++** add_1_u16_x_tied1: ++** add z0\.h, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_u16_x_tied1, svuint16_t, ++ z0 = svadd_n_u16_x (p0, z0, 1), ++ z0 = svadd_x (p0, z0, 1)) ++ ++/* ++** add_1_u16_x_untied: ++** movprfx z0, z1 ++** add z0\.h, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_u16_x_untied, svuint16_t, ++ z0 = svadd_n_u16_x (p0, z1, 1), ++ z0 = svadd_x (p0, z1, 1)) ++ ++/* ++** add_127_u16_x: ++** add z0\.h, z0\.h, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (add_127_u16_x, svuint16_t, ++ z0 = svadd_n_u16_x (p0, z0, 127), ++ z0 = svadd_x (p0, z0, 127)) ++ ++/* ++** add_128_u16_x: ++** add z0\.h, z0\.h, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (add_128_u16_x, svuint16_t, ++ z0 = svadd_n_u16_x (p0, z0, 128), ++ z0 = svadd_x (p0, z0, 128)) ++ ++/* ++** add_255_u16_x: ++** add z0\.h, z0\.h, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (add_255_u16_x, svuint16_t, ++ z0 = svadd_n_u16_x (p0, z0, 255), ++ z0 = svadd_x (p0, z0, 255)) ++ ++/* ++** add_256_u16_x: ++** add z0\.h, z0\.h, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (add_256_u16_x, svuint16_t, ++ z0 = svadd_n_u16_x (p0, z0, 256), ++ z0 = svadd_x (p0, z0, 256)) ++ ++/* ++** add_257_u16_x: ++** mov (z[0-9]+)\.b, #1 ++** add z0\.h, (z0\.h, \1\.h|\1\.h, z0\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (add_257_u16_x, svuint16_t, ++ z0 = svadd_n_u16_x (p0, z0, 257), ++ z0 = svadd_x (p0, z0, 257)) ++ ++/* ++** add_512_u16_x: ++** add z0\.h, z0\.h, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (add_512_u16_x, svuint16_t, ++ z0 = svadd_n_u16_x (p0, z0, 512), ++ z0 = svadd_x (p0, z0, 512)) ++ ++/* ++** add_65280_u16_x: ++** add z0\.h, z0\.h, #65280 ++** ret ++*/ ++TEST_UNIFORM_Z (add_65280_u16_x, svuint16_t, ++ z0 = svadd_n_u16_x (p0, z0, 0xff00), ++ z0 = svadd_x (p0, z0, 0xff00)) ++ ++/* ++** add_m1_u16_x: ++** sub z0\.h, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m1_u16_x, svuint16_t, ++ z0 = svadd_n_u16_x (p0, z0, -1), ++ z0 = svadd_x (p0, z0, -1)) ++ ++/* ++** add_m127_u16_x: ++** sub z0\.h, z0\.h, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m127_u16_x, svuint16_t, ++ z0 = svadd_n_u16_x (p0, z0, -127), ++ z0 = svadd_x (p0, z0, -127)) ++ ++/* ++** add_m128_u16_x: ++** sub z0\.h, z0\.h, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m128_u16_x, svuint16_t, ++ z0 = svadd_n_u16_x (p0, z0, -128), ++ z0 = svadd_x (p0, z0, -128)) ++ ++/* ++** add_m255_u16_x: ++** sub z0\.h, z0\.h, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m255_u16_x, svuint16_t, ++ z0 = svadd_n_u16_x (p0, z0, -255), ++ z0 = svadd_x (p0, z0, -255)) ++ ++/* ++** add_m256_u16_x: ++** add z0\.h, z0\.h, #65280 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m256_u16_x, svuint16_t, ++ z0 = svadd_n_u16_x (p0, z0, -256), ++ z0 = svadd_x (p0, z0, -256)) ++ ++/* ++** add_m257_u16_x: ++** mov (z[0-9]+\.h), #-257 ++** add z0\.h, (z0\.h, \1|\1, z0\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (add_m257_u16_x, svuint16_t, ++ z0 = svadd_n_u16_x (p0, z0, -257), ++ z0 = svadd_x (p0, z0, -257)) ++ ++/* ++** add_m512_u16_x: ++** add z0\.h, z0\.h, #65024 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m512_u16_x, svuint16_t, ++ z0 = 
svadd_n_u16_x (p0, z0, -512), ++ z0 = svadd_x (p0, z0, -512)) ++ ++/* ++** add_m32768_u16_x: ++** add z0\.h, z0\.h, #32768 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m32768_u16_x, svuint16_t, ++ z0 = svadd_n_u16_x (p0, z0, -0x8000), ++ z0 = svadd_x (p0, z0, -0x8000)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_u32.c +new file mode 100644 +index 000000000..ee979489b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_u32.c +@@ -0,0 +1,426 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** add_u32_m_tied1: ++** add z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (add_u32_m_tied1, svuint32_t, ++ z0 = svadd_u32_m (p0, z0, z1), ++ z0 = svadd_m (p0, z0, z1)) ++ ++/* ++** add_u32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** add z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (add_u32_m_tied2, svuint32_t, ++ z0 = svadd_u32_m (p0, z1, z0), ++ z0 = svadd_m (p0, z1, z0)) ++ ++/* ++** add_u32_m_untied: ++** movprfx z0, z1 ++** add z0\.s, p0/m, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (add_u32_m_untied, svuint32_t, ++ z0 = svadd_u32_m (p0, z1, z2), ++ z0 = svadd_m (p0, z1, z2)) ++ ++/* ++** add_w0_u32_m_tied1: ++** mov (z[0-9]+\.s), w0 ++** add z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (add_w0_u32_m_tied1, svuint32_t, uint32_t, ++ z0 = svadd_n_u32_m (p0, z0, x0), ++ z0 = svadd_m (p0, z0, x0)) ++ ++/* ++** add_w0_u32_m_untied: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0, z1 ++** add z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (add_w0_u32_m_untied, svuint32_t, uint32_t, ++ z0 = svadd_n_u32_m (p0, z1, x0), ++ z0 = svadd_m (p0, z1, x0)) ++ ++/* ++** add_1_u32_m_tied1: ++** mov (z[0-9]+\.s), #1 ++** add z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_u32_m_tied1, svuint32_t, ++ z0 = svadd_n_u32_m (p0, z0, 1), ++ z0 = svadd_m (p0, z0, 1)) ++ ++/* ++** add_1_u32_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.s), #1 ++** movprfx z0, z1 ++** add z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_u32_m_untied, svuint32_t, ++ z0 = svadd_n_u32_m (p0, z1, 1), ++ z0 = svadd_m (p0, z1, 1)) ++ ++/* ++** add_m2_u32_m: ++** mov (z[0-9]+\.s), #-2 ++** add z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m2_u32_m, svuint32_t, ++ z0 = svadd_n_u32_m (p0, z0, -2), ++ z0 = svadd_m (p0, z0, -2)) ++ ++/* ++** add_u32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** add z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (add_u32_z_tied1, svuint32_t, ++ z0 = svadd_u32_z (p0, z0, z1), ++ z0 = svadd_z (p0, z0, z1)) ++ ++/* ++** add_u32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** add z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (add_u32_z_tied2, svuint32_t, ++ z0 = svadd_u32_z (p0, z1, z0), ++ z0 = svadd_z (p0, z1, z0)) ++ ++/* ++** add_u32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** add z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** add z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (add_u32_z_untied, svuint32_t, ++ z0 = svadd_u32_z (p0, z1, z2), ++ z0 = svadd_z (p0, z1, z2)) ++ ++/* ++** add_w0_u32_z_tied1: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0\.s, p0/z, z0\.s ++** add z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (add_w0_u32_z_tied1, svuint32_t, uint32_t, ++ z0 = svadd_n_u32_z (p0, z0, x0), ++ z0 = svadd_z (p0, z0, x0)) ++ ++/* ++** add_w0_u32_z_untied: ++** mov (z[0-9]+\.s), w0 ++** ( ++** movprfx z0\.s, 
p0/z, z1\.s ++** add z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** add z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (add_w0_u32_z_untied, svuint32_t, uint32_t, ++ z0 = svadd_n_u32_z (p0, z1, x0), ++ z0 = svadd_z (p0, z1, x0)) ++ ++/* ++** add_1_u32_z_tied1: ++** mov (z[0-9]+\.s), #1 ++** movprfx z0\.s, p0/z, z0\.s ++** add z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_u32_z_tied1, svuint32_t, ++ z0 = svadd_n_u32_z (p0, z0, 1), ++ z0 = svadd_z (p0, z0, 1)) ++ ++/* ++** add_1_u32_z_untied: ++** mov (z[0-9]+\.s), #1 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** add z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** add z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_u32_z_untied, svuint32_t, ++ z0 = svadd_n_u32_z (p0, z1, 1), ++ z0 = svadd_z (p0, z1, 1)) ++ ++/* ++** add_u32_x_tied1: ++** add z0\.s, (z0\.s, z1\.s|z1\.s, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (add_u32_x_tied1, svuint32_t, ++ z0 = svadd_u32_x (p0, z0, z1), ++ z0 = svadd_x (p0, z0, z1)) ++ ++/* ++** add_u32_x_tied2: ++** add z0\.s, (z0\.s, z1\.s|z1\.s, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (add_u32_x_tied2, svuint32_t, ++ z0 = svadd_u32_x (p0, z1, z0), ++ z0 = svadd_x (p0, z1, z0)) ++ ++/* ++** add_u32_x_untied: ++** add z0\.s, (z1\.s, z2\.s|z2\.s, z1\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (add_u32_x_untied, svuint32_t, ++ z0 = svadd_u32_x (p0, z1, z2), ++ z0 = svadd_x (p0, z1, z2)) ++ ++/* ++** add_w0_u32_x_tied1: ++** mov (z[0-9]+\.s), w0 ++** add z0\.s, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_ZX (add_w0_u32_x_tied1, svuint32_t, uint32_t, ++ z0 = svadd_n_u32_x (p0, z0, x0), ++ z0 = svadd_x (p0, z0, x0)) ++ ++/* ++** add_w0_u32_x_untied: ++** mov (z[0-9]+\.s), w0 ++** add z0\.s, (z1\.s, \1|\1, z1\.s) ++** ret ++*/ ++TEST_UNIFORM_ZX (add_w0_u32_x_untied, svuint32_t, uint32_t, ++ z0 = svadd_n_u32_x (p0, z1, x0), ++ z0 = svadd_x (p0, z1, x0)) ++ ++/* ++** add_1_u32_x_tied1: ++** add z0\.s, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_u32_x_tied1, svuint32_t, ++ z0 = svadd_n_u32_x (p0, z0, 1), ++ z0 = svadd_x (p0, z0, 1)) ++ ++/* ++** add_1_u32_x_untied: ++** movprfx z0, z1 ++** add z0\.s, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_u32_x_untied, svuint32_t, ++ z0 = svadd_n_u32_x (p0, z1, 1), ++ z0 = svadd_x (p0, z1, 1)) ++ ++/* ++** add_127_u32_x: ++** add z0\.s, z0\.s, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (add_127_u32_x, svuint32_t, ++ z0 = svadd_n_u32_x (p0, z0, 127), ++ z0 = svadd_x (p0, z0, 127)) ++ ++/* ++** add_128_u32_x: ++** add z0\.s, z0\.s, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (add_128_u32_x, svuint32_t, ++ z0 = svadd_n_u32_x (p0, z0, 128), ++ z0 = svadd_x (p0, z0, 128)) ++ ++/* ++** add_255_u32_x: ++** add z0\.s, z0\.s, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (add_255_u32_x, svuint32_t, ++ z0 = svadd_n_u32_x (p0, z0, 255), ++ z0 = svadd_x (p0, z0, 255)) ++ ++/* ++** add_256_u32_x: ++** add z0\.s, z0\.s, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (add_256_u32_x, svuint32_t, ++ z0 = svadd_n_u32_x (p0, z0, 256), ++ z0 = svadd_x (p0, z0, 256)) ++ ++/* ++** add_511_u32_x: ++** mov (z[0-9]+\.s), #511 ++** add z0\.s, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (add_511_u32_x, svuint32_t, ++ z0 = svadd_n_u32_x (p0, z0, 511), ++ z0 = svadd_x (p0, z0, 511)) ++ ++/* ++** add_512_u32_x: ++** add z0\.s, z0\.s, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (add_512_u32_x, svuint32_t, ++ z0 = svadd_n_u32_x (p0, z0, 512), ++ z0 = svadd_x (p0, z0, 512)) ++ ++/* ++** add_65280_u32_x: ++** add z0\.s, z0\.s, #65280 ++** ret ++*/ ++TEST_UNIFORM_Z 
(add_65280_u32_x, svuint32_t, ++ z0 = svadd_n_u32_x (p0, z0, 0xff00), ++ z0 = svadd_x (p0, z0, 0xff00)) ++ ++/* ++** add_65535_u32_x: ++** mov (z[0-9]+\.s), #65535 ++** add z0\.s, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (add_65535_u32_x, svuint32_t, ++ z0 = svadd_n_u32_x (p0, z0, 65535), ++ z0 = svadd_x (p0, z0, 65535)) ++ ++/* ++** add_65536_u32_x: ++** mov (z[0-9]+\.s), #65536 ++** add z0\.s, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (add_65536_u32_x, svuint32_t, ++ z0 = svadd_n_u32_x (p0, z0, 65536), ++ z0 = svadd_x (p0, z0, 65536)) ++ ++/* ++** add_m1_u32_x: ++** sub z0\.s, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m1_u32_x, svuint32_t, ++ z0 = svadd_n_u32_x (p0, z0, -1), ++ z0 = svadd_x (p0, z0, -1)) ++ ++/* ++** add_m127_u32_x: ++** sub z0\.s, z0\.s, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m127_u32_x, svuint32_t, ++ z0 = svadd_n_u32_x (p0, z0, -127), ++ z0 = svadd_x (p0, z0, -127)) ++ ++/* ++** add_m128_u32_x: ++** sub z0\.s, z0\.s, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m128_u32_x, svuint32_t, ++ z0 = svadd_n_u32_x (p0, z0, -128), ++ z0 = svadd_x (p0, z0, -128)) ++ ++/* ++** add_m255_u32_x: ++** sub z0\.s, z0\.s, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m255_u32_x, svuint32_t, ++ z0 = svadd_n_u32_x (p0, z0, -255), ++ z0 = svadd_x (p0, z0, -255)) ++ ++/* ++** add_m256_u32_x: ++** sub z0\.s, z0\.s, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m256_u32_x, svuint32_t, ++ z0 = svadd_n_u32_x (p0, z0, -256), ++ z0 = svadd_x (p0, z0, -256)) ++ ++/* ++** add_m511_u32_x: ++** mov (z[0-9]+\.s), #-511 ++** add z0\.s, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (add_m511_u32_x, svuint32_t, ++ z0 = svadd_n_u32_x (p0, z0, -511), ++ z0 = svadd_x (p0, z0, -511)) ++ ++/* ++** add_m512_u32_x: ++** sub z0\.s, z0\.s, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m512_u32_x, svuint32_t, ++ z0 = svadd_n_u32_x (p0, z0, -512), ++ z0 = svadd_x (p0, z0, -512)) ++ ++/* ++** add_m32768_u32_x: ++** sub z0\.s, z0\.s, #32768 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m32768_u32_x, svuint32_t, ++ z0 = svadd_n_u32_x (p0, z0, -0x8000), ++ z0 = svadd_x (p0, z0, -0x8000)) ++ ++/* ++** add_m65280_u32_x: ++** sub z0\.s, z0\.s, #65280 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m65280_u32_x, svuint32_t, ++ z0 = svadd_n_u32_x (p0, z0, -0xff00), ++ z0 = svadd_x (p0, z0, -0xff00)) ++ ++/* ++** add_m65535_u32_x: ++** mov (z[0-9]+\.s), #-65535 ++** add z0\.s, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (add_m65535_u32_x, svuint32_t, ++ z0 = svadd_n_u32_x (p0, z0, -65535), ++ z0 = svadd_x (p0, z0, -65535)) ++ ++/* ++** add_m65536_u32_x: ++** mov (z[0-9]+\.s), #-65536 ++** add z0\.s, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (add_m65536_u32_x, svuint32_t, ++ z0 = svadd_n_u32_x (p0, z0, -65536), ++ z0 = svadd_x (p0, z0, -65536)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_u64.c +new file mode 100644 +index 000000000..25d2972a6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_u64.c +@@ -0,0 +1,426 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** add_u64_m_tied1: ++** add z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (add_u64_m_tied1, svuint64_t, ++ z0 = svadd_u64_m (p0, z0, z1), ++ z0 = svadd_m (p0, z0, z1)) ++ ++/* ++** add_u64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** add z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_u64_m_tied2, svuint64_t, ++ z0 = svadd_u64_m (p0, z1, z0), ++ z0 = 
svadd_m (p0, z1, z0)) ++ ++/* ++** add_u64_m_untied: ++** movprfx z0, z1 ++** add z0\.d, p0/m, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (add_u64_m_untied, svuint64_t, ++ z0 = svadd_u64_m (p0, z1, z2), ++ z0 = svadd_m (p0, z1, z2)) ++ ++/* ++** add_x0_u64_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** add z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (add_x0_u64_m_tied1, svuint64_t, uint64_t, ++ z0 = svadd_n_u64_m (p0, z0, x0), ++ z0 = svadd_m (p0, z0, x0)) ++ ++/* ++** add_x0_u64_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** add z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (add_x0_u64_m_untied, svuint64_t, uint64_t, ++ z0 = svadd_n_u64_m (p0, z1, x0), ++ z0 = svadd_m (p0, z1, x0)) ++ ++/* ++** add_1_u64_m_tied1: ++** mov (z[0-9]+\.d), #1 ++** add z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_u64_m_tied1, svuint64_t, ++ z0 = svadd_n_u64_m (p0, z0, 1), ++ z0 = svadd_m (p0, z0, 1)) ++ ++/* ++** add_1_u64_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #1 ++** movprfx z0, z1 ++** add z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_u64_m_untied, svuint64_t, ++ z0 = svadd_n_u64_m (p0, z1, 1), ++ z0 = svadd_m (p0, z1, 1)) ++ ++/* ++** add_m2_u64_m: ++** mov (z[0-9]+\.d), #-2 ++** add z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m2_u64_m, svuint64_t, ++ z0 = svadd_n_u64_m (p0, z0, -2), ++ z0 = svadd_m (p0, z0, -2)) ++ ++/* ++** add_u64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** add z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (add_u64_z_tied1, svuint64_t, ++ z0 = svadd_u64_z (p0, z0, z1), ++ z0 = svadd_z (p0, z0, z1)) ++ ++/* ++** add_u64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** add z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (add_u64_z_tied2, svuint64_t, ++ z0 = svadd_u64_z (p0, z1, z0), ++ z0 = svadd_z (p0, z1, z0)) ++ ++/* ++** add_u64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** add z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** add z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (add_u64_z_untied, svuint64_t, ++ z0 = svadd_u64_z (p0, z1, z2), ++ z0 = svadd_z (p0, z1, z2)) ++ ++/* ++** add_x0_u64_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z0\.d ++** add z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (add_x0_u64_z_tied1, svuint64_t, uint64_t, ++ z0 = svadd_n_u64_z (p0, z0, x0), ++ z0 = svadd_z (p0, z0, x0)) ++ ++/* ++** add_x0_u64_z_untied: ++** mov (z[0-9]+\.d), x0 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** add z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** add z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (add_x0_u64_z_untied, svuint64_t, uint64_t, ++ z0 = svadd_n_u64_z (p0, z1, x0), ++ z0 = svadd_z (p0, z1, x0)) ++ ++/* ++** add_1_u64_z_tied1: ++** mov (z[0-9]+\.d), #1 ++** movprfx z0\.d, p0/z, z0\.d ++** add z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_u64_z_tied1, svuint64_t, ++ z0 = svadd_n_u64_z (p0, z0, 1), ++ z0 = svadd_z (p0, z0, 1)) ++ ++/* ++** add_1_u64_z_untied: ++** mov (z[0-9]+\.d), #1 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** add z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** add z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_u64_z_untied, svuint64_t, ++ z0 = svadd_n_u64_z (p0, z1, 1), ++ z0 = svadd_z (p0, z1, 1)) ++ ++/* ++** add_u64_x_tied1: ++** add z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (add_u64_x_tied1, svuint64_t, ++ z0 = svadd_u64_x (p0, z0, z1), ++ z0 = svadd_x (p0, z0, 
z1)) ++ ++/* ++** add_u64_x_tied2: ++** add z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (add_u64_x_tied2, svuint64_t, ++ z0 = svadd_u64_x (p0, z1, z0), ++ z0 = svadd_x (p0, z1, z0)) ++ ++/* ++** add_u64_x_untied: ++** add z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (add_u64_x_untied, svuint64_t, ++ z0 = svadd_u64_x (p0, z1, z2), ++ z0 = svadd_x (p0, z1, z2)) ++ ++/* ++** add_x0_u64_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** add z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (add_x0_u64_x_tied1, svuint64_t, uint64_t, ++ z0 = svadd_n_u64_x (p0, z0, x0), ++ z0 = svadd_x (p0, z0, x0)) ++ ++/* ++** add_x0_u64_x_untied: ++** mov (z[0-9]+\.d), x0 ++** add z0\.d, (z1\.d, \1|\1, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (add_x0_u64_x_untied, svuint64_t, uint64_t, ++ z0 = svadd_n_u64_x (p0, z1, x0), ++ z0 = svadd_x (p0, z1, x0)) ++ ++/* ++** add_1_u64_x_tied1: ++** add z0\.d, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_u64_x_tied1, svuint64_t, ++ z0 = svadd_n_u64_x (p0, z0, 1), ++ z0 = svadd_x (p0, z0, 1)) ++ ++/* ++** add_1_u64_x_untied: ++** movprfx z0, z1 ++** add z0\.d, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_u64_x_untied, svuint64_t, ++ z0 = svadd_n_u64_x (p0, z1, 1), ++ z0 = svadd_x (p0, z1, 1)) ++ ++/* ++** add_127_u64_x: ++** add z0\.d, z0\.d, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (add_127_u64_x, svuint64_t, ++ z0 = svadd_n_u64_x (p0, z0, 127), ++ z0 = svadd_x (p0, z0, 127)) ++ ++/* ++** add_128_u64_x: ++** add z0\.d, z0\.d, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (add_128_u64_x, svuint64_t, ++ z0 = svadd_n_u64_x (p0, z0, 128), ++ z0 = svadd_x (p0, z0, 128)) ++ ++/* ++** add_255_u64_x: ++** add z0\.d, z0\.d, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (add_255_u64_x, svuint64_t, ++ z0 = svadd_n_u64_x (p0, z0, 255), ++ z0 = svadd_x (p0, z0, 255)) ++ ++/* ++** add_256_u64_x: ++** add z0\.d, z0\.d, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (add_256_u64_x, svuint64_t, ++ z0 = svadd_n_u64_x (p0, z0, 256), ++ z0 = svadd_x (p0, z0, 256)) ++ ++/* ++** add_511_u64_x: ++** mov (z[0-9]+\.d), #511 ++** add z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (add_511_u64_x, svuint64_t, ++ z0 = svadd_n_u64_x (p0, z0, 511), ++ z0 = svadd_x (p0, z0, 511)) ++ ++/* ++** add_512_u64_x: ++** add z0\.d, z0\.d, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (add_512_u64_x, svuint64_t, ++ z0 = svadd_n_u64_x (p0, z0, 512), ++ z0 = svadd_x (p0, z0, 512)) ++ ++/* ++** add_65280_u64_x: ++** add z0\.d, z0\.d, #65280 ++** ret ++*/ ++TEST_UNIFORM_Z (add_65280_u64_x, svuint64_t, ++ z0 = svadd_n_u64_x (p0, z0, 0xff00), ++ z0 = svadd_x (p0, z0, 0xff00)) ++ ++/* ++** add_65535_u64_x: ++** mov (z[0-9]+\.d), #65535 ++** add z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (add_65535_u64_x, svuint64_t, ++ z0 = svadd_n_u64_x (p0, z0, 65535), ++ z0 = svadd_x (p0, z0, 65535)) ++ ++/* ++** add_65536_u64_x: ++** mov (z[0-9]+\.d), #65536 ++** add z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (add_65536_u64_x, svuint64_t, ++ z0 = svadd_n_u64_x (p0, z0, 65536), ++ z0 = svadd_x (p0, z0, 65536)) ++ ++/* ++** add_m1_u64_x: ++** sub z0\.d, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m1_u64_x, svuint64_t, ++ z0 = svadd_n_u64_x (p0, z0, -1), ++ z0 = svadd_x (p0, z0, -1)) ++ ++/* ++** add_m127_u64_x: ++** sub z0\.d, z0\.d, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m127_u64_x, svuint64_t, ++ z0 = svadd_n_u64_x (p0, z0, -127), ++ z0 = svadd_x (p0, z0, -127)) ++ ++/* ++** add_m128_u64_x: ++** sub z0\.d, z0\.d, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m128_u64_x, svuint64_t, ++ z0 
= svadd_n_u64_x (p0, z0, -128), ++ z0 = svadd_x (p0, z0, -128)) ++ ++/* ++** add_m255_u64_x: ++** sub z0\.d, z0\.d, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m255_u64_x, svuint64_t, ++ z0 = svadd_n_u64_x (p0, z0, -255), ++ z0 = svadd_x (p0, z0, -255)) ++ ++/* ++** add_m256_u64_x: ++** sub z0\.d, z0\.d, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m256_u64_x, svuint64_t, ++ z0 = svadd_n_u64_x (p0, z0, -256), ++ z0 = svadd_x (p0, z0, -256)) ++ ++/* ++** add_m511_u64_x: ++** mov (z[0-9]+\.d), #-511 ++** add z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (add_m511_u64_x, svuint64_t, ++ z0 = svadd_n_u64_x (p0, z0, -511), ++ z0 = svadd_x (p0, z0, -511)) ++ ++/* ++** add_m512_u64_x: ++** sub z0\.d, z0\.d, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m512_u64_x, svuint64_t, ++ z0 = svadd_n_u64_x (p0, z0, -512), ++ z0 = svadd_x (p0, z0, -512)) ++ ++/* ++** add_m32768_u64_x: ++** sub z0\.d, z0\.d, #32768 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m32768_u64_x, svuint64_t, ++ z0 = svadd_n_u64_x (p0, z0, -0x8000), ++ z0 = svadd_x (p0, z0, -0x8000)) ++ ++/* ++** add_m65280_u64_x: ++** sub z0\.d, z0\.d, #65280 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m65280_u64_x, svuint64_t, ++ z0 = svadd_n_u64_x (p0, z0, -0xff00), ++ z0 = svadd_x (p0, z0, -0xff00)) ++ ++/* ++** add_m65535_u64_x: ++** mov (z[0-9]+\.d), #-65535 ++** add z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (add_m65535_u64_x, svuint64_t, ++ z0 = svadd_n_u64_x (p0, z0, -65535), ++ z0 = svadd_x (p0, z0, -65535)) ++ ++/* ++** add_m65536_u64_x: ++** mov (z[0-9]+\.d), #-65536 ++** add z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (add_m65536_u64_x, svuint64_t, ++ z0 = svadd_n_u64_x (p0, z0, -65536), ++ z0 = svadd_x (p0, z0, -65536)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_u8.c +new file mode 100644 +index 000000000..06b68c97c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_u8.c +@@ -0,0 +1,294 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** add_u8_m_tied1: ++** add z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (add_u8_m_tied1, svuint8_t, ++ z0 = svadd_u8_m (p0, z0, z1), ++ z0 = svadd_m (p0, z0, z1)) ++ ++/* ++** add_u8_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** add z0\.b, p0/m, z0\.b, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (add_u8_m_tied2, svuint8_t, ++ z0 = svadd_u8_m (p0, z1, z0), ++ z0 = svadd_m (p0, z1, z0)) ++ ++/* ++** add_u8_m_untied: ++** movprfx z0, z1 ++** add z0\.b, p0/m, z0\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (add_u8_m_untied, svuint8_t, ++ z0 = svadd_u8_m (p0, z1, z2), ++ z0 = svadd_m (p0, z1, z2)) ++ ++/* ++** add_w0_u8_m_tied1: ++** mov (z[0-9]+\.b), w0 ++** add z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (add_w0_u8_m_tied1, svuint8_t, uint8_t, ++ z0 = svadd_n_u8_m (p0, z0, x0), ++ z0 = svadd_m (p0, z0, x0)) ++ ++/* ++** add_w0_u8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), w0 ++** movprfx z0, z1 ++** add z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (add_w0_u8_m_untied, svuint8_t, uint8_t, ++ z0 = svadd_n_u8_m (p0, z1, x0), ++ z0 = svadd_m (p0, z1, x0)) ++ ++/* ++** add_1_u8_m_tied1: ++** mov (z[0-9]+\.b), #1 ++** add z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_u8_m_tied1, svuint8_t, ++ z0 = svadd_n_u8_m (p0, z0, 1), ++ z0 = svadd_m (p0, z0, 1)) ++ ++/* ++** add_1_u8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), #1 ++** movprfx z0, z1 ++** add z0\.b, p0/m, z0\.b, \1 
++** ret ++*/ ++TEST_UNIFORM_Z (add_1_u8_m_untied, svuint8_t, ++ z0 = svadd_n_u8_m (p0, z1, 1), ++ z0 = svadd_m (p0, z1, 1)) ++ ++/* ++** add_m1_u8_m: ++** mov (z[0-9]+\.b), #-1 ++** add z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m1_u8_m, svuint8_t, ++ z0 = svadd_n_u8_m (p0, z0, -1), ++ z0 = svadd_m (p0, z0, -1)) ++ ++/* ++** add_u8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** add z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (add_u8_z_tied1, svuint8_t, ++ z0 = svadd_u8_z (p0, z0, z1), ++ z0 = svadd_z (p0, z0, z1)) ++ ++/* ++** add_u8_z_tied2: ++** movprfx z0\.b, p0/z, z0\.b ++** add z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (add_u8_z_tied2, svuint8_t, ++ z0 = svadd_u8_z (p0, z1, z0), ++ z0 = svadd_z (p0, z1, z0)) ++ ++/* ++** add_u8_z_untied: ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** add z0\.b, p0/m, z0\.b, z2\.b ++** | ++** movprfx z0\.b, p0/z, z2\.b ++** add z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (add_u8_z_untied, svuint8_t, ++ z0 = svadd_u8_z (p0, z1, z2), ++ z0 = svadd_z (p0, z1, z2)) ++ ++/* ++** add_w0_u8_z_tied1: ++** mov (z[0-9]+\.b), w0 ++** movprfx z0\.b, p0/z, z0\.b ++** add z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (add_w0_u8_z_tied1, svuint8_t, uint8_t, ++ z0 = svadd_n_u8_z (p0, z0, x0), ++ z0 = svadd_z (p0, z0, x0)) ++ ++/* ++** add_w0_u8_z_untied: ++** mov (z[0-9]+\.b), w0 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** add z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** add z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (add_w0_u8_z_untied, svuint8_t, uint8_t, ++ z0 = svadd_n_u8_z (p0, z1, x0), ++ z0 = svadd_z (p0, z1, x0)) ++ ++/* ++** add_1_u8_z_tied1: ++** mov (z[0-9]+\.b), #1 ++** movprfx z0\.b, p0/z, z0\.b ++** add z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_u8_z_tied1, svuint8_t, ++ z0 = svadd_n_u8_z (p0, z0, 1), ++ z0 = svadd_z (p0, z0, 1)) ++ ++/* ++** add_1_u8_z_untied: ++** mov (z[0-9]+\.b), #1 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** add z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** add z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_u8_z_untied, svuint8_t, ++ z0 = svadd_n_u8_z (p0, z1, 1), ++ z0 = svadd_z (p0, z1, 1)) ++ ++/* ++** add_u8_x_tied1: ++** add z0\.b, (z0\.b, z1\.b|z1\.b, z0\.b) ++** ret ++*/ ++TEST_UNIFORM_Z (add_u8_x_tied1, svuint8_t, ++ z0 = svadd_u8_x (p0, z0, z1), ++ z0 = svadd_x (p0, z0, z1)) ++ ++/* ++** add_u8_x_tied2: ++** add z0\.b, (z0\.b, z1\.b|z1\.b, z0\.b) ++** ret ++*/ ++TEST_UNIFORM_Z (add_u8_x_tied2, svuint8_t, ++ z0 = svadd_u8_x (p0, z1, z0), ++ z0 = svadd_x (p0, z1, z0)) ++ ++/* ++** add_u8_x_untied: ++** add z0\.b, (z1\.b, z2\.b|z2\.b, z1\.b) ++** ret ++*/ ++TEST_UNIFORM_Z (add_u8_x_untied, svuint8_t, ++ z0 = svadd_u8_x (p0, z1, z2), ++ z0 = svadd_x (p0, z1, z2)) ++ ++/* ++** add_w0_u8_x_tied1: ++** mov (z[0-9]+\.b), w0 ++** add z0\.b, (z0\.b, \1|\1, z0\.b) ++** ret ++*/ ++TEST_UNIFORM_ZX (add_w0_u8_x_tied1, svuint8_t, uint8_t, ++ z0 = svadd_n_u8_x (p0, z0, x0), ++ z0 = svadd_x (p0, z0, x0)) ++ ++/* ++** add_w0_u8_x_untied: ++** mov (z[0-9]+\.b), w0 ++** add z0\.b, (z1\.b, \1|\1, z1\.b) ++** ret ++*/ ++TEST_UNIFORM_ZX (add_w0_u8_x_untied, svuint8_t, uint8_t, ++ z0 = svadd_n_u8_x (p0, z1, x0), ++ z0 = svadd_x (p0, z1, x0)) ++ ++/* ++** add_1_u8_x_tied1: ++** add z0\.b, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_u8_x_tied1, svuint8_t, ++ z0 = svadd_n_u8_x (p0, z0, 1), ++ z0 = svadd_x (p0, z0, 1)) ++ ++/* ++** add_1_u8_x_untied: ++** movprfx z0, z1 ++** add 
z0\.b, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (add_1_u8_x_untied, svuint8_t, ++ z0 = svadd_n_u8_x (p0, z1, 1), ++ z0 = svadd_x (p0, z1, 1)) ++ ++/* ++** add_127_u8_x: ++** add z0\.b, z0\.b, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (add_127_u8_x, svuint8_t, ++ z0 = svadd_n_u8_x (p0, z0, 127), ++ z0 = svadd_x (p0, z0, 127)) ++ ++/* ++** add_128_u8_x: ++** add z0\.b, z0\.b, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (add_128_u8_x, svuint8_t, ++ z0 = svadd_n_u8_x (p0, z0, 128), ++ z0 = svadd_x (p0, z0, 128)) ++ ++/* ++** add_255_u8_x: ++** add z0\.b, z0\.b, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (add_255_u8_x, svuint8_t, ++ z0 = svadd_n_u8_x (p0, z0, 255), ++ z0 = svadd_x (p0, z0, 255)) ++ ++/* ++** add_m1_u8_x: ++** add z0\.b, z0\.b, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m1_u8_x, svuint8_t, ++ z0 = svadd_n_u8_x (p0, z0, -1), ++ z0 = svadd_x (p0, z0, -1)) ++ ++/* ++** add_m127_u8_x: ++** add z0\.b, z0\.b, #129 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m127_u8_x, svuint8_t, ++ z0 = svadd_n_u8_x (p0, z0, -127), ++ z0 = svadd_x (p0, z0, -127)) ++ ++/* ++** add_m128_u8_x: ++** add z0\.b, z0\.b, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (add_m128_u8_x, svuint8_t, ++ z0 = svadd_n_u8_x (p0, z0, -128), ++ z0 = svadd_x (p0, z0, -128)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/adda_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/adda_f16.c +new file mode 100644 +index 000000000..6c6bfa1c2 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/adda_f16.c +@@ -0,0 +1,22 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** adda_d0_f16: ++** fadda h0, p0, h0, z2\.h ++** ret ++*/ ++TEST_FOLD_LEFT_D (adda_d0_f16, float16_t, svfloat16_t, ++ d0 = svadda_f16 (p0, d0, z2), ++ d0 = svadda (p0, d0, z2)) ++ ++/* ++** adda_d1_f16: ++** mov v0\.h\[0\], v1\.h\[0\] ++** fadda h0, p0, h0, z2\.h ++** ret ++*/ ++TEST_FOLD_LEFT_D (adda_d1_f16, float16_t, svfloat16_t, ++ d0 = svadda_f16 (p0, d1, z2), ++ d0 = svadda (p0, d1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/adda_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/adda_f32.c +new file mode 100644 +index 000000000..8b2a1dd1c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/adda_f32.c +@@ -0,0 +1,22 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** adda_d0_f32: ++** fadda s0, p0, s0, z2\.s ++** ret ++*/ ++TEST_FOLD_LEFT_D (adda_d0_f32, float32_t, svfloat32_t, ++ d0 = svadda_f32 (p0, d0, z2), ++ d0 = svadda (p0, d0, z2)) ++ ++/* ++** adda_d1_f32: ++** fmov s0, s1 ++** fadda s0, p0, s0, z2\.s ++** ret ++*/ ++TEST_FOLD_LEFT_D (adda_d1_f32, float32_t, svfloat32_t, ++ d0 = svadda_f32 (p0, d1, z2), ++ d0 = svadda (p0, d1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/adda_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/adda_f64.c +new file mode 100644 +index 000000000..90a56420a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/adda_f64.c +@@ -0,0 +1,22 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** adda_d0_f64: ++** fadda d0, p0, d0, z2\.d ++** ret ++*/ ++TEST_FOLD_LEFT_D (adda_d0_f64, float64_t, svfloat64_t, ++ d0 = svadda_f64 (p0, d0, z2), ++ d0 = svadda (p0, d0, z2)) ++ ++/* ++** adda_d1_f64: ++** fmov d0, d1 ++** fadda d0, p0, d0, z2\.d ++** ret ++*/ ++TEST_FOLD_LEFT_D (adda_d1_f64, float64_t, svfloat64_t, ++ d0 = svadda_f64 (p0, d1, z2), ++ d0 = svadda (p0, d1, z2)) +diff 
--git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_f16.c +new file mode 100644 +index 000000000..7bb0c1de4 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_f16.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** addv_d0_f16_tied: ++** faddv h0, p0, z0\.h ++** ret ++*/ ++TEST_REDUCTION_D (addv_d0_f16_tied, float16_t, svfloat16_t, ++ d0 = svaddv_f16 (p0, z0), ++ d0 = svaddv (p0, z0)) ++ ++/* ++** addv_d0_f16_untied: ++** faddv h0, p0, z1\.h ++** ret ++*/ ++TEST_REDUCTION_D (addv_d0_f16_untied, float16_t, svfloat16_t, ++ d0 = svaddv_f16 (p0, z1), ++ d0 = svaddv (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_f32.c +new file mode 100644 +index 000000000..51c621910 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_f32.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** addv_d0_f32_tied: ++** faddv s0, p0, z0\.s ++** ret ++*/ ++TEST_REDUCTION_D (addv_d0_f32_tied, float32_t, svfloat32_t, ++ d0 = svaddv_f32 (p0, z0), ++ d0 = svaddv (p0, z0)) ++ ++/* ++** addv_d0_f32_untied: ++** faddv s0, p0, z1\.s ++** ret ++*/ ++TEST_REDUCTION_D (addv_d0_f32_untied, float32_t, svfloat32_t, ++ d0 = svaddv_f32 (p0, z1), ++ d0 = svaddv (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_f64.c +new file mode 100644 +index 000000000..882866210 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_f64.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** addv_d0_f64_tied: ++** faddv d0, p0, z0\.d ++** ret ++*/ ++TEST_REDUCTION_D (addv_d0_f64_tied, float64_t, svfloat64_t, ++ d0 = svaddv_f64 (p0, z0), ++ d0 = svaddv (p0, z0)) ++ ++/* ++** addv_d0_f64_untied: ++** faddv d0, p0, z1\.d ++** ret ++*/ ++TEST_REDUCTION_D (addv_d0_f64_untied, float64_t, svfloat64_t, ++ d0 = svaddv_f64 (p0, z1), ++ d0 = svaddv (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_s16.c +new file mode 100644 +index 000000000..05429a47e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_s16.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** addv_x0_s16: ++** saddv (d[0-9]+), p0, z0\.h ++** fmov x0, \1 ++** ret ++*/ ++TEST_REDUCTION_X (addv_x0_s16, int64_t, svint16_t, ++ x0 = svaddv_s16 (p0, z0), ++ x0 = svaddv (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_s32.c +new file mode 100644 +index 000000000..5f7789a9a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_s32.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** addv_x0_s32: ++** saddv (d[0-9]+), p0, z0\.s ++** fmov x0, \1 ++** ret ++*/ ++TEST_REDUCTION_X (addv_x0_s32, int64_t, svint32_t, ++ x0 = svaddv_s32 (p0, z0), ++ x0 = svaddv (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_s64.c +new file mode 
100644 +index 000000000..76c480091 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_s64.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** addv_x0_s64: ++** uaddv (d[0-9]+), p0, z0\.d ++** fmov x0, \1 ++** ret ++*/ ++TEST_REDUCTION_X (addv_x0_s64, int64_t, svint64_t, ++ x0 = svaddv_s64 (p0, z0), ++ x0 = svaddv (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_s8.c +new file mode 100644 +index 000000000..8ccb2bf4f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_s8.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** addv_x0_s8: ++** saddv (d[0-9]+), p0, z0\.b ++** fmov x0, \1 ++** ret ++*/ ++TEST_REDUCTION_X (addv_x0_s8, int64_t, svint8_t, ++ x0 = svaddv_s8 (p0, z0), ++ x0 = svaddv (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_u16.c +new file mode 100644 +index 000000000..6371921fe +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_u16.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** addv_x0_u16: ++** uaddv (d[0-9]+), p0, z0\.h ++** fmov x0, \1 ++** ret ++*/ ++TEST_REDUCTION_X (addv_x0_u16, uint64_t, svuint16_t, ++ x0 = svaddv_u16 (p0, z0), ++ x0 = svaddv (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_u32.c +new file mode 100644 +index 000000000..bdd0ed1f9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_u32.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** addv_x0_u32: ++** uaddv (d[0-9]+), p0, z0\.s ++** fmov x0, \1 ++** ret ++*/ ++TEST_REDUCTION_X (addv_x0_u32, uint64_t, svuint32_t, ++ x0 = svaddv_u32 (p0, z0), ++ x0 = svaddv (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_u64.c +new file mode 100644 +index 000000000..7b1995d3f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_u64.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** addv_x0_u64: ++** uaddv (d[0-9]+), p0, z0\.d ++** fmov x0, \1 ++** ret ++*/ ++TEST_REDUCTION_X (addv_x0_u64, uint64_t, svuint64_t, ++ x0 = svaddv_u64 (p0, z0), ++ x0 = svaddv (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_u8.c +new file mode 100644 +index 000000000..0e972f093 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_u8.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** addv_x0_u8: ++** uaddv (d[0-9]+), p0, z0\.b ++** fmov x0, \1 ++** ret ++*/ ++TEST_REDUCTION_X (addv_x0_u8, uint64_t, svuint8_t, ++ x0 = svaddv_u8 (p0, z0), ++ x0 = svaddv (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/adrb.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/adrb.c +new file mode 100644 +index 000000000..a61eec971 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/adrb.c +@@ -0,0 +1,57 
@@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** adrb_u32base_s32offset: ++** adr z0\.s, \[z0\.s, z1\.s\] ++** ret ++*/ ++TEST_ADR (adrb_u32base_s32offset, svuint32_t, svint32_t, ++ z0 = svadrb_u32base_s32offset (z0, z1), ++ z0 = svadrb_offset (z0, z1)) ++ ++/* ++** adrb_u32base_u32offset: ++** adr z0\.s, \[z0\.s, z1\.s\] ++** ret ++*/ ++TEST_ADR (adrb_u32base_u32offset, svuint32_t, svuint32_t, ++ z0 = svadrb_u32base_u32offset (z0, z1), ++ z0 = svadrb_offset (z0, z1)) ++ ++/* ++** adrb_u64base_s64offset: ++** adr z0\.d, \[z0\.d, z1\.d\] ++** ret ++*/ ++TEST_ADR (adrb_u64base_s64offset, svuint64_t, svint64_t, ++ z0 = svadrb_u64base_s64offset (z0, z1), ++ z0 = svadrb_offset (z0, z1)) ++ ++/* ++** adrb_ext_u64base_s64offset: ++** adr z0\.d, \[z0\.d, z1\.d, sxtw\] ++** ret ++*/ ++TEST_ADR (adrb_ext_u64base_s64offset, svuint64_t, svint64_t, ++ z0 = svadrb_u64base_s64offset (z0, svextw_s64_x (svptrue_b64 (), z1)), ++ z0 = svadrb_offset (z0, svextw_x (svptrue_b64 (), z1))) ++ ++/* ++** adrb_u64base_u64offset: ++** adr z0\.d, \[z0\.d, z1\.d\] ++** ret ++*/ ++TEST_ADR (adrb_u64base_u64offset, svuint64_t, svuint64_t, ++ z0 = svadrb_u64base_u64offset (z0, z1), ++ z0 = svadrb_offset (z0, z1)) ++ ++/* ++** adrb_ext_u64base_u64offset: ++** adr z0\.d, \[z0\.d, z1\.d, uxtw\] ++** ret ++*/ ++TEST_ADR (adrb_ext_u64base_u64offset, svuint64_t, svuint64_t, ++ z0 = svadrb_u64base_u64offset (z0, svextw_u64_x (svptrue_b64 (), z1)), ++ z0 = svadrb_offset (z0, svextw_x (svptrue_b64 (), z1))) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/adrd.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/adrd.c +new file mode 100644 +index 000000000..970485bd6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/adrd.c +@@ -0,0 +1,57 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** adrd_u32base_s32index: ++** adr z0\.s, \[z0\.s, z1\.s, lsl 3\] ++** ret ++*/ ++TEST_ADR (adrd_u32base_s32index, svuint32_t, svint32_t, ++ z0 = svadrd_u32base_s32index (z0, z1), ++ z0 = svadrd_index (z0, z1)) ++ ++/* ++** adrd_u32base_u32index: ++** adr z0\.s, \[z0\.s, z1\.s, lsl 3\] ++** ret ++*/ ++TEST_ADR (adrd_u32base_u32index, svuint32_t, svuint32_t, ++ z0 = svadrd_u32base_u32index (z0, z1), ++ z0 = svadrd_index (z0, z1)) ++ ++/* ++** adrd_u64base_s64index: ++** adr z0\.d, \[z0\.d, z1\.d, lsl 3\] ++** ret ++*/ ++TEST_ADR (adrd_u64base_s64index, svuint64_t, svint64_t, ++ z0 = svadrd_u64base_s64index (z0, z1), ++ z0 = svadrd_index (z0, z1)) ++ ++/* ++** adrd_ext_u64base_s64index: ++** adr z0\.d, \[z0\.d, z1\.d, sxtw 3\] ++** ret ++*/ ++TEST_ADR (adrd_ext_u64base_s64index, svuint64_t, svint64_t, ++ z0 = svadrd_u64base_s64index (z0, svextw_s64_x (svptrue_b64 (), z1)), ++ z0 = svadrd_index (z0, svextw_x (svptrue_b64 (), z1))) ++ ++/* ++** adrd_u64base_u64index: ++** adr z0\.d, \[z0\.d, z1\.d, lsl 3\] ++** ret ++*/ ++TEST_ADR (adrd_u64base_u64index, svuint64_t, svuint64_t, ++ z0 = svadrd_u64base_u64index (z0, z1), ++ z0 = svadrd_index (z0, z1)) ++ ++/* ++** adrd_ext_u64base_u64index: ++** adr z0\.d, \[z0\.d, z1\.d, uxtw 3\] ++** ret ++*/ ++TEST_ADR (adrd_ext_u64base_u64index, svuint64_t, svuint64_t, ++ z0 = svadrd_u64base_u64index (z0, svextw_u64_x (svptrue_b64 (), z1)), ++ z0 = svadrd_index (z0, svextw_x (svptrue_b64 (), z1))) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/adrh.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/adrh.c +new file mode 100644 +index 
000000000..d06f51fe3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/adrh.c +@@ -0,0 +1,57 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** adrh_u32base_s32index: ++** adr z0\.s, \[z0\.s, z1\.s, lsl 1\] ++** ret ++*/ ++TEST_ADR (adrh_u32base_s32index, svuint32_t, svint32_t, ++ z0 = svadrh_u32base_s32index (z0, z1), ++ z0 = svadrh_index (z0, z1)) ++ ++/* ++** adrh_u32base_u32index: ++** adr z0\.s, \[z0\.s, z1\.s, lsl 1\] ++** ret ++*/ ++TEST_ADR (adrh_u32base_u32index, svuint32_t, svuint32_t, ++ z0 = svadrh_u32base_u32index (z0, z1), ++ z0 = svadrh_index (z0, z1)) ++ ++/* ++** adrh_u64base_s64index: ++** adr z0\.d, \[z0\.d, z1\.d, lsl 1\] ++** ret ++*/ ++TEST_ADR (adrh_u64base_s64index, svuint64_t, svint64_t, ++ z0 = svadrh_u64base_s64index (z0, z1), ++ z0 = svadrh_index (z0, z1)) ++ ++/* ++** adrh_ext_u64base_s64index: ++** adr z0\.d, \[z0\.d, z1\.d, sxtw 1\] ++** ret ++*/ ++TEST_ADR (adrh_ext_u64base_s64index, svuint64_t, svint64_t, ++ z0 = svadrh_u64base_s64index (z0, svextw_s64_x (svptrue_b64 (), z1)), ++ z0 = svadrh_index (z0, svextw_x (svptrue_b64 (), z1))) ++ ++/* ++** adrh_u64base_u64index: ++** adr z0\.d, \[z0\.d, z1\.d, lsl 1\] ++** ret ++*/ ++TEST_ADR (adrh_u64base_u64index, svuint64_t, svuint64_t, ++ z0 = svadrh_u64base_u64index (z0, z1), ++ z0 = svadrh_index (z0, z1)) ++ ++/* ++** adrh_ext_u64base_u64index: ++** adr z0\.d, \[z0\.d, z1\.d, uxtw 1\] ++** ret ++*/ ++TEST_ADR (adrh_ext_u64base_u64index, svuint64_t, svuint64_t, ++ z0 = svadrh_u64base_u64index (z0, svextw_u64_x (svptrue_b64 (), z1)), ++ z0 = svadrh_index (z0, svextw_x (svptrue_b64 (), z1))) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/adrw.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/adrw.c +new file mode 100644 +index 000000000..b23f25a11 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/adrw.c +@@ -0,0 +1,57 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** adrw_u32base_s32index: ++** adr z0\.s, \[z0\.s, z1\.s, lsl 2\] ++** ret ++*/ ++TEST_ADR (adrw_u32base_s32index, svuint32_t, svint32_t, ++ z0 = svadrw_u32base_s32index (z0, z1), ++ z0 = svadrw_index (z0, z1)) ++ ++/* ++** adrw_u32base_u32index: ++** adr z0\.s, \[z0\.s, z1\.s, lsl 2\] ++** ret ++*/ ++TEST_ADR (adrw_u32base_u32index, svuint32_t, svuint32_t, ++ z0 = svadrw_u32base_u32index (z0, z1), ++ z0 = svadrw_index (z0, z1)) ++ ++/* ++** adrw_u64base_s64index: ++** adr z0\.d, \[z0\.d, z1\.d, lsl 2\] ++** ret ++*/ ++TEST_ADR (adrw_u64base_s64index, svuint64_t, svint64_t, ++ z0 = svadrw_u64base_s64index (z0, z1), ++ z0 = svadrw_index (z0, z1)) ++ ++/* ++** adrw_ext_u64base_s64index: ++** adr z0\.d, \[z0\.d, z1\.d, sxtw 2\] ++** ret ++*/ ++TEST_ADR (adrw_ext_u64base_s64index, svuint64_t, svint64_t, ++ z0 = svadrw_u64base_s64index (z0, svextw_s64_x (svptrue_b64 (), z1)), ++ z0 = svadrw_index (z0, svextw_x (svptrue_b64 (), z1))) ++ ++/* ++** adrw_u64base_u64index: ++** adr z0\.d, \[z0\.d, z1\.d, lsl 2\] ++** ret ++*/ ++TEST_ADR (adrw_u64base_u64index, svuint64_t, svuint64_t, ++ z0 = svadrw_u64base_u64index (z0, z1), ++ z0 = svadrw_index (z0, z1)) ++ ++/* ++** adrw_ext_u64base_u64index: ++** adr z0\.d, \[z0\.d, z1\.d, uxtw 2\] ++** ret ++*/ ++TEST_ADR (adrw_ext_u64base_u64index, svuint64_t, svuint64_t, ++ z0 = svadrw_u64base_u64index (z0, svextw_u64_x (svptrue_b64 (), z1)), ++ z0 = svadrw_index (z0, svextw_x (svptrue_b64 (), z1))) +diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_b.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_b.c +new file mode 100644 +index 000000000..f0c4ff1b1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_b.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** and_b_z_tied1: ++** and p0\.b, p3/z, (p0\.b, p1\.b|p1\.b, p0\.b) ++** ret ++*/ ++TEST_UNIFORM_P (and_b_z_tied1, ++ p0 = svand_b_z (p3, p0, p1), ++ p0 = svand_z (p3, p0, p1)) ++ ++/* ++** and_b_z_tied2: ++** and p0\.b, p3/z, (p0\.b, p1\.b|p1\.b, p0\.b) ++** ret ++*/ ++TEST_UNIFORM_P (and_b_z_tied2, ++ p0 = svand_b_z (p3, p1, p0), ++ p0 = svand_z (p3, p1, p0)) ++ ++/* ++** and_b_z_untied: ++** and p0\.b, p3/z, (p1\.b, p2\.b|p2\.b, p1\.b) ++** ret ++*/ ++TEST_UNIFORM_P (and_b_z_untied, ++ p0 = svand_b_z (p3, p1, p2), ++ p0 = svand_z (p3, p1, p2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_s16.c +new file mode 100644 +index 000000000..d54613e91 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_s16.c +@@ -0,0 +1,422 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** and_s16_m_tied1: ++** and z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (and_s16_m_tied1, svint16_t, ++ z0 = svand_s16_m (p0, z0, z1), ++ z0 = svand_m (p0, z0, z1)) ++ ++/* ++** and_s16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** and z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (and_s16_m_tied2, svint16_t, ++ z0 = svand_s16_m (p0, z1, z0), ++ z0 = svand_m (p0, z1, z0)) ++ ++/* ++** and_s16_m_untied: ++** movprfx z0, z1 ++** and z0\.h, p0/m, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (and_s16_m_untied, svint16_t, ++ z0 = svand_s16_m (p0, z1, z2), ++ z0 = svand_m (p0, z1, z2)) ++ ++/* ++** and_w0_s16_m_tied1: ++** mov (z[0-9]+\.h), w0 ++** and z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (and_w0_s16_m_tied1, svint16_t, int16_t, ++ z0 = svand_n_s16_m (p0, z0, x0), ++ z0 = svand_m (p0, z0, x0)) ++ ++/* ++** and_w0_s16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), w0 ++** movprfx z0, z1 ++** and z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (and_w0_s16_m_untied, svint16_t, int16_t, ++ z0 = svand_n_s16_m (p0, z1, x0), ++ z0 = svand_m (p0, z1, x0)) ++ ++/* ++** and_1_s16_m_tied1: ++** mov (z[0-9]+\.h), #1 ++** and z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_1_s16_m_tied1, svint16_t, ++ z0 = svand_n_s16_m (p0, z0, 1), ++ z0 = svand_m (p0, z0, 1)) ++ ++/* ++** and_1_s16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), #1 ++** movprfx z0, z1 ++** and z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_1_s16_m_untied, svint16_t, ++ z0 = svand_n_s16_m (p0, z1, 1), ++ z0 = svand_m (p0, z1, 1)) ++ ++/* ++** and_m2_s16_m: ++** mov (z[0-9]+\.h), #-2 ++** and z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_m2_s16_m, svint16_t, ++ z0 = svand_n_s16_m (p0, z0, -2), ++ z0 = svand_m (p0, z0, -2)) ++ ++/* ++** and_255_s16_m_tied1: ++** uxtb z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (and_255_s16_m_tied1, svint16_t, ++ z0 = svand_n_s16_m (p0, z0, 255), ++ z0 = svand_m (p0, z0, 255)) ++ ++/* ++** and_255_s16_m_untied: ++** movprfx z0, z1 ++** uxtb z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (and_255_s16_m_untied, svint16_t, ++ z0 = svand_n_s16_m (p0, z1, 255), ++ z0 = svand_m (p0, z1, 255)) ++ ++/* ++** and_s16_z_tied1: 
++** movprfx z0\.h, p0/z, z0\.h ++** and z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (and_s16_z_tied1, svint16_t, ++ z0 = svand_s16_z (p0, z0, z1), ++ z0 = svand_z (p0, z0, z1)) ++ ++/* ++** and_s16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** and z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (and_s16_z_tied2, svint16_t, ++ z0 = svand_s16_z (p0, z1, z0), ++ z0 = svand_z (p0, z1, z0)) ++ ++/* ++** and_s16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** and z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** and z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (and_s16_z_untied, svint16_t, ++ z0 = svand_s16_z (p0, z1, z2), ++ z0 = svand_z (p0, z1, z2)) ++ ++/* ++** and_w0_s16_z_tied1: ++** mov (z[0-9]+\.h), w0 ++** movprfx z0\.h, p0/z, z0\.h ++** and z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (and_w0_s16_z_tied1, svint16_t, int16_t, ++ z0 = svand_n_s16_z (p0, z0, x0), ++ z0 = svand_z (p0, z0, x0)) ++ ++/* ++** and_w0_s16_z_untied: ++** mov (z[0-9]+\.h), w0 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** and z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** and z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (and_w0_s16_z_untied, svint16_t, int16_t, ++ z0 = svand_n_s16_z (p0, z1, x0), ++ z0 = svand_z (p0, z1, x0)) ++ ++/* ++** and_1_s16_z_tied1: ++** mov (z[0-9]+\.h), #1 ++** movprfx z0\.h, p0/z, z0\.h ++** and z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_1_s16_z_tied1, svint16_t, ++ z0 = svand_n_s16_z (p0, z0, 1), ++ z0 = svand_z (p0, z0, 1)) ++ ++/* ++** and_1_s16_z_untied: ++** mov (z[0-9]+\.h), #1 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** and z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** and z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (and_1_s16_z_untied, svint16_t, ++ z0 = svand_n_s16_z (p0, z1, 1), ++ z0 = svand_z (p0, z1, 1)) ++ ++/* ++** and_255_s16_z_tied1: ++** ( ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.h, p0/z, \1\.h ++** uxtb z0\.h, p0/m, \1\.h ++** | ++** mov (z[0-9]+\.h), #255 ++** movprfx z0\.h, p0/z, z0\.h ++** and z0\.h, p0/m, z0\.h, \1 ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (and_255_s16_z_tied1, svint16_t, ++ z0 = svand_n_s16_z (p0, z0, 255), ++ z0 = svand_z (p0, z0, 255)) ++ ++/* ++** and_255_s16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** uxtb z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (and_255_s16_z_untied, svint16_t, ++ z0 = svand_n_s16_z (p0, z1, 255), ++ z0 = svand_z (p0, z1, 255)) ++ ++/* ++** and_s16_x_tied1: ++** and z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (and_s16_x_tied1, svint16_t, ++ z0 = svand_s16_x (p0, z0, z1), ++ z0 = svand_x (p0, z0, z1)) ++ ++/* ++** and_s16_x_tied2: ++** and z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (and_s16_x_tied2, svint16_t, ++ z0 = svand_s16_x (p0, z1, z0), ++ z0 = svand_x (p0, z1, z0)) ++ ++/* ++** and_s16_x_untied: ++** and z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (and_s16_x_untied, svint16_t, ++ z0 = svand_s16_x (p0, z1, z2), ++ z0 = svand_x (p0, z1, z2)) ++ ++/* ++** and_w0_s16_x_tied1: ++** mov (z[0-9]+)\.h, w0 ++** and z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (and_w0_s16_x_tied1, svint16_t, int16_t, ++ z0 = svand_n_s16_x (p0, z0, x0), ++ z0 = svand_x (p0, z0, x0)) ++ ++/* ++** and_w0_s16_x_untied: ++** mov (z[0-9]+)\.h, w0 ++** and z0\.d, (z1\.d, \1\.d|\1\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (and_w0_s16_x_untied, svint16_t, int16_t, ++ z0 = svand_n_s16_x 
(p0, z1, x0), ++ z0 = svand_x (p0, z1, x0)) ++ ++/* ++** and_1_s16_x_tied1: ++** and z0\.h, z0\.h, #0x1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_1_s16_x_tied1, svint16_t, ++ z0 = svand_n_s16_x (p0, z0, 1), ++ z0 = svand_x (p0, z0, 1)) ++ ++/* ++** and_1_s16_x_untied: ++** movprfx z0, z1 ++** and z0\.h, z0\.h, #0x1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_1_s16_x_untied, svint16_t, ++ z0 = svand_n_s16_x (p0, z1, 1), ++ z0 = svand_x (p0, z1, 1)) ++ ++/* ++** and_127_s16_x: ++** and z0\.h, z0\.h, #0x7f ++** ret ++*/ ++TEST_UNIFORM_Z (and_127_s16_x, svint16_t, ++ z0 = svand_n_s16_x (p0, z0, 127), ++ z0 = svand_x (p0, z0, 127)) ++ ++/* ++** and_128_s16_x: ++** and z0\.h, z0\.h, #0x80 ++** ret ++*/ ++TEST_UNIFORM_Z (and_128_s16_x, svint16_t, ++ z0 = svand_n_s16_x (p0, z0, 128), ++ z0 = svand_x (p0, z0, 128)) ++ ++/* ++** and_255_s16_x: ++** and z0\.h, z0\.h, #0xff ++** ret ++*/ ++TEST_UNIFORM_Z (and_255_s16_x, svint16_t, ++ z0 = svand_n_s16_x (p0, z0, 255), ++ z0 = svand_x (p0, z0, 255)) ++ ++/* ++** and_256_s16_x: ++** and z0\.h, z0\.h, #0x100 ++** ret ++*/ ++TEST_UNIFORM_Z (and_256_s16_x, svint16_t, ++ z0 = svand_n_s16_x (p0, z0, 256), ++ z0 = svand_x (p0, z0, 256)) ++ ++/* ++** and_257_s16_x: ++** and z0\.h, z0\.h, #0x101 ++** ret ++*/ ++TEST_UNIFORM_Z (and_257_s16_x, svint16_t, ++ z0 = svand_n_s16_x (p0, z0, 257), ++ z0 = svand_x (p0, z0, 257)) ++ ++/* ++** and_512_s16_x: ++** and z0\.h, z0\.h, #0x200 ++** ret ++*/ ++TEST_UNIFORM_Z (and_512_s16_x, svint16_t, ++ z0 = svand_n_s16_x (p0, z0, 512), ++ z0 = svand_x (p0, z0, 512)) ++ ++/* ++** and_65280_s16_x: ++** and z0\.h, z0\.h, #0xff00 ++** ret ++*/ ++TEST_UNIFORM_Z (and_65280_s16_x, svint16_t, ++ z0 = svand_n_s16_x (p0, z0, 0xff00), ++ z0 = svand_x (p0, z0, 0xff00)) ++ ++/* ++** and_m127_s16_x: ++** and z0\.h, z0\.h, #0xff81 ++** ret ++*/ ++TEST_UNIFORM_Z (and_m127_s16_x, svint16_t, ++ z0 = svand_n_s16_x (p0, z0, -127), ++ z0 = svand_x (p0, z0, -127)) ++ ++/* ++** and_m128_s16_x: ++** and z0\.h, z0\.h, #0xff80 ++** ret ++*/ ++TEST_UNIFORM_Z (and_m128_s16_x, svint16_t, ++ z0 = svand_n_s16_x (p0, z0, -128), ++ z0 = svand_x (p0, z0, -128)) ++ ++/* ++** and_m255_s16_x: ++** and z0\.h, z0\.h, #0xff01 ++** ret ++*/ ++TEST_UNIFORM_Z (and_m255_s16_x, svint16_t, ++ z0 = svand_n_s16_x (p0, z0, -255), ++ z0 = svand_x (p0, z0, -255)) ++ ++/* ++** and_m256_s16_x: ++** and z0\.h, z0\.h, #0xff00 ++** ret ++*/ ++TEST_UNIFORM_Z (and_m256_s16_x, svint16_t, ++ z0 = svand_n_s16_x (p0, z0, -256), ++ z0 = svand_x (p0, z0, -256)) ++ ++/* ++** and_m257_s16_x: ++** and z0\.h, z0\.h, #0xfeff ++** ret ++*/ ++TEST_UNIFORM_Z (and_m257_s16_x, svint16_t, ++ z0 = svand_n_s16_x (p0, z0, -257), ++ z0 = svand_x (p0, z0, -257)) ++ ++/* ++** and_m512_s16_x: ++** and z0\.h, z0\.h, #0xfe00 ++** ret ++*/ ++TEST_UNIFORM_Z (and_m512_s16_x, svint16_t, ++ z0 = svand_n_s16_x (p0, z0, -512), ++ z0 = svand_x (p0, z0, -512)) ++ ++/* ++** and_m32768_s16_x: ++** and z0\.h, z0\.h, #0x8000 ++** ret ++*/ ++TEST_UNIFORM_Z (and_m32768_s16_x, svint16_t, ++ z0 = svand_n_s16_x (p0, z0, -0x8000), ++ z0 = svand_x (p0, z0, -0x8000)) ++ ++/* ++** and_5_s16_x: ++** mov (z[0-9]+)\.h, #5 ++** and z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (and_5_s16_x, svint16_t, ++ z0 = svand_n_s16_x (p0, z0, 5), ++ z0 = svand_x (p0, z0, 5)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_s32.c +new file mode 100644 +index 000000000..7f4082b32 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_s32.c +@@ -0,0 +1,464 
@@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** and_s32_m_tied1: ++** and z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (and_s32_m_tied1, svint32_t, ++ z0 = svand_s32_m (p0, z0, z1), ++ z0 = svand_m (p0, z0, z1)) ++ ++/* ++** and_s32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** and z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (and_s32_m_tied2, svint32_t, ++ z0 = svand_s32_m (p0, z1, z0), ++ z0 = svand_m (p0, z1, z0)) ++ ++/* ++** and_s32_m_untied: ++** movprfx z0, z1 ++** and z0\.s, p0/m, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (and_s32_m_untied, svint32_t, ++ z0 = svand_s32_m (p0, z1, z2), ++ z0 = svand_m (p0, z1, z2)) ++ ++/* ++** and_w0_s32_m_tied1: ++** mov (z[0-9]+\.s), w0 ++** and z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (and_w0_s32_m_tied1, svint32_t, int32_t, ++ z0 = svand_n_s32_m (p0, z0, x0), ++ z0 = svand_m (p0, z0, x0)) ++ ++/* ++** and_w0_s32_m_untied: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0, z1 ++** and z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (and_w0_s32_m_untied, svint32_t, int32_t, ++ z0 = svand_n_s32_m (p0, z1, x0), ++ z0 = svand_m (p0, z1, x0)) ++ ++/* ++** and_1_s32_m_tied1: ++** mov (z[0-9]+\.s), #1 ++** and z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_1_s32_m_tied1, svint32_t, ++ z0 = svand_n_s32_m (p0, z0, 1), ++ z0 = svand_m (p0, z0, 1)) ++ ++/* ++** and_1_s32_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.s), #1 ++** movprfx z0, z1 ++** and z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_1_s32_m_untied, svint32_t, ++ z0 = svand_n_s32_m (p0, z1, 1), ++ z0 = svand_m (p0, z1, 1)) ++ ++/* ++** and_m2_s32_m: ++** mov (z[0-9]+\.s), #-2 ++** and z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_m2_s32_m, svint32_t, ++ z0 = svand_n_s32_m (p0, z0, -2), ++ z0 = svand_m (p0, z0, -2)) ++ ++/* ++** and_255_s32_m_tied1: ++** uxtb z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (and_255_s32_m_tied1, svint32_t, ++ z0 = svand_n_s32_m (p0, z0, 255), ++ z0 = svand_m (p0, z0, 255)) ++ ++/* ++** and_255_s32_m_untied: ++** movprfx z0, z1 ++** uxtb z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (and_255_s32_m_untied, svint32_t, ++ z0 = svand_n_s32_m (p0, z1, 255), ++ z0 = svand_m (p0, z1, 255)) ++ ++/* ++** and_65535_s32_m_tied1: ++** uxth z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (and_65535_s32_m_tied1, svint32_t, ++ z0 = svand_n_s32_m (p0, z0, 65535), ++ z0 = svand_m (p0, z0, 65535)) ++ ++/* ++** and_65535_s32_m_untied: ++** movprfx z0, z1 ++** uxth z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (and_65535_s32_m_untied, svint32_t, ++ z0 = svand_n_s32_m (p0, z1, 65535), ++ z0 = svand_m (p0, z1, 65535)) ++ ++/* ++** and_s32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** and z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (and_s32_z_tied1, svint32_t, ++ z0 = svand_s32_z (p0, z0, z1), ++ z0 = svand_z (p0, z0, z1)) ++ ++/* ++** and_s32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** and z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (and_s32_z_tied2, svint32_t, ++ z0 = svand_s32_z (p0, z1, z0), ++ z0 = svand_z (p0, z1, z0)) ++ ++/* ++** and_s32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** and z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** and z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (and_s32_z_untied, svint32_t, ++ z0 = svand_s32_z (p0, z1, z2), ++ z0 = svand_z (p0, z1, z2)) ++ ++/* ++** and_w0_s32_z_tied1: ++** mov (z[0-9]+\.s), w0 
++** movprfx z0\.s, p0/z, z0\.s ++** and z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (and_w0_s32_z_tied1, svint32_t, int32_t, ++ z0 = svand_n_s32_z (p0, z0, x0), ++ z0 = svand_z (p0, z0, x0)) ++ ++/* ++** and_w0_s32_z_untied: ++** mov (z[0-9]+\.s), w0 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** and z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** and z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (and_w0_s32_z_untied, svint32_t, int32_t, ++ z0 = svand_n_s32_z (p0, z1, x0), ++ z0 = svand_z (p0, z1, x0)) ++ ++/* ++** and_1_s32_z_tied1: ++** mov (z[0-9]+\.s), #1 ++** movprfx z0\.s, p0/z, z0\.s ++** and z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_1_s32_z_tied1, svint32_t, ++ z0 = svand_n_s32_z (p0, z0, 1), ++ z0 = svand_z (p0, z0, 1)) ++ ++/* ++** and_1_s32_z_untied: ++** mov (z[0-9]+\.s), #1 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** and z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** and z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (and_1_s32_z_untied, svint32_t, ++ z0 = svand_n_s32_z (p0, z1, 1), ++ z0 = svand_z (p0, z1, 1)) ++ ++/* ++** and_255_s32_z_tied1: ++** ( ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, \1\.s ++** uxtb z0\.s, p0/m, \1\.s ++** | ++** mov (z[0-9]+\.s), #255 ++** movprfx z0\.s, p0/z, z0\.s ++** and z0\.s, p0/m, z0\.s, \1 ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (and_255_s32_z_tied1, svint32_t, ++ z0 = svand_n_s32_z (p0, z0, 255), ++ z0 = svand_z (p0, z0, 255)) ++ ++/* ++** and_255_s32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** uxtb z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (and_255_s32_z_untied, svint32_t, ++ z0 = svand_n_s32_z (p0, z1, 255), ++ z0 = svand_z (p0, z1, 255)) ++ ++/* ++** and_65535_s32_z_tied1: ++** ( ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, \1\.s ++** uxth z0\.s, p0/m, \1\.s ++** | ++** mov (z[0-9]+\.s), #65535 ++** movprfx z0\.s, p0/z, z0\.s ++** and z0\.s, p0/m, z0\.s, \1 ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (and_65535_s32_z_tied1, svint32_t, ++ z0 = svand_n_s32_z (p0, z0, 65535), ++ z0 = svand_z (p0, z0, 65535)) ++ ++/* ++** and_65535_s32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** uxth z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (and_65535_s32_z_untied, svint32_t, ++ z0 = svand_n_s32_z (p0, z1, 65535), ++ z0 = svand_z (p0, z1, 65535)) ++ ++/* ++** and_s32_x_tied1: ++** and z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (and_s32_x_tied1, svint32_t, ++ z0 = svand_s32_x (p0, z0, z1), ++ z0 = svand_x (p0, z0, z1)) ++ ++/* ++** and_s32_x_tied2: ++** and z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (and_s32_x_tied2, svint32_t, ++ z0 = svand_s32_x (p0, z1, z0), ++ z0 = svand_x (p0, z1, z0)) ++ ++/* ++** and_s32_x_untied: ++** and z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (and_s32_x_untied, svint32_t, ++ z0 = svand_s32_x (p0, z1, z2), ++ z0 = svand_x (p0, z1, z2)) ++ ++/* ++** and_w0_s32_x_tied1: ++** mov (z[0-9]+)\.s, w0 ++** and z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (and_w0_s32_x_tied1, svint32_t, int32_t, ++ z0 = svand_n_s32_x (p0, z0, x0), ++ z0 = svand_x (p0, z0, x0)) ++ ++/* ++** and_w0_s32_x_untied: ++** mov (z[0-9]+)\.s, w0 ++** and z0\.d, (z1\.d, \1\.d|\1\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (and_w0_s32_x_untied, svint32_t, int32_t, ++ z0 = svand_n_s32_x (p0, z1, x0), ++ z0 = svand_x (p0, z1, x0)) ++ ++/* ++** and_1_s32_x_tied1: ++** and z0\.s, z0\.s, #0x1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_1_s32_x_tied1, svint32_t, 
++ z0 = svand_n_s32_x (p0, z0, 1), ++ z0 = svand_x (p0, z0, 1)) ++ ++/* ++** and_1_s32_x_untied: ++** movprfx z0, z1 ++** and z0\.s, z0\.s, #0x1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_1_s32_x_untied, svint32_t, ++ z0 = svand_n_s32_x (p0, z1, 1), ++ z0 = svand_x (p0, z1, 1)) ++ ++/* ++** and_127_s32_x: ++** and z0\.s, z0\.s, #0x7f ++** ret ++*/ ++TEST_UNIFORM_Z (and_127_s32_x, svint32_t, ++ z0 = svand_n_s32_x (p0, z0, 127), ++ z0 = svand_x (p0, z0, 127)) ++ ++/* ++** and_128_s32_x: ++** and z0\.s, z0\.s, #0x80 ++** ret ++*/ ++TEST_UNIFORM_Z (and_128_s32_x, svint32_t, ++ z0 = svand_n_s32_x (p0, z0, 128), ++ z0 = svand_x (p0, z0, 128)) ++ ++/* ++** and_255_s32_x: ++** and z0\.s, z0\.s, #0xff ++** ret ++*/ ++TEST_UNIFORM_Z (and_255_s32_x, svint32_t, ++ z0 = svand_n_s32_x (p0, z0, 255), ++ z0 = svand_x (p0, z0, 255)) ++ ++/* ++** and_256_s32_x: ++** and z0\.s, z0\.s, #0x100 ++** ret ++*/ ++TEST_UNIFORM_Z (and_256_s32_x, svint32_t, ++ z0 = svand_n_s32_x (p0, z0, 256), ++ z0 = svand_x (p0, z0, 256)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (and_257_s32_x, svint32_t, ++ z0 = svand_n_s32_x (p0, z0, 257), ++ z0 = svand_x (p0, z0, 257)) ++ ++/* ++** and_512_s32_x: ++** and z0\.s, z0\.s, #0x200 ++** ret ++*/ ++TEST_UNIFORM_Z (and_512_s32_x, svint32_t, ++ z0 = svand_n_s32_x (p0, z0, 512), ++ z0 = svand_x (p0, z0, 512)) ++ ++/* ++** and_65280_s32_x: ++** and z0\.s, z0\.s, #0xff00 ++** ret ++*/ ++TEST_UNIFORM_Z (and_65280_s32_x, svint32_t, ++ z0 = svand_n_s32_x (p0, z0, 0xff00), ++ z0 = svand_x (p0, z0, 0xff00)) ++ ++/* ++** and_m127_s32_x: ++** and z0\.s, z0\.s, #0xffffff81 ++** ret ++*/ ++TEST_UNIFORM_Z (and_m127_s32_x, svint32_t, ++ z0 = svand_n_s32_x (p0, z0, -127), ++ z0 = svand_x (p0, z0, -127)) ++ ++/* ++** and_m128_s32_x: ++** and z0\.s, z0\.s, #0xffffff80 ++** ret ++*/ ++TEST_UNIFORM_Z (and_m128_s32_x, svint32_t, ++ z0 = svand_n_s32_x (p0, z0, -128), ++ z0 = svand_x (p0, z0, -128)) ++ ++/* ++** and_m255_s32_x: ++** and z0\.s, z0\.s, #0xffffff01 ++** ret ++*/ ++TEST_UNIFORM_Z (and_m255_s32_x, svint32_t, ++ z0 = svand_n_s32_x (p0, z0, -255), ++ z0 = svand_x (p0, z0, -255)) ++ ++/* ++** and_m256_s32_x: ++** and z0\.s, z0\.s, #0xffffff00 ++** ret ++*/ ++TEST_UNIFORM_Z (and_m256_s32_x, svint32_t, ++ z0 = svand_n_s32_x (p0, z0, -256), ++ z0 = svand_x (p0, z0, -256)) ++ ++/* ++** and_m257_s32_x: ++** and z0\.s, z0\.s, #0xfffffeff ++** ret ++*/ ++TEST_UNIFORM_Z (and_m257_s32_x, svint32_t, ++ z0 = svand_n_s32_x (p0, z0, -257), ++ z0 = svand_x (p0, z0, -257)) ++ ++/* ++** and_m512_s32_x: ++** and z0\.s, z0\.s, #0xfffffe00 ++** ret ++*/ ++TEST_UNIFORM_Z (and_m512_s32_x, svint32_t, ++ z0 = svand_n_s32_x (p0, z0, -512), ++ z0 = svand_x (p0, z0, -512)) ++ ++/* ++** and_m32768_s32_x: ++** and z0\.s, z0\.s, #0xffff8000 ++** ret ++*/ ++TEST_UNIFORM_Z (and_m32768_s32_x, svint32_t, ++ z0 = svand_n_s32_x (p0, z0, -0x8000), ++ z0 = svand_x (p0, z0, -0x8000)) ++ ++/* ++** and_5_s32_x: ++** mov (z[0-9]+)\.s, #5 ++** and z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (and_5_s32_x, svint32_t, ++ z0 = svand_n_s32_x (p0, z0, 5), ++ z0 = svand_x (p0, z0, 5)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_s64.c +new file mode 100644 +index 000000000..8868258dc +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_s64.c +@@ -0,0 +1,510 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** and_s64_m_tied1: ++** and z0\.d, p0/m, z0\.d, 
z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (and_s64_m_tied1, svint64_t, ++ z0 = svand_s64_m (p0, z0, z1), ++ z0 = svand_m (p0, z0, z1)) ++ ++/* ++** and_s64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** and z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_s64_m_tied2, svint64_t, ++ z0 = svand_s64_m (p0, z1, z0), ++ z0 = svand_m (p0, z1, z0)) ++ ++/* ++** and_s64_m_untied: ++** movprfx z0, z1 ++** and z0\.d, p0/m, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (and_s64_m_untied, svint64_t, ++ z0 = svand_s64_m (p0, z1, z2), ++ z0 = svand_m (p0, z1, z2)) ++ ++/* ++** and_x0_s64_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** and z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (and_x0_s64_m_tied1, svint64_t, int64_t, ++ z0 = svand_n_s64_m (p0, z0, x0), ++ z0 = svand_m (p0, z0, x0)) ++ ++/* ++** and_x0_s64_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** and z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (and_x0_s64_m_untied, svint64_t, int64_t, ++ z0 = svand_n_s64_m (p0, z1, x0), ++ z0 = svand_m (p0, z1, x0)) ++ ++/* ++** and_1_s64_m_tied1: ++** mov (z[0-9]+\.d), #1 ++** and z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_1_s64_m_tied1, svint64_t, ++ z0 = svand_n_s64_m (p0, z0, 1), ++ z0 = svand_m (p0, z0, 1)) ++ ++/* ++** and_1_s64_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #1 ++** movprfx z0, z1 ++** and z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_1_s64_m_untied, svint64_t, ++ z0 = svand_n_s64_m (p0, z1, 1), ++ z0 = svand_m (p0, z1, 1)) ++ ++/* ++** and_m2_s64_m: ++** mov (z[0-9]+\.d), #-2 ++** and z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_m2_s64_m, svint64_t, ++ z0 = svand_n_s64_m (p0, z0, -2), ++ z0 = svand_m (p0, z0, -2)) ++ ++/* ++** and_255_s64_m_tied1: ++** uxtb z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (and_255_s64_m_tied1, svint64_t, ++ z0 = svand_n_s64_m (p0, z0, 255), ++ z0 = svand_m (p0, z0, 255)) ++ ++/* ++** and_255_s64_m_untied: ++** movprfx z0, z1 ++** uxtb z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (and_255_s64_m_untied, svint64_t, ++ z0 = svand_n_s64_m (p0, z1, 255), ++ z0 = svand_m (p0, z1, 255)) ++ ++/* ++** and_65535_s64_m_tied1: ++** uxth z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (and_65535_s64_m_tied1, svint64_t, ++ z0 = svand_n_s64_m (p0, z0, 65535), ++ z0 = svand_m (p0, z0, 65535)) ++ ++/* ++** and_65535_s64_m_untied: ++** movprfx z0, z1 ++** uxth z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (and_65535_s64_m_untied, svint64_t, ++ z0 = svand_n_s64_m (p0, z1, 65535), ++ z0 = svand_m (p0, z1, 65535)) ++ ++/* ++** and_0xffffffff_s64_m_tied1: ++** uxtw z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (and_0xffffffff_s64_m_tied1, svint64_t, ++ z0 = svand_n_s64_m (p0, z0, 0xffffffff), ++ z0 = svand_m (p0, z0, 0xffffffff)) ++ ++/* ++** and_0xffffffff_s64_m_untied: ++** movprfx z0, z1 ++** uxtw z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (and_0xffffffff_s64_m_untied, svint64_t, ++ z0 = svand_n_s64_m (p0, z1, 0xffffffff), ++ z0 = svand_m (p0, z1, 0xffffffff)) ++ ++/* ++** and_s64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** and z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (and_s64_z_tied1, svint64_t, ++ z0 = svand_s64_z (p0, z0, z1), ++ z0 = svand_z (p0, z0, z1)) ++ ++/* ++** and_s64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** and z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (and_s64_z_tied2, svint64_t, ++ z0 = svand_s64_z (p0, z1, z0), ++ z0 = svand_z (p0, z1, z0)) ++ ++/* ++** and_s64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, 
z1\.d ++** and z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** and z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (and_s64_z_untied, svint64_t, ++ z0 = svand_s64_z (p0, z1, z2), ++ z0 = svand_z (p0, z1, z2)) ++ ++/* ++** and_x0_s64_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z0\.d ++** and z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (and_x0_s64_z_tied1, svint64_t, int64_t, ++ z0 = svand_n_s64_z (p0, z0, x0), ++ z0 = svand_z (p0, z0, x0)) ++ ++/* ++** and_x0_s64_z_untied: ++** mov (z[0-9]+\.d), x0 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** and z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** and z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (and_x0_s64_z_untied, svint64_t, int64_t, ++ z0 = svand_n_s64_z (p0, z1, x0), ++ z0 = svand_z (p0, z1, x0)) ++ ++/* ++** and_1_s64_z_tied1: ++** mov (z[0-9]+\.d), #1 ++** movprfx z0\.d, p0/z, z0\.d ++** and z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_1_s64_z_tied1, svint64_t, ++ z0 = svand_n_s64_z (p0, z0, 1), ++ z0 = svand_z (p0, z0, 1)) ++ ++/* ++** and_1_s64_z_untied: ++** mov (z[0-9]+\.d), #1 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** and z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** and z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (and_1_s64_z_untied, svint64_t, ++ z0 = svand_n_s64_z (p0, z1, 1), ++ z0 = svand_z (p0, z1, 1)) ++ ++/* ++** and_255_s64_z_tied1: ++** ( ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** uxtb z0\.d, p0/m, \1 ++** | ++** mov (z[0-9]+\.d), #255 ++** movprfx z0\.d, p0/z, z0\.d ++** and z0\.d, p0/m, z0\.d, \1 ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (and_255_s64_z_tied1, svint64_t, ++ z0 = svand_n_s64_z (p0, z0, 255), ++ z0 = svand_z (p0, z0, 255)) ++ ++/* ++** and_255_s64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** uxtb z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (and_255_s64_z_untied, svint64_t, ++ z0 = svand_n_s64_z (p0, z1, 255), ++ z0 = svand_z (p0, z1, 255)) ++ ++/* ++** and_65535_s64_z_tied1: ++** ( ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** uxth z0\.d, p0/m, \1 ++** | ++** mov (z[0-9]+\.d), #65535 ++** movprfx z0\.d, p0/z, z0\.d ++** and z0\.d, p0/m, z0\.d, \1 ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (and_65535_s64_z_tied1, svint64_t, ++ z0 = svand_n_s64_z (p0, z0, 65535), ++ z0 = svand_z (p0, z0, 65535)) ++ ++/* ++** and_65535_s64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** uxth z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (and_65535_s64_z_untied, svint64_t, ++ z0 = svand_n_s64_z (p0, z1, 65535), ++ z0 = svand_z (p0, z1, 65535)) ++ ++/* ++** and_0xffffffff_s64_z_tied1: ++** ( ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** uxtw z0\.d, p0/m, \1 ++** | ++** mov (z[0-9]+\.d), #4294967295 ++** movprfx z0\.d, p0/z, z0\.d ++** and z0\.d, p0/m, z0\.d, \1 ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (and_0xffffffff_s64_z_tied1, svint64_t, ++ z0 = svand_n_s64_z (p0, z0, 0xffffffff), ++ z0 = svand_z (p0, z0, 0xffffffff)) ++ ++/* ++** and_0xffffffff_s64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** uxtw z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (and_0xffffffff_s64_z_untied, svint64_t, ++ z0 = svand_n_s64_z (p0, z1, 0xffffffff), ++ z0 = svand_z (p0, z1, 0xffffffff)) ++ ++/* ++** and_s64_x_tied1: ++** and z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (and_s64_x_tied1, svint64_t, ++ z0 = svand_s64_x (p0, z0, z1), ++ z0 = svand_x (p0, z0, z1)) ++ ++/* ++** and_s64_x_tied2: ++** and z0\.d, (z0\.d, 
z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (and_s64_x_tied2, svint64_t, ++ z0 = svand_s64_x (p0, z1, z0), ++ z0 = svand_x (p0, z1, z0)) ++ ++/* ++** and_s64_x_untied: ++** and z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (and_s64_x_untied, svint64_t, ++ z0 = svand_s64_x (p0, z1, z2), ++ z0 = svand_x (p0, z1, z2)) ++ ++/* ++** and_x0_s64_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** and z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (and_x0_s64_x_tied1, svint64_t, int64_t, ++ z0 = svand_n_s64_x (p0, z0, x0), ++ z0 = svand_x (p0, z0, x0)) ++ ++/* ++** and_x0_s64_x_untied: ++** mov (z[0-9]+\.d), x0 ++** and z0\.d, (z1\.d, \1|\1, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (and_x0_s64_x_untied, svint64_t, int64_t, ++ z0 = svand_n_s64_x (p0, z1, x0), ++ z0 = svand_x (p0, z1, x0)) ++ ++/* ++** and_1_s64_x_tied1: ++** and z0\.d, z0\.d, #0x1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_1_s64_x_tied1, svint64_t, ++ z0 = svand_n_s64_x (p0, z0, 1), ++ z0 = svand_x (p0, z0, 1)) ++ ++/* ++** and_1_s64_x_untied: ++** movprfx z0, z1 ++** and z0\.d, z0\.d, #0x1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_1_s64_x_untied, svint64_t, ++ z0 = svand_n_s64_x (p0, z1, 1), ++ z0 = svand_x (p0, z1, 1)) ++ ++/* ++** and_127_s64_x: ++** and z0\.d, z0\.d, #0x7f ++** ret ++*/ ++TEST_UNIFORM_Z (and_127_s64_x, svint64_t, ++ z0 = svand_n_s64_x (p0, z0, 127), ++ z0 = svand_x (p0, z0, 127)) ++ ++/* ++** and_128_s64_x: ++** and z0\.d, z0\.d, #0x80 ++** ret ++*/ ++TEST_UNIFORM_Z (and_128_s64_x, svint64_t, ++ z0 = svand_n_s64_x (p0, z0, 128), ++ z0 = svand_x (p0, z0, 128)) ++ ++/* ++** and_255_s64_x: ++** and z0\.d, z0\.d, #0xff ++** ret ++*/ ++TEST_UNIFORM_Z (and_255_s64_x, svint64_t, ++ z0 = svand_n_s64_x (p0, z0, 255), ++ z0 = svand_x (p0, z0, 255)) ++ ++/* ++** and_256_s64_x: ++** and z0\.d, z0\.d, #0x100 ++** ret ++*/ ++TEST_UNIFORM_Z (and_256_s64_x, svint64_t, ++ z0 = svand_n_s64_x (p0, z0, 256), ++ z0 = svand_x (p0, z0, 256)) ++ ++/* TODO: Bad code and needs fixing. 
*/ ++TEST_UNIFORM_Z (and_257_s64_x, svint64_t, ++ z0 = svand_n_s64_x (p0, z0, 257), ++ z0 = svand_x (p0, z0, 257)) ++ ++/* ++** and_512_s64_x: ++** and z0\.d, z0\.d, #0x200 ++** ret ++*/ ++TEST_UNIFORM_Z (and_512_s64_x, svint64_t, ++ z0 = svand_n_s64_x (p0, z0, 512), ++ z0 = svand_x (p0, z0, 512)) ++ ++/* ++** and_65280_s64_x: ++** and z0\.d, z0\.d, #0xff00 ++** ret ++*/ ++TEST_UNIFORM_Z (and_65280_s64_x, svint64_t, ++ z0 = svand_n_s64_x (p0, z0, 0xff00), ++ z0 = svand_x (p0, z0, 0xff00)) ++ ++/* ++** and_m127_s64_x: ++** and z0\.d, z0\.d, #0xffffffffffffff81 ++** ret ++*/ ++TEST_UNIFORM_Z (and_m127_s64_x, svint64_t, ++ z0 = svand_n_s64_x (p0, z0, -127), ++ z0 = svand_x (p0, z0, -127)) ++ ++/* ++** and_m128_s64_x: ++** and z0\.d, z0\.d, #0xffffffffffffff80 ++** ret ++*/ ++TEST_UNIFORM_Z (and_m128_s64_x, svint64_t, ++ z0 = svand_n_s64_x (p0, z0, -128), ++ z0 = svand_x (p0, z0, -128)) ++ ++/* ++** and_m255_s64_x: ++** and z0\.d, z0\.d, #0xffffffffffffff01 ++** ret ++*/ ++TEST_UNIFORM_Z (and_m255_s64_x, svint64_t, ++ z0 = svand_n_s64_x (p0, z0, -255), ++ z0 = svand_x (p0, z0, -255)) ++ ++/* ++** and_m256_s64_x: ++** and z0\.d, z0\.d, #0xffffffffffffff00 ++** ret ++*/ ++TEST_UNIFORM_Z (and_m256_s64_x, svint64_t, ++ z0 = svand_n_s64_x (p0, z0, -256), ++ z0 = svand_x (p0, z0, -256)) ++ ++/* ++** and_m257_s64_x: ++** and z0\.d, z0\.d, #0xfffffffffffffeff ++** ret ++*/ ++TEST_UNIFORM_Z (and_m257_s64_x, svint64_t, ++ z0 = svand_n_s64_x (p0, z0, -257), ++ z0 = svand_x (p0, z0, -257)) ++ ++/* ++** and_m512_s64_x: ++** and z0\.d, z0\.d, #0xfffffffffffffe00 ++** ret ++*/ ++TEST_UNIFORM_Z (and_m512_s64_x, svint64_t, ++ z0 = svand_n_s64_x (p0, z0, -512), ++ z0 = svand_x (p0, z0, -512)) ++ ++/* ++** and_m32768_s64_x: ++** and z0\.d, z0\.d, #0xffffffffffff8000 ++** ret ++*/ ++TEST_UNIFORM_Z (and_m32768_s64_x, svint64_t, ++ z0 = svand_n_s64_x (p0, z0, -0x8000), ++ z0 = svand_x (p0, z0, -0x8000)) ++ ++/* ++** and_5_s64_x: ++** mov (z[0-9]+\.d), #5 ++** and z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (and_5_s64_x, svint64_t, ++ z0 = svand_n_s64_x (p0, z0, 5), ++ z0 = svand_x (p0, z0, 5)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_s8.c +new file mode 100644 +index 000000000..61d168d3f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_s8.c +@@ -0,0 +1,294 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** and_s8_m_tied1: ++** and z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (and_s8_m_tied1, svint8_t, ++ z0 = svand_s8_m (p0, z0, z1), ++ z0 = svand_m (p0, z0, z1)) ++ ++/* ++** and_s8_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** and z0\.b, p0/m, z0\.b, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (and_s8_m_tied2, svint8_t, ++ z0 = svand_s8_m (p0, z1, z0), ++ z0 = svand_m (p0, z1, z0)) ++ ++/* ++** and_s8_m_untied: ++** movprfx z0, z1 ++** and z0\.b, p0/m, z0\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (and_s8_m_untied, svint8_t, ++ z0 = svand_s8_m (p0, z1, z2), ++ z0 = svand_m (p0, z1, z2)) ++ ++/* ++** and_w0_s8_m_tied1: ++** mov (z[0-9]+\.b), w0 ++** and z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (and_w0_s8_m_tied1, svint8_t, int8_t, ++ z0 = svand_n_s8_m (p0, z0, x0), ++ z0 = svand_m (p0, z0, x0)) ++ ++/* ++** and_w0_s8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), w0 ++** movprfx z0, z1 ++** and z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (and_w0_s8_m_untied, svint8_t, int8_t, ++ z0 = 
svand_n_s8_m (p0, z1, x0), ++ z0 = svand_m (p0, z1, x0)) ++ ++/* ++** and_1_s8_m_tied1: ++** mov (z[0-9]+\.b), #1 ++** and z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_1_s8_m_tied1, svint8_t, ++ z0 = svand_n_s8_m (p0, z0, 1), ++ z0 = svand_m (p0, z0, 1)) ++ ++/* ++** and_1_s8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), #1 ++** movprfx z0, z1 ++** and z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_1_s8_m_untied, svint8_t, ++ z0 = svand_n_s8_m (p0, z1, 1), ++ z0 = svand_m (p0, z1, 1)) ++ ++/* ++** and_m2_s8_m: ++** mov (z[0-9]+\.b), #-2 ++** and z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_m2_s8_m, svint8_t, ++ z0 = svand_n_s8_m (p0, z0, -2), ++ z0 = svand_m (p0, z0, -2)) ++ ++/* ++** and_s8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** and z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (and_s8_z_tied1, svint8_t, ++ z0 = svand_s8_z (p0, z0, z1), ++ z0 = svand_z (p0, z0, z1)) ++ ++/* ++** and_s8_z_tied2: ++** movprfx z0\.b, p0/z, z0\.b ++** and z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (and_s8_z_tied2, svint8_t, ++ z0 = svand_s8_z (p0, z1, z0), ++ z0 = svand_z (p0, z1, z0)) ++ ++/* ++** and_s8_z_untied: ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** and z0\.b, p0/m, z0\.b, z2\.b ++** | ++** movprfx z0\.b, p0/z, z2\.b ++** and z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (and_s8_z_untied, svint8_t, ++ z0 = svand_s8_z (p0, z1, z2), ++ z0 = svand_z (p0, z1, z2)) ++ ++/* ++** and_w0_s8_z_tied1: ++** mov (z[0-9]+\.b), w0 ++** movprfx z0\.b, p0/z, z0\.b ++** and z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (and_w0_s8_z_tied1, svint8_t, int8_t, ++ z0 = svand_n_s8_z (p0, z0, x0), ++ z0 = svand_z (p0, z0, x0)) ++ ++/* ++** and_w0_s8_z_untied: ++** mov (z[0-9]+\.b), w0 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** and z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** and z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (and_w0_s8_z_untied, svint8_t, int8_t, ++ z0 = svand_n_s8_z (p0, z1, x0), ++ z0 = svand_z (p0, z1, x0)) ++ ++/* ++** and_1_s8_z_tied1: ++** mov (z[0-9]+\.b), #1 ++** movprfx z0\.b, p0/z, z0\.b ++** and z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_1_s8_z_tied1, svint8_t, ++ z0 = svand_n_s8_z (p0, z0, 1), ++ z0 = svand_z (p0, z0, 1)) ++ ++/* ++** and_1_s8_z_untied: ++** mov (z[0-9]+\.b), #1 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** and z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** and z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (and_1_s8_z_untied, svint8_t, ++ z0 = svand_n_s8_z (p0, z1, 1), ++ z0 = svand_z (p0, z1, 1)) ++ ++/* ++** and_s8_x_tied1: ++** and z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (and_s8_x_tied1, svint8_t, ++ z0 = svand_s8_x (p0, z0, z1), ++ z0 = svand_x (p0, z0, z1)) ++ ++/* ++** and_s8_x_tied2: ++** and z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (and_s8_x_tied2, svint8_t, ++ z0 = svand_s8_x (p0, z1, z0), ++ z0 = svand_x (p0, z1, z0)) ++ ++/* ++** and_s8_x_untied: ++** and z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (and_s8_x_untied, svint8_t, ++ z0 = svand_s8_x (p0, z1, z2), ++ z0 = svand_x (p0, z1, z2)) ++ ++/* ++** and_w0_s8_x_tied1: ++** mov (z[0-9]+)\.b, w0 ++** and z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (and_w0_s8_x_tied1, svint8_t, int8_t, ++ z0 = svand_n_s8_x (p0, z0, x0), ++ z0 = svand_x (p0, z0, x0)) ++ ++/* ++** and_w0_s8_x_untied: ++** mov (z[0-9]+)\.b, w0 ++** and z0\.d, (z1\.d, 
\1\.d|\1\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (and_w0_s8_x_untied, svint8_t, int8_t, ++ z0 = svand_n_s8_x (p0, z1, x0), ++ z0 = svand_x (p0, z1, x0)) ++ ++/* ++** and_1_s8_x_tied1: ++** and z0\.b, z0\.b, #0x1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_1_s8_x_tied1, svint8_t, ++ z0 = svand_n_s8_x (p0, z0, 1), ++ z0 = svand_x (p0, z0, 1)) ++ ++/* ++** and_1_s8_x_untied: ++** movprfx z0, z1 ++** and z0\.b, z0\.b, #0x1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_1_s8_x_untied, svint8_t, ++ z0 = svand_n_s8_x (p0, z1, 1), ++ z0 = svand_x (p0, z1, 1)) ++ ++/* ++** and_127_s8_x: ++** and z0\.b, z0\.b, #0x7f ++** ret ++*/ ++TEST_UNIFORM_Z (and_127_s8_x, svint8_t, ++ z0 = svand_n_s8_x (p0, z0, 127), ++ z0 = svand_x (p0, z0, 127)) ++ ++/* ++** and_128_s8_x: ++** and z0\.b, z0\.b, #0x80 ++** ret ++*/ ++TEST_UNIFORM_Z (and_128_s8_x, svint8_t, ++ z0 = svand_n_s8_x (p0, z0, 128), ++ z0 = svand_x (p0, z0, 128)) ++ ++/* ++** and_255_s8_x: ++** ret ++*/ ++TEST_UNIFORM_Z (and_255_s8_x, svint8_t, ++ z0 = svand_n_s8_x (p0, z0, 255), ++ z0 = svand_x (p0, z0, 255)) ++ ++/* ++** and_m127_s8_x: ++** and z0\.b, z0\.b, #0x81 ++** ret ++*/ ++TEST_UNIFORM_Z (and_m127_s8_x, svint8_t, ++ z0 = svand_n_s8_x (p0, z0, -127), ++ z0 = svand_x (p0, z0, -127)) ++ ++/* ++** and_m128_s8_x: ++** and z0\.b, z0\.b, #0x80 ++** ret ++*/ ++TEST_UNIFORM_Z (and_m128_s8_x, svint8_t, ++ z0 = svand_n_s8_x (p0, z0, -128), ++ z0 = svand_x (p0, z0, -128)) ++ ++/* ++** and_5_s8_x: ++** mov (z[0-9]+)\.b, #5 ++** and z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (and_5_s8_x, svint8_t, ++ z0 = svand_n_s8_x (p0, z0, 5), ++ z0 = svand_x (p0, z0, 5)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_u16.c +new file mode 100644 +index 000000000..875a08d71 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_u16.c +@@ -0,0 +1,422 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** and_u16_m_tied1: ++** and z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (and_u16_m_tied1, svuint16_t, ++ z0 = svand_u16_m (p0, z0, z1), ++ z0 = svand_m (p0, z0, z1)) ++ ++/* ++** and_u16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** and z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (and_u16_m_tied2, svuint16_t, ++ z0 = svand_u16_m (p0, z1, z0), ++ z0 = svand_m (p0, z1, z0)) ++ ++/* ++** and_u16_m_untied: ++** movprfx z0, z1 ++** and z0\.h, p0/m, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (and_u16_m_untied, svuint16_t, ++ z0 = svand_u16_m (p0, z1, z2), ++ z0 = svand_m (p0, z1, z2)) ++ ++/* ++** and_w0_u16_m_tied1: ++** mov (z[0-9]+\.h), w0 ++** and z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (and_w0_u16_m_tied1, svuint16_t, uint16_t, ++ z0 = svand_n_u16_m (p0, z0, x0), ++ z0 = svand_m (p0, z0, x0)) ++ ++/* ++** and_w0_u16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), w0 ++** movprfx z0, z1 ++** and z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (and_w0_u16_m_untied, svuint16_t, uint16_t, ++ z0 = svand_n_u16_m (p0, z1, x0), ++ z0 = svand_m (p0, z1, x0)) ++ ++/* ++** and_1_u16_m_tied1: ++** mov (z[0-9]+\.h), #1 ++** and z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_1_u16_m_tied1, svuint16_t, ++ z0 = svand_n_u16_m (p0, z0, 1), ++ z0 = svand_m (p0, z0, 1)) ++ ++/* ++** and_1_u16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), #1 ++** movprfx z0, z1 ++** and z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_1_u16_m_untied, svuint16_t, 
++ z0 = svand_n_u16_m (p0, z1, 1), ++ z0 = svand_m (p0, z1, 1)) ++ ++/* ++** and_m2_u16_m: ++** mov (z[0-9]+\.h), #-2 ++** and z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_m2_u16_m, svuint16_t, ++ z0 = svand_n_u16_m (p0, z0, -2), ++ z0 = svand_m (p0, z0, -2)) ++ ++/* ++** and_255_u16_m_tied1: ++** uxtb z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (and_255_u16_m_tied1, svuint16_t, ++ z0 = svand_n_u16_m (p0, z0, 255), ++ z0 = svand_m (p0, z0, 255)) ++ ++/* ++** and_255_u16_m_untied: ++** movprfx z0, z1 ++** uxtb z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (and_255_u16_m_untied, svuint16_t, ++ z0 = svand_n_u16_m (p0, z1, 255), ++ z0 = svand_m (p0, z1, 255)) ++ ++/* ++** and_u16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** and z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (and_u16_z_tied1, svuint16_t, ++ z0 = svand_u16_z (p0, z0, z1), ++ z0 = svand_z (p0, z0, z1)) ++ ++/* ++** and_u16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** and z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (and_u16_z_tied2, svuint16_t, ++ z0 = svand_u16_z (p0, z1, z0), ++ z0 = svand_z (p0, z1, z0)) ++ ++/* ++** and_u16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** and z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** and z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (and_u16_z_untied, svuint16_t, ++ z0 = svand_u16_z (p0, z1, z2), ++ z0 = svand_z (p0, z1, z2)) ++ ++/* ++** and_w0_u16_z_tied1: ++** mov (z[0-9]+\.h), w0 ++** movprfx z0\.h, p0/z, z0\.h ++** and z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (and_w0_u16_z_tied1, svuint16_t, uint16_t, ++ z0 = svand_n_u16_z (p0, z0, x0), ++ z0 = svand_z (p0, z0, x0)) ++ ++/* ++** and_w0_u16_z_untied: ++** mov (z[0-9]+\.h), w0 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** and z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** and z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (and_w0_u16_z_untied, svuint16_t, uint16_t, ++ z0 = svand_n_u16_z (p0, z1, x0), ++ z0 = svand_z (p0, z1, x0)) ++ ++/* ++** and_1_u16_z_tied1: ++** mov (z[0-9]+\.h), #1 ++** movprfx z0\.h, p0/z, z0\.h ++** and z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_1_u16_z_tied1, svuint16_t, ++ z0 = svand_n_u16_z (p0, z0, 1), ++ z0 = svand_z (p0, z0, 1)) ++ ++/* ++** and_1_u16_z_untied: ++** mov (z[0-9]+\.h), #1 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** and z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** and z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (and_1_u16_z_untied, svuint16_t, ++ z0 = svand_n_u16_z (p0, z1, 1), ++ z0 = svand_z (p0, z1, 1)) ++ ++/* ++** and_255_u16_z_tied1: ++** ( ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.h, p0/z, \1\.h ++** uxtb z0\.h, p0/m, \1\.h ++** | ++** mov (z[0-9]+\.h), #255 ++** movprfx z0\.h, p0/z, z0\.h ++** and z0\.h, p0/m, z0\.h, \1 ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (and_255_u16_z_tied1, svuint16_t, ++ z0 = svand_n_u16_z (p0, z0, 255), ++ z0 = svand_z (p0, z0, 255)) ++ ++/* ++** and_255_u16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** uxtb z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (and_255_u16_z_untied, svuint16_t, ++ z0 = svand_n_u16_z (p0, z1, 255), ++ z0 = svand_z (p0, z1, 255)) ++ ++/* ++** and_u16_x_tied1: ++** and z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (and_u16_x_tied1, svuint16_t, ++ z0 = svand_u16_x (p0, z0, z1), ++ z0 = svand_x (p0, z0, z1)) ++ ++/* ++** and_u16_x_tied2: ++** and z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z 
(and_u16_x_tied2, svuint16_t, ++ z0 = svand_u16_x (p0, z1, z0), ++ z0 = svand_x (p0, z1, z0)) ++ ++/* ++** and_u16_x_untied: ++** and z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (and_u16_x_untied, svuint16_t, ++ z0 = svand_u16_x (p0, z1, z2), ++ z0 = svand_x (p0, z1, z2)) ++ ++/* ++** and_w0_u16_x_tied1: ++** mov (z[0-9]+)\.h, w0 ++** and z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (and_w0_u16_x_tied1, svuint16_t, uint16_t, ++ z0 = svand_n_u16_x (p0, z0, x0), ++ z0 = svand_x (p0, z0, x0)) ++ ++/* ++** and_w0_u16_x_untied: ++** mov (z[0-9]+)\.h, w0 ++** and z0\.d, (z1\.d, \1\.d|\1\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (and_w0_u16_x_untied, svuint16_t, uint16_t, ++ z0 = svand_n_u16_x (p0, z1, x0), ++ z0 = svand_x (p0, z1, x0)) ++ ++/* ++** and_1_u16_x_tied1: ++** and z0\.h, z0\.h, #0x1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_1_u16_x_tied1, svuint16_t, ++ z0 = svand_n_u16_x (p0, z0, 1), ++ z0 = svand_x (p0, z0, 1)) ++ ++/* ++** and_1_u16_x_untied: ++** movprfx z0, z1 ++** and z0\.h, z0\.h, #0x1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_1_u16_x_untied, svuint16_t, ++ z0 = svand_n_u16_x (p0, z1, 1), ++ z0 = svand_x (p0, z1, 1)) ++ ++/* ++** and_127_u16_x: ++** and z0\.h, z0\.h, #0x7f ++** ret ++*/ ++TEST_UNIFORM_Z (and_127_u16_x, svuint16_t, ++ z0 = svand_n_u16_x (p0, z0, 127), ++ z0 = svand_x (p0, z0, 127)) ++ ++/* ++** and_128_u16_x: ++** and z0\.h, z0\.h, #0x80 ++** ret ++*/ ++TEST_UNIFORM_Z (and_128_u16_x, svuint16_t, ++ z0 = svand_n_u16_x (p0, z0, 128), ++ z0 = svand_x (p0, z0, 128)) ++ ++/* ++** and_255_u16_x: ++** and z0\.h, z0\.h, #0xff ++** ret ++*/ ++TEST_UNIFORM_Z (and_255_u16_x, svuint16_t, ++ z0 = svand_n_u16_x (p0, z0, 255), ++ z0 = svand_x (p0, z0, 255)) ++ ++/* ++** and_256_u16_x: ++** and z0\.h, z0\.h, #0x100 ++** ret ++*/ ++TEST_UNIFORM_Z (and_256_u16_x, svuint16_t, ++ z0 = svand_n_u16_x (p0, z0, 256), ++ z0 = svand_x (p0, z0, 256)) ++ ++/* ++** and_257_u16_x: ++** and z0\.h, z0\.h, #0x101 ++** ret ++*/ ++TEST_UNIFORM_Z (and_257_u16_x, svuint16_t, ++ z0 = svand_n_u16_x (p0, z0, 257), ++ z0 = svand_x (p0, z0, 257)) ++ ++/* ++** and_512_u16_x: ++** and z0\.h, z0\.h, #0x200 ++** ret ++*/ ++TEST_UNIFORM_Z (and_512_u16_x, svuint16_t, ++ z0 = svand_n_u16_x (p0, z0, 512), ++ z0 = svand_x (p0, z0, 512)) ++ ++/* ++** and_65280_u16_x: ++** and z0\.h, z0\.h, #0xff00 ++** ret ++*/ ++TEST_UNIFORM_Z (and_65280_u16_x, svuint16_t, ++ z0 = svand_n_u16_x (p0, z0, 0xff00), ++ z0 = svand_x (p0, z0, 0xff00)) ++ ++/* ++** and_m127_u16_x: ++** and z0\.h, z0\.h, #0xff81 ++** ret ++*/ ++TEST_UNIFORM_Z (and_m127_u16_x, svuint16_t, ++ z0 = svand_n_u16_x (p0, z0, -127), ++ z0 = svand_x (p0, z0, -127)) ++ ++/* ++** and_m128_u16_x: ++** and z0\.h, z0\.h, #0xff80 ++** ret ++*/ ++TEST_UNIFORM_Z (and_m128_u16_x, svuint16_t, ++ z0 = svand_n_u16_x (p0, z0, -128), ++ z0 = svand_x (p0, z0, -128)) ++ ++/* ++** and_m255_u16_x: ++** and z0\.h, z0\.h, #0xff01 ++** ret ++*/ ++TEST_UNIFORM_Z (and_m255_u16_x, svuint16_t, ++ z0 = svand_n_u16_x (p0, z0, -255), ++ z0 = svand_x (p0, z0, -255)) ++ ++/* ++** and_m256_u16_x: ++** and z0\.h, z0\.h, #0xff00 ++** ret ++*/ ++TEST_UNIFORM_Z (and_m256_u16_x, svuint16_t, ++ z0 = svand_n_u16_x (p0, z0, -256), ++ z0 = svand_x (p0, z0, -256)) ++ ++/* ++** and_m257_u16_x: ++** and z0\.h, z0\.h, #0xfeff ++** ret ++*/ ++TEST_UNIFORM_Z (and_m257_u16_x, svuint16_t, ++ z0 = svand_n_u16_x (p0, z0, -257), ++ z0 = svand_x (p0, z0, -257)) ++ ++/* ++** and_m512_u16_x: ++** and z0\.h, z0\.h, #0xfe00 ++** ret ++*/ ++TEST_UNIFORM_Z (and_m512_u16_x, svuint16_t, 
++ z0 = svand_n_u16_x (p0, z0, -512), ++ z0 = svand_x (p0, z0, -512)) ++ ++/* ++** and_m32768_u16_x: ++** and z0\.h, z0\.h, #0x8000 ++** ret ++*/ ++TEST_UNIFORM_Z (and_m32768_u16_x, svuint16_t, ++ z0 = svand_n_u16_x (p0, z0, -0x8000), ++ z0 = svand_x (p0, z0, -0x8000)) ++ ++/* ++** and_5_u16_x: ++** mov (z[0-9]+)\.h, #5 ++** and z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (and_5_u16_x, svuint16_t, ++ z0 = svand_n_u16_x (p0, z0, 5), ++ z0 = svand_x (p0, z0, 5)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_u32.c +new file mode 100644 +index 000000000..80ff50396 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_u32.c +@@ -0,0 +1,464 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** and_u32_m_tied1: ++** and z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (and_u32_m_tied1, svuint32_t, ++ z0 = svand_u32_m (p0, z0, z1), ++ z0 = svand_m (p0, z0, z1)) ++ ++/* ++** and_u32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** and z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (and_u32_m_tied2, svuint32_t, ++ z0 = svand_u32_m (p0, z1, z0), ++ z0 = svand_m (p0, z1, z0)) ++ ++/* ++** and_u32_m_untied: ++** movprfx z0, z1 ++** and z0\.s, p0/m, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (and_u32_m_untied, svuint32_t, ++ z0 = svand_u32_m (p0, z1, z2), ++ z0 = svand_m (p0, z1, z2)) ++ ++/* ++** and_w0_u32_m_tied1: ++** mov (z[0-9]+\.s), w0 ++** and z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (and_w0_u32_m_tied1, svuint32_t, uint32_t, ++ z0 = svand_n_u32_m (p0, z0, x0), ++ z0 = svand_m (p0, z0, x0)) ++ ++/* ++** and_w0_u32_m_untied: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0, z1 ++** and z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (and_w0_u32_m_untied, svuint32_t, uint32_t, ++ z0 = svand_n_u32_m (p0, z1, x0), ++ z0 = svand_m (p0, z1, x0)) ++ ++/* ++** and_1_u32_m_tied1: ++** mov (z[0-9]+\.s), #1 ++** and z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_1_u32_m_tied1, svuint32_t, ++ z0 = svand_n_u32_m (p0, z0, 1), ++ z0 = svand_m (p0, z0, 1)) ++ ++/* ++** and_1_u32_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.s), #1 ++** movprfx z0, z1 ++** and z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_1_u32_m_untied, svuint32_t, ++ z0 = svand_n_u32_m (p0, z1, 1), ++ z0 = svand_m (p0, z1, 1)) ++ ++/* ++** and_m2_u32_m: ++** mov (z[0-9]+\.s), #-2 ++** and z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_m2_u32_m, svuint32_t, ++ z0 = svand_n_u32_m (p0, z0, -2), ++ z0 = svand_m (p0, z0, -2)) ++ ++/* ++** and_255_u32_m_tied1: ++** uxtb z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (and_255_u32_m_tied1, svuint32_t, ++ z0 = svand_n_u32_m (p0, z0, 255), ++ z0 = svand_m (p0, z0, 255)) ++ ++/* ++** and_255_u32_m_untied: ++** movprfx z0, z1 ++** uxtb z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (and_255_u32_m_untied, svuint32_t, ++ z0 = svand_n_u32_m (p0, z1, 255), ++ z0 = svand_m (p0, z1, 255)) ++ ++/* ++** and_65535_u32_m_tied1: ++** uxth z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (and_65535_u32_m_tied1, svuint32_t, ++ z0 = svand_n_u32_m (p0, z0, 65535), ++ z0 = svand_m (p0, z0, 65535)) ++ ++/* ++** and_65535_u32_m_untied: ++** movprfx z0, z1 ++** uxth z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (and_65535_u32_m_untied, svuint32_t, ++ z0 = svand_n_u32_m (p0, z1, 65535), ++ z0 = svand_m (p0, z1, 65535)) ++ ++/* ++** and_u32_z_tied1: ++** 
movprfx z0\.s, p0/z, z0\.s ++** and z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (and_u32_z_tied1, svuint32_t, ++ z0 = svand_u32_z (p0, z0, z1), ++ z0 = svand_z (p0, z0, z1)) ++ ++/* ++** and_u32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** and z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (and_u32_z_tied2, svuint32_t, ++ z0 = svand_u32_z (p0, z1, z0), ++ z0 = svand_z (p0, z1, z0)) ++ ++/* ++** and_u32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** and z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** and z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (and_u32_z_untied, svuint32_t, ++ z0 = svand_u32_z (p0, z1, z2), ++ z0 = svand_z (p0, z1, z2)) ++ ++/* ++** and_w0_u32_z_tied1: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0\.s, p0/z, z0\.s ++** and z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (and_w0_u32_z_tied1, svuint32_t, uint32_t, ++ z0 = svand_n_u32_z (p0, z0, x0), ++ z0 = svand_z (p0, z0, x0)) ++ ++/* ++** and_w0_u32_z_untied: ++** mov (z[0-9]+\.s), w0 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** and z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** and z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (and_w0_u32_z_untied, svuint32_t, uint32_t, ++ z0 = svand_n_u32_z (p0, z1, x0), ++ z0 = svand_z (p0, z1, x0)) ++ ++/* ++** and_1_u32_z_tied1: ++** mov (z[0-9]+\.s), #1 ++** movprfx z0\.s, p0/z, z0\.s ++** and z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_1_u32_z_tied1, svuint32_t, ++ z0 = svand_n_u32_z (p0, z0, 1), ++ z0 = svand_z (p0, z0, 1)) ++ ++/* ++** and_1_u32_z_untied: ++** mov (z[0-9]+\.s), #1 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** and z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** and z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (and_1_u32_z_untied, svuint32_t, ++ z0 = svand_n_u32_z (p0, z1, 1), ++ z0 = svand_z (p0, z1, 1)) ++ ++/* ++** and_255_u32_z_tied1: ++** ( ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, \1\.s ++** uxtb z0\.s, p0/m, \1\.s ++** | ++** mov (z[0-9]+\.s), #255 ++** movprfx z0\.s, p0/z, z0\.s ++** and z0\.s, p0/m, z0\.s, \1 ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (and_255_u32_z_tied1, svuint32_t, ++ z0 = svand_n_u32_z (p0, z0, 255), ++ z0 = svand_z (p0, z0, 255)) ++ ++/* ++** and_255_u32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** uxtb z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (and_255_u32_z_untied, svuint32_t, ++ z0 = svand_n_u32_z (p0, z1, 255), ++ z0 = svand_z (p0, z1, 255)) ++ ++/* ++** and_65535_u32_z_tied1: ++** ( ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, \1\.s ++** uxth z0\.s, p0/m, \1\.s ++** | ++** mov (z[0-9]+\.s), #65535 ++** movprfx z0\.s, p0/z, z0\.s ++** and z0\.s, p0/m, z0\.s, \1 ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (and_65535_u32_z_tied1, svuint32_t, ++ z0 = svand_n_u32_z (p0, z0, 65535), ++ z0 = svand_z (p0, z0, 65535)) ++ ++/* ++** and_65535_u32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** uxth z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (and_65535_u32_z_untied, svuint32_t, ++ z0 = svand_n_u32_z (p0, z1, 65535), ++ z0 = svand_z (p0, z1, 65535)) ++ ++/* ++** and_u32_x_tied1: ++** and z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (and_u32_x_tied1, svuint32_t, ++ z0 = svand_u32_x (p0, z0, z1), ++ z0 = svand_x (p0, z0, z1)) ++ ++/* ++** and_u32_x_tied2: ++** and z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (and_u32_x_tied2, svuint32_t, ++ z0 = svand_u32_x (p0, z1, z0), ++ z0 = svand_x (p0, z1, z0)) ++ ++/* ++** 
and_u32_x_untied: ++** and z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (and_u32_x_untied, svuint32_t, ++ z0 = svand_u32_x (p0, z1, z2), ++ z0 = svand_x (p0, z1, z2)) ++ ++/* ++** and_w0_u32_x_tied1: ++** mov (z[0-9]+)\.s, w0 ++** and z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (and_w0_u32_x_tied1, svuint32_t, uint32_t, ++ z0 = svand_n_u32_x (p0, z0, x0), ++ z0 = svand_x (p0, z0, x0)) ++ ++/* ++** and_w0_u32_x_untied: ++** mov (z[0-9]+)\.s, w0 ++** and z0\.d, (z1\.d, \1\.d|\1\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (and_w0_u32_x_untied, svuint32_t, uint32_t, ++ z0 = svand_n_u32_x (p0, z1, x0), ++ z0 = svand_x (p0, z1, x0)) ++ ++/* ++** and_1_u32_x_tied1: ++** and z0\.s, z0\.s, #0x1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_1_u32_x_tied1, svuint32_t, ++ z0 = svand_n_u32_x (p0, z0, 1), ++ z0 = svand_x (p0, z0, 1)) ++ ++/* ++** and_1_u32_x_untied: ++** movprfx z0, z1 ++** and z0\.s, z0\.s, #0x1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_1_u32_x_untied, svuint32_t, ++ z0 = svand_n_u32_x (p0, z1, 1), ++ z0 = svand_x (p0, z1, 1)) ++ ++/* ++** and_127_u32_x: ++** and z0\.s, z0\.s, #0x7f ++** ret ++*/ ++TEST_UNIFORM_Z (and_127_u32_x, svuint32_t, ++ z0 = svand_n_u32_x (p0, z0, 127), ++ z0 = svand_x (p0, z0, 127)) ++ ++/* ++** and_128_u32_x: ++** and z0\.s, z0\.s, #0x80 ++** ret ++*/ ++TEST_UNIFORM_Z (and_128_u32_x, svuint32_t, ++ z0 = svand_n_u32_x (p0, z0, 128), ++ z0 = svand_x (p0, z0, 128)) ++ ++/* ++** and_255_u32_x: ++** and z0\.s, z0\.s, #0xff ++** ret ++*/ ++TEST_UNIFORM_Z (and_255_u32_x, svuint32_t, ++ z0 = svand_n_u32_x (p0, z0, 255), ++ z0 = svand_x (p0, z0, 255)) ++ ++/* ++** and_256_u32_x: ++** and z0\.s, z0\.s, #0x100 ++** ret ++*/ ++TEST_UNIFORM_Z (and_256_u32_x, svuint32_t, ++ z0 = svand_n_u32_x (p0, z0, 256), ++ z0 = svand_x (p0, z0, 256)) ++ ++/* TODO: Bad code and needs fixing. 
*/ ++TEST_UNIFORM_Z (and_257_u32_x, svuint32_t, ++ z0 = svand_n_u32_x (p0, z0, 257), ++ z0 = svand_x (p0, z0, 257)) ++ ++/* ++** and_512_u32_x: ++** and z0\.s, z0\.s, #0x200 ++** ret ++*/ ++TEST_UNIFORM_Z (and_512_u32_x, svuint32_t, ++ z0 = svand_n_u32_x (p0, z0, 512), ++ z0 = svand_x (p0, z0, 512)) ++ ++/* ++** and_65280_u32_x: ++** and z0\.s, z0\.s, #0xff00 ++** ret ++*/ ++TEST_UNIFORM_Z (and_65280_u32_x, svuint32_t, ++ z0 = svand_n_u32_x (p0, z0, 0xff00), ++ z0 = svand_x (p0, z0, 0xff00)) ++ ++/* ++** and_m127_u32_x: ++** and z0\.s, z0\.s, #0xffffff81 ++** ret ++*/ ++TEST_UNIFORM_Z (and_m127_u32_x, svuint32_t, ++ z0 = svand_n_u32_x (p0, z0, -127), ++ z0 = svand_x (p0, z0, -127)) ++ ++/* ++** and_m128_u32_x: ++** and z0\.s, z0\.s, #0xffffff80 ++** ret ++*/ ++TEST_UNIFORM_Z (and_m128_u32_x, svuint32_t, ++ z0 = svand_n_u32_x (p0, z0, -128), ++ z0 = svand_x (p0, z0, -128)) ++ ++/* ++** and_m255_u32_x: ++** and z0\.s, z0\.s, #0xffffff01 ++** ret ++*/ ++TEST_UNIFORM_Z (and_m255_u32_x, svuint32_t, ++ z0 = svand_n_u32_x (p0, z0, -255), ++ z0 = svand_x (p0, z0, -255)) ++ ++/* ++** and_m256_u32_x: ++** and z0\.s, z0\.s, #0xffffff00 ++** ret ++*/ ++TEST_UNIFORM_Z (and_m256_u32_x, svuint32_t, ++ z0 = svand_n_u32_x (p0, z0, -256), ++ z0 = svand_x (p0, z0, -256)) ++ ++/* ++** and_m257_u32_x: ++** and z0\.s, z0\.s, #0xfffffeff ++** ret ++*/ ++TEST_UNIFORM_Z (and_m257_u32_x, svuint32_t, ++ z0 = svand_n_u32_x (p0, z0, -257), ++ z0 = svand_x (p0, z0, -257)) ++ ++/* ++** and_m512_u32_x: ++** and z0\.s, z0\.s, #0xfffffe00 ++** ret ++*/ ++TEST_UNIFORM_Z (and_m512_u32_x, svuint32_t, ++ z0 = svand_n_u32_x (p0, z0, -512), ++ z0 = svand_x (p0, z0, -512)) ++ ++/* ++** and_m32768_u32_x: ++** and z0\.s, z0\.s, #0xffff8000 ++** ret ++*/ ++TEST_UNIFORM_Z (and_m32768_u32_x, svuint32_t, ++ z0 = svand_n_u32_x (p0, z0, -0x8000), ++ z0 = svand_x (p0, z0, -0x8000)) ++ ++/* ++** and_5_u32_x: ++** mov (z[0-9]+)\.s, #5 ++** and z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (and_5_u32_x, svuint32_t, ++ z0 = svand_n_u32_x (p0, z0, 5), ++ z0 = svand_x (p0, z0, 5)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_u64.c +new file mode 100644 +index 000000000..906b19c37 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_u64.c +@@ -0,0 +1,510 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** and_u64_m_tied1: ++** and z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (and_u64_m_tied1, svuint64_t, ++ z0 = svand_u64_m (p0, z0, z1), ++ z0 = svand_m (p0, z0, z1)) ++ ++/* ++** and_u64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** and z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_u64_m_tied2, svuint64_t, ++ z0 = svand_u64_m (p0, z1, z0), ++ z0 = svand_m (p0, z1, z0)) ++ ++/* ++** and_u64_m_untied: ++** movprfx z0, z1 ++** and z0\.d, p0/m, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (and_u64_m_untied, svuint64_t, ++ z0 = svand_u64_m (p0, z1, z2), ++ z0 = svand_m (p0, z1, z2)) ++ ++/* ++** and_x0_u64_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** and z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (and_x0_u64_m_tied1, svuint64_t, uint64_t, ++ z0 = svand_n_u64_m (p0, z0, x0), ++ z0 = svand_m (p0, z0, x0)) ++ ++/* ++** and_x0_u64_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** and z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (and_x0_u64_m_untied, svuint64_t, uint64_t, ++ z0 = svand_n_u64_m (p0, z1, x0), ++ 
z0 = svand_m (p0, z1, x0)) ++ ++/* ++** and_1_u64_m_tied1: ++** mov (z[0-9]+\.d), #1 ++** and z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_1_u64_m_tied1, svuint64_t, ++ z0 = svand_n_u64_m (p0, z0, 1), ++ z0 = svand_m (p0, z0, 1)) ++ ++/* ++** and_1_u64_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #1 ++** movprfx z0, z1 ++** and z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_1_u64_m_untied, svuint64_t, ++ z0 = svand_n_u64_m (p0, z1, 1), ++ z0 = svand_m (p0, z1, 1)) ++ ++/* ++** and_m2_u64_m: ++** mov (z[0-9]+\.d), #-2 ++** and z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_m2_u64_m, svuint64_t, ++ z0 = svand_n_u64_m (p0, z0, -2), ++ z0 = svand_m (p0, z0, -2)) ++ ++/* ++** and_255_u64_m_tied1: ++** uxtb z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (and_255_u64_m_tied1, svuint64_t, ++ z0 = svand_n_u64_m (p0, z0, 255), ++ z0 = svand_m (p0, z0, 255)) ++ ++/* ++** and_255_u64_m_untied: ++** movprfx z0, z1 ++** uxtb z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (and_255_u64_m_untied, svuint64_t, ++ z0 = svand_n_u64_m (p0, z1, 255), ++ z0 = svand_m (p0, z1, 255)) ++ ++/* ++** and_65535_u64_m_tied1: ++** uxth z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (and_65535_u64_m_tied1, svuint64_t, ++ z0 = svand_n_u64_m (p0, z0, 65535), ++ z0 = svand_m (p0, z0, 65535)) ++ ++/* ++** and_65535_u64_m_untied: ++** movprfx z0, z1 ++** uxth z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (and_65535_u64_m_untied, svuint64_t, ++ z0 = svand_n_u64_m (p0, z1, 65535), ++ z0 = svand_m (p0, z1, 65535)) ++ ++/* ++** and_0xffffffff_u64_m_tied1: ++** uxtw z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (and_0xffffffff_u64_m_tied1, svuint64_t, ++ z0 = svand_n_u64_m (p0, z0, 0xffffffff), ++ z0 = svand_m (p0, z0, 0xffffffff)) ++ ++/* ++** and_0xffffffff_u64_m_untied: ++** movprfx z0, z1 ++** uxtw z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (and_0xffffffff_u64_m_untied, svuint64_t, ++ z0 = svand_n_u64_m (p0, z1, 0xffffffff), ++ z0 = svand_m (p0, z1, 0xffffffff)) ++ ++/* ++** and_u64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** and z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (and_u64_z_tied1, svuint64_t, ++ z0 = svand_u64_z (p0, z0, z1), ++ z0 = svand_z (p0, z0, z1)) ++ ++/* ++** and_u64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** and z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (and_u64_z_tied2, svuint64_t, ++ z0 = svand_u64_z (p0, z1, z0), ++ z0 = svand_z (p0, z1, z0)) ++ ++/* ++** and_u64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** and z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** and z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (and_u64_z_untied, svuint64_t, ++ z0 = svand_u64_z (p0, z1, z2), ++ z0 = svand_z (p0, z1, z2)) ++ ++/* ++** and_x0_u64_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z0\.d ++** and z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (and_x0_u64_z_tied1, svuint64_t, uint64_t, ++ z0 = svand_n_u64_z (p0, z0, x0), ++ z0 = svand_z (p0, z0, x0)) ++ ++/* ++** and_x0_u64_z_untied: ++** mov (z[0-9]+\.d), x0 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** and z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** and z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (and_x0_u64_z_untied, svuint64_t, uint64_t, ++ z0 = svand_n_u64_z (p0, z1, x0), ++ z0 = svand_z (p0, z1, x0)) ++ ++/* ++** and_1_u64_z_tied1: ++** mov (z[0-9]+\.d), #1 ++** movprfx z0\.d, p0/z, z0\.d ++** and z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z 
(and_1_u64_z_tied1, svuint64_t, ++ z0 = svand_n_u64_z (p0, z0, 1), ++ z0 = svand_z (p0, z0, 1)) ++ ++/* ++** and_1_u64_z_untied: ++** mov (z[0-9]+\.d), #1 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** and z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** and z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (and_1_u64_z_untied, svuint64_t, ++ z0 = svand_n_u64_z (p0, z1, 1), ++ z0 = svand_z (p0, z1, 1)) ++ ++/* ++** and_255_u64_z_tied1: ++** ( ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** uxtb z0\.d, p0/m, \1 ++** | ++** mov (z[0-9]+\.d), #255 ++** movprfx z0\.d, p0/z, z0\.d ++** and z0\.d, p0/m, z0\.d, \1 ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (and_255_u64_z_tied1, svuint64_t, ++ z0 = svand_n_u64_z (p0, z0, 255), ++ z0 = svand_z (p0, z0, 255)) ++ ++/* ++** and_255_u64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** uxtb z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (and_255_u64_z_untied, svuint64_t, ++ z0 = svand_n_u64_z (p0, z1, 255), ++ z0 = svand_z (p0, z1, 255)) ++ ++/* ++** and_65535_u64_z_tied1: ++** ( ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** uxth z0\.d, p0/m, \1 ++** | ++** mov (z[0-9]+\.d), #65535 ++** movprfx z0\.d, p0/z, z0\.d ++** and z0\.d, p0/m, z0\.d, \1 ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (and_65535_u64_z_tied1, svuint64_t, ++ z0 = svand_n_u64_z (p0, z0, 65535), ++ z0 = svand_z (p0, z0, 65535)) ++ ++/* ++** and_65535_u64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** uxth z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (and_65535_u64_z_untied, svuint64_t, ++ z0 = svand_n_u64_z (p0, z1, 65535), ++ z0 = svand_z (p0, z1, 65535)) ++ ++/* ++** and_0xffffffff_u64_z_tied1: ++** ( ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** uxtw z0\.d, p0/m, \1 ++** | ++** mov (z[0-9]+\.d), #4294967295 ++** movprfx z0\.d, p0/z, z0\.d ++** and z0\.d, p0/m, z0\.d, \1 ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (and_0xffffffff_u64_z_tied1, svuint64_t, ++ z0 = svand_n_u64_z (p0, z0, 0xffffffff), ++ z0 = svand_z (p0, z0, 0xffffffff)) ++ ++/* ++** and_0xffffffff_u64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** uxtw z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (and_0xffffffff_u64_z_untied, svuint64_t, ++ z0 = svand_n_u64_z (p0, z1, 0xffffffff), ++ z0 = svand_z (p0, z1, 0xffffffff)) ++ ++/* ++** and_u64_x_tied1: ++** and z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (and_u64_x_tied1, svuint64_t, ++ z0 = svand_u64_x (p0, z0, z1), ++ z0 = svand_x (p0, z0, z1)) ++ ++/* ++** and_u64_x_tied2: ++** and z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (and_u64_x_tied2, svuint64_t, ++ z0 = svand_u64_x (p0, z1, z0), ++ z0 = svand_x (p0, z1, z0)) ++ ++/* ++** and_u64_x_untied: ++** and z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (and_u64_x_untied, svuint64_t, ++ z0 = svand_u64_x (p0, z1, z2), ++ z0 = svand_x (p0, z1, z2)) ++ ++/* ++** and_x0_u64_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** and z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (and_x0_u64_x_tied1, svuint64_t, uint64_t, ++ z0 = svand_n_u64_x (p0, z0, x0), ++ z0 = svand_x (p0, z0, x0)) ++ ++/* ++** and_x0_u64_x_untied: ++** mov (z[0-9]+\.d), x0 ++** and z0\.d, (z1\.d, \1|\1, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (and_x0_u64_x_untied, svuint64_t, uint64_t, ++ z0 = svand_n_u64_x (p0, z1, x0), ++ z0 = svand_x (p0, z1, x0)) ++ ++/* ++** and_1_u64_x_tied1: ++** and z0\.d, z0\.d, #0x1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_1_u64_x_tied1, svuint64_t, ++ z0 = svand_n_u64_x (p0, z0, 1), ++ z0 = svand_x (p0, z0, 
1)) ++ ++/* ++** and_1_u64_x_untied: ++** movprfx z0, z1 ++** and z0\.d, z0\.d, #0x1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_1_u64_x_untied, svuint64_t, ++ z0 = svand_n_u64_x (p0, z1, 1), ++ z0 = svand_x (p0, z1, 1)) ++ ++/* ++** and_127_u64_x: ++** and z0\.d, z0\.d, #0x7f ++** ret ++*/ ++TEST_UNIFORM_Z (and_127_u64_x, svuint64_t, ++ z0 = svand_n_u64_x (p0, z0, 127), ++ z0 = svand_x (p0, z0, 127)) ++ ++/* ++** and_128_u64_x: ++** and z0\.d, z0\.d, #0x80 ++** ret ++*/ ++TEST_UNIFORM_Z (and_128_u64_x, svuint64_t, ++ z0 = svand_n_u64_x (p0, z0, 128), ++ z0 = svand_x (p0, z0, 128)) ++ ++/* ++** and_255_u64_x: ++** and z0\.d, z0\.d, #0xff ++** ret ++*/ ++TEST_UNIFORM_Z (and_255_u64_x, svuint64_t, ++ z0 = svand_n_u64_x (p0, z0, 255), ++ z0 = svand_x (p0, z0, 255)) ++ ++/* ++** and_256_u64_x: ++** and z0\.d, z0\.d, #0x100 ++** ret ++*/ ++TEST_UNIFORM_Z (and_256_u64_x, svuint64_t, ++ z0 = svand_n_u64_x (p0, z0, 256), ++ z0 = svand_x (p0, z0, 256)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (and_257_u64_x, svuint64_t, ++ z0 = svand_n_u64_x (p0, z0, 257), ++ z0 = svand_x (p0, z0, 257)) ++ ++/* ++** and_512_u64_x: ++** and z0\.d, z0\.d, #0x200 ++** ret ++*/ ++TEST_UNIFORM_Z (and_512_u64_x, svuint64_t, ++ z0 = svand_n_u64_x (p0, z0, 512), ++ z0 = svand_x (p0, z0, 512)) ++ ++/* ++** and_65280_u64_x: ++** and z0\.d, z0\.d, #0xff00 ++** ret ++*/ ++TEST_UNIFORM_Z (and_65280_u64_x, svuint64_t, ++ z0 = svand_n_u64_x (p0, z0, 0xff00), ++ z0 = svand_x (p0, z0, 0xff00)) ++ ++/* ++** and_m127_u64_x: ++** and z0\.d, z0\.d, #0xffffffffffffff81 ++** ret ++*/ ++TEST_UNIFORM_Z (and_m127_u64_x, svuint64_t, ++ z0 = svand_n_u64_x (p0, z0, -127), ++ z0 = svand_x (p0, z0, -127)) ++ ++/* ++** and_m128_u64_x: ++** and z0\.d, z0\.d, #0xffffffffffffff80 ++** ret ++*/ ++TEST_UNIFORM_Z (and_m128_u64_x, svuint64_t, ++ z0 = svand_n_u64_x (p0, z0, -128), ++ z0 = svand_x (p0, z0, -128)) ++ ++/* ++** and_m255_u64_x: ++** and z0\.d, z0\.d, #0xffffffffffffff01 ++** ret ++*/ ++TEST_UNIFORM_Z (and_m255_u64_x, svuint64_t, ++ z0 = svand_n_u64_x (p0, z0, -255), ++ z0 = svand_x (p0, z0, -255)) ++ ++/* ++** and_m256_u64_x: ++** and z0\.d, z0\.d, #0xffffffffffffff00 ++** ret ++*/ ++TEST_UNIFORM_Z (and_m256_u64_x, svuint64_t, ++ z0 = svand_n_u64_x (p0, z0, -256), ++ z0 = svand_x (p0, z0, -256)) ++ ++/* ++** and_m257_u64_x: ++** and z0\.d, z0\.d, #0xfffffffffffffeff ++** ret ++*/ ++TEST_UNIFORM_Z (and_m257_u64_x, svuint64_t, ++ z0 = svand_n_u64_x (p0, z0, -257), ++ z0 = svand_x (p0, z0, -257)) ++ ++/* ++** and_m512_u64_x: ++** and z0\.d, z0\.d, #0xfffffffffffffe00 ++** ret ++*/ ++TEST_UNIFORM_Z (and_m512_u64_x, svuint64_t, ++ z0 = svand_n_u64_x (p0, z0, -512), ++ z0 = svand_x (p0, z0, -512)) ++ ++/* ++** and_m32768_u64_x: ++** and z0\.d, z0\.d, #0xffffffffffff8000 ++** ret ++*/ ++TEST_UNIFORM_Z (and_m32768_u64_x, svuint64_t, ++ z0 = svand_n_u64_x (p0, z0, -0x8000), ++ z0 = svand_x (p0, z0, -0x8000)) ++ ++/* ++** and_5_u64_x: ++** mov (z[0-9]+\.d), #5 ++** and z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (and_5_u64_x, svuint64_t, ++ z0 = svand_n_u64_x (p0, z0, 5), ++ z0 = svand_x (p0, z0, 5)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_u8.c +new file mode 100644 +index 000000000..b0f1c9529 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_u8.c +@@ -0,0 +1,294 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** and_u8_m_tied1: ++** and z0\.b, p0/m, z0\.b, 
z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (and_u8_m_tied1, svuint8_t, ++ z0 = svand_u8_m (p0, z0, z1), ++ z0 = svand_m (p0, z0, z1)) ++ ++/* ++** and_u8_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** and z0\.b, p0/m, z0\.b, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (and_u8_m_tied2, svuint8_t, ++ z0 = svand_u8_m (p0, z1, z0), ++ z0 = svand_m (p0, z1, z0)) ++ ++/* ++** and_u8_m_untied: ++** movprfx z0, z1 ++** and z0\.b, p0/m, z0\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (and_u8_m_untied, svuint8_t, ++ z0 = svand_u8_m (p0, z1, z2), ++ z0 = svand_m (p0, z1, z2)) ++ ++/* ++** and_w0_u8_m_tied1: ++** mov (z[0-9]+\.b), w0 ++** and z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (and_w0_u8_m_tied1, svuint8_t, uint8_t, ++ z0 = svand_n_u8_m (p0, z0, x0), ++ z0 = svand_m (p0, z0, x0)) ++ ++/* ++** and_w0_u8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), w0 ++** movprfx z0, z1 ++** and z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (and_w0_u8_m_untied, svuint8_t, uint8_t, ++ z0 = svand_n_u8_m (p0, z1, x0), ++ z0 = svand_m (p0, z1, x0)) ++ ++/* ++** and_1_u8_m_tied1: ++** mov (z[0-9]+\.b), #1 ++** and z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_1_u8_m_tied1, svuint8_t, ++ z0 = svand_n_u8_m (p0, z0, 1), ++ z0 = svand_m (p0, z0, 1)) ++ ++/* ++** and_1_u8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), #1 ++** movprfx z0, z1 ++** and z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_1_u8_m_untied, svuint8_t, ++ z0 = svand_n_u8_m (p0, z1, 1), ++ z0 = svand_m (p0, z1, 1)) ++ ++/* ++** and_m2_u8_m: ++** mov (z[0-9]+\.b), #-2 ++** and z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_m2_u8_m, svuint8_t, ++ z0 = svand_n_u8_m (p0, z0, -2), ++ z0 = svand_m (p0, z0, -2)) ++ ++/* ++** and_u8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** and z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (and_u8_z_tied1, svuint8_t, ++ z0 = svand_u8_z (p0, z0, z1), ++ z0 = svand_z (p0, z0, z1)) ++ ++/* ++** and_u8_z_tied2: ++** movprfx z0\.b, p0/z, z0\.b ++** and z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (and_u8_z_tied2, svuint8_t, ++ z0 = svand_u8_z (p0, z1, z0), ++ z0 = svand_z (p0, z1, z0)) ++ ++/* ++** and_u8_z_untied: ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** and z0\.b, p0/m, z0\.b, z2\.b ++** | ++** movprfx z0\.b, p0/z, z2\.b ++** and z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (and_u8_z_untied, svuint8_t, ++ z0 = svand_u8_z (p0, z1, z2), ++ z0 = svand_z (p0, z1, z2)) ++ ++/* ++** and_w0_u8_z_tied1: ++** mov (z[0-9]+\.b), w0 ++** movprfx z0\.b, p0/z, z0\.b ++** and z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (and_w0_u8_z_tied1, svuint8_t, uint8_t, ++ z0 = svand_n_u8_z (p0, z0, x0), ++ z0 = svand_z (p0, z0, x0)) ++ ++/* ++** and_w0_u8_z_untied: ++** mov (z[0-9]+\.b), w0 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** and z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** and z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (and_w0_u8_z_untied, svuint8_t, uint8_t, ++ z0 = svand_n_u8_z (p0, z1, x0), ++ z0 = svand_z (p0, z1, x0)) ++ ++/* ++** and_1_u8_z_tied1: ++** mov (z[0-9]+\.b), #1 ++** movprfx z0\.b, p0/z, z0\.b ++** and z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_1_u8_z_tied1, svuint8_t, ++ z0 = svand_n_u8_z (p0, z0, 1), ++ z0 = svand_z (p0, z0, 1)) ++ ++/* ++** and_1_u8_z_untied: ++** mov (z[0-9]+\.b), #1 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** and z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** and z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ 
++TEST_UNIFORM_Z (and_1_u8_z_untied, svuint8_t, ++ z0 = svand_n_u8_z (p0, z1, 1), ++ z0 = svand_z (p0, z1, 1)) ++ ++/* ++** and_u8_x_tied1: ++** and z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (and_u8_x_tied1, svuint8_t, ++ z0 = svand_u8_x (p0, z0, z1), ++ z0 = svand_x (p0, z0, z1)) ++ ++/* ++** and_u8_x_tied2: ++** and z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (and_u8_x_tied2, svuint8_t, ++ z0 = svand_u8_x (p0, z1, z0), ++ z0 = svand_x (p0, z1, z0)) ++ ++/* ++** and_u8_x_untied: ++** and z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (and_u8_x_untied, svuint8_t, ++ z0 = svand_u8_x (p0, z1, z2), ++ z0 = svand_x (p0, z1, z2)) ++ ++/* ++** and_w0_u8_x_tied1: ++** mov (z[0-9]+)\.b, w0 ++** and z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (and_w0_u8_x_tied1, svuint8_t, uint8_t, ++ z0 = svand_n_u8_x (p0, z0, x0), ++ z0 = svand_x (p0, z0, x0)) ++ ++/* ++** and_w0_u8_x_untied: ++** mov (z[0-9]+)\.b, w0 ++** and z0\.d, (z1\.d, \1\.d|\1\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (and_w0_u8_x_untied, svuint8_t, uint8_t, ++ z0 = svand_n_u8_x (p0, z1, x0), ++ z0 = svand_x (p0, z1, x0)) ++ ++/* ++** and_1_u8_x_tied1: ++** and z0\.b, z0\.b, #0x1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_1_u8_x_tied1, svuint8_t, ++ z0 = svand_n_u8_x (p0, z0, 1), ++ z0 = svand_x (p0, z0, 1)) ++ ++/* ++** and_1_u8_x_untied: ++** movprfx z0, z1 ++** and z0\.b, z0\.b, #0x1 ++** ret ++*/ ++TEST_UNIFORM_Z (and_1_u8_x_untied, svuint8_t, ++ z0 = svand_n_u8_x (p0, z1, 1), ++ z0 = svand_x (p0, z1, 1)) ++ ++/* ++** and_127_u8_x: ++** and z0\.b, z0\.b, #0x7f ++** ret ++*/ ++TEST_UNIFORM_Z (and_127_u8_x, svuint8_t, ++ z0 = svand_n_u8_x (p0, z0, 127), ++ z0 = svand_x (p0, z0, 127)) ++ ++/* ++** and_128_u8_x: ++** and z0\.b, z0\.b, #0x80 ++** ret ++*/ ++TEST_UNIFORM_Z (and_128_u8_x, svuint8_t, ++ z0 = svand_n_u8_x (p0, z0, 128), ++ z0 = svand_x (p0, z0, 128)) ++ ++/* ++** and_255_u8_x: ++** ret ++*/ ++TEST_UNIFORM_Z (and_255_u8_x, svuint8_t, ++ z0 = svand_n_u8_x (p0, z0, 255), ++ z0 = svand_x (p0, z0, 255)) ++ ++/* ++** and_m127_u8_x: ++** and z0\.b, z0\.b, #0x81 ++** ret ++*/ ++TEST_UNIFORM_Z (and_m127_u8_x, svuint8_t, ++ z0 = svand_n_u8_x (p0, z0, -127), ++ z0 = svand_x (p0, z0, -127)) ++ ++/* ++** and_m128_u8_x: ++** and z0\.b, z0\.b, #0x80 ++** ret ++*/ ++TEST_UNIFORM_Z (and_m128_u8_x, svuint8_t, ++ z0 = svand_n_u8_x (p0, z0, -128), ++ z0 = svand_x (p0, z0, -128)) ++ ++/* ++** and_5_u8_x: ++** mov (z[0-9]+)\.b, #5 ++** and z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (and_5_u8_x, svuint8_t, ++ z0 = svand_n_u8_x (p0, z0, 5), ++ z0 = svand_x (p0, z0, 5)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/andv_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/andv_s16.c +new file mode 100644 +index 000000000..16761b823 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/andv_s16.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** andv_x0_s16: ++** andv h([0-9]+), p0, z0\.h ++** umov w0, v\1\.h\[0\] ++** ret ++*/ ++TEST_REDUCTION_X (andv_x0_s16, int16_t, svint16_t, ++ x0 = svandv_s16 (p0, z0), ++ x0 = svandv (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/andv_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/andv_s32.c +new file mode 100644 +index 000000000..bccc91e21 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/andv_s32.c +@@ -0,0 +1,13 @@ ++/* { dg-final { 
check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** andv_x0_s32: ++** andv (s[0-9]+), p0, z0\.s ++** fmov w0, \1 ++** ret ++*/ ++TEST_REDUCTION_X (andv_x0_s32, int32_t, svint32_t, ++ x0 = svandv_s32 (p0, z0), ++ x0 = svandv (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/andv_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/andv_s64.c +new file mode 100644 +index 000000000..53488b6e3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/andv_s64.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** andv_x0_s64: ++** andv (d[0-9]+), p0, z0\.d ++** fmov x0, \1 ++** ret ++*/ ++TEST_REDUCTION_X (andv_x0_s64, int64_t, svint64_t, ++ x0 = svandv_s64 (p0, z0), ++ x0 = svandv (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/andv_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/andv_s8.c +new file mode 100644 +index 000000000..052f74c7f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/andv_s8.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** andv_x0_s8: ++** andv b([0-9]+), p0, z0\.b ++** umov w0, v\1\.b\[0\] ++** ret ++*/ ++TEST_REDUCTION_X (andv_x0_s8, int8_t, svint8_t, ++ x0 = svandv_s8 (p0, z0), ++ x0 = svandv (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/andv_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/andv_u16.c +new file mode 100644 +index 000000000..03328022d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/andv_u16.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** andv_x0_u16: ++** andv h([0-9]+), p0, z0\.h ++** umov w0, v\1\.h\[0\] ++** ret ++*/ ++TEST_REDUCTION_X (andv_x0_u16, uint16_t, svuint16_t, ++ x0 = svandv_u16 (p0, z0), ++ x0 = svandv (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/andv_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/andv_u32.c +new file mode 100644 +index 000000000..a1677e703 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/andv_u32.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** andv_x0_u32: ++** andv (s[0-9]+), p0, z0\.s ++** fmov w0, \1 ++** ret ++*/ ++TEST_REDUCTION_X (andv_x0_u32, uint32_t, svuint32_t, ++ x0 = svandv_u32 (p0, z0), ++ x0 = svandv (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/andv_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/andv_u64.c +new file mode 100644 +index 000000000..d45422693 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/andv_u64.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** andv_x0_u64: ++** andv (d[0-9]+), p0, z0\.d ++** fmov x0, \1 ++** ret ++*/ ++TEST_REDUCTION_X (andv_x0_u64, uint64_t, svuint64_t, ++ x0 = svandv_u64 (p0, z0), ++ x0 = svandv (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/andv_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/andv_u8.c +new file mode 100644 +index 000000000..b07f6b6e6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/andv_u8.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** 
andv_x0_u8: ++** andv b([0-9]+), p0, z0\.b ++** umov w0, v\1\.b\[0\] ++** ret ++*/ ++TEST_REDUCTION_X (andv_x0_u8, uint8_t, svuint8_t, ++ x0 = svandv_u8 (p0, z0), ++ x0 = svandv (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asr_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asr_s16.c +new file mode 100644 +index 000000000..877bf1068 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asr_s16.c +@@ -0,0 +1,340 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** asr_s16_m_tied1: ++** asr z0\.h, p0/m, z0\.h, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (asr_s16_m_tied1, svint16_t, svuint16_t, ++ z0 = svasr_s16_m (p0, z0, z4), ++ z0 = svasr_m (p0, z0, z4)) ++ ++/* ++** asr_s16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** asr z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (asr_s16_m_tied2, svint16_t, svuint16_t, ++ z0_res = svasr_s16_m (p0, z4, z0), ++ z0_res = svasr_m (p0, z4, z0)) ++ ++/* ++** asr_s16_m_untied: ++** movprfx z0, z1 ++** asr z0\.h, p0/m, z0\.h, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (asr_s16_m_untied, svint16_t, svuint16_t, ++ z0 = svasr_s16_m (p0, z1, z4), ++ z0 = svasr_m (p0, z1, z4)) ++ ++/* ++** asr_w0_s16_m_tied1: ++** mov (z[0-9]+\.h), w0 ++** asr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (asr_w0_s16_m_tied1, svint16_t, uint16_t, ++ z0 = svasr_n_s16_m (p0, z0, x0), ++ z0 = svasr_m (p0, z0, x0)) ++ ++/* ++** asr_w0_s16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), w0 ++** movprfx z0, z1 ++** asr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (asr_w0_s16_m_untied, svint16_t, uint16_t, ++ z0 = svasr_n_s16_m (p0, z1, x0), ++ z0 = svasr_m (p0, z1, x0)) ++ ++/* ++** asr_1_s16_m_tied1: ++** asr z0\.h, p0/m, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_1_s16_m_tied1, svint16_t, ++ z0 = svasr_n_s16_m (p0, z0, 1), ++ z0 = svasr_m (p0, z0, 1)) ++ ++/* ++** asr_1_s16_m_untied: ++** movprfx z0, z1 ++** asr z0\.h, p0/m, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_1_s16_m_untied, svint16_t, ++ z0 = svasr_n_s16_m (p0, z1, 1), ++ z0 = svasr_m (p0, z1, 1)) ++ ++/* ++** asr_15_s16_m_tied1: ++** asr z0\.h, p0/m, z0\.h, #15 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_15_s16_m_tied1, svint16_t, ++ z0 = svasr_n_s16_m (p0, z0, 15), ++ z0 = svasr_m (p0, z0, 15)) ++ ++/* ++** asr_15_s16_m_untied: ++** movprfx z0, z1 ++** asr z0\.h, p0/m, z0\.h, #15 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_15_s16_m_untied, svint16_t, ++ z0 = svasr_n_s16_m (p0, z1, 15), ++ z0 = svasr_m (p0, z1, 15)) ++ ++/* ++** asr_16_s16_m_tied1: ++** asr z0\.h, p0/m, z0\.h, #16 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_16_s16_m_tied1, svint16_t, ++ z0 = svasr_n_s16_m (p0, z0, 16), ++ z0 = svasr_m (p0, z0, 16)) ++ ++/* ++** asr_16_s16_m_untied: ++** movprfx z0, z1 ++** asr z0\.h, p0/m, z0\.h, #16 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_16_s16_m_untied, svint16_t, ++ z0 = svasr_n_s16_m (p0, z1, 16), ++ z0 = svasr_m (p0, z1, 16)) ++ ++/* ++** asr_s16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** asr z0\.h, p0/m, z0\.h, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (asr_s16_z_tied1, svint16_t, svuint16_t, ++ z0 = svasr_s16_z (p0, z0, z4), ++ z0 = svasr_z (p0, z0, z4)) ++ ++/* ++** asr_s16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** asrr z0\.h, p0/m, z0\.h, z4\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (asr_s16_z_tied2, svint16_t, svuint16_t, ++ z0_res = svasr_s16_z (p0, z4, z0), ++ z0_res = svasr_z (p0, z4, z0)) ++ ++/* ++** asr_s16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** asr z0\.h, p0/m, z0\.h, z4\.h ++** | 
++** movprfx z0\.h, p0/z, z4\.h ++** asrr z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_DUAL_Z (asr_s16_z_untied, svint16_t, svuint16_t, ++ z0 = svasr_s16_z (p0, z1, z4), ++ z0 = svasr_z (p0, z1, z4)) ++ ++/* ++** asr_w0_s16_z_tied1: ++** mov (z[0-9]+\.h), w0 ++** movprfx z0\.h, p0/z, z0\.h ++** asr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (asr_w0_s16_z_tied1, svint16_t, uint16_t, ++ z0 = svasr_n_s16_z (p0, z0, x0), ++ z0 = svasr_z (p0, z0, x0)) ++ ++/* ++** asr_w0_s16_z_untied: ++** mov (z[0-9]+\.h), w0 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** asr z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** asrr z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (asr_w0_s16_z_untied, svint16_t, uint16_t, ++ z0 = svasr_n_s16_z (p0, z1, x0), ++ z0 = svasr_z (p0, z1, x0)) ++ ++/* ++** asr_1_s16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** asr z0\.h, p0/m, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_1_s16_z_tied1, svint16_t, ++ z0 = svasr_n_s16_z (p0, z0, 1), ++ z0 = svasr_z (p0, z0, 1)) ++ ++/* ++** asr_1_s16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** asr z0\.h, p0/m, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_1_s16_z_untied, svint16_t, ++ z0 = svasr_n_s16_z (p0, z1, 1), ++ z0 = svasr_z (p0, z1, 1)) ++ ++/* ++** asr_15_s16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** asr z0\.h, p0/m, z0\.h, #15 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_15_s16_z_tied1, svint16_t, ++ z0 = svasr_n_s16_z (p0, z0, 15), ++ z0 = svasr_z (p0, z0, 15)) ++ ++/* ++** asr_15_s16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** asr z0\.h, p0/m, z0\.h, #15 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_15_s16_z_untied, svint16_t, ++ z0 = svasr_n_s16_z (p0, z1, 15), ++ z0 = svasr_z (p0, z1, 15)) ++ ++/* ++** asr_16_s16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** asr z0\.h, p0/m, z0\.h, #16 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_16_s16_z_tied1, svint16_t, ++ z0 = svasr_n_s16_z (p0, z0, 16), ++ z0 = svasr_z (p0, z0, 16)) ++ ++/* ++** asr_16_s16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** asr z0\.h, p0/m, z0\.h, #16 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_16_s16_z_untied, svint16_t, ++ z0 = svasr_n_s16_z (p0, z1, 16), ++ z0 = svasr_z (p0, z1, 16)) ++ ++/* ++** asr_s16_x_tied1: ++** asr z0\.h, p0/m, z0\.h, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (asr_s16_x_tied1, svint16_t, svuint16_t, ++ z0 = svasr_s16_x (p0, z0, z4), ++ z0 = svasr_x (p0, z0, z4)) ++ ++/* ++** asr_s16_x_tied2: ++** asrr z0\.h, p0/m, z0\.h, z4\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (asr_s16_x_tied2, svint16_t, svuint16_t, ++ z0_res = svasr_s16_x (p0, z4, z0), ++ z0_res = svasr_x (p0, z4, z0)) ++ ++/* ++** asr_s16_x_untied: ++** ( ++** movprfx z0, z1 ++** asr z0\.h, p0/m, z0\.h, z4\.h ++** | ++** movprfx z0, z4 ++** asrr z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_DUAL_Z (asr_s16_x_untied, svint16_t, svuint16_t, ++ z0 = svasr_s16_x (p0, z1, z4), ++ z0 = svasr_x (p0, z1, z4)) ++ ++/* ++** asr_w0_s16_x_tied1: ++** mov (z[0-9]+\.h), w0 ++** asr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (asr_w0_s16_x_tied1, svint16_t, uint16_t, ++ z0 = svasr_n_s16_x (p0, z0, x0), ++ z0 = svasr_x (p0, z0, x0)) ++ ++/* ++** asr_w0_s16_x_untied: ++** mov z0\.h, w0 ++** asrr z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZX (asr_w0_s16_x_untied, svint16_t, uint16_t, ++ z0 = svasr_n_s16_x (p0, z1, x0), ++ z0 = svasr_x (p0, z1, x0)) ++ ++/* ++** asr_1_s16_x_tied1: ++** asr z0\.h, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_1_s16_x_tied1, svint16_t, ++ z0 = svasr_n_s16_x (p0, z0, 1), ++ z0 = svasr_x (p0, z0, 1)) ++ ++/* ++** 
asr_1_s16_x_untied: ++** asr z0\.h, z1\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_1_s16_x_untied, svint16_t, ++ z0 = svasr_n_s16_x (p0, z1, 1), ++ z0 = svasr_x (p0, z1, 1)) ++ ++/* ++** asr_15_s16_x_tied1: ++** asr z0\.h, z0\.h, #15 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_15_s16_x_tied1, svint16_t, ++ z0 = svasr_n_s16_x (p0, z0, 15), ++ z0 = svasr_x (p0, z0, 15)) ++ ++/* ++** asr_15_s16_x_untied: ++** asr z0\.h, z1\.h, #15 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_15_s16_x_untied, svint16_t, ++ z0 = svasr_n_s16_x (p0, z1, 15), ++ z0 = svasr_x (p0, z1, 15)) ++ ++/* ++** asr_16_s16_x_tied1: ++** asr z0\.h, z0\.h, #16 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_16_s16_x_tied1, svint16_t, ++ z0 = svasr_n_s16_x (p0, z0, 16), ++ z0 = svasr_x (p0, z0, 16)) ++ ++/* ++** asr_16_s16_x_untied: ++** asr z0\.h, z1\.h, #16 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_16_s16_x_untied, svint16_t, ++ z0 = svasr_n_s16_x (p0, z1, 16), ++ z0 = svasr_x (p0, z1, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asr_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asr_s32.c +new file mode 100644 +index 000000000..0f5a37372 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asr_s32.c +@@ -0,0 +1,340 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** asr_s32_m_tied1: ++** asr z0\.s, p0/m, z0\.s, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (asr_s32_m_tied1, svint32_t, svuint32_t, ++ z0 = svasr_s32_m (p0, z0, z4), ++ z0 = svasr_m (p0, z0, z4)) ++ ++/* ++** asr_s32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** asr z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (asr_s32_m_tied2, svint32_t, svuint32_t, ++ z0_res = svasr_s32_m (p0, z4, z0), ++ z0_res = svasr_m (p0, z4, z0)) ++ ++/* ++** asr_s32_m_untied: ++** movprfx z0, z1 ++** asr z0\.s, p0/m, z0\.s, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (asr_s32_m_untied, svint32_t, svuint32_t, ++ z0 = svasr_s32_m (p0, z1, z4), ++ z0 = svasr_m (p0, z1, z4)) ++ ++/* ++** asr_w0_s32_m_tied1: ++** mov (z[0-9]+\.s), w0 ++** asr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (asr_w0_s32_m_tied1, svint32_t, uint32_t, ++ z0 = svasr_n_s32_m (p0, z0, x0), ++ z0 = svasr_m (p0, z0, x0)) ++ ++/* ++** asr_w0_s32_m_untied: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0, z1 ++** asr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (asr_w0_s32_m_untied, svint32_t, uint32_t, ++ z0 = svasr_n_s32_m (p0, z1, x0), ++ z0 = svasr_m (p0, z1, x0)) ++ ++/* ++** asr_1_s32_m_tied1: ++** asr z0\.s, p0/m, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_1_s32_m_tied1, svint32_t, ++ z0 = svasr_n_s32_m (p0, z0, 1), ++ z0 = svasr_m (p0, z0, 1)) ++ ++/* ++** asr_1_s32_m_untied: ++** movprfx z0, z1 ++** asr z0\.s, p0/m, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_1_s32_m_untied, svint32_t, ++ z0 = svasr_n_s32_m (p0, z1, 1), ++ z0 = svasr_m (p0, z1, 1)) ++ ++/* ++** asr_31_s32_m_tied1: ++** asr z0\.s, p0/m, z0\.s, #31 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_31_s32_m_tied1, svint32_t, ++ z0 = svasr_n_s32_m (p0, z0, 31), ++ z0 = svasr_m (p0, z0, 31)) ++ ++/* ++** asr_31_s32_m_untied: ++** movprfx z0, z1 ++** asr z0\.s, p0/m, z0\.s, #31 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_31_s32_m_untied, svint32_t, ++ z0 = svasr_n_s32_m (p0, z1, 31), ++ z0 = svasr_m (p0, z1, 31)) ++ ++/* ++** asr_32_s32_m_tied1: ++** asr z0\.s, p0/m, z0\.s, #32 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_32_s32_m_tied1, svint32_t, ++ z0 = svasr_n_s32_m (p0, z0, 32), ++ z0 = svasr_m (p0, z0, 32)) ++ ++/* ++** asr_32_s32_m_untied: ++** movprfx z0, z1 ++** asr z0\.s, p0/m, 
z0\.s, #32 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_32_s32_m_untied, svint32_t, ++ z0 = svasr_n_s32_m (p0, z1, 32), ++ z0 = svasr_m (p0, z1, 32)) ++ ++/* ++** asr_s32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** asr z0\.s, p0/m, z0\.s, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (asr_s32_z_tied1, svint32_t, svuint32_t, ++ z0 = svasr_s32_z (p0, z0, z4), ++ z0 = svasr_z (p0, z0, z4)) ++ ++/* ++** asr_s32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** asrr z0\.s, p0/m, z0\.s, z4\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (asr_s32_z_tied2, svint32_t, svuint32_t, ++ z0_res = svasr_s32_z (p0, z4, z0), ++ z0_res = svasr_z (p0, z4, z0)) ++ ++/* ++** asr_s32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** asr z0\.s, p0/m, z0\.s, z4\.s ++** | ++** movprfx z0\.s, p0/z, z4\.s ++** asrr z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_DUAL_Z (asr_s32_z_untied, svint32_t, svuint32_t, ++ z0 = svasr_s32_z (p0, z1, z4), ++ z0 = svasr_z (p0, z1, z4)) ++ ++/* ++** asr_w0_s32_z_tied1: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0\.s, p0/z, z0\.s ++** asr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (asr_w0_s32_z_tied1, svint32_t, uint32_t, ++ z0 = svasr_n_s32_z (p0, z0, x0), ++ z0 = svasr_z (p0, z0, x0)) ++ ++/* ++** asr_w0_s32_z_untied: ++** mov (z[0-9]+\.s), w0 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** asr z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** asrr z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (asr_w0_s32_z_untied, svint32_t, uint32_t, ++ z0 = svasr_n_s32_z (p0, z1, x0), ++ z0 = svasr_z (p0, z1, x0)) ++ ++/* ++** asr_1_s32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** asr z0\.s, p0/m, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_1_s32_z_tied1, svint32_t, ++ z0 = svasr_n_s32_z (p0, z0, 1), ++ z0 = svasr_z (p0, z0, 1)) ++ ++/* ++** asr_1_s32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** asr z0\.s, p0/m, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_1_s32_z_untied, svint32_t, ++ z0 = svasr_n_s32_z (p0, z1, 1), ++ z0 = svasr_z (p0, z1, 1)) ++ ++/* ++** asr_31_s32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** asr z0\.s, p0/m, z0\.s, #31 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_31_s32_z_tied1, svint32_t, ++ z0 = svasr_n_s32_z (p0, z0, 31), ++ z0 = svasr_z (p0, z0, 31)) ++ ++/* ++** asr_31_s32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** asr z0\.s, p0/m, z0\.s, #31 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_31_s32_z_untied, svint32_t, ++ z0 = svasr_n_s32_z (p0, z1, 31), ++ z0 = svasr_z (p0, z1, 31)) ++ ++/* ++** asr_32_s32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** asr z0\.s, p0/m, z0\.s, #32 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_32_s32_z_tied1, svint32_t, ++ z0 = svasr_n_s32_z (p0, z0, 32), ++ z0 = svasr_z (p0, z0, 32)) ++ ++/* ++** asr_32_s32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** asr z0\.s, p0/m, z0\.s, #32 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_32_s32_z_untied, svint32_t, ++ z0 = svasr_n_s32_z (p0, z1, 32), ++ z0 = svasr_z (p0, z1, 32)) ++ ++/* ++** asr_s32_x_tied1: ++** asr z0\.s, p0/m, z0\.s, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (asr_s32_x_tied1, svint32_t, svuint32_t, ++ z0 = svasr_s32_x (p0, z0, z4), ++ z0 = svasr_x (p0, z0, z4)) ++ ++/* ++** asr_s32_x_tied2: ++** asrr z0\.s, p0/m, z0\.s, z4\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (asr_s32_x_tied2, svint32_t, svuint32_t, ++ z0_res = svasr_s32_x (p0, z4, z0), ++ z0_res = svasr_x (p0, z4, z0)) ++ ++/* ++** asr_s32_x_untied: ++** ( ++** movprfx z0, z1 ++** asr z0\.s, p0/m, z0\.s, z4\.s ++** | ++** movprfx z0, z4 ++** asrr z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_DUAL_Z (asr_s32_x_untied, svint32_t, svuint32_t, ++ z0 
= svasr_s32_x (p0, z1, z4), ++ z0 = svasr_x (p0, z1, z4)) ++ ++/* ++** asr_w0_s32_x_tied1: ++** mov (z[0-9]+\.s), w0 ++** asr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (asr_w0_s32_x_tied1, svint32_t, uint32_t, ++ z0 = svasr_n_s32_x (p0, z0, x0), ++ z0 = svasr_x (p0, z0, x0)) ++ ++/* ++** asr_w0_s32_x_untied: ++** mov z0\.s, w0 ++** asrr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZX (asr_w0_s32_x_untied, svint32_t, uint32_t, ++ z0 = svasr_n_s32_x (p0, z1, x0), ++ z0 = svasr_x (p0, z1, x0)) ++ ++/* ++** asr_1_s32_x_tied1: ++** asr z0\.s, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_1_s32_x_tied1, svint32_t, ++ z0 = svasr_n_s32_x (p0, z0, 1), ++ z0 = svasr_x (p0, z0, 1)) ++ ++/* ++** asr_1_s32_x_untied: ++** asr z0\.s, z1\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_1_s32_x_untied, svint32_t, ++ z0 = svasr_n_s32_x (p0, z1, 1), ++ z0 = svasr_x (p0, z1, 1)) ++ ++/* ++** asr_31_s32_x_tied1: ++** asr z0\.s, z0\.s, #31 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_31_s32_x_tied1, svint32_t, ++ z0 = svasr_n_s32_x (p0, z0, 31), ++ z0 = svasr_x (p0, z0, 31)) ++ ++/* ++** asr_31_s32_x_untied: ++** asr z0\.s, z1\.s, #31 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_31_s32_x_untied, svint32_t, ++ z0 = svasr_n_s32_x (p0, z1, 31), ++ z0 = svasr_x (p0, z1, 31)) ++ ++/* ++** asr_32_s32_x_tied1: ++** asr z0\.s, z0\.s, #32 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_32_s32_x_tied1, svint32_t, ++ z0 = svasr_n_s32_x (p0, z0, 32), ++ z0 = svasr_x (p0, z0, 32)) ++ ++/* ++** asr_32_s32_x_untied: ++** asr z0\.s, z1\.s, #32 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_32_s32_x_untied, svint32_t, ++ z0 = svasr_n_s32_x (p0, z1, 32), ++ z0 = svasr_x (p0, z1, 32)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asr_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asr_s64.c +new file mode 100644 +index 000000000..80cae07c3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asr_s64.c +@@ -0,0 +1,340 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** asr_s64_m_tied1: ++** asr z0\.d, p0/m, z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (asr_s64_m_tied1, svint64_t, svuint64_t, ++ z0 = svasr_s64_m (p0, z0, z4), ++ z0 = svasr_m (p0, z0, z4)) ++ ++/* ++** asr_s64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z4 ++** asr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (asr_s64_m_tied2, svint64_t, svuint64_t, ++ z0_res = svasr_s64_m (p0, z4, z0), ++ z0_res = svasr_m (p0, z4, z0)) ++ ++/* ++** asr_s64_m_untied: ++** movprfx z0, z1 ++** asr z0\.d, p0/m, z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (asr_s64_m_untied, svint64_t, svuint64_t, ++ z0 = svasr_s64_m (p0, z1, z4), ++ z0 = svasr_m (p0, z1, z4)) ++ ++/* ++** asr_x0_s64_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** asr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (asr_x0_s64_m_tied1, svint64_t, uint64_t, ++ z0 = svasr_n_s64_m (p0, z0, x0), ++ z0 = svasr_m (p0, z0, x0)) ++ ++/* ++** asr_x0_s64_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** asr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (asr_x0_s64_m_untied, svint64_t, uint64_t, ++ z0 = svasr_n_s64_m (p0, z1, x0), ++ z0 = svasr_m (p0, z1, x0)) ++ ++/* ++** asr_1_s64_m_tied1: ++** asr z0\.d, p0/m, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_1_s64_m_tied1, svint64_t, ++ z0 = svasr_n_s64_m (p0, z0, 1), ++ z0 = svasr_m (p0, z0, 1)) ++ ++/* ++** asr_1_s64_m_untied: ++** movprfx z0, z1 ++** asr z0\.d, p0/m, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_1_s64_m_untied, svint64_t, ++ z0 = svasr_n_s64_m (p0, z1, 
1), ++ z0 = svasr_m (p0, z1, 1)) ++ ++/* ++** asr_63_s64_m_tied1: ++** asr z0\.d, p0/m, z0\.d, #63 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_63_s64_m_tied1, svint64_t, ++ z0 = svasr_n_s64_m (p0, z0, 63), ++ z0 = svasr_m (p0, z0, 63)) ++ ++/* ++** asr_63_s64_m_untied: ++** movprfx z0, z1 ++** asr z0\.d, p0/m, z0\.d, #63 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_63_s64_m_untied, svint64_t, ++ z0 = svasr_n_s64_m (p0, z1, 63), ++ z0 = svasr_m (p0, z1, 63)) ++ ++/* ++** asr_64_s64_m_tied1: ++** asr z0\.d, p0/m, z0\.d, #64 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_64_s64_m_tied1, svint64_t, ++ z0 = svasr_n_s64_m (p0, z0, 64), ++ z0 = svasr_m (p0, z0, 64)) ++ ++/* ++** asr_64_s64_m_untied: ++** movprfx z0, z1 ++** asr z0\.d, p0/m, z0\.d, #64 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_64_s64_m_untied, svint64_t, ++ z0 = svasr_n_s64_m (p0, z1, 64), ++ z0 = svasr_m (p0, z1, 64)) ++ ++/* ++** asr_s64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** asr z0\.d, p0/m, z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (asr_s64_z_tied1, svint64_t, svuint64_t, ++ z0 = svasr_s64_z (p0, z0, z4), ++ z0 = svasr_z (p0, z0, z4)) ++ ++/* ++** asr_s64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** asrr z0\.d, p0/m, z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z_REV (asr_s64_z_tied2, svint64_t, svuint64_t, ++ z0_res = svasr_s64_z (p0, z4, z0), ++ z0_res = svasr_z (p0, z4, z0)) ++ ++/* ++** asr_s64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** asr z0\.d, p0/m, z0\.d, z4\.d ++** | ++** movprfx z0\.d, p0/z, z4\.d ++** asrr z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_DUAL_Z (asr_s64_z_untied, svint64_t, svuint64_t, ++ z0 = svasr_s64_z (p0, z1, z4), ++ z0 = svasr_z (p0, z1, z4)) ++ ++/* ++** asr_x0_s64_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z0\.d ++** asr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (asr_x0_s64_z_tied1, svint64_t, uint64_t, ++ z0 = svasr_n_s64_z (p0, z0, x0), ++ z0 = svasr_z (p0, z0, x0)) ++ ++/* ++** asr_x0_s64_z_untied: ++** mov (z[0-9]+\.d), x0 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** asr z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** asrr z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (asr_x0_s64_z_untied, svint64_t, uint64_t, ++ z0 = svasr_n_s64_z (p0, z1, x0), ++ z0 = svasr_z (p0, z1, x0)) ++ ++/* ++** asr_1_s64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** asr z0\.d, p0/m, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_1_s64_z_tied1, svint64_t, ++ z0 = svasr_n_s64_z (p0, z0, 1), ++ z0 = svasr_z (p0, z0, 1)) ++ ++/* ++** asr_1_s64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** asr z0\.d, p0/m, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_1_s64_z_untied, svint64_t, ++ z0 = svasr_n_s64_z (p0, z1, 1), ++ z0 = svasr_z (p0, z1, 1)) ++ ++/* ++** asr_63_s64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** asr z0\.d, p0/m, z0\.d, #63 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_63_s64_z_tied1, svint64_t, ++ z0 = svasr_n_s64_z (p0, z0, 63), ++ z0 = svasr_z (p0, z0, 63)) ++ ++/* ++** asr_63_s64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** asr z0\.d, p0/m, z0\.d, #63 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_63_s64_z_untied, svint64_t, ++ z0 = svasr_n_s64_z (p0, z1, 63), ++ z0 = svasr_z (p0, z1, 63)) ++ ++/* ++** asr_64_s64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** asr z0\.d, p0/m, z0\.d, #64 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_64_s64_z_tied1, svint64_t, ++ z0 = svasr_n_s64_z (p0, z0, 64), ++ z0 = svasr_z (p0, z0, 64)) ++ ++/* ++** asr_64_s64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** asr z0\.d, p0/m, z0\.d, #64 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_64_s64_z_untied, 
svint64_t, ++ z0 = svasr_n_s64_z (p0, z1, 64), ++ z0 = svasr_z (p0, z1, 64)) ++ ++/* ++** asr_s64_x_tied1: ++** asr z0\.d, p0/m, z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (asr_s64_x_tied1, svint64_t, svuint64_t, ++ z0 = svasr_s64_x (p0, z0, z4), ++ z0 = svasr_x (p0, z0, z4)) ++ ++/* ++** asr_s64_x_tied2: ++** asrr z0\.d, p0/m, z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z_REV (asr_s64_x_tied2, svint64_t, svuint64_t, ++ z0_res = svasr_s64_x (p0, z4, z0), ++ z0_res = svasr_x (p0, z4, z0)) ++ ++/* ++** asr_s64_x_untied: ++** ( ++** movprfx z0, z1 ++** asr z0\.d, p0/m, z0\.d, z4\.d ++** | ++** movprfx z0, z4 ++** asrr z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_DUAL_Z (asr_s64_x_untied, svint64_t, svuint64_t, ++ z0 = svasr_s64_x (p0, z1, z4), ++ z0 = svasr_x (p0, z1, z4)) ++ ++/* ++** asr_x0_s64_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** asr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (asr_x0_s64_x_tied1, svint64_t, uint64_t, ++ z0 = svasr_n_s64_x (p0, z0, x0), ++ z0 = svasr_x (p0, z0, x0)) ++ ++/* ++** asr_x0_s64_x_untied: ++** mov z0\.d, x0 ++** asrr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (asr_x0_s64_x_untied, svint64_t, uint64_t, ++ z0 = svasr_n_s64_x (p0, z1, x0), ++ z0 = svasr_x (p0, z1, x0)) ++ ++/* ++** asr_1_s64_x_tied1: ++** asr z0\.d, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_1_s64_x_tied1, svint64_t, ++ z0 = svasr_n_s64_x (p0, z0, 1), ++ z0 = svasr_x (p0, z0, 1)) ++ ++/* ++** asr_1_s64_x_untied: ++** asr z0\.d, z1\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_1_s64_x_untied, svint64_t, ++ z0 = svasr_n_s64_x (p0, z1, 1), ++ z0 = svasr_x (p0, z1, 1)) ++ ++/* ++** asr_63_s64_x_tied1: ++** asr z0\.d, z0\.d, #63 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_63_s64_x_tied1, svint64_t, ++ z0 = svasr_n_s64_x (p0, z0, 63), ++ z0 = svasr_x (p0, z0, 63)) ++ ++/* ++** asr_63_s64_x_untied: ++** asr z0\.d, z1\.d, #63 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_63_s64_x_untied, svint64_t, ++ z0 = svasr_n_s64_x (p0, z1, 63), ++ z0 = svasr_x (p0, z1, 63)) ++ ++/* ++** asr_64_s64_x_tied1: ++** asr z0\.d, z0\.d, #64 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_64_s64_x_tied1, svint64_t, ++ z0 = svasr_n_s64_x (p0, z0, 64), ++ z0 = svasr_x (p0, z0, 64)) ++ ++/* ++** asr_64_s64_x_untied: ++** asr z0\.d, z1\.d, #64 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_64_s64_x_untied, svint64_t, ++ z0 = svasr_n_s64_x (p0, z1, 64), ++ z0 = svasr_x (p0, z1, 64)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asr_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asr_s8.c +new file mode 100644 +index 000000000..992e93fde +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asr_s8.c +@@ -0,0 +1,340 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** asr_s8_m_tied1: ++** asr z0\.b, p0/m, z0\.b, z4\.b ++** ret ++*/ ++TEST_DUAL_Z (asr_s8_m_tied1, svint8_t, svuint8_t, ++ z0 = svasr_s8_m (p0, z0, z4), ++ z0 = svasr_m (p0, z0, z4)) ++ ++/* ++** asr_s8_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** asr z0\.b, p0/m, z0\.b, \1\.b ++** ret ++*/ ++TEST_DUAL_Z_REV (asr_s8_m_tied2, svint8_t, svuint8_t, ++ z0_res = svasr_s8_m (p0, z4, z0), ++ z0_res = svasr_m (p0, z4, z0)) ++ ++/* ++** asr_s8_m_untied: ++** movprfx z0, z1 ++** asr z0\.b, p0/m, z0\.b, z4\.b ++** ret ++*/ ++TEST_DUAL_Z (asr_s8_m_untied, svint8_t, svuint8_t, ++ z0 = svasr_s8_m (p0, z1, z4), ++ z0 = svasr_m (p0, z1, z4)) ++ ++/* ++** asr_w0_s8_m_tied1: ++** mov (z[0-9]+\.b), w0 ++** asr z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (asr_w0_s8_m_tied1, 
svint8_t, uint8_t, ++ z0 = svasr_n_s8_m (p0, z0, x0), ++ z0 = svasr_m (p0, z0, x0)) ++ ++/* ++** asr_w0_s8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), w0 ++** movprfx z0, z1 ++** asr z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (asr_w0_s8_m_untied, svint8_t, uint8_t, ++ z0 = svasr_n_s8_m (p0, z1, x0), ++ z0 = svasr_m (p0, z1, x0)) ++ ++/* ++** asr_1_s8_m_tied1: ++** asr z0\.b, p0/m, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_1_s8_m_tied1, svint8_t, ++ z0 = svasr_n_s8_m (p0, z0, 1), ++ z0 = svasr_m (p0, z0, 1)) ++ ++/* ++** asr_1_s8_m_untied: ++** movprfx z0, z1 ++** asr z0\.b, p0/m, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_1_s8_m_untied, svint8_t, ++ z0 = svasr_n_s8_m (p0, z1, 1), ++ z0 = svasr_m (p0, z1, 1)) ++ ++/* ++** asr_7_s8_m_tied1: ++** asr z0\.b, p0/m, z0\.b, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_7_s8_m_tied1, svint8_t, ++ z0 = svasr_n_s8_m (p0, z0, 7), ++ z0 = svasr_m (p0, z0, 7)) ++ ++/* ++** asr_7_s8_m_untied: ++** movprfx z0, z1 ++** asr z0\.b, p0/m, z0\.b, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_7_s8_m_untied, svint8_t, ++ z0 = svasr_n_s8_m (p0, z1, 7), ++ z0 = svasr_m (p0, z1, 7)) ++ ++/* ++** asr_8_s8_m_tied1: ++** asr z0\.b, p0/m, z0\.b, #8 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_8_s8_m_tied1, svint8_t, ++ z0 = svasr_n_s8_m (p0, z0, 8), ++ z0 = svasr_m (p0, z0, 8)) ++ ++/* ++** asr_8_s8_m_untied: ++** movprfx z0, z1 ++** asr z0\.b, p0/m, z0\.b, #8 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_8_s8_m_untied, svint8_t, ++ z0 = svasr_n_s8_m (p0, z1, 8), ++ z0 = svasr_m (p0, z1, 8)) ++ ++/* ++** asr_s8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** asr z0\.b, p0/m, z0\.b, z4\.b ++** ret ++*/ ++TEST_DUAL_Z (asr_s8_z_tied1, svint8_t, svuint8_t, ++ z0 = svasr_s8_z (p0, z0, z4), ++ z0 = svasr_z (p0, z0, z4)) ++ ++/* ++** asr_s8_z_tied2: ++** movprfx z0\.b, p0/z, z0\.b ++** asrr z0\.b, p0/m, z0\.b, z4\.b ++** ret ++*/ ++TEST_DUAL_Z_REV (asr_s8_z_tied2, svint8_t, svuint8_t, ++ z0_res = svasr_s8_z (p0, z4, z0), ++ z0_res = svasr_z (p0, z4, z0)) ++ ++/* ++** asr_s8_z_untied: ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** asr z0\.b, p0/m, z0\.b, z4\.b ++** | ++** movprfx z0\.b, p0/z, z4\.b ++** asrr z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_DUAL_Z (asr_s8_z_untied, svint8_t, svuint8_t, ++ z0 = svasr_s8_z (p0, z1, z4), ++ z0 = svasr_z (p0, z1, z4)) ++ ++/* ++** asr_w0_s8_z_tied1: ++** mov (z[0-9]+\.b), w0 ++** movprfx z0\.b, p0/z, z0\.b ++** asr z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (asr_w0_s8_z_tied1, svint8_t, uint8_t, ++ z0 = svasr_n_s8_z (p0, z0, x0), ++ z0 = svasr_z (p0, z0, x0)) ++ ++/* ++** asr_w0_s8_z_untied: ++** mov (z[0-9]+\.b), w0 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** asr z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** asrr z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (asr_w0_s8_z_untied, svint8_t, uint8_t, ++ z0 = svasr_n_s8_z (p0, z1, x0), ++ z0 = svasr_z (p0, z1, x0)) ++ ++/* ++** asr_1_s8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** asr z0\.b, p0/m, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_1_s8_z_tied1, svint8_t, ++ z0 = svasr_n_s8_z (p0, z0, 1), ++ z0 = svasr_z (p0, z0, 1)) ++ ++/* ++** asr_1_s8_z_untied: ++** movprfx z0\.b, p0/z, z1\.b ++** asr z0\.b, p0/m, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_1_s8_z_untied, svint8_t, ++ z0 = svasr_n_s8_z (p0, z1, 1), ++ z0 = svasr_z (p0, z1, 1)) ++ ++/* ++** asr_7_s8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** asr z0\.b, p0/m, z0\.b, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_7_s8_z_tied1, svint8_t, ++ z0 = svasr_n_s8_z (p0, z0, 7), ++ z0 = 
svasr_z (p0, z0, 7)) ++ ++/* ++** asr_7_s8_z_untied: ++** movprfx z0\.b, p0/z, z1\.b ++** asr z0\.b, p0/m, z0\.b, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_7_s8_z_untied, svint8_t, ++ z0 = svasr_n_s8_z (p0, z1, 7), ++ z0 = svasr_z (p0, z1, 7)) ++ ++/* ++** asr_8_s8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** asr z0\.b, p0/m, z0\.b, #8 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_8_s8_z_tied1, svint8_t, ++ z0 = svasr_n_s8_z (p0, z0, 8), ++ z0 = svasr_z (p0, z0, 8)) ++ ++/* ++** asr_8_s8_z_untied: ++** movprfx z0\.b, p0/z, z1\.b ++** asr z0\.b, p0/m, z0\.b, #8 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_8_s8_z_untied, svint8_t, ++ z0 = svasr_n_s8_z (p0, z1, 8), ++ z0 = svasr_z (p0, z1, 8)) ++ ++/* ++** asr_s8_x_tied1: ++** asr z0\.b, p0/m, z0\.b, z4\.b ++** ret ++*/ ++TEST_DUAL_Z (asr_s8_x_tied1, svint8_t, svuint8_t, ++ z0 = svasr_s8_x (p0, z0, z4), ++ z0 = svasr_x (p0, z0, z4)) ++ ++/* ++** asr_s8_x_tied2: ++** asrr z0\.b, p0/m, z0\.b, z4\.b ++** ret ++*/ ++TEST_DUAL_Z_REV (asr_s8_x_tied2, svint8_t, svuint8_t, ++ z0_res = svasr_s8_x (p0, z4, z0), ++ z0_res = svasr_x (p0, z4, z0)) ++ ++/* ++** asr_s8_x_untied: ++** ( ++** movprfx z0, z1 ++** asr z0\.b, p0/m, z0\.b, z4\.b ++** | ++** movprfx z0, z4 ++** asrr z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_DUAL_Z (asr_s8_x_untied, svint8_t, svuint8_t, ++ z0 = svasr_s8_x (p0, z1, z4), ++ z0 = svasr_x (p0, z1, z4)) ++ ++/* ++** asr_w0_s8_x_tied1: ++** mov (z[0-9]+\.b), w0 ++** asr z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (asr_w0_s8_x_tied1, svint8_t, uint8_t, ++ z0 = svasr_n_s8_x (p0, z0, x0), ++ z0 = svasr_x (p0, z0, x0)) ++ ++/* ++** asr_w0_s8_x_untied: ++** mov z0\.b, w0 ++** asrr z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_ZX (asr_w0_s8_x_untied, svint8_t, uint8_t, ++ z0 = svasr_n_s8_x (p0, z1, x0), ++ z0 = svasr_x (p0, z1, x0)) ++ ++/* ++** asr_1_s8_x_tied1: ++** asr z0\.b, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_1_s8_x_tied1, svint8_t, ++ z0 = svasr_n_s8_x (p0, z0, 1), ++ z0 = svasr_x (p0, z0, 1)) ++ ++/* ++** asr_1_s8_x_untied: ++** asr z0\.b, z1\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_1_s8_x_untied, svint8_t, ++ z0 = svasr_n_s8_x (p0, z1, 1), ++ z0 = svasr_x (p0, z1, 1)) ++ ++/* ++** asr_7_s8_x_tied1: ++** asr z0\.b, z0\.b, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_7_s8_x_tied1, svint8_t, ++ z0 = svasr_n_s8_x (p0, z0, 7), ++ z0 = svasr_x (p0, z0, 7)) ++ ++/* ++** asr_7_s8_x_untied: ++** asr z0\.b, z1\.b, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_7_s8_x_untied, svint8_t, ++ z0 = svasr_n_s8_x (p0, z1, 7), ++ z0 = svasr_x (p0, z1, 7)) ++ ++/* ++** asr_8_s8_x_tied1: ++** asr z0\.b, z0\.b, #8 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_8_s8_x_tied1, svint8_t, ++ z0 = svasr_n_s8_x (p0, z0, 8), ++ z0 = svasr_x (p0, z0, 8)) ++ ++/* ++** asr_8_s8_x_untied: ++** asr z0\.b, z1\.b, #8 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_8_s8_x_untied, svint8_t, ++ z0 = svasr_n_s8_x (p0, z1, 8), ++ z0 = svasr_x (p0, z1, 8)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asr_wide_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asr_wide_s16.c +new file mode 100644 +index 000000000..b74ae33e1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asr_wide_s16.c +@@ -0,0 +1,325 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** asr_wide_s16_m_tied1: ++** asr z0\.h, p0/m, z0\.h, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (asr_wide_s16_m_tied1, svint16_t, svuint64_t, ++ z0 = svasr_wide_s16_m (p0, z0, z4), ++ z0 = svasr_wide_m (p0, z0, z4)) ++ ++/* ++** asr_wide_s16_m_tied2: 
++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z4 ++** asr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (asr_wide_s16_m_tied2, svint16_t, svuint64_t, ++ z0_res = svasr_wide_s16_m (p0, z4, z0), ++ z0_res = svasr_wide_m (p0, z4, z0)) ++ ++/* ++** asr_wide_s16_m_untied: ++** movprfx z0, z1 ++** asr z0\.h, p0/m, z0\.h, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (asr_wide_s16_m_untied, svint16_t, svuint64_t, ++ z0 = svasr_wide_s16_m (p0, z1, z4), ++ z0 = svasr_wide_m (p0, z1, z4)) ++ ++/* ++** asr_wide_x0_s16_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** asr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (asr_wide_x0_s16_m_tied1, svint16_t, uint64_t, ++ z0 = svasr_wide_n_s16_m (p0, z0, x0), ++ z0 = svasr_wide_m (p0, z0, x0)) ++ ++/* ++** asr_wide_x0_s16_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** asr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (asr_wide_x0_s16_m_untied, svint16_t, uint64_t, ++ z0 = svasr_wide_n_s16_m (p0, z1, x0), ++ z0 = svasr_wide_m (p0, z1, x0)) ++ ++/* ++** asr_wide_1_s16_m_tied1: ++** asr z0\.h, p0/m, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_1_s16_m_tied1, svint16_t, ++ z0 = svasr_wide_n_s16_m (p0, z0, 1), ++ z0 = svasr_wide_m (p0, z0, 1)) ++ ++/* ++** asr_wide_1_s16_m_untied: ++** movprfx z0, z1 ++** asr z0\.h, p0/m, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_1_s16_m_untied, svint16_t, ++ z0 = svasr_wide_n_s16_m (p0, z1, 1), ++ z0 = svasr_wide_m (p0, z1, 1)) ++ ++/* ++** asr_wide_15_s16_m_tied1: ++** asr z0\.h, p0/m, z0\.h, #15 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_15_s16_m_tied1, svint16_t, ++ z0 = svasr_wide_n_s16_m (p0, z0, 15), ++ z0 = svasr_wide_m (p0, z0, 15)) ++ ++/* ++** asr_wide_15_s16_m_untied: ++** movprfx z0, z1 ++** asr z0\.h, p0/m, z0\.h, #15 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_15_s16_m_untied, svint16_t, ++ z0 = svasr_wide_n_s16_m (p0, z1, 15), ++ z0 = svasr_wide_m (p0, z1, 15)) ++ ++/* ++** asr_wide_16_s16_m_tied1: ++** asr z0\.h, p0/m, z0\.h, #16 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_16_s16_m_tied1, svint16_t, ++ z0 = svasr_wide_n_s16_m (p0, z0, 16), ++ z0 = svasr_wide_m (p0, z0, 16)) ++ ++/* ++** asr_wide_16_s16_m_untied: ++** movprfx z0, z1 ++** asr z0\.h, p0/m, z0\.h, #16 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_16_s16_m_untied, svint16_t, ++ z0 = svasr_wide_n_s16_m (p0, z1, 16), ++ z0 = svasr_wide_m (p0, z1, 16)) ++ ++/* ++** asr_wide_s16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** asr z0\.h, p0/m, z0\.h, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (asr_wide_s16_z_tied1, svint16_t, svuint64_t, ++ z0 = svasr_wide_s16_z (p0, z0, z4), ++ z0 = svasr_wide_z (p0, z0, z4)) ++ ++/* ++** asr_wide_s16_z_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.h, p0/z, z4\.h ++** asr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (asr_wide_s16_z_tied2, svint16_t, svuint64_t, ++ z0_res = svasr_wide_s16_z (p0, z4, z0), ++ z0_res = svasr_wide_z (p0, z4, z0)) ++ ++/* ++** asr_wide_s16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** asr z0\.h, p0/m, z0\.h, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (asr_wide_s16_z_untied, svint16_t, svuint64_t, ++ z0 = svasr_wide_s16_z (p0, z1, z4), ++ z0 = svasr_wide_z (p0, z1, z4)) ++ ++/* ++** asr_wide_x0_s16_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.h, p0/z, z0\.h ++** asr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (asr_wide_x0_s16_z_tied1, svint16_t, uint64_t, ++ z0 = svasr_wide_n_s16_z (p0, z0, x0), ++ z0 = svasr_wide_z (p0, z0, x0)) ++ ++/* ++** asr_wide_x0_s16_z_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.h, p0/z, z1\.h ++** asr z0\.h, 
p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (asr_wide_x0_s16_z_untied, svint16_t, uint64_t, ++ z0 = svasr_wide_n_s16_z (p0, z1, x0), ++ z0 = svasr_wide_z (p0, z1, x0)) ++ ++/* ++** asr_wide_1_s16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** asr z0\.h, p0/m, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_1_s16_z_tied1, svint16_t, ++ z0 = svasr_wide_n_s16_z (p0, z0, 1), ++ z0 = svasr_wide_z (p0, z0, 1)) ++ ++/* ++** asr_wide_1_s16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** asr z0\.h, p0/m, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_1_s16_z_untied, svint16_t, ++ z0 = svasr_wide_n_s16_z (p0, z1, 1), ++ z0 = svasr_wide_z (p0, z1, 1)) ++ ++/* ++** asr_wide_15_s16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** asr z0\.h, p0/m, z0\.h, #15 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_15_s16_z_tied1, svint16_t, ++ z0 = svasr_wide_n_s16_z (p0, z0, 15), ++ z0 = svasr_wide_z (p0, z0, 15)) ++ ++/* ++** asr_wide_15_s16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** asr z0\.h, p0/m, z0\.h, #15 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_15_s16_z_untied, svint16_t, ++ z0 = svasr_wide_n_s16_z (p0, z1, 15), ++ z0 = svasr_wide_z (p0, z1, 15)) ++ ++/* ++** asr_wide_16_s16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** asr z0\.h, p0/m, z0\.h, #16 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_16_s16_z_tied1, svint16_t, ++ z0 = svasr_wide_n_s16_z (p0, z0, 16), ++ z0 = svasr_wide_z (p0, z0, 16)) ++ ++/* ++** asr_wide_16_s16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** asr z0\.h, p0/m, z0\.h, #16 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_16_s16_z_untied, svint16_t, ++ z0 = svasr_wide_n_s16_z (p0, z1, 16), ++ z0 = svasr_wide_z (p0, z1, 16)) ++ ++/* ++** asr_wide_s16_x_tied1: ++** asr z0\.h, z0\.h, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (asr_wide_s16_x_tied1, svint16_t, svuint64_t, ++ z0 = svasr_wide_s16_x (p0, z0, z4), ++ z0 = svasr_wide_x (p0, z0, z4)) ++ ++/* ++** asr_wide_s16_x_tied2: ++** asr z0\.h, z4\.h, z0\.d ++** ret ++*/ ++TEST_DUAL_Z_REV (asr_wide_s16_x_tied2, svint16_t, svuint64_t, ++ z0_res = svasr_wide_s16_x (p0, z4, z0), ++ z0_res = svasr_wide_x (p0, z4, z0)) ++ ++/* ++** asr_wide_s16_x_untied: ++** asr z0\.h, z1\.h, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (asr_wide_s16_x_untied, svint16_t, svuint64_t, ++ z0 = svasr_wide_s16_x (p0, z1, z4), ++ z0 = svasr_wide_x (p0, z1, z4)) ++ ++/* ++** asr_wide_x0_s16_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** asr z0\.h, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (asr_wide_x0_s16_x_tied1, svint16_t, uint64_t, ++ z0 = svasr_wide_n_s16_x (p0, z0, x0), ++ z0 = svasr_wide_x (p0, z0, x0)) ++ ++/* ++** asr_wide_x0_s16_x_untied: ++** mov (z[0-9]+\.d), x0 ++** asr z0\.h, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (asr_wide_x0_s16_x_untied, svint16_t, uint64_t, ++ z0 = svasr_wide_n_s16_x (p0, z1, x0), ++ z0 = svasr_wide_x (p0, z1, x0)) ++ ++/* ++** asr_wide_1_s16_x_tied1: ++** asr z0\.h, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_1_s16_x_tied1, svint16_t, ++ z0 = svasr_wide_n_s16_x (p0, z0, 1), ++ z0 = svasr_wide_x (p0, z0, 1)) ++ ++/* ++** asr_wide_1_s16_x_untied: ++** asr z0\.h, z1\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_1_s16_x_untied, svint16_t, ++ z0 = svasr_wide_n_s16_x (p0, z1, 1), ++ z0 = svasr_wide_x (p0, z1, 1)) ++ ++/* ++** asr_wide_15_s16_x_tied1: ++** asr z0\.h, z0\.h, #15 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_15_s16_x_tied1, svint16_t, ++ z0 = svasr_wide_n_s16_x (p0, z0, 15), ++ z0 = svasr_wide_x (p0, z0, 15)) ++ ++/* ++** asr_wide_15_s16_x_untied: ++** asr z0\.h, z1\.h, #15 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_15_s16_x_untied, svint16_t, ++ z0 = 
svasr_wide_n_s16_x (p0, z1, 15), ++ z0 = svasr_wide_x (p0, z1, 15)) ++ ++/* ++** asr_wide_16_s16_x_tied1: ++** asr z0\.h, z0\.h, #16 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_16_s16_x_tied1, svint16_t, ++ z0 = svasr_wide_n_s16_x (p0, z0, 16), ++ z0 = svasr_wide_x (p0, z0, 16)) ++ ++/* ++** asr_wide_16_s16_x_untied: ++** asr z0\.h, z1\.h, #16 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_16_s16_x_untied, svint16_t, ++ z0 = svasr_wide_n_s16_x (p0, z1, 16), ++ z0 = svasr_wide_x (p0, z1, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asr_wide_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asr_wide_s32.c +new file mode 100644 +index 000000000..8698aef26 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asr_wide_s32.c +@@ -0,0 +1,325 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** asr_wide_s32_m_tied1: ++** asr z0\.s, p0/m, z0\.s, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (asr_wide_s32_m_tied1, svint32_t, svuint64_t, ++ z0 = svasr_wide_s32_m (p0, z0, z4), ++ z0 = svasr_wide_m (p0, z0, z4)) ++ ++/* ++** asr_wide_s32_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z4 ++** asr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (asr_wide_s32_m_tied2, svint32_t, svuint64_t, ++ z0_res = svasr_wide_s32_m (p0, z4, z0), ++ z0_res = svasr_wide_m (p0, z4, z0)) ++ ++/* ++** asr_wide_s32_m_untied: ++** movprfx z0, z1 ++** asr z0\.s, p0/m, z0\.s, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (asr_wide_s32_m_untied, svint32_t, svuint64_t, ++ z0 = svasr_wide_s32_m (p0, z1, z4), ++ z0 = svasr_wide_m (p0, z1, z4)) ++ ++/* ++** asr_wide_x0_s32_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** asr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (asr_wide_x0_s32_m_tied1, svint32_t, uint64_t, ++ z0 = svasr_wide_n_s32_m (p0, z0, x0), ++ z0 = svasr_wide_m (p0, z0, x0)) ++ ++/* ++** asr_wide_x0_s32_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** asr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (asr_wide_x0_s32_m_untied, svint32_t, uint64_t, ++ z0 = svasr_wide_n_s32_m (p0, z1, x0), ++ z0 = svasr_wide_m (p0, z1, x0)) ++ ++/* ++** asr_wide_1_s32_m_tied1: ++** asr z0\.s, p0/m, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_1_s32_m_tied1, svint32_t, ++ z0 = svasr_wide_n_s32_m (p0, z0, 1), ++ z0 = svasr_wide_m (p0, z0, 1)) ++ ++/* ++** asr_wide_1_s32_m_untied: ++** movprfx z0, z1 ++** asr z0\.s, p0/m, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_1_s32_m_untied, svint32_t, ++ z0 = svasr_wide_n_s32_m (p0, z1, 1), ++ z0 = svasr_wide_m (p0, z1, 1)) ++ ++/* ++** asr_wide_31_s32_m_tied1: ++** asr z0\.s, p0/m, z0\.s, #31 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_31_s32_m_tied1, svint32_t, ++ z0 = svasr_wide_n_s32_m (p0, z0, 31), ++ z0 = svasr_wide_m (p0, z0, 31)) ++ ++/* ++** asr_wide_31_s32_m_untied: ++** movprfx z0, z1 ++** asr z0\.s, p0/m, z0\.s, #31 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_31_s32_m_untied, svint32_t, ++ z0 = svasr_wide_n_s32_m (p0, z1, 31), ++ z0 = svasr_wide_m (p0, z1, 31)) ++ ++/* ++** asr_wide_32_s32_m_tied1: ++** asr z0\.s, p0/m, z0\.s, #32 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_32_s32_m_tied1, svint32_t, ++ z0 = svasr_wide_n_s32_m (p0, z0, 32), ++ z0 = svasr_wide_m (p0, z0, 32)) ++ ++/* ++** asr_wide_32_s32_m_untied: ++** movprfx z0, z1 ++** asr z0\.s, p0/m, z0\.s, #32 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_32_s32_m_untied, svint32_t, ++ z0 = svasr_wide_n_s32_m (p0, z1, 32), ++ z0 = svasr_wide_m (p0, z1, 32)) ++ ++/* ++** asr_wide_s32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** 
asr z0\.s, p0/m, z0\.s, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (asr_wide_s32_z_tied1, svint32_t, svuint64_t, ++ z0 = svasr_wide_s32_z (p0, z0, z4), ++ z0 = svasr_wide_z (p0, z0, z4)) ++ ++/* ++** asr_wide_s32_z_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.s, p0/z, z4\.s ++** asr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (asr_wide_s32_z_tied2, svint32_t, svuint64_t, ++ z0_res = svasr_wide_s32_z (p0, z4, z0), ++ z0_res = svasr_wide_z (p0, z4, z0)) ++ ++/* ++** asr_wide_s32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** asr z0\.s, p0/m, z0\.s, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (asr_wide_s32_z_untied, svint32_t, svuint64_t, ++ z0 = svasr_wide_s32_z (p0, z1, z4), ++ z0 = svasr_wide_z (p0, z1, z4)) ++ ++/* ++** asr_wide_x0_s32_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.s, p0/z, z0\.s ++** asr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (asr_wide_x0_s32_z_tied1, svint32_t, uint64_t, ++ z0 = svasr_wide_n_s32_z (p0, z0, x0), ++ z0 = svasr_wide_z (p0, z0, x0)) ++ ++/* ++** asr_wide_x0_s32_z_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.s, p0/z, z1\.s ++** asr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (asr_wide_x0_s32_z_untied, svint32_t, uint64_t, ++ z0 = svasr_wide_n_s32_z (p0, z1, x0), ++ z0 = svasr_wide_z (p0, z1, x0)) ++ ++/* ++** asr_wide_1_s32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** asr z0\.s, p0/m, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_1_s32_z_tied1, svint32_t, ++ z0 = svasr_wide_n_s32_z (p0, z0, 1), ++ z0 = svasr_wide_z (p0, z0, 1)) ++ ++/* ++** asr_wide_1_s32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** asr z0\.s, p0/m, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_1_s32_z_untied, svint32_t, ++ z0 = svasr_wide_n_s32_z (p0, z1, 1), ++ z0 = svasr_wide_z (p0, z1, 1)) ++ ++/* ++** asr_wide_31_s32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** asr z0\.s, p0/m, z0\.s, #31 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_31_s32_z_tied1, svint32_t, ++ z0 = svasr_wide_n_s32_z (p0, z0, 31), ++ z0 = svasr_wide_z (p0, z0, 31)) ++ ++/* ++** asr_wide_31_s32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** asr z0\.s, p0/m, z0\.s, #31 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_31_s32_z_untied, svint32_t, ++ z0 = svasr_wide_n_s32_z (p0, z1, 31), ++ z0 = svasr_wide_z (p0, z1, 31)) ++ ++/* ++** asr_wide_32_s32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** asr z0\.s, p0/m, z0\.s, #32 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_32_s32_z_tied1, svint32_t, ++ z0 = svasr_wide_n_s32_z (p0, z0, 32), ++ z0 = svasr_wide_z (p0, z0, 32)) ++ ++/* ++** asr_wide_32_s32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** asr z0\.s, p0/m, z0\.s, #32 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_32_s32_z_untied, svint32_t, ++ z0 = svasr_wide_n_s32_z (p0, z1, 32), ++ z0 = svasr_wide_z (p0, z1, 32)) ++ ++/* ++** asr_wide_s32_x_tied1: ++** asr z0\.s, z0\.s, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (asr_wide_s32_x_tied1, svint32_t, svuint64_t, ++ z0 = svasr_wide_s32_x (p0, z0, z4), ++ z0 = svasr_wide_x (p0, z0, z4)) ++ ++/* ++** asr_wide_s32_x_tied2: ++** asr z0\.s, z4\.s, z0\.d ++** ret ++*/ ++TEST_DUAL_Z_REV (asr_wide_s32_x_tied2, svint32_t, svuint64_t, ++ z0_res = svasr_wide_s32_x (p0, z4, z0), ++ z0_res = svasr_wide_x (p0, z4, z0)) ++ ++/* ++** asr_wide_s32_x_untied: ++** asr z0\.s, z1\.s, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (asr_wide_s32_x_untied, svint32_t, svuint64_t, ++ z0 = svasr_wide_s32_x (p0, z1, z4), ++ z0 = svasr_wide_x (p0, z1, z4)) ++ ++/* ++** asr_wide_x0_s32_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** asr z0\.s, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX 
(asr_wide_x0_s32_x_tied1, svint32_t, uint64_t, ++ z0 = svasr_wide_n_s32_x (p0, z0, x0), ++ z0 = svasr_wide_x (p0, z0, x0)) ++ ++/* ++** asr_wide_x0_s32_x_untied: ++** mov (z[0-9]+\.d), x0 ++** asr z0\.s, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (asr_wide_x0_s32_x_untied, svint32_t, uint64_t, ++ z0 = svasr_wide_n_s32_x (p0, z1, x0), ++ z0 = svasr_wide_x (p0, z1, x0)) ++ ++/* ++** asr_wide_1_s32_x_tied1: ++** asr z0\.s, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_1_s32_x_tied1, svint32_t, ++ z0 = svasr_wide_n_s32_x (p0, z0, 1), ++ z0 = svasr_wide_x (p0, z0, 1)) ++ ++/* ++** asr_wide_1_s32_x_untied: ++** asr z0\.s, z1\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_1_s32_x_untied, svint32_t, ++ z0 = svasr_wide_n_s32_x (p0, z1, 1), ++ z0 = svasr_wide_x (p0, z1, 1)) ++ ++/* ++** asr_wide_31_s32_x_tied1: ++** asr z0\.s, z0\.s, #31 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_31_s32_x_tied1, svint32_t, ++ z0 = svasr_wide_n_s32_x (p0, z0, 31), ++ z0 = svasr_wide_x (p0, z0, 31)) ++ ++/* ++** asr_wide_31_s32_x_untied: ++** asr z0\.s, z1\.s, #31 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_31_s32_x_untied, svint32_t, ++ z0 = svasr_wide_n_s32_x (p0, z1, 31), ++ z0 = svasr_wide_x (p0, z1, 31)) ++ ++/* ++** asr_wide_32_s32_x_tied1: ++** asr z0\.s, z0\.s, #32 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_32_s32_x_tied1, svint32_t, ++ z0 = svasr_wide_n_s32_x (p0, z0, 32), ++ z0 = svasr_wide_x (p0, z0, 32)) ++ ++/* ++** asr_wide_32_s32_x_untied: ++** asr z0\.s, z1\.s, #32 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_32_s32_x_untied, svint32_t, ++ z0 = svasr_wide_n_s32_x (p0, z1, 32), ++ z0 = svasr_wide_x (p0, z1, 32)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asr_wide_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asr_wide_s8.c +new file mode 100644 +index 000000000..77b166939 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asr_wide_s8.c +@@ -0,0 +1,325 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** asr_wide_s8_m_tied1: ++** asr z0\.b, p0/m, z0\.b, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (asr_wide_s8_m_tied1, svint8_t, svuint64_t, ++ z0 = svasr_wide_s8_m (p0, z0, z4), ++ z0 = svasr_wide_m (p0, z0, z4)) ++ ++/* ++** asr_wide_s8_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z4 ++** asr z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (asr_wide_s8_m_tied2, svint8_t, svuint64_t, ++ z0_res = svasr_wide_s8_m (p0, z4, z0), ++ z0_res = svasr_wide_m (p0, z4, z0)) ++ ++/* ++** asr_wide_s8_m_untied: ++** movprfx z0, z1 ++** asr z0\.b, p0/m, z0\.b, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (asr_wide_s8_m_untied, svint8_t, svuint64_t, ++ z0 = svasr_wide_s8_m (p0, z1, z4), ++ z0 = svasr_wide_m (p0, z1, z4)) ++ ++/* ++** asr_wide_x0_s8_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** asr z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (asr_wide_x0_s8_m_tied1, svint8_t, uint64_t, ++ z0 = svasr_wide_n_s8_m (p0, z0, x0), ++ z0 = svasr_wide_m (p0, z0, x0)) ++ ++/* ++** asr_wide_x0_s8_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** asr z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (asr_wide_x0_s8_m_untied, svint8_t, uint64_t, ++ z0 = svasr_wide_n_s8_m (p0, z1, x0), ++ z0 = svasr_wide_m (p0, z1, x0)) ++ ++/* ++** asr_wide_1_s8_m_tied1: ++** asr z0\.b, p0/m, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_1_s8_m_tied1, svint8_t, ++ z0 = svasr_wide_n_s8_m (p0, z0, 1), ++ z0 = svasr_wide_m (p0, z0, 1)) ++ ++/* ++** asr_wide_1_s8_m_untied: ++** movprfx z0, z1 ++** asr z0\.b, p0/m, z0\.b, #1 ++** ret ++*/ 
++TEST_UNIFORM_Z (asr_wide_1_s8_m_untied, svint8_t, ++ z0 = svasr_wide_n_s8_m (p0, z1, 1), ++ z0 = svasr_wide_m (p0, z1, 1)) ++ ++/* ++** asr_wide_7_s8_m_tied1: ++** asr z0\.b, p0/m, z0\.b, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_7_s8_m_tied1, svint8_t, ++ z0 = svasr_wide_n_s8_m (p0, z0, 7), ++ z0 = svasr_wide_m (p0, z0, 7)) ++ ++/* ++** asr_wide_7_s8_m_untied: ++** movprfx z0, z1 ++** asr z0\.b, p0/m, z0\.b, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_7_s8_m_untied, svint8_t, ++ z0 = svasr_wide_n_s8_m (p0, z1, 7), ++ z0 = svasr_wide_m (p0, z1, 7)) ++ ++/* ++** asr_wide_8_s8_m_tied1: ++** asr z0\.b, p0/m, z0\.b, #8 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_8_s8_m_tied1, svint8_t, ++ z0 = svasr_wide_n_s8_m (p0, z0, 8), ++ z0 = svasr_wide_m (p0, z0, 8)) ++ ++/* ++** asr_wide_8_s8_m_untied: ++** movprfx z0, z1 ++** asr z0\.b, p0/m, z0\.b, #8 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_8_s8_m_untied, svint8_t, ++ z0 = svasr_wide_n_s8_m (p0, z1, 8), ++ z0 = svasr_wide_m (p0, z1, 8)) ++ ++/* ++** asr_wide_s8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** asr z0\.b, p0/m, z0\.b, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (asr_wide_s8_z_tied1, svint8_t, svuint64_t, ++ z0 = svasr_wide_s8_z (p0, z0, z4), ++ z0 = svasr_wide_z (p0, z0, z4)) ++ ++/* ++** asr_wide_s8_z_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.b, p0/z, z4\.b ++** asr z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (asr_wide_s8_z_tied2, svint8_t, svuint64_t, ++ z0_res = svasr_wide_s8_z (p0, z4, z0), ++ z0_res = svasr_wide_z (p0, z4, z0)) ++ ++/* ++** asr_wide_s8_z_untied: ++** movprfx z0\.b, p0/z, z1\.b ++** asr z0\.b, p0/m, z0\.b, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (asr_wide_s8_z_untied, svint8_t, svuint64_t, ++ z0 = svasr_wide_s8_z (p0, z1, z4), ++ z0 = svasr_wide_z (p0, z1, z4)) ++ ++/* ++** asr_wide_x0_s8_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.b, p0/z, z0\.b ++** asr z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (asr_wide_x0_s8_z_tied1, svint8_t, uint64_t, ++ z0 = svasr_wide_n_s8_z (p0, z0, x0), ++ z0 = svasr_wide_z (p0, z0, x0)) ++ ++/* ++** asr_wide_x0_s8_z_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.b, p0/z, z1\.b ++** asr z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (asr_wide_x0_s8_z_untied, svint8_t, uint64_t, ++ z0 = svasr_wide_n_s8_z (p0, z1, x0), ++ z0 = svasr_wide_z (p0, z1, x0)) ++ ++/* ++** asr_wide_1_s8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** asr z0\.b, p0/m, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_1_s8_z_tied1, svint8_t, ++ z0 = svasr_wide_n_s8_z (p0, z0, 1), ++ z0 = svasr_wide_z (p0, z0, 1)) ++ ++/* ++** asr_wide_1_s8_z_untied: ++** movprfx z0\.b, p0/z, z1\.b ++** asr z0\.b, p0/m, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_1_s8_z_untied, svint8_t, ++ z0 = svasr_wide_n_s8_z (p0, z1, 1), ++ z0 = svasr_wide_z (p0, z1, 1)) ++ ++/* ++** asr_wide_7_s8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** asr z0\.b, p0/m, z0\.b, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_7_s8_z_tied1, svint8_t, ++ z0 = svasr_wide_n_s8_z (p0, z0, 7), ++ z0 = svasr_wide_z (p0, z0, 7)) ++ ++/* ++** asr_wide_7_s8_z_untied: ++** movprfx z0\.b, p0/z, z1\.b ++** asr z0\.b, p0/m, z0\.b, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_7_s8_z_untied, svint8_t, ++ z0 = svasr_wide_n_s8_z (p0, z1, 7), ++ z0 = svasr_wide_z (p0, z1, 7)) ++ ++/* ++** asr_wide_8_s8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** asr z0\.b, p0/m, z0\.b, #8 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_8_s8_z_tied1, svint8_t, ++ z0 = svasr_wide_n_s8_z (p0, z0, 8), ++ z0 = svasr_wide_z (p0, z0, 8)) ++ ++/* ++** 
asr_wide_8_s8_z_untied: ++** movprfx z0\.b, p0/z, z1\.b ++** asr z0\.b, p0/m, z0\.b, #8 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_8_s8_z_untied, svint8_t, ++ z0 = svasr_wide_n_s8_z (p0, z1, 8), ++ z0 = svasr_wide_z (p0, z1, 8)) ++ ++/* ++** asr_wide_s8_x_tied1: ++** asr z0\.b, z0\.b, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (asr_wide_s8_x_tied1, svint8_t, svuint64_t, ++ z0 = svasr_wide_s8_x (p0, z0, z4), ++ z0 = svasr_wide_x (p0, z0, z4)) ++ ++/* ++** asr_wide_s8_x_tied2: ++** asr z0\.b, z4\.b, z0\.d ++** ret ++*/ ++TEST_DUAL_Z_REV (asr_wide_s8_x_tied2, svint8_t, svuint64_t, ++ z0_res = svasr_wide_s8_x (p0, z4, z0), ++ z0_res = svasr_wide_x (p0, z4, z0)) ++ ++/* ++** asr_wide_s8_x_untied: ++** asr z0\.b, z1\.b, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (asr_wide_s8_x_untied, svint8_t, svuint64_t, ++ z0 = svasr_wide_s8_x (p0, z1, z4), ++ z0 = svasr_wide_x (p0, z1, z4)) ++ ++/* ++** asr_wide_x0_s8_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** asr z0\.b, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (asr_wide_x0_s8_x_tied1, svint8_t, uint64_t, ++ z0 = svasr_wide_n_s8_x (p0, z0, x0), ++ z0 = svasr_wide_x (p0, z0, x0)) ++ ++/* ++** asr_wide_x0_s8_x_untied: ++** mov (z[0-9]+\.d), x0 ++** asr z0\.b, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (asr_wide_x0_s8_x_untied, svint8_t, uint64_t, ++ z0 = svasr_wide_n_s8_x (p0, z1, x0), ++ z0 = svasr_wide_x (p0, z1, x0)) ++ ++/* ++** asr_wide_1_s8_x_tied1: ++** asr z0\.b, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_1_s8_x_tied1, svint8_t, ++ z0 = svasr_wide_n_s8_x (p0, z0, 1), ++ z0 = svasr_wide_x (p0, z0, 1)) ++ ++/* ++** asr_wide_1_s8_x_untied: ++** asr z0\.b, z1\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_1_s8_x_untied, svint8_t, ++ z0 = svasr_wide_n_s8_x (p0, z1, 1), ++ z0 = svasr_wide_x (p0, z1, 1)) ++ ++/* ++** asr_wide_7_s8_x_tied1: ++** asr z0\.b, z0\.b, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_7_s8_x_tied1, svint8_t, ++ z0 = svasr_wide_n_s8_x (p0, z0, 7), ++ z0 = svasr_wide_x (p0, z0, 7)) ++ ++/* ++** asr_wide_7_s8_x_untied: ++** asr z0\.b, z1\.b, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_7_s8_x_untied, svint8_t, ++ z0 = svasr_wide_n_s8_x (p0, z1, 7), ++ z0 = svasr_wide_x (p0, z1, 7)) ++ ++/* ++** asr_wide_8_s8_x_tied1: ++** asr z0\.b, z0\.b, #8 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_8_s8_x_tied1, svint8_t, ++ z0 = svasr_wide_n_s8_x (p0, z0, 8), ++ z0 = svasr_wide_x (p0, z0, 8)) ++ ++/* ++** asr_wide_8_s8_x_untied: ++** asr z0\.b, z1\.b, #8 ++** ret ++*/ ++TEST_UNIFORM_Z (asr_wide_8_s8_x_untied, svint8_t, ++ z0 = svasr_wide_n_s8_x (p0, z1, 8), ++ z0 = svasr_wide_x (p0, z1, 8)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asrd_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asrd_s16.c +new file mode 100644 +index 000000000..40bbce042 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asrd_s16.c +@@ -0,0 +1,177 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** asrd_1_s16_m_tied1: ++** asrd z0\.h, p0/m, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_1_s16_m_tied1, svint16_t, ++ z0 = svasrd_n_s16_m (p0, z0, 1), ++ z0 = svasrd_m (p0, z0, 1)) ++ ++/* ++** asrd_1_s16_m_untied: ++** movprfx z0, z1 ++** asrd z0\.h, p0/m, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_1_s16_m_untied, svint16_t, ++ z0 = svasrd_n_s16_m (p0, z1, 1), ++ z0 = svasrd_m (p0, z1, 1)) ++ ++/* ++** asrd_2_s16_m_tied1: ++** asrd z0\.h, p0/m, z0\.h, #2 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_2_s16_m_tied1, svint16_t, ++ z0 = svasrd_n_s16_m (p0, z0, 2), ++ z0 = svasrd_m (p0, z0, 2)) ++ ++/* ++** 
asrd_2_s16_m_untied: ++** movprfx z0, z1 ++** asrd z0\.h, p0/m, z0\.h, #2 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_2_s16_m_untied, svint16_t, ++ z0 = svasrd_n_s16_m (p0, z1, 2), ++ z0 = svasrd_m (p0, z1, 2)) ++ ++/* ++** asrd_16_s16_m_tied1: ++** asrd z0\.h, p0/m, z0\.h, #16 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_16_s16_m_tied1, svint16_t, ++ z0 = svasrd_n_s16_m (p0, z0, 16), ++ z0 = svasrd_m (p0, z0, 16)) ++ ++/* ++** asrd_16_s16_m_untied: ++** movprfx z0, z1 ++** asrd z0\.h, p0/m, z0\.h, #16 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_16_s16_m_untied, svint16_t, ++ z0 = svasrd_n_s16_m (p0, z1, 16), ++ z0 = svasrd_m (p0, z1, 16)) ++ ++/* ++** asrd_1_s16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** asrd z0\.h, p0/m, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_1_s16_z_tied1, svint16_t, ++ z0 = svasrd_n_s16_z (p0, z0, 1), ++ z0 = svasrd_z (p0, z0, 1)) ++ ++/* ++** asrd_1_s16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** asrd z0\.h, p0/m, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_1_s16_z_untied, svint16_t, ++ z0 = svasrd_n_s16_z (p0, z1, 1), ++ z0 = svasrd_z (p0, z1, 1)) ++ ++/* ++** asrd_2_s16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** asrd z0\.h, p0/m, z0\.h, #2 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_2_s16_z_tied1, svint16_t, ++ z0 = svasrd_n_s16_z (p0, z0, 2), ++ z0 = svasrd_z (p0, z0, 2)) ++ ++/* ++** asrd_2_s16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** asrd z0\.h, p0/m, z0\.h, #2 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_2_s16_z_untied, svint16_t, ++ z0 = svasrd_n_s16_z (p0, z1, 2), ++ z0 = svasrd_z (p0, z1, 2)) ++ ++/* ++** asrd_16_s16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** asrd z0\.h, p0/m, z0\.h, #16 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_16_s16_z_tied1, svint16_t, ++ z0 = svasrd_n_s16_z (p0, z0, 16), ++ z0 = svasrd_z (p0, z0, 16)) ++ ++/* ++** asrd_16_s16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** asrd z0\.h, p0/m, z0\.h, #16 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_16_s16_z_untied, svint16_t, ++ z0 = svasrd_n_s16_z (p0, z1, 16), ++ z0 = svasrd_z (p0, z1, 16)) ++ ++/* ++** asrd_1_s16_x_tied1: ++** asrd z0\.h, p0/m, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_1_s16_x_tied1, svint16_t, ++ z0 = svasrd_n_s16_x (p0, z0, 1), ++ z0 = svasrd_x (p0, z0, 1)) ++ ++/* ++** asrd_1_s16_x_untied: ++** movprfx z0, z1 ++** asrd z0\.h, p0/m, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_1_s16_x_untied, svint16_t, ++ z0 = svasrd_n_s16_x (p0, z1, 1), ++ z0 = svasrd_x (p0, z1, 1)) ++ ++/* ++** asrd_2_s16_x_tied1: ++** asrd z0\.h, p0/m, z0\.h, #2 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_2_s16_x_tied1, svint16_t, ++ z0 = svasrd_n_s16_x (p0, z0, 2), ++ z0 = svasrd_x (p0, z0, 2)) ++ ++/* ++** asrd_2_s16_x_untied: ++** movprfx z0, z1 ++** asrd z0\.h, p0/m, z0\.h, #2 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_2_s16_x_untied, svint16_t, ++ z0 = svasrd_n_s16_x (p0, z1, 2), ++ z0 = svasrd_x (p0, z1, 2)) ++ ++/* ++** asrd_16_s16_x_tied1: ++** asrd z0\.h, p0/m, z0\.h, #16 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_16_s16_x_tied1, svint16_t, ++ z0 = svasrd_n_s16_x (p0, z0, 16), ++ z0 = svasrd_x (p0, z0, 16)) ++ ++/* ++** asrd_16_s16_x_untied: ++** movprfx z0, z1 ++** asrd z0\.h, p0/m, z0\.h, #16 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_16_s16_x_untied, svint16_t, ++ z0 = svasrd_n_s16_x (p0, z1, 16), ++ z0 = svasrd_x (p0, z1, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asrd_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asrd_s32.c +new file mode 100644 +index 000000000..0760b03de +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asrd_s32.c +@@ -0,0 +1,177 @@ ++/* { dg-final { 
check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** asrd_1_s32_m_tied1: ++** asrd z0\.s, p0/m, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_1_s32_m_tied1, svint32_t, ++ z0 = svasrd_n_s32_m (p0, z0, 1), ++ z0 = svasrd_m (p0, z0, 1)) ++ ++/* ++** asrd_1_s32_m_untied: ++** movprfx z0, z1 ++** asrd z0\.s, p0/m, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_1_s32_m_untied, svint32_t, ++ z0 = svasrd_n_s32_m (p0, z1, 1), ++ z0 = svasrd_m (p0, z1, 1)) ++ ++/* ++** asrd_2_s32_m_tied1: ++** asrd z0\.s, p0/m, z0\.s, #2 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_2_s32_m_tied1, svint32_t, ++ z0 = svasrd_n_s32_m (p0, z0, 2), ++ z0 = svasrd_m (p0, z0, 2)) ++ ++/* ++** asrd_2_s32_m_untied: ++** movprfx z0, z1 ++** asrd z0\.s, p0/m, z0\.s, #2 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_2_s32_m_untied, svint32_t, ++ z0 = svasrd_n_s32_m (p0, z1, 2), ++ z0 = svasrd_m (p0, z1, 2)) ++ ++/* ++** asrd_32_s32_m_tied1: ++** asrd z0\.s, p0/m, z0\.s, #32 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_32_s32_m_tied1, svint32_t, ++ z0 = svasrd_n_s32_m (p0, z0, 32), ++ z0 = svasrd_m (p0, z0, 32)) ++ ++/* ++** asrd_32_s32_m_untied: ++** movprfx z0, z1 ++** asrd z0\.s, p0/m, z0\.s, #32 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_32_s32_m_untied, svint32_t, ++ z0 = svasrd_n_s32_m (p0, z1, 32), ++ z0 = svasrd_m (p0, z1, 32)) ++ ++/* ++** asrd_1_s32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** asrd z0\.s, p0/m, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_1_s32_z_tied1, svint32_t, ++ z0 = svasrd_n_s32_z (p0, z0, 1), ++ z0 = svasrd_z (p0, z0, 1)) ++ ++/* ++** asrd_1_s32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** asrd z0\.s, p0/m, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_1_s32_z_untied, svint32_t, ++ z0 = svasrd_n_s32_z (p0, z1, 1), ++ z0 = svasrd_z (p0, z1, 1)) ++ ++/* ++** asrd_2_s32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** asrd z0\.s, p0/m, z0\.s, #2 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_2_s32_z_tied1, svint32_t, ++ z0 = svasrd_n_s32_z (p0, z0, 2), ++ z0 = svasrd_z (p0, z0, 2)) ++ ++/* ++** asrd_2_s32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** asrd z0\.s, p0/m, z0\.s, #2 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_2_s32_z_untied, svint32_t, ++ z0 = svasrd_n_s32_z (p0, z1, 2), ++ z0 = svasrd_z (p0, z1, 2)) ++ ++/* ++** asrd_32_s32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** asrd z0\.s, p0/m, z0\.s, #32 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_32_s32_z_tied1, svint32_t, ++ z0 = svasrd_n_s32_z (p0, z0, 32), ++ z0 = svasrd_z (p0, z0, 32)) ++ ++/* ++** asrd_32_s32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** asrd z0\.s, p0/m, z0\.s, #32 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_32_s32_z_untied, svint32_t, ++ z0 = svasrd_n_s32_z (p0, z1, 32), ++ z0 = svasrd_z (p0, z1, 32)) ++ ++/* ++** asrd_1_s32_x_tied1: ++** asrd z0\.s, p0/m, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_1_s32_x_tied1, svint32_t, ++ z0 = svasrd_n_s32_x (p0, z0, 1), ++ z0 = svasrd_x (p0, z0, 1)) ++ ++/* ++** asrd_1_s32_x_untied: ++** movprfx z0, z1 ++** asrd z0\.s, p0/m, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_1_s32_x_untied, svint32_t, ++ z0 = svasrd_n_s32_x (p0, z1, 1), ++ z0 = svasrd_x (p0, z1, 1)) ++ ++/* ++** asrd_2_s32_x_tied1: ++** asrd z0\.s, p0/m, z0\.s, #2 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_2_s32_x_tied1, svint32_t, ++ z0 = svasrd_n_s32_x (p0, z0, 2), ++ z0 = svasrd_x (p0, z0, 2)) ++ ++/* ++** asrd_2_s32_x_untied: ++** movprfx z0, z1 ++** asrd z0\.s, p0/m, z0\.s, #2 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_2_s32_x_untied, svint32_t, ++ z0 = svasrd_n_s32_x (p0, z1, 2), ++ z0 = svasrd_x (p0, z1, 2)) ++ ++/* ++** 
asrd_32_s32_x_tied1: ++** asrd z0\.s, p0/m, z0\.s, #32 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_32_s32_x_tied1, svint32_t, ++ z0 = svasrd_n_s32_x (p0, z0, 32), ++ z0 = svasrd_x (p0, z0, 32)) ++ ++/* ++** asrd_32_s32_x_untied: ++** movprfx z0, z1 ++** asrd z0\.s, p0/m, z0\.s, #32 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_32_s32_x_untied, svint32_t, ++ z0 = svasrd_n_s32_x (p0, z1, 32), ++ z0 = svasrd_x (p0, z1, 32)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asrd_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asrd_s64.c +new file mode 100644 +index 000000000..0ef26c9fe +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asrd_s64.c +@@ -0,0 +1,177 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** asrd_1_s64_m_tied1: ++** asrd z0\.d, p0/m, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_1_s64_m_tied1, svint64_t, ++ z0 = svasrd_n_s64_m (p0, z0, 1), ++ z0 = svasrd_m (p0, z0, 1)) ++ ++/* ++** asrd_1_s64_m_untied: ++** movprfx z0, z1 ++** asrd z0\.d, p0/m, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_1_s64_m_untied, svint64_t, ++ z0 = svasrd_n_s64_m (p0, z1, 1), ++ z0 = svasrd_m (p0, z1, 1)) ++ ++/* ++** asrd_2_s64_m_tied1: ++** asrd z0\.d, p0/m, z0\.d, #2 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_2_s64_m_tied1, svint64_t, ++ z0 = svasrd_n_s64_m (p0, z0, 2), ++ z0 = svasrd_m (p0, z0, 2)) ++ ++/* ++** asrd_2_s64_m_untied: ++** movprfx z0, z1 ++** asrd z0\.d, p0/m, z0\.d, #2 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_2_s64_m_untied, svint64_t, ++ z0 = svasrd_n_s64_m (p0, z1, 2), ++ z0 = svasrd_m (p0, z1, 2)) ++ ++/* ++** asrd_64_s64_m_tied1: ++** asrd z0\.d, p0/m, z0\.d, #64 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_64_s64_m_tied1, svint64_t, ++ z0 = svasrd_n_s64_m (p0, z0, 64), ++ z0 = svasrd_m (p0, z0, 64)) ++ ++/* ++** asrd_64_s64_m_untied: ++** movprfx z0, z1 ++** asrd z0\.d, p0/m, z0\.d, #64 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_64_s64_m_untied, svint64_t, ++ z0 = svasrd_n_s64_m (p0, z1, 64), ++ z0 = svasrd_m (p0, z1, 64)) ++ ++/* ++** asrd_1_s64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** asrd z0\.d, p0/m, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_1_s64_z_tied1, svint64_t, ++ z0 = svasrd_n_s64_z (p0, z0, 1), ++ z0 = svasrd_z (p0, z0, 1)) ++ ++/* ++** asrd_1_s64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** asrd z0\.d, p0/m, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_1_s64_z_untied, svint64_t, ++ z0 = svasrd_n_s64_z (p0, z1, 1), ++ z0 = svasrd_z (p0, z1, 1)) ++ ++/* ++** asrd_2_s64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** asrd z0\.d, p0/m, z0\.d, #2 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_2_s64_z_tied1, svint64_t, ++ z0 = svasrd_n_s64_z (p0, z0, 2), ++ z0 = svasrd_z (p0, z0, 2)) ++ ++/* ++** asrd_2_s64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** asrd z0\.d, p0/m, z0\.d, #2 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_2_s64_z_untied, svint64_t, ++ z0 = svasrd_n_s64_z (p0, z1, 2), ++ z0 = svasrd_z (p0, z1, 2)) ++ ++/* ++** asrd_64_s64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** asrd z0\.d, p0/m, z0\.d, #64 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_64_s64_z_tied1, svint64_t, ++ z0 = svasrd_n_s64_z (p0, z0, 64), ++ z0 = svasrd_z (p0, z0, 64)) ++ ++/* ++** asrd_64_s64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** asrd z0\.d, p0/m, z0\.d, #64 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_64_s64_z_untied, svint64_t, ++ z0 = svasrd_n_s64_z (p0, z1, 64), ++ z0 = svasrd_z (p0, z1, 64)) ++ ++/* ++** asrd_1_s64_x_tied1: ++** asrd z0\.d, p0/m, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_1_s64_x_tied1, svint64_t, ++ z0 = 
svasrd_n_s64_x (p0, z0, 1), ++ z0 = svasrd_x (p0, z0, 1)) ++ ++/* ++** asrd_1_s64_x_untied: ++** movprfx z0, z1 ++** asrd z0\.d, p0/m, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_1_s64_x_untied, svint64_t, ++ z0 = svasrd_n_s64_x (p0, z1, 1), ++ z0 = svasrd_x (p0, z1, 1)) ++ ++/* ++** asrd_2_s64_x_tied1: ++** asrd z0\.d, p0/m, z0\.d, #2 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_2_s64_x_tied1, svint64_t, ++ z0 = svasrd_n_s64_x (p0, z0, 2), ++ z0 = svasrd_x (p0, z0, 2)) ++ ++/* ++** asrd_2_s64_x_untied: ++** movprfx z0, z1 ++** asrd z0\.d, p0/m, z0\.d, #2 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_2_s64_x_untied, svint64_t, ++ z0 = svasrd_n_s64_x (p0, z1, 2), ++ z0 = svasrd_x (p0, z1, 2)) ++ ++/* ++** asrd_64_s64_x_tied1: ++** asrd z0\.d, p0/m, z0\.d, #64 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_64_s64_x_tied1, svint64_t, ++ z0 = svasrd_n_s64_x (p0, z0, 64), ++ z0 = svasrd_x (p0, z0, 64)) ++ ++/* ++** asrd_64_s64_x_untied: ++** movprfx z0, z1 ++** asrd z0\.d, p0/m, z0\.d, #64 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_64_s64_x_untied, svint64_t, ++ z0 = svasrd_n_s64_x (p0, z1, 64), ++ z0 = svasrd_x (p0, z1, 64)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asrd_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asrd_s8.c +new file mode 100644 +index 000000000..9249ffbcb +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asrd_s8.c +@@ -0,0 +1,177 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** asrd_1_s8_m_tied1: ++** asrd z0\.b, p0/m, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_1_s8_m_tied1, svint8_t, ++ z0 = svasrd_n_s8_m (p0, z0, 1), ++ z0 = svasrd_m (p0, z0, 1)) ++ ++/* ++** asrd_1_s8_m_untied: ++** movprfx z0, z1 ++** asrd z0\.b, p0/m, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_1_s8_m_untied, svint8_t, ++ z0 = svasrd_n_s8_m (p0, z1, 1), ++ z0 = svasrd_m (p0, z1, 1)) ++ ++/* ++** asrd_2_s8_m_tied1: ++** asrd z0\.b, p0/m, z0\.b, #2 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_2_s8_m_tied1, svint8_t, ++ z0 = svasrd_n_s8_m (p0, z0, 2), ++ z0 = svasrd_m (p0, z0, 2)) ++ ++/* ++** asrd_2_s8_m_untied: ++** movprfx z0, z1 ++** asrd z0\.b, p0/m, z0\.b, #2 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_2_s8_m_untied, svint8_t, ++ z0 = svasrd_n_s8_m (p0, z1, 2), ++ z0 = svasrd_m (p0, z1, 2)) ++ ++/* ++** asrd_8_s8_m_tied1: ++** asrd z0\.b, p0/m, z0\.b, #8 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_8_s8_m_tied1, svint8_t, ++ z0 = svasrd_n_s8_m (p0, z0, 8), ++ z0 = svasrd_m (p0, z0, 8)) ++ ++/* ++** asrd_8_s8_m_untied: ++** movprfx z0, z1 ++** asrd z0\.b, p0/m, z0\.b, #8 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_8_s8_m_untied, svint8_t, ++ z0 = svasrd_n_s8_m (p0, z1, 8), ++ z0 = svasrd_m (p0, z1, 8)) ++ ++/* ++** asrd_1_s8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** asrd z0\.b, p0/m, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_1_s8_z_tied1, svint8_t, ++ z0 = svasrd_n_s8_z (p0, z0, 1), ++ z0 = svasrd_z (p0, z0, 1)) ++ ++/* ++** asrd_1_s8_z_untied: ++** movprfx z0\.b, p0/z, z1\.b ++** asrd z0\.b, p0/m, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_1_s8_z_untied, svint8_t, ++ z0 = svasrd_n_s8_z (p0, z1, 1), ++ z0 = svasrd_z (p0, z1, 1)) ++ ++/* ++** asrd_2_s8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** asrd z0\.b, p0/m, z0\.b, #2 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_2_s8_z_tied1, svint8_t, ++ z0 = svasrd_n_s8_z (p0, z0, 2), ++ z0 = svasrd_z (p0, z0, 2)) ++ ++/* ++** asrd_2_s8_z_untied: ++** movprfx z0\.b, p0/z, z1\.b ++** asrd z0\.b, p0/m, z0\.b, #2 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_2_s8_z_untied, svint8_t, ++ z0 = svasrd_n_s8_z (p0, 
z1, 2), ++ z0 = svasrd_z (p0, z1, 2)) ++ ++/* ++** asrd_8_s8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** asrd z0\.b, p0/m, z0\.b, #8 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_8_s8_z_tied1, svint8_t, ++ z0 = svasrd_n_s8_z (p0, z0, 8), ++ z0 = svasrd_z (p0, z0, 8)) ++ ++/* ++** asrd_8_s8_z_untied: ++** movprfx z0\.b, p0/z, z1\.b ++** asrd z0\.b, p0/m, z0\.b, #8 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_8_s8_z_untied, svint8_t, ++ z0 = svasrd_n_s8_z (p0, z1, 8), ++ z0 = svasrd_z (p0, z1, 8)) ++ ++/* ++** asrd_1_s8_x_tied1: ++** asrd z0\.b, p0/m, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_1_s8_x_tied1, svint8_t, ++ z0 = svasrd_n_s8_x (p0, z0, 1), ++ z0 = svasrd_x (p0, z0, 1)) ++ ++/* ++** asrd_1_s8_x_untied: ++** movprfx z0, z1 ++** asrd z0\.b, p0/m, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_1_s8_x_untied, svint8_t, ++ z0 = svasrd_n_s8_x (p0, z1, 1), ++ z0 = svasrd_x (p0, z1, 1)) ++ ++/* ++** asrd_2_s8_x_tied1: ++** asrd z0\.b, p0/m, z0\.b, #2 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_2_s8_x_tied1, svint8_t, ++ z0 = svasrd_n_s8_x (p0, z0, 2), ++ z0 = svasrd_x (p0, z0, 2)) ++ ++/* ++** asrd_2_s8_x_untied: ++** movprfx z0, z1 ++** asrd z0\.b, p0/m, z0\.b, #2 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_2_s8_x_untied, svint8_t, ++ z0 = svasrd_n_s8_x (p0, z1, 2), ++ z0 = svasrd_x (p0, z1, 2)) ++ ++/* ++** asrd_8_s8_x_tied1: ++** asrd z0\.b, p0/m, z0\.b, #8 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_8_s8_x_tied1, svint8_t, ++ z0 = svasrd_n_s8_x (p0, z0, 8), ++ z0 = svasrd_x (p0, z0, 8)) ++ ++/* ++** asrd_8_s8_x_untied: ++** movprfx z0, z1 ++** asrd z0\.b, p0/m, z0\.b, #8 ++** ret ++*/ ++TEST_UNIFORM_Z (asrd_8_s8_x_untied, svint8_t, ++ z0 = svasrd_n_s8_x (p0, z1, 8), ++ z0 = svasrd_x (p0, z1, 8)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bfdot_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bfdot_f32.c +new file mode 100644 +index 000000000..376622da0 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bfdot_f32.c +@@ -0,0 +1,67 @@ ++/* { dg-additional-options "-march=armv8.2-a+sve+bf16" } */ ++/* { dg-require-effective-target aarch64_asm_bf16_ok } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** bfdot_f32_tied1: ++** bfdot z0\.s, z4\.h, z5\.h ++** ret ++*/ ++TEST_DUAL_Z (bfdot_f32_tied1, svfloat32_t, svbfloat16_t, ++ z0 = svbfdot_f32 (z0, z4, z5), ++ z0 = svbfdot (z0, z4, z5)) ++ ++/* ++** bfdot_f32_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** bfdot z0\.s, \1\.h, z1\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (bfdot_f32_tied2, svfloat32_t, svbfloat16_t, ++ z0_res = svbfdot_f32 (z4, z0, z1), ++ z0_res = svbfdot (z4, z0, z1)) ++ ++/* ++** bfdot_f32_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** bfdot z0\.s, z1\.h, \1\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (bfdot_f32_tied3, svfloat32_t, svbfloat16_t, ++ z0_res = svbfdot_f32 (z4, z1, z0), ++ z0_res = svbfdot (z4, z1, z0)) ++ ++/* ++** bfdot_f32_untied: ++** movprfx z0, z1 ++** bfdot z0\.s, z4\.h, z5\.h ++** ret ++*/ ++TEST_DUAL_Z (bfdot_f32_untied, svfloat32_t, svbfloat16_t, ++ z0 = svbfdot_f32 (z1, z4, z5), ++ z0 = svbfdot (z1, z4, z5)) ++ ++/* ++** bfdot_h7_f32_tied1: ++** mov (z[0-9]+\.h), h7 ++** bfdot z0\.s, z4\.h, \1 ++** ret ++*/ ++TEST_DUAL_ZD (bfdot_h7_f32_tied1, svfloat32_t, svbfloat16_t, bfloat16_t, ++ z0 = svbfdot_n_f32 (z0, z4, d7), ++ z0 = svbfdot (z0, z4, d7)) ++ ++/* ++** bfdot_h7_f32_untied: ++** mov (z[0-9]+\.h), h7 ++** movprfx z0, z1 ++** bfdot z0\.s, z4\.h, \1 ++** ret ++*/ ++TEST_DUAL_ZD (bfdot_h7_f32_untied, svfloat32_t, 
svbfloat16_t, bfloat16_t, ++ z0 = svbfdot_n_f32 (z1, z4, d7), ++ z0 = svbfdot (z1, z4, d7)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bfdot_lane_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bfdot_lane_f32.c +new file mode 100644 +index 000000000..0f624fe9f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bfdot_lane_f32.c +@@ -0,0 +1,86 @@ ++/* { dg-additional-options "-march=armv8.2-a+sve+bf16" } */ ++/* { dg-require-effective-target aarch64_asm_bf16_ok } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** bfdot_lane_0_f32_tied1: ++** bfdot z0\.s, z4\.h, z5\.h\[0\] ++** ret ++*/ ++TEST_DUAL_Z (bfdot_lane_0_f32_tied1, svfloat32_t, svbfloat16_t, ++ z0 = svbfdot_lane_f32 (z0, z4, z5, 0), ++ z0 = svbfdot_lane (z0, z4, z5, 0)) ++ ++/* ++** bfdot_lane_0_f32_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** bfdot z0\.s, \1\.h, z1\.h\[0\] ++** ret ++*/ ++TEST_DUAL_Z_REV (bfdot_lane_0_f32_tied2, svfloat32_t, svbfloat16_t, ++ z0_res = svbfdot_lane_f32 (z4, z0, z1, 0), ++ z0_res = svbfdot_lane (z4, z0, z1, 0)) ++ ++/* ++** bfdot_lane_0_f32_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** bfdot z0\.s, z1\.h, \1\.h\[0\] ++** ret ++*/ ++TEST_DUAL_Z_REV (bfdot_lane_0_f32_tied3, svfloat32_t, svbfloat16_t, ++ z0_res = svbfdot_lane_f32 (z4, z1, z0, 0), ++ z0_res = svbfdot_lane (z4, z1, z0, 0)) ++ ++/* ++** bfdot_lane_0_f32_untied: ++** movprfx z0, z1 ++** bfdot z0\.s, z4\.h, z5\.h\[0\] ++** ret ++*/ ++TEST_DUAL_Z (bfdot_lane_0_f32_untied, svfloat32_t, svbfloat16_t, ++ z0 = svbfdot_lane_f32 (z1, z4, z5, 0), ++ z0 = svbfdot_lane (z1, z4, z5, 0)) ++ ++/* ++** bfdot_lane_1_f32: ++** bfdot z0\.s, z4\.h, z5\.h\[1\] ++** ret ++*/ ++TEST_DUAL_Z (bfdot_lane_1_f32, svfloat32_t, svbfloat16_t, ++ z0 = svbfdot_lane_f32 (z0, z4, z5, 1), ++ z0 = svbfdot_lane (z0, z4, z5, 1)) ++ ++/* ++** bfdot_lane_3_f32: ++** bfdot z0\.s, z4\.h, z5\.h\[3\] ++** ret ++*/ ++TEST_DUAL_Z (bfdot_lane_3_f32, svfloat32_t, svbfloat16_t, ++ z0 = svbfdot_lane_f32 (z0, z4, z5, 3), ++ z0 = svbfdot_lane (z0, z4, z5, 3)) ++ ++/* ++** bfdot_lane_z8_f32: ++** str d8, \[sp, -16\]! 
++** mov (z[0-7])\.d, z8\.d ++** bfdot z0\.s, z1\.h, \1\.h\[1\] ++** ldr d8, \[sp\], 16 ++** ret ++*/ ++TEST_DUAL_LANE_REG (bfdot_lane_z8_f32, svfloat32_t, svbfloat16_t, z8, ++ z0 = svbfdot_lane_f32 (z0, z1, z8, 1), ++ z0 = svbfdot_lane (z0, z1, z8, 1)) ++ ++/* ++** bfdot_lane_z16_f32: ++** mov (z[0-7])\.d, z16\.d ++** bfdot z0\.s, z1\.h, \1\.h\[1\] ++** ret ++*/ ++TEST_DUAL_LANE_REG (bfdot_lane_z16_f32, svfloat32_t, svbfloat16_t, z16, ++ z0 = svbfdot_lane_f32 (z0, z1, z16, 1), ++ z0 = svbfdot_lane (z0, z1, z16, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bfmlalb_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bfmlalb_f32.c +new file mode 100644 +index 000000000..0f810116c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bfmlalb_f32.c +@@ -0,0 +1,67 @@ ++/* { dg-additional-options "-march=armv8.2-a+sve+bf16" } */ ++/* { dg-require-effective-target aarch64_asm_bf16_ok } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** bfmlalb_f32_tied1: ++** bfmlalb z0\.s, z4\.h, z5\.h ++** ret ++*/ ++TEST_DUAL_Z (bfmlalb_f32_tied1, svfloat32_t, svbfloat16_t, ++ z0 = svbfmlalb_f32 (z0, z4, z5), ++ z0 = svbfmlalb (z0, z4, z5)) ++ ++/* ++** bfmlalb_f32_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** bfmlalb z0\.s, \1\.h, z1\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (bfmlalb_f32_tied2, svfloat32_t, svbfloat16_t, ++ z0_res = svbfmlalb_f32 (z4, z0, z1), ++ z0_res = svbfmlalb (z4, z0, z1)) ++ ++/* ++** bfmlalb_f32_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** bfmlalb z0\.s, z1\.h, \1\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (bfmlalb_f32_tied3, svfloat32_t, svbfloat16_t, ++ z0_res = svbfmlalb_f32 (z4, z1, z0), ++ z0_res = svbfmlalb (z4, z1, z0)) ++ ++/* ++** bfmlalb_f32_untied: ++** movprfx z0, z1 ++** bfmlalb z0\.s, z4\.h, z5\.h ++** ret ++*/ ++TEST_DUAL_Z (bfmlalb_f32_untied, svfloat32_t, svbfloat16_t, ++ z0 = svbfmlalb_f32 (z1, z4, z5), ++ z0 = svbfmlalb (z1, z4, z5)) ++ ++/* ++** bfmlalb_h7_f32_tied1: ++** mov (z[0-9]+\.h), h7 ++** bfmlalb z0\.s, z4\.h, \1 ++** ret ++*/ ++TEST_DUAL_ZD (bfmlalb_h7_f32_tied1, svfloat32_t, svbfloat16_t, bfloat16_t, ++ z0 = svbfmlalb_n_f32 (z0, z4, d7), ++ z0 = svbfmlalb (z0, z4, d7)) ++ ++/* ++** bfmlalb_h7_f32_untied: ++** mov (z[0-9]+\.h), h7 ++** movprfx z0, z1 ++** bfmlalb z0\.s, z4\.h, \1 ++** ret ++*/ ++TEST_DUAL_ZD (bfmlalb_h7_f32_untied, svfloat32_t, svbfloat16_t, bfloat16_t, ++ z0 = svbfmlalb_n_f32 (z1, z4, d7), ++ z0 = svbfmlalb (z1, z4, d7)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bfmlalb_lane_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bfmlalb_lane_f32.c +new file mode 100644 +index 000000000..b0ec0881d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bfmlalb_lane_f32.c +@@ -0,0 +1,86 @@ ++/* { dg-additional-options "-march=armv8.2-a+sve+bf16" } */ ++/* { dg-require-effective-target aarch64_asm_bf16_ok } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** bfmlalb_lane_0_f32_tied1: ++** bfmlalb z0\.s, z4\.h, z5\.h\[0\] ++** ret ++*/ ++TEST_DUAL_Z (bfmlalb_lane_0_f32_tied1, svfloat32_t, svbfloat16_t, ++ z0 = svbfmlalb_lane_f32 (z0, z4, z5, 0), ++ z0 = svbfmlalb_lane (z0, z4, z5, 0)) ++ ++/* ++** bfmlalb_lane_0_f32_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** bfmlalb z0\.s, \1\.h, z1\.h\[0\] ++** ret ++*/ ++TEST_DUAL_Z_REV (bfmlalb_lane_0_f32_tied2, svfloat32_t, svbfloat16_t, ++ z0_res = svbfmlalb_lane_f32 (z4, z0, 
z1, 0), ++ z0_res = svbfmlalb_lane (z4, z0, z1, 0)) ++ ++/* ++** bfmlalb_lane_0_f32_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** bfmlalb z0\.s, z1\.h, \1\.h\[0\] ++** ret ++*/ ++TEST_DUAL_Z_REV (bfmlalb_lane_0_f32_tied3, svfloat32_t, svbfloat16_t, ++ z0_res = svbfmlalb_lane_f32 (z4, z1, z0, 0), ++ z0_res = svbfmlalb_lane (z4, z1, z0, 0)) ++ ++/* ++** bfmlalb_lane_0_f32_untied: ++** movprfx z0, z1 ++** bfmlalb z0\.s, z4\.h, z5\.h\[0\] ++** ret ++*/ ++TEST_DUAL_Z (bfmlalb_lane_0_f32_untied, svfloat32_t, svbfloat16_t, ++ z0 = svbfmlalb_lane_f32 (z1, z4, z5, 0), ++ z0 = svbfmlalb_lane (z1, z4, z5, 0)) ++ ++/* ++** bfmlalb_lane_1_f32: ++** bfmlalb z0\.s, z4\.h, z5\.h\[1\] ++** ret ++*/ ++TEST_DUAL_Z (bfmlalb_lane_1_f32, svfloat32_t, svbfloat16_t, ++ z0 = svbfmlalb_lane_f32 (z0, z4, z5, 1), ++ z0 = svbfmlalb_lane (z0, z4, z5, 1)) ++ ++/* ++** bfmlalb_lane_7_f32: ++** bfmlalb z0\.s, z4\.h, z5\.h\[7\] ++** ret ++*/ ++TEST_DUAL_Z (bfmlalb_lane_7_f32, svfloat32_t, svbfloat16_t, ++ z0 = svbfmlalb_lane_f32 (z0, z4, z5, 7), ++ z0 = svbfmlalb_lane (z0, z4, z5, 7)) ++ ++/* ++** bfmlalb_lane_z8_f32: ++** str d8, \[sp, -16\]! ++** mov (z[0-7])\.d, z8\.d ++** bfmlalb z0\.s, z1\.h, \1\.h\[1\] ++** ldr d8, \[sp\], 16 ++** ret ++*/ ++TEST_DUAL_LANE_REG (bfmlalb_lane_z8_f32, svfloat32_t, svbfloat16_t, z8, ++ z0 = svbfmlalb_lane_f32 (z0, z1, z8, 1), ++ z0 = svbfmlalb_lane (z0, z1, z8, 1)) ++ ++/* ++** bfmlalb_lane_z16_f32: ++** mov (z[0-7])\.d, z16\.d ++** bfmlalb z0\.s, z1\.h, \1\.h\[1\] ++** ret ++*/ ++TEST_DUAL_LANE_REG (bfmlalb_lane_z16_f32, svfloat32_t, svbfloat16_t, z16, ++ z0 = svbfmlalb_lane_f32 (z0, z1, z16, 1), ++ z0 = svbfmlalb_lane (z0, z1, z16, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bfmlalt_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bfmlalt_f32.c +new file mode 100644 +index 000000000..2a583fa4a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bfmlalt_f32.c +@@ -0,0 +1,67 @@ ++/* { dg-additional-options "-march=armv8.2-a+sve+bf16" } */ ++/* { dg-require-effective-target aarch64_asm_bf16_ok } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** bfmlalt_f32_tied1: ++** bfmlalt z0\.s, z4\.h, z5\.h ++** ret ++*/ ++TEST_DUAL_Z (bfmlalt_f32_tied1, svfloat32_t, svbfloat16_t, ++ z0 = svbfmlalt_f32 (z0, z4, z5), ++ z0 = svbfmlalt (z0, z4, z5)) ++ ++/* ++** bfmlalt_f32_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** bfmlalt z0\.s, \1\.h, z1\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (bfmlalt_f32_tied2, svfloat32_t, svbfloat16_t, ++ z0_res = svbfmlalt_f32 (z4, z0, z1), ++ z0_res = svbfmlalt (z4, z0, z1)) ++ ++/* ++** bfmlalt_f32_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** bfmlalt z0\.s, z1\.h, \1\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (bfmlalt_f32_tied3, svfloat32_t, svbfloat16_t, ++ z0_res = svbfmlalt_f32 (z4, z1, z0), ++ z0_res = svbfmlalt (z4, z1, z0)) ++ ++/* ++** bfmlalt_f32_untied: ++** movprfx z0, z1 ++** bfmlalt z0\.s, z4\.h, z5\.h ++** ret ++*/ ++TEST_DUAL_Z (bfmlalt_f32_untied, svfloat32_t, svbfloat16_t, ++ z0 = svbfmlalt_f32 (z1, z4, z5), ++ z0 = svbfmlalt (z1, z4, z5)) ++ ++/* ++** bfmlalt_h7_f32_tied1: ++** mov (z[0-9]+\.h), h7 ++** bfmlalt z0\.s, z4\.h, \1 ++** ret ++*/ ++TEST_DUAL_ZD (bfmlalt_h7_f32_tied1, svfloat32_t, svbfloat16_t, bfloat16_t, ++ z0 = svbfmlalt_n_f32 (z0, z4, d7), ++ z0 = svbfmlalt (z0, z4, d7)) ++ ++/* ++** bfmlalt_h7_f32_untied: ++** mov (z[0-9]+\.h), h7 ++** movprfx z0, z1 ++** bfmlalt z0\.s, z4\.h, \1 ++** ret ++*/ 
++TEST_DUAL_ZD (bfmlalt_h7_f32_untied, svfloat32_t, svbfloat16_t, bfloat16_t, ++ z0 = svbfmlalt_n_f32 (z1, z4, d7), ++ z0 = svbfmlalt (z1, z4, d7)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bfmlalt_lane_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bfmlalt_lane_f32.c +new file mode 100644 +index 000000000..3af3997e9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bfmlalt_lane_f32.c +@@ -0,0 +1,86 @@ ++/* { dg-additional-options "-march=armv8.2-a+sve+bf16" } */ ++/* { dg-require-effective-target aarch64_asm_bf16_ok } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** bfmlalt_lane_0_f32_tied1: ++** bfmlalt z0\.s, z4\.h, z5\.h\[0\] ++** ret ++*/ ++TEST_DUAL_Z (bfmlalt_lane_0_f32_tied1, svfloat32_t, svbfloat16_t, ++ z0 = svbfmlalt_lane_f32 (z0, z4, z5, 0), ++ z0 = svbfmlalt_lane (z0, z4, z5, 0)) ++ ++/* ++** bfmlalt_lane_0_f32_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** bfmlalt z0\.s, \1\.h, z1\.h\[0\] ++** ret ++*/ ++TEST_DUAL_Z_REV (bfmlalt_lane_0_f32_tied2, svfloat32_t, svbfloat16_t, ++ z0_res = svbfmlalt_lane_f32 (z4, z0, z1, 0), ++ z0_res = svbfmlalt_lane (z4, z0, z1, 0)) ++ ++/* ++** bfmlalt_lane_0_f32_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** bfmlalt z0\.s, z1\.h, \1\.h\[0\] ++** ret ++*/ ++TEST_DUAL_Z_REV (bfmlalt_lane_0_f32_tied3, svfloat32_t, svbfloat16_t, ++ z0_res = svbfmlalt_lane_f32 (z4, z1, z0, 0), ++ z0_res = svbfmlalt_lane (z4, z1, z0, 0)) ++ ++/* ++** bfmlalt_lane_0_f32_untied: ++** movprfx z0, z1 ++** bfmlalt z0\.s, z4\.h, z5\.h\[0\] ++** ret ++*/ ++TEST_DUAL_Z (bfmlalt_lane_0_f32_untied, svfloat32_t, svbfloat16_t, ++ z0 = svbfmlalt_lane_f32 (z1, z4, z5, 0), ++ z0 = svbfmlalt_lane (z1, z4, z5, 0)) ++ ++/* ++** bfmlalt_lane_1_f32: ++** bfmlalt z0\.s, z4\.h, z5\.h\[1\] ++** ret ++*/ ++TEST_DUAL_Z (bfmlalt_lane_1_f32, svfloat32_t, svbfloat16_t, ++ z0 = svbfmlalt_lane_f32 (z0, z4, z5, 1), ++ z0 = svbfmlalt_lane (z0, z4, z5, 1)) ++ ++/* ++** bfmlalt_lane_7_f32: ++** bfmlalt z0\.s, z4\.h, z5\.h\[7\] ++** ret ++*/ ++TEST_DUAL_Z (bfmlalt_lane_7_f32, svfloat32_t, svbfloat16_t, ++ z0 = svbfmlalt_lane_f32 (z0, z4, z5, 7), ++ z0 = svbfmlalt_lane (z0, z4, z5, 7)) ++ ++/* ++** bfmlalt_lane_z8_f32: ++** str d8, \[sp, -16\]! 
++** mov (z[0-7])\.d, z8\.d ++** bfmlalt z0\.s, z1\.h, \1\.h\[1\] ++** ldr d8, \[sp\], 16 ++** ret ++*/ ++TEST_DUAL_LANE_REG (bfmlalt_lane_z8_f32, svfloat32_t, svbfloat16_t, z8, ++ z0 = svbfmlalt_lane_f32 (z0, z1, z8, 1), ++ z0 = svbfmlalt_lane (z0, z1, z8, 1)) ++ ++/* ++** bfmlalt_lane_z16_f32: ++** mov (z[0-7])\.d, z16\.d ++** bfmlalt z0\.s, z1\.h, \1\.h\[1\] ++** ret ++*/ ++TEST_DUAL_LANE_REG (bfmlalt_lane_z16_f32, svfloat32_t, svbfloat16_t, z16, ++ z0 = svbfmlalt_lane_f32 (z0, z1, z16, 1), ++ z0 = svbfmlalt_lane (z0, z1, z16, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bfmmla_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bfmmla_f32.c +new file mode 100644 +index 000000000..b1d98fbf5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bfmmla_f32.c +@@ -0,0 +1,46 @@ ++/* { dg-additional-options "-march=armv8.2-a+sve+bf16" } */ ++/* { dg-require-effective-target aarch64_asm_bf16_ok } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** bfmmla_f32_tied1: ++** bfmmla z0\.s, z4\.h, z5\.h ++** ret ++*/ ++TEST_DUAL_Z (bfmmla_f32_tied1, svfloat32_t, svbfloat16_t, ++ z0 = svbfmmla_f32 (z0, z4, z5), ++ z0 = svbfmmla (z0, z4, z5)) ++ ++/* ++** bfmmla_f32_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** bfmmla z0\.s, \1\.h, z1\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (bfmmla_f32_tied2, svfloat32_t, svbfloat16_t, ++ z0_res = svbfmmla_f32 (z4, z0, z1), ++ z0_res = svbfmmla (z4, z0, z1)) ++ ++/* ++** bfmmla_f32_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** bfmmla z0\.s, z1\.h, \1\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (bfmmla_f32_tied3, svfloat32_t, svbfloat16_t, ++ z0_res = svbfmmla_f32 (z4, z1, z0), ++ z0_res = svbfmmla (z4, z1, z0)) ++ ++/* ++** bfmmla_f32_untied: ++** movprfx z0, z1 ++** bfmmla z0\.s, z4\.h, z5\.h ++** ret ++*/ ++TEST_DUAL_Z (bfmmla_f32_untied, svfloat32_t, svbfloat16_t, ++ z0 = svbfmmla_f32 (z1, z4, z5), ++ z0 = svbfmmla (z1, z4, z5)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_b.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_b.c +new file mode 100644 +index 000000000..9d41aeaa2 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_b.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** bic_b_z_tied1: ++** bic p0\.b, p3/z, p0\.b, p1\.b ++** ret ++*/ ++TEST_UNIFORM_P (bic_b_z_tied1, ++ p0 = svbic_b_z (p3, p0, p1), ++ p0 = svbic_z (p3, p0, p1)) ++ ++/* ++** bic_b_z_tied2: ++** bic p0\.b, p3/z, p1\.b, p0\.b ++** ret ++*/ ++TEST_UNIFORM_P (bic_b_z_tied2, ++ p0 = svbic_b_z (p3, p1, p0), ++ p0 = svbic_z (p3, p1, p0)) ++ ++/* ++** bic_b_z_untied: ++** bic p0\.b, p3/z, p1\.b, p2\.b ++** ret ++*/ ++TEST_UNIFORM_P (bic_b_z_untied, ++ p0 = svbic_b_z (p3, p1, p2), ++ p0 = svbic_z (p3, p1, p2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_s16.c +new file mode 100644 +index 000000000..c80f5697f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_s16.c +@@ -0,0 +1,367 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** bic_s16_m_tied1: ++** bic z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (bic_s16_m_tied1, svint16_t, ++ z0 = svbic_s16_m (p0, z0, z1), ++ z0 = svbic_m (p0, z0, z1)) ++ ++/* ++** bic_s16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** bic z0\.h, p0/m, 
z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (bic_s16_m_tied2, svint16_t, ++ z0 = svbic_s16_m (p0, z1, z0), ++ z0 = svbic_m (p0, z1, z0)) ++ ++/* ++** bic_s16_m_untied: ++** movprfx z0, z1 ++** bic z0\.h, p0/m, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (bic_s16_m_untied, svint16_t, ++ z0 = svbic_s16_m (p0, z1, z2), ++ z0 = svbic_m (p0, z1, z2)) ++ ++/* ++** bic_w0_s16_m_tied1: ++** mov (z[0-9]+\.h), w0 ++** bic z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (bic_w0_s16_m_tied1, svint16_t, int16_t, ++ z0 = svbic_n_s16_m (p0, z0, x0), ++ z0 = svbic_m (p0, z0, x0)) ++ ++/* ++** bic_w0_s16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), w0 ++** movprfx z0, z1 ++** bic z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (bic_w0_s16_m_untied, svint16_t, int16_t, ++ z0 = svbic_n_s16_m (p0, z1, x0), ++ z0 = svbic_m (p0, z1, x0)) ++ ++/* ++** bic_1_s16_m_tied1: ++** mov (z[0-9]+\.h), #-2 ++** and z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_1_s16_m_tied1, svint16_t, ++ z0 = svbic_n_s16_m (p0, z0, 1), ++ z0 = svbic_m (p0, z0, 1)) ++ ++/* ++** bic_1_s16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), #-2 ++** movprfx z0, z1 ++** and z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_1_s16_m_untied, svint16_t, ++ z0 = svbic_n_s16_m (p0, z1, 1), ++ z0 = svbic_m (p0, z1, 1)) ++ ++/* ++** bic_m2_s16_m: ++** mov (z[0-9]+\.h), #1 ++** and z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m2_s16_m, svint16_t, ++ z0 = svbic_n_s16_m (p0, z0, -2), ++ z0 = svbic_m (p0, z0, -2)) ++ ++/* ++** bic_s16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** bic z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (bic_s16_z_tied1, svint16_t, ++ z0 = svbic_s16_z (p0, z0, z1), ++ z0 = svbic_z (p0, z0, z1)) ++ ++/* ++** bic_s16_z_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.h, p0/z, z1\.h ++** bic z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (bic_s16_z_tied2, svint16_t, ++ z0 = svbic_s16_z (p0, z1, z0), ++ z0 = svbic_z (p0, z1, z0)) ++ ++/* ++** bic_s16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** bic z0\.h, p0/m, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (bic_s16_z_untied, svint16_t, ++ z0 = svbic_s16_z (p0, z1, z2), ++ z0 = svbic_z (p0, z1, z2)) ++ ++/* ++** bic_w0_s16_z_tied1: ++** mov (z[0-9]+\.h), w0 ++** movprfx z0\.h, p0/z, z0\.h ++** bic z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (bic_w0_s16_z_tied1, svint16_t, int16_t, ++ z0 = svbic_n_s16_z (p0, z0, x0), ++ z0 = svbic_z (p0, z0, x0)) ++ ++/* ++** bic_w0_s16_z_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), w0 ++** movprfx z0\.h, p0/z, z1\.h ++** bic z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (bic_w0_s16_z_untied, svint16_t, int16_t, ++ z0 = svbic_n_s16_z (p0, z1, x0), ++ z0 = svbic_z (p0, z1, x0)) ++ ++/* ++** bic_1_s16_z_tied1: ++** mov (z[0-9]+\.h), #-2 ++** movprfx z0\.h, p0/z, z0\.h ++** and z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_1_s16_z_tied1, svint16_t, ++ z0 = svbic_n_s16_z (p0, z0, 1), ++ z0 = svbic_z (p0, z0, 1)) ++ ++/* ++** bic_1_s16_z_untied: ++** mov (z[0-9]+\.h), #-2 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** and z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** and z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (bic_1_s16_z_untied, svint16_t, ++ z0 = svbic_n_s16_z (p0, z1, 1), ++ z0 = svbic_z (p0, z1, 1)) ++ ++/* ++** bic_s16_x_tied1: ++** bic z0\.d, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (bic_s16_x_tied1, svint16_t, ++ z0 = svbic_s16_x (p0, z0, z1), ++ z0 = svbic_x (p0, z0, z1)) ++ ++/* ++** 
bic_s16_x_tied2: ++** bic z0\.d, z1\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (bic_s16_x_tied2, svint16_t, ++ z0 = svbic_s16_x (p0, z1, z0), ++ z0 = svbic_x (p0, z1, z0)) ++ ++/* ++** bic_s16_x_untied: ++** bic z0\.d, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (bic_s16_x_untied, svint16_t, ++ z0 = svbic_s16_x (p0, z1, z2), ++ z0 = svbic_x (p0, z1, z2)) ++ ++/* ++** bic_w0_s16_x_tied1: ++** mov (z[0-9]+)\.h, w0 ++** bic z0\.d, z0\.d, \1\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (bic_w0_s16_x_tied1, svint16_t, int16_t, ++ z0 = svbic_n_s16_x (p0, z0, x0), ++ z0 = svbic_x (p0, z0, x0)) ++ ++/* ++** bic_w0_s16_x_untied: ++** mov (z[0-9]+)\.h, w0 ++** bic z0\.d, z1\.d, \1\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (bic_w0_s16_x_untied, svint16_t, int16_t, ++ z0 = svbic_n_s16_x (p0, z1, x0), ++ z0 = svbic_x (p0, z1, x0)) ++ ++/* ++** bic_1_s16_x_tied1: ++** and z0\.h, z0\.h, #0xfffe ++** ret ++*/ ++TEST_UNIFORM_Z (bic_1_s16_x_tied1, svint16_t, ++ z0 = svbic_n_s16_x (p0, z0, 1), ++ z0 = svbic_x (p0, z0, 1)) ++ ++/* ++** bic_1_s16_x_untied: ++** movprfx z0, z1 ++** and z0\.h, z0\.h, #0xfffe ++** ret ++*/ ++TEST_UNIFORM_Z (bic_1_s16_x_untied, svint16_t, ++ z0 = svbic_n_s16_x (p0, z1, 1), ++ z0 = svbic_x (p0, z1, 1)) ++ ++/* ++** bic_127_s16_x: ++** and z0\.h, z0\.h, #0xff80 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_127_s16_x, svint16_t, ++ z0 = svbic_n_s16_x (p0, z0, 127), ++ z0 = svbic_x (p0, z0, 127)) ++ ++/* ++** bic_128_s16_x: ++** and z0\.h, z0\.h, #0xff7f ++** ret ++*/ ++TEST_UNIFORM_Z (bic_128_s16_x, svint16_t, ++ z0 = svbic_n_s16_x (p0, z0, 128), ++ z0 = svbic_x (p0, z0, 128)) ++ ++/* ++** bic_255_s16_x: ++** and z0\.h, z0\.h, #0xff00 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_255_s16_x, svint16_t, ++ z0 = svbic_n_s16_x (p0, z0, 255), ++ z0 = svbic_x (p0, z0, 255)) ++ ++/* ++** bic_256_s16_x: ++** and z0\.h, z0\.h, #0xfeff ++** ret ++*/ ++TEST_UNIFORM_Z (bic_256_s16_x, svint16_t, ++ z0 = svbic_n_s16_x (p0, z0, 256), ++ z0 = svbic_x (p0, z0, 256)) ++ ++/* ++** bic_257_s16_x: ++** and z0\.h, z0\.h, #0xfefe ++** ret ++*/ ++TEST_UNIFORM_Z (bic_257_s16_x, svint16_t, ++ z0 = svbic_n_s16_x (p0, z0, 257), ++ z0 = svbic_x (p0, z0, 257)) ++ ++/* ++** bic_512_s16_x: ++** and z0\.h, z0\.h, #0xfdff ++** ret ++*/ ++TEST_UNIFORM_Z (bic_512_s16_x, svint16_t, ++ z0 = svbic_n_s16_x (p0, z0, 512), ++ z0 = svbic_x (p0, z0, 512)) ++ ++/* ++** bic_65280_s16_x: ++** and z0\.h, z0\.h, #0xff ++** ret ++*/ ++TEST_UNIFORM_Z (bic_65280_s16_x, svint16_t, ++ z0 = svbic_n_s16_x (p0, z0, 0xff00), ++ z0 = svbic_x (p0, z0, 0xff00)) ++ ++/* ++** bic_m127_s16_x: ++** and z0\.h, z0\.h, #0x7e ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m127_s16_x, svint16_t, ++ z0 = svbic_n_s16_x (p0, z0, -127), ++ z0 = svbic_x (p0, z0, -127)) ++ ++/* ++** bic_m128_s16_x: ++** and z0\.h, z0\.h, #0x7f ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m128_s16_x, svint16_t, ++ z0 = svbic_n_s16_x (p0, z0, -128), ++ z0 = svbic_x (p0, z0, -128)) ++ ++/* ++** bic_m255_s16_x: ++** and z0\.h, z0\.h, #0xfe ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m255_s16_x, svint16_t, ++ z0 = svbic_n_s16_x (p0, z0, -255), ++ z0 = svbic_x (p0, z0, -255)) ++ ++/* ++** bic_m256_s16_x: ++** and z0\.h, z0\.h, #0xff ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m256_s16_x, svint16_t, ++ z0 = svbic_n_s16_x (p0, z0, -256), ++ z0 = svbic_x (p0, z0, -256)) ++ ++/* ++** bic_m257_s16_x: ++** and z0\.h, z0\.h, #0x100 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m257_s16_x, svint16_t, ++ z0 = svbic_n_s16_x (p0, z0, -257), ++ z0 = svbic_x (p0, z0, -257)) ++ ++/* ++** bic_m512_s16_x: ++** and z0\.h, z0\.h, #0x1ff ++** ret ++*/ ++TEST_UNIFORM_Z 
(bic_m512_s16_x, svint16_t, ++ z0 = svbic_n_s16_x (p0, z0, -512), ++ z0 = svbic_x (p0, z0, -512)) ++ ++/* ++** bic_m32768_s16_x: ++** and z0\.h, z0\.h, #0x7fff ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m32768_s16_x, svint16_t, ++ z0 = svbic_n_s16_x (p0, z0, -0x8000), ++ z0 = svbic_x (p0, z0, -0x8000)) ++ ++/* ++** bic_5_s16_x: ++** mov (z[0-9]+)\.h, #-6 ++** and z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (bic_5_s16_x, svint16_t, ++ z0 = svbic_n_s16_x (p0, z0, 5), ++ z0 = svbic_x (p0, z0, 5)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_s32.c +new file mode 100644 +index 000000000..9e388e499 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_s32.c +@@ -0,0 +1,363 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** bic_s32_m_tied1: ++** bic z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (bic_s32_m_tied1, svint32_t, ++ z0 = svbic_s32_m (p0, z0, z1), ++ z0 = svbic_m (p0, z0, z1)) ++ ++/* ++** bic_s32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** bic z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (bic_s32_m_tied2, svint32_t, ++ z0 = svbic_s32_m (p0, z1, z0), ++ z0 = svbic_m (p0, z1, z0)) ++ ++/* ++** bic_s32_m_untied: ++** movprfx z0, z1 ++** bic z0\.s, p0/m, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (bic_s32_m_untied, svint32_t, ++ z0 = svbic_s32_m (p0, z1, z2), ++ z0 = svbic_m (p0, z1, z2)) ++ ++/* ++** bic_w0_s32_m_tied1: ++** mov (z[0-9]+\.s), w0 ++** bic z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (bic_w0_s32_m_tied1, svint32_t, int32_t, ++ z0 = svbic_n_s32_m (p0, z0, x0), ++ z0 = svbic_m (p0, z0, x0)) ++ ++/* ++** bic_w0_s32_m_untied: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0, z1 ++** bic z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (bic_w0_s32_m_untied, svint32_t, int32_t, ++ z0 = svbic_n_s32_m (p0, z1, x0), ++ z0 = svbic_m (p0, z1, x0)) ++ ++/* ++** bic_1_s32_m_tied1: ++** mov (z[0-9]+\.s), #-2 ++** and z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_1_s32_m_tied1, svint32_t, ++ z0 = svbic_n_s32_m (p0, z0, 1), ++ z0 = svbic_m (p0, z0, 1)) ++ ++/* ++** bic_1_s32_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.s), #-2 ++** movprfx z0, z1 ++** and z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_1_s32_m_untied, svint32_t, ++ z0 = svbic_n_s32_m (p0, z1, 1), ++ z0 = svbic_m (p0, z1, 1)) ++ ++/* ++** bic_m2_s32_m: ++** mov (z[0-9]+\.s), #1 ++** and z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m2_s32_m, svint32_t, ++ z0 = svbic_n_s32_m (p0, z0, -2), ++ z0 = svbic_m (p0, z0, -2)) ++ ++/* ++** bic_s32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** bic z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (bic_s32_z_tied1, svint32_t, ++ z0 = svbic_s32_z (p0, z0, z1), ++ z0 = svbic_z (p0, z0, z1)) ++ ++/* ++** bic_s32_z_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, z1\.s ++** bic z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (bic_s32_z_tied2, svint32_t, ++ z0 = svbic_s32_z (p0, z1, z0), ++ z0 = svbic_z (p0, z1, z0)) ++ ++/* ++** bic_s32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** bic z0\.s, p0/m, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (bic_s32_z_untied, svint32_t, ++ z0 = svbic_s32_z (p0, z1, z2), ++ z0 = svbic_z (p0, z1, z2)) ++ ++/* ++** bic_w0_s32_z_tied1: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0\.s, p0/z, z0\.s ++** bic z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX 
(bic_w0_s32_z_tied1, svint32_t, int32_t, ++ z0 = svbic_n_s32_z (p0, z0, x0), ++ z0 = svbic_z (p0, z0, x0)) ++ ++/* ++** bic_w0_s32_z_untied: { xfail *-*-* } ++** mov (z[0-9]+\.s), w0 ++** movprfx z0\.s, p0/z, z1\.s ++** bic z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (bic_w0_s32_z_untied, svint32_t, int32_t, ++ z0 = svbic_n_s32_z (p0, z1, x0), ++ z0 = svbic_z (p0, z1, x0)) ++ ++/* ++** bic_1_s32_z_tied1: ++** mov (z[0-9]+\.s), #-2 ++** movprfx z0\.s, p0/z, z0\.s ++** and z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_1_s32_z_tied1, svint32_t, ++ z0 = svbic_n_s32_z (p0, z0, 1), ++ z0 = svbic_z (p0, z0, 1)) ++ ++/* ++** bic_1_s32_z_untied: ++** mov (z[0-9]+\.s), #-2 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** and z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** and z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (bic_1_s32_z_untied, svint32_t, ++ z0 = svbic_n_s32_z (p0, z1, 1), ++ z0 = svbic_z (p0, z1, 1)) ++ ++/* ++** bic_s32_x_tied1: ++** bic z0\.d, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (bic_s32_x_tied1, svint32_t, ++ z0 = svbic_s32_x (p0, z0, z1), ++ z0 = svbic_x (p0, z0, z1)) ++ ++/* ++** bic_s32_x_tied2: ++** bic z0\.d, z1\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (bic_s32_x_tied2, svint32_t, ++ z0 = svbic_s32_x (p0, z1, z0), ++ z0 = svbic_x (p0, z1, z0)) ++ ++/* ++** bic_s32_x_untied: ++** bic z0\.d, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (bic_s32_x_untied, svint32_t, ++ z0 = svbic_s32_x (p0, z1, z2), ++ z0 = svbic_x (p0, z1, z2)) ++ ++/* ++** bic_w0_s32_x_tied1: ++** mov (z[0-9]+)\.s, w0 ++** bic z0\.d, z0\.d, \1\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (bic_w0_s32_x_tied1, svint32_t, int32_t, ++ z0 = svbic_n_s32_x (p0, z0, x0), ++ z0 = svbic_x (p0, z0, x0)) ++ ++/* ++** bic_w0_s32_x_untied: ++** mov (z[0-9]+)\.s, w0 ++** bic z0\.d, z1\.d, \1\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (bic_w0_s32_x_untied, svint32_t, int32_t, ++ z0 = svbic_n_s32_x (p0, z1, x0), ++ z0 = svbic_x (p0, z1, x0)) ++ ++/* ++** bic_1_s32_x_tied1: ++** and z0\.s, z0\.s, #0xfffffffe ++** ret ++*/ ++TEST_UNIFORM_Z (bic_1_s32_x_tied1, svint32_t, ++ z0 = svbic_n_s32_x (p0, z0, 1), ++ z0 = svbic_x (p0, z0, 1)) ++ ++/* ++** bic_1_s32_x_untied: ++** movprfx z0, z1 ++** and z0\.s, z0\.s, #0xfffffffe ++** ret ++*/ ++TEST_UNIFORM_Z (bic_1_s32_x_untied, svint32_t, ++ z0 = svbic_n_s32_x (p0, z1, 1), ++ z0 = svbic_x (p0, z1, 1)) ++ ++/* ++** bic_127_s32_x: ++** and z0\.s, z0\.s, #0xffffff80 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_127_s32_x, svint32_t, ++ z0 = svbic_n_s32_x (p0, z0, 127), ++ z0 = svbic_x (p0, z0, 127)) ++ ++/* ++** bic_128_s32_x: ++** and z0\.s, z0\.s, #0xffffff7f ++** ret ++*/ ++TEST_UNIFORM_Z (bic_128_s32_x, svint32_t, ++ z0 = svbic_n_s32_x (p0, z0, 128), ++ z0 = svbic_x (p0, z0, 128)) ++ ++/* ++** bic_255_s32_x: ++** and z0\.s, z0\.s, #0xffffff00 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_255_s32_x, svint32_t, ++ z0 = svbic_n_s32_x (p0, z0, 255), ++ z0 = svbic_x (p0, z0, 255)) ++ ++/* ++** bic_256_s32_x: ++** and z0\.s, z0\.s, #0xfffffeff ++** ret ++*/ ++TEST_UNIFORM_Z (bic_256_s32_x, svint32_t, ++ z0 = svbic_n_s32_x (p0, z0, 256), ++ z0 = svbic_x (p0, z0, 256)) ++ ++/* TODO: Bad code and needs fixing. 
*/ ++TEST_UNIFORM_Z (bic_257_s32_x, svint32_t, ++ z0 = svbic_n_s32_x (p0, z0, 257), ++ z0 = svbic_x (p0, z0, 257)) ++ ++/* ++** bic_512_s32_x: ++** and z0\.s, z0\.s, #0xfffffdff ++** ret ++*/ ++TEST_UNIFORM_Z (bic_512_s32_x, svint32_t, ++ z0 = svbic_n_s32_x (p0, z0, 512), ++ z0 = svbic_x (p0, z0, 512)) ++ ++/* ++** bic_65280_s32_x: ++** and z0\.s, z0\.s, #0xffff00ff ++** ret ++*/ ++TEST_UNIFORM_Z (bic_65280_s32_x, svint32_t, ++ z0 = svbic_n_s32_x (p0, z0, 0xff00), ++ z0 = svbic_x (p0, z0, 0xff00)) ++ ++/* ++** bic_m127_s32_x: ++** and z0\.s, z0\.s, #0x7e ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m127_s32_x, svint32_t, ++ z0 = svbic_n_s32_x (p0, z0, -127), ++ z0 = svbic_x (p0, z0, -127)) ++ ++/* ++** bic_m128_s32_x: ++** and z0\.s, z0\.s, #0x7f ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m128_s32_x, svint32_t, ++ z0 = svbic_n_s32_x (p0, z0, -128), ++ z0 = svbic_x (p0, z0, -128)) ++ ++/* ++** bic_m255_s32_x: ++** and z0\.s, z0\.s, #0xfe ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m255_s32_x, svint32_t, ++ z0 = svbic_n_s32_x (p0, z0, -255), ++ z0 = svbic_x (p0, z0, -255)) ++ ++/* ++** bic_m256_s32_x: ++** and z0\.s, z0\.s, #0xff ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m256_s32_x, svint32_t, ++ z0 = svbic_n_s32_x (p0, z0, -256), ++ z0 = svbic_x (p0, z0, -256)) ++ ++/* ++** bic_m257_s32_x: ++** and z0\.s, z0\.s, #0x100 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m257_s32_x, svint32_t, ++ z0 = svbic_n_s32_x (p0, z0, -257), ++ z0 = svbic_x (p0, z0, -257)) ++ ++/* ++** bic_m512_s32_x: ++** and z0\.s, z0\.s, #0x1ff ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m512_s32_x, svint32_t, ++ z0 = svbic_n_s32_x (p0, z0, -512), ++ z0 = svbic_x (p0, z0, -512)) ++ ++/* ++** bic_m32768_s32_x: ++** and z0\.s, z0\.s, #0x7fff ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m32768_s32_x, svint32_t, ++ z0 = svbic_n_s32_x (p0, z0, -0x8000), ++ z0 = svbic_x (p0, z0, -0x8000)) ++ ++/* ++** bic_5_s32_x: ++** mov (z[0-9]+)\.s, #-6 ++** and z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (bic_5_s32_x, svint32_t, ++ z0 = svbic_n_s32_x (p0, z0, 5), ++ z0 = svbic_x (p0, z0, 5)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_s64.c +new file mode 100644 +index 000000000..bf9536815 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_s64.c +@@ -0,0 +1,363 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** bic_s64_m_tied1: ++** bic z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (bic_s64_m_tied1, svint64_t, ++ z0 = svbic_s64_m (p0, z0, z1), ++ z0 = svbic_m (p0, z0, z1)) ++ ++/* ++** bic_s64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** bic z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_s64_m_tied2, svint64_t, ++ z0 = svbic_s64_m (p0, z1, z0), ++ z0 = svbic_m (p0, z1, z0)) ++ ++/* ++** bic_s64_m_untied: ++** movprfx z0, z1 ++** bic z0\.d, p0/m, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (bic_s64_m_untied, svint64_t, ++ z0 = svbic_s64_m (p0, z1, z2), ++ z0 = svbic_m (p0, z1, z2)) ++ ++/* ++** bic_x0_s64_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** bic z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (bic_x0_s64_m_tied1, svint64_t, int64_t, ++ z0 = svbic_n_s64_m (p0, z0, x0), ++ z0 = svbic_m (p0, z0, x0)) ++ ++/* ++** bic_x0_s64_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** bic z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (bic_x0_s64_m_untied, svint64_t, int64_t, ++ z0 = svbic_n_s64_m (p0, z1, x0), ++ z0 = svbic_m (p0, z1, x0)) ++ ++/* ++** 
bic_1_s64_m_tied1: ++** mov (z[0-9]+\.d), #-2 ++** and z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_1_s64_m_tied1, svint64_t, ++ z0 = svbic_n_s64_m (p0, z0, 1), ++ z0 = svbic_m (p0, z0, 1)) ++ ++/* ++** bic_1_s64_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #-2 ++** movprfx z0, z1 ++** and z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_1_s64_m_untied, svint64_t, ++ z0 = svbic_n_s64_m (p0, z1, 1), ++ z0 = svbic_m (p0, z1, 1)) ++ ++/* ++** bic_m2_s64_m: ++** mov (z[0-9]+\.d), #1 ++** and z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m2_s64_m, svint64_t, ++ z0 = svbic_n_s64_m (p0, z0, -2), ++ z0 = svbic_m (p0, z0, -2)) ++ ++/* ++** bic_s64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** bic z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (bic_s64_z_tied1, svint64_t, ++ z0 = svbic_s64_z (p0, z0, z1), ++ z0 = svbic_z (p0, z0, z1)) ++ ++/* ++** bic_s64_z_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, z1\.d ++** bic z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_s64_z_tied2, svint64_t, ++ z0 = svbic_s64_z (p0, z1, z0), ++ z0 = svbic_z (p0, z1, z0)) ++ ++/* ++** bic_s64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** bic z0\.d, p0/m, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (bic_s64_z_untied, svint64_t, ++ z0 = svbic_s64_z (p0, z1, z2), ++ z0 = svbic_z (p0, z1, z2)) ++ ++/* ++** bic_x0_s64_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z0\.d ++** bic z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (bic_x0_s64_z_tied1, svint64_t, int64_t, ++ z0 = svbic_n_s64_z (p0, z0, x0), ++ z0 = svbic_z (p0, z0, x0)) ++ ++/* ++** bic_x0_s64_z_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z1\.d ++** bic z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (bic_x0_s64_z_untied, svint64_t, int64_t, ++ z0 = svbic_n_s64_z (p0, z1, x0), ++ z0 = svbic_z (p0, z1, x0)) ++ ++/* ++** bic_1_s64_z_tied1: ++** mov (z[0-9]+\.d), #-2 ++** movprfx z0\.d, p0/z, z0\.d ++** and z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_1_s64_z_tied1, svint64_t, ++ z0 = svbic_n_s64_z (p0, z0, 1), ++ z0 = svbic_z (p0, z0, 1)) ++ ++/* ++** bic_1_s64_z_untied: ++** mov (z[0-9]+\.d), #-2 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** and z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** and z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (bic_1_s64_z_untied, svint64_t, ++ z0 = svbic_n_s64_z (p0, z1, 1), ++ z0 = svbic_z (p0, z1, 1)) ++ ++/* ++** bic_s64_x_tied1: ++** bic z0\.d, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (bic_s64_x_tied1, svint64_t, ++ z0 = svbic_s64_x (p0, z0, z1), ++ z0 = svbic_x (p0, z0, z1)) ++ ++/* ++** bic_s64_x_tied2: ++** bic z0\.d, z1\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (bic_s64_x_tied2, svint64_t, ++ z0 = svbic_s64_x (p0, z1, z0), ++ z0 = svbic_x (p0, z1, z0)) ++ ++/* ++** bic_s64_x_untied: ++** bic z0\.d, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (bic_s64_x_untied, svint64_t, ++ z0 = svbic_s64_x (p0, z1, z2), ++ z0 = svbic_x (p0, z1, z2)) ++ ++/* ++** bic_x0_s64_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** bic z0\.d, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (bic_x0_s64_x_tied1, svint64_t, int64_t, ++ z0 = svbic_n_s64_x (p0, z0, x0), ++ z0 = svbic_x (p0, z0, x0)) ++ ++/* ++** bic_x0_s64_x_untied: ++** mov (z[0-9]+\.d), x0 ++** bic z0\.d, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (bic_x0_s64_x_untied, svint64_t, int64_t, ++ z0 = svbic_n_s64_x (p0, z1, x0), ++ z0 = svbic_x (p0, z1, x0)) ++ ++/* ++** bic_1_s64_x_tied1: ++** and z0\.d, z0\.d, 
#0xfffffffffffffffe ++** ret ++*/ ++TEST_UNIFORM_Z (bic_1_s64_x_tied1, svint64_t, ++ z0 = svbic_n_s64_x (p0, z0, 1), ++ z0 = svbic_x (p0, z0, 1)) ++ ++/* ++** bic_1_s64_x_untied: ++** movprfx z0, z1 ++** and z0\.d, z0\.d, #0xfffffffffffffffe ++** ret ++*/ ++TEST_UNIFORM_Z (bic_1_s64_x_untied, svint64_t, ++ z0 = svbic_n_s64_x (p0, z1, 1), ++ z0 = svbic_x (p0, z1, 1)) ++ ++/* ++** bic_127_s64_x: ++** and z0\.d, z0\.d, #0xffffffffffffff80 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_127_s64_x, svint64_t, ++ z0 = svbic_n_s64_x (p0, z0, 127), ++ z0 = svbic_x (p0, z0, 127)) ++ ++/* ++** bic_128_s64_x: ++** and z0\.d, z0\.d, #0xffffffffffffff7f ++** ret ++*/ ++TEST_UNIFORM_Z (bic_128_s64_x, svint64_t, ++ z0 = svbic_n_s64_x (p0, z0, 128), ++ z0 = svbic_x (p0, z0, 128)) ++ ++/* ++** bic_255_s64_x: ++** and z0\.d, z0\.d, #0xffffffffffffff00 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_255_s64_x, svint64_t, ++ z0 = svbic_n_s64_x (p0, z0, 255), ++ z0 = svbic_x (p0, z0, 255)) ++ ++/* ++** bic_256_s64_x: ++** and z0\.d, z0\.d, #0xfffffffffffffeff ++** ret ++*/ ++TEST_UNIFORM_Z (bic_256_s64_x, svint64_t, ++ z0 = svbic_n_s64_x (p0, z0, 256), ++ z0 = svbic_x (p0, z0, 256)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (bic_257_s64_x, svint64_t, ++ z0 = svbic_n_s64_x (p0, z0, 257), ++ z0 = svbic_x (p0, z0, 257)) ++ ++/* ++** bic_512_s64_x: ++** and z0\.d, z0\.d, #0xfffffffffffffdff ++** ret ++*/ ++TEST_UNIFORM_Z (bic_512_s64_x, svint64_t, ++ z0 = svbic_n_s64_x (p0, z0, 512), ++ z0 = svbic_x (p0, z0, 512)) ++ ++/* ++** bic_65280_s64_x: ++** and z0\.d, z0\.d, #0xffffffffffff00ff ++** ret ++*/ ++TEST_UNIFORM_Z (bic_65280_s64_x, svint64_t, ++ z0 = svbic_n_s64_x (p0, z0, 0xff00), ++ z0 = svbic_x (p0, z0, 0xff00)) ++ ++/* ++** bic_m127_s64_x: ++** and z0\.d, z0\.d, #0x7e ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m127_s64_x, svint64_t, ++ z0 = svbic_n_s64_x (p0, z0, -127), ++ z0 = svbic_x (p0, z0, -127)) ++ ++/* ++** bic_m128_s64_x: ++** and z0\.d, z0\.d, #0x7f ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m128_s64_x, svint64_t, ++ z0 = svbic_n_s64_x (p0, z0, -128), ++ z0 = svbic_x (p0, z0, -128)) ++ ++/* ++** bic_m255_s64_x: ++** and z0\.d, z0\.d, #0xfe ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m255_s64_x, svint64_t, ++ z0 = svbic_n_s64_x (p0, z0, -255), ++ z0 = svbic_x (p0, z0, -255)) ++ ++/* ++** bic_m256_s64_x: ++** and z0\.d, z0\.d, #0xff ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m256_s64_x, svint64_t, ++ z0 = svbic_n_s64_x (p0, z0, -256), ++ z0 = svbic_x (p0, z0, -256)) ++ ++/* ++** bic_m257_s64_x: ++** and z0\.d, z0\.d, #0x100 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m257_s64_x, svint64_t, ++ z0 = svbic_n_s64_x (p0, z0, -257), ++ z0 = svbic_x (p0, z0, -257)) ++ ++/* ++** bic_m512_s64_x: ++** and z0\.d, z0\.d, #0x1ff ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m512_s64_x, svint64_t, ++ z0 = svbic_n_s64_x (p0, z0, -512), ++ z0 = svbic_x (p0, z0, -512)) ++ ++/* ++** bic_m32768_s64_x: ++** and z0\.d, z0\.d, #0x7fff ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m32768_s64_x, svint64_t, ++ z0 = svbic_n_s64_x (p0, z0, -0x8000), ++ z0 = svbic_x (p0, z0, -0x8000)) ++ ++/* ++** bic_5_s64_x: ++** mov (z[0-9]+\.d), #-6 ++** and z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (bic_5_s64_x, svint64_t, ++ z0 = svbic_n_s64_x (p0, z0, 5), ++ z0 = svbic_x (p0, z0, 5)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_s8.c +new file mode 100644 +index 000000000..0958a3403 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_s8.c +@@ -0,0 +1,286 @@ ++/* { dg-final { 
check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** bic_s8_m_tied1: ++** bic z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (bic_s8_m_tied1, svint8_t, ++ z0 = svbic_s8_m (p0, z0, z1), ++ z0 = svbic_m (p0, z0, z1)) ++ ++/* ++** bic_s8_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** bic z0\.b, p0/m, z0\.b, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (bic_s8_m_tied2, svint8_t, ++ z0 = svbic_s8_m (p0, z1, z0), ++ z0 = svbic_m (p0, z1, z0)) ++ ++/* ++** bic_s8_m_untied: ++** movprfx z0, z1 ++** bic z0\.b, p0/m, z0\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (bic_s8_m_untied, svint8_t, ++ z0 = svbic_s8_m (p0, z1, z2), ++ z0 = svbic_m (p0, z1, z2)) ++ ++/* ++** bic_w0_s8_m_tied1: ++** mov (z[0-9]+\.b), w0 ++** bic z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (bic_w0_s8_m_tied1, svint8_t, int8_t, ++ z0 = svbic_n_s8_m (p0, z0, x0), ++ z0 = svbic_m (p0, z0, x0)) ++ ++/* ++** bic_w0_s8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), w0 ++** movprfx z0, z1 ++** bic z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (bic_w0_s8_m_untied, svint8_t, int8_t, ++ z0 = svbic_n_s8_m (p0, z1, x0), ++ z0 = svbic_m (p0, z1, x0)) ++ ++/* ++** bic_1_s8_m_tied1: ++** mov (z[0-9]+\.b), #-2 ++** and z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_1_s8_m_tied1, svint8_t, ++ z0 = svbic_n_s8_m (p0, z0, 1), ++ z0 = svbic_m (p0, z0, 1)) ++ ++/* ++** bic_1_s8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), #-2 ++** movprfx z0, z1 ++** and z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_1_s8_m_untied, svint8_t, ++ z0 = svbic_n_s8_m (p0, z1, 1), ++ z0 = svbic_m (p0, z1, 1)) ++ ++/* ++** bic_m2_s8_m: ++** mov (z[0-9]+\.b), #1 ++** and z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m2_s8_m, svint8_t, ++ z0 = svbic_n_s8_m (p0, z0, -2), ++ z0 = svbic_m (p0, z0, -2)) ++ ++/* ++** bic_s8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** bic z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (bic_s8_z_tied1, svint8_t, ++ z0 = svbic_s8_z (p0, z0, z1), ++ z0 = svbic_z (p0, z0, z1)) ++ ++/* ++** bic_s8_z_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.b, p0/z, z1\.b ++** bic z0\.b, p0/m, z0\.b, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (bic_s8_z_tied2, svint8_t, ++ z0 = svbic_s8_z (p0, z1, z0), ++ z0 = svbic_z (p0, z1, z0)) ++ ++/* ++** bic_s8_z_untied: ++** movprfx z0\.b, p0/z, z1\.b ++** bic z0\.b, p0/m, z0\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (bic_s8_z_untied, svint8_t, ++ z0 = svbic_s8_z (p0, z1, z2), ++ z0 = svbic_z (p0, z1, z2)) ++ ++/* ++** bic_w0_s8_z_tied1: ++** mov (z[0-9]+\.b), w0 ++** movprfx z0\.b, p0/z, z0\.b ++** bic z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (bic_w0_s8_z_tied1, svint8_t, int8_t, ++ z0 = svbic_n_s8_z (p0, z0, x0), ++ z0 = svbic_z (p0, z0, x0)) ++ ++/* ++** bic_w0_s8_z_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), w0 ++** movprfx z0\.b, p0/z, z1\.b ++** bic z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (bic_w0_s8_z_untied, svint8_t, int8_t, ++ z0 = svbic_n_s8_z (p0, z1, x0), ++ z0 = svbic_z (p0, z1, x0)) ++ ++/* ++** bic_1_s8_z_tied1: ++** mov (z[0-9]+\.b), #-2 ++** movprfx z0\.b, p0/z, z0\.b ++** and z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_1_s8_z_tied1, svint8_t, ++ z0 = svbic_n_s8_z (p0, z0, 1), ++ z0 = svbic_z (p0, z0, 1)) ++ ++/* ++** bic_1_s8_z_untied: ++** mov (z[0-9]+\.b), #-2 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** and z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** and z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ 
++TEST_UNIFORM_Z (bic_1_s8_z_untied, svint8_t, ++ z0 = svbic_n_s8_z (p0, z1, 1), ++ z0 = svbic_z (p0, z1, 1)) ++ ++/* ++** bic_s8_x_tied1: ++** bic z0\.d, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (bic_s8_x_tied1, svint8_t, ++ z0 = svbic_s8_x (p0, z0, z1), ++ z0 = svbic_x (p0, z0, z1)) ++ ++/* ++** bic_s8_x_tied2: ++** bic z0\.d, z1\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (bic_s8_x_tied2, svint8_t, ++ z0 = svbic_s8_x (p0, z1, z0), ++ z0 = svbic_x (p0, z1, z0)) ++ ++/* ++** bic_s8_x_untied: ++** bic z0\.d, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (bic_s8_x_untied, svint8_t, ++ z0 = svbic_s8_x (p0, z1, z2), ++ z0 = svbic_x (p0, z1, z2)) ++ ++/* ++** bic_w0_s8_x_tied1: ++** mov (z[0-9]+)\.b, w0 ++** bic z0\.d, z0\.d, \1\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (bic_w0_s8_x_tied1, svint8_t, int8_t, ++ z0 = svbic_n_s8_x (p0, z0, x0), ++ z0 = svbic_x (p0, z0, x0)) ++ ++/* ++** bic_w0_s8_x_untied: ++** mov (z[0-9]+)\.b, w0 ++** bic z0\.d, z1\.d, \1\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (bic_w0_s8_x_untied, svint8_t, int8_t, ++ z0 = svbic_n_s8_x (p0, z1, x0), ++ z0 = svbic_x (p0, z1, x0)) ++ ++/* ++** bic_1_s8_x_tied1: ++** and z0\.b, z0\.b, #0xfe ++** ret ++*/ ++TEST_UNIFORM_Z (bic_1_s8_x_tied1, svint8_t, ++ z0 = svbic_n_s8_x (p0, z0, 1), ++ z0 = svbic_x (p0, z0, 1)) ++ ++/* ++** bic_1_s8_x_untied: ++** movprfx z0, z1 ++** and z0\.b, z0\.b, #0xfe ++** ret ++*/ ++TEST_UNIFORM_Z (bic_1_s8_x_untied, svint8_t, ++ z0 = svbic_n_s8_x (p0, z1, 1), ++ z0 = svbic_x (p0, z1, 1)) ++ ++/* ++** bic_127_s8_x: ++** and z0\.b, z0\.b, #0x80 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_127_s8_x, svint8_t, ++ z0 = svbic_n_s8_x (p0, z0, 127), ++ z0 = svbic_x (p0, z0, 127)) ++ ++/* ++** bic_128_s8_x: ++** and z0\.b, z0\.b, #0x7f ++** ret ++*/ ++TEST_UNIFORM_Z (bic_128_s8_x, svint8_t, ++ z0 = svbic_n_s8_x (p0, z0, 128), ++ z0 = svbic_x (p0, z0, 128)) ++ ++/* ++** bic_255_s8_x: ++** mov z0\.b, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_255_s8_x, svint8_t, ++ z0 = svbic_n_s8_x (p0, z0, 255), ++ z0 = svbic_x (p0, z0, 255)) ++ ++/* ++** bic_m127_s8_x: ++** and z0\.b, z0\.b, #0x7e ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m127_s8_x, svint8_t, ++ z0 = svbic_n_s8_x (p0, z0, -127), ++ z0 = svbic_x (p0, z0, -127)) ++ ++/* ++** bic_m128_s8_x: ++** and z0\.b, z0\.b, #0x7f ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m128_s8_x, svint8_t, ++ z0 = svbic_n_s8_x (p0, z0, -128), ++ z0 = svbic_x (p0, z0, -128)) ++ ++/* ++** bic_5_s8_x: ++** mov (z[0-9]+)\.b, #-6 ++** and z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (bic_5_s8_x, svint8_t, ++ z0 = svbic_n_s8_x (p0, z0, 5), ++ z0 = svbic_x (p0, z0, 5)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_u16.c +new file mode 100644 +index 000000000..30209ffb4 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_u16.c +@@ -0,0 +1,367 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** bic_u16_m_tied1: ++** bic z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (bic_u16_m_tied1, svuint16_t, ++ z0 = svbic_u16_m (p0, z0, z1), ++ z0 = svbic_m (p0, z0, z1)) ++ ++/* ++** bic_u16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** bic z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (bic_u16_m_tied2, svuint16_t, ++ z0 = svbic_u16_m (p0, z1, z0), ++ z0 = svbic_m (p0, z1, z0)) ++ ++/* ++** bic_u16_m_untied: ++** movprfx z0, z1 ++** bic z0\.h, p0/m, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (bic_u16_m_untied, svuint16_t, ++ z0 = 
svbic_u16_m (p0, z1, z2), ++ z0 = svbic_m (p0, z1, z2)) ++ ++/* ++** bic_w0_u16_m_tied1: ++** mov (z[0-9]+\.h), w0 ++** bic z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (bic_w0_u16_m_tied1, svuint16_t, uint16_t, ++ z0 = svbic_n_u16_m (p0, z0, x0), ++ z0 = svbic_m (p0, z0, x0)) ++ ++/* ++** bic_w0_u16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), w0 ++** movprfx z0, z1 ++** bic z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (bic_w0_u16_m_untied, svuint16_t, uint16_t, ++ z0 = svbic_n_u16_m (p0, z1, x0), ++ z0 = svbic_m (p0, z1, x0)) ++ ++/* ++** bic_1_u16_m_tied1: ++** mov (z[0-9]+\.h), #-2 ++** and z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_1_u16_m_tied1, svuint16_t, ++ z0 = svbic_n_u16_m (p0, z0, 1), ++ z0 = svbic_m (p0, z0, 1)) ++ ++/* ++** bic_1_u16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), #-2 ++** movprfx z0, z1 ++** and z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_1_u16_m_untied, svuint16_t, ++ z0 = svbic_n_u16_m (p0, z1, 1), ++ z0 = svbic_m (p0, z1, 1)) ++ ++/* ++** bic_m2_u16_m: ++** mov (z[0-9]+\.h), #1 ++** and z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m2_u16_m, svuint16_t, ++ z0 = svbic_n_u16_m (p0, z0, -2), ++ z0 = svbic_m (p0, z0, -2)) ++ ++/* ++** bic_u16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** bic z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (bic_u16_z_tied1, svuint16_t, ++ z0 = svbic_u16_z (p0, z0, z1), ++ z0 = svbic_z (p0, z0, z1)) ++ ++/* ++** bic_u16_z_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.h, p0/z, z1\.h ++** bic z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (bic_u16_z_tied2, svuint16_t, ++ z0 = svbic_u16_z (p0, z1, z0), ++ z0 = svbic_z (p0, z1, z0)) ++ ++/* ++** bic_u16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** bic z0\.h, p0/m, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (bic_u16_z_untied, svuint16_t, ++ z0 = svbic_u16_z (p0, z1, z2), ++ z0 = svbic_z (p0, z1, z2)) ++ ++/* ++** bic_w0_u16_z_tied1: ++** mov (z[0-9]+\.h), w0 ++** movprfx z0\.h, p0/z, z0\.h ++** bic z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (bic_w0_u16_z_tied1, svuint16_t, uint16_t, ++ z0 = svbic_n_u16_z (p0, z0, x0), ++ z0 = svbic_z (p0, z0, x0)) ++ ++/* ++** bic_w0_u16_z_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), w0 ++** movprfx z0\.h, p0/z, z1\.h ++** bic z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (bic_w0_u16_z_untied, svuint16_t, uint16_t, ++ z0 = svbic_n_u16_z (p0, z1, x0), ++ z0 = svbic_z (p0, z1, x0)) ++ ++/* ++** bic_1_u16_z_tied1: ++** mov (z[0-9]+\.h), #-2 ++** movprfx z0\.h, p0/z, z0\.h ++** and z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_1_u16_z_tied1, svuint16_t, ++ z0 = svbic_n_u16_z (p0, z0, 1), ++ z0 = svbic_z (p0, z0, 1)) ++ ++/* ++** bic_1_u16_z_untied: ++** mov (z[0-9]+\.h), #-2 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** and z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** and z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (bic_1_u16_z_untied, svuint16_t, ++ z0 = svbic_n_u16_z (p0, z1, 1), ++ z0 = svbic_z (p0, z1, 1)) ++ ++/* ++** bic_u16_x_tied1: ++** bic z0\.d, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (bic_u16_x_tied1, svuint16_t, ++ z0 = svbic_u16_x (p0, z0, z1), ++ z0 = svbic_x (p0, z0, z1)) ++ ++/* ++** bic_u16_x_tied2: ++** bic z0\.d, z1\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (bic_u16_x_tied2, svuint16_t, ++ z0 = svbic_u16_x (p0, z1, z0), ++ z0 = svbic_x (p0, z1, z0)) ++ ++/* ++** bic_u16_x_untied: ++** bic z0\.d, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (bic_u16_x_untied, 
svuint16_t, ++ z0 = svbic_u16_x (p0, z1, z2), ++ z0 = svbic_x (p0, z1, z2)) ++ ++/* ++** bic_w0_u16_x_tied1: ++** mov (z[0-9]+)\.h, w0 ++** bic z0\.d, z0\.d, \1\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (bic_w0_u16_x_tied1, svuint16_t, uint16_t, ++ z0 = svbic_n_u16_x (p0, z0, x0), ++ z0 = svbic_x (p0, z0, x0)) ++ ++/* ++** bic_w0_u16_x_untied: ++** mov (z[0-9]+)\.h, w0 ++** bic z0\.d, z1\.d, \1\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (bic_w0_u16_x_untied, svuint16_t, uint16_t, ++ z0 = svbic_n_u16_x (p0, z1, x0), ++ z0 = svbic_x (p0, z1, x0)) ++ ++/* ++** bic_1_u16_x_tied1: ++** and z0\.h, z0\.h, #0xfffe ++** ret ++*/ ++TEST_UNIFORM_Z (bic_1_u16_x_tied1, svuint16_t, ++ z0 = svbic_n_u16_x (p0, z0, 1), ++ z0 = svbic_x (p0, z0, 1)) ++ ++/* ++** bic_1_u16_x_untied: ++** movprfx z0, z1 ++** and z0\.h, z0\.h, #0xfffe ++** ret ++*/ ++TEST_UNIFORM_Z (bic_1_u16_x_untied, svuint16_t, ++ z0 = svbic_n_u16_x (p0, z1, 1), ++ z0 = svbic_x (p0, z1, 1)) ++ ++/* ++** bic_127_u16_x: ++** and z0\.h, z0\.h, #0xff80 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_127_u16_x, svuint16_t, ++ z0 = svbic_n_u16_x (p0, z0, 127), ++ z0 = svbic_x (p0, z0, 127)) ++ ++/* ++** bic_128_u16_x: ++** and z0\.h, z0\.h, #0xff7f ++** ret ++*/ ++TEST_UNIFORM_Z (bic_128_u16_x, svuint16_t, ++ z0 = svbic_n_u16_x (p0, z0, 128), ++ z0 = svbic_x (p0, z0, 128)) ++ ++/* ++** bic_255_u16_x: ++** and z0\.h, z0\.h, #0xff00 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_255_u16_x, svuint16_t, ++ z0 = svbic_n_u16_x (p0, z0, 255), ++ z0 = svbic_x (p0, z0, 255)) ++ ++/* ++** bic_256_u16_x: ++** and z0\.h, z0\.h, #0xfeff ++** ret ++*/ ++TEST_UNIFORM_Z (bic_256_u16_x, svuint16_t, ++ z0 = svbic_n_u16_x (p0, z0, 256), ++ z0 = svbic_x (p0, z0, 256)) ++ ++/* ++** bic_257_u16_x: ++** and z0\.h, z0\.h, #0xfefe ++** ret ++*/ ++TEST_UNIFORM_Z (bic_257_u16_x, svuint16_t, ++ z0 = svbic_n_u16_x (p0, z0, 257), ++ z0 = svbic_x (p0, z0, 257)) ++ ++/* ++** bic_512_u16_x: ++** and z0\.h, z0\.h, #0xfdff ++** ret ++*/ ++TEST_UNIFORM_Z (bic_512_u16_x, svuint16_t, ++ z0 = svbic_n_u16_x (p0, z0, 512), ++ z0 = svbic_x (p0, z0, 512)) ++ ++/* ++** bic_65280_u16_x: ++** and z0\.h, z0\.h, #0xff ++** ret ++*/ ++TEST_UNIFORM_Z (bic_65280_u16_x, svuint16_t, ++ z0 = svbic_n_u16_x (p0, z0, 0xff00), ++ z0 = svbic_x (p0, z0, 0xff00)) ++ ++/* ++** bic_m127_u16_x: ++** and z0\.h, z0\.h, #0x7e ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m127_u16_x, svuint16_t, ++ z0 = svbic_n_u16_x (p0, z0, -127), ++ z0 = svbic_x (p0, z0, -127)) ++ ++/* ++** bic_m128_u16_x: ++** and z0\.h, z0\.h, #0x7f ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m128_u16_x, svuint16_t, ++ z0 = svbic_n_u16_x (p0, z0, -128), ++ z0 = svbic_x (p0, z0, -128)) ++ ++/* ++** bic_m255_u16_x: ++** and z0\.h, z0\.h, #0xfe ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m255_u16_x, svuint16_t, ++ z0 = svbic_n_u16_x (p0, z0, -255), ++ z0 = svbic_x (p0, z0, -255)) ++ ++/* ++** bic_m256_u16_x: ++** and z0\.h, z0\.h, #0xff ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m256_u16_x, svuint16_t, ++ z0 = svbic_n_u16_x (p0, z0, -256), ++ z0 = svbic_x (p0, z0, -256)) ++ ++/* ++** bic_m257_u16_x: ++** and z0\.h, z0\.h, #0x100 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m257_u16_x, svuint16_t, ++ z0 = svbic_n_u16_x (p0, z0, -257), ++ z0 = svbic_x (p0, z0, -257)) ++ ++/* ++** bic_m512_u16_x: ++** and z0\.h, z0\.h, #0x1ff ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m512_u16_x, svuint16_t, ++ z0 = svbic_n_u16_x (p0, z0, -512), ++ z0 = svbic_x (p0, z0, -512)) ++ ++/* ++** bic_m32768_u16_x: ++** and z0\.h, z0\.h, #0x7fff ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m32768_u16_x, svuint16_t, ++ z0 = svbic_n_u16_x (p0, z0, -0x8000), ++ z0 = 
svbic_x (p0, z0, -0x8000)) ++ ++/* ++** bic_5_u16_x: ++** mov (z[0-9]+)\.h, #-6 ++** and z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (bic_5_u16_x, svuint16_t, ++ z0 = svbic_n_u16_x (p0, z0, 5), ++ z0 = svbic_x (p0, z0, 5)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_u32.c +new file mode 100644 +index 000000000..b308b599b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_u32.c +@@ -0,0 +1,363 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** bic_u32_m_tied1: ++** bic z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (bic_u32_m_tied1, svuint32_t, ++ z0 = svbic_u32_m (p0, z0, z1), ++ z0 = svbic_m (p0, z0, z1)) ++ ++/* ++** bic_u32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** bic z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (bic_u32_m_tied2, svuint32_t, ++ z0 = svbic_u32_m (p0, z1, z0), ++ z0 = svbic_m (p0, z1, z0)) ++ ++/* ++** bic_u32_m_untied: ++** movprfx z0, z1 ++** bic z0\.s, p0/m, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (bic_u32_m_untied, svuint32_t, ++ z0 = svbic_u32_m (p0, z1, z2), ++ z0 = svbic_m (p0, z1, z2)) ++ ++/* ++** bic_w0_u32_m_tied1: ++** mov (z[0-9]+\.s), w0 ++** bic z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (bic_w0_u32_m_tied1, svuint32_t, uint32_t, ++ z0 = svbic_n_u32_m (p0, z0, x0), ++ z0 = svbic_m (p0, z0, x0)) ++ ++/* ++** bic_w0_u32_m_untied: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0, z1 ++** bic z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (bic_w0_u32_m_untied, svuint32_t, uint32_t, ++ z0 = svbic_n_u32_m (p0, z1, x0), ++ z0 = svbic_m (p0, z1, x0)) ++ ++/* ++** bic_1_u32_m_tied1: ++** mov (z[0-9]+\.s), #-2 ++** and z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_1_u32_m_tied1, svuint32_t, ++ z0 = svbic_n_u32_m (p0, z0, 1), ++ z0 = svbic_m (p0, z0, 1)) ++ ++/* ++** bic_1_u32_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.s), #-2 ++** movprfx z0, z1 ++** and z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_1_u32_m_untied, svuint32_t, ++ z0 = svbic_n_u32_m (p0, z1, 1), ++ z0 = svbic_m (p0, z1, 1)) ++ ++/* ++** bic_m2_u32_m: ++** mov (z[0-9]+\.s), #1 ++** and z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m2_u32_m, svuint32_t, ++ z0 = svbic_n_u32_m (p0, z0, -2), ++ z0 = svbic_m (p0, z0, -2)) ++ ++/* ++** bic_u32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** bic z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (bic_u32_z_tied1, svuint32_t, ++ z0 = svbic_u32_z (p0, z0, z1), ++ z0 = svbic_z (p0, z0, z1)) ++ ++/* ++** bic_u32_z_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, z1\.s ++** bic z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (bic_u32_z_tied2, svuint32_t, ++ z0 = svbic_u32_z (p0, z1, z0), ++ z0 = svbic_z (p0, z1, z0)) ++ ++/* ++** bic_u32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** bic z0\.s, p0/m, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (bic_u32_z_untied, svuint32_t, ++ z0 = svbic_u32_z (p0, z1, z2), ++ z0 = svbic_z (p0, z1, z2)) ++ ++/* ++** bic_w0_u32_z_tied1: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0\.s, p0/z, z0\.s ++** bic z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (bic_w0_u32_z_tied1, svuint32_t, uint32_t, ++ z0 = svbic_n_u32_z (p0, z0, x0), ++ z0 = svbic_z (p0, z0, x0)) ++ ++/* ++** bic_w0_u32_z_untied: { xfail *-*-* } ++** mov (z[0-9]+\.s), w0 ++** movprfx z0\.s, p0/z, z1\.s ++** bic z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ 
++TEST_UNIFORM_ZX (bic_w0_u32_z_untied, svuint32_t, uint32_t, ++ z0 = svbic_n_u32_z (p0, z1, x0), ++ z0 = svbic_z (p0, z1, x0)) ++ ++/* ++** bic_1_u32_z_tied1: ++** mov (z[0-9]+\.s), #-2 ++** movprfx z0\.s, p0/z, z0\.s ++** and z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_1_u32_z_tied1, svuint32_t, ++ z0 = svbic_n_u32_z (p0, z0, 1), ++ z0 = svbic_z (p0, z0, 1)) ++ ++/* ++** bic_1_u32_z_untied: ++** mov (z[0-9]+\.s), #-2 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** and z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** and z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (bic_1_u32_z_untied, svuint32_t, ++ z0 = svbic_n_u32_z (p0, z1, 1), ++ z0 = svbic_z (p0, z1, 1)) ++ ++/* ++** bic_u32_x_tied1: ++** bic z0\.d, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (bic_u32_x_tied1, svuint32_t, ++ z0 = svbic_u32_x (p0, z0, z1), ++ z0 = svbic_x (p0, z0, z1)) ++ ++/* ++** bic_u32_x_tied2: ++** bic z0\.d, z1\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (bic_u32_x_tied2, svuint32_t, ++ z0 = svbic_u32_x (p0, z1, z0), ++ z0 = svbic_x (p0, z1, z0)) ++ ++/* ++** bic_u32_x_untied: ++** bic z0\.d, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (bic_u32_x_untied, svuint32_t, ++ z0 = svbic_u32_x (p0, z1, z2), ++ z0 = svbic_x (p0, z1, z2)) ++ ++/* ++** bic_w0_u32_x_tied1: ++** mov (z[0-9]+)\.s, w0 ++** bic z0\.d, z0\.d, \1\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (bic_w0_u32_x_tied1, svuint32_t, uint32_t, ++ z0 = svbic_n_u32_x (p0, z0, x0), ++ z0 = svbic_x (p0, z0, x0)) ++ ++/* ++** bic_w0_u32_x_untied: ++** mov (z[0-9]+)\.s, w0 ++** bic z0\.d, z1\.d, \1\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (bic_w0_u32_x_untied, svuint32_t, uint32_t, ++ z0 = svbic_n_u32_x (p0, z1, x0), ++ z0 = svbic_x (p0, z1, x0)) ++ ++/* ++** bic_1_u32_x_tied1: ++** and z0\.s, z0\.s, #0xfffffffe ++** ret ++*/ ++TEST_UNIFORM_Z (bic_1_u32_x_tied1, svuint32_t, ++ z0 = svbic_n_u32_x (p0, z0, 1), ++ z0 = svbic_x (p0, z0, 1)) ++ ++/* ++** bic_1_u32_x_untied: ++** movprfx z0, z1 ++** and z0\.s, z0\.s, #0xfffffffe ++** ret ++*/ ++TEST_UNIFORM_Z (bic_1_u32_x_untied, svuint32_t, ++ z0 = svbic_n_u32_x (p0, z1, 1), ++ z0 = svbic_x (p0, z1, 1)) ++ ++/* ++** bic_127_u32_x: ++** and z0\.s, z0\.s, #0xffffff80 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_127_u32_x, svuint32_t, ++ z0 = svbic_n_u32_x (p0, z0, 127), ++ z0 = svbic_x (p0, z0, 127)) ++ ++/* ++** bic_128_u32_x: ++** and z0\.s, z0\.s, #0xffffff7f ++** ret ++*/ ++TEST_UNIFORM_Z (bic_128_u32_x, svuint32_t, ++ z0 = svbic_n_u32_x (p0, z0, 128), ++ z0 = svbic_x (p0, z0, 128)) ++ ++/* ++** bic_255_u32_x: ++** and z0\.s, z0\.s, #0xffffff00 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_255_u32_x, svuint32_t, ++ z0 = svbic_n_u32_x (p0, z0, 255), ++ z0 = svbic_x (p0, z0, 255)) ++ ++/* ++** bic_256_u32_x: ++** and z0\.s, z0\.s, #0xfffffeff ++** ret ++*/ ++TEST_UNIFORM_Z (bic_256_u32_x, svuint32_t, ++ z0 = svbic_n_u32_x (p0, z0, 256), ++ z0 = svbic_x (p0, z0, 256)) ++ ++/* TODO: Bad code and needs fixing. 
*/ ++TEST_UNIFORM_Z (bic_257_u32_x, svuint32_t, ++ z0 = svbic_n_u32_x (p0, z0, 257), ++ z0 = svbic_x (p0, z0, 257)) ++ ++/* ++** bic_512_u32_x: ++** and z0\.s, z0\.s, #0xfffffdff ++** ret ++*/ ++TEST_UNIFORM_Z (bic_512_u32_x, svuint32_t, ++ z0 = svbic_n_u32_x (p0, z0, 512), ++ z0 = svbic_x (p0, z0, 512)) ++ ++/* ++** bic_65280_u32_x: ++** and z0\.s, z0\.s, #0xffff00ff ++** ret ++*/ ++TEST_UNIFORM_Z (bic_65280_u32_x, svuint32_t, ++ z0 = svbic_n_u32_x (p0, z0, 0xff00), ++ z0 = svbic_x (p0, z0, 0xff00)) ++ ++/* ++** bic_m127_u32_x: ++** and z0\.s, z0\.s, #0x7e ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m127_u32_x, svuint32_t, ++ z0 = svbic_n_u32_x (p0, z0, -127), ++ z0 = svbic_x (p0, z0, -127)) ++ ++/* ++** bic_m128_u32_x: ++** and z0\.s, z0\.s, #0x7f ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m128_u32_x, svuint32_t, ++ z0 = svbic_n_u32_x (p0, z0, -128), ++ z0 = svbic_x (p0, z0, -128)) ++ ++/* ++** bic_m255_u32_x: ++** and z0\.s, z0\.s, #0xfe ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m255_u32_x, svuint32_t, ++ z0 = svbic_n_u32_x (p0, z0, -255), ++ z0 = svbic_x (p0, z0, -255)) ++ ++/* ++** bic_m256_u32_x: ++** and z0\.s, z0\.s, #0xff ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m256_u32_x, svuint32_t, ++ z0 = svbic_n_u32_x (p0, z0, -256), ++ z0 = svbic_x (p0, z0, -256)) ++ ++/* ++** bic_m257_u32_x: ++** and z0\.s, z0\.s, #0x100 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m257_u32_x, svuint32_t, ++ z0 = svbic_n_u32_x (p0, z0, -257), ++ z0 = svbic_x (p0, z0, -257)) ++ ++/* ++** bic_m512_u32_x: ++** and z0\.s, z0\.s, #0x1ff ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m512_u32_x, svuint32_t, ++ z0 = svbic_n_u32_x (p0, z0, -512), ++ z0 = svbic_x (p0, z0, -512)) ++ ++/* ++** bic_m32768_u32_x: ++** and z0\.s, z0\.s, #0x7fff ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m32768_u32_x, svuint32_t, ++ z0 = svbic_n_u32_x (p0, z0, -0x8000), ++ z0 = svbic_x (p0, z0, -0x8000)) ++ ++/* ++** bic_5_u32_x: ++** mov (z[0-9]+)\.s, #-6 ++** and z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (bic_5_u32_x, svuint32_t, ++ z0 = svbic_n_u32_x (p0, z0, 5), ++ z0 = svbic_x (p0, z0, 5)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_u64.c +new file mode 100644 +index 000000000..e82db1e94 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_u64.c +@@ -0,0 +1,363 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** bic_u64_m_tied1: ++** bic z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (bic_u64_m_tied1, svuint64_t, ++ z0 = svbic_u64_m (p0, z0, z1), ++ z0 = svbic_m (p0, z0, z1)) ++ ++/* ++** bic_u64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** bic z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_u64_m_tied2, svuint64_t, ++ z0 = svbic_u64_m (p0, z1, z0), ++ z0 = svbic_m (p0, z1, z0)) ++ ++/* ++** bic_u64_m_untied: ++** movprfx z0, z1 ++** bic z0\.d, p0/m, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (bic_u64_m_untied, svuint64_t, ++ z0 = svbic_u64_m (p0, z1, z2), ++ z0 = svbic_m (p0, z1, z2)) ++ ++/* ++** bic_x0_u64_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** bic z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (bic_x0_u64_m_tied1, svuint64_t, uint64_t, ++ z0 = svbic_n_u64_m (p0, z0, x0), ++ z0 = svbic_m (p0, z0, x0)) ++ ++/* ++** bic_x0_u64_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** bic z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (bic_x0_u64_m_untied, svuint64_t, uint64_t, ++ z0 = svbic_n_u64_m (p0, z1, x0), ++ z0 = svbic_m (p0, z1, x0)) 
++ ++/* ++** bic_1_u64_m_tied1: ++** mov (z[0-9]+\.d), #-2 ++** and z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_1_u64_m_tied1, svuint64_t, ++ z0 = svbic_n_u64_m (p0, z0, 1), ++ z0 = svbic_m (p0, z0, 1)) ++ ++/* ++** bic_1_u64_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #-2 ++** movprfx z0, z1 ++** and z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_1_u64_m_untied, svuint64_t, ++ z0 = svbic_n_u64_m (p0, z1, 1), ++ z0 = svbic_m (p0, z1, 1)) ++ ++/* ++** bic_m2_u64_m: ++** mov (z[0-9]+\.d), #1 ++** and z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m2_u64_m, svuint64_t, ++ z0 = svbic_n_u64_m (p0, z0, -2), ++ z0 = svbic_m (p0, z0, -2)) ++ ++/* ++** bic_u64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** bic z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (bic_u64_z_tied1, svuint64_t, ++ z0 = svbic_u64_z (p0, z0, z1), ++ z0 = svbic_z (p0, z0, z1)) ++ ++/* ++** bic_u64_z_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, z1\.d ++** bic z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_u64_z_tied2, svuint64_t, ++ z0 = svbic_u64_z (p0, z1, z0), ++ z0 = svbic_z (p0, z1, z0)) ++ ++/* ++** bic_u64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** bic z0\.d, p0/m, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (bic_u64_z_untied, svuint64_t, ++ z0 = svbic_u64_z (p0, z1, z2), ++ z0 = svbic_z (p0, z1, z2)) ++ ++/* ++** bic_x0_u64_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z0\.d ++** bic z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (bic_x0_u64_z_tied1, svuint64_t, uint64_t, ++ z0 = svbic_n_u64_z (p0, z0, x0), ++ z0 = svbic_z (p0, z0, x0)) ++ ++/* ++** bic_x0_u64_z_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z1\.d ++** bic z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (bic_x0_u64_z_untied, svuint64_t, uint64_t, ++ z0 = svbic_n_u64_z (p0, z1, x0), ++ z0 = svbic_z (p0, z1, x0)) ++ ++/* ++** bic_1_u64_z_tied1: ++** mov (z[0-9]+\.d), #-2 ++** movprfx z0\.d, p0/z, z0\.d ++** and z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_1_u64_z_tied1, svuint64_t, ++ z0 = svbic_n_u64_z (p0, z0, 1), ++ z0 = svbic_z (p0, z0, 1)) ++ ++/* ++** bic_1_u64_z_untied: ++** mov (z[0-9]+\.d), #-2 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** and z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** and z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (bic_1_u64_z_untied, svuint64_t, ++ z0 = svbic_n_u64_z (p0, z1, 1), ++ z0 = svbic_z (p0, z1, 1)) ++ ++/* ++** bic_u64_x_tied1: ++** bic z0\.d, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (bic_u64_x_tied1, svuint64_t, ++ z0 = svbic_u64_x (p0, z0, z1), ++ z0 = svbic_x (p0, z0, z1)) ++ ++/* ++** bic_u64_x_tied2: ++** bic z0\.d, z1\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (bic_u64_x_tied2, svuint64_t, ++ z0 = svbic_u64_x (p0, z1, z0), ++ z0 = svbic_x (p0, z1, z0)) ++ ++/* ++** bic_u64_x_untied: ++** bic z0\.d, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (bic_u64_x_untied, svuint64_t, ++ z0 = svbic_u64_x (p0, z1, z2), ++ z0 = svbic_x (p0, z1, z2)) ++ ++/* ++** bic_x0_u64_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** bic z0\.d, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (bic_x0_u64_x_tied1, svuint64_t, uint64_t, ++ z0 = svbic_n_u64_x (p0, z0, x0), ++ z0 = svbic_x (p0, z0, x0)) ++ ++/* ++** bic_x0_u64_x_untied: ++** mov (z[0-9]+\.d), x0 ++** bic z0\.d, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (bic_x0_u64_x_untied, svuint64_t, uint64_t, ++ z0 = svbic_n_u64_x (p0, z1, x0), ++ z0 = svbic_x (p0, z1, x0)) ++ ++/* ++** bic_1_u64_x_tied1: ++** 
and z0\.d, z0\.d, #0xfffffffffffffffe ++** ret ++*/ ++TEST_UNIFORM_Z (bic_1_u64_x_tied1, svuint64_t, ++ z0 = svbic_n_u64_x (p0, z0, 1), ++ z0 = svbic_x (p0, z0, 1)) ++ ++/* ++** bic_1_u64_x_untied: ++** movprfx z0, z1 ++** and z0\.d, z0\.d, #0xfffffffffffffffe ++** ret ++*/ ++TEST_UNIFORM_Z (bic_1_u64_x_untied, svuint64_t, ++ z0 = svbic_n_u64_x (p0, z1, 1), ++ z0 = svbic_x (p0, z1, 1)) ++ ++/* ++** bic_127_u64_x: ++** and z0\.d, z0\.d, #0xffffffffffffff80 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_127_u64_x, svuint64_t, ++ z0 = svbic_n_u64_x (p0, z0, 127), ++ z0 = svbic_x (p0, z0, 127)) ++ ++/* ++** bic_128_u64_x: ++** and z0\.d, z0\.d, #0xffffffffffffff7f ++** ret ++*/ ++TEST_UNIFORM_Z (bic_128_u64_x, svuint64_t, ++ z0 = svbic_n_u64_x (p0, z0, 128), ++ z0 = svbic_x (p0, z0, 128)) ++ ++/* ++** bic_255_u64_x: ++** and z0\.d, z0\.d, #0xffffffffffffff00 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_255_u64_x, svuint64_t, ++ z0 = svbic_n_u64_x (p0, z0, 255), ++ z0 = svbic_x (p0, z0, 255)) ++ ++/* ++** bic_256_u64_x: ++** and z0\.d, z0\.d, #0xfffffffffffffeff ++** ret ++*/ ++TEST_UNIFORM_Z (bic_256_u64_x, svuint64_t, ++ z0 = svbic_n_u64_x (p0, z0, 256), ++ z0 = svbic_x (p0, z0, 256)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (bic_257_u64_x, svuint64_t, ++ z0 = svbic_n_u64_x (p0, z0, 257), ++ z0 = svbic_x (p0, z0, 257)) ++ ++/* ++** bic_512_u64_x: ++** and z0\.d, z0\.d, #0xfffffffffffffdff ++** ret ++*/ ++TEST_UNIFORM_Z (bic_512_u64_x, svuint64_t, ++ z0 = svbic_n_u64_x (p0, z0, 512), ++ z0 = svbic_x (p0, z0, 512)) ++ ++/* ++** bic_65280_u64_x: ++** and z0\.d, z0\.d, #0xffffffffffff00ff ++** ret ++*/ ++TEST_UNIFORM_Z (bic_65280_u64_x, svuint64_t, ++ z0 = svbic_n_u64_x (p0, z0, 0xff00), ++ z0 = svbic_x (p0, z0, 0xff00)) ++ ++/* ++** bic_m127_u64_x: ++** and z0\.d, z0\.d, #0x7e ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m127_u64_x, svuint64_t, ++ z0 = svbic_n_u64_x (p0, z0, -127), ++ z0 = svbic_x (p0, z0, -127)) ++ ++/* ++** bic_m128_u64_x: ++** and z0\.d, z0\.d, #0x7f ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m128_u64_x, svuint64_t, ++ z0 = svbic_n_u64_x (p0, z0, -128), ++ z0 = svbic_x (p0, z0, -128)) ++ ++/* ++** bic_m255_u64_x: ++** and z0\.d, z0\.d, #0xfe ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m255_u64_x, svuint64_t, ++ z0 = svbic_n_u64_x (p0, z0, -255), ++ z0 = svbic_x (p0, z0, -255)) ++ ++/* ++** bic_m256_u64_x: ++** and z0\.d, z0\.d, #0xff ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m256_u64_x, svuint64_t, ++ z0 = svbic_n_u64_x (p0, z0, -256), ++ z0 = svbic_x (p0, z0, -256)) ++ ++/* ++** bic_m257_u64_x: ++** and z0\.d, z0\.d, #0x100 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m257_u64_x, svuint64_t, ++ z0 = svbic_n_u64_x (p0, z0, -257), ++ z0 = svbic_x (p0, z0, -257)) ++ ++/* ++** bic_m512_u64_x: ++** and z0\.d, z0\.d, #0x1ff ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m512_u64_x, svuint64_t, ++ z0 = svbic_n_u64_x (p0, z0, -512), ++ z0 = svbic_x (p0, z0, -512)) ++ ++/* ++** bic_m32768_u64_x: ++** and z0\.d, z0\.d, #0x7fff ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m32768_u64_x, svuint64_t, ++ z0 = svbic_n_u64_x (p0, z0, -0x8000), ++ z0 = svbic_x (p0, z0, -0x8000)) ++ ++/* ++** bic_5_u64_x: ++** mov (z[0-9]+\.d), #-6 ++** and z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (bic_5_u64_x, svuint64_t, ++ z0 = svbic_n_u64_x (p0, z0, 5), ++ z0 = svbic_x (p0, z0, 5)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_u8.c +new file mode 100644 +index 000000000..80c489b9c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_u8.c +@@ -0,0 +1,286 
@@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** bic_u8_m_tied1: ++** bic z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (bic_u8_m_tied1, svuint8_t, ++ z0 = svbic_u8_m (p0, z0, z1), ++ z0 = svbic_m (p0, z0, z1)) ++ ++/* ++** bic_u8_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** bic z0\.b, p0/m, z0\.b, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (bic_u8_m_tied2, svuint8_t, ++ z0 = svbic_u8_m (p0, z1, z0), ++ z0 = svbic_m (p0, z1, z0)) ++ ++/* ++** bic_u8_m_untied: ++** movprfx z0, z1 ++** bic z0\.b, p0/m, z0\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (bic_u8_m_untied, svuint8_t, ++ z0 = svbic_u8_m (p0, z1, z2), ++ z0 = svbic_m (p0, z1, z2)) ++ ++/* ++** bic_w0_u8_m_tied1: ++** mov (z[0-9]+\.b), w0 ++** bic z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (bic_w0_u8_m_tied1, svuint8_t, uint8_t, ++ z0 = svbic_n_u8_m (p0, z0, x0), ++ z0 = svbic_m (p0, z0, x0)) ++ ++/* ++** bic_w0_u8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), w0 ++** movprfx z0, z1 ++** bic z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (bic_w0_u8_m_untied, svuint8_t, uint8_t, ++ z0 = svbic_n_u8_m (p0, z1, x0), ++ z0 = svbic_m (p0, z1, x0)) ++ ++/* ++** bic_1_u8_m_tied1: ++** mov (z[0-9]+\.b), #-2 ++** and z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_1_u8_m_tied1, svuint8_t, ++ z0 = svbic_n_u8_m (p0, z0, 1), ++ z0 = svbic_m (p0, z0, 1)) ++ ++/* ++** bic_1_u8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), #-2 ++** movprfx z0, z1 ++** and z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_1_u8_m_untied, svuint8_t, ++ z0 = svbic_n_u8_m (p0, z1, 1), ++ z0 = svbic_m (p0, z1, 1)) ++ ++/* ++** bic_m2_u8_m: ++** mov (z[0-9]+\.b), #1 ++** and z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m2_u8_m, svuint8_t, ++ z0 = svbic_n_u8_m (p0, z0, -2), ++ z0 = svbic_m (p0, z0, -2)) ++ ++/* ++** bic_u8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** bic z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (bic_u8_z_tied1, svuint8_t, ++ z0 = svbic_u8_z (p0, z0, z1), ++ z0 = svbic_z (p0, z0, z1)) ++ ++/* ++** bic_u8_z_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.b, p0/z, z1\.b ++** bic z0\.b, p0/m, z0\.b, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (bic_u8_z_tied2, svuint8_t, ++ z0 = svbic_u8_z (p0, z1, z0), ++ z0 = svbic_z (p0, z1, z0)) ++ ++/* ++** bic_u8_z_untied: ++** movprfx z0\.b, p0/z, z1\.b ++** bic z0\.b, p0/m, z0\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (bic_u8_z_untied, svuint8_t, ++ z0 = svbic_u8_z (p0, z1, z2), ++ z0 = svbic_z (p0, z1, z2)) ++ ++/* ++** bic_w0_u8_z_tied1: ++** mov (z[0-9]+\.b), w0 ++** movprfx z0\.b, p0/z, z0\.b ++** bic z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (bic_w0_u8_z_tied1, svuint8_t, uint8_t, ++ z0 = svbic_n_u8_z (p0, z0, x0), ++ z0 = svbic_z (p0, z0, x0)) ++ ++/* ++** bic_w0_u8_z_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), w0 ++** movprfx z0\.b, p0/z, z1\.b ++** bic z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (bic_w0_u8_z_untied, svuint8_t, uint8_t, ++ z0 = svbic_n_u8_z (p0, z1, x0), ++ z0 = svbic_z (p0, z1, x0)) ++ ++/* ++** bic_1_u8_z_tied1: ++** mov (z[0-9]+\.b), #-2 ++** movprfx z0\.b, p0/z, z0\.b ++** and z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_1_u8_z_tied1, svuint8_t, ++ z0 = svbic_n_u8_z (p0, z0, 1), ++ z0 = svbic_z (p0, z0, 1)) ++ ++/* ++** bic_1_u8_z_untied: ++** mov (z[0-9]+\.b), #-2 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** and z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** and z0\.b, p0/m, 
z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (bic_1_u8_z_untied, svuint8_t, ++ z0 = svbic_n_u8_z (p0, z1, 1), ++ z0 = svbic_z (p0, z1, 1)) ++ ++/* ++** bic_u8_x_tied1: ++** bic z0\.d, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (bic_u8_x_tied1, svuint8_t, ++ z0 = svbic_u8_x (p0, z0, z1), ++ z0 = svbic_x (p0, z0, z1)) ++ ++/* ++** bic_u8_x_tied2: ++** bic z0\.d, z1\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (bic_u8_x_tied2, svuint8_t, ++ z0 = svbic_u8_x (p0, z1, z0), ++ z0 = svbic_x (p0, z1, z0)) ++ ++/* ++** bic_u8_x_untied: ++** bic z0\.d, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (bic_u8_x_untied, svuint8_t, ++ z0 = svbic_u8_x (p0, z1, z2), ++ z0 = svbic_x (p0, z1, z2)) ++ ++/* ++** bic_w0_u8_x_tied1: ++** mov (z[0-9]+)\.b, w0 ++** bic z0\.d, z0\.d, \1\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (bic_w0_u8_x_tied1, svuint8_t, uint8_t, ++ z0 = svbic_n_u8_x (p0, z0, x0), ++ z0 = svbic_x (p0, z0, x0)) ++ ++/* ++** bic_w0_u8_x_untied: ++** mov (z[0-9]+)\.b, w0 ++** bic z0\.d, z1\.d, \1\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (bic_w0_u8_x_untied, svuint8_t, uint8_t, ++ z0 = svbic_n_u8_x (p0, z1, x0), ++ z0 = svbic_x (p0, z1, x0)) ++ ++/* ++** bic_1_u8_x_tied1: ++** and z0\.b, z0\.b, #0xfe ++** ret ++*/ ++TEST_UNIFORM_Z (bic_1_u8_x_tied1, svuint8_t, ++ z0 = svbic_n_u8_x (p0, z0, 1), ++ z0 = svbic_x (p0, z0, 1)) ++ ++/* ++** bic_1_u8_x_untied: ++** movprfx z0, z1 ++** and z0\.b, z0\.b, #0xfe ++** ret ++*/ ++TEST_UNIFORM_Z (bic_1_u8_x_untied, svuint8_t, ++ z0 = svbic_n_u8_x (p0, z1, 1), ++ z0 = svbic_x (p0, z1, 1)) ++ ++/* ++** bic_127_u8_x: ++** and z0\.b, z0\.b, #0x80 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_127_u8_x, svuint8_t, ++ z0 = svbic_n_u8_x (p0, z0, 127), ++ z0 = svbic_x (p0, z0, 127)) ++ ++/* ++** bic_128_u8_x: ++** and z0\.b, z0\.b, #0x7f ++** ret ++*/ ++TEST_UNIFORM_Z (bic_128_u8_x, svuint8_t, ++ z0 = svbic_n_u8_x (p0, z0, 128), ++ z0 = svbic_x (p0, z0, 128)) ++ ++/* ++** bic_255_u8_x: ++** mov z0\.b, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (bic_255_u8_x, svuint8_t, ++ z0 = svbic_n_u8_x (p0, z0, 255), ++ z0 = svbic_x (p0, z0, 255)) ++ ++/* ++** bic_m127_u8_x: ++** and z0\.b, z0\.b, #0x7e ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m127_u8_x, svuint8_t, ++ z0 = svbic_n_u8_x (p0, z0, -127), ++ z0 = svbic_x (p0, z0, -127)) ++ ++/* ++** bic_m128_u8_x: ++** and z0\.b, z0\.b, #0x7f ++** ret ++*/ ++TEST_UNIFORM_Z (bic_m128_u8_x, svuint8_t, ++ z0 = svbic_n_u8_x (p0, z0, -128), ++ z0 = svbic_x (p0, z0, -128)) ++ ++/* ++** bic_5_u8_x: ++** mov (z[0-9]+)\.b, #-6 ++** and z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (bic_5_u8_x, svuint8_t, ++ z0 = svbic_n_u8_x (p0, z0, 5), ++ z0 = svbic_x (p0, z0, 5)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/brka_b.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/brka_b.c +new file mode 100644 +index 000000000..63426cf94 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/brka_b.c +@@ -0,0 +1,54 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** brka_b_m_tied12: ++** brka p0\.b, p3/m, p0\.b ++** ret ++*/ ++TEST_UNIFORM_P (brka_b_m_tied12, ++ p0 = svbrka_b_m (p0, p3, p0), ++ p0 = svbrka_m (p0, p3, p0)) ++ ++/* ++** brka_b_m_tied1: ++** brka p0\.b, p3/m, p1\.b ++** ret ++*/ ++TEST_UNIFORM_P (brka_b_m_tied1, ++ p0 = svbrka_b_m (p0, p3, p1), ++ p0 = svbrka_m (p0, p3, p1)) ++ ++/* Bad RA choice: no preferred output sequence. 
*/ ++TEST_UNIFORM_P (brka_b_m_tied2, ++ p0 = svbrka_b_m (p1, p3, p0), ++ p0 = svbrka_m (p1, p3, p0)) ++ ++/* ++** brka_b_m_untied: ++** mov p0\.b, p2\.b ++** brka p0\.b, p3/m, p1\.b ++** ret ++*/ ++TEST_UNIFORM_P (brka_b_m_untied, ++ p0 = svbrka_b_m (p2, p3, p1), ++ p0 = svbrka_m (p2, p3, p1)) ++ ++/* ++** brka_b_z_tied1: ++** brka p0\.b, p3/z, p0\.b ++** ret ++*/ ++TEST_UNIFORM_P (brka_b_z_tied1, ++ p0 = svbrka_b_z (p3, p0), ++ p0 = svbrka_z (p3, p0)) ++ ++/* ++** brka_b_z_untied: ++** brka p0\.b, p3/z, p1\.b ++** ret ++*/ ++TEST_UNIFORM_P (brka_b_z_untied, ++ p0 = svbrka_b_z (p3, p1), ++ p0 = svbrka_z (p3, p1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/brkb_b.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/brkb_b.c +new file mode 100644 +index 000000000..4f9a2c2d7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/brkb_b.c +@@ -0,0 +1,54 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** brkb_b_m_tied12: ++** brkb p0\.b, p3/m, p0\.b ++** ret ++*/ ++TEST_UNIFORM_P (brkb_b_m_tied12, ++ p0 = svbrkb_b_m (p0, p3, p0), ++ p0 = svbrkb_m (p0, p3, p0)) ++ ++/* ++** brkb_b_m_tied1: ++** brkb p0\.b, p3/m, p1\.b ++** ret ++*/ ++TEST_UNIFORM_P (brkb_b_m_tied1, ++ p0 = svbrkb_b_m (p0, p3, p1), ++ p0 = svbrkb_m (p0, p3, p1)) ++ ++/* Bad RA choice: no preferred output sequence. */ ++TEST_UNIFORM_P (brkb_b_m_tied2, ++ p0 = svbrkb_b_m (p1, p3, p0), ++ p0 = svbrkb_m (p1, p3, p0)) ++ ++/* ++** brkb_b_m_untied: ++** mov p0\.b, p2\.b ++** brkb p0\.b, p3/m, p1\.b ++** ret ++*/ ++TEST_UNIFORM_P (brkb_b_m_untied, ++ p0 = svbrkb_b_m (p2, p3, p1), ++ p0 = svbrkb_m (p2, p3, p1)) ++ ++/* ++** brkb_b_z_tied1: ++** brkb p0\.b, p3/z, p0\.b ++** ret ++*/ ++TEST_UNIFORM_P (brkb_b_z_tied1, ++ p0 = svbrkb_b_z (p3, p0), ++ p0 = svbrkb_z (p3, p0)) ++ ++/* ++** brkb_b_z_untied: ++** brkb p0\.b, p3/z, p1\.b ++** ret ++*/ ++TEST_UNIFORM_P (brkb_b_z_untied, ++ p0 = svbrkb_b_z (p3, p1), ++ p0 = svbrkb_z (p3, p1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/brkn_b.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/brkn_b.c +new file mode 100644 +index 000000000..229a5fff9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/brkn_b.c +@@ -0,0 +1,27 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* Bad RA choice: no preferred output sequence. 
*/ ++TEST_UNIFORM_P (brkn_b_z_tied1, ++ p0 = svbrkn_b_z (p3, p0, p1), ++ p0 = svbrkn_z (p3, p0, p1)) ++ ++/* ++** brkn_b_z_tied2: ++** brkn p0\.b, p3/z, p1\.b, p0\.b ++** ret ++*/ ++TEST_UNIFORM_P (brkn_b_z_tied2, ++ p0 = svbrkn_b_z (p3, p1, p0), ++ p0 = svbrkn_z (p3, p1, p0)) ++ ++/* ++** brkn_b_z_untied: ++** mov p0\.b, p2\.b ++** brkn p0\.b, p3/z, p1\.b, p0\.b ++** ret ++*/ ++TEST_UNIFORM_P (brkn_b_z_untied, ++ p0 = svbrkn_b_z (p3, p1, p2), ++ p0 = svbrkn_z (p3, p1, p2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/brkpa_b.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/brkpa_b.c +new file mode 100644 +index 000000000..2c074e389 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/brkpa_b.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** brkpa_b_z_tied1: ++** brkpa p0\.b, p3/z, p0\.b, p1\.b ++** ret ++*/ ++TEST_UNIFORM_P (brkpa_b_z_tied1, ++ p0 = svbrkpa_b_z (p3, p0, p1), ++ p0 = svbrkpa_z (p3, p0, p1)) ++ ++/* ++** brkpa_b_z_tied2: ++** brkpa p0\.b, p3/z, p1\.b, p0\.b ++** ret ++*/ ++TEST_UNIFORM_P (brkpa_b_z_tied2, ++ p0 = svbrkpa_b_z (p3, p1, p0), ++ p0 = svbrkpa_z (p3, p1, p0)) ++ ++/* ++** brkpa_b_z_untied: ++** brkpa p0\.b, p3/z, p1\.b, p2\.b ++** ret ++*/ ++TEST_UNIFORM_P (brkpa_b_z_untied, ++ p0 = svbrkpa_b_z (p3, p1, p2), ++ p0 = svbrkpa_z (p3, p1, p2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/brkpb_b.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/brkpb_b.c +new file mode 100644 +index 000000000..b41797ee1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/brkpb_b.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** brkpb_b_z_tied1: ++** brkpb p0\.b, p3/z, p0\.b, p1\.b ++** ret ++*/ ++TEST_UNIFORM_P (brkpb_b_z_tied1, ++ p0 = svbrkpb_b_z (p3, p0, p1), ++ p0 = svbrkpb_z (p3, p0, p1)) ++ ++/* ++** brkpb_b_z_tied2: ++** brkpb p0\.b, p3/z, p1\.b, p0\.b ++** ret ++*/ ++TEST_UNIFORM_P (brkpb_b_z_tied2, ++ p0 = svbrkpb_b_z (p3, p1, p0), ++ p0 = svbrkpb_z (p3, p1, p0)) ++ ++/* ++** brkpb_b_z_untied: ++** brkpb p0\.b, p3/z, p1\.b, p2\.b ++** ret ++*/ ++TEST_UNIFORM_P (brkpb_b_z_untied, ++ p0 = svbrkpb_b_z (p3, p1, p2), ++ p0 = svbrkpb_z (p3, p1, p2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cadd_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cadd_f16.c +new file mode 100644 +index 000000000..e89c78455 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cadd_f16.c +@@ -0,0 +1,251 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cadd_90_f16_m_tied1: ++** fcadd z0\.h, p0/m, z0\.h, z1\.h, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_90_f16_m_tied1, svfloat16_t, ++ z0 = svcadd_f16_m (p0, z0, z1, 90), ++ z0 = svcadd_m (p0, z0, z1, 90)) ++ ++/* ++** cadd_90_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcadd z0\.h, p0/m, z0\.h, \1\.h, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_90_f16_m_tied2, svfloat16_t, ++ z0 = svcadd_f16_m (p0, z1, z0, 90), ++ z0 = svcadd_m (p0, z1, z0, 90)) ++ ++/* ++** cadd_90_f16_m_untied: ++** movprfx z0, z1 ++** fcadd z0\.h, p0/m, z0\.h, z2\.h, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_90_f16_m_untied, svfloat16_t, ++ z0 = svcadd_f16_m (p0, z1, z2, 90), ++ z0 = svcadd_m (p0, z1, z2, 90)) ++ ++/* ++** cadd_270_f16_m_tied1: ++** fcadd z0\.h, p0/m, z0\.h, z1\.h, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_270_f16_m_tied1, 
svfloat16_t, ++ z0 = svcadd_f16_m (p0, z0, z1, 270), ++ z0 = svcadd_m (p0, z0, z1, 270)) ++ ++/* ++** cadd_270_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcadd z0\.h, p0/m, z0\.h, \1\.h, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_270_f16_m_tied2, svfloat16_t, ++ z0 = svcadd_f16_m (p0, z1, z0, 270), ++ z0 = svcadd_m (p0, z1, z0, 270)) ++ ++/* ++** cadd_270_f16_m_untied: ++** movprfx z0, z1 ++** fcadd z0\.h, p0/m, z0\.h, z2\.h, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_270_f16_m_untied, svfloat16_t, ++ z0 = svcadd_f16_m (p0, z1, z2, 270), ++ z0 = svcadd_m (p0, z1, z2, 270)) ++ ++/* ++** cadd_90_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fcadd z0\.h, p0/m, z0\.h, z1\.h, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_90_f16_z_tied1, svfloat16_t, ++ z0 = svcadd_f16_z (p0, z0, z1, 90), ++ z0 = svcadd_z (p0, z0, z1, 90)) ++ ++/* ++** cadd_90_f16_z_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.h, p0/z, z1\.h ++** fcadd z0\.h, p0/m, z0\.h, \1\.h, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_90_f16_z_tied2, svfloat16_t, ++ z0 = svcadd_f16_z (p0, z1, z0, 90), ++ z0 = svcadd_z (p0, z1, z0, 90)) ++ ++/* ++** cadd_90_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** fcadd z0\.h, p0/m, z0\.h, z2\.h, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_90_f16_z_untied, svfloat16_t, ++ z0 = svcadd_f16_z (p0, z1, z2, 90), ++ z0 = svcadd_z (p0, z1, z2, 90)) ++ ++/* ++** cadd_270_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fcadd z0\.h, p0/m, z0\.h, z1\.h, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_270_f16_z_tied1, svfloat16_t, ++ z0 = svcadd_f16_z (p0, z0, z1, 270), ++ z0 = svcadd_z (p0, z0, z1, 270)) ++ ++/* ++** cadd_270_f16_z_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.h, p0/z, z1\.h ++** fcadd z0\.h, p0/m, z0\.h, \1\.h, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_270_f16_z_tied2, svfloat16_t, ++ z0 = svcadd_f16_z (p0, z1, z0, 270), ++ z0 = svcadd_z (p0, z1, z0, 270)) ++ ++/* ++** cadd_270_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** fcadd z0\.h, p0/m, z0\.h, z2\.h, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_270_f16_z_untied, svfloat16_t, ++ z0 = svcadd_f16_z (p0, z1, z2, 270), ++ z0 = svcadd_z (p0, z1, z2, 270)) ++ ++/* ++** cadd_90_f16_x_tied1: ++** fcadd z0\.h, p0/m, z0\.h, z1\.h, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_90_f16_x_tied1, svfloat16_t, ++ z0 = svcadd_f16_x (p0, z0, z1, 90), ++ z0 = svcadd_x (p0, z0, z1, 90)) ++ ++/* ++** cadd_90_f16_x_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcadd z0\.h, p0/m, z0\.h, \1\.h, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_90_f16_x_tied2, svfloat16_t, ++ z0 = svcadd_f16_x (p0, z1, z0, 90), ++ z0 = svcadd_x (p0, z1, z0, 90)) ++ ++/* ++** cadd_90_f16_x_untied: ++** movprfx z0, z1 ++** fcadd z0\.h, p0/m, z0\.h, z2\.h, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_90_f16_x_untied, svfloat16_t, ++ z0 = svcadd_f16_x (p0, z1, z2, 90), ++ z0 = svcadd_x (p0, z1, z2, 90)) ++ ++/* ++** cadd_270_f16_x_tied1: ++** fcadd z0\.h, p0/m, z0\.h, z1\.h, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_270_f16_x_tied1, svfloat16_t, ++ z0 = svcadd_f16_x (p0, z0, z1, 270), ++ z0 = svcadd_x (p0, z0, z1, 270)) ++ ++/* ++** cadd_270_f16_x_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcadd z0\.h, p0/m, z0\.h, \1\.h, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_270_f16_x_tied2, svfloat16_t, ++ z0 = svcadd_f16_x (p0, z1, z0, 270), ++ z0 = svcadd_x (p0, z1, z0, 270)) ++ ++/* ++** cadd_270_f16_x_untied: ++** movprfx z0, z1 ++** fcadd z0\.h, p0/m, z0\.h, z2\.h, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_270_f16_x_untied, svfloat16_t, ++ z0 = 
svcadd_f16_x (p0, z1, z2, 270), ++ z0 = svcadd_x (p0, z1, z2, 270)) ++ ++/* ++** ptrue_cadd_90_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cadd_90_f16_x_tied1, svfloat16_t, ++ z0 = svcadd_f16_x (svptrue_b16 (), z0, z1, 90), ++ z0 = svcadd_x (svptrue_b16 (), z0, z1, 90)) ++ ++/* ++** ptrue_cadd_90_f16_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cadd_90_f16_x_tied2, svfloat16_t, ++ z0 = svcadd_f16_x (svptrue_b16 (), z1, z0, 90), ++ z0 = svcadd_x (svptrue_b16 (), z1, z0, 90)) ++ ++/* ++** ptrue_cadd_90_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cadd_90_f16_x_untied, svfloat16_t, ++ z0 = svcadd_f16_x (svptrue_b16 (), z1, z2, 90), ++ z0 = svcadd_x (svptrue_b16 (), z1, z2, 90)) ++ ++/* ++** ptrue_cadd_270_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cadd_270_f16_x_tied1, svfloat16_t, ++ z0 = svcadd_f16_x (svptrue_b16 (), z0, z1, 270), ++ z0 = svcadd_x (svptrue_b16 (), z0, z1, 270)) ++ ++/* ++** ptrue_cadd_270_f16_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cadd_270_f16_x_tied2, svfloat16_t, ++ z0 = svcadd_f16_x (svptrue_b16 (), z1, z0, 270), ++ z0 = svcadd_x (svptrue_b16 (), z1, z0, 270)) ++ ++/* ++** ptrue_cadd_270_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cadd_270_f16_x_untied, svfloat16_t, ++ z0 = svcadd_f16_x (svptrue_b16 (), z1, z2, 270), ++ z0 = svcadd_x (svptrue_b16 (), z1, z2, 270)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cadd_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cadd_f32.c +new file mode 100644 +index 000000000..ed5c16ff3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cadd_f32.c +@@ -0,0 +1,251 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cadd_90_f32_m_tied1: ++** fcadd z0\.s, p0/m, z0\.s, z1\.s, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_90_f32_m_tied1, svfloat32_t, ++ z0 = svcadd_f32_m (p0, z0, z1, 90), ++ z0 = svcadd_m (p0, z0, z1, 90)) ++ ++/* ++** cadd_90_f32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcadd z0\.s, p0/m, z0\.s, \1\.s, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_90_f32_m_tied2, svfloat32_t, ++ z0 = svcadd_f32_m (p0, z1, z0, 90), ++ z0 = svcadd_m (p0, z1, z0, 90)) ++ ++/* ++** cadd_90_f32_m_untied: ++** movprfx z0, z1 ++** fcadd z0\.s, p0/m, z0\.s, z2\.s, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_90_f32_m_untied, svfloat32_t, ++ z0 = svcadd_f32_m (p0, z1, z2, 90), ++ z0 = svcadd_m (p0, z1, z2, 90)) ++ ++/* ++** cadd_270_f32_m_tied1: ++** fcadd z0\.s, p0/m, z0\.s, z1\.s, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_270_f32_m_tied1, svfloat32_t, ++ z0 = svcadd_f32_m (p0, z0, z1, 270), ++ z0 = svcadd_m (p0, z0, z1, 270)) ++ ++/* ++** cadd_270_f32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcadd z0\.s, p0/m, z0\.s, \1\.s, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_270_f32_m_tied2, svfloat32_t, ++ z0 = svcadd_f32_m (p0, z1, z0, 270), ++ z0 = svcadd_m (p0, z1, z0, 270)) ++ ++/* ++** cadd_270_f32_m_untied: ++** movprfx z0, z1 ++** fcadd z0\.s, p0/m, z0\.s, z2\.s, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_270_f32_m_untied, svfloat32_t, ++ z0 = svcadd_f32_m (p0, z1, z2, 270), ++ z0 = svcadd_m (p0, z1, z2, 270)) ++ ++/* ++** cadd_90_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fcadd z0\.s, p0/m, z0\.s, 
z1\.s, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_90_f32_z_tied1, svfloat32_t, ++ z0 = svcadd_f32_z (p0, z0, z1, 90), ++ z0 = svcadd_z (p0, z0, z1, 90)) ++ ++/* ++** cadd_90_f32_z_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, z1\.s ++** fcadd z0\.s, p0/m, z0\.s, \1\.s, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_90_f32_z_tied2, svfloat32_t, ++ z0 = svcadd_f32_z (p0, z1, z0, 90), ++ z0 = svcadd_z (p0, z1, z0, 90)) ++ ++/* ++** cadd_90_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** fcadd z0\.s, p0/m, z0\.s, z2\.s, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_90_f32_z_untied, svfloat32_t, ++ z0 = svcadd_f32_z (p0, z1, z2, 90), ++ z0 = svcadd_z (p0, z1, z2, 90)) ++ ++/* ++** cadd_270_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fcadd z0\.s, p0/m, z0\.s, z1\.s, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_270_f32_z_tied1, svfloat32_t, ++ z0 = svcadd_f32_z (p0, z0, z1, 270), ++ z0 = svcadd_z (p0, z0, z1, 270)) ++ ++/* ++** cadd_270_f32_z_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, z1\.s ++** fcadd z0\.s, p0/m, z0\.s, \1\.s, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_270_f32_z_tied2, svfloat32_t, ++ z0 = svcadd_f32_z (p0, z1, z0, 270), ++ z0 = svcadd_z (p0, z1, z0, 270)) ++ ++/* ++** cadd_270_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** fcadd z0\.s, p0/m, z0\.s, z2\.s, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_270_f32_z_untied, svfloat32_t, ++ z0 = svcadd_f32_z (p0, z1, z2, 270), ++ z0 = svcadd_z (p0, z1, z2, 270)) ++ ++/* ++** cadd_90_f32_x_tied1: ++** fcadd z0\.s, p0/m, z0\.s, z1\.s, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_90_f32_x_tied1, svfloat32_t, ++ z0 = svcadd_f32_x (p0, z0, z1, 90), ++ z0 = svcadd_x (p0, z0, z1, 90)) ++ ++/* ++** cadd_90_f32_x_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcadd z0\.s, p0/m, z0\.s, \1\.s, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_90_f32_x_tied2, svfloat32_t, ++ z0 = svcadd_f32_x (p0, z1, z0, 90), ++ z0 = svcadd_x (p0, z1, z0, 90)) ++ ++/* ++** cadd_90_f32_x_untied: ++** movprfx z0, z1 ++** fcadd z0\.s, p0/m, z0\.s, z2\.s, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_90_f32_x_untied, svfloat32_t, ++ z0 = svcadd_f32_x (p0, z1, z2, 90), ++ z0 = svcadd_x (p0, z1, z2, 90)) ++ ++/* ++** cadd_270_f32_x_tied1: ++** fcadd z0\.s, p0/m, z0\.s, z1\.s, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_270_f32_x_tied1, svfloat32_t, ++ z0 = svcadd_f32_x (p0, z0, z1, 270), ++ z0 = svcadd_x (p0, z0, z1, 270)) ++ ++/* ++** cadd_270_f32_x_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcadd z0\.s, p0/m, z0\.s, \1\.s, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_270_f32_x_tied2, svfloat32_t, ++ z0 = svcadd_f32_x (p0, z1, z0, 270), ++ z0 = svcadd_x (p0, z1, z0, 270)) ++ ++/* ++** cadd_270_f32_x_untied: ++** movprfx z0, z1 ++** fcadd z0\.s, p0/m, z0\.s, z2\.s, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_270_f32_x_untied, svfloat32_t, ++ z0 = svcadd_f32_x (p0, z1, z2, 270), ++ z0 = svcadd_x (p0, z1, z2, 270)) ++ ++/* ++** ptrue_cadd_90_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cadd_90_f32_x_tied1, svfloat32_t, ++ z0 = svcadd_f32_x (svptrue_b32 (), z0, z1, 90), ++ z0 = svcadd_x (svptrue_b32 (), z0, z1, 90)) ++ ++/* ++** ptrue_cadd_90_f32_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cadd_90_f32_x_tied2, svfloat32_t, ++ z0 = svcadd_f32_x (svptrue_b32 (), z1, z0, 90), ++ z0 = svcadd_x (svptrue_b32 (), z1, z0, 90)) ++ ++/* ++** ptrue_cadd_90_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cadd_90_f32_x_untied, svfloat32_t, ++ z0 = svcadd_f32_x (svptrue_b32 (), z1, z2, 90), ++ z0 = svcadd_x (svptrue_b32 (), z1, z2, 90)) ++ ++/* ++** ptrue_cadd_270_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cadd_270_f32_x_tied1, svfloat32_t, ++ z0 = svcadd_f32_x (svptrue_b32 (), z0, z1, 270), ++ z0 = svcadd_x (svptrue_b32 (), z0, z1, 270)) ++ ++/* ++** ptrue_cadd_270_f32_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cadd_270_f32_x_tied2, svfloat32_t, ++ z0 = svcadd_f32_x (svptrue_b32 (), z1, z0, 270), ++ z0 = svcadd_x (svptrue_b32 (), z1, z0, 270)) ++ ++/* ++** ptrue_cadd_270_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cadd_270_f32_x_untied, svfloat32_t, ++ z0 = svcadd_f32_x (svptrue_b32 (), z1, z2, 270), ++ z0 = svcadd_x (svptrue_b32 (), z1, z2, 270)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cadd_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cadd_f64.c +new file mode 100644 +index 000000000..0ada881c5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cadd_f64.c +@@ -0,0 +1,251 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cadd_90_f64_m_tied1: ++** fcadd z0\.d, p0/m, z0\.d, z1\.d, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_90_f64_m_tied1, svfloat64_t, ++ z0 = svcadd_f64_m (p0, z0, z1, 90), ++ z0 = svcadd_m (p0, z0, z1, 90)) ++ ++/* ++** cadd_90_f64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fcadd z0\.d, p0/m, z0\.d, \1, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_90_f64_m_tied2, svfloat64_t, ++ z0 = svcadd_f64_m (p0, z1, z0, 90), ++ z0 = svcadd_m (p0, z1, z0, 90)) ++ ++/* ++** cadd_90_f64_m_untied: ++** movprfx z0, z1 ++** fcadd z0\.d, p0/m, z0\.d, z2\.d, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_90_f64_m_untied, svfloat64_t, ++ z0 = svcadd_f64_m (p0, z1, z2, 90), ++ z0 = svcadd_m (p0, z1, z2, 90)) ++ ++/* ++** cadd_270_f64_m_tied1: ++** fcadd z0\.d, p0/m, z0\.d, z1\.d, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_270_f64_m_tied1, svfloat64_t, ++ z0 = svcadd_f64_m (p0, z0, z1, 270), ++ z0 = svcadd_m (p0, z0, z1, 270)) ++ ++/* ++** cadd_270_f64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fcadd z0\.d, p0/m, z0\.d, \1, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_270_f64_m_tied2, svfloat64_t, ++ z0 = svcadd_f64_m (p0, z1, z0, 270), ++ z0 = svcadd_m (p0, z1, z0, 270)) ++ ++/* ++** cadd_270_f64_m_untied: ++** movprfx z0, z1 ++** fcadd z0\.d, p0/m, z0\.d, z2\.d, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_270_f64_m_untied, svfloat64_t, ++ z0 = svcadd_f64_m (p0, z1, z2, 270), ++ z0 = svcadd_m (p0, z1, z2, 270)) ++ ++/* ++** cadd_90_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fcadd z0\.d, p0/m, z0\.d, z1\.d, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_90_f64_z_tied1, svfloat64_t, ++ z0 = svcadd_f64_z (p0, z0, z1, 90), ++ z0 = svcadd_z (p0, z0, z1, 90)) ++ ++/* ++** cadd_90_f64_z_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, z1\.d ++** fcadd z0\.d, p0/m, z0\.d, \1, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_90_f64_z_tied2, svfloat64_t, ++ z0 = svcadd_f64_z (p0, z1, z0, 90), ++ z0 = svcadd_z (p0, z1, z0, 90)) ++ ++/* ++** cadd_90_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** fcadd z0\.d, p0/m, z0\.d, z2\.d, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_90_f64_z_untied, svfloat64_t, ++ z0 = svcadd_f64_z (p0, z1, z2, 90), ++ z0 = svcadd_z (p0, z1, z2, 90)) ++ 
++/* ++** cadd_270_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fcadd z0\.d, p0/m, z0\.d, z1\.d, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_270_f64_z_tied1, svfloat64_t, ++ z0 = svcadd_f64_z (p0, z0, z1, 270), ++ z0 = svcadd_z (p0, z0, z1, 270)) ++ ++/* ++** cadd_270_f64_z_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, z1\.d ++** fcadd z0\.d, p0/m, z0\.d, \1, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_270_f64_z_tied2, svfloat64_t, ++ z0 = svcadd_f64_z (p0, z1, z0, 270), ++ z0 = svcadd_z (p0, z1, z0, 270)) ++ ++/* ++** cadd_270_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** fcadd z0\.d, p0/m, z0\.d, z2\.d, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_270_f64_z_untied, svfloat64_t, ++ z0 = svcadd_f64_z (p0, z1, z2, 270), ++ z0 = svcadd_z (p0, z1, z2, 270)) ++ ++/* ++** cadd_90_f64_x_tied1: ++** fcadd z0\.d, p0/m, z0\.d, z1\.d, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_90_f64_x_tied1, svfloat64_t, ++ z0 = svcadd_f64_x (p0, z0, z1, 90), ++ z0 = svcadd_x (p0, z0, z1, 90)) ++ ++/* ++** cadd_90_f64_x_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fcadd z0\.d, p0/m, z0\.d, \1, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_90_f64_x_tied2, svfloat64_t, ++ z0 = svcadd_f64_x (p0, z1, z0, 90), ++ z0 = svcadd_x (p0, z1, z0, 90)) ++ ++/* ++** cadd_90_f64_x_untied: ++** movprfx z0, z1 ++** fcadd z0\.d, p0/m, z0\.d, z2\.d, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_90_f64_x_untied, svfloat64_t, ++ z0 = svcadd_f64_x (p0, z1, z2, 90), ++ z0 = svcadd_x (p0, z1, z2, 90)) ++ ++/* ++** cadd_270_f64_x_tied1: ++** fcadd z0\.d, p0/m, z0\.d, z1\.d, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_270_f64_x_tied1, svfloat64_t, ++ z0 = svcadd_f64_x (p0, z0, z1, 270), ++ z0 = svcadd_x (p0, z0, z1, 270)) ++ ++/* ++** cadd_270_f64_x_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fcadd z0\.d, p0/m, z0\.d, \1, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_270_f64_x_tied2, svfloat64_t, ++ z0 = svcadd_f64_x (p0, z1, z0, 270), ++ z0 = svcadd_x (p0, z1, z0, 270)) ++ ++/* ++** cadd_270_f64_x_untied: ++** movprfx z0, z1 ++** fcadd z0\.d, p0/m, z0\.d, z2\.d, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cadd_270_f64_x_untied, svfloat64_t, ++ z0 = svcadd_f64_x (p0, z1, z2, 270), ++ z0 = svcadd_x (p0, z1, z2, 270)) ++ ++/* ++** ptrue_cadd_90_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cadd_90_f64_x_tied1, svfloat64_t, ++ z0 = svcadd_f64_x (svptrue_b64 (), z0, z1, 90), ++ z0 = svcadd_x (svptrue_b64 (), z0, z1, 90)) ++ ++/* ++** ptrue_cadd_90_f64_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cadd_90_f64_x_tied2, svfloat64_t, ++ z0 = svcadd_f64_x (svptrue_b64 (), z1, z0, 90), ++ z0 = svcadd_x (svptrue_b64 (), z1, z0, 90)) ++ ++/* ++** ptrue_cadd_90_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cadd_90_f64_x_untied, svfloat64_t, ++ z0 = svcadd_f64_x (svptrue_b64 (), z1, z2, 90), ++ z0 = svcadd_x (svptrue_b64 (), z1, z2, 90)) ++ ++/* ++** ptrue_cadd_270_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cadd_270_f64_x_tied1, svfloat64_t, ++ z0 = svcadd_f64_x (svptrue_b64 (), z0, z1, 270), ++ z0 = svcadd_x (svptrue_b64 (), z0, z1, 270)) ++ ++/* ++** ptrue_cadd_270_f64_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cadd_270_f64_x_tied2, svfloat64_t, ++ z0 = svcadd_f64_x (svptrue_b64 (), z1, z0, 270), ++ z0 = svcadd_x (svptrue_b64 (), z1, z0, 270)) ++ ++/* ++** ptrue_cadd_270_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cadd_270_f64_x_untied, svfloat64_t, ++ z0 = svcadd_f64_x (svptrue_b64 (), z1, z2, 270), ++ z0 = svcadd_x (svptrue_b64 (), z1, z2, 270)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_bf16.c +new file mode 100644 +index 000000000..a15e34400 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_bf16.c +@@ -0,0 +1,52 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** clasta_bf16_tied1: ++** clasta z0\.h, p0, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (clasta_bf16_tied1, svbfloat16_t, ++ z0 = svclasta_bf16 (p0, z0, z1), ++ z0 = svclasta (p0, z0, z1)) ++ ++/* ++** clasta_bf16_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** clasta z0\.h, p0, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (clasta_bf16_tied2, svbfloat16_t, ++ z0 = svclasta_bf16 (p0, z1, z0), ++ z0 = svclasta (p0, z1, z0)) ++ ++/* ++** clasta_bf16_untied: ++** movprfx z0, z1 ++** clasta z0\.h, p0, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (clasta_bf16_untied, svbfloat16_t, ++ z0 = svclasta_bf16 (p0, z1, z2), ++ z0 = svclasta (p0, z1, z2)) ++ ++/* ++** clasta_d0_bf16: ++** clasta h0, p0, h0, z2\.h ++** ret ++*/ ++TEST_FOLD_LEFT_D (clasta_d0_bf16, bfloat16_t, svbfloat16_t, ++ d0 = svclasta_n_bf16 (p0, d0, z2), ++ d0 = svclasta (p0, d0, z2)) ++ ++/* ++** clasta_d1_bf16: ++** mov v0\.h\[0\], v1\.h\[0\] ++** clasta h0, p0, h0, z2\.h ++** ret ++*/ ++TEST_FOLD_LEFT_D (clasta_d1_bf16, bfloat16_t, svbfloat16_t, ++ d0 = svclasta_n_bf16 (p0, d1, z2), ++ d0 = svclasta (p0, d1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_f16.c +new file mode 100644 +index 000000000..d9a980f60 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_f16.c +@@ -0,0 +1,52 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** clasta_f16_tied1: ++** clasta z0\.h, p0, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (clasta_f16_tied1, svfloat16_t, ++ z0 = svclasta_f16 (p0, z0, z1), ++ z0 = svclasta (p0, z0, z1)) ++ ++/* ++** clasta_f16_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** clasta z0\.h, p0, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (clasta_f16_tied2, svfloat16_t, ++ z0 = svclasta_f16 (p0, z1, z0), ++ z0 = svclasta (p0, z1, z0)) ++ ++/* ++** clasta_f16_untied: ++** movprfx z0, z1 ++** clasta z0\.h, p0, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (clasta_f16_untied, svfloat16_t, ++ z0 = svclasta_f16 (p0, z1, z2), ++ z0 = svclasta (p0, z1, z2)) ++ ++/* ++** clasta_d0_f16: ++** clasta h0, p0, h0, z2\.h ++** ret ++*/ ++TEST_FOLD_LEFT_D (clasta_d0_f16, float16_t, svfloat16_t, ++ d0 = svclasta_n_f16 (p0, d0, z2), ++ d0 = svclasta (p0, d0, z2)) ++ ++/* ++** clasta_d1_f16: ++** mov v0\.h\[0\], v1\.h\[0\] ++** clasta h0, p0, h0, z2\.h ++** ret ++*/ ++TEST_FOLD_LEFT_D (clasta_d1_f16, float16_t, svfloat16_t, ++ d0 = svclasta_n_f16 (p0, d1, z2), ++ d0 = svclasta (p0, d1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_f32.c +new file mode 
100644 +index 000000000..cac01fa6d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_f32.c +@@ -0,0 +1,52 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** clasta_f32_tied1: ++** clasta z0\.s, p0, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (clasta_f32_tied1, svfloat32_t, ++ z0 = svclasta_f32 (p0, z0, z1), ++ z0 = svclasta (p0, z0, z1)) ++ ++/* ++** clasta_f32_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** clasta z0\.s, p0, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (clasta_f32_tied2, svfloat32_t, ++ z0 = svclasta_f32 (p0, z1, z0), ++ z0 = svclasta (p0, z1, z0)) ++ ++/* ++** clasta_f32_untied: ++** movprfx z0, z1 ++** clasta z0\.s, p0, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (clasta_f32_untied, svfloat32_t, ++ z0 = svclasta_f32 (p0, z1, z2), ++ z0 = svclasta (p0, z1, z2)) ++ ++/* ++** clasta_d0_f32: ++** clasta s0, p0, s0, z2\.s ++** ret ++*/ ++TEST_FOLD_LEFT_D (clasta_d0_f32, float32_t, svfloat32_t, ++ d0 = svclasta_n_f32 (p0, d0, z2), ++ d0 = svclasta (p0, d0, z2)) ++ ++/* ++** clasta_d1_f32: ++** fmov s0, s1 ++** clasta s0, p0, s0, z2\.s ++** ret ++*/ ++TEST_FOLD_LEFT_D (clasta_d1_f32, float32_t, svfloat32_t, ++ d0 = svclasta_n_f32 (p0, d1, z2), ++ d0 = svclasta (p0, d1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_f64.c +new file mode 100644 +index 000000000..43b93553b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_f64.c +@@ -0,0 +1,52 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** clasta_f64_tied1: ++** clasta z0\.d, p0, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (clasta_f64_tied1, svfloat64_t, ++ z0 = svclasta_f64 (p0, z0, z1), ++ z0 = svclasta (p0, z0, z1)) ++ ++/* ++** clasta_f64_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** clasta z0\.d, p0, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (clasta_f64_tied2, svfloat64_t, ++ z0 = svclasta_f64 (p0, z1, z0), ++ z0 = svclasta (p0, z1, z0)) ++ ++/* ++** clasta_f64_untied: ++** movprfx z0, z1 ++** clasta z0\.d, p0, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (clasta_f64_untied, svfloat64_t, ++ z0 = svclasta_f64 (p0, z1, z2), ++ z0 = svclasta (p0, z1, z2)) ++ ++/* ++** clasta_d0_f64: ++** clasta d0, p0, d0, z2\.d ++** ret ++*/ ++TEST_FOLD_LEFT_D (clasta_d0_f64, float64_t, svfloat64_t, ++ d0 = svclasta_n_f64 (p0, d0, z2), ++ d0 = svclasta (p0, d0, z2)) ++ ++/* ++** clasta_d1_f64: ++** fmov d0, d1 ++** clasta d0, p0, d0, z2\.d ++** ret ++*/ ++TEST_FOLD_LEFT_D (clasta_d1_f64, float64_t, svfloat64_t, ++ d0 = svclasta_n_f64 (p0, d1, z2), ++ d0 = svclasta (p0, d1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_s16.c +new file mode 100644 +index 000000000..f5e4f85ce +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_s16.c +@@ -0,0 +1,52 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** clasta_s16_tied1: ++** clasta z0\.h, p0, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (clasta_s16_tied1, svint16_t, ++ z0 = svclasta_s16 (p0, z0, z1), ++ z0 = svclasta (p0, z0, z1)) ++ ++/* ++** clasta_s16_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** clasta z0\.h, p0, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (clasta_s16_tied2, svint16_t, ++ z0 = svclasta_s16 (p0, z1, z0), 
++ z0 = svclasta (p0, z1, z0)) ++ ++/* ++** clasta_s16_untied: ++** movprfx z0, z1 ++** clasta z0\.h, p0, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (clasta_s16_untied, svint16_t, ++ z0 = svclasta_s16 (p0, z1, z2), ++ z0 = svclasta (p0, z1, z2)) ++ ++/* ++** clasta_x0_s16: ++** clasta w0, p0, w0, z0\.h ++** ret ++*/ ++TEST_FOLD_LEFT_X (clasta_x0_s16, int16_t, svint16_t, ++ x0 = svclasta_n_s16 (p0, x0, z0), ++ x0 = svclasta (p0, x0, z0)) ++ ++/* ++** clasta_x1_s16: ++** mov w0, w1 ++** clasta w0, p0, w0, z0\.h ++** ret ++*/ ++TEST_FOLD_LEFT_X (clasta_x1_s16, int16_t, svint16_t, ++ x0 = svclasta_n_s16 (p0, x1, z0), ++ x0 = svclasta (p0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_s32.c +new file mode 100644 +index 000000000..fbd82e778 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_s32.c +@@ -0,0 +1,52 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** clasta_s32_tied1: ++** clasta z0\.s, p0, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (clasta_s32_tied1, svint32_t, ++ z0 = svclasta_s32 (p0, z0, z1), ++ z0 = svclasta (p0, z0, z1)) ++ ++/* ++** clasta_s32_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** clasta z0\.s, p0, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (clasta_s32_tied2, svint32_t, ++ z0 = svclasta_s32 (p0, z1, z0), ++ z0 = svclasta (p0, z1, z0)) ++ ++/* ++** clasta_s32_untied: ++** movprfx z0, z1 ++** clasta z0\.s, p0, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (clasta_s32_untied, svint32_t, ++ z0 = svclasta_s32 (p0, z1, z2), ++ z0 = svclasta (p0, z1, z2)) ++ ++/* ++** clasta_x0_s32: ++** clasta w0, p0, w0, z0\.s ++** ret ++*/ ++TEST_FOLD_LEFT_X (clasta_x0_s32, int32_t, svint32_t, ++ x0 = svclasta_n_s32 (p0, x0, z0), ++ x0 = svclasta (p0, x0, z0)) ++ ++/* ++** clasta_x1_s32: ++** mov w0, w1 ++** clasta w0, p0, w0, z0\.s ++** ret ++*/ ++TEST_FOLD_LEFT_X (clasta_x1_s32, int32_t, svint32_t, ++ x0 = svclasta_n_s32 (p0, x1, z0), ++ x0 = svclasta (p0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_s64.c +new file mode 100644 +index 000000000..08edf157b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_s64.c +@@ -0,0 +1,52 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** clasta_s64_tied1: ++** clasta z0\.d, p0, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (clasta_s64_tied1, svint64_t, ++ z0 = svclasta_s64 (p0, z0, z1), ++ z0 = svclasta (p0, z0, z1)) ++ ++/* ++** clasta_s64_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** clasta z0\.d, p0, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (clasta_s64_tied2, svint64_t, ++ z0 = svclasta_s64 (p0, z1, z0), ++ z0 = svclasta (p0, z1, z0)) ++ ++/* ++** clasta_s64_untied: ++** movprfx z0, z1 ++** clasta z0\.d, p0, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (clasta_s64_untied, svint64_t, ++ z0 = svclasta_s64 (p0, z1, z2), ++ z0 = svclasta (p0, z1, z2)) ++ ++/* ++** clasta_x0_s64: ++** clasta x0, p0, x0, z0\.d ++** ret ++*/ ++TEST_FOLD_LEFT_X (clasta_x0_s64, int64_t, svint64_t, ++ x0 = svclasta_n_s64 (p0, x0, z0), ++ x0 = svclasta (p0, x0, z0)) ++ ++/* ++** clasta_x1_s64: ++** mov x0, x1 ++** clasta x0, p0, x0, z0\.d ++** ret ++*/ ++TEST_FOLD_LEFT_X (clasta_x1_s64, int64_t, svint64_t, ++ x0 = svclasta_n_s64 (p0, x1, z0), ++ x0 = svclasta (p0, x1, z0)) +diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_s8.c +new file mode 100644 +index 000000000..286f16a9d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_s8.c +@@ -0,0 +1,52 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** clasta_s8_tied1: ++** clasta z0\.b, p0, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (clasta_s8_tied1, svint8_t, ++ z0 = svclasta_s8 (p0, z0, z1), ++ z0 = svclasta (p0, z0, z1)) ++ ++/* ++** clasta_s8_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** clasta z0\.b, p0, z0\.b, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (clasta_s8_tied2, svint8_t, ++ z0 = svclasta_s8 (p0, z1, z0), ++ z0 = svclasta (p0, z1, z0)) ++ ++/* ++** clasta_s8_untied: ++** movprfx z0, z1 ++** clasta z0\.b, p0, z0\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (clasta_s8_untied, svint8_t, ++ z0 = svclasta_s8 (p0, z1, z2), ++ z0 = svclasta (p0, z1, z2)) ++ ++/* ++** clasta_x0_s8: ++** clasta w0, p0, w0, z0\.b ++** ret ++*/ ++TEST_FOLD_LEFT_X (clasta_x0_s8, int8_t, svint8_t, ++ x0 = svclasta_n_s8 (p0, x0, z0), ++ x0 = svclasta (p0, x0, z0)) ++ ++/* ++** clasta_x1_s8: ++** mov w0, w1 ++** clasta w0, p0, w0, z0\.b ++** ret ++*/ ++TEST_FOLD_LEFT_X (clasta_x1_s8, int8_t, svint8_t, ++ x0 = svclasta_n_s8 (p0, x1, z0), ++ x0 = svclasta (p0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_u16.c +new file mode 100644 +index 000000000..40c6dca90 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_u16.c +@@ -0,0 +1,52 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** clasta_u16_tied1: ++** clasta z0\.h, p0, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (clasta_u16_tied1, svuint16_t, ++ z0 = svclasta_u16 (p0, z0, z1), ++ z0 = svclasta (p0, z0, z1)) ++ ++/* ++** clasta_u16_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** clasta z0\.h, p0, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (clasta_u16_tied2, svuint16_t, ++ z0 = svclasta_u16 (p0, z1, z0), ++ z0 = svclasta (p0, z1, z0)) ++ ++/* ++** clasta_u16_untied: ++** movprfx z0, z1 ++** clasta z0\.h, p0, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (clasta_u16_untied, svuint16_t, ++ z0 = svclasta_u16 (p0, z1, z2), ++ z0 = svclasta (p0, z1, z2)) ++ ++/* ++** clasta_x0_u16: ++** clasta w0, p0, w0, z0\.h ++** ret ++*/ ++TEST_FOLD_LEFT_X (clasta_x0_u16, uint16_t, svuint16_t, ++ x0 = svclasta_n_u16 (p0, x0, z0), ++ x0 = svclasta (p0, x0, z0)) ++ ++/* ++** clasta_x1_u16: ++** mov w0, w1 ++** clasta w0, p0, w0, z0\.h ++** ret ++*/ ++TEST_FOLD_LEFT_X (clasta_x1_u16, uint16_t, svuint16_t, ++ x0 = svclasta_n_u16 (p0, x1, z0), ++ x0 = svclasta (p0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_u32.c +new file mode 100644 +index 000000000..6c46e13cf +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_u32.c +@@ -0,0 +1,52 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** clasta_u32_tied1: ++** clasta z0\.s, p0, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (clasta_u32_tied1, svuint32_t, ++ z0 = svclasta_u32 (p0, z0, z1), ++ z0 = svclasta (p0, z0, z1)) ++ ++/* ++** clasta_u32_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** clasta z0\.s, p0, z0\.s, \1\.s 
++** ret ++*/ ++TEST_UNIFORM_Z (clasta_u32_tied2, svuint32_t, ++ z0 = svclasta_u32 (p0, z1, z0), ++ z0 = svclasta (p0, z1, z0)) ++ ++/* ++** clasta_u32_untied: ++** movprfx z0, z1 ++** clasta z0\.s, p0, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (clasta_u32_untied, svuint32_t, ++ z0 = svclasta_u32 (p0, z1, z2), ++ z0 = svclasta (p0, z1, z2)) ++ ++/* ++** clasta_x0_u32: ++** clasta w0, p0, w0, z0\.s ++** ret ++*/ ++TEST_FOLD_LEFT_X (clasta_x0_u32, uint32_t, svuint32_t, ++ x0 = svclasta_n_u32 (p0, x0, z0), ++ x0 = svclasta (p0, x0, z0)) ++ ++/* ++** clasta_x1_u32: ++** mov w0, w1 ++** clasta w0, p0, w0, z0\.s ++** ret ++*/ ++TEST_FOLD_LEFT_X (clasta_x1_u32, uint32_t, svuint32_t, ++ x0 = svclasta_n_u32 (p0, x1, z0), ++ x0 = svclasta (p0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_u64.c +new file mode 100644 +index 000000000..99ad41e50 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_u64.c +@@ -0,0 +1,52 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** clasta_u64_tied1: ++** clasta z0\.d, p0, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (clasta_u64_tied1, svuint64_t, ++ z0 = svclasta_u64 (p0, z0, z1), ++ z0 = svclasta (p0, z0, z1)) ++ ++/* ++** clasta_u64_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** clasta z0\.d, p0, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (clasta_u64_tied2, svuint64_t, ++ z0 = svclasta_u64 (p0, z1, z0), ++ z0 = svclasta (p0, z1, z0)) ++ ++/* ++** clasta_u64_untied: ++** movprfx z0, z1 ++** clasta z0\.d, p0, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (clasta_u64_untied, svuint64_t, ++ z0 = svclasta_u64 (p0, z1, z2), ++ z0 = svclasta (p0, z1, z2)) ++ ++/* ++** clasta_x0_u64: ++** clasta x0, p0, x0, z0\.d ++** ret ++*/ ++TEST_FOLD_LEFT_X (clasta_x0_u64, uint64_t, svuint64_t, ++ x0 = svclasta_n_u64 (p0, x0, z0), ++ x0 = svclasta (p0, x0, z0)) ++ ++/* ++** clasta_x1_u64: ++** mov x0, x1 ++** clasta x0, p0, x0, z0\.d ++** ret ++*/ ++TEST_FOLD_LEFT_X (clasta_x1_u64, uint64_t, svuint64_t, ++ x0 = svclasta_n_u64 (p0, x1, z0), ++ x0 = svclasta (p0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_u8.c +new file mode 100644 +index 000000000..eb438f4ea +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_u8.c +@@ -0,0 +1,52 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** clasta_u8_tied1: ++** clasta z0\.b, p0, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (clasta_u8_tied1, svuint8_t, ++ z0 = svclasta_u8 (p0, z0, z1), ++ z0 = svclasta (p0, z0, z1)) ++ ++/* ++** clasta_u8_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** clasta z0\.b, p0, z0\.b, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (clasta_u8_tied2, svuint8_t, ++ z0 = svclasta_u8 (p0, z1, z0), ++ z0 = svclasta (p0, z1, z0)) ++ ++/* ++** clasta_u8_untied: ++** movprfx z0, z1 ++** clasta z0\.b, p0, z0\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (clasta_u8_untied, svuint8_t, ++ z0 = svclasta_u8 (p0, z1, z2), ++ z0 = svclasta (p0, z1, z2)) ++ ++/* ++** clasta_x0_u8: ++** clasta w0, p0, w0, z0\.b ++** ret ++*/ ++TEST_FOLD_LEFT_X (clasta_x0_u8, uint8_t, svuint8_t, ++ x0 = svclasta_n_u8 (p0, x0, z0), ++ x0 = svclasta (p0, x0, z0)) ++ ++/* ++** clasta_x1_u8: ++** mov w0, w1 ++** clasta w0, p0, w0, z0\.b ++** ret ++*/ ++TEST_FOLD_LEFT_X (clasta_x1_u8, uint8_t, 
svuint8_t, ++ x0 = svclasta_n_u8 (p0, x1, z0), ++ x0 = svclasta (p0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_bf16.c +new file mode 100644 +index 000000000..235fd1b4e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_bf16.c +@@ -0,0 +1,52 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** clastb_bf16_tied1: ++** clastb z0\.h, p0, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (clastb_bf16_tied1, svbfloat16_t, ++ z0 = svclastb_bf16 (p0, z0, z1), ++ z0 = svclastb (p0, z0, z1)) ++ ++/* ++** clastb_bf16_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** clastb z0\.h, p0, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (clastb_bf16_tied2, svbfloat16_t, ++ z0 = svclastb_bf16 (p0, z1, z0), ++ z0 = svclastb (p0, z1, z0)) ++ ++/* ++** clastb_bf16_untied: ++** movprfx z0, z1 ++** clastb z0\.h, p0, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (clastb_bf16_untied, svbfloat16_t, ++ z0 = svclastb_bf16 (p0, z1, z2), ++ z0 = svclastb (p0, z1, z2)) ++ ++/* ++** clastb_d0_bf16: ++** clastb h0, p0, h0, z2\.h ++** ret ++*/ ++TEST_FOLD_LEFT_D (clastb_d0_bf16, bfloat16_t, svbfloat16_t, ++ d0 = svclastb_n_bf16 (p0, d0, z2), ++ d0 = svclastb (p0, d0, z2)) ++ ++/* ++** clastb_d1_bf16: ++** mov v0\.h\[0\], v1\.h\[0\] ++** clastb h0, p0, h0, z2\.h ++** ret ++*/ ++TEST_FOLD_LEFT_D (clastb_d1_bf16, bfloat16_t, svbfloat16_t, ++ d0 = svclastb_n_bf16 (p0, d1, z2), ++ d0 = svclastb (p0, d1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_f16.c +new file mode 100644 +index 000000000..e56d7688a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_f16.c +@@ -0,0 +1,52 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** clastb_f16_tied1: ++** clastb z0\.h, p0, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (clastb_f16_tied1, svfloat16_t, ++ z0 = svclastb_f16 (p0, z0, z1), ++ z0 = svclastb (p0, z0, z1)) ++ ++/* ++** clastb_f16_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** clastb z0\.h, p0, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (clastb_f16_tied2, svfloat16_t, ++ z0 = svclastb_f16 (p0, z1, z0), ++ z0 = svclastb (p0, z1, z0)) ++ ++/* ++** clastb_f16_untied: ++** movprfx z0, z1 ++** clastb z0\.h, p0, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (clastb_f16_untied, svfloat16_t, ++ z0 = svclastb_f16 (p0, z1, z2), ++ z0 = svclastb (p0, z1, z2)) ++ ++/* ++** clastb_d0_f16: ++** clastb h0, p0, h0, z2\.h ++** ret ++*/ ++TEST_FOLD_LEFT_D (clastb_d0_f16, float16_t, svfloat16_t, ++ d0 = svclastb_n_f16 (p0, d0, z2), ++ d0 = svclastb (p0, d0, z2)) ++ ++/* ++** clastb_d1_f16: ++** mov v0\.h\[0\], v1\.h\[0\] ++** clastb h0, p0, h0, z2\.h ++** ret ++*/ ++TEST_FOLD_LEFT_D (clastb_d1_f16, float16_t, svfloat16_t, ++ d0 = svclastb_n_f16 (p0, d1, z2), ++ d0 = svclastb (p0, d1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_f32.c +new file mode 100644 +index 000000000..c580d1306 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_f32.c +@@ -0,0 +1,52 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** clastb_f32_tied1: ++** clastb z0\.s, p0, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (clastb_f32_tied1, 
svfloat32_t, ++ z0 = svclastb_f32 (p0, z0, z1), ++ z0 = svclastb (p0, z0, z1)) ++ ++/* ++** clastb_f32_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** clastb z0\.s, p0, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (clastb_f32_tied2, svfloat32_t, ++ z0 = svclastb_f32 (p0, z1, z0), ++ z0 = svclastb (p0, z1, z0)) ++ ++/* ++** clastb_f32_untied: ++** movprfx z0, z1 ++** clastb z0\.s, p0, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (clastb_f32_untied, svfloat32_t, ++ z0 = svclastb_f32 (p0, z1, z2), ++ z0 = svclastb (p0, z1, z2)) ++ ++/* ++** clastb_d0_f32: ++** clastb s0, p0, s0, z2\.s ++** ret ++*/ ++TEST_FOLD_LEFT_D (clastb_d0_f32, float32_t, svfloat32_t, ++ d0 = svclastb_n_f32 (p0, d0, z2), ++ d0 = svclastb (p0, d0, z2)) ++ ++/* ++** clastb_d1_f32: ++** fmov s0, s1 ++** clastb s0, p0, s0, z2\.s ++** ret ++*/ ++TEST_FOLD_LEFT_D (clastb_d1_f32, float32_t, svfloat32_t, ++ d0 = svclastb_n_f32 (p0, d1, z2), ++ d0 = svclastb (p0, d1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_f64.c +new file mode 100644 +index 000000000..217a76f51 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_f64.c +@@ -0,0 +1,52 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** clastb_f64_tied1: ++** clastb z0\.d, p0, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (clastb_f64_tied1, svfloat64_t, ++ z0 = svclastb_f64 (p0, z0, z1), ++ z0 = svclastb (p0, z0, z1)) ++ ++/* ++** clastb_f64_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** clastb z0\.d, p0, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (clastb_f64_tied2, svfloat64_t, ++ z0 = svclastb_f64 (p0, z1, z0), ++ z0 = svclastb (p0, z1, z0)) ++ ++/* ++** clastb_f64_untied: ++** movprfx z0, z1 ++** clastb z0\.d, p0, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (clastb_f64_untied, svfloat64_t, ++ z0 = svclastb_f64 (p0, z1, z2), ++ z0 = svclastb (p0, z1, z2)) ++ ++/* ++** clastb_d0_f64: ++** clastb d0, p0, d0, z2\.d ++** ret ++*/ ++TEST_FOLD_LEFT_D (clastb_d0_f64, float64_t, svfloat64_t, ++ d0 = svclastb_n_f64 (p0, d0, z2), ++ d0 = svclastb (p0, d0, z2)) ++ ++/* ++** clastb_d1_f64: ++** fmov d0, d1 ++** clastb d0, p0, d0, z2\.d ++** ret ++*/ ++TEST_FOLD_LEFT_D (clastb_d1_f64, float64_t, svfloat64_t, ++ d0 = svclastb_n_f64 (p0, d1, z2), ++ d0 = svclastb (p0, d1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_s16.c +new file mode 100644 +index 000000000..37be28040 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_s16.c +@@ -0,0 +1,52 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** clastb_s16_tied1: ++** clastb z0\.h, p0, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (clastb_s16_tied1, svint16_t, ++ z0 = svclastb_s16 (p0, z0, z1), ++ z0 = svclastb (p0, z0, z1)) ++ ++/* ++** clastb_s16_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** clastb z0\.h, p0, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (clastb_s16_tied2, svint16_t, ++ z0 = svclastb_s16 (p0, z1, z0), ++ z0 = svclastb (p0, z1, z0)) ++ ++/* ++** clastb_s16_untied: ++** movprfx z0, z1 ++** clastb z0\.h, p0, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (clastb_s16_untied, svint16_t, ++ z0 = svclastb_s16 (p0, z1, z2), ++ z0 = svclastb (p0, z1, z2)) ++ ++/* ++** clastb_x0_s16: ++** clastb w0, p0, w0, z0\.h ++** ret ++*/ ++TEST_FOLD_LEFT_X 
(clastb_x0_s16, int16_t, svint16_t, ++ x0 = svclastb_n_s16 (p0, x0, z0), ++ x0 = svclastb (p0, x0, z0)) ++ ++/* ++** clastb_x1_s16: ++** mov w0, w1 ++** clastb w0, p0, w0, z0\.h ++** ret ++*/ ++TEST_FOLD_LEFT_X (clastb_x1_s16, int16_t, svint16_t, ++ x0 = svclastb_n_s16 (p0, x1, z0), ++ x0 = svclastb (p0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_s32.c +new file mode 100644 +index 000000000..2e56c5a8f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_s32.c +@@ -0,0 +1,52 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** clastb_s32_tied1: ++** clastb z0\.s, p0, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (clastb_s32_tied1, svint32_t, ++ z0 = svclastb_s32 (p0, z0, z1), ++ z0 = svclastb (p0, z0, z1)) ++ ++/* ++** clastb_s32_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** clastb z0\.s, p0, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (clastb_s32_tied2, svint32_t, ++ z0 = svclastb_s32 (p0, z1, z0), ++ z0 = svclastb (p0, z1, z0)) ++ ++/* ++** clastb_s32_untied: ++** movprfx z0, z1 ++** clastb z0\.s, p0, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (clastb_s32_untied, svint32_t, ++ z0 = svclastb_s32 (p0, z1, z2), ++ z0 = svclastb (p0, z1, z2)) ++ ++/* ++** clastb_x0_s32: ++** clastb w0, p0, w0, z0\.s ++** ret ++*/ ++TEST_FOLD_LEFT_X (clastb_x0_s32, int32_t, svint32_t, ++ x0 = svclastb_n_s32 (p0, x0, z0), ++ x0 = svclastb (p0, x0, z0)) ++ ++/* ++** clastb_x1_s32: ++** mov w0, w1 ++** clastb w0, p0, w0, z0\.s ++** ret ++*/ ++TEST_FOLD_LEFT_X (clastb_x1_s32, int32_t, svint32_t, ++ x0 = svclastb_n_s32 (p0, x1, z0), ++ x0 = svclastb (p0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_s64.c +new file mode 100644 +index 000000000..9ce210aae +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_s64.c +@@ -0,0 +1,52 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** clastb_s64_tied1: ++** clastb z0\.d, p0, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (clastb_s64_tied1, svint64_t, ++ z0 = svclastb_s64 (p0, z0, z1), ++ z0 = svclastb (p0, z0, z1)) ++ ++/* ++** clastb_s64_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** clastb z0\.d, p0, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (clastb_s64_tied2, svint64_t, ++ z0 = svclastb_s64 (p0, z1, z0), ++ z0 = svclastb (p0, z1, z0)) ++ ++/* ++** clastb_s64_untied: ++** movprfx z0, z1 ++** clastb z0\.d, p0, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (clastb_s64_untied, svint64_t, ++ z0 = svclastb_s64 (p0, z1, z2), ++ z0 = svclastb (p0, z1, z2)) ++ ++/* ++** clastb_x0_s64: ++** clastb x0, p0, x0, z0\.d ++** ret ++*/ ++TEST_FOLD_LEFT_X (clastb_x0_s64, int64_t, svint64_t, ++ x0 = svclastb_n_s64 (p0, x0, z0), ++ x0 = svclastb (p0, x0, z0)) ++ ++/* ++** clastb_x1_s64: ++** mov x0, x1 ++** clastb x0, p0, x0, z0\.d ++** ret ++*/ ++TEST_FOLD_LEFT_X (clastb_x1_s64, int64_t, svint64_t, ++ x0 = svclastb_n_s64 (p0, x1, z0), ++ x0 = svclastb (p0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_s8.c +new file mode 100644 +index 000000000..eb76c22cd +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_s8.c +@@ -0,0 +1,52 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ 
++#include "test_sve_acle.h" ++ ++/* ++** clastb_s8_tied1: ++** clastb z0\.b, p0, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (clastb_s8_tied1, svint8_t, ++ z0 = svclastb_s8 (p0, z0, z1), ++ z0 = svclastb (p0, z0, z1)) ++ ++/* ++** clastb_s8_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** clastb z0\.b, p0, z0\.b, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (clastb_s8_tied2, svint8_t, ++ z0 = svclastb_s8 (p0, z1, z0), ++ z0 = svclastb (p0, z1, z0)) ++ ++/* ++** clastb_s8_untied: ++** movprfx z0, z1 ++** clastb z0\.b, p0, z0\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (clastb_s8_untied, svint8_t, ++ z0 = svclastb_s8 (p0, z1, z2), ++ z0 = svclastb (p0, z1, z2)) ++ ++/* ++** clastb_x0_s8: ++** clastb w0, p0, w0, z0\.b ++** ret ++*/ ++TEST_FOLD_LEFT_X (clastb_x0_s8, int8_t, svint8_t, ++ x0 = svclastb_n_s8 (p0, x0, z0), ++ x0 = svclastb (p0, x0, z0)) ++ ++/* ++** clastb_x1_s8: ++** mov w0, w1 ++** clastb w0, p0, w0, z0\.b ++** ret ++*/ ++TEST_FOLD_LEFT_X (clastb_x1_s8, int8_t, svint8_t, ++ x0 = svclastb_n_s8 (p0, x1, z0), ++ x0 = svclastb (p0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_u16.c +new file mode 100644 +index 000000000..5aea9c7bd +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_u16.c +@@ -0,0 +1,52 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** clastb_u16_tied1: ++** clastb z0\.h, p0, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (clastb_u16_tied1, svuint16_t, ++ z0 = svclastb_u16 (p0, z0, z1), ++ z0 = svclastb (p0, z0, z1)) ++ ++/* ++** clastb_u16_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** clastb z0\.h, p0, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (clastb_u16_tied2, svuint16_t, ++ z0 = svclastb_u16 (p0, z1, z0), ++ z0 = svclastb (p0, z1, z0)) ++ ++/* ++** clastb_u16_untied: ++** movprfx z0, z1 ++** clastb z0\.h, p0, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (clastb_u16_untied, svuint16_t, ++ z0 = svclastb_u16 (p0, z1, z2), ++ z0 = svclastb (p0, z1, z2)) ++ ++/* ++** clastb_x0_u16: ++** clastb w0, p0, w0, z0\.h ++** ret ++*/ ++TEST_FOLD_LEFT_X (clastb_x0_u16, uint16_t, svuint16_t, ++ x0 = svclastb_n_u16 (p0, x0, z0), ++ x0 = svclastb (p0, x0, z0)) ++ ++/* ++** clastb_x1_u16: ++** mov w0, w1 ++** clastb w0, p0, w0, z0\.h ++** ret ++*/ ++TEST_FOLD_LEFT_X (clastb_x1_u16, uint16_t, svuint16_t, ++ x0 = svclastb_n_u16 (p0, x1, z0), ++ x0 = svclastb (p0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_u32.c +new file mode 100644 +index 000000000..47fcf4f27 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_u32.c +@@ -0,0 +1,52 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** clastb_u32_tied1: ++** clastb z0\.s, p0, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (clastb_u32_tied1, svuint32_t, ++ z0 = svclastb_u32 (p0, z0, z1), ++ z0 = svclastb (p0, z0, z1)) ++ ++/* ++** clastb_u32_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** clastb z0\.s, p0, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (clastb_u32_tied2, svuint32_t, ++ z0 = svclastb_u32 (p0, z1, z0), ++ z0 = svclastb (p0, z1, z0)) ++ ++/* ++** clastb_u32_untied: ++** movprfx z0, z1 ++** clastb z0\.s, p0, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (clastb_u32_untied, svuint32_t, ++ z0 = svclastb_u32 (p0, z1, z2), ++ z0 = svclastb (p0, z1, 
z2)) ++ ++/* ++** clastb_x0_u32: ++** clastb w0, p0, w0, z0\.s ++** ret ++*/ ++TEST_FOLD_LEFT_X (clastb_x0_u32, uint32_t, svuint32_t, ++ x0 = svclastb_n_u32 (p0, x0, z0), ++ x0 = svclastb (p0, x0, z0)) ++ ++/* ++** clastb_x1_u32: ++** mov w0, w1 ++** clastb w0, p0, w0, z0\.s ++** ret ++*/ ++TEST_FOLD_LEFT_X (clastb_x1_u32, uint32_t, svuint32_t, ++ x0 = svclastb_n_u32 (p0, x1, z0), ++ x0 = svclastb (p0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_u64.c +new file mode 100644 +index 000000000..fb57afe85 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_u64.c +@@ -0,0 +1,52 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** clastb_u64_tied1: ++** clastb z0\.d, p0, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (clastb_u64_tied1, svuint64_t, ++ z0 = svclastb_u64 (p0, z0, z1), ++ z0 = svclastb (p0, z0, z1)) ++ ++/* ++** clastb_u64_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** clastb z0\.d, p0, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (clastb_u64_tied2, svuint64_t, ++ z0 = svclastb_u64 (p0, z1, z0), ++ z0 = svclastb (p0, z1, z0)) ++ ++/* ++** clastb_u64_untied: ++** movprfx z0, z1 ++** clastb z0\.d, p0, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (clastb_u64_untied, svuint64_t, ++ z0 = svclastb_u64 (p0, z1, z2), ++ z0 = svclastb (p0, z1, z2)) ++ ++/* ++** clastb_x0_u64: ++** clastb x0, p0, x0, z0\.d ++** ret ++*/ ++TEST_FOLD_LEFT_X (clastb_x0_u64, uint64_t, svuint64_t, ++ x0 = svclastb_n_u64 (p0, x0, z0), ++ x0 = svclastb (p0, x0, z0)) ++ ++/* ++** clastb_x1_u64: ++** mov x0, x1 ++** clastb x0, p0, x0, z0\.d ++** ret ++*/ ++TEST_FOLD_LEFT_X (clastb_x1_u64, uint64_t, svuint64_t, ++ x0 = svclastb_n_u64 (p0, x1, z0), ++ x0 = svclastb (p0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_u8.c +new file mode 100644 +index 000000000..f3ca84920 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_u8.c +@@ -0,0 +1,52 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** clastb_u8_tied1: ++** clastb z0\.b, p0, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (clastb_u8_tied1, svuint8_t, ++ z0 = svclastb_u8 (p0, z0, z1), ++ z0 = svclastb (p0, z0, z1)) ++ ++/* ++** clastb_u8_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** clastb z0\.b, p0, z0\.b, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (clastb_u8_tied2, svuint8_t, ++ z0 = svclastb_u8 (p0, z1, z0), ++ z0 = svclastb (p0, z1, z0)) ++ ++/* ++** clastb_u8_untied: ++** movprfx z0, z1 ++** clastb z0\.b, p0, z0\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (clastb_u8_untied, svuint8_t, ++ z0 = svclastb_u8 (p0, z1, z2), ++ z0 = svclastb (p0, z1, z2)) ++ ++/* ++** clastb_x0_u8: ++** clastb w0, p0, w0, z0\.b ++** ret ++*/ ++TEST_FOLD_LEFT_X (clastb_x0_u8, uint8_t, svuint8_t, ++ x0 = svclastb_n_u8 (p0, x0, z0), ++ x0 = svclastb (p0, x0, z0)) ++ ++/* ++** clastb_x1_u8: ++** mov w0, w1 ++** clastb w0, p0, w0, z0\.b ++** ret ++*/ ++TEST_FOLD_LEFT_X (clastb_x1_u8, uint8_t, svuint8_t, ++ x0 = svclastb_n_u8 (p0, x1, z0), ++ x0 = svclastb (p0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cls_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cls_s16.c +new file mode 100644 +index 000000000..7af312397 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cls_s16.c +@@ -0,0 
+1,41 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cls_s16_m_tied1: ++** cls z0\.h, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cls_s16_m_tied1, svuint16_t, svint16_t, ++ z0 = svcls_s16_m (z0, p0, z4), ++ z0 = svcls_m (z0, p0, z4)) ++ ++/* ++** cls_s16_m_untied: ++** movprfx z0, z1 ++** cls z0\.h, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cls_s16_m_untied, svuint16_t, svint16_t, ++ z0 = svcls_s16_m (z1, p0, z4), ++ z0 = svcls_m (z1, p0, z4)) ++ ++/* ++** cls_s16_z: ++** movprfx z0\.h, p0/z, z4\.h ++** cls z0\.h, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cls_s16_z, svuint16_t, svint16_t, ++ z0 = svcls_s16_z (p0, z4), ++ z0 = svcls_z (p0, z4)) ++ ++/* ++** cls_s16_x: ++** cls z0\.h, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cls_s16_x, svuint16_t, svint16_t, ++ z0 = svcls_s16_x (p0, z4), ++ z0 = svcls_x (p0, z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cls_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cls_s32.c +new file mode 100644 +index 000000000..813876f68 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cls_s32.c +@@ -0,0 +1,41 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cls_s32_m_tied1: ++** cls z0\.s, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cls_s32_m_tied1, svuint32_t, svint32_t, ++ z0 = svcls_s32_m (z0, p0, z4), ++ z0 = svcls_m (z0, p0, z4)) ++ ++/* ++** cls_s32_m_untied: ++** movprfx z0, z1 ++** cls z0\.s, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cls_s32_m_untied, svuint32_t, svint32_t, ++ z0 = svcls_s32_m (z1, p0, z4), ++ z0 = svcls_m (z1, p0, z4)) ++ ++/* ++** cls_s32_z: ++** movprfx z0\.s, p0/z, z4\.s ++** cls z0\.s, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cls_s32_z, svuint32_t, svint32_t, ++ z0 = svcls_s32_z (p0, z4), ++ z0 = svcls_z (p0, z4)) ++ ++/* ++** cls_s32_x: ++** cls z0\.s, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cls_s32_x, svuint32_t, svint32_t, ++ z0 = svcls_s32_x (p0, z4), ++ z0 = svcls_x (p0, z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cls_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cls_s64.c +new file mode 100644 +index 000000000..660a20556 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cls_s64.c +@@ -0,0 +1,41 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cls_s64_m_tied1: ++** cls z0\.d, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cls_s64_m_tied1, svuint64_t, svint64_t, ++ z0 = svcls_s64_m (z0, p0, z4), ++ z0 = svcls_m (z0, p0, z4)) ++ ++/* ++** cls_s64_m_untied: ++** movprfx z0, z1 ++** cls z0\.d, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cls_s64_m_untied, svuint64_t, svint64_t, ++ z0 = svcls_s64_m (z1, p0, z4), ++ z0 = svcls_m (z1, p0, z4)) ++ ++/* ++** cls_s64_z: ++** movprfx z0\.d, p0/z, z4\.d ++** cls z0\.d, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cls_s64_z, svuint64_t, svint64_t, ++ z0 = svcls_s64_z (p0, z4), ++ z0 = svcls_z (p0, z4)) ++ ++/* ++** cls_s64_x: ++** cls z0\.d, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cls_s64_x, svuint64_t, svint64_t, ++ z0 = svcls_s64_x (p0, z4), ++ z0 = svcls_x (p0, z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cls_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cls_s8.c +new file mode 100644 +index 000000000..56f5c2608 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cls_s8.c +@@ -0,0 +1,41 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include 
"test_sve_acle.h" ++ ++/* ++** cls_s8_m_tied1: ++** cls z0\.b, p0/m, z4\.b ++** ret ++*/ ++TEST_DUAL_Z (cls_s8_m_tied1, svuint8_t, svint8_t, ++ z0 = svcls_s8_m (z0, p0, z4), ++ z0 = svcls_m (z0, p0, z4)) ++ ++/* ++** cls_s8_m_untied: ++** movprfx z0, z1 ++** cls z0\.b, p0/m, z4\.b ++** ret ++*/ ++TEST_DUAL_Z (cls_s8_m_untied, svuint8_t, svint8_t, ++ z0 = svcls_s8_m (z1, p0, z4), ++ z0 = svcls_m (z1, p0, z4)) ++ ++/* ++** cls_s8_z: ++** movprfx z0\.b, p0/z, z4\.b ++** cls z0\.b, p0/m, z4\.b ++** ret ++*/ ++TEST_DUAL_Z (cls_s8_z, svuint8_t, svint8_t, ++ z0 = svcls_s8_z (p0, z4), ++ z0 = svcls_z (p0, z4)) ++ ++/* ++** cls_s8_x: ++** cls z0\.b, p0/m, z4\.b ++** ret ++*/ ++TEST_DUAL_Z (cls_s8_x, svuint8_t, svint8_t, ++ z0 = svcls_s8_x (p0, z4), ++ z0 = svcls_x (p0, z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clz_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clz_s16.c +new file mode 100644 +index 000000000..58f89005c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clz_s16.c +@@ -0,0 +1,41 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** clz_s16_m_tied1: ++** clz z0\.h, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (clz_s16_m_tied1, svuint16_t, svint16_t, ++ z0 = svclz_s16_m (z0, p0, z4), ++ z0 = svclz_m (z0, p0, z4)) ++ ++/* ++** clz_s16_m_untied: ++** movprfx z0, z1 ++** clz z0\.h, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (clz_s16_m_untied, svuint16_t, svint16_t, ++ z0 = svclz_s16_m (z1, p0, z4), ++ z0 = svclz_m (z1, p0, z4)) ++ ++/* ++** clz_s16_z: ++** movprfx z0\.h, p0/z, z4\.h ++** clz z0\.h, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (clz_s16_z, svuint16_t, svint16_t, ++ z0 = svclz_s16_z (p0, z4), ++ z0 = svclz_z (p0, z4)) ++ ++/* ++** clz_s16_x: ++** clz z0\.h, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (clz_s16_x, svuint16_t, svint16_t, ++ z0 = svclz_s16_x (p0, z4), ++ z0 = svclz_x (p0, z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clz_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clz_s32.c +new file mode 100644 +index 000000000..a9198070b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clz_s32.c +@@ -0,0 +1,41 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** clz_s32_m_tied1: ++** clz z0\.s, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (clz_s32_m_tied1, svuint32_t, svint32_t, ++ z0 = svclz_s32_m (z0, p0, z4), ++ z0 = svclz_m (z0, p0, z4)) ++ ++/* ++** clz_s32_m_untied: ++** movprfx z0, z1 ++** clz z0\.s, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (clz_s32_m_untied, svuint32_t, svint32_t, ++ z0 = svclz_s32_m (z1, p0, z4), ++ z0 = svclz_m (z1, p0, z4)) ++ ++/* ++** clz_s32_z: ++** movprfx z0\.s, p0/z, z4\.s ++** clz z0\.s, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (clz_s32_z, svuint32_t, svint32_t, ++ z0 = svclz_s32_z (p0, z4), ++ z0 = svclz_z (p0, z4)) ++ ++/* ++** clz_s32_x: ++** clz z0\.s, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (clz_s32_x, svuint32_t, svint32_t, ++ z0 = svclz_s32_x (p0, z4), ++ z0 = svclz_x (p0, z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clz_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clz_s64.c +new file mode 100644 +index 000000000..02c0c993e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clz_s64.c +@@ -0,0 +1,41 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** clz_s64_m_tied1: ++** clz z0\.d, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z 
(clz_s64_m_tied1, svuint64_t, svint64_t, ++ z0 = svclz_s64_m (z0, p0, z4), ++ z0 = svclz_m (z0, p0, z4)) ++ ++/* ++** clz_s64_m_untied: ++** movprfx z0, z1 ++** clz z0\.d, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (clz_s64_m_untied, svuint64_t, svint64_t, ++ z0 = svclz_s64_m (z1, p0, z4), ++ z0 = svclz_m (z1, p0, z4)) ++ ++/* ++** clz_s64_z: ++** movprfx z0\.d, p0/z, z4\.d ++** clz z0\.d, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (clz_s64_z, svuint64_t, svint64_t, ++ z0 = svclz_s64_z (p0, z4), ++ z0 = svclz_z (p0, z4)) ++ ++/* ++** clz_s64_x: ++** clz z0\.d, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (clz_s64_x, svuint64_t, svint64_t, ++ z0 = svclz_s64_x (p0, z4), ++ z0 = svclz_x (p0, z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clz_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clz_s8.c +new file mode 100644 +index 000000000..642d298c8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clz_s8.c +@@ -0,0 +1,41 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** clz_s8_m_tied1: ++** clz z0\.b, p0/m, z4\.b ++** ret ++*/ ++TEST_DUAL_Z (clz_s8_m_tied1, svuint8_t, svint8_t, ++ z0 = svclz_s8_m (z0, p0, z4), ++ z0 = svclz_m (z0, p0, z4)) ++ ++/* ++** clz_s8_m_untied: ++** movprfx z0, z1 ++** clz z0\.b, p0/m, z4\.b ++** ret ++*/ ++TEST_DUAL_Z (clz_s8_m_untied, svuint8_t, svint8_t, ++ z0 = svclz_s8_m (z1, p0, z4), ++ z0 = svclz_m (z1, p0, z4)) ++ ++/* ++** clz_s8_z: ++** movprfx z0\.b, p0/z, z4\.b ++** clz z0\.b, p0/m, z4\.b ++** ret ++*/ ++TEST_DUAL_Z (clz_s8_z, svuint8_t, svint8_t, ++ z0 = svclz_s8_z (p0, z4), ++ z0 = svclz_z (p0, z4)) ++ ++/* ++** clz_s8_x: ++** clz z0\.b, p0/m, z4\.b ++** ret ++*/ ++TEST_DUAL_Z (clz_s8_x, svuint8_t, svint8_t, ++ z0 = svclz_s8_x (p0, z4), ++ z0 = svclz_x (p0, z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clz_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clz_u16.c +new file mode 100644 +index 000000000..f08723017 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clz_u16.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** clz_u16_m_tied12: ++** clz z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (clz_u16_m_tied12, svuint16_t, ++ z0 = svclz_u16_m (z0, p0, z0), ++ z0 = svclz_m (z0, p0, z0)) ++ ++/* ++** clz_u16_m_tied1: ++** clz z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (clz_u16_m_tied1, svuint16_t, ++ z0 = svclz_u16_m (z0, p0, z1), ++ z0 = svclz_m (z0, p0, z1)) ++ ++/* ++** clz_u16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** clz z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (clz_u16_m_tied2, svuint16_t, ++ z0 = svclz_u16_m (z1, p0, z0), ++ z0 = svclz_m (z1, p0, z0)) ++ ++/* ++** clz_u16_m_untied: ++** movprfx z0, z2 ++** clz z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (clz_u16_m_untied, svuint16_t, ++ z0 = svclz_u16_m (z2, p0, z1), ++ z0 = svclz_m (z2, p0, z1)) ++ ++/* ++** clz_u16_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.h, p0/z, \1\.h ++** clz z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (clz_u16_z_tied1, svuint16_t, ++ z0 = svclz_u16_z (p0, z0), ++ z0 = svclz_z (p0, z0)) ++ ++/* ++** clz_u16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** clz z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (clz_u16_z_untied, svuint16_t, ++ z0 = svclz_u16_z (p0, z1), ++ z0 = svclz_z (p0, z1)) ++ ++/* ++** clz_u16_x_tied1: ++** clz z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (clz_u16_x_tied1, 
svuint16_t, ++ z0 = svclz_u16_x (p0, z0), ++ z0 = svclz_x (p0, z0)) ++ ++/* ++** clz_u16_x_untied: ++** clz z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (clz_u16_x_untied, svuint16_t, ++ z0 = svclz_u16_x (p0, z1), ++ z0 = svclz_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clz_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clz_u32.c +new file mode 100644 +index 000000000..e00424131 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clz_u32.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** clz_u32_m_tied12: ++** clz z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (clz_u32_m_tied12, svuint32_t, ++ z0 = svclz_u32_m (z0, p0, z0), ++ z0 = svclz_m (z0, p0, z0)) ++ ++/* ++** clz_u32_m_tied1: ++** clz z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (clz_u32_m_tied1, svuint32_t, ++ z0 = svclz_u32_m (z0, p0, z1), ++ z0 = svclz_m (z0, p0, z1)) ++ ++/* ++** clz_u32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** clz z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (clz_u32_m_tied2, svuint32_t, ++ z0 = svclz_u32_m (z1, p0, z0), ++ z0 = svclz_m (z1, p0, z0)) ++ ++/* ++** clz_u32_m_untied: ++** movprfx z0, z2 ++** clz z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (clz_u32_m_untied, svuint32_t, ++ z0 = svclz_u32_m (z2, p0, z1), ++ z0 = svclz_m (z2, p0, z1)) ++ ++/* ++** clz_u32_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, \1\.s ++** clz z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (clz_u32_z_tied1, svuint32_t, ++ z0 = svclz_u32_z (p0, z0), ++ z0 = svclz_z (p0, z0)) ++ ++/* ++** clz_u32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** clz z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (clz_u32_z_untied, svuint32_t, ++ z0 = svclz_u32_z (p0, z1), ++ z0 = svclz_z (p0, z1)) ++ ++/* ++** clz_u32_x_tied1: ++** clz z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (clz_u32_x_tied1, svuint32_t, ++ z0 = svclz_u32_x (p0, z0), ++ z0 = svclz_x (p0, z0)) ++ ++/* ++** clz_u32_x_untied: ++** clz z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (clz_u32_x_untied, svuint32_t, ++ z0 = svclz_u32_x (p0, z1), ++ z0 = svclz_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clz_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clz_u64.c +new file mode 100644 +index 000000000..e879e1b9a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clz_u64.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** clz_u64_m_tied12: ++** clz z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (clz_u64_m_tied12, svuint64_t, ++ z0 = svclz_u64_m (z0, p0, z0), ++ z0 = svclz_m (z0, p0, z0)) ++ ++/* ++** clz_u64_m_tied1: ++** clz z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (clz_u64_m_tied1, svuint64_t, ++ z0 = svclz_u64_m (z0, p0, z1), ++ z0 = svclz_m (z0, p0, z1)) ++ ++/* ++** clz_u64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** clz z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (clz_u64_m_tied2, svuint64_t, ++ z0 = svclz_u64_m (z1, p0, z0), ++ z0 = svclz_m (z1, p0, z0)) ++ ++/* ++** clz_u64_m_untied: ++** movprfx z0, z2 ++** clz z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (clz_u64_m_untied, svuint64_t, ++ z0 = svclz_u64_m (z2, p0, z1), ++ z0 = svclz_m (z2, p0, z1)) ++ ++/* ++** clz_u64_z_tied1: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** clz z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z 
(clz_u64_z_tied1, svuint64_t, ++ z0 = svclz_u64_z (p0, z0), ++ z0 = svclz_z (p0, z0)) ++ ++/* ++** clz_u64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** clz z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (clz_u64_z_untied, svuint64_t, ++ z0 = svclz_u64_z (p0, z1), ++ z0 = svclz_z (p0, z1)) ++ ++/* ++** clz_u64_x_tied1: ++** clz z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (clz_u64_x_tied1, svuint64_t, ++ z0 = svclz_u64_x (p0, z0), ++ z0 = svclz_x (p0, z0)) ++ ++/* ++** clz_u64_x_untied: ++** clz z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (clz_u64_x_untied, svuint64_t, ++ z0 = svclz_u64_x (p0, z1), ++ z0 = svclz_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clz_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clz_u8.c +new file mode 100644 +index 000000000..ce6cb8f45 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clz_u8.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** clz_u8_m_tied12: ++** clz z0\.b, p0/m, z0\.b ++** ret ++*/ ++TEST_UNIFORM_Z (clz_u8_m_tied12, svuint8_t, ++ z0 = svclz_u8_m (z0, p0, z0), ++ z0 = svclz_m (z0, p0, z0)) ++ ++/* ++** clz_u8_m_tied1: ++** clz z0\.b, p0/m, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (clz_u8_m_tied1, svuint8_t, ++ z0 = svclz_u8_m (z0, p0, z1), ++ z0 = svclz_m (z0, p0, z1)) ++ ++/* ++** clz_u8_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** clz z0\.b, p0/m, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (clz_u8_m_tied2, svuint8_t, ++ z0 = svclz_u8_m (z1, p0, z0), ++ z0 = svclz_m (z1, p0, z0)) ++ ++/* ++** clz_u8_m_untied: ++** movprfx z0, z2 ++** clz z0\.b, p0/m, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (clz_u8_m_untied, svuint8_t, ++ z0 = svclz_u8_m (z2, p0, z1), ++ z0 = svclz_m (z2, p0, z1)) ++ ++/* ++** clz_u8_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.b, p0/z, \1\.b ++** clz z0\.b, p0/m, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (clz_u8_z_tied1, svuint8_t, ++ z0 = svclz_u8_z (p0, z0), ++ z0 = svclz_z (p0, z0)) ++ ++/* ++** clz_u8_z_untied: ++** movprfx z0\.b, p0/z, z1\.b ++** clz z0\.b, p0/m, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (clz_u8_z_untied, svuint8_t, ++ z0 = svclz_u8_z (p0, z1), ++ z0 = svclz_z (p0, z1)) ++ ++/* ++** clz_u8_x_tied1: ++** clz z0\.b, p0/m, z0\.b ++** ret ++*/ ++TEST_UNIFORM_Z (clz_u8_x_tied1, svuint8_t, ++ z0 = svclz_u8_x (p0, z0), ++ z0 = svclz_x (p0, z0)) ++ ++/* ++** clz_u8_x_untied: ++** clz z0\.b, p0/m, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (clz_u8_x_untied, svuint8_t, ++ z0 = svclz_u8_x (p0, z1), ++ z0 = svclz_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmla_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmla_f16.c +new file mode 100644 +index 000000000..3bf44a59f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmla_f16.c +@@ -0,0 +1,675 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmla_0_f16_m_tied1: ++** fcmla z0\.h, p0/m, z1\.h, z2\.h, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_0_f16_m_tied1, svfloat16_t, ++ z0 = svcmla_f16_m (p0, z0, z1, z2, 0), ++ z0 = svcmla_m (p0, z0, z1, z2, 0)) ++ ++/* ++** cmla_0_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcmla z0\.h, p0/m, \1\.h, z2\.h, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_0_f16_m_tied2, svfloat16_t, ++ z0 = svcmla_f16_m (p0, z1, z0, z2, 0), ++ z0 = svcmla_m (p0, z1, z0, z2, 0)) ++ ++/* ++** cmla_0_f16_m_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcmla z0\.h, 
p0/m, z2\.h, \1\.h, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_0_f16_m_tied3, svfloat16_t, ++ z0 = svcmla_f16_m (p0, z1, z2, z0, 0), ++ z0 = svcmla_m (p0, z1, z2, z0, 0)) ++ ++/* ++** cmla_0_f16_m_untied: ++** movprfx z0, z1 ++** fcmla z0\.h, p0/m, z2\.h, z3\.h, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_0_f16_m_untied, svfloat16_t, ++ z0 = svcmla_f16_m (p0, z1, z2, z3, 0), ++ z0 = svcmla_m (p0, z1, z2, z3, 0)) ++ ++/* ++** cmla_90_f16_m_tied1: ++** fcmla z0\.h, p0/m, z1\.h, z2\.h, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_90_f16_m_tied1, svfloat16_t, ++ z0 = svcmla_f16_m (p0, z0, z1, z2, 90), ++ z0 = svcmla_m (p0, z0, z1, z2, 90)) ++ ++/* ++** cmla_90_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcmla z0\.h, p0/m, \1\.h, z2\.h, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_90_f16_m_tied2, svfloat16_t, ++ z0 = svcmla_f16_m (p0, z1, z0, z2, 90), ++ z0 = svcmla_m (p0, z1, z0, z2, 90)) ++ ++/* ++** cmla_90_f16_m_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcmla z0\.h, p0/m, z2\.h, \1\.h, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_90_f16_m_tied3, svfloat16_t, ++ z0 = svcmla_f16_m (p0, z1, z2, z0, 90), ++ z0 = svcmla_m (p0, z1, z2, z0, 90)) ++ ++/* ++** cmla_90_f16_m_untied: ++** movprfx z0, z1 ++** fcmla z0\.h, p0/m, z2\.h, z3\.h, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_90_f16_m_untied, svfloat16_t, ++ z0 = svcmla_f16_m (p0, z1, z2, z3, 90), ++ z0 = svcmla_m (p0, z1, z2, z3, 90)) ++ ++/* ++** cmla_180_f16_m_tied1: ++** fcmla z0\.h, p0/m, z1\.h, z2\.h, #180 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_180_f16_m_tied1, svfloat16_t, ++ z0 = svcmla_f16_m (p0, z0, z1, z2, 180), ++ z0 = svcmla_m (p0, z0, z1, z2, 180)) ++ ++/* ++** cmla_180_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcmla z0\.h, p0/m, \1\.h, z2\.h, #180 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_180_f16_m_tied2, svfloat16_t, ++ z0 = svcmla_f16_m (p0, z1, z0, z2, 180), ++ z0 = svcmla_m (p0, z1, z0, z2, 180)) ++ ++/* ++** cmla_180_f16_m_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcmla z0\.h, p0/m, z2\.h, \1\.h, #180 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_180_f16_m_tied3, svfloat16_t, ++ z0 = svcmla_f16_m (p0, z1, z2, z0, 180), ++ z0 = svcmla_m (p0, z1, z2, z0, 180)) ++ ++/* ++** cmla_180_f16_m_untied: ++** movprfx z0, z1 ++** fcmla z0\.h, p0/m, z2\.h, z3\.h, #180 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_180_f16_m_untied, svfloat16_t, ++ z0 = svcmla_f16_m (p0, z1, z2, z3, 180), ++ z0 = svcmla_m (p0, z1, z2, z3, 180)) ++ ++/* ++** cmla_270_f16_m_tied1: ++** fcmla z0\.h, p0/m, z1\.h, z2\.h, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_270_f16_m_tied1, svfloat16_t, ++ z0 = svcmla_f16_m (p0, z0, z1, z2, 270), ++ z0 = svcmla_m (p0, z0, z1, z2, 270)) ++ ++/* ++** cmla_270_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcmla z0\.h, p0/m, \1\.h, z2\.h, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_270_f16_m_tied2, svfloat16_t, ++ z0 = svcmla_f16_m (p0, z1, z0, z2, 270), ++ z0 = svcmla_m (p0, z1, z0, z2, 270)) ++ ++/* ++** cmla_270_f16_m_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcmla z0\.h, p0/m, z2\.h, \1\.h, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_270_f16_m_tied3, svfloat16_t, ++ z0 = svcmla_f16_m (p0, z1, z2, z0, 270), ++ z0 = svcmla_m (p0, z1, z2, z0, 270)) ++ ++/* ++** cmla_270_f16_m_untied: ++** movprfx z0, z1 ++** fcmla z0\.h, p0/m, z2\.h, z3\.h, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_270_f16_m_untied, svfloat16_t, ++ z0 = svcmla_f16_m (p0, z1, z2, z3, 270), ++ z0 = svcmla_m (p0, z1, z2, z3, 270)) ++ ++/* ++** cmla_0_f16_z_tied1: ++** movprfx z0\.h, 
p0/z, z0\.h ++** fcmla z0\.h, p0/m, z1\.h, z2\.h, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_0_f16_z_tied1, svfloat16_t, ++ z0 = svcmla_f16_z (p0, z0, z1, z2, 0), ++ z0 = svcmla_z (p0, z0, z1, z2, 0)) ++ ++/* ++** cmla_0_f16_z_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.h, p0/z, z1\.h ++** fcmla z0\.h, p0/m, \1\.h, z2\.h, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_0_f16_z_tied2, svfloat16_t, ++ z0 = svcmla_f16_z (p0, z1, z0, z2, 0), ++ z0 = svcmla_z (p0, z1, z0, z2, 0)) ++ ++/* ++** cmla_0_f16_z_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.h, p0/z, z1\.h ++** fcmla z0\.h, p0/m, z2\.h, \1\.h, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_0_f16_z_tied3, svfloat16_t, ++ z0 = svcmla_f16_z (p0, z1, z2, z0, 0), ++ z0 = svcmla_z (p0, z1, z2, z0, 0)) ++ ++/* ++** cmla_0_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** fcmla z0\.h, p0/m, z2\.h, z3\.h, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_0_f16_z_untied, svfloat16_t, ++ z0 = svcmla_f16_z (p0, z1, z2, z3, 0), ++ z0 = svcmla_z (p0, z1, z2, z3, 0)) ++ ++/* ++** cmla_90_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fcmla z0\.h, p0/m, z1\.h, z2\.h, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_90_f16_z_tied1, svfloat16_t, ++ z0 = svcmla_f16_z (p0, z0, z1, z2, 90), ++ z0 = svcmla_z (p0, z0, z1, z2, 90)) ++ ++/* ++** cmla_90_f16_z_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.h, p0/z, z1\.h ++** fcmla z0\.h, p0/m, \1\.h, z2\.h, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_90_f16_z_tied2, svfloat16_t, ++ z0 = svcmla_f16_z (p0, z1, z0, z2, 90), ++ z0 = svcmla_z (p0, z1, z0, z2, 90)) ++ ++/* ++** cmla_90_f16_z_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.h, p0/z, z1\.h ++** fcmla z0\.h, p0/m, z2\.h, \1\.h, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_90_f16_z_tied3, svfloat16_t, ++ z0 = svcmla_f16_z (p0, z1, z2, z0, 90), ++ z0 = svcmla_z (p0, z1, z2, z0, 90)) ++ ++/* ++** cmla_90_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** fcmla z0\.h, p0/m, z2\.h, z3\.h, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_90_f16_z_untied, svfloat16_t, ++ z0 = svcmla_f16_z (p0, z1, z2, z3, 90), ++ z0 = svcmla_z (p0, z1, z2, z3, 90)) ++ ++/* ++** cmla_180_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fcmla z0\.h, p0/m, z1\.h, z2\.h, #180 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_180_f16_z_tied1, svfloat16_t, ++ z0 = svcmla_f16_z (p0, z0, z1, z2, 180), ++ z0 = svcmla_z (p0, z0, z1, z2, 180)) ++ ++/* ++** cmla_180_f16_z_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.h, p0/z, z1\.h ++** fcmla z0\.h, p0/m, \1\.h, z2\.h, #180 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_180_f16_z_tied2, svfloat16_t, ++ z0 = svcmla_f16_z (p0, z1, z0, z2, 180), ++ z0 = svcmla_z (p0, z1, z0, z2, 180)) ++ ++/* ++** cmla_180_f16_z_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.h, p0/z, z1\.h ++** fcmla z0\.h, p0/m, z2\.h, \1\.h, #180 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_180_f16_z_tied3, svfloat16_t, ++ z0 = svcmla_f16_z (p0, z1, z2, z0, 180), ++ z0 = svcmla_z (p0, z1, z2, z0, 180)) ++ ++/* ++** cmla_180_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** fcmla z0\.h, p0/m, z2\.h, z3\.h, #180 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_180_f16_z_untied, svfloat16_t, ++ z0 = svcmla_f16_z (p0, z1, z2, z3, 180), ++ z0 = svcmla_z (p0, z1, z2, z3, 180)) ++ ++/* ++** cmla_270_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fcmla z0\.h, p0/m, z1\.h, z2\.h, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_270_f16_z_tied1, svfloat16_t, ++ z0 = svcmla_f16_z (p0, z0, z1, z2, 270), ++ z0 = svcmla_z (p0, z0, z1, z2, 270)) ++ ++/* ++** cmla_270_f16_z_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.h, p0/z, 
z1\.h ++** fcmla z0\.h, p0/m, \1\.h, z2\.h, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_270_f16_z_tied2, svfloat16_t, ++ z0 = svcmla_f16_z (p0, z1, z0, z2, 270), ++ z0 = svcmla_z (p0, z1, z0, z2, 270)) ++ ++/* ++** cmla_270_f16_z_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.h, p0/z, z1\.h ++** fcmla z0\.h, p0/m, z2\.h, \1\.h, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_270_f16_z_tied3, svfloat16_t, ++ z0 = svcmla_f16_z (p0, z1, z2, z0, 270), ++ z0 = svcmla_z (p0, z1, z2, z0, 270)) ++ ++/* ++** cmla_270_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** fcmla z0\.h, p0/m, z2\.h, z3\.h, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_270_f16_z_untied, svfloat16_t, ++ z0 = svcmla_f16_z (p0, z1, z2, z3, 270), ++ z0 = svcmla_z (p0, z1, z2, z3, 270)) ++ ++/* ++** cmla_0_f16_x_tied1: ++** fcmla z0\.h, p0/m, z1\.h, z2\.h, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_0_f16_x_tied1, svfloat16_t, ++ z0 = svcmla_f16_x (p0, z0, z1, z2, 0), ++ z0 = svcmla_x (p0, z0, z1, z2, 0)) ++ ++/* ++** cmla_0_f16_x_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcmla z0\.h, p0/m, \1\.h, z2\.h, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_0_f16_x_tied2, svfloat16_t, ++ z0 = svcmla_f16_x (p0, z1, z0, z2, 0), ++ z0 = svcmla_x (p0, z1, z0, z2, 0)) ++ ++/* ++** cmla_0_f16_x_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcmla z0\.h, p0/m, z2\.h, \1\.h, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_0_f16_x_tied3, svfloat16_t, ++ z0 = svcmla_f16_x (p0, z1, z2, z0, 0), ++ z0 = svcmla_x (p0, z1, z2, z0, 0)) ++ ++/* ++** cmla_0_f16_x_untied: ++** movprfx z0, z1 ++** fcmla z0\.h, p0/m, z2\.h, z3\.h, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_0_f16_x_untied, svfloat16_t, ++ z0 = svcmla_f16_x (p0, z1, z2, z3, 0), ++ z0 = svcmla_x (p0, z1, z2, z3, 0)) ++ ++/* ++** cmla_90_f16_x_tied1: ++** fcmla z0\.h, p0/m, z1\.h, z2\.h, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_90_f16_x_tied1, svfloat16_t, ++ z0 = svcmla_f16_x (p0, z0, z1, z2, 90), ++ z0 = svcmla_x (p0, z0, z1, z2, 90)) ++ ++/* ++** cmla_90_f16_x_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcmla z0\.h, p0/m, \1\.h, z2\.h, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_90_f16_x_tied2, svfloat16_t, ++ z0 = svcmla_f16_x (p0, z1, z0, z2, 90), ++ z0 = svcmla_x (p0, z1, z0, z2, 90)) ++ ++/* ++** cmla_90_f16_x_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcmla z0\.h, p0/m, z2\.h, \1\.h, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_90_f16_x_tied3, svfloat16_t, ++ z0 = svcmla_f16_x (p0, z1, z2, z0, 90), ++ z0 = svcmla_x (p0, z1, z2, z0, 90)) ++ ++/* ++** cmla_90_f16_x_untied: ++** movprfx z0, z1 ++** fcmla z0\.h, p0/m, z2\.h, z3\.h, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_90_f16_x_untied, svfloat16_t, ++ z0 = svcmla_f16_x (p0, z1, z2, z3, 90), ++ z0 = svcmla_x (p0, z1, z2, z3, 90)) ++ ++/* ++** cmla_180_f16_x_tied1: ++** fcmla z0\.h, p0/m, z1\.h, z2\.h, #180 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_180_f16_x_tied1, svfloat16_t, ++ z0 = svcmla_f16_x (p0, z0, z1, z2, 180), ++ z0 = svcmla_x (p0, z0, z1, z2, 180)) ++ ++/* ++** cmla_180_f16_x_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcmla z0\.h, p0/m, \1\.h, z2\.h, #180 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_180_f16_x_tied2, svfloat16_t, ++ z0 = svcmla_f16_x (p0, z1, z0, z2, 180), ++ z0 = svcmla_x (p0, z1, z0, z2, 180)) ++ ++/* ++** cmla_180_f16_x_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcmla z0\.h, p0/m, z2\.h, \1\.h, #180 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_180_f16_x_tied3, svfloat16_t, ++ z0 = svcmla_f16_x (p0, z1, z2, z0, 180), ++ z0 = svcmla_x (p0, z1, z2, z0, 180)) ++ 
++/* ++** cmla_180_f16_x_untied: ++** movprfx z0, z1 ++** fcmla z0\.h, p0/m, z2\.h, z3\.h, #180 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_180_f16_x_untied, svfloat16_t, ++ z0 = svcmla_f16_x (p0, z1, z2, z3, 180), ++ z0 = svcmla_x (p0, z1, z2, z3, 180)) ++ ++/* ++** cmla_270_f16_x_tied1: ++** fcmla z0\.h, p0/m, z1\.h, z2\.h, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_270_f16_x_tied1, svfloat16_t, ++ z0 = svcmla_f16_x (p0, z0, z1, z2, 270), ++ z0 = svcmla_x (p0, z0, z1, z2, 270)) ++ ++/* ++** cmla_270_f16_x_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcmla z0\.h, p0/m, \1\.h, z2\.h, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_270_f16_x_tied2, svfloat16_t, ++ z0 = svcmla_f16_x (p0, z1, z0, z2, 270), ++ z0 = svcmla_x (p0, z1, z0, z2, 270)) ++ ++/* ++** cmla_270_f16_x_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcmla z0\.h, p0/m, z2\.h, \1\.h, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_270_f16_x_tied3, svfloat16_t, ++ z0 = svcmla_f16_x (p0, z1, z2, z0, 270), ++ z0 = svcmla_x (p0, z1, z2, z0, 270)) ++ ++/* ++** cmla_270_f16_x_untied: ++** movprfx z0, z1 ++** fcmla z0\.h, p0/m, z2\.h, z3\.h, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_270_f16_x_untied, svfloat16_t, ++ z0 = svcmla_f16_x (p0, z1, z2, z3, 270), ++ z0 = svcmla_x (p0, z1, z2, z3, 270)) ++ ++/* ++** ptrue_cmla_0_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cmla_0_f16_x_tied1, svfloat16_t, ++ z0 = svcmla_f16_x (svptrue_b16 (), z0, z1, z2, 0), ++ z0 = svcmla_x (svptrue_b16 (), z0, z1, z2, 0)) ++ ++/* ++** ptrue_cmla_0_f16_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cmla_0_f16_x_tied2, svfloat16_t, ++ z0 = svcmla_f16_x (svptrue_b16 (), z1, z0, z2, 0), ++ z0 = svcmla_x (svptrue_b16 (), z1, z0, z2, 0)) ++ ++/* ++** ptrue_cmla_0_f16_x_tied3: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cmla_0_f16_x_tied3, svfloat16_t, ++ z0 = svcmla_f16_x (svptrue_b16 (), z1, z2, z0, 0), ++ z0 = svcmla_x (svptrue_b16 (), z1, z2, z0, 0)) ++ ++/* ++** ptrue_cmla_0_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cmla_0_f16_x_untied, svfloat16_t, ++ z0 = svcmla_f16_x (svptrue_b16 (), z1, z2, z3, 0), ++ z0 = svcmla_x (svptrue_b16 (), z1, z2, z3, 0)) ++ ++/* ++** ptrue_cmla_90_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cmla_90_f16_x_tied1, svfloat16_t, ++ z0 = svcmla_f16_x (svptrue_b16 (), z0, z1, z2, 90), ++ z0 = svcmla_x (svptrue_b16 (), z0, z1, z2, 90)) ++ ++/* ++** ptrue_cmla_90_f16_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cmla_90_f16_x_tied2, svfloat16_t, ++ z0 = svcmla_f16_x (svptrue_b16 (), z1, z0, z2, 90), ++ z0 = svcmla_x (svptrue_b16 (), z1, z0, z2, 90)) ++ ++/* ++** ptrue_cmla_90_f16_x_tied3: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cmla_90_f16_x_tied3, svfloat16_t, ++ z0 = svcmla_f16_x (svptrue_b16 (), z1, z2, z0, 90), ++ z0 = svcmla_x (svptrue_b16 (), z1, z2, z0, 90)) ++ ++/* ++** ptrue_cmla_90_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cmla_90_f16_x_untied, svfloat16_t, ++ z0 = svcmla_f16_x (svptrue_b16 (), z1, z2, z3, 90), ++ z0 = svcmla_x (svptrue_b16 (), z1, z2, z3, 90)) ++ ++/* ++** ptrue_cmla_180_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cmla_180_f16_x_tied1, svfloat16_t, ++ z0 = svcmla_f16_x (svptrue_b16 (), z0, z1, z2, 180), ++ z0 = svcmla_x (svptrue_b16 (), z0, z1, z2, 180)) ++ ++/* ++** ptrue_cmla_180_f16_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cmla_180_f16_x_tied2, svfloat16_t, ++ z0 = svcmla_f16_x (svptrue_b16 (), z1, z0, z2, 180), ++ z0 = svcmla_x (svptrue_b16 (), z1, z0, z2, 180)) ++ ++/* ++** ptrue_cmla_180_f16_x_tied3: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cmla_180_f16_x_tied3, svfloat16_t, ++ z0 = svcmla_f16_x (svptrue_b16 (), z1, z2, z0, 180), ++ z0 = svcmla_x (svptrue_b16 (), z1, z2, z0, 180)) ++ ++/* ++** ptrue_cmla_180_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cmla_180_f16_x_untied, svfloat16_t, ++ z0 = svcmla_f16_x (svptrue_b16 (), z1, z2, z3, 180), ++ z0 = svcmla_x (svptrue_b16 (), z1, z2, z3, 180)) ++ ++/* ++** ptrue_cmla_270_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cmla_270_f16_x_tied1, svfloat16_t, ++ z0 = svcmla_f16_x (svptrue_b16 (), z0, z1, z2, 270), ++ z0 = svcmla_x (svptrue_b16 (), z0, z1, z2, 270)) ++ ++/* ++** ptrue_cmla_270_f16_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cmla_270_f16_x_tied2, svfloat16_t, ++ z0 = svcmla_f16_x (svptrue_b16 (), z1, z0, z2, 270), ++ z0 = svcmla_x (svptrue_b16 (), z1, z0, z2, 270)) ++ ++/* ++** ptrue_cmla_270_f16_x_tied3: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cmla_270_f16_x_tied3, svfloat16_t, ++ z0 = svcmla_f16_x (svptrue_b16 (), z1, z2, z0, 270), ++ z0 = svcmla_x (svptrue_b16 (), z1, z2, z0, 270)) ++ ++/* ++** ptrue_cmla_270_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cmla_270_f16_x_untied, svfloat16_t, ++ z0 = svcmla_f16_x (svptrue_b16 (), z1, z2, z3, 270), ++ z0 = svcmla_x (svptrue_b16 (), z1, z2, z3, 270)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmla_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmla_f32.c +new file mode 100644 +index 000000000..b266738b2 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmla_f32.c +@@ -0,0 +1,675 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmla_0_f32_m_tied1: ++** fcmla z0\.s, p0/m, z1\.s, z2\.s, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_0_f32_m_tied1, svfloat32_t, ++ z0 = svcmla_f32_m (p0, z0, z1, z2, 0), ++ z0 = svcmla_m (p0, z0, z1, z2, 0)) ++ ++/* ++** cmla_0_f32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcmla z0\.s, p0/m, \1\.s, z2\.s, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_0_f32_m_tied2, svfloat32_t, ++ z0 = svcmla_f32_m (p0, z1, z0, z2, 0), ++ z0 = svcmla_m (p0, z1, z0, z2, 0)) ++ ++/* ++** cmla_0_f32_m_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcmla z0\.s, p0/m, z2\.s, \1\.s, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_0_f32_m_tied3, svfloat32_t, ++ z0 = svcmla_f32_m (p0, z1, z2, z0, 0), ++ z0 = svcmla_m (p0, z1, z2, z0, 0)) ++ ++/* ++** cmla_0_f32_m_untied: ++** movprfx z0, z1 ++** fcmla z0\.s, p0/m, z2\.s, z3\.s, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_0_f32_m_untied, svfloat32_t, ++ z0 = svcmla_f32_m (p0, z1, z2, z3, 0), ++ z0 = svcmla_m (p0, z1, z2, z3, 0)) ++ ++/* ++** cmla_90_f32_m_tied1: ++** fcmla z0\.s, p0/m, z1\.s, z2\.s, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_90_f32_m_tied1, svfloat32_t, ++ z0 = svcmla_f32_m (p0, z0, z1, z2, 90), ++ z0 = svcmla_m (p0, z0, z1, z2, 90)) ++ ++/* ++** cmla_90_f32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcmla z0\.s, p0/m, \1\.s, z2\.s, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_90_f32_m_tied2, svfloat32_t, ++ z0 = svcmla_f32_m (p0, z1, z0, z2, 90), ++ z0 = svcmla_m (p0, z1, z0, z2, 90)) ++ ++/* ++** cmla_90_f32_m_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcmla z0\.s, p0/m, z2\.s, \1\.s, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_90_f32_m_tied3, svfloat32_t, ++ z0 = svcmla_f32_m (p0, z1, z2, z0, 90), ++ z0 = svcmla_m (p0, z1, z2, z0, 90)) ++ ++/* ++** cmla_90_f32_m_untied: ++** movprfx z0, z1 ++** fcmla z0\.s, p0/m, z2\.s, z3\.s, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_90_f32_m_untied, svfloat32_t, ++ z0 = svcmla_f32_m (p0, z1, z2, z3, 90), ++ z0 = svcmla_m (p0, z1, z2, z3, 90)) ++ ++/* ++** cmla_180_f32_m_tied1: ++** fcmla z0\.s, p0/m, z1\.s, z2\.s, #180 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_180_f32_m_tied1, svfloat32_t, ++ z0 = svcmla_f32_m (p0, z0, z1, z2, 180), ++ z0 = svcmla_m (p0, z0, z1, z2, 180)) ++ ++/* ++** cmla_180_f32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcmla z0\.s, p0/m, \1\.s, z2\.s, #180 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_180_f32_m_tied2, svfloat32_t, ++ z0 = svcmla_f32_m (p0, z1, z0, z2, 180), ++ z0 = svcmla_m (p0, z1, z0, z2, 180)) ++ ++/* ++** cmla_180_f32_m_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcmla z0\.s, p0/m, z2\.s, \1\.s, #180 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_180_f32_m_tied3, svfloat32_t, ++ z0 = svcmla_f32_m (p0, z1, z2, z0, 180), ++ z0 = svcmla_m (p0, z1, z2, z0, 180)) ++ ++/* ++** cmla_180_f32_m_untied: ++** movprfx z0, z1 ++** fcmla z0\.s, p0/m, z2\.s, z3\.s, #180 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_180_f32_m_untied, svfloat32_t, ++ z0 = svcmla_f32_m (p0, z1, 
z2, z3, 180), ++ z0 = svcmla_m (p0, z1, z2, z3, 180)) ++ ++/* ++** cmla_270_f32_m_tied1: ++** fcmla z0\.s, p0/m, z1\.s, z2\.s, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_270_f32_m_tied1, svfloat32_t, ++ z0 = svcmla_f32_m (p0, z0, z1, z2, 270), ++ z0 = svcmla_m (p0, z0, z1, z2, 270)) ++ ++/* ++** cmla_270_f32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcmla z0\.s, p0/m, \1\.s, z2\.s, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_270_f32_m_tied2, svfloat32_t, ++ z0 = svcmla_f32_m (p0, z1, z0, z2, 270), ++ z0 = svcmla_m (p0, z1, z0, z2, 270)) ++ ++/* ++** cmla_270_f32_m_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcmla z0\.s, p0/m, z2\.s, \1\.s, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_270_f32_m_tied3, svfloat32_t, ++ z0 = svcmla_f32_m (p0, z1, z2, z0, 270), ++ z0 = svcmla_m (p0, z1, z2, z0, 270)) ++ ++/* ++** cmla_270_f32_m_untied: ++** movprfx z0, z1 ++** fcmla z0\.s, p0/m, z2\.s, z3\.s, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_270_f32_m_untied, svfloat32_t, ++ z0 = svcmla_f32_m (p0, z1, z2, z3, 270), ++ z0 = svcmla_m (p0, z1, z2, z3, 270)) ++ ++/* ++** cmla_0_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fcmla z0\.s, p0/m, z1\.s, z2\.s, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_0_f32_z_tied1, svfloat32_t, ++ z0 = svcmla_f32_z (p0, z0, z1, z2, 0), ++ z0 = svcmla_z (p0, z0, z1, z2, 0)) ++ ++/* ++** cmla_0_f32_z_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, z1\.s ++** fcmla z0\.s, p0/m, \1\.s, z2\.s, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_0_f32_z_tied2, svfloat32_t, ++ z0 = svcmla_f32_z (p0, z1, z0, z2, 0), ++ z0 = svcmla_z (p0, z1, z0, z2, 0)) ++ ++/* ++** cmla_0_f32_z_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, z1\.s ++** fcmla z0\.s, p0/m, z2\.s, \1\.s, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_0_f32_z_tied3, svfloat32_t, ++ z0 = svcmla_f32_z (p0, z1, z2, z0, 0), ++ z0 = svcmla_z (p0, z1, z2, z0, 0)) ++ ++/* ++** cmla_0_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** fcmla z0\.s, p0/m, z2\.s, z3\.s, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_0_f32_z_untied, svfloat32_t, ++ z0 = svcmla_f32_z (p0, z1, z2, z3, 0), ++ z0 = svcmla_z (p0, z1, z2, z3, 0)) ++ ++/* ++** cmla_90_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fcmla z0\.s, p0/m, z1\.s, z2\.s, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_90_f32_z_tied1, svfloat32_t, ++ z0 = svcmla_f32_z (p0, z0, z1, z2, 90), ++ z0 = svcmla_z (p0, z0, z1, z2, 90)) ++ ++/* ++** cmla_90_f32_z_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, z1\.s ++** fcmla z0\.s, p0/m, \1\.s, z2\.s, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_90_f32_z_tied2, svfloat32_t, ++ z0 = svcmla_f32_z (p0, z1, z0, z2, 90), ++ z0 = svcmla_z (p0, z1, z0, z2, 90)) ++ ++/* ++** cmla_90_f32_z_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, z1\.s ++** fcmla z0\.s, p0/m, z2\.s, \1\.s, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_90_f32_z_tied3, svfloat32_t, ++ z0 = svcmla_f32_z (p0, z1, z2, z0, 90), ++ z0 = svcmla_z (p0, z1, z2, z0, 90)) ++ ++/* ++** cmla_90_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** fcmla z0\.s, p0/m, z2\.s, z3\.s, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_90_f32_z_untied, svfloat32_t, ++ z0 = svcmla_f32_z (p0, z1, z2, z3, 90), ++ z0 = svcmla_z (p0, z1, z2, z3, 90)) ++ ++/* ++** cmla_180_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fcmla z0\.s, p0/m, z1\.s, z2\.s, #180 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_180_f32_z_tied1, svfloat32_t, ++ z0 = svcmla_f32_z (p0, z0, z1, z2, 180), ++ z0 = svcmla_z (p0, z0, z1, z2, 180)) ++ ++/* ++** cmla_180_f32_z_tied2: ++** mov 
(z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, z1\.s ++** fcmla z0\.s, p0/m, \1\.s, z2\.s, #180 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_180_f32_z_tied2, svfloat32_t, ++ z0 = svcmla_f32_z (p0, z1, z0, z2, 180), ++ z0 = svcmla_z (p0, z1, z0, z2, 180)) ++ ++/* ++** cmla_180_f32_z_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, z1\.s ++** fcmla z0\.s, p0/m, z2\.s, \1\.s, #180 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_180_f32_z_tied3, svfloat32_t, ++ z0 = svcmla_f32_z (p0, z1, z2, z0, 180), ++ z0 = svcmla_z (p0, z1, z2, z0, 180)) ++ ++/* ++** cmla_180_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** fcmla z0\.s, p0/m, z2\.s, z3\.s, #180 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_180_f32_z_untied, svfloat32_t, ++ z0 = svcmla_f32_z (p0, z1, z2, z3, 180), ++ z0 = svcmla_z (p0, z1, z2, z3, 180)) ++ ++/* ++** cmla_270_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fcmla z0\.s, p0/m, z1\.s, z2\.s, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_270_f32_z_tied1, svfloat32_t, ++ z0 = svcmla_f32_z (p0, z0, z1, z2, 270), ++ z0 = svcmla_z (p0, z0, z1, z2, 270)) ++ ++/* ++** cmla_270_f32_z_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, z1\.s ++** fcmla z0\.s, p0/m, \1\.s, z2\.s, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_270_f32_z_tied2, svfloat32_t, ++ z0 = svcmla_f32_z (p0, z1, z0, z2, 270), ++ z0 = svcmla_z (p0, z1, z0, z2, 270)) ++ ++/* ++** cmla_270_f32_z_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, z1\.s ++** fcmla z0\.s, p0/m, z2\.s, \1\.s, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_270_f32_z_tied3, svfloat32_t, ++ z0 = svcmla_f32_z (p0, z1, z2, z0, 270), ++ z0 = svcmla_z (p0, z1, z2, z0, 270)) ++ ++/* ++** cmla_270_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** fcmla z0\.s, p0/m, z2\.s, z3\.s, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_270_f32_z_untied, svfloat32_t, ++ z0 = svcmla_f32_z (p0, z1, z2, z3, 270), ++ z0 = svcmla_z (p0, z1, z2, z3, 270)) ++ ++/* ++** cmla_0_f32_x_tied1: ++** fcmla z0\.s, p0/m, z1\.s, z2\.s, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_0_f32_x_tied1, svfloat32_t, ++ z0 = svcmla_f32_x (p0, z0, z1, z2, 0), ++ z0 = svcmla_x (p0, z0, z1, z2, 0)) ++ ++/* ++** cmla_0_f32_x_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcmla z0\.s, p0/m, \1\.s, z2\.s, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_0_f32_x_tied2, svfloat32_t, ++ z0 = svcmla_f32_x (p0, z1, z0, z2, 0), ++ z0 = svcmla_x (p0, z1, z0, z2, 0)) ++ ++/* ++** cmla_0_f32_x_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcmla z0\.s, p0/m, z2\.s, \1\.s, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_0_f32_x_tied3, svfloat32_t, ++ z0 = svcmla_f32_x (p0, z1, z2, z0, 0), ++ z0 = svcmla_x (p0, z1, z2, z0, 0)) ++ ++/* ++** cmla_0_f32_x_untied: ++** movprfx z0, z1 ++** fcmla z0\.s, p0/m, z2\.s, z3\.s, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_0_f32_x_untied, svfloat32_t, ++ z0 = svcmla_f32_x (p0, z1, z2, z3, 0), ++ z0 = svcmla_x (p0, z1, z2, z3, 0)) ++ ++/* ++** cmla_90_f32_x_tied1: ++** fcmla z0\.s, p0/m, z1\.s, z2\.s, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_90_f32_x_tied1, svfloat32_t, ++ z0 = svcmla_f32_x (p0, z0, z1, z2, 90), ++ z0 = svcmla_x (p0, z0, z1, z2, 90)) ++ ++/* ++** cmla_90_f32_x_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcmla z0\.s, p0/m, \1\.s, z2\.s, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_90_f32_x_tied2, svfloat32_t, ++ z0 = svcmla_f32_x (p0, z1, z0, z2, 90), ++ z0 = svcmla_x (p0, z1, z0, z2, 90)) ++ ++/* ++** cmla_90_f32_x_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcmla z0\.s, p0/m, z2\.s, \1\.s, #90 ++** ret ++*/ ++TEST_UNIFORM_Z 
(cmla_90_f32_x_tied3, svfloat32_t, ++ z0 = svcmla_f32_x (p0, z1, z2, z0, 90), ++ z0 = svcmla_x (p0, z1, z2, z0, 90)) ++ ++/* ++** cmla_90_f32_x_untied: ++** movprfx z0, z1 ++** fcmla z0\.s, p0/m, z2\.s, z3\.s, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_90_f32_x_untied, svfloat32_t, ++ z0 = svcmla_f32_x (p0, z1, z2, z3, 90), ++ z0 = svcmla_x (p0, z1, z2, z3, 90)) ++ ++/* ++** cmla_180_f32_x_tied1: ++** fcmla z0\.s, p0/m, z1\.s, z2\.s, #180 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_180_f32_x_tied1, svfloat32_t, ++ z0 = svcmla_f32_x (p0, z0, z1, z2, 180), ++ z0 = svcmla_x (p0, z0, z1, z2, 180)) ++ ++/* ++** cmla_180_f32_x_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcmla z0\.s, p0/m, \1\.s, z2\.s, #180 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_180_f32_x_tied2, svfloat32_t, ++ z0 = svcmla_f32_x (p0, z1, z0, z2, 180), ++ z0 = svcmla_x (p0, z1, z0, z2, 180)) ++ ++/* ++** cmla_180_f32_x_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcmla z0\.s, p0/m, z2\.s, \1\.s, #180 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_180_f32_x_tied3, svfloat32_t, ++ z0 = svcmla_f32_x (p0, z1, z2, z0, 180), ++ z0 = svcmla_x (p0, z1, z2, z0, 180)) ++ ++/* ++** cmla_180_f32_x_untied: ++** movprfx z0, z1 ++** fcmla z0\.s, p0/m, z2\.s, z3\.s, #180 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_180_f32_x_untied, svfloat32_t, ++ z0 = svcmla_f32_x (p0, z1, z2, z3, 180), ++ z0 = svcmla_x (p0, z1, z2, z3, 180)) ++ ++/* ++** cmla_270_f32_x_tied1: ++** fcmla z0\.s, p0/m, z1\.s, z2\.s, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_270_f32_x_tied1, svfloat32_t, ++ z0 = svcmla_f32_x (p0, z0, z1, z2, 270), ++ z0 = svcmla_x (p0, z0, z1, z2, 270)) ++ ++/* ++** cmla_270_f32_x_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcmla z0\.s, p0/m, \1\.s, z2\.s, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_270_f32_x_tied2, svfloat32_t, ++ z0 = svcmla_f32_x (p0, z1, z0, z2, 270), ++ z0 = svcmla_x (p0, z1, z0, z2, 270)) ++ ++/* ++** cmla_270_f32_x_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcmla z0\.s, p0/m, z2\.s, \1\.s, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_270_f32_x_tied3, svfloat32_t, ++ z0 = svcmla_f32_x (p0, z1, z2, z0, 270), ++ z0 = svcmla_x (p0, z1, z2, z0, 270)) ++ ++/* ++** cmla_270_f32_x_untied: ++** movprfx z0, z1 ++** fcmla z0\.s, p0/m, z2\.s, z3\.s, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_270_f32_x_untied, svfloat32_t, ++ z0 = svcmla_f32_x (p0, z1, z2, z3, 270), ++ z0 = svcmla_x (p0, z1, z2, z3, 270)) ++ ++/* ++** ptrue_cmla_0_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cmla_0_f32_x_tied1, svfloat32_t, ++ z0 = svcmla_f32_x (svptrue_b32 (), z0, z1, z2, 0), ++ z0 = svcmla_x (svptrue_b32 (), z0, z1, z2, 0)) ++ ++/* ++** ptrue_cmla_0_f32_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cmla_0_f32_x_tied2, svfloat32_t, ++ z0 = svcmla_f32_x (svptrue_b32 (), z1, z0, z2, 0), ++ z0 = svcmla_x (svptrue_b32 (), z1, z0, z2, 0)) ++ ++/* ++** ptrue_cmla_0_f32_x_tied3: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cmla_0_f32_x_tied3, svfloat32_t, ++ z0 = svcmla_f32_x (svptrue_b32 (), z1, z2, z0, 0), ++ z0 = svcmla_x (svptrue_b32 (), z1, z2, z0, 0)) ++ ++/* ++** ptrue_cmla_0_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cmla_0_f32_x_untied, svfloat32_t, ++ z0 = svcmla_f32_x (svptrue_b32 (), z1, z2, z3, 0), ++ z0 = svcmla_x (svptrue_b32 (), z1, z2, z3, 0)) ++ ++/* ++** ptrue_cmla_90_f32_x_tied1: ++** ... 
++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cmla_90_f32_x_tied1, svfloat32_t, ++ z0 = svcmla_f32_x (svptrue_b32 (), z0, z1, z2, 90), ++ z0 = svcmla_x (svptrue_b32 (), z0, z1, z2, 90)) ++ ++/* ++** ptrue_cmla_90_f32_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cmla_90_f32_x_tied2, svfloat32_t, ++ z0 = svcmla_f32_x (svptrue_b32 (), z1, z0, z2, 90), ++ z0 = svcmla_x (svptrue_b32 (), z1, z0, z2, 90)) ++ ++/* ++** ptrue_cmla_90_f32_x_tied3: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cmla_90_f32_x_tied3, svfloat32_t, ++ z0 = svcmla_f32_x (svptrue_b32 (), z1, z2, z0, 90), ++ z0 = svcmla_x (svptrue_b32 (), z1, z2, z0, 90)) ++ ++/* ++** ptrue_cmla_90_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cmla_90_f32_x_untied, svfloat32_t, ++ z0 = svcmla_f32_x (svptrue_b32 (), z1, z2, z3, 90), ++ z0 = svcmla_x (svptrue_b32 (), z1, z2, z3, 90)) ++ ++/* ++** ptrue_cmla_180_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cmla_180_f32_x_tied1, svfloat32_t, ++ z0 = svcmla_f32_x (svptrue_b32 (), z0, z1, z2, 180), ++ z0 = svcmla_x (svptrue_b32 (), z0, z1, z2, 180)) ++ ++/* ++** ptrue_cmla_180_f32_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cmla_180_f32_x_tied2, svfloat32_t, ++ z0 = svcmla_f32_x (svptrue_b32 (), z1, z0, z2, 180), ++ z0 = svcmla_x (svptrue_b32 (), z1, z0, z2, 180)) ++ ++/* ++** ptrue_cmla_180_f32_x_tied3: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cmla_180_f32_x_tied3, svfloat32_t, ++ z0 = svcmla_f32_x (svptrue_b32 (), z1, z2, z0, 180), ++ z0 = svcmla_x (svptrue_b32 (), z1, z2, z0, 180)) ++ ++/* ++** ptrue_cmla_180_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cmla_180_f32_x_untied, svfloat32_t, ++ z0 = svcmla_f32_x (svptrue_b32 (), z1, z2, z3, 180), ++ z0 = svcmla_x (svptrue_b32 (), z1, z2, z3, 180)) ++ ++/* ++** ptrue_cmla_270_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cmla_270_f32_x_tied1, svfloat32_t, ++ z0 = svcmla_f32_x (svptrue_b32 (), z0, z1, z2, 270), ++ z0 = svcmla_x (svptrue_b32 (), z0, z1, z2, 270)) ++ ++/* ++** ptrue_cmla_270_f32_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cmla_270_f32_x_tied2, svfloat32_t, ++ z0 = svcmla_f32_x (svptrue_b32 (), z1, z0, z2, 270), ++ z0 = svcmla_x (svptrue_b32 (), z1, z0, z2, 270)) ++ ++/* ++** ptrue_cmla_270_f32_x_tied3: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cmla_270_f32_x_tied3, svfloat32_t, ++ z0 = svcmla_f32_x (svptrue_b32 (), z1, z2, z0, 270), ++ z0 = svcmla_x (svptrue_b32 (), z1, z2, z0, 270)) ++ ++/* ++** ptrue_cmla_270_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cmla_270_f32_x_untied, svfloat32_t, ++ z0 = svcmla_f32_x (svptrue_b32 (), z1, z2, z3, 270), ++ z0 = svcmla_x (svptrue_b32 (), z1, z2, z3, 270)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmla_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmla_f64.c +new file mode 100644 +index 000000000..024ae5ce3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmla_f64.c +@@ -0,0 +1,675 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmla_0_f64_m_tied1: ++** fcmla z0\.d, p0/m, z1\.d, z2\.d, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_0_f64_m_tied1, svfloat64_t, ++ z0 = svcmla_f64_m (p0, z0, z1, z2, 0), ++ z0 = svcmla_m (p0, z0, z1, z2, 0)) ++ ++/* ++** cmla_0_f64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fcmla z0\.d, p0/m, \1, z2\.d, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_0_f64_m_tied2, svfloat64_t, ++ z0 = svcmla_f64_m (p0, z1, z0, z2, 0), ++ z0 = svcmla_m (p0, z1, z0, z2, 0)) ++ ++/* ++** cmla_0_f64_m_tied3: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fcmla z0\.d, p0/m, z2\.d, \1, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_0_f64_m_tied3, svfloat64_t, ++ z0 = svcmla_f64_m (p0, z1, z2, z0, 0), ++ z0 = svcmla_m (p0, z1, z2, z0, 0)) ++ ++/* ++** cmla_0_f64_m_untied: ++** movprfx z0, z1 ++** fcmla z0\.d, p0/m, z2\.d, z3\.d, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_0_f64_m_untied, svfloat64_t, ++ z0 = svcmla_f64_m (p0, z1, z2, z3, 0), ++ z0 = svcmla_m (p0, z1, z2, z3, 0)) ++ ++/* ++** cmla_90_f64_m_tied1: ++** fcmla z0\.d, p0/m, z1\.d, z2\.d, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_90_f64_m_tied1, svfloat64_t, ++ z0 = svcmla_f64_m (p0, z0, z1, z2, 90), ++ z0 = svcmla_m (p0, z0, z1, z2, 90)) ++ ++/* ++** cmla_90_f64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fcmla z0\.d, p0/m, \1, z2\.d, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_90_f64_m_tied2, svfloat64_t, ++ z0 = svcmla_f64_m (p0, z1, z0, z2, 90), ++ z0 = svcmla_m (p0, z1, z0, z2, 90)) ++ ++/* ++** cmla_90_f64_m_tied3: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fcmla z0\.d, p0/m, z2\.d, \1, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_90_f64_m_tied3, svfloat64_t, ++ z0 = svcmla_f64_m (p0, z1, z2, z0, 90), ++ z0 = svcmla_m (p0, z1, z2, z0, 90)) ++ ++/* ++** cmla_90_f64_m_untied: ++** movprfx z0, z1 ++** fcmla z0\.d, p0/m, z2\.d, z3\.d, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_90_f64_m_untied, svfloat64_t, ++ z0 = svcmla_f64_m (p0, z1, z2, z3, 90), ++ z0 = svcmla_m (p0, z1, z2, z3, 90)) ++ ++/* ++** cmla_180_f64_m_tied1: ++** fcmla z0\.d, p0/m, z1\.d, z2\.d, #180 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_180_f64_m_tied1, svfloat64_t, ++ z0 = svcmla_f64_m (p0, z0, z1, z2, 180), ++ z0 = svcmla_m (p0, z0, z1, z2, 180)) ++ ++/* ++** cmla_180_f64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fcmla z0\.d, p0/m, \1, z2\.d, #180 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_180_f64_m_tied2, svfloat64_t, ++ z0 = svcmla_f64_m (p0, z1, z0, z2, 180), ++ z0 = svcmla_m (p0, z1, z0, z2, 180)) ++ ++/* ++** cmla_180_f64_m_tied3: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fcmla z0\.d, p0/m, z2\.d, \1, #180 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_180_f64_m_tied3, svfloat64_t, ++ z0 = svcmla_f64_m (p0, z1, z2, z0, 180), ++ z0 = svcmla_m (p0, z1, z2, z0, 180)) ++ ++/* ++** cmla_180_f64_m_untied: ++** movprfx z0, z1 ++** fcmla z0\.d, p0/m, z2\.d, z3\.d, #180 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_180_f64_m_untied, svfloat64_t, ++ z0 = svcmla_f64_m (p0, z1, z2, z3, 180), ++ 
z0 = svcmla_m (p0, z1, z2, z3, 180)) ++ ++/* ++** cmla_270_f64_m_tied1: ++** fcmla z0\.d, p0/m, z1\.d, z2\.d, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_270_f64_m_tied1, svfloat64_t, ++ z0 = svcmla_f64_m (p0, z0, z1, z2, 270), ++ z0 = svcmla_m (p0, z0, z1, z2, 270)) ++ ++/* ++** cmla_270_f64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fcmla z0\.d, p0/m, \1, z2\.d, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_270_f64_m_tied2, svfloat64_t, ++ z0 = svcmla_f64_m (p0, z1, z0, z2, 270), ++ z0 = svcmla_m (p0, z1, z0, z2, 270)) ++ ++/* ++** cmla_270_f64_m_tied3: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fcmla z0\.d, p0/m, z2\.d, \1, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_270_f64_m_tied3, svfloat64_t, ++ z0 = svcmla_f64_m (p0, z1, z2, z0, 270), ++ z0 = svcmla_m (p0, z1, z2, z0, 270)) ++ ++/* ++** cmla_270_f64_m_untied: ++** movprfx z0, z1 ++** fcmla z0\.d, p0/m, z2\.d, z3\.d, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_270_f64_m_untied, svfloat64_t, ++ z0 = svcmla_f64_m (p0, z1, z2, z3, 270), ++ z0 = svcmla_m (p0, z1, z2, z3, 270)) ++ ++/* ++** cmla_0_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fcmla z0\.d, p0/m, z1\.d, z2\.d, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_0_f64_z_tied1, svfloat64_t, ++ z0 = svcmla_f64_z (p0, z0, z1, z2, 0), ++ z0 = svcmla_z (p0, z0, z1, z2, 0)) ++ ++/* ++** cmla_0_f64_z_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, z1\.d ++** fcmla z0\.d, p0/m, \1, z2\.d, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_0_f64_z_tied2, svfloat64_t, ++ z0 = svcmla_f64_z (p0, z1, z0, z2, 0), ++ z0 = svcmla_z (p0, z1, z0, z2, 0)) ++ ++/* ++** cmla_0_f64_z_tied3: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, z1\.d ++** fcmla z0\.d, p0/m, z2\.d, \1, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_0_f64_z_tied3, svfloat64_t, ++ z0 = svcmla_f64_z (p0, z1, z2, z0, 0), ++ z0 = svcmla_z (p0, z1, z2, z0, 0)) ++ ++/* ++** cmla_0_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** fcmla z0\.d, p0/m, z2\.d, z3\.d, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_0_f64_z_untied, svfloat64_t, ++ z0 = svcmla_f64_z (p0, z1, z2, z3, 0), ++ z0 = svcmla_z (p0, z1, z2, z3, 0)) ++ ++/* ++** cmla_90_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fcmla z0\.d, p0/m, z1\.d, z2\.d, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_90_f64_z_tied1, svfloat64_t, ++ z0 = svcmla_f64_z (p0, z0, z1, z2, 90), ++ z0 = svcmla_z (p0, z0, z1, z2, 90)) ++ ++/* ++** cmla_90_f64_z_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, z1\.d ++** fcmla z0\.d, p0/m, \1, z2\.d, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_90_f64_z_tied2, svfloat64_t, ++ z0 = svcmla_f64_z (p0, z1, z0, z2, 90), ++ z0 = svcmla_z (p0, z1, z0, z2, 90)) ++ ++/* ++** cmla_90_f64_z_tied3: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, z1\.d ++** fcmla z0\.d, p0/m, z2\.d, \1, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_90_f64_z_tied3, svfloat64_t, ++ z0 = svcmla_f64_z (p0, z1, z2, z0, 90), ++ z0 = svcmla_z (p0, z1, z2, z0, 90)) ++ ++/* ++** cmla_90_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** fcmla z0\.d, p0/m, z2\.d, z3\.d, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_90_f64_z_untied, svfloat64_t, ++ z0 = svcmla_f64_z (p0, z1, z2, z3, 90), ++ z0 = svcmla_z (p0, z1, z2, z3, 90)) ++ ++/* ++** cmla_180_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fcmla z0\.d, p0/m, z1\.d, z2\.d, #180 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_180_f64_z_tied1, svfloat64_t, ++ z0 = svcmla_f64_z (p0, z0, z1, z2, 180), ++ z0 = svcmla_z (p0, z0, z1, z2, 180)) ++ ++/* ++** cmla_180_f64_z_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, 
z1\.d ++** fcmla z0\.d, p0/m, \1, z2\.d, #180 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_180_f64_z_tied2, svfloat64_t, ++ z0 = svcmla_f64_z (p0, z1, z0, z2, 180), ++ z0 = svcmla_z (p0, z1, z0, z2, 180)) ++ ++/* ++** cmla_180_f64_z_tied3: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, z1\.d ++** fcmla z0\.d, p0/m, z2\.d, \1, #180 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_180_f64_z_tied3, svfloat64_t, ++ z0 = svcmla_f64_z (p0, z1, z2, z0, 180), ++ z0 = svcmla_z (p0, z1, z2, z0, 180)) ++ ++/* ++** cmla_180_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** fcmla z0\.d, p0/m, z2\.d, z3\.d, #180 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_180_f64_z_untied, svfloat64_t, ++ z0 = svcmla_f64_z (p0, z1, z2, z3, 180), ++ z0 = svcmla_z (p0, z1, z2, z3, 180)) ++ ++/* ++** cmla_270_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fcmla z0\.d, p0/m, z1\.d, z2\.d, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_270_f64_z_tied1, svfloat64_t, ++ z0 = svcmla_f64_z (p0, z0, z1, z2, 270), ++ z0 = svcmla_z (p0, z0, z1, z2, 270)) ++ ++/* ++** cmla_270_f64_z_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, z1\.d ++** fcmla z0\.d, p0/m, \1, z2\.d, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_270_f64_z_tied2, svfloat64_t, ++ z0 = svcmla_f64_z (p0, z1, z0, z2, 270), ++ z0 = svcmla_z (p0, z1, z0, z2, 270)) ++ ++/* ++** cmla_270_f64_z_tied3: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, z1\.d ++** fcmla z0\.d, p0/m, z2\.d, \1, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_270_f64_z_tied3, svfloat64_t, ++ z0 = svcmla_f64_z (p0, z1, z2, z0, 270), ++ z0 = svcmla_z (p0, z1, z2, z0, 270)) ++ ++/* ++** cmla_270_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** fcmla z0\.d, p0/m, z2\.d, z3\.d, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_270_f64_z_untied, svfloat64_t, ++ z0 = svcmla_f64_z (p0, z1, z2, z3, 270), ++ z0 = svcmla_z (p0, z1, z2, z3, 270)) ++ ++/* ++** cmla_0_f64_x_tied1: ++** fcmla z0\.d, p0/m, z1\.d, z2\.d, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_0_f64_x_tied1, svfloat64_t, ++ z0 = svcmla_f64_x (p0, z0, z1, z2, 0), ++ z0 = svcmla_x (p0, z0, z1, z2, 0)) ++ ++/* ++** cmla_0_f64_x_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fcmla z0\.d, p0/m, \1, z2\.d, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_0_f64_x_tied2, svfloat64_t, ++ z0 = svcmla_f64_x (p0, z1, z0, z2, 0), ++ z0 = svcmla_x (p0, z1, z0, z2, 0)) ++ ++/* ++** cmla_0_f64_x_tied3: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fcmla z0\.d, p0/m, z2\.d, \1, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_0_f64_x_tied3, svfloat64_t, ++ z0 = svcmla_f64_x (p0, z1, z2, z0, 0), ++ z0 = svcmla_x (p0, z1, z2, z0, 0)) ++ ++/* ++** cmla_0_f64_x_untied: ++** movprfx z0, z1 ++** fcmla z0\.d, p0/m, z2\.d, z3\.d, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_0_f64_x_untied, svfloat64_t, ++ z0 = svcmla_f64_x (p0, z1, z2, z3, 0), ++ z0 = svcmla_x (p0, z1, z2, z3, 0)) ++ ++/* ++** cmla_90_f64_x_tied1: ++** fcmla z0\.d, p0/m, z1\.d, z2\.d, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_90_f64_x_tied1, svfloat64_t, ++ z0 = svcmla_f64_x (p0, z0, z1, z2, 90), ++ z0 = svcmla_x (p0, z0, z1, z2, 90)) ++ ++/* ++** cmla_90_f64_x_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fcmla z0\.d, p0/m, \1, z2\.d, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_90_f64_x_tied2, svfloat64_t, ++ z0 = svcmla_f64_x (p0, z1, z0, z2, 90), ++ z0 = svcmla_x (p0, z1, z0, z2, 90)) ++ ++/* ++** cmla_90_f64_x_tied3: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fcmla z0\.d, p0/m, z2\.d, \1, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_90_f64_x_tied3, svfloat64_t, ++ z0 = svcmla_f64_x (p0, z1, z2, z0, 
90), ++ z0 = svcmla_x (p0, z1, z2, z0, 90)) ++ ++/* ++** cmla_90_f64_x_untied: ++** movprfx z0, z1 ++** fcmla z0\.d, p0/m, z2\.d, z3\.d, #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_90_f64_x_untied, svfloat64_t, ++ z0 = svcmla_f64_x (p0, z1, z2, z3, 90), ++ z0 = svcmla_x (p0, z1, z2, z3, 90)) ++ ++/* ++** cmla_180_f64_x_tied1: ++** fcmla z0\.d, p0/m, z1\.d, z2\.d, #180 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_180_f64_x_tied1, svfloat64_t, ++ z0 = svcmla_f64_x (p0, z0, z1, z2, 180), ++ z0 = svcmla_x (p0, z0, z1, z2, 180)) ++ ++/* ++** cmla_180_f64_x_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fcmla z0\.d, p0/m, \1, z2\.d, #180 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_180_f64_x_tied2, svfloat64_t, ++ z0 = svcmla_f64_x (p0, z1, z0, z2, 180), ++ z0 = svcmla_x (p0, z1, z0, z2, 180)) ++ ++/* ++** cmla_180_f64_x_tied3: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fcmla z0\.d, p0/m, z2\.d, \1, #180 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_180_f64_x_tied3, svfloat64_t, ++ z0 = svcmla_f64_x (p0, z1, z2, z0, 180), ++ z0 = svcmla_x (p0, z1, z2, z0, 180)) ++ ++/* ++** cmla_180_f64_x_untied: ++** movprfx z0, z1 ++** fcmla z0\.d, p0/m, z2\.d, z3\.d, #180 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_180_f64_x_untied, svfloat64_t, ++ z0 = svcmla_f64_x (p0, z1, z2, z3, 180), ++ z0 = svcmla_x (p0, z1, z2, z3, 180)) ++ ++/* ++** cmla_270_f64_x_tied1: ++** fcmla z0\.d, p0/m, z1\.d, z2\.d, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_270_f64_x_tied1, svfloat64_t, ++ z0 = svcmla_f64_x (p0, z0, z1, z2, 270), ++ z0 = svcmla_x (p0, z0, z1, z2, 270)) ++ ++/* ++** cmla_270_f64_x_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fcmla z0\.d, p0/m, \1, z2\.d, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_270_f64_x_tied2, svfloat64_t, ++ z0 = svcmla_f64_x (p0, z1, z0, z2, 270), ++ z0 = svcmla_x (p0, z1, z0, z2, 270)) ++ ++/* ++** cmla_270_f64_x_tied3: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fcmla z0\.d, p0/m, z2\.d, \1, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_270_f64_x_tied3, svfloat64_t, ++ z0 = svcmla_f64_x (p0, z1, z2, z0, 270), ++ z0 = svcmla_x (p0, z1, z2, z0, 270)) ++ ++/* ++** cmla_270_f64_x_untied: ++** movprfx z0, z1 ++** fcmla z0\.d, p0/m, z2\.d, z3\.d, #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_270_f64_x_untied, svfloat64_t, ++ z0 = svcmla_f64_x (p0, z1, z2, z3, 270), ++ z0 = svcmla_x (p0, z1, z2, z3, 270)) ++ ++/* ++** ptrue_cmla_0_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cmla_0_f64_x_tied1, svfloat64_t, ++ z0 = svcmla_f64_x (svptrue_b64 (), z0, z1, z2, 0), ++ z0 = svcmla_x (svptrue_b64 (), z0, z1, z2, 0)) ++ ++/* ++** ptrue_cmla_0_f64_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cmla_0_f64_x_tied2, svfloat64_t, ++ z0 = svcmla_f64_x (svptrue_b64 (), z1, z0, z2, 0), ++ z0 = svcmla_x (svptrue_b64 (), z1, z0, z2, 0)) ++ ++/* ++** ptrue_cmla_0_f64_x_tied3: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cmla_0_f64_x_tied3, svfloat64_t, ++ z0 = svcmla_f64_x (svptrue_b64 (), z1, z2, z0, 0), ++ z0 = svcmla_x (svptrue_b64 (), z1, z2, z0, 0)) ++ ++/* ++** ptrue_cmla_0_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cmla_0_f64_x_untied, svfloat64_t, ++ z0 = svcmla_f64_x (svptrue_b64 (), z1, z2, z3, 0), ++ z0 = svcmla_x (svptrue_b64 (), z1, z2, z3, 0)) ++ ++/* ++** ptrue_cmla_90_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cmla_90_f64_x_tied1, svfloat64_t, ++ z0 = svcmla_f64_x (svptrue_b64 (), z0, z1, z2, 90), ++ z0 = svcmla_x (svptrue_b64 (), z0, z1, z2, 90)) ++ ++/* ++** ptrue_cmla_90_f64_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cmla_90_f64_x_tied2, svfloat64_t, ++ z0 = svcmla_f64_x (svptrue_b64 (), z1, z0, z2, 90), ++ z0 = svcmla_x (svptrue_b64 (), z1, z0, z2, 90)) ++ ++/* ++** ptrue_cmla_90_f64_x_tied3: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cmla_90_f64_x_tied3, svfloat64_t, ++ z0 = svcmla_f64_x (svptrue_b64 (), z1, z2, z0, 90), ++ z0 = svcmla_x (svptrue_b64 (), z1, z2, z0, 90)) ++ ++/* ++** ptrue_cmla_90_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cmla_90_f64_x_untied, svfloat64_t, ++ z0 = svcmla_f64_x (svptrue_b64 (), z1, z2, z3, 90), ++ z0 = svcmla_x (svptrue_b64 (), z1, z2, z3, 90)) ++ ++/* ++** ptrue_cmla_180_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cmla_180_f64_x_tied1, svfloat64_t, ++ z0 = svcmla_f64_x (svptrue_b64 (), z0, z1, z2, 180), ++ z0 = svcmla_x (svptrue_b64 (), z0, z1, z2, 180)) ++ ++/* ++** ptrue_cmla_180_f64_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cmla_180_f64_x_tied2, svfloat64_t, ++ z0 = svcmla_f64_x (svptrue_b64 (), z1, z0, z2, 180), ++ z0 = svcmla_x (svptrue_b64 (), z1, z0, z2, 180)) ++ ++/* ++** ptrue_cmla_180_f64_x_tied3: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cmla_180_f64_x_tied3, svfloat64_t, ++ z0 = svcmla_f64_x (svptrue_b64 (), z1, z2, z0, 180), ++ z0 = svcmla_x (svptrue_b64 (), z1, z2, z0, 180)) ++ ++/* ++** ptrue_cmla_180_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cmla_180_f64_x_untied, svfloat64_t, ++ z0 = svcmla_f64_x (svptrue_b64 (), z1, z2, z3, 180), ++ z0 = svcmla_x (svptrue_b64 (), z1, z2, z3, 180)) ++ ++/* ++** ptrue_cmla_270_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cmla_270_f64_x_tied1, svfloat64_t, ++ z0 = svcmla_f64_x (svptrue_b64 (), z0, z1, z2, 270), ++ z0 = svcmla_x (svptrue_b64 (), z0, z1, z2, 270)) ++ ++/* ++** ptrue_cmla_270_f64_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cmla_270_f64_x_tied2, svfloat64_t, ++ z0 = svcmla_f64_x (svptrue_b64 (), z1, z0, z2, 270), ++ z0 = svcmla_x (svptrue_b64 (), z1, z0, z2, 270)) ++ ++/* ++** ptrue_cmla_270_f64_x_tied3: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cmla_270_f64_x_tied3, svfloat64_t, ++ z0 = svcmla_f64_x (svptrue_b64 (), z1, z2, z0, 270), ++ z0 = svcmla_x (svptrue_b64 (), z1, z2, z0, 270)) ++ ++/* ++** ptrue_cmla_270_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_cmla_270_f64_x_untied, svfloat64_t, ++ z0 = svcmla_f64_x (svptrue_b64 (), z1, z2, z3, 270), ++ z0 = svcmla_x (svptrue_b64 (), z1, z2, z3, 270)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmla_lane_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmla_lane_f16.c +new file mode 100644 +index 000000000..16f1b77ad +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmla_lane_f16.c +@@ -0,0 +1,194 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmla_lane_0_0_f16_tied1: ++** fcmla z0\.h, z1\.h, z2\.h\[0\], #0 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_lane_0_0_f16_tied1, svfloat16_t, ++ z0 = svcmla_lane_f16 (z0, z1, z2, 0, 0), ++ z0 = svcmla_lane (z0, z1, z2, 0, 0)) ++ ++/* ++** cmla_lane_0_0_f16_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcmla z0\.h, \1\.h, z2\.h\[0\], #0 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_lane_0_0_f16_tied2, svfloat16_t, ++ z0 = svcmla_lane_f16 (z1, z0, z2, 0, 0), ++ z0 = svcmla_lane (z1, z0, z2, 0, 0)) ++ ++/* ++** cmla_lane_0_0_f16_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcmla z0\.h, z2\.h, \1\.h\[0\], #0 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_lane_0_0_f16_tied3, svfloat16_t, ++ z0 = svcmla_lane_f16 (z1, z2, z0, 0, 0), ++ z0 = svcmla_lane (z1, z2, z0, 0, 0)) ++ ++/* ++** cmla_lane_0_0_f16_untied: ++** movprfx z0, z1 ++** fcmla z0\.h, z2\.h, z3\.h\[0\], #0 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_lane_0_0_f16_untied, svfloat16_t, ++ z0 = svcmla_lane_f16 (z1, z2, z3, 0, 0), ++ z0 = svcmla_lane (z1, z2, z3, 0, 0)) ++ ++/* ++** cmla_lane_0_90_f16_tied1: ++** fcmla z0\.h, z1\.h, z2\.h\[0\], #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_lane_0_90_f16_tied1, svfloat16_t, ++ z0 = svcmla_lane_f16 (z0, z1, z2, 0, 90), ++ z0 = svcmla_lane (z0, z1, z2, 0, 90)) ++ ++/* ++** cmla_lane_0_90_f16_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcmla z0\.h, \1\.h, z2\.h\[0\], #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_lane_0_90_f16_tied2, svfloat16_t, ++ z0 = svcmla_lane_f16 (z1, z0, z2, 0, 90), ++ z0 = svcmla_lane (z1, z0, z2, 0, 90)) ++ ++/* ++** cmla_lane_0_90_f16_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcmla z0\.h, z2\.h, \1\.h\[0\], #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_lane_0_90_f16_tied3, svfloat16_t, ++ z0 = svcmla_lane_f16 (z1, z2, z0, 0, 90), ++ z0 = svcmla_lane (z1, z2, z0, 0, 90)) ++ ++/* ++** cmla_lane_0_90_f16_untied: ++** movprfx z0, z1 ++** fcmla z0\.h, z2\.h, z3\.h\[0\], #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_lane_0_90_f16_untied, svfloat16_t, ++ z0 = svcmla_lane_f16 (z1, z2, z3, 0, 90), ++ z0 = svcmla_lane (z1, z2, z3, 0, 90)) ++ ++/* ++** cmla_lane_0_180_f16_tied1: ++** fcmla z0\.h, z1\.h, z2\.h\[0\], #180 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_lane_0_180_f16_tied1, svfloat16_t, ++ z0 = svcmla_lane_f16 (z0, z1, z2, 0, 180), ++ z0 = svcmla_lane (z0, z1, z2, 0, 180)) ++ ++/* ++** cmla_lane_0_180_f16_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcmla z0\.h, \1\.h, z2\.h\[0\], #180 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_lane_0_180_f16_tied2, svfloat16_t, ++ z0 = svcmla_lane_f16 (z1, z0, z2, 0, 180), ++ z0 = svcmla_lane (z1, z0, z2, 0, 180)) ++ ++/* ++** cmla_lane_0_180_f16_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcmla z0\.h, z2\.h, \1\.h\[0\], #180 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_lane_0_180_f16_tied3, svfloat16_t, ++ z0 = svcmla_lane_f16 (z1, z2, z0, 0, 180), ++ z0 = svcmla_lane (z1, z2, z0, 0, 180)) ++ ++/* ++** 
cmla_lane_0_180_f16_untied: ++** movprfx z0, z1 ++** fcmla z0\.h, z2\.h, z3\.h\[0\], #180 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_lane_0_180_f16_untied, svfloat16_t, ++ z0 = svcmla_lane_f16 (z1, z2, z3, 0, 180), ++ z0 = svcmla_lane (z1, z2, z3, 0, 180)) ++ ++/* ++** cmla_lane_0_270_f16_tied1: ++** fcmla z0\.h, z1\.h, z2\.h\[0\], #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_lane_0_270_f16_tied1, svfloat16_t, ++ z0 = svcmla_lane_f16 (z0, z1, z2, 0, 270), ++ z0 = svcmla_lane (z0, z1, z2, 0, 270)) ++ ++/* ++** cmla_lane_0_270_f16_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcmla z0\.h, \1\.h, z2\.h\[0\], #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_lane_0_270_f16_tied2, svfloat16_t, ++ z0 = svcmla_lane_f16 (z1, z0, z2, 0, 270), ++ z0 = svcmla_lane (z1, z0, z2, 0, 270)) ++ ++/* ++** cmla_lane_0_270_f16_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcmla z0\.h, z2\.h, \1\.h\[0\], #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_lane_0_270_f16_tied3, svfloat16_t, ++ z0 = svcmla_lane_f16 (z1, z2, z0, 0, 270), ++ z0 = svcmla_lane (z1, z2, z0, 0, 270)) ++ ++/* ++** cmla_lane_0_270_f16_untied: ++** movprfx z0, z1 ++** fcmla z0\.h, z2\.h, z3\.h\[0\], #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_lane_0_270_f16_untied, svfloat16_t, ++ z0 = svcmla_lane_f16 (z1, z2, z3, 0, 270), ++ z0 = svcmla_lane (z1, z2, z3, 0, 270)) ++ ++/* ++** cmla_lane_1_f16: ++** fcmla z0\.h, z1\.h, z2\.h\[1\], #0 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_lane_1_f16, svfloat16_t, ++ z0 = svcmla_lane_f16 (z0, z1, z2, 1, 0), ++ z0 = svcmla_lane (z0, z1, z2, 1, 0)) ++ ++/* ++** cmla_lane_2_f16: ++** fcmla z0\.h, z1\.h, z2\.h\[2\], #0 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_lane_2_f16, svfloat16_t, ++ z0 = svcmla_lane_f16 (z0, z1, z2, 2, 0), ++ z0 = svcmla_lane (z0, z1, z2, 2, 0)) ++ ++/* ++** cmla_lane_3_f16: ++** fcmla z0\.h, z1\.h, z2\.h\[3\], #0 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_lane_3_f16, svfloat16_t, ++ z0 = svcmla_lane_f16 (z0, z1, z2, 3, 0), ++ z0 = svcmla_lane (z0, z1, z2, 3, 0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmla_lane_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmla_lane_f32.c +new file mode 100644 +index 000000000..85bff68fd +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmla_lane_f32.c +@@ -0,0 +1,176 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmla_lane_0_0_f32_tied1: ++** fcmla z0\.s, z1\.s, z2\.s\[0\], #0 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_lane_0_0_f32_tied1, svfloat32_t, ++ z0 = svcmla_lane_f32 (z0, z1, z2, 0, 0), ++ z0 = svcmla_lane (z0, z1, z2, 0, 0)) ++ ++/* ++** cmla_lane_0_0_f32_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcmla z0\.s, \1\.s, z2\.s\[0\], #0 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_lane_0_0_f32_tied2, svfloat32_t, ++ z0 = svcmla_lane_f32 (z1, z0, z2, 0, 0), ++ z0 = svcmla_lane (z1, z0, z2, 0, 0)) ++ ++/* ++** cmla_lane_0_0_f32_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcmla z0\.s, z2\.s, \1\.s\[0\], #0 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_lane_0_0_f32_tied3, svfloat32_t, ++ z0 = svcmla_lane_f32 (z1, z2, z0, 0, 0), ++ z0 = svcmla_lane (z1, z2, z0, 0, 0)) ++ ++/* ++** cmla_lane_0_0_f32_untied: ++** movprfx z0, z1 ++** fcmla z0\.s, z2\.s, z3\.s\[0\], #0 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_lane_0_0_f32_untied, svfloat32_t, ++ z0 = svcmla_lane_f32 (z1, z2, z3, 0, 0), ++ z0 = svcmla_lane (z1, z2, z3, 0, 0)) ++ ++/* ++** cmla_lane_0_90_f32_tied1: ++** fcmla z0\.s, z1\.s, z2\.s\[0\], #90 ++** ret ++*/ ++TEST_UNIFORM_Z 
(cmla_lane_0_90_f32_tied1, svfloat32_t, ++ z0 = svcmla_lane_f32 (z0, z1, z2, 0, 90), ++ z0 = svcmla_lane (z0, z1, z2, 0, 90)) ++ ++/* ++** cmla_lane_0_90_f32_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcmla z0\.s, \1\.s, z2\.s\[0\], #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_lane_0_90_f32_tied2, svfloat32_t, ++ z0 = svcmla_lane_f32 (z1, z0, z2, 0, 90), ++ z0 = svcmla_lane (z1, z0, z2, 0, 90)) ++ ++/* ++** cmla_lane_0_90_f32_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcmla z0\.s, z2\.s, \1\.s\[0\], #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_lane_0_90_f32_tied3, svfloat32_t, ++ z0 = svcmla_lane_f32 (z1, z2, z0, 0, 90), ++ z0 = svcmla_lane (z1, z2, z0, 0, 90)) ++ ++/* ++** cmla_lane_0_90_f32_untied: ++** movprfx z0, z1 ++** fcmla z0\.s, z2\.s, z3\.s\[0\], #90 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_lane_0_90_f32_untied, svfloat32_t, ++ z0 = svcmla_lane_f32 (z1, z2, z3, 0, 90), ++ z0 = svcmla_lane (z1, z2, z3, 0, 90)) ++ ++/* ++** cmla_lane_0_180_f32_tied1: ++** fcmla z0\.s, z1\.s, z2\.s\[0\], #180 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_lane_0_180_f32_tied1, svfloat32_t, ++ z0 = svcmla_lane_f32 (z0, z1, z2, 0, 180), ++ z0 = svcmla_lane (z0, z1, z2, 0, 180)) ++ ++/* ++** cmla_lane_0_180_f32_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcmla z0\.s, \1\.s, z2\.s\[0\], #180 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_lane_0_180_f32_tied2, svfloat32_t, ++ z0 = svcmla_lane_f32 (z1, z0, z2, 0, 180), ++ z0 = svcmla_lane (z1, z0, z2, 0, 180)) ++ ++/* ++** cmla_lane_0_180_f32_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcmla z0\.s, z2\.s, \1\.s\[0\], #180 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_lane_0_180_f32_tied3, svfloat32_t, ++ z0 = svcmla_lane_f32 (z1, z2, z0, 0, 180), ++ z0 = svcmla_lane (z1, z2, z0, 0, 180)) ++ ++/* ++** cmla_lane_0_180_f32_untied: ++** movprfx z0, z1 ++** fcmla z0\.s, z2\.s, z3\.s\[0\], #180 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_lane_0_180_f32_untied, svfloat32_t, ++ z0 = svcmla_lane_f32 (z1, z2, z3, 0, 180), ++ z0 = svcmla_lane (z1, z2, z3, 0, 180)) ++ ++/* ++** cmla_lane_0_270_f32_tied1: ++** fcmla z0\.s, z1\.s, z2\.s\[0\], #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_lane_0_270_f32_tied1, svfloat32_t, ++ z0 = svcmla_lane_f32 (z0, z1, z2, 0, 270), ++ z0 = svcmla_lane (z0, z1, z2, 0, 270)) ++ ++/* ++** cmla_lane_0_270_f32_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcmla z0\.s, \1\.s, z2\.s\[0\], #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_lane_0_270_f32_tied2, svfloat32_t, ++ z0 = svcmla_lane_f32 (z1, z0, z2, 0, 270), ++ z0 = svcmla_lane (z1, z0, z2, 0, 270)) ++ ++/* ++** cmla_lane_0_270_f32_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fcmla z0\.s, z2\.s, \1\.s\[0\], #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_lane_0_270_f32_tied3, svfloat32_t, ++ z0 = svcmla_lane_f32 (z1, z2, z0, 0, 270), ++ z0 = svcmla_lane (z1, z2, z0, 0, 270)) ++ ++/* ++** cmla_lane_0_270_f32_untied: ++** movprfx z0, z1 ++** fcmla z0\.s, z2\.s, z3\.s\[0\], #270 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_lane_0_270_f32_untied, svfloat32_t, ++ z0 = svcmla_lane_f32 (z1, z2, z3, 0, 270), ++ z0 = svcmla_lane (z1, z2, z3, 0, 270)) ++ ++/* ++** cmla_lane_1_f32: ++** fcmla z0\.s, z1\.s, z2\.s\[1\], #0 ++** ret ++*/ ++TEST_UNIFORM_Z (cmla_lane_1_f32, svfloat32_t, ++ z0 = svcmla_lane_f32 (z0, z1, z2, 1, 0), ++ z0 = svcmla_lane (z0, z1, z2, 1, 0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_f16.c +new file mode 100644 +index 000000000..7149ad300 +--- /dev/null ++++ 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_f16.c +@@ -0,0 +1,50 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpeq_f16_tied: ++** fcmeq p0\.h, p0/z, (z0\.h, z1\.h|z1\.h, z0\.h) ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_f16_tied, svfloat16_t, ++ p0 = svcmpeq_f16 (p0, z0, z1), ++ p0 = svcmpeq (p0, z0, z1)) ++ ++/* ++** cmpeq_f16_untied: ++** fcmeq p0\.h, p1/z, (z0\.h, z1\.h|z1\.h, z0\.h) ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_f16_untied, svfloat16_t, ++ p0 = svcmpeq_f16 (p1, z0, z1), ++ p0 = svcmpeq (p1, z0, z1)) ++ ++/* ++** cmpeq_h4_f16: ++** mov (z[0-9]+\.h), h4 ++** fcmeq p0\.h, p1/z, (z0\.h, \1|\1, z0\.h) ++** ret ++*/ ++TEST_COMPARE_ZD (cmpeq_h4_f16, svfloat16_t, float16_t, ++ p0 = svcmpeq_n_f16 (p1, z0, d4), ++ p0 = svcmpeq (p1, z0, d4)) ++ ++/* ++** cmpeq_0_f16: ++** fcmeq p0\.h, p1/z, z0\.h, #0\.0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_0_f16, svfloat16_t, ++ p0 = svcmpeq_n_f16 (p1, z0, 0), ++ p0 = svcmpeq (p1, z0, 0)) ++ ++/* ++** cmpeq_1_f16: ++** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? ++** fcmeq p0\.h, p1/z, (z0\.h, \1|\1, z0\.h) ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_1_f16, svfloat16_t, ++ p0 = svcmpeq_n_f16 (p1, z0, 1), ++ p0 = svcmpeq (p1, z0, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_f32.c +new file mode 100644 +index 000000000..05910bc50 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_f32.c +@@ -0,0 +1,50 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpeq_f32_tied: ++** fcmeq p0\.s, p0/z, (z0\.s, z1\.s|z1\.s, z0\.s) ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_f32_tied, svfloat32_t, ++ p0 = svcmpeq_f32 (p0, z0, z1), ++ p0 = svcmpeq (p0, z0, z1)) ++ ++/* ++** cmpeq_f32_untied: ++** fcmeq p0\.s, p1/z, (z0\.s, z1\.s|z1\.s, z0\.s) ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_f32_untied, svfloat32_t, ++ p0 = svcmpeq_f32 (p1, z0, z1), ++ p0 = svcmpeq (p1, z0, z1)) ++ ++/* ++** cmpeq_s4_f32: ++** mov (z[0-9]+\.s), s4 ++** fcmeq p0\.s, p1/z, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_COMPARE_ZD (cmpeq_s4_f32, svfloat32_t, float32_t, ++ p0 = svcmpeq_n_f32 (p1, z0, d4), ++ p0 = svcmpeq (p1, z0, d4)) ++ ++/* ++** cmpeq_0_f32: ++** fcmeq p0\.s, p1/z, z0\.s, #0\.0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_0_f32, svfloat32_t, ++ p0 = svcmpeq_n_f32 (p1, z0, 0), ++ p0 = svcmpeq (p1, z0, 0)) ++ ++/* ++** cmpeq_1_f32: ++** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? 
++** fcmeq p0\.s, p1/z, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_1_f32, svfloat32_t, ++ p0 = svcmpeq_n_f32 (p1, z0, 1), ++ p0 = svcmpeq (p1, z0, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_f64.c +new file mode 100644 +index 000000000..f94bdfe27 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_f64.c +@@ -0,0 +1,50 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpeq_f64_tied: ++** fcmeq p0\.d, p0/z, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_f64_tied, svfloat64_t, ++ p0 = svcmpeq_f64 (p0, z0, z1), ++ p0 = svcmpeq (p0, z0, z1)) ++ ++/* ++** cmpeq_f64_untied: ++** fcmeq p0\.d, p1/z, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_f64_untied, svfloat64_t, ++ p0 = svcmpeq_f64 (p1, z0, z1), ++ p0 = svcmpeq (p1, z0, z1)) ++ ++/* ++** cmpeq_d4_f64: ++** mov (z[0-9]+\.d), d4 ++** fcmeq p0\.d, p1/z, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_COMPARE_ZD (cmpeq_d4_f64, svfloat64_t, float64_t, ++ p0 = svcmpeq_n_f64 (p1, z0, d4), ++ p0 = svcmpeq (p1, z0, d4)) ++ ++/* ++** cmpeq_0_f64: ++** fcmeq p0\.d, p1/z, z0\.d, #0\.0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_0_f64, svfloat64_t, ++ p0 = svcmpeq_n_f64 (p1, z0, 0), ++ p0 = svcmpeq (p1, z0, 0)) ++ ++/* ++** cmpeq_1_f64: ++** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? ++** fcmeq p0\.d, p1/z, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_1_f64, svfloat64_t, ++ p0 = svcmpeq_n_f64 (p1, z0, 1), ++ p0 = svcmpeq (p1, z0, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_s16.c +new file mode 100644 +index 000000000..b0befcb77 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_s16.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpeq_s16_tied: ++** cmpeq p0\.h, p0/z, (z0\.h, z1\.h|z1\.h, z0\.h) ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_s16_tied, svint16_t, ++ p0 = svcmpeq_s16 (p0, z0, z1), ++ p0 = svcmpeq (p0, z0, z1)) ++ ++/* ++** cmpeq_s16_untied: ++** cmpeq p0\.h, p1/z, (z0\.h, z1\.h|z1\.h, z0\.h) ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_s16_untied, svint16_t, ++ p0 = svcmpeq_s16 (p1, z0, z1), ++ p0 = svcmpeq (p1, z0, z1)) ++ ++/* ++** cmpeq_w0_s16: ++** mov (z[0-9]+\.h), w0 ++** cmpeq p0\.h, p1/z, (z0\.h, \1|\1, z0\.h) ++** ret ++*/ ++TEST_COMPARE_ZX (cmpeq_w0_s16, svint16_t, int16_t, ++ p0 = svcmpeq_n_s16 (p1, z0, x0), ++ p0 = svcmpeq (p1, z0, x0)) ++ ++/* ++** cmpeq_0_s16: ++** cmpeq p0\.h, p1/z, z0\.h, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_0_s16, svint16_t, ++ p0 = svcmpeq_n_s16 (p1, z0, 0), ++ p0 = svcmpeq (p1, z0, 0)) ++ ++/* ++** cmpeq_1_s16: ++** cmpeq p0\.h, p1/z, z0\.h, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_1_s16, svint16_t, ++ p0 = svcmpeq_n_s16 (p1, z0, 1), ++ p0 = svcmpeq (p1, z0, 1)) ++ ++/* ++** cmpeq_15_s16: ++** cmpeq p0\.h, p1/z, z0\.h, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_15_s16, svint16_t, ++ p0 = svcmpeq_n_s16 (p1, z0, 15), ++ p0 = svcmpeq (p1, z0, 15)) ++ ++/* ++** cmpeq_16_s16: ++** mov (z[0-9]+\.h), #16 ++** cmpeq p0\.h, p1/z, (z0\.h, \1|\1, z0\.h) ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_16_s16, svint16_t, ++ p0 = svcmpeq_n_s16 (p1, z0, 16), ++ p0 = svcmpeq (p1, z0, 16)) ++ ++/* ++** cmpeq_m1_s16: ++** cmpeq p0\.h, p1/z, z0\.h, #-1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_m1_s16, svint16_t, ++ p0 = svcmpeq_n_s16 
(p1, z0, -1), ++ p0 = svcmpeq (p1, z0, -1)) ++ ++/* ++** cmpeq_m16_s16: ++** cmpeq p0\.h, p1/z, z0\.h, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_m16_s16, svint16_t, ++ p0 = svcmpeq_n_s16 (p1, z0, -16), ++ p0 = svcmpeq (p1, z0, -16)) ++ ++/* ++** cmpeq_m17_s16: ++** mov (z[0-9]+\.h), #-17 ++** cmpeq p0\.h, p1/z, (z0\.h, \1|\1, z0\.h) ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_m17_s16, svint16_t, ++ p0 = svcmpeq_n_s16 (p1, z0, -17), ++ p0 = svcmpeq (p1, z0, -17)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_s32.c +new file mode 100644 +index 000000000..de48a2c38 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_s32.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpeq_s32_tied: ++** cmpeq p0\.s, p0/z, (z0\.s, z1\.s|z1\.s, z0\.s) ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_s32_tied, svint32_t, ++ p0 = svcmpeq_s32 (p0, z0, z1), ++ p0 = svcmpeq (p0, z0, z1)) ++ ++/* ++** cmpeq_s32_untied: ++** cmpeq p0\.s, p1/z, (z0\.s, z1\.s|z1\.s, z0\.s) ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_s32_untied, svint32_t, ++ p0 = svcmpeq_s32 (p1, z0, z1), ++ p0 = svcmpeq (p1, z0, z1)) ++ ++/* ++** cmpeq_w0_s32: ++** mov (z[0-9]+\.s), w0 ++** cmpeq p0\.s, p1/z, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_COMPARE_ZX (cmpeq_w0_s32, svint32_t, int32_t, ++ p0 = svcmpeq_n_s32 (p1, z0, x0), ++ p0 = svcmpeq (p1, z0, x0)) ++ ++/* ++** cmpeq_0_s32: ++** cmpeq p0\.s, p1/z, z0\.s, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_0_s32, svint32_t, ++ p0 = svcmpeq_n_s32 (p1, z0, 0), ++ p0 = svcmpeq (p1, z0, 0)) ++ ++/* ++** cmpeq_1_s32: ++** cmpeq p0\.s, p1/z, z0\.s, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_1_s32, svint32_t, ++ p0 = svcmpeq_n_s32 (p1, z0, 1), ++ p0 = svcmpeq (p1, z0, 1)) ++ ++/* ++** cmpeq_15_s32: ++** cmpeq p0\.s, p1/z, z0\.s, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_15_s32, svint32_t, ++ p0 = svcmpeq_n_s32 (p1, z0, 15), ++ p0 = svcmpeq (p1, z0, 15)) ++ ++/* ++** cmpeq_16_s32: ++** mov (z[0-9]+\.s), #16 ++** cmpeq p0\.s, p1/z, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_16_s32, svint32_t, ++ p0 = svcmpeq_n_s32 (p1, z0, 16), ++ p0 = svcmpeq (p1, z0, 16)) ++ ++/* ++** cmpeq_m1_s32: ++** cmpeq p0\.s, p1/z, z0\.s, #-1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_m1_s32, svint32_t, ++ p0 = svcmpeq_n_s32 (p1, z0, -1), ++ p0 = svcmpeq (p1, z0, -1)) ++ ++/* ++** cmpeq_m16_s32: ++** cmpeq p0\.s, p1/z, z0\.s, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_m16_s32, svint32_t, ++ p0 = svcmpeq_n_s32 (p1, z0, -16), ++ p0 = svcmpeq (p1, z0, -16)) ++ ++/* ++** cmpeq_m17_s32: ++** mov (z[0-9]+\.s), #-17 ++** cmpeq p0\.s, p1/z, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_m17_s32, svint32_t, ++ p0 = svcmpeq_n_s32 (p1, z0, -17), ++ p0 = svcmpeq (p1, z0, -17)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_s64.c +new file mode 100644 +index 000000000..ff976712a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_s64.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpeq_s64_tied: ++** cmpeq p0\.d, p0/z, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_s64_tied, svint64_t, ++ p0 = svcmpeq_s64 (p0, z0, z1), ++ p0 = svcmpeq (p0, z0, z1)) ++ ++/* ++** cmpeq_s64_untied: ++** cmpeq p0\.d, p1/z, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ 
++TEST_COMPARE_Z (cmpeq_s64_untied, svint64_t, ++ p0 = svcmpeq_s64 (p1, z0, z1), ++ p0 = svcmpeq (p1, z0, z1)) ++ ++/* ++** cmpeq_x0_s64: ++** mov (z[0-9]+\.d), x0 ++** cmpeq p0\.d, p1/z, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_COMPARE_ZX (cmpeq_x0_s64, svint64_t, int64_t, ++ p0 = svcmpeq_n_s64 (p1, z0, x0), ++ p0 = svcmpeq (p1, z0, x0)) ++ ++/* ++** cmpeq_0_s64: ++** cmpeq p0\.d, p1/z, z0\.d, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_0_s64, svint64_t, ++ p0 = svcmpeq_n_s64 (p1, z0, 0), ++ p0 = svcmpeq (p1, z0, 0)) ++ ++/* ++** cmpeq_1_s64: ++** cmpeq p0\.d, p1/z, z0\.d, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_1_s64, svint64_t, ++ p0 = svcmpeq_n_s64 (p1, z0, 1), ++ p0 = svcmpeq (p1, z0, 1)) ++ ++/* ++** cmpeq_15_s64: ++** cmpeq p0\.d, p1/z, z0\.d, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_15_s64, svint64_t, ++ p0 = svcmpeq_n_s64 (p1, z0, 15), ++ p0 = svcmpeq (p1, z0, 15)) ++ ++/* ++** cmpeq_16_s64: ++** mov (z[0-9]+\.d), #16 ++** cmpeq p0\.d, p1/z, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_16_s64, svint64_t, ++ p0 = svcmpeq_n_s64 (p1, z0, 16), ++ p0 = svcmpeq (p1, z0, 16)) ++ ++/* ++** cmpeq_m1_s64: ++** cmpeq p0\.d, p1/z, z0\.d, #-1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_m1_s64, svint64_t, ++ p0 = svcmpeq_n_s64 (p1, z0, -1), ++ p0 = svcmpeq (p1, z0, -1)) ++ ++/* ++** cmpeq_m16_s64: ++** cmpeq p0\.d, p1/z, z0\.d, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_m16_s64, svint64_t, ++ p0 = svcmpeq_n_s64 (p1, z0, -16), ++ p0 = svcmpeq (p1, z0, -16)) ++ ++/* ++** cmpeq_m17_s64: ++** mov (z[0-9]+\.d), #-17 ++** cmpeq p0\.d, p1/z, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_m17_s64, svint64_t, ++ p0 = svcmpeq_n_s64 (p1, z0, -17), ++ p0 = svcmpeq (p1, z0, -17)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_s8.c +new file mode 100644 +index 000000000..1325755a8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_s8.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpeq_s8_tied: ++** cmpeq p0\.b, p0/z, (z0\.b, z1\.b|z1\.b, z0\.b) ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_s8_tied, svint8_t, ++ p0 = svcmpeq_s8 (p0, z0, z1), ++ p0 = svcmpeq (p0, z0, z1)) ++ ++/* ++** cmpeq_s8_untied: ++** cmpeq p0\.b, p1/z, (z0\.b, z1\.b|z1\.b, z0\.b) ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_s8_untied, svint8_t, ++ p0 = svcmpeq_s8 (p1, z0, z1), ++ p0 = svcmpeq (p1, z0, z1)) ++ ++/* ++** cmpeq_w0_s8: ++** mov (z[0-9]+\.b), w0 ++** cmpeq p0\.b, p1/z, (z0\.b, \1|\1, z0\.b) ++** ret ++*/ ++TEST_COMPARE_ZX (cmpeq_w0_s8, svint8_t, int8_t, ++ p0 = svcmpeq_n_s8 (p1, z0, x0), ++ p0 = svcmpeq (p1, z0, x0)) ++ ++/* ++** cmpeq_0_s8: ++** cmpeq p0\.b, p1/z, z0\.b, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_0_s8, svint8_t, ++ p0 = svcmpeq_n_s8 (p1, z0, 0), ++ p0 = svcmpeq (p1, z0, 0)) ++ ++/* ++** cmpeq_1_s8: ++** cmpeq p0\.b, p1/z, z0\.b, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_1_s8, svint8_t, ++ p0 = svcmpeq_n_s8 (p1, z0, 1), ++ p0 = svcmpeq (p1, z0, 1)) ++ ++/* ++** cmpeq_15_s8: ++** cmpeq p0\.b, p1/z, z0\.b, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_15_s8, svint8_t, ++ p0 = svcmpeq_n_s8 (p1, z0, 15), ++ p0 = svcmpeq (p1, z0, 15)) ++ ++/* ++** cmpeq_16_s8: ++** mov (z[0-9]+\.b), #16 ++** cmpeq p0\.b, p1/z, (z0\.b, \1|\1, z0\.b) ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_16_s8, svint8_t, ++ p0 = svcmpeq_n_s8 (p1, z0, 16), ++ p0 = svcmpeq (p1, z0, 16)) ++ ++/* ++** cmpeq_m1_s8: ++** cmpeq p0\.b, p1/z, z0\.b, #-1 ++** ret ++*/ 
++TEST_COMPARE_Z (cmpeq_m1_s8, svint8_t, ++ p0 = svcmpeq_n_s8 (p1, z0, -1), ++ p0 = svcmpeq (p1, z0, -1)) ++ ++/* ++** cmpeq_m16_s8: ++** cmpeq p0\.b, p1/z, z0\.b, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_m16_s8, svint8_t, ++ p0 = svcmpeq_n_s8 (p1, z0, -16), ++ p0 = svcmpeq (p1, z0, -16)) ++ ++/* ++** cmpeq_m17_s8: ++** mov (z[0-9]+\.b), #-17 ++** cmpeq p0\.b, p1/z, (z0\.b, \1|\1, z0\.b) ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_m17_s8, svint8_t, ++ p0 = svcmpeq_n_s8 (p1, z0, -17), ++ p0 = svcmpeq (p1, z0, -17)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_u16.c +new file mode 100644 +index 000000000..91004692c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_u16.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpeq_u16_tied: ++** cmpeq p0\.h, p0/z, (z0\.h, z1\.h|z1\.h, z0\.h) ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_u16_tied, svuint16_t, ++ p0 = svcmpeq_u16 (p0, z0, z1), ++ p0 = svcmpeq (p0, z0, z1)) ++ ++/* ++** cmpeq_u16_untied: ++** cmpeq p0\.h, p1/z, (z0\.h, z1\.h|z1\.h, z0\.h) ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_u16_untied, svuint16_t, ++ p0 = svcmpeq_u16 (p1, z0, z1), ++ p0 = svcmpeq (p1, z0, z1)) ++ ++/* ++** cmpeq_w0_u16: ++** mov (z[0-9]+\.h), w0 ++** cmpeq p0\.h, p1/z, (z0\.h, \1|\1, z0\.h) ++** ret ++*/ ++TEST_COMPARE_ZX (cmpeq_w0_u16, svuint16_t, uint16_t, ++ p0 = svcmpeq_n_u16 (p1, z0, x0), ++ p0 = svcmpeq (p1, z0, x0)) ++ ++/* ++** cmpeq_0_u16: ++** cmpeq p0\.h, p1/z, z0\.h, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_0_u16, svuint16_t, ++ p0 = svcmpeq_n_u16 (p1, z0, 0), ++ p0 = svcmpeq (p1, z0, 0)) ++ ++/* ++** cmpeq_1_u16: ++** cmpeq p0\.h, p1/z, z0\.h, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_1_u16, svuint16_t, ++ p0 = svcmpeq_n_u16 (p1, z0, 1), ++ p0 = svcmpeq (p1, z0, 1)) ++ ++/* ++** cmpeq_15_u16: ++** cmpeq p0\.h, p1/z, z0\.h, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_15_u16, svuint16_t, ++ p0 = svcmpeq_n_u16 (p1, z0, 15), ++ p0 = svcmpeq (p1, z0, 15)) ++ ++/* ++** cmpeq_16_u16: ++** mov (z[0-9]+\.h), #16 ++** cmpeq p0\.h, p1/z, (z0\.h, \1|\1, z0\.h) ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_16_u16, svuint16_t, ++ p0 = svcmpeq_n_u16 (p1, z0, 16), ++ p0 = svcmpeq (p1, z0, 16)) ++ ++/* ++** cmpeq_m1_u16: ++** cmpeq p0\.h, p1/z, z0\.h, #-1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_m1_u16, svuint16_t, ++ p0 = svcmpeq_n_u16 (p1, z0, -1), ++ p0 = svcmpeq (p1, z0, -1)) ++ ++/* ++** cmpeq_m16_u16: ++** cmpeq p0\.h, p1/z, z0\.h, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_m16_u16, svuint16_t, ++ p0 = svcmpeq_n_u16 (p1, z0, -16), ++ p0 = svcmpeq (p1, z0, -16)) ++ ++/* ++** cmpeq_m17_u16: ++** mov (z[0-9]+\.h), #-17 ++** cmpeq p0\.h, p1/z, (z0\.h, \1|\1, z0\.h) ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_m17_u16, svuint16_t, ++ p0 = svcmpeq_n_u16 (p1, z0, -17), ++ p0 = svcmpeq (p1, z0, -17)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_u32.c +new file mode 100644 +index 000000000..2cff56eb6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_u32.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpeq_u32_tied: ++** cmpeq p0\.s, p0/z, (z0\.s, z1\.s|z1\.s, z0\.s) ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_u32_tied, svuint32_t, ++ p0 = svcmpeq_u32 (p0, z0, z1), ++ p0 = svcmpeq (p0, z0, z1)) ++ ++/* ++** cmpeq_u32_untied: ++** cmpeq 
p0\.s, p1/z, (z0\.s, z1\.s|z1\.s, z0\.s) ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_u32_untied, svuint32_t, ++ p0 = svcmpeq_u32 (p1, z0, z1), ++ p0 = svcmpeq (p1, z0, z1)) ++ ++/* ++** cmpeq_w0_u32: ++** mov (z[0-9]+\.s), w0 ++** cmpeq p0\.s, p1/z, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_COMPARE_ZX (cmpeq_w0_u32, svuint32_t, uint32_t, ++ p0 = svcmpeq_n_u32 (p1, z0, x0), ++ p0 = svcmpeq (p1, z0, x0)) ++ ++/* ++** cmpeq_0_u32: ++** cmpeq p0\.s, p1/z, z0\.s, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_0_u32, svuint32_t, ++ p0 = svcmpeq_n_u32 (p1, z0, 0), ++ p0 = svcmpeq (p1, z0, 0)) ++ ++/* ++** cmpeq_1_u32: ++** cmpeq p0\.s, p1/z, z0\.s, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_1_u32, svuint32_t, ++ p0 = svcmpeq_n_u32 (p1, z0, 1), ++ p0 = svcmpeq (p1, z0, 1)) ++ ++/* ++** cmpeq_15_u32: ++** cmpeq p0\.s, p1/z, z0\.s, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_15_u32, svuint32_t, ++ p0 = svcmpeq_n_u32 (p1, z0, 15), ++ p0 = svcmpeq (p1, z0, 15)) ++ ++/* ++** cmpeq_16_u32: ++** mov (z[0-9]+\.s), #16 ++** cmpeq p0\.s, p1/z, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_16_u32, svuint32_t, ++ p0 = svcmpeq_n_u32 (p1, z0, 16), ++ p0 = svcmpeq (p1, z0, 16)) ++ ++/* ++** cmpeq_m1_u32: ++** cmpeq p0\.s, p1/z, z0\.s, #-1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_m1_u32, svuint32_t, ++ p0 = svcmpeq_n_u32 (p1, z0, -1), ++ p0 = svcmpeq (p1, z0, -1)) ++ ++/* ++** cmpeq_m16_u32: ++** cmpeq p0\.s, p1/z, z0\.s, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_m16_u32, svuint32_t, ++ p0 = svcmpeq_n_u32 (p1, z0, -16), ++ p0 = svcmpeq (p1, z0, -16)) ++ ++/* ++** cmpeq_m17_u32: ++** mov (z[0-9]+\.s), #-17 ++** cmpeq p0\.s, p1/z, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_m17_u32, svuint32_t, ++ p0 = svcmpeq_n_u32 (p1, z0, -17), ++ p0 = svcmpeq (p1, z0, -17)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_u64.c +new file mode 100644 +index 000000000..0f02c9988 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_u64.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpeq_u64_tied: ++** cmpeq p0\.d, p0/z, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_u64_tied, svuint64_t, ++ p0 = svcmpeq_u64 (p0, z0, z1), ++ p0 = svcmpeq (p0, z0, z1)) ++ ++/* ++** cmpeq_u64_untied: ++** cmpeq p0\.d, p1/z, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_u64_untied, svuint64_t, ++ p0 = svcmpeq_u64 (p1, z0, z1), ++ p0 = svcmpeq (p1, z0, z1)) ++ ++/* ++** cmpeq_x0_u64: ++** mov (z[0-9]+\.d), x0 ++** cmpeq p0\.d, p1/z, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_COMPARE_ZX (cmpeq_x0_u64, svuint64_t, uint64_t, ++ p0 = svcmpeq_n_u64 (p1, z0, x0), ++ p0 = svcmpeq (p1, z0, x0)) ++ ++/* ++** cmpeq_0_u64: ++** cmpeq p0\.d, p1/z, z0\.d, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_0_u64, svuint64_t, ++ p0 = svcmpeq_n_u64 (p1, z0, 0), ++ p0 = svcmpeq (p1, z0, 0)) ++ ++/* ++** cmpeq_1_u64: ++** cmpeq p0\.d, p1/z, z0\.d, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_1_u64, svuint64_t, ++ p0 = svcmpeq_n_u64 (p1, z0, 1), ++ p0 = svcmpeq (p1, z0, 1)) ++ ++/* ++** cmpeq_15_u64: ++** cmpeq p0\.d, p1/z, z0\.d, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_15_u64, svuint64_t, ++ p0 = svcmpeq_n_u64 (p1, z0, 15), ++ p0 = svcmpeq (p1, z0, 15)) ++ ++/* ++** cmpeq_16_u64: ++** mov (z[0-9]+\.d), #16 ++** cmpeq p0\.d, p1/z, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_16_u64, svuint64_t, ++ p0 = svcmpeq_n_u64 (p1, z0, 16), ++ 
p0 = svcmpeq (p1, z0, 16)) ++ ++/* ++** cmpeq_m1_u64: ++** cmpeq p0\.d, p1/z, z0\.d, #-1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_m1_u64, svuint64_t, ++ p0 = svcmpeq_n_u64 (p1, z0, -1), ++ p0 = svcmpeq (p1, z0, -1)) ++ ++/* ++** cmpeq_m16_u64: ++** cmpeq p0\.d, p1/z, z0\.d, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_m16_u64, svuint64_t, ++ p0 = svcmpeq_n_u64 (p1, z0, -16), ++ p0 = svcmpeq (p1, z0, -16)) ++ ++/* ++** cmpeq_m17_u64: ++** mov (z[0-9]+\.d), #-17 ++** cmpeq p0\.d, p1/z, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_m17_u64, svuint64_t, ++ p0 = svcmpeq_n_u64 (p1, z0, -17), ++ p0 = svcmpeq (p1, z0, -17)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_u8.c +new file mode 100644 +index 000000000..ccd9a61c6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_u8.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpeq_u8_tied: ++** cmpeq p0\.b, p0/z, (z0\.b, z1\.b|z1\.b, z0\.b) ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_u8_tied, svuint8_t, ++ p0 = svcmpeq_u8 (p0, z0, z1), ++ p0 = svcmpeq (p0, z0, z1)) ++ ++/* ++** cmpeq_u8_untied: ++** cmpeq p0\.b, p1/z, (z0\.b, z1\.b|z1\.b, z0\.b) ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_u8_untied, svuint8_t, ++ p0 = svcmpeq_u8 (p1, z0, z1), ++ p0 = svcmpeq (p1, z0, z1)) ++ ++/* ++** cmpeq_w0_u8: ++** mov (z[0-9]+\.b), w0 ++** cmpeq p0\.b, p1/z, (z0\.b, \1|\1, z0\.b) ++** ret ++*/ ++TEST_COMPARE_ZX (cmpeq_w0_u8, svuint8_t, uint8_t, ++ p0 = svcmpeq_n_u8 (p1, z0, x0), ++ p0 = svcmpeq (p1, z0, x0)) ++ ++/* ++** cmpeq_0_u8: ++** cmpeq p0\.b, p1/z, z0\.b, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_0_u8, svuint8_t, ++ p0 = svcmpeq_n_u8 (p1, z0, 0), ++ p0 = svcmpeq (p1, z0, 0)) ++ ++/* ++** cmpeq_1_u8: ++** cmpeq p0\.b, p1/z, z0\.b, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_1_u8, svuint8_t, ++ p0 = svcmpeq_n_u8 (p1, z0, 1), ++ p0 = svcmpeq (p1, z0, 1)) ++ ++/* ++** cmpeq_15_u8: ++** cmpeq p0\.b, p1/z, z0\.b, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_15_u8, svuint8_t, ++ p0 = svcmpeq_n_u8 (p1, z0, 15), ++ p0 = svcmpeq (p1, z0, 15)) ++ ++/* ++** cmpeq_16_u8: ++** mov (z[0-9]+\.b), #16 ++** cmpeq p0\.b, p1/z, (z0\.b, \1|\1, z0\.b) ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_16_u8, svuint8_t, ++ p0 = svcmpeq_n_u8 (p1, z0, 16), ++ p0 = svcmpeq (p1, z0, 16)) ++ ++/* ++** cmpeq_m1_u8: ++** cmpeq p0\.b, p1/z, z0\.b, #-1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_m1_u8, svuint8_t, ++ p0 = svcmpeq_n_u8 (p1, z0, -1), ++ p0 = svcmpeq (p1, z0, -1)) ++ ++/* ++** cmpeq_m16_u8: ++** cmpeq p0\.b, p1/z, z0\.b, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_m16_u8, svuint8_t, ++ p0 = svcmpeq_n_u8 (p1, z0, -16), ++ p0 = svcmpeq (p1, z0, -16)) ++ ++/* ++** cmpeq_m17_u8: ++** mov (z[0-9]+\.b), #-17 ++** cmpeq p0\.b, p1/z, (z0\.b, \1|\1, z0\.b) ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_m17_u8, svuint8_t, ++ p0 = svcmpeq_n_u8 (p1, z0, -17), ++ p0 = svcmpeq (p1, z0, -17)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_wide_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_wide_s16.c +new file mode 100644 +index 000000000..c9712b3b4 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_wide_s16.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpeq_wide_s16_tied: ++** cmpeq p0\.h, p0/z, z0\.h, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmpeq_wide_s16_tied, svint16_t, svint64_t, ++ p0 = 
svcmpeq_wide_s16 (p0, z0, z1), ++ p0 = svcmpeq_wide (p0, z0, z1)) ++ ++/* ++** cmpeq_wide_s16_untied: ++** cmpeq p0\.h, p1/z, z0\.h, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmpeq_wide_s16_untied, svint16_t, svint64_t, ++ p0 = svcmpeq_wide_s16 (p1, z0, z1), ++ p0 = svcmpeq_wide (p1, z0, z1)) ++ ++/* ++** cmpeq_wide_x0_s16: ++** mov (z[0-9]+\.d), x0 ++** cmpeq p0\.h, p1/z, z0\.h, \1 ++** ret ++*/ ++TEST_COMPARE_ZX (cmpeq_wide_x0_s16, svint16_t, int64_t, ++ p0 = svcmpeq_wide_n_s16 (p1, z0, x0), ++ p0 = svcmpeq_wide (p1, z0, x0)) ++ ++/* ++** cmpeq_wide_0_s16: ++** cmpeq p0\.h, p1/z, z0\.h, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_wide_0_s16, svint16_t, ++ p0 = svcmpeq_wide_n_s16 (p1, z0, 0), ++ p0 = svcmpeq_wide (p1, z0, 0)) ++ ++/* ++** cmpeq_wide_1_s16: ++** cmpeq p0\.h, p1/z, z0\.h, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_wide_1_s16, svint16_t, ++ p0 = svcmpeq_wide_n_s16 (p1, z0, 1), ++ p0 = svcmpeq_wide (p1, z0, 1)) ++ ++/* ++** cmpeq_wide_15_s16: ++** cmpeq p0\.h, p1/z, z0\.h, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_wide_15_s16, svint16_t, ++ p0 = svcmpeq_wide_n_s16 (p1, z0, 15), ++ p0 = svcmpeq_wide (p1, z0, 15)) ++ ++/* ++** cmpeq_wide_16_s16: ++** mov (z[0-9]+\.d), #16 ++** cmpeq p0\.h, p1/z, z0\.h, \1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_wide_16_s16, svint16_t, ++ p0 = svcmpeq_wide_n_s16 (p1, z0, 16), ++ p0 = svcmpeq_wide (p1, z0, 16)) ++ ++/* ++** cmpeq_wide_m1_s16: ++** cmpeq p0\.h, p1/z, z0\.h, #-1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_wide_m1_s16, svint16_t, ++ p0 = svcmpeq_wide_n_s16 (p1, z0, -1), ++ p0 = svcmpeq_wide (p1, z0, -1)) ++ ++/* ++** cmpeq_wide_m16_s16: ++** cmpeq p0\.h, p1/z, z0\.h, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_wide_m16_s16, svint16_t, ++ p0 = svcmpeq_wide_n_s16 (p1, z0, -16), ++ p0 = svcmpeq_wide (p1, z0, -16)) ++ ++/* ++** cmpeq_wide_m17_s16: ++** mov (z[0-9]+\.d), #-17 ++** cmpeq p0\.h, p1/z, z0\.h, \1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_wide_m17_s16, svint16_t, ++ p0 = svcmpeq_wide_n_s16 (p1, z0, -17), ++ p0 = svcmpeq_wide (p1, z0, -17)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_wide_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_wide_s32.c +new file mode 100644 +index 000000000..22bd99f57 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_wide_s32.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpeq_wide_s32_tied: ++** cmpeq p0\.s, p0/z, z0\.s, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmpeq_wide_s32_tied, svint32_t, svint64_t, ++ p0 = svcmpeq_wide_s32 (p0, z0, z1), ++ p0 = svcmpeq_wide (p0, z0, z1)) ++ ++/* ++** cmpeq_wide_s32_untied: ++** cmpeq p0\.s, p1/z, z0\.s, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmpeq_wide_s32_untied, svint32_t, svint64_t, ++ p0 = svcmpeq_wide_s32 (p1, z0, z1), ++ p0 = svcmpeq_wide (p1, z0, z1)) ++ ++/* ++** cmpeq_wide_x0_s32: ++** mov (z[0-9]+\.d), x0 ++** cmpeq p0\.s, p1/z, z0\.s, \1 ++** ret ++*/ ++TEST_COMPARE_ZX (cmpeq_wide_x0_s32, svint32_t, int64_t, ++ p0 = svcmpeq_wide_n_s32 (p1, z0, x0), ++ p0 = svcmpeq_wide (p1, z0, x0)) ++ ++/* ++** cmpeq_wide_0_s32: ++** cmpeq p0\.s, p1/z, z0\.s, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_wide_0_s32, svint32_t, ++ p0 = svcmpeq_wide_n_s32 (p1, z0, 0), ++ p0 = svcmpeq_wide (p1, z0, 0)) ++ ++/* ++** cmpeq_wide_1_s32: ++** cmpeq p0\.s, p1/z, z0\.s, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_wide_1_s32, svint32_t, ++ p0 = svcmpeq_wide_n_s32 (p1, z0, 1), ++ p0 = svcmpeq_wide (p1, z0, 1)) ++ ++/* ++** cmpeq_wide_15_s32: ++** cmpeq 
p0\.s, p1/z, z0\.s, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_wide_15_s32, svint32_t, ++ p0 = svcmpeq_wide_n_s32 (p1, z0, 15), ++ p0 = svcmpeq_wide (p1, z0, 15)) ++ ++/* ++** cmpeq_wide_16_s32: ++** mov (z[0-9]+\.d), #16 ++** cmpeq p0\.s, p1/z, z0\.s, \1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_wide_16_s32, svint32_t, ++ p0 = svcmpeq_wide_n_s32 (p1, z0, 16), ++ p0 = svcmpeq_wide (p1, z0, 16)) ++ ++/* ++** cmpeq_wide_m1_s32: ++** cmpeq p0\.s, p1/z, z0\.s, #-1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_wide_m1_s32, svint32_t, ++ p0 = svcmpeq_wide_n_s32 (p1, z0, -1), ++ p0 = svcmpeq_wide (p1, z0, -1)) ++ ++/* ++** cmpeq_wide_m16_s32: ++** cmpeq p0\.s, p1/z, z0\.s, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_wide_m16_s32, svint32_t, ++ p0 = svcmpeq_wide_n_s32 (p1, z0, -16), ++ p0 = svcmpeq_wide (p1, z0, -16)) ++ ++/* ++** cmpeq_wide_m17_s32: ++** mov (z[0-9]+\.d), #-17 ++** cmpeq p0\.s, p1/z, z0\.s, \1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_wide_m17_s32, svint32_t, ++ p0 = svcmpeq_wide_n_s32 (p1, z0, -17), ++ p0 = svcmpeq_wide (p1, z0, -17)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_wide_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_wide_s8.c +new file mode 100644 +index 000000000..a9e9a0bf5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_wide_s8.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpeq_wide_s8_tied: ++** cmpeq p0\.b, p0/z, z0\.b, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmpeq_wide_s8_tied, svint8_t, svint64_t, ++ p0 = svcmpeq_wide_s8 (p0, z0, z1), ++ p0 = svcmpeq_wide (p0, z0, z1)) ++ ++/* ++** cmpeq_wide_s8_untied: ++** cmpeq p0\.b, p1/z, z0\.b, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmpeq_wide_s8_untied, svint8_t, svint64_t, ++ p0 = svcmpeq_wide_s8 (p1, z0, z1), ++ p0 = svcmpeq_wide (p1, z0, z1)) ++ ++/* ++** cmpeq_wide_x0_s8: ++** mov (z[0-9]+\.d), x0 ++** cmpeq p0\.b, p1/z, z0\.b, \1 ++** ret ++*/ ++TEST_COMPARE_ZX (cmpeq_wide_x0_s8, svint8_t, int64_t, ++ p0 = svcmpeq_wide_n_s8 (p1, z0, x0), ++ p0 = svcmpeq_wide (p1, z0, x0)) ++ ++/* ++** cmpeq_wide_0_s8: ++** cmpeq p0\.b, p1/z, z0\.b, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_wide_0_s8, svint8_t, ++ p0 = svcmpeq_wide_n_s8 (p1, z0, 0), ++ p0 = svcmpeq_wide (p1, z0, 0)) ++ ++/* ++** cmpeq_wide_1_s8: ++** cmpeq p0\.b, p1/z, z0\.b, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_wide_1_s8, svint8_t, ++ p0 = svcmpeq_wide_n_s8 (p1, z0, 1), ++ p0 = svcmpeq_wide (p1, z0, 1)) ++ ++/* ++** cmpeq_wide_15_s8: ++** cmpeq p0\.b, p1/z, z0\.b, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_wide_15_s8, svint8_t, ++ p0 = svcmpeq_wide_n_s8 (p1, z0, 15), ++ p0 = svcmpeq_wide (p1, z0, 15)) ++ ++/* ++** cmpeq_wide_16_s8: ++** mov (z[0-9]+\.d), #16 ++** cmpeq p0\.b, p1/z, z0\.b, \1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_wide_16_s8, svint8_t, ++ p0 = svcmpeq_wide_n_s8 (p1, z0, 16), ++ p0 = svcmpeq_wide (p1, z0, 16)) ++ ++/* ++** cmpeq_wide_m1_s8: ++** cmpeq p0\.b, p1/z, z0\.b, #-1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_wide_m1_s8, svint8_t, ++ p0 = svcmpeq_wide_n_s8 (p1, z0, -1), ++ p0 = svcmpeq_wide (p1, z0, -1)) ++ ++/* ++** cmpeq_wide_m16_s8: ++** cmpeq p0\.b, p1/z, z0\.b, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_wide_m16_s8, svint8_t, ++ p0 = svcmpeq_wide_n_s8 (p1, z0, -16), ++ p0 = svcmpeq_wide (p1, z0, -16)) ++ ++/* ++** cmpeq_wide_m17_s8: ++** mov (z[0-9]+\.d), #-17 ++** cmpeq p0\.b, p1/z, z0\.b, \1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpeq_wide_m17_s8, svint8_t, ++ p0 = svcmpeq_wide_n_s8 (p1, z0, -17), ++ p0 = 
svcmpeq_wide (p1, z0, -17)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_f16.c +new file mode 100644 +index 000000000..a6db8c16a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_f16.c +@@ -0,0 +1,66 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpge_f16_tied: ++** ( ++** fcmge p0\.h, p0/z, z0\.h, z1\.h ++** | ++** fcmle p0\.h, p0/z, z1\.h, z0\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_f16_tied, svfloat16_t, ++ p0 = svcmpge_f16 (p0, z0, z1), ++ p0 = svcmpge (p0, z0, z1)) ++ ++/* ++** cmpge_f16_untied: ++** ( ++** fcmge p0\.h, p1/z, z0\.h, z1\.h ++** | ++** fcmle p0\.h, p1/z, z1\.h, z0\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_f16_untied, svfloat16_t, ++ p0 = svcmpge_f16 (p1, z0, z1), ++ p0 = svcmpge (p1, z0, z1)) ++ ++/* ++** cmpge_h4_f16: ++** mov (z[0-9]+\.h), h4 ++** ( ++** fcmge p0\.h, p1/z, z0\.h, \1 ++** | ++** fcmle p0\.h, p1/z, \1, z0\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_ZD (cmpge_h4_f16, svfloat16_t, float16_t, ++ p0 = svcmpge_n_f16 (p1, z0, d4), ++ p0 = svcmpge (p1, z0, d4)) ++ ++/* ++** cmpge_0_f16: ++** fcmge p0\.h, p1/z, z0\.h, #0\.0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_0_f16, svfloat16_t, ++ p0 = svcmpge_n_f16 (p1, z0, 0), ++ p0 = svcmpge (p1, z0, 0)) ++ ++/* ++** cmpge_1_f16: ++** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? ++** ( ++** fcmge p0\.h, p1/z, z0\.h, \1 ++** | ++** fcmle p0\.h, p1/z, \1, z0\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_1_f16, svfloat16_t, ++ p0 = svcmpge_n_f16 (p1, z0, 1), ++ p0 = svcmpge (p1, z0, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_f32.c +new file mode 100644 +index 000000000..ee2976e58 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_f32.c +@@ -0,0 +1,66 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpge_f32_tied: ++** ( ++** fcmge p0\.s, p0/z, z0\.s, z1\.s ++** | ++** fcmle p0\.s, p0/z, z1\.s, z0\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_f32_tied, svfloat32_t, ++ p0 = svcmpge_f32 (p0, z0, z1), ++ p0 = svcmpge (p0, z0, z1)) ++ ++/* ++** cmpge_f32_untied: ++** ( ++** fcmge p0\.s, p1/z, z0\.s, z1\.s ++** | ++** fcmle p0\.s, p1/z, z1\.s, z0\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_f32_untied, svfloat32_t, ++ p0 = svcmpge_f32 (p1, z0, z1), ++ p0 = svcmpge (p1, z0, z1)) ++ ++/* ++** cmpge_s4_f32: ++** mov (z[0-9]+\.s), s4 ++** ( ++** fcmge p0\.s, p1/z, z0\.s, \1 ++** | ++** fcmle p0\.s, p1/z, \1, z0\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_ZD (cmpge_s4_f32, svfloat32_t, float32_t, ++ p0 = svcmpge_n_f32 (p1, z0, d4), ++ p0 = svcmpge (p1, z0, d4)) ++ ++/* ++** cmpge_0_f32: ++** fcmge p0\.s, p1/z, z0\.s, #0\.0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_0_f32, svfloat32_t, ++ p0 = svcmpge_n_f32 (p1, z0, 0), ++ p0 = svcmpge (p1, z0, 0)) ++ ++/* ++** cmpge_1_f32: ++** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? 
++** ( ++** fcmge p0\.s, p1/z, z0\.s, \1 ++** | ++** fcmle p0\.s, p1/z, \1, z0\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_1_f32, svfloat32_t, ++ p0 = svcmpge_n_f32 (p1, z0, 1), ++ p0 = svcmpge (p1, z0, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_f64.c +new file mode 100644 +index 000000000..ceea0afe3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_f64.c +@@ -0,0 +1,66 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpge_f64_tied: ++** ( ++** fcmge p0\.d, p0/z, z0\.d, z1\.d ++** | ++** fcmle p0\.d, p0/z, z1\.d, z0\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_f64_tied, svfloat64_t, ++ p0 = svcmpge_f64 (p0, z0, z1), ++ p0 = svcmpge (p0, z0, z1)) ++ ++/* ++** cmpge_f64_untied: ++** ( ++** fcmge p0\.d, p1/z, z0\.d, z1\.d ++** | ++** fcmle p0\.d, p1/z, z1\.d, z0\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_f64_untied, svfloat64_t, ++ p0 = svcmpge_f64 (p1, z0, z1), ++ p0 = svcmpge (p1, z0, z1)) ++ ++/* ++** cmpge_d4_f64: ++** mov (z[0-9]+\.d), d4 ++** ( ++** fcmge p0\.d, p1/z, z0\.d, \1 ++** | ++** fcmle p0\.d, p1/z, \1, z0\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_ZD (cmpge_d4_f64, svfloat64_t, float64_t, ++ p0 = svcmpge_n_f64 (p1, z0, d4), ++ p0 = svcmpge (p1, z0, d4)) ++ ++/* ++** cmpge_0_f64: ++** fcmge p0\.d, p1/z, z0\.d, #0\.0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_0_f64, svfloat64_t, ++ p0 = svcmpge_n_f64 (p1, z0, 0), ++ p0 = svcmpge (p1, z0, 0)) ++ ++/* ++** cmpge_1_f64: ++** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? ++** ( ++** fcmge p0\.d, p1/z, z0\.d, \1 ++** | ++** fcmle p0\.d, p1/z, \1, z0\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_1_f64, svfloat64_t, ++ p0 = svcmpge_n_f64 (p1, z0, 1), ++ p0 = svcmpge (p1, z0, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_s16.c +new file mode 100644 +index 000000000..de9180b84 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_s16.c +@@ -0,0 +1,116 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpge_s16_tied: ++** ( ++** cmpge p0\.h, p0/z, z0\.h, z1\.h ++** | ++** cmple p0\.h, p0/z, z1\.h, z0\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_s16_tied, svint16_t, ++ p0 = svcmpge_s16 (p0, z0, z1), ++ p0 = svcmpge (p0, z0, z1)) ++ ++/* ++** cmpge_s16_untied: ++** ( ++** cmpge p0\.h, p1/z, z0\.h, z1\.h ++** | ++** cmple p0\.h, p1/z, z1\.h, z0\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_s16_untied, svint16_t, ++ p0 = svcmpge_s16 (p1, z0, z1), ++ p0 = svcmpge (p1, z0, z1)) ++ ++/* ++** cmpge_w0_s16: ++** mov (z[0-9]+\.h), w0 ++** ( ++** cmpge p0\.h, p1/z, z0\.h, \1 ++** | ++** cmple p0\.h, p1/z, \1, z0\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_ZX (cmpge_w0_s16, svint16_t, int16_t, ++ p0 = svcmpge_n_s16 (p1, z0, x0), ++ p0 = svcmpge (p1, z0, x0)) ++ ++/* ++** cmpge_0_s16: ++** cmpge p0\.h, p1/z, z0\.h, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_0_s16, svint16_t, ++ p0 = svcmpge_n_s16 (p1, z0, 0), ++ p0 = svcmpge (p1, z0, 0)) ++ ++/* ++** cmpge_1_s16: ++** cmpge p0\.h, p1/z, z0\.h, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_1_s16, svint16_t, ++ p0 = svcmpge_n_s16 (p1, z0, 1), ++ p0 = svcmpge (p1, z0, 1)) ++ ++/* ++** cmpge_15_s16: ++** cmpge p0\.h, p1/z, z0\.h, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_15_s16, svint16_t, ++ p0 = svcmpge_n_s16 (p1, z0, 15), ++ p0 = svcmpge (p1, z0, 15)) ++ ++/* ++** 
cmpge_16_s16: ++** mov (z[0-9]+\.h), #16 ++** ( ++** cmpge p0\.h, p1/z, z0\.h, \1 ++** | ++** cmple p0\.h, p1/z, \1, z0\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_16_s16, svint16_t, ++ p0 = svcmpge_n_s16 (p1, z0, 16), ++ p0 = svcmpge (p1, z0, 16)) ++ ++/* ++** cmpge_m1_s16: ++** cmpge p0\.h, p1/z, z0\.h, #-1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_m1_s16, svint16_t, ++ p0 = svcmpge_n_s16 (p1, z0, -1), ++ p0 = svcmpge (p1, z0, -1)) ++ ++/* ++** cmpge_m16_s16: ++** cmpge p0\.h, p1/z, z0\.h, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_m16_s16, svint16_t, ++ p0 = svcmpge_n_s16 (p1, z0, -16), ++ p0 = svcmpge (p1, z0, -16)) ++ ++/* ++** cmpge_m17_s16: ++** mov (z[0-9]+\.h), #-17 ++** ( ++** cmpge p0\.h, p1/z, z0\.h, \1 ++** | ++** cmple p0\.h, p1/z, \1, z0\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_m17_s16, svint16_t, ++ p0 = svcmpge_n_s16 (p1, z0, -17), ++ p0 = svcmpge (p1, z0, -17)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_s32.c +new file mode 100644 +index 000000000..67286b1fe +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_s32.c +@@ -0,0 +1,116 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpge_s32_tied: ++** ( ++** cmpge p0\.s, p0/z, z0\.s, z1\.s ++** | ++** cmple p0\.s, p0/z, z1\.s, z0\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_s32_tied, svint32_t, ++ p0 = svcmpge_s32 (p0, z0, z1), ++ p0 = svcmpge (p0, z0, z1)) ++ ++/* ++** cmpge_s32_untied: ++** ( ++** cmpge p0\.s, p1/z, z0\.s, z1\.s ++** | ++** cmple p0\.s, p1/z, z1\.s, z0\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_s32_untied, svint32_t, ++ p0 = svcmpge_s32 (p1, z0, z1), ++ p0 = svcmpge (p1, z0, z1)) ++ ++/* ++** cmpge_w0_s32: ++** mov (z[0-9]+\.s), w0 ++** ( ++** cmpge p0\.s, p1/z, z0\.s, \1 ++** | ++** cmple p0\.s, p1/z, \1, z0\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_ZX (cmpge_w0_s32, svint32_t, int32_t, ++ p0 = svcmpge_n_s32 (p1, z0, x0), ++ p0 = svcmpge (p1, z0, x0)) ++ ++/* ++** cmpge_0_s32: ++** cmpge p0\.s, p1/z, z0\.s, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_0_s32, svint32_t, ++ p0 = svcmpge_n_s32 (p1, z0, 0), ++ p0 = svcmpge (p1, z0, 0)) ++ ++/* ++** cmpge_1_s32: ++** cmpge p0\.s, p1/z, z0\.s, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_1_s32, svint32_t, ++ p0 = svcmpge_n_s32 (p1, z0, 1), ++ p0 = svcmpge (p1, z0, 1)) ++ ++/* ++** cmpge_15_s32: ++** cmpge p0\.s, p1/z, z0\.s, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_15_s32, svint32_t, ++ p0 = svcmpge_n_s32 (p1, z0, 15), ++ p0 = svcmpge (p1, z0, 15)) ++ ++/* ++** cmpge_16_s32: ++** mov (z[0-9]+\.s), #16 ++** ( ++** cmpge p0\.s, p1/z, z0\.s, \1 ++** | ++** cmple p0\.s, p1/z, \1, z0\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_16_s32, svint32_t, ++ p0 = svcmpge_n_s32 (p1, z0, 16), ++ p0 = svcmpge (p1, z0, 16)) ++ ++/* ++** cmpge_m1_s32: ++** cmpge p0\.s, p1/z, z0\.s, #-1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_m1_s32, svint32_t, ++ p0 = svcmpge_n_s32 (p1, z0, -1), ++ p0 = svcmpge (p1, z0, -1)) ++ ++/* ++** cmpge_m16_s32: ++** cmpge p0\.s, p1/z, z0\.s, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_m16_s32, svint32_t, ++ p0 = svcmpge_n_s32 (p1, z0, -16), ++ p0 = svcmpge (p1, z0, -16)) ++ ++/* ++** cmpge_m17_s32: ++** mov (z[0-9]+\.s), #-17 ++** ( ++** cmpge p0\.s, p1/z, z0\.s, \1 ++** | ++** cmple p0\.s, p1/z, \1, z0\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_m17_s32, svint32_t, ++ p0 = svcmpge_n_s32 (p1, z0, -17), ++ p0 = svcmpge (p1, z0, -17)) +diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_s64.c +new file mode 100644 +index 000000000..02e3ac07a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_s64.c +@@ -0,0 +1,116 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpge_s64_tied: ++** ( ++** cmpge p0\.d, p0/z, z0\.d, z1\.d ++** | ++** cmple p0\.d, p0/z, z1\.d, z0\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_s64_tied, svint64_t, ++ p0 = svcmpge_s64 (p0, z0, z1), ++ p0 = svcmpge (p0, z0, z1)) ++ ++/* ++** cmpge_s64_untied: ++** ( ++** cmpge p0\.d, p1/z, z0\.d, z1\.d ++** | ++** cmple p0\.d, p1/z, z1\.d, z0\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_s64_untied, svint64_t, ++ p0 = svcmpge_s64 (p1, z0, z1), ++ p0 = svcmpge (p1, z0, z1)) ++ ++/* ++** cmpge_x0_s64: ++** mov (z[0-9]+\.d), x0 ++** ( ++** cmpge p0\.d, p1/z, z0\.d, \1 ++** | ++** cmple p0\.d, p1/z, \1, z0\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_ZX (cmpge_x0_s64, svint64_t, int64_t, ++ p0 = svcmpge_n_s64 (p1, z0, x0), ++ p0 = svcmpge (p1, z0, x0)) ++ ++/* ++** cmpge_0_s64: ++** cmpge p0\.d, p1/z, z0\.d, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_0_s64, svint64_t, ++ p0 = svcmpge_n_s64 (p1, z0, 0), ++ p0 = svcmpge (p1, z0, 0)) ++ ++/* ++** cmpge_1_s64: ++** cmpge p0\.d, p1/z, z0\.d, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_1_s64, svint64_t, ++ p0 = svcmpge_n_s64 (p1, z0, 1), ++ p0 = svcmpge (p1, z0, 1)) ++ ++/* ++** cmpge_15_s64: ++** cmpge p0\.d, p1/z, z0\.d, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_15_s64, svint64_t, ++ p0 = svcmpge_n_s64 (p1, z0, 15), ++ p0 = svcmpge (p1, z0, 15)) ++ ++/* ++** cmpge_16_s64: ++** mov (z[0-9]+\.d), #16 ++** ( ++** cmpge p0\.d, p1/z, z0\.d, \1 ++** | ++** cmple p0\.d, p1/z, \1, z0\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_16_s64, svint64_t, ++ p0 = svcmpge_n_s64 (p1, z0, 16), ++ p0 = svcmpge (p1, z0, 16)) ++ ++/* ++** cmpge_m1_s64: ++** cmpge p0\.d, p1/z, z0\.d, #-1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_m1_s64, svint64_t, ++ p0 = svcmpge_n_s64 (p1, z0, -1), ++ p0 = svcmpge (p1, z0, -1)) ++ ++/* ++** cmpge_m16_s64: ++** cmpge p0\.d, p1/z, z0\.d, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_m16_s64, svint64_t, ++ p0 = svcmpge_n_s64 (p1, z0, -16), ++ p0 = svcmpge (p1, z0, -16)) ++ ++/* ++** cmpge_m17_s64: ++** mov (z[0-9]+\.d), #-17 ++** ( ++** cmpge p0\.d, p1/z, z0\.d, \1 ++** | ++** cmple p0\.d, p1/z, \1, z0\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_m17_s64, svint64_t, ++ p0 = svcmpge_n_s64 (p1, z0, -17), ++ p0 = svcmpge (p1, z0, -17)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_s8.c +new file mode 100644 +index 000000000..45c9c5f10 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_s8.c +@@ -0,0 +1,116 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpge_s8_tied: ++** ( ++** cmpge p0\.b, p0/z, z0\.b, z1\.b ++** | ++** cmple p0\.b, p0/z, z1\.b, z0\.b ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_s8_tied, svint8_t, ++ p0 = svcmpge_s8 (p0, z0, z1), ++ p0 = svcmpge (p0, z0, z1)) ++ ++/* ++** cmpge_s8_untied: ++** ( ++** cmpge p0\.b, p1/z, z0\.b, z1\.b ++** | ++** cmple p0\.b, p1/z, z1\.b, z0\.b ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_s8_untied, svint8_t, ++ p0 = svcmpge_s8 (p1, z0, z1), ++ p0 = svcmpge (p1, z0, z1)) ++ ++/* ++** cmpge_w0_s8: ++** mov (z[0-9]+\.b), w0 ++** ( ++** cmpge p0\.b, p1/z, 
z0\.b, \1 ++** | ++** cmple p0\.b, p1/z, \1, z0\.b ++** ) ++** ret ++*/ ++TEST_COMPARE_ZX (cmpge_w0_s8, svint8_t, int8_t, ++ p0 = svcmpge_n_s8 (p1, z0, x0), ++ p0 = svcmpge (p1, z0, x0)) ++ ++/* ++** cmpge_0_s8: ++** cmpge p0\.b, p1/z, z0\.b, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_0_s8, svint8_t, ++ p0 = svcmpge_n_s8 (p1, z0, 0), ++ p0 = svcmpge (p1, z0, 0)) ++ ++/* ++** cmpge_1_s8: ++** cmpge p0\.b, p1/z, z0\.b, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_1_s8, svint8_t, ++ p0 = svcmpge_n_s8 (p1, z0, 1), ++ p0 = svcmpge (p1, z0, 1)) ++ ++/* ++** cmpge_15_s8: ++** cmpge p0\.b, p1/z, z0\.b, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_15_s8, svint8_t, ++ p0 = svcmpge_n_s8 (p1, z0, 15), ++ p0 = svcmpge (p1, z0, 15)) ++ ++/* ++** cmpge_16_s8: ++** mov (z[0-9]+\.b), #16 ++** ( ++** cmpge p0\.b, p1/z, z0\.b, \1 ++** | ++** cmple p0\.b, p1/z, \1, z0\.b ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_16_s8, svint8_t, ++ p0 = svcmpge_n_s8 (p1, z0, 16), ++ p0 = svcmpge (p1, z0, 16)) ++ ++/* ++** cmpge_m1_s8: ++** cmpge p0\.b, p1/z, z0\.b, #-1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_m1_s8, svint8_t, ++ p0 = svcmpge_n_s8 (p1, z0, -1), ++ p0 = svcmpge (p1, z0, -1)) ++ ++/* ++** cmpge_m16_s8: ++** cmpge p0\.b, p1/z, z0\.b, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_m16_s8, svint8_t, ++ p0 = svcmpge_n_s8 (p1, z0, -16), ++ p0 = svcmpge (p1, z0, -16)) ++ ++/* ++** cmpge_m17_s8: ++** mov (z[0-9]+\.b), #-17 ++** ( ++** cmpge p0\.b, p1/z, z0\.b, \1 ++** | ++** cmple p0\.b, p1/z, \1, z0\.b ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_m17_s8, svint8_t, ++ p0 = svcmpge_n_s8 (p1, z0, -17), ++ p0 = svcmpge (p1, z0, -17)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_u16.c +new file mode 100644 +index 000000000..7c7d2b307 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_u16.c +@@ -0,0 +1,116 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpge_u16_tied: ++** ( ++** cmphs p0\.h, p0/z, z0\.h, z1\.h ++** | ++** cmpls p0\.h, p0/z, z1\.h, z0\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_u16_tied, svuint16_t, ++ p0 = svcmpge_u16 (p0, z0, z1), ++ p0 = svcmpge (p0, z0, z1)) ++ ++/* ++** cmpge_u16_untied: ++** ( ++** cmphs p0\.h, p1/z, z0\.h, z1\.h ++** | ++** cmpls p0\.h, p1/z, z1\.h, z0\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_u16_untied, svuint16_t, ++ p0 = svcmpge_u16 (p1, z0, z1), ++ p0 = svcmpge (p1, z0, z1)) ++ ++/* ++** cmpge_w0_u16: ++** mov (z[0-9]+\.h), w0 ++** ( ++** cmphs p0\.h, p1/z, z0\.h, \1 ++** | ++** cmpls p0\.h, p1/z, \1, z0\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_ZX (cmpge_w0_u16, svuint16_t, uint16_t, ++ p0 = svcmpge_n_u16 (p1, z0, x0), ++ p0 = svcmpge (p1, z0, x0)) ++ ++/* ++** cmpge_0_u16: ++** cmphs p0\.h, p1/z, z0\.h, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_0_u16, svuint16_t, ++ p0 = svcmpge_n_u16 (p1, z0, 0), ++ p0 = svcmpge (p1, z0, 0)) ++ ++/* ++** cmpge_1_u16: ++** cmphs p0\.h, p1/z, z0\.h, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_1_u16, svuint16_t, ++ p0 = svcmpge_n_u16 (p1, z0, 1), ++ p0 = svcmpge (p1, z0, 1)) ++ ++/* ++** cmpge_15_u16: ++** cmphs p0\.h, p1/z, z0\.h, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_15_u16, svuint16_t, ++ p0 = svcmpge_n_u16 (p1, z0, 15), ++ p0 = svcmpge (p1, z0, 15)) ++ ++/* ++** cmpge_16_u16: ++** cmphs p0\.h, p1/z, z0\.h, #16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_16_u16, svuint16_t, ++ p0 = svcmpge_n_u16 (p1, z0, 16), ++ p0 = svcmpge (p1, z0, 16)) ++ ++/* ++** cmpge_127_u16: ++** cmphs p0\.h, 
p1/z, z0\.h, #127 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_127_u16, svuint16_t, ++ p0 = svcmpge_n_u16 (p1, z0, 127), ++ p0 = svcmpge (p1, z0, 127)) ++ ++/* ++** cmpge_128_u16: ++** mov (z[0-9]+\.h), #128 ++** ( ++** cmphs p0\.h, p1/z, z0\.h, \1 ++** | ++** cmpls p0\.h, p1/z, \1, z0\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_128_u16, svuint16_t, ++ p0 = svcmpge_n_u16 (p1, z0, 128), ++ p0 = svcmpge (p1, z0, 128)) ++ ++/* ++** cmpge_m1_u16: ++** mov (z[0-9]+)\.b, #-1 ++** ( ++** cmphs p0\.h, p1/z, z0\.h, \1\.h ++** | ++** cmpls p0\.h, p1/z, \1\.h, z0\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_m1_u16, svuint16_t, ++ p0 = svcmpge_n_u16 (p1, z0, -1), ++ p0 = svcmpge (p1, z0, -1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_u32.c +new file mode 100644 +index 000000000..a2021ef50 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_u32.c +@@ -0,0 +1,116 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpge_u32_tied: ++** ( ++** cmphs p0\.s, p0/z, z0\.s, z1\.s ++** | ++** cmpls p0\.s, p0/z, z1\.s, z0\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_u32_tied, svuint32_t, ++ p0 = svcmpge_u32 (p0, z0, z1), ++ p0 = svcmpge (p0, z0, z1)) ++ ++/* ++** cmpge_u32_untied: ++** ( ++** cmphs p0\.s, p1/z, z0\.s, z1\.s ++** | ++** cmpls p0\.s, p1/z, z1\.s, z0\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_u32_untied, svuint32_t, ++ p0 = svcmpge_u32 (p1, z0, z1), ++ p0 = svcmpge (p1, z0, z1)) ++ ++/* ++** cmpge_w0_u32: ++** mov (z[0-9]+\.s), w0 ++** ( ++** cmphs p0\.s, p1/z, z0\.s, \1 ++** | ++** cmpls p0\.s, p1/z, \1, z0\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_ZX (cmpge_w0_u32, svuint32_t, uint32_t, ++ p0 = svcmpge_n_u32 (p1, z0, x0), ++ p0 = svcmpge (p1, z0, x0)) ++ ++/* ++** cmpge_0_u32: ++** cmphs p0\.s, p1/z, z0\.s, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_0_u32, svuint32_t, ++ p0 = svcmpge_n_u32 (p1, z0, 0), ++ p0 = svcmpge (p1, z0, 0)) ++ ++/* ++** cmpge_1_u32: ++** cmphs p0\.s, p1/z, z0\.s, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_1_u32, svuint32_t, ++ p0 = svcmpge_n_u32 (p1, z0, 1), ++ p0 = svcmpge (p1, z0, 1)) ++ ++/* ++** cmpge_15_u32: ++** cmphs p0\.s, p1/z, z0\.s, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_15_u32, svuint32_t, ++ p0 = svcmpge_n_u32 (p1, z0, 15), ++ p0 = svcmpge (p1, z0, 15)) ++ ++/* ++** cmpge_16_u32: ++** cmphs p0\.s, p1/z, z0\.s, #16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_16_u32, svuint32_t, ++ p0 = svcmpge_n_u32 (p1, z0, 16), ++ p0 = svcmpge (p1, z0, 16)) ++ ++/* ++** cmpge_127_u32: ++** cmphs p0\.s, p1/z, z0\.s, #127 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_127_u32, svuint32_t, ++ p0 = svcmpge_n_u32 (p1, z0, 127), ++ p0 = svcmpge (p1, z0, 127)) ++ ++/* ++** cmpge_128_u32: ++** mov (z[0-9]+\.s), #128 ++** ( ++** cmphs p0\.s, p1/z, z0\.s, \1 ++** | ++** cmpls p0\.s, p1/z, \1, z0\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_128_u32, svuint32_t, ++ p0 = svcmpge_n_u32 (p1, z0, 128), ++ p0 = svcmpge (p1, z0, 128)) ++ ++/* ++** cmpge_m1_u32: ++** mov (z[0-9]+)\.b, #-1 ++** ( ++** cmphs p0\.s, p1/z, z0\.s, \1\.s ++** | ++** cmpls p0\.s, p1/z, \1\.s, z0\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_m1_u32, svuint32_t, ++ p0 = svcmpge_n_u32 (p1, z0, -1), ++ p0 = svcmpge (p1, z0, -1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_u64.c +new file mode 100644 +index 000000000..0f9159590 +--- /dev/null ++++ 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_u64.c +@@ -0,0 +1,116 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpge_u64_tied: ++** ( ++** cmphs p0\.d, p0/z, z0\.d, z1\.d ++** | ++** cmpls p0\.d, p0/z, z1\.d, z0\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_u64_tied, svuint64_t, ++ p0 = svcmpge_u64 (p0, z0, z1), ++ p0 = svcmpge (p0, z0, z1)) ++ ++/* ++** cmpge_u64_untied: ++** ( ++** cmphs p0\.d, p1/z, z0\.d, z1\.d ++** | ++** cmpls p0\.d, p1/z, z1\.d, z0\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_u64_untied, svuint64_t, ++ p0 = svcmpge_u64 (p1, z0, z1), ++ p0 = svcmpge (p1, z0, z1)) ++ ++/* ++** cmpge_x0_u64: ++** mov (z[0-9]+\.d), x0 ++** ( ++** cmphs p0\.d, p1/z, z0\.d, \1 ++** | ++** cmpls p0\.d, p1/z, \1, z0\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_ZX (cmpge_x0_u64, svuint64_t, uint64_t, ++ p0 = svcmpge_n_u64 (p1, z0, x0), ++ p0 = svcmpge (p1, z0, x0)) ++ ++/* ++** cmpge_0_u64: ++** cmphs p0\.d, p1/z, z0\.d, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_0_u64, svuint64_t, ++ p0 = svcmpge_n_u64 (p1, z0, 0), ++ p0 = svcmpge (p1, z0, 0)) ++ ++/* ++** cmpge_1_u64: ++** cmphs p0\.d, p1/z, z0\.d, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_1_u64, svuint64_t, ++ p0 = svcmpge_n_u64 (p1, z0, 1), ++ p0 = svcmpge (p1, z0, 1)) ++ ++/* ++** cmpge_15_u64: ++** cmphs p0\.d, p1/z, z0\.d, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_15_u64, svuint64_t, ++ p0 = svcmpge_n_u64 (p1, z0, 15), ++ p0 = svcmpge (p1, z0, 15)) ++ ++/* ++** cmpge_16_u64: ++** cmphs p0\.d, p1/z, z0\.d, #16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_16_u64, svuint64_t, ++ p0 = svcmpge_n_u64 (p1, z0, 16), ++ p0 = svcmpge (p1, z0, 16)) ++ ++/* ++** cmpge_127_u64: ++** cmphs p0\.d, p1/z, z0\.d, #127 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_127_u64, svuint64_t, ++ p0 = svcmpge_n_u64 (p1, z0, 127), ++ p0 = svcmpge (p1, z0, 127)) ++ ++/* ++** cmpge_128_u64: ++** mov (z[0-9]+\.d), #128 ++** ( ++** cmphs p0\.d, p1/z, z0\.d, \1 ++** | ++** cmpls p0\.d, p1/z, \1, z0\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_128_u64, svuint64_t, ++ p0 = svcmpge_n_u64 (p1, z0, 128), ++ p0 = svcmpge (p1, z0, 128)) ++ ++/* ++** cmpge_m1_u64: ++** mov (z[0-9]+)\.b, #-1 ++** ( ++** cmphs p0\.d, p1/z, z0\.d, \1\.d ++** | ++** cmpls p0\.d, p1/z, \1\.d, z0\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_m1_u64, svuint64_t, ++ p0 = svcmpge_n_u64 (p1, z0, -1), ++ p0 = svcmpge (p1, z0, -1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_u8.c +new file mode 100644 +index 000000000..39f988d01 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_u8.c +@@ -0,0 +1,116 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpge_u8_tied: ++** ( ++** cmphs p0\.b, p0/z, z0\.b, z1\.b ++** | ++** cmpls p0\.b, p0/z, z1\.b, z0\.b ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_u8_tied, svuint8_t, ++ p0 = svcmpge_u8 (p0, z0, z1), ++ p0 = svcmpge (p0, z0, z1)) ++ ++/* ++** cmpge_u8_untied: ++** ( ++** cmphs p0\.b, p1/z, z0\.b, z1\.b ++** | ++** cmpls p0\.b, p1/z, z1\.b, z0\.b ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_u8_untied, svuint8_t, ++ p0 = svcmpge_u8 (p1, z0, z1), ++ p0 = svcmpge (p1, z0, z1)) ++ ++/* ++** cmpge_w0_u8: ++** mov (z[0-9]+\.b), w0 ++** ( ++** cmphs p0\.b, p1/z, z0\.b, \1 ++** | ++** cmpls p0\.b, p1/z, \1, z0\.b ++** ) ++** ret ++*/ ++TEST_COMPARE_ZX (cmpge_w0_u8, svuint8_t, uint8_t, ++ p0 = svcmpge_n_u8 (p1, z0, x0), ++ p0 = 
svcmpge (p1, z0, x0)) ++ ++/* ++** cmpge_0_u8: ++** cmphs p0\.b, p1/z, z0\.b, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_0_u8, svuint8_t, ++ p0 = svcmpge_n_u8 (p1, z0, 0), ++ p0 = svcmpge (p1, z0, 0)) ++ ++/* ++** cmpge_1_u8: ++** cmphs p0\.b, p1/z, z0\.b, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_1_u8, svuint8_t, ++ p0 = svcmpge_n_u8 (p1, z0, 1), ++ p0 = svcmpge (p1, z0, 1)) ++ ++/* ++** cmpge_15_u8: ++** cmphs p0\.b, p1/z, z0\.b, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_15_u8, svuint8_t, ++ p0 = svcmpge_n_u8 (p1, z0, 15), ++ p0 = svcmpge (p1, z0, 15)) ++ ++/* ++** cmpge_16_u8: ++** cmphs p0\.b, p1/z, z0\.b, #16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_16_u8, svuint8_t, ++ p0 = svcmpge_n_u8 (p1, z0, 16), ++ p0 = svcmpge (p1, z0, 16)) ++ ++/* ++** cmpge_127_u8: ++** cmphs p0\.b, p1/z, z0\.b, #127 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_127_u8, svuint8_t, ++ p0 = svcmpge_n_u8 (p1, z0, 127), ++ p0 = svcmpge (p1, z0, 127)) ++ ++/* ++** cmpge_128_u8: ++** mov (z[0-9]+\.b), #-128 ++** ( ++** cmphs p0\.b, p1/z, z0\.b, \1 ++** | ++** cmpls p0\.b, p1/z, \1, z0\.b ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_128_u8, svuint8_t, ++ p0 = svcmpge_n_u8 (p1, z0, 128), ++ p0 = svcmpge (p1, z0, 128)) ++ ++/* ++** cmpge_m1_u8: ++** mov (z[0-9]+\.b), #-1 ++** ( ++** cmphs p0\.b, p1/z, z0\.b, \1 ++** | ++** cmpls p0\.b, p1/z, \1, z0\.b ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_m1_u8, svuint8_t, ++ p0 = svcmpge_n_u8 (p1, z0, -1), ++ p0 = svcmpge (p1, z0, -1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_wide_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_wide_s16.c +new file mode 100644 +index 000000000..0400d7871 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_wide_s16.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpge_wide_s16_tied: ++** cmpge p0\.h, p0/z, z0\.h, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmpge_wide_s16_tied, svint16_t, svint64_t, ++ p0 = svcmpge_wide_s16 (p0, z0, z1), ++ p0 = svcmpge_wide (p0, z0, z1)) ++ ++/* ++** cmpge_wide_s16_untied: ++** cmpge p0\.h, p1/z, z0\.h, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmpge_wide_s16_untied, svint16_t, svint64_t, ++ p0 = svcmpge_wide_s16 (p1, z0, z1), ++ p0 = svcmpge_wide (p1, z0, z1)) ++ ++/* ++** cmpge_wide_x0_s16: ++** mov (z[0-9]+\.d), x0 ++** cmpge p0\.h, p1/z, z0\.h, \1 ++** ret ++*/ ++TEST_COMPARE_ZX (cmpge_wide_x0_s16, svint16_t, int64_t, ++ p0 = svcmpge_wide_n_s16 (p1, z0, x0), ++ p0 = svcmpge_wide (p1, z0, x0)) ++ ++/* ++** cmpge_wide_0_s16: ++** cmpge p0\.h, p1/z, z0\.h, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_wide_0_s16, svint16_t, ++ p0 = svcmpge_wide_n_s16 (p1, z0, 0), ++ p0 = svcmpge_wide (p1, z0, 0)) ++ ++/* ++** cmpge_wide_1_s16: ++** cmpge p0\.h, p1/z, z0\.h, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_wide_1_s16, svint16_t, ++ p0 = svcmpge_wide_n_s16 (p1, z0, 1), ++ p0 = svcmpge_wide (p1, z0, 1)) ++ ++/* ++** cmpge_wide_15_s16: ++** cmpge p0\.h, p1/z, z0\.h, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_wide_15_s16, svint16_t, ++ p0 = svcmpge_wide_n_s16 (p1, z0, 15), ++ p0 = svcmpge_wide (p1, z0, 15)) ++ ++/* ++** cmpge_wide_16_s16: ++** mov (z[0-9]+\.d), #16 ++** cmpge p0\.h, p1/z, z0\.h, \1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_wide_16_s16, svint16_t, ++ p0 = svcmpge_wide_n_s16 (p1, z0, 16), ++ p0 = svcmpge_wide (p1, z0, 16)) ++ ++/* ++** cmpge_wide_m1_s16: ++** cmpge p0\.h, p1/z, z0\.h, #-1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_wide_m1_s16, svint16_t, ++ p0 = svcmpge_wide_n_s16 (p1, z0, -1), 
++ p0 = svcmpge_wide (p1, z0, -1)) ++ ++/* ++** cmpge_wide_m16_s16: ++** cmpge p0\.h, p1/z, z0\.h, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_wide_m16_s16, svint16_t, ++ p0 = svcmpge_wide_n_s16 (p1, z0, -16), ++ p0 = svcmpge_wide (p1, z0, -16)) ++ ++/* ++** cmpge_wide_m17_s16: ++** mov (z[0-9]+\.d), #-17 ++** cmpge p0\.h, p1/z, z0\.h, \1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_wide_m17_s16, svint16_t, ++ p0 = svcmpge_wide_n_s16 (p1, z0, -17), ++ p0 = svcmpge_wide (p1, z0, -17)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_wide_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_wide_s32.c +new file mode 100644 +index 000000000..ad7b9c55b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_wide_s32.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpge_wide_s32_tied: ++** cmpge p0\.s, p0/z, z0\.s, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmpge_wide_s32_tied, svint32_t, svint64_t, ++ p0 = svcmpge_wide_s32 (p0, z0, z1), ++ p0 = svcmpge_wide (p0, z0, z1)) ++ ++/* ++** cmpge_wide_s32_untied: ++** cmpge p0\.s, p1/z, z0\.s, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmpge_wide_s32_untied, svint32_t, svint64_t, ++ p0 = svcmpge_wide_s32 (p1, z0, z1), ++ p0 = svcmpge_wide (p1, z0, z1)) ++ ++/* ++** cmpge_wide_x0_s32: ++** mov (z[0-9]+\.d), x0 ++** cmpge p0\.s, p1/z, z0\.s, \1 ++** ret ++*/ ++TEST_COMPARE_ZX (cmpge_wide_x0_s32, svint32_t, int64_t, ++ p0 = svcmpge_wide_n_s32 (p1, z0, x0), ++ p0 = svcmpge_wide (p1, z0, x0)) ++ ++/* ++** cmpge_wide_0_s32: ++** cmpge p0\.s, p1/z, z0\.s, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_wide_0_s32, svint32_t, ++ p0 = svcmpge_wide_n_s32 (p1, z0, 0), ++ p0 = svcmpge_wide (p1, z0, 0)) ++ ++/* ++** cmpge_wide_1_s32: ++** cmpge p0\.s, p1/z, z0\.s, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_wide_1_s32, svint32_t, ++ p0 = svcmpge_wide_n_s32 (p1, z0, 1), ++ p0 = svcmpge_wide (p1, z0, 1)) ++ ++/* ++** cmpge_wide_15_s32: ++** cmpge p0\.s, p1/z, z0\.s, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_wide_15_s32, svint32_t, ++ p0 = svcmpge_wide_n_s32 (p1, z0, 15), ++ p0 = svcmpge_wide (p1, z0, 15)) ++ ++/* ++** cmpge_wide_16_s32: ++** mov (z[0-9]+\.d), #16 ++** cmpge p0\.s, p1/z, z0\.s, \1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_wide_16_s32, svint32_t, ++ p0 = svcmpge_wide_n_s32 (p1, z0, 16), ++ p0 = svcmpge_wide (p1, z0, 16)) ++ ++/* ++** cmpge_wide_m1_s32: ++** cmpge p0\.s, p1/z, z0\.s, #-1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_wide_m1_s32, svint32_t, ++ p0 = svcmpge_wide_n_s32 (p1, z0, -1), ++ p0 = svcmpge_wide (p1, z0, -1)) ++ ++/* ++** cmpge_wide_m16_s32: ++** cmpge p0\.s, p1/z, z0\.s, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_wide_m16_s32, svint32_t, ++ p0 = svcmpge_wide_n_s32 (p1, z0, -16), ++ p0 = svcmpge_wide (p1, z0, -16)) ++ ++/* ++** cmpge_wide_m17_s32: ++** mov (z[0-9]+\.d), #-17 ++** cmpge p0\.s, p1/z, z0\.s, \1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_wide_m17_s32, svint32_t, ++ p0 = svcmpge_wide_n_s32 (p1, z0, -17), ++ p0 = svcmpge_wide (p1, z0, -17)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_wide_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_wide_s8.c +new file mode 100644 +index 000000000..b03a42488 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_wide_s8.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpge_wide_s8_tied: ++** cmpge p0\.b, p0/z, z0\.b, z1\.d ++** ret ++*/ 
++TEST_COMPARE_DUAL_Z (cmpge_wide_s8_tied, svint8_t, svint64_t, ++ p0 = svcmpge_wide_s8 (p0, z0, z1), ++ p0 = svcmpge_wide (p0, z0, z1)) ++ ++/* ++** cmpge_wide_s8_untied: ++** cmpge p0\.b, p1/z, z0\.b, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmpge_wide_s8_untied, svint8_t, svint64_t, ++ p0 = svcmpge_wide_s8 (p1, z0, z1), ++ p0 = svcmpge_wide (p1, z0, z1)) ++ ++/* ++** cmpge_wide_x0_s8: ++** mov (z[0-9]+\.d), x0 ++** cmpge p0\.b, p1/z, z0\.b, \1 ++** ret ++*/ ++TEST_COMPARE_ZX (cmpge_wide_x0_s8, svint8_t, int64_t, ++ p0 = svcmpge_wide_n_s8 (p1, z0, x0), ++ p0 = svcmpge_wide (p1, z0, x0)) ++ ++/* ++** cmpge_wide_0_s8: ++** cmpge p0\.b, p1/z, z0\.b, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_wide_0_s8, svint8_t, ++ p0 = svcmpge_wide_n_s8 (p1, z0, 0), ++ p0 = svcmpge_wide (p1, z0, 0)) ++ ++/* ++** cmpge_wide_1_s8: ++** cmpge p0\.b, p1/z, z0\.b, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_wide_1_s8, svint8_t, ++ p0 = svcmpge_wide_n_s8 (p1, z0, 1), ++ p0 = svcmpge_wide (p1, z0, 1)) ++ ++/* ++** cmpge_wide_15_s8: ++** cmpge p0\.b, p1/z, z0\.b, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_wide_15_s8, svint8_t, ++ p0 = svcmpge_wide_n_s8 (p1, z0, 15), ++ p0 = svcmpge_wide (p1, z0, 15)) ++ ++/* ++** cmpge_wide_16_s8: ++** mov (z[0-9]+\.d), #16 ++** cmpge p0\.b, p1/z, z0\.b, \1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_wide_16_s8, svint8_t, ++ p0 = svcmpge_wide_n_s8 (p1, z0, 16), ++ p0 = svcmpge_wide (p1, z0, 16)) ++ ++/* ++** cmpge_wide_m1_s8: ++** cmpge p0\.b, p1/z, z0\.b, #-1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_wide_m1_s8, svint8_t, ++ p0 = svcmpge_wide_n_s8 (p1, z0, -1), ++ p0 = svcmpge_wide (p1, z0, -1)) ++ ++/* ++** cmpge_wide_m16_s8: ++** cmpge p0\.b, p1/z, z0\.b, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_wide_m16_s8, svint8_t, ++ p0 = svcmpge_wide_n_s8 (p1, z0, -16), ++ p0 = svcmpge_wide (p1, z0, -16)) ++ ++/* ++** cmpge_wide_m17_s8: ++** mov (z[0-9]+\.d), #-17 ++** cmpge p0\.b, p1/z, z0\.b, \1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_wide_m17_s8, svint8_t, ++ p0 = svcmpge_wide_n_s8 (p1, z0, -17), ++ p0 = svcmpge_wide (p1, z0, -17)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_wide_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_wide_u16.c +new file mode 100644 +index 000000000..966b1e554 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_wide_u16.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpge_wide_u16_tied: ++** cmphs p0\.h, p0/z, z0\.h, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmpge_wide_u16_tied, svuint16_t, svuint64_t, ++ p0 = svcmpge_wide_u16 (p0, z0, z1), ++ p0 = svcmpge_wide (p0, z0, z1)) ++ ++/* ++** cmpge_wide_u16_untied: ++** cmphs p0\.h, p1/z, z0\.h, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmpge_wide_u16_untied, svuint16_t, svuint64_t, ++ p0 = svcmpge_wide_u16 (p1, z0, z1), ++ p0 = svcmpge_wide (p1, z0, z1)) ++ ++/* ++** cmpge_wide_x0_u16: ++** mov (z[0-9]+\.d), x0 ++** cmphs p0\.h, p1/z, z0\.h, \1 ++** ret ++*/ ++TEST_COMPARE_ZX (cmpge_wide_x0_u16, svuint16_t, uint64_t, ++ p0 = svcmpge_wide_n_u16 (p1, z0, x0), ++ p0 = svcmpge_wide (p1, z0, x0)) ++ ++/* ++** cmpge_wide_0_u16: ++** cmphs p0\.h, p1/z, z0\.h, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_wide_0_u16, svuint16_t, ++ p0 = svcmpge_wide_n_u16 (p1, z0, 0), ++ p0 = svcmpge_wide (p1, z0, 0)) ++ ++/* ++** cmpge_wide_1_u16: ++** cmphs p0\.h, p1/z, z0\.h, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_wide_1_u16, svuint16_t, ++ p0 = svcmpge_wide_n_u16 (p1, z0, 1), ++ p0 = svcmpge_wide (p1, z0, 1)) 
++ ++/* ++** cmpge_wide_15_u16: ++** cmphs p0\.h, p1/z, z0\.h, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_wide_15_u16, svuint16_t, ++ p0 = svcmpge_wide_n_u16 (p1, z0, 15), ++ p0 = svcmpge_wide (p1, z0, 15)) ++ ++/* ++** cmpge_wide_16_u16: ++** cmphs p0\.h, p1/z, z0\.h, #16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_wide_16_u16, svuint16_t, ++ p0 = svcmpge_wide_n_u16 (p1, z0, 16), ++ p0 = svcmpge_wide (p1, z0, 16)) ++ ++/* ++** cmpge_wide_127_u16: ++** cmphs p0\.h, p1/z, z0\.h, #127 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_wide_127_u16, svuint16_t, ++ p0 = svcmpge_wide_n_u16 (p1, z0, 127), ++ p0 = svcmpge_wide (p1, z0, 127)) ++ ++/* ++** cmpge_wide_128_u16: ++** mov (z[0-9]+\.d), #128 ++** cmphs p0\.h, p1/z, z0\.h, \1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_wide_128_u16, svuint16_t, ++ p0 = svcmpge_wide_n_u16 (p1, z0, 128), ++ p0 = svcmpge_wide (p1, z0, 128)) ++ ++/* ++** cmpge_wide_m1_u16: ++** mov (z[0-9]+)\.b, #-1 ++** cmphs p0\.h, p1/z, z0\.h, \1\.d ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_wide_m1_u16, svuint16_t, ++ p0 = svcmpge_wide_n_u16 (p1, z0, -1), ++ p0 = svcmpge_wide (p1, z0, -1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_wide_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_wide_u32.c +new file mode 100644 +index 000000000..fdeb53a46 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_wide_u32.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpge_wide_u32_tied: ++** cmphs p0\.s, p0/z, z0\.s, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmpge_wide_u32_tied, svuint32_t, svuint64_t, ++ p0 = svcmpge_wide_u32 (p0, z0, z1), ++ p0 = svcmpge_wide (p0, z0, z1)) ++ ++/* ++** cmpge_wide_u32_untied: ++** cmphs p0\.s, p1/z, z0\.s, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmpge_wide_u32_untied, svuint32_t, svuint64_t, ++ p0 = svcmpge_wide_u32 (p1, z0, z1), ++ p0 = svcmpge_wide (p1, z0, z1)) ++ ++/* ++** cmpge_wide_x0_u32: ++** mov (z[0-9]+\.d), x0 ++** cmphs p0\.s, p1/z, z0\.s, \1 ++** ret ++*/ ++TEST_COMPARE_ZX (cmpge_wide_x0_u32, svuint32_t, uint64_t, ++ p0 = svcmpge_wide_n_u32 (p1, z0, x0), ++ p0 = svcmpge_wide (p1, z0, x0)) ++ ++/* ++** cmpge_wide_0_u32: ++** cmphs p0\.s, p1/z, z0\.s, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_wide_0_u32, svuint32_t, ++ p0 = svcmpge_wide_n_u32 (p1, z0, 0), ++ p0 = svcmpge_wide (p1, z0, 0)) ++ ++/* ++** cmpge_wide_1_u32: ++** cmphs p0\.s, p1/z, z0\.s, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_wide_1_u32, svuint32_t, ++ p0 = svcmpge_wide_n_u32 (p1, z0, 1), ++ p0 = svcmpge_wide (p1, z0, 1)) ++ ++/* ++** cmpge_wide_15_u32: ++** cmphs p0\.s, p1/z, z0\.s, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_wide_15_u32, svuint32_t, ++ p0 = svcmpge_wide_n_u32 (p1, z0, 15), ++ p0 = svcmpge_wide (p1, z0, 15)) ++ ++/* ++** cmpge_wide_16_u32: ++** cmphs p0\.s, p1/z, z0\.s, #16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_wide_16_u32, svuint32_t, ++ p0 = svcmpge_wide_n_u32 (p1, z0, 16), ++ p0 = svcmpge_wide (p1, z0, 16)) ++ ++/* ++** cmpge_wide_127_u32: ++** cmphs p0\.s, p1/z, z0\.s, #127 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_wide_127_u32, svuint32_t, ++ p0 = svcmpge_wide_n_u32 (p1, z0, 127), ++ p0 = svcmpge_wide (p1, z0, 127)) ++ ++/* ++** cmpge_wide_128_u32: ++** mov (z[0-9]+\.d), #128 ++** cmphs p0\.s, p1/z, z0\.s, \1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_wide_128_u32, svuint32_t, ++ p0 = svcmpge_wide_n_u32 (p1, z0, 128), ++ p0 = svcmpge_wide (p1, z0, 128)) ++ ++/* ++** cmpge_wide_m1_u32: ++** mov (z[0-9]+)\.b, #-1 ++** cmphs p0\.s, p1/z, z0\.s, \1\.d ++** 
ret ++*/ ++TEST_COMPARE_Z (cmpge_wide_m1_u32, svuint32_t, ++ p0 = svcmpge_wide_n_u32 (p1, z0, -1), ++ p0 = svcmpge_wide (p1, z0, -1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_wide_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_wide_u8.c +new file mode 100644 +index 000000000..565093120 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_wide_u8.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpge_wide_u8_tied: ++** cmphs p0\.b, p0/z, z0\.b, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmpge_wide_u8_tied, svuint8_t, svuint64_t, ++ p0 = svcmpge_wide_u8 (p0, z0, z1), ++ p0 = svcmpge_wide (p0, z0, z1)) ++ ++/* ++** cmpge_wide_u8_untied: ++** cmphs p0\.b, p1/z, z0\.b, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmpge_wide_u8_untied, svuint8_t, svuint64_t, ++ p0 = svcmpge_wide_u8 (p1, z0, z1), ++ p0 = svcmpge_wide (p1, z0, z1)) ++ ++/* ++** cmpge_wide_x0_u8: ++** mov (z[0-9]+\.d), x0 ++** cmphs p0\.b, p1/z, z0\.b, \1 ++** ret ++*/ ++TEST_COMPARE_ZX (cmpge_wide_x0_u8, svuint8_t, uint64_t, ++ p0 = svcmpge_wide_n_u8 (p1, z0, x0), ++ p0 = svcmpge_wide (p1, z0, x0)) ++ ++/* ++** cmpge_wide_0_u8: ++** cmphs p0\.b, p1/z, z0\.b, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_wide_0_u8, svuint8_t, ++ p0 = svcmpge_wide_n_u8 (p1, z0, 0), ++ p0 = svcmpge_wide (p1, z0, 0)) ++ ++/* ++** cmpge_wide_1_u8: ++** cmphs p0\.b, p1/z, z0\.b, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_wide_1_u8, svuint8_t, ++ p0 = svcmpge_wide_n_u8 (p1, z0, 1), ++ p0 = svcmpge_wide (p1, z0, 1)) ++ ++/* ++** cmpge_wide_15_u8: ++** cmphs p0\.b, p1/z, z0\.b, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_wide_15_u8, svuint8_t, ++ p0 = svcmpge_wide_n_u8 (p1, z0, 15), ++ p0 = svcmpge_wide (p1, z0, 15)) ++ ++/* ++** cmpge_wide_16_u8: ++** cmphs p0\.b, p1/z, z0\.b, #16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_wide_16_u8, svuint8_t, ++ p0 = svcmpge_wide_n_u8 (p1, z0, 16), ++ p0 = svcmpge_wide (p1, z0, 16)) ++ ++/* ++** cmpge_wide_127_u8: ++** cmphs p0\.b, p1/z, z0\.b, #127 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_wide_127_u8, svuint8_t, ++ p0 = svcmpge_wide_n_u8 (p1, z0, 127), ++ p0 = svcmpge_wide (p1, z0, 127)) ++ ++/* ++** cmpge_wide_128_u8: ++** mov (z[0-9]+\.d), #128 ++** cmphs p0\.b, p1/z, z0\.b, \1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_wide_128_u8, svuint8_t, ++ p0 = svcmpge_wide_n_u8 (p1, z0, 128), ++ p0 = svcmpge_wide (p1, z0, 128)) ++ ++/* ++** cmpge_wide_m1_u8: ++** mov (z[0-9]+)\.b, #-1 ++** cmphs p0\.b, p1/z, z0\.b, \1\.d ++** ret ++*/ ++TEST_COMPARE_Z (cmpge_wide_m1_u8, svuint8_t, ++ p0 = svcmpge_wide_n_u8 (p1, z0, -1), ++ p0 = svcmpge_wide (p1, z0, -1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_f16.c +new file mode 100644 +index 000000000..69b015794 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_f16.c +@@ -0,0 +1,66 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpgt_f16_tied: ++** ( ++** fcmgt p0\.h, p0/z, z0\.h, z1\.h ++** | ++** fcmlt p0\.h, p0/z, z1\.h, z0\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_f16_tied, svfloat16_t, ++ p0 = svcmpgt_f16 (p0, z0, z1), ++ p0 = svcmpgt (p0, z0, z1)) ++ ++/* ++** cmpgt_f16_untied: ++** ( ++** fcmgt p0\.h, p1/z, z0\.h, z1\.h ++** | ++** fcmlt p0\.h, p1/z, z1\.h, z0\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_f16_untied, svfloat16_t, ++ p0 = svcmpgt_f16 (p1, z0, z1), ++ p0 = svcmpgt 
(p1, z0, z1)) ++ ++/* ++** cmpgt_h4_f16: ++** mov (z[0-9]+\.h), h4 ++** ( ++** fcmgt p0\.h, p1/z, z0\.h, \1 ++** | ++** fcmlt p0\.h, p1/z, \1, z0\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_ZD (cmpgt_h4_f16, svfloat16_t, float16_t, ++ p0 = svcmpgt_n_f16 (p1, z0, d4), ++ p0 = svcmpgt (p1, z0, d4)) ++ ++/* ++** cmpgt_0_f16: ++** fcmgt p0\.h, p1/z, z0\.h, #0\.0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_0_f16, svfloat16_t, ++ p0 = svcmpgt_n_f16 (p1, z0, 0), ++ p0 = svcmpgt (p1, z0, 0)) ++ ++/* ++** cmpgt_1_f16: ++** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? ++** ( ++** fcmgt p0\.h, p1/z, z0\.h, \1 ++** | ++** fcmlt p0\.h, p1/z, \1, z0\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_1_f16, svfloat16_t, ++ p0 = svcmpgt_n_f16 (p1, z0, 1), ++ p0 = svcmpgt (p1, z0, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_f32.c +new file mode 100644 +index 000000000..7d66b67c3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_f32.c +@@ -0,0 +1,66 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpgt_f32_tied: ++** ( ++** fcmgt p0\.s, p0/z, z0\.s, z1\.s ++** | ++** fcmlt p0\.s, p0/z, z1\.s, z0\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_f32_tied, svfloat32_t, ++ p0 = svcmpgt_f32 (p0, z0, z1), ++ p0 = svcmpgt (p0, z0, z1)) ++ ++/* ++** cmpgt_f32_untied: ++** ( ++** fcmgt p0\.s, p1/z, z0\.s, z1\.s ++** | ++** fcmlt p0\.s, p1/z, z1\.s, z0\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_f32_untied, svfloat32_t, ++ p0 = svcmpgt_f32 (p1, z0, z1), ++ p0 = svcmpgt (p1, z0, z1)) ++ ++/* ++** cmpgt_s4_f32: ++** mov (z[0-9]+\.s), s4 ++** ( ++** fcmgt p0\.s, p1/z, z0\.s, \1 ++** | ++** fcmlt p0\.s, p1/z, \1, z0\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_ZD (cmpgt_s4_f32, svfloat32_t, float32_t, ++ p0 = svcmpgt_n_f32 (p1, z0, d4), ++ p0 = svcmpgt (p1, z0, d4)) ++ ++/* ++** cmpgt_0_f32: ++** fcmgt p0\.s, p1/z, z0\.s, #0\.0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_0_f32, svfloat32_t, ++ p0 = svcmpgt_n_f32 (p1, z0, 0), ++ p0 = svcmpgt (p1, z0, 0)) ++ ++/* ++** cmpgt_1_f32: ++** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? 
++** ( ++** fcmgt p0\.s, p1/z, z0\.s, \1 ++** | ++** fcmlt p0\.s, p1/z, \1, z0\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_1_f32, svfloat32_t, ++ p0 = svcmpgt_n_f32 (p1, z0, 1), ++ p0 = svcmpgt (p1, z0, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_f64.c +new file mode 100644 +index 000000000..f3a155476 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_f64.c +@@ -0,0 +1,66 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpgt_f64_tied: ++** ( ++** fcmgt p0\.d, p0/z, z0\.d, z1\.d ++** | ++** fcmlt p0\.d, p0/z, z1\.d, z0\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_f64_tied, svfloat64_t, ++ p0 = svcmpgt_f64 (p0, z0, z1), ++ p0 = svcmpgt (p0, z0, z1)) ++ ++/* ++** cmpgt_f64_untied: ++** ( ++** fcmgt p0\.d, p1/z, z0\.d, z1\.d ++** | ++** fcmlt p0\.d, p1/z, z1\.d, z0\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_f64_untied, svfloat64_t, ++ p0 = svcmpgt_f64 (p1, z0, z1), ++ p0 = svcmpgt (p1, z0, z1)) ++ ++/* ++** cmpgt_d4_f64: ++** mov (z[0-9]+\.d), d4 ++** ( ++** fcmgt p0\.d, p1/z, z0\.d, \1 ++** | ++** fcmlt p0\.d, p1/z, \1, z0\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_ZD (cmpgt_d4_f64, svfloat64_t, float64_t, ++ p0 = svcmpgt_n_f64 (p1, z0, d4), ++ p0 = svcmpgt (p1, z0, d4)) ++ ++/* ++** cmpgt_0_f64: ++** fcmgt p0\.d, p1/z, z0\.d, #0\.0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_0_f64, svfloat64_t, ++ p0 = svcmpgt_n_f64 (p1, z0, 0), ++ p0 = svcmpgt (p1, z0, 0)) ++ ++/* ++** cmpgt_1_f64: ++** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? ++** ( ++** fcmgt p0\.d, p1/z, z0\.d, \1 ++** | ++** fcmlt p0\.d, p1/z, \1, z0\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_1_f64, svfloat64_t, ++ p0 = svcmpgt_n_f64 (p1, z0, 1), ++ p0 = svcmpgt (p1, z0, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_s16.c +new file mode 100644 +index 000000000..cc86c0c00 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_s16.c +@@ -0,0 +1,116 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpgt_s16_tied: ++** ( ++** cmpgt p0\.h, p0/z, z0\.h, z1\.h ++** | ++** cmplt p0\.h, p0/z, z1\.h, z0\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_s16_tied, svint16_t, ++ p0 = svcmpgt_s16 (p0, z0, z1), ++ p0 = svcmpgt (p0, z0, z1)) ++ ++/* ++** cmpgt_s16_untied: ++** ( ++** cmpgt p0\.h, p1/z, z0\.h, z1\.h ++** | ++** cmplt p0\.h, p1/z, z1\.h, z0\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_s16_untied, svint16_t, ++ p0 = svcmpgt_s16 (p1, z0, z1), ++ p0 = svcmpgt (p1, z0, z1)) ++ ++/* ++** cmpgt_w0_s16: ++** mov (z[0-9]+\.h), w0 ++** ( ++** cmpgt p0\.h, p1/z, z0\.h, \1 ++** | ++** cmplt p0\.h, p1/z, \1, z0\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_ZX (cmpgt_w0_s16, svint16_t, int16_t, ++ p0 = svcmpgt_n_s16 (p1, z0, x0), ++ p0 = svcmpgt (p1, z0, x0)) ++ ++/* ++** cmpgt_0_s16: ++** cmpgt p0\.h, p1/z, z0\.h, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_0_s16, svint16_t, ++ p0 = svcmpgt_n_s16 (p1, z0, 0), ++ p0 = svcmpgt (p1, z0, 0)) ++ ++/* ++** cmpgt_1_s16: ++** cmpgt p0\.h, p1/z, z0\.h, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_1_s16, svint16_t, ++ p0 = svcmpgt_n_s16 (p1, z0, 1), ++ p0 = svcmpgt (p1, z0, 1)) ++ ++/* ++** cmpgt_15_s16: ++** cmpgt p0\.h, p1/z, z0\.h, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_15_s16, svint16_t, ++ p0 = svcmpgt_n_s16 (p1, z0, 15), ++ p0 = svcmpgt (p1, z0, 15)) ++ ++/* ++** 
cmpgt_16_s16: ++** mov (z[0-9]+\.h), #16 ++** ( ++** cmpgt p0\.h, p1/z, z0\.h, \1 ++** | ++** cmplt p0\.h, p1/z, \1, z0\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_16_s16, svint16_t, ++ p0 = svcmpgt_n_s16 (p1, z0, 16), ++ p0 = svcmpgt (p1, z0, 16)) ++ ++/* ++** cmpgt_m1_s16: ++** cmpgt p0\.h, p1/z, z0\.h, #-1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_m1_s16, svint16_t, ++ p0 = svcmpgt_n_s16 (p1, z0, -1), ++ p0 = svcmpgt (p1, z0, -1)) ++ ++/* ++** cmpgt_m16_s16: ++** cmpgt p0\.h, p1/z, z0\.h, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_m16_s16, svint16_t, ++ p0 = svcmpgt_n_s16 (p1, z0, -16), ++ p0 = svcmpgt (p1, z0, -16)) ++ ++/* ++** cmpgt_m17_s16: ++** mov (z[0-9]+\.h), #-17 ++** ( ++** cmpgt p0\.h, p1/z, z0\.h, \1 ++** | ++** cmplt p0\.h, p1/z, \1, z0\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_m17_s16, svint16_t, ++ p0 = svcmpgt_n_s16 (p1, z0, -17), ++ p0 = svcmpgt (p1, z0, -17)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_s32.c +new file mode 100644 +index 000000000..75f0cc737 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_s32.c +@@ -0,0 +1,116 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpgt_s32_tied: ++** ( ++** cmpgt p0\.s, p0/z, z0\.s, z1\.s ++** | ++** cmplt p0\.s, p0/z, z1\.s, z0\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_s32_tied, svint32_t, ++ p0 = svcmpgt_s32 (p0, z0, z1), ++ p0 = svcmpgt (p0, z0, z1)) ++ ++/* ++** cmpgt_s32_untied: ++** ( ++** cmpgt p0\.s, p1/z, z0\.s, z1\.s ++** | ++** cmplt p0\.s, p1/z, z1\.s, z0\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_s32_untied, svint32_t, ++ p0 = svcmpgt_s32 (p1, z0, z1), ++ p0 = svcmpgt (p1, z0, z1)) ++ ++/* ++** cmpgt_w0_s32: ++** mov (z[0-9]+\.s), w0 ++** ( ++** cmpgt p0\.s, p1/z, z0\.s, \1 ++** | ++** cmplt p0\.s, p1/z, \1, z0\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_ZX (cmpgt_w0_s32, svint32_t, int32_t, ++ p0 = svcmpgt_n_s32 (p1, z0, x0), ++ p0 = svcmpgt (p1, z0, x0)) ++ ++/* ++** cmpgt_0_s32: ++** cmpgt p0\.s, p1/z, z0\.s, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_0_s32, svint32_t, ++ p0 = svcmpgt_n_s32 (p1, z0, 0), ++ p0 = svcmpgt (p1, z0, 0)) ++ ++/* ++** cmpgt_1_s32: ++** cmpgt p0\.s, p1/z, z0\.s, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_1_s32, svint32_t, ++ p0 = svcmpgt_n_s32 (p1, z0, 1), ++ p0 = svcmpgt (p1, z0, 1)) ++ ++/* ++** cmpgt_15_s32: ++** cmpgt p0\.s, p1/z, z0\.s, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_15_s32, svint32_t, ++ p0 = svcmpgt_n_s32 (p1, z0, 15), ++ p0 = svcmpgt (p1, z0, 15)) ++ ++/* ++** cmpgt_16_s32: ++** mov (z[0-9]+\.s), #16 ++** ( ++** cmpgt p0\.s, p1/z, z0\.s, \1 ++** | ++** cmplt p0\.s, p1/z, \1, z0\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_16_s32, svint32_t, ++ p0 = svcmpgt_n_s32 (p1, z0, 16), ++ p0 = svcmpgt (p1, z0, 16)) ++ ++/* ++** cmpgt_m1_s32: ++** cmpgt p0\.s, p1/z, z0\.s, #-1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_m1_s32, svint32_t, ++ p0 = svcmpgt_n_s32 (p1, z0, -1), ++ p0 = svcmpgt (p1, z0, -1)) ++ ++/* ++** cmpgt_m16_s32: ++** cmpgt p0\.s, p1/z, z0\.s, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_m16_s32, svint32_t, ++ p0 = svcmpgt_n_s32 (p1, z0, -16), ++ p0 = svcmpgt (p1, z0, -16)) ++ ++/* ++** cmpgt_m17_s32: ++** mov (z[0-9]+\.s), #-17 ++** ( ++** cmpgt p0\.s, p1/z, z0\.s, \1 ++** | ++** cmplt p0\.s, p1/z, \1, z0\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_m17_s32, svint32_t, ++ p0 = svcmpgt_n_s32 (p1, z0, -17), ++ p0 = svcmpgt (p1, z0, -17)) +diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_s64.c +new file mode 100644 +index 000000000..dbfd55e6f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_s64.c +@@ -0,0 +1,116 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpgt_s64_tied: ++** ( ++** cmpgt p0\.d, p0/z, z0\.d, z1\.d ++** | ++** cmplt p0\.d, p0/z, z1\.d, z0\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_s64_tied, svint64_t, ++ p0 = svcmpgt_s64 (p0, z0, z1), ++ p0 = svcmpgt (p0, z0, z1)) ++ ++/* ++** cmpgt_s64_untied: ++** ( ++** cmpgt p0\.d, p1/z, z0\.d, z1\.d ++** | ++** cmplt p0\.d, p1/z, z1\.d, z0\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_s64_untied, svint64_t, ++ p0 = svcmpgt_s64 (p1, z0, z1), ++ p0 = svcmpgt (p1, z0, z1)) ++ ++/* ++** cmpgt_x0_s64: ++** mov (z[0-9]+\.d), x0 ++** ( ++** cmpgt p0\.d, p1/z, z0\.d, \1 ++** | ++** cmplt p0\.d, p1/z, \1, z0\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_ZX (cmpgt_x0_s64, svint64_t, int64_t, ++ p0 = svcmpgt_n_s64 (p1, z0, x0), ++ p0 = svcmpgt (p1, z0, x0)) ++ ++/* ++** cmpgt_0_s64: ++** cmpgt p0\.d, p1/z, z0\.d, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_0_s64, svint64_t, ++ p0 = svcmpgt_n_s64 (p1, z0, 0), ++ p0 = svcmpgt (p1, z0, 0)) ++ ++/* ++** cmpgt_1_s64: ++** cmpgt p0\.d, p1/z, z0\.d, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_1_s64, svint64_t, ++ p0 = svcmpgt_n_s64 (p1, z0, 1), ++ p0 = svcmpgt (p1, z0, 1)) ++ ++/* ++** cmpgt_15_s64: ++** cmpgt p0\.d, p1/z, z0\.d, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_15_s64, svint64_t, ++ p0 = svcmpgt_n_s64 (p1, z0, 15), ++ p0 = svcmpgt (p1, z0, 15)) ++ ++/* ++** cmpgt_16_s64: ++** mov (z[0-9]+\.d), #16 ++** ( ++** cmpgt p0\.d, p1/z, z0\.d, \1 ++** | ++** cmplt p0\.d, p1/z, \1, z0\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_16_s64, svint64_t, ++ p0 = svcmpgt_n_s64 (p1, z0, 16), ++ p0 = svcmpgt (p1, z0, 16)) ++ ++/* ++** cmpgt_m1_s64: ++** cmpgt p0\.d, p1/z, z0\.d, #-1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_m1_s64, svint64_t, ++ p0 = svcmpgt_n_s64 (p1, z0, -1), ++ p0 = svcmpgt (p1, z0, -1)) ++ ++/* ++** cmpgt_m16_s64: ++** cmpgt p0\.d, p1/z, z0\.d, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_m16_s64, svint64_t, ++ p0 = svcmpgt_n_s64 (p1, z0, -16), ++ p0 = svcmpgt (p1, z0, -16)) ++ ++/* ++** cmpgt_m17_s64: ++** mov (z[0-9]+\.d), #-17 ++** ( ++** cmpgt p0\.d, p1/z, z0\.d, \1 ++** | ++** cmplt p0\.d, p1/z, \1, z0\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_m17_s64, svint64_t, ++ p0 = svcmpgt_n_s64 (p1, z0, -17), ++ p0 = svcmpgt (p1, z0, -17)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_s8.c +new file mode 100644 +index 000000000..710c2e602 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_s8.c +@@ -0,0 +1,116 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpgt_s8_tied: ++** ( ++** cmpgt p0\.b, p0/z, z0\.b, z1\.b ++** | ++** cmplt p0\.b, p0/z, z1\.b, z0\.b ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_s8_tied, svint8_t, ++ p0 = svcmpgt_s8 (p0, z0, z1), ++ p0 = svcmpgt (p0, z0, z1)) ++ ++/* ++** cmpgt_s8_untied: ++** ( ++** cmpgt p0\.b, p1/z, z0\.b, z1\.b ++** | ++** cmplt p0\.b, p1/z, z1\.b, z0\.b ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_s8_untied, svint8_t, ++ p0 = svcmpgt_s8 (p1, z0, z1), ++ p0 = svcmpgt (p1, z0, z1)) ++ ++/* ++** cmpgt_w0_s8: ++** mov (z[0-9]+\.b), w0 ++** ( ++** cmpgt p0\.b, p1/z, 
z0\.b, \1 ++** | ++** cmplt p0\.b, p1/z, \1, z0\.b ++** ) ++** ret ++*/ ++TEST_COMPARE_ZX (cmpgt_w0_s8, svint8_t, int8_t, ++ p0 = svcmpgt_n_s8 (p1, z0, x0), ++ p0 = svcmpgt (p1, z0, x0)) ++ ++/* ++** cmpgt_0_s8: ++** cmpgt p0\.b, p1/z, z0\.b, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_0_s8, svint8_t, ++ p0 = svcmpgt_n_s8 (p1, z0, 0), ++ p0 = svcmpgt (p1, z0, 0)) ++ ++/* ++** cmpgt_1_s8: ++** cmpgt p0\.b, p1/z, z0\.b, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_1_s8, svint8_t, ++ p0 = svcmpgt_n_s8 (p1, z0, 1), ++ p0 = svcmpgt (p1, z0, 1)) ++ ++/* ++** cmpgt_15_s8: ++** cmpgt p0\.b, p1/z, z0\.b, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_15_s8, svint8_t, ++ p0 = svcmpgt_n_s8 (p1, z0, 15), ++ p0 = svcmpgt (p1, z0, 15)) ++ ++/* ++** cmpgt_16_s8: ++** mov (z[0-9]+\.b), #16 ++** ( ++** cmpgt p0\.b, p1/z, z0\.b, \1 ++** | ++** cmplt p0\.b, p1/z, \1, z0\.b ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_16_s8, svint8_t, ++ p0 = svcmpgt_n_s8 (p1, z0, 16), ++ p0 = svcmpgt (p1, z0, 16)) ++ ++/* ++** cmpgt_m1_s8: ++** cmpgt p0\.b, p1/z, z0\.b, #-1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_m1_s8, svint8_t, ++ p0 = svcmpgt_n_s8 (p1, z0, -1), ++ p0 = svcmpgt (p1, z0, -1)) ++ ++/* ++** cmpgt_m16_s8: ++** cmpgt p0\.b, p1/z, z0\.b, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_m16_s8, svint8_t, ++ p0 = svcmpgt_n_s8 (p1, z0, -16), ++ p0 = svcmpgt (p1, z0, -16)) ++ ++/* ++** cmpgt_m17_s8: ++** mov (z[0-9]+\.b), #-17 ++** ( ++** cmpgt p0\.b, p1/z, z0\.b, \1 ++** | ++** cmplt p0\.b, p1/z, \1, z0\.b ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_m17_s8, svint8_t, ++ p0 = svcmpgt_n_s8 (p1, z0, -17), ++ p0 = svcmpgt (p1, z0, -17)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_u16.c +new file mode 100644 +index 000000000..48e99c72c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_u16.c +@@ -0,0 +1,116 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpgt_u16_tied: ++** ( ++** cmphi p0\.h, p0/z, z0\.h, z1\.h ++** | ++** cmplo p0\.h, p0/z, z1\.h, z0\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_u16_tied, svuint16_t, ++ p0 = svcmpgt_u16 (p0, z0, z1), ++ p0 = svcmpgt (p0, z0, z1)) ++ ++/* ++** cmpgt_u16_untied: ++** ( ++** cmphi p0\.h, p1/z, z0\.h, z1\.h ++** | ++** cmplo p0\.h, p1/z, z1\.h, z0\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_u16_untied, svuint16_t, ++ p0 = svcmpgt_u16 (p1, z0, z1), ++ p0 = svcmpgt (p1, z0, z1)) ++ ++/* ++** cmpgt_w0_u16: ++** mov (z[0-9]+\.h), w0 ++** ( ++** cmphi p0\.h, p1/z, z0\.h, \1 ++** | ++** cmplo p0\.h, p1/z, \1, z0\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_ZX (cmpgt_w0_u16, svuint16_t, uint16_t, ++ p0 = svcmpgt_n_u16 (p1, z0, x0), ++ p0 = svcmpgt (p1, z0, x0)) ++ ++/* ++** cmpgt_0_u16: ++** cmphi p0\.h, p1/z, z0\.h, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_0_u16, svuint16_t, ++ p0 = svcmpgt_n_u16 (p1, z0, 0), ++ p0 = svcmpgt (p1, z0, 0)) ++ ++/* ++** cmpgt_1_u16: ++** cmphi p0\.h, p1/z, z0\.h, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_1_u16, svuint16_t, ++ p0 = svcmpgt_n_u16 (p1, z0, 1), ++ p0 = svcmpgt (p1, z0, 1)) ++ ++/* ++** cmpgt_15_u16: ++** cmphi p0\.h, p1/z, z0\.h, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_15_u16, svuint16_t, ++ p0 = svcmpgt_n_u16 (p1, z0, 15), ++ p0 = svcmpgt (p1, z0, 15)) ++ ++/* ++** cmpgt_16_u16: ++** cmphi p0\.h, p1/z, z0\.h, #16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_16_u16, svuint16_t, ++ p0 = svcmpgt_n_u16 (p1, z0, 16), ++ p0 = svcmpgt (p1, z0, 16)) ++ ++/* ++** cmpgt_127_u16: ++** cmphi p0\.h, 
p1/z, z0\.h, #127 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_127_u16, svuint16_t, ++ p0 = svcmpgt_n_u16 (p1, z0, 127), ++ p0 = svcmpgt (p1, z0, 127)) ++ ++/* ++** cmpgt_128_u16: ++** mov (z[0-9]+\.h), #128 ++** ( ++** cmphi p0\.h, p1/z, z0\.h, \1 ++** | ++** cmplo p0\.h, p1/z, \1, z0\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_128_u16, svuint16_t, ++ p0 = svcmpgt_n_u16 (p1, z0, 128), ++ p0 = svcmpgt (p1, z0, 128)) ++ ++/* ++** cmpgt_m1_u16: ++** mov (z[0-9]+)\.b, #-1 ++** ( ++** cmphi p0\.h, p1/z, z0\.h, \1\.h ++** | ++** cmplo p0\.h, p1/z, \1\.h, z0\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_m1_u16, svuint16_t, ++ p0 = svcmpgt_n_u16 (p1, z0, -1), ++ p0 = svcmpgt (p1, z0, -1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_u32.c +new file mode 100644 +index 000000000..408037d72 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_u32.c +@@ -0,0 +1,116 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpgt_u32_tied: ++** ( ++** cmphi p0\.s, p0/z, z0\.s, z1\.s ++** | ++** cmplo p0\.s, p0/z, z1\.s, z0\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_u32_tied, svuint32_t, ++ p0 = svcmpgt_u32 (p0, z0, z1), ++ p0 = svcmpgt (p0, z0, z1)) ++ ++/* ++** cmpgt_u32_untied: ++** ( ++** cmphi p0\.s, p1/z, z0\.s, z1\.s ++** | ++** cmplo p0\.s, p1/z, z1\.s, z0\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_u32_untied, svuint32_t, ++ p0 = svcmpgt_u32 (p1, z0, z1), ++ p0 = svcmpgt (p1, z0, z1)) ++ ++/* ++** cmpgt_w0_u32: ++** mov (z[0-9]+\.s), w0 ++** ( ++** cmphi p0\.s, p1/z, z0\.s, \1 ++** | ++** cmplo p0\.s, p1/z, \1, z0\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_ZX (cmpgt_w0_u32, svuint32_t, uint32_t, ++ p0 = svcmpgt_n_u32 (p1, z0, x0), ++ p0 = svcmpgt (p1, z0, x0)) ++ ++/* ++** cmpgt_0_u32: ++** cmphi p0\.s, p1/z, z0\.s, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_0_u32, svuint32_t, ++ p0 = svcmpgt_n_u32 (p1, z0, 0), ++ p0 = svcmpgt (p1, z0, 0)) ++ ++/* ++** cmpgt_1_u32: ++** cmphi p0\.s, p1/z, z0\.s, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_1_u32, svuint32_t, ++ p0 = svcmpgt_n_u32 (p1, z0, 1), ++ p0 = svcmpgt (p1, z0, 1)) ++ ++/* ++** cmpgt_15_u32: ++** cmphi p0\.s, p1/z, z0\.s, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_15_u32, svuint32_t, ++ p0 = svcmpgt_n_u32 (p1, z0, 15), ++ p0 = svcmpgt (p1, z0, 15)) ++ ++/* ++** cmpgt_16_u32: ++** cmphi p0\.s, p1/z, z0\.s, #16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_16_u32, svuint32_t, ++ p0 = svcmpgt_n_u32 (p1, z0, 16), ++ p0 = svcmpgt (p1, z0, 16)) ++ ++/* ++** cmpgt_127_u32: ++** cmphi p0\.s, p1/z, z0\.s, #127 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_127_u32, svuint32_t, ++ p0 = svcmpgt_n_u32 (p1, z0, 127), ++ p0 = svcmpgt (p1, z0, 127)) ++ ++/* ++** cmpgt_128_u32: ++** mov (z[0-9]+\.s), #128 ++** ( ++** cmphi p0\.s, p1/z, z0\.s, \1 ++** | ++** cmplo p0\.s, p1/z, \1, z0\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_128_u32, svuint32_t, ++ p0 = svcmpgt_n_u32 (p1, z0, 128), ++ p0 = svcmpgt (p1, z0, 128)) ++ ++/* ++** cmpgt_m1_u32: ++** mov (z[0-9]+)\.b, #-1 ++** ( ++** cmphi p0\.s, p1/z, z0\.s, \1\.s ++** | ++** cmplo p0\.s, p1/z, \1\.s, z0\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_m1_u32, svuint32_t, ++ p0 = svcmpgt_n_u32 (p1, z0, -1), ++ p0 = svcmpgt (p1, z0, -1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_u64.c +new file mode 100644 +index 000000000..f76a23e49 +--- /dev/null ++++ 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_u64.c +@@ -0,0 +1,116 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpgt_u64_tied: ++** ( ++** cmphi p0\.d, p0/z, z0\.d, z1\.d ++** | ++** cmplo p0\.d, p0/z, z1\.d, z0\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_u64_tied, svuint64_t, ++ p0 = svcmpgt_u64 (p0, z0, z1), ++ p0 = svcmpgt (p0, z0, z1)) ++ ++/* ++** cmpgt_u64_untied: ++** ( ++** cmphi p0\.d, p1/z, z0\.d, z1\.d ++** | ++** cmplo p0\.d, p1/z, z1\.d, z0\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_u64_untied, svuint64_t, ++ p0 = svcmpgt_u64 (p1, z0, z1), ++ p0 = svcmpgt (p1, z0, z1)) ++ ++/* ++** cmpgt_x0_u64: ++** mov (z[0-9]+\.d), x0 ++** ( ++** cmphi p0\.d, p1/z, z0\.d, \1 ++** | ++** cmplo p0\.d, p1/z, \1, z0\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_ZX (cmpgt_x0_u64, svuint64_t, uint64_t, ++ p0 = svcmpgt_n_u64 (p1, z0, x0), ++ p0 = svcmpgt (p1, z0, x0)) ++ ++/* ++** cmpgt_0_u64: ++** cmphi p0\.d, p1/z, z0\.d, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_0_u64, svuint64_t, ++ p0 = svcmpgt_n_u64 (p1, z0, 0), ++ p0 = svcmpgt (p1, z0, 0)) ++ ++/* ++** cmpgt_1_u64: ++** cmphi p0\.d, p1/z, z0\.d, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_1_u64, svuint64_t, ++ p0 = svcmpgt_n_u64 (p1, z0, 1), ++ p0 = svcmpgt (p1, z0, 1)) ++ ++/* ++** cmpgt_15_u64: ++** cmphi p0\.d, p1/z, z0\.d, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_15_u64, svuint64_t, ++ p0 = svcmpgt_n_u64 (p1, z0, 15), ++ p0 = svcmpgt (p1, z0, 15)) ++ ++/* ++** cmpgt_16_u64: ++** cmphi p0\.d, p1/z, z0\.d, #16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_16_u64, svuint64_t, ++ p0 = svcmpgt_n_u64 (p1, z0, 16), ++ p0 = svcmpgt (p1, z0, 16)) ++ ++/* ++** cmpgt_127_u64: ++** cmphi p0\.d, p1/z, z0\.d, #127 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_127_u64, svuint64_t, ++ p0 = svcmpgt_n_u64 (p1, z0, 127), ++ p0 = svcmpgt (p1, z0, 127)) ++ ++/* ++** cmpgt_128_u64: ++** mov (z[0-9]+\.d), #128 ++** ( ++** cmphi p0\.d, p1/z, z0\.d, \1 ++** | ++** cmplo p0\.d, p1/z, \1, z0\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_128_u64, svuint64_t, ++ p0 = svcmpgt_n_u64 (p1, z0, 128), ++ p0 = svcmpgt (p1, z0, 128)) ++ ++/* ++** cmpgt_m1_u64: ++** mov (z[0-9]+)\.b, #-1 ++** ( ++** cmphi p0\.d, p1/z, z0\.d, \1\.d ++** | ++** cmplo p0\.d, p1/z, \1\.d, z0\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_m1_u64, svuint64_t, ++ p0 = svcmpgt_n_u64 (p1, z0, -1), ++ p0 = svcmpgt (p1, z0, -1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_u8.c +new file mode 100644 +index 000000000..4f28331f9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_u8.c +@@ -0,0 +1,116 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpgt_u8_tied: ++** ( ++** cmphi p0\.b, p0/z, z0\.b, z1\.b ++** | ++** cmplo p0\.b, p0/z, z1\.b, z0\.b ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_u8_tied, svuint8_t, ++ p0 = svcmpgt_u8 (p0, z0, z1), ++ p0 = svcmpgt (p0, z0, z1)) ++ ++/* ++** cmpgt_u8_untied: ++** ( ++** cmphi p0\.b, p1/z, z0\.b, z1\.b ++** | ++** cmplo p0\.b, p1/z, z1\.b, z0\.b ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_u8_untied, svuint8_t, ++ p0 = svcmpgt_u8 (p1, z0, z1), ++ p0 = svcmpgt (p1, z0, z1)) ++ ++/* ++** cmpgt_w0_u8: ++** mov (z[0-9]+\.b), w0 ++** ( ++** cmphi p0\.b, p1/z, z0\.b, \1 ++** | ++** cmplo p0\.b, p1/z, \1, z0\.b ++** ) ++** ret ++*/ ++TEST_COMPARE_ZX (cmpgt_w0_u8, svuint8_t, uint8_t, ++ p0 = svcmpgt_n_u8 (p1, z0, x0), ++ p0 = 
svcmpgt (p1, z0, x0)) ++ ++/* ++** cmpgt_0_u8: ++** cmphi p0\.b, p1/z, z0\.b, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_0_u8, svuint8_t, ++ p0 = svcmpgt_n_u8 (p1, z0, 0), ++ p0 = svcmpgt (p1, z0, 0)) ++ ++/* ++** cmpgt_1_u8: ++** cmphi p0\.b, p1/z, z0\.b, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_1_u8, svuint8_t, ++ p0 = svcmpgt_n_u8 (p1, z0, 1), ++ p0 = svcmpgt (p1, z0, 1)) ++ ++/* ++** cmpgt_15_u8: ++** cmphi p0\.b, p1/z, z0\.b, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_15_u8, svuint8_t, ++ p0 = svcmpgt_n_u8 (p1, z0, 15), ++ p0 = svcmpgt (p1, z0, 15)) ++ ++/* ++** cmpgt_16_u8: ++** cmphi p0\.b, p1/z, z0\.b, #16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_16_u8, svuint8_t, ++ p0 = svcmpgt_n_u8 (p1, z0, 16), ++ p0 = svcmpgt (p1, z0, 16)) ++ ++/* ++** cmpgt_127_u8: ++** cmphi p0\.b, p1/z, z0\.b, #127 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_127_u8, svuint8_t, ++ p0 = svcmpgt_n_u8 (p1, z0, 127), ++ p0 = svcmpgt (p1, z0, 127)) ++ ++/* ++** cmpgt_128_u8: ++** mov (z[0-9]+\.b), #-128 ++** ( ++** cmphi p0\.b, p1/z, z0\.b, \1 ++** | ++** cmplo p0\.b, p1/z, \1, z0\.b ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_128_u8, svuint8_t, ++ p0 = svcmpgt_n_u8 (p1, z0, 128), ++ p0 = svcmpgt (p1, z0, 128)) ++ ++/* ++** cmpgt_m1_u8: ++** mov (z[0-9]+\.b), #-1 ++** ( ++** cmphi p0\.b, p1/z, z0\.b, \1 ++** | ++** cmplo p0\.b, p1/z, \1, z0\.b ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_m1_u8, svuint8_t, ++ p0 = svcmpgt_n_u8 (p1, z0, -1), ++ p0 = svcmpgt (p1, z0, -1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_wide_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_wide_s16.c +new file mode 100644 +index 000000000..07d3bbbd9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_wide_s16.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpgt_wide_s16_tied: ++** cmpgt p0\.h, p0/z, z0\.h, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmpgt_wide_s16_tied, svint16_t, svint64_t, ++ p0 = svcmpgt_wide_s16 (p0, z0, z1), ++ p0 = svcmpgt_wide (p0, z0, z1)) ++ ++/* ++** cmpgt_wide_s16_untied: ++** cmpgt p0\.h, p1/z, z0\.h, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmpgt_wide_s16_untied, svint16_t, svint64_t, ++ p0 = svcmpgt_wide_s16 (p1, z0, z1), ++ p0 = svcmpgt_wide (p1, z0, z1)) ++ ++/* ++** cmpgt_wide_x0_s16: ++** mov (z[0-9]+\.d), x0 ++** cmpgt p0\.h, p1/z, z0\.h, \1 ++** ret ++*/ ++TEST_COMPARE_ZX (cmpgt_wide_x0_s16, svint16_t, int64_t, ++ p0 = svcmpgt_wide_n_s16 (p1, z0, x0), ++ p0 = svcmpgt_wide (p1, z0, x0)) ++ ++/* ++** cmpgt_wide_0_s16: ++** cmpgt p0\.h, p1/z, z0\.h, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_wide_0_s16, svint16_t, ++ p0 = svcmpgt_wide_n_s16 (p1, z0, 0), ++ p0 = svcmpgt_wide (p1, z0, 0)) ++ ++/* ++** cmpgt_wide_1_s16: ++** cmpgt p0\.h, p1/z, z0\.h, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_wide_1_s16, svint16_t, ++ p0 = svcmpgt_wide_n_s16 (p1, z0, 1), ++ p0 = svcmpgt_wide (p1, z0, 1)) ++ ++/* ++** cmpgt_wide_15_s16: ++** cmpgt p0\.h, p1/z, z0\.h, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_wide_15_s16, svint16_t, ++ p0 = svcmpgt_wide_n_s16 (p1, z0, 15), ++ p0 = svcmpgt_wide (p1, z0, 15)) ++ ++/* ++** cmpgt_wide_16_s16: ++** mov (z[0-9]+\.d), #16 ++** cmpgt p0\.h, p1/z, z0\.h, \1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_wide_16_s16, svint16_t, ++ p0 = svcmpgt_wide_n_s16 (p1, z0, 16), ++ p0 = svcmpgt_wide (p1, z0, 16)) ++ ++/* ++** cmpgt_wide_m1_s16: ++** cmpgt p0\.h, p1/z, z0\.h, #-1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_wide_m1_s16, svint16_t, ++ p0 = svcmpgt_wide_n_s16 (p1, z0, -1), 
++ p0 = svcmpgt_wide (p1, z0, -1)) ++ ++/* ++** cmpgt_wide_m16_s16: ++** cmpgt p0\.h, p1/z, z0\.h, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_wide_m16_s16, svint16_t, ++ p0 = svcmpgt_wide_n_s16 (p1, z0, -16), ++ p0 = svcmpgt_wide (p1, z0, -16)) ++ ++/* ++** cmpgt_wide_m17_s16: ++** mov (z[0-9]+\.d), #-17 ++** cmpgt p0\.h, p1/z, z0\.h, \1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_wide_m17_s16, svint16_t, ++ p0 = svcmpgt_wide_n_s16 (p1, z0, -17), ++ p0 = svcmpgt_wide (p1, z0, -17)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_wide_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_wide_s32.c +new file mode 100644 +index 000000000..f984362e6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_wide_s32.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpgt_wide_s32_tied: ++** cmpgt p0\.s, p0/z, z0\.s, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmpgt_wide_s32_tied, svint32_t, svint64_t, ++ p0 = svcmpgt_wide_s32 (p0, z0, z1), ++ p0 = svcmpgt_wide (p0, z0, z1)) ++ ++/* ++** cmpgt_wide_s32_untied: ++** cmpgt p0\.s, p1/z, z0\.s, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmpgt_wide_s32_untied, svint32_t, svint64_t, ++ p0 = svcmpgt_wide_s32 (p1, z0, z1), ++ p0 = svcmpgt_wide (p1, z0, z1)) ++ ++/* ++** cmpgt_wide_x0_s32: ++** mov (z[0-9]+\.d), x0 ++** cmpgt p0\.s, p1/z, z0\.s, \1 ++** ret ++*/ ++TEST_COMPARE_ZX (cmpgt_wide_x0_s32, svint32_t, int64_t, ++ p0 = svcmpgt_wide_n_s32 (p1, z0, x0), ++ p0 = svcmpgt_wide (p1, z0, x0)) ++ ++/* ++** cmpgt_wide_0_s32: ++** cmpgt p0\.s, p1/z, z0\.s, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_wide_0_s32, svint32_t, ++ p0 = svcmpgt_wide_n_s32 (p1, z0, 0), ++ p0 = svcmpgt_wide (p1, z0, 0)) ++ ++/* ++** cmpgt_wide_1_s32: ++** cmpgt p0\.s, p1/z, z0\.s, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_wide_1_s32, svint32_t, ++ p0 = svcmpgt_wide_n_s32 (p1, z0, 1), ++ p0 = svcmpgt_wide (p1, z0, 1)) ++ ++/* ++** cmpgt_wide_15_s32: ++** cmpgt p0\.s, p1/z, z0\.s, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_wide_15_s32, svint32_t, ++ p0 = svcmpgt_wide_n_s32 (p1, z0, 15), ++ p0 = svcmpgt_wide (p1, z0, 15)) ++ ++/* ++** cmpgt_wide_16_s32: ++** mov (z[0-9]+\.d), #16 ++** cmpgt p0\.s, p1/z, z0\.s, \1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_wide_16_s32, svint32_t, ++ p0 = svcmpgt_wide_n_s32 (p1, z0, 16), ++ p0 = svcmpgt_wide (p1, z0, 16)) ++ ++/* ++** cmpgt_wide_m1_s32: ++** cmpgt p0\.s, p1/z, z0\.s, #-1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_wide_m1_s32, svint32_t, ++ p0 = svcmpgt_wide_n_s32 (p1, z0, -1), ++ p0 = svcmpgt_wide (p1, z0, -1)) ++ ++/* ++** cmpgt_wide_m16_s32: ++** cmpgt p0\.s, p1/z, z0\.s, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_wide_m16_s32, svint32_t, ++ p0 = svcmpgt_wide_n_s32 (p1, z0, -16), ++ p0 = svcmpgt_wide (p1, z0, -16)) ++ ++/* ++** cmpgt_wide_m17_s32: ++** mov (z[0-9]+\.d), #-17 ++** cmpgt p0\.s, p1/z, z0\.s, \1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_wide_m17_s32, svint32_t, ++ p0 = svcmpgt_wide_n_s32 (p1, z0, -17), ++ p0 = svcmpgt_wide (p1, z0, -17)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_wide_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_wide_s8.c +new file mode 100644 +index 000000000..07047a315 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_wide_s8.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpgt_wide_s8_tied: ++** cmpgt p0\.b, p0/z, z0\.b, z1\.d ++** ret ++*/ 
++TEST_COMPARE_DUAL_Z (cmpgt_wide_s8_tied, svint8_t, svint64_t, ++ p0 = svcmpgt_wide_s8 (p0, z0, z1), ++ p0 = svcmpgt_wide (p0, z0, z1)) ++ ++/* ++** cmpgt_wide_s8_untied: ++** cmpgt p0\.b, p1/z, z0\.b, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmpgt_wide_s8_untied, svint8_t, svint64_t, ++ p0 = svcmpgt_wide_s8 (p1, z0, z1), ++ p0 = svcmpgt_wide (p1, z0, z1)) ++ ++/* ++** cmpgt_wide_x0_s8: ++** mov (z[0-9]+\.d), x0 ++** cmpgt p0\.b, p1/z, z0\.b, \1 ++** ret ++*/ ++TEST_COMPARE_ZX (cmpgt_wide_x0_s8, svint8_t, int64_t, ++ p0 = svcmpgt_wide_n_s8 (p1, z0, x0), ++ p0 = svcmpgt_wide (p1, z0, x0)) ++ ++/* ++** cmpgt_wide_0_s8: ++** cmpgt p0\.b, p1/z, z0\.b, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_wide_0_s8, svint8_t, ++ p0 = svcmpgt_wide_n_s8 (p1, z0, 0), ++ p0 = svcmpgt_wide (p1, z0, 0)) ++ ++/* ++** cmpgt_wide_1_s8: ++** cmpgt p0\.b, p1/z, z0\.b, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_wide_1_s8, svint8_t, ++ p0 = svcmpgt_wide_n_s8 (p1, z0, 1), ++ p0 = svcmpgt_wide (p1, z0, 1)) ++ ++/* ++** cmpgt_wide_15_s8: ++** cmpgt p0\.b, p1/z, z0\.b, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_wide_15_s8, svint8_t, ++ p0 = svcmpgt_wide_n_s8 (p1, z0, 15), ++ p0 = svcmpgt_wide (p1, z0, 15)) ++ ++/* ++** cmpgt_wide_16_s8: ++** mov (z[0-9]+\.d), #16 ++** cmpgt p0\.b, p1/z, z0\.b, \1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_wide_16_s8, svint8_t, ++ p0 = svcmpgt_wide_n_s8 (p1, z0, 16), ++ p0 = svcmpgt_wide (p1, z0, 16)) ++ ++/* ++** cmpgt_wide_m1_s8: ++** cmpgt p0\.b, p1/z, z0\.b, #-1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_wide_m1_s8, svint8_t, ++ p0 = svcmpgt_wide_n_s8 (p1, z0, -1), ++ p0 = svcmpgt_wide (p1, z0, -1)) ++ ++/* ++** cmpgt_wide_m16_s8: ++** cmpgt p0\.b, p1/z, z0\.b, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_wide_m16_s8, svint8_t, ++ p0 = svcmpgt_wide_n_s8 (p1, z0, -16), ++ p0 = svcmpgt_wide (p1, z0, -16)) ++ ++/* ++** cmpgt_wide_m17_s8: ++** mov (z[0-9]+\.d), #-17 ++** cmpgt p0\.b, p1/z, z0\.b, \1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_wide_m17_s8, svint8_t, ++ p0 = svcmpgt_wide_n_s8 (p1, z0, -17), ++ p0 = svcmpgt_wide (p1, z0, -17)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_wide_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_wide_u16.c +new file mode 100644 +index 000000000..bcffb88c0 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_wide_u16.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpgt_wide_u16_tied: ++** cmphi p0\.h, p0/z, z0\.h, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmpgt_wide_u16_tied, svuint16_t, svuint64_t, ++ p0 = svcmpgt_wide_u16 (p0, z0, z1), ++ p0 = svcmpgt_wide (p0, z0, z1)) ++ ++/* ++** cmpgt_wide_u16_untied: ++** cmphi p0\.h, p1/z, z0\.h, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmpgt_wide_u16_untied, svuint16_t, svuint64_t, ++ p0 = svcmpgt_wide_u16 (p1, z0, z1), ++ p0 = svcmpgt_wide (p1, z0, z1)) ++ ++/* ++** cmpgt_wide_x0_u16: ++** mov (z[0-9]+\.d), x0 ++** cmphi p0\.h, p1/z, z0\.h, \1 ++** ret ++*/ ++TEST_COMPARE_ZX (cmpgt_wide_x0_u16, svuint16_t, uint64_t, ++ p0 = svcmpgt_wide_n_u16 (p1, z0, x0), ++ p0 = svcmpgt_wide (p1, z0, x0)) ++ ++/* ++** cmpgt_wide_0_u16: ++** cmphi p0\.h, p1/z, z0\.h, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_wide_0_u16, svuint16_t, ++ p0 = svcmpgt_wide_n_u16 (p1, z0, 0), ++ p0 = svcmpgt_wide (p1, z0, 0)) ++ ++/* ++** cmpgt_wide_1_u16: ++** cmphi p0\.h, p1/z, z0\.h, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_wide_1_u16, svuint16_t, ++ p0 = svcmpgt_wide_n_u16 (p1, z0, 1), ++ p0 = svcmpgt_wide (p1, z0, 1)) 
++ ++/* ++** cmpgt_wide_15_u16: ++** cmphi p0\.h, p1/z, z0\.h, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_wide_15_u16, svuint16_t, ++ p0 = svcmpgt_wide_n_u16 (p1, z0, 15), ++ p0 = svcmpgt_wide (p1, z0, 15)) ++ ++/* ++** cmpgt_wide_16_u16: ++** cmphi p0\.h, p1/z, z0\.h, #16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_wide_16_u16, svuint16_t, ++ p0 = svcmpgt_wide_n_u16 (p1, z0, 16), ++ p0 = svcmpgt_wide (p1, z0, 16)) ++ ++/* ++** cmpgt_wide_127_u16: ++** cmphi p0\.h, p1/z, z0\.h, #127 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_wide_127_u16, svuint16_t, ++ p0 = svcmpgt_wide_n_u16 (p1, z0, 127), ++ p0 = svcmpgt_wide (p1, z0, 127)) ++ ++/* ++** cmpgt_wide_128_u16: ++** mov (z[0-9]+\.d), #128 ++** cmphi p0\.h, p1/z, z0\.h, \1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_wide_128_u16, svuint16_t, ++ p0 = svcmpgt_wide_n_u16 (p1, z0, 128), ++ p0 = svcmpgt_wide (p1, z0, 128)) ++ ++/* ++** cmpgt_wide_m1_u16: ++** mov (z[0-9]+)\.b, #-1 ++** cmphi p0\.h, p1/z, z0\.h, \1\.d ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_wide_m1_u16, svuint16_t, ++ p0 = svcmpgt_wide_n_u16 (p1, z0, -1), ++ p0 = svcmpgt_wide (p1, z0, -1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_wide_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_wide_u32.c +new file mode 100644 +index 000000000..65c0231e5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_wide_u32.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpgt_wide_u32_tied: ++** cmphi p0\.s, p0/z, z0\.s, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmpgt_wide_u32_tied, svuint32_t, svuint64_t, ++ p0 = svcmpgt_wide_u32 (p0, z0, z1), ++ p0 = svcmpgt_wide (p0, z0, z1)) ++ ++/* ++** cmpgt_wide_u32_untied: ++** cmphi p0\.s, p1/z, z0\.s, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmpgt_wide_u32_untied, svuint32_t, svuint64_t, ++ p0 = svcmpgt_wide_u32 (p1, z0, z1), ++ p0 = svcmpgt_wide (p1, z0, z1)) ++ ++/* ++** cmpgt_wide_x0_u32: ++** mov (z[0-9]+\.d), x0 ++** cmphi p0\.s, p1/z, z0\.s, \1 ++** ret ++*/ ++TEST_COMPARE_ZX (cmpgt_wide_x0_u32, svuint32_t, uint64_t, ++ p0 = svcmpgt_wide_n_u32 (p1, z0, x0), ++ p0 = svcmpgt_wide (p1, z0, x0)) ++ ++/* ++** cmpgt_wide_0_u32: ++** cmphi p0\.s, p1/z, z0\.s, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_wide_0_u32, svuint32_t, ++ p0 = svcmpgt_wide_n_u32 (p1, z0, 0), ++ p0 = svcmpgt_wide (p1, z0, 0)) ++ ++/* ++** cmpgt_wide_1_u32: ++** cmphi p0\.s, p1/z, z0\.s, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_wide_1_u32, svuint32_t, ++ p0 = svcmpgt_wide_n_u32 (p1, z0, 1), ++ p0 = svcmpgt_wide (p1, z0, 1)) ++ ++/* ++** cmpgt_wide_15_u32: ++** cmphi p0\.s, p1/z, z0\.s, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_wide_15_u32, svuint32_t, ++ p0 = svcmpgt_wide_n_u32 (p1, z0, 15), ++ p0 = svcmpgt_wide (p1, z0, 15)) ++ ++/* ++** cmpgt_wide_16_u32: ++** cmphi p0\.s, p1/z, z0\.s, #16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_wide_16_u32, svuint32_t, ++ p0 = svcmpgt_wide_n_u32 (p1, z0, 16), ++ p0 = svcmpgt_wide (p1, z0, 16)) ++ ++/* ++** cmpgt_wide_127_u32: ++** cmphi p0\.s, p1/z, z0\.s, #127 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_wide_127_u32, svuint32_t, ++ p0 = svcmpgt_wide_n_u32 (p1, z0, 127), ++ p0 = svcmpgt_wide (p1, z0, 127)) ++ ++/* ++** cmpgt_wide_128_u32: ++** mov (z[0-9]+\.d), #128 ++** cmphi p0\.s, p1/z, z0\.s, \1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_wide_128_u32, svuint32_t, ++ p0 = svcmpgt_wide_n_u32 (p1, z0, 128), ++ p0 = svcmpgt_wide (p1, z0, 128)) ++ ++/* ++** cmpgt_wide_m1_u32: ++** mov (z[0-9]+)\.b, #-1 ++** cmphi p0\.s, p1/z, z0\.s, \1\.d ++** 
ret ++*/ ++TEST_COMPARE_Z (cmpgt_wide_m1_u32, svuint32_t, ++ p0 = svcmpgt_wide_n_u32 (p1, z0, -1), ++ p0 = svcmpgt_wide (p1, z0, -1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_wide_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_wide_u8.c +new file mode 100644 +index 000000000..0d1142f27 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_wide_u8.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpgt_wide_u8_tied: ++** cmphi p0\.b, p0/z, z0\.b, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmpgt_wide_u8_tied, svuint8_t, svuint64_t, ++ p0 = svcmpgt_wide_u8 (p0, z0, z1), ++ p0 = svcmpgt_wide (p0, z0, z1)) ++ ++/* ++** cmpgt_wide_u8_untied: ++** cmphi p0\.b, p1/z, z0\.b, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmpgt_wide_u8_untied, svuint8_t, svuint64_t, ++ p0 = svcmpgt_wide_u8 (p1, z0, z1), ++ p0 = svcmpgt_wide (p1, z0, z1)) ++ ++/* ++** cmpgt_wide_x0_u8: ++** mov (z[0-9]+\.d), x0 ++** cmphi p0\.b, p1/z, z0\.b, \1 ++** ret ++*/ ++TEST_COMPARE_ZX (cmpgt_wide_x0_u8, svuint8_t, uint64_t, ++ p0 = svcmpgt_wide_n_u8 (p1, z0, x0), ++ p0 = svcmpgt_wide (p1, z0, x0)) ++ ++/* ++** cmpgt_wide_0_u8: ++** cmphi p0\.b, p1/z, z0\.b, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_wide_0_u8, svuint8_t, ++ p0 = svcmpgt_wide_n_u8 (p1, z0, 0), ++ p0 = svcmpgt_wide (p1, z0, 0)) ++ ++/* ++** cmpgt_wide_1_u8: ++** cmphi p0\.b, p1/z, z0\.b, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_wide_1_u8, svuint8_t, ++ p0 = svcmpgt_wide_n_u8 (p1, z0, 1), ++ p0 = svcmpgt_wide (p1, z0, 1)) ++ ++/* ++** cmpgt_wide_15_u8: ++** cmphi p0\.b, p1/z, z0\.b, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_wide_15_u8, svuint8_t, ++ p0 = svcmpgt_wide_n_u8 (p1, z0, 15), ++ p0 = svcmpgt_wide (p1, z0, 15)) ++ ++/* ++** cmpgt_wide_16_u8: ++** cmphi p0\.b, p1/z, z0\.b, #16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_wide_16_u8, svuint8_t, ++ p0 = svcmpgt_wide_n_u8 (p1, z0, 16), ++ p0 = svcmpgt_wide (p1, z0, 16)) ++ ++/* ++** cmpgt_wide_127_u8: ++** cmphi p0\.b, p1/z, z0\.b, #127 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_wide_127_u8, svuint8_t, ++ p0 = svcmpgt_wide_n_u8 (p1, z0, 127), ++ p0 = svcmpgt_wide (p1, z0, 127)) ++ ++/* ++** cmpgt_wide_128_u8: ++** mov (z[0-9]+\.d), #128 ++** cmphi p0\.b, p1/z, z0\.b, \1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_wide_128_u8, svuint8_t, ++ p0 = svcmpgt_wide_n_u8 (p1, z0, 128), ++ p0 = svcmpgt_wide (p1, z0, 128)) ++ ++/* ++** cmpgt_wide_m1_u8: ++** mov (z[0-9]+)\.b, #-1 ++** cmphi p0\.b, p1/z, z0\.b, \1\.d ++** ret ++*/ ++TEST_COMPARE_Z (cmpgt_wide_m1_u8, svuint8_t, ++ p0 = svcmpgt_wide_n_u8 (p1, z0, -1), ++ p0 = svcmpgt_wide (p1, z0, -1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_f16.c +new file mode 100644 +index 000000000..7d500590f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_f16.c +@@ -0,0 +1,66 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmple_f16_tied: ++** ( ++** fcmge p0\.h, p0/z, z1\.h, z0\.h ++** | ++** fcmle p0\.h, p0/z, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmple_f16_tied, svfloat16_t, ++ p0 = svcmple_f16 (p0, z0, z1), ++ p0 = svcmple (p0, z0, z1)) ++ ++/* ++** cmple_f16_untied: ++** ( ++** fcmge p0\.h, p1/z, z1\.h, z0\.h ++** | ++** fcmle p0\.h, p1/z, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmple_f16_untied, svfloat16_t, ++ p0 = svcmple_f16 (p1, z0, z1), ++ p0 = svcmple 
(p1, z0, z1)) ++ ++/* ++** cmple_h4_f16: ++** mov (z[0-9]+\.h), h4 ++** ( ++** fcmge p0\.h, p1/z, \1, z0\.h ++** | ++** fcmle p0\.h, p1/z, z0\.h, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_ZD (cmple_h4_f16, svfloat16_t, float16_t, ++ p0 = svcmple_n_f16 (p1, z0, d4), ++ p0 = svcmple (p1, z0, d4)) ++ ++/* ++** cmple_0_f16: ++** fcmle p0\.h, p1/z, z0\.h, #0\.0 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_0_f16, svfloat16_t, ++ p0 = svcmple_n_f16 (p1, z0, 0), ++ p0 = svcmple (p1, z0, 0)) ++ ++/* ++** cmple_1_f16: ++** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? ++** ( ++** fcmge p0\.h, p1/z, \1, z0\.h ++** | ++** fcmle p0\.h, p1/z, z0\.h, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmple_1_f16, svfloat16_t, ++ p0 = svcmple_n_f16 (p1, z0, 1), ++ p0 = svcmple (p1, z0, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_f32.c +new file mode 100644 +index 000000000..3df63fef7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_f32.c +@@ -0,0 +1,66 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmple_f32_tied: ++** ( ++** fcmge p0\.s, p0/z, z1\.s, z0\.s ++** | ++** fcmle p0\.s, p0/z, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmple_f32_tied, svfloat32_t, ++ p0 = svcmple_f32 (p0, z0, z1), ++ p0 = svcmple (p0, z0, z1)) ++ ++/* ++** cmple_f32_untied: ++** ( ++** fcmge p0\.s, p1/z, z1\.s, z0\.s ++** | ++** fcmle p0\.s, p1/z, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmple_f32_untied, svfloat32_t, ++ p0 = svcmple_f32 (p1, z0, z1), ++ p0 = svcmple (p1, z0, z1)) ++ ++/* ++** cmple_s4_f32: ++** mov (z[0-9]+\.s), s4 ++** ( ++** fcmge p0\.s, p1/z, \1, z0\.s ++** | ++** fcmle p0\.s, p1/z, z0\.s, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_ZD (cmple_s4_f32, svfloat32_t, float32_t, ++ p0 = svcmple_n_f32 (p1, z0, d4), ++ p0 = svcmple (p1, z0, d4)) ++ ++/* ++** cmple_0_f32: ++** fcmle p0\.s, p1/z, z0\.s, #0\.0 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_0_f32, svfloat32_t, ++ p0 = svcmple_n_f32 (p1, z0, 0), ++ p0 = svcmple (p1, z0, 0)) ++ ++/* ++** cmple_1_f32: ++** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? 
++** ( ++** fcmge p0\.s, p1/z, \1, z0\.s ++** | ++** fcmle p0\.s, p1/z, z0\.s, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmple_1_f32, svfloat32_t, ++ p0 = svcmple_n_f32 (p1, z0, 1), ++ p0 = svcmple (p1, z0, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_f64.c +new file mode 100644 +index 000000000..5946a1b3a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_f64.c +@@ -0,0 +1,66 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmple_f64_tied: ++** ( ++** fcmge p0\.d, p0/z, z1\.d, z0\.d ++** | ++** fcmle p0\.d, p0/z, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmple_f64_tied, svfloat64_t, ++ p0 = svcmple_f64 (p0, z0, z1), ++ p0 = svcmple (p0, z0, z1)) ++ ++/* ++** cmple_f64_untied: ++** ( ++** fcmge p0\.d, p1/z, z1\.d, z0\.d ++** | ++** fcmle p0\.d, p1/z, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmple_f64_untied, svfloat64_t, ++ p0 = svcmple_f64 (p1, z0, z1), ++ p0 = svcmple (p1, z0, z1)) ++ ++/* ++** cmple_d4_f64: ++** mov (z[0-9]+\.d), d4 ++** ( ++** fcmge p0\.d, p1/z, \1, z0\.d ++** | ++** fcmle p0\.d, p1/z, z0\.d, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_ZD (cmple_d4_f64, svfloat64_t, float64_t, ++ p0 = svcmple_n_f64 (p1, z0, d4), ++ p0 = svcmple (p1, z0, d4)) ++ ++/* ++** cmple_0_f64: ++** fcmle p0\.d, p1/z, z0\.d, #0\.0 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_0_f64, svfloat64_t, ++ p0 = svcmple_n_f64 (p1, z0, 0), ++ p0 = svcmple (p1, z0, 0)) ++ ++/* ++** cmple_1_f64: ++** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? ++** ( ++** fcmge p0\.d, p1/z, \1, z0\.d ++** | ++** fcmle p0\.d, p1/z, z0\.d, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmple_1_f64, svfloat64_t, ++ p0 = svcmple_n_f64 (p1, z0, 1), ++ p0 = svcmple (p1, z0, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_s16.c +new file mode 100644 +index 000000000..9b221bb4c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_s16.c +@@ -0,0 +1,116 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmple_s16_tied: ++** ( ++** cmpge p0\.h, p0/z, z1\.h, z0\.h ++** | ++** cmple p0\.h, p0/z, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmple_s16_tied, svint16_t, ++ p0 = svcmple_s16 (p0, z0, z1), ++ p0 = svcmple (p0, z0, z1)) ++ ++/* ++** cmple_s16_untied: ++** ( ++** cmpge p0\.h, p1/z, z1\.h, z0\.h ++** | ++** cmple p0\.h, p1/z, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmple_s16_untied, svint16_t, ++ p0 = svcmple_s16 (p1, z0, z1), ++ p0 = svcmple (p1, z0, z1)) ++ ++/* ++** cmple_w0_s16: ++** mov (z[0-9]+\.h), w0 ++** ( ++** cmpge p0\.h, p1/z, \1, z0\.h ++** | ++** cmple p0\.h, p1/z, z0\.h, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_ZX (cmple_w0_s16, svint16_t, int16_t, ++ p0 = svcmple_n_s16 (p1, z0, x0), ++ p0 = svcmple (p1, z0, x0)) ++ ++/* ++** cmple_0_s16: ++** cmple p0\.h, p1/z, z0\.h, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_0_s16, svint16_t, ++ p0 = svcmple_n_s16 (p1, z0, 0), ++ p0 = svcmple (p1, z0, 0)) ++ ++/* ++** cmple_1_s16: ++** cmple p0\.h, p1/z, z0\.h, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_1_s16, svint16_t, ++ p0 = svcmple_n_s16 (p1, z0, 1), ++ p0 = svcmple (p1, z0, 1)) ++ ++/* ++** cmple_15_s16: ++** cmple p0\.h, p1/z, z0\.h, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_15_s16, svint16_t, ++ p0 = svcmple_n_s16 (p1, z0, 15), ++ p0 = svcmple (p1, z0, 15)) ++ ++/* ++** 
cmple_16_s16: ++** mov (z[0-9]+\.h), #16 ++** ( ++** cmpge p0\.h, p1/z, \1, z0\.h ++** | ++** cmple p0\.h, p1/z, z0\.h, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmple_16_s16, svint16_t, ++ p0 = svcmple_n_s16 (p1, z0, 16), ++ p0 = svcmple (p1, z0, 16)) ++ ++/* ++** cmple_m1_s16: ++** cmple p0\.h, p1/z, z0\.h, #-1 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_m1_s16, svint16_t, ++ p0 = svcmple_n_s16 (p1, z0, -1), ++ p0 = svcmple (p1, z0, -1)) ++ ++/* ++** cmple_m16_s16: ++** cmple p0\.h, p1/z, z0\.h, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_m16_s16, svint16_t, ++ p0 = svcmple_n_s16 (p1, z0, -16), ++ p0 = svcmple (p1, z0, -16)) ++ ++/* ++** cmple_m17_s16: ++** mov (z[0-9]+\.h), #-17 ++** ( ++** cmpge p0\.h, p1/z, \1, z0\.h ++** | ++** cmple p0\.h, p1/z, z0\.h, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmple_m17_s16, svint16_t, ++ p0 = svcmple_n_s16 (p1, z0, -17), ++ p0 = svcmple (p1, z0, -17)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_s32.c +new file mode 100644 +index 000000000..b0c8367e2 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_s32.c +@@ -0,0 +1,116 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmple_s32_tied: ++** ( ++** cmpge p0\.s, p0/z, z1\.s, z0\.s ++** | ++** cmple p0\.s, p0/z, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmple_s32_tied, svint32_t, ++ p0 = svcmple_s32 (p0, z0, z1), ++ p0 = svcmple (p0, z0, z1)) ++ ++/* ++** cmple_s32_untied: ++** ( ++** cmpge p0\.s, p1/z, z1\.s, z0\.s ++** | ++** cmple p0\.s, p1/z, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmple_s32_untied, svint32_t, ++ p0 = svcmple_s32 (p1, z0, z1), ++ p0 = svcmple (p1, z0, z1)) ++ ++/* ++** cmple_w0_s32: ++** mov (z[0-9]+\.s), w0 ++** ( ++** cmpge p0\.s, p1/z, \1, z0\.s ++** | ++** cmple p0\.s, p1/z, z0\.s, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_ZX (cmple_w0_s32, svint32_t, int32_t, ++ p0 = svcmple_n_s32 (p1, z0, x0), ++ p0 = svcmple (p1, z0, x0)) ++ ++/* ++** cmple_0_s32: ++** cmple p0\.s, p1/z, z0\.s, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_0_s32, svint32_t, ++ p0 = svcmple_n_s32 (p1, z0, 0), ++ p0 = svcmple (p1, z0, 0)) ++ ++/* ++** cmple_1_s32: ++** cmple p0\.s, p1/z, z0\.s, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_1_s32, svint32_t, ++ p0 = svcmple_n_s32 (p1, z0, 1), ++ p0 = svcmple (p1, z0, 1)) ++ ++/* ++** cmple_15_s32: ++** cmple p0\.s, p1/z, z0\.s, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_15_s32, svint32_t, ++ p0 = svcmple_n_s32 (p1, z0, 15), ++ p0 = svcmple (p1, z0, 15)) ++ ++/* ++** cmple_16_s32: ++** mov (z[0-9]+\.s), #16 ++** ( ++** cmpge p0\.s, p1/z, \1, z0\.s ++** | ++** cmple p0\.s, p1/z, z0\.s, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmple_16_s32, svint32_t, ++ p0 = svcmple_n_s32 (p1, z0, 16), ++ p0 = svcmple (p1, z0, 16)) ++ ++/* ++** cmple_m1_s32: ++** cmple p0\.s, p1/z, z0\.s, #-1 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_m1_s32, svint32_t, ++ p0 = svcmple_n_s32 (p1, z0, -1), ++ p0 = svcmple (p1, z0, -1)) ++ ++/* ++** cmple_m16_s32: ++** cmple p0\.s, p1/z, z0\.s, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_m16_s32, svint32_t, ++ p0 = svcmple_n_s32 (p1, z0, -16), ++ p0 = svcmple (p1, z0, -16)) ++ ++/* ++** cmple_m17_s32: ++** mov (z[0-9]+\.s), #-17 ++** ( ++** cmpge p0\.s, p1/z, \1, z0\.s ++** | ++** cmple p0\.s, p1/z, z0\.s, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmple_m17_s32, svint32_t, ++ p0 = svcmple_n_s32 (p1, z0, -17), ++ p0 = svcmple (p1, z0, -17)) +diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_s64.c +new file mode 100644 +index 000000000..faaa87614 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_s64.c +@@ -0,0 +1,116 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmple_s64_tied: ++** ( ++** cmpge p0\.d, p0/z, z1\.d, z0\.d ++** | ++** cmple p0\.d, p0/z, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmple_s64_tied, svint64_t, ++ p0 = svcmple_s64 (p0, z0, z1), ++ p0 = svcmple (p0, z0, z1)) ++ ++/* ++** cmple_s64_untied: ++** ( ++** cmpge p0\.d, p1/z, z1\.d, z0\.d ++** | ++** cmple p0\.d, p1/z, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmple_s64_untied, svint64_t, ++ p0 = svcmple_s64 (p1, z0, z1), ++ p0 = svcmple (p1, z0, z1)) ++ ++/* ++** cmple_x0_s64: ++** mov (z[0-9]+\.d), x0 ++** ( ++** cmpge p0\.d, p1/z, \1, z0\.d ++** | ++** cmple p0\.d, p1/z, z0\.d, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_ZX (cmple_x0_s64, svint64_t, int64_t, ++ p0 = svcmple_n_s64 (p1, z0, x0), ++ p0 = svcmple (p1, z0, x0)) ++ ++/* ++** cmple_0_s64: ++** cmple p0\.d, p1/z, z0\.d, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_0_s64, svint64_t, ++ p0 = svcmple_n_s64 (p1, z0, 0), ++ p0 = svcmple (p1, z0, 0)) ++ ++/* ++** cmple_1_s64: ++** cmple p0\.d, p1/z, z0\.d, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_1_s64, svint64_t, ++ p0 = svcmple_n_s64 (p1, z0, 1), ++ p0 = svcmple (p1, z0, 1)) ++ ++/* ++** cmple_15_s64: ++** cmple p0\.d, p1/z, z0\.d, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_15_s64, svint64_t, ++ p0 = svcmple_n_s64 (p1, z0, 15), ++ p0 = svcmple (p1, z0, 15)) ++ ++/* ++** cmple_16_s64: ++** mov (z[0-9]+\.d), #16 ++** ( ++** cmpge p0\.d, p1/z, \1, z0\.d ++** | ++** cmple p0\.d, p1/z, z0\.d, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmple_16_s64, svint64_t, ++ p0 = svcmple_n_s64 (p1, z0, 16), ++ p0 = svcmple (p1, z0, 16)) ++ ++/* ++** cmple_m1_s64: ++** cmple p0\.d, p1/z, z0\.d, #-1 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_m1_s64, svint64_t, ++ p0 = svcmple_n_s64 (p1, z0, -1), ++ p0 = svcmple (p1, z0, -1)) ++ ++/* ++** cmple_m16_s64: ++** cmple p0\.d, p1/z, z0\.d, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_m16_s64, svint64_t, ++ p0 = svcmple_n_s64 (p1, z0, -16), ++ p0 = svcmple (p1, z0, -16)) ++ ++/* ++** cmple_m17_s64: ++** mov (z[0-9]+\.d), #-17 ++** ( ++** cmpge p0\.d, p1/z, \1, z0\.d ++** | ++** cmple p0\.d, p1/z, z0\.d, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmple_m17_s64, svint64_t, ++ p0 = svcmple_n_s64 (p1, z0, -17), ++ p0 = svcmple (p1, z0, -17)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_s8.c +new file mode 100644 +index 000000000..222487d75 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_s8.c +@@ -0,0 +1,116 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmple_s8_tied: ++** ( ++** cmpge p0\.b, p0/z, z1\.b, z0\.b ++** | ++** cmple p0\.b, p0/z, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmple_s8_tied, svint8_t, ++ p0 = svcmple_s8 (p0, z0, z1), ++ p0 = svcmple (p0, z0, z1)) ++ ++/* ++** cmple_s8_untied: ++** ( ++** cmpge p0\.b, p1/z, z1\.b, z0\.b ++** | ++** cmple p0\.b, p1/z, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmple_s8_untied, svint8_t, ++ p0 = svcmple_s8 (p1, z0, z1), ++ p0 = svcmple (p1, z0, z1)) ++ ++/* ++** cmple_w0_s8: ++** mov (z[0-9]+\.b), w0 ++** ( ++** cmpge p0\.b, p1/z, 
\1, z0\.b ++** | ++** cmple p0\.b, p1/z, z0\.b, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_ZX (cmple_w0_s8, svint8_t, int8_t, ++ p0 = svcmple_n_s8 (p1, z0, x0), ++ p0 = svcmple (p1, z0, x0)) ++ ++/* ++** cmple_0_s8: ++** cmple p0\.b, p1/z, z0\.b, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_0_s8, svint8_t, ++ p0 = svcmple_n_s8 (p1, z0, 0), ++ p0 = svcmple (p1, z0, 0)) ++ ++/* ++** cmple_1_s8: ++** cmple p0\.b, p1/z, z0\.b, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_1_s8, svint8_t, ++ p0 = svcmple_n_s8 (p1, z0, 1), ++ p0 = svcmple (p1, z0, 1)) ++ ++/* ++** cmple_15_s8: ++** cmple p0\.b, p1/z, z0\.b, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_15_s8, svint8_t, ++ p0 = svcmple_n_s8 (p1, z0, 15), ++ p0 = svcmple (p1, z0, 15)) ++ ++/* ++** cmple_16_s8: ++** mov (z[0-9]+\.b), #16 ++** ( ++** cmpge p0\.b, p1/z, \1, z0\.b ++** | ++** cmple p0\.b, p1/z, z0\.b, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmple_16_s8, svint8_t, ++ p0 = svcmple_n_s8 (p1, z0, 16), ++ p0 = svcmple (p1, z0, 16)) ++ ++/* ++** cmple_m1_s8: ++** cmple p0\.b, p1/z, z0\.b, #-1 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_m1_s8, svint8_t, ++ p0 = svcmple_n_s8 (p1, z0, -1), ++ p0 = svcmple (p1, z0, -1)) ++ ++/* ++** cmple_m16_s8: ++** cmple p0\.b, p1/z, z0\.b, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_m16_s8, svint8_t, ++ p0 = svcmple_n_s8 (p1, z0, -16), ++ p0 = svcmple (p1, z0, -16)) ++ ++/* ++** cmple_m17_s8: ++** mov (z[0-9]+\.b), #-17 ++** ( ++** cmpge p0\.b, p1/z, \1, z0\.b ++** | ++** cmple p0\.b, p1/z, z0\.b, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmple_m17_s8, svint8_t, ++ p0 = svcmple_n_s8 (p1, z0, -17), ++ p0 = svcmple (p1, z0, -17)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_u16.c +new file mode 100644 +index 000000000..26af06e52 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_u16.c +@@ -0,0 +1,116 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmple_u16_tied: ++** ( ++** cmphs p0\.h, p0/z, z1\.h, z0\.h ++** | ++** cmpls p0\.h, p0/z, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmple_u16_tied, svuint16_t, ++ p0 = svcmple_u16 (p0, z0, z1), ++ p0 = svcmple (p0, z0, z1)) ++ ++/* ++** cmple_u16_untied: ++** ( ++** cmphs p0\.h, p1/z, z1\.h, z0\.h ++** | ++** cmpls p0\.h, p1/z, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmple_u16_untied, svuint16_t, ++ p0 = svcmple_u16 (p1, z0, z1), ++ p0 = svcmple (p1, z0, z1)) ++ ++/* ++** cmple_w0_u16: ++** mov (z[0-9]+\.h), w0 ++** ( ++** cmphs p0\.h, p1/z, \1, z0\.h ++** | ++** cmpls p0\.h, p1/z, z0\.h, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_ZX (cmple_w0_u16, svuint16_t, uint16_t, ++ p0 = svcmple_n_u16 (p1, z0, x0), ++ p0 = svcmple (p1, z0, x0)) ++ ++/* ++** cmple_0_u16: ++** cmpls p0\.h, p1/z, z0\.h, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_0_u16, svuint16_t, ++ p0 = svcmple_n_u16 (p1, z0, 0), ++ p0 = svcmple (p1, z0, 0)) ++ ++/* ++** cmple_1_u16: ++** cmpls p0\.h, p1/z, z0\.h, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_1_u16, svuint16_t, ++ p0 = svcmple_n_u16 (p1, z0, 1), ++ p0 = svcmple (p1, z0, 1)) ++ ++/* ++** cmple_15_u16: ++** cmpls p0\.h, p1/z, z0\.h, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_15_u16, svuint16_t, ++ p0 = svcmple_n_u16 (p1, z0, 15), ++ p0 = svcmple (p1, z0, 15)) ++ ++/* ++** cmple_16_u16: ++** cmpls p0\.h, p1/z, z0\.h, #16 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_16_u16, svuint16_t, ++ p0 = svcmple_n_u16 (p1, z0, 16), ++ p0 = svcmple (p1, z0, 16)) ++ ++/* ++** cmple_127_u16: ++** cmpls p0\.h, 
p1/z, z0\.h, #127 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_127_u16, svuint16_t, ++ p0 = svcmple_n_u16 (p1, z0, 127), ++ p0 = svcmple (p1, z0, 127)) ++ ++/* ++** cmple_128_u16: ++** mov (z[0-9]+\.h), #128 ++** ( ++** cmphs p0\.h, p1/z, \1, z0\.h ++** | ++** cmpls p0\.h, p1/z, z0\.h, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmple_128_u16, svuint16_t, ++ p0 = svcmple_n_u16 (p1, z0, 128), ++ p0 = svcmple (p1, z0, 128)) ++ ++/* ++** cmple_m1_u16: ++** mov (z[0-9]+)\.b, #-1 ++** ( ++** cmphs p0\.h, p1/z, \1\.h, z0\.h ++** | ++** cmpls p0\.h, p1/z, z0\.h, \1\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmple_m1_u16, svuint16_t, ++ p0 = svcmple_n_u16 (p1, z0, -1), ++ p0 = svcmple (p1, z0, -1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_u32.c +new file mode 100644 +index 000000000..cee2d14c8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_u32.c +@@ -0,0 +1,116 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmple_u32_tied: ++** ( ++** cmphs p0\.s, p0/z, z1\.s, z0\.s ++** | ++** cmpls p0\.s, p0/z, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmple_u32_tied, svuint32_t, ++ p0 = svcmple_u32 (p0, z0, z1), ++ p0 = svcmple (p0, z0, z1)) ++ ++/* ++** cmple_u32_untied: ++** ( ++** cmphs p0\.s, p1/z, z1\.s, z0\.s ++** | ++** cmpls p0\.s, p1/z, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmple_u32_untied, svuint32_t, ++ p0 = svcmple_u32 (p1, z0, z1), ++ p0 = svcmple (p1, z0, z1)) ++ ++/* ++** cmple_w0_u32: ++** mov (z[0-9]+\.s), w0 ++** ( ++** cmphs p0\.s, p1/z, \1, z0\.s ++** | ++** cmpls p0\.s, p1/z, z0\.s, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_ZX (cmple_w0_u32, svuint32_t, uint32_t, ++ p0 = svcmple_n_u32 (p1, z0, x0), ++ p0 = svcmple (p1, z0, x0)) ++ ++/* ++** cmple_0_u32: ++** cmpls p0\.s, p1/z, z0\.s, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_0_u32, svuint32_t, ++ p0 = svcmple_n_u32 (p1, z0, 0), ++ p0 = svcmple (p1, z0, 0)) ++ ++/* ++** cmple_1_u32: ++** cmpls p0\.s, p1/z, z0\.s, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_1_u32, svuint32_t, ++ p0 = svcmple_n_u32 (p1, z0, 1), ++ p0 = svcmple (p1, z0, 1)) ++ ++/* ++** cmple_15_u32: ++** cmpls p0\.s, p1/z, z0\.s, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_15_u32, svuint32_t, ++ p0 = svcmple_n_u32 (p1, z0, 15), ++ p0 = svcmple (p1, z0, 15)) ++ ++/* ++** cmple_16_u32: ++** cmpls p0\.s, p1/z, z0\.s, #16 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_16_u32, svuint32_t, ++ p0 = svcmple_n_u32 (p1, z0, 16), ++ p0 = svcmple (p1, z0, 16)) ++ ++/* ++** cmple_127_u32: ++** cmpls p0\.s, p1/z, z0\.s, #127 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_127_u32, svuint32_t, ++ p0 = svcmple_n_u32 (p1, z0, 127), ++ p0 = svcmple (p1, z0, 127)) ++ ++/* ++** cmple_128_u32: ++** mov (z[0-9]+\.s), #128 ++** ( ++** cmphs p0\.s, p1/z, \1, z0\.s ++** | ++** cmpls p0\.s, p1/z, z0\.s, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmple_128_u32, svuint32_t, ++ p0 = svcmple_n_u32 (p1, z0, 128), ++ p0 = svcmple (p1, z0, 128)) ++ ++/* ++** cmple_m1_u32: ++** mov (z[0-9]+)\.b, #-1 ++** ( ++** cmphs p0\.s, p1/z, \1\.s, z0\.s ++** | ++** cmpls p0\.s, p1/z, z0\.s, \1\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmple_m1_u32, svuint32_t, ++ p0 = svcmple_n_u32 (p1, z0, -1), ++ p0 = svcmple (p1, z0, -1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_u64.c +new file mode 100644 +index 000000000..b8388bca8 +--- /dev/null ++++ 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_u64.c +@@ -0,0 +1,116 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmple_u64_tied: ++** ( ++** cmphs p0\.d, p0/z, z1\.d, z0\.d ++** | ++** cmpls p0\.d, p0/z, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmple_u64_tied, svuint64_t, ++ p0 = svcmple_u64 (p0, z0, z1), ++ p0 = svcmple (p0, z0, z1)) ++ ++/* ++** cmple_u64_untied: ++** ( ++** cmphs p0\.d, p1/z, z1\.d, z0\.d ++** | ++** cmpls p0\.d, p1/z, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmple_u64_untied, svuint64_t, ++ p0 = svcmple_u64 (p1, z0, z1), ++ p0 = svcmple (p1, z0, z1)) ++ ++/* ++** cmple_x0_u64: ++** mov (z[0-9]+\.d), x0 ++** ( ++** cmphs p0\.d, p1/z, \1, z0\.d ++** | ++** cmpls p0\.d, p1/z, z0\.d, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_ZX (cmple_x0_u64, svuint64_t, uint64_t, ++ p0 = svcmple_n_u64 (p1, z0, x0), ++ p0 = svcmple (p1, z0, x0)) ++ ++/* ++** cmple_0_u64: ++** cmpls p0\.d, p1/z, z0\.d, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_0_u64, svuint64_t, ++ p0 = svcmple_n_u64 (p1, z0, 0), ++ p0 = svcmple (p1, z0, 0)) ++ ++/* ++** cmple_1_u64: ++** cmpls p0\.d, p1/z, z0\.d, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_1_u64, svuint64_t, ++ p0 = svcmple_n_u64 (p1, z0, 1), ++ p0 = svcmple (p1, z0, 1)) ++ ++/* ++** cmple_15_u64: ++** cmpls p0\.d, p1/z, z0\.d, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_15_u64, svuint64_t, ++ p0 = svcmple_n_u64 (p1, z0, 15), ++ p0 = svcmple (p1, z0, 15)) ++ ++/* ++** cmple_16_u64: ++** cmpls p0\.d, p1/z, z0\.d, #16 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_16_u64, svuint64_t, ++ p0 = svcmple_n_u64 (p1, z0, 16), ++ p0 = svcmple (p1, z0, 16)) ++ ++/* ++** cmple_127_u64: ++** cmpls p0\.d, p1/z, z0\.d, #127 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_127_u64, svuint64_t, ++ p0 = svcmple_n_u64 (p1, z0, 127), ++ p0 = svcmple (p1, z0, 127)) ++ ++/* ++** cmple_128_u64: ++** mov (z[0-9]+\.d), #128 ++** ( ++** cmphs p0\.d, p1/z, \1, z0\.d ++** | ++** cmpls p0\.d, p1/z, z0\.d, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmple_128_u64, svuint64_t, ++ p0 = svcmple_n_u64 (p1, z0, 128), ++ p0 = svcmple (p1, z0, 128)) ++ ++/* ++** cmple_m1_u64: ++** mov (z[0-9]+)\.b, #-1 ++** ( ++** cmphs p0\.d, p1/z, \1\.d, z0\.d ++** | ++** cmpls p0\.d, p1/z, z0\.d, \1\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmple_m1_u64, svuint64_t, ++ p0 = svcmple_n_u64 (p1, z0, -1), ++ p0 = svcmple (p1, z0, -1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_u8.c +new file mode 100644 +index 000000000..55a8d4f40 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_u8.c +@@ -0,0 +1,116 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmple_u8_tied: ++** ( ++** cmphs p0\.b, p0/z, z1\.b, z0\.b ++** | ++** cmpls p0\.b, p0/z, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmple_u8_tied, svuint8_t, ++ p0 = svcmple_u8 (p0, z0, z1), ++ p0 = svcmple (p0, z0, z1)) ++ ++/* ++** cmple_u8_untied: ++** ( ++** cmphs p0\.b, p1/z, z1\.b, z0\.b ++** | ++** cmpls p0\.b, p1/z, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmple_u8_untied, svuint8_t, ++ p0 = svcmple_u8 (p1, z0, z1), ++ p0 = svcmple (p1, z0, z1)) ++ ++/* ++** cmple_w0_u8: ++** mov (z[0-9]+\.b), w0 ++** ( ++** cmphs p0\.b, p1/z, \1, z0\.b ++** | ++** cmpls p0\.b, p1/z, z0\.b, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_ZX (cmple_w0_u8, svuint8_t, uint8_t, ++ p0 = svcmple_n_u8 (p1, z0, x0), ++ p0 = 
svcmple (p1, z0, x0)) ++ ++/* ++** cmple_0_u8: ++** cmpls p0\.b, p1/z, z0\.b, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_0_u8, svuint8_t, ++ p0 = svcmple_n_u8 (p1, z0, 0), ++ p0 = svcmple (p1, z0, 0)) ++ ++/* ++** cmple_1_u8: ++** cmpls p0\.b, p1/z, z0\.b, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_1_u8, svuint8_t, ++ p0 = svcmple_n_u8 (p1, z0, 1), ++ p0 = svcmple (p1, z0, 1)) ++ ++/* ++** cmple_15_u8: ++** cmpls p0\.b, p1/z, z0\.b, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_15_u8, svuint8_t, ++ p0 = svcmple_n_u8 (p1, z0, 15), ++ p0 = svcmple (p1, z0, 15)) ++ ++/* ++** cmple_16_u8: ++** cmpls p0\.b, p1/z, z0\.b, #16 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_16_u8, svuint8_t, ++ p0 = svcmple_n_u8 (p1, z0, 16), ++ p0 = svcmple (p1, z0, 16)) ++ ++/* ++** cmple_127_u8: ++** cmpls p0\.b, p1/z, z0\.b, #127 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_127_u8, svuint8_t, ++ p0 = svcmple_n_u8 (p1, z0, 127), ++ p0 = svcmple (p1, z0, 127)) ++ ++/* ++** cmple_128_u8: ++** mov (z[0-9]+\.b), #-128 ++** ( ++** cmphs p0\.b, p1/z, \1, z0\.b ++** | ++** cmpls p0\.b, p1/z, z0\.b, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmple_128_u8, svuint8_t, ++ p0 = svcmple_n_u8 (p1, z0, 128), ++ p0 = svcmple (p1, z0, 128)) ++ ++/* ++** cmple_m1_u8: ++** mov (z[0-9]+\.b), #-1 ++** ( ++** cmphs p0\.b, p1/z, \1, z0\.b ++** | ++** cmpls p0\.b, p1/z, z0\.b, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmple_m1_u8, svuint8_t, ++ p0 = svcmple_n_u8 (p1, z0, -1), ++ p0 = svcmple (p1, z0, -1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_wide_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_wide_s16.c +new file mode 100644 +index 000000000..f1f0b2ed6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_wide_s16.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmple_wide_s16_tied: ++** cmple p0\.h, p0/z, z0\.h, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmple_wide_s16_tied, svint16_t, svint64_t, ++ p0 = svcmple_wide_s16 (p0, z0, z1), ++ p0 = svcmple_wide (p0, z0, z1)) ++ ++/* ++** cmple_wide_s16_untied: ++** cmple p0\.h, p1/z, z0\.h, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmple_wide_s16_untied, svint16_t, svint64_t, ++ p0 = svcmple_wide_s16 (p1, z0, z1), ++ p0 = svcmple_wide (p1, z0, z1)) ++ ++/* ++** cmple_wide_x0_s16: ++** mov (z[0-9]+\.d), x0 ++** cmple p0\.h, p1/z, z0\.h, \1 ++** ret ++*/ ++TEST_COMPARE_ZX (cmple_wide_x0_s16, svint16_t, int64_t, ++ p0 = svcmple_wide_n_s16 (p1, z0, x0), ++ p0 = svcmple_wide (p1, z0, x0)) ++ ++/* ++** cmple_wide_0_s16: ++** cmple p0\.h, p1/z, z0\.h, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_wide_0_s16, svint16_t, ++ p0 = svcmple_wide_n_s16 (p1, z0, 0), ++ p0 = svcmple_wide (p1, z0, 0)) ++ ++/* ++** cmple_wide_1_s16: ++** cmple p0\.h, p1/z, z0\.h, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_wide_1_s16, svint16_t, ++ p0 = svcmple_wide_n_s16 (p1, z0, 1), ++ p0 = svcmple_wide (p1, z0, 1)) ++ ++/* ++** cmple_wide_15_s16: ++** cmple p0\.h, p1/z, z0\.h, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_wide_15_s16, svint16_t, ++ p0 = svcmple_wide_n_s16 (p1, z0, 15), ++ p0 = svcmple_wide (p1, z0, 15)) ++ ++/* ++** cmple_wide_16_s16: ++** mov (z[0-9]+\.d), #16 ++** cmple p0\.h, p1/z, z0\.h, \1 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_wide_16_s16, svint16_t, ++ p0 = svcmple_wide_n_s16 (p1, z0, 16), ++ p0 = svcmple_wide (p1, z0, 16)) ++ ++/* ++** cmple_wide_m1_s16: ++** cmple p0\.h, p1/z, z0\.h, #-1 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_wide_m1_s16, svint16_t, ++ p0 = svcmple_wide_n_s16 (p1, z0, -1), 
++ p0 = svcmple_wide (p1, z0, -1)) ++ ++/* ++** cmple_wide_m16_s16: ++** cmple p0\.h, p1/z, z0\.h, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_wide_m16_s16, svint16_t, ++ p0 = svcmple_wide_n_s16 (p1, z0, -16), ++ p0 = svcmple_wide (p1, z0, -16)) ++ ++/* ++** cmple_wide_m17_s16: ++** mov (z[0-9]+\.d), #-17 ++** cmple p0\.h, p1/z, z0\.h, \1 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_wide_m17_s16, svint16_t, ++ p0 = svcmple_wide_n_s16 (p1, z0, -17), ++ p0 = svcmple_wide (p1, z0, -17)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_wide_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_wide_s32.c +new file mode 100644 +index 000000000..edc5513b6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_wide_s32.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmple_wide_s32_tied: ++** cmple p0\.s, p0/z, z0\.s, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmple_wide_s32_tied, svint32_t, svint64_t, ++ p0 = svcmple_wide_s32 (p0, z0, z1), ++ p0 = svcmple_wide (p0, z0, z1)) ++ ++/* ++** cmple_wide_s32_untied: ++** cmple p0\.s, p1/z, z0\.s, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmple_wide_s32_untied, svint32_t, svint64_t, ++ p0 = svcmple_wide_s32 (p1, z0, z1), ++ p0 = svcmple_wide (p1, z0, z1)) ++ ++/* ++** cmple_wide_x0_s32: ++** mov (z[0-9]+\.d), x0 ++** cmple p0\.s, p1/z, z0\.s, \1 ++** ret ++*/ ++TEST_COMPARE_ZX (cmple_wide_x0_s32, svint32_t, int64_t, ++ p0 = svcmple_wide_n_s32 (p1, z0, x0), ++ p0 = svcmple_wide (p1, z0, x0)) ++ ++/* ++** cmple_wide_0_s32: ++** cmple p0\.s, p1/z, z0\.s, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_wide_0_s32, svint32_t, ++ p0 = svcmple_wide_n_s32 (p1, z0, 0), ++ p0 = svcmple_wide (p1, z0, 0)) ++ ++/* ++** cmple_wide_1_s32: ++** cmple p0\.s, p1/z, z0\.s, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_wide_1_s32, svint32_t, ++ p0 = svcmple_wide_n_s32 (p1, z0, 1), ++ p0 = svcmple_wide (p1, z0, 1)) ++ ++/* ++** cmple_wide_15_s32: ++** cmple p0\.s, p1/z, z0\.s, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_wide_15_s32, svint32_t, ++ p0 = svcmple_wide_n_s32 (p1, z0, 15), ++ p0 = svcmple_wide (p1, z0, 15)) ++ ++/* ++** cmple_wide_16_s32: ++** mov (z[0-9]+\.d), #16 ++** cmple p0\.s, p1/z, z0\.s, \1 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_wide_16_s32, svint32_t, ++ p0 = svcmple_wide_n_s32 (p1, z0, 16), ++ p0 = svcmple_wide (p1, z0, 16)) ++ ++/* ++** cmple_wide_m1_s32: ++** cmple p0\.s, p1/z, z0\.s, #-1 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_wide_m1_s32, svint32_t, ++ p0 = svcmple_wide_n_s32 (p1, z0, -1), ++ p0 = svcmple_wide (p1, z0, -1)) ++ ++/* ++** cmple_wide_m16_s32: ++** cmple p0\.s, p1/z, z0\.s, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_wide_m16_s32, svint32_t, ++ p0 = svcmple_wide_n_s32 (p1, z0, -16), ++ p0 = svcmple_wide (p1, z0, -16)) ++ ++/* ++** cmple_wide_m17_s32: ++** mov (z[0-9]+\.d), #-17 ++** cmple p0\.s, p1/z, z0\.s, \1 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_wide_m17_s32, svint32_t, ++ p0 = svcmple_wide_n_s32 (p1, z0, -17), ++ p0 = svcmple_wide (p1, z0, -17)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_wide_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_wide_s8.c +new file mode 100644 +index 000000000..984044460 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_wide_s8.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmple_wide_s8_tied: ++** cmple p0\.b, p0/z, z0\.b, z1\.d ++** ret ++*/ 
++TEST_COMPARE_DUAL_Z (cmple_wide_s8_tied, svint8_t, svint64_t, ++ p0 = svcmple_wide_s8 (p0, z0, z1), ++ p0 = svcmple_wide (p0, z0, z1)) ++ ++/* ++** cmple_wide_s8_untied: ++** cmple p0\.b, p1/z, z0\.b, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmple_wide_s8_untied, svint8_t, svint64_t, ++ p0 = svcmple_wide_s8 (p1, z0, z1), ++ p0 = svcmple_wide (p1, z0, z1)) ++ ++/* ++** cmple_wide_x0_s8: ++** mov (z[0-9]+\.d), x0 ++** cmple p0\.b, p1/z, z0\.b, \1 ++** ret ++*/ ++TEST_COMPARE_ZX (cmple_wide_x0_s8, svint8_t, int64_t, ++ p0 = svcmple_wide_n_s8 (p1, z0, x0), ++ p0 = svcmple_wide (p1, z0, x0)) ++ ++/* ++** cmple_wide_0_s8: ++** cmple p0\.b, p1/z, z0\.b, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_wide_0_s8, svint8_t, ++ p0 = svcmple_wide_n_s8 (p1, z0, 0), ++ p0 = svcmple_wide (p1, z0, 0)) ++ ++/* ++** cmple_wide_1_s8: ++** cmple p0\.b, p1/z, z0\.b, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_wide_1_s8, svint8_t, ++ p0 = svcmple_wide_n_s8 (p1, z0, 1), ++ p0 = svcmple_wide (p1, z0, 1)) ++ ++/* ++** cmple_wide_15_s8: ++** cmple p0\.b, p1/z, z0\.b, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_wide_15_s8, svint8_t, ++ p0 = svcmple_wide_n_s8 (p1, z0, 15), ++ p0 = svcmple_wide (p1, z0, 15)) ++ ++/* ++** cmple_wide_16_s8: ++** mov (z[0-9]+\.d), #16 ++** cmple p0\.b, p1/z, z0\.b, \1 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_wide_16_s8, svint8_t, ++ p0 = svcmple_wide_n_s8 (p1, z0, 16), ++ p0 = svcmple_wide (p1, z0, 16)) ++ ++/* ++** cmple_wide_m1_s8: ++** cmple p0\.b, p1/z, z0\.b, #-1 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_wide_m1_s8, svint8_t, ++ p0 = svcmple_wide_n_s8 (p1, z0, -1), ++ p0 = svcmple_wide (p1, z0, -1)) ++ ++/* ++** cmple_wide_m16_s8: ++** cmple p0\.b, p1/z, z0\.b, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_wide_m16_s8, svint8_t, ++ p0 = svcmple_wide_n_s8 (p1, z0, -16), ++ p0 = svcmple_wide (p1, z0, -16)) ++ ++/* ++** cmple_wide_m17_s8: ++** mov (z[0-9]+\.d), #-17 ++** cmple p0\.b, p1/z, z0\.b, \1 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_wide_m17_s8, svint8_t, ++ p0 = svcmple_wide_n_s8 (p1, z0, -17), ++ p0 = svcmple_wide (p1, z0, -17)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_wide_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_wide_u16.c +new file mode 100644 +index 000000000..a39a1aad5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_wide_u16.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmple_wide_u16_tied: ++** cmpls p0\.h, p0/z, z0\.h, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmple_wide_u16_tied, svuint16_t, svuint64_t, ++ p0 = svcmple_wide_u16 (p0, z0, z1), ++ p0 = svcmple_wide (p0, z0, z1)) ++ ++/* ++** cmple_wide_u16_untied: ++** cmpls p0\.h, p1/z, z0\.h, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmple_wide_u16_untied, svuint16_t, svuint64_t, ++ p0 = svcmple_wide_u16 (p1, z0, z1), ++ p0 = svcmple_wide (p1, z0, z1)) ++ ++/* ++** cmple_wide_x0_u16: ++** mov (z[0-9]+\.d), x0 ++** cmpls p0\.h, p1/z, z0\.h, \1 ++** ret ++*/ ++TEST_COMPARE_ZX (cmple_wide_x0_u16, svuint16_t, uint64_t, ++ p0 = svcmple_wide_n_u16 (p1, z0, x0), ++ p0 = svcmple_wide (p1, z0, x0)) ++ ++/* ++** cmple_wide_0_u16: ++** cmpls p0\.h, p1/z, z0\.h, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_wide_0_u16, svuint16_t, ++ p0 = svcmple_wide_n_u16 (p1, z0, 0), ++ p0 = svcmple_wide (p1, z0, 0)) ++ ++/* ++** cmple_wide_1_u16: ++** cmpls p0\.h, p1/z, z0\.h, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_wide_1_u16, svuint16_t, ++ p0 = svcmple_wide_n_u16 (p1, z0, 1), ++ p0 = svcmple_wide (p1, z0, 1)) 
++ ++/* ++** cmple_wide_15_u16: ++** cmpls p0\.h, p1/z, z0\.h, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_wide_15_u16, svuint16_t, ++ p0 = svcmple_wide_n_u16 (p1, z0, 15), ++ p0 = svcmple_wide (p1, z0, 15)) ++ ++/* ++** cmple_wide_16_u16: ++** cmpls p0\.h, p1/z, z0\.h, #16 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_wide_16_u16, svuint16_t, ++ p0 = svcmple_wide_n_u16 (p1, z0, 16), ++ p0 = svcmple_wide (p1, z0, 16)) ++ ++/* ++** cmple_wide_127_u16: ++** cmpls p0\.h, p1/z, z0\.h, #127 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_wide_127_u16, svuint16_t, ++ p0 = svcmple_wide_n_u16 (p1, z0, 127), ++ p0 = svcmple_wide (p1, z0, 127)) ++ ++/* ++** cmple_wide_128_u16: ++** mov (z[0-9]+\.d), #128 ++** cmpls p0\.h, p1/z, z0\.h, \1 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_wide_128_u16, svuint16_t, ++ p0 = svcmple_wide_n_u16 (p1, z0, 128), ++ p0 = svcmple_wide (p1, z0, 128)) ++ ++/* ++** cmple_wide_m1_u16: ++** mov (z[0-9]+)\.b, #-1 ++** cmpls p0\.h, p1/z, z0\.h, \1\.d ++** ret ++*/ ++TEST_COMPARE_Z (cmple_wide_m1_u16, svuint16_t, ++ p0 = svcmple_wide_n_u16 (p1, z0, -1), ++ p0 = svcmple_wide (p1, z0, -1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_wide_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_wide_u32.c +new file mode 100644 +index 000000000..fe682c9e8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_wide_u32.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmple_wide_u32_tied: ++** cmpls p0\.s, p0/z, z0\.s, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmple_wide_u32_tied, svuint32_t, svuint64_t, ++ p0 = svcmple_wide_u32 (p0, z0, z1), ++ p0 = svcmple_wide (p0, z0, z1)) ++ ++/* ++** cmple_wide_u32_untied: ++** cmpls p0\.s, p1/z, z0\.s, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmple_wide_u32_untied, svuint32_t, svuint64_t, ++ p0 = svcmple_wide_u32 (p1, z0, z1), ++ p0 = svcmple_wide (p1, z0, z1)) ++ ++/* ++** cmple_wide_x0_u32: ++** mov (z[0-9]+\.d), x0 ++** cmpls p0\.s, p1/z, z0\.s, \1 ++** ret ++*/ ++TEST_COMPARE_ZX (cmple_wide_x0_u32, svuint32_t, uint64_t, ++ p0 = svcmple_wide_n_u32 (p1, z0, x0), ++ p0 = svcmple_wide (p1, z0, x0)) ++ ++/* ++** cmple_wide_0_u32: ++** cmpls p0\.s, p1/z, z0\.s, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_wide_0_u32, svuint32_t, ++ p0 = svcmple_wide_n_u32 (p1, z0, 0), ++ p0 = svcmple_wide (p1, z0, 0)) ++ ++/* ++** cmple_wide_1_u32: ++** cmpls p0\.s, p1/z, z0\.s, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_wide_1_u32, svuint32_t, ++ p0 = svcmple_wide_n_u32 (p1, z0, 1), ++ p0 = svcmple_wide (p1, z0, 1)) ++ ++/* ++** cmple_wide_15_u32: ++** cmpls p0\.s, p1/z, z0\.s, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_wide_15_u32, svuint32_t, ++ p0 = svcmple_wide_n_u32 (p1, z0, 15), ++ p0 = svcmple_wide (p1, z0, 15)) ++ ++/* ++** cmple_wide_16_u32: ++** cmpls p0\.s, p1/z, z0\.s, #16 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_wide_16_u32, svuint32_t, ++ p0 = svcmple_wide_n_u32 (p1, z0, 16), ++ p0 = svcmple_wide (p1, z0, 16)) ++ ++/* ++** cmple_wide_127_u32: ++** cmpls p0\.s, p1/z, z0\.s, #127 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_wide_127_u32, svuint32_t, ++ p0 = svcmple_wide_n_u32 (p1, z0, 127), ++ p0 = svcmple_wide (p1, z0, 127)) ++ ++/* ++** cmple_wide_128_u32: ++** mov (z[0-9]+\.d), #128 ++** cmpls p0\.s, p1/z, z0\.s, \1 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_wide_128_u32, svuint32_t, ++ p0 = svcmple_wide_n_u32 (p1, z0, 128), ++ p0 = svcmple_wide (p1, z0, 128)) ++ ++/* ++** cmple_wide_m1_u32: ++** mov (z[0-9]+)\.b, #-1 ++** cmpls p0\.s, p1/z, z0\.s, \1\.d ++** 
ret ++*/ ++TEST_COMPARE_Z (cmple_wide_m1_u32, svuint32_t, ++ p0 = svcmple_wide_n_u32 (p1, z0, -1), ++ p0 = svcmple_wide (p1, z0, -1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_wide_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_wide_u8.c +new file mode 100644 +index 000000000..893dfa627 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_wide_u8.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmple_wide_u8_tied: ++** cmpls p0\.b, p0/z, z0\.b, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmple_wide_u8_tied, svuint8_t, svuint64_t, ++ p0 = svcmple_wide_u8 (p0, z0, z1), ++ p0 = svcmple_wide (p0, z0, z1)) ++ ++/* ++** cmple_wide_u8_untied: ++** cmpls p0\.b, p1/z, z0\.b, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmple_wide_u8_untied, svuint8_t, svuint64_t, ++ p0 = svcmple_wide_u8 (p1, z0, z1), ++ p0 = svcmple_wide (p1, z0, z1)) ++ ++/* ++** cmple_wide_x0_u8: ++** mov (z[0-9]+\.d), x0 ++** cmpls p0\.b, p1/z, z0\.b, \1 ++** ret ++*/ ++TEST_COMPARE_ZX (cmple_wide_x0_u8, svuint8_t, uint64_t, ++ p0 = svcmple_wide_n_u8 (p1, z0, x0), ++ p0 = svcmple_wide (p1, z0, x0)) ++ ++/* ++** cmple_wide_0_u8: ++** cmpls p0\.b, p1/z, z0\.b, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_wide_0_u8, svuint8_t, ++ p0 = svcmple_wide_n_u8 (p1, z0, 0), ++ p0 = svcmple_wide (p1, z0, 0)) ++ ++/* ++** cmple_wide_1_u8: ++** cmpls p0\.b, p1/z, z0\.b, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_wide_1_u8, svuint8_t, ++ p0 = svcmple_wide_n_u8 (p1, z0, 1), ++ p0 = svcmple_wide (p1, z0, 1)) ++ ++/* ++** cmple_wide_15_u8: ++** cmpls p0\.b, p1/z, z0\.b, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_wide_15_u8, svuint8_t, ++ p0 = svcmple_wide_n_u8 (p1, z0, 15), ++ p0 = svcmple_wide (p1, z0, 15)) ++ ++/* ++** cmple_wide_16_u8: ++** cmpls p0\.b, p1/z, z0\.b, #16 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_wide_16_u8, svuint8_t, ++ p0 = svcmple_wide_n_u8 (p1, z0, 16), ++ p0 = svcmple_wide (p1, z0, 16)) ++ ++/* ++** cmple_wide_127_u8: ++** cmpls p0\.b, p1/z, z0\.b, #127 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_wide_127_u8, svuint8_t, ++ p0 = svcmple_wide_n_u8 (p1, z0, 127), ++ p0 = svcmple_wide (p1, z0, 127)) ++ ++/* ++** cmple_wide_128_u8: ++** mov (z[0-9]+\.d), #128 ++** cmpls p0\.b, p1/z, z0\.b, \1 ++** ret ++*/ ++TEST_COMPARE_Z (cmple_wide_128_u8, svuint8_t, ++ p0 = svcmple_wide_n_u8 (p1, z0, 128), ++ p0 = svcmple_wide (p1, z0, 128)) ++ ++/* ++** cmple_wide_m1_u8: ++** mov (z[0-9]+)\.b, #-1 ++** cmpls p0\.b, p1/z, z0\.b, \1\.d ++** ret ++*/ ++TEST_COMPARE_Z (cmple_wide_m1_u8, svuint8_t, ++ p0 = svcmple_wide_n_u8 (p1, z0, -1), ++ p0 = svcmple_wide (p1, z0, -1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_f16.c +new file mode 100644 +index 000000000..598f673a8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_f16.c +@@ -0,0 +1,66 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmplt_f16_tied: ++** ( ++** fcmgt p0\.h, p0/z, z1\.h, z0\.h ++** | ++** fcmlt p0\.h, p0/z, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_f16_tied, svfloat16_t, ++ p0 = svcmplt_f16 (p0, z0, z1), ++ p0 = svcmplt (p0, z0, z1)) ++ ++/* ++** cmplt_f16_untied: ++** ( ++** fcmgt p0\.h, p1/z, z1\.h, z0\.h ++** | ++** fcmlt p0\.h, p1/z, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_f16_untied, svfloat16_t, ++ p0 = svcmplt_f16 (p1, z0, z1), ++ p0 = svcmplt 
(p1, z0, z1)) ++ ++/* ++** cmplt_h4_f16: ++** mov (z[0-9]+\.h), h4 ++** ( ++** fcmgt p0\.h, p1/z, \1, z0\.h ++** | ++** fcmlt p0\.h, p1/z, z0\.h, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_ZD (cmplt_h4_f16, svfloat16_t, float16_t, ++ p0 = svcmplt_n_f16 (p1, z0, d4), ++ p0 = svcmplt (p1, z0, d4)) ++ ++/* ++** cmplt_0_f16: ++** fcmlt p0\.h, p1/z, z0\.h, #0\.0 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_0_f16, svfloat16_t, ++ p0 = svcmplt_n_f16 (p1, z0, 0), ++ p0 = svcmplt (p1, z0, 0)) ++ ++/* ++** cmplt_1_f16: ++** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? ++** ( ++** fcmgt p0\.h, p1/z, \1, z0\.h ++** | ++** fcmlt p0\.h, p1/z, z0\.h, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_1_f16, svfloat16_t, ++ p0 = svcmplt_n_f16 (p1, z0, 1), ++ p0 = svcmplt (p1, z0, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_f32.c +new file mode 100644 +index 000000000..f9dea3665 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_f32.c +@@ -0,0 +1,66 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmplt_f32_tied: ++** ( ++** fcmgt p0\.s, p0/z, z1\.s, z0\.s ++** | ++** fcmlt p0\.s, p0/z, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_f32_tied, svfloat32_t, ++ p0 = svcmplt_f32 (p0, z0, z1), ++ p0 = svcmplt (p0, z0, z1)) ++ ++/* ++** cmplt_f32_untied: ++** ( ++** fcmgt p0\.s, p1/z, z1\.s, z0\.s ++** | ++** fcmlt p0\.s, p1/z, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_f32_untied, svfloat32_t, ++ p0 = svcmplt_f32 (p1, z0, z1), ++ p0 = svcmplt (p1, z0, z1)) ++ ++/* ++** cmplt_s4_f32: ++** mov (z[0-9]+\.s), s4 ++** ( ++** fcmgt p0\.s, p1/z, \1, z0\.s ++** | ++** fcmlt p0\.s, p1/z, z0\.s, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_ZD (cmplt_s4_f32, svfloat32_t, float32_t, ++ p0 = svcmplt_n_f32 (p1, z0, d4), ++ p0 = svcmplt (p1, z0, d4)) ++ ++/* ++** cmplt_0_f32: ++** fcmlt p0\.s, p1/z, z0\.s, #0\.0 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_0_f32, svfloat32_t, ++ p0 = svcmplt_n_f32 (p1, z0, 0), ++ p0 = svcmplt (p1, z0, 0)) ++ ++/* ++** cmplt_1_f32: ++** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? 
++** ( ++** fcmgt p0\.s, p1/z, \1, z0\.s ++** | ++** fcmlt p0\.s, p1/z, z0\.s, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_1_f32, svfloat32_t, ++ p0 = svcmplt_n_f32 (p1, z0, 1), ++ p0 = svcmplt (p1, z0, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_f64.c +new file mode 100644 +index 000000000..6f251db4f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_f64.c +@@ -0,0 +1,66 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmplt_f64_tied: ++** ( ++** fcmgt p0\.d, p0/z, z1\.d, z0\.d ++** | ++** fcmlt p0\.d, p0/z, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_f64_tied, svfloat64_t, ++ p0 = svcmplt_f64 (p0, z0, z1), ++ p0 = svcmplt (p0, z0, z1)) ++ ++/* ++** cmplt_f64_untied: ++** ( ++** fcmgt p0\.d, p1/z, z1\.d, z0\.d ++** | ++** fcmlt p0\.d, p1/z, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_f64_untied, svfloat64_t, ++ p0 = svcmplt_f64 (p1, z0, z1), ++ p0 = svcmplt (p1, z0, z1)) ++ ++/* ++** cmplt_d4_f64: ++** mov (z[0-9]+\.d), d4 ++** ( ++** fcmgt p0\.d, p1/z, \1, z0\.d ++** | ++** fcmlt p0\.d, p1/z, z0\.d, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_ZD (cmplt_d4_f64, svfloat64_t, float64_t, ++ p0 = svcmplt_n_f64 (p1, z0, d4), ++ p0 = svcmplt (p1, z0, d4)) ++ ++/* ++** cmplt_0_f64: ++** fcmlt p0\.d, p1/z, z0\.d, #0\.0 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_0_f64, svfloat64_t, ++ p0 = svcmplt_n_f64 (p1, z0, 0), ++ p0 = svcmplt (p1, z0, 0)) ++ ++/* ++** cmplt_1_f64: ++** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? ++** ( ++** fcmgt p0\.d, p1/z, \1, z0\.d ++** | ++** fcmlt p0\.d, p1/z, z0\.d, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_1_f64, svfloat64_t, ++ p0 = svcmplt_n_f64 (p1, z0, 1), ++ p0 = svcmplt (p1, z0, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_s16.c +new file mode 100644 +index 000000000..1e2bf9dde +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_s16.c +@@ -0,0 +1,116 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmplt_s16_tied: ++** ( ++** cmpgt p0\.h, p0/z, z1\.h, z0\.h ++** | ++** cmplt p0\.h, p0/z, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_s16_tied, svint16_t, ++ p0 = svcmplt_s16 (p0, z0, z1), ++ p0 = svcmplt (p0, z0, z1)) ++ ++/* ++** cmplt_s16_untied: ++** ( ++** cmpgt p0\.h, p1/z, z1\.h, z0\.h ++** | ++** cmplt p0\.h, p1/z, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_s16_untied, svint16_t, ++ p0 = svcmplt_s16 (p1, z0, z1), ++ p0 = svcmplt (p1, z0, z1)) ++ ++/* ++** cmplt_w0_s16: ++** mov (z[0-9]+\.h), w0 ++** ( ++** cmpgt p0\.h, p1/z, \1, z0\.h ++** | ++** cmplt p0\.h, p1/z, z0\.h, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_ZX (cmplt_w0_s16, svint16_t, int16_t, ++ p0 = svcmplt_n_s16 (p1, z0, x0), ++ p0 = svcmplt (p1, z0, x0)) ++ ++/* ++** cmplt_0_s16: ++** cmplt p0\.h, p1/z, z0\.h, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_0_s16, svint16_t, ++ p0 = svcmplt_n_s16 (p1, z0, 0), ++ p0 = svcmplt (p1, z0, 0)) ++ ++/* ++** cmplt_1_s16: ++** cmplt p0\.h, p1/z, z0\.h, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_1_s16, svint16_t, ++ p0 = svcmplt_n_s16 (p1, z0, 1), ++ p0 = svcmplt (p1, z0, 1)) ++ ++/* ++** cmplt_15_s16: ++** cmplt p0\.h, p1/z, z0\.h, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_15_s16, svint16_t, ++ p0 = svcmplt_n_s16 (p1, z0, 15), ++ p0 = svcmplt (p1, z0, 15)) ++ ++/* ++** 
cmplt_16_s16: ++** mov (z[0-9]+\.h), #16 ++** ( ++** cmpgt p0\.h, p1/z, \1, z0\.h ++** | ++** cmplt p0\.h, p1/z, z0\.h, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_16_s16, svint16_t, ++ p0 = svcmplt_n_s16 (p1, z0, 16), ++ p0 = svcmplt (p1, z0, 16)) ++ ++/* ++** cmplt_m1_s16: ++** cmplt p0\.h, p1/z, z0\.h, #-1 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_m1_s16, svint16_t, ++ p0 = svcmplt_n_s16 (p1, z0, -1), ++ p0 = svcmplt (p1, z0, -1)) ++ ++/* ++** cmplt_m16_s16: ++** cmplt p0\.h, p1/z, z0\.h, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_m16_s16, svint16_t, ++ p0 = svcmplt_n_s16 (p1, z0, -16), ++ p0 = svcmplt (p1, z0, -16)) ++ ++/* ++** cmplt_m17_s16: ++** mov (z[0-9]+\.h), #-17 ++** ( ++** cmpgt p0\.h, p1/z, \1, z0\.h ++** | ++** cmplt p0\.h, p1/z, z0\.h, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_m17_s16, svint16_t, ++ p0 = svcmplt_n_s16 (p1, z0, -17), ++ p0 = svcmplt (p1, z0, -17)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_s32.c +new file mode 100644 +index 000000000..8e2c02c4d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_s32.c +@@ -0,0 +1,116 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmplt_s32_tied: ++** ( ++** cmpgt p0\.s, p0/z, z1\.s, z0\.s ++** | ++** cmplt p0\.s, p0/z, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_s32_tied, svint32_t, ++ p0 = svcmplt_s32 (p0, z0, z1), ++ p0 = svcmplt (p0, z0, z1)) ++ ++/* ++** cmplt_s32_untied: ++** ( ++** cmpgt p0\.s, p1/z, z1\.s, z0\.s ++** | ++** cmplt p0\.s, p1/z, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_s32_untied, svint32_t, ++ p0 = svcmplt_s32 (p1, z0, z1), ++ p0 = svcmplt (p1, z0, z1)) ++ ++/* ++** cmplt_w0_s32: ++** mov (z[0-9]+\.s), w0 ++** ( ++** cmpgt p0\.s, p1/z, \1, z0\.s ++** | ++** cmplt p0\.s, p1/z, z0\.s, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_ZX (cmplt_w0_s32, svint32_t, int32_t, ++ p0 = svcmplt_n_s32 (p1, z0, x0), ++ p0 = svcmplt (p1, z0, x0)) ++ ++/* ++** cmplt_0_s32: ++** cmplt p0\.s, p1/z, z0\.s, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_0_s32, svint32_t, ++ p0 = svcmplt_n_s32 (p1, z0, 0), ++ p0 = svcmplt (p1, z0, 0)) ++ ++/* ++** cmplt_1_s32: ++** cmplt p0\.s, p1/z, z0\.s, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_1_s32, svint32_t, ++ p0 = svcmplt_n_s32 (p1, z0, 1), ++ p0 = svcmplt (p1, z0, 1)) ++ ++/* ++** cmplt_15_s32: ++** cmplt p0\.s, p1/z, z0\.s, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_15_s32, svint32_t, ++ p0 = svcmplt_n_s32 (p1, z0, 15), ++ p0 = svcmplt (p1, z0, 15)) ++ ++/* ++** cmplt_16_s32: ++** mov (z[0-9]+\.s), #16 ++** ( ++** cmpgt p0\.s, p1/z, \1, z0\.s ++** | ++** cmplt p0\.s, p1/z, z0\.s, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_16_s32, svint32_t, ++ p0 = svcmplt_n_s32 (p1, z0, 16), ++ p0 = svcmplt (p1, z0, 16)) ++ ++/* ++** cmplt_m1_s32: ++** cmplt p0\.s, p1/z, z0\.s, #-1 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_m1_s32, svint32_t, ++ p0 = svcmplt_n_s32 (p1, z0, -1), ++ p0 = svcmplt (p1, z0, -1)) ++ ++/* ++** cmplt_m16_s32: ++** cmplt p0\.s, p1/z, z0\.s, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_m16_s32, svint32_t, ++ p0 = svcmplt_n_s32 (p1, z0, -16), ++ p0 = svcmplt (p1, z0, -16)) ++ ++/* ++** cmplt_m17_s32: ++** mov (z[0-9]+\.s), #-17 ++** ( ++** cmpgt p0\.s, p1/z, \1, z0\.s ++** | ++** cmplt p0\.s, p1/z, z0\.s, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_m17_s32, svint32_t, ++ p0 = svcmplt_n_s32 (p1, z0, -17), ++ p0 = svcmplt (p1, z0, -17)) +diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_s64.c +new file mode 100644 +index 000000000..818c9fba9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_s64.c +@@ -0,0 +1,116 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmplt_s64_tied: ++** ( ++** cmpgt p0\.d, p0/z, z1\.d, z0\.d ++** | ++** cmplt p0\.d, p0/z, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_s64_tied, svint64_t, ++ p0 = svcmplt_s64 (p0, z0, z1), ++ p0 = svcmplt (p0, z0, z1)) ++ ++/* ++** cmplt_s64_untied: ++** ( ++** cmpgt p0\.d, p1/z, z1\.d, z0\.d ++** | ++** cmplt p0\.d, p1/z, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_s64_untied, svint64_t, ++ p0 = svcmplt_s64 (p1, z0, z1), ++ p0 = svcmplt (p1, z0, z1)) ++ ++/* ++** cmplt_x0_s64: ++** mov (z[0-9]+\.d), x0 ++** ( ++** cmpgt p0\.d, p1/z, \1, z0\.d ++** | ++** cmplt p0\.d, p1/z, z0\.d, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_ZX (cmplt_x0_s64, svint64_t, int64_t, ++ p0 = svcmplt_n_s64 (p1, z0, x0), ++ p0 = svcmplt (p1, z0, x0)) ++ ++/* ++** cmplt_0_s64: ++** cmplt p0\.d, p1/z, z0\.d, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_0_s64, svint64_t, ++ p0 = svcmplt_n_s64 (p1, z0, 0), ++ p0 = svcmplt (p1, z0, 0)) ++ ++/* ++** cmplt_1_s64: ++** cmplt p0\.d, p1/z, z0\.d, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_1_s64, svint64_t, ++ p0 = svcmplt_n_s64 (p1, z0, 1), ++ p0 = svcmplt (p1, z0, 1)) ++ ++/* ++** cmplt_15_s64: ++** cmplt p0\.d, p1/z, z0\.d, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_15_s64, svint64_t, ++ p0 = svcmplt_n_s64 (p1, z0, 15), ++ p0 = svcmplt (p1, z0, 15)) ++ ++/* ++** cmplt_16_s64: ++** mov (z[0-9]+\.d), #16 ++** ( ++** cmpgt p0\.d, p1/z, \1, z0\.d ++** | ++** cmplt p0\.d, p1/z, z0\.d, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_16_s64, svint64_t, ++ p0 = svcmplt_n_s64 (p1, z0, 16), ++ p0 = svcmplt (p1, z0, 16)) ++ ++/* ++** cmplt_m1_s64: ++** cmplt p0\.d, p1/z, z0\.d, #-1 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_m1_s64, svint64_t, ++ p0 = svcmplt_n_s64 (p1, z0, -1), ++ p0 = svcmplt (p1, z0, -1)) ++ ++/* ++** cmplt_m16_s64: ++** cmplt p0\.d, p1/z, z0\.d, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_m16_s64, svint64_t, ++ p0 = svcmplt_n_s64 (p1, z0, -16), ++ p0 = svcmplt (p1, z0, -16)) ++ ++/* ++** cmplt_m17_s64: ++** mov (z[0-9]+\.d), #-17 ++** ( ++** cmpgt p0\.d, p1/z, \1, z0\.d ++** | ++** cmplt p0\.d, p1/z, z0\.d, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_m17_s64, svint64_t, ++ p0 = svcmplt_n_s64 (p1, z0, -17), ++ p0 = svcmplt (p1, z0, -17)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_s8.c +new file mode 100644 +index 000000000..54b8dc408 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_s8.c +@@ -0,0 +1,116 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmplt_s8_tied: ++** ( ++** cmpgt p0\.b, p0/z, z1\.b, z0\.b ++** | ++** cmplt p0\.b, p0/z, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_s8_tied, svint8_t, ++ p0 = svcmplt_s8 (p0, z0, z1), ++ p0 = svcmplt (p0, z0, z1)) ++ ++/* ++** cmplt_s8_untied: ++** ( ++** cmpgt p0\.b, p1/z, z1\.b, z0\.b ++** | ++** cmplt p0\.b, p1/z, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_s8_untied, svint8_t, ++ p0 = svcmplt_s8 (p1, z0, z1), ++ p0 = svcmplt (p1, z0, z1)) ++ ++/* ++** cmplt_w0_s8: ++** mov (z[0-9]+\.b), w0 ++** ( ++** cmpgt p0\.b, p1/z, 
\1, z0\.b ++** | ++** cmplt p0\.b, p1/z, z0\.b, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_ZX (cmplt_w0_s8, svint8_t, int8_t, ++ p0 = svcmplt_n_s8 (p1, z0, x0), ++ p0 = svcmplt (p1, z0, x0)) ++ ++/* ++** cmplt_0_s8: ++** cmplt p0\.b, p1/z, z0\.b, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_0_s8, svint8_t, ++ p0 = svcmplt_n_s8 (p1, z0, 0), ++ p0 = svcmplt (p1, z0, 0)) ++ ++/* ++** cmplt_1_s8: ++** cmplt p0\.b, p1/z, z0\.b, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_1_s8, svint8_t, ++ p0 = svcmplt_n_s8 (p1, z0, 1), ++ p0 = svcmplt (p1, z0, 1)) ++ ++/* ++** cmplt_15_s8: ++** cmplt p0\.b, p1/z, z0\.b, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_15_s8, svint8_t, ++ p0 = svcmplt_n_s8 (p1, z0, 15), ++ p0 = svcmplt (p1, z0, 15)) ++ ++/* ++** cmplt_16_s8: ++** mov (z[0-9]+\.b), #16 ++** ( ++** cmpgt p0\.b, p1/z, \1, z0\.b ++** | ++** cmplt p0\.b, p1/z, z0\.b, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_16_s8, svint8_t, ++ p0 = svcmplt_n_s8 (p1, z0, 16), ++ p0 = svcmplt (p1, z0, 16)) ++ ++/* ++** cmplt_m1_s8: ++** cmplt p0\.b, p1/z, z0\.b, #-1 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_m1_s8, svint8_t, ++ p0 = svcmplt_n_s8 (p1, z0, -1), ++ p0 = svcmplt (p1, z0, -1)) ++ ++/* ++** cmplt_m16_s8: ++** cmplt p0\.b, p1/z, z0\.b, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_m16_s8, svint8_t, ++ p0 = svcmplt_n_s8 (p1, z0, -16), ++ p0 = svcmplt (p1, z0, -16)) ++ ++/* ++** cmplt_m17_s8: ++** mov (z[0-9]+\.b), #-17 ++** ( ++** cmpgt p0\.b, p1/z, \1, z0\.b ++** | ++** cmplt p0\.b, p1/z, z0\.b, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_m17_s8, svint8_t, ++ p0 = svcmplt_n_s8 (p1, z0, -17), ++ p0 = svcmplt (p1, z0, -17)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_u16.c +new file mode 100644 +index 000000000..c0f2a0550 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_u16.c +@@ -0,0 +1,116 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmplt_u16_tied: ++** ( ++** cmphi p0\.h, p0/z, z1\.h, z0\.h ++** | ++** cmplo p0\.h, p0/z, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_u16_tied, svuint16_t, ++ p0 = svcmplt_u16 (p0, z0, z1), ++ p0 = svcmplt (p0, z0, z1)) ++ ++/* ++** cmplt_u16_untied: ++** ( ++** cmphi p0\.h, p1/z, z1\.h, z0\.h ++** | ++** cmplo p0\.h, p1/z, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_u16_untied, svuint16_t, ++ p0 = svcmplt_u16 (p1, z0, z1), ++ p0 = svcmplt (p1, z0, z1)) ++ ++/* ++** cmplt_w0_u16: ++** mov (z[0-9]+\.h), w0 ++** ( ++** cmphi p0\.h, p1/z, \1, z0\.h ++** | ++** cmplo p0\.h, p1/z, z0\.h, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_ZX (cmplt_w0_u16, svuint16_t, uint16_t, ++ p0 = svcmplt_n_u16 (p1, z0, x0), ++ p0 = svcmplt (p1, z0, x0)) ++ ++/* ++** cmplt_0_u16: ++** cmplo p0\.h, p1/z, z0\.h, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_0_u16, svuint16_t, ++ p0 = svcmplt_n_u16 (p1, z0, 0), ++ p0 = svcmplt (p1, z0, 0)) ++ ++/* ++** cmplt_1_u16: ++** cmplo p0\.h, p1/z, z0\.h, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_1_u16, svuint16_t, ++ p0 = svcmplt_n_u16 (p1, z0, 1), ++ p0 = svcmplt (p1, z0, 1)) ++ ++/* ++** cmplt_15_u16: ++** cmplo p0\.h, p1/z, z0\.h, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_15_u16, svuint16_t, ++ p0 = svcmplt_n_u16 (p1, z0, 15), ++ p0 = svcmplt (p1, z0, 15)) ++ ++/* ++** cmplt_16_u16: ++** cmplo p0\.h, p1/z, z0\.h, #16 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_16_u16, svuint16_t, ++ p0 = svcmplt_n_u16 (p1, z0, 16), ++ p0 = svcmplt (p1, z0, 16)) ++ ++/* ++** cmplt_127_u16: ++** cmplo p0\.h, 
p1/z, z0\.h, #127 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_127_u16, svuint16_t, ++ p0 = svcmplt_n_u16 (p1, z0, 127), ++ p0 = svcmplt (p1, z0, 127)) ++ ++/* ++** cmplt_128_u16: ++** mov (z[0-9]+\.h), #128 ++** ( ++** cmphi p0\.h, p1/z, \1, z0\.h ++** | ++** cmplo p0\.h, p1/z, z0\.h, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_128_u16, svuint16_t, ++ p0 = svcmplt_n_u16 (p1, z0, 128), ++ p0 = svcmplt (p1, z0, 128)) ++ ++/* ++** cmplt_m1_u16: ++** mov (z[0-9]+)\.b, #-1 ++** ( ++** cmphi p0\.h, p1/z, \1\.h, z0\.h ++** | ++** cmplo p0\.h, p1/z, z0\.h, \1\.h ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_m1_u16, svuint16_t, ++ p0 = svcmplt_n_u16 (p1, z0, -1), ++ p0 = svcmplt (p1, z0, -1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_u32.c +new file mode 100644 +index 000000000..3bb0b1464 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_u32.c +@@ -0,0 +1,116 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmplt_u32_tied: ++** ( ++** cmphi p0\.s, p0/z, z1\.s, z0\.s ++** | ++** cmplo p0\.s, p0/z, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_u32_tied, svuint32_t, ++ p0 = svcmplt_u32 (p0, z0, z1), ++ p0 = svcmplt (p0, z0, z1)) ++ ++/* ++** cmplt_u32_untied: ++** ( ++** cmphi p0\.s, p1/z, z1\.s, z0\.s ++** | ++** cmplo p0\.s, p1/z, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_u32_untied, svuint32_t, ++ p0 = svcmplt_u32 (p1, z0, z1), ++ p0 = svcmplt (p1, z0, z1)) ++ ++/* ++** cmplt_w0_u32: ++** mov (z[0-9]+\.s), w0 ++** ( ++** cmphi p0\.s, p1/z, \1, z0\.s ++** | ++** cmplo p0\.s, p1/z, z0\.s, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_ZX (cmplt_w0_u32, svuint32_t, uint32_t, ++ p0 = svcmplt_n_u32 (p1, z0, x0), ++ p0 = svcmplt (p1, z0, x0)) ++ ++/* ++** cmplt_0_u32: ++** cmplo p0\.s, p1/z, z0\.s, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_0_u32, svuint32_t, ++ p0 = svcmplt_n_u32 (p1, z0, 0), ++ p0 = svcmplt (p1, z0, 0)) ++ ++/* ++** cmplt_1_u32: ++** cmplo p0\.s, p1/z, z0\.s, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_1_u32, svuint32_t, ++ p0 = svcmplt_n_u32 (p1, z0, 1), ++ p0 = svcmplt (p1, z0, 1)) ++ ++/* ++** cmplt_15_u32: ++** cmplo p0\.s, p1/z, z0\.s, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_15_u32, svuint32_t, ++ p0 = svcmplt_n_u32 (p1, z0, 15), ++ p0 = svcmplt (p1, z0, 15)) ++ ++/* ++** cmplt_16_u32: ++** cmplo p0\.s, p1/z, z0\.s, #16 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_16_u32, svuint32_t, ++ p0 = svcmplt_n_u32 (p1, z0, 16), ++ p0 = svcmplt (p1, z0, 16)) ++ ++/* ++** cmplt_127_u32: ++** cmplo p0\.s, p1/z, z0\.s, #127 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_127_u32, svuint32_t, ++ p0 = svcmplt_n_u32 (p1, z0, 127), ++ p0 = svcmplt (p1, z0, 127)) ++ ++/* ++** cmplt_128_u32: ++** mov (z[0-9]+\.s), #128 ++** ( ++** cmphi p0\.s, p1/z, \1, z0\.s ++** | ++** cmplo p0\.s, p1/z, z0\.s, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_128_u32, svuint32_t, ++ p0 = svcmplt_n_u32 (p1, z0, 128), ++ p0 = svcmplt (p1, z0, 128)) ++ ++/* ++** cmplt_m1_u32: ++** mov (z[0-9]+)\.b, #-1 ++** ( ++** cmphi p0\.s, p1/z, \1\.s, z0\.s ++** | ++** cmplo p0\.s, p1/z, z0\.s, \1\.s ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_m1_u32, svuint32_t, ++ p0 = svcmplt_n_u32 (p1, z0, -1), ++ p0 = svcmplt (p1, z0, -1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_u64.c +new file mode 100644 +index 000000000..d9de5add2 +--- /dev/null ++++ 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_u64.c +@@ -0,0 +1,116 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmplt_u64_tied: ++** ( ++** cmphi p0\.d, p0/z, z1\.d, z0\.d ++** | ++** cmplo p0\.d, p0/z, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_u64_tied, svuint64_t, ++ p0 = svcmplt_u64 (p0, z0, z1), ++ p0 = svcmplt (p0, z0, z1)) ++ ++/* ++** cmplt_u64_untied: ++** ( ++** cmphi p0\.d, p1/z, z1\.d, z0\.d ++** | ++** cmplo p0\.d, p1/z, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_u64_untied, svuint64_t, ++ p0 = svcmplt_u64 (p1, z0, z1), ++ p0 = svcmplt (p1, z0, z1)) ++ ++/* ++** cmplt_x0_u64: ++** mov (z[0-9]+\.d), x0 ++** ( ++** cmphi p0\.d, p1/z, \1, z0\.d ++** | ++** cmplo p0\.d, p1/z, z0\.d, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_ZX (cmplt_x0_u64, svuint64_t, uint64_t, ++ p0 = svcmplt_n_u64 (p1, z0, x0), ++ p0 = svcmplt (p1, z0, x0)) ++ ++/* ++** cmplt_0_u64: ++** cmplo p0\.d, p1/z, z0\.d, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_0_u64, svuint64_t, ++ p0 = svcmplt_n_u64 (p1, z0, 0), ++ p0 = svcmplt (p1, z0, 0)) ++ ++/* ++** cmplt_1_u64: ++** cmplo p0\.d, p1/z, z0\.d, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_1_u64, svuint64_t, ++ p0 = svcmplt_n_u64 (p1, z0, 1), ++ p0 = svcmplt (p1, z0, 1)) ++ ++/* ++** cmplt_15_u64: ++** cmplo p0\.d, p1/z, z0\.d, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_15_u64, svuint64_t, ++ p0 = svcmplt_n_u64 (p1, z0, 15), ++ p0 = svcmplt (p1, z0, 15)) ++ ++/* ++** cmplt_16_u64: ++** cmplo p0\.d, p1/z, z0\.d, #16 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_16_u64, svuint64_t, ++ p0 = svcmplt_n_u64 (p1, z0, 16), ++ p0 = svcmplt (p1, z0, 16)) ++ ++/* ++** cmplt_127_u64: ++** cmplo p0\.d, p1/z, z0\.d, #127 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_127_u64, svuint64_t, ++ p0 = svcmplt_n_u64 (p1, z0, 127), ++ p0 = svcmplt (p1, z0, 127)) ++ ++/* ++** cmplt_128_u64: ++** mov (z[0-9]+\.d), #128 ++** ( ++** cmphi p0\.d, p1/z, \1, z0\.d ++** | ++** cmplo p0\.d, p1/z, z0\.d, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_128_u64, svuint64_t, ++ p0 = svcmplt_n_u64 (p1, z0, 128), ++ p0 = svcmplt (p1, z0, 128)) ++ ++/* ++** cmplt_m1_u64: ++** mov (z[0-9]+)\.b, #-1 ++** ( ++** cmphi p0\.d, p1/z, \1\.d, z0\.d ++** | ++** cmplo p0\.d, p1/z, z0\.d, \1\.d ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_m1_u64, svuint64_t, ++ p0 = svcmplt_n_u64 (p1, z0, -1), ++ p0 = svcmplt (p1, z0, -1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_u8.c +new file mode 100644 +index 000000000..42d5ad868 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_u8.c +@@ -0,0 +1,116 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmplt_u8_tied: ++** ( ++** cmphi p0\.b, p0/z, z1\.b, z0\.b ++** | ++** cmplo p0\.b, p0/z, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_u8_tied, svuint8_t, ++ p0 = svcmplt_u8 (p0, z0, z1), ++ p0 = svcmplt (p0, z0, z1)) ++ ++/* ++** cmplt_u8_untied: ++** ( ++** cmphi p0\.b, p1/z, z1\.b, z0\.b ++** | ++** cmplo p0\.b, p1/z, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_u8_untied, svuint8_t, ++ p0 = svcmplt_u8 (p1, z0, z1), ++ p0 = svcmplt (p1, z0, z1)) ++ ++/* ++** cmplt_w0_u8: ++** mov (z[0-9]+\.b), w0 ++** ( ++** cmphi p0\.b, p1/z, \1, z0\.b ++** | ++** cmplo p0\.b, p1/z, z0\.b, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_ZX (cmplt_w0_u8, svuint8_t, uint8_t, ++ p0 = svcmplt_n_u8 (p1, z0, x0), ++ p0 = 
svcmplt (p1, z0, x0)) ++ ++/* ++** cmplt_0_u8: ++** cmplo p0\.b, p1/z, z0\.b, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_0_u8, svuint8_t, ++ p0 = svcmplt_n_u8 (p1, z0, 0), ++ p0 = svcmplt (p1, z0, 0)) ++ ++/* ++** cmplt_1_u8: ++** cmplo p0\.b, p1/z, z0\.b, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_1_u8, svuint8_t, ++ p0 = svcmplt_n_u8 (p1, z0, 1), ++ p0 = svcmplt (p1, z0, 1)) ++ ++/* ++** cmplt_15_u8: ++** cmplo p0\.b, p1/z, z0\.b, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_15_u8, svuint8_t, ++ p0 = svcmplt_n_u8 (p1, z0, 15), ++ p0 = svcmplt (p1, z0, 15)) ++ ++/* ++** cmplt_16_u8: ++** cmplo p0\.b, p1/z, z0\.b, #16 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_16_u8, svuint8_t, ++ p0 = svcmplt_n_u8 (p1, z0, 16), ++ p0 = svcmplt (p1, z0, 16)) ++ ++/* ++** cmplt_127_u8: ++** cmplo p0\.b, p1/z, z0\.b, #127 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_127_u8, svuint8_t, ++ p0 = svcmplt_n_u8 (p1, z0, 127), ++ p0 = svcmplt (p1, z0, 127)) ++ ++/* ++** cmplt_128_u8: ++** mov (z[0-9]+\.b), #-128 ++** ( ++** cmphi p0\.b, p1/z, \1, z0\.b ++** | ++** cmplo p0\.b, p1/z, z0\.b, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_128_u8, svuint8_t, ++ p0 = svcmplt_n_u8 (p1, z0, 128), ++ p0 = svcmplt (p1, z0, 128)) ++ ++/* ++** cmplt_m1_u8: ++** mov (z[0-9]+\.b), #-1 ++** ( ++** cmphi p0\.b, p1/z, \1, z0\.b ++** | ++** cmplo p0\.b, p1/z, z0\.b, \1 ++** ) ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_m1_u8, svuint8_t, ++ p0 = svcmplt_n_u8 (p1, z0, -1), ++ p0 = svcmplt (p1, z0, -1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_wide_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_wide_s16.c +new file mode 100644 +index 000000000..a3c8942ba +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_wide_s16.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmplt_wide_s16_tied: ++** cmplt p0\.h, p0/z, z0\.h, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmplt_wide_s16_tied, svint16_t, svint64_t, ++ p0 = svcmplt_wide_s16 (p0, z0, z1), ++ p0 = svcmplt_wide (p0, z0, z1)) ++ ++/* ++** cmplt_wide_s16_untied: ++** cmplt p0\.h, p1/z, z0\.h, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmplt_wide_s16_untied, svint16_t, svint64_t, ++ p0 = svcmplt_wide_s16 (p1, z0, z1), ++ p0 = svcmplt_wide (p1, z0, z1)) ++ ++/* ++** cmplt_wide_x0_s16: ++** mov (z[0-9]+\.d), x0 ++** cmplt p0\.h, p1/z, z0\.h, \1 ++** ret ++*/ ++TEST_COMPARE_ZX (cmplt_wide_x0_s16, svint16_t, int64_t, ++ p0 = svcmplt_wide_n_s16 (p1, z0, x0), ++ p0 = svcmplt_wide (p1, z0, x0)) ++ ++/* ++** cmplt_wide_0_s16: ++** cmplt p0\.h, p1/z, z0\.h, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_wide_0_s16, svint16_t, ++ p0 = svcmplt_wide_n_s16 (p1, z0, 0), ++ p0 = svcmplt_wide (p1, z0, 0)) ++ ++/* ++** cmplt_wide_1_s16: ++** cmplt p0\.h, p1/z, z0\.h, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_wide_1_s16, svint16_t, ++ p0 = svcmplt_wide_n_s16 (p1, z0, 1), ++ p0 = svcmplt_wide (p1, z0, 1)) ++ ++/* ++** cmplt_wide_15_s16: ++** cmplt p0\.h, p1/z, z0\.h, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_wide_15_s16, svint16_t, ++ p0 = svcmplt_wide_n_s16 (p1, z0, 15), ++ p0 = svcmplt_wide (p1, z0, 15)) ++ ++/* ++** cmplt_wide_16_s16: ++** mov (z[0-9]+\.d), #16 ++** cmplt p0\.h, p1/z, z0\.h, \1 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_wide_16_s16, svint16_t, ++ p0 = svcmplt_wide_n_s16 (p1, z0, 16), ++ p0 = svcmplt_wide (p1, z0, 16)) ++ ++/* ++** cmplt_wide_m1_s16: ++** cmplt p0\.h, p1/z, z0\.h, #-1 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_wide_m1_s16, svint16_t, ++ p0 = svcmplt_wide_n_s16 (p1, z0, -1), 
++ p0 = svcmplt_wide (p1, z0, -1)) ++ ++/* ++** cmplt_wide_m16_s16: ++** cmplt p0\.h, p1/z, z0\.h, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_wide_m16_s16, svint16_t, ++ p0 = svcmplt_wide_n_s16 (p1, z0, -16), ++ p0 = svcmplt_wide (p1, z0, -16)) ++ ++/* ++** cmplt_wide_m17_s16: ++** mov (z[0-9]+\.d), #-17 ++** cmplt p0\.h, p1/z, z0\.h, \1 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_wide_m17_s16, svint16_t, ++ p0 = svcmplt_wide_n_s16 (p1, z0, -17), ++ p0 = svcmplt_wide (p1, z0, -17)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_wide_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_wide_s32.c +new file mode 100644 +index 000000000..b2cad6773 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_wide_s32.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmplt_wide_s32_tied: ++** cmplt p0\.s, p0/z, z0\.s, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmplt_wide_s32_tied, svint32_t, svint64_t, ++ p0 = svcmplt_wide_s32 (p0, z0, z1), ++ p0 = svcmplt_wide (p0, z0, z1)) ++ ++/* ++** cmplt_wide_s32_untied: ++** cmplt p0\.s, p1/z, z0\.s, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmplt_wide_s32_untied, svint32_t, svint64_t, ++ p0 = svcmplt_wide_s32 (p1, z0, z1), ++ p0 = svcmplt_wide (p1, z0, z1)) ++ ++/* ++** cmplt_wide_x0_s32: ++** mov (z[0-9]+\.d), x0 ++** cmplt p0\.s, p1/z, z0\.s, \1 ++** ret ++*/ ++TEST_COMPARE_ZX (cmplt_wide_x0_s32, svint32_t, int64_t, ++ p0 = svcmplt_wide_n_s32 (p1, z0, x0), ++ p0 = svcmplt_wide (p1, z0, x0)) ++ ++/* ++** cmplt_wide_0_s32: ++** cmplt p0\.s, p1/z, z0\.s, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_wide_0_s32, svint32_t, ++ p0 = svcmplt_wide_n_s32 (p1, z0, 0), ++ p0 = svcmplt_wide (p1, z0, 0)) ++ ++/* ++** cmplt_wide_1_s32: ++** cmplt p0\.s, p1/z, z0\.s, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_wide_1_s32, svint32_t, ++ p0 = svcmplt_wide_n_s32 (p1, z0, 1), ++ p0 = svcmplt_wide (p1, z0, 1)) ++ ++/* ++** cmplt_wide_15_s32: ++** cmplt p0\.s, p1/z, z0\.s, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_wide_15_s32, svint32_t, ++ p0 = svcmplt_wide_n_s32 (p1, z0, 15), ++ p0 = svcmplt_wide (p1, z0, 15)) ++ ++/* ++** cmplt_wide_16_s32: ++** mov (z[0-9]+\.d), #16 ++** cmplt p0\.s, p1/z, z0\.s, \1 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_wide_16_s32, svint32_t, ++ p0 = svcmplt_wide_n_s32 (p1, z0, 16), ++ p0 = svcmplt_wide (p1, z0, 16)) ++ ++/* ++** cmplt_wide_m1_s32: ++** cmplt p0\.s, p1/z, z0\.s, #-1 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_wide_m1_s32, svint32_t, ++ p0 = svcmplt_wide_n_s32 (p1, z0, -1), ++ p0 = svcmplt_wide (p1, z0, -1)) ++ ++/* ++** cmplt_wide_m16_s32: ++** cmplt p0\.s, p1/z, z0\.s, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_wide_m16_s32, svint32_t, ++ p0 = svcmplt_wide_n_s32 (p1, z0, -16), ++ p0 = svcmplt_wide (p1, z0, -16)) ++ ++/* ++** cmplt_wide_m17_s32: ++** mov (z[0-9]+\.d), #-17 ++** cmplt p0\.s, p1/z, z0\.s, \1 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_wide_m17_s32, svint32_t, ++ p0 = svcmplt_wide_n_s32 (p1, z0, -17), ++ p0 = svcmplt_wide (p1, z0, -17)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_wide_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_wide_s8.c +new file mode 100644 +index 000000000..1015fe309 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_wide_s8.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmplt_wide_s8_tied: ++** cmplt p0\.b, p0/z, z0\.b, z1\.d ++** ret ++*/ 
++TEST_COMPARE_DUAL_Z (cmplt_wide_s8_tied, svint8_t, svint64_t, ++ p0 = svcmplt_wide_s8 (p0, z0, z1), ++ p0 = svcmplt_wide (p0, z0, z1)) ++ ++/* ++** cmplt_wide_s8_untied: ++** cmplt p0\.b, p1/z, z0\.b, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmplt_wide_s8_untied, svint8_t, svint64_t, ++ p0 = svcmplt_wide_s8 (p1, z0, z1), ++ p0 = svcmplt_wide (p1, z0, z1)) ++ ++/* ++** cmplt_wide_x0_s8: ++** mov (z[0-9]+\.d), x0 ++** cmplt p0\.b, p1/z, z0\.b, \1 ++** ret ++*/ ++TEST_COMPARE_ZX (cmplt_wide_x0_s8, svint8_t, int64_t, ++ p0 = svcmplt_wide_n_s8 (p1, z0, x0), ++ p0 = svcmplt_wide (p1, z0, x0)) ++ ++/* ++** cmplt_wide_0_s8: ++** cmplt p0\.b, p1/z, z0\.b, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_wide_0_s8, svint8_t, ++ p0 = svcmplt_wide_n_s8 (p1, z0, 0), ++ p0 = svcmplt_wide (p1, z0, 0)) ++ ++/* ++** cmplt_wide_1_s8: ++** cmplt p0\.b, p1/z, z0\.b, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_wide_1_s8, svint8_t, ++ p0 = svcmplt_wide_n_s8 (p1, z0, 1), ++ p0 = svcmplt_wide (p1, z0, 1)) ++ ++/* ++** cmplt_wide_15_s8: ++** cmplt p0\.b, p1/z, z0\.b, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_wide_15_s8, svint8_t, ++ p0 = svcmplt_wide_n_s8 (p1, z0, 15), ++ p0 = svcmplt_wide (p1, z0, 15)) ++ ++/* ++** cmplt_wide_16_s8: ++** mov (z[0-9]+\.d), #16 ++** cmplt p0\.b, p1/z, z0\.b, \1 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_wide_16_s8, svint8_t, ++ p0 = svcmplt_wide_n_s8 (p1, z0, 16), ++ p0 = svcmplt_wide (p1, z0, 16)) ++ ++/* ++** cmplt_wide_m1_s8: ++** cmplt p0\.b, p1/z, z0\.b, #-1 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_wide_m1_s8, svint8_t, ++ p0 = svcmplt_wide_n_s8 (p1, z0, -1), ++ p0 = svcmplt_wide (p1, z0, -1)) ++ ++/* ++** cmplt_wide_m16_s8: ++** cmplt p0\.b, p1/z, z0\.b, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_wide_m16_s8, svint8_t, ++ p0 = svcmplt_wide_n_s8 (p1, z0, -16), ++ p0 = svcmplt_wide (p1, z0, -16)) ++ ++/* ++** cmplt_wide_m17_s8: ++** mov (z[0-9]+\.d), #-17 ++** cmplt p0\.b, p1/z, z0\.b, \1 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_wide_m17_s8, svint8_t, ++ p0 = svcmplt_wide_n_s8 (p1, z0, -17), ++ p0 = svcmplt_wide (p1, z0, -17)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_wide_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_wide_u16.c +new file mode 100644 +index 000000000..851400d36 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_wide_u16.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmplt_wide_u16_tied: ++** cmplo p0\.h, p0/z, z0\.h, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmplt_wide_u16_tied, svuint16_t, svuint64_t, ++ p0 = svcmplt_wide_u16 (p0, z0, z1), ++ p0 = svcmplt_wide (p0, z0, z1)) ++ ++/* ++** cmplt_wide_u16_untied: ++** cmplo p0\.h, p1/z, z0\.h, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmplt_wide_u16_untied, svuint16_t, svuint64_t, ++ p0 = svcmplt_wide_u16 (p1, z0, z1), ++ p0 = svcmplt_wide (p1, z0, z1)) ++ ++/* ++** cmplt_wide_x0_u16: ++** mov (z[0-9]+\.d), x0 ++** cmplo p0\.h, p1/z, z0\.h, \1 ++** ret ++*/ ++TEST_COMPARE_ZX (cmplt_wide_x0_u16, svuint16_t, uint64_t, ++ p0 = svcmplt_wide_n_u16 (p1, z0, x0), ++ p0 = svcmplt_wide (p1, z0, x0)) ++ ++/* ++** cmplt_wide_0_u16: ++** cmplo p0\.h, p1/z, z0\.h, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_wide_0_u16, svuint16_t, ++ p0 = svcmplt_wide_n_u16 (p1, z0, 0), ++ p0 = svcmplt_wide (p1, z0, 0)) ++ ++/* ++** cmplt_wide_1_u16: ++** cmplo p0\.h, p1/z, z0\.h, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_wide_1_u16, svuint16_t, ++ p0 = svcmplt_wide_n_u16 (p1, z0, 1), ++ p0 = svcmplt_wide (p1, z0, 1)) 
++ ++/* ++** cmplt_wide_15_u16: ++** cmplo p0\.h, p1/z, z0\.h, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_wide_15_u16, svuint16_t, ++ p0 = svcmplt_wide_n_u16 (p1, z0, 15), ++ p0 = svcmplt_wide (p1, z0, 15)) ++ ++/* ++** cmplt_wide_16_u16: ++** cmplo p0\.h, p1/z, z0\.h, #16 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_wide_16_u16, svuint16_t, ++ p0 = svcmplt_wide_n_u16 (p1, z0, 16), ++ p0 = svcmplt_wide (p1, z0, 16)) ++ ++/* ++** cmplt_wide_127_u16: ++** cmplo p0\.h, p1/z, z0\.h, #127 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_wide_127_u16, svuint16_t, ++ p0 = svcmplt_wide_n_u16 (p1, z0, 127), ++ p0 = svcmplt_wide (p1, z0, 127)) ++ ++/* ++** cmplt_wide_128_u16: ++** mov (z[0-9]+\.d), #128 ++** cmplo p0\.h, p1/z, z0\.h, \1 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_wide_128_u16, svuint16_t, ++ p0 = svcmplt_wide_n_u16 (p1, z0, 128), ++ p0 = svcmplt_wide (p1, z0, 128)) ++ ++/* ++** cmplt_wide_m1_u16: ++** mov (z[0-9]+)\.b, #-1 ++** cmplo p0\.h, p1/z, z0\.h, \1\.d ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_wide_m1_u16, svuint16_t, ++ p0 = svcmplt_wide_n_u16 (p1, z0, -1), ++ p0 = svcmplt_wide (p1, z0, -1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_wide_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_wide_u32.c +new file mode 100644 +index 000000000..1f9652def +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_wide_u32.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmplt_wide_u32_tied: ++** cmplo p0\.s, p0/z, z0\.s, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmplt_wide_u32_tied, svuint32_t, svuint64_t, ++ p0 = svcmplt_wide_u32 (p0, z0, z1), ++ p0 = svcmplt_wide (p0, z0, z1)) ++ ++/* ++** cmplt_wide_u32_untied: ++** cmplo p0\.s, p1/z, z0\.s, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmplt_wide_u32_untied, svuint32_t, svuint64_t, ++ p0 = svcmplt_wide_u32 (p1, z0, z1), ++ p0 = svcmplt_wide (p1, z0, z1)) ++ ++/* ++** cmplt_wide_x0_u32: ++** mov (z[0-9]+\.d), x0 ++** cmplo p0\.s, p1/z, z0\.s, \1 ++** ret ++*/ ++TEST_COMPARE_ZX (cmplt_wide_x0_u32, svuint32_t, uint64_t, ++ p0 = svcmplt_wide_n_u32 (p1, z0, x0), ++ p0 = svcmplt_wide (p1, z0, x0)) ++ ++/* ++** cmplt_wide_0_u32: ++** cmplo p0\.s, p1/z, z0\.s, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_wide_0_u32, svuint32_t, ++ p0 = svcmplt_wide_n_u32 (p1, z0, 0), ++ p0 = svcmplt_wide (p1, z0, 0)) ++ ++/* ++** cmplt_wide_1_u32: ++** cmplo p0\.s, p1/z, z0\.s, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_wide_1_u32, svuint32_t, ++ p0 = svcmplt_wide_n_u32 (p1, z0, 1), ++ p0 = svcmplt_wide (p1, z0, 1)) ++ ++/* ++** cmplt_wide_15_u32: ++** cmplo p0\.s, p1/z, z0\.s, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_wide_15_u32, svuint32_t, ++ p0 = svcmplt_wide_n_u32 (p1, z0, 15), ++ p0 = svcmplt_wide (p1, z0, 15)) ++ ++/* ++** cmplt_wide_16_u32: ++** cmplo p0\.s, p1/z, z0\.s, #16 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_wide_16_u32, svuint32_t, ++ p0 = svcmplt_wide_n_u32 (p1, z0, 16), ++ p0 = svcmplt_wide (p1, z0, 16)) ++ ++/* ++** cmplt_wide_127_u32: ++** cmplo p0\.s, p1/z, z0\.s, #127 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_wide_127_u32, svuint32_t, ++ p0 = svcmplt_wide_n_u32 (p1, z0, 127), ++ p0 = svcmplt_wide (p1, z0, 127)) ++ ++/* ++** cmplt_wide_128_u32: ++** mov (z[0-9]+\.d), #128 ++** cmplo p0\.s, p1/z, z0\.s, \1 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_wide_128_u32, svuint32_t, ++ p0 = svcmplt_wide_n_u32 (p1, z0, 128), ++ p0 = svcmplt_wide (p1, z0, 128)) ++ ++/* ++** cmplt_wide_m1_u32: ++** mov (z[0-9]+)\.b, #-1 ++** cmplo p0\.s, p1/z, z0\.s, \1\.d ++** 
ret ++*/ ++TEST_COMPARE_Z (cmplt_wide_m1_u32, svuint32_t, ++ p0 = svcmplt_wide_n_u32 (p1, z0, -1), ++ p0 = svcmplt_wide (p1, z0, -1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_wide_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_wide_u8.c +new file mode 100644 +index 000000000..95ef3cf16 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_wide_u8.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmplt_wide_u8_tied: ++** cmplo p0\.b, p0/z, z0\.b, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmplt_wide_u8_tied, svuint8_t, svuint64_t, ++ p0 = svcmplt_wide_u8 (p0, z0, z1), ++ p0 = svcmplt_wide (p0, z0, z1)) ++ ++/* ++** cmplt_wide_u8_untied: ++** cmplo p0\.b, p1/z, z0\.b, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmplt_wide_u8_untied, svuint8_t, svuint64_t, ++ p0 = svcmplt_wide_u8 (p1, z0, z1), ++ p0 = svcmplt_wide (p1, z0, z1)) ++ ++/* ++** cmplt_wide_x0_u8: ++** mov (z[0-9]+\.d), x0 ++** cmplo p0\.b, p1/z, z0\.b, \1 ++** ret ++*/ ++TEST_COMPARE_ZX (cmplt_wide_x0_u8, svuint8_t, uint64_t, ++ p0 = svcmplt_wide_n_u8 (p1, z0, x0), ++ p0 = svcmplt_wide (p1, z0, x0)) ++ ++/* ++** cmplt_wide_0_u8: ++** cmplo p0\.b, p1/z, z0\.b, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_wide_0_u8, svuint8_t, ++ p0 = svcmplt_wide_n_u8 (p1, z0, 0), ++ p0 = svcmplt_wide (p1, z0, 0)) ++ ++/* ++** cmplt_wide_1_u8: ++** cmplo p0\.b, p1/z, z0\.b, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_wide_1_u8, svuint8_t, ++ p0 = svcmplt_wide_n_u8 (p1, z0, 1), ++ p0 = svcmplt_wide (p1, z0, 1)) ++ ++/* ++** cmplt_wide_15_u8: ++** cmplo p0\.b, p1/z, z0\.b, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_wide_15_u8, svuint8_t, ++ p0 = svcmplt_wide_n_u8 (p1, z0, 15), ++ p0 = svcmplt_wide (p1, z0, 15)) ++ ++/* ++** cmplt_wide_16_u8: ++** cmplo p0\.b, p1/z, z0\.b, #16 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_wide_16_u8, svuint8_t, ++ p0 = svcmplt_wide_n_u8 (p1, z0, 16), ++ p0 = svcmplt_wide (p1, z0, 16)) ++ ++/* ++** cmplt_wide_127_u8: ++** cmplo p0\.b, p1/z, z0\.b, #127 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_wide_127_u8, svuint8_t, ++ p0 = svcmplt_wide_n_u8 (p1, z0, 127), ++ p0 = svcmplt_wide (p1, z0, 127)) ++ ++/* ++** cmplt_wide_128_u8: ++** mov (z[0-9]+\.d), #128 ++** cmplo p0\.b, p1/z, z0\.b, \1 ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_wide_128_u8, svuint8_t, ++ p0 = svcmplt_wide_n_u8 (p1, z0, 128), ++ p0 = svcmplt_wide (p1, z0, 128)) ++ ++/* ++** cmplt_wide_m1_u8: ++** mov (z[0-9]+)\.b, #-1 ++** cmplo p0\.b, p1/z, z0\.b, \1\.d ++** ret ++*/ ++TEST_COMPARE_Z (cmplt_wide_m1_u8, svuint8_t, ++ p0 = svcmplt_wide_n_u8 (p1, z0, -1), ++ p0 = svcmplt_wide (p1, z0, -1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_f16.c +new file mode 100644 +index 000000000..63e203b09 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_f16.c +@@ -0,0 +1,50 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpne_f16_tied: ++** fcmne p0\.h, p0/z, (z0\.h, z1\.h|z1\.h, z0\.h) ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_f16_tied, svfloat16_t, ++ p0 = svcmpne_f16 (p0, z0, z1), ++ p0 = svcmpne (p0, z0, z1)) ++ ++/* ++** cmpne_f16_untied: ++** fcmne p0\.h, p1/z, (z0\.h, z1\.h|z1\.h, z0\.h) ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_f16_untied, svfloat16_t, ++ p0 = svcmpne_f16 (p1, z0, z1), ++ p0 = svcmpne (p1, z0, z1)) ++ ++/* ++** cmpne_h4_f16: ++** mov (z[0-9]+\.h), h4 ++** fcmne p0\.h, 
p1/z, (z0\.h, \1|\1, z0\.h) ++** ret ++*/ ++TEST_COMPARE_ZD (cmpne_h4_f16, svfloat16_t, float16_t, ++ p0 = svcmpne_n_f16 (p1, z0, d4), ++ p0 = svcmpne (p1, z0, d4)) ++ ++/* ++** cmpne_0_f16: ++** fcmne p0\.h, p1/z, z0\.h, #0\.0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_0_f16, svfloat16_t, ++ p0 = svcmpne_n_f16 (p1, z0, 0), ++ p0 = svcmpne (p1, z0, 0)) ++ ++/* ++** cmpne_1_f16: ++** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? ++** fcmne p0\.h, p1/z, (z0\.h, \1|\1, z0\.h) ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_1_f16, svfloat16_t, ++ p0 = svcmpne_n_f16 (p1, z0, 1), ++ p0 = svcmpne (p1, z0, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_f32.c +new file mode 100644 +index 000000000..f81e2da51 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_f32.c +@@ -0,0 +1,50 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpne_f32_tied: ++** fcmne p0\.s, p0/z, (z0\.s, z1\.s|z1\.s, z0\.s) ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_f32_tied, svfloat32_t, ++ p0 = svcmpne_f32 (p0, z0, z1), ++ p0 = svcmpne (p0, z0, z1)) ++ ++/* ++** cmpne_f32_untied: ++** fcmne p0\.s, p1/z, (z0\.s, z1\.s|z1\.s, z0\.s) ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_f32_untied, svfloat32_t, ++ p0 = svcmpne_f32 (p1, z0, z1), ++ p0 = svcmpne (p1, z0, z1)) ++ ++/* ++** cmpne_s4_f32: ++** mov (z[0-9]+\.s), s4 ++** fcmne p0\.s, p1/z, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_COMPARE_ZD (cmpne_s4_f32, svfloat32_t, float32_t, ++ p0 = svcmpne_n_f32 (p1, z0, d4), ++ p0 = svcmpne (p1, z0, d4)) ++ ++/* ++** cmpne_0_f32: ++** fcmne p0\.s, p1/z, z0\.s, #0\.0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_0_f32, svfloat32_t, ++ p0 = svcmpne_n_f32 (p1, z0, 0), ++ p0 = svcmpne (p1, z0, 0)) ++ ++/* ++** cmpne_1_f32: ++** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? ++** fcmne p0\.s, p1/z, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_1_f32, svfloat32_t, ++ p0 = svcmpne_n_f32 (p1, z0, 1), ++ p0 = svcmpne (p1, z0, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_f64.c +new file mode 100644 +index 000000000..22e4eeef4 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_f64.c +@@ -0,0 +1,50 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpne_f64_tied: ++** fcmne p0\.d, p0/z, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_f64_tied, svfloat64_t, ++ p0 = svcmpne_f64 (p0, z0, z1), ++ p0 = svcmpne (p0, z0, z1)) ++ ++/* ++** cmpne_f64_untied: ++** fcmne p0\.d, p1/z, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_f64_untied, svfloat64_t, ++ p0 = svcmpne_f64 (p1, z0, z1), ++ p0 = svcmpne (p1, z0, z1)) ++ ++/* ++** cmpne_d4_f64: ++** mov (z[0-9]+\.d), d4 ++** fcmne p0\.d, p1/z, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_COMPARE_ZD (cmpne_d4_f64, svfloat64_t, float64_t, ++ p0 = svcmpne_n_f64 (p1, z0, d4), ++ p0 = svcmpne (p1, z0, d4)) ++ ++/* ++** cmpne_0_f64: ++** fcmne p0\.d, p1/z, z0\.d, #0\.0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_0_f64, svfloat64_t, ++ p0 = svcmpne_n_f64 (p1, z0, 0), ++ p0 = svcmpne (p1, z0, 0)) ++ ++/* ++** cmpne_1_f64: ++** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? 
++** fcmne p0\.d, p1/z, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_1_f64, svfloat64_t, ++ p0 = svcmpne_n_f64 (p1, z0, 1), ++ p0 = svcmpne (p1, z0, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_s16.c +new file mode 100644 +index 000000000..d8c743f8b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_s16.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpne_s16_tied: ++** cmpne p0\.h, p0/z, (z0\.h, z1\.h|z1\.h, z0\.h) ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_s16_tied, svint16_t, ++ p0 = svcmpne_s16 (p0, z0, z1), ++ p0 = svcmpne (p0, z0, z1)) ++ ++/* ++** cmpne_s16_untied: ++** cmpne p0\.h, p1/z, (z0\.h, z1\.h|z1\.h, z0\.h) ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_s16_untied, svint16_t, ++ p0 = svcmpne_s16 (p1, z0, z1), ++ p0 = svcmpne (p1, z0, z1)) ++ ++/* ++** cmpne_w0_s16: ++** mov (z[0-9]+\.h), w0 ++** cmpne p0\.h, p1/z, (z0\.h, \1|\1, z0\.h) ++** ret ++*/ ++TEST_COMPARE_ZX (cmpne_w0_s16, svint16_t, int16_t, ++ p0 = svcmpne_n_s16 (p1, z0, x0), ++ p0 = svcmpne (p1, z0, x0)) ++ ++/* ++** cmpne_0_s16: ++** cmpne p0\.h, p1/z, z0\.h, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_0_s16, svint16_t, ++ p0 = svcmpne_n_s16 (p1, z0, 0), ++ p0 = svcmpne (p1, z0, 0)) ++ ++/* ++** cmpne_1_s16: ++** cmpne p0\.h, p1/z, z0\.h, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_1_s16, svint16_t, ++ p0 = svcmpne_n_s16 (p1, z0, 1), ++ p0 = svcmpne (p1, z0, 1)) ++ ++/* ++** cmpne_15_s16: ++** cmpne p0\.h, p1/z, z0\.h, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_15_s16, svint16_t, ++ p0 = svcmpne_n_s16 (p1, z0, 15), ++ p0 = svcmpne (p1, z0, 15)) ++ ++/* ++** cmpne_16_s16: ++** mov (z[0-9]+\.h), #16 ++** cmpne p0\.h, p1/z, (z0\.h, \1|\1, z0\.h) ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_16_s16, svint16_t, ++ p0 = svcmpne_n_s16 (p1, z0, 16), ++ p0 = svcmpne (p1, z0, 16)) ++ ++/* ++** cmpne_m1_s16: ++** cmpne p0\.h, p1/z, z0\.h, #-1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_m1_s16, svint16_t, ++ p0 = svcmpne_n_s16 (p1, z0, -1), ++ p0 = svcmpne (p1, z0, -1)) ++ ++/* ++** cmpne_m16_s16: ++** cmpne p0\.h, p1/z, z0\.h, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_m16_s16, svint16_t, ++ p0 = svcmpne_n_s16 (p1, z0, -16), ++ p0 = svcmpne (p1, z0, -16)) ++ ++/* ++** cmpne_m17_s16: ++** mov (z[0-9]+\.h), #-17 ++** cmpne p0\.h, p1/z, (z0\.h, \1|\1, z0\.h) ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_m17_s16, svint16_t, ++ p0 = svcmpne_n_s16 (p1, z0, -17), ++ p0 = svcmpne (p1, z0, -17)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_s32.c +new file mode 100644 +index 000000000..0d3c35111 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_s32.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpne_s32_tied: ++** cmpne p0\.s, p0/z, (z0\.s, z1\.s|z1\.s, z0\.s) ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_s32_tied, svint32_t, ++ p0 = svcmpne_s32 (p0, z0, z1), ++ p0 = svcmpne (p0, z0, z1)) ++ ++/* ++** cmpne_s32_untied: ++** cmpne p0\.s, p1/z, (z0\.s, z1\.s|z1\.s, z0\.s) ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_s32_untied, svint32_t, ++ p0 = svcmpne_s32 (p1, z0, z1), ++ p0 = svcmpne (p1, z0, z1)) ++ ++/* ++** cmpne_w0_s32: ++** mov (z[0-9]+\.s), w0 ++** cmpne p0\.s, p1/z, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_COMPARE_ZX (cmpne_w0_s32, svint32_t, int32_t, ++ p0 = svcmpne_n_s32 (p1, z0, 
x0), ++ p0 = svcmpne (p1, z0, x0)) ++ ++/* ++** cmpne_0_s32: ++** cmpne p0\.s, p1/z, z0\.s, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_0_s32, svint32_t, ++ p0 = svcmpne_n_s32 (p1, z0, 0), ++ p0 = svcmpne (p1, z0, 0)) ++ ++/* ++** cmpne_1_s32: ++** cmpne p0\.s, p1/z, z0\.s, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_1_s32, svint32_t, ++ p0 = svcmpne_n_s32 (p1, z0, 1), ++ p0 = svcmpne (p1, z0, 1)) ++ ++/* ++** cmpne_15_s32: ++** cmpne p0\.s, p1/z, z0\.s, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_15_s32, svint32_t, ++ p0 = svcmpne_n_s32 (p1, z0, 15), ++ p0 = svcmpne (p1, z0, 15)) ++ ++/* ++** cmpne_16_s32: ++** mov (z[0-9]+\.s), #16 ++** cmpne p0\.s, p1/z, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_16_s32, svint32_t, ++ p0 = svcmpne_n_s32 (p1, z0, 16), ++ p0 = svcmpne (p1, z0, 16)) ++ ++/* ++** cmpne_m1_s32: ++** cmpne p0\.s, p1/z, z0\.s, #-1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_m1_s32, svint32_t, ++ p0 = svcmpne_n_s32 (p1, z0, -1), ++ p0 = svcmpne (p1, z0, -1)) ++ ++/* ++** cmpne_m16_s32: ++** cmpne p0\.s, p1/z, z0\.s, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_m16_s32, svint32_t, ++ p0 = svcmpne_n_s32 (p1, z0, -16), ++ p0 = svcmpne (p1, z0, -16)) ++ ++/* ++** cmpne_m17_s32: ++** mov (z[0-9]+\.s), #-17 ++** cmpne p0\.s, p1/z, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_m17_s32, svint32_t, ++ p0 = svcmpne_n_s32 (p1, z0, -17), ++ p0 = svcmpne (p1, z0, -17)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_s64.c +new file mode 100644 +index 000000000..4cf78f2dd +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_s64.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpne_s64_tied: ++** cmpne p0\.d, p0/z, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_s64_tied, svint64_t, ++ p0 = svcmpne_s64 (p0, z0, z1), ++ p0 = svcmpne (p0, z0, z1)) ++ ++/* ++** cmpne_s64_untied: ++** cmpne p0\.d, p1/z, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_s64_untied, svint64_t, ++ p0 = svcmpne_s64 (p1, z0, z1), ++ p0 = svcmpne (p1, z0, z1)) ++ ++/* ++** cmpne_x0_s64: ++** mov (z[0-9]+\.d), x0 ++** cmpne p0\.d, p1/z, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_COMPARE_ZX (cmpne_x0_s64, svint64_t, int64_t, ++ p0 = svcmpne_n_s64 (p1, z0, x0), ++ p0 = svcmpne (p1, z0, x0)) ++ ++/* ++** cmpne_0_s64: ++** cmpne p0\.d, p1/z, z0\.d, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_0_s64, svint64_t, ++ p0 = svcmpne_n_s64 (p1, z0, 0), ++ p0 = svcmpne (p1, z0, 0)) ++ ++/* ++** cmpne_1_s64: ++** cmpne p0\.d, p1/z, z0\.d, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_1_s64, svint64_t, ++ p0 = svcmpne_n_s64 (p1, z0, 1), ++ p0 = svcmpne (p1, z0, 1)) ++ ++/* ++** cmpne_15_s64: ++** cmpne p0\.d, p1/z, z0\.d, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_15_s64, svint64_t, ++ p0 = svcmpne_n_s64 (p1, z0, 15), ++ p0 = svcmpne (p1, z0, 15)) ++ ++/* ++** cmpne_16_s64: ++** mov (z[0-9]+\.d), #16 ++** cmpne p0\.d, p1/z, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_16_s64, svint64_t, ++ p0 = svcmpne_n_s64 (p1, z0, 16), ++ p0 = svcmpne (p1, z0, 16)) ++ ++/* ++** cmpne_m1_s64: ++** cmpne p0\.d, p1/z, z0\.d, #-1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_m1_s64, svint64_t, ++ p0 = svcmpne_n_s64 (p1, z0, -1), ++ p0 = svcmpne (p1, z0, -1)) ++ ++/* ++** cmpne_m16_s64: ++** cmpne p0\.d, p1/z, z0\.d, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_m16_s64, svint64_t, ++ p0 = svcmpne_n_s64 (p1, z0, -16), ++ p0 = 
svcmpne (p1, z0, -16)) ++ ++/* ++** cmpne_m17_s64: ++** mov (z[0-9]+\.d), #-17 ++** cmpne p0\.d, p1/z, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_m17_s64, svint64_t, ++ p0 = svcmpne_n_s64 (p1, z0, -17), ++ p0 = svcmpne (p1, z0, -17)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_s8.c +new file mode 100644 +index 000000000..6409ecdd4 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_s8.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpne_s8_tied: ++** cmpne p0\.b, p0/z, (z0\.b, z1\.b|z1\.b, z0\.b) ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_s8_tied, svint8_t, ++ p0 = svcmpne_s8 (p0, z0, z1), ++ p0 = svcmpne (p0, z0, z1)) ++ ++/* ++** cmpne_s8_untied: ++** cmpne p0\.b, p1/z, (z0\.b, z1\.b|z1\.b, z0\.b) ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_s8_untied, svint8_t, ++ p0 = svcmpne_s8 (p1, z0, z1), ++ p0 = svcmpne (p1, z0, z1)) ++ ++/* ++** cmpne_w0_s8: ++** mov (z[0-9]+\.b), w0 ++** cmpne p0\.b, p1/z, (z0\.b, \1|\1, z0\.b) ++** ret ++*/ ++TEST_COMPARE_ZX (cmpne_w0_s8, svint8_t, int8_t, ++ p0 = svcmpne_n_s8 (p1, z0, x0), ++ p0 = svcmpne (p1, z0, x0)) ++ ++/* ++** cmpne_0_s8: ++** cmpne p0\.b, p1/z, z0\.b, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_0_s8, svint8_t, ++ p0 = svcmpne_n_s8 (p1, z0, 0), ++ p0 = svcmpne (p1, z0, 0)) ++ ++/* ++** cmpne_1_s8: ++** cmpne p0\.b, p1/z, z0\.b, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_1_s8, svint8_t, ++ p0 = svcmpne_n_s8 (p1, z0, 1), ++ p0 = svcmpne (p1, z0, 1)) ++ ++/* ++** cmpne_15_s8: ++** cmpne p0\.b, p1/z, z0\.b, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_15_s8, svint8_t, ++ p0 = svcmpne_n_s8 (p1, z0, 15), ++ p0 = svcmpne (p1, z0, 15)) ++ ++/* ++** cmpne_16_s8: ++** mov (z[0-9]+\.b), #16 ++** cmpne p0\.b, p1/z, (z0\.b, \1|\1, z0\.b) ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_16_s8, svint8_t, ++ p0 = svcmpne_n_s8 (p1, z0, 16), ++ p0 = svcmpne (p1, z0, 16)) ++ ++/* ++** cmpne_m1_s8: ++** cmpne p0\.b, p1/z, z0\.b, #-1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_m1_s8, svint8_t, ++ p0 = svcmpne_n_s8 (p1, z0, -1), ++ p0 = svcmpne (p1, z0, -1)) ++ ++/* ++** cmpne_m16_s8: ++** cmpne p0\.b, p1/z, z0\.b, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_m16_s8, svint8_t, ++ p0 = svcmpne_n_s8 (p1, z0, -16), ++ p0 = svcmpne (p1, z0, -16)) ++ ++/* ++** cmpne_m17_s8: ++** mov (z[0-9]+\.b), #-17 ++** cmpne p0\.b, p1/z, (z0\.b, \1|\1, z0\.b) ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_m17_s8, svint8_t, ++ p0 = svcmpne_n_s8 (p1, z0, -17), ++ p0 = svcmpne (p1, z0, -17)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_u16.c +new file mode 100644 +index 000000000..4d22bc7d3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_u16.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpne_u16_tied: ++** cmpne p0\.h, p0/z, (z0\.h, z1\.h|z1\.h, z0\.h) ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_u16_tied, svuint16_t, ++ p0 = svcmpne_u16 (p0, z0, z1), ++ p0 = svcmpne (p0, z0, z1)) ++ ++/* ++** cmpne_u16_untied: ++** cmpne p0\.h, p1/z, (z0\.h, z1\.h|z1\.h, z0\.h) ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_u16_untied, svuint16_t, ++ p0 = svcmpne_u16 (p1, z0, z1), ++ p0 = svcmpne (p1, z0, z1)) ++ ++/* ++** cmpne_w0_u16: ++** mov (z[0-9]+\.h), w0 ++** cmpne p0\.h, p1/z, (z0\.h, \1|\1, z0\.h) ++** ret ++*/ ++TEST_COMPARE_ZX (cmpne_w0_u16, 
svuint16_t, uint16_t, ++ p0 = svcmpne_n_u16 (p1, z0, x0), ++ p0 = svcmpne (p1, z0, x0)) ++ ++/* ++** cmpne_0_u16: ++** cmpne p0\.h, p1/z, z0\.h, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_0_u16, svuint16_t, ++ p0 = svcmpne_n_u16 (p1, z0, 0), ++ p0 = svcmpne (p1, z0, 0)) ++ ++/* ++** cmpne_1_u16: ++** cmpne p0\.h, p1/z, z0\.h, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_1_u16, svuint16_t, ++ p0 = svcmpne_n_u16 (p1, z0, 1), ++ p0 = svcmpne (p1, z0, 1)) ++ ++/* ++** cmpne_15_u16: ++** cmpne p0\.h, p1/z, z0\.h, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_15_u16, svuint16_t, ++ p0 = svcmpne_n_u16 (p1, z0, 15), ++ p0 = svcmpne (p1, z0, 15)) ++ ++/* ++** cmpne_16_u16: ++** mov (z[0-9]+\.h), #16 ++** cmpne p0\.h, p1/z, (z0\.h, \1|\1, z0\.h) ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_16_u16, svuint16_t, ++ p0 = svcmpne_n_u16 (p1, z0, 16), ++ p0 = svcmpne (p1, z0, 16)) ++ ++/* ++** cmpne_m1_u16: ++** cmpne p0\.h, p1/z, z0\.h, #-1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_m1_u16, svuint16_t, ++ p0 = svcmpne_n_u16 (p1, z0, -1), ++ p0 = svcmpne (p1, z0, -1)) ++ ++/* ++** cmpne_m16_u16: ++** cmpne p0\.h, p1/z, z0\.h, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_m16_u16, svuint16_t, ++ p0 = svcmpne_n_u16 (p1, z0, -16), ++ p0 = svcmpne (p1, z0, -16)) ++ ++/* ++** cmpne_m17_u16: ++** mov (z[0-9]+\.h), #-17 ++** cmpne p0\.h, p1/z, (z0\.h, \1|\1, z0\.h) ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_m17_u16, svuint16_t, ++ p0 = svcmpne_n_u16 (p1, z0, -17), ++ p0 = svcmpne (p1, z0, -17)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_u32.c +new file mode 100644 +index 000000000..b7ca94a69 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_u32.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpne_u32_tied: ++** cmpne p0\.s, p0/z, (z0\.s, z1\.s|z1\.s, z0\.s) ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_u32_tied, svuint32_t, ++ p0 = svcmpne_u32 (p0, z0, z1), ++ p0 = svcmpne (p0, z0, z1)) ++ ++/* ++** cmpne_u32_untied: ++** cmpne p0\.s, p1/z, (z0\.s, z1\.s|z1\.s, z0\.s) ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_u32_untied, svuint32_t, ++ p0 = svcmpne_u32 (p1, z0, z1), ++ p0 = svcmpne (p1, z0, z1)) ++ ++/* ++** cmpne_w0_u32: ++** mov (z[0-9]+\.s), w0 ++** cmpne p0\.s, p1/z, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_COMPARE_ZX (cmpne_w0_u32, svuint32_t, uint32_t, ++ p0 = svcmpne_n_u32 (p1, z0, x0), ++ p0 = svcmpne (p1, z0, x0)) ++ ++/* ++** cmpne_0_u32: ++** cmpne p0\.s, p1/z, z0\.s, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_0_u32, svuint32_t, ++ p0 = svcmpne_n_u32 (p1, z0, 0), ++ p0 = svcmpne (p1, z0, 0)) ++ ++/* ++** cmpne_1_u32: ++** cmpne p0\.s, p1/z, z0\.s, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_1_u32, svuint32_t, ++ p0 = svcmpne_n_u32 (p1, z0, 1), ++ p0 = svcmpne (p1, z0, 1)) ++ ++/* ++** cmpne_15_u32: ++** cmpne p0\.s, p1/z, z0\.s, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_15_u32, svuint32_t, ++ p0 = svcmpne_n_u32 (p1, z0, 15), ++ p0 = svcmpne (p1, z0, 15)) ++ ++/* ++** cmpne_16_u32: ++** mov (z[0-9]+\.s), #16 ++** cmpne p0\.s, p1/z, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_16_u32, svuint32_t, ++ p0 = svcmpne_n_u32 (p1, z0, 16), ++ p0 = svcmpne (p1, z0, 16)) ++ ++/* ++** cmpne_m1_u32: ++** cmpne p0\.s, p1/z, z0\.s, #-1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_m1_u32, svuint32_t, ++ p0 = svcmpne_n_u32 (p1, z0, -1), ++ p0 = svcmpne (p1, z0, -1)) ++ ++/* ++** cmpne_m16_u32: ++** cmpne p0\.s, p1/z, z0\.s, #-16 ++** ret ++*/ ++TEST_COMPARE_Z 
(cmpne_m16_u32, svuint32_t, ++ p0 = svcmpne_n_u32 (p1, z0, -16), ++ p0 = svcmpne (p1, z0, -16)) ++ ++/* ++** cmpne_m17_u32: ++** mov (z[0-9]+\.s), #-17 ++** cmpne p0\.s, p1/z, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_m17_u32, svuint32_t, ++ p0 = svcmpne_n_u32 (p1, z0, -17), ++ p0 = svcmpne (p1, z0, -17)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_u64.c +new file mode 100644 +index 000000000..960ac85b1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_u64.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpne_u64_tied: ++** cmpne p0\.d, p0/z, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_u64_tied, svuint64_t, ++ p0 = svcmpne_u64 (p0, z0, z1), ++ p0 = svcmpne (p0, z0, z1)) ++ ++/* ++** cmpne_u64_untied: ++** cmpne p0\.d, p1/z, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_u64_untied, svuint64_t, ++ p0 = svcmpne_u64 (p1, z0, z1), ++ p0 = svcmpne (p1, z0, z1)) ++ ++/* ++** cmpne_x0_u64: ++** mov (z[0-9]+\.d), x0 ++** cmpne p0\.d, p1/z, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_COMPARE_ZX (cmpne_x0_u64, svuint64_t, uint64_t, ++ p0 = svcmpne_n_u64 (p1, z0, x0), ++ p0 = svcmpne (p1, z0, x0)) ++ ++/* ++** cmpne_0_u64: ++** cmpne p0\.d, p1/z, z0\.d, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_0_u64, svuint64_t, ++ p0 = svcmpne_n_u64 (p1, z0, 0), ++ p0 = svcmpne (p1, z0, 0)) ++ ++/* ++** cmpne_1_u64: ++** cmpne p0\.d, p1/z, z0\.d, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_1_u64, svuint64_t, ++ p0 = svcmpne_n_u64 (p1, z0, 1), ++ p0 = svcmpne (p1, z0, 1)) ++ ++/* ++** cmpne_15_u64: ++** cmpne p0\.d, p1/z, z0\.d, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_15_u64, svuint64_t, ++ p0 = svcmpne_n_u64 (p1, z0, 15), ++ p0 = svcmpne (p1, z0, 15)) ++ ++/* ++** cmpne_16_u64: ++** mov (z[0-9]+\.d), #16 ++** cmpne p0\.d, p1/z, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_16_u64, svuint64_t, ++ p0 = svcmpne_n_u64 (p1, z0, 16), ++ p0 = svcmpne (p1, z0, 16)) ++ ++/* ++** cmpne_m1_u64: ++** cmpne p0\.d, p1/z, z0\.d, #-1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_m1_u64, svuint64_t, ++ p0 = svcmpne_n_u64 (p1, z0, -1), ++ p0 = svcmpne (p1, z0, -1)) ++ ++/* ++** cmpne_m16_u64: ++** cmpne p0\.d, p1/z, z0\.d, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_m16_u64, svuint64_t, ++ p0 = svcmpne_n_u64 (p1, z0, -16), ++ p0 = svcmpne (p1, z0, -16)) ++ ++/* ++** cmpne_m17_u64: ++** mov (z[0-9]+\.d), #-17 ++** cmpne p0\.d, p1/z, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_m17_u64, svuint64_t, ++ p0 = svcmpne_n_u64 (p1, z0, -17), ++ p0 = svcmpne (p1, z0, -17)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_u8.c +new file mode 100644 +index 000000000..cb8496eab +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_u8.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpne_u8_tied: ++** cmpne p0\.b, p0/z, (z0\.b, z1\.b|z1\.b, z0\.b) ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_u8_tied, svuint8_t, ++ p0 = svcmpne_u8 (p0, z0, z1), ++ p0 = svcmpne (p0, z0, z1)) ++ ++/* ++** cmpne_u8_untied: ++** cmpne p0\.b, p1/z, (z0\.b, z1\.b|z1\.b, z0\.b) ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_u8_untied, svuint8_t, ++ p0 = svcmpne_u8 (p1, z0, z1), ++ p0 = svcmpne (p1, z0, z1)) ++ ++/* ++** cmpne_w0_u8: ++** mov 
(z[0-9]+\.b), w0 ++** cmpne p0\.b, p1/z, (z0\.b, \1|\1, z0\.b) ++** ret ++*/ ++TEST_COMPARE_ZX (cmpne_w0_u8, svuint8_t, uint8_t, ++ p0 = svcmpne_n_u8 (p1, z0, x0), ++ p0 = svcmpne (p1, z0, x0)) ++ ++/* ++** cmpne_0_u8: ++** cmpne p0\.b, p1/z, z0\.b, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_0_u8, svuint8_t, ++ p0 = svcmpne_n_u8 (p1, z0, 0), ++ p0 = svcmpne (p1, z0, 0)) ++ ++/* ++** cmpne_1_u8: ++** cmpne p0\.b, p1/z, z0\.b, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_1_u8, svuint8_t, ++ p0 = svcmpne_n_u8 (p1, z0, 1), ++ p0 = svcmpne (p1, z0, 1)) ++ ++/* ++** cmpne_15_u8: ++** cmpne p0\.b, p1/z, z0\.b, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_15_u8, svuint8_t, ++ p0 = svcmpne_n_u8 (p1, z0, 15), ++ p0 = svcmpne (p1, z0, 15)) ++ ++/* ++** cmpne_16_u8: ++** mov (z[0-9]+\.b), #16 ++** cmpne p0\.b, p1/z, (z0\.b, \1|\1, z0\.b) ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_16_u8, svuint8_t, ++ p0 = svcmpne_n_u8 (p1, z0, 16), ++ p0 = svcmpne (p1, z0, 16)) ++ ++/* ++** cmpne_m1_u8: ++** cmpne p0\.b, p1/z, z0\.b, #-1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_m1_u8, svuint8_t, ++ p0 = svcmpne_n_u8 (p1, z0, -1), ++ p0 = svcmpne (p1, z0, -1)) ++ ++/* ++** cmpne_m16_u8: ++** cmpne p0\.b, p1/z, z0\.b, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_m16_u8, svuint8_t, ++ p0 = svcmpne_n_u8 (p1, z0, -16), ++ p0 = svcmpne (p1, z0, -16)) ++ ++/* ++** cmpne_m17_u8: ++** mov (z[0-9]+\.b), #-17 ++** cmpne p0\.b, p1/z, (z0\.b, \1|\1, z0\.b) ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_m17_u8, svuint8_t, ++ p0 = svcmpne_n_u8 (p1, z0, -17), ++ p0 = svcmpne (p1, z0, -17)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_wide_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_wide_s16.c +new file mode 100644 +index 000000000..4cb7586c9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_wide_s16.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpne_wide_s16_tied: ++** cmpne p0\.h, p0/z, z0\.h, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmpne_wide_s16_tied, svint16_t, svint64_t, ++ p0 = svcmpne_wide_s16 (p0, z0, z1), ++ p0 = svcmpne_wide (p0, z0, z1)) ++ ++/* ++** cmpne_wide_s16_untied: ++** cmpne p0\.h, p1/z, z0\.h, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmpne_wide_s16_untied, svint16_t, svint64_t, ++ p0 = svcmpne_wide_s16 (p1, z0, z1), ++ p0 = svcmpne_wide (p1, z0, z1)) ++ ++/* ++** cmpne_wide_x0_s16: ++** mov (z[0-9]+\.d), x0 ++** cmpne p0\.h, p1/z, z0\.h, \1 ++** ret ++*/ ++TEST_COMPARE_ZX (cmpne_wide_x0_s16, svint16_t, int64_t, ++ p0 = svcmpne_wide_n_s16 (p1, z0, x0), ++ p0 = svcmpne_wide (p1, z0, x0)) ++ ++/* ++** cmpne_wide_0_s16: ++** cmpne p0\.h, p1/z, z0\.h, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_wide_0_s16, svint16_t, ++ p0 = svcmpne_wide_n_s16 (p1, z0, 0), ++ p0 = svcmpne_wide (p1, z0, 0)) ++ ++/* ++** cmpne_wide_1_s16: ++** cmpne p0\.h, p1/z, z0\.h, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_wide_1_s16, svint16_t, ++ p0 = svcmpne_wide_n_s16 (p1, z0, 1), ++ p0 = svcmpne_wide (p1, z0, 1)) ++ ++/* ++** cmpne_wide_15_s16: ++** cmpne p0\.h, p1/z, z0\.h, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_wide_15_s16, svint16_t, ++ p0 = svcmpne_wide_n_s16 (p1, z0, 15), ++ p0 = svcmpne_wide (p1, z0, 15)) ++ ++/* ++** cmpne_wide_16_s16: ++** mov (z[0-9]+\.d), #16 ++** cmpne p0\.h, p1/z, z0\.h, \1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_wide_16_s16, svint16_t, ++ p0 = svcmpne_wide_n_s16 (p1, z0, 16), ++ p0 = svcmpne_wide (p1, z0, 16)) ++ ++/* ++** cmpne_wide_m1_s16: ++** cmpne p0\.h, p1/z, z0\.h, #-1 ++** ret ++*/ 
++TEST_COMPARE_Z (cmpne_wide_m1_s16, svint16_t, ++ p0 = svcmpne_wide_n_s16 (p1, z0, -1), ++ p0 = svcmpne_wide (p1, z0, -1)) ++ ++/* ++** cmpne_wide_m16_s16: ++** cmpne p0\.h, p1/z, z0\.h, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_wide_m16_s16, svint16_t, ++ p0 = svcmpne_wide_n_s16 (p1, z0, -16), ++ p0 = svcmpne_wide (p1, z0, -16)) ++ ++/* ++** cmpne_wide_m17_s16: ++** mov (z[0-9]+\.d), #-17 ++** cmpne p0\.h, p1/z, z0\.h, \1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_wide_m17_s16, svint16_t, ++ p0 = svcmpne_wide_n_s16 (p1, z0, -17), ++ p0 = svcmpne_wide (p1, z0, -17)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_wide_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_wide_s32.c +new file mode 100644 +index 000000000..633994ed3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_wide_s32.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpne_wide_s32_tied: ++** cmpne p0\.s, p0/z, z0\.s, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmpne_wide_s32_tied, svint32_t, svint64_t, ++ p0 = svcmpne_wide_s32 (p0, z0, z1), ++ p0 = svcmpne_wide (p0, z0, z1)) ++ ++/* ++** cmpne_wide_s32_untied: ++** cmpne p0\.s, p1/z, z0\.s, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmpne_wide_s32_untied, svint32_t, svint64_t, ++ p0 = svcmpne_wide_s32 (p1, z0, z1), ++ p0 = svcmpne_wide (p1, z0, z1)) ++ ++/* ++** cmpne_wide_x0_s32: ++** mov (z[0-9]+\.d), x0 ++** cmpne p0\.s, p1/z, z0\.s, \1 ++** ret ++*/ ++TEST_COMPARE_ZX (cmpne_wide_x0_s32, svint32_t, int64_t, ++ p0 = svcmpne_wide_n_s32 (p1, z0, x0), ++ p0 = svcmpne_wide (p1, z0, x0)) ++ ++/* ++** cmpne_wide_0_s32: ++** cmpne p0\.s, p1/z, z0\.s, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_wide_0_s32, svint32_t, ++ p0 = svcmpne_wide_n_s32 (p1, z0, 0), ++ p0 = svcmpne_wide (p1, z0, 0)) ++ ++/* ++** cmpne_wide_1_s32: ++** cmpne p0\.s, p1/z, z0\.s, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_wide_1_s32, svint32_t, ++ p0 = svcmpne_wide_n_s32 (p1, z0, 1), ++ p0 = svcmpne_wide (p1, z0, 1)) ++ ++/* ++** cmpne_wide_15_s32: ++** cmpne p0\.s, p1/z, z0\.s, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_wide_15_s32, svint32_t, ++ p0 = svcmpne_wide_n_s32 (p1, z0, 15), ++ p0 = svcmpne_wide (p1, z0, 15)) ++ ++/* ++** cmpne_wide_16_s32: ++** mov (z[0-9]+\.d), #16 ++** cmpne p0\.s, p1/z, z0\.s, \1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_wide_16_s32, svint32_t, ++ p0 = svcmpne_wide_n_s32 (p1, z0, 16), ++ p0 = svcmpne_wide (p1, z0, 16)) ++ ++/* ++** cmpne_wide_m1_s32: ++** cmpne p0\.s, p1/z, z0\.s, #-1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_wide_m1_s32, svint32_t, ++ p0 = svcmpne_wide_n_s32 (p1, z0, -1), ++ p0 = svcmpne_wide (p1, z0, -1)) ++ ++/* ++** cmpne_wide_m16_s32: ++** cmpne p0\.s, p1/z, z0\.s, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_wide_m16_s32, svint32_t, ++ p0 = svcmpne_wide_n_s32 (p1, z0, -16), ++ p0 = svcmpne_wide (p1, z0, -16)) ++ ++/* ++** cmpne_wide_m17_s32: ++** mov (z[0-9]+\.d), #-17 ++** cmpne p0\.s, p1/z, z0\.s, \1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_wide_m17_s32, svint32_t, ++ p0 = svcmpne_wide_n_s32 (p1, z0, -17), ++ p0 = svcmpne_wide (p1, z0, -17)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_wide_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_wide_s8.c +new file mode 100644 +index 000000000..de343f4cc +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_wide_s8.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** 
cmpne_wide_s8_tied: ++** cmpne p0\.b, p0/z, z0\.b, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmpne_wide_s8_tied, svint8_t, svint64_t, ++ p0 = svcmpne_wide_s8 (p0, z0, z1), ++ p0 = svcmpne_wide (p0, z0, z1)) ++ ++/* ++** cmpne_wide_s8_untied: ++** cmpne p0\.b, p1/z, z0\.b, z1\.d ++** ret ++*/ ++TEST_COMPARE_DUAL_Z (cmpne_wide_s8_untied, svint8_t, svint64_t, ++ p0 = svcmpne_wide_s8 (p1, z0, z1), ++ p0 = svcmpne_wide (p1, z0, z1)) ++ ++/* ++** cmpne_wide_x0_s8: ++** mov (z[0-9]+\.d), x0 ++** cmpne p0\.b, p1/z, z0\.b, \1 ++** ret ++*/ ++TEST_COMPARE_ZX (cmpne_wide_x0_s8, svint8_t, int64_t, ++ p0 = svcmpne_wide_n_s8 (p1, z0, x0), ++ p0 = svcmpne_wide (p1, z0, x0)) ++ ++/* ++** cmpne_wide_0_s8: ++** cmpne p0\.b, p1/z, z0\.b, #0 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_wide_0_s8, svint8_t, ++ p0 = svcmpne_wide_n_s8 (p1, z0, 0), ++ p0 = svcmpne_wide (p1, z0, 0)) ++ ++/* ++** cmpne_wide_1_s8: ++** cmpne p0\.b, p1/z, z0\.b, #1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_wide_1_s8, svint8_t, ++ p0 = svcmpne_wide_n_s8 (p1, z0, 1), ++ p0 = svcmpne_wide (p1, z0, 1)) ++ ++/* ++** cmpne_wide_15_s8: ++** cmpne p0\.b, p1/z, z0\.b, #15 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_wide_15_s8, svint8_t, ++ p0 = svcmpne_wide_n_s8 (p1, z0, 15), ++ p0 = svcmpne_wide (p1, z0, 15)) ++ ++/* ++** cmpne_wide_16_s8: ++** mov (z[0-9]+\.d), #16 ++** cmpne p0\.b, p1/z, z0\.b, \1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_wide_16_s8, svint8_t, ++ p0 = svcmpne_wide_n_s8 (p1, z0, 16), ++ p0 = svcmpne_wide (p1, z0, 16)) ++ ++/* ++** cmpne_wide_m1_s8: ++** cmpne p0\.b, p1/z, z0\.b, #-1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_wide_m1_s8, svint8_t, ++ p0 = svcmpne_wide_n_s8 (p1, z0, -1), ++ p0 = svcmpne_wide (p1, z0, -1)) ++ ++/* ++** cmpne_wide_m16_s8: ++** cmpne p0\.b, p1/z, z0\.b, #-16 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_wide_m16_s8, svint8_t, ++ p0 = svcmpne_wide_n_s8 (p1, z0, -16), ++ p0 = svcmpne_wide (p1, z0, -16)) ++ ++/* ++** cmpne_wide_m17_s8: ++** mov (z[0-9]+\.d), #-17 ++** cmpne p0\.b, p1/z, z0\.b, \1 ++** ret ++*/ ++TEST_COMPARE_Z (cmpne_wide_m17_s8, svint8_t, ++ p0 = svcmpne_wide_n_s8 (p1, z0, -17), ++ p0 = svcmpne_wide (p1, z0, -17)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpuo_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpuo_f16.c +new file mode 100644 +index 000000000..8f702cdde +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpuo_f16.c +@@ -0,0 +1,51 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpuo_f16_tied: ++** fcmuo p0\.h, p0/z, (z0\.h, z1\.h|z1\.h, z0\.h) ++** ret ++*/ ++TEST_COMPARE_Z (cmpuo_f16_tied, svfloat16_t, ++ p0 = svcmpuo_f16 (p0, z0, z1), ++ p0 = svcmpuo (p0, z0, z1)) ++ ++/* ++** cmpuo_f16_untied: ++** fcmuo p0\.h, p1/z, (z0\.h, z1\.h|z1\.h, z0\.h) ++** ret ++*/ ++TEST_COMPARE_Z (cmpuo_f16_untied, svfloat16_t, ++ p0 = svcmpuo_f16 (p1, z0, z1), ++ p0 = svcmpuo (p1, z0, z1)) ++ ++/* ++** cmpuo_h4_f16: ++** mov (z[0-9]+\.h), h4 ++** fcmuo p0\.h, p1/z, (z0\.h, \1|\1, z0\.h) ++** ret ++*/ ++TEST_COMPARE_ZD (cmpuo_h4_f16, svfloat16_t, float16_t, ++ p0 = svcmpuo_n_f16 (p1, z0, d4), ++ p0 = svcmpuo (p1, z0, d4)) ++ ++/* ++** cmpuo_0_f16: ++** mov (z[0-9]+\.h), #0 ++** fcmuo p0\.h, p1/z, (z0\.h, \1|\1, z0\.h) ++** ret ++*/ ++TEST_COMPARE_Z (cmpuo_0_f16, svfloat16_t, ++ p0 = svcmpuo_n_f16 (p1, z0, 0), ++ p0 = svcmpuo (p1, z0, 0)) ++ ++/* ++** cmpuo_1_f16: ++** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? 
++** fcmuo p0\.h, p1/z, (z0\.h, \1|\1, z0\.h) ++** ret ++*/ ++TEST_COMPARE_Z (cmpuo_1_f16, svfloat16_t, ++ p0 = svcmpuo_n_f16 (p1, z0, 1), ++ p0 = svcmpuo (p1, z0, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpuo_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpuo_f32.c +new file mode 100644 +index 000000000..8827604aa +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpuo_f32.c +@@ -0,0 +1,51 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpuo_f32_tied: ++** fcmuo p0\.s, p0/z, (z0\.s, z1\.s|z1\.s, z0\.s) ++** ret ++*/ ++TEST_COMPARE_Z (cmpuo_f32_tied, svfloat32_t, ++ p0 = svcmpuo_f32 (p0, z0, z1), ++ p0 = svcmpuo (p0, z0, z1)) ++ ++/* ++** cmpuo_f32_untied: ++** fcmuo p0\.s, p1/z, (z0\.s, z1\.s|z1\.s, z0\.s) ++** ret ++*/ ++TEST_COMPARE_Z (cmpuo_f32_untied, svfloat32_t, ++ p0 = svcmpuo_f32 (p1, z0, z1), ++ p0 = svcmpuo (p1, z0, z1)) ++ ++/* ++** cmpuo_s4_f32: ++** mov (z[0-9]+\.s), s4 ++** fcmuo p0\.s, p1/z, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_COMPARE_ZD (cmpuo_s4_f32, svfloat32_t, float32_t, ++ p0 = svcmpuo_n_f32 (p1, z0, d4), ++ p0 = svcmpuo (p1, z0, d4)) ++ ++/* ++** cmpuo_0_f32: ++** mov (z[0-9]+\.s), #0 ++** fcmuo p0\.s, p1/z, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_COMPARE_Z (cmpuo_0_f32, svfloat32_t, ++ p0 = svcmpuo_n_f32 (p1, z0, 0), ++ p0 = svcmpuo (p1, z0, 0)) ++ ++/* ++** cmpuo_1_f32: ++** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? ++** fcmuo p0\.s, p1/z, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_COMPARE_Z (cmpuo_1_f32, svfloat32_t, ++ p0 = svcmpuo_n_f32 (p1, z0, 1), ++ p0 = svcmpuo (p1, z0, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpuo_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpuo_f64.c +new file mode 100644 +index 000000000..d7a71eca4 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpuo_f64.c +@@ -0,0 +1,51 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cmpuo_f64_tied: ++** fcmuo p0\.d, p0/z, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_COMPARE_Z (cmpuo_f64_tied, svfloat64_t, ++ p0 = svcmpuo_f64 (p0, z0, z1), ++ p0 = svcmpuo (p0, z0, z1)) ++ ++/* ++** cmpuo_f64_untied: ++** fcmuo p0\.d, p1/z, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_COMPARE_Z (cmpuo_f64_untied, svfloat64_t, ++ p0 = svcmpuo_f64 (p1, z0, z1), ++ p0 = svcmpuo (p1, z0, z1)) ++ ++/* ++** cmpuo_d4_f64: ++** mov (z[0-9]+\.d), d4 ++** fcmuo p0\.d, p1/z, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_COMPARE_ZD (cmpuo_d4_f64, svfloat64_t, float64_t, ++ p0 = svcmpuo_n_f64 (p1, z0, d4), ++ p0 = svcmpuo (p1, z0, d4)) ++ ++/* ++** cmpuo_0_f64: ++** mov (z[0-9]+\.d), #0 ++** fcmuo p0\.d, p1/z, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_COMPARE_Z (cmpuo_0_f64, svfloat64_t, ++ p0 = svcmpuo_n_f64 (p1, z0, 0), ++ p0 = svcmpuo (p1, z0, 0)) ++ ++/* ++** cmpuo_1_f64: ++** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? 
++** fcmuo p0\.d, p1/z, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_COMPARE_Z (cmpuo_1_f64, svfloat64_t, ++ p0 = svcmpuo_n_f64 (p1, z0, 1), ++ p0 = svcmpuo (p1, z0, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnot_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnot_s16.c +new file mode 100644 +index 000000000..19d46be68 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnot_s16.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cnot_s16_m_tied12: ++** cnot z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_s16_m_tied12, svint16_t, ++ z0 = svcnot_s16_m (z0, p0, z0), ++ z0 = svcnot_m (z0, p0, z0)) ++ ++/* ++** cnot_s16_m_tied1: ++** cnot z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_s16_m_tied1, svint16_t, ++ z0 = svcnot_s16_m (z0, p0, z1), ++ z0 = svcnot_m (z0, p0, z1)) ++ ++/* ++** cnot_s16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** cnot z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_s16_m_tied2, svint16_t, ++ z0 = svcnot_s16_m (z1, p0, z0), ++ z0 = svcnot_m (z1, p0, z0)) ++ ++/* ++** cnot_s16_m_untied: ++** movprfx z0, z2 ++** cnot z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_s16_m_untied, svint16_t, ++ z0 = svcnot_s16_m (z2, p0, z1), ++ z0 = svcnot_m (z2, p0, z1)) ++ ++/* ++** cnot_s16_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.h, p0/z, \1\.h ++** cnot z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_s16_z_tied1, svint16_t, ++ z0 = svcnot_s16_z (p0, z0), ++ z0 = svcnot_z (p0, z0)) ++ ++/* ++** cnot_s16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** cnot z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_s16_z_untied, svint16_t, ++ z0 = svcnot_s16_z (p0, z1), ++ z0 = svcnot_z (p0, z1)) ++ ++/* ++** cnot_s16_x_tied1: ++** cnot z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_s16_x_tied1, svint16_t, ++ z0 = svcnot_s16_x (p0, z0), ++ z0 = svcnot_x (p0, z0)) ++ ++/* ++** cnot_s16_x_untied: ++** cnot z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_s16_x_untied, svint16_t, ++ z0 = svcnot_s16_x (p0, z1), ++ z0 = svcnot_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnot_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnot_s32.c +new file mode 100644 +index 000000000..041b59a04 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnot_s32.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cnot_s32_m_tied12: ++** cnot z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_s32_m_tied12, svint32_t, ++ z0 = svcnot_s32_m (z0, p0, z0), ++ z0 = svcnot_m (z0, p0, z0)) ++ ++/* ++** cnot_s32_m_tied1: ++** cnot z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_s32_m_tied1, svint32_t, ++ z0 = svcnot_s32_m (z0, p0, z1), ++ z0 = svcnot_m (z0, p0, z1)) ++ ++/* ++** cnot_s32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** cnot z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_s32_m_tied2, svint32_t, ++ z0 = svcnot_s32_m (z1, p0, z0), ++ z0 = svcnot_m (z1, p0, z0)) ++ ++/* ++** cnot_s32_m_untied: ++** movprfx z0, z2 ++** cnot z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_s32_m_untied, svint32_t, ++ z0 = svcnot_s32_m (z2, p0, z1), ++ z0 = svcnot_m (z2, p0, z1)) ++ ++/* ++** cnot_s32_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, \1\.s ++** cnot z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_s32_z_tied1, 
svint32_t, ++ z0 = svcnot_s32_z (p0, z0), ++ z0 = svcnot_z (p0, z0)) ++ ++/* ++** cnot_s32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** cnot z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_s32_z_untied, svint32_t, ++ z0 = svcnot_s32_z (p0, z1), ++ z0 = svcnot_z (p0, z1)) ++ ++/* ++** cnot_s32_x_tied1: ++** cnot z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_s32_x_tied1, svint32_t, ++ z0 = svcnot_s32_x (p0, z0), ++ z0 = svcnot_x (p0, z0)) ++ ++/* ++** cnot_s32_x_untied: ++** cnot z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_s32_x_untied, svint32_t, ++ z0 = svcnot_s32_x (p0, z1), ++ z0 = svcnot_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnot_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnot_s64.c +new file mode 100644 +index 000000000..c7135cb95 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnot_s64.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cnot_s64_m_tied12: ++** cnot z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_s64_m_tied12, svint64_t, ++ z0 = svcnot_s64_m (z0, p0, z0), ++ z0 = svcnot_m (z0, p0, z0)) ++ ++/* ++** cnot_s64_m_tied1: ++** cnot z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_s64_m_tied1, svint64_t, ++ z0 = svcnot_s64_m (z0, p0, z1), ++ z0 = svcnot_m (z0, p0, z1)) ++ ++/* ++** cnot_s64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** cnot z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_s64_m_tied2, svint64_t, ++ z0 = svcnot_s64_m (z1, p0, z0), ++ z0 = svcnot_m (z1, p0, z0)) ++ ++/* ++** cnot_s64_m_untied: ++** movprfx z0, z2 ++** cnot z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_s64_m_untied, svint64_t, ++ z0 = svcnot_s64_m (z2, p0, z1), ++ z0 = svcnot_m (z2, p0, z1)) ++ ++/* ++** cnot_s64_z_tied1: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** cnot z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_s64_z_tied1, svint64_t, ++ z0 = svcnot_s64_z (p0, z0), ++ z0 = svcnot_z (p0, z0)) ++ ++/* ++** cnot_s64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** cnot z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_s64_z_untied, svint64_t, ++ z0 = svcnot_s64_z (p0, z1), ++ z0 = svcnot_z (p0, z1)) ++ ++/* ++** cnot_s64_x_tied1: ++** cnot z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_s64_x_tied1, svint64_t, ++ z0 = svcnot_s64_x (p0, z0), ++ z0 = svcnot_x (p0, z0)) ++ ++/* ++** cnot_s64_x_untied: ++** cnot z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_s64_x_untied, svint64_t, ++ z0 = svcnot_s64_x (p0, z1), ++ z0 = svcnot_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnot_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnot_s8.c +new file mode 100644 +index 000000000..0560f9751 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnot_s8.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cnot_s8_m_tied12: ++** cnot z0\.b, p0/m, z0\.b ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_s8_m_tied12, svint8_t, ++ z0 = svcnot_s8_m (z0, p0, z0), ++ z0 = svcnot_m (z0, p0, z0)) ++ ++/* ++** cnot_s8_m_tied1: ++** cnot z0\.b, p0/m, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_s8_m_tied1, svint8_t, ++ z0 = svcnot_s8_m (z0, p0, z1), ++ z0 = svcnot_m (z0, p0, z1)) ++ ++/* ++** cnot_s8_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** cnot z0\.b, p0/m, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_s8_m_tied2, svint8_t, 
++ z0 = svcnot_s8_m (z1, p0, z0), ++ z0 = svcnot_m (z1, p0, z0)) ++ ++/* ++** cnot_s8_m_untied: ++** movprfx z0, z2 ++** cnot z0\.b, p0/m, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_s8_m_untied, svint8_t, ++ z0 = svcnot_s8_m (z2, p0, z1), ++ z0 = svcnot_m (z2, p0, z1)) ++ ++/* ++** cnot_s8_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.b, p0/z, \1\.b ++** cnot z0\.b, p0/m, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_s8_z_tied1, svint8_t, ++ z0 = svcnot_s8_z (p0, z0), ++ z0 = svcnot_z (p0, z0)) ++ ++/* ++** cnot_s8_z_untied: ++** movprfx z0\.b, p0/z, z1\.b ++** cnot z0\.b, p0/m, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_s8_z_untied, svint8_t, ++ z0 = svcnot_s8_z (p0, z1), ++ z0 = svcnot_z (p0, z1)) ++ ++/* ++** cnot_s8_x_tied1: ++** cnot z0\.b, p0/m, z0\.b ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_s8_x_tied1, svint8_t, ++ z0 = svcnot_s8_x (p0, z0), ++ z0 = svcnot_x (p0, z0)) ++ ++/* ++** cnot_s8_x_untied: ++** cnot z0\.b, p0/m, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_s8_x_untied, svint8_t, ++ z0 = svcnot_s8_x (p0, z1), ++ z0 = svcnot_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnot_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnot_u16.c +new file mode 100644 +index 000000000..7ea9ff71d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnot_u16.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cnot_u16_m_tied12: ++** cnot z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_u16_m_tied12, svuint16_t, ++ z0 = svcnot_u16_m (z0, p0, z0), ++ z0 = svcnot_m (z0, p0, z0)) ++ ++/* ++** cnot_u16_m_tied1: ++** cnot z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_u16_m_tied1, svuint16_t, ++ z0 = svcnot_u16_m (z0, p0, z1), ++ z0 = svcnot_m (z0, p0, z1)) ++ ++/* ++** cnot_u16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** cnot z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_u16_m_tied2, svuint16_t, ++ z0 = svcnot_u16_m (z1, p0, z0), ++ z0 = svcnot_m (z1, p0, z0)) ++ ++/* ++** cnot_u16_m_untied: ++** movprfx z0, z2 ++** cnot z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_u16_m_untied, svuint16_t, ++ z0 = svcnot_u16_m (z2, p0, z1), ++ z0 = svcnot_m (z2, p0, z1)) ++ ++/* ++** cnot_u16_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.h, p0/z, \1\.h ++** cnot z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_u16_z_tied1, svuint16_t, ++ z0 = svcnot_u16_z (p0, z0), ++ z0 = svcnot_z (p0, z0)) ++ ++/* ++** cnot_u16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** cnot z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_u16_z_untied, svuint16_t, ++ z0 = svcnot_u16_z (p0, z1), ++ z0 = svcnot_z (p0, z1)) ++ ++/* ++** cnot_u16_x_tied1: ++** cnot z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_u16_x_tied1, svuint16_t, ++ z0 = svcnot_u16_x (p0, z0), ++ z0 = svcnot_x (p0, z0)) ++ ++/* ++** cnot_u16_x_untied: ++** cnot z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_u16_x_untied, svuint16_t, ++ z0 = svcnot_u16_x (p0, z1), ++ z0 = svcnot_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnot_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnot_u32.c +new file mode 100644 +index 000000000..972c7751e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnot_u32.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cnot_u32_m_tied12: ++** cnot z0\.s, p0/m, z0\.s ++** ret ++*/ 
++TEST_UNIFORM_Z (cnot_u32_m_tied12, svuint32_t, ++ z0 = svcnot_u32_m (z0, p0, z0), ++ z0 = svcnot_m (z0, p0, z0)) ++ ++/* ++** cnot_u32_m_tied1: ++** cnot z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_u32_m_tied1, svuint32_t, ++ z0 = svcnot_u32_m (z0, p0, z1), ++ z0 = svcnot_m (z0, p0, z1)) ++ ++/* ++** cnot_u32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** cnot z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_u32_m_tied2, svuint32_t, ++ z0 = svcnot_u32_m (z1, p0, z0), ++ z0 = svcnot_m (z1, p0, z0)) ++ ++/* ++** cnot_u32_m_untied: ++** movprfx z0, z2 ++** cnot z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_u32_m_untied, svuint32_t, ++ z0 = svcnot_u32_m (z2, p0, z1), ++ z0 = svcnot_m (z2, p0, z1)) ++ ++/* ++** cnot_u32_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, \1\.s ++** cnot z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_u32_z_tied1, svuint32_t, ++ z0 = svcnot_u32_z (p0, z0), ++ z0 = svcnot_z (p0, z0)) ++ ++/* ++** cnot_u32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** cnot z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_u32_z_untied, svuint32_t, ++ z0 = svcnot_u32_z (p0, z1), ++ z0 = svcnot_z (p0, z1)) ++ ++/* ++** cnot_u32_x_tied1: ++** cnot z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_u32_x_tied1, svuint32_t, ++ z0 = svcnot_u32_x (p0, z0), ++ z0 = svcnot_x (p0, z0)) ++ ++/* ++** cnot_u32_x_untied: ++** cnot z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_u32_x_untied, svuint32_t, ++ z0 = svcnot_u32_x (p0, z1), ++ z0 = svcnot_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnot_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnot_u64.c +new file mode 100644 +index 000000000..f25e001c5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnot_u64.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cnot_u64_m_tied12: ++** cnot z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_u64_m_tied12, svuint64_t, ++ z0 = svcnot_u64_m (z0, p0, z0), ++ z0 = svcnot_m (z0, p0, z0)) ++ ++/* ++** cnot_u64_m_tied1: ++** cnot z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_u64_m_tied1, svuint64_t, ++ z0 = svcnot_u64_m (z0, p0, z1), ++ z0 = svcnot_m (z0, p0, z1)) ++ ++/* ++** cnot_u64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** cnot z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_u64_m_tied2, svuint64_t, ++ z0 = svcnot_u64_m (z1, p0, z0), ++ z0 = svcnot_m (z1, p0, z0)) ++ ++/* ++** cnot_u64_m_untied: ++** movprfx z0, z2 ++** cnot z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_u64_m_untied, svuint64_t, ++ z0 = svcnot_u64_m (z2, p0, z1), ++ z0 = svcnot_m (z2, p0, z1)) ++ ++/* ++** cnot_u64_z_tied1: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** cnot z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_u64_z_tied1, svuint64_t, ++ z0 = svcnot_u64_z (p0, z0), ++ z0 = svcnot_z (p0, z0)) ++ ++/* ++** cnot_u64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** cnot z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_u64_z_untied, svuint64_t, ++ z0 = svcnot_u64_z (p0, z1), ++ z0 = svcnot_z (p0, z1)) ++ ++/* ++** cnot_u64_x_tied1: ++** cnot z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_u64_x_tied1, svuint64_t, ++ z0 = svcnot_u64_x (p0, z0), ++ z0 = svcnot_x (p0, z0)) ++ ++/* ++** cnot_u64_x_untied: ++** cnot z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_u64_x_untied, svuint64_t, ++ z0 = svcnot_u64_x (p0, z1), ++ 
z0 = svcnot_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnot_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnot_u8.c +new file mode 100644 +index 000000000..e135a7295 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnot_u8.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cnot_u8_m_tied12: ++** cnot z0\.b, p0/m, z0\.b ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_u8_m_tied12, svuint8_t, ++ z0 = svcnot_u8_m (z0, p0, z0), ++ z0 = svcnot_m (z0, p0, z0)) ++ ++/* ++** cnot_u8_m_tied1: ++** cnot z0\.b, p0/m, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_u8_m_tied1, svuint8_t, ++ z0 = svcnot_u8_m (z0, p0, z1), ++ z0 = svcnot_m (z0, p0, z1)) ++ ++/* ++** cnot_u8_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** cnot z0\.b, p0/m, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_u8_m_tied2, svuint8_t, ++ z0 = svcnot_u8_m (z1, p0, z0), ++ z0 = svcnot_m (z1, p0, z0)) ++ ++/* ++** cnot_u8_m_untied: ++** movprfx z0, z2 ++** cnot z0\.b, p0/m, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_u8_m_untied, svuint8_t, ++ z0 = svcnot_u8_m (z2, p0, z1), ++ z0 = svcnot_m (z2, p0, z1)) ++ ++/* ++** cnot_u8_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.b, p0/z, \1\.b ++** cnot z0\.b, p0/m, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_u8_z_tied1, svuint8_t, ++ z0 = svcnot_u8_z (p0, z0), ++ z0 = svcnot_z (p0, z0)) ++ ++/* ++** cnot_u8_z_untied: ++** movprfx z0\.b, p0/z, z1\.b ++** cnot z0\.b, p0/m, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_u8_z_untied, svuint8_t, ++ z0 = svcnot_u8_z (p0, z1), ++ z0 = svcnot_z (p0, z1)) ++ ++/* ++** cnot_u8_x_tied1: ++** cnot z0\.b, p0/m, z0\.b ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_u8_x_tied1, svuint8_t, ++ z0 = svcnot_u8_x (p0, z0), ++ z0 = svcnot_x (p0, z0)) ++ ++/* ++** cnot_u8_x_untied: ++** cnot z0\.b, p0/m, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (cnot_u8_x_untied, svuint8_t, ++ z0 = svcnot_u8_x (p0, z1), ++ z0 = svcnot_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_bf16.c +new file mode 100644 +index 000000000..d92fbc157 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_bf16.c +@@ -0,0 +1,52 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cnt_bf16_m_tied1: ++** cnt z0\.h, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cnt_bf16_m_tied1, svuint16_t, svbfloat16_t, ++ z0 = svcnt_bf16_m (z0, p0, z4), ++ z0 = svcnt_m (z0, p0, z4)) ++ ++/* ++** cnt_bf16_m_untied: ++** movprfx z0, z1 ++** cnt z0\.h, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cnt_bf16_m_untied, svuint16_t, svbfloat16_t, ++ z0 = svcnt_bf16_m (z1, p0, z4), ++ z0 = svcnt_m (z1, p0, z4)) ++ ++/* ++** cnt_bf16_z: ++** movprfx z0\.h, p0/z, z4\.h ++** cnt z0\.h, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cnt_bf16_z, svuint16_t, svbfloat16_t, ++ z0 = svcnt_bf16_z (p0, z4), ++ z0 = svcnt_z (p0, z4)) ++ ++/* ++** cnt_bf16_x: ++** cnt z0\.h, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cnt_bf16_x, svuint16_t, svbfloat16_t, ++ z0 = svcnt_bf16_x (p0, z4), ++ z0 = svcnt_x (p0, z4)) ++ ++/* ++** ptrue_cnt_bf16_x: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_DUAL_Z (ptrue_cnt_bf16_x, svuint16_t, svbfloat16_t, ++ z0 = svcnt_bf16_x (svptrue_b16 (), z4), ++ z0 = svcnt_x (svptrue_b16 (), z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_f16.c +new file mode 100644 +index 000000000..b8061bb80 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_f16.c +@@ -0,0 +1,52 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cnt_f16_m_tied1: ++** cnt z0\.h, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cnt_f16_m_tied1, svuint16_t, svfloat16_t, ++ z0 = svcnt_f16_m (z0, p0, z4), ++ z0 = svcnt_m (z0, p0, z4)) ++ ++/* ++** cnt_f16_m_untied: ++** movprfx z0, z1 ++** cnt z0\.h, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cnt_f16_m_untied, svuint16_t, svfloat16_t, ++ z0 = svcnt_f16_m (z1, p0, z4), ++ z0 = svcnt_m (z1, p0, z4)) ++ ++/* ++** cnt_f16_z: ++** movprfx z0\.h, p0/z, z4\.h ++** cnt z0\.h, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cnt_f16_z, svuint16_t, svfloat16_t, ++ z0 = svcnt_f16_z (p0, z4), ++ z0 = svcnt_z (p0, z4)) ++ ++/* ++** cnt_f16_x: ++** cnt z0\.h, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cnt_f16_x, svuint16_t, svfloat16_t, ++ z0 = svcnt_f16_x (p0, z4), ++ z0 = svcnt_x (p0, z4)) ++ ++/* ++** ptrue_cnt_f16_x: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z (ptrue_cnt_f16_x, svuint16_t, svfloat16_t, ++ z0 = svcnt_f16_x (svptrue_b16 (), z4), ++ z0 = svcnt_x (svptrue_b16 (), z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_f32.c +new file mode 100644 +index 000000000..b9292c977 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_f32.c +@@ -0,0 +1,52 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cnt_f32_m_tied1: ++** cnt z0\.s, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cnt_f32_m_tied1, svuint32_t, svfloat32_t, ++ z0 = svcnt_f32_m (z0, p0, z4), ++ z0 = svcnt_m (z0, p0, z4)) ++ ++/* ++** cnt_f32_m_untied: ++** movprfx z0, z1 ++** cnt z0\.s, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cnt_f32_m_untied, svuint32_t, svfloat32_t, ++ z0 = svcnt_f32_m (z1, p0, z4), ++ z0 = svcnt_m (z1, p0, z4)) ++ ++/* ++** cnt_f32_z: ++** movprfx z0\.s, p0/z, z4\.s ++** cnt z0\.s, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cnt_f32_z, svuint32_t, svfloat32_t, ++ z0 = svcnt_f32_z (p0, z4), ++ z0 = svcnt_z (p0, z4)) ++ ++/* ++** cnt_f32_x: ++** cnt z0\.s, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cnt_f32_x, svuint32_t, svfloat32_t, ++ z0 = svcnt_f32_x (p0, z4), ++ z0 = svcnt_x (p0, z4)) ++ ++/* ++** ptrue_cnt_f32_x: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_DUAL_Z (ptrue_cnt_f32_x, svuint32_t, svfloat32_t, ++ z0 = svcnt_f32_x (svptrue_b32 (), z4), ++ z0 = svcnt_x (svptrue_b32 (), z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_f64.c +new file mode 100644 +index 000000000..4976ee467 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_f64.c +@@ -0,0 +1,52 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cnt_f64_m_tied1: ++** cnt z0\.d, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cnt_f64_m_tied1, svuint64_t, svfloat64_t, ++ z0 = svcnt_f64_m (z0, p0, z4), ++ z0 = svcnt_m (z0, p0, z4)) ++ ++/* ++** cnt_f64_m_untied: ++** movprfx z0, z1 ++** cnt z0\.d, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cnt_f64_m_untied, svuint64_t, svfloat64_t, ++ z0 = svcnt_f64_m (z1, p0, z4), ++ z0 = svcnt_m (z1, p0, z4)) ++ ++/* ++** cnt_f64_z: ++** movprfx z0\.d, p0/z, z4\.d ++** cnt z0\.d, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cnt_f64_z, svuint64_t, svfloat64_t, ++ z0 = svcnt_f64_z (p0, z4), ++ z0 = svcnt_z (p0, z4)) ++ ++/* ++** cnt_f64_x: ++** cnt z0\.d, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cnt_f64_x, svuint64_t, svfloat64_t, ++ z0 = svcnt_f64_x (p0, z4), ++ z0 = svcnt_x (p0, z4)) ++ ++/* ++** ptrue_cnt_f64_x: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z (ptrue_cnt_f64_x, svuint64_t, svfloat64_t, ++ z0 = svcnt_f64_x (svptrue_b64 (), z4), ++ z0 = svcnt_x (svptrue_b64 (), z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_s16.c +new file mode 100644 +index 000000000..a8ff8f3d2 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_s16.c +@@ -0,0 +1,41 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cnt_s16_m_tied1: ++** cnt z0\.h, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cnt_s16_m_tied1, svuint16_t, svint16_t, ++ z0 = svcnt_s16_m (z0, p0, z4), ++ z0 = svcnt_m (z0, p0, z4)) ++ ++/* ++** cnt_s16_m_untied: ++** movprfx z0, z1 ++** cnt z0\.h, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cnt_s16_m_untied, svuint16_t, svint16_t, ++ z0 = svcnt_s16_m (z1, p0, z4), ++ z0 = svcnt_m (z1, p0, z4)) ++ ++/* ++** cnt_s16_z: ++** movprfx z0\.h, p0/z, z4\.h ++** cnt z0\.h, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cnt_s16_z, svuint16_t, svint16_t, ++ z0 = svcnt_s16_z (p0, z4), ++ z0 = svcnt_z (p0, z4)) ++ ++/* ++** cnt_s16_x: ++** cnt z0\.h, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cnt_s16_x, svuint16_t, svint16_t, ++ z0 = svcnt_s16_x (p0, z4), ++ z0 = svcnt_x (p0, z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_s32.c +new file mode 100644 +index 000000000..3d16041f2 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_s32.c +@@ -0,0 +1,41 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cnt_s32_m_tied1: ++** cnt z0\.s, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cnt_s32_m_tied1, svuint32_t, svint32_t, ++ z0 = svcnt_s32_m (z0, p0, z4), ++ z0 = svcnt_m (z0, p0, z4)) ++ ++/* ++** cnt_s32_m_untied: ++** movprfx z0, z1 ++** cnt z0\.s, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cnt_s32_m_untied, svuint32_t, svint32_t, ++ z0 = svcnt_s32_m (z1, p0, z4), ++ z0 = svcnt_m (z1, p0, z4)) ++ ++/* ++** cnt_s32_z: ++** movprfx z0\.s, p0/z, z4\.s ++** cnt z0\.s, p0/m, z4\.s 
++** ret ++*/ ++TEST_DUAL_Z (cnt_s32_z, svuint32_t, svint32_t, ++ z0 = svcnt_s32_z (p0, z4), ++ z0 = svcnt_z (p0, z4)) ++ ++/* ++** cnt_s32_x: ++** cnt z0\.s, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cnt_s32_x, svuint32_t, svint32_t, ++ z0 = svcnt_s32_x (p0, z4), ++ z0 = svcnt_x (p0, z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_s64.c +new file mode 100644 +index 000000000..8c8871ba5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_s64.c +@@ -0,0 +1,41 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cnt_s64_m_tied1: ++** cnt z0\.d, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cnt_s64_m_tied1, svuint64_t, svint64_t, ++ z0 = svcnt_s64_m (z0, p0, z4), ++ z0 = svcnt_m (z0, p0, z4)) ++ ++/* ++** cnt_s64_m_untied: ++** movprfx z0, z1 ++** cnt z0\.d, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cnt_s64_m_untied, svuint64_t, svint64_t, ++ z0 = svcnt_s64_m (z1, p0, z4), ++ z0 = svcnt_m (z1, p0, z4)) ++ ++/* ++** cnt_s64_z: ++** movprfx z0\.d, p0/z, z4\.d ++** cnt z0\.d, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cnt_s64_z, svuint64_t, svint64_t, ++ z0 = svcnt_s64_z (p0, z4), ++ z0 = svcnt_z (p0, z4)) ++ ++/* ++** cnt_s64_x: ++** cnt z0\.d, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cnt_s64_x, svuint64_t, svint64_t, ++ z0 = svcnt_s64_x (p0, z4), ++ z0 = svcnt_x (p0, z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_s8.c +new file mode 100644 +index 000000000..8d85c8e51 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_s8.c +@@ -0,0 +1,41 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cnt_s8_m_tied1: ++** cnt z0\.b, p0/m, z4\.b ++** ret ++*/ ++TEST_DUAL_Z (cnt_s8_m_tied1, svuint8_t, svint8_t, ++ z0 = svcnt_s8_m (z0, p0, z4), ++ z0 = svcnt_m (z0, p0, z4)) ++ ++/* ++** cnt_s8_m_untied: ++** movprfx z0, z1 ++** cnt z0\.b, p0/m, z4\.b ++** ret ++*/ ++TEST_DUAL_Z (cnt_s8_m_untied, svuint8_t, svint8_t, ++ z0 = svcnt_s8_m (z1, p0, z4), ++ z0 = svcnt_m (z1, p0, z4)) ++ ++/* ++** cnt_s8_z: ++** movprfx z0\.b, p0/z, z4\.b ++** cnt z0\.b, p0/m, z4\.b ++** ret ++*/ ++TEST_DUAL_Z (cnt_s8_z, svuint8_t, svint8_t, ++ z0 = svcnt_s8_z (p0, z4), ++ z0 = svcnt_z (p0, z4)) ++ ++/* ++** cnt_s8_x: ++** cnt z0\.b, p0/m, z4\.b ++** ret ++*/ ++TEST_DUAL_Z (cnt_s8_x, svuint8_t, svint8_t, ++ z0 = svcnt_s8_x (p0, z4), ++ z0 = svcnt_x (p0, z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_u16.c +new file mode 100644 +index 000000000..f173d3108 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_u16.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cnt_u16_m_tied12: ++** cnt z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (cnt_u16_m_tied12, svuint16_t, ++ z0 = svcnt_u16_m (z0, p0, z0), ++ z0 = svcnt_m (z0, p0, z0)) ++ ++/* ++** cnt_u16_m_tied1: ++** cnt z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (cnt_u16_m_tied1, svuint16_t, ++ z0 = svcnt_u16_m (z0, p0, z1), ++ z0 = svcnt_m (z0, p0, z1)) ++ ++/* ++** cnt_u16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** cnt z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (cnt_u16_m_tied2, svuint16_t, ++ z0 = svcnt_u16_m (z1, p0, z0), ++ z0 = svcnt_m (z1, p0, z0)) ++ 
++/* ++** cnt_u16_m_untied: ++** movprfx z0, z2 ++** cnt z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (cnt_u16_m_untied, svuint16_t, ++ z0 = svcnt_u16_m (z2, p0, z1), ++ z0 = svcnt_m (z2, p0, z1)) ++ ++/* ++** cnt_u16_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.h, p0/z, \1\.h ++** cnt z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (cnt_u16_z_tied1, svuint16_t, ++ z0 = svcnt_u16_z (p0, z0), ++ z0 = svcnt_z (p0, z0)) ++ ++/* ++** cnt_u16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** cnt z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (cnt_u16_z_untied, svuint16_t, ++ z0 = svcnt_u16_z (p0, z1), ++ z0 = svcnt_z (p0, z1)) ++ ++/* ++** cnt_u16_x_tied1: ++** cnt z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (cnt_u16_x_tied1, svuint16_t, ++ z0 = svcnt_u16_x (p0, z0), ++ z0 = svcnt_x (p0, z0)) ++ ++/* ++** cnt_u16_x_untied: ++** cnt z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (cnt_u16_x_untied, svuint16_t, ++ z0 = svcnt_u16_x (p0, z1), ++ z0 = svcnt_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_u32.c +new file mode 100644 +index 000000000..11969a6b6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_u32.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cnt_u32_m_tied12: ++** cnt z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (cnt_u32_m_tied12, svuint32_t, ++ z0 = svcnt_u32_m (z0, p0, z0), ++ z0 = svcnt_m (z0, p0, z0)) ++ ++/* ++** cnt_u32_m_tied1: ++** cnt z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (cnt_u32_m_tied1, svuint32_t, ++ z0 = svcnt_u32_m (z0, p0, z1), ++ z0 = svcnt_m (z0, p0, z1)) ++ ++/* ++** cnt_u32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** cnt z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (cnt_u32_m_tied2, svuint32_t, ++ z0 = svcnt_u32_m (z1, p0, z0), ++ z0 = svcnt_m (z1, p0, z0)) ++ ++/* ++** cnt_u32_m_untied: ++** movprfx z0, z2 ++** cnt z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (cnt_u32_m_untied, svuint32_t, ++ z0 = svcnt_u32_m (z2, p0, z1), ++ z0 = svcnt_m (z2, p0, z1)) ++ ++/* ++** cnt_u32_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, \1\.s ++** cnt z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (cnt_u32_z_tied1, svuint32_t, ++ z0 = svcnt_u32_z (p0, z0), ++ z0 = svcnt_z (p0, z0)) ++ ++/* ++** cnt_u32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** cnt z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (cnt_u32_z_untied, svuint32_t, ++ z0 = svcnt_u32_z (p0, z1), ++ z0 = svcnt_z (p0, z1)) ++ ++/* ++** cnt_u32_x_tied1: ++** cnt z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (cnt_u32_x_tied1, svuint32_t, ++ z0 = svcnt_u32_x (p0, z0), ++ z0 = svcnt_x (p0, z0)) ++ ++/* ++** cnt_u32_x_untied: ++** cnt z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (cnt_u32_x_untied, svuint32_t, ++ z0 = svcnt_u32_x (p0, z1), ++ z0 = svcnt_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_u64.c +new file mode 100644 +index 000000000..4eb69ea84 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_u64.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cnt_u64_m_tied12: ++** cnt z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (cnt_u64_m_tied12, svuint64_t, ++ z0 = svcnt_u64_m (z0, p0, z0), ++ z0 = svcnt_m (z0, p0, z0)) ++ ++/* ++** 
cnt_u64_m_tied1: ++** cnt z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (cnt_u64_m_tied1, svuint64_t, ++ z0 = svcnt_u64_m (z0, p0, z1), ++ z0 = svcnt_m (z0, p0, z1)) ++ ++/* ++** cnt_u64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** cnt z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (cnt_u64_m_tied2, svuint64_t, ++ z0 = svcnt_u64_m (z1, p0, z0), ++ z0 = svcnt_m (z1, p0, z0)) ++ ++/* ++** cnt_u64_m_untied: ++** movprfx z0, z2 ++** cnt z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (cnt_u64_m_untied, svuint64_t, ++ z0 = svcnt_u64_m (z2, p0, z1), ++ z0 = svcnt_m (z2, p0, z1)) ++ ++/* ++** cnt_u64_z_tied1: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** cnt z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (cnt_u64_z_tied1, svuint64_t, ++ z0 = svcnt_u64_z (p0, z0), ++ z0 = svcnt_z (p0, z0)) ++ ++/* ++** cnt_u64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** cnt z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (cnt_u64_z_untied, svuint64_t, ++ z0 = svcnt_u64_z (p0, z1), ++ z0 = svcnt_z (p0, z1)) ++ ++/* ++** cnt_u64_x_tied1: ++** cnt z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (cnt_u64_x_tied1, svuint64_t, ++ z0 = svcnt_u64_x (p0, z0), ++ z0 = svcnt_x (p0, z0)) ++ ++/* ++** cnt_u64_x_untied: ++** cnt z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (cnt_u64_x_untied, svuint64_t, ++ z0 = svcnt_u64_x (p0, z1), ++ z0 = svcnt_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_u8.c +new file mode 100644 +index 000000000..30e798302 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_u8.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cnt_u8_m_tied12: ++** cnt z0\.b, p0/m, z0\.b ++** ret ++*/ ++TEST_UNIFORM_Z (cnt_u8_m_tied12, svuint8_t, ++ z0 = svcnt_u8_m (z0, p0, z0), ++ z0 = svcnt_m (z0, p0, z0)) ++ ++/* ++** cnt_u8_m_tied1: ++** cnt z0\.b, p0/m, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (cnt_u8_m_tied1, svuint8_t, ++ z0 = svcnt_u8_m (z0, p0, z1), ++ z0 = svcnt_m (z0, p0, z1)) ++ ++/* ++** cnt_u8_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** cnt z0\.b, p0/m, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (cnt_u8_m_tied2, svuint8_t, ++ z0 = svcnt_u8_m (z1, p0, z0), ++ z0 = svcnt_m (z1, p0, z0)) ++ ++/* ++** cnt_u8_m_untied: ++** movprfx z0, z2 ++** cnt z0\.b, p0/m, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (cnt_u8_m_untied, svuint8_t, ++ z0 = svcnt_u8_m (z2, p0, z1), ++ z0 = svcnt_m (z2, p0, z1)) ++ ++/* ++** cnt_u8_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.b, p0/z, \1\.b ++** cnt z0\.b, p0/m, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (cnt_u8_z_tied1, svuint8_t, ++ z0 = svcnt_u8_z (p0, z0), ++ z0 = svcnt_z (p0, z0)) ++ ++/* ++** cnt_u8_z_untied: ++** movprfx z0\.b, p0/z, z1\.b ++** cnt z0\.b, p0/m, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (cnt_u8_z_untied, svuint8_t, ++ z0 = svcnt_u8_z (p0, z1), ++ z0 = svcnt_z (p0, z1)) ++ ++/* ++** cnt_u8_x_tied1: ++** cnt z0\.b, p0/m, z0\.b ++** ret ++*/ ++TEST_UNIFORM_Z (cnt_u8_x_tied1, svuint8_t, ++ z0 = svcnt_u8_x (p0, z0), ++ z0 = svcnt_x (p0, z0)) ++ ++/* ++** cnt_u8_x_untied: ++** cnt z0\.b, p0/m, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (cnt_u8_x_untied, svuint8_t, ++ z0 = svcnt_u8_x (p0, z1), ++ z0 = svcnt_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntb.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntb.c +new file mode 100644 +index 000000000..8b8fe8e4f +--- /dev/null ++++ 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntb.c +@@ -0,0 +1,280 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cntb_1: ++** cntb x0 ++** ret ++*/ ++PROTO (cntb_1, uint64_t, ()) { return svcntb (); } ++ ++/* ++** cntb_2: ++** cntb x0, all, mul #2 ++** ret ++*/ ++PROTO (cntb_2, uint64_t, ()) { return svcntb () * 2; } ++ ++/* ++** cntb_3: ++** cntb x0, all, mul #3 ++** ret ++*/ ++PROTO (cntb_3, uint64_t, ()) { return svcntb () * 3; } ++ ++/* ++** cntb_4: ++** cntb x0, all, mul #4 ++** ret ++*/ ++PROTO (cntb_4, uint64_t, ()) { return svcntb () * 4; } ++ ++/* ++** cntb_8: ++** cntb x0, all, mul #8 ++** ret ++*/ ++PROTO (cntb_8, uint64_t, ()) { return svcntb () * 8; } ++ ++/* ++** cntb_15: ++** cntb x0, all, mul #15 ++** ret ++*/ ++PROTO (cntb_15, uint64_t, ()) { return svcntb () * 15; } ++ ++/* ++** cntb_16: ++** cntb x0, all, mul #16 ++** ret ++*/ ++PROTO (cntb_16, uint64_t, ()) { return svcntb () * 16; } ++ ++/* Other sequences would be OK. */ ++/* ++** cntb_17: ++** cntb x0, all, mul #16 ++** incb x0 ++** ret ++*/ ++PROTO (cntb_17, uint64_t, ()) { return svcntb () * 17; } ++ ++/* ++** cntb_32: ++** cntd (x[0-9]+) ++** lsl x0, \1, 8 ++** ret ++*/ ++PROTO (cntb_32, uint64_t, ()) { return svcntb () * 32; } ++ ++/* Other sequences would be OK. */ ++/* ++** cntb_33: ++** cntb (x[0-9]+) ++** lsl x0, \1, 5 ++** incb x0 ++** ret ++*/ ++PROTO (cntb_33, uint64_t, ()) { return svcntb () * 33; } ++ ++/* ++** cntb_64: ++** cntd (x[0-9]+) ++** lsl x0, \1, 9 ++** ret ++*/ ++PROTO (cntb_64, uint64_t, ()) { return svcntb () * 64; } ++ ++/* ++** cntb_128: ++** cntd (x[0-9]+) ++** lsl x0, \1, 10 ++** ret ++*/ ++PROTO (cntb_128, uint64_t, ()) { return svcntb () * 128; } ++ ++/* Other sequences would be OK. */ ++/* ++** cntb_129: ++** cntb (x[0-9]+) ++** lsl x0, \1, 7 ++** incb x0 ++** ret ++*/ ++PROTO (cntb_129, uint64_t, ()) { return svcntb () * 129; } ++ ++/* ++** cntb_m1: ++** cntb (x[0-9]+) ++** neg x0, \1 ++** ret ++*/ ++PROTO (cntb_m1, uint64_t, ()) { return -svcntb (); } ++ ++/* ++** cntb_m13: ++** cntb (x[0-9]+), all, mul #13 ++** neg x0, \1 ++** ret ++*/ ++PROTO (cntb_m13, uint64_t, ()) { return -svcntb () * 13; } ++ ++/* ++** cntb_m15: ++** cntb (x[0-9]+), all, mul #15 ++** neg x0, \1 ++** ret ++*/ ++PROTO (cntb_m15, uint64_t, ()) { return -svcntb () * 15; } ++ ++/* ++** cntb_m16: ++** cntb (x[0-9]+), all, mul #16 ++** neg x0, \1 ++** ret ++*/ ++PROTO (cntb_m16, uint64_t, ()) { return -svcntb () * 16; } ++ ++/* Other sequences would be OK. 
*/ ++/* ++** cntb_m17: ++** cntb x0, all, mul #16 ++** incb x0 ++** neg x0, x0 ++** ret ++*/ ++PROTO (cntb_m17, uint64_t, ()) { return -svcntb () * 17; } ++ ++/* ++** incb_1: ++** incb x0 ++** ret ++*/ ++PROTO (incb_1, uint64_t, (uint64_t x0)) { return x0 + svcntb (); } ++ ++/* ++** incb_2: ++** incb x0, all, mul #2 ++** ret ++*/ ++PROTO (incb_2, uint64_t, (uint64_t x0)) { return x0 + svcntb () * 2; } ++ ++/* ++** incb_3: ++** incb x0, all, mul #3 ++** ret ++*/ ++PROTO (incb_3, uint64_t, (uint64_t x0)) { return x0 + svcntb () * 3; } ++ ++/* ++** incb_4: ++** incb x0, all, mul #4 ++** ret ++*/ ++PROTO (incb_4, uint64_t, (uint64_t x0)) { return x0 + svcntb () * 4; } ++ ++/* ++** incb_8: ++** incb x0, all, mul #8 ++** ret ++*/ ++PROTO (incb_8, uint64_t, (uint64_t x0)) { return x0 + svcntb () * 8; } ++ ++/* ++** incb_15: ++** incb x0, all, mul #15 ++** ret ++*/ ++PROTO (incb_15, uint64_t, (uint64_t x0)) { return x0 + svcntb () * 15; } ++ ++/* ++** incb_16: ++** incb x0, all, mul #16 ++** ret ++*/ ++PROTO (incb_16, uint64_t, (uint64_t x0)) { return x0 + svcntb () * 16; } ++ ++/* ++** incb_17: ++** addvl x0, x0, #17 ++** ret ++*/ ++PROTO (incb_17, uint64_t, (uint64_t x0)) { return x0 + svcntb () * 17; } ++ ++/* ++** incb_31: ++** addvl x0, x0, #31 ++** ret ++*/ ++PROTO (incb_31, uint64_t, (uint64_t x0)) { return x0 + svcntb () * 31; } ++ ++/* ++** decb_1: ++** decb x0 ++** ret ++*/ ++PROTO (decb_1, uint64_t, (uint64_t x0)) { return x0 - svcntb (); } ++ ++/* ++** decb_2: ++** decb x0, all, mul #2 ++** ret ++*/ ++PROTO (decb_2, uint64_t, (uint64_t x0)) { return x0 - svcntb () * 2; } ++ ++/* ++** decb_3: ++** decb x0, all, mul #3 ++** ret ++*/ ++PROTO (decb_3, uint64_t, (uint64_t x0)) { return x0 - svcntb () * 3; } ++ ++/* ++** decb_4: ++** decb x0, all, mul #4 ++** ret ++*/ ++PROTO (decb_4, uint64_t, (uint64_t x0)) { return x0 - svcntb () * 4; } ++ ++/* ++** decb_8: ++** decb x0, all, mul #8 ++** ret ++*/ ++PROTO (decb_8, uint64_t, (uint64_t x0)) { return x0 - svcntb () * 8; } ++ ++/* ++** decb_15: ++** decb x0, all, mul #15 ++** ret ++*/ ++PROTO (decb_15, uint64_t, (uint64_t x0)) { return x0 - svcntb () * 15; } ++ ++/* ++** decb_16: ++** decb x0, all, mul #16 ++** ret ++*/ ++PROTO (decb_16, uint64_t, (uint64_t x0)) { return x0 - svcntb () * 16; } ++ ++/* ++** decb_17: ++** addvl x0, x0, #-17 ++** ret ++*/ ++PROTO (decb_17, uint64_t, (uint64_t x0)) { return x0 - svcntb () * 17; } ++ ++/* ++** decb_31: ++** addvl x0, x0, #-31 ++** ret ++*/ ++PROTO (decb_31, uint64_t, (uint64_t x0)) { return x0 - svcntb () * 31; } ++ ++/* ++** decb_32: ++** addvl x0, x0, #-32 ++** ret ++*/ ++PROTO (decb_32, uint64_t, (uint64_t x0)) { return x0 - svcntb () * 32; } +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntb_pat.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntb_pat.c +new file mode 100644 +index 000000000..effc5668d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntb_pat.c +@@ -0,0 +1,432 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cntb_pow2: ++** cntb x0, pow2 ++** ret ++*/ ++PROTO (cntb_pow2, uint64_t, ()) { return svcntb_pat (SV_POW2); } ++ ++/* ++** cntb_vl1: ++** mov x0, #?1 ++** ret ++*/ ++PROTO (cntb_vl1, uint64_t, ()) { return svcntb_pat (SV_VL1); } ++ ++/* ++** cntb_vl2: ++** mov x0, #?2 ++** ret ++*/ ++PROTO (cntb_vl2, uint64_t, ()) { return svcntb_pat (SV_VL2); } ++ ++/* ++** cntb_vl3: ++** mov x0, #?3 ++** ret ++*/ ++PROTO (cntb_vl3, uint64_t, ()) { return svcntb_pat (SV_VL3); } ++ ++/* 
++** cntb_vl4: ++** mov x0, #?4 ++** ret ++*/ ++PROTO (cntb_vl4, uint64_t, ()) { return svcntb_pat (SV_VL4); } ++ ++/* ++** cntb_vl5: ++** mov x0, #?5 ++** ret ++*/ ++PROTO (cntb_vl5, uint64_t, ()) { return svcntb_pat (SV_VL5); } ++ ++/* ++** cntb_vl6: ++** mov x0, #?6 ++** ret ++*/ ++PROTO (cntb_vl6, uint64_t, ()) { return svcntb_pat (SV_VL6); } ++ ++/* ++** cntb_vl7: ++** mov x0, #?7 ++** ret ++*/ ++PROTO (cntb_vl7, uint64_t, ()) { return svcntb_pat (SV_VL7); } ++ ++/* ++** cntb_vl8: ++** mov x0, #?8 ++** ret ++*/ ++PROTO (cntb_vl8, uint64_t, ()) { return svcntb_pat (SV_VL8); } ++ ++/* ++** cntb_vl16: ++** mov x0, #?16 ++** ret ++*/ ++PROTO (cntb_vl16, uint64_t, ()) { return svcntb_pat (SV_VL16); } ++ ++/* ++** cntb_vl32: ++** cntb x0, vl32 ++** ret ++*/ ++PROTO (cntb_vl32, uint64_t, ()) { return svcntb_pat (SV_VL32); } ++ ++/* ++** cntb_vl64: ++** cntb x0, vl64 ++** ret ++*/ ++PROTO (cntb_vl64, uint64_t, ()) { return svcntb_pat (SV_VL64); } ++ ++/* ++** cntb_vl128: ++** cntb x0, vl128 ++** ret ++*/ ++PROTO (cntb_vl128, uint64_t, ()) { return svcntb_pat (SV_VL128); } ++ ++/* ++** cntb_vl256: ++** cntb x0, vl256 ++** ret ++*/ ++PROTO (cntb_vl256, uint64_t, ()) { return svcntb_pat (SV_VL256); } ++ ++/* ++** cntb_mul3: ++** cntb x0, mul3 ++** ret ++*/ ++PROTO (cntb_mul3, uint64_t, ()) { return svcntb_pat (SV_MUL3); } ++ ++/* ++** cntb_mul4: ++** cntb x0, mul4 ++** ret ++*/ ++PROTO (cntb_mul4, uint64_t, ()) { return svcntb_pat (SV_MUL4); } ++ ++/* ++** cntb_all: ++** cntb x0 ++** ret ++*/ ++PROTO (cntb_all, uint64_t, ()) { return svcntb_pat (SV_ALL); } ++ ++/* ++** incb_32_pow2: ++** incb x0, pow2 ++** ret ++*/ ++PROTO (incb_32_pow2, uint32_t, (uint32_t w0)) { return w0 + svcntb_pat (SV_POW2); } ++ ++/* ++** incb_32_vl1: ++** add w0, w0, #?1 ++** ret ++*/ ++PROTO (incb_32_vl1, uint32_t, (uint32_t w0)) { return w0 + svcntb_pat (SV_VL1); } ++ ++/* ++** incb_32_vl2: ++** add w0, w0, #?2 ++** ret ++*/ ++PROTO (incb_32_vl2, uint32_t, (uint32_t w0)) { return w0 + svcntb_pat (SV_VL2); } ++ ++/* ++** incb_32_vl3: ++** add w0, w0, #?3 ++** ret ++*/ ++PROTO (incb_32_vl3, uint32_t, (uint32_t w0)) { return w0 + svcntb_pat (SV_VL3); } ++ ++/* ++** incb_32_vl4: ++** add w0, w0, #?4 ++** ret ++*/ ++PROTO (incb_32_vl4, uint32_t, (uint32_t w0)) { return w0 + svcntb_pat (SV_VL4); } ++ ++/* ++** incb_32_vl5: ++** add w0, w0, #?5 ++** ret ++*/ ++PROTO (incb_32_vl5, uint32_t, (uint32_t w0)) { return w0 + svcntb_pat (SV_VL5); } ++ ++/* ++** incb_32_vl6: ++** add w0, w0, #?6 ++** ret ++*/ ++PROTO (incb_32_vl6, uint32_t, (uint32_t w0)) { return w0 + svcntb_pat (SV_VL6); } ++ ++/* ++** incb_32_vl7: ++** add w0, w0, #?7 ++** ret ++*/ ++PROTO (incb_32_vl7, uint32_t, (uint32_t w0)) { return w0 + svcntb_pat (SV_VL7); } ++ ++/* ++** incb_32_vl8: ++** add w0, w0, #?8 ++** ret ++*/ ++PROTO (incb_32_vl8, uint32_t, (uint32_t w0)) { return w0 + svcntb_pat (SV_VL8); } ++ ++/* ++** incb_32_vl16: ++** add w0, w0, #?16 ++** ret ++*/ ++PROTO (incb_32_vl16, uint32_t, (uint32_t w0)) { return w0 + svcntb_pat (SV_VL16); } ++ ++/* ++** incb_32_vl32: ++** incb x0, vl32 ++** ret ++*/ ++PROTO (incb_32_vl32, uint32_t, (uint32_t w0)) { return w0 + svcntb_pat (SV_VL32); } ++ ++/* ++** incb_32_vl64: ++** incb x0, vl64 ++** ret ++*/ ++PROTO (incb_32_vl64, uint32_t, (uint32_t w0)) { return w0 + svcntb_pat (SV_VL64); } ++ ++/* ++** incb_32_vl128: ++** incb x0, vl128 ++** ret ++*/ ++PROTO (incb_32_vl128, uint32_t, (uint32_t w0)) { return w0 + svcntb_pat (SV_VL128); } ++ ++/* ++** incb_32_vl256: ++** incb x0, vl256 ++** ret ++*/ ++PROTO 
(incb_32_vl256, uint32_t, (uint32_t w0)) { return w0 + svcntb_pat (SV_VL256); } ++ ++/* ++** incb_32_mul3: ++** incb x0, mul3 ++** ret ++*/ ++PROTO (incb_32_mul3, uint32_t, (uint32_t w0)) { return w0 + svcntb_pat (SV_MUL3); } ++ ++/* ++** incb_32_mul4: ++** incb x0, mul4 ++** ret ++*/ ++PROTO (incb_32_mul4, uint32_t, (uint32_t w0)) { return w0 + svcntb_pat (SV_MUL4); } ++ ++/* ++** incb_32_all: ++** incb x0 ++** ret ++*/ ++PROTO (incb_32_all, uint32_t, (uint32_t w0)) { return w0 + svcntb_pat (SV_ALL); } ++ ++/* ++** incb_64_pow2: ++** incb x0, pow2 ++** ret ++*/ ++PROTO (incb_64_pow2, uint64_t, (uint64_t x0)) { return x0 + svcntb_pat (SV_POW2); } ++ ++/* ++** incb_64_all: ++** incb x0 ++** ret ++*/ ++PROTO (incb_64_all, uint64_t, (uint64_t x0)) { return x0 + svcntb_pat (SV_ALL); } ++ ++/* ++** decb_32_pow2: ++** decb x0, pow2 ++** ret ++*/ ++PROTO (decb_32_pow2, uint32_t, (uint32_t w0)) { return w0 - svcntb_pat (SV_POW2); } ++ ++/* ++** decb_32_vl1: ++** sub w0, w0, #?1 ++** ret ++*/ ++PROTO (decb_32_vl1, uint32_t, (uint32_t w0)) { return w0 - svcntb_pat (SV_VL1); } ++ ++/* ++** decb_32_vl2: ++** sub w0, w0, #?2 ++** ret ++*/ ++PROTO (decb_32_vl2, uint32_t, (uint32_t w0)) { return w0 - svcntb_pat (SV_VL2); } ++ ++/* ++** decb_32_vl3: ++** sub w0, w0, #?3 ++** ret ++*/ ++PROTO (decb_32_vl3, uint32_t, (uint32_t w0)) { return w0 - svcntb_pat (SV_VL3); } ++ ++/* ++** decb_32_vl4: ++** sub w0, w0, #?4 ++** ret ++*/ ++PROTO (decb_32_vl4, uint32_t, (uint32_t w0)) { return w0 - svcntb_pat (SV_VL4); } ++ ++/* ++** decb_32_vl5: ++** sub w0, w0, #?5 ++** ret ++*/ ++PROTO (decb_32_vl5, uint32_t, (uint32_t w0)) { return w0 - svcntb_pat (SV_VL5); } ++ ++/* ++** decb_32_vl6: ++** sub w0, w0, #?6 ++** ret ++*/ ++PROTO (decb_32_vl6, uint32_t, (uint32_t w0)) { return w0 - svcntb_pat (SV_VL6); } ++ ++/* ++** decb_32_vl7: ++** sub w0, w0, #?7 ++** ret ++*/ ++PROTO (decb_32_vl7, uint32_t, (uint32_t w0)) { return w0 - svcntb_pat (SV_VL7); } ++ ++/* ++** decb_32_vl8: ++** sub w0, w0, #?8 ++** ret ++*/ ++PROTO (decb_32_vl8, uint32_t, (uint32_t w0)) { return w0 - svcntb_pat (SV_VL8); } ++ ++/* ++** decb_32_vl16: ++** sub w0, w0, #?16 ++** ret ++*/ ++PROTO (decb_32_vl16, uint32_t, (uint32_t w0)) { return w0 - svcntb_pat (SV_VL16); } ++ ++/* ++** decb_32_vl32: ++** decb x0, vl32 ++** ret ++*/ ++PROTO (decb_32_vl32, uint32_t, (uint32_t w0)) { return w0 - svcntb_pat (SV_VL32); } ++ ++/* ++** decb_32_vl64: ++** decb x0, vl64 ++** ret ++*/ ++PROTO (decb_32_vl64, uint32_t, (uint32_t w0)) { return w0 - svcntb_pat (SV_VL64); } ++ ++/* ++** decb_32_vl128: ++** decb x0, vl128 ++** ret ++*/ ++PROTO (decb_32_vl128, uint32_t, (uint32_t w0)) { return w0 - svcntb_pat (SV_VL128); } ++ ++/* ++** decb_32_vl256: ++** decb x0, vl256 ++** ret ++*/ ++PROTO (decb_32_vl256, uint32_t, (uint32_t w0)) { return w0 - svcntb_pat (SV_VL256); } ++ ++/* ++** decb_32_mul3: ++** decb x0, mul3 ++** ret ++*/ ++PROTO (decb_32_mul3, uint32_t, (uint32_t w0)) { return w0 - svcntb_pat (SV_MUL3); } ++ ++/* ++** decb_32_mul4: ++** decb x0, mul4 ++** ret ++*/ ++PROTO (decb_32_mul4, uint32_t, (uint32_t w0)) { return w0 - svcntb_pat (SV_MUL4); } ++ ++/* ++** decb_32_all: ++** decb x0 ++** ret ++*/ ++PROTO (decb_32_all, uint32_t, (uint32_t w0)) { return w0 - svcntb_pat (SV_ALL); } ++ ++/* ++** decb_64_pow2: ++** decb x0, pow2 ++** ret ++*/ ++PROTO (decb_64_pow2, uint64_t, (uint64_t x0)) { return x0 - svcntb_pat (SV_POW2); } ++ ++/* ++** decb_64_all: ++** decb x0 ++** ret ++*/ ++PROTO (decb_64_all, uint64_t, (uint64_t x0)) { return x0 - svcntb_pat (SV_ALL); } ++ 
++/* ++** incb_s8_pow2_z0: ++** cntb x([0-9]+), pow2 ++** mov (z[0-9]+\.b), w\1 ++** add z0\.b, (z0\.b, \2|\2, z0\.b) ++** ret ++*/ ++TEST_UNIFORM_Z (incb_s8_pow2_z0, svint8_t, ++ z0 = svadd_n_s8_x (svptrue_b8 (), z0, svcntb_pat (SV_POW2)), ++ z0 = svadd_x (svptrue_b8 (), z0, svcntb_pat (SV_POW2))); ++ ++/* ++** incb_s8_pow2_z1: ++** cntb x([0-9]+), pow2 ++** mov (z[0-9]+\.b), w\1 ++** add z0\.b, (z1\.b, \2|\2, z1\.b) ++** ret ++*/ ++TEST_UNIFORM_Z (incb_s8_pow2_z1, svint8_t, ++ z0 = svadd_n_s8_x (svptrue_b8 (), z1, svcntb_pat (SV_POW2)), ++ z0 = svadd_x (svptrue_b8 (), z1, svcntb_pat (SV_POW2))); ++ ++/* ++** decb_s8_pow2_z0: ++** cntb x([0-9]+), pow2 ++** mov (z[0-9]+\.b), w\1 ++** sub z0\.b, z0\.b, \2 ++** ret ++*/ ++TEST_UNIFORM_Z (decb_s8_pow2_z0, svint8_t, ++ z0 = svsub_n_s8_x (svptrue_b8 (), z0, svcntb_pat (SV_POW2)), ++ z0 = svsub_x (svptrue_b8 (), z0, svcntb_pat (SV_POW2))); ++ ++/* ++** decb_s8_pow2_z1: ++** cntb x([0-9]+), pow2 ++** mov (z[0-9]+\.b), w\1 ++** sub z0\.b, z1\.b, \2 ++** ret ++*/ ++TEST_UNIFORM_Z (decb_s8_pow2_z1, svint8_t, ++ z0 = svsub_n_s8_x (svptrue_b8 (), z1, svcntb_pat (SV_POW2)), ++ z0 = svsub_x (svptrue_b8 (), z1, svcntb_pat (SV_POW2))); +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntd.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntd.c +new file mode 100644 +index 000000000..0d0ed4849 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntd.c +@@ -0,0 +1,278 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cntd_1: ++** cntd x0 ++** ret ++*/ ++PROTO (cntd_1, uint64_t, ()) { return svcntd (); } ++ ++/* ++** cntd_2: ++** cntw x0 ++** ret ++*/ ++PROTO (cntd_2, uint64_t, ()) { return svcntd () * 2; } ++ ++/* ++** cntd_3: ++** cntd x0, all, mul #3 ++** ret ++*/ ++PROTO (cntd_3, uint64_t, ()) { return svcntd () * 3; } ++ ++/* ++** cntd_4: ++** cnth x0 ++** ret ++*/ ++PROTO (cntd_4, uint64_t, ()) { return svcntd () * 4; } ++ ++/* ++** cntd_8: ++** cntb x0 ++** ret ++*/ ++PROTO (cntd_8, uint64_t, ()) { return svcntd () * 8; } ++ ++/* ++** cntd_15: ++** cntd x0, all, mul #15 ++** ret ++*/ ++PROTO (cntd_15, uint64_t, ()) { return svcntd () * 15; } ++ ++/* ++** cntd_16: ++** cntb x0, all, mul #2 ++** ret ++*/ ++PROTO (cntd_16, uint64_t, ()) { return svcntd () * 16; } ++ ++/* Other sequences would be OK. */ ++/* ++** cntd_17: ++** cntb x0, all, mul #2 ++** incd x0 ++** ret ++*/ ++PROTO (cntd_17, uint64_t, ()) { return svcntd () * 17; } ++ ++/* ++** cntd_32: ++** cntb x0, all, mul #4 ++** ret ++*/ ++PROTO (cntd_32, uint64_t, ()) { return svcntd () * 32; } ++ ++/* ++** cntd_64: ++** cntb x0, all, mul #8 ++** ret ++*/ ++PROTO (cntd_64, uint64_t, ()) { return svcntd () * 64; } ++ ++/* ++** cntd_128: ++** cntb x0, all, mul #16 ++** ret ++*/ ++PROTO (cntd_128, uint64_t, ()) { return svcntd () * 128; } ++ ++/* ++** cntd_m1: ++** cntd (x[0-9]+) ++** neg x0, \1 ++** ret ++*/ ++PROTO (cntd_m1, uint64_t, ()) { return -svcntd (); } ++ ++/* ++** cntd_m13: ++** cntd (x[0-9]+), all, mul #13 ++** neg x0, \1 ++** ret ++*/ ++PROTO (cntd_m13, uint64_t, ()) { return -svcntd () * 13; } ++ ++/* ++** cntd_m15: ++** cntd (x[0-9]+), all, mul #15 ++** neg x0, \1 ++** ret ++*/ ++PROTO (cntd_m15, uint64_t, ()) { return -svcntd () * 15; } ++ ++/* ++** cntd_m16: ++** cntb (x[0-9]+), all, mul #2 ++** neg x0, \1 ++** ret ++*/ ++PROTO (cntd_m16, uint64_t, ()) { return -svcntd () * 16; } ++ ++/* Other sequences would be OK. 
*/ ++/* ++** cntd_m17: ++** cntb x0, all, mul #2 ++** incd x0 ++** neg x0, x0 ++** ret ++*/ ++PROTO (cntd_m17, uint64_t, ()) { return -svcntd () * 17; } ++ ++/* ++** incd_1: ++** incd x0 ++** ret ++*/ ++PROTO (incd_1, uint64_t, (uint64_t x0)) { return x0 + svcntd (); } ++ ++/* ++** incd_2: ++** incw x0 ++** ret ++*/ ++PROTO (incd_2, uint64_t, (uint64_t x0)) { return x0 + svcntd () * 2; } ++ ++/* ++** incd_3: ++** incd x0, all, mul #3 ++** ret ++*/ ++PROTO (incd_3, uint64_t, (uint64_t x0)) { return x0 + svcntd () * 3; } ++ ++/* ++** incd_4: ++** inch x0 ++** ret ++*/ ++PROTO (incd_4, uint64_t, (uint64_t x0)) { return x0 + svcntd () * 4; } ++ ++/* ++** incd_7: ++** incd x0, all, mul #7 ++** ret ++*/ ++PROTO (incd_7, uint64_t, (uint64_t x0)) { return x0 + svcntd () * 7; } ++ ++/* ++** incd_8: ++** incb x0 ++** ret ++*/ ++PROTO (incd_8, uint64_t, (uint64_t x0)) { return x0 + svcntd () * 8; } ++ ++/* ++** incd_9: ++** incd x0, all, mul #9 ++** ret ++*/ ++PROTO (incd_9, uint64_t, (uint64_t x0)) { return x0 + svcntd () * 9; } ++ ++/* ++** incd_15: ++** incd x0, all, mul #15 ++** ret ++*/ ++PROTO (incd_15, uint64_t, (uint64_t x0)) { return x0 + svcntd () * 15; } ++ ++/* ++** incd_16: ++** incb x0, all, mul #2 ++** ret ++*/ ++PROTO (incd_16, uint64_t, (uint64_t x0)) { return x0 + svcntd () * 16; } ++ ++/* ++** incd_18: ++** incw x0, all, mul #9 ++** ret ++*/ ++PROTO (incd_18, uint64_t, (uint64_t x0)) { return x0 + svcntd () * 18; } ++ ++/* ++** incd_30: ++** incw x0, all, mul #15 ++** ret ++*/ ++PROTO (incd_30, uint64_t, (uint64_t x0)) { return x0 + svcntd () * 30; } ++ ++/* ++** decd_1: ++** decd x0 ++** ret ++*/ ++PROTO (decd_1, uint64_t, (uint64_t x0)) { return x0 - svcntd (); } ++ ++/* ++** decd_2: ++** decw x0 ++** ret ++*/ ++PROTO (decd_2, uint64_t, (uint64_t x0)) { return x0 - svcntd () * 2; } ++ ++/* ++** decd_3: ++** decd x0, all, mul #3 ++** ret ++*/ ++PROTO (decd_3, uint64_t, (uint64_t x0)) { return x0 - svcntd () * 3; } ++ ++/* ++** decd_4: ++** dech x0 ++** ret ++*/ ++PROTO (decd_4, uint64_t, (uint64_t x0)) { return x0 - svcntd () * 4; } ++ ++/* ++** decd_7: ++** decd x0, all, mul #7 ++** ret ++*/ ++PROTO (decd_7, uint64_t, (uint64_t x0)) { return x0 - svcntd () * 7; } ++ ++/* ++** decd_8: ++** decb x0 ++** ret ++*/ ++PROTO (decd_8, uint64_t, (uint64_t x0)) { return x0 - svcntd () * 8; } ++ ++/* ++** decd_9: ++** decd x0, all, mul #9 ++** ret ++*/ ++PROTO (decd_9, uint64_t, (uint64_t x0)) { return x0 - svcntd () * 9; } ++ ++/* ++** decd_15: ++** decd x0, all, mul #15 ++** ret ++*/ ++PROTO (decd_15, uint64_t, (uint64_t x0)) { return x0 - svcntd () * 15; } ++ ++/* ++** decd_16: ++** decb x0, all, mul #2 ++** ret ++*/ ++PROTO (decd_16, uint64_t, (uint64_t x0)) { return x0 - svcntd () * 16; } ++ ++/* ++** decd_18: ++** decw x0, all, mul #9 ++** ret ++*/ ++PROTO (decd_18, uint64_t, (uint64_t x0)) { return x0 - svcntd () * 18; } ++ ++/* ++** decd_30: ++** decw x0, all, mul #15 ++** ret ++*/ ++PROTO (decd_30, uint64_t, (uint64_t x0)) { return x0 - svcntd () * 30; } +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntd_pat.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntd_pat.c +new file mode 100644 +index 000000000..31ecde7ae +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntd_pat.c +@@ -0,0 +1,426 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cntd_pow2: ++** cntd x0, pow2 ++** ret ++*/ ++PROTO (cntd_pow2, uint64_t, ()) { return svcntd_pat (SV_POW2); } ++ ++/* ++** cntd_vl1: ++** mov 
x0, #?1 ++** ret ++*/ ++PROTO (cntd_vl1, uint64_t, ()) { return svcntd_pat (SV_VL1); } ++ ++/* ++** cntd_vl2: ++** mov x0, #?2 ++** ret ++*/ ++PROTO (cntd_vl2, uint64_t, ()) { return svcntd_pat (SV_VL2); } ++ ++/* ++** cntd_vl3: ++** cntd x0, vl3 ++** ret ++*/ ++PROTO (cntd_vl3, uint64_t, ()) { return svcntd_pat (SV_VL3); } ++ ++/* ++** cntd_vl4: ++** cntd x0, vl4 ++** ret ++*/ ++PROTO (cntd_vl4, uint64_t, ()) { return svcntd_pat (SV_VL4); } ++ ++/* ++** cntd_vl5: ++** cntd x0, vl5 ++** ret ++*/ ++PROTO (cntd_vl5, uint64_t, ()) { return svcntd_pat (SV_VL5); } ++ ++/* ++** cntd_vl6: ++** cntd x0, vl6 ++** ret ++*/ ++PROTO (cntd_vl6, uint64_t, ()) { return svcntd_pat (SV_VL6); } ++ ++/* ++** cntd_vl7: ++** cntd x0, vl7 ++** ret ++*/ ++PROTO (cntd_vl7, uint64_t, ()) { return svcntd_pat (SV_VL7); } ++ ++/* ++** cntd_vl8: ++** cntd x0, vl8 ++** ret ++*/ ++PROTO (cntd_vl8, uint64_t, ()) { return svcntd_pat (SV_VL8); } ++ ++/* ++** cntd_vl16: ++** cntd x0, vl16 ++** ret ++*/ ++PROTO (cntd_vl16, uint64_t, ()) { return svcntd_pat (SV_VL16); } ++ ++/* ++** cntd_vl32: ++** cntd x0, vl32 ++** ret ++*/ ++PROTO (cntd_vl32, uint64_t, ()) { return svcntd_pat (SV_VL32); } ++ ++/* ++** cntd_vl64: ++** cntd x0, vl64 ++** ret ++*/ ++PROTO (cntd_vl64, uint64_t, ()) { return svcntd_pat (SV_VL64); } ++ ++/* ++** cntd_vl128: ++** cntd x0, vl128 ++** ret ++*/ ++PROTO (cntd_vl128, uint64_t, ()) { return svcntd_pat (SV_VL128); } ++ ++/* ++** cntd_vl256: ++** cntd x0, vl256 ++** ret ++*/ ++PROTO (cntd_vl256, uint64_t, ()) { return svcntd_pat (SV_VL256); } ++ ++/* ++** cntd_mul3: ++** cntd x0, mul3 ++** ret ++*/ ++PROTO (cntd_mul3, uint64_t, ()) { return svcntd_pat (SV_MUL3); } ++ ++/* ++** cntd_mul4: ++** cntd x0, mul4 ++** ret ++*/ ++PROTO (cntd_mul4, uint64_t, ()) { return svcntd_pat (SV_MUL4); } ++ ++/* ++** cntd_all: ++** cntd x0 ++** ret ++*/ ++PROTO (cntd_all, uint64_t, ()) { return svcntd_pat (SV_ALL); } ++ ++/* ++** incd_32_pow2: ++** incd x0, pow2 ++** ret ++*/ ++PROTO (incd_32_pow2, uint32_t, (uint32_t w0)) { return w0 + svcntd_pat (SV_POW2); } ++ ++/* ++** incd_32_vl1: ++** add w0, w0, #?1 ++** ret ++*/ ++PROTO (incd_32_vl1, uint32_t, (uint32_t w0)) { return w0 + svcntd_pat (SV_VL1); } ++ ++/* ++** incd_32_vl2: ++** add w0, w0, #?2 ++** ret ++*/ ++PROTO (incd_32_vl2, uint32_t, (uint32_t w0)) { return w0 + svcntd_pat (SV_VL2); } ++ ++/* ++** incd_32_vl3: ++** incd x0, vl3 ++** ret ++*/ ++PROTO (incd_32_vl3, uint32_t, (uint32_t w0)) { return w0 + svcntd_pat (SV_VL3); } ++ ++/* ++** incd_32_vl4: ++** incd x0, vl4 ++** ret ++*/ ++PROTO (incd_32_vl4, uint32_t, (uint32_t w0)) { return w0 + svcntd_pat (SV_VL4); } ++ ++/* ++** incd_32_vl5: ++** incd x0, vl5 ++** ret ++*/ ++PROTO (incd_32_vl5, uint32_t, (uint32_t w0)) { return w0 + svcntd_pat (SV_VL5); } ++ ++/* ++** incd_32_vl6: ++** incd x0, vl6 ++** ret ++*/ ++PROTO (incd_32_vl6, uint32_t, (uint32_t w0)) { return w0 + svcntd_pat (SV_VL6); } ++ ++/* ++** incd_32_vl7: ++** incd x0, vl7 ++** ret ++*/ ++PROTO (incd_32_vl7, uint32_t, (uint32_t w0)) { return w0 + svcntd_pat (SV_VL7); } ++ ++/* ++** incd_32_vl8: ++** incd x0, vl8 ++** ret ++*/ ++PROTO (incd_32_vl8, uint32_t, (uint32_t w0)) { return w0 + svcntd_pat (SV_VL8); } ++ ++/* ++** incd_32_vl16: ++** incd x0, vl16 ++** ret ++*/ ++PROTO (incd_32_vl16, uint32_t, (uint32_t w0)) { return w0 + svcntd_pat (SV_VL16); } ++ ++/* ++** incd_32_vl32: ++** incd x0, vl32 ++** ret ++*/ ++PROTO (incd_32_vl32, uint32_t, (uint32_t w0)) { return w0 + svcntd_pat (SV_VL32); } ++ ++/* ++** incd_32_vl64: ++** incd x0, vl64 ++** ret 
++*/ ++PROTO (incd_32_vl64, uint32_t, (uint32_t w0)) { return w0 + svcntd_pat (SV_VL64); } ++ ++/* ++** incd_32_vl128: ++** incd x0, vl128 ++** ret ++*/ ++PROTO (incd_32_vl128, uint32_t, (uint32_t w0)) { return w0 + svcntd_pat (SV_VL128); } ++ ++/* ++** incd_32_vl256: ++** incd x0, vl256 ++** ret ++*/ ++PROTO (incd_32_vl256, uint32_t, (uint32_t w0)) { return w0 + svcntd_pat (SV_VL256); } ++ ++/* ++** incd_32_mul3: ++** incd x0, mul3 ++** ret ++*/ ++PROTO (incd_32_mul3, uint32_t, (uint32_t w0)) { return w0 + svcntd_pat (SV_MUL3); } ++ ++/* ++** incd_32_mul4: ++** incd x0, mul4 ++** ret ++*/ ++PROTO (incd_32_mul4, uint32_t, (uint32_t w0)) { return w0 + svcntd_pat (SV_MUL4); } ++ ++/* ++** incd_32_all: ++** incd x0 ++** ret ++*/ ++PROTO (incd_32_all, uint32_t, (uint32_t w0)) { return w0 + svcntd_pat (SV_ALL); } ++ ++/* ++** incd_64_pow2: ++** incd x0, pow2 ++** ret ++*/ ++PROTO (incd_64_pow2, uint64_t, (uint64_t x0)) { return x0 + svcntd_pat (SV_POW2); } ++ ++/* ++** incd_64_all: ++** incd x0 ++** ret ++*/ ++PROTO (incd_64_all, uint64_t, (uint64_t x0)) { return x0 + svcntd_pat (SV_ALL); } ++ ++/* ++** decd_32_pow2: ++** decd x0, pow2 ++** ret ++*/ ++PROTO (decd_32_pow2, uint32_t, (uint32_t w0)) { return w0 - svcntd_pat (SV_POW2); } ++ ++/* ++** decd_32_vl1: ++** sub w0, w0, #?1 ++** ret ++*/ ++PROTO (decd_32_vl1, uint32_t, (uint32_t w0)) { return w0 - svcntd_pat (SV_VL1); } ++ ++/* ++** decd_32_vl2: ++** sub w0, w0, #?2 ++** ret ++*/ ++PROTO (decd_32_vl2, uint32_t, (uint32_t w0)) { return w0 - svcntd_pat (SV_VL2); } ++ ++/* ++** decd_32_vl3: ++** decd x0, vl3 ++** ret ++*/ ++PROTO (decd_32_vl3, uint32_t, (uint32_t w0)) { return w0 - svcntd_pat (SV_VL3); } ++ ++/* ++** decd_32_vl4: ++** decd x0, vl4 ++** ret ++*/ ++PROTO (decd_32_vl4, uint32_t, (uint32_t w0)) { return w0 - svcntd_pat (SV_VL4); } ++ ++/* ++** decd_32_vl5: ++** decd x0, vl5 ++** ret ++*/ ++PROTO (decd_32_vl5, uint32_t, (uint32_t w0)) { return w0 - svcntd_pat (SV_VL5); } ++ ++/* ++** decd_32_vl6: ++** decd x0, vl6 ++** ret ++*/ ++PROTO (decd_32_vl6, uint32_t, (uint32_t w0)) { return w0 - svcntd_pat (SV_VL6); } ++ ++/* ++** decd_32_vl7: ++** decd x0, vl7 ++** ret ++*/ ++PROTO (decd_32_vl7, uint32_t, (uint32_t w0)) { return w0 - svcntd_pat (SV_VL7); } ++ ++/* ++** decd_32_vl8: ++** decd x0, vl8 ++** ret ++*/ ++PROTO (decd_32_vl8, uint32_t, (uint32_t w0)) { return w0 - svcntd_pat (SV_VL8); } ++ ++/* ++** decd_32_vl16: ++** decd x0, vl16 ++** ret ++*/ ++PROTO (decd_32_vl16, uint32_t, (uint32_t w0)) { return w0 - svcntd_pat (SV_VL16); } ++ ++/* ++** decd_32_vl32: ++** decd x0, vl32 ++** ret ++*/ ++PROTO (decd_32_vl32, uint32_t, (uint32_t w0)) { return w0 - svcntd_pat (SV_VL32); } ++ ++/* ++** decd_32_vl64: ++** decd x0, vl64 ++** ret ++*/ ++PROTO (decd_32_vl64, uint32_t, (uint32_t w0)) { return w0 - svcntd_pat (SV_VL64); } ++ ++/* ++** decd_32_vl128: ++** decd x0, vl128 ++** ret ++*/ ++PROTO (decd_32_vl128, uint32_t, (uint32_t w0)) { return w0 - svcntd_pat (SV_VL128); } ++ ++/* ++** decd_32_vl256: ++** decd x0, vl256 ++** ret ++*/ ++PROTO (decd_32_vl256, uint32_t, (uint32_t w0)) { return w0 - svcntd_pat (SV_VL256); } ++ ++/* ++** decd_32_mul3: ++** decd x0, mul3 ++** ret ++*/ ++PROTO (decd_32_mul3, uint32_t, (uint32_t w0)) { return w0 - svcntd_pat (SV_MUL3); } ++ ++/* ++** decd_32_mul4: ++** decd x0, mul4 ++** ret ++*/ ++PROTO (decd_32_mul4, uint32_t, (uint32_t w0)) { return w0 - svcntd_pat (SV_MUL4); } ++ ++/* ++** decd_32_all: ++** decd x0 ++** ret ++*/ ++PROTO (decd_32_all, uint32_t, (uint32_t w0)) { return w0 - svcntd_pat 
(SV_ALL); } ++ ++/* ++** decd_64_pow2: ++** decd x0, pow2 ++** ret ++*/ ++PROTO (decd_64_pow2, uint64_t, (uint64_t x0)) { return x0 - svcntd_pat (SV_POW2); } ++ ++/* ++** decd_64_all: ++** decd x0 ++** ret ++*/ ++PROTO (decd_64_all, uint64_t, (uint64_t x0)) { return x0 - svcntd_pat (SV_ALL); } ++ ++/* ++** incd_s64_pow2_z0: ++** incd z0\.d, pow2 ++** ret ++*/ ++TEST_UNIFORM_Z (incd_s64_pow2_z0, svint64_t, ++ z0 = svadd_n_s64_x (svptrue_b64 (), z0, svcntd_pat (SV_POW2)), ++ z0 = svadd_x (svptrue_b64 (), z0, svcntd_pat (SV_POW2))); ++ ++/* ++** incd_s64_pow2_z1: ++** movprfx z0, z1 ++** incd z0\.d, pow2 ++** ret ++*/ ++TEST_UNIFORM_Z (incd_s64_pow2_z1, svint64_t, ++ z0 = svadd_n_s64_x (svptrue_b64 (), z1, svcntd_pat (SV_POW2)), ++ z0 = svadd_x (svptrue_b64 (), z1, svcntd_pat (SV_POW2))); ++ ++/* ++** decd_s64_pow2_z0: ++** decd z0\.d, pow2 ++** ret ++*/ ++TEST_UNIFORM_Z (decd_s64_pow2_z0, svint64_t, ++ z0 = svsub_n_s64_x (svptrue_b64 (), z0, svcntd_pat (SV_POW2)), ++ z0 = svsub_x (svptrue_b64 (), z0, svcntd_pat (SV_POW2))); ++ ++/* ++** decd_s64_pow2_z1: ++** movprfx z0, z1 ++** decd z0\.d, pow2 ++** ret ++*/ ++TEST_UNIFORM_Z (decd_s64_pow2_z1, svint64_t, ++ z0 = svsub_n_s64_x (svptrue_b64 (), z1, svcntd_pat (SV_POW2)), ++ z0 = svsub_x (svptrue_b64 (), z1, svcntd_pat (SV_POW2))); +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnth.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnth.c +new file mode 100644 +index 000000000..c29930f15 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnth.c +@@ -0,0 +1,280 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cnth_1: ++** cnth x0 ++** ret ++*/ ++PROTO (cnth_1, uint64_t, ()) { return svcnth (); } ++ ++/* ++** cnth_2: ++** cntb x0 ++** ret ++*/ ++PROTO (cnth_2, uint64_t, ()) { return svcnth () * 2; } ++ ++/* ++** cnth_3: ++** cnth x0, all, mul #3 ++** ret ++*/ ++PROTO (cnth_3, uint64_t, ()) { return svcnth () * 3; } ++ ++/* ++** cnth_4: ++** cntb x0, all, mul #2 ++** ret ++*/ ++PROTO (cnth_4, uint64_t, ()) { return svcnth () * 4; } ++ ++/* ++** cnth_8: ++** cntb x0, all, mul #4 ++** ret ++*/ ++PROTO (cnth_8, uint64_t, ()) { return svcnth () * 8; } ++ ++/* ++** cnth_15: ++** cnth x0, all, mul #15 ++** ret ++*/ ++PROTO (cnth_15, uint64_t, ()) { return svcnth () * 15; } ++ ++/* ++** cnth_16: ++** cntb x0, all, mul #8 ++** ret ++*/ ++PROTO (cnth_16, uint64_t, ()) { return svcnth () * 16; } ++ ++/* Other sequences would be OK. 
*/ ++/* ++** cnth_17: ++** cntb x0, all, mul #8 ++** inch x0 ++** ret ++*/ ++PROTO (cnth_17, uint64_t, ()) { return svcnth () * 17; } ++ ++/* ++** cnth_32: ++** cntb x0, all, mul #16 ++** ret ++*/ ++PROTO (cnth_32, uint64_t, ()) { return svcnth () * 32; } ++ ++/* ++** cnth_64: ++** cntd (x[0-9]+) ++** lsl x0, \1, 8 ++** ret ++*/ ++PROTO (cnth_64, uint64_t, ()) { return svcnth () * 64; } ++ ++/* ++** cnth_128: ++** cntd (x[0-9]+) ++** lsl x0, \1, 9 ++** ret ++*/ ++PROTO (cnth_128, uint64_t, ()) { return svcnth () * 128; } ++ ++/* ++** cnth_m1: ++** cnth (x[0-9]+) ++** neg x0, \1 ++** ret ++*/ ++PROTO (cnth_m1, uint64_t, ()) { return -svcnth (); } ++ ++/* ++** cnth_m13: ++** cnth (x[0-9]+), all, mul #13 ++** neg x0, \1 ++** ret ++*/ ++PROTO (cnth_m13, uint64_t, ()) { return -svcnth () * 13; } ++ ++/* ++** cnth_m15: ++** cnth (x[0-9]+), all, mul #15 ++** neg x0, \1 ++** ret ++*/ ++PROTO (cnth_m15, uint64_t, ()) { return -svcnth () * 15; } ++ ++/* ++** cnth_m16: ++** cntb (x[0-9]+), all, mul #8 ++** neg x0, \1 ++** ret ++*/ ++PROTO (cnth_m16, uint64_t, ()) { return -svcnth () * 16; } ++ ++/* Other sequences would be OK. */ ++/* ++** cnth_m17: ++** cntb x0, all, mul #8 ++** inch x0 ++** neg x0, x0 ++** ret ++*/ ++PROTO (cnth_m17, uint64_t, ()) { return -svcnth () * 17; } ++ ++/* ++** inch_1: ++** inch x0 ++** ret ++*/ ++PROTO (inch_1, uint64_t, (uint64_t x0)) { return x0 + svcnth (); } ++ ++/* ++** inch_2: ++** incb x0 ++** ret ++*/ ++PROTO (inch_2, uint64_t, (uint64_t x0)) { return x0 + svcnth () * 2; } ++ ++/* ++** inch_3: ++** inch x0, all, mul #3 ++** ret ++*/ ++PROTO (inch_3, uint64_t, (uint64_t x0)) { return x0 + svcnth () * 3; } ++ ++/* ++** inch_4: ++** incb x0, all, mul #2 ++** ret ++*/ ++PROTO (inch_4, uint64_t, (uint64_t x0)) { return x0 + svcnth () * 4; } ++ ++/* ++** inch_7: ++** inch x0, all, mul #7 ++** ret ++*/ ++PROTO (inch_7, uint64_t, (uint64_t x0)) { return x0 + svcnth () * 7; } ++ ++/* ++** inch_8: ++** incb x0, all, mul #4 ++** ret ++*/ ++PROTO (inch_8, uint64_t, (uint64_t x0)) { return x0 + svcnth () * 8; } ++ ++/* ++** inch_9: ++** inch x0, all, mul #9 ++** ret ++*/ ++PROTO (inch_9, uint64_t, (uint64_t x0)) { return x0 + svcnth () * 9; } ++ ++/* ++** inch_15: ++** inch x0, all, mul #15 ++** ret ++*/ ++PROTO (inch_15, uint64_t, (uint64_t x0)) { return x0 + svcnth () * 15; } ++ ++/* ++** inch_16: ++** incb x0, all, mul #8 ++** ret ++*/ ++PROTO (inch_16, uint64_t, (uint64_t x0)) { return x0 + svcnth () * 16; } ++ ++/* ++** inch_18: ++** incb x0, all, mul #9 ++** ret ++*/ ++PROTO (inch_18, uint64_t, (uint64_t x0)) { return x0 + svcnth () * 18; } ++ ++/* ++** inch_30: ++** incb x0, all, mul #15 ++** ret ++*/ ++PROTO (inch_30, uint64_t, (uint64_t x0)) { return x0 + svcnth () * 30; } ++ ++/* ++** dech_1: ++** dech x0 ++** ret ++*/ ++PROTO (dech_1, uint64_t, (uint64_t x0)) { return x0 - svcnth (); } ++ ++/* ++** dech_2: ++** decb x0 ++** ret ++*/ ++PROTO (dech_2, uint64_t, (uint64_t x0)) { return x0 - svcnth () * 2; } ++ ++/* ++** dech_3: ++** dech x0, all, mul #3 ++** ret ++*/ ++PROTO (dech_3, uint64_t, (uint64_t x0)) { return x0 - svcnth () * 3; } ++ ++/* ++** dech_4: ++** decb x0, all, mul #2 ++** ret ++*/ ++PROTO (dech_4, uint64_t, (uint64_t x0)) { return x0 - svcnth () * 4; } ++ ++/* ++** dech_7: ++** dech x0, all, mul #7 ++** ret ++*/ ++PROTO (dech_7, uint64_t, (uint64_t x0)) { return x0 - svcnth () * 7; } ++ ++/* ++** dech_8: ++** decb x0, all, mul #4 ++** ret ++*/ ++PROTO (dech_8, uint64_t, (uint64_t x0)) { return x0 - svcnth () * 8; } ++ ++/* ++** dech_9: ++** dech x0, 
all, mul #9 ++** ret ++*/ ++PROTO (dech_9, uint64_t, (uint64_t x0)) { return x0 - svcnth () * 9; } ++ ++/* ++** dech_15: ++** dech x0, all, mul #15 ++** ret ++*/ ++PROTO (dech_15, uint64_t, (uint64_t x0)) { return x0 - svcnth () * 15; } ++ ++/* ++** dech_16: ++** decb x0, all, mul #8 ++** ret ++*/ ++PROTO (dech_16, uint64_t, (uint64_t x0)) { return x0 - svcnth () * 16; } ++ ++/* ++** dech_18: ++** decb x0, all, mul #9 ++** ret ++*/ ++PROTO (dech_18, uint64_t, (uint64_t x0)) { return x0 - svcnth () * 18; } ++ ++/* ++** dech_30: ++** decb x0, all, mul #15 ++** ret ++*/ ++PROTO (dech_30, uint64_t, (uint64_t x0)) { return x0 - svcnth () * 30; } +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnth_pat.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnth_pat.c +new file mode 100644 +index 000000000..7a42e7ad9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnth_pat.c +@@ -0,0 +1,426 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cnth_pow2: ++** cnth x0, pow2 ++** ret ++*/ ++PROTO (cnth_pow2, uint64_t, ()) { return svcnth_pat (SV_POW2); } ++ ++/* ++** cnth_vl1: ++** mov x0, #?1 ++** ret ++*/ ++PROTO (cnth_vl1, uint64_t, ()) { return svcnth_pat (SV_VL1); } ++ ++/* ++** cnth_vl2: ++** mov x0, #?2 ++** ret ++*/ ++PROTO (cnth_vl2, uint64_t, ()) { return svcnth_pat (SV_VL2); } ++ ++/* ++** cnth_vl3: ++** mov x0, #?3 ++** ret ++*/ ++PROTO (cnth_vl3, uint64_t, ()) { return svcnth_pat (SV_VL3); } ++ ++/* ++** cnth_vl4: ++** mov x0, #?4 ++** ret ++*/ ++PROTO (cnth_vl4, uint64_t, ()) { return svcnth_pat (SV_VL4); } ++ ++/* ++** cnth_vl5: ++** mov x0, #?5 ++** ret ++*/ ++PROTO (cnth_vl5, uint64_t, ()) { return svcnth_pat (SV_VL5); } ++ ++/* ++** cnth_vl6: ++** mov x0, #?6 ++** ret ++*/ ++PROTO (cnth_vl6, uint64_t, ()) { return svcnth_pat (SV_VL6); } ++ ++/* ++** cnth_vl7: ++** mov x0, #?7 ++** ret ++*/ ++PROTO (cnth_vl7, uint64_t, ()) { return svcnth_pat (SV_VL7); } ++ ++/* ++** cnth_vl8: ++** mov x0, #?8 ++** ret ++*/ ++PROTO (cnth_vl8, uint64_t, ()) { return svcnth_pat (SV_VL8); } ++ ++/* ++** cnth_vl16: ++** cnth x0, vl16 ++** ret ++*/ ++PROTO (cnth_vl16, uint64_t, ()) { return svcnth_pat (SV_VL16); } ++ ++/* ++** cnth_vl32: ++** cnth x0, vl32 ++** ret ++*/ ++PROTO (cnth_vl32, uint64_t, ()) { return svcnth_pat (SV_VL32); } ++ ++/* ++** cnth_vl64: ++** cnth x0, vl64 ++** ret ++*/ ++PROTO (cnth_vl64, uint64_t, ()) { return svcnth_pat (SV_VL64); } ++ ++/* ++** cnth_vl128: ++** cnth x0, vl128 ++** ret ++*/ ++PROTO (cnth_vl128, uint64_t, ()) { return svcnth_pat (SV_VL128); } ++ ++/* ++** cnth_vl256: ++** cnth x0, vl256 ++** ret ++*/ ++PROTO (cnth_vl256, uint64_t, ()) { return svcnth_pat (SV_VL256); } ++ ++/* ++** cnth_mul3: ++** cnth x0, mul3 ++** ret ++*/ ++PROTO (cnth_mul3, uint64_t, ()) { return svcnth_pat (SV_MUL3); } ++ ++/* ++** cnth_mul4: ++** cnth x0, mul4 ++** ret ++*/ ++PROTO (cnth_mul4, uint64_t, ()) { return svcnth_pat (SV_MUL4); } ++ ++/* ++** cnth_all: ++** cnth x0 ++** ret ++*/ ++PROTO (cnth_all, uint64_t, ()) { return svcnth_pat (SV_ALL); } ++ ++/* ++** inch_32_pow2: ++** inch x0, pow2 ++** ret ++*/ ++PROTO (inch_32_pow2, uint32_t, (uint32_t w0)) { return w0 + svcnth_pat (SV_POW2); } ++ ++/* ++** inch_32_vl1: ++** add w0, w0, #?1 ++** ret ++*/ ++PROTO (inch_32_vl1, uint32_t, (uint32_t w0)) { return w0 + svcnth_pat (SV_VL1); } ++ ++/* ++** inch_32_vl2: ++** add w0, w0, #?2 ++** ret ++*/ ++PROTO (inch_32_vl2, uint32_t, (uint32_t w0)) { return w0 + svcnth_pat (SV_VL2); } ++ ++/* ++** 
inch_32_vl3: ++** add w0, w0, #?3 ++** ret ++*/ ++PROTO (inch_32_vl3, uint32_t, (uint32_t w0)) { return w0 + svcnth_pat (SV_VL3); } ++ ++/* ++** inch_32_vl4: ++** add w0, w0, #?4 ++** ret ++*/ ++PROTO (inch_32_vl4, uint32_t, (uint32_t w0)) { return w0 + svcnth_pat (SV_VL4); } ++ ++/* ++** inch_32_vl5: ++** add w0, w0, #?5 ++** ret ++*/ ++PROTO (inch_32_vl5, uint32_t, (uint32_t w0)) { return w0 + svcnth_pat (SV_VL5); } ++ ++/* ++** inch_32_vl6: ++** add w0, w0, #?6 ++** ret ++*/ ++PROTO (inch_32_vl6, uint32_t, (uint32_t w0)) { return w0 + svcnth_pat (SV_VL6); } ++ ++/* ++** inch_32_vl7: ++** add w0, w0, #?7 ++** ret ++*/ ++PROTO (inch_32_vl7, uint32_t, (uint32_t w0)) { return w0 + svcnth_pat (SV_VL7); } ++ ++/* ++** inch_32_vl8: ++** add w0, w0, #?8 ++** ret ++*/ ++PROTO (inch_32_vl8, uint32_t, (uint32_t w0)) { return w0 + svcnth_pat (SV_VL8); } ++ ++/* ++** inch_32_vl16: ++** inch x0, vl16 ++** ret ++*/ ++PROTO (inch_32_vl16, uint32_t, (uint32_t w0)) { return w0 + svcnth_pat (SV_VL16); } ++ ++/* ++** inch_32_vl32: ++** inch x0, vl32 ++** ret ++*/ ++PROTO (inch_32_vl32, uint32_t, (uint32_t w0)) { return w0 + svcnth_pat (SV_VL32); } ++ ++/* ++** inch_32_vl64: ++** inch x0, vl64 ++** ret ++*/ ++PROTO (inch_32_vl64, uint32_t, (uint32_t w0)) { return w0 + svcnth_pat (SV_VL64); } ++ ++/* ++** inch_32_vl128: ++** inch x0, vl128 ++** ret ++*/ ++PROTO (inch_32_vl128, uint32_t, (uint32_t w0)) { return w0 + svcnth_pat (SV_VL128); } ++ ++/* ++** inch_32_vl256: ++** inch x0, vl256 ++** ret ++*/ ++PROTO (inch_32_vl256, uint32_t, (uint32_t w0)) { return w0 + svcnth_pat (SV_VL256); } ++ ++/* ++** inch_32_mul3: ++** inch x0, mul3 ++** ret ++*/ ++PROTO (inch_32_mul3, uint32_t, (uint32_t w0)) { return w0 + svcnth_pat (SV_MUL3); } ++ ++/* ++** inch_32_mul4: ++** inch x0, mul4 ++** ret ++*/ ++PROTO (inch_32_mul4, uint32_t, (uint32_t w0)) { return w0 + svcnth_pat (SV_MUL4); } ++ ++/* ++** inch_32_all: ++** inch x0 ++** ret ++*/ ++PROTO (inch_32_all, uint32_t, (uint32_t w0)) { return w0 + svcnth_pat (SV_ALL); } ++ ++/* ++** inch_64_pow2: ++** inch x0, pow2 ++** ret ++*/ ++PROTO (inch_64_pow2, uint64_t, (uint64_t x0)) { return x0 + svcnth_pat (SV_POW2); } ++ ++/* ++** inch_64_all: ++** inch x0 ++** ret ++*/ ++PROTO (inch_64_all, uint64_t, (uint64_t x0)) { return x0 + svcnth_pat (SV_ALL); } ++ ++/* ++** dech_32_pow2: ++** dech x0, pow2 ++** ret ++*/ ++PROTO (dech_32_pow2, uint32_t, (uint32_t w0)) { return w0 - svcnth_pat (SV_POW2); } ++ ++/* ++** dech_32_vl1: ++** sub w0, w0, #?1 ++** ret ++*/ ++PROTO (dech_32_vl1, uint32_t, (uint32_t w0)) { return w0 - svcnth_pat (SV_VL1); } ++ ++/* ++** dech_32_vl2: ++** sub w0, w0, #?2 ++** ret ++*/ ++PROTO (dech_32_vl2, uint32_t, (uint32_t w0)) { return w0 - svcnth_pat (SV_VL2); } ++ ++/* ++** dech_32_vl3: ++** sub w0, w0, #?3 ++** ret ++*/ ++PROTO (dech_32_vl3, uint32_t, (uint32_t w0)) { return w0 - svcnth_pat (SV_VL3); } ++ ++/* ++** dech_32_vl4: ++** sub w0, w0, #?4 ++** ret ++*/ ++PROTO (dech_32_vl4, uint32_t, (uint32_t w0)) { return w0 - svcnth_pat (SV_VL4); } ++ ++/* ++** dech_32_vl5: ++** sub w0, w0, #?5 ++** ret ++*/ ++PROTO (dech_32_vl5, uint32_t, (uint32_t w0)) { return w0 - svcnth_pat (SV_VL5); } ++ ++/* ++** dech_32_vl6: ++** sub w0, w0, #?6 ++** ret ++*/ ++PROTO (dech_32_vl6, uint32_t, (uint32_t w0)) { return w0 - svcnth_pat (SV_VL6); } ++ ++/* ++** dech_32_vl7: ++** sub w0, w0, #?7 ++** ret ++*/ ++PROTO (dech_32_vl7, uint32_t, (uint32_t w0)) { return w0 - svcnth_pat (SV_VL7); } ++ ++/* ++** dech_32_vl8: ++** sub w0, w0, #?8 ++** ret ++*/ ++PROTO (dech_32_vl8, 
uint32_t, (uint32_t w0)) { return w0 - svcnth_pat (SV_VL8); } ++ ++/* ++** dech_32_vl16: ++** dech x0, vl16 ++** ret ++*/ ++PROTO (dech_32_vl16, uint32_t, (uint32_t w0)) { return w0 - svcnth_pat (SV_VL16); } ++ ++/* ++** dech_32_vl32: ++** dech x0, vl32 ++** ret ++*/ ++PROTO (dech_32_vl32, uint32_t, (uint32_t w0)) { return w0 - svcnth_pat (SV_VL32); } ++ ++/* ++** dech_32_vl64: ++** dech x0, vl64 ++** ret ++*/ ++PROTO (dech_32_vl64, uint32_t, (uint32_t w0)) { return w0 - svcnth_pat (SV_VL64); } ++ ++/* ++** dech_32_vl128: ++** dech x0, vl128 ++** ret ++*/ ++PROTO (dech_32_vl128, uint32_t, (uint32_t w0)) { return w0 - svcnth_pat (SV_VL128); } ++ ++/* ++** dech_32_vl256: ++** dech x0, vl256 ++** ret ++*/ ++PROTO (dech_32_vl256, uint32_t, (uint32_t w0)) { return w0 - svcnth_pat (SV_VL256); } ++ ++/* ++** dech_32_mul3: ++** dech x0, mul3 ++** ret ++*/ ++PROTO (dech_32_mul3, uint32_t, (uint32_t w0)) { return w0 - svcnth_pat (SV_MUL3); } ++ ++/* ++** dech_32_mul4: ++** dech x0, mul4 ++** ret ++*/ ++PROTO (dech_32_mul4, uint32_t, (uint32_t w0)) { return w0 - svcnth_pat (SV_MUL4); } ++ ++/* ++** dech_32_all: ++** dech x0 ++** ret ++*/ ++PROTO (dech_32_all, uint32_t, (uint32_t w0)) { return w0 - svcnth_pat (SV_ALL); } ++ ++/* ++** dech_64_pow2: ++** dech x0, pow2 ++** ret ++*/ ++PROTO (dech_64_pow2, uint64_t, (uint64_t x0)) { return x0 - svcnth_pat (SV_POW2); } ++ ++/* ++** dech_64_all: ++** dech x0 ++** ret ++*/ ++PROTO (dech_64_all, uint64_t, (uint64_t x0)) { return x0 - svcnth_pat (SV_ALL); } ++ ++/* ++** inch_s16_pow2_z0: ++** inch z0\.h, pow2 ++** ret ++*/ ++TEST_UNIFORM_Z (inch_s16_pow2_z0, svint16_t, ++ z0 = svadd_n_s16_x (svptrue_b16 (), z0, svcnth_pat (SV_POW2)), ++ z0 = svadd_x (svptrue_b16 (), z0, svcnth_pat (SV_POW2))); ++ ++/* ++** inch_s16_pow2_z1: ++** movprfx z0, z1 ++** inch z0\.h, pow2 ++** ret ++*/ ++TEST_UNIFORM_Z (inch_s16_pow2_z1, svint16_t, ++ z0 = svadd_n_s16_x (svptrue_b16 (), z1, svcnth_pat (SV_POW2)), ++ z0 = svadd_x (svptrue_b16 (), z1, svcnth_pat (SV_POW2))); ++ ++/* ++** dech_s16_pow2_z0: ++** dech z0\.h, pow2 ++** ret ++*/ ++TEST_UNIFORM_Z (dech_s16_pow2_z0, svint16_t, ++ z0 = svsub_n_s16_x (svptrue_b16 (), z0, svcnth_pat (SV_POW2)), ++ z0 = svsub_x (svptrue_b16 (), z0, svcnth_pat (SV_POW2))); ++ ++/* ++** dech_s16_pow2_z1: ++** movprfx z0, z1 ++** dech z0\.h, pow2 ++** ret ++*/ ++TEST_UNIFORM_Z (dech_s16_pow2_z1, svint16_t, ++ z0 = svsub_n_s16_x (svptrue_b16 (), z1, svcnth_pat (SV_POW2)), ++ z0 = svsub_x (svptrue_b16 (), z1, svcnth_pat (SV_POW2))); +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntp_b16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntp_b16.c +new file mode 100644 +index 000000000..d88b9e5f3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntp_b16.c +@@ -0,0 +1,243 @@ ++/* { dg-additional-options "-msve-vector-bits=scalable" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++#include ++ ++/* ++** cnt_b16_32: ++** cntp x0, p0, p1\.h ++** ret ++*/ ++TEST_PTEST (cnt_b16_32, uint32_t, ++ x0 = svcntp_b16 (p0, p1)); ++ ++/* ++** cnt_b16_64: ++** cntp x0, p0, p1\.h ++** ret ++*/ ++TEST_PTEST (cnt_b16_64, uint64_t, ++ x0 = svcntp_b16 (p0, p1)); ++ ++/* ++** inc_b16_32_general_x0: ++** cntp x([0-9]+), p0, p1\.h ++** add w0, (w0, w\1|w\1, w0) ++** ret ++*/ ++TEST_PTEST (inc_b16_32_general_x0, uint32_t, ++ x0 += svcntp_b16 (p0, p1)); ++ ++/* ++** inc_b16_32_general_x1: ++** cntp x([0-9]+), p0, p1\.h ++** add w0, (w1, w\1|w\1, w1) ++** ret ++*/ ++TEST_PTEST 
(inc_b16_32_general_x1, uint32_t, ++ x0 = x1 + svcntp_b16 (p0, p1)); ++ ++/* ++** inc_b16_32_ptrue_x0: ++** incp x0, p1\.h ++** ret ++*/ ++TEST_PTEST (inc_b16_32_ptrue_x0, uint32_t, ++ x0 += svcntp_b16 (svptrue_b16 (), p1)); ++ ++/* ++** inc_b16_32_ptrue_x1: ++** mov w0, w1 ++** incp x0, p1\.h ++** ret ++*/ ++TEST_PTEST (inc_b16_32_ptrue_x1, uint32_t, ++ x0 = x1 + svcntp_b16 (svptrue_b16 (), p1)); ++ ++/* ++** inc_b16_64_general_x0: ++** cntp (x[0-9]+), p0, p1\.h ++** add x0, (x0, \1|\1, x0) ++** ret ++*/ ++TEST_PTEST (inc_b16_64_general_x0, uint64_t, ++ x0 += svcntp_b16 (p0, p1)); ++ ++/* ++** inc_b16_64_general_x1: ++** cntp (x[0-9]+), p0, p1\.h ++** add x0, (x1, \1|\1, x1) ++** ret ++*/ ++TEST_PTEST (inc_b16_64_general_x1, uint64_t, ++ x0 = x1 + svcntp_b16 (p0, p1)); ++ ++/* ++** inc_b16_64_ptrue_x0: ++** incp x0, p1\.h ++** ret ++*/ ++TEST_PTEST (inc_b16_64_ptrue_x0, uint64_t, ++ x0 += svcntp_b16 (svptrue_b16 (), p1)); ++ ++/* ++** inc_b16_64_ptrue_x1: ++** mov x0, x1 ++** incp x0, p1\.h ++** ret ++*/ ++TEST_PTEST (inc_b16_64_ptrue_x1, uint64_t, ++ x0 = x1 + svcntp_b16 (svptrue_b16 (), p1)); ++ ++/* ++** dec_b16_32_general_x0: ++** cntp x([0-9]+), p0, p1\.h ++** sub w0, w0, w\1 ++** ret ++*/ ++TEST_PTEST (dec_b16_32_general_x0, uint32_t, ++ x0 -= svcntp_b16 (p0, p1)); ++ ++/* ++** dec_b16_32_general_x1: ++** cntp x([0-9]+), p0, p1\.h ++** sub w0, w1, w\1 ++** ret ++*/ ++TEST_PTEST (dec_b16_32_general_x1, uint32_t, ++ x0 = x1 - svcntp_b16 (p0, p1)); ++ ++/* ++** dec_b16_32_ptrue_x0: ++** decp x0, p1\.h ++** ret ++*/ ++TEST_PTEST (dec_b16_32_ptrue_x0, uint32_t, ++ x0 -= svcntp_b16 (svptrue_b16 (), p1)); ++ ++/* ++** dec_b16_32_ptrue_x1: ++** mov w0, w1 ++** decp x0, p1\.h ++** ret ++*/ ++TEST_PTEST (dec_b16_32_ptrue_x1, uint32_t, ++ x0 = x1 - svcntp_b16 (svptrue_b16 (), p1)); ++ ++/* ++** dec_b16_64_general_x0: ++** cntp (x[0-9]+), p0, p1\.h ++** sub x0, x0, \1 ++** ret ++*/ ++TEST_PTEST (dec_b16_64_general_x0, uint64_t, ++ x0 -= svcntp_b16 (p0, p1)); ++ ++/* ++** dec_b16_64_general_x1: ++** cntp (x[0-9]+), p0, p1\.h ++** sub x0, x1, \1 ++** ret ++*/ ++TEST_PTEST (dec_b16_64_general_x1, uint64_t, ++ x0 = x1 - svcntp_b16 (p0, p1)); ++ ++/* ++** dec_b16_64_ptrue_x0: ++** decp x0, p1\.h ++** ret ++*/ ++TEST_PTEST (dec_b16_64_ptrue_x0, uint64_t, ++ x0 -= svcntp_b16 (svptrue_b16 (), p1)); ++ ++/* ++** dec_b16_64_ptrue_x1: ++** mov x0, x1 ++** decp x0, p1\.h ++** ret ++*/ ++TEST_PTEST (dec_b16_64_ptrue_x1, uint64_t, ++ x0 = x1 - svcntp_b16 (svptrue_b16 (), p1)); ++ ++/* ++** inc_b16_u16_general_z0: ++** cntp x([0-9]+), p0, p1\.h ++** mov (z[0-9]+\.h), w\1 ++** add z0\.h, (z0\.h, \2|\2, z0\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (inc_b16_u16_general_z0, svuint16_t, ++ z0 = svadd_n_u16_x (svptrue_b16 (), z0, svcntp_b16 (p0, p1)), ++ z0 = svadd_x (svptrue_b16 (), z0, svcntp_b16 (p0, p1))); ++ ++/* ++** inc_b16_u16_general_z1: ++** cntp x([0-9]+), p0, p1\.h ++** mov (z[0-9]+\.h), w\1 ++** add z0\.h, (z1\.h, \2|\2, z1\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (inc_b16_u16_general_z1, svuint16_t, ++ z0 = svadd_n_u16_x (svptrue_b16 (), z1, svcntp_b16 (p0, p1)), ++ z0 = svadd_x (svptrue_b16 (), z1, svcntp_b16 (p0, p1))); ++ ++/* ++** inc_b16_u16_ptrue_z0: ++** incp z0\.h, p0 ++** ret ++*/ ++TEST_UNIFORM_Z (inc_b16_u16_ptrue_z0, svuint16_t, ++ z0 = svadd_n_u16_x (svptrue_b16 (), z0, svcntp_b16 (svptrue_b16 (), p0)), ++ z0 = svadd_x (svptrue_b16 (), z0, svcntp_b16 (svptrue_b16 (), p0))); ++ ++/* ++** inc_b16_u16_ptrue_z1: ++** movprfx z0, z1 ++** incp z0\.h, p0 ++** ret ++*/ ++TEST_UNIFORM_Z (inc_b16_u16_ptrue_z1, 
svuint16_t, ++ z0 = svadd_n_u16_x (svptrue_b16 (), z1, svcntp_b16 (svptrue_b16 (), p0)), ++ z0 = svadd_x (svptrue_b16 (), z1, svcntp_b16 (svptrue_b16 (), p0))); ++ ++/* ++** dec_b16_u16_general_z0: ++** cntp x([0-9]+), p0, p1\.h ++** mov (z[0-9]+\.h), w\1 ++** sub z0\.h, z0\.h, \2 ++** ret ++*/ ++TEST_UNIFORM_Z (dec_b16_u16_general_z0, svuint16_t, ++ z0 = svsub_n_u16_x (svptrue_b16 (), z0, svcntp_b16 (p0, p1)), ++ z0 = svsub_x (svptrue_b16 (), z0, svcntp_b16 (p0, p1))); ++ ++/* ++** dec_b16_u16_general_z1: ++** cntp x([0-9]+), p0, p1\.h ++** mov (z[0-9]+\.h), w\1 ++** sub z0\.h, z1\.h, \2 ++** ret ++*/ ++TEST_UNIFORM_Z (dec_b16_u16_general_z1, svuint16_t, ++ z0 = svsub_n_u16_x (svptrue_b16 (), z1, svcntp_b16 (p0, p1)), ++ z0 = svsub_x (svptrue_b16 (), z1, svcntp_b16 (p0, p1))); ++ ++/* ++** dec_b16_u16_ptrue_z0: ++** decp z0\.h, p0 ++** ret ++*/ ++TEST_UNIFORM_Z (dec_b16_u16_ptrue_z0, svuint16_t, ++ z0 = svsub_n_u16_x (svptrue_b16 (), z0, svcntp_b16 (svptrue_b16 (), p0)), ++ z0 = svsub_x (svptrue_b16 (), z0, svcntp_b16 (svptrue_b16 (), p0))); ++ ++/* ++** dec_b16_u16_ptrue_z1: ++** movprfx z0, z1 ++** decp z0\.h, p0 ++** ret ++*/ ++TEST_UNIFORM_Z (dec_b16_u16_ptrue_z1, svuint16_t, ++ z0 = svsub_n_u16_x (svptrue_b16 (), z1, svcntp_b16 (svptrue_b16 (), p0)), ++ z0 = svsub_x (svptrue_b16 (), z1, svcntp_b16 (svptrue_b16 (), p0))); +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntp_b32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntp_b32.c +new file mode 100644 +index 000000000..0da818895 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntp_b32.c +@@ -0,0 +1,243 @@ ++/* { dg-additional-options "-msve-vector-bits=scalable" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++#include ++ ++/* ++** cnt_b32_32: ++** cntp x0, p0, p1\.s ++** ret ++*/ ++TEST_PTEST (cnt_b32_32, uint32_t, ++ x0 = svcntp_b32 (p0, p1)); ++ ++/* ++** cnt_b32_64: ++** cntp x0, p0, p1\.s ++** ret ++*/ ++TEST_PTEST (cnt_b32_64, uint64_t, ++ x0 = svcntp_b32 (p0, p1)); ++ ++/* ++** inc_b32_32_general_x0: ++** cntp x([0-9]+), p0, p1\.s ++** add w0, (w0, w\1|w\1, w0) ++** ret ++*/ ++TEST_PTEST (inc_b32_32_general_x0, uint32_t, ++ x0 += svcntp_b32 (p0, p1)); ++ ++/* ++** inc_b32_32_general_x1: ++** cntp x([0-9]+), p0, p1\.s ++** add w0, (w1, w\1|w\1, w1) ++** ret ++*/ ++TEST_PTEST (inc_b32_32_general_x1, uint32_t, ++ x0 = x1 + svcntp_b32 (p0, p1)); ++ ++/* ++** inc_b32_32_ptrue_x0: ++** incp x0, p1\.s ++** ret ++*/ ++TEST_PTEST (inc_b32_32_ptrue_x0, uint32_t, ++ x0 += svcntp_b32 (svptrue_b32 (), p1)); ++ ++/* ++** inc_b32_32_ptrue_x1: ++** mov w0, w1 ++** incp x0, p1\.s ++** ret ++*/ ++TEST_PTEST (inc_b32_32_ptrue_x1, uint32_t, ++ x0 = x1 + svcntp_b32 (svptrue_b32 (), p1)); ++ ++/* ++** inc_b32_64_general_x0: ++** cntp (x[0-9]+), p0, p1\.s ++** add x0, (x0, \1|\1, x0) ++** ret ++*/ ++TEST_PTEST (inc_b32_64_general_x0, uint64_t, ++ x0 += svcntp_b32 (p0, p1)); ++ ++/* ++** inc_b32_64_general_x1: ++** cntp (x[0-9]+), p0, p1\.s ++** add x0, (x1, \1|\1, x1) ++** ret ++*/ ++TEST_PTEST (inc_b32_64_general_x1, uint64_t, ++ x0 = x1 + svcntp_b32 (p0, p1)); ++ ++/* ++** inc_b32_64_ptrue_x0: ++** incp x0, p1\.s ++** ret ++*/ ++TEST_PTEST (inc_b32_64_ptrue_x0, uint64_t, ++ x0 += svcntp_b32 (svptrue_b32 (), p1)); ++ ++/* ++** inc_b32_64_ptrue_x1: ++** mov x0, x1 ++** incp x0, p1\.s ++** ret ++*/ ++TEST_PTEST (inc_b32_64_ptrue_x1, uint64_t, ++ x0 = x1 + svcntp_b32 (svptrue_b32 (), p1)); ++ ++/* ++** dec_b32_32_general_x0: ++** cntp x([0-9]+), p0, p1\.s 
++** sub w0, w0, w\1 ++** ret ++*/ ++TEST_PTEST (dec_b32_32_general_x0, uint32_t, ++ x0 -= svcntp_b32 (p0, p1)); ++ ++/* ++** dec_b32_32_general_x1: ++** cntp x([0-9]+), p0, p1\.s ++** sub w0, w1, w\1 ++** ret ++*/ ++TEST_PTEST (dec_b32_32_general_x1, uint32_t, ++ x0 = x1 - svcntp_b32 (p0, p1)); ++ ++/* ++** dec_b32_32_ptrue_x0: ++** decp x0, p1\.s ++** ret ++*/ ++TEST_PTEST (dec_b32_32_ptrue_x0, uint32_t, ++ x0 -= svcntp_b32 (svptrue_b32 (), p1)); ++ ++/* ++** dec_b32_32_ptrue_x1: ++** mov w0, w1 ++** decp x0, p1\.s ++** ret ++*/ ++TEST_PTEST (dec_b32_32_ptrue_x1, uint32_t, ++ x0 = x1 - svcntp_b32 (svptrue_b32 (), p1)); ++ ++/* ++** dec_b32_64_general_x0: ++** cntp (x[0-9]+), p0, p1\.s ++** sub x0, x0, \1 ++** ret ++*/ ++TEST_PTEST (dec_b32_64_general_x0, uint64_t, ++ x0 -= svcntp_b32 (p0, p1)); ++ ++/* ++** dec_b32_64_general_x1: ++** cntp (x[0-9]+), p0, p1\.s ++** sub x0, x1, \1 ++** ret ++*/ ++TEST_PTEST (dec_b32_64_general_x1, uint64_t, ++ x0 = x1 - svcntp_b32 (p0, p1)); ++ ++/* ++** dec_b32_64_ptrue_x0: ++** decp x0, p1\.s ++** ret ++*/ ++TEST_PTEST (dec_b32_64_ptrue_x0, uint64_t, ++ x0 -= svcntp_b32 (svptrue_b32 (), p1)); ++ ++/* ++** dec_b32_64_ptrue_x1: ++** mov x0, x1 ++** decp x0, p1\.s ++** ret ++*/ ++TEST_PTEST (dec_b32_64_ptrue_x1, uint64_t, ++ x0 = x1 - svcntp_b32 (svptrue_b32 (), p1)); ++ ++/* ++** inc_b32_s32_general_z0: ++** cntp x([0-9]+), p0, p1\.s ++** mov (z[0-9]+\.s), w\1 ++** add z0\.s, (z0\.s, \2|\2, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (inc_b32_s32_general_z0, svint32_t, ++ z0 = svadd_n_s32_x (svptrue_b32 (), z0, svcntp_b32 (p0, p1)), ++ z0 = svadd_x (svptrue_b32 (), z0, svcntp_b32 (p0, p1))); ++ ++/* ++** inc_b32_s32_general_z1: ++** cntp x([0-9]+), p0, p1\.s ++** mov (z[0-9]+\.s), w\1 ++** add z0\.s, (z1\.s, \2|\2, z1\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (inc_b32_s32_general_z1, svint32_t, ++ z0 = svadd_n_s32_x (svptrue_b32 (), z1, svcntp_b32 (p0, p1)), ++ z0 = svadd_x (svptrue_b32 (), z1, svcntp_b32 (p0, p1))); ++ ++/* ++** inc_b32_s32_ptrue_z0: ++** incp z0\.s, p0 ++** ret ++*/ ++TEST_UNIFORM_Z (inc_b32_s32_ptrue_z0, svint32_t, ++ z0 = svadd_n_s32_x (svptrue_b32 (), z0, svcntp_b32 (svptrue_b32 (), p0)), ++ z0 = svadd_x (svptrue_b32 (), z0, svcntp_b32 (svptrue_b32 (), p0))); ++ ++/* ++** inc_b32_s32_ptrue_z1: ++** movprfx z0, z1 ++** incp z0\.s, p0 ++** ret ++*/ ++TEST_UNIFORM_Z (inc_b32_s32_ptrue_z1, svint32_t, ++ z0 = svadd_n_s32_x (svptrue_b32 (), z1, svcntp_b32 (svptrue_b32 (), p0)), ++ z0 = svadd_x (svptrue_b32 (), z1, svcntp_b32 (svptrue_b32 (), p0))); ++ ++/* ++** dec_b32_s32_general_z0: ++** cntp x([0-9]+), p0, p1\.s ++** mov (z[0-9]+\.s), w\1 ++** sub z0\.s, z0\.s, \2 ++** ret ++*/ ++TEST_UNIFORM_Z (dec_b32_s32_general_z0, svint32_t, ++ z0 = svsub_n_s32_x (svptrue_b32 (), z0, svcntp_b32 (p0, p1)), ++ z0 = svsub_x (svptrue_b32 (), z0, svcntp_b32 (p0, p1))); ++ ++/* ++** dec_b32_s32_general_z1: ++** cntp x([0-9]+), p0, p1\.s ++** mov (z[0-9]+\.s), w\1 ++** sub z0\.s, z1\.s, \2 ++** ret ++*/ ++TEST_UNIFORM_Z (dec_b32_s32_general_z1, svint32_t, ++ z0 = svsub_n_s32_x (svptrue_b32 (), z1, svcntp_b32 (p0, p1)), ++ z0 = svsub_x (svptrue_b32 (), z1, svcntp_b32 (p0, p1))); ++ ++/* ++** dec_b32_s32_ptrue_z0: ++** decp z0\.s, p0 ++** ret ++*/ ++TEST_UNIFORM_Z (dec_b32_s32_ptrue_z0, svint32_t, ++ z0 = svsub_n_s32_x (svptrue_b32 (), z0, svcntp_b32 (svptrue_b32 (), p0)), ++ z0 = svsub_x (svptrue_b32 (), z0, svcntp_b32 (svptrue_b32 (), p0))); ++ ++/* ++** dec_b32_s32_ptrue_z1: ++** movprfx z0, z1 ++** decp z0\.s, p0 ++** ret ++*/ ++TEST_UNIFORM_Z (dec_b32_s32_ptrue_z1, 
svint32_t, ++ z0 = svsub_n_s32_x (svptrue_b32 (), z1, svcntp_b32 (svptrue_b32 (), p0)), ++ z0 = svsub_x (svptrue_b32 (), z1, svcntp_b32 (svptrue_b32 (), p0))); +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntp_b64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntp_b64.c +new file mode 100644 +index 000000000..6ddbaef5a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntp_b64.c +@@ -0,0 +1,243 @@ ++/* { dg-additional-options "-msve-vector-bits=scalable" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++#include ++ ++/* ++** cnt_b64_32: ++** cntp x0, p0, p1\.d ++** ret ++*/ ++TEST_PTEST (cnt_b64_32, uint32_t, ++ x0 = svcntp_b64 (p0, p1)); ++ ++/* ++** cnt_b64_64: ++** cntp x0, p0, p1\.d ++** ret ++*/ ++TEST_PTEST (cnt_b64_64, uint64_t, ++ x0 = svcntp_b64 (p0, p1)); ++ ++/* ++** inc_b64_32_general_x0: ++** cntp x([0-9]+), p0, p1\.d ++** add w0, (w0, w\1|w\1, w0) ++** ret ++*/ ++TEST_PTEST (inc_b64_32_general_x0, uint32_t, ++ x0 += svcntp_b64 (p0, p1)); ++ ++/* ++** inc_b64_32_general_x1: ++** cntp x([0-9]+), p0, p1\.d ++** add w0, (w1, w\1|w\1, w1) ++** ret ++*/ ++TEST_PTEST (inc_b64_32_general_x1, uint32_t, ++ x0 = x1 + svcntp_b64 (p0, p1)); ++ ++/* ++** inc_b64_32_ptrue_x0: ++** incp x0, p1\.d ++** ret ++*/ ++TEST_PTEST (inc_b64_32_ptrue_x0, uint32_t, ++ x0 += svcntp_b64 (svptrue_b64 (), p1)); ++ ++/* ++** inc_b64_32_ptrue_x1: ++** mov w0, w1 ++** incp x0, p1\.d ++** ret ++*/ ++TEST_PTEST (inc_b64_32_ptrue_x1, uint32_t, ++ x0 = x1 + svcntp_b64 (svptrue_b64 (), p1)); ++ ++/* ++** inc_b64_64_general_x0: ++** cntp (x[0-9]+), p0, p1\.d ++** add x0, (x0, \1|\1, x0) ++** ret ++*/ ++TEST_PTEST (inc_b64_64_general_x0, uint64_t, ++ x0 += svcntp_b64 (p0, p1)); ++ ++/* ++** inc_b64_64_general_x1: ++** cntp (x[0-9]+), p0, p1\.d ++** add x0, (x1, \1|\1, x1) ++** ret ++*/ ++TEST_PTEST (inc_b64_64_general_x1, uint64_t, ++ x0 = x1 + svcntp_b64 (p0, p1)); ++ ++/* ++** inc_b64_64_ptrue_x0: ++** incp x0, p1\.d ++** ret ++*/ ++TEST_PTEST (inc_b64_64_ptrue_x0, uint64_t, ++ x0 += svcntp_b64 (svptrue_b64 (), p1)); ++ ++/* ++** inc_b64_64_ptrue_x1: ++** mov x0, x1 ++** incp x0, p1\.d ++** ret ++*/ ++TEST_PTEST (inc_b64_64_ptrue_x1, uint64_t, ++ x0 = x1 + svcntp_b64 (svptrue_b64 (), p1)); ++ ++/* ++** dec_b64_32_general_x0: ++** cntp x([0-9]+), p0, p1\.d ++** sub w0, w0, w\1 ++** ret ++*/ ++TEST_PTEST (dec_b64_32_general_x0, uint32_t, ++ x0 -= svcntp_b64 (p0, p1)); ++ ++/* ++** dec_b64_32_general_x1: ++** cntp x([0-9]+), p0, p1\.d ++** sub w0, w1, w\1 ++** ret ++*/ ++TEST_PTEST (dec_b64_32_general_x1, uint32_t, ++ x0 = x1 - svcntp_b64 (p0, p1)); ++ ++/* ++** dec_b64_32_ptrue_x0: ++** decp x0, p1\.d ++** ret ++*/ ++TEST_PTEST (dec_b64_32_ptrue_x0, uint32_t, ++ x0 -= svcntp_b64 (svptrue_b64 (), p1)); ++ ++/* ++** dec_b64_32_ptrue_x1: ++** mov w0, w1 ++** decp x0, p1\.d ++** ret ++*/ ++TEST_PTEST (dec_b64_32_ptrue_x1, uint32_t, ++ x0 = x1 - svcntp_b64 (svptrue_b64 (), p1)); ++ ++/* ++** dec_b64_64_general_x0: ++** cntp (x[0-9]+), p0, p1\.d ++** sub x0, x0, \1 ++** ret ++*/ ++TEST_PTEST (dec_b64_64_general_x0, uint64_t, ++ x0 -= svcntp_b64 (p0, p1)); ++ ++/* ++** dec_b64_64_general_x1: ++** cntp (x[0-9]+), p0, p1\.d ++** sub x0, x1, \1 ++** ret ++*/ ++TEST_PTEST (dec_b64_64_general_x1, uint64_t, ++ x0 = x1 - svcntp_b64 (p0, p1)); ++ ++/* ++** dec_b64_64_ptrue_x0: ++** decp x0, p1\.d ++** ret ++*/ ++TEST_PTEST (dec_b64_64_ptrue_x0, uint64_t, ++ x0 -= svcntp_b64 (svptrue_b64 (), p1)); ++ ++/* ++** dec_b64_64_ptrue_x1: ++** 
mov x0, x1 ++** decp x0, p1\.d ++** ret ++*/ ++TEST_PTEST (dec_b64_64_ptrue_x1, uint64_t, ++ x0 = x1 - svcntp_b64 (svptrue_b64 (), p1)); ++ ++/* ++** inc_b64_u64_general_z0: ++** cntp (x[0-9]+), p0, p1\.d ++** mov (z[0-9]+\.d), \1 ++** add z0\.d, (z0\.d, \2|\2, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (inc_b64_u64_general_z0, svuint64_t, ++ z0 = svadd_n_u64_x (svptrue_b64 (), z0, svcntp_b64 (p0, p1)), ++ z0 = svadd_x (svptrue_b64 (), z0, svcntp_b64 (p0, p1))); ++ ++/* ++** inc_b64_u64_general_z1: ++** cntp (x[0-9]+), p0, p1\.d ++** mov (z[0-9]+\.d), \1 ++** add z0\.d, (z1\.d, \2|\2, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (inc_b64_u64_general_z1, svuint64_t, ++ z0 = svadd_n_u64_x (svptrue_b64 (), z1, svcntp_b64 (p0, p1)), ++ z0 = svadd_x (svptrue_b64 (), z1, svcntp_b64 (p0, p1))); ++ ++/* ++** inc_b64_u64_ptrue_z0: ++** incp z0\.d, p0 ++** ret ++*/ ++TEST_UNIFORM_Z (inc_b64_u64_ptrue_z0, svuint64_t, ++ z0 = svadd_n_u64_x (svptrue_b64 (), z0, svcntp_b64 (svptrue_b64 (), p0)), ++ z0 = svadd_x (svptrue_b64 (), z0, svcntp_b64 (svptrue_b64 (), p0))); ++ ++/* ++** inc_b64_u64_ptrue_z1: ++** movprfx z0, z1 ++** incp z0\.d, p0 ++** ret ++*/ ++TEST_UNIFORM_Z (inc_b64_u64_ptrue_z1, svuint64_t, ++ z0 = svadd_n_u64_x (svptrue_b64 (), z1, svcntp_b64 (svptrue_b64 (), p0)), ++ z0 = svadd_x (svptrue_b64 (), z1, svcntp_b64 (svptrue_b64 (), p0))); ++ ++/* ++** dec_b64_u64_general_z0: ++** cntp (x[0-9]+), p0, p1\.d ++** mov (z[0-9]+\.d), \1 ++** sub z0\.d, z0\.d, \2 ++** ret ++*/ ++TEST_UNIFORM_Z (dec_b64_u64_general_z0, svuint64_t, ++ z0 = svsub_n_u64_x (svptrue_b64 (), z0, svcntp_b64 (p0, p1)), ++ z0 = svsub_x (svptrue_b64 (), z0, svcntp_b64 (p0, p1))); ++ ++/* ++** dec_b64_u64_general_z1: ++** cntp (x[0-9]+), p0, p1\.d ++** mov (z[0-9]+\.d), \1 ++** sub z0\.d, z1\.d, \2 ++** ret ++*/ ++TEST_UNIFORM_Z (dec_b64_u64_general_z1, svuint64_t, ++ z0 = svsub_n_u64_x (svptrue_b64 (), z1, svcntp_b64 (p0, p1)), ++ z0 = svsub_x (svptrue_b64 (), z1, svcntp_b64 (p0, p1))); ++ ++/* ++** dec_b64_u64_ptrue_z0: ++** decp z0\.d, p0 ++** ret ++*/ ++TEST_UNIFORM_Z (dec_b64_u64_ptrue_z0, svuint64_t, ++ z0 = svsub_n_u64_x (svptrue_b64 (), z0, svcntp_b64 (svptrue_b64 (), p0)), ++ z0 = svsub_x (svptrue_b64 (), z0, svcntp_b64 (svptrue_b64 (), p0))); ++ ++/* ++** dec_b64_u64_ptrue_z1: ++** movprfx z0, z1 ++** decp z0\.d, p0 ++** ret ++*/ ++TEST_UNIFORM_Z (dec_b64_u64_ptrue_z1, svuint64_t, ++ z0 = svsub_n_u64_x (svptrue_b64 (), z1, svcntp_b64 (svptrue_b64 (), p0)), ++ z0 = svsub_x (svptrue_b64 (), z1, svcntp_b64 (svptrue_b64 (), p0))); +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntp_b8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntp_b8.c +new file mode 100644 +index 000000000..e02c02cd6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntp_b8.c +@@ -0,0 +1,253 @@ ++/* { dg-additional-options "-msve-vector-bits=scalable" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++#include ++ ++/* ++** cnt_b8_32: ++** cntp x0, p0, p1\.b ++** ret ++*/ ++TEST_PTEST (cnt_b8_32, uint32_t, ++ x0 = svcntp_b8 (p0, p1)); ++ ++/* ++** cnt_b8_64: ++** cntp x0, p0, p1\.b ++** ret ++*/ ++TEST_PTEST (cnt_b8_64, uint64_t, ++ x0 = svcntp_b8 (p0, p1)); ++ ++/* ++** inc_b8_32_general_x0: ++** cntp x([0-9]+), p0, p1\.b ++** add w0, (w0, w\1|w\1, w0) ++** ret ++*/ ++TEST_PTEST (inc_b8_32_general_x0, uint32_t, ++ x0 += svcntp_b8 (p0, p1)); ++ ++/* ++** inc_b8_32_general_x1: ++** cntp x([0-9]+), p0, p1\.b ++** add w0, (w1, w\1|w\1, w1) ++** ret ++*/ ++TEST_PTEST 
(inc_b8_32_general_x1, uint32_t, ++ x0 = x1 + svcntp_b8 (p0, p1)); ++ ++/* ++** inc_b8_32_ptrue_x0: ++** incp x0, p1\.b ++** ret ++*/ ++TEST_PTEST (inc_b8_32_ptrue_x0, uint32_t, ++ x0 += svcntp_b8 (svptrue_b8 (), p1)); ++ ++/* ++** inc_b8_32_ptrue_x1: ++** mov w0, w1 ++** incp x0, p1\.b ++** ret ++*/ ++TEST_PTEST (inc_b8_32_ptrue_x1, uint32_t, ++ x0 = x1 + svcntp_b8 (svptrue_b8 (), p1)); ++ ++/* ++** inc_b8_64_general_x0: ++** cntp (x[0-9]+), p0, p1\.b ++** add x0, (x0, \1|\1, x0) ++** ret ++*/ ++TEST_PTEST (inc_b8_64_general_x0, uint64_t, ++ x0 += svcntp_b8 (p0, p1)); ++ ++/* ++** inc_b8_64_general_x1: ++** cntp (x[0-9]+), p0, p1\.b ++** add x0, (x1, \1|\1, x1) ++** ret ++*/ ++TEST_PTEST (inc_b8_64_general_x1, uint64_t, ++ x0 = x1 + svcntp_b8 (p0, p1)); ++ ++/* ++** inc_b8_64_ptrue_x0: ++** incp x0, p1\.b ++** ret ++*/ ++TEST_PTEST (inc_b8_64_ptrue_x0, uint64_t, ++ x0 += svcntp_b8 (svptrue_b8 (), p1)); ++ ++/* ++** inc_b8_64_ptrue_x1: ++** mov x0, x1 ++** incp x0, p1\.b ++** ret ++*/ ++TEST_PTEST (inc_b8_64_ptrue_x1, uint64_t, ++ x0 = x1 + svcntp_b8 (svptrue_b8 (), p1)); ++ ++/* ++** dec_b8_32_general_x0: ++** cntp x([0-9]+), p0, p1\.b ++** sub w0, w0, w\1 ++** ret ++*/ ++TEST_PTEST (dec_b8_32_general_x0, uint32_t, ++ x0 -= svcntp_b8 (p0, p1)); ++ ++/* ++** dec_b8_32_general_x1: ++** cntp x([0-9]+), p0, p1\.b ++** sub w0, w1, w\1 ++** ret ++*/ ++TEST_PTEST (dec_b8_32_general_x1, uint32_t, ++ x0 = x1 - svcntp_b8 (p0, p1)); ++ ++/* ++** dec_b8_32_ptrue_x0: ++** decp x0, p1\.b ++** ret ++*/ ++TEST_PTEST (dec_b8_32_ptrue_x0, uint32_t, ++ x0 -= svcntp_b8 (svptrue_b8 (), p1)); ++ ++/* ++** dec_b8_32_ptrue_x1: ++** mov w0, w1 ++** decp x0, p1\.b ++** ret ++*/ ++TEST_PTEST (dec_b8_32_ptrue_x1, uint32_t, ++ x0 = x1 - svcntp_b8 (svptrue_b8 (), p1)); ++ ++/* ++** dec_b8_64_general_x0: ++** cntp (x[0-9]+), p0, p1\.b ++** sub x0, x0, \1 ++** ret ++*/ ++TEST_PTEST (dec_b8_64_general_x0, uint64_t, ++ x0 -= svcntp_b8 (p0, p1)); ++ ++/* ++** dec_b8_64_general_x1: ++** cntp (x[0-9]+), p0, p1\.b ++** sub x0, x1, \1 ++** ret ++*/ ++TEST_PTEST (dec_b8_64_general_x1, uint64_t, ++ x0 = x1 - svcntp_b8 (p0, p1)); ++ ++/* ++** dec_b8_64_ptrue_x0: ++** decp x0, p1\.b ++** ret ++*/ ++TEST_PTEST (dec_b8_64_ptrue_x0, uint64_t, ++ x0 -= svcntp_b8 (svptrue_b8 (), p1)); ++ ++/* ++** dec_b8_64_ptrue_x1: ++** mov x0, x1 ++** decp x0, p1\.b ++** ret ++*/ ++TEST_PTEST (dec_b8_64_ptrue_x1, uint64_t, ++ x0 = x1 - svcntp_b8 (svptrue_b8 (), p1)); ++ ++/* ++** inc_b8_s8_general_z0: ++** cntp x([0-9]+), p0, p1\.b ++** mov (z[0-9]+\.b), w\1 ++** add z0\.b, (z0\.b, \2|\2, z0\.b) ++** ret ++*/ ++TEST_UNIFORM_Z (inc_b8_s8_general_z0, svint8_t, ++ z0 = svadd_n_s8_x (svptrue_b8 (), z0, svcntp_b8 (p0, p1)), ++ z0 = svadd_x (svptrue_b8 (), z0, svcntp_b8 (p0, p1))); ++ ++/* ++** inc_b8_s8_general_z1: ++** cntp x([0-9]+), p0, p1\.b ++** mov (z[0-9]+\.b), w\1 ++** add z0\.b, (z1\.b, \2|\2, z1\.b) ++** ret ++*/ ++TEST_UNIFORM_Z (inc_b8_s8_general_z1, svint8_t, ++ z0 = svadd_n_s8_x (svptrue_b8 (), z1, svcntp_b8 (p0, p1)), ++ z0 = svadd_x (svptrue_b8 (), z1, svcntp_b8 (p0, p1))); ++ ++/* ++** inc_b8_s8_ptrue_z0: ++** ptrue (p[0-7])\.b, all ++** cntp x([0-9]+), \1, p0\.b ++** mov (z[0-9]+\.b), w\2 ++** add z0\.b, (z0\.b, \3|\3, z0\.b) ++** ret ++*/ ++TEST_UNIFORM_Z (inc_b8_s8_ptrue_z0, svint8_t, ++ z0 = svadd_n_s8_x (svptrue_b8 (), z0, svcntp_b8 (svptrue_b8 (), p0)), ++ z0 = svadd_x (svptrue_b8 (), z0, svcntp_b8 (svptrue_b8 (), p0))); ++ ++/* ++** inc_b8_s8_ptrue_z1: ++** ptrue (p[0-7])\.b, all ++** cntp x([0-9]+), \1, p0\.b ++** mov 
(z[0-9]+\.b), w\2 ++** add z0\.b, (z1\.b, \3|\3, z1\.b) ++** ret ++*/ ++TEST_UNIFORM_Z (inc_b8_s8_ptrue_z1, svint8_t, ++ z0 = svadd_n_s8_x (svptrue_b8 (), z1, svcntp_b8 (svptrue_b8 (), p0)), ++ z0 = svadd_x (svptrue_b8 (), z1, svcntp_b8 (svptrue_b8 (), p0))); ++ ++/* ++** dec_b8_s8_general_z0: ++** cntp x([0-9]+), p0, p1\.b ++** mov (z[0-9]+\.b), w\1 ++** sub z0\.b, z0\.b, \2 ++** ret ++*/ ++TEST_UNIFORM_Z (dec_b8_s8_general_z0, svint8_t, ++ z0 = svsub_n_s8_x (svptrue_b8 (), z0, svcntp_b8 (p0, p1)), ++ z0 = svsub_x (svptrue_b8 (), z0, svcntp_b8 (p0, p1))); ++ ++/* ++** dec_b8_s8_general_z1: ++** cntp x([0-9]+), p0, p1\.b ++** mov (z[0-9]+\.b), w\1 ++** sub z0\.b, z1\.b, \2 ++** ret ++*/ ++TEST_UNIFORM_Z (dec_b8_s8_general_z1, svint8_t, ++ z0 = svsub_n_s8_x (svptrue_b8 (), z1, svcntp_b8 (p0, p1)), ++ z0 = svsub_x (svptrue_b8 (), z1, svcntp_b8 (p0, p1))); ++ ++/* ++** dec_b8_s8_ptrue_z0: ++** ptrue (p[0-7])\.b, all ++** cntp x([0-9]+), \1, p0\.b ++** mov (z[0-9]+\.b), w\2 ++** sub z0\.b, z0\.b, \3 ++** ret ++*/ ++TEST_UNIFORM_Z (dec_b8_s8_ptrue_z0, svint8_t, ++ z0 = svsub_n_s8_x (svptrue_b8 (), z0, svcntp_b8 (svptrue_b8 (), p0)), ++ z0 = svsub_x (svptrue_b8 (), z0, svcntp_b8 (svptrue_b8 (), p0))); ++ ++/* ++** dec_b8_s8_ptrue_z1: ++** ptrue (p[0-7])\.b, all ++** cntp x([0-9]+), \1, p0\.b ++** mov (z[0-9]+\.b), w\2 ++** sub z0\.b, z1\.b, \3 ++** ret ++*/ ++TEST_UNIFORM_Z (dec_b8_s8_ptrue_z1, svint8_t, ++ z0 = svsub_n_s8_x (svptrue_b8 (), z1, svcntp_b8 (svptrue_b8 (), p0)), ++ z0 = svsub_x (svptrue_b8 (), z1, svcntp_b8 (svptrue_b8 (), p0))); +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntw.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntw.c +new file mode 100644 +index 000000000..e26cc67a4 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntw.c +@@ -0,0 +1,279 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cntw_1: ++** cntw x0 ++** ret ++*/ ++PROTO (cntw_1, uint64_t, ()) { return svcntw (); } ++ ++/* ++** cntw_2: ++** cnth x0 ++** ret ++*/ ++PROTO (cntw_2, uint64_t, ()) { return svcntw () * 2; } ++ ++/* ++** cntw_3: ++** cntw x0, all, mul #3 ++** ret ++*/ ++PROTO (cntw_3, uint64_t, ()) { return svcntw () * 3; } ++ ++/* ++** cntw_4: ++** cntb x0 ++** ret ++*/ ++PROTO (cntw_4, uint64_t, ()) { return svcntw () * 4; } ++ ++/* ++** cntw_8: ++** cntb x0, all, mul #2 ++** ret ++*/ ++PROTO (cntw_8, uint64_t, ()) { return svcntw () * 8; } ++ ++/* ++** cntw_15: ++** cntw x0, all, mul #15 ++** ret ++*/ ++PROTO (cntw_15, uint64_t, ()) { return svcntw () * 15; } ++ ++/* ++** cntw_16: ++** cntb x0, all, mul #4 ++** ret ++*/ ++PROTO (cntw_16, uint64_t, ()) { return svcntw () * 16; } ++ ++/* Other sequences would be OK. 
*/ ++/* ++** cntw_17: ++** cntb x0, all, mul #4 ++** incw x0 ++** ret ++*/ ++PROTO (cntw_17, uint64_t, ()) { return svcntw () * 17; } ++ ++/* ++** cntw_32: ++** cntb x0, all, mul #8 ++** ret ++*/ ++PROTO (cntw_32, uint64_t, ()) { return svcntw () * 32; } ++ ++/* ++** cntw_64: ++** cntb x0, all, mul #16 ++** ret ++*/ ++PROTO (cntw_64, uint64_t, ()) { return svcntw () * 64; } ++ ++/* ++** cntw_128: ++** cntd (x[0-9]+) ++** lsl x0, \1, 8 ++** ret ++*/ ++PROTO (cntw_128, uint64_t, ()) { return svcntw () * 128; } ++ ++/* ++** cntw_m1: ++** cntw (x[0-9]+) ++** neg x0, \1 ++** ret ++*/ ++PROTO (cntw_m1, uint64_t, ()) { return -svcntw (); } ++ ++/* ++** cntw_m13: ++** cntw (x[0-9]+), all, mul #13 ++** neg x0, \1 ++** ret ++*/ ++PROTO (cntw_m13, uint64_t, ()) { return -svcntw () * 13; } ++ ++/* ++** cntw_m15: ++** cntw (x[0-9]+), all, mul #15 ++** neg x0, \1 ++** ret ++*/ ++PROTO (cntw_m15, uint64_t, ()) { return -svcntw () * 15; } ++ ++/* ++** cntw_m16: ++** cntb (x[0-9]+), all, mul #4 ++** neg x0, \1 ++** ret ++*/ ++PROTO (cntw_m16, uint64_t, ()) { return -svcntw () * 16; } ++ ++/* Other sequences would be OK. */ ++/* ++** cntw_m17: ++** cntb x0, all, mul #4 ++** incw x0 ++** neg x0, x0 ++** ret ++*/ ++PROTO (cntw_m17, uint64_t, ()) { return -svcntw () * 17; } ++ ++/* ++** incw_1: ++** incw x0 ++** ret ++*/ ++PROTO (incw_1, uint64_t, (uint64_t x0)) { return x0 + svcntw (); } ++ ++/* ++** incw_2: ++** inch x0 ++** ret ++*/ ++PROTO (incw_2, uint64_t, (uint64_t x0)) { return x0 + svcntw () * 2; } ++ ++/* ++** incw_3: ++** incw x0, all, mul #3 ++** ret ++*/ ++PROTO (incw_3, uint64_t, (uint64_t x0)) { return x0 + svcntw () * 3; } ++ ++/* ++** incw_4: ++** incb x0 ++** ret ++*/ ++PROTO (incw_4, uint64_t, (uint64_t x0)) { return x0 + svcntw () * 4; } ++ ++/* ++** incw_7: ++** incw x0, all, mul #7 ++** ret ++*/ ++PROTO (incw_7, uint64_t, (uint64_t x0)) { return x0 + svcntw () * 7; } ++ ++/* ++** incw_8: ++** incb x0, all, mul #2 ++** ret ++*/ ++PROTO (incw_8, uint64_t, (uint64_t x0)) { return x0 + svcntw () * 8; } ++ ++/* ++** incw_9: ++** incw x0, all, mul #9 ++** ret ++*/ ++PROTO (incw_9, uint64_t, (uint64_t x0)) { return x0 + svcntw () * 9; } ++ ++/* ++** incw_15: ++** incw x0, all, mul #15 ++** ret ++*/ ++PROTO (incw_15, uint64_t, (uint64_t x0)) { return x0 + svcntw () * 15; } ++ ++/* ++** incw_16: ++** incb x0, all, mul #4 ++** ret ++*/ ++PROTO (incw_16, uint64_t, (uint64_t x0)) { return x0 + svcntw () * 16; } ++ ++/* ++** incw_18: ++** inch x0, all, mul #9 ++** ret ++*/ ++PROTO (incw_18, uint64_t, (uint64_t x0)) { return x0 + svcntw () * 18; } ++ ++/* ++** incw_30: ++** inch x0, all, mul #15 ++** ret ++*/ ++PROTO (incw_30, uint64_t, (uint64_t x0)) { return x0 + svcntw () * 30; } ++ ++/* ++** decw_1: ++** decw x0 ++** ret ++*/ ++PROTO (decw_1, uint64_t, (uint64_t x0)) { return x0 - svcntw (); } ++ ++/* ++** decw_2: ++** dech x0 ++** ret ++*/ ++PROTO (decw_2, uint64_t, (uint64_t x0)) { return x0 - svcntw () * 2; } ++ ++/* ++** decw_3: ++** decw x0, all, mul #3 ++** ret ++*/ ++PROTO (decw_3, uint64_t, (uint64_t x0)) { return x0 - svcntw () * 3; } ++ ++/* ++** decw_4: ++** decb x0 ++** ret ++*/ ++PROTO (decw_4, uint64_t, (uint64_t x0)) { return x0 - svcntw () * 4; } ++ ++/* ++** decw_7: ++** decw x0, all, mul #7 ++** ret ++*/ ++PROTO (decw_7, uint64_t, (uint64_t x0)) { return x0 - svcntw () * 7; } ++ ++/* ++** decw_8: ++** decb x0, all, mul #2 ++** ret ++*/ ++PROTO (decw_8, uint64_t, (uint64_t x0)) { return x0 - svcntw () * 8; } ++ ++/* ++** decw_9: ++** decw x0, all, mul #9 ++** ret ++*/ ++PROTO 
(decw_9, uint64_t, (uint64_t x0)) { return x0 - svcntw () * 9; } ++ ++/* ++** decw_15: ++** decw x0, all, mul #15 ++** ret ++*/ ++PROTO (decw_15, uint64_t, (uint64_t x0)) { return x0 - svcntw () * 15; } ++ ++/* ++** decw_16: ++** decb x0, all, mul #4 ++** ret ++*/ ++PROTO (decw_16, uint64_t, (uint64_t x0)) { return x0 - svcntw () * 16; } ++ ++/* ++** decw_18: ++** dech x0, all, mul #9 ++** ret ++*/ ++PROTO (decw_18, uint64_t, (uint64_t x0)) { return x0 - svcntw () * 18; } ++ ++/* ++** decw_30: ++** dech x0, all, mul #15 ++** ret ++*/ ++PROTO (decw_30, uint64_t, (uint64_t x0)) { return x0 - svcntw () * 30; } +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntw_pat.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntw_pat.c +new file mode 100644 +index 000000000..ff6b7d882 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntw_pat.c +@@ -0,0 +1,426 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cntw_pow2: ++** cntw x0, pow2 ++** ret ++*/ ++PROTO (cntw_pow2, uint64_t, ()) { return svcntw_pat (SV_POW2); } ++ ++/* ++** cntw_vl1: ++** mov x0, #?1 ++** ret ++*/ ++PROTO (cntw_vl1, uint64_t, ()) { return svcntw_pat (SV_VL1); } ++ ++/* ++** cntw_vl2: ++** mov x0, #?2 ++** ret ++*/ ++PROTO (cntw_vl2, uint64_t, ()) { return svcntw_pat (SV_VL2); } ++ ++/* ++** cntw_vl3: ++** mov x0, #?3 ++** ret ++*/ ++PROTO (cntw_vl3, uint64_t, ()) { return svcntw_pat (SV_VL3); } ++ ++/* ++** cntw_vl4: ++** mov x0, #?4 ++** ret ++*/ ++PROTO (cntw_vl4, uint64_t, ()) { return svcntw_pat (SV_VL4); } ++ ++/* ++** cntw_vl5: ++** cntw x0, vl5 ++** ret ++*/ ++PROTO (cntw_vl5, uint64_t, ()) { return svcntw_pat (SV_VL5); } ++ ++/* ++** cntw_vl6: ++** cntw x0, vl6 ++** ret ++*/ ++PROTO (cntw_vl6, uint64_t, ()) { return svcntw_pat (SV_VL6); } ++ ++/* ++** cntw_vl7: ++** cntw x0, vl7 ++** ret ++*/ ++PROTO (cntw_vl7, uint64_t, ()) { return svcntw_pat (SV_VL7); } ++ ++/* ++** cntw_vl8: ++** cntw x0, vl8 ++** ret ++*/ ++PROTO (cntw_vl8, uint64_t, ()) { return svcntw_pat (SV_VL8); } ++ ++/* ++** cntw_vl16: ++** cntw x0, vl16 ++** ret ++*/ ++PROTO (cntw_vl16, uint64_t, ()) { return svcntw_pat (SV_VL16); } ++ ++/* ++** cntw_vl32: ++** cntw x0, vl32 ++** ret ++*/ ++PROTO (cntw_vl32, uint64_t, ()) { return svcntw_pat (SV_VL32); } ++ ++/* ++** cntw_vl64: ++** cntw x0, vl64 ++** ret ++*/ ++PROTO (cntw_vl64, uint64_t, ()) { return svcntw_pat (SV_VL64); } ++ ++/* ++** cntw_vl128: ++** cntw x0, vl128 ++** ret ++*/ ++PROTO (cntw_vl128, uint64_t, ()) { return svcntw_pat (SV_VL128); } ++ ++/* ++** cntw_vl256: ++** cntw x0, vl256 ++** ret ++*/ ++PROTO (cntw_vl256, uint64_t, ()) { return svcntw_pat (SV_VL256); } ++ ++/* ++** cntw_mul3: ++** cntw x0, mul3 ++** ret ++*/ ++PROTO (cntw_mul3, uint64_t, ()) { return svcntw_pat (SV_MUL3); } ++ ++/* ++** cntw_mul4: ++** cntw x0, mul4 ++** ret ++*/ ++PROTO (cntw_mul4, uint64_t, ()) { return svcntw_pat (SV_MUL4); } ++ ++/* ++** cntw_all: ++** cntw x0 ++** ret ++*/ ++PROTO (cntw_all, uint64_t, ()) { return svcntw_pat (SV_ALL); } ++ ++/* ++** incw_32_pow2: ++** incw x0, pow2 ++** ret ++*/ ++PROTO (incw_32_pow2, uint32_t, (uint32_t w0)) { return w0 + svcntw_pat (SV_POW2); } ++ ++/* ++** incw_32_vl1: ++** add w0, w0, #?1 ++** ret ++*/ ++PROTO (incw_32_vl1, uint32_t, (uint32_t w0)) { return w0 + svcntw_pat (SV_VL1); } ++ ++/* ++** incw_32_vl2: ++** add w0, w0, #?2 ++** ret ++*/ ++PROTO (incw_32_vl2, uint32_t, (uint32_t w0)) { return w0 + svcntw_pat (SV_VL2); } ++ ++/* ++** incw_32_vl3: ++** add w0, w0, #?3 ++** 
ret ++*/ ++PROTO (incw_32_vl3, uint32_t, (uint32_t w0)) { return w0 + svcntw_pat (SV_VL3); } ++ ++/* ++** incw_32_vl4: ++** add w0, w0, #?4 ++** ret ++*/ ++PROTO (incw_32_vl4, uint32_t, (uint32_t w0)) { return w0 + svcntw_pat (SV_VL4); } ++ ++/* ++** incw_32_vl5: ++** incw x0, vl5 ++** ret ++*/ ++PROTO (incw_32_vl5, uint32_t, (uint32_t w0)) { return w0 + svcntw_pat (SV_VL5); } ++ ++/* ++** incw_32_vl6: ++** incw x0, vl6 ++** ret ++*/ ++PROTO (incw_32_vl6, uint32_t, (uint32_t w0)) { return w0 + svcntw_pat (SV_VL6); } ++ ++/* ++** incw_32_vl7: ++** incw x0, vl7 ++** ret ++*/ ++PROTO (incw_32_vl7, uint32_t, (uint32_t w0)) { return w0 + svcntw_pat (SV_VL7); } ++ ++/* ++** incw_32_vl8: ++** incw x0, vl8 ++** ret ++*/ ++PROTO (incw_32_vl8, uint32_t, (uint32_t w0)) { return w0 + svcntw_pat (SV_VL8); } ++ ++/* ++** incw_32_vl16: ++** incw x0, vl16 ++** ret ++*/ ++PROTO (incw_32_vl16, uint32_t, (uint32_t w0)) { return w0 + svcntw_pat (SV_VL16); } ++ ++/* ++** incw_32_vl32: ++** incw x0, vl32 ++** ret ++*/ ++PROTO (incw_32_vl32, uint32_t, (uint32_t w0)) { return w0 + svcntw_pat (SV_VL32); } ++ ++/* ++** incw_32_vl64: ++** incw x0, vl64 ++** ret ++*/ ++PROTO (incw_32_vl64, uint32_t, (uint32_t w0)) { return w0 + svcntw_pat (SV_VL64); } ++ ++/* ++** incw_32_vl128: ++** incw x0, vl128 ++** ret ++*/ ++PROTO (incw_32_vl128, uint32_t, (uint32_t w0)) { return w0 + svcntw_pat (SV_VL128); } ++ ++/* ++** incw_32_vl256: ++** incw x0, vl256 ++** ret ++*/ ++PROTO (incw_32_vl256, uint32_t, (uint32_t w0)) { return w0 + svcntw_pat (SV_VL256); } ++ ++/* ++** incw_32_mul3: ++** incw x0, mul3 ++** ret ++*/ ++PROTO (incw_32_mul3, uint32_t, (uint32_t w0)) { return w0 + svcntw_pat (SV_MUL3); } ++ ++/* ++** incw_32_mul4: ++** incw x0, mul4 ++** ret ++*/ ++PROTO (incw_32_mul4, uint32_t, (uint32_t w0)) { return w0 + svcntw_pat (SV_MUL4); } ++ ++/* ++** incw_32_all: ++** incw x0 ++** ret ++*/ ++PROTO (incw_32_all, uint32_t, (uint32_t w0)) { return w0 + svcntw_pat (SV_ALL); } ++ ++/* ++** incw_64_pow2: ++** incw x0, pow2 ++** ret ++*/ ++PROTO (incw_64_pow2, uint64_t, (uint64_t x0)) { return x0 + svcntw_pat (SV_POW2); } ++ ++/* ++** incw_64_all: ++** incw x0 ++** ret ++*/ ++PROTO (incw_64_all, uint64_t, (uint64_t x0)) { return x0 + svcntw_pat (SV_ALL); } ++ ++/* ++** decw_32_pow2: ++** decw x0, pow2 ++** ret ++*/ ++PROTO (decw_32_pow2, uint32_t, (uint32_t w0)) { return w0 - svcntw_pat (SV_POW2); } ++ ++/* ++** decw_32_vl1: ++** sub w0, w0, #?1 ++** ret ++*/ ++PROTO (decw_32_vl1, uint32_t, (uint32_t w0)) { return w0 - svcntw_pat (SV_VL1); } ++ ++/* ++** decw_32_vl2: ++** sub w0, w0, #?2 ++** ret ++*/ ++PROTO (decw_32_vl2, uint32_t, (uint32_t w0)) { return w0 - svcntw_pat (SV_VL2); } ++ ++/* ++** decw_32_vl3: ++** sub w0, w0, #?3 ++** ret ++*/ ++PROTO (decw_32_vl3, uint32_t, (uint32_t w0)) { return w0 - svcntw_pat (SV_VL3); } ++ ++/* ++** decw_32_vl4: ++** sub w0, w0, #?4 ++** ret ++*/ ++PROTO (decw_32_vl4, uint32_t, (uint32_t w0)) { return w0 - svcntw_pat (SV_VL4); } ++ ++/* ++** decw_32_vl5: ++** decw x0, vl5 ++** ret ++*/ ++PROTO (decw_32_vl5, uint32_t, (uint32_t w0)) { return w0 - svcntw_pat (SV_VL5); } ++ ++/* ++** decw_32_vl6: ++** decw x0, vl6 ++** ret ++*/ ++PROTO (decw_32_vl6, uint32_t, (uint32_t w0)) { return w0 - svcntw_pat (SV_VL6); } ++ ++/* ++** decw_32_vl7: ++** decw x0, vl7 ++** ret ++*/ ++PROTO (decw_32_vl7, uint32_t, (uint32_t w0)) { return w0 - svcntw_pat (SV_VL7); } ++ ++/* ++** decw_32_vl8: ++** decw x0, vl8 ++** ret ++*/ ++PROTO (decw_32_vl8, uint32_t, (uint32_t w0)) { return w0 - svcntw_pat (SV_VL8); } ++ 
++/* ++** decw_32_vl16: ++** decw x0, vl16 ++** ret ++*/ ++PROTO (decw_32_vl16, uint32_t, (uint32_t w0)) { return w0 - svcntw_pat (SV_VL16); } ++ ++/* ++** decw_32_vl32: ++** decw x0, vl32 ++** ret ++*/ ++PROTO (decw_32_vl32, uint32_t, (uint32_t w0)) { return w0 - svcntw_pat (SV_VL32); } ++ ++/* ++** decw_32_vl64: ++** decw x0, vl64 ++** ret ++*/ ++PROTO (decw_32_vl64, uint32_t, (uint32_t w0)) { return w0 - svcntw_pat (SV_VL64); } ++ ++/* ++** decw_32_vl128: ++** decw x0, vl128 ++** ret ++*/ ++PROTO (decw_32_vl128, uint32_t, (uint32_t w0)) { return w0 - svcntw_pat (SV_VL128); } ++ ++/* ++** decw_32_vl256: ++** decw x0, vl256 ++** ret ++*/ ++PROTO (decw_32_vl256, uint32_t, (uint32_t w0)) { return w0 - svcntw_pat (SV_VL256); } ++ ++/* ++** decw_32_mul3: ++** decw x0, mul3 ++** ret ++*/ ++PROTO (decw_32_mul3, uint32_t, (uint32_t w0)) { return w0 - svcntw_pat (SV_MUL3); } ++ ++/* ++** decw_32_mul4: ++** decw x0, mul4 ++** ret ++*/ ++PROTO (decw_32_mul4, uint32_t, (uint32_t w0)) { return w0 - svcntw_pat (SV_MUL4); } ++ ++/* ++** decw_32_all: ++** decw x0 ++** ret ++*/ ++PROTO (decw_32_all, uint32_t, (uint32_t w0)) { return w0 - svcntw_pat (SV_ALL); } ++ ++/* ++** decw_64_pow2: ++** decw x0, pow2 ++** ret ++*/ ++PROTO (decw_64_pow2, uint64_t, (uint64_t x0)) { return x0 - svcntw_pat (SV_POW2); } ++ ++/* ++** decw_64_all: ++** decw x0 ++** ret ++*/ ++PROTO (decw_64_all, uint64_t, (uint64_t x0)) { return x0 - svcntw_pat (SV_ALL); } ++ ++/* ++** incw_s32_pow2_z0: ++** incw z0\.s, pow2 ++** ret ++*/ ++TEST_UNIFORM_Z (incw_s32_pow2_z0, svint32_t, ++ z0 = svadd_n_s32_x (svptrue_b32 (), z0, svcntw_pat (SV_POW2)), ++ z0 = svadd_x (svptrue_b32 (), z0, svcntw_pat (SV_POW2))); ++ ++/* ++** incw_s32_pow2_z1: ++** movprfx z0, z1 ++** incw z0\.s, pow2 ++** ret ++*/ ++TEST_UNIFORM_Z (incw_s32_pow2_z1, svint32_t, ++ z0 = svadd_n_s32_x (svptrue_b32 (), z1, svcntw_pat (SV_POW2)), ++ z0 = svadd_x (svptrue_b32 (), z1, svcntw_pat (SV_POW2))); ++ ++/* ++** decw_s32_pow2_z0: ++** decw z0\.s, pow2 ++** ret ++*/ ++TEST_UNIFORM_Z (decw_s32_pow2_z0, svint32_t, ++ z0 = svsub_n_s32_x (svptrue_b32 (), z0, svcntw_pat (SV_POW2)), ++ z0 = svsub_x (svptrue_b32 (), z0, svcntw_pat (SV_POW2))); ++ ++/* ++** decw_s32_pow2_z1: ++** movprfx z0, z1 ++** decw z0\.s, pow2 ++** ret ++*/ ++TEST_UNIFORM_Z (decw_s32_pow2_z1, svint32_t, ++ z0 = svsub_n_s32_x (svptrue_b32 (), z1, svcntw_pat (SV_POW2)), ++ z0 = svsub_x (svptrue_b32 (), z1, svcntw_pat (SV_POW2))); +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/compact_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/compact_f32.c +new file mode 100644 +index 000000000..2e80d6830 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/compact_f32.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** compact_f32_tied1: ++** compact z0\.s, p0, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (compact_f32_tied1, svfloat32_t, ++ z0 = svcompact_f32 (p0, z0), ++ z0 = svcompact (p0, z0)) ++ ++/* ++** compact_f32_untied: ++** compact z0\.s, p0, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (compact_f32_untied, svfloat32_t, ++ z0 = svcompact_f32 (p0, z1), ++ z0 = svcompact (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/compact_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/compact_f64.c +new file mode 100644 +index 000000000..e0bc33efe +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/compact_f64.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" 
"-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** compact_f64_tied1: ++** compact z0\.d, p0, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (compact_f64_tied1, svfloat64_t, ++ z0 = svcompact_f64 (p0, z0), ++ z0 = svcompact (p0, z0)) ++ ++/* ++** compact_f64_untied: ++** compact z0\.d, p0, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (compact_f64_untied, svfloat64_t, ++ z0 = svcompact_f64 (p0, z1), ++ z0 = svcompact (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/compact_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/compact_s32.c +new file mode 100644 +index 000000000..e4634982b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/compact_s32.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** compact_s32_tied1: ++** compact z0\.s, p0, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (compact_s32_tied1, svint32_t, ++ z0 = svcompact_s32 (p0, z0), ++ z0 = svcompact (p0, z0)) ++ ++/* ++** compact_s32_untied: ++** compact z0\.s, p0, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (compact_s32_untied, svint32_t, ++ z0 = svcompact_s32 (p0, z1), ++ z0 = svcompact (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/compact_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/compact_s64.c +new file mode 100644 +index 000000000..71cb97b8a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/compact_s64.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** compact_s64_tied1: ++** compact z0\.d, p0, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (compact_s64_tied1, svint64_t, ++ z0 = svcompact_s64 (p0, z0), ++ z0 = svcompact (p0, z0)) ++ ++/* ++** compact_s64_untied: ++** compact z0\.d, p0, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (compact_s64_untied, svint64_t, ++ z0 = svcompact_s64 (p0, z1), ++ z0 = svcompact (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/compact_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/compact_u32.c +new file mode 100644 +index 000000000..954329a0b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/compact_u32.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** compact_u32_tied1: ++** compact z0\.s, p0, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (compact_u32_tied1, svuint32_t, ++ z0 = svcompact_u32 (p0, z0), ++ z0 = svcompact (p0, z0)) ++ ++/* ++** compact_u32_untied: ++** compact z0\.s, p0, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (compact_u32_untied, svuint32_t, ++ z0 = svcompact_u32 (p0, z1), ++ z0 = svcompact (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/compact_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/compact_u64.c +new file mode 100644 +index 000000000..ec664845f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/compact_u64.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** compact_u64_tied1: ++** compact z0\.d, p0, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (compact_u64_tied1, svuint64_t, ++ z0 = svcompact_u64 (p0, z0), ++ z0 = svcompact (p0, z0)) ++ ++/* ++** compact_u64_untied: ++** compact z0\.d, p0, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (compact_u64_untied, svuint64_t, ++ z0 = svcompact_u64 (p0, z1), ++ z0 = svcompact (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/create2_1.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/create2_1.c +new file mode 100644 +index 000000000..e9158ed8a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/create2_1.c +@@ -0,0 +1,123 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** create2_s8: ++** mov z0\.d, z6\.d ++** mov z1\.d, z4\.d ++** ret ++*/ ++TEST_CREATE (create2_s8, svint8x2_t, svint8_t, ++ z0 = svcreate2_s8 (z6, z4), ++ z0 = svcreate2 (z6, z4)) ++ ++/* ++** create2_u8: ++** mov z0\.d, z4\.d ++** mov z1\.d, z6\.d ++** ret ++*/ ++TEST_CREATE (create2_u8, svuint8x2_t, svuint8_t, ++ z0 = svcreate2_u8 (z4, z6), ++ z0 = svcreate2 (z4, z6)) ++ ++/* ++** create2_s16: ++** mov z0\.d, z6\.d ++** mov z1\.d, z4\.d ++** ret ++*/ ++TEST_CREATE (create2_s16, svint16x2_t, svint16_t, ++ z0 = svcreate2_s16 (z6, z4), ++ z0 = svcreate2 (z6, z4)) ++ ++/* ++** create2_u16: ++** mov z0\.d, z6\.d ++** mov z1\.d, z5\.d ++** ret ++*/ ++TEST_CREATE (create2_u16, svuint16x2_t, svuint16_t, ++ z0 = svcreate2_u16 (z6, z5), ++ z0 = svcreate2 (z6, z5)) ++ ++/* ++** create2_bf16: ++** mov z0\.d, z4\.d ++** mov z1\.d, z5\.d ++** ret ++*/ ++TEST_CREATE (create2_bf16, svbfloat16x2_t, svbfloat16_t, ++ z0 = svcreate2_bf16 (z4, z5), ++ z0 = svcreate2 (z4, z5)) ++ ++/* ++** create2_f16: ++** mov z0\.d, z4\.d ++** mov z1\.d, z5\.d ++** ret ++*/ ++TEST_CREATE (create2_f16, svfloat16x2_t, svfloat16_t, ++ z0 = svcreate2_f16 (z4, z5), ++ z0 = svcreate2 (z4, z5)) ++ ++/* ++** create2_s32: ++** mov z0\.d, z6\.d ++** mov z1\.d, z7\.d ++** ret ++*/ ++TEST_CREATE (create2_s32, svint32x2_t, svint32_t, ++ z0 = svcreate2_s32 (z6, z7), ++ z0 = svcreate2 (z6, z7)) ++ ++/* ++** create2_u32: ++** mov z0\.d, z7\.d ++** mov z1\.d, z5\.d ++** ret ++*/ ++TEST_CREATE (create2_u32, svuint32x2_t, svuint32_t, ++ z0 = svcreate2_u32 (z7, z5), ++ z0 = svcreate2 (z7, z5)) ++ ++/* ++** create2_f32: ++** mov z0\.d, z7\.d ++** mov z1\.d, z4\.d ++** ret ++*/ ++TEST_CREATE (create2_f32, svfloat32x2_t, svfloat32_t, ++ z0 = svcreate2_f32 (z7, z4), ++ z0 = svcreate2 (z7, z4)) ++ ++/* ++** create2_s64: ++** mov z0\.d, z5\.d ++** mov z1\.d, z7\.d ++** ret ++*/ ++TEST_CREATE (create2_s64, svint64x2_t, svint64_t, ++ z0 = svcreate2_s64 (z5, z7), ++ z0 = svcreate2 (z5, z7)) ++ ++/* ++** create2_u64: ++** mov z0\.d, z7\.d ++** mov z1\.d, z6\.d ++** ret ++*/ ++TEST_CREATE (create2_u64, svuint64x2_t, svuint64_t, ++ z0 = svcreate2_u64 (z7, z6), ++ z0 = svcreate2 (z7, z6)) ++ ++/* ++** create2_f64: ++** mov z0\.d, z5\.d ++** mov z1\.d, z4\.d ++** ret ++*/ ++TEST_CREATE (create2_f64, svfloat64x2_t, svfloat64_t, ++ z0 = svcreate2_f64 (z5, z4), ++ z0 = svcreate2 (z5, z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/create3_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/create3_1.c +new file mode 100644 +index 000000000..6f1afb772 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/create3_1.c +@@ -0,0 +1,135 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** create3_s8: ++** mov z0\.d, z6\.d ++** mov z1\.d, z4\.d ++** mov z2\.d, z7\.d ++** ret ++*/ ++TEST_CREATE (create3_s8, svint8x3_t, svint8_t, ++ z0 = svcreate3_s8 (z6, z4, z7), ++ z0 = svcreate3 (z6, z4, z7)) ++ ++/* ++** create3_u8: ++** mov z0\.d, z4\.d ++** mov z1\.d, z6\.d ++** mov z2\.d, z5\.d ++** ret ++*/ ++TEST_CREATE (create3_u8, svuint8x3_t, svuint8_t, ++ z0 = svcreate3_u8 (z4, z6, z5), ++ z0 = svcreate3 (z4, z6, z5)) ++ ++/* ++** create3_s16: ++** mov z0\.d, z6\.d 
++** mov z1\.d, z4\.d ++** mov z2\.d, z5\.d ++** ret ++*/ ++TEST_CREATE (create3_s16, svint16x3_t, svint16_t, ++ z0 = svcreate3_s16 (z6, z4, z5), ++ z0 = svcreate3 (z6, z4, z5)) ++ ++/* ++** create3_u16: ++** mov z0\.d, z6\.d ++** mov z1\.d, z5\.d ++** mov z2\.d, z4\.d ++** ret ++*/ ++TEST_CREATE (create3_u16, svuint16x3_t, svuint16_t, ++ z0 = svcreate3_u16 (z6, z5, z4), ++ z0 = svcreate3 (z6, z5, z4)) ++ ++/* ++** create3_bf16: ++** mov z0\.d, z4\.d ++** mov z1\.d, z5\.d ++** mov z2\.d, z6\.d ++** ret ++*/ ++TEST_CREATE (create3_bf16, svbfloat16x3_t, svbfloat16_t, ++ z0 = svcreate3_bf16 (z4, z5, z6), ++ z0 = svcreate3 (z4, z5, z6)) ++ ++/* ++** create3_f16: ++** mov z0\.d, z4\.d ++** mov z1\.d, z5\.d ++** mov z2\.d, z6\.d ++** ret ++*/ ++TEST_CREATE (create3_f16, svfloat16x3_t, svfloat16_t, ++ z0 = svcreate3_f16 (z4, z5, z6), ++ z0 = svcreate3 (z4, z5, z6)) ++ ++/* ++** create3_s32: ++** mov z0\.d, z6\.d ++** mov z1\.d, z7\.d ++** mov z2\.d, z4\.d ++** ret ++*/ ++TEST_CREATE (create3_s32, svint32x3_t, svint32_t, ++ z0 = svcreate3_s32 (z6, z7, z4), ++ z0 = svcreate3 (z6, z7, z4)) ++ ++/* ++** create3_u32: ++** mov z0\.d, z7\.d ++** mov z1\.d, z5\.d ++** mov z2\.d, z6\.d ++** ret ++*/ ++TEST_CREATE (create3_u32, svuint32x3_t, svuint32_t, ++ z0 = svcreate3_u32 (z7, z5, z6), ++ z0 = svcreate3 (z7, z5, z6)) ++ ++/* ++** create3_f32: ++** mov z0\.d, z7\.d ++** mov z1\.d, z4\.d ++** mov z2\.d, z6\.d ++** ret ++*/ ++TEST_CREATE (create3_f32, svfloat32x3_t, svfloat32_t, ++ z0 = svcreate3_f32 (z7, z4, z6), ++ z0 = svcreate3 (z7, z4, z6)) ++ ++/* ++** create3_s64: ++** mov z0\.d, z5\.d ++** mov z1\.d, z7\.d ++** mov z2\.d, z6\.d ++** ret ++*/ ++TEST_CREATE (create3_s64, svint64x3_t, svint64_t, ++ z0 = svcreate3_s64 (z5, z7, z6), ++ z0 = svcreate3 (z5, z7, z6)) ++ ++/* ++** create3_u64: ++** mov z0\.d, z7\.d ++** mov z1\.d, z6\.d ++** mov z2\.d, z4\.d ++** ret ++*/ ++TEST_CREATE (create3_u64, svuint64x3_t, svuint64_t, ++ z0 = svcreate3_u64 (z7, z6, z4), ++ z0 = svcreate3 (z7, z6, z4)) ++ ++/* ++** create3_f64: ++** mov z0\.d, z5\.d ++** mov z1\.d, z4\.d ++** mov z2\.d, z7\.d ++** ret ++*/ ++TEST_CREATE (create3_f64, svfloat64x3_t, svfloat64_t, ++ z0 = svcreate3_f64 (z5, z4, z7), ++ z0 = svcreate3 (z5, z4, z7)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/create4_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/create4_1.c +new file mode 100644 +index 000000000..a3866286e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/create4_1.c +@@ -0,0 +1,147 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** create4_s8: ++** mov z0\.d, z6\.d ++** mov z1\.d, z4\.d ++** mov z2\.d, z7\.d ++** mov z3\.d, z5\.d ++** ret ++*/ ++TEST_CREATE (create4_s8, svint8x4_t, svint8_t, ++ z0 = svcreate4_s8 (z6, z4, z7, z5), ++ z0 = svcreate4 (z6, z4, z7, z5)) ++ ++/* ++** create4_u8: ++** mov z0\.d, z4\.d ++** mov z1\.d, z6\.d ++** mov z2\.d, z5\.d ++** mov z3\.d, z7\.d ++** ret ++*/ ++TEST_CREATE (create4_u8, svuint8x4_t, svuint8_t, ++ z0 = svcreate4_u8 (z4, z6, z5, z7), ++ z0 = svcreate4 (z4, z6, z5, z7)) ++ ++/* ++** create4_s16: ++** mov z0\.d, z6\.d ++** mov z1\.d, z4\.d ++** mov z2\.d, z5\.d ++** mov z3\.d, z7\.d ++** ret ++*/ ++TEST_CREATE (create4_s16, svint16x4_t, svint16_t, ++ z0 = svcreate4_s16 (z6, z4, z5, z7), ++ z0 = svcreate4 (z6, z4, z5, z7)) ++ ++/* ++** create4_u16: ++** mov z0\.d, z6\.d ++** mov z1\.d, z5\.d ++** mov z2\.d, z4\.d ++** mov z3\.d, z7\.d ++** ret ++*/ ++TEST_CREATE (create4_u16, svuint16x4_t, 
svuint16_t, ++ z0 = svcreate4_u16 (z6, z5, z4, z7), ++ z0 = svcreate4 (z6, z5, z4, z7)) ++ ++/* ++** create4_bf16: ++** mov z0\.d, z4\.d ++** mov z1\.d, z5\.d ++** mov z2\.d, z6\.d ++** mov z3\.d, z7\.d ++** ret ++*/ ++TEST_CREATE (create4_bf16, svbfloat16x4_t, svbfloat16_t, ++ z0 = svcreate4_bf16 (z4, z5, z6, z7), ++ z0 = svcreate4 (z4, z5, z6, z7)) ++ ++/* ++** create4_f16: ++** mov z0\.d, z4\.d ++** mov z1\.d, z5\.d ++** mov z2\.d, z6\.d ++** mov z3\.d, z7\.d ++** ret ++*/ ++TEST_CREATE (create4_f16, svfloat16x4_t, svfloat16_t, ++ z0 = svcreate4_f16 (z4, z5, z6, z7), ++ z0 = svcreate4 (z4, z5, z6, z7)) ++ ++/* ++** create4_s32: ++** mov z0\.d, z6\.d ++** mov z1\.d, z7\.d ++** mov z2\.d, z4\.d ++** mov z3\.d, z5\.d ++** ret ++*/ ++TEST_CREATE (create4_s32, svint32x4_t, svint32_t, ++ z0 = svcreate4_s32 (z6, z7, z4, z5), ++ z0 = svcreate4 (z6, z7, z4, z5)) ++ ++/* ++** create4_u32: ++** mov z0\.d, z7\.d ++** mov z1\.d, z5\.d ++** mov z2\.d, z6\.d ++** mov z3\.d, z7\.d ++** ret ++*/ ++TEST_CREATE (create4_u32, svuint32x4_t, svuint32_t, ++ z0 = svcreate4_u32 (z7, z5, z6, z7), ++ z0 = svcreate4 (z7, z5, z6, z7)) ++ ++/* ++** create4_f32: ++** mov z0\.d, z7\.d ++** mov z1\.d, z4\.d ++** mov z2\.d, z6\.d ++** mov z3\.d, z4\.d ++** ret ++*/ ++TEST_CREATE (create4_f32, svfloat32x4_t, svfloat32_t, ++ z0 = svcreate4_f32 (z7, z4, z6, z4), ++ z0 = svcreate4 (z7, z4, z6, z4)) ++ ++/* ++** create4_s64: ++** mov z0\.d, z5\.d ++** mov z1\.d, z7\.d ++** mov z2\.d, z6\.d ++** mov z3\.d, z6\.d ++** ret ++*/ ++TEST_CREATE (create4_s64, svint64x4_t, svint64_t, ++ z0 = svcreate4_s64 (z5, z7, z6, z6), ++ z0 = svcreate4 (z5, z7, z6, z6)) ++ ++/* ++** create4_u64: ++** mov z0\.d, z7\.d ++** mov z1\.d, z6\.d ++** mov z2\.d, z4\.d ++** mov z3\.d, z5\.d ++** ret ++*/ ++TEST_CREATE (create4_u64, svuint64x4_t, svuint64_t, ++ z0 = svcreate4_u64 (z7, z6, z4, z5), ++ z0 = svcreate4 (z7, z6, z4, z5)) ++ ++/* ++** create4_f64: ++** mov z0\.d, z5\.d ++** mov z1\.d, z4\.d ++** mov z2\.d, z7\.d ++** mov z3\.d, z6\.d ++** ret ++*/ ++TEST_CREATE (create4_f64, svfloat64x4_t, svfloat64_t, ++ z0 = svcreate4_f64 (z5, z4, z7, z6), ++ z0 = svcreate4 (z5, z4, z7, z6)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_bf16.c +new file mode 100644 +index 000000000..52baa1f58 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_bf16.c +@@ -0,0 +1,96 @@ ++/* { dg-additional-options "-march=armv8.2-a+sve+bf16" } */ ++/* { dg-require-effective-target aarch64_asm_bf16_ok } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cvt_bf16_f32_m_tied1: ++** bfcvt z0\.h, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_bf16_f32_m_tied1, svbfloat16_t, svfloat32_t, ++ z0 = svcvt_bf16_f32_m (z0, p0, z4), ++ z0 = svcvt_bf16_m (z0, p0, z4)) ++ ++/* ++** cvt_bf16_f32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** bfcvt z0\.h, p0/m, \1\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_bf16_f32_m_tied2, svbfloat16_t, svfloat32_t, ++ z0_res = svcvt_bf16_f32_m (z4, p0, z0), ++ z0_res = svcvt_bf16_m (z4, p0, z0)) ++ ++/* ++** cvt_bf16_f32_m_untied: ++** movprfx z0, z1 ++** bfcvt z0\.h, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_bf16_f32_m_untied, svbfloat16_t, svfloat32_t, ++ z0 = svcvt_bf16_f32_m (z1, p0, z4), ++ z0 = svcvt_bf16_m (z1, p0, z4)) ++ ++/* ++** cvt_bf16_f32_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, \1\.s ++** bfcvt z0\.h, p0/m, \1\.s ++** ret ++*/ ++TEST_DUAL_Z_REV 
(cvt_bf16_f32_z_tied1, svbfloat16_t, svfloat32_t, ++ z0_res = svcvt_bf16_f32_z (p0, z0), ++ z0_res = svcvt_bf16_z (p0, z0)) ++ ++/* ++** cvt_bf16_f32_z_untied: ++** movprfx z0\.s, p0/z, z4\.s ++** bfcvt z0\.h, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_bf16_f32_z_untied, svbfloat16_t, svfloat32_t, ++ z0 = svcvt_bf16_f32_z (p0, z4), ++ z0 = svcvt_bf16_z (p0, z4)) ++ ++/* ++** cvt_bf16_f32_x_tied1: ++** bfcvt z0\.h, p0/m, z0\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_bf16_f32_x_tied1, svbfloat16_t, svfloat32_t, ++ z0_res = svcvt_bf16_f32_x (p0, z0), ++ z0_res = svcvt_bf16_x (p0, z0)) ++ ++/* ++** cvt_bf16_f32_x_untied: ++** bfcvt z0\.h, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_bf16_f32_x_untied, svbfloat16_t, svfloat32_t, ++ z0 = svcvt_bf16_f32_x (p0, z4), ++ z0 = svcvt_bf16_x (p0, z4)) ++ ++/* ++** ptrue_cvt_bf16_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z_REV (ptrue_cvt_bf16_f32_x_tied1, svbfloat16_t, svfloat32_t, ++ z0_res = svcvt_bf16_f32_x (svptrue_b32 (), z0), ++ z0_res = svcvt_bf16_x (svptrue_b32 (), z0)) ++ ++/* ++** ptrue_cvt_bf16_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z (ptrue_cvt_bf16_f32_x_untied, svbfloat16_t, svfloat32_t, ++ z0 = svcvt_bf16_f32_x (svptrue_b32 (), z4), ++ z0 = svcvt_bf16_x (svptrue_b32 (), z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_f16.c +new file mode 100644 +index 000000000..5dcd48046 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_f16.c +@@ -0,0 +1,731 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cvt_f16_f32_m_tied1: ++** fcvt z0\.h, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_f16_f32_m_tied1, svfloat16_t, svfloat32_t, ++ z0 = svcvt_f16_f32_m (z0, p0, z4), ++ z0 = svcvt_f16_m (z0, p0, z4)) ++ ++/* ++** cvt_f16_f32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** fcvt z0\.h, p0/m, \1\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f16_f32_m_tied2, svfloat16_t, svfloat32_t, ++ z0_res = svcvt_f16_f32_m (z4, p0, z0), ++ z0_res = svcvt_f16_m (z4, p0, z0)) ++ ++/* ++** cvt_f16_f32_m_untied: ++** movprfx z0, z1 ++** fcvt z0\.h, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_f16_f32_m_untied, svfloat16_t, svfloat32_t, ++ z0 = svcvt_f16_f32_m (z1, p0, z4), ++ z0 = svcvt_f16_m (z1, p0, z4)) ++ ++/* ++** cvt_f16_f64_m_tied1: ++** fcvt z0\.h, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cvt_f16_f64_m_tied1, svfloat16_t, svfloat64_t, ++ z0 = svcvt_f16_f64_m (z0, p0, z4), ++ z0 = svcvt_f16_m (z0, p0, z4)) ++ ++/* ++** cvt_f16_f64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z4 ++** fcvt z0\.h, p0/m, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f16_f64_m_tied2, svfloat16_t, svfloat64_t, ++ z0_res = svcvt_f16_f64_m (z4, p0, z0), ++ z0_res = svcvt_f16_m (z4, p0, z0)) ++ ++/* ++** cvt_f16_f64_m_untied: ++** movprfx z0, z1 ++** fcvt z0\.h, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cvt_f16_f64_m_untied, svfloat16_t, svfloat64_t, ++ z0 = svcvt_f16_f64_m (z1, p0, z4), ++ z0 = svcvt_f16_m (z1, p0, z4)) ++ ++/* ++** cvt_f16_s16_m_tied1: ++** scvtf z0\.h, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cvt_f16_s16_m_tied1, svfloat16_t, svint16_t, ++ z0 = svcvt_f16_s16_m (z0, p0, z4), ++ z0 = svcvt_f16_m (z0, p0, z4)) ++ ++/* ++** cvt_f16_s16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** scvtf z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f16_s16_m_tied2, svfloat16_t, svint16_t, ++ z0_res = 
svcvt_f16_s16_m (z4, p0, z0), ++ z0_res = svcvt_f16_m (z4, p0, z0)) ++ ++/* ++** cvt_f16_s16_m_untied: ++** movprfx z0, z1 ++** scvtf z0\.h, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cvt_f16_s16_m_untied, svfloat16_t, svint16_t, ++ z0 = svcvt_f16_s16_m (z1, p0, z4), ++ z0 = svcvt_f16_m (z1, p0, z4)) ++ ++/* ++** cvt_f16_s32_m_tied1: ++** scvtf z0\.h, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_f16_s32_m_tied1, svfloat16_t, svint32_t, ++ z0 = svcvt_f16_s32_m (z0, p0, z4), ++ z0 = svcvt_f16_m (z0, p0, z4)) ++ ++/* ++** cvt_f16_s32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** scvtf z0\.h, p0/m, \1\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f16_s32_m_tied2, svfloat16_t, svint32_t, ++ z0_res = svcvt_f16_s32_m (z4, p0, z0), ++ z0_res = svcvt_f16_m (z4, p0, z0)) ++ ++/* ++** cvt_f16_s32_m_untied: ++** movprfx z0, z1 ++** scvtf z0\.h, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_f16_s32_m_untied, svfloat16_t, svint32_t, ++ z0 = svcvt_f16_s32_m (z1, p0, z4), ++ z0 = svcvt_f16_m (z1, p0, z4)) ++ ++/* ++** cvt_f16_s64_m_tied1: ++** scvtf z0\.h, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cvt_f16_s64_m_tied1, svfloat16_t, svint64_t, ++ z0 = svcvt_f16_s64_m (z0, p0, z4), ++ z0 = svcvt_f16_m (z0, p0, z4)) ++ ++/* ++** cvt_f16_s64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z4 ++** scvtf z0\.h, p0/m, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f16_s64_m_tied2, svfloat16_t, svint64_t, ++ z0_res = svcvt_f16_s64_m (z4, p0, z0), ++ z0_res = svcvt_f16_m (z4, p0, z0)) ++ ++/* ++** cvt_f16_s64_m_untied: ++** movprfx z0, z1 ++** scvtf z0\.h, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cvt_f16_s64_m_untied, svfloat16_t, svint64_t, ++ z0 = svcvt_f16_s64_m (z1, p0, z4), ++ z0 = svcvt_f16_m (z1, p0, z4)) ++ ++/* ++** cvt_f16_u16_m_tied1: ++** ucvtf z0\.h, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cvt_f16_u16_m_tied1, svfloat16_t, svuint16_t, ++ z0 = svcvt_f16_u16_m (z0, p0, z4), ++ z0 = svcvt_f16_m (z0, p0, z4)) ++ ++/* ++** cvt_f16_u16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** ucvtf z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f16_u16_m_tied2, svfloat16_t, svuint16_t, ++ z0_res = svcvt_f16_u16_m (z4, p0, z0), ++ z0_res = svcvt_f16_m (z4, p0, z0)) ++ ++/* ++** cvt_f16_u16_m_untied: ++** movprfx z0, z1 ++** ucvtf z0\.h, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cvt_f16_u16_m_untied, svfloat16_t, svuint16_t, ++ z0 = svcvt_f16_u16_m (z1, p0, z4), ++ z0 = svcvt_f16_m (z1, p0, z4)) ++ ++/* ++** cvt_f16_u32_m_tied1: ++** ucvtf z0\.h, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_f16_u32_m_tied1, svfloat16_t, svuint32_t, ++ z0 = svcvt_f16_u32_m (z0, p0, z4), ++ z0 = svcvt_f16_m (z0, p0, z4)) ++ ++/* ++** cvt_f16_u32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** ucvtf z0\.h, p0/m, \1\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f16_u32_m_tied2, svfloat16_t, svuint32_t, ++ z0_res = svcvt_f16_u32_m (z4, p0, z0), ++ z0_res = svcvt_f16_m (z4, p0, z0)) ++ ++/* ++** cvt_f16_u32_m_untied: ++** movprfx z0, z1 ++** ucvtf z0\.h, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_f16_u32_m_untied, svfloat16_t, svuint32_t, ++ z0 = svcvt_f16_u32_m (z1, p0, z4), ++ z0 = svcvt_f16_m (z1, p0, z4)) ++ ++/* ++** cvt_f16_u64_m_tied1: ++** ucvtf z0\.h, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cvt_f16_u64_m_tied1, svfloat16_t, svuint64_t, ++ z0 = svcvt_f16_u64_m (z0, p0, z4), ++ z0 = svcvt_f16_m (z0, p0, z4)) ++ ++/* ++** cvt_f16_u64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z4 ++** ucvtf z0\.h, p0/m, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f16_u64_m_tied2, svfloat16_t, svuint64_t, 
++ z0_res = svcvt_f16_u64_m (z4, p0, z0), ++ z0_res = svcvt_f16_m (z4, p0, z0)) ++ ++/* ++** cvt_f16_u64_m_untied: ++** movprfx z0, z1 ++** ucvtf z0\.h, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cvt_f16_u64_m_untied, svfloat16_t, svuint64_t, ++ z0 = svcvt_f16_u64_m (z1, p0, z4), ++ z0 = svcvt_f16_m (z1, p0, z4)) ++ ++/* ++** cvt_f16_f32_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, \1\.s ++** fcvt z0\.h, p0/m, \1\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f16_f32_z_tied1, svfloat16_t, svfloat32_t, ++ z0_res = svcvt_f16_f32_z (p0, z0), ++ z0_res = svcvt_f16_z (p0, z0)) ++ ++/* ++** cvt_f16_f32_z_untied: ++** movprfx z0\.s, p0/z, z4\.s ++** fcvt z0\.h, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_f16_f32_z_untied, svfloat16_t, svfloat32_t, ++ z0 = svcvt_f16_f32_z (p0, z4), ++ z0 = svcvt_f16_z (p0, z4)) ++ ++/* ++** cvt_f16_f64_z_tied1: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** fcvt z0\.h, p0/m, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f16_f64_z_tied1, svfloat16_t, svfloat64_t, ++ z0_res = svcvt_f16_f64_z (p0, z0), ++ z0_res = svcvt_f16_z (p0, z0)) ++ ++/* ++** cvt_f16_f64_z_untied: ++** movprfx z0\.d, p0/z, z4\.d ++** fcvt z0\.h, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cvt_f16_f64_z_untied, svfloat16_t, svfloat64_t, ++ z0 = svcvt_f16_f64_z (p0, z4), ++ z0 = svcvt_f16_z (p0, z4)) ++ ++/* ++** cvt_f16_s16_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.h, p0/z, \1\.h ++** scvtf z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f16_s16_z_tied1, svfloat16_t, svint16_t, ++ z0_res = svcvt_f16_s16_z (p0, z0), ++ z0_res = svcvt_f16_z (p0, z0)) ++ ++/* ++** cvt_f16_s16_z_untied: ++** movprfx z0\.h, p0/z, z4\.h ++** scvtf z0\.h, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cvt_f16_s16_z_untied, svfloat16_t, svint16_t, ++ z0 = svcvt_f16_s16_z (p0, z4), ++ z0 = svcvt_f16_z (p0, z4)) ++ ++/* ++** cvt_f16_s32_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, \1\.s ++** scvtf z0\.h, p0/m, \1\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f16_s32_z_tied1, svfloat16_t, svint32_t, ++ z0_res = svcvt_f16_s32_z (p0, z0), ++ z0_res = svcvt_f16_z (p0, z0)) ++ ++/* ++** cvt_f16_s32_z_untied: ++** movprfx z0\.s, p0/z, z4\.s ++** scvtf z0\.h, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_f16_s32_z_untied, svfloat16_t, svint32_t, ++ z0 = svcvt_f16_s32_z (p0, z4), ++ z0 = svcvt_f16_z (p0, z4)) ++ ++/* ++** cvt_f16_s64_z_tied1: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** scvtf z0\.h, p0/m, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f16_s64_z_tied1, svfloat16_t, svint64_t, ++ z0_res = svcvt_f16_s64_z (p0, z0), ++ z0_res = svcvt_f16_z (p0, z0)) ++ ++/* ++** cvt_f16_s64_z_untied: ++** movprfx z0\.d, p0/z, z4\.d ++** scvtf z0\.h, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cvt_f16_s64_z_untied, svfloat16_t, svint64_t, ++ z0 = svcvt_f16_s64_z (p0, z4), ++ z0 = svcvt_f16_z (p0, z4)) ++ ++/* ++** cvt_f16_u16_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.h, p0/z, \1\.h ++** ucvtf z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f16_u16_z_tied1, svfloat16_t, svuint16_t, ++ z0_res = svcvt_f16_u16_z (p0, z0), ++ z0_res = svcvt_f16_z (p0, z0)) ++ ++/* ++** cvt_f16_u16_z_untied: ++** movprfx z0\.h, p0/z, z4\.h ++** ucvtf z0\.h, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cvt_f16_u16_z_untied, svfloat16_t, svuint16_t, ++ z0 = svcvt_f16_u16_z (p0, z4), ++ z0 = svcvt_f16_z (p0, z4)) ++ ++/* ++** cvt_f16_u32_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, \1\.s ++** ucvtf z0\.h, p0/m, \1\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f16_u32_z_tied1, 
svfloat16_t, svuint32_t, ++ z0_res = svcvt_f16_u32_z (p0, z0), ++ z0_res = svcvt_f16_z (p0, z0)) ++ ++/* ++** cvt_f16_u32_z_untied: ++** movprfx z0\.s, p0/z, z4\.s ++** ucvtf z0\.h, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_f16_u32_z_untied, svfloat16_t, svuint32_t, ++ z0 = svcvt_f16_u32_z (p0, z4), ++ z0 = svcvt_f16_z (p0, z4)) ++ ++/* ++** cvt_f16_u64_z_tied1: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** ucvtf z0\.h, p0/m, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f16_u64_z_tied1, svfloat16_t, svuint64_t, ++ z0_res = svcvt_f16_u64_z (p0, z0), ++ z0_res = svcvt_f16_z (p0, z0)) ++ ++/* ++** cvt_f16_u64_z_untied: ++** movprfx z0\.d, p0/z, z4\.d ++** ucvtf z0\.h, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cvt_f16_u64_z_untied, svfloat16_t, svuint64_t, ++ z0 = svcvt_f16_u64_z (p0, z4), ++ z0 = svcvt_f16_z (p0, z4)) ++ ++/* ++** cvt_f16_f32_x_tied1: ++** fcvt z0\.h, p0/m, z0\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f16_f32_x_tied1, svfloat16_t, svfloat32_t, ++ z0_res = svcvt_f16_f32_x (p0, z0), ++ z0_res = svcvt_f16_x (p0, z0)) ++ ++/* ++** cvt_f16_f32_x_untied: ++** fcvt z0\.h, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_f16_f32_x_untied, svfloat16_t, svfloat32_t, ++ z0 = svcvt_f16_f32_x (p0, z4), ++ z0 = svcvt_f16_x (p0, z4)) ++ ++/* ++** cvt_f16_f64_x_tied1: ++** fcvt z0\.h, p0/m, z0\.d ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f16_f64_x_tied1, svfloat16_t, svfloat64_t, ++ z0_res = svcvt_f16_f64_x (p0, z0), ++ z0_res = svcvt_f16_x (p0, z0)) ++ ++/* ++** cvt_f16_f64_x_untied: ++** fcvt z0\.h, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cvt_f16_f64_x_untied, svfloat16_t, svfloat64_t, ++ z0 = svcvt_f16_f64_x (p0, z4), ++ z0 = svcvt_f16_x (p0, z4)) ++ ++/* ++** cvt_f16_s16_x_tied1: ++** scvtf z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f16_s16_x_tied1, svfloat16_t, svint16_t, ++ z0_res = svcvt_f16_s16_x (p0, z0), ++ z0_res = svcvt_f16_x (p0, z0)) ++ ++/* ++** cvt_f16_s16_x_untied: ++** scvtf z0\.h, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cvt_f16_s16_x_untied, svfloat16_t, svint16_t, ++ z0 = svcvt_f16_s16_x (p0, z4), ++ z0 = svcvt_f16_x (p0, z4)) ++ ++/* ++** cvt_f16_s32_x_tied1: ++** scvtf z0\.h, p0/m, z0\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f16_s32_x_tied1, svfloat16_t, svint32_t, ++ z0_res = svcvt_f16_s32_x (p0, z0), ++ z0_res = svcvt_f16_x (p0, z0)) ++ ++/* ++** cvt_f16_s32_x_untied: ++** scvtf z0\.h, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_f16_s32_x_untied, svfloat16_t, svint32_t, ++ z0 = svcvt_f16_s32_x (p0, z4), ++ z0 = svcvt_f16_x (p0, z4)) ++ ++/* ++** cvt_f16_s64_x_tied1: ++** scvtf z0\.h, p0/m, z0\.d ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f16_s64_x_tied1, svfloat16_t, svint64_t, ++ z0_res = svcvt_f16_s64_x (p0, z0), ++ z0_res = svcvt_f16_x (p0, z0)) ++ ++/* ++** cvt_f16_s64_x_untied: ++** scvtf z0\.h, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cvt_f16_s64_x_untied, svfloat16_t, svint64_t, ++ z0 = svcvt_f16_s64_x (p0, z4), ++ z0 = svcvt_f16_x (p0, z4)) ++ ++/* ++** cvt_f16_u16_x_tied1: ++** ucvtf z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f16_u16_x_tied1, svfloat16_t, svuint16_t, ++ z0_res = svcvt_f16_u16_x (p0, z0), ++ z0_res = svcvt_f16_x (p0, z0)) ++ ++/* ++** cvt_f16_u16_x_untied: ++** ucvtf z0\.h, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cvt_f16_u16_x_untied, svfloat16_t, svuint16_t, ++ z0 = svcvt_f16_u16_x (p0, z4), ++ z0 = svcvt_f16_x (p0, z4)) ++ ++/* ++** cvt_f16_u32_x_tied1: ++** ucvtf z0\.h, p0/m, z0\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f16_u32_x_tied1, svfloat16_t, svuint32_t, ++ z0_res = svcvt_f16_u32_x (p0, z0), ++ z0_res = 
svcvt_f16_x (p0, z0)) ++ ++/* ++** cvt_f16_u32_x_untied: ++** ucvtf z0\.h, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_f16_u32_x_untied, svfloat16_t, svuint32_t, ++ z0 = svcvt_f16_u32_x (p0, z4), ++ z0 = svcvt_f16_x (p0, z4)) ++ ++/* ++** cvt_f16_u64_x_tied1: ++** ucvtf z0\.h, p0/m, z0\.d ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f16_u64_x_tied1, svfloat16_t, svuint64_t, ++ z0_res = svcvt_f16_u64_x (p0, z0), ++ z0_res = svcvt_f16_x (p0, z0)) ++ ++/* ++** cvt_f16_u64_x_untied: ++** ucvtf z0\.h, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cvt_f16_u64_x_untied, svfloat16_t, svuint64_t, ++ z0 = svcvt_f16_u64_x (p0, z4), ++ z0 = svcvt_f16_x (p0, z4)) ++ ++/* ++** ptrue_cvt_f16_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z_REV (ptrue_cvt_f16_f32_x_tied1, svfloat16_t, svfloat32_t, ++ z0_res = svcvt_f16_f32_x (svptrue_b32 (), z0), ++ z0_res = svcvt_f16_x (svptrue_b32 (), z0)) ++ ++/* ++** ptrue_cvt_f16_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z (ptrue_cvt_f16_f32_x_untied, svfloat16_t, svfloat32_t, ++ z0 = svcvt_f16_f32_x (svptrue_b32 (), z4), ++ z0 = svcvt_f16_x (svptrue_b32 (), z4)) ++ ++/* ++** ptrue_cvt_f16_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z_REV (ptrue_cvt_f16_f64_x_tied1, svfloat16_t, svfloat64_t, ++ z0_res = svcvt_f16_f64_x (svptrue_b64 (), z0), ++ z0_res = svcvt_f16_x (svptrue_b64 (), z0)) ++ ++/* ++** ptrue_cvt_f16_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z (ptrue_cvt_f16_f64_x_untied, svfloat16_t, svfloat64_t, ++ z0 = svcvt_f16_f64_x (svptrue_b64 (), z4), ++ z0 = svcvt_f16_x (svptrue_b64 (), z4)) ++ ++/* ++** ptrue_cvt_f16_s16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z_REV (ptrue_cvt_f16_s16_x_tied1, svfloat16_t, svint16_t, ++ z0_res = svcvt_f16_s16_x (svptrue_b16 (), z0), ++ z0_res = svcvt_f16_x (svptrue_b16 (), z0)) ++ ++/* ++** ptrue_cvt_f16_s16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z (ptrue_cvt_f16_s16_x_untied, svfloat16_t, svint16_t, ++ z0 = svcvt_f16_s16_x (svptrue_b16 (), z4), ++ z0 = svcvt_f16_x (svptrue_b16 (), z4)) ++ ++/* ++** ptrue_cvt_f16_s32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z_REV (ptrue_cvt_f16_s32_x_tied1, svfloat16_t, svint32_t, ++ z0_res = svcvt_f16_s32_x (svptrue_b32 (), z0), ++ z0_res = svcvt_f16_x (svptrue_b32 (), z0)) ++ ++/* ++** ptrue_cvt_f16_s32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z (ptrue_cvt_f16_s32_x_untied, svfloat16_t, svint32_t, ++ z0 = svcvt_f16_s32_x (svptrue_b32 (), z4), ++ z0 = svcvt_f16_x (svptrue_b32 (), z4)) ++ ++/* ++** ptrue_cvt_f16_s64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z_REV (ptrue_cvt_f16_s64_x_tied1, svfloat16_t, svint64_t, ++ z0_res = svcvt_f16_s64_x (svptrue_b64 (), z0), ++ z0_res = svcvt_f16_x (svptrue_b64 (), z0)) ++ ++/* ++** ptrue_cvt_f16_s64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z (ptrue_cvt_f16_s64_x_untied, svfloat16_t, svint64_t, ++ z0 = svcvt_f16_s64_x (svptrue_b64 (), z4), ++ z0 = svcvt_f16_x (svptrue_b64 (), z4)) ++ ++/* ++** ptrue_cvt_f16_u16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_DUAL_Z_REV (ptrue_cvt_f16_u16_x_tied1, svfloat16_t, svuint16_t, ++ z0_res = svcvt_f16_u16_x (svptrue_b16 (), z0), ++ z0_res = svcvt_f16_x (svptrue_b16 (), z0)) ++ ++/* ++** ptrue_cvt_f16_u16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z (ptrue_cvt_f16_u16_x_untied, svfloat16_t, svuint16_t, ++ z0 = svcvt_f16_u16_x (svptrue_b16 (), z4), ++ z0 = svcvt_f16_x (svptrue_b16 (), z4)) ++ ++/* ++** ptrue_cvt_f16_u32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z_REV (ptrue_cvt_f16_u32_x_tied1, svfloat16_t, svuint32_t, ++ z0_res = svcvt_f16_u32_x (svptrue_b32 (), z0), ++ z0_res = svcvt_f16_x (svptrue_b32 (), z0)) ++ ++/* ++** ptrue_cvt_f16_u32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z (ptrue_cvt_f16_u32_x_untied, svfloat16_t, svuint32_t, ++ z0 = svcvt_f16_u32_x (svptrue_b32 (), z4), ++ z0 = svcvt_f16_x (svptrue_b32 (), z4)) ++ ++/* ++** ptrue_cvt_f16_u64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z_REV (ptrue_cvt_f16_u64_x_tied1, svfloat16_t, svuint64_t, ++ z0_res = svcvt_f16_u64_x (svptrue_b64 (), z0), ++ z0_res = svcvt_f16_x (svptrue_b64 (), z0)) ++ ++/* ++** ptrue_cvt_f16_u64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z (ptrue_cvt_f16_u64_x_untied, svfloat16_t, svuint64_t, ++ z0 = svcvt_f16_u64_x (svptrue_b64 (), z4), ++ z0 = svcvt_f16_x (svptrue_b64 (), z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_f32.c +new file mode 100644 +index 000000000..c16469939 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_f32.c +@@ -0,0 +1,549 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cvt_f32_f16_m_tied1: ++** fcvt z0\.s, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cvt_f32_f16_m_tied1, svfloat32_t, svfloat16_t, ++ z0 = svcvt_f32_f16_m (z0, p0, z4), ++ z0 = svcvt_f32_m (z0, p0, z4)) ++ ++/* ++** cvt_f32_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** fcvt z0\.s, p0/m, \1\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f32_f16_m_tied2, svfloat32_t, svfloat16_t, ++ z0_res = svcvt_f32_f16_m (z4, p0, z0), ++ z0_res = svcvt_f32_m (z4, p0, z0)) ++ ++/* ++** cvt_f32_f16_m_untied: ++** movprfx z0, z1 ++** fcvt z0\.s, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cvt_f32_f16_m_untied, svfloat32_t, svfloat16_t, ++ z0 = svcvt_f32_f16_m (z1, p0, z4), ++ z0 = svcvt_f32_m (z1, p0, z4)) ++ ++/* ++** cvt_f32_f64_m_tied1: ++** fcvt z0\.s, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cvt_f32_f64_m_tied1, svfloat32_t, svfloat64_t, ++ z0 = svcvt_f32_f64_m (z0, p0, z4), ++ z0 = svcvt_f32_m (z0, p0, z4)) ++ ++/* ++** cvt_f32_f64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z4 ++** fcvt z0\.s, p0/m, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f32_f64_m_tied2, svfloat32_t, svfloat64_t, ++ z0_res = svcvt_f32_f64_m (z4, p0, z0), ++ z0_res = svcvt_f32_m (z4, p0, z0)) ++ ++/* ++** cvt_f32_f64_m_untied: ++** movprfx z0, z1 ++** fcvt z0\.s, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cvt_f32_f64_m_untied, svfloat32_t, svfloat64_t, ++ z0 = svcvt_f32_f64_m (z1, p0, z4), ++ z0 = svcvt_f32_m (z1, p0, z4)) ++ ++/* ++** cvt_f32_s32_m_tied1: ++** scvtf z0\.s, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_f32_s32_m_tied1, svfloat32_t, svint32_t, ++ z0 = svcvt_f32_s32_m (z0, p0, z4), ++ z0 = svcvt_f32_m (z0, p0, z4)) ++ ++/* ++** cvt_f32_s32_m_tied2: ++** mov 
(z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** scvtf z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f32_s32_m_tied2, svfloat32_t, svint32_t, ++ z0_res = svcvt_f32_s32_m (z4, p0, z0), ++ z0_res = svcvt_f32_m (z4, p0, z0)) ++ ++/* ++** cvt_f32_s32_m_untied: ++** movprfx z0, z1 ++** scvtf z0\.s, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_f32_s32_m_untied, svfloat32_t, svint32_t, ++ z0 = svcvt_f32_s32_m (z1, p0, z4), ++ z0 = svcvt_f32_m (z1, p0, z4)) ++ ++/* ++** cvt_f32_s64_m_tied1: ++** scvtf z0\.s, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cvt_f32_s64_m_tied1, svfloat32_t, svint64_t, ++ z0 = svcvt_f32_s64_m (z0, p0, z4), ++ z0 = svcvt_f32_m (z0, p0, z4)) ++ ++/* ++** cvt_f32_s64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z4 ++** scvtf z0\.s, p0/m, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f32_s64_m_tied2, svfloat32_t, svint64_t, ++ z0_res = svcvt_f32_s64_m (z4, p0, z0), ++ z0_res = svcvt_f32_m (z4, p0, z0)) ++ ++/* ++** cvt_f32_s64_m_untied: ++** movprfx z0, z1 ++** scvtf z0\.s, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cvt_f32_s64_m_untied, svfloat32_t, svint64_t, ++ z0 = svcvt_f32_s64_m (z1, p0, z4), ++ z0 = svcvt_f32_m (z1, p0, z4)) ++ ++/* ++** cvt_f32_u32_m_tied1: ++** ucvtf z0\.s, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_f32_u32_m_tied1, svfloat32_t, svuint32_t, ++ z0 = svcvt_f32_u32_m (z0, p0, z4), ++ z0 = svcvt_f32_m (z0, p0, z4)) ++ ++/* ++** cvt_f32_u32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** ucvtf z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f32_u32_m_tied2, svfloat32_t, svuint32_t, ++ z0_res = svcvt_f32_u32_m (z4, p0, z0), ++ z0_res = svcvt_f32_m (z4, p0, z0)) ++ ++/* ++** cvt_f32_u32_m_untied: ++** movprfx z0, z1 ++** ucvtf z0\.s, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_f32_u32_m_untied, svfloat32_t, svuint32_t, ++ z0 = svcvt_f32_u32_m (z1, p0, z4), ++ z0 = svcvt_f32_m (z1, p0, z4)) ++ ++/* ++** cvt_f32_u64_m_tied1: ++** ucvtf z0\.s, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cvt_f32_u64_m_tied1, svfloat32_t, svuint64_t, ++ z0 = svcvt_f32_u64_m (z0, p0, z4), ++ z0 = svcvt_f32_m (z0, p0, z4)) ++ ++/* ++** cvt_f32_u64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z4 ++** ucvtf z0\.s, p0/m, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f32_u64_m_tied2, svfloat32_t, svuint64_t, ++ z0_res = svcvt_f32_u64_m (z4, p0, z0), ++ z0_res = svcvt_f32_m (z4, p0, z0)) ++ ++/* ++** cvt_f32_u64_m_untied: ++** movprfx z0, z1 ++** ucvtf z0\.s, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cvt_f32_u64_m_untied, svfloat32_t, svuint64_t, ++ z0 = svcvt_f32_u64_m (z1, p0, z4), ++ z0 = svcvt_f32_m (z1, p0, z4)) ++ ++/* ++** cvt_f32_f16_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, \1\.s ++** fcvt z0\.s, p0/m, \1\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f32_f16_z_tied1, svfloat32_t, svfloat16_t, ++ z0_res = svcvt_f32_f16_z (p0, z0), ++ z0_res = svcvt_f32_z (p0, z0)) ++ ++/* ++** cvt_f32_f16_z_untied: ++** movprfx z0\.s, p0/z, z4\.s ++** fcvt z0\.s, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cvt_f32_f16_z_untied, svfloat32_t, svfloat16_t, ++ z0 = svcvt_f32_f16_z (p0, z4), ++ z0 = svcvt_f32_z (p0, z4)) ++ ++/* ++** cvt_f32_f64_z_tied1: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** fcvt z0\.s, p0/m, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f32_f64_z_tied1, svfloat32_t, svfloat64_t, ++ z0_res = svcvt_f32_f64_z (p0, z0), ++ z0_res = svcvt_f32_z (p0, z0)) ++ ++/* ++** cvt_f32_f64_z_untied: ++** movprfx z0\.d, p0/z, z4\.d ++** fcvt z0\.s, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cvt_f32_f64_z_untied, svfloat32_t, svfloat64_t, 
++ z0 = svcvt_f32_f64_z (p0, z4), ++ z0 = svcvt_f32_z (p0, z4)) ++ ++/* ++** cvt_f32_s32_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, \1\.s ++** scvtf z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f32_s32_z_tied1, svfloat32_t, svint32_t, ++ z0_res = svcvt_f32_s32_z (p0, z0), ++ z0_res = svcvt_f32_z (p0, z0)) ++ ++/* ++** cvt_f32_s32_z_untied: ++** movprfx z0\.s, p0/z, z4\.s ++** scvtf z0\.s, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_f32_s32_z_untied, svfloat32_t, svint32_t, ++ z0 = svcvt_f32_s32_z (p0, z4), ++ z0 = svcvt_f32_z (p0, z4)) ++ ++/* ++** cvt_f32_s64_z_tied1: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** scvtf z0\.s, p0/m, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f32_s64_z_tied1, svfloat32_t, svint64_t, ++ z0_res = svcvt_f32_s64_z (p0, z0), ++ z0_res = svcvt_f32_z (p0, z0)) ++ ++/* ++** cvt_f32_s64_z_untied: ++** movprfx z0\.d, p0/z, z4\.d ++** scvtf z0\.s, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cvt_f32_s64_z_untied, svfloat32_t, svint64_t, ++ z0 = svcvt_f32_s64_z (p0, z4), ++ z0 = svcvt_f32_z (p0, z4)) ++ ++/* ++** cvt_f32_u32_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, \1\.s ++** ucvtf z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f32_u32_z_tied1, svfloat32_t, svuint32_t, ++ z0_res = svcvt_f32_u32_z (p0, z0), ++ z0_res = svcvt_f32_z (p0, z0)) ++ ++/* ++** cvt_f32_u32_z_untied: ++** movprfx z0\.s, p0/z, z4\.s ++** ucvtf z0\.s, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_f32_u32_z_untied, svfloat32_t, svuint32_t, ++ z0 = svcvt_f32_u32_z (p0, z4), ++ z0 = svcvt_f32_z (p0, z4)) ++ ++/* ++** cvt_f32_u64_z_tied1: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** ucvtf z0\.s, p0/m, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f32_u64_z_tied1, svfloat32_t, svuint64_t, ++ z0_res = svcvt_f32_u64_z (p0, z0), ++ z0_res = svcvt_f32_z (p0, z0)) ++ ++/* ++** cvt_f32_u64_z_untied: ++** movprfx z0\.d, p0/z, z4\.d ++** ucvtf z0\.s, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cvt_f32_u64_z_untied, svfloat32_t, svuint64_t, ++ z0 = svcvt_f32_u64_z (p0, z4), ++ z0 = svcvt_f32_z (p0, z4)) ++ ++/* ++** cvt_f32_f16_x_tied1: ++** fcvt z0\.s, p0/m, z0\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f32_f16_x_tied1, svfloat32_t, svfloat16_t, ++ z0_res = svcvt_f32_f16_x (p0, z0), ++ z0_res = svcvt_f32_x (p0, z0)) ++ ++/* ++** cvt_f32_f16_x_untied: ++** fcvt z0\.s, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cvt_f32_f16_x_untied, svfloat32_t, svfloat16_t, ++ z0 = svcvt_f32_f16_x (p0, z4), ++ z0 = svcvt_f32_x (p0, z4)) ++ ++/* ++** cvt_f32_f64_x_tied1: ++** fcvt z0\.s, p0/m, z0\.d ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f32_f64_x_tied1, svfloat32_t, svfloat64_t, ++ z0_res = svcvt_f32_f64_x (p0, z0), ++ z0_res = svcvt_f32_x (p0, z0)) ++ ++/* ++** cvt_f32_f64_x_untied: ++** fcvt z0\.s, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cvt_f32_f64_x_untied, svfloat32_t, svfloat64_t, ++ z0 = svcvt_f32_f64_x (p0, z4), ++ z0 = svcvt_f32_x (p0, z4)) ++ ++/* ++** cvt_f32_s32_x_tied1: ++** scvtf z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f32_s32_x_tied1, svfloat32_t, svint32_t, ++ z0_res = svcvt_f32_s32_x (p0, z0), ++ z0_res = svcvt_f32_x (p0, z0)) ++ ++/* ++** cvt_f32_s32_x_untied: ++** scvtf z0\.s, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_f32_s32_x_untied, svfloat32_t, svint32_t, ++ z0 = svcvt_f32_s32_x (p0, z4), ++ z0 = svcvt_f32_x (p0, z4)) ++ ++/* ++** cvt_f32_s64_x_tied1: ++** scvtf z0\.s, p0/m, z0\.d ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f32_s64_x_tied1, svfloat32_t, svint64_t, ++ z0_res = svcvt_f32_s64_x (p0, z0), ++ z0_res = 
svcvt_f32_x (p0, z0)) ++ ++/* ++** cvt_f32_s64_x_untied: ++** scvtf z0\.s, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cvt_f32_s64_x_untied, svfloat32_t, svint64_t, ++ z0 = svcvt_f32_s64_x (p0, z4), ++ z0 = svcvt_f32_x (p0, z4)) ++ ++/* ++** cvt_f32_u32_x_tied1: ++** ucvtf z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f32_u32_x_tied1, svfloat32_t, svuint32_t, ++ z0_res = svcvt_f32_u32_x (p0, z0), ++ z0_res = svcvt_f32_x (p0, z0)) ++ ++/* ++** cvt_f32_u32_x_untied: ++** ucvtf z0\.s, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_f32_u32_x_untied, svfloat32_t, svuint32_t, ++ z0 = svcvt_f32_u32_x (p0, z4), ++ z0 = svcvt_f32_x (p0, z4)) ++ ++/* ++** cvt_f32_u64_x_tied1: ++** ucvtf z0\.s, p0/m, z0\.d ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f32_u64_x_tied1, svfloat32_t, svuint64_t, ++ z0_res = svcvt_f32_u64_x (p0, z0), ++ z0_res = svcvt_f32_x (p0, z0)) ++ ++/* ++** cvt_f32_u64_x_untied: ++** ucvtf z0\.s, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cvt_f32_u64_x_untied, svfloat32_t, svuint64_t, ++ z0 = svcvt_f32_u64_x (p0, z4), ++ z0 = svcvt_f32_x (p0, z4)) ++ ++/* ++** ptrue_cvt_f32_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z_REV (ptrue_cvt_f32_f16_x_tied1, svfloat32_t, svfloat16_t, ++ z0_res = svcvt_f32_f16_x (svptrue_b32 (), z0), ++ z0_res = svcvt_f32_x (svptrue_b32 (), z0)) ++ ++/* ++** ptrue_cvt_f32_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z (ptrue_cvt_f32_f16_x_untied, svfloat32_t, svfloat16_t, ++ z0 = svcvt_f32_f16_x (svptrue_b32 (), z4), ++ z0 = svcvt_f32_x (svptrue_b32 (), z4)) ++ ++/* ++** ptrue_cvt_f32_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z_REV (ptrue_cvt_f32_f64_x_tied1, svfloat32_t, svfloat64_t, ++ z0_res = svcvt_f32_f64_x (svptrue_b64 (), z0), ++ z0_res = svcvt_f32_x (svptrue_b64 (), z0)) ++ ++/* ++** ptrue_cvt_f32_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z (ptrue_cvt_f32_f64_x_untied, svfloat32_t, svfloat64_t, ++ z0 = svcvt_f32_f64_x (svptrue_b64 (), z4), ++ z0 = svcvt_f32_x (svptrue_b64 (), z4)) ++ ++/* ++** ptrue_cvt_f32_s32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z_REV (ptrue_cvt_f32_s32_x_tied1, svfloat32_t, svint32_t, ++ z0_res = svcvt_f32_s32_x (svptrue_b32 (), z0), ++ z0_res = svcvt_f32_x (svptrue_b32 (), z0)) ++ ++/* ++** ptrue_cvt_f32_s32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z (ptrue_cvt_f32_s32_x_untied, svfloat32_t, svint32_t, ++ z0 = svcvt_f32_s32_x (svptrue_b32 (), z4), ++ z0 = svcvt_f32_x (svptrue_b32 (), z4)) ++ ++/* ++** ptrue_cvt_f32_s64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z_REV (ptrue_cvt_f32_s64_x_tied1, svfloat32_t, svint64_t, ++ z0_res = svcvt_f32_s64_x (svptrue_b64 (), z0), ++ z0_res = svcvt_f32_x (svptrue_b64 (), z0)) ++ ++/* ++** ptrue_cvt_f32_s64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z (ptrue_cvt_f32_s64_x_untied, svfloat32_t, svint64_t, ++ z0 = svcvt_f32_s64_x (svptrue_b64 (), z4), ++ z0 = svcvt_f32_x (svptrue_b64 (), z4)) ++ ++/* ++** ptrue_cvt_f32_u32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z_REV (ptrue_cvt_f32_u32_x_tied1, svfloat32_t, svuint32_t, ++ z0_res = svcvt_f32_u32_x (svptrue_b32 (), z0), ++ z0_res = svcvt_f32_x (svptrue_b32 (), z0)) ++ ++/* ++** ptrue_cvt_f32_u32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_DUAL_Z (ptrue_cvt_f32_u32_x_untied, svfloat32_t, svuint32_t, ++ z0 = svcvt_f32_u32_x (svptrue_b32 (), z4), ++ z0 = svcvt_f32_x (svptrue_b32 (), z4)) ++ ++/* ++** ptrue_cvt_f32_u64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z_REV (ptrue_cvt_f32_u64_x_tied1, svfloat32_t, svuint64_t, ++ z0_res = svcvt_f32_u64_x (svptrue_b64 (), z0), ++ z0_res = svcvt_f32_x (svptrue_b64 (), z0)) ++ ++/* ++** ptrue_cvt_f32_u64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z (ptrue_cvt_f32_u64_x_untied, svfloat32_t, svuint64_t, ++ z0 = svcvt_f32_u64_x (svptrue_b64 (), z4), ++ z0 = svcvt_f32_x (svptrue_b64 (), z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_f64.c +new file mode 100644 +index 000000000..1d08e6ec5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_f64.c +@@ -0,0 +1,549 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cvt_f64_f16_m_tied1: ++** fcvt z0\.d, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cvt_f64_f16_m_tied1, svfloat64_t, svfloat16_t, ++ z0 = svcvt_f64_f16_m (z0, p0, z4), ++ z0 = svcvt_f64_m (z0, p0, z4)) ++ ++/* ++** cvt_f64_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** fcvt z0\.d, p0/m, \1\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f64_f16_m_tied2, svfloat64_t, svfloat16_t, ++ z0_res = svcvt_f64_f16_m (z4, p0, z0), ++ z0_res = svcvt_f64_m (z4, p0, z0)) ++ ++/* ++** cvt_f64_f16_m_untied: ++** movprfx z0, z1 ++** fcvt z0\.d, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cvt_f64_f16_m_untied, svfloat64_t, svfloat16_t, ++ z0 = svcvt_f64_f16_m (z1, p0, z4), ++ z0 = svcvt_f64_m (z1, p0, z4)) ++ ++/* ++** cvt_f64_f32_m_tied1: ++** fcvt z0\.d, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_f64_f32_m_tied1, svfloat64_t, svfloat32_t, ++ z0 = svcvt_f64_f32_m (z0, p0, z4), ++ z0 = svcvt_f64_m (z0, p0, z4)) ++ ++/* ++** cvt_f64_f32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** fcvt z0\.d, p0/m, \1\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f64_f32_m_tied2, svfloat64_t, svfloat32_t, ++ z0_res = svcvt_f64_f32_m (z4, p0, z0), ++ z0_res = svcvt_f64_m (z4, p0, z0)) ++ ++/* ++** cvt_f64_f32_m_untied: ++** movprfx z0, z1 ++** fcvt z0\.d, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_f64_f32_m_untied, svfloat64_t, svfloat32_t, ++ z0 = svcvt_f64_f32_m (z1, p0, z4), ++ z0 = svcvt_f64_m (z1, p0, z4)) ++ ++/* ++** cvt_f64_s32_m_tied1: ++** scvtf z0\.d, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_f64_s32_m_tied1, svfloat64_t, svint32_t, ++ z0 = svcvt_f64_s32_m (z0, p0, z4), ++ z0 = svcvt_f64_m (z0, p0, z4)) ++ ++/* ++** cvt_f64_s32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** scvtf z0\.d, p0/m, \1\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f64_s32_m_tied2, svfloat64_t, svint32_t, ++ z0_res = svcvt_f64_s32_m (z4, p0, z0), ++ z0_res = svcvt_f64_m (z4, p0, z0)) ++ ++/* ++** cvt_f64_s32_m_untied: ++** movprfx z0, z1 ++** scvtf z0\.d, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_f64_s32_m_untied, svfloat64_t, svint32_t, ++ z0 = svcvt_f64_s32_m (z1, p0, z4), ++ z0 = svcvt_f64_m (z1, p0, z4)) ++ ++/* ++** cvt_f64_s64_m_tied1: ++** scvtf z0\.d, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cvt_f64_s64_m_tied1, svfloat64_t, svint64_t, ++ z0 = svcvt_f64_s64_m (z0, p0, z4), ++ z0 = svcvt_f64_m (z0, p0, z4)) ++ ++/* ++** cvt_f64_s64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z4 ++** scvtf z0\.d, p0/m, \1 ++** ret ++*/ 
++TEST_DUAL_Z_REV (cvt_f64_s64_m_tied2, svfloat64_t, svint64_t, ++ z0_res = svcvt_f64_s64_m (z4, p0, z0), ++ z0_res = svcvt_f64_m (z4, p0, z0)) ++ ++/* ++** cvt_f64_s64_m_untied: ++** movprfx z0, z1 ++** scvtf z0\.d, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cvt_f64_s64_m_untied, svfloat64_t, svint64_t, ++ z0 = svcvt_f64_s64_m (z1, p0, z4), ++ z0 = svcvt_f64_m (z1, p0, z4)) ++ ++/* ++** cvt_f64_u32_m_tied1: ++** ucvtf z0\.d, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_f64_u32_m_tied1, svfloat64_t, svuint32_t, ++ z0 = svcvt_f64_u32_m (z0, p0, z4), ++ z0 = svcvt_f64_m (z0, p0, z4)) ++ ++/* ++** cvt_f64_u32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** ucvtf z0\.d, p0/m, \1\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f64_u32_m_tied2, svfloat64_t, svuint32_t, ++ z0_res = svcvt_f64_u32_m (z4, p0, z0), ++ z0_res = svcvt_f64_m (z4, p0, z0)) ++ ++/* ++** cvt_f64_u32_m_untied: ++** movprfx z0, z1 ++** ucvtf z0\.d, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_f64_u32_m_untied, svfloat64_t, svuint32_t, ++ z0 = svcvt_f64_u32_m (z1, p0, z4), ++ z0 = svcvt_f64_m (z1, p0, z4)) ++ ++/* ++** cvt_f64_u64_m_tied1: ++** ucvtf z0\.d, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cvt_f64_u64_m_tied1, svfloat64_t, svuint64_t, ++ z0 = svcvt_f64_u64_m (z0, p0, z4), ++ z0 = svcvt_f64_m (z0, p0, z4)) ++ ++/* ++** cvt_f64_u64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z4 ++** ucvtf z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f64_u64_m_tied2, svfloat64_t, svuint64_t, ++ z0_res = svcvt_f64_u64_m (z4, p0, z0), ++ z0_res = svcvt_f64_m (z4, p0, z0)) ++ ++/* ++** cvt_f64_u64_m_untied: ++** movprfx z0, z1 ++** ucvtf z0\.d, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cvt_f64_u64_m_untied, svfloat64_t, svuint64_t, ++ z0 = svcvt_f64_u64_m (z1, p0, z4), ++ z0 = svcvt_f64_m (z1, p0, z4)) ++ ++/* ++** cvt_f64_f16_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.d, p0/z, \1\.d ++** fcvt z0\.d, p0/m, \1\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f64_f16_z_tied1, svfloat64_t, svfloat16_t, ++ z0_res = svcvt_f64_f16_z (p0, z0), ++ z0_res = svcvt_f64_z (p0, z0)) ++ ++/* ++** cvt_f64_f16_z_untied: ++** movprfx z0\.d, p0/z, z4\.d ++** fcvt z0\.d, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cvt_f64_f16_z_untied, svfloat64_t, svfloat16_t, ++ z0 = svcvt_f64_f16_z (p0, z4), ++ z0 = svcvt_f64_z (p0, z4)) ++ ++/* ++** cvt_f64_f32_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.d, p0/z, \1\.d ++** fcvt z0\.d, p0/m, \1\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f64_f32_z_tied1, svfloat64_t, svfloat32_t, ++ z0_res = svcvt_f64_f32_z (p0, z0), ++ z0_res = svcvt_f64_z (p0, z0)) ++ ++/* ++** cvt_f64_f32_z_untied: ++** movprfx z0\.d, p0/z, z4\.d ++** fcvt z0\.d, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_f64_f32_z_untied, svfloat64_t, svfloat32_t, ++ z0 = svcvt_f64_f32_z (p0, z4), ++ z0 = svcvt_f64_z (p0, z4)) ++ ++/* ++** cvt_f64_s32_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.d, p0/z, \1\.d ++** scvtf z0\.d, p0/m, \1\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f64_s32_z_tied1, svfloat64_t, svint32_t, ++ z0_res = svcvt_f64_s32_z (p0, z0), ++ z0_res = svcvt_f64_z (p0, z0)) ++ ++/* ++** cvt_f64_s32_z_untied: ++** movprfx z0\.d, p0/z, z4\.d ++** scvtf z0\.d, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_f64_s32_z_untied, svfloat64_t, svint32_t, ++ z0 = svcvt_f64_s32_z (p0, z4), ++ z0 = svcvt_f64_z (p0, z4)) ++ ++/* ++** cvt_f64_s64_z_tied1: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** scvtf z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f64_s64_z_tied1, svfloat64_t, svint64_t, ++ z0_res = 
svcvt_f64_s64_z (p0, z0), ++ z0_res = svcvt_f64_z (p0, z0)) ++ ++/* ++** cvt_f64_s64_z_untied: ++** movprfx z0\.d, p0/z, z4\.d ++** scvtf z0\.d, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cvt_f64_s64_z_untied, svfloat64_t, svint64_t, ++ z0 = svcvt_f64_s64_z (p0, z4), ++ z0 = svcvt_f64_z (p0, z4)) ++ ++/* ++** cvt_f64_u32_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.d, p0/z, \1\.d ++** ucvtf z0\.d, p0/m, \1\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f64_u32_z_tied1, svfloat64_t, svuint32_t, ++ z0_res = svcvt_f64_u32_z (p0, z0), ++ z0_res = svcvt_f64_z (p0, z0)) ++ ++/* ++** cvt_f64_u32_z_untied: ++** movprfx z0\.d, p0/z, z4\.d ++** ucvtf z0\.d, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_f64_u32_z_untied, svfloat64_t, svuint32_t, ++ z0 = svcvt_f64_u32_z (p0, z4), ++ z0 = svcvt_f64_z (p0, z4)) ++ ++/* ++** cvt_f64_u64_z_tied1: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** ucvtf z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f64_u64_z_tied1, svfloat64_t, svuint64_t, ++ z0_res = svcvt_f64_u64_z (p0, z0), ++ z0_res = svcvt_f64_z (p0, z0)) ++ ++/* ++** cvt_f64_u64_z_untied: ++** movprfx z0\.d, p0/z, z4\.d ++** ucvtf z0\.d, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cvt_f64_u64_z_untied, svfloat64_t, svuint64_t, ++ z0 = svcvt_f64_u64_z (p0, z4), ++ z0 = svcvt_f64_z (p0, z4)) ++ ++/* ++** cvt_f64_f16_x_tied1: ++** fcvt z0\.d, p0/m, z0\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f64_f16_x_tied1, svfloat64_t, svfloat16_t, ++ z0_res = svcvt_f64_f16_x (p0, z0), ++ z0_res = svcvt_f64_x (p0, z0)) ++ ++/* ++** cvt_f64_f16_x_untied: ++** fcvt z0\.d, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cvt_f64_f16_x_untied, svfloat64_t, svfloat16_t, ++ z0 = svcvt_f64_f16_x (p0, z4), ++ z0 = svcvt_f64_x (p0, z4)) ++ ++/* ++** cvt_f64_f32_x_tied1: ++** fcvt z0\.d, p0/m, z0\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f64_f32_x_tied1, svfloat64_t, svfloat32_t, ++ z0_res = svcvt_f64_f32_x (p0, z0), ++ z0_res = svcvt_f64_x (p0, z0)) ++ ++/* ++** cvt_f64_f32_x_untied: ++** fcvt z0\.d, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_f64_f32_x_untied, svfloat64_t, svfloat32_t, ++ z0 = svcvt_f64_f32_x (p0, z4), ++ z0 = svcvt_f64_x (p0, z4)) ++ ++/* ++** cvt_f64_s32_x_tied1: ++** scvtf z0\.d, p0/m, z0\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f64_s32_x_tied1, svfloat64_t, svint32_t, ++ z0_res = svcvt_f64_s32_x (p0, z0), ++ z0_res = svcvt_f64_x (p0, z0)) ++ ++/* ++** cvt_f64_s32_x_untied: ++** scvtf z0\.d, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_f64_s32_x_untied, svfloat64_t, svint32_t, ++ z0 = svcvt_f64_s32_x (p0, z4), ++ z0 = svcvt_f64_x (p0, z4)) ++ ++/* ++** cvt_f64_s64_x_tied1: ++** scvtf z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f64_s64_x_tied1, svfloat64_t, svint64_t, ++ z0_res = svcvt_f64_s64_x (p0, z0), ++ z0_res = svcvt_f64_x (p0, z0)) ++ ++/* ++** cvt_f64_s64_x_untied: ++** scvtf z0\.d, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cvt_f64_s64_x_untied, svfloat64_t, svint64_t, ++ z0 = svcvt_f64_s64_x (p0, z4), ++ z0 = svcvt_f64_x (p0, z4)) ++ ++/* ++** cvt_f64_u32_x_tied1: ++** ucvtf z0\.d, p0/m, z0\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f64_u32_x_tied1, svfloat64_t, svuint32_t, ++ z0_res = svcvt_f64_u32_x (p0, z0), ++ z0_res = svcvt_f64_x (p0, z0)) ++ ++/* ++** cvt_f64_u32_x_untied: ++** ucvtf z0\.d, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_f64_u32_x_untied, svfloat64_t, svuint32_t, ++ z0 = svcvt_f64_u32_x (p0, z4), ++ z0 = svcvt_f64_x (p0, z4)) ++ ++/* ++** cvt_f64_u64_x_tied1: ++** ucvtf z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_f64_u64_x_tied1, svfloat64_t, svuint64_t, 
++ z0_res = svcvt_f64_u64_x (p0, z0), ++ z0_res = svcvt_f64_x (p0, z0)) ++ ++/* ++** cvt_f64_u64_x_untied: ++** ucvtf z0\.d, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cvt_f64_u64_x_untied, svfloat64_t, svuint64_t, ++ z0 = svcvt_f64_u64_x (p0, z4), ++ z0 = svcvt_f64_x (p0, z4)) ++ ++/* ++** ptrue_cvt_f64_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z_REV (ptrue_cvt_f64_f16_x_tied1, svfloat64_t, svfloat16_t, ++ z0_res = svcvt_f64_f16_x (svptrue_b64 (), z0), ++ z0_res = svcvt_f64_x (svptrue_b64 (), z0)) ++ ++/* ++** ptrue_cvt_f64_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z (ptrue_cvt_f64_f16_x_untied, svfloat64_t, svfloat16_t, ++ z0 = svcvt_f64_f16_x (svptrue_b64 (), z4), ++ z0 = svcvt_f64_x (svptrue_b64 (), z4)) ++ ++/* ++** ptrue_cvt_f64_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z_REV (ptrue_cvt_f64_f32_x_tied1, svfloat64_t, svfloat32_t, ++ z0_res = svcvt_f64_f32_x (svptrue_b64 (), z0), ++ z0_res = svcvt_f64_x (svptrue_b64 (), z0)) ++ ++/* ++** ptrue_cvt_f64_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z (ptrue_cvt_f64_f32_x_untied, svfloat64_t, svfloat32_t, ++ z0 = svcvt_f64_f32_x (svptrue_b64 (), z4), ++ z0 = svcvt_f64_x (svptrue_b64 (), z4)) ++ ++/* ++** ptrue_cvt_f64_s32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z_REV (ptrue_cvt_f64_s32_x_tied1, svfloat64_t, svint32_t, ++ z0_res = svcvt_f64_s32_x (svptrue_b64 (), z0), ++ z0_res = svcvt_f64_x (svptrue_b64 (), z0)) ++ ++/* ++** ptrue_cvt_f64_s32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z (ptrue_cvt_f64_s32_x_untied, svfloat64_t, svint32_t, ++ z0 = svcvt_f64_s32_x (svptrue_b64 (), z4), ++ z0 = svcvt_f64_x (svptrue_b64 (), z4)) ++ ++/* ++** ptrue_cvt_f64_s64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z_REV (ptrue_cvt_f64_s64_x_tied1, svfloat64_t, svint64_t, ++ z0_res = svcvt_f64_s64_x (svptrue_b64 (), z0), ++ z0_res = svcvt_f64_x (svptrue_b64 (), z0)) ++ ++/* ++** ptrue_cvt_f64_s64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z (ptrue_cvt_f64_s64_x_untied, svfloat64_t, svint64_t, ++ z0 = svcvt_f64_s64_x (svptrue_b64 (), z4), ++ z0 = svcvt_f64_x (svptrue_b64 (), z4)) ++ ++/* ++** ptrue_cvt_f64_u32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z_REV (ptrue_cvt_f64_u32_x_tied1, svfloat64_t, svuint32_t, ++ z0_res = svcvt_f64_u32_x (svptrue_b64 (), z0), ++ z0_res = svcvt_f64_x (svptrue_b64 (), z0)) ++ ++/* ++** ptrue_cvt_f64_u32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z (ptrue_cvt_f64_u32_x_untied, svfloat64_t, svuint32_t, ++ z0 = svcvt_f64_u32_x (svptrue_b64 (), z4), ++ z0 = svcvt_f64_x (svptrue_b64 (), z4)) ++ ++/* ++** ptrue_cvt_f64_u64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z_REV (ptrue_cvt_f64_u64_x_tied1, svfloat64_t, svuint64_t, ++ z0_res = svcvt_f64_u64_x (svptrue_b64 (), z0), ++ z0_res = svcvt_f64_x (svptrue_b64 (), z0)) ++ ++/* ++** ptrue_cvt_f64_u64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_DUAL_Z (ptrue_cvt_f64_u64_x_untied, svfloat64_t, svuint64_t, ++ z0 = svcvt_f64_u64_x (svptrue_b64 (), z4), ++ z0 = svcvt_f64_x (svptrue_b64 (), z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_s16.c +new file mode 100644 +index 000000000..81761ab09 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_s16.c +@@ -0,0 +1,72 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cvt_s16_f16_m_tied1: ++** fcvtzs z0\.h, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cvt_s16_f16_m_tied1, svint16_t, svfloat16_t, ++ z0 = svcvt_s16_f16_m (z0, p0, z4), ++ z0 = svcvt_s16_m (z0, p0, z4)) ++ ++/* ++** cvt_s16_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** fcvtzs z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_s16_f16_m_tied2, svint16_t, svfloat16_t, ++ z0_res = svcvt_s16_f16_m (z4, p0, z0), ++ z0_res = svcvt_s16_m (z4, p0, z0)) ++ ++/* ++** cvt_s16_f16_m_untied: ++** movprfx z0, z1 ++** fcvtzs z0\.h, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cvt_s16_f16_m_untied, svint16_t, svfloat16_t, ++ z0 = svcvt_s16_f16_m (z1, p0, z4), ++ z0 = svcvt_s16_m (z1, p0, z4)) ++ ++/* ++** cvt_s16_f16_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.h, p0/z, \1\.h ++** fcvtzs z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_s16_f16_z_tied1, svint16_t, svfloat16_t, ++ z0_res = svcvt_s16_f16_z (p0, z0), ++ z0_res = svcvt_s16_z (p0, z0)) ++ ++/* ++** cvt_s16_f16_z_untied: ++** movprfx z0\.h, p0/z, z4\.h ++** fcvtzs z0\.h, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cvt_s16_f16_z_untied, svint16_t, svfloat16_t, ++ z0 = svcvt_s16_f16_z (p0, z4), ++ z0 = svcvt_s16_z (p0, z4)) ++ ++/* ++** cvt_s16_f16_x_tied1: ++** fcvtzs z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_s16_f16_x_tied1, svint16_t, svfloat16_t, ++ z0_res = svcvt_s16_f16_x (p0, z0), ++ z0_res = svcvt_s16_x (p0, z0)) ++ ++/* ++** cvt_s16_f16_x_untied: ++** fcvtzs z0\.h, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cvt_s16_f16_x_untied, svint16_t, svfloat16_t, ++ z0 = svcvt_s16_f16_x (p0, z4), ++ z0 = svcvt_s16_x (p0, z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_s32.c +new file mode 100644 +index 000000000..d30da5cc5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_s32.c +@@ -0,0 +1,210 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cvt_s32_f16_m_tied1: ++** fcvtzs z0\.s, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cvt_s32_f16_m_tied1, svint32_t, svfloat16_t, ++ z0 = svcvt_s32_f16_m (z0, p0, z4), ++ z0 = svcvt_s32_m (z0, p0, z4)) ++ ++/* ++** cvt_s32_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** fcvtzs z0\.s, p0/m, \1\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_s32_f16_m_tied2, svint32_t, svfloat16_t, ++ z0_res = svcvt_s32_f16_m (z4, p0, z0), ++ z0_res = svcvt_s32_m (z4, p0, z0)) ++ ++/* ++** cvt_s32_f16_m_untied: ++** movprfx z0, z1 ++** fcvtzs z0\.s, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cvt_s32_f16_m_untied, svint32_t, svfloat16_t, ++ z0 = svcvt_s32_f16_m (z1, p0, z4), ++ z0 = svcvt_s32_m (z1, p0, z4)) ++ ++/* ++** cvt_s32_f32_m_tied1: ++** fcvtzs z0\.s, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_s32_f32_m_tied1, svint32_t, svfloat32_t, ++ z0 = svcvt_s32_f32_m (z0, p0, z4), ++ z0 = svcvt_s32_m (z0, p0, z4)) ++ ++/* ++** cvt_s32_f32_m_tied2: ++** mov (z[0-9]+)\.d, 
z0\.d ++** movprfx z0, z4 ++** fcvtzs z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_s32_f32_m_tied2, svint32_t, svfloat32_t, ++ z0_res = svcvt_s32_f32_m (z4, p0, z0), ++ z0_res = svcvt_s32_m (z4, p0, z0)) ++ ++/* ++** cvt_s32_f32_m_untied: ++** movprfx z0, z1 ++** fcvtzs z0\.s, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_s32_f32_m_untied, svint32_t, svfloat32_t, ++ z0 = svcvt_s32_f32_m (z1, p0, z4), ++ z0 = svcvt_s32_m (z1, p0, z4)) ++ ++/* ++** cvt_s32_f64_m_tied1: ++** fcvtzs z0\.s, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cvt_s32_f64_m_tied1, svint32_t, svfloat64_t, ++ z0 = svcvt_s32_f64_m (z0, p0, z4), ++ z0 = svcvt_s32_m (z0, p0, z4)) ++ ++/* ++** cvt_s32_f64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z4 ++** fcvtzs z0\.s, p0/m, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_s32_f64_m_tied2, svint32_t, svfloat64_t, ++ z0_res = svcvt_s32_f64_m (z4, p0, z0), ++ z0_res = svcvt_s32_m (z4, p0, z0)) ++ ++/* ++** cvt_s32_f64_m_untied: ++** movprfx z0, z1 ++** fcvtzs z0\.s, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cvt_s32_f64_m_untied, svint32_t, svfloat64_t, ++ z0 = svcvt_s32_f64_m (z1, p0, z4), ++ z0 = svcvt_s32_m (z1, p0, z4)) ++ ++/* ++** cvt_s32_f16_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, \1\.s ++** fcvtzs z0\.s, p0/m, \1\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_s32_f16_z_tied1, svint32_t, svfloat16_t, ++ z0_res = svcvt_s32_f16_z (p0, z0), ++ z0_res = svcvt_s32_z (p0, z0)) ++ ++/* ++** cvt_s32_f16_z_untied: ++** movprfx z0\.s, p0/z, z4\.s ++** fcvtzs z0\.s, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cvt_s32_f16_z_untied, svint32_t, svfloat16_t, ++ z0 = svcvt_s32_f16_z (p0, z4), ++ z0 = svcvt_s32_z (p0, z4)) ++ ++/* ++** cvt_s32_f32_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, \1\.s ++** fcvtzs z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_s32_f32_z_tied1, svint32_t, svfloat32_t, ++ z0_res = svcvt_s32_f32_z (p0, z0), ++ z0_res = svcvt_s32_z (p0, z0)) ++ ++/* ++** cvt_s32_f32_z_untied: ++** movprfx z0\.s, p0/z, z4\.s ++** fcvtzs z0\.s, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_s32_f32_z_untied, svint32_t, svfloat32_t, ++ z0 = svcvt_s32_f32_z (p0, z4), ++ z0 = svcvt_s32_z (p0, z4)) ++ ++/* ++** cvt_s32_f64_z_tied1: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** fcvtzs z0\.s, p0/m, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_s32_f64_z_tied1, svint32_t, svfloat64_t, ++ z0_res = svcvt_s32_f64_z (p0, z0), ++ z0_res = svcvt_s32_z (p0, z0)) ++ ++/* ++** cvt_s32_f64_z_untied: ++** movprfx z0\.d, p0/z, z4\.d ++** fcvtzs z0\.s, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cvt_s32_f64_z_untied, svint32_t, svfloat64_t, ++ z0 = svcvt_s32_f64_z (p0, z4), ++ z0 = svcvt_s32_z (p0, z4)) ++ ++/* ++** cvt_s32_f16_x_tied1: ++** fcvtzs z0\.s, p0/m, z0\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_s32_f16_x_tied1, svint32_t, svfloat16_t, ++ z0_res = svcvt_s32_f16_x (p0, z0), ++ z0_res = svcvt_s32_x (p0, z0)) ++ ++/* ++** cvt_s32_f16_x_untied: ++** fcvtzs z0\.s, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cvt_s32_f16_x_untied, svint32_t, svfloat16_t, ++ z0 = svcvt_s32_f16_x (p0, z4), ++ z0 = svcvt_s32_x (p0, z4)) ++ ++/* ++** cvt_s32_f32_x_tied1: ++** fcvtzs z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_s32_f32_x_tied1, svint32_t, svfloat32_t, ++ z0_res = svcvt_s32_f32_x (p0, z0), ++ z0_res = svcvt_s32_x (p0, z0)) ++ ++/* ++** cvt_s32_f32_x_untied: ++** fcvtzs z0\.s, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_s32_f32_x_untied, svint32_t, svfloat32_t, ++ z0 = svcvt_s32_f32_x (p0, z4), ++ z0 = svcvt_s32_x (p0, z4)) ++ ++/* ++** 
cvt_s32_f64_x_tied1: ++** fcvtzs z0\.s, p0/m, z0\.d ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_s32_f64_x_tied1, svint32_t, svfloat64_t, ++ z0_res = svcvt_s32_f64_x (p0, z0), ++ z0_res = svcvt_s32_x (p0, z0)) ++ ++/* ++** cvt_s32_f64_x_untied: ++** fcvtzs z0\.s, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cvt_s32_f64_x_untied, svint32_t, svfloat64_t, ++ z0 = svcvt_s32_f64_x (p0, z4), ++ z0 = svcvt_s32_x (p0, z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_s64.c +new file mode 100644 +index 000000000..68cd80784 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_s64.c +@@ -0,0 +1,210 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cvt_s64_f16_m_tied1: ++** fcvtzs z0\.d, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cvt_s64_f16_m_tied1, svint64_t, svfloat16_t, ++ z0 = svcvt_s64_f16_m (z0, p0, z4), ++ z0 = svcvt_s64_m (z0, p0, z4)) ++ ++/* ++** cvt_s64_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** fcvtzs z0\.d, p0/m, \1\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_s64_f16_m_tied2, svint64_t, svfloat16_t, ++ z0_res = svcvt_s64_f16_m (z4, p0, z0), ++ z0_res = svcvt_s64_m (z4, p0, z0)) ++ ++/* ++** cvt_s64_f16_m_untied: ++** movprfx z0, z1 ++** fcvtzs z0\.d, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cvt_s64_f16_m_untied, svint64_t, svfloat16_t, ++ z0 = svcvt_s64_f16_m (z1, p0, z4), ++ z0 = svcvt_s64_m (z1, p0, z4)) ++ ++/* ++** cvt_s64_f32_m_tied1: ++** fcvtzs z0\.d, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_s64_f32_m_tied1, svint64_t, svfloat32_t, ++ z0 = svcvt_s64_f32_m (z0, p0, z4), ++ z0 = svcvt_s64_m (z0, p0, z4)) ++ ++/* ++** cvt_s64_f32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** fcvtzs z0\.d, p0/m, \1\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_s64_f32_m_tied2, svint64_t, svfloat32_t, ++ z0_res = svcvt_s64_f32_m (z4, p0, z0), ++ z0_res = svcvt_s64_m (z4, p0, z0)) ++ ++/* ++** cvt_s64_f32_m_untied: ++** movprfx z0, z1 ++** fcvtzs z0\.d, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_s64_f32_m_untied, svint64_t, svfloat32_t, ++ z0 = svcvt_s64_f32_m (z1, p0, z4), ++ z0 = svcvt_s64_m (z1, p0, z4)) ++ ++/* ++** cvt_s64_f64_m_tied1: ++** fcvtzs z0\.d, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cvt_s64_f64_m_tied1, svint64_t, svfloat64_t, ++ z0 = svcvt_s64_f64_m (z0, p0, z4), ++ z0 = svcvt_s64_m (z0, p0, z4)) ++ ++/* ++** cvt_s64_f64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z4 ++** fcvtzs z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_s64_f64_m_tied2, svint64_t, svfloat64_t, ++ z0_res = svcvt_s64_f64_m (z4, p0, z0), ++ z0_res = svcvt_s64_m (z4, p0, z0)) ++ ++/* ++** cvt_s64_f64_m_untied: ++** movprfx z0, z1 ++** fcvtzs z0\.d, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cvt_s64_f64_m_untied, svint64_t, svfloat64_t, ++ z0 = svcvt_s64_f64_m (z1, p0, z4), ++ z0 = svcvt_s64_m (z1, p0, z4)) ++ ++/* ++** cvt_s64_f16_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.d, p0/z, \1\.d ++** fcvtzs z0\.d, p0/m, \1\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_s64_f16_z_tied1, svint64_t, svfloat16_t, ++ z0_res = svcvt_s64_f16_z (p0, z0), ++ z0_res = svcvt_s64_z (p0, z0)) ++ ++/* ++** cvt_s64_f16_z_untied: ++** movprfx z0\.d, p0/z, z4\.d ++** fcvtzs z0\.d, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cvt_s64_f16_z_untied, svint64_t, svfloat16_t, ++ z0 = svcvt_s64_f16_z (p0, z4), ++ z0 = svcvt_s64_z (p0, z4)) ++ ++/* ++** cvt_s64_f32_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.d, p0/z, \1\.d ++** fcvtzs z0\.d, 
p0/m, \1\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_s64_f32_z_tied1, svint64_t, svfloat32_t, ++ z0_res = svcvt_s64_f32_z (p0, z0), ++ z0_res = svcvt_s64_z (p0, z0)) ++ ++/* ++** cvt_s64_f32_z_untied: ++** movprfx z0\.d, p0/z, z4\.d ++** fcvtzs z0\.d, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_s64_f32_z_untied, svint64_t, svfloat32_t, ++ z0 = svcvt_s64_f32_z (p0, z4), ++ z0 = svcvt_s64_z (p0, z4)) ++ ++/* ++** cvt_s64_f64_z_tied1: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** fcvtzs z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_s64_f64_z_tied1, svint64_t, svfloat64_t, ++ z0_res = svcvt_s64_f64_z (p0, z0), ++ z0_res = svcvt_s64_z (p0, z0)) ++ ++/* ++** cvt_s64_f64_z_untied: ++** movprfx z0\.d, p0/z, z4\.d ++** fcvtzs z0\.d, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cvt_s64_f64_z_untied, svint64_t, svfloat64_t, ++ z0 = svcvt_s64_f64_z (p0, z4), ++ z0 = svcvt_s64_z (p0, z4)) ++ ++/* ++** cvt_s64_f16_x_tied1: ++** fcvtzs z0\.d, p0/m, z0\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_s64_f16_x_tied1, svint64_t, svfloat16_t, ++ z0_res = svcvt_s64_f16_x (p0, z0), ++ z0_res = svcvt_s64_x (p0, z0)) ++ ++/* ++** cvt_s64_f16_x_untied: ++** fcvtzs z0\.d, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cvt_s64_f16_x_untied, svint64_t, svfloat16_t, ++ z0 = svcvt_s64_f16_x (p0, z4), ++ z0 = svcvt_s64_x (p0, z4)) ++ ++/* ++** cvt_s64_f32_x_tied1: ++** fcvtzs z0\.d, p0/m, z0\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_s64_f32_x_tied1, svint64_t, svfloat32_t, ++ z0_res = svcvt_s64_f32_x (p0, z0), ++ z0_res = svcvt_s64_x (p0, z0)) ++ ++/* ++** cvt_s64_f32_x_untied: ++** fcvtzs z0\.d, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_s64_f32_x_untied, svint64_t, svfloat32_t, ++ z0 = svcvt_s64_f32_x (p0, z4), ++ z0 = svcvt_s64_x (p0, z4)) ++ ++/* ++** cvt_s64_f64_x_tied1: ++** fcvtzs z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_s64_f64_x_tied1, svint64_t, svfloat64_t, ++ z0_res = svcvt_s64_f64_x (p0, z0), ++ z0_res = svcvt_s64_x (p0, z0)) ++ ++/* ++** cvt_s64_f64_x_untied: ++** fcvtzs z0\.d, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cvt_s64_f64_x_untied, svint64_t, svfloat64_t, ++ z0 = svcvt_s64_f64_x (p0, z4), ++ z0 = svcvt_s64_x (p0, z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_u16.c +new file mode 100644 +index 000000000..4db0dffdd +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_u16.c +@@ -0,0 +1,72 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cvt_u16_f16_m_tied1: ++** fcvtzu z0\.h, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cvt_u16_f16_m_tied1, svuint16_t, svfloat16_t, ++ z0 = svcvt_u16_f16_m (z0, p0, z4), ++ z0 = svcvt_u16_m (z0, p0, z4)) ++ ++/* ++** cvt_u16_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** fcvtzu z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_u16_f16_m_tied2, svuint16_t, svfloat16_t, ++ z0_res = svcvt_u16_f16_m (z4, p0, z0), ++ z0_res = svcvt_u16_m (z4, p0, z0)) ++ ++/* ++** cvt_u16_f16_m_untied: ++** movprfx z0, z1 ++** fcvtzu z0\.h, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cvt_u16_f16_m_untied, svuint16_t, svfloat16_t, ++ z0 = svcvt_u16_f16_m (z1, p0, z4), ++ z0 = svcvt_u16_m (z1, p0, z4)) ++ ++/* ++** cvt_u16_f16_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.h, p0/z, \1\.h ++** fcvtzu z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_u16_f16_z_tied1, svuint16_t, svfloat16_t, ++ z0_res = svcvt_u16_f16_z (p0, z0), ++ z0_res = svcvt_u16_z (p0, z0)) ++ ++/* ++** 
cvt_u16_f16_z_untied: ++** movprfx z0\.h, p0/z, z4\.h ++** fcvtzu z0\.h, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cvt_u16_f16_z_untied, svuint16_t, svfloat16_t, ++ z0 = svcvt_u16_f16_z (p0, z4), ++ z0 = svcvt_u16_z (p0, z4)) ++ ++/* ++** cvt_u16_f16_x_tied1: ++** fcvtzu z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_u16_f16_x_tied1, svuint16_t, svfloat16_t, ++ z0_res = svcvt_u16_f16_x (p0, z0), ++ z0_res = svcvt_u16_x (p0, z0)) ++ ++/* ++** cvt_u16_f16_x_untied: ++** fcvtzu z0\.h, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cvt_u16_f16_x_untied, svuint16_t, svfloat16_t, ++ z0 = svcvt_u16_f16_x (p0, z4), ++ z0 = svcvt_u16_x (p0, z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_u32.c +new file mode 100644 +index 000000000..52ef49fcf +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_u32.c +@@ -0,0 +1,210 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cvt_u32_f16_m_tied1: ++** fcvtzu z0\.s, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cvt_u32_f16_m_tied1, svuint32_t, svfloat16_t, ++ z0 = svcvt_u32_f16_m (z0, p0, z4), ++ z0 = svcvt_u32_m (z0, p0, z4)) ++ ++/* ++** cvt_u32_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** fcvtzu z0\.s, p0/m, \1\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_u32_f16_m_tied2, svuint32_t, svfloat16_t, ++ z0_res = svcvt_u32_f16_m (z4, p0, z0), ++ z0_res = svcvt_u32_m (z4, p0, z0)) ++ ++/* ++** cvt_u32_f16_m_untied: ++** movprfx z0, z1 ++** fcvtzu z0\.s, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cvt_u32_f16_m_untied, svuint32_t, svfloat16_t, ++ z0 = svcvt_u32_f16_m (z1, p0, z4), ++ z0 = svcvt_u32_m (z1, p0, z4)) ++ ++/* ++** cvt_u32_f32_m_tied1: ++** fcvtzu z0\.s, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_u32_f32_m_tied1, svuint32_t, svfloat32_t, ++ z0 = svcvt_u32_f32_m (z0, p0, z4), ++ z0 = svcvt_u32_m (z0, p0, z4)) ++ ++/* ++** cvt_u32_f32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** fcvtzu z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_u32_f32_m_tied2, svuint32_t, svfloat32_t, ++ z0_res = svcvt_u32_f32_m (z4, p0, z0), ++ z0_res = svcvt_u32_m (z4, p0, z0)) ++ ++/* ++** cvt_u32_f32_m_untied: ++** movprfx z0, z1 ++** fcvtzu z0\.s, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_u32_f32_m_untied, svuint32_t, svfloat32_t, ++ z0 = svcvt_u32_f32_m (z1, p0, z4), ++ z0 = svcvt_u32_m (z1, p0, z4)) ++ ++/* ++** cvt_u32_f64_m_tied1: ++** fcvtzu z0\.s, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cvt_u32_f64_m_tied1, svuint32_t, svfloat64_t, ++ z0 = svcvt_u32_f64_m (z0, p0, z4), ++ z0 = svcvt_u32_m (z0, p0, z4)) ++ ++/* ++** cvt_u32_f64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z4 ++** fcvtzu z0\.s, p0/m, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_u32_f64_m_tied2, svuint32_t, svfloat64_t, ++ z0_res = svcvt_u32_f64_m (z4, p0, z0), ++ z0_res = svcvt_u32_m (z4, p0, z0)) ++ ++/* ++** cvt_u32_f64_m_untied: ++** movprfx z0, z1 ++** fcvtzu z0\.s, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cvt_u32_f64_m_untied, svuint32_t, svfloat64_t, ++ z0 = svcvt_u32_f64_m (z1, p0, z4), ++ z0 = svcvt_u32_m (z1, p0, z4)) ++ ++/* ++** cvt_u32_f16_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, \1\.s ++** fcvtzu z0\.s, p0/m, \1\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_u32_f16_z_tied1, svuint32_t, svfloat16_t, ++ z0_res = svcvt_u32_f16_z (p0, z0), ++ z0_res = svcvt_u32_z (p0, z0)) ++ ++/* ++** cvt_u32_f16_z_untied: ++** movprfx z0\.s, p0/z, z4\.s ++** fcvtzu z0\.s, p0/m, z4\.h 
++** ret ++*/ ++TEST_DUAL_Z (cvt_u32_f16_z_untied, svuint32_t, svfloat16_t, ++ z0 = svcvt_u32_f16_z (p0, z4), ++ z0 = svcvt_u32_z (p0, z4)) ++ ++/* ++** cvt_u32_f32_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, \1\.s ++** fcvtzu z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_u32_f32_z_tied1, svuint32_t, svfloat32_t, ++ z0_res = svcvt_u32_f32_z (p0, z0), ++ z0_res = svcvt_u32_z (p0, z0)) ++ ++/* ++** cvt_u32_f32_z_untied: ++** movprfx z0\.s, p0/z, z4\.s ++** fcvtzu z0\.s, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_u32_f32_z_untied, svuint32_t, svfloat32_t, ++ z0 = svcvt_u32_f32_z (p0, z4), ++ z0 = svcvt_u32_z (p0, z4)) ++ ++/* ++** cvt_u32_f64_z_tied1: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** fcvtzu z0\.s, p0/m, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_u32_f64_z_tied1, svuint32_t, svfloat64_t, ++ z0_res = svcvt_u32_f64_z (p0, z0), ++ z0_res = svcvt_u32_z (p0, z0)) ++ ++/* ++** cvt_u32_f64_z_untied: ++** movprfx z0\.d, p0/z, z4\.d ++** fcvtzu z0\.s, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cvt_u32_f64_z_untied, svuint32_t, svfloat64_t, ++ z0 = svcvt_u32_f64_z (p0, z4), ++ z0 = svcvt_u32_z (p0, z4)) ++ ++/* ++** cvt_u32_f16_x_tied1: ++** fcvtzu z0\.s, p0/m, z0\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_u32_f16_x_tied1, svuint32_t, svfloat16_t, ++ z0_res = svcvt_u32_f16_x (p0, z0), ++ z0_res = svcvt_u32_x (p0, z0)) ++ ++/* ++** cvt_u32_f16_x_untied: ++** fcvtzu z0\.s, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cvt_u32_f16_x_untied, svuint32_t, svfloat16_t, ++ z0 = svcvt_u32_f16_x (p0, z4), ++ z0 = svcvt_u32_x (p0, z4)) ++ ++/* ++** cvt_u32_f32_x_tied1: ++** fcvtzu z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_u32_f32_x_tied1, svuint32_t, svfloat32_t, ++ z0_res = svcvt_u32_f32_x (p0, z0), ++ z0_res = svcvt_u32_x (p0, z0)) ++ ++/* ++** cvt_u32_f32_x_untied: ++** fcvtzu z0\.s, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_u32_f32_x_untied, svuint32_t, svfloat32_t, ++ z0 = svcvt_u32_f32_x (p0, z4), ++ z0 = svcvt_u32_x (p0, z4)) ++ ++/* ++** cvt_u32_f64_x_tied1: ++** fcvtzu z0\.s, p0/m, z0\.d ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_u32_f64_x_tied1, svuint32_t, svfloat64_t, ++ z0_res = svcvt_u32_f64_x (p0, z0), ++ z0_res = svcvt_u32_x (p0, z0)) ++ ++/* ++** cvt_u32_f64_x_untied: ++** fcvtzu z0\.s, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cvt_u32_f64_x_untied, svuint32_t, svfloat64_t, ++ z0 = svcvt_u32_f64_x (p0, z4), ++ z0 = svcvt_u32_x (p0, z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_u64.c +new file mode 100644 +index 000000000..0c43758ae +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_u64.c +@@ -0,0 +1,210 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cvt_u64_f16_m_tied1: ++** fcvtzu z0\.d, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cvt_u64_f16_m_tied1, svuint64_t, svfloat16_t, ++ z0 = svcvt_u64_f16_m (z0, p0, z4), ++ z0 = svcvt_u64_m (z0, p0, z4)) ++ ++/* ++** cvt_u64_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** fcvtzu z0\.d, p0/m, \1\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_u64_f16_m_tied2, svuint64_t, svfloat16_t, ++ z0_res = svcvt_u64_f16_m (z4, p0, z0), ++ z0_res = svcvt_u64_m (z4, p0, z0)) ++ ++/* ++** cvt_u64_f16_m_untied: ++** movprfx z0, z1 ++** fcvtzu z0\.d, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cvt_u64_f16_m_untied, svuint64_t, svfloat16_t, ++ z0 = svcvt_u64_f16_m (z1, p0, z4), ++ z0 = svcvt_u64_m (z1, p0, z4)) ++ ++/* ++** 
cvt_u64_f32_m_tied1: ++** fcvtzu z0\.d, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_u64_f32_m_tied1, svuint64_t, svfloat32_t, ++ z0 = svcvt_u64_f32_m (z0, p0, z4), ++ z0 = svcvt_u64_m (z0, p0, z4)) ++ ++/* ++** cvt_u64_f32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** fcvtzu z0\.d, p0/m, \1\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_u64_f32_m_tied2, svuint64_t, svfloat32_t, ++ z0_res = svcvt_u64_f32_m (z4, p0, z0), ++ z0_res = svcvt_u64_m (z4, p0, z0)) ++ ++/* ++** cvt_u64_f32_m_untied: ++** movprfx z0, z1 ++** fcvtzu z0\.d, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_u64_f32_m_untied, svuint64_t, svfloat32_t, ++ z0 = svcvt_u64_f32_m (z1, p0, z4), ++ z0 = svcvt_u64_m (z1, p0, z4)) ++ ++/* ++** cvt_u64_f64_m_tied1: ++** fcvtzu z0\.d, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cvt_u64_f64_m_tied1, svuint64_t, svfloat64_t, ++ z0 = svcvt_u64_f64_m (z0, p0, z4), ++ z0 = svcvt_u64_m (z0, p0, z4)) ++ ++/* ++** cvt_u64_f64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z4 ++** fcvtzu z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_u64_f64_m_tied2, svuint64_t, svfloat64_t, ++ z0_res = svcvt_u64_f64_m (z4, p0, z0), ++ z0_res = svcvt_u64_m (z4, p0, z0)) ++ ++/* ++** cvt_u64_f64_m_untied: ++** movprfx z0, z1 ++** fcvtzu z0\.d, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cvt_u64_f64_m_untied, svuint64_t, svfloat64_t, ++ z0 = svcvt_u64_f64_m (z1, p0, z4), ++ z0 = svcvt_u64_m (z1, p0, z4)) ++ ++/* ++** cvt_u64_f16_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.d, p0/z, \1\.d ++** fcvtzu z0\.d, p0/m, \1\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_u64_f16_z_tied1, svuint64_t, svfloat16_t, ++ z0_res = svcvt_u64_f16_z (p0, z0), ++ z0_res = svcvt_u64_z (p0, z0)) ++ ++/* ++** cvt_u64_f16_z_untied: ++** movprfx z0\.d, p0/z, z4\.d ++** fcvtzu z0\.d, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cvt_u64_f16_z_untied, svuint64_t, svfloat16_t, ++ z0 = svcvt_u64_f16_z (p0, z4), ++ z0 = svcvt_u64_z (p0, z4)) ++ ++/* ++** cvt_u64_f32_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.d, p0/z, \1\.d ++** fcvtzu z0\.d, p0/m, \1\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_u64_f32_z_tied1, svuint64_t, svfloat32_t, ++ z0_res = svcvt_u64_f32_z (p0, z0), ++ z0_res = svcvt_u64_z (p0, z0)) ++ ++/* ++** cvt_u64_f32_z_untied: ++** movprfx z0\.d, p0/z, z4\.d ++** fcvtzu z0\.d, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_u64_f32_z_untied, svuint64_t, svfloat32_t, ++ z0 = svcvt_u64_f32_z (p0, z4), ++ z0 = svcvt_u64_z (p0, z4)) ++ ++/* ++** cvt_u64_f64_z_tied1: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** fcvtzu z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_u64_f64_z_tied1, svuint64_t, svfloat64_t, ++ z0_res = svcvt_u64_f64_z (p0, z0), ++ z0_res = svcvt_u64_z (p0, z0)) ++ ++/* ++** cvt_u64_f64_z_untied: ++** movprfx z0\.d, p0/z, z4\.d ++** fcvtzu z0\.d, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cvt_u64_f64_z_untied, svuint64_t, svfloat64_t, ++ z0 = svcvt_u64_f64_z (p0, z4), ++ z0 = svcvt_u64_z (p0, z4)) ++ ++/* ++** cvt_u64_f16_x_tied1: ++** fcvtzu z0\.d, p0/m, z0\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_u64_f16_x_tied1, svuint64_t, svfloat16_t, ++ z0_res = svcvt_u64_f16_x (p0, z0), ++ z0_res = svcvt_u64_x (p0, z0)) ++ ++/* ++** cvt_u64_f16_x_untied: ++** fcvtzu z0\.d, p0/m, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (cvt_u64_f16_x_untied, svuint64_t, svfloat16_t, ++ z0 = svcvt_u64_f16_x (p0, z4), ++ z0 = svcvt_u64_x (p0, z4)) ++ ++/* ++** cvt_u64_f32_x_tied1: ++** fcvtzu z0\.d, p0/m, z0\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_u64_f32_x_tied1, svuint64_t, svfloat32_t, ++ z0_res = svcvt_u64_f32_x 
(p0, z0), ++ z0_res = svcvt_u64_x (p0, z0)) ++ ++/* ++** cvt_u64_f32_x_untied: ++** fcvtzu z0\.d, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvt_u64_f32_x_untied, svuint64_t, svfloat32_t, ++ z0 = svcvt_u64_f32_x (p0, z4), ++ z0 = svcvt_u64_x (p0, z4)) ++ ++/* ++** cvt_u64_f64_x_tied1: ++** fcvtzu z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_DUAL_Z_REV (cvt_u64_f64_x_tied1, svuint64_t, svfloat64_t, ++ z0_res = svcvt_u64_f64_x (p0, z0), ++ z0_res = svcvt_u64_x (p0, z0)) ++ ++/* ++** cvt_u64_f64_x_untied: ++** fcvtzu z0\.d, p0/m, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (cvt_u64_f64_x_untied, svuint64_t, svfloat64_t, ++ z0 = svcvt_u64_f64_x (p0, z4), ++ z0 = svcvt_u64_x (p0, z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvtnt_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvtnt_bf16.c +new file mode 100644 +index 000000000..54614c95d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvtnt_bf16.c +@@ -0,0 +1,90 @@ ++/* { dg-additional-options "-march=armv8.2-a+sve+bf16" } */ ++/* { dg-require-effective-target aarch64_asm_bf16_ok } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** cvtnt_bf16_f32_m_tied1: ++** bfcvtnt z0\.h, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvtnt_bf16_f32_m_tied1, svbfloat16_t, svfloat32_t, ++ z0 = svcvtnt_bf16_f32_m (z0, p0, z4), ++ z0 = svcvtnt_bf16_m (z0, p0, z4)) ++ ++/* Bad RA choice: no preferred output sequence. */ ++TEST_DUAL_Z_REV (cvtnt_bf16_f32_m_tied2, svbfloat16_t, svfloat32_t, ++ z0_res = svcvtnt_bf16_f32_m (z4, p0, z0), ++ z0_res = svcvtnt_bf16_m (z4, p0, z0)) ++ ++/* ++** cvtnt_bf16_f32_m_untied: ++** ( ++** mov z0\.d, z1\.d ++** bfcvtnt z0\.h, p0/m, z4\.s ++** | ++** bfcvtnt z1\.h, p0/m, z4\.s ++** mov z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_DUAL_Z (cvtnt_bf16_f32_m_untied, svbfloat16_t, svfloat32_t, ++ z0 = svcvtnt_bf16_f32_m (z1, p0, z4), ++ z0 = svcvtnt_bf16_m (z1, p0, z4)) ++ ++/* ++** cvtnt_bf16_f32_x_tied1: ++** bfcvtnt z0\.h, p0/m, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (cvtnt_bf16_f32_x_tied1, svbfloat16_t, svfloat32_t, ++ z0 = svcvtnt_bf16_f32_x (z0, p0, z4), ++ z0 = svcvtnt_bf16_x (z0, p0, z4)) ++ ++/* Bad RA choice: no preferred output sequence. */ ++TEST_DUAL_Z_REV (cvtnt_bf16_f32_x_tied2, svbfloat16_t, svfloat32_t, ++ z0_res = svcvtnt_bf16_f32_x (z4, p0, z0), ++ z0_res = svcvtnt_bf16_x (z4, p0, z0)) ++ ++/* ++** cvtnt_bf16_f32_x_untied: ++** ( ++** mov z0\.d, z1\.d ++** bfcvtnt z0\.h, p0/m, z4\.s ++** | ++** bfcvtnt z1\.h, p0/m, z4\.s ++** mov z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_DUAL_Z (cvtnt_bf16_f32_x_untied, svbfloat16_t, svfloat32_t, ++ z0 = svcvtnt_bf16_f32_x (z1, p0, z4), ++ z0 = svcvtnt_bf16_x (z1, p0, z4)) ++ ++/* ++** ptrue_cvtnt_bf16_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z (ptrue_cvtnt_bf16_f32_x_tied1, svbfloat16_t, svfloat32_t, ++ z0 = svcvtnt_bf16_f32_x (z0, svptrue_b32 (), z4), ++ z0 = svcvtnt_bf16_x (z0, svptrue_b32 (), z4)) ++ ++/* Bad RA choice: no preferred output sequence. */ ++TEST_DUAL_Z_REV (ptrue_cvtnt_bf16_f32_x_tied2, svbfloat16_t, svfloat32_t, ++ z0_res = svcvtnt_bf16_f32_x (z4, svptrue_b32 (), z0), ++ z0_res = svcvtnt_bf16_x (z4, svptrue_b32 (), z0)) ++ ++/* ++** ptrue_cvtnt_bf16_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_DUAL_Z (ptrue_cvtnt_bf16_f32_x_untied, svbfloat16_t, svfloat32_t, ++ z0 = svcvtnt_bf16_f32_x (z1, svptrue_b32 (), z4), ++ z0 = svcvtnt_bf16_x (z1, svptrue_b32 (), z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_f16.c +new file mode 100644 +index 000000000..35f5c1589 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_f16.c +@@ -0,0 +1,303 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** div_f16_m_tied1: ++** fdiv z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (div_f16_m_tied1, svfloat16_t, ++ z0 = svdiv_f16_m (p0, z0, z1), ++ z0 = svdiv_m (p0, z0, z1)) ++ ++/* ++** div_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fdiv z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (div_f16_m_tied2, svfloat16_t, ++ z0 = svdiv_f16_m (p0, z1, z0), ++ z0 = svdiv_m (p0, z1, z0)) ++ ++/* ++** div_f16_m_untied: ++** movprfx z0, z1 ++** fdiv z0\.h, p0/m, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (div_f16_m_untied, svfloat16_t, ++ z0 = svdiv_f16_m (p0, z1, z2), ++ z0 = svdiv_m (p0, z1, z2)) ++ ++/* ++** div_h4_f16_m_tied1: ++** mov (z[0-9]+\.h), h4 ++** fdiv z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (div_h4_f16_m_tied1, svfloat16_t, __fp16, ++ z0 = svdiv_n_f16_m (p0, z0, d4), ++ z0 = svdiv_m (p0, z0, d4)) ++ ++/* ++** div_h4_f16_m_untied: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0, z1 ++** fdiv z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (div_h4_f16_m_untied, svfloat16_t, __fp16, ++ z0 = svdiv_n_f16_m (p0, z1, d4), ++ z0 = svdiv_m (p0, z1, d4)) ++ ++/* ++** div_1_f16_m_tied1: ++** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? ++** fdiv z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (div_1_f16_m_tied1, svfloat16_t, ++ z0 = svdiv_n_f16_m (p0, z0, 1), ++ z0 = svdiv_m (p0, z0, 1)) ++ ++/* ++** div_1_f16_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? 
++** movprfx z0, z1 ++** fdiv z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (div_1_f16_m_untied, svfloat16_t, ++ z0 = svdiv_n_f16_m (p0, z1, 1), ++ z0 = svdiv_m (p0, z1, 1)) ++ ++/* ++** div_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fdiv z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (div_f16_z_tied1, svfloat16_t, ++ z0 = svdiv_f16_z (p0, z0, z1), ++ z0 = svdiv_z (p0, z0, z1)) ++ ++/* ++** div_f16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** fdivr z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (div_f16_z_tied2, svfloat16_t, ++ z0 = svdiv_f16_z (p0, z1, z0), ++ z0 = svdiv_z (p0, z1, z0)) ++ ++/* ++** div_f16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fdiv z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** fdivr z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (div_f16_z_untied, svfloat16_t, ++ z0 = svdiv_f16_z (p0, z1, z2), ++ z0 = svdiv_z (p0, z1, z2)) ++ ++/* ++** div_h4_f16_z_tied1: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0\.h, p0/z, z0\.h ++** fdiv z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (div_h4_f16_z_tied1, svfloat16_t, __fp16, ++ z0 = svdiv_n_f16_z (p0, z0, d4), ++ z0 = svdiv_z (p0, z0, d4)) ++ ++/* ++** div_h4_f16_z_untied: ++** mov (z[0-9]+\.h), h4 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fdiv z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** fdivr z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (div_h4_f16_z_untied, svfloat16_t, __fp16, ++ z0 = svdiv_n_f16_z (p0, z1, d4), ++ z0 = svdiv_z (p0, z1, d4)) ++ ++/* ++** div_1_f16_z_tied1: ++** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? ++** movprfx z0\.h, p0/z, z0\.h ++** fdiv z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (div_1_f16_z_tied1, svfloat16_t, ++ z0 = svdiv_n_f16_z (p0, z0, 1), ++ z0 = svdiv_z (p0, z0, 1)) ++ ++/* ++** div_1_f16_z_untied: ++** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? 
++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fdiv z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** fdivr z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (div_1_f16_z_untied, svfloat16_t, ++ z0 = svdiv_n_f16_z (p0, z1, 1), ++ z0 = svdiv_z (p0, z1, 1)) ++ ++/* ++** div_0p5_f16_z: ++** fmov (z[0-9]+\.h), #(?:0\.5|5\.0e-1) ++** movprfx z0\.h, p0/z, z0\.h ++** fdiv z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (div_0p5_f16_z, svfloat16_t, ++ z0 = svdiv_n_f16_z (p0, z0, 0.5), ++ z0 = svdiv_z (p0, z0, 0.5)) ++ ++/* ++** div_f16_x_tied1: ++** fdiv z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (div_f16_x_tied1, svfloat16_t, ++ z0 = svdiv_f16_x (p0, z0, z1), ++ z0 = svdiv_x (p0, z0, z1)) ++ ++/* ++** div_f16_x_tied2: ++** fdivr z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (div_f16_x_tied2, svfloat16_t, ++ z0 = svdiv_f16_x (p0, z1, z0), ++ z0 = svdiv_x (p0, z1, z0)) ++ ++/* ++** div_f16_x_untied: ++** ( ++** movprfx z0, z1 ++** fdiv z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0, z2 ++** fdivr z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (div_f16_x_untied, svfloat16_t, ++ z0 = svdiv_f16_x (p0, z1, z2), ++ z0 = svdiv_x (p0, z1, z2)) ++ ++/* ++** div_h4_f16_x_tied1: ++** mov (z[0-9]+\.h), h4 ++** fdiv z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (div_h4_f16_x_tied1, svfloat16_t, __fp16, ++ z0 = svdiv_n_f16_x (p0, z0, d4), ++ z0 = svdiv_x (p0, z0, d4)) ++ ++/* ++** div_h4_f16_x_untied: { xfail *-*-* } ++** mov z0\.h, h4 ++** fdivr z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZD (div_h4_f16_x_untied, svfloat16_t, __fp16, ++ z0 = svdiv_n_f16_x (p0, z1, d4), ++ z0 = svdiv_x (p0, z1, d4)) ++ ++/* ++** div_1_f16_x_tied1: ++** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? ++** fdiv z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (div_1_f16_x_tied1, svfloat16_t, ++ z0 = svdiv_n_f16_x (p0, z0, 1), ++ z0 = svdiv_x (p0, z0, 1)) ++ ++/* ++** div_1_f16_x_untied: ++** fmov z0\.h, #1\.0(?:e\+0)? ++** fdivr z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (div_1_f16_x_untied, svfloat16_t, ++ z0 = svdiv_n_f16_x (p0, z1, 1), ++ z0 = svdiv_x (p0, z1, 1)) ++ ++/* ++** ptrue_div_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_div_f16_x_tied1, svfloat16_t, ++ z0 = svdiv_f16_x (svptrue_b16 (), z0, z1), ++ z0 = svdiv_x (svptrue_b16 (), z0, z1)) ++ ++/* ++** ptrue_div_f16_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_div_f16_x_tied2, svfloat16_t, ++ z0 = svdiv_f16_x (svptrue_b16 (), z1, z0), ++ z0 = svdiv_x (svptrue_b16 (), z1, z0)) ++ ++/* ++** ptrue_div_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_div_f16_x_untied, svfloat16_t, ++ z0 = svdiv_f16_x (svptrue_b16 (), z1, z2), ++ z0 = svdiv_x (svptrue_b16 (), z1, z2)) ++ ++/* ++** ptrue_div_1_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_div_1_f16_x_tied1, svfloat16_t, ++ z0 = svdiv_n_f16_x (svptrue_b16 (), z0, 1), ++ z0 = svdiv_x (svptrue_b16 (), z0, 1)) ++ ++/* ++** ptrue_div_1_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_div_1_f16_x_untied, svfloat16_t, ++ z0 = svdiv_n_f16_x (svptrue_b16 (), z1, 1), ++ z0 = svdiv_x (svptrue_b16 (), z1, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_f32.c +new file mode 100644 +index 000000000..40cc203da +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_f32.c +@@ -0,0 +1,303 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** div_f32_m_tied1: ++** fdiv z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (div_f32_m_tied1, svfloat32_t, ++ z0 = svdiv_f32_m (p0, z0, z1), ++ z0 = svdiv_m (p0, z0, z1)) ++ ++/* ++** div_f32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fdiv z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (div_f32_m_tied2, svfloat32_t, ++ z0 = svdiv_f32_m (p0, z1, z0), ++ z0 = svdiv_m (p0, z1, z0)) ++ ++/* ++** div_f32_m_untied: ++** movprfx z0, z1 ++** fdiv z0\.s, p0/m, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (div_f32_m_untied, svfloat32_t, ++ z0 = svdiv_f32_m (p0, z1, z2), ++ z0 = svdiv_m (p0, z1, z2)) ++ ++/* ++** div_s4_f32_m_tied1: ++** mov (z[0-9]+\.s), s4 ++** fdiv z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (div_s4_f32_m_tied1, svfloat32_t, float, ++ z0 = svdiv_n_f32_m (p0, z0, d4), ++ z0 = svdiv_m (p0, z0, d4)) ++ ++/* ++** div_s4_f32_m_untied: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0, z1 ++** fdiv z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (div_s4_f32_m_untied, svfloat32_t, float, ++ z0 = svdiv_n_f32_m (p0, z1, d4), ++ z0 = svdiv_m (p0, z1, d4)) ++ ++/* ++** div_1_f32_m_tied1: ++** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? ++** fdiv z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (div_1_f32_m_tied1, svfloat32_t, ++ z0 = svdiv_n_f32_m (p0, z0, 1), ++ z0 = svdiv_m (p0, z0, 1)) ++ ++/* ++** div_1_f32_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? 
++** movprfx z0, z1 ++** fdiv z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (div_1_f32_m_untied, svfloat32_t, ++ z0 = svdiv_n_f32_m (p0, z1, 1), ++ z0 = svdiv_m (p0, z1, 1)) ++ ++/* ++** div_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fdiv z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (div_f32_z_tied1, svfloat32_t, ++ z0 = svdiv_f32_z (p0, z0, z1), ++ z0 = svdiv_z (p0, z0, z1)) ++ ++/* ++** div_f32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** fdivr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (div_f32_z_tied2, svfloat32_t, ++ z0 = svdiv_f32_z (p0, z1, z0), ++ z0 = svdiv_z (p0, z1, z0)) ++ ++/* ++** div_f32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fdiv z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** fdivr z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (div_f32_z_untied, svfloat32_t, ++ z0 = svdiv_f32_z (p0, z1, z2), ++ z0 = svdiv_z (p0, z1, z2)) ++ ++/* ++** div_s4_f32_z_tied1: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0\.s, p0/z, z0\.s ++** fdiv z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (div_s4_f32_z_tied1, svfloat32_t, float, ++ z0 = svdiv_n_f32_z (p0, z0, d4), ++ z0 = svdiv_z (p0, z0, d4)) ++ ++/* ++** div_s4_f32_z_untied: ++** mov (z[0-9]+\.s), s4 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fdiv z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** fdivr z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (div_s4_f32_z_untied, svfloat32_t, float, ++ z0 = svdiv_n_f32_z (p0, z1, d4), ++ z0 = svdiv_z (p0, z1, d4)) ++ ++/* ++** div_1_f32_z_tied1: ++** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? ++** movprfx z0\.s, p0/z, z0\.s ++** fdiv z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (div_1_f32_z_tied1, svfloat32_t, ++ z0 = svdiv_n_f32_z (p0, z0, 1), ++ z0 = svdiv_z (p0, z0, 1)) ++ ++/* ++** div_1_f32_z_untied: ++** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? 
++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fdiv z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** fdivr z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (div_1_f32_z_untied, svfloat32_t, ++ z0 = svdiv_n_f32_z (p0, z1, 1), ++ z0 = svdiv_z (p0, z1, 1)) ++ ++/* ++** div_0p5_f32_z: ++** fmov (z[0-9]+\.s), #(?:0\.5|5\.0e-1) ++** movprfx z0\.s, p0/z, z0\.s ++** fdiv z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (div_0p5_f32_z, svfloat32_t, ++ z0 = svdiv_n_f32_z (p0, z0, 0.5), ++ z0 = svdiv_z (p0, z0, 0.5)) ++ ++/* ++** div_f32_x_tied1: ++** fdiv z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (div_f32_x_tied1, svfloat32_t, ++ z0 = svdiv_f32_x (p0, z0, z1), ++ z0 = svdiv_x (p0, z0, z1)) ++ ++/* ++** div_f32_x_tied2: ++** fdivr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (div_f32_x_tied2, svfloat32_t, ++ z0 = svdiv_f32_x (p0, z1, z0), ++ z0 = svdiv_x (p0, z1, z0)) ++ ++/* ++** div_f32_x_untied: ++** ( ++** movprfx z0, z1 ++** fdiv z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0, z2 ++** fdivr z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (div_f32_x_untied, svfloat32_t, ++ z0 = svdiv_f32_x (p0, z1, z2), ++ z0 = svdiv_x (p0, z1, z2)) ++ ++/* ++** div_s4_f32_x_tied1: ++** mov (z[0-9]+\.s), s4 ++** fdiv z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (div_s4_f32_x_tied1, svfloat32_t, float, ++ z0 = svdiv_n_f32_x (p0, z0, d4), ++ z0 = svdiv_x (p0, z0, d4)) ++ ++/* ++** div_s4_f32_x_untied: { xfail *-*-* } ++** mov z0\.s, s4 ++** fdivr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZD (div_s4_f32_x_untied, svfloat32_t, float, ++ z0 = svdiv_n_f32_x (p0, z1, d4), ++ z0 = svdiv_x (p0, z1, d4)) ++ ++/* ++** div_1_f32_x_tied1: ++** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? ++** fdiv z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (div_1_f32_x_tied1, svfloat32_t, ++ z0 = svdiv_n_f32_x (p0, z0, 1), ++ z0 = svdiv_x (p0, z0, 1)) ++ ++/* ++** div_1_f32_x_untied: ++** fmov z0\.s, #1\.0(?:e\+0)? ++** fdivr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (div_1_f32_x_untied, svfloat32_t, ++ z0 = svdiv_n_f32_x (p0, z1, 1), ++ z0 = svdiv_x (p0, z1, 1)) ++ ++/* ++** ptrue_div_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_div_f32_x_tied1, svfloat32_t, ++ z0 = svdiv_f32_x (svptrue_b32 (), z0, z1), ++ z0 = svdiv_x (svptrue_b32 (), z0, z1)) ++ ++/* ++** ptrue_div_f32_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_div_f32_x_tied2, svfloat32_t, ++ z0 = svdiv_f32_x (svptrue_b32 (), z1, z0), ++ z0 = svdiv_x (svptrue_b32 (), z1, z0)) ++ ++/* ++** ptrue_div_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_div_f32_x_untied, svfloat32_t, ++ z0 = svdiv_f32_x (svptrue_b32 (), z1, z2), ++ z0 = svdiv_x (svptrue_b32 (), z1, z2)) ++ ++/* ++** ptrue_div_1_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_div_1_f32_x_tied1, svfloat32_t, ++ z0 = svdiv_n_f32_x (svptrue_b32 (), z0, 1), ++ z0 = svdiv_x (svptrue_b32 (), z0, 1)) ++ ++/* ++** ptrue_div_1_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_div_1_f32_x_untied, svfloat32_t, ++ z0 = svdiv_n_f32_x (svptrue_b32 (), z1, 1), ++ z0 = svdiv_x (svptrue_b32 (), z1, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_f64.c +new file mode 100644 +index 000000000..56acbbe95 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_f64.c +@@ -0,0 +1,303 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** div_f64_m_tied1: ++** fdiv z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (div_f64_m_tied1, svfloat64_t, ++ z0 = svdiv_f64_m (p0, z0, z1), ++ z0 = svdiv_m (p0, z0, z1)) ++ ++/* ++** div_f64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fdiv z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (div_f64_m_tied2, svfloat64_t, ++ z0 = svdiv_f64_m (p0, z1, z0), ++ z0 = svdiv_m (p0, z1, z0)) ++ ++/* ++** div_f64_m_untied: ++** movprfx z0, z1 ++** fdiv z0\.d, p0/m, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (div_f64_m_untied, svfloat64_t, ++ z0 = svdiv_f64_m (p0, z1, z2), ++ z0 = svdiv_m (p0, z1, z2)) ++ ++/* ++** div_d4_f64_m_tied1: ++** mov (z[0-9]+\.d), d4 ++** fdiv z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (div_d4_f64_m_tied1, svfloat64_t, double, ++ z0 = svdiv_n_f64_m (p0, z0, d4), ++ z0 = svdiv_m (p0, z0, d4)) ++ ++/* ++** div_d4_f64_m_untied: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0, z1 ++** fdiv z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (div_d4_f64_m_untied, svfloat64_t, double, ++ z0 = svdiv_n_f64_m (p0, z1, d4), ++ z0 = svdiv_m (p0, z1, d4)) ++ ++/* ++** div_1_f64_m_tied1: ++** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? ++** fdiv z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (div_1_f64_m_tied1, svfloat64_t, ++ z0 = svdiv_n_f64_m (p0, z0, 1), ++ z0 = svdiv_m (p0, z0, 1)) ++ ++/* ++** div_1_f64_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? 
++** movprfx z0, z1 ++** fdiv z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (div_1_f64_m_untied, svfloat64_t, ++ z0 = svdiv_n_f64_m (p0, z1, 1), ++ z0 = svdiv_m (p0, z1, 1)) ++ ++/* ++** div_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fdiv z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (div_f64_z_tied1, svfloat64_t, ++ z0 = svdiv_f64_z (p0, z0, z1), ++ z0 = svdiv_z (p0, z0, z1)) ++ ++/* ++** div_f64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** fdivr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (div_f64_z_tied2, svfloat64_t, ++ z0 = svdiv_f64_z (p0, z1, z0), ++ z0 = svdiv_z (p0, z1, z0)) ++ ++/* ++** div_f64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fdiv z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** fdivr z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (div_f64_z_untied, svfloat64_t, ++ z0 = svdiv_f64_z (p0, z1, z2), ++ z0 = svdiv_z (p0, z1, z2)) ++ ++/* ++** div_d4_f64_z_tied1: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0\.d, p0/z, z0\.d ++** fdiv z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (div_d4_f64_z_tied1, svfloat64_t, double, ++ z0 = svdiv_n_f64_z (p0, z0, d4), ++ z0 = svdiv_z (p0, z0, d4)) ++ ++/* ++** div_d4_f64_z_untied: ++** mov (z[0-9]+\.d), d4 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fdiv z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** fdivr z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (div_d4_f64_z_untied, svfloat64_t, double, ++ z0 = svdiv_n_f64_z (p0, z1, d4), ++ z0 = svdiv_z (p0, z1, d4)) ++ ++/* ++** div_1_f64_z_tied1: ++** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? ++** movprfx z0\.d, p0/z, z0\.d ++** fdiv z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (div_1_f64_z_tied1, svfloat64_t, ++ z0 = svdiv_n_f64_z (p0, z0, 1), ++ z0 = svdiv_z (p0, z0, 1)) ++ ++/* ++** div_1_f64_z_untied: ++** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? 
++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fdiv z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** fdivr z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (div_1_f64_z_untied, svfloat64_t, ++ z0 = svdiv_n_f64_z (p0, z1, 1), ++ z0 = svdiv_z (p0, z1, 1)) ++ ++/* ++** div_0p5_f64_z: ++** fmov (z[0-9]+\.d), #(?:0\.5|5\.0e-1) ++** movprfx z0\.d, p0/z, z0\.d ++** fdiv z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (div_0p5_f64_z, svfloat64_t, ++ z0 = svdiv_n_f64_z (p0, z0, 0.5), ++ z0 = svdiv_z (p0, z0, 0.5)) ++ ++/* ++** div_f64_x_tied1: ++** fdiv z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (div_f64_x_tied1, svfloat64_t, ++ z0 = svdiv_f64_x (p0, z0, z1), ++ z0 = svdiv_x (p0, z0, z1)) ++ ++/* ++** div_f64_x_tied2: ++** fdivr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (div_f64_x_tied2, svfloat64_t, ++ z0 = svdiv_f64_x (p0, z1, z0), ++ z0 = svdiv_x (p0, z1, z0)) ++ ++/* ++** div_f64_x_untied: ++** ( ++** movprfx z0, z1 ++** fdiv z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0, z2 ++** fdivr z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (div_f64_x_untied, svfloat64_t, ++ z0 = svdiv_f64_x (p0, z1, z2), ++ z0 = svdiv_x (p0, z1, z2)) ++ ++/* ++** div_d4_f64_x_tied1: ++** mov (z[0-9]+\.d), d4 ++** fdiv z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (div_d4_f64_x_tied1, svfloat64_t, double, ++ z0 = svdiv_n_f64_x (p0, z0, d4), ++ z0 = svdiv_x (p0, z0, d4)) ++ ++/* ++** div_d4_f64_x_untied: { xfail *-*-* } ++** mov z0\.d, d4 ++** fdivr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZD (div_d4_f64_x_untied, svfloat64_t, double, ++ z0 = svdiv_n_f64_x (p0, z1, d4), ++ z0 = svdiv_x (p0, z1, d4)) ++ ++/* ++** div_1_f64_x_tied1: ++** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? ++** fdiv z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (div_1_f64_x_tied1, svfloat64_t, ++ z0 = svdiv_n_f64_x (p0, z0, 1), ++ z0 = svdiv_x (p0, z0, 1)) ++ ++/* ++** div_1_f64_x_untied: ++** fmov z0\.d, #1\.0(?:e\+0)? ++** fdivr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (div_1_f64_x_untied, svfloat64_t, ++ z0 = svdiv_n_f64_x (p0, z1, 1), ++ z0 = svdiv_x (p0, z1, 1)) ++ ++/* ++** ptrue_div_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_div_f64_x_tied1, svfloat64_t, ++ z0 = svdiv_f64_x (svptrue_b64 (), z0, z1), ++ z0 = svdiv_x (svptrue_b64 (), z0, z1)) ++ ++/* ++** ptrue_div_f64_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_div_f64_x_tied2, svfloat64_t, ++ z0 = svdiv_f64_x (svptrue_b64 (), z1, z0), ++ z0 = svdiv_x (svptrue_b64 (), z1, z0)) ++ ++/* ++** ptrue_div_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_div_f64_x_untied, svfloat64_t, ++ z0 = svdiv_f64_x (svptrue_b64 (), z1, z2), ++ z0 = svdiv_x (svptrue_b64 (), z1, z2)) ++ ++/* ++** ptrue_div_1_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_div_1_f64_x_tied1, svfloat64_t, ++ z0 = svdiv_n_f64_x (svptrue_b64 (), z0, 1), ++ z0 = svdiv_x (svptrue_b64 (), z0, 1)) ++ ++/* ++** ptrue_div_1_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_div_1_f64_x_untied, svfloat64_t, ++ z0 = svdiv_n_f64_x (svptrue_b64 (), z1, 1), ++ z0 = svdiv_x (svptrue_b64 (), z1, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s32.c +new file mode 100644 +index 000000000..8e70ae797 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s32.c +@@ -0,0 +1,237 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** div_s32_m_tied1: ++** sdiv z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (div_s32_m_tied1, svint32_t, ++ z0 = svdiv_s32_m (p0, z0, z1), ++ z0 = svdiv_m (p0, z0, z1)) ++ ++/* ++** div_s32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** sdiv z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (div_s32_m_tied2, svint32_t, ++ z0 = svdiv_s32_m (p0, z1, z0), ++ z0 = svdiv_m (p0, z1, z0)) ++ ++/* ++** div_s32_m_untied: ++** movprfx z0, z1 ++** sdiv z0\.s, p0/m, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (div_s32_m_untied, svint32_t, ++ z0 = svdiv_s32_m (p0, z1, z2), ++ z0 = svdiv_m (p0, z1, z2)) ++ ++/* ++** div_w0_s32_m_tied1: ++** mov (z[0-9]+\.s), w0 ++** sdiv z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (div_w0_s32_m_tied1, svint32_t, int32_t, ++ z0 = svdiv_n_s32_m (p0, z0, x0), ++ z0 = svdiv_m (p0, z0, x0)) ++ ++/* ++** div_w0_s32_m_untied: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0, z1 ++** sdiv z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (div_w0_s32_m_untied, svint32_t, int32_t, ++ z0 = svdiv_n_s32_m (p0, z1, x0), ++ z0 = svdiv_m (p0, z1, x0)) ++ ++/* ++** div_2_s32_m_tied1: ++** mov (z[0-9]+\.s), #2 ++** sdiv z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (div_2_s32_m_tied1, svint32_t, ++ z0 = svdiv_n_s32_m (p0, z0, 2), ++ z0 = svdiv_m (p0, z0, 2)) ++ ++/* ++** div_2_s32_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.s), #2 ++** movprfx z0, z1 ++** sdiv z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (div_2_s32_m_untied, svint32_t, ++ z0 = svdiv_n_s32_m (p0, z1, 2), ++ z0 = svdiv_m (p0, z1, 2)) ++ ++/* ++** div_s32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** sdiv z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (div_s32_z_tied1, svint32_t, ++ z0 = svdiv_s32_z (p0, z0, z1), ++ z0 = svdiv_z (p0, z0, z1)) ++ ++/* ++** div_s32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** sdivr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (div_s32_z_tied2, svint32_t, ++ z0 = svdiv_s32_z (p0, z1, z0), ++ z0 = svdiv_z (p0, z1, z0)) ++ ++/* ++** div_s32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** sdiv z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** sdivr z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (div_s32_z_untied, svint32_t, ++ z0 = svdiv_s32_z (p0, z1, z2), ++ z0 = svdiv_z (p0, z1, z2)) ++ ++/* ++** div_w0_s32_z_tied1: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0\.s, p0/z, z0\.s ++** sdiv z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (div_w0_s32_z_tied1, svint32_t, int32_t, ++ z0 = svdiv_n_s32_z (p0, z0, x0), ++ z0 = svdiv_z (p0, z0, x0)) ++ ++/* ++** div_w0_s32_z_untied: ++** mov (z[0-9]+\.s), w0 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** sdiv z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** sdivr z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (div_w0_s32_z_untied, svint32_t, int32_t, ++ z0 = svdiv_n_s32_z (p0, z1, x0), ++ z0 = svdiv_z (p0, z1, x0)) ++ ++/* ++** div_2_s32_z_tied1: ++** mov 
(z[0-9]+\.s), #2 ++** movprfx z0\.s, p0/z, z0\.s ++** sdiv z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (div_2_s32_z_tied1, svint32_t, ++ z0 = svdiv_n_s32_z (p0, z0, 2), ++ z0 = svdiv_z (p0, z0, 2)) ++ ++/* ++** div_2_s32_z_untied: ++** mov (z[0-9]+\.s), #2 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** sdiv z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** sdivr z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (div_2_s32_z_untied, svint32_t, ++ z0 = svdiv_n_s32_z (p0, z1, 2), ++ z0 = svdiv_z (p0, z1, 2)) ++ ++/* ++** div_s32_x_tied1: ++** sdiv z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (div_s32_x_tied1, svint32_t, ++ z0 = svdiv_s32_x (p0, z0, z1), ++ z0 = svdiv_x (p0, z0, z1)) ++ ++/* ++** div_s32_x_tied2: ++** sdivr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (div_s32_x_tied2, svint32_t, ++ z0 = svdiv_s32_x (p0, z1, z0), ++ z0 = svdiv_x (p0, z1, z0)) ++ ++/* ++** div_s32_x_untied: ++** ( ++** movprfx z0, z1 ++** sdiv z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0, z2 ++** sdivr z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (div_s32_x_untied, svint32_t, ++ z0 = svdiv_s32_x (p0, z1, z2), ++ z0 = svdiv_x (p0, z1, z2)) ++ ++/* ++** div_w0_s32_x_tied1: ++** mov (z[0-9]+\.s), w0 ++** sdiv z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (div_w0_s32_x_tied1, svint32_t, int32_t, ++ z0 = svdiv_n_s32_x (p0, z0, x0), ++ z0 = svdiv_x (p0, z0, x0)) ++ ++/* ++** div_w0_s32_x_untied: ++** mov z0\.s, w0 ++** sdivr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZX (div_w0_s32_x_untied, svint32_t, int32_t, ++ z0 = svdiv_n_s32_x (p0, z1, x0), ++ z0 = svdiv_x (p0, z1, x0)) ++ ++/* ++** div_2_s32_x_tied1: ++** mov (z[0-9]+\.s), #2 ++** sdiv z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (div_2_s32_x_tied1, svint32_t, ++ z0 = svdiv_n_s32_x (p0, z0, 2), ++ z0 = svdiv_x (p0, z0, 2)) ++ ++/* ++** div_2_s32_x_untied: ++** mov z0\.s, #2 ++** sdivr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (div_2_s32_x_untied, svint32_t, ++ z0 = svdiv_n_s32_x (p0, z1, 2), ++ z0 = svdiv_x (p0, z1, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s64.c +new file mode 100644 +index 000000000..439da1f57 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s64.c +@@ -0,0 +1,237 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** div_s64_m_tied1: ++** sdiv z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (div_s64_m_tied1, svint64_t, ++ z0 = svdiv_s64_m (p0, z0, z1), ++ z0 = svdiv_m (p0, z0, z1)) ++ ++/* ++** div_s64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** sdiv z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (div_s64_m_tied2, svint64_t, ++ z0 = svdiv_s64_m (p0, z1, z0), ++ z0 = svdiv_m (p0, z1, z0)) ++ ++/* ++** div_s64_m_untied: ++** movprfx z0, z1 ++** sdiv z0\.d, p0/m, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (div_s64_m_untied, svint64_t, ++ z0 = svdiv_s64_m (p0, z1, z2), ++ z0 = svdiv_m (p0, z1, z2)) ++ ++/* ++** div_x0_s64_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** sdiv z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (div_x0_s64_m_tied1, svint64_t, int64_t, ++ z0 = svdiv_n_s64_m (p0, z0, x0), ++ z0 = svdiv_m (p0, z0, x0)) ++ ++/* ++** div_x0_s64_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** sdiv z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (div_x0_s64_m_untied, svint64_t, int64_t, ++ z0 = 
svdiv_n_s64_m (p0, z1, x0), ++ z0 = svdiv_m (p0, z1, x0)) ++ ++/* ++** div_2_s64_m_tied1: ++** mov (z[0-9]+\.d), #2 ++** sdiv z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (div_2_s64_m_tied1, svint64_t, ++ z0 = svdiv_n_s64_m (p0, z0, 2), ++ z0 = svdiv_m (p0, z0, 2)) ++ ++/* ++** div_2_s64_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #2 ++** movprfx z0, z1 ++** sdiv z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (div_2_s64_m_untied, svint64_t, ++ z0 = svdiv_n_s64_m (p0, z1, 2), ++ z0 = svdiv_m (p0, z1, 2)) ++ ++/* ++** div_s64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** sdiv z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (div_s64_z_tied1, svint64_t, ++ z0 = svdiv_s64_z (p0, z0, z1), ++ z0 = svdiv_z (p0, z0, z1)) ++ ++/* ++** div_s64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** sdivr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (div_s64_z_tied2, svint64_t, ++ z0 = svdiv_s64_z (p0, z1, z0), ++ z0 = svdiv_z (p0, z1, z0)) ++ ++/* ++** div_s64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** sdiv z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** sdivr z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (div_s64_z_untied, svint64_t, ++ z0 = svdiv_s64_z (p0, z1, z2), ++ z0 = svdiv_z (p0, z1, z2)) ++ ++/* ++** div_x0_s64_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z0\.d ++** sdiv z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (div_x0_s64_z_tied1, svint64_t, int64_t, ++ z0 = svdiv_n_s64_z (p0, z0, x0), ++ z0 = svdiv_z (p0, z0, x0)) ++ ++/* ++** div_x0_s64_z_untied: ++** mov (z[0-9]+\.d), x0 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** sdiv z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** sdivr z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (div_x0_s64_z_untied, svint64_t, int64_t, ++ z0 = svdiv_n_s64_z (p0, z1, x0), ++ z0 = svdiv_z (p0, z1, x0)) ++ ++/* ++** div_2_s64_z_tied1: ++** mov (z[0-9]+\.d), #2 ++** movprfx z0\.d, p0/z, z0\.d ++** sdiv z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (div_2_s64_z_tied1, svint64_t, ++ z0 = svdiv_n_s64_z (p0, z0, 2), ++ z0 = svdiv_z (p0, z0, 2)) ++ ++/* ++** div_2_s64_z_untied: ++** mov (z[0-9]+\.d), #2 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** sdiv z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** sdivr z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (div_2_s64_z_untied, svint64_t, ++ z0 = svdiv_n_s64_z (p0, z1, 2), ++ z0 = svdiv_z (p0, z1, 2)) ++ ++/* ++** div_s64_x_tied1: ++** sdiv z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (div_s64_x_tied1, svint64_t, ++ z0 = svdiv_s64_x (p0, z0, z1), ++ z0 = svdiv_x (p0, z0, z1)) ++ ++/* ++** div_s64_x_tied2: ++** sdivr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (div_s64_x_tied2, svint64_t, ++ z0 = svdiv_s64_x (p0, z1, z0), ++ z0 = svdiv_x (p0, z1, z0)) ++ ++/* ++** div_s64_x_untied: ++** ( ++** movprfx z0, z1 ++** sdiv z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0, z2 ++** sdivr z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (div_s64_x_untied, svint64_t, ++ z0 = svdiv_s64_x (p0, z1, z2), ++ z0 = svdiv_x (p0, z1, z2)) ++ ++/* ++** div_x0_s64_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** sdiv z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (div_x0_s64_x_tied1, svint64_t, int64_t, ++ z0 = svdiv_n_s64_x (p0, z0, x0), ++ z0 = svdiv_x (p0, z0, x0)) ++ ++/* ++** div_x0_s64_x_untied: ++** mov z0\.d, x0 ++** sdivr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (div_x0_s64_x_untied, svint64_t, 
int64_t, ++ z0 = svdiv_n_s64_x (p0, z1, x0), ++ z0 = svdiv_x (p0, z1, x0)) ++ ++/* ++** div_2_s64_x_tied1: ++** mov (z[0-9]+\.d), #2 ++** sdiv z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (div_2_s64_x_tied1, svint64_t, ++ z0 = svdiv_n_s64_x (p0, z0, 2), ++ z0 = svdiv_x (p0, z0, 2)) ++ ++/* ++** div_2_s64_x_untied: ++** mov z0\.d, #2 ++** sdivr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (div_2_s64_x_untied, svint64_t, ++ z0 = svdiv_n_s64_x (p0, z1, 2), ++ z0 = svdiv_x (p0, z1, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_u32.c +new file mode 100644 +index 000000000..8e8e464b7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_u32.c +@@ -0,0 +1,237 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** div_u32_m_tied1: ++** udiv z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (div_u32_m_tied1, svuint32_t, ++ z0 = svdiv_u32_m (p0, z0, z1), ++ z0 = svdiv_m (p0, z0, z1)) ++ ++/* ++** div_u32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** udiv z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (div_u32_m_tied2, svuint32_t, ++ z0 = svdiv_u32_m (p0, z1, z0), ++ z0 = svdiv_m (p0, z1, z0)) ++ ++/* ++** div_u32_m_untied: ++** movprfx z0, z1 ++** udiv z0\.s, p0/m, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (div_u32_m_untied, svuint32_t, ++ z0 = svdiv_u32_m (p0, z1, z2), ++ z0 = svdiv_m (p0, z1, z2)) ++ ++/* ++** div_w0_u32_m_tied1: ++** mov (z[0-9]+\.s), w0 ++** udiv z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (div_w0_u32_m_tied1, svuint32_t, uint32_t, ++ z0 = svdiv_n_u32_m (p0, z0, x0), ++ z0 = svdiv_m (p0, z0, x0)) ++ ++/* ++** div_w0_u32_m_untied: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0, z1 ++** udiv z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (div_w0_u32_m_untied, svuint32_t, uint32_t, ++ z0 = svdiv_n_u32_m (p0, z1, x0), ++ z0 = svdiv_m (p0, z1, x0)) ++ ++/* ++** div_2_u32_m_tied1: ++** mov (z[0-9]+\.s), #2 ++** udiv z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (div_2_u32_m_tied1, svuint32_t, ++ z0 = svdiv_n_u32_m (p0, z0, 2), ++ z0 = svdiv_m (p0, z0, 2)) ++ ++/* ++** div_2_u32_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.s), #2 ++** movprfx z0, z1 ++** udiv z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (div_2_u32_m_untied, svuint32_t, ++ z0 = svdiv_n_u32_m (p0, z1, 2), ++ z0 = svdiv_m (p0, z1, 2)) ++ ++/* ++** div_u32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** udiv z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (div_u32_z_tied1, svuint32_t, ++ z0 = svdiv_u32_z (p0, z0, z1), ++ z0 = svdiv_z (p0, z0, z1)) ++ ++/* ++** div_u32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** udivr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (div_u32_z_tied2, svuint32_t, ++ z0 = svdiv_u32_z (p0, z1, z0), ++ z0 = svdiv_z (p0, z1, z0)) ++ ++/* ++** div_u32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** udiv z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** udivr z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (div_u32_z_untied, svuint32_t, ++ z0 = svdiv_u32_z (p0, z1, z2), ++ z0 = svdiv_z (p0, z1, z2)) ++ ++/* ++** div_w0_u32_z_tied1: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0\.s, p0/z, z0\.s ++** udiv z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (div_w0_u32_z_tied1, svuint32_t, uint32_t, ++ z0 = svdiv_n_u32_z (p0, z0, x0), ++ z0 = svdiv_z (p0, z0, x0)) ++ ++/* ++** div_w0_u32_z_untied: 
++** mov (z[0-9]+\.s), w0 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** udiv z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** udivr z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (div_w0_u32_z_untied, svuint32_t, uint32_t, ++ z0 = svdiv_n_u32_z (p0, z1, x0), ++ z0 = svdiv_z (p0, z1, x0)) ++ ++/* ++** div_2_u32_z_tied1: ++** mov (z[0-9]+\.s), #2 ++** movprfx z0\.s, p0/z, z0\.s ++** udiv z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (div_2_u32_z_tied1, svuint32_t, ++ z0 = svdiv_n_u32_z (p0, z0, 2), ++ z0 = svdiv_z (p0, z0, 2)) ++ ++/* ++** div_2_u32_z_untied: ++** mov (z[0-9]+\.s), #2 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** udiv z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** udivr z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (div_2_u32_z_untied, svuint32_t, ++ z0 = svdiv_n_u32_z (p0, z1, 2), ++ z0 = svdiv_z (p0, z1, 2)) ++ ++/* ++** div_u32_x_tied1: ++** udiv z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (div_u32_x_tied1, svuint32_t, ++ z0 = svdiv_u32_x (p0, z0, z1), ++ z0 = svdiv_x (p0, z0, z1)) ++ ++/* ++** div_u32_x_tied2: ++** udivr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (div_u32_x_tied2, svuint32_t, ++ z0 = svdiv_u32_x (p0, z1, z0), ++ z0 = svdiv_x (p0, z1, z0)) ++ ++/* ++** div_u32_x_untied: ++** ( ++** movprfx z0, z1 ++** udiv z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0, z2 ++** udivr z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (div_u32_x_untied, svuint32_t, ++ z0 = svdiv_u32_x (p0, z1, z2), ++ z0 = svdiv_x (p0, z1, z2)) ++ ++/* ++** div_w0_u32_x_tied1: ++** mov (z[0-9]+\.s), w0 ++** udiv z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (div_w0_u32_x_tied1, svuint32_t, uint32_t, ++ z0 = svdiv_n_u32_x (p0, z0, x0), ++ z0 = svdiv_x (p0, z0, x0)) ++ ++/* ++** div_w0_u32_x_untied: ++** mov z0\.s, w0 ++** udivr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZX (div_w0_u32_x_untied, svuint32_t, uint32_t, ++ z0 = svdiv_n_u32_x (p0, z1, x0), ++ z0 = svdiv_x (p0, z1, x0)) ++ ++/* ++** div_2_u32_x_tied1: ++** mov (z[0-9]+\.s), #2 ++** udiv z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (div_2_u32_x_tied1, svuint32_t, ++ z0 = svdiv_n_u32_x (p0, z0, 2), ++ z0 = svdiv_x (p0, z0, 2)) ++ ++/* ++** div_2_u32_x_untied: ++** mov z0\.s, #2 ++** udivr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (div_2_u32_x_untied, svuint32_t, ++ z0 = svdiv_n_u32_x (p0, z1, 2), ++ z0 = svdiv_x (p0, z1, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_u64.c +new file mode 100644 +index 000000000..fc152e8e5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_u64.c +@@ -0,0 +1,237 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** div_u64_m_tied1: ++** udiv z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (div_u64_m_tied1, svuint64_t, ++ z0 = svdiv_u64_m (p0, z0, z1), ++ z0 = svdiv_m (p0, z0, z1)) ++ ++/* ++** div_u64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** udiv z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (div_u64_m_tied2, svuint64_t, ++ z0 = svdiv_u64_m (p0, z1, z0), ++ z0 = svdiv_m (p0, z1, z0)) ++ ++/* ++** div_u64_m_untied: ++** movprfx z0, z1 ++** udiv z0\.d, p0/m, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (div_u64_m_untied, svuint64_t, ++ z0 = svdiv_u64_m (p0, z1, z2), ++ z0 = svdiv_m (p0, z1, z2)) ++ ++/* ++** div_x0_u64_m_tied1: ++** mov 
(z[0-9]+\.d), x0 ++** udiv z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (div_x0_u64_m_tied1, svuint64_t, uint64_t, ++ z0 = svdiv_n_u64_m (p0, z0, x0), ++ z0 = svdiv_m (p0, z0, x0)) ++ ++/* ++** div_x0_u64_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** udiv z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (div_x0_u64_m_untied, svuint64_t, uint64_t, ++ z0 = svdiv_n_u64_m (p0, z1, x0), ++ z0 = svdiv_m (p0, z1, x0)) ++ ++/* ++** div_2_u64_m_tied1: ++** mov (z[0-9]+\.d), #2 ++** udiv z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (div_2_u64_m_tied1, svuint64_t, ++ z0 = svdiv_n_u64_m (p0, z0, 2), ++ z0 = svdiv_m (p0, z0, 2)) ++ ++/* ++** div_2_u64_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #2 ++** movprfx z0, z1 ++** udiv z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (div_2_u64_m_untied, svuint64_t, ++ z0 = svdiv_n_u64_m (p0, z1, 2), ++ z0 = svdiv_m (p0, z1, 2)) ++ ++/* ++** div_u64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** udiv z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (div_u64_z_tied1, svuint64_t, ++ z0 = svdiv_u64_z (p0, z0, z1), ++ z0 = svdiv_z (p0, z0, z1)) ++ ++/* ++** div_u64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** udivr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (div_u64_z_tied2, svuint64_t, ++ z0 = svdiv_u64_z (p0, z1, z0), ++ z0 = svdiv_z (p0, z1, z0)) ++ ++/* ++** div_u64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** udiv z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** udivr z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (div_u64_z_untied, svuint64_t, ++ z0 = svdiv_u64_z (p0, z1, z2), ++ z0 = svdiv_z (p0, z1, z2)) ++ ++/* ++** div_x0_u64_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z0\.d ++** udiv z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (div_x0_u64_z_tied1, svuint64_t, uint64_t, ++ z0 = svdiv_n_u64_z (p0, z0, x0), ++ z0 = svdiv_z (p0, z0, x0)) ++ ++/* ++** div_x0_u64_z_untied: ++** mov (z[0-9]+\.d), x0 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** udiv z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** udivr z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (div_x0_u64_z_untied, svuint64_t, uint64_t, ++ z0 = svdiv_n_u64_z (p0, z1, x0), ++ z0 = svdiv_z (p0, z1, x0)) ++ ++/* ++** div_2_u64_z_tied1: ++** mov (z[0-9]+\.d), #2 ++** movprfx z0\.d, p0/z, z0\.d ++** udiv z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (div_2_u64_z_tied1, svuint64_t, ++ z0 = svdiv_n_u64_z (p0, z0, 2), ++ z0 = svdiv_z (p0, z0, 2)) ++ ++/* ++** div_2_u64_z_untied: ++** mov (z[0-9]+\.d), #2 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** udiv z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** udivr z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (div_2_u64_z_untied, svuint64_t, ++ z0 = svdiv_n_u64_z (p0, z1, 2), ++ z0 = svdiv_z (p0, z1, 2)) ++ ++/* ++** div_u64_x_tied1: ++** udiv z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (div_u64_x_tied1, svuint64_t, ++ z0 = svdiv_u64_x (p0, z0, z1), ++ z0 = svdiv_x (p0, z0, z1)) ++ ++/* ++** div_u64_x_tied2: ++** udivr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (div_u64_x_tied2, svuint64_t, ++ z0 = svdiv_u64_x (p0, z1, z0), ++ z0 = svdiv_x (p0, z1, z0)) ++ ++/* ++** div_u64_x_untied: ++** ( ++** movprfx z0, z1 ++** udiv z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0, z2 ++** udivr z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (div_u64_x_untied, svuint64_t, ++ z0 = svdiv_u64_x (p0, z1, z2), ++ z0 = svdiv_x (p0, 
z1, z2)) ++ ++/* ++** div_x0_u64_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** udiv z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (div_x0_u64_x_tied1, svuint64_t, uint64_t, ++ z0 = svdiv_n_u64_x (p0, z0, x0), ++ z0 = svdiv_x (p0, z0, x0)) ++ ++/* ++** div_x0_u64_x_untied: ++** mov z0\.d, x0 ++** udivr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (div_x0_u64_x_untied, svuint64_t, uint64_t, ++ z0 = svdiv_n_u64_x (p0, z1, x0), ++ z0 = svdiv_x (p0, z1, x0)) ++ ++/* ++** div_2_u64_x_tied1: ++** mov (z[0-9]+\.d), #2 ++** udiv z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (div_2_u64_x_tied1, svuint64_t, ++ z0 = svdiv_n_u64_x (p0, z0, 2), ++ z0 = svdiv_x (p0, z0, 2)) ++ ++/* ++** div_2_u64_x_untied: ++** mov z0\.d, #2 ++** udivr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (div_2_u64_x_untied, svuint64_t, ++ z0 = svdiv_n_u64_x (p0, z1, 2), ++ z0 = svdiv_x (p0, z1, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/divr_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/divr_f16.c +new file mode 100644 +index 000000000..03cc0343b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/divr_f16.c +@@ -0,0 +1,324 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** divr_f16_m_tied1: ++** fdivr z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (divr_f16_m_tied1, svfloat16_t, ++ z0 = svdivr_f16_m (p0, z0, z1), ++ z0 = svdivr_m (p0, z0, z1)) ++ ++/* ++** divr_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fdivr z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (divr_f16_m_tied2, svfloat16_t, ++ z0 = svdivr_f16_m (p0, z1, z0), ++ z0 = svdivr_m (p0, z1, z0)) ++ ++/* ++** divr_f16_m_untied: ++** movprfx z0, z1 ++** fdivr z0\.h, p0/m, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (divr_f16_m_untied, svfloat16_t, ++ z0 = svdivr_f16_m (p0, z1, z2), ++ z0 = svdivr_m (p0, z1, z2)) ++ ++/* ++** divr_h4_f16_m_tied1: ++** mov (z[0-9]+\.h), h4 ++** fdivr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (divr_h4_f16_m_tied1, svfloat16_t, __fp16, ++ z0 = svdivr_n_f16_m (p0, z0, d4), ++ z0 = svdivr_m (p0, z0, d4)) ++ ++/* ++** divr_h4_f16_m_untied: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0, z1 ++** fdivr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (divr_h4_f16_m_untied, svfloat16_t, __fp16, ++ z0 = svdivr_n_f16_m (p0, z1, d4), ++ z0 = svdivr_m (p0, z1, d4)) ++ ++/* ++** divr_1_f16_m_tied1: ++** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? ++** fdivr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (divr_1_f16_m_tied1, svfloat16_t, ++ z0 = svdivr_n_f16_m (p0, z0, 1), ++ z0 = svdivr_m (p0, z0, 1)) ++ ++/* ++** divr_1_f16_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? 
++** movprfx z0, z1 ++** fdivr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (divr_1_f16_m_untied, svfloat16_t, ++ z0 = svdivr_n_f16_m (p0, z1, 1), ++ z0 = svdivr_m (p0, z1, 1)) ++ ++/* ++** divr_0p5_f16_m_tied1: ++** fmov (z[0-9]+\.h), #(?:0\.5|5\.0e-1) ++** fdivr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (divr_0p5_f16_m_tied1, svfloat16_t, ++ z0 = svdivr_n_f16_m (p0, z0, 0.5), ++ z0 = svdivr_m (p0, z0, 0.5)) ++ ++/* ++** divr_0p5_f16_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.h), #(?:0\.5|5\.0e-1) ++** movprfx z0, z1 ++** fdivr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (divr_0p5_f16_m_untied, svfloat16_t, ++ z0 = svdivr_n_f16_m (p0, z1, 0.5), ++ z0 = svdivr_m (p0, z1, 0.5)) ++ ++/* ++** divr_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fdivr z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (divr_f16_z_tied1, svfloat16_t, ++ z0 = svdivr_f16_z (p0, z0, z1), ++ z0 = svdivr_z (p0, z0, z1)) ++ ++/* ++** divr_f16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** fdiv z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (divr_f16_z_tied2, svfloat16_t, ++ z0 = svdivr_f16_z (p0, z1, z0), ++ z0 = svdivr_z (p0, z1, z0)) ++ ++/* ++** divr_f16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fdivr z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** fdiv z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (divr_f16_z_untied, svfloat16_t, ++ z0 = svdivr_f16_z (p0, z1, z2), ++ z0 = svdivr_z (p0, z1, z2)) ++ ++/* ++** divr_h4_f16_z_tied1: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0\.h, p0/z, z0\.h ++** fdivr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (divr_h4_f16_z_tied1, svfloat16_t, __fp16, ++ z0 = svdivr_n_f16_z (p0, z0, d4), ++ z0 = svdivr_z (p0, z0, d4)) ++ ++/* ++** divr_h4_f16_z_untied: ++** mov (z[0-9]+\.h), h4 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fdivr z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** fdiv z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (divr_h4_f16_z_untied, svfloat16_t, __fp16, ++ z0 = svdivr_n_f16_z (p0, z1, d4), ++ z0 = svdivr_z (p0, z1, d4)) ++ ++/* ++** divr_1_f16_z: ++** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? 
++** movprfx z0\.h, p0/z, z0\.h ++** fdivr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (divr_1_f16_z, svfloat16_t, ++ z0 = svdivr_n_f16_z (p0, z0, 1), ++ z0 = svdivr_z (p0, z0, 1)) ++ ++/* ++** divr_0p5_f16_z_tied1: ++** fmov (z[0-9]+\.h), #(?:0\.5|5\.0e-1) ++** movprfx z0\.h, p0/z, z0\.h ++** fdivr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (divr_0p5_f16_z_tied1, svfloat16_t, ++ z0 = svdivr_n_f16_z (p0, z0, 0.5), ++ z0 = svdivr_z (p0, z0, 0.5)) ++ ++/* ++** divr_0p5_f16_z_untied: ++** fmov (z[0-9]+\.h), #(?:0\.5|5\.0e-1) ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fdivr z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** fdiv z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (divr_0p5_f16_z_untied, svfloat16_t, ++ z0 = svdivr_n_f16_z (p0, z1, 0.5), ++ z0 = svdivr_z (p0, z1, 0.5)) ++ ++/* ++** divr_f16_x_tied1: ++** fdivr z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (divr_f16_x_tied1, svfloat16_t, ++ z0 = svdivr_f16_x (p0, z0, z1), ++ z0 = svdivr_x (p0, z0, z1)) ++ ++/* ++** divr_f16_x_tied2: ++** fdiv z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (divr_f16_x_tied2, svfloat16_t, ++ z0 = svdivr_f16_x (p0, z1, z0), ++ z0 = svdivr_x (p0, z1, z0)) ++ ++/* ++** divr_f16_x_untied: ++** ( ++** movprfx z0, z1 ++** fdivr z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0, z2 ++** fdiv z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (divr_f16_x_untied, svfloat16_t, ++ z0 = svdivr_f16_x (p0, z1, z2), ++ z0 = svdivr_x (p0, z1, z2)) ++ ++/* ++** divr_h4_f16_x_tied1: ++** mov (z[0-9]+\.h), h4 ++** fdivr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (divr_h4_f16_x_tied1, svfloat16_t, __fp16, ++ z0 = svdivr_n_f16_x (p0, z0, d4), ++ z0 = svdivr_x (p0, z0, d4)) ++ ++/* ++** divr_h4_f16_x_untied: { xfail *-*-* } ++** mov z0\.h, h4 ++** fdiv z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZD (divr_h4_f16_x_untied, svfloat16_t, __fp16, ++ z0 = svdivr_n_f16_x (p0, z1, d4), ++ z0 = svdivr_x (p0, z1, d4)) ++ ++/* ++** divr_1_f16_x_tied1: ++** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? ++** fdivr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (divr_1_f16_x_tied1, svfloat16_t, ++ z0 = svdivr_n_f16_x (p0, z0, 1), ++ z0 = svdivr_x (p0, z0, 1)) ++ ++/* ++** divr_1_f16_x_untied: ++** fmov z0\.h, #1\.0(?:e\+0)? ++** fdiv z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (divr_1_f16_x_untied, svfloat16_t, ++ z0 = svdivr_n_f16_x (p0, z1, 1), ++ z0 = svdivr_x (p0, z1, 1)) ++ ++/* ++** ptrue_divr_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_divr_f16_x_tied1, svfloat16_t, ++ z0 = svdivr_f16_x (svptrue_b16 (), z0, z1), ++ z0 = svdivr_x (svptrue_b16 (), z0, z1)) ++ ++/* ++** ptrue_divr_f16_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_divr_f16_x_tied2, svfloat16_t, ++ z0 = svdivr_f16_x (svptrue_b16 (), z1, z0), ++ z0 = svdivr_x (svptrue_b16 (), z1, z0)) ++ ++/* ++** ptrue_divr_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_divr_f16_x_untied, svfloat16_t, ++ z0 = svdivr_f16_x (svptrue_b16 (), z1, z2), ++ z0 = svdivr_x (svptrue_b16 (), z1, z2)) ++ ++/* ++** ptrue_divr_1_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_divr_1_f16_x_tied1, svfloat16_t, ++ z0 = svdivr_n_f16_x (svptrue_b16 (), z0, 1), ++ z0 = svdivr_x (svptrue_b16 (), z0, 1)) ++ ++/* ++** ptrue_divr_1_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_divr_1_f16_x_untied, svfloat16_t, ++ z0 = svdivr_n_f16_x (svptrue_b16 (), z1, 1), ++ z0 = svdivr_x (svptrue_b16 (), z1, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/divr_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/divr_f32.c +new file mode 100644 +index 000000000..c2b65fc33 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/divr_f32.c +@@ -0,0 +1,324 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** divr_f32_m_tied1: ++** fdivr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (divr_f32_m_tied1, svfloat32_t, ++ z0 = svdivr_f32_m (p0, z0, z1), ++ z0 = svdivr_m (p0, z0, z1)) ++ ++/* ++** divr_f32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fdivr z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (divr_f32_m_tied2, svfloat32_t, ++ z0 = svdivr_f32_m (p0, z1, z0), ++ z0 = svdivr_m (p0, z1, z0)) ++ ++/* ++** divr_f32_m_untied: ++** movprfx z0, z1 ++** fdivr z0\.s, p0/m, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (divr_f32_m_untied, svfloat32_t, ++ z0 = svdivr_f32_m (p0, z1, z2), ++ z0 = svdivr_m (p0, z1, z2)) ++ ++/* ++** divr_s4_f32_m_tied1: ++** mov (z[0-9]+\.s), s4 ++** fdivr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (divr_s4_f32_m_tied1, svfloat32_t, float, ++ z0 = svdivr_n_f32_m (p0, z0, d4), ++ z0 = svdivr_m (p0, z0, d4)) ++ ++/* ++** divr_s4_f32_m_untied: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0, z1 ++** fdivr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (divr_s4_f32_m_untied, svfloat32_t, float, ++ z0 = svdivr_n_f32_m (p0, z1, d4), ++ z0 = svdivr_m (p0, z1, d4)) ++ ++/* ++** divr_1_f32_m_tied1: ++** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? ++** fdivr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (divr_1_f32_m_tied1, svfloat32_t, ++ z0 = svdivr_n_f32_m (p0, z0, 1), ++ z0 = svdivr_m (p0, z0, 1)) ++ ++/* ++** divr_1_f32_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? 
++** movprfx z0, z1 ++** fdivr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (divr_1_f32_m_untied, svfloat32_t, ++ z0 = svdivr_n_f32_m (p0, z1, 1), ++ z0 = svdivr_m (p0, z1, 1)) ++ ++/* ++** divr_0p5_f32_m_tied1: ++** fmov (z[0-9]+\.s), #(?:0\.5|5\.0e-1) ++** fdivr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (divr_0p5_f32_m_tied1, svfloat32_t, ++ z0 = svdivr_n_f32_m (p0, z0, 0.5), ++ z0 = svdivr_m (p0, z0, 0.5)) ++ ++/* ++** divr_0p5_f32_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.s), #(?:0\.5|5\.0e-1) ++** movprfx z0, z1 ++** fdivr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (divr_0p5_f32_m_untied, svfloat32_t, ++ z0 = svdivr_n_f32_m (p0, z1, 0.5), ++ z0 = svdivr_m (p0, z1, 0.5)) ++ ++/* ++** divr_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fdivr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (divr_f32_z_tied1, svfloat32_t, ++ z0 = svdivr_f32_z (p0, z0, z1), ++ z0 = svdivr_z (p0, z0, z1)) ++ ++/* ++** divr_f32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** fdiv z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (divr_f32_z_tied2, svfloat32_t, ++ z0 = svdivr_f32_z (p0, z1, z0), ++ z0 = svdivr_z (p0, z1, z0)) ++ ++/* ++** divr_f32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fdivr z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** fdiv z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (divr_f32_z_untied, svfloat32_t, ++ z0 = svdivr_f32_z (p0, z1, z2), ++ z0 = svdivr_z (p0, z1, z2)) ++ ++/* ++** divr_s4_f32_z_tied1: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0\.s, p0/z, z0\.s ++** fdivr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (divr_s4_f32_z_tied1, svfloat32_t, float, ++ z0 = svdivr_n_f32_z (p0, z0, d4), ++ z0 = svdivr_z (p0, z0, d4)) ++ ++/* ++** divr_s4_f32_z_untied: ++** mov (z[0-9]+\.s), s4 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fdivr z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** fdiv z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (divr_s4_f32_z_untied, svfloat32_t, float, ++ z0 = svdivr_n_f32_z (p0, z1, d4), ++ z0 = svdivr_z (p0, z1, d4)) ++ ++/* ++** divr_1_f32_z: ++** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? 
++** movprfx z0\.s, p0/z, z0\.s ++** fdivr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (divr_1_f32_z, svfloat32_t, ++ z0 = svdivr_n_f32_z (p0, z0, 1), ++ z0 = svdivr_z (p0, z0, 1)) ++ ++/* ++** divr_0p5_f32_z_tied1: ++** fmov (z[0-9]+\.s), #(?:0\.5|5\.0e-1) ++** movprfx z0\.s, p0/z, z0\.s ++** fdivr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (divr_0p5_f32_z_tied1, svfloat32_t, ++ z0 = svdivr_n_f32_z (p0, z0, 0.5), ++ z0 = svdivr_z (p0, z0, 0.5)) ++ ++/* ++** divr_0p5_f32_z_untied: ++** fmov (z[0-9]+\.s), #(?:0\.5|5\.0e-1) ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fdivr z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** fdiv z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (divr_0p5_f32_z_untied, svfloat32_t, ++ z0 = svdivr_n_f32_z (p0, z1, 0.5), ++ z0 = svdivr_z (p0, z1, 0.5)) ++ ++/* ++** divr_f32_x_tied1: ++** fdivr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (divr_f32_x_tied1, svfloat32_t, ++ z0 = svdivr_f32_x (p0, z0, z1), ++ z0 = svdivr_x (p0, z0, z1)) ++ ++/* ++** divr_f32_x_tied2: ++** fdiv z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (divr_f32_x_tied2, svfloat32_t, ++ z0 = svdivr_f32_x (p0, z1, z0), ++ z0 = svdivr_x (p0, z1, z0)) ++ ++/* ++** divr_f32_x_untied: ++** ( ++** movprfx z0, z1 ++** fdivr z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0, z2 ++** fdiv z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (divr_f32_x_untied, svfloat32_t, ++ z0 = svdivr_f32_x (p0, z1, z2), ++ z0 = svdivr_x (p0, z1, z2)) ++ ++/* ++** divr_s4_f32_x_tied1: ++** mov (z[0-9]+\.s), s4 ++** fdivr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (divr_s4_f32_x_tied1, svfloat32_t, float, ++ z0 = svdivr_n_f32_x (p0, z0, d4), ++ z0 = svdivr_x (p0, z0, d4)) ++ ++/* ++** divr_s4_f32_x_untied: { xfail *-*-* } ++** mov z0\.s, s4 ++** fdiv z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZD (divr_s4_f32_x_untied, svfloat32_t, float, ++ z0 = svdivr_n_f32_x (p0, z1, d4), ++ z0 = svdivr_x (p0, z1, d4)) ++ ++/* ++** divr_1_f32_x_tied1: ++** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? ++** fdivr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (divr_1_f32_x_tied1, svfloat32_t, ++ z0 = svdivr_n_f32_x (p0, z0, 1), ++ z0 = svdivr_x (p0, z0, 1)) ++ ++/* ++** divr_1_f32_x_untied: ++** fmov z0\.s, #1\.0(?:e\+0)? ++** fdiv z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (divr_1_f32_x_untied, svfloat32_t, ++ z0 = svdivr_n_f32_x (p0, z1, 1), ++ z0 = svdivr_x (p0, z1, 1)) ++ ++/* ++** ptrue_divr_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_divr_f32_x_tied1, svfloat32_t, ++ z0 = svdivr_f32_x (svptrue_b32 (), z0, z1), ++ z0 = svdivr_x (svptrue_b32 (), z0, z1)) ++ ++/* ++** ptrue_divr_f32_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_divr_f32_x_tied2, svfloat32_t, ++ z0 = svdivr_f32_x (svptrue_b32 (), z1, z0), ++ z0 = svdivr_x (svptrue_b32 (), z1, z0)) ++ ++/* ++** ptrue_divr_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_divr_f32_x_untied, svfloat32_t, ++ z0 = svdivr_f32_x (svptrue_b32 (), z1, z2), ++ z0 = svdivr_x (svptrue_b32 (), z1, z2)) ++ ++/* ++** ptrue_divr_1_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_divr_1_f32_x_tied1, svfloat32_t, ++ z0 = svdivr_n_f32_x (svptrue_b32 (), z0, 1), ++ z0 = svdivr_x (svptrue_b32 (), z0, 1)) ++ ++/* ++** ptrue_divr_1_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_divr_1_f32_x_untied, svfloat32_t, ++ z0 = svdivr_n_f32_x (svptrue_b32 (), z1, 1), ++ z0 = svdivr_x (svptrue_b32 (), z1, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/divr_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/divr_f64.c +new file mode 100644 +index 000000000..0a72a37b1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/divr_f64.c +@@ -0,0 +1,324 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** divr_f64_m_tied1: ++** fdivr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (divr_f64_m_tied1, svfloat64_t, ++ z0 = svdivr_f64_m (p0, z0, z1), ++ z0 = svdivr_m (p0, z0, z1)) ++ ++/* ++** divr_f64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fdivr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (divr_f64_m_tied2, svfloat64_t, ++ z0 = svdivr_f64_m (p0, z1, z0), ++ z0 = svdivr_m (p0, z1, z0)) ++ ++/* ++** divr_f64_m_untied: ++** movprfx z0, z1 ++** fdivr z0\.d, p0/m, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (divr_f64_m_untied, svfloat64_t, ++ z0 = svdivr_f64_m (p0, z1, z2), ++ z0 = svdivr_m (p0, z1, z2)) ++ ++/* ++** divr_d4_f64_m_tied1: ++** mov (z[0-9]+\.d), d4 ++** fdivr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (divr_d4_f64_m_tied1, svfloat64_t, double, ++ z0 = svdivr_n_f64_m (p0, z0, d4), ++ z0 = svdivr_m (p0, z0, d4)) ++ ++/* ++** divr_d4_f64_m_untied: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0, z1 ++** fdivr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (divr_d4_f64_m_untied, svfloat64_t, double, ++ z0 = svdivr_n_f64_m (p0, z1, d4), ++ z0 = svdivr_m (p0, z1, d4)) ++ ++/* ++** divr_1_f64_m_tied1: ++** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? ++** fdivr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (divr_1_f64_m_tied1, svfloat64_t, ++ z0 = svdivr_n_f64_m (p0, z0, 1), ++ z0 = svdivr_m (p0, z0, 1)) ++ ++/* ++** divr_1_f64_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? 
++** movprfx z0, z1 ++** fdivr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (divr_1_f64_m_untied, svfloat64_t, ++ z0 = svdivr_n_f64_m (p0, z1, 1), ++ z0 = svdivr_m (p0, z1, 1)) ++ ++/* ++** divr_0p5_f64_m_tied1: ++** fmov (z[0-9]+\.d), #(?:0\.5|5\.0e-1) ++** fdivr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (divr_0p5_f64_m_tied1, svfloat64_t, ++ z0 = svdivr_n_f64_m (p0, z0, 0.5), ++ z0 = svdivr_m (p0, z0, 0.5)) ++ ++/* ++** divr_0p5_f64_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.d), #(?:0\.5|5\.0e-1) ++** movprfx z0, z1 ++** fdivr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (divr_0p5_f64_m_untied, svfloat64_t, ++ z0 = svdivr_n_f64_m (p0, z1, 0.5), ++ z0 = svdivr_m (p0, z1, 0.5)) ++ ++/* ++** divr_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fdivr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (divr_f64_z_tied1, svfloat64_t, ++ z0 = svdivr_f64_z (p0, z0, z1), ++ z0 = svdivr_z (p0, z0, z1)) ++ ++/* ++** divr_f64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** fdiv z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (divr_f64_z_tied2, svfloat64_t, ++ z0 = svdivr_f64_z (p0, z1, z0), ++ z0 = svdivr_z (p0, z1, z0)) ++ ++/* ++** divr_f64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fdivr z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** fdiv z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (divr_f64_z_untied, svfloat64_t, ++ z0 = svdivr_f64_z (p0, z1, z2), ++ z0 = svdivr_z (p0, z1, z2)) ++ ++/* ++** divr_d4_f64_z_tied1: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0\.d, p0/z, z0\.d ++** fdivr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (divr_d4_f64_z_tied1, svfloat64_t, double, ++ z0 = svdivr_n_f64_z (p0, z0, d4), ++ z0 = svdivr_z (p0, z0, d4)) ++ ++/* ++** divr_d4_f64_z_untied: ++** mov (z[0-9]+\.d), d4 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fdivr z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** fdiv z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (divr_d4_f64_z_untied, svfloat64_t, double, ++ z0 = svdivr_n_f64_z (p0, z1, d4), ++ z0 = svdivr_z (p0, z1, d4)) ++ ++/* ++** divr_1_f64_z: ++** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? 
++** movprfx z0\.d, p0/z, z0\.d ++** fdivr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (divr_1_f64_z, svfloat64_t, ++ z0 = svdivr_n_f64_z (p0, z0, 1), ++ z0 = svdivr_z (p0, z0, 1)) ++ ++/* ++** divr_0p5_f64_z_tied1: ++** fmov (z[0-9]+\.d), #(?:0\.5|5\.0e-1) ++** movprfx z0\.d, p0/z, z0\.d ++** fdivr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (divr_0p5_f64_z_tied1, svfloat64_t, ++ z0 = svdivr_n_f64_z (p0, z0, 0.5), ++ z0 = svdivr_z (p0, z0, 0.5)) ++ ++/* ++** divr_0p5_f64_z_untied: ++** fmov (z[0-9]+\.d), #(?:0\.5|5\.0e-1) ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fdivr z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** fdiv z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (divr_0p5_f64_z_untied, svfloat64_t, ++ z0 = svdivr_n_f64_z (p0, z1, 0.5), ++ z0 = svdivr_z (p0, z1, 0.5)) ++ ++/* ++** divr_f64_x_tied1: ++** fdivr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (divr_f64_x_tied1, svfloat64_t, ++ z0 = svdivr_f64_x (p0, z0, z1), ++ z0 = svdivr_x (p0, z0, z1)) ++ ++/* ++** divr_f64_x_tied2: ++** fdiv z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (divr_f64_x_tied2, svfloat64_t, ++ z0 = svdivr_f64_x (p0, z1, z0), ++ z0 = svdivr_x (p0, z1, z0)) ++ ++/* ++** divr_f64_x_untied: ++** ( ++** movprfx z0, z1 ++** fdivr z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0, z2 ++** fdiv z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (divr_f64_x_untied, svfloat64_t, ++ z0 = svdivr_f64_x (p0, z1, z2), ++ z0 = svdivr_x (p0, z1, z2)) ++ ++/* ++** divr_d4_f64_x_tied1: ++** mov (z[0-9]+\.d), d4 ++** fdivr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (divr_d4_f64_x_tied1, svfloat64_t, double, ++ z0 = svdivr_n_f64_x (p0, z0, d4), ++ z0 = svdivr_x (p0, z0, d4)) ++ ++/* ++** divr_d4_f64_x_untied: { xfail *-*-* } ++** mov z0\.d, d4 ++** fdiv z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZD (divr_d4_f64_x_untied, svfloat64_t, double, ++ z0 = svdivr_n_f64_x (p0, z1, d4), ++ z0 = svdivr_x (p0, z1, d4)) ++ ++/* ++** divr_1_f64_x_tied1: ++** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? ++** fdivr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (divr_1_f64_x_tied1, svfloat64_t, ++ z0 = svdivr_n_f64_x (p0, z0, 1), ++ z0 = svdivr_x (p0, z0, 1)) ++ ++/* ++** divr_1_f64_x_untied: ++** fmov z0\.d, #1\.0(?:e\+0)? ++** fdiv z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (divr_1_f64_x_untied, svfloat64_t, ++ z0 = svdivr_n_f64_x (p0, z1, 1), ++ z0 = svdivr_x (p0, z1, 1)) ++ ++/* ++** ptrue_divr_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_divr_f64_x_tied1, svfloat64_t, ++ z0 = svdivr_f64_x (svptrue_b64 (), z0, z1), ++ z0 = svdivr_x (svptrue_b64 (), z0, z1)) ++ ++/* ++** ptrue_divr_f64_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_divr_f64_x_tied2, svfloat64_t, ++ z0 = svdivr_f64_x (svptrue_b64 (), z1, z0), ++ z0 = svdivr_x (svptrue_b64 (), z1, z0)) ++ ++/* ++** ptrue_divr_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_divr_f64_x_untied, svfloat64_t, ++ z0 = svdivr_f64_x (svptrue_b64 (), z1, z2), ++ z0 = svdivr_x (svptrue_b64 (), z1, z2)) ++ ++/* ++** ptrue_divr_1_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_divr_1_f64_x_tied1, svfloat64_t, ++ z0 = svdivr_n_f64_x (svptrue_b64 (), z0, 1), ++ z0 = svdivr_x (svptrue_b64 (), z0, 1)) ++ ++/* ++** ptrue_divr_1_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_divr_1_f64_x_untied, svfloat64_t, ++ z0 = svdivr_n_f64_x (svptrue_b64 (), z1, 1), ++ z0 = svdivr_x (svptrue_b64 (), z1, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/divr_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/divr_s32.c +new file mode 100644 +index 000000000..75a6c1d97 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/divr_s32.c +@@ -0,0 +1,247 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** divr_s32_m_tied1: ++** sdivr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (divr_s32_m_tied1, svint32_t, ++ z0 = svdivr_s32_m (p0, z0, z1), ++ z0 = svdivr_m (p0, z0, z1)) ++ ++/* ++** divr_s32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** sdivr z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (divr_s32_m_tied2, svint32_t, ++ z0 = svdivr_s32_m (p0, z1, z0), ++ z0 = svdivr_m (p0, z1, z0)) ++ ++/* ++** divr_s32_m_untied: ++** movprfx z0, z1 ++** sdivr z0\.s, p0/m, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (divr_s32_m_untied, svint32_t, ++ z0 = svdivr_s32_m (p0, z1, z2), ++ z0 = svdivr_m (p0, z1, z2)) ++ ++/* ++** divr_w0_s32_m_tied1: ++** mov (z[0-9]+\.s), w0 ++** sdivr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (divr_w0_s32_m_tied1, svint32_t, int32_t, ++ z0 = svdivr_n_s32_m (p0, z0, x0), ++ z0 = svdivr_m (p0, z0, x0)) ++ ++/* ++** divr_w0_s32_m_untied: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0, z1 ++** sdivr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (divr_w0_s32_m_untied, svint32_t, int32_t, ++ z0 = svdivr_n_s32_m (p0, z1, x0), ++ z0 = svdivr_m (p0, z1, x0)) ++ ++/* ++** divr_2_s32_m_tied1: ++** mov (z[0-9]+\.s), #2 ++** sdivr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (divr_2_s32_m_tied1, svint32_t, ++ z0 = svdivr_n_s32_m (p0, z0, 2), ++ z0 = svdivr_m (p0, z0, 2)) ++ ++/* ++** divr_2_s32_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.s), #2 ++** movprfx z0, z1 ++** sdivr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (divr_2_s32_m_untied, svint32_t, ++ z0 = svdivr_n_s32_m (p0, z1, 2), ++ z0 = svdivr_m (p0, z1, 2)) ++ ++/* ++** divr_m1_s32_m: ++** mov (z[0-9]+)\.b, #-1 ++** sdivr z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (divr_m1_s32_m, svint32_t, ++ z0 = svdivr_n_s32_m (p0, z0, -1), ++ z0 = svdivr_m (p0, z0, -1)) ++ ++/* ++** divr_s32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** sdivr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (divr_s32_z_tied1, svint32_t, ++ z0 = svdivr_s32_z (p0, z0, z1), ++ z0 = svdivr_z (p0, z0, z1)) ++ ++/* ++** divr_s32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** sdiv z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (divr_s32_z_tied2, svint32_t, ++ z0 = svdivr_s32_z (p0, z1, z0), ++ z0 = svdivr_z (p0, z1, z0)) ++ ++/* ++** divr_s32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** sdivr z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** sdiv z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (divr_s32_z_untied, svint32_t, ++ z0 = svdivr_s32_z (p0, z1, z2), ++ z0 = svdivr_z (p0, z1, z2)) ++ ++/* ++** divr_w0_s32_z_tied1: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0\.s, p0/z, z0\.s ++** sdivr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (divr_w0_s32_z_tied1, svint32_t, int32_t, ++ z0 = svdivr_n_s32_z (p0, z0, x0), ++ z0 = svdivr_z (p0, z0, x0)) ++ ++/* ++** divr_w0_s32_z_untied: ++** mov (z[0-9]+\.s), w0 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** sdivr z0\.s, p0/m, 
z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** sdiv z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (divr_w0_s32_z_untied, svint32_t, int32_t, ++ z0 = svdivr_n_s32_z (p0, z1, x0), ++ z0 = svdivr_z (p0, z1, x0)) ++ ++/* ++** divr_2_s32_z_tied1: ++** mov (z[0-9]+\.s), #2 ++** movprfx z0\.s, p0/z, z0\.s ++** sdivr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (divr_2_s32_z_tied1, svint32_t, ++ z0 = svdivr_n_s32_z (p0, z0, 2), ++ z0 = svdivr_z (p0, z0, 2)) ++ ++/* ++** divr_2_s32_z_untied: ++** mov (z[0-9]+\.s), #2 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** sdivr z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** sdiv z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (divr_2_s32_z_untied, svint32_t, ++ z0 = svdivr_n_s32_z (p0, z1, 2), ++ z0 = svdivr_z (p0, z1, 2)) ++ ++/* ++** divr_s32_x_tied1: ++** sdivr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (divr_s32_x_tied1, svint32_t, ++ z0 = svdivr_s32_x (p0, z0, z1), ++ z0 = svdivr_x (p0, z0, z1)) ++ ++/* ++** divr_s32_x_tied2: ++** sdiv z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (divr_s32_x_tied2, svint32_t, ++ z0 = svdivr_s32_x (p0, z1, z0), ++ z0 = svdivr_x (p0, z1, z0)) ++ ++/* ++** divr_s32_x_untied: ++** ( ++** movprfx z0, z1 ++** sdivr z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0, z2 ++** sdiv z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (divr_s32_x_untied, svint32_t, ++ z0 = svdivr_s32_x (p0, z1, z2), ++ z0 = svdivr_x (p0, z1, z2)) ++ ++/* ++** divr_w0_s32_x_tied1: ++** mov (z[0-9]+\.s), w0 ++** sdivr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (divr_w0_s32_x_tied1, svint32_t, int32_t, ++ z0 = svdivr_n_s32_x (p0, z0, x0), ++ z0 = svdivr_x (p0, z0, x0)) ++ ++/* ++** divr_w0_s32_x_untied: ++** mov z0\.s, w0 ++** sdiv z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZX (divr_w0_s32_x_untied, svint32_t, int32_t, ++ z0 = svdivr_n_s32_x (p0, z1, x0), ++ z0 = svdivr_x (p0, z1, x0)) ++ ++/* ++** divr_2_s32_x_tied1: ++** mov (z[0-9]+\.s), #2 ++** sdivr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (divr_2_s32_x_tied1, svint32_t, ++ z0 = svdivr_n_s32_x (p0, z0, 2), ++ z0 = svdivr_x (p0, z0, 2)) ++ ++/* ++** divr_2_s32_x_untied: ++** mov z0\.s, #2 ++** sdiv z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (divr_2_s32_x_untied, svint32_t, ++ z0 = svdivr_n_s32_x (p0, z1, 2), ++ z0 = svdivr_x (p0, z1, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/divr_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/divr_s64.c +new file mode 100644 +index 000000000..8f4939a91 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/divr_s64.c +@@ -0,0 +1,247 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** divr_s64_m_tied1: ++** sdivr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (divr_s64_m_tied1, svint64_t, ++ z0 = svdivr_s64_m (p0, z0, z1), ++ z0 = svdivr_m (p0, z0, z1)) ++ ++/* ++** divr_s64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** sdivr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (divr_s64_m_tied2, svint64_t, ++ z0 = svdivr_s64_m (p0, z1, z0), ++ z0 = svdivr_m (p0, z1, z0)) ++ ++/* ++** divr_s64_m_untied: ++** movprfx z0, z1 ++** sdivr z0\.d, p0/m, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (divr_s64_m_untied, svint64_t, ++ z0 = svdivr_s64_m (p0, z1, z2), ++ z0 = svdivr_m (p0, z1, z2)) ++ ++/* ++** divr_x0_s64_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** sdivr z0\.d, p0/m, z0\.d, \1 ++** 
ret ++*/ ++TEST_UNIFORM_ZX (divr_x0_s64_m_tied1, svint64_t, int64_t, ++ z0 = svdivr_n_s64_m (p0, z0, x0), ++ z0 = svdivr_m (p0, z0, x0)) ++ ++/* ++** divr_x0_s64_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** sdivr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (divr_x0_s64_m_untied, svint64_t, int64_t, ++ z0 = svdivr_n_s64_m (p0, z1, x0), ++ z0 = svdivr_m (p0, z1, x0)) ++ ++/* ++** divr_2_s64_m_tied1: ++** mov (z[0-9]+\.d), #2 ++** sdivr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (divr_2_s64_m_tied1, svint64_t, ++ z0 = svdivr_n_s64_m (p0, z0, 2), ++ z0 = svdivr_m (p0, z0, 2)) ++ ++/* ++** divr_2_s64_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #2 ++** movprfx z0, z1 ++** sdivr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (divr_2_s64_m_untied, svint64_t, ++ z0 = svdivr_n_s64_m (p0, z1, 2), ++ z0 = svdivr_m (p0, z1, 2)) ++ ++/* ++** divr_m1_s64_m: ++** mov (z[0-9]+)\.b, #-1 ++** sdivr z0\.d, p0/m, z0\.d, \1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (divr_m1_s64_m, svint64_t, ++ z0 = svdivr_n_s64_m (p0, z0, -1), ++ z0 = svdivr_m (p0, z0, -1)) ++ ++/* ++** divr_s64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** sdivr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (divr_s64_z_tied1, svint64_t, ++ z0 = svdivr_s64_z (p0, z0, z1), ++ z0 = svdivr_z (p0, z0, z1)) ++ ++/* ++** divr_s64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** sdiv z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (divr_s64_z_tied2, svint64_t, ++ z0 = svdivr_s64_z (p0, z1, z0), ++ z0 = svdivr_z (p0, z1, z0)) ++ ++/* ++** divr_s64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** sdivr z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** sdiv z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (divr_s64_z_untied, svint64_t, ++ z0 = svdivr_s64_z (p0, z1, z2), ++ z0 = svdivr_z (p0, z1, z2)) ++ ++/* ++** divr_x0_s64_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z0\.d ++** sdivr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (divr_x0_s64_z_tied1, svint64_t, int64_t, ++ z0 = svdivr_n_s64_z (p0, z0, x0), ++ z0 = svdivr_z (p0, z0, x0)) ++ ++/* ++** divr_x0_s64_z_untied: ++** mov (z[0-9]+\.d), x0 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** sdivr z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** sdiv z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (divr_x0_s64_z_untied, svint64_t, int64_t, ++ z0 = svdivr_n_s64_z (p0, z1, x0), ++ z0 = svdivr_z (p0, z1, x0)) ++ ++/* ++** divr_2_s64_z_tied1: ++** mov (z[0-9]+\.d), #2 ++** movprfx z0\.d, p0/z, z0\.d ++** sdivr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (divr_2_s64_z_tied1, svint64_t, ++ z0 = svdivr_n_s64_z (p0, z0, 2), ++ z0 = svdivr_z (p0, z0, 2)) ++ ++/* ++** divr_2_s64_z_untied: ++** mov (z[0-9]+\.d), #2 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** sdivr z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** sdiv z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (divr_2_s64_z_untied, svint64_t, ++ z0 = svdivr_n_s64_z (p0, z1, 2), ++ z0 = svdivr_z (p0, z1, 2)) ++ ++/* ++** divr_s64_x_tied1: ++** sdivr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (divr_s64_x_tied1, svint64_t, ++ z0 = svdivr_s64_x (p0, z0, z1), ++ z0 = svdivr_x (p0, z0, z1)) ++ ++/* ++** divr_s64_x_tied2: ++** sdiv z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (divr_s64_x_tied2, svint64_t, ++ z0 = svdivr_s64_x (p0, z1, z0), ++ z0 = svdivr_x (p0, z1, z0)) ++ ++/* ++** divr_s64_x_untied: ++** ( ++** movprfx z0, z1 ++** sdivr z0\.d, 
p0/m, z0\.d, z2\.d ++** | ++** movprfx z0, z2 ++** sdiv z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (divr_s64_x_untied, svint64_t, ++ z0 = svdivr_s64_x (p0, z1, z2), ++ z0 = svdivr_x (p0, z1, z2)) ++ ++/* ++** divr_x0_s64_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** sdivr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (divr_x0_s64_x_tied1, svint64_t, int64_t, ++ z0 = svdivr_n_s64_x (p0, z0, x0), ++ z0 = svdivr_x (p0, z0, x0)) ++ ++/* ++** divr_x0_s64_x_untied: ++** mov z0\.d, x0 ++** sdiv z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (divr_x0_s64_x_untied, svint64_t, int64_t, ++ z0 = svdivr_n_s64_x (p0, z1, x0), ++ z0 = svdivr_x (p0, z1, x0)) ++ ++/* ++** divr_2_s64_x_tied1: ++** mov (z[0-9]+\.d), #2 ++** sdivr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (divr_2_s64_x_tied1, svint64_t, ++ z0 = svdivr_n_s64_x (p0, z0, 2), ++ z0 = svdivr_x (p0, z0, 2)) ++ ++/* ++** divr_2_s64_x_untied: ++** mov z0\.d, #2 ++** sdiv z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (divr_2_s64_x_untied, svint64_t, ++ z0 = svdivr_n_s64_x (p0, z1, 2), ++ z0 = svdivr_x (p0, z1, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/divr_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/divr_u32.c +new file mode 100644 +index 000000000..84c243b44 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/divr_u32.c +@@ -0,0 +1,247 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** divr_u32_m_tied1: ++** udivr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (divr_u32_m_tied1, svuint32_t, ++ z0 = svdivr_u32_m (p0, z0, z1), ++ z0 = svdivr_m (p0, z0, z1)) ++ ++/* ++** divr_u32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** udivr z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (divr_u32_m_tied2, svuint32_t, ++ z0 = svdivr_u32_m (p0, z1, z0), ++ z0 = svdivr_m (p0, z1, z0)) ++ ++/* ++** divr_u32_m_untied: ++** movprfx z0, z1 ++** udivr z0\.s, p0/m, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (divr_u32_m_untied, svuint32_t, ++ z0 = svdivr_u32_m (p0, z1, z2), ++ z0 = svdivr_m (p0, z1, z2)) ++ ++/* ++** divr_w0_u32_m_tied1: ++** mov (z[0-9]+\.s), w0 ++** udivr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (divr_w0_u32_m_tied1, svuint32_t, uint32_t, ++ z0 = svdivr_n_u32_m (p0, z0, x0), ++ z0 = svdivr_m (p0, z0, x0)) ++ ++/* ++** divr_w0_u32_m_untied: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0, z1 ++** udivr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (divr_w0_u32_m_untied, svuint32_t, uint32_t, ++ z0 = svdivr_n_u32_m (p0, z1, x0), ++ z0 = svdivr_m (p0, z1, x0)) ++ ++/* ++** divr_2_u32_m_tied1: ++** mov (z[0-9]+\.s), #2 ++** udivr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (divr_2_u32_m_tied1, svuint32_t, ++ z0 = svdivr_n_u32_m (p0, z0, 2), ++ z0 = svdivr_m (p0, z0, 2)) ++ ++/* ++** divr_2_u32_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.s), #2 ++** movprfx z0, z1 ++** udivr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (divr_2_u32_m_untied, svuint32_t, ++ z0 = svdivr_n_u32_m (p0, z1, 2), ++ z0 = svdivr_m (p0, z1, 2)) ++ ++/* ++** divr_m1_u32_m: ++** mov (z[0-9]+)\.b, #-1 ++** udivr z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (divr_m1_u32_m, svuint32_t, ++ z0 = svdivr_n_u32_m (p0, z0, -1), ++ z0 = svdivr_m (p0, z0, -1)) ++ ++/* ++** divr_u32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** udivr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (divr_u32_z_tied1, svuint32_t, ++ z0 = svdivr_u32_z (p0, 
z0, z1), ++ z0 = svdivr_z (p0, z0, z1)) ++ ++/* ++** divr_u32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** udiv z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (divr_u32_z_tied2, svuint32_t, ++ z0 = svdivr_u32_z (p0, z1, z0), ++ z0 = svdivr_z (p0, z1, z0)) ++ ++/* ++** divr_u32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** udivr z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** udiv z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (divr_u32_z_untied, svuint32_t, ++ z0 = svdivr_u32_z (p0, z1, z2), ++ z0 = svdivr_z (p0, z1, z2)) ++ ++/* ++** divr_w0_u32_z_tied1: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0\.s, p0/z, z0\.s ++** udivr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (divr_w0_u32_z_tied1, svuint32_t, uint32_t, ++ z0 = svdivr_n_u32_z (p0, z0, x0), ++ z0 = svdivr_z (p0, z0, x0)) ++ ++/* ++** divr_w0_u32_z_untied: ++** mov (z[0-9]+\.s), w0 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** udivr z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** udiv z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (divr_w0_u32_z_untied, svuint32_t, uint32_t, ++ z0 = svdivr_n_u32_z (p0, z1, x0), ++ z0 = svdivr_z (p0, z1, x0)) ++ ++/* ++** divr_2_u32_z_tied1: ++** mov (z[0-9]+\.s), #2 ++** movprfx z0\.s, p0/z, z0\.s ++** udivr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (divr_2_u32_z_tied1, svuint32_t, ++ z0 = svdivr_n_u32_z (p0, z0, 2), ++ z0 = svdivr_z (p0, z0, 2)) ++ ++/* ++** divr_2_u32_z_untied: ++** mov (z[0-9]+\.s), #2 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** udivr z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** udiv z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (divr_2_u32_z_untied, svuint32_t, ++ z0 = svdivr_n_u32_z (p0, z1, 2), ++ z0 = svdivr_z (p0, z1, 2)) ++ ++/* ++** divr_u32_x_tied1: ++** udivr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (divr_u32_x_tied1, svuint32_t, ++ z0 = svdivr_u32_x (p0, z0, z1), ++ z0 = svdivr_x (p0, z0, z1)) ++ ++/* ++** divr_u32_x_tied2: ++** udiv z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (divr_u32_x_tied2, svuint32_t, ++ z0 = svdivr_u32_x (p0, z1, z0), ++ z0 = svdivr_x (p0, z1, z0)) ++ ++/* ++** divr_u32_x_untied: ++** ( ++** movprfx z0, z1 ++** udivr z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0, z2 ++** udiv z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (divr_u32_x_untied, svuint32_t, ++ z0 = svdivr_u32_x (p0, z1, z2), ++ z0 = svdivr_x (p0, z1, z2)) ++ ++/* ++** divr_w0_u32_x_tied1: ++** mov (z[0-9]+\.s), w0 ++** udivr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (divr_w0_u32_x_tied1, svuint32_t, uint32_t, ++ z0 = svdivr_n_u32_x (p0, z0, x0), ++ z0 = svdivr_x (p0, z0, x0)) ++ ++/* ++** divr_w0_u32_x_untied: ++** mov z0\.s, w0 ++** udiv z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZX (divr_w0_u32_x_untied, svuint32_t, uint32_t, ++ z0 = svdivr_n_u32_x (p0, z1, x0), ++ z0 = svdivr_x (p0, z1, x0)) ++ ++/* ++** divr_2_u32_x_tied1: ++** mov (z[0-9]+\.s), #2 ++** udivr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (divr_2_u32_x_tied1, svuint32_t, ++ z0 = svdivr_n_u32_x (p0, z0, 2), ++ z0 = svdivr_x (p0, z0, 2)) ++ ++/* ++** divr_2_u32_x_untied: ++** mov z0\.s, #2 ++** udiv z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (divr_2_u32_x_untied, svuint32_t, ++ z0 = svdivr_n_u32_x (p0, z1, 2), ++ z0 = svdivr_x (p0, z1, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/divr_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/divr_u64.c +new file 
mode 100644 +index 000000000..03bb62472 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/divr_u64.c +@@ -0,0 +1,247 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** divr_u64_m_tied1: ++** udivr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (divr_u64_m_tied1, svuint64_t, ++ z0 = svdivr_u64_m (p0, z0, z1), ++ z0 = svdivr_m (p0, z0, z1)) ++ ++/* ++** divr_u64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** udivr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (divr_u64_m_tied2, svuint64_t, ++ z0 = svdivr_u64_m (p0, z1, z0), ++ z0 = svdivr_m (p0, z1, z0)) ++ ++/* ++** divr_u64_m_untied: ++** movprfx z0, z1 ++** udivr z0\.d, p0/m, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (divr_u64_m_untied, svuint64_t, ++ z0 = svdivr_u64_m (p0, z1, z2), ++ z0 = svdivr_m (p0, z1, z2)) ++ ++/* ++** divr_x0_u64_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** udivr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (divr_x0_u64_m_tied1, svuint64_t, uint64_t, ++ z0 = svdivr_n_u64_m (p0, z0, x0), ++ z0 = svdivr_m (p0, z0, x0)) ++ ++/* ++** divr_x0_u64_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** udivr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (divr_x0_u64_m_untied, svuint64_t, uint64_t, ++ z0 = svdivr_n_u64_m (p0, z1, x0), ++ z0 = svdivr_m (p0, z1, x0)) ++ ++/* ++** divr_2_u64_m_tied1: ++** mov (z[0-9]+\.d), #2 ++** udivr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (divr_2_u64_m_tied1, svuint64_t, ++ z0 = svdivr_n_u64_m (p0, z0, 2), ++ z0 = svdivr_m (p0, z0, 2)) ++ ++/* ++** divr_2_u64_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #2 ++** movprfx z0, z1 ++** udivr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (divr_2_u64_m_untied, svuint64_t, ++ z0 = svdivr_n_u64_m (p0, z1, 2), ++ z0 = svdivr_m (p0, z1, 2)) ++ ++/* ++** divr_m1_u64_m: ++** mov (z[0-9]+)\.b, #-1 ++** udivr z0\.d, p0/m, z0\.d, \1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (divr_m1_u64_m, svuint64_t, ++ z0 = svdivr_n_u64_m (p0, z0, -1), ++ z0 = svdivr_m (p0, z0, -1)) ++ ++/* ++** divr_u64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** udivr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (divr_u64_z_tied1, svuint64_t, ++ z0 = svdivr_u64_z (p0, z0, z1), ++ z0 = svdivr_z (p0, z0, z1)) ++ ++/* ++** divr_u64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** udiv z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (divr_u64_z_tied2, svuint64_t, ++ z0 = svdivr_u64_z (p0, z1, z0), ++ z0 = svdivr_z (p0, z1, z0)) ++ ++/* ++** divr_u64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** udivr z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** udiv z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (divr_u64_z_untied, svuint64_t, ++ z0 = svdivr_u64_z (p0, z1, z2), ++ z0 = svdivr_z (p0, z1, z2)) ++ ++/* ++** divr_x0_u64_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z0\.d ++** udivr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (divr_x0_u64_z_tied1, svuint64_t, uint64_t, ++ z0 = svdivr_n_u64_z (p0, z0, x0), ++ z0 = svdivr_z (p0, z0, x0)) ++ ++/* ++** divr_x0_u64_z_untied: ++** mov (z[0-9]+\.d), x0 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** udivr z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** udiv z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (divr_x0_u64_z_untied, svuint64_t, uint64_t, ++ z0 = svdivr_n_u64_z (p0, z1, x0), ++ z0 = svdivr_z (p0, z1, x0)) ++ ++/* ++** divr_2_u64_z_tied1: ++** mov (z[0-9]+\.d), 
#2 ++** movprfx z0\.d, p0/z, z0\.d ++** udivr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (divr_2_u64_z_tied1, svuint64_t, ++ z0 = svdivr_n_u64_z (p0, z0, 2), ++ z0 = svdivr_z (p0, z0, 2)) ++ ++/* ++** divr_2_u64_z_untied: ++** mov (z[0-9]+\.d), #2 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** udivr z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** udiv z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (divr_2_u64_z_untied, svuint64_t, ++ z0 = svdivr_n_u64_z (p0, z1, 2), ++ z0 = svdivr_z (p0, z1, 2)) ++ ++/* ++** divr_u64_x_tied1: ++** udivr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (divr_u64_x_tied1, svuint64_t, ++ z0 = svdivr_u64_x (p0, z0, z1), ++ z0 = svdivr_x (p0, z0, z1)) ++ ++/* ++** divr_u64_x_tied2: ++** udiv z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (divr_u64_x_tied2, svuint64_t, ++ z0 = svdivr_u64_x (p0, z1, z0), ++ z0 = svdivr_x (p0, z1, z0)) ++ ++/* ++** divr_u64_x_untied: ++** ( ++** movprfx z0, z1 ++** udivr z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0, z2 ++** udiv z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (divr_u64_x_untied, svuint64_t, ++ z0 = svdivr_u64_x (p0, z1, z2), ++ z0 = svdivr_x (p0, z1, z2)) ++ ++/* ++** divr_x0_u64_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** udivr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (divr_x0_u64_x_tied1, svuint64_t, uint64_t, ++ z0 = svdivr_n_u64_x (p0, z0, x0), ++ z0 = svdivr_x (p0, z0, x0)) ++ ++/* ++** divr_x0_u64_x_untied: ++** mov z0\.d, x0 ++** udiv z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (divr_x0_u64_x_untied, svuint64_t, uint64_t, ++ z0 = svdivr_n_u64_x (p0, z1, x0), ++ z0 = svdivr_x (p0, z1, x0)) ++ ++/* ++** divr_2_u64_x_tied1: ++** mov (z[0-9]+\.d), #2 ++** udivr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (divr_2_u64_x_tied1, svuint64_t, ++ z0 = svdivr_n_u64_x (p0, z0, 2), ++ z0 = svdivr_x (p0, z0, 2)) ++ ++/* ++** divr_2_u64_x_untied: ++** mov z0\.d, #2 ++** udiv z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (divr_2_u64_x_untied, svuint64_t, ++ z0 = svdivr_n_u64_x (p0, z1, 2), ++ z0 = svdivr_x (p0, z1, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dot_lane_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dot_lane_s32.c +new file mode 100644 +index 000000000..a4d713e29 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dot_lane_s32.c +@@ -0,0 +1,93 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dot_lane_0_s32_tied1: ++** sdot z0\.s, z4\.b, z5\.b\[0\] ++** ret ++*/ ++TEST_DUAL_Z (dot_lane_0_s32_tied1, svint32_t, svint8_t, ++ z0 = svdot_lane_s32 (z0, z4, z5, 0), ++ z0 = svdot_lane (z0, z4, z5, 0)) ++ ++/* ++** dot_lane_0_s32_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** sdot z0\.s, \1\.b, z1\.b\[0\] ++** ret ++*/ ++TEST_DUAL_Z_REV (dot_lane_0_s32_tied2, svint32_t, svint8_t, ++ z0_res = svdot_lane_s32 (z4, z0, z1, 0), ++ z0_res = svdot_lane (z4, z0, z1, 0)) ++ ++/* ++** dot_lane_0_s32_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** sdot z0\.s, z1\.b, \1\.b\[0\] ++** ret ++*/ ++TEST_DUAL_Z_REV (dot_lane_0_s32_tied3, svint32_t, svint8_t, ++ z0_res = svdot_lane_s32 (z4, z1, z0, 0), ++ z0_res = svdot_lane (z4, z1, z0, 0)) ++ ++/* ++** dot_lane_0_s32_untied: ++** movprfx z0, z1 ++** sdot z0\.s, z4\.b, z5\.b\[0\] ++** ret ++*/ ++TEST_DUAL_Z (dot_lane_0_s32_untied, svint32_t, svint8_t, ++ z0 = svdot_lane_s32 (z1, z4, z5, 0), ++ z0 = svdot_lane (z1, z4, z5, 0)) ++ 
++/* ++** dot_lane_1_s32: ++** sdot z0\.s, z4\.b, z5\.b\[1\] ++** ret ++*/ ++TEST_DUAL_Z (dot_lane_1_s32, svint32_t, svint8_t, ++ z0 = svdot_lane_s32 (z0, z4, z5, 1), ++ z0 = svdot_lane (z0, z4, z5, 1)) ++ ++/* ++** dot_lane_2_s32: ++** sdot z0\.s, z4\.b, z5\.b\[2\] ++** ret ++*/ ++TEST_DUAL_Z (dot_lane_2_s32, svint32_t, svint8_t, ++ z0 = svdot_lane_s32 (z0, z4, z5, 2), ++ z0 = svdot_lane (z0, z4, z5, 2)) ++ ++/* ++** dot_lane_3_s32: ++** sdot z0\.s, z4\.b, z5\.b\[3\] ++** ret ++*/ ++TEST_DUAL_Z (dot_lane_3_s32, svint32_t, svint8_t, ++ z0 = svdot_lane_s32 (z0, z4, z5, 3), ++ z0 = svdot_lane (z0, z4, z5, 3)) ++ ++/* ++** dot_lane_z8_s32: ++** str d8, \[sp, -16\]! ++** mov (z[0-7])\.d, z8\.d ++** sdot z0\.s, z1\.b, \1\.b\[1\] ++** ldr d8, \[sp\], 16 ++** ret ++*/ ++TEST_DUAL_LANE_REG (dot_lane_z8_s32, svint32_t, svint8_t, z8, ++ z0 = svdot_lane_s32 (z0, z1, z8, 1), ++ z0 = svdot_lane (z0, z1, z8, 1)) ++ ++/* ++** dot_lane_z16_s32: ++** mov (z[0-7])\.d, z16\.d ++** sdot z0\.s, z1\.b, \1\.b\[1\] ++** ret ++*/ ++TEST_DUAL_LANE_REG (dot_lane_z16_s32, svint32_t, svint8_t, z16, ++ z0 = svdot_lane_s32 (z0, z1, z16, 1), ++ z0 = svdot_lane (z0, z1, z16, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dot_lane_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dot_lane_s64.c +new file mode 100644 +index 000000000..daee74091 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dot_lane_s64.c +@@ -0,0 +1,74 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dot_lane_0_s64_tied1: ++** sdot z0\.d, z4\.h, z5\.h\[0\] ++** ret ++*/ ++TEST_DUAL_Z (dot_lane_0_s64_tied1, svint64_t, svint16_t, ++ z0 = svdot_lane_s64 (z0, z4, z5, 0), ++ z0 = svdot_lane (z0, z4, z5, 0)) ++ ++/* ++** dot_lane_0_s64_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** sdot z0\.d, \1\.h, z1\.h\[0\] ++** ret ++*/ ++TEST_DUAL_Z_REV (dot_lane_0_s64_tied2, svint64_t, svint16_t, ++ z0_res = svdot_lane_s64 (z4, z0, z1, 0), ++ z0_res = svdot_lane (z4, z0, z1, 0)) ++ ++/* ++** dot_lane_0_s64_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** sdot z0\.d, z1\.h, \1\.h\[0\] ++** ret ++*/ ++TEST_DUAL_Z_REV (dot_lane_0_s64_tied3, svint64_t, svint16_t, ++ z0_res = svdot_lane_s64 (z4, z1, z0, 0), ++ z0_res = svdot_lane (z4, z1, z0, 0)) ++ ++/* ++** dot_lane_0_s64_untied: ++** movprfx z0, z1 ++** sdot z0\.d, z4\.h, z5\.h\[0\] ++** ret ++*/ ++TEST_DUAL_Z (dot_lane_0_s64_untied, svint64_t, svint16_t, ++ z0 = svdot_lane_s64 (z1, z4, z5, 0), ++ z0 = svdot_lane (z1, z4, z5, 0)) ++ ++/* ++** dot_lane_1_s64: ++** sdot z0\.d, z4\.h, z5\.h\[1\] ++** ret ++*/ ++TEST_DUAL_Z (dot_lane_1_s64, svint64_t, svint16_t, ++ z0 = svdot_lane_s64 (z0, z4, z5, 1), ++ z0 = svdot_lane (z0, z4, z5, 1)) ++ ++/* ++** dot_lane_z15_s64: ++** str d15, \[sp, -16\]! 
++** sdot z0\.d, z1\.h, z15\.h\[1\] ++** ldr d15, \[sp\], 16 ++** ret ++*/ ++TEST_DUAL_LANE_REG (dot_lane_z15_s64, svint64_t, svint16_t, z15, ++ z0 = svdot_lane_s64 (z0, z1, z15, 1), ++ z0 = svdot_lane (z0, z1, z15, 1)) ++ ++/* ++** dot_lane_z16_s64: ++** mov (z[0-9]|z1[0-5])\.d, z16\.d ++** sdot z0\.d, z1\.h, \1\.h\[1\] ++** ret ++*/ ++TEST_DUAL_LANE_REG (dot_lane_z16_s64, svint64_t, svint16_t, z16, ++ z0 = svdot_lane_s64 (z0, z1, z16, 1), ++ z0 = svdot_lane (z0, z1, z16, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dot_lane_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dot_lane_u32.c +new file mode 100644 +index 000000000..6d69df76d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dot_lane_u32.c +@@ -0,0 +1,93 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dot_lane_0_u32_tied1: ++** udot z0\.s, z4\.b, z5\.b\[0\] ++** ret ++*/ ++TEST_DUAL_Z (dot_lane_0_u32_tied1, svuint32_t, svuint8_t, ++ z0 = svdot_lane_u32 (z0, z4, z5, 0), ++ z0 = svdot_lane (z0, z4, z5, 0)) ++ ++/* ++** dot_lane_0_u32_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** udot z0\.s, \1\.b, z1\.b\[0\] ++** ret ++*/ ++TEST_DUAL_Z_REV (dot_lane_0_u32_tied2, svuint32_t, svuint8_t, ++ z0_res = svdot_lane_u32 (z4, z0, z1, 0), ++ z0_res = svdot_lane (z4, z0, z1, 0)) ++ ++/* ++** dot_lane_0_u32_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** udot z0\.s, z1\.b, \1\.b\[0\] ++** ret ++*/ ++TEST_DUAL_Z_REV (dot_lane_0_u32_tied3, svuint32_t, svuint8_t, ++ z0_res = svdot_lane_u32 (z4, z1, z0, 0), ++ z0_res = svdot_lane (z4, z1, z0, 0)) ++ ++/* ++** dot_lane_0_u32_untied: ++** movprfx z0, z1 ++** udot z0\.s, z4\.b, z5\.b\[0\] ++** ret ++*/ ++TEST_DUAL_Z (dot_lane_0_u32_untied, svuint32_t, svuint8_t, ++ z0 = svdot_lane_u32 (z1, z4, z5, 0), ++ z0 = svdot_lane (z1, z4, z5, 0)) ++ ++/* ++** dot_lane_1_u32: ++** udot z0\.s, z4\.b, z5\.b\[1\] ++** ret ++*/ ++TEST_DUAL_Z (dot_lane_1_u32, svuint32_t, svuint8_t, ++ z0 = svdot_lane_u32 (z0, z4, z5, 1), ++ z0 = svdot_lane (z0, z4, z5, 1)) ++ ++/* ++** dot_lane_2_u32: ++** udot z0\.s, z4\.b, z5\.b\[2\] ++** ret ++*/ ++TEST_DUAL_Z (dot_lane_2_u32, svuint32_t, svuint8_t, ++ z0 = svdot_lane_u32 (z0, z4, z5, 2), ++ z0 = svdot_lane (z0, z4, z5, 2)) ++ ++/* ++** dot_lane_3_u32: ++** udot z0\.s, z4\.b, z5\.b\[3\] ++** ret ++*/ ++TEST_DUAL_Z (dot_lane_3_u32, svuint32_t, svuint8_t, ++ z0 = svdot_lane_u32 (z0, z4, z5, 3), ++ z0 = svdot_lane (z0, z4, z5, 3)) ++ ++/* ++** dot_lane_z8_u32: ++** str d8, \[sp, -16\]! 
++** mov (z[0-7])\.d, z8\.d ++** udot z0\.s, z1\.b, \1\.b\[1\] ++** ldr d8, \[sp\], 16 ++** ret ++*/ ++TEST_DUAL_LANE_REG (dot_lane_z8_u32, svuint32_t, svuint8_t, z8, ++ z0 = svdot_lane_u32 (z0, z1, z8, 1), ++ z0 = svdot_lane (z0, z1, z8, 1)) ++ ++/* ++** dot_lane_z16_u32: ++** mov (z[0-7])\.d, z16\.d ++** udot z0\.s, z1\.b, \1\.b\[1\] ++** ret ++*/ ++TEST_DUAL_LANE_REG (dot_lane_z16_u32, svuint32_t, svuint8_t, z16, ++ z0 = svdot_lane_u32 (z0, z1, z16, 1), ++ z0 = svdot_lane (z0, z1, z16, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dot_lane_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dot_lane_u64.c +new file mode 100644 +index 000000000..242e21c78 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dot_lane_u64.c +@@ -0,0 +1,74 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dot_lane_0_u64_tied1: ++** udot z0\.d, z4\.h, z5\.h\[0\] ++** ret ++*/ ++TEST_DUAL_Z (dot_lane_0_u64_tied1, svuint64_t, svuint16_t, ++ z0 = svdot_lane_u64 (z0, z4, z5, 0), ++ z0 = svdot_lane (z0, z4, z5, 0)) ++ ++/* ++** dot_lane_0_u64_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** udot z0\.d, \1\.h, z1\.h\[0\] ++** ret ++*/ ++TEST_DUAL_Z_REV (dot_lane_0_u64_tied2, svuint64_t, svuint16_t, ++ z0_res = svdot_lane_u64 (z4, z0, z1, 0), ++ z0_res = svdot_lane (z4, z0, z1, 0)) ++ ++/* ++** dot_lane_0_u64_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** udot z0\.d, z1\.h, \1\.h\[0\] ++** ret ++*/ ++TEST_DUAL_Z_REV (dot_lane_0_u64_tied3, svuint64_t, svuint16_t, ++ z0_res = svdot_lane_u64 (z4, z1, z0, 0), ++ z0_res = svdot_lane (z4, z1, z0, 0)) ++ ++/* ++** dot_lane_0_u64_untied: ++** movprfx z0, z1 ++** udot z0\.d, z4\.h, z5\.h\[0\] ++** ret ++*/ ++TEST_DUAL_Z (dot_lane_0_u64_untied, svuint64_t, svuint16_t, ++ z0 = svdot_lane_u64 (z1, z4, z5, 0), ++ z0 = svdot_lane (z1, z4, z5, 0)) ++ ++/* ++** dot_lane_1_u64: ++** udot z0\.d, z4\.h, z5\.h\[1\] ++** ret ++*/ ++TEST_DUAL_Z (dot_lane_1_u64, svuint64_t, svuint16_t, ++ z0 = svdot_lane_u64 (z0, z4, z5, 1), ++ z0 = svdot_lane (z0, z4, z5, 1)) ++ ++/* ++** dot_lane_z15_u64: ++** str d15, \[sp, -16\]! 
++** udot z0\.d, z1\.h, z15\.h\[1\] ++** ldr d15, \[sp\], 16 ++** ret ++*/ ++TEST_DUAL_LANE_REG (dot_lane_z15_u64, svuint64_t, svuint16_t, z15, ++ z0 = svdot_lane_u64 (z0, z1, z15, 1), ++ z0 = svdot_lane (z0, z1, z15, 1)) ++ ++/* ++** dot_lane_z16_u64: ++** mov (z[0-9]|z1[0-5])\.d, z16\.d ++** udot z0\.d, z1\.h, \1\.h\[1\] ++** ret ++*/ ++TEST_DUAL_LANE_REG (dot_lane_z16_u64, svuint64_t, svuint16_t, z16, ++ z0 = svdot_lane_u64 (z0, z1, z16, 1), ++ z0 = svdot_lane (z0, z1, z16, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dot_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dot_s32.c +new file mode 100644 +index 000000000..605bd1b30 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dot_s32.c +@@ -0,0 +1,86 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dot_s32_tied1: ++** sdot z0\.s, z4\.b, z5\.b ++** ret ++*/ ++TEST_DUAL_Z (dot_s32_tied1, svint32_t, svint8_t, ++ z0 = svdot_s32 (z0, z4, z5), ++ z0 = svdot (z0, z4, z5)) ++ ++/* ++** dot_s32_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** sdot z0\.s, \1\.b, z1\.b ++** ret ++*/ ++TEST_DUAL_Z_REV (dot_s32_tied2, svint32_t, svint8_t, ++ z0_res = svdot_s32 (z4, z0, z1), ++ z0_res = svdot (z4, z0, z1)) ++ ++/* ++** dot_s32_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** sdot z0\.s, z1\.b, \1\.b ++** ret ++*/ ++TEST_DUAL_Z_REV (dot_s32_tied3, svint32_t, svint8_t, ++ z0_res = svdot_s32 (z4, z1, z0), ++ z0_res = svdot (z4, z1, z0)) ++ ++/* ++** dot_s32_untied: ++** movprfx z0, z1 ++** sdot z0\.s, z4\.b, z5\.b ++** ret ++*/ ++TEST_DUAL_Z (dot_s32_untied, svint32_t, svint8_t, ++ z0 = svdot_s32 (z1, z4, z5), ++ z0 = svdot (z1, z4, z5)) ++ ++/* ++** dot_w0_s32_tied1: ++** mov (z[0-9]+\.b), w0 ++** sdot z0\.s, z4\.b, \1 ++** ret ++*/ ++TEST_DUAL_ZX (dot_w0_s32_tied1, svint32_t, svint8_t, int8_t, ++ z0 = svdot_n_s32 (z0, z4, x0), ++ z0 = svdot (z0, z4, x0)) ++ ++/* ++** dot_w0_s32_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), w0 ++** movprfx z0, z1 ++** sdot z0\.s, z4\.b, \1 ++** ret ++*/ ++TEST_DUAL_ZX (dot_w0_s32_untied, svint32_t, svint8_t, int8_t, ++ z0 = svdot_n_s32 (z1, z4, x0), ++ z0 = svdot (z1, z4, x0)) ++ ++/* ++** dot_9_s32_tied1: ++** mov (z[0-9]+\.b), #9 ++** sdot z0\.s, z4\.b, \1 ++** ret ++*/ ++TEST_DUAL_Z (dot_9_s32_tied1, svint32_t, svint8_t, ++ z0 = svdot_n_s32 (z0, z4, 9), ++ z0 = svdot (z0, z4, 9)) ++ ++/* ++** dot_9_s32_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), #9 ++** movprfx z0, z1 ++** sdot z0\.s, z4\.b, \1 ++** ret ++*/ ++TEST_DUAL_Z (dot_9_s32_untied, svint32_t, svint8_t, ++ z0 = svdot_n_s32 (z1, z4, 9), ++ z0 = svdot (z1, z4, 9)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dot_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dot_s64.c +new file mode 100644 +index 000000000..b6574740b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dot_s64.c +@@ -0,0 +1,86 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dot_s64_tied1: ++** sdot z0\.d, z4\.h, z5\.h ++** ret ++*/ ++TEST_DUAL_Z (dot_s64_tied1, svint64_t, svint16_t, ++ z0 = svdot_s64 (z0, z4, z5), ++ z0 = svdot (z0, z4, z5)) ++ ++/* ++** dot_s64_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** sdot z0\.d, \1\.h, z1\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (dot_s64_tied2, svint64_t, svint16_t, ++ z0_res = svdot_s64 (z4, z0, z1), ++ z0_res = svdot (z4, z0, z1)) ++ ++/* ++** dot_s64_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx 
z0, z4 ++** sdot z0\.d, z1\.h, \1\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (dot_s64_tied3, svint64_t, svint16_t, ++ z0_res = svdot_s64 (z4, z1, z0), ++ z0_res = svdot (z4, z1, z0)) ++ ++/* ++** dot_s64_untied: ++** movprfx z0, z1 ++** sdot z0\.d, z4\.h, z5\.h ++** ret ++*/ ++TEST_DUAL_Z (dot_s64_untied, svint64_t, svint16_t, ++ z0 = svdot_s64 (z1, z4, z5), ++ z0 = svdot (z1, z4, z5)) ++ ++/* ++** dot_w0_s64_tied1: ++** mov (z[0-9]+\.h), w0 ++** sdot z0\.d, z4\.h, \1 ++** ret ++*/ ++TEST_DUAL_ZX (dot_w0_s64_tied1, svint64_t, svint16_t, int16_t, ++ z0 = svdot_n_s64 (z0, z4, x0), ++ z0 = svdot (z0, z4, x0)) ++ ++/* ++** dot_w0_s64_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), w0 ++** movprfx z0, z1 ++** sdot z0\.d, z4\.h, \1 ++** ret ++*/ ++TEST_DUAL_ZX (dot_w0_s64_untied, svint64_t, svint16_t, int16_t, ++ z0 = svdot_n_s64 (z1, z4, x0), ++ z0 = svdot (z1, z4, x0)) ++ ++/* ++** dot_9_s64_tied1: ++** mov (z[0-9]+\.h), #9 ++** sdot z0\.d, z4\.h, \1 ++** ret ++*/ ++TEST_DUAL_Z (dot_9_s64_tied1, svint64_t, svint16_t, ++ z0 = svdot_n_s64 (z0, z4, 9), ++ z0 = svdot (z0, z4, 9)) ++ ++/* ++** dot_9_s64_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), #9 ++** movprfx z0, z1 ++** sdot z0\.d, z4\.h, \1 ++** ret ++*/ ++TEST_DUAL_Z (dot_9_s64_untied, svint64_t, svint16_t, ++ z0 = svdot_n_s64 (z1, z4, 9), ++ z0 = svdot (z1, z4, 9)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dot_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dot_u32.c +new file mode 100644 +index 000000000..541e71cc2 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dot_u32.c +@@ -0,0 +1,86 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dot_u32_tied1: ++** udot z0\.s, z4\.b, z5\.b ++** ret ++*/ ++TEST_DUAL_Z (dot_u32_tied1, svuint32_t, svuint8_t, ++ z0 = svdot_u32 (z0, z4, z5), ++ z0 = svdot (z0, z4, z5)) ++ ++/* ++** dot_u32_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** udot z0\.s, \1\.b, z1\.b ++** ret ++*/ ++TEST_DUAL_Z_REV (dot_u32_tied2, svuint32_t, svuint8_t, ++ z0_res = svdot_u32 (z4, z0, z1), ++ z0_res = svdot (z4, z0, z1)) ++ ++/* ++** dot_u32_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** udot z0\.s, z1\.b, \1\.b ++** ret ++*/ ++TEST_DUAL_Z_REV (dot_u32_tied3, svuint32_t, svuint8_t, ++ z0_res = svdot_u32 (z4, z1, z0), ++ z0_res = svdot (z4, z1, z0)) ++ ++/* ++** dot_u32_untied: ++** movprfx z0, z1 ++** udot z0\.s, z4\.b, z5\.b ++** ret ++*/ ++TEST_DUAL_Z (dot_u32_untied, svuint32_t, svuint8_t, ++ z0 = svdot_u32 (z1, z4, z5), ++ z0 = svdot (z1, z4, z5)) ++ ++/* ++** dot_w0_u32_tied1: ++** mov (z[0-9]+\.b), w0 ++** udot z0\.s, z4\.b, \1 ++** ret ++*/ ++TEST_DUAL_ZX (dot_w0_u32_tied1, svuint32_t, svuint8_t, uint8_t, ++ z0 = svdot_n_u32 (z0, z4, x0), ++ z0 = svdot (z0, z4, x0)) ++ ++/* ++** dot_w0_u32_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), w0 ++** movprfx z0, z1 ++** udot z0\.s, z4\.b, \1 ++** ret ++*/ ++TEST_DUAL_ZX (dot_w0_u32_untied, svuint32_t, svuint8_t, uint8_t, ++ z0 = svdot_n_u32 (z1, z4, x0), ++ z0 = svdot (z1, z4, x0)) ++ ++/* ++** dot_9_u32_tied1: ++** mov (z[0-9]+\.b), #9 ++** udot z0\.s, z4\.b, \1 ++** ret ++*/ ++TEST_DUAL_Z (dot_9_u32_tied1, svuint32_t, svuint8_t, ++ z0 = svdot_n_u32 (z0, z4, 9), ++ z0 = svdot (z0, z4, 9)) ++ ++/* ++** dot_9_u32_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), #9 ++** movprfx z0, z1 ++** udot z0\.s, z4\.b, \1 ++** ret ++*/ ++TEST_DUAL_Z (dot_9_u32_untied, svuint32_t, svuint8_t, ++ z0 = svdot_n_u32 (z1, z4, 9), ++ z0 = svdot (z1, z4, 9)) +diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dot_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dot_u64.c +new file mode 100644 +index 000000000..cc0e85373 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dot_u64.c +@@ -0,0 +1,86 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dot_u64_tied1: ++** udot z0\.d, z4\.h, z5\.h ++** ret ++*/ ++TEST_DUAL_Z (dot_u64_tied1, svuint64_t, svuint16_t, ++ z0 = svdot_u64 (z0, z4, z5), ++ z0 = svdot (z0, z4, z5)) ++ ++/* ++** dot_u64_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** udot z0\.d, \1\.h, z1\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (dot_u64_tied2, svuint64_t, svuint16_t, ++ z0_res = svdot_u64 (z4, z0, z1), ++ z0_res = svdot (z4, z0, z1)) ++ ++/* ++** dot_u64_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** udot z0\.d, z1\.h, \1\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (dot_u64_tied3, svuint64_t, svuint16_t, ++ z0_res = svdot_u64 (z4, z1, z0), ++ z0_res = svdot (z4, z1, z0)) ++ ++/* ++** dot_u64_untied: ++** movprfx z0, z1 ++** udot z0\.d, z4\.h, z5\.h ++** ret ++*/ ++TEST_DUAL_Z (dot_u64_untied, svuint64_t, svuint16_t, ++ z0 = svdot_u64 (z1, z4, z5), ++ z0 = svdot (z1, z4, z5)) ++ ++/* ++** dot_w0_u64_tied1: ++** mov (z[0-9]+\.h), w0 ++** udot z0\.d, z4\.h, \1 ++** ret ++*/ ++TEST_DUAL_ZX (dot_w0_u64_tied1, svuint64_t, svuint16_t, uint16_t, ++ z0 = svdot_n_u64 (z0, z4, x0), ++ z0 = svdot (z0, z4, x0)) ++ ++/* ++** dot_w0_u64_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), w0 ++** movprfx z0, z1 ++** udot z0\.d, z4\.h, \1 ++** ret ++*/ ++TEST_DUAL_ZX (dot_w0_u64_untied, svuint64_t, svuint16_t, uint16_t, ++ z0 = svdot_n_u64 (z1, z4, x0), ++ z0 = svdot (z1, z4, x0)) ++ ++/* ++** dot_9_u64_tied1: ++** mov (z[0-9]+\.h), #9 ++** udot z0\.d, z4\.h, \1 ++** ret ++*/ ++TEST_DUAL_Z (dot_9_u64_tied1, svuint64_t, svuint16_t, ++ z0 = svdot_n_u64 (z0, z4, 9), ++ z0 = svdot (z0, z4, 9)) ++ ++/* ++** dot_9_u64_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), #9 ++** movprfx z0, z1 ++** udot z0\.d, z4\.h, \1 ++** ret ++*/ ++TEST_DUAL_Z (dot_9_u64_untied, svuint64_t, svuint16_t, ++ z0 = svdot_n_u64 (z1, z4, 9), ++ z0 = svdot (z1, z4, 9)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_b16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_b16.c +new file mode 100644 +index 000000000..785832ab3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_b16.c +@@ -0,0 +1,32 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include <stdbool.h> ++#include "test_sve_acle.h" ++ ++/* ++** dup_false_b16: ++** pfalse p0\.b ++** ret ++*/ ++TEST_UNIFORM_P (dup_false_b16, ++ p0 = svdup_n_b16 (false), ++ p0 = svdup_b16 (false)) ++ ++/* ++** dup_true_b16: ++** ptrue p0\.h, all ++** ret ++*/ ++TEST_UNIFORM_P (dup_true_b16, ++ p0 = svdup_n_b16 (true), ++ p0 = svdup_b16 (true)) ++ ++/* ++** dup_w0_b16: ++** lsl (x[0-9]+), x0, 63 ++** whilelo p0\.h, xzr, \1 ++** ret ++*/ ++TEST_UNIFORM_PS (dup_w0_b16, ++ p0 = svdup_n_b16 (x0), ++ p0 = svdup_b16 (x0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_b32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_b32.c +new file mode 100644 +index 000000000..6e9d91eaf +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_b32.c +@@ -0,0 +1,32 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include <stdbool.h> ++#include "test_sve_acle.h" ++ ++/* ++** dup_false_b32: ++** pfalse p0\.b ++** ret ++*/ ++TEST_UNIFORM_P (dup_false_b32, ++ p0 =
svdup_n_b32 (false), ++ p0 = svdup_b32 (false)) ++ ++/* ++** dup_true_b32: ++** ptrue p0\.s, all ++** ret ++*/ ++TEST_UNIFORM_P (dup_true_b32, ++ p0 = svdup_n_b32 (true), ++ p0 = svdup_b32 (true)) ++ ++/* ++** dup_w0_b32: ++** lsl (x[0-9]+), x0, 63 ++** whilelo p0\.s, xzr, \1 ++** ret ++*/ ++TEST_UNIFORM_PS (dup_w0_b32, ++ p0 = svdup_n_b32 (x0), ++ p0 = svdup_b32 (x0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_b64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_b64.c +new file mode 100644 +index 000000000..ed69896c4 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_b64.c +@@ -0,0 +1,32 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include <stdbool.h> ++#include "test_sve_acle.h" ++ ++/* ++** dup_false_b64: ++** pfalse p0\.b ++** ret ++*/ ++TEST_UNIFORM_P (dup_false_b64, ++ p0 = svdup_n_b64 (false), ++ p0 = svdup_b64 (false)) ++ ++/* ++** dup_true_b64: ++** ptrue p0\.d, all ++** ret ++*/ ++TEST_UNIFORM_P (dup_true_b64, ++ p0 = svdup_n_b64 (true), ++ p0 = svdup_b64 (true)) ++ ++/* ++** dup_w0_b64: ++** lsl (x[0-9]+), x0, 63 ++** whilelo p0\.d, xzr, \1 ++** ret ++*/ ++TEST_UNIFORM_PS (dup_w0_b64, ++ p0 = svdup_n_b64 (x0), ++ p0 = svdup_b64 (x0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_b8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_b8.c +new file mode 100644 +index 000000000..a99ab552a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_b8.c +@@ -0,0 +1,32 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include <stdbool.h> ++#include "test_sve_acle.h" ++ ++/* ++** dup_false_b8: ++** pfalse p0\.b ++** ret ++*/ ++TEST_UNIFORM_P (dup_false_b8, ++ p0 = svdup_n_b8 (false), ++ p0 = svdup_b8 (false)) ++ ++/* ++** dup_true_b8: ++** ptrue p0\.b, all ++** ret ++*/ ++TEST_UNIFORM_P (dup_true_b8, ++ p0 = svdup_n_b8 (true), ++ p0 = svdup_b8 (true)) ++ ++/* ++** dup_w0_b8: ++** lsl (x[0-9]+), x0, 63 ++** whilelo p0\.b, xzr, \1 ++** ret ++*/ ++TEST_UNIFORM_PS (dup_w0_b8, ++ p0 = svdup_n_b8 (x0), ++ p0 = svdup_b8 (x0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_bf16.c +new file mode 100644 +index 000000000..db47d849c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_bf16.c +@@ -0,0 +1,41 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dup_h4_bf16: ++** mov z0\.h, h4 ++** ret ++*/ ++TEST_UNIFORM_ZD (dup_h4_bf16, svbfloat16_t, __bf16, ++ z0 = svdup_n_bf16 (d4), ++ z0 = svdup_bf16 (d4)) ++ ++/* ++** dup_h4_bf16_m: ++** movprfx z0, z1 ++** mov z0\.h, p0/m, h4 ++** ret ++*/ ++TEST_UNIFORM_ZD (dup_h4_bf16_m, svbfloat16_t, __bf16, ++ z0 = svdup_n_bf16_m (z1, p0, d4), ++ z0 = svdup_bf16_m (z1, p0, d4)) ++ ++/* ++** dup_h4_bf16_z: ++** movprfx z0\.h, p0/z, z0\.h ++** mov z0\.h, p0/m, h4 ++** ret ++*/ ++TEST_UNIFORM_ZD (dup_h4_bf16_z, svbfloat16_t, __bf16, ++ z0 = svdup_n_bf16_z (p0, d4), ++ z0 = svdup_bf16_z (p0, d4)) ++ ++/* ++** dup_h4_bf16_x: ++** mov z0\.h, h4 ++** ret ++*/ ++TEST_UNIFORM_ZD (dup_h4_bf16_x, svbfloat16_t, __bf16, ++ z0 = svdup_n_bf16_x (p0, d4), ++ z0 = svdup_bf16_x (p0, d4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_f16.c +new file mode 100644 +index 000000000..2d48b9a3d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_f16.c +@@ -0,0 +1,215 @@ ++/* { dg-final { check-function-bodies "**"
"" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dup_1_f16: ++** fmov z0\.h, #1\.0(?:e\+0)? ++** ret ++*/ ++TEST_UNIFORM_Z (dup_1_f16, svfloat16_t, ++ z0 = svdup_n_f16 (1), ++ z0 = svdup_f16 (1)) ++ ++/* ++** dup_0_f16: ++** mov z0\.h, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_0_f16, svfloat16_t, ++ z0 = svdup_n_f16 (0), ++ z0 = svdup_f16 (0)) ++ ++/* ++** dup_8_f16: ++** fmov z0\.h, #8\.0(?:e\+0)? ++** ret ++*/ ++TEST_UNIFORM_Z (dup_8_f16, svfloat16_t, ++ z0 = svdup_n_f16 (8), ++ z0 = svdup_f16 (8)) ++ ++/* ++** dup_512_f16: ++** mov z0\.h, #24576 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_512_f16, svfloat16_t, ++ z0 = svdup_n_f16 (512), ++ z0 = svdup_f16 (512)) ++ ++/* ++** dup_513_f16: ++** mov (w[0-7]+), 24578 ++** mov z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_513_f16, svfloat16_t, ++ z0 = svdup_n_f16 (513), ++ z0 = svdup_f16 (513)) ++ ++/* ++** dup_h4_f16: ++** mov z0\.h, h4 ++** ret ++*/ ++TEST_UNIFORM_ZD (dup_h4_f16, svfloat16_t, __fp16, ++ z0 = svdup_n_f16 (d4), ++ z0 = svdup_f16 (d4)) ++ ++/* ++** dup_1_f16_m: ++** mov z0\.h, p0/m, #15360 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_1_f16_m, svfloat16_t, ++ z0 = svdup_n_f16_m (z0, p0, 1), ++ z0 = svdup_f16_m (z0, p0, 1)) ++ ++/* ++** dup_0_f16_m: ++** mov z0\.h, p0/m, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_0_f16_m, svfloat16_t, ++ z0 = svdup_n_f16_m (z0, p0, 0), ++ z0 = svdup_f16_m (z0, p0, 0)) ++ ++/* ++** dup_8_f16_m: ++** mov z0\.h, p0/m, #18432 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_8_f16_m, svfloat16_t, ++ z0 = svdup_n_f16_m (z0, p0, 8), ++ z0 = svdup_f16_m (z0, p0, 8)) ++ ++/* ++** dup_512_f16_m: ++** mov z0\.h, p0/m, #24576 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_512_f16_m, svfloat16_t, ++ z0 = svdup_n_f16_m (z0, p0, 512), ++ z0 = svdup_f16_m (z0, p0, 512)) ++ ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_513_f16_m, svfloat16_t, ++ z0 = svdup_n_f16_m (z0, p0, 513), ++ z0 = svdup_f16_m (z0, p0, 513)) ++ ++/* ++** dup_h4_f16_m: ++** movprfx z0, z1 ++** mov z0\.h, p0/m, h4 ++** ret ++*/ ++TEST_UNIFORM_ZD (dup_h4_f16_m, svfloat16_t, __fp16, ++ z0 = svdup_n_f16_m (z1, p0, d4), ++ z0 = svdup_f16_m (z1, p0, d4)) ++ ++/* ++** dup_1_f16_z: ++** mov z0\.h, p0/z, #15360 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_1_f16_z, svfloat16_t, ++ z0 = svdup_n_f16_z (p0, 1), ++ z0 = svdup_f16_z (p0, 1)) ++ ++/* ++** dup_0_f16_z: ++** mov z0\.h, p0/z, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_0_f16_z, svfloat16_t, ++ z0 = svdup_n_f16_z (p0, 0), ++ z0 = svdup_f16_z (p0, 0)) ++ ++/* ++** dup_8_f16_z: ++** mov z0\.h, p0/z, #18432 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_8_f16_z, svfloat16_t, ++ z0 = svdup_n_f16_z (p0, 8), ++ z0 = svdup_f16_z (p0, 8)) ++ ++/* ++** dup_512_f16_z: ++** mov z0\.h, p0/z, #24576 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_512_f16_z, svfloat16_t, ++ z0 = svdup_n_f16_z (p0, 512), ++ z0 = svdup_f16_z (p0, 512)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_513_f16_z, svfloat16_t, ++ z0 = svdup_n_f16_z (p0, 513), ++ z0 = svdup_f16_z (p0, 513)) ++/* ++** dup_h4_f16_z: ++** movprfx z0\.h, p0/z, z0\.h ++** mov z0\.h, p0/m, h4 ++** ret ++*/ ++TEST_UNIFORM_ZD (dup_h4_f16_z, svfloat16_t, __fp16, ++ z0 = svdup_n_f16_z (p0, d4), ++ z0 = svdup_f16_z (p0, d4)) ++ ++/* ++** dup_1_f16_x: ++** fmov z0\.h, #1\.0(?:e\+0)? 
++** ret ++*/ ++TEST_UNIFORM_Z (dup_1_f16_x, svfloat16_t, ++ z0 = svdup_n_f16_x (p0, 1), ++ z0 = svdup_f16_x (p0, 1)) ++ ++/* ++** dup_0_f16_x: ++** mov z0\.h, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_0_f16_x, svfloat16_t, ++ z0 = svdup_n_f16_x (p0, 0), ++ z0 = svdup_f16_x (p0, 0)) ++ ++/* ++** dup_8_f16_x: ++** fmov z0\.h, #8\.0(?:e\+0)? ++** ret ++*/ ++TEST_UNIFORM_Z (dup_8_f16_x, svfloat16_t, ++ z0 = svdup_n_f16_x (p0, 8), ++ z0 = svdup_f16_x (p0, 8)) ++ ++/* ++** dup_512_f16_x: ++** mov z0\.h, #24576 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_512_f16_x, svfloat16_t, ++ z0 = svdup_n_f16_x (p0, 512), ++ z0 = svdup_f16_x (p0, 512)) ++ ++/* ++** dup_513_f16_x: ++** mov (w[0-7]+), 24578 ++** mov z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_513_f16_x, svfloat16_t, ++ z0 = svdup_n_f16_x (p0, 513), ++ z0 = svdup_f16_x (p0, 513)) ++ ++/* ++** dup_h4_f16_x: ++** mov z0\.h, h4 ++** ret ++*/ ++TEST_UNIFORM_ZD (dup_h4_f16_x, svfloat16_t, __fp16, ++ z0 = svdup_n_f16_x (p0, d4), ++ z0 = svdup_f16_x (p0, d4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_f32.c +new file mode 100644 +index 000000000..f997b7a7d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_f32.c +@@ -0,0 +1,212 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dup_1_f32: ++** fmov z0\.s, #1\.0(?:e\+0)? ++** ret ++*/ ++TEST_UNIFORM_Z (dup_1_f32, svfloat32_t, ++ z0 = svdup_n_f32 (1), ++ z0 = svdup_f32 (1)) ++ ++/* ++** dup_0_f32: ++** mov z0\.s, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_0_f32, svfloat32_t, ++ z0 = svdup_n_f32 (0), ++ z0 = svdup_f32 (0)) ++ ++/* ++** dup_8_f32: ++** fmov z0\.s, #8\.0(?:e\+0)? ++** ret ++*/ ++TEST_UNIFORM_Z (dup_8_f32, svfloat32_t, ++ z0 = svdup_n_f32 (8), ++ z0 = svdup_f32 (8)) ++ ++/* ++** dup_512_f32: ++** movi v([0-9]+).4s, 0x44, lsl 24 ++** dup z0\.q, z0\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_512_f32, svfloat32_t, ++ z0 = svdup_n_f32 (512), ++ z0 = svdup_f32 (512)) ++ ++/* ++** dup_513_f32: ++** ... ++** ld1rw z0\.s, p[0-7]/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_513_f32, svfloat32_t, ++ z0 = svdup_n_f32 (513), ++ z0 = svdup_f32 (513)) ++ ++/* ++** dup_s4_f32: ++** mov z0\.s, s4 ++** ret ++*/ ++TEST_UNIFORM_ZD (dup_s4_f32, svfloat32_t, float, ++ z0 = svdup_n_f32 (d4), ++ z0 = svdup_f32 (d4)) ++ ++/* ++** dup_1_f32_m: ++** fmov z0\.s, p0/m, #1\.0(?:e\+0)? ++** ret ++*/ ++TEST_UNIFORM_Z (dup_1_f32_m, svfloat32_t, ++ z0 = svdup_n_f32_m (z0, p0, 1), ++ z0 = svdup_f32_m (z0, p0, 1)) ++ ++/* ++** dup_0_f32_m: ++** mov z0\.s, p0/m, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_0_f32_m, svfloat32_t, ++ z0 = svdup_n_f32_m (z0, p0, 0), ++ z0 = svdup_f32_m (z0, p0, 0)) ++ ++/* ++** dup_8_f32_m: ++** fmov z0\.s, p0/m, #8\.0(?:e\+0)? ++** ret ++*/ ++TEST_UNIFORM_Z (dup_8_f32_m, svfloat32_t, ++ z0 = svdup_n_f32_m (z0, p0, 8), ++ z0 = svdup_f32_m (z0, p0, 8)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_512_f32_m, svfloat32_t, ++ z0 = svdup_n_f32_m (z0, p0, 512), ++ z0 = svdup_f32_m (z0, p0, 512)) ++ ++ ++/* TODO: Bad code and needs fixing. 
*/ ++TEST_UNIFORM_Z (dup_513_f32_m, svfloat32_t, ++ z0 = svdup_n_f32_m (z0, p0, 513), ++ z0 = svdup_f32_m (z0, p0, 513)) ++ ++/* ++** dup_s4_f32_m: ++** movprfx z0, z1 ++** mov z0\.s, p0/m, s4 ++** ret ++*/ ++TEST_UNIFORM_ZD (dup_s4_f32_m, svfloat32_t, float, ++ z0 = svdup_n_f32_m (z1, p0, d4), ++ z0 = svdup_f32_m (z1, p0, d4)) ++ ++/* ++** dup_1_f32_z: ++** movprfx z0\.s, p0/z, z0\.s ++** fmov z0\.s, p0/m, #1\.0(?:e\+0)? ++** ret ++*/ ++TEST_UNIFORM_Z (dup_1_f32_z, svfloat32_t, ++ z0 = svdup_n_f32_z (p0, 1), ++ z0 = svdup_f32_z (p0, 1)) ++ ++/* ++** dup_0_f32_z: ++** mov z0\.s, p0/z, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_0_f32_z, svfloat32_t, ++ z0 = svdup_n_f32_z (p0, 0), ++ z0 = svdup_f32_z (p0, 0)) ++ ++/* ++** dup_8_f32_z: ++** movprfx z0\.s, p0/z, z0\.s ++** fmov z0\.s, p0/m, #8\.0(?:e\+0)? ++** ret ++*/ ++TEST_UNIFORM_Z (dup_8_f32_z, svfloat32_t, ++ z0 = svdup_n_f32_z (p0, 8), ++ z0 = svdup_f32_z (p0, 8)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_512_f32_z, svfloat32_t, ++ z0 = svdup_n_f32_z (p0, 512), ++ z0 = svdup_f32_z (p0, 512)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_513_f32_z, svfloat32_t, ++ z0 = svdup_n_f32_z (p0, 513), ++ z0 = svdup_f32_z (p0, 513)) ++ ++/* ++** dup_s4_f32_z: ++** movprfx z0\.s, p0/z, z0\.s ++** mov z0\.s, p0/m, s4 ++** ret ++*/ ++TEST_UNIFORM_ZD (dup_s4_f32_z, svfloat32_t, float, ++ z0 = svdup_n_f32_z (p0, d4), ++ z0 = svdup_f32_z (p0, d4)) ++ ++/* ++** dup_1_f32_x: ++** fmov z0\.s, #1\.0(?:e\+0)? ++** ret ++*/ ++TEST_UNIFORM_Z (dup_1_f32_x, svfloat32_t, ++ z0 = svdup_n_f32_x (p0, 1), ++ z0 = svdup_f32_x (p0, 1)) ++ ++/* ++** dup_0_f32_x: ++** mov z0\.s, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_0_f32_x, svfloat32_t, ++ z0 = svdup_n_f32_x (p0, 0), ++ z0 = svdup_f32_x (p0, 0)) ++ ++/* ++** dup_8_f32_x: ++** fmov z0\.s, #8\.0(?:e\+0)? ++** ret ++*/ ++TEST_UNIFORM_Z (dup_8_f32_x, svfloat32_t, ++ z0 = svdup_n_f32_x (p0, 8), ++ z0 = svdup_f32_x (p0, 8)) ++ ++/* ++** dup_512_f32_x: ++** movi v([0-9]+).4s, 0x44, lsl 24 ++** dup z0\.q, z0\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_512_f32_x, svfloat32_t, ++ z0 = svdup_n_f32_x (p0, 512), ++ z0 = svdup_f32_x (p0, 512)) ++ ++/* ++** dup_513_f32_x: ++** ... ++** ld1rw z0\.s, p[0-7]/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_513_f32_x, svfloat32_t, ++ z0 = svdup_n_f32_x (p0, 513), ++ z0 = svdup_f32_x (p0, 513)) ++ ++/* ++** dup_s4_f32_x: ++** mov z0\.s, s4 ++** ret ++*/ ++TEST_UNIFORM_ZD (dup_s4_f32_x, svfloat32_t, float, ++ z0 = svdup_n_f32_x (p0, d4), ++ z0 = svdup_f32_x (p0, d4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_f64.c +new file mode 100644 +index 000000000..e177d9108 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_f64.c +@@ -0,0 +1,212 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dup_1_f64: ++** fmov z0\.d, #1\.0(?:e\+0)? ++** ret ++*/ ++TEST_UNIFORM_Z (dup_1_f64, svfloat64_t, ++ z0 = svdup_n_f64 (1), ++ z0 = svdup_f64 (1)) ++ ++/* ++** dup_0_f64: ++** mov z0\.d, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_0_f64, svfloat64_t, ++ z0 = svdup_n_f64 (0), ++ z0 = svdup_f64 (0)) ++ ++/* ++** dup_8_f64: ++** fmov z0\.d, #8\.0(?:e\+0)? 
++** ret ++*/ ++TEST_UNIFORM_Z (dup_8_f64, svfloat64_t, ++ z0 = svdup_n_f64 (8), ++ z0 = svdup_f64 (8)) ++ ++/* ++** dup_512_f64: ++** mov (x[0-9]+), 4647714815446351872 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_512_f64, svfloat64_t, ++ z0 = svdup_n_f64 (512), ++ z0 = svdup_f64 (512)) ++ ++/* ++** dup_513_f64: ++** ... ++** ld1rd z0\.d, p[0-7]/z, \[x[0-9+]\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_513_f64, svfloat64_t, ++ z0 = svdup_n_f64 (513), ++ z0 = svdup_f64 (513)) ++ ++/* ++** dup_d4_f64: ++** mov z0\.d, d4 ++** ret ++*/ ++TEST_UNIFORM_ZD (dup_d4_f64, svfloat64_t, double, ++ z0 = svdup_n_f64 (d4), ++ z0 = svdup_f64 (d4)) ++ ++/* ++** dup_1_f64_m: ++** fmov z0\.d, p0/m, #1\.0(?:e\+0)? ++** ret ++*/ ++TEST_UNIFORM_Z (dup_1_f64_m, svfloat64_t, ++ z0 = svdup_n_f64_m (z0, p0, 1), ++ z0 = svdup_f64_m (z0, p0, 1)) ++ ++/* ++** dup_0_f64_m: ++** mov z0\.d, p0/m, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_0_f64_m, svfloat64_t, ++ z0 = svdup_n_f64_m (z0, p0, 0), ++ z0 = svdup_f64_m (z0, p0, 0)) ++ ++/* ++** dup_8_f64_m: ++** fmov z0\.d, p0/m, #8\.0(?:e\+0)? ++** ret ++*/ ++TEST_UNIFORM_Z (dup_8_f64_m, svfloat64_t, ++ z0 = svdup_n_f64_m (z0, p0, 8), ++ z0 = svdup_f64_m (z0, p0, 8)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_512_f64_m, svfloat64_t, ++ z0 = svdup_n_f64_m (z0, p0, 512), ++ z0 = svdup_f64_m (z0, p0, 512)) ++ ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_513_f64_m, svfloat64_t, ++ z0 = svdup_n_f64_m (z0, p0, 513), ++ z0 = svdup_f64_m (z0, p0, 513)) ++ ++/* ++** dup_d4_f64_m: ++** movprfx z0, z1 ++** mov z0\.d, p0/m, d4 ++** ret ++*/ ++TEST_UNIFORM_ZD (dup_d4_f64_m, svfloat64_t, double, ++ z0 = svdup_n_f64_m (z1, p0, d4), ++ z0 = svdup_f64_m (z1, p0, d4)) ++ ++/* ++** dup_1_f64_z: ++** movprfx z0\.d, p0/z, z0\.d ++** fmov z0\.d, p0/m, #1\.0(?:e\+0)? ++** ret ++*/ ++TEST_UNIFORM_Z (dup_1_f64_z, svfloat64_t, ++ z0 = svdup_n_f64_z (p0, 1), ++ z0 = svdup_f64_z (p0, 1)) ++ ++/* ++** dup_0_f64_z: ++** mov z0\.d, p0/z, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_0_f64_z, svfloat64_t, ++ z0 = svdup_n_f64_z (p0, 0), ++ z0 = svdup_f64_z (p0, 0)) ++ ++/* ++** dup_8_f64_z: ++** movprfx z0\.d, p0/z, z0\.d ++** fmov z0\.d, p0/m, #8\.0(?:e\+0)? ++** ret ++*/ ++TEST_UNIFORM_Z (dup_8_f64_z, svfloat64_t, ++ z0 = svdup_n_f64_z (p0, 8), ++ z0 = svdup_f64_z (p0, 8)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_512_f64_z, svfloat64_t, ++ z0 = svdup_n_f64_z (p0, 512), ++ z0 = svdup_f64_z (p0, 512)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_513_f64_z, svfloat64_t, ++ z0 = svdup_n_f64_z (p0, 513), ++ z0 = svdup_f64_z (p0, 513)) ++ ++/* ++** dup_d4_f64_z: ++** movprfx z0\.d, p0/z, z0\.d ++** mov z0\.d, p0/m, d4 ++** ret ++*/ ++TEST_UNIFORM_ZD (dup_d4_f64_z, svfloat64_t, double, ++ z0 = svdup_n_f64_z (p0, d4), ++ z0 = svdup_f64_z (p0, d4)) ++ ++/* ++** dup_1_f64_x: ++** fmov z0\.d, #1\.0(?:e\+0)? ++** ret ++*/ ++TEST_UNIFORM_Z (dup_1_f64_x, svfloat64_t, ++ z0 = svdup_n_f64_x (p0, 1), ++ z0 = svdup_f64_x (p0, 1)) ++ ++/* ++** dup_0_f64_x: ++** mov z0\.d, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_0_f64_x, svfloat64_t, ++ z0 = svdup_n_f64_x (p0, 0), ++ z0 = svdup_f64_x (p0, 0)) ++ ++/* ++** dup_8_f64_x: ++** fmov z0\.d, #8\.0(?:e\+0)? 
++** ret ++*/ ++TEST_UNIFORM_Z (dup_8_f64_x, svfloat64_t, ++ z0 = svdup_n_f64_x (p0, 8), ++ z0 = svdup_f64_x (p0, 8)) ++ ++/* ++** dup_512_f64_x: ++** mov (x[0-9]+), 4647714815446351872 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_512_f64_x, svfloat64_t, ++ z0 = svdup_n_f64_x (p0, 512), ++ z0 = svdup_f64_x (p0, 512)) ++ ++/* ++** dup_513_f64_x: ++** ... ++** ld1rd z0\.d, p[0-7]/z, \[x[0-9+]\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_513_f64_x, svfloat64_t, ++ z0 = svdup_n_f64_x (p0, 513), ++ z0 = svdup_f64_x (p0, 513)) ++ ++/* ++** dup_d4_f64_x: ++** mov z0\.d, d4 ++** ret ++*/ ++TEST_UNIFORM_ZD (dup_d4_f64_x, svfloat64_t, double, ++ z0 = svdup_n_f64_x (p0, d4), ++ z0 = svdup_f64_x (p0, d4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_bf16.c +new file mode 100644 +index 000000000..d05ad5adb +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_bf16.c +@@ -0,0 +1,108 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dup_lane_w0_bf16_tied1: ++** mov (z[0-9]+\.h), w0 ++** tbl z0\.h, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_lane_w0_bf16_tied1, svbfloat16_t, uint16_t, ++ z0 = svdup_lane_bf16 (z0, x0), ++ z0 = svdup_lane (z0, x0)) ++ ++/* ++** dup_lane_w0_bf16_untied: ++** mov (z[0-9]+\.h), w0 ++** tbl z0\.h, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_lane_w0_bf16_untied, svbfloat16_t, uint16_t, ++ z0 = svdup_lane_bf16 (z1, x0), ++ z0 = svdup_lane (z1, x0)) ++ ++/* ++** dup_lane_0_bf16_tied1: ++** dup z0\.h, z0\.h\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_0_bf16_tied1, svbfloat16_t, ++ z0 = svdup_lane_bf16 (z0, 0), ++ z0 = svdup_lane (z0, 0)) ++ ++/* ++** dup_lane_0_bf16_untied: ++** dup z0\.h, z1\.h\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_0_bf16_untied, svbfloat16_t, ++ z0 = svdup_lane_bf16 (z1, 0), ++ z0 = svdup_lane (z1, 0)) ++ ++/* ++** dup_lane_15_bf16: ++** dup z0\.h, z0\.h\[15\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_15_bf16, svbfloat16_t, ++ z0 = svdup_lane_bf16 (z0, 15), ++ z0 = svdup_lane (z0, 15)) ++ ++/* ++** dup_lane_16_bf16: ++** dup z0\.h, z0\.h\[16\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_16_bf16, svbfloat16_t, ++ z0 = svdup_lane_bf16 (z0, 16), ++ z0 = svdup_lane (z0, 16)) ++ ++/* ++** dup_lane_31_bf16: ++** dup z0\.h, z0\.h\[31\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_31_bf16, svbfloat16_t, ++ z0 = svdup_lane_bf16 (z0, 31), ++ z0 = svdup_lane (z0, 31)) ++ ++/* ++** dup_lane_32_bf16: ++** mov (z[0-9]+\.h), #32 ++** tbl z0\.h, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_32_bf16, svbfloat16_t, ++ z0 = svdup_lane_bf16 (z0, 32), ++ z0 = svdup_lane (z0, 32)) ++ ++/* ++** dup_lane_63_bf16: ++** mov (z[0-9]+\.h), #63 ++** tbl z0\.h, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_63_bf16, svbfloat16_t, ++ z0 = svdup_lane_bf16 (z0, 63), ++ z0 = svdup_lane (z0, 63)) ++ ++/* ++** dup_lane_64_bf16: ++** mov (z[0-9]+\.h), #64 ++** tbl z0\.h, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_64_bf16, svbfloat16_t, ++ z0 = svdup_lane_bf16 (z0, 64), ++ z0 = svdup_lane (z0, 64)) ++ ++/* ++** dup_lane_255_bf16: ++** mov (z[0-9]+\.h), #255 ++** tbl z0\.h, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_255_bf16, svbfloat16_t, ++ z0 = svdup_lane_bf16 (z0, 255), ++ z0 = svdup_lane (z0, 255)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_f16.c +new file mode 100644 +index 000000000..142afbb24 
+--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_f16.c +@@ -0,0 +1,108 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dup_lane_w0_f16_tied1: ++** mov (z[0-9]+\.h), w0 ++** tbl z0\.h, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_lane_w0_f16_tied1, svfloat16_t, uint16_t, ++ z0 = svdup_lane_f16 (z0, x0), ++ z0 = svdup_lane (z0, x0)) ++ ++/* ++** dup_lane_w0_f16_untied: ++** mov (z[0-9]+\.h), w0 ++** tbl z0\.h, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_lane_w0_f16_untied, svfloat16_t, uint16_t, ++ z0 = svdup_lane_f16 (z1, x0), ++ z0 = svdup_lane (z1, x0)) ++ ++/* ++** dup_lane_0_f16_tied1: ++** dup z0\.h, z0\.h\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_0_f16_tied1, svfloat16_t, ++ z0 = svdup_lane_f16 (z0, 0), ++ z0 = svdup_lane (z0, 0)) ++ ++/* ++** dup_lane_0_f16_untied: ++** dup z0\.h, z1\.h\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_0_f16_untied, svfloat16_t, ++ z0 = svdup_lane_f16 (z1, 0), ++ z0 = svdup_lane (z1, 0)) ++ ++/* ++** dup_lane_15_f16: ++** dup z0\.h, z0\.h\[15\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_15_f16, svfloat16_t, ++ z0 = svdup_lane_f16 (z0, 15), ++ z0 = svdup_lane (z0, 15)) ++ ++/* ++** dup_lane_16_f16: ++** dup z0\.h, z0\.h\[16\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_16_f16, svfloat16_t, ++ z0 = svdup_lane_f16 (z0, 16), ++ z0 = svdup_lane (z0, 16)) ++ ++/* ++** dup_lane_31_f16: ++** dup z0\.h, z0\.h\[31\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_31_f16, svfloat16_t, ++ z0 = svdup_lane_f16 (z0, 31), ++ z0 = svdup_lane (z0, 31)) ++ ++/* ++** dup_lane_32_f16: ++** mov (z[0-9]+\.h), #32 ++** tbl z0\.h, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_32_f16, svfloat16_t, ++ z0 = svdup_lane_f16 (z0, 32), ++ z0 = svdup_lane (z0, 32)) ++ ++/* ++** dup_lane_63_f16: ++** mov (z[0-9]+\.h), #63 ++** tbl z0\.h, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_63_f16, svfloat16_t, ++ z0 = svdup_lane_f16 (z0, 63), ++ z0 = svdup_lane (z0, 63)) ++ ++/* ++** dup_lane_64_f16: ++** mov (z[0-9]+\.h), #64 ++** tbl z0\.h, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_64_f16, svfloat16_t, ++ z0 = svdup_lane_f16 (z0, 64), ++ z0 = svdup_lane (z0, 64)) ++ ++/* ++** dup_lane_255_f16: ++** mov (z[0-9]+\.h), #255 ++** tbl z0\.h, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_255_f16, svfloat16_t, ++ z0 = svdup_lane_f16 (z0, 255), ++ z0 = svdup_lane (z0, 255)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_f32.c +new file mode 100644 +index 000000000..b32068a37 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_f32.c +@@ -0,0 +1,110 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dup_lane_w0_f32_tied1: ++** mov (z[0-9]+\.s), w0 ++** tbl z0\.s, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_lane_w0_f32_tied1, svfloat32_t, uint32_t, ++ z0 = svdup_lane_f32 (z0, x0), ++ z0 = svdup_lane (z0, x0)) ++ ++/* ++** dup_lane_w0_f32_untied: ++** mov (z[0-9]+\.s), w0 ++** tbl z0\.s, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_lane_w0_f32_untied, svfloat32_t, uint32_t, ++ z0 = svdup_lane_f32 (z1, x0), ++ z0 = svdup_lane (z1, x0)) ++ ++/* ++** dup_lane_0_f32_tied1: ++** dup z0\.s, z0\.s\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_0_f32_tied1, svfloat32_t, ++ z0 = svdup_lane_f32 (z0, 0), ++ z0 = svdup_lane (z0, 0)) ++ ++/* ++** dup_lane_0_f32_untied: ++** dup z0\.s, z1\.s\[0\] ++** ret ++*/ 
++TEST_UNIFORM_Z (dup_lane_0_f32_untied, svfloat32_t, ++ z0 = svdup_lane_f32 (z1, 0), ++ z0 = svdup_lane (z1, 0)) ++ ++/* ++** dup_lane_15_f32: ++** dup z0\.s, z0\.s\[15\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_15_f32, svfloat32_t, ++ z0 = svdup_lane_f32 (z0, 15), ++ z0 = svdup_lane (z0, 15)) ++ ++/* ++** dup_lane_16_f32: ++** mov (z[0-9]+\.s), #16 ++** tbl z0\.s, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_16_f32, svfloat32_t, ++ z0 = svdup_lane_f32 (z0, 16), ++ z0 = svdup_lane (z0, 16)) ++ ++/* ++** dup_lane_31_f32: ++** mov (z[0-9]+\.s), #31 ++** tbl z0\.s, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_31_f32, svfloat32_t, ++ z0 = svdup_lane_f32 (z0, 31), ++ z0 = svdup_lane (z0, 31)) ++ ++/* ++** dup_lane_32_f32: ++** mov (z[0-9]+\.s), #32 ++** tbl z0\.s, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_32_f32, svfloat32_t, ++ z0 = svdup_lane_f32 (z0, 32), ++ z0 = svdup_lane (z0, 32)) ++ ++/* ++** dup_lane_63_f32: ++** mov (z[0-9]+\.s), #63 ++** tbl z0\.s, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_63_f32, svfloat32_t, ++ z0 = svdup_lane_f32 (z0, 63), ++ z0 = svdup_lane (z0, 63)) ++ ++/* ++** dup_lane_64_f32: ++** mov (z[0-9]+\.s), #64 ++** tbl z0\.s, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_64_f32, svfloat32_t, ++ z0 = svdup_lane_f32 (z0, 64), ++ z0 = svdup_lane (z0, 64)) ++ ++/* ++** dup_lane_255_f32: ++** mov (z[0-9]+\.s), #255 ++** tbl z0\.s, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_255_f32, svfloat32_t, ++ z0 = svdup_lane_f32 (z0, 255), ++ z0 = svdup_lane (z0, 255)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_f64.c +new file mode 100644 +index 000000000..64af50d0c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_f64.c +@@ -0,0 +1,111 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dup_lane_x0_f64_tied1: ++** mov (z[0-9]+\.d), x0 ++** tbl z0\.d, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_lane_x0_f64_tied1, svfloat64_t, uint64_t, ++ z0 = svdup_lane_f64 (z0, x0), ++ z0 = svdup_lane (z0, x0)) ++ ++/* ++** dup_lane_x0_f64_untied: ++** mov (z[0-9]+\.d), x0 ++** tbl z0\.d, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_lane_x0_f64_untied, svfloat64_t, uint64_t, ++ z0 = svdup_lane_f64 (z1, x0), ++ z0 = svdup_lane (z1, x0)) ++ ++/* ++** dup_lane_0_f64_tied1: ++** dup z0\.d, z0\.d\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_0_f64_tied1, svfloat64_t, ++ z0 = svdup_lane_f64 (z0, 0), ++ z0 = svdup_lane (z0, 0)) ++ ++/* ++** dup_lane_0_f64_untied: ++** dup z0\.d, z1\.d\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_0_f64_untied, svfloat64_t, ++ z0 = svdup_lane_f64 (z1, 0), ++ z0 = svdup_lane (z1, 0)) ++ ++/* ++** dup_lane_15_f64: ++** mov (z[0-9]+\.d), #15 ++** tbl z0\.d, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_15_f64, svfloat64_t, ++ z0 = svdup_lane_f64 (z0, 15), ++ z0 = svdup_lane (z0, 15)) ++ ++/* ++** dup_lane_16_f64: ++** mov (z[0-9]+\.d), #16 ++** tbl z0\.d, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_16_f64, svfloat64_t, ++ z0 = svdup_lane_f64 (z0, 16), ++ z0 = svdup_lane (z0, 16)) ++ ++/* ++** dup_lane_31_f64: ++** mov (z[0-9]+\.d), #31 ++** tbl z0\.d, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_31_f64, svfloat64_t, ++ z0 = svdup_lane_f64 (z0, 31), ++ z0 = svdup_lane (z0, 31)) ++ ++/* ++** dup_lane_32_f64: ++** mov (z[0-9]+\.d), #32 ++** tbl z0\.d, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_32_f64, svfloat64_t, ++ z0 = 
svdup_lane_f64 (z0, 32), ++ z0 = svdup_lane (z0, 32)) ++ ++/* ++** dup_lane_63_f64: ++** mov (z[0-9]+\.d), #63 ++** tbl z0\.d, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_63_f64, svfloat64_t, ++ z0 = svdup_lane_f64 (z0, 63), ++ z0 = svdup_lane (z0, 63)) ++ ++/* ++** dup_lane_64_f64: ++** mov (z[0-9]+\.d), #64 ++** tbl z0\.d, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_64_f64, svfloat64_t, ++ z0 = svdup_lane_f64 (z0, 64), ++ z0 = svdup_lane (z0, 64)) ++ ++/* ++** dup_lane_255_f64: ++** mov (z[0-9]+\.d), #255 ++** tbl z0\.d, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_255_f64, svfloat64_t, ++ z0 = svdup_lane_f64 (z0, 255), ++ z0 = svdup_lane (z0, 255)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_s16.c +new file mode 100644 +index 000000000..3b6f20696 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_s16.c +@@ -0,0 +1,126 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dup_lane_w0_s16_tied1: ++** mov (z[0-9]+\.h), w0 ++** tbl z0\.h, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_lane_w0_s16_tied1, svint16_t, uint16_t, ++ z0 = svdup_lane_s16 (z0, x0), ++ z0 = svdup_lane (z0, x0)) ++ ++/* ++** dup_lane_w0_s16_untied: ++** mov (z[0-9]+\.h), w0 ++** tbl z0\.h, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_lane_w0_s16_untied, svint16_t, uint16_t, ++ z0 = svdup_lane_s16 (z1, x0), ++ z0 = svdup_lane (z1, x0)) ++ ++/* ++** dup_lane_0_s16_tied1: ++** dup z0\.h, z0\.h\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_0_s16_tied1, svint16_t, ++ z0 = svdup_lane_s16 (z0, 0), ++ z0 = svdup_lane (z0, 0)) ++ ++/* ++** dup_lane_0_s16_untied: ++** dup z0\.h, z1\.h\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_0_s16_untied, svint16_t, ++ z0 = svdup_lane_s16 (z1, 0), ++ z0 = svdup_lane (z1, 0)) ++ ++/* ++** dup_lane_7_s16: ++** dup z0\.h, z0\.h\[7\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_7_s16, svint16_t, ++ z0 = svdup_lane_s16 (z0, 7), ++ z0 = svdup_lane (z0, 7)) ++ ++/* ++** dup_lane_8_s16: ++** dup z0\.h, z0\.h\[8\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_8_s16, svint16_t, ++ z0 = svdup_lane_s16 (z0, 8), ++ z0 = svdup_lane (z0, 8)) ++ ++/* ++** dup_lane_15_s16: ++** dup z0\.h, z0\.h\[15\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_15_s16, svint16_t, ++ z0 = svdup_lane_s16 (z0, 15), ++ z0 = svdup_lane (z0, 15)) ++ ++/* ++** dup_lane_16_s16: ++** dup z0\.h, z0\.h\[16\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_16_s16, svint16_t, ++ z0 = svdup_lane_s16 (z0, 16), ++ z0 = svdup_lane (z0, 16)) ++ ++/* ++** dup_lane_31_s16: ++** dup z0\.h, z0\.h\[31\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_31_s16, svint16_t, ++ z0 = svdup_lane_s16 (z0, 31), ++ z0 = svdup_lane (z0, 31)) ++ ++/* ++** dup_lane_32_s16: ++** mov (z[0-9]+\.h), #32 ++** tbl z0\.h, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_32_s16, svint16_t, ++ z0 = svdup_lane_s16 (z0, 32), ++ z0 = svdup_lane (z0, 32)) ++ ++/* ++** dup_lane_63_s16: ++** mov (z[0-9]+\.h), #63 ++** tbl z0\.h, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_63_s16, svint16_t, ++ z0 = svdup_lane_s16 (z0, 63), ++ z0 = svdup_lane (z0, 63)) ++ ++/* ++** dup_lane_64_s16: ++** mov (z[0-9]+\.h), #64 ++** tbl z0\.h, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_64_s16, svint16_t, ++ z0 = svdup_lane_s16 (z0, 64), ++ z0 = svdup_lane (z0, 64)) ++ ++/* ++** dup_lane_255_s16: ++** mov (z[0-9]+\.h), #255 ++** tbl z0\.h, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_255_s16, svint16_t, 
++ z0 = svdup_lane_s16 (z0, 255), ++ z0 = svdup_lane (z0, 255)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_s32.c +new file mode 100644 +index 000000000..bf597fdf6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_s32.c +@@ -0,0 +1,128 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dup_lane_w0_s32_tied1: ++** mov (z[0-9]+\.s), w0 ++** tbl z0\.s, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_lane_w0_s32_tied1, svint32_t, uint32_t, ++ z0 = svdup_lane_s32 (z0, x0), ++ z0 = svdup_lane (z0, x0)) ++ ++/* ++** dup_lane_w0_s32_untied: ++** mov (z[0-9]+\.s), w0 ++** tbl z0\.s, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_lane_w0_s32_untied, svint32_t, uint32_t, ++ z0 = svdup_lane_s32 (z1, x0), ++ z0 = svdup_lane (z1, x0)) ++ ++/* ++** dup_lane_0_s32_tied1: ++** dup z0\.s, z0\.s\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_0_s32_tied1, svint32_t, ++ z0 = svdup_lane_s32 (z0, 0), ++ z0 = svdup_lane (z0, 0)) ++ ++/* ++** dup_lane_0_s32_untied: ++** dup z0\.s, z1\.s\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_0_s32_untied, svint32_t, ++ z0 = svdup_lane_s32 (z1, 0), ++ z0 = svdup_lane (z1, 0)) ++ ++/* ++** dup_lane_7_s32: ++** dup z0\.s, z0\.s\[7\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_7_s32, svint32_t, ++ z0 = svdup_lane_s32 (z0, 7), ++ z0 = svdup_lane (z0, 7)) ++ ++/* ++** dup_lane_8_s32: ++** dup z0\.s, z0\.s\[8\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_8_s32, svint32_t, ++ z0 = svdup_lane_s32 (z0, 8), ++ z0 = svdup_lane (z0, 8)) ++ ++/* ++** dup_lane_15_s32: ++** dup z0\.s, z0\.s\[15\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_15_s32, svint32_t, ++ z0 = svdup_lane_s32 (z0, 15), ++ z0 = svdup_lane (z0, 15)) ++ ++/* ++** dup_lane_16_s32: ++** mov (z[0-9]+\.s), #16 ++** tbl z0\.s, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_16_s32, svint32_t, ++ z0 = svdup_lane_s32 (z0, 16), ++ z0 = svdup_lane (z0, 16)) ++ ++/* ++** dup_lane_31_s32: ++** mov (z[0-9]+\.s), #31 ++** tbl z0\.s, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_31_s32, svint32_t, ++ z0 = svdup_lane_s32 (z0, 31), ++ z0 = svdup_lane (z0, 31)) ++ ++/* ++** dup_lane_32_s32: ++** mov (z[0-9]+\.s), #32 ++** tbl z0\.s, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_32_s32, svint32_t, ++ z0 = svdup_lane_s32 (z0, 32), ++ z0 = svdup_lane (z0, 32)) ++ ++/* ++** dup_lane_63_s32: ++** mov (z[0-9]+\.s), #63 ++** tbl z0\.s, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_63_s32, svint32_t, ++ z0 = svdup_lane_s32 (z0, 63), ++ z0 = svdup_lane (z0, 63)) ++ ++/* ++** dup_lane_64_s32: ++** mov (z[0-9]+\.s), #64 ++** tbl z0\.s, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_64_s32, svint32_t, ++ z0 = svdup_lane_s32 (z0, 64), ++ z0 = svdup_lane (z0, 64)) ++ ++/* ++** dup_lane_255_s32: ++** mov (z[0-9]+\.s), #255 ++** tbl z0\.s, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_255_s32, svint32_t, ++ z0 = svdup_lane_s32 (z0, 255), ++ z0 = svdup_lane (z0, 255)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_s64.c +new file mode 100644 +index 000000000..f2f3a1770 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_s64.c +@@ -0,0 +1,130 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dup_lane_x0_s64_tied1: ++** mov (z[0-9]+\.d), x0 ++** tbl z0\.d, z0\.d, \1 ++** ret ++*/ 
++TEST_UNIFORM_ZX (dup_lane_x0_s64_tied1, svint64_t, uint64_t, ++ z0 = svdup_lane_s64 (z0, x0), ++ z0 = svdup_lane (z0, x0)) ++ ++/* ++** dup_lane_x0_s64_untied: ++** mov (z[0-9]+\.d), x0 ++** tbl z0\.d, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_lane_x0_s64_untied, svint64_t, uint64_t, ++ z0 = svdup_lane_s64 (z1, x0), ++ z0 = svdup_lane (z1, x0)) ++ ++/* ++** dup_lane_0_s64_tied1: ++** dup z0\.d, z0\.d\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_0_s64_tied1, svint64_t, ++ z0 = svdup_lane_s64 (z0, 0), ++ z0 = svdup_lane (z0, 0)) ++ ++/* ++** dup_lane_0_s64_untied: ++** dup z0\.d, z1\.d\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_0_s64_untied, svint64_t, ++ z0 = svdup_lane_s64 (z1, 0), ++ z0 = svdup_lane (z1, 0)) ++ ++/* ++** dup_lane_7_s64: ++** dup z0\.d, z0\.d\[7\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_7_s64, svint64_t, ++ z0 = svdup_lane_s64 (z0, 7), ++ z0 = svdup_lane (z0, 7)) ++ ++/* ++** dup_lane_8_s64: ++** mov (z[0-9]+\.d), #8 ++** tbl z0\.d, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_8_s64, svint64_t, ++ z0 = svdup_lane_s64 (z0, 8), ++ z0 = svdup_lane (z0, 8)) ++ ++/* ++** dup_lane_15_s64: ++** mov (z[0-9]+\.d), #15 ++** tbl z0\.d, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_15_s64, svint64_t, ++ z0 = svdup_lane_s64 (z0, 15), ++ z0 = svdup_lane (z0, 15)) ++ ++/* ++** dup_lane_16_s64: ++** mov (z[0-9]+\.d), #16 ++** tbl z0\.d, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_16_s64, svint64_t, ++ z0 = svdup_lane_s64 (z0, 16), ++ z0 = svdup_lane (z0, 16)) ++ ++/* ++** dup_lane_31_s64: ++** mov (z[0-9]+\.d), #31 ++** tbl z0\.d, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_31_s64, svint64_t, ++ z0 = svdup_lane_s64 (z0, 31), ++ z0 = svdup_lane (z0, 31)) ++ ++/* ++** dup_lane_32_s64: ++** mov (z[0-9]+\.d), #32 ++** tbl z0\.d, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_32_s64, svint64_t, ++ z0 = svdup_lane_s64 (z0, 32), ++ z0 = svdup_lane (z0, 32)) ++ ++/* ++** dup_lane_63_s64: ++** mov (z[0-9]+\.d), #63 ++** tbl z0\.d, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_63_s64, svint64_t, ++ z0 = svdup_lane_s64 (z0, 63), ++ z0 = svdup_lane (z0, 63)) ++ ++/* ++** dup_lane_64_s64: ++** mov (z[0-9]+\.d), #64 ++** tbl z0\.d, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_64_s64, svint64_t, ++ z0 = svdup_lane_s64 (z0, 64), ++ z0 = svdup_lane (z0, 64)) ++ ++/* ++** dup_lane_255_s64: ++** mov (z[0-9]+\.d), #255 ++** tbl z0\.d, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_255_s64, svint64_t, ++ z0 = svdup_lane_s64 (z0, 255), ++ z0 = svdup_lane (z0, 255)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_s8.c +new file mode 100644 +index 000000000..f5a07e9f3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_s8.c +@@ -0,0 +1,124 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dup_lane_w0_s8_tied1: ++** mov (z[0-9]+\.b), w0 ++** tbl z0\.b, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_lane_w0_s8_tied1, svint8_t, uint8_t, ++ z0 = svdup_lane_s8 (z0, x0), ++ z0 = svdup_lane (z0, x0)) ++ ++/* ++** dup_lane_w0_s8_untied: ++** mov (z[0-9]+\.b), w0 ++** tbl z0\.b, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_lane_w0_s8_untied, svint8_t, uint8_t, ++ z0 = svdup_lane_s8 (z1, x0), ++ z0 = svdup_lane (z1, x0)) ++ ++/* ++** dup_lane_0_s8_tied1: ++** dup z0\.b, z0\.b\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_0_s8_tied1, svint8_t, ++ z0 = svdup_lane_s8 (z0, 0), ++ z0 = svdup_lane 
(z0, 0)) ++ ++/* ++** dup_lane_0_s8_untied: ++** dup z0\.b, z1\.b\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_0_s8_untied, svint8_t, ++ z0 = svdup_lane_s8 (z1, 0), ++ z0 = svdup_lane (z1, 0)) ++ ++/* ++** dup_lane_7_s8: ++** dup z0\.b, z0\.b\[7\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_7_s8, svint8_t, ++ z0 = svdup_lane_s8 (z0, 7), ++ z0 = svdup_lane (z0, 7)) ++ ++/* ++** dup_lane_8_s8: ++** dup z0\.b, z0\.b\[8\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_8_s8, svint8_t, ++ z0 = svdup_lane_s8 (z0, 8), ++ z0 = svdup_lane (z0, 8)) ++ ++/* ++** dup_lane_15_s8: ++** dup z0\.b, z0\.b\[15\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_15_s8, svint8_t, ++ z0 = svdup_lane_s8 (z0, 15), ++ z0 = svdup_lane (z0, 15)) ++ ++/* ++** dup_lane_16_s8: ++** dup z0\.b, z0\.b\[16\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_16_s8, svint8_t, ++ z0 = svdup_lane_s8 (z0, 16), ++ z0 = svdup_lane (z0, 16)) ++ ++/* ++** dup_lane_31_s8: ++** dup z0\.b, z0\.b\[31\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_31_s8, svint8_t, ++ z0 = svdup_lane_s8 (z0, 31), ++ z0 = svdup_lane (z0, 31)) ++ ++/* ++** dup_lane_32_s8: ++** dup z0\.b, z0\.b\[32\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_32_s8, svint8_t, ++ z0 = svdup_lane_s8 (z0, 32), ++ z0 = svdup_lane (z0, 32)) ++ ++/* ++** dup_lane_63_s8: ++** dup z0\.b, z0\.b\[63\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_63_s8, svint8_t, ++ z0 = svdup_lane_s8 (z0, 63), ++ z0 = svdup_lane (z0, 63)) ++ ++/* ++** dup_lane_64_s8: ++** mov (z[0-9]+\.b), #64 ++** tbl z0\.b, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_64_s8, svint8_t, ++ z0 = svdup_lane_s8 (z0, 64), ++ z0 = svdup_lane (z0, 64)) ++ ++/* ++** dup_lane_255_s8: ++** mov (z[0-9]+\.b), #-1 ++** tbl z0\.b, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_255_s8, svint8_t, ++ z0 = svdup_lane_s8 (z0, 255), ++ z0 = svdup_lane (z0, 255)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_u16.c +new file mode 100644 +index 000000000..e5135caa5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_u16.c +@@ -0,0 +1,126 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dup_lane_w0_u16_tied1: ++** mov (z[0-9]+\.h), w0 ++** tbl z0\.h, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_lane_w0_u16_tied1, svuint16_t, uint16_t, ++ z0 = svdup_lane_u16 (z0, x0), ++ z0 = svdup_lane (z0, x0)) ++ ++/* ++** dup_lane_w0_u16_untied: ++** mov (z[0-9]+\.h), w0 ++** tbl z0\.h, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_lane_w0_u16_untied, svuint16_t, uint16_t, ++ z0 = svdup_lane_u16 (z1, x0), ++ z0 = svdup_lane (z1, x0)) ++ ++/* ++** dup_lane_0_u16_tied1: ++** dup z0\.h, z0\.h\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_0_u16_tied1, svuint16_t, ++ z0 = svdup_lane_u16 (z0, 0), ++ z0 = svdup_lane (z0, 0)) ++ ++/* ++** dup_lane_0_u16_untied: ++** dup z0\.h, z1\.h\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_0_u16_untied, svuint16_t, ++ z0 = svdup_lane_u16 (z1, 0), ++ z0 = svdup_lane (z1, 0)) ++ ++/* ++** dup_lane_7_u16: ++** dup z0\.h, z0\.h\[7\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_7_u16, svuint16_t, ++ z0 = svdup_lane_u16 (z0, 7), ++ z0 = svdup_lane (z0, 7)) ++ ++/* ++** dup_lane_8_u16: ++** dup z0\.h, z0\.h\[8\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_8_u16, svuint16_t, ++ z0 = svdup_lane_u16 (z0, 8), ++ z0 = svdup_lane (z0, 8)) ++ ++/* ++** dup_lane_15_u16: ++** dup z0\.h, z0\.h\[15\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_15_u16, svuint16_t, ++ z0 = svdup_lane_u16 (z0, 15), 
++ z0 = svdup_lane (z0, 15)) ++ ++/* ++** dup_lane_16_u16: ++** dup z0\.h, z0\.h\[16\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_16_u16, svuint16_t, ++ z0 = svdup_lane_u16 (z0, 16), ++ z0 = svdup_lane (z0, 16)) ++ ++/* ++** dup_lane_31_u16: ++** dup z0\.h, z0\.h\[31\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_31_u16, svuint16_t, ++ z0 = svdup_lane_u16 (z0, 31), ++ z0 = svdup_lane (z0, 31)) ++ ++/* ++** dup_lane_32_u16: ++** mov (z[0-9]+\.h), #32 ++** tbl z0\.h, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_32_u16, svuint16_t, ++ z0 = svdup_lane_u16 (z0, 32), ++ z0 = svdup_lane (z0, 32)) ++ ++/* ++** dup_lane_63_u16: ++** mov (z[0-9]+\.h), #63 ++** tbl z0\.h, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_63_u16, svuint16_t, ++ z0 = svdup_lane_u16 (z0, 63), ++ z0 = svdup_lane (z0, 63)) ++ ++/* ++** dup_lane_64_u16: ++** mov (z[0-9]+\.h), #64 ++** tbl z0\.h, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_64_u16, svuint16_t, ++ z0 = svdup_lane_u16 (z0, 64), ++ z0 = svdup_lane (z0, 64)) ++ ++/* ++** dup_lane_255_u16: ++** mov (z[0-9]+\.h), #255 ++** tbl z0\.h, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_255_u16, svuint16_t, ++ z0 = svdup_lane_u16 (z0, 255), ++ z0 = svdup_lane (z0, 255)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_u32.c +new file mode 100644 +index 000000000..7e972aca7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_u32.c +@@ -0,0 +1,128 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dup_lane_w0_u32_tied1: ++** mov (z[0-9]+\.s), w0 ++** tbl z0\.s, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_lane_w0_u32_tied1, svuint32_t, uint32_t, ++ z0 = svdup_lane_u32 (z0, x0), ++ z0 = svdup_lane (z0, x0)) ++ ++/* ++** dup_lane_w0_u32_untied: ++** mov (z[0-9]+\.s), w0 ++** tbl z0\.s, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_lane_w0_u32_untied, svuint32_t, uint32_t, ++ z0 = svdup_lane_u32 (z1, x0), ++ z0 = svdup_lane (z1, x0)) ++ ++/* ++** dup_lane_0_u32_tied1: ++** dup z0\.s, z0\.s\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_0_u32_tied1, svuint32_t, ++ z0 = svdup_lane_u32 (z0, 0), ++ z0 = svdup_lane (z0, 0)) ++ ++/* ++** dup_lane_0_u32_untied: ++** dup z0\.s, z1\.s\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_0_u32_untied, svuint32_t, ++ z0 = svdup_lane_u32 (z1, 0), ++ z0 = svdup_lane (z1, 0)) ++ ++/* ++** dup_lane_7_u32: ++** dup z0\.s, z0\.s\[7\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_7_u32, svuint32_t, ++ z0 = svdup_lane_u32 (z0, 7), ++ z0 = svdup_lane (z0, 7)) ++ ++/* ++** dup_lane_8_u32: ++** dup z0\.s, z0\.s\[8\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_8_u32, svuint32_t, ++ z0 = svdup_lane_u32 (z0, 8), ++ z0 = svdup_lane (z0, 8)) ++ ++/* ++** dup_lane_15_u32: ++** dup z0\.s, z0\.s\[15\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_15_u32, svuint32_t, ++ z0 = svdup_lane_u32 (z0, 15), ++ z0 = svdup_lane (z0, 15)) ++ ++/* ++** dup_lane_16_u32: ++** mov (z[0-9]+\.s), #16 ++** tbl z0\.s, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_16_u32, svuint32_t, ++ z0 = svdup_lane_u32 (z0, 16), ++ z0 = svdup_lane (z0, 16)) ++ ++/* ++** dup_lane_31_u32: ++** mov (z[0-9]+\.s), #31 ++** tbl z0\.s, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_31_u32, svuint32_t, ++ z0 = svdup_lane_u32 (z0, 31), ++ z0 = svdup_lane (z0, 31)) ++ ++/* ++** dup_lane_32_u32: ++** mov (z[0-9]+\.s), #32 ++** tbl z0\.s, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_32_u32, svuint32_t, ++ z0 = 
svdup_lane_u32 (z0, 32), ++ z0 = svdup_lane (z0, 32)) ++ ++/* ++** dup_lane_63_u32: ++** mov (z[0-9]+\.s), #63 ++** tbl z0\.s, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_63_u32, svuint32_t, ++ z0 = svdup_lane_u32 (z0, 63), ++ z0 = svdup_lane (z0, 63)) ++ ++/* ++** dup_lane_64_u32: ++** mov (z[0-9]+\.s), #64 ++** tbl z0\.s, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_64_u32, svuint32_t, ++ z0 = svdup_lane_u32 (z0, 64), ++ z0 = svdup_lane (z0, 64)) ++ ++/* ++** dup_lane_255_u32: ++** mov (z[0-9]+\.s), #255 ++** tbl z0\.s, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_255_u32, svuint32_t, ++ z0 = svdup_lane_u32 (z0, 255), ++ z0 = svdup_lane (z0, 255)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_u64.c +new file mode 100644 +index 000000000..5097b7e96 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_u64.c +@@ -0,0 +1,130 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dup_lane_x0_u64_tied1: ++** mov (z[0-9]+\.d), x0 ++** tbl z0\.d, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_lane_x0_u64_tied1, svuint64_t, uint64_t, ++ z0 = svdup_lane_u64 (z0, x0), ++ z0 = svdup_lane (z0, x0)) ++ ++/* ++** dup_lane_x0_u64_untied: ++** mov (z[0-9]+\.d), x0 ++** tbl z0\.d, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_lane_x0_u64_untied, svuint64_t, uint64_t, ++ z0 = svdup_lane_u64 (z1, x0), ++ z0 = svdup_lane (z1, x0)) ++ ++/* ++** dup_lane_0_u64_tied1: ++** dup z0\.d, z0\.d\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_0_u64_tied1, svuint64_t, ++ z0 = svdup_lane_u64 (z0, 0), ++ z0 = svdup_lane (z0, 0)) ++ ++/* ++** dup_lane_0_u64_untied: ++** dup z0\.d, z1\.d\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_0_u64_untied, svuint64_t, ++ z0 = svdup_lane_u64 (z1, 0), ++ z0 = svdup_lane (z1, 0)) ++ ++/* ++** dup_lane_7_u64: ++** dup z0\.d, z0\.d\[7\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_7_u64, svuint64_t, ++ z0 = svdup_lane_u64 (z0, 7), ++ z0 = svdup_lane (z0, 7)) ++ ++/* ++** dup_lane_8_u64: ++** mov (z[0-9]+\.d), #8 ++** tbl z0\.d, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_8_u64, svuint64_t, ++ z0 = svdup_lane_u64 (z0, 8), ++ z0 = svdup_lane (z0, 8)) ++ ++/* ++** dup_lane_15_u64: ++** mov (z[0-9]+\.d), #15 ++** tbl z0\.d, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_15_u64, svuint64_t, ++ z0 = svdup_lane_u64 (z0, 15), ++ z0 = svdup_lane (z0, 15)) ++ ++/* ++** dup_lane_16_u64: ++** mov (z[0-9]+\.d), #16 ++** tbl z0\.d, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_16_u64, svuint64_t, ++ z0 = svdup_lane_u64 (z0, 16), ++ z0 = svdup_lane (z0, 16)) ++ ++/* ++** dup_lane_31_u64: ++** mov (z[0-9]+\.d), #31 ++** tbl z0\.d, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_31_u64, svuint64_t, ++ z0 = svdup_lane_u64 (z0, 31), ++ z0 = svdup_lane (z0, 31)) ++ ++/* ++** dup_lane_32_u64: ++** mov (z[0-9]+\.d), #32 ++** tbl z0\.d, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_32_u64, svuint64_t, ++ z0 = svdup_lane_u64 (z0, 32), ++ z0 = svdup_lane (z0, 32)) ++ ++/* ++** dup_lane_63_u64: ++** mov (z[0-9]+\.d), #63 ++** tbl z0\.d, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_63_u64, svuint64_t, ++ z0 = svdup_lane_u64 (z0, 63), ++ z0 = svdup_lane (z0, 63)) ++ ++/* ++** dup_lane_64_u64: ++** mov (z[0-9]+\.d), #64 ++** tbl z0\.d, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_64_u64, svuint64_t, ++ z0 = svdup_lane_u64 (z0, 64), ++ z0 = svdup_lane (z0, 64)) ++ ++/* ++** dup_lane_255_u64: ++** 
mov (z[0-9]+\.d), #255 ++** tbl z0\.d, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_255_u64, svuint64_t, ++ z0 = svdup_lane_u64 (z0, 255), ++ z0 = svdup_lane (z0, 255)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_u8.c +new file mode 100644 +index 000000000..25fdf0acb +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_u8.c +@@ -0,0 +1,124 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dup_lane_w0_u8_tied1: ++** mov (z[0-9]+\.b), w0 ++** tbl z0\.b, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_lane_w0_u8_tied1, svuint8_t, uint8_t, ++ z0 = svdup_lane_u8 (z0, x0), ++ z0 = svdup_lane (z0, x0)) ++ ++/* ++** dup_lane_w0_u8_untied: ++** mov (z[0-9]+\.b), w0 ++** tbl z0\.b, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_lane_w0_u8_untied, svuint8_t, uint8_t, ++ z0 = svdup_lane_u8 (z1, x0), ++ z0 = svdup_lane (z1, x0)) ++ ++/* ++** dup_lane_0_u8_tied1: ++** dup z0\.b, z0\.b\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_0_u8_tied1, svuint8_t, ++ z0 = svdup_lane_u8 (z0, 0), ++ z0 = svdup_lane (z0, 0)) ++ ++/* ++** dup_lane_0_u8_untied: ++** dup z0\.b, z1\.b\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_0_u8_untied, svuint8_t, ++ z0 = svdup_lane_u8 (z1, 0), ++ z0 = svdup_lane (z1, 0)) ++ ++/* ++** dup_lane_7_u8: ++** dup z0\.b, z0\.b\[7\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_7_u8, svuint8_t, ++ z0 = svdup_lane_u8 (z0, 7), ++ z0 = svdup_lane (z0, 7)) ++ ++/* ++** dup_lane_8_u8: ++** dup z0\.b, z0\.b\[8\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_8_u8, svuint8_t, ++ z0 = svdup_lane_u8 (z0, 8), ++ z0 = svdup_lane (z0, 8)) ++ ++/* ++** dup_lane_15_u8: ++** dup z0\.b, z0\.b\[15\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_15_u8, svuint8_t, ++ z0 = svdup_lane_u8 (z0, 15), ++ z0 = svdup_lane (z0, 15)) ++ ++/* ++** dup_lane_16_u8: ++** dup z0\.b, z0\.b\[16\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_16_u8, svuint8_t, ++ z0 = svdup_lane_u8 (z0, 16), ++ z0 = svdup_lane (z0, 16)) ++ ++/* ++** dup_lane_31_u8: ++** dup z0\.b, z0\.b\[31\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_31_u8, svuint8_t, ++ z0 = svdup_lane_u8 (z0, 31), ++ z0 = svdup_lane (z0, 31)) ++ ++/* ++** dup_lane_32_u8: ++** dup z0\.b, z0\.b\[32\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_32_u8, svuint8_t, ++ z0 = svdup_lane_u8 (z0, 32), ++ z0 = svdup_lane (z0, 32)) ++ ++/* ++** dup_lane_63_u8: ++** dup z0\.b, z0\.b\[63\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_63_u8, svuint8_t, ++ z0 = svdup_lane_u8 (z0, 63), ++ z0 = svdup_lane (z0, 63)) ++ ++/* ++** dup_lane_64_u8: ++** mov (z[0-9]+\.b), #64 ++** tbl z0\.b, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_64_u8, svuint8_t, ++ z0 = svdup_lane_u8 (z0, 64), ++ z0 = svdup_lane (z0, 64)) ++ ++/* ++** dup_lane_255_u8: ++** mov (z[0-9]+\.b), #-1 ++** tbl z0\.b, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_lane_255_u8, svuint8_t, ++ z0 = svdup_lane_u8 (z0, 255), ++ z0 = svdup_lane (z0, 255)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_s16.c +new file mode 100644 +index 000000000..876f36db7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_s16.c +@@ -0,0 +1,1193 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dup_1_s16: ++** mov z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_1_s16, svint16_t, ++ z0 = svdup_n_s16 (1), ++ z0 = svdup_s16 (1)) ++ ++/* ++** 
dup_127_s16: ++** mov z0\.h, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_127_s16, svint16_t, ++ z0 = svdup_n_s16 (127), ++ z0 = svdup_s16 (127)) ++ ++/* ++** dup_128_s16: ++** mov z0\.h, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_128_s16, svint16_t, ++ z0 = svdup_n_s16 (128), ++ z0 = svdup_s16 (128)) ++ ++/* ++** dup_129_s16: ++** movi v([0-9]+)\.8h, 0x81 ++** dup z0\.q, z\1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_129_s16, svint16_t, ++ z0 = svdup_n_s16 (129), ++ z0 = svdup_s16 (129)) ++ ++/* ++** dup_253_s16: ++** movi v([0-9]+)\.8h, 0xfd ++** dup z0\.q, z\1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_253_s16, svint16_t, ++ z0 = svdup_n_s16 (253), ++ z0 = svdup_s16 (253)) ++ ++/* ++** dup_254_s16: ++** mov z0\.h, #254 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_254_s16, svint16_t, ++ z0 = svdup_n_s16 (254), ++ z0 = svdup_s16 (254)) ++ ++/* ++** dup_255_s16: ++** mov z0\.h, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_255_s16, svint16_t, ++ z0 = svdup_n_s16 (255), ++ z0 = svdup_s16 (255)) ++ ++/* ++** dup_256_s16: ++** mov z0\.h, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_256_s16, svint16_t, ++ z0 = svdup_n_s16 (256), ++ z0 = svdup_s16 (256)) ++ ++/* ++** dup_257_s16: ++** mov z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_257_s16, svint16_t, ++ z0 = svdup_n_s16 (257), ++ z0 = svdup_s16 (257)) ++ ++/* ++** dup_512_s16: ++** mov z0\.h, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_512_s16, svint16_t, ++ z0 = svdup_n_s16 (512), ++ z0 = svdup_s16 (512)) ++ ++/* ++** dup_7f00_s16: ++** mov z0\.h, #32512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7f00_s16, svint16_t, ++ z0 = svdup_n_s16 (0x7f00), ++ z0 = svdup_s16 (0x7f00)) ++ ++/* ++** dup_7f01_s16: ++** mov (w[0-9]+), 32513 ++** mov z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7f01_s16, svint16_t, ++ z0 = svdup_n_s16 (0x7f01), ++ z0 = svdup_s16 (0x7f01)) ++ ++/* ++** dup_7ffd_s16: ++** mov (w[0-9]+), 32765 ++** mov z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7ffd_s16, svint16_t, ++ z0 = svdup_n_s16 (0x7ffd), ++ z0 = svdup_s16 (0x7ffd)) ++ ++/* ++** dup_7ffe_s16: ++** mov z0\.h, #32766 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7ffe_s16, svint16_t, ++ z0 = svdup_n_s16 (0x7ffe), ++ z0 = svdup_s16 (0x7ffe)) ++ ++/* ++** dup_7fff_s16: ++** mov z0\.h, #32767 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7fff_s16, svint16_t, ++ z0 = svdup_n_s16 (0x7fff), ++ z0 = svdup_s16 (0x7fff)) ++ ++/* ++** dup_m1_s16: ++** mov z0\.b, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m1_s16, svint16_t, ++ z0 = svdup_n_s16 (-1), ++ z0 = svdup_s16 (-1)) ++ ++/* ++** dup_m128_s16: ++** mov z0\.h, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m128_s16, svint16_t, ++ z0 = svdup_n_s16 (-128), ++ z0 = svdup_s16 (-128)) ++ ++/* ++** dup_m129_s16: ++** mov z0\.h, #-129 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m129_s16, svint16_t, ++ z0 = svdup_n_s16 (-129), ++ z0 = svdup_s16 (-129)) ++ ++/* ++** dup_m130_s16: ++** mvni v([0-9]+)\.8h, 0x81 ++** dup z0\.q, z\1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m130_s16, svint16_t, ++ z0 = svdup_n_s16 (-130), ++ z0 = svdup_s16 (-130)) ++ ++/* ++** dup_m254_s16: ++** mvni v([0-9]+)\.8h, 0xfd ++** dup z0\.q, z\1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m254_s16, svint16_t, ++ z0 = svdup_n_s16 (-254), ++ z0 = svdup_s16 (-254)) ++ ++/* ++** dup_m255_s16: ++** mov z0\.h, #-255 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m255_s16, svint16_t, ++ z0 = svdup_n_s16 (-255), ++ z0 = svdup_s16 (-255)) ++ ++/* ++** dup_m256_s16: ++** mov z0\.h, #-256 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m256_s16, svint16_t, ++ z0 = svdup_n_s16 (-256), ++ z0 = svdup_s16 (-256)) ++ ++/* ++** dup_m257_s16: ++** mov z0\.h, #-257 
++** ret ++*/ ++TEST_UNIFORM_Z (dup_m257_s16, svint16_t, ++ z0 = svdup_n_s16 (-257), ++ z0 = svdup_s16 (-257)) ++ ++/* ++** dup_m258_s16: ++** mov z0\.b, #-2 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m258_s16, svint16_t, ++ z0 = svdup_n_s16 (-258), ++ z0 = svdup_s16 (-258)) ++ ++/* ++** dup_m259_s16: ++** mov (w[0-9]+), -259 ++** mov z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m259_s16, svint16_t, ++ z0 = svdup_n_s16 (-259), ++ z0 = svdup_s16 (-259)) ++ ++/* ++** dup_m512_s16: ++** mov z0\.h, #-512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m512_s16, svint16_t, ++ z0 = svdup_n_s16 (-512), ++ z0 = svdup_s16 (-512)) ++ ++/* ++** dup_m7f00_s16: ++** mov z0\.h, #-32512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f00_s16, svint16_t, ++ z0 = svdup_n_s16 (-0x7f00), ++ z0 = svdup_s16 (-0x7f00)) ++ ++/* ++** dup_m7f01_s16: ++** mov z0\.h, #-32513 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f01_s16, svint16_t, ++ z0 = svdup_n_s16 (-0x7f01), ++ z0 = svdup_s16 (-0x7f01)) ++ ++/* ++** dup_m7f02_s16: ++** mov (w[0-9]+), -32514 ++** mov z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f02_s16, svint16_t, ++ z0 = svdup_n_s16 (-0x7f02), ++ z0 = svdup_s16 (-0x7f02)) ++ ++/* ++** dup_m7ffe_s16: ++** mov (w[0-9]+), -32766 ++** mov z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7ffe_s16, svint16_t, ++ z0 = svdup_n_s16 (-0x7ffe), ++ z0 = svdup_s16 (-0x7ffe)) ++ ++/* ++** dup_m7fff_s16: ++** mov z0\.h, #-32767 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7fff_s16, svint16_t, ++ z0 = svdup_n_s16 (-0x7fff), ++ z0 = svdup_s16 (-0x7fff)) ++ ++/* ++** dup_m8000_s16: ++** mov z0\.h, #-32768 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m8000_s16, svint16_t, ++ z0 = svdup_n_s16 (-0x8000), ++ z0 = svdup_s16 (-0x8000)) ++ ++/* ++** dup_w0_s16: ++** mov z0\.h, w0 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_w0_s16, svint16_t, int16_t, ++ z0 = svdup_n_s16 (x0), ++ z0 = svdup_s16 (x0)) ++ ++/* ++** dup_1_s16_m: ++** mov z0\.h, p0/m, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_1_s16_m, svint16_t, ++ z0 = svdup_n_s16_m (z0, p0, 1), ++ z0 = svdup_s16_m (z0, p0, 1)) ++ ++/* ++** dup_127_s16_m: ++** mov z0\.h, p0/m, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_127_s16_m, svint16_t, ++ z0 = svdup_n_s16_m (z0, p0, 127), ++ z0 = svdup_s16_m (z0, p0, 127)) ++ ++/* ++** dup_128_s16_m: ++** mov (z[0-9]+\.h), #128 ++** sel z0\.h, p0, \1, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (dup_128_s16_m, svint16_t, ++ z0 = svdup_n_s16_m (z0, p0, 128), ++ z0 = svdup_s16_m (z0, p0, 128)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_129_s16_m, svint16_t, ++ z0 = svdup_n_s16_m (z0, p0, 129), ++ z0 = svdup_s16_m (z0, p0, 129)) ++ ++/* TODO: Bad code and needs fixing. 
*/ ++TEST_UNIFORM_Z (dup_253_s16_m, svint16_t, ++ z0 = svdup_n_s16_m (z0, p0, 253), ++ z0 = svdup_s16_m (z0, p0, 253)) ++ ++/* ++** dup_254_s16_m: ++** mov (z[0-9]+\.h), #254 ++** sel z0\.h, p0, \1, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (dup_254_s16_m, svint16_t, ++ z0 = svdup_n_s16_m (z0, p0, 254), ++ z0 = svdup_s16_m (z0, p0, 254)) ++ ++/* ++** dup_255_s16_m: ++** mov (z[0-9]+\.h), #255 ++** sel z0\.h, p0, \1, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (dup_255_s16_m, svint16_t, ++ z0 = svdup_n_s16_m (z0, p0, 255), ++ z0 = svdup_s16_m (z0, p0, 255)) ++ ++/* ++** dup_256_s16_m: ++** mov z0\.h, p0/m, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_256_s16_m, svint16_t, ++ z0 = svdup_n_s16_m (z0, p0, 256), ++ z0 = svdup_s16_m (z0, p0, 256)) ++ ++/* ++** dup_257_s16_m: ++** mov (z[0-9]+)\.b, #1 ++** sel z0\.h, p0, \1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (dup_257_s16_m, svint16_t, ++ z0 = svdup_n_s16_m (z0, p0, 257), ++ z0 = svdup_s16_m (z0, p0, 257)) ++ ++/* ++** dup_512_s16_m: ++** mov z0\.h, p0/m, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_512_s16_m, svint16_t, ++ z0 = svdup_n_s16_m (z0, p0, 512), ++ z0 = svdup_s16_m (z0, p0, 512)) ++ ++/* ++** dup_7f00_s16_m: ++** mov z0\.h, p0/m, #32512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7f00_s16_m, svint16_t, ++ z0 = svdup_n_s16_m (z0, p0, 0x7f00), ++ z0 = svdup_s16_m (z0, p0, 0x7f00)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_7f01_s16_m, svint16_t, ++ z0 = svdup_n_s16_m (z0, p0, 0x7f01), ++ z0 = svdup_s16_m (z0, p0, 0x7f01)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_7ffd_s16_m, svint16_t, ++ z0 = svdup_n_s16_m (z0, p0, 0x7ffd), ++ z0 = svdup_s16_m (z0, p0, 0x7ffd)) ++ ++/* ++** dup_7ffe_s16_m: ++** mov (z[0-9]+\.h), #32766 ++** sel z0\.h, p0, \1, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7ffe_s16_m, svint16_t, ++ z0 = svdup_n_s16_m (z0, p0, 0x7ffe), ++ z0 = svdup_s16_m (z0, p0, 0x7ffe)) ++ ++/* ++** dup_7fff_s16_m: ++** mov (z[0-9]+\.h), #32767 ++** sel z0\.h, p0, \1, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7fff_s16_m, svint16_t, ++ z0 = svdup_n_s16_m (z0, p0, 0x7fff), ++ z0 = svdup_s16_m (z0, p0, 0x7fff)) ++ ++/* ++** dup_m1_s16_m: ++** mov z0\.h, p0/m, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m1_s16_m, svint16_t, ++ z0 = svdup_n_s16_m (z0, p0, -1), ++ z0 = svdup_s16_m (z0, p0, -1)) ++ ++/* ++** dup_m128_s16_m: ++** mov z0\.h, p0/m, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m128_s16_m, svint16_t, ++ z0 = svdup_n_s16_m (z0, p0, -128), ++ z0 = svdup_s16_m (z0, p0, -128)) ++ ++/* ++** dup_m129_s16_m: ++** mov (z[0-9]+\.h), #-129 ++** sel z0\.h, p0, \1, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m129_s16_m, svint16_t, ++ z0 = svdup_n_s16_m (z0, p0, -129), ++ z0 = svdup_s16_m (z0, p0, -129)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m130_s16_m, svint16_t, ++ z0 = svdup_n_s16_m (z0, p0, -130), ++ z0 = svdup_s16_m (z0, p0, -130)) ++ ++/* TODO: Bad code and needs fixing. 
*/ ++TEST_UNIFORM_Z (dup_m254_s16_m, svint16_t, ++ z0 = svdup_n_s16_m (z0, p0, -254), ++ z0 = svdup_s16_m (z0, p0, -254)) ++ ++/* ++** dup_m255_s16_m: ++** mov (z[0-9]+\.h), #-255 ++** sel z0\.h, p0, \1, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m255_s16_m, svint16_t, ++ z0 = svdup_n_s16_m (z0, p0, -255), ++ z0 = svdup_s16_m (z0, p0, -255)) ++ ++/* ++** dup_m256_s16_m: ++** mov z0\.h, p0/m, #-256 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m256_s16_m, svint16_t, ++ z0 = svdup_n_s16_m (z0, p0, -256), ++ z0 = svdup_s16_m (z0, p0, -256)) ++ ++/* ++** dup_m257_s16_m: ++** mov (z[0-9]+\.h), #-257 ++** sel z0\.h, p0, \1, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m257_s16_m, svint16_t, ++ z0 = svdup_n_s16_m (z0, p0, -257), ++ z0 = svdup_s16_m (z0, p0, -257)) ++ ++/* ++** dup_m258_s16_m: ++** mov (z[0-9]+)\.b, #-2 ++** sel z0\.h, p0, \1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m258_s16_m, svint16_t, ++ z0 = svdup_n_s16_m (z0, p0, -258), ++ z0 = svdup_s16_m (z0, p0, -258)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m259_s16_m, svint16_t, ++ z0 = svdup_n_s16_m (z0, p0, -259), ++ z0 = svdup_s16_m (z0, p0, -259)) ++ ++/* ++** dup_m512_s16_m: ++** mov z0\.h, p0/m, #-512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m512_s16_m, svint16_t, ++ z0 = svdup_n_s16_m (z0, p0, -512), ++ z0 = svdup_s16_m (z0, p0, -512)) ++ ++/* ++** dup_m7f00_s16_m: ++** mov z0\.h, p0/m, #-32512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f00_s16_m, svint16_t, ++ z0 = svdup_n_s16_m (z0, p0, -0x7f00), ++ z0 = svdup_s16_m (z0, p0, -0x7f00)) ++ ++/* ++** dup_m7f01_s16_m: ++** mov (z[0-9]+\.h), #-32513 ++** sel z0\.h, p0, \1, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f01_s16_m, svint16_t, ++ z0 = svdup_n_s16_m (z0, p0, -0x7f01), ++ z0 = svdup_s16_m (z0, p0, -0x7f01)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m7f02_s16_m, svint16_t, ++ z0 = svdup_n_s16_m (z0, p0, -0x7f02), ++ z0 = svdup_s16_m (z0, p0, -0x7f02)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m7ffe_s16_m, svint16_t, ++ z0 = svdup_n_s16_m (z0, p0, -0x7ffe), ++ z0 = svdup_s16_m (z0, p0, -0x7ffe)) ++ ++/* ++** dup_m7fff_s16_m: ++** mov (z[0-9]+\.h), #-32767 ++** sel z0\.h, p0, \1, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7fff_s16_m, svint16_t, ++ z0 = svdup_n_s16_m (z0, p0, -0x7fff), ++ z0 = svdup_s16_m (z0, p0, -0x7fff)) ++ ++/* ++** dup_m8000_s16_m: ++** mov z0\.h, p0/m, #-32768 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m8000_s16_m, svint16_t, ++ z0 = svdup_n_s16_m (z0, p0, -0x8000), ++ z0 = svdup_s16_m (z0, p0, -0x8000)) ++ ++/* ++** dup_0_s16_m: ++** mov z0\.h, p0/m, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_0_s16_m, svint16_t, ++ z0 = svdup_n_s16_m (z0, p0, 0), ++ z0 = svdup_s16_m (z0, p0, 0)) ++ ++/* ++** dup_w0_s16_m: ++** movprfx z0, z1 ++** mov z0\.h, p0/m, w0 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_w0_s16_m, svint16_t, int16_t, ++ z0 = svdup_n_s16_m (z1, p0, x0), ++ z0 = svdup_s16_m (z1, p0, x0)) ++ ++/* ++** dup_1_s16_z: ++** mov z0\.h, p0/z, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_1_s16_z, svint16_t, ++ z0 = svdup_n_s16_z (p0, 1), ++ z0 = svdup_s16_z (p0, 1)) ++ ++/* ++** dup_127_s16_z: ++** mov z0\.h, p0/z, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_127_s16_z, svint16_t, ++ z0 = svdup_n_s16_z (p0, 127), ++ z0 = svdup_s16_z (p0, 127)) ++ ++/* ++** dup_128_s16_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.h), #128 ++** sel z0\.h, p0, \2, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (dup_128_s16_z, svint16_t, ++ z0 = svdup_n_s16_z (p0, 128), ++ z0 = svdup_s16_z (p0, 128)) ++ ++/* TODO: Bad code and needs fixing. 
*/ ++TEST_UNIFORM_Z (dup_129_s16_z, svint16_t, ++ z0 = svdup_n_s16_z (p0, 129), ++ z0 = svdup_s16_z (p0, 129)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_253_s16_z, svint16_t, ++ z0 = svdup_n_s16_z (p0, 253), ++ z0 = svdup_s16_z (p0, 253)) ++ ++/* ++** dup_254_s16_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.h), #254 ++** sel z0\.h, p0, \2, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (dup_254_s16_z, svint16_t, ++ z0 = svdup_n_s16_z (p0, 254), ++ z0 = svdup_s16_z (p0, 254)) ++ ++/* ++** dup_255_s16_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.h), #255 ++** sel z0\.h, p0, \2, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (dup_255_s16_z, svint16_t, ++ z0 = svdup_n_s16_z (p0, 255), ++ z0 = svdup_s16_z (p0, 255)) ++ ++/* ++** dup_256_s16_z: ++** mov z0\.h, p0/z, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_256_s16_z, svint16_t, ++ z0 = svdup_n_s16_z (p0, 256), ++ z0 = svdup_s16_z (p0, 256)) ++ ++/* ++** dup_257_s16_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+)\.b, #1 ++** sel z0\.h, p0, \2\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (dup_257_s16_z, svint16_t, ++ z0 = svdup_n_s16_z (p0, 257), ++ z0 = svdup_s16_z (p0, 257)) ++ ++/* ++** dup_512_s16_z: ++** mov z0\.h, p0/z, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_512_s16_z, svint16_t, ++ z0 = svdup_n_s16_z (p0, 512), ++ z0 = svdup_s16_z (p0, 512)) ++ ++/* ++** dup_7f00_s16_z: ++** mov z0\.h, p0/z, #32512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7f00_s16_z, svint16_t, ++ z0 = svdup_n_s16_z (p0, 0x7f00), ++ z0 = svdup_s16_z (p0, 0x7f00)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_7f01_s16_z, svint16_t, ++ z0 = svdup_n_s16_z (p0, 0x7f01), ++ z0 = svdup_s16_z (p0, 0x7f01)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_7ffd_s16_z, svint16_t, ++ z0 = svdup_n_s16_z (p0, 0x7ffd), ++ z0 = svdup_s16_z (p0, 0x7ffd)) ++ ++/* ++** dup_7ffe_s16_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.h), #32766 ++** sel z0\.h, p0, \2, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7ffe_s16_z, svint16_t, ++ z0 = svdup_n_s16_z (p0, 0x7ffe), ++ z0 = svdup_s16_z (p0, 0x7ffe)) ++ ++/* ++** dup_7fff_s16_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.h), #32767 ++** sel z0\.h, p0, \2, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7fff_s16_z, svint16_t, ++ z0 = svdup_n_s16_z (p0, 0x7fff), ++ z0 = svdup_s16_z (p0, 0x7fff)) ++ ++/* ++** dup_m1_s16_z: ++** mov z0\.h, p0/z, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m1_s16_z, svint16_t, ++ z0 = svdup_n_s16_z (p0, -1), ++ z0 = svdup_s16_z (p0, -1)) ++ ++/* ++** dup_m128_s16_z: ++** mov z0\.h, p0/z, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m128_s16_z, svint16_t, ++ z0 = svdup_n_s16_z (p0, -128), ++ z0 = svdup_s16_z (p0, -128)) ++ ++/* ++** dup_m129_s16_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.h), #-129 ++** sel z0\.h, p0, \2, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m129_s16_z, svint16_t, ++ z0 = svdup_n_s16_z (p0, -129), ++ z0 = svdup_s16_z (p0, -129)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m130_s16_z, svint16_t, ++ z0 = svdup_n_s16_z (p0, -130), ++ z0 = svdup_s16_z (p0, -130)) ++ ++/* TODO: Bad code and needs fixing. 
*/ ++TEST_UNIFORM_Z (dup_m254_s16_z, svint16_t, ++ z0 = svdup_n_s16_z (p0, -254), ++ z0 = svdup_s16_z (p0, -254)) ++ ++/* ++** dup_m255_s16_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.h), #-255 ++** sel z0\.h, p0, \2, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m255_s16_z, svint16_t, ++ z0 = svdup_n_s16_z (p0, -255), ++ z0 = svdup_s16_z (p0, -255)) ++ ++/* ++** dup_m256_s16_z: ++** mov z0\.h, p0/z, #-256 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m256_s16_z, svint16_t, ++ z0 = svdup_n_s16_z (p0, -256), ++ z0 = svdup_s16_z (p0, -256)) ++ ++/* ++** dup_m257_s16_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.h), #-257 ++** sel z0\.h, p0, \2, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m257_s16_z, svint16_t, ++ z0 = svdup_n_s16_z (p0, -257), ++ z0 = svdup_s16_z (p0, -257)) ++ ++/* ++** dup_m258_s16_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+)\.b, #-2 ++** sel z0\.h, p0, \2\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m258_s16_z, svint16_t, ++ z0 = svdup_n_s16_z (p0, -258), ++ z0 = svdup_s16_z (p0, -258)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m259_s16_z, svint16_t, ++ z0 = svdup_n_s16_z (p0, -259), ++ z0 = svdup_s16_z (p0, -259)) ++ ++/* ++** dup_m512_s16_z: ++** mov z0\.h, p0/z, #-512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m512_s16_z, svint16_t, ++ z0 = svdup_n_s16_z (p0, -512), ++ z0 = svdup_s16_z (p0, -512)) ++ ++/* ++** dup_m7f00_s16_z: ++** mov z0\.h, p0/z, #-32512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f00_s16_z, svint16_t, ++ z0 = svdup_n_s16_z (p0, -0x7f00), ++ z0 = svdup_s16_z (p0, -0x7f00)) ++ ++/* ++** dup_m7f01_s16_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.h), #-32513 ++** sel z0\.h, p0, \2, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f01_s16_z, svint16_t, ++ z0 = svdup_n_s16_z (p0, -0x7f01), ++ z0 = svdup_s16_z (p0, -0x7f01)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m7f02_s16_z, svint16_t, ++ z0 = svdup_n_s16_z (p0, -0x7f02), ++ z0 = svdup_s16_z (p0, -0x7f02)) ++ ++/* TODO: Bad code and needs fixing. 
*/ ++TEST_UNIFORM_Z (dup_m7ffe_s16_z, svint16_t, ++ z0 = svdup_n_s16_z (p0, -0x7ffe), ++ z0 = svdup_s16_z (p0, -0x7ffe)) ++ ++/* ++** dup_m7fff_s16_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.h), #-32767 ++** sel z0\.h, p0, \2, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7fff_s16_z, svint16_t, ++ z0 = svdup_n_s16_z (p0, -0x7fff), ++ z0 = svdup_s16_z (p0, -0x7fff)) ++ ++/* ++** dup_m8000_s16_z: ++** mov z0\.h, p0/z, #-32768 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m8000_s16_z, svint16_t, ++ z0 = svdup_n_s16_z (p0, -0x8000), ++ z0 = svdup_s16_z (p0, -0x8000)) ++ ++/* ++** dup_0_s16_z: ++** mov z0\.h, p0/z, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_0_s16_z, svint16_t, ++ z0 = svdup_n_s16_z (p0, 0), ++ z0 = svdup_s16_z (p0, 0)) ++ ++/* ++** dup_w0_s16_z: ++** movprfx z0\.h, p0/z, z0\.h ++** mov z0\.h, p0/m, w0 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_w0_s16_z, svint16_t, int16_t, ++ z0 = svdup_n_s16_z (p0, x0), ++ z0 = svdup_s16_z (p0, x0)) ++ ++/* ++** dup_1_s16_x: ++** mov z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_1_s16_x, svint16_t, ++ z0 = svdup_n_s16_x (p0, 1), ++ z0 = svdup_s16_x (p0, 1)) ++ ++/* ++** dup_127_s16_x: ++** mov z0\.h, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_127_s16_x, svint16_t, ++ z0 = svdup_n_s16_x (p0, 127), ++ z0 = svdup_s16_x (p0, 127)) ++ ++/* ++** dup_128_s16_x: ++** mov z0\.h, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_128_s16_x, svint16_t, ++ z0 = svdup_n_s16_x (p0, 128), ++ z0 = svdup_s16_x (p0, 128)) ++ ++/* ++** dup_129_s16_x: ++** movi v([0-9]+)\.8h, 0x81 ++** dup z0\.q, z\1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_129_s16_x, svint16_t, ++ z0 = svdup_n_s16_x (p0, 129), ++ z0 = svdup_s16_x (p0, 129)) ++ ++/* ++** dup_253_s16_x: ++** movi v([0-9]+)\.8h, 0xfd ++** dup z0\.q, z\1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_253_s16_x, svint16_t, ++ z0 = svdup_n_s16_x (p0, 253), ++ z0 = svdup_s16_x (p0, 253)) ++ ++/* ++** dup_254_s16_x: ++** mov z0\.h, #254 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_254_s16_x, svint16_t, ++ z0 = svdup_n_s16_x (p0, 254), ++ z0 = svdup_s16_x (p0, 254)) ++ ++/* ++** dup_255_s16_x: ++** mov z0\.h, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_255_s16_x, svint16_t, ++ z0 = svdup_n_s16_x (p0, 255), ++ z0 = svdup_s16_x (p0, 255)) ++ ++/* ++** dup_256_s16_x: ++** mov z0\.h, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_256_s16_x, svint16_t, ++ z0 = svdup_n_s16_x (p0, 256), ++ z0 = svdup_s16_x (p0, 256)) ++ ++/* ++** dup_257_s16_x: ++** mov z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_257_s16_x, svint16_t, ++ z0 = svdup_n_s16_x (p0, 257), ++ z0 = svdup_s16_x (p0, 257)) ++ ++/* ++** dup_512_s16_x: ++** mov z0\.h, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_512_s16_x, svint16_t, ++ z0 = svdup_n_s16_x (p0, 512), ++ z0 = svdup_s16_x (p0, 512)) ++ ++/* ++** dup_7f00_s16_x: ++** mov z0\.h, #32512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7f00_s16_x, svint16_t, ++ z0 = svdup_n_s16_x (p0, 0x7f00), ++ z0 = svdup_s16_x (p0, 0x7f00)) ++ ++/* ++** dup_7f01_s16_x: ++** mov (w[0-9]+), 32513 ++** mov z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7f01_s16_x, svint16_t, ++ z0 = svdup_n_s16_x (p0, 0x7f01), ++ z0 = svdup_s16_x (p0, 0x7f01)) ++ ++/* ++** dup_7ffd_s16_x: ++** mov (w[0-9]+), 32765 ++** mov z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7ffd_s16_x, svint16_t, ++ z0 = svdup_n_s16_x (p0, 0x7ffd), ++ z0 = svdup_s16_x (p0, 0x7ffd)) ++ ++/* ++** dup_7ffe_s16_x: ++** mov z0\.h, #32766 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7ffe_s16_x, svint16_t, ++ z0 = svdup_n_s16_x (p0, 0x7ffe), ++ z0 = svdup_s16_x (p0, 0x7ffe)) ++ ++/* ++** dup_7fff_s16_x: ++** mov z0\.h, #32767 ++** ret ++*/ 
++TEST_UNIFORM_Z (dup_7fff_s16_x, svint16_t, ++ z0 = svdup_n_s16_x (p0, 0x7fff), ++ z0 = svdup_s16_x (p0, 0x7fff)) ++ ++/* ++** dup_m1_s16_x: ++** mov z0\.b, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m1_s16_x, svint16_t, ++ z0 = svdup_n_s16_x (p0, -1), ++ z0 = svdup_s16_x (p0, -1)) ++ ++/* ++** dup_m128_s16_x: ++** mov z0\.h, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m128_s16_x, svint16_t, ++ z0 = svdup_n_s16_x (p0, -128), ++ z0 = svdup_s16_x (p0, -128)) ++ ++/* ++** dup_m129_s16_x: ++** mov z0\.h, #-129 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m129_s16_x, svint16_t, ++ z0 = svdup_n_s16_x (p0, -129), ++ z0 = svdup_s16_x (p0, -129)) ++ ++/* ++** dup_m130_s16_x: ++** mvni v([0-9]+)\.8h, 0x81 ++** dup z0\.q, z\1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m130_s16_x, svint16_t, ++ z0 = svdup_n_s16_x (p0, -130), ++ z0 = svdup_s16_x (p0, -130)) ++ ++/* ++** dup_m254_s16_x: ++** mvni v([0-9]+)\.8h, 0xfd ++** dup z0\.q, z\1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m254_s16_x, svint16_t, ++ z0 = svdup_n_s16_x (p0, -254), ++ z0 = svdup_s16_x (p0, -254)) ++ ++/* ++** dup_m255_s16_x: ++** mov z0\.h, #-255 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m255_s16_x, svint16_t, ++ z0 = svdup_n_s16_x (p0, -255), ++ z0 = svdup_s16_x (p0, -255)) ++ ++/* ++** dup_m256_s16_x: ++** mov z0\.h, #-256 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m256_s16_x, svint16_t, ++ z0 = svdup_n_s16_x (p0, -256), ++ z0 = svdup_s16_x (p0, -256)) ++ ++/* ++** dup_m257_s16_x: ++** mov z0\.h, #-257 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m257_s16_x, svint16_t, ++ z0 = svdup_n_s16_x (p0, -257), ++ z0 = svdup_s16_x (p0, -257)) ++ ++/* ++** dup_m258_s16_x: ++** mov z0\.b, #-2 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m258_s16_x, svint16_t, ++ z0 = svdup_n_s16_x (p0, -258), ++ z0 = svdup_s16_x (p0, -258)) ++ ++/* ++** dup_m259_s16_x: ++** mov (w[0-9]+), -259 ++** mov z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m259_s16_x, svint16_t, ++ z0 = svdup_n_s16_x (p0, -259), ++ z0 = svdup_s16_x (p0, -259)) ++ ++/* ++** dup_m512_s16_x: ++** mov z0\.h, #-512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m512_s16_x, svint16_t, ++ z0 = svdup_n_s16_x (p0, -512), ++ z0 = svdup_s16_x (p0, -512)) ++ ++/* ++** dup_m7f00_s16_x: ++** mov z0\.h, #-32512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f00_s16_x, svint16_t, ++ z0 = svdup_n_s16_x (p0, -0x7f00), ++ z0 = svdup_s16_x (p0, -0x7f00)) ++ ++/* ++** dup_m7f01_s16_x: ++** mov z0\.h, #-32513 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f01_s16_x, svint16_t, ++ z0 = svdup_n_s16_x (p0, -0x7f01), ++ z0 = svdup_s16_x (p0, -0x7f01)) ++ ++/* ++** dup_m7f02_s16_x: ++** mov (w[0-9]+), -32514 ++** mov z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f02_s16_x, svint16_t, ++ z0 = svdup_n_s16_x (p0, -0x7f02), ++ z0 = svdup_s16_x (p0, -0x7f02)) ++ ++/* ++** dup_m7ffe_s16_x: ++** mov (w[0-9]+), -32766 ++** mov z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7ffe_s16_x, svint16_t, ++ z0 = svdup_n_s16_x (p0, -0x7ffe), ++ z0 = svdup_s16_x (p0, -0x7ffe)) ++ ++/* ++** dup_m7fff_s16_x: ++** mov z0\.h, #-32767 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7fff_s16_x, svint16_t, ++ z0 = svdup_n_s16_x (p0, -0x7fff), ++ z0 = svdup_s16_x (p0, -0x7fff)) ++ ++/* ++** dup_m8000_s16_x: ++** mov z0\.h, #-32768 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m8000_s16_x, svint16_t, ++ z0 = svdup_n_s16_x (p0, -0x8000), ++ z0 = svdup_s16_x (p0, -0x8000)) ++ ++/* ++** dup_w0_s16_x: ++** mov z0\.h, w0 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_w0_s16_x, svint16_t, int16_t, ++ z0 = svdup_n_s16_x (p0, x0), ++ z0 = svdup_s16_x (p0, x0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_s32.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_s32.c +new file mode 100644 +index 000000000..0b396dbeb +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_s32.c +@@ -0,0 +1,1175 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dup_1_s32: ++** mov z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_1_s32, svint32_t, ++ z0 = svdup_n_s32 (1), ++ z0 = svdup_s32 (1)) ++ ++/* ++** dup_127_s32: ++** mov z0\.s, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_127_s32, svint32_t, ++ z0 = svdup_n_s32 (127), ++ z0 = svdup_s32 (127)) ++ ++/* ++** dup_128_s32: ++** mov z0\.s, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_128_s32, svint32_t, ++ z0 = svdup_n_s32 (128), ++ z0 = svdup_s32 (128)) ++ ++/* ++** dup_129_s32: ++** movi v([0-9]+)\.4s, 0x81 ++** dup z0\.q, z\1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_129_s32, svint32_t, ++ z0 = svdup_n_s32 (129), ++ z0 = svdup_s32 (129)) ++ ++/* ++** dup_253_s32: ++** movi v([0-9]+)\.4s, 0xfd ++** dup z0\.q, z\1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_253_s32, svint32_t, ++ z0 = svdup_n_s32 (253), ++ z0 = svdup_s32 (253)) ++ ++/* ++** dup_254_s32: ++** mov z0\.s, #254 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_254_s32, svint32_t, ++ z0 = svdup_n_s32 (254), ++ z0 = svdup_s32 (254)) ++ ++/* ++** dup_255_s32: ++** mov z0\.s, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_255_s32, svint32_t, ++ z0 = svdup_n_s32 (255), ++ z0 = svdup_s32 (255)) ++ ++/* ++** dup_256_s32: ++** mov z0\.s, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_256_s32, svint32_t, ++ z0 = svdup_n_s32 (256), ++ z0 = svdup_s32 (256)) ++ ++/* ++** dup_257_s32: ++** mov (w[0-9]+), 257 ++** mov z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_257_s32, svint32_t, ++ z0 = svdup_n_s32 (257), ++ z0 = svdup_s32 (257)) ++ ++/* ++** dup_512_s32: ++** mov z0\.s, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_512_s32, svint32_t, ++ z0 = svdup_n_s32 (512), ++ z0 = svdup_s32 (512)) ++ ++/* ++** dup_7f00_s32: ++** mov z0\.s, #32512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7f00_s32, svint32_t, ++ z0 = svdup_n_s32 (0x7f00), ++ z0 = svdup_s32 (0x7f00)) ++ ++/* ++** dup_7f01_s32: ++** mov (w[0-9]+), 32513 ++** mov z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7f01_s32, svint32_t, ++ z0 = svdup_n_s32 (0x7f01), ++ z0 = svdup_s32 (0x7f01)) ++ ++/* ++** dup_7ffd_s32: ++** mov (w[0-9]+), 32765 ++** mov z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7ffd_s32, svint32_t, ++ z0 = svdup_n_s32 (0x7ffd), ++ z0 = svdup_s32 (0x7ffd)) ++ ++/* ++** dup_7ffe_s32: ++** mov z0\.s, #32766 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7ffe_s32, svint32_t, ++ z0 = svdup_n_s32 (0x7ffe), ++ z0 = svdup_s32 (0x7ffe)) ++ ++/* ++** dup_7fff_s32: ++** mov z0\.s, #32767 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7fff_s32, svint32_t, ++ z0 = svdup_n_s32 (0x7fff), ++ z0 = svdup_s32 (0x7fff)) ++ ++/* ++** dup_m1_s32: ++** mov z0\.b, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m1_s32, svint32_t, ++ z0 = svdup_n_s32 (-1), ++ z0 = svdup_s32 (-1)) ++ ++/* ++** dup_m128_s32: ++** mov z0\.s, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m128_s32, svint32_t, ++ z0 = svdup_n_s32 (-128), ++ z0 = svdup_s32 (-128)) ++ ++/* ++** dup_m129_s32: ++** mov z0\.s, #-129 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m129_s32, svint32_t, ++ z0 = svdup_n_s32 (-129), ++ z0 = svdup_s32 (-129)) ++ ++/* ++** dup_m130_s32: ++** mvni v([0-9]+)\.4s, 0x81 ++** dup z0\.q, z\1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m130_s32, svint32_t, ++ z0 = svdup_n_s32 (-130), ++ z0 = svdup_s32 (-130)) ++ ++/* ++** dup_m254_s32: ++** mvni v([0-9]+)\.4s, 0xfd ++** dup z0\.q, 
z\1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m254_s32, svint32_t, ++ z0 = svdup_n_s32 (-254), ++ z0 = svdup_s32 (-254)) ++ ++/* ++** dup_m255_s32: ++** mov z0\.s, #-255 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m255_s32, svint32_t, ++ z0 = svdup_n_s32 (-255), ++ z0 = svdup_s32 (-255)) ++ ++/* ++** dup_m256_s32: ++** mov z0\.s, #-256 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m256_s32, svint32_t, ++ z0 = svdup_n_s32 (-256), ++ z0 = svdup_s32 (-256)) ++ ++/* ++** dup_m257_s32: ++** mov z0\.s, #-257 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m257_s32, svint32_t, ++ z0 = svdup_n_s32 (-257), ++ z0 = svdup_s32 (-257)) ++ ++/* ++** dup_m258_s32: ++** mov (w[0-9]+), -258 ++** mov z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m258_s32, svint32_t, ++ z0 = svdup_n_s32 (-258), ++ z0 = svdup_s32 (-258)) ++ ++/* ++** dup_m259_s32: ++** mov (w[0-9]+), -259 ++** mov z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m259_s32, svint32_t, ++ z0 = svdup_n_s32 (-259), ++ z0 = svdup_s32 (-259)) ++ ++/* ++** dup_m512_s32: ++** mov z0\.s, #-512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m512_s32, svint32_t, ++ z0 = svdup_n_s32 (-512), ++ z0 = svdup_s32 (-512)) ++ ++/* ++** dup_m7f00_s32: ++** mov z0\.s, #-32512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f00_s32, svint32_t, ++ z0 = svdup_n_s32 (-0x7f00), ++ z0 = svdup_s32 (-0x7f00)) ++ ++/* ++** dup_m7f01_s32: ++** mov z0\.s, #-32513 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f01_s32, svint32_t, ++ z0 = svdup_n_s32 (-0x7f01), ++ z0 = svdup_s32 (-0x7f01)) ++ ++/* ++** dup_m7f02_s32: ++** mov (w[0-9]+), -32514 ++** mov z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f02_s32, svint32_t, ++ z0 = svdup_n_s32 (-0x7f02), ++ z0 = svdup_s32 (-0x7f02)) ++ ++/* ++** dup_m7ffe_s32: ++** mov (w[0-9]+), -32766 ++** mov z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7ffe_s32, svint32_t, ++ z0 = svdup_n_s32 (-0x7ffe), ++ z0 = svdup_s32 (-0x7ffe)) ++ ++/* ++** dup_m7fff_s32: ++** mov z0\.s, #-32767 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7fff_s32, svint32_t, ++ z0 = svdup_n_s32 (-0x7fff), ++ z0 = svdup_s32 (-0x7fff)) ++ ++/* ++** dup_m8000_s32: ++** mov z0\.s, #-32768 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m8000_s32, svint32_t, ++ z0 = svdup_n_s32 (-0x8000), ++ z0 = svdup_s32 (-0x8000)) ++ ++/* ++** dup_w0_s32: ++** mov z0\.s, w0 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_w0_s32, svint32_t, int32_t, ++ z0 = svdup_n_s32 (x0), ++ z0 = svdup_s32 (x0)) ++ ++/* ++** dup_1_s32_m: ++** mov z0\.s, p0/m, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_1_s32_m, svint32_t, ++ z0 = svdup_n_s32_m (z0, p0, 1), ++ z0 = svdup_s32_m (z0, p0, 1)) ++ ++/* ++** dup_127_s32_m: ++** mov z0\.s, p0/m, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_127_s32_m, svint32_t, ++ z0 = svdup_n_s32_m (z0, p0, 127), ++ z0 = svdup_s32_m (z0, p0, 127)) ++ ++/* ++** dup_128_s32_m: ++** mov (z[0-9]+\.s), #128 ++** sel z0\.s, p0, \1, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (dup_128_s32_m, svint32_t, ++ z0 = svdup_n_s32_m (z0, p0, 128), ++ z0 = svdup_s32_m (z0, p0, 128)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_129_s32_m, svint32_t, ++ z0 = svdup_n_s32_m (z0, p0, 129), ++ z0 = svdup_s32_m (z0, p0, 129)) ++ ++/* TODO: Bad code and needs fixing. 
*/ ++TEST_UNIFORM_Z (dup_253_s32_m, svint32_t, ++ z0 = svdup_n_s32_m (z0, p0, 253), ++ z0 = svdup_s32_m (z0, p0, 253)) ++ ++/* ++** dup_254_s32_m: ++** mov (z[0-9]+\.s), #254 ++** sel z0\.s, p0, \1, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (dup_254_s32_m, svint32_t, ++ z0 = svdup_n_s32_m (z0, p0, 254), ++ z0 = svdup_s32_m (z0, p0, 254)) ++ ++/* ++** dup_255_s32_m: ++** mov (z[0-9]+\.s), #255 ++** sel z0\.s, p0, \1, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (dup_255_s32_m, svint32_t, ++ z0 = svdup_n_s32_m (z0, p0, 255), ++ z0 = svdup_s32_m (z0, p0, 255)) ++ ++/* ++** dup_256_s32_m: ++** mov z0\.s, p0/m, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_256_s32_m, svint32_t, ++ z0 = svdup_n_s32_m (z0, p0, 256), ++ z0 = svdup_s32_m (z0, p0, 256)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_257_s32_m, svint32_t, ++ z0 = svdup_n_s32_m (z0, p0, 257), ++ z0 = svdup_s32_m (z0, p0, 257)) ++ ++/* ++** dup_512_s32_m: ++** mov z0\.s, p0/m, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_512_s32_m, svint32_t, ++ z0 = svdup_n_s32_m (z0, p0, 512), ++ z0 = svdup_s32_m (z0, p0, 512)) ++ ++/* ++** dup_7f00_s32_m: ++** mov z0\.s, p0/m, #32512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7f00_s32_m, svint32_t, ++ z0 = svdup_n_s32_m (z0, p0, 0x7f00), ++ z0 = svdup_s32_m (z0, p0, 0x7f00)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_7f01_s32_m, svint32_t, ++ z0 = svdup_n_s32_m (z0, p0, 0x7f01), ++ z0 = svdup_s32_m (z0, p0, 0x7f01)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_7ffd_s32_m, svint32_t, ++ z0 = svdup_n_s32_m (z0, p0, 0x7ffd), ++ z0 = svdup_s32_m (z0, p0, 0x7ffd)) ++ ++/* ++** dup_7ffe_s32_m: ++** mov (z[0-9]+\.s), #32766 ++** sel z0\.s, p0, \1, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7ffe_s32_m, svint32_t, ++ z0 = svdup_n_s32_m (z0, p0, 0x7ffe), ++ z0 = svdup_s32_m (z0, p0, 0x7ffe)) ++ ++/* ++** dup_7fff_s32_m: ++** mov (z[0-9]+\.s), #32767 ++** sel z0\.s, p0, \1, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7fff_s32_m, svint32_t, ++ z0 = svdup_n_s32_m (z0, p0, 0x7fff), ++ z0 = svdup_s32_m (z0, p0, 0x7fff)) ++ ++/* ++** dup_m1_s32_m: ++** mov z0\.s, p0/m, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m1_s32_m, svint32_t, ++ z0 = svdup_n_s32_m (z0, p0, -1), ++ z0 = svdup_s32_m (z0, p0, -1)) ++ ++/* ++** dup_m128_s32_m: ++** mov z0\.s, p0/m, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m128_s32_m, svint32_t, ++ z0 = svdup_n_s32_m (z0, p0, -128), ++ z0 = svdup_s32_m (z0, p0, -128)) ++ ++/* ++** dup_m129_s32_m: ++** mov (z[0-9]+\.s), #-129 ++** sel z0\.s, p0, \1, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m129_s32_m, svint32_t, ++ z0 = svdup_n_s32_m (z0, p0, -129), ++ z0 = svdup_s32_m (z0, p0, -129)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m130_s32_m, svint32_t, ++ z0 = svdup_n_s32_m (z0, p0, -130), ++ z0 = svdup_s32_m (z0, p0, -130)) ++ ++/* TODO: Bad code and needs fixing. 
*/ ++TEST_UNIFORM_Z (dup_m254_s32_m, svint32_t, ++ z0 = svdup_n_s32_m (z0, p0, -254), ++ z0 = svdup_s32_m (z0, p0, -254)) ++ ++/* ++** dup_m255_s32_m: ++** mov (z[0-9]+\.s), #-255 ++** sel z0\.s, p0, \1, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m255_s32_m, svint32_t, ++ z0 = svdup_n_s32_m (z0, p0, -255), ++ z0 = svdup_s32_m (z0, p0, -255)) ++ ++/* ++** dup_m256_s32_m: ++** mov z0\.s, p0/m, #-256 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m256_s32_m, svint32_t, ++ z0 = svdup_n_s32_m (z0, p0, -256), ++ z0 = svdup_s32_m (z0, p0, -256)) ++ ++/* ++** dup_m257_s32_m: ++** mov (z[0-9]+\.s), #-257 ++** sel z0\.s, p0, \1, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m257_s32_m, svint32_t, ++ z0 = svdup_n_s32_m (z0, p0, -257), ++ z0 = svdup_s32_m (z0, p0, -257)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m258_s32_m, svint32_t, ++ z0 = svdup_n_s32_m (z0, p0, -258), ++ z0 = svdup_s32_m (z0, p0, -258)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m259_s32_m, svint32_t, ++ z0 = svdup_n_s32_m (z0, p0, -259), ++ z0 = svdup_s32_m (z0, p0, -259)) ++ ++/* ++** dup_m512_s32_m: ++** mov z0\.s, p0/m, #-512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m512_s32_m, svint32_t, ++ z0 = svdup_n_s32_m (z0, p0, -512), ++ z0 = svdup_s32_m (z0, p0, -512)) ++ ++/* ++** dup_m7f00_s32_m: ++** mov z0\.s, p0/m, #-32512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f00_s32_m, svint32_t, ++ z0 = svdup_n_s32_m (z0, p0, -0x7f00), ++ z0 = svdup_s32_m (z0, p0, -0x7f00)) ++ ++/* ++** dup_m7f01_s32_m: ++** mov (z[0-9]+\.s), #-32513 ++** sel z0\.s, p0, \1, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f01_s32_m, svint32_t, ++ z0 = svdup_n_s32_m (z0, p0, -0x7f01), ++ z0 = svdup_s32_m (z0, p0, -0x7f01)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m7f02_s32_m, svint32_t, ++ z0 = svdup_n_s32_m (z0, p0, -0x7f02), ++ z0 = svdup_s32_m (z0, p0, -0x7f02)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m7ffe_s32_m, svint32_t, ++ z0 = svdup_n_s32_m (z0, p0, -0x7ffe), ++ z0 = svdup_s32_m (z0, p0, -0x7ffe)) ++ ++/* ++** dup_m7fff_s32_m: ++** mov (z[0-9]+\.s), #-32767 ++** sel z0\.s, p0, \1, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7fff_s32_m, svint32_t, ++ z0 = svdup_n_s32_m (z0, p0, -0x7fff), ++ z0 = svdup_s32_m (z0, p0, -0x7fff)) ++ ++/* ++** dup_m8000_s32_m: ++** mov z0\.s, p0/m, #-32768 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m8000_s32_m, svint32_t, ++ z0 = svdup_n_s32_m (z0, p0, -0x8000), ++ z0 = svdup_s32_m (z0, p0, -0x8000)) ++ ++/* ++** dup_0_s32_m: ++** mov z0\.s, p0/m, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_0_s32_m, svint32_t, ++ z0 = svdup_n_s32_m (z0, p0, 0), ++ z0 = svdup_s32_m (z0, p0, 0)) ++ ++/* ++** dup_w0_s32_m: ++** movprfx z0, z1 ++** mov z0\.s, p0/m, w0 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_w0_s32_m, svint32_t, int32_t, ++ z0 = svdup_n_s32_m (z1, p0, x0), ++ z0 = svdup_s32_m (z1, p0, x0)) ++ ++/* ++** dup_1_s32_z: ++** mov z0\.s, p0/z, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_1_s32_z, svint32_t, ++ z0 = svdup_n_s32_z (p0, 1), ++ z0 = svdup_s32_z (p0, 1)) ++ ++/* ++** dup_127_s32_z: ++** mov z0\.s, p0/z, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_127_s32_z, svint32_t, ++ z0 = svdup_n_s32_z (p0, 127), ++ z0 = svdup_s32_z (p0, 127)) ++ ++/* ++** dup_128_s32_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.s), #128 ++** sel z0\.s, p0, \2, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (dup_128_s32_z, svint32_t, ++ z0 = svdup_n_s32_z (p0, 128), ++ z0 = svdup_s32_z (p0, 128)) ++ ++/* TODO: Bad code and needs fixing. 
*/ ++TEST_UNIFORM_Z (dup_129_s32_z, svint32_t, ++ z0 = svdup_n_s32_z (p0, 129), ++ z0 = svdup_s32_z (p0, 129)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_253_s32_z, svint32_t, ++ z0 = svdup_n_s32_z (p0, 253), ++ z0 = svdup_s32_z (p0, 253)) ++ ++/* ++** dup_254_s32_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.s), #254 ++** sel z0\.s, p0, \2, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (dup_254_s32_z, svint32_t, ++ z0 = svdup_n_s32_z (p0, 254), ++ z0 = svdup_s32_z (p0, 254)) ++ ++/* ++** dup_255_s32_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.s), #255 ++** sel z0\.s, p0, \2, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (dup_255_s32_z, svint32_t, ++ z0 = svdup_n_s32_z (p0, 255), ++ z0 = svdup_s32_z (p0, 255)) ++ ++/* ++** dup_256_s32_z: ++** mov z0\.s, p0/z, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_256_s32_z, svint32_t, ++ z0 = svdup_n_s32_z (p0, 256), ++ z0 = svdup_s32_z (p0, 256)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_257_s32_z, svint32_t, ++ z0 = svdup_n_s32_z (p0, 257), ++ z0 = svdup_s32_z (p0, 257)) ++ ++/* ++** dup_512_s32_z: ++** mov z0\.s, p0/z, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_512_s32_z, svint32_t, ++ z0 = svdup_n_s32_z (p0, 512), ++ z0 = svdup_s32_z (p0, 512)) ++ ++/* ++** dup_7f00_s32_z: ++** mov z0\.s, p0/z, #32512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7f00_s32_z, svint32_t, ++ z0 = svdup_n_s32_z (p0, 0x7f00), ++ z0 = svdup_s32_z (p0, 0x7f00)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_7f01_s32_z, svint32_t, ++ z0 = svdup_n_s32_z (p0, 0x7f01), ++ z0 = svdup_s32_z (p0, 0x7f01)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_7ffd_s32_z, svint32_t, ++ z0 = svdup_n_s32_z (p0, 0x7ffd), ++ z0 = svdup_s32_z (p0, 0x7ffd)) ++ ++/* ++** dup_7ffe_s32_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.s), #32766 ++** sel z0\.s, p0, \2, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7ffe_s32_z, svint32_t, ++ z0 = svdup_n_s32_z (p0, 0x7ffe), ++ z0 = svdup_s32_z (p0, 0x7ffe)) ++ ++/* ++** dup_7fff_s32_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.s), #32767 ++** sel z0\.s, p0, \2, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7fff_s32_z, svint32_t, ++ z0 = svdup_n_s32_z (p0, 0x7fff), ++ z0 = svdup_s32_z (p0, 0x7fff)) ++ ++/* ++** dup_m1_s32_z: ++** mov z0\.s, p0/z, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m1_s32_z, svint32_t, ++ z0 = svdup_n_s32_z (p0, -1), ++ z0 = svdup_s32_z (p0, -1)) ++ ++/* ++** dup_m128_s32_z: ++** mov z0\.s, p0/z, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m128_s32_z, svint32_t, ++ z0 = svdup_n_s32_z (p0, -128), ++ z0 = svdup_s32_z (p0, -128)) ++ ++/* ++** dup_m129_s32_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.s), #-129 ++** sel z0\.s, p0, \2, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m129_s32_z, svint32_t, ++ z0 = svdup_n_s32_z (p0, -129), ++ z0 = svdup_s32_z (p0, -129)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m130_s32_z, svint32_t, ++ z0 = svdup_n_s32_z (p0, -130), ++ z0 = svdup_s32_z (p0, -130)) ++ ++/* TODO: Bad code and needs fixing. 
*/ ++TEST_UNIFORM_Z (dup_m254_s32_z, svint32_t, ++ z0 = svdup_n_s32_z (p0, -254), ++ z0 = svdup_s32_z (p0, -254)) ++ ++/* ++** dup_m255_s32_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.s), #-255 ++** sel z0\.s, p0, \2, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m255_s32_z, svint32_t, ++ z0 = svdup_n_s32_z (p0, -255), ++ z0 = svdup_s32_z (p0, -255)) ++ ++/* ++** dup_m256_s32_z: ++** mov z0\.s, p0/z, #-256 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m256_s32_z, svint32_t, ++ z0 = svdup_n_s32_z (p0, -256), ++ z0 = svdup_s32_z (p0, -256)) ++ ++/* ++** dup_m257_s32_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.s), #-257 ++** sel z0\.s, p0, \2, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m257_s32_z, svint32_t, ++ z0 = svdup_n_s32_z (p0, -257), ++ z0 = svdup_s32_z (p0, -257)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m258_s32_z, svint32_t, ++ z0 = svdup_n_s32_z (p0, -258), ++ z0 = svdup_s32_z (p0, -258)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m259_s32_z, svint32_t, ++ z0 = svdup_n_s32_z (p0, -259), ++ z0 = svdup_s32_z (p0, -259)) ++ ++/* ++** dup_m512_s32_z: ++** mov z0\.s, p0/z, #-512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m512_s32_z, svint32_t, ++ z0 = svdup_n_s32_z (p0, -512), ++ z0 = svdup_s32_z (p0, -512)) ++ ++/* ++** dup_m7f00_s32_z: ++** mov z0\.s, p0/z, #-32512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f00_s32_z, svint32_t, ++ z0 = svdup_n_s32_z (p0, -0x7f00), ++ z0 = svdup_s32_z (p0, -0x7f00)) ++ ++/* ++** dup_m7f01_s32_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.s), #-32513 ++** sel z0\.s, p0, \2, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f01_s32_z, svint32_t, ++ z0 = svdup_n_s32_z (p0, -0x7f01), ++ z0 = svdup_s32_z (p0, -0x7f01)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m7f02_s32_z, svint32_t, ++ z0 = svdup_n_s32_z (p0, -0x7f02), ++ z0 = svdup_s32_z (p0, -0x7f02)) ++ ++/* TODO: Bad code and needs fixing. 
*/ ++TEST_UNIFORM_Z (dup_m7ffe_s32_z, svint32_t, ++ z0 = svdup_n_s32_z (p0, -0x7ffe), ++ z0 = svdup_s32_z (p0, -0x7ffe)) ++ ++/* ++** dup_m7fff_s32_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.s), #-32767 ++** sel z0\.s, p0, \2, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7fff_s32_z, svint32_t, ++ z0 = svdup_n_s32_z (p0, -0x7fff), ++ z0 = svdup_s32_z (p0, -0x7fff)) ++ ++/* ++** dup_m8000_s32_z: ++** mov z0\.s, p0/z, #-32768 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m8000_s32_z, svint32_t, ++ z0 = svdup_n_s32_z (p0, -0x8000), ++ z0 = svdup_s32_z (p0, -0x8000)) ++ ++/* ++** dup_0_s32_z: ++** mov z0\.s, p0/z, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_0_s32_z, svint32_t, ++ z0 = svdup_n_s32_z (p0, 0), ++ z0 = svdup_s32_z (p0, 0)) ++ ++/* ++** dup_w0_s32_z: ++** movprfx z0\.s, p0/z, z0\.s ++** mov z0\.s, p0/m, w0 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_w0_s32_z, svint32_t, int32_t, ++ z0 = svdup_n_s32_z (p0, x0), ++ z0 = svdup_s32_z (p0, x0)) ++ ++/* ++** dup_1_s32_x: ++** mov z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_1_s32_x, svint32_t, ++ z0 = svdup_n_s32_x (p0, 1), ++ z0 = svdup_s32_x (p0, 1)) ++ ++/* ++** dup_127_s32_x: ++** mov z0\.s, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_127_s32_x, svint32_t, ++ z0 = svdup_n_s32_x (p0, 127), ++ z0 = svdup_s32_x (p0, 127)) ++ ++/* ++** dup_128_s32_x: ++** mov z0\.s, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_128_s32_x, svint32_t, ++ z0 = svdup_n_s32_x (p0, 128), ++ z0 = svdup_s32_x (p0, 128)) ++ ++/* ++** dup_129_s32_x: ++** movi v([0-9]+)\.4s, 0x81 ++** dup z0\.q, z\1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_129_s32_x, svint32_t, ++ z0 = svdup_n_s32_x (p0, 129), ++ z0 = svdup_s32_x (p0, 129)) ++ ++/* ++** dup_253_s32_x: ++** movi v([0-9]+)\.4s, 0xfd ++** dup z0\.q, z\1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_253_s32_x, svint32_t, ++ z0 = svdup_n_s32_x (p0, 253), ++ z0 = svdup_s32_x (p0, 253)) ++ ++/* ++** dup_254_s32_x: ++** mov z0\.s, #254 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_254_s32_x, svint32_t, ++ z0 = svdup_n_s32_x (p0, 254), ++ z0 = svdup_s32_x (p0, 254)) ++ ++/* ++** dup_255_s32_x: ++** mov z0\.s, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_255_s32_x, svint32_t, ++ z0 = svdup_n_s32_x (p0, 255), ++ z0 = svdup_s32_x (p0, 255)) ++ ++/* ++** dup_256_s32_x: ++** mov z0\.s, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_256_s32_x, svint32_t, ++ z0 = svdup_n_s32_x (p0, 256), ++ z0 = svdup_s32_x (p0, 256)) ++ ++/* ++** dup_257_s32_x: ++** mov (w[0-9]+), 257 ++** mov z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_257_s32_x, svint32_t, ++ z0 = svdup_n_s32_x (p0, 257), ++ z0 = svdup_s32_x (p0, 257)) ++ ++/* ++** dup_512_s32_x: ++** mov z0\.s, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_512_s32_x, svint32_t, ++ z0 = svdup_n_s32_x (p0, 512), ++ z0 = svdup_s32_x (p0, 512)) ++ ++/* ++** dup_7f00_s32_x: ++** mov z0\.s, #32512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7f00_s32_x, svint32_t, ++ z0 = svdup_n_s32_x (p0, 0x7f00), ++ z0 = svdup_s32_x (p0, 0x7f00)) ++ ++/* ++** dup_7f01_s32_x: ++** mov (w[0-9]+), 32513 ++** mov z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7f01_s32_x, svint32_t, ++ z0 = svdup_n_s32_x (p0, 0x7f01), ++ z0 = svdup_s32_x (p0, 0x7f01)) ++ ++/* ++** dup_7ffd_s32_x: ++** mov (w[0-9]+), 32765 ++** mov z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7ffd_s32_x, svint32_t, ++ z0 = svdup_n_s32_x (p0, 0x7ffd), ++ z0 = svdup_s32_x (p0, 0x7ffd)) ++ ++/* ++** dup_7ffe_s32_x: ++** mov z0\.s, #32766 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7ffe_s32_x, svint32_t, ++ z0 = svdup_n_s32_x (p0, 0x7ffe), ++ z0 = svdup_s32_x (p0, 0x7ffe)) ++ ++/* ++** dup_7fff_s32_x: ++** mov z0\.s, 
#32767 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7fff_s32_x, svint32_t, ++ z0 = svdup_n_s32_x (p0, 0x7fff), ++ z0 = svdup_s32_x (p0, 0x7fff)) ++ ++/* ++** dup_m1_s32_x: ++** mov z0\.b, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m1_s32_x, svint32_t, ++ z0 = svdup_n_s32_x (p0, -1), ++ z0 = svdup_s32_x (p0, -1)) ++ ++/* ++** dup_m128_s32_x: ++** mov z0\.s, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m128_s32_x, svint32_t, ++ z0 = svdup_n_s32_x (p0, -128), ++ z0 = svdup_s32_x (p0, -128)) ++ ++/* ++** dup_m129_s32_x: ++** mov z0\.s, #-129 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m129_s32_x, svint32_t, ++ z0 = svdup_n_s32_x (p0, -129), ++ z0 = svdup_s32_x (p0, -129)) ++ ++/* ++** dup_m130_s32_x: ++** mvni v([0-9]+)\.4s, 0x81 ++** dup z0\.q, z\1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m130_s32_x, svint32_t, ++ z0 = svdup_n_s32_x (p0, -130), ++ z0 = svdup_s32_x (p0, -130)) ++ ++/* ++** dup_m254_s32_x: ++** mvni v([0-9]+)\.4s, 0xfd ++** dup z0\.q, z\1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m254_s32_x, svint32_t, ++ z0 = svdup_n_s32_x (p0, -254), ++ z0 = svdup_s32_x (p0, -254)) ++ ++/* ++** dup_m255_s32_x: ++** mov z0\.s, #-255 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m255_s32_x, svint32_t, ++ z0 = svdup_n_s32_x (p0, -255), ++ z0 = svdup_s32_x (p0, -255)) ++ ++/* ++** dup_m256_s32_x: ++** mov z0\.s, #-256 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m256_s32_x, svint32_t, ++ z0 = svdup_n_s32_x (p0, -256), ++ z0 = svdup_s32_x (p0, -256)) ++ ++/* ++** dup_m257_s32_x: ++** mov z0\.s, #-257 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m257_s32_x, svint32_t, ++ z0 = svdup_n_s32_x (p0, -257), ++ z0 = svdup_s32_x (p0, -257)) ++ ++/* ++** dup_m258_s32_x: ++** mov (w[0-9]+), -258 ++** mov z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m258_s32_x, svint32_t, ++ z0 = svdup_n_s32_x (p0, -258), ++ z0 = svdup_s32_x (p0, -258)) ++ ++/* ++** dup_m259_s32_x: ++** mov (w[0-9]+), -259 ++** mov z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m259_s32_x, svint32_t, ++ z0 = svdup_n_s32_x (p0, -259), ++ z0 = svdup_s32_x (p0, -259)) ++ ++/* ++** dup_m512_s32_x: ++** mov z0\.s, #-512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m512_s32_x, svint32_t, ++ z0 = svdup_n_s32_x (p0, -512), ++ z0 = svdup_s32_x (p0, -512)) ++ ++/* ++** dup_m7f00_s32_x: ++** mov z0\.s, #-32512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f00_s32_x, svint32_t, ++ z0 = svdup_n_s32_x (p0, -0x7f00), ++ z0 = svdup_s32_x (p0, -0x7f00)) ++ ++/* ++** dup_m7f01_s32_x: ++** mov z0\.s, #-32513 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f01_s32_x, svint32_t, ++ z0 = svdup_n_s32_x (p0, -0x7f01), ++ z0 = svdup_s32_x (p0, -0x7f01)) ++ ++/* ++** dup_m7f02_s32_x: ++** mov (w[0-9]+), -32514 ++** mov z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f02_s32_x, svint32_t, ++ z0 = svdup_n_s32_x (p0, -0x7f02), ++ z0 = svdup_s32_x (p0, -0x7f02)) ++ ++/* ++** dup_m7ffe_s32_x: ++** mov (w[0-9]+), -32766 ++** mov z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7ffe_s32_x, svint32_t, ++ z0 = svdup_n_s32_x (p0, -0x7ffe), ++ z0 = svdup_s32_x (p0, -0x7ffe)) ++ ++/* ++** dup_m7fff_s32_x: ++** mov z0\.s, #-32767 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7fff_s32_x, svint32_t, ++ z0 = svdup_n_s32_x (p0, -0x7fff), ++ z0 = svdup_s32_x (p0, -0x7fff)) ++ ++/* ++** dup_m8000_s32_x: ++** mov z0\.s, #-32768 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m8000_s32_x, svint32_t, ++ z0 = svdup_n_s32_x (p0, -0x8000), ++ z0 = svdup_s32_x (p0, -0x8000)) ++ ++/* ++** dup_w0_s32_x: ++** mov z0\.s, w0 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_w0_s32_x, svint32_t, int32_t, ++ z0 = svdup_n_s32_x (p0, x0), ++ z0 = svdup_s32_x (p0, x0)) +diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_s64.c +new file mode 100644 +index 000000000..6259b7fb5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_s64.c +@@ -0,0 +1,1175 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dup_1_s64: ++** mov z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_1_s64, svint64_t, ++ z0 = svdup_n_s64 (1), ++ z0 = svdup_s64 (1)) ++ ++/* ++** dup_127_s64: ++** mov z0\.d, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_127_s64, svint64_t, ++ z0 = svdup_n_s64 (127), ++ z0 = svdup_s64 (127)) ++ ++/* ++** dup_128_s64: ++** mov z0\.d, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_128_s64, svint64_t, ++ z0 = svdup_n_s64 (128), ++ z0 = svdup_s64 (128)) ++ ++/* ++** dup_129_s64: ++** mov (x[0-9]+), 129 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_129_s64, svint64_t, ++ z0 = svdup_n_s64 (129), ++ z0 = svdup_s64 (129)) ++ ++/* ++** dup_253_s64: ++** mov (x[0-9]+), 253 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_253_s64, svint64_t, ++ z0 = svdup_n_s64 (253), ++ z0 = svdup_s64 (253)) ++ ++/* ++** dup_254_s64: ++** mov z0\.d, #254 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_254_s64, svint64_t, ++ z0 = svdup_n_s64 (254), ++ z0 = svdup_s64 (254)) ++ ++/* ++** dup_255_s64: ++** mov z0\.d, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_255_s64, svint64_t, ++ z0 = svdup_n_s64 (255), ++ z0 = svdup_s64 (255)) ++ ++/* ++** dup_256_s64: ++** mov z0\.d, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_256_s64, svint64_t, ++ z0 = svdup_n_s64 (256), ++ z0 = svdup_s64 (256)) ++ ++/* ++** dup_257_s64: ++** mov (x[0-9]+), 257 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_257_s64, svint64_t, ++ z0 = svdup_n_s64 (257), ++ z0 = svdup_s64 (257)) ++ ++/* ++** dup_512_s64: ++** mov z0\.d, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_512_s64, svint64_t, ++ z0 = svdup_n_s64 (512), ++ z0 = svdup_s64 (512)) ++ ++/* ++** dup_7f00_s64: ++** mov z0\.d, #32512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7f00_s64, svint64_t, ++ z0 = svdup_n_s64 (0x7f00), ++ z0 = svdup_s64 (0x7f00)) ++ ++/* ++** dup_7f01_s64: ++** mov (x[0-9]+), 32513 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7f01_s64, svint64_t, ++ z0 = svdup_n_s64 (0x7f01), ++ z0 = svdup_s64 (0x7f01)) ++ ++/* ++** dup_7ffd_s64: ++** mov (x[0-9]+), 32765 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7ffd_s64, svint64_t, ++ z0 = svdup_n_s64 (0x7ffd), ++ z0 = svdup_s64 (0x7ffd)) ++ ++/* ++** dup_7ffe_s64: ++** mov z0\.d, #32766 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7ffe_s64, svint64_t, ++ z0 = svdup_n_s64 (0x7ffe), ++ z0 = svdup_s64 (0x7ffe)) ++ ++/* ++** dup_7fff_s64: ++** mov z0\.d, #32767 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7fff_s64, svint64_t, ++ z0 = svdup_n_s64 (0x7fff), ++ z0 = svdup_s64 (0x7fff)) ++ ++/* ++** dup_m1_s64: ++** mov z0\.b, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m1_s64, svint64_t, ++ z0 = svdup_n_s64 (-1), ++ z0 = svdup_s64 (-1)) ++ ++/* ++** dup_m128_s64: ++** mov z0\.d, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m128_s64, svint64_t, ++ z0 = svdup_n_s64 (-128), ++ z0 = svdup_s64 (-128)) ++ ++/* ++** dup_m129_s64: ++** mov z0\.d, #-129 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m129_s64, svint64_t, ++ z0 = svdup_n_s64 (-129), ++ z0 = svdup_s64 (-129)) ++ ++/* ++** dup_m130_s64: ++** mov (x[0-9]+), -130 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m130_s64, svint64_t, ++ z0 = svdup_n_s64 (-130), ++ z0 = svdup_s64 (-130)) ++ ++/* ++** dup_m254_s64: ++** mov (x[0-9]+), -254 ++** mov 
z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m254_s64, svint64_t, ++ z0 = svdup_n_s64 (-254), ++ z0 = svdup_s64 (-254)) ++ ++/* ++** dup_m255_s64: ++** mov z0\.d, #-255 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m255_s64, svint64_t, ++ z0 = svdup_n_s64 (-255), ++ z0 = svdup_s64 (-255)) ++ ++/* ++** dup_m256_s64: ++** mov z0\.d, #-256 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m256_s64, svint64_t, ++ z0 = svdup_n_s64 (-256), ++ z0 = svdup_s64 (-256)) ++ ++/* ++** dup_m257_s64: ++** mov z0\.d, #-257 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m257_s64, svint64_t, ++ z0 = svdup_n_s64 (-257), ++ z0 = svdup_s64 (-257)) ++ ++/* ++** dup_m258_s64: ++** mov (x[0-9]+), -258 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m258_s64, svint64_t, ++ z0 = svdup_n_s64 (-258), ++ z0 = svdup_s64 (-258)) ++ ++/* ++** dup_m259_s64: ++** mov (x[0-9]+), -259 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m259_s64, svint64_t, ++ z0 = svdup_n_s64 (-259), ++ z0 = svdup_s64 (-259)) ++ ++/* ++** dup_m512_s64: ++** mov z0\.d, #-512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m512_s64, svint64_t, ++ z0 = svdup_n_s64 (-512), ++ z0 = svdup_s64 (-512)) ++ ++/* ++** dup_m7f00_s64: ++** mov z0\.d, #-32512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f00_s64, svint64_t, ++ z0 = svdup_n_s64 (-0x7f00), ++ z0 = svdup_s64 (-0x7f00)) ++ ++/* ++** dup_m7f01_s64: ++** mov z0\.d, #-32513 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f01_s64, svint64_t, ++ z0 = svdup_n_s64 (-0x7f01), ++ z0 = svdup_s64 (-0x7f01)) ++ ++/* ++** dup_m7f02_s64: ++** mov (x[0-9]+), -32514 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f02_s64, svint64_t, ++ z0 = svdup_n_s64 (-0x7f02), ++ z0 = svdup_s64 (-0x7f02)) ++ ++/* ++** dup_m7ffe_s64: ++** mov (x[0-9]+), -32766 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7ffe_s64, svint64_t, ++ z0 = svdup_n_s64 (-0x7ffe), ++ z0 = svdup_s64 (-0x7ffe)) ++ ++/* ++** dup_m7fff_s64: ++** mov z0\.d, #-32767 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7fff_s64, svint64_t, ++ z0 = svdup_n_s64 (-0x7fff), ++ z0 = svdup_s64 (-0x7fff)) ++ ++/* ++** dup_m8000_s64: ++** mov z0\.d, #-32768 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m8000_s64, svint64_t, ++ z0 = svdup_n_s64 (-0x8000), ++ z0 = svdup_s64 (-0x8000)) ++ ++/* ++** dup_x0_s64: ++** mov z0\.d, x0 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_x0_s64, svint64_t, int64_t, ++ z0 = svdup_n_s64 (x0), ++ z0 = svdup_s64 (x0)) ++ ++/* ++** dup_1_s64_m: ++** mov z0\.d, p0/m, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_1_s64_m, svint64_t, ++ z0 = svdup_n_s64_m (z0, p0, 1), ++ z0 = svdup_s64_m (z0, p0, 1)) ++ ++/* ++** dup_127_s64_m: ++** mov z0\.d, p0/m, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_127_s64_m, svint64_t, ++ z0 = svdup_n_s64_m (z0, p0, 127), ++ z0 = svdup_s64_m (z0, p0, 127)) ++ ++/* ++** dup_128_s64_m: ++** mov (z[0-9]+\.d), #128 ++** sel z0\.d, p0, \1, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (dup_128_s64_m, svint64_t, ++ z0 = svdup_n_s64_m (z0, p0, 128), ++ z0 = svdup_s64_m (z0, p0, 128)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_129_s64_m, svint64_t, ++ z0 = svdup_n_s64_m (z0, p0, 129), ++ z0 = svdup_s64_m (z0, p0, 129)) ++ ++/* TODO: Bad code and needs fixing. 
*/ ++TEST_UNIFORM_Z (dup_253_s64_m, svint64_t, ++ z0 = svdup_n_s64_m (z0, p0, 253), ++ z0 = svdup_s64_m (z0, p0, 253)) ++ ++/* ++** dup_254_s64_m: ++** mov (z[0-9]+\.d), #254 ++** sel z0\.d, p0, \1, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (dup_254_s64_m, svint64_t, ++ z0 = svdup_n_s64_m (z0, p0, 254), ++ z0 = svdup_s64_m (z0, p0, 254)) ++ ++/* ++** dup_255_s64_m: ++** mov (z[0-9]+\.d), #255 ++** sel z0\.d, p0, \1, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (dup_255_s64_m, svint64_t, ++ z0 = svdup_n_s64_m (z0, p0, 255), ++ z0 = svdup_s64_m (z0, p0, 255)) ++ ++/* ++** dup_256_s64_m: ++** mov z0\.d, p0/m, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_256_s64_m, svint64_t, ++ z0 = svdup_n_s64_m (z0, p0, 256), ++ z0 = svdup_s64_m (z0, p0, 256)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_257_s64_m, svint64_t, ++ z0 = svdup_n_s64_m (z0, p0, 257), ++ z0 = svdup_s64_m (z0, p0, 257)) ++ ++/* ++** dup_512_s64_m: ++** mov z0\.d, p0/m, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_512_s64_m, svint64_t, ++ z0 = svdup_n_s64_m (z0, p0, 512), ++ z0 = svdup_s64_m (z0, p0, 512)) ++ ++/* ++** dup_7f00_s64_m: ++** mov z0\.d, p0/m, #32512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7f00_s64_m, svint64_t, ++ z0 = svdup_n_s64_m (z0, p0, 0x7f00), ++ z0 = svdup_s64_m (z0, p0, 0x7f00)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_7f01_s64_m, svint64_t, ++ z0 = svdup_n_s64_m (z0, p0, 0x7f01), ++ z0 = svdup_s64_m (z0, p0, 0x7f01)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_7ffd_s64_m, svint64_t, ++ z0 = svdup_n_s64_m (z0, p0, 0x7ffd), ++ z0 = svdup_s64_m (z0, p0, 0x7ffd)) ++ ++/* ++** dup_7ffe_s64_m: ++** mov (z[0-9]+\.d), #32766 ++** sel z0\.d, p0, \1, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7ffe_s64_m, svint64_t, ++ z0 = svdup_n_s64_m (z0, p0, 0x7ffe), ++ z0 = svdup_s64_m (z0, p0, 0x7ffe)) ++ ++/* ++** dup_7fff_s64_m: ++** mov (z[0-9]+\.d), #32767 ++** sel z0\.d, p0, \1, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7fff_s64_m, svint64_t, ++ z0 = svdup_n_s64_m (z0, p0, 0x7fff), ++ z0 = svdup_s64_m (z0, p0, 0x7fff)) ++ ++/* ++** dup_m1_s64_m: ++** mov z0\.d, p0/m, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m1_s64_m, svint64_t, ++ z0 = svdup_n_s64_m (z0, p0, -1), ++ z0 = svdup_s64_m (z0, p0, -1)) ++ ++/* ++** dup_m128_s64_m: ++** mov z0\.d, p0/m, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m128_s64_m, svint64_t, ++ z0 = svdup_n_s64_m (z0, p0, -128), ++ z0 = svdup_s64_m (z0, p0, -128)) ++ ++/* ++** dup_m129_s64_m: ++** mov (z[0-9]+\.d), #-129 ++** sel z0\.d, p0, \1, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m129_s64_m, svint64_t, ++ z0 = svdup_n_s64_m (z0, p0, -129), ++ z0 = svdup_s64_m (z0, p0, -129)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m130_s64_m, svint64_t, ++ z0 = svdup_n_s64_m (z0, p0, -130), ++ z0 = svdup_s64_m (z0, p0, -130)) ++ ++/* TODO: Bad code and needs fixing. 
*/ ++TEST_UNIFORM_Z (dup_m254_s64_m, svint64_t, ++ z0 = svdup_n_s64_m (z0, p0, -254), ++ z0 = svdup_s64_m (z0, p0, -254)) ++ ++/* ++** dup_m255_s64_m: ++** mov (z[0-9]+\.d), #-255 ++** sel z0\.d, p0, \1, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m255_s64_m, svint64_t, ++ z0 = svdup_n_s64_m (z0, p0, -255), ++ z0 = svdup_s64_m (z0, p0, -255)) ++ ++/* ++** dup_m256_s64_m: ++** mov z0\.d, p0/m, #-256 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m256_s64_m, svint64_t, ++ z0 = svdup_n_s64_m (z0, p0, -256), ++ z0 = svdup_s64_m (z0, p0, -256)) ++ ++/* ++** dup_m257_s64_m: ++** mov (z[0-9]+\.d), #-257 ++** sel z0\.d, p0, \1, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m257_s64_m, svint64_t, ++ z0 = svdup_n_s64_m (z0, p0, -257), ++ z0 = svdup_s64_m (z0, p0, -257)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m258_s64_m, svint64_t, ++ z0 = svdup_n_s64_m (z0, p0, -258), ++ z0 = svdup_s64_m (z0, p0, -258)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m259_s64_m, svint64_t, ++ z0 = svdup_n_s64_m (z0, p0, -259), ++ z0 = svdup_s64_m (z0, p0, -259)) ++ ++/* ++** dup_m512_s64_m: ++** mov z0\.d, p0/m, #-512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m512_s64_m, svint64_t, ++ z0 = svdup_n_s64_m (z0, p0, -512), ++ z0 = svdup_s64_m (z0, p0, -512)) ++ ++/* ++** dup_m7f00_s64_m: ++** mov z0\.d, p0/m, #-32512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f00_s64_m, svint64_t, ++ z0 = svdup_n_s64_m (z0, p0, -0x7f00), ++ z0 = svdup_s64_m (z0, p0, -0x7f00)) ++ ++/* ++** dup_m7f01_s64_m: ++** mov (z[0-9]+\.d), #-32513 ++** sel z0\.d, p0, \1, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f01_s64_m, svint64_t, ++ z0 = svdup_n_s64_m (z0, p0, -0x7f01), ++ z0 = svdup_s64_m (z0, p0, -0x7f01)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m7f02_s64_m, svint64_t, ++ z0 = svdup_n_s64_m (z0, p0, -0x7f02), ++ z0 = svdup_s64_m (z0, p0, -0x7f02)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m7ffe_s64_m, svint64_t, ++ z0 = svdup_n_s64_m (z0, p0, -0x7ffe), ++ z0 = svdup_s64_m (z0, p0, -0x7ffe)) ++ ++/* ++** dup_m7fff_s64_m: ++** mov (z[0-9]+\.d), #-32767 ++** sel z0\.d, p0, \1, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7fff_s64_m, svint64_t, ++ z0 = svdup_n_s64_m (z0, p0, -0x7fff), ++ z0 = svdup_s64_m (z0, p0, -0x7fff)) ++ ++/* ++** dup_m8000_s64_m: ++** mov z0\.d, p0/m, #-32768 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m8000_s64_m, svint64_t, ++ z0 = svdup_n_s64_m (z0, p0, -0x8000), ++ z0 = svdup_s64_m (z0, p0, -0x8000)) ++ ++/* ++** dup_0_s64_m: ++** mov z0\.d, p0/m, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_0_s64_m, svint64_t, ++ z0 = svdup_n_s64_m (z0, p0, 0), ++ z0 = svdup_s64_m (z0, p0, 0)) ++ ++/* ++** dup_x0_s64_m: ++** movprfx z0, z1 ++** mov z0\.d, p0/m, x0 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_x0_s64_m, svint64_t, int64_t, ++ z0 = svdup_n_s64_m (z1, p0, x0), ++ z0 = svdup_s64_m (z1, p0, x0)) ++ ++/* ++** dup_1_s64_z: ++** mov z0\.d, p0/z, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_1_s64_z, svint64_t, ++ z0 = svdup_n_s64_z (p0, 1), ++ z0 = svdup_s64_z (p0, 1)) ++ ++/* ++** dup_127_s64_z: ++** mov z0\.d, p0/z, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_127_s64_z, svint64_t, ++ z0 = svdup_n_s64_z (p0, 127), ++ z0 = svdup_s64_z (p0, 127)) ++ ++/* ++** dup_128_s64_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.d), #128 ++** sel z0\.d, p0, \2, \1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (dup_128_s64_z, svint64_t, ++ z0 = svdup_n_s64_z (p0, 128), ++ z0 = svdup_s64_z (p0, 128)) ++ ++/* TODO: Bad code and needs fixing. 
*/ ++TEST_UNIFORM_Z (dup_129_s64_z, svint64_t, ++ z0 = svdup_n_s64_z (p0, 129), ++ z0 = svdup_s64_z (p0, 129)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_253_s64_z, svint64_t, ++ z0 = svdup_n_s64_z (p0, 253), ++ z0 = svdup_s64_z (p0, 253)) ++ ++/* ++** dup_254_s64_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.d), #254 ++** sel z0\.d, p0, \2, \1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (dup_254_s64_z, svint64_t, ++ z0 = svdup_n_s64_z (p0, 254), ++ z0 = svdup_s64_z (p0, 254)) ++ ++/* ++** dup_255_s64_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.d), #255 ++** sel z0\.d, p0, \2, \1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (dup_255_s64_z, svint64_t, ++ z0 = svdup_n_s64_z (p0, 255), ++ z0 = svdup_s64_z (p0, 255)) ++ ++/* ++** dup_256_s64_z: ++** mov z0\.d, p0/z, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_256_s64_z, svint64_t, ++ z0 = svdup_n_s64_z (p0, 256), ++ z0 = svdup_s64_z (p0, 256)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_257_s64_z, svint64_t, ++ z0 = svdup_n_s64_z (p0, 257), ++ z0 = svdup_s64_z (p0, 257)) ++ ++/* ++** dup_512_s64_z: ++** mov z0\.d, p0/z, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_512_s64_z, svint64_t, ++ z0 = svdup_n_s64_z (p0, 512), ++ z0 = svdup_s64_z (p0, 512)) ++ ++/* ++** dup_7f00_s64_z: ++** mov z0\.d, p0/z, #32512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7f00_s64_z, svint64_t, ++ z0 = svdup_n_s64_z (p0, 0x7f00), ++ z0 = svdup_s64_z (p0, 0x7f00)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_7f01_s64_z, svint64_t, ++ z0 = svdup_n_s64_z (p0, 0x7f01), ++ z0 = svdup_s64_z (p0, 0x7f01)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_7ffd_s64_z, svint64_t, ++ z0 = svdup_n_s64_z (p0, 0x7ffd), ++ z0 = svdup_s64_z (p0, 0x7ffd)) ++ ++/* ++** dup_7ffe_s64_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.d), #32766 ++** sel z0\.d, p0, \2, \1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7ffe_s64_z, svint64_t, ++ z0 = svdup_n_s64_z (p0, 0x7ffe), ++ z0 = svdup_s64_z (p0, 0x7ffe)) ++ ++/* ++** dup_7fff_s64_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.d), #32767 ++** sel z0\.d, p0, \2, \1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7fff_s64_z, svint64_t, ++ z0 = svdup_n_s64_z (p0, 0x7fff), ++ z0 = svdup_s64_z (p0, 0x7fff)) ++ ++/* ++** dup_m1_s64_z: ++** mov z0\.d, p0/z, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m1_s64_z, svint64_t, ++ z0 = svdup_n_s64_z (p0, -1), ++ z0 = svdup_s64_z (p0, -1)) ++ ++/* ++** dup_m128_s64_z: ++** mov z0\.d, p0/z, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m128_s64_z, svint64_t, ++ z0 = svdup_n_s64_z (p0, -128), ++ z0 = svdup_s64_z (p0, -128)) ++ ++/* ++** dup_m129_s64_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.d), #-129 ++** sel z0\.d, p0, \2, \1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m129_s64_z, svint64_t, ++ z0 = svdup_n_s64_z (p0, -129), ++ z0 = svdup_s64_z (p0, -129)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m130_s64_z, svint64_t, ++ z0 = svdup_n_s64_z (p0, -130), ++ z0 = svdup_s64_z (p0, -130)) ++ ++/* TODO: Bad code and needs fixing. 
*/ ++TEST_UNIFORM_Z (dup_m254_s64_z, svint64_t, ++ z0 = svdup_n_s64_z (p0, -254), ++ z0 = svdup_s64_z (p0, -254)) ++ ++/* ++** dup_m255_s64_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.d), #-255 ++** sel z0\.d, p0, \2, \1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m255_s64_z, svint64_t, ++ z0 = svdup_n_s64_z (p0, -255), ++ z0 = svdup_s64_z (p0, -255)) ++ ++/* ++** dup_m256_s64_z: ++** mov z0\.d, p0/z, #-256 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m256_s64_z, svint64_t, ++ z0 = svdup_n_s64_z (p0, -256), ++ z0 = svdup_s64_z (p0, -256)) ++ ++/* ++** dup_m257_s64_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.d), #-257 ++** sel z0\.d, p0, \2, \1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m257_s64_z, svint64_t, ++ z0 = svdup_n_s64_z (p0, -257), ++ z0 = svdup_s64_z (p0, -257)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m258_s64_z, svint64_t, ++ z0 = svdup_n_s64_z (p0, -258), ++ z0 = svdup_s64_z (p0, -258)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m259_s64_z, svint64_t, ++ z0 = svdup_n_s64_z (p0, -259), ++ z0 = svdup_s64_z (p0, -259)) ++ ++/* ++** dup_m512_s64_z: ++** mov z0\.d, p0/z, #-512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m512_s64_z, svint64_t, ++ z0 = svdup_n_s64_z (p0, -512), ++ z0 = svdup_s64_z (p0, -512)) ++ ++/* ++** dup_m7f00_s64_z: ++** mov z0\.d, p0/z, #-32512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f00_s64_z, svint64_t, ++ z0 = svdup_n_s64_z (p0, -0x7f00), ++ z0 = svdup_s64_z (p0, -0x7f00)) ++ ++/* ++** dup_m7f01_s64_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.d), #-32513 ++** sel z0\.d, p0, \2, \1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f01_s64_z, svint64_t, ++ z0 = svdup_n_s64_z (p0, -0x7f01), ++ z0 = svdup_s64_z (p0, -0x7f01)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m7f02_s64_z, svint64_t, ++ z0 = svdup_n_s64_z (p0, -0x7f02), ++ z0 = svdup_s64_z (p0, -0x7f02)) ++ ++/* TODO: Bad code and needs fixing. 
*/ ++TEST_UNIFORM_Z (dup_m7ffe_s64_z, svint64_t, ++ z0 = svdup_n_s64_z (p0, -0x7ffe), ++ z0 = svdup_s64_z (p0, -0x7ffe)) ++ ++/* ++** dup_m7fff_s64_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.d), #-32767 ++** sel z0\.d, p0, \2, \1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7fff_s64_z, svint64_t, ++ z0 = svdup_n_s64_z (p0, -0x7fff), ++ z0 = svdup_s64_z (p0, -0x7fff)) ++ ++/* ++** dup_m8000_s64_z: ++** mov z0\.d, p0/z, #-32768 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m8000_s64_z, svint64_t, ++ z0 = svdup_n_s64_z (p0, -0x8000), ++ z0 = svdup_s64_z (p0, -0x8000)) ++ ++/* ++** dup_0_s64_z: ++** mov z0\.d, p0/z, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_0_s64_z, svint64_t, ++ z0 = svdup_n_s64_z (p0, 0), ++ z0 = svdup_s64_z (p0, 0)) ++ ++/* ++** dup_x0_s64_z: ++** movprfx z0\.d, p0/z, z0\.d ++** mov z0\.d, p0/m, x0 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_x0_s64_z, svint64_t, int64_t, ++ z0 = svdup_n_s64_z (p0, x0), ++ z0 = svdup_s64_z (p0, x0)) ++ ++/* ++** dup_1_s64_x: ++** mov z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_1_s64_x, svint64_t, ++ z0 = svdup_n_s64_x (p0, 1), ++ z0 = svdup_s64_x (p0, 1)) ++ ++/* ++** dup_127_s64_x: ++** mov z0\.d, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_127_s64_x, svint64_t, ++ z0 = svdup_n_s64_x (p0, 127), ++ z0 = svdup_s64_x (p0, 127)) ++ ++/* ++** dup_128_s64_x: ++** mov z0\.d, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_128_s64_x, svint64_t, ++ z0 = svdup_n_s64_x (p0, 128), ++ z0 = svdup_s64_x (p0, 128)) ++ ++/* ++** dup_129_s64_x: ++** mov (x[0-9]+), 129 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_129_s64_x, svint64_t, ++ z0 = svdup_n_s64_x (p0, 129), ++ z0 = svdup_s64_x (p0, 129)) ++ ++/* ++** dup_253_s64_x: ++** mov (x[0-9]+), 253 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_253_s64_x, svint64_t, ++ z0 = svdup_n_s64_x (p0, 253), ++ z0 = svdup_s64_x (p0, 253)) ++ ++/* ++** dup_254_s64_x: ++** mov z0\.d, #254 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_254_s64_x, svint64_t, ++ z0 = svdup_n_s64_x (p0, 254), ++ z0 = svdup_s64_x (p0, 254)) ++ ++/* ++** dup_255_s64_x: ++** mov z0\.d, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_255_s64_x, svint64_t, ++ z0 = svdup_n_s64_x (p0, 255), ++ z0 = svdup_s64_x (p0, 255)) ++ ++/* ++** dup_256_s64_x: ++** mov z0\.d, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_256_s64_x, svint64_t, ++ z0 = svdup_n_s64_x (p0, 256), ++ z0 = svdup_s64_x (p0, 256)) ++ ++/* ++** dup_257_s64_x: ++** mov (x[0-9]+), 257 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_257_s64_x, svint64_t, ++ z0 = svdup_n_s64_x (p0, 257), ++ z0 = svdup_s64_x (p0, 257)) ++ ++/* ++** dup_512_s64_x: ++** mov z0\.d, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_512_s64_x, svint64_t, ++ z0 = svdup_n_s64_x (p0, 512), ++ z0 = svdup_s64_x (p0, 512)) ++ ++/* ++** dup_7f00_s64_x: ++** mov z0\.d, #32512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7f00_s64_x, svint64_t, ++ z0 = svdup_n_s64_x (p0, 0x7f00), ++ z0 = svdup_s64_x (p0, 0x7f00)) ++ ++/* ++** dup_7f01_s64_x: ++** mov (x[0-9]+), 32513 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7f01_s64_x, svint64_t, ++ z0 = svdup_n_s64_x (p0, 0x7f01), ++ z0 = svdup_s64_x (p0, 0x7f01)) ++ ++/* ++** dup_7ffd_s64_x: ++** mov (x[0-9]+), 32765 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7ffd_s64_x, svint64_t, ++ z0 = svdup_n_s64_x (p0, 0x7ffd), ++ z0 = svdup_s64_x (p0, 0x7ffd)) ++ ++/* ++** dup_7ffe_s64_x: ++** mov z0\.d, #32766 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7ffe_s64_x, svint64_t, ++ z0 = svdup_n_s64_x (p0, 0x7ffe), ++ z0 = svdup_s64_x (p0, 0x7ffe)) ++ ++/* ++** dup_7fff_s64_x: ++** mov z0\.d, #32767 ++** ret ++*/ 
++TEST_UNIFORM_Z (dup_7fff_s64_x, svint64_t, ++ z0 = svdup_n_s64_x (p0, 0x7fff), ++ z0 = svdup_s64_x (p0, 0x7fff)) ++ ++/* ++** dup_m1_s64_x: ++** mov z0\.b, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m1_s64_x, svint64_t, ++ z0 = svdup_n_s64_x (p0, -1), ++ z0 = svdup_s64_x (p0, -1)) ++ ++/* ++** dup_m128_s64_x: ++** mov z0\.d, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m128_s64_x, svint64_t, ++ z0 = svdup_n_s64_x (p0, -128), ++ z0 = svdup_s64_x (p0, -128)) ++ ++/* ++** dup_m129_s64_x: ++** mov z0\.d, #-129 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m129_s64_x, svint64_t, ++ z0 = svdup_n_s64_x (p0, -129), ++ z0 = svdup_s64_x (p0, -129)) ++ ++/* ++** dup_m130_s64_x: ++** mov (x[0-9]+), -130 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m130_s64_x, svint64_t, ++ z0 = svdup_n_s64_x (p0, -130), ++ z0 = svdup_s64_x (p0, -130)) ++ ++/* ++** dup_m254_s64_x: ++** mov (x[0-9]+), -254 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m254_s64_x, svint64_t, ++ z0 = svdup_n_s64_x (p0, -254), ++ z0 = svdup_s64_x (p0, -254)) ++ ++/* ++** dup_m255_s64_x: ++** mov z0\.d, #-255 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m255_s64_x, svint64_t, ++ z0 = svdup_n_s64_x (p0, -255), ++ z0 = svdup_s64_x (p0, -255)) ++ ++/* ++** dup_m256_s64_x: ++** mov z0\.d, #-256 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m256_s64_x, svint64_t, ++ z0 = svdup_n_s64_x (p0, -256), ++ z0 = svdup_s64_x (p0, -256)) ++ ++/* ++** dup_m257_s64_x: ++** mov z0\.d, #-257 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m257_s64_x, svint64_t, ++ z0 = svdup_n_s64_x (p0, -257), ++ z0 = svdup_s64_x (p0, -257)) ++ ++/* ++** dup_m258_s64_x: ++** mov (x[0-9]+), -258 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m258_s64_x, svint64_t, ++ z0 = svdup_n_s64_x (p0, -258), ++ z0 = svdup_s64_x (p0, -258)) ++ ++/* ++** dup_m259_s64_x: ++** mov (x[0-9]+), -259 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m259_s64_x, svint64_t, ++ z0 = svdup_n_s64_x (p0, -259), ++ z0 = svdup_s64_x (p0, -259)) ++ ++/* ++** dup_m512_s64_x: ++** mov z0\.d, #-512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m512_s64_x, svint64_t, ++ z0 = svdup_n_s64_x (p0, -512), ++ z0 = svdup_s64_x (p0, -512)) ++ ++/* ++** dup_m7f00_s64_x: ++** mov z0\.d, #-32512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f00_s64_x, svint64_t, ++ z0 = svdup_n_s64_x (p0, -0x7f00), ++ z0 = svdup_s64_x (p0, -0x7f00)) ++ ++/* ++** dup_m7f01_s64_x: ++** mov z0\.d, #-32513 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f01_s64_x, svint64_t, ++ z0 = svdup_n_s64_x (p0, -0x7f01), ++ z0 = svdup_s64_x (p0, -0x7f01)) ++ ++/* ++** dup_m7f02_s64_x: ++** mov (x[0-9]+), -32514 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f02_s64_x, svint64_t, ++ z0 = svdup_n_s64_x (p0, -0x7f02), ++ z0 = svdup_s64_x (p0, -0x7f02)) ++ ++/* ++** dup_m7ffe_s64_x: ++** mov (x[0-9]+), -32766 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7ffe_s64_x, svint64_t, ++ z0 = svdup_n_s64_x (p0, -0x7ffe), ++ z0 = svdup_s64_x (p0, -0x7ffe)) ++ ++/* ++** dup_m7fff_s64_x: ++** mov z0\.d, #-32767 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7fff_s64_x, svint64_t, ++ z0 = svdup_n_s64_x (p0, -0x7fff), ++ z0 = svdup_s64_x (p0, -0x7fff)) ++ ++/* ++** dup_m8000_s64_x: ++** mov z0\.d, #-32768 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m8000_s64_x, svint64_t, ++ z0 = svdup_n_s64_x (p0, -0x8000), ++ z0 = svdup_s64_x (p0, -0x8000)) ++ ++/* ++** dup_x0_s64_x: ++** mov z0\.d, x0 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_x0_s64_x, svint64_t, int64_t, ++ z0 = svdup_n_s64_x (p0, x0), ++ z0 = svdup_s64_x (p0, x0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_s8.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_s8.c +new file mode 100644 +index 000000000..96fc5fa64 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_s8.c +@@ -0,0 +1,383 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dup_1_s8: ++** mov z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_1_s8, svint8_t, ++ z0 = svdup_n_s8 (1), ++ z0 = svdup_s8 (1)) ++ ++/* ++** dup_127_s8: ++** mov z0\.b, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_127_s8, svint8_t, ++ z0 = svdup_n_s8 (127), ++ z0 = svdup_s8 (127)) ++ ++/* ++** dup_128_s8: ++** mov z0\.b, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_128_s8, svint8_t, ++ z0 = svdup_n_s8 (128), ++ z0 = svdup_s8 (128)) ++ ++/* ++** dup_129_s8: ++** mov z0\.b, #-127 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_129_s8, svint8_t, ++ z0 = svdup_n_s8 (129), ++ z0 = svdup_s8 (129)) ++ ++/* ++** dup_253_s8: ++** mov z0\.b, #-3 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_253_s8, svint8_t, ++ z0 = svdup_n_s8 (253), ++ z0 = svdup_s8 (253)) ++ ++/* ++** dup_254_s8: ++** mov z0\.b, #-2 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_254_s8, svint8_t, ++ z0 = svdup_n_s8 (254), ++ z0 = svdup_s8 (254)) ++ ++/* ++** dup_255_s8: ++** mov z0\.b, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_255_s8, svint8_t, ++ z0 = svdup_n_s8 (255), ++ z0 = svdup_s8 (255)) ++ ++/* ++** dup_m1_s8: ++** mov z0\.b, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m1_s8, svint8_t, ++ z0 = svdup_n_s8 (-1), ++ z0 = svdup_s8 (-1)) ++ ++/* ++** dup_m128_s8: ++** mov z0\.b, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m128_s8, svint8_t, ++ z0 = svdup_n_s8 (-128), ++ z0 = svdup_s8 (-128)) ++ ++/* ++** dup_w0_s8: ++** mov z0\.b, w0 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_w0_s8, svint8_t, int8_t, ++ z0 = svdup_n_s8 (x0), ++ z0 = svdup_s8 (x0)) ++ ++/* ++** dup_1_s8_m: ++** mov z0\.b, p0/m, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_1_s8_m, svint8_t, ++ z0 = svdup_n_s8_m (z0, p0, 1), ++ z0 = svdup_s8_m (z0, p0, 1)) ++ ++/* ++** dup_127_s8_m: ++** mov z0\.b, p0/m, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_127_s8_m, svint8_t, ++ z0 = svdup_n_s8_m (z0, p0, 127), ++ z0 = svdup_s8_m (z0, p0, 127)) ++ ++/* ++** dup_128_s8_m: ++** mov z0\.b, p0/m, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_128_s8_m, svint8_t, ++ z0 = svdup_n_s8_m (z0, p0, 128), ++ z0 = svdup_s8_m (z0, p0, 128)) ++ ++/* ++** dup_129_s8_m: ++** mov z0\.b, p0/m, #-127 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_129_s8_m, svint8_t, ++ z0 = svdup_n_s8_m (z0, p0, 129), ++ z0 = svdup_s8_m (z0, p0, 129)) ++ ++/* ++** dup_253_s8_m: ++** mov z0\.b, p0/m, #-3 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_253_s8_m, svint8_t, ++ z0 = svdup_n_s8_m (z0, p0, 253), ++ z0 = svdup_s8_m (z0, p0, 253)) ++ ++/* ++** dup_254_s8_m: ++** mov z0\.b, p0/m, #-2 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_254_s8_m, svint8_t, ++ z0 = svdup_n_s8_m (z0, p0, 254), ++ z0 = svdup_s8_m (z0, p0, 254)) ++ ++/* ++** dup_255_s8_m: ++** mov z0\.b, p0/m, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_255_s8_m, svint8_t, ++ z0 = svdup_n_s8_m (z0, p0, 255), ++ z0 = svdup_s8_m (z0, p0, 255)) ++ ++/* ++** dup_m1_s8_m: ++** mov z0\.b, p0/m, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m1_s8_m, svint8_t, ++ z0 = svdup_n_s8_m (z0, p0, -1), ++ z0 = svdup_s8_m (z0, p0, -1)) ++ ++/* ++** dup_m128_s8_m: ++** mov z0\.b, p0/m, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m128_s8_m, svint8_t, ++ z0 = svdup_n_s8_m (z0, p0, -128), ++ z0 = svdup_s8_m (z0, p0, -128)) ++ ++/* ++** dup_0_s8_m: ++** mov z0\.b, p0/m, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_0_s8_m, svint8_t, ++ z0 = svdup_n_s8_m (z0, p0, 0), ++ z0 = 
svdup_s8_m (z0, p0, 0)) ++ ++/* ++** dup_w0_s8_m: ++** movprfx z0, z1 ++** mov z0\.b, p0/m, w0 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_w0_s8_m, svint8_t, int8_t, ++ z0 = svdup_n_s8_m (z1, p0, x0), ++ z0 = svdup_s8_m (z1, p0, x0)) ++ ++/* ++** dup_1_s8_z: ++** mov z0\.b, p0/z, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_1_s8_z, svint8_t, ++ z0 = svdup_n_s8_z (p0, 1), ++ z0 = svdup_s8_z (p0, 1)) ++ ++/* ++** dup_127_s8_z: ++** mov z0\.b, p0/z, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_127_s8_z, svint8_t, ++ z0 = svdup_n_s8_z (p0, 127), ++ z0 = svdup_s8_z (p0, 127)) ++ ++/* ++** dup_128_s8_z: ++** mov z0\.b, p0/z, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_128_s8_z, svint8_t, ++ z0 = svdup_n_s8_z (p0, 128), ++ z0 = svdup_s8_z (p0, 128)) ++ ++/* ++** dup_129_s8_z: ++** mov z0\.b, p0/z, #-127 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_129_s8_z, svint8_t, ++ z0 = svdup_n_s8_z (p0, 129), ++ z0 = svdup_s8_z (p0, 129)) ++ ++/* ++** dup_253_s8_z: ++** mov z0\.b, p0/z, #-3 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_253_s8_z, svint8_t, ++ z0 = svdup_n_s8_z (p0, 253), ++ z0 = svdup_s8_z (p0, 253)) ++ ++/* ++** dup_254_s8_z: ++** mov z0\.b, p0/z, #-2 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_254_s8_z, svint8_t, ++ z0 = svdup_n_s8_z (p0, 254), ++ z0 = svdup_s8_z (p0, 254)) ++ ++/* ++** dup_255_s8_z: ++** mov z0\.b, p0/z, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_255_s8_z, svint8_t, ++ z0 = svdup_n_s8_z (p0, 255), ++ z0 = svdup_s8_z (p0, 255)) ++ ++/* ++** dup_m1_s8_z: ++** mov z0\.b, p0/z, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m1_s8_z, svint8_t, ++ z0 = svdup_n_s8_z (p0, -1), ++ z0 = svdup_s8_z (p0, -1)) ++ ++/* ++** dup_m128_s8_z: ++** mov z0\.b, p0/z, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m128_s8_z, svint8_t, ++ z0 = svdup_n_s8_z (p0, -128), ++ z0 = svdup_s8_z (p0, -128)) ++ ++/* ++** dup_0_s8_z: ++** mov z0\.b, p0/z, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_0_s8_z, svint8_t, ++ z0 = svdup_n_s8_z (p0, 0), ++ z0 = svdup_s8_z (p0, 0)) ++ ++/* ++** dup_w0_s8_z: ++** movprfx z0\.b, p0/z, z0\.b ++** mov z0\.b, p0/m, w0 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_w0_s8_z, svint8_t, int8_t, ++ z0 = svdup_n_s8_z (p0, x0), ++ z0 = svdup_s8_z (p0, x0)) ++ ++/* ++** dup_1_s8_x: ++** mov z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_1_s8_x, svint8_t, ++ z0 = svdup_n_s8_x (p0, 1), ++ z0 = svdup_s8_x (p0, 1)) ++ ++/* ++** dup_127_s8_x: ++** mov z0\.b, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_127_s8_x, svint8_t, ++ z0 = svdup_n_s8_x (p0, 127), ++ z0 = svdup_s8_x (p0, 127)) ++ ++/* ++** dup_128_s8_x: ++** mov z0\.b, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_128_s8_x, svint8_t, ++ z0 = svdup_n_s8_x (p0, 128), ++ z0 = svdup_s8_x (p0, 128)) ++ ++/* ++** dup_129_s8_x: ++** mov z0\.b, #-127 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_129_s8_x, svint8_t, ++ z0 = svdup_n_s8_x (p0, 129), ++ z0 = svdup_s8_x (p0, 129)) ++ ++/* ++** dup_253_s8_x: ++** mov z0\.b, #-3 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_253_s8_x, svint8_t, ++ z0 = svdup_n_s8_x (p0, 253), ++ z0 = svdup_s8_x (p0, 253)) ++ ++/* ++** dup_254_s8_x: ++** mov z0\.b, #-2 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_254_s8_x, svint8_t, ++ z0 = svdup_n_s8_x (p0, 254), ++ z0 = svdup_s8_x (p0, 254)) ++ ++/* ++** dup_255_s8_x: ++** mov z0\.b, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_255_s8_x, svint8_t, ++ z0 = svdup_n_s8_x (p0, 255), ++ z0 = svdup_s8_x (p0, 255)) ++ ++/* ++** dup_m1_s8_x: ++** mov z0\.b, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m1_s8_x, svint8_t, ++ z0 = svdup_n_s8_x (p0, -1), ++ z0 = svdup_s8_x (p0, -1)) ++ ++/* ++** dup_m128_s8_x: ++** mov z0\.b, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m128_s8_x, svint8_t, ++ 
z0 = svdup_n_s8_x (p0, -128), ++ z0 = svdup_s8_x (p0, -128)) ++ ++/* ++** dup_w0_s8_x: ++** mov z0\.b, w0 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_w0_s8_x, svint8_t, int8_t, ++ z0 = svdup_n_s8_x (p0, x0), ++ z0 = svdup_s8_x (p0, x0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_u16.c +new file mode 100644 +index 000000000..263eafef0 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_u16.c +@@ -0,0 +1,1193 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dup_1_u16: ++** mov z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_1_u16, svuint16_t, ++ z0 = svdup_n_u16 (1), ++ z0 = svdup_u16 (1)) ++ ++/* ++** dup_127_u16: ++** mov z0\.h, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_127_u16, svuint16_t, ++ z0 = svdup_n_u16 (127), ++ z0 = svdup_u16 (127)) ++ ++/* ++** dup_128_u16: ++** mov z0\.h, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_128_u16, svuint16_t, ++ z0 = svdup_n_u16 (128), ++ z0 = svdup_u16 (128)) ++ ++/* ++** dup_129_u16: ++** movi v([0-9]+)\.8h, 0x81 ++** dup z0\.q, z\1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_129_u16, svuint16_t, ++ z0 = svdup_n_u16 (129), ++ z0 = svdup_u16 (129)) ++ ++/* ++** dup_253_u16: ++** movi v([0-9]+)\.8h, 0xfd ++** dup z0\.q, z\1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_253_u16, svuint16_t, ++ z0 = svdup_n_u16 (253), ++ z0 = svdup_u16 (253)) ++ ++/* ++** dup_254_u16: ++** mov z0\.h, #254 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_254_u16, svuint16_t, ++ z0 = svdup_n_u16 (254), ++ z0 = svdup_u16 (254)) ++ ++/* ++** dup_255_u16: ++** mov z0\.h, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_255_u16, svuint16_t, ++ z0 = svdup_n_u16 (255), ++ z0 = svdup_u16 (255)) ++ ++/* ++** dup_256_u16: ++** mov z0\.h, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_256_u16, svuint16_t, ++ z0 = svdup_n_u16 (256), ++ z0 = svdup_u16 (256)) ++ ++/* ++** dup_257_u16: ++** mov z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_257_u16, svuint16_t, ++ z0 = svdup_n_u16 (257), ++ z0 = svdup_u16 (257)) ++ ++/* ++** dup_512_u16: ++** mov z0\.h, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_512_u16, svuint16_t, ++ z0 = svdup_n_u16 (512), ++ z0 = svdup_u16 (512)) ++ ++/* ++** dup_7f00_u16: ++** mov z0\.h, #32512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7f00_u16, svuint16_t, ++ z0 = svdup_n_u16 (0x7f00), ++ z0 = svdup_u16 (0x7f00)) ++ ++/* ++** dup_7f01_u16: ++** mov (w[0-9]+), 32513 ++** mov z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7f01_u16, svuint16_t, ++ z0 = svdup_n_u16 (0x7f01), ++ z0 = svdup_u16 (0x7f01)) ++ ++/* ++** dup_7ffd_u16: ++** mov (w[0-9]+), 32765 ++** mov z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7ffd_u16, svuint16_t, ++ z0 = svdup_n_u16 (0x7ffd), ++ z0 = svdup_u16 (0x7ffd)) ++ ++/* ++** dup_7ffe_u16: ++** mov z0\.h, #32766 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7ffe_u16, svuint16_t, ++ z0 = svdup_n_u16 (0x7ffe), ++ z0 = svdup_u16 (0x7ffe)) ++ ++/* ++** dup_7fff_u16: ++** mov z0\.h, #32767 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7fff_u16, svuint16_t, ++ z0 = svdup_n_u16 (0x7fff), ++ z0 = svdup_u16 (0x7fff)) ++ ++/* ++** dup_m1_u16: ++** mov z0\.b, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m1_u16, svuint16_t, ++ z0 = svdup_n_u16 (-1), ++ z0 = svdup_u16 (-1)) ++ ++/* ++** dup_m128_u16: ++** mov z0\.h, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m128_u16, svuint16_t, ++ z0 = svdup_n_u16 (-128), ++ z0 = svdup_u16 (-128)) ++ ++/* ++** dup_m129_u16: ++** mov z0\.h, #-129 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m129_u16, svuint16_t, ++ z0 = svdup_n_u16 (-129), ++ 
z0 = svdup_u16 (-129)) ++ ++/* ++** dup_m130_u16: ++** mvni v([0-9]+)\.8h, 0x81 ++** dup z0\.q, z\1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m130_u16, svuint16_t, ++ z0 = svdup_n_u16 (-130), ++ z0 = svdup_u16 (-130)) ++ ++/* ++** dup_m254_u16: ++** mvni v([0-9]+)\.8h, 0xfd ++** dup z0\.q, z\1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m254_u16, svuint16_t, ++ z0 = svdup_n_u16 (-254), ++ z0 = svdup_u16 (-254)) ++ ++/* ++** dup_m255_u16: ++** mov z0\.h, #-255 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m255_u16, svuint16_t, ++ z0 = svdup_n_u16 (-255), ++ z0 = svdup_u16 (-255)) ++ ++/* ++** dup_m256_u16: ++** mov z0\.h, #-256 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m256_u16, svuint16_t, ++ z0 = svdup_n_u16 (-256), ++ z0 = svdup_u16 (-256)) ++ ++/* ++** dup_m257_u16: ++** mov z0\.h, #-257 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m257_u16, svuint16_t, ++ z0 = svdup_n_u16 (-257), ++ z0 = svdup_u16 (-257)) ++ ++/* ++** dup_m258_u16: ++** mov z0\.b, #-2 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m258_u16, svuint16_t, ++ z0 = svdup_n_u16 (-258), ++ z0 = svdup_u16 (-258)) ++ ++/* ++** dup_m259_u16: ++** mov (w[0-9]+), -259 ++** mov z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m259_u16, svuint16_t, ++ z0 = svdup_n_u16 (-259), ++ z0 = svdup_u16 (-259)) ++ ++/* ++** dup_m512_u16: ++** mov z0\.h, #-512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m512_u16, svuint16_t, ++ z0 = svdup_n_u16 (-512), ++ z0 = svdup_u16 (-512)) ++ ++/* ++** dup_m7f00_u16: ++** mov z0\.h, #-32512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f00_u16, svuint16_t, ++ z0 = svdup_n_u16 (-0x7f00), ++ z0 = svdup_u16 (-0x7f00)) ++ ++/* ++** dup_m7f01_u16: ++** mov z0\.h, #-32513 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f01_u16, svuint16_t, ++ z0 = svdup_n_u16 (-0x7f01), ++ z0 = svdup_u16 (-0x7f01)) ++ ++/* ++** dup_m7f02_u16: ++** mov (w[0-9]+), -32514 ++** mov z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f02_u16, svuint16_t, ++ z0 = svdup_n_u16 (-0x7f02), ++ z0 = svdup_u16 (-0x7f02)) ++ ++/* ++** dup_m7ffe_u16: ++** mov (w[0-9]+), -32766 ++** mov z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7ffe_u16, svuint16_t, ++ z0 = svdup_n_u16 (-0x7ffe), ++ z0 = svdup_u16 (-0x7ffe)) ++ ++/* ++** dup_m7fff_u16: ++** mov z0\.h, #-32767 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7fff_u16, svuint16_t, ++ z0 = svdup_n_u16 (-0x7fff), ++ z0 = svdup_u16 (-0x7fff)) ++ ++/* ++** dup_m8000_u16: ++** mov z0\.h, #-32768 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m8000_u16, svuint16_t, ++ z0 = svdup_n_u16 (-0x8000), ++ z0 = svdup_u16 (-0x8000)) ++ ++/* ++** dup_w0_u16: ++** mov z0\.h, w0 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_w0_u16, svuint16_t, uint16_t, ++ z0 = svdup_n_u16 (x0), ++ z0 = svdup_u16 (x0)) ++ ++/* ++** dup_1_u16_m: ++** mov z0\.h, p0/m, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_1_u16_m, svuint16_t, ++ z0 = svdup_n_u16_m (z0, p0, 1), ++ z0 = svdup_u16_m (z0, p0, 1)) ++ ++/* ++** dup_127_u16_m: ++** mov z0\.h, p0/m, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_127_u16_m, svuint16_t, ++ z0 = svdup_n_u16_m (z0, p0, 127), ++ z0 = svdup_u16_m (z0, p0, 127)) ++ ++/* ++** dup_128_u16_m: ++** mov (z[0-9]+\.h), #128 ++** sel z0\.h, p0, \1, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (dup_128_u16_m, svuint16_t, ++ z0 = svdup_n_u16_m (z0, p0, 128), ++ z0 = svdup_u16_m (z0, p0, 128)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_129_u16_m, svuint16_t, ++ z0 = svdup_n_u16_m (z0, p0, 129), ++ z0 = svdup_u16_m (z0, p0, 129)) ++ ++/* TODO: Bad code and needs fixing. 
*/ ++TEST_UNIFORM_Z (dup_253_u16_m, svuint16_t, ++ z0 = svdup_n_u16_m (z0, p0, 253), ++ z0 = svdup_u16_m (z0, p0, 253)) ++ ++/* ++** dup_254_u16_m: ++** mov (z[0-9]+\.h), #254 ++** sel z0\.h, p0, \1, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (dup_254_u16_m, svuint16_t, ++ z0 = svdup_n_u16_m (z0, p0, 254), ++ z0 = svdup_u16_m (z0, p0, 254)) ++ ++/* ++** dup_255_u16_m: ++** mov (z[0-9]+\.h), #255 ++** sel z0\.h, p0, \1, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (dup_255_u16_m, svuint16_t, ++ z0 = svdup_n_u16_m (z0, p0, 255), ++ z0 = svdup_u16_m (z0, p0, 255)) ++ ++/* ++** dup_256_u16_m: ++** mov z0\.h, p0/m, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_256_u16_m, svuint16_t, ++ z0 = svdup_n_u16_m (z0, p0, 256), ++ z0 = svdup_u16_m (z0, p0, 256)) ++ ++/* ++** dup_257_u16_m: ++** mov (z[0-9]+)\.b, #1 ++** sel z0\.h, p0, \1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (dup_257_u16_m, svuint16_t, ++ z0 = svdup_n_u16_m (z0, p0, 257), ++ z0 = svdup_u16_m (z0, p0, 257)) ++ ++/* ++** dup_512_u16_m: ++** mov z0\.h, p0/m, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_512_u16_m, svuint16_t, ++ z0 = svdup_n_u16_m (z0, p0, 512), ++ z0 = svdup_u16_m (z0, p0, 512)) ++ ++/* ++** dup_7f00_u16_m: ++** mov z0\.h, p0/m, #32512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7f00_u16_m, svuint16_t, ++ z0 = svdup_n_u16_m (z0, p0, 0x7f00), ++ z0 = svdup_u16_m (z0, p0, 0x7f00)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_7f01_u16_m, svuint16_t, ++ z0 = svdup_n_u16_m (z0, p0, 0x7f01), ++ z0 = svdup_u16_m (z0, p0, 0x7f01)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_7ffd_u16_m, svuint16_t, ++ z0 = svdup_n_u16_m (z0, p0, 0x7ffd), ++ z0 = svdup_u16_m (z0, p0, 0x7ffd)) ++ ++/* ++** dup_7ffe_u16_m: ++** mov (z[0-9]+\.h), #32766 ++** sel z0\.h, p0, \1, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7ffe_u16_m, svuint16_t, ++ z0 = svdup_n_u16_m (z0, p0, 0x7ffe), ++ z0 = svdup_u16_m (z0, p0, 0x7ffe)) ++ ++/* ++** dup_7fff_u16_m: ++** mov (z[0-9]+\.h), #32767 ++** sel z0\.h, p0, \1, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7fff_u16_m, svuint16_t, ++ z0 = svdup_n_u16_m (z0, p0, 0x7fff), ++ z0 = svdup_u16_m (z0, p0, 0x7fff)) ++ ++/* ++** dup_m1_u16_m: ++** mov z0\.h, p0/m, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m1_u16_m, svuint16_t, ++ z0 = svdup_n_u16_m (z0, p0, -1), ++ z0 = svdup_u16_m (z0, p0, -1)) ++ ++/* ++** dup_m128_u16_m: ++** mov z0\.h, p0/m, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m128_u16_m, svuint16_t, ++ z0 = svdup_n_u16_m (z0, p0, -128), ++ z0 = svdup_u16_m (z0, p0, -128)) ++ ++/* ++** dup_m129_u16_m: ++** mov (z[0-9]+\.h), #-129 ++** sel z0\.h, p0, \1, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m129_u16_m, svuint16_t, ++ z0 = svdup_n_u16_m (z0, p0, -129), ++ z0 = svdup_u16_m (z0, p0, -129)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m130_u16_m, svuint16_t, ++ z0 = svdup_n_u16_m (z0, p0, -130), ++ z0 = svdup_u16_m (z0, p0, -130)) ++ ++/* TODO: Bad code and needs fixing. 
*/ ++TEST_UNIFORM_Z (dup_m254_u16_m, svuint16_t, ++ z0 = svdup_n_u16_m (z0, p0, -254), ++ z0 = svdup_u16_m (z0, p0, -254)) ++ ++/* ++** dup_m255_u16_m: ++** mov (z[0-9]+\.h), #-255 ++** sel z0\.h, p0, \1, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m255_u16_m, svuint16_t, ++ z0 = svdup_n_u16_m (z0, p0, -255), ++ z0 = svdup_u16_m (z0, p0, -255)) ++ ++/* ++** dup_m256_u16_m: ++** mov z0\.h, p0/m, #-256 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m256_u16_m, svuint16_t, ++ z0 = svdup_n_u16_m (z0, p0, -256), ++ z0 = svdup_u16_m (z0, p0, -256)) ++ ++/* ++** dup_m257_u16_m: ++** mov (z[0-9]+\.h), #-257 ++** sel z0\.h, p0, \1, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m257_u16_m, svuint16_t, ++ z0 = svdup_n_u16_m (z0, p0, -257), ++ z0 = svdup_u16_m (z0, p0, -257)) ++ ++/* ++** dup_m258_u16_m: ++** mov (z[0-9]+)\.b, #-2 ++** sel z0\.h, p0, \1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m258_u16_m, svuint16_t, ++ z0 = svdup_n_u16_m (z0, p0, -258), ++ z0 = svdup_u16_m (z0, p0, -258)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m259_u16_m, svuint16_t, ++ z0 = svdup_n_u16_m (z0, p0, -259), ++ z0 = svdup_u16_m (z0, p0, -259)) ++ ++/* ++** dup_m512_u16_m: ++** mov z0\.h, p0/m, #-512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m512_u16_m, svuint16_t, ++ z0 = svdup_n_u16_m (z0, p0, -512), ++ z0 = svdup_u16_m (z0, p0, -512)) ++ ++/* ++** dup_m7f00_u16_m: ++** mov z0\.h, p0/m, #-32512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f00_u16_m, svuint16_t, ++ z0 = svdup_n_u16_m (z0, p0, -0x7f00), ++ z0 = svdup_u16_m (z0, p0, -0x7f00)) ++ ++/* ++** dup_m7f01_u16_m: ++** mov (z[0-9]+\.h), #-32513 ++** sel z0\.h, p0, \1, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f01_u16_m, svuint16_t, ++ z0 = svdup_n_u16_m (z0, p0, -0x7f01), ++ z0 = svdup_u16_m (z0, p0, -0x7f01)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m7f02_u16_m, svuint16_t, ++ z0 = svdup_n_u16_m (z0, p0, -0x7f02), ++ z0 = svdup_u16_m (z0, p0, -0x7f02)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m7ffe_u16_m, svuint16_t, ++ z0 = svdup_n_u16_m (z0, p0, -0x7ffe), ++ z0 = svdup_u16_m (z0, p0, -0x7ffe)) ++ ++/* ++** dup_m7fff_u16_m: ++** mov (z[0-9]+\.h), #-32767 ++** sel z0\.h, p0, \1, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7fff_u16_m, svuint16_t, ++ z0 = svdup_n_u16_m (z0, p0, -0x7fff), ++ z0 = svdup_u16_m (z0, p0, -0x7fff)) ++ ++/* ++** dup_m8000_u16_m: ++** mov z0\.h, p0/m, #-32768 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m8000_u16_m, svuint16_t, ++ z0 = svdup_n_u16_m (z0, p0, -0x8000), ++ z0 = svdup_u16_m (z0, p0, -0x8000)) ++ ++/* ++** dup_0_u16_m: ++** mov z0\.h, p0/m, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_0_u16_m, svuint16_t, ++ z0 = svdup_n_u16_m (z0, p0, 0), ++ z0 = svdup_u16_m (z0, p0, 0)) ++ ++/* ++** dup_w0_u16_m: ++** movprfx z0, z1 ++** mov z0\.h, p0/m, w0 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_w0_u16_m, svuint16_t, uint16_t, ++ z0 = svdup_n_u16_m (z1, p0, x0), ++ z0 = svdup_u16_m (z1, p0, x0)) ++ ++/* ++** dup_1_u16_z: ++** mov z0\.h, p0/z, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_1_u16_z, svuint16_t, ++ z0 = svdup_n_u16_z (p0, 1), ++ z0 = svdup_u16_z (p0, 1)) ++ ++/* ++** dup_127_u16_z: ++** mov z0\.h, p0/z, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_127_u16_z, svuint16_t, ++ z0 = svdup_n_u16_z (p0, 127), ++ z0 = svdup_u16_z (p0, 127)) ++ ++/* ++** dup_128_u16_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.h), #128 ++** sel z0\.h, p0, \2, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (dup_128_u16_z, svuint16_t, ++ z0 = svdup_n_u16_z (p0, 128), ++ z0 = svdup_u16_z (p0, 128)) ++ ++/* TODO: Bad code and needs fixing. 
*/ ++TEST_UNIFORM_Z (dup_129_u16_z, svuint16_t, ++ z0 = svdup_n_u16_z (p0, 129), ++ z0 = svdup_u16_z (p0, 129)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_253_u16_z, svuint16_t, ++ z0 = svdup_n_u16_z (p0, 253), ++ z0 = svdup_u16_z (p0, 253)) ++ ++/* ++** dup_254_u16_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.h), #254 ++** sel z0\.h, p0, \2, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (dup_254_u16_z, svuint16_t, ++ z0 = svdup_n_u16_z (p0, 254), ++ z0 = svdup_u16_z (p0, 254)) ++ ++/* ++** dup_255_u16_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.h), #255 ++** sel z0\.h, p0, \2, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (dup_255_u16_z, svuint16_t, ++ z0 = svdup_n_u16_z (p0, 255), ++ z0 = svdup_u16_z (p0, 255)) ++ ++/* ++** dup_256_u16_z: ++** mov z0\.h, p0/z, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_256_u16_z, svuint16_t, ++ z0 = svdup_n_u16_z (p0, 256), ++ z0 = svdup_u16_z (p0, 256)) ++ ++/* ++** dup_257_u16_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+)\.b, #1 ++** sel z0\.h, p0, \2\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (dup_257_u16_z, svuint16_t, ++ z0 = svdup_n_u16_z (p0, 257), ++ z0 = svdup_u16_z (p0, 257)) ++ ++/* ++** dup_512_u16_z: ++** mov z0\.h, p0/z, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_512_u16_z, svuint16_t, ++ z0 = svdup_n_u16_z (p0, 512), ++ z0 = svdup_u16_z (p0, 512)) ++ ++/* ++** dup_7f00_u16_z: ++** mov z0\.h, p0/z, #32512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7f00_u16_z, svuint16_t, ++ z0 = svdup_n_u16_z (p0, 0x7f00), ++ z0 = svdup_u16_z (p0, 0x7f00)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_7f01_u16_z, svuint16_t, ++ z0 = svdup_n_u16_z (p0, 0x7f01), ++ z0 = svdup_u16_z (p0, 0x7f01)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_7ffd_u16_z, svuint16_t, ++ z0 = svdup_n_u16_z (p0, 0x7ffd), ++ z0 = svdup_u16_z (p0, 0x7ffd)) ++ ++/* ++** dup_7ffe_u16_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.h), #32766 ++** sel z0\.h, p0, \2, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7ffe_u16_z, svuint16_t, ++ z0 = svdup_n_u16_z (p0, 0x7ffe), ++ z0 = svdup_u16_z (p0, 0x7ffe)) ++ ++/* ++** dup_7fff_u16_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.h), #32767 ++** sel z0\.h, p0, \2, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7fff_u16_z, svuint16_t, ++ z0 = svdup_n_u16_z (p0, 0x7fff), ++ z0 = svdup_u16_z (p0, 0x7fff)) ++ ++/* ++** dup_m1_u16_z: ++** mov z0\.h, p0/z, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m1_u16_z, svuint16_t, ++ z0 = svdup_n_u16_z (p0, -1), ++ z0 = svdup_u16_z (p0, -1)) ++ ++/* ++** dup_m128_u16_z: ++** mov z0\.h, p0/z, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m128_u16_z, svuint16_t, ++ z0 = svdup_n_u16_z (p0, -128), ++ z0 = svdup_u16_z (p0, -128)) ++ ++/* ++** dup_m129_u16_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.h), #-129 ++** sel z0\.h, p0, \2, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m129_u16_z, svuint16_t, ++ z0 = svdup_n_u16_z (p0, -129), ++ z0 = svdup_u16_z (p0, -129)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m130_u16_z, svuint16_t, ++ z0 = svdup_n_u16_z (p0, -130), ++ z0 = svdup_u16_z (p0, -130)) ++ ++/* TODO: Bad code and needs fixing. 
*/ ++TEST_UNIFORM_Z (dup_m254_u16_z, svuint16_t, ++ z0 = svdup_n_u16_z (p0, -254), ++ z0 = svdup_u16_z (p0, -254)) ++ ++/* ++** dup_m255_u16_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.h), #-255 ++** sel z0\.h, p0, \2, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m255_u16_z, svuint16_t, ++ z0 = svdup_n_u16_z (p0, -255), ++ z0 = svdup_u16_z (p0, -255)) ++ ++/* ++** dup_m256_u16_z: ++** mov z0\.h, p0/z, #-256 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m256_u16_z, svuint16_t, ++ z0 = svdup_n_u16_z (p0, -256), ++ z0 = svdup_u16_z (p0, -256)) ++ ++/* ++** dup_m257_u16_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.h), #-257 ++** sel z0\.h, p0, \2, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m257_u16_z, svuint16_t, ++ z0 = svdup_n_u16_z (p0, -257), ++ z0 = svdup_u16_z (p0, -257)) ++ ++/* ++** dup_m258_u16_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+)\.b, #-2 ++** sel z0\.h, p0, \2\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m258_u16_z, svuint16_t, ++ z0 = svdup_n_u16_z (p0, -258), ++ z0 = svdup_u16_z (p0, -258)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m259_u16_z, svuint16_t, ++ z0 = svdup_n_u16_z (p0, -259), ++ z0 = svdup_u16_z (p0, -259)) ++ ++/* ++** dup_m512_u16_z: ++** mov z0\.h, p0/z, #-512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m512_u16_z, svuint16_t, ++ z0 = svdup_n_u16_z (p0, -512), ++ z0 = svdup_u16_z (p0, -512)) ++ ++/* ++** dup_m7f00_u16_z: ++** mov z0\.h, p0/z, #-32512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f00_u16_z, svuint16_t, ++ z0 = svdup_n_u16_z (p0, -0x7f00), ++ z0 = svdup_u16_z (p0, -0x7f00)) ++ ++/* ++** dup_m7f01_u16_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.h), #-32513 ++** sel z0\.h, p0, \2, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f01_u16_z, svuint16_t, ++ z0 = svdup_n_u16_z (p0, -0x7f01), ++ z0 = svdup_u16_z (p0, -0x7f01)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m7f02_u16_z, svuint16_t, ++ z0 = svdup_n_u16_z (p0, -0x7f02), ++ z0 = svdup_u16_z (p0, -0x7f02)) ++ ++/* TODO: Bad code and needs fixing. 
*/ ++TEST_UNIFORM_Z (dup_m7ffe_u16_z, svuint16_t, ++ z0 = svdup_n_u16_z (p0, -0x7ffe), ++ z0 = svdup_u16_z (p0, -0x7ffe)) ++ ++/* ++** dup_m7fff_u16_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.h), #-32767 ++** sel z0\.h, p0, \2, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7fff_u16_z, svuint16_t, ++ z0 = svdup_n_u16_z (p0, -0x7fff), ++ z0 = svdup_u16_z (p0, -0x7fff)) ++ ++/* ++** dup_m8000_u16_z: ++** mov z0\.h, p0/z, #-32768 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m8000_u16_z, svuint16_t, ++ z0 = svdup_n_u16_z (p0, -0x8000), ++ z0 = svdup_u16_z (p0, -0x8000)) ++ ++/* ++** dup_0_u16_z: ++** mov z0\.h, p0/z, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_0_u16_z, svuint16_t, ++ z0 = svdup_n_u16_z (p0, 0), ++ z0 = svdup_u16_z (p0, 0)) ++ ++/* ++** dup_w0_u16_z: ++** movprfx z0\.h, p0/z, z0\.h ++** mov z0\.h, p0/m, w0 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_w0_u16_z, svuint16_t, uint16_t, ++ z0 = svdup_n_u16_z (p0, x0), ++ z0 = svdup_u16_z (p0, x0)) ++ ++/* ++** dup_1_u16_x: ++** mov z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_1_u16_x, svuint16_t, ++ z0 = svdup_n_u16_x (p0, 1), ++ z0 = svdup_u16_x (p0, 1)) ++ ++/* ++** dup_127_u16_x: ++** mov z0\.h, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_127_u16_x, svuint16_t, ++ z0 = svdup_n_u16_x (p0, 127), ++ z0 = svdup_u16_x (p0, 127)) ++ ++/* ++** dup_128_u16_x: ++** mov z0\.h, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_128_u16_x, svuint16_t, ++ z0 = svdup_n_u16_x (p0, 128), ++ z0 = svdup_u16_x (p0, 128)) ++ ++/* ++** dup_129_u16_x: ++** movi v([0-9]+)\.8h, 0x81 ++** dup z0\.q, z\1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_129_u16_x, svuint16_t, ++ z0 = svdup_n_u16_x (p0, 129), ++ z0 = svdup_u16_x (p0, 129)) ++ ++/* ++** dup_253_u16_x: ++** movi v([0-9]+)\.8h, 0xfd ++** dup z0\.q, z\1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_253_u16_x, svuint16_t, ++ z0 = svdup_n_u16_x (p0, 253), ++ z0 = svdup_u16_x (p0, 253)) ++ ++/* ++** dup_254_u16_x: ++** mov z0\.h, #254 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_254_u16_x, svuint16_t, ++ z0 = svdup_n_u16_x (p0, 254), ++ z0 = svdup_u16_x (p0, 254)) ++ ++/* ++** dup_255_u16_x: ++** mov z0\.h, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_255_u16_x, svuint16_t, ++ z0 = svdup_n_u16_x (p0, 255), ++ z0 = svdup_u16_x (p0, 255)) ++ ++/* ++** dup_256_u16_x: ++** mov z0\.h, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_256_u16_x, svuint16_t, ++ z0 = svdup_n_u16_x (p0, 256), ++ z0 = svdup_u16_x (p0, 256)) ++ ++/* ++** dup_257_u16_x: ++** mov z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_257_u16_x, svuint16_t, ++ z0 = svdup_n_u16_x (p0, 257), ++ z0 = svdup_u16_x (p0, 257)) ++ ++/* ++** dup_512_u16_x: ++** mov z0\.h, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_512_u16_x, svuint16_t, ++ z0 = svdup_n_u16_x (p0, 512), ++ z0 = svdup_u16_x (p0, 512)) ++ ++/* ++** dup_7f00_u16_x: ++** mov z0\.h, #32512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7f00_u16_x, svuint16_t, ++ z0 = svdup_n_u16_x (p0, 0x7f00), ++ z0 = svdup_u16_x (p0, 0x7f00)) ++ ++/* ++** dup_7f01_u16_x: ++** mov (w[0-9]+), 32513 ++** mov z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7f01_u16_x, svuint16_t, ++ z0 = svdup_n_u16_x (p0, 0x7f01), ++ z0 = svdup_u16_x (p0, 0x7f01)) ++ ++/* ++** dup_7ffd_u16_x: ++** mov (w[0-9]+), 32765 ++** mov z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7ffd_u16_x, svuint16_t, ++ z0 = svdup_n_u16_x (p0, 0x7ffd), ++ z0 = svdup_u16_x (p0, 0x7ffd)) ++ ++/* ++** dup_7ffe_u16_x: ++** mov z0\.h, #32766 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7ffe_u16_x, svuint16_t, ++ z0 = svdup_n_u16_x (p0, 0x7ffe), ++ z0 = svdup_u16_x (p0, 0x7ffe)) ++ ++/* ++** dup_7fff_u16_x: ++** mov z0\.h, #32767 
++** ret ++*/ ++TEST_UNIFORM_Z (dup_7fff_u16_x, svuint16_t, ++ z0 = svdup_n_u16_x (p0, 0x7fff), ++ z0 = svdup_u16_x (p0, 0x7fff)) ++ ++/* ++** dup_m1_u16_x: ++** mov z0\.b, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m1_u16_x, svuint16_t, ++ z0 = svdup_n_u16_x (p0, -1), ++ z0 = svdup_u16_x (p0, -1)) ++ ++/* ++** dup_m128_u16_x: ++** mov z0\.h, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m128_u16_x, svuint16_t, ++ z0 = svdup_n_u16_x (p0, -128), ++ z0 = svdup_u16_x (p0, -128)) ++ ++/* ++** dup_m129_u16_x: ++** mov z0\.h, #-129 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m129_u16_x, svuint16_t, ++ z0 = svdup_n_u16_x (p0, -129), ++ z0 = svdup_u16_x (p0, -129)) ++ ++/* ++** dup_m130_u16_x: ++** mvni v([0-9]+)\.8h, 0x81 ++** dup z0\.q, z\1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m130_u16_x, svuint16_t, ++ z0 = svdup_n_u16_x (p0, -130), ++ z0 = svdup_u16_x (p0, -130)) ++ ++/* ++** dup_m254_u16_x: ++** mvni v([0-9]+)\.8h, 0xfd ++** dup z0\.q, z\1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m254_u16_x, svuint16_t, ++ z0 = svdup_n_u16_x (p0, -254), ++ z0 = svdup_u16_x (p0, -254)) ++ ++/* ++** dup_m255_u16_x: ++** mov z0\.h, #-255 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m255_u16_x, svuint16_t, ++ z0 = svdup_n_u16_x (p0, -255), ++ z0 = svdup_u16_x (p0, -255)) ++ ++/* ++** dup_m256_u16_x: ++** mov z0\.h, #-256 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m256_u16_x, svuint16_t, ++ z0 = svdup_n_u16_x (p0, -256), ++ z0 = svdup_u16_x (p0, -256)) ++ ++/* ++** dup_m257_u16_x: ++** mov z0\.h, #-257 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m257_u16_x, svuint16_t, ++ z0 = svdup_n_u16_x (p0, -257), ++ z0 = svdup_u16_x (p0, -257)) ++ ++/* ++** dup_m258_u16_x: ++** mov z0\.b, #-2 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m258_u16_x, svuint16_t, ++ z0 = svdup_n_u16_x (p0, -258), ++ z0 = svdup_u16_x (p0, -258)) ++ ++/* ++** dup_m259_u16_x: ++** mov (w[0-9]+), -259 ++** mov z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m259_u16_x, svuint16_t, ++ z0 = svdup_n_u16_x (p0, -259), ++ z0 = svdup_u16_x (p0, -259)) ++ ++/* ++** dup_m512_u16_x: ++** mov z0\.h, #-512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m512_u16_x, svuint16_t, ++ z0 = svdup_n_u16_x (p0, -512), ++ z0 = svdup_u16_x (p0, -512)) ++ ++/* ++** dup_m7f00_u16_x: ++** mov z0\.h, #-32512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f00_u16_x, svuint16_t, ++ z0 = svdup_n_u16_x (p0, -0x7f00), ++ z0 = svdup_u16_x (p0, -0x7f00)) ++ ++/* ++** dup_m7f01_u16_x: ++** mov z0\.h, #-32513 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f01_u16_x, svuint16_t, ++ z0 = svdup_n_u16_x (p0, -0x7f01), ++ z0 = svdup_u16_x (p0, -0x7f01)) ++ ++/* ++** dup_m7f02_u16_x: ++** mov (w[0-9]+), -32514 ++** mov z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f02_u16_x, svuint16_t, ++ z0 = svdup_n_u16_x (p0, -0x7f02), ++ z0 = svdup_u16_x (p0, -0x7f02)) ++ ++/* ++** dup_m7ffe_u16_x: ++** mov (w[0-9]+), -32766 ++** mov z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7ffe_u16_x, svuint16_t, ++ z0 = svdup_n_u16_x (p0, -0x7ffe), ++ z0 = svdup_u16_x (p0, -0x7ffe)) ++ ++/* ++** dup_m7fff_u16_x: ++** mov z0\.h, #-32767 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7fff_u16_x, svuint16_t, ++ z0 = svdup_n_u16_x (p0, -0x7fff), ++ z0 = svdup_u16_x (p0, -0x7fff)) ++ ++/* ++** dup_m8000_u16_x: ++** mov z0\.h, #-32768 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m8000_u16_x, svuint16_t, ++ z0 = svdup_n_u16_x (p0, -0x8000), ++ z0 = svdup_u16_x (p0, -0x8000)) ++ ++/* ++** dup_w0_u16_x: ++** mov z0\.h, w0 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_w0_u16_x, svuint16_t, uint16_t, ++ z0 = svdup_n_u16_x (p0, x0), ++ z0 = svdup_u16_x (p0, x0)) +diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_u32.c +new file mode 100644 +index 000000000..667feea64 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_u32.c +@@ -0,0 +1,1175 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dup_1_u32: ++** mov z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_1_u32, svuint32_t, ++ z0 = svdup_n_u32 (1), ++ z0 = svdup_u32 (1)) ++ ++/* ++** dup_127_u32: ++** mov z0\.s, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_127_u32, svuint32_t, ++ z0 = svdup_n_u32 (127), ++ z0 = svdup_u32 (127)) ++ ++/* ++** dup_128_u32: ++** mov z0\.s, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_128_u32, svuint32_t, ++ z0 = svdup_n_u32 (128), ++ z0 = svdup_u32 (128)) ++ ++/* ++** dup_129_u32: ++** movi v([0-9]+)\.4s, 0x81 ++** dup z0\.q, z\1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_129_u32, svuint32_t, ++ z0 = svdup_n_u32 (129), ++ z0 = svdup_u32 (129)) ++ ++/* ++** dup_253_u32: ++** movi v([0-9]+)\.4s, 0xfd ++** dup z0\.q, z\1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_253_u32, svuint32_t, ++ z0 = svdup_n_u32 (253), ++ z0 = svdup_u32 (253)) ++ ++/* ++** dup_254_u32: ++** mov z0\.s, #254 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_254_u32, svuint32_t, ++ z0 = svdup_n_u32 (254), ++ z0 = svdup_u32 (254)) ++ ++/* ++** dup_255_u32: ++** mov z0\.s, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_255_u32, svuint32_t, ++ z0 = svdup_n_u32 (255), ++ z0 = svdup_u32 (255)) ++ ++/* ++** dup_256_u32: ++** mov z0\.s, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_256_u32, svuint32_t, ++ z0 = svdup_n_u32 (256), ++ z0 = svdup_u32 (256)) ++ ++/* ++** dup_257_u32: ++** mov (w[0-9]+), 257 ++** mov z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_257_u32, svuint32_t, ++ z0 = svdup_n_u32 (257), ++ z0 = svdup_u32 (257)) ++ ++/* ++** dup_512_u32: ++** mov z0\.s, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_512_u32, svuint32_t, ++ z0 = svdup_n_u32 (512), ++ z0 = svdup_u32 (512)) ++ ++/* ++** dup_7f00_u32: ++** mov z0\.s, #32512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7f00_u32, svuint32_t, ++ z0 = svdup_n_u32 (0x7f00), ++ z0 = svdup_u32 (0x7f00)) ++ ++/* ++** dup_7f01_u32: ++** mov (w[0-9]+), 32513 ++** mov z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7f01_u32, svuint32_t, ++ z0 = svdup_n_u32 (0x7f01), ++ z0 = svdup_u32 (0x7f01)) ++ ++/* ++** dup_7ffd_u32: ++** mov (w[0-9]+), 32765 ++** mov z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7ffd_u32, svuint32_t, ++ z0 = svdup_n_u32 (0x7ffd), ++ z0 = svdup_u32 (0x7ffd)) ++ ++/* ++** dup_7ffe_u32: ++** mov z0\.s, #32766 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7ffe_u32, svuint32_t, ++ z0 = svdup_n_u32 (0x7ffe), ++ z0 = svdup_u32 (0x7ffe)) ++ ++/* ++** dup_7fff_u32: ++** mov z0\.s, #32767 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7fff_u32, svuint32_t, ++ z0 = svdup_n_u32 (0x7fff), ++ z0 = svdup_u32 (0x7fff)) ++ ++/* ++** dup_m1_u32: ++** mov z0\.b, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m1_u32, svuint32_t, ++ z0 = svdup_n_u32 (-1), ++ z0 = svdup_u32 (-1)) ++ ++/* ++** dup_m128_u32: ++** mov z0\.s, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m128_u32, svuint32_t, ++ z0 = svdup_n_u32 (-128), ++ z0 = svdup_u32 (-128)) ++ ++/* ++** dup_m129_u32: ++** mov z0\.s, #-129 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m129_u32, svuint32_t, ++ z0 = svdup_n_u32 (-129), ++ z0 = svdup_u32 (-129)) ++ ++/* ++** dup_m130_u32: ++** mvni v([0-9]+)\.4s, 0x81 ++** dup z0\.q, z\1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m130_u32, svuint32_t, ++ z0 = svdup_n_u32 (-130), ++ z0 = svdup_u32 (-130)) 
++ ++/* ++** dup_m254_u32: ++** mvni v([0-9]+)\.4s, 0xfd ++** dup z0\.q, z\1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m254_u32, svuint32_t, ++ z0 = svdup_n_u32 (-254), ++ z0 = svdup_u32 (-254)) ++ ++/* ++** dup_m255_u32: ++** mov z0\.s, #-255 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m255_u32, svuint32_t, ++ z0 = svdup_n_u32 (-255), ++ z0 = svdup_u32 (-255)) ++ ++/* ++** dup_m256_u32: ++** mov z0\.s, #-256 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m256_u32, svuint32_t, ++ z0 = svdup_n_u32 (-256), ++ z0 = svdup_u32 (-256)) ++ ++/* ++** dup_m257_u32: ++** mov z0\.s, #-257 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m257_u32, svuint32_t, ++ z0 = svdup_n_u32 (-257), ++ z0 = svdup_u32 (-257)) ++ ++/* ++** dup_m258_u32: ++** mov (w[0-9]+), -258 ++** mov z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m258_u32, svuint32_t, ++ z0 = svdup_n_u32 (-258), ++ z0 = svdup_u32 (-258)) ++ ++/* ++** dup_m259_u32: ++** mov (w[0-9]+), -259 ++** mov z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m259_u32, svuint32_t, ++ z0 = svdup_n_u32 (-259), ++ z0 = svdup_u32 (-259)) ++ ++/* ++** dup_m512_u32: ++** mov z0\.s, #-512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m512_u32, svuint32_t, ++ z0 = svdup_n_u32 (-512), ++ z0 = svdup_u32 (-512)) ++ ++/* ++** dup_m7f00_u32: ++** mov z0\.s, #-32512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f00_u32, svuint32_t, ++ z0 = svdup_n_u32 (-0x7f00), ++ z0 = svdup_u32 (-0x7f00)) ++ ++/* ++** dup_m7f01_u32: ++** mov z0\.s, #-32513 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f01_u32, svuint32_t, ++ z0 = svdup_n_u32 (-0x7f01), ++ z0 = svdup_u32 (-0x7f01)) ++ ++/* ++** dup_m7f02_u32: ++** mov (w[0-9]+), -32514 ++** mov z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f02_u32, svuint32_t, ++ z0 = svdup_n_u32 (-0x7f02), ++ z0 = svdup_u32 (-0x7f02)) ++ ++/* ++** dup_m7ffe_u32: ++** mov (w[0-9]+), -32766 ++** mov z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7ffe_u32, svuint32_t, ++ z0 = svdup_n_u32 (-0x7ffe), ++ z0 = svdup_u32 (-0x7ffe)) ++ ++/* ++** dup_m7fff_u32: ++** mov z0\.s, #-32767 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7fff_u32, svuint32_t, ++ z0 = svdup_n_u32 (-0x7fff), ++ z0 = svdup_u32 (-0x7fff)) ++ ++/* ++** dup_m8000_u32: ++** mov z0\.s, #-32768 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m8000_u32, svuint32_t, ++ z0 = svdup_n_u32 (-0x8000), ++ z0 = svdup_u32 (-0x8000)) ++ ++/* ++** dup_w0_u32: ++** mov z0\.s, w0 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_w0_u32, svuint32_t, uint32_t, ++ z0 = svdup_n_u32 (x0), ++ z0 = svdup_u32 (x0)) ++ ++/* ++** dup_1_u32_m: ++** mov z0\.s, p0/m, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_1_u32_m, svuint32_t, ++ z0 = svdup_n_u32_m (z0, p0, 1), ++ z0 = svdup_u32_m (z0, p0, 1)) ++ ++/* ++** dup_127_u32_m: ++** mov z0\.s, p0/m, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_127_u32_m, svuint32_t, ++ z0 = svdup_n_u32_m (z0, p0, 127), ++ z0 = svdup_u32_m (z0, p0, 127)) ++ ++/* ++** dup_128_u32_m: ++** mov (z[0-9]+\.s), #128 ++** sel z0\.s, p0, \1, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (dup_128_u32_m, svuint32_t, ++ z0 = svdup_n_u32_m (z0, p0, 128), ++ z0 = svdup_u32_m (z0, p0, 128)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_129_u32_m, svuint32_t, ++ z0 = svdup_n_u32_m (z0, p0, 129), ++ z0 = svdup_u32_m (z0, p0, 129)) ++ ++/* TODO: Bad code and needs fixing. 
*/ ++TEST_UNIFORM_Z (dup_253_u32_m, svuint32_t, ++ z0 = svdup_n_u32_m (z0, p0, 253), ++ z0 = svdup_u32_m (z0, p0, 253)) ++ ++/* ++** dup_254_u32_m: ++** mov (z[0-9]+\.s), #254 ++** sel z0\.s, p0, \1, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (dup_254_u32_m, svuint32_t, ++ z0 = svdup_n_u32_m (z0, p0, 254), ++ z0 = svdup_u32_m (z0, p0, 254)) ++ ++/* ++** dup_255_u32_m: ++** mov (z[0-9]+\.s), #255 ++** sel z0\.s, p0, \1, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (dup_255_u32_m, svuint32_t, ++ z0 = svdup_n_u32_m (z0, p0, 255), ++ z0 = svdup_u32_m (z0, p0, 255)) ++ ++/* ++** dup_256_u32_m: ++** mov z0\.s, p0/m, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_256_u32_m, svuint32_t, ++ z0 = svdup_n_u32_m (z0, p0, 256), ++ z0 = svdup_u32_m (z0, p0, 256)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_257_u32_m, svuint32_t, ++ z0 = svdup_n_u32_m (z0, p0, 257), ++ z0 = svdup_u32_m (z0, p0, 257)) ++ ++/* ++** dup_512_u32_m: ++** mov z0\.s, p0/m, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_512_u32_m, svuint32_t, ++ z0 = svdup_n_u32_m (z0, p0, 512), ++ z0 = svdup_u32_m (z0, p0, 512)) ++ ++/* ++** dup_7f00_u32_m: ++** mov z0\.s, p0/m, #32512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7f00_u32_m, svuint32_t, ++ z0 = svdup_n_u32_m (z0, p0, 0x7f00), ++ z0 = svdup_u32_m (z0, p0, 0x7f00)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_7f01_u32_m, svuint32_t, ++ z0 = svdup_n_u32_m (z0, p0, 0x7f01), ++ z0 = svdup_u32_m (z0, p0, 0x7f01)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_7ffd_u32_m, svuint32_t, ++ z0 = svdup_n_u32_m (z0, p0, 0x7ffd), ++ z0 = svdup_u32_m (z0, p0, 0x7ffd)) ++ ++/* ++** dup_7ffe_u32_m: ++** mov (z[0-9]+\.s), #32766 ++** sel z0\.s, p0, \1, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7ffe_u32_m, svuint32_t, ++ z0 = svdup_n_u32_m (z0, p0, 0x7ffe), ++ z0 = svdup_u32_m (z0, p0, 0x7ffe)) ++ ++/* ++** dup_7fff_u32_m: ++** mov (z[0-9]+\.s), #32767 ++** sel z0\.s, p0, \1, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7fff_u32_m, svuint32_t, ++ z0 = svdup_n_u32_m (z0, p0, 0x7fff), ++ z0 = svdup_u32_m (z0, p0, 0x7fff)) ++ ++/* ++** dup_m1_u32_m: ++** mov z0\.s, p0/m, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m1_u32_m, svuint32_t, ++ z0 = svdup_n_u32_m (z0, p0, -1), ++ z0 = svdup_u32_m (z0, p0, -1)) ++ ++/* ++** dup_m128_u32_m: ++** mov z0\.s, p0/m, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m128_u32_m, svuint32_t, ++ z0 = svdup_n_u32_m (z0, p0, -128), ++ z0 = svdup_u32_m (z0, p0, -128)) ++ ++/* ++** dup_m129_u32_m: ++** mov (z[0-9]+\.s), #-129 ++** sel z0\.s, p0, \1, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m129_u32_m, svuint32_t, ++ z0 = svdup_n_u32_m (z0, p0, -129), ++ z0 = svdup_u32_m (z0, p0, -129)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m130_u32_m, svuint32_t, ++ z0 = svdup_n_u32_m (z0, p0, -130), ++ z0 = svdup_u32_m (z0, p0, -130)) ++ ++/* TODO: Bad code and needs fixing. 
*/ ++TEST_UNIFORM_Z (dup_m254_u32_m, svuint32_t, ++ z0 = svdup_n_u32_m (z0, p0, -254), ++ z0 = svdup_u32_m (z0, p0, -254)) ++ ++/* ++** dup_m255_u32_m: ++** mov (z[0-9]+\.s), #-255 ++** sel z0\.s, p0, \1, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m255_u32_m, svuint32_t, ++ z0 = svdup_n_u32_m (z0, p0, -255), ++ z0 = svdup_u32_m (z0, p0, -255)) ++ ++/* ++** dup_m256_u32_m: ++** mov z0\.s, p0/m, #-256 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m256_u32_m, svuint32_t, ++ z0 = svdup_n_u32_m (z0, p0, -256), ++ z0 = svdup_u32_m (z0, p0, -256)) ++ ++/* ++** dup_m257_u32_m: ++** mov (z[0-9]+\.s), #-257 ++** sel z0\.s, p0, \1, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m257_u32_m, svuint32_t, ++ z0 = svdup_n_u32_m (z0, p0, -257), ++ z0 = svdup_u32_m (z0, p0, -257)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m258_u32_m, svuint32_t, ++ z0 = svdup_n_u32_m (z0, p0, -258), ++ z0 = svdup_u32_m (z0, p0, -258)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m259_u32_m, svuint32_t, ++ z0 = svdup_n_u32_m (z0, p0, -259), ++ z0 = svdup_u32_m (z0, p0, -259)) ++ ++/* ++** dup_m512_u32_m: ++** mov z0\.s, p0/m, #-512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m512_u32_m, svuint32_t, ++ z0 = svdup_n_u32_m (z0, p0, -512), ++ z0 = svdup_u32_m (z0, p0, -512)) ++ ++/* ++** dup_m7f00_u32_m: ++** mov z0\.s, p0/m, #-32512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f00_u32_m, svuint32_t, ++ z0 = svdup_n_u32_m (z0, p0, -0x7f00), ++ z0 = svdup_u32_m (z0, p0, -0x7f00)) ++ ++/* ++** dup_m7f01_u32_m: ++** mov (z[0-9]+\.s), #-32513 ++** sel z0\.s, p0, \1, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f01_u32_m, svuint32_t, ++ z0 = svdup_n_u32_m (z0, p0, -0x7f01), ++ z0 = svdup_u32_m (z0, p0, -0x7f01)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m7f02_u32_m, svuint32_t, ++ z0 = svdup_n_u32_m (z0, p0, -0x7f02), ++ z0 = svdup_u32_m (z0, p0, -0x7f02)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m7ffe_u32_m, svuint32_t, ++ z0 = svdup_n_u32_m (z0, p0, -0x7ffe), ++ z0 = svdup_u32_m (z0, p0, -0x7ffe)) ++ ++/* ++** dup_m7fff_u32_m: ++** mov (z[0-9]+\.s), #-32767 ++** sel z0\.s, p0, \1, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7fff_u32_m, svuint32_t, ++ z0 = svdup_n_u32_m (z0, p0, -0x7fff), ++ z0 = svdup_u32_m (z0, p0, -0x7fff)) ++ ++/* ++** dup_m8000_u32_m: ++** mov z0\.s, p0/m, #-32768 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m8000_u32_m, svuint32_t, ++ z0 = svdup_n_u32_m (z0, p0, -0x8000), ++ z0 = svdup_u32_m (z0, p0, -0x8000)) ++ ++/* ++** dup_0_u32_m: ++** mov z0\.s, p0/m, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_0_u32_m, svuint32_t, ++ z0 = svdup_n_u32_m (z0, p0, 0), ++ z0 = svdup_u32_m (z0, p0, 0)) ++ ++/* ++** dup_w0_u32_m: ++** movprfx z0, z1 ++** mov z0\.s, p0/m, w0 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_w0_u32_m, svuint32_t, uint32_t, ++ z0 = svdup_n_u32_m (z1, p0, x0), ++ z0 = svdup_u32_m (z1, p0, x0)) ++ ++/* ++** dup_1_u32_z: ++** mov z0\.s, p0/z, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_1_u32_z, svuint32_t, ++ z0 = svdup_n_u32_z (p0, 1), ++ z0 = svdup_u32_z (p0, 1)) ++ ++/* ++** dup_127_u32_z: ++** mov z0\.s, p0/z, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_127_u32_z, svuint32_t, ++ z0 = svdup_n_u32_z (p0, 127), ++ z0 = svdup_u32_z (p0, 127)) ++ ++/* ++** dup_128_u32_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.s), #128 ++** sel z0\.s, p0, \2, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (dup_128_u32_z, svuint32_t, ++ z0 = svdup_n_u32_z (p0, 128), ++ z0 = svdup_u32_z (p0, 128)) ++ ++/* TODO: Bad code and needs fixing. 
*/ ++TEST_UNIFORM_Z (dup_129_u32_z, svuint32_t, ++ z0 = svdup_n_u32_z (p0, 129), ++ z0 = svdup_u32_z (p0, 129)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_253_u32_z, svuint32_t, ++ z0 = svdup_n_u32_z (p0, 253), ++ z0 = svdup_u32_z (p0, 253)) ++ ++/* ++** dup_254_u32_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.s), #254 ++** sel z0\.s, p0, \2, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (dup_254_u32_z, svuint32_t, ++ z0 = svdup_n_u32_z (p0, 254), ++ z0 = svdup_u32_z (p0, 254)) ++ ++/* ++** dup_255_u32_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.s), #255 ++** sel z0\.s, p0, \2, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (dup_255_u32_z, svuint32_t, ++ z0 = svdup_n_u32_z (p0, 255), ++ z0 = svdup_u32_z (p0, 255)) ++ ++/* ++** dup_256_u32_z: ++** mov z0\.s, p0/z, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_256_u32_z, svuint32_t, ++ z0 = svdup_n_u32_z (p0, 256), ++ z0 = svdup_u32_z (p0, 256)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_257_u32_z, svuint32_t, ++ z0 = svdup_n_u32_z (p0, 257), ++ z0 = svdup_u32_z (p0, 257)) ++ ++/* ++** dup_512_u32_z: ++** mov z0\.s, p0/z, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_512_u32_z, svuint32_t, ++ z0 = svdup_n_u32_z (p0, 512), ++ z0 = svdup_u32_z (p0, 512)) ++ ++/* ++** dup_7f00_u32_z: ++** mov z0\.s, p0/z, #32512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7f00_u32_z, svuint32_t, ++ z0 = svdup_n_u32_z (p0, 0x7f00), ++ z0 = svdup_u32_z (p0, 0x7f00)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_7f01_u32_z, svuint32_t, ++ z0 = svdup_n_u32_z (p0, 0x7f01), ++ z0 = svdup_u32_z (p0, 0x7f01)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_7ffd_u32_z, svuint32_t, ++ z0 = svdup_n_u32_z (p0, 0x7ffd), ++ z0 = svdup_u32_z (p0, 0x7ffd)) ++ ++/* ++** dup_7ffe_u32_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.s), #32766 ++** sel z0\.s, p0, \2, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7ffe_u32_z, svuint32_t, ++ z0 = svdup_n_u32_z (p0, 0x7ffe), ++ z0 = svdup_u32_z (p0, 0x7ffe)) ++ ++/* ++** dup_7fff_u32_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.s), #32767 ++** sel z0\.s, p0, \2, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7fff_u32_z, svuint32_t, ++ z0 = svdup_n_u32_z (p0, 0x7fff), ++ z0 = svdup_u32_z (p0, 0x7fff)) ++ ++/* ++** dup_m1_u32_z: ++** mov z0\.s, p0/z, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m1_u32_z, svuint32_t, ++ z0 = svdup_n_u32_z (p0, -1), ++ z0 = svdup_u32_z (p0, -1)) ++ ++/* ++** dup_m128_u32_z: ++** mov z0\.s, p0/z, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m128_u32_z, svuint32_t, ++ z0 = svdup_n_u32_z (p0, -128), ++ z0 = svdup_u32_z (p0, -128)) ++ ++/* ++** dup_m129_u32_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.s), #-129 ++** sel z0\.s, p0, \2, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m129_u32_z, svuint32_t, ++ z0 = svdup_n_u32_z (p0, -129), ++ z0 = svdup_u32_z (p0, -129)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m130_u32_z, svuint32_t, ++ z0 = svdup_n_u32_z (p0, -130), ++ z0 = svdup_u32_z (p0, -130)) ++ ++/* TODO: Bad code and needs fixing. 
*/ ++TEST_UNIFORM_Z (dup_m254_u32_z, svuint32_t, ++ z0 = svdup_n_u32_z (p0, -254), ++ z0 = svdup_u32_z (p0, -254)) ++ ++/* ++** dup_m255_u32_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.s), #-255 ++** sel z0\.s, p0, \2, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m255_u32_z, svuint32_t, ++ z0 = svdup_n_u32_z (p0, -255), ++ z0 = svdup_u32_z (p0, -255)) ++ ++/* ++** dup_m256_u32_z: ++** mov z0\.s, p0/z, #-256 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m256_u32_z, svuint32_t, ++ z0 = svdup_n_u32_z (p0, -256), ++ z0 = svdup_u32_z (p0, -256)) ++ ++/* ++** dup_m257_u32_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.s), #-257 ++** sel z0\.s, p0, \2, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m257_u32_z, svuint32_t, ++ z0 = svdup_n_u32_z (p0, -257), ++ z0 = svdup_u32_z (p0, -257)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m258_u32_z, svuint32_t, ++ z0 = svdup_n_u32_z (p0, -258), ++ z0 = svdup_u32_z (p0, -258)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m259_u32_z, svuint32_t, ++ z0 = svdup_n_u32_z (p0, -259), ++ z0 = svdup_u32_z (p0, -259)) ++ ++/* ++** dup_m512_u32_z: ++** mov z0\.s, p0/z, #-512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m512_u32_z, svuint32_t, ++ z0 = svdup_n_u32_z (p0, -512), ++ z0 = svdup_u32_z (p0, -512)) ++ ++/* ++** dup_m7f00_u32_z: ++** mov z0\.s, p0/z, #-32512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f00_u32_z, svuint32_t, ++ z0 = svdup_n_u32_z (p0, -0x7f00), ++ z0 = svdup_u32_z (p0, -0x7f00)) ++ ++/* ++** dup_m7f01_u32_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.s), #-32513 ++** sel z0\.s, p0, \2, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f01_u32_z, svuint32_t, ++ z0 = svdup_n_u32_z (p0, -0x7f01), ++ z0 = svdup_u32_z (p0, -0x7f01)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m7f02_u32_z, svuint32_t, ++ z0 = svdup_n_u32_z (p0, -0x7f02), ++ z0 = svdup_u32_z (p0, -0x7f02)) ++ ++/* TODO: Bad code and needs fixing. 
*/ ++TEST_UNIFORM_Z (dup_m7ffe_u32_z, svuint32_t, ++ z0 = svdup_n_u32_z (p0, -0x7ffe), ++ z0 = svdup_u32_z (p0, -0x7ffe)) ++ ++/* ++** dup_m7fff_u32_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.s), #-32767 ++** sel z0\.s, p0, \2, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7fff_u32_z, svuint32_t, ++ z0 = svdup_n_u32_z (p0, -0x7fff), ++ z0 = svdup_u32_z (p0, -0x7fff)) ++ ++/* ++** dup_m8000_u32_z: ++** mov z0\.s, p0/z, #-32768 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m8000_u32_z, svuint32_t, ++ z0 = svdup_n_u32_z (p0, -0x8000), ++ z0 = svdup_u32_z (p0, -0x8000)) ++ ++/* ++** dup_0_u32_z: ++** mov z0\.s, p0/z, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_0_u32_z, svuint32_t, ++ z0 = svdup_n_u32_z (p0, 0), ++ z0 = svdup_u32_z (p0, 0)) ++ ++/* ++** dup_w0_u32_z: ++** movprfx z0\.s, p0/z, z0\.s ++** mov z0\.s, p0/m, w0 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_w0_u32_z, svuint32_t, uint32_t, ++ z0 = svdup_n_u32_z (p0, x0), ++ z0 = svdup_u32_z (p0, x0)) ++ ++/* ++** dup_1_u32_x: ++** mov z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_1_u32_x, svuint32_t, ++ z0 = svdup_n_u32_x (p0, 1), ++ z0 = svdup_u32_x (p0, 1)) ++ ++/* ++** dup_127_u32_x: ++** mov z0\.s, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_127_u32_x, svuint32_t, ++ z0 = svdup_n_u32_x (p0, 127), ++ z0 = svdup_u32_x (p0, 127)) ++ ++/* ++** dup_128_u32_x: ++** mov z0\.s, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_128_u32_x, svuint32_t, ++ z0 = svdup_n_u32_x (p0, 128), ++ z0 = svdup_u32_x (p0, 128)) ++ ++/* ++** dup_129_u32_x: ++** movi v([0-9]+)\.4s, 0x81 ++** dup z0\.q, z\1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_129_u32_x, svuint32_t, ++ z0 = svdup_n_u32_x (p0, 129), ++ z0 = svdup_u32_x (p0, 129)) ++ ++/* ++** dup_253_u32_x: ++** movi v([0-9]+)\.4s, 0xfd ++** dup z0\.q, z\1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_253_u32_x, svuint32_t, ++ z0 = svdup_n_u32_x (p0, 253), ++ z0 = svdup_u32_x (p0, 253)) ++ ++/* ++** dup_254_u32_x: ++** mov z0\.s, #254 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_254_u32_x, svuint32_t, ++ z0 = svdup_n_u32_x (p0, 254), ++ z0 = svdup_u32_x (p0, 254)) ++ ++/* ++** dup_255_u32_x: ++** mov z0\.s, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_255_u32_x, svuint32_t, ++ z0 = svdup_n_u32_x (p0, 255), ++ z0 = svdup_u32_x (p0, 255)) ++ ++/* ++** dup_256_u32_x: ++** mov z0\.s, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_256_u32_x, svuint32_t, ++ z0 = svdup_n_u32_x (p0, 256), ++ z0 = svdup_u32_x (p0, 256)) ++ ++/* ++** dup_257_u32_x: ++** mov (w[0-9]+), 257 ++** mov z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_257_u32_x, svuint32_t, ++ z0 = svdup_n_u32_x (p0, 257), ++ z0 = svdup_u32_x (p0, 257)) ++ ++/* ++** dup_512_u32_x: ++** mov z0\.s, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_512_u32_x, svuint32_t, ++ z0 = svdup_n_u32_x (p0, 512), ++ z0 = svdup_u32_x (p0, 512)) ++ ++/* ++** dup_7f00_u32_x: ++** mov z0\.s, #32512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7f00_u32_x, svuint32_t, ++ z0 = svdup_n_u32_x (p0, 0x7f00), ++ z0 = svdup_u32_x (p0, 0x7f00)) ++ ++/* ++** dup_7f01_u32_x: ++** mov (w[0-9]+), 32513 ++** mov z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7f01_u32_x, svuint32_t, ++ z0 = svdup_n_u32_x (p0, 0x7f01), ++ z0 = svdup_u32_x (p0, 0x7f01)) ++ ++/* ++** dup_7ffd_u32_x: ++** mov (w[0-9]+), 32765 ++** mov z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7ffd_u32_x, svuint32_t, ++ z0 = svdup_n_u32_x (p0, 0x7ffd), ++ z0 = svdup_u32_x (p0, 0x7ffd)) ++ ++/* ++** dup_7ffe_u32_x: ++** mov z0\.s, #32766 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7ffe_u32_x, svuint32_t, ++ z0 = svdup_n_u32_x (p0, 0x7ffe), ++ z0 = svdup_u32_x (p0, 0x7ffe)) ++ ++/* ++** dup_7fff_u32_x: 
++** mov z0\.s, #32767 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7fff_u32_x, svuint32_t, ++ z0 = svdup_n_u32_x (p0, 0x7fff), ++ z0 = svdup_u32_x (p0, 0x7fff)) ++ ++/* ++** dup_m1_u32_x: ++** mov z0\.b, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m1_u32_x, svuint32_t, ++ z0 = svdup_n_u32_x (p0, -1), ++ z0 = svdup_u32_x (p0, -1)) ++ ++/* ++** dup_m128_u32_x: ++** mov z0\.s, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m128_u32_x, svuint32_t, ++ z0 = svdup_n_u32_x (p0, -128), ++ z0 = svdup_u32_x (p0, -128)) ++ ++/* ++** dup_m129_u32_x: ++** mov z0\.s, #-129 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m129_u32_x, svuint32_t, ++ z0 = svdup_n_u32_x (p0, -129), ++ z0 = svdup_u32_x (p0, -129)) ++ ++/* ++** dup_m130_u32_x: ++** mvni v([0-9]+)\.4s, 0x81 ++** dup z0\.q, z\1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m130_u32_x, svuint32_t, ++ z0 = svdup_n_u32_x (p0, -130), ++ z0 = svdup_u32_x (p0, -130)) ++ ++/* ++** dup_m254_u32_x: ++** mvni v([0-9]+)\.4s, 0xfd ++** dup z0\.q, z\1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m254_u32_x, svuint32_t, ++ z0 = svdup_n_u32_x (p0, -254), ++ z0 = svdup_u32_x (p0, -254)) ++ ++/* ++** dup_m255_u32_x: ++** mov z0\.s, #-255 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m255_u32_x, svuint32_t, ++ z0 = svdup_n_u32_x (p0, -255), ++ z0 = svdup_u32_x (p0, -255)) ++ ++/* ++** dup_m256_u32_x: ++** mov z0\.s, #-256 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m256_u32_x, svuint32_t, ++ z0 = svdup_n_u32_x (p0, -256), ++ z0 = svdup_u32_x (p0, -256)) ++ ++/* ++** dup_m257_u32_x: ++** mov z0\.s, #-257 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m257_u32_x, svuint32_t, ++ z0 = svdup_n_u32_x (p0, -257), ++ z0 = svdup_u32_x (p0, -257)) ++ ++/* ++** dup_m258_u32_x: ++** mov (w[0-9]+), -258 ++** mov z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m258_u32_x, svuint32_t, ++ z0 = svdup_n_u32_x (p0, -258), ++ z0 = svdup_u32_x (p0, -258)) ++ ++/* ++** dup_m259_u32_x: ++** mov (w[0-9]+), -259 ++** mov z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m259_u32_x, svuint32_t, ++ z0 = svdup_n_u32_x (p0, -259), ++ z0 = svdup_u32_x (p0, -259)) ++ ++/* ++** dup_m512_u32_x: ++** mov z0\.s, #-512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m512_u32_x, svuint32_t, ++ z0 = svdup_n_u32_x (p0, -512), ++ z0 = svdup_u32_x (p0, -512)) ++ ++/* ++** dup_m7f00_u32_x: ++** mov z0\.s, #-32512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f00_u32_x, svuint32_t, ++ z0 = svdup_n_u32_x (p0, -0x7f00), ++ z0 = svdup_u32_x (p0, -0x7f00)) ++ ++/* ++** dup_m7f01_u32_x: ++** mov z0\.s, #-32513 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f01_u32_x, svuint32_t, ++ z0 = svdup_n_u32_x (p0, -0x7f01), ++ z0 = svdup_u32_x (p0, -0x7f01)) ++ ++/* ++** dup_m7f02_u32_x: ++** mov (w[0-9]+), -32514 ++** mov z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f02_u32_x, svuint32_t, ++ z0 = svdup_n_u32_x (p0, -0x7f02), ++ z0 = svdup_u32_x (p0, -0x7f02)) ++ ++/* ++** dup_m7ffe_u32_x: ++** mov (w[0-9]+), -32766 ++** mov z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7ffe_u32_x, svuint32_t, ++ z0 = svdup_n_u32_x (p0, -0x7ffe), ++ z0 = svdup_u32_x (p0, -0x7ffe)) ++ ++/* ++** dup_m7fff_u32_x: ++** mov z0\.s, #-32767 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7fff_u32_x, svuint32_t, ++ z0 = svdup_n_u32_x (p0, -0x7fff), ++ z0 = svdup_u32_x (p0, -0x7fff)) ++ ++/* ++** dup_m8000_u32_x: ++** mov z0\.s, #-32768 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m8000_u32_x, svuint32_t, ++ z0 = svdup_n_u32_x (p0, -0x8000), ++ z0 = svdup_u32_x (p0, -0x8000)) ++ ++/* ++** dup_w0_u32_x: ++** mov z0\.s, w0 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_w0_u32_x, svuint32_t, uint32_t, ++ z0 = svdup_n_u32_x (p0, x0), ++ z0 = svdup_u32_x (p0, x0)) +diff 
--git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_u64.c +new file mode 100644 +index 000000000..a7cca7af0 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_u64.c +@@ -0,0 +1,1175 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dup_1_u64: ++** mov z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_1_u64, svuint64_t, ++ z0 = svdup_n_u64 (1), ++ z0 = svdup_u64 (1)) ++ ++/* ++** dup_127_u64: ++** mov z0\.d, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_127_u64, svuint64_t, ++ z0 = svdup_n_u64 (127), ++ z0 = svdup_u64 (127)) ++ ++/* ++** dup_128_u64: ++** mov z0\.d, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_128_u64, svuint64_t, ++ z0 = svdup_n_u64 (128), ++ z0 = svdup_u64 (128)) ++ ++/* ++** dup_129_u64: ++** mov (x[0-9]+), 129 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_129_u64, svuint64_t, ++ z0 = svdup_n_u64 (129), ++ z0 = svdup_u64 (129)) ++ ++/* ++** dup_253_u64: ++** mov (x[0-9]+), 253 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_253_u64, svuint64_t, ++ z0 = svdup_n_u64 (253), ++ z0 = svdup_u64 (253)) ++ ++/* ++** dup_254_u64: ++** mov z0\.d, #254 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_254_u64, svuint64_t, ++ z0 = svdup_n_u64 (254), ++ z0 = svdup_u64 (254)) ++ ++/* ++** dup_255_u64: ++** mov z0\.d, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_255_u64, svuint64_t, ++ z0 = svdup_n_u64 (255), ++ z0 = svdup_u64 (255)) ++ ++/* ++** dup_256_u64: ++** mov z0\.d, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_256_u64, svuint64_t, ++ z0 = svdup_n_u64 (256), ++ z0 = svdup_u64 (256)) ++ ++/* ++** dup_257_u64: ++** mov (x[0-9]+), 257 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_257_u64, svuint64_t, ++ z0 = svdup_n_u64 (257), ++ z0 = svdup_u64 (257)) ++ ++/* ++** dup_512_u64: ++** mov z0\.d, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_512_u64, svuint64_t, ++ z0 = svdup_n_u64 (512), ++ z0 = svdup_u64 (512)) ++ ++/* ++** dup_7f00_u64: ++** mov z0\.d, #32512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7f00_u64, svuint64_t, ++ z0 = svdup_n_u64 (0x7f00), ++ z0 = svdup_u64 (0x7f00)) ++ ++/* ++** dup_7f01_u64: ++** mov (x[0-9]+), 32513 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7f01_u64, svuint64_t, ++ z0 = svdup_n_u64 (0x7f01), ++ z0 = svdup_u64 (0x7f01)) ++ ++/* ++** dup_7ffd_u64: ++** mov (x[0-9]+), 32765 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7ffd_u64, svuint64_t, ++ z0 = svdup_n_u64 (0x7ffd), ++ z0 = svdup_u64 (0x7ffd)) ++ ++/* ++** dup_7ffe_u64: ++** mov z0\.d, #32766 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7ffe_u64, svuint64_t, ++ z0 = svdup_n_u64 (0x7ffe), ++ z0 = svdup_u64 (0x7ffe)) ++ ++/* ++** dup_7fff_u64: ++** mov z0\.d, #32767 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7fff_u64, svuint64_t, ++ z0 = svdup_n_u64 (0x7fff), ++ z0 = svdup_u64 (0x7fff)) ++ ++/* ++** dup_m1_u64: ++** mov z0\.b, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m1_u64, svuint64_t, ++ z0 = svdup_n_u64 (-1), ++ z0 = svdup_u64 (-1)) ++ ++/* ++** dup_m128_u64: ++** mov z0\.d, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m128_u64, svuint64_t, ++ z0 = svdup_n_u64 (-128), ++ z0 = svdup_u64 (-128)) ++ ++/* ++** dup_m129_u64: ++** mov z0\.d, #-129 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m129_u64, svuint64_t, ++ z0 = svdup_n_u64 (-129), ++ z0 = svdup_u64 (-129)) ++ ++/* ++** dup_m130_u64: ++** mov (x[0-9]+), -130 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m130_u64, svuint64_t, ++ z0 = svdup_n_u64 (-130), ++ z0 = svdup_u64 (-130)) ++ ++/* ++** dup_m254_u64: ++** mov 
(x[0-9]+), -254 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m254_u64, svuint64_t, ++ z0 = svdup_n_u64 (-254), ++ z0 = svdup_u64 (-254)) ++ ++/* ++** dup_m255_u64: ++** mov z0\.d, #-255 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m255_u64, svuint64_t, ++ z0 = svdup_n_u64 (-255), ++ z0 = svdup_u64 (-255)) ++ ++/* ++** dup_m256_u64: ++** mov z0\.d, #-256 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m256_u64, svuint64_t, ++ z0 = svdup_n_u64 (-256), ++ z0 = svdup_u64 (-256)) ++ ++/* ++** dup_m257_u64: ++** mov z0\.d, #-257 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m257_u64, svuint64_t, ++ z0 = svdup_n_u64 (-257), ++ z0 = svdup_u64 (-257)) ++ ++/* ++** dup_m258_u64: ++** mov (x[0-9]+), -258 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m258_u64, svuint64_t, ++ z0 = svdup_n_u64 (-258), ++ z0 = svdup_u64 (-258)) ++ ++/* ++** dup_m259_u64: ++** mov (x[0-9]+), -259 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m259_u64, svuint64_t, ++ z0 = svdup_n_u64 (-259), ++ z0 = svdup_u64 (-259)) ++ ++/* ++** dup_m512_u64: ++** mov z0\.d, #-512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m512_u64, svuint64_t, ++ z0 = svdup_n_u64 (-512), ++ z0 = svdup_u64 (-512)) ++ ++/* ++** dup_m7f00_u64: ++** mov z0\.d, #-32512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f00_u64, svuint64_t, ++ z0 = svdup_n_u64 (-0x7f00), ++ z0 = svdup_u64 (-0x7f00)) ++ ++/* ++** dup_m7f01_u64: ++** mov z0\.d, #-32513 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f01_u64, svuint64_t, ++ z0 = svdup_n_u64 (-0x7f01), ++ z0 = svdup_u64 (-0x7f01)) ++ ++/* ++** dup_m7f02_u64: ++** mov (x[0-9]+), -32514 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f02_u64, svuint64_t, ++ z0 = svdup_n_u64 (-0x7f02), ++ z0 = svdup_u64 (-0x7f02)) ++ ++/* ++** dup_m7ffe_u64: ++** mov (x[0-9]+), -32766 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7ffe_u64, svuint64_t, ++ z0 = svdup_n_u64 (-0x7ffe), ++ z0 = svdup_u64 (-0x7ffe)) ++ ++/* ++** dup_m7fff_u64: ++** mov z0\.d, #-32767 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7fff_u64, svuint64_t, ++ z0 = svdup_n_u64 (-0x7fff), ++ z0 = svdup_u64 (-0x7fff)) ++ ++/* ++** dup_m8000_u64: ++** mov z0\.d, #-32768 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m8000_u64, svuint64_t, ++ z0 = svdup_n_u64 (-0x8000), ++ z0 = svdup_u64 (-0x8000)) ++ ++/* ++** dup_x0_u64: ++** mov z0\.d, x0 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_x0_u64, svuint64_t, uint64_t, ++ z0 = svdup_n_u64 (x0), ++ z0 = svdup_u64 (x0)) ++ ++/* ++** dup_1_u64_m: ++** mov z0\.d, p0/m, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_1_u64_m, svuint64_t, ++ z0 = svdup_n_u64_m (z0, p0, 1), ++ z0 = svdup_u64_m (z0, p0, 1)) ++ ++/* ++** dup_127_u64_m: ++** mov z0\.d, p0/m, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_127_u64_m, svuint64_t, ++ z0 = svdup_n_u64_m (z0, p0, 127), ++ z0 = svdup_u64_m (z0, p0, 127)) ++ ++/* ++** dup_128_u64_m: ++** mov (z[0-9]+\.d), #128 ++** sel z0\.d, p0, \1, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (dup_128_u64_m, svuint64_t, ++ z0 = svdup_n_u64_m (z0, p0, 128), ++ z0 = svdup_u64_m (z0, p0, 128)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_129_u64_m, svuint64_t, ++ z0 = svdup_n_u64_m (z0, p0, 129), ++ z0 = svdup_u64_m (z0, p0, 129)) ++ ++/* TODO: Bad code and needs fixing. 
*/ ++TEST_UNIFORM_Z (dup_253_u64_m, svuint64_t, ++ z0 = svdup_n_u64_m (z0, p0, 253), ++ z0 = svdup_u64_m (z0, p0, 253)) ++ ++/* ++** dup_254_u64_m: ++** mov (z[0-9]+\.d), #254 ++** sel z0\.d, p0, \1, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (dup_254_u64_m, svuint64_t, ++ z0 = svdup_n_u64_m (z0, p0, 254), ++ z0 = svdup_u64_m (z0, p0, 254)) ++ ++/* ++** dup_255_u64_m: ++** mov (z[0-9]+\.d), #255 ++** sel z0\.d, p0, \1, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (dup_255_u64_m, svuint64_t, ++ z0 = svdup_n_u64_m (z0, p0, 255), ++ z0 = svdup_u64_m (z0, p0, 255)) ++ ++/* ++** dup_256_u64_m: ++** mov z0\.d, p0/m, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_256_u64_m, svuint64_t, ++ z0 = svdup_n_u64_m (z0, p0, 256), ++ z0 = svdup_u64_m (z0, p0, 256)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_257_u64_m, svuint64_t, ++ z0 = svdup_n_u64_m (z0, p0, 257), ++ z0 = svdup_u64_m (z0, p0, 257)) ++ ++/* ++** dup_512_u64_m: ++** mov z0\.d, p0/m, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_512_u64_m, svuint64_t, ++ z0 = svdup_n_u64_m (z0, p0, 512), ++ z0 = svdup_u64_m (z0, p0, 512)) ++ ++/* ++** dup_7f00_u64_m: ++** mov z0\.d, p0/m, #32512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7f00_u64_m, svuint64_t, ++ z0 = svdup_n_u64_m (z0, p0, 0x7f00), ++ z0 = svdup_u64_m (z0, p0, 0x7f00)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_7f01_u64_m, svuint64_t, ++ z0 = svdup_n_u64_m (z0, p0, 0x7f01), ++ z0 = svdup_u64_m (z0, p0, 0x7f01)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_7ffd_u64_m, svuint64_t, ++ z0 = svdup_n_u64_m (z0, p0, 0x7ffd), ++ z0 = svdup_u64_m (z0, p0, 0x7ffd)) ++ ++/* ++** dup_7ffe_u64_m: ++** mov (z[0-9]+\.d), #32766 ++** sel z0\.d, p0, \1, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7ffe_u64_m, svuint64_t, ++ z0 = svdup_n_u64_m (z0, p0, 0x7ffe), ++ z0 = svdup_u64_m (z0, p0, 0x7ffe)) ++ ++/* ++** dup_7fff_u64_m: ++** mov (z[0-9]+\.d), #32767 ++** sel z0\.d, p0, \1, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7fff_u64_m, svuint64_t, ++ z0 = svdup_n_u64_m (z0, p0, 0x7fff), ++ z0 = svdup_u64_m (z0, p0, 0x7fff)) ++ ++/* ++** dup_m1_u64_m: ++** mov z0\.d, p0/m, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m1_u64_m, svuint64_t, ++ z0 = svdup_n_u64_m (z0, p0, -1), ++ z0 = svdup_u64_m (z0, p0, -1)) ++ ++/* ++** dup_m128_u64_m: ++** mov z0\.d, p0/m, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m128_u64_m, svuint64_t, ++ z0 = svdup_n_u64_m (z0, p0, -128), ++ z0 = svdup_u64_m (z0, p0, -128)) ++ ++/* ++** dup_m129_u64_m: ++** mov (z[0-9]+\.d), #-129 ++** sel z0\.d, p0, \1, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m129_u64_m, svuint64_t, ++ z0 = svdup_n_u64_m (z0, p0, -129), ++ z0 = svdup_u64_m (z0, p0, -129)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m130_u64_m, svuint64_t, ++ z0 = svdup_n_u64_m (z0, p0, -130), ++ z0 = svdup_u64_m (z0, p0, -130)) ++ ++/* TODO: Bad code and needs fixing. 
*/ ++TEST_UNIFORM_Z (dup_m254_u64_m, svuint64_t, ++ z0 = svdup_n_u64_m (z0, p0, -254), ++ z0 = svdup_u64_m (z0, p0, -254)) ++ ++/* ++** dup_m255_u64_m: ++** mov (z[0-9]+\.d), #-255 ++** sel z0\.d, p0, \1, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m255_u64_m, svuint64_t, ++ z0 = svdup_n_u64_m (z0, p0, -255), ++ z0 = svdup_u64_m (z0, p0, -255)) ++ ++/* ++** dup_m256_u64_m: ++** mov z0\.d, p0/m, #-256 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m256_u64_m, svuint64_t, ++ z0 = svdup_n_u64_m (z0, p0, -256), ++ z0 = svdup_u64_m (z0, p0, -256)) ++ ++/* ++** dup_m257_u64_m: ++** mov (z[0-9]+\.d), #-257 ++** sel z0\.d, p0, \1, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m257_u64_m, svuint64_t, ++ z0 = svdup_n_u64_m (z0, p0, -257), ++ z0 = svdup_u64_m (z0, p0, -257)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m258_u64_m, svuint64_t, ++ z0 = svdup_n_u64_m (z0, p0, -258), ++ z0 = svdup_u64_m (z0, p0, -258)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m259_u64_m, svuint64_t, ++ z0 = svdup_n_u64_m (z0, p0, -259), ++ z0 = svdup_u64_m (z0, p0, -259)) ++ ++/* ++** dup_m512_u64_m: ++** mov z0\.d, p0/m, #-512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m512_u64_m, svuint64_t, ++ z0 = svdup_n_u64_m (z0, p0, -512), ++ z0 = svdup_u64_m (z0, p0, -512)) ++ ++/* ++** dup_m7f00_u64_m: ++** mov z0\.d, p0/m, #-32512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f00_u64_m, svuint64_t, ++ z0 = svdup_n_u64_m (z0, p0, -0x7f00), ++ z0 = svdup_u64_m (z0, p0, -0x7f00)) ++ ++/* ++** dup_m7f01_u64_m: ++** mov (z[0-9]+\.d), #-32513 ++** sel z0\.d, p0, \1, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f01_u64_m, svuint64_t, ++ z0 = svdup_n_u64_m (z0, p0, -0x7f01), ++ z0 = svdup_u64_m (z0, p0, -0x7f01)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m7f02_u64_m, svuint64_t, ++ z0 = svdup_n_u64_m (z0, p0, -0x7f02), ++ z0 = svdup_u64_m (z0, p0, -0x7f02)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m7ffe_u64_m, svuint64_t, ++ z0 = svdup_n_u64_m (z0, p0, -0x7ffe), ++ z0 = svdup_u64_m (z0, p0, -0x7ffe)) ++ ++/* ++** dup_m7fff_u64_m: ++** mov (z[0-9]+\.d), #-32767 ++** sel z0\.d, p0, \1, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7fff_u64_m, svuint64_t, ++ z0 = svdup_n_u64_m (z0, p0, -0x7fff), ++ z0 = svdup_u64_m (z0, p0, -0x7fff)) ++ ++/* ++** dup_m8000_u64_m: ++** mov z0\.d, p0/m, #-32768 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m8000_u64_m, svuint64_t, ++ z0 = svdup_n_u64_m (z0, p0, -0x8000), ++ z0 = svdup_u64_m (z0, p0, -0x8000)) ++ ++/* ++** dup_0_u64_m: ++** mov z0\.d, p0/m, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_0_u64_m, svuint64_t, ++ z0 = svdup_n_u64_m (z0, p0, 0), ++ z0 = svdup_u64_m (z0, p0, 0)) ++ ++/* ++** dup_x0_u64_m: ++** movprfx z0, z1 ++** mov z0\.d, p0/m, x0 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_x0_u64_m, svuint64_t, uint64_t, ++ z0 = svdup_n_u64_m (z1, p0, x0), ++ z0 = svdup_u64_m (z1, p0, x0)) ++ ++/* ++** dup_1_u64_z: ++** mov z0\.d, p0/z, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_1_u64_z, svuint64_t, ++ z0 = svdup_n_u64_z (p0, 1), ++ z0 = svdup_u64_z (p0, 1)) ++ ++/* ++** dup_127_u64_z: ++** mov z0\.d, p0/z, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_127_u64_z, svuint64_t, ++ z0 = svdup_n_u64_z (p0, 127), ++ z0 = svdup_u64_z (p0, 127)) ++ ++/* ++** dup_128_u64_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.d), #128 ++** sel z0\.d, p0, \2, \1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (dup_128_u64_z, svuint64_t, ++ z0 = svdup_n_u64_z (p0, 128), ++ z0 = svdup_u64_z (p0, 128)) ++ ++/* TODO: Bad code and needs fixing. 
*/ ++TEST_UNIFORM_Z (dup_129_u64_z, svuint64_t, ++ z0 = svdup_n_u64_z (p0, 129), ++ z0 = svdup_u64_z (p0, 129)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_253_u64_z, svuint64_t, ++ z0 = svdup_n_u64_z (p0, 253), ++ z0 = svdup_u64_z (p0, 253)) ++ ++/* ++** dup_254_u64_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.d), #254 ++** sel z0\.d, p0, \2, \1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (dup_254_u64_z, svuint64_t, ++ z0 = svdup_n_u64_z (p0, 254), ++ z0 = svdup_u64_z (p0, 254)) ++ ++/* ++** dup_255_u64_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.d), #255 ++** sel z0\.d, p0, \2, \1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (dup_255_u64_z, svuint64_t, ++ z0 = svdup_n_u64_z (p0, 255), ++ z0 = svdup_u64_z (p0, 255)) ++ ++/* ++** dup_256_u64_z: ++** mov z0\.d, p0/z, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_256_u64_z, svuint64_t, ++ z0 = svdup_n_u64_z (p0, 256), ++ z0 = svdup_u64_z (p0, 256)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_257_u64_z, svuint64_t, ++ z0 = svdup_n_u64_z (p0, 257), ++ z0 = svdup_u64_z (p0, 257)) ++ ++/* ++** dup_512_u64_z: ++** mov z0\.d, p0/z, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_512_u64_z, svuint64_t, ++ z0 = svdup_n_u64_z (p0, 512), ++ z0 = svdup_u64_z (p0, 512)) ++ ++/* ++** dup_7f00_u64_z: ++** mov z0\.d, p0/z, #32512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7f00_u64_z, svuint64_t, ++ z0 = svdup_n_u64_z (p0, 0x7f00), ++ z0 = svdup_u64_z (p0, 0x7f00)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_7f01_u64_z, svuint64_t, ++ z0 = svdup_n_u64_z (p0, 0x7f01), ++ z0 = svdup_u64_z (p0, 0x7f01)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_7ffd_u64_z, svuint64_t, ++ z0 = svdup_n_u64_z (p0, 0x7ffd), ++ z0 = svdup_u64_z (p0, 0x7ffd)) ++ ++/* ++** dup_7ffe_u64_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.d), #32766 ++** sel z0\.d, p0, \2, \1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7ffe_u64_z, svuint64_t, ++ z0 = svdup_n_u64_z (p0, 0x7ffe), ++ z0 = svdup_u64_z (p0, 0x7ffe)) ++ ++/* ++** dup_7fff_u64_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.d), #32767 ++** sel z0\.d, p0, \2, \1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7fff_u64_z, svuint64_t, ++ z0 = svdup_n_u64_z (p0, 0x7fff), ++ z0 = svdup_u64_z (p0, 0x7fff)) ++ ++/* ++** dup_m1_u64_z: ++** mov z0\.d, p0/z, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m1_u64_z, svuint64_t, ++ z0 = svdup_n_u64_z (p0, -1), ++ z0 = svdup_u64_z (p0, -1)) ++ ++/* ++** dup_m128_u64_z: ++** mov z0\.d, p0/z, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m128_u64_z, svuint64_t, ++ z0 = svdup_n_u64_z (p0, -128), ++ z0 = svdup_u64_z (p0, -128)) ++ ++/* ++** dup_m129_u64_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.d), #-129 ++** sel z0\.d, p0, \2, \1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m129_u64_z, svuint64_t, ++ z0 = svdup_n_u64_z (p0, -129), ++ z0 = svdup_u64_z (p0, -129)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m130_u64_z, svuint64_t, ++ z0 = svdup_n_u64_z (p0, -130), ++ z0 = svdup_u64_z (p0, -130)) ++ ++/* TODO: Bad code and needs fixing. 
*/ ++TEST_UNIFORM_Z (dup_m254_u64_z, svuint64_t, ++ z0 = svdup_n_u64_z (p0, -254), ++ z0 = svdup_u64_z (p0, -254)) ++ ++/* ++** dup_m255_u64_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.d), #-255 ++** sel z0\.d, p0, \2, \1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m255_u64_z, svuint64_t, ++ z0 = svdup_n_u64_z (p0, -255), ++ z0 = svdup_u64_z (p0, -255)) ++ ++/* ++** dup_m256_u64_z: ++** mov z0\.d, p0/z, #-256 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m256_u64_z, svuint64_t, ++ z0 = svdup_n_u64_z (p0, -256), ++ z0 = svdup_u64_z (p0, -256)) ++ ++/* ++** dup_m257_u64_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.d), #-257 ++** sel z0\.d, p0, \2, \1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m257_u64_z, svuint64_t, ++ z0 = svdup_n_u64_z (p0, -257), ++ z0 = svdup_u64_z (p0, -257)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m258_u64_z, svuint64_t, ++ z0 = svdup_n_u64_z (p0, -258), ++ z0 = svdup_u64_z (p0, -258)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m259_u64_z, svuint64_t, ++ z0 = svdup_n_u64_z (p0, -259), ++ z0 = svdup_u64_z (p0, -259)) ++ ++/* ++** dup_m512_u64_z: ++** mov z0\.d, p0/z, #-512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m512_u64_z, svuint64_t, ++ z0 = svdup_n_u64_z (p0, -512), ++ z0 = svdup_u64_z (p0, -512)) ++ ++/* ++** dup_m7f00_u64_z: ++** mov z0\.d, p0/z, #-32512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f00_u64_z, svuint64_t, ++ z0 = svdup_n_u64_z (p0, -0x7f00), ++ z0 = svdup_u64_z (p0, -0x7f00)) ++ ++/* ++** dup_m7f01_u64_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.d), #-32513 ++** sel z0\.d, p0, \2, \1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f01_u64_z, svuint64_t, ++ z0 = svdup_n_u64_z (p0, -0x7f01), ++ z0 = svdup_u64_z (p0, -0x7f01)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (dup_m7f02_u64_z, svuint64_t, ++ z0 = svdup_n_u64_z (p0, -0x7f02), ++ z0 = svdup_u64_z (p0, -0x7f02)) ++ ++/* TODO: Bad code and needs fixing. 
*/ ++TEST_UNIFORM_Z (dup_m7ffe_u64_z, svuint64_t, ++ z0 = svdup_n_u64_z (p0, -0x7ffe), ++ z0 = svdup_u64_z (p0, -0x7ffe)) ++ ++/* ++** dup_m7fff_u64_z: ++** mov (z[0-9]+)\.b, #0 ++** mov (z[0-9]+\.d), #-32767 ++** sel z0\.d, p0, \2, \1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7fff_u64_z, svuint64_t, ++ z0 = svdup_n_u64_z (p0, -0x7fff), ++ z0 = svdup_u64_z (p0, -0x7fff)) ++ ++/* ++** dup_m8000_u64_z: ++** mov z0\.d, p0/z, #-32768 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m8000_u64_z, svuint64_t, ++ z0 = svdup_n_u64_z (p0, -0x8000), ++ z0 = svdup_u64_z (p0, -0x8000)) ++ ++/* ++** dup_0_u64_z: ++** mov z0\.d, p0/z, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_0_u64_z, svuint64_t, ++ z0 = svdup_n_u64_z (p0, 0), ++ z0 = svdup_u64_z (p0, 0)) ++ ++/* ++** dup_x0_u64_z: ++** movprfx z0\.d, p0/z, z0\.d ++** mov z0\.d, p0/m, x0 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_x0_u64_z, svuint64_t, uint64_t, ++ z0 = svdup_n_u64_z (p0, x0), ++ z0 = svdup_u64_z (p0, x0)) ++ ++/* ++** dup_1_u64_x: ++** mov z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_1_u64_x, svuint64_t, ++ z0 = svdup_n_u64_x (p0, 1), ++ z0 = svdup_u64_x (p0, 1)) ++ ++/* ++** dup_127_u64_x: ++** mov z0\.d, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_127_u64_x, svuint64_t, ++ z0 = svdup_n_u64_x (p0, 127), ++ z0 = svdup_u64_x (p0, 127)) ++ ++/* ++** dup_128_u64_x: ++** mov z0\.d, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_128_u64_x, svuint64_t, ++ z0 = svdup_n_u64_x (p0, 128), ++ z0 = svdup_u64_x (p0, 128)) ++ ++/* ++** dup_129_u64_x: ++** mov (x[0-9]+), 129 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_129_u64_x, svuint64_t, ++ z0 = svdup_n_u64_x (p0, 129), ++ z0 = svdup_u64_x (p0, 129)) ++ ++/* ++** dup_253_u64_x: ++** mov (x[0-9]+), 253 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_253_u64_x, svuint64_t, ++ z0 = svdup_n_u64_x (p0, 253), ++ z0 = svdup_u64_x (p0, 253)) ++ ++/* ++** dup_254_u64_x: ++** mov z0\.d, #254 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_254_u64_x, svuint64_t, ++ z0 = svdup_n_u64_x (p0, 254), ++ z0 = svdup_u64_x (p0, 254)) ++ ++/* ++** dup_255_u64_x: ++** mov z0\.d, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_255_u64_x, svuint64_t, ++ z0 = svdup_n_u64_x (p0, 255), ++ z0 = svdup_u64_x (p0, 255)) ++ ++/* ++** dup_256_u64_x: ++** mov z0\.d, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_256_u64_x, svuint64_t, ++ z0 = svdup_n_u64_x (p0, 256), ++ z0 = svdup_u64_x (p0, 256)) ++ ++/* ++** dup_257_u64_x: ++** mov (x[0-9]+), 257 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_257_u64_x, svuint64_t, ++ z0 = svdup_n_u64_x (p0, 257), ++ z0 = svdup_u64_x (p0, 257)) ++ ++/* ++** dup_512_u64_x: ++** mov z0\.d, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_512_u64_x, svuint64_t, ++ z0 = svdup_n_u64_x (p0, 512), ++ z0 = svdup_u64_x (p0, 512)) ++ ++/* ++** dup_7f00_u64_x: ++** mov z0\.d, #32512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7f00_u64_x, svuint64_t, ++ z0 = svdup_n_u64_x (p0, 0x7f00), ++ z0 = svdup_u64_x (p0, 0x7f00)) ++ ++/* ++** dup_7f01_u64_x: ++** mov (x[0-9]+), 32513 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7f01_u64_x, svuint64_t, ++ z0 = svdup_n_u64_x (p0, 0x7f01), ++ z0 = svdup_u64_x (p0, 0x7f01)) ++ ++/* ++** dup_7ffd_u64_x: ++** mov (x[0-9]+), 32765 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7ffd_u64_x, svuint64_t, ++ z0 = svdup_n_u64_x (p0, 0x7ffd), ++ z0 = svdup_u64_x (p0, 0x7ffd)) ++ ++/* ++** dup_7ffe_u64_x: ++** mov z0\.d, #32766 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_7ffe_u64_x, svuint64_t, ++ z0 = svdup_n_u64_x (p0, 0x7ffe), ++ z0 = svdup_u64_x (p0, 0x7ffe)) ++ ++/* ++** dup_7fff_u64_x: ++** mov z0\.d, #32767 ++** 
ret ++*/ ++TEST_UNIFORM_Z (dup_7fff_u64_x, svuint64_t, ++ z0 = svdup_n_u64_x (p0, 0x7fff), ++ z0 = svdup_u64_x (p0, 0x7fff)) ++ ++/* ++** dup_m1_u64_x: ++** mov z0\.b, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m1_u64_x, svuint64_t, ++ z0 = svdup_n_u64_x (p0, -1), ++ z0 = svdup_u64_x (p0, -1)) ++ ++/* ++** dup_m128_u64_x: ++** mov z0\.d, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m128_u64_x, svuint64_t, ++ z0 = svdup_n_u64_x (p0, -128), ++ z0 = svdup_u64_x (p0, -128)) ++ ++/* ++** dup_m129_u64_x: ++** mov z0\.d, #-129 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m129_u64_x, svuint64_t, ++ z0 = svdup_n_u64_x (p0, -129), ++ z0 = svdup_u64_x (p0, -129)) ++ ++/* ++** dup_m130_u64_x: ++** mov (x[0-9]+), -130 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m130_u64_x, svuint64_t, ++ z0 = svdup_n_u64_x (p0, -130), ++ z0 = svdup_u64_x (p0, -130)) ++ ++/* ++** dup_m254_u64_x: ++** mov (x[0-9]+), -254 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m254_u64_x, svuint64_t, ++ z0 = svdup_n_u64_x (p0, -254), ++ z0 = svdup_u64_x (p0, -254)) ++ ++/* ++** dup_m255_u64_x: ++** mov z0\.d, #-255 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m255_u64_x, svuint64_t, ++ z0 = svdup_n_u64_x (p0, -255), ++ z0 = svdup_u64_x (p0, -255)) ++ ++/* ++** dup_m256_u64_x: ++** mov z0\.d, #-256 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m256_u64_x, svuint64_t, ++ z0 = svdup_n_u64_x (p0, -256), ++ z0 = svdup_u64_x (p0, -256)) ++ ++/* ++** dup_m257_u64_x: ++** mov z0\.d, #-257 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m257_u64_x, svuint64_t, ++ z0 = svdup_n_u64_x (p0, -257), ++ z0 = svdup_u64_x (p0, -257)) ++ ++/* ++** dup_m258_u64_x: ++** mov (x[0-9]+), -258 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m258_u64_x, svuint64_t, ++ z0 = svdup_n_u64_x (p0, -258), ++ z0 = svdup_u64_x (p0, -258)) ++ ++/* ++** dup_m259_u64_x: ++** mov (x[0-9]+), -259 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m259_u64_x, svuint64_t, ++ z0 = svdup_n_u64_x (p0, -259), ++ z0 = svdup_u64_x (p0, -259)) ++ ++/* ++** dup_m512_u64_x: ++** mov z0\.d, #-512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m512_u64_x, svuint64_t, ++ z0 = svdup_n_u64_x (p0, -512), ++ z0 = svdup_u64_x (p0, -512)) ++ ++/* ++** dup_m7f00_u64_x: ++** mov z0\.d, #-32512 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f00_u64_x, svuint64_t, ++ z0 = svdup_n_u64_x (p0, -0x7f00), ++ z0 = svdup_u64_x (p0, -0x7f00)) ++ ++/* ++** dup_m7f01_u64_x: ++** mov z0\.d, #-32513 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f01_u64_x, svuint64_t, ++ z0 = svdup_n_u64_x (p0, -0x7f01), ++ z0 = svdup_u64_x (p0, -0x7f01)) ++ ++/* ++** dup_m7f02_u64_x: ++** mov (x[0-9]+), -32514 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7f02_u64_x, svuint64_t, ++ z0 = svdup_n_u64_x (p0, -0x7f02), ++ z0 = svdup_u64_x (p0, -0x7f02)) ++ ++/* ++** dup_m7ffe_u64_x: ++** mov (x[0-9]+), -32766 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7ffe_u64_x, svuint64_t, ++ z0 = svdup_n_u64_x (p0, -0x7ffe), ++ z0 = svdup_u64_x (p0, -0x7ffe)) ++ ++/* ++** dup_m7fff_u64_x: ++** mov z0\.d, #-32767 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m7fff_u64_x, svuint64_t, ++ z0 = svdup_n_u64_x (p0, -0x7fff), ++ z0 = svdup_u64_x (p0, -0x7fff)) ++ ++/* ++** dup_m8000_u64_x: ++** mov z0\.d, #-32768 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m8000_u64_x, svuint64_t, ++ z0 = svdup_n_u64_x (p0, -0x8000), ++ z0 = svdup_u64_x (p0, -0x8000)) ++ ++/* ++** dup_x0_u64_x: ++** mov z0\.d, x0 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_x0_u64_x, svuint64_t, uint64_t, ++ z0 = svdup_n_u64_x (p0, x0), ++ z0 = svdup_u64_x (p0, x0)) +diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_u8.c +new file mode 100644 +index 000000000..d27f4bba9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_u8.c +@@ -0,0 +1,383 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dup_1_u8: ++** mov z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_1_u8, svuint8_t, ++ z0 = svdup_n_u8 (1), ++ z0 = svdup_u8 (1)) ++ ++/* ++** dup_127_u8: ++** mov z0\.b, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_127_u8, svuint8_t, ++ z0 = svdup_n_u8 (127), ++ z0 = svdup_u8 (127)) ++ ++/* ++** dup_128_u8: ++** mov z0\.b, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_128_u8, svuint8_t, ++ z0 = svdup_n_u8 (128), ++ z0 = svdup_u8 (128)) ++ ++/* ++** dup_129_u8: ++** mov z0\.b, #-127 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_129_u8, svuint8_t, ++ z0 = svdup_n_u8 (129), ++ z0 = svdup_u8 (129)) ++ ++/* ++** dup_253_u8: ++** mov z0\.b, #-3 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_253_u8, svuint8_t, ++ z0 = svdup_n_u8 (253), ++ z0 = svdup_u8 (253)) ++ ++/* ++** dup_254_u8: ++** mov z0\.b, #-2 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_254_u8, svuint8_t, ++ z0 = svdup_n_u8 (254), ++ z0 = svdup_u8 (254)) ++ ++/* ++** dup_255_u8: ++** mov z0\.b, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_255_u8, svuint8_t, ++ z0 = svdup_n_u8 (255), ++ z0 = svdup_u8 (255)) ++ ++/* ++** dup_m1_u8: ++** mov z0\.b, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m1_u8, svuint8_t, ++ z0 = svdup_n_u8 (-1), ++ z0 = svdup_u8 (-1)) ++ ++/* ++** dup_m128_u8: ++** mov z0\.b, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m128_u8, svuint8_t, ++ z0 = svdup_n_u8 (-128), ++ z0 = svdup_u8 (-128)) ++ ++/* ++** dup_w0_u8: ++** mov z0\.b, w0 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_w0_u8, svuint8_t, uint8_t, ++ z0 = svdup_n_u8 (x0), ++ z0 = svdup_u8 (x0)) ++ ++/* ++** dup_1_u8_m: ++** mov z0\.b, p0/m, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_1_u8_m, svuint8_t, ++ z0 = svdup_n_u8_m (z0, p0, 1), ++ z0 = svdup_u8_m (z0, p0, 1)) ++ ++/* ++** dup_127_u8_m: ++** mov z0\.b, p0/m, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_127_u8_m, svuint8_t, ++ z0 = svdup_n_u8_m (z0, p0, 127), ++ z0 = svdup_u8_m (z0, p0, 127)) ++ ++/* ++** dup_128_u8_m: ++** mov z0\.b, p0/m, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_128_u8_m, svuint8_t, ++ z0 = svdup_n_u8_m (z0, p0, 128), ++ z0 = svdup_u8_m (z0, p0, 128)) ++ ++/* ++** dup_129_u8_m: ++** mov z0\.b, p0/m, #-127 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_129_u8_m, svuint8_t, ++ z0 = svdup_n_u8_m (z0, p0, 129), ++ z0 = svdup_u8_m (z0, p0, 129)) ++ ++/* ++** dup_253_u8_m: ++** mov z0\.b, p0/m, #-3 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_253_u8_m, svuint8_t, ++ z0 = svdup_n_u8_m (z0, p0, 253), ++ z0 = svdup_u8_m (z0, p0, 253)) ++ ++/* ++** dup_254_u8_m: ++** mov z0\.b, p0/m, #-2 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_254_u8_m, svuint8_t, ++ z0 = svdup_n_u8_m (z0, p0, 254), ++ z0 = svdup_u8_m (z0, p0, 254)) ++ ++/* ++** dup_255_u8_m: ++** mov z0\.b, p0/m, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_255_u8_m, svuint8_t, ++ z0 = svdup_n_u8_m (z0, p0, 255), ++ z0 = svdup_u8_m (z0, p0, 255)) ++ ++/* ++** dup_m1_u8_m: ++** mov z0\.b, p0/m, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m1_u8_m, svuint8_t, ++ z0 = svdup_n_u8_m (z0, p0, -1), ++ z0 = svdup_u8_m (z0, p0, -1)) ++ ++/* ++** dup_m128_u8_m: ++** mov z0\.b, p0/m, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m128_u8_m, svuint8_t, ++ z0 = svdup_n_u8_m (z0, p0, -128), ++ z0 = svdup_u8_m (z0, p0, -128)) ++ ++/* ++** dup_0_u8_m: ++** mov z0\.b, p0/m, #0 ++** ret ++*/ 
++TEST_UNIFORM_Z (dup_0_u8_m, svuint8_t, ++ z0 = svdup_n_u8_m (z0, p0, 0), ++ z0 = svdup_u8_m (z0, p0, 0)) ++ ++/* ++** dup_w0_u8_m: ++** movprfx z0, z1 ++** mov z0\.b, p0/m, w0 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_w0_u8_m, svuint8_t, uint8_t, ++ z0 = svdup_n_u8_m (z1, p0, x0), ++ z0 = svdup_u8_m (z1, p0, x0)) ++ ++/* ++** dup_1_u8_z: ++** mov z0\.b, p0/z, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_1_u8_z, svuint8_t, ++ z0 = svdup_n_u8_z (p0, 1), ++ z0 = svdup_u8_z (p0, 1)) ++ ++/* ++** dup_127_u8_z: ++** mov z0\.b, p0/z, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_127_u8_z, svuint8_t, ++ z0 = svdup_n_u8_z (p0, 127), ++ z0 = svdup_u8_z (p0, 127)) ++ ++/* ++** dup_128_u8_z: ++** mov z0\.b, p0/z, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_128_u8_z, svuint8_t, ++ z0 = svdup_n_u8_z (p0, 128), ++ z0 = svdup_u8_z (p0, 128)) ++ ++/* ++** dup_129_u8_z: ++** mov z0\.b, p0/z, #-127 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_129_u8_z, svuint8_t, ++ z0 = svdup_n_u8_z (p0, 129), ++ z0 = svdup_u8_z (p0, 129)) ++ ++/* ++** dup_253_u8_z: ++** mov z0\.b, p0/z, #-3 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_253_u8_z, svuint8_t, ++ z0 = svdup_n_u8_z (p0, 253), ++ z0 = svdup_u8_z (p0, 253)) ++ ++/* ++** dup_254_u8_z: ++** mov z0\.b, p0/z, #-2 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_254_u8_z, svuint8_t, ++ z0 = svdup_n_u8_z (p0, 254), ++ z0 = svdup_u8_z (p0, 254)) ++ ++/* ++** dup_255_u8_z: ++** mov z0\.b, p0/z, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_255_u8_z, svuint8_t, ++ z0 = svdup_n_u8_z (p0, 255), ++ z0 = svdup_u8_z (p0, 255)) ++ ++/* ++** dup_m1_u8_z: ++** mov z0\.b, p0/z, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m1_u8_z, svuint8_t, ++ z0 = svdup_n_u8_z (p0, -1), ++ z0 = svdup_u8_z (p0, -1)) ++ ++/* ++** dup_m128_u8_z: ++** mov z0\.b, p0/z, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m128_u8_z, svuint8_t, ++ z0 = svdup_n_u8_z (p0, -128), ++ z0 = svdup_u8_z (p0, -128)) ++ ++/* ++** dup_0_u8_z: ++** mov z0\.b, p0/z, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_0_u8_z, svuint8_t, ++ z0 = svdup_n_u8_z (p0, 0), ++ z0 = svdup_u8_z (p0, 0)) ++ ++/* ++** dup_w0_u8_z: ++** movprfx z0\.b, p0/z, z0\.b ++** mov z0\.b, p0/m, w0 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_w0_u8_z, svuint8_t, uint8_t, ++ z0 = svdup_n_u8_z (p0, x0), ++ z0 = svdup_u8_z (p0, x0)) ++ ++/* ++** dup_1_u8_x: ++** mov z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_1_u8_x, svuint8_t, ++ z0 = svdup_n_u8_x (p0, 1), ++ z0 = svdup_u8_x (p0, 1)) ++ ++/* ++** dup_127_u8_x: ++** mov z0\.b, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_127_u8_x, svuint8_t, ++ z0 = svdup_n_u8_x (p0, 127), ++ z0 = svdup_u8_x (p0, 127)) ++ ++/* ++** dup_128_u8_x: ++** mov z0\.b, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_128_u8_x, svuint8_t, ++ z0 = svdup_n_u8_x (p0, 128), ++ z0 = svdup_u8_x (p0, 128)) ++ ++/* ++** dup_129_u8_x: ++** mov z0\.b, #-127 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_129_u8_x, svuint8_t, ++ z0 = svdup_n_u8_x (p0, 129), ++ z0 = svdup_u8_x (p0, 129)) ++ ++/* ++** dup_253_u8_x: ++** mov z0\.b, #-3 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_253_u8_x, svuint8_t, ++ z0 = svdup_n_u8_x (p0, 253), ++ z0 = svdup_u8_x (p0, 253)) ++ ++/* ++** dup_254_u8_x: ++** mov z0\.b, #-2 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_254_u8_x, svuint8_t, ++ z0 = svdup_n_u8_x (p0, 254), ++ z0 = svdup_u8_x (p0, 254)) ++ ++/* ++** dup_255_u8_x: ++** mov z0\.b, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_255_u8_x, svuint8_t, ++ z0 = svdup_n_u8_x (p0, 255), ++ z0 = svdup_u8_x (p0, 255)) ++ ++/* ++** dup_m1_u8_x: ++** mov z0\.b, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m1_u8_x, svuint8_t, ++ z0 = svdup_n_u8_x (p0, -1), ++ z0 = svdup_u8_x (p0, -1)) ++ 
++/* ++** dup_m128_u8_x: ++** mov z0\.b, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (dup_m128_u8_x, svuint8_t, ++ z0 = svdup_n_u8_x (p0, -128), ++ z0 = svdup_u8_x (p0, -128)) ++ ++/* ++** dup_w0_u8_x: ++** mov z0\.b, w0 ++** ret ++*/ ++TEST_UNIFORM_ZX (dup_w0_u8_x, svuint8_t, uint8_t, ++ z0 = svdup_n_u8_x (p0, x0), ++ z0 = svdup_u8_x (p0, x0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_b16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_b16.c +new file mode 100644 +index 000000000..ecbacd7e9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_b16.c +@@ -0,0 +1,276 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dupq_00_b16: ++** pfalse p0\.b ++** ret ++*/ ++TEST_UNIFORM_P (dupq_00_b16, ++ p0 = svdupq_n_b16 (0, 0, 0, 0, 0, 0, 0, 0), ++ p0 = svdupq_b16 (0, 0, 0, 0, 0, 0, 0, 0)) ++ ++/* ++** dupq_11_b16: ++** ptrue p0\.d, all ++** ret ++*/ ++TEST_UNIFORM_P (dupq_11_b16, ++ p0 = svdupq_n_b16 (1, 0, 0, 0, 1, 0, 0, 0), ++ p0 = svdupq_b16 (1, 0, 0, 0, 1, 0, 0, 0)) ++ ++/* ++** dupq_22_b16: ++** ( ++** pfalse (p[0-7])\.b ++** ptrue (p[0-7])\.d, all ++** trn1 p0\.h, \1\.h, \2\.h ++** | ++** ptrue (p[0-7])\.d, all ++** pfalse (p[0-7])\.b ++** trn1 p0\.h, \4\.h, \3\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_P (dupq_22_b16, ++ p0 = svdupq_n_b16 (0, 1, 0, 0, 0, 1, 0, 0), ++ p0 = svdupq_b16 (0, 1, 0, 0, 0, 1, 0, 0)) ++ ++/* ++** dupq_33_b16: ++** ptrue (p[0-7])\.d, all ++** trn1 p0\.h, \1\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_P (dupq_33_b16, ++ p0 = svdupq_n_b16 (1, 1, 0, 0, 1, 1, 0, 0), ++ p0 = svdupq_b16 (1, 1, 0, 0, 1, 1, 0, 0)) ++ ++/* ++** dupq_44_b16: ++** ( ++** ptrue (p[0-7])\.d, all ++** ptrue (p[0-7])\.s, all ++** not p0\.b, \2/z, \1\.b ++** | ++** ptrue (p[0-7])\.s, all ++** ptrue (p[0-7])\.d, all ++** not p0\.b, \3/z, \4\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_P (dupq_44_b16, ++ p0 = svdupq_n_b16 (0, 0, 1, 0, 0, 0, 1, 0), ++ p0 = svdupq_b16 (0, 0, 1, 0, 0, 0, 1, 0)) ++ ++/* ++** dupq_55_b16: ++** ptrue p0\.s, all ++** ret ++*/ ++TEST_UNIFORM_P (dupq_55_b16, ++ p0 = svdupq_n_b16 (1, 0, 1, 0, 1, 0, 1, 0), ++ p0 = svdupq_b16 (1, 0, 1, 0, 1, 0, 1, 0)) ++ ++/* ++** dupq_66_b16: ++** ... ++** cmpne p0\.b, p[0-7]/z, z[0-9]+\.b, #0 ++** ret ++*/ ++TEST_UNIFORM_P (dupq_66_b16, ++ p0 = svdupq_n_b16 (0, 1, 1, 0, 0, 1, 1, 0), ++ p0 = svdupq_b16 (0, 1, 1, 0, 0, 1, 1, 0)) ++ ++/* ++** dupq_77_b16: ++** ( ++** ptrue (p[0-7])\.d, all ++** ptrue (p[0-7])\.[hs], all ++** trn1 p0\.h, \2\.h, \1\.h ++** | ++** ptrue (p[0-7])\.[hs], all ++** ptrue (p[0-7])\.s, all ++** trn1 p0\.h, \3\.h, \4\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_P (dupq_77_b16, ++ p0 = svdupq_n_b16 (1, 1, 1, 0, 1, 1, 1, 0), ++ p0 = svdupq_b16 (1, 1, 1, 0, 1, 1, 1, 0)) ++ ++/* ++** dupq_88_b16: ++** ( ++** mov (z[0-9]+)\.d, #71776119061217280 ++** ptrue (p[0-7])\.b, all ++** cmpne p0\.b, \2/z, \1\.b, #0 ++** | ++** ptrue (p[0-7])\.b, all ++** mov (z[0-9]+)\.d, #71776119061217280 ++** cmpne p0\.b, \3/z, \4\.b, #0 ++** ) ++** ret ++*/ ++TEST_UNIFORM_P (dupq_88_b16, ++ p0 = svdupq_n_b16 (0, 0, 0, 1, 0, 0, 0, 1), ++ p0 = svdupq_b16 (0, 0, 0, 1, 0, 0, 0, 1)) ++ ++/* ++** dupq_99_b16: ++** ... 
++** cmpne p0\.b, p[0-7]/z, z[0-9]+\.b, #0 ++** ret ++*/ ++TEST_UNIFORM_P (dupq_99_b16, ++ p0 = svdupq_n_b16 (1, 0, 0, 1, 1, 0, 0, 1), ++ p0 = svdupq_b16 (1, 0, 0, 1, 1, 0, 0, 1)) ++ ++/* ++** dupq_aa_b16: ++** ( ++** ptrue (p[0-7])\.s, all ++** ptrue (p[0-7])\.h, all ++** not p0\.b, \2/z, \1\.b ++** | ++** ptrue (p[0-7])\.h, all ++** ptrue (p[0-7])\.s, all ++** not p0\.b, \3/z, \4\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_P (dupq_aa_b16, ++ p0 = svdupq_n_b16 (0, 1, 0, 1, 0, 1, 0, 1), ++ p0 = svdupq_b16 (0, 1, 0, 1, 0, 1, 0, 1)) ++ ++/* ++** dupq_bb_b16: ++** ( ++** ptrue (p[0-7])\.d, all ++** ptrue (p[0-7])\.[hs], all ++** trn1 p0\.h, \1\.h, \2\.h ++** | ++** ptrue (p[0-7])\.[hs], all ++** ptrue (p[0-7])\.d, all ++** trn1 p0\.h, \4\.h, \3\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_P (dupq_bb_b16, ++ p0 = svdupq_n_b16 (1, 1, 0, 1, 1, 1, 0, 1), ++ p0 = svdupq_b16 (1, 1, 0, 1, 1, 1, 0, 1)) ++ ++/* ++** dupq_cc_b16: ++** ( ++** pfalse (p[0-7])\.b ++** ptrue (p[0-7])\.h, all ++** trn1 p0\.s, \1\.s, \2\.s ++** | ++** ptrue (p[0-7])\.h, all ++** pfalse (p[0-7])\.b ++** trn1 p0\.s, \4\.s, \3\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_P (dupq_cc_b16, ++ p0 = svdupq_n_b16 (0, 0, 1, 1, 0, 0, 1, 1), ++ p0 = svdupq_b16 (0, 0, 1, 1, 0, 0, 1, 1)) ++ ++/* ++** dupq_dd_b16: ++** ( ++** ptrue (p[0-7])\.[sd], all ++** ptrue (p[0-7])\.h, all ++** trn1 p0\.s, \1\.s, \2\.s ++** | ++** ptrue (p[0-7])\.h, all ++** ptrue (p[0-7])\.[sd], all ++** trn1 p0\.s, \4\.s, \3\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_P (dupq_dd_b16, ++ p0 = svdupq_n_b16 (1, 0, 1, 1, 1, 0, 1, 1), ++ p0 = svdupq_b16 (1, 0, 1, 1, 1, 0, 1, 1)) ++ ++/* ++** dupq_ee_b16: ++** ( ++** ptrue (p[0-7])\.d, all ++** ptrue (p[0-7])\.h, all ++** not p0\.b, \2/z, \1\.b ++** | ++** ptrue (p[0-7])\.h, all ++** ptrue (p[0-7])\.d, all ++** not p0\.b, \3/z, \4\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_P (dupq_ee_b16, ++ p0 = svdupq_n_b16 (0, 1, 1, 1, 0, 1, 1, 1), ++ p0 = svdupq_b16 (0, 1, 1, 1, 0, 1, 1, 1)) ++ ++/* ++** dupq_ff_b16: ++** ptrue p0\.h, all ++** ret ++*/ ++TEST_UNIFORM_P (dupq_ff_b16, ++ p0 = svdupq_n_b16 (1, 1, 1, 1, 1, 1, 1, 1), ++ p0 = svdupq_b16 (1, 1, 1, 1, 1, 1, 1, 1)) ++ ++/* ++** dupq_01_b16: ++** ( ++** ptrue (p[0-7])\.d, all ++** pfalse (p[0-7])\.b ++** trn1 p0\.d, \1\.d, \2\.d ++** | ++** pfalse (p[0-7])\.b ++** ptrue (p[0-7])\.d, all ++** trn1 p0\.d, \4\.d, \3\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_P (dupq_01_b16, ++ p0 = svdupq_n_b16 (1, 0, 0, 0, 0, 0, 0, 0), ++ p0 = svdupq_b16 (1, 0, 0, 0, 0, 0, 0, 0)) ++ ++/* ++** dupq_03_b16: ++** ... ++** cmpne p0\.b, p[0-7]/z, z[0-9]+\.b, #0 ++** ret ++*/ ++TEST_UNIFORM_P (dupq_03_b16, ++ p0 = svdupq_n_b16 (1, 1, 0, 0, 0, 0, 0, 0), ++ p0 = svdupq_b16 (1, 1, 0, 0, 0, 0, 0, 0)) ++ ++/* ++** dupq_0f_b16: ++** ( ++** ptrue (p[0-7])\.h, all ++** pfalse (p[0-7])\.b ++** trn1 p0\.d, \1\.d, \2\.d ++** | ++** pfalse (p[0-7])\.b ++** ptrue (p[0-7])\.h, all ++** trn1 p0\.d, \4\.d, \3\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_P (dupq_0f_b16, ++ p0 = svdupq_n_b16 (1, 1, 1, 1, 0, 0, 0, 0), ++ p0 = svdupq_b16 (1, 1, 1, 1, 0, 0, 0, 0)) ++ ++/* ++** dupq_3f_b16: ++** ... 
++** cmpne p0\.b, p[0-7]/z, z[0-9]+\.b, #0 ++** ret ++*/ ++TEST_UNIFORM_P (dupq_3f_b16, ++ p0 = svdupq_n_b16 (1, 1, 1, 1, 1, 1, 0, 0), ++ p0 = svdupq_b16 (1, 1, 1, 1, 1, 1, 0, 0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_b32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_b32.c +new file mode 100644 +index 000000000..39719a76d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_b32.c +@@ -0,0 +1,132 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dupq_0_b32: ++** pfalse p0\.b ++** ret ++*/ ++TEST_UNIFORM_P (dupq_0_b32, ++ p0 = svdupq_n_b32 (0, 0, 0, 0), ++ p0 = svdupq_b32 (0, 0, 0, 0)) ++ ++/* ++** dupq_1_b32: ++** ( ++** ptrue (p[0-7])\.d, all ++** pfalse (p[0-7])\.b ++** trn1 p0\.d, \1\.d, \2\.d ++** | ++** pfalse (p[0-7])\.b ++** ptrue (p[0-7])\.d, all ++** trn1 p0\.d, \4\.d, \3\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_P (dupq_1_b32, ++ p0 = svdupq_n_b32 (1, 0, 0, 0), ++ p0 = svdupq_b32 (1, 0, 0, 0)) ++ ++/* ++** dupq_3_b32: ++** ( ++** ptrue (p[0-7])\.s, all ++** pfalse (p[0-7])\.b ++** trn1 p0\.d, \1\.d, \2\.d ++** | ++** pfalse (p[0-7])\.b ++** ptrue (p[0-7])\.s, all ++** trn1 p0\.d, \4\.d, \3\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_P (dupq_3_b32, ++ p0 = svdupq_n_b32 (1, 1, 0, 0), ++ p0 = svdupq_b32 (1, 1, 0, 0)) ++ ++/* ++** dupq_4_b32: ++** ( ++** pfalse (p[0-7])\.b ++** ptrue (p[0-7])\.d, all ++** trn1 p0\.d, \1\.d, \2\.d ++** | ++** ptrue (p[0-7])\.d, all ++** pfalse (p[0-7])\.b ++** trn1 p0\.d, \4\.d, \3\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_P (dupq_4_b32, ++ p0 = svdupq_n_b32 (0, 0, 1, 0), ++ p0 = svdupq_b32 (0, 0, 1, 0)) ++ ++/* ++** dupq_5_b32: ++** ptrue p0\.d, all ++** ret ++*/ ++TEST_UNIFORM_P (dupq_5_b32, ++ p0 = svdupq_n_b32 (1, 0, 1, 0), ++ p0 = svdupq_b32 (1, 0, 1, 0)) ++ ++/* ++** dupq_7_b32: ++** ( ++** ptrue (p[0-7])\.s, all ++** ptrue (p[0-7])\.d, all ++** trn1 p0\.d, \1\.d, \2\.d ++** | ++** ptrue (p[0-7])\.d, all ++** ptrue (p[0-7])\.s, all ++** trn1 p0\.d, \4\.d, \3\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_P (dupq_7_b32, ++ p0 = svdupq_n_b32 (1, 1, 1, 0), ++ p0 = svdupq_b32 (1, 1, 1, 0)) ++ ++/* ++** dupq_a_b32: ++** ( ++** ptrue (p[0-7])\.d, all ++** ptrue (p[0-7])\.s, all ++** not p0\.b, \2/z, \1\.b ++** | ++** ptrue (p[0-7])\.s, all ++** ptrue (p[0-7])\.d, all ++** not p0\.b, \3/z, \4\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_P (dupq_a_b32, ++ p0 = svdupq_n_b32 (0, 1, 0, 1), ++ p0 = svdupq_b32 (0, 1, 0, 1)) ++ ++/* ++** dupq_e_b32: ++** ( ++** ptrue (p[0-7])\.d, all ++** ptrue (p[0-7])\.s, all ++** trn1 p0\.d, \1\.d, \2\.d ++** | ++** ptrue (p[0-7])\.s, all ++** ptrue (p[0-7])\.d, all ++** trn1 p0\.d, \4\.d, \3\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_P (dupq_e_b32, ++ p0 = svdupq_n_b32 (1, 0, 1, 1), ++ p0 = svdupq_b32 (1, 0, 1, 1)) ++ ++/* ++** dupq_f_b32: ++** ptrue p0\.s, all ++** ret ++*/ ++TEST_UNIFORM_P (dupq_f_b32, ++ p0 = svdupq_n_b32 (1, 1, 1, 1), ++ p0 = svdupq_b32 (1, 1, 1, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_b64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_b64.c +new file mode 100644 +index 000000000..820ace431 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_b64.c +@@ -0,0 +1,55 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dupq_0_b64: ++** pfalse p0\.b ++** ret ++*/ ++TEST_UNIFORM_P (dupq_0_b64, ++ p0 = svdupq_n_b64 (0, 0), ++ p0 = svdupq_b64 (0, 0)) ++ ++/* ++** dupq_1_b64: ++** ( ++** ptrue 
(p[0-7])\.d, all ++** pfalse (p[0-7])\.b ++** trn1 p0\.d, \1\.d, \2\.d ++** | ++** pfalse (p[0-7])\.b ++** ptrue (p[0-7])\.d, all ++** trn1 p0\.d, \4\.d, \3\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_P (dupq_1_b64, ++ p0 = svdupq_n_b64 (1, 0), ++ p0 = svdupq_b64 (1, 0)) ++ ++/* ++** dupq_2_b64: ++** ( ++** pfalse (p[0-7])\.b ++** ptrue (p[0-7])\.d, all ++** trn1 p0\.d, \1\.d, \2\.d ++** | ++** ptrue (p[0-7])\.d, all ++** pfalse (p[0-7])\.b ++** trn1 p0\.d, \4\.d, \3\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_P (dupq_2_b64, ++ p0 = svdupq_n_b64 (0, 1), ++ p0 = svdupq_b64 (0, 1)) ++ ++/* ++** dupq_3_b64: ++** ptrue p0\.d, all ++** ret ++*/ ++TEST_UNIFORM_P (dupq_3_b64, ++ p0 = svdupq_n_b64 (1, 1), ++ p0 = svdupq_b64 (1, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_b8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_b8.c +new file mode 100644 +index 000000000..4762f950b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_b8.c +@@ -0,0 +1,413 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dupq_0000_b8: ++** pfalse p0\.b ++** ret ++*/ ++TEST_UNIFORM_P (dupq_0000_b8, ++ p0 = svdupq_n_b8 (0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0), ++ p0 = svdupq_b8 (0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0)) ++ ++/* ++** dupq_1111_b8: ++** ptrue p0\.s, all ++** ret ++*/ ++TEST_UNIFORM_P (dupq_1111_b8, ++ p0 = svdupq_n_b8 (1, 0, 0, 0, 1, 0, 0, 0, ++ 1, 0, 0, 0, 1, 0, 0, 0), ++ p0 = svdupq_b8 (1, 0, 0, 0, 1, 0, 0, 0, ++ 1, 0, 0, 0, 1, 0, 0, 0)) ++ ++/* ++** dupq_2222_b8: ++** ( ++** pfalse (p[0-7])\.b ++** ptrue (p[0-7])\.s, all ++** trn1 p0\.b, \1\.b, \2\.b ++** | ++** ptrue (p[0-7])\.s, all ++** pfalse (p[0-7])\.b ++** trn1 p0\.b, \4\.b, \3\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_P (dupq_2222_b8, ++ p0 = svdupq_n_b8 (0, 1, 0, 0, 0, 1, 0, 0, ++ 0, 1, 0, 0, 0, 1, 0, 0), ++ p0 = svdupq_b8 (0, 1, 0, 0, 0, 1, 0, 0, ++ 0, 1, 0, 0, 0, 1, 0, 0)) ++ ++/* ++** dupq_3333_b8: ++** ptrue (p[0-7])\.s, all ++** trn1 p0\.b, \1\.b, \1\.b ++** ret ++*/ ++TEST_UNIFORM_P (dupq_3333_b8, ++ p0 = svdupq_n_b8 (1, 1, 0, 0, 1, 1, 0, 0, ++ 1, 1, 0, 0, 1, 1, 0, 0), ++ p0 = svdupq_b8 (1, 1, 0, 0, 1, 1, 0, 0, ++ 1, 1, 0, 0, 1, 1, 0, 0)) ++ ++/* ++** dupq_4444_b8: ++** ( ++** ptrue (p[0-7])\.s, all ++** ptrue (p[0-7])\.h, all ++** not p0\.b, \2/z, \1\.b ++** | ++** ptrue (p[0-7])\.h, all ++** ptrue (p[0-7])\.s, all ++** not p0\.b, \3/z, \4\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_P (dupq_4444_b8, ++ p0 = svdupq_n_b8 (0, 0, 1, 0, 0, 0, 1, 0, ++ 0, 0, 1, 0, 0, 0, 1, 0), ++ p0 = svdupq_b8 (0, 0, 1, 0, 0, 0, 1, 0, ++ 0, 0, 1, 0, 0, 0, 1, 0)) ++ ++/* ++** dupq_5555_b8: ++** ptrue p0\.h, all ++** ret ++*/ ++TEST_UNIFORM_P (dupq_5555_b8, ++ p0 = svdupq_n_b8 (1, 0, 1, 0, 1, 0, 1, 0, ++ 1, 0, 1, 0, 1, 0, 1, 0), ++ p0 = svdupq_b8 (1, 0, 1, 0, 1, 0, 1, 0, ++ 1, 0, 1, 0, 1, 0, 1, 0)) ++ ++/* ++** dupq_6666_b8: ++** ( ++** mov (z[0-9]+)\.s, #16776960 ++** ptrue (p[0-7])\.b, all ++** cmpne p0\.b, \2/z, \1\.b, #0 ++** | ++** ptrue (p[0-7])\.b, all ++** mov (z[0-9]+)\.s, #16776960 ++** cmpne p0\.b, \3/z, \4\.b, #0 ++** ) ++** ret ++*/ ++TEST_UNIFORM_P (dupq_6666_b8, ++ p0 = svdupq_n_b8 (0, 1, 1, 0, 0, 1, 1, 0, ++ 0, 1, 1, 0, 0, 1, 1, 0), ++ p0 = svdupq_b8 (0, 1, 1, 0, 0, 1, 1, 0, ++ 0, 1, 1, 0, 0, 1, 1, 0)) ++ ++/* ++** dupq_7777_b8: ++** ( ++** ptrue (p[0-7])\.s, all ++** ptrue (p[0-7])\.[bh], all ++** trn1 p0\.b, \2\.b, \1\.b ++** | ++** ptrue (p[0-7])\.[bh], all ++** ptrue (p[0-7])\.s, all ++** trn1 p0\.b, \3\.b, \4\.b ++** ) ++** 
ret ++*/ ++TEST_UNIFORM_P (dupq_7777_b8, ++ p0 = svdupq_n_b8 (1, 1, 1, 0, 1, 1, 1, 0, ++ 1, 1, 1, 0, 1, 1, 1, 0), ++ p0 = svdupq_b8 (1, 1, 1, 0, 1, 1, 1, 0, ++ 1, 1, 1, 0, 1, 1, 1, 0)) ++ ++/* ++** dupq_8888_b8: ++** ( ++** mov (z[0-9]+)\.s, #-16777216 ++** ptrue (p[0-7])\.b, all ++** cmpne p0\.b, \2/z, \1\.b, #0 ++** | ++** ptrue (p[0-7])\.b, all ++** mov (z[0-9]+)\.s, #-16777216 ++** cmpne p0\.b, \3/z, \4\.b, #0 ++** ) ++** ret ++*/ ++TEST_UNIFORM_P (dupq_8888_b8, ++ p0 = svdupq_n_b8 (0, 0, 0, 1, 0, 0, 0, 1, ++ 0, 0, 0, 1, 0, 0, 0, 1), ++ p0 = svdupq_b8 (0, 0, 0, 1, 0, 0, 0, 1, ++ 0, 0, 0, 1, 0, 0, 0, 1)) ++ ++/* ++** dupq_9999_b8: ++** ( ++** mov (z[0-9]+)\.s, #-16776961 ++** ptrue (p[0-7])\.b, all ++** cmpne p0\.b, \2/z, \1\.b, #0 ++** | ++** ptrue (p[0-7])\.b, all ++** mov (z[0-9]+)\.s, #-16776961 ++** cmpne p0\.b, \3/z, \4\.b, #0 ++** ) ++** ret ++*/ ++TEST_UNIFORM_P (dupq_9999_b8, ++ p0 = svdupq_n_b8 (1, 0, 0, 1, 1, 0, 0, 1, ++ 1, 0, 0, 1, 1, 0, 0, 1), ++ p0 = svdupq_b8 (1, 0, 0, 1, 1, 0, 0, 1, ++ 1, 0, 0, 1, 1, 0, 0, 1)) ++ ++/* ++** dupq_aaaa_b8: ++** ( ++** ptrue (p[0-7])\.h, all ++** ptrue (p[0-7])\.b, all ++** not p0\.b, \2/z, \1\.b ++** | ++** ptrue (p[0-7])\.b, all ++** ptrue (p[0-7])\.h, all ++** not p0\.b, \3/z, \4\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_P (dupq_aaaa_b8, ++ p0 = svdupq_n_b8 (0, 1, 0, 1, 0, 1, 0, 1, ++ 0, 1, 0, 1, 0, 1, 0, 1), ++ p0 = svdupq_b8 (0, 1, 0, 1, 0, 1, 0, 1, ++ 0, 1, 0, 1, 0, 1, 0, 1)) ++ ++/* ++** dupq_bbbb_b8: ++** ( ++** ptrue (p[0-7])\.s, all ++** ptrue (p[0-7])\.[bh], all ++** trn1 p0\.b, \1\.b, \2\.b ++** | ++** ptrue (p[0-7])\.[bh], all ++** ptrue (p[0-7])\.s, all ++** trn1 p0\.b, \4\.b, \3\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_P (dupq_bbbb_b8, ++ p0 = svdupq_n_b8 (1, 1, 0, 1, 1, 1, 0, 1, ++ 1, 1, 0, 1, 1, 1, 0, 1), ++ p0 = svdupq_b8 (1, 1, 0, 1, 1, 1, 0, 1, ++ 1, 1, 0, 1, 1, 1, 0, 1)) ++ ++/* ++** dupq_cccc_b8: ++** ( ++** pfalse (p[0-7])\.b ++** ptrue (p[0-7])\.b, all ++** trn1 p0\.h, \1\.h, \2\.h ++** | ++** ptrue (p[0-7])\.b, all ++** pfalse (p[0-7])\.b ++** trn1 p0\.h, \4\.h, \3\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_P (dupq_cccc_b8, ++ p0 = svdupq_n_b8 (0, 0, 1, 1, 0, 0, 1, 1, ++ 0, 0, 1, 1, 0, 0, 1, 1), ++ p0 = svdupq_b8 (0, 0, 1, 1, 0, 0, 1, 1, ++ 0, 0, 1, 1, 0, 0, 1, 1)) ++ ++/* ++** dupq_dddd_b8: ++** ( ++** ptrue (p[0-7])\.[hs], all ++** ptrue (p[0-7])\.b, all ++** trn1 p0\.h, \1\.h, \2\.h ++** | ++** ptrue (p[0-7])\.b, all ++** ptrue (p[0-7])\.[hs], all ++** trn1 p0\.h, \4\.h, \3\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_P (dupq_dddd_b8, ++ p0 = svdupq_n_b8 (1, 0, 1, 1, 1, 0, 1, 1, ++ 1, 0, 1, 1, 1, 0, 1, 1), ++ p0 = svdupq_b8 (1, 0, 1, 1, 1, 0, 1, 1, ++ 1, 0, 1, 1, 1, 0, 1, 1)) ++ ++/* ++** dupq_eeee_b8: ++** ( ++** ptrue (p[0-7])\.s, all ++** ptrue (p[0-7])\.b, all ++** not p0\.b, \2/z, \1\.b ++** | ++** ptrue (p[0-7])\.b, all ++** ptrue (p[0-7])\.s, all ++** not p0\.b, \3/z, \4\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_P (dupq_eeee_b8, ++ p0 = svdupq_n_b8 (0, 1, 1, 1, 0, 1, 1, 1, ++ 0, 1, 1, 1, 0, 1, 1, 1), ++ p0 = svdupq_b8 (0, 1, 1, 1, 0, 1, 1, 1, ++ 0, 1, 1, 1, 0, 1, 1, 1)) ++ ++/* ++** dupq_ffff_b8: ++** ptrue p0\.b, all ++** ret ++*/ ++TEST_UNIFORM_P (dupq_ffff_b8, ++ p0 = svdupq_n_b8 (1, 1, 1, 1, 1, 1, 1, 1, ++ 1, 1, 1, 1, 1, 1, 1, 1), ++ p0 = svdupq_b8 (1, 1, 1, 1, 1, 1, 1, 1, ++ 1, 1, 1, 1, 1, 1, 1, 1)) ++ ++/* ++** dupq_5f5f_b8: ++** ( ++** ptrue (p[0-7])\.h, all ++** ptrue (p[0-7])\.b, all ++** trn1 p0\.s, \2\.s, \1\.s ++** | ++** ptrue (p[0-7])\.b, all ++** ptrue (p[0-7])\.h, all ++** trn1 p0\.s, \3\.s, \4\.s ++** ) ++** 
ret ++*/ ++TEST_UNIFORM_P (dupq_5f5f_b8, ++ p0 = svdupq_n_b8 (1, 1, 1, 1, 1, 0, 1, 0, ++ 1, 1, 1, 1, 1, 0, 1, 0), ++ p0 = svdupq_b8 (1, 1, 1, 1, 1, 0, 1, 0, ++ 1, 1, 1, 1, 1, 0, 1, 0)) ++ ++/* ++** dupq_1f1f_b8: ++** ( ++** ptrue (p[0-7])\.[sd], all ++** ptrue (p[0-7])\.b, all ++** trn1 p0\.s, \2\.s, \1\.s ++** | ++** ptrue (p[0-7])\.b, all ++** ptrue (p[0-7])\.[sd], all ++** trn1 p0\.s, \3\.s, \4\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_P (dupq_1f1f_b8, ++ p0 = svdupq_n_b8 (1, 1, 1, 1, 1, 0, 0, 0, ++ 1, 1, 1, 1, 1, 0, 0, 0), ++ p0 = svdupq_b8 (1, 1, 1, 1, 1, 0, 0, 0, ++ 1, 1, 1, 1, 1, 0, 0, 0)) ++ ++/* ++** dupq_1515_b8: ++** ( ++** ptrue (p[0-7])\.d, all ++** ptrue (p[0-7])\.[hs], all ++** trn1 p0\.h, \2\.h, \1\.h ++** | ++** ptrue (p[0-7])\.[hs], all ++** ptrue (p[0-7])\.d, all ++** trn1 p0\.h, \3\.h, \4\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_P (dupq_1515_b8, ++ p0 = svdupq_n_b8 (1, 0, 1, 0, 1, 0, 0, 0, ++ 1, 0, 1, 0, 1, 0, 0, 0), ++ p0 = svdupq_b8 (1, 0, 1, 0, 1, 0, 0, 0, ++ 1, 0, 1, 0, 1, 0, 0, 0)) ++ ++/* ++** dupq_0505_b8: ++** ptrue (p[0-7])\.d, all ++** trn1 p0\.h, \1\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_P (dupq_0505_b8, ++ p0 = svdupq_n_b8 (1, 0, 1, 0, 0, 0, 0, 0, ++ 1, 0, 1, 0, 0, 0, 0, 0), ++ p0 = svdupq_b8 (1, 0, 1, 0, 0, 0, 0, 0, ++ 1, 0, 1, 0, 0, 0, 0, 0)) ++ ++/* ++** dupq_00ff_b8: ++** ( ++** pfalse (p[0-7])\.b ++** ptrue (p[0-7])\.b, all ++** trn1 p0\.d, \2\.d, \1\.d ++** | ++** ptrue (p[0-7])\.b, all ++** pfalse (p[0-7])\.b ++** trn1 p0\.d, \3\.d, \4\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_P (dupq_00ff_b8, ++ p0 = svdupq_n_b8 (1, 1, 1, 1, 1, 1, 1, 1, ++ 0, 0, 0, 0, 0, 0, 0, 0), ++ p0 = svdupq_b8 (1, 1, 1, 1, 1, 1, 1, 1, ++ 0, 0, 0, 0, 0, 0, 0, 0)) ++ ++/* ++** dupq_0055_b8: ++** ( ++** pfalse (p[0-7])\.b ++** ptrue (p[0-7])\.h, all ++** trn1 p0\.d, \2\.d, \1\.d ++** | ++** ptrue (p[0-7])\.h, all ++** pfalse (p[0-7])\.b ++** trn1 p0\.d, \3\.d, \4\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_P (dupq_0055_b8, ++ p0 = svdupq_n_b8 (1, 0, 1, 0, 1, 0, 1, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0), ++ p0 = svdupq_b8 (1, 0, 1, 0, 1, 0, 1, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0)) ++ ++/* ++** dupq_0011_b8: ++** ( ++** pfalse (p[0-7])\.b ++** ptrue (p[0-7])\.s, all ++** trn1 p0\.d, \2\.d, \1\.d ++** | ++** ptrue (p[0-7])\.s, all ++** pfalse (p[0-7])\.b ++** trn1 p0\.d, \3\.d, \4\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_P (dupq_0011_b8, ++ p0 = svdupq_n_b8 (1, 0, 0, 0, 1, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0), ++ p0 = svdupq_b8 (1, 0, 0, 0, 1, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0)) ++ ++/* ++** dupq_0111_b8: ++** ( ++** ptrue (p[0-7])\.d, all ++** ptrue (p[0-7])\.s, all ++** trn1 p0\.d, \2\.d, \1\.d ++** | ++** ptrue (p[0-7])\.s, all ++** ptrue (p[0-7])\.d, all ++** trn1 p0\.d, \3\.d, \4\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_P (dupq_0111_b8, ++ p0 = svdupq_n_b8 (1, 0, 0, 0, 1, 0, 0, 0, ++ 1, 0, 0, 0, 0, 0, 0, 0), ++ p0 = svdupq_b8 (1, 0, 0, 0, 1, 0, 0, 0, ++ 1, 0, 0, 0, 0, 0, 0, 0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_f16.c +new file mode 100644 +index 000000000..91de8344c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_f16.c +@@ -0,0 +1,53 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dupq_1c_f16: ++** mov z0\.s, #15360 ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_1c_f16, svfloat16_t, ++ z0 = svdupq_n_f16 (1.0, 0, 1.0, 0, 1.0, 0, 1.0, 0), ++ z0 = svdupq_f16 (1.0, 0, 1.0, 0, 1.0, 0, 1.0, 0)); ++ ++/* ++** dupq_5ic_f16: ++** movi v([0-9]+)\.4s, 
0x45, lsl 24 ++** dup z0\.q, z\1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_5ic_f16, svfloat16_t, ++ z0 = svdupq_n_f16 (0, 5.0, 0, 5.0, 0, 5.0, 0, 5.0), ++ z0 = svdupq_f16 (0, 5.0, 0, 5.0, 0, 5.0, 0, 5.0)); ++ ++ ++/* ++** dupq_m1c_f16: ++** movi v([0-9]+)\.4s, 0xbc, lsl 8 ++** dup z0\.q, z\1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_m1c_f16, svfloat16_t, ++ z0 = svdupq_n_f16 (-1.0, 0, -1.0, 0, -1.0, 0, -1.0, 0), ++ z0 = svdupq_f16 (-1.0, 0, -1.0, 0, -1.0, 0, -1.0, 0)); ++ ++/* ++** dupq_40p5c_f16: ++** mov (w[0-9]+), 20752 ++** mov z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_40p5c_f16, svfloat16_t, ++ z0 = svdupq_n_f16 (40.5, 0, 40.5, 0, 40.5, 0, 40.5, 0), ++ z0 = svdupq_f16 (40.5, 0, 40.5, 0, 40.5, 0, 40.5, 0)); ++ ++/* ++** dupq_pool_f16: ++** ... ++** ld1rqh z0\.h, p[0-7]/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_pool_f16, svfloat16_t, ++ z0 = svdupq_n_f16 (4.75, 1.0, 9, 77, 5.25, 22, 19, 50), ++ z0 = svdupq_f16 (4.75, 1.0, 9, 77, 5.25, 22, 19, 50)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_f32.c +new file mode 100644 +index 000000000..4f9c04f1a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_f32.c +@@ -0,0 +1,53 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dupq_1c_f32: ++** mov z0\.d, #1065353216 ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_1c_f32, svfloat32_t, ++ z0 = svdupq_n_f32 (1.0, 0, 1.0, 0), ++ z0 = svdupq_f32 (1.0, 0, 1.0, 0)); ++ ++/* ++** dupq_5ic_f32: ++** mov (x[0-9]+), 4656722014701092864 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_5ic_f32, svfloat32_t, ++ z0 = svdupq_n_f32 (0, 5.0, 0, 5.0), ++ z0 = svdupq_f32 (0, 5.0, 0, 5.0)); ++ ++ ++/* ++** dupq_m1c_f32: ++** mov (x[0-9]+), 3212836864 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_m1c_f32, svfloat32_t, ++ z0 = svdupq_n_f32 (-1.0, 0, -1.0, 0), ++ z0 = svdupq_f32 (-1.0, 0, -1.0, 0)); ++ ++/* ++** dupq_40p5c_f32: ++** mov (x[0-9]+), 1109524480 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_40p5c_f32, svfloat32_t, ++ z0 = svdupq_n_f32 (40.5, 0, 40.5, 0), ++ z0 = svdupq_f32 (40.5, 0, 40.5, 0)); ++ ++/* ++** dupq_pool_f32: ++** ... ++** ld1rqw z0\.s, p[0-7]/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_pool_f32, svfloat32_t, ++ z0 = svdupq_n_f32 (4.5, 10.1, 7.3, 11.8), ++ z0 = svdupq_f32 (4.5, 10.1, 7.3, 11.8)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_f64.c +new file mode 100644 +index 000000000..27d14480e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_f64.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dupq_pool_f64: ++** ... 
++** ld1rqd z0\.d, p[0-7]/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_pool_f64, svfloat64_t, ++ z0 = svdupq_n_f64 (4.5, 10.1), ++ z0 = svdupq_f64 (4.5, 10.1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_bf16.c +new file mode 100644 +index 000000000..89ae4a4c2 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_bf16.c +@@ -0,0 +1,48 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dupq_lane_0_bf16_tied: ++** dup z0\.q, z0\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_0_bf16_tied, svbfloat16_t, ++ z0 = svdupq_lane_bf16 (z0, 0), ++ z0 = svdupq_lane (z0, 0)) ++ ++/* ++** dupq_lane_0_bf16_untied: ++** dup z0\.q, z1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_0_bf16_untied, svbfloat16_t, ++ z0 = svdupq_lane_bf16 (z1, 0), ++ z0 = svdupq_lane (z1, 0)) ++ ++/* ++** dupq_lane_1_bf16: ++** dup z0\.q, z0\.q\[1\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_1_bf16, svbfloat16_t, ++ z0 = svdupq_lane_bf16 (z0, 1), ++ z0 = svdupq_lane (z0, 1)) ++ ++/* ++** dupq_lane_2_bf16: ++** dup z0\.q, z0\.q\[2\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_2_bf16, svbfloat16_t, ++ z0 = svdupq_lane_bf16 (z0, 2), ++ z0 = svdupq_lane (z0, 2)) ++ ++/* ++** dupq_lane_3_bf16: ++** dup z0\.q, z0\.q\[3\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_3_bf16, svbfloat16_t, ++ z0 = svdupq_lane_bf16 (z0, 3), ++ z0 = svdupq_lane (z0, 3)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_f16.c +new file mode 100644 +index 000000000..6fa97ca3a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_f16.c +@@ -0,0 +1,48 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dupq_lane_0_f16_tied: ++** dup z0\.q, z0\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_0_f16_tied, svfloat16_t, ++ z0 = svdupq_lane_f16 (z0, 0), ++ z0 = svdupq_lane (z0, 0)) ++ ++/* ++** dupq_lane_0_f16_untied: ++** dup z0\.q, z1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_0_f16_untied, svfloat16_t, ++ z0 = svdupq_lane_f16 (z1, 0), ++ z0 = svdupq_lane (z1, 0)) ++ ++/* ++** dupq_lane_1_f16: ++** dup z0\.q, z0\.q\[1\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_1_f16, svfloat16_t, ++ z0 = svdupq_lane_f16 (z0, 1), ++ z0 = svdupq_lane (z0, 1)) ++ ++/* ++** dupq_lane_2_f16: ++** dup z0\.q, z0\.q\[2\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_2_f16, svfloat16_t, ++ z0 = svdupq_lane_f16 (z0, 2), ++ z0 = svdupq_lane (z0, 2)) ++ ++/* ++** dupq_lane_3_f16: ++** dup z0\.q, z0\.q\[3\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_3_f16, svfloat16_t, ++ z0 = svdupq_lane_f16 (z0, 3), ++ z0 = svdupq_lane (z0, 3)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_f32.c +new file mode 100644 +index 000000000..69ce5452e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_f32.c +@@ -0,0 +1,48 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dupq_lane_0_f32_tied: ++** dup z0\.q, z0\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_0_f32_tied, svfloat32_t, ++ z0 = svdupq_lane_f32 (z0, 0), ++ z0 = svdupq_lane (z0, 0)) ++ ++/* ++** dupq_lane_0_f32_untied: ++** dup z0\.q, z1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_0_f32_untied, svfloat32_t, ++ 
z0 = svdupq_lane_f32 (z1, 0), ++ z0 = svdupq_lane (z1, 0)) ++ ++/* ++** dupq_lane_1_f32: ++** dup z0\.q, z0\.q\[1\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_1_f32, svfloat32_t, ++ z0 = svdupq_lane_f32 (z0, 1), ++ z0 = svdupq_lane (z0, 1)) ++ ++/* ++** dupq_lane_2_f32: ++** dup z0\.q, z0\.q\[2\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_2_f32, svfloat32_t, ++ z0 = svdupq_lane_f32 (z0, 2), ++ z0 = svdupq_lane (z0, 2)) ++ ++/* ++** dupq_lane_3_f32: ++** dup z0\.q, z0\.q\[3\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_3_f32, svfloat32_t, ++ z0 = svdupq_lane_f32 (z0, 3), ++ z0 = svdupq_lane (z0, 3)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_f64.c +new file mode 100644 +index 000000000..51a8d9f2d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_f64.c +@@ -0,0 +1,48 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dupq_lane_0_f64_tied: ++** dup z0\.q, z0\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_0_f64_tied, svfloat64_t, ++ z0 = svdupq_lane_f64 (z0, 0), ++ z0 = svdupq_lane (z0, 0)) ++ ++/* ++** dupq_lane_0_f64_untied: ++** dup z0\.q, z1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_0_f64_untied, svfloat64_t, ++ z0 = svdupq_lane_f64 (z1, 0), ++ z0 = svdupq_lane (z1, 0)) ++ ++/* ++** dupq_lane_1_f64: ++** dup z0\.q, z0\.q\[1\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_1_f64, svfloat64_t, ++ z0 = svdupq_lane_f64 (z0, 1), ++ z0 = svdupq_lane (z0, 1)) ++ ++/* ++** dupq_lane_2_f64: ++** dup z0\.q, z0\.q\[2\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_2_f64, svfloat64_t, ++ z0 = svdupq_lane_f64 (z0, 2), ++ z0 = svdupq_lane (z0, 2)) ++ ++/* ++** dupq_lane_3_f64: ++** dup z0\.q, z0\.q\[3\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_3_f64, svfloat64_t, ++ z0 = svdupq_lane_f64 (z0, 3), ++ z0 = svdupq_lane (z0, 3)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_s16.c +new file mode 100644 +index 000000000..08a0510be +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_s16.c +@@ -0,0 +1,48 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dupq_lane_0_s16_tied: ++** dup z0\.q, z0\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_0_s16_tied, svint16_t, ++ z0 = svdupq_lane_s16 (z0, 0), ++ z0 = svdupq_lane (z0, 0)) ++ ++/* ++** dupq_lane_0_s16_untied: ++** dup z0\.q, z1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_0_s16_untied, svint16_t, ++ z0 = svdupq_lane_s16 (z1, 0), ++ z0 = svdupq_lane (z1, 0)) ++ ++/* ++** dupq_lane_1_s16: ++** dup z0\.q, z0\.q\[1\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_1_s16, svint16_t, ++ z0 = svdupq_lane_s16 (z0, 1), ++ z0 = svdupq_lane (z0, 1)) ++ ++/* ++** dupq_lane_2_s16: ++** dup z0\.q, z0\.q\[2\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_2_s16, svint16_t, ++ z0 = svdupq_lane_s16 (z0, 2), ++ z0 = svdupq_lane (z0, 2)) ++ ++/* ++** dupq_lane_3_s16: ++** dup z0\.q, z0\.q\[3\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_3_s16, svint16_t, ++ z0 = svdupq_lane_s16 (z0, 3), ++ z0 = svdupq_lane (z0, 3)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_s32.c +new file mode 100644 +index 000000000..e9a9c9a60 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_s32.c +@@ -0,0 +1,48 @@ ++/* { dg-final { 
check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dupq_lane_0_s32_tied: ++** dup z0\.q, z0\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_0_s32_tied, svint32_t, ++ z0 = svdupq_lane_s32 (z0, 0), ++ z0 = svdupq_lane (z0, 0)) ++ ++/* ++** dupq_lane_0_s32_untied: ++** dup z0\.q, z1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_0_s32_untied, svint32_t, ++ z0 = svdupq_lane_s32 (z1, 0), ++ z0 = svdupq_lane (z1, 0)) ++ ++/* ++** dupq_lane_1_s32: ++** dup z0\.q, z0\.q\[1\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_1_s32, svint32_t, ++ z0 = svdupq_lane_s32 (z0, 1), ++ z0 = svdupq_lane (z0, 1)) ++ ++/* ++** dupq_lane_2_s32: ++** dup z0\.q, z0\.q\[2\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_2_s32, svint32_t, ++ z0 = svdupq_lane_s32 (z0, 2), ++ z0 = svdupq_lane (z0, 2)) ++ ++/* ++** dupq_lane_3_s32: ++** dup z0\.q, z0\.q\[3\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_3_s32, svint32_t, ++ z0 = svdupq_lane_s32 (z0, 3), ++ z0 = svdupq_lane (z0, 3)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_s64.c +new file mode 100644 +index 000000000..2c6342149 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_s64.c +@@ -0,0 +1,48 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dupq_lane_0_s64_tied: ++** dup z0\.q, z0\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_0_s64_tied, svint64_t, ++ z0 = svdupq_lane_s64 (z0, 0), ++ z0 = svdupq_lane (z0, 0)) ++ ++/* ++** dupq_lane_0_s64_untied: ++** dup z0\.q, z1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_0_s64_untied, svint64_t, ++ z0 = svdupq_lane_s64 (z1, 0), ++ z0 = svdupq_lane (z1, 0)) ++ ++/* ++** dupq_lane_1_s64: ++** dup z0\.q, z0\.q\[1\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_1_s64, svint64_t, ++ z0 = svdupq_lane_s64 (z0, 1), ++ z0 = svdupq_lane (z0, 1)) ++ ++/* ++** dupq_lane_2_s64: ++** dup z0\.q, z0\.q\[2\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_2_s64, svint64_t, ++ z0 = svdupq_lane_s64 (z0, 2), ++ z0 = svdupq_lane (z0, 2)) ++ ++/* ++** dupq_lane_3_s64: ++** dup z0\.q, z0\.q\[3\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_3_s64, svint64_t, ++ z0 = svdupq_lane_s64 (z0, 3), ++ z0 = svdupq_lane (z0, 3)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_s8.c +new file mode 100644 +index 000000000..2c2e6ee72 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_s8.c +@@ -0,0 +1,48 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dupq_lane_0_s8_tied: ++** dup z0\.q, z0\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_0_s8_tied, svint8_t, ++ z0 = svdupq_lane_s8 (z0, 0), ++ z0 = svdupq_lane (z0, 0)) ++ ++/* ++** dupq_lane_0_s8_untied: ++** dup z0\.q, z1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_0_s8_untied, svint8_t, ++ z0 = svdupq_lane_s8 (z1, 0), ++ z0 = svdupq_lane (z1, 0)) ++ ++/* ++** dupq_lane_1_s8: ++** dup z0\.q, z0\.q\[1\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_1_s8, svint8_t, ++ z0 = svdupq_lane_s8 (z0, 1), ++ z0 = svdupq_lane (z0, 1)) ++ ++/* ++** dupq_lane_2_s8: ++** dup z0\.q, z0\.q\[2\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_2_s8, svint8_t, ++ z0 = svdupq_lane_s8 (z0, 2), ++ z0 = svdupq_lane (z0, 2)) ++ ++/* ++** dupq_lane_3_s8: ++** dup z0\.q, z0\.q\[3\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_3_s8, svint8_t, ++ z0 = 
svdupq_lane_s8 (z0, 3), ++ z0 = svdupq_lane (z0, 3)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_u16.c +new file mode 100644 +index 000000000..e5fba592f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_u16.c +@@ -0,0 +1,48 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dupq_lane_0_u16_tied: ++** dup z0\.q, z0\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_0_u16_tied, svuint16_t, ++ z0 = svdupq_lane_u16 (z0, 0), ++ z0 = svdupq_lane (z0, 0)) ++ ++/* ++** dupq_lane_0_u16_untied: ++** dup z0\.q, z1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_0_u16_untied, svuint16_t, ++ z0 = svdupq_lane_u16 (z1, 0), ++ z0 = svdupq_lane (z1, 0)) ++ ++/* ++** dupq_lane_1_u16: ++** dup z0\.q, z0\.q\[1\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_1_u16, svuint16_t, ++ z0 = svdupq_lane_u16 (z0, 1), ++ z0 = svdupq_lane (z0, 1)) ++ ++/* ++** dupq_lane_2_u16: ++** dup z0\.q, z0\.q\[2\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_2_u16, svuint16_t, ++ z0 = svdupq_lane_u16 (z0, 2), ++ z0 = svdupq_lane (z0, 2)) ++ ++/* ++** dupq_lane_3_u16: ++** dup z0\.q, z0\.q\[3\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_3_u16, svuint16_t, ++ z0 = svdupq_lane_u16 (z0, 3), ++ z0 = svdupq_lane (z0, 3)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_u32.c +new file mode 100644 +index 000000000..fb3346e45 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_u32.c +@@ -0,0 +1,48 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dupq_lane_0_u32_tied: ++** dup z0\.q, z0\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_0_u32_tied, svuint32_t, ++ z0 = svdupq_lane_u32 (z0, 0), ++ z0 = svdupq_lane (z0, 0)) ++ ++/* ++** dupq_lane_0_u32_untied: ++** dup z0\.q, z1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_0_u32_untied, svuint32_t, ++ z0 = svdupq_lane_u32 (z1, 0), ++ z0 = svdupq_lane (z1, 0)) ++ ++/* ++** dupq_lane_1_u32: ++** dup z0\.q, z0\.q\[1\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_1_u32, svuint32_t, ++ z0 = svdupq_lane_u32 (z0, 1), ++ z0 = svdupq_lane (z0, 1)) ++ ++/* ++** dupq_lane_2_u32: ++** dup z0\.q, z0\.q\[2\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_2_u32, svuint32_t, ++ z0 = svdupq_lane_u32 (z0, 2), ++ z0 = svdupq_lane (z0, 2)) ++ ++/* ++** dupq_lane_3_u32: ++** dup z0\.q, z0\.q\[3\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_3_u32, svuint32_t, ++ z0 = svdupq_lane_u32 (z0, 3), ++ z0 = svdupq_lane (z0, 3)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_u64.c +new file mode 100644 +index 000000000..22f1d5d55 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_u64.c +@@ -0,0 +1,48 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dupq_lane_0_u64_tied: ++** dup z0\.q, z0\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_0_u64_tied, svuint64_t, ++ z0 = svdupq_lane_u64 (z0, 0), ++ z0 = svdupq_lane (z0, 0)) ++ ++/* ++** dupq_lane_0_u64_untied: ++** dup z0\.q, z1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_0_u64_untied, svuint64_t, ++ z0 = svdupq_lane_u64 (z1, 0), ++ z0 = svdupq_lane (z1, 0)) ++ ++/* ++** dupq_lane_1_u64: ++** dup z0\.q, z0\.q\[1\] ++** ret ++*/ ++TEST_UNIFORM_Z 
(dupq_lane_1_u64, svuint64_t, ++ z0 = svdupq_lane_u64 (z0, 1), ++ z0 = svdupq_lane (z0, 1)) ++ ++/* ++** dupq_lane_2_u64: ++** dup z0\.q, z0\.q\[2\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_2_u64, svuint64_t, ++ z0 = svdupq_lane_u64 (z0, 2), ++ z0 = svdupq_lane (z0, 2)) ++ ++/* ++** dupq_lane_3_u64: ++** dup z0\.q, z0\.q\[3\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_3_u64, svuint64_t, ++ z0 = svdupq_lane_u64 (z0, 3), ++ z0 = svdupq_lane (z0, 3)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_u8.c +new file mode 100644 +index 000000000..ba16f836a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_u8.c +@@ -0,0 +1,48 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dupq_lane_0_u8_tied: ++** dup z0\.q, z0\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_0_u8_tied, svuint8_t, ++ z0 = svdupq_lane_u8 (z0, 0), ++ z0 = svdupq_lane (z0, 0)) ++ ++/* ++** dupq_lane_0_u8_untied: ++** dup z0\.q, z1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_0_u8_untied, svuint8_t, ++ z0 = svdupq_lane_u8 (z1, 0), ++ z0 = svdupq_lane (z1, 0)) ++ ++/* ++** dupq_lane_1_u8: ++** dup z0\.q, z0\.q\[1\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_1_u8, svuint8_t, ++ z0 = svdupq_lane_u8 (z0, 1), ++ z0 = svdupq_lane (z0, 1)) ++ ++/* ++** dupq_lane_2_u8: ++** dup z0\.q, z0\.q\[2\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_2_u8, svuint8_t, ++ z0 = svdupq_lane_u8 (z0, 2), ++ z0 = svdupq_lane (z0, 2)) ++ ++/* ++** dupq_lane_3_u8: ++** dup z0\.q, z0\.q\[3\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_lane_3_u8, svuint8_t, ++ z0 = svdupq_lane_u8 (z0, 3), ++ z0 = svdupq_lane (z0, 3)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_s16.c +new file mode 100644 +index 000000000..5a9a53b2d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_s16.c +@@ -0,0 +1,70 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dupq_25600s_s16: ++** mov z0\.s, #25600 ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_25600s_s16, svint16_t, ++ z0 = svdupq_n_s16 (25600, 0, 25600, 0, 25600, 0, 25600, 0), ++ z0 = svdupq_s16 (25600, 0, 25600, 0, 25600, 0, 25600, 0)) ++ ++/* ++** dupq_7ff00s_s16: ++** mov z0\.s, #524032 ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_7ff00s_s16, svint16_t, ++ z0 = svdupq_n_s16 (0xff00, 7, 0xff00, 7, 0xff00, 7, 0xff00, 7), ++ z0 = svdupq_s16 (0xff00, 7, 0xff00, 7, 0xff00, 7, 0xff00, 7)) ++ ++/* ++** dupq_65536d_s16: ++** mov z0\.d, #65536 ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_65536d_s16, svint16_t, ++ z0 = svdupq_n_s16 (0, 1, 0, 0, 0, 1, 0, 0), ++ z0 = svdupq_s16 (0, 1, 0, 0, 0, 1, 0, 0)) ++ ++/* ++** dupq_m2d_s16: ++** mov z0\.d, #-2 ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_m2d_s16, svint16_t, ++ z0 = svdupq_n_s16 (-2, -1, -1, -1, -2, -1, -1, -1), ++ z0 = svdupq_s16 (-2, -1, -1, -1, -2, -1, -1, -1)) ++ ++/* ++** dupq_4ddb_s16: ++** movi v([0-9]+)\.2d, 0xff0000ffff00ff ++** dup z0\.q, z\1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_4ddb_s16, svint16_t, ++ z0 = svdupq_n_s16 (0xff, -1, 0, 0xff, 0xff, -1, 0, 0xff), ++ z0 = svdupq_s16 (0xff, -1, 0, 0xff, 0xff, -1, 0, 0xff)) ++ ++ ++/* ++** dupq_a093s_s16: ++** mov (w[0-9]+), 41107 ++** mov z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_a093s_s16, svint16_t, ++ z0 = svdupq_n_s16 (0xa093, 0, 0xa093, 0, 0xa093, 0, 0xa093, 0), ++ z0 = svdupq_s16 (0xa093, 0, 0xa093, 0, 0xa093, 
0, 0xa093, 0)); ++ ++/* ++** dupq_pool_s16: ++** ... ++** ld1rqh z0\.h, p[0-7]/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_pool_s16, svint16_t, ++ z0 = svdupq_n_s16 (4, 10, 9, 77, 52, 22, 19, 50), ++ z0 = svdupq_s16 (4, 10, 9, 77, 52, 22, 19, 50)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_s32.c +new file mode 100644 +index 000000000..13b24c0db +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_s32.c +@@ -0,0 +1,61 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dupq_12800d_s32: ++** mov z0\.d, #12800 ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_12800d_s32, svint32_t, ++ z0 = svdupq_n_s32 (12800, 0, 12800, 0), ++ z0 = svdupq_s32 (12800, 0, 12800, 0)) ++ ++/* ++** dupq_fffffffed_s32: ++** mov z0\.d, #4294967294 ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_fffffffed_s32, svint32_t, ++ z0 = svdupq_n_s32 (-2, 0, -2, 0), ++ z0 = svdupq_s32 (-2, 0, -2, 0)) ++ ++/* ++** dupq_ff00ffffff00d_s32: ++** movi v([0-9]+)\.2d, 0xff00ffffff00 ++** dup z0\.q, z\1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_ff00ffffff00d_s32, svint32_t, ++ z0 = svdupq_n_s32 (-256, 0xff00, -256, 0xff00), ++ z0 = svdupq_s32 (-256, 0xff00, -256, 0xff00)) ++ ++/* ++** dupq_fedcd_s32: ++** mov (x[0-9]+), 65244 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_fedcd_s32, svint32_t, ++ z0 = svdupq_n_s32 (0xfedc, 0, 0xfedc, 0), ++ z0 = svdupq_s32 (0xfedc, 0, 0xfedc, 0)) ++ ++/* ++** dupq_1357ud_s32: ++** mov (x[0-9]+), 21264383082496 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_1357ud_s32, svint32_t, ++ z0 = svdupq_n_s32 (0, 0x1357, 0, 0x1357), ++ z0 = svdupq_s32 (0, 0x1357, 0, 0x1357)) ++ ++/* ++** dupq_pool_s32: ++** ... ++** ld1rqw z0\.s, p[0-7]/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_pool_s32, svint32_t, ++ z0 = svdupq_n_s32 (4, 10, 9, 77), ++ z0 = svdupq_s32 (4, 10, 9, 77)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_s64.c +new file mode 100644 +index 000000000..d2689fa5c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_s64.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dupq_pool_s64: ++** ... 
++** ld1rqd z0\.d, p[0-7]/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_pool_s64, svint64_t, ++ z0 = svdupq_n_s64 (4, 10), ++ z0 = svdupq_s64 (4, 10)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_s8.c +new file mode 100644 +index 000000000..30b36c162 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_s8.c +@@ -0,0 +1,99 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dupq_54h_s8: ++** mov z0\.h, #54 ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_54h_s8, svint8_t, ++ z0 = svdupq_n_s8 (54, 0, 54, 0, 54, 0, 54, 0, ++ 54, 0, 54, 0, 54, 0, 54, 0), ++ z0 = svdupq_s8 (54, 0, 54, 0, 54, 0, 54, 0, ++ 54, 0, 54, 0, 54, 0, 54, 0)) ++ ++/* ++** dupq_2560h_s8: ++** mov z0\.h, #2560 ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_2560h_s8, svint8_t, ++ z0 = svdupq_n_s8 (0, 10, 0, 10, 0, 10, 0, 10, ++ 0, 10, 0, 10, 0, 10, 0, 10), ++ z0 = svdupq_s8 (0, 10, 0, 10, 0, 10, 0, 10, ++ 0, 10, 0, 10, 0, 10, 0, 10)) ++ ++/* ++** dupq_5120s_s8: ++** mov z0\.s, #5120 ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_5120s_s8, svint8_t, ++ z0 = svdupq_n_s8 (0, 20, 0, 0, 0, 20, 0, 0, ++ 0, 20, 0, 0, 0, 20, 0, 0), ++ z0 = svdupq_s8 (0, 20, 0, 0, 0, 20, 0, 0, ++ 0, 20, 0, 0, 0, 20, 0, 0)) ++ ++/* ++** dupq_1ff00s_s8: ++** mov z0\.s, #130816 ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_1ff00s_s8, svint8_t, ++ z0 = svdupq_n_s8 (0, -1, 1, 0, 0, -1, 1, 0, ++ 0, -1, 1, 0, 0, -1, 1, 0), ++ z0 = svdupq_s8 (0, -1, 1, 0, 0, -1, 1, 0, ++ 0, -1, 1, 0, 0, -1, 1, 0)) ++ ++/* ++** dupq_96db_s8: ++** movi v([0-9]+)\.2d, 0xff0000ff00ffff00 ++** dup z0\.q, z\1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_96db_s8, svint8_t, ++ z0 = svdupq_n_s8 (0, -1, -1, 0, -1, 0, 0, -1, ++ 0, -1, -1, 0, -1, 0, 0, -1), ++ z0 = svdupq_s8 (0, -1, -1, 0, -1, 0, 0, -1, ++ 0, -1, -1, 0, -1, 0, 0, -1)) ++ ++/* ++** dupq_7755h_s8: ++** mov (w[0-9]+), 21879 ++** mov z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_7755h_s8, svint8_t, ++ z0 = svdupq_n_s8 (0x77, 0x55, 0x77, 0x55, ++ 0x77, 0x55, 0x77, 0x55, ++ 0x77, 0x55, 0x77, 0x55, ++ 0x77, 0x55, 0x77, 0x55), ++ z0 = svdupq_s8 (0x77, 0x55, 0x77, 0x55, ++ 0x77, 0x55, 0x77, 0x55, ++ 0x77, 0x55, 0x77, 0x55, ++ 0x77, 0x55, 0x77, 0x55)) ++ ++/* ++** dupq_729a0000s_s8: ++** mov (w[0-9]+), 1922695168 ++** mov z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_729a0000s_s8, svint8_t, ++ z0 = svdupq_n_s8 (0, 0, 0x9a, 0x72, 0, 0, 0x9a, 0x72, ++ 0, 0, 0x9a, 0x72, 0, 0, 0x9a, 0x72), ++ z0 = svdupq_s8 (0, 0, 0x9a, 0x72, 0, 0, 0x9a, 0x72, ++ 0, 0, 0x9a, 0x72, 0, 0, 0x9a, 0x72)) ++ ++/* ++** dupq_pool_s8: ++** ... 
++** ld1rqb z0\.b, p[0-7]/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_pool_s8, svint8_t, ++ z0 = svdupq_n_s8 (4, 10, 9, 77, 52, 22, 19, 50, ++ -1, 32, 44, 17, 23, 99, 53, 39), ++ z0 = svdupq_s8 (4, 10, 9, 77, 52, 22, 19, 50, ++ -1, 32, 44, 17, 23, 99, 53, 39)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_u16.c +new file mode 100644 +index 000000000..6ca13222d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_u16.c +@@ -0,0 +1,70 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dupq_25600s_u16: ++** mov z0\.s, #25600 ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_25600s_u16, svuint16_t, ++ z0 = svdupq_n_u16 (25600, 0, 25600, 0, 25600, 0, 25600, 0), ++ z0 = svdupq_u16 (25600, 0, 25600, 0, 25600, 0, 25600, 0)) ++ ++/* ++** dupq_7ff00s_u16: ++** mov z0\.s, #524032 ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_7ff00s_u16, svuint16_t, ++ z0 = svdupq_n_u16 (0xff00, 7, 0xff00, 7, 0xff00, 7, 0xff00, 7), ++ z0 = svdupq_u16 (0xff00, 7, 0xff00, 7, 0xff00, 7, 0xff00, 7)) ++ ++/* ++** dupq_65536d_u16: ++** mov z0\.d, #65536 ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_65536d_u16, svuint16_t, ++ z0 = svdupq_n_u16 (0, 1, 0, 0, 0, 1, 0, 0), ++ z0 = svdupq_u16 (0, 1, 0, 0, 0, 1, 0, 0)) ++ ++/* ++** dupq_m2d_u16: ++** mov z0\.d, #-2 ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_m2d_u16, svuint16_t, ++ z0 = svdupq_n_u16 (-2, -1, -1, -1, -2, -1, -1, -1), ++ z0 = svdupq_u16 (-2, -1, -1, -1, -2, -1, -1, -1)) ++ ++/* ++** dupq_4ddb_u16: ++** movi v([0-9]+)\.2d, 0xff0000ffff00ff ++** dup z0\.q, z\1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_4ddb_u16, svuint16_t, ++ z0 = svdupq_n_u16 (0xff, -1, 0, 0xff, 0xff, -1, 0, 0xff), ++ z0 = svdupq_u16 (0xff, -1, 0, 0xff, 0xff, -1, 0, 0xff)) ++ ++ ++/* ++** dupq_a093s_u16: ++** mov (w[0-9]+), 41107 ++** mov z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_a093s_u16, svuint16_t, ++ z0 = svdupq_n_u16 (0xa093, 0, 0xa093, 0, 0xa093, 0, 0xa093, 0), ++ z0 = svdupq_u16 (0xa093, 0, 0xa093, 0, 0xa093, 0, 0xa093, 0)); ++ ++/* ++** dupq_pool_u16: ++** ... 
++** ld1rqh z0\.h, p[0-7]/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_pool_u16, svuint16_t, ++ z0 = svdupq_n_u16 (4, 10, 9, 77, 52, 22, 19, 50), ++ z0 = svdupq_u16 (4, 10, 9, 77, 52, 22, 19, 50)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_u32.c +new file mode 100644 +index 000000000..3669bf8a1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_u32.c +@@ -0,0 +1,61 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dupq_12800d_u32: ++** mov z0\.d, #12800 ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_12800d_u32, svuint32_t, ++ z0 = svdupq_n_u32 (12800, 0, 12800, 0), ++ z0 = svdupq_u32 (12800, 0, 12800, 0)) ++ ++/* ++** dupq_fffffffed_u32: ++** mov z0\.d, #4294967294 ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_fffffffed_u32, svuint32_t, ++ z0 = svdupq_n_u32 (-2, 0, -2, 0), ++ z0 = svdupq_u32 (-2, 0, -2, 0)) ++ ++/* ++** dupq_ff00ffffff00d_u32: ++** movi v([0-9]+)\.2d, 0xff00ffffff00 ++** dup z0\.q, z\1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_ff00ffffff00d_u32, svuint32_t, ++ z0 = svdupq_n_u32 (-256, 0xff00, -256, 0xff00), ++ z0 = svdupq_u32 (-256, 0xff00, -256, 0xff00)) ++ ++/* ++** dupq_fedcd_u32: ++** mov (x[0-9]+), 65244 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_fedcd_u32, svuint32_t, ++ z0 = svdupq_n_u32 (0xfedc, 0, 0xfedc, 0), ++ z0 = svdupq_u32 (0xfedc, 0, 0xfedc, 0)) ++ ++/* ++** dupq_1357ud_u32: ++** mov (x[0-9]+), 21264383082496 ++** mov z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_1357ud_u32, svuint32_t, ++ z0 = svdupq_n_u32 (0, 0x1357, 0, 0x1357), ++ z0 = svdupq_u32 (0, 0x1357, 0, 0x1357)) ++ ++/* ++** dupq_pool_u32: ++** ... ++** ld1rqw z0\.s, p[0-7]/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_pool_u32, svuint32_t, ++ z0 = svdupq_n_u32 (4, 10, 9, 77), ++ z0 = svdupq_u32 (4, 10, 9, 77)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_u64.c +new file mode 100644 +index 000000000..cb655a15a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_u64.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dupq_pool_u64: ++** ... 
++** ld1rqd z0\.d, p[0-7]/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_pool_u64, svuint64_t, ++ z0 = svdupq_n_u64 (4, 10), ++ z0 = svdupq_u64 (4, 10)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_u8.c +new file mode 100644 +index 000000000..8b40c2b41 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_u8.c +@@ -0,0 +1,99 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** dupq_54h_u8: ++** mov z0\.h, #54 ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_54h_u8, svuint8_t, ++ z0 = svdupq_n_u8 (54, 0, 54, 0, 54, 0, 54, 0, ++ 54, 0, 54, 0, 54, 0, 54, 0), ++ z0 = svdupq_u8 (54, 0, 54, 0, 54, 0, 54, 0, ++ 54, 0, 54, 0, 54, 0, 54, 0)) ++ ++/* ++** dupq_2560h_u8: ++** mov z0\.h, #2560 ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_2560h_u8, svuint8_t, ++ z0 = svdupq_n_u8 (0, 10, 0, 10, 0, 10, 0, 10, ++ 0, 10, 0, 10, 0, 10, 0, 10), ++ z0 = svdupq_u8 (0, 10, 0, 10, 0, 10, 0, 10, ++ 0, 10, 0, 10, 0, 10, 0, 10)) ++ ++/* ++** dupq_5120s_u8: ++** mov z0\.s, #5120 ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_5120s_u8, svuint8_t, ++ z0 = svdupq_n_u8 (0, 20, 0, 0, 0, 20, 0, 0, ++ 0, 20, 0, 0, 0, 20, 0, 0), ++ z0 = svdupq_u8 (0, 20, 0, 0, 0, 20, 0, 0, ++ 0, 20, 0, 0, 0, 20, 0, 0)) ++ ++/* ++** dupq_1ff00s_u8: ++** mov z0\.s, #130816 ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_1ff00s_u8, svuint8_t, ++ z0 = svdupq_n_u8 (0, -1, 1, 0, 0, -1, 1, 0, ++ 0, -1, 1, 0, 0, -1, 1, 0), ++ z0 = svdupq_u8 (0, -1, 1, 0, 0, -1, 1, 0, ++ 0, -1, 1, 0, 0, -1, 1, 0)) ++ ++/* ++** dupq_96db_u8: ++** movi v([0-9]+)\.2d, 0xff0000ff00ffff00 ++** dup z0\.q, z\1\.q\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_96db_u8, svuint8_t, ++ z0 = svdupq_n_u8 (0, -1, -1, 0, -1, 0, 0, -1, ++ 0, -1, -1, 0, -1, 0, 0, -1), ++ z0 = svdupq_u8 (0, -1, -1, 0, -1, 0, 0, -1, ++ 0, -1, -1, 0, -1, 0, 0, -1)) ++ ++/* ++** dupq_7755h_u8: ++** mov (w[0-9]+), 21879 ++** mov z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_7755h_u8, svuint8_t, ++ z0 = svdupq_n_u8 (0x77, 0x55, 0x77, 0x55, ++ 0x77, 0x55, 0x77, 0x55, ++ 0x77, 0x55, 0x77, 0x55, ++ 0x77, 0x55, 0x77, 0x55), ++ z0 = svdupq_u8 (0x77, 0x55, 0x77, 0x55, ++ 0x77, 0x55, 0x77, 0x55, ++ 0x77, 0x55, 0x77, 0x55, ++ 0x77, 0x55, 0x77, 0x55)) ++ ++/* ++** dupq_729a0000s_u8: ++** mov (w[0-9]+), 1922695168 ++** mov z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_729a0000s_u8, svuint8_t, ++ z0 = svdupq_n_u8 (0, 0, 0x9a, 0x72, 0, 0, 0x9a, 0x72, ++ 0, 0, 0x9a, 0x72, 0, 0, 0x9a, 0x72), ++ z0 = svdupq_u8 (0, 0, 0x9a, 0x72, 0, 0, 0x9a, 0x72, ++ 0, 0, 0x9a, 0x72, 0, 0, 0x9a, 0x72)) ++ ++/* ++** dupq_pool_u8: ++** ... 
++** ld1rqb z0\.b, p[0-7]/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_UNIFORM_Z (dupq_pool_u8, svuint8_t, ++ z0 = svdupq_n_u8 (4, 10, 9, 77, 52, 22, 19, 50, ++ -1, 32, 44, 17, 23, 99, 53, 39), ++ z0 = svdupq_u8 (4, 10, 9, 77, 52, 22, 19, 50, ++ -1, 32, 44, 17, 23, 99, 53, 39)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_b.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_b.c +new file mode 100644 +index 000000000..961ae84c0 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_b.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** eor_b_z_tied1: ++** eor p0\.b, p3/z, (p0\.b, p1\.b|p1\.b, p0\.b) ++** ret ++*/ ++TEST_UNIFORM_P (eor_b_z_tied1, ++ p0 = sveor_b_z (p3, p0, p1), ++ p0 = sveor_z (p3, p0, p1)) ++ ++/* ++** eor_b_z_tied2: ++** eor p0\.b, p3/z, (p0\.b, p1\.b|p1\.b, p0\.b) ++** ret ++*/ ++TEST_UNIFORM_P (eor_b_z_tied2, ++ p0 = sveor_b_z (p3, p1, p0), ++ p0 = sveor_z (p3, p1, p0)) ++ ++/* ++** eor_b_z_untied: ++** eor p0\.b, p3/z, (p1\.b, p2\.b|p2\.b, p1\.b) ++** ret ++*/ ++TEST_UNIFORM_P (eor_b_z_untied, ++ p0 = sveor_b_z (p3, p1, p2), ++ p0 = sveor_z (p3, p1, p2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_s16.c +new file mode 100644 +index 000000000..7cf73609a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_s16.c +@@ -0,0 +1,376 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** eor_s16_m_tied1: ++** eor z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (eor_s16_m_tied1, svint16_t, ++ z0 = sveor_s16_m (p0, z0, z1), ++ z0 = sveor_m (p0, z0, z1)) ++ ++/* ++** eor_s16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** eor z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (eor_s16_m_tied2, svint16_t, ++ z0 = sveor_s16_m (p0, z1, z0), ++ z0 = sveor_m (p0, z1, z0)) ++ ++/* ++** eor_s16_m_untied: ++** movprfx z0, z1 ++** eor z0\.h, p0/m, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (eor_s16_m_untied, svint16_t, ++ z0 = sveor_s16_m (p0, z1, z2), ++ z0 = sveor_m (p0, z1, z2)) ++ ++/* ++** eor_w0_s16_m_tied1: ++** mov (z[0-9]+\.h), w0 ++** eor z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (eor_w0_s16_m_tied1, svint16_t, int16_t, ++ z0 = sveor_n_s16_m (p0, z0, x0), ++ z0 = sveor_m (p0, z0, x0)) ++ ++/* ++** eor_w0_s16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), w0 ++** movprfx z0, z1 ++** eor z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (eor_w0_s16_m_untied, svint16_t, int16_t, ++ z0 = sveor_n_s16_m (p0, z1, x0), ++ z0 = sveor_m (p0, z1, x0)) ++ ++/* ++** eor_1_s16_m_tied1: ++** mov (z[0-9]+\.h), #1 ++** eor z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_1_s16_m_tied1, svint16_t, ++ z0 = sveor_n_s16_m (p0, z0, 1), ++ z0 = sveor_m (p0, z0, 1)) ++ ++/* ++** eor_1_s16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), #1 ++** movprfx z0, z1 ++** eor z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_1_s16_m_untied, svint16_t, ++ z0 = sveor_n_s16_m (p0, z1, 1), ++ z0 = sveor_m (p0, z1, 1)) ++ ++/* ++** eor_m2_s16_m: ++** mov (z[0-9]+\.h), #-2 ++** eor z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m2_s16_m, svint16_t, ++ z0 = sveor_n_s16_m (p0, z0, -2), ++ z0 = sveor_m (p0, z0, -2)) ++ ++/* ++** eor_s16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** eor z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (eor_s16_z_tied1, svint16_t, ++ 
z0 = sveor_s16_z (p0, z0, z1), ++ z0 = sveor_z (p0, z0, z1)) ++ ++/* ++** eor_s16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** eor z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (eor_s16_z_tied2, svint16_t, ++ z0 = sveor_s16_z (p0, z1, z0), ++ z0 = sveor_z (p0, z1, z0)) ++ ++/* ++** eor_s16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** eor z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** eor z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_s16_z_untied, svint16_t, ++ z0 = sveor_s16_z (p0, z1, z2), ++ z0 = sveor_z (p0, z1, z2)) ++ ++/* ++** eor_w0_s16_z_tied1: ++** mov (z[0-9]+\.h), w0 ++** movprfx z0\.h, p0/z, z0\.h ++** eor z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (eor_w0_s16_z_tied1, svint16_t, int16_t, ++ z0 = sveor_n_s16_z (p0, z0, x0), ++ z0 = sveor_z (p0, z0, x0)) ++ ++/* ++** eor_w0_s16_z_untied: ++** mov (z[0-9]+\.h), w0 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** eor z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** eor z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (eor_w0_s16_z_untied, svint16_t, int16_t, ++ z0 = sveor_n_s16_z (p0, z1, x0), ++ z0 = sveor_z (p0, z1, x0)) ++ ++/* ++** eor_1_s16_z_tied1: ++** mov (z[0-9]+\.h), #1 ++** movprfx z0\.h, p0/z, z0\.h ++** eor z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_1_s16_z_tied1, svint16_t, ++ z0 = sveor_n_s16_z (p0, z0, 1), ++ z0 = sveor_z (p0, z0, 1)) ++ ++/* ++** eor_1_s16_z_untied: ++** mov (z[0-9]+\.h), #1 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** eor z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** eor z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_1_s16_z_untied, svint16_t, ++ z0 = sveor_n_s16_z (p0, z1, 1), ++ z0 = sveor_z (p0, z1, 1)) ++ ++/* ++** eor_s16_x_tied1: ++** eor z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_s16_x_tied1, svint16_t, ++ z0 = sveor_s16_x (p0, z0, z1), ++ z0 = sveor_x (p0, z0, z1)) ++ ++/* ++** eor_s16_x_tied2: ++** eor z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_s16_x_tied2, svint16_t, ++ z0 = sveor_s16_x (p0, z1, z0), ++ z0 = sveor_x (p0, z1, z0)) ++ ++/* ++** eor_s16_x_untied: ++** eor z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_s16_x_untied, svint16_t, ++ z0 = sveor_s16_x (p0, z1, z2), ++ z0 = sveor_x (p0, z1, z2)) ++ ++/* ++** eor_w0_s16_x_tied1: ++** mov (z[0-9]+)\.h, w0 ++** eor z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (eor_w0_s16_x_tied1, svint16_t, int16_t, ++ z0 = sveor_n_s16_x (p0, z0, x0), ++ z0 = sveor_x (p0, z0, x0)) ++ ++/* ++** eor_w0_s16_x_untied: ++** mov (z[0-9]+)\.h, w0 ++** eor z0\.d, (z1\.d, \1\.d|\1\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (eor_w0_s16_x_untied, svint16_t, int16_t, ++ z0 = sveor_n_s16_x (p0, z1, x0), ++ z0 = sveor_x (p0, z1, x0)) ++ ++/* ++** eor_1_s16_x_tied1: ++** eor z0\.h, z0\.h, #0x1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_1_s16_x_tied1, svint16_t, ++ z0 = sveor_n_s16_x (p0, z0, 1), ++ z0 = sveor_x (p0, z0, 1)) ++ ++/* ++** eor_1_s16_x_untied: ++** movprfx z0, z1 ++** eor z0\.h, z0\.h, #0x1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_1_s16_x_untied, svint16_t, ++ z0 = sveor_n_s16_x (p0, z1, 1), ++ z0 = sveor_x (p0, z1, 1)) ++ ++/* ++** eor_127_s16_x: ++** eor z0\.h, z0\.h, #0x7f ++** ret ++*/ ++TEST_UNIFORM_Z (eor_127_s16_x, svint16_t, ++ z0 = sveor_n_s16_x (p0, z0, 127), ++ z0 = sveor_x (p0, z0, 127)) ++ ++/* ++** eor_128_s16_x: ++** eor z0\.h, z0\.h, #0x80 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_128_s16_x, 
svint16_t, ++ z0 = sveor_n_s16_x (p0, z0, 128), ++ z0 = sveor_x (p0, z0, 128)) ++ ++/* ++** eor_255_s16_x: ++** eor z0\.h, z0\.h, #0xff ++** ret ++*/ ++TEST_UNIFORM_Z (eor_255_s16_x, svint16_t, ++ z0 = sveor_n_s16_x (p0, z0, 255), ++ z0 = sveor_x (p0, z0, 255)) ++ ++/* ++** eor_256_s16_x: ++** eor z0\.h, z0\.h, #0x100 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_256_s16_x, svint16_t, ++ z0 = sveor_n_s16_x (p0, z0, 256), ++ z0 = sveor_x (p0, z0, 256)) ++ ++/* ++** eor_257_s16_x: ++** eor z0\.h, z0\.h, #0x101 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_257_s16_x, svint16_t, ++ z0 = sveor_n_s16_x (p0, z0, 257), ++ z0 = sveor_x (p0, z0, 257)) ++ ++/* ++** eor_512_s16_x: ++** eor z0\.h, z0\.h, #0x200 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_512_s16_x, svint16_t, ++ z0 = sveor_n_s16_x (p0, z0, 512), ++ z0 = sveor_x (p0, z0, 512)) ++ ++/* ++** eor_65280_s16_x: ++** eor z0\.h, z0\.h, #0xff00 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_65280_s16_x, svint16_t, ++ z0 = sveor_n_s16_x (p0, z0, 0xff00), ++ z0 = sveor_x (p0, z0, 0xff00)) ++ ++/* ++** eor_m127_s16_x: ++** eor z0\.h, z0\.h, #0xff81 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m127_s16_x, svint16_t, ++ z0 = sveor_n_s16_x (p0, z0, -127), ++ z0 = sveor_x (p0, z0, -127)) ++ ++/* ++** eor_m128_s16_x: ++** eor z0\.h, z0\.h, #0xff80 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m128_s16_x, svint16_t, ++ z0 = sveor_n_s16_x (p0, z0, -128), ++ z0 = sveor_x (p0, z0, -128)) ++ ++/* ++** eor_m255_s16_x: ++** eor z0\.h, z0\.h, #0xff01 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m255_s16_x, svint16_t, ++ z0 = sveor_n_s16_x (p0, z0, -255), ++ z0 = sveor_x (p0, z0, -255)) ++ ++/* ++** eor_m256_s16_x: ++** eor z0\.h, z0\.h, #0xff00 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m256_s16_x, svint16_t, ++ z0 = sveor_n_s16_x (p0, z0, -256), ++ z0 = sveor_x (p0, z0, -256)) ++ ++/* ++** eor_m257_s16_x: ++** eor z0\.h, z0\.h, #0xfeff ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m257_s16_x, svint16_t, ++ z0 = sveor_n_s16_x (p0, z0, -257), ++ z0 = sveor_x (p0, z0, -257)) ++ ++/* ++** eor_m512_s16_x: ++** eor z0\.h, z0\.h, #0xfe00 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m512_s16_x, svint16_t, ++ z0 = sveor_n_s16_x (p0, z0, -512), ++ z0 = sveor_x (p0, z0, -512)) ++ ++/* ++** eor_m32768_s16_x: ++** eor z0\.h, z0\.h, #0x8000 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m32768_s16_x, svint16_t, ++ z0 = sveor_n_s16_x (p0, z0, -0x8000), ++ z0 = sveor_x (p0, z0, -0x8000)) ++ ++/* ++** eor_5_s16_x: ++** mov (z[0-9]+)\.h, #5 ++** eor z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_5_s16_x, svint16_t, ++ z0 = sveor_n_s16_x (p0, z0, 5), ++ z0 = sveor_x (p0, z0, 5)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_s32.c +new file mode 100644 +index 000000000..d5aecb201 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_s32.c +@@ -0,0 +1,372 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** eor_s32_m_tied1: ++** eor z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (eor_s32_m_tied1, svint32_t, ++ z0 = sveor_s32_m (p0, z0, z1), ++ z0 = sveor_m (p0, z0, z1)) ++ ++/* ++** eor_s32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** eor z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (eor_s32_m_tied2, svint32_t, ++ z0 = sveor_s32_m (p0, z1, z0), ++ z0 = sveor_m (p0, z1, z0)) ++ ++/* ++** eor_s32_m_untied: ++** movprfx z0, z1 ++** eor z0\.s, p0/m, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (eor_s32_m_untied, svint32_t, ++ z0 = sveor_s32_m (p0, z1, z2), ++ z0 = 
sveor_m (p0, z1, z2)) ++ ++/* ++** eor_w0_s32_m_tied1: ++** mov (z[0-9]+\.s), w0 ++** eor z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (eor_w0_s32_m_tied1, svint32_t, int32_t, ++ z0 = sveor_n_s32_m (p0, z0, x0), ++ z0 = sveor_m (p0, z0, x0)) ++ ++/* ++** eor_w0_s32_m_untied: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0, z1 ++** eor z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (eor_w0_s32_m_untied, svint32_t, int32_t, ++ z0 = sveor_n_s32_m (p0, z1, x0), ++ z0 = sveor_m (p0, z1, x0)) ++ ++/* ++** eor_1_s32_m_tied1: ++** mov (z[0-9]+\.s), #1 ++** eor z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_1_s32_m_tied1, svint32_t, ++ z0 = sveor_n_s32_m (p0, z0, 1), ++ z0 = sveor_m (p0, z0, 1)) ++ ++/* ++** eor_1_s32_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.s), #1 ++** movprfx z0, z1 ++** eor z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_1_s32_m_untied, svint32_t, ++ z0 = sveor_n_s32_m (p0, z1, 1), ++ z0 = sveor_m (p0, z1, 1)) ++ ++/* ++** eor_m2_s32_m: ++** mov (z[0-9]+\.s), #-2 ++** eor z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m2_s32_m, svint32_t, ++ z0 = sveor_n_s32_m (p0, z0, -2), ++ z0 = sveor_m (p0, z0, -2)) ++ ++/* ++** eor_s32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** eor z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (eor_s32_z_tied1, svint32_t, ++ z0 = sveor_s32_z (p0, z0, z1), ++ z0 = sveor_z (p0, z0, z1)) ++ ++/* ++** eor_s32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** eor z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (eor_s32_z_tied2, svint32_t, ++ z0 = sveor_s32_z (p0, z1, z0), ++ z0 = sveor_z (p0, z1, z0)) ++ ++/* ++** eor_s32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** eor z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** eor z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_s32_z_untied, svint32_t, ++ z0 = sveor_s32_z (p0, z1, z2), ++ z0 = sveor_z (p0, z1, z2)) ++ ++/* ++** eor_w0_s32_z_tied1: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0\.s, p0/z, z0\.s ++** eor z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (eor_w0_s32_z_tied1, svint32_t, int32_t, ++ z0 = sveor_n_s32_z (p0, z0, x0), ++ z0 = sveor_z (p0, z0, x0)) ++ ++/* ++** eor_w0_s32_z_untied: ++** mov (z[0-9]+\.s), w0 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** eor z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** eor z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (eor_w0_s32_z_untied, svint32_t, int32_t, ++ z0 = sveor_n_s32_z (p0, z1, x0), ++ z0 = sveor_z (p0, z1, x0)) ++ ++/* ++** eor_1_s32_z_tied1: ++** mov (z[0-9]+\.s), #1 ++** movprfx z0\.s, p0/z, z0\.s ++** eor z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_1_s32_z_tied1, svint32_t, ++ z0 = sveor_n_s32_z (p0, z0, 1), ++ z0 = sveor_z (p0, z0, 1)) ++ ++/* ++** eor_1_s32_z_untied: ++** mov (z[0-9]+\.s), #1 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** eor z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** eor z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_1_s32_z_untied, svint32_t, ++ z0 = sveor_n_s32_z (p0, z1, 1), ++ z0 = sveor_z (p0, z1, 1)) ++ ++/* ++** eor_s32_x_tied1: ++** eor z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_s32_x_tied1, svint32_t, ++ z0 = sveor_s32_x (p0, z0, z1), ++ z0 = sveor_x (p0, z0, z1)) ++ ++/* ++** eor_s32_x_tied2: ++** eor z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_s32_x_tied2, svint32_t, ++ z0 = sveor_s32_x (p0, z1, z0), ++ z0 = sveor_x (p0, z1, z0)) ++ ++/* ++** 
eor_s32_x_untied: ++** eor z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_s32_x_untied, svint32_t, ++ z0 = sveor_s32_x (p0, z1, z2), ++ z0 = sveor_x (p0, z1, z2)) ++ ++/* ++** eor_w0_s32_x_tied1: ++** mov (z[0-9]+)\.s, w0 ++** eor z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (eor_w0_s32_x_tied1, svint32_t, int32_t, ++ z0 = sveor_n_s32_x (p0, z0, x0), ++ z0 = sveor_x (p0, z0, x0)) ++ ++/* ++** eor_w0_s32_x_untied: ++** mov (z[0-9]+)\.s, w0 ++** eor z0\.d, (z1\.d, \1\.d|\1\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (eor_w0_s32_x_untied, svint32_t, int32_t, ++ z0 = sveor_n_s32_x (p0, z1, x0), ++ z0 = sveor_x (p0, z1, x0)) ++ ++/* ++** eor_1_s32_x_tied1: ++** eor z0\.s, z0\.s, #0x1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_1_s32_x_tied1, svint32_t, ++ z0 = sveor_n_s32_x (p0, z0, 1), ++ z0 = sveor_x (p0, z0, 1)) ++ ++/* ++** eor_1_s32_x_untied: ++** movprfx z0, z1 ++** eor z0\.s, z0\.s, #0x1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_1_s32_x_untied, svint32_t, ++ z0 = sveor_n_s32_x (p0, z1, 1), ++ z0 = sveor_x (p0, z1, 1)) ++ ++/* ++** eor_127_s32_x: ++** eor z0\.s, z0\.s, #0x7f ++** ret ++*/ ++TEST_UNIFORM_Z (eor_127_s32_x, svint32_t, ++ z0 = sveor_n_s32_x (p0, z0, 127), ++ z0 = sveor_x (p0, z0, 127)) ++ ++/* ++** eor_128_s32_x: ++** eor z0\.s, z0\.s, #0x80 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_128_s32_x, svint32_t, ++ z0 = sveor_n_s32_x (p0, z0, 128), ++ z0 = sveor_x (p0, z0, 128)) ++ ++/* ++** eor_255_s32_x: ++** eor z0\.s, z0\.s, #0xff ++** ret ++*/ ++TEST_UNIFORM_Z (eor_255_s32_x, svint32_t, ++ z0 = sveor_n_s32_x (p0, z0, 255), ++ z0 = sveor_x (p0, z0, 255)) ++ ++/* ++** eor_256_s32_x: ++** eor z0\.s, z0\.s, #0x100 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_256_s32_x, svint32_t, ++ z0 = sveor_n_s32_x (p0, z0, 256), ++ z0 = sveor_x (p0, z0, 256)) ++ ++/* TODO: Bad code and needs fixing. 
*/ ++TEST_UNIFORM_Z (eor_257_s32_x, svint32_t, ++ z0 = sveor_n_s32_x (p0, z0, 257), ++ z0 = sveor_x (p0, z0, 257)) ++ ++/* ++** eor_512_s32_x: ++** eor z0\.s, z0\.s, #0x200 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_512_s32_x, svint32_t, ++ z0 = sveor_n_s32_x (p0, z0, 512), ++ z0 = sveor_x (p0, z0, 512)) ++ ++/* ++** eor_65280_s32_x: ++** eor z0\.s, z0\.s, #0xff00 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_65280_s32_x, svint32_t, ++ z0 = sveor_n_s32_x (p0, z0, 0xff00), ++ z0 = sveor_x (p0, z0, 0xff00)) ++ ++/* ++** eor_m127_s32_x: ++** eor z0\.s, z0\.s, #0xffffff81 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m127_s32_x, svint32_t, ++ z0 = sveor_n_s32_x (p0, z0, -127), ++ z0 = sveor_x (p0, z0, -127)) ++ ++/* ++** eor_m128_s32_x: ++** eor z0\.s, z0\.s, #0xffffff80 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m128_s32_x, svint32_t, ++ z0 = sveor_n_s32_x (p0, z0, -128), ++ z0 = sveor_x (p0, z0, -128)) ++ ++/* ++** eor_m255_s32_x: ++** eor z0\.s, z0\.s, #0xffffff01 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m255_s32_x, svint32_t, ++ z0 = sveor_n_s32_x (p0, z0, -255), ++ z0 = sveor_x (p0, z0, -255)) ++ ++/* ++** eor_m256_s32_x: ++** eor z0\.s, z0\.s, #0xffffff00 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m256_s32_x, svint32_t, ++ z0 = sveor_n_s32_x (p0, z0, -256), ++ z0 = sveor_x (p0, z0, -256)) ++ ++/* ++** eor_m257_s32_x: ++** eor z0\.s, z0\.s, #0xfffffeff ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m257_s32_x, svint32_t, ++ z0 = sveor_n_s32_x (p0, z0, -257), ++ z0 = sveor_x (p0, z0, -257)) ++ ++/* ++** eor_m512_s32_x: ++** eor z0\.s, z0\.s, #0xfffffe00 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m512_s32_x, svint32_t, ++ z0 = sveor_n_s32_x (p0, z0, -512), ++ z0 = sveor_x (p0, z0, -512)) ++ ++/* ++** eor_m32768_s32_x: ++** eor z0\.s, z0\.s, #0xffff8000 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m32768_s32_x, svint32_t, ++ z0 = sveor_n_s32_x (p0, z0, -0x8000), ++ z0 = sveor_x (p0, z0, -0x8000)) ++ ++/* ++** eor_5_s32_x: ++** mov (z[0-9]+)\.s, #5 ++** eor z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_5_s32_x, svint32_t, ++ z0 = sveor_n_s32_x (p0, z0, 5), ++ z0 = sveor_x (p0, z0, 5)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_s64.c +new file mode 100644 +index 000000000..157128974 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_s64.c +@@ -0,0 +1,372 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** eor_s64_m_tied1: ++** eor z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (eor_s64_m_tied1, svint64_t, ++ z0 = sveor_s64_m (p0, z0, z1), ++ z0 = sveor_m (p0, z0, z1)) ++ ++/* ++** eor_s64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** eor z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_s64_m_tied2, svint64_t, ++ z0 = sveor_s64_m (p0, z1, z0), ++ z0 = sveor_m (p0, z1, z0)) ++ ++/* ++** eor_s64_m_untied: ++** movprfx z0, z1 ++** eor z0\.d, p0/m, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (eor_s64_m_untied, svint64_t, ++ z0 = sveor_s64_m (p0, z1, z2), ++ z0 = sveor_m (p0, z1, z2)) ++ ++/* ++** eor_x0_s64_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** eor z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (eor_x0_s64_m_tied1, svint64_t, int64_t, ++ z0 = sveor_n_s64_m (p0, z0, x0), ++ z0 = sveor_m (p0, z0, x0)) ++ ++/* ++** eor_x0_s64_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** eor z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (eor_x0_s64_m_untied, svint64_t, int64_t, ++ z0 = sveor_n_s64_m (p0, z1, x0), ++ z0 = sveor_m (p0, 
z1, x0)) ++ ++/* ++** eor_1_s64_m_tied1: ++** mov (z[0-9]+\.d), #1 ++** eor z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_1_s64_m_tied1, svint64_t, ++ z0 = sveor_n_s64_m (p0, z0, 1), ++ z0 = sveor_m (p0, z0, 1)) ++ ++/* ++** eor_1_s64_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #1 ++** movprfx z0, z1 ++** eor z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_1_s64_m_untied, svint64_t, ++ z0 = sveor_n_s64_m (p0, z1, 1), ++ z0 = sveor_m (p0, z1, 1)) ++ ++/* ++** eor_m2_s64_m: ++** mov (z[0-9]+\.d), #-2 ++** eor z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m2_s64_m, svint64_t, ++ z0 = sveor_n_s64_m (p0, z0, -2), ++ z0 = sveor_m (p0, z0, -2)) ++ ++/* ++** eor_s64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** eor z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (eor_s64_z_tied1, svint64_t, ++ z0 = sveor_s64_z (p0, z0, z1), ++ z0 = sveor_z (p0, z0, z1)) ++ ++/* ++** eor_s64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** eor z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (eor_s64_z_tied2, svint64_t, ++ z0 = sveor_s64_z (p0, z1, z0), ++ z0 = sveor_z (p0, z1, z0)) ++ ++/* ++** eor_s64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** eor z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** eor z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_s64_z_untied, svint64_t, ++ z0 = sveor_s64_z (p0, z1, z2), ++ z0 = sveor_z (p0, z1, z2)) ++ ++/* ++** eor_x0_s64_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z0\.d ++** eor z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (eor_x0_s64_z_tied1, svint64_t, int64_t, ++ z0 = sveor_n_s64_z (p0, z0, x0), ++ z0 = sveor_z (p0, z0, x0)) ++ ++/* ++** eor_x0_s64_z_untied: ++** mov (z[0-9]+\.d), x0 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** eor z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** eor z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (eor_x0_s64_z_untied, svint64_t, int64_t, ++ z0 = sveor_n_s64_z (p0, z1, x0), ++ z0 = sveor_z (p0, z1, x0)) ++ ++/* ++** eor_1_s64_z_tied1: ++** mov (z[0-9]+\.d), #1 ++** movprfx z0\.d, p0/z, z0\.d ++** eor z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_1_s64_z_tied1, svint64_t, ++ z0 = sveor_n_s64_z (p0, z0, 1), ++ z0 = sveor_z (p0, z0, 1)) ++ ++/* ++** eor_1_s64_z_untied: ++** mov (z[0-9]+\.d), #1 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** eor z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** eor z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_1_s64_z_untied, svint64_t, ++ z0 = sveor_n_s64_z (p0, z1, 1), ++ z0 = sveor_z (p0, z1, 1)) ++ ++/* ++** eor_s64_x_tied1: ++** eor z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_s64_x_tied1, svint64_t, ++ z0 = sveor_s64_x (p0, z0, z1), ++ z0 = sveor_x (p0, z0, z1)) ++ ++/* ++** eor_s64_x_tied2: ++** eor z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_s64_x_tied2, svint64_t, ++ z0 = sveor_s64_x (p0, z1, z0), ++ z0 = sveor_x (p0, z1, z0)) ++ ++/* ++** eor_s64_x_untied: ++** eor z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_s64_x_untied, svint64_t, ++ z0 = sveor_s64_x (p0, z1, z2), ++ z0 = sveor_x (p0, z1, z2)) ++ ++/* ++** eor_x0_s64_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** eor z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (eor_x0_s64_x_tied1, svint64_t, int64_t, ++ z0 = sveor_n_s64_x (p0, z0, x0), ++ z0 = sveor_x (p0, z0, x0)) ++ ++/* ++** eor_x0_s64_x_untied: ++** mov (z[0-9]+\.d), x0 ++** eor z0\.d, (z1\.d, 
\1|\1, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (eor_x0_s64_x_untied, svint64_t, int64_t, ++ z0 = sveor_n_s64_x (p0, z1, x0), ++ z0 = sveor_x (p0, z1, x0)) ++ ++/* ++** eor_1_s64_x_tied1: ++** eor z0\.d, z0\.d, #0x1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_1_s64_x_tied1, svint64_t, ++ z0 = sveor_n_s64_x (p0, z0, 1), ++ z0 = sveor_x (p0, z0, 1)) ++ ++/* ++** eor_1_s64_x_untied: ++** movprfx z0, z1 ++** eor z0\.d, z0\.d, #0x1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_1_s64_x_untied, svint64_t, ++ z0 = sveor_n_s64_x (p0, z1, 1), ++ z0 = sveor_x (p0, z1, 1)) ++ ++/* ++** eor_127_s64_x: ++** eor z0\.d, z0\.d, #0x7f ++** ret ++*/ ++TEST_UNIFORM_Z (eor_127_s64_x, svint64_t, ++ z0 = sveor_n_s64_x (p0, z0, 127), ++ z0 = sveor_x (p0, z0, 127)) ++ ++/* ++** eor_128_s64_x: ++** eor z0\.d, z0\.d, #0x80 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_128_s64_x, svint64_t, ++ z0 = sveor_n_s64_x (p0, z0, 128), ++ z0 = sveor_x (p0, z0, 128)) ++ ++/* ++** eor_255_s64_x: ++** eor z0\.d, z0\.d, #0xff ++** ret ++*/ ++TEST_UNIFORM_Z (eor_255_s64_x, svint64_t, ++ z0 = sveor_n_s64_x (p0, z0, 255), ++ z0 = sveor_x (p0, z0, 255)) ++ ++/* ++** eor_256_s64_x: ++** eor z0\.d, z0\.d, #0x100 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_256_s64_x, svint64_t, ++ z0 = sveor_n_s64_x (p0, z0, 256), ++ z0 = sveor_x (p0, z0, 256)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (eor_257_s64_x, svint64_t, ++ z0 = sveor_n_s64_x (p0, z0, 257), ++ z0 = sveor_x (p0, z0, 257)) ++ ++/* ++** eor_512_s64_x: ++** eor z0\.d, z0\.d, #0x200 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_512_s64_x, svint64_t, ++ z0 = sveor_n_s64_x (p0, z0, 512), ++ z0 = sveor_x (p0, z0, 512)) ++ ++/* ++** eor_65280_s64_x: ++** eor z0\.d, z0\.d, #0xff00 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_65280_s64_x, svint64_t, ++ z0 = sveor_n_s64_x (p0, z0, 0xff00), ++ z0 = sveor_x (p0, z0, 0xff00)) ++ ++/* ++** eor_m127_s64_x: ++** eor z0\.d, z0\.d, #0xffffffffffffff81 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m127_s64_x, svint64_t, ++ z0 = sveor_n_s64_x (p0, z0, -127), ++ z0 = sveor_x (p0, z0, -127)) ++ ++/* ++** eor_m128_s64_x: ++** eor z0\.d, z0\.d, #0xffffffffffffff80 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m128_s64_x, svint64_t, ++ z0 = sveor_n_s64_x (p0, z0, -128), ++ z0 = sveor_x (p0, z0, -128)) ++ ++/* ++** eor_m255_s64_x: ++** eor z0\.d, z0\.d, #0xffffffffffffff01 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m255_s64_x, svint64_t, ++ z0 = sveor_n_s64_x (p0, z0, -255), ++ z0 = sveor_x (p0, z0, -255)) ++ ++/* ++** eor_m256_s64_x: ++** eor z0\.d, z0\.d, #0xffffffffffffff00 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m256_s64_x, svint64_t, ++ z0 = sveor_n_s64_x (p0, z0, -256), ++ z0 = sveor_x (p0, z0, -256)) ++ ++/* ++** eor_m257_s64_x: ++** eor z0\.d, z0\.d, #0xfffffffffffffeff ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m257_s64_x, svint64_t, ++ z0 = sveor_n_s64_x (p0, z0, -257), ++ z0 = sveor_x (p0, z0, -257)) ++ ++/* ++** eor_m512_s64_x: ++** eor z0\.d, z0\.d, #0xfffffffffffffe00 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m512_s64_x, svint64_t, ++ z0 = sveor_n_s64_x (p0, z0, -512), ++ z0 = sveor_x (p0, z0, -512)) ++ ++/* ++** eor_m32768_s64_x: ++** eor z0\.d, z0\.d, #0xffffffffffff8000 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m32768_s64_x, svint64_t, ++ z0 = sveor_n_s64_x (p0, z0, -0x8000), ++ z0 = sveor_x (p0, z0, -0x8000)) ++ ++/* ++** eor_5_s64_x: ++** mov (z[0-9]+\.d), #5 ++** eor z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_5_s64_x, svint64_t, ++ z0 = sveor_n_s64_x (p0, z0, 5), ++ z0 = sveor_x (p0, z0, 5)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_s8.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_s8.c +new file mode 100644 +index 000000000..083ac2dde +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_s8.c +@@ -0,0 +1,296 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** eor_s8_m_tied1: ++** eor z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (eor_s8_m_tied1, svint8_t, ++ z0 = sveor_s8_m (p0, z0, z1), ++ z0 = sveor_m (p0, z0, z1)) ++ ++/* ++** eor_s8_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** eor z0\.b, p0/m, z0\.b, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (eor_s8_m_tied2, svint8_t, ++ z0 = sveor_s8_m (p0, z1, z0), ++ z0 = sveor_m (p0, z1, z0)) ++ ++/* ++** eor_s8_m_untied: ++** movprfx z0, z1 ++** eor z0\.b, p0/m, z0\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (eor_s8_m_untied, svint8_t, ++ z0 = sveor_s8_m (p0, z1, z2), ++ z0 = sveor_m (p0, z1, z2)) ++ ++/* ++** eor_w0_s8_m_tied1: ++** mov (z[0-9]+\.b), w0 ++** eor z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (eor_w0_s8_m_tied1, svint8_t, int8_t, ++ z0 = sveor_n_s8_m (p0, z0, x0), ++ z0 = sveor_m (p0, z0, x0)) ++ ++/* ++** eor_w0_s8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), w0 ++** movprfx z0, z1 ++** eor z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (eor_w0_s8_m_untied, svint8_t, int8_t, ++ z0 = sveor_n_s8_m (p0, z1, x0), ++ z0 = sveor_m (p0, z1, x0)) ++ ++/* ++** eor_1_s8_m_tied1: ++** mov (z[0-9]+\.b), #1 ++** eor z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_1_s8_m_tied1, svint8_t, ++ z0 = sveor_n_s8_m (p0, z0, 1), ++ z0 = sveor_m (p0, z0, 1)) ++ ++/* ++** eor_1_s8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), #1 ++** movprfx z0, z1 ++** eor z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_1_s8_m_untied, svint8_t, ++ z0 = sveor_n_s8_m (p0, z1, 1), ++ z0 = sveor_m (p0, z1, 1)) ++ ++/* ++** eor_m2_s8_m: ++** mov (z[0-9]+\.b), #-2 ++** eor z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m2_s8_m, svint8_t, ++ z0 = sveor_n_s8_m (p0, z0, -2), ++ z0 = sveor_m (p0, z0, -2)) ++ ++/* ++** eor_s8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** eor z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (eor_s8_z_tied1, svint8_t, ++ z0 = sveor_s8_z (p0, z0, z1), ++ z0 = sveor_z (p0, z0, z1)) ++ ++/* ++** eor_s8_z_tied2: ++** movprfx z0\.b, p0/z, z0\.b ++** eor z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (eor_s8_z_tied2, svint8_t, ++ z0 = sveor_s8_z (p0, z1, z0), ++ z0 = sveor_z (p0, z1, z0)) ++ ++/* ++** eor_s8_z_untied: ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** eor z0\.b, p0/m, z0\.b, z2\.b ++** | ++** movprfx z0\.b, p0/z, z2\.b ++** eor z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_s8_z_untied, svint8_t, ++ z0 = sveor_s8_z (p0, z1, z2), ++ z0 = sveor_z (p0, z1, z2)) ++ ++/* ++** eor_w0_s8_z_tied1: ++** mov (z[0-9]+\.b), w0 ++** movprfx z0\.b, p0/z, z0\.b ++** eor z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (eor_w0_s8_z_tied1, svint8_t, int8_t, ++ z0 = sveor_n_s8_z (p0, z0, x0), ++ z0 = sveor_z (p0, z0, x0)) ++ ++/* ++** eor_w0_s8_z_untied: ++** mov (z[0-9]+\.b), w0 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** eor z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** eor z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (eor_w0_s8_z_untied, svint8_t, int8_t, ++ z0 = sveor_n_s8_z (p0, z1, x0), ++ z0 = sveor_z (p0, z1, x0)) ++ ++/* ++** eor_1_s8_z_tied1: ++** mov (z[0-9]+\.b), #1 ++** movprfx z0\.b, p0/z, z0\.b ++** eor z0\.b, p0/m, z0\.b, \1 ++** 
ret ++*/ ++TEST_UNIFORM_Z (eor_1_s8_z_tied1, svint8_t, ++ z0 = sveor_n_s8_z (p0, z0, 1), ++ z0 = sveor_z (p0, z0, 1)) ++ ++/* ++** eor_1_s8_z_untied: ++** mov (z[0-9]+\.b), #1 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** eor z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** eor z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_1_s8_z_untied, svint8_t, ++ z0 = sveor_n_s8_z (p0, z1, 1), ++ z0 = sveor_z (p0, z1, 1)) ++ ++/* ++** eor_s8_x_tied1: ++** eor z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_s8_x_tied1, svint8_t, ++ z0 = sveor_s8_x (p0, z0, z1), ++ z0 = sveor_x (p0, z0, z1)) ++ ++/* ++** eor_s8_x_tied2: ++** eor z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_s8_x_tied2, svint8_t, ++ z0 = sveor_s8_x (p0, z1, z0), ++ z0 = sveor_x (p0, z1, z0)) ++ ++/* ++** eor_s8_x_untied: ++** eor z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_s8_x_untied, svint8_t, ++ z0 = sveor_s8_x (p0, z1, z2), ++ z0 = sveor_x (p0, z1, z2)) ++ ++/* ++** eor_w0_s8_x_tied1: ++** mov (z[0-9]+)\.b, w0 ++** eor z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (eor_w0_s8_x_tied1, svint8_t, int8_t, ++ z0 = sveor_n_s8_x (p0, z0, x0), ++ z0 = sveor_x (p0, z0, x0)) ++ ++/* ++** eor_w0_s8_x_untied: ++** mov (z[0-9]+)\.b, w0 ++** eor z0\.d, (z1\.d, \1\.d|\1\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (eor_w0_s8_x_untied, svint8_t, int8_t, ++ z0 = sveor_n_s8_x (p0, z1, x0), ++ z0 = sveor_x (p0, z1, x0)) ++ ++/* ++** eor_1_s8_x_tied1: ++** eor z0\.b, z0\.b, #0x1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_1_s8_x_tied1, svint8_t, ++ z0 = sveor_n_s8_x (p0, z0, 1), ++ z0 = sveor_x (p0, z0, 1)) ++ ++/* ++** eor_1_s8_x_untied: ++** movprfx z0, z1 ++** eor z0\.b, z0\.b, #0x1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_1_s8_x_untied, svint8_t, ++ z0 = sveor_n_s8_x (p0, z1, 1), ++ z0 = sveor_x (p0, z1, 1)) ++ ++/* ++** eor_127_s8_x: ++** eor z0\.b, z0\.b, #0x7f ++** ret ++*/ ++TEST_UNIFORM_Z (eor_127_s8_x, svint8_t, ++ z0 = sveor_n_s8_x (p0, z0, 127), ++ z0 = sveor_x (p0, z0, 127)) ++ ++/* ++** eor_128_s8_x: ++** eor z0\.b, z0\.b, #0x80 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_128_s8_x, svint8_t, ++ z0 = sveor_n_s8_x (p0, z0, 128), ++ z0 = sveor_x (p0, z0, 128)) ++ ++/* ++** eor_255_s8_x: ++** mov (z[0-9]+)\.b, #-1 ++** eor z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_255_s8_x, svint8_t, ++ z0 = sveor_n_s8_x (p0, z0, 255), ++ z0 = sveor_x (p0, z0, 255)) ++ ++/* ++** eor_m127_s8_x: ++** eor z0\.b, z0\.b, #0x81 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m127_s8_x, svint8_t, ++ z0 = sveor_n_s8_x (p0, z0, -127), ++ z0 = sveor_x (p0, z0, -127)) ++ ++/* ++** eor_m128_s8_x: ++** eor z0\.b, z0\.b, #0x80 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m128_s8_x, svint8_t, ++ z0 = sveor_n_s8_x (p0, z0, -128), ++ z0 = sveor_x (p0, z0, -128)) ++ ++/* ++** eor_5_s8_x: ++** mov (z[0-9]+)\.b, #5 ++** eor z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_5_s8_x, svint8_t, ++ z0 = sveor_n_s8_x (p0, z0, 5), ++ z0 = sveor_x (p0, z0, 5)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_u16.c +new file mode 100644 +index 000000000..40b43a5f8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_u16.c +@@ -0,0 +1,376 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** eor_u16_m_tied1: ++** eor z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (eor_u16_m_tied1, svuint16_t, 
++ z0 = sveor_u16_m (p0, z0, z1), ++ z0 = sveor_m (p0, z0, z1)) ++ ++/* ++** eor_u16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** eor z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (eor_u16_m_tied2, svuint16_t, ++ z0 = sveor_u16_m (p0, z1, z0), ++ z0 = sveor_m (p0, z1, z0)) ++ ++/* ++** eor_u16_m_untied: ++** movprfx z0, z1 ++** eor z0\.h, p0/m, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (eor_u16_m_untied, svuint16_t, ++ z0 = sveor_u16_m (p0, z1, z2), ++ z0 = sveor_m (p0, z1, z2)) ++ ++/* ++** eor_w0_u16_m_tied1: ++** mov (z[0-9]+\.h), w0 ++** eor z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (eor_w0_u16_m_tied1, svuint16_t, uint16_t, ++ z0 = sveor_n_u16_m (p0, z0, x0), ++ z0 = sveor_m (p0, z0, x0)) ++ ++/* ++** eor_w0_u16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), w0 ++** movprfx z0, z1 ++** eor z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (eor_w0_u16_m_untied, svuint16_t, uint16_t, ++ z0 = sveor_n_u16_m (p0, z1, x0), ++ z0 = sveor_m (p0, z1, x0)) ++ ++/* ++** eor_1_u16_m_tied1: ++** mov (z[0-9]+\.h), #1 ++** eor z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_1_u16_m_tied1, svuint16_t, ++ z0 = sveor_n_u16_m (p0, z0, 1), ++ z0 = sveor_m (p0, z0, 1)) ++ ++/* ++** eor_1_u16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), #1 ++** movprfx z0, z1 ++** eor z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_1_u16_m_untied, svuint16_t, ++ z0 = sveor_n_u16_m (p0, z1, 1), ++ z0 = sveor_m (p0, z1, 1)) ++ ++/* ++** eor_m2_u16_m: ++** mov (z[0-9]+\.h), #-2 ++** eor z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m2_u16_m, svuint16_t, ++ z0 = sveor_n_u16_m (p0, z0, -2), ++ z0 = sveor_m (p0, z0, -2)) ++ ++/* ++** eor_u16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** eor z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (eor_u16_z_tied1, svuint16_t, ++ z0 = sveor_u16_z (p0, z0, z1), ++ z0 = sveor_z (p0, z0, z1)) ++ ++/* ++** eor_u16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** eor z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (eor_u16_z_tied2, svuint16_t, ++ z0 = sveor_u16_z (p0, z1, z0), ++ z0 = sveor_z (p0, z1, z0)) ++ ++/* ++** eor_u16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** eor z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** eor z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_u16_z_untied, svuint16_t, ++ z0 = sveor_u16_z (p0, z1, z2), ++ z0 = sveor_z (p0, z1, z2)) ++ ++/* ++** eor_w0_u16_z_tied1: ++** mov (z[0-9]+\.h), w0 ++** movprfx z0\.h, p0/z, z0\.h ++** eor z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (eor_w0_u16_z_tied1, svuint16_t, uint16_t, ++ z0 = sveor_n_u16_z (p0, z0, x0), ++ z0 = sveor_z (p0, z0, x0)) ++ ++/* ++** eor_w0_u16_z_untied: ++** mov (z[0-9]+\.h), w0 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** eor z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** eor z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (eor_w0_u16_z_untied, svuint16_t, uint16_t, ++ z0 = sveor_n_u16_z (p0, z1, x0), ++ z0 = sveor_z (p0, z1, x0)) ++ ++/* ++** eor_1_u16_z_tied1: ++** mov (z[0-9]+\.h), #1 ++** movprfx z0\.h, p0/z, z0\.h ++** eor z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_1_u16_z_tied1, svuint16_t, ++ z0 = sveor_n_u16_z (p0, z0, 1), ++ z0 = sveor_z (p0, z0, 1)) ++ ++/* ++** eor_1_u16_z_untied: ++** mov (z[0-9]+\.h), #1 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** eor z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** eor z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ 
++TEST_UNIFORM_Z (eor_1_u16_z_untied, svuint16_t, ++ z0 = sveor_n_u16_z (p0, z1, 1), ++ z0 = sveor_z (p0, z1, 1)) ++ ++/* ++** eor_u16_x_tied1: ++** eor z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_u16_x_tied1, svuint16_t, ++ z0 = sveor_u16_x (p0, z0, z1), ++ z0 = sveor_x (p0, z0, z1)) ++ ++/* ++** eor_u16_x_tied2: ++** eor z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_u16_x_tied2, svuint16_t, ++ z0 = sveor_u16_x (p0, z1, z0), ++ z0 = sveor_x (p0, z1, z0)) ++ ++/* ++** eor_u16_x_untied: ++** eor z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_u16_x_untied, svuint16_t, ++ z0 = sveor_u16_x (p0, z1, z2), ++ z0 = sveor_x (p0, z1, z2)) ++ ++/* ++** eor_w0_u16_x_tied1: ++** mov (z[0-9]+)\.h, w0 ++** eor z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (eor_w0_u16_x_tied1, svuint16_t, uint16_t, ++ z0 = sveor_n_u16_x (p0, z0, x0), ++ z0 = sveor_x (p0, z0, x0)) ++ ++/* ++** eor_w0_u16_x_untied: ++** mov (z[0-9]+)\.h, w0 ++** eor z0\.d, (z1\.d, \1\.d|\1\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (eor_w0_u16_x_untied, svuint16_t, uint16_t, ++ z0 = sveor_n_u16_x (p0, z1, x0), ++ z0 = sveor_x (p0, z1, x0)) ++ ++/* ++** eor_1_u16_x_tied1: ++** eor z0\.h, z0\.h, #0x1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_1_u16_x_tied1, svuint16_t, ++ z0 = sveor_n_u16_x (p0, z0, 1), ++ z0 = sveor_x (p0, z0, 1)) ++ ++/* ++** eor_1_u16_x_untied: ++** movprfx z0, z1 ++** eor z0\.h, z0\.h, #0x1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_1_u16_x_untied, svuint16_t, ++ z0 = sveor_n_u16_x (p0, z1, 1), ++ z0 = sveor_x (p0, z1, 1)) ++ ++/* ++** eor_127_u16_x: ++** eor z0\.h, z0\.h, #0x7f ++** ret ++*/ ++TEST_UNIFORM_Z (eor_127_u16_x, svuint16_t, ++ z0 = sveor_n_u16_x (p0, z0, 127), ++ z0 = sveor_x (p0, z0, 127)) ++ ++/* ++** eor_128_u16_x: ++** eor z0\.h, z0\.h, #0x80 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_128_u16_x, svuint16_t, ++ z0 = sveor_n_u16_x (p0, z0, 128), ++ z0 = sveor_x (p0, z0, 128)) ++ ++/* ++** eor_255_u16_x: ++** eor z0\.h, z0\.h, #0xff ++** ret ++*/ ++TEST_UNIFORM_Z (eor_255_u16_x, svuint16_t, ++ z0 = sveor_n_u16_x (p0, z0, 255), ++ z0 = sveor_x (p0, z0, 255)) ++ ++/* ++** eor_256_u16_x: ++** eor z0\.h, z0\.h, #0x100 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_256_u16_x, svuint16_t, ++ z0 = sveor_n_u16_x (p0, z0, 256), ++ z0 = sveor_x (p0, z0, 256)) ++ ++/* ++** eor_257_u16_x: ++** eor z0\.h, z0\.h, #0x101 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_257_u16_x, svuint16_t, ++ z0 = sveor_n_u16_x (p0, z0, 257), ++ z0 = sveor_x (p0, z0, 257)) ++ ++/* ++** eor_512_u16_x: ++** eor z0\.h, z0\.h, #0x200 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_512_u16_x, svuint16_t, ++ z0 = sveor_n_u16_x (p0, z0, 512), ++ z0 = sveor_x (p0, z0, 512)) ++ ++/* ++** eor_65280_u16_x: ++** eor z0\.h, z0\.h, #0xff00 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_65280_u16_x, svuint16_t, ++ z0 = sveor_n_u16_x (p0, z0, 0xff00), ++ z0 = sveor_x (p0, z0, 0xff00)) ++ ++/* ++** eor_m127_u16_x: ++** eor z0\.h, z0\.h, #0xff81 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m127_u16_x, svuint16_t, ++ z0 = sveor_n_u16_x (p0, z0, -127), ++ z0 = sveor_x (p0, z0, -127)) ++ ++/* ++** eor_m128_u16_x: ++** eor z0\.h, z0\.h, #0xff80 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m128_u16_x, svuint16_t, ++ z0 = sveor_n_u16_x (p0, z0, -128), ++ z0 = sveor_x (p0, z0, -128)) ++ ++/* ++** eor_m255_u16_x: ++** eor z0\.h, z0\.h, #0xff01 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m255_u16_x, svuint16_t, ++ z0 = sveor_n_u16_x (p0, z0, -255), ++ z0 = sveor_x (p0, z0, -255)) ++ ++/* ++** eor_m256_u16_x: ++** eor z0\.h, z0\.h, #0xff00 ++** ret ++*/ 
++TEST_UNIFORM_Z (eor_m256_u16_x, svuint16_t, ++ z0 = sveor_n_u16_x (p0, z0, -256), ++ z0 = sveor_x (p0, z0, -256)) ++ ++/* ++** eor_m257_u16_x: ++** eor z0\.h, z0\.h, #0xfeff ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m257_u16_x, svuint16_t, ++ z0 = sveor_n_u16_x (p0, z0, -257), ++ z0 = sveor_x (p0, z0, -257)) ++ ++/* ++** eor_m512_u16_x: ++** eor z0\.h, z0\.h, #0xfe00 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m512_u16_x, svuint16_t, ++ z0 = sveor_n_u16_x (p0, z0, -512), ++ z0 = sveor_x (p0, z0, -512)) ++ ++/* ++** eor_m32768_u16_x: ++** eor z0\.h, z0\.h, #0x8000 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m32768_u16_x, svuint16_t, ++ z0 = sveor_n_u16_x (p0, z0, -0x8000), ++ z0 = sveor_x (p0, z0, -0x8000)) ++ ++/* ++** eor_5_u16_x: ++** mov (z[0-9]+)\.h, #5 ++** eor z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_5_u16_x, svuint16_t, ++ z0 = sveor_n_u16_x (p0, z0, 5), ++ z0 = sveor_x (p0, z0, 5)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_u32.c +new file mode 100644 +index 000000000..8e46d08ca +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_u32.c +@@ -0,0 +1,372 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** eor_u32_m_tied1: ++** eor z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (eor_u32_m_tied1, svuint32_t, ++ z0 = sveor_u32_m (p0, z0, z1), ++ z0 = sveor_m (p0, z0, z1)) ++ ++/* ++** eor_u32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** eor z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (eor_u32_m_tied2, svuint32_t, ++ z0 = sveor_u32_m (p0, z1, z0), ++ z0 = sveor_m (p0, z1, z0)) ++ ++/* ++** eor_u32_m_untied: ++** movprfx z0, z1 ++** eor z0\.s, p0/m, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (eor_u32_m_untied, svuint32_t, ++ z0 = sveor_u32_m (p0, z1, z2), ++ z0 = sveor_m (p0, z1, z2)) ++ ++/* ++** eor_w0_u32_m_tied1: ++** mov (z[0-9]+\.s), w0 ++** eor z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (eor_w0_u32_m_tied1, svuint32_t, uint32_t, ++ z0 = sveor_n_u32_m (p0, z0, x0), ++ z0 = sveor_m (p0, z0, x0)) ++ ++/* ++** eor_w0_u32_m_untied: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0, z1 ++** eor z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (eor_w0_u32_m_untied, svuint32_t, uint32_t, ++ z0 = sveor_n_u32_m (p0, z1, x0), ++ z0 = sveor_m (p0, z1, x0)) ++ ++/* ++** eor_1_u32_m_tied1: ++** mov (z[0-9]+\.s), #1 ++** eor z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_1_u32_m_tied1, svuint32_t, ++ z0 = sveor_n_u32_m (p0, z0, 1), ++ z0 = sveor_m (p0, z0, 1)) ++ ++/* ++** eor_1_u32_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.s), #1 ++** movprfx z0, z1 ++** eor z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_1_u32_m_untied, svuint32_t, ++ z0 = sveor_n_u32_m (p0, z1, 1), ++ z0 = sveor_m (p0, z1, 1)) ++ ++/* ++** eor_m2_u32_m: ++** mov (z[0-9]+\.s), #-2 ++** eor z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m2_u32_m, svuint32_t, ++ z0 = sveor_n_u32_m (p0, z0, -2), ++ z0 = sveor_m (p0, z0, -2)) ++ ++/* ++** eor_u32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** eor z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (eor_u32_z_tied1, svuint32_t, ++ z0 = sveor_u32_z (p0, z0, z1), ++ z0 = sveor_z (p0, z0, z1)) ++ ++/* ++** eor_u32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** eor z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (eor_u32_z_tied2, svuint32_t, ++ z0 = sveor_u32_z (p0, z1, z0), ++ z0 = sveor_z (p0, z1, z0)) ++ ++/* ++** 
eor_u32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** eor z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** eor z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_u32_z_untied, svuint32_t, ++ z0 = sveor_u32_z (p0, z1, z2), ++ z0 = sveor_z (p0, z1, z2)) ++ ++/* ++** eor_w0_u32_z_tied1: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0\.s, p0/z, z0\.s ++** eor z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (eor_w0_u32_z_tied1, svuint32_t, uint32_t, ++ z0 = sveor_n_u32_z (p0, z0, x0), ++ z0 = sveor_z (p0, z0, x0)) ++ ++/* ++** eor_w0_u32_z_untied: ++** mov (z[0-9]+\.s), w0 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** eor z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** eor z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (eor_w0_u32_z_untied, svuint32_t, uint32_t, ++ z0 = sveor_n_u32_z (p0, z1, x0), ++ z0 = sveor_z (p0, z1, x0)) ++ ++/* ++** eor_1_u32_z_tied1: ++** mov (z[0-9]+\.s), #1 ++** movprfx z0\.s, p0/z, z0\.s ++** eor z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_1_u32_z_tied1, svuint32_t, ++ z0 = sveor_n_u32_z (p0, z0, 1), ++ z0 = sveor_z (p0, z0, 1)) ++ ++/* ++** eor_1_u32_z_untied: ++** mov (z[0-9]+\.s), #1 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** eor z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** eor z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_1_u32_z_untied, svuint32_t, ++ z0 = sveor_n_u32_z (p0, z1, 1), ++ z0 = sveor_z (p0, z1, 1)) ++ ++/* ++** eor_u32_x_tied1: ++** eor z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_u32_x_tied1, svuint32_t, ++ z0 = sveor_u32_x (p0, z0, z1), ++ z0 = sveor_x (p0, z0, z1)) ++ ++/* ++** eor_u32_x_tied2: ++** eor z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_u32_x_tied2, svuint32_t, ++ z0 = sveor_u32_x (p0, z1, z0), ++ z0 = sveor_x (p0, z1, z0)) ++ ++/* ++** eor_u32_x_untied: ++** eor z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_u32_x_untied, svuint32_t, ++ z0 = sveor_u32_x (p0, z1, z2), ++ z0 = sveor_x (p0, z1, z2)) ++ ++/* ++** eor_w0_u32_x_tied1: ++** mov (z[0-9]+)\.s, w0 ++** eor z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (eor_w0_u32_x_tied1, svuint32_t, uint32_t, ++ z0 = sveor_n_u32_x (p0, z0, x0), ++ z0 = sveor_x (p0, z0, x0)) ++ ++/* ++** eor_w0_u32_x_untied: ++** mov (z[0-9]+)\.s, w0 ++** eor z0\.d, (z1\.d, \1\.d|\1\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (eor_w0_u32_x_untied, svuint32_t, uint32_t, ++ z0 = sveor_n_u32_x (p0, z1, x0), ++ z0 = sveor_x (p0, z1, x0)) ++ ++/* ++** eor_1_u32_x_tied1: ++** eor z0\.s, z0\.s, #0x1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_1_u32_x_tied1, svuint32_t, ++ z0 = sveor_n_u32_x (p0, z0, 1), ++ z0 = sveor_x (p0, z0, 1)) ++ ++/* ++** eor_1_u32_x_untied: ++** movprfx z0, z1 ++** eor z0\.s, z0\.s, #0x1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_1_u32_x_untied, svuint32_t, ++ z0 = sveor_n_u32_x (p0, z1, 1), ++ z0 = sveor_x (p0, z1, 1)) ++ ++/* ++** eor_127_u32_x: ++** eor z0\.s, z0\.s, #0x7f ++** ret ++*/ ++TEST_UNIFORM_Z (eor_127_u32_x, svuint32_t, ++ z0 = sveor_n_u32_x (p0, z0, 127), ++ z0 = sveor_x (p0, z0, 127)) ++ ++/* ++** eor_128_u32_x: ++** eor z0\.s, z0\.s, #0x80 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_128_u32_x, svuint32_t, ++ z0 = sveor_n_u32_x (p0, z0, 128), ++ z0 = sveor_x (p0, z0, 128)) ++ ++/* ++** eor_255_u32_x: ++** eor z0\.s, z0\.s, #0xff ++** ret ++*/ ++TEST_UNIFORM_Z (eor_255_u32_x, svuint32_t, ++ z0 = sveor_n_u32_x (p0, z0, 255), ++ z0 = sveor_x (p0, z0, 255)) ++ ++/* ++** 
eor_256_u32_x: ++** eor z0\.s, z0\.s, #0x100 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_256_u32_x, svuint32_t, ++ z0 = sveor_n_u32_x (p0, z0, 256), ++ z0 = sveor_x (p0, z0, 256)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (eor_257_u32_x, svuint32_t, ++ z0 = sveor_n_u32_x (p0, z0, 257), ++ z0 = sveor_x (p0, z0, 257)) ++ ++/* ++** eor_512_u32_x: ++** eor z0\.s, z0\.s, #0x200 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_512_u32_x, svuint32_t, ++ z0 = sveor_n_u32_x (p0, z0, 512), ++ z0 = sveor_x (p0, z0, 512)) ++ ++/* ++** eor_65280_u32_x: ++** eor z0\.s, z0\.s, #0xff00 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_65280_u32_x, svuint32_t, ++ z0 = sveor_n_u32_x (p0, z0, 0xff00), ++ z0 = sveor_x (p0, z0, 0xff00)) ++ ++/* ++** eor_m127_u32_x: ++** eor z0\.s, z0\.s, #0xffffff81 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m127_u32_x, svuint32_t, ++ z0 = sveor_n_u32_x (p0, z0, -127), ++ z0 = sveor_x (p0, z0, -127)) ++ ++/* ++** eor_m128_u32_x: ++** eor z0\.s, z0\.s, #0xffffff80 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m128_u32_x, svuint32_t, ++ z0 = sveor_n_u32_x (p0, z0, -128), ++ z0 = sveor_x (p0, z0, -128)) ++ ++/* ++** eor_m255_u32_x: ++** eor z0\.s, z0\.s, #0xffffff01 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m255_u32_x, svuint32_t, ++ z0 = sveor_n_u32_x (p0, z0, -255), ++ z0 = sveor_x (p0, z0, -255)) ++ ++/* ++** eor_m256_u32_x: ++** eor z0\.s, z0\.s, #0xffffff00 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m256_u32_x, svuint32_t, ++ z0 = sveor_n_u32_x (p0, z0, -256), ++ z0 = sveor_x (p0, z0, -256)) ++ ++/* ++** eor_m257_u32_x: ++** eor z0\.s, z0\.s, #0xfffffeff ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m257_u32_x, svuint32_t, ++ z0 = sveor_n_u32_x (p0, z0, -257), ++ z0 = sveor_x (p0, z0, -257)) ++ ++/* ++** eor_m512_u32_x: ++** eor z0\.s, z0\.s, #0xfffffe00 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m512_u32_x, svuint32_t, ++ z0 = sveor_n_u32_x (p0, z0, -512), ++ z0 = sveor_x (p0, z0, -512)) ++ ++/* ++** eor_m32768_u32_x: ++** eor z0\.s, z0\.s, #0xffff8000 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m32768_u32_x, svuint32_t, ++ z0 = sveor_n_u32_x (p0, z0, -0x8000), ++ z0 = sveor_x (p0, z0, -0x8000)) ++ ++/* ++** eor_5_u32_x: ++** mov (z[0-9]+)\.s, #5 ++** eor z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_5_u32_x, svuint32_t, ++ z0 = sveor_n_u32_x (p0, z0, 5), ++ z0 = sveor_x (p0, z0, 5)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_u64.c +new file mode 100644 +index 000000000..a82398f91 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_u64.c +@@ -0,0 +1,372 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** eor_u64_m_tied1: ++** eor z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (eor_u64_m_tied1, svuint64_t, ++ z0 = sveor_u64_m (p0, z0, z1), ++ z0 = sveor_m (p0, z0, z1)) ++ ++/* ++** eor_u64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** eor z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_u64_m_tied2, svuint64_t, ++ z0 = sveor_u64_m (p0, z1, z0), ++ z0 = sveor_m (p0, z1, z0)) ++ ++/* ++** eor_u64_m_untied: ++** movprfx z0, z1 ++** eor z0\.d, p0/m, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (eor_u64_m_untied, svuint64_t, ++ z0 = sveor_u64_m (p0, z1, z2), ++ z0 = sveor_m (p0, z1, z2)) ++ ++/* ++** eor_x0_u64_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** eor z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (eor_x0_u64_m_tied1, svuint64_t, uint64_t, ++ z0 = sveor_n_u64_m (p0, z0, x0), ++ z0 = sveor_m (p0, z0, x0)) ++ ++/* ++** 
eor_x0_u64_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** eor z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (eor_x0_u64_m_untied, svuint64_t, uint64_t, ++ z0 = sveor_n_u64_m (p0, z1, x0), ++ z0 = sveor_m (p0, z1, x0)) ++ ++/* ++** eor_1_u64_m_tied1: ++** mov (z[0-9]+\.d), #1 ++** eor z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_1_u64_m_tied1, svuint64_t, ++ z0 = sveor_n_u64_m (p0, z0, 1), ++ z0 = sveor_m (p0, z0, 1)) ++ ++/* ++** eor_1_u64_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #1 ++** movprfx z0, z1 ++** eor z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_1_u64_m_untied, svuint64_t, ++ z0 = sveor_n_u64_m (p0, z1, 1), ++ z0 = sveor_m (p0, z1, 1)) ++ ++/* ++** eor_m2_u64_m: ++** mov (z[0-9]+\.d), #-2 ++** eor z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m2_u64_m, svuint64_t, ++ z0 = sveor_n_u64_m (p0, z0, -2), ++ z0 = sveor_m (p0, z0, -2)) ++ ++/* ++** eor_u64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** eor z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (eor_u64_z_tied1, svuint64_t, ++ z0 = sveor_u64_z (p0, z0, z1), ++ z0 = sveor_z (p0, z0, z1)) ++ ++/* ++** eor_u64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** eor z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (eor_u64_z_tied2, svuint64_t, ++ z0 = sveor_u64_z (p0, z1, z0), ++ z0 = sveor_z (p0, z1, z0)) ++ ++/* ++** eor_u64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** eor z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** eor z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_u64_z_untied, svuint64_t, ++ z0 = sveor_u64_z (p0, z1, z2), ++ z0 = sveor_z (p0, z1, z2)) ++ ++/* ++** eor_x0_u64_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z0\.d ++** eor z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (eor_x0_u64_z_tied1, svuint64_t, uint64_t, ++ z0 = sveor_n_u64_z (p0, z0, x0), ++ z0 = sveor_z (p0, z0, x0)) ++ ++/* ++** eor_x0_u64_z_untied: ++** mov (z[0-9]+\.d), x0 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** eor z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** eor z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (eor_x0_u64_z_untied, svuint64_t, uint64_t, ++ z0 = sveor_n_u64_z (p0, z1, x0), ++ z0 = sveor_z (p0, z1, x0)) ++ ++/* ++** eor_1_u64_z_tied1: ++** mov (z[0-9]+\.d), #1 ++** movprfx z0\.d, p0/z, z0\.d ++** eor z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_1_u64_z_tied1, svuint64_t, ++ z0 = sveor_n_u64_z (p0, z0, 1), ++ z0 = sveor_z (p0, z0, 1)) ++ ++/* ++** eor_1_u64_z_untied: ++** mov (z[0-9]+\.d), #1 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** eor z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** eor z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_1_u64_z_untied, svuint64_t, ++ z0 = sveor_n_u64_z (p0, z1, 1), ++ z0 = sveor_z (p0, z1, 1)) ++ ++/* ++** eor_u64_x_tied1: ++** eor z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_u64_x_tied1, svuint64_t, ++ z0 = sveor_u64_x (p0, z0, z1), ++ z0 = sveor_x (p0, z0, z1)) ++ ++/* ++** eor_u64_x_tied2: ++** eor z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_u64_x_tied2, svuint64_t, ++ z0 = sveor_u64_x (p0, z1, z0), ++ z0 = sveor_x (p0, z1, z0)) ++ ++/* ++** eor_u64_x_untied: ++** eor z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_u64_x_untied, svuint64_t, ++ z0 = sveor_u64_x (p0, z1, z2), ++ z0 = sveor_x (p0, z1, z2)) ++ ++/* ++** eor_x0_u64_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** eor z0\.d, 
(z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (eor_x0_u64_x_tied1, svuint64_t, uint64_t, ++ z0 = sveor_n_u64_x (p0, z0, x0), ++ z0 = sveor_x (p0, z0, x0)) ++ ++/* ++** eor_x0_u64_x_untied: ++** mov (z[0-9]+\.d), x0 ++** eor z0\.d, (z1\.d, \1|\1, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (eor_x0_u64_x_untied, svuint64_t, uint64_t, ++ z0 = sveor_n_u64_x (p0, z1, x0), ++ z0 = sveor_x (p0, z1, x0)) ++ ++/* ++** eor_1_u64_x_tied1: ++** eor z0\.d, z0\.d, #0x1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_1_u64_x_tied1, svuint64_t, ++ z0 = sveor_n_u64_x (p0, z0, 1), ++ z0 = sveor_x (p0, z0, 1)) ++ ++/* ++** eor_1_u64_x_untied: ++** movprfx z0, z1 ++** eor z0\.d, z0\.d, #0x1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_1_u64_x_untied, svuint64_t, ++ z0 = sveor_n_u64_x (p0, z1, 1), ++ z0 = sveor_x (p0, z1, 1)) ++ ++/* ++** eor_127_u64_x: ++** eor z0\.d, z0\.d, #0x7f ++** ret ++*/ ++TEST_UNIFORM_Z (eor_127_u64_x, svuint64_t, ++ z0 = sveor_n_u64_x (p0, z0, 127), ++ z0 = sveor_x (p0, z0, 127)) ++ ++/* ++** eor_128_u64_x: ++** eor z0\.d, z0\.d, #0x80 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_128_u64_x, svuint64_t, ++ z0 = sveor_n_u64_x (p0, z0, 128), ++ z0 = sveor_x (p0, z0, 128)) ++ ++/* ++** eor_255_u64_x: ++** eor z0\.d, z0\.d, #0xff ++** ret ++*/ ++TEST_UNIFORM_Z (eor_255_u64_x, svuint64_t, ++ z0 = sveor_n_u64_x (p0, z0, 255), ++ z0 = sveor_x (p0, z0, 255)) ++ ++/* ++** eor_256_u64_x: ++** eor z0\.d, z0\.d, #0x100 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_256_u64_x, svuint64_t, ++ z0 = sveor_n_u64_x (p0, z0, 256), ++ z0 = sveor_x (p0, z0, 256)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (eor_257_u64_x, svuint64_t, ++ z0 = sveor_n_u64_x (p0, z0, 257), ++ z0 = sveor_x (p0, z0, 257)) ++ ++/* ++** eor_512_u64_x: ++** eor z0\.d, z0\.d, #0x200 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_512_u64_x, svuint64_t, ++ z0 = sveor_n_u64_x (p0, z0, 512), ++ z0 = sveor_x (p0, z0, 512)) ++ ++/* ++** eor_65280_u64_x: ++** eor z0\.d, z0\.d, #0xff00 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_65280_u64_x, svuint64_t, ++ z0 = sveor_n_u64_x (p0, z0, 0xff00), ++ z0 = sveor_x (p0, z0, 0xff00)) ++ ++/* ++** eor_m127_u64_x: ++** eor z0\.d, z0\.d, #0xffffffffffffff81 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m127_u64_x, svuint64_t, ++ z0 = sveor_n_u64_x (p0, z0, -127), ++ z0 = sveor_x (p0, z0, -127)) ++ ++/* ++** eor_m128_u64_x: ++** eor z0\.d, z0\.d, #0xffffffffffffff80 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m128_u64_x, svuint64_t, ++ z0 = sveor_n_u64_x (p0, z0, -128), ++ z0 = sveor_x (p0, z0, -128)) ++ ++/* ++** eor_m255_u64_x: ++** eor z0\.d, z0\.d, #0xffffffffffffff01 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m255_u64_x, svuint64_t, ++ z0 = sveor_n_u64_x (p0, z0, -255), ++ z0 = sveor_x (p0, z0, -255)) ++ ++/* ++** eor_m256_u64_x: ++** eor z0\.d, z0\.d, #0xffffffffffffff00 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m256_u64_x, svuint64_t, ++ z0 = sveor_n_u64_x (p0, z0, -256), ++ z0 = sveor_x (p0, z0, -256)) ++ ++/* ++** eor_m257_u64_x: ++** eor z0\.d, z0\.d, #0xfffffffffffffeff ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m257_u64_x, svuint64_t, ++ z0 = sveor_n_u64_x (p0, z0, -257), ++ z0 = sveor_x (p0, z0, -257)) ++ ++/* ++** eor_m512_u64_x: ++** eor z0\.d, z0\.d, #0xfffffffffffffe00 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m512_u64_x, svuint64_t, ++ z0 = sveor_n_u64_x (p0, z0, -512), ++ z0 = sveor_x (p0, z0, -512)) ++ ++/* ++** eor_m32768_u64_x: ++** eor z0\.d, z0\.d, #0xffffffffffff8000 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m32768_u64_x, svuint64_t, ++ z0 = sveor_n_u64_x (p0, z0, -0x8000), ++ z0 = sveor_x (p0, z0, -0x8000)) ++ ++/* ++** eor_5_u64_x: ++** mov (z[0-9]+\.d), #5 ++** 
eor z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_5_u64_x, svuint64_t, ++ z0 = sveor_n_u64_x (p0, z0, 5), ++ z0 = sveor_x (p0, z0, 5)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_u8.c +new file mode 100644 +index 000000000..006637699 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_u8.c +@@ -0,0 +1,296 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** eor_u8_m_tied1: ++** eor z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (eor_u8_m_tied1, svuint8_t, ++ z0 = sveor_u8_m (p0, z0, z1), ++ z0 = sveor_m (p0, z0, z1)) ++ ++/* ++** eor_u8_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** eor z0\.b, p0/m, z0\.b, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (eor_u8_m_tied2, svuint8_t, ++ z0 = sveor_u8_m (p0, z1, z0), ++ z0 = sveor_m (p0, z1, z0)) ++ ++/* ++** eor_u8_m_untied: ++** movprfx z0, z1 ++** eor z0\.b, p0/m, z0\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (eor_u8_m_untied, svuint8_t, ++ z0 = sveor_u8_m (p0, z1, z2), ++ z0 = sveor_m (p0, z1, z2)) ++ ++/* ++** eor_w0_u8_m_tied1: ++** mov (z[0-9]+\.b), w0 ++** eor z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (eor_w0_u8_m_tied1, svuint8_t, uint8_t, ++ z0 = sveor_n_u8_m (p0, z0, x0), ++ z0 = sveor_m (p0, z0, x0)) ++ ++/* ++** eor_w0_u8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), w0 ++** movprfx z0, z1 ++** eor z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (eor_w0_u8_m_untied, svuint8_t, uint8_t, ++ z0 = sveor_n_u8_m (p0, z1, x0), ++ z0 = sveor_m (p0, z1, x0)) ++ ++/* ++** eor_1_u8_m_tied1: ++** mov (z[0-9]+\.b), #1 ++** eor z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_1_u8_m_tied1, svuint8_t, ++ z0 = sveor_n_u8_m (p0, z0, 1), ++ z0 = sveor_m (p0, z0, 1)) ++ ++/* ++** eor_1_u8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), #1 ++** movprfx z0, z1 ++** eor z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_1_u8_m_untied, svuint8_t, ++ z0 = sveor_n_u8_m (p0, z1, 1), ++ z0 = sveor_m (p0, z1, 1)) ++ ++/* ++** eor_m2_u8_m: ++** mov (z[0-9]+\.b), #-2 ++** eor z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m2_u8_m, svuint8_t, ++ z0 = sveor_n_u8_m (p0, z0, -2), ++ z0 = sveor_m (p0, z0, -2)) ++ ++/* ++** eor_u8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** eor z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (eor_u8_z_tied1, svuint8_t, ++ z0 = sveor_u8_z (p0, z0, z1), ++ z0 = sveor_z (p0, z0, z1)) ++ ++/* ++** eor_u8_z_tied2: ++** movprfx z0\.b, p0/z, z0\.b ++** eor z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (eor_u8_z_tied2, svuint8_t, ++ z0 = sveor_u8_z (p0, z1, z0), ++ z0 = sveor_z (p0, z1, z0)) ++ ++/* ++** eor_u8_z_untied: ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** eor z0\.b, p0/m, z0\.b, z2\.b ++** | ++** movprfx z0\.b, p0/z, z2\.b ++** eor z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_u8_z_untied, svuint8_t, ++ z0 = sveor_u8_z (p0, z1, z2), ++ z0 = sveor_z (p0, z1, z2)) ++ ++/* ++** eor_w0_u8_z_tied1: ++** mov (z[0-9]+\.b), w0 ++** movprfx z0\.b, p0/z, z0\.b ++** eor z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (eor_w0_u8_z_tied1, svuint8_t, uint8_t, ++ z0 = sveor_n_u8_z (p0, z0, x0), ++ z0 = sveor_z (p0, z0, x0)) ++ ++/* ++** eor_w0_u8_z_untied: ++** mov (z[0-9]+\.b), w0 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** eor z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** eor z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ 
++TEST_UNIFORM_ZX (eor_w0_u8_z_untied, svuint8_t, uint8_t, ++ z0 = sveor_n_u8_z (p0, z1, x0), ++ z0 = sveor_z (p0, z1, x0)) ++ ++/* ++** eor_1_u8_z_tied1: ++** mov (z[0-9]+\.b), #1 ++** movprfx z0\.b, p0/z, z0\.b ++** eor z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_1_u8_z_tied1, svuint8_t, ++ z0 = sveor_n_u8_z (p0, z0, 1), ++ z0 = sveor_z (p0, z0, 1)) ++ ++/* ++** eor_1_u8_z_untied: ++** mov (z[0-9]+\.b), #1 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** eor z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** eor z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_1_u8_z_untied, svuint8_t, ++ z0 = sveor_n_u8_z (p0, z1, 1), ++ z0 = sveor_z (p0, z1, 1)) ++ ++/* ++** eor_u8_x_tied1: ++** eor z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_u8_x_tied1, svuint8_t, ++ z0 = sveor_u8_x (p0, z0, z1), ++ z0 = sveor_x (p0, z0, z1)) ++ ++/* ++** eor_u8_x_tied2: ++** eor z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_u8_x_tied2, svuint8_t, ++ z0 = sveor_u8_x (p0, z1, z0), ++ z0 = sveor_x (p0, z1, z0)) ++ ++/* ++** eor_u8_x_untied: ++** eor z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_u8_x_untied, svuint8_t, ++ z0 = sveor_u8_x (p0, z1, z2), ++ z0 = sveor_x (p0, z1, z2)) ++ ++/* ++** eor_w0_u8_x_tied1: ++** mov (z[0-9]+)\.b, w0 ++** eor z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (eor_w0_u8_x_tied1, svuint8_t, uint8_t, ++ z0 = sveor_n_u8_x (p0, z0, x0), ++ z0 = sveor_x (p0, z0, x0)) ++ ++/* ++** eor_w0_u8_x_untied: ++** mov (z[0-9]+)\.b, w0 ++** eor z0\.d, (z1\.d, \1\.d|\1\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (eor_w0_u8_x_untied, svuint8_t, uint8_t, ++ z0 = sveor_n_u8_x (p0, z1, x0), ++ z0 = sveor_x (p0, z1, x0)) ++ ++/* ++** eor_1_u8_x_tied1: ++** eor z0\.b, z0\.b, #0x1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_1_u8_x_tied1, svuint8_t, ++ z0 = sveor_n_u8_x (p0, z0, 1), ++ z0 = sveor_x (p0, z0, 1)) ++ ++/* ++** eor_1_u8_x_untied: ++** movprfx z0, z1 ++** eor z0\.b, z0\.b, #0x1 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_1_u8_x_untied, svuint8_t, ++ z0 = sveor_n_u8_x (p0, z1, 1), ++ z0 = sveor_x (p0, z1, 1)) ++ ++/* ++** eor_127_u8_x: ++** eor z0\.b, z0\.b, #0x7f ++** ret ++*/ ++TEST_UNIFORM_Z (eor_127_u8_x, svuint8_t, ++ z0 = sveor_n_u8_x (p0, z0, 127), ++ z0 = sveor_x (p0, z0, 127)) ++ ++/* ++** eor_128_u8_x: ++** eor z0\.b, z0\.b, #0x80 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_128_u8_x, svuint8_t, ++ z0 = sveor_n_u8_x (p0, z0, 128), ++ z0 = sveor_x (p0, z0, 128)) ++ ++/* ++** eor_255_u8_x: ++** mov (z[0-9]+)\.b, #-1 ++** eor z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_255_u8_x, svuint8_t, ++ z0 = sveor_n_u8_x (p0, z0, 255), ++ z0 = sveor_x (p0, z0, 255)) ++ ++/* ++** eor_m127_u8_x: ++** eor z0\.b, z0\.b, #0x81 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m127_u8_x, svuint8_t, ++ z0 = sveor_n_u8_x (p0, z0, -127), ++ z0 = sveor_x (p0, z0, -127)) ++ ++/* ++** eor_m128_u8_x: ++** eor z0\.b, z0\.b, #0x80 ++** ret ++*/ ++TEST_UNIFORM_Z (eor_m128_u8_x, svuint8_t, ++ z0 = sveor_n_u8_x (p0, z0, -128), ++ z0 = sveor_x (p0, z0, -128)) ++ ++/* ++** eor_5_u8_x: ++** mov (z[0-9]+)\.b, #5 ++** eor z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (eor_5_u8_x, svuint8_t, ++ z0 = sveor_n_u8_x (p0, z0, 5), ++ z0 = sveor_x (p0, z0, 5)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eorv_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eorv_s16.c +new file mode 100644 +index 000000000..0675d7ed9 +--- /dev/null ++++ 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eorv_s16.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** eorv_x0_s16: ++** eorv h([0-9]+), p0, z0\.h ++** umov w0, v\1\.h\[0\] ++** ret ++*/ ++TEST_REDUCTION_X (eorv_x0_s16, int16_t, svint16_t, ++ x0 = sveorv_s16 (p0, z0), ++ x0 = sveorv (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eorv_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eorv_s32.c +new file mode 100644 +index 000000000..9c0c1089f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eorv_s32.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** eorv_x0_s32: ++** eorv (s[0-9]+), p0, z0\.s ++** fmov w0, \1 ++** ret ++*/ ++TEST_REDUCTION_X (eorv_x0_s32, int32_t, svint32_t, ++ x0 = sveorv_s32 (p0, z0), ++ x0 = sveorv (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eorv_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eorv_s64.c +new file mode 100644 +index 000000000..7a474556c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eorv_s64.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** eorv_x0_s64: ++** eorv (d[0-9]+), p0, z0\.d ++** fmov x0, \1 ++** ret ++*/ ++TEST_REDUCTION_X (eorv_x0_s64, int64_t, svint64_t, ++ x0 = sveorv_s64 (p0, z0), ++ x0 = sveorv (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eorv_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eorv_s8.c +new file mode 100644 +index 000000000..43f056d3a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eorv_s8.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** eorv_x0_s8: ++** eorv b([0-9]+), p0, z0\.b ++** umov w0, v\1\.b\[0\] ++** ret ++*/ ++TEST_REDUCTION_X (eorv_x0_s8, int8_t, svint8_t, ++ x0 = sveorv_s8 (p0, z0), ++ x0 = sveorv (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eorv_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eorv_u16.c +new file mode 100644 +index 000000000..5f7836db4 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eorv_u16.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** eorv_x0_u16: ++** eorv h([0-9]+), p0, z0\.h ++** umov w0, v\1\.h\[0\] ++** ret ++*/ ++TEST_REDUCTION_X (eorv_x0_u16, uint16_t, svuint16_t, ++ x0 = sveorv_u16 (p0, z0), ++ x0 = sveorv (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eorv_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eorv_u32.c +new file mode 100644 +index 000000000..f112a0dc2 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eorv_u32.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** eorv_x0_u32: ++** eorv (s[0-9]+), p0, z0\.s ++** fmov w0, \1 ++** ret ++*/ ++TEST_REDUCTION_X (eorv_x0_u32, uint32_t, svuint32_t, ++ x0 = sveorv_u32 (p0, z0), ++ x0 = sveorv (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eorv_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eorv_u64.c +new file mode 100644 +index 000000000..5f8b8f86b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eorv_u64.c +@@ -0,0 +1,13 @@ ++/* { dg-final 
{ check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** eorv_x0_u64: ++** eorv (d[0-9]+), p0, z0\.d ++** fmov x0, \1 ++** ret ++*/ ++TEST_REDUCTION_X (eorv_x0_u64, uint64_t, svuint64_t, ++ x0 = sveorv_u64 (p0, z0), ++ x0 = sveorv (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eorv_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eorv_u8.c +new file mode 100644 +index 000000000..eed4d4915 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eorv_u8.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** eorv_x0_u8: ++** eorv b([0-9]+), p0, z0\.b ++** umov w0, v\1\.b\[0\] ++** ret ++*/ ++TEST_REDUCTION_X (eorv_x0_u8, uint8_t, svuint8_t, ++ x0 = sveorv_u8 (p0, z0), ++ x0 = sveorv (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/expa_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/expa_f16.c +new file mode 100644 +index 000000000..5a5411e46 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/expa_f16.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** expa_f16_tied1: ++** fexpa z0\.h, z0\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (expa_f16_tied1, svfloat16_t, svuint16_t, ++ z0_res = svexpa_f16 (z0), ++ z0_res = svexpa (z0)) ++ ++/* ++** expa_f16_untied: ++** fexpa z0\.h, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (expa_f16_untied, svfloat16_t, svuint16_t, ++ z0 = svexpa_f16 (z4), ++ z0 = svexpa (z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/expa_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/expa_f32.c +new file mode 100644 +index 000000000..4ded1c575 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/expa_f32.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** expa_f32_tied1: ++** fexpa z0\.s, z0\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (expa_f32_tied1, svfloat32_t, svuint32_t, ++ z0_res = svexpa_f32 (z0), ++ z0_res = svexpa (z0)) ++ ++/* ++** expa_f32_untied: ++** fexpa z0\.s, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (expa_f32_untied, svfloat32_t, svuint32_t, ++ z0 = svexpa_f32 (z4), ++ z0 = svexpa (z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/expa_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/expa_f64.c +new file mode 100644 +index 000000000..c31f9ccb5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/expa_f64.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** expa_f64_tied1: ++** fexpa z0\.d, z0\.d ++** ret ++*/ ++TEST_DUAL_Z_REV (expa_f64_tied1, svfloat64_t, svuint64_t, ++ z0_res = svexpa_f64 (z0), ++ z0_res = svexpa (z0)) ++ ++/* ++** expa_f64_untied: ++** fexpa z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (expa_f64_untied, svfloat64_t, svuint64_t, ++ z0 = svexpa_f64 (z4), ++ z0 = svexpa (z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_bf16.c +new file mode 100644 +index 000000000..f982873c4 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_bf16.c +@@ -0,0 +1,73 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ext_0_bf16_tied1: ++** ext z0\.b, z0\.b, z1\.b, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_0_bf16_tied1, svbfloat16_t, 
++ z0 = svext_bf16 (z0, z1, 0), ++ z0 = svext (z0, z1, 0)) ++ ++/* ++** ext_0_bf16_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, \1\.b, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_0_bf16_tied2, svbfloat16_t, ++ z0 = svext_bf16 (z1, z0, 0), ++ z0 = svext (z1, z0, 0)) ++ ++/* ++** ext_0_bf16_untied: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_0_bf16_untied, svbfloat16_t, ++ z0 = svext_bf16 (z1, z2, 0), ++ z0 = svext (z1, z2, 0)) ++ ++/* ++** ext_1_bf16: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #2 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_1_bf16, svbfloat16_t, ++ z0 = svext_bf16 (z1, z2, 1), ++ z0 = svext (z1, z2, 1)) ++ ++/* ++** ext_2_bf16: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #4 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_2_bf16, svbfloat16_t, ++ z0 = svext_bf16 (z1, z2, 2), ++ z0 = svext (z1, z2, 2)) ++ ++/* ++** ext_3_bf16: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #6 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_3_bf16, svbfloat16_t, ++ z0 = svext_bf16 (z1, z2, 3), ++ z0 = svext (z1, z2, 3)) ++ ++/* ++** ext_127_bf16: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #254 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_127_bf16, svbfloat16_t, ++ z0 = svext_bf16 (z1, z2, 127), ++ z0 = svext (z1, z2, 127)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_f16.c +new file mode 100644 +index 000000000..d8edccb9f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_f16.c +@@ -0,0 +1,73 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ext_0_f16_tied1: ++** ext z0\.b, z0\.b, z1\.b, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_0_f16_tied1, svfloat16_t, ++ z0 = svext_f16 (z0, z1, 0), ++ z0 = svext (z0, z1, 0)) ++ ++/* ++** ext_0_f16_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, \1\.b, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_0_f16_tied2, svfloat16_t, ++ z0 = svext_f16 (z1, z0, 0), ++ z0 = svext (z1, z0, 0)) ++ ++/* ++** ext_0_f16_untied: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_0_f16_untied, svfloat16_t, ++ z0 = svext_f16 (z1, z2, 0), ++ z0 = svext (z1, z2, 0)) ++ ++/* ++** ext_1_f16: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #2 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_1_f16, svfloat16_t, ++ z0 = svext_f16 (z1, z2, 1), ++ z0 = svext (z1, z2, 1)) ++ ++/* ++** ext_2_f16: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #4 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_2_f16, svfloat16_t, ++ z0 = svext_f16 (z1, z2, 2), ++ z0 = svext (z1, z2, 2)) ++ ++/* ++** ext_3_f16: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #6 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_3_f16, svfloat16_t, ++ z0 = svext_f16 (z1, z2, 3), ++ z0 = svext (z1, z2, 3)) ++ ++/* ++** ext_127_f16: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #254 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_127_f16, svfloat16_t, ++ z0 = svext_f16 (z1, z2, 127), ++ z0 = svext (z1, z2, 127)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_f32.c +new file mode 100644 +index 000000000..c00ea06fb +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_f32.c +@@ -0,0 +1,73 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ext_0_f32_tied1: ++** ext z0\.b, z0\.b, z1\.b, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_0_f32_tied1, 
svfloat32_t, ++ z0 = svext_f32 (z0, z1, 0), ++ z0 = svext (z0, z1, 0)) ++ ++/* ++** ext_0_f32_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, \1\.b, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_0_f32_tied2, svfloat32_t, ++ z0 = svext_f32 (z1, z0, 0), ++ z0 = svext (z1, z0, 0)) ++ ++/* ++** ext_0_f32_untied: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_0_f32_untied, svfloat32_t, ++ z0 = svext_f32 (z1, z2, 0), ++ z0 = svext (z1, z2, 0)) ++ ++/* ++** ext_1_f32: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #4 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_1_f32, svfloat32_t, ++ z0 = svext_f32 (z1, z2, 1), ++ z0 = svext (z1, z2, 1)) ++ ++/* ++** ext_2_f32: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #8 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_2_f32, svfloat32_t, ++ z0 = svext_f32 (z1, z2, 2), ++ z0 = svext (z1, z2, 2)) ++ ++/* ++** ext_3_f32: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #12 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_3_f32, svfloat32_t, ++ z0 = svext_f32 (z1, z2, 3), ++ z0 = svext (z1, z2, 3)) ++ ++/* ++** ext_63_f32: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #252 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_63_f32, svfloat32_t, ++ z0 = svext_f32 (z1, z2, 63), ++ z0 = svext (z1, z2, 63)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_f64.c +new file mode 100644 +index 000000000..af72870ca +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_f64.c +@@ -0,0 +1,73 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ext_0_f64_tied1: ++** ext z0\.b, z0\.b, z1\.b, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_0_f64_tied1, svfloat64_t, ++ z0 = svext_f64 (z0, z1, 0), ++ z0 = svext (z0, z1, 0)) ++ ++/* ++** ext_0_f64_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, \1\.b, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_0_f64_tied2, svfloat64_t, ++ z0 = svext_f64 (z1, z0, 0), ++ z0 = svext (z1, z0, 0)) ++ ++/* ++** ext_0_f64_untied: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_0_f64_untied, svfloat64_t, ++ z0 = svext_f64 (z1, z2, 0), ++ z0 = svext (z1, z2, 0)) ++ ++/* ++** ext_1_f64: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #8 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_1_f64, svfloat64_t, ++ z0 = svext_f64 (z1, z2, 1), ++ z0 = svext (z1, z2, 1)) ++ ++/* ++** ext_2_f64: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #16 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_2_f64, svfloat64_t, ++ z0 = svext_f64 (z1, z2, 2), ++ z0 = svext (z1, z2, 2)) ++ ++/* ++** ext_3_f64: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #24 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_3_f64, svfloat64_t, ++ z0 = svext_f64 (z1, z2, 3), ++ z0 = svext (z1, z2, 3)) ++ ++/* ++** ext_31_f64: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #248 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_31_f64, svfloat64_t, ++ z0 = svext_f64 (z1, z2, 31), ++ z0 = svext (z1, z2, 31)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_s16.c +new file mode 100644 +index 000000000..a7c4484ac +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_s16.c +@@ -0,0 +1,73 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ext_0_s16_tied1: ++** ext z0\.b, z0\.b, z1\.b, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_0_s16_tied1, svint16_t, ++ z0 = 
svext_s16 (z0, z1, 0), ++ z0 = svext (z0, z1, 0)) ++ ++/* ++** ext_0_s16_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, \1\.b, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_0_s16_tied2, svint16_t, ++ z0 = svext_s16 (z1, z0, 0), ++ z0 = svext (z1, z0, 0)) ++ ++/* ++** ext_0_s16_untied: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_0_s16_untied, svint16_t, ++ z0 = svext_s16 (z1, z2, 0), ++ z0 = svext (z1, z2, 0)) ++ ++/* ++** ext_1_s16: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #2 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_1_s16, svint16_t, ++ z0 = svext_s16 (z1, z2, 1), ++ z0 = svext (z1, z2, 1)) ++ ++/* ++** ext_2_s16: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #4 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_2_s16, svint16_t, ++ z0 = svext_s16 (z1, z2, 2), ++ z0 = svext (z1, z2, 2)) ++ ++/* ++** ext_3_s16: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #6 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_3_s16, svint16_t, ++ z0 = svext_s16 (z1, z2, 3), ++ z0 = svext (z1, z2, 3)) ++ ++/* ++** ext_127_s16: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #254 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_127_s16, svint16_t, ++ z0 = svext_s16 (z1, z2, 127), ++ z0 = svext (z1, z2, 127)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_s32.c +new file mode 100644 +index 000000000..68242a9ec +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_s32.c +@@ -0,0 +1,73 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ext_0_s32_tied1: ++** ext z0\.b, z0\.b, z1\.b, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_0_s32_tied1, svint32_t, ++ z0 = svext_s32 (z0, z1, 0), ++ z0 = svext (z0, z1, 0)) ++ ++/* ++** ext_0_s32_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, \1\.b, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_0_s32_tied2, svint32_t, ++ z0 = svext_s32 (z1, z0, 0), ++ z0 = svext (z1, z0, 0)) ++ ++/* ++** ext_0_s32_untied: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_0_s32_untied, svint32_t, ++ z0 = svext_s32 (z1, z2, 0), ++ z0 = svext (z1, z2, 0)) ++ ++/* ++** ext_1_s32: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #4 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_1_s32, svint32_t, ++ z0 = svext_s32 (z1, z2, 1), ++ z0 = svext (z1, z2, 1)) ++ ++/* ++** ext_2_s32: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #8 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_2_s32, svint32_t, ++ z0 = svext_s32 (z1, z2, 2), ++ z0 = svext (z1, z2, 2)) ++ ++/* ++** ext_3_s32: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #12 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_3_s32, svint32_t, ++ z0 = svext_s32 (z1, z2, 3), ++ z0 = svext (z1, z2, 3)) ++ ++/* ++** ext_63_s32: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #252 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_63_s32, svint32_t, ++ z0 = svext_s32 (z1, z2, 63), ++ z0 = svext (z1, z2, 63)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_s64.c +new file mode 100644 +index 000000000..8bdbd0561 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_s64.c +@@ -0,0 +1,73 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ext_0_s64_tied1: ++** ext z0\.b, z0\.b, z1\.b, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_0_s64_tied1, svint64_t, ++ z0 = svext_s64 (z0, z1, 0), ++ z0 = svext (z0, z1, 
0)) ++ ++/* ++** ext_0_s64_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, \1\.b, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_0_s64_tied2, svint64_t, ++ z0 = svext_s64 (z1, z0, 0), ++ z0 = svext (z1, z0, 0)) ++ ++/* ++** ext_0_s64_untied: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_0_s64_untied, svint64_t, ++ z0 = svext_s64 (z1, z2, 0), ++ z0 = svext (z1, z2, 0)) ++ ++/* ++** ext_1_s64: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #8 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_1_s64, svint64_t, ++ z0 = svext_s64 (z1, z2, 1), ++ z0 = svext (z1, z2, 1)) ++ ++/* ++** ext_2_s64: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #16 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_2_s64, svint64_t, ++ z0 = svext_s64 (z1, z2, 2), ++ z0 = svext (z1, z2, 2)) ++ ++/* ++** ext_3_s64: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #24 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_3_s64, svint64_t, ++ z0 = svext_s64 (z1, z2, 3), ++ z0 = svext (z1, z2, 3)) ++ ++/* ++** ext_31_s64: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #248 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_31_s64, svint64_t, ++ z0 = svext_s64 (z1, z2, 31), ++ z0 = svext (z1, z2, 31)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_s8.c +new file mode 100644 +index 000000000..52490f00e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_s8.c +@@ -0,0 +1,73 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ext_0_s8_tied1: ++** ext z0\.b, z0\.b, z1\.b, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_0_s8_tied1, svint8_t, ++ z0 = svext_s8 (z0, z1, 0), ++ z0 = svext (z0, z1, 0)) ++ ++/* ++** ext_0_s8_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, \1\.b, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_0_s8_tied2, svint8_t, ++ z0 = svext_s8 (z1, z0, 0), ++ z0 = svext (z1, z0, 0)) ++ ++/* ++** ext_0_s8_untied: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_0_s8_untied, svint8_t, ++ z0 = svext_s8 (z1, z2, 0), ++ z0 = svext (z1, z2, 0)) ++ ++/* ++** ext_1_s8: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_1_s8, svint8_t, ++ z0 = svext_s8 (z1, z2, 1), ++ z0 = svext (z1, z2, 1)) ++ ++/* ++** ext_2_s8: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #2 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_2_s8, svint8_t, ++ z0 = svext_s8 (z1, z2, 2), ++ z0 = svext (z1, z2, 2)) ++ ++/* ++** ext_3_s8: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #3 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_3_s8, svint8_t, ++ z0 = svext_s8 (z1, z2, 3), ++ z0 = svext (z1, z2, 3)) ++ ++/* ++** ext_255_s8: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_255_s8, svint8_t, ++ z0 = svext_s8 (z1, z2, 255), ++ z0 = svext (z1, z2, 255)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_u16.c +new file mode 100644 +index 000000000..dc7574ffa +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_u16.c +@@ -0,0 +1,73 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ext_0_u16_tied1: ++** ext z0\.b, z0\.b, z1\.b, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_0_u16_tied1, svuint16_t, ++ z0 = svext_u16 (z0, z1, 0), ++ z0 = svext (z0, z1, 0)) ++ ++/* ++** ext_0_u16_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx 
z0, z1 ++** ext z0\.b, z0\.b, \1\.b, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_0_u16_tied2, svuint16_t, ++ z0 = svext_u16 (z1, z0, 0), ++ z0 = svext (z1, z0, 0)) ++ ++/* ++** ext_0_u16_untied: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_0_u16_untied, svuint16_t, ++ z0 = svext_u16 (z1, z2, 0), ++ z0 = svext (z1, z2, 0)) ++ ++/* ++** ext_1_u16: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #2 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_1_u16, svuint16_t, ++ z0 = svext_u16 (z1, z2, 1), ++ z0 = svext (z1, z2, 1)) ++ ++/* ++** ext_2_u16: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #4 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_2_u16, svuint16_t, ++ z0 = svext_u16 (z1, z2, 2), ++ z0 = svext (z1, z2, 2)) ++ ++/* ++** ext_3_u16: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #6 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_3_u16, svuint16_t, ++ z0 = svext_u16 (z1, z2, 3), ++ z0 = svext (z1, z2, 3)) ++ ++/* ++** ext_127_u16: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #254 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_127_u16, svuint16_t, ++ z0 = svext_u16 (z1, z2, 127), ++ z0 = svext (z1, z2, 127)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_u32.c +new file mode 100644 +index 000000000..0d417fc43 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_u32.c +@@ -0,0 +1,73 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ext_0_u32_tied1: ++** ext z0\.b, z0\.b, z1\.b, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_0_u32_tied1, svuint32_t, ++ z0 = svext_u32 (z0, z1, 0), ++ z0 = svext (z0, z1, 0)) ++ ++/* ++** ext_0_u32_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, \1\.b, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_0_u32_tied2, svuint32_t, ++ z0 = svext_u32 (z1, z0, 0), ++ z0 = svext (z1, z0, 0)) ++ ++/* ++** ext_0_u32_untied: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_0_u32_untied, svuint32_t, ++ z0 = svext_u32 (z1, z2, 0), ++ z0 = svext (z1, z2, 0)) ++ ++/* ++** ext_1_u32: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #4 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_1_u32, svuint32_t, ++ z0 = svext_u32 (z1, z2, 1), ++ z0 = svext (z1, z2, 1)) ++ ++/* ++** ext_2_u32: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #8 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_2_u32, svuint32_t, ++ z0 = svext_u32 (z1, z2, 2), ++ z0 = svext (z1, z2, 2)) ++ ++/* ++** ext_3_u32: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #12 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_3_u32, svuint32_t, ++ z0 = svext_u32 (z1, z2, 3), ++ z0 = svext (z1, z2, 3)) ++ ++/* ++** ext_63_u32: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #252 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_63_u32, svuint32_t, ++ z0 = svext_u32 (z1, z2, 63), ++ z0 = svext (z1, z2, 63)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_u64.c +new file mode 100644 +index 000000000..ed81f811e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_u64.c +@@ -0,0 +1,73 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ext_0_u64_tied1: ++** ext z0\.b, z0\.b, z1\.b, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_0_u64_tied1, svuint64_t, ++ z0 = svext_u64 (z0, z1, 0), ++ z0 = svext (z0, z1, 0)) ++ ++/* ++** ext_0_u64_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, 
\1\.b, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_0_u64_tied2, svuint64_t, ++ z0 = svext_u64 (z1, z0, 0), ++ z0 = svext (z1, z0, 0)) ++ ++/* ++** ext_0_u64_untied: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_0_u64_untied, svuint64_t, ++ z0 = svext_u64 (z1, z2, 0), ++ z0 = svext (z1, z2, 0)) ++ ++/* ++** ext_1_u64: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #8 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_1_u64, svuint64_t, ++ z0 = svext_u64 (z1, z2, 1), ++ z0 = svext (z1, z2, 1)) ++ ++/* ++** ext_2_u64: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #16 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_2_u64, svuint64_t, ++ z0 = svext_u64 (z1, z2, 2), ++ z0 = svext (z1, z2, 2)) ++ ++/* ++** ext_3_u64: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #24 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_3_u64, svuint64_t, ++ z0 = svext_u64 (z1, z2, 3), ++ z0 = svext (z1, z2, 3)) ++ ++/* ++** ext_31_u64: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #248 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_31_u64, svuint64_t, ++ z0 = svext_u64 (z1, z2, 31), ++ z0 = svext (z1, z2, 31)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_u8.c +new file mode 100644 +index 000000000..6c061406b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_u8.c +@@ -0,0 +1,73 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ext_0_u8_tied1: ++** ext z0\.b, z0\.b, z1\.b, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_0_u8_tied1, svuint8_t, ++ z0 = svext_u8 (z0, z1, 0), ++ z0 = svext (z0, z1, 0)) ++ ++/* ++** ext_0_u8_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, \1\.b, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_0_u8_tied2, svuint8_t, ++ z0 = svext_u8 (z1, z0, 0), ++ z0 = svext (z1, z0, 0)) ++ ++/* ++** ext_0_u8_untied: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_0_u8_untied, svuint8_t, ++ z0 = svext_u8 (z1, z2, 0), ++ z0 = svext (z1, z2, 0)) ++ ++/* ++** ext_1_u8: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_1_u8, svuint8_t, ++ z0 = svext_u8 (z1, z2, 1), ++ z0 = svext (z1, z2, 1)) ++ ++/* ++** ext_2_u8: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #2 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_2_u8, svuint8_t, ++ z0 = svext_u8 (z1, z2, 2), ++ z0 = svext (z1, z2, 2)) ++ ++/* ++** ext_3_u8: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #3 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_3_u8, svuint8_t, ++ z0 = svext_u8 (z1, z2, 3), ++ z0 = svext (z1, z2, 3)) ++ ++/* ++** ext_255_u8: ++** movprfx z0, z1 ++** ext z0\.b, z0\.b, z2\.b, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (ext_255_u8, svuint8_t, ++ z0 = svext_u8 (z1, z2, 255), ++ z0 = svext (z1, z2, 255)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/extb_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/extb_s16.c +new file mode 100644 +index 000000000..32e836f01 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/extb_s16.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** extb_s16_m_tied12: ++** sxtb z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (extb_s16_m_tied12, svint16_t, ++ z0 = svextb_s16_m (z0, p0, z0), ++ z0 = svextb_m (z0, p0, z0)) ++ ++/* ++** extb_s16_m_tied1: ++** sxtb z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (extb_s16_m_tied1, svint16_t, ++ z0 = svextb_s16_m (z0, p0, z1), 
++ z0 = svextb_m (z0, p0, z1)) ++ ++/* ++** extb_s16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** sxtb z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (extb_s16_m_tied2, svint16_t, ++ z0 = svextb_s16_m (z1, p0, z0), ++ z0 = svextb_m (z1, p0, z0)) ++ ++/* ++** extb_s16_m_untied: ++** movprfx z0, z2 ++** sxtb z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (extb_s16_m_untied, svint16_t, ++ z0 = svextb_s16_m (z2, p0, z1), ++ z0 = svextb_m (z2, p0, z1)) ++ ++/* ++** extb_s16_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.h, p0/z, \1\.h ++** sxtb z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (extb_s16_z_tied1, svint16_t, ++ z0 = svextb_s16_z (p0, z0), ++ z0 = svextb_z (p0, z0)) ++ ++/* ++** extb_s16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** sxtb z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (extb_s16_z_untied, svint16_t, ++ z0 = svextb_s16_z (p0, z1), ++ z0 = svextb_z (p0, z1)) ++ ++/* ++** extb_s16_x_tied1: ++** sxtb z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (extb_s16_x_tied1, svint16_t, ++ z0 = svextb_s16_x (p0, z0), ++ z0 = svextb_x (p0, z0)) ++ ++/* ++** extb_s16_x_untied: ++** sxtb z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (extb_s16_x_untied, svint16_t, ++ z0 = svextb_s16_x (p0, z1), ++ z0 = svextb_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/extb_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/extb_s32.c +new file mode 100644 +index 000000000..e2f13f41c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/extb_s32.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** extb_s32_m_tied12: ++** sxtb z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (extb_s32_m_tied12, svint32_t, ++ z0 = svextb_s32_m (z0, p0, z0), ++ z0 = svextb_m (z0, p0, z0)) ++ ++/* ++** extb_s32_m_tied1: ++** sxtb z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (extb_s32_m_tied1, svint32_t, ++ z0 = svextb_s32_m (z0, p0, z1), ++ z0 = svextb_m (z0, p0, z1)) ++ ++/* ++** extb_s32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** sxtb z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (extb_s32_m_tied2, svint32_t, ++ z0 = svextb_s32_m (z1, p0, z0), ++ z0 = svextb_m (z1, p0, z0)) ++ ++/* ++** extb_s32_m_untied: ++** movprfx z0, z2 ++** sxtb z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (extb_s32_m_untied, svint32_t, ++ z0 = svextb_s32_m (z2, p0, z1), ++ z0 = svextb_m (z2, p0, z1)) ++ ++/* ++** extb_s32_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, \1\.s ++** sxtb z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (extb_s32_z_tied1, svint32_t, ++ z0 = svextb_s32_z (p0, z0), ++ z0 = svextb_z (p0, z0)) ++ ++/* ++** extb_s32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** sxtb z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (extb_s32_z_untied, svint32_t, ++ z0 = svextb_s32_z (p0, z1), ++ z0 = svextb_z (p0, z1)) ++ ++/* ++** extb_s32_x_tied1: ++** sxtb z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (extb_s32_x_tied1, svint32_t, ++ z0 = svextb_s32_x (p0, z0), ++ z0 = svextb_x (p0, z0)) ++ ++/* ++** extb_s32_x_untied: ++** sxtb z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (extb_s32_x_untied, svint32_t, ++ z0 = svextb_s32_x (p0, z1), ++ z0 = svextb_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/extb_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/extb_s64.c +new file mode 100644 +index 000000000..83363efdb +--- /dev/null ++++ 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/extb_s64.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** extb_s64_m_tied12: ++** sxtb z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (extb_s64_m_tied12, svint64_t, ++ z0 = svextb_s64_m (z0, p0, z0), ++ z0 = svextb_m (z0, p0, z0)) ++ ++/* ++** extb_s64_m_tied1: ++** sxtb z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (extb_s64_m_tied1, svint64_t, ++ z0 = svextb_s64_m (z0, p0, z1), ++ z0 = svextb_m (z0, p0, z1)) ++ ++/* ++** extb_s64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** sxtb z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (extb_s64_m_tied2, svint64_t, ++ z0 = svextb_s64_m (z1, p0, z0), ++ z0 = svextb_m (z1, p0, z0)) ++ ++/* ++** extb_s64_m_untied: ++** movprfx z0, z2 ++** sxtb z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (extb_s64_m_untied, svint64_t, ++ z0 = svextb_s64_m (z2, p0, z1), ++ z0 = svextb_m (z2, p0, z1)) ++ ++/* ++** extb_s64_z_tied1: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** sxtb z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (extb_s64_z_tied1, svint64_t, ++ z0 = svextb_s64_z (p0, z0), ++ z0 = svextb_z (p0, z0)) ++ ++/* ++** extb_s64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** sxtb z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (extb_s64_z_untied, svint64_t, ++ z0 = svextb_s64_z (p0, z1), ++ z0 = svextb_z (p0, z1)) ++ ++/* ++** extb_s64_x_tied1: ++** sxtb z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (extb_s64_x_tied1, svint64_t, ++ z0 = svextb_s64_x (p0, z0), ++ z0 = svextb_x (p0, z0)) ++ ++/* ++** extb_s64_x_untied: ++** sxtb z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (extb_s64_x_untied, svint64_t, ++ z0 = svextb_s64_x (p0, z1), ++ z0 = svextb_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/extb_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/extb_u16.c +new file mode 100644 +index 000000000..d806edfaa +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/extb_u16.c +@@ -0,0 +1,82 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** extb_u16_m_tied12: ++** uxtb z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (extb_u16_m_tied12, svuint16_t, ++ z0 = svextb_u16_m (z0, p0, z0), ++ z0 = svextb_m (z0, p0, z0)) ++ ++/* ++** extb_u16_m_tied1: ++** uxtb z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (extb_u16_m_tied1, svuint16_t, ++ z0 = svextb_u16_m (z0, p0, z1), ++ z0 = svextb_m (z0, p0, z1)) ++ ++/* ++** extb_u16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** uxtb z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (extb_u16_m_tied2, svuint16_t, ++ z0 = svextb_u16_m (z1, p0, z0), ++ z0 = svextb_m (z1, p0, z0)) ++ ++/* ++** extb_u16_m_untied: ++** movprfx z0, z2 ++** uxtb z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (extb_u16_m_untied, svuint16_t, ++ z0 = svextb_u16_m (z2, p0, z1), ++ z0 = svextb_m (z2, p0, z1)) ++ ++/* ++** extb_u16_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.h, p0/z, \1\.h ++** uxtb z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (extb_u16_z_tied1, svuint16_t, ++ z0 = svextb_u16_z (p0, z0), ++ z0 = svextb_z (p0, z0)) ++ ++/* ++** extb_u16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** uxtb z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (extb_u16_z_untied, svuint16_t, ++ z0 = svextb_u16_z (p0, z1), ++ z0 = svextb_z (p0, z1)) ++ ++/* ++** extb_u16_x_tied1: ++** and z0\.h, z0\.h, #0xff ++** ret ++*/ ++TEST_UNIFORM_Z 
(extb_u16_x_tied1, svuint16_t, ++ z0 = svextb_u16_x (p0, z0), ++ z0 = svextb_x (p0, z0)) ++ ++/* ++** extb_u16_x_untied: ++** movprfx z0, z1 ++** and z0\.h, z0\.h, #0xff ++** ret ++*/ ++TEST_UNIFORM_Z (extb_u16_x_untied, svuint16_t, ++ z0 = svextb_u16_x (p0, z1), ++ z0 = svextb_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/extb_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/extb_u32.c +new file mode 100644 +index 000000000..274656dbd +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/extb_u32.c +@@ -0,0 +1,82 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** extb_u32_m_tied12: ++** uxtb z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (extb_u32_m_tied12, svuint32_t, ++ z0 = svextb_u32_m (z0, p0, z0), ++ z0 = svextb_m (z0, p0, z0)) ++ ++/* ++** extb_u32_m_tied1: ++** uxtb z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (extb_u32_m_tied1, svuint32_t, ++ z0 = svextb_u32_m (z0, p0, z1), ++ z0 = svextb_m (z0, p0, z1)) ++ ++/* ++** extb_u32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** uxtb z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (extb_u32_m_tied2, svuint32_t, ++ z0 = svextb_u32_m (z1, p0, z0), ++ z0 = svextb_m (z1, p0, z0)) ++ ++/* ++** extb_u32_m_untied: ++** movprfx z0, z2 ++** uxtb z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (extb_u32_m_untied, svuint32_t, ++ z0 = svextb_u32_m (z2, p0, z1), ++ z0 = svextb_m (z2, p0, z1)) ++ ++/* ++** extb_u32_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, \1\.s ++** uxtb z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (extb_u32_z_tied1, svuint32_t, ++ z0 = svextb_u32_z (p0, z0), ++ z0 = svextb_z (p0, z0)) ++ ++/* ++** extb_u32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** uxtb z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (extb_u32_z_untied, svuint32_t, ++ z0 = svextb_u32_z (p0, z1), ++ z0 = svextb_z (p0, z1)) ++ ++/* ++** extb_u32_x_tied1: ++** and z0\.s, z0\.s, #0xff ++** ret ++*/ ++TEST_UNIFORM_Z (extb_u32_x_tied1, svuint32_t, ++ z0 = svextb_u32_x (p0, z0), ++ z0 = svextb_x (p0, z0)) ++ ++/* ++** extb_u32_x_untied: ++** movprfx z0, z1 ++** and z0\.s, z0\.s, #0xff ++** ret ++*/ ++TEST_UNIFORM_Z (extb_u32_x_untied, svuint32_t, ++ z0 = svextb_u32_x (p0, z1), ++ z0 = svextb_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/extb_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/extb_u64.c +new file mode 100644 +index 000000000..de24cc605 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/extb_u64.c +@@ -0,0 +1,82 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** extb_u64_m_tied12: ++** uxtb z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (extb_u64_m_tied12, svuint64_t, ++ z0 = svextb_u64_m (z0, p0, z0), ++ z0 = svextb_m (z0, p0, z0)) ++ ++/* ++** extb_u64_m_tied1: ++** uxtb z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (extb_u64_m_tied1, svuint64_t, ++ z0 = svextb_u64_m (z0, p0, z1), ++ z0 = svextb_m (z0, p0, z1)) ++ ++/* ++** extb_u64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** uxtb z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (extb_u64_m_tied2, svuint64_t, ++ z0 = svextb_u64_m (z1, p0, z0), ++ z0 = svextb_m (z1, p0, z0)) ++ ++/* ++** extb_u64_m_untied: ++** movprfx z0, z2 ++** uxtb z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (extb_u64_m_untied, svuint64_t, ++ z0 = svextb_u64_m (z2, p0, z1), ++ z0 = svextb_m (z2, p0, z1)) ++ ++/* ++** 
extb_u64_z_tied1: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** uxtb z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (extb_u64_z_tied1, svuint64_t, ++ z0 = svextb_u64_z (p0, z0), ++ z0 = svextb_z (p0, z0)) ++ ++/* ++** extb_u64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** uxtb z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (extb_u64_z_untied, svuint64_t, ++ z0 = svextb_u64_z (p0, z1), ++ z0 = svextb_z (p0, z1)) ++ ++/* ++** extb_u64_x_tied1: ++** and z0\.d, z0\.d, #0xff ++** ret ++*/ ++TEST_UNIFORM_Z (extb_u64_x_tied1, svuint64_t, ++ z0 = svextb_u64_x (p0, z0), ++ z0 = svextb_x (p0, z0)) ++ ++/* ++** extb_u64_x_untied: ++** movprfx z0, z1 ++** and z0\.d, z0\.d, #0xff ++** ret ++*/ ++TEST_UNIFORM_Z (extb_u64_x_untied, svuint64_t, ++ z0 = svextb_u64_x (p0, z1), ++ z0 = svextb_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/exth_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/exth_s32.c +new file mode 100644 +index 000000000..3bb0bf31f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/exth_s32.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** exth_s32_m_tied12: ++** sxth z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (exth_s32_m_tied12, svint32_t, ++ z0 = svexth_s32_m (z0, p0, z0), ++ z0 = svexth_m (z0, p0, z0)) ++ ++/* ++** exth_s32_m_tied1: ++** sxth z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (exth_s32_m_tied1, svint32_t, ++ z0 = svexth_s32_m (z0, p0, z1), ++ z0 = svexth_m (z0, p0, z1)) ++ ++/* ++** exth_s32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** sxth z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (exth_s32_m_tied2, svint32_t, ++ z0 = svexth_s32_m (z1, p0, z0), ++ z0 = svexth_m (z1, p0, z0)) ++ ++/* ++** exth_s32_m_untied: ++** movprfx z0, z2 ++** sxth z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (exth_s32_m_untied, svint32_t, ++ z0 = svexth_s32_m (z2, p0, z1), ++ z0 = svexth_m (z2, p0, z1)) ++ ++/* ++** exth_s32_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, \1\.s ++** sxth z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (exth_s32_z_tied1, svint32_t, ++ z0 = svexth_s32_z (p0, z0), ++ z0 = svexth_z (p0, z0)) ++ ++/* ++** exth_s32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** sxth z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (exth_s32_z_untied, svint32_t, ++ z0 = svexth_s32_z (p0, z1), ++ z0 = svexth_z (p0, z1)) ++ ++/* ++** exth_s32_x_tied1: ++** sxth z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (exth_s32_x_tied1, svint32_t, ++ z0 = svexth_s32_x (p0, z0), ++ z0 = svexth_x (p0, z0)) ++ ++/* ++** exth_s32_x_untied: ++** sxth z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (exth_s32_x_untied, svint32_t, ++ z0 = svexth_s32_x (p0, z1), ++ z0 = svexth_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/exth_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/exth_s64.c +new file mode 100644 +index 000000000..0718b67ad +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/exth_s64.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** exth_s64_m_tied12: ++** sxth z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (exth_s64_m_tied12, svint64_t, ++ z0 = svexth_s64_m (z0, p0, z0), ++ z0 = svexth_m (z0, p0, z0)) ++ ++/* ++** exth_s64_m_tied1: ++** sxth z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (exth_s64_m_tied1, svint64_t, ++ z0 = svexth_s64_m (z0, p0, z1), 
++ z0 = svexth_m (z0, p0, z1)) ++ ++/* ++** exth_s64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** sxth z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (exth_s64_m_tied2, svint64_t, ++ z0 = svexth_s64_m (z1, p0, z0), ++ z0 = svexth_m (z1, p0, z0)) ++ ++/* ++** exth_s64_m_untied: ++** movprfx z0, z2 ++** sxth z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (exth_s64_m_untied, svint64_t, ++ z0 = svexth_s64_m (z2, p0, z1), ++ z0 = svexth_m (z2, p0, z1)) ++ ++/* ++** exth_s64_z_tied1: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** sxth z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (exth_s64_z_tied1, svint64_t, ++ z0 = svexth_s64_z (p0, z0), ++ z0 = svexth_z (p0, z0)) ++ ++/* ++** exth_s64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** sxth z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (exth_s64_z_untied, svint64_t, ++ z0 = svexth_s64_z (p0, z1), ++ z0 = svexth_z (p0, z1)) ++ ++/* ++** exth_s64_x_tied1: ++** sxth z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (exth_s64_x_tied1, svint64_t, ++ z0 = svexth_s64_x (p0, z0), ++ z0 = svexth_x (p0, z0)) ++ ++/* ++** exth_s64_x_untied: ++** sxth z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (exth_s64_x_untied, svint64_t, ++ z0 = svexth_s64_x (p0, z1), ++ z0 = svexth_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/exth_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/exth_u32.c +new file mode 100644 +index 000000000..1ba7fc8c3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/exth_u32.c +@@ -0,0 +1,82 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** exth_u32_m_tied12: ++** uxth z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (exth_u32_m_tied12, svuint32_t, ++ z0 = svexth_u32_m (z0, p0, z0), ++ z0 = svexth_m (z0, p0, z0)) ++ ++/* ++** exth_u32_m_tied1: ++** uxth z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (exth_u32_m_tied1, svuint32_t, ++ z0 = svexth_u32_m (z0, p0, z1), ++ z0 = svexth_m (z0, p0, z1)) ++ ++/* ++** exth_u32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** uxth z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (exth_u32_m_tied2, svuint32_t, ++ z0 = svexth_u32_m (z1, p0, z0), ++ z0 = svexth_m (z1, p0, z0)) ++ ++/* ++** exth_u32_m_untied: ++** movprfx z0, z2 ++** uxth z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (exth_u32_m_untied, svuint32_t, ++ z0 = svexth_u32_m (z2, p0, z1), ++ z0 = svexth_m (z2, p0, z1)) ++ ++/* ++** exth_u32_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, \1\.s ++** uxth z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (exth_u32_z_tied1, svuint32_t, ++ z0 = svexth_u32_z (p0, z0), ++ z0 = svexth_z (p0, z0)) ++ ++/* ++** exth_u32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** uxth z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (exth_u32_z_untied, svuint32_t, ++ z0 = svexth_u32_z (p0, z1), ++ z0 = svexth_z (p0, z1)) ++ ++/* ++** exth_u32_x_tied1: ++** and z0\.s, z0\.s, #0xffff ++** ret ++*/ ++TEST_UNIFORM_Z (exth_u32_x_tied1, svuint32_t, ++ z0 = svexth_u32_x (p0, z0), ++ z0 = svexth_x (p0, z0)) ++ ++/* ++** exth_u32_x_untied: ++** movprfx z0, z1 ++** and z0\.s, z0\.s, #0xffff ++** ret ++*/ ++TEST_UNIFORM_Z (exth_u32_x_untied, svuint32_t, ++ z0 = svexth_u32_x (p0, z1), ++ z0 = svexth_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/exth_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/exth_u64.c +new file mode 100644 +index 000000000..1555cf0b7 +--- /dev/null ++++ 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/exth_u64.c +@@ -0,0 +1,82 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** exth_u64_m_tied12: ++** uxth z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (exth_u64_m_tied12, svuint64_t, ++ z0 = svexth_u64_m (z0, p0, z0), ++ z0 = svexth_m (z0, p0, z0)) ++ ++/* ++** exth_u64_m_tied1: ++** uxth z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (exth_u64_m_tied1, svuint64_t, ++ z0 = svexth_u64_m (z0, p0, z1), ++ z0 = svexth_m (z0, p0, z1)) ++ ++/* ++** exth_u64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** uxth z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (exth_u64_m_tied2, svuint64_t, ++ z0 = svexth_u64_m (z1, p0, z0), ++ z0 = svexth_m (z1, p0, z0)) ++ ++/* ++** exth_u64_m_untied: ++** movprfx z0, z2 ++** uxth z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (exth_u64_m_untied, svuint64_t, ++ z0 = svexth_u64_m (z2, p0, z1), ++ z0 = svexth_m (z2, p0, z1)) ++ ++/* ++** exth_u64_z_tied1: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** uxth z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (exth_u64_z_tied1, svuint64_t, ++ z0 = svexth_u64_z (p0, z0), ++ z0 = svexth_z (p0, z0)) ++ ++/* ++** exth_u64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** uxth z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (exth_u64_z_untied, svuint64_t, ++ z0 = svexth_u64_z (p0, z1), ++ z0 = svexth_z (p0, z1)) ++ ++/* ++** exth_u64_x_tied1: ++** and z0\.d, z0\.d, #0xffff ++** ret ++*/ ++TEST_UNIFORM_Z (exth_u64_x_tied1, svuint64_t, ++ z0 = svexth_u64_x (p0, z0), ++ z0 = svexth_x (p0, z0)) ++ ++/* ++** exth_u64_x_untied: ++** movprfx z0, z1 ++** and z0\.d, z0\.d, #0xffff ++** ret ++*/ ++TEST_UNIFORM_Z (exth_u64_x_untied, svuint64_t, ++ z0 = svexth_u64_x (p0, z1), ++ z0 = svexth_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/extw_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/extw_s64.c +new file mode 100644 +index 000000000..a6edadfa7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/extw_s64.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** extw_s64_m_tied12: ++** sxtw z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (extw_s64_m_tied12, svint64_t, ++ z0 = svextw_s64_m (z0, p0, z0), ++ z0 = svextw_m (z0, p0, z0)) ++ ++/* ++** extw_s64_m_tied1: ++** sxtw z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (extw_s64_m_tied1, svint64_t, ++ z0 = svextw_s64_m (z0, p0, z1), ++ z0 = svextw_m (z0, p0, z1)) ++ ++/* ++** extw_s64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** sxtw z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (extw_s64_m_tied2, svint64_t, ++ z0 = svextw_s64_m (z1, p0, z0), ++ z0 = svextw_m (z1, p0, z0)) ++ ++/* ++** extw_s64_m_untied: ++** movprfx z0, z2 ++** sxtw z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (extw_s64_m_untied, svint64_t, ++ z0 = svextw_s64_m (z2, p0, z1), ++ z0 = svextw_m (z2, p0, z1)) ++ ++/* ++** extw_s64_z_tied1: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** sxtw z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (extw_s64_z_tied1, svint64_t, ++ z0 = svextw_s64_z (p0, z0), ++ z0 = svextw_z (p0, z0)) ++ ++/* ++** extw_s64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** sxtw z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (extw_s64_z_untied, svint64_t, ++ z0 = svextw_s64_z (p0, z1), ++ z0 = svextw_z (p0, z1)) ++ ++/* ++** extw_s64_x_tied1: ++** sxtw z0\.d, p0/m, z0\.d ++** ret ++*/ 
++TEST_UNIFORM_Z (extw_s64_x_tied1, svint64_t, ++ z0 = svextw_s64_x (p0, z0), ++ z0 = svextw_x (p0, z0)) ++ ++/* ++** extw_s64_x_untied: ++** sxtw z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (extw_s64_x_untied, svint64_t, ++ z0 = svextw_s64_x (p0, z1), ++ z0 = svextw_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/extw_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/extw_u64.c +new file mode 100644 +index 000000000..880a287f3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/extw_u64.c +@@ -0,0 +1,82 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** extw_u64_m_tied12: ++** uxtw z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (extw_u64_m_tied12, svuint64_t, ++ z0 = svextw_u64_m (z0, p0, z0), ++ z0 = svextw_m (z0, p0, z0)) ++ ++/* ++** extw_u64_m_tied1: ++** uxtw z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (extw_u64_m_tied1, svuint64_t, ++ z0 = svextw_u64_m (z0, p0, z1), ++ z0 = svextw_m (z0, p0, z1)) ++ ++/* ++** extw_u64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** uxtw z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (extw_u64_m_tied2, svuint64_t, ++ z0 = svextw_u64_m (z1, p0, z0), ++ z0 = svextw_m (z1, p0, z0)) ++ ++/* ++** extw_u64_m_untied: ++** movprfx z0, z2 ++** uxtw z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (extw_u64_m_untied, svuint64_t, ++ z0 = svextw_u64_m (z2, p0, z1), ++ z0 = svextw_m (z2, p0, z1)) ++ ++/* ++** extw_u64_z_tied1: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** uxtw z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (extw_u64_z_tied1, svuint64_t, ++ z0 = svextw_u64_z (p0, z0), ++ z0 = svextw_z (p0, z0)) ++ ++/* ++** extw_u64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** uxtw z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (extw_u64_z_untied, svuint64_t, ++ z0 = svextw_u64_z (p0, z1), ++ z0 = svextw_z (p0, z1)) ++ ++/* ++** extw_u64_x_tied1: ++** and z0\.d, z0\.d, #0xffffffff ++** ret ++*/ ++TEST_UNIFORM_Z (extw_u64_x_tied1, svuint64_t, ++ z0 = svextw_u64_x (p0, z0), ++ z0 = svextw_x (p0, z0)) ++ ++/* ++** extw_u64_x_untied: ++** movprfx z0, z1 ++** and z0\.d, z0\.d, #0xffffffff ++** ret ++*/ ++TEST_UNIFORM_Z (extw_u64_x_untied, svuint64_t, ++ z0 = svextw_u64_x (p0, z1), ++ z0 = svextw_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_bf16.c +new file mode 100644 +index 000000000..6e5c773b5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_bf16.c +@@ -0,0 +1,55 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** get2_bf16_z0_0: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_GET (get2_bf16_z0_0, svbfloat16x2_t, svbfloat16_t, ++ z0 = svget2_bf16 (z4, 0), ++ z0 = svget2 (z4, 0)) ++ ++/* ++** get2_bf16_z0_1: ++** mov z0\.d, z5\.d ++** ret ++*/ ++TEST_GET (get2_bf16_z0_1, svbfloat16x2_t, svbfloat16_t, ++ z0 = svget2_bf16 (z4, 1), ++ z0 = svget2 (z4, 1)) ++ ++/* ++** get2_bf16_z4_0: ++** ret ++*/ ++TEST_GET (get2_bf16_z4_0, svbfloat16x2_t, svbfloat16_t, ++ z4_res = svget2_bf16 (z4, 0), ++ z4_res = svget2 (z4, 0)) ++ ++/* ++** get2_bf16_z4_1: ++** mov z4\.d, z5\.d ++** ret ++*/ ++TEST_GET (get2_bf16_z4_1, svbfloat16x2_t, svbfloat16_t, ++ z4_res = svget2_bf16 (z4, 1), ++ z4_res = svget2 (z4, 1)) ++ ++/* ++** get2_bf16_z5_0: ++** mov z5\.d, z4\.d ++** ret ++*/ ++TEST_GET (get2_bf16_z5_0, svbfloat16x2_t, svbfloat16_t, ++ z5_res = svget2_bf16 
(z4, 0), ++ z5_res = svget2 (z4, 0)) ++ ++/* ++** get2_bf16_z5_1: ++** ret ++*/ ++TEST_GET (get2_bf16_z5_1, svbfloat16x2_t, svbfloat16_t, ++ z5_res = svget2_bf16 (z4, 1), ++ z5_res = svget2 (z4, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_f16.c +new file mode 100644 +index 000000000..9b6379e0b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_f16.c +@@ -0,0 +1,55 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** get2_f16_z0_0: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_GET (get2_f16_z0_0, svfloat16x2_t, svfloat16_t, ++ z0 = svget2_f16 (z4, 0), ++ z0 = svget2 (z4, 0)) ++ ++/* ++** get2_f16_z0_1: ++** mov z0\.d, z5\.d ++** ret ++*/ ++TEST_GET (get2_f16_z0_1, svfloat16x2_t, svfloat16_t, ++ z0 = svget2_f16 (z4, 1), ++ z0 = svget2 (z4, 1)) ++ ++/* ++** get2_f16_z4_0: ++** ret ++*/ ++TEST_GET (get2_f16_z4_0, svfloat16x2_t, svfloat16_t, ++ z4_res = svget2_f16 (z4, 0), ++ z4_res = svget2 (z4, 0)) ++ ++/* ++** get2_f16_z4_1: ++** mov z4\.d, z5\.d ++** ret ++*/ ++TEST_GET (get2_f16_z4_1, svfloat16x2_t, svfloat16_t, ++ z4_res = svget2_f16 (z4, 1), ++ z4_res = svget2 (z4, 1)) ++ ++/* ++** get2_f16_z5_0: ++** mov z5\.d, z4\.d ++** ret ++*/ ++TEST_GET (get2_f16_z5_0, svfloat16x2_t, svfloat16_t, ++ z5_res = svget2_f16 (z4, 0), ++ z5_res = svget2 (z4, 0)) ++ ++/* ++** get2_f16_z5_1: ++** ret ++*/ ++TEST_GET (get2_f16_z5_1, svfloat16x2_t, svfloat16_t, ++ z5_res = svget2_f16 (z4, 1), ++ z5_res = svget2 (z4, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_f32.c +new file mode 100644 +index 000000000..76080dc66 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_f32.c +@@ -0,0 +1,55 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** get2_f32_z0_0: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_GET (get2_f32_z0_0, svfloat32x2_t, svfloat32_t, ++ z0 = svget2_f32 (z4, 0), ++ z0 = svget2 (z4, 0)) ++ ++/* ++** get2_f32_z0_1: ++** mov z0\.d, z5\.d ++** ret ++*/ ++TEST_GET (get2_f32_z0_1, svfloat32x2_t, svfloat32_t, ++ z0 = svget2_f32 (z4, 1), ++ z0 = svget2 (z4, 1)) ++ ++/* ++** get2_f32_z4_0: ++** ret ++*/ ++TEST_GET (get2_f32_z4_0, svfloat32x2_t, svfloat32_t, ++ z4_res = svget2_f32 (z4, 0), ++ z4_res = svget2 (z4, 0)) ++ ++/* ++** get2_f32_z4_1: ++** mov z4\.d, z5\.d ++** ret ++*/ ++TEST_GET (get2_f32_z4_1, svfloat32x2_t, svfloat32_t, ++ z4_res = svget2_f32 (z4, 1), ++ z4_res = svget2 (z4, 1)) ++ ++/* ++** get2_f32_z5_0: ++** mov z5\.d, z4\.d ++** ret ++*/ ++TEST_GET (get2_f32_z5_0, svfloat32x2_t, svfloat32_t, ++ z5_res = svget2_f32 (z4, 0), ++ z5_res = svget2 (z4, 0)) ++ ++/* ++** get2_f32_z5_1: ++** ret ++*/ ++TEST_GET (get2_f32_z5_1, svfloat32x2_t, svfloat32_t, ++ z5_res = svget2_f32 (z4, 1), ++ z5_res = svget2 (z4, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_f64.c +new file mode 100644 +index 000000000..cabe6e7de +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_f64.c +@@ -0,0 +1,55 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** get2_f64_z0_0: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_GET (get2_f64_z0_0, svfloat64x2_t, svfloat64_t, ++ z0 = svget2_f64 (z4, 0), ++ z0 = svget2 (z4, 0)) ++ ++/* ++** 
get2_f64_z0_1: ++** mov z0\.d, z5\.d ++** ret ++*/ ++TEST_GET (get2_f64_z0_1, svfloat64x2_t, svfloat64_t, ++ z0 = svget2_f64 (z4, 1), ++ z0 = svget2 (z4, 1)) ++ ++/* ++** get2_f64_z4_0: ++** ret ++*/ ++TEST_GET (get2_f64_z4_0, svfloat64x2_t, svfloat64_t, ++ z4_res = svget2_f64 (z4, 0), ++ z4_res = svget2 (z4, 0)) ++ ++/* ++** get2_f64_z4_1: ++** mov z4\.d, z5\.d ++** ret ++*/ ++TEST_GET (get2_f64_z4_1, svfloat64x2_t, svfloat64_t, ++ z4_res = svget2_f64 (z4, 1), ++ z4_res = svget2 (z4, 1)) ++ ++/* ++** get2_f64_z5_0: ++** mov z5\.d, z4\.d ++** ret ++*/ ++TEST_GET (get2_f64_z5_0, svfloat64x2_t, svfloat64_t, ++ z5_res = svget2_f64 (z4, 0), ++ z5_res = svget2 (z4, 0)) ++ ++/* ++** get2_f64_z5_1: ++** ret ++*/ ++TEST_GET (get2_f64_z5_1, svfloat64x2_t, svfloat64_t, ++ z5_res = svget2_f64 (z4, 1), ++ z5_res = svget2 (z4, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_s16.c +new file mode 100644 +index 000000000..387e6daad +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_s16.c +@@ -0,0 +1,55 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** get2_s16_z0_0: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_GET (get2_s16_z0_0, svint16x2_t, svint16_t, ++ z0 = svget2_s16 (z4, 0), ++ z0 = svget2 (z4, 0)) ++ ++/* ++** get2_s16_z0_1: ++** mov z0\.d, z5\.d ++** ret ++*/ ++TEST_GET (get2_s16_z0_1, svint16x2_t, svint16_t, ++ z0 = svget2_s16 (z4, 1), ++ z0 = svget2 (z4, 1)) ++ ++/* ++** get2_s16_z4_0: ++** ret ++*/ ++TEST_GET (get2_s16_z4_0, svint16x2_t, svint16_t, ++ z4_res = svget2_s16 (z4, 0), ++ z4_res = svget2 (z4, 0)) ++ ++/* ++** get2_s16_z4_1: ++** mov z4\.d, z5\.d ++** ret ++*/ ++TEST_GET (get2_s16_z4_1, svint16x2_t, svint16_t, ++ z4_res = svget2_s16 (z4, 1), ++ z4_res = svget2 (z4, 1)) ++ ++/* ++** get2_s16_z5_0: ++** mov z5\.d, z4\.d ++** ret ++*/ ++TEST_GET (get2_s16_z5_0, svint16x2_t, svint16_t, ++ z5_res = svget2_s16 (z4, 0), ++ z5_res = svget2 (z4, 0)) ++ ++/* ++** get2_s16_z5_1: ++** ret ++*/ ++TEST_GET (get2_s16_z5_1, svint16x2_t, svint16_t, ++ z5_res = svget2_s16 (z4, 1), ++ z5_res = svget2 (z4, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_s32.c +new file mode 100644 +index 000000000..5c47286e0 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_s32.c +@@ -0,0 +1,55 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** get2_s32_z0_0: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_GET (get2_s32_z0_0, svint32x2_t, svint32_t, ++ z0 = svget2_s32 (z4, 0), ++ z0 = svget2 (z4, 0)) ++ ++/* ++** get2_s32_z0_1: ++** mov z0\.d, z5\.d ++** ret ++*/ ++TEST_GET (get2_s32_z0_1, svint32x2_t, svint32_t, ++ z0 = svget2_s32 (z4, 1), ++ z0 = svget2 (z4, 1)) ++ ++/* ++** get2_s32_z4_0: ++** ret ++*/ ++TEST_GET (get2_s32_z4_0, svint32x2_t, svint32_t, ++ z4_res = svget2_s32 (z4, 0), ++ z4_res = svget2 (z4, 0)) ++ ++/* ++** get2_s32_z4_1: ++** mov z4\.d, z5\.d ++** ret ++*/ ++TEST_GET (get2_s32_z4_1, svint32x2_t, svint32_t, ++ z4_res = svget2_s32 (z4, 1), ++ z4_res = svget2 (z4, 1)) ++ ++/* ++** get2_s32_z5_0: ++** mov z5\.d, z4\.d ++** ret ++*/ ++TEST_GET (get2_s32_z5_0, svint32x2_t, svint32_t, ++ z5_res = svget2_s32 (z4, 0), ++ z5_res = svget2 (z4, 0)) ++ ++/* ++** get2_s32_z5_1: ++** ret ++*/ ++TEST_GET (get2_s32_z5_1, svint32x2_t, svint32_t, ++ z5_res = svget2_s32 (z4, 1), ++ z5_res = 
svget2 (z4, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_s64.c +new file mode 100644 +index 000000000..18f930d4c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_s64.c +@@ -0,0 +1,55 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** get2_s64_z0_0: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_GET (get2_s64_z0_0, svint64x2_t, svint64_t, ++ z0 = svget2_s64 (z4, 0), ++ z0 = svget2 (z4, 0)) ++ ++/* ++** get2_s64_z0_1: ++** mov z0\.d, z5\.d ++** ret ++*/ ++TEST_GET (get2_s64_z0_1, svint64x2_t, svint64_t, ++ z0 = svget2_s64 (z4, 1), ++ z0 = svget2 (z4, 1)) ++ ++/* ++** get2_s64_z4_0: ++** ret ++*/ ++TEST_GET (get2_s64_z4_0, svint64x2_t, svint64_t, ++ z4_res = svget2_s64 (z4, 0), ++ z4_res = svget2 (z4, 0)) ++ ++/* ++** get2_s64_z4_1: ++** mov z4\.d, z5\.d ++** ret ++*/ ++TEST_GET (get2_s64_z4_1, svint64x2_t, svint64_t, ++ z4_res = svget2_s64 (z4, 1), ++ z4_res = svget2 (z4, 1)) ++ ++/* ++** get2_s64_z5_0: ++** mov z5\.d, z4\.d ++** ret ++*/ ++TEST_GET (get2_s64_z5_0, svint64x2_t, svint64_t, ++ z5_res = svget2_s64 (z4, 0), ++ z5_res = svget2 (z4, 0)) ++ ++/* ++** get2_s64_z5_1: ++** ret ++*/ ++TEST_GET (get2_s64_z5_1, svint64x2_t, svint64_t, ++ z5_res = svget2_s64 (z4, 1), ++ z5_res = svget2 (z4, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_s8.c +new file mode 100644 +index 000000000..27e2cfafb +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_s8.c +@@ -0,0 +1,55 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** get2_s8_z0_0: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_GET (get2_s8_z0_0, svint8x2_t, svint8_t, ++ z0 = svget2_s8 (z4, 0), ++ z0 = svget2 (z4, 0)) ++ ++/* ++** get2_s8_z0_1: ++** mov z0\.d, z5\.d ++** ret ++*/ ++TEST_GET (get2_s8_z0_1, svint8x2_t, svint8_t, ++ z0 = svget2_s8 (z4, 1), ++ z0 = svget2 (z4, 1)) ++ ++/* ++** get2_s8_z4_0: ++** ret ++*/ ++TEST_GET (get2_s8_z4_0, svint8x2_t, svint8_t, ++ z4_res = svget2_s8 (z4, 0), ++ z4_res = svget2 (z4, 0)) ++ ++/* ++** get2_s8_z4_1: ++** mov z4\.d, z5\.d ++** ret ++*/ ++TEST_GET (get2_s8_z4_1, svint8x2_t, svint8_t, ++ z4_res = svget2_s8 (z4, 1), ++ z4_res = svget2 (z4, 1)) ++ ++/* ++** get2_s8_z5_0: ++** mov z5\.d, z4\.d ++** ret ++*/ ++TEST_GET (get2_s8_z5_0, svint8x2_t, svint8_t, ++ z5_res = svget2_s8 (z4, 0), ++ z5_res = svget2 (z4, 0)) ++ ++/* ++** get2_s8_z5_1: ++** ret ++*/ ++TEST_GET (get2_s8_z5_1, svint8x2_t, svint8_t, ++ z5_res = svget2_s8 (z4, 1), ++ z5_res = svget2 (z4, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_u16.c +new file mode 100644 +index 000000000..1804900cc +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_u16.c +@@ -0,0 +1,55 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** get2_u16_z0_0: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_GET (get2_u16_z0_0, svuint16x2_t, svuint16_t, ++ z0 = svget2_u16 (z4, 0), ++ z0 = svget2 (z4, 0)) ++ ++/* ++** get2_u16_z0_1: ++** mov z0\.d, z5\.d ++** ret ++*/ ++TEST_GET (get2_u16_z0_1, svuint16x2_t, svuint16_t, ++ z0 = svget2_u16 (z4, 1), ++ z0 = svget2 (z4, 1)) ++ ++/* ++** get2_u16_z4_0: ++** ret ++*/ ++TEST_GET (get2_u16_z4_0, svuint16x2_t, svuint16_t, ++ z4_res = svget2_u16 
(z4, 0), ++ z4_res = svget2 (z4, 0)) ++ ++/* ++** get2_u16_z4_1: ++** mov z4\.d, z5\.d ++** ret ++*/ ++TEST_GET (get2_u16_z4_1, svuint16x2_t, svuint16_t, ++ z4_res = svget2_u16 (z4, 1), ++ z4_res = svget2 (z4, 1)) ++ ++/* ++** get2_u16_z5_0: ++** mov z5\.d, z4\.d ++** ret ++*/ ++TEST_GET (get2_u16_z5_0, svuint16x2_t, svuint16_t, ++ z5_res = svget2_u16 (z4, 0), ++ z5_res = svget2 (z4, 0)) ++ ++/* ++** get2_u16_z5_1: ++** ret ++*/ ++TEST_GET (get2_u16_z5_1, svuint16x2_t, svuint16_t, ++ z5_res = svget2_u16 (z4, 1), ++ z5_res = svget2 (z4, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_u32.c +new file mode 100644 +index 000000000..5c14de6aa +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_u32.c +@@ -0,0 +1,55 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** get2_u32_z0_0: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_GET (get2_u32_z0_0, svuint32x2_t, svuint32_t, ++ z0 = svget2_u32 (z4, 0), ++ z0 = svget2 (z4, 0)) ++ ++/* ++** get2_u32_z0_1: ++** mov z0\.d, z5\.d ++** ret ++*/ ++TEST_GET (get2_u32_z0_1, svuint32x2_t, svuint32_t, ++ z0 = svget2_u32 (z4, 1), ++ z0 = svget2 (z4, 1)) ++ ++/* ++** get2_u32_z4_0: ++** ret ++*/ ++TEST_GET (get2_u32_z4_0, svuint32x2_t, svuint32_t, ++ z4_res = svget2_u32 (z4, 0), ++ z4_res = svget2 (z4, 0)) ++ ++/* ++** get2_u32_z4_1: ++** mov z4\.d, z5\.d ++** ret ++*/ ++TEST_GET (get2_u32_z4_1, svuint32x2_t, svuint32_t, ++ z4_res = svget2_u32 (z4, 1), ++ z4_res = svget2 (z4, 1)) ++ ++/* ++** get2_u32_z5_0: ++** mov z5\.d, z4\.d ++** ret ++*/ ++TEST_GET (get2_u32_z5_0, svuint32x2_t, svuint32_t, ++ z5_res = svget2_u32 (z4, 0), ++ z5_res = svget2 (z4, 0)) ++ ++/* ++** get2_u32_z5_1: ++** ret ++*/ ++TEST_GET (get2_u32_z5_1, svuint32x2_t, svuint32_t, ++ z5_res = svget2_u32 (z4, 1), ++ z5_res = svget2 (z4, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_u64.c +new file mode 100644 +index 000000000..fd389a01e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_u64.c +@@ -0,0 +1,55 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** get2_u64_z0_0: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_GET (get2_u64_z0_0, svuint64x2_t, svuint64_t, ++ z0 = svget2_u64 (z4, 0), ++ z0 = svget2 (z4, 0)) ++ ++/* ++** get2_u64_z0_1: ++** mov z0\.d, z5\.d ++** ret ++*/ ++TEST_GET (get2_u64_z0_1, svuint64x2_t, svuint64_t, ++ z0 = svget2_u64 (z4, 1), ++ z0 = svget2 (z4, 1)) ++ ++/* ++** get2_u64_z4_0: ++** ret ++*/ ++TEST_GET (get2_u64_z4_0, svuint64x2_t, svuint64_t, ++ z4_res = svget2_u64 (z4, 0), ++ z4_res = svget2 (z4, 0)) ++ ++/* ++** get2_u64_z4_1: ++** mov z4\.d, z5\.d ++** ret ++*/ ++TEST_GET (get2_u64_z4_1, svuint64x2_t, svuint64_t, ++ z4_res = svget2_u64 (z4, 1), ++ z4_res = svget2 (z4, 1)) ++ ++/* ++** get2_u64_z5_0: ++** mov z5\.d, z4\.d ++** ret ++*/ ++TEST_GET (get2_u64_z5_0, svuint64x2_t, svuint64_t, ++ z5_res = svget2_u64 (z4, 0), ++ z5_res = svget2 (z4, 0)) ++ ++/* ++** get2_u64_z5_1: ++** ret ++*/ ++TEST_GET (get2_u64_z5_1, svuint64x2_t, svuint64_t, ++ z5_res = svget2_u64 (z4, 1), ++ z5_res = svget2 (z4, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_u8.c +new file mode 100644 +index 000000000..42ffb0344 +--- /dev/null ++++ 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_u8.c +@@ -0,0 +1,55 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** get2_u8_z0_0: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_GET (get2_u8_z0_0, svuint8x2_t, svuint8_t, ++ z0 = svget2_u8 (z4, 0), ++ z0 = svget2 (z4, 0)) ++ ++/* ++** get2_u8_z0_1: ++** mov z0\.d, z5\.d ++** ret ++*/ ++TEST_GET (get2_u8_z0_1, svuint8x2_t, svuint8_t, ++ z0 = svget2_u8 (z4, 1), ++ z0 = svget2 (z4, 1)) ++ ++/* ++** get2_u8_z4_0: ++** ret ++*/ ++TEST_GET (get2_u8_z4_0, svuint8x2_t, svuint8_t, ++ z4_res = svget2_u8 (z4, 0), ++ z4_res = svget2 (z4, 0)) ++ ++/* ++** get2_u8_z4_1: ++** mov z4\.d, z5\.d ++** ret ++*/ ++TEST_GET (get2_u8_z4_1, svuint8x2_t, svuint8_t, ++ z4_res = svget2_u8 (z4, 1), ++ z4_res = svget2 (z4, 1)) ++ ++/* ++** get2_u8_z5_0: ++** mov z5\.d, z4\.d ++** ret ++*/ ++TEST_GET (get2_u8_z5_0, svuint8x2_t, svuint8_t, ++ z5_res = svget2_u8 (z4, 0), ++ z5_res = svget2 (z4, 0)) ++ ++/* ++** get2_u8_z5_1: ++** ret ++*/ ++TEST_GET (get2_u8_z5_1, svuint8x2_t, svuint8_t, ++ z5_res = svget2_u8 (z4, 1), ++ z5_res = svget2 (z4, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_bf16.c +new file mode 100644 +index 000000000..292f02a12 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_bf16.c +@@ -0,0 +1,108 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** get3_bf16_z0_0: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_GET (get3_bf16_z0_0, svbfloat16x3_t, svbfloat16_t, ++ z0 = svget3_bf16 (z4, 0), ++ z0 = svget3 (z4, 0)) ++ ++/* ++** get3_bf16_z0_1: ++** mov z0\.d, z5\.d ++** ret ++*/ ++TEST_GET (get3_bf16_z0_1, svbfloat16x3_t, svbfloat16_t, ++ z0 = svget3_bf16 (z4, 1), ++ z0 = svget3 (z4, 1)) ++ ++/* ++** get3_bf16_z0_2: ++** mov z0\.d, z6\.d ++** ret ++*/ ++TEST_GET (get3_bf16_z0_2, svbfloat16x3_t, svbfloat16_t, ++ z0 = svget3_bf16 (z4, 2), ++ z0 = svget3 (z4, 2)) ++ ++/* ++** get3_bf16_z4_0: ++** ret ++*/ ++TEST_GET (get3_bf16_z4_0, svbfloat16x3_t, svbfloat16_t, ++ z4_res = svget3_bf16 (z4, 0), ++ z4_res = svget3 (z4, 0)) ++ ++/* ++** get3_bf16_z4_1: ++** mov z4\.d, z5\.d ++** ret ++*/ ++TEST_GET (get3_bf16_z4_1, svbfloat16x3_t, svbfloat16_t, ++ z4_res = svget3_bf16 (z4, 1), ++ z4_res = svget3 (z4, 1)) ++ ++/* ++** get3_bf16_z4_2: ++** mov z4\.d, z6\.d ++** ret ++*/ ++TEST_GET (get3_bf16_z4_2, svbfloat16x3_t, svbfloat16_t, ++ z4_res = svget3_bf16 (z4, 2), ++ z4_res = svget3 (z4, 2)) ++ ++/* ++** get3_bf16_z5_0: ++** mov z5\.d, z4\.d ++** ret ++*/ ++TEST_GET (get3_bf16_z5_0, svbfloat16x3_t, svbfloat16_t, ++ z5_res = svget3_bf16 (z4, 0), ++ z5_res = svget3 (z4, 0)) ++ ++/* ++** get3_bf16_z5_1: ++** ret ++*/ ++TEST_GET (get3_bf16_z5_1, svbfloat16x3_t, svbfloat16_t, ++ z5_res = svget3_bf16 (z4, 1), ++ z5_res = svget3 (z4, 1)) ++ ++/* ++** get3_bf16_z5_2: ++** mov z5\.d, z6\.d ++** ret ++*/ ++TEST_GET (get3_bf16_z5_2, svbfloat16x3_t, svbfloat16_t, ++ z5_res = svget3_bf16 (z4, 2), ++ z5_res = svget3 (z4, 2)) ++ ++/* ++** get3_bf16_z6_0: ++** mov z6\.d, z4\.d ++** ret ++*/ ++TEST_GET (get3_bf16_z6_0, svbfloat16x3_t, svbfloat16_t, ++ z6_res = svget3_bf16 (z4, 0), ++ z6_res = svget3 (z4, 0)) ++ ++/* ++** get3_bf16_z6_1: ++** mov z6\.d, z5\.d ++** ret ++*/ ++TEST_GET (get3_bf16_z6_1, svbfloat16x3_t, svbfloat16_t, ++ z6_res = svget3_bf16 (z4, 1), ++ z6_res = svget3 (z4, 1)) ++ ++/* ++** get3_bf16_z6_2: ++** ret ++*/ ++TEST_GET 
(get3_bf16_z6_2, svbfloat16x3_t, svbfloat16_t, ++ z6_res = svget3_bf16 (z4, 2), ++ z6_res = svget3 (z4, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_f16.c +new file mode 100644 +index 000000000..8bea03bc5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_f16.c +@@ -0,0 +1,108 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** get3_f16_z0_0: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_GET (get3_f16_z0_0, svfloat16x3_t, svfloat16_t, ++ z0 = svget3_f16 (z4, 0), ++ z0 = svget3 (z4, 0)) ++ ++/* ++** get3_f16_z0_1: ++** mov z0\.d, z5\.d ++** ret ++*/ ++TEST_GET (get3_f16_z0_1, svfloat16x3_t, svfloat16_t, ++ z0 = svget3_f16 (z4, 1), ++ z0 = svget3 (z4, 1)) ++ ++/* ++** get3_f16_z0_2: ++** mov z0\.d, z6\.d ++** ret ++*/ ++TEST_GET (get3_f16_z0_2, svfloat16x3_t, svfloat16_t, ++ z0 = svget3_f16 (z4, 2), ++ z0 = svget3 (z4, 2)) ++ ++/* ++** get3_f16_z4_0: ++** ret ++*/ ++TEST_GET (get3_f16_z4_0, svfloat16x3_t, svfloat16_t, ++ z4_res = svget3_f16 (z4, 0), ++ z4_res = svget3 (z4, 0)) ++ ++/* ++** get3_f16_z4_1: ++** mov z4\.d, z5\.d ++** ret ++*/ ++TEST_GET (get3_f16_z4_1, svfloat16x3_t, svfloat16_t, ++ z4_res = svget3_f16 (z4, 1), ++ z4_res = svget3 (z4, 1)) ++ ++/* ++** get3_f16_z4_2: ++** mov z4\.d, z6\.d ++** ret ++*/ ++TEST_GET (get3_f16_z4_2, svfloat16x3_t, svfloat16_t, ++ z4_res = svget3_f16 (z4, 2), ++ z4_res = svget3 (z4, 2)) ++ ++/* ++** get3_f16_z5_0: ++** mov z5\.d, z4\.d ++** ret ++*/ ++TEST_GET (get3_f16_z5_0, svfloat16x3_t, svfloat16_t, ++ z5_res = svget3_f16 (z4, 0), ++ z5_res = svget3 (z4, 0)) ++ ++/* ++** get3_f16_z5_1: ++** ret ++*/ ++TEST_GET (get3_f16_z5_1, svfloat16x3_t, svfloat16_t, ++ z5_res = svget3_f16 (z4, 1), ++ z5_res = svget3 (z4, 1)) ++ ++/* ++** get3_f16_z5_2: ++** mov z5\.d, z6\.d ++** ret ++*/ ++TEST_GET (get3_f16_z5_2, svfloat16x3_t, svfloat16_t, ++ z5_res = svget3_f16 (z4, 2), ++ z5_res = svget3 (z4, 2)) ++ ++/* ++** get3_f16_z6_0: ++** mov z6\.d, z4\.d ++** ret ++*/ ++TEST_GET (get3_f16_z6_0, svfloat16x3_t, svfloat16_t, ++ z6_res = svget3_f16 (z4, 0), ++ z6_res = svget3 (z4, 0)) ++ ++/* ++** get3_f16_z6_1: ++** mov z6\.d, z5\.d ++** ret ++*/ ++TEST_GET (get3_f16_z6_1, svfloat16x3_t, svfloat16_t, ++ z6_res = svget3_f16 (z4, 1), ++ z6_res = svget3 (z4, 1)) ++ ++/* ++** get3_f16_z6_2: ++** ret ++*/ ++TEST_GET (get3_f16_z6_2, svfloat16x3_t, svfloat16_t, ++ z6_res = svget3_f16 (z4, 2), ++ z6_res = svget3 (z4, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_f32.c +new file mode 100644 +index 000000000..246679584 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_f32.c +@@ -0,0 +1,108 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** get3_f32_z0_0: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_GET (get3_f32_z0_0, svfloat32x3_t, svfloat32_t, ++ z0 = svget3_f32 (z4, 0), ++ z0 = svget3 (z4, 0)) ++ ++/* ++** get3_f32_z0_1: ++** mov z0\.d, z5\.d ++** ret ++*/ ++TEST_GET (get3_f32_z0_1, svfloat32x3_t, svfloat32_t, ++ z0 = svget3_f32 (z4, 1), ++ z0 = svget3 (z4, 1)) ++ ++/* ++** get3_f32_z0_2: ++** mov z0\.d, z6\.d ++** ret ++*/ ++TEST_GET (get3_f32_z0_2, svfloat32x3_t, svfloat32_t, ++ z0 = svget3_f32 (z4, 2), ++ z0 = svget3 (z4, 2)) ++ ++/* ++** get3_f32_z4_0: ++** ret ++*/ ++TEST_GET (get3_f32_z4_0, svfloat32x3_t, svfloat32_t, ++ z4_res = 
svget3_f32 (z4, 0), ++ z4_res = svget3 (z4, 0)) ++ ++/* ++** get3_f32_z4_1: ++** mov z4\.d, z5\.d ++** ret ++*/ ++TEST_GET (get3_f32_z4_1, svfloat32x3_t, svfloat32_t, ++ z4_res = svget3_f32 (z4, 1), ++ z4_res = svget3 (z4, 1)) ++ ++/* ++** get3_f32_z4_2: ++** mov z4\.d, z6\.d ++** ret ++*/ ++TEST_GET (get3_f32_z4_2, svfloat32x3_t, svfloat32_t, ++ z4_res = svget3_f32 (z4, 2), ++ z4_res = svget3 (z4, 2)) ++ ++/* ++** get3_f32_z5_0: ++** mov z5\.d, z4\.d ++** ret ++*/ ++TEST_GET (get3_f32_z5_0, svfloat32x3_t, svfloat32_t, ++ z5_res = svget3_f32 (z4, 0), ++ z5_res = svget3 (z4, 0)) ++ ++/* ++** get3_f32_z5_1: ++** ret ++*/ ++TEST_GET (get3_f32_z5_1, svfloat32x3_t, svfloat32_t, ++ z5_res = svget3_f32 (z4, 1), ++ z5_res = svget3 (z4, 1)) ++ ++/* ++** get3_f32_z5_2: ++** mov z5\.d, z6\.d ++** ret ++*/ ++TEST_GET (get3_f32_z5_2, svfloat32x3_t, svfloat32_t, ++ z5_res = svget3_f32 (z4, 2), ++ z5_res = svget3 (z4, 2)) ++ ++/* ++** get3_f32_z6_0: ++** mov z6\.d, z4\.d ++** ret ++*/ ++TEST_GET (get3_f32_z6_0, svfloat32x3_t, svfloat32_t, ++ z6_res = svget3_f32 (z4, 0), ++ z6_res = svget3 (z4, 0)) ++ ++/* ++** get3_f32_z6_1: ++** mov z6\.d, z5\.d ++** ret ++*/ ++TEST_GET (get3_f32_z6_1, svfloat32x3_t, svfloat32_t, ++ z6_res = svget3_f32 (z4, 1), ++ z6_res = svget3 (z4, 1)) ++ ++/* ++** get3_f32_z6_2: ++** ret ++*/ ++TEST_GET (get3_f32_z6_2, svfloat32x3_t, svfloat32_t, ++ z6_res = svget3_f32 (z4, 2), ++ z6_res = svget3 (z4, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_f64.c +new file mode 100644 +index 000000000..e44eb15fd +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_f64.c +@@ -0,0 +1,108 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** get3_f64_z0_0: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_GET (get3_f64_z0_0, svfloat64x3_t, svfloat64_t, ++ z0 = svget3_f64 (z4, 0), ++ z0 = svget3 (z4, 0)) ++ ++/* ++** get3_f64_z0_1: ++** mov z0\.d, z5\.d ++** ret ++*/ ++TEST_GET (get3_f64_z0_1, svfloat64x3_t, svfloat64_t, ++ z0 = svget3_f64 (z4, 1), ++ z0 = svget3 (z4, 1)) ++ ++/* ++** get3_f64_z0_2: ++** mov z0\.d, z6\.d ++** ret ++*/ ++TEST_GET (get3_f64_z0_2, svfloat64x3_t, svfloat64_t, ++ z0 = svget3_f64 (z4, 2), ++ z0 = svget3 (z4, 2)) ++ ++/* ++** get3_f64_z4_0: ++** ret ++*/ ++TEST_GET (get3_f64_z4_0, svfloat64x3_t, svfloat64_t, ++ z4_res = svget3_f64 (z4, 0), ++ z4_res = svget3 (z4, 0)) ++ ++/* ++** get3_f64_z4_1: ++** mov z4\.d, z5\.d ++** ret ++*/ ++TEST_GET (get3_f64_z4_1, svfloat64x3_t, svfloat64_t, ++ z4_res = svget3_f64 (z4, 1), ++ z4_res = svget3 (z4, 1)) ++ ++/* ++** get3_f64_z4_2: ++** mov z4\.d, z6\.d ++** ret ++*/ ++TEST_GET (get3_f64_z4_2, svfloat64x3_t, svfloat64_t, ++ z4_res = svget3_f64 (z4, 2), ++ z4_res = svget3 (z4, 2)) ++ ++/* ++** get3_f64_z5_0: ++** mov z5\.d, z4\.d ++** ret ++*/ ++TEST_GET (get3_f64_z5_0, svfloat64x3_t, svfloat64_t, ++ z5_res = svget3_f64 (z4, 0), ++ z5_res = svget3 (z4, 0)) ++ ++/* ++** get3_f64_z5_1: ++** ret ++*/ ++TEST_GET (get3_f64_z5_1, svfloat64x3_t, svfloat64_t, ++ z5_res = svget3_f64 (z4, 1), ++ z5_res = svget3 (z4, 1)) ++ ++/* ++** get3_f64_z5_2: ++** mov z5\.d, z6\.d ++** ret ++*/ ++TEST_GET (get3_f64_z5_2, svfloat64x3_t, svfloat64_t, ++ z5_res = svget3_f64 (z4, 2), ++ z5_res = svget3 (z4, 2)) ++ ++/* ++** get3_f64_z6_0: ++** mov z6\.d, z4\.d ++** ret ++*/ ++TEST_GET (get3_f64_z6_0, svfloat64x3_t, svfloat64_t, ++ z6_res = svget3_f64 (z4, 0), ++ z6_res = svget3 (z4, 0)) ++ ++/* ++** 
get3_f64_z6_1: ++** mov z6\.d, z5\.d ++** ret ++*/ ++TEST_GET (get3_f64_z6_1, svfloat64x3_t, svfloat64_t, ++ z6_res = svget3_f64 (z4, 1), ++ z6_res = svget3 (z4, 1)) ++ ++/* ++** get3_f64_z6_2: ++** ret ++*/ ++TEST_GET (get3_f64_z6_2, svfloat64x3_t, svfloat64_t, ++ z6_res = svget3_f64 (z4, 2), ++ z6_res = svget3 (z4, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_s16.c +new file mode 100644 +index 000000000..88f7e4986 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_s16.c +@@ -0,0 +1,108 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** get3_s16_z0_0: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_GET (get3_s16_z0_0, svint16x3_t, svint16_t, ++ z0 = svget3_s16 (z4, 0), ++ z0 = svget3 (z4, 0)) ++ ++/* ++** get3_s16_z0_1: ++** mov z0\.d, z5\.d ++** ret ++*/ ++TEST_GET (get3_s16_z0_1, svint16x3_t, svint16_t, ++ z0 = svget3_s16 (z4, 1), ++ z0 = svget3 (z4, 1)) ++ ++/* ++** get3_s16_z0_2: ++** mov z0\.d, z6\.d ++** ret ++*/ ++TEST_GET (get3_s16_z0_2, svint16x3_t, svint16_t, ++ z0 = svget3_s16 (z4, 2), ++ z0 = svget3 (z4, 2)) ++ ++/* ++** get3_s16_z4_0: ++** ret ++*/ ++TEST_GET (get3_s16_z4_0, svint16x3_t, svint16_t, ++ z4_res = svget3_s16 (z4, 0), ++ z4_res = svget3 (z4, 0)) ++ ++/* ++** get3_s16_z4_1: ++** mov z4\.d, z5\.d ++** ret ++*/ ++TEST_GET (get3_s16_z4_1, svint16x3_t, svint16_t, ++ z4_res = svget3_s16 (z4, 1), ++ z4_res = svget3 (z4, 1)) ++ ++/* ++** get3_s16_z4_2: ++** mov z4\.d, z6\.d ++** ret ++*/ ++TEST_GET (get3_s16_z4_2, svint16x3_t, svint16_t, ++ z4_res = svget3_s16 (z4, 2), ++ z4_res = svget3 (z4, 2)) ++ ++/* ++** get3_s16_z5_0: ++** mov z5\.d, z4\.d ++** ret ++*/ ++TEST_GET (get3_s16_z5_0, svint16x3_t, svint16_t, ++ z5_res = svget3_s16 (z4, 0), ++ z5_res = svget3 (z4, 0)) ++ ++/* ++** get3_s16_z5_1: ++** ret ++*/ ++TEST_GET (get3_s16_z5_1, svint16x3_t, svint16_t, ++ z5_res = svget3_s16 (z4, 1), ++ z5_res = svget3 (z4, 1)) ++ ++/* ++** get3_s16_z5_2: ++** mov z5\.d, z6\.d ++** ret ++*/ ++TEST_GET (get3_s16_z5_2, svint16x3_t, svint16_t, ++ z5_res = svget3_s16 (z4, 2), ++ z5_res = svget3 (z4, 2)) ++ ++/* ++** get3_s16_z6_0: ++** mov z6\.d, z4\.d ++** ret ++*/ ++TEST_GET (get3_s16_z6_0, svint16x3_t, svint16_t, ++ z6_res = svget3_s16 (z4, 0), ++ z6_res = svget3 (z4, 0)) ++ ++/* ++** get3_s16_z6_1: ++** mov z6\.d, z5\.d ++** ret ++*/ ++TEST_GET (get3_s16_z6_1, svint16x3_t, svint16_t, ++ z6_res = svget3_s16 (z4, 1), ++ z6_res = svget3 (z4, 1)) ++ ++/* ++** get3_s16_z6_2: ++** ret ++*/ ++TEST_GET (get3_s16_z6_2, svint16x3_t, svint16_t, ++ z6_res = svget3_s16 (z4, 2), ++ z6_res = svget3 (z4, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_s32.c +new file mode 100644 +index 000000000..f0f7785c8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_s32.c +@@ -0,0 +1,108 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** get3_s32_z0_0: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_GET (get3_s32_z0_0, svint32x3_t, svint32_t, ++ z0 = svget3_s32 (z4, 0), ++ z0 = svget3 (z4, 0)) ++ ++/* ++** get3_s32_z0_1: ++** mov z0\.d, z5\.d ++** ret ++*/ ++TEST_GET (get3_s32_z0_1, svint32x3_t, svint32_t, ++ z0 = svget3_s32 (z4, 1), ++ z0 = svget3 (z4, 1)) ++ ++/* ++** get3_s32_z0_2: ++** mov z0\.d, z6\.d ++** ret ++*/ ++TEST_GET (get3_s32_z0_2, svint32x3_t, svint32_t, ++ z0 = 
svget3_s32 (z4, 2), ++ z0 = svget3 (z4, 2)) ++ ++/* ++** get3_s32_z4_0: ++** ret ++*/ ++TEST_GET (get3_s32_z4_0, svint32x3_t, svint32_t, ++ z4_res = svget3_s32 (z4, 0), ++ z4_res = svget3 (z4, 0)) ++ ++/* ++** get3_s32_z4_1: ++** mov z4\.d, z5\.d ++** ret ++*/ ++TEST_GET (get3_s32_z4_1, svint32x3_t, svint32_t, ++ z4_res = svget3_s32 (z4, 1), ++ z4_res = svget3 (z4, 1)) ++ ++/* ++** get3_s32_z4_2: ++** mov z4\.d, z6\.d ++** ret ++*/ ++TEST_GET (get3_s32_z4_2, svint32x3_t, svint32_t, ++ z4_res = svget3_s32 (z4, 2), ++ z4_res = svget3 (z4, 2)) ++ ++/* ++** get3_s32_z5_0: ++** mov z5\.d, z4\.d ++** ret ++*/ ++TEST_GET (get3_s32_z5_0, svint32x3_t, svint32_t, ++ z5_res = svget3_s32 (z4, 0), ++ z5_res = svget3 (z4, 0)) ++ ++/* ++** get3_s32_z5_1: ++** ret ++*/ ++TEST_GET (get3_s32_z5_1, svint32x3_t, svint32_t, ++ z5_res = svget3_s32 (z4, 1), ++ z5_res = svget3 (z4, 1)) ++ ++/* ++** get3_s32_z5_2: ++** mov z5\.d, z6\.d ++** ret ++*/ ++TEST_GET (get3_s32_z5_2, svint32x3_t, svint32_t, ++ z5_res = svget3_s32 (z4, 2), ++ z5_res = svget3 (z4, 2)) ++ ++/* ++** get3_s32_z6_0: ++** mov z6\.d, z4\.d ++** ret ++*/ ++TEST_GET (get3_s32_z6_0, svint32x3_t, svint32_t, ++ z6_res = svget3_s32 (z4, 0), ++ z6_res = svget3 (z4, 0)) ++ ++/* ++** get3_s32_z6_1: ++** mov z6\.d, z5\.d ++** ret ++*/ ++TEST_GET (get3_s32_z6_1, svint32x3_t, svint32_t, ++ z6_res = svget3_s32 (z4, 1), ++ z6_res = svget3 (z4, 1)) ++ ++/* ++** get3_s32_z6_2: ++** ret ++*/ ++TEST_GET (get3_s32_z6_2, svint32x3_t, svint32_t, ++ z6_res = svget3_s32 (z4, 2), ++ z6_res = svget3 (z4, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_s64.c +new file mode 100644 +index 000000000..92500bfdf +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_s64.c +@@ -0,0 +1,108 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** get3_s64_z0_0: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_GET (get3_s64_z0_0, svint64x3_t, svint64_t, ++ z0 = svget3_s64 (z4, 0), ++ z0 = svget3 (z4, 0)) ++ ++/* ++** get3_s64_z0_1: ++** mov z0\.d, z5\.d ++** ret ++*/ ++TEST_GET (get3_s64_z0_1, svint64x3_t, svint64_t, ++ z0 = svget3_s64 (z4, 1), ++ z0 = svget3 (z4, 1)) ++ ++/* ++** get3_s64_z0_2: ++** mov z0\.d, z6\.d ++** ret ++*/ ++TEST_GET (get3_s64_z0_2, svint64x3_t, svint64_t, ++ z0 = svget3_s64 (z4, 2), ++ z0 = svget3 (z4, 2)) ++ ++/* ++** get3_s64_z4_0: ++** ret ++*/ ++TEST_GET (get3_s64_z4_0, svint64x3_t, svint64_t, ++ z4_res = svget3_s64 (z4, 0), ++ z4_res = svget3 (z4, 0)) ++ ++/* ++** get3_s64_z4_1: ++** mov z4\.d, z5\.d ++** ret ++*/ ++TEST_GET (get3_s64_z4_1, svint64x3_t, svint64_t, ++ z4_res = svget3_s64 (z4, 1), ++ z4_res = svget3 (z4, 1)) ++ ++/* ++** get3_s64_z4_2: ++** mov z4\.d, z6\.d ++** ret ++*/ ++TEST_GET (get3_s64_z4_2, svint64x3_t, svint64_t, ++ z4_res = svget3_s64 (z4, 2), ++ z4_res = svget3 (z4, 2)) ++ ++/* ++** get3_s64_z5_0: ++** mov z5\.d, z4\.d ++** ret ++*/ ++TEST_GET (get3_s64_z5_0, svint64x3_t, svint64_t, ++ z5_res = svget3_s64 (z4, 0), ++ z5_res = svget3 (z4, 0)) ++ ++/* ++** get3_s64_z5_1: ++** ret ++*/ ++TEST_GET (get3_s64_z5_1, svint64x3_t, svint64_t, ++ z5_res = svget3_s64 (z4, 1), ++ z5_res = svget3 (z4, 1)) ++ ++/* ++** get3_s64_z5_2: ++** mov z5\.d, z6\.d ++** ret ++*/ ++TEST_GET (get3_s64_z5_2, svint64x3_t, svint64_t, ++ z5_res = svget3_s64 (z4, 2), ++ z5_res = svget3 (z4, 2)) ++ ++/* ++** get3_s64_z6_0: ++** mov z6\.d, z4\.d ++** ret ++*/ ++TEST_GET (get3_s64_z6_0, svint64x3_t, svint64_t, 
++ z6_res = svget3_s64 (z4, 0), ++ z6_res = svget3 (z4, 0)) ++ ++/* ++** get3_s64_z6_1: ++** mov z6\.d, z5\.d ++** ret ++*/ ++TEST_GET (get3_s64_z6_1, svint64x3_t, svint64_t, ++ z6_res = svget3_s64 (z4, 1), ++ z6_res = svget3 (z4, 1)) ++ ++/* ++** get3_s64_z6_2: ++** ret ++*/ ++TEST_GET (get3_s64_z6_2, svint64x3_t, svint64_t, ++ z6_res = svget3_s64 (z4, 2), ++ z6_res = svget3 (z4, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_s8.c +new file mode 100644 +index 000000000..edf225ba5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_s8.c +@@ -0,0 +1,108 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** get3_s8_z0_0: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_GET (get3_s8_z0_0, svint8x3_t, svint8_t, ++ z0 = svget3_s8 (z4, 0), ++ z0 = svget3 (z4, 0)) ++ ++/* ++** get3_s8_z0_1: ++** mov z0\.d, z5\.d ++** ret ++*/ ++TEST_GET (get3_s8_z0_1, svint8x3_t, svint8_t, ++ z0 = svget3_s8 (z4, 1), ++ z0 = svget3 (z4, 1)) ++ ++/* ++** get3_s8_z0_2: ++** mov z0\.d, z6\.d ++** ret ++*/ ++TEST_GET (get3_s8_z0_2, svint8x3_t, svint8_t, ++ z0 = svget3_s8 (z4, 2), ++ z0 = svget3 (z4, 2)) ++ ++/* ++** get3_s8_z4_0: ++** ret ++*/ ++TEST_GET (get3_s8_z4_0, svint8x3_t, svint8_t, ++ z4_res = svget3_s8 (z4, 0), ++ z4_res = svget3 (z4, 0)) ++ ++/* ++** get3_s8_z4_1: ++** mov z4\.d, z5\.d ++** ret ++*/ ++TEST_GET (get3_s8_z4_1, svint8x3_t, svint8_t, ++ z4_res = svget3_s8 (z4, 1), ++ z4_res = svget3 (z4, 1)) ++ ++/* ++** get3_s8_z4_2: ++** mov z4\.d, z6\.d ++** ret ++*/ ++TEST_GET (get3_s8_z4_2, svint8x3_t, svint8_t, ++ z4_res = svget3_s8 (z4, 2), ++ z4_res = svget3 (z4, 2)) ++ ++/* ++** get3_s8_z5_0: ++** mov z5\.d, z4\.d ++** ret ++*/ ++TEST_GET (get3_s8_z5_0, svint8x3_t, svint8_t, ++ z5_res = svget3_s8 (z4, 0), ++ z5_res = svget3 (z4, 0)) ++ ++/* ++** get3_s8_z5_1: ++** ret ++*/ ++TEST_GET (get3_s8_z5_1, svint8x3_t, svint8_t, ++ z5_res = svget3_s8 (z4, 1), ++ z5_res = svget3 (z4, 1)) ++ ++/* ++** get3_s8_z5_2: ++** mov z5\.d, z6\.d ++** ret ++*/ ++TEST_GET (get3_s8_z5_2, svint8x3_t, svint8_t, ++ z5_res = svget3_s8 (z4, 2), ++ z5_res = svget3 (z4, 2)) ++ ++/* ++** get3_s8_z6_0: ++** mov z6\.d, z4\.d ++** ret ++*/ ++TEST_GET (get3_s8_z6_0, svint8x3_t, svint8_t, ++ z6_res = svget3_s8 (z4, 0), ++ z6_res = svget3 (z4, 0)) ++ ++/* ++** get3_s8_z6_1: ++** mov z6\.d, z5\.d ++** ret ++*/ ++TEST_GET (get3_s8_z6_1, svint8x3_t, svint8_t, ++ z6_res = svget3_s8 (z4, 1), ++ z6_res = svget3 (z4, 1)) ++ ++/* ++** get3_s8_z6_2: ++** ret ++*/ ++TEST_GET (get3_s8_z6_2, svint8x3_t, svint8_t, ++ z6_res = svget3_s8 (z4, 2), ++ z6_res = svget3 (z4, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_u16.c +new file mode 100644 +index 000000000..1fa7c63c0 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_u16.c +@@ -0,0 +1,108 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** get3_u16_z0_0: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_GET (get3_u16_z0_0, svuint16x3_t, svuint16_t, ++ z0 = svget3_u16 (z4, 0), ++ z0 = svget3 (z4, 0)) ++ ++/* ++** get3_u16_z0_1: ++** mov z0\.d, z5\.d ++** ret ++*/ ++TEST_GET (get3_u16_z0_1, svuint16x3_t, svuint16_t, ++ z0 = svget3_u16 (z4, 1), ++ z0 = svget3 (z4, 1)) ++ ++/* ++** get3_u16_z0_2: ++** mov z0\.d, z6\.d ++** ret ++*/ ++TEST_GET (get3_u16_z0_2, svuint16x3_t, svuint16_t, ++ z0 
= svget3_u16 (z4, 2), ++ z0 = svget3 (z4, 2)) ++ ++/* ++** get3_u16_z4_0: ++** ret ++*/ ++TEST_GET (get3_u16_z4_0, svuint16x3_t, svuint16_t, ++ z4_res = svget3_u16 (z4, 0), ++ z4_res = svget3 (z4, 0)) ++ ++/* ++** get3_u16_z4_1: ++** mov z4\.d, z5\.d ++** ret ++*/ ++TEST_GET (get3_u16_z4_1, svuint16x3_t, svuint16_t, ++ z4_res = svget3_u16 (z4, 1), ++ z4_res = svget3 (z4, 1)) ++ ++/* ++** get3_u16_z4_2: ++** mov z4\.d, z6\.d ++** ret ++*/ ++TEST_GET (get3_u16_z4_2, svuint16x3_t, svuint16_t, ++ z4_res = svget3_u16 (z4, 2), ++ z4_res = svget3 (z4, 2)) ++ ++/* ++** get3_u16_z5_0: ++** mov z5\.d, z4\.d ++** ret ++*/ ++TEST_GET (get3_u16_z5_0, svuint16x3_t, svuint16_t, ++ z5_res = svget3_u16 (z4, 0), ++ z5_res = svget3 (z4, 0)) ++ ++/* ++** get3_u16_z5_1: ++** ret ++*/ ++TEST_GET (get3_u16_z5_1, svuint16x3_t, svuint16_t, ++ z5_res = svget3_u16 (z4, 1), ++ z5_res = svget3 (z4, 1)) ++ ++/* ++** get3_u16_z5_2: ++** mov z5\.d, z6\.d ++** ret ++*/ ++TEST_GET (get3_u16_z5_2, svuint16x3_t, svuint16_t, ++ z5_res = svget3_u16 (z4, 2), ++ z5_res = svget3 (z4, 2)) ++ ++/* ++** get3_u16_z6_0: ++** mov z6\.d, z4\.d ++** ret ++*/ ++TEST_GET (get3_u16_z6_0, svuint16x3_t, svuint16_t, ++ z6_res = svget3_u16 (z4, 0), ++ z6_res = svget3 (z4, 0)) ++ ++/* ++** get3_u16_z6_1: ++** mov z6\.d, z5\.d ++** ret ++*/ ++TEST_GET (get3_u16_z6_1, svuint16x3_t, svuint16_t, ++ z6_res = svget3_u16 (z4, 1), ++ z6_res = svget3 (z4, 1)) ++ ++/* ++** get3_u16_z6_2: ++** ret ++*/ ++TEST_GET (get3_u16_z6_2, svuint16x3_t, svuint16_t, ++ z6_res = svget3_u16 (z4, 2), ++ z6_res = svget3 (z4, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_u32.c +new file mode 100644 +index 000000000..03b5f2616 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_u32.c +@@ -0,0 +1,108 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** get3_u32_z0_0: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_GET (get3_u32_z0_0, svuint32x3_t, svuint32_t, ++ z0 = svget3_u32 (z4, 0), ++ z0 = svget3 (z4, 0)) ++ ++/* ++** get3_u32_z0_1: ++** mov z0\.d, z5\.d ++** ret ++*/ ++TEST_GET (get3_u32_z0_1, svuint32x3_t, svuint32_t, ++ z0 = svget3_u32 (z4, 1), ++ z0 = svget3 (z4, 1)) ++ ++/* ++** get3_u32_z0_2: ++** mov z0\.d, z6\.d ++** ret ++*/ ++TEST_GET (get3_u32_z0_2, svuint32x3_t, svuint32_t, ++ z0 = svget3_u32 (z4, 2), ++ z0 = svget3 (z4, 2)) ++ ++/* ++** get3_u32_z4_0: ++** ret ++*/ ++TEST_GET (get3_u32_z4_0, svuint32x3_t, svuint32_t, ++ z4_res = svget3_u32 (z4, 0), ++ z4_res = svget3 (z4, 0)) ++ ++/* ++** get3_u32_z4_1: ++** mov z4\.d, z5\.d ++** ret ++*/ ++TEST_GET (get3_u32_z4_1, svuint32x3_t, svuint32_t, ++ z4_res = svget3_u32 (z4, 1), ++ z4_res = svget3 (z4, 1)) ++ ++/* ++** get3_u32_z4_2: ++** mov z4\.d, z6\.d ++** ret ++*/ ++TEST_GET (get3_u32_z4_2, svuint32x3_t, svuint32_t, ++ z4_res = svget3_u32 (z4, 2), ++ z4_res = svget3 (z4, 2)) ++ ++/* ++** get3_u32_z5_0: ++** mov z5\.d, z4\.d ++** ret ++*/ ++TEST_GET (get3_u32_z5_0, svuint32x3_t, svuint32_t, ++ z5_res = svget3_u32 (z4, 0), ++ z5_res = svget3 (z4, 0)) ++ ++/* ++** get3_u32_z5_1: ++** ret ++*/ ++TEST_GET (get3_u32_z5_1, svuint32x3_t, svuint32_t, ++ z5_res = svget3_u32 (z4, 1), ++ z5_res = svget3 (z4, 1)) ++ ++/* ++** get3_u32_z5_2: ++** mov z5\.d, z6\.d ++** ret ++*/ ++TEST_GET (get3_u32_z5_2, svuint32x3_t, svuint32_t, ++ z5_res = svget3_u32 (z4, 2), ++ z5_res = svget3 (z4, 2)) ++ ++/* ++** get3_u32_z6_0: ++** mov z6\.d, z4\.d ++** ret ++*/ ++TEST_GET 
(get3_u32_z6_0, svuint32x3_t, svuint32_t, ++ z6_res = svget3_u32 (z4, 0), ++ z6_res = svget3 (z4, 0)) ++ ++/* ++** get3_u32_z6_1: ++** mov z6\.d, z5\.d ++** ret ++*/ ++TEST_GET (get3_u32_z6_1, svuint32x3_t, svuint32_t, ++ z6_res = svget3_u32 (z4, 1), ++ z6_res = svget3 (z4, 1)) ++ ++/* ++** get3_u32_z6_2: ++** ret ++*/ ++TEST_GET (get3_u32_z6_2, svuint32x3_t, svuint32_t, ++ z6_res = svget3_u32 (z4, 2), ++ z6_res = svget3 (z4, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_u64.c +new file mode 100644 +index 000000000..ae4ef0024 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_u64.c +@@ -0,0 +1,108 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** get3_u64_z0_0: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_GET (get3_u64_z0_0, svuint64x3_t, svuint64_t, ++ z0 = svget3_u64 (z4, 0), ++ z0 = svget3 (z4, 0)) ++ ++/* ++** get3_u64_z0_1: ++** mov z0\.d, z5\.d ++** ret ++*/ ++TEST_GET (get3_u64_z0_1, svuint64x3_t, svuint64_t, ++ z0 = svget3_u64 (z4, 1), ++ z0 = svget3 (z4, 1)) ++ ++/* ++** get3_u64_z0_2: ++** mov z0\.d, z6\.d ++** ret ++*/ ++TEST_GET (get3_u64_z0_2, svuint64x3_t, svuint64_t, ++ z0 = svget3_u64 (z4, 2), ++ z0 = svget3 (z4, 2)) ++ ++/* ++** get3_u64_z4_0: ++** ret ++*/ ++TEST_GET (get3_u64_z4_0, svuint64x3_t, svuint64_t, ++ z4_res = svget3_u64 (z4, 0), ++ z4_res = svget3 (z4, 0)) ++ ++/* ++** get3_u64_z4_1: ++** mov z4\.d, z5\.d ++** ret ++*/ ++TEST_GET (get3_u64_z4_1, svuint64x3_t, svuint64_t, ++ z4_res = svget3_u64 (z4, 1), ++ z4_res = svget3 (z4, 1)) ++ ++/* ++** get3_u64_z4_2: ++** mov z4\.d, z6\.d ++** ret ++*/ ++TEST_GET (get3_u64_z4_2, svuint64x3_t, svuint64_t, ++ z4_res = svget3_u64 (z4, 2), ++ z4_res = svget3 (z4, 2)) ++ ++/* ++** get3_u64_z5_0: ++** mov z5\.d, z4\.d ++** ret ++*/ ++TEST_GET (get3_u64_z5_0, svuint64x3_t, svuint64_t, ++ z5_res = svget3_u64 (z4, 0), ++ z5_res = svget3 (z4, 0)) ++ ++/* ++** get3_u64_z5_1: ++** ret ++*/ ++TEST_GET (get3_u64_z5_1, svuint64x3_t, svuint64_t, ++ z5_res = svget3_u64 (z4, 1), ++ z5_res = svget3 (z4, 1)) ++ ++/* ++** get3_u64_z5_2: ++** mov z5\.d, z6\.d ++** ret ++*/ ++TEST_GET (get3_u64_z5_2, svuint64x3_t, svuint64_t, ++ z5_res = svget3_u64 (z4, 2), ++ z5_res = svget3 (z4, 2)) ++ ++/* ++** get3_u64_z6_0: ++** mov z6\.d, z4\.d ++** ret ++*/ ++TEST_GET (get3_u64_z6_0, svuint64x3_t, svuint64_t, ++ z6_res = svget3_u64 (z4, 0), ++ z6_res = svget3 (z4, 0)) ++ ++/* ++** get3_u64_z6_1: ++** mov z6\.d, z5\.d ++** ret ++*/ ++TEST_GET (get3_u64_z6_1, svuint64x3_t, svuint64_t, ++ z6_res = svget3_u64 (z4, 1), ++ z6_res = svget3 (z4, 1)) ++ ++/* ++** get3_u64_z6_2: ++** ret ++*/ ++TEST_GET (get3_u64_z6_2, svuint64x3_t, svuint64_t, ++ z6_res = svget3_u64 (z4, 2), ++ z6_res = svget3 (z4, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_u8.c +new file mode 100644 +index 000000000..497dcbbae +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_u8.c +@@ -0,0 +1,108 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** get3_u8_z0_0: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_GET (get3_u8_z0_0, svuint8x3_t, svuint8_t, ++ z0 = svget3_u8 (z4, 0), ++ z0 = svget3 (z4, 0)) ++ ++/* ++** get3_u8_z0_1: ++** mov z0\.d, z5\.d ++** ret ++*/ ++TEST_GET (get3_u8_z0_1, svuint8x3_t, svuint8_t, ++ z0 = svget3_u8 (z4, 1), ++ z0 = svget3 (z4, 1)) ++ 
++/* ++** get3_u8_z0_2: ++** mov z0\.d, z6\.d ++** ret ++*/ ++TEST_GET (get3_u8_z0_2, svuint8x3_t, svuint8_t, ++ z0 = svget3_u8 (z4, 2), ++ z0 = svget3 (z4, 2)) ++ ++/* ++** get3_u8_z4_0: ++** ret ++*/ ++TEST_GET (get3_u8_z4_0, svuint8x3_t, svuint8_t, ++ z4_res = svget3_u8 (z4, 0), ++ z4_res = svget3 (z4, 0)) ++ ++/* ++** get3_u8_z4_1: ++** mov z4\.d, z5\.d ++** ret ++*/ ++TEST_GET (get3_u8_z4_1, svuint8x3_t, svuint8_t, ++ z4_res = svget3_u8 (z4, 1), ++ z4_res = svget3 (z4, 1)) ++ ++/* ++** get3_u8_z4_2: ++** mov z4\.d, z6\.d ++** ret ++*/ ++TEST_GET (get3_u8_z4_2, svuint8x3_t, svuint8_t, ++ z4_res = svget3_u8 (z4, 2), ++ z4_res = svget3 (z4, 2)) ++ ++/* ++** get3_u8_z5_0: ++** mov z5\.d, z4\.d ++** ret ++*/ ++TEST_GET (get3_u8_z5_0, svuint8x3_t, svuint8_t, ++ z5_res = svget3_u8 (z4, 0), ++ z5_res = svget3 (z4, 0)) ++ ++/* ++** get3_u8_z5_1: ++** ret ++*/ ++TEST_GET (get3_u8_z5_1, svuint8x3_t, svuint8_t, ++ z5_res = svget3_u8 (z4, 1), ++ z5_res = svget3 (z4, 1)) ++ ++/* ++** get3_u8_z5_2: ++** mov z5\.d, z6\.d ++** ret ++*/ ++TEST_GET (get3_u8_z5_2, svuint8x3_t, svuint8_t, ++ z5_res = svget3_u8 (z4, 2), ++ z5_res = svget3 (z4, 2)) ++ ++/* ++** get3_u8_z6_0: ++** mov z6\.d, z4\.d ++** ret ++*/ ++TEST_GET (get3_u8_z6_0, svuint8x3_t, svuint8_t, ++ z6_res = svget3_u8 (z4, 0), ++ z6_res = svget3 (z4, 0)) ++ ++/* ++** get3_u8_z6_1: ++** mov z6\.d, z5\.d ++** ret ++*/ ++TEST_GET (get3_u8_z6_1, svuint8x3_t, svuint8_t, ++ z6_res = svget3_u8 (z4, 1), ++ z6_res = svget3 (z4, 1)) ++ ++/* ++** get3_u8_z6_2: ++** ret ++*/ ++TEST_GET (get3_u8_z6_2, svuint8x3_t, svuint8_t, ++ z6_res = svget3_u8 (z4, 2), ++ z6_res = svget3 (z4, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_bf16.c +new file mode 100644 +index 000000000..f751fc147 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_bf16.c +@@ -0,0 +1,179 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** get4_bf16_z0_0: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_GET (get4_bf16_z0_0, svbfloat16x4_t, svbfloat16_t, ++ z0 = svget4_bf16 (z4, 0), ++ z0 = svget4 (z4, 0)) ++ ++/* ++** get4_bf16_z0_1: ++** mov z0\.d, z5\.d ++** ret ++*/ ++TEST_GET (get4_bf16_z0_1, svbfloat16x4_t, svbfloat16_t, ++ z0 = svget4_bf16 (z4, 1), ++ z0 = svget4 (z4, 1)) ++ ++/* ++** get4_bf16_z0_2: ++** mov z0\.d, z6\.d ++** ret ++*/ ++TEST_GET (get4_bf16_z0_2, svbfloat16x4_t, svbfloat16_t, ++ z0 = svget4_bf16 (z4, 2), ++ z0 = svget4 (z4, 2)) ++ ++/* ++** get4_bf16_z0_3: ++** mov z0\.d, z7\.d ++** ret ++*/ ++TEST_GET (get4_bf16_z0_3, svbfloat16x4_t, svbfloat16_t, ++ z0 = svget4_bf16 (z4, 3), ++ z0 = svget4 (z4, 3)) ++ ++/* ++** get4_bf16_z4_0: ++** ret ++*/ ++TEST_GET (get4_bf16_z4_0, svbfloat16x4_t, svbfloat16_t, ++ z4_res = svget4_bf16 (z4, 0), ++ z4_res = svget4 (z4, 0)) ++ ++/* ++** get4_bf16_z4_1: ++** mov z4\.d, z5\.d ++** ret ++*/ ++TEST_GET (get4_bf16_z4_1, svbfloat16x4_t, svbfloat16_t, ++ z4_res = svget4_bf16 (z4, 1), ++ z4_res = svget4 (z4, 1)) ++ ++/* ++** get4_bf16_z4_2: ++** mov z4\.d, z6\.d ++** ret ++*/ ++TEST_GET (get4_bf16_z4_2, svbfloat16x4_t, svbfloat16_t, ++ z4_res = svget4_bf16 (z4, 2), ++ z4_res = svget4 (z4, 2)) ++ ++/* ++** get4_bf16_z4_3: ++** mov z4\.d, z7\.d ++** ret ++*/ ++TEST_GET (get4_bf16_z4_3, svbfloat16x4_t, svbfloat16_t, ++ z4_res = svget4_bf16 (z4, 3), ++ z4_res = svget4 (z4, 3)) ++ ++/* ++** get4_bf16_z5_0: ++** mov z5\.d, z4\.d ++** ret ++*/ ++TEST_GET (get4_bf16_z5_0, svbfloat16x4_t, 
svbfloat16_t, ++ z5_res = svget4_bf16 (z4, 0), ++ z5_res = svget4 (z4, 0)) ++ ++/* ++** get4_bf16_z5_1: ++** ret ++*/ ++TEST_GET (get4_bf16_z5_1, svbfloat16x4_t, svbfloat16_t, ++ z5_res = svget4_bf16 (z4, 1), ++ z5_res = svget4 (z4, 1)) ++ ++/* ++** get4_bf16_z5_2: ++** mov z5\.d, z6\.d ++** ret ++*/ ++TEST_GET (get4_bf16_z5_2, svbfloat16x4_t, svbfloat16_t, ++ z5_res = svget4_bf16 (z4, 2), ++ z5_res = svget4 (z4, 2)) ++ ++/* ++** get4_bf16_z5_3: ++** mov z5\.d, z7\.d ++** ret ++*/ ++TEST_GET (get4_bf16_z5_3, svbfloat16x4_t, svbfloat16_t, ++ z5_res = svget4_bf16 (z4, 3), ++ z5_res = svget4 (z4, 3)) ++ ++/* ++** get4_bf16_z6_0: ++** mov z6\.d, z4\.d ++** ret ++*/ ++TEST_GET (get4_bf16_z6_0, svbfloat16x4_t, svbfloat16_t, ++ z6_res = svget4_bf16 (z4, 0), ++ z6_res = svget4 (z4, 0)) ++ ++/* ++** get4_bf16_z6_1: ++** mov z6\.d, z5\.d ++** ret ++*/ ++TEST_GET (get4_bf16_z6_1, svbfloat16x4_t, svbfloat16_t, ++ z6_res = svget4_bf16 (z4, 1), ++ z6_res = svget4 (z4, 1)) ++ ++/* ++** get4_bf16_z6_2: ++** ret ++*/ ++TEST_GET (get4_bf16_z6_2, svbfloat16x4_t, svbfloat16_t, ++ z6_res = svget4_bf16 (z4, 2), ++ z6_res = svget4 (z4, 2)) ++ ++/* ++** get4_bf16_z6_3: ++** mov z6\.d, z7\.d ++** ret ++*/ ++TEST_GET (get4_bf16_z6_3, svbfloat16x4_t, svbfloat16_t, ++ z6_res = svget4_bf16 (z4, 3), ++ z6_res = svget4 (z4, 3)) ++ ++/* ++** get4_bf16_z7_0: ++** mov z7\.d, z4\.d ++** ret ++*/ ++TEST_GET (get4_bf16_z7_0, svbfloat16x4_t, svbfloat16_t, ++ z7_res = svget4_bf16 (z4, 0), ++ z7_res = svget4 (z4, 0)) ++ ++/* ++** get4_bf16_z7_1: ++** mov z7\.d, z5\.d ++** ret ++*/ ++TEST_GET (get4_bf16_z7_1, svbfloat16x4_t, svbfloat16_t, ++ z7_res = svget4_bf16 (z4, 1), ++ z7_res = svget4 (z4, 1)) ++ ++/* ++** get4_bf16_z7_2: ++** mov z7\.d, z6\.d ++** ret ++*/ ++TEST_GET (get4_bf16_z7_2, svbfloat16x4_t, svbfloat16_t, ++ z7_res = svget4_bf16 (z4, 2), ++ z7_res = svget4 (z4, 2)) ++ ++/* ++** get4_bf16_z7_3: ++** ret ++*/ ++TEST_GET (get4_bf16_z7_3, svbfloat16x4_t, svbfloat16_t, ++ z7_res = svget4_bf16 (z4, 3), ++ z7_res = svget4 (z4, 3)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_f16.c +new file mode 100644 +index 000000000..7871f6f4e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_f16.c +@@ -0,0 +1,179 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** get4_f16_z0_0: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_GET (get4_f16_z0_0, svfloat16x4_t, svfloat16_t, ++ z0 = svget4_f16 (z4, 0), ++ z0 = svget4 (z4, 0)) ++ ++/* ++** get4_f16_z0_1: ++** mov z0\.d, z5\.d ++** ret ++*/ ++TEST_GET (get4_f16_z0_1, svfloat16x4_t, svfloat16_t, ++ z0 = svget4_f16 (z4, 1), ++ z0 = svget4 (z4, 1)) ++ ++/* ++** get4_f16_z0_2: ++** mov z0\.d, z6\.d ++** ret ++*/ ++TEST_GET (get4_f16_z0_2, svfloat16x4_t, svfloat16_t, ++ z0 = svget4_f16 (z4, 2), ++ z0 = svget4 (z4, 2)) ++ ++/* ++** get4_f16_z0_3: ++** mov z0\.d, z7\.d ++** ret ++*/ ++TEST_GET (get4_f16_z0_3, svfloat16x4_t, svfloat16_t, ++ z0 = svget4_f16 (z4, 3), ++ z0 = svget4 (z4, 3)) ++ ++/* ++** get4_f16_z4_0: ++** ret ++*/ ++TEST_GET (get4_f16_z4_0, svfloat16x4_t, svfloat16_t, ++ z4_res = svget4_f16 (z4, 0), ++ z4_res = svget4 (z4, 0)) ++ ++/* ++** get4_f16_z4_1: ++** mov z4\.d, z5\.d ++** ret ++*/ ++TEST_GET (get4_f16_z4_1, svfloat16x4_t, svfloat16_t, ++ z4_res = svget4_f16 (z4, 1), ++ z4_res = svget4 (z4, 1)) ++ ++/* ++** get4_f16_z4_2: ++** mov z4\.d, z6\.d ++** ret ++*/ ++TEST_GET (get4_f16_z4_2, svfloat16x4_t, svfloat16_t, ++ 
z4_res = svget4_f16 (z4, 2), ++ z4_res = svget4 (z4, 2)) ++ ++/* ++** get4_f16_z4_3: ++** mov z4\.d, z7\.d ++** ret ++*/ ++TEST_GET (get4_f16_z4_3, svfloat16x4_t, svfloat16_t, ++ z4_res = svget4_f16 (z4, 3), ++ z4_res = svget4 (z4, 3)) ++ ++/* ++** get4_f16_z5_0: ++** mov z5\.d, z4\.d ++** ret ++*/ ++TEST_GET (get4_f16_z5_0, svfloat16x4_t, svfloat16_t, ++ z5_res = svget4_f16 (z4, 0), ++ z5_res = svget4 (z4, 0)) ++ ++/* ++** get4_f16_z5_1: ++** ret ++*/ ++TEST_GET (get4_f16_z5_1, svfloat16x4_t, svfloat16_t, ++ z5_res = svget4_f16 (z4, 1), ++ z5_res = svget4 (z4, 1)) ++ ++/* ++** get4_f16_z5_2: ++** mov z5\.d, z6\.d ++** ret ++*/ ++TEST_GET (get4_f16_z5_2, svfloat16x4_t, svfloat16_t, ++ z5_res = svget4_f16 (z4, 2), ++ z5_res = svget4 (z4, 2)) ++ ++/* ++** get4_f16_z5_3: ++** mov z5\.d, z7\.d ++** ret ++*/ ++TEST_GET (get4_f16_z5_3, svfloat16x4_t, svfloat16_t, ++ z5_res = svget4_f16 (z4, 3), ++ z5_res = svget4 (z4, 3)) ++ ++/* ++** get4_f16_z6_0: ++** mov z6\.d, z4\.d ++** ret ++*/ ++TEST_GET (get4_f16_z6_0, svfloat16x4_t, svfloat16_t, ++ z6_res = svget4_f16 (z4, 0), ++ z6_res = svget4 (z4, 0)) ++ ++/* ++** get4_f16_z6_1: ++** mov z6\.d, z5\.d ++** ret ++*/ ++TEST_GET (get4_f16_z6_1, svfloat16x4_t, svfloat16_t, ++ z6_res = svget4_f16 (z4, 1), ++ z6_res = svget4 (z4, 1)) ++ ++/* ++** get4_f16_z6_2: ++** ret ++*/ ++TEST_GET (get4_f16_z6_2, svfloat16x4_t, svfloat16_t, ++ z6_res = svget4_f16 (z4, 2), ++ z6_res = svget4 (z4, 2)) ++ ++/* ++** get4_f16_z6_3: ++** mov z6\.d, z7\.d ++** ret ++*/ ++TEST_GET (get4_f16_z6_3, svfloat16x4_t, svfloat16_t, ++ z6_res = svget4_f16 (z4, 3), ++ z6_res = svget4 (z4, 3)) ++ ++/* ++** get4_f16_z7_0: ++** mov z7\.d, z4\.d ++** ret ++*/ ++TEST_GET (get4_f16_z7_0, svfloat16x4_t, svfloat16_t, ++ z7_res = svget4_f16 (z4, 0), ++ z7_res = svget4 (z4, 0)) ++ ++/* ++** get4_f16_z7_1: ++** mov z7\.d, z5\.d ++** ret ++*/ ++TEST_GET (get4_f16_z7_1, svfloat16x4_t, svfloat16_t, ++ z7_res = svget4_f16 (z4, 1), ++ z7_res = svget4 (z4, 1)) ++ ++/* ++** get4_f16_z7_2: ++** mov z7\.d, z6\.d ++** ret ++*/ ++TEST_GET (get4_f16_z7_2, svfloat16x4_t, svfloat16_t, ++ z7_res = svget4_f16 (z4, 2), ++ z7_res = svget4 (z4, 2)) ++ ++/* ++** get4_f16_z7_3: ++** ret ++*/ ++TEST_GET (get4_f16_z7_3, svfloat16x4_t, svfloat16_t, ++ z7_res = svget4_f16 (z4, 3), ++ z7_res = svget4 (z4, 3)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_f32.c +new file mode 100644 +index 000000000..a290e026d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_f32.c +@@ -0,0 +1,179 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** get4_f32_z0_0: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_GET (get4_f32_z0_0, svfloat32x4_t, svfloat32_t, ++ z0 = svget4_f32 (z4, 0), ++ z0 = svget4 (z4, 0)) ++ ++/* ++** get4_f32_z0_1: ++** mov z0\.d, z5\.d ++** ret ++*/ ++TEST_GET (get4_f32_z0_1, svfloat32x4_t, svfloat32_t, ++ z0 = svget4_f32 (z4, 1), ++ z0 = svget4 (z4, 1)) ++ ++/* ++** get4_f32_z0_2: ++** mov z0\.d, z6\.d ++** ret ++*/ ++TEST_GET (get4_f32_z0_2, svfloat32x4_t, svfloat32_t, ++ z0 = svget4_f32 (z4, 2), ++ z0 = svget4 (z4, 2)) ++ ++/* ++** get4_f32_z0_3: ++** mov z0\.d, z7\.d ++** ret ++*/ ++TEST_GET (get4_f32_z0_3, svfloat32x4_t, svfloat32_t, ++ z0 = svget4_f32 (z4, 3), ++ z0 = svget4 (z4, 3)) ++ ++/* ++** get4_f32_z4_0: ++** ret ++*/ ++TEST_GET (get4_f32_z4_0, svfloat32x4_t, svfloat32_t, ++ z4_res = svget4_f32 (z4, 0), ++ z4_res = svget4 (z4, 0)) ++ ++/* ++** 
get4_f32_z4_1: ++** mov z4\.d, z5\.d ++** ret ++*/ ++TEST_GET (get4_f32_z4_1, svfloat32x4_t, svfloat32_t, ++ z4_res = svget4_f32 (z4, 1), ++ z4_res = svget4 (z4, 1)) ++ ++/* ++** get4_f32_z4_2: ++** mov z4\.d, z6\.d ++** ret ++*/ ++TEST_GET (get4_f32_z4_2, svfloat32x4_t, svfloat32_t, ++ z4_res = svget4_f32 (z4, 2), ++ z4_res = svget4 (z4, 2)) ++ ++/* ++** get4_f32_z4_3: ++** mov z4\.d, z7\.d ++** ret ++*/ ++TEST_GET (get4_f32_z4_3, svfloat32x4_t, svfloat32_t, ++ z4_res = svget4_f32 (z4, 3), ++ z4_res = svget4 (z4, 3)) ++ ++/* ++** get4_f32_z5_0: ++** mov z5\.d, z4\.d ++** ret ++*/ ++TEST_GET (get4_f32_z5_0, svfloat32x4_t, svfloat32_t, ++ z5_res = svget4_f32 (z4, 0), ++ z5_res = svget4 (z4, 0)) ++ ++/* ++** get4_f32_z5_1: ++** ret ++*/ ++TEST_GET (get4_f32_z5_1, svfloat32x4_t, svfloat32_t, ++ z5_res = svget4_f32 (z4, 1), ++ z5_res = svget4 (z4, 1)) ++ ++/* ++** get4_f32_z5_2: ++** mov z5\.d, z6\.d ++** ret ++*/ ++TEST_GET (get4_f32_z5_2, svfloat32x4_t, svfloat32_t, ++ z5_res = svget4_f32 (z4, 2), ++ z5_res = svget4 (z4, 2)) ++ ++/* ++** get4_f32_z5_3: ++** mov z5\.d, z7\.d ++** ret ++*/ ++TEST_GET (get4_f32_z5_3, svfloat32x4_t, svfloat32_t, ++ z5_res = svget4_f32 (z4, 3), ++ z5_res = svget4 (z4, 3)) ++ ++/* ++** get4_f32_z6_0: ++** mov z6\.d, z4\.d ++** ret ++*/ ++TEST_GET (get4_f32_z6_0, svfloat32x4_t, svfloat32_t, ++ z6_res = svget4_f32 (z4, 0), ++ z6_res = svget4 (z4, 0)) ++ ++/* ++** get4_f32_z6_1: ++** mov z6\.d, z5\.d ++** ret ++*/ ++TEST_GET (get4_f32_z6_1, svfloat32x4_t, svfloat32_t, ++ z6_res = svget4_f32 (z4, 1), ++ z6_res = svget4 (z4, 1)) ++ ++/* ++** get4_f32_z6_2: ++** ret ++*/ ++TEST_GET (get4_f32_z6_2, svfloat32x4_t, svfloat32_t, ++ z6_res = svget4_f32 (z4, 2), ++ z6_res = svget4 (z4, 2)) ++ ++/* ++** get4_f32_z6_3: ++** mov z6\.d, z7\.d ++** ret ++*/ ++TEST_GET (get4_f32_z6_3, svfloat32x4_t, svfloat32_t, ++ z6_res = svget4_f32 (z4, 3), ++ z6_res = svget4 (z4, 3)) ++ ++/* ++** get4_f32_z7_0: ++** mov z7\.d, z4\.d ++** ret ++*/ ++TEST_GET (get4_f32_z7_0, svfloat32x4_t, svfloat32_t, ++ z7_res = svget4_f32 (z4, 0), ++ z7_res = svget4 (z4, 0)) ++ ++/* ++** get4_f32_z7_1: ++** mov z7\.d, z5\.d ++** ret ++*/ ++TEST_GET (get4_f32_z7_1, svfloat32x4_t, svfloat32_t, ++ z7_res = svget4_f32 (z4, 1), ++ z7_res = svget4 (z4, 1)) ++ ++/* ++** get4_f32_z7_2: ++** mov z7\.d, z6\.d ++** ret ++*/ ++TEST_GET (get4_f32_z7_2, svfloat32x4_t, svfloat32_t, ++ z7_res = svget4_f32 (z4, 2), ++ z7_res = svget4 (z4, 2)) ++ ++/* ++** get4_f32_z7_3: ++** ret ++*/ ++TEST_GET (get4_f32_z7_3, svfloat32x4_t, svfloat32_t, ++ z7_res = svget4_f32 (z4, 3), ++ z7_res = svget4 (z4, 3)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_f64.c +new file mode 100644 +index 000000000..2c34dfef1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_f64.c +@@ -0,0 +1,179 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** get4_f64_z0_0: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_GET (get4_f64_z0_0, svfloat64x4_t, svfloat64_t, ++ z0 = svget4_f64 (z4, 0), ++ z0 = svget4 (z4, 0)) ++ ++/* ++** get4_f64_z0_1: ++** mov z0\.d, z5\.d ++** ret ++*/ ++TEST_GET (get4_f64_z0_1, svfloat64x4_t, svfloat64_t, ++ z0 = svget4_f64 (z4, 1), ++ z0 = svget4 (z4, 1)) ++ ++/* ++** get4_f64_z0_2: ++** mov z0\.d, z6\.d ++** ret ++*/ ++TEST_GET (get4_f64_z0_2, svfloat64x4_t, svfloat64_t, ++ z0 = svget4_f64 (z4, 2), ++ z0 = svget4 (z4, 2)) ++ ++/* ++** get4_f64_z0_3: ++** mov z0\.d, z7\.d ++** 
ret ++*/ ++TEST_GET (get4_f64_z0_3, svfloat64x4_t, svfloat64_t, ++ z0 = svget4_f64 (z4, 3), ++ z0 = svget4 (z4, 3)) ++ ++/* ++** get4_f64_z4_0: ++** ret ++*/ ++TEST_GET (get4_f64_z4_0, svfloat64x4_t, svfloat64_t, ++ z4_res = svget4_f64 (z4, 0), ++ z4_res = svget4 (z4, 0)) ++ ++/* ++** get4_f64_z4_1: ++** mov z4\.d, z5\.d ++** ret ++*/ ++TEST_GET (get4_f64_z4_1, svfloat64x4_t, svfloat64_t, ++ z4_res = svget4_f64 (z4, 1), ++ z4_res = svget4 (z4, 1)) ++ ++/* ++** get4_f64_z4_2: ++** mov z4\.d, z6\.d ++** ret ++*/ ++TEST_GET (get4_f64_z4_2, svfloat64x4_t, svfloat64_t, ++ z4_res = svget4_f64 (z4, 2), ++ z4_res = svget4 (z4, 2)) ++ ++/* ++** get4_f64_z4_3: ++** mov z4\.d, z7\.d ++** ret ++*/ ++TEST_GET (get4_f64_z4_3, svfloat64x4_t, svfloat64_t, ++ z4_res = svget4_f64 (z4, 3), ++ z4_res = svget4 (z4, 3)) ++ ++/* ++** get4_f64_z5_0: ++** mov z5\.d, z4\.d ++** ret ++*/ ++TEST_GET (get4_f64_z5_0, svfloat64x4_t, svfloat64_t, ++ z5_res = svget4_f64 (z4, 0), ++ z5_res = svget4 (z4, 0)) ++ ++/* ++** get4_f64_z5_1: ++** ret ++*/ ++TEST_GET (get4_f64_z5_1, svfloat64x4_t, svfloat64_t, ++ z5_res = svget4_f64 (z4, 1), ++ z5_res = svget4 (z4, 1)) ++ ++/* ++** get4_f64_z5_2: ++** mov z5\.d, z6\.d ++** ret ++*/ ++TEST_GET (get4_f64_z5_2, svfloat64x4_t, svfloat64_t, ++ z5_res = svget4_f64 (z4, 2), ++ z5_res = svget4 (z4, 2)) ++ ++/* ++** get4_f64_z5_3: ++** mov z5\.d, z7\.d ++** ret ++*/ ++TEST_GET (get4_f64_z5_3, svfloat64x4_t, svfloat64_t, ++ z5_res = svget4_f64 (z4, 3), ++ z5_res = svget4 (z4, 3)) ++ ++/* ++** get4_f64_z6_0: ++** mov z6\.d, z4\.d ++** ret ++*/ ++TEST_GET (get4_f64_z6_0, svfloat64x4_t, svfloat64_t, ++ z6_res = svget4_f64 (z4, 0), ++ z6_res = svget4 (z4, 0)) ++ ++/* ++** get4_f64_z6_1: ++** mov z6\.d, z5\.d ++** ret ++*/ ++TEST_GET (get4_f64_z6_1, svfloat64x4_t, svfloat64_t, ++ z6_res = svget4_f64 (z4, 1), ++ z6_res = svget4 (z4, 1)) ++ ++/* ++** get4_f64_z6_2: ++** ret ++*/ ++TEST_GET (get4_f64_z6_2, svfloat64x4_t, svfloat64_t, ++ z6_res = svget4_f64 (z4, 2), ++ z6_res = svget4 (z4, 2)) ++ ++/* ++** get4_f64_z6_3: ++** mov z6\.d, z7\.d ++** ret ++*/ ++TEST_GET (get4_f64_z6_3, svfloat64x4_t, svfloat64_t, ++ z6_res = svget4_f64 (z4, 3), ++ z6_res = svget4 (z4, 3)) ++ ++/* ++** get4_f64_z7_0: ++** mov z7\.d, z4\.d ++** ret ++*/ ++TEST_GET (get4_f64_z7_0, svfloat64x4_t, svfloat64_t, ++ z7_res = svget4_f64 (z4, 0), ++ z7_res = svget4 (z4, 0)) ++ ++/* ++** get4_f64_z7_1: ++** mov z7\.d, z5\.d ++** ret ++*/ ++TEST_GET (get4_f64_z7_1, svfloat64x4_t, svfloat64_t, ++ z7_res = svget4_f64 (z4, 1), ++ z7_res = svget4 (z4, 1)) ++ ++/* ++** get4_f64_z7_2: ++** mov z7\.d, z6\.d ++** ret ++*/ ++TEST_GET (get4_f64_z7_2, svfloat64x4_t, svfloat64_t, ++ z7_res = svget4_f64 (z4, 2), ++ z7_res = svget4 (z4, 2)) ++ ++/* ++** get4_f64_z7_3: ++** ret ++*/ ++TEST_GET (get4_f64_z7_3, svfloat64x4_t, svfloat64_t, ++ z7_res = svget4_f64 (z4, 3), ++ z7_res = svget4 (z4, 3)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_s16.c +new file mode 100644 +index 000000000..6a2280fea +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_s16.c +@@ -0,0 +1,179 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** get4_s16_z0_0: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_GET (get4_s16_z0_0, svint16x4_t, svint16_t, ++ z0 = svget4_s16 (z4, 0), ++ z0 = svget4 (z4, 0)) ++ ++/* ++** get4_s16_z0_1: ++** mov z0\.d, z5\.d ++** ret ++*/ ++TEST_GET (get4_s16_z0_1, svint16x4_t, svint16_t, 
++ z0 = svget4_s16 (z4, 1), ++ z0 = svget4 (z4, 1)) ++ ++/* ++** get4_s16_z0_2: ++** mov z0\.d, z6\.d ++** ret ++*/ ++TEST_GET (get4_s16_z0_2, svint16x4_t, svint16_t, ++ z0 = svget4_s16 (z4, 2), ++ z0 = svget4 (z4, 2)) ++ ++/* ++** get4_s16_z0_3: ++** mov z0\.d, z7\.d ++** ret ++*/ ++TEST_GET (get4_s16_z0_3, svint16x4_t, svint16_t, ++ z0 = svget4_s16 (z4, 3), ++ z0 = svget4 (z4, 3)) ++ ++/* ++** get4_s16_z4_0: ++** ret ++*/ ++TEST_GET (get4_s16_z4_0, svint16x4_t, svint16_t, ++ z4_res = svget4_s16 (z4, 0), ++ z4_res = svget4 (z4, 0)) ++ ++/* ++** get4_s16_z4_1: ++** mov z4\.d, z5\.d ++** ret ++*/ ++TEST_GET (get4_s16_z4_1, svint16x4_t, svint16_t, ++ z4_res = svget4_s16 (z4, 1), ++ z4_res = svget4 (z4, 1)) ++ ++/* ++** get4_s16_z4_2: ++** mov z4\.d, z6\.d ++** ret ++*/ ++TEST_GET (get4_s16_z4_2, svint16x4_t, svint16_t, ++ z4_res = svget4_s16 (z4, 2), ++ z4_res = svget4 (z4, 2)) ++ ++/* ++** get4_s16_z4_3: ++** mov z4\.d, z7\.d ++** ret ++*/ ++TEST_GET (get4_s16_z4_3, svint16x4_t, svint16_t, ++ z4_res = svget4_s16 (z4, 3), ++ z4_res = svget4 (z4, 3)) ++ ++/* ++** get4_s16_z5_0: ++** mov z5\.d, z4\.d ++** ret ++*/ ++TEST_GET (get4_s16_z5_0, svint16x4_t, svint16_t, ++ z5_res = svget4_s16 (z4, 0), ++ z5_res = svget4 (z4, 0)) ++ ++/* ++** get4_s16_z5_1: ++** ret ++*/ ++TEST_GET (get4_s16_z5_1, svint16x4_t, svint16_t, ++ z5_res = svget4_s16 (z4, 1), ++ z5_res = svget4 (z4, 1)) ++ ++/* ++** get4_s16_z5_2: ++** mov z5\.d, z6\.d ++** ret ++*/ ++TEST_GET (get4_s16_z5_2, svint16x4_t, svint16_t, ++ z5_res = svget4_s16 (z4, 2), ++ z5_res = svget4 (z4, 2)) ++ ++/* ++** get4_s16_z5_3: ++** mov z5\.d, z7\.d ++** ret ++*/ ++TEST_GET (get4_s16_z5_3, svint16x4_t, svint16_t, ++ z5_res = svget4_s16 (z4, 3), ++ z5_res = svget4 (z4, 3)) ++ ++/* ++** get4_s16_z6_0: ++** mov z6\.d, z4\.d ++** ret ++*/ ++TEST_GET (get4_s16_z6_0, svint16x4_t, svint16_t, ++ z6_res = svget4_s16 (z4, 0), ++ z6_res = svget4 (z4, 0)) ++ ++/* ++** get4_s16_z6_1: ++** mov z6\.d, z5\.d ++** ret ++*/ ++TEST_GET (get4_s16_z6_1, svint16x4_t, svint16_t, ++ z6_res = svget4_s16 (z4, 1), ++ z6_res = svget4 (z4, 1)) ++ ++/* ++** get4_s16_z6_2: ++** ret ++*/ ++TEST_GET (get4_s16_z6_2, svint16x4_t, svint16_t, ++ z6_res = svget4_s16 (z4, 2), ++ z6_res = svget4 (z4, 2)) ++ ++/* ++** get4_s16_z6_3: ++** mov z6\.d, z7\.d ++** ret ++*/ ++TEST_GET (get4_s16_z6_3, svint16x4_t, svint16_t, ++ z6_res = svget4_s16 (z4, 3), ++ z6_res = svget4 (z4, 3)) ++ ++/* ++** get4_s16_z7_0: ++** mov z7\.d, z4\.d ++** ret ++*/ ++TEST_GET (get4_s16_z7_0, svint16x4_t, svint16_t, ++ z7_res = svget4_s16 (z4, 0), ++ z7_res = svget4 (z4, 0)) ++ ++/* ++** get4_s16_z7_1: ++** mov z7\.d, z5\.d ++** ret ++*/ ++TEST_GET (get4_s16_z7_1, svint16x4_t, svint16_t, ++ z7_res = svget4_s16 (z4, 1), ++ z7_res = svget4 (z4, 1)) ++ ++/* ++** get4_s16_z7_2: ++** mov z7\.d, z6\.d ++** ret ++*/ ++TEST_GET (get4_s16_z7_2, svint16x4_t, svint16_t, ++ z7_res = svget4_s16 (z4, 2), ++ z7_res = svget4 (z4, 2)) ++ ++/* ++** get4_s16_z7_3: ++** ret ++*/ ++TEST_GET (get4_s16_z7_3, svint16x4_t, svint16_t, ++ z7_res = svget4_s16 (z4, 3), ++ z7_res = svget4 (z4, 3)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_s32.c +new file mode 100644 +index 000000000..41aca09d9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_s32.c +@@ -0,0 +1,179 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** get4_s32_z0_0: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_GET 
(get4_s32_z0_0, svint32x4_t, svint32_t, ++ z0 = svget4_s32 (z4, 0), ++ z0 = svget4 (z4, 0)) ++ ++/* ++** get4_s32_z0_1: ++** mov z0\.d, z5\.d ++** ret ++*/ ++TEST_GET (get4_s32_z0_1, svint32x4_t, svint32_t, ++ z0 = svget4_s32 (z4, 1), ++ z0 = svget4 (z4, 1)) ++ ++/* ++** get4_s32_z0_2: ++** mov z0\.d, z6\.d ++** ret ++*/ ++TEST_GET (get4_s32_z0_2, svint32x4_t, svint32_t, ++ z0 = svget4_s32 (z4, 2), ++ z0 = svget4 (z4, 2)) ++ ++/* ++** get4_s32_z0_3: ++** mov z0\.d, z7\.d ++** ret ++*/ ++TEST_GET (get4_s32_z0_3, svint32x4_t, svint32_t, ++ z0 = svget4_s32 (z4, 3), ++ z0 = svget4 (z4, 3)) ++ ++/* ++** get4_s32_z4_0: ++** ret ++*/ ++TEST_GET (get4_s32_z4_0, svint32x4_t, svint32_t, ++ z4_res = svget4_s32 (z4, 0), ++ z4_res = svget4 (z4, 0)) ++ ++/* ++** get4_s32_z4_1: ++** mov z4\.d, z5\.d ++** ret ++*/ ++TEST_GET (get4_s32_z4_1, svint32x4_t, svint32_t, ++ z4_res = svget4_s32 (z4, 1), ++ z4_res = svget4 (z4, 1)) ++ ++/* ++** get4_s32_z4_2: ++** mov z4\.d, z6\.d ++** ret ++*/ ++TEST_GET (get4_s32_z4_2, svint32x4_t, svint32_t, ++ z4_res = svget4_s32 (z4, 2), ++ z4_res = svget4 (z4, 2)) ++ ++/* ++** get4_s32_z4_3: ++** mov z4\.d, z7\.d ++** ret ++*/ ++TEST_GET (get4_s32_z4_3, svint32x4_t, svint32_t, ++ z4_res = svget4_s32 (z4, 3), ++ z4_res = svget4 (z4, 3)) ++ ++/* ++** get4_s32_z5_0: ++** mov z5\.d, z4\.d ++** ret ++*/ ++TEST_GET (get4_s32_z5_0, svint32x4_t, svint32_t, ++ z5_res = svget4_s32 (z4, 0), ++ z5_res = svget4 (z4, 0)) ++ ++/* ++** get4_s32_z5_1: ++** ret ++*/ ++TEST_GET (get4_s32_z5_1, svint32x4_t, svint32_t, ++ z5_res = svget4_s32 (z4, 1), ++ z5_res = svget4 (z4, 1)) ++ ++/* ++** get4_s32_z5_2: ++** mov z5\.d, z6\.d ++** ret ++*/ ++TEST_GET (get4_s32_z5_2, svint32x4_t, svint32_t, ++ z5_res = svget4_s32 (z4, 2), ++ z5_res = svget4 (z4, 2)) ++ ++/* ++** get4_s32_z5_3: ++** mov z5\.d, z7\.d ++** ret ++*/ ++TEST_GET (get4_s32_z5_3, svint32x4_t, svint32_t, ++ z5_res = svget4_s32 (z4, 3), ++ z5_res = svget4 (z4, 3)) ++ ++/* ++** get4_s32_z6_0: ++** mov z6\.d, z4\.d ++** ret ++*/ ++TEST_GET (get4_s32_z6_0, svint32x4_t, svint32_t, ++ z6_res = svget4_s32 (z4, 0), ++ z6_res = svget4 (z4, 0)) ++ ++/* ++** get4_s32_z6_1: ++** mov z6\.d, z5\.d ++** ret ++*/ ++TEST_GET (get4_s32_z6_1, svint32x4_t, svint32_t, ++ z6_res = svget4_s32 (z4, 1), ++ z6_res = svget4 (z4, 1)) ++ ++/* ++** get4_s32_z6_2: ++** ret ++*/ ++TEST_GET (get4_s32_z6_2, svint32x4_t, svint32_t, ++ z6_res = svget4_s32 (z4, 2), ++ z6_res = svget4 (z4, 2)) ++ ++/* ++** get4_s32_z6_3: ++** mov z6\.d, z7\.d ++** ret ++*/ ++TEST_GET (get4_s32_z6_3, svint32x4_t, svint32_t, ++ z6_res = svget4_s32 (z4, 3), ++ z6_res = svget4 (z4, 3)) ++ ++/* ++** get4_s32_z7_0: ++** mov z7\.d, z4\.d ++** ret ++*/ ++TEST_GET (get4_s32_z7_0, svint32x4_t, svint32_t, ++ z7_res = svget4_s32 (z4, 0), ++ z7_res = svget4 (z4, 0)) ++ ++/* ++** get4_s32_z7_1: ++** mov z7\.d, z5\.d ++** ret ++*/ ++TEST_GET (get4_s32_z7_1, svint32x4_t, svint32_t, ++ z7_res = svget4_s32 (z4, 1), ++ z7_res = svget4 (z4, 1)) ++ ++/* ++** get4_s32_z7_2: ++** mov z7\.d, z6\.d ++** ret ++*/ ++TEST_GET (get4_s32_z7_2, svint32x4_t, svint32_t, ++ z7_res = svget4_s32 (z4, 2), ++ z7_res = svget4 (z4, 2)) ++ ++/* ++** get4_s32_z7_3: ++** ret ++*/ ++TEST_GET (get4_s32_z7_3, svint32x4_t, svint32_t, ++ z7_res = svget4_s32 (z4, 3), ++ z7_res = svget4 (z4, 3)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_s64.c +new file mode 100644 +index 000000000..a17e2779c +--- /dev/null ++++ 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_s64.c +@@ -0,0 +1,179 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** get4_s64_z0_0: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_GET (get4_s64_z0_0, svint64x4_t, svint64_t, ++ z0 = svget4_s64 (z4, 0), ++ z0 = svget4 (z4, 0)) ++ ++/* ++** get4_s64_z0_1: ++** mov z0\.d, z5\.d ++** ret ++*/ ++TEST_GET (get4_s64_z0_1, svint64x4_t, svint64_t, ++ z0 = svget4_s64 (z4, 1), ++ z0 = svget4 (z4, 1)) ++ ++/* ++** get4_s64_z0_2: ++** mov z0\.d, z6\.d ++** ret ++*/ ++TEST_GET (get4_s64_z0_2, svint64x4_t, svint64_t, ++ z0 = svget4_s64 (z4, 2), ++ z0 = svget4 (z4, 2)) ++ ++/* ++** get4_s64_z0_3: ++** mov z0\.d, z7\.d ++** ret ++*/ ++TEST_GET (get4_s64_z0_3, svint64x4_t, svint64_t, ++ z0 = svget4_s64 (z4, 3), ++ z0 = svget4 (z4, 3)) ++ ++/* ++** get4_s64_z4_0: ++** ret ++*/ ++TEST_GET (get4_s64_z4_0, svint64x4_t, svint64_t, ++ z4_res = svget4_s64 (z4, 0), ++ z4_res = svget4 (z4, 0)) ++ ++/* ++** get4_s64_z4_1: ++** mov z4\.d, z5\.d ++** ret ++*/ ++TEST_GET (get4_s64_z4_1, svint64x4_t, svint64_t, ++ z4_res = svget4_s64 (z4, 1), ++ z4_res = svget4 (z4, 1)) ++ ++/* ++** get4_s64_z4_2: ++** mov z4\.d, z6\.d ++** ret ++*/ ++TEST_GET (get4_s64_z4_2, svint64x4_t, svint64_t, ++ z4_res = svget4_s64 (z4, 2), ++ z4_res = svget4 (z4, 2)) ++ ++/* ++** get4_s64_z4_3: ++** mov z4\.d, z7\.d ++** ret ++*/ ++TEST_GET (get4_s64_z4_3, svint64x4_t, svint64_t, ++ z4_res = svget4_s64 (z4, 3), ++ z4_res = svget4 (z4, 3)) ++ ++/* ++** get4_s64_z5_0: ++** mov z5\.d, z4\.d ++** ret ++*/ ++TEST_GET (get4_s64_z5_0, svint64x4_t, svint64_t, ++ z5_res = svget4_s64 (z4, 0), ++ z5_res = svget4 (z4, 0)) ++ ++/* ++** get4_s64_z5_1: ++** ret ++*/ ++TEST_GET (get4_s64_z5_1, svint64x4_t, svint64_t, ++ z5_res = svget4_s64 (z4, 1), ++ z5_res = svget4 (z4, 1)) ++ ++/* ++** get4_s64_z5_2: ++** mov z5\.d, z6\.d ++** ret ++*/ ++TEST_GET (get4_s64_z5_2, svint64x4_t, svint64_t, ++ z5_res = svget4_s64 (z4, 2), ++ z5_res = svget4 (z4, 2)) ++ ++/* ++** get4_s64_z5_3: ++** mov z5\.d, z7\.d ++** ret ++*/ ++TEST_GET (get4_s64_z5_3, svint64x4_t, svint64_t, ++ z5_res = svget4_s64 (z4, 3), ++ z5_res = svget4 (z4, 3)) ++ ++/* ++** get4_s64_z6_0: ++** mov z6\.d, z4\.d ++** ret ++*/ ++TEST_GET (get4_s64_z6_0, svint64x4_t, svint64_t, ++ z6_res = svget4_s64 (z4, 0), ++ z6_res = svget4 (z4, 0)) ++ ++/* ++** get4_s64_z6_1: ++** mov z6\.d, z5\.d ++** ret ++*/ ++TEST_GET (get4_s64_z6_1, svint64x4_t, svint64_t, ++ z6_res = svget4_s64 (z4, 1), ++ z6_res = svget4 (z4, 1)) ++ ++/* ++** get4_s64_z6_2: ++** ret ++*/ ++TEST_GET (get4_s64_z6_2, svint64x4_t, svint64_t, ++ z6_res = svget4_s64 (z4, 2), ++ z6_res = svget4 (z4, 2)) ++ ++/* ++** get4_s64_z6_3: ++** mov z6\.d, z7\.d ++** ret ++*/ ++TEST_GET (get4_s64_z6_3, svint64x4_t, svint64_t, ++ z6_res = svget4_s64 (z4, 3), ++ z6_res = svget4 (z4, 3)) ++ ++/* ++** get4_s64_z7_0: ++** mov z7\.d, z4\.d ++** ret ++*/ ++TEST_GET (get4_s64_z7_0, svint64x4_t, svint64_t, ++ z7_res = svget4_s64 (z4, 0), ++ z7_res = svget4 (z4, 0)) ++ ++/* ++** get4_s64_z7_1: ++** mov z7\.d, z5\.d ++** ret ++*/ ++TEST_GET (get4_s64_z7_1, svint64x4_t, svint64_t, ++ z7_res = svget4_s64 (z4, 1), ++ z7_res = svget4 (z4, 1)) ++ ++/* ++** get4_s64_z7_2: ++** mov z7\.d, z6\.d ++** ret ++*/ ++TEST_GET (get4_s64_z7_2, svint64x4_t, svint64_t, ++ z7_res = svget4_s64 (z4, 2), ++ z7_res = svget4 (z4, 2)) ++ ++/* ++** get4_s64_z7_3: ++** ret ++*/ ++TEST_GET (get4_s64_z7_3, svint64x4_t, svint64_t, ++ z7_res = svget4_s64 (z4, 3), ++ z7_res = svget4 (z4, 
3)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_s8.c +new file mode 100644 +index 000000000..9fa159597 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_s8.c +@@ -0,0 +1,179 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** get4_s8_z0_0: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_GET (get4_s8_z0_0, svint8x4_t, svint8_t, ++ z0 = svget4_s8 (z4, 0), ++ z0 = svget4 (z4, 0)) ++ ++/* ++** get4_s8_z0_1: ++** mov z0\.d, z5\.d ++** ret ++*/ ++TEST_GET (get4_s8_z0_1, svint8x4_t, svint8_t, ++ z0 = svget4_s8 (z4, 1), ++ z0 = svget4 (z4, 1)) ++ ++/* ++** get4_s8_z0_2: ++** mov z0\.d, z6\.d ++** ret ++*/ ++TEST_GET (get4_s8_z0_2, svint8x4_t, svint8_t, ++ z0 = svget4_s8 (z4, 2), ++ z0 = svget4 (z4, 2)) ++ ++/* ++** get4_s8_z0_3: ++** mov z0\.d, z7\.d ++** ret ++*/ ++TEST_GET (get4_s8_z0_3, svint8x4_t, svint8_t, ++ z0 = svget4_s8 (z4, 3), ++ z0 = svget4 (z4, 3)) ++ ++/* ++** get4_s8_z4_0: ++** ret ++*/ ++TEST_GET (get4_s8_z4_0, svint8x4_t, svint8_t, ++ z4_res = svget4_s8 (z4, 0), ++ z4_res = svget4 (z4, 0)) ++ ++/* ++** get4_s8_z4_1: ++** mov z4\.d, z5\.d ++** ret ++*/ ++TEST_GET (get4_s8_z4_1, svint8x4_t, svint8_t, ++ z4_res = svget4_s8 (z4, 1), ++ z4_res = svget4 (z4, 1)) ++ ++/* ++** get4_s8_z4_2: ++** mov z4\.d, z6\.d ++** ret ++*/ ++TEST_GET (get4_s8_z4_2, svint8x4_t, svint8_t, ++ z4_res = svget4_s8 (z4, 2), ++ z4_res = svget4 (z4, 2)) ++ ++/* ++** get4_s8_z4_3: ++** mov z4\.d, z7\.d ++** ret ++*/ ++TEST_GET (get4_s8_z4_3, svint8x4_t, svint8_t, ++ z4_res = svget4_s8 (z4, 3), ++ z4_res = svget4 (z4, 3)) ++ ++/* ++** get4_s8_z5_0: ++** mov z5\.d, z4\.d ++** ret ++*/ ++TEST_GET (get4_s8_z5_0, svint8x4_t, svint8_t, ++ z5_res = svget4_s8 (z4, 0), ++ z5_res = svget4 (z4, 0)) ++ ++/* ++** get4_s8_z5_1: ++** ret ++*/ ++TEST_GET (get4_s8_z5_1, svint8x4_t, svint8_t, ++ z5_res = svget4_s8 (z4, 1), ++ z5_res = svget4 (z4, 1)) ++ ++/* ++** get4_s8_z5_2: ++** mov z5\.d, z6\.d ++** ret ++*/ ++TEST_GET (get4_s8_z5_2, svint8x4_t, svint8_t, ++ z5_res = svget4_s8 (z4, 2), ++ z5_res = svget4 (z4, 2)) ++ ++/* ++** get4_s8_z5_3: ++** mov z5\.d, z7\.d ++** ret ++*/ ++TEST_GET (get4_s8_z5_3, svint8x4_t, svint8_t, ++ z5_res = svget4_s8 (z4, 3), ++ z5_res = svget4 (z4, 3)) ++ ++/* ++** get4_s8_z6_0: ++** mov z6\.d, z4\.d ++** ret ++*/ ++TEST_GET (get4_s8_z6_0, svint8x4_t, svint8_t, ++ z6_res = svget4_s8 (z4, 0), ++ z6_res = svget4 (z4, 0)) ++ ++/* ++** get4_s8_z6_1: ++** mov z6\.d, z5\.d ++** ret ++*/ ++TEST_GET (get4_s8_z6_1, svint8x4_t, svint8_t, ++ z6_res = svget4_s8 (z4, 1), ++ z6_res = svget4 (z4, 1)) ++ ++/* ++** get4_s8_z6_2: ++** ret ++*/ ++TEST_GET (get4_s8_z6_2, svint8x4_t, svint8_t, ++ z6_res = svget4_s8 (z4, 2), ++ z6_res = svget4 (z4, 2)) ++ ++/* ++** get4_s8_z6_3: ++** mov z6\.d, z7\.d ++** ret ++*/ ++TEST_GET (get4_s8_z6_3, svint8x4_t, svint8_t, ++ z6_res = svget4_s8 (z4, 3), ++ z6_res = svget4 (z4, 3)) ++ ++/* ++** get4_s8_z7_0: ++** mov z7\.d, z4\.d ++** ret ++*/ ++TEST_GET (get4_s8_z7_0, svint8x4_t, svint8_t, ++ z7_res = svget4_s8 (z4, 0), ++ z7_res = svget4 (z4, 0)) ++ ++/* ++** get4_s8_z7_1: ++** mov z7\.d, z5\.d ++** ret ++*/ ++TEST_GET (get4_s8_z7_1, svint8x4_t, svint8_t, ++ z7_res = svget4_s8 (z4, 1), ++ z7_res = svget4 (z4, 1)) ++ ++/* ++** get4_s8_z7_2: ++** mov z7\.d, z6\.d ++** ret ++*/ ++TEST_GET (get4_s8_z7_2, svint8x4_t, svint8_t, ++ z7_res = svget4_s8 (z4, 2), ++ z7_res = svget4 (z4, 2)) ++ ++/* ++** get4_s8_z7_3: ++** ret ++*/ 
++TEST_GET (get4_s8_z7_3, svint8x4_t, svint8_t, ++ z7_res = svget4_s8 (z4, 3), ++ z7_res = svget4 (z4, 3)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_u16.c +new file mode 100644 +index 000000000..8f17ad213 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_u16.c +@@ -0,0 +1,179 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** get4_u16_z0_0: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_GET (get4_u16_z0_0, svuint16x4_t, svuint16_t, ++ z0 = svget4_u16 (z4, 0), ++ z0 = svget4 (z4, 0)) ++ ++/* ++** get4_u16_z0_1: ++** mov z0\.d, z5\.d ++** ret ++*/ ++TEST_GET (get4_u16_z0_1, svuint16x4_t, svuint16_t, ++ z0 = svget4_u16 (z4, 1), ++ z0 = svget4 (z4, 1)) ++ ++/* ++** get4_u16_z0_2: ++** mov z0\.d, z6\.d ++** ret ++*/ ++TEST_GET (get4_u16_z0_2, svuint16x4_t, svuint16_t, ++ z0 = svget4_u16 (z4, 2), ++ z0 = svget4 (z4, 2)) ++ ++/* ++** get4_u16_z0_3: ++** mov z0\.d, z7\.d ++** ret ++*/ ++TEST_GET (get4_u16_z0_3, svuint16x4_t, svuint16_t, ++ z0 = svget4_u16 (z4, 3), ++ z0 = svget4 (z4, 3)) ++ ++/* ++** get4_u16_z4_0: ++** ret ++*/ ++TEST_GET (get4_u16_z4_0, svuint16x4_t, svuint16_t, ++ z4_res = svget4_u16 (z4, 0), ++ z4_res = svget4 (z4, 0)) ++ ++/* ++** get4_u16_z4_1: ++** mov z4\.d, z5\.d ++** ret ++*/ ++TEST_GET (get4_u16_z4_1, svuint16x4_t, svuint16_t, ++ z4_res = svget4_u16 (z4, 1), ++ z4_res = svget4 (z4, 1)) ++ ++/* ++** get4_u16_z4_2: ++** mov z4\.d, z6\.d ++** ret ++*/ ++TEST_GET (get4_u16_z4_2, svuint16x4_t, svuint16_t, ++ z4_res = svget4_u16 (z4, 2), ++ z4_res = svget4 (z4, 2)) ++ ++/* ++** get4_u16_z4_3: ++** mov z4\.d, z7\.d ++** ret ++*/ ++TEST_GET (get4_u16_z4_3, svuint16x4_t, svuint16_t, ++ z4_res = svget4_u16 (z4, 3), ++ z4_res = svget4 (z4, 3)) ++ ++/* ++** get4_u16_z5_0: ++** mov z5\.d, z4\.d ++** ret ++*/ ++TEST_GET (get4_u16_z5_0, svuint16x4_t, svuint16_t, ++ z5_res = svget4_u16 (z4, 0), ++ z5_res = svget4 (z4, 0)) ++ ++/* ++** get4_u16_z5_1: ++** ret ++*/ ++TEST_GET (get4_u16_z5_1, svuint16x4_t, svuint16_t, ++ z5_res = svget4_u16 (z4, 1), ++ z5_res = svget4 (z4, 1)) ++ ++/* ++** get4_u16_z5_2: ++** mov z5\.d, z6\.d ++** ret ++*/ ++TEST_GET (get4_u16_z5_2, svuint16x4_t, svuint16_t, ++ z5_res = svget4_u16 (z4, 2), ++ z5_res = svget4 (z4, 2)) ++ ++/* ++** get4_u16_z5_3: ++** mov z5\.d, z7\.d ++** ret ++*/ ++TEST_GET (get4_u16_z5_3, svuint16x4_t, svuint16_t, ++ z5_res = svget4_u16 (z4, 3), ++ z5_res = svget4 (z4, 3)) ++ ++/* ++** get4_u16_z6_0: ++** mov z6\.d, z4\.d ++** ret ++*/ ++TEST_GET (get4_u16_z6_0, svuint16x4_t, svuint16_t, ++ z6_res = svget4_u16 (z4, 0), ++ z6_res = svget4 (z4, 0)) ++ ++/* ++** get4_u16_z6_1: ++** mov z6\.d, z5\.d ++** ret ++*/ ++TEST_GET (get4_u16_z6_1, svuint16x4_t, svuint16_t, ++ z6_res = svget4_u16 (z4, 1), ++ z6_res = svget4 (z4, 1)) ++ ++/* ++** get4_u16_z6_2: ++** ret ++*/ ++TEST_GET (get4_u16_z6_2, svuint16x4_t, svuint16_t, ++ z6_res = svget4_u16 (z4, 2), ++ z6_res = svget4 (z4, 2)) ++ ++/* ++** get4_u16_z6_3: ++** mov z6\.d, z7\.d ++** ret ++*/ ++TEST_GET (get4_u16_z6_3, svuint16x4_t, svuint16_t, ++ z6_res = svget4_u16 (z4, 3), ++ z6_res = svget4 (z4, 3)) ++ ++/* ++** get4_u16_z7_0: ++** mov z7\.d, z4\.d ++** ret ++*/ ++TEST_GET (get4_u16_z7_0, svuint16x4_t, svuint16_t, ++ z7_res = svget4_u16 (z4, 0), ++ z7_res = svget4 (z4, 0)) ++ ++/* ++** get4_u16_z7_1: ++** mov z7\.d, z5\.d ++** ret ++*/ ++TEST_GET (get4_u16_z7_1, svuint16x4_t, svuint16_t, ++ z7_res = svget4_u16 (z4, 1), ++ 
z7_res = svget4 (z4, 1)) ++ ++/* ++** get4_u16_z7_2: ++** mov z7\.d, z6\.d ++** ret ++*/ ++TEST_GET (get4_u16_z7_2, svuint16x4_t, svuint16_t, ++ z7_res = svget4_u16 (z4, 2), ++ z7_res = svget4 (z4, 2)) ++ ++/* ++** get4_u16_z7_3: ++** ret ++*/ ++TEST_GET (get4_u16_z7_3, svuint16x4_t, svuint16_t, ++ z7_res = svget4_u16 (z4, 3), ++ z7_res = svget4 (z4, 3)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_u32.c +new file mode 100644 +index 000000000..e6c94b39d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_u32.c +@@ -0,0 +1,179 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** get4_u32_z0_0: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_GET (get4_u32_z0_0, svuint32x4_t, svuint32_t, ++ z0 = svget4_u32 (z4, 0), ++ z0 = svget4 (z4, 0)) ++ ++/* ++** get4_u32_z0_1: ++** mov z0\.d, z5\.d ++** ret ++*/ ++TEST_GET (get4_u32_z0_1, svuint32x4_t, svuint32_t, ++ z0 = svget4_u32 (z4, 1), ++ z0 = svget4 (z4, 1)) ++ ++/* ++** get4_u32_z0_2: ++** mov z0\.d, z6\.d ++** ret ++*/ ++TEST_GET (get4_u32_z0_2, svuint32x4_t, svuint32_t, ++ z0 = svget4_u32 (z4, 2), ++ z0 = svget4 (z4, 2)) ++ ++/* ++** get4_u32_z0_3: ++** mov z0\.d, z7\.d ++** ret ++*/ ++TEST_GET (get4_u32_z0_3, svuint32x4_t, svuint32_t, ++ z0 = svget4_u32 (z4, 3), ++ z0 = svget4 (z4, 3)) ++ ++/* ++** get4_u32_z4_0: ++** ret ++*/ ++TEST_GET (get4_u32_z4_0, svuint32x4_t, svuint32_t, ++ z4_res = svget4_u32 (z4, 0), ++ z4_res = svget4 (z4, 0)) ++ ++/* ++** get4_u32_z4_1: ++** mov z4\.d, z5\.d ++** ret ++*/ ++TEST_GET (get4_u32_z4_1, svuint32x4_t, svuint32_t, ++ z4_res = svget4_u32 (z4, 1), ++ z4_res = svget4 (z4, 1)) ++ ++/* ++** get4_u32_z4_2: ++** mov z4\.d, z6\.d ++** ret ++*/ ++TEST_GET (get4_u32_z4_2, svuint32x4_t, svuint32_t, ++ z4_res = svget4_u32 (z4, 2), ++ z4_res = svget4 (z4, 2)) ++ ++/* ++** get4_u32_z4_3: ++** mov z4\.d, z7\.d ++** ret ++*/ ++TEST_GET (get4_u32_z4_3, svuint32x4_t, svuint32_t, ++ z4_res = svget4_u32 (z4, 3), ++ z4_res = svget4 (z4, 3)) ++ ++/* ++** get4_u32_z5_0: ++** mov z5\.d, z4\.d ++** ret ++*/ ++TEST_GET (get4_u32_z5_0, svuint32x4_t, svuint32_t, ++ z5_res = svget4_u32 (z4, 0), ++ z5_res = svget4 (z4, 0)) ++ ++/* ++** get4_u32_z5_1: ++** ret ++*/ ++TEST_GET (get4_u32_z5_1, svuint32x4_t, svuint32_t, ++ z5_res = svget4_u32 (z4, 1), ++ z5_res = svget4 (z4, 1)) ++ ++/* ++** get4_u32_z5_2: ++** mov z5\.d, z6\.d ++** ret ++*/ ++TEST_GET (get4_u32_z5_2, svuint32x4_t, svuint32_t, ++ z5_res = svget4_u32 (z4, 2), ++ z5_res = svget4 (z4, 2)) ++ ++/* ++** get4_u32_z5_3: ++** mov z5\.d, z7\.d ++** ret ++*/ ++TEST_GET (get4_u32_z5_3, svuint32x4_t, svuint32_t, ++ z5_res = svget4_u32 (z4, 3), ++ z5_res = svget4 (z4, 3)) ++ ++/* ++** get4_u32_z6_0: ++** mov z6\.d, z4\.d ++** ret ++*/ ++TEST_GET (get4_u32_z6_0, svuint32x4_t, svuint32_t, ++ z6_res = svget4_u32 (z4, 0), ++ z6_res = svget4 (z4, 0)) ++ ++/* ++** get4_u32_z6_1: ++** mov z6\.d, z5\.d ++** ret ++*/ ++TEST_GET (get4_u32_z6_1, svuint32x4_t, svuint32_t, ++ z6_res = svget4_u32 (z4, 1), ++ z6_res = svget4 (z4, 1)) ++ ++/* ++** get4_u32_z6_2: ++** ret ++*/ ++TEST_GET (get4_u32_z6_2, svuint32x4_t, svuint32_t, ++ z6_res = svget4_u32 (z4, 2), ++ z6_res = svget4 (z4, 2)) ++ ++/* ++** get4_u32_z6_3: ++** mov z6\.d, z7\.d ++** ret ++*/ ++TEST_GET (get4_u32_z6_3, svuint32x4_t, svuint32_t, ++ z6_res = svget4_u32 (z4, 3), ++ z6_res = svget4 (z4, 3)) ++ ++/* ++** get4_u32_z7_0: ++** mov z7\.d, z4\.d ++** ret ++*/ ++TEST_GET 
(get4_u32_z7_0, svuint32x4_t, svuint32_t, ++ z7_res = svget4_u32 (z4, 0), ++ z7_res = svget4 (z4, 0)) ++ ++/* ++** get4_u32_z7_1: ++** mov z7\.d, z5\.d ++** ret ++*/ ++TEST_GET (get4_u32_z7_1, svuint32x4_t, svuint32_t, ++ z7_res = svget4_u32 (z4, 1), ++ z7_res = svget4 (z4, 1)) ++ ++/* ++** get4_u32_z7_2: ++** mov z7\.d, z6\.d ++** ret ++*/ ++TEST_GET (get4_u32_z7_2, svuint32x4_t, svuint32_t, ++ z7_res = svget4_u32 (z4, 2), ++ z7_res = svget4 (z4, 2)) ++ ++/* ++** get4_u32_z7_3: ++** ret ++*/ ++TEST_GET (get4_u32_z7_3, svuint32x4_t, svuint32_t, ++ z7_res = svget4_u32 (z4, 3), ++ z7_res = svget4 (z4, 3)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_u64.c +new file mode 100644 +index 000000000..79c293a2c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_u64.c +@@ -0,0 +1,179 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** get4_u64_z0_0: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_GET (get4_u64_z0_0, svuint64x4_t, svuint64_t, ++ z0 = svget4_u64 (z4, 0), ++ z0 = svget4 (z4, 0)) ++ ++/* ++** get4_u64_z0_1: ++** mov z0\.d, z5\.d ++** ret ++*/ ++TEST_GET (get4_u64_z0_1, svuint64x4_t, svuint64_t, ++ z0 = svget4_u64 (z4, 1), ++ z0 = svget4 (z4, 1)) ++ ++/* ++** get4_u64_z0_2: ++** mov z0\.d, z6\.d ++** ret ++*/ ++TEST_GET (get4_u64_z0_2, svuint64x4_t, svuint64_t, ++ z0 = svget4_u64 (z4, 2), ++ z0 = svget4 (z4, 2)) ++ ++/* ++** get4_u64_z0_3: ++** mov z0\.d, z7\.d ++** ret ++*/ ++TEST_GET (get4_u64_z0_3, svuint64x4_t, svuint64_t, ++ z0 = svget4_u64 (z4, 3), ++ z0 = svget4 (z4, 3)) ++ ++/* ++** get4_u64_z4_0: ++** ret ++*/ ++TEST_GET (get4_u64_z4_0, svuint64x4_t, svuint64_t, ++ z4_res = svget4_u64 (z4, 0), ++ z4_res = svget4 (z4, 0)) ++ ++/* ++** get4_u64_z4_1: ++** mov z4\.d, z5\.d ++** ret ++*/ ++TEST_GET (get4_u64_z4_1, svuint64x4_t, svuint64_t, ++ z4_res = svget4_u64 (z4, 1), ++ z4_res = svget4 (z4, 1)) ++ ++/* ++** get4_u64_z4_2: ++** mov z4\.d, z6\.d ++** ret ++*/ ++TEST_GET (get4_u64_z4_2, svuint64x4_t, svuint64_t, ++ z4_res = svget4_u64 (z4, 2), ++ z4_res = svget4 (z4, 2)) ++ ++/* ++** get4_u64_z4_3: ++** mov z4\.d, z7\.d ++** ret ++*/ ++TEST_GET (get4_u64_z4_3, svuint64x4_t, svuint64_t, ++ z4_res = svget4_u64 (z4, 3), ++ z4_res = svget4 (z4, 3)) ++ ++/* ++** get4_u64_z5_0: ++** mov z5\.d, z4\.d ++** ret ++*/ ++TEST_GET (get4_u64_z5_0, svuint64x4_t, svuint64_t, ++ z5_res = svget4_u64 (z4, 0), ++ z5_res = svget4 (z4, 0)) ++ ++/* ++** get4_u64_z5_1: ++** ret ++*/ ++TEST_GET (get4_u64_z5_1, svuint64x4_t, svuint64_t, ++ z5_res = svget4_u64 (z4, 1), ++ z5_res = svget4 (z4, 1)) ++ ++/* ++** get4_u64_z5_2: ++** mov z5\.d, z6\.d ++** ret ++*/ ++TEST_GET (get4_u64_z5_2, svuint64x4_t, svuint64_t, ++ z5_res = svget4_u64 (z4, 2), ++ z5_res = svget4 (z4, 2)) ++ ++/* ++** get4_u64_z5_3: ++** mov z5\.d, z7\.d ++** ret ++*/ ++TEST_GET (get4_u64_z5_3, svuint64x4_t, svuint64_t, ++ z5_res = svget4_u64 (z4, 3), ++ z5_res = svget4 (z4, 3)) ++ ++/* ++** get4_u64_z6_0: ++** mov z6\.d, z4\.d ++** ret ++*/ ++TEST_GET (get4_u64_z6_0, svuint64x4_t, svuint64_t, ++ z6_res = svget4_u64 (z4, 0), ++ z6_res = svget4 (z4, 0)) ++ ++/* ++** get4_u64_z6_1: ++** mov z6\.d, z5\.d ++** ret ++*/ ++TEST_GET (get4_u64_z6_1, svuint64x4_t, svuint64_t, ++ z6_res = svget4_u64 (z4, 1), ++ z6_res = svget4 (z4, 1)) ++ ++/* ++** get4_u64_z6_2: ++** ret ++*/ ++TEST_GET (get4_u64_z6_2, svuint64x4_t, svuint64_t, ++ z6_res = svget4_u64 (z4, 2), ++ z6_res = svget4 (z4, 2)) ++ ++/* 
++** get4_u64_z6_3: ++** mov z6\.d, z7\.d ++** ret ++*/ ++TEST_GET (get4_u64_z6_3, svuint64x4_t, svuint64_t, ++ z6_res = svget4_u64 (z4, 3), ++ z6_res = svget4 (z4, 3)) ++ ++/* ++** get4_u64_z7_0: ++** mov z7\.d, z4\.d ++** ret ++*/ ++TEST_GET (get4_u64_z7_0, svuint64x4_t, svuint64_t, ++ z7_res = svget4_u64 (z4, 0), ++ z7_res = svget4 (z4, 0)) ++ ++/* ++** get4_u64_z7_1: ++** mov z7\.d, z5\.d ++** ret ++*/ ++TEST_GET (get4_u64_z7_1, svuint64x4_t, svuint64_t, ++ z7_res = svget4_u64 (z4, 1), ++ z7_res = svget4 (z4, 1)) ++ ++/* ++** get4_u64_z7_2: ++** mov z7\.d, z6\.d ++** ret ++*/ ++TEST_GET (get4_u64_z7_2, svuint64x4_t, svuint64_t, ++ z7_res = svget4_u64 (z4, 2), ++ z7_res = svget4 (z4, 2)) ++ ++/* ++** get4_u64_z7_3: ++** ret ++*/ ++TEST_GET (get4_u64_z7_3, svuint64x4_t, svuint64_t, ++ z7_res = svget4_u64 (z4, 3), ++ z7_res = svget4 (z4, 3)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_u8.c +new file mode 100644 +index 000000000..f3ad9a85b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_u8.c +@@ -0,0 +1,179 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** get4_u8_z0_0: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_GET (get4_u8_z0_0, svuint8x4_t, svuint8_t, ++ z0 = svget4_u8 (z4, 0), ++ z0 = svget4 (z4, 0)) ++ ++/* ++** get4_u8_z0_1: ++** mov z0\.d, z5\.d ++** ret ++*/ ++TEST_GET (get4_u8_z0_1, svuint8x4_t, svuint8_t, ++ z0 = svget4_u8 (z4, 1), ++ z0 = svget4 (z4, 1)) ++ ++/* ++** get4_u8_z0_2: ++** mov z0\.d, z6\.d ++** ret ++*/ ++TEST_GET (get4_u8_z0_2, svuint8x4_t, svuint8_t, ++ z0 = svget4_u8 (z4, 2), ++ z0 = svget4 (z4, 2)) ++ ++/* ++** get4_u8_z0_3: ++** mov z0\.d, z7\.d ++** ret ++*/ ++TEST_GET (get4_u8_z0_3, svuint8x4_t, svuint8_t, ++ z0 = svget4_u8 (z4, 3), ++ z0 = svget4 (z4, 3)) ++ ++/* ++** get4_u8_z4_0: ++** ret ++*/ ++TEST_GET (get4_u8_z4_0, svuint8x4_t, svuint8_t, ++ z4_res = svget4_u8 (z4, 0), ++ z4_res = svget4 (z4, 0)) ++ ++/* ++** get4_u8_z4_1: ++** mov z4\.d, z5\.d ++** ret ++*/ ++TEST_GET (get4_u8_z4_1, svuint8x4_t, svuint8_t, ++ z4_res = svget4_u8 (z4, 1), ++ z4_res = svget4 (z4, 1)) ++ ++/* ++** get4_u8_z4_2: ++** mov z4\.d, z6\.d ++** ret ++*/ ++TEST_GET (get4_u8_z4_2, svuint8x4_t, svuint8_t, ++ z4_res = svget4_u8 (z4, 2), ++ z4_res = svget4 (z4, 2)) ++ ++/* ++** get4_u8_z4_3: ++** mov z4\.d, z7\.d ++** ret ++*/ ++TEST_GET (get4_u8_z4_3, svuint8x4_t, svuint8_t, ++ z4_res = svget4_u8 (z4, 3), ++ z4_res = svget4 (z4, 3)) ++ ++/* ++** get4_u8_z5_0: ++** mov z5\.d, z4\.d ++** ret ++*/ ++TEST_GET (get4_u8_z5_0, svuint8x4_t, svuint8_t, ++ z5_res = svget4_u8 (z4, 0), ++ z5_res = svget4 (z4, 0)) ++ ++/* ++** get4_u8_z5_1: ++** ret ++*/ ++TEST_GET (get4_u8_z5_1, svuint8x4_t, svuint8_t, ++ z5_res = svget4_u8 (z4, 1), ++ z5_res = svget4 (z4, 1)) ++ ++/* ++** get4_u8_z5_2: ++** mov z5\.d, z6\.d ++** ret ++*/ ++TEST_GET (get4_u8_z5_2, svuint8x4_t, svuint8_t, ++ z5_res = svget4_u8 (z4, 2), ++ z5_res = svget4 (z4, 2)) ++ ++/* ++** get4_u8_z5_3: ++** mov z5\.d, z7\.d ++** ret ++*/ ++TEST_GET (get4_u8_z5_3, svuint8x4_t, svuint8_t, ++ z5_res = svget4_u8 (z4, 3), ++ z5_res = svget4 (z4, 3)) ++ ++/* ++** get4_u8_z6_0: ++** mov z6\.d, z4\.d ++** ret ++*/ ++TEST_GET (get4_u8_z6_0, svuint8x4_t, svuint8_t, ++ z6_res = svget4_u8 (z4, 0), ++ z6_res = svget4 (z4, 0)) ++ ++/* ++** get4_u8_z6_1: ++** mov z6\.d, z5\.d ++** ret ++*/ ++TEST_GET (get4_u8_z6_1, svuint8x4_t, svuint8_t, ++ z6_res = svget4_u8 (z4, 1), ++ z6_res = svget4 
(z4, 1)) ++ ++/* ++** get4_u8_z6_2: ++** ret ++*/ ++TEST_GET (get4_u8_z6_2, svuint8x4_t, svuint8_t, ++ z6_res = svget4_u8 (z4, 2), ++ z6_res = svget4 (z4, 2)) ++ ++/* ++** get4_u8_z6_3: ++** mov z6\.d, z7\.d ++** ret ++*/ ++TEST_GET (get4_u8_z6_3, svuint8x4_t, svuint8_t, ++ z6_res = svget4_u8 (z4, 3), ++ z6_res = svget4 (z4, 3)) ++ ++/* ++** get4_u8_z7_0: ++** mov z7\.d, z4\.d ++** ret ++*/ ++TEST_GET (get4_u8_z7_0, svuint8x4_t, svuint8_t, ++ z7_res = svget4_u8 (z4, 0), ++ z7_res = svget4 (z4, 0)) ++ ++/* ++** get4_u8_z7_1: ++** mov z7\.d, z5\.d ++** ret ++*/ ++TEST_GET (get4_u8_z7_1, svuint8x4_t, svuint8_t, ++ z7_res = svget4_u8 (z4, 1), ++ z7_res = svget4 (z4, 1)) ++ ++/* ++** get4_u8_z7_2: ++** mov z7\.d, z6\.d ++** ret ++*/ ++TEST_GET (get4_u8_z7_2, svuint8x4_t, svuint8_t, ++ z7_res = svget4_u8 (z4, 2), ++ z7_res = svget4 (z4, 2)) ++ ++/* ++** get4_u8_z7_3: ++** ret ++*/ ++TEST_GET (get4_u8_z7_3, svuint8x4_t, svuint8_t, ++ z7_res = svget4_u8 (z4, 3), ++ z7_res = svget4 (z4, 3)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/index_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/index_s16.c +new file mode 100644 +index 000000000..90a1434f1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/index_s16.c +@@ -0,0 +1,220 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** index_s16_w0_w1: ++** index z0\.h, w0, w1 ++** ret ++*/ ++TEST_S (index_s16_w0_w1, svint16_t, int16_t, ++ z0 = svindex_s16 (x0, x1)) ++ ++/* ++** index_s16_w0_2: ++** index z0\.h, w0, #2 ++** ret ++*/ ++TEST_S (index_s16_w0_2, svint16_t, int16_t, ++ z0 = svindex_s16 (x0, 2)) ++ ++/* ++** index_s16_50_2: ++** mov (w[0-9]+), 50 ++** index z0\.h, \1, #2 ++** ret ++*/ ++TEST_S (index_s16_50_2, svint16_t, int16_t, ++ z0 = svindex_s16 (50, 2)) ++ ++/* ++** index_s16_0_m17: ++** mov (w[0-9]+), -17 ++** index z0\.h, #0, \1 ++** ret ++*/ ++TEST_S (index_s16_0_m17, svint16_t, int16_t, ++ z0 = svindex_s16 (0, -17)) ++ ++/* ++** index_s16_0_m16: ++** index z0\.h, #0, #-16 ++** ret ++*/ ++TEST_S (index_s16_0_m16, svint16_t, int16_t, ++ z0 = svindex_s16 (0, -16)) ++ ++/* ++** index_s16_0_1: ++** index z0\.h, #0, #1 ++** ret ++*/ ++TEST_S (index_s16_0_1, svint16_t, int16_t, ++ z0 = svindex_s16 (0, 1)) ++ ++/* ++** index_s16_0_15: ++** index z0\.h, #0, #15 ++** ret ++*/ ++TEST_S (index_s16_0_15, svint16_t, int16_t, ++ z0 = svindex_s16 (0, 15)) ++ ++/* ++** index_s16_0_16: ++** mov (w[0-9]+), 16 ++** index z0\.h, #0, \1 ++** ret ++*/ ++TEST_S (index_s16_0_16, svint16_t, int16_t, ++ z0 = svindex_s16 (0, 16)) ++ ++/* ++** index_s16_m17_1: ++** mov (w[0-9]+), -17 ++** index z0\.h, \1, #1 ++** ret ++*/ ++TEST_S (index_s16_m17_1, svint16_t, int16_t, ++ z0 = svindex_s16 (-17, 1)) ++ ++/* ++** index_s16_m16_1: ++** index z0\.h, #-16, #1 ++** ret ++*/ ++TEST_S (index_s16_m16_1, svint16_t, int16_t, ++ z0 = svindex_s16 (-16, 1)) ++ ++/* ++** index_s16_m1_1: ++** index z0\.h, #-1, #1 ++** ret ++*/ ++TEST_S (index_s16_m1_1, svint16_t, int16_t, ++ z0 = svindex_s16 (-1, 1)) ++ ++/* ++** index_s16_1_1: ++** index z0\.h, #1, #1 ++** ret ++*/ ++TEST_S (index_s16_1_1, svint16_t, int16_t, ++ z0 = svindex_s16 (1, 1)) ++ ++/* ++** index_s16_15_1: ++** index z0\.h, #15, #1 ++** ret ++*/ ++TEST_S (index_s16_15_1, svint16_t, int16_t, ++ z0 = svindex_s16 (15, 1)) ++ ++/* ++** index_s16_16_1: ++** mov (w[0-9]+), 16 ++** index z0\.h, \1, #1 ++** ret ++*/ ++TEST_S (index_s16_16_1, svint16_t, int16_t, ++ z0 = svindex_s16 (16, 1)) ++ ++/* ++** index_s16_m17_x0: ++** mov 
(w[0-9]+), -17 ++** index z0\.h, \1, w0 ++** ret ++*/ ++TEST_S (index_s16_m17_x0, svint16_t, int16_t, ++ z0 = svindex_s16 (-17, x0)) ++ ++/* ++** index_s16_m16_x0: ++** index z0\.h, #-16, w0 ++** ret ++*/ ++TEST_S (index_s16_m16_x0, svint16_t, int16_t, ++ z0 = svindex_s16 (-16, x0)) ++ ++/* ++** index_s16_m1_x0: ++** index z0\.h, #-1, w0 ++** ret ++*/ ++TEST_S (index_s16_m1_x0, svint16_t, int16_t, ++ z0 = svindex_s16 (-1, x0)) ++ ++/* ++** index_s16_0_x0: ++** index z0\.h, #0, w0 ++** ret ++*/ ++TEST_S (index_s16_0_x0, svint16_t, int16_t, ++ z0 = svindex_s16 (0, x0)) ++ ++/* ++** index_s16_1_x0: ++** index z0\.h, #1, w0 ++** ret ++*/ ++TEST_S (index_s16_1_x0, svint16_t, int16_t, ++ z0 = svindex_s16 (1, x0)) ++ ++/* ++** index_s16_15_x0: ++** index z0\.h, #15, w0 ++** ret ++*/ ++TEST_S (index_s16_15_x0, svint16_t, int16_t, ++ z0 = svindex_s16 (15, x0)) ++ ++/* ++** index_s16_16_x0: ++** mov (w[0-9]+), 16 ++** index z0\.h, \1, w0 ++** ret ++*/ ++TEST_S (index_s16_16_x0, svint16_t, int16_t, ++ z0 = svindex_s16 (16, x0)) ++ ++/* ++** index_s16_x0_m17: ++** mov (w[0-9]+), -17 ++** index z0\.h, w0, \1 ++** ret ++*/ ++TEST_S (index_s16_x0_m17, svint16_t, int16_t, ++ z0 = svindex_s16 (x0, -17)) ++ ++/* ++** index_s16_x0_m16: ++** index z0\.h, w0, #-16 ++** ret ++*/ ++TEST_S (index_s16_x0_m16, svint16_t, int16_t, ++ z0 = svindex_s16 (x0, -16)) ++ ++/* ++** index_s16_x0_1: ++** index z0\.h, w0, #1 ++** ret ++*/ ++TEST_S (index_s16_x0_1, svint16_t, int16_t, ++ z0 = svindex_s16 (x0, 1)) ++ ++/* ++** index_s16_x0_15: ++** index z0\.h, w0, #15 ++** ret ++*/ ++TEST_S (index_s16_x0_15, svint16_t, int16_t, ++ z0 = svindex_s16 (x0, 15)) ++ ++/* ++** index_s16_x0_16: ++** mov (w[0-9]+), 16 ++** index z0\.h, w0, \1 ++** ret ++*/ ++TEST_S (index_s16_x0_16, svint16_t, int16_t, ++ z0 = svindex_s16 (x0, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/index_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/index_s32.c +new file mode 100644 +index 000000000..18afedac0 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/index_s32.c +@@ -0,0 +1,220 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** index_s32_w0_w1: ++** index z0\.s, w0, w1 ++** ret ++*/ ++TEST_S (index_s32_w0_w1, svint32_t, int32_t, ++ z0 = svindex_s32 (x0, x1)) ++ ++/* ++** index_s32_w0_2: ++** index z0\.s, w0, #2 ++** ret ++*/ ++TEST_S (index_s32_w0_2, svint32_t, int32_t, ++ z0 = svindex_s32 (x0, 2)) ++ ++/* ++** index_s32_50_2: ++** mov (w[0-9]+), 50 ++** index z0\.s, \1, #2 ++** ret ++*/ ++TEST_S (index_s32_50_2, svint32_t, int32_t, ++ z0 = svindex_s32 (50, 2)) ++ ++/* ++** index_s32_0_m17: ++** mov (w[0-9]+), -17 ++** index z0\.s, #0, \1 ++** ret ++*/ ++TEST_S (index_s32_0_m17, svint32_t, int32_t, ++ z0 = svindex_s32 (0, -17)) ++ ++/* ++** index_s32_0_m16: ++** index z0\.s, #0, #-16 ++** ret ++*/ ++TEST_S (index_s32_0_m16, svint32_t, int32_t, ++ z0 = svindex_s32 (0, -16)) ++ ++/* ++** index_s32_0_1: ++** index z0\.s, #0, #1 ++** ret ++*/ ++TEST_S (index_s32_0_1, svint32_t, int32_t, ++ z0 = svindex_s32 (0, 1)) ++ ++/* ++** index_s32_0_15: ++** index z0\.s, #0, #15 ++** ret ++*/ ++TEST_S (index_s32_0_15, svint32_t, int32_t, ++ z0 = svindex_s32 (0, 15)) ++ ++/* ++** index_s32_0_16: ++** mov (w[0-9]+), 16 ++** index z0\.s, #0, \1 ++** ret ++*/ ++TEST_S (index_s32_0_16, svint32_t, int32_t, ++ z0 = svindex_s32 (0, 16)) ++ ++/* ++** index_s32_m17_1: ++** mov (w[0-9]+), -17 ++** index z0\.s, \1, #1 ++** ret ++*/ ++TEST_S (index_s32_m17_1, svint32_t, 
int32_t, ++ z0 = svindex_s32 (-17, 1)) ++ ++/* ++** index_s32_m16_1: ++** index z0\.s, #-16, #1 ++** ret ++*/ ++TEST_S (index_s32_m16_1, svint32_t, int32_t, ++ z0 = svindex_s32 (-16, 1)) ++ ++/* ++** index_s32_m1_1: ++** index z0\.s, #-1, #1 ++** ret ++*/ ++TEST_S (index_s32_m1_1, svint32_t, int32_t, ++ z0 = svindex_s32 (-1, 1)) ++ ++/* ++** index_s32_1_1: ++** index z0\.s, #1, #1 ++** ret ++*/ ++TEST_S (index_s32_1_1, svint32_t, int32_t, ++ z0 = svindex_s32 (1, 1)) ++ ++/* ++** index_s32_15_1: ++** index z0\.s, #15, #1 ++** ret ++*/ ++TEST_S (index_s32_15_1, svint32_t, int32_t, ++ z0 = svindex_s32 (15, 1)) ++ ++/* ++** index_s32_16_1: ++** mov (w[0-9]+), 16 ++** index z0\.s, \1, #1 ++** ret ++*/ ++TEST_S (index_s32_16_1, svint32_t, int32_t, ++ z0 = svindex_s32 (16, 1)) ++ ++/* ++** index_s32_m17_x0: ++** mov (w[0-9]+), -17 ++** index z0\.s, \1, w0 ++** ret ++*/ ++TEST_S (index_s32_m17_x0, svint32_t, int32_t, ++ z0 = svindex_s32 (-17, x0)) ++ ++/* ++** index_s32_m16_x0: ++** index z0\.s, #-16, w0 ++** ret ++*/ ++TEST_S (index_s32_m16_x0, svint32_t, int32_t, ++ z0 = svindex_s32 (-16, x0)) ++ ++/* ++** index_s32_m1_x0: ++** index z0\.s, #-1, w0 ++** ret ++*/ ++TEST_S (index_s32_m1_x0, svint32_t, int32_t, ++ z0 = svindex_s32 (-1, x0)) ++ ++/* ++** index_s32_0_x0: ++** index z0\.s, #0, w0 ++** ret ++*/ ++TEST_S (index_s32_0_x0, svint32_t, int32_t, ++ z0 = svindex_s32 (0, x0)) ++ ++/* ++** index_s32_1_x0: ++** index z0\.s, #1, w0 ++** ret ++*/ ++TEST_S (index_s32_1_x0, svint32_t, int32_t, ++ z0 = svindex_s32 (1, x0)) ++ ++/* ++** index_s32_15_x0: ++** index z0\.s, #15, w0 ++** ret ++*/ ++TEST_S (index_s32_15_x0, svint32_t, int32_t, ++ z0 = svindex_s32 (15, x0)) ++ ++/* ++** index_s32_16_x0: ++** mov (w[0-9]+), 16 ++** index z0\.s, \1, w0 ++** ret ++*/ ++TEST_S (index_s32_16_x0, svint32_t, int32_t, ++ z0 = svindex_s32 (16, x0)) ++ ++/* ++** index_s32_x0_m17: ++** mov (w[0-9]+), -17 ++** index z0\.s, w0, \1 ++** ret ++*/ ++TEST_S (index_s32_x0_m17, svint32_t, int32_t, ++ z0 = svindex_s32 (x0, -17)) ++ ++/* ++** index_s32_x0_m16: ++** index z0\.s, w0, #-16 ++** ret ++*/ ++TEST_S (index_s32_x0_m16, svint32_t, int32_t, ++ z0 = svindex_s32 (x0, -16)) ++ ++/* ++** index_s32_x0_1: ++** index z0\.s, w0, #1 ++** ret ++*/ ++TEST_S (index_s32_x0_1, svint32_t, int32_t, ++ z0 = svindex_s32 (x0, 1)) ++ ++/* ++** index_s32_x0_15: ++** index z0\.s, w0, #15 ++** ret ++*/ ++TEST_S (index_s32_x0_15, svint32_t, int32_t, ++ z0 = svindex_s32 (x0, 15)) ++ ++/* ++** index_s32_x0_16: ++** mov (w[0-9]+), 16 ++** index z0\.s, w0, \1 ++** ret ++*/ ++TEST_S (index_s32_x0_16, svint32_t, int32_t, ++ z0 = svindex_s32 (x0, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/index_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/index_s64.c +new file mode 100644 +index 000000000..298eec9ea +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/index_s64.c +@@ -0,0 +1,220 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** index_s64_x0_x1: ++** index z0\.d, x0, x1 ++** ret ++*/ ++TEST_S (index_s64_x0_x1, svint64_t, int64_t, ++ z0 = svindex_s64 (x0, x1)) ++ ++/* ++** index_s64_x0_2: ++** index z0\.d, x0, #2 ++** ret ++*/ ++TEST_S (index_s64_x0_2, svint64_t, int64_t, ++ z0 = svindex_s64 (x0, 2)) ++ ++/* ++** index_s64_50_2: ++** mov (x[0-9]+), 50 ++** index z0\.d, \1, #2 ++** ret ++*/ ++TEST_S (index_s64_50_2, svint64_t, int64_t, ++ z0 = svindex_s64 (50, 2)) ++ ++/* ++** index_s64_0_m17: ++** mov (x[0-9]+), -17 ++** index z0\.d, #0, \1 
++** ret ++*/ ++TEST_S (index_s64_0_m17, svint64_t, int64_t, ++ z0 = svindex_s64 (0, -17)) ++ ++/* ++** index_s64_0_m16: ++** index z0\.d, #0, #-16 ++** ret ++*/ ++TEST_S (index_s64_0_m16, svint64_t, int64_t, ++ z0 = svindex_s64 (0, -16)) ++ ++/* ++** index_s64_0_1: ++** index z0\.d, #0, #1 ++** ret ++*/ ++TEST_S (index_s64_0_1, svint64_t, int64_t, ++ z0 = svindex_s64 (0, 1)) ++ ++/* ++** index_s64_0_15: ++** index z0\.d, #0, #15 ++** ret ++*/ ++TEST_S (index_s64_0_15, svint64_t, int64_t, ++ z0 = svindex_s64 (0, 15)) ++ ++/* ++** index_s64_0_16: ++** mov (x[0-9]+), 16 ++** index z0\.d, #0, \1 ++** ret ++*/ ++TEST_S (index_s64_0_16, svint64_t, int64_t, ++ z0 = svindex_s64 (0, 16)) ++ ++/* ++** index_s64_m17_1: ++** mov (x[0-9]+), -17 ++** index z0\.d, \1, #1 ++** ret ++*/ ++TEST_S (index_s64_m17_1, svint64_t, int64_t, ++ z0 = svindex_s64 (-17, 1)) ++ ++/* ++** index_s64_m16_1: ++** index z0\.d, #-16, #1 ++** ret ++*/ ++TEST_S (index_s64_m16_1, svint64_t, int64_t, ++ z0 = svindex_s64 (-16, 1)) ++ ++/* ++** index_s64_m1_1: ++** index z0\.d, #-1, #1 ++** ret ++*/ ++TEST_S (index_s64_m1_1, svint64_t, int64_t, ++ z0 = svindex_s64 (-1, 1)) ++ ++/* ++** index_s64_1_1: ++** index z0\.d, #1, #1 ++** ret ++*/ ++TEST_S (index_s64_1_1, svint64_t, int64_t, ++ z0 = svindex_s64 (1, 1)) ++ ++/* ++** index_s64_15_1: ++** index z0\.d, #15, #1 ++** ret ++*/ ++TEST_S (index_s64_15_1, svint64_t, int64_t, ++ z0 = svindex_s64 (15, 1)) ++ ++/* ++** index_s64_16_1: ++** mov (x[0-9]+), 16 ++** index z0\.d, \1, #1 ++** ret ++*/ ++TEST_S (index_s64_16_1, svint64_t, int64_t, ++ z0 = svindex_s64 (16, 1)) ++ ++/* ++** index_s64_m17_x0: ++** mov (x[0-9]+), -17 ++** index z0\.d, \1, x0 ++** ret ++*/ ++TEST_S (index_s64_m17_x0, svint64_t, int64_t, ++ z0 = svindex_s64 (-17, x0)) ++ ++/* ++** index_s64_m16_x0: ++** index z0\.d, #-16, x0 ++** ret ++*/ ++TEST_S (index_s64_m16_x0, svint64_t, int64_t, ++ z0 = svindex_s64 (-16, x0)) ++ ++/* ++** index_s64_m1_x0: ++** index z0\.d, #-1, x0 ++** ret ++*/ ++TEST_S (index_s64_m1_x0, svint64_t, int64_t, ++ z0 = svindex_s64 (-1, x0)) ++ ++/* ++** index_s64_0_x0: ++** index z0\.d, #0, x0 ++** ret ++*/ ++TEST_S (index_s64_0_x0, svint64_t, int64_t, ++ z0 = svindex_s64 (0, x0)) ++ ++/* ++** index_s64_1_x0: ++** index z0\.d, #1, x0 ++** ret ++*/ ++TEST_S (index_s64_1_x0, svint64_t, int64_t, ++ z0 = svindex_s64 (1, x0)) ++ ++/* ++** index_s64_15_x0: ++** index z0\.d, #15, x0 ++** ret ++*/ ++TEST_S (index_s64_15_x0, svint64_t, int64_t, ++ z0 = svindex_s64 (15, x0)) ++ ++/* ++** index_s64_16_x0: ++** mov (x[0-9]+), 16 ++** index z0\.d, \1, x0 ++** ret ++*/ ++TEST_S (index_s64_16_x0, svint64_t, int64_t, ++ z0 = svindex_s64 (16, x0)) ++ ++/* ++** index_s64_x0_m17: ++** mov (x[0-9]+), -17 ++** index z0\.d, x0, \1 ++** ret ++*/ ++TEST_S (index_s64_x0_m17, svint64_t, int64_t, ++ z0 = svindex_s64 (x0, -17)) ++ ++/* ++** index_s64_x0_m16: ++** index z0\.d, x0, #-16 ++** ret ++*/ ++TEST_S (index_s64_x0_m16, svint64_t, int64_t, ++ z0 = svindex_s64 (x0, -16)) ++ ++/* ++** index_s64_x0_1: ++** index z0\.d, x0, #1 ++** ret ++*/ ++TEST_S (index_s64_x0_1, svint64_t, int64_t, ++ z0 = svindex_s64 (x0, 1)) ++ ++/* ++** index_s64_x0_15: ++** index z0\.d, x0, #15 ++** ret ++*/ ++TEST_S (index_s64_x0_15, svint64_t, int64_t, ++ z0 = svindex_s64 (x0, 15)) ++ ++/* ++** index_s64_x0_16: ++** mov (x[0-9]+), 16 ++** index z0\.d, x0, \1 ++** ret ++*/ ++TEST_S (index_s64_x0_16, svint64_t, int64_t, ++ z0 = svindex_s64 (x0, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/index_s8.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/index_s8.c +new file mode 100644 +index 000000000..8a1f14f50 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/index_s8.c +@@ -0,0 +1,220 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** index_s8_w0_w1: ++** index z0\.b, w0, w1 ++** ret ++*/ ++TEST_S (index_s8_w0_w1, svint8_t, int8_t, ++ z0 = svindex_s8 (x0, x1)) ++ ++/* ++** index_s8_w0_2: ++** index z0\.b, w0, #2 ++** ret ++*/ ++TEST_S (index_s8_w0_2, svint8_t, int8_t, ++ z0 = svindex_s8 (x0, 2)) ++ ++/* ++** index_s8_50_2: ++** mov (w[0-9]+), 50 ++** index z0\.b, \1, #2 ++** ret ++*/ ++TEST_S (index_s8_50_2, svint8_t, int8_t, ++ z0 = svindex_s8 (50, 2)) ++ ++/* ++** index_s8_0_m17: ++** mov (w[0-9]+), -17 ++** index z0\.b, #0, \1 ++** ret ++*/ ++TEST_S (index_s8_0_m17, svint8_t, int8_t, ++ z0 = svindex_s8 (0, -17)) ++ ++/* ++** index_s8_0_m16: ++** index z0\.b, #0, #-16 ++** ret ++*/ ++TEST_S (index_s8_0_m16, svint8_t, int8_t, ++ z0 = svindex_s8 (0, -16)) ++ ++/* ++** index_s8_0_1: ++** index z0\.b, #0, #1 ++** ret ++*/ ++TEST_S (index_s8_0_1, svint8_t, int8_t, ++ z0 = svindex_s8 (0, 1)) ++ ++/* ++** index_s8_0_15: ++** index z0\.b, #0, #15 ++** ret ++*/ ++TEST_S (index_s8_0_15, svint8_t, int8_t, ++ z0 = svindex_s8 (0, 15)) ++ ++/* ++** index_s8_0_16: ++** mov (w[0-9]+), 16 ++** index z0\.b, #0, \1 ++** ret ++*/ ++TEST_S (index_s8_0_16, svint8_t, int8_t, ++ z0 = svindex_s8 (0, 16)) ++ ++/* ++** index_s8_m17_1: ++** mov (w[0-9]+), -17 ++** index z0\.b, \1, #1 ++** ret ++*/ ++TEST_S (index_s8_m17_1, svint8_t, int8_t, ++ z0 = svindex_s8 (-17, 1)) ++ ++/* ++** index_s8_m16_1: ++** index z0\.b, #-16, #1 ++** ret ++*/ ++TEST_S (index_s8_m16_1, svint8_t, int8_t, ++ z0 = svindex_s8 (-16, 1)) ++ ++/* ++** index_s8_m1_1: ++** index z0\.b, #-1, #1 ++** ret ++*/ ++TEST_S (index_s8_m1_1, svint8_t, int8_t, ++ z0 = svindex_s8 (-1, 1)) ++ ++/* ++** index_s8_1_1: ++** index z0\.b, #1, #1 ++** ret ++*/ ++TEST_S (index_s8_1_1, svint8_t, int8_t, ++ z0 = svindex_s8 (1, 1)) ++ ++/* ++** index_s8_15_1: ++** index z0\.b, #15, #1 ++** ret ++*/ ++TEST_S (index_s8_15_1, svint8_t, int8_t, ++ z0 = svindex_s8 (15, 1)) ++ ++/* ++** index_s8_16_1: ++** mov (w[0-9]+), 16 ++** index z0\.b, \1, #1 ++** ret ++*/ ++TEST_S (index_s8_16_1, svint8_t, int8_t, ++ z0 = svindex_s8 (16, 1)) ++ ++/* ++** index_s8_m17_x0: ++** mov (w[0-9]+), -17 ++** index z0\.b, \1, w0 ++** ret ++*/ ++TEST_S (index_s8_m17_x0, svint8_t, int8_t, ++ z0 = svindex_s8 (-17, x0)) ++ ++/* ++** index_s8_m16_x0: ++** index z0\.b, #-16, w0 ++** ret ++*/ ++TEST_S (index_s8_m16_x0, svint8_t, int8_t, ++ z0 = svindex_s8 (-16, x0)) ++ ++/* ++** index_s8_m1_x0: ++** index z0\.b, #-1, w0 ++** ret ++*/ ++TEST_S (index_s8_m1_x0, svint8_t, int8_t, ++ z0 = svindex_s8 (-1, x0)) ++ ++/* ++** index_s8_0_x0: ++** index z0\.b, #0, w0 ++** ret ++*/ ++TEST_S (index_s8_0_x0, svint8_t, int8_t, ++ z0 = svindex_s8 (0, x0)) ++ ++/* ++** index_s8_1_x0: ++** index z0\.b, #1, w0 ++** ret ++*/ ++TEST_S (index_s8_1_x0, svint8_t, int8_t, ++ z0 = svindex_s8 (1, x0)) ++ ++/* ++** index_s8_15_x0: ++** index z0\.b, #15, w0 ++** ret ++*/ ++TEST_S (index_s8_15_x0, svint8_t, int8_t, ++ z0 = svindex_s8 (15, x0)) ++ ++/* ++** index_s8_16_x0: ++** mov (w[0-9]+), 16 ++** index z0\.b, \1, w0 ++** ret ++*/ ++TEST_S (index_s8_16_x0, svint8_t, int8_t, ++ z0 = svindex_s8 (16, x0)) ++ ++/* ++** index_s8_x0_m17: ++** mov (w[0-9]+), -17 ++** index z0\.b, w0, \1 ++** ret ++*/ ++TEST_S (index_s8_x0_m17, svint8_t, int8_t, ++ z0 
= svindex_s8 (x0, -17)) ++ ++/* ++** index_s8_x0_m16: ++** index z0\.b, w0, #-16 ++** ret ++*/ ++TEST_S (index_s8_x0_m16, svint8_t, int8_t, ++ z0 = svindex_s8 (x0, -16)) ++ ++/* ++** index_s8_x0_1: ++** index z0\.b, w0, #1 ++** ret ++*/ ++TEST_S (index_s8_x0_1, svint8_t, int8_t, ++ z0 = svindex_s8 (x0, 1)) ++ ++/* ++** index_s8_x0_15: ++** index z0\.b, w0, #15 ++** ret ++*/ ++TEST_S (index_s8_x0_15, svint8_t, int8_t, ++ z0 = svindex_s8 (x0, 15)) ++ ++/* ++** index_s8_x0_16: ++** mov (w[0-9]+), 16 ++** index z0\.b, w0, \1 ++** ret ++*/ ++TEST_S (index_s8_x0_16, svint8_t, int8_t, ++ z0 = svindex_s8 (x0, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/index_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/index_u16.c +new file mode 100644 +index 000000000..1c6631088 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/index_u16.c +@@ -0,0 +1,220 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** index_u16_w0_w1: ++** index z0\.h, w0, w1 ++** ret ++*/ ++TEST_S (index_u16_w0_w1, svuint16_t, uint16_t, ++ z0 = svindex_u16 (x0, x1)) ++ ++/* ++** index_u16_w0_2: ++** index z0\.h, w0, #2 ++** ret ++*/ ++TEST_S (index_u16_w0_2, svuint16_t, uint16_t, ++ z0 = svindex_u16 (x0, 2)) ++ ++/* ++** index_u16_50_2: ++** mov (w[0-9]+), 50 ++** index z0\.h, \1, #2 ++** ret ++*/ ++TEST_S (index_u16_50_2, svuint16_t, uint16_t, ++ z0 = svindex_u16 (50, 2)) ++ ++/* ++** index_u16_0_m17: ++** mov (w[0-9]+), -17 ++** index z0\.h, #0, \1 ++** ret ++*/ ++TEST_S (index_u16_0_m17, svuint16_t, uint16_t, ++ z0 = svindex_u16 (0, -17)) ++ ++/* ++** index_u16_0_m16: ++** index z0\.h, #0, #-16 ++** ret ++*/ ++TEST_S (index_u16_0_m16, svuint16_t, uint16_t, ++ z0 = svindex_u16 (0, -16)) ++ ++/* ++** index_u16_0_1: ++** index z0\.h, #0, #1 ++** ret ++*/ ++TEST_S (index_u16_0_1, svuint16_t, uint16_t, ++ z0 = svindex_u16 (0, 1)) ++ ++/* ++** index_u16_0_15: ++** index z0\.h, #0, #15 ++** ret ++*/ ++TEST_S (index_u16_0_15, svuint16_t, uint16_t, ++ z0 = svindex_u16 (0, 15)) ++ ++/* ++** index_u16_0_16: ++** mov (w[0-9]+), 16 ++** index z0\.h, #0, \1 ++** ret ++*/ ++TEST_S (index_u16_0_16, svuint16_t, uint16_t, ++ z0 = svindex_u16 (0, 16)) ++ ++/* ++** index_u16_m17_1: ++** mov (w[0-9]+), -17 ++** index z0\.h, \1, #1 ++** ret ++*/ ++TEST_S (index_u16_m17_1, svuint16_t, uint16_t, ++ z0 = svindex_u16 (-17, 1)) ++ ++/* ++** index_u16_m16_1: ++** index z0\.h, #-16, #1 ++** ret ++*/ ++TEST_S (index_u16_m16_1, svuint16_t, uint16_t, ++ z0 = svindex_u16 (-16, 1)) ++ ++/* ++** index_u16_m1_1: ++** index z0\.h, #-1, #1 ++** ret ++*/ ++TEST_S (index_u16_m1_1, svuint16_t, uint16_t, ++ z0 = svindex_u16 (-1, 1)) ++ ++/* ++** index_u16_1_1: ++** index z0\.h, #1, #1 ++** ret ++*/ ++TEST_S (index_u16_1_1, svuint16_t, uint16_t, ++ z0 = svindex_u16 (1, 1)) ++ ++/* ++** index_u16_15_1: ++** index z0\.h, #15, #1 ++** ret ++*/ ++TEST_S (index_u16_15_1, svuint16_t, uint16_t, ++ z0 = svindex_u16 (15, 1)) ++ ++/* ++** index_u16_16_1: ++** mov (w[0-9]+), 16 ++** index z0\.h, \1, #1 ++** ret ++*/ ++TEST_S (index_u16_16_1, svuint16_t, uint16_t, ++ z0 = svindex_u16 (16, 1)) ++ ++/* ++** index_u16_m17_x0: ++** mov (w[0-9]+), -17 ++** index z0\.h, \1, w0 ++** ret ++*/ ++TEST_S (index_u16_m17_x0, svuint16_t, uint16_t, ++ z0 = svindex_u16 (-17, x0)) ++ ++/* ++** index_u16_m16_x0: ++** index z0\.h, #-16, w0 ++** ret ++*/ ++TEST_S (index_u16_m16_x0, svuint16_t, uint16_t, ++ z0 = svindex_u16 (-16, x0)) ++ ++/* ++** index_u16_m1_x0: ++** index z0\.h, #-1, w0 ++** ret 
++*/ ++TEST_S (index_u16_m1_x0, svuint16_t, uint16_t, ++ z0 = svindex_u16 (-1, x0)) ++ ++/* ++** index_u16_0_x0: ++** index z0\.h, #0, w0 ++** ret ++*/ ++TEST_S (index_u16_0_x0, svuint16_t, uint16_t, ++ z0 = svindex_u16 (0, x0)) ++ ++/* ++** index_u16_1_x0: ++** index z0\.h, #1, w0 ++** ret ++*/ ++TEST_S (index_u16_1_x0, svuint16_t, uint16_t, ++ z0 = svindex_u16 (1, x0)) ++ ++/* ++** index_u16_15_x0: ++** index z0\.h, #15, w0 ++** ret ++*/ ++TEST_S (index_u16_15_x0, svuint16_t, uint16_t, ++ z0 = svindex_u16 (15, x0)) ++ ++/* ++** index_u16_16_x0: ++** mov (w[0-9]+), 16 ++** index z0\.h, \1, w0 ++** ret ++*/ ++TEST_S (index_u16_16_x0, svuint16_t, uint16_t, ++ z0 = svindex_u16 (16, x0)) ++ ++/* ++** index_u16_x0_m17: ++** mov (w[0-9]+), -17 ++** index z0\.h, w0, \1 ++** ret ++*/ ++TEST_S (index_u16_x0_m17, svuint16_t, uint16_t, ++ z0 = svindex_u16 (x0, -17)) ++ ++/* ++** index_u16_x0_m16: ++** index z0\.h, w0, #-16 ++** ret ++*/ ++TEST_S (index_u16_x0_m16, svuint16_t, uint16_t, ++ z0 = svindex_u16 (x0, -16)) ++ ++/* ++** index_u16_x0_1: ++** index z0\.h, w0, #1 ++** ret ++*/ ++TEST_S (index_u16_x0_1, svuint16_t, uint16_t, ++ z0 = svindex_u16 (x0, 1)) ++ ++/* ++** index_u16_x0_15: ++** index z0\.h, w0, #15 ++** ret ++*/ ++TEST_S (index_u16_x0_15, svuint16_t, uint16_t, ++ z0 = svindex_u16 (x0, 15)) ++ ++/* ++** index_u16_x0_16: ++** mov (w[0-9]+), 16 ++** index z0\.h, w0, \1 ++** ret ++*/ ++TEST_S (index_u16_x0_16, svuint16_t, uint16_t, ++ z0 = svindex_u16 (x0, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/index_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/index_u32.c +new file mode 100644 +index 000000000..c2badb05e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/index_u32.c +@@ -0,0 +1,220 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** index_u32_w0_w1: ++** index z0\.s, w0, w1 ++** ret ++*/ ++TEST_S (index_u32_w0_w1, svuint32_t, uint32_t, ++ z0 = svindex_u32 (x0, x1)) ++ ++/* ++** index_u32_w0_2: ++** index z0\.s, w0, #2 ++** ret ++*/ ++TEST_S (index_u32_w0_2, svuint32_t, uint32_t, ++ z0 = svindex_u32 (x0, 2)) ++ ++/* ++** index_u32_50_2: ++** mov (w[0-9]+), 50 ++** index z0\.s, \1, #2 ++** ret ++*/ ++TEST_S (index_u32_50_2, svuint32_t, uint32_t, ++ z0 = svindex_u32 (50, 2)) ++ ++/* ++** index_u32_0_m17: ++** mov (w[0-9]+), -17 ++** index z0\.s, #0, \1 ++** ret ++*/ ++TEST_S (index_u32_0_m17, svuint32_t, uint32_t, ++ z0 = svindex_u32 (0, -17)) ++ ++/* ++** index_u32_0_m16: ++** index z0\.s, #0, #-16 ++** ret ++*/ ++TEST_S (index_u32_0_m16, svuint32_t, uint32_t, ++ z0 = svindex_u32 (0, -16)) ++ ++/* ++** index_u32_0_1: ++** index z0\.s, #0, #1 ++** ret ++*/ ++TEST_S (index_u32_0_1, svuint32_t, uint32_t, ++ z0 = svindex_u32 (0, 1)) ++ ++/* ++** index_u32_0_15: ++** index z0\.s, #0, #15 ++** ret ++*/ ++TEST_S (index_u32_0_15, svuint32_t, uint32_t, ++ z0 = svindex_u32 (0, 15)) ++ ++/* ++** index_u32_0_16: ++** mov (w[0-9]+), 16 ++** index z0\.s, #0, \1 ++** ret ++*/ ++TEST_S (index_u32_0_16, svuint32_t, uint32_t, ++ z0 = svindex_u32 (0, 16)) ++ ++/* ++** index_u32_m17_1: ++** mov (w[0-9]+), -17 ++** index z0\.s, \1, #1 ++** ret ++*/ ++TEST_S (index_u32_m17_1, svuint32_t, uint32_t, ++ z0 = svindex_u32 (-17, 1)) ++ ++/* ++** index_u32_m16_1: ++** index z0\.s, #-16, #1 ++** ret ++*/ ++TEST_S (index_u32_m16_1, svuint32_t, uint32_t, ++ z0 = svindex_u32 (-16, 1)) ++ ++/* ++** index_u32_m1_1: ++** index z0\.s, #-1, #1 ++** ret ++*/ ++TEST_S (index_u32_m1_1, svuint32_t, uint32_t, ++ z0 = 
svindex_u32 (-1, 1)) ++ ++/* ++** index_u32_1_1: ++** index z0\.s, #1, #1 ++** ret ++*/ ++TEST_S (index_u32_1_1, svuint32_t, uint32_t, ++ z0 = svindex_u32 (1, 1)) ++ ++/* ++** index_u32_15_1: ++** index z0\.s, #15, #1 ++** ret ++*/ ++TEST_S (index_u32_15_1, svuint32_t, uint32_t, ++ z0 = svindex_u32 (15, 1)) ++ ++/* ++** index_u32_16_1: ++** mov (w[0-9]+), 16 ++** index z0\.s, \1, #1 ++** ret ++*/ ++TEST_S (index_u32_16_1, svuint32_t, uint32_t, ++ z0 = svindex_u32 (16, 1)) ++ ++/* ++** index_u32_m17_x0: ++** mov (w[0-9]+), -17 ++** index z0\.s, \1, w0 ++** ret ++*/ ++TEST_S (index_u32_m17_x0, svuint32_t, uint32_t, ++ z0 = svindex_u32 (-17, x0)) ++ ++/* ++** index_u32_m16_x0: ++** index z0\.s, #-16, w0 ++** ret ++*/ ++TEST_S (index_u32_m16_x0, svuint32_t, uint32_t, ++ z0 = svindex_u32 (-16, x0)) ++ ++/* ++** index_u32_m1_x0: ++** index z0\.s, #-1, w0 ++** ret ++*/ ++TEST_S (index_u32_m1_x0, svuint32_t, uint32_t, ++ z0 = svindex_u32 (-1, x0)) ++ ++/* ++** index_u32_0_x0: ++** index z0\.s, #0, w0 ++** ret ++*/ ++TEST_S (index_u32_0_x0, svuint32_t, uint32_t, ++ z0 = svindex_u32 (0, x0)) ++ ++/* ++** index_u32_1_x0: ++** index z0\.s, #1, w0 ++** ret ++*/ ++TEST_S (index_u32_1_x0, svuint32_t, uint32_t, ++ z0 = svindex_u32 (1, x0)) ++ ++/* ++** index_u32_15_x0: ++** index z0\.s, #15, w0 ++** ret ++*/ ++TEST_S (index_u32_15_x0, svuint32_t, uint32_t, ++ z0 = svindex_u32 (15, x0)) ++ ++/* ++** index_u32_16_x0: ++** mov (w[0-9]+), 16 ++** index z0\.s, \1, w0 ++** ret ++*/ ++TEST_S (index_u32_16_x0, svuint32_t, uint32_t, ++ z0 = svindex_u32 (16, x0)) ++ ++/* ++** index_u32_x0_m17: ++** mov (w[0-9]+), -17 ++** index z0\.s, w0, \1 ++** ret ++*/ ++TEST_S (index_u32_x0_m17, svuint32_t, uint32_t, ++ z0 = svindex_u32 (x0, -17)) ++ ++/* ++** index_u32_x0_m16: ++** index z0\.s, w0, #-16 ++** ret ++*/ ++TEST_S (index_u32_x0_m16, svuint32_t, uint32_t, ++ z0 = svindex_u32 (x0, -16)) ++ ++/* ++** index_u32_x0_1: ++** index z0\.s, w0, #1 ++** ret ++*/ ++TEST_S (index_u32_x0_1, svuint32_t, uint32_t, ++ z0 = svindex_u32 (x0, 1)) ++ ++/* ++** index_u32_x0_15: ++** index z0\.s, w0, #15 ++** ret ++*/ ++TEST_S (index_u32_x0_15, svuint32_t, uint32_t, ++ z0 = svindex_u32 (x0, 15)) ++ ++/* ++** index_u32_x0_16: ++** mov (w[0-9]+), 16 ++** index z0\.s, w0, \1 ++** ret ++*/ ++TEST_S (index_u32_x0_16, svuint32_t, uint32_t, ++ z0 = svindex_u32 (x0, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/index_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/index_u64.c +new file mode 100644 +index 000000000..526c5e80a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/index_u64.c +@@ -0,0 +1,220 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** index_u64_x0_x1: ++** index z0\.d, x0, x1 ++** ret ++*/ ++TEST_S (index_u64_x0_x1, svuint64_t, uint64_t, ++ z0 = svindex_u64 (x0, x1)) ++ ++/* ++** index_u64_x0_2: ++** index z0\.d, x0, #2 ++** ret ++*/ ++TEST_S (index_u64_x0_2, svuint64_t, uint64_t, ++ z0 = svindex_u64 (x0, 2)) ++ ++/* ++** index_u64_50_2: ++** mov (x[0-9]+), 50 ++** index z0\.d, \1, #2 ++** ret ++*/ ++TEST_S (index_u64_50_2, svuint64_t, uint64_t, ++ z0 = svindex_u64 (50, 2)) ++ ++/* ++** index_u64_0_m17: ++** mov (x[0-9]+), -17 ++** index z0\.d, #0, \1 ++** ret ++*/ ++TEST_S (index_u64_0_m17, svuint64_t, uint64_t, ++ z0 = svindex_u64 (0, -17)) ++ ++/* ++** index_u64_0_m16: ++** index z0\.d, #0, #-16 ++** ret ++*/ ++TEST_S (index_u64_0_m16, svuint64_t, uint64_t, ++ z0 = svindex_u64 (0, -16)) ++ ++/* ++** index_u64_0_1: ++** 
index z0\.d, #0, #1 ++** ret ++*/ ++TEST_S (index_u64_0_1, svuint64_t, uint64_t, ++ z0 = svindex_u64 (0, 1)) ++ ++/* ++** index_u64_0_15: ++** index z0\.d, #0, #15 ++** ret ++*/ ++TEST_S (index_u64_0_15, svuint64_t, uint64_t, ++ z0 = svindex_u64 (0, 15)) ++ ++/* ++** index_u64_0_16: ++** mov (x[0-9]+), 16 ++** index z0\.d, #0, \1 ++** ret ++*/ ++TEST_S (index_u64_0_16, svuint64_t, uint64_t, ++ z0 = svindex_u64 (0, 16)) ++ ++/* ++** index_u64_m17_1: ++** mov (x[0-9]+), -17 ++** index z0\.d, \1, #1 ++** ret ++*/ ++TEST_S (index_u64_m17_1, svuint64_t, uint64_t, ++ z0 = svindex_u64 (-17, 1)) ++ ++/* ++** index_u64_m16_1: ++** index z0\.d, #-16, #1 ++** ret ++*/ ++TEST_S (index_u64_m16_1, svuint64_t, uint64_t, ++ z0 = svindex_u64 (-16, 1)) ++ ++/* ++** index_u64_m1_1: ++** index z0\.d, #-1, #1 ++** ret ++*/ ++TEST_S (index_u64_m1_1, svuint64_t, uint64_t, ++ z0 = svindex_u64 (-1, 1)) ++ ++/* ++** index_u64_1_1: ++** index z0\.d, #1, #1 ++** ret ++*/ ++TEST_S (index_u64_1_1, svuint64_t, uint64_t, ++ z0 = svindex_u64 (1, 1)) ++ ++/* ++** index_u64_15_1: ++** index z0\.d, #15, #1 ++** ret ++*/ ++TEST_S (index_u64_15_1, svuint64_t, uint64_t, ++ z0 = svindex_u64 (15, 1)) ++ ++/* ++** index_u64_16_1: ++** mov (x[0-9]+), 16 ++** index z0\.d, \1, #1 ++** ret ++*/ ++TEST_S (index_u64_16_1, svuint64_t, uint64_t, ++ z0 = svindex_u64 (16, 1)) ++ ++/* ++** index_u64_m17_x0: ++** mov (x[0-9]+), -17 ++** index z0\.d, \1, x0 ++** ret ++*/ ++TEST_S (index_u64_m17_x0, svuint64_t, uint64_t, ++ z0 = svindex_u64 (-17, x0)) ++ ++/* ++** index_u64_m16_x0: ++** index z0\.d, #-16, x0 ++** ret ++*/ ++TEST_S (index_u64_m16_x0, svuint64_t, uint64_t, ++ z0 = svindex_u64 (-16, x0)) ++ ++/* ++** index_u64_m1_x0: ++** index z0\.d, #-1, x0 ++** ret ++*/ ++TEST_S (index_u64_m1_x0, svuint64_t, uint64_t, ++ z0 = svindex_u64 (-1, x0)) ++ ++/* ++** index_u64_0_x0: ++** index z0\.d, #0, x0 ++** ret ++*/ ++TEST_S (index_u64_0_x0, svuint64_t, uint64_t, ++ z0 = svindex_u64 (0, x0)) ++ ++/* ++** index_u64_1_x0: ++** index z0\.d, #1, x0 ++** ret ++*/ ++TEST_S (index_u64_1_x0, svuint64_t, uint64_t, ++ z0 = svindex_u64 (1, x0)) ++ ++/* ++** index_u64_15_x0: ++** index z0\.d, #15, x0 ++** ret ++*/ ++TEST_S (index_u64_15_x0, svuint64_t, uint64_t, ++ z0 = svindex_u64 (15, x0)) ++ ++/* ++** index_u64_16_x0: ++** mov (x[0-9]+), 16 ++** index z0\.d, \1, x0 ++** ret ++*/ ++TEST_S (index_u64_16_x0, svuint64_t, uint64_t, ++ z0 = svindex_u64 (16, x0)) ++ ++/* ++** index_u64_x0_m17: ++** mov (x[0-9]+), -17 ++** index z0\.d, x0, \1 ++** ret ++*/ ++TEST_S (index_u64_x0_m17, svuint64_t, uint64_t, ++ z0 = svindex_u64 (x0, -17)) ++ ++/* ++** index_u64_x0_m16: ++** index z0\.d, x0, #-16 ++** ret ++*/ ++TEST_S (index_u64_x0_m16, svuint64_t, uint64_t, ++ z0 = svindex_u64 (x0, -16)) ++ ++/* ++** index_u64_x0_1: ++** index z0\.d, x0, #1 ++** ret ++*/ ++TEST_S (index_u64_x0_1, svuint64_t, uint64_t, ++ z0 = svindex_u64 (x0, 1)) ++ ++/* ++** index_u64_x0_15: ++** index z0\.d, x0, #15 ++** ret ++*/ ++TEST_S (index_u64_x0_15, svuint64_t, uint64_t, ++ z0 = svindex_u64 (x0, 15)) ++ ++/* ++** index_u64_x0_16: ++** mov (x[0-9]+), 16 ++** index z0\.d, x0, \1 ++** ret ++*/ ++TEST_S (index_u64_x0_16, svuint64_t, uint64_t, ++ z0 = svindex_u64 (x0, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/index_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/index_u8.c +new file mode 100644 +index 000000000..c6ce12ec8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/index_u8.c +@@ -0,0 +1,220 @@ ++/* { dg-final { check-function-bodies "**" "" 
"-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** index_u8_w0_w1: ++** index z0\.b, w0, w1 ++** ret ++*/ ++TEST_S (index_u8_w0_w1, svuint8_t, uint8_t, ++ z0 = svindex_u8 (x0, x1)) ++ ++/* ++** index_u8_w0_2: ++** index z0\.b, w0, #2 ++** ret ++*/ ++TEST_S (index_u8_w0_2, svuint8_t, uint8_t, ++ z0 = svindex_u8 (x0, 2)) ++ ++/* ++** index_u8_50_2: ++** mov (w[0-9]+), 50 ++** index z0\.b, \1, #2 ++** ret ++*/ ++TEST_S (index_u8_50_2, svuint8_t, uint8_t, ++ z0 = svindex_u8 (50, 2)) ++ ++/* ++** index_u8_0_m17: ++** mov (w[0-9]+), -17 ++** index z0\.b, #0, \1 ++** ret ++*/ ++TEST_S (index_u8_0_m17, svuint8_t, uint8_t, ++ z0 = svindex_u8 (0, -17)) ++ ++/* ++** index_u8_0_m16: ++** index z0\.b, #0, #-16 ++** ret ++*/ ++TEST_S (index_u8_0_m16, svuint8_t, uint8_t, ++ z0 = svindex_u8 (0, -16)) ++ ++/* ++** index_u8_0_1: ++** index z0\.b, #0, #1 ++** ret ++*/ ++TEST_S (index_u8_0_1, svuint8_t, uint8_t, ++ z0 = svindex_u8 (0, 1)) ++ ++/* ++** index_u8_0_15: ++** index z0\.b, #0, #15 ++** ret ++*/ ++TEST_S (index_u8_0_15, svuint8_t, uint8_t, ++ z0 = svindex_u8 (0, 15)) ++ ++/* ++** index_u8_0_16: ++** mov (w[0-9]+), 16 ++** index z0\.b, #0, \1 ++** ret ++*/ ++TEST_S (index_u8_0_16, svuint8_t, uint8_t, ++ z0 = svindex_u8 (0, 16)) ++ ++/* ++** index_u8_m17_1: ++** mov (w[0-9]+), -17 ++** index z0\.b, \1, #1 ++** ret ++*/ ++TEST_S (index_u8_m17_1, svuint8_t, uint8_t, ++ z0 = svindex_u8 (-17, 1)) ++ ++/* ++** index_u8_m16_1: ++** index z0\.b, #-16, #1 ++** ret ++*/ ++TEST_S (index_u8_m16_1, svuint8_t, uint8_t, ++ z0 = svindex_u8 (-16, 1)) ++ ++/* ++** index_u8_m1_1: ++** index z0\.b, #-1, #1 ++** ret ++*/ ++TEST_S (index_u8_m1_1, svuint8_t, uint8_t, ++ z0 = svindex_u8 (-1, 1)) ++ ++/* ++** index_u8_1_1: ++** index z0\.b, #1, #1 ++** ret ++*/ ++TEST_S (index_u8_1_1, svuint8_t, uint8_t, ++ z0 = svindex_u8 (1, 1)) ++ ++/* ++** index_u8_15_1: ++** index z0\.b, #15, #1 ++** ret ++*/ ++TEST_S (index_u8_15_1, svuint8_t, uint8_t, ++ z0 = svindex_u8 (15, 1)) ++ ++/* ++** index_u8_16_1: ++** mov (w[0-9]+), 16 ++** index z0\.b, \1, #1 ++** ret ++*/ ++TEST_S (index_u8_16_1, svuint8_t, uint8_t, ++ z0 = svindex_u8 (16, 1)) ++ ++/* ++** index_u8_m17_x0: ++** mov (w[0-9]+), -17 ++** index z0\.b, \1, w0 ++** ret ++*/ ++TEST_S (index_u8_m17_x0, svuint8_t, uint8_t, ++ z0 = svindex_u8 (-17, x0)) ++ ++/* ++** index_u8_m16_x0: ++** index z0\.b, #-16, w0 ++** ret ++*/ ++TEST_S (index_u8_m16_x0, svuint8_t, uint8_t, ++ z0 = svindex_u8 (-16, x0)) ++ ++/* ++** index_u8_m1_x0: ++** index z0\.b, #-1, w0 ++** ret ++*/ ++TEST_S (index_u8_m1_x0, svuint8_t, uint8_t, ++ z0 = svindex_u8 (-1, x0)) ++ ++/* ++** index_u8_0_x0: ++** index z0\.b, #0, w0 ++** ret ++*/ ++TEST_S (index_u8_0_x0, svuint8_t, uint8_t, ++ z0 = svindex_u8 (0, x0)) ++ ++/* ++** index_u8_1_x0: ++** index z0\.b, #1, w0 ++** ret ++*/ ++TEST_S (index_u8_1_x0, svuint8_t, uint8_t, ++ z0 = svindex_u8 (1, x0)) ++ ++/* ++** index_u8_15_x0: ++** index z0\.b, #15, w0 ++** ret ++*/ ++TEST_S (index_u8_15_x0, svuint8_t, uint8_t, ++ z0 = svindex_u8 (15, x0)) ++ ++/* ++** index_u8_16_x0: ++** mov (w[0-9]+), 16 ++** index z0\.b, \1, w0 ++** ret ++*/ ++TEST_S (index_u8_16_x0, svuint8_t, uint8_t, ++ z0 = svindex_u8 (16, x0)) ++ ++/* ++** index_u8_x0_m17: ++** mov (w[0-9]+), -17 ++** index z0\.b, w0, \1 ++** ret ++*/ ++TEST_S (index_u8_x0_m17, svuint8_t, uint8_t, ++ z0 = svindex_u8 (x0, -17)) ++ ++/* ++** index_u8_x0_m16: ++** index z0\.b, w0, #-16 ++** ret ++*/ ++TEST_S (index_u8_x0_m16, svuint8_t, uint8_t, ++ z0 = svindex_u8 (x0, -16)) ++ ++/* ++** index_u8_x0_1: ++** index 
z0\.b, w0, #1 ++** ret ++*/ ++TEST_S (index_u8_x0_1, svuint8_t, uint8_t, ++ z0 = svindex_u8 (x0, 1)) ++ ++/* ++** index_u8_x0_15: ++** index z0\.b, w0, #15 ++** ret ++*/ ++TEST_S (index_u8_x0_15, svuint8_t, uint8_t, ++ z0 = svindex_u8 (x0, 15)) ++ ++/* ++** index_u8_x0_16: ++** mov (w[0-9]+), 16 ++** index z0\.b, w0, \1 ++** ret ++*/ ++TEST_S (index_u8_x0_16, svuint8_t, uint8_t, ++ z0 = svindex_u8 (x0, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_bf16.c +new file mode 100644 +index 000000000..55afdba62 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_bf16.c +@@ -0,0 +1,22 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** insr_h4_bf16_tied1: ++** insr z0\.h, h4 ++** ret ++*/ ++TEST_UNIFORM_ZD (insr_h4_bf16_tied1, svbfloat16_t, bfloat16_t, ++ z0 = svinsr_n_bf16 (z0, d4), ++ z0 = svinsr (z0, d4)) ++ ++/* ++** insr_h4_bf16_untied: ++** movprfx z0, z1 ++** insr z0\.h, h4 ++** ret ++*/ ++TEST_UNIFORM_ZD (insr_h4_bf16_untied, svbfloat16_t, bfloat16_t, ++ z0 = svinsr_n_bf16 (z1, d4), ++ z0 = svinsr (z1, d4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_f16.c +new file mode 100644 +index 000000000..f01a36189 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_f16.c +@@ -0,0 +1,51 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** insr_h4_f16_tied1: ++** insr z0\.h, h4 ++** ret ++*/ ++TEST_UNIFORM_ZD (insr_h4_f16_tied1, svfloat16_t, __fp16, ++ z0 = svinsr_n_f16 (z0, d4), ++ z0 = svinsr (z0, d4)) ++ ++/* ++** insr_h4_f16_untied: ++** movprfx z0, z1 ++** insr z0\.h, h4 ++** ret ++*/ ++TEST_UNIFORM_ZD (insr_h4_f16_untied, svfloat16_t, __fp16, ++ z0 = svinsr_n_f16 (z1, d4), ++ z0 = svinsr (z1, d4)) ++ ++/* ++** insr_0_f16_tied1: ++** insr z0\.h, wzr ++** ret ++*/ ++TEST_UNIFORM_Z (insr_0_f16_tied1, svfloat16_t, ++ z0 = svinsr_n_f16 (z0, 0), ++ z0 = svinsr (z0, 0)) ++ ++/* ++** insr_0_f16_untied: ++** movprfx z0, z1 ++** insr z0\.h, wzr ++** ret ++*/ ++TEST_UNIFORM_Z (insr_0_f16_untied, svfloat16_t, ++ z0 = svinsr_n_f16 (z1, 0), ++ z0 = svinsr (z1, 0)) ++ ++/* ++** insr_1_f16: ++** fmov (h[0-9]+), #?1\.0(?:e\+0)? 
++** insr z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (insr_1_f16, svfloat16_t, ++ z0 = svinsr_n_f16 (z0, 1), ++ z0 = svinsr (z0, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_f32.c +new file mode 100644 +index 000000000..e339727b1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_f32.c +@@ -0,0 +1,51 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** insr_s4_f32_tied1: ++** insr z0\.s, s4 ++** ret ++*/ ++TEST_UNIFORM_ZD (insr_s4_f32_tied1, svfloat32_t, float, ++ z0 = svinsr_n_f32 (z0, d4), ++ z0 = svinsr (z0, d4)) ++ ++/* ++** insr_s4_f32_untied: ++** movprfx z0, z1 ++** insr z0\.s, s4 ++** ret ++*/ ++TEST_UNIFORM_ZD (insr_s4_f32_untied, svfloat32_t, float, ++ z0 = svinsr_n_f32 (z1, d4), ++ z0 = svinsr (z1, d4)) ++ ++/* ++** insr_0_f32_tied1: ++** insr z0\.s, wzr ++** ret ++*/ ++TEST_UNIFORM_Z (insr_0_f32_tied1, svfloat32_t, ++ z0 = svinsr_n_f32 (z0, 0), ++ z0 = svinsr (z0, 0)) ++ ++/* ++** insr_0_f32_untied: ++** movprfx z0, z1 ++** insr z0\.s, wzr ++** ret ++*/ ++TEST_UNIFORM_Z (insr_0_f32_untied, svfloat32_t, ++ z0 = svinsr_n_f32 (z1, 0), ++ z0 = svinsr (z1, 0)) ++ ++/* ++** insr_1_f32: ++** fmov (s[0-9]+), #?1\.0(?:e\+0)? ++** insr z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (insr_1_f32, svfloat32_t, ++ z0 = svinsr_n_f32 (z0, 1), ++ z0 = svinsr (z0, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_f64.c +new file mode 100644 +index 000000000..9400225a5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_f64.c +@@ -0,0 +1,51 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** insr_d4_f64_tied1: ++** insr z0\.d, d4 ++** ret ++*/ ++TEST_UNIFORM_ZD (insr_d4_f64_tied1, svfloat64_t, double, ++ z0 = svinsr_n_f64 (z0, d4), ++ z0 = svinsr (z0, d4)) ++ ++/* ++** insr_d4_f64_untied: ++** movprfx z0, z1 ++** insr z0\.d, d4 ++** ret ++*/ ++TEST_UNIFORM_ZD (insr_d4_f64_untied, svfloat64_t, double, ++ z0 = svinsr_n_f64 (z1, d4), ++ z0 = svinsr (z1, d4)) ++ ++/* ++** insr_0_f64_tied1: ++** insr z0\.d, xzr ++** ret ++*/ ++TEST_UNIFORM_Z (insr_0_f64_tied1, svfloat64_t, ++ z0 = svinsr_n_f64 (z0, 0), ++ z0 = svinsr (z0, 0)) ++ ++/* ++** insr_0_f64_untied: ++** movprfx z0, z1 ++** insr z0\.d, xzr ++** ret ++*/ ++TEST_UNIFORM_Z (insr_0_f64_untied, svfloat64_t, ++ z0 = svinsr_n_f64 (z1, 0), ++ z0 = svinsr (z1, 0)) ++ ++/* ++** insr_1_f64: ++** fmov (d[0-9]+), #?1\.0(?:e\+0)? 
++** insr z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (insr_1_f64, svfloat64_t, ++ z0 = svinsr_n_f64 (z0, 1), ++ z0 = svinsr (z0, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_s16.c +new file mode 100644 +index 000000000..651977a9d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_s16.c +@@ -0,0 +1,56 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** insr_w0_s16_tied1: ++** insr z0\.h, w0 ++** ret ++*/ ++TEST_UNIFORM_ZX (insr_w0_s16_tied1, svint16_t, int16_t, ++ z0 = svinsr_n_s16 (z0, x0), ++ z0 = svinsr (z0, x0)) ++ ++/* ++** insr_w0_s16_untied: ++** movprfx z0, z1 ++** insr z0\.h, w0 ++** ret ++*/ ++TEST_UNIFORM_ZX (insr_w0_s16_untied, svint16_t, int16_t, ++ z0 = svinsr_n_s16 (z1, x0), ++ z0 = svinsr (z1, x0)) ++ ++/* ++** insr_0_s16_tied1: ++** insr z0\.h, wzr ++** ret ++*/ ++TEST_UNIFORM_Z (insr_0_s16_tied1, svint16_t, ++ z0 = svinsr_n_s16 (z0, 0), ++ z0 = svinsr (z0, 0)) ++ ++/* ++** insr_0_s16_untied: ++** movprfx z0, z1 ++** insr z0\.h, wzr ++** ret ++*/ ++TEST_UNIFORM_Z (insr_0_s16_untied, svint16_t, ++ z0 = svinsr_n_s16 (z1, 0), ++ z0 = svinsr (z1, 0)) ++ ++/* ++** insr_1_s16: ++** ( ++** mov (w[0-9]+), #?1 ++** insr z0\.h, \1 ++** | ++** movi v([0-9]+)\.4h, 0x1 ++** insr z0\.h, h\2 ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (insr_1_s16, svint16_t, ++ z0 = svinsr_n_s16 (z0, 1), ++ z0 = svinsr (z0, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_s32.c +new file mode 100644 +index 000000000..a1dcfc090 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_s32.c +@@ -0,0 +1,56 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** insr_w0_s32_tied1: ++** insr z0\.s, w0 ++** ret ++*/ ++TEST_UNIFORM_ZX (insr_w0_s32_tied1, svint32_t, int32_t, ++ z0 = svinsr_n_s32 (z0, x0), ++ z0 = svinsr (z0, x0)) ++ ++/* ++** insr_w0_s32_untied: ++** movprfx z0, z1 ++** insr z0\.s, w0 ++** ret ++*/ ++TEST_UNIFORM_ZX (insr_w0_s32_untied, svint32_t, int32_t, ++ z0 = svinsr_n_s32 (z1, x0), ++ z0 = svinsr (z1, x0)) ++ ++/* ++** insr_0_s32_tied1: ++** insr z0\.s, wzr ++** ret ++*/ ++TEST_UNIFORM_Z (insr_0_s32_tied1, svint32_t, ++ z0 = svinsr_n_s32 (z0, 0), ++ z0 = svinsr (z0, 0)) ++ ++/* ++** insr_0_s32_untied: ++** movprfx z0, z1 ++** insr z0\.s, wzr ++** ret ++*/ ++TEST_UNIFORM_Z (insr_0_s32_untied, svint32_t, ++ z0 = svinsr_n_s32 (z1, 0), ++ z0 = svinsr (z1, 0)) ++ ++/* ++** insr_1_s32: ++** ( ++** mov (w[0-9]+), #?1 ++** insr z0\.s, \1 ++** | ++** movi v([0-9]+)\.2s, 0x1 ++** insr z0\.s, s\2 ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (insr_1_s32, svint32_t, ++ z0 = svinsr_n_s32 (z0, 1), ++ z0 = svinsr (z0, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_s64.c +new file mode 100644 +index 000000000..32cdc8263 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_s64.c +@@ -0,0 +1,56 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** insr_x0_s64_tied1: ++** insr z0\.d, x0 ++** ret ++*/ ++TEST_UNIFORM_ZX (insr_x0_s64_tied1, svint64_t, int64_t, ++ z0 = svinsr_n_s64 (z0, x0), ++ z0 = svinsr (z0, x0)) ++ ++/* ++** insr_x0_s64_untied: ++** movprfx z0, z1 ++** insr z0\.d, x0 ++** ret ++*/ ++TEST_UNIFORM_ZX (insr_x0_s64_untied, 
svint64_t, int64_t, ++ z0 = svinsr_n_s64 (z1, x0), ++ z0 = svinsr (z1, x0)) ++ ++/* ++** insr_0_s64_tied1: ++** insr z0\.d, xzr ++** ret ++*/ ++TEST_UNIFORM_Z (insr_0_s64_tied1, svint64_t, ++ z0 = svinsr_n_s64 (z0, 0), ++ z0 = svinsr (z0, 0)) ++ ++/* ++** insr_0_s64_untied: ++** movprfx z0, z1 ++** insr z0\.d, xzr ++** ret ++*/ ++TEST_UNIFORM_Z (insr_0_s64_untied, svint64_t, ++ z0 = svinsr_n_s64 (z1, 0), ++ z0 = svinsr (z1, 0)) ++ ++/* ++** insr_1_s64: ++** ( ++** mov (x[0-9]+), #?1 ++** insr z0\.d, \1 ++** | ++** movi v([0-9]+)\.2d, 0x1 ++** insr z0\.d, d\2 ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (insr_1_s64, svint64_t, ++ z0 = svinsr_n_s64 (z0, 1), ++ z0 = svinsr (z0, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_s8.c +new file mode 100644 +index 000000000..cb69b09fa +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_s8.c +@@ -0,0 +1,56 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** insr_w0_s8_tied1: ++** insr z0\.b, w0 ++** ret ++*/ ++TEST_UNIFORM_ZX (insr_w0_s8_tied1, svint8_t, int8_t, ++ z0 = svinsr_n_s8 (z0, x0), ++ z0 = svinsr (z0, x0)) ++ ++/* ++** insr_w0_s8_untied: ++** movprfx z0, z1 ++** insr z0\.b, w0 ++** ret ++*/ ++TEST_UNIFORM_ZX (insr_w0_s8_untied, svint8_t, int8_t, ++ z0 = svinsr_n_s8 (z1, x0), ++ z0 = svinsr (z1, x0)) ++ ++/* ++** insr_0_s8_tied1: ++** insr z0\.b, wzr ++** ret ++*/ ++TEST_UNIFORM_Z (insr_0_s8_tied1, svint8_t, ++ z0 = svinsr_n_s8 (z0, 0), ++ z0 = svinsr (z0, 0)) ++ ++/* ++** insr_0_s8_untied: ++** movprfx z0, z1 ++** insr z0\.b, wzr ++** ret ++*/ ++TEST_UNIFORM_Z (insr_0_s8_untied, svint8_t, ++ z0 = svinsr_n_s8 (z1, 0), ++ z0 = svinsr (z1, 0)) ++ ++/* ++** insr_1_s8: ++** ( ++** mov (w[0-9]+), #?1 ++** insr z0\.b, \1 ++** | ++** movi v([0-9]+)\.8b, 0x1 ++** insr z0\.b, b\2 ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (insr_1_s8, svint8_t, ++ z0 = svinsr_n_s8 (z0, 1), ++ z0 = svinsr (z0, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_u16.c +new file mode 100644 +index 000000000..35af77402 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_u16.c +@@ -0,0 +1,56 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** insr_w0_u16_tied1: ++** insr z0\.h, w0 ++** ret ++*/ ++TEST_UNIFORM_ZX (insr_w0_u16_tied1, svuint16_t, uint16_t, ++ z0 = svinsr_n_u16 (z0, x0), ++ z0 = svinsr (z0, x0)) ++ ++/* ++** insr_w0_u16_untied: ++** movprfx z0, z1 ++** insr z0\.h, w0 ++** ret ++*/ ++TEST_UNIFORM_ZX (insr_w0_u16_untied, svuint16_t, uint16_t, ++ z0 = svinsr_n_u16 (z1, x0), ++ z0 = svinsr (z1, x0)) ++ ++/* ++** insr_0_u16_tied1: ++** insr z0\.h, wzr ++** ret ++*/ ++TEST_UNIFORM_Z (insr_0_u16_tied1, svuint16_t, ++ z0 = svinsr_n_u16 (z0, 0), ++ z0 = svinsr (z0, 0)) ++ ++/* ++** insr_0_u16_untied: ++** movprfx z0, z1 ++** insr z0\.h, wzr ++** ret ++*/ ++TEST_UNIFORM_Z (insr_0_u16_untied, svuint16_t, ++ z0 = svinsr_n_u16 (z1, 0), ++ z0 = svinsr (z1, 0)) ++ ++/* ++** insr_1_u16: ++** ( ++** mov (w[0-9]+), #?1 ++** insr z0\.h, \1 ++** | ++** movi v([0-9]+)\.4h, 0x1 ++** insr z0\.h, h\2 ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (insr_1_u16, svuint16_t, ++ z0 = svinsr_n_u16 (z0, 1), ++ z0 = svinsr (z0, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_u32.c +new file mode 100644 +index 
000000000..8a72e7f2a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_u32.c +@@ -0,0 +1,56 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** insr_w0_u32_tied1: ++** insr z0\.s, w0 ++** ret ++*/ ++TEST_UNIFORM_ZX (insr_w0_u32_tied1, svuint32_t, uint32_t, ++ z0 = svinsr_n_u32 (z0, x0), ++ z0 = svinsr (z0, x0)) ++ ++/* ++** insr_w0_u32_untied: ++** movprfx z0, z1 ++** insr z0\.s, w0 ++** ret ++*/ ++TEST_UNIFORM_ZX (insr_w0_u32_untied, svuint32_t, uint32_t, ++ z0 = svinsr_n_u32 (z1, x0), ++ z0 = svinsr (z1, x0)) ++ ++/* ++** insr_0_u32_tied1: ++** insr z0\.s, wzr ++** ret ++*/ ++TEST_UNIFORM_Z (insr_0_u32_tied1, svuint32_t, ++ z0 = svinsr_n_u32 (z0, 0), ++ z0 = svinsr (z0, 0)) ++ ++/* ++** insr_0_u32_untied: ++** movprfx z0, z1 ++** insr z0\.s, wzr ++** ret ++*/ ++TEST_UNIFORM_Z (insr_0_u32_untied, svuint32_t, ++ z0 = svinsr_n_u32 (z1, 0), ++ z0 = svinsr (z1, 0)) ++ ++/* ++** insr_1_u32: ++** ( ++** mov (w[0-9]+), #?1 ++** insr z0\.s, \1 ++** | ++** movi v([0-9]+)\.2s, 0x1 ++** insr z0\.s, s\2 ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (insr_1_u32, svuint32_t, ++ z0 = svinsr_n_u32 (z0, 1), ++ z0 = svinsr (z0, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_u64.c +new file mode 100644 +index 000000000..ab23f677d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_u64.c +@@ -0,0 +1,56 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** insr_x0_u64_tied1: ++** insr z0\.d, x0 ++** ret ++*/ ++TEST_UNIFORM_ZX (insr_x0_u64_tied1, svuint64_t, uint64_t, ++ z0 = svinsr_n_u64 (z0, x0), ++ z0 = svinsr (z0, x0)) ++ ++/* ++** insr_x0_u64_untied: ++** movprfx z0, z1 ++** insr z0\.d, x0 ++** ret ++*/ ++TEST_UNIFORM_ZX (insr_x0_u64_untied, svuint64_t, uint64_t, ++ z0 = svinsr_n_u64 (z1, x0), ++ z0 = svinsr (z1, x0)) ++ ++/* ++** insr_0_u64_tied1: ++** insr z0\.d, xzr ++** ret ++*/ ++TEST_UNIFORM_Z (insr_0_u64_tied1, svuint64_t, ++ z0 = svinsr_n_u64 (z0, 0), ++ z0 = svinsr (z0, 0)) ++ ++/* ++** insr_0_u64_untied: ++** movprfx z0, z1 ++** insr z0\.d, xzr ++** ret ++*/ ++TEST_UNIFORM_Z (insr_0_u64_untied, svuint64_t, ++ z0 = svinsr_n_u64 (z1, 0), ++ z0 = svinsr (z1, 0)) ++ ++/* ++** insr_1_u64: ++** ( ++** mov (x[0-9]+), #?1 ++** insr z0\.d, \1 ++** | ++** movi v([0-9]+)\.2d, 0x1 ++** insr z0\.d, d\2 ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (insr_1_u64, svuint64_t, ++ z0 = svinsr_n_u64 (z0, 1), ++ z0 = svinsr (z0, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_u8.c +new file mode 100644 +index 000000000..549d71882 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_u8.c +@@ -0,0 +1,56 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** insr_w0_u8_tied1: ++** insr z0\.b, w0 ++** ret ++*/ ++TEST_UNIFORM_ZX (insr_w0_u8_tied1, svuint8_t, uint8_t, ++ z0 = svinsr_n_u8 (z0, x0), ++ z0 = svinsr (z0, x0)) ++ ++/* ++** insr_w0_u8_untied: ++** movprfx z0, z1 ++** insr z0\.b, w0 ++** ret ++*/ ++TEST_UNIFORM_ZX (insr_w0_u8_untied, svuint8_t, uint8_t, ++ z0 = svinsr_n_u8 (z1, x0), ++ z0 = svinsr (z1, x0)) ++ ++/* ++** insr_0_u8_tied1: ++** insr z0\.b, wzr ++** ret ++*/ ++TEST_UNIFORM_Z (insr_0_u8_tied1, svuint8_t, ++ z0 = svinsr_n_u8 (z0, 0), ++ z0 = svinsr (z0, 0)) ++ ++/* ++** insr_0_u8_untied: ++** movprfx z0, z1 
++** insr z0\.b, wzr ++** ret ++*/ ++TEST_UNIFORM_Z (insr_0_u8_untied, svuint8_t, ++ z0 = svinsr_n_u8 (z1, 0), ++ z0 = svinsr (z1, 0)) ++ ++/* ++** insr_1_u8: ++** ( ++** mov (w[0-9]+), #?1 ++** insr z0\.b, \1 ++** | ++** movi v([0-9]+)\.8b, 0x1 ++** insr z0\.b, b\2 ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (insr_1_u8, svuint8_t, ++ z0 = svinsr_n_u8 (z0, 1), ++ z0 = svinsr (z0, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_bf16.c +new file mode 100644 +index 000000000..da30e05e5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_bf16.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** lasta_d0_bf16_tied: ++** lasta h0, p0, z0\.h ++** ret ++*/ ++TEST_REDUCTION_D (lasta_d0_bf16_tied, bfloat16_t, svbfloat16_t, ++ d0 = svlasta_bf16 (p0, z0), ++ d0 = svlasta (p0, z0)) ++ ++/* ++** lasta_d0_bf16_untied: ++** lasta h0, p0, z1\.h ++** ret ++*/ ++TEST_REDUCTION_D (lasta_d0_bf16_untied, bfloat16_t, svbfloat16_t, ++ d0 = svlasta_bf16 (p0, z1), ++ d0 = svlasta (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_f16.c +new file mode 100644 +index 000000000..972b55ab6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_f16.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** lasta_d0_f16_tied: ++** lasta h0, p0, z0\.h ++** ret ++*/ ++TEST_REDUCTION_D (lasta_d0_f16_tied, float16_t, svfloat16_t, ++ d0 = svlasta_f16 (p0, z0), ++ d0 = svlasta (p0, z0)) ++ ++/* ++** lasta_d0_f16_untied: ++** lasta h0, p0, z1\.h ++** ret ++*/ ++TEST_REDUCTION_D (lasta_d0_f16_untied, float16_t, svfloat16_t, ++ d0 = svlasta_f16 (p0, z1), ++ d0 = svlasta (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_f32.c +new file mode 100644 +index 000000000..cfb537f2f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_f32.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** lasta_d0_f32_tied: ++** lasta s0, p0, z0\.s ++** ret ++*/ ++TEST_REDUCTION_D (lasta_d0_f32_tied, float32_t, svfloat32_t, ++ d0 = svlasta_f32 (p0, z0), ++ d0 = svlasta (p0, z0)) ++ ++/* ++** lasta_d0_f32_untied: ++** lasta s0, p0, z1\.s ++** ret ++*/ ++TEST_REDUCTION_D (lasta_d0_f32_untied, float32_t, svfloat32_t, ++ d0 = svlasta_f32 (p0, z1), ++ d0 = svlasta (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_f64.c +new file mode 100644 +index 000000000..a4a8a74c9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_f64.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** lasta_d0_f64_tied: ++** lasta d0, p0, z0\.d ++** ret ++*/ ++TEST_REDUCTION_D (lasta_d0_f64_tied, float64_t, svfloat64_t, ++ d0 = svlasta_f64 (p0, z0), ++ d0 = svlasta (p0, z0)) ++ ++/* ++** lasta_d0_f64_untied: ++** lasta d0, p0, z1\.d ++** ret ++*/ ++TEST_REDUCTION_D (lasta_d0_f64_untied, float64_t, svfloat64_t, ++ d0 = svlasta_f64 (p0, z1), ++ d0 = svlasta (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_s16.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_s16.c +new file mode 100644 +index 000000000..54bd0248f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_s16.c +@@ -0,0 +1,12 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** lasta_x0_s16: ++** lasta w0, p0, z0\.h ++** ret ++*/ ++TEST_REDUCTION_X (lasta_x0_s16, int16_t, svint16_t, ++ x0 = svlasta_s16 (p0, z0), ++ x0 = svlasta (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_s32.c +new file mode 100644 +index 000000000..18f852f94 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_s32.c +@@ -0,0 +1,12 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** lasta_x0_s32: ++** lasta w0, p0, z0\.s ++** ret ++*/ ++TEST_REDUCTION_X (lasta_x0_s32, int32_t, svint32_t, ++ x0 = svlasta_s32 (p0, z0), ++ x0 = svlasta (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_s64.c +new file mode 100644 +index 000000000..6e45af3d4 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_s64.c +@@ -0,0 +1,12 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** lasta_x0_s64: ++** lasta x0, p0, z0\.d ++** ret ++*/ ++TEST_REDUCTION_X (lasta_x0_s64, int64_t, svint64_t, ++ x0 = svlasta_s64 (p0, z0), ++ x0 = svlasta (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_s8.c +new file mode 100644 +index 000000000..58e574f30 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_s8.c +@@ -0,0 +1,12 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** lasta_x0_s8: ++** lasta w0, p0, z0\.b ++** ret ++*/ ++TEST_REDUCTION_X (lasta_x0_s8, int8_t, svint8_t, ++ x0 = svlasta_s8 (p0, z0), ++ x0 = svlasta (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_u16.c +new file mode 100644 +index 000000000..a0e14eca4 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_u16.c +@@ -0,0 +1,12 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** lasta_x0_u16: ++** lasta w0, p0, z0\.h ++** ret ++*/ ++TEST_REDUCTION_X (lasta_x0_u16, uint16_t, svuint16_t, ++ x0 = svlasta_u16 (p0, z0), ++ x0 = svlasta (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_u32.c +new file mode 100644 +index 000000000..dab37c36a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_u32.c +@@ -0,0 +1,12 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** lasta_x0_u32: ++** lasta w0, p0, z0\.s ++** ret ++*/ ++TEST_REDUCTION_X (lasta_x0_u32, uint32_t, svuint32_t, ++ x0 = svlasta_u32 (p0, z0), ++ x0 = svlasta (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_u64.c +new file mode 100644 +index 000000000..c766f36ec +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_u64.c +@@ -0,0 +1,12 @@ ++/* { 
dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** lasta_x0_u64: ++** lasta x0, p0, z0\.d ++** ret ++*/ ++TEST_REDUCTION_X (lasta_x0_u64, uint64_t, svuint64_t, ++ x0 = svlasta_u64 (p0, z0), ++ x0 = svlasta (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_u8.c +new file mode 100644 +index 000000000..a83f25fe4 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_u8.c +@@ -0,0 +1,12 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** lasta_x0_u8: ++** lasta w0, p0, z0\.b ++** ret ++*/ ++TEST_REDUCTION_X (lasta_x0_u8, uint8_t, svuint8_t, ++ x0 = svlasta_u8 (p0, z0), ++ x0 = svlasta (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_bf16.c +new file mode 100644 +index 000000000..01ba39a02 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_bf16.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** lastb_d0_bf16_tied: ++** lastb h0, p0, z0\.h ++** ret ++*/ ++TEST_REDUCTION_D (lastb_d0_bf16_tied, bfloat16_t, svbfloat16_t, ++ d0 = svlastb_bf16 (p0, z0), ++ d0 = svlastb (p0, z0)) ++ ++/* ++** lastb_d0_bf16_untied: ++** lastb h0, p0, z1\.h ++** ret ++*/ ++TEST_REDUCTION_D (lastb_d0_bf16_untied, bfloat16_t, svbfloat16_t, ++ d0 = svlastb_bf16 (p0, z1), ++ d0 = svlastb (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_f16.c +new file mode 100644 +index 000000000..0bc7e9ef4 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_f16.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** lastb_d0_f16_tied: ++** lastb h0, p0, z0\.h ++** ret ++*/ ++TEST_REDUCTION_D (lastb_d0_f16_tied, float16_t, svfloat16_t, ++ d0 = svlastb_f16 (p0, z0), ++ d0 = svlastb (p0, z0)) ++ ++/* ++** lastb_d0_f16_untied: ++** lastb h0, p0, z1\.h ++** ret ++*/ ++TEST_REDUCTION_D (lastb_d0_f16_untied, float16_t, svfloat16_t, ++ d0 = svlastb_f16 (p0, z1), ++ d0 = svlastb (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_f32.c +new file mode 100644 +index 000000000..b33d61eee +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_f32.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** lastb_d0_f32_tied: ++** lastb s0, p0, z0\.s ++** ret ++*/ ++TEST_REDUCTION_D (lastb_d0_f32_tied, float32_t, svfloat32_t, ++ d0 = svlastb_f32 (p0, z0), ++ d0 = svlastb (p0, z0)) ++ ++/* ++** lastb_d0_f32_untied: ++** lastb s0, p0, z1\.s ++** ret ++*/ ++TEST_REDUCTION_D (lastb_d0_f32_untied, float32_t, svfloat32_t, ++ d0 = svlastb_f32 (p0, z1), ++ d0 = svlastb (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_f64.c +new file mode 100644 +index 000000000..9fa7de706 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_f64.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** lastb_d0_f64_tied: ++** lastb 
d0, p0, z0\.d ++** ret ++*/ ++TEST_REDUCTION_D (lastb_d0_f64_tied, float64_t, svfloat64_t, ++ d0 = svlastb_f64 (p0, z0), ++ d0 = svlastb (p0, z0)) ++ ++/* ++** lastb_d0_f64_untied: ++** lastb d0, p0, z1\.d ++** ret ++*/ ++TEST_REDUCTION_D (lastb_d0_f64_untied, float64_t, svfloat64_t, ++ d0 = svlastb_f64 (p0, z1), ++ d0 = svlastb (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_s16.c +new file mode 100644 +index 000000000..6575f21cd +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_s16.c +@@ -0,0 +1,12 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** lastb_x0_s16: ++** lastb w0, p0, z0\.h ++** ret ++*/ ++TEST_REDUCTION_X (lastb_x0_s16, int16_t, svint16_t, ++ x0 = svlastb_s16 (p0, z0), ++ x0 = svlastb (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_s32.c +new file mode 100644 +index 000000000..856e5bdc8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_s32.c +@@ -0,0 +1,12 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** lastb_x0_s32: ++** lastb w0, p0, z0\.s ++** ret ++*/ ++TEST_REDUCTION_X (lastb_x0_s32, int32_t, svint32_t, ++ x0 = svlastb_s32 (p0, z0), ++ x0 = svlastb (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_s64.c +new file mode 100644 +index 000000000..bd7de2ab2 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_s64.c +@@ -0,0 +1,12 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** lastb_x0_s64: ++** lastb x0, p0, z0\.d ++** ret ++*/ ++TEST_REDUCTION_X (lastb_x0_s64, int64_t, svint64_t, ++ x0 = svlastb_s64 (p0, z0), ++ x0 = svlastb (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_s8.c +new file mode 100644 +index 000000000..4c343a705 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_s8.c +@@ -0,0 +1,12 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** lastb_x0_s8: ++** lastb w0, p0, z0\.b ++** ret ++*/ ++TEST_REDUCTION_X (lastb_x0_s8, int8_t, svint8_t, ++ x0 = svlastb_s8 (p0, z0), ++ x0 = svlastb (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_u16.c +new file mode 100644 +index 000000000..7f3db1bb1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_u16.c +@@ -0,0 +1,12 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** lastb_x0_u16: ++** lastb w0, p0, z0\.h ++** ret ++*/ ++TEST_REDUCTION_X (lastb_x0_u16, uint16_t, svuint16_t, ++ x0 = svlastb_u16 (p0, z0), ++ x0 = svlastb (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_u32.c +new file mode 100644 +index 000000000..c2eeacba0 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_u32.c +@@ -0,0 +1,12 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** lastb_x0_u32: ++** lastb w0, p0, z0\.s 
++** ret ++*/ ++TEST_REDUCTION_X (lastb_x0_u32, uint32_t, svuint32_t, ++ x0 = svlastb_u32 (p0, z0), ++ x0 = svlastb (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_u64.c +new file mode 100644 +index 000000000..1496ffa0e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_u64.c +@@ -0,0 +1,12 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** lastb_x0_u64: ++** lastb x0, p0, z0\.d ++** ret ++*/ ++TEST_REDUCTION_X (lastb_x0_u64, uint64_t, svuint64_t, ++ x0 = svlastb_u64 (p0, z0), ++ x0 = svlastb (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_u8.c +new file mode 100644 +index 000000000..25f036063 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_u8.c +@@ -0,0 +1,12 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** lastb_x0_u8: ++** lastb w0, p0, z0\.b ++** ret ++*/ ++TEST_REDUCTION_X (lastb_x0_u8, uint8_t, svuint8_t, ++ x0 = svlastb_u8 (p0, z0), ++ x0 = svlastb (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_bf16.c +new file mode 100644 +index 000000000..07891de04 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_bf16.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1_bf16_base: ++** ld1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_bf16_base, svbfloat16_t, bfloat16_t, ++ z0 = svld1_bf16 (p0, x0), ++ z0 = svld1 (p0, x0)) ++ ++/* ++** ld1_bf16_index: ++** ld1h z0\.h, p0/z, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_LOAD (ld1_bf16_index, svbfloat16_t, bfloat16_t, ++ z0 = svld1_bf16 (p0, x0 + x1), ++ z0 = svld1 (p0, x0 + x1)) ++ ++/* ++** ld1_bf16_1: ++** ld1h z0\.h, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_bf16_1, svbfloat16_t, bfloat16_t, ++ z0 = svld1_bf16 (p0, x0 + svcnth ()), ++ z0 = svld1 (p0, x0 + svcnth ())) ++ ++/* ++** ld1_bf16_7: ++** ld1h z0\.h, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_bf16_7, svbfloat16_t, bfloat16_t, ++ z0 = svld1_bf16 (p0, x0 + svcnth () * 7), ++ z0 = svld1 (p0, x0 + svcnth () * 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1_bf16_8: ++** incb x0, all, mul #8 ++** ld1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_bf16_8, svbfloat16_t, bfloat16_t, ++ z0 = svld1_bf16 (p0, x0 + svcnth () * 8), ++ z0 = svld1 (p0, x0 + svcnth () * 8)) ++ ++/* ++** ld1_bf16_m1: ++** ld1h z0\.h, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_bf16_m1, svbfloat16_t, bfloat16_t, ++ z0 = svld1_bf16 (p0, x0 - svcnth ()), ++ z0 = svld1 (p0, x0 - svcnth ())) ++ ++/* ++** ld1_bf16_m8: ++** ld1h z0\.h, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_bf16_m8, svbfloat16_t, bfloat16_t, ++ z0 = svld1_bf16 (p0, x0 - svcnth () * 8), ++ z0 = svld1 (p0, x0 - svcnth () * 8)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld1_bf16_m9: ++** decb x0, all, mul #9 ++** ld1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_bf16_m9, svbfloat16_t, bfloat16_t, ++ z0 = svld1_bf16 (p0, x0 - svcnth () * 9), ++ z0 = svld1 (p0, x0 - svcnth () * 9)) ++ ++/* ++** ld1_vnum_bf16_0: ++** ld1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_bf16_0, svbfloat16_t, bfloat16_t, ++ z0 = svld1_vnum_bf16 (p0, x0, 0), ++ z0 = svld1_vnum (p0, x0, 0)) ++ ++/* ++** ld1_vnum_bf16_1: ++** ld1h z0\.h, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_bf16_1, svbfloat16_t, bfloat16_t, ++ z0 = svld1_vnum_bf16 (p0, x0, 1), ++ z0 = svld1_vnum (p0, x0, 1)) ++ ++/* ++** ld1_vnum_bf16_7: ++** ld1h z0\.h, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_bf16_7, svbfloat16_t, bfloat16_t, ++ z0 = svld1_vnum_bf16 (p0, x0, 7), ++ z0 = svld1_vnum (p0, x0, 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1_vnum_bf16_8: ++** incb x0, all, mul #8 ++** ld1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_bf16_8, svbfloat16_t, bfloat16_t, ++ z0 = svld1_vnum_bf16 (p0, x0, 8), ++ z0 = svld1_vnum (p0, x0, 8)) ++ ++/* ++** ld1_vnum_bf16_m1: ++** ld1h z0\.h, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_bf16_m1, svbfloat16_t, bfloat16_t, ++ z0 = svld1_vnum_bf16 (p0, x0, -1), ++ z0 = svld1_vnum (p0, x0, -1)) ++ ++/* ++** ld1_vnum_bf16_m8: ++** ld1h z0\.h, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_bf16_m8, svbfloat16_t, bfloat16_t, ++ z0 = svld1_vnum_bf16 (p0, x0, -8), ++ z0 = svld1_vnum (p0, x0, -8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1_vnum_bf16_m9: ++** decb x0, all, mul #9 ++** ld1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_bf16_m9, svbfloat16_t, bfloat16_t, ++ z0 = svld1_vnum_bf16 (p0, x0, -9), ++ z0 = svld1_vnum (p0, x0, -9)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ld1_vnum_bf16_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld1h z0\.h, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_bf16_x1, svbfloat16_t, bfloat16_t, ++ z0 = svld1_vnum_bf16 (p0, x0, x1), ++ z0 = svld1_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_f16.c +new file mode 100644 +index 000000000..c3552bfbd +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_f16.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1_f16_base: ++** ld1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_f16_base, svfloat16_t, float16_t, ++ z0 = svld1_f16 (p0, x0), ++ z0 = svld1 (p0, x0)) ++ ++/* ++** ld1_f16_index: ++** ld1h z0\.h, p0/z, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_LOAD (ld1_f16_index, svfloat16_t, float16_t, ++ z0 = svld1_f16 (p0, x0 + x1), ++ z0 = svld1 (p0, x0 + x1)) ++ ++/* ++** ld1_f16_1: ++** ld1h z0\.h, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_f16_1, svfloat16_t, float16_t, ++ z0 = svld1_f16 (p0, x0 + svcnth ()), ++ z0 = svld1 (p0, x0 + svcnth ())) ++ ++/* ++** ld1_f16_7: ++** ld1h z0\.h, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_f16_7, svfloat16_t, float16_t, ++ z0 = svld1_f16 (p0, x0 + svcnth () * 7), ++ z0 = svld1 (p0, x0 + svcnth () * 7)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld1_f16_8: ++** incb x0, all, mul #8 ++** ld1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_f16_8, svfloat16_t, float16_t, ++ z0 = svld1_f16 (p0, x0 + svcnth () * 8), ++ z0 = svld1 (p0, x0 + svcnth () * 8)) ++ ++/* ++** ld1_f16_m1: ++** ld1h z0\.h, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_f16_m1, svfloat16_t, float16_t, ++ z0 = svld1_f16 (p0, x0 - svcnth ()), ++ z0 = svld1 (p0, x0 - svcnth ())) ++ ++/* ++** ld1_f16_m8: ++** ld1h z0\.h, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_f16_m8, svfloat16_t, float16_t, ++ z0 = svld1_f16 (p0, x0 - svcnth () * 8), ++ z0 = svld1 (p0, x0 - svcnth () * 8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1_f16_m9: ++** decb x0, all, mul #9 ++** ld1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_f16_m9, svfloat16_t, float16_t, ++ z0 = svld1_f16 (p0, x0 - svcnth () * 9), ++ z0 = svld1 (p0, x0 - svcnth () * 9)) ++ ++/* ++** ld1_vnum_f16_0: ++** ld1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_f16_0, svfloat16_t, float16_t, ++ z0 = svld1_vnum_f16 (p0, x0, 0), ++ z0 = svld1_vnum (p0, x0, 0)) ++ ++/* ++** ld1_vnum_f16_1: ++** ld1h z0\.h, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_f16_1, svfloat16_t, float16_t, ++ z0 = svld1_vnum_f16 (p0, x0, 1), ++ z0 = svld1_vnum (p0, x0, 1)) ++ ++/* ++** ld1_vnum_f16_7: ++** ld1h z0\.h, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_f16_7, svfloat16_t, float16_t, ++ z0 = svld1_vnum_f16 (p0, x0, 7), ++ z0 = svld1_vnum (p0, x0, 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1_vnum_f16_8: ++** incb x0, all, mul #8 ++** ld1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_f16_8, svfloat16_t, float16_t, ++ z0 = svld1_vnum_f16 (p0, x0, 8), ++ z0 = svld1_vnum (p0, x0, 8)) ++ ++/* ++** ld1_vnum_f16_m1: ++** ld1h z0\.h, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_f16_m1, svfloat16_t, float16_t, ++ z0 = svld1_vnum_f16 (p0, x0, -1), ++ z0 = svld1_vnum (p0, x0, -1)) ++ ++/* ++** ld1_vnum_f16_m8: ++** ld1h z0\.h, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_f16_m8, svfloat16_t, float16_t, ++ z0 = svld1_vnum_f16 (p0, x0, -8), ++ z0 = svld1_vnum (p0, x0, -8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1_vnum_f16_m9: ++** decb x0, all, mul #9 ++** ld1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_f16_m9, svfloat16_t, float16_t, ++ z0 = svld1_vnum_f16 (p0, x0, -9), ++ z0 = svld1_vnum (p0, x0, -9)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ld1_vnum_f16_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld1h z0\.h, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_f16_x1, svfloat16_t, float16_t, ++ z0 = svld1_vnum_f16 (p0, x0, x1), ++ z0 = svld1_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_f32.c +new file mode 100644 +index 000000000..8990f48d9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_f32.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1_f32_base: ++** ld1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_f32_base, svfloat32_t, float32_t, ++ z0 = svld1_f32 (p0, x0), ++ z0 = svld1 (p0, x0)) ++ ++/* ++** ld1_f32_index: ++** ld1w z0\.s, p0/z, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_LOAD (ld1_f32_index, svfloat32_t, float32_t, ++ z0 = svld1_f32 (p0, x0 + x1), ++ z0 = svld1 (p0, x0 + x1)) ++ ++/* ++** ld1_f32_1: ++** ld1w z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_f32_1, svfloat32_t, float32_t, ++ z0 = svld1_f32 (p0, x0 + svcntw ()), ++ z0 = svld1 (p0, x0 + svcntw ())) ++ ++/* ++** ld1_f32_7: ++** ld1w z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_f32_7, svfloat32_t, float32_t, ++ z0 = svld1_f32 (p0, x0 + svcntw () * 7), ++ z0 = svld1 (p0, x0 + svcntw () * 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1_f32_8: ++** incb x0, all, mul #8 ++** ld1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_f32_8, svfloat32_t, float32_t, ++ z0 = svld1_f32 (p0, x0 + svcntw () * 8), ++ z0 = svld1 (p0, x0 + svcntw () * 8)) ++ ++/* ++** ld1_f32_m1: ++** ld1w z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_f32_m1, svfloat32_t, float32_t, ++ z0 = svld1_f32 (p0, x0 - svcntw ()), ++ z0 = svld1 (p0, x0 - svcntw ())) ++ ++/* ++** ld1_f32_m8: ++** ld1w z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_f32_m8, svfloat32_t, float32_t, ++ z0 = svld1_f32 (p0, x0 - svcntw () * 8), ++ z0 = svld1 (p0, x0 - svcntw () * 8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1_f32_m9: ++** decb x0, all, mul #9 ++** ld1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_f32_m9, svfloat32_t, float32_t, ++ z0 = svld1_f32 (p0, x0 - svcntw () * 9), ++ z0 = svld1 (p0, x0 - svcntw () * 9)) ++ ++/* ++** ld1_vnum_f32_0: ++** ld1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_f32_0, svfloat32_t, float32_t, ++ z0 = svld1_vnum_f32 (p0, x0, 0), ++ z0 = svld1_vnum (p0, x0, 0)) ++ ++/* ++** ld1_vnum_f32_1: ++** ld1w z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_f32_1, svfloat32_t, float32_t, ++ z0 = svld1_vnum_f32 (p0, x0, 1), ++ z0 = svld1_vnum (p0, x0, 1)) ++ ++/* ++** ld1_vnum_f32_7: ++** ld1w z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_f32_7, svfloat32_t, float32_t, ++ z0 = svld1_vnum_f32 (p0, x0, 7), ++ z0 = svld1_vnum (p0, x0, 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1_vnum_f32_8: ++** incb x0, all, mul #8 ++** ld1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_f32_8, svfloat32_t, float32_t, ++ z0 = svld1_vnum_f32 (p0, x0, 8), ++ z0 = svld1_vnum (p0, x0, 8)) ++ ++/* ++** ld1_vnum_f32_m1: ++** ld1w z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_f32_m1, svfloat32_t, float32_t, ++ z0 = svld1_vnum_f32 (p0, x0, -1), ++ z0 = svld1_vnum (p0, x0, -1)) ++ ++/* ++** ld1_vnum_f32_m8: ++** ld1w z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_f32_m8, svfloat32_t, float32_t, ++ z0 = svld1_vnum_f32 (p0, x0, -8), ++ z0 = svld1_vnum (p0, x0, -8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1_vnum_f32_m9: ++** decb x0, all, mul #9 ++** ld1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_f32_m9, svfloat32_t, float32_t, ++ z0 = svld1_vnum_f32 (p0, x0, -9), ++ z0 = svld1_vnum (p0, x0, -9)) ++ ++/* Using MUL to calculate an index would also be OK. 
*/ ++/* ++** ld1_vnum_f32_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld1w z0\.s, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_f32_x1, svfloat32_t, float32_t, ++ z0 = svld1_vnum_f32 (p0, x0, x1), ++ z0 = svld1_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_f64.c +new file mode 100644 +index 000000000..eb28687fe +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_f64.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1_f64_base: ++** ld1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_f64_base, svfloat64_t, float64_t, ++ z0 = svld1_f64 (p0, x0), ++ z0 = svld1 (p0, x0)) ++ ++/* ++** ld1_f64_index: ++** ld1d z0\.d, p0/z, \[x0, x1, lsl 3\] ++** ret ++*/ ++TEST_LOAD (ld1_f64_index, svfloat64_t, float64_t, ++ z0 = svld1_f64 (p0, x0 + x1), ++ z0 = svld1 (p0, x0 + x1)) ++ ++/* ++** ld1_f64_1: ++** ld1d z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_f64_1, svfloat64_t, float64_t, ++ z0 = svld1_f64 (p0, x0 + svcntd ()), ++ z0 = svld1 (p0, x0 + svcntd ())) ++ ++/* ++** ld1_f64_7: ++** ld1d z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_f64_7, svfloat64_t, float64_t, ++ z0 = svld1_f64 (p0, x0 + svcntd () * 7), ++ z0 = svld1 (p0, x0 + svcntd () * 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1_f64_8: ++** incb x0, all, mul #8 ++** ld1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_f64_8, svfloat64_t, float64_t, ++ z0 = svld1_f64 (p0, x0 + svcntd () * 8), ++ z0 = svld1 (p0, x0 + svcntd () * 8)) ++ ++/* ++** ld1_f64_m1: ++** ld1d z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_f64_m1, svfloat64_t, float64_t, ++ z0 = svld1_f64 (p0, x0 - svcntd ()), ++ z0 = svld1 (p0, x0 - svcntd ())) ++ ++/* ++** ld1_f64_m8: ++** ld1d z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_f64_m8, svfloat64_t, float64_t, ++ z0 = svld1_f64 (p0, x0 - svcntd () * 8), ++ z0 = svld1 (p0, x0 - svcntd () * 8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1_f64_m9: ++** decb x0, all, mul #9 ++** ld1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_f64_m9, svfloat64_t, float64_t, ++ z0 = svld1_f64 (p0, x0 - svcntd () * 9), ++ z0 = svld1 (p0, x0 - svcntd () * 9)) ++ ++/* ++** ld1_vnum_f64_0: ++** ld1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_f64_0, svfloat64_t, float64_t, ++ z0 = svld1_vnum_f64 (p0, x0, 0), ++ z0 = svld1_vnum (p0, x0, 0)) ++ ++/* ++** ld1_vnum_f64_1: ++** ld1d z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_f64_1, svfloat64_t, float64_t, ++ z0 = svld1_vnum_f64 (p0, x0, 1), ++ z0 = svld1_vnum (p0, x0, 1)) ++ ++/* ++** ld1_vnum_f64_7: ++** ld1d z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_f64_7, svfloat64_t, float64_t, ++ z0 = svld1_vnum_f64 (p0, x0, 7), ++ z0 = svld1_vnum (p0, x0, 7)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld1_vnum_f64_8: ++** incb x0, all, mul #8 ++** ld1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_f64_8, svfloat64_t, float64_t, ++ z0 = svld1_vnum_f64 (p0, x0, 8), ++ z0 = svld1_vnum (p0, x0, 8)) ++ ++/* ++** ld1_vnum_f64_m1: ++** ld1d z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_f64_m1, svfloat64_t, float64_t, ++ z0 = svld1_vnum_f64 (p0, x0, -1), ++ z0 = svld1_vnum (p0, x0, -1)) ++ ++/* ++** ld1_vnum_f64_m8: ++** ld1d z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_f64_m8, svfloat64_t, float64_t, ++ z0 = svld1_vnum_f64 (p0, x0, -8), ++ z0 = svld1_vnum (p0, x0, -8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1_vnum_f64_m9: ++** decb x0, all, mul #9 ++** ld1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_f64_m9, svfloat64_t, float64_t, ++ z0 = svld1_vnum_f64 (p0, x0, -9), ++ z0 = svld1_vnum (p0, x0, -9)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ld1_vnum_f64_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld1d z0\.d, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_f64_x1, svfloat64_t, float64_t, ++ z0 = svld1_vnum_f64 (p0, x0, x1), ++ z0 = svld1_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_gather_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_gather_f32.c +new file mode 100644 +index 000000000..00b68ff29 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_gather_f32.c +@@ -0,0 +1,272 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1_gather_f32_tied1: ++** ld1w z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_f32_tied1, svfloat32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_f32 (p0, z0), ++ z0_res = svld1_gather_f32 (p0, z0)) ++ ++/* ++** ld1_gather_f32_untied: ++** ld1w z0\.s, p0/z, \[z1\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_f32_untied, svfloat32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_f32 (p0, z1), ++ z0_res = svld1_gather_f32 (p0, z1)) ++ ++/* ++** ld1_gather_x0_f32_offset: ++** ld1w z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_x0_f32_offset, svfloat32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_offset_f32 (p0, z0, x0), ++ z0_res = svld1_gather_offset_f32 (p0, z0, x0)) ++ ++/* ++** ld1_gather_m4_f32_offset: ++** mov (x[0-9]+), #?-4 ++** ld1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_m4_f32_offset, svfloat32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_offset_f32 (p0, z0, -4), ++ z0_res = svld1_gather_offset_f32 (p0, z0, -4)) ++ ++/* ++** ld1_gather_0_f32_offset: ++** ld1w z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_0_f32_offset, svfloat32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_offset_f32 (p0, z0, 0), ++ z0_res = svld1_gather_offset_f32 (p0, z0, 0)) ++ ++/* ++** ld1_gather_5_f32_offset: ++** mov (x[0-9]+), #?5 ++** ld1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_5_f32_offset, svfloat32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_offset_f32 (p0, z0, 5), ++ z0_res = svld1_gather_offset_f32 (p0, z0, 5)) ++ ++/* ++** ld1_gather_6_f32_offset: ++** mov (x[0-9]+), #?6 ++** ld1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_6_f32_offset, svfloat32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_offset_f32 (p0, z0, 6), ++ z0_res = 
svld1_gather_offset_f32 (p0, z0, 6)) ++ ++/* ++** ld1_gather_7_f32_offset: ++** mov (x[0-9]+), #?7 ++** ld1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_7_f32_offset, svfloat32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_offset_f32 (p0, z0, 7), ++ z0_res = svld1_gather_offset_f32 (p0, z0, 7)) ++ ++/* ++** ld1_gather_8_f32_offset: ++** ld1w z0\.s, p0/z, \[z0\.s, #8\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_8_f32_offset, svfloat32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_offset_f32 (p0, z0, 8), ++ z0_res = svld1_gather_offset_f32 (p0, z0, 8)) ++ ++/* ++** ld1_gather_124_f32_offset: ++** ld1w z0\.s, p0/z, \[z0\.s, #124\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_124_f32_offset, svfloat32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_offset_f32 (p0, z0, 124), ++ z0_res = svld1_gather_offset_f32 (p0, z0, 124)) ++ ++/* ++** ld1_gather_128_f32_offset: ++** mov (x[0-9]+), #?128 ++** ld1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_128_f32_offset, svfloat32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_offset_f32 (p0, z0, 128), ++ z0_res = svld1_gather_offset_f32 (p0, z0, 128)) ++ ++/* ++** ld1_gather_x0_f32_index: ++** lsl (x[0-9]+), x0, #?2 ++** ld1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_x0_f32_index, svfloat32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_index_f32 (p0, z0, x0), ++ z0_res = svld1_gather_index_f32 (p0, z0, x0)) ++ ++/* ++** ld1_gather_m1_f32_index: ++** mov (x[0-9]+), #?-4 ++** ld1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_m1_f32_index, svfloat32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_index_f32 (p0, z0, -1), ++ z0_res = svld1_gather_index_f32 (p0, z0, -1)) ++ ++/* ++** ld1_gather_0_f32_index: ++** ld1w z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_0_f32_index, svfloat32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_index_f32 (p0, z0, 0), ++ z0_res = svld1_gather_index_f32 (p0, z0, 0)) ++ ++/* ++** ld1_gather_5_f32_index: ++** ld1w z0\.s, p0/z, \[z0\.s, #20\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_5_f32_index, svfloat32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_index_f32 (p0, z0, 5), ++ z0_res = svld1_gather_index_f32 (p0, z0, 5)) ++ ++/* ++** ld1_gather_31_f32_index: ++** ld1w z0\.s, p0/z, \[z0\.s, #124\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_31_f32_index, svfloat32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_index_f32 (p0, z0, 31), ++ z0_res = svld1_gather_index_f32 (p0, z0, 31)) ++ ++/* ++** ld1_gather_32_f32_index: ++** mov (x[0-9]+), #?128 ++** ld1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_32_f32_index, svfloat32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_index_f32 (p0, z0, 32), ++ z0_res = svld1_gather_index_f32 (p0, z0, 32)) ++ ++/* ++** ld1_gather_x0_f32_s32offset: ++** ld1w z0\.s, p0/z, \[x0, z0\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_x0_f32_s32offset, svfloat32_t, float32_t, svint32_t, ++ z0_res = svld1_gather_s32offset_f32 (p0, x0, z0), ++ z0_res = svld1_gather_offset (p0, x0, z0)) ++ ++/* ++** ld1_gather_tied1_f32_s32offset: ++** ld1w z0\.s, p0/z, \[x0, z0\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_tied1_f32_s32offset, svfloat32_t, float32_t, svint32_t, ++ z0_res = svld1_gather_s32offset_f32 (p0, x0, z0), ++ z0_res = svld1_gather_offset (p0, x0, z0)) ++ ++/* ++** ld1_gather_untied_f32_s32offset: ++** ld1w z0\.s, p0/z, \[x0, z1\.s, sxtw\] ++** 
ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_untied_f32_s32offset, svfloat32_t, float32_t, svint32_t, ++ z0_res = svld1_gather_s32offset_f32 (p0, x0, z1), ++ z0_res = svld1_gather_offset (p0, x0, z1)) ++ ++/* ++** ld1_gather_x0_f32_u32offset: ++** ld1w z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_x0_f32_u32offset, svfloat32_t, float32_t, svuint32_t, ++ z0_res = svld1_gather_u32offset_f32 (p0, x0, z0), ++ z0_res = svld1_gather_offset (p0, x0, z0)) ++ ++/* ++** ld1_gather_tied1_f32_u32offset: ++** ld1w z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_tied1_f32_u32offset, svfloat32_t, float32_t, svuint32_t, ++ z0_res = svld1_gather_u32offset_f32 (p0, x0, z0), ++ z0_res = svld1_gather_offset (p0, x0, z0)) ++ ++/* ++** ld1_gather_untied_f32_u32offset: ++** ld1w z0\.s, p0/z, \[x0, z1\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_untied_f32_u32offset, svfloat32_t, float32_t, svuint32_t, ++ z0_res = svld1_gather_u32offset_f32 (p0, x0, z1), ++ z0_res = svld1_gather_offset (p0, x0, z1)) ++ ++/* ++** ld1_gather_x0_f32_s32index: ++** ld1w z0\.s, p0/z, \[x0, z0\.s, sxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_x0_f32_s32index, svfloat32_t, float32_t, svint32_t, ++ z0_res = svld1_gather_s32index_f32 (p0, x0, z0), ++ z0_res = svld1_gather_index (p0, x0, z0)) ++ ++/* ++** ld1_gather_tied1_f32_s32index: ++** ld1w z0\.s, p0/z, \[x0, z0\.s, sxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_tied1_f32_s32index, svfloat32_t, float32_t, svint32_t, ++ z0_res = svld1_gather_s32index_f32 (p0, x0, z0), ++ z0_res = svld1_gather_index (p0, x0, z0)) ++ ++/* ++** ld1_gather_untied_f32_s32index: ++** ld1w z0\.s, p0/z, \[x0, z1\.s, sxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_untied_f32_s32index, svfloat32_t, float32_t, svint32_t, ++ z0_res = svld1_gather_s32index_f32 (p0, x0, z1), ++ z0_res = svld1_gather_index (p0, x0, z1)) ++ ++/* ++** ld1_gather_x0_f32_u32index: ++** ld1w z0\.s, p0/z, \[x0, z0\.s, uxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_x0_f32_u32index, svfloat32_t, float32_t, svuint32_t, ++ z0_res = svld1_gather_u32index_f32 (p0, x0, z0), ++ z0_res = svld1_gather_index (p0, x0, z0)) ++ ++/* ++** ld1_gather_tied1_f32_u32index: ++** ld1w z0\.s, p0/z, \[x0, z0\.s, uxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_tied1_f32_u32index, svfloat32_t, float32_t, svuint32_t, ++ z0_res = svld1_gather_u32index_f32 (p0, x0, z0), ++ z0_res = svld1_gather_index (p0, x0, z0)) ++ ++/* ++** ld1_gather_untied_f32_u32index: ++** ld1w z0\.s, p0/z, \[x0, z1\.s, uxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_untied_f32_u32index, svfloat32_t, float32_t, svuint32_t, ++ z0_res = svld1_gather_u32index_f32 (p0, x0, z1), ++ z0_res = svld1_gather_index (p0, x0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_gather_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_gather_f64.c +new file mode 100644 +index 000000000..47127960c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_gather_f64.c +@@ -0,0 +1,348 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1_gather_f64_tied1: ++** ld1d z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_f64_tied1, svfloat64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_f64 (p0, z0), ++ z0_res = svld1_gather_f64 (p0, z0)) ++ ++/* ++** ld1_gather_f64_untied: ++** ld1d z0\.d, p0/z, \[z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_f64_untied, svfloat64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_f64 (p0, z1), ++ z0_res = svld1_gather_f64 (p0, z1)) ++ ++/* ++** ld1_gather_x0_f64_offset: ++** ld1d z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_x0_f64_offset, svfloat64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_offset_f64 (p0, z0, x0), ++ z0_res = svld1_gather_offset_f64 (p0, z0, x0)) ++ ++/* ++** ld1_gather_m8_f64_offset: ++** mov (x[0-9]+), #?-8 ++** ld1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_m8_f64_offset, svfloat64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_offset_f64 (p0, z0, -8), ++ z0_res = svld1_gather_offset_f64 (p0, z0, -8)) ++ ++/* ++** ld1_gather_0_f64_offset: ++** ld1d z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_0_f64_offset, svfloat64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_offset_f64 (p0, z0, 0), ++ z0_res = svld1_gather_offset_f64 (p0, z0, 0)) ++ ++/* ++** ld1_gather_9_f64_offset: ++** mov (x[0-9]+), #?9 ++** ld1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_9_f64_offset, svfloat64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_offset_f64 (p0, z0, 9), ++ z0_res = svld1_gather_offset_f64 (p0, z0, 9)) ++ ++/* ++** ld1_gather_10_f64_offset: ++** mov (x[0-9]+), #?10 ++** ld1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_10_f64_offset, svfloat64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_offset_f64 (p0, z0, 10), ++ z0_res = svld1_gather_offset_f64 (p0, z0, 10)) ++ ++/* ++** ld1_gather_11_f64_offset: ++** mov (x[0-9]+), #?11 ++** ld1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_11_f64_offset, svfloat64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_offset_f64 (p0, z0, 11), ++ z0_res = svld1_gather_offset_f64 (p0, z0, 11)) ++ ++/* ++** ld1_gather_12_f64_offset: ++** mov (x[0-9]+), #?12 ++** ld1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_12_f64_offset, svfloat64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_offset_f64 (p0, z0, 12), ++ z0_res = svld1_gather_offset_f64 (p0, z0, 12)) ++ ++/* ++** ld1_gather_13_f64_offset: ++** mov (x[0-9]+), #?13 ++** ld1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_13_f64_offset, svfloat64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_offset_f64 (p0, z0, 13), ++ z0_res = svld1_gather_offset_f64 (p0, z0, 13)) ++ ++/* ++** ld1_gather_14_f64_offset: ++** mov (x[0-9]+), #?14 ++** ld1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_14_f64_offset, svfloat64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_offset_f64 (p0, z0, 14), ++ z0_res = svld1_gather_offset_f64 (p0, z0, 14)) ++ ++/* ++** ld1_gather_15_f64_offset: ++** mov (x[0-9]+), #?15 ++** ld1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_15_f64_offset, svfloat64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_offset_f64 (p0, z0, 15), ++ z0_res = svld1_gather_offset_f64 (p0, z0, 15)) ++ ++/* ++** ld1_gather_16_f64_offset: ++** ld1d z0\.d, p0/z, \[z0\.d, #16\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS 
(ld1_gather_16_f64_offset, svfloat64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_offset_f64 (p0, z0, 16), ++ z0_res = svld1_gather_offset_f64 (p0, z0, 16)) ++ ++/* ++** ld1_gather_248_f64_offset: ++** ld1d z0\.d, p0/z, \[z0\.d, #248\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_248_f64_offset, svfloat64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_offset_f64 (p0, z0, 248), ++ z0_res = svld1_gather_offset_f64 (p0, z0, 248)) ++ ++/* ++** ld1_gather_256_f64_offset: ++** mov (x[0-9]+), #?256 ++** ld1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_256_f64_offset, svfloat64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_offset_f64 (p0, z0, 256), ++ z0_res = svld1_gather_offset_f64 (p0, z0, 256)) ++ ++/* ++** ld1_gather_x0_f64_index: ++** lsl (x[0-9]+), x0, #?3 ++** ld1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_x0_f64_index, svfloat64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_index_f64 (p0, z0, x0), ++ z0_res = svld1_gather_index_f64 (p0, z0, x0)) ++ ++/* ++** ld1_gather_m1_f64_index: ++** mov (x[0-9]+), #?-8 ++** ld1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_m1_f64_index, svfloat64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_index_f64 (p0, z0, -1), ++ z0_res = svld1_gather_index_f64 (p0, z0, -1)) ++ ++/* ++** ld1_gather_0_f64_index: ++** ld1d z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_0_f64_index, svfloat64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_index_f64 (p0, z0, 0), ++ z0_res = svld1_gather_index_f64 (p0, z0, 0)) ++ ++/* ++** ld1_gather_5_f64_index: ++** ld1d z0\.d, p0/z, \[z0\.d, #40\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_5_f64_index, svfloat64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_index_f64 (p0, z0, 5), ++ z0_res = svld1_gather_index_f64 (p0, z0, 5)) ++ ++/* ++** ld1_gather_31_f64_index: ++** ld1d z0\.d, p0/z, \[z0\.d, #248\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_31_f64_index, svfloat64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_index_f64 (p0, z0, 31), ++ z0_res = svld1_gather_index_f64 (p0, z0, 31)) ++ ++/* ++** ld1_gather_32_f64_index: ++** mov (x[0-9]+), #?256 ++** ld1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_32_f64_index, svfloat64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_index_f64 (p0, z0, 32), ++ z0_res = svld1_gather_index_f64 (p0, z0, 32)) ++ ++/* ++** ld1_gather_x0_f64_s64offset: ++** ld1d z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_x0_f64_s64offset, svfloat64_t, float64_t, svint64_t, ++ z0_res = svld1_gather_s64offset_f64 (p0, x0, z0), ++ z0_res = svld1_gather_offset (p0, x0, z0)) ++ ++/* ++** ld1_gather_tied1_f64_s64offset: ++** ld1d z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_tied1_f64_s64offset, svfloat64_t, float64_t, svint64_t, ++ z0_res = svld1_gather_s64offset_f64 (p0, x0, z0), ++ z0_res = svld1_gather_offset (p0, x0, z0)) ++ ++/* ++** ld1_gather_untied_f64_s64offset: ++** ld1d z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_untied_f64_s64offset, svfloat64_t, float64_t, svint64_t, ++ z0_res = svld1_gather_s64offset_f64 (p0, x0, z1), ++ z0_res = svld1_gather_offset (p0, x0, z1)) ++ ++/* ++** ld1_gather_ext_f64_s64offset: ++** ld1d z0\.d, p0/z, \[x0, z1\.d, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_ext_f64_s64offset, svfloat64_t, float64_t, svint64_t, ++ z0_res = svld1_gather_s64offset_f64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = 
svld1_gather_offset (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ld1_gather_x0_f64_u64offset: ++** ld1d z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_x0_f64_u64offset, svfloat64_t, float64_t, svuint64_t, ++ z0_res = svld1_gather_u64offset_f64 (p0, x0, z0), ++ z0_res = svld1_gather_offset (p0, x0, z0)) ++ ++/* ++** ld1_gather_tied1_f64_u64offset: ++** ld1d z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_tied1_f64_u64offset, svfloat64_t, float64_t, svuint64_t, ++ z0_res = svld1_gather_u64offset_f64 (p0, x0, z0), ++ z0_res = svld1_gather_offset (p0, x0, z0)) ++ ++/* ++** ld1_gather_untied_f64_u64offset: ++** ld1d z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_untied_f64_u64offset, svfloat64_t, float64_t, svuint64_t, ++ z0_res = svld1_gather_u64offset_f64 (p0, x0, z1), ++ z0_res = svld1_gather_offset (p0, x0, z1)) ++ ++/* ++** ld1_gather_ext_f64_u64offset: ++** ld1d z0\.d, p0/z, \[x0, z1\.d, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_ext_f64_u64offset, svfloat64_t, float64_t, svuint64_t, ++ z0_res = svld1_gather_u64offset_f64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svld1_gather_offset (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ld1_gather_x0_f64_s64index: ++** ld1d z0\.d, p0/z, \[x0, z0\.d, lsl 3\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_x0_f64_s64index, svfloat64_t, float64_t, svint64_t, ++ z0_res = svld1_gather_s64index_f64 (p0, x0, z0), ++ z0_res = svld1_gather_index (p0, x0, z0)) ++ ++/* ++** ld1_gather_tied1_f64_s64index: ++** ld1d z0\.d, p0/z, \[x0, z0\.d, lsl 3\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_tied1_f64_s64index, svfloat64_t, float64_t, svint64_t, ++ z0_res = svld1_gather_s64index_f64 (p0, x0, z0), ++ z0_res = svld1_gather_index (p0, x0, z0)) ++ ++/* ++** ld1_gather_untied_f64_s64index: ++** ld1d z0\.d, p0/z, \[x0, z1\.d, lsl 3\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_untied_f64_s64index, svfloat64_t, float64_t, svint64_t, ++ z0_res = svld1_gather_s64index_f64 (p0, x0, z1), ++ z0_res = svld1_gather_index (p0, x0, z1)) ++ ++/* ++** ld1_gather_ext_f64_s64index: ++** ld1d z0\.d, p0/z, \[x0, z1\.d, sxtw 3\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_ext_f64_s64index, svfloat64_t, float64_t, svint64_t, ++ z0_res = svld1_gather_s64index_f64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svld1_gather_index (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ld1_gather_x0_f64_u64index: ++** ld1d z0\.d, p0/z, \[x0, z0\.d, lsl 3\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_x0_f64_u64index, svfloat64_t, float64_t, svuint64_t, ++ z0_res = svld1_gather_u64index_f64 (p0, x0, z0), ++ z0_res = svld1_gather_index (p0, x0, z0)) ++ ++/* ++** ld1_gather_tied1_f64_u64index: ++** ld1d z0\.d, p0/z, \[x0, z0\.d, lsl 3\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_tied1_f64_u64index, svfloat64_t, float64_t, svuint64_t, ++ z0_res = svld1_gather_u64index_f64 (p0, x0, z0), ++ z0_res = svld1_gather_index (p0, x0, z0)) ++ ++/* ++** ld1_gather_untied_f64_u64index: ++** ld1d z0\.d, p0/z, \[x0, z1\.d, lsl 3\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_untied_f64_u64index, svfloat64_t, float64_t, svuint64_t, ++ z0_res = svld1_gather_u64index_f64 (p0, x0, z1), ++ z0_res = svld1_gather_index (p0, x0, z1)) ++ ++/* ++** ld1_gather_ext_f64_u64index: ++** ld1d z0\.d, p0/z, \[x0, z1\.d, uxtw 3\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_ext_f64_u64index, svfloat64_t, float64_t, svuint64_t, ++ z0_res = svld1_gather_u64index_f64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = 
svld1_gather_index (p0, x0, svextw_x (p0, z1))) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_gather_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_gather_s32.c +new file mode 100644 +index 000000000..9b6335547 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_gather_s32.c +@@ -0,0 +1,272 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1_gather_s32_tied1: ++** ld1w z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_s32_tied1, svint32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_s32 (p0, z0), ++ z0_res = svld1_gather_s32 (p0, z0)) ++ ++/* ++** ld1_gather_s32_untied: ++** ld1w z0\.s, p0/z, \[z1\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_s32_untied, svint32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_s32 (p0, z1), ++ z0_res = svld1_gather_s32 (p0, z1)) ++ ++/* ++** ld1_gather_x0_s32_offset: ++** ld1w z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_x0_s32_offset, svint32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_offset_s32 (p0, z0, x0), ++ z0_res = svld1_gather_offset_s32 (p0, z0, x0)) ++ ++/* ++** ld1_gather_m4_s32_offset: ++** mov (x[0-9]+), #?-4 ++** ld1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_m4_s32_offset, svint32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_offset_s32 (p0, z0, -4), ++ z0_res = svld1_gather_offset_s32 (p0, z0, -4)) ++ ++/* ++** ld1_gather_0_s32_offset: ++** ld1w z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_0_s32_offset, svint32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_offset_s32 (p0, z0, 0), ++ z0_res = svld1_gather_offset_s32 (p0, z0, 0)) ++ ++/* ++** ld1_gather_5_s32_offset: ++** mov (x[0-9]+), #?5 ++** ld1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_5_s32_offset, svint32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_offset_s32 (p0, z0, 5), ++ z0_res = svld1_gather_offset_s32 (p0, z0, 5)) ++ ++/* ++** ld1_gather_6_s32_offset: ++** mov (x[0-9]+), #?6 ++** ld1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_6_s32_offset, svint32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_offset_s32 (p0, z0, 6), ++ z0_res = svld1_gather_offset_s32 (p0, z0, 6)) ++ ++/* ++** ld1_gather_7_s32_offset: ++** mov (x[0-9]+), #?7 ++** ld1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_7_s32_offset, svint32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_offset_s32 (p0, z0, 7), ++ z0_res = svld1_gather_offset_s32 (p0, z0, 7)) ++ ++/* ++** ld1_gather_8_s32_offset: ++** ld1w z0\.s, p0/z, \[z0\.s, #8\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_8_s32_offset, svint32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_offset_s32 (p0, z0, 8), ++ z0_res = svld1_gather_offset_s32 (p0, z0, 8)) ++ ++/* ++** ld1_gather_124_s32_offset: ++** ld1w z0\.s, p0/z, \[z0\.s, #124\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_124_s32_offset, svint32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_offset_s32 (p0, z0, 124), ++ z0_res = svld1_gather_offset_s32 (p0, z0, 124)) ++ ++/* ++** ld1_gather_128_s32_offset: ++** mov (x[0-9]+), #?128 ++** ld1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_128_s32_offset, svint32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_offset_s32 (p0, z0, 128), ++ z0_res = svld1_gather_offset_s32 (p0, z0, 128)) ++ ++/* ++** ld1_gather_x0_s32_index: 
++** lsl (x[0-9]+), x0, #?2 ++** ld1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_x0_s32_index, svint32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_index_s32 (p0, z0, x0), ++ z0_res = svld1_gather_index_s32 (p0, z0, x0)) ++ ++/* ++** ld1_gather_m1_s32_index: ++** mov (x[0-9]+), #?-4 ++** ld1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_m1_s32_index, svint32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_index_s32 (p0, z0, -1), ++ z0_res = svld1_gather_index_s32 (p0, z0, -1)) ++ ++/* ++** ld1_gather_0_s32_index: ++** ld1w z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_0_s32_index, svint32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_index_s32 (p0, z0, 0), ++ z0_res = svld1_gather_index_s32 (p0, z0, 0)) ++ ++/* ++** ld1_gather_5_s32_index: ++** ld1w z0\.s, p0/z, \[z0\.s, #20\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_5_s32_index, svint32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_index_s32 (p0, z0, 5), ++ z0_res = svld1_gather_index_s32 (p0, z0, 5)) ++ ++/* ++** ld1_gather_31_s32_index: ++** ld1w z0\.s, p0/z, \[z0\.s, #124\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_31_s32_index, svint32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_index_s32 (p0, z0, 31), ++ z0_res = svld1_gather_index_s32 (p0, z0, 31)) ++ ++/* ++** ld1_gather_32_s32_index: ++** mov (x[0-9]+), #?128 ++** ld1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_32_s32_index, svint32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_index_s32 (p0, z0, 32), ++ z0_res = svld1_gather_index_s32 (p0, z0, 32)) ++ ++/* ++** ld1_gather_x0_s32_s32offset: ++** ld1w z0\.s, p0/z, \[x0, z0\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_x0_s32_s32offset, svint32_t, int32_t, svint32_t, ++ z0_res = svld1_gather_s32offset_s32 (p0, x0, z0), ++ z0_res = svld1_gather_offset (p0, x0, z0)) ++ ++/* ++** ld1_gather_tied1_s32_s32offset: ++** ld1w z0\.s, p0/z, \[x0, z0\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_tied1_s32_s32offset, svint32_t, int32_t, svint32_t, ++ z0_res = svld1_gather_s32offset_s32 (p0, x0, z0), ++ z0_res = svld1_gather_offset (p0, x0, z0)) ++ ++/* ++** ld1_gather_untied_s32_s32offset: ++** ld1w z0\.s, p0/z, \[x0, z1\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_untied_s32_s32offset, svint32_t, int32_t, svint32_t, ++ z0_res = svld1_gather_s32offset_s32 (p0, x0, z1), ++ z0_res = svld1_gather_offset (p0, x0, z1)) ++ ++/* ++** ld1_gather_x0_s32_u32offset: ++** ld1w z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_x0_s32_u32offset, svint32_t, int32_t, svuint32_t, ++ z0_res = svld1_gather_u32offset_s32 (p0, x0, z0), ++ z0_res = svld1_gather_offset (p0, x0, z0)) ++ ++/* ++** ld1_gather_tied1_s32_u32offset: ++** ld1w z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_tied1_s32_u32offset, svint32_t, int32_t, svuint32_t, ++ z0_res = svld1_gather_u32offset_s32 (p0, x0, z0), ++ z0_res = svld1_gather_offset (p0, x0, z0)) ++ ++/* ++** ld1_gather_untied_s32_u32offset: ++** ld1w z0\.s, p0/z, \[x0, z1\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_untied_s32_u32offset, svint32_t, int32_t, svuint32_t, ++ z0_res = svld1_gather_u32offset_s32 (p0, x0, z1), ++ z0_res = svld1_gather_offset (p0, x0, z1)) ++ ++/* ++** ld1_gather_x0_s32_s32index: ++** ld1w z0\.s, p0/z, \[x0, z0\.s, sxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_x0_s32_s32index, svint32_t, int32_t, svint32_t, ++ z0_res 
= svld1_gather_s32index_s32 (p0, x0, z0), ++ z0_res = svld1_gather_index (p0, x0, z0)) ++ ++/* ++** ld1_gather_tied1_s32_s32index: ++** ld1w z0\.s, p0/z, \[x0, z0\.s, sxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_tied1_s32_s32index, svint32_t, int32_t, svint32_t, ++ z0_res = svld1_gather_s32index_s32 (p0, x0, z0), ++ z0_res = svld1_gather_index (p0, x0, z0)) ++ ++/* ++** ld1_gather_untied_s32_s32index: ++** ld1w z0\.s, p0/z, \[x0, z1\.s, sxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_untied_s32_s32index, svint32_t, int32_t, svint32_t, ++ z0_res = svld1_gather_s32index_s32 (p0, x0, z1), ++ z0_res = svld1_gather_index (p0, x0, z1)) ++ ++/* ++** ld1_gather_x0_s32_u32index: ++** ld1w z0\.s, p0/z, \[x0, z0\.s, uxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_x0_s32_u32index, svint32_t, int32_t, svuint32_t, ++ z0_res = svld1_gather_u32index_s32 (p0, x0, z0), ++ z0_res = svld1_gather_index (p0, x0, z0)) ++ ++/* ++** ld1_gather_tied1_s32_u32index: ++** ld1w z0\.s, p0/z, \[x0, z0\.s, uxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_tied1_s32_u32index, svint32_t, int32_t, svuint32_t, ++ z0_res = svld1_gather_u32index_s32 (p0, x0, z0), ++ z0_res = svld1_gather_index (p0, x0, z0)) ++ ++/* ++** ld1_gather_untied_s32_u32index: ++** ld1w z0\.s, p0/z, \[x0, z1\.s, uxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_untied_s32_u32index, svint32_t, int32_t, svuint32_t, ++ z0_res = svld1_gather_u32index_s32 (p0, x0, z1), ++ z0_res = svld1_gather_index (p0, x0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_gather_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_gather_s64.c +new file mode 100644 +index 000000000..c9cea3ad8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_gather_s64.c +@@ -0,0 +1,348 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1_gather_s64_tied1: ++** ld1d z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_s64_tied1, svint64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_s64 (p0, z0), ++ z0_res = svld1_gather_s64 (p0, z0)) ++ ++/* ++** ld1_gather_s64_untied: ++** ld1d z0\.d, p0/z, \[z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_s64_untied, svint64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_s64 (p0, z1), ++ z0_res = svld1_gather_s64 (p0, z1)) ++ ++/* ++** ld1_gather_x0_s64_offset: ++** ld1d z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_x0_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_offset_s64 (p0, z0, x0), ++ z0_res = svld1_gather_offset_s64 (p0, z0, x0)) ++ ++/* ++** ld1_gather_m8_s64_offset: ++** mov (x[0-9]+), #?-8 ++** ld1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_m8_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_offset_s64 (p0, z0, -8), ++ z0_res = svld1_gather_offset_s64 (p0, z0, -8)) ++ ++/* ++** ld1_gather_0_s64_offset: ++** ld1d z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_0_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_offset_s64 (p0, z0, 0), ++ z0_res = svld1_gather_offset_s64 (p0, z0, 0)) ++ ++/* ++** ld1_gather_9_s64_offset: ++** mov (x[0-9]+), #?9 ++** ld1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_9_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_offset_s64 (p0, z0, 9), ++ z0_res = svld1_gather_offset_s64 (p0, z0, 9)) ++ ++/* ++** ld1_gather_10_s64_offset: ++** mov (x[0-9]+), #?10 ++** ld1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_10_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_offset_s64 (p0, z0, 10), ++ z0_res = svld1_gather_offset_s64 (p0, z0, 10)) ++ ++/* ++** ld1_gather_11_s64_offset: ++** mov (x[0-9]+), #?11 ++** ld1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_11_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_offset_s64 (p0, z0, 11), ++ z0_res = svld1_gather_offset_s64 (p0, z0, 11)) ++ ++/* ++** ld1_gather_12_s64_offset: ++** mov (x[0-9]+), #?12 ++** ld1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_12_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_offset_s64 (p0, z0, 12), ++ z0_res = svld1_gather_offset_s64 (p0, z0, 12)) ++ ++/* ++** ld1_gather_13_s64_offset: ++** mov (x[0-9]+), #?13 ++** ld1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_13_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_offset_s64 (p0, z0, 13), ++ z0_res = svld1_gather_offset_s64 (p0, z0, 13)) ++ ++/* ++** ld1_gather_14_s64_offset: ++** mov (x[0-9]+), #?14 ++** ld1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_14_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_offset_s64 (p0, z0, 14), ++ z0_res = svld1_gather_offset_s64 (p0, z0, 14)) ++ ++/* ++** ld1_gather_15_s64_offset: ++** mov (x[0-9]+), #?15 ++** ld1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_15_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_offset_s64 (p0, z0, 15), ++ z0_res = svld1_gather_offset_s64 (p0, z0, 15)) ++ ++/* ++** ld1_gather_16_s64_offset: ++** ld1d z0\.d, p0/z, \[z0\.d, #16\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_16_s64_offset, 
svint64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_offset_s64 (p0, z0, 16), ++ z0_res = svld1_gather_offset_s64 (p0, z0, 16)) ++ ++/* ++** ld1_gather_248_s64_offset: ++** ld1d z0\.d, p0/z, \[z0\.d, #248\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_248_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_offset_s64 (p0, z0, 248), ++ z0_res = svld1_gather_offset_s64 (p0, z0, 248)) ++ ++/* ++** ld1_gather_256_s64_offset: ++** mov (x[0-9]+), #?256 ++** ld1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_256_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_offset_s64 (p0, z0, 256), ++ z0_res = svld1_gather_offset_s64 (p0, z0, 256)) ++ ++/* ++** ld1_gather_x0_s64_index: ++** lsl (x[0-9]+), x0, #?3 ++** ld1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_x0_s64_index, svint64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_index_s64 (p0, z0, x0), ++ z0_res = svld1_gather_index_s64 (p0, z0, x0)) ++ ++/* ++** ld1_gather_m1_s64_index: ++** mov (x[0-9]+), #?-8 ++** ld1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_m1_s64_index, svint64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_index_s64 (p0, z0, -1), ++ z0_res = svld1_gather_index_s64 (p0, z0, -1)) ++ ++/* ++** ld1_gather_0_s64_index: ++** ld1d z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_0_s64_index, svint64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_index_s64 (p0, z0, 0), ++ z0_res = svld1_gather_index_s64 (p0, z0, 0)) ++ ++/* ++** ld1_gather_5_s64_index: ++** ld1d z0\.d, p0/z, \[z0\.d, #40\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_5_s64_index, svint64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_index_s64 (p0, z0, 5), ++ z0_res = svld1_gather_index_s64 (p0, z0, 5)) ++ ++/* ++** ld1_gather_31_s64_index: ++** ld1d z0\.d, p0/z, \[z0\.d, #248\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_31_s64_index, svint64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_index_s64 (p0, z0, 31), ++ z0_res = svld1_gather_index_s64 (p0, z0, 31)) ++ ++/* ++** ld1_gather_32_s64_index: ++** mov (x[0-9]+), #?256 ++** ld1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_32_s64_index, svint64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_index_s64 (p0, z0, 32), ++ z0_res = svld1_gather_index_s64 (p0, z0, 32)) ++ ++/* ++** ld1_gather_x0_s64_s64offset: ++** ld1d z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_x0_s64_s64offset, svint64_t, int64_t, svint64_t, ++ z0_res = svld1_gather_s64offset_s64 (p0, x0, z0), ++ z0_res = svld1_gather_offset (p0, x0, z0)) ++ ++/* ++** ld1_gather_tied1_s64_s64offset: ++** ld1d z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_tied1_s64_s64offset, svint64_t, int64_t, svint64_t, ++ z0_res = svld1_gather_s64offset_s64 (p0, x0, z0), ++ z0_res = svld1_gather_offset (p0, x0, z0)) ++ ++/* ++** ld1_gather_untied_s64_s64offset: ++** ld1d z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_untied_s64_s64offset, svint64_t, int64_t, svint64_t, ++ z0_res = svld1_gather_s64offset_s64 (p0, x0, z1), ++ z0_res = svld1_gather_offset (p0, x0, z1)) ++ ++/* ++** ld1_gather_ext_s64_s64offset: ++** ld1d z0\.d, p0/z, \[x0, z1\.d, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_ext_s64_s64offset, svint64_t, int64_t, svint64_t, ++ z0_res = svld1_gather_s64offset_s64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svld1_gather_offset (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** 
ld1_gather_x0_s64_u64offset: ++** ld1d z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_x0_s64_u64offset, svint64_t, int64_t, svuint64_t, ++ z0_res = svld1_gather_u64offset_s64 (p0, x0, z0), ++ z0_res = svld1_gather_offset (p0, x0, z0)) ++ ++/* ++** ld1_gather_tied1_s64_u64offset: ++** ld1d z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_tied1_s64_u64offset, svint64_t, int64_t, svuint64_t, ++ z0_res = svld1_gather_u64offset_s64 (p0, x0, z0), ++ z0_res = svld1_gather_offset (p0, x0, z0)) ++ ++/* ++** ld1_gather_untied_s64_u64offset: ++** ld1d z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_untied_s64_u64offset, svint64_t, int64_t, svuint64_t, ++ z0_res = svld1_gather_u64offset_s64 (p0, x0, z1), ++ z0_res = svld1_gather_offset (p0, x0, z1)) ++ ++/* ++** ld1_gather_ext_s64_u64offset: ++** ld1d z0\.d, p0/z, \[x0, z1\.d, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_ext_s64_u64offset, svint64_t, int64_t, svuint64_t, ++ z0_res = svld1_gather_u64offset_s64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svld1_gather_offset (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ld1_gather_x0_s64_s64index: ++** ld1d z0\.d, p0/z, \[x0, z0\.d, lsl 3\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_x0_s64_s64index, svint64_t, int64_t, svint64_t, ++ z0_res = svld1_gather_s64index_s64 (p0, x0, z0), ++ z0_res = svld1_gather_index (p0, x0, z0)) ++ ++/* ++** ld1_gather_tied1_s64_s64index: ++** ld1d z0\.d, p0/z, \[x0, z0\.d, lsl 3\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_tied1_s64_s64index, svint64_t, int64_t, svint64_t, ++ z0_res = svld1_gather_s64index_s64 (p0, x0, z0), ++ z0_res = svld1_gather_index (p0, x0, z0)) ++ ++/* ++** ld1_gather_untied_s64_s64index: ++** ld1d z0\.d, p0/z, \[x0, z1\.d, lsl 3\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_untied_s64_s64index, svint64_t, int64_t, svint64_t, ++ z0_res = svld1_gather_s64index_s64 (p0, x0, z1), ++ z0_res = svld1_gather_index (p0, x0, z1)) ++ ++/* ++** ld1_gather_ext_s64_s64index: ++** ld1d z0\.d, p0/z, \[x0, z1\.d, sxtw 3\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_ext_s64_s64index, svint64_t, int64_t, svint64_t, ++ z0_res = svld1_gather_s64index_s64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svld1_gather_index (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ld1_gather_x0_s64_u64index: ++** ld1d z0\.d, p0/z, \[x0, z0\.d, lsl 3\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_x0_s64_u64index, svint64_t, int64_t, svuint64_t, ++ z0_res = svld1_gather_u64index_s64 (p0, x0, z0), ++ z0_res = svld1_gather_index (p0, x0, z0)) ++ ++/* ++** ld1_gather_tied1_s64_u64index: ++** ld1d z0\.d, p0/z, \[x0, z0\.d, lsl 3\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_tied1_s64_u64index, svint64_t, int64_t, svuint64_t, ++ z0_res = svld1_gather_u64index_s64 (p0, x0, z0), ++ z0_res = svld1_gather_index (p0, x0, z0)) ++ ++/* ++** ld1_gather_untied_s64_u64index: ++** ld1d z0\.d, p0/z, \[x0, z1\.d, lsl 3\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_untied_s64_u64index, svint64_t, int64_t, svuint64_t, ++ z0_res = svld1_gather_u64index_s64 (p0, x0, z1), ++ z0_res = svld1_gather_index (p0, x0, z1)) ++ ++/* ++** ld1_gather_ext_s64_u64index: ++** ld1d z0\.d, p0/z, \[x0, z1\.d, uxtw 3\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_ext_s64_u64index, svint64_t, int64_t, svuint64_t, ++ z0_res = svld1_gather_u64index_s64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svld1_gather_index (p0, x0, svextw_x (p0, z1))) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_gather_u32.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_gather_u32.c +new file mode 100644 +index 000000000..2cccc8d49 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_gather_u32.c +@@ -0,0 +1,272 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1_gather_u32_tied1: ++** ld1w z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_u32_tied1, svuint32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_u32 (p0, z0), ++ z0_res = svld1_gather_u32 (p0, z0)) ++ ++/* ++** ld1_gather_u32_untied: ++** ld1w z0\.s, p0/z, \[z1\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_u32_untied, svuint32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_u32 (p0, z1), ++ z0_res = svld1_gather_u32 (p0, z1)) ++ ++/* ++** ld1_gather_x0_u32_offset: ++** ld1w z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_x0_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_offset_u32 (p0, z0, x0), ++ z0_res = svld1_gather_offset_u32 (p0, z0, x0)) ++ ++/* ++** ld1_gather_m4_u32_offset: ++** mov (x[0-9]+), #?-4 ++** ld1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_m4_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_offset_u32 (p0, z0, -4), ++ z0_res = svld1_gather_offset_u32 (p0, z0, -4)) ++ ++/* ++** ld1_gather_0_u32_offset: ++** ld1w z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_0_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_offset_u32 (p0, z0, 0), ++ z0_res = svld1_gather_offset_u32 (p0, z0, 0)) ++ ++/* ++** ld1_gather_5_u32_offset: ++** mov (x[0-9]+), #?5 ++** ld1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_5_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_offset_u32 (p0, z0, 5), ++ z0_res = svld1_gather_offset_u32 (p0, z0, 5)) ++ ++/* ++** ld1_gather_6_u32_offset: ++** mov (x[0-9]+), #?6 ++** ld1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_6_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_offset_u32 (p0, z0, 6), ++ z0_res = svld1_gather_offset_u32 (p0, z0, 6)) ++ ++/* ++** ld1_gather_7_u32_offset: ++** mov (x[0-9]+), #?7 ++** ld1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_7_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_offset_u32 (p0, z0, 7), ++ z0_res = svld1_gather_offset_u32 (p0, z0, 7)) ++ ++/* ++** ld1_gather_8_u32_offset: ++** ld1w z0\.s, p0/z, \[z0\.s, #8\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_8_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_offset_u32 (p0, z0, 8), ++ z0_res = svld1_gather_offset_u32 (p0, z0, 8)) ++ ++/* ++** ld1_gather_124_u32_offset: ++** ld1w z0\.s, p0/z, \[z0\.s, #124\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_124_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_offset_u32 (p0, z0, 124), ++ z0_res = svld1_gather_offset_u32 (p0, z0, 124)) ++ ++/* ++** ld1_gather_128_u32_offset: ++** mov (x[0-9]+), #?128 ++** ld1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_128_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_offset_u32 (p0, z0, 128), ++ z0_res = svld1_gather_offset_u32 (p0, z0, 128)) ++ ++/* ++** ld1_gather_x0_u32_index: ++** lsl (x[0-9]+), x0, #?2 ++** ld1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS 
(ld1_gather_x0_u32_index, svuint32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_index_u32 (p0, z0, x0), ++ z0_res = svld1_gather_index_u32 (p0, z0, x0)) ++ ++/* ++** ld1_gather_m1_u32_index: ++** mov (x[0-9]+), #?-4 ++** ld1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_m1_u32_index, svuint32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_index_u32 (p0, z0, -1), ++ z0_res = svld1_gather_index_u32 (p0, z0, -1)) ++ ++/* ++** ld1_gather_0_u32_index: ++** ld1w z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_0_u32_index, svuint32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_index_u32 (p0, z0, 0), ++ z0_res = svld1_gather_index_u32 (p0, z0, 0)) ++ ++/* ++** ld1_gather_5_u32_index: ++** ld1w z0\.s, p0/z, \[z0\.s, #20\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_5_u32_index, svuint32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_index_u32 (p0, z0, 5), ++ z0_res = svld1_gather_index_u32 (p0, z0, 5)) ++ ++/* ++** ld1_gather_31_u32_index: ++** ld1w z0\.s, p0/z, \[z0\.s, #124\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_31_u32_index, svuint32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_index_u32 (p0, z0, 31), ++ z0_res = svld1_gather_index_u32 (p0, z0, 31)) ++ ++/* ++** ld1_gather_32_u32_index: ++** mov (x[0-9]+), #?128 ++** ld1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_32_u32_index, svuint32_t, svuint32_t, ++ z0_res = svld1_gather_u32base_index_u32 (p0, z0, 32), ++ z0_res = svld1_gather_index_u32 (p0, z0, 32)) ++ ++/* ++** ld1_gather_x0_u32_s32offset: ++** ld1w z0\.s, p0/z, \[x0, z0\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_x0_u32_s32offset, svuint32_t, uint32_t, svint32_t, ++ z0_res = svld1_gather_s32offset_u32 (p0, x0, z0), ++ z0_res = svld1_gather_offset (p0, x0, z0)) ++ ++/* ++** ld1_gather_tied1_u32_s32offset: ++** ld1w z0\.s, p0/z, \[x0, z0\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_tied1_u32_s32offset, svuint32_t, uint32_t, svint32_t, ++ z0_res = svld1_gather_s32offset_u32 (p0, x0, z0), ++ z0_res = svld1_gather_offset (p0, x0, z0)) ++ ++/* ++** ld1_gather_untied_u32_s32offset: ++** ld1w z0\.s, p0/z, \[x0, z1\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_untied_u32_s32offset, svuint32_t, uint32_t, svint32_t, ++ z0_res = svld1_gather_s32offset_u32 (p0, x0, z1), ++ z0_res = svld1_gather_offset (p0, x0, z1)) ++ ++/* ++** ld1_gather_x0_u32_u32offset: ++** ld1w z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_x0_u32_u32offset, svuint32_t, uint32_t, svuint32_t, ++ z0_res = svld1_gather_u32offset_u32 (p0, x0, z0), ++ z0_res = svld1_gather_offset (p0, x0, z0)) ++ ++/* ++** ld1_gather_tied1_u32_u32offset: ++** ld1w z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_tied1_u32_u32offset, svuint32_t, uint32_t, svuint32_t, ++ z0_res = svld1_gather_u32offset_u32 (p0, x0, z0), ++ z0_res = svld1_gather_offset (p0, x0, z0)) ++ ++/* ++** ld1_gather_untied_u32_u32offset: ++** ld1w z0\.s, p0/z, \[x0, z1\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_untied_u32_u32offset, svuint32_t, uint32_t, svuint32_t, ++ z0_res = svld1_gather_u32offset_u32 (p0, x0, z1), ++ z0_res = svld1_gather_offset (p0, x0, z1)) ++ ++/* ++** ld1_gather_x0_u32_s32index: ++** ld1w z0\.s, p0/z, \[x0, z0\.s, sxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_x0_u32_s32index, svuint32_t, uint32_t, svint32_t, ++ z0_res = svld1_gather_s32index_u32 (p0, x0, z0), ++ z0_res = svld1_gather_index (p0, x0, z0)) 
++ ++/* ++** ld1_gather_tied1_u32_s32index: ++** ld1w z0\.s, p0/z, \[x0, z0\.s, sxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_tied1_u32_s32index, svuint32_t, uint32_t, svint32_t, ++ z0_res = svld1_gather_s32index_u32 (p0, x0, z0), ++ z0_res = svld1_gather_index (p0, x0, z0)) ++ ++/* ++** ld1_gather_untied_u32_s32index: ++** ld1w z0\.s, p0/z, \[x0, z1\.s, sxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_untied_u32_s32index, svuint32_t, uint32_t, svint32_t, ++ z0_res = svld1_gather_s32index_u32 (p0, x0, z1), ++ z0_res = svld1_gather_index (p0, x0, z1)) ++ ++/* ++** ld1_gather_x0_u32_u32index: ++** ld1w z0\.s, p0/z, \[x0, z0\.s, uxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_x0_u32_u32index, svuint32_t, uint32_t, svuint32_t, ++ z0_res = svld1_gather_u32index_u32 (p0, x0, z0), ++ z0_res = svld1_gather_index (p0, x0, z0)) ++ ++/* ++** ld1_gather_tied1_u32_u32index: ++** ld1w z0\.s, p0/z, \[x0, z0\.s, uxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_tied1_u32_u32index, svuint32_t, uint32_t, svuint32_t, ++ z0_res = svld1_gather_u32index_u32 (p0, x0, z0), ++ z0_res = svld1_gather_index (p0, x0, z0)) ++ ++/* ++** ld1_gather_untied_u32_u32index: ++** ld1w z0\.s, p0/z, \[x0, z1\.s, uxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_untied_u32_u32index, svuint32_t, uint32_t, svuint32_t, ++ z0_res = svld1_gather_u32index_u32 (p0, x0, z1), ++ z0_res = svld1_gather_index (p0, x0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_gather_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_gather_u64.c +new file mode 100644 +index 000000000..6ee1d48ab +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_gather_u64.c +@@ -0,0 +1,348 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1_gather_u64_tied1: ++** ld1d z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_u64_tied1, svuint64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_u64 (p0, z0), ++ z0_res = svld1_gather_u64 (p0, z0)) ++ ++/* ++** ld1_gather_u64_untied: ++** ld1d z0\.d, p0/z, \[z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_u64_untied, svuint64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_u64 (p0, z1), ++ z0_res = svld1_gather_u64 (p0, z1)) ++ ++/* ++** ld1_gather_x0_u64_offset: ++** ld1d z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_x0_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_offset_u64 (p0, z0, x0), ++ z0_res = svld1_gather_offset_u64 (p0, z0, x0)) ++ ++/* ++** ld1_gather_m8_u64_offset: ++** mov (x[0-9]+), #?-8 ++** ld1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_m8_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_offset_u64 (p0, z0, -8), ++ z0_res = svld1_gather_offset_u64 (p0, z0, -8)) ++ ++/* ++** ld1_gather_0_u64_offset: ++** ld1d z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_0_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_offset_u64 (p0, z0, 0), ++ z0_res = svld1_gather_offset_u64 (p0, z0, 0)) ++ ++/* ++** ld1_gather_9_u64_offset: ++** mov (x[0-9]+), #?9 ++** ld1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_9_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_offset_u64 (p0, z0, 9), ++ z0_res = svld1_gather_offset_u64 (p0, z0, 9)) ++ ++/* ++** ld1_gather_10_u64_offset: ++** mov (x[0-9]+), #?10 ++** ld1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_10_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_offset_u64 (p0, z0, 10), ++ z0_res = svld1_gather_offset_u64 (p0, z0, 10)) ++ ++/* ++** ld1_gather_11_u64_offset: ++** mov (x[0-9]+), #?11 ++** ld1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_11_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_offset_u64 (p0, z0, 11), ++ z0_res = svld1_gather_offset_u64 (p0, z0, 11)) ++ ++/* ++** ld1_gather_12_u64_offset: ++** mov (x[0-9]+), #?12 ++** ld1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_12_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_offset_u64 (p0, z0, 12), ++ z0_res = svld1_gather_offset_u64 (p0, z0, 12)) ++ ++/* ++** ld1_gather_13_u64_offset: ++** mov (x[0-9]+), #?13 ++** ld1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_13_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_offset_u64 (p0, z0, 13), ++ z0_res = svld1_gather_offset_u64 (p0, z0, 13)) ++ ++/* ++** ld1_gather_14_u64_offset: ++** mov (x[0-9]+), #?14 ++** ld1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_14_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_offset_u64 (p0, z0, 14), ++ z0_res = svld1_gather_offset_u64 (p0, z0, 14)) ++ ++/* ++** ld1_gather_15_u64_offset: ++** mov (x[0-9]+), #?15 ++** ld1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_15_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_offset_u64 (p0, z0, 15), ++ z0_res = svld1_gather_offset_u64 (p0, z0, 15)) ++ ++/* ++** ld1_gather_16_u64_offset: ++** ld1d z0\.d, p0/z, \[z0\.d, #16\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS 
(ld1_gather_16_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_offset_u64 (p0, z0, 16), ++ z0_res = svld1_gather_offset_u64 (p0, z0, 16)) ++ ++/* ++** ld1_gather_248_u64_offset: ++** ld1d z0\.d, p0/z, \[z0\.d, #248\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_248_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_offset_u64 (p0, z0, 248), ++ z0_res = svld1_gather_offset_u64 (p0, z0, 248)) ++ ++/* ++** ld1_gather_256_u64_offset: ++** mov (x[0-9]+), #?256 ++** ld1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_256_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_offset_u64 (p0, z0, 256), ++ z0_res = svld1_gather_offset_u64 (p0, z0, 256)) ++ ++/* ++** ld1_gather_x0_u64_index: ++** lsl (x[0-9]+), x0, #?3 ++** ld1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_x0_u64_index, svuint64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_index_u64 (p0, z0, x0), ++ z0_res = svld1_gather_index_u64 (p0, z0, x0)) ++ ++/* ++** ld1_gather_m1_u64_index: ++** mov (x[0-9]+), #?-8 ++** ld1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_m1_u64_index, svuint64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_index_u64 (p0, z0, -1), ++ z0_res = svld1_gather_index_u64 (p0, z0, -1)) ++ ++/* ++** ld1_gather_0_u64_index: ++** ld1d z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_0_u64_index, svuint64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_index_u64 (p0, z0, 0), ++ z0_res = svld1_gather_index_u64 (p0, z0, 0)) ++ ++/* ++** ld1_gather_5_u64_index: ++** ld1d z0\.d, p0/z, \[z0\.d, #40\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_5_u64_index, svuint64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_index_u64 (p0, z0, 5), ++ z0_res = svld1_gather_index_u64 (p0, z0, 5)) ++ ++/* ++** ld1_gather_31_u64_index: ++** ld1d z0\.d, p0/z, \[z0\.d, #248\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_31_u64_index, svuint64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_index_u64 (p0, z0, 31), ++ z0_res = svld1_gather_index_u64 (p0, z0, 31)) ++ ++/* ++** ld1_gather_32_u64_index: ++** mov (x[0-9]+), #?256 ++** ld1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1_gather_32_u64_index, svuint64_t, svuint64_t, ++ z0_res = svld1_gather_u64base_index_u64 (p0, z0, 32), ++ z0_res = svld1_gather_index_u64 (p0, z0, 32)) ++ ++/* ++** ld1_gather_x0_u64_s64offset: ++** ld1d z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_x0_u64_s64offset, svuint64_t, uint64_t, svint64_t, ++ z0_res = svld1_gather_s64offset_u64 (p0, x0, z0), ++ z0_res = svld1_gather_offset (p0, x0, z0)) ++ ++/* ++** ld1_gather_tied1_u64_s64offset: ++** ld1d z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_tied1_u64_s64offset, svuint64_t, uint64_t, svint64_t, ++ z0_res = svld1_gather_s64offset_u64 (p0, x0, z0), ++ z0_res = svld1_gather_offset (p0, x0, z0)) ++ ++/* ++** ld1_gather_untied_u64_s64offset: ++** ld1d z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_untied_u64_s64offset, svuint64_t, uint64_t, svint64_t, ++ z0_res = svld1_gather_s64offset_u64 (p0, x0, z1), ++ z0_res = svld1_gather_offset (p0, x0, z1)) ++ ++/* ++** ld1_gather_ext_u64_s64offset: ++** ld1d z0\.d, p0/z, \[x0, z1\.d, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_ext_u64_s64offset, svuint64_t, uint64_t, svint64_t, ++ z0_res = svld1_gather_s64offset_u64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svld1_gather_offset (p0, x0, 
svextw_x (p0, z1))) ++ ++/* ++** ld1_gather_x0_u64_u64offset: ++** ld1d z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_x0_u64_u64offset, svuint64_t, uint64_t, svuint64_t, ++ z0_res = svld1_gather_u64offset_u64 (p0, x0, z0), ++ z0_res = svld1_gather_offset (p0, x0, z0)) ++ ++/* ++** ld1_gather_tied1_u64_u64offset: ++** ld1d z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_tied1_u64_u64offset, svuint64_t, uint64_t, svuint64_t, ++ z0_res = svld1_gather_u64offset_u64 (p0, x0, z0), ++ z0_res = svld1_gather_offset (p0, x0, z0)) ++ ++/* ++** ld1_gather_untied_u64_u64offset: ++** ld1d z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_untied_u64_u64offset, svuint64_t, uint64_t, svuint64_t, ++ z0_res = svld1_gather_u64offset_u64 (p0, x0, z1), ++ z0_res = svld1_gather_offset (p0, x0, z1)) ++ ++/* ++** ld1_gather_ext_u64_u64offset: ++** ld1d z0\.d, p0/z, \[x0, z1\.d, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_ext_u64_u64offset, svuint64_t, uint64_t, svuint64_t, ++ z0_res = svld1_gather_u64offset_u64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svld1_gather_offset (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ld1_gather_x0_u64_s64index: ++** ld1d z0\.d, p0/z, \[x0, z0\.d, lsl 3\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_x0_u64_s64index, svuint64_t, uint64_t, svint64_t, ++ z0_res = svld1_gather_s64index_u64 (p0, x0, z0), ++ z0_res = svld1_gather_index (p0, x0, z0)) ++ ++/* ++** ld1_gather_tied1_u64_s64index: ++** ld1d z0\.d, p0/z, \[x0, z0\.d, lsl 3\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_tied1_u64_s64index, svuint64_t, uint64_t, svint64_t, ++ z0_res = svld1_gather_s64index_u64 (p0, x0, z0), ++ z0_res = svld1_gather_index (p0, x0, z0)) ++ ++/* ++** ld1_gather_untied_u64_s64index: ++** ld1d z0\.d, p0/z, \[x0, z1\.d, lsl 3\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_untied_u64_s64index, svuint64_t, uint64_t, svint64_t, ++ z0_res = svld1_gather_s64index_u64 (p0, x0, z1), ++ z0_res = svld1_gather_index (p0, x0, z1)) ++ ++/* ++** ld1_gather_ext_u64_s64index: ++** ld1d z0\.d, p0/z, \[x0, z1\.d, sxtw 3\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_ext_u64_s64index, svuint64_t, uint64_t, svint64_t, ++ z0_res = svld1_gather_s64index_u64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svld1_gather_index (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ld1_gather_x0_u64_u64index: ++** ld1d z0\.d, p0/z, \[x0, z0\.d, lsl 3\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_x0_u64_u64index, svuint64_t, uint64_t, svuint64_t, ++ z0_res = svld1_gather_u64index_u64 (p0, x0, z0), ++ z0_res = svld1_gather_index (p0, x0, z0)) ++ ++/* ++** ld1_gather_tied1_u64_u64index: ++** ld1d z0\.d, p0/z, \[x0, z0\.d, lsl 3\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_tied1_u64_u64index, svuint64_t, uint64_t, svuint64_t, ++ z0_res = svld1_gather_u64index_u64 (p0, x0, z0), ++ z0_res = svld1_gather_index (p0, x0, z0)) ++ ++/* ++** ld1_gather_untied_u64_u64index: ++** ld1d z0\.d, p0/z, \[x0, z1\.d, lsl 3\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_untied_u64_u64index, svuint64_t, uint64_t, svuint64_t, ++ z0_res = svld1_gather_u64index_u64 (p0, x0, z1), ++ z0_res = svld1_gather_index (p0, x0, z1)) ++ ++/* ++** ld1_gather_ext_u64_u64index: ++** ld1d z0\.d, p0/z, \[x0, z1\.d, uxtw 3\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1_gather_ext_u64_u64index, svuint64_t, uint64_t, svuint64_t, ++ z0_res = svld1_gather_u64index_u64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svld1_gather_index (p0, x0, svextw_x (p0, z1))) +diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_s16.c +new file mode 100644 +index 000000000..d86b49a73 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_s16.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1_s16_base: ++** ld1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_s16_base, svint16_t, int16_t, ++ z0 = svld1_s16 (p0, x0), ++ z0 = svld1 (p0, x0)) ++ ++/* ++** ld1_s16_index: ++** ld1h z0\.h, p0/z, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_LOAD (ld1_s16_index, svint16_t, int16_t, ++ z0 = svld1_s16 (p0, x0 + x1), ++ z0 = svld1 (p0, x0 + x1)) ++ ++/* ++** ld1_s16_1: ++** ld1h z0\.h, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_s16_1, svint16_t, int16_t, ++ z0 = svld1_s16 (p0, x0 + svcnth ()), ++ z0 = svld1 (p0, x0 + svcnth ())) ++ ++/* ++** ld1_s16_7: ++** ld1h z0\.h, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_s16_7, svint16_t, int16_t, ++ z0 = svld1_s16 (p0, x0 + svcnth () * 7), ++ z0 = svld1 (p0, x0 + svcnth () * 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1_s16_8: ++** incb x0, all, mul #8 ++** ld1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_s16_8, svint16_t, int16_t, ++ z0 = svld1_s16 (p0, x0 + svcnth () * 8), ++ z0 = svld1 (p0, x0 + svcnth () * 8)) ++ ++/* ++** ld1_s16_m1: ++** ld1h z0\.h, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_s16_m1, svint16_t, int16_t, ++ z0 = svld1_s16 (p0, x0 - svcnth ()), ++ z0 = svld1 (p0, x0 - svcnth ())) ++ ++/* ++** ld1_s16_m8: ++** ld1h z0\.h, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_s16_m8, svint16_t, int16_t, ++ z0 = svld1_s16 (p0, x0 - svcnth () * 8), ++ z0 = svld1 (p0, x0 - svcnth () * 8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1_s16_m9: ++** decb x0, all, mul #9 ++** ld1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_s16_m9, svint16_t, int16_t, ++ z0 = svld1_s16 (p0, x0 - svcnth () * 9), ++ z0 = svld1 (p0, x0 - svcnth () * 9)) ++ ++/* ++** ld1_vnum_s16_0: ++** ld1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_s16_0, svint16_t, int16_t, ++ z0 = svld1_vnum_s16 (p0, x0, 0), ++ z0 = svld1_vnum (p0, x0, 0)) ++ ++/* ++** ld1_vnum_s16_1: ++** ld1h z0\.h, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_s16_1, svint16_t, int16_t, ++ z0 = svld1_vnum_s16 (p0, x0, 1), ++ z0 = svld1_vnum (p0, x0, 1)) ++ ++/* ++** ld1_vnum_s16_7: ++** ld1h z0\.h, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_s16_7, svint16_t, int16_t, ++ z0 = svld1_vnum_s16 (p0, x0, 7), ++ z0 = svld1_vnum (p0, x0, 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1_vnum_s16_8: ++** incb x0, all, mul #8 ++** ld1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_s16_8, svint16_t, int16_t, ++ z0 = svld1_vnum_s16 (p0, x0, 8), ++ z0 = svld1_vnum (p0, x0, 8)) ++ ++/* ++** ld1_vnum_s16_m1: ++** ld1h z0\.h, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_s16_m1, svint16_t, int16_t, ++ z0 = svld1_vnum_s16 (p0, x0, -1), ++ z0 = svld1_vnum (p0, x0, -1)) ++ ++/* ++** ld1_vnum_s16_m8: ++** ld1h z0\.h, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_s16_m8, svint16_t, int16_t, ++ z0 = svld1_vnum_s16 (p0, x0, -8), ++ z0 = svld1_vnum (p0, x0, -8)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld1_vnum_s16_m9: ++** decb x0, all, mul #9 ++** ld1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_s16_m9, svint16_t, int16_t, ++ z0 = svld1_vnum_s16 (p0, x0, -9), ++ z0 = svld1_vnum (p0, x0, -9)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ld1_vnum_s16_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld1h z0\.h, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_s16_x1, svint16_t, int16_t, ++ z0 = svld1_vnum_s16 (p0, x0, x1), ++ z0 = svld1_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_s32.c +new file mode 100644 +index 000000000..5b692e510 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_s32.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1_s32_base: ++** ld1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_s32_base, svint32_t, int32_t, ++ z0 = svld1_s32 (p0, x0), ++ z0 = svld1 (p0, x0)) ++ ++/* ++** ld1_s32_index: ++** ld1w z0\.s, p0/z, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_LOAD (ld1_s32_index, svint32_t, int32_t, ++ z0 = svld1_s32 (p0, x0 + x1), ++ z0 = svld1 (p0, x0 + x1)) ++ ++/* ++** ld1_s32_1: ++** ld1w z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_s32_1, svint32_t, int32_t, ++ z0 = svld1_s32 (p0, x0 + svcntw ()), ++ z0 = svld1 (p0, x0 + svcntw ())) ++ ++/* ++** ld1_s32_7: ++** ld1w z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_s32_7, svint32_t, int32_t, ++ z0 = svld1_s32 (p0, x0 + svcntw () * 7), ++ z0 = svld1 (p0, x0 + svcntw () * 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1_s32_8: ++** incb x0, all, mul #8 ++** ld1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_s32_8, svint32_t, int32_t, ++ z0 = svld1_s32 (p0, x0 + svcntw () * 8), ++ z0 = svld1 (p0, x0 + svcntw () * 8)) ++ ++/* ++** ld1_s32_m1: ++** ld1w z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_s32_m1, svint32_t, int32_t, ++ z0 = svld1_s32 (p0, x0 - svcntw ()), ++ z0 = svld1 (p0, x0 - svcntw ())) ++ ++/* ++** ld1_s32_m8: ++** ld1w z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_s32_m8, svint32_t, int32_t, ++ z0 = svld1_s32 (p0, x0 - svcntw () * 8), ++ z0 = svld1 (p0, x0 - svcntw () * 8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1_s32_m9: ++** decb x0, all, mul #9 ++** ld1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_s32_m9, svint32_t, int32_t, ++ z0 = svld1_s32 (p0, x0 - svcntw () * 9), ++ z0 = svld1 (p0, x0 - svcntw () * 9)) ++ ++/* ++** ld1_vnum_s32_0: ++** ld1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_s32_0, svint32_t, int32_t, ++ z0 = svld1_vnum_s32 (p0, x0, 0), ++ z0 = svld1_vnum (p0, x0, 0)) ++ ++/* ++** ld1_vnum_s32_1: ++** ld1w z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_s32_1, svint32_t, int32_t, ++ z0 = svld1_vnum_s32 (p0, x0, 1), ++ z0 = svld1_vnum (p0, x0, 1)) ++ ++/* ++** ld1_vnum_s32_7: ++** ld1w z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_s32_7, svint32_t, int32_t, ++ z0 = svld1_vnum_s32 (p0, x0, 7), ++ z0 = svld1_vnum (p0, x0, 7)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld1_vnum_s32_8: ++** incb x0, all, mul #8 ++** ld1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_s32_8, svint32_t, int32_t, ++ z0 = svld1_vnum_s32 (p0, x0, 8), ++ z0 = svld1_vnum (p0, x0, 8)) ++ ++/* ++** ld1_vnum_s32_m1: ++** ld1w z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_s32_m1, svint32_t, int32_t, ++ z0 = svld1_vnum_s32 (p0, x0, -1), ++ z0 = svld1_vnum (p0, x0, -1)) ++ ++/* ++** ld1_vnum_s32_m8: ++** ld1w z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_s32_m8, svint32_t, int32_t, ++ z0 = svld1_vnum_s32 (p0, x0, -8), ++ z0 = svld1_vnum (p0, x0, -8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1_vnum_s32_m9: ++** decb x0, all, mul #9 ++** ld1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_s32_m9, svint32_t, int32_t, ++ z0 = svld1_vnum_s32 (p0, x0, -9), ++ z0 = svld1_vnum (p0, x0, -9)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ld1_vnum_s32_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld1w z0\.s, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_s32_x1, svint32_t, int32_t, ++ z0 = svld1_vnum_s32 (p0, x0, x1), ++ z0 = svld1_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_s64.c +new file mode 100644 +index 000000000..15ee29bba +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_s64.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1_s64_base: ++** ld1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_s64_base, svint64_t, int64_t, ++ z0 = svld1_s64 (p0, x0), ++ z0 = svld1 (p0, x0)) ++ ++/* ++** ld1_s64_index: ++** ld1d z0\.d, p0/z, \[x0, x1, lsl 3\] ++** ret ++*/ ++TEST_LOAD (ld1_s64_index, svint64_t, int64_t, ++ z0 = svld1_s64 (p0, x0 + x1), ++ z0 = svld1 (p0, x0 + x1)) ++ ++/* ++** ld1_s64_1: ++** ld1d z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_s64_1, svint64_t, int64_t, ++ z0 = svld1_s64 (p0, x0 + svcntd ()), ++ z0 = svld1 (p0, x0 + svcntd ())) ++ ++/* ++** ld1_s64_7: ++** ld1d z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_s64_7, svint64_t, int64_t, ++ z0 = svld1_s64 (p0, x0 + svcntd () * 7), ++ z0 = svld1 (p0, x0 + svcntd () * 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1_s64_8: ++** incb x0, all, mul #8 ++** ld1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_s64_8, svint64_t, int64_t, ++ z0 = svld1_s64 (p0, x0 + svcntd () * 8), ++ z0 = svld1 (p0, x0 + svcntd () * 8)) ++ ++/* ++** ld1_s64_m1: ++** ld1d z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_s64_m1, svint64_t, int64_t, ++ z0 = svld1_s64 (p0, x0 - svcntd ()), ++ z0 = svld1 (p0, x0 - svcntd ())) ++ ++/* ++** ld1_s64_m8: ++** ld1d z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_s64_m8, svint64_t, int64_t, ++ z0 = svld1_s64 (p0, x0 - svcntd () * 8), ++ z0 = svld1 (p0, x0 - svcntd () * 8)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld1_s64_m9: ++** decb x0, all, mul #9 ++** ld1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_s64_m9, svint64_t, int64_t, ++ z0 = svld1_s64 (p0, x0 - svcntd () * 9), ++ z0 = svld1 (p0, x0 - svcntd () * 9)) ++ ++/* ++** ld1_vnum_s64_0: ++** ld1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_s64_0, svint64_t, int64_t, ++ z0 = svld1_vnum_s64 (p0, x0, 0), ++ z0 = svld1_vnum (p0, x0, 0)) ++ ++/* ++** ld1_vnum_s64_1: ++** ld1d z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_s64_1, svint64_t, int64_t, ++ z0 = svld1_vnum_s64 (p0, x0, 1), ++ z0 = svld1_vnum (p0, x0, 1)) ++ ++/* ++** ld1_vnum_s64_7: ++** ld1d z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_s64_7, svint64_t, int64_t, ++ z0 = svld1_vnum_s64 (p0, x0, 7), ++ z0 = svld1_vnum (p0, x0, 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1_vnum_s64_8: ++** incb x0, all, mul #8 ++** ld1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_s64_8, svint64_t, int64_t, ++ z0 = svld1_vnum_s64 (p0, x0, 8), ++ z0 = svld1_vnum (p0, x0, 8)) ++ ++/* ++** ld1_vnum_s64_m1: ++** ld1d z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_s64_m1, svint64_t, int64_t, ++ z0 = svld1_vnum_s64 (p0, x0, -1), ++ z0 = svld1_vnum (p0, x0, -1)) ++ ++/* ++** ld1_vnum_s64_m8: ++** ld1d z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_s64_m8, svint64_t, int64_t, ++ z0 = svld1_vnum_s64 (p0, x0, -8), ++ z0 = svld1_vnum (p0, x0, -8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1_vnum_s64_m9: ++** decb x0, all, mul #9 ++** ld1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_s64_m9, svint64_t, int64_t, ++ z0 = svld1_vnum_s64 (p0, x0, -9), ++ z0 = svld1_vnum (p0, x0, -9)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ld1_vnum_s64_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld1d z0\.d, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_s64_x1, svint64_t, int64_t, ++ z0 = svld1_vnum_s64 (p0, x0, x1), ++ z0 = svld1_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_s8.c +new file mode 100644 +index 000000000..036fb3d41 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_s8.c +@@ -0,0 +1,162 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1_s8_base: ++** ld1b z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_s8_base, svint8_t, int8_t, ++ z0 = svld1_s8 (p0, x0), ++ z0 = svld1 (p0, x0)) ++ ++/* ++** ld1_s8_index: ++** ld1b z0\.b, p0/z, \[x0, x1\] ++** ret ++*/ ++TEST_LOAD (ld1_s8_index, svint8_t, int8_t, ++ z0 = svld1_s8 (p0, x0 + x1), ++ z0 = svld1 (p0, x0 + x1)) ++ ++/* ++** ld1_s8_1: ++** ld1b z0\.b, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_s8_1, svint8_t, int8_t, ++ z0 = svld1_s8 (p0, x0 + svcntb ()), ++ z0 = svld1 (p0, x0 + svcntb ())) ++ ++/* ++** ld1_s8_7: ++** ld1b z0\.b, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_s8_7, svint8_t, int8_t, ++ z0 = svld1_s8 (p0, x0 + svcntb () * 7), ++ z0 = svld1 (p0, x0 + svcntb () * 7)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld1_s8_8: ++** incb x0, all, mul #8 ++** ld1b z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_s8_8, svint8_t, int8_t, ++ z0 = svld1_s8 (p0, x0 + svcntb () * 8), ++ z0 = svld1 (p0, x0 + svcntb () * 8)) ++ ++/* ++** ld1_s8_m1: ++** ld1b z0\.b, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_s8_m1, svint8_t, int8_t, ++ z0 = svld1_s8 (p0, x0 - svcntb ()), ++ z0 = svld1 (p0, x0 - svcntb ())) ++ ++/* ++** ld1_s8_m8: ++** ld1b z0\.b, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_s8_m8, svint8_t, int8_t, ++ z0 = svld1_s8 (p0, x0 - svcntb () * 8), ++ z0 = svld1 (p0, x0 - svcntb () * 8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1_s8_m9: ++** decb x0, all, mul #9 ++** ld1b z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_s8_m9, svint8_t, int8_t, ++ z0 = svld1_s8 (p0, x0 - svcntb () * 9), ++ z0 = svld1 (p0, x0 - svcntb () * 9)) ++ ++/* ++** ld1_vnum_s8_0: ++** ld1b z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_s8_0, svint8_t, int8_t, ++ z0 = svld1_vnum_s8 (p0, x0, 0), ++ z0 = svld1_vnum (p0, x0, 0)) ++ ++/* ++** ld1_vnum_s8_1: ++** ld1b z0\.b, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_s8_1, svint8_t, int8_t, ++ z0 = svld1_vnum_s8 (p0, x0, 1), ++ z0 = svld1_vnum (p0, x0, 1)) ++ ++/* ++** ld1_vnum_s8_7: ++** ld1b z0\.b, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_s8_7, svint8_t, int8_t, ++ z0 = svld1_vnum_s8 (p0, x0, 7), ++ z0 = svld1_vnum (p0, x0, 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1_vnum_s8_8: ++** incb x0, all, mul #8 ++** ld1b z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_s8_8, svint8_t, int8_t, ++ z0 = svld1_vnum_s8 (p0, x0, 8), ++ z0 = svld1_vnum (p0, x0, 8)) ++ ++/* ++** ld1_vnum_s8_m1: ++** ld1b z0\.b, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_s8_m1, svint8_t, int8_t, ++ z0 = svld1_vnum_s8 (p0, x0, -1), ++ z0 = svld1_vnum (p0, x0, -1)) ++ ++/* ++** ld1_vnum_s8_m8: ++** ld1b z0\.b, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_s8_m8, svint8_t, int8_t, ++ z0 = svld1_vnum_s8 (p0, x0, -8), ++ z0 = svld1_vnum (p0, x0, -8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1_vnum_s8_m9: ++** decb x0, all, mul #9 ++** ld1b z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_s8_m9, svint8_t, int8_t, ++ z0 = svld1_vnum_s8 (p0, x0, -9), ++ z0 = svld1_vnum (p0, x0, -9)) ++ ++/* ++** ld1_vnum_s8_x1: ++** cntb (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ld1b z0\.b, p0/z, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** ld1b z0\.b, p0/z, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_LOAD (ld1_vnum_s8_x1, svint8_t, int8_t, ++ z0 = svld1_vnum_s8 (p0, x0, x1), ++ z0 = svld1_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_u16.c +new file mode 100644 +index 000000000..ee25b9e37 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_u16.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1_u16_base: ++** ld1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_u16_base, svuint16_t, uint16_t, ++ z0 = svld1_u16 (p0, x0), ++ z0 = svld1 (p0, x0)) ++ ++/* ++** ld1_u16_index: ++** ld1h z0\.h, p0/z, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_LOAD (ld1_u16_index, svuint16_t, uint16_t, ++ z0 = svld1_u16 (p0, x0 + x1), ++ z0 = svld1 (p0, x0 + x1)) ++ ++/* ++** ld1_u16_1: ++** ld1h z0\.h, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_u16_1, svuint16_t, uint16_t, ++ z0 = svld1_u16 (p0, x0 + svcnth ()), ++ z0 = svld1 (p0, x0 + svcnth ())) ++ ++/* ++** ld1_u16_7: ++** ld1h z0\.h, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_u16_7, svuint16_t, uint16_t, ++ z0 = svld1_u16 (p0, x0 + svcnth () * 7), ++ z0 = svld1 (p0, x0 + svcnth () * 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1_u16_8: ++** incb x0, all, mul #8 ++** ld1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_u16_8, svuint16_t, uint16_t, ++ z0 = svld1_u16 (p0, x0 + svcnth () * 8), ++ z0 = svld1 (p0, x0 + svcnth () * 8)) ++ ++/* ++** ld1_u16_m1: ++** ld1h z0\.h, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_u16_m1, svuint16_t, uint16_t, ++ z0 = svld1_u16 (p0, x0 - svcnth ()), ++ z0 = svld1 (p0, x0 - svcnth ())) ++ ++/* ++** ld1_u16_m8: ++** ld1h z0\.h, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_u16_m8, svuint16_t, uint16_t, ++ z0 = svld1_u16 (p0, x0 - svcnth () * 8), ++ z0 = svld1 (p0, x0 - svcnth () * 8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1_u16_m9: ++** decb x0, all, mul #9 ++** ld1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_u16_m9, svuint16_t, uint16_t, ++ z0 = svld1_u16 (p0, x0 - svcnth () * 9), ++ z0 = svld1 (p0, x0 - svcnth () * 9)) ++ ++/* ++** ld1_vnum_u16_0: ++** ld1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_u16_0, svuint16_t, uint16_t, ++ z0 = svld1_vnum_u16 (p0, x0, 0), ++ z0 = svld1_vnum (p0, x0, 0)) ++ ++/* ++** ld1_vnum_u16_1: ++** ld1h z0\.h, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_u16_1, svuint16_t, uint16_t, ++ z0 = svld1_vnum_u16 (p0, x0, 1), ++ z0 = svld1_vnum (p0, x0, 1)) ++ ++/* ++** ld1_vnum_u16_7: ++** ld1h z0\.h, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_u16_7, svuint16_t, uint16_t, ++ z0 = svld1_vnum_u16 (p0, x0, 7), ++ z0 = svld1_vnum (p0, x0, 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1_vnum_u16_8: ++** incb x0, all, mul #8 ++** ld1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_u16_8, svuint16_t, uint16_t, ++ z0 = svld1_vnum_u16 (p0, x0, 8), ++ z0 = svld1_vnum (p0, x0, 8)) ++ ++/* ++** ld1_vnum_u16_m1: ++** ld1h z0\.h, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_u16_m1, svuint16_t, uint16_t, ++ z0 = svld1_vnum_u16 (p0, x0, -1), ++ z0 = svld1_vnum (p0, x0, -1)) ++ ++/* ++** ld1_vnum_u16_m8: ++** ld1h z0\.h, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_u16_m8, svuint16_t, uint16_t, ++ z0 = svld1_vnum_u16 (p0, x0, -8), ++ z0 = svld1_vnum (p0, x0, -8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1_vnum_u16_m9: ++** decb x0, all, mul #9 ++** ld1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_u16_m9, svuint16_t, uint16_t, ++ z0 = svld1_vnum_u16 (p0, x0, -9), ++ z0 = svld1_vnum (p0, x0, -9)) ++ ++/* Using MUL to calculate an index would also be OK. 
*/ ++/* ++** ld1_vnum_u16_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld1h z0\.h, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_u16_x1, svuint16_t, uint16_t, ++ z0 = svld1_vnum_u16 (p0, x0, x1), ++ z0 = svld1_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_u32.c +new file mode 100644 +index 000000000..bcd304126 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_u32.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1_u32_base: ++** ld1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_u32_base, svuint32_t, uint32_t, ++ z0 = svld1_u32 (p0, x0), ++ z0 = svld1 (p0, x0)) ++ ++/* ++** ld1_u32_index: ++** ld1w z0\.s, p0/z, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_LOAD (ld1_u32_index, svuint32_t, uint32_t, ++ z0 = svld1_u32 (p0, x0 + x1), ++ z0 = svld1 (p0, x0 + x1)) ++ ++/* ++** ld1_u32_1: ++** ld1w z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_u32_1, svuint32_t, uint32_t, ++ z0 = svld1_u32 (p0, x0 + svcntw ()), ++ z0 = svld1 (p0, x0 + svcntw ())) ++ ++/* ++** ld1_u32_7: ++** ld1w z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_u32_7, svuint32_t, uint32_t, ++ z0 = svld1_u32 (p0, x0 + svcntw () * 7), ++ z0 = svld1 (p0, x0 + svcntw () * 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1_u32_8: ++** incb x0, all, mul #8 ++** ld1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_u32_8, svuint32_t, uint32_t, ++ z0 = svld1_u32 (p0, x0 + svcntw () * 8), ++ z0 = svld1 (p0, x0 + svcntw () * 8)) ++ ++/* ++** ld1_u32_m1: ++** ld1w z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_u32_m1, svuint32_t, uint32_t, ++ z0 = svld1_u32 (p0, x0 - svcntw ()), ++ z0 = svld1 (p0, x0 - svcntw ())) ++ ++/* ++** ld1_u32_m8: ++** ld1w z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_u32_m8, svuint32_t, uint32_t, ++ z0 = svld1_u32 (p0, x0 - svcntw () * 8), ++ z0 = svld1 (p0, x0 - svcntw () * 8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1_u32_m9: ++** decb x0, all, mul #9 ++** ld1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_u32_m9, svuint32_t, uint32_t, ++ z0 = svld1_u32 (p0, x0 - svcntw () * 9), ++ z0 = svld1 (p0, x0 - svcntw () * 9)) ++ ++/* ++** ld1_vnum_u32_0: ++** ld1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_u32_0, svuint32_t, uint32_t, ++ z0 = svld1_vnum_u32 (p0, x0, 0), ++ z0 = svld1_vnum (p0, x0, 0)) ++ ++/* ++** ld1_vnum_u32_1: ++** ld1w z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_u32_1, svuint32_t, uint32_t, ++ z0 = svld1_vnum_u32 (p0, x0, 1), ++ z0 = svld1_vnum (p0, x0, 1)) ++ ++/* ++** ld1_vnum_u32_7: ++** ld1w z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_u32_7, svuint32_t, uint32_t, ++ z0 = svld1_vnum_u32 (p0, x0, 7), ++ z0 = svld1_vnum (p0, x0, 7)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld1_vnum_u32_8: ++** incb x0, all, mul #8 ++** ld1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_u32_8, svuint32_t, uint32_t, ++ z0 = svld1_vnum_u32 (p0, x0, 8), ++ z0 = svld1_vnum (p0, x0, 8)) ++ ++/* ++** ld1_vnum_u32_m1: ++** ld1w z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_u32_m1, svuint32_t, uint32_t, ++ z0 = svld1_vnum_u32 (p0, x0, -1), ++ z0 = svld1_vnum (p0, x0, -1)) ++ ++/* ++** ld1_vnum_u32_m8: ++** ld1w z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_u32_m8, svuint32_t, uint32_t, ++ z0 = svld1_vnum_u32 (p0, x0, -8), ++ z0 = svld1_vnum (p0, x0, -8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1_vnum_u32_m9: ++** decb x0, all, mul #9 ++** ld1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_u32_m9, svuint32_t, uint32_t, ++ z0 = svld1_vnum_u32 (p0, x0, -9), ++ z0 = svld1_vnum (p0, x0, -9)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ld1_vnum_u32_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld1w z0\.s, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_u32_x1, svuint32_t, uint32_t, ++ z0 = svld1_vnum_u32 (p0, x0, x1), ++ z0 = svld1_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_u64.c +new file mode 100644 +index 000000000..ebb874720 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_u64.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1_u64_base: ++** ld1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_u64_base, svuint64_t, uint64_t, ++ z0 = svld1_u64 (p0, x0), ++ z0 = svld1 (p0, x0)) ++ ++/* ++** ld1_u64_index: ++** ld1d z0\.d, p0/z, \[x0, x1, lsl 3\] ++** ret ++*/ ++TEST_LOAD (ld1_u64_index, svuint64_t, uint64_t, ++ z0 = svld1_u64 (p0, x0 + x1), ++ z0 = svld1 (p0, x0 + x1)) ++ ++/* ++** ld1_u64_1: ++** ld1d z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_u64_1, svuint64_t, uint64_t, ++ z0 = svld1_u64 (p0, x0 + svcntd ()), ++ z0 = svld1 (p0, x0 + svcntd ())) ++ ++/* ++** ld1_u64_7: ++** ld1d z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_u64_7, svuint64_t, uint64_t, ++ z0 = svld1_u64 (p0, x0 + svcntd () * 7), ++ z0 = svld1 (p0, x0 + svcntd () * 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1_u64_8: ++** incb x0, all, mul #8 ++** ld1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_u64_8, svuint64_t, uint64_t, ++ z0 = svld1_u64 (p0, x0 + svcntd () * 8), ++ z0 = svld1 (p0, x0 + svcntd () * 8)) ++ ++/* ++** ld1_u64_m1: ++** ld1d z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_u64_m1, svuint64_t, uint64_t, ++ z0 = svld1_u64 (p0, x0 - svcntd ()), ++ z0 = svld1 (p0, x0 - svcntd ())) ++ ++/* ++** ld1_u64_m8: ++** ld1d z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_u64_m8, svuint64_t, uint64_t, ++ z0 = svld1_u64 (p0, x0 - svcntd () * 8), ++ z0 = svld1 (p0, x0 - svcntd () * 8)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld1_u64_m9: ++** decb x0, all, mul #9 ++** ld1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_u64_m9, svuint64_t, uint64_t, ++ z0 = svld1_u64 (p0, x0 - svcntd () * 9), ++ z0 = svld1 (p0, x0 - svcntd () * 9)) ++ ++/* ++** ld1_vnum_u64_0: ++** ld1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_u64_0, svuint64_t, uint64_t, ++ z0 = svld1_vnum_u64 (p0, x0, 0), ++ z0 = svld1_vnum (p0, x0, 0)) ++ ++/* ++** ld1_vnum_u64_1: ++** ld1d z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_u64_1, svuint64_t, uint64_t, ++ z0 = svld1_vnum_u64 (p0, x0, 1), ++ z0 = svld1_vnum (p0, x0, 1)) ++ ++/* ++** ld1_vnum_u64_7: ++** ld1d z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_u64_7, svuint64_t, uint64_t, ++ z0 = svld1_vnum_u64 (p0, x0, 7), ++ z0 = svld1_vnum (p0, x0, 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1_vnum_u64_8: ++** incb x0, all, mul #8 ++** ld1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_u64_8, svuint64_t, uint64_t, ++ z0 = svld1_vnum_u64 (p0, x0, 8), ++ z0 = svld1_vnum (p0, x0, 8)) ++ ++/* ++** ld1_vnum_u64_m1: ++** ld1d z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_u64_m1, svuint64_t, uint64_t, ++ z0 = svld1_vnum_u64 (p0, x0, -1), ++ z0 = svld1_vnum (p0, x0, -1)) ++ ++/* ++** ld1_vnum_u64_m8: ++** ld1d z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_u64_m8, svuint64_t, uint64_t, ++ z0 = svld1_vnum_u64 (p0, x0, -8), ++ z0 = svld1_vnum (p0, x0, -8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1_vnum_u64_m9: ++** decb x0, all, mul #9 ++** ld1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_u64_m9, svuint64_t, uint64_t, ++ z0 = svld1_vnum_u64 (p0, x0, -9), ++ z0 = svld1_vnum (p0, x0, -9)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ld1_vnum_u64_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld1d z0\.d, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_u64_x1, svuint64_t, uint64_t, ++ z0 = svld1_vnum_u64 (p0, x0, x1), ++ z0 = svld1_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_u8.c +new file mode 100644 +index 000000000..12f42bd92 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_u8.c +@@ -0,0 +1,162 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1_u8_base: ++** ld1b z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_u8_base, svuint8_t, uint8_t, ++ z0 = svld1_u8 (p0, x0), ++ z0 = svld1 (p0, x0)) ++ ++/* ++** ld1_u8_index: ++** ld1b z0\.b, p0/z, \[x0, x1\] ++** ret ++*/ ++TEST_LOAD (ld1_u8_index, svuint8_t, uint8_t, ++ z0 = svld1_u8 (p0, x0 + x1), ++ z0 = svld1 (p0, x0 + x1)) ++ ++/* ++** ld1_u8_1: ++** ld1b z0\.b, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_u8_1, svuint8_t, uint8_t, ++ z0 = svld1_u8 (p0, x0 + svcntb ()), ++ z0 = svld1 (p0, x0 + svcntb ())) ++ ++/* ++** ld1_u8_7: ++** ld1b z0\.b, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_u8_7, svuint8_t, uint8_t, ++ z0 = svld1_u8 (p0, x0 + svcntb () * 7), ++ z0 = svld1 (p0, x0 + svcntb () * 7)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld1_u8_8: ++** incb x0, all, mul #8 ++** ld1b z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_u8_8, svuint8_t, uint8_t, ++ z0 = svld1_u8 (p0, x0 + svcntb () * 8), ++ z0 = svld1 (p0, x0 + svcntb () * 8)) ++ ++/* ++** ld1_u8_m1: ++** ld1b z0\.b, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_u8_m1, svuint8_t, uint8_t, ++ z0 = svld1_u8 (p0, x0 - svcntb ()), ++ z0 = svld1 (p0, x0 - svcntb ())) ++ ++/* ++** ld1_u8_m8: ++** ld1b z0\.b, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_u8_m8, svuint8_t, uint8_t, ++ z0 = svld1_u8 (p0, x0 - svcntb () * 8), ++ z0 = svld1 (p0, x0 - svcntb () * 8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1_u8_m9: ++** decb x0, all, mul #9 ++** ld1b z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_u8_m9, svuint8_t, uint8_t, ++ z0 = svld1_u8 (p0, x0 - svcntb () * 9), ++ z0 = svld1 (p0, x0 - svcntb () * 9)) ++ ++/* ++** ld1_vnum_u8_0: ++** ld1b z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_u8_0, svuint8_t, uint8_t, ++ z0 = svld1_vnum_u8 (p0, x0, 0), ++ z0 = svld1_vnum (p0, x0, 0)) ++ ++/* ++** ld1_vnum_u8_1: ++** ld1b z0\.b, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_u8_1, svuint8_t, uint8_t, ++ z0 = svld1_vnum_u8 (p0, x0, 1), ++ z0 = svld1_vnum (p0, x0, 1)) ++ ++/* ++** ld1_vnum_u8_7: ++** ld1b z0\.b, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_u8_7, svuint8_t, uint8_t, ++ z0 = svld1_vnum_u8 (p0, x0, 7), ++ z0 = svld1_vnum (p0, x0, 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1_vnum_u8_8: ++** incb x0, all, mul #8 ++** ld1b z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_u8_8, svuint8_t, uint8_t, ++ z0 = svld1_vnum_u8 (p0, x0, 8), ++ z0 = svld1_vnum (p0, x0, 8)) ++ ++/* ++** ld1_vnum_u8_m1: ++** ld1b z0\.b, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_u8_m1, svuint8_t, uint8_t, ++ z0 = svld1_vnum_u8 (p0, x0, -1), ++ z0 = svld1_vnum (p0, x0, -1)) ++ ++/* ++** ld1_vnum_u8_m8: ++** ld1b z0\.b, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_u8_m8, svuint8_t, uint8_t, ++ z0 = svld1_vnum_u8 (p0, x0, -8), ++ z0 = svld1_vnum (p0, x0, -8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1_vnum_u8_m9: ++** decb x0, all, mul #9 ++** ld1b z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1_vnum_u8_m9, svuint8_t, uint8_t, ++ z0 = svld1_vnum_u8 (p0, x0, -9), ++ z0 = svld1_vnum (p0, x0, -9)) ++ ++/* ++** ld1_vnum_u8_x1: ++** cntb (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ld1b z0\.b, p0/z, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** ld1b z0\.b, p0/z, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_LOAD (ld1_vnum_u8_x1, svuint8_t, uint8_t, ++ z0 = svld1_vnum_u8 (p0, x0, x1), ++ z0 = svld1_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_bf16.c +new file mode 100644 +index 000000000..cb1801778 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_bf16.c +@@ -0,0 +1,120 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++/* { dg-additional-options "-march=armv8.6-a+f64mm" } */ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1ro_bf16_base: ++** ld1roh z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1ro_bf16_base, svbfloat16_t, bfloat16_t, ++ z0 = svld1ro_bf16 (p0, x0), ++ z0 = svld1ro (p0, x0)) ++ ++/* ++** ld1ro_bf16_index: ++** ld1roh z0\.h, p0/z, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_bf16_index, svbfloat16_t, bfloat16_t, ++ z0 = svld1ro_bf16 (p0, x0 + x1), ++ z0 = svld1ro (p0, x0 + x1)) ++ ++/* ++** ld1ro_bf16_1: ++** add (x[0-9]+), x0, #?2 ++** ld1roh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_bf16_1, svbfloat16_t, bfloat16_t, ++ z0 = svld1ro_bf16 (p0, x0 + 1), ++ z0 = svld1ro (p0, x0 + 1)) ++ ++/* ++** ld1ro_bf16_8: ++** add (x[0-9]+), x0, #?16 ++** ld1roh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_bf16_8, svbfloat16_t, bfloat16_t, ++ z0 = svld1ro_bf16 (p0, x0 + 8), ++ z0 = svld1ro (p0, x0 + 8)) ++ ++/* ++** ld1ro_bf16_128: ++** add (x[0-9]+), x0, #?256 ++** ld1roh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_bf16_128, svbfloat16_t, bfloat16_t, ++ z0 = svld1ro_bf16 (p0, x0 + 128), ++ z0 = svld1ro (p0, x0 + 128)) ++ ++/* ++** ld1ro_bf16_m1: ++** sub (x[0-9]+), x0, #?2 ++** ld1roh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_bf16_m1, svbfloat16_t, bfloat16_t, ++ z0 = svld1ro_bf16 (p0, x0 - 1), ++ z0 = svld1ro (p0, x0 - 1)) ++ ++/* ++** ld1ro_bf16_m8: ++** sub (x[0-9]+), x0, #?16 ++** ld1roh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_bf16_m8, svbfloat16_t, bfloat16_t, ++ z0 = svld1ro_bf16 (p0, x0 - 8), ++ z0 = svld1ro (p0, x0 - 8)) ++ ++/* ++** ld1ro_bf16_m144: ++** sub (x[0-9]+), x0, #?288 ++** ld1roh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_bf16_m144, svbfloat16_t, bfloat16_t, ++ z0 = svld1ro_bf16 (p0, x0 - 144), ++ z0 = svld1ro (p0, x0 - 144)) ++ ++/* ++** ld1ro_bf16_16: ++** ld1roh z0\.h, p0/z, \[x0, #?32\] ++** ret ++*/ ++TEST_LOAD (ld1ro_bf16_16, svbfloat16_t, bfloat16_t, ++ z0 = svld1ro_bf16 (p0, x0 + 16), ++ z0 = svld1ro (p0, x0 + 16)) ++ ++/* ++** ld1ro_bf16_112: ++** ld1roh z0\.h, p0/z, \[x0, #?224\] ++** ret ++*/ ++TEST_LOAD (ld1ro_bf16_112, svbfloat16_t, bfloat16_t, ++ z0 = svld1ro_bf16 (p0, x0 + 112), ++ z0 = svld1ro (p0, x0 + 112)) ++ ++/* ++** ld1ro_bf16_m16: ++** ld1roh z0\.h, p0/z, \[x0, #?-32\] ++** ret ++*/ ++TEST_LOAD (ld1ro_bf16_m16, svbfloat16_t, bfloat16_t, ++ z0 = svld1ro_bf16 (p0, x0 - 16), ++ z0 = svld1ro (p0, x0 - 16)) ++ ++/* ++** ld1ro_bf16_m128: ++** ld1roh z0\.h, p0/z, \[x0, #?-256\] ++** ret ++*/ ++TEST_LOAD (ld1ro_bf16_m128, svbfloat16_t, bfloat16_t, ++ z0 = svld1ro_bf16 (p0, x0 - 128), ++ z0 = svld1ro (p0, x0 - 128)) ++ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_f16.c +new file mode 100644 +index 000000000..86081edbd +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_f16.c +@@ -0,0 +1,120 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++/* { dg-additional-options "-march=armv8.6-a+f64mm" } */ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1ro_f16_base: ++** ld1roh z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1ro_f16_base, svfloat16_t, float16_t, ++ z0 = svld1ro_f16 (p0, x0), ++ z0 = svld1ro (p0, x0)) ++ ++/* ++** ld1ro_f16_index: ++** ld1roh z0\.h, p0/z, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_f16_index, svfloat16_t, float16_t, ++ z0 = svld1ro_f16 (p0, x0 + x1), ++ z0 = svld1ro (p0, x0 + x1)) ++ ++/* ++** ld1ro_f16_1: ++** add (x[0-9]+), x0, #?2 ++** ld1roh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_f16_1, svfloat16_t, float16_t, ++ z0 = svld1ro_f16 (p0, x0 + 1), ++ z0 = svld1ro (p0, x0 + 1)) ++ ++/* ++** ld1ro_f16_8: ++** add (x[0-9]+), x0, #?16 ++** ld1roh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_f16_8, svfloat16_t, float16_t, ++ z0 = svld1ro_f16 (p0, x0 + 8), ++ z0 = svld1ro (p0, x0 + 8)) ++ ++/* ++** ld1ro_f16_128: ++** add (x[0-9]+), x0, #?256 ++** ld1roh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_f16_128, svfloat16_t, float16_t, ++ z0 = svld1ro_f16 (p0, x0 + 128), ++ z0 = svld1ro (p0, x0 + 128)) ++ ++/* ++** ld1ro_f16_m1: ++** sub (x[0-9]+), x0, #?2 ++** ld1roh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_f16_m1, svfloat16_t, float16_t, ++ z0 = svld1ro_f16 (p0, x0 - 1), ++ z0 = svld1ro (p0, x0 - 1)) ++ ++/* ++** ld1ro_f16_m8: ++** sub (x[0-9]+), x0, #?16 ++** ld1roh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_f16_m8, svfloat16_t, float16_t, ++ z0 = svld1ro_f16 (p0, x0 - 8), ++ z0 = svld1ro (p0, x0 - 8)) ++ ++/* ++** ld1ro_f16_m144: ++** sub (x[0-9]+), x0, #?288 ++** ld1roh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_f16_m144, svfloat16_t, float16_t, ++ z0 = svld1ro_f16 (p0, x0 - 144), ++ z0 = svld1ro (p0, x0 - 144)) ++ ++/* ++** ld1ro_f16_16: ++** ld1roh z0\.h, p0/z, \[x0, #?32\] ++** ret ++*/ ++TEST_LOAD (ld1ro_f16_16, svfloat16_t, float16_t, ++ z0 = svld1ro_f16 (p0, x0 + 16), ++ z0 = svld1ro (p0, x0 + 16)) ++ ++/* ++** ld1ro_f16_112: ++** ld1roh z0\.h, p0/z, \[x0, #?224\] ++** ret ++*/ ++TEST_LOAD (ld1ro_f16_112, svfloat16_t, float16_t, ++ z0 = svld1ro_f16 (p0, x0 + 112), ++ z0 = svld1ro (p0, x0 + 112)) ++ ++/* ++** ld1ro_f16_m16: ++** ld1roh z0\.h, p0/z, \[x0, #?-32\] ++** ret ++*/ ++TEST_LOAD (ld1ro_f16_m16, svfloat16_t, float16_t, ++ z0 = svld1ro_f16 (p0, x0 - 16), ++ z0 = svld1ro (p0, x0 - 16)) ++ ++/* ++** ld1ro_f16_m128: ++** ld1roh z0\.h, p0/z, \[x0, #?-256\] ++** ret ++*/ ++TEST_LOAD (ld1ro_f16_m128, svfloat16_t, float16_t, ++ z0 = svld1ro_f16 (p0, x0 - 128), ++ z0 = svld1ro (p0, x0 - 128)) ++ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_f32.c +new file mode 100644 +index 000000000..c8df00f8a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_f32.c +@@ -0,0 +1,120 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++/* { dg-additional-options "-march=armv8.6-a+f64mm" } */ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1ro_f32_base: ++** ld1row z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1ro_f32_base, svfloat32_t, float32_t, ++ z0 = svld1ro_f32 (p0, x0), ++ z0 = svld1ro (p0, x0)) ++ ++/* ++** ld1ro_f32_index: ++** ld1row z0\.s, p0/z, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_LOAD (ld1ro_f32_index, svfloat32_t, float32_t, ++ z0 = svld1ro_f32 (p0, x0 + x1), ++ z0 = svld1ro (p0, x0 + x1)) ++ ++/* ++** ld1ro_f32_1: ++** add (x[0-9]+), x0, #?4 ++** ld1row z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_f32_1, svfloat32_t, float32_t, ++ z0 = svld1ro_f32 (p0, x0 + 1), ++ z0 = svld1ro (p0, x0 + 1)) ++ ++/* ++** ld1ro_f32_4: ++** add (x[0-9]+), x0, #?16 ++** ld1row z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_f32_4, svfloat32_t, float32_t, ++ z0 = svld1ro_f32 (p0, x0 + 4), ++ z0 = svld1ro (p0, x0 + 4)) ++ ++/* ++** ld1ro_f32_64: ++** add (x[0-9]+), x0, #?256 ++** ld1row z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_f32_64, svfloat32_t, float32_t, ++ z0 = svld1ro_f32 (p0, x0 + 64), ++ z0 = svld1ro (p0, x0 + 64)) ++ ++/* ++** ld1ro_f32_m1: ++** sub (x[0-9]+), x0, #?4 ++** ld1row z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_f32_m1, svfloat32_t, float32_t, ++ z0 = svld1ro_f32 (p0, x0 - 1), ++ z0 = svld1ro (p0, x0 - 1)) ++ ++/* ++** ld1ro_f32_m4: ++** sub (x[0-9]+), x0, #?16 ++** ld1row z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_f32_m4, svfloat32_t, float32_t, ++ z0 = svld1ro_f32 (p0, x0 - 4), ++ z0 = svld1ro (p0, x0 - 4)) ++ ++/* ++** ld1ro_f32_m72: ++** sub (x[0-9]+), x0, #?288 ++** ld1row z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_f32_m72, svfloat32_t, float32_t, ++ z0 = svld1ro_f32 (p0, x0 - 72), ++ z0 = svld1ro (p0, x0 - 72)) ++ ++/* ++** ld1ro_f32_8: ++** ld1row z0\.s, p0/z, \[x0, #?32\] ++** ret ++*/ ++TEST_LOAD (ld1ro_f32_8, svfloat32_t, float32_t, ++ z0 = svld1ro_f32 (p0, x0 + 8), ++ z0 = svld1ro (p0, x0 + 8)) ++ ++/* ++** ld1ro_f32_56: ++** ld1row z0\.s, p0/z, \[x0, #?224\] ++** ret ++*/ ++TEST_LOAD (ld1ro_f32_56, svfloat32_t, float32_t, ++ z0 = svld1ro_f32 (p0, x0 + 56), ++ z0 = svld1ro (p0, x0 + 56)) ++ ++/* ++** ld1ro_f32_m8: ++** ld1row z0\.s, p0/z, \[x0, #?-32\] ++** ret ++*/ ++TEST_LOAD (ld1ro_f32_m8, svfloat32_t, float32_t, ++ z0 = svld1ro_f32 (p0, x0 - 8), ++ z0 = svld1ro (p0, x0 - 8)) ++ ++/* ++** ld1ro_f32_m64: ++** ld1row z0\.s, p0/z, \[x0, #?-256\] ++** ret ++*/ ++TEST_LOAD (ld1ro_f32_m64, svfloat32_t, float32_t, ++ z0 = svld1ro_f32 (p0, x0 - 64), ++ z0 = svld1ro (p0, x0 - 64)) ++ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_f64.c +new file mode 100644 +index 000000000..2fb9d5b74 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_f64.c +@@ -0,0 +1,120 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++/* { dg-additional-options "-march=armv8.6-a+f64mm" } */ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1ro_f64_base: ++** ld1rod z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1ro_f64_base, svfloat64_t, float64_t, ++ z0 = svld1ro_f64 (p0, x0), ++ z0 = svld1ro (p0, x0)) ++ ++/* ++** ld1ro_f64_index: ++** ld1rod z0\.d, p0/z, \[x0, x1, lsl 3\] ++** ret ++*/ ++TEST_LOAD (ld1ro_f64_index, svfloat64_t, float64_t, ++ z0 = svld1ro_f64 (p0, x0 + x1), ++ z0 = svld1ro (p0, x0 + x1)) ++ ++/* ++** ld1ro_f64_1: ++** add (x[0-9]+), x0, #?8 ++** ld1rod z0\.d, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_f64_1, svfloat64_t, float64_t, ++ z0 = svld1ro_f64 (p0, x0 + 1), ++ z0 = svld1ro (p0, x0 + 1)) ++ ++/* ++** ld1ro_f64_2: ++** add (x[0-9]+), x0, #?16 ++** ld1rod z0\.d, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_f64_2, svfloat64_t, float64_t, ++ z0 = svld1ro_f64 (p0, x0 + 2), ++ z0 = svld1ro (p0, x0 + 2)) ++ ++/* ++** ld1ro_f64_32: ++** add (x[0-9]+), x0, #?256 ++** ld1rod z0\.d, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_f64_32, svfloat64_t, float64_t, ++ z0 = svld1ro_f64 (p0, x0 + 32), ++ z0 = svld1ro (p0, x0 + 32)) ++ ++/* ++** ld1ro_f64_m1: ++** sub (x[0-9]+), x0, #?8 ++** ld1rod z0\.d, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_f64_m1, svfloat64_t, float64_t, ++ z0 = svld1ro_f64 (p0, x0 - 1), ++ z0 = svld1ro (p0, x0 - 1)) ++ ++/* ++** ld1ro_f64_m2: ++** sub (x[0-9]+), x0, #?16 ++** ld1rod z0\.d, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_f64_m2, svfloat64_t, float64_t, ++ z0 = svld1ro_f64 (p0, x0 - 2), ++ z0 = svld1ro (p0, x0 - 2)) ++ ++/* ++** ld1ro_f64_m36: ++** sub (x[0-9]+), x0, #?288 ++** ld1rod z0\.d, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_f64_m36, svfloat64_t, float64_t, ++ z0 = svld1ro_f64 (p0, x0 - 36), ++ z0 = svld1ro (p0, x0 - 36)) ++ ++/* ++** ld1ro_f64_4: ++** ld1rod z0\.d, p0/z, \[x0, #?32\] ++** ret ++*/ ++TEST_LOAD (ld1ro_f64_4, svfloat64_t, float64_t, ++ z0 = svld1ro_f64 (p0, x0 + 4), ++ z0 = svld1ro (p0, x0 + 4)) ++ ++/* ++** ld1ro_f64_28: ++** ld1rod z0\.d, p0/z, \[x0, #?224\] ++** ret ++*/ ++TEST_LOAD (ld1ro_f64_28, svfloat64_t, float64_t, ++ z0 = svld1ro_f64 (p0, x0 + 28), ++ z0 = svld1ro (p0, x0 + 28)) ++ ++/* ++** ld1ro_f64_m4: ++** ld1rod z0\.d, p0/z, \[x0, #?-32\] ++** ret ++*/ ++TEST_LOAD (ld1ro_f64_m4, svfloat64_t, float64_t, ++ z0 = svld1ro_f64 (p0, x0 - 4), ++ z0 = svld1ro (p0, x0 - 4)) ++ ++/* ++** ld1ro_f64_m32: ++** ld1rod z0\.d, p0/z, \[x0, #?-256\] ++** ret ++*/ ++TEST_LOAD (ld1ro_f64_m32, svfloat64_t, float64_t, ++ z0 = svld1ro_f64 (p0, x0 - 32), ++ z0 = svld1ro (p0, x0 - 32)) ++ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_s16.c +new file mode 100644 +index 000000000..3cd211b16 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_s16.c +@@ -0,0 +1,120 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++/* { dg-additional-options "-march=armv8.6-a+f64mm" } */ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1ro_s16_base: ++** ld1roh z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1ro_s16_base, svint16_t, int16_t, ++ z0 = svld1ro_s16 (p0, x0), ++ z0 = svld1ro (p0, x0)) ++ ++/* ++** ld1ro_s16_index: ++** ld1roh z0\.h, p0/z, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_s16_index, svint16_t, int16_t, ++ z0 = svld1ro_s16 (p0, x0 + x1), ++ z0 = svld1ro (p0, x0 + x1)) ++ ++/* ++** ld1ro_s16_1: ++** add (x[0-9]+), x0, #?2 ++** ld1roh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_s16_1, svint16_t, int16_t, ++ z0 = svld1ro_s16 (p0, x0 + 1), ++ z0 = svld1ro (p0, x0 + 1)) ++ ++/* ++** ld1ro_s16_8: ++** add (x[0-9]+), x0, #?16 ++** ld1roh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_s16_8, svint16_t, int16_t, ++ z0 = svld1ro_s16 (p0, x0 + 8), ++ z0 = svld1ro (p0, x0 + 8)) ++ ++/* ++** ld1ro_s16_128: ++** add (x[0-9]+), x0, #?256 ++** ld1roh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_s16_128, svint16_t, int16_t, ++ z0 = svld1ro_s16 (p0, x0 + 128), ++ z0 = svld1ro (p0, x0 + 128)) ++ ++/* ++** ld1ro_s16_m1: ++** sub (x[0-9]+), x0, #?2 ++** ld1roh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_s16_m1, svint16_t, int16_t, ++ z0 = svld1ro_s16 (p0, x0 - 1), ++ z0 = svld1ro (p0, x0 - 1)) ++ ++/* ++** ld1ro_s16_m8: ++** sub (x[0-9]+), x0, #?16 ++** ld1roh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_s16_m8, svint16_t, int16_t, ++ z0 = svld1ro_s16 (p0, x0 - 8), ++ z0 = svld1ro (p0, x0 - 8)) ++ ++/* ++** ld1ro_s16_m144: ++** sub (x[0-9]+), x0, #?288 ++** ld1roh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_s16_m144, svint16_t, int16_t, ++ z0 = svld1ro_s16 (p0, x0 - 144), ++ z0 = svld1ro (p0, x0 - 144)) ++ ++/* ++** ld1ro_s16_16: ++** ld1roh z0\.h, p0/z, \[x0, #?32\] ++** ret ++*/ ++TEST_LOAD (ld1ro_s16_16, svint16_t, int16_t, ++ z0 = svld1ro_s16 (p0, x0 + 16), ++ z0 = svld1ro (p0, x0 + 16)) ++ ++/* ++** ld1ro_s16_112: ++** ld1roh z0\.h, p0/z, \[x0, #?224\] ++** ret ++*/ ++TEST_LOAD (ld1ro_s16_112, svint16_t, int16_t, ++ z0 = svld1ro_s16 (p0, x0 + 112), ++ z0 = svld1ro (p0, x0 + 112)) ++ ++/* ++** ld1ro_s16_m16: ++** ld1roh z0\.h, p0/z, \[x0, #?-32\] ++** ret ++*/ ++TEST_LOAD (ld1ro_s16_m16, svint16_t, int16_t, ++ z0 = svld1ro_s16 (p0, x0 - 16), ++ z0 = svld1ro (p0, x0 - 16)) ++ ++/* ++** ld1ro_s16_m128: ++** ld1roh z0\.h, p0/z, \[x0, #?-256\] ++** ret ++*/ ++TEST_LOAD (ld1ro_s16_m128, svint16_t, int16_t, ++ z0 = svld1ro_s16 (p0, x0 - 128), ++ z0 = svld1ro (p0, x0 - 128)) ++ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_s32.c +new file mode 100644 +index 000000000..44b16ed5f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_s32.c +@@ -0,0 +1,120 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++/* { dg-additional-options "-march=armv8.6-a+f64mm" } */ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1ro_s32_base: ++** ld1row z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1ro_s32_base, svint32_t, int32_t, ++ z0 = svld1ro_s32 (p0, x0), ++ z0 = svld1ro (p0, x0)) ++ ++/* ++** ld1ro_s32_index: ++** ld1row z0\.s, p0/z, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_LOAD (ld1ro_s32_index, svint32_t, int32_t, ++ z0 = svld1ro_s32 (p0, x0 + x1), ++ z0 = svld1ro (p0, x0 + x1)) ++ ++/* ++** ld1ro_s32_1: ++** add (x[0-9]+), x0, #?4 ++** ld1row z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_s32_1, svint32_t, int32_t, ++ z0 = svld1ro_s32 (p0, x0 + 1), ++ z0 = svld1ro (p0, x0 + 1)) ++ ++/* ++** ld1ro_s32_4: ++** add (x[0-9]+), x0, #?16 ++** ld1row z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_s32_4, svint32_t, int32_t, ++ z0 = svld1ro_s32 (p0, x0 + 4), ++ z0 = svld1ro (p0, x0 + 4)) ++ ++/* ++** ld1ro_s32_64: ++** add (x[0-9]+), x0, #?256 ++** ld1row z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_s32_64, svint32_t, int32_t, ++ z0 = svld1ro_s32 (p0, x0 + 64), ++ z0 = svld1ro (p0, x0 + 64)) ++ ++/* ++** ld1ro_s32_m1: ++** sub (x[0-9]+), x0, #?4 ++** ld1row z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_s32_m1, svint32_t, int32_t, ++ z0 = svld1ro_s32 (p0, x0 - 1), ++ z0 = svld1ro (p0, x0 - 1)) ++ ++/* ++** ld1ro_s32_m4: ++** sub (x[0-9]+), x0, #?16 ++** ld1row z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_s32_m4, svint32_t, int32_t, ++ z0 = svld1ro_s32 (p0, x0 - 4), ++ z0 = svld1ro (p0, x0 - 4)) ++ ++/* ++** ld1ro_s32_m72: ++** sub (x[0-9]+), x0, #?288 ++** ld1row z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_s32_m72, svint32_t, int32_t, ++ z0 = svld1ro_s32 (p0, x0 - 72), ++ z0 = svld1ro (p0, x0 - 72)) ++ ++/* ++** ld1ro_s32_8: ++** ld1row z0\.s, p0/z, \[x0, #?32\] ++** ret ++*/ ++TEST_LOAD (ld1ro_s32_8, svint32_t, int32_t, ++ z0 = svld1ro_s32 (p0, x0 + 8), ++ z0 = svld1ro (p0, x0 + 8)) ++ ++/* ++** ld1ro_s32_56: ++** ld1row z0\.s, p0/z, \[x0, #?224\] ++** ret ++*/ ++TEST_LOAD (ld1ro_s32_56, svint32_t, int32_t, ++ z0 = svld1ro_s32 (p0, x0 + 56), ++ z0 = svld1ro (p0, x0 + 56)) ++ ++/* ++** ld1ro_s32_m8: ++** ld1row z0\.s, p0/z, \[x0, #?-32\] ++** ret ++*/ ++TEST_LOAD (ld1ro_s32_m8, svint32_t, int32_t, ++ z0 = svld1ro_s32 (p0, x0 - 8), ++ z0 = svld1ro (p0, x0 - 8)) ++ ++/* ++** ld1ro_s32_m64: ++** ld1row z0\.s, p0/z, \[x0, #?-256\] ++** ret ++*/ ++TEST_LOAD (ld1ro_s32_m64, svint32_t, int32_t, ++ z0 = svld1ro_s32 (p0, x0 - 64), ++ z0 = svld1ro (p0, x0 - 64)) ++ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_s64.c +new file mode 100644 +index 000000000..3aa9a15ee +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_s64.c +@@ -0,0 +1,120 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++/* { dg-additional-options "-march=armv8.6-a+f64mm" } */ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1ro_s64_base: ++** ld1rod z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1ro_s64_base, svint64_t, int64_t, ++ z0 = svld1ro_s64 (p0, x0), ++ z0 = svld1ro (p0, x0)) ++ ++/* ++** ld1ro_s64_index: ++** ld1rod z0\.d, p0/z, \[x0, x1, lsl 3\] ++** ret ++*/ ++TEST_LOAD (ld1ro_s64_index, svint64_t, int64_t, ++ z0 = svld1ro_s64 (p0, x0 + x1), ++ z0 = svld1ro (p0, x0 + x1)) ++ ++/* ++** ld1ro_s64_1: ++** add (x[0-9]+), x0, #?8 ++** ld1rod z0\.d, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_s64_1, svint64_t, int64_t, ++ z0 = svld1ro_s64 (p0, x0 + 1), ++ z0 = svld1ro (p0, x0 + 1)) ++ ++/* ++** ld1ro_s64_2: ++** add (x[0-9]+), x0, #?16 ++** ld1rod z0\.d, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_s64_2, svint64_t, int64_t, ++ z0 = svld1ro_s64 (p0, x0 + 2), ++ z0 = svld1ro (p0, x0 + 2)) ++ ++/* ++** ld1ro_s64_32: ++** add (x[0-9]+), x0, #?256 ++** ld1rod z0\.d, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_s64_32, svint64_t, int64_t, ++ z0 = svld1ro_s64 (p0, x0 + 32), ++ z0 = svld1ro (p0, x0 + 32)) ++ ++/* ++** ld1ro_s64_m1: ++** sub (x[0-9]+), x0, #?8 ++** ld1rod z0\.d, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_s64_m1, svint64_t, int64_t, ++ z0 = svld1ro_s64 (p0, x0 - 1), ++ z0 = svld1ro (p0, x0 - 1)) ++ ++/* ++** ld1ro_s64_m2: ++** sub (x[0-9]+), x0, #?16 ++** ld1rod z0\.d, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_s64_m2, svint64_t, int64_t, ++ z0 = svld1ro_s64 (p0, x0 - 2), ++ z0 = svld1ro (p0, x0 - 2)) ++ ++/* ++** ld1ro_s64_m36: ++** sub (x[0-9]+), x0, #?288 ++** ld1rod z0\.d, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_s64_m36, svint64_t, int64_t, ++ z0 = svld1ro_s64 (p0, x0 - 36), ++ z0 = svld1ro (p0, x0 - 36)) ++ ++/* ++** ld1ro_s64_4: ++** ld1rod z0\.d, p0/z, \[x0, #?32\] ++** ret ++*/ ++TEST_LOAD (ld1ro_s64_4, svint64_t, int64_t, ++ z0 = svld1ro_s64 (p0, x0 + 4), ++ z0 = svld1ro (p0, x0 + 4)) ++ ++/* ++** ld1ro_s64_28: ++** ld1rod z0\.d, p0/z, \[x0, #?224\] ++** ret ++*/ ++TEST_LOAD (ld1ro_s64_28, svint64_t, int64_t, ++ z0 = svld1ro_s64 (p0, x0 + 28), ++ z0 = svld1ro (p0, x0 + 28)) ++ ++/* ++** ld1ro_s64_m4: ++** ld1rod z0\.d, p0/z, \[x0, #?-32\] ++** ret ++*/ ++TEST_LOAD (ld1ro_s64_m4, svint64_t, int64_t, ++ z0 = svld1ro_s64 (p0, x0 - 4), ++ z0 = svld1ro (p0, x0 - 4)) ++ ++/* ++** ld1ro_s64_m32: ++** ld1rod z0\.d, p0/z, \[x0, #?-256\] ++** ret ++*/ ++TEST_LOAD (ld1ro_s64_m32, svint64_t, int64_t, ++ z0 = svld1ro_s64 (p0, x0 - 32), ++ z0 = svld1ro (p0, x0 - 32)) ++ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_s8.c +new file mode 100644 +index 000000000..49aff5146 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_s8.c +@@ -0,0 +1,120 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++/* { dg-additional-options "-march=armv8.6-a+f64mm" } */ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1ro_s8_base: ++** ld1rob z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1ro_s8_base, svint8_t, int8_t, ++ z0 = svld1ro_s8 (p0, x0), ++ z0 = svld1ro (p0, x0)) ++ ++/* ++** ld1ro_s8_index: ++** ld1rob z0\.b, p0/z, \[x0, x1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_s8_index, svint8_t, int8_t, ++ z0 = svld1ro_s8 (p0, x0 + x1), ++ z0 = svld1ro (p0, x0 + x1)) ++ ++/* ++** ld1ro_s8_1: ++** add (x[0-9]+), x0, #?1 ++** ld1rob z0\.b, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_s8_1, svint8_t, int8_t, ++ z0 = svld1ro_s8 (p0, x0 + 1), ++ z0 = svld1ro (p0, x0 + 1)) ++ ++/* ++** ld1ro_s8_16: ++** add (x[0-9]+), x0, #?16 ++** ld1rob z0\.b, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_s8_16, svint8_t, int8_t, ++ z0 = svld1ro_s8 (p0, x0 + 16), ++ z0 = svld1ro (p0, x0 + 16)) ++ ++/* ++** ld1ro_s8_256: ++** add (x[0-9]+), x0, #?256 ++** ld1rob z0\.b, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_s8_256, svint8_t, int8_t, ++ z0 = svld1ro_s8 (p0, x0 + 256), ++ z0 = svld1ro (p0, x0 + 256)) ++ ++/* ++** ld1ro_s8_m1: ++** sub (x[0-9]+), x0, #?1 ++** ld1rob z0\.b, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_s8_m1, svint8_t, int8_t, ++ z0 = svld1ro_s8 (p0, x0 - 1), ++ z0 = svld1ro (p0, x0 - 1)) ++ ++/* ++** ld1ro_s8_m16: ++** sub (x[0-9]+), x0, #?16 ++** ld1rob z0\.b, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_s8_m16, svint8_t, int8_t, ++ z0 = svld1ro_s8 (p0, x0 - 16), ++ z0 = svld1ro (p0, x0 - 16)) ++ ++/* ++** ld1ro_s8_m288: ++** sub (x[0-9]+), x0, #?288 ++** ld1rob z0\.b, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_s8_m288, svint8_t, int8_t, ++ z0 = svld1ro_s8 (p0, x0 - 288), ++ z0 = svld1ro (p0, x0 - 288)) ++ ++/* ++** ld1ro_s8_32: ++** ld1rob z0\.b, p0/z, \[x0, #?32\] ++** ret ++*/ ++TEST_LOAD (ld1ro_s8_32, svint8_t, int8_t, ++ z0 = svld1ro_s8 (p0, x0 + 32), ++ z0 = svld1ro (p0, x0 + 32)) ++ ++/* ++** ld1ro_s8_224: ++** ld1rob z0\.b, p0/z, \[x0, #?224\] ++** ret ++*/ ++TEST_LOAD (ld1ro_s8_224, svint8_t, int8_t, ++ z0 = svld1ro_s8 (p0, x0 + 224), ++ z0 = svld1ro (p0, x0 + 224)) ++ ++/* ++** ld1ro_s8_m32: ++** ld1rob z0\.b, p0/z, \[x0, #?-32\] ++** ret ++*/ ++TEST_LOAD (ld1ro_s8_m32, svint8_t, int8_t, ++ z0 = svld1ro_s8 (p0, x0 - 32), ++ z0 = svld1ro (p0, x0 - 32)) ++ ++/* ++** ld1ro_s8_m256: ++** ld1rob z0\.b, p0/z, \[x0, #?-256\] ++** ret ++*/ ++TEST_LOAD (ld1ro_s8_m256, svint8_t, int8_t, ++ z0 = svld1ro_s8 (p0, x0 - 256), ++ z0 = svld1ro (p0, x0 - 256)) ++ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_u16.c +new file mode 100644 +index 000000000..00bf9e129 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_u16.c +@@ -0,0 +1,120 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++/* { dg-additional-options "-march=armv8.6-a+f64mm" } */ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1ro_u16_base: ++** ld1roh z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1ro_u16_base, svuint16_t, uint16_t, ++ z0 = svld1ro_u16 (p0, x0), ++ z0 = svld1ro (p0, x0)) ++ ++/* ++** ld1ro_u16_index: ++** ld1roh z0\.h, p0/z, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_u16_index, svuint16_t, uint16_t, ++ z0 = svld1ro_u16 (p0, x0 + x1), ++ z0 = svld1ro (p0, x0 + x1)) ++ ++/* ++** ld1ro_u16_1: ++** add (x[0-9]+), x0, #?2 ++** ld1roh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_u16_1, svuint16_t, uint16_t, ++ z0 = svld1ro_u16 (p0, x0 + 1), ++ z0 = svld1ro (p0, x0 + 1)) ++ ++/* ++** ld1ro_u16_8: ++** add (x[0-9]+), x0, #?16 ++** ld1roh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_u16_8, svuint16_t, uint16_t, ++ z0 = svld1ro_u16 (p0, x0 + 8), ++ z0 = svld1ro (p0, x0 + 8)) ++ ++/* ++** ld1ro_u16_128: ++** add (x[0-9]+), x0, #?256 ++** ld1roh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_u16_128, svuint16_t, uint16_t, ++ z0 = svld1ro_u16 (p0, x0 + 128), ++ z0 = svld1ro (p0, x0 + 128)) ++ ++/* ++** ld1ro_u16_m1: ++** sub (x[0-9]+), x0, #?2 ++** ld1roh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_u16_m1, svuint16_t, uint16_t, ++ z0 = svld1ro_u16 (p0, x0 - 1), ++ z0 = svld1ro (p0, x0 - 1)) ++ ++/* ++** ld1ro_u16_m8: ++** sub (x[0-9]+), x0, #?16 ++** ld1roh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_u16_m8, svuint16_t, uint16_t, ++ z0 = svld1ro_u16 (p0, x0 - 8), ++ z0 = svld1ro (p0, x0 - 8)) ++ ++/* ++** ld1ro_u16_m144: ++** sub (x[0-9]+), x0, #?288 ++** ld1roh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_u16_m144, svuint16_t, uint16_t, ++ z0 = svld1ro_u16 (p0, x0 - 144), ++ z0 = svld1ro (p0, x0 - 144)) ++ ++/* ++** ld1ro_u16_16: ++** ld1roh z0\.h, p0/z, \[x0, #?32\] ++** ret ++*/ ++TEST_LOAD (ld1ro_u16_16, svuint16_t, uint16_t, ++ z0 = svld1ro_u16 (p0, x0 + 16), ++ z0 = svld1ro (p0, x0 + 16)) ++ ++/* ++** ld1ro_u16_112: ++** ld1roh z0\.h, p0/z, \[x0, #?224\] ++** ret ++*/ ++TEST_LOAD (ld1ro_u16_112, svuint16_t, uint16_t, ++ z0 = svld1ro_u16 (p0, x0 + 112), ++ z0 = svld1ro (p0, x0 + 112)) ++ ++/* ++** ld1ro_u16_m16: ++** ld1roh z0\.h, p0/z, \[x0, #?-32\] ++** ret ++*/ ++TEST_LOAD (ld1ro_u16_m16, svuint16_t, uint16_t, ++ z0 = svld1ro_u16 (p0, x0 - 16), ++ z0 = svld1ro (p0, x0 - 16)) ++ ++/* ++** ld1ro_u16_m128: ++** ld1roh z0\.h, p0/z, \[x0, #?-256\] ++** ret ++*/ ++TEST_LOAD (ld1ro_u16_m128, svuint16_t, uint16_t, ++ z0 = svld1ro_u16 (p0, x0 - 128), ++ z0 = svld1ro (p0, x0 - 128)) ++ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_u32.c +new file mode 100644 +index 000000000..9e9b3290a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_u32.c +@@ -0,0 +1,120 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++/* { dg-additional-options "-march=armv8.6-a+f64mm" } */ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1ro_u32_base: ++** ld1row z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1ro_u32_base, svuint32_t, uint32_t, ++ z0 = svld1ro_u32 (p0, x0), ++ z0 = svld1ro (p0, x0)) ++ ++/* ++** ld1ro_u32_index: ++** ld1row z0\.s, p0/z, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_LOAD (ld1ro_u32_index, svuint32_t, uint32_t, ++ z0 = svld1ro_u32 (p0, x0 + x1), ++ z0 = svld1ro (p0, x0 + x1)) ++ ++/* ++** ld1ro_u32_1: ++** add (x[0-9]+), x0, #?4 ++** ld1row z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_u32_1, svuint32_t, uint32_t, ++ z0 = svld1ro_u32 (p0, x0 + 1), ++ z0 = svld1ro (p0, x0 + 1)) ++ ++/* ++** ld1ro_u32_4: ++** add (x[0-9]+), x0, #?16 ++** ld1row z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_u32_4, svuint32_t, uint32_t, ++ z0 = svld1ro_u32 (p0, x0 + 4), ++ z0 = svld1ro (p0, x0 + 4)) ++ ++/* ++** ld1ro_u32_64: ++** add (x[0-9]+), x0, #?256 ++** ld1row z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_u32_64, svuint32_t, uint32_t, ++ z0 = svld1ro_u32 (p0, x0 + 64), ++ z0 = svld1ro (p0, x0 + 64)) ++ ++/* ++** ld1ro_u32_m1: ++** sub (x[0-9]+), x0, #?4 ++** ld1row z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_u32_m1, svuint32_t, uint32_t, ++ z0 = svld1ro_u32 (p0, x0 - 1), ++ z0 = svld1ro (p0, x0 - 1)) ++ ++/* ++** ld1ro_u32_m4: ++** sub (x[0-9]+), x0, #?16 ++** ld1row z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_u32_m4, svuint32_t, uint32_t, ++ z0 = svld1ro_u32 (p0, x0 - 4), ++ z0 = svld1ro (p0, x0 - 4)) ++ ++/* ++** ld1ro_u32_m72: ++** sub (x[0-9]+), x0, #?288 ++** ld1row z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_u32_m72, svuint32_t, uint32_t, ++ z0 = svld1ro_u32 (p0, x0 - 72), ++ z0 = svld1ro (p0, x0 - 72)) ++ ++/* ++** ld1ro_u32_8: ++** ld1row z0\.s, p0/z, \[x0, #?32\] ++** ret ++*/ ++TEST_LOAD (ld1ro_u32_8, svuint32_t, uint32_t, ++ z0 = svld1ro_u32 (p0, x0 + 8), ++ z0 = svld1ro (p0, x0 + 8)) ++ ++/* ++** ld1ro_u32_56: ++** ld1row z0\.s, p0/z, \[x0, #?224\] ++** ret ++*/ ++TEST_LOAD (ld1ro_u32_56, svuint32_t, uint32_t, ++ z0 = svld1ro_u32 (p0, x0 + 56), ++ z0 = svld1ro (p0, x0 + 56)) ++ ++/* ++** ld1ro_u32_m8: ++** ld1row z0\.s, p0/z, \[x0, #?-32\] ++** ret ++*/ ++TEST_LOAD (ld1ro_u32_m8, svuint32_t, uint32_t, ++ z0 = svld1ro_u32 (p0, x0 - 8), ++ z0 = svld1ro (p0, x0 - 8)) ++ ++/* ++** ld1ro_u32_m64: ++** ld1row z0\.s, p0/z, \[x0, #?-256\] ++** ret ++*/ ++TEST_LOAD (ld1ro_u32_m64, svuint32_t, uint32_t, ++ z0 = svld1ro_u32 (p0, x0 - 64), ++ z0 = svld1ro (p0, x0 - 64)) ++ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_u64.c +new file mode 100644 +index 000000000..64ec62871 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_u64.c +@@ -0,0 +1,120 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++/* { dg-additional-options "-march=armv8.6-a+f64mm" } */ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1ro_u64_base: ++** ld1rod z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1ro_u64_base, svuint64_t, uint64_t, ++ z0 = svld1ro_u64 (p0, x0), ++ z0 = svld1ro (p0, x0)) ++ ++/* ++** ld1ro_u64_index: ++** ld1rod z0\.d, p0/z, \[x0, x1, lsl 3\] ++** ret ++*/ ++TEST_LOAD (ld1ro_u64_index, svuint64_t, uint64_t, ++ z0 = svld1ro_u64 (p0, x0 + x1), ++ z0 = svld1ro (p0, x0 + x1)) ++ ++/* ++** ld1ro_u64_1: ++** add (x[0-9]+), x0, #?8 ++** ld1rod z0\.d, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_u64_1, svuint64_t, uint64_t, ++ z0 = svld1ro_u64 (p0, x0 + 1), ++ z0 = svld1ro (p0, x0 + 1)) ++ ++/* ++** ld1ro_u64_2: ++** add (x[0-9]+), x0, #?16 ++** ld1rod z0\.d, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_u64_2, svuint64_t, uint64_t, ++ z0 = svld1ro_u64 (p0, x0 + 2), ++ z0 = svld1ro (p0, x0 + 2)) ++ ++/* ++** ld1ro_u64_32: ++** add (x[0-9]+), x0, #?256 ++** ld1rod z0\.d, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_u64_32, svuint64_t, uint64_t, ++ z0 = svld1ro_u64 (p0, x0 + 32), ++ z0 = svld1ro (p0, x0 + 32)) ++ ++/* ++** ld1ro_u64_m1: ++** sub (x[0-9]+), x0, #?8 ++** ld1rod z0\.d, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_u64_m1, svuint64_t, uint64_t, ++ z0 = svld1ro_u64 (p0, x0 - 1), ++ z0 = svld1ro (p0, x0 - 1)) ++ ++/* ++** ld1ro_u64_m2: ++** sub (x[0-9]+), x0, #?16 ++** ld1rod z0\.d, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_u64_m2, svuint64_t, uint64_t, ++ z0 = svld1ro_u64 (p0, x0 - 2), ++ z0 = svld1ro (p0, x0 - 2)) ++ ++/* ++** ld1ro_u64_m36: ++** sub (x[0-9]+), x0, #?288 ++** ld1rod z0\.d, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_u64_m36, svuint64_t, uint64_t, ++ z0 = svld1ro_u64 (p0, x0 - 36), ++ z0 = svld1ro (p0, x0 - 36)) ++ ++/* ++** ld1ro_u64_4: ++** ld1rod z0\.d, p0/z, \[x0, #?32\] ++** ret ++*/ ++TEST_LOAD (ld1ro_u64_4, svuint64_t, uint64_t, ++ z0 = svld1ro_u64 (p0, x0 + 4), ++ z0 = svld1ro (p0, x0 + 4)) ++ ++/* ++** ld1ro_u64_28: ++** ld1rod z0\.d, p0/z, \[x0, #?224\] ++** ret ++*/ ++TEST_LOAD (ld1ro_u64_28, svuint64_t, uint64_t, ++ z0 = svld1ro_u64 (p0, x0 + 28), ++ z0 = svld1ro (p0, x0 + 28)) ++ ++/* ++** ld1ro_u64_m4: ++** ld1rod z0\.d, p0/z, \[x0, #?-32\] ++** ret ++*/ ++TEST_LOAD (ld1ro_u64_m4, svuint64_t, uint64_t, ++ z0 = svld1ro_u64 (p0, x0 - 4), ++ z0 = svld1ro (p0, x0 - 4)) ++ ++/* ++** ld1ro_u64_m32: ++** ld1rod z0\.d, p0/z, \[x0, #?-256\] ++** ret ++*/ ++TEST_LOAD (ld1ro_u64_m32, svuint64_t, uint64_t, ++ z0 = svld1ro_u64 (p0, x0 - 32), ++ z0 = svld1ro (p0, x0 - 32)) ++ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_u8.c +new file mode 100644 +index 000000000..22701320b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_u8.c +@@ -0,0 +1,120 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++/* { dg-additional-options "-march=armv8.6-a+f64mm" } */ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1ro_u8_base: ++** ld1rob z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1ro_u8_base, svuint8_t, uint8_t, ++ z0 = svld1ro_u8 (p0, x0), ++ z0 = svld1ro (p0, x0)) ++ ++/* ++** ld1ro_u8_index: ++** ld1rob z0\.b, p0/z, \[x0, x1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_u8_index, svuint8_t, uint8_t, ++ z0 = svld1ro_u8 (p0, x0 + x1), ++ z0 = svld1ro (p0, x0 + x1)) ++ ++/* ++** ld1ro_u8_1: ++** add (x[0-9]+), x0, #?1 ++** ld1rob z0\.b, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_u8_1, svuint8_t, uint8_t, ++ z0 = svld1ro_u8 (p0, x0 + 1), ++ z0 = svld1ro (p0, x0 + 1)) ++ ++/* ++** ld1ro_u8_16: ++** add (x[0-9]+), x0, #?16 ++** ld1rob z0\.b, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_u8_16, svuint8_t, uint8_t, ++ z0 = svld1ro_u8 (p0, x0 + 16), ++ z0 = svld1ro (p0, x0 + 16)) ++ ++/* ++** ld1ro_u8_256: ++** add (x[0-9]+), x0, #?256 ++** ld1rob z0\.b, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_u8_256, svuint8_t, uint8_t, ++ z0 = svld1ro_u8 (p0, x0 + 256), ++ z0 = svld1ro (p0, x0 + 256)) ++ ++/* ++** ld1ro_u8_m1: ++** sub (x[0-9]+), x0, #?1 ++** ld1rob z0\.b, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_u8_m1, svuint8_t, uint8_t, ++ z0 = svld1ro_u8 (p0, x0 - 1), ++ z0 = svld1ro (p0, x0 - 1)) ++ ++/* ++** ld1ro_u8_m16: ++** sub (x[0-9]+), x0, #?16 ++** ld1rob z0\.b, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_u8_m16, svuint8_t, uint8_t, ++ z0 = svld1ro_u8 (p0, x0 - 16), ++ z0 = svld1ro (p0, x0 - 16)) ++ ++/* ++** ld1ro_u8_m288: ++** sub (x[0-9]+), x0, #?288 ++** ld1rob z0\.b, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1ro_u8_m288, svuint8_t, uint8_t, ++ z0 = svld1ro_u8 (p0, x0 - 288), ++ z0 = svld1ro (p0, x0 - 288)) ++ ++/* ++** ld1ro_u8_32: ++** ld1rob z0\.b, p0/z, \[x0, #?32\] ++** ret ++*/ ++TEST_LOAD (ld1ro_u8_32, svuint8_t, uint8_t, ++ z0 = svld1ro_u8 (p0, x0 + 32), ++ z0 = svld1ro (p0, x0 + 32)) ++ ++/* ++** ld1ro_u8_224: ++** ld1rob z0\.b, p0/z, \[x0, #?224\] ++** ret ++*/ ++TEST_LOAD (ld1ro_u8_224, svuint8_t, uint8_t, ++ z0 = svld1ro_u8 (p0, x0 + 224), ++ z0 = svld1ro (p0, x0 + 224)) ++ ++/* ++** ld1ro_u8_m32: ++** ld1rob z0\.b, p0/z, \[x0, #?-32\] ++** ret ++*/ ++TEST_LOAD (ld1ro_u8_m32, svuint8_t, uint8_t, ++ z0 = svld1ro_u8 (p0, x0 - 32), ++ z0 = svld1ro (p0, x0 - 32)) ++ ++/* ++** ld1ro_u8_m256: ++** ld1rob z0\.b, p0/z, \[x0, #?-256\] ++** ret ++*/ ++TEST_LOAD (ld1ro_u8_m256, svuint8_t, uint8_t, ++ z0 = svld1ro_u8 (p0, x0 - 256), ++ z0 = svld1ro (p0, x0 - 256)) ++ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_bf16.c +new file mode 100644 +index 000000000..54c69a1db +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_bf16.c +@@ -0,0 +1,137 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1rq_bf16_base: ++** ld1rqh z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1rq_bf16_base, svbfloat16_t, bfloat16_t, ++ z0 = svld1rq_bf16 (p0, x0), ++ z0 = svld1rq (p0, x0)) ++ ++/* ++** ld1rq_bf16_index: ++** ld1rqh z0\.h, p0/z, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_bf16_index, svbfloat16_t, bfloat16_t, ++ z0 = svld1rq_bf16 (p0, x0 + x1), ++ z0 = svld1rq (p0, x0 + x1)) ++ ++/* ++** ld1rq_bf16_1: ++** add (x[0-9]+), x0, #?2 ++** ld1rqh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_bf16_1, svbfloat16_t, bfloat16_t, ++ z0 = svld1rq_bf16 (p0, x0 + 1), ++ z0 = svld1rq (p0, x0 + 1)) ++ ++/* ++** ld1rq_bf16_4: ++** add (x[0-9]+), x0, #?8 ++** ld1rqh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_bf16_4, svbfloat16_t, bfloat16_t, ++ z0 = svld1rq_bf16 (p0, x0 + 4), ++ z0 = svld1rq (p0, x0 + 4)) ++ ++/* ++** ld1rq_bf16_7: ++** add (x[0-9]+), x0, #?14 ++** ld1rqh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_bf16_7, svbfloat16_t, bfloat16_t, ++ z0 = svld1rq_bf16 (p0, x0 + 7), ++ z0 = svld1rq (p0, x0 + 7)) ++ ++/* ++** ld1rq_bf16_8: ++** ld1rqh z0\.h, p0/z, \[x0, #?16\] ++** ret ++*/ ++TEST_LOAD (ld1rq_bf16_8, svbfloat16_t, bfloat16_t, ++ z0 = svld1rq_bf16 (p0, x0 + 8), ++ z0 = svld1rq (p0, x0 + 8)) ++ ++/* ++** ld1rq_bf16_56: ++** ld1rqh z0\.h, p0/z, \[x0, #?112\] ++** ret ++*/ ++TEST_LOAD (ld1rq_bf16_56, svbfloat16_t, bfloat16_t, ++ z0 = svld1rq_bf16 (p0, x0 + 56), ++ z0 = svld1rq (p0, x0 + 56)) ++ ++/* ++** ld1rq_bf16_64: ++** add (x[0-9]+), x0, #?128 ++** ld1rqh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_bf16_64, svbfloat16_t, bfloat16_t, ++ z0 = svld1rq_bf16 (p0, x0 + 64), ++ z0 = svld1rq (p0, x0 + 64)) ++ ++/* ++** ld1rq_bf16_m1: ++** sub (x[0-9]+), x0, #?2 ++** ld1rqh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_bf16_m1, svbfloat16_t, bfloat16_t, ++ z0 = svld1rq_bf16 (p0, x0 - 1), ++ z0 = svld1rq (p0, x0 - 1)) ++ ++/* ++** ld1rq_bf16_m4: ++** sub (x[0-9]+), x0, #?8 ++** ld1rqh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_bf16_m4, svbfloat16_t, bfloat16_t, ++ z0 = svld1rq_bf16 (p0, x0 - 4), ++ z0 = svld1rq (p0, x0 - 4)) ++ ++/* ++** ld1rq_bf16_m7: ++** sub (x[0-9]+), x0, #?14 ++** ld1rqh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_bf16_m7, svbfloat16_t, bfloat16_t, ++ z0 = svld1rq_bf16 (p0, x0 - 7), ++ z0 = svld1rq (p0, x0 - 7)) ++ ++/* ++** ld1rq_bf16_m8: ++** ld1rqh z0\.h, p0/z, \[x0, #?-16\] ++** ret ++*/ ++TEST_LOAD (ld1rq_bf16_m8, svbfloat16_t, bfloat16_t, ++ z0 = svld1rq_bf16 (p0, x0 - 8), ++ z0 = svld1rq (p0, x0 - 8)) ++ ++/* ++** ld1rq_bf16_m64: ++** ld1rqh z0\.h, p0/z, \[x0, #?-128\] ++** ret ++*/ ++TEST_LOAD (ld1rq_bf16_m64, svbfloat16_t, bfloat16_t, ++ z0 = svld1rq_bf16 (p0, x0 - 64), ++ z0 = svld1rq (p0, x0 - 64)) ++ ++/* ++** ld1rq_bf16_m72: ++** sub (x[0-9]+), x0, #?144 ++** ld1rqh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_bf16_m72, svbfloat16_t, bfloat16_t, ++ z0 = svld1rq_bf16 (p0, x0 - 72), ++ z0 = svld1rq (p0, x0 - 72)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_f16.c +new file mode 100644 +index 000000000..7536236f7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_f16.c +@@ -0,0 +1,137 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1rq_f16_base: ++** ld1rqh z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1rq_f16_base, svfloat16_t, float16_t, ++ z0 = svld1rq_f16 (p0, x0), ++ z0 = svld1rq (p0, x0)) ++ ++/* ++** ld1rq_f16_index: ++** ld1rqh z0\.h, p0/z, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_f16_index, svfloat16_t, float16_t, ++ z0 = svld1rq_f16 (p0, x0 + x1), ++ z0 = svld1rq (p0, x0 + x1)) ++ ++/* ++** ld1rq_f16_1: ++** add (x[0-9]+), x0, #?2 ++** ld1rqh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_f16_1, svfloat16_t, float16_t, ++ z0 = svld1rq_f16 (p0, x0 + 1), ++ z0 = svld1rq (p0, x0 + 1)) ++ ++/* ++** ld1rq_f16_4: ++** add (x[0-9]+), x0, #?8 ++** ld1rqh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_f16_4, svfloat16_t, float16_t, ++ z0 = svld1rq_f16 (p0, x0 + 4), ++ z0 = svld1rq (p0, x0 + 4)) ++ ++/* ++** ld1rq_f16_7: ++** add (x[0-9]+), x0, #?14 ++** ld1rqh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_f16_7, svfloat16_t, float16_t, ++ z0 = svld1rq_f16 (p0, x0 + 7), ++ z0 = svld1rq (p0, x0 + 7)) ++ ++/* ++** ld1rq_f16_8: ++** ld1rqh z0\.h, p0/z, \[x0, #?16\] ++** ret ++*/ ++TEST_LOAD (ld1rq_f16_8, svfloat16_t, float16_t, ++ z0 = svld1rq_f16 (p0, x0 + 8), ++ z0 = svld1rq (p0, x0 + 8)) ++ ++/* ++** ld1rq_f16_56: ++** ld1rqh z0\.h, p0/z, \[x0, #?112\] ++** ret ++*/ ++TEST_LOAD (ld1rq_f16_56, svfloat16_t, float16_t, ++ z0 = svld1rq_f16 (p0, x0 + 56), ++ z0 = svld1rq (p0, x0 + 56)) ++ ++/* ++** ld1rq_f16_64: ++** add (x[0-9]+), x0, #?128 ++** ld1rqh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_f16_64, svfloat16_t, float16_t, ++ z0 = svld1rq_f16 (p0, x0 + 64), ++ z0 = svld1rq (p0, x0 + 64)) ++ ++/* ++** ld1rq_f16_m1: ++** sub (x[0-9]+), x0, #?2 ++** ld1rqh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_f16_m1, svfloat16_t, float16_t, ++ z0 = svld1rq_f16 (p0, x0 - 1), ++ z0 = svld1rq (p0, x0 - 1)) ++ ++/* ++** ld1rq_f16_m4: ++** sub (x[0-9]+), x0, #?8 ++** ld1rqh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_f16_m4, svfloat16_t, float16_t, ++ z0 = svld1rq_f16 (p0, x0 - 4), ++ z0 = svld1rq (p0, x0 - 4)) ++ ++/* ++** ld1rq_f16_m7: ++** sub (x[0-9]+), x0, #?14 ++** ld1rqh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_f16_m7, svfloat16_t, float16_t, ++ z0 = svld1rq_f16 (p0, x0 - 7), ++ z0 = svld1rq (p0, x0 - 7)) ++ ++/* ++** ld1rq_f16_m8: ++** ld1rqh z0\.h, p0/z, \[x0, #?-16\] ++** ret ++*/ ++TEST_LOAD (ld1rq_f16_m8, svfloat16_t, float16_t, ++ z0 = svld1rq_f16 (p0, x0 - 8), ++ z0 = svld1rq (p0, x0 - 8)) ++ ++/* ++** ld1rq_f16_m64: ++** ld1rqh z0\.h, p0/z, \[x0, #?-128\] ++** ret ++*/ ++TEST_LOAD (ld1rq_f16_m64, svfloat16_t, float16_t, ++ z0 = svld1rq_f16 (p0, x0 - 64), ++ z0 = svld1rq (p0, x0 - 64)) ++ ++/* ++** ld1rq_f16_m72: ++** sub (x[0-9]+), x0, #?144 ++** ld1rqh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_f16_m72, svfloat16_t, float16_t, ++ z0 = svld1rq_f16 (p0, x0 - 72), ++ z0 = svld1rq (p0, x0 - 72)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_f32.c +new file mode 100644 +index 000000000..9be2b7412 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_f32.c +@@ -0,0 +1,137 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1rq_f32_base: ++** ld1rqw z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1rq_f32_base, svfloat32_t, float32_t, ++ z0 = svld1rq_f32 (p0, x0), ++ z0 = svld1rq (p0, x0)) ++ ++/* ++** ld1rq_f32_index: ++** ld1rqw z0\.s, p0/z, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_LOAD (ld1rq_f32_index, svfloat32_t, float32_t, ++ z0 = svld1rq_f32 (p0, x0 + x1), ++ z0 = svld1rq (p0, x0 + x1)) ++ ++/* ++** ld1rq_f32_1: ++** add (x[0-9]+), x0, #?4 ++** ld1rqw z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_f32_1, svfloat32_t, float32_t, ++ z0 = svld1rq_f32 (p0, x0 + 1), ++ z0 = svld1rq (p0, x0 + 1)) ++ ++/* ++** ld1rq_f32_2: ++** add (x[0-9]+), x0, #?8 ++** ld1rqw z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_f32_2, svfloat32_t, float32_t, ++ z0 = svld1rq_f32 (p0, x0 + 2), ++ z0 = svld1rq (p0, x0 + 2)) ++ ++/* ++** ld1rq_f32_3: ++** add (x[0-9]+), x0, #?12 ++** ld1rqw z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_f32_3, svfloat32_t, float32_t, ++ z0 = svld1rq_f32 (p0, x0 + 3), ++ z0 = svld1rq (p0, x0 + 3)) ++ ++/* ++** ld1rq_f32_4: ++** ld1rqw z0\.s, p0/z, \[x0, #?16\] ++** ret ++*/ ++TEST_LOAD (ld1rq_f32_4, svfloat32_t, float32_t, ++ z0 = svld1rq_f32 (p0, x0 + 4), ++ z0 = svld1rq (p0, x0 + 4)) ++ ++/* ++** ld1rq_f32_28: ++** ld1rqw z0\.s, p0/z, \[x0, #?112\] ++** ret ++*/ ++TEST_LOAD (ld1rq_f32_28, svfloat32_t, float32_t, ++ z0 = svld1rq_f32 (p0, x0 + 28), ++ z0 = svld1rq (p0, x0 + 28)) ++ ++/* ++** ld1rq_f32_32: ++** add (x[0-9]+), x0, #?128 ++** ld1rqw z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_f32_32, svfloat32_t, float32_t, ++ z0 = svld1rq_f32 (p0, x0 + 32), ++ z0 = svld1rq (p0, x0 + 32)) ++ ++/* ++** ld1rq_f32_m1: ++** sub (x[0-9]+), x0, #?4 ++** ld1rqw z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_f32_m1, svfloat32_t, float32_t, ++ z0 = svld1rq_f32 (p0, x0 - 1), ++ z0 = svld1rq (p0, x0 - 1)) ++ ++/* ++** ld1rq_f32_m2: ++** sub (x[0-9]+), x0, #?8 ++** ld1rqw z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_f32_m2, svfloat32_t, float32_t, ++ z0 = svld1rq_f32 (p0, x0 - 2), ++ z0 = svld1rq (p0, x0 - 2)) ++ ++/* ++** ld1rq_f32_m3: ++** sub (x[0-9]+), x0, #?12 ++** ld1rqw z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_f32_m3, svfloat32_t, float32_t, ++ z0 = svld1rq_f32 (p0, x0 - 3), ++ z0 = svld1rq (p0, x0 - 3)) ++ ++/* ++** ld1rq_f32_m4: ++** ld1rqw z0\.s, p0/z, \[x0, #?-16\] ++** ret ++*/ ++TEST_LOAD (ld1rq_f32_m4, svfloat32_t, float32_t, ++ z0 = svld1rq_f32 (p0, x0 - 4), ++ z0 = svld1rq (p0, x0 - 4)) ++ ++/* ++** ld1rq_f32_m32: ++** ld1rqw z0\.s, p0/z, \[x0, #?-128\] ++** ret ++*/ ++TEST_LOAD (ld1rq_f32_m32, svfloat32_t, float32_t, ++ z0 = svld1rq_f32 (p0, x0 - 32), ++ z0 = svld1rq (p0, x0 - 32)) ++ ++/* ++** ld1rq_f32_m36: ++** sub (x[0-9]+), x0, #?144 ++** ld1rqw z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_f32_m36, svfloat32_t, float32_t, ++ z0 = svld1rq_f32 (p0, x0 - 36), ++ z0 = svld1rq (p0, x0 - 36)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_f64.c +new file mode 100644 +index 000000000..32105af17 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_f64.c +@@ -0,0 +1,97 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1rq_f64_base: ++** ld1rqd z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1rq_f64_base, svfloat64_t, float64_t, ++ z0 = svld1rq_f64 (p0, x0), ++ z0 = svld1rq (p0, x0)) ++ ++/* ++** ld1rq_f64_index: ++** ld1rqd z0\.d, p0/z, \[x0, x1, lsl 3\] ++** ret ++*/ ++TEST_LOAD (ld1rq_f64_index, svfloat64_t, float64_t, ++ z0 = svld1rq_f64 (p0, x0 + x1), ++ z0 = svld1rq (p0, x0 + x1)) ++ ++/* ++** ld1rq_f64_1: ++** add (x[0-9]+), x0, #?8 ++** ld1rqd z0\.d, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_f64_1, svfloat64_t, float64_t, ++ z0 = svld1rq_f64 (p0, x0 + 1), ++ z0 = svld1rq (p0, x0 + 1)) ++ ++/* ++** ld1rq_f64_2: ++** ld1rqd z0\.d, p0/z, \[x0, #?16\] ++** ret ++*/ ++TEST_LOAD (ld1rq_f64_2, svfloat64_t, float64_t, ++ z0 = svld1rq_f64 (p0, x0 + 2), ++ z0 = svld1rq (p0, x0 + 2)) ++ ++/* ++** ld1rq_f64_14: ++** ld1rqd z0\.d, p0/z, \[x0, #?112\] ++** ret ++*/ ++TEST_LOAD (ld1rq_f64_14, svfloat64_t, float64_t, ++ z0 = svld1rq_f64 (p0, x0 + 14), ++ z0 = svld1rq (p0, x0 + 14)) ++ ++/* ++** ld1rq_f64_16: ++** add (x[0-9]+), x0, #?128 ++** ld1rqd z0\.d, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_f64_16, svfloat64_t, float64_t, ++ z0 = svld1rq_f64 (p0, x0 + 16), ++ z0 = svld1rq (p0, x0 + 16)) ++ ++/* ++** ld1rq_f64_m1: ++** sub (x[0-9]+), x0, #?8 ++** ld1rqd z0\.d, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_f64_m1, svfloat64_t, float64_t, ++ z0 = svld1rq_f64 (p0, x0 - 1), ++ z0 = svld1rq (p0, x0 - 1)) ++ ++/* ++** ld1rq_f64_m2: ++** ld1rqd z0\.d, p0/z, \[x0, #?-16\] ++** ret ++*/ ++TEST_LOAD (ld1rq_f64_m2, svfloat64_t, float64_t, ++ z0 = svld1rq_f64 (p0, x0 - 2), ++ z0 = svld1rq (p0, x0 - 2)) ++ ++/* ++** ld1rq_f64_m16: ++** ld1rqd z0\.d, p0/z, \[x0, #?-128\] ++** ret ++*/ ++TEST_LOAD (ld1rq_f64_m16, svfloat64_t, float64_t, ++ z0 = svld1rq_f64 (p0, x0 - 16), ++ z0 = svld1rq (p0, x0 - 16)) ++ ++/* ++** ld1rq_f64_m18: ++** sub (x[0-9]+), x0, #?144 ++** ld1rqd z0\.d, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_f64_m18, svfloat64_t, float64_t, ++ z0 = svld1rq_f64 (p0, x0 - 18), ++ z0 = svld1rq (p0, x0 - 18)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_s16.c +new file mode 100644 +index 000000000..8903b96a3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_s16.c +@@ -0,0 +1,137 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1rq_s16_base: ++** ld1rqh z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s16_base, svint16_t, int16_t, ++ z0 = svld1rq_s16 (p0, x0), ++ z0 = svld1rq (p0, x0)) ++ ++/* ++** ld1rq_s16_index: ++** ld1rqh z0\.h, p0/z, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s16_index, svint16_t, int16_t, ++ z0 = svld1rq_s16 (p0, x0 + x1), ++ z0 = svld1rq (p0, x0 + x1)) ++ ++/* ++** ld1rq_s16_1: ++** add (x[0-9]+), x0, #?2 ++** ld1rqh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s16_1, svint16_t, int16_t, ++ z0 = svld1rq_s16 (p0, x0 + 1), ++ z0 = svld1rq (p0, x0 + 1)) ++ ++/* ++** ld1rq_s16_4: ++** add (x[0-9]+), x0, #?8 ++** ld1rqh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s16_4, svint16_t, int16_t, ++ z0 = svld1rq_s16 (p0, x0 + 4), ++ z0 = svld1rq (p0, x0 + 4)) ++ ++/* ++** ld1rq_s16_7: ++** add (x[0-9]+), x0, #?14 ++** ld1rqh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s16_7, svint16_t, int16_t, ++ z0 = svld1rq_s16 (p0, x0 + 7), ++ z0 = svld1rq (p0, x0 + 7)) ++ ++/* ++** ld1rq_s16_8: ++** ld1rqh z0\.h, p0/z, \[x0, #?16\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s16_8, svint16_t, int16_t, ++ z0 = svld1rq_s16 (p0, x0 + 8), ++ z0 = svld1rq (p0, x0 + 8)) ++ ++/* ++** ld1rq_s16_56: ++** ld1rqh z0\.h, p0/z, \[x0, #?112\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s16_56, svint16_t, int16_t, ++ z0 = svld1rq_s16 (p0, x0 + 56), ++ z0 = svld1rq (p0, x0 + 56)) ++ ++/* ++** ld1rq_s16_64: ++** add (x[0-9]+), x0, #?128 ++** ld1rqh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s16_64, svint16_t, int16_t, ++ z0 = svld1rq_s16 (p0, x0 + 64), ++ z0 = svld1rq (p0, x0 + 64)) ++ ++/* ++** ld1rq_s16_m1: ++** sub (x[0-9]+), x0, #?2 ++** ld1rqh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s16_m1, svint16_t, int16_t, ++ z0 = svld1rq_s16 (p0, x0 - 1), ++ z0 = svld1rq (p0, x0 - 1)) ++ ++/* ++** ld1rq_s16_m4: ++** sub (x[0-9]+), x0, #?8 ++** ld1rqh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s16_m4, svint16_t, int16_t, ++ z0 = svld1rq_s16 (p0, x0 - 4), ++ z0 = svld1rq (p0, x0 - 4)) ++ ++/* ++** ld1rq_s16_m7: ++** sub (x[0-9]+), x0, #?14 ++** ld1rqh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s16_m7, svint16_t, int16_t, ++ z0 = svld1rq_s16 (p0, x0 - 7), ++ z0 = svld1rq (p0, x0 - 7)) ++ ++/* ++** ld1rq_s16_m8: ++** ld1rqh z0\.h, p0/z, \[x0, #?-16\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s16_m8, svint16_t, int16_t, ++ z0 = svld1rq_s16 (p0, x0 - 8), ++ z0 = svld1rq (p0, x0 - 8)) ++ ++/* ++** ld1rq_s16_m64: ++** ld1rqh z0\.h, p0/z, \[x0, #?-128\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s16_m64, svint16_t, int16_t, ++ z0 = svld1rq_s16 (p0, x0 - 64), ++ z0 = svld1rq (p0, x0 - 64)) ++ ++/* ++** ld1rq_s16_m72: ++** sub (x[0-9]+), x0, #?144 ++** ld1rqh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s16_m72, svint16_t, int16_t, ++ z0 = svld1rq_s16 (p0, x0 - 72), ++ z0 = svld1rq (p0, x0 - 72)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_s32.c +new file mode 100644 +index 000000000..a428b4350 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_s32.c +@@ -0,0 +1,137 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1rq_s32_base: ++** ld1rqw z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s32_base, svint32_t, int32_t, ++ z0 = svld1rq_s32 (p0, x0), ++ z0 = svld1rq (p0, x0)) ++ ++/* ++** ld1rq_s32_index: ++** ld1rqw z0\.s, p0/z, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s32_index, svint32_t, int32_t, ++ z0 = svld1rq_s32 (p0, x0 + x1), ++ z0 = svld1rq (p0, x0 + x1)) ++ ++/* ++** ld1rq_s32_1: ++** add (x[0-9]+), x0, #?4 ++** ld1rqw z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s32_1, svint32_t, int32_t, ++ z0 = svld1rq_s32 (p0, x0 + 1), ++ z0 = svld1rq (p0, x0 + 1)) ++ ++/* ++** ld1rq_s32_2: ++** add (x[0-9]+), x0, #?8 ++** ld1rqw z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s32_2, svint32_t, int32_t, ++ z0 = svld1rq_s32 (p0, x0 + 2), ++ z0 = svld1rq (p0, x0 + 2)) ++ ++/* ++** ld1rq_s32_3: ++** add (x[0-9]+), x0, #?12 ++** ld1rqw z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s32_3, svint32_t, int32_t, ++ z0 = svld1rq_s32 (p0, x0 + 3), ++ z0 = svld1rq (p0, x0 + 3)) ++ ++/* ++** ld1rq_s32_4: ++** ld1rqw z0\.s, p0/z, \[x0, #?16\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s32_4, svint32_t, int32_t, ++ z0 = svld1rq_s32 (p0, x0 + 4), ++ z0 = svld1rq (p0, x0 + 4)) ++ ++/* ++** ld1rq_s32_28: ++** ld1rqw z0\.s, p0/z, \[x0, #?112\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s32_28, svint32_t, int32_t, ++ z0 = svld1rq_s32 (p0, x0 + 28), ++ z0 = svld1rq (p0, x0 + 28)) ++ ++/* ++** ld1rq_s32_32: ++** add (x[0-9]+), x0, #?128 ++** ld1rqw z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s32_32, svint32_t, int32_t, ++ z0 = svld1rq_s32 (p0, x0 + 32), ++ z0 = svld1rq (p0, x0 + 32)) ++ ++/* ++** ld1rq_s32_m1: ++** sub (x[0-9]+), x0, #?4 ++** ld1rqw z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s32_m1, svint32_t, int32_t, ++ z0 = svld1rq_s32 (p0, x0 - 1), ++ z0 = svld1rq (p0, x0 - 1)) ++ ++/* ++** ld1rq_s32_m2: ++** sub (x[0-9]+), x0, #?8 ++** ld1rqw z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s32_m2, svint32_t, int32_t, ++ z0 = svld1rq_s32 (p0, x0 - 2), ++ z0 = svld1rq (p0, x0 - 2)) ++ ++/* ++** ld1rq_s32_m3: ++** sub (x[0-9]+), x0, #?12 ++** ld1rqw z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s32_m3, svint32_t, int32_t, ++ z0 = svld1rq_s32 (p0, x0 - 3), ++ z0 = svld1rq (p0, x0 - 3)) ++ ++/* ++** ld1rq_s32_m4: ++** ld1rqw z0\.s, p0/z, \[x0, #?-16\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s32_m4, svint32_t, int32_t, ++ z0 = svld1rq_s32 (p0, x0 - 4), ++ z0 = svld1rq (p0, x0 - 4)) ++ ++/* ++** ld1rq_s32_m32: ++** ld1rqw z0\.s, p0/z, \[x0, #?-128\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s32_m32, svint32_t, int32_t, ++ z0 = svld1rq_s32 (p0, x0 - 32), ++ z0 = svld1rq (p0, x0 - 32)) ++ ++/* ++** ld1rq_s32_m36: ++** sub (x[0-9]+), x0, #?144 ++** ld1rqw z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s32_m36, svint32_t, int32_t, ++ z0 = svld1rq_s32 (p0, x0 - 36), ++ z0 = svld1rq (p0, x0 - 36)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_s64.c +new file mode 100644 +index 000000000..efc0e740f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_s64.c +@@ -0,0 +1,97 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1rq_s64_base: ++** ld1rqd z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s64_base, svint64_t, int64_t, ++ z0 = svld1rq_s64 (p0, x0), ++ z0 = svld1rq (p0, x0)) ++ ++/* ++** ld1rq_s64_index: ++** ld1rqd z0\.d, p0/z, \[x0, x1, lsl 3\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s64_index, svint64_t, int64_t, ++ z0 = svld1rq_s64 (p0, x0 + x1), ++ z0 = svld1rq (p0, x0 + x1)) ++ ++/* ++** ld1rq_s64_1: ++** add (x[0-9]+), x0, #?8 ++** ld1rqd z0\.d, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s64_1, svint64_t, int64_t, ++ z0 = svld1rq_s64 (p0, x0 + 1), ++ z0 = svld1rq (p0, x0 + 1)) ++ ++/* ++** ld1rq_s64_2: ++** ld1rqd z0\.d, p0/z, \[x0, #?16\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s64_2, svint64_t, int64_t, ++ z0 = svld1rq_s64 (p0, x0 + 2), ++ z0 = svld1rq (p0, x0 + 2)) ++ ++/* ++** ld1rq_s64_14: ++** ld1rqd z0\.d, p0/z, \[x0, #?112\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s64_14, svint64_t, int64_t, ++ z0 = svld1rq_s64 (p0, x0 + 14), ++ z0 = svld1rq (p0, x0 + 14)) ++ ++/* ++** ld1rq_s64_16: ++** add (x[0-9]+), x0, #?128 ++** ld1rqd z0\.d, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s64_16, svint64_t, int64_t, ++ z0 = svld1rq_s64 (p0, x0 + 16), ++ z0 = svld1rq (p0, x0 + 16)) ++ ++/* ++** ld1rq_s64_m1: ++** sub (x[0-9]+), x0, #?8 ++** ld1rqd z0\.d, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s64_m1, svint64_t, int64_t, ++ z0 = svld1rq_s64 (p0, x0 - 1), ++ z0 = svld1rq (p0, x0 - 1)) ++ ++/* ++** ld1rq_s64_m2: ++** ld1rqd z0\.d, p0/z, \[x0, #?-16\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s64_m2, svint64_t, int64_t, ++ z0 = svld1rq_s64 (p0, x0 - 2), ++ z0 = svld1rq (p0, x0 - 2)) ++ ++/* ++** ld1rq_s64_m16: ++** ld1rqd z0\.d, p0/z, \[x0, #?-128\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s64_m16, svint64_t, int64_t, ++ z0 = svld1rq_s64 (p0, x0 - 16), ++ z0 = svld1rq (p0, x0 - 16)) ++ ++/* ++** ld1rq_s64_m18: ++** sub (x[0-9]+), x0, #?144 ++** ld1rqd z0\.d, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s64_m18, svint64_t, int64_t, ++ z0 = svld1rq_s64 (p0, x0 - 18), ++ z0 = svld1rq (p0, x0 - 18)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_s8.c +new file mode 100644 +index 000000000..e183e472f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_s8.c +@@ -0,0 +1,137 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1rq_s8_base: ++** ld1rqb z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s8_base, svint8_t, int8_t, ++ z0 = svld1rq_s8 (p0, x0), ++ z0 = svld1rq (p0, x0)) ++ ++/* ++** ld1rq_s8_index: ++** ld1rqb z0\.b, p0/z, \[x0, x1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s8_index, svint8_t, int8_t, ++ z0 = svld1rq_s8 (p0, x0 + x1), ++ z0 = svld1rq (p0, x0 + x1)) ++ ++/* ++** ld1rq_s8_1: ++** add (x[0-9]+), x0, #?1 ++** ld1rqb z0\.b, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s8_1, svint8_t, int8_t, ++ z0 = svld1rq_s8 (p0, x0 + 1), ++ z0 = svld1rq (p0, x0 + 1)) ++ ++/* ++** ld1rq_s8_8: ++** add (x[0-9]+), x0, #?8 ++** ld1rqb z0\.b, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s8_8, svint8_t, int8_t, ++ z0 = svld1rq_s8 (p0, x0 + 8), ++ z0 = svld1rq (p0, x0 + 8)) ++ ++/* ++** ld1rq_s8_15: ++** add (x[0-9]+), x0, #?15 ++** ld1rqb z0\.b, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s8_15, svint8_t, int8_t, ++ z0 = svld1rq_s8 (p0, x0 + 15), ++ z0 = svld1rq (p0, x0 + 15)) ++ ++/* ++** ld1rq_s8_16: ++** ld1rqb z0\.b, p0/z, \[x0, #?16\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s8_16, svint8_t, int8_t, ++ z0 = svld1rq_s8 (p0, x0 + 16), ++ z0 = svld1rq (p0, x0 + 16)) ++ ++/* ++** ld1rq_s8_112: ++** ld1rqb z0\.b, p0/z, \[x0, #?112\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s8_112, svint8_t, int8_t, ++ z0 = svld1rq_s8 (p0, x0 + 112), ++ z0 = svld1rq (p0, x0 + 112)) ++ ++/* ++** ld1rq_s8_128: ++** add (x[0-9]+), x0, #?128 ++** ld1rqb z0\.b, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s8_128, svint8_t, int8_t, ++ z0 = svld1rq_s8 (p0, x0 + 128), ++ z0 = svld1rq (p0, x0 + 128)) ++ ++/* ++** ld1rq_s8_m1: ++** sub (x[0-9]+), x0, #?1 ++** ld1rqb z0\.b, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s8_m1, svint8_t, int8_t, ++ z0 = svld1rq_s8 (p0, x0 - 1), ++ z0 = svld1rq (p0, x0 - 1)) ++ ++/* ++** ld1rq_s8_m8: ++** sub (x[0-9]+), x0, #?8 ++** ld1rqb z0\.b, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s8_m8, svint8_t, int8_t, ++ z0 = svld1rq_s8 (p0, x0 - 8), ++ z0 = svld1rq (p0, x0 - 8)) ++ ++/* ++** ld1rq_s8_m15: ++** sub (x[0-9]+), x0, #?15 ++** ld1rqb z0\.b, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s8_m15, svint8_t, int8_t, ++ z0 = svld1rq_s8 (p0, x0 - 15), ++ z0 = svld1rq (p0, x0 - 15)) ++ ++/* ++** ld1rq_s8_m16: ++** ld1rqb z0\.b, p0/z, \[x0, #?-16\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s8_m16, svint8_t, int8_t, ++ z0 = svld1rq_s8 (p0, x0 - 16), ++ z0 = svld1rq (p0, x0 - 16)) ++ ++/* ++** ld1rq_s8_m128: ++** ld1rqb z0\.b, p0/z, \[x0, #?-128\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s8_m128, svint8_t, int8_t, ++ z0 = svld1rq_s8 (p0, x0 - 128), ++ z0 = svld1rq (p0, x0 - 128)) ++ ++/* ++** ld1rq_s8_m144: ++** sub (x[0-9]+), x0, #?144 ++** ld1rqb z0\.b, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_s8_m144, svint8_t, int8_t, ++ z0 = svld1rq_s8 (p0, x0 - 144), ++ z0 = svld1rq (p0, x0 - 144)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_u16.c +new file mode 100644 +index 000000000..c24ab680a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_u16.c +@@ -0,0 +1,137 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1rq_u16_base: ++** ld1rqh z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u16_base, svuint16_t, uint16_t, ++ z0 = svld1rq_u16 (p0, x0), ++ z0 = svld1rq (p0, x0)) ++ ++/* ++** ld1rq_u16_index: ++** ld1rqh z0\.h, p0/z, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u16_index, svuint16_t, uint16_t, ++ z0 = svld1rq_u16 (p0, x0 + x1), ++ z0 = svld1rq (p0, x0 + x1)) ++ ++/* ++** ld1rq_u16_1: ++** add (x[0-9]+), x0, #?2 ++** ld1rqh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u16_1, svuint16_t, uint16_t, ++ z0 = svld1rq_u16 (p0, x0 + 1), ++ z0 = svld1rq (p0, x0 + 1)) ++ ++/* ++** ld1rq_u16_4: ++** add (x[0-9]+), x0, #?8 ++** ld1rqh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u16_4, svuint16_t, uint16_t, ++ z0 = svld1rq_u16 (p0, x0 + 4), ++ z0 = svld1rq (p0, x0 + 4)) ++ ++/* ++** ld1rq_u16_7: ++** add (x[0-9]+), x0, #?14 ++** ld1rqh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u16_7, svuint16_t, uint16_t, ++ z0 = svld1rq_u16 (p0, x0 + 7), ++ z0 = svld1rq (p0, x0 + 7)) ++ ++/* ++** ld1rq_u16_8: ++** ld1rqh z0\.h, p0/z, \[x0, #?16\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u16_8, svuint16_t, uint16_t, ++ z0 = svld1rq_u16 (p0, x0 + 8), ++ z0 = svld1rq (p0, x0 + 8)) ++ ++/* ++** ld1rq_u16_56: ++** ld1rqh z0\.h, p0/z, \[x0, #?112\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u16_56, svuint16_t, uint16_t, ++ z0 = svld1rq_u16 (p0, x0 + 56), ++ z0 = svld1rq (p0, x0 + 56)) ++ ++/* ++** ld1rq_u16_64: ++** add (x[0-9]+), x0, #?128 ++** ld1rqh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u16_64, svuint16_t, uint16_t, ++ z0 = svld1rq_u16 (p0, x0 + 64), ++ z0 = svld1rq (p0, x0 + 64)) ++ ++/* ++** ld1rq_u16_m1: ++** sub (x[0-9]+), x0, #?2 ++** ld1rqh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u16_m1, svuint16_t, uint16_t, ++ z0 = svld1rq_u16 (p0, x0 - 1), ++ z0 = svld1rq (p0, x0 - 1)) ++ ++/* ++** ld1rq_u16_m4: ++** sub (x[0-9]+), x0, #?8 ++** ld1rqh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u16_m4, svuint16_t, uint16_t, ++ z0 = svld1rq_u16 (p0, x0 - 4), ++ z0 = svld1rq (p0, x0 - 4)) ++ ++/* ++** ld1rq_u16_m7: ++** sub (x[0-9]+), x0, #?14 ++** ld1rqh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u16_m7, svuint16_t, uint16_t, ++ z0 = svld1rq_u16 (p0, x0 - 7), ++ z0 = svld1rq (p0, x0 - 7)) ++ ++/* ++** ld1rq_u16_m8: ++** ld1rqh z0\.h, p0/z, \[x0, #?-16\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u16_m8, svuint16_t, uint16_t, ++ z0 = svld1rq_u16 (p0, x0 - 8), ++ z0 = svld1rq (p0, x0 - 8)) ++ ++/* ++** ld1rq_u16_m64: ++** ld1rqh z0\.h, p0/z, \[x0, #?-128\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u16_m64, svuint16_t, uint16_t, ++ z0 = svld1rq_u16 (p0, x0 - 64), ++ z0 = svld1rq (p0, x0 - 64)) ++ ++/* ++** ld1rq_u16_m72: ++** sub (x[0-9]+), x0, #?144 ++** ld1rqh z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u16_m72, svuint16_t, uint16_t, ++ z0 = svld1rq_u16 (p0, x0 - 72), ++ z0 = svld1rq (p0, x0 - 72)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_u32.c +new file mode 100644 +index 000000000..722e34db3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_u32.c +@@ -0,0 +1,137 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1rq_u32_base: ++** ld1rqw z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u32_base, svuint32_t, uint32_t, ++ z0 = svld1rq_u32 (p0, x0), ++ z0 = svld1rq (p0, x0)) ++ ++/* ++** ld1rq_u32_index: ++** ld1rqw z0\.s, p0/z, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u32_index, svuint32_t, uint32_t, ++ z0 = svld1rq_u32 (p0, x0 + x1), ++ z0 = svld1rq (p0, x0 + x1)) ++ ++/* ++** ld1rq_u32_1: ++** add (x[0-9]+), x0, #?4 ++** ld1rqw z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u32_1, svuint32_t, uint32_t, ++ z0 = svld1rq_u32 (p0, x0 + 1), ++ z0 = svld1rq (p0, x0 + 1)) ++ ++/* ++** ld1rq_u32_2: ++** add (x[0-9]+), x0, #?8 ++** ld1rqw z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u32_2, svuint32_t, uint32_t, ++ z0 = svld1rq_u32 (p0, x0 + 2), ++ z0 = svld1rq (p0, x0 + 2)) ++ ++/* ++** ld1rq_u32_3: ++** add (x[0-9]+), x0, #?12 ++** ld1rqw z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u32_3, svuint32_t, uint32_t, ++ z0 = svld1rq_u32 (p0, x0 + 3), ++ z0 = svld1rq (p0, x0 + 3)) ++ ++/* ++** ld1rq_u32_4: ++** ld1rqw z0\.s, p0/z, \[x0, #?16\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u32_4, svuint32_t, uint32_t, ++ z0 = svld1rq_u32 (p0, x0 + 4), ++ z0 = svld1rq (p0, x0 + 4)) ++ ++/* ++** ld1rq_u32_28: ++** ld1rqw z0\.s, p0/z, \[x0, #?112\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u32_28, svuint32_t, uint32_t, ++ z0 = svld1rq_u32 (p0, x0 + 28), ++ z0 = svld1rq (p0, x0 + 28)) ++ ++/* ++** ld1rq_u32_32: ++** add (x[0-9]+), x0, #?128 ++** ld1rqw z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u32_32, svuint32_t, uint32_t, ++ z0 = svld1rq_u32 (p0, x0 + 32), ++ z0 = svld1rq (p0, x0 + 32)) ++ ++/* ++** ld1rq_u32_m1: ++** sub (x[0-9]+), x0, #?4 ++** ld1rqw z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u32_m1, svuint32_t, uint32_t, ++ z0 = svld1rq_u32 (p0, x0 - 1), ++ z0 = svld1rq (p0, x0 - 1)) ++ ++/* ++** ld1rq_u32_m2: ++** sub (x[0-9]+), x0, #?8 ++** ld1rqw z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u32_m2, svuint32_t, uint32_t, ++ z0 = svld1rq_u32 (p0, x0 - 2), ++ z0 = svld1rq (p0, x0 - 2)) ++ ++/* ++** ld1rq_u32_m3: ++** sub (x[0-9]+), x0, #?12 ++** ld1rqw z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u32_m3, svuint32_t, uint32_t, ++ z0 = svld1rq_u32 (p0, x0 - 3), ++ z0 = svld1rq (p0, x0 - 3)) ++ ++/* ++** ld1rq_u32_m4: ++** ld1rqw z0\.s, p0/z, \[x0, #?-16\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u32_m4, svuint32_t, uint32_t, ++ z0 = svld1rq_u32 (p0, x0 - 4), ++ z0 = svld1rq (p0, x0 - 4)) ++ ++/* ++** ld1rq_u32_m32: ++** ld1rqw z0\.s, p0/z, \[x0, #?-128\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u32_m32, svuint32_t, uint32_t, ++ z0 = svld1rq_u32 (p0, x0 - 32), ++ z0 = svld1rq (p0, x0 - 32)) ++ ++/* ++** ld1rq_u32_m36: ++** sub (x[0-9]+), x0, #?144 ++** ld1rqw z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u32_m36, svuint32_t, uint32_t, ++ z0 = svld1rq_u32 (p0, x0 - 36), ++ z0 = svld1rq (p0, x0 - 36)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_u64.c +new file mode 100644 +index 000000000..a116b7fd9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_u64.c +@@ -0,0 +1,97 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1rq_u64_base: ++** ld1rqd z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u64_base, svuint64_t, uint64_t, ++ z0 = svld1rq_u64 (p0, x0), ++ z0 = svld1rq (p0, x0)) ++ ++/* ++** ld1rq_u64_index: ++** ld1rqd z0\.d, p0/z, \[x0, x1, lsl 3\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u64_index, svuint64_t, uint64_t, ++ z0 = svld1rq_u64 (p0, x0 + x1), ++ z0 = svld1rq (p0, x0 + x1)) ++ ++/* ++** ld1rq_u64_1: ++** add (x[0-9]+), x0, #?8 ++** ld1rqd z0\.d, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u64_1, svuint64_t, uint64_t, ++ z0 = svld1rq_u64 (p0, x0 + 1), ++ z0 = svld1rq (p0, x0 + 1)) ++ ++/* ++** ld1rq_u64_2: ++** ld1rqd z0\.d, p0/z, \[x0, #?16\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u64_2, svuint64_t, uint64_t, ++ z0 = svld1rq_u64 (p0, x0 + 2), ++ z0 = svld1rq (p0, x0 + 2)) ++ ++/* ++** ld1rq_u64_14: ++** ld1rqd z0\.d, p0/z, \[x0, #?112\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u64_14, svuint64_t, uint64_t, ++ z0 = svld1rq_u64 (p0, x0 + 14), ++ z0 = svld1rq (p0, x0 + 14)) ++ ++/* ++** ld1rq_u64_16: ++** add (x[0-9]+), x0, #?128 ++** ld1rqd z0\.d, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u64_16, svuint64_t, uint64_t, ++ z0 = svld1rq_u64 (p0, x0 + 16), ++ z0 = svld1rq (p0, x0 + 16)) ++ ++/* ++** ld1rq_u64_m1: ++** sub (x[0-9]+), x0, #?8 ++** ld1rqd z0\.d, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u64_m1, svuint64_t, uint64_t, ++ z0 = svld1rq_u64 (p0, x0 - 1), ++ z0 = svld1rq (p0, x0 - 1)) ++ ++/* ++** ld1rq_u64_m2: ++** ld1rqd z0\.d, p0/z, \[x0, #?-16\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u64_m2, svuint64_t, uint64_t, ++ z0 = svld1rq_u64 (p0, x0 - 2), ++ z0 = svld1rq (p0, x0 - 2)) ++ ++/* ++** ld1rq_u64_m16: ++** ld1rqd z0\.d, p0/z, \[x0, #?-128\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u64_m16, svuint64_t, uint64_t, ++ z0 = svld1rq_u64 (p0, x0 - 16), ++ z0 = svld1rq (p0, x0 - 16)) ++ ++/* ++** ld1rq_u64_m18: ++** sub (x[0-9]+), x0, #?144 ++** ld1rqd z0\.d, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u64_m18, svuint64_t, uint64_t, ++ z0 = svld1rq_u64 (p0, x0 - 18), ++ z0 = svld1rq (p0, x0 - 18)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_u8.c +new file mode 100644 +index 000000000..74b72530e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_u8.c +@@ -0,0 +1,137 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1rq_u8_base: ++** ld1rqb z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u8_base, svuint8_t, uint8_t, ++ z0 = svld1rq_u8 (p0, x0), ++ z0 = svld1rq (p0, x0)) ++ ++/* ++** ld1rq_u8_index: ++** ld1rqb z0\.b, p0/z, \[x0, x1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u8_index, svuint8_t, uint8_t, ++ z0 = svld1rq_u8 (p0, x0 + x1), ++ z0 = svld1rq (p0, x0 + x1)) ++ ++/* ++** ld1rq_u8_1: ++** add (x[0-9]+), x0, #?1 ++** ld1rqb z0\.b, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u8_1, svuint8_t, uint8_t, ++ z0 = svld1rq_u8 (p0, x0 + 1), ++ z0 = svld1rq (p0, x0 + 1)) ++ ++/* ++** ld1rq_u8_8: ++** add (x[0-9]+), x0, #?8 ++** ld1rqb z0\.b, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u8_8, svuint8_t, uint8_t, ++ z0 = svld1rq_u8 (p0, x0 + 8), ++ z0 = svld1rq (p0, x0 + 8)) ++ ++/* ++** ld1rq_u8_15: ++** add (x[0-9]+), x0, #?15 ++** ld1rqb z0\.b, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u8_15, svuint8_t, uint8_t, ++ z0 = svld1rq_u8 (p0, x0 + 15), ++ z0 = svld1rq (p0, x0 + 15)) ++ ++/* ++** ld1rq_u8_16: ++** ld1rqb z0\.b, p0/z, \[x0, #?16\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u8_16, svuint8_t, uint8_t, ++ z0 = svld1rq_u8 (p0, x0 + 16), ++ z0 = svld1rq (p0, x0 + 16)) ++ ++/* ++** ld1rq_u8_112: ++** ld1rqb z0\.b, p0/z, \[x0, #?112\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u8_112, svuint8_t, uint8_t, ++ z0 = svld1rq_u8 (p0, x0 + 112), ++ z0 = svld1rq (p0, x0 + 112)) ++ ++/* ++** ld1rq_u8_128: ++** add (x[0-9]+), x0, #?128 ++** ld1rqb z0\.b, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u8_128, svuint8_t, uint8_t, ++ z0 = svld1rq_u8 (p0, x0 + 128), ++ z0 = svld1rq (p0, x0 + 128)) ++ ++/* ++** ld1rq_u8_m1: ++** sub (x[0-9]+), x0, #?1 ++** ld1rqb z0\.b, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u8_m1, svuint8_t, uint8_t, ++ z0 = svld1rq_u8 (p0, x0 - 1), ++ z0 = svld1rq (p0, x0 - 1)) ++ ++/* ++** ld1rq_u8_m8: ++** sub (x[0-9]+), x0, #?8 ++** ld1rqb z0\.b, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u8_m8, svuint8_t, uint8_t, ++ z0 = svld1rq_u8 (p0, x0 - 8), ++ z0 = svld1rq (p0, x0 - 8)) ++ ++/* ++** ld1rq_u8_m15: ++** sub (x[0-9]+), x0, #?15 ++** ld1rqb z0\.b, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u8_m15, svuint8_t, uint8_t, ++ z0 = svld1rq_u8 (p0, x0 - 15), ++ z0 = svld1rq (p0, x0 - 15)) ++ ++/* ++** ld1rq_u8_m16: ++** ld1rqb z0\.b, p0/z, \[x0, #?-16\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u8_m16, svuint8_t, uint8_t, ++ z0 = svld1rq_u8 (p0, x0 - 16), ++ z0 = svld1rq (p0, x0 - 16)) ++ ++/* ++** ld1rq_u8_m128: ++** ld1rqb z0\.b, p0/z, \[x0, #?-128\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u8_m128, svuint8_t, uint8_t, ++ z0 = svld1rq_u8 (p0, x0 - 128), ++ z0 = svld1rq (p0, x0 - 128)) ++ ++/* ++** ld1rq_u8_m144: ++** sub (x[0-9]+), x0, #?144 ++** ld1rqb z0\.b, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld1rq_u8_m144, svuint8_t, uint8_t, ++ z0 = svld1rq_u8 (p0, x0 - 144), ++ z0 = svld1rq (p0, x0 - 144)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_gather_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_gather_s32.c +new file mode 100644 +index 000000000..16a5316a9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_gather_s32.c +@@ -0,0 +1,131 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1sb_gather_s32_tied1: ++** ld1sb z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sb_gather_s32_tied1, svint32_t, svuint32_t, ++ z0_res = svld1sb_gather_u32base_s32 (p0, z0), ++ z0_res = svld1sb_gather_s32 (p0, z0)) ++ ++/* ++** ld1sb_gather_s32_untied: ++** ld1sb z0\.s, p0/z, \[z1\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sb_gather_s32_untied, svint32_t, svuint32_t, ++ z0_res = svld1sb_gather_u32base_s32 (p0, z1), ++ z0_res = svld1sb_gather_s32 (p0, z1)) ++ ++/* ++** ld1sb_gather_x0_s32_offset: ++** ld1sb z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sb_gather_x0_s32_offset, svint32_t, svuint32_t, ++ z0_res = svld1sb_gather_u32base_offset_s32 (p0, z0, x0), ++ z0_res = svld1sb_gather_offset_s32 (p0, z0, x0)) ++ ++/* ++** ld1sb_gather_m1_s32_offset: ++** mov (x[0-9]+), #?-1 ++** ld1sb z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sb_gather_m1_s32_offset, svint32_t, svuint32_t, ++ z0_res = svld1sb_gather_u32base_offset_s32 (p0, z0, -1), ++ z0_res = svld1sb_gather_offset_s32 (p0, z0, -1)) ++ ++/* ++** ld1sb_gather_0_s32_offset: ++** ld1sb z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sb_gather_0_s32_offset, svint32_t, svuint32_t, ++ z0_res = svld1sb_gather_u32base_offset_s32 (p0, z0, 0), ++ z0_res = svld1sb_gather_offset_s32 (p0, z0, 0)) ++ ++/* ++** ld1sb_gather_5_s32_offset: ++** ld1sb z0\.s, p0/z, \[z0\.s, #5\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sb_gather_5_s32_offset, svint32_t, svuint32_t, ++ z0_res = svld1sb_gather_u32base_offset_s32 (p0, z0, 5), ++ z0_res = svld1sb_gather_offset_s32 (p0, z0, 5)) ++ ++/* ++** ld1sb_gather_31_s32_offset: ++** ld1sb z0\.s, p0/z, \[z0\.s, #31\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sb_gather_31_s32_offset, svint32_t, svuint32_t, ++ z0_res = svld1sb_gather_u32base_offset_s32 (p0, z0, 31), ++ z0_res = svld1sb_gather_offset_s32 (p0, z0, 31)) ++ ++/* ++** ld1sb_gather_32_s32_offset: ++** mov (x[0-9]+), #?32 ++** ld1sb z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sb_gather_32_s32_offset, svint32_t, svuint32_t, ++ z0_res = svld1sb_gather_u32base_offset_s32 (p0, z0, 32), ++ z0_res = svld1sb_gather_offset_s32 (p0, z0, 32)) ++ ++/* ++** ld1sb_gather_x0_s32_s32offset: ++** ld1sb z0\.s, p0/z, \[x0, z0\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sb_gather_x0_s32_s32offset, svint32_t, int8_t, svint32_t, ++ z0_res = svld1sb_gather_s32offset_s32 (p0, x0, z0), ++ z0_res = svld1sb_gather_offset_s32 (p0, x0, z0)) ++ ++/* ++** ld1sb_gather_tied1_s32_s32offset: ++** ld1sb z0\.s, p0/z, \[x0, z0\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sb_gather_tied1_s32_s32offset, svint32_t, int8_t, svint32_t, ++ z0_res = svld1sb_gather_s32offset_s32 (p0, x0, z0), ++ z0_res = svld1sb_gather_offset_s32 (p0, x0, z0)) ++ ++/* ++** ld1sb_gather_untied_s32_s32offset: ++** ld1sb z0\.s, p0/z, \[x0, z1\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sb_gather_untied_s32_s32offset, svint32_t, int8_t, svint32_t, ++ z0_res = svld1sb_gather_s32offset_s32 (p0, x0, z1), ++ z0_res = svld1sb_gather_offset_s32 (p0, x0, z1)) ++ ++/* ++** ld1sb_gather_x0_s32_u32offset: ++** ld1sb z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sb_gather_x0_s32_u32offset, svint32_t, int8_t, svuint32_t, ++ z0_res = svld1sb_gather_u32offset_s32 (p0, x0, z0), ++ z0_res = svld1sb_gather_offset_s32 (p0, x0, z0)) ++ ++/* ++** ld1sb_gather_tied1_s32_u32offset: ++** ld1sb z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** 
ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sb_gather_tied1_s32_u32offset, svint32_t, int8_t, svuint32_t, ++ z0_res = svld1sb_gather_u32offset_s32 (p0, x0, z0), ++ z0_res = svld1sb_gather_offset_s32 (p0, x0, z0)) ++ ++/* ++** ld1sb_gather_untied_s32_u32offset: ++** ld1sb z0\.s, p0/z, \[x0, z1\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sb_gather_untied_s32_u32offset, svint32_t, int8_t, svuint32_t, ++ z0_res = svld1sb_gather_u32offset_s32 (p0, x0, z1), ++ z0_res = svld1sb_gather_offset_s32 (p0, x0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_gather_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_gather_s64.c +new file mode 100644 +index 000000000..3f953247e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_gather_s64.c +@@ -0,0 +1,149 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1sb_gather_s64_tied1: ++** ld1sb z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sb_gather_s64_tied1, svint64_t, svuint64_t, ++ z0_res = svld1sb_gather_u64base_s64 (p0, z0), ++ z0_res = svld1sb_gather_s64 (p0, z0)) ++ ++/* ++** ld1sb_gather_s64_untied: ++** ld1sb z0\.d, p0/z, \[z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sb_gather_s64_untied, svint64_t, svuint64_t, ++ z0_res = svld1sb_gather_u64base_s64 (p0, z1), ++ z0_res = svld1sb_gather_s64 (p0, z1)) ++ ++/* ++** ld1sb_gather_x0_s64_offset: ++** ld1sb z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sb_gather_x0_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1sb_gather_u64base_offset_s64 (p0, z0, x0), ++ z0_res = svld1sb_gather_offset_s64 (p0, z0, x0)) ++ ++/* ++** ld1sb_gather_m1_s64_offset: ++** mov (x[0-9]+), #?-1 ++** ld1sb z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sb_gather_m1_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1sb_gather_u64base_offset_s64 (p0, z0, -1), ++ z0_res = svld1sb_gather_offset_s64 (p0, z0, -1)) ++ ++/* ++** ld1sb_gather_0_s64_offset: ++** ld1sb z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sb_gather_0_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1sb_gather_u64base_offset_s64 (p0, z0, 0), ++ z0_res = svld1sb_gather_offset_s64 (p0, z0, 0)) ++ ++/* ++** ld1sb_gather_5_s64_offset: ++** ld1sb z0\.d, p0/z, \[z0\.d, #5\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sb_gather_5_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1sb_gather_u64base_offset_s64 (p0, z0, 5), ++ z0_res = svld1sb_gather_offset_s64 (p0, z0, 5)) ++ ++/* ++** ld1sb_gather_31_s64_offset: ++** ld1sb z0\.d, p0/z, \[z0\.d, #31\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sb_gather_31_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1sb_gather_u64base_offset_s64 (p0, z0, 31), ++ z0_res = svld1sb_gather_offset_s64 (p0, z0, 31)) ++ ++/* ++** ld1sb_gather_32_s64_offset: ++** mov (x[0-9]+), #?32 ++** ld1sb z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sb_gather_32_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1sb_gather_u64base_offset_s64 (p0, z0, 32), ++ z0_res = svld1sb_gather_offset_s64 (p0, z0, 32)) ++ ++/* ++** ld1sb_gather_x0_s64_s64offset: ++** ld1sb z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sb_gather_x0_s64_s64offset, svint64_t, int8_t, svint64_t, ++ z0_res = svld1sb_gather_s64offset_s64 (p0, x0, z0), ++ z0_res = svld1sb_gather_offset_s64 (p0, x0, z0)) ++ ++/* ++** ld1sb_gather_tied1_s64_s64offset: ++** ld1sb z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ 
(ld1sb_gather_tied1_s64_s64offset, svint64_t, int8_t, svint64_t, ++ z0_res = svld1sb_gather_s64offset_s64 (p0, x0, z0), ++ z0_res = svld1sb_gather_offset_s64 (p0, x0, z0)) ++ ++/* ++** ld1sb_gather_untied_s64_s64offset: ++** ld1sb z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sb_gather_untied_s64_s64offset, svint64_t, int8_t, svint64_t, ++ z0_res = svld1sb_gather_s64offset_s64 (p0, x0, z1), ++ z0_res = svld1sb_gather_offset_s64 (p0, x0, z1)) ++ ++/* ++** ld1sb_gather_ext_s64_s64offset: ++** ld1sb z0\.d, p0/z, \[x0, z1\.d, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sb_gather_ext_s64_s64offset, svint64_t, int8_t, svint64_t, ++ z0_res = svld1sb_gather_s64offset_s64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svld1sb_gather_offset_s64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ld1sb_gather_x0_s64_u64offset: ++** ld1sb z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sb_gather_x0_s64_u64offset, svint64_t, int8_t, svuint64_t, ++ z0_res = svld1sb_gather_u64offset_s64 (p0, x0, z0), ++ z0_res = svld1sb_gather_offset_s64 (p0, x0, z0)) ++ ++/* ++** ld1sb_gather_tied1_s64_u64offset: ++** ld1sb z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sb_gather_tied1_s64_u64offset, svint64_t, int8_t, svuint64_t, ++ z0_res = svld1sb_gather_u64offset_s64 (p0, x0, z0), ++ z0_res = svld1sb_gather_offset_s64 (p0, x0, z0)) ++ ++/* ++** ld1sb_gather_untied_s64_u64offset: ++** ld1sb z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sb_gather_untied_s64_u64offset, svint64_t, int8_t, svuint64_t, ++ z0_res = svld1sb_gather_u64offset_s64 (p0, x0, z1), ++ z0_res = svld1sb_gather_offset_s64 (p0, x0, z1)) ++ ++/* ++** ld1sb_gather_ext_s64_u64offset: ++** ld1sb z0\.d, p0/z, \[x0, z1\.d, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sb_gather_ext_s64_u64offset, svint64_t, int8_t, svuint64_t, ++ z0_res = svld1sb_gather_u64offset_s64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svld1sb_gather_offset_s64 (p0, x0, svextw_x (p0, z1))) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_gather_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_gather_u32.c +new file mode 100644 +index 000000000..424de65a6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_gather_u32.c +@@ -0,0 +1,131 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1sb_gather_u32_tied1: ++** ld1sb z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sb_gather_u32_tied1, svuint32_t, svuint32_t, ++ z0_res = svld1sb_gather_u32base_u32 (p0, z0), ++ z0_res = svld1sb_gather_u32 (p0, z0)) ++ ++/* ++** ld1sb_gather_u32_untied: ++** ld1sb z0\.s, p0/z, \[z1\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sb_gather_u32_untied, svuint32_t, svuint32_t, ++ z0_res = svld1sb_gather_u32base_u32 (p0, z1), ++ z0_res = svld1sb_gather_u32 (p0, z1)) ++ ++/* ++** ld1sb_gather_x0_u32_offset: ++** ld1sb z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sb_gather_x0_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svld1sb_gather_u32base_offset_u32 (p0, z0, x0), ++ z0_res = svld1sb_gather_offset_u32 (p0, z0, x0)) ++ ++/* ++** ld1sb_gather_m1_u32_offset: ++** mov (x[0-9]+), #?-1 ++** ld1sb z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sb_gather_m1_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svld1sb_gather_u32base_offset_u32 (p0, z0, -1), ++ z0_res = svld1sb_gather_offset_u32 (p0, z0, -1)) ++ ++/* ++** ld1sb_gather_0_u32_offset: ++** ld1sb z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sb_gather_0_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svld1sb_gather_u32base_offset_u32 (p0, z0, 0), ++ z0_res = svld1sb_gather_offset_u32 (p0, z0, 0)) ++ ++/* ++** ld1sb_gather_5_u32_offset: ++** ld1sb z0\.s, p0/z, \[z0\.s, #5\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sb_gather_5_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svld1sb_gather_u32base_offset_u32 (p0, z0, 5), ++ z0_res = svld1sb_gather_offset_u32 (p0, z0, 5)) ++ ++/* ++** ld1sb_gather_31_u32_offset: ++** ld1sb z0\.s, p0/z, \[z0\.s, #31\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sb_gather_31_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svld1sb_gather_u32base_offset_u32 (p0, z0, 31), ++ z0_res = svld1sb_gather_offset_u32 (p0, z0, 31)) ++ ++/* ++** ld1sb_gather_32_u32_offset: ++** mov (x[0-9]+), #?32 ++** ld1sb z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sb_gather_32_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svld1sb_gather_u32base_offset_u32 (p0, z0, 32), ++ z0_res = svld1sb_gather_offset_u32 (p0, z0, 32)) ++ ++/* ++** ld1sb_gather_x0_u32_s32offset: ++** ld1sb z0\.s, p0/z, \[x0, z0\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sb_gather_x0_u32_s32offset, svuint32_t, int8_t, svint32_t, ++ z0_res = svld1sb_gather_s32offset_u32 (p0, x0, z0), ++ z0_res = svld1sb_gather_offset_u32 (p0, x0, z0)) ++ ++/* ++** ld1sb_gather_tied1_u32_s32offset: ++** ld1sb z0\.s, p0/z, \[x0, z0\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sb_gather_tied1_u32_s32offset, svuint32_t, int8_t, svint32_t, ++ z0_res = svld1sb_gather_s32offset_u32 (p0, x0, z0), ++ z0_res = svld1sb_gather_offset_u32 (p0, x0, z0)) ++ ++/* ++** ld1sb_gather_untied_u32_s32offset: ++** ld1sb z0\.s, p0/z, \[x0, z1\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sb_gather_untied_u32_s32offset, svuint32_t, int8_t, svint32_t, ++ z0_res = svld1sb_gather_s32offset_u32 (p0, x0, z1), ++ z0_res = svld1sb_gather_offset_u32 (p0, x0, z1)) ++ ++/* ++** ld1sb_gather_x0_u32_u32offset: ++** ld1sb z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sb_gather_x0_u32_u32offset, svuint32_t, int8_t, svuint32_t, ++ z0_res = svld1sb_gather_u32offset_u32 (p0, x0, z0), ++ z0_res = svld1sb_gather_offset_u32 (p0, x0, z0)) ++ ++/* ++** ld1sb_gather_tied1_u32_u32offset: ++** ld1sb z0\.s, p0/z, \[x0, z0\.s, 
uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sb_gather_tied1_u32_u32offset, svuint32_t, int8_t, svuint32_t, ++ z0_res = svld1sb_gather_u32offset_u32 (p0, x0, z0), ++ z0_res = svld1sb_gather_offset_u32 (p0, x0, z0)) ++ ++/* ++** ld1sb_gather_untied_u32_u32offset: ++** ld1sb z0\.s, p0/z, \[x0, z1\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sb_gather_untied_u32_u32offset, svuint32_t, int8_t, svuint32_t, ++ z0_res = svld1sb_gather_u32offset_u32 (p0, x0, z1), ++ z0_res = svld1sb_gather_offset_u32 (p0, x0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_gather_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_gather_u64.c +new file mode 100644 +index 000000000..aa375bea2 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_gather_u64.c +@@ -0,0 +1,149 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1sb_gather_u64_tied1: ++** ld1sb z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sb_gather_u64_tied1, svuint64_t, svuint64_t, ++ z0_res = svld1sb_gather_u64base_u64 (p0, z0), ++ z0_res = svld1sb_gather_u64 (p0, z0)) ++ ++/* ++** ld1sb_gather_u64_untied: ++** ld1sb z0\.d, p0/z, \[z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sb_gather_u64_untied, svuint64_t, svuint64_t, ++ z0_res = svld1sb_gather_u64base_u64 (p0, z1), ++ z0_res = svld1sb_gather_u64 (p0, z1)) ++ ++/* ++** ld1sb_gather_x0_u64_offset: ++** ld1sb z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sb_gather_x0_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1sb_gather_u64base_offset_u64 (p0, z0, x0), ++ z0_res = svld1sb_gather_offset_u64 (p0, z0, x0)) ++ ++/* ++** ld1sb_gather_m1_u64_offset: ++** mov (x[0-9]+), #?-1 ++** ld1sb z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sb_gather_m1_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1sb_gather_u64base_offset_u64 (p0, z0, -1), ++ z0_res = svld1sb_gather_offset_u64 (p0, z0, -1)) ++ ++/* ++** ld1sb_gather_0_u64_offset: ++** ld1sb z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sb_gather_0_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1sb_gather_u64base_offset_u64 (p0, z0, 0), ++ z0_res = svld1sb_gather_offset_u64 (p0, z0, 0)) ++ ++/* ++** ld1sb_gather_5_u64_offset: ++** ld1sb z0\.d, p0/z, \[z0\.d, #5\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sb_gather_5_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1sb_gather_u64base_offset_u64 (p0, z0, 5), ++ z0_res = svld1sb_gather_offset_u64 (p0, z0, 5)) ++ ++/* ++** ld1sb_gather_31_u64_offset: ++** ld1sb z0\.d, p0/z, \[z0\.d, #31\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sb_gather_31_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1sb_gather_u64base_offset_u64 (p0, z0, 31), ++ z0_res = svld1sb_gather_offset_u64 (p0, z0, 31)) ++ ++/* ++** ld1sb_gather_32_u64_offset: ++** mov (x[0-9]+), #?32 ++** ld1sb z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sb_gather_32_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1sb_gather_u64base_offset_u64 (p0, z0, 32), ++ z0_res = svld1sb_gather_offset_u64 (p0, z0, 32)) ++ ++/* ++** ld1sb_gather_x0_u64_s64offset: ++** ld1sb z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sb_gather_x0_u64_s64offset, svuint64_t, int8_t, svint64_t, ++ z0_res = svld1sb_gather_s64offset_u64 (p0, x0, z0), ++ z0_res = svld1sb_gather_offset_u64 (p0, x0, z0)) ++ ++/* ++** ld1sb_gather_tied1_u64_s64offset: ++** ld1sb z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ 
++TEST_LOAD_GATHER_SZ (ld1sb_gather_tied1_u64_s64offset, svuint64_t, int8_t, svint64_t, ++ z0_res = svld1sb_gather_s64offset_u64 (p0, x0, z0), ++ z0_res = svld1sb_gather_offset_u64 (p0, x0, z0)) ++ ++/* ++** ld1sb_gather_untied_u64_s64offset: ++** ld1sb z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sb_gather_untied_u64_s64offset, svuint64_t, int8_t, svint64_t, ++ z0_res = svld1sb_gather_s64offset_u64 (p0, x0, z1), ++ z0_res = svld1sb_gather_offset_u64 (p0, x0, z1)) ++ ++/* ++** ld1sb_gather_ext_u64_s64offset: ++** ld1sb z0\.d, p0/z, \[x0, z1\.d, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sb_gather_ext_u64_s64offset, svuint64_t, int8_t, svint64_t, ++ z0_res = svld1sb_gather_s64offset_u64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svld1sb_gather_offset_u64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ld1sb_gather_x0_u64_u64offset: ++** ld1sb z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sb_gather_x0_u64_u64offset, svuint64_t, int8_t, svuint64_t, ++ z0_res = svld1sb_gather_u64offset_u64 (p0, x0, z0), ++ z0_res = svld1sb_gather_offset_u64 (p0, x0, z0)) ++ ++/* ++** ld1sb_gather_tied1_u64_u64offset: ++** ld1sb z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sb_gather_tied1_u64_u64offset, svuint64_t, int8_t, svuint64_t, ++ z0_res = svld1sb_gather_u64offset_u64 (p0, x0, z0), ++ z0_res = svld1sb_gather_offset_u64 (p0, x0, z0)) ++ ++/* ++** ld1sb_gather_untied_u64_u64offset: ++** ld1sb z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sb_gather_untied_u64_u64offset, svuint64_t, int8_t, svuint64_t, ++ z0_res = svld1sb_gather_u64offset_u64 (p0, x0, z1), ++ z0_res = svld1sb_gather_offset_u64 (p0, x0, z1)) ++ ++/* ++** ld1sb_gather_ext_u64_u64offset: ++** ld1sb z0\.d, p0/z, \[x0, z1\.d, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sb_gather_ext_u64_u64offset, svuint64_t, int8_t, svuint64_t, ++ z0_res = svld1sb_gather_u64offset_u64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svld1sb_gather_offset_u64 (p0, x0, svextw_x (p0, z1))) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_s16.c +new file mode 100644 +index 000000000..70a793c14 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_s16.c +@@ -0,0 +1,162 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1sb_s16_base: ++** ld1sb z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sb_s16_base, svint16_t, int8_t, ++ z0 = svld1sb_s16 (p0, x0), ++ z0 = svld1sb_s16 (p0, x0)) ++ ++/* ++** ld1sb_s16_index: ++** ld1sb z0\.h, p0/z, \[x0, x1\] ++** ret ++*/ ++TEST_LOAD (ld1sb_s16_index, svint16_t, int8_t, ++ z0 = svld1sb_s16 (p0, x0 + x1), ++ z0 = svld1sb_s16 (p0, x0 + x1)) ++ ++/* ++** ld1sb_s16_1: ++** ld1sb z0\.h, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sb_s16_1, svint16_t, int8_t, ++ z0 = svld1sb_s16 (p0, x0 + svcnth ()), ++ z0 = svld1sb_s16 (p0, x0 + svcnth ())) ++ ++/* ++** ld1sb_s16_7: ++** ld1sb z0\.h, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sb_s16_7, svint16_t, int8_t, ++ z0 = svld1sb_s16 (p0, x0 + svcnth () * 7), ++ z0 = svld1sb_s16 (p0, x0 + svcnth () * 7)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld1sb_s16_8: ++** incb x0, all, mul #4 ++** ld1sb z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sb_s16_8, svint16_t, int8_t, ++ z0 = svld1sb_s16 (p0, x0 + svcnth () * 8), ++ z0 = svld1sb_s16 (p0, x0 + svcnth () * 8)) ++ ++/* ++** ld1sb_s16_m1: ++** ld1sb z0\.h, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sb_s16_m1, svint16_t, int8_t, ++ z0 = svld1sb_s16 (p0, x0 - svcnth ()), ++ z0 = svld1sb_s16 (p0, x0 - svcnth ())) ++ ++/* ++** ld1sb_s16_m8: ++** ld1sb z0\.h, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sb_s16_m8, svint16_t, int8_t, ++ z0 = svld1sb_s16 (p0, x0 - svcnth () * 8), ++ z0 = svld1sb_s16 (p0, x0 - svcnth () * 8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1sb_s16_m9: ++** dech x0, all, mul #9 ++** ld1sb z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sb_s16_m9, svint16_t, int8_t, ++ z0 = svld1sb_s16 (p0, x0 - svcnth () * 9), ++ z0 = svld1sb_s16 (p0, x0 - svcnth () * 9)) ++ ++/* ++** ld1sb_vnum_s16_0: ++** ld1sb z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sb_vnum_s16_0, svint16_t, int8_t, ++ z0 = svld1sb_vnum_s16 (p0, x0, 0), ++ z0 = svld1sb_vnum_s16 (p0, x0, 0)) ++ ++/* ++** ld1sb_vnum_s16_1: ++** ld1sb z0\.h, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sb_vnum_s16_1, svint16_t, int8_t, ++ z0 = svld1sb_vnum_s16 (p0, x0, 1), ++ z0 = svld1sb_vnum_s16 (p0, x0, 1)) ++ ++/* ++** ld1sb_vnum_s16_7: ++** ld1sb z0\.h, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sb_vnum_s16_7, svint16_t, int8_t, ++ z0 = svld1sb_vnum_s16 (p0, x0, 7), ++ z0 = svld1sb_vnum_s16 (p0, x0, 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1sb_vnum_s16_8: ++** incb x0, all, mul #4 ++** ld1sb z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sb_vnum_s16_8, svint16_t, int8_t, ++ z0 = svld1sb_vnum_s16 (p0, x0, 8), ++ z0 = svld1sb_vnum_s16 (p0, x0, 8)) ++ ++/* ++** ld1sb_vnum_s16_m1: ++** ld1sb z0\.h, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sb_vnum_s16_m1, svint16_t, int8_t, ++ z0 = svld1sb_vnum_s16 (p0, x0, -1), ++ z0 = svld1sb_vnum_s16 (p0, x0, -1)) ++ ++/* ++** ld1sb_vnum_s16_m8: ++** ld1sb z0\.h, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sb_vnum_s16_m8, svint16_t, int8_t, ++ z0 = svld1sb_vnum_s16 (p0, x0, -8), ++ z0 = svld1sb_vnum_s16 (p0, x0, -8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1sb_vnum_s16_m9: ++** dech x0, all, mul #9 ++** ld1sb z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sb_vnum_s16_m9, svint16_t, int8_t, ++ z0 = svld1sb_vnum_s16 (p0, x0, -9), ++ z0 = svld1sb_vnum_s16 (p0, x0, -9)) ++ ++/* ++** ld1sb_vnum_s16_x1: ++** cnth (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ld1sb z0\.h, p0/z, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** ld1sb z0\.h, p0/z, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_LOAD (ld1sb_vnum_s16_x1, svint16_t, int8_t, ++ z0 = svld1sb_vnum_s16 (p0, x0, x1), ++ z0 = svld1sb_vnum_s16 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_s32.c +new file mode 100644 +index 000000000..74b3a321b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_s32.c +@@ -0,0 +1,162 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1sb_s32_base: ++** ld1sb z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sb_s32_base, svint32_t, int8_t, ++ z0 = svld1sb_s32 (p0, x0), ++ z0 = svld1sb_s32 (p0, x0)) ++ ++/* ++** ld1sb_s32_index: ++** ld1sb z0\.s, p0/z, \[x0, x1\] ++** ret ++*/ ++TEST_LOAD (ld1sb_s32_index, svint32_t, int8_t, ++ z0 = svld1sb_s32 (p0, x0 + x1), ++ z0 = svld1sb_s32 (p0, x0 + x1)) ++ ++/* ++** ld1sb_s32_1: ++** ld1sb z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sb_s32_1, svint32_t, int8_t, ++ z0 = svld1sb_s32 (p0, x0 + svcntw ()), ++ z0 = svld1sb_s32 (p0, x0 + svcntw ())) ++ ++/* ++** ld1sb_s32_7: ++** ld1sb z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sb_s32_7, svint32_t, int8_t, ++ z0 = svld1sb_s32 (p0, x0 + svcntw () * 7), ++ z0 = svld1sb_s32 (p0, x0 + svcntw () * 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1sb_s32_8: ++** incb x0, all, mul #2 ++** ld1sb z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sb_s32_8, svint32_t, int8_t, ++ z0 = svld1sb_s32 (p0, x0 + svcntw () * 8), ++ z0 = svld1sb_s32 (p0, x0 + svcntw () * 8)) ++ ++/* ++** ld1sb_s32_m1: ++** ld1sb z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sb_s32_m1, svint32_t, int8_t, ++ z0 = svld1sb_s32 (p0, x0 - svcntw ()), ++ z0 = svld1sb_s32 (p0, x0 - svcntw ())) ++ ++/* ++** ld1sb_s32_m8: ++** ld1sb z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sb_s32_m8, svint32_t, int8_t, ++ z0 = svld1sb_s32 (p0, x0 - svcntw () * 8), ++ z0 = svld1sb_s32 (p0, x0 - svcntw () * 8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1sb_s32_m9: ++** decw x0, all, mul #9 ++** ld1sb z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sb_s32_m9, svint32_t, int8_t, ++ z0 = svld1sb_s32 (p0, x0 - svcntw () * 9), ++ z0 = svld1sb_s32 (p0, x0 - svcntw () * 9)) ++ ++/* ++** ld1sb_vnum_s32_0: ++** ld1sb z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sb_vnum_s32_0, svint32_t, int8_t, ++ z0 = svld1sb_vnum_s32 (p0, x0, 0), ++ z0 = svld1sb_vnum_s32 (p0, x0, 0)) ++ ++/* ++** ld1sb_vnum_s32_1: ++** ld1sb z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sb_vnum_s32_1, svint32_t, int8_t, ++ z0 = svld1sb_vnum_s32 (p0, x0, 1), ++ z0 = svld1sb_vnum_s32 (p0, x0, 1)) ++ ++/* ++** ld1sb_vnum_s32_7: ++** ld1sb z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sb_vnum_s32_7, svint32_t, int8_t, ++ z0 = svld1sb_vnum_s32 (p0, x0, 7), ++ z0 = svld1sb_vnum_s32 (p0, x0, 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1sb_vnum_s32_8: ++** incb x0, all, mul #2 ++** ld1sb z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sb_vnum_s32_8, svint32_t, int8_t, ++ z0 = svld1sb_vnum_s32 (p0, x0, 8), ++ z0 = svld1sb_vnum_s32 (p0, x0, 8)) ++ ++/* ++** ld1sb_vnum_s32_m1: ++** ld1sb z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sb_vnum_s32_m1, svint32_t, int8_t, ++ z0 = svld1sb_vnum_s32 (p0, x0, -1), ++ z0 = svld1sb_vnum_s32 (p0, x0, -1)) ++ ++/* ++** ld1sb_vnum_s32_m8: ++** ld1sb z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sb_vnum_s32_m8, svint32_t, int8_t, ++ z0 = svld1sb_vnum_s32 (p0, x0, -8), ++ z0 = svld1sb_vnum_s32 (p0, x0, -8)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld1sb_vnum_s32_m9: ++** decw x0, all, mul #9 ++** ld1sb z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sb_vnum_s32_m9, svint32_t, int8_t, ++ z0 = svld1sb_vnum_s32 (p0, x0, -9), ++ z0 = svld1sb_vnum_s32 (p0, x0, -9)) ++ ++/* ++** ld1sb_vnum_s32_x1: ++** cntw (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ld1sb z0\.s, p0/z, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** ld1sb z0\.s, p0/z, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_LOAD (ld1sb_vnum_s32_x1, svint32_t, int8_t, ++ z0 = svld1sb_vnum_s32 (p0, x0, x1), ++ z0 = svld1sb_vnum_s32 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_s64.c +new file mode 100644 +index 000000000..1984e1956 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_s64.c +@@ -0,0 +1,162 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1sb_s64_base: ++** ld1sb z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sb_s64_base, svint64_t, int8_t, ++ z0 = svld1sb_s64 (p0, x0), ++ z0 = svld1sb_s64 (p0, x0)) ++ ++/* ++** ld1sb_s64_index: ++** ld1sb z0\.d, p0/z, \[x0, x1\] ++** ret ++*/ ++TEST_LOAD (ld1sb_s64_index, svint64_t, int8_t, ++ z0 = svld1sb_s64 (p0, x0 + x1), ++ z0 = svld1sb_s64 (p0, x0 + x1)) ++ ++/* ++** ld1sb_s64_1: ++** ld1sb z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sb_s64_1, svint64_t, int8_t, ++ z0 = svld1sb_s64 (p0, x0 + svcntd ()), ++ z0 = svld1sb_s64 (p0, x0 + svcntd ())) ++ ++/* ++** ld1sb_s64_7: ++** ld1sb z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sb_s64_7, svint64_t, int8_t, ++ z0 = svld1sb_s64 (p0, x0 + svcntd () * 7), ++ z0 = svld1sb_s64 (p0, x0 + svcntd () * 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1sb_s64_8: ++** incb x0 ++** ld1sb z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sb_s64_8, svint64_t, int8_t, ++ z0 = svld1sb_s64 (p0, x0 + svcntd () * 8), ++ z0 = svld1sb_s64 (p0, x0 + svcntd () * 8)) ++ ++/* ++** ld1sb_s64_m1: ++** ld1sb z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sb_s64_m1, svint64_t, int8_t, ++ z0 = svld1sb_s64 (p0, x0 - svcntd ()), ++ z0 = svld1sb_s64 (p0, x0 - svcntd ())) ++ ++/* ++** ld1sb_s64_m8: ++** ld1sb z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sb_s64_m8, svint64_t, int8_t, ++ z0 = svld1sb_s64 (p0, x0 - svcntd () * 8), ++ z0 = svld1sb_s64 (p0, x0 - svcntd () * 8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1sb_s64_m9: ++** decd x0, all, mul #9 ++** ld1sb z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sb_s64_m9, svint64_t, int8_t, ++ z0 = svld1sb_s64 (p0, x0 - svcntd () * 9), ++ z0 = svld1sb_s64 (p0, x0 - svcntd () * 9)) ++ ++/* ++** ld1sb_vnum_s64_0: ++** ld1sb z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sb_vnum_s64_0, svint64_t, int8_t, ++ z0 = svld1sb_vnum_s64 (p0, x0, 0), ++ z0 = svld1sb_vnum_s64 (p0, x0, 0)) ++ ++/* ++** ld1sb_vnum_s64_1: ++** ld1sb z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sb_vnum_s64_1, svint64_t, int8_t, ++ z0 = svld1sb_vnum_s64 (p0, x0, 1), ++ z0 = svld1sb_vnum_s64 (p0, x0, 1)) ++ ++/* ++** ld1sb_vnum_s64_7: ++** ld1sb z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sb_vnum_s64_7, svint64_t, int8_t, ++ z0 = svld1sb_vnum_s64 (p0, x0, 7), ++ z0 = svld1sb_vnum_s64 (p0, x0, 7)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld1sb_vnum_s64_8: ++** incb x0 ++** ld1sb z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sb_vnum_s64_8, svint64_t, int8_t, ++ z0 = svld1sb_vnum_s64 (p0, x0, 8), ++ z0 = svld1sb_vnum_s64 (p0, x0, 8)) ++ ++/* ++** ld1sb_vnum_s64_m1: ++** ld1sb z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sb_vnum_s64_m1, svint64_t, int8_t, ++ z0 = svld1sb_vnum_s64 (p0, x0, -1), ++ z0 = svld1sb_vnum_s64 (p0, x0, -1)) ++ ++/* ++** ld1sb_vnum_s64_m8: ++** ld1sb z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sb_vnum_s64_m8, svint64_t, int8_t, ++ z0 = svld1sb_vnum_s64 (p0, x0, -8), ++ z0 = svld1sb_vnum_s64 (p0, x0, -8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1sb_vnum_s64_m9: ++** decd x0, all, mul #9 ++** ld1sb z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sb_vnum_s64_m9, svint64_t, int8_t, ++ z0 = svld1sb_vnum_s64 (p0, x0, -9), ++ z0 = svld1sb_vnum_s64 (p0, x0, -9)) ++ ++/* ++** ld1sb_vnum_s64_x1: ++** cntd (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ld1sb z0\.d, p0/z, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** ld1sb z0\.d, p0/z, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_LOAD (ld1sb_vnum_s64_x1, svint64_t, int8_t, ++ z0 = svld1sb_vnum_s64 (p0, x0, x1), ++ z0 = svld1sb_vnum_s64 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_u16.c +new file mode 100644 +index 000000000..cfa616251 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_u16.c +@@ -0,0 +1,162 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1sb_u16_base: ++** ld1sb z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sb_u16_base, svuint16_t, int8_t, ++ z0 = svld1sb_u16 (p0, x0), ++ z0 = svld1sb_u16 (p0, x0)) ++ ++/* ++** ld1sb_u16_index: ++** ld1sb z0\.h, p0/z, \[x0, x1\] ++** ret ++*/ ++TEST_LOAD (ld1sb_u16_index, svuint16_t, int8_t, ++ z0 = svld1sb_u16 (p0, x0 + x1), ++ z0 = svld1sb_u16 (p0, x0 + x1)) ++ ++/* ++** ld1sb_u16_1: ++** ld1sb z0\.h, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sb_u16_1, svuint16_t, int8_t, ++ z0 = svld1sb_u16 (p0, x0 + svcnth ()), ++ z0 = svld1sb_u16 (p0, x0 + svcnth ())) ++ ++/* ++** ld1sb_u16_7: ++** ld1sb z0\.h, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sb_u16_7, svuint16_t, int8_t, ++ z0 = svld1sb_u16 (p0, x0 + svcnth () * 7), ++ z0 = svld1sb_u16 (p0, x0 + svcnth () * 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1sb_u16_8: ++** incb x0, all, mul #4 ++** ld1sb z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sb_u16_8, svuint16_t, int8_t, ++ z0 = svld1sb_u16 (p0, x0 + svcnth () * 8), ++ z0 = svld1sb_u16 (p0, x0 + svcnth () * 8)) ++ ++/* ++** ld1sb_u16_m1: ++** ld1sb z0\.h, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sb_u16_m1, svuint16_t, int8_t, ++ z0 = svld1sb_u16 (p0, x0 - svcnth ()), ++ z0 = svld1sb_u16 (p0, x0 - svcnth ())) ++ ++/* ++** ld1sb_u16_m8: ++** ld1sb z0\.h, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sb_u16_m8, svuint16_t, int8_t, ++ z0 = svld1sb_u16 (p0, x0 - svcnth () * 8), ++ z0 = svld1sb_u16 (p0, x0 - svcnth () * 8)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld1sb_u16_m9: ++** dech x0, all, mul #9 ++** ld1sb z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sb_u16_m9, svuint16_t, int8_t, ++ z0 = svld1sb_u16 (p0, x0 - svcnth () * 9), ++ z0 = svld1sb_u16 (p0, x0 - svcnth () * 9)) ++ ++/* ++** ld1sb_vnum_u16_0: ++** ld1sb z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sb_vnum_u16_0, svuint16_t, int8_t, ++ z0 = svld1sb_vnum_u16 (p0, x0, 0), ++ z0 = svld1sb_vnum_u16 (p0, x0, 0)) ++ ++/* ++** ld1sb_vnum_u16_1: ++** ld1sb z0\.h, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sb_vnum_u16_1, svuint16_t, int8_t, ++ z0 = svld1sb_vnum_u16 (p0, x0, 1), ++ z0 = svld1sb_vnum_u16 (p0, x0, 1)) ++ ++/* ++** ld1sb_vnum_u16_7: ++** ld1sb z0\.h, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sb_vnum_u16_7, svuint16_t, int8_t, ++ z0 = svld1sb_vnum_u16 (p0, x0, 7), ++ z0 = svld1sb_vnum_u16 (p0, x0, 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1sb_vnum_u16_8: ++** incb x0, all, mul #4 ++** ld1sb z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sb_vnum_u16_8, svuint16_t, int8_t, ++ z0 = svld1sb_vnum_u16 (p0, x0, 8), ++ z0 = svld1sb_vnum_u16 (p0, x0, 8)) ++ ++/* ++** ld1sb_vnum_u16_m1: ++** ld1sb z0\.h, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sb_vnum_u16_m1, svuint16_t, int8_t, ++ z0 = svld1sb_vnum_u16 (p0, x0, -1), ++ z0 = svld1sb_vnum_u16 (p0, x0, -1)) ++ ++/* ++** ld1sb_vnum_u16_m8: ++** ld1sb z0\.h, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sb_vnum_u16_m8, svuint16_t, int8_t, ++ z0 = svld1sb_vnum_u16 (p0, x0, -8), ++ z0 = svld1sb_vnum_u16 (p0, x0, -8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1sb_vnum_u16_m9: ++** dech x0, all, mul #9 ++** ld1sb z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sb_vnum_u16_m9, svuint16_t, int8_t, ++ z0 = svld1sb_vnum_u16 (p0, x0, -9), ++ z0 = svld1sb_vnum_u16 (p0, x0, -9)) ++ ++/* ++** ld1sb_vnum_u16_x1: ++** cnth (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ld1sb z0\.h, p0/z, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** ld1sb z0\.h, p0/z, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_LOAD (ld1sb_vnum_u16_x1, svuint16_t, int8_t, ++ z0 = svld1sb_vnum_u16 (p0, x0, x1), ++ z0 = svld1sb_vnum_u16 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_u32.c +new file mode 100644 +index 000000000..990ae5e1b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_u32.c +@@ -0,0 +1,162 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1sb_u32_base: ++** ld1sb z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sb_u32_base, svuint32_t, int8_t, ++ z0 = svld1sb_u32 (p0, x0), ++ z0 = svld1sb_u32 (p0, x0)) ++ ++/* ++** ld1sb_u32_index: ++** ld1sb z0\.s, p0/z, \[x0, x1\] ++** ret ++*/ ++TEST_LOAD (ld1sb_u32_index, svuint32_t, int8_t, ++ z0 = svld1sb_u32 (p0, x0 + x1), ++ z0 = svld1sb_u32 (p0, x0 + x1)) ++ ++/* ++** ld1sb_u32_1: ++** ld1sb z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sb_u32_1, svuint32_t, int8_t, ++ z0 = svld1sb_u32 (p0, x0 + svcntw ()), ++ z0 = svld1sb_u32 (p0, x0 + svcntw ())) ++ ++/* ++** ld1sb_u32_7: ++** ld1sb z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sb_u32_7, svuint32_t, int8_t, ++ z0 = svld1sb_u32 (p0, x0 + svcntw () * 7), ++ z0 = svld1sb_u32 (p0, x0 + svcntw () * 7)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld1sb_u32_8: ++** incb x0, all, mul #2 ++** ld1sb z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sb_u32_8, svuint32_t, int8_t, ++ z0 = svld1sb_u32 (p0, x0 + svcntw () * 8), ++ z0 = svld1sb_u32 (p0, x0 + svcntw () * 8)) ++ ++/* ++** ld1sb_u32_m1: ++** ld1sb z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sb_u32_m1, svuint32_t, int8_t, ++ z0 = svld1sb_u32 (p0, x0 - svcntw ()), ++ z0 = svld1sb_u32 (p0, x0 - svcntw ())) ++ ++/* ++** ld1sb_u32_m8: ++** ld1sb z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sb_u32_m8, svuint32_t, int8_t, ++ z0 = svld1sb_u32 (p0, x0 - svcntw () * 8), ++ z0 = svld1sb_u32 (p0, x0 - svcntw () * 8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1sb_u32_m9: ++** decw x0, all, mul #9 ++** ld1sb z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sb_u32_m9, svuint32_t, int8_t, ++ z0 = svld1sb_u32 (p0, x0 - svcntw () * 9), ++ z0 = svld1sb_u32 (p0, x0 - svcntw () * 9)) ++ ++/* ++** ld1sb_vnum_u32_0: ++** ld1sb z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sb_vnum_u32_0, svuint32_t, int8_t, ++ z0 = svld1sb_vnum_u32 (p0, x0, 0), ++ z0 = svld1sb_vnum_u32 (p0, x0, 0)) ++ ++/* ++** ld1sb_vnum_u32_1: ++** ld1sb z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sb_vnum_u32_1, svuint32_t, int8_t, ++ z0 = svld1sb_vnum_u32 (p0, x0, 1), ++ z0 = svld1sb_vnum_u32 (p0, x0, 1)) ++ ++/* ++** ld1sb_vnum_u32_7: ++** ld1sb z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sb_vnum_u32_7, svuint32_t, int8_t, ++ z0 = svld1sb_vnum_u32 (p0, x0, 7), ++ z0 = svld1sb_vnum_u32 (p0, x0, 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1sb_vnum_u32_8: ++** incb x0, all, mul #2 ++** ld1sb z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sb_vnum_u32_8, svuint32_t, int8_t, ++ z0 = svld1sb_vnum_u32 (p0, x0, 8), ++ z0 = svld1sb_vnum_u32 (p0, x0, 8)) ++ ++/* ++** ld1sb_vnum_u32_m1: ++** ld1sb z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sb_vnum_u32_m1, svuint32_t, int8_t, ++ z0 = svld1sb_vnum_u32 (p0, x0, -1), ++ z0 = svld1sb_vnum_u32 (p0, x0, -1)) ++ ++/* ++** ld1sb_vnum_u32_m8: ++** ld1sb z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sb_vnum_u32_m8, svuint32_t, int8_t, ++ z0 = svld1sb_vnum_u32 (p0, x0, -8), ++ z0 = svld1sb_vnum_u32 (p0, x0, -8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1sb_vnum_u32_m9: ++** decw x0, all, mul #9 ++** ld1sb z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sb_vnum_u32_m9, svuint32_t, int8_t, ++ z0 = svld1sb_vnum_u32 (p0, x0, -9), ++ z0 = svld1sb_vnum_u32 (p0, x0, -9)) ++ ++/* ++** ld1sb_vnum_u32_x1: ++** cntw (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ld1sb z0\.s, p0/z, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** ld1sb z0\.s, p0/z, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_LOAD (ld1sb_vnum_u32_x1, svuint32_t, int8_t, ++ z0 = svld1sb_vnum_u32 (p0, x0, x1), ++ z0 = svld1sb_vnum_u32 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_u64.c +new file mode 100644 +index 000000000..8051bf140 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_u64.c +@@ -0,0 +1,162 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1sb_u64_base: ++** ld1sb z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sb_u64_base, svuint64_t, int8_t, ++ z0 = svld1sb_u64 (p0, x0), ++ z0 = svld1sb_u64 (p0, x0)) ++ ++/* ++** ld1sb_u64_index: ++** ld1sb z0\.d, p0/z, \[x0, x1\] ++** ret ++*/ ++TEST_LOAD (ld1sb_u64_index, svuint64_t, int8_t, ++ z0 = svld1sb_u64 (p0, x0 + x1), ++ z0 = svld1sb_u64 (p0, x0 + x1)) ++ ++/* ++** ld1sb_u64_1: ++** ld1sb z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sb_u64_1, svuint64_t, int8_t, ++ z0 = svld1sb_u64 (p0, x0 + svcntd ()), ++ z0 = svld1sb_u64 (p0, x0 + svcntd ())) ++ ++/* ++** ld1sb_u64_7: ++** ld1sb z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sb_u64_7, svuint64_t, int8_t, ++ z0 = svld1sb_u64 (p0, x0 + svcntd () * 7), ++ z0 = svld1sb_u64 (p0, x0 + svcntd () * 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1sb_u64_8: ++** incb x0 ++** ld1sb z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sb_u64_8, svuint64_t, int8_t, ++ z0 = svld1sb_u64 (p0, x0 + svcntd () * 8), ++ z0 = svld1sb_u64 (p0, x0 + svcntd () * 8)) ++ ++/* ++** ld1sb_u64_m1: ++** ld1sb z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sb_u64_m1, svuint64_t, int8_t, ++ z0 = svld1sb_u64 (p0, x0 - svcntd ()), ++ z0 = svld1sb_u64 (p0, x0 - svcntd ())) ++ ++/* ++** ld1sb_u64_m8: ++** ld1sb z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sb_u64_m8, svuint64_t, int8_t, ++ z0 = svld1sb_u64 (p0, x0 - svcntd () * 8), ++ z0 = svld1sb_u64 (p0, x0 - svcntd () * 8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1sb_u64_m9: ++** decd x0, all, mul #9 ++** ld1sb z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sb_u64_m9, svuint64_t, int8_t, ++ z0 = svld1sb_u64 (p0, x0 - svcntd () * 9), ++ z0 = svld1sb_u64 (p0, x0 - svcntd () * 9)) ++ ++/* ++** ld1sb_vnum_u64_0: ++** ld1sb z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sb_vnum_u64_0, svuint64_t, int8_t, ++ z0 = svld1sb_vnum_u64 (p0, x0, 0), ++ z0 = svld1sb_vnum_u64 (p0, x0, 0)) ++ ++/* ++** ld1sb_vnum_u64_1: ++** ld1sb z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sb_vnum_u64_1, svuint64_t, int8_t, ++ z0 = svld1sb_vnum_u64 (p0, x0, 1), ++ z0 = svld1sb_vnum_u64 (p0, x0, 1)) ++ ++/* ++** ld1sb_vnum_u64_7: ++** ld1sb z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sb_vnum_u64_7, svuint64_t, int8_t, ++ z0 = svld1sb_vnum_u64 (p0, x0, 7), ++ z0 = svld1sb_vnum_u64 (p0, x0, 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1sb_vnum_u64_8: ++** incb x0 ++** ld1sb z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sb_vnum_u64_8, svuint64_t, int8_t, ++ z0 = svld1sb_vnum_u64 (p0, x0, 8), ++ z0 = svld1sb_vnum_u64 (p0, x0, 8)) ++ ++/* ++** ld1sb_vnum_u64_m1: ++** ld1sb z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sb_vnum_u64_m1, svuint64_t, int8_t, ++ z0 = svld1sb_vnum_u64 (p0, x0, -1), ++ z0 = svld1sb_vnum_u64 (p0, x0, -1)) ++ ++/* ++** ld1sb_vnum_u64_m8: ++** ld1sb z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sb_vnum_u64_m8, svuint64_t, int8_t, ++ z0 = svld1sb_vnum_u64 (p0, x0, -8), ++ z0 = svld1sb_vnum_u64 (p0, x0, -8)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld1sb_vnum_u64_m9: ++** decd x0, all, mul #9 ++** ld1sb z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sb_vnum_u64_m9, svuint64_t, int8_t, ++ z0 = svld1sb_vnum_u64 (p0, x0, -9), ++ z0 = svld1sb_vnum_u64 (p0, x0, -9)) ++ ++/* ++** ld1sb_vnum_u64_x1: ++** cntd (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ld1sb z0\.d, p0/z, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** ld1sb z0\.d, p0/z, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_LOAD (ld1sb_vnum_u64_x1, svuint64_t, int8_t, ++ z0 = svld1sb_vnum_u64 (p0, x0, x1), ++ z0 = svld1sb_vnum_u64 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sh_gather_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sh_gather_s32.c +new file mode 100644 +index 000000000..ed07b4dfc +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sh_gather_s32.c +@@ -0,0 +1,252 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1sh_gather_s32_tied1: ++** ld1sh z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_s32_tied1, svint32_t, svuint32_t, ++ z0_res = svld1sh_gather_u32base_s32 (p0, z0), ++ z0_res = svld1sh_gather_s32 (p0, z0)) ++ ++/* ++** ld1sh_gather_s32_untied: ++** ld1sh z0\.s, p0/z, \[z1\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_s32_untied, svint32_t, svuint32_t, ++ z0_res = svld1sh_gather_u32base_s32 (p0, z1), ++ z0_res = svld1sh_gather_s32 (p0, z1)) ++ ++/* ++** ld1sh_gather_x0_s32_offset: ++** ld1sh z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_x0_s32_offset, svint32_t, svuint32_t, ++ z0_res = svld1sh_gather_u32base_offset_s32 (p0, z0, x0), ++ z0_res = svld1sh_gather_offset_s32 (p0, z0, x0)) ++ ++/* ++** ld1sh_gather_m2_s32_offset: ++** mov (x[0-9]+), #?-2 ++** ld1sh z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_m2_s32_offset, svint32_t, svuint32_t, ++ z0_res = svld1sh_gather_u32base_offset_s32 (p0, z0, -2), ++ z0_res = svld1sh_gather_offset_s32 (p0, z0, -2)) ++ ++/* ++** ld1sh_gather_0_s32_offset: ++** ld1sh z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_0_s32_offset, svint32_t, svuint32_t, ++ z0_res = svld1sh_gather_u32base_offset_s32 (p0, z0, 0), ++ z0_res = svld1sh_gather_offset_s32 (p0, z0, 0)) ++ ++/* ++** ld1sh_gather_5_s32_offset: ++** mov (x[0-9]+), #?5 ++** ld1sh z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_5_s32_offset, svint32_t, svuint32_t, ++ z0_res = svld1sh_gather_u32base_offset_s32 (p0, z0, 5), ++ z0_res = svld1sh_gather_offset_s32 (p0, z0, 5)) ++ ++/* ++** ld1sh_gather_6_s32_offset: ++** ld1sh z0\.s, p0/z, \[z0\.s, #6\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_6_s32_offset, svint32_t, svuint32_t, ++ z0_res = svld1sh_gather_u32base_offset_s32 (p0, z0, 6), ++ z0_res = svld1sh_gather_offset_s32 (p0, z0, 6)) ++ ++/* ++** ld1sh_gather_62_s32_offset: ++** ld1sh z0\.s, p0/z, \[z0\.s, #62\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_62_s32_offset, svint32_t, svuint32_t, ++ z0_res = svld1sh_gather_u32base_offset_s32 (p0, z0, 62), ++ z0_res = svld1sh_gather_offset_s32 (p0, z0, 62)) ++ ++/* ++** ld1sh_gather_64_s32_offset: ++** mov (x[0-9]+), #?64 ++** ld1sh z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_64_s32_offset, svint32_t, svuint32_t, ++ z0_res = svld1sh_gather_u32base_offset_s32 (p0, z0, 64), ++ z0_res = svld1sh_gather_offset_s32 (p0, z0, 64)) ++ 
++/* ++** ld1sh_gather_x0_s32_index: ++** lsl (x[0-9]+), x0, #?1 ++** ld1sh z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_x0_s32_index, svint32_t, svuint32_t, ++ z0_res = svld1sh_gather_u32base_index_s32 (p0, z0, x0), ++ z0_res = svld1sh_gather_index_s32 (p0, z0, x0)) ++ ++/* ++** ld1sh_gather_m1_s32_index: ++** mov (x[0-9]+), #?-2 ++** ld1sh z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_m1_s32_index, svint32_t, svuint32_t, ++ z0_res = svld1sh_gather_u32base_index_s32 (p0, z0, -1), ++ z0_res = svld1sh_gather_index_s32 (p0, z0, -1)) ++ ++/* ++** ld1sh_gather_0_s32_index: ++** ld1sh z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_0_s32_index, svint32_t, svuint32_t, ++ z0_res = svld1sh_gather_u32base_index_s32 (p0, z0, 0), ++ z0_res = svld1sh_gather_index_s32 (p0, z0, 0)) ++ ++/* ++** ld1sh_gather_5_s32_index: ++** ld1sh z0\.s, p0/z, \[z0\.s, #10\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_5_s32_index, svint32_t, svuint32_t, ++ z0_res = svld1sh_gather_u32base_index_s32 (p0, z0, 5), ++ z0_res = svld1sh_gather_index_s32 (p0, z0, 5)) ++ ++/* ++** ld1sh_gather_31_s32_index: ++** ld1sh z0\.s, p0/z, \[z0\.s, #62\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_31_s32_index, svint32_t, svuint32_t, ++ z0_res = svld1sh_gather_u32base_index_s32 (p0, z0, 31), ++ z0_res = svld1sh_gather_index_s32 (p0, z0, 31)) ++ ++/* ++** ld1sh_gather_32_s32_index: ++** mov (x[0-9]+), #?64 ++** ld1sh z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_32_s32_index, svint32_t, svuint32_t, ++ z0_res = svld1sh_gather_u32base_index_s32 (p0, z0, 32), ++ z0_res = svld1sh_gather_index_s32 (p0, z0, 32)) ++ ++/* ++** ld1sh_gather_x0_s32_s32offset: ++** ld1sh z0\.s, p0/z, \[x0, z0\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_x0_s32_s32offset, svint32_t, int16_t, svint32_t, ++ z0_res = svld1sh_gather_s32offset_s32 (p0, x0, z0), ++ z0_res = svld1sh_gather_offset_s32 (p0, x0, z0)) ++ ++/* ++** ld1sh_gather_tied1_s32_s32offset: ++** ld1sh z0\.s, p0/z, \[x0, z0\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_tied1_s32_s32offset, svint32_t, int16_t, svint32_t, ++ z0_res = svld1sh_gather_s32offset_s32 (p0, x0, z0), ++ z0_res = svld1sh_gather_offset_s32 (p0, x0, z0)) ++ ++/* ++** ld1sh_gather_untied_s32_s32offset: ++** ld1sh z0\.s, p0/z, \[x0, z1\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_untied_s32_s32offset, svint32_t, int16_t, svint32_t, ++ z0_res = svld1sh_gather_s32offset_s32 (p0, x0, z1), ++ z0_res = svld1sh_gather_offset_s32 (p0, x0, z1)) ++ ++/* ++** ld1sh_gather_x0_s32_u32offset: ++** ld1sh z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_x0_s32_u32offset, svint32_t, int16_t, svuint32_t, ++ z0_res = svld1sh_gather_u32offset_s32 (p0, x0, z0), ++ z0_res = svld1sh_gather_offset_s32 (p0, x0, z0)) ++ ++/* ++** ld1sh_gather_tied1_s32_u32offset: ++** ld1sh z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_tied1_s32_u32offset, svint32_t, int16_t, svuint32_t, ++ z0_res = svld1sh_gather_u32offset_s32 (p0, x0, z0), ++ z0_res = svld1sh_gather_offset_s32 (p0, x0, z0)) ++ ++/* ++** ld1sh_gather_untied_s32_u32offset: ++** ld1sh z0\.s, p0/z, \[x0, z1\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_untied_s32_u32offset, svint32_t, int16_t, svuint32_t, ++ z0_res = svld1sh_gather_u32offset_s32 (p0, x0, z1), ++ z0_res = svld1sh_gather_offset_s32 (p0, x0, z1)) ++ ++/* ++** 
ld1sh_gather_x0_s32_s32index: ++** ld1sh z0\.s, p0/z, \[x0, z0\.s, sxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_x0_s32_s32index, svint32_t, int16_t, svint32_t, ++ z0_res = svld1sh_gather_s32index_s32 (p0, x0, z0), ++ z0_res = svld1sh_gather_index_s32 (p0, x0, z0)) ++ ++/* ++** ld1sh_gather_tied1_s32_s32index: ++** ld1sh z0\.s, p0/z, \[x0, z0\.s, sxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_tied1_s32_s32index, svint32_t, int16_t, svint32_t, ++ z0_res = svld1sh_gather_s32index_s32 (p0, x0, z0), ++ z0_res = svld1sh_gather_index_s32 (p0, x0, z0)) ++ ++/* ++** ld1sh_gather_untied_s32_s32index: ++** ld1sh z0\.s, p0/z, \[x0, z1\.s, sxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_untied_s32_s32index, svint32_t, int16_t, svint32_t, ++ z0_res = svld1sh_gather_s32index_s32 (p0, x0, z1), ++ z0_res = svld1sh_gather_index_s32 (p0, x0, z1)) ++ ++/* ++** ld1sh_gather_x0_s32_u32index: ++** ld1sh z0\.s, p0/z, \[x0, z0\.s, uxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_x0_s32_u32index, svint32_t, int16_t, svuint32_t, ++ z0_res = svld1sh_gather_u32index_s32 (p0, x0, z0), ++ z0_res = svld1sh_gather_index_s32 (p0, x0, z0)) ++ ++/* ++** ld1sh_gather_tied1_s32_u32index: ++** ld1sh z0\.s, p0/z, \[x0, z0\.s, uxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_tied1_s32_u32index, svint32_t, int16_t, svuint32_t, ++ z0_res = svld1sh_gather_u32index_s32 (p0, x0, z0), ++ z0_res = svld1sh_gather_index_s32 (p0, x0, z0)) ++ ++/* ++** ld1sh_gather_untied_s32_u32index: ++** ld1sh z0\.s, p0/z, \[x0, z1\.s, uxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_untied_s32_u32index, svint32_t, int16_t, svuint32_t, ++ z0_res = svld1sh_gather_u32index_s32 (p0, x0, z1), ++ z0_res = svld1sh_gather_index_s32 (p0, x0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sh_gather_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sh_gather_s64.c +new file mode 100644 +index 000000000..20ca42720 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sh_gather_s64.c +@@ -0,0 +1,288 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1sh_gather_s64_tied1: ++** ld1sh z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_s64_tied1, svint64_t, svuint64_t, ++ z0_res = svld1sh_gather_u64base_s64 (p0, z0), ++ z0_res = svld1sh_gather_s64 (p0, z0)) ++ ++/* ++** ld1sh_gather_s64_untied: ++** ld1sh z0\.d, p0/z, \[z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_s64_untied, svint64_t, svuint64_t, ++ z0_res = svld1sh_gather_u64base_s64 (p0, z1), ++ z0_res = svld1sh_gather_s64 (p0, z1)) ++ ++/* ++** ld1sh_gather_x0_s64_offset: ++** ld1sh z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_x0_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1sh_gather_u64base_offset_s64 (p0, z0, x0), ++ z0_res = svld1sh_gather_offset_s64 (p0, z0, x0)) ++ ++/* ++** ld1sh_gather_m2_s64_offset: ++** mov (x[0-9]+), #?-2 ++** ld1sh z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_m2_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1sh_gather_u64base_offset_s64 (p0, z0, -2), ++ z0_res = svld1sh_gather_offset_s64 (p0, z0, -2)) ++ ++/* ++** ld1sh_gather_0_s64_offset: ++** ld1sh z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_0_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1sh_gather_u64base_offset_s64 (p0, z0, 0), ++ z0_res = svld1sh_gather_offset_s64 (p0, z0, 0)) ++ ++/* ++** ld1sh_gather_5_s64_offset: ++** mov (x[0-9]+), #?5 ++** ld1sh z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_5_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1sh_gather_u64base_offset_s64 (p0, z0, 5), ++ z0_res = svld1sh_gather_offset_s64 (p0, z0, 5)) ++ ++/* ++** ld1sh_gather_6_s64_offset: ++** ld1sh z0\.d, p0/z, \[z0\.d, #6\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_6_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1sh_gather_u64base_offset_s64 (p0, z0, 6), ++ z0_res = svld1sh_gather_offset_s64 (p0, z0, 6)) ++ ++/* ++** ld1sh_gather_62_s64_offset: ++** ld1sh z0\.d, p0/z, \[z0\.d, #62\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_62_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1sh_gather_u64base_offset_s64 (p0, z0, 62), ++ z0_res = svld1sh_gather_offset_s64 (p0, z0, 62)) ++ ++/* ++** ld1sh_gather_64_s64_offset: ++** mov (x[0-9]+), #?64 ++** ld1sh z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_64_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1sh_gather_u64base_offset_s64 (p0, z0, 64), ++ z0_res = svld1sh_gather_offset_s64 (p0, z0, 64)) ++ ++/* ++** ld1sh_gather_x0_s64_index: ++** lsl (x[0-9]+), x0, #?1 ++** ld1sh z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_x0_s64_index, svint64_t, svuint64_t, ++ z0_res = svld1sh_gather_u64base_index_s64 (p0, z0, x0), ++ z0_res = svld1sh_gather_index_s64 (p0, z0, x0)) ++ ++/* ++** ld1sh_gather_m1_s64_index: ++** mov (x[0-9]+), #?-2 ++** ld1sh z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_m1_s64_index, svint64_t, svuint64_t, ++ z0_res = svld1sh_gather_u64base_index_s64 (p0, z0, -1), ++ z0_res = svld1sh_gather_index_s64 (p0, z0, -1)) ++ ++/* ++** ld1sh_gather_0_s64_index: ++** ld1sh z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_0_s64_index, svint64_t, svuint64_t, ++ z0_res = svld1sh_gather_u64base_index_s64 (p0, z0, 0), ++ z0_res = svld1sh_gather_index_s64 (p0, z0, 0)) ++ ++/* ++** ld1sh_gather_5_s64_index: ++** ld1sh z0\.d, p0/z, \[z0\.d, #10\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS 
(ld1sh_gather_5_s64_index, svint64_t, svuint64_t, ++ z0_res = svld1sh_gather_u64base_index_s64 (p0, z0, 5), ++ z0_res = svld1sh_gather_index_s64 (p0, z0, 5)) ++ ++/* ++** ld1sh_gather_31_s64_index: ++** ld1sh z0\.d, p0/z, \[z0\.d, #62\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_31_s64_index, svint64_t, svuint64_t, ++ z0_res = svld1sh_gather_u64base_index_s64 (p0, z0, 31), ++ z0_res = svld1sh_gather_index_s64 (p0, z0, 31)) ++ ++/* ++** ld1sh_gather_32_s64_index: ++** mov (x[0-9]+), #?64 ++** ld1sh z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_32_s64_index, svint64_t, svuint64_t, ++ z0_res = svld1sh_gather_u64base_index_s64 (p0, z0, 32), ++ z0_res = svld1sh_gather_index_s64 (p0, z0, 32)) ++ ++/* ++** ld1sh_gather_x0_s64_s64offset: ++** ld1sh z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_x0_s64_s64offset, svint64_t, int16_t, svint64_t, ++ z0_res = svld1sh_gather_s64offset_s64 (p0, x0, z0), ++ z0_res = svld1sh_gather_offset_s64 (p0, x0, z0)) ++ ++/* ++** ld1sh_gather_tied1_s64_s64offset: ++** ld1sh z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_tied1_s64_s64offset, svint64_t, int16_t, svint64_t, ++ z0_res = svld1sh_gather_s64offset_s64 (p0, x0, z0), ++ z0_res = svld1sh_gather_offset_s64 (p0, x0, z0)) ++ ++/* ++** ld1sh_gather_untied_s64_s64offset: ++** ld1sh z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_untied_s64_s64offset, svint64_t, int16_t, svint64_t, ++ z0_res = svld1sh_gather_s64offset_s64 (p0, x0, z1), ++ z0_res = svld1sh_gather_offset_s64 (p0, x0, z1)) ++ ++/* ++** ld1sh_gather_ext_s64_s64offset: ++** ld1sh z0\.d, p0/z, \[x0, z1\.d, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_ext_s64_s64offset, svint64_t, int16_t, svint64_t, ++ z0_res = svld1sh_gather_s64offset_s64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svld1sh_gather_offset_s64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ld1sh_gather_x0_s64_u64offset: ++** ld1sh z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_x0_s64_u64offset, svint64_t, int16_t, svuint64_t, ++ z0_res = svld1sh_gather_u64offset_s64 (p0, x0, z0), ++ z0_res = svld1sh_gather_offset_s64 (p0, x0, z0)) ++ ++/* ++** ld1sh_gather_tied1_s64_u64offset: ++** ld1sh z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_tied1_s64_u64offset, svint64_t, int16_t, svuint64_t, ++ z0_res = svld1sh_gather_u64offset_s64 (p0, x0, z0), ++ z0_res = svld1sh_gather_offset_s64 (p0, x0, z0)) ++ ++/* ++** ld1sh_gather_untied_s64_u64offset: ++** ld1sh z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_untied_s64_u64offset, svint64_t, int16_t, svuint64_t, ++ z0_res = svld1sh_gather_u64offset_s64 (p0, x0, z1), ++ z0_res = svld1sh_gather_offset_s64 (p0, x0, z1)) ++ ++/* ++** ld1sh_gather_ext_s64_u64offset: ++** ld1sh z0\.d, p0/z, \[x0, z1\.d, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_ext_s64_u64offset, svint64_t, int16_t, svuint64_t, ++ z0_res = svld1sh_gather_u64offset_s64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svld1sh_gather_offset_s64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ld1sh_gather_x0_s64_s64index: ++** ld1sh z0\.d, p0/z, \[x0, z0\.d, lsl 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_x0_s64_s64index, svint64_t, int16_t, svint64_t, ++ z0_res = svld1sh_gather_s64index_s64 (p0, x0, z0), ++ z0_res = svld1sh_gather_index_s64 (p0, x0, z0)) ++ ++/* ++** ld1sh_gather_tied1_s64_s64index: ++** ld1sh z0\.d, p0/z, \[x0, z0\.d, lsl 1\] ++** ret ++*/ 
++TEST_LOAD_GATHER_SZ (ld1sh_gather_tied1_s64_s64index, svint64_t, int16_t, svint64_t, ++ z0_res = svld1sh_gather_s64index_s64 (p0, x0, z0), ++ z0_res = svld1sh_gather_index_s64 (p0, x0, z0)) ++ ++/* ++** ld1sh_gather_untied_s64_s64index: ++** ld1sh z0\.d, p0/z, \[x0, z1\.d, lsl 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_untied_s64_s64index, svint64_t, int16_t, svint64_t, ++ z0_res = svld1sh_gather_s64index_s64 (p0, x0, z1), ++ z0_res = svld1sh_gather_index_s64 (p0, x0, z1)) ++ ++/* ++** ld1sh_gather_ext_s64_s64index: ++** ld1sh z0\.d, p0/z, \[x0, z1\.d, sxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_ext_s64_s64index, svint64_t, int16_t, svint64_t, ++ z0_res = svld1sh_gather_s64index_s64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svld1sh_gather_index_s64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ld1sh_gather_x0_s64_u64index: ++** ld1sh z0\.d, p0/z, \[x0, z0\.d, lsl 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_x0_s64_u64index, svint64_t, int16_t, svuint64_t, ++ z0_res = svld1sh_gather_u64index_s64 (p0, x0, z0), ++ z0_res = svld1sh_gather_index_s64 (p0, x0, z0)) ++ ++/* ++** ld1sh_gather_tied1_s64_u64index: ++** ld1sh z0\.d, p0/z, \[x0, z0\.d, lsl 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_tied1_s64_u64index, svint64_t, int16_t, svuint64_t, ++ z0_res = svld1sh_gather_u64index_s64 (p0, x0, z0), ++ z0_res = svld1sh_gather_index_s64 (p0, x0, z0)) ++ ++/* ++** ld1sh_gather_untied_s64_u64index: ++** ld1sh z0\.d, p0/z, \[x0, z1\.d, lsl 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_untied_s64_u64index, svint64_t, int16_t, svuint64_t, ++ z0_res = svld1sh_gather_u64index_s64 (p0, x0, z1), ++ z0_res = svld1sh_gather_index_s64 (p0, x0, z1)) ++ ++/* ++** ld1sh_gather_ext_s64_u64index: ++** ld1sh z0\.d, p0/z, \[x0, z1\.d, uxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_ext_s64_u64index, svint64_t, int16_t, svuint64_t, ++ z0_res = svld1sh_gather_u64index_s64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svld1sh_gather_index_s64 (p0, x0, svextw_x (p0, z1))) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sh_gather_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sh_gather_u32.c +new file mode 100644 +index 000000000..e3a85a23f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sh_gather_u32.c +@@ -0,0 +1,252 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1sh_gather_u32_tied1: ++** ld1sh z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_u32_tied1, svuint32_t, svuint32_t, ++ z0_res = svld1sh_gather_u32base_u32 (p0, z0), ++ z0_res = svld1sh_gather_u32 (p0, z0)) ++ ++/* ++** ld1sh_gather_u32_untied: ++** ld1sh z0\.s, p0/z, \[z1\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_u32_untied, svuint32_t, svuint32_t, ++ z0_res = svld1sh_gather_u32base_u32 (p0, z1), ++ z0_res = svld1sh_gather_u32 (p0, z1)) ++ ++/* ++** ld1sh_gather_x0_u32_offset: ++** ld1sh z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_x0_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svld1sh_gather_u32base_offset_u32 (p0, z0, x0), ++ z0_res = svld1sh_gather_offset_u32 (p0, z0, x0)) ++ ++/* ++** ld1sh_gather_m2_u32_offset: ++** mov (x[0-9]+), #?-2 ++** ld1sh z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_m2_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svld1sh_gather_u32base_offset_u32 (p0, z0, -2), ++ z0_res = svld1sh_gather_offset_u32 (p0, z0, -2)) ++ ++/* ++** ld1sh_gather_0_u32_offset: ++** ld1sh z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_0_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svld1sh_gather_u32base_offset_u32 (p0, z0, 0), ++ z0_res = svld1sh_gather_offset_u32 (p0, z0, 0)) ++ ++/* ++** ld1sh_gather_5_u32_offset: ++** mov (x[0-9]+), #?5 ++** ld1sh z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_5_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svld1sh_gather_u32base_offset_u32 (p0, z0, 5), ++ z0_res = svld1sh_gather_offset_u32 (p0, z0, 5)) ++ ++/* ++** ld1sh_gather_6_u32_offset: ++** ld1sh z0\.s, p0/z, \[z0\.s, #6\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_6_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svld1sh_gather_u32base_offset_u32 (p0, z0, 6), ++ z0_res = svld1sh_gather_offset_u32 (p0, z0, 6)) ++ ++/* ++** ld1sh_gather_62_u32_offset: ++** ld1sh z0\.s, p0/z, \[z0\.s, #62\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_62_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svld1sh_gather_u32base_offset_u32 (p0, z0, 62), ++ z0_res = svld1sh_gather_offset_u32 (p0, z0, 62)) ++ ++/* ++** ld1sh_gather_64_u32_offset: ++** mov (x[0-9]+), #?64 ++** ld1sh z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_64_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svld1sh_gather_u32base_offset_u32 (p0, z0, 64), ++ z0_res = svld1sh_gather_offset_u32 (p0, z0, 64)) ++ ++/* ++** ld1sh_gather_x0_u32_index: ++** lsl (x[0-9]+), x0, #?1 ++** ld1sh z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_x0_u32_index, svuint32_t, svuint32_t, ++ z0_res = svld1sh_gather_u32base_index_u32 (p0, z0, x0), ++ z0_res = svld1sh_gather_index_u32 (p0, z0, x0)) ++ ++/* ++** ld1sh_gather_m1_u32_index: ++** mov (x[0-9]+), #?-2 ++** ld1sh z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_m1_u32_index, svuint32_t, svuint32_t, ++ z0_res = svld1sh_gather_u32base_index_u32 (p0, z0, -1), ++ z0_res = svld1sh_gather_index_u32 (p0, z0, -1)) ++ ++/* ++** ld1sh_gather_0_u32_index: ++** ld1sh z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_0_u32_index, svuint32_t, svuint32_t, ++ z0_res = svld1sh_gather_u32base_index_u32 (p0, z0, 0), ++ z0_res = svld1sh_gather_index_u32 (p0, z0, 0)) ++ ++/* ++** ld1sh_gather_5_u32_index: ++** ld1sh z0\.s, p0/z, \[z0\.s, #10\] ++** 
ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_5_u32_index, svuint32_t, svuint32_t, ++ z0_res = svld1sh_gather_u32base_index_u32 (p0, z0, 5), ++ z0_res = svld1sh_gather_index_u32 (p0, z0, 5)) ++ ++/* ++** ld1sh_gather_31_u32_index: ++** ld1sh z0\.s, p0/z, \[z0\.s, #62\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_31_u32_index, svuint32_t, svuint32_t, ++ z0_res = svld1sh_gather_u32base_index_u32 (p0, z0, 31), ++ z0_res = svld1sh_gather_index_u32 (p0, z0, 31)) ++ ++/* ++** ld1sh_gather_32_u32_index: ++** mov (x[0-9]+), #?64 ++** ld1sh z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_32_u32_index, svuint32_t, svuint32_t, ++ z0_res = svld1sh_gather_u32base_index_u32 (p0, z0, 32), ++ z0_res = svld1sh_gather_index_u32 (p0, z0, 32)) ++ ++/* ++** ld1sh_gather_x0_u32_s32offset: ++** ld1sh z0\.s, p0/z, \[x0, z0\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_x0_u32_s32offset, svuint32_t, int16_t, svint32_t, ++ z0_res = svld1sh_gather_s32offset_u32 (p0, x0, z0), ++ z0_res = svld1sh_gather_offset_u32 (p0, x0, z0)) ++ ++/* ++** ld1sh_gather_tied1_u32_s32offset: ++** ld1sh z0\.s, p0/z, \[x0, z0\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_tied1_u32_s32offset, svuint32_t, int16_t, svint32_t, ++ z0_res = svld1sh_gather_s32offset_u32 (p0, x0, z0), ++ z0_res = svld1sh_gather_offset_u32 (p0, x0, z0)) ++ ++/* ++** ld1sh_gather_untied_u32_s32offset: ++** ld1sh z0\.s, p0/z, \[x0, z1\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_untied_u32_s32offset, svuint32_t, int16_t, svint32_t, ++ z0_res = svld1sh_gather_s32offset_u32 (p0, x0, z1), ++ z0_res = svld1sh_gather_offset_u32 (p0, x0, z1)) ++ ++/* ++** ld1sh_gather_x0_u32_u32offset: ++** ld1sh z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_x0_u32_u32offset, svuint32_t, int16_t, svuint32_t, ++ z0_res = svld1sh_gather_u32offset_u32 (p0, x0, z0), ++ z0_res = svld1sh_gather_offset_u32 (p0, x0, z0)) ++ ++/* ++** ld1sh_gather_tied1_u32_u32offset: ++** ld1sh z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_tied1_u32_u32offset, svuint32_t, int16_t, svuint32_t, ++ z0_res = svld1sh_gather_u32offset_u32 (p0, x0, z0), ++ z0_res = svld1sh_gather_offset_u32 (p0, x0, z0)) ++ ++/* ++** ld1sh_gather_untied_u32_u32offset: ++** ld1sh z0\.s, p0/z, \[x0, z1\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_untied_u32_u32offset, svuint32_t, int16_t, svuint32_t, ++ z0_res = svld1sh_gather_u32offset_u32 (p0, x0, z1), ++ z0_res = svld1sh_gather_offset_u32 (p0, x0, z1)) ++ ++/* ++** ld1sh_gather_x0_u32_s32index: ++** ld1sh z0\.s, p0/z, \[x0, z0\.s, sxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_x0_u32_s32index, svuint32_t, int16_t, svint32_t, ++ z0_res = svld1sh_gather_s32index_u32 (p0, x0, z0), ++ z0_res = svld1sh_gather_index_u32 (p0, x0, z0)) ++ ++/* ++** ld1sh_gather_tied1_u32_s32index: ++** ld1sh z0\.s, p0/z, \[x0, z0\.s, sxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_tied1_u32_s32index, svuint32_t, int16_t, svint32_t, ++ z0_res = svld1sh_gather_s32index_u32 (p0, x0, z0), ++ z0_res = svld1sh_gather_index_u32 (p0, x0, z0)) ++ ++/* ++** ld1sh_gather_untied_u32_s32index: ++** ld1sh z0\.s, p0/z, \[x0, z1\.s, sxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_untied_u32_s32index, svuint32_t, int16_t, svint32_t, ++ z0_res = svld1sh_gather_s32index_u32 (p0, x0, z1), ++ z0_res = svld1sh_gather_index_u32 (p0, x0, z1)) ++ ++/* ++** ld1sh_gather_x0_u32_u32index: ++** ld1sh z0\.s, p0/z, \[x0, z0\.s, uxtw 
1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_x0_u32_u32index, svuint32_t, int16_t, svuint32_t, ++ z0_res = svld1sh_gather_u32index_u32 (p0, x0, z0), ++ z0_res = svld1sh_gather_index_u32 (p0, x0, z0)) ++ ++/* ++** ld1sh_gather_tied1_u32_u32index: ++** ld1sh z0\.s, p0/z, \[x0, z0\.s, uxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_tied1_u32_u32index, svuint32_t, int16_t, svuint32_t, ++ z0_res = svld1sh_gather_u32index_u32 (p0, x0, z0), ++ z0_res = svld1sh_gather_index_u32 (p0, x0, z0)) ++ ++/* ++** ld1sh_gather_untied_u32_u32index: ++** ld1sh z0\.s, p0/z, \[x0, z1\.s, uxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_untied_u32_u32index, svuint32_t, int16_t, svuint32_t, ++ z0_res = svld1sh_gather_u32index_u32 (p0, x0, z1), ++ z0_res = svld1sh_gather_index_u32 (p0, x0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sh_gather_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sh_gather_u64.c +new file mode 100644 +index 000000000..3a0094fba +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sh_gather_u64.c +@@ -0,0 +1,288 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1sh_gather_u64_tied1: ++** ld1sh z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_u64_tied1, svuint64_t, svuint64_t, ++ z0_res = svld1sh_gather_u64base_u64 (p0, z0), ++ z0_res = svld1sh_gather_u64 (p0, z0)) ++ ++/* ++** ld1sh_gather_u64_untied: ++** ld1sh z0\.d, p0/z, \[z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_u64_untied, svuint64_t, svuint64_t, ++ z0_res = svld1sh_gather_u64base_u64 (p0, z1), ++ z0_res = svld1sh_gather_u64 (p0, z1)) ++ ++/* ++** ld1sh_gather_x0_u64_offset: ++** ld1sh z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_x0_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1sh_gather_u64base_offset_u64 (p0, z0, x0), ++ z0_res = svld1sh_gather_offset_u64 (p0, z0, x0)) ++ ++/* ++** ld1sh_gather_m2_u64_offset: ++** mov (x[0-9]+), #?-2 ++** ld1sh z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_m2_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1sh_gather_u64base_offset_u64 (p0, z0, -2), ++ z0_res = svld1sh_gather_offset_u64 (p0, z0, -2)) ++ ++/* ++** ld1sh_gather_0_u64_offset: ++** ld1sh z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_0_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1sh_gather_u64base_offset_u64 (p0, z0, 0), ++ z0_res = svld1sh_gather_offset_u64 (p0, z0, 0)) ++ ++/* ++** ld1sh_gather_5_u64_offset: ++** mov (x[0-9]+), #?5 ++** ld1sh z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_5_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1sh_gather_u64base_offset_u64 (p0, z0, 5), ++ z0_res = svld1sh_gather_offset_u64 (p0, z0, 5)) ++ ++/* ++** ld1sh_gather_6_u64_offset: ++** ld1sh z0\.d, p0/z, \[z0\.d, #6\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_6_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1sh_gather_u64base_offset_u64 (p0, z0, 6), ++ z0_res = svld1sh_gather_offset_u64 (p0, z0, 6)) ++ ++/* ++** ld1sh_gather_62_u64_offset: ++** ld1sh z0\.d, p0/z, \[z0\.d, #62\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_62_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1sh_gather_u64base_offset_u64 (p0, z0, 62), ++ z0_res = svld1sh_gather_offset_u64 (p0, z0, 62)) ++ ++/* ++** ld1sh_gather_64_u64_offset: ++** mov (x[0-9]+), #?64 ++** ld1sh z0\.d, p0/z, \[\1, z0\.d\] ++** 
ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_64_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1sh_gather_u64base_offset_u64 (p0, z0, 64), ++ z0_res = svld1sh_gather_offset_u64 (p0, z0, 64)) ++ ++/* ++** ld1sh_gather_x0_u64_index: ++** lsl (x[0-9]+), x0, #?1 ++** ld1sh z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_x0_u64_index, svuint64_t, svuint64_t, ++ z0_res = svld1sh_gather_u64base_index_u64 (p0, z0, x0), ++ z0_res = svld1sh_gather_index_u64 (p0, z0, x0)) ++ ++/* ++** ld1sh_gather_m1_u64_index: ++** mov (x[0-9]+), #?-2 ++** ld1sh z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_m1_u64_index, svuint64_t, svuint64_t, ++ z0_res = svld1sh_gather_u64base_index_u64 (p0, z0, -1), ++ z0_res = svld1sh_gather_index_u64 (p0, z0, -1)) ++ ++/* ++** ld1sh_gather_0_u64_index: ++** ld1sh z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_0_u64_index, svuint64_t, svuint64_t, ++ z0_res = svld1sh_gather_u64base_index_u64 (p0, z0, 0), ++ z0_res = svld1sh_gather_index_u64 (p0, z0, 0)) ++ ++/* ++** ld1sh_gather_5_u64_index: ++** ld1sh z0\.d, p0/z, \[z0\.d, #10\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_5_u64_index, svuint64_t, svuint64_t, ++ z0_res = svld1sh_gather_u64base_index_u64 (p0, z0, 5), ++ z0_res = svld1sh_gather_index_u64 (p0, z0, 5)) ++ ++/* ++** ld1sh_gather_31_u64_index: ++** ld1sh z0\.d, p0/z, \[z0\.d, #62\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_31_u64_index, svuint64_t, svuint64_t, ++ z0_res = svld1sh_gather_u64base_index_u64 (p0, z0, 31), ++ z0_res = svld1sh_gather_index_u64 (p0, z0, 31)) ++ ++/* ++** ld1sh_gather_32_u64_index: ++** mov (x[0-9]+), #?64 ++** ld1sh z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sh_gather_32_u64_index, svuint64_t, svuint64_t, ++ z0_res = svld1sh_gather_u64base_index_u64 (p0, z0, 32), ++ z0_res = svld1sh_gather_index_u64 (p0, z0, 32)) ++ ++/* ++** ld1sh_gather_x0_u64_s64offset: ++** ld1sh z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_x0_u64_s64offset, svuint64_t, int16_t, svint64_t, ++ z0_res = svld1sh_gather_s64offset_u64 (p0, x0, z0), ++ z0_res = svld1sh_gather_offset_u64 (p0, x0, z0)) ++ ++/* ++** ld1sh_gather_tied1_u64_s64offset: ++** ld1sh z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_tied1_u64_s64offset, svuint64_t, int16_t, svint64_t, ++ z0_res = svld1sh_gather_s64offset_u64 (p0, x0, z0), ++ z0_res = svld1sh_gather_offset_u64 (p0, x0, z0)) ++ ++/* ++** ld1sh_gather_untied_u64_s64offset: ++** ld1sh z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_untied_u64_s64offset, svuint64_t, int16_t, svint64_t, ++ z0_res = svld1sh_gather_s64offset_u64 (p0, x0, z1), ++ z0_res = svld1sh_gather_offset_u64 (p0, x0, z1)) ++ ++/* ++** ld1sh_gather_ext_u64_s64offset: ++** ld1sh z0\.d, p0/z, \[x0, z1\.d, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_ext_u64_s64offset, svuint64_t, int16_t, svint64_t, ++ z0_res = svld1sh_gather_s64offset_u64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svld1sh_gather_offset_u64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ld1sh_gather_x0_u64_u64offset: ++** ld1sh z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_x0_u64_u64offset, svuint64_t, int16_t, svuint64_t, ++ z0_res = svld1sh_gather_u64offset_u64 (p0, x0, z0), ++ z0_res = svld1sh_gather_offset_u64 (p0, x0, z0)) ++ ++/* ++** ld1sh_gather_tied1_u64_u64offset: ++** ld1sh z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ 
(ld1sh_gather_tied1_u64_u64offset, svuint64_t, int16_t, svuint64_t, ++ z0_res = svld1sh_gather_u64offset_u64 (p0, x0, z0), ++ z0_res = svld1sh_gather_offset_u64 (p0, x0, z0)) ++ ++/* ++** ld1sh_gather_untied_u64_u64offset: ++** ld1sh z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_untied_u64_u64offset, svuint64_t, int16_t, svuint64_t, ++ z0_res = svld1sh_gather_u64offset_u64 (p0, x0, z1), ++ z0_res = svld1sh_gather_offset_u64 (p0, x0, z1)) ++ ++/* ++** ld1sh_gather_ext_u64_u64offset: ++** ld1sh z0\.d, p0/z, \[x0, z1\.d, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_ext_u64_u64offset, svuint64_t, int16_t, svuint64_t, ++ z0_res = svld1sh_gather_u64offset_u64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svld1sh_gather_offset_u64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ld1sh_gather_x0_u64_s64index: ++** ld1sh z0\.d, p0/z, \[x0, z0\.d, lsl 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_x0_u64_s64index, svuint64_t, int16_t, svint64_t, ++ z0_res = svld1sh_gather_s64index_u64 (p0, x0, z0), ++ z0_res = svld1sh_gather_index_u64 (p0, x0, z0)) ++ ++/* ++** ld1sh_gather_tied1_u64_s64index: ++** ld1sh z0\.d, p0/z, \[x0, z0\.d, lsl 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_tied1_u64_s64index, svuint64_t, int16_t, svint64_t, ++ z0_res = svld1sh_gather_s64index_u64 (p0, x0, z0), ++ z0_res = svld1sh_gather_index_u64 (p0, x0, z0)) ++ ++/* ++** ld1sh_gather_untied_u64_s64index: ++** ld1sh z0\.d, p0/z, \[x0, z1\.d, lsl 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_untied_u64_s64index, svuint64_t, int16_t, svint64_t, ++ z0_res = svld1sh_gather_s64index_u64 (p0, x0, z1), ++ z0_res = svld1sh_gather_index_u64 (p0, x0, z1)) ++ ++/* ++** ld1sh_gather_ext_u64_s64index: ++** ld1sh z0\.d, p0/z, \[x0, z1\.d, sxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_ext_u64_s64index, svuint64_t, int16_t, svint64_t, ++ z0_res = svld1sh_gather_s64index_u64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svld1sh_gather_index_u64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ld1sh_gather_x0_u64_u64index: ++** ld1sh z0\.d, p0/z, \[x0, z0\.d, lsl 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_x0_u64_u64index, svuint64_t, int16_t, svuint64_t, ++ z0_res = svld1sh_gather_u64index_u64 (p0, x0, z0), ++ z0_res = svld1sh_gather_index_u64 (p0, x0, z0)) ++ ++/* ++** ld1sh_gather_tied1_u64_u64index: ++** ld1sh z0\.d, p0/z, \[x0, z0\.d, lsl 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_tied1_u64_u64index, svuint64_t, int16_t, svuint64_t, ++ z0_res = svld1sh_gather_u64index_u64 (p0, x0, z0), ++ z0_res = svld1sh_gather_index_u64 (p0, x0, z0)) ++ ++/* ++** ld1sh_gather_untied_u64_u64index: ++** ld1sh z0\.d, p0/z, \[x0, z1\.d, lsl 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_untied_u64_u64index, svuint64_t, int16_t, svuint64_t, ++ z0_res = svld1sh_gather_u64index_u64 (p0, x0, z1), ++ z0_res = svld1sh_gather_index_u64 (p0, x0, z1)) ++ ++/* ++** ld1sh_gather_ext_u64_u64index: ++** ld1sh z0\.d, p0/z, \[x0, z1\.d, uxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sh_gather_ext_u64_u64index, svuint64_t, int16_t, svuint64_t, ++ z0_res = svld1sh_gather_u64index_u64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svld1sh_gather_index_u64 (p0, x0, svextw_x (p0, z1))) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sh_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sh_s32.c +new file mode 100644 +index 000000000..8614f52c5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sh_s32.c +@@ -0,0 +1,158 @@ ++/* { dg-final { 
check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1sh_s32_base: ++** ld1sh z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sh_s32_base, svint32_t, int16_t, ++ z0 = svld1sh_s32 (p0, x0), ++ z0 = svld1sh_s32 (p0, x0)) ++ ++/* ++** ld1sh_s32_index: ++** ld1sh z0\.s, p0/z, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_LOAD (ld1sh_s32_index, svint32_t, int16_t, ++ z0 = svld1sh_s32 (p0, x0 + x1), ++ z0 = svld1sh_s32 (p0, x0 + x1)) ++ ++/* ++** ld1sh_s32_1: ++** ld1sh z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sh_s32_1, svint32_t, int16_t, ++ z0 = svld1sh_s32 (p0, x0 + svcntw ()), ++ z0 = svld1sh_s32 (p0, x0 + svcntw ())) ++ ++/* ++** ld1sh_s32_7: ++** ld1sh z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sh_s32_7, svint32_t, int16_t, ++ z0 = svld1sh_s32 (p0, x0 + svcntw () * 7), ++ z0 = svld1sh_s32 (p0, x0 + svcntw () * 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1sh_s32_8: ++** incb x0, all, mul #4 ++** ld1sh z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sh_s32_8, svint32_t, int16_t, ++ z0 = svld1sh_s32 (p0, x0 + svcntw () * 8), ++ z0 = svld1sh_s32 (p0, x0 + svcntw () * 8)) ++ ++/* ++** ld1sh_s32_m1: ++** ld1sh z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sh_s32_m1, svint32_t, int16_t, ++ z0 = svld1sh_s32 (p0, x0 - svcntw ()), ++ z0 = svld1sh_s32 (p0, x0 - svcntw ())) ++ ++/* ++** ld1sh_s32_m8: ++** ld1sh z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sh_s32_m8, svint32_t, int16_t, ++ z0 = svld1sh_s32 (p0, x0 - svcntw () * 8), ++ z0 = svld1sh_s32 (p0, x0 - svcntw () * 8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1sh_s32_m9: ++** dech x0, all, mul #9 ++** ld1sh z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sh_s32_m9, svint32_t, int16_t, ++ z0 = svld1sh_s32 (p0, x0 - svcntw () * 9), ++ z0 = svld1sh_s32 (p0, x0 - svcntw () * 9)) ++ ++/* ++** ld1sh_vnum_s32_0: ++** ld1sh z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sh_vnum_s32_0, svint32_t, int16_t, ++ z0 = svld1sh_vnum_s32 (p0, x0, 0), ++ z0 = svld1sh_vnum_s32 (p0, x0, 0)) ++ ++/* ++** ld1sh_vnum_s32_1: ++** ld1sh z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sh_vnum_s32_1, svint32_t, int16_t, ++ z0 = svld1sh_vnum_s32 (p0, x0, 1), ++ z0 = svld1sh_vnum_s32 (p0, x0, 1)) ++ ++/* ++** ld1sh_vnum_s32_7: ++** ld1sh z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sh_vnum_s32_7, svint32_t, int16_t, ++ z0 = svld1sh_vnum_s32 (p0, x0, 7), ++ z0 = svld1sh_vnum_s32 (p0, x0, 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1sh_vnum_s32_8: ++** incb x0, all, mul #4 ++** ld1sh z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sh_vnum_s32_8, svint32_t, int16_t, ++ z0 = svld1sh_vnum_s32 (p0, x0, 8), ++ z0 = svld1sh_vnum_s32 (p0, x0, 8)) ++ ++/* ++** ld1sh_vnum_s32_m1: ++** ld1sh z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sh_vnum_s32_m1, svint32_t, int16_t, ++ z0 = svld1sh_vnum_s32 (p0, x0, -1), ++ z0 = svld1sh_vnum_s32 (p0, x0, -1)) ++ ++/* ++** ld1sh_vnum_s32_m8: ++** ld1sh z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sh_vnum_s32_m8, svint32_t, int16_t, ++ z0 = svld1sh_vnum_s32 (p0, x0, -8), ++ z0 = svld1sh_vnum_s32 (p0, x0, -8)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld1sh_vnum_s32_m9: ++** dech x0, all, mul #9 ++** ld1sh z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sh_vnum_s32_m9, svint32_t, int16_t, ++ z0 = svld1sh_vnum_s32 (p0, x0, -9), ++ z0 = svld1sh_vnum_s32 (p0, x0, -9)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ld1sh_vnum_s32_x1: ++** cnth (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld1sh z0\.s, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld1sh_vnum_s32_x1, svint32_t, int16_t, ++ z0 = svld1sh_vnum_s32 (p0, x0, x1), ++ z0 = svld1sh_vnum_s32 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sh_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sh_s64.c +new file mode 100644 +index 000000000..c02b40a76 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sh_s64.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1sh_s64_base: ++** ld1sh z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sh_s64_base, svint64_t, int16_t, ++ z0 = svld1sh_s64 (p0, x0), ++ z0 = svld1sh_s64 (p0, x0)) ++ ++/* ++** ld1sh_s64_index: ++** ld1sh z0\.d, p0/z, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_LOAD (ld1sh_s64_index, svint64_t, int16_t, ++ z0 = svld1sh_s64 (p0, x0 + x1), ++ z0 = svld1sh_s64 (p0, x0 + x1)) ++ ++/* ++** ld1sh_s64_1: ++** ld1sh z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sh_s64_1, svint64_t, int16_t, ++ z0 = svld1sh_s64 (p0, x0 + svcntd ()), ++ z0 = svld1sh_s64 (p0, x0 + svcntd ())) ++ ++/* ++** ld1sh_s64_7: ++** ld1sh z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sh_s64_7, svint64_t, int16_t, ++ z0 = svld1sh_s64 (p0, x0 + svcntd () * 7), ++ z0 = svld1sh_s64 (p0, x0 + svcntd () * 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1sh_s64_8: ++** incb x0, all, mul #2 ++** ld1sh z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sh_s64_8, svint64_t, int16_t, ++ z0 = svld1sh_s64 (p0, x0 + svcntd () * 8), ++ z0 = svld1sh_s64 (p0, x0 + svcntd () * 8)) ++ ++/* ++** ld1sh_s64_m1: ++** ld1sh z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sh_s64_m1, svint64_t, int16_t, ++ z0 = svld1sh_s64 (p0, x0 - svcntd ()), ++ z0 = svld1sh_s64 (p0, x0 - svcntd ())) ++ ++/* ++** ld1sh_s64_m8: ++** ld1sh z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sh_s64_m8, svint64_t, int16_t, ++ z0 = svld1sh_s64 (p0, x0 - svcntd () * 8), ++ z0 = svld1sh_s64 (p0, x0 - svcntd () * 8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1sh_s64_m9: ++** decw x0, all, mul #9 ++** ld1sh z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sh_s64_m9, svint64_t, int16_t, ++ z0 = svld1sh_s64 (p0, x0 - svcntd () * 9), ++ z0 = svld1sh_s64 (p0, x0 - svcntd () * 9)) ++ ++/* ++** ld1sh_vnum_s64_0: ++** ld1sh z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sh_vnum_s64_0, svint64_t, int16_t, ++ z0 = svld1sh_vnum_s64 (p0, x0, 0), ++ z0 = svld1sh_vnum_s64 (p0, x0, 0)) ++ ++/* ++** ld1sh_vnum_s64_1: ++** ld1sh z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sh_vnum_s64_1, svint64_t, int16_t, ++ z0 = svld1sh_vnum_s64 (p0, x0, 1), ++ z0 = svld1sh_vnum_s64 (p0, x0, 1)) ++ ++/* ++** ld1sh_vnum_s64_7: ++** ld1sh z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sh_vnum_s64_7, svint64_t, int16_t, ++ z0 = svld1sh_vnum_s64 (p0, x0, 7), ++ z0 = svld1sh_vnum_s64 (p0, x0, 7)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld1sh_vnum_s64_8: ++** incb x0, all, mul #2 ++** ld1sh z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sh_vnum_s64_8, svint64_t, int16_t, ++ z0 = svld1sh_vnum_s64 (p0, x0, 8), ++ z0 = svld1sh_vnum_s64 (p0, x0, 8)) ++ ++/* ++** ld1sh_vnum_s64_m1: ++** ld1sh z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sh_vnum_s64_m1, svint64_t, int16_t, ++ z0 = svld1sh_vnum_s64 (p0, x0, -1), ++ z0 = svld1sh_vnum_s64 (p0, x0, -1)) ++ ++/* ++** ld1sh_vnum_s64_m8: ++** ld1sh z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sh_vnum_s64_m8, svint64_t, int16_t, ++ z0 = svld1sh_vnum_s64 (p0, x0, -8), ++ z0 = svld1sh_vnum_s64 (p0, x0, -8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1sh_vnum_s64_m9: ++** decw x0, all, mul #9 ++** ld1sh z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sh_vnum_s64_m9, svint64_t, int16_t, ++ z0 = svld1sh_vnum_s64 (p0, x0, -9), ++ z0 = svld1sh_vnum_s64 (p0, x0, -9)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ld1sh_vnum_s64_x1: ++** cntw (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld1sh z0\.d, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld1sh_vnum_s64_x1, svint64_t, int16_t, ++ z0 = svld1sh_vnum_s64 (p0, x0, x1), ++ z0 = svld1sh_vnum_s64 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sh_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sh_u32.c +new file mode 100644 +index 000000000..ead96174a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sh_u32.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1sh_u32_base: ++** ld1sh z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sh_u32_base, svuint32_t, int16_t, ++ z0 = svld1sh_u32 (p0, x0), ++ z0 = svld1sh_u32 (p0, x0)) ++ ++/* ++** ld1sh_u32_index: ++** ld1sh z0\.s, p0/z, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_LOAD (ld1sh_u32_index, svuint32_t, int16_t, ++ z0 = svld1sh_u32 (p0, x0 + x1), ++ z0 = svld1sh_u32 (p0, x0 + x1)) ++ ++/* ++** ld1sh_u32_1: ++** ld1sh z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sh_u32_1, svuint32_t, int16_t, ++ z0 = svld1sh_u32 (p0, x0 + svcntw ()), ++ z0 = svld1sh_u32 (p0, x0 + svcntw ())) ++ ++/* ++** ld1sh_u32_7: ++** ld1sh z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sh_u32_7, svuint32_t, int16_t, ++ z0 = svld1sh_u32 (p0, x0 + svcntw () * 7), ++ z0 = svld1sh_u32 (p0, x0 + svcntw () * 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1sh_u32_8: ++** incb x0, all, mul #4 ++** ld1sh z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sh_u32_8, svuint32_t, int16_t, ++ z0 = svld1sh_u32 (p0, x0 + svcntw () * 8), ++ z0 = svld1sh_u32 (p0, x0 + svcntw () * 8)) ++ ++/* ++** ld1sh_u32_m1: ++** ld1sh z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sh_u32_m1, svuint32_t, int16_t, ++ z0 = svld1sh_u32 (p0, x0 - svcntw ()), ++ z0 = svld1sh_u32 (p0, x0 - svcntw ())) ++ ++/* ++** ld1sh_u32_m8: ++** ld1sh z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sh_u32_m8, svuint32_t, int16_t, ++ z0 = svld1sh_u32 (p0, x0 - svcntw () * 8), ++ z0 = svld1sh_u32 (p0, x0 - svcntw () * 8)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld1sh_u32_m9: ++** dech x0, all, mul #9 ++** ld1sh z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sh_u32_m9, svuint32_t, int16_t, ++ z0 = svld1sh_u32 (p0, x0 - svcntw () * 9), ++ z0 = svld1sh_u32 (p0, x0 - svcntw () * 9)) ++ ++/* ++** ld1sh_vnum_u32_0: ++** ld1sh z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sh_vnum_u32_0, svuint32_t, int16_t, ++ z0 = svld1sh_vnum_u32 (p0, x0, 0), ++ z0 = svld1sh_vnum_u32 (p0, x0, 0)) ++ ++/* ++** ld1sh_vnum_u32_1: ++** ld1sh z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sh_vnum_u32_1, svuint32_t, int16_t, ++ z0 = svld1sh_vnum_u32 (p0, x0, 1), ++ z0 = svld1sh_vnum_u32 (p0, x0, 1)) ++ ++/* ++** ld1sh_vnum_u32_7: ++** ld1sh z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sh_vnum_u32_7, svuint32_t, int16_t, ++ z0 = svld1sh_vnum_u32 (p0, x0, 7), ++ z0 = svld1sh_vnum_u32 (p0, x0, 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1sh_vnum_u32_8: ++** incb x0, all, mul #4 ++** ld1sh z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sh_vnum_u32_8, svuint32_t, int16_t, ++ z0 = svld1sh_vnum_u32 (p0, x0, 8), ++ z0 = svld1sh_vnum_u32 (p0, x0, 8)) ++ ++/* ++** ld1sh_vnum_u32_m1: ++** ld1sh z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sh_vnum_u32_m1, svuint32_t, int16_t, ++ z0 = svld1sh_vnum_u32 (p0, x0, -1), ++ z0 = svld1sh_vnum_u32 (p0, x0, -1)) ++ ++/* ++** ld1sh_vnum_u32_m8: ++** ld1sh z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sh_vnum_u32_m8, svuint32_t, int16_t, ++ z0 = svld1sh_vnum_u32 (p0, x0, -8), ++ z0 = svld1sh_vnum_u32 (p0, x0, -8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1sh_vnum_u32_m9: ++** dech x0, all, mul #9 ++** ld1sh z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sh_vnum_u32_m9, svuint32_t, int16_t, ++ z0 = svld1sh_vnum_u32 (p0, x0, -9), ++ z0 = svld1sh_vnum_u32 (p0, x0, -9)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ld1sh_vnum_u32_x1: ++** cnth (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld1sh z0\.s, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld1sh_vnum_u32_x1, svuint32_t, int16_t, ++ z0 = svld1sh_vnum_u32 (p0, x0, x1), ++ z0 = svld1sh_vnum_u32 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sh_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sh_u64.c +new file mode 100644 +index 000000000..e407a08a6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sh_u64.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1sh_u64_base: ++** ld1sh z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sh_u64_base, svuint64_t, int16_t, ++ z0 = svld1sh_u64 (p0, x0), ++ z0 = svld1sh_u64 (p0, x0)) ++ ++/* ++** ld1sh_u64_index: ++** ld1sh z0\.d, p0/z, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_LOAD (ld1sh_u64_index, svuint64_t, int16_t, ++ z0 = svld1sh_u64 (p0, x0 + x1), ++ z0 = svld1sh_u64 (p0, x0 + x1)) ++ ++/* ++** ld1sh_u64_1: ++** ld1sh z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sh_u64_1, svuint64_t, int16_t, ++ z0 = svld1sh_u64 (p0, x0 + svcntd ()), ++ z0 = svld1sh_u64 (p0, x0 + svcntd ())) ++ ++/* ++** ld1sh_u64_7: ++** ld1sh z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sh_u64_7, svuint64_t, int16_t, ++ z0 = svld1sh_u64 (p0, x0 + svcntd () * 7), ++ z0 = svld1sh_u64 (p0, x0 + svcntd () * 7)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld1sh_u64_8: ++** incb x0, all, mul #2 ++** ld1sh z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sh_u64_8, svuint64_t, int16_t, ++ z0 = svld1sh_u64 (p0, x0 + svcntd () * 8), ++ z0 = svld1sh_u64 (p0, x0 + svcntd () * 8)) ++ ++/* ++** ld1sh_u64_m1: ++** ld1sh z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sh_u64_m1, svuint64_t, int16_t, ++ z0 = svld1sh_u64 (p0, x0 - svcntd ()), ++ z0 = svld1sh_u64 (p0, x0 - svcntd ())) ++ ++/* ++** ld1sh_u64_m8: ++** ld1sh z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sh_u64_m8, svuint64_t, int16_t, ++ z0 = svld1sh_u64 (p0, x0 - svcntd () * 8), ++ z0 = svld1sh_u64 (p0, x0 - svcntd () * 8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1sh_u64_m9: ++** decw x0, all, mul #9 ++** ld1sh z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sh_u64_m9, svuint64_t, int16_t, ++ z0 = svld1sh_u64 (p0, x0 - svcntd () * 9), ++ z0 = svld1sh_u64 (p0, x0 - svcntd () * 9)) ++ ++/* ++** ld1sh_vnum_u64_0: ++** ld1sh z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sh_vnum_u64_0, svuint64_t, int16_t, ++ z0 = svld1sh_vnum_u64 (p0, x0, 0), ++ z0 = svld1sh_vnum_u64 (p0, x0, 0)) ++ ++/* ++** ld1sh_vnum_u64_1: ++** ld1sh z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sh_vnum_u64_1, svuint64_t, int16_t, ++ z0 = svld1sh_vnum_u64 (p0, x0, 1), ++ z0 = svld1sh_vnum_u64 (p0, x0, 1)) ++ ++/* ++** ld1sh_vnum_u64_7: ++** ld1sh z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sh_vnum_u64_7, svuint64_t, int16_t, ++ z0 = svld1sh_vnum_u64 (p0, x0, 7), ++ z0 = svld1sh_vnum_u64 (p0, x0, 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1sh_vnum_u64_8: ++** incb x0, all, mul #2 ++** ld1sh z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sh_vnum_u64_8, svuint64_t, int16_t, ++ z0 = svld1sh_vnum_u64 (p0, x0, 8), ++ z0 = svld1sh_vnum_u64 (p0, x0, 8)) ++ ++/* ++** ld1sh_vnum_u64_m1: ++** ld1sh z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sh_vnum_u64_m1, svuint64_t, int16_t, ++ z0 = svld1sh_vnum_u64 (p0, x0, -1), ++ z0 = svld1sh_vnum_u64 (p0, x0, -1)) ++ ++/* ++** ld1sh_vnum_u64_m8: ++** ld1sh z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sh_vnum_u64_m8, svuint64_t, int16_t, ++ z0 = svld1sh_vnum_u64 (p0, x0, -8), ++ z0 = svld1sh_vnum_u64 (p0, x0, -8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1sh_vnum_u64_m9: ++** decw x0, all, mul #9 ++** ld1sh z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sh_vnum_u64_m9, svuint64_t, int16_t, ++ z0 = svld1sh_vnum_u64 (p0, x0, -9), ++ z0 = svld1sh_vnum_u64 (p0, x0, -9)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ld1sh_vnum_u64_x1: ++** cntw (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld1sh z0\.d, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld1sh_vnum_u64_x1, svuint64_t, int16_t, ++ z0 = svld1sh_vnum_u64 (p0, x0, x1), ++ z0 = svld1sh_vnum_u64 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sw_gather_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sw_gather_s64.c +new file mode 100644 +index 000000000..4d076b486 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sw_gather_s64.c +@@ -0,0 +1,308 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1sw_gather_s64_tied1: ++** ld1sw z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sw_gather_s64_tied1, svint64_t, svuint64_t, ++ z0_res = svld1sw_gather_u64base_s64 (p0, z0), ++ z0_res = svld1sw_gather_s64 (p0, z0)) ++ ++/* ++** ld1sw_gather_s64_untied: ++** ld1sw z0\.d, p0/z, \[z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sw_gather_s64_untied, svint64_t, svuint64_t, ++ z0_res = svld1sw_gather_u64base_s64 (p0, z1), ++ z0_res = svld1sw_gather_s64 (p0, z1)) ++ ++/* ++** ld1sw_gather_x0_s64_offset: ++** ld1sw z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sw_gather_x0_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1sw_gather_u64base_offset_s64 (p0, z0, x0), ++ z0_res = svld1sw_gather_offset_s64 (p0, z0, x0)) ++ ++/* ++** ld1sw_gather_m4_s64_offset: ++** mov (x[0-9]+), #?-4 ++** ld1sw z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sw_gather_m4_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1sw_gather_u64base_offset_s64 (p0, z0, -4), ++ z0_res = svld1sw_gather_offset_s64 (p0, z0, -4)) ++ ++/* ++** ld1sw_gather_0_s64_offset: ++** ld1sw z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sw_gather_0_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1sw_gather_u64base_offset_s64 (p0, z0, 0), ++ z0_res = svld1sw_gather_offset_s64 (p0, z0, 0)) ++ ++/* ++** ld1sw_gather_5_s64_offset: ++** mov (x[0-9]+), #?5 ++** ld1sw z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sw_gather_5_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1sw_gather_u64base_offset_s64 (p0, z0, 5), ++ z0_res = svld1sw_gather_offset_s64 (p0, z0, 5)) ++ ++/* ++** ld1sw_gather_6_s64_offset: ++** mov (x[0-9]+), #?6 ++** ld1sw z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sw_gather_6_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1sw_gather_u64base_offset_s64 (p0, z0, 6), ++ z0_res = svld1sw_gather_offset_s64 (p0, z0, 6)) ++ ++/* ++** ld1sw_gather_7_s64_offset: ++** mov (x[0-9]+), #?7 ++** ld1sw z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sw_gather_7_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1sw_gather_u64base_offset_s64 (p0, z0, 7), ++ z0_res = svld1sw_gather_offset_s64 (p0, z0, 7)) ++ ++/* ++** ld1sw_gather_8_s64_offset: ++** ld1sw z0\.d, p0/z, \[z0\.d, #8\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sw_gather_8_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1sw_gather_u64base_offset_s64 (p0, z0, 8), ++ z0_res = svld1sw_gather_offset_s64 (p0, z0, 8)) ++ ++/* ++** ld1sw_gather_124_s64_offset: ++** ld1sw z0\.d, p0/z, \[z0\.d, #124\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sw_gather_124_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1sw_gather_u64base_offset_s64 (p0, z0, 124), ++ z0_res = svld1sw_gather_offset_s64 (p0, z0, 124)) ++ ++/* ++** ld1sw_gather_128_s64_offset: ++** mov (x[0-9]+), #?128 ++** ld1sw z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sw_gather_128_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1sw_gather_u64base_offset_s64 (p0, z0, 128), ++ z0_res = svld1sw_gather_offset_s64 (p0, z0, 128)) ++ ++/* ++** ld1sw_gather_x0_s64_index: ++** lsl (x[0-9]+), x0, #?2 ++** ld1sw z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sw_gather_x0_s64_index, svint64_t, svuint64_t, ++ z0_res = svld1sw_gather_u64base_index_s64 (p0, z0, x0), ++ z0_res = svld1sw_gather_index_s64 (p0, z0, x0)) ++ ++/* ++** ld1sw_gather_m1_s64_index: ++** mov (x[0-9]+), #?-4 ++** ld1sw z0\.d, p0/z, 
\[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sw_gather_m1_s64_index, svint64_t, svuint64_t, ++ z0_res = svld1sw_gather_u64base_index_s64 (p0, z0, -1), ++ z0_res = svld1sw_gather_index_s64 (p0, z0, -1)) ++ ++/* ++** ld1sw_gather_0_s64_index: ++** ld1sw z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sw_gather_0_s64_index, svint64_t, svuint64_t, ++ z0_res = svld1sw_gather_u64base_index_s64 (p0, z0, 0), ++ z0_res = svld1sw_gather_index_s64 (p0, z0, 0)) ++ ++/* ++** ld1sw_gather_5_s64_index: ++** ld1sw z0\.d, p0/z, \[z0\.d, #20\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sw_gather_5_s64_index, svint64_t, svuint64_t, ++ z0_res = svld1sw_gather_u64base_index_s64 (p0, z0, 5), ++ z0_res = svld1sw_gather_index_s64 (p0, z0, 5)) ++ ++/* ++** ld1sw_gather_31_s64_index: ++** ld1sw z0\.d, p0/z, \[z0\.d, #124\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sw_gather_31_s64_index, svint64_t, svuint64_t, ++ z0_res = svld1sw_gather_u64base_index_s64 (p0, z0, 31), ++ z0_res = svld1sw_gather_index_s64 (p0, z0, 31)) ++ ++/* ++** ld1sw_gather_32_s64_index: ++** mov (x[0-9]+), #?128 ++** ld1sw z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sw_gather_32_s64_index, svint64_t, svuint64_t, ++ z0_res = svld1sw_gather_u64base_index_s64 (p0, z0, 32), ++ z0_res = svld1sw_gather_index_s64 (p0, z0, 32)) ++ ++/* ++** ld1sw_gather_x0_s64_s64offset: ++** ld1sw z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sw_gather_x0_s64_s64offset, svint64_t, int32_t, svint64_t, ++ z0_res = svld1sw_gather_s64offset_s64 (p0, x0, z0), ++ z0_res = svld1sw_gather_offset_s64 (p0, x0, z0)) ++ ++/* ++** ld1sw_gather_tied1_s64_s64offset: ++** ld1sw z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sw_gather_tied1_s64_s64offset, svint64_t, int32_t, svint64_t, ++ z0_res = svld1sw_gather_s64offset_s64 (p0, x0, z0), ++ z0_res = svld1sw_gather_offset_s64 (p0, x0, z0)) ++ ++/* ++** ld1sw_gather_untied_s64_s64offset: ++** ld1sw z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sw_gather_untied_s64_s64offset, svint64_t, int32_t, svint64_t, ++ z0_res = svld1sw_gather_s64offset_s64 (p0, x0, z1), ++ z0_res = svld1sw_gather_offset_s64 (p0, x0, z1)) ++ ++/* ++** ld1sw_gather_ext_s64_s64offset: ++** ld1sw z0\.d, p0/z, \[x0, z1\.d, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sw_gather_ext_s64_s64offset, svint64_t, int32_t, svint64_t, ++ z0_res = svld1sw_gather_s64offset_s64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svld1sw_gather_offset_s64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ld1sw_gather_x0_s64_u64offset: ++** ld1sw z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sw_gather_x0_s64_u64offset, svint64_t, int32_t, svuint64_t, ++ z0_res = svld1sw_gather_u64offset_s64 (p0, x0, z0), ++ z0_res = svld1sw_gather_offset_s64 (p0, x0, z0)) ++ ++/* ++** ld1sw_gather_tied1_s64_u64offset: ++** ld1sw z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sw_gather_tied1_s64_u64offset, svint64_t, int32_t, svuint64_t, ++ z0_res = svld1sw_gather_u64offset_s64 (p0, x0, z0), ++ z0_res = svld1sw_gather_offset_s64 (p0, x0, z0)) ++ ++/* ++** ld1sw_gather_untied_s64_u64offset: ++** ld1sw z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sw_gather_untied_s64_u64offset, svint64_t, int32_t, svuint64_t, ++ z0_res = svld1sw_gather_u64offset_s64 (p0, x0, z1), ++ z0_res = svld1sw_gather_offset_s64 (p0, x0, z1)) ++ ++/* ++** ld1sw_gather_ext_s64_u64offset: ++** ld1sw z0\.d, p0/z, \[x0, z1\.d, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ 
(ld1sw_gather_ext_s64_u64offset, svint64_t, int32_t, svuint64_t, ++ z0_res = svld1sw_gather_u64offset_s64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svld1sw_gather_offset_s64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ld1sw_gather_x0_s64_s64index: ++** ld1sw z0\.d, p0/z, \[x0, z0\.d, lsl 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sw_gather_x0_s64_s64index, svint64_t, int32_t, svint64_t, ++ z0_res = svld1sw_gather_s64index_s64 (p0, x0, z0), ++ z0_res = svld1sw_gather_index_s64 (p0, x0, z0)) ++ ++/* ++** ld1sw_gather_tied1_s64_s64index: ++** ld1sw z0\.d, p0/z, \[x0, z0\.d, lsl 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sw_gather_tied1_s64_s64index, svint64_t, int32_t, svint64_t, ++ z0_res = svld1sw_gather_s64index_s64 (p0, x0, z0), ++ z0_res = svld1sw_gather_index_s64 (p0, x0, z0)) ++ ++/* ++** ld1sw_gather_untied_s64_s64index: ++** ld1sw z0\.d, p0/z, \[x0, z1\.d, lsl 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sw_gather_untied_s64_s64index, svint64_t, int32_t, svint64_t, ++ z0_res = svld1sw_gather_s64index_s64 (p0, x0, z1), ++ z0_res = svld1sw_gather_index_s64 (p0, x0, z1)) ++ ++/* ++** ld1sw_gather_ext_s64_s64index: ++** ld1sw z0\.d, p0/z, \[x0, z1\.d, sxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sw_gather_ext_s64_s64index, svint64_t, int32_t, svint64_t, ++ z0_res = svld1sw_gather_s64index_s64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svld1sw_gather_index_s64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ld1sw_gather_x0_s64_u64index: ++** ld1sw z0\.d, p0/z, \[x0, z0\.d, lsl 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sw_gather_x0_s64_u64index, svint64_t, int32_t, svuint64_t, ++ z0_res = svld1sw_gather_u64index_s64 (p0, x0, z0), ++ z0_res = svld1sw_gather_index_s64 (p0, x0, z0)) ++ ++/* ++** ld1sw_gather_tied1_s64_u64index: ++** ld1sw z0\.d, p0/z, \[x0, z0\.d, lsl 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sw_gather_tied1_s64_u64index, svint64_t, int32_t, svuint64_t, ++ z0_res = svld1sw_gather_u64index_s64 (p0, x0, z0), ++ z0_res = svld1sw_gather_index_s64 (p0, x0, z0)) ++ ++/* ++** ld1sw_gather_untied_s64_u64index: ++** ld1sw z0\.d, p0/z, \[x0, z1\.d, lsl 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sw_gather_untied_s64_u64index, svint64_t, int32_t, svuint64_t, ++ z0_res = svld1sw_gather_u64index_s64 (p0, x0, z1), ++ z0_res = svld1sw_gather_index_s64 (p0, x0, z1)) ++ ++/* ++** ld1sw_gather_ext_s64_u64index: ++** ld1sw z0\.d, p0/z, \[x0, z1\.d, uxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sw_gather_ext_s64_u64index, svint64_t, int32_t, svuint64_t, ++ z0_res = svld1sw_gather_u64index_s64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svld1sw_gather_index_s64 (p0, x0, svextw_x (p0, z1))) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sw_gather_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sw_gather_u64.c +new file mode 100644 +index 000000000..ffa85eb3e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sw_gather_u64.c +@@ -0,0 +1,308 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1sw_gather_u64_tied1: ++** ld1sw z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sw_gather_u64_tied1, svuint64_t, svuint64_t, ++ z0_res = svld1sw_gather_u64base_u64 (p0, z0), ++ z0_res = svld1sw_gather_u64 (p0, z0)) ++ ++/* ++** ld1sw_gather_u64_untied: ++** ld1sw z0\.d, p0/z, \[z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sw_gather_u64_untied, svuint64_t, svuint64_t, ++ z0_res = svld1sw_gather_u64base_u64 (p0, z1), ++ z0_res = svld1sw_gather_u64 (p0, z1)) ++ ++/* ++** ld1sw_gather_x0_u64_offset: ++** ld1sw z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sw_gather_x0_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1sw_gather_u64base_offset_u64 (p0, z0, x0), ++ z0_res = svld1sw_gather_offset_u64 (p0, z0, x0)) ++ ++/* ++** ld1sw_gather_m4_u64_offset: ++** mov (x[0-9]+), #?-4 ++** ld1sw z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sw_gather_m4_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1sw_gather_u64base_offset_u64 (p0, z0, -4), ++ z0_res = svld1sw_gather_offset_u64 (p0, z0, -4)) ++ ++/* ++** ld1sw_gather_0_u64_offset: ++** ld1sw z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sw_gather_0_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1sw_gather_u64base_offset_u64 (p0, z0, 0), ++ z0_res = svld1sw_gather_offset_u64 (p0, z0, 0)) ++ ++/* ++** ld1sw_gather_5_u64_offset: ++** mov (x[0-9]+), #?5 ++** ld1sw z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sw_gather_5_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1sw_gather_u64base_offset_u64 (p0, z0, 5), ++ z0_res = svld1sw_gather_offset_u64 (p0, z0, 5)) ++ ++/* ++** ld1sw_gather_6_u64_offset: ++** mov (x[0-9]+), #?6 ++** ld1sw z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sw_gather_6_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1sw_gather_u64base_offset_u64 (p0, z0, 6), ++ z0_res = svld1sw_gather_offset_u64 (p0, z0, 6)) ++ ++/* ++** ld1sw_gather_7_u64_offset: ++** mov (x[0-9]+), #?7 ++** ld1sw z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sw_gather_7_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1sw_gather_u64base_offset_u64 (p0, z0, 7), ++ z0_res = svld1sw_gather_offset_u64 (p0, z0, 7)) ++ ++/* ++** ld1sw_gather_8_u64_offset: ++** ld1sw z0\.d, p0/z, \[z0\.d, #8\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sw_gather_8_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1sw_gather_u64base_offset_u64 (p0, z0, 8), ++ z0_res = svld1sw_gather_offset_u64 (p0, z0, 8)) ++ ++/* ++** ld1sw_gather_124_u64_offset: ++** ld1sw z0\.d, p0/z, \[z0\.d, #124\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sw_gather_124_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1sw_gather_u64base_offset_u64 (p0, z0, 124), ++ z0_res = svld1sw_gather_offset_u64 (p0, z0, 124)) ++ ++/* ++** ld1sw_gather_128_u64_offset: ++** mov (x[0-9]+), #?128 ++** ld1sw z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sw_gather_128_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1sw_gather_u64base_offset_u64 (p0, z0, 128), ++ z0_res = svld1sw_gather_offset_u64 (p0, z0, 128)) ++ ++/* ++** ld1sw_gather_x0_u64_index: ++** lsl (x[0-9]+), x0, #?2 ++** ld1sw z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sw_gather_x0_u64_index, svuint64_t, svuint64_t, ++ z0_res = svld1sw_gather_u64base_index_u64 (p0, z0, x0), ++ z0_res = svld1sw_gather_index_u64 (p0, z0, x0)) ++ ++/* ++** ld1sw_gather_m1_u64_index: ++** mov (x[0-9]+), #?-4 ++** ld1sw 
z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sw_gather_m1_u64_index, svuint64_t, svuint64_t, ++ z0_res = svld1sw_gather_u64base_index_u64 (p0, z0, -1), ++ z0_res = svld1sw_gather_index_u64 (p0, z0, -1)) ++ ++/* ++** ld1sw_gather_0_u64_index: ++** ld1sw z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sw_gather_0_u64_index, svuint64_t, svuint64_t, ++ z0_res = svld1sw_gather_u64base_index_u64 (p0, z0, 0), ++ z0_res = svld1sw_gather_index_u64 (p0, z0, 0)) ++ ++/* ++** ld1sw_gather_5_u64_index: ++** ld1sw z0\.d, p0/z, \[z0\.d, #20\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sw_gather_5_u64_index, svuint64_t, svuint64_t, ++ z0_res = svld1sw_gather_u64base_index_u64 (p0, z0, 5), ++ z0_res = svld1sw_gather_index_u64 (p0, z0, 5)) ++ ++/* ++** ld1sw_gather_31_u64_index: ++** ld1sw z0\.d, p0/z, \[z0\.d, #124\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sw_gather_31_u64_index, svuint64_t, svuint64_t, ++ z0_res = svld1sw_gather_u64base_index_u64 (p0, z0, 31), ++ z0_res = svld1sw_gather_index_u64 (p0, z0, 31)) ++ ++/* ++** ld1sw_gather_32_u64_index: ++** mov (x[0-9]+), #?128 ++** ld1sw z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1sw_gather_32_u64_index, svuint64_t, svuint64_t, ++ z0_res = svld1sw_gather_u64base_index_u64 (p0, z0, 32), ++ z0_res = svld1sw_gather_index_u64 (p0, z0, 32)) ++ ++/* ++** ld1sw_gather_x0_u64_s64offset: ++** ld1sw z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sw_gather_x0_u64_s64offset, svuint64_t, int32_t, svint64_t, ++ z0_res = svld1sw_gather_s64offset_u64 (p0, x0, z0), ++ z0_res = svld1sw_gather_offset_u64 (p0, x0, z0)) ++ ++/* ++** ld1sw_gather_tied1_u64_s64offset: ++** ld1sw z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sw_gather_tied1_u64_s64offset, svuint64_t, int32_t, svint64_t, ++ z0_res = svld1sw_gather_s64offset_u64 (p0, x0, z0), ++ z0_res = svld1sw_gather_offset_u64 (p0, x0, z0)) ++ ++/* ++** ld1sw_gather_untied_u64_s64offset: ++** ld1sw z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sw_gather_untied_u64_s64offset, svuint64_t, int32_t, svint64_t, ++ z0_res = svld1sw_gather_s64offset_u64 (p0, x0, z1), ++ z0_res = svld1sw_gather_offset_u64 (p0, x0, z1)) ++ ++/* ++** ld1sw_gather_ext_u64_s64offset: ++** ld1sw z0\.d, p0/z, \[x0, z1\.d, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sw_gather_ext_u64_s64offset, svuint64_t, int32_t, svint64_t, ++ z0_res = svld1sw_gather_s64offset_u64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svld1sw_gather_offset_u64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ld1sw_gather_x0_u64_u64offset: ++** ld1sw z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sw_gather_x0_u64_u64offset, svuint64_t, int32_t, svuint64_t, ++ z0_res = svld1sw_gather_u64offset_u64 (p0, x0, z0), ++ z0_res = svld1sw_gather_offset_u64 (p0, x0, z0)) ++ ++/* ++** ld1sw_gather_tied1_u64_u64offset: ++** ld1sw z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sw_gather_tied1_u64_u64offset, svuint64_t, int32_t, svuint64_t, ++ z0_res = svld1sw_gather_u64offset_u64 (p0, x0, z0), ++ z0_res = svld1sw_gather_offset_u64 (p0, x0, z0)) ++ ++/* ++** ld1sw_gather_untied_u64_u64offset: ++** ld1sw z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sw_gather_untied_u64_u64offset, svuint64_t, int32_t, svuint64_t, ++ z0_res = svld1sw_gather_u64offset_u64 (p0, x0, z1), ++ z0_res = svld1sw_gather_offset_u64 (p0, x0, z1)) ++ ++/* ++** ld1sw_gather_ext_u64_u64offset: ++** ld1sw z0\.d, p0/z, \[x0, z1\.d, uxtw\] ++** ret ++*/ 
++TEST_LOAD_GATHER_SZ (ld1sw_gather_ext_u64_u64offset, svuint64_t, int32_t, svuint64_t, ++ z0_res = svld1sw_gather_u64offset_u64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svld1sw_gather_offset_u64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ld1sw_gather_x0_u64_s64index: ++** ld1sw z0\.d, p0/z, \[x0, z0\.d, lsl 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sw_gather_x0_u64_s64index, svuint64_t, int32_t, svint64_t, ++ z0_res = svld1sw_gather_s64index_u64 (p0, x0, z0), ++ z0_res = svld1sw_gather_index_u64 (p0, x0, z0)) ++ ++/* ++** ld1sw_gather_tied1_u64_s64index: ++** ld1sw z0\.d, p0/z, \[x0, z0\.d, lsl 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sw_gather_tied1_u64_s64index, svuint64_t, int32_t, svint64_t, ++ z0_res = svld1sw_gather_s64index_u64 (p0, x0, z0), ++ z0_res = svld1sw_gather_index_u64 (p0, x0, z0)) ++ ++/* ++** ld1sw_gather_untied_u64_s64index: ++** ld1sw z0\.d, p0/z, \[x0, z1\.d, lsl 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sw_gather_untied_u64_s64index, svuint64_t, int32_t, svint64_t, ++ z0_res = svld1sw_gather_s64index_u64 (p0, x0, z1), ++ z0_res = svld1sw_gather_index_u64 (p0, x0, z1)) ++ ++/* ++** ld1sw_gather_ext_u64_s64index: ++** ld1sw z0\.d, p0/z, \[x0, z1\.d, sxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sw_gather_ext_u64_s64index, svuint64_t, int32_t, svint64_t, ++ z0_res = svld1sw_gather_s64index_u64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svld1sw_gather_index_u64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ld1sw_gather_x0_u64_u64index: ++** ld1sw z0\.d, p0/z, \[x0, z0\.d, lsl 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sw_gather_x0_u64_u64index, svuint64_t, int32_t, svuint64_t, ++ z0_res = svld1sw_gather_u64index_u64 (p0, x0, z0), ++ z0_res = svld1sw_gather_index_u64 (p0, x0, z0)) ++ ++/* ++** ld1sw_gather_tied1_u64_u64index: ++** ld1sw z0\.d, p0/z, \[x0, z0\.d, lsl 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sw_gather_tied1_u64_u64index, svuint64_t, int32_t, svuint64_t, ++ z0_res = svld1sw_gather_u64index_u64 (p0, x0, z0), ++ z0_res = svld1sw_gather_index_u64 (p0, x0, z0)) ++ ++/* ++** ld1sw_gather_untied_u64_u64index: ++** ld1sw z0\.d, p0/z, \[x0, z1\.d, lsl 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sw_gather_untied_u64_u64index, svuint64_t, int32_t, svuint64_t, ++ z0_res = svld1sw_gather_u64index_u64 (p0, x0, z1), ++ z0_res = svld1sw_gather_index_u64 (p0, x0, z1)) ++ ++/* ++** ld1sw_gather_ext_u64_u64index: ++** ld1sw z0\.d, p0/z, \[x0, z1\.d, uxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1sw_gather_ext_u64_u64index, svuint64_t, int32_t, svuint64_t, ++ z0_res = svld1sw_gather_u64index_u64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svld1sw_gather_index_u64 (p0, x0, svextw_x (p0, z1))) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sw_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sw_s64.c +new file mode 100644 +index 000000000..019a12b20 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sw_s64.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1sw_s64_base: ++** ld1sw z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sw_s64_base, svint64_t, int32_t, ++ z0 = svld1sw_s64 (p0, x0), ++ z0 = svld1sw_s64 (p0, x0)) ++ ++/* ++** ld1sw_s64_index: ++** ld1sw z0\.d, p0/z, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_LOAD (ld1sw_s64_index, svint64_t, int32_t, ++ z0 = svld1sw_s64 (p0, x0 + x1), ++ z0 = svld1sw_s64 (p0, x0 + x1)) ++ ++/* ++** ld1sw_s64_1: ++** ld1sw z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sw_s64_1, svint64_t, int32_t, ++ z0 = svld1sw_s64 (p0, x0 + svcntd ()), ++ z0 = svld1sw_s64 (p0, x0 + svcntd ())) ++ ++/* ++** ld1sw_s64_7: ++** ld1sw z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sw_s64_7, svint64_t, int32_t, ++ z0 = svld1sw_s64 (p0, x0 + svcntd () * 7), ++ z0 = svld1sw_s64 (p0, x0 + svcntd () * 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1sw_s64_8: ++** incb x0, all, mul #4 ++** ld1sw z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sw_s64_8, svint64_t, int32_t, ++ z0 = svld1sw_s64 (p0, x0 + svcntd () * 8), ++ z0 = svld1sw_s64 (p0, x0 + svcntd () * 8)) ++ ++/* ++** ld1sw_s64_m1: ++** ld1sw z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sw_s64_m1, svint64_t, int32_t, ++ z0 = svld1sw_s64 (p0, x0 - svcntd ()), ++ z0 = svld1sw_s64 (p0, x0 - svcntd ())) ++ ++/* ++** ld1sw_s64_m8: ++** ld1sw z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sw_s64_m8, svint64_t, int32_t, ++ z0 = svld1sw_s64 (p0, x0 - svcntd () * 8), ++ z0 = svld1sw_s64 (p0, x0 - svcntd () * 8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1sw_s64_m9: ++** dech x0, all, mul #9 ++** ld1sw z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sw_s64_m9, svint64_t, int32_t, ++ z0 = svld1sw_s64 (p0, x0 - svcntd () * 9), ++ z0 = svld1sw_s64 (p0, x0 - svcntd () * 9)) ++ ++/* ++** ld1sw_vnum_s64_0: ++** ld1sw z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sw_vnum_s64_0, svint64_t, int32_t, ++ z0 = svld1sw_vnum_s64 (p0, x0, 0), ++ z0 = svld1sw_vnum_s64 (p0, x0, 0)) ++ ++/* ++** ld1sw_vnum_s64_1: ++** ld1sw z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sw_vnum_s64_1, svint64_t, int32_t, ++ z0 = svld1sw_vnum_s64 (p0, x0, 1), ++ z0 = svld1sw_vnum_s64 (p0, x0, 1)) ++ ++/* ++** ld1sw_vnum_s64_7: ++** ld1sw z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sw_vnum_s64_7, svint64_t, int32_t, ++ z0 = svld1sw_vnum_s64 (p0, x0, 7), ++ z0 = svld1sw_vnum_s64 (p0, x0, 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1sw_vnum_s64_8: ++** incb x0, all, mul #4 ++** ld1sw z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sw_vnum_s64_8, svint64_t, int32_t, ++ z0 = svld1sw_vnum_s64 (p0, x0, 8), ++ z0 = svld1sw_vnum_s64 (p0, x0, 8)) ++ ++/* ++** ld1sw_vnum_s64_m1: ++** ld1sw z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sw_vnum_s64_m1, svint64_t, int32_t, ++ z0 = svld1sw_vnum_s64 (p0, x0, -1), ++ z0 = svld1sw_vnum_s64 (p0, x0, -1)) ++ ++/* ++** ld1sw_vnum_s64_m8: ++** ld1sw z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sw_vnum_s64_m8, svint64_t, int32_t, ++ z0 = svld1sw_vnum_s64 (p0, x0, -8), ++ z0 = svld1sw_vnum_s64 (p0, x0, -8)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld1sw_vnum_s64_m9: ++** dech x0, all, mul #9 ++** ld1sw z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sw_vnum_s64_m9, svint64_t, int32_t, ++ z0 = svld1sw_vnum_s64 (p0, x0, -9), ++ z0 = svld1sw_vnum_s64 (p0, x0, -9)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ld1sw_vnum_s64_x1: ++** cnth (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld1sw z0\.d, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld1sw_vnum_s64_x1, svint64_t, int32_t, ++ z0 = svld1sw_vnum_s64 (p0, x0, x1), ++ z0 = svld1sw_vnum_s64 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sw_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sw_u64.c +new file mode 100644 +index 000000000..4c291c243 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sw_u64.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1sw_u64_base: ++** ld1sw z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sw_u64_base, svuint64_t, int32_t, ++ z0 = svld1sw_u64 (p0, x0), ++ z0 = svld1sw_u64 (p0, x0)) ++ ++/* ++** ld1sw_u64_index: ++** ld1sw z0\.d, p0/z, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_LOAD (ld1sw_u64_index, svuint64_t, int32_t, ++ z0 = svld1sw_u64 (p0, x0 + x1), ++ z0 = svld1sw_u64 (p0, x0 + x1)) ++ ++/* ++** ld1sw_u64_1: ++** ld1sw z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sw_u64_1, svuint64_t, int32_t, ++ z0 = svld1sw_u64 (p0, x0 + svcntd ()), ++ z0 = svld1sw_u64 (p0, x0 + svcntd ())) ++ ++/* ++** ld1sw_u64_7: ++** ld1sw z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sw_u64_7, svuint64_t, int32_t, ++ z0 = svld1sw_u64 (p0, x0 + svcntd () * 7), ++ z0 = svld1sw_u64 (p0, x0 + svcntd () * 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1sw_u64_8: ++** incb x0, all, mul #4 ++** ld1sw z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sw_u64_8, svuint64_t, int32_t, ++ z0 = svld1sw_u64 (p0, x0 + svcntd () * 8), ++ z0 = svld1sw_u64 (p0, x0 + svcntd () * 8)) ++ ++/* ++** ld1sw_u64_m1: ++** ld1sw z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sw_u64_m1, svuint64_t, int32_t, ++ z0 = svld1sw_u64 (p0, x0 - svcntd ()), ++ z0 = svld1sw_u64 (p0, x0 - svcntd ())) ++ ++/* ++** ld1sw_u64_m8: ++** ld1sw z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sw_u64_m8, svuint64_t, int32_t, ++ z0 = svld1sw_u64 (p0, x0 - svcntd () * 8), ++ z0 = svld1sw_u64 (p0, x0 - svcntd () * 8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1sw_u64_m9: ++** dech x0, all, mul #9 ++** ld1sw z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sw_u64_m9, svuint64_t, int32_t, ++ z0 = svld1sw_u64 (p0, x0 - svcntd () * 9), ++ z0 = svld1sw_u64 (p0, x0 - svcntd () * 9)) ++ ++/* ++** ld1sw_vnum_u64_0: ++** ld1sw z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sw_vnum_u64_0, svuint64_t, int32_t, ++ z0 = svld1sw_vnum_u64 (p0, x0, 0), ++ z0 = svld1sw_vnum_u64 (p0, x0, 0)) ++ ++/* ++** ld1sw_vnum_u64_1: ++** ld1sw z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sw_vnum_u64_1, svuint64_t, int32_t, ++ z0 = svld1sw_vnum_u64 (p0, x0, 1), ++ z0 = svld1sw_vnum_u64 (p0, x0, 1)) ++ ++/* ++** ld1sw_vnum_u64_7: ++** ld1sw z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sw_vnum_u64_7, svuint64_t, int32_t, ++ z0 = svld1sw_vnum_u64 (p0, x0, 7), ++ z0 = svld1sw_vnum_u64 (p0, x0, 7)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld1sw_vnum_u64_8: ++** incb x0, all, mul #4 ++** ld1sw z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sw_vnum_u64_8, svuint64_t, int32_t, ++ z0 = svld1sw_vnum_u64 (p0, x0, 8), ++ z0 = svld1sw_vnum_u64 (p0, x0, 8)) ++ ++/* ++** ld1sw_vnum_u64_m1: ++** ld1sw z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sw_vnum_u64_m1, svuint64_t, int32_t, ++ z0 = svld1sw_vnum_u64 (p0, x0, -1), ++ z0 = svld1sw_vnum_u64 (p0, x0, -1)) ++ ++/* ++** ld1sw_vnum_u64_m8: ++** ld1sw z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1sw_vnum_u64_m8, svuint64_t, int32_t, ++ z0 = svld1sw_vnum_u64 (p0, x0, -8), ++ z0 = svld1sw_vnum_u64 (p0, x0, -8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1sw_vnum_u64_m9: ++** dech x0, all, mul #9 ++** ld1sw z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1sw_vnum_u64_m9, svuint64_t, int32_t, ++ z0 = svld1sw_vnum_u64 (p0, x0, -9), ++ z0 = svld1sw_vnum_u64 (p0, x0, -9)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ld1sw_vnum_u64_x1: ++** cnth (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld1sw z0\.d, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld1sw_vnum_u64_x1, svuint64_t, int32_t, ++ z0 = svld1sw_vnum_u64 (p0, x0, x1), ++ z0 = svld1sw_vnum_u64 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_gather_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_gather_s32.c +new file mode 100644 +index 000000000..a9c418265 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_gather_s32.c +@@ -0,0 +1,131 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1ub_gather_s32_tied1: ++** ld1b z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1ub_gather_s32_tied1, svint32_t, svuint32_t, ++ z0_res = svld1ub_gather_u32base_s32 (p0, z0), ++ z0_res = svld1ub_gather_s32 (p0, z0)) ++ ++/* ++** ld1ub_gather_s32_untied: ++** ld1b z0\.s, p0/z, \[z1\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1ub_gather_s32_untied, svint32_t, svuint32_t, ++ z0_res = svld1ub_gather_u32base_s32 (p0, z1), ++ z0_res = svld1ub_gather_s32 (p0, z1)) ++ ++/* ++** ld1ub_gather_x0_s32_offset: ++** ld1b z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1ub_gather_x0_s32_offset, svint32_t, svuint32_t, ++ z0_res = svld1ub_gather_u32base_offset_s32 (p0, z0, x0), ++ z0_res = svld1ub_gather_offset_s32 (p0, z0, x0)) ++ ++/* ++** ld1ub_gather_m1_s32_offset: ++** mov (x[0-9]+), #?-1 ++** ld1b z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1ub_gather_m1_s32_offset, svint32_t, svuint32_t, ++ z0_res = svld1ub_gather_u32base_offset_s32 (p0, z0, -1), ++ z0_res = svld1ub_gather_offset_s32 (p0, z0, -1)) ++ ++/* ++** ld1ub_gather_0_s32_offset: ++** ld1b z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1ub_gather_0_s32_offset, svint32_t, svuint32_t, ++ z0_res = svld1ub_gather_u32base_offset_s32 (p0, z0, 0), ++ z0_res = svld1ub_gather_offset_s32 (p0, z0, 0)) ++ ++/* ++** ld1ub_gather_5_s32_offset: ++** ld1b z0\.s, p0/z, \[z0\.s, #5\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1ub_gather_5_s32_offset, svint32_t, svuint32_t, ++ z0_res = svld1ub_gather_u32base_offset_s32 (p0, z0, 5), ++ z0_res = svld1ub_gather_offset_s32 (p0, z0, 5)) ++ ++/* ++** ld1ub_gather_31_s32_offset: ++** ld1b z0\.s, p0/z, \[z0\.s, #31\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1ub_gather_31_s32_offset, svint32_t, svuint32_t, ++ z0_res = 
svld1ub_gather_u32base_offset_s32 (p0, z0, 31), ++ z0_res = svld1ub_gather_offset_s32 (p0, z0, 31)) ++ ++/* ++** ld1ub_gather_32_s32_offset: ++** mov (x[0-9]+), #?32 ++** ld1b z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1ub_gather_32_s32_offset, svint32_t, svuint32_t, ++ z0_res = svld1ub_gather_u32base_offset_s32 (p0, z0, 32), ++ z0_res = svld1ub_gather_offset_s32 (p0, z0, 32)) ++ ++/* ++** ld1ub_gather_x0_s32_s32offset: ++** ld1b z0\.s, p0/z, \[x0, z0\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1ub_gather_x0_s32_s32offset, svint32_t, uint8_t, svint32_t, ++ z0_res = svld1ub_gather_s32offset_s32 (p0, x0, z0), ++ z0_res = svld1ub_gather_offset_s32 (p0, x0, z0)) ++ ++/* ++** ld1ub_gather_tied1_s32_s32offset: ++** ld1b z0\.s, p0/z, \[x0, z0\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1ub_gather_tied1_s32_s32offset, svint32_t, uint8_t, svint32_t, ++ z0_res = svld1ub_gather_s32offset_s32 (p0, x0, z0), ++ z0_res = svld1ub_gather_offset_s32 (p0, x0, z0)) ++ ++/* ++** ld1ub_gather_untied_s32_s32offset: ++** ld1b z0\.s, p0/z, \[x0, z1\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1ub_gather_untied_s32_s32offset, svint32_t, uint8_t, svint32_t, ++ z0_res = svld1ub_gather_s32offset_s32 (p0, x0, z1), ++ z0_res = svld1ub_gather_offset_s32 (p0, x0, z1)) ++ ++/* ++** ld1ub_gather_x0_s32_u32offset: ++** ld1b z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1ub_gather_x0_s32_u32offset, svint32_t, uint8_t, svuint32_t, ++ z0_res = svld1ub_gather_u32offset_s32 (p0, x0, z0), ++ z0_res = svld1ub_gather_offset_s32 (p0, x0, z0)) ++ ++/* ++** ld1ub_gather_tied1_s32_u32offset: ++** ld1b z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1ub_gather_tied1_s32_u32offset, svint32_t, uint8_t, svuint32_t, ++ z0_res = svld1ub_gather_u32offset_s32 (p0, x0, z0), ++ z0_res = svld1ub_gather_offset_s32 (p0, x0, z0)) ++ ++/* ++** ld1ub_gather_untied_s32_u32offset: ++** ld1b z0\.s, p0/z, \[x0, z1\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1ub_gather_untied_s32_u32offset, svint32_t, uint8_t, svuint32_t, ++ z0_res = svld1ub_gather_u32offset_s32 (p0, x0, z1), ++ z0_res = svld1ub_gather_offset_s32 (p0, x0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_gather_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_gather_s64.c +new file mode 100644 +index 000000000..99af86ddf +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_gather_s64.c +@@ -0,0 +1,149 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1ub_gather_s64_tied1: ++** ld1b z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1ub_gather_s64_tied1, svint64_t, svuint64_t, ++ z0_res = svld1ub_gather_u64base_s64 (p0, z0), ++ z0_res = svld1ub_gather_s64 (p0, z0)) ++ ++/* ++** ld1ub_gather_s64_untied: ++** ld1b z0\.d, p0/z, \[z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1ub_gather_s64_untied, svint64_t, svuint64_t, ++ z0_res = svld1ub_gather_u64base_s64 (p0, z1), ++ z0_res = svld1ub_gather_s64 (p0, z1)) ++ ++/* ++** ld1ub_gather_x0_s64_offset: ++** ld1b z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1ub_gather_x0_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1ub_gather_u64base_offset_s64 (p0, z0, x0), ++ z0_res = svld1ub_gather_offset_s64 (p0, z0, x0)) ++ ++/* ++** ld1ub_gather_m1_s64_offset: ++** mov (x[0-9]+), #?-1 ++** ld1b z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1ub_gather_m1_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1ub_gather_u64base_offset_s64 (p0, z0, -1), ++ z0_res = svld1ub_gather_offset_s64 (p0, z0, -1)) ++ ++/* ++** ld1ub_gather_0_s64_offset: ++** ld1b z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1ub_gather_0_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1ub_gather_u64base_offset_s64 (p0, z0, 0), ++ z0_res = svld1ub_gather_offset_s64 (p0, z0, 0)) ++ ++/* ++** ld1ub_gather_5_s64_offset: ++** ld1b z0\.d, p0/z, \[z0\.d, #5\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1ub_gather_5_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1ub_gather_u64base_offset_s64 (p0, z0, 5), ++ z0_res = svld1ub_gather_offset_s64 (p0, z0, 5)) ++ ++/* ++** ld1ub_gather_31_s64_offset: ++** ld1b z0\.d, p0/z, \[z0\.d, #31\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1ub_gather_31_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1ub_gather_u64base_offset_s64 (p0, z0, 31), ++ z0_res = svld1ub_gather_offset_s64 (p0, z0, 31)) ++ ++/* ++** ld1ub_gather_32_s64_offset: ++** mov (x[0-9]+), #?32 ++** ld1b z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1ub_gather_32_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1ub_gather_u64base_offset_s64 (p0, z0, 32), ++ z0_res = svld1ub_gather_offset_s64 (p0, z0, 32)) ++ ++/* ++** ld1ub_gather_x0_s64_s64offset: ++** ld1b z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1ub_gather_x0_s64_s64offset, svint64_t, uint8_t, svint64_t, ++ z0_res = svld1ub_gather_s64offset_s64 (p0, x0, z0), ++ z0_res = svld1ub_gather_offset_s64 (p0, x0, z0)) ++ ++/* ++** ld1ub_gather_tied1_s64_s64offset: ++** ld1b z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1ub_gather_tied1_s64_s64offset, svint64_t, uint8_t, svint64_t, ++ z0_res = svld1ub_gather_s64offset_s64 (p0, x0, z0), ++ z0_res = svld1ub_gather_offset_s64 (p0, x0, z0)) ++ ++/* ++** ld1ub_gather_untied_s64_s64offset: ++** ld1b z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1ub_gather_untied_s64_s64offset, svint64_t, uint8_t, svint64_t, ++ z0_res = svld1ub_gather_s64offset_s64 (p0, x0, z1), ++ z0_res = svld1ub_gather_offset_s64 (p0, x0, z1)) ++ ++/* ++** ld1ub_gather_ext_s64_s64offset: ++** ld1b z0\.d, p0/z, \[x0, z1\.d, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1ub_gather_ext_s64_s64offset, svint64_t, uint8_t, svint64_t, ++ z0_res = svld1ub_gather_s64offset_s64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svld1ub_gather_offset_s64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ld1ub_gather_x0_s64_u64offset: ++** ld1b z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ 
++TEST_LOAD_GATHER_SZ (ld1ub_gather_x0_s64_u64offset, svint64_t, uint8_t, svuint64_t, ++ z0_res = svld1ub_gather_u64offset_s64 (p0, x0, z0), ++ z0_res = svld1ub_gather_offset_s64 (p0, x0, z0)) ++ ++/* ++** ld1ub_gather_tied1_s64_u64offset: ++** ld1b z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1ub_gather_tied1_s64_u64offset, svint64_t, uint8_t, svuint64_t, ++ z0_res = svld1ub_gather_u64offset_s64 (p0, x0, z0), ++ z0_res = svld1ub_gather_offset_s64 (p0, x0, z0)) ++ ++/* ++** ld1ub_gather_untied_s64_u64offset: ++** ld1b z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1ub_gather_untied_s64_u64offset, svint64_t, uint8_t, svuint64_t, ++ z0_res = svld1ub_gather_u64offset_s64 (p0, x0, z1), ++ z0_res = svld1ub_gather_offset_s64 (p0, x0, z1)) ++ ++/* ++** ld1ub_gather_ext_s64_u64offset: ++** ld1b z0\.d, p0/z, \[x0, z1\.d, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1ub_gather_ext_s64_u64offset, svint64_t, uint8_t, svuint64_t, ++ z0_res = svld1ub_gather_u64offset_s64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svld1ub_gather_offset_s64 (p0, x0, svextw_x (p0, z1))) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_gather_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_gather_u32.c +new file mode 100644 +index 000000000..77c7e0a2d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_gather_u32.c +@@ -0,0 +1,131 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1ub_gather_u32_tied1: ++** ld1b z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1ub_gather_u32_tied1, svuint32_t, svuint32_t, ++ z0_res = svld1ub_gather_u32base_u32 (p0, z0), ++ z0_res = svld1ub_gather_u32 (p0, z0)) ++ ++/* ++** ld1ub_gather_u32_untied: ++** ld1b z0\.s, p0/z, \[z1\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1ub_gather_u32_untied, svuint32_t, svuint32_t, ++ z0_res = svld1ub_gather_u32base_u32 (p0, z1), ++ z0_res = svld1ub_gather_u32 (p0, z1)) ++ ++/* ++** ld1ub_gather_x0_u32_offset: ++** ld1b z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1ub_gather_x0_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svld1ub_gather_u32base_offset_u32 (p0, z0, x0), ++ z0_res = svld1ub_gather_offset_u32 (p0, z0, x0)) ++ ++/* ++** ld1ub_gather_m1_u32_offset: ++** mov (x[0-9]+), #?-1 ++** ld1b z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1ub_gather_m1_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svld1ub_gather_u32base_offset_u32 (p0, z0, -1), ++ z0_res = svld1ub_gather_offset_u32 (p0, z0, -1)) ++ ++/* ++** ld1ub_gather_0_u32_offset: ++** ld1b z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1ub_gather_0_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svld1ub_gather_u32base_offset_u32 (p0, z0, 0), ++ z0_res = svld1ub_gather_offset_u32 (p0, z0, 0)) ++ ++/* ++** ld1ub_gather_5_u32_offset: ++** ld1b z0\.s, p0/z, \[z0\.s, #5\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1ub_gather_5_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svld1ub_gather_u32base_offset_u32 (p0, z0, 5), ++ z0_res = svld1ub_gather_offset_u32 (p0, z0, 5)) ++ ++/* ++** ld1ub_gather_31_u32_offset: ++** ld1b z0\.s, p0/z, \[z0\.s, #31\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1ub_gather_31_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svld1ub_gather_u32base_offset_u32 (p0, z0, 31), ++ z0_res = svld1ub_gather_offset_u32 (p0, z0, 31)) ++ ++/* ++** ld1ub_gather_32_u32_offset: ++** mov (x[0-9]+), #?32 ++** ld1b z0\.s, p0/z, \[\1, z0\.s, 
uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1ub_gather_32_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svld1ub_gather_u32base_offset_u32 (p0, z0, 32), ++ z0_res = svld1ub_gather_offset_u32 (p0, z0, 32)) ++ ++/* ++** ld1ub_gather_x0_u32_s32offset: ++** ld1b z0\.s, p0/z, \[x0, z0\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1ub_gather_x0_u32_s32offset, svuint32_t, uint8_t, svint32_t, ++ z0_res = svld1ub_gather_s32offset_u32 (p0, x0, z0), ++ z0_res = svld1ub_gather_offset_u32 (p0, x0, z0)) ++ ++/* ++** ld1ub_gather_tied1_u32_s32offset: ++** ld1b z0\.s, p0/z, \[x0, z0\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1ub_gather_tied1_u32_s32offset, svuint32_t, uint8_t, svint32_t, ++ z0_res = svld1ub_gather_s32offset_u32 (p0, x0, z0), ++ z0_res = svld1ub_gather_offset_u32 (p0, x0, z0)) ++ ++/* ++** ld1ub_gather_untied_u32_s32offset: ++** ld1b z0\.s, p0/z, \[x0, z1\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1ub_gather_untied_u32_s32offset, svuint32_t, uint8_t, svint32_t, ++ z0_res = svld1ub_gather_s32offset_u32 (p0, x0, z1), ++ z0_res = svld1ub_gather_offset_u32 (p0, x0, z1)) ++ ++/* ++** ld1ub_gather_x0_u32_u32offset: ++** ld1b z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1ub_gather_x0_u32_u32offset, svuint32_t, uint8_t, svuint32_t, ++ z0_res = svld1ub_gather_u32offset_u32 (p0, x0, z0), ++ z0_res = svld1ub_gather_offset_u32 (p0, x0, z0)) ++ ++/* ++** ld1ub_gather_tied1_u32_u32offset: ++** ld1b z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1ub_gather_tied1_u32_u32offset, svuint32_t, uint8_t, svuint32_t, ++ z0_res = svld1ub_gather_u32offset_u32 (p0, x0, z0), ++ z0_res = svld1ub_gather_offset_u32 (p0, x0, z0)) ++ ++/* ++** ld1ub_gather_untied_u32_u32offset: ++** ld1b z0\.s, p0/z, \[x0, z1\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1ub_gather_untied_u32_u32offset, svuint32_t, uint8_t, svuint32_t, ++ z0_res = svld1ub_gather_u32offset_u32 (p0, x0, z1), ++ z0_res = svld1ub_gather_offset_u32 (p0, x0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_gather_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_gather_u64.c +new file mode 100644 +index 000000000..b605f8b67 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_gather_u64.c +@@ -0,0 +1,149 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1ub_gather_u64_tied1: ++** ld1b z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1ub_gather_u64_tied1, svuint64_t, svuint64_t, ++ z0_res = svld1ub_gather_u64base_u64 (p0, z0), ++ z0_res = svld1ub_gather_u64 (p0, z0)) ++ ++/* ++** ld1ub_gather_u64_untied: ++** ld1b z0\.d, p0/z, \[z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1ub_gather_u64_untied, svuint64_t, svuint64_t, ++ z0_res = svld1ub_gather_u64base_u64 (p0, z1), ++ z0_res = svld1ub_gather_u64 (p0, z1)) ++ ++/* ++** ld1ub_gather_x0_u64_offset: ++** ld1b z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1ub_gather_x0_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1ub_gather_u64base_offset_u64 (p0, z0, x0), ++ z0_res = svld1ub_gather_offset_u64 (p0, z0, x0)) ++ ++/* ++** ld1ub_gather_m1_u64_offset: ++** mov (x[0-9]+), #?-1 ++** ld1b z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1ub_gather_m1_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1ub_gather_u64base_offset_u64 (p0, z0, -1), ++ z0_res = svld1ub_gather_offset_u64 (p0, z0, -1)) ++ ++/* ++** ld1ub_gather_0_u64_offset: ++** ld1b z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1ub_gather_0_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1ub_gather_u64base_offset_u64 (p0, z0, 0), ++ z0_res = svld1ub_gather_offset_u64 (p0, z0, 0)) ++ ++/* ++** ld1ub_gather_5_u64_offset: ++** ld1b z0\.d, p0/z, \[z0\.d, #5\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1ub_gather_5_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1ub_gather_u64base_offset_u64 (p0, z0, 5), ++ z0_res = svld1ub_gather_offset_u64 (p0, z0, 5)) ++ ++/* ++** ld1ub_gather_31_u64_offset: ++** ld1b z0\.d, p0/z, \[z0\.d, #31\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1ub_gather_31_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1ub_gather_u64base_offset_u64 (p0, z0, 31), ++ z0_res = svld1ub_gather_offset_u64 (p0, z0, 31)) ++ ++/* ++** ld1ub_gather_32_u64_offset: ++** mov (x[0-9]+), #?32 ++** ld1b z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1ub_gather_32_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1ub_gather_u64base_offset_u64 (p0, z0, 32), ++ z0_res = svld1ub_gather_offset_u64 (p0, z0, 32)) ++ ++/* ++** ld1ub_gather_x0_u64_s64offset: ++** ld1b z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1ub_gather_x0_u64_s64offset, svuint64_t, uint8_t, svint64_t, ++ z0_res = svld1ub_gather_s64offset_u64 (p0, x0, z0), ++ z0_res = svld1ub_gather_offset_u64 (p0, x0, z0)) ++ ++/* ++** ld1ub_gather_tied1_u64_s64offset: ++** ld1b z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1ub_gather_tied1_u64_s64offset, svuint64_t, uint8_t, svint64_t, ++ z0_res = svld1ub_gather_s64offset_u64 (p0, x0, z0), ++ z0_res = svld1ub_gather_offset_u64 (p0, x0, z0)) ++ ++/* ++** ld1ub_gather_untied_u64_s64offset: ++** ld1b z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1ub_gather_untied_u64_s64offset, svuint64_t, uint8_t, svint64_t, ++ z0_res = svld1ub_gather_s64offset_u64 (p0, x0, z1), ++ z0_res = svld1ub_gather_offset_u64 (p0, x0, z1)) ++ ++/* ++** ld1ub_gather_ext_u64_s64offset: ++** ld1b z0\.d, p0/z, \[x0, z1\.d, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1ub_gather_ext_u64_s64offset, svuint64_t, uint8_t, svint64_t, ++ z0_res = svld1ub_gather_s64offset_u64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svld1ub_gather_offset_u64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ld1ub_gather_x0_u64_u64offset: ++** ld1b z0\.d, p0/z, \[x0, z0\.d\] ++** ret 
++*/ ++TEST_LOAD_GATHER_SZ (ld1ub_gather_x0_u64_u64offset, svuint64_t, uint8_t, svuint64_t, ++ z0_res = svld1ub_gather_u64offset_u64 (p0, x0, z0), ++ z0_res = svld1ub_gather_offset_u64 (p0, x0, z0)) ++ ++/* ++** ld1ub_gather_tied1_u64_u64offset: ++** ld1b z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1ub_gather_tied1_u64_u64offset, svuint64_t, uint8_t, svuint64_t, ++ z0_res = svld1ub_gather_u64offset_u64 (p0, x0, z0), ++ z0_res = svld1ub_gather_offset_u64 (p0, x0, z0)) ++ ++/* ++** ld1ub_gather_untied_u64_u64offset: ++** ld1b z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1ub_gather_untied_u64_u64offset, svuint64_t, uint8_t, svuint64_t, ++ z0_res = svld1ub_gather_u64offset_u64 (p0, x0, z1), ++ z0_res = svld1ub_gather_offset_u64 (p0, x0, z1)) ++ ++/* ++** ld1ub_gather_ext_u64_u64offset: ++** ld1b z0\.d, p0/z, \[x0, z1\.d, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1ub_gather_ext_u64_u64offset, svuint64_t, uint8_t, svuint64_t, ++ z0_res = svld1ub_gather_u64offset_u64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svld1ub_gather_offset_u64 (p0, x0, svextw_x (p0, z1))) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_s16.c +new file mode 100644 +index 000000000..c492086b5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_s16.c +@@ -0,0 +1,162 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1ub_s16_base: ++** ld1b z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1ub_s16_base, svint16_t, uint8_t, ++ z0 = svld1ub_s16 (p0, x0), ++ z0 = svld1ub_s16 (p0, x0)) ++ ++/* ++** ld1ub_s16_index: ++** ld1b z0\.h, p0/z, \[x0, x1\] ++** ret ++*/ ++TEST_LOAD (ld1ub_s16_index, svint16_t, uint8_t, ++ z0 = svld1ub_s16 (p0, x0 + x1), ++ z0 = svld1ub_s16 (p0, x0 + x1)) ++ ++/* ++** ld1ub_s16_1: ++** ld1b z0\.h, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1ub_s16_1, svint16_t, uint8_t, ++ z0 = svld1ub_s16 (p0, x0 + svcnth ()), ++ z0 = svld1ub_s16 (p0, x0 + svcnth ())) ++ ++/* ++** ld1ub_s16_7: ++** ld1b z0\.h, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1ub_s16_7, svint16_t, uint8_t, ++ z0 = svld1ub_s16 (p0, x0 + svcnth () * 7), ++ z0 = svld1ub_s16 (p0, x0 + svcnth () * 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1ub_s16_8: ++** incb x0, all, mul #4 ++** ld1b z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1ub_s16_8, svint16_t, uint8_t, ++ z0 = svld1ub_s16 (p0, x0 + svcnth () * 8), ++ z0 = svld1ub_s16 (p0, x0 + svcnth () * 8)) ++ ++/* ++** ld1ub_s16_m1: ++** ld1b z0\.h, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1ub_s16_m1, svint16_t, uint8_t, ++ z0 = svld1ub_s16 (p0, x0 - svcnth ()), ++ z0 = svld1ub_s16 (p0, x0 - svcnth ())) ++ ++/* ++** ld1ub_s16_m8: ++** ld1b z0\.h, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1ub_s16_m8, svint16_t, uint8_t, ++ z0 = svld1ub_s16 (p0, x0 - svcnth () * 8), ++ z0 = svld1ub_s16 (p0, x0 - svcnth () * 8)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld1ub_s16_m9: ++** dech x0, all, mul #9 ++** ld1b z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1ub_s16_m9, svint16_t, uint8_t, ++ z0 = svld1ub_s16 (p0, x0 - svcnth () * 9), ++ z0 = svld1ub_s16 (p0, x0 - svcnth () * 9)) ++ ++/* ++** ld1ub_vnum_s16_0: ++** ld1b z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1ub_vnum_s16_0, svint16_t, uint8_t, ++ z0 = svld1ub_vnum_s16 (p0, x0, 0), ++ z0 = svld1ub_vnum_s16 (p0, x0, 0)) ++ ++/* ++** ld1ub_vnum_s16_1: ++** ld1b z0\.h, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1ub_vnum_s16_1, svint16_t, uint8_t, ++ z0 = svld1ub_vnum_s16 (p0, x0, 1), ++ z0 = svld1ub_vnum_s16 (p0, x0, 1)) ++ ++/* ++** ld1ub_vnum_s16_7: ++** ld1b z0\.h, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1ub_vnum_s16_7, svint16_t, uint8_t, ++ z0 = svld1ub_vnum_s16 (p0, x0, 7), ++ z0 = svld1ub_vnum_s16 (p0, x0, 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1ub_vnum_s16_8: ++** incb x0, all, mul #4 ++** ld1b z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1ub_vnum_s16_8, svint16_t, uint8_t, ++ z0 = svld1ub_vnum_s16 (p0, x0, 8), ++ z0 = svld1ub_vnum_s16 (p0, x0, 8)) ++ ++/* ++** ld1ub_vnum_s16_m1: ++** ld1b z0\.h, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1ub_vnum_s16_m1, svint16_t, uint8_t, ++ z0 = svld1ub_vnum_s16 (p0, x0, -1), ++ z0 = svld1ub_vnum_s16 (p0, x0, -1)) ++ ++/* ++** ld1ub_vnum_s16_m8: ++** ld1b z0\.h, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1ub_vnum_s16_m8, svint16_t, uint8_t, ++ z0 = svld1ub_vnum_s16 (p0, x0, -8), ++ z0 = svld1ub_vnum_s16 (p0, x0, -8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1ub_vnum_s16_m9: ++** dech x0, all, mul #9 ++** ld1b z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1ub_vnum_s16_m9, svint16_t, uint8_t, ++ z0 = svld1ub_vnum_s16 (p0, x0, -9), ++ z0 = svld1ub_vnum_s16 (p0, x0, -9)) ++ ++/* ++** ld1ub_vnum_s16_x1: ++** cnth (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ld1b z0\.h, p0/z, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** ld1b z0\.h, p0/z, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_LOAD (ld1ub_vnum_s16_x1, svint16_t, uint8_t, ++ z0 = svld1ub_vnum_s16 (p0, x0, x1), ++ z0 = svld1ub_vnum_s16 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_s32.c +new file mode 100644 +index 000000000..b2f8c4b04 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_s32.c +@@ -0,0 +1,162 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1ub_s32_base: ++** ld1b z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1ub_s32_base, svint32_t, uint8_t, ++ z0 = svld1ub_s32 (p0, x0), ++ z0 = svld1ub_s32 (p0, x0)) ++ ++/* ++** ld1ub_s32_index: ++** ld1b z0\.s, p0/z, \[x0, x1\] ++** ret ++*/ ++TEST_LOAD (ld1ub_s32_index, svint32_t, uint8_t, ++ z0 = svld1ub_s32 (p0, x0 + x1), ++ z0 = svld1ub_s32 (p0, x0 + x1)) ++ ++/* ++** ld1ub_s32_1: ++** ld1b z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1ub_s32_1, svint32_t, uint8_t, ++ z0 = svld1ub_s32 (p0, x0 + svcntw ()), ++ z0 = svld1ub_s32 (p0, x0 + svcntw ())) ++ ++/* ++** ld1ub_s32_7: ++** ld1b z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1ub_s32_7, svint32_t, uint8_t, ++ z0 = svld1ub_s32 (p0, x0 + svcntw () * 7), ++ z0 = svld1ub_s32 (p0, x0 + svcntw () * 7)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld1ub_s32_8: ++** incb x0, all, mul #2 ++** ld1b z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1ub_s32_8, svint32_t, uint8_t, ++ z0 = svld1ub_s32 (p0, x0 + svcntw () * 8), ++ z0 = svld1ub_s32 (p0, x0 + svcntw () * 8)) ++ ++/* ++** ld1ub_s32_m1: ++** ld1b z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1ub_s32_m1, svint32_t, uint8_t, ++ z0 = svld1ub_s32 (p0, x0 - svcntw ()), ++ z0 = svld1ub_s32 (p0, x0 - svcntw ())) ++ ++/* ++** ld1ub_s32_m8: ++** ld1b z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1ub_s32_m8, svint32_t, uint8_t, ++ z0 = svld1ub_s32 (p0, x0 - svcntw () * 8), ++ z0 = svld1ub_s32 (p0, x0 - svcntw () * 8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1ub_s32_m9: ++** decw x0, all, mul #9 ++** ld1b z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1ub_s32_m9, svint32_t, uint8_t, ++ z0 = svld1ub_s32 (p0, x0 - svcntw () * 9), ++ z0 = svld1ub_s32 (p0, x0 - svcntw () * 9)) ++ ++/* ++** ld1ub_vnum_s32_0: ++** ld1b z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1ub_vnum_s32_0, svint32_t, uint8_t, ++ z0 = svld1ub_vnum_s32 (p0, x0, 0), ++ z0 = svld1ub_vnum_s32 (p0, x0, 0)) ++ ++/* ++** ld1ub_vnum_s32_1: ++** ld1b z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1ub_vnum_s32_1, svint32_t, uint8_t, ++ z0 = svld1ub_vnum_s32 (p0, x0, 1), ++ z0 = svld1ub_vnum_s32 (p0, x0, 1)) ++ ++/* ++** ld1ub_vnum_s32_7: ++** ld1b z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1ub_vnum_s32_7, svint32_t, uint8_t, ++ z0 = svld1ub_vnum_s32 (p0, x0, 7), ++ z0 = svld1ub_vnum_s32 (p0, x0, 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1ub_vnum_s32_8: ++** incb x0, all, mul #2 ++** ld1b z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1ub_vnum_s32_8, svint32_t, uint8_t, ++ z0 = svld1ub_vnum_s32 (p0, x0, 8), ++ z0 = svld1ub_vnum_s32 (p0, x0, 8)) ++ ++/* ++** ld1ub_vnum_s32_m1: ++** ld1b z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1ub_vnum_s32_m1, svint32_t, uint8_t, ++ z0 = svld1ub_vnum_s32 (p0, x0, -1), ++ z0 = svld1ub_vnum_s32 (p0, x0, -1)) ++ ++/* ++** ld1ub_vnum_s32_m8: ++** ld1b z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1ub_vnum_s32_m8, svint32_t, uint8_t, ++ z0 = svld1ub_vnum_s32 (p0, x0, -8), ++ z0 = svld1ub_vnum_s32 (p0, x0, -8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1ub_vnum_s32_m9: ++** decw x0, all, mul #9 ++** ld1b z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1ub_vnum_s32_m9, svint32_t, uint8_t, ++ z0 = svld1ub_vnum_s32 (p0, x0, -9), ++ z0 = svld1ub_vnum_s32 (p0, x0, -9)) ++ ++/* ++** ld1ub_vnum_s32_x1: ++** cntw (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ld1b z0\.s, p0/z, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** ld1b z0\.s, p0/z, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_LOAD (ld1ub_vnum_s32_x1, svint32_t, uint8_t, ++ z0 = svld1ub_vnum_s32 (p0, x0, x1), ++ z0 = svld1ub_vnum_s32 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_s64.c +new file mode 100644 +index 000000000..d8694bf28 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_s64.c +@@ -0,0 +1,162 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1ub_s64_base: ++** ld1b z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1ub_s64_base, svint64_t, uint8_t, ++ z0 = svld1ub_s64 (p0, x0), ++ z0 = svld1ub_s64 (p0, x0)) ++ ++/* ++** ld1ub_s64_index: ++** ld1b z0\.d, p0/z, \[x0, x1\] ++** ret ++*/ ++TEST_LOAD (ld1ub_s64_index, svint64_t, uint8_t, ++ z0 = svld1ub_s64 (p0, x0 + x1), ++ z0 = svld1ub_s64 (p0, x0 + x1)) ++ ++/* ++** ld1ub_s64_1: ++** ld1b z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1ub_s64_1, svint64_t, uint8_t, ++ z0 = svld1ub_s64 (p0, x0 + svcntd ()), ++ z0 = svld1ub_s64 (p0, x0 + svcntd ())) ++ ++/* ++** ld1ub_s64_7: ++** ld1b z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1ub_s64_7, svint64_t, uint8_t, ++ z0 = svld1ub_s64 (p0, x0 + svcntd () * 7), ++ z0 = svld1ub_s64 (p0, x0 + svcntd () * 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1ub_s64_8: ++** incb x0 ++** ld1b z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1ub_s64_8, svint64_t, uint8_t, ++ z0 = svld1ub_s64 (p0, x0 + svcntd () * 8), ++ z0 = svld1ub_s64 (p0, x0 + svcntd () * 8)) ++ ++/* ++** ld1ub_s64_m1: ++** ld1b z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1ub_s64_m1, svint64_t, uint8_t, ++ z0 = svld1ub_s64 (p0, x0 - svcntd ()), ++ z0 = svld1ub_s64 (p0, x0 - svcntd ())) ++ ++/* ++** ld1ub_s64_m8: ++** ld1b z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1ub_s64_m8, svint64_t, uint8_t, ++ z0 = svld1ub_s64 (p0, x0 - svcntd () * 8), ++ z0 = svld1ub_s64 (p0, x0 - svcntd () * 8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1ub_s64_m9: ++** decd x0, all, mul #9 ++** ld1b z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1ub_s64_m9, svint64_t, uint8_t, ++ z0 = svld1ub_s64 (p0, x0 - svcntd () * 9), ++ z0 = svld1ub_s64 (p0, x0 - svcntd () * 9)) ++ ++/* ++** ld1ub_vnum_s64_0: ++** ld1b z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1ub_vnum_s64_0, svint64_t, uint8_t, ++ z0 = svld1ub_vnum_s64 (p0, x0, 0), ++ z0 = svld1ub_vnum_s64 (p0, x0, 0)) ++ ++/* ++** ld1ub_vnum_s64_1: ++** ld1b z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1ub_vnum_s64_1, svint64_t, uint8_t, ++ z0 = svld1ub_vnum_s64 (p0, x0, 1), ++ z0 = svld1ub_vnum_s64 (p0, x0, 1)) ++ ++/* ++** ld1ub_vnum_s64_7: ++** ld1b z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1ub_vnum_s64_7, svint64_t, uint8_t, ++ z0 = svld1ub_vnum_s64 (p0, x0, 7), ++ z0 = svld1ub_vnum_s64 (p0, x0, 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1ub_vnum_s64_8: ++** incb x0 ++** ld1b z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1ub_vnum_s64_8, svint64_t, uint8_t, ++ z0 = svld1ub_vnum_s64 (p0, x0, 8), ++ z0 = svld1ub_vnum_s64 (p0, x0, 8)) ++ ++/* ++** ld1ub_vnum_s64_m1: ++** ld1b z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1ub_vnum_s64_m1, svint64_t, uint8_t, ++ z0 = svld1ub_vnum_s64 (p0, x0, -1), ++ z0 = svld1ub_vnum_s64 (p0, x0, -1)) ++ ++/* ++** ld1ub_vnum_s64_m8: ++** ld1b z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1ub_vnum_s64_m8, svint64_t, uint8_t, ++ z0 = svld1ub_vnum_s64 (p0, x0, -8), ++ z0 = svld1ub_vnum_s64 (p0, x0, -8)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld1ub_vnum_s64_m9: ++** decd x0, all, mul #9 ++** ld1b z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1ub_vnum_s64_m9, svint64_t, uint8_t, ++ z0 = svld1ub_vnum_s64 (p0, x0, -9), ++ z0 = svld1ub_vnum_s64 (p0, x0, -9)) ++ ++/* ++** ld1ub_vnum_s64_x1: ++** cntd (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ld1b z0\.d, p0/z, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** ld1b z0\.d, p0/z, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_LOAD (ld1ub_vnum_s64_x1, svint64_t, uint8_t, ++ z0 = svld1ub_vnum_s64 (p0, x0, x1), ++ z0 = svld1ub_vnum_s64 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_u16.c +new file mode 100644 +index 000000000..049234ee4 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_u16.c +@@ -0,0 +1,162 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1ub_u16_base: ++** ld1b z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1ub_u16_base, svuint16_t, uint8_t, ++ z0 = svld1ub_u16 (p0, x0), ++ z0 = svld1ub_u16 (p0, x0)) ++ ++/* ++** ld1ub_u16_index: ++** ld1b z0\.h, p0/z, \[x0, x1\] ++** ret ++*/ ++TEST_LOAD (ld1ub_u16_index, svuint16_t, uint8_t, ++ z0 = svld1ub_u16 (p0, x0 + x1), ++ z0 = svld1ub_u16 (p0, x0 + x1)) ++ ++/* ++** ld1ub_u16_1: ++** ld1b z0\.h, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1ub_u16_1, svuint16_t, uint8_t, ++ z0 = svld1ub_u16 (p0, x0 + svcnth ()), ++ z0 = svld1ub_u16 (p0, x0 + svcnth ())) ++ ++/* ++** ld1ub_u16_7: ++** ld1b z0\.h, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1ub_u16_7, svuint16_t, uint8_t, ++ z0 = svld1ub_u16 (p0, x0 + svcnth () * 7), ++ z0 = svld1ub_u16 (p0, x0 + svcnth () * 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1ub_u16_8: ++** incb x0, all, mul #4 ++** ld1b z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1ub_u16_8, svuint16_t, uint8_t, ++ z0 = svld1ub_u16 (p0, x0 + svcnth () * 8), ++ z0 = svld1ub_u16 (p0, x0 + svcnth () * 8)) ++ ++/* ++** ld1ub_u16_m1: ++** ld1b z0\.h, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1ub_u16_m1, svuint16_t, uint8_t, ++ z0 = svld1ub_u16 (p0, x0 - svcnth ()), ++ z0 = svld1ub_u16 (p0, x0 - svcnth ())) ++ ++/* ++** ld1ub_u16_m8: ++** ld1b z0\.h, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1ub_u16_m8, svuint16_t, uint8_t, ++ z0 = svld1ub_u16 (p0, x0 - svcnth () * 8), ++ z0 = svld1ub_u16 (p0, x0 - svcnth () * 8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1ub_u16_m9: ++** dech x0, all, mul #9 ++** ld1b z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1ub_u16_m9, svuint16_t, uint8_t, ++ z0 = svld1ub_u16 (p0, x0 - svcnth () * 9), ++ z0 = svld1ub_u16 (p0, x0 - svcnth () * 9)) ++ ++/* ++** ld1ub_vnum_u16_0: ++** ld1b z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1ub_vnum_u16_0, svuint16_t, uint8_t, ++ z0 = svld1ub_vnum_u16 (p0, x0, 0), ++ z0 = svld1ub_vnum_u16 (p0, x0, 0)) ++ ++/* ++** ld1ub_vnum_u16_1: ++** ld1b z0\.h, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1ub_vnum_u16_1, svuint16_t, uint8_t, ++ z0 = svld1ub_vnum_u16 (p0, x0, 1), ++ z0 = svld1ub_vnum_u16 (p0, x0, 1)) ++ ++/* ++** ld1ub_vnum_u16_7: ++** ld1b z0\.h, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1ub_vnum_u16_7, svuint16_t, uint8_t, ++ z0 = svld1ub_vnum_u16 (p0, x0, 7), ++ z0 = svld1ub_vnum_u16 (p0, x0, 7)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld1ub_vnum_u16_8: ++** incb x0, all, mul #4 ++** ld1b z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1ub_vnum_u16_8, svuint16_t, uint8_t, ++ z0 = svld1ub_vnum_u16 (p0, x0, 8), ++ z0 = svld1ub_vnum_u16 (p0, x0, 8)) ++ ++/* ++** ld1ub_vnum_u16_m1: ++** ld1b z0\.h, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1ub_vnum_u16_m1, svuint16_t, uint8_t, ++ z0 = svld1ub_vnum_u16 (p0, x0, -1), ++ z0 = svld1ub_vnum_u16 (p0, x0, -1)) ++ ++/* ++** ld1ub_vnum_u16_m8: ++** ld1b z0\.h, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1ub_vnum_u16_m8, svuint16_t, uint8_t, ++ z0 = svld1ub_vnum_u16 (p0, x0, -8), ++ z0 = svld1ub_vnum_u16 (p0, x0, -8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1ub_vnum_u16_m9: ++** dech x0, all, mul #9 ++** ld1b z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1ub_vnum_u16_m9, svuint16_t, uint8_t, ++ z0 = svld1ub_vnum_u16 (p0, x0, -9), ++ z0 = svld1ub_vnum_u16 (p0, x0, -9)) ++ ++/* ++** ld1ub_vnum_u16_x1: ++** cnth (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ld1b z0\.h, p0/z, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** ld1b z0\.h, p0/z, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_LOAD (ld1ub_vnum_u16_x1, svuint16_t, uint8_t, ++ z0 = svld1ub_vnum_u16 (p0, x0, x1), ++ z0 = svld1ub_vnum_u16 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_u32.c +new file mode 100644 +index 000000000..58d2ef527 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_u32.c +@@ -0,0 +1,162 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1ub_u32_base: ++** ld1b z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1ub_u32_base, svuint32_t, uint8_t, ++ z0 = svld1ub_u32 (p0, x0), ++ z0 = svld1ub_u32 (p0, x0)) ++ ++/* ++** ld1ub_u32_index: ++** ld1b z0\.s, p0/z, \[x0, x1\] ++** ret ++*/ ++TEST_LOAD (ld1ub_u32_index, svuint32_t, uint8_t, ++ z0 = svld1ub_u32 (p0, x0 + x1), ++ z0 = svld1ub_u32 (p0, x0 + x1)) ++ ++/* ++** ld1ub_u32_1: ++** ld1b z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1ub_u32_1, svuint32_t, uint8_t, ++ z0 = svld1ub_u32 (p0, x0 + svcntw ()), ++ z0 = svld1ub_u32 (p0, x0 + svcntw ())) ++ ++/* ++** ld1ub_u32_7: ++** ld1b z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1ub_u32_7, svuint32_t, uint8_t, ++ z0 = svld1ub_u32 (p0, x0 + svcntw () * 7), ++ z0 = svld1ub_u32 (p0, x0 + svcntw () * 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1ub_u32_8: ++** incb x0, all, mul #2 ++** ld1b z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1ub_u32_8, svuint32_t, uint8_t, ++ z0 = svld1ub_u32 (p0, x0 + svcntw () * 8), ++ z0 = svld1ub_u32 (p0, x0 + svcntw () * 8)) ++ ++/* ++** ld1ub_u32_m1: ++** ld1b z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1ub_u32_m1, svuint32_t, uint8_t, ++ z0 = svld1ub_u32 (p0, x0 - svcntw ()), ++ z0 = svld1ub_u32 (p0, x0 - svcntw ())) ++ ++/* ++** ld1ub_u32_m8: ++** ld1b z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1ub_u32_m8, svuint32_t, uint8_t, ++ z0 = svld1ub_u32 (p0, x0 - svcntw () * 8), ++ z0 = svld1ub_u32 (p0, x0 - svcntw () * 8)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld1ub_u32_m9: ++** decw x0, all, mul #9 ++** ld1b z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1ub_u32_m9, svuint32_t, uint8_t, ++ z0 = svld1ub_u32 (p0, x0 - svcntw () * 9), ++ z0 = svld1ub_u32 (p0, x0 - svcntw () * 9)) ++ ++/* ++** ld1ub_vnum_u32_0: ++** ld1b z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1ub_vnum_u32_0, svuint32_t, uint8_t, ++ z0 = svld1ub_vnum_u32 (p0, x0, 0), ++ z0 = svld1ub_vnum_u32 (p0, x0, 0)) ++ ++/* ++** ld1ub_vnum_u32_1: ++** ld1b z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1ub_vnum_u32_1, svuint32_t, uint8_t, ++ z0 = svld1ub_vnum_u32 (p0, x0, 1), ++ z0 = svld1ub_vnum_u32 (p0, x0, 1)) ++ ++/* ++** ld1ub_vnum_u32_7: ++** ld1b z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1ub_vnum_u32_7, svuint32_t, uint8_t, ++ z0 = svld1ub_vnum_u32 (p0, x0, 7), ++ z0 = svld1ub_vnum_u32 (p0, x0, 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1ub_vnum_u32_8: ++** incb x0, all, mul #2 ++** ld1b z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1ub_vnum_u32_8, svuint32_t, uint8_t, ++ z0 = svld1ub_vnum_u32 (p0, x0, 8), ++ z0 = svld1ub_vnum_u32 (p0, x0, 8)) ++ ++/* ++** ld1ub_vnum_u32_m1: ++** ld1b z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1ub_vnum_u32_m1, svuint32_t, uint8_t, ++ z0 = svld1ub_vnum_u32 (p0, x0, -1), ++ z0 = svld1ub_vnum_u32 (p0, x0, -1)) ++ ++/* ++** ld1ub_vnum_u32_m8: ++** ld1b z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1ub_vnum_u32_m8, svuint32_t, uint8_t, ++ z0 = svld1ub_vnum_u32 (p0, x0, -8), ++ z0 = svld1ub_vnum_u32 (p0, x0, -8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1ub_vnum_u32_m9: ++** decw x0, all, mul #9 ++** ld1b z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1ub_vnum_u32_m9, svuint32_t, uint8_t, ++ z0 = svld1ub_vnum_u32 (p0, x0, -9), ++ z0 = svld1ub_vnum_u32 (p0, x0, -9)) ++ ++/* ++** ld1ub_vnum_u32_x1: ++** cntw (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ld1b z0\.s, p0/z, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** ld1b z0\.s, p0/z, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_LOAD (ld1ub_vnum_u32_x1, svuint32_t, uint8_t, ++ z0 = svld1ub_vnum_u32 (p0, x0, x1), ++ z0 = svld1ub_vnum_u32 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_u64.c +new file mode 100644 +index 000000000..46d7250f0 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_u64.c +@@ -0,0 +1,162 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1ub_u64_base: ++** ld1b z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1ub_u64_base, svuint64_t, uint8_t, ++ z0 = svld1ub_u64 (p0, x0), ++ z0 = svld1ub_u64 (p0, x0)) ++ ++/* ++** ld1ub_u64_index: ++** ld1b z0\.d, p0/z, \[x0, x1\] ++** ret ++*/ ++TEST_LOAD (ld1ub_u64_index, svuint64_t, uint8_t, ++ z0 = svld1ub_u64 (p0, x0 + x1), ++ z0 = svld1ub_u64 (p0, x0 + x1)) ++ ++/* ++** ld1ub_u64_1: ++** ld1b z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1ub_u64_1, svuint64_t, uint8_t, ++ z0 = svld1ub_u64 (p0, x0 + svcntd ()), ++ z0 = svld1ub_u64 (p0, x0 + svcntd ())) ++ ++/* ++** ld1ub_u64_7: ++** ld1b z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1ub_u64_7, svuint64_t, uint8_t, ++ z0 = svld1ub_u64 (p0, x0 + svcntd () * 7), ++ z0 = svld1ub_u64 (p0, x0 + svcntd () * 7)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld1ub_u64_8: ++** incb x0 ++** ld1b z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1ub_u64_8, svuint64_t, uint8_t, ++ z0 = svld1ub_u64 (p0, x0 + svcntd () * 8), ++ z0 = svld1ub_u64 (p0, x0 + svcntd () * 8)) ++ ++/* ++** ld1ub_u64_m1: ++** ld1b z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1ub_u64_m1, svuint64_t, uint8_t, ++ z0 = svld1ub_u64 (p0, x0 - svcntd ()), ++ z0 = svld1ub_u64 (p0, x0 - svcntd ())) ++ ++/* ++** ld1ub_u64_m8: ++** ld1b z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1ub_u64_m8, svuint64_t, uint8_t, ++ z0 = svld1ub_u64 (p0, x0 - svcntd () * 8), ++ z0 = svld1ub_u64 (p0, x0 - svcntd () * 8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1ub_u64_m9: ++** decd x0, all, mul #9 ++** ld1b z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1ub_u64_m9, svuint64_t, uint8_t, ++ z0 = svld1ub_u64 (p0, x0 - svcntd () * 9), ++ z0 = svld1ub_u64 (p0, x0 - svcntd () * 9)) ++ ++/* ++** ld1ub_vnum_u64_0: ++** ld1b z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1ub_vnum_u64_0, svuint64_t, uint8_t, ++ z0 = svld1ub_vnum_u64 (p0, x0, 0), ++ z0 = svld1ub_vnum_u64 (p0, x0, 0)) ++ ++/* ++** ld1ub_vnum_u64_1: ++** ld1b z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1ub_vnum_u64_1, svuint64_t, uint8_t, ++ z0 = svld1ub_vnum_u64 (p0, x0, 1), ++ z0 = svld1ub_vnum_u64 (p0, x0, 1)) ++ ++/* ++** ld1ub_vnum_u64_7: ++** ld1b z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1ub_vnum_u64_7, svuint64_t, uint8_t, ++ z0 = svld1ub_vnum_u64 (p0, x0, 7), ++ z0 = svld1ub_vnum_u64 (p0, x0, 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1ub_vnum_u64_8: ++** incb x0 ++** ld1b z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1ub_vnum_u64_8, svuint64_t, uint8_t, ++ z0 = svld1ub_vnum_u64 (p0, x0, 8), ++ z0 = svld1ub_vnum_u64 (p0, x0, 8)) ++ ++/* ++** ld1ub_vnum_u64_m1: ++** ld1b z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1ub_vnum_u64_m1, svuint64_t, uint8_t, ++ z0 = svld1ub_vnum_u64 (p0, x0, -1), ++ z0 = svld1ub_vnum_u64 (p0, x0, -1)) ++ ++/* ++** ld1ub_vnum_u64_m8: ++** ld1b z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1ub_vnum_u64_m8, svuint64_t, uint8_t, ++ z0 = svld1ub_vnum_u64 (p0, x0, -8), ++ z0 = svld1ub_vnum_u64 (p0, x0, -8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1ub_vnum_u64_m9: ++** decd x0, all, mul #9 ++** ld1b z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1ub_vnum_u64_m9, svuint64_t, uint8_t, ++ z0 = svld1ub_vnum_u64 (p0, x0, -9), ++ z0 = svld1ub_vnum_u64 (p0, x0, -9)) ++ ++/* ++** ld1ub_vnum_u64_x1: ++** cntd (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ld1b z0\.d, p0/z, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** ld1b z0\.d, p0/z, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_LOAD (ld1ub_vnum_u64_x1, svuint64_t, uint8_t, ++ z0 = svld1ub_vnum_u64 (p0, x0, x1), ++ z0 = svld1ub_vnum_u64 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uh_gather_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uh_gather_s32.c +new file mode 100644 +index 000000000..84fb5c335 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uh_gather_s32.c +@@ -0,0 +1,252 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1uh_gather_s32_tied1: ++** ld1h z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_s32_tied1, svint32_t, svuint32_t, ++ z0_res = svld1uh_gather_u32base_s32 (p0, z0), ++ z0_res = svld1uh_gather_s32 (p0, z0)) ++ ++/* ++** ld1uh_gather_s32_untied: ++** ld1h z0\.s, p0/z, \[z1\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_s32_untied, svint32_t, svuint32_t, ++ z0_res = svld1uh_gather_u32base_s32 (p0, z1), ++ z0_res = svld1uh_gather_s32 (p0, z1)) ++ ++/* ++** ld1uh_gather_x0_s32_offset: ++** ld1h z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_x0_s32_offset, svint32_t, svuint32_t, ++ z0_res = svld1uh_gather_u32base_offset_s32 (p0, z0, x0), ++ z0_res = svld1uh_gather_offset_s32 (p0, z0, x0)) ++ ++/* ++** ld1uh_gather_m2_s32_offset: ++** mov (x[0-9]+), #?-2 ++** ld1h z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_m2_s32_offset, svint32_t, svuint32_t, ++ z0_res = svld1uh_gather_u32base_offset_s32 (p0, z0, -2), ++ z0_res = svld1uh_gather_offset_s32 (p0, z0, -2)) ++ ++/* ++** ld1uh_gather_0_s32_offset: ++** ld1h z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_0_s32_offset, svint32_t, svuint32_t, ++ z0_res = svld1uh_gather_u32base_offset_s32 (p0, z0, 0), ++ z0_res = svld1uh_gather_offset_s32 (p0, z0, 0)) ++ ++/* ++** ld1uh_gather_5_s32_offset: ++** mov (x[0-9]+), #?5 ++** ld1h z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_5_s32_offset, svint32_t, svuint32_t, ++ z0_res = svld1uh_gather_u32base_offset_s32 (p0, z0, 5), ++ z0_res = svld1uh_gather_offset_s32 (p0, z0, 5)) ++ ++/* ++** ld1uh_gather_6_s32_offset: ++** ld1h z0\.s, p0/z, \[z0\.s, #6\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_6_s32_offset, svint32_t, svuint32_t, ++ z0_res = svld1uh_gather_u32base_offset_s32 (p0, z0, 6), ++ z0_res = svld1uh_gather_offset_s32 (p0, z0, 6)) ++ ++/* ++** ld1uh_gather_62_s32_offset: ++** ld1h z0\.s, p0/z, \[z0\.s, #62\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_62_s32_offset, svint32_t, svuint32_t, ++ z0_res = svld1uh_gather_u32base_offset_s32 (p0, z0, 62), ++ z0_res = svld1uh_gather_offset_s32 (p0, z0, 62)) ++ ++/* ++** ld1uh_gather_64_s32_offset: ++** mov (x[0-9]+), #?64 ++** ld1h z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_64_s32_offset, svint32_t, svuint32_t, ++ z0_res = svld1uh_gather_u32base_offset_s32 (p0, z0, 64), ++ z0_res = svld1uh_gather_offset_s32 (p0, z0, 64)) ++ ++/* ++** ld1uh_gather_x0_s32_index: ++** lsl (x[0-9]+), x0, #?1 ++** ld1h z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_x0_s32_index, svint32_t, svuint32_t, ++ z0_res = svld1uh_gather_u32base_index_s32 (p0, z0, x0), ++ z0_res = svld1uh_gather_index_s32 (p0, z0, x0)) ++ ++/* ++** ld1uh_gather_m1_s32_index: ++** mov (x[0-9]+), #?-2 ++** ld1h z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_m1_s32_index, svint32_t, svuint32_t, ++ z0_res = svld1uh_gather_u32base_index_s32 (p0, z0, -1), ++ z0_res = svld1uh_gather_index_s32 (p0, z0, -1)) ++ ++/* ++** ld1uh_gather_0_s32_index: ++** ld1h z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_0_s32_index, svint32_t, svuint32_t, ++ z0_res = svld1uh_gather_u32base_index_s32 (p0, z0, 0), ++ z0_res = svld1uh_gather_index_s32 (p0, z0, 0)) ++ ++/* ++** ld1uh_gather_5_s32_index: ++** ld1h z0\.s, p0/z, \[z0\.s, #10\] ++** ret ++*/ 
++TEST_LOAD_GATHER_ZS (ld1uh_gather_5_s32_index, svint32_t, svuint32_t, ++ z0_res = svld1uh_gather_u32base_index_s32 (p0, z0, 5), ++ z0_res = svld1uh_gather_index_s32 (p0, z0, 5)) ++ ++/* ++** ld1uh_gather_31_s32_index: ++** ld1h z0\.s, p0/z, \[z0\.s, #62\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_31_s32_index, svint32_t, svuint32_t, ++ z0_res = svld1uh_gather_u32base_index_s32 (p0, z0, 31), ++ z0_res = svld1uh_gather_index_s32 (p0, z0, 31)) ++ ++/* ++** ld1uh_gather_32_s32_index: ++** mov (x[0-9]+), #?64 ++** ld1h z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_32_s32_index, svint32_t, svuint32_t, ++ z0_res = svld1uh_gather_u32base_index_s32 (p0, z0, 32), ++ z0_res = svld1uh_gather_index_s32 (p0, z0, 32)) ++ ++/* ++** ld1uh_gather_x0_s32_s32offset: ++** ld1h z0\.s, p0/z, \[x0, z0\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_x0_s32_s32offset, svint32_t, uint16_t, svint32_t, ++ z0_res = svld1uh_gather_s32offset_s32 (p0, x0, z0), ++ z0_res = svld1uh_gather_offset_s32 (p0, x0, z0)) ++ ++/* ++** ld1uh_gather_tied1_s32_s32offset: ++** ld1h z0\.s, p0/z, \[x0, z0\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_tied1_s32_s32offset, svint32_t, uint16_t, svint32_t, ++ z0_res = svld1uh_gather_s32offset_s32 (p0, x0, z0), ++ z0_res = svld1uh_gather_offset_s32 (p0, x0, z0)) ++ ++/* ++** ld1uh_gather_untied_s32_s32offset: ++** ld1h z0\.s, p0/z, \[x0, z1\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_untied_s32_s32offset, svint32_t, uint16_t, svint32_t, ++ z0_res = svld1uh_gather_s32offset_s32 (p0, x0, z1), ++ z0_res = svld1uh_gather_offset_s32 (p0, x0, z1)) ++ ++/* ++** ld1uh_gather_x0_s32_u32offset: ++** ld1h z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_x0_s32_u32offset, svint32_t, uint16_t, svuint32_t, ++ z0_res = svld1uh_gather_u32offset_s32 (p0, x0, z0), ++ z0_res = svld1uh_gather_offset_s32 (p0, x0, z0)) ++ ++/* ++** ld1uh_gather_tied1_s32_u32offset: ++** ld1h z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_tied1_s32_u32offset, svint32_t, uint16_t, svuint32_t, ++ z0_res = svld1uh_gather_u32offset_s32 (p0, x0, z0), ++ z0_res = svld1uh_gather_offset_s32 (p0, x0, z0)) ++ ++/* ++** ld1uh_gather_untied_s32_u32offset: ++** ld1h z0\.s, p0/z, \[x0, z1\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_untied_s32_u32offset, svint32_t, uint16_t, svuint32_t, ++ z0_res = svld1uh_gather_u32offset_s32 (p0, x0, z1), ++ z0_res = svld1uh_gather_offset_s32 (p0, x0, z1)) ++ ++/* ++** ld1uh_gather_x0_s32_s32index: ++** ld1h z0\.s, p0/z, \[x0, z0\.s, sxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_x0_s32_s32index, svint32_t, uint16_t, svint32_t, ++ z0_res = svld1uh_gather_s32index_s32 (p0, x0, z0), ++ z0_res = svld1uh_gather_index_s32 (p0, x0, z0)) ++ ++/* ++** ld1uh_gather_tied1_s32_s32index: ++** ld1h z0\.s, p0/z, \[x0, z0\.s, sxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_tied1_s32_s32index, svint32_t, uint16_t, svint32_t, ++ z0_res = svld1uh_gather_s32index_s32 (p0, x0, z0), ++ z0_res = svld1uh_gather_index_s32 (p0, x0, z0)) ++ ++/* ++** ld1uh_gather_untied_s32_s32index: ++** ld1h z0\.s, p0/z, \[x0, z1\.s, sxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_untied_s32_s32index, svint32_t, uint16_t, svint32_t, ++ z0_res = svld1uh_gather_s32index_s32 (p0, x0, z1), ++ z0_res = svld1uh_gather_index_s32 (p0, x0, z1)) ++ ++/* ++** ld1uh_gather_x0_s32_u32index: ++** ld1h z0\.s, p0/z, \[x0, z0\.s, uxtw 1\] ++** ret ++*/ 
++TEST_LOAD_GATHER_SZ (ld1uh_gather_x0_s32_u32index, svint32_t, uint16_t, svuint32_t, ++ z0_res = svld1uh_gather_u32index_s32 (p0, x0, z0), ++ z0_res = svld1uh_gather_index_s32 (p0, x0, z0)) ++ ++/* ++** ld1uh_gather_tied1_s32_u32index: ++** ld1h z0\.s, p0/z, \[x0, z0\.s, uxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_tied1_s32_u32index, svint32_t, uint16_t, svuint32_t, ++ z0_res = svld1uh_gather_u32index_s32 (p0, x0, z0), ++ z0_res = svld1uh_gather_index_s32 (p0, x0, z0)) ++ ++/* ++** ld1uh_gather_untied_s32_u32index: ++** ld1h z0\.s, p0/z, \[x0, z1\.s, uxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_untied_s32_u32index, svint32_t, uint16_t, svuint32_t, ++ z0_res = svld1uh_gather_u32index_s32 (p0, x0, z1), ++ z0_res = svld1uh_gather_index_s32 (p0, x0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uh_gather_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uh_gather_s64.c +new file mode 100644 +index 000000000..447001793 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uh_gather_s64.c +@@ -0,0 +1,288 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1uh_gather_s64_tied1: ++** ld1h z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_s64_tied1, svint64_t, svuint64_t, ++ z0_res = svld1uh_gather_u64base_s64 (p0, z0), ++ z0_res = svld1uh_gather_s64 (p0, z0)) ++ ++/* ++** ld1uh_gather_s64_untied: ++** ld1h z0\.d, p0/z, \[z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_s64_untied, svint64_t, svuint64_t, ++ z0_res = svld1uh_gather_u64base_s64 (p0, z1), ++ z0_res = svld1uh_gather_s64 (p0, z1)) ++ ++/* ++** ld1uh_gather_x0_s64_offset: ++** ld1h z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_x0_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1uh_gather_u64base_offset_s64 (p0, z0, x0), ++ z0_res = svld1uh_gather_offset_s64 (p0, z0, x0)) ++ ++/* ++** ld1uh_gather_m2_s64_offset: ++** mov (x[0-9]+), #?-2 ++** ld1h z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_m2_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1uh_gather_u64base_offset_s64 (p0, z0, -2), ++ z0_res = svld1uh_gather_offset_s64 (p0, z0, -2)) ++ ++/* ++** ld1uh_gather_0_s64_offset: ++** ld1h z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_0_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1uh_gather_u64base_offset_s64 (p0, z0, 0), ++ z0_res = svld1uh_gather_offset_s64 (p0, z0, 0)) ++ ++/* ++** ld1uh_gather_5_s64_offset: ++** mov (x[0-9]+), #?5 ++** ld1h z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_5_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1uh_gather_u64base_offset_s64 (p0, z0, 5), ++ z0_res = svld1uh_gather_offset_s64 (p0, z0, 5)) ++ ++/* ++** ld1uh_gather_6_s64_offset: ++** ld1h z0\.d, p0/z, \[z0\.d, #6\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_6_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1uh_gather_u64base_offset_s64 (p0, z0, 6), ++ z0_res = svld1uh_gather_offset_s64 (p0, z0, 6)) ++ ++/* ++** ld1uh_gather_62_s64_offset: ++** ld1h z0\.d, p0/z, \[z0\.d, #62\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_62_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1uh_gather_u64base_offset_s64 (p0, z0, 62), ++ z0_res = svld1uh_gather_offset_s64 (p0, z0, 62)) ++ ++/* ++** ld1uh_gather_64_s64_offset: ++** mov (x[0-9]+), #?64 ++** ld1h z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS 
(ld1uh_gather_64_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1uh_gather_u64base_offset_s64 (p0, z0, 64), ++ z0_res = svld1uh_gather_offset_s64 (p0, z0, 64)) ++ ++/* ++** ld1uh_gather_x0_s64_index: ++** lsl (x[0-9]+), x0, #?1 ++** ld1h z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_x0_s64_index, svint64_t, svuint64_t, ++ z0_res = svld1uh_gather_u64base_index_s64 (p0, z0, x0), ++ z0_res = svld1uh_gather_index_s64 (p0, z0, x0)) ++ ++/* ++** ld1uh_gather_m1_s64_index: ++** mov (x[0-9]+), #?-2 ++** ld1h z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_m1_s64_index, svint64_t, svuint64_t, ++ z0_res = svld1uh_gather_u64base_index_s64 (p0, z0, -1), ++ z0_res = svld1uh_gather_index_s64 (p0, z0, -1)) ++ ++/* ++** ld1uh_gather_0_s64_index: ++** ld1h z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_0_s64_index, svint64_t, svuint64_t, ++ z0_res = svld1uh_gather_u64base_index_s64 (p0, z0, 0), ++ z0_res = svld1uh_gather_index_s64 (p0, z0, 0)) ++ ++/* ++** ld1uh_gather_5_s64_index: ++** ld1h z0\.d, p0/z, \[z0\.d, #10\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_5_s64_index, svint64_t, svuint64_t, ++ z0_res = svld1uh_gather_u64base_index_s64 (p0, z0, 5), ++ z0_res = svld1uh_gather_index_s64 (p0, z0, 5)) ++ ++/* ++** ld1uh_gather_31_s64_index: ++** ld1h z0\.d, p0/z, \[z0\.d, #62\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_31_s64_index, svint64_t, svuint64_t, ++ z0_res = svld1uh_gather_u64base_index_s64 (p0, z0, 31), ++ z0_res = svld1uh_gather_index_s64 (p0, z0, 31)) ++ ++/* ++** ld1uh_gather_32_s64_index: ++** mov (x[0-9]+), #?64 ++** ld1h z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_32_s64_index, svint64_t, svuint64_t, ++ z0_res = svld1uh_gather_u64base_index_s64 (p0, z0, 32), ++ z0_res = svld1uh_gather_index_s64 (p0, z0, 32)) ++ ++/* ++** ld1uh_gather_x0_s64_s64offset: ++** ld1h z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_x0_s64_s64offset, svint64_t, uint16_t, svint64_t, ++ z0_res = svld1uh_gather_s64offset_s64 (p0, x0, z0), ++ z0_res = svld1uh_gather_offset_s64 (p0, x0, z0)) ++ ++/* ++** ld1uh_gather_tied1_s64_s64offset: ++** ld1h z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_tied1_s64_s64offset, svint64_t, uint16_t, svint64_t, ++ z0_res = svld1uh_gather_s64offset_s64 (p0, x0, z0), ++ z0_res = svld1uh_gather_offset_s64 (p0, x0, z0)) ++ ++/* ++** ld1uh_gather_untied_s64_s64offset: ++** ld1h z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_untied_s64_s64offset, svint64_t, uint16_t, svint64_t, ++ z0_res = svld1uh_gather_s64offset_s64 (p0, x0, z1), ++ z0_res = svld1uh_gather_offset_s64 (p0, x0, z1)) ++ ++/* ++** ld1uh_gather_ext_s64_s64offset: ++** ld1h z0\.d, p0/z, \[x0, z1\.d, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_ext_s64_s64offset, svint64_t, uint16_t, svint64_t, ++ z0_res = svld1uh_gather_s64offset_s64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svld1uh_gather_offset_s64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ld1uh_gather_x0_s64_u64offset: ++** ld1h z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_x0_s64_u64offset, svint64_t, uint16_t, svuint64_t, ++ z0_res = svld1uh_gather_u64offset_s64 (p0, x0, z0), ++ z0_res = svld1uh_gather_offset_s64 (p0, x0, z0)) ++ ++/* ++** ld1uh_gather_tied1_s64_u64offset: ++** ld1h z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_tied1_s64_u64offset, svint64_t, uint16_t, 
svuint64_t, ++ z0_res = svld1uh_gather_u64offset_s64 (p0, x0, z0), ++ z0_res = svld1uh_gather_offset_s64 (p0, x0, z0)) ++ ++/* ++** ld1uh_gather_untied_s64_u64offset: ++** ld1h z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_untied_s64_u64offset, svint64_t, uint16_t, svuint64_t, ++ z0_res = svld1uh_gather_u64offset_s64 (p0, x0, z1), ++ z0_res = svld1uh_gather_offset_s64 (p0, x0, z1)) ++ ++/* ++** ld1uh_gather_ext_s64_u64offset: ++** ld1h z0\.d, p0/z, \[x0, z1\.d, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_ext_s64_u64offset, svint64_t, uint16_t, svuint64_t, ++ z0_res = svld1uh_gather_u64offset_s64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svld1uh_gather_offset_s64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ld1uh_gather_x0_s64_s64index: ++** ld1h z0\.d, p0/z, \[x0, z0\.d, lsl 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_x0_s64_s64index, svint64_t, uint16_t, svint64_t, ++ z0_res = svld1uh_gather_s64index_s64 (p0, x0, z0), ++ z0_res = svld1uh_gather_index_s64 (p0, x0, z0)) ++ ++/* ++** ld1uh_gather_tied1_s64_s64index: ++** ld1h z0\.d, p0/z, \[x0, z0\.d, lsl 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_tied1_s64_s64index, svint64_t, uint16_t, svint64_t, ++ z0_res = svld1uh_gather_s64index_s64 (p0, x0, z0), ++ z0_res = svld1uh_gather_index_s64 (p0, x0, z0)) ++ ++/* ++** ld1uh_gather_untied_s64_s64index: ++** ld1h z0\.d, p0/z, \[x0, z1\.d, lsl 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_untied_s64_s64index, svint64_t, uint16_t, svint64_t, ++ z0_res = svld1uh_gather_s64index_s64 (p0, x0, z1), ++ z0_res = svld1uh_gather_index_s64 (p0, x0, z1)) ++ ++/* ++** ld1uh_gather_ext_s64_s64index: ++** ld1h z0\.d, p0/z, \[x0, z1\.d, sxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_ext_s64_s64index, svint64_t, uint16_t, svint64_t, ++ z0_res = svld1uh_gather_s64index_s64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svld1uh_gather_index_s64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ld1uh_gather_x0_s64_u64index: ++** ld1h z0\.d, p0/z, \[x0, z0\.d, lsl 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_x0_s64_u64index, svint64_t, uint16_t, svuint64_t, ++ z0_res = svld1uh_gather_u64index_s64 (p0, x0, z0), ++ z0_res = svld1uh_gather_index_s64 (p0, x0, z0)) ++ ++/* ++** ld1uh_gather_tied1_s64_u64index: ++** ld1h z0\.d, p0/z, \[x0, z0\.d, lsl 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_tied1_s64_u64index, svint64_t, uint16_t, svuint64_t, ++ z0_res = svld1uh_gather_u64index_s64 (p0, x0, z0), ++ z0_res = svld1uh_gather_index_s64 (p0, x0, z0)) ++ ++/* ++** ld1uh_gather_untied_s64_u64index: ++** ld1h z0\.d, p0/z, \[x0, z1\.d, lsl 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_untied_s64_u64index, svint64_t, uint16_t, svuint64_t, ++ z0_res = svld1uh_gather_u64index_s64 (p0, x0, z1), ++ z0_res = svld1uh_gather_index_s64 (p0, x0, z1)) ++ ++/* ++** ld1uh_gather_ext_s64_u64index: ++** ld1h z0\.d, p0/z, \[x0, z1\.d, uxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_ext_s64_u64index, svint64_t, uint16_t, svuint64_t, ++ z0_res = svld1uh_gather_u64index_s64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svld1uh_gather_index_s64 (p0, x0, svextw_x (p0, z1))) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uh_gather_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uh_gather_u32.c +new file mode 100644 +index 000000000..09d3cc8c2 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uh_gather_u32.c +@@ -0,0 +1,252 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target 
{ ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1uh_gather_u32_tied1: ++** ld1h z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_u32_tied1, svuint32_t, svuint32_t, ++ z0_res = svld1uh_gather_u32base_u32 (p0, z0), ++ z0_res = svld1uh_gather_u32 (p0, z0)) ++ ++/* ++** ld1uh_gather_u32_untied: ++** ld1h z0\.s, p0/z, \[z1\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_u32_untied, svuint32_t, svuint32_t, ++ z0_res = svld1uh_gather_u32base_u32 (p0, z1), ++ z0_res = svld1uh_gather_u32 (p0, z1)) ++ ++/* ++** ld1uh_gather_x0_u32_offset: ++** ld1h z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_x0_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svld1uh_gather_u32base_offset_u32 (p0, z0, x0), ++ z0_res = svld1uh_gather_offset_u32 (p0, z0, x0)) ++ ++/* ++** ld1uh_gather_m2_u32_offset: ++** mov (x[0-9]+), #?-2 ++** ld1h z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_m2_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svld1uh_gather_u32base_offset_u32 (p0, z0, -2), ++ z0_res = svld1uh_gather_offset_u32 (p0, z0, -2)) ++ ++/* ++** ld1uh_gather_0_u32_offset: ++** ld1h z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_0_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svld1uh_gather_u32base_offset_u32 (p0, z0, 0), ++ z0_res = svld1uh_gather_offset_u32 (p0, z0, 0)) ++ ++/* ++** ld1uh_gather_5_u32_offset: ++** mov (x[0-9]+), #?5 ++** ld1h z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_5_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svld1uh_gather_u32base_offset_u32 (p0, z0, 5), ++ z0_res = svld1uh_gather_offset_u32 (p0, z0, 5)) ++ ++/* ++** ld1uh_gather_6_u32_offset: ++** ld1h z0\.s, p0/z, \[z0\.s, #6\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_6_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svld1uh_gather_u32base_offset_u32 (p0, z0, 6), ++ z0_res = svld1uh_gather_offset_u32 (p0, z0, 6)) ++ ++/* ++** ld1uh_gather_62_u32_offset: ++** ld1h z0\.s, p0/z, \[z0\.s, #62\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_62_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svld1uh_gather_u32base_offset_u32 (p0, z0, 62), ++ z0_res = svld1uh_gather_offset_u32 (p0, z0, 62)) ++ ++/* ++** ld1uh_gather_64_u32_offset: ++** mov (x[0-9]+), #?64 ++** ld1h z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_64_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svld1uh_gather_u32base_offset_u32 (p0, z0, 64), ++ z0_res = svld1uh_gather_offset_u32 (p0, z0, 64)) ++ ++/* ++** ld1uh_gather_x0_u32_index: ++** lsl (x[0-9]+), x0, #?1 ++** ld1h z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_x0_u32_index, svuint32_t, svuint32_t, ++ z0_res = svld1uh_gather_u32base_index_u32 (p0, z0, x0), ++ z0_res = svld1uh_gather_index_u32 (p0, z0, x0)) ++ ++/* ++** ld1uh_gather_m1_u32_index: ++** mov (x[0-9]+), #?-2 ++** ld1h z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_m1_u32_index, svuint32_t, svuint32_t, ++ z0_res = svld1uh_gather_u32base_index_u32 (p0, z0, -1), ++ z0_res = svld1uh_gather_index_u32 (p0, z0, -1)) ++ ++/* ++** ld1uh_gather_0_u32_index: ++** ld1h z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_0_u32_index, svuint32_t, svuint32_t, ++ z0_res = svld1uh_gather_u32base_index_u32 (p0, z0, 0), ++ z0_res = svld1uh_gather_index_u32 (p0, z0, 0)) ++ ++/* ++** ld1uh_gather_5_u32_index: ++** ld1h z0\.s, p0/z, \[z0\.s, #10\] ++** ret ++*/ 
++TEST_LOAD_GATHER_ZS (ld1uh_gather_5_u32_index, svuint32_t, svuint32_t, ++ z0_res = svld1uh_gather_u32base_index_u32 (p0, z0, 5), ++ z0_res = svld1uh_gather_index_u32 (p0, z0, 5)) ++ ++/* ++** ld1uh_gather_31_u32_index: ++** ld1h z0\.s, p0/z, \[z0\.s, #62\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_31_u32_index, svuint32_t, svuint32_t, ++ z0_res = svld1uh_gather_u32base_index_u32 (p0, z0, 31), ++ z0_res = svld1uh_gather_index_u32 (p0, z0, 31)) ++ ++/* ++** ld1uh_gather_32_u32_index: ++** mov (x[0-9]+), #?64 ++** ld1h z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_32_u32_index, svuint32_t, svuint32_t, ++ z0_res = svld1uh_gather_u32base_index_u32 (p0, z0, 32), ++ z0_res = svld1uh_gather_index_u32 (p0, z0, 32)) ++ ++/* ++** ld1uh_gather_x0_u32_s32offset: ++** ld1h z0\.s, p0/z, \[x0, z0\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_x0_u32_s32offset, svuint32_t, uint16_t, svint32_t, ++ z0_res = svld1uh_gather_s32offset_u32 (p0, x0, z0), ++ z0_res = svld1uh_gather_offset_u32 (p0, x0, z0)) ++ ++/* ++** ld1uh_gather_tied1_u32_s32offset: ++** ld1h z0\.s, p0/z, \[x0, z0\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_tied1_u32_s32offset, svuint32_t, uint16_t, svint32_t, ++ z0_res = svld1uh_gather_s32offset_u32 (p0, x0, z0), ++ z0_res = svld1uh_gather_offset_u32 (p0, x0, z0)) ++ ++/* ++** ld1uh_gather_untied_u32_s32offset: ++** ld1h z0\.s, p0/z, \[x0, z1\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_untied_u32_s32offset, svuint32_t, uint16_t, svint32_t, ++ z0_res = svld1uh_gather_s32offset_u32 (p0, x0, z1), ++ z0_res = svld1uh_gather_offset_u32 (p0, x0, z1)) ++ ++/* ++** ld1uh_gather_x0_u32_u32offset: ++** ld1h z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_x0_u32_u32offset, svuint32_t, uint16_t, svuint32_t, ++ z0_res = svld1uh_gather_u32offset_u32 (p0, x0, z0), ++ z0_res = svld1uh_gather_offset_u32 (p0, x0, z0)) ++ ++/* ++** ld1uh_gather_tied1_u32_u32offset: ++** ld1h z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_tied1_u32_u32offset, svuint32_t, uint16_t, svuint32_t, ++ z0_res = svld1uh_gather_u32offset_u32 (p0, x0, z0), ++ z0_res = svld1uh_gather_offset_u32 (p0, x0, z0)) ++ ++/* ++** ld1uh_gather_untied_u32_u32offset: ++** ld1h z0\.s, p0/z, \[x0, z1\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_untied_u32_u32offset, svuint32_t, uint16_t, svuint32_t, ++ z0_res = svld1uh_gather_u32offset_u32 (p0, x0, z1), ++ z0_res = svld1uh_gather_offset_u32 (p0, x0, z1)) ++ ++/* ++** ld1uh_gather_x0_u32_s32index: ++** ld1h z0\.s, p0/z, \[x0, z0\.s, sxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_x0_u32_s32index, svuint32_t, uint16_t, svint32_t, ++ z0_res = svld1uh_gather_s32index_u32 (p0, x0, z0), ++ z0_res = svld1uh_gather_index_u32 (p0, x0, z0)) ++ ++/* ++** ld1uh_gather_tied1_u32_s32index: ++** ld1h z0\.s, p0/z, \[x0, z0\.s, sxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_tied1_u32_s32index, svuint32_t, uint16_t, svint32_t, ++ z0_res = svld1uh_gather_s32index_u32 (p0, x0, z0), ++ z0_res = svld1uh_gather_index_u32 (p0, x0, z0)) ++ ++/* ++** ld1uh_gather_untied_u32_s32index: ++** ld1h z0\.s, p0/z, \[x0, z1\.s, sxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_untied_u32_s32index, svuint32_t, uint16_t, svint32_t, ++ z0_res = svld1uh_gather_s32index_u32 (p0, x0, z1), ++ z0_res = svld1uh_gather_index_u32 (p0, x0, z1)) ++ ++/* ++** ld1uh_gather_x0_u32_u32index: ++** ld1h z0\.s, p0/z, \[x0, z0\.s, uxtw 1\] ++** ret 
++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_x0_u32_u32index, svuint32_t, uint16_t, svuint32_t, ++ z0_res = svld1uh_gather_u32index_u32 (p0, x0, z0), ++ z0_res = svld1uh_gather_index_u32 (p0, x0, z0)) ++ ++/* ++** ld1uh_gather_tied1_u32_u32index: ++** ld1h z0\.s, p0/z, \[x0, z0\.s, uxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_tied1_u32_u32index, svuint32_t, uint16_t, svuint32_t, ++ z0_res = svld1uh_gather_u32index_u32 (p0, x0, z0), ++ z0_res = svld1uh_gather_index_u32 (p0, x0, z0)) ++ ++/* ++** ld1uh_gather_untied_u32_u32index: ++** ld1h z0\.s, p0/z, \[x0, z1\.s, uxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_untied_u32_u32index, svuint32_t, uint16_t, svuint32_t, ++ z0_res = svld1uh_gather_u32index_u32 (p0, x0, z1), ++ z0_res = svld1uh_gather_index_u32 (p0, x0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uh_gather_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uh_gather_u64.c +new file mode 100644 +index 000000000..f3dcf03cd +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uh_gather_u64.c +@@ -0,0 +1,288 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1uh_gather_u64_tied1: ++** ld1h z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_u64_tied1, svuint64_t, svuint64_t, ++ z0_res = svld1uh_gather_u64base_u64 (p0, z0), ++ z0_res = svld1uh_gather_u64 (p0, z0)) ++ ++/* ++** ld1uh_gather_u64_untied: ++** ld1h z0\.d, p0/z, \[z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_u64_untied, svuint64_t, svuint64_t, ++ z0_res = svld1uh_gather_u64base_u64 (p0, z1), ++ z0_res = svld1uh_gather_u64 (p0, z1)) ++ ++/* ++** ld1uh_gather_x0_u64_offset: ++** ld1h z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_x0_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1uh_gather_u64base_offset_u64 (p0, z0, x0), ++ z0_res = svld1uh_gather_offset_u64 (p0, z0, x0)) ++ ++/* ++** ld1uh_gather_m2_u64_offset: ++** mov (x[0-9]+), #?-2 ++** ld1h z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_m2_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1uh_gather_u64base_offset_u64 (p0, z0, -2), ++ z0_res = svld1uh_gather_offset_u64 (p0, z0, -2)) ++ ++/* ++** ld1uh_gather_0_u64_offset: ++** ld1h z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_0_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1uh_gather_u64base_offset_u64 (p0, z0, 0), ++ z0_res = svld1uh_gather_offset_u64 (p0, z0, 0)) ++ ++/* ++** ld1uh_gather_5_u64_offset: ++** mov (x[0-9]+), #?5 ++** ld1h z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_5_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1uh_gather_u64base_offset_u64 (p0, z0, 5), ++ z0_res = svld1uh_gather_offset_u64 (p0, z0, 5)) ++ ++/* ++** ld1uh_gather_6_u64_offset: ++** ld1h z0\.d, p0/z, \[z0\.d, #6\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_6_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1uh_gather_u64base_offset_u64 (p0, z0, 6), ++ z0_res = svld1uh_gather_offset_u64 (p0, z0, 6)) ++ ++/* ++** ld1uh_gather_62_u64_offset: ++** ld1h z0\.d, p0/z, \[z0\.d, #62\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_62_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1uh_gather_u64base_offset_u64 (p0, z0, 62), ++ z0_res = svld1uh_gather_offset_u64 (p0, z0, 62)) ++ ++/* ++** ld1uh_gather_64_u64_offset: ++** mov (x[0-9]+), #?64 ++** ld1h z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ 
++TEST_LOAD_GATHER_ZS (ld1uh_gather_64_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1uh_gather_u64base_offset_u64 (p0, z0, 64), ++ z0_res = svld1uh_gather_offset_u64 (p0, z0, 64)) ++ ++/* ++** ld1uh_gather_x0_u64_index: ++** lsl (x[0-9]+), x0, #?1 ++** ld1h z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_x0_u64_index, svuint64_t, svuint64_t, ++ z0_res = svld1uh_gather_u64base_index_u64 (p0, z0, x0), ++ z0_res = svld1uh_gather_index_u64 (p0, z0, x0)) ++ ++/* ++** ld1uh_gather_m1_u64_index: ++** mov (x[0-9]+), #?-2 ++** ld1h z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_m1_u64_index, svuint64_t, svuint64_t, ++ z0_res = svld1uh_gather_u64base_index_u64 (p0, z0, -1), ++ z0_res = svld1uh_gather_index_u64 (p0, z0, -1)) ++ ++/* ++** ld1uh_gather_0_u64_index: ++** ld1h z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_0_u64_index, svuint64_t, svuint64_t, ++ z0_res = svld1uh_gather_u64base_index_u64 (p0, z0, 0), ++ z0_res = svld1uh_gather_index_u64 (p0, z0, 0)) ++ ++/* ++** ld1uh_gather_5_u64_index: ++** ld1h z0\.d, p0/z, \[z0\.d, #10\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_5_u64_index, svuint64_t, svuint64_t, ++ z0_res = svld1uh_gather_u64base_index_u64 (p0, z0, 5), ++ z0_res = svld1uh_gather_index_u64 (p0, z0, 5)) ++ ++/* ++** ld1uh_gather_31_u64_index: ++** ld1h z0\.d, p0/z, \[z0\.d, #62\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_31_u64_index, svuint64_t, svuint64_t, ++ z0_res = svld1uh_gather_u64base_index_u64 (p0, z0, 31), ++ z0_res = svld1uh_gather_index_u64 (p0, z0, 31)) ++ ++/* ++** ld1uh_gather_32_u64_index: ++** mov (x[0-9]+), #?64 ++** ld1h z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uh_gather_32_u64_index, svuint64_t, svuint64_t, ++ z0_res = svld1uh_gather_u64base_index_u64 (p0, z0, 32), ++ z0_res = svld1uh_gather_index_u64 (p0, z0, 32)) ++ ++/* ++** ld1uh_gather_x0_u64_s64offset: ++** ld1h z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_x0_u64_s64offset, svuint64_t, uint16_t, svint64_t, ++ z0_res = svld1uh_gather_s64offset_u64 (p0, x0, z0), ++ z0_res = svld1uh_gather_offset_u64 (p0, x0, z0)) ++ ++/* ++** ld1uh_gather_tied1_u64_s64offset: ++** ld1h z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_tied1_u64_s64offset, svuint64_t, uint16_t, svint64_t, ++ z0_res = svld1uh_gather_s64offset_u64 (p0, x0, z0), ++ z0_res = svld1uh_gather_offset_u64 (p0, x0, z0)) ++ ++/* ++** ld1uh_gather_untied_u64_s64offset: ++** ld1h z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_untied_u64_s64offset, svuint64_t, uint16_t, svint64_t, ++ z0_res = svld1uh_gather_s64offset_u64 (p0, x0, z1), ++ z0_res = svld1uh_gather_offset_u64 (p0, x0, z1)) ++ ++/* ++** ld1uh_gather_ext_u64_s64offset: ++** ld1h z0\.d, p0/z, \[x0, z1\.d, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_ext_u64_s64offset, svuint64_t, uint16_t, svint64_t, ++ z0_res = svld1uh_gather_s64offset_u64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svld1uh_gather_offset_u64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ld1uh_gather_x0_u64_u64offset: ++** ld1h z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_x0_u64_u64offset, svuint64_t, uint16_t, svuint64_t, ++ z0_res = svld1uh_gather_u64offset_u64 (p0, x0, z0), ++ z0_res = svld1uh_gather_offset_u64 (p0, x0, z0)) ++ ++/* ++** ld1uh_gather_tied1_u64_u64offset: ++** ld1h z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ 
(ld1uh_gather_tied1_u64_u64offset, svuint64_t, uint16_t, svuint64_t, ++ z0_res = svld1uh_gather_u64offset_u64 (p0, x0, z0), ++ z0_res = svld1uh_gather_offset_u64 (p0, x0, z0)) ++ ++/* ++** ld1uh_gather_untied_u64_u64offset: ++** ld1h z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_untied_u64_u64offset, svuint64_t, uint16_t, svuint64_t, ++ z0_res = svld1uh_gather_u64offset_u64 (p0, x0, z1), ++ z0_res = svld1uh_gather_offset_u64 (p0, x0, z1)) ++ ++/* ++** ld1uh_gather_ext_u64_u64offset: ++** ld1h z0\.d, p0/z, \[x0, z1\.d, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_ext_u64_u64offset, svuint64_t, uint16_t, svuint64_t, ++ z0_res = svld1uh_gather_u64offset_u64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svld1uh_gather_offset_u64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ld1uh_gather_x0_u64_s64index: ++** ld1h z0\.d, p0/z, \[x0, z0\.d, lsl 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_x0_u64_s64index, svuint64_t, uint16_t, svint64_t, ++ z0_res = svld1uh_gather_s64index_u64 (p0, x0, z0), ++ z0_res = svld1uh_gather_index_u64 (p0, x0, z0)) ++ ++/* ++** ld1uh_gather_tied1_u64_s64index: ++** ld1h z0\.d, p0/z, \[x0, z0\.d, lsl 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_tied1_u64_s64index, svuint64_t, uint16_t, svint64_t, ++ z0_res = svld1uh_gather_s64index_u64 (p0, x0, z0), ++ z0_res = svld1uh_gather_index_u64 (p0, x0, z0)) ++ ++/* ++** ld1uh_gather_untied_u64_s64index: ++** ld1h z0\.d, p0/z, \[x0, z1\.d, lsl 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_untied_u64_s64index, svuint64_t, uint16_t, svint64_t, ++ z0_res = svld1uh_gather_s64index_u64 (p0, x0, z1), ++ z0_res = svld1uh_gather_index_u64 (p0, x0, z1)) ++ ++/* ++** ld1uh_gather_ext_u64_s64index: ++** ld1h z0\.d, p0/z, \[x0, z1\.d, sxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_ext_u64_s64index, svuint64_t, uint16_t, svint64_t, ++ z0_res = svld1uh_gather_s64index_u64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svld1uh_gather_index_u64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ld1uh_gather_x0_u64_u64index: ++** ld1h z0\.d, p0/z, \[x0, z0\.d, lsl 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_x0_u64_u64index, svuint64_t, uint16_t, svuint64_t, ++ z0_res = svld1uh_gather_u64index_u64 (p0, x0, z0), ++ z0_res = svld1uh_gather_index_u64 (p0, x0, z0)) ++ ++/* ++** ld1uh_gather_tied1_u64_u64index: ++** ld1h z0\.d, p0/z, \[x0, z0\.d, lsl 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_tied1_u64_u64index, svuint64_t, uint16_t, svuint64_t, ++ z0_res = svld1uh_gather_u64index_u64 (p0, x0, z0), ++ z0_res = svld1uh_gather_index_u64 (p0, x0, z0)) ++ ++/* ++** ld1uh_gather_untied_u64_u64index: ++** ld1h z0\.d, p0/z, \[x0, z1\.d, lsl 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_untied_u64_u64index, svuint64_t, uint16_t, svuint64_t, ++ z0_res = svld1uh_gather_u64index_u64 (p0, x0, z1), ++ z0_res = svld1uh_gather_index_u64 (p0, x0, z1)) ++ ++/* ++** ld1uh_gather_ext_u64_u64index: ++** ld1h z0\.d, p0/z, \[x0, z1\.d, uxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uh_gather_ext_u64_u64index, svuint64_t, uint16_t, svuint64_t, ++ z0_res = svld1uh_gather_u64index_u64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svld1uh_gather_index_u64 (p0, x0, svextw_x (p0, z1))) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uh_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uh_s32.c +new file mode 100644 +index 000000000..df1ce974b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uh_s32.c +@@ -0,0 +1,158 @@ ++/* { dg-final { 
check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1uh_s32_base: ++** ld1h z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1uh_s32_base, svint32_t, uint16_t, ++ z0 = svld1uh_s32 (p0, x0), ++ z0 = svld1uh_s32 (p0, x0)) ++ ++/* ++** ld1uh_s32_index: ++** ld1h z0\.s, p0/z, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_LOAD (ld1uh_s32_index, svint32_t, uint16_t, ++ z0 = svld1uh_s32 (p0, x0 + x1), ++ z0 = svld1uh_s32 (p0, x0 + x1)) ++ ++/* ++** ld1uh_s32_1: ++** ld1h z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1uh_s32_1, svint32_t, uint16_t, ++ z0 = svld1uh_s32 (p0, x0 + svcntw ()), ++ z0 = svld1uh_s32 (p0, x0 + svcntw ())) ++ ++/* ++** ld1uh_s32_7: ++** ld1h z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1uh_s32_7, svint32_t, uint16_t, ++ z0 = svld1uh_s32 (p0, x0 + svcntw () * 7), ++ z0 = svld1uh_s32 (p0, x0 + svcntw () * 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1uh_s32_8: ++** incb x0, all, mul #4 ++** ld1h z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1uh_s32_8, svint32_t, uint16_t, ++ z0 = svld1uh_s32 (p0, x0 + svcntw () * 8), ++ z0 = svld1uh_s32 (p0, x0 + svcntw () * 8)) ++ ++/* ++** ld1uh_s32_m1: ++** ld1h z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1uh_s32_m1, svint32_t, uint16_t, ++ z0 = svld1uh_s32 (p0, x0 - svcntw ()), ++ z0 = svld1uh_s32 (p0, x0 - svcntw ())) ++ ++/* ++** ld1uh_s32_m8: ++** ld1h z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1uh_s32_m8, svint32_t, uint16_t, ++ z0 = svld1uh_s32 (p0, x0 - svcntw () * 8), ++ z0 = svld1uh_s32 (p0, x0 - svcntw () * 8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1uh_s32_m9: ++** dech x0, all, mul #9 ++** ld1h z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1uh_s32_m9, svint32_t, uint16_t, ++ z0 = svld1uh_s32 (p0, x0 - svcntw () * 9), ++ z0 = svld1uh_s32 (p0, x0 - svcntw () * 9)) ++ ++/* ++** ld1uh_vnum_s32_0: ++** ld1h z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1uh_vnum_s32_0, svint32_t, uint16_t, ++ z0 = svld1uh_vnum_s32 (p0, x0, 0), ++ z0 = svld1uh_vnum_s32 (p0, x0, 0)) ++ ++/* ++** ld1uh_vnum_s32_1: ++** ld1h z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1uh_vnum_s32_1, svint32_t, uint16_t, ++ z0 = svld1uh_vnum_s32 (p0, x0, 1), ++ z0 = svld1uh_vnum_s32 (p0, x0, 1)) ++ ++/* ++** ld1uh_vnum_s32_7: ++** ld1h z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1uh_vnum_s32_7, svint32_t, uint16_t, ++ z0 = svld1uh_vnum_s32 (p0, x0, 7), ++ z0 = svld1uh_vnum_s32 (p0, x0, 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1uh_vnum_s32_8: ++** incb x0, all, mul #4 ++** ld1h z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1uh_vnum_s32_8, svint32_t, uint16_t, ++ z0 = svld1uh_vnum_s32 (p0, x0, 8), ++ z0 = svld1uh_vnum_s32 (p0, x0, 8)) ++ ++/* ++** ld1uh_vnum_s32_m1: ++** ld1h z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1uh_vnum_s32_m1, svint32_t, uint16_t, ++ z0 = svld1uh_vnum_s32 (p0, x0, -1), ++ z0 = svld1uh_vnum_s32 (p0, x0, -1)) ++ ++/* ++** ld1uh_vnum_s32_m8: ++** ld1h z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1uh_vnum_s32_m8, svint32_t, uint16_t, ++ z0 = svld1uh_vnum_s32 (p0, x0, -8), ++ z0 = svld1uh_vnum_s32 (p0, x0, -8)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld1uh_vnum_s32_m9: ++** dech x0, all, mul #9 ++** ld1h z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1uh_vnum_s32_m9, svint32_t, uint16_t, ++ z0 = svld1uh_vnum_s32 (p0, x0, -9), ++ z0 = svld1uh_vnum_s32 (p0, x0, -9)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ld1uh_vnum_s32_x1: ++** cnth (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld1h z0\.s, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld1uh_vnum_s32_x1, svint32_t, uint16_t, ++ z0 = svld1uh_vnum_s32 (p0, x0, x1), ++ z0 = svld1uh_vnum_s32 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uh_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uh_s64.c +new file mode 100644 +index 000000000..7c3ab0aee +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uh_s64.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1uh_s64_base: ++** ld1h z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1uh_s64_base, svint64_t, uint16_t, ++ z0 = svld1uh_s64 (p0, x0), ++ z0 = svld1uh_s64 (p0, x0)) ++ ++/* ++** ld1uh_s64_index: ++** ld1h z0\.d, p0/z, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_LOAD (ld1uh_s64_index, svint64_t, uint16_t, ++ z0 = svld1uh_s64 (p0, x0 + x1), ++ z0 = svld1uh_s64 (p0, x0 + x1)) ++ ++/* ++** ld1uh_s64_1: ++** ld1h z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1uh_s64_1, svint64_t, uint16_t, ++ z0 = svld1uh_s64 (p0, x0 + svcntd ()), ++ z0 = svld1uh_s64 (p0, x0 + svcntd ())) ++ ++/* ++** ld1uh_s64_7: ++** ld1h z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1uh_s64_7, svint64_t, uint16_t, ++ z0 = svld1uh_s64 (p0, x0 + svcntd () * 7), ++ z0 = svld1uh_s64 (p0, x0 + svcntd () * 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1uh_s64_8: ++** incb x0, all, mul #2 ++** ld1h z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1uh_s64_8, svint64_t, uint16_t, ++ z0 = svld1uh_s64 (p0, x0 + svcntd () * 8), ++ z0 = svld1uh_s64 (p0, x0 + svcntd () * 8)) ++ ++/* ++** ld1uh_s64_m1: ++** ld1h z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1uh_s64_m1, svint64_t, uint16_t, ++ z0 = svld1uh_s64 (p0, x0 - svcntd ()), ++ z0 = svld1uh_s64 (p0, x0 - svcntd ())) ++ ++/* ++** ld1uh_s64_m8: ++** ld1h z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1uh_s64_m8, svint64_t, uint16_t, ++ z0 = svld1uh_s64 (p0, x0 - svcntd () * 8), ++ z0 = svld1uh_s64 (p0, x0 - svcntd () * 8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1uh_s64_m9: ++** decw x0, all, mul #9 ++** ld1h z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1uh_s64_m9, svint64_t, uint16_t, ++ z0 = svld1uh_s64 (p0, x0 - svcntd () * 9), ++ z0 = svld1uh_s64 (p0, x0 - svcntd () * 9)) ++ ++/* ++** ld1uh_vnum_s64_0: ++** ld1h z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1uh_vnum_s64_0, svint64_t, uint16_t, ++ z0 = svld1uh_vnum_s64 (p0, x0, 0), ++ z0 = svld1uh_vnum_s64 (p0, x0, 0)) ++ ++/* ++** ld1uh_vnum_s64_1: ++** ld1h z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1uh_vnum_s64_1, svint64_t, uint16_t, ++ z0 = svld1uh_vnum_s64 (p0, x0, 1), ++ z0 = svld1uh_vnum_s64 (p0, x0, 1)) ++ ++/* ++** ld1uh_vnum_s64_7: ++** ld1h z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1uh_vnum_s64_7, svint64_t, uint16_t, ++ z0 = svld1uh_vnum_s64 (p0, x0, 7), ++ z0 = svld1uh_vnum_s64 (p0, x0, 7)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld1uh_vnum_s64_8: ++** incb x0, all, mul #2 ++** ld1h z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1uh_vnum_s64_8, svint64_t, uint16_t, ++ z0 = svld1uh_vnum_s64 (p0, x0, 8), ++ z0 = svld1uh_vnum_s64 (p0, x0, 8)) ++ ++/* ++** ld1uh_vnum_s64_m1: ++** ld1h z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1uh_vnum_s64_m1, svint64_t, uint16_t, ++ z0 = svld1uh_vnum_s64 (p0, x0, -1), ++ z0 = svld1uh_vnum_s64 (p0, x0, -1)) ++ ++/* ++** ld1uh_vnum_s64_m8: ++** ld1h z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1uh_vnum_s64_m8, svint64_t, uint16_t, ++ z0 = svld1uh_vnum_s64 (p0, x0, -8), ++ z0 = svld1uh_vnum_s64 (p0, x0, -8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1uh_vnum_s64_m9: ++** decw x0, all, mul #9 ++** ld1h z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1uh_vnum_s64_m9, svint64_t, uint16_t, ++ z0 = svld1uh_vnum_s64 (p0, x0, -9), ++ z0 = svld1uh_vnum_s64 (p0, x0, -9)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ld1uh_vnum_s64_x1: ++** cntw (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld1h z0\.d, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld1uh_vnum_s64_x1, svint64_t, uint16_t, ++ z0 = svld1uh_vnum_s64 (p0, x0, x1), ++ z0 = svld1uh_vnum_s64 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uh_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uh_u32.c +new file mode 100644 +index 000000000..a07b19259 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uh_u32.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1uh_u32_base: ++** ld1h z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1uh_u32_base, svuint32_t, uint16_t, ++ z0 = svld1uh_u32 (p0, x0), ++ z0 = svld1uh_u32 (p0, x0)) ++ ++/* ++** ld1uh_u32_index: ++** ld1h z0\.s, p0/z, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_LOAD (ld1uh_u32_index, svuint32_t, uint16_t, ++ z0 = svld1uh_u32 (p0, x0 + x1), ++ z0 = svld1uh_u32 (p0, x0 + x1)) ++ ++/* ++** ld1uh_u32_1: ++** ld1h z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1uh_u32_1, svuint32_t, uint16_t, ++ z0 = svld1uh_u32 (p0, x0 + svcntw ()), ++ z0 = svld1uh_u32 (p0, x0 + svcntw ())) ++ ++/* ++** ld1uh_u32_7: ++** ld1h z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1uh_u32_7, svuint32_t, uint16_t, ++ z0 = svld1uh_u32 (p0, x0 + svcntw () * 7), ++ z0 = svld1uh_u32 (p0, x0 + svcntw () * 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1uh_u32_8: ++** incb x0, all, mul #4 ++** ld1h z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1uh_u32_8, svuint32_t, uint16_t, ++ z0 = svld1uh_u32 (p0, x0 + svcntw () * 8), ++ z0 = svld1uh_u32 (p0, x0 + svcntw () * 8)) ++ ++/* ++** ld1uh_u32_m1: ++** ld1h z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1uh_u32_m1, svuint32_t, uint16_t, ++ z0 = svld1uh_u32 (p0, x0 - svcntw ()), ++ z0 = svld1uh_u32 (p0, x0 - svcntw ())) ++ ++/* ++** ld1uh_u32_m8: ++** ld1h z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1uh_u32_m8, svuint32_t, uint16_t, ++ z0 = svld1uh_u32 (p0, x0 - svcntw () * 8), ++ z0 = svld1uh_u32 (p0, x0 - svcntw () * 8)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld1uh_u32_m9: ++** dech x0, all, mul #9 ++** ld1h z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1uh_u32_m9, svuint32_t, uint16_t, ++ z0 = svld1uh_u32 (p0, x0 - svcntw () * 9), ++ z0 = svld1uh_u32 (p0, x0 - svcntw () * 9)) ++ ++/* ++** ld1uh_vnum_u32_0: ++** ld1h z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1uh_vnum_u32_0, svuint32_t, uint16_t, ++ z0 = svld1uh_vnum_u32 (p0, x0, 0), ++ z0 = svld1uh_vnum_u32 (p0, x0, 0)) ++ ++/* ++** ld1uh_vnum_u32_1: ++** ld1h z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1uh_vnum_u32_1, svuint32_t, uint16_t, ++ z0 = svld1uh_vnum_u32 (p0, x0, 1), ++ z0 = svld1uh_vnum_u32 (p0, x0, 1)) ++ ++/* ++** ld1uh_vnum_u32_7: ++** ld1h z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1uh_vnum_u32_7, svuint32_t, uint16_t, ++ z0 = svld1uh_vnum_u32 (p0, x0, 7), ++ z0 = svld1uh_vnum_u32 (p0, x0, 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1uh_vnum_u32_8: ++** incb x0, all, mul #4 ++** ld1h z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1uh_vnum_u32_8, svuint32_t, uint16_t, ++ z0 = svld1uh_vnum_u32 (p0, x0, 8), ++ z0 = svld1uh_vnum_u32 (p0, x0, 8)) ++ ++/* ++** ld1uh_vnum_u32_m1: ++** ld1h z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1uh_vnum_u32_m1, svuint32_t, uint16_t, ++ z0 = svld1uh_vnum_u32 (p0, x0, -1), ++ z0 = svld1uh_vnum_u32 (p0, x0, -1)) ++ ++/* ++** ld1uh_vnum_u32_m8: ++** ld1h z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1uh_vnum_u32_m8, svuint32_t, uint16_t, ++ z0 = svld1uh_vnum_u32 (p0, x0, -8), ++ z0 = svld1uh_vnum_u32 (p0, x0, -8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1uh_vnum_u32_m9: ++** dech x0, all, mul #9 ++** ld1h z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1uh_vnum_u32_m9, svuint32_t, uint16_t, ++ z0 = svld1uh_vnum_u32 (p0, x0, -9), ++ z0 = svld1uh_vnum_u32 (p0, x0, -9)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ld1uh_vnum_u32_x1: ++** cnth (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld1h z0\.s, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld1uh_vnum_u32_x1, svuint32_t, uint16_t, ++ z0 = svld1uh_vnum_u32 (p0, x0, x1), ++ z0 = svld1uh_vnum_u32 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uh_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uh_u64.c +new file mode 100644 +index 000000000..79be01fbd +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uh_u64.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1uh_u64_base: ++** ld1h z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1uh_u64_base, svuint64_t, uint16_t, ++ z0 = svld1uh_u64 (p0, x0), ++ z0 = svld1uh_u64 (p0, x0)) ++ ++/* ++** ld1uh_u64_index: ++** ld1h z0\.d, p0/z, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_LOAD (ld1uh_u64_index, svuint64_t, uint16_t, ++ z0 = svld1uh_u64 (p0, x0 + x1), ++ z0 = svld1uh_u64 (p0, x0 + x1)) ++ ++/* ++** ld1uh_u64_1: ++** ld1h z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1uh_u64_1, svuint64_t, uint16_t, ++ z0 = svld1uh_u64 (p0, x0 + svcntd ()), ++ z0 = svld1uh_u64 (p0, x0 + svcntd ())) ++ ++/* ++** ld1uh_u64_7: ++** ld1h z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1uh_u64_7, svuint64_t, uint16_t, ++ z0 = svld1uh_u64 (p0, x0 + svcntd () * 7), ++ z0 = svld1uh_u64 (p0, x0 + svcntd () * 7)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld1uh_u64_8: ++** incb x0, all, mul #2 ++** ld1h z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1uh_u64_8, svuint64_t, uint16_t, ++ z0 = svld1uh_u64 (p0, x0 + svcntd () * 8), ++ z0 = svld1uh_u64 (p0, x0 + svcntd () * 8)) ++ ++/* ++** ld1uh_u64_m1: ++** ld1h z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1uh_u64_m1, svuint64_t, uint16_t, ++ z0 = svld1uh_u64 (p0, x0 - svcntd ()), ++ z0 = svld1uh_u64 (p0, x0 - svcntd ())) ++ ++/* ++** ld1uh_u64_m8: ++** ld1h z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1uh_u64_m8, svuint64_t, uint16_t, ++ z0 = svld1uh_u64 (p0, x0 - svcntd () * 8), ++ z0 = svld1uh_u64 (p0, x0 - svcntd () * 8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1uh_u64_m9: ++** decw x0, all, mul #9 ++** ld1h z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1uh_u64_m9, svuint64_t, uint16_t, ++ z0 = svld1uh_u64 (p0, x0 - svcntd () * 9), ++ z0 = svld1uh_u64 (p0, x0 - svcntd () * 9)) ++ ++/* ++** ld1uh_vnum_u64_0: ++** ld1h z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1uh_vnum_u64_0, svuint64_t, uint16_t, ++ z0 = svld1uh_vnum_u64 (p0, x0, 0), ++ z0 = svld1uh_vnum_u64 (p0, x0, 0)) ++ ++/* ++** ld1uh_vnum_u64_1: ++** ld1h z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1uh_vnum_u64_1, svuint64_t, uint16_t, ++ z0 = svld1uh_vnum_u64 (p0, x0, 1), ++ z0 = svld1uh_vnum_u64 (p0, x0, 1)) ++ ++/* ++** ld1uh_vnum_u64_7: ++** ld1h z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1uh_vnum_u64_7, svuint64_t, uint16_t, ++ z0 = svld1uh_vnum_u64 (p0, x0, 7), ++ z0 = svld1uh_vnum_u64 (p0, x0, 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1uh_vnum_u64_8: ++** incb x0, all, mul #2 ++** ld1h z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1uh_vnum_u64_8, svuint64_t, uint16_t, ++ z0 = svld1uh_vnum_u64 (p0, x0, 8), ++ z0 = svld1uh_vnum_u64 (p0, x0, 8)) ++ ++/* ++** ld1uh_vnum_u64_m1: ++** ld1h z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1uh_vnum_u64_m1, svuint64_t, uint16_t, ++ z0 = svld1uh_vnum_u64 (p0, x0, -1), ++ z0 = svld1uh_vnum_u64 (p0, x0, -1)) ++ ++/* ++** ld1uh_vnum_u64_m8: ++** ld1h z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1uh_vnum_u64_m8, svuint64_t, uint16_t, ++ z0 = svld1uh_vnum_u64 (p0, x0, -8), ++ z0 = svld1uh_vnum_u64 (p0, x0, -8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1uh_vnum_u64_m9: ++** decw x0, all, mul #9 ++** ld1h z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1uh_vnum_u64_m9, svuint64_t, uint16_t, ++ z0 = svld1uh_vnum_u64 (p0, x0, -9), ++ z0 = svld1uh_vnum_u64 (p0, x0, -9)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ld1uh_vnum_u64_x1: ++** cntw (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld1h z0\.d, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld1uh_vnum_u64_x1, svuint64_t, uint16_t, ++ z0 = svld1uh_vnum_u64 (p0, x0, x1), ++ z0 = svld1uh_vnum_u64 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uw_gather_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uw_gather_s64.c +new file mode 100644 +index 000000000..f4e9d5db9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uw_gather_s64.c +@@ -0,0 +1,308 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1uw_gather_s64_tied1: ++** ld1w z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uw_gather_s64_tied1, svint64_t, svuint64_t, ++ z0_res = svld1uw_gather_u64base_s64 (p0, z0), ++ z0_res = svld1uw_gather_s64 (p0, z0)) ++ ++/* ++** ld1uw_gather_s64_untied: ++** ld1w z0\.d, p0/z, \[z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uw_gather_s64_untied, svint64_t, svuint64_t, ++ z0_res = svld1uw_gather_u64base_s64 (p0, z1), ++ z0_res = svld1uw_gather_s64 (p0, z1)) ++ ++/* ++** ld1uw_gather_x0_s64_offset: ++** ld1w z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uw_gather_x0_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1uw_gather_u64base_offset_s64 (p0, z0, x0), ++ z0_res = svld1uw_gather_offset_s64 (p0, z0, x0)) ++ ++/* ++** ld1uw_gather_m4_s64_offset: ++** mov (x[0-9]+), #?-4 ++** ld1w z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uw_gather_m4_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1uw_gather_u64base_offset_s64 (p0, z0, -4), ++ z0_res = svld1uw_gather_offset_s64 (p0, z0, -4)) ++ ++/* ++** ld1uw_gather_0_s64_offset: ++** ld1w z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uw_gather_0_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1uw_gather_u64base_offset_s64 (p0, z0, 0), ++ z0_res = svld1uw_gather_offset_s64 (p0, z0, 0)) ++ ++/* ++** ld1uw_gather_5_s64_offset: ++** mov (x[0-9]+), #?5 ++** ld1w z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uw_gather_5_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1uw_gather_u64base_offset_s64 (p0, z0, 5), ++ z0_res = svld1uw_gather_offset_s64 (p0, z0, 5)) ++ ++/* ++** ld1uw_gather_6_s64_offset: ++** mov (x[0-9]+), #?6 ++** ld1w z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uw_gather_6_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1uw_gather_u64base_offset_s64 (p0, z0, 6), ++ z0_res = svld1uw_gather_offset_s64 (p0, z0, 6)) ++ ++/* ++** ld1uw_gather_7_s64_offset: ++** mov (x[0-9]+), #?7 ++** ld1w z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uw_gather_7_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1uw_gather_u64base_offset_s64 (p0, z0, 7), ++ z0_res = svld1uw_gather_offset_s64 (p0, z0, 7)) ++ ++/* ++** ld1uw_gather_8_s64_offset: ++** ld1w z0\.d, p0/z, \[z0\.d, #8\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uw_gather_8_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1uw_gather_u64base_offset_s64 (p0, z0, 8), ++ z0_res = svld1uw_gather_offset_s64 (p0, z0, 8)) ++ ++/* ++** ld1uw_gather_124_s64_offset: ++** ld1w z0\.d, p0/z, \[z0\.d, #124\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uw_gather_124_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1uw_gather_u64base_offset_s64 (p0, z0, 124), ++ z0_res = svld1uw_gather_offset_s64 (p0, z0, 124)) ++ ++/* ++** ld1uw_gather_128_s64_offset: ++** mov (x[0-9]+), #?128 ++** ld1w z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uw_gather_128_s64_offset, svint64_t, svuint64_t, ++ z0_res = svld1uw_gather_u64base_offset_s64 (p0, z0, 128), ++ z0_res = svld1uw_gather_offset_s64 (p0, z0, 128)) ++ ++/* ++** ld1uw_gather_x0_s64_index: ++** lsl (x[0-9]+), x0, #?2 ++** ld1w z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uw_gather_x0_s64_index, svint64_t, svuint64_t, ++ z0_res = svld1uw_gather_u64base_index_s64 (p0, z0, x0), ++ z0_res = svld1uw_gather_index_s64 (p0, z0, x0)) ++ ++/* ++** ld1uw_gather_m1_s64_index: ++** mov (x[0-9]+), #?-4 ++** ld1w z0\.d, p0/z, \[\1, z0\.d\] 
++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uw_gather_m1_s64_index, svint64_t, svuint64_t, ++ z0_res = svld1uw_gather_u64base_index_s64 (p0, z0, -1), ++ z0_res = svld1uw_gather_index_s64 (p0, z0, -1)) ++ ++/* ++** ld1uw_gather_0_s64_index: ++** ld1w z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uw_gather_0_s64_index, svint64_t, svuint64_t, ++ z0_res = svld1uw_gather_u64base_index_s64 (p0, z0, 0), ++ z0_res = svld1uw_gather_index_s64 (p0, z0, 0)) ++ ++/* ++** ld1uw_gather_5_s64_index: ++** ld1w z0\.d, p0/z, \[z0\.d, #20\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uw_gather_5_s64_index, svint64_t, svuint64_t, ++ z0_res = svld1uw_gather_u64base_index_s64 (p0, z0, 5), ++ z0_res = svld1uw_gather_index_s64 (p0, z0, 5)) ++ ++/* ++** ld1uw_gather_31_s64_index: ++** ld1w z0\.d, p0/z, \[z0\.d, #124\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uw_gather_31_s64_index, svint64_t, svuint64_t, ++ z0_res = svld1uw_gather_u64base_index_s64 (p0, z0, 31), ++ z0_res = svld1uw_gather_index_s64 (p0, z0, 31)) ++ ++/* ++** ld1uw_gather_32_s64_index: ++** mov (x[0-9]+), #?128 ++** ld1w z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uw_gather_32_s64_index, svint64_t, svuint64_t, ++ z0_res = svld1uw_gather_u64base_index_s64 (p0, z0, 32), ++ z0_res = svld1uw_gather_index_s64 (p0, z0, 32)) ++ ++/* ++** ld1uw_gather_x0_s64_s64offset: ++** ld1w z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uw_gather_x0_s64_s64offset, svint64_t, uint32_t, svint64_t, ++ z0_res = svld1uw_gather_s64offset_s64 (p0, x0, z0), ++ z0_res = svld1uw_gather_offset_s64 (p0, x0, z0)) ++ ++/* ++** ld1uw_gather_tied1_s64_s64offset: ++** ld1w z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uw_gather_tied1_s64_s64offset, svint64_t, uint32_t, svint64_t, ++ z0_res = svld1uw_gather_s64offset_s64 (p0, x0, z0), ++ z0_res = svld1uw_gather_offset_s64 (p0, x0, z0)) ++ ++/* ++** ld1uw_gather_untied_s64_s64offset: ++** ld1w z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uw_gather_untied_s64_s64offset, svint64_t, uint32_t, svint64_t, ++ z0_res = svld1uw_gather_s64offset_s64 (p0, x0, z1), ++ z0_res = svld1uw_gather_offset_s64 (p0, x0, z1)) ++ ++/* ++** ld1uw_gather_ext_s64_s64offset: ++** ld1w z0\.d, p0/z, \[x0, z1\.d, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uw_gather_ext_s64_s64offset, svint64_t, uint32_t, svint64_t, ++ z0_res = svld1uw_gather_s64offset_s64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svld1uw_gather_offset_s64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ld1uw_gather_x0_s64_u64offset: ++** ld1w z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uw_gather_x0_s64_u64offset, svint64_t, uint32_t, svuint64_t, ++ z0_res = svld1uw_gather_u64offset_s64 (p0, x0, z0), ++ z0_res = svld1uw_gather_offset_s64 (p0, x0, z0)) ++ ++/* ++** ld1uw_gather_tied1_s64_u64offset: ++** ld1w z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uw_gather_tied1_s64_u64offset, svint64_t, uint32_t, svuint64_t, ++ z0_res = svld1uw_gather_u64offset_s64 (p0, x0, z0), ++ z0_res = svld1uw_gather_offset_s64 (p0, x0, z0)) ++ ++/* ++** ld1uw_gather_untied_s64_u64offset: ++** ld1w z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uw_gather_untied_s64_u64offset, svint64_t, uint32_t, svuint64_t, ++ z0_res = svld1uw_gather_u64offset_s64 (p0, x0, z1), ++ z0_res = svld1uw_gather_offset_s64 (p0, x0, z1)) ++ ++/* ++** ld1uw_gather_ext_s64_u64offset: ++** ld1w z0\.d, p0/z, \[x0, z1\.d, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ 
(ld1uw_gather_ext_s64_u64offset, svint64_t, uint32_t, svuint64_t, ++ z0_res = svld1uw_gather_u64offset_s64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svld1uw_gather_offset_s64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ld1uw_gather_x0_s64_s64index: ++** ld1w z0\.d, p0/z, \[x0, z0\.d, lsl 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uw_gather_x0_s64_s64index, svint64_t, uint32_t, svint64_t, ++ z0_res = svld1uw_gather_s64index_s64 (p0, x0, z0), ++ z0_res = svld1uw_gather_index_s64 (p0, x0, z0)) ++ ++/* ++** ld1uw_gather_tied1_s64_s64index: ++** ld1w z0\.d, p0/z, \[x0, z0\.d, lsl 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uw_gather_tied1_s64_s64index, svint64_t, uint32_t, svint64_t, ++ z0_res = svld1uw_gather_s64index_s64 (p0, x0, z0), ++ z0_res = svld1uw_gather_index_s64 (p0, x0, z0)) ++ ++/* ++** ld1uw_gather_untied_s64_s64index: ++** ld1w z0\.d, p0/z, \[x0, z1\.d, lsl 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uw_gather_untied_s64_s64index, svint64_t, uint32_t, svint64_t, ++ z0_res = svld1uw_gather_s64index_s64 (p0, x0, z1), ++ z0_res = svld1uw_gather_index_s64 (p0, x0, z1)) ++ ++/* ++** ld1uw_gather_ext_s64_s64index: ++** ld1w z0\.d, p0/z, \[x0, z1\.d, sxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uw_gather_ext_s64_s64index, svint64_t, uint32_t, svint64_t, ++ z0_res = svld1uw_gather_s64index_s64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svld1uw_gather_index_s64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ld1uw_gather_x0_s64_u64index: ++** ld1w z0\.d, p0/z, \[x0, z0\.d, lsl 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uw_gather_x0_s64_u64index, svint64_t, uint32_t, svuint64_t, ++ z0_res = svld1uw_gather_u64index_s64 (p0, x0, z0), ++ z0_res = svld1uw_gather_index_s64 (p0, x0, z0)) ++ ++/* ++** ld1uw_gather_tied1_s64_u64index: ++** ld1w z0\.d, p0/z, \[x0, z0\.d, lsl 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uw_gather_tied1_s64_u64index, svint64_t, uint32_t, svuint64_t, ++ z0_res = svld1uw_gather_u64index_s64 (p0, x0, z0), ++ z0_res = svld1uw_gather_index_s64 (p0, x0, z0)) ++ ++/* ++** ld1uw_gather_untied_s64_u64index: ++** ld1w z0\.d, p0/z, \[x0, z1\.d, lsl 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uw_gather_untied_s64_u64index, svint64_t, uint32_t, svuint64_t, ++ z0_res = svld1uw_gather_u64index_s64 (p0, x0, z1), ++ z0_res = svld1uw_gather_index_s64 (p0, x0, z1)) ++ ++/* ++** ld1uw_gather_ext_s64_u64index: ++** ld1w z0\.d, p0/z, \[x0, z1\.d, uxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uw_gather_ext_s64_u64index, svint64_t, uint32_t, svuint64_t, ++ z0_res = svld1uw_gather_u64index_s64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svld1uw_gather_index_s64 (p0, x0, svextw_x (p0, z1))) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uw_gather_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uw_gather_u64.c +new file mode 100644 +index 000000000..854d19233 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uw_gather_u64.c +@@ -0,0 +1,308 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1uw_gather_u64_tied1: ++** ld1w z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uw_gather_u64_tied1, svuint64_t, svuint64_t, ++ z0_res = svld1uw_gather_u64base_u64 (p0, z0), ++ z0_res = svld1uw_gather_u64 (p0, z0)) ++ ++/* ++** ld1uw_gather_u64_untied: ++** ld1w z0\.d, p0/z, \[z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uw_gather_u64_untied, svuint64_t, svuint64_t, ++ z0_res = svld1uw_gather_u64base_u64 (p0, z1), ++ z0_res = svld1uw_gather_u64 (p0, z1)) ++ ++/* ++** ld1uw_gather_x0_u64_offset: ++** ld1w z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uw_gather_x0_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1uw_gather_u64base_offset_u64 (p0, z0, x0), ++ z0_res = svld1uw_gather_offset_u64 (p0, z0, x0)) ++ ++/* ++** ld1uw_gather_m4_u64_offset: ++** mov (x[0-9]+), #?-4 ++** ld1w z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uw_gather_m4_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1uw_gather_u64base_offset_u64 (p0, z0, -4), ++ z0_res = svld1uw_gather_offset_u64 (p0, z0, -4)) ++ ++/* ++** ld1uw_gather_0_u64_offset: ++** ld1w z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uw_gather_0_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1uw_gather_u64base_offset_u64 (p0, z0, 0), ++ z0_res = svld1uw_gather_offset_u64 (p0, z0, 0)) ++ ++/* ++** ld1uw_gather_5_u64_offset: ++** mov (x[0-9]+), #?5 ++** ld1w z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uw_gather_5_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1uw_gather_u64base_offset_u64 (p0, z0, 5), ++ z0_res = svld1uw_gather_offset_u64 (p0, z0, 5)) ++ ++/* ++** ld1uw_gather_6_u64_offset: ++** mov (x[0-9]+), #?6 ++** ld1w z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uw_gather_6_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1uw_gather_u64base_offset_u64 (p0, z0, 6), ++ z0_res = svld1uw_gather_offset_u64 (p0, z0, 6)) ++ ++/* ++** ld1uw_gather_7_u64_offset: ++** mov (x[0-9]+), #?7 ++** ld1w z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uw_gather_7_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1uw_gather_u64base_offset_u64 (p0, z0, 7), ++ z0_res = svld1uw_gather_offset_u64 (p0, z0, 7)) ++ ++/* ++** ld1uw_gather_8_u64_offset: ++** ld1w z0\.d, p0/z, \[z0\.d, #8\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uw_gather_8_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1uw_gather_u64base_offset_u64 (p0, z0, 8), ++ z0_res = svld1uw_gather_offset_u64 (p0, z0, 8)) ++ ++/* ++** ld1uw_gather_124_u64_offset: ++** ld1w z0\.d, p0/z, \[z0\.d, #124\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uw_gather_124_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1uw_gather_u64base_offset_u64 (p0, z0, 124), ++ z0_res = svld1uw_gather_offset_u64 (p0, z0, 124)) ++ ++/* ++** ld1uw_gather_128_u64_offset: ++** mov (x[0-9]+), #?128 ++** ld1w z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uw_gather_128_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svld1uw_gather_u64base_offset_u64 (p0, z0, 128), ++ z0_res = svld1uw_gather_offset_u64 (p0, z0, 128)) ++ ++/* ++** ld1uw_gather_x0_u64_index: ++** lsl (x[0-9]+), x0, #?2 ++** ld1w z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uw_gather_x0_u64_index, svuint64_t, svuint64_t, ++ z0_res = svld1uw_gather_u64base_index_u64 (p0, z0, x0), ++ z0_res = svld1uw_gather_index_u64 (p0, z0, x0)) ++ ++/* ++** ld1uw_gather_m1_u64_index: ++** mov (x[0-9]+), #?-4 ++** ld1w z0\.d, p0/z, 
\[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uw_gather_m1_u64_index, svuint64_t, svuint64_t, ++ z0_res = svld1uw_gather_u64base_index_u64 (p0, z0, -1), ++ z0_res = svld1uw_gather_index_u64 (p0, z0, -1)) ++ ++/* ++** ld1uw_gather_0_u64_index: ++** ld1w z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uw_gather_0_u64_index, svuint64_t, svuint64_t, ++ z0_res = svld1uw_gather_u64base_index_u64 (p0, z0, 0), ++ z0_res = svld1uw_gather_index_u64 (p0, z0, 0)) ++ ++/* ++** ld1uw_gather_5_u64_index: ++** ld1w z0\.d, p0/z, \[z0\.d, #20\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uw_gather_5_u64_index, svuint64_t, svuint64_t, ++ z0_res = svld1uw_gather_u64base_index_u64 (p0, z0, 5), ++ z0_res = svld1uw_gather_index_u64 (p0, z0, 5)) ++ ++/* ++** ld1uw_gather_31_u64_index: ++** ld1w z0\.d, p0/z, \[z0\.d, #124\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uw_gather_31_u64_index, svuint64_t, svuint64_t, ++ z0_res = svld1uw_gather_u64base_index_u64 (p0, z0, 31), ++ z0_res = svld1uw_gather_index_u64 (p0, z0, 31)) ++ ++/* ++** ld1uw_gather_32_u64_index: ++** mov (x[0-9]+), #?128 ++** ld1w z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ld1uw_gather_32_u64_index, svuint64_t, svuint64_t, ++ z0_res = svld1uw_gather_u64base_index_u64 (p0, z0, 32), ++ z0_res = svld1uw_gather_index_u64 (p0, z0, 32)) ++ ++/* ++** ld1uw_gather_x0_u64_s64offset: ++** ld1w z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uw_gather_x0_u64_s64offset, svuint64_t, uint32_t, svint64_t, ++ z0_res = svld1uw_gather_s64offset_u64 (p0, x0, z0), ++ z0_res = svld1uw_gather_offset_u64 (p0, x0, z0)) ++ ++/* ++** ld1uw_gather_tied1_u64_s64offset: ++** ld1w z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uw_gather_tied1_u64_s64offset, svuint64_t, uint32_t, svint64_t, ++ z0_res = svld1uw_gather_s64offset_u64 (p0, x0, z0), ++ z0_res = svld1uw_gather_offset_u64 (p0, x0, z0)) ++ ++/* ++** ld1uw_gather_untied_u64_s64offset: ++** ld1w z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uw_gather_untied_u64_s64offset, svuint64_t, uint32_t, svint64_t, ++ z0_res = svld1uw_gather_s64offset_u64 (p0, x0, z1), ++ z0_res = svld1uw_gather_offset_u64 (p0, x0, z1)) ++ ++/* ++** ld1uw_gather_ext_u64_s64offset: ++** ld1w z0\.d, p0/z, \[x0, z1\.d, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uw_gather_ext_u64_s64offset, svuint64_t, uint32_t, svint64_t, ++ z0_res = svld1uw_gather_s64offset_u64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svld1uw_gather_offset_u64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ld1uw_gather_x0_u64_u64offset: ++** ld1w z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uw_gather_x0_u64_u64offset, svuint64_t, uint32_t, svuint64_t, ++ z0_res = svld1uw_gather_u64offset_u64 (p0, x0, z0), ++ z0_res = svld1uw_gather_offset_u64 (p0, x0, z0)) ++ ++/* ++** ld1uw_gather_tied1_u64_u64offset: ++** ld1w z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uw_gather_tied1_u64_u64offset, svuint64_t, uint32_t, svuint64_t, ++ z0_res = svld1uw_gather_u64offset_u64 (p0, x0, z0), ++ z0_res = svld1uw_gather_offset_u64 (p0, x0, z0)) ++ ++/* ++** ld1uw_gather_untied_u64_u64offset: ++** ld1w z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uw_gather_untied_u64_u64offset, svuint64_t, uint32_t, svuint64_t, ++ z0_res = svld1uw_gather_u64offset_u64 (p0, x0, z1), ++ z0_res = svld1uw_gather_offset_u64 (p0, x0, z1)) ++ ++/* ++** ld1uw_gather_ext_u64_u64offset: ++** ld1w z0\.d, p0/z, \[x0, z1\.d, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ 
(ld1uw_gather_ext_u64_u64offset, svuint64_t, uint32_t, svuint64_t, ++ z0_res = svld1uw_gather_u64offset_u64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svld1uw_gather_offset_u64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ld1uw_gather_x0_u64_s64index: ++** ld1w z0\.d, p0/z, \[x0, z0\.d, lsl 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uw_gather_x0_u64_s64index, svuint64_t, uint32_t, svint64_t, ++ z0_res = svld1uw_gather_s64index_u64 (p0, x0, z0), ++ z0_res = svld1uw_gather_index_u64 (p0, x0, z0)) ++ ++/* ++** ld1uw_gather_tied1_u64_s64index: ++** ld1w z0\.d, p0/z, \[x0, z0\.d, lsl 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uw_gather_tied1_u64_s64index, svuint64_t, uint32_t, svint64_t, ++ z0_res = svld1uw_gather_s64index_u64 (p0, x0, z0), ++ z0_res = svld1uw_gather_index_u64 (p0, x0, z0)) ++ ++/* ++** ld1uw_gather_untied_u64_s64index: ++** ld1w z0\.d, p0/z, \[x0, z1\.d, lsl 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uw_gather_untied_u64_s64index, svuint64_t, uint32_t, svint64_t, ++ z0_res = svld1uw_gather_s64index_u64 (p0, x0, z1), ++ z0_res = svld1uw_gather_index_u64 (p0, x0, z1)) ++ ++/* ++** ld1uw_gather_ext_u64_s64index: ++** ld1w z0\.d, p0/z, \[x0, z1\.d, sxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uw_gather_ext_u64_s64index, svuint64_t, uint32_t, svint64_t, ++ z0_res = svld1uw_gather_s64index_u64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svld1uw_gather_index_u64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ld1uw_gather_x0_u64_u64index: ++** ld1w z0\.d, p0/z, \[x0, z0\.d, lsl 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uw_gather_x0_u64_u64index, svuint64_t, uint32_t, svuint64_t, ++ z0_res = svld1uw_gather_u64index_u64 (p0, x0, z0), ++ z0_res = svld1uw_gather_index_u64 (p0, x0, z0)) ++ ++/* ++** ld1uw_gather_tied1_u64_u64index: ++** ld1w z0\.d, p0/z, \[x0, z0\.d, lsl 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uw_gather_tied1_u64_u64index, svuint64_t, uint32_t, svuint64_t, ++ z0_res = svld1uw_gather_u64index_u64 (p0, x0, z0), ++ z0_res = svld1uw_gather_index_u64 (p0, x0, z0)) ++ ++/* ++** ld1uw_gather_untied_u64_u64index: ++** ld1w z0\.d, p0/z, \[x0, z1\.d, lsl 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uw_gather_untied_u64_u64index, svuint64_t, uint32_t, svuint64_t, ++ z0_res = svld1uw_gather_u64index_u64 (p0, x0, z1), ++ z0_res = svld1uw_gather_index_u64 (p0, x0, z1)) ++ ++/* ++** ld1uw_gather_ext_u64_u64index: ++** ld1w z0\.d, p0/z, \[x0, z1\.d, uxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ld1uw_gather_ext_u64_u64index, svuint64_t, uint32_t, svuint64_t, ++ z0_res = svld1uw_gather_u64index_u64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svld1uw_gather_index_u64 (p0, x0, svextw_x (p0, z1))) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uw_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uw_s64.c +new file mode 100644 +index 000000000..55f5cbad3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uw_s64.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1uw_s64_base: ++** ld1w z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1uw_s64_base, svint64_t, uint32_t, ++ z0 = svld1uw_s64 (p0, x0), ++ z0 = svld1uw_s64 (p0, x0)) ++ ++/* ++** ld1uw_s64_index: ++** ld1w z0\.d, p0/z, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_LOAD (ld1uw_s64_index, svint64_t, uint32_t, ++ z0 = svld1uw_s64 (p0, x0 + x1), ++ z0 = svld1uw_s64 (p0, x0 + x1)) ++ ++/* ++** ld1uw_s64_1: ++** ld1w z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1uw_s64_1, svint64_t, uint32_t, ++ z0 = svld1uw_s64 (p0, x0 + svcntd ()), ++ z0 = svld1uw_s64 (p0, x0 + svcntd ())) ++ ++/* ++** ld1uw_s64_7: ++** ld1w z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1uw_s64_7, svint64_t, uint32_t, ++ z0 = svld1uw_s64 (p0, x0 + svcntd () * 7), ++ z0 = svld1uw_s64 (p0, x0 + svcntd () * 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1uw_s64_8: ++** incb x0, all, mul #4 ++** ld1w z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1uw_s64_8, svint64_t, uint32_t, ++ z0 = svld1uw_s64 (p0, x0 + svcntd () * 8), ++ z0 = svld1uw_s64 (p0, x0 + svcntd () * 8)) ++ ++/* ++** ld1uw_s64_m1: ++** ld1w z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1uw_s64_m1, svint64_t, uint32_t, ++ z0 = svld1uw_s64 (p0, x0 - svcntd ()), ++ z0 = svld1uw_s64 (p0, x0 - svcntd ())) ++ ++/* ++** ld1uw_s64_m8: ++** ld1w z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1uw_s64_m8, svint64_t, uint32_t, ++ z0 = svld1uw_s64 (p0, x0 - svcntd () * 8), ++ z0 = svld1uw_s64 (p0, x0 - svcntd () * 8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1uw_s64_m9: ++** dech x0, all, mul #9 ++** ld1w z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1uw_s64_m9, svint64_t, uint32_t, ++ z0 = svld1uw_s64 (p0, x0 - svcntd () * 9), ++ z0 = svld1uw_s64 (p0, x0 - svcntd () * 9)) ++ ++/* ++** ld1uw_vnum_s64_0: ++** ld1w z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1uw_vnum_s64_0, svint64_t, uint32_t, ++ z0 = svld1uw_vnum_s64 (p0, x0, 0), ++ z0 = svld1uw_vnum_s64 (p0, x0, 0)) ++ ++/* ++** ld1uw_vnum_s64_1: ++** ld1w z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1uw_vnum_s64_1, svint64_t, uint32_t, ++ z0 = svld1uw_vnum_s64 (p0, x0, 1), ++ z0 = svld1uw_vnum_s64 (p0, x0, 1)) ++ ++/* ++** ld1uw_vnum_s64_7: ++** ld1w z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1uw_vnum_s64_7, svint64_t, uint32_t, ++ z0 = svld1uw_vnum_s64 (p0, x0, 7), ++ z0 = svld1uw_vnum_s64 (p0, x0, 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1uw_vnum_s64_8: ++** incb x0, all, mul #4 ++** ld1w z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1uw_vnum_s64_8, svint64_t, uint32_t, ++ z0 = svld1uw_vnum_s64 (p0, x0, 8), ++ z0 = svld1uw_vnum_s64 (p0, x0, 8)) ++ ++/* ++** ld1uw_vnum_s64_m1: ++** ld1w z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1uw_vnum_s64_m1, svint64_t, uint32_t, ++ z0 = svld1uw_vnum_s64 (p0, x0, -1), ++ z0 = svld1uw_vnum_s64 (p0, x0, -1)) ++ ++/* ++** ld1uw_vnum_s64_m8: ++** ld1w z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1uw_vnum_s64_m8, svint64_t, uint32_t, ++ z0 = svld1uw_vnum_s64 (p0, x0, -8), ++ z0 = svld1uw_vnum_s64 (p0, x0, -8)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld1uw_vnum_s64_m9: ++** dech x0, all, mul #9 ++** ld1w z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1uw_vnum_s64_m9, svint64_t, uint32_t, ++ z0 = svld1uw_vnum_s64 (p0, x0, -9), ++ z0 = svld1uw_vnum_s64 (p0, x0, -9)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ld1uw_vnum_s64_x1: ++** cnth (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld1w z0\.d, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld1uw_vnum_s64_x1, svint64_t, uint32_t, ++ z0 = svld1uw_vnum_s64 (p0, x0, x1), ++ z0 = svld1uw_vnum_s64 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uw_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uw_u64.c +new file mode 100644 +index 000000000..175b593f2 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uw_u64.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld1uw_u64_base: ++** ld1w z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1uw_u64_base, svuint64_t, uint32_t, ++ z0 = svld1uw_u64 (p0, x0), ++ z0 = svld1uw_u64 (p0, x0)) ++ ++/* ++** ld1uw_u64_index: ++** ld1w z0\.d, p0/z, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_LOAD (ld1uw_u64_index, svuint64_t, uint32_t, ++ z0 = svld1uw_u64 (p0, x0 + x1), ++ z0 = svld1uw_u64 (p0, x0 + x1)) ++ ++/* ++** ld1uw_u64_1: ++** ld1w z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1uw_u64_1, svuint64_t, uint32_t, ++ z0 = svld1uw_u64 (p0, x0 + svcntd ()), ++ z0 = svld1uw_u64 (p0, x0 + svcntd ())) ++ ++/* ++** ld1uw_u64_7: ++** ld1w z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1uw_u64_7, svuint64_t, uint32_t, ++ z0 = svld1uw_u64 (p0, x0 + svcntd () * 7), ++ z0 = svld1uw_u64 (p0, x0 + svcntd () * 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1uw_u64_8: ++** incb x0, all, mul #4 ++** ld1w z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1uw_u64_8, svuint64_t, uint32_t, ++ z0 = svld1uw_u64 (p0, x0 + svcntd () * 8), ++ z0 = svld1uw_u64 (p0, x0 + svcntd () * 8)) ++ ++/* ++** ld1uw_u64_m1: ++** ld1w z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1uw_u64_m1, svuint64_t, uint32_t, ++ z0 = svld1uw_u64 (p0, x0 - svcntd ()), ++ z0 = svld1uw_u64 (p0, x0 - svcntd ())) ++ ++/* ++** ld1uw_u64_m8: ++** ld1w z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1uw_u64_m8, svuint64_t, uint32_t, ++ z0 = svld1uw_u64 (p0, x0 - svcntd () * 8), ++ z0 = svld1uw_u64 (p0, x0 - svcntd () * 8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1uw_u64_m9: ++** dech x0, all, mul #9 ++** ld1w z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1uw_u64_m9, svuint64_t, uint32_t, ++ z0 = svld1uw_u64 (p0, x0 - svcntd () * 9), ++ z0 = svld1uw_u64 (p0, x0 - svcntd () * 9)) ++ ++/* ++** ld1uw_vnum_u64_0: ++** ld1w z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1uw_vnum_u64_0, svuint64_t, uint32_t, ++ z0 = svld1uw_vnum_u64 (p0, x0, 0), ++ z0 = svld1uw_vnum_u64 (p0, x0, 0)) ++ ++/* ++** ld1uw_vnum_u64_1: ++** ld1w z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1uw_vnum_u64_1, svuint64_t, uint32_t, ++ z0 = svld1uw_vnum_u64 (p0, x0, 1), ++ z0 = svld1uw_vnum_u64 (p0, x0, 1)) ++ ++/* ++** ld1uw_vnum_u64_7: ++** ld1w z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1uw_vnum_u64_7, svuint64_t, uint32_t, ++ z0 = svld1uw_vnum_u64 (p0, x0, 7), ++ z0 = svld1uw_vnum_u64 (p0, x0, 7)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld1uw_vnum_u64_8: ++** incb x0, all, mul #4 ++** ld1w z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1uw_vnum_u64_8, svuint64_t, uint32_t, ++ z0 = svld1uw_vnum_u64 (p0, x0, 8), ++ z0 = svld1uw_vnum_u64 (p0, x0, 8)) ++ ++/* ++** ld1uw_vnum_u64_m1: ++** ld1w z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1uw_vnum_u64_m1, svuint64_t, uint32_t, ++ z0 = svld1uw_vnum_u64 (p0, x0, -1), ++ z0 = svld1uw_vnum_u64 (p0, x0, -1)) ++ ++/* ++** ld1uw_vnum_u64_m8: ++** ld1w z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld1uw_vnum_u64_m8, svuint64_t, uint32_t, ++ z0 = svld1uw_vnum_u64 (p0, x0, -8), ++ z0 = svld1uw_vnum_u64 (p0, x0, -8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld1uw_vnum_u64_m9: ++** dech x0, all, mul #9 ++** ld1w z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld1uw_vnum_u64_m9, svuint64_t, uint32_t, ++ z0 = svld1uw_vnum_u64 (p0, x0, -9), ++ z0 = svld1uw_vnum_u64 (p0, x0, -9)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ld1uw_vnum_u64_x1: ++** cnth (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld1w z0\.d, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld1uw_vnum_u64_x1, svuint64_t, uint32_t, ++ z0 = svld1uw_vnum_u64 (p0, x0, x1), ++ z0 = svld1uw_vnum_u64 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_bf16.c +new file mode 100644 +index 000000000..5d08c1e6e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_bf16.c +@@ -0,0 +1,200 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld2_bf16_base: ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_bf16_base, svbfloat16x2_t, bfloat16_t, ++ z0 = svld2_bf16 (p0, x0), ++ z0 = svld2 (p0, x0)) ++ ++/* ++** ld2_bf16_index: ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_LOAD (ld2_bf16_index, svbfloat16x2_t, bfloat16_t, ++ z0 = svld2_bf16 (p0, x0 + x1), ++ z0 = svld2 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_bf16_1: ++** incb x0 ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_bf16_1, svbfloat16x2_t, bfloat16_t, ++ z0 = svld2_bf16 (p0, x0 + svcnth ()), ++ z0 = svld2 (p0, x0 + svcnth ())) ++ ++/* ++** ld2_bf16_2: ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #2, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_bf16_2, svbfloat16x2_t, bfloat16_t, ++ z0 = svld2_bf16 (p0, x0 + svcnth () * 2), ++ z0 = svld2 (p0, x0 + svcnth () * 2)) ++ ++/* ++** ld2_bf16_14: ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #14, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_bf16_14, svbfloat16x2_t, bfloat16_t, ++ z0 = svld2_bf16 (p0, x0 + svcnth () * 14), ++ z0 = svld2 (p0, x0 + svcnth () * 14)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_bf16_16: ++** incb x0, all, mul #16 ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_bf16_16, svbfloat16x2_t, bfloat16_t, ++ z0 = svld2_bf16 (p0, x0 + svcnth () * 16), ++ z0 = svld2 (p0, x0 + svcnth () * 16)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld2_bf16_m1: ++** decb x0 ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_bf16_m1, svbfloat16x2_t, bfloat16_t, ++ z0 = svld2_bf16 (p0, x0 - svcnth ()), ++ z0 = svld2 (p0, x0 - svcnth ())) ++ ++/* ++** ld2_bf16_m2: ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #-2, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_bf16_m2, svbfloat16x2_t, bfloat16_t, ++ z0 = svld2_bf16 (p0, x0 - svcnth () * 2), ++ z0 = svld2 (p0, x0 - svcnth () * 2)) ++ ++/* ++** ld2_bf16_m16: ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #-16, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_bf16_m16, svbfloat16x2_t, bfloat16_t, ++ z0 = svld2_bf16 (p0, x0 - svcnth () * 16), ++ z0 = svld2 (p0, x0 - svcnth () * 16)) ++ ++/* ++** ld2_bf16_m18: ++** addvl (x[0-9]+), x0, #-18 ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld2_bf16_m18, svbfloat16x2_t, bfloat16_t, ++ z0 = svld2_bf16 (p0, x0 - svcnth () * 18), ++ z0 = svld2 (p0, x0 - svcnth () * 18)) ++ ++/* ++** ld2_vnum_bf16_0: ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_bf16_0, svbfloat16x2_t, bfloat16_t, ++ z0 = svld2_vnum_bf16 (p0, x0, 0), ++ z0 = svld2_vnum (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_vnum_bf16_1: ++** incb x0 ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_bf16_1, svbfloat16x2_t, bfloat16_t, ++ z0 = svld2_vnum_bf16 (p0, x0, 1), ++ z0 = svld2_vnum (p0, x0, 1)) ++ ++/* ++** ld2_vnum_bf16_2: ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #2, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_bf16_2, svbfloat16x2_t, bfloat16_t, ++ z0 = svld2_vnum_bf16 (p0, x0, 2), ++ z0 = svld2_vnum (p0, x0, 2)) ++ ++/* ++** ld2_vnum_bf16_14: ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #14, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_bf16_14, svbfloat16x2_t, bfloat16_t, ++ z0 = svld2_vnum_bf16 (p0, x0, 14), ++ z0 = svld2_vnum (p0, x0, 14)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_vnum_bf16_16: ++** incb x0, all, mul #16 ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_bf16_16, svbfloat16x2_t, bfloat16_t, ++ z0 = svld2_vnum_bf16 (p0, x0, 16), ++ z0 = svld2_vnum (p0, x0, 16)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_vnum_bf16_m1: ++** decb x0 ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_bf16_m1, svbfloat16x2_t, bfloat16_t, ++ z0 = svld2_vnum_bf16 (p0, x0, -1), ++ z0 = svld2_vnum (p0, x0, -1)) ++ ++/* ++** ld2_vnum_bf16_m2: ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #-2, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_bf16_m2, svbfloat16x2_t, bfloat16_t, ++ z0 = svld2_vnum_bf16 (p0, x0, -2), ++ z0 = svld2_vnum (p0, x0, -2)) ++ ++/* ++** ld2_vnum_bf16_m16: ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #-16, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_bf16_m16, svbfloat16x2_t, bfloat16_t, ++ z0 = svld2_vnum_bf16 (p0, x0, -16), ++ z0 = svld2_vnum (p0, x0, -16)) ++ ++/* ++** ld2_vnum_bf16_m18: ++** addvl (x[0-9]+), x0, #-18 ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_bf16_m18, svbfloat16x2_t, bfloat16_t, ++ z0 = svld2_vnum_bf16 (p0, x0, -18), ++ z0 = svld2_vnum (p0, x0, -18)) ++ ++/* Using MUL to calculate an index would also be OK. 
*/ ++/* ++** ld2_vnum_bf16_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_bf16_x1, svbfloat16x2_t, bfloat16_t, ++ z0 = svld2_vnum_bf16 (p0, x0, x1), ++ z0 = svld2_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_f16.c +new file mode 100644 +index 000000000..43392b2b2 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_f16.c +@@ -0,0 +1,200 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld2_f16_base: ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_f16_base, svfloat16x2_t, float16_t, ++ z0 = svld2_f16 (p0, x0), ++ z0 = svld2 (p0, x0)) ++ ++/* ++** ld2_f16_index: ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_LOAD (ld2_f16_index, svfloat16x2_t, float16_t, ++ z0 = svld2_f16 (p0, x0 + x1), ++ z0 = svld2 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_f16_1: ++** incb x0 ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_f16_1, svfloat16x2_t, float16_t, ++ z0 = svld2_f16 (p0, x0 + svcnth ()), ++ z0 = svld2 (p0, x0 + svcnth ())) ++ ++/* ++** ld2_f16_2: ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #2, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_f16_2, svfloat16x2_t, float16_t, ++ z0 = svld2_f16 (p0, x0 + svcnth () * 2), ++ z0 = svld2 (p0, x0 + svcnth () * 2)) ++ ++/* ++** ld2_f16_14: ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #14, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_f16_14, svfloat16x2_t, float16_t, ++ z0 = svld2_f16 (p0, x0 + svcnth () * 14), ++ z0 = svld2 (p0, x0 + svcnth () * 14)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_f16_16: ++** incb x0, all, mul #16 ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_f16_16, svfloat16x2_t, float16_t, ++ z0 = svld2_f16 (p0, x0 + svcnth () * 16), ++ z0 = svld2 (p0, x0 + svcnth () * 16)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_f16_m1: ++** decb x0 ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_f16_m1, svfloat16x2_t, float16_t, ++ z0 = svld2_f16 (p0, x0 - svcnth ()), ++ z0 = svld2 (p0, x0 - svcnth ())) ++ ++/* ++** ld2_f16_m2: ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #-2, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_f16_m2, svfloat16x2_t, float16_t, ++ z0 = svld2_f16 (p0, x0 - svcnth () * 2), ++ z0 = svld2 (p0, x0 - svcnth () * 2)) ++ ++/* ++** ld2_f16_m16: ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #-16, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_f16_m16, svfloat16x2_t, float16_t, ++ z0 = svld2_f16 (p0, x0 - svcnth () * 16), ++ z0 = svld2 (p0, x0 - svcnth () * 16)) ++ ++/* ++** ld2_f16_m18: ++** addvl (x[0-9]+), x0, #-18 ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld2_f16_m18, svfloat16x2_t, float16_t, ++ z0 = svld2_f16 (p0, x0 - svcnth () * 18), ++ z0 = svld2 (p0, x0 - svcnth () * 18)) ++ ++/* ++** ld2_vnum_f16_0: ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_f16_0, svfloat16x2_t, float16_t, ++ z0 = svld2_vnum_f16 (p0, x0, 0), ++ z0 = svld2_vnum (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld2_vnum_f16_1: ++** incb x0 ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_f16_1, svfloat16x2_t, float16_t, ++ z0 = svld2_vnum_f16 (p0, x0, 1), ++ z0 = svld2_vnum (p0, x0, 1)) ++ ++/* ++** ld2_vnum_f16_2: ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #2, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_f16_2, svfloat16x2_t, float16_t, ++ z0 = svld2_vnum_f16 (p0, x0, 2), ++ z0 = svld2_vnum (p0, x0, 2)) ++ ++/* ++** ld2_vnum_f16_14: ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #14, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_f16_14, svfloat16x2_t, float16_t, ++ z0 = svld2_vnum_f16 (p0, x0, 14), ++ z0 = svld2_vnum (p0, x0, 14)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_vnum_f16_16: ++** incb x0, all, mul #16 ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_f16_16, svfloat16x2_t, float16_t, ++ z0 = svld2_vnum_f16 (p0, x0, 16), ++ z0 = svld2_vnum (p0, x0, 16)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_vnum_f16_m1: ++** decb x0 ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_f16_m1, svfloat16x2_t, float16_t, ++ z0 = svld2_vnum_f16 (p0, x0, -1), ++ z0 = svld2_vnum (p0, x0, -1)) ++ ++/* ++** ld2_vnum_f16_m2: ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #-2, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_f16_m2, svfloat16x2_t, float16_t, ++ z0 = svld2_vnum_f16 (p0, x0, -2), ++ z0 = svld2_vnum (p0, x0, -2)) ++ ++/* ++** ld2_vnum_f16_m16: ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #-16, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_f16_m16, svfloat16x2_t, float16_t, ++ z0 = svld2_vnum_f16 (p0, x0, -16), ++ z0 = svld2_vnum (p0, x0, -16)) ++ ++/* ++** ld2_vnum_f16_m18: ++** addvl (x[0-9]+), x0, #-18 ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_f16_m18, svfloat16x2_t, float16_t, ++ z0 = svld2_vnum_f16 (p0, x0, -18), ++ z0 = svld2_vnum (p0, x0, -18)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ld2_vnum_f16_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_f16_x1, svfloat16x2_t, float16_t, ++ z0 = svld2_vnum_f16 (p0, x0, x1), ++ z0 = svld2_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_f32.c +new file mode 100644 +index 000000000..379145e0c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_f32.c +@@ -0,0 +1,200 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld2_f32_base: ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_f32_base, svfloat32x2_t, float32_t, ++ z0 = svld2_f32 (p0, x0), ++ z0 = svld2 (p0, x0)) ++ ++/* ++** ld2_f32_index: ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_LOAD (ld2_f32_index, svfloat32x2_t, float32_t, ++ z0 = svld2_f32 (p0, x0 + x1), ++ z0 = svld2 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld2_f32_1: ++** incb x0 ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_f32_1, svfloat32x2_t, float32_t, ++ z0 = svld2_f32 (p0, x0 + svcntw ()), ++ z0 = svld2 (p0, x0 + svcntw ())) ++ ++/* ++** ld2_f32_2: ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, #2, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_f32_2, svfloat32x2_t, float32_t, ++ z0 = svld2_f32 (p0, x0 + svcntw () * 2), ++ z0 = svld2 (p0, x0 + svcntw () * 2)) ++ ++/* ++** ld2_f32_14: ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, #14, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_f32_14, svfloat32x2_t, float32_t, ++ z0 = svld2_f32 (p0, x0 + svcntw () * 14), ++ z0 = svld2 (p0, x0 + svcntw () * 14)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_f32_16: ++** incb x0, all, mul #16 ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_f32_16, svfloat32x2_t, float32_t, ++ z0 = svld2_f32 (p0, x0 + svcntw () * 16), ++ z0 = svld2 (p0, x0 + svcntw () * 16)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_f32_m1: ++** decb x0 ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_f32_m1, svfloat32x2_t, float32_t, ++ z0 = svld2_f32 (p0, x0 - svcntw ()), ++ z0 = svld2 (p0, x0 - svcntw ())) ++ ++/* ++** ld2_f32_m2: ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, #-2, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_f32_m2, svfloat32x2_t, float32_t, ++ z0 = svld2_f32 (p0, x0 - svcntw () * 2), ++ z0 = svld2 (p0, x0 - svcntw () * 2)) ++ ++/* ++** ld2_f32_m16: ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, #-16, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_f32_m16, svfloat32x2_t, float32_t, ++ z0 = svld2_f32 (p0, x0 - svcntw () * 16), ++ z0 = svld2 (p0, x0 - svcntw () * 16)) ++ ++/* ++** ld2_f32_m18: ++** addvl (x[0-9]+), x0, #-18 ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld2_f32_m18, svfloat32x2_t, float32_t, ++ z0 = svld2_f32 (p0, x0 - svcntw () * 18), ++ z0 = svld2 (p0, x0 - svcntw () * 18)) ++ ++/* ++** ld2_vnum_f32_0: ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_f32_0, svfloat32x2_t, float32_t, ++ z0 = svld2_vnum_f32 (p0, x0, 0), ++ z0 = svld2_vnum (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_vnum_f32_1: ++** incb x0 ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_f32_1, svfloat32x2_t, float32_t, ++ z0 = svld2_vnum_f32 (p0, x0, 1), ++ z0 = svld2_vnum (p0, x0, 1)) ++ ++/* ++** ld2_vnum_f32_2: ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, #2, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_f32_2, svfloat32x2_t, float32_t, ++ z0 = svld2_vnum_f32 (p0, x0, 2), ++ z0 = svld2_vnum (p0, x0, 2)) ++ ++/* ++** ld2_vnum_f32_14: ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, #14, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_f32_14, svfloat32x2_t, float32_t, ++ z0 = svld2_vnum_f32 (p0, x0, 14), ++ z0 = svld2_vnum (p0, x0, 14)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_vnum_f32_16: ++** incb x0, all, mul #16 ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_f32_16, svfloat32x2_t, float32_t, ++ z0 = svld2_vnum_f32 (p0, x0, 16), ++ z0 = svld2_vnum (p0, x0, 16)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld2_vnum_f32_m1: ++** decb x0 ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_f32_m1, svfloat32x2_t, float32_t, ++ z0 = svld2_vnum_f32 (p0, x0, -1), ++ z0 = svld2_vnum (p0, x0, -1)) ++ ++/* ++** ld2_vnum_f32_m2: ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, #-2, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_f32_m2, svfloat32x2_t, float32_t, ++ z0 = svld2_vnum_f32 (p0, x0, -2), ++ z0 = svld2_vnum (p0, x0, -2)) ++ ++/* ++** ld2_vnum_f32_m16: ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, #-16, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_f32_m16, svfloat32x2_t, float32_t, ++ z0 = svld2_vnum_f32 (p0, x0, -16), ++ z0 = svld2_vnum (p0, x0, -16)) ++ ++/* ++** ld2_vnum_f32_m18: ++** addvl (x[0-9]+), x0, #-18 ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_f32_m18, svfloat32x2_t, float32_t, ++ z0 = svld2_vnum_f32 (p0, x0, -18), ++ z0 = svld2_vnum (p0, x0, -18)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ld2_vnum_f32_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_f32_x1, svfloat32x2_t, float32_t, ++ z0 = svld2_vnum_f32 (p0, x0, x1), ++ z0 = svld2_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_f64.c +new file mode 100644 +index 000000000..1911612c6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_f64.c +@@ -0,0 +1,200 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld2_f64_base: ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_f64_base, svfloat64x2_t, float64_t, ++ z0 = svld2_f64 (p0, x0), ++ z0 = svld2 (p0, x0)) ++ ++/* ++** ld2_f64_index: ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, x1, lsl 3\] ++** ret ++*/ ++TEST_LOAD (ld2_f64_index, svfloat64x2_t, float64_t, ++ z0 = svld2_f64 (p0, x0 + x1), ++ z0 = svld2 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_f64_1: ++** incb x0 ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_f64_1, svfloat64x2_t, float64_t, ++ z0 = svld2_f64 (p0, x0 + svcntd ()), ++ z0 = svld2 (p0, x0 + svcntd ())) ++ ++/* ++** ld2_f64_2: ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, #2, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_f64_2, svfloat64x2_t, float64_t, ++ z0 = svld2_f64 (p0, x0 + svcntd () * 2), ++ z0 = svld2 (p0, x0 + svcntd () * 2)) ++ ++/* ++** ld2_f64_14: ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, #14, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_f64_14, svfloat64x2_t, float64_t, ++ z0 = svld2_f64 (p0, x0 + svcntd () * 14), ++ z0 = svld2 (p0, x0 + svcntd () * 14)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_f64_16: ++** incb x0, all, mul #16 ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_f64_16, svfloat64x2_t, float64_t, ++ z0 = svld2_f64 (p0, x0 + svcntd () * 16), ++ z0 = svld2 (p0, x0 + svcntd () * 16)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld2_f64_m1: ++** decb x0 ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_f64_m1, svfloat64x2_t, float64_t, ++ z0 = svld2_f64 (p0, x0 - svcntd ()), ++ z0 = svld2 (p0, x0 - svcntd ())) ++ ++/* ++** ld2_f64_m2: ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, #-2, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_f64_m2, svfloat64x2_t, float64_t, ++ z0 = svld2_f64 (p0, x0 - svcntd () * 2), ++ z0 = svld2 (p0, x0 - svcntd () * 2)) ++ ++/* ++** ld2_f64_m16: ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, #-16, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_f64_m16, svfloat64x2_t, float64_t, ++ z0 = svld2_f64 (p0, x0 - svcntd () * 16), ++ z0 = svld2 (p0, x0 - svcntd () * 16)) ++ ++/* ++** ld2_f64_m18: ++** addvl (x[0-9]+), x0, #-18 ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld2_f64_m18, svfloat64x2_t, float64_t, ++ z0 = svld2_f64 (p0, x0 - svcntd () * 18), ++ z0 = svld2 (p0, x0 - svcntd () * 18)) ++ ++/* ++** ld2_vnum_f64_0: ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_f64_0, svfloat64x2_t, float64_t, ++ z0 = svld2_vnum_f64 (p0, x0, 0), ++ z0 = svld2_vnum (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_vnum_f64_1: ++** incb x0 ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_f64_1, svfloat64x2_t, float64_t, ++ z0 = svld2_vnum_f64 (p0, x0, 1), ++ z0 = svld2_vnum (p0, x0, 1)) ++ ++/* ++** ld2_vnum_f64_2: ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, #2, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_f64_2, svfloat64x2_t, float64_t, ++ z0 = svld2_vnum_f64 (p0, x0, 2), ++ z0 = svld2_vnum (p0, x0, 2)) ++ ++/* ++** ld2_vnum_f64_14: ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, #14, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_f64_14, svfloat64x2_t, float64_t, ++ z0 = svld2_vnum_f64 (p0, x0, 14), ++ z0 = svld2_vnum (p0, x0, 14)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_vnum_f64_16: ++** incb x0, all, mul #16 ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_f64_16, svfloat64x2_t, float64_t, ++ z0 = svld2_vnum_f64 (p0, x0, 16), ++ z0 = svld2_vnum (p0, x0, 16)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_vnum_f64_m1: ++** decb x0 ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_f64_m1, svfloat64x2_t, float64_t, ++ z0 = svld2_vnum_f64 (p0, x0, -1), ++ z0 = svld2_vnum (p0, x0, -1)) ++ ++/* ++** ld2_vnum_f64_m2: ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, #-2, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_f64_m2, svfloat64x2_t, float64_t, ++ z0 = svld2_vnum_f64 (p0, x0, -2), ++ z0 = svld2_vnum (p0, x0, -2)) ++ ++/* ++** ld2_vnum_f64_m16: ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, #-16, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_f64_m16, svfloat64x2_t, float64_t, ++ z0 = svld2_vnum_f64 (p0, x0, -16), ++ z0 = svld2_vnum (p0, x0, -16)) ++ ++/* ++** ld2_vnum_f64_m18: ++** addvl (x[0-9]+), x0, #-18 ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_f64_m18, svfloat64x2_t, float64_t, ++ z0 = svld2_vnum_f64 (p0, x0, -18), ++ z0 = svld2_vnum (p0, x0, -18)) ++ ++/* Using MUL to calculate an index would also be OK. 
*/ ++/* ++** ld2_vnum_f64_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_f64_x1, svfloat64x2_t, float64_t, ++ z0 = svld2_vnum_f64 (p0, x0, x1), ++ z0 = svld2_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_s16.c +new file mode 100644 +index 000000000..90677d837 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_s16.c +@@ -0,0 +1,200 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld2_s16_base: ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_s16_base, svint16x2_t, int16_t, ++ z0 = svld2_s16 (p0, x0), ++ z0 = svld2 (p0, x0)) ++ ++/* ++** ld2_s16_index: ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_LOAD (ld2_s16_index, svint16x2_t, int16_t, ++ z0 = svld2_s16 (p0, x0 + x1), ++ z0 = svld2 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_s16_1: ++** incb x0 ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_s16_1, svint16x2_t, int16_t, ++ z0 = svld2_s16 (p0, x0 + svcnth ()), ++ z0 = svld2 (p0, x0 + svcnth ())) ++ ++/* ++** ld2_s16_2: ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #2, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_s16_2, svint16x2_t, int16_t, ++ z0 = svld2_s16 (p0, x0 + svcnth () * 2), ++ z0 = svld2 (p0, x0 + svcnth () * 2)) ++ ++/* ++** ld2_s16_14: ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #14, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_s16_14, svint16x2_t, int16_t, ++ z0 = svld2_s16 (p0, x0 + svcnth () * 14), ++ z0 = svld2 (p0, x0 + svcnth () * 14)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_s16_16: ++** incb x0, all, mul #16 ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_s16_16, svint16x2_t, int16_t, ++ z0 = svld2_s16 (p0, x0 + svcnth () * 16), ++ z0 = svld2 (p0, x0 + svcnth () * 16)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_s16_m1: ++** decb x0 ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_s16_m1, svint16x2_t, int16_t, ++ z0 = svld2_s16 (p0, x0 - svcnth ()), ++ z0 = svld2 (p0, x0 - svcnth ())) ++ ++/* ++** ld2_s16_m2: ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #-2, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_s16_m2, svint16x2_t, int16_t, ++ z0 = svld2_s16 (p0, x0 - svcnth () * 2), ++ z0 = svld2 (p0, x0 - svcnth () * 2)) ++ ++/* ++** ld2_s16_m16: ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #-16, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_s16_m16, svint16x2_t, int16_t, ++ z0 = svld2_s16 (p0, x0 - svcnth () * 16), ++ z0 = svld2 (p0, x0 - svcnth () * 16)) ++ ++/* ++** ld2_s16_m18: ++** addvl (x[0-9]+), x0, #-18 ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld2_s16_m18, svint16x2_t, int16_t, ++ z0 = svld2_s16 (p0, x0 - svcnth () * 18), ++ z0 = svld2 (p0, x0 - svcnth () * 18)) ++ ++/* ++** ld2_vnum_s16_0: ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_s16_0, svint16x2_t, int16_t, ++ z0 = svld2_vnum_s16 (p0, x0, 0), ++ z0 = svld2_vnum (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld2_vnum_s16_1: ++** incb x0 ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_s16_1, svint16x2_t, int16_t, ++ z0 = svld2_vnum_s16 (p0, x0, 1), ++ z0 = svld2_vnum (p0, x0, 1)) ++ ++/* ++** ld2_vnum_s16_2: ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #2, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_s16_2, svint16x2_t, int16_t, ++ z0 = svld2_vnum_s16 (p0, x0, 2), ++ z0 = svld2_vnum (p0, x0, 2)) ++ ++/* ++** ld2_vnum_s16_14: ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #14, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_s16_14, svint16x2_t, int16_t, ++ z0 = svld2_vnum_s16 (p0, x0, 14), ++ z0 = svld2_vnum (p0, x0, 14)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_vnum_s16_16: ++** incb x0, all, mul #16 ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_s16_16, svint16x2_t, int16_t, ++ z0 = svld2_vnum_s16 (p0, x0, 16), ++ z0 = svld2_vnum (p0, x0, 16)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_vnum_s16_m1: ++** decb x0 ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_s16_m1, svint16x2_t, int16_t, ++ z0 = svld2_vnum_s16 (p0, x0, -1), ++ z0 = svld2_vnum (p0, x0, -1)) ++ ++/* ++** ld2_vnum_s16_m2: ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #-2, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_s16_m2, svint16x2_t, int16_t, ++ z0 = svld2_vnum_s16 (p0, x0, -2), ++ z0 = svld2_vnum (p0, x0, -2)) ++ ++/* ++** ld2_vnum_s16_m16: ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #-16, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_s16_m16, svint16x2_t, int16_t, ++ z0 = svld2_vnum_s16 (p0, x0, -16), ++ z0 = svld2_vnum (p0, x0, -16)) ++ ++/* ++** ld2_vnum_s16_m18: ++** addvl (x[0-9]+), x0, #-18 ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_s16_m18, svint16x2_t, int16_t, ++ z0 = svld2_vnum_s16 (p0, x0, -18), ++ z0 = svld2_vnum (p0, x0, -18)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ld2_vnum_s16_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_s16_x1, svint16x2_t, int16_t, ++ z0 = svld2_vnum_s16 (p0, x0, x1), ++ z0 = svld2_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_s32.c +new file mode 100644 +index 000000000..10913c2d0 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_s32.c +@@ -0,0 +1,200 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld2_s32_base: ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_s32_base, svint32x2_t, int32_t, ++ z0 = svld2_s32 (p0, x0), ++ z0 = svld2 (p0, x0)) ++ ++/* ++** ld2_s32_index: ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_LOAD (ld2_s32_index, svint32x2_t, int32_t, ++ z0 = svld2_s32 (p0, x0 + x1), ++ z0 = svld2 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld2_s32_1: ++** incb x0 ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_s32_1, svint32x2_t, int32_t, ++ z0 = svld2_s32 (p0, x0 + svcntw ()), ++ z0 = svld2 (p0, x0 + svcntw ())) ++ ++/* ++** ld2_s32_2: ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, #2, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_s32_2, svint32x2_t, int32_t, ++ z0 = svld2_s32 (p0, x0 + svcntw () * 2), ++ z0 = svld2 (p0, x0 + svcntw () * 2)) ++ ++/* ++** ld2_s32_14: ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, #14, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_s32_14, svint32x2_t, int32_t, ++ z0 = svld2_s32 (p0, x0 + svcntw () * 14), ++ z0 = svld2 (p0, x0 + svcntw () * 14)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_s32_16: ++** incb x0, all, mul #16 ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_s32_16, svint32x2_t, int32_t, ++ z0 = svld2_s32 (p0, x0 + svcntw () * 16), ++ z0 = svld2 (p0, x0 + svcntw () * 16)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_s32_m1: ++** decb x0 ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_s32_m1, svint32x2_t, int32_t, ++ z0 = svld2_s32 (p0, x0 - svcntw ()), ++ z0 = svld2 (p0, x0 - svcntw ())) ++ ++/* ++** ld2_s32_m2: ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, #-2, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_s32_m2, svint32x2_t, int32_t, ++ z0 = svld2_s32 (p0, x0 - svcntw () * 2), ++ z0 = svld2 (p0, x0 - svcntw () * 2)) ++ ++/* ++** ld2_s32_m16: ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, #-16, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_s32_m16, svint32x2_t, int32_t, ++ z0 = svld2_s32 (p0, x0 - svcntw () * 16), ++ z0 = svld2 (p0, x0 - svcntw () * 16)) ++ ++/* ++** ld2_s32_m18: ++** addvl (x[0-9]+), x0, #-18 ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld2_s32_m18, svint32x2_t, int32_t, ++ z0 = svld2_s32 (p0, x0 - svcntw () * 18), ++ z0 = svld2 (p0, x0 - svcntw () * 18)) ++ ++/* ++** ld2_vnum_s32_0: ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_s32_0, svint32x2_t, int32_t, ++ z0 = svld2_vnum_s32 (p0, x0, 0), ++ z0 = svld2_vnum (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_vnum_s32_1: ++** incb x0 ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_s32_1, svint32x2_t, int32_t, ++ z0 = svld2_vnum_s32 (p0, x0, 1), ++ z0 = svld2_vnum (p0, x0, 1)) ++ ++/* ++** ld2_vnum_s32_2: ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, #2, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_s32_2, svint32x2_t, int32_t, ++ z0 = svld2_vnum_s32 (p0, x0, 2), ++ z0 = svld2_vnum (p0, x0, 2)) ++ ++/* ++** ld2_vnum_s32_14: ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, #14, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_s32_14, svint32x2_t, int32_t, ++ z0 = svld2_vnum_s32 (p0, x0, 14), ++ z0 = svld2_vnum (p0, x0, 14)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_vnum_s32_16: ++** incb x0, all, mul #16 ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_s32_16, svint32x2_t, int32_t, ++ z0 = svld2_vnum_s32 (p0, x0, 16), ++ z0 = svld2_vnum (p0, x0, 16)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld2_vnum_s32_m1: ++** decb x0 ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_s32_m1, svint32x2_t, int32_t, ++ z0 = svld2_vnum_s32 (p0, x0, -1), ++ z0 = svld2_vnum (p0, x0, -1)) ++ ++/* ++** ld2_vnum_s32_m2: ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, #-2, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_s32_m2, svint32x2_t, int32_t, ++ z0 = svld2_vnum_s32 (p0, x0, -2), ++ z0 = svld2_vnum (p0, x0, -2)) ++ ++/* ++** ld2_vnum_s32_m16: ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, #-16, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_s32_m16, svint32x2_t, int32_t, ++ z0 = svld2_vnum_s32 (p0, x0, -16), ++ z0 = svld2_vnum (p0, x0, -16)) ++ ++/* ++** ld2_vnum_s32_m18: ++** addvl (x[0-9]+), x0, #-18 ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_s32_m18, svint32x2_t, int32_t, ++ z0 = svld2_vnum_s32 (p0, x0, -18), ++ z0 = svld2_vnum (p0, x0, -18)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ld2_vnum_s32_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_s32_x1, svint32x2_t, int32_t, ++ z0 = svld2_vnum_s32 (p0, x0, x1), ++ z0 = svld2_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_s64.c +new file mode 100644 +index 000000000..9a43e86d5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_s64.c +@@ -0,0 +1,200 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld2_s64_base: ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_s64_base, svint64x2_t, int64_t, ++ z0 = svld2_s64 (p0, x0), ++ z0 = svld2 (p0, x0)) ++ ++/* ++** ld2_s64_index: ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, x1, lsl 3\] ++** ret ++*/ ++TEST_LOAD (ld2_s64_index, svint64x2_t, int64_t, ++ z0 = svld2_s64 (p0, x0 + x1), ++ z0 = svld2 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_s64_1: ++** incb x0 ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_s64_1, svint64x2_t, int64_t, ++ z0 = svld2_s64 (p0, x0 + svcntd ()), ++ z0 = svld2 (p0, x0 + svcntd ())) ++ ++/* ++** ld2_s64_2: ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, #2, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_s64_2, svint64x2_t, int64_t, ++ z0 = svld2_s64 (p0, x0 + svcntd () * 2), ++ z0 = svld2 (p0, x0 + svcntd () * 2)) ++ ++/* ++** ld2_s64_14: ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, #14, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_s64_14, svint64x2_t, int64_t, ++ z0 = svld2_s64 (p0, x0 + svcntd () * 14), ++ z0 = svld2 (p0, x0 + svcntd () * 14)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_s64_16: ++** incb x0, all, mul #16 ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_s64_16, svint64x2_t, int64_t, ++ z0 = svld2_s64 (p0, x0 + svcntd () * 16), ++ z0 = svld2 (p0, x0 + svcntd () * 16)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld2_s64_m1: ++** decb x0 ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_s64_m1, svint64x2_t, int64_t, ++ z0 = svld2_s64 (p0, x0 - svcntd ()), ++ z0 = svld2 (p0, x0 - svcntd ())) ++ ++/* ++** ld2_s64_m2: ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, #-2, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_s64_m2, svint64x2_t, int64_t, ++ z0 = svld2_s64 (p0, x0 - svcntd () * 2), ++ z0 = svld2 (p0, x0 - svcntd () * 2)) ++ ++/* ++** ld2_s64_m16: ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, #-16, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_s64_m16, svint64x2_t, int64_t, ++ z0 = svld2_s64 (p0, x0 - svcntd () * 16), ++ z0 = svld2 (p0, x0 - svcntd () * 16)) ++ ++/* ++** ld2_s64_m18: ++** addvl (x[0-9]+), x0, #-18 ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld2_s64_m18, svint64x2_t, int64_t, ++ z0 = svld2_s64 (p0, x0 - svcntd () * 18), ++ z0 = svld2 (p0, x0 - svcntd () * 18)) ++ ++/* ++** ld2_vnum_s64_0: ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_s64_0, svint64x2_t, int64_t, ++ z0 = svld2_vnum_s64 (p0, x0, 0), ++ z0 = svld2_vnum (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_vnum_s64_1: ++** incb x0 ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_s64_1, svint64x2_t, int64_t, ++ z0 = svld2_vnum_s64 (p0, x0, 1), ++ z0 = svld2_vnum (p0, x0, 1)) ++ ++/* ++** ld2_vnum_s64_2: ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, #2, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_s64_2, svint64x2_t, int64_t, ++ z0 = svld2_vnum_s64 (p0, x0, 2), ++ z0 = svld2_vnum (p0, x0, 2)) ++ ++/* ++** ld2_vnum_s64_14: ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, #14, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_s64_14, svint64x2_t, int64_t, ++ z0 = svld2_vnum_s64 (p0, x0, 14), ++ z0 = svld2_vnum (p0, x0, 14)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_vnum_s64_16: ++** incb x0, all, mul #16 ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_s64_16, svint64x2_t, int64_t, ++ z0 = svld2_vnum_s64 (p0, x0, 16), ++ z0 = svld2_vnum (p0, x0, 16)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_vnum_s64_m1: ++** decb x0 ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_s64_m1, svint64x2_t, int64_t, ++ z0 = svld2_vnum_s64 (p0, x0, -1), ++ z0 = svld2_vnum (p0, x0, -1)) ++ ++/* ++** ld2_vnum_s64_m2: ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, #-2, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_s64_m2, svint64x2_t, int64_t, ++ z0 = svld2_vnum_s64 (p0, x0, -2), ++ z0 = svld2_vnum (p0, x0, -2)) ++ ++/* ++** ld2_vnum_s64_m16: ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, #-16, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_s64_m16, svint64x2_t, int64_t, ++ z0 = svld2_vnum_s64 (p0, x0, -16), ++ z0 = svld2_vnum (p0, x0, -16)) ++ ++/* ++** ld2_vnum_s64_m18: ++** addvl (x[0-9]+), x0, #-18 ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_s64_m18, svint64x2_t, int64_t, ++ z0 = svld2_vnum_s64 (p0, x0, -18), ++ z0 = svld2_vnum (p0, x0, -18)) ++ ++/* Using MUL to calculate an index would also be OK. 
*/ ++/* ++** ld2_vnum_s64_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_s64_x1, svint64x2_t, int64_t, ++ z0 = svld2_vnum_s64 (p0, x0, x1), ++ z0 = svld2_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_s8.c +new file mode 100644 +index 000000000..af5c04c66 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_s8.c +@@ -0,0 +1,204 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld2_s8_base: ++** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_s8_base, svint8x2_t, int8_t, ++ z0 = svld2_s8 (p0, x0), ++ z0 = svld2 (p0, x0)) ++ ++/* ++** ld2_s8_index: ++** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0, x1\] ++** ret ++*/ ++TEST_LOAD (ld2_s8_index, svint8x2_t, int8_t, ++ z0 = svld2_s8 (p0, x0 + x1), ++ z0 = svld2 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_s8_1: ++** incb x0 ++** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_s8_1, svint8x2_t, int8_t, ++ z0 = svld2_s8 (p0, x0 + svcntb ()), ++ z0 = svld2 (p0, x0 + svcntb ())) ++ ++/* ++** ld2_s8_2: ++** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0, #2, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_s8_2, svint8x2_t, int8_t, ++ z0 = svld2_s8 (p0, x0 + svcntb () * 2), ++ z0 = svld2 (p0, x0 + svcntb () * 2)) ++ ++/* ++** ld2_s8_14: ++** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0, #14, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_s8_14, svint8x2_t, int8_t, ++ z0 = svld2_s8 (p0, x0 + svcntb () * 14), ++ z0 = svld2 (p0, x0 + svcntb () * 14)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_s8_16: ++** incb x0, all, mul #16 ++** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_s8_16, svint8x2_t, int8_t, ++ z0 = svld2_s8 (p0, x0 + svcntb () * 16), ++ z0 = svld2 (p0, x0 + svcntb () * 16)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_s8_m1: ++** decb x0 ++** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_s8_m1, svint8x2_t, int8_t, ++ z0 = svld2_s8 (p0, x0 - svcntb ()), ++ z0 = svld2 (p0, x0 - svcntb ())) ++ ++/* ++** ld2_s8_m2: ++** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0, #-2, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_s8_m2, svint8x2_t, int8_t, ++ z0 = svld2_s8 (p0, x0 - svcntb () * 2), ++ z0 = svld2 (p0, x0 - svcntb () * 2)) ++ ++/* ++** ld2_s8_m16: ++** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0, #-16, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_s8_m16, svint8x2_t, int8_t, ++ z0 = svld2_s8 (p0, x0 - svcntb () * 16), ++ z0 = svld2 (p0, x0 - svcntb () * 16)) ++ ++/* ++** ld2_s8_m18: ++** addvl (x[0-9]+), x0, #-18 ++** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld2_s8_m18, svint8x2_t, int8_t, ++ z0 = svld2_s8 (p0, x0 - svcntb () * 18), ++ z0 = svld2 (p0, x0 - svcntb () * 18)) ++ ++/* ++** ld2_vnum_s8_0: ++** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_s8_0, svint8x2_t, int8_t, ++ z0 = svld2_vnum_s8 (p0, x0, 0), ++ z0 = svld2_vnum (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld2_vnum_s8_1: ++** incb x0 ++** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_s8_1, svint8x2_t, int8_t, ++ z0 = svld2_vnum_s8 (p0, x0, 1), ++ z0 = svld2_vnum (p0, x0, 1)) ++ ++/* ++** ld2_vnum_s8_2: ++** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0, #2, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_s8_2, svint8x2_t, int8_t, ++ z0 = svld2_vnum_s8 (p0, x0, 2), ++ z0 = svld2_vnum (p0, x0, 2)) ++ ++/* ++** ld2_vnum_s8_14: ++** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0, #14, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_s8_14, svint8x2_t, int8_t, ++ z0 = svld2_vnum_s8 (p0, x0, 14), ++ z0 = svld2_vnum (p0, x0, 14)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_vnum_s8_16: ++** incb x0, all, mul #16 ++** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_s8_16, svint8x2_t, int8_t, ++ z0 = svld2_vnum_s8 (p0, x0, 16), ++ z0 = svld2_vnum (p0, x0, 16)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_vnum_s8_m1: ++** decb x0 ++** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_s8_m1, svint8x2_t, int8_t, ++ z0 = svld2_vnum_s8 (p0, x0, -1), ++ z0 = svld2_vnum (p0, x0, -1)) ++ ++/* ++** ld2_vnum_s8_m2: ++** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0, #-2, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_s8_m2, svint8x2_t, int8_t, ++ z0 = svld2_vnum_s8 (p0, x0, -2), ++ z0 = svld2_vnum (p0, x0, -2)) ++ ++/* ++** ld2_vnum_s8_m16: ++** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0, #-16, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_s8_m16, svint8x2_t, int8_t, ++ z0 = svld2_vnum_s8 (p0, x0, -16), ++ z0 = svld2_vnum (p0, x0, -16)) ++ ++/* ++** ld2_vnum_s8_m18: ++** addvl (x[0-9]+), x0, #-18 ++** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_s8_m18, svint8x2_t, int8_t, ++ z0 = svld2_vnum_s8 (p0, x0, -18), ++ z0 = svld2_vnum (p0, x0, -18)) ++ ++/* ++** ld2_vnum_s8_x1: ++** cntb (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_LOAD (ld2_vnum_s8_x1, svint8x2_t, int8_t, ++ z0 = svld2_vnum_s8 (p0, x0, x1), ++ z0 = svld2_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_u16.c +new file mode 100644 +index 000000000..6c33322c1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_u16.c +@@ -0,0 +1,200 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld2_u16_base: ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_u16_base, svuint16x2_t, uint16_t, ++ z0 = svld2_u16 (p0, x0), ++ z0 = svld2 (p0, x0)) ++ ++/* ++** ld2_u16_index: ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_LOAD (ld2_u16_index, svuint16x2_t, uint16_t, ++ z0 = svld2_u16 (p0, x0 + x1), ++ z0 = svld2 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld2_u16_1: ++** incb x0 ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_u16_1, svuint16x2_t, uint16_t, ++ z0 = svld2_u16 (p0, x0 + svcnth ()), ++ z0 = svld2 (p0, x0 + svcnth ())) ++ ++/* ++** ld2_u16_2: ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #2, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_u16_2, svuint16x2_t, uint16_t, ++ z0 = svld2_u16 (p0, x0 + svcnth () * 2), ++ z0 = svld2 (p0, x0 + svcnth () * 2)) ++ ++/* ++** ld2_u16_14: ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #14, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_u16_14, svuint16x2_t, uint16_t, ++ z0 = svld2_u16 (p0, x0 + svcnth () * 14), ++ z0 = svld2 (p0, x0 + svcnth () * 14)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_u16_16: ++** incb x0, all, mul #16 ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_u16_16, svuint16x2_t, uint16_t, ++ z0 = svld2_u16 (p0, x0 + svcnth () * 16), ++ z0 = svld2 (p0, x0 + svcnth () * 16)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_u16_m1: ++** decb x0 ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_u16_m1, svuint16x2_t, uint16_t, ++ z0 = svld2_u16 (p0, x0 - svcnth ()), ++ z0 = svld2 (p0, x0 - svcnth ())) ++ ++/* ++** ld2_u16_m2: ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #-2, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_u16_m2, svuint16x2_t, uint16_t, ++ z0 = svld2_u16 (p0, x0 - svcnth () * 2), ++ z0 = svld2 (p0, x0 - svcnth () * 2)) ++ ++/* ++** ld2_u16_m16: ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #-16, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_u16_m16, svuint16x2_t, uint16_t, ++ z0 = svld2_u16 (p0, x0 - svcnth () * 16), ++ z0 = svld2 (p0, x0 - svcnth () * 16)) ++ ++/* ++** ld2_u16_m18: ++** addvl (x[0-9]+), x0, #-18 ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld2_u16_m18, svuint16x2_t, uint16_t, ++ z0 = svld2_u16 (p0, x0 - svcnth () * 18), ++ z0 = svld2 (p0, x0 - svcnth () * 18)) ++ ++/* ++** ld2_vnum_u16_0: ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_u16_0, svuint16x2_t, uint16_t, ++ z0 = svld2_vnum_u16 (p0, x0, 0), ++ z0 = svld2_vnum (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_vnum_u16_1: ++** incb x0 ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_u16_1, svuint16x2_t, uint16_t, ++ z0 = svld2_vnum_u16 (p0, x0, 1), ++ z0 = svld2_vnum (p0, x0, 1)) ++ ++/* ++** ld2_vnum_u16_2: ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #2, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_u16_2, svuint16x2_t, uint16_t, ++ z0 = svld2_vnum_u16 (p0, x0, 2), ++ z0 = svld2_vnum (p0, x0, 2)) ++ ++/* ++** ld2_vnum_u16_14: ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #14, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_u16_14, svuint16x2_t, uint16_t, ++ z0 = svld2_vnum_u16 (p0, x0, 14), ++ z0 = svld2_vnum (p0, x0, 14)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_vnum_u16_16: ++** incb x0, all, mul #16 ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_u16_16, svuint16x2_t, uint16_t, ++ z0 = svld2_vnum_u16 (p0, x0, 16), ++ z0 = svld2_vnum (p0, x0, 16)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld2_vnum_u16_m1: ++** decb x0 ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_u16_m1, svuint16x2_t, uint16_t, ++ z0 = svld2_vnum_u16 (p0, x0, -1), ++ z0 = svld2_vnum (p0, x0, -1)) ++ ++/* ++** ld2_vnum_u16_m2: ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #-2, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_u16_m2, svuint16x2_t, uint16_t, ++ z0 = svld2_vnum_u16 (p0, x0, -2), ++ z0 = svld2_vnum (p0, x0, -2)) ++ ++/* ++** ld2_vnum_u16_m16: ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #-16, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_u16_m16, svuint16x2_t, uint16_t, ++ z0 = svld2_vnum_u16 (p0, x0, -16), ++ z0 = svld2_vnum (p0, x0, -16)) ++ ++/* ++** ld2_vnum_u16_m18: ++** addvl (x[0-9]+), x0, #-18 ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_u16_m18, svuint16x2_t, uint16_t, ++ z0 = svld2_vnum_u16 (p0, x0, -18), ++ z0 = svld2_vnum (p0, x0, -18)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ld2_vnum_u16_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_u16_x1, svuint16x2_t, uint16_t, ++ z0 = svld2_vnum_u16 (p0, x0, x1), ++ z0 = svld2_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_u32.c +new file mode 100644 +index 000000000..84a23cf47 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_u32.c +@@ -0,0 +1,200 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld2_u32_base: ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_u32_base, svuint32x2_t, uint32_t, ++ z0 = svld2_u32 (p0, x0), ++ z0 = svld2 (p0, x0)) ++ ++/* ++** ld2_u32_index: ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_LOAD (ld2_u32_index, svuint32x2_t, uint32_t, ++ z0 = svld2_u32 (p0, x0 + x1), ++ z0 = svld2 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_u32_1: ++** incb x0 ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_u32_1, svuint32x2_t, uint32_t, ++ z0 = svld2_u32 (p0, x0 + svcntw ()), ++ z0 = svld2 (p0, x0 + svcntw ())) ++ ++/* ++** ld2_u32_2: ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, #2, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_u32_2, svuint32x2_t, uint32_t, ++ z0 = svld2_u32 (p0, x0 + svcntw () * 2), ++ z0 = svld2 (p0, x0 + svcntw () * 2)) ++ ++/* ++** ld2_u32_14: ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, #14, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_u32_14, svuint32x2_t, uint32_t, ++ z0 = svld2_u32 (p0, x0 + svcntw () * 14), ++ z0 = svld2 (p0, x0 + svcntw () * 14)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_u32_16: ++** incb x0, all, mul #16 ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_u32_16, svuint32x2_t, uint32_t, ++ z0 = svld2_u32 (p0, x0 + svcntw () * 16), ++ z0 = svld2 (p0, x0 + svcntw () * 16)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld2_u32_m1: ++** decb x0 ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_u32_m1, svuint32x2_t, uint32_t, ++ z0 = svld2_u32 (p0, x0 - svcntw ()), ++ z0 = svld2 (p0, x0 - svcntw ())) ++ ++/* ++** ld2_u32_m2: ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, #-2, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_u32_m2, svuint32x2_t, uint32_t, ++ z0 = svld2_u32 (p0, x0 - svcntw () * 2), ++ z0 = svld2 (p0, x0 - svcntw () * 2)) ++ ++/* ++** ld2_u32_m16: ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, #-16, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_u32_m16, svuint32x2_t, uint32_t, ++ z0 = svld2_u32 (p0, x0 - svcntw () * 16), ++ z0 = svld2 (p0, x0 - svcntw () * 16)) ++ ++/* ++** ld2_u32_m18: ++** addvl (x[0-9]+), x0, #-18 ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld2_u32_m18, svuint32x2_t, uint32_t, ++ z0 = svld2_u32 (p0, x0 - svcntw () * 18), ++ z0 = svld2 (p0, x0 - svcntw () * 18)) ++ ++/* ++** ld2_vnum_u32_0: ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_u32_0, svuint32x2_t, uint32_t, ++ z0 = svld2_vnum_u32 (p0, x0, 0), ++ z0 = svld2_vnum (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_vnum_u32_1: ++** incb x0 ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_u32_1, svuint32x2_t, uint32_t, ++ z0 = svld2_vnum_u32 (p0, x0, 1), ++ z0 = svld2_vnum (p0, x0, 1)) ++ ++/* ++** ld2_vnum_u32_2: ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, #2, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_u32_2, svuint32x2_t, uint32_t, ++ z0 = svld2_vnum_u32 (p0, x0, 2), ++ z0 = svld2_vnum (p0, x0, 2)) ++ ++/* ++** ld2_vnum_u32_14: ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, #14, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_u32_14, svuint32x2_t, uint32_t, ++ z0 = svld2_vnum_u32 (p0, x0, 14), ++ z0 = svld2_vnum (p0, x0, 14)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_vnum_u32_16: ++** incb x0, all, mul #16 ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_u32_16, svuint32x2_t, uint32_t, ++ z0 = svld2_vnum_u32 (p0, x0, 16), ++ z0 = svld2_vnum (p0, x0, 16)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_vnum_u32_m1: ++** decb x0 ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_u32_m1, svuint32x2_t, uint32_t, ++ z0 = svld2_vnum_u32 (p0, x0, -1), ++ z0 = svld2_vnum (p0, x0, -1)) ++ ++/* ++** ld2_vnum_u32_m2: ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, #-2, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_u32_m2, svuint32x2_t, uint32_t, ++ z0 = svld2_vnum_u32 (p0, x0, -2), ++ z0 = svld2_vnum (p0, x0, -2)) ++ ++/* ++** ld2_vnum_u32_m16: ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, #-16, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_u32_m16, svuint32x2_t, uint32_t, ++ z0 = svld2_vnum_u32 (p0, x0, -16), ++ z0 = svld2_vnum (p0, x0, -16)) ++ ++/* ++** ld2_vnum_u32_m18: ++** addvl (x[0-9]+), x0, #-18 ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_u32_m18, svuint32x2_t, uint32_t, ++ z0 = svld2_vnum_u32 (p0, x0, -18), ++ z0 = svld2_vnum (p0, x0, -18)) ++ ++/* Using MUL to calculate an index would also be OK. 
*/ ++/* ++** ld2_vnum_u32_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_u32_x1, svuint32x2_t, uint32_t, ++ z0 = svld2_vnum_u32 (p0, x0, x1), ++ z0 = svld2_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_u64.c +new file mode 100644 +index 000000000..350b05792 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_u64.c +@@ -0,0 +1,200 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld2_u64_base: ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_u64_base, svuint64x2_t, uint64_t, ++ z0 = svld2_u64 (p0, x0), ++ z0 = svld2 (p0, x0)) ++ ++/* ++** ld2_u64_index: ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, x1, lsl 3\] ++** ret ++*/ ++TEST_LOAD (ld2_u64_index, svuint64x2_t, uint64_t, ++ z0 = svld2_u64 (p0, x0 + x1), ++ z0 = svld2 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_u64_1: ++** incb x0 ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_u64_1, svuint64x2_t, uint64_t, ++ z0 = svld2_u64 (p0, x0 + svcntd ()), ++ z0 = svld2 (p0, x0 + svcntd ())) ++ ++/* ++** ld2_u64_2: ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, #2, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_u64_2, svuint64x2_t, uint64_t, ++ z0 = svld2_u64 (p0, x0 + svcntd () * 2), ++ z0 = svld2 (p0, x0 + svcntd () * 2)) ++ ++/* ++** ld2_u64_14: ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, #14, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_u64_14, svuint64x2_t, uint64_t, ++ z0 = svld2_u64 (p0, x0 + svcntd () * 14), ++ z0 = svld2 (p0, x0 + svcntd () * 14)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_u64_16: ++** incb x0, all, mul #16 ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_u64_16, svuint64x2_t, uint64_t, ++ z0 = svld2_u64 (p0, x0 + svcntd () * 16), ++ z0 = svld2 (p0, x0 + svcntd () * 16)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_u64_m1: ++** decb x0 ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_u64_m1, svuint64x2_t, uint64_t, ++ z0 = svld2_u64 (p0, x0 - svcntd ()), ++ z0 = svld2 (p0, x0 - svcntd ())) ++ ++/* ++** ld2_u64_m2: ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, #-2, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_u64_m2, svuint64x2_t, uint64_t, ++ z0 = svld2_u64 (p0, x0 - svcntd () * 2), ++ z0 = svld2 (p0, x0 - svcntd () * 2)) ++ ++/* ++** ld2_u64_m16: ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, #-16, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_u64_m16, svuint64x2_t, uint64_t, ++ z0 = svld2_u64 (p0, x0 - svcntd () * 16), ++ z0 = svld2 (p0, x0 - svcntd () * 16)) ++ ++/* ++** ld2_u64_m18: ++** addvl (x[0-9]+), x0, #-18 ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld2_u64_m18, svuint64x2_t, uint64_t, ++ z0 = svld2_u64 (p0, x0 - svcntd () * 18), ++ z0 = svld2 (p0, x0 - svcntd () * 18)) ++ ++/* ++** ld2_vnum_u64_0: ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_u64_0, svuint64x2_t, uint64_t, ++ z0 = svld2_vnum_u64 (p0, x0, 0), ++ z0 = svld2_vnum (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld2_vnum_u64_1: ++** incb x0 ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_u64_1, svuint64x2_t, uint64_t, ++ z0 = svld2_vnum_u64 (p0, x0, 1), ++ z0 = svld2_vnum (p0, x0, 1)) ++ ++/* ++** ld2_vnum_u64_2: ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, #2, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_u64_2, svuint64x2_t, uint64_t, ++ z0 = svld2_vnum_u64 (p0, x0, 2), ++ z0 = svld2_vnum (p0, x0, 2)) ++ ++/* ++** ld2_vnum_u64_14: ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, #14, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_u64_14, svuint64x2_t, uint64_t, ++ z0 = svld2_vnum_u64 (p0, x0, 14), ++ z0 = svld2_vnum (p0, x0, 14)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_vnum_u64_16: ++** incb x0, all, mul #16 ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_u64_16, svuint64x2_t, uint64_t, ++ z0 = svld2_vnum_u64 (p0, x0, 16), ++ z0 = svld2_vnum (p0, x0, 16)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_vnum_u64_m1: ++** decb x0 ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_u64_m1, svuint64x2_t, uint64_t, ++ z0 = svld2_vnum_u64 (p0, x0, -1), ++ z0 = svld2_vnum (p0, x0, -1)) ++ ++/* ++** ld2_vnum_u64_m2: ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, #-2, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_u64_m2, svuint64x2_t, uint64_t, ++ z0 = svld2_vnum_u64 (p0, x0, -2), ++ z0 = svld2_vnum (p0, x0, -2)) ++ ++/* ++** ld2_vnum_u64_m16: ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, #-16, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_u64_m16, svuint64x2_t, uint64_t, ++ z0 = svld2_vnum_u64 (p0, x0, -16), ++ z0 = svld2_vnum (p0, x0, -16)) ++ ++/* ++** ld2_vnum_u64_m18: ++** addvl (x[0-9]+), x0, #-18 ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_u64_m18, svuint64x2_t, uint64_t, ++ z0 = svld2_vnum_u64 (p0, x0, -18), ++ z0 = svld2_vnum (p0, x0, -18)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ld2_vnum_u64_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_u64_x1, svuint64x2_t, uint64_t, ++ z0 = svld2_vnum_u64 (p0, x0, x1), ++ z0 = svld2_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_u8.c +new file mode 100644 +index 000000000..e67634c4c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_u8.c +@@ -0,0 +1,204 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld2_u8_base: ++** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_u8_base, svuint8x2_t, uint8_t, ++ z0 = svld2_u8 (p0, x0), ++ z0 = svld2 (p0, x0)) ++ ++/* ++** ld2_u8_index: ++** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0, x1\] ++** ret ++*/ ++TEST_LOAD (ld2_u8_index, svuint8x2_t, uint8_t, ++ z0 = svld2_u8 (p0, x0 + x1), ++ z0 = svld2 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld2_u8_1: ++** incb x0 ++** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_u8_1, svuint8x2_t, uint8_t, ++ z0 = svld2_u8 (p0, x0 + svcntb ()), ++ z0 = svld2 (p0, x0 + svcntb ())) ++ ++/* ++** ld2_u8_2: ++** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0, #2, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_u8_2, svuint8x2_t, uint8_t, ++ z0 = svld2_u8 (p0, x0 + svcntb () * 2), ++ z0 = svld2 (p0, x0 + svcntb () * 2)) ++ ++/* ++** ld2_u8_14: ++** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0, #14, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_u8_14, svuint8x2_t, uint8_t, ++ z0 = svld2_u8 (p0, x0 + svcntb () * 14), ++ z0 = svld2 (p0, x0 + svcntb () * 14)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_u8_16: ++** incb x0, all, mul #16 ++** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_u8_16, svuint8x2_t, uint8_t, ++ z0 = svld2_u8 (p0, x0 + svcntb () * 16), ++ z0 = svld2 (p0, x0 + svcntb () * 16)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_u8_m1: ++** decb x0 ++** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_u8_m1, svuint8x2_t, uint8_t, ++ z0 = svld2_u8 (p0, x0 - svcntb ()), ++ z0 = svld2 (p0, x0 - svcntb ())) ++ ++/* ++** ld2_u8_m2: ++** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0, #-2, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_u8_m2, svuint8x2_t, uint8_t, ++ z0 = svld2_u8 (p0, x0 - svcntb () * 2), ++ z0 = svld2 (p0, x0 - svcntb () * 2)) ++ ++/* ++** ld2_u8_m16: ++** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0, #-16, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_u8_m16, svuint8x2_t, uint8_t, ++ z0 = svld2_u8 (p0, x0 - svcntb () * 16), ++ z0 = svld2 (p0, x0 - svcntb () * 16)) ++ ++/* ++** ld2_u8_m18: ++** addvl (x[0-9]+), x0, #-18 ++** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld2_u8_m18, svuint8x2_t, uint8_t, ++ z0 = svld2_u8 (p0, x0 - svcntb () * 18), ++ z0 = svld2 (p0, x0 - svcntb () * 18)) ++ ++/* ++** ld2_vnum_u8_0: ++** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_u8_0, svuint8x2_t, uint8_t, ++ z0 = svld2_vnum_u8 (p0, x0, 0), ++ z0 = svld2_vnum (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_vnum_u8_1: ++** incb x0 ++** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_u8_1, svuint8x2_t, uint8_t, ++ z0 = svld2_vnum_u8 (p0, x0, 1), ++ z0 = svld2_vnum (p0, x0, 1)) ++ ++/* ++** ld2_vnum_u8_2: ++** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0, #2, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_u8_2, svuint8x2_t, uint8_t, ++ z0 = svld2_vnum_u8 (p0, x0, 2), ++ z0 = svld2_vnum (p0, x0, 2)) ++ ++/* ++** ld2_vnum_u8_14: ++** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0, #14, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_u8_14, svuint8x2_t, uint8_t, ++ z0 = svld2_vnum_u8 (p0, x0, 14), ++ z0 = svld2_vnum (p0, x0, 14)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld2_vnum_u8_16: ++** incb x0, all, mul #16 ++** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_u8_16, svuint8x2_t, uint8_t, ++ z0 = svld2_vnum_u8 (p0, x0, 16), ++ z0 = svld2_vnum (p0, x0, 16)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld2_vnum_u8_m1: ++** decb x0 ++** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_u8_m1, svuint8x2_t, uint8_t, ++ z0 = svld2_vnum_u8 (p0, x0, -1), ++ z0 = svld2_vnum (p0, x0, -1)) ++ ++/* ++** ld2_vnum_u8_m2: ++** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0, #-2, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_u8_m2, svuint8x2_t, uint8_t, ++ z0 = svld2_vnum_u8 (p0, x0, -2), ++ z0 = svld2_vnum (p0, x0, -2)) ++ ++/* ++** ld2_vnum_u8_m16: ++** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0, #-16, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_u8_m16, svuint8x2_t, uint8_t, ++ z0 = svld2_vnum_u8 (p0, x0, -16), ++ z0 = svld2_vnum (p0, x0, -16)) ++ ++/* ++** ld2_vnum_u8_m18: ++** addvl (x[0-9]+), x0, #-18 ++** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld2_vnum_u8_m18, svuint8x2_t, uint8_t, ++ z0 = svld2_vnum_u8 (p0, x0, -18), ++ z0 = svld2_vnum (p0, x0, -18)) ++ ++/* ++** ld2_vnum_u8_x1: ++** cntb (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_LOAD (ld2_vnum_u8_x1, svuint8x2_t, uint8_t, ++ z0 = svld2_vnum_u8 (p0, x0, x1), ++ z0 = svld2_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_bf16.c +new file mode 100644 +index 000000000..e0b4fb1af +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_bf16.c +@@ -0,0 +1,242 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld3_bf16_base: ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_bf16_base, svbfloat16x3_t, bfloat16_t, ++ z0 = svld3_bf16 (p0, x0), ++ z0 = svld3 (p0, x0)) ++ ++/* ++** ld3_bf16_index: ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_LOAD (ld3_bf16_index, svbfloat16x3_t, bfloat16_t, ++ z0 = svld3_bf16 (p0, x0 + x1), ++ z0 = svld3 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_bf16_1: ++** incb x0 ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_bf16_1, svbfloat16x3_t, bfloat16_t, ++ z0 = svld3_bf16 (p0, x0 + svcnth ()), ++ z0 = svld3 (p0, x0 + svcnth ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_bf16_2: ++** incb x0, all, mul #2 ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_bf16_2, svbfloat16x3_t, bfloat16_t, ++ z0 = svld3_bf16 (p0, x0 + svcnth () * 2), ++ z0 = svld3 (p0, x0 + svcnth () * 2)) ++ ++/* ++** ld3_bf16_3: ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #3, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_bf16_3, svbfloat16x3_t, bfloat16_t, ++ z0 = svld3_bf16 (p0, x0 + svcnth () * 3), ++ z0 = svld3 (p0, x0 + svcnth () * 3)) ++ ++/* ++** ld3_bf16_21: ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #21, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_bf16_21, svbfloat16x3_t, bfloat16_t, ++ z0 = svld3_bf16 (p0, x0 + svcnth () * 21), ++ z0 = svld3 (p0, x0 + svcnth () * 21)) ++ ++/* ++** ld3_bf16_24: ++** addvl (x[0-9]+), x0, #24 ++** ld3h {z0\.h - z2\.h}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld3_bf16_24, svbfloat16x3_t, bfloat16_t, ++ z0 = svld3_bf16 (p0, x0 + svcnth () * 24), ++ z0 = svld3 (p0, x0 + svcnth () * 24)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld3_bf16_m1: ++** decb x0 ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_bf16_m1, svbfloat16x3_t, bfloat16_t, ++ z0 = svld3_bf16 (p0, x0 - svcnth ()), ++ z0 = svld3 (p0, x0 - svcnth ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_bf16_m2: ++** decb x0, all, mul #2 ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_bf16_m2, svbfloat16x3_t, bfloat16_t, ++ z0 = svld3_bf16 (p0, x0 - svcnth () * 2), ++ z0 = svld3 (p0, x0 - svcnth () * 2)) ++ ++/* ++** ld3_bf16_m3: ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #-3, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_bf16_m3, svbfloat16x3_t, bfloat16_t, ++ z0 = svld3_bf16 (p0, x0 - svcnth () * 3), ++ z0 = svld3 (p0, x0 - svcnth () * 3)) ++ ++/* ++** ld3_bf16_m24: ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #-24, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_bf16_m24, svbfloat16x3_t, bfloat16_t, ++ z0 = svld3_bf16 (p0, x0 - svcnth () * 24), ++ z0 = svld3 (p0, x0 - svcnth () * 24)) ++ ++/* ++** ld3_bf16_m27: ++** addvl (x[0-9]+), x0, #-27 ++** ld3h {z0\.h - z2\.h}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld3_bf16_m27, svbfloat16x3_t, bfloat16_t, ++ z0 = svld3_bf16 (p0, x0 - svcnth () * 27), ++ z0 = svld3 (p0, x0 - svcnth () * 27)) ++ ++/* ++** ld3_vnum_bf16_0: ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_bf16_0, svbfloat16x3_t, bfloat16_t, ++ z0 = svld3_vnum_bf16 (p0, x0, 0), ++ z0 = svld3_vnum (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_vnum_bf16_1: ++** incb x0 ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_bf16_1, svbfloat16x3_t, bfloat16_t, ++ z0 = svld3_vnum_bf16 (p0, x0, 1), ++ z0 = svld3_vnum (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_vnum_bf16_2: ++** incb x0, all, mul #2 ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_bf16_2, svbfloat16x3_t, bfloat16_t, ++ z0 = svld3_vnum_bf16 (p0, x0, 2), ++ z0 = svld3_vnum (p0, x0, 2)) ++ ++/* ++** ld3_vnum_bf16_3: ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #3, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_bf16_3, svbfloat16x3_t, bfloat16_t, ++ z0 = svld3_vnum_bf16 (p0, x0, 3), ++ z0 = svld3_vnum (p0, x0, 3)) ++ ++/* ++** ld3_vnum_bf16_21: ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #21, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_bf16_21, svbfloat16x3_t, bfloat16_t, ++ z0 = svld3_vnum_bf16 (p0, x0, 21), ++ z0 = svld3_vnum (p0, x0, 21)) ++ ++/* ++** ld3_vnum_bf16_24: ++** addvl (x[0-9]+), x0, #24 ++** ld3h {z0\.h - z2\.h}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_bf16_24, svbfloat16x3_t, bfloat16_t, ++ z0 = svld3_vnum_bf16 (p0, x0, 24), ++ z0 = svld3_vnum (p0, x0, 24)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_vnum_bf16_m1: ++** decb x0 ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_bf16_m1, svbfloat16x3_t, bfloat16_t, ++ z0 = svld3_vnum_bf16 (p0, x0, -1), ++ z0 = svld3_vnum (p0, x0, -1)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld3_vnum_bf16_m2: ++** decb x0, all, mul #2 ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_bf16_m2, svbfloat16x3_t, bfloat16_t, ++ z0 = svld3_vnum_bf16 (p0, x0, -2), ++ z0 = svld3_vnum (p0, x0, -2)) ++ ++/* ++** ld3_vnum_bf16_m3: ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #-3, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_bf16_m3, svbfloat16x3_t, bfloat16_t, ++ z0 = svld3_vnum_bf16 (p0, x0, -3), ++ z0 = svld3_vnum (p0, x0, -3)) ++ ++/* ++** ld3_vnum_bf16_m24: ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #-24, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_bf16_m24, svbfloat16x3_t, bfloat16_t, ++ z0 = svld3_vnum_bf16 (p0, x0, -24), ++ z0 = svld3_vnum (p0, x0, -24)) ++ ++/* ++** ld3_vnum_bf16_m27: ++** addvl (x[0-9]+), x0, #-27 ++** ld3h {z0\.h - z2\.h}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_bf16_m27, svbfloat16x3_t, bfloat16_t, ++ z0 = svld3_vnum_bf16 (p0, x0, -27), ++ z0 = svld3_vnum (p0, x0, -27)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ld3_vnum_bf16_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld3h {z0\.h - z2\.h}, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_bf16_x1, svbfloat16x3_t, bfloat16_t, ++ z0 = svld3_vnum_bf16 (p0, x0, x1), ++ z0 = svld3_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_f16.c +new file mode 100644 +index 000000000..3d7777e52 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_f16.c +@@ -0,0 +1,242 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld3_f16_base: ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_f16_base, svfloat16x3_t, float16_t, ++ z0 = svld3_f16 (p0, x0), ++ z0 = svld3 (p0, x0)) ++ ++/* ++** ld3_f16_index: ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_LOAD (ld3_f16_index, svfloat16x3_t, float16_t, ++ z0 = svld3_f16 (p0, x0 + x1), ++ z0 = svld3 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_f16_1: ++** incb x0 ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_f16_1, svfloat16x3_t, float16_t, ++ z0 = svld3_f16 (p0, x0 + svcnth ()), ++ z0 = svld3 (p0, x0 + svcnth ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_f16_2: ++** incb x0, all, mul #2 ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_f16_2, svfloat16x3_t, float16_t, ++ z0 = svld3_f16 (p0, x0 + svcnth () * 2), ++ z0 = svld3 (p0, x0 + svcnth () * 2)) ++ ++/* ++** ld3_f16_3: ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #3, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_f16_3, svfloat16x3_t, float16_t, ++ z0 = svld3_f16 (p0, x0 + svcnth () * 3), ++ z0 = svld3 (p0, x0 + svcnth () * 3)) ++ ++/* ++** ld3_f16_21: ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #21, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_f16_21, svfloat16x3_t, float16_t, ++ z0 = svld3_f16 (p0, x0 + svcnth () * 21), ++ z0 = svld3 (p0, x0 + svcnth () * 21)) ++ ++/* ++** ld3_f16_24: ++** addvl (x[0-9]+), x0, #24 ++** ld3h {z0\.h - z2\.h}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld3_f16_24, svfloat16x3_t, float16_t, ++ z0 = svld3_f16 (p0, x0 + svcnth () * 24), ++ z0 = svld3 (p0, x0 + svcnth () * 24)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld3_f16_m1: ++** decb x0 ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_f16_m1, svfloat16x3_t, float16_t, ++ z0 = svld3_f16 (p0, x0 - svcnth ()), ++ z0 = svld3 (p0, x0 - svcnth ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_f16_m2: ++** decb x0, all, mul #2 ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_f16_m2, svfloat16x3_t, float16_t, ++ z0 = svld3_f16 (p0, x0 - svcnth () * 2), ++ z0 = svld3 (p0, x0 - svcnth () * 2)) ++ ++/* ++** ld3_f16_m3: ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #-3, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_f16_m3, svfloat16x3_t, float16_t, ++ z0 = svld3_f16 (p0, x0 - svcnth () * 3), ++ z0 = svld3 (p0, x0 - svcnth () * 3)) ++ ++/* ++** ld3_f16_m24: ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #-24, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_f16_m24, svfloat16x3_t, float16_t, ++ z0 = svld3_f16 (p0, x0 - svcnth () * 24), ++ z0 = svld3 (p0, x0 - svcnth () * 24)) ++ ++/* ++** ld3_f16_m27: ++** addvl (x[0-9]+), x0, #-27 ++** ld3h {z0\.h - z2\.h}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld3_f16_m27, svfloat16x3_t, float16_t, ++ z0 = svld3_f16 (p0, x0 - svcnth () * 27), ++ z0 = svld3 (p0, x0 - svcnth () * 27)) ++ ++/* ++** ld3_vnum_f16_0: ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_f16_0, svfloat16x3_t, float16_t, ++ z0 = svld3_vnum_f16 (p0, x0, 0), ++ z0 = svld3_vnum (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_vnum_f16_1: ++** incb x0 ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_f16_1, svfloat16x3_t, float16_t, ++ z0 = svld3_vnum_f16 (p0, x0, 1), ++ z0 = svld3_vnum (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_vnum_f16_2: ++** incb x0, all, mul #2 ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_f16_2, svfloat16x3_t, float16_t, ++ z0 = svld3_vnum_f16 (p0, x0, 2), ++ z0 = svld3_vnum (p0, x0, 2)) ++ ++/* ++** ld3_vnum_f16_3: ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #3, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_f16_3, svfloat16x3_t, float16_t, ++ z0 = svld3_vnum_f16 (p0, x0, 3), ++ z0 = svld3_vnum (p0, x0, 3)) ++ ++/* ++** ld3_vnum_f16_21: ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #21, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_f16_21, svfloat16x3_t, float16_t, ++ z0 = svld3_vnum_f16 (p0, x0, 21), ++ z0 = svld3_vnum (p0, x0, 21)) ++ ++/* ++** ld3_vnum_f16_24: ++** addvl (x[0-9]+), x0, #24 ++** ld3h {z0\.h - z2\.h}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_f16_24, svfloat16x3_t, float16_t, ++ z0 = svld3_vnum_f16 (p0, x0, 24), ++ z0 = svld3_vnum (p0, x0, 24)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_vnum_f16_m1: ++** decb x0 ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_f16_m1, svfloat16x3_t, float16_t, ++ z0 = svld3_vnum_f16 (p0, x0, -1), ++ z0 = svld3_vnum (p0, x0, -1)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld3_vnum_f16_m2: ++** decb x0, all, mul #2 ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_f16_m2, svfloat16x3_t, float16_t, ++ z0 = svld3_vnum_f16 (p0, x0, -2), ++ z0 = svld3_vnum (p0, x0, -2)) ++ ++/* ++** ld3_vnum_f16_m3: ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #-3, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_f16_m3, svfloat16x3_t, float16_t, ++ z0 = svld3_vnum_f16 (p0, x0, -3), ++ z0 = svld3_vnum (p0, x0, -3)) ++ ++/* ++** ld3_vnum_f16_m24: ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #-24, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_f16_m24, svfloat16x3_t, float16_t, ++ z0 = svld3_vnum_f16 (p0, x0, -24), ++ z0 = svld3_vnum (p0, x0, -24)) ++ ++/* ++** ld3_vnum_f16_m27: ++** addvl (x[0-9]+), x0, #-27 ++** ld3h {z0\.h - z2\.h}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_f16_m27, svfloat16x3_t, float16_t, ++ z0 = svld3_vnum_f16 (p0, x0, -27), ++ z0 = svld3_vnum (p0, x0, -27)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ld3_vnum_f16_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld3h {z0\.h - z2\.h}, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_f16_x1, svfloat16x3_t, float16_t, ++ z0 = svld3_vnum_f16 (p0, x0, x1), ++ z0 = svld3_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_f32.c +new file mode 100644 +index 000000000..4e4ad7521 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_f32.c +@@ -0,0 +1,242 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld3_f32_base: ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_f32_base, svfloat32x3_t, float32_t, ++ z0 = svld3_f32 (p0, x0), ++ z0 = svld3 (p0, x0)) ++ ++/* ++** ld3_f32_index: ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_LOAD (ld3_f32_index, svfloat32x3_t, float32_t, ++ z0 = svld3_f32 (p0, x0 + x1), ++ z0 = svld3 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_f32_1: ++** incb x0 ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_f32_1, svfloat32x3_t, float32_t, ++ z0 = svld3_f32 (p0, x0 + svcntw ()), ++ z0 = svld3 (p0, x0 + svcntw ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_f32_2: ++** incb x0, all, mul #2 ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_f32_2, svfloat32x3_t, float32_t, ++ z0 = svld3_f32 (p0, x0 + svcntw () * 2), ++ z0 = svld3 (p0, x0 + svcntw () * 2)) ++ ++/* ++** ld3_f32_3: ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0, #3, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_f32_3, svfloat32x3_t, float32_t, ++ z0 = svld3_f32 (p0, x0 + svcntw () * 3), ++ z0 = svld3 (p0, x0 + svcntw () * 3)) ++ ++/* ++** ld3_f32_21: ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0, #21, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_f32_21, svfloat32x3_t, float32_t, ++ z0 = svld3_f32 (p0, x0 + svcntw () * 21), ++ z0 = svld3 (p0, x0 + svcntw () * 21)) ++ ++/* ++** ld3_f32_24: ++** addvl (x[0-9]+), x0, #24 ++** ld3w {z0\.s - z2\.s}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld3_f32_24, svfloat32x3_t, float32_t, ++ z0 = svld3_f32 (p0, x0 + svcntw () * 24), ++ z0 = svld3 (p0, x0 + svcntw () * 24)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld3_f32_m1: ++** decb x0 ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_f32_m1, svfloat32x3_t, float32_t, ++ z0 = svld3_f32 (p0, x0 - svcntw ()), ++ z0 = svld3 (p0, x0 - svcntw ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_f32_m2: ++** decb x0, all, mul #2 ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_f32_m2, svfloat32x3_t, float32_t, ++ z0 = svld3_f32 (p0, x0 - svcntw () * 2), ++ z0 = svld3 (p0, x0 - svcntw () * 2)) ++ ++/* ++** ld3_f32_m3: ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0, #-3, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_f32_m3, svfloat32x3_t, float32_t, ++ z0 = svld3_f32 (p0, x0 - svcntw () * 3), ++ z0 = svld3 (p0, x0 - svcntw () * 3)) ++ ++/* ++** ld3_f32_m24: ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0, #-24, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_f32_m24, svfloat32x3_t, float32_t, ++ z0 = svld3_f32 (p0, x0 - svcntw () * 24), ++ z0 = svld3 (p0, x0 - svcntw () * 24)) ++ ++/* ++** ld3_f32_m27: ++** addvl (x[0-9]+), x0, #-27 ++** ld3w {z0\.s - z2\.s}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld3_f32_m27, svfloat32x3_t, float32_t, ++ z0 = svld3_f32 (p0, x0 - svcntw () * 27), ++ z0 = svld3 (p0, x0 - svcntw () * 27)) ++ ++/* ++** ld3_vnum_f32_0: ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_f32_0, svfloat32x3_t, float32_t, ++ z0 = svld3_vnum_f32 (p0, x0, 0), ++ z0 = svld3_vnum (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_vnum_f32_1: ++** incb x0 ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_f32_1, svfloat32x3_t, float32_t, ++ z0 = svld3_vnum_f32 (p0, x0, 1), ++ z0 = svld3_vnum (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_vnum_f32_2: ++** incb x0, all, mul #2 ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_f32_2, svfloat32x3_t, float32_t, ++ z0 = svld3_vnum_f32 (p0, x0, 2), ++ z0 = svld3_vnum (p0, x0, 2)) ++ ++/* ++** ld3_vnum_f32_3: ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0, #3, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_f32_3, svfloat32x3_t, float32_t, ++ z0 = svld3_vnum_f32 (p0, x0, 3), ++ z0 = svld3_vnum (p0, x0, 3)) ++ ++/* ++** ld3_vnum_f32_21: ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0, #21, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_f32_21, svfloat32x3_t, float32_t, ++ z0 = svld3_vnum_f32 (p0, x0, 21), ++ z0 = svld3_vnum (p0, x0, 21)) ++ ++/* ++** ld3_vnum_f32_24: ++** addvl (x[0-9]+), x0, #24 ++** ld3w {z0\.s - z2\.s}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_f32_24, svfloat32x3_t, float32_t, ++ z0 = svld3_vnum_f32 (p0, x0, 24), ++ z0 = svld3_vnum (p0, x0, 24)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_vnum_f32_m1: ++** decb x0 ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_f32_m1, svfloat32x3_t, float32_t, ++ z0 = svld3_vnum_f32 (p0, x0, -1), ++ z0 = svld3_vnum (p0, x0, -1)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld3_vnum_f32_m2: ++** decb x0, all, mul #2 ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_f32_m2, svfloat32x3_t, float32_t, ++ z0 = svld3_vnum_f32 (p0, x0, -2), ++ z0 = svld3_vnum (p0, x0, -2)) ++ ++/* ++** ld3_vnum_f32_m3: ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0, #-3, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_f32_m3, svfloat32x3_t, float32_t, ++ z0 = svld3_vnum_f32 (p0, x0, -3), ++ z0 = svld3_vnum (p0, x0, -3)) ++ ++/* ++** ld3_vnum_f32_m24: ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0, #-24, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_f32_m24, svfloat32x3_t, float32_t, ++ z0 = svld3_vnum_f32 (p0, x0, -24), ++ z0 = svld3_vnum (p0, x0, -24)) ++ ++/* ++** ld3_vnum_f32_m27: ++** addvl (x[0-9]+), x0, #-27 ++** ld3w {z0\.s - z2\.s}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_f32_m27, svfloat32x3_t, float32_t, ++ z0 = svld3_vnum_f32 (p0, x0, -27), ++ z0 = svld3_vnum (p0, x0, -27)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ld3_vnum_f32_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld3w {z0\.s - z2\.s}, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_f32_x1, svfloat32x3_t, float32_t, ++ z0 = svld3_vnum_f32 (p0, x0, x1), ++ z0 = svld3_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_f64.c +new file mode 100644 +index 000000000..7e6e1e749 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_f64.c +@@ -0,0 +1,242 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld3_f64_base: ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_f64_base, svfloat64x3_t, float64_t, ++ z0 = svld3_f64 (p0, x0), ++ z0 = svld3 (p0, x0)) ++ ++/* ++** ld3_f64_index: ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0, x1, lsl 3\] ++** ret ++*/ ++TEST_LOAD (ld3_f64_index, svfloat64x3_t, float64_t, ++ z0 = svld3_f64 (p0, x0 + x1), ++ z0 = svld3 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_f64_1: ++** incb x0 ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_f64_1, svfloat64x3_t, float64_t, ++ z0 = svld3_f64 (p0, x0 + svcntd ()), ++ z0 = svld3 (p0, x0 + svcntd ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_f64_2: ++** incb x0, all, mul #2 ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_f64_2, svfloat64x3_t, float64_t, ++ z0 = svld3_f64 (p0, x0 + svcntd () * 2), ++ z0 = svld3 (p0, x0 + svcntd () * 2)) ++ ++/* ++** ld3_f64_3: ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0, #3, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_f64_3, svfloat64x3_t, float64_t, ++ z0 = svld3_f64 (p0, x0 + svcntd () * 3), ++ z0 = svld3 (p0, x0 + svcntd () * 3)) ++ ++/* ++** ld3_f64_21: ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0, #21, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_f64_21, svfloat64x3_t, float64_t, ++ z0 = svld3_f64 (p0, x0 + svcntd () * 21), ++ z0 = svld3 (p0, x0 + svcntd () * 21)) ++ ++/* ++** ld3_f64_24: ++** addvl (x[0-9]+), x0, #24 ++** ld3d {z0\.d - z2\.d}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld3_f64_24, svfloat64x3_t, float64_t, ++ z0 = svld3_f64 (p0, x0 + svcntd () * 24), ++ z0 = svld3 (p0, x0 + svcntd () * 24)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld3_f64_m1: ++** decb x0 ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_f64_m1, svfloat64x3_t, float64_t, ++ z0 = svld3_f64 (p0, x0 - svcntd ()), ++ z0 = svld3 (p0, x0 - svcntd ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_f64_m2: ++** decb x0, all, mul #2 ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_f64_m2, svfloat64x3_t, float64_t, ++ z0 = svld3_f64 (p0, x0 - svcntd () * 2), ++ z0 = svld3 (p0, x0 - svcntd () * 2)) ++ ++/* ++** ld3_f64_m3: ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0, #-3, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_f64_m3, svfloat64x3_t, float64_t, ++ z0 = svld3_f64 (p0, x0 - svcntd () * 3), ++ z0 = svld3 (p0, x0 - svcntd () * 3)) ++ ++/* ++** ld3_f64_m24: ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0, #-24, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_f64_m24, svfloat64x3_t, float64_t, ++ z0 = svld3_f64 (p0, x0 - svcntd () * 24), ++ z0 = svld3 (p0, x0 - svcntd () * 24)) ++ ++/* ++** ld3_f64_m27: ++** addvl (x[0-9]+), x0, #-27 ++** ld3d {z0\.d - z2\.d}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld3_f64_m27, svfloat64x3_t, float64_t, ++ z0 = svld3_f64 (p0, x0 - svcntd () * 27), ++ z0 = svld3 (p0, x0 - svcntd () * 27)) ++ ++/* ++** ld3_vnum_f64_0: ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_f64_0, svfloat64x3_t, float64_t, ++ z0 = svld3_vnum_f64 (p0, x0, 0), ++ z0 = svld3_vnum (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_vnum_f64_1: ++** incb x0 ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_f64_1, svfloat64x3_t, float64_t, ++ z0 = svld3_vnum_f64 (p0, x0, 1), ++ z0 = svld3_vnum (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_vnum_f64_2: ++** incb x0, all, mul #2 ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_f64_2, svfloat64x3_t, float64_t, ++ z0 = svld3_vnum_f64 (p0, x0, 2), ++ z0 = svld3_vnum (p0, x0, 2)) ++ ++/* ++** ld3_vnum_f64_3: ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0, #3, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_f64_3, svfloat64x3_t, float64_t, ++ z0 = svld3_vnum_f64 (p0, x0, 3), ++ z0 = svld3_vnum (p0, x0, 3)) ++ ++/* ++** ld3_vnum_f64_21: ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0, #21, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_f64_21, svfloat64x3_t, float64_t, ++ z0 = svld3_vnum_f64 (p0, x0, 21), ++ z0 = svld3_vnum (p0, x0, 21)) ++ ++/* ++** ld3_vnum_f64_24: ++** addvl (x[0-9]+), x0, #24 ++** ld3d {z0\.d - z2\.d}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_f64_24, svfloat64x3_t, float64_t, ++ z0 = svld3_vnum_f64 (p0, x0, 24), ++ z0 = svld3_vnum (p0, x0, 24)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_vnum_f64_m1: ++** decb x0 ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_f64_m1, svfloat64x3_t, float64_t, ++ z0 = svld3_vnum_f64 (p0, x0, -1), ++ z0 = svld3_vnum (p0, x0, -1)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld3_vnum_f64_m2: ++** decb x0, all, mul #2 ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_f64_m2, svfloat64x3_t, float64_t, ++ z0 = svld3_vnum_f64 (p0, x0, -2), ++ z0 = svld3_vnum (p0, x0, -2)) ++ ++/* ++** ld3_vnum_f64_m3: ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0, #-3, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_f64_m3, svfloat64x3_t, float64_t, ++ z0 = svld3_vnum_f64 (p0, x0, -3), ++ z0 = svld3_vnum (p0, x0, -3)) ++ ++/* ++** ld3_vnum_f64_m24: ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0, #-24, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_f64_m24, svfloat64x3_t, float64_t, ++ z0 = svld3_vnum_f64 (p0, x0, -24), ++ z0 = svld3_vnum (p0, x0, -24)) ++ ++/* ++** ld3_vnum_f64_m27: ++** addvl (x[0-9]+), x0, #-27 ++** ld3d {z0\.d - z2\.d}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_f64_m27, svfloat64x3_t, float64_t, ++ z0 = svld3_vnum_f64 (p0, x0, -27), ++ z0 = svld3_vnum (p0, x0, -27)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ld3_vnum_f64_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld3d {z0\.d - z2\.d}, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_f64_x1, svfloat64x3_t, float64_t, ++ z0 = svld3_vnum_f64 (p0, x0, x1), ++ z0 = svld3_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_s16.c +new file mode 100644 +index 000000000..d4a046c64 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_s16.c +@@ -0,0 +1,242 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld3_s16_base: ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_s16_base, svint16x3_t, int16_t, ++ z0 = svld3_s16 (p0, x0), ++ z0 = svld3 (p0, x0)) ++ ++/* ++** ld3_s16_index: ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_LOAD (ld3_s16_index, svint16x3_t, int16_t, ++ z0 = svld3_s16 (p0, x0 + x1), ++ z0 = svld3 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_s16_1: ++** incb x0 ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_s16_1, svint16x3_t, int16_t, ++ z0 = svld3_s16 (p0, x0 + svcnth ()), ++ z0 = svld3 (p0, x0 + svcnth ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_s16_2: ++** incb x0, all, mul #2 ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_s16_2, svint16x3_t, int16_t, ++ z0 = svld3_s16 (p0, x0 + svcnth () * 2), ++ z0 = svld3 (p0, x0 + svcnth () * 2)) ++ ++/* ++** ld3_s16_3: ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #3, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_s16_3, svint16x3_t, int16_t, ++ z0 = svld3_s16 (p0, x0 + svcnth () * 3), ++ z0 = svld3 (p0, x0 + svcnth () * 3)) ++ ++/* ++** ld3_s16_21: ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #21, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_s16_21, svint16x3_t, int16_t, ++ z0 = svld3_s16 (p0, x0 + svcnth () * 21), ++ z0 = svld3 (p0, x0 + svcnth () * 21)) ++ ++/* ++** ld3_s16_24: ++** addvl (x[0-9]+), x0, #24 ++** ld3h {z0\.h - z2\.h}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld3_s16_24, svint16x3_t, int16_t, ++ z0 = svld3_s16 (p0, x0 + svcnth () * 24), ++ z0 = svld3 (p0, x0 + svcnth () * 24)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld3_s16_m1: ++** decb x0 ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_s16_m1, svint16x3_t, int16_t, ++ z0 = svld3_s16 (p0, x0 - svcnth ()), ++ z0 = svld3 (p0, x0 - svcnth ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_s16_m2: ++** decb x0, all, mul #2 ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_s16_m2, svint16x3_t, int16_t, ++ z0 = svld3_s16 (p0, x0 - svcnth () * 2), ++ z0 = svld3 (p0, x0 - svcnth () * 2)) ++ ++/* ++** ld3_s16_m3: ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #-3, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_s16_m3, svint16x3_t, int16_t, ++ z0 = svld3_s16 (p0, x0 - svcnth () * 3), ++ z0 = svld3 (p0, x0 - svcnth () * 3)) ++ ++/* ++** ld3_s16_m24: ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #-24, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_s16_m24, svint16x3_t, int16_t, ++ z0 = svld3_s16 (p0, x0 - svcnth () * 24), ++ z0 = svld3 (p0, x0 - svcnth () * 24)) ++ ++/* ++** ld3_s16_m27: ++** addvl (x[0-9]+), x0, #-27 ++** ld3h {z0\.h - z2\.h}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld3_s16_m27, svint16x3_t, int16_t, ++ z0 = svld3_s16 (p0, x0 - svcnth () * 27), ++ z0 = svld3 (p0, x0 - svcnth () * 27)) ++ ++/* ++** ld3_vnum_s16_0: ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_s16_0, svint16x3_t, int16_t, ++ z0 = svld3_vnum_s16 (p0, x0, 0), ++ z0 = svld3_vnum (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_vnum_s16_1: ++** incb x0 ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_s16_1, svint16x3_t, int16_t, ++ z0 = svld3_vnum_s16 (p0, x0, 1), ++ z0 = svld3_vnum (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_vnum_s16_2: ++** incb x0, all, mul #2 ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_s16_2, svint16x3_t, int16_t, ++ z0 = svld3_vnum_s16 (p0, x0, 2), ++ z0 = svld3_vnum (p0, x0, 2)) ++ ++/* ++** ld3_vnum_s16_3: ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #3, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_s16_3, svint16x3_t, int16_t, ++ z0 = svld3_vnum_s16 (p0, x0, 3), ++ z0 = svld3_vnum (p0, x0, 3)) ++ ++/* ++** ld3_vnum_s16_21: ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #21, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_s16_21, svint16x3_t, int16_t, ++ z0 = svld3_vnum_s16 (p0, x0, 21), ++ z0 = svld3_vnum (p0, x0, 21)) ++ ++/* ++** ld3_vnum_s16_24: ++** addvl (x[0-9]+), x0, #24 ++** ld3h {z0\.h - z2\.h}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_s16_24, svint16x3_t, int16_t, ++ z0 = svld3_vnum_s16 (p0, x0, 24), ++ z0 = svld3_vnum (p0, x0, 24)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_vnum_s16_m1: ++** decb x0 ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_s16_m1, svint16x3_t, int16_t, ++ z0 = svld3_vnum_s16 (p0, x0, -1), ++ z0 = svld3_vnum (p0, x0, -1)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld3_vnum_s16_m2: ++** decb x0, all, mul #2 ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_s16_m2, svint16x3_t, int16_t, ++ z0 = svld3_vnum_s16 (p0, x0, -2), ++ z0 = svld3_vnum (p0, x0, -2)) ++ ++/* ++** ld3_vnum_s16_m3: ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #-3, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_s16_m3, svint16x3_t, int16_t, ++ z0 = svld3_vnum_s16 (p0, x0, -3), ++ z0 = svld3_vnum (p0, x0, -3)) ++ ++/* ++** ld3_vnum_s16_m24: ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #-24, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_s16_m24, svint16x3_t, int16_t, ++ z0 = svld3_vnum_s16 (p0, x0, -24), ++ z0 = svld3_vnum (p0, x0, -24)) ++ ++/* ++** ld3_vnum_s16_m27: ++** addvl (x[0-9]+), x0, #-27 ++** ld3h {z0\.h - z2\.h}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_s16_m27, svint16x3_t, int16_t, ++ z0 = svld3_vnum_s16 (p0, x0, -27), ++ z0 = svld3_vnum (p0, x0, -27)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ld3_vnum_s16_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld3h {z0\.h - z2\.h}, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_s16_x1, svint16x3_t, int16_t, ++ z0 = svld3_vnum_s16 (p0, x0, x1), ++ z0 = svld3_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_s32.c +new file mode 100644 +index 000000000..3b0ba6e2a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_s32.c +@@ -0,0 +1,242 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld3_s32_base: ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_s32_base, svint32x3_t, int32_t, ++ z0 = svld3_s32 (p0, x0), ++ z0 = svld3 (p0, x0)) ++ ++/* ++** ld3_s32_index: ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_LOAD (ld3_s32_index, svint32x3_t, int32_t, ++ z0 = svld3_s32 (p0, x0 + x1), ++ z0 = svld3 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_s32_1: ++** incb x0 ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_s32_1, svint32x3_t, int32_t, ++ z0 = svld3_s32 (p0, x0 + svcntw ()), ++ z0 = svld3 (p0, x0 + svcntw ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_s32_2: ++** incb x0, all, mul #2 ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_s32_2, svint32x3_t, int32_t, ++ z0 = svld3_s32 (p0, x0 + svcntw () * 2), ++ z0 = svld3 (p0, x0 + svcntw () * 2)) ++ ++/* ++** ld3_s32_3: ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0, #3, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_s32_3, svint32x3_t, int32_t, ++ z0 = svld3_s32 (p0, x0 + svcntw () * 3), ++ z0 = svld3 (p0, x0 + svcntw () * 3)) ++ ++/* ++** ld3_s32_21: ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0, #21, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_s32_21, svint32x3_t, int32_t, ++ z0 = svld3_s32 (p0, x0 + svcntw () * 21), ++ z0 = svld3 (p0, x0 + svcntw () * 21)) ++ ++/* ++** ld3_s32_24: ++** addvl (x[0-9]+), x0, #24 ++** ld3w {z0\.s - z2\.s}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld3_s32_24, svint32x3_t, int32_t, ++ z0 = svld3_s32 (p0, x0 + svcntw () * 24), ++ z0 = svld3 (p0, x0 + svcntw () * 24)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld3_s32_m1: ++** decb x0 ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_s32_m1, svint32x3_t, int32_t, ++ z0 = svld3_s32 (p0, x0 - svcntw ()), ++ z0 = svld3 (p0, x0 - svcntw ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_s32_m2: ++** decb x0, all, mul #2 ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_s32_m2, svint32x3_t, int32_t, ++ z0 = svld3_s32 (p0, x0 - svcntw () * 2), ++ z0 = svld3 (p0, x0 - svcntw () * 2)) ++ ++/* ++** ld3_s32_m3: ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0, #-3, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_s32_m3, svint32x3_t, int32_t, ++ z0 = svld3_s32 (p0, x0 - svcntw () * 3), ++ z0 = svld3 (p0, x0 - svcntw () * 3)) ++ ++/* ++** ld3_s32_m24: ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0, #-24, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_s32_m24, svint32x3_t, int32_t, ++ z0 = svld3_s32 (p0, x0 - svcntw () * 24), ++ z0 = svld3 (p0, x0 - svcntw () * 24)) ++ ++/* ++** ld3_s32_m27: ++** addvl (x[0-9]+), x0, #-27 ++** ld3w {z0\.s - z2\.s}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld3_s32_m27, svint32x3_t, int32_t, ++ z0 = svld3_s32 (p0, x0 - svcntw () * 27), ++ z0 = svld3 (p0, x0 - svcntw () * 27)) ++ ++/* ++** ld3_vnum_s32_0: ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_s32_0, svint32x3_t, int32_t, ++ z0 = svld3_vnum_s32 (p0, x0, 0), ++ z0 = svld3_vnum (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_vnum_s32_1: ++** incb x0 ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_s32_1, svint32x3_t, int32_t, ++ z0 = svld3_vnum_s32 (p0, x0, 1), ++ z0 = svld3_vnum (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_vnum_s32_2: ++** incb x0, all, mul #2 ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_s32_2, svint32x3_t, int32_t, ++ z0 = svld3_vnum_s32 (p0, x0, 2), ++ z0 = svld3_vnum (p0, x0, 2)) ++ ++/* ++** ld3_vnum_s32_3: ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0, #3, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_s32_3, svint32x3_t, int32_t, ++ z0 = svld3_vnum_s32 (p0, x0, 3), ++ z0 = svld3_vnum (p0, x0, 3)) ++ ++/* ++** ld3_vnum_s32_21: ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0, #21, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_s32_21, svint32x3_t, int32_t, ++ z0 = svld3_vnum_s32 (p0, x0, 21), ++ z0 = svld3_vnum (p0, x0, 21)) ++ ++/* ++** ld3_vnum_s32_24: ++** addvl (x[0-9]+), x0, #24 ++** ld3w {z0\.s - z2\.s}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_s32_24, svint32x3_t, int32_t, ++ z0 = svld3_vnum_s32 (p0, x0, 24), ++ z0 = svld3_vnum (p0, x0, 24)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_vnum_s32_m1: ++** decb x0 ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_s32_m1, svint32x3_t, int32_t, ++ z0 = svld3_vnum_s32 (p0, x0, -1), ++ z0 = svld3_vnum (p0, x0, -1)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld3_vnum_s32_m2: ++** decb x0, all, mul #2 ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_s32_m2, svint32x3_t, int32_t, ++ z0 = svld3_vnum_s32 (p0, x0, -2), ++ z0 = svld3_vnum (p0, x0, -2)) ++ ++/* ++** ld3_vnum_s32_m3: ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0, #-3, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_s32_m3, svint32x3_t, int32_t, ++ z0 = svld3_vnum_s32 (p0, x0, -3), ++ z0 = svld3_vnum (p0, x0, -3)) ++ ++/* ++** ld3_vnum_s32_m24: ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0, #-24, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_s32_m24, svint32x3_t, int32_t, ++ z0 = svld3_vnum_s32 (p0, x0, -24), ++ z0 = svld3_vnum (p0, x0, -24)) ++ ++/* ++** ld3_vnum_s32_m27: ++** addvl (x[0-9]+), x0, #-27 ++** ld3w {z0\.s - z2\.s}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_s32_m27, svint32x3_t, int32_t, ++ z0 = svld3_vnum_s32 (p0, x0, -27), ++ z0 = svld3_vnum (p0, x0, -27)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ld3_vnum_s32_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld3w {z0\.s - z2\.s}, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_s32_x1, svint32x3_t, int32_t, ++ z0 = svld3_vnum_s32 (p0, x0, x1), ++ z0 = svld3_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_s64.c +new file mode 100644 +index 000000000..080a10b8f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_s64.c +@@ -0,0 +1,242 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld3_s64_base: ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_s64_base, svint64x3_t, int64_t, ++ z0 = svld3_s64 (p0, x0), ++ z0 = svld3 (p0, x0)) ++ ++/* ++** ld3_s64_index: ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0, x1, lsl 3\] ++** ret ++*/ ++TEST_LOAD (ld3_s64_index, svint64x3_t, int64_t, ++ z0 = svld3_s64 (p0, x0 + x1), ++ z0 = svld3 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_s64_1: ++** incb x0 ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_s64_1, svint64x3_t, int64_t, ++ z0 = svld3_s64 (p0, x0 + svcntd ()), ++ z0 = svld3 (p0, x0 + svcntd ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_s64_2: ++** incb x0, all, mul #2 ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_s64_2, svint64x3_t, int64_t, ++ z0 = svld3_s64 (p0, x0 + svcntd () * 2), ++ z0 = svld3 (p0, x0 + svcntd () * 2)) ++ ++/* ++** ld3_s64_3: ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0, #3, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_s64_3, svint64x3_t, int64_t, ++ z0 = svld3_s64 (p0, x0 + svcntd () * 3), ++ z0 = svld3 (p0, x0 + svcntd () * 3)) ++ ++/* ++** ld3_s64_21: ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0, #21, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_s64_21, svint64x3_t, int64_t, ++ z0 = svld3_s64 (p0, x0 + svcntd () * 21), ++ z0 = svld3 (p0, x0 + svcntd () * 21)) ++ ++/* ++** ld3_s64_24: ++** addvl (x[0-9]+), x0, #24 ++** ld3d {z0\.d - z2\.d}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld3_s64_24, svint64x3_t, int64_t, ++ z0 = svld3_s64 (p0, x0 + svcntd () * 24), ++ z0 = svld3 (p0, x0 + svcntd () * 24)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld3_s64_m1: ++** decb x0 ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_s64_m1, svint64x3_t, int64_t, ++ z0 = svld3_s64 (p0, x0 - svcntd ()), ++ z0 = svld3 (p0, x0 - svcntd ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_s64_m2: ++** decb x0, all, mul #2 ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_s64_m2, svint64x3_t, int64_t, ++ z0 = svld3_s64 (p0, x0 - svcntd () * 2), ++ z0 = svld3 (p0, x0 - svcntd () * 2)) ++ ++/* ++** ld3_s64_m3: ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0, #-3, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_s64_m3, svint64x3_t, int64_t, ++ z0 = svld3_s64 (p0, x0 - svcntd () * 3), ++ z0 = svld3 (p0, x0 - svcntd () * 3)) ++ ++/* ++** ld3_s64_m24: ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0, #-24, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_s64_m24, svint64x3_t, int64_t, ++ z0 = svld3_s64 (p0, x0 - svcntd () * 24), ++ z0 = svld3 (p0, x0 - svcntd () * 24)) ++ ++/* ++** ld3_s64_m27: ++** addvl (x[0-9]+), x0, #-27 ++** ld3d {z0\.d - z2\.d}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld3_s64_m27, svint64x3_t, int64_t, ++ z0 = svld3_s64 (p0, x0 - svcntd () * 27), ++ z0 = svld3 (p0, x0 - svcntd () * 27)) ++ ++/* ++** ld3_vnum_s64_0: ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_s64_0, svint64x3_t, int64_t, ++ z0 = svld3_vnum_s64 (p0, x0, 0), ++ z0 = svld3_vnum (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_vnum_s64_1: ++** incb x0 ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_s64_1, svint64x3_t, int64_t, ++ z0 = svld3_vnum_s64 (p0, x0, 1), ++ z0 = svld3_vnum (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_vnum_s64_2: ++** incb x0, all, mul #2 ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_s64_2, svint64x3_t, int64_t, ++ z0 = svld3_vnum_s64 (p0, x0, 2), ++ z0 = svld3_vnum (p0, x0, 2)) ++ ++/* ++** ld3_vnum_s64_3: ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0, #3, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_s64_3, svint64x3_t, int64_t, ++ z0 = svld3_vnum_s64 (p0, x0, 3), ++ z0 = svld3_vnum (p0, x0, 3)) ++ ++/* ++** ld3_vnum_s64_21: ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0, #21, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_s64_21, svint64x3_t, int64_t, ++ z0 = svld3_vnum_s64 (p0, x0, 21), ++ z0 = svld3_vnum (p0, x0, 21)) ++ ++/* ++** ld3_vnum_s64_24: ++** addvl (x[0-9]+), x0, #24 ++** ld3d {z0\.d - z2\.d}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_s64_24, svint64x3_t, int64_t, ++ z0 = svld3_vnum_s64 (p0, x0, 24), ++ z0 = svld3_vnum (p0, x0, 24)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_vnum_s64_m1: ++** decb x0 ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_s64_m1, svint64x3_t, int64_t, ++ z0 = svld3_vnum_s64 (p0, x0, -1), ++ z0 = svld3_vnum (p0, x0, -1)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld3_vnum_s64_m2: ++** decb x0, all, mul #2 ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_s64_m2, svint64x3_t, int64_t, ++ z0 = svld3_vnum_s64 (p0, x0, -2), ++ z0 = svld3_vnum (p0, x0, -2)) ++ ++/* ++** ld3_vnum_s64_m3: ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0, #-3, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_s64_m3, svint64x3_t, int64_t, ++ z0 = svld3_vnum_s64 (p0, x0, -3), ++ z0 = svld3_vnum (p0, x0, -3)) ++ ++/* ++** ld3_vnum_s64_m24: ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0, #-24, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_s64_m24, svint64x3_t, int64_t, ++ z0 = svld3_vnum_s64 (p0, x0, -24), ++ z0 = svld3_vnum (p0, x0, -24)) ++ ++/* ++** ld3_vnum_s64_m27: ++** addvl (x[0-9]+), x0, #-27 ++** ld3d {z0\.d - z2\.d}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_s64_m27, svint64x3_t, int64_t, ++ z0 = svld3_vnum_s64 (p0, x0, -27), ++ z0 = svld3_vnum (p0, x0, -27)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ld3_vnum_s64_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld3d {z0\.d - z2\.d}, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_s64_x1, svint64x3_t, int64_t, ++ z0 = svld3_vnum_s64 (p0, x0, x1), ++ z0 = svld3_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_s8.c +new file mode 100644 +index 000000000..e0c551472 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_s8.c +@@ -0,0 +1,246 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld3_s8_base: ++** ld3b {z0\.b - z2\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_s8_base, svint8x3_t, int8_t, ++ z0 = svld3_s8 (p0, x0), ++ z0 = svld3 (p0, x0)) ++ ++/* ++** ld3_s8_index: ++** ld3b {z0\.b - z2\.b}, p0/z, \[x0, x1\] ++** ret ++*/ ++TEST_LOAD (ld3_s8_index, svint8x3_t, int8_t, ++ z0 = svld3_s8 (p0, x0 + x1), ++ z0 = svld3 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_s8_1: ++** incb x0 ++** ld3b {z0\.b - z2\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_s8_1, svint8x3_t, int8_t, ++ z0 = svld3_s8 (p0, x0 + svcntb ()), ++ z0 = svld3 (p0, x0 + svcntb ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_s8_2: ++** incb x0, all, mul #2 ++** ld3b {z0\.b - z2\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_s8_2, svint8x3_t, int8_t, ++ z0 = svld3_s8 (p0, x0 + svcntb () * 2), ++ z0 = svld3 (p0, x0 + svcntb () * 2)) ++ ++/* ++** ld3_s8_3: ++** ld3b {z0\.b - z2\.b}, p0/z, \[x0, #3, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_s8_3, svint8x3_t, int8_t, ++ z0 = svld3_s8 (p0, x0 + svcntb () * 3), ++ z0 = svld3 (p0, x0 + svcntb () * 3)) ++ ++/* ++** ld3_s8_21: ++** ld3b {z0\.b - z2\.b}, p0/z, \[x0, #21, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_s8_21, svint8x3_t, int8_t, ++ z0 = svld3_s8 (p0, x0 + svcntb () * 21), ++ z0 = svld3 (p0, x0 + svcntb () * 21)) ++ ++/* ++** ld3_s8_24: ++** addvl (x[0-9]+), x0, #24 ++** ld3b {z0\.b - z2\.b}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld3_s8_24, svint8x3_t, int8_t, ++ z0 = svld3_s8 (p0, x0 + svcntb () * 24), ++ z0 = svld3 (p0, x0 + svcntb () * 24)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld3_s8_m1: ++** decb x0 ++** ld3b {z0\.b - z2\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_s8_m1, svint8x3_t, int8_t, ++ z0 = svld3_s8 (p0, x0 - svcntb ()), ++ z0 = svld3 (p0, x0 - svcntb ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_s8_m2: ++** decb x0, all, mul #2 ++** ld3b {z0\.b - z2\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_s8_m2, svint8x3_t, int8_t, ++ z0 = svld3_s8 (p0, x0 - svcntb () * 2), ++ z0 = svld3 (p0, x0 - svcntb () * 2)) ++ ++/* ++** ld3_s8_m3: ++** ld3b {z0\.b - z2\.b}, p0/z, \[x0, #-3, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_s8_m3, svint8x3_t, int8_t, ++ z0 = svld3_s8 (p0, x0 - svcntb () * 3), ++ z0 = svld3 (p0, x0 - svcntb () * 3)) ++ ++/* ++** ld3_s8_m24: ++** ld3b {z0\.b - z2\.b}, p0/z, \[x0, #-24, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_s8_m24, svint8x3_t, int8_t, ++ z0 = svld3_s8 (p0, x0 - svcntb () * 24), ++ z0 = svld3 (p0, x0 - svcntb () * 24)) ++ ++/* ++** ld3_s8_m27: ++** addvl (x[0-9]+), x0, #-27 ++** ld3b {z0\.b - z2\.b}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld3_s8_m27, svint8x3_t, int8_t, ++ z0 = svld3_s8 (p0, x0 - svcntb () * 27), ++ z0 = svld3 (p0, x0 - svcntb () * 27)) ++ ++/* ++** ld3_vnum_s8_0: ++** ld3b {z0\.b - z2\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_s8_0, svint8x3_t, int8_t, ++ z0 = svld3_vnum_s8 (p0, x0, 0), ++ z0 = svld3_vnum (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_vnum_s8_1: ++** incb x0 ++** ld3b {z0\.b - z2\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_s8_1, svint8x3_t, int8_t, ++ z0 = svld3_vnum_s8 (p0, x0, 1), ++ z0 = svld3_vnum (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_vnum_s8_2: ++** incb x0, all, mul #2 ++** ld3b {z0\.b - z2\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_s8_2, svint8x3_t, int8_t, ++ z0 = svld3_vnum_s8 (p0, x0, 2), ++ z0 = svld3_vnum (p0, x0, 2)) ++ ++/* ++** ld3_vnum_s8_3: ++** ld3b {z0\.b - z2\.b}, p0/z, \[x0, #3, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_s8_3, svint8x3_t, int8_t, ++ z0 = svld3_vnum_s8 (p0, x0, 3), ++ z0 = svld3_vnum (p0, x0, 3)) ++ ++/* ++** ld3_vnum_s8_21: ++** ld3b {z0\.b - z2\.b}, p0/z, \[x0, #21, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_s8_21, svint8x3_t, int8_t, ++ z0 = svld3_vnum_s8 (p0, x0, 21), ++ z0 = svld3_vnum (p0, x0, 21)) ++ ++/* ++** ld3_vnum_s8_24: ++** addvl (x[0-9]+), x0, #24 ++** ld3b {z0\.b - z2\.b}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_s8_24, svint8x3_t, int8_t, ++ z0 = svld3_vnum_s8 (p0, x0, 24), ++ z0 = svld3_vnum (p0, x0, 24)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_vnum_s8_m1: ++** decb x0 ++** ld3b {z0\.b - z2\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_s8_m1, svint8x3_t, int8_t, ++ z0 = svld3_vnum_s8 (p0, x0, -1), ++ z0 = svld3_vnum (p0, x0, -1)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld3_vnum_s8_m2: ++** decb x0, all, mul #2 ++** ld3b {z0\.b - z2\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_s8_m2, svint8x3_t, int8_t, ++ z0 = svld3_vnum_s8 (p0, x0, -2), ++ z0 = svld3_vnum (p0, x0, -2)) ++ ++/* ++** ld3_vnum_s8_m3: ++** ld3b {z0\.b - z2\.b}, p0/z, \[x0, #-3, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_s8_m3, svint8x3_t, int8_t, ++ z0 = svld3_vnum_s8 (p0, x0, -3), ++ z0 = svld3_vnum (p0, x0, -3)) ++ ++/* ++** ld3_vnum_s8_m24: ++** ld3b {z0\.b - z2\.b}, p0/z, \[x0, #-24, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_s8_m24, svint8x3_t, int8_t, ++ z0 = svld3_vnum_s8 (p0, x0, -24), ++ z0 = svld3_vnum (p0, x0, -24)) ++ ++/* ++** ld3_vnum_s8_m27: ++** addvl (x[0-9]+), x0, #-27 ++** ld3b {z0\.b - z2\.b}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_s8_m27, svint8x3_t, int8_t, ++ z0 = svld3_vnum_s8 (p0, x0, -27), ++ z0 = svld3_vnum (p0, x0, -27)) ++ ++/* ++** ld3_vnum_s8_x1: ++** cntb (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ld3b {z0\.b - z2\.b}, p0/z, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** ld3b {z0\.b - z2\.b}, p0/z, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_LOAD (ld3_vnum_s8_x1, svint8x3_t, int8_t, ++ z0 = svld3_vnum_s8 (p0, x0, x1), ++ z0 = svld3_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_u16.c +new file mode 100644 +index 000000000..12f6dd092 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_u16.c +@@ -0,0 +1,242 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld3_u16_base: ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_u16_base, svuint16x3_t, uint16_t, ++ z0 = svld3_u16 (p0, x0), ++ z0 = svld3 (p0, x0)) ++ ++/* ++** ld3_u16_index: ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_LOAD (ld3_u16_index, svuint16x3_t, uint16_t, ++ z0 = svld3_u16 (p0, x0 + x1), ++ z0 = svld3 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_u16_1: ++** incb x0 ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_u16_1, svuint16x3_t, uint16_t, ++ z0 = svld3_u16 (p0, x0 + svcnth ()), ++ z0 = svld3 (p0, x0 + svcnth ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_u16_2: ++** incb x0, all, mul #2 ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_u16_2, svuint16x3_t, uint16_t, ++ z0 = svld3_u16 (p0, x0 + svcnth () * 2), ++ z0 = svld3 (p0, x0 + svcnth () * 2)) ++ ++/* ++** ld3_u16_3: ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #3, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_u16_3, svuint16x3_t, uint16_t, ++ z0 = svld3_u16 (p0, x0 + svcnth () * 3), ++ z0 = svld3 (p0, x0 + svcnth () * 3)) ++ ++/* ++** ld3_u16_21: ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #21, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_u16_21, svuint16x3_t, uint16_t, ++ z0 = svld3_u16 (p0, x0 + svcnth () * 21), ++ z0 = svld3 (p0, x0 + svcnth () * 21)) ++ ++/* ++** ld3_u16_24: ++** addvl (x[0-9]+), x0, #24 ++** ld3h {z0\.h - z2\.h}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld3_u16_24, svuint16x3_t, uint16_t, ++ z0 = svld3_u16 (p0, x0 + svcnth () * 24), ++ z0 = svld3 (p0, x0 + svcnth () * 24)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld3_u16_m1: ++** decb x0 ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_u16_m1, svuint16x3_t, uint16_t, ++ z0 = svld3_u16 (p0, x0 - svcnth ()), ++ z0 = svld3 (p0, x0 - svcnth ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_u16_m2: ++** decb x0, all, mul #2 ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_u16_m2, svuint16x3_t, uint16_t, ++ z0 = svld3_u16 (p0, x0 - svcnth () * 2), ++ z0 = svld3 (p0, x0 - svcnth () * 2)) ++ ++/* ++** ld3_u16_m3: ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #-3, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_u16_m3, svuint16x3_t, uint16_t, ++ z0 = svld3_u16 (p0, x0 - svcnth () * 3), ++ z0 = svld3 (p0, x0 - svcnth () * 3)) ++ ++/* ++** ld3_u16_m24: ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #-24, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_u16_m24, svuint16x3_t, uint16_t, ++ z0 = svld3_u16 (p0, x0 - svcnth () * 24), ++ z0 = svld3 (p0, x0 - svcnth () * 24)) ++ ++/* ++** ld3_u16_m27: ++** addvl (x[0-9]+), x0, #-27 ++** ld3h {z0\.h - z2\.h}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld3_u16_m27, svuint16x3_t, uint16_t, ++ z0 = svld3_u16 (p0, x0 - svcnth () * 27), ++ z0 = svld3 (p0, x0 - svcnth () * 27)) ++ ++/* ++** ld3_vnum_u16_0: ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_u16_0, svuint16x3_t, uint16_t, ++ z0 = svld3_vnum_u16 (p0, x0, 0), ++ z0 = svld3_vnum (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_vnum_u16_1: ++** incb x0 ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_u16_1, svuint16x3_t, uint16_t, ++ z0 = svld3_vnum_u16 (p0, x0, 1), ++ z0 = svld3_vnum (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_vnum_u16_2: ++** incb x0, all, mul #2 ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_u16_2, svuint16x3_t, uint16_t, ++ z0 = svld3_vnum_u16 (p0, x0, 2), ++ z0 = svld3_vnum (p0, x0, 2)) ++ ++/* ++** ld3_vnum_u16_3: ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #3, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_u16_3, svuint16x3_t, uint16_t, ++ z0 = svld3_vnum_u16 (p0, x0, 3), ++ z0 = svld3_vnum (p0, x0, 3)) ++ ++/* ++** ld3_vnum_u16_21: ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #21, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_u16_21, svuint16x3_t, uint16_t, ++ z0 = svld3_vnum_u16 (p0, x0, 21), ++ z0 = svld3_vnum (p0, x0, 21)) ++ ++/* ++** ld3_vnum_u16_24: ++** addvl (x[0-9]+), x0, #24 ++** ld3h {z0\.h - z2\.h}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_u16_24, svuint16x3_t, uint16_t, ++ z0 = svld3_vnum_u16 (p0, x0, 24), ++ z0 = svld3_vnum (p0, x0, 24)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_vnum_u16_m1: ++** decb x0 ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_u16_m1, svuint16x3_t, uint16_t, ++ z0 = svld3_vnum_u16 (p0, x0, -1), ++ z0 = svld3_vnum (p0, x0, -1)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld3_vnum_u16_m2: ++** decb x0, all, mul #2 ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_u16_m2, svuint16x3_t, uint16_t, ++ z0 = svld3_vnum_u16 (p0, x0, -2), ++ z0 = svld3_vnum (p0, x0, -2)) ++ ++/* ++** ld3_vnum_u16_m3: ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #-3, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_u16_m3, svuint16x3_t, uint16_t, ++ z0 = svld3_vnum_u16 (p0, x0, -3), ++ z0 = svld3_vnum (p0, x0, -3)) ++ ++/* ++** ld3_vnum_u16_m24: ++** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #-24, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_u16_m24, svuint16x3_t, uint16_t, ++ z0 = svld3_vnum_u16 (p0, x0, -24), ++ z0 = svld3_vnum (p0, x0, -24)) ++ ++/* ++** ld3_vnum_u16_m27: ++** addvl (x[0-9]+), x0, #-27 ++** ld3h {z0\.h - z2\.h}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_u16_m27, svuint16x3_t, uint16_t, ++ z0 = svld3_vnum_u16 (p0, x0, -27), ++ z0 = svld3_vnum (p0, x0, -27)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ld3_vnum_u16_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld3h {z0\.h - z2\.h}, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_u16_x1, svuint16x3_t, uint16_t, ++ z0 = svld3_vnum_u16 (p0, x0, x1), ++ z0 = svld3_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_u32.c +new file mode 100644 +index 000000000..ffc6edfdc +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_u32.c +@@ -0,0 +1,242 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld3_u32_base: ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_u32_base, svuint32x3_t, uint32_t, ++ z0 = svld3_u32 (p0, x0), ++ z0 = svld3 (p0, x0)) ++ ++/* ++** ld3_u32_index: ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_LOAD (ld3_u32_index, svuint32x3_t, uint32_t, ++ z0 = svld3_u32 (p0, x0 + x1), ++ z0 = svld3 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_u32_1: ++** incb x0 ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_u32_1, svuint32x3_t, uint32_t, ++ z0 = svld3_u32 (p0, x0 + svcntw ()), ++ z0 = svld3 (p0, x0 + svcntw ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_u32_2: ++** incb x0, all, mul #2 ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_u32_2, svuint32x3_t, uint32_t, ++ z0 = svld3_u32 (p0, x0 + svcntw () * 2), ++ z0 = svld3 (p0, x0 + svcntw () * 2)) ++ ++/* ++** ld3_u32_3: ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0, #3, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_u32_3, svuint32x3_t, uint32_t, ++ z0 = svld3_u32 (p0, x0 + svcntw () * 3), ++ z0 = svld3 (p0, x0 + svcntw () * 3)) ++ ++/* ++** ld3_u32_21: ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0, #21, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_u32_21, svuint32x3_t, uint32_t, ++ z0 = svld3_u32 (p0, x0 + svcntw () * 21), ++ z0 = svld3 (p0, x0 + svcntw () * 21)) ++ ++/* ++** ld3_u32_24: ++** addvl (x[0-9]+), x0, #24 ++** ld3w {z0\.s - z2\.s}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld3_u32_24, svuint32x3_t, uint32_t, ++ z0 = svld3_u32 (p0, x0 + svcntw () * 24), ++ z0 = svld3 (p0, x0 + svcntw () * 24)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld3_u32_m1: ++** decb x0 ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_u32_m1, svuint32x3_t, uint32_t, ++ z0 = svld3_u32 (p0, x0 - svcntw ()), ++ z0 = svld3 (p0, x0 - svcntw ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_u32_m2: ++** decb x0, all, mul #2 ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_u32_m2, svuint32x3_t, uint32_t, ++ z0 = svld3_u32 (p0, x0 - svcntw () * 2), ++ z0 = svld3 (p0, x0 - svcntw () * 2)) ++ ++/* ++** ld3_u32_m3: ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0, #-3, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_u32_m3, svuint32x3_t, uint32_t, ++ z0 = svld3_u32 (p0, x0 - svcntw () * 3), ++ z0 = svld3 (p0, x0 - svcntw () * 3)) ++ ++/* ++** ld3_u32_m24: ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0, #-24, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_u32_m24, svuint32x3_t, uint32_t, ++ z0 = svld3_u32 (p0, x0 - svcntw () * 24), ++ z0 = svld3 (p0, x0 - svcntw () * 24)) ++ ++/* ++** ld3_u32_m27: ++** addvl (x[0-9]+), x0, #-27 ++** ld3w {z0\.s - z2\.s}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld3_u32_m27, svuint32x3_t, uint32_t, ++ z0 = svld3_u32 (p0, x0 - svcntw () * 27), ++ z0 = svld3 (p0, x0 - svcntw () * 27)) ++ ++/* ++** ld3_vnum_u32_0: ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_u32_0, svuint32x3_t, uint32_t, ++ z0 = svld3_vnum_u32 (p0, x0, 0), ++ z0 = svld3_vnum (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_vnum_u32_1: ++** incb x0 ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_u32_1, svuint32x3_t, uint32_t, ++ z0 = svld3_vnum_u32 (p0, x0, 1), ++ z0 = svld3_vnum (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_vnum_u32_2: ++** incb x0, all, mul #2 ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_u32_2, svuint32x3_t, uint32_t, ++ z0 = svld3_vnum_u32 (p0, x0, 2), ++ z0 = svld3_vnum (p0, x0, 2)) ++ ++/* ++** ld3_vnum_u32_3: ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0, #3, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_u32_3, svuint32x3_t, uint32_t, ++ z0 = svld3_vnum_u32 (p0, x0, 3), ++ z0 = svld3_vnum (p0, x0, 3)) ++ ++/* ++** ld3_vnum_u32_21: ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0, #21, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_u32_21, svuint32x3_t, uint32_t, ++ z0 = svld3_vnum_u32 (p0, x0, 21), ++ z0 = svld3_vnum (p0, x0, 21)) ++ ++/* ++** ld3_vnum_u32_24: ++** addvl (x[0-9]+), x0, #24 ++** ld3w {z0\.s - z2\.s}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_u32_24, svuint32x3_t, uint32_t, ++ z0 = svld3_vnum_u32 (p0, x0, 24), ++ z0 = svld3_vnum (p0, x0, 24)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_vnum_u32_m1: ++** decb x0 ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_u32_m1, svuint32x3_t, uint32_t, ++ z0 = svld3_vnum_u32 (p0, x0, -1), ++ z0 = svld3_vnum (p0, x0, -1)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld3_vnum_u32_m2: ++** decb x0, all, mul #2 ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_u32_m2, svuint32x3_t, uint32_t, ++ z0 = svld3_vnum_u32 (p0, x0, -2), ++ z0 = svld3_vnum (p0, x0, -2)) ++ ++/* ++** ld3_vnum_u32_m3: ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0, #-3, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_u32_m3, svuint32x3_t, uint32_t, ++ z0 = svld3_vnum_u32 (p0, x0, -3), ++ z0 = svld3_vnum (p0, x0, -3)) ++ ++/* ++** ld3_vnum_u32_m24: ++** ld3w {z0\.s - z2\.s}, p0/z, \[x0, #-24, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_u32_m24, svuint32x3_t, uint32_t, ++ z0 = svld3_vnum_u32 (p0, x0, -24), ++ z0 = svld3_vnum (p0, x0, -24)) ++ ++/* ++** ld3_vnum_u32_m27: ++** addvl (x[0-9]+), x0, #-27 ++** ld3w {z0\.s - z2\.s}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_u32_m27, svuint32x3_t, uint32_t, ++ z0 = svld3_vnum_u32 (p0, x0, -27), ++ z0 = svld3_vnum (p0, x0, -27)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ld3_vnum_u32_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld3w {z0\.s - z2\.s}, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_u32_x1, svuint32x3_t, uint32_t, ++ z0 = svld3_vnum_u32 (p0, x0, x1), ++ z0 = svld3_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_u64.c +new file mode 100644 +index 000000000..2c0dc2f1a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_u64.c +@@ -0,0 +1,242 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld3_u64_base: ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_u64_base, svuint64x3_t, uint64_t, ++ z0 = svld3_u64 (p0, x0), ++ z0 = svld3 (p0, x0)) ++ ++/* ++** ld3_u64_index: ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0, x1, lsl 3\] ++** ret ++*/ ++TEST_LOAD (ld3_u64_index, svuint64x3_t, uint64_t, ++ z0 = svld3_u64 (p0, x0 + x1), ++ z0 = svld3 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_u64_1: ++** incb x0 ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_u64_1, svuint64x3_t, uint64_t, ++ z0 = svld3_u64 (p0, x0 + svcntd ()), ++ z0 = svld3 (p0, x0 + svcntd ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_u64_2: ++** incb x0, all, mul #2 ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_u64_2, svuint64x3_t, uint64_t, ++ z0 = svld3_u64 (p0, x0 + svcntd () * 2), ++ z0 = svld3 (p0, x0 + svcntd () * 2)) ++ ++/* ++** ld3_u64_3: ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0, #3, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_u64_3, svuint64x3_t, uint64_t, ++ z0 = svld3_u64 (p0, x0 + svcntd () * 3), ++ z0 = svld3 (p0, x0 + svcntd () * 3)) ++ ++/* ++** ld3_u64_21: ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0, #21, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_u64_21, svuint64x3_t, uint64_t, ++ z0 = svld3_u64 (p0, x0 + svcntd () * 21), ++ z0 = svld3 (p0, x0 + svcntd () * 21)) ++ ++/* ++** ld3_u64_24: ++** addvl (x[0-9]+), x0, #24 ++** ld3d {z0\.d - z2\.d}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld3_u64_24, svuint64x3_t, uint64_t, ++ z0 = svld3_u64 (p0, x0 + svcntd () * 24), ++ z0 = svld3 (p0, x0 + svcntd () * 24)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld3_u64_m1: ++** decb x0 ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_u64_m1, svuint64x3_t, uint64_t, ++ z0 = svld3_u64 (p0, x0 - svcntd ()), ++ z0 = svld3 (p0, x0 - svcntd ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_u64_m2: ++** decb x0, all, mul #2 ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_u64_m2, svuint64x3_t, uint64_t, ++ z0 = svld3_u64 (p0, x0 - svcntd () * 2), ++ z0 = svld3 (p0, x0 - svcntd () * 2)) ++ ++/* ++** ld3_u64_m3: ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0, #-3, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_u64_m3, svuint64x3_t, uint64_t, ++ z0 = svld3_u64 (p0, x0 - svcntd () * 3), ++ z0 = svld3 (p0, x0 - svcntd () * 3)) ++ ++/* ++** ld3_u64_m24: ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0, #-24, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_u64_m24, svuint64x3_t, uint64_t, ++ z0 = svld3_u64 (p0, x0 - svcntd () * 24), ++ z0 = svld3 (p0, x0 - svcntd () * 24)) ++ ++/* ++** ld3_u64_m27: ++** addvl (x[0-9]+), x0, #-27 ++** ld3d {z0\.d - z2\.d}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld3_u64_m27, svuint64x3_t, uint64_t, ++ z0 = svld3_u64 (p0, x0 - svcntd () * 27), ++ z0 = svld3 (p0, x0 - svcntd () * 27)) ++ ++/* ++** ld3_vnum_u64_0: ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_u64_0, svuint64x3_t, uint64_t, ++ z0 = svld3_vnum_u64 (p0, x0, 0), ++ z0 = svld3_vnum (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_vnum_u64_1: ++** incb x0 ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_u64_1, svuint64x3_t, uint64_t, ++ z0 = svld3_vnum_u64 (p0, x0, 1), ++ z0 = svld3_vnum (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_vnum_u64_2: ++** incb x0, all, mul #2 ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_u64_2, svuint64x3_t, uint64_t, ++ z0 = svld3_vnum_u64 (p0, x0, 2), ++ z0 = svld3_vnum (p0, x0, 2)) ++ ++/* ++** ld3_vnum_u64_3: ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0, #3, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_u64_3, svuint64x3_t, uint64_t, ++ z0 = svld3_vnum_u64 (p0, x0, 3), ++ z0 = svld3_vnum (p0, x0, 3)) ++ ++/* ++** ld3_vnum_u64_21: ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0, #21, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_u64_21, svuint64x3_t, uint64_t, ++ z0 = svld3_vnum_u64 (p0, x0, 21), ++ z0 = svld3_vnum (p0, x0, 21)) ++ ++/* ++** ld3_vnum_u64_24: ++** addvl (x[0-9]+), x0, #24 ++** ld3d {z0\.d - z2\.d}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_u64_24, svuint64x3_t, uint64_t, ++ z0 = svld3_vnum_u64 (p0, x0, 24), ++ z0 = svld3_vnum (p0, x0, 24)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_vnum_u64_m1: ++** decb x0 ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_u64_m1, svuint64x3_t, uint64_t, ++ z0 = svld3_vnum_u64 (p0, x0, -1), ++ z0 = svld3_vnum (p0, x0, -1)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld3_vnum_u64_m2: ++** decb x0, all, mul #2 ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_u64_m2, svuint64x3_t, uint64_t, ++ z0 = svld3_vnum_u64 (p0, x0, -2), ++ z0 = svld3_vnum (p0, x0, -2)) ++ ++/* ++** ld3_vnum_u64_m3: ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0, #-3, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_u64_m3, svuint64x3_t, uint64_t, ++ z0 = svld3_vnum_u64 (p0, x0, -3), ++ z0 = svld3_vnum (p0, x0, -3)) ++ ++/* ++** ld3_vnum_u64_m24: ++** ld3d {z0\.d - z2\.d}, p0/z, \[x0, #-24, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_u64_m24, svuint64x3_t, uint64_t, ++ z0 = svld3_vnum_u64 (p0, x0, -24), ++ z0 = svld3_vnum (p0, x0, -24)) ++ ++/* ++** ld3_vnum_u64_m27: ++** addvl (x[0-9]+), x0, #-27 ++** ld3d {z0\.d - z2\.d}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_u64_m27, svuint64x3_t, uint64_t, ++ z0 = svld3_vnum_u64 (p0, x0, -27), ++ z0 = svld3_vnum (p0, x0, -27)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ld3_vnum_u64_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld3d {z0\.d - z2\.d}, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_u64_x1, svuint64x3_t, uint64_t, ++ z0 = svld3_vnum_u64 (p0, x0, x1), ++ z0 = svld3_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_u8.c +new file mode 100644 +index 000000000..e9d1ab495 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_u8.c +@@ -0,0 +1,246 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld3_u8_base: ++** ld3b {z0\.b - z2\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_u8_base, svuint8x3_t, uint8_t, ++ z0 = svld3_u8 (p0, x0), ++ z0 = svld3 (p0, x0)) ++ ++/* ++** ld3_u8_index: ++** ld3b {z0\.b - z2\.b}, p0/z, \[x0, x1\] ++** ret ++*/ ++TEST_LOAD (ld3_u8_index, svuint8x3_t, uint8_t, ++ z0 = svld3_u8 (p0, x0 + x1), ++ z0 = svld3 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_u8_1: ++** incb x0 ++** ld3b {z0\.b - z2\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_u8_1, svuint8x3_t, uint8_t, ++ z0 = svld3_u8 (p0, x0 + svcntb ()), ++ z0 = svld3 (p0, x0 + svcntb ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_u8_2: ++** incb x0, all, mul #2 ++** ld3b {z0\.b - z2\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_u8_2, svuint8x3_t, uint8_t, ++ z0 = svld3_u8 (p0, x0 + svcntb () * 2), ++ z0 = svld3 (p0, x0 + svcntb () * 2)) ++ ++/* ++** ld3_u8_3: ++** ld3b {z0\.b - z2\.b}, p0/z, \[x0, #3, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_u8_3, svuint8x3_t, uint8_t, ++ z0 = svld3_u8 (p0, x0 + svcntb () * 3), ++ z0 = svld3 (p0, x0 + svcntb () * 3)) ++ ++/* ++** ld3_u8_21: ++** ld3b {z0\.b - z2\.b}, p0/z, \[x0, #21, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_u8_21, svuint8x3_t, uint8_t, ++ z0 = svld3_u8 (p0, x0 + svcntb () * 21), ++ z0 = svld3 (p0, x0 + svcntb () * 21)) ++ ++/* ++** ld3_u8_24: ++** addvl (x[0-9]+), x0, #24 ++** ld3b {z0\.b - z2\.b}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld3_u8_24, svuint8x3_t, uint8_t, ++ z0 = svld3_u8 (p0, x0 + svcntb () * 24), ++ z0 = svld3 (p0, x0 + svcntb () * 24)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld3_u8_m1: ++** decb x0 ++** ld3b {z0\.b - z2\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_u8_m1, svuint8x3_t, uint8_t, ++ z0 = svld3_u8 (p0, x0 - svcntb ()), ++ z0 = svld3 (p0, x0 - svcntb ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_u8_m2: ++** decb x0, all, mul #2 ++** ld3b {z0\.b - z2\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_u8_m2, svuint8x3_t, uint8_t, ++ z0 = svld3_u8 (p0, x0 - svcntb () * 2), ++ z0 = svld3 (p0, x0 - svcntb () * 2)) ++ ++/* ++** ld3_u8_m3: ++** ld3b {z0\.b - z2\.b}, p0/z, \[x0, #-3, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_u8_m3, svuint8x3_t, uint8_t, ++ z0 = svld3_u8 (p0, x0 - svcntb () * 3), ++ z0 = svld3 (p0, x0 - svcntb () * 3)) ++ ++/* ++** ld3_u8_m24: ++** ld3b {z0\.b - z2\.b}, p0/z, \[x0, #-24, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_u8_m24, svuint8x3_t, uint8_t, ++ z0 = svld3_u8 (p0, x0 - svcntb () * 24), ++ z0 = svld3 (p0, x0 - svcntb () * 24)) ++ ++/* ++** ld3_u8_m27: ++** addvl (x[0-9]+), x0, #-27 ++** ld3b {z0\.b - z2\.b}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld3_u8_m27, svuint8x3_t, uint8_t, ++ z0 = svld3_u8 (p0, x0 - svcntb () * 27), ++ z0 = svld3 (p0, x0 - svcntb () * 27)) ++ ++/* ++** ld3_vnum_u8_0: ++** ld3b {z0\.b - z2\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_u8_0, svuint8x3_t, uint8_t, ++ z0 = svld3_vnum_u8 (p0, x0, 0), ++ z0 = svld3_vnum (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_vnum_u8_1: ++** incb x0 ++** ld3b {z0\.b - z2\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_u8_1, svuint8x3_t, uint8_t, ++ z0 = svld3_vnum_u8 (p0, x0, 1), ++ z0 = svld3_vnum (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_vnum_u8_2: ++** incb x0, all, mul #2 ++** ld3b {z0\.b - z2\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_u8_2, svuint8x3_t, uint8_t, ++ z0 = svld3_vnum_u8 (p0, x0, 2), ++ z0 = svld3_vnum (p0, x0, 2)) ++ ++/* ++** ld3_vnum_u8_3: ++** ld3b {z0\.b - z2\.b}, p0/z, \[x0, #3, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_u8_3, svuint8x3_t, uint8_t, ++ z0 = svld3_vnum_u8 (p0, x0, 3), ++ z0 = svld3_vnum (p0, x0, 3)) ++ ++/* ++** ld3_vnum_u8_21: ++** ld3b {z0\.b - z2\.b}, p0/z, \[x0, #21, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_u8_21, svuint8x3_t, uint8_t, ++ z0 = svld3_vnum_u8 (p0, x0, 21), ++ z0 = svld3_vnum (p0, x0, 21)) ++ ++/* ++** ld3_vnum_u8_24: ++** addvl (x[0-9]+), x0, #24 ++** ld3b {z0\.b - z2\.b}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_u8_24, svuint8x3_t, uint8_t, ++ z0 = svld3_vnum_u8 (p0, x0, 24), ++ z0 = svld3_vnum (p0, x0, 24)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld3_vnum_u8_m1: ++** decb x0 ++** ld3b {z0\.b - z2\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_u8_m1, svuint8x3_t, uint8_t, ++ z0 = svld3_vnum_u8 (p0, x0, -1), ++ z0 = svld3_vnum (p0, x0, -1)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld3_vnum_u8_m2: ++** decb x0, all, mul #2 ++** ld3b {z0\.b - z2\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_u8_m2, svuint8x3_t, uint8_t, ++ z0 = svld3_vnum_u8 (p0, x0, -2), ++ z0 = svld3_vnum (p0, x0, -2)) ++ ++/* ++** ld3_vnum_u8_m3: ++** ld3b {z0\.b - z2\.b}, p0/z, \[x0, #-3, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_u8_m3, svuint8x3_t, uint8_t, ++ z0 = svld3_vnum_u8 (p0, x0, -3), ++ z0 = svld3_vnum (p0, x0, -3)) ++ ++/* ++** ld3_vnum_u8_m24: ++** ld3b {z0\.b - z2\.b}, p0/z, \[x0, #-24, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_u8_m24, svuint8x3_t, uint8_t, ++ z0 = svld3_vnum_u8 (p0, x0, -24), ++ z0 = svld3_vnum (p0, x0, -24)) ++ ++/* ++** ld3_vnum_u8_m27: ++** addvl (x[0-9]+), x0, #-27 ++** ld3b {z0\.b - z2\.b}, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ld3_vnum_u8_m27, svuint8x3_t, uint8_t, ++ z0 = svld3_vnum_u8 (p0, x0, -27), ++ z0 = svld3_vnum (p0, x0, -27)) ++ ++/* ++** ld3_vnum_u8_x1: ++** cntb (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ld3b {z0\.b - z2\.b}, p0/z, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** ld3b {z0\.b - z2\.b}, p0/z, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_LOAD (ld3_vnum_u8_x1, svuint8x3_t, uint8_t, ++ z0 = svld3_vnum_u8 (p0, x0, x1), ++ z0 = svld3_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_bf16.c +new file mode 100644 +index 000000000..123ff6355 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_bf16.c +@@ -0,0 +1,286 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld4_bf16_base: ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_bf16_base, svbfloat16x4_t, bfloat16_t, ++ z0 = svld4_bf16 (p0, x0), ++ z0 = svld4 (p0, x0)) ++ ++/* ++** ld4_bf16_index: ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_LOAD (ld4_bf16_index, svbfloat16x4_t, bfloat16_t, ++ z0 = svld4_bf16 (p0, x0 + x1), ++ z0 = svld4 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_bf16_1: ++** incb x0 ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_bf16_1, svbfloat16x4_t, bfloat16_t, ++ z0 = svld4_bf16 (p0, x0 + svcnth ()), ++ z0 = svld4 (p0, x0 + svcnth ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_bf16_2: ++** incb x0, all, mul #2 ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_bf16_2, svbfloat16x4_t, bfloat16_t, ++ z0 = svld4_bf16 (p0, x0 + svcnth () * 2), ++ z0 = svld4 (p0, x0 + svcnth () * 2)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld4_bf16_3: ++** incb x0, all, mul #3 ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_bf16_3, svbfloat16x4_t, bfloat16_t, ++ z0 = svld4_bf16 (p0, x0 + svcnth () * 3), ++ z0 = svld4 (p0, x0 + svcnth () * 3)) ++ ++/* ++** ld4_bf16_4: ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #4, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_bf16_4, svbfloat16x4_t, bfloat16_t, ++ z0 = svld4_bf16 (p0, x0 + svcnth () * 4), ++ z0 = svld4 (p0, x0 + svcnth () * 4)) ++ ++/* ++** ld4_bf16_28: ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #28, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_bf16_28, svbfloat16x4_t, bfloat16_t, ++ z0 = svld4_bf16 (p0, x0 + svcnth () * 28), ++ z0 = svld4 (p0, x0 + svcnth () * 28)) ++ ++/* ++** ld4_bf16_32: ++** [^{]* ++** ld4h {z0\.h - z3\.h}, p0/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_LOAD (ld4_bf16_32, svbfloat16x4_t, bfloat16_t, ++ z0 = svld4_bf16 (p0, x0 + svcnth () * 32), ++ z0 = svld4 (p0, x0 + svcnth () * 32)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_bf16_m1: ++** decb x0 ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_bf16_m1, svbfloat16x4_t, bfloat16_t, ++ z0 = svld4_bf16 (p0, x0 - svcnth ()), ++ z0 = svld4 (p0, x0 - svcnth ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_bf16_m2: ++** decb x0, all, mul #2 ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_bf16_m2, svbfloat16x4_t, bfloat16_t, ++ z0 = svld4_bf16 (p0, x0 - svcnth () * 2), ++ z0 = svld4 (p0, x0 - svcnth () * 2)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_bf16_m3: ++** decb x0, all, mul #3 ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_bf16_m3, svbfloat16x4_t, bfloat16_t, ++ z0 = svld4_bf16 (p0, x0 - svcnth () * 3), ++ z0 = svld4 (p0, x0 - svcnth () * 3)) ++ ++/* ++** ld4_bf16_m4: ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #-4, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_bf16_m4, svbfloat16x4_t, bfloat16_t, ++ z0 = svld4_bf16 (p0, x0 - svcnth () * 4), ++ z0 = svld4 (p0, x0 - svcnth () * 4)) ++ ++/* ++** ld4_bf16_m32: ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #-32, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_bf16_m32, svbfloat16x4_t, bfloat16_t, ++ z0 = svld4_bf16 (p0, x0 - svcnth () * 32), ++ z0 = svld4 (p0, x0 - svcnth () * 32)) ++ ++/* ++** ld4_bf16_m36: ++** [^{]* ++** ld4h {z0\.h - z3\.h}, p0/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_LOAD (ld4_bf16_m36, svbfloat16x4_t, bfloat16_t, ++ z0 = svld4_bf16 (p0, x0 - svcnth () * 36), ++ z0 = svld4 (p0, x0 - svcnth () * 36)) ++ ++/* ++** ld4_vnum_bf16_0: ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_bf16_0, svbfloat16x4_t, bfloat16_t, ++ z0 = svld4_vnum_bf16 (p0, x0, 0), ++ z0 = svld4_vnum (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_bf16_1: ++** incb x0 ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_bf16_1, svbfloat16x4_t, bfloat16_t, ++ z0 = svld4_vnum_bf16 (p0, x0, 1), ++ z0 = svld4_vnum (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_bf16_2: ++** incb x0, all, mul #2 ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_bf16_2, svbfloat16x4_t, bfloat16_t, ++ z0 = svld4_vnum_bf16 (p0, x0, 2), ++ z0 = svld4_vnum (p0, x0, 2)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld4_vnum_bf16_3: ++** incb x0, all, mul #3 ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_bf16_3, svbfloat16x4_t, bfloat16_t, ++ z0 = svld4_vnum_bf16 (p0, x0, 3), ++ z0 = svld4_vnum (p0, x0, 3)) ++ ++/* ++** ld4_vnum_bf16_4: ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #4, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_bf16_4, svbfloat16x4_t, bfloat16_t, ++ z0 = svld4_vnum_bf16 (p0, x0, 4), ++ z0 = svld4_vnum (p0, x0, 4)) ++ ++/* ++** ld4_vnum_bf16_28: ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #28, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_bf16_28, svbfloat16x4_t, bfloat16_t, ++ z0 = svld4_vnum_bf16 (p0, x0, 28), ++ z0 = svld4_vnum (p0, x0, 28)) ++ ++/* ++** ld4_vnum_bf16_32: ++** [^{]* ++** ld4h {z0\.h - z3\.h}, p0/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_bf16_32, svbfloat16x4_t, bfloat16_t, ++ z0 = svld4_vnum_bf16 (p0, x0, 32), ++ z0 = svld4_vnum (p0, x0, 32)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_bf16_m1: ++** decb x0 ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_bf16_m1, svbfloat16x4_t, bfloat16_t, ++ z0 = svld4_vnum_bf16 (p0, x0, -1), ++ z0 = svld4_vnum (p0, x0, -1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_bf16_m2: ++** decb x0, all, mul #2 ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_bf16_m2, svbfloat16x4_t, bfloat16_t, ++ z0 = svld4_vnum_bf16 (p0, x0, -2), ++ z0 = svld4_vnum (p0, x0, -2)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_bf16_m3: ++** decb x0, all, mul #3 ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_bf16_m3, svbfloat16x4_t, bfloat16_t, ++ z0 = svld4_vnum_bf16 (p0, x0, -3), ++ z0 = svld4_vnum (p0, x0, -3)) ++ ++/* ++** ld4_vnum_bf16_m4: ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #-4, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_bf16_m4, svbfloat16x4_t, bfloat16_t, ++ z0 = svld4_vnum_bf16 (p0, x0, -4), ++ z0 = svld4_vnum (p0, x0, -4)) ++ ++/* ++** ld4_vnum_bf16_m32: ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #-32, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_bf16_m32, svbfloat16x4_t, bfloat16_t, ++ z0 = svld4_vnum_bf16 (p0, x0, -32), ++ z0 = svld4_vnum (p0, x0, -32)) ++ ++/* ++** ld4_vnum_bf16_m36: ++** [^{]* ++** ld4h {z0\.h - z3\.h}, p0/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_bf16_m36, svbfloat16x4_t, bfloat16_t, ++ z0 = svld4_vnum_bf16 (p0, x0, -36), ++ z0 = svld4_vnum (p0, x0, -36)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ld4_vnum_bf16_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld4h {z0\.h - z3\.h}, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_bf16_x1, svbfloat16x4_t, bfloat16_t, ++ z0 = svld4_vnum_bf16 (p0, x0, x1), ++ z0 = svld4_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_f16.c +new file mode 100644 +index 000000000..0d0ecf0af +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_f16.c +@@ -0,0 +1,286 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld4_f16_base: ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_f16_base, svfloat16x4_t, float16_t, ++ z0 = svld4_f16 (p0, x0), ++ z0 = svld4 (p0, x0)) ++ ++/* ++** ld4_f16_index: ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_LOAD (ld4_f16_index, svfloat16x4_t, float16_t, ++ z0 = svld4_f16 (p0, x0 + x1), ++ z0 = svld4 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_f16_1: ++** incb x0 ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_f16_1, svfloat16x4_t, float16_t, ++ z0 = svld4_f16 (p0, x0 + svcnth ()), ++ z0 = svld4 (p0, x0 + svcnth ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_f16_2: ++** incb x0, all, mul #2 ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_f16_2, svfloat16x4_t, float16_t, ++ z0 = svld4_f16 (p0, x0 + svcnth () * 2), ++ z0 = svld4 (p0, x0 + svcnth () * 2)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_f16_3: ++** incb x0, all, mul #3 ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_f16_3, svfloat16x4_t, float16_t, ++ z0 = svld4_f16 (p0, x0 + svcnth () * 3), ++ z0 = svld4 (p0, x0 + svcnth () * 3)) ++ ++/* ++** ld4_f16_4: ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #4, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_f16_4, svfloat16x4_t, float16_t, ++ z0 = svld4_f16 (p0, x0 + svcnth () * 4), ++ z0 = svld4 (p0, x0 + svcnth () * 4)) ++ ++/* ++** ld4_f16_28: ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #28, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_f16_28, svfloat16x4_t, float16_t, ++ z0 = svld4_f16 (p0, x0 + svcnth () * 28), ++ z0 = svld4 (p0, x0 + svcnth () * 28)) ++ ++/* ++** ld4_f16_32: ++** [^{]* ++** ld4h {z0\.h - z3\.h}, p0/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_LOAD (ld4_f16_32, svfloat16x4_t, float16_t, ++ z0 = svld4_f16 (p0, x0 + svcnth () * 32), ++ z0 = svld4 (p0, x0 + svcnth () * 32)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_f16_m1: ++** decb x0 ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_f16_m1, svfloat16x4_t, float16_t, ++ z0 = svld4_f16 (p0, x0 - svcnth ()), ++ z0 = svld4 (p0, x0 - svcnth ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_f16_m2: ++** decb x0, all, mul #2 ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_f16_m2, svfloat16x4_t, float16_t, ++ z0 = svld4_f16 (p0, x0 - svcnth () * 2), ++ z0 = svld4 (p0, x0 - svcnth () * 2)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld4_f16_m3: ++** decb x0, all, mul #3 ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_f16_m3, svfloat16x4_t, float16_t, ++ z0 = svld4_f16 (p0, x0 - svcnth () * 3), ++ z0 = svld4 (p0, x0 - svcnth () * 3)) ++ ++/* ++** ld4_f16_m4: ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #-4, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_f16_m4, svfloat16x4_t, float16_t, ++ z0 = svld4_f16 (p0, x0 - svcnth () * 4), ++ z0 = svld4 (p0, x0 - svcnth () * 4)) ++ ++/* ++** ld4_f16_m32: ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #-32, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_f16_m32, svfloat16x4_t, float16_t, ++ z0 = svld4_f16 (p0, x0 - svcnth () * 32), ++ z0 = svld4 (p0, x0 - svcnth () * 32)) ++ ++/* ++** ld4_f16_m36: ++** [^{]* ++** ld4h {z0\.h - z3\.h}, p0/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_LOAD (ld4_f16_m36, svfloat16x4_t, float16_t, ++ z0 = svld4_f16 (p0, x0 - svcnth () * 36), ++ z0 = svld4 (p0, x0 - svcnth () * 36)) ++ ++/* ++** ld4_vnum_f16_0: ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_f16_0, svfloat16x4_t, float16_t, ++ z0 = svld4_vnum_f16 (p0, x0, 0), ++ z0 = svld4_vnum (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_f16_1: ++** incb x0 ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_f16_1, svfloat16x4_t, float16_t, ++ z0 = svld4_vnum_f16 (p0, x0, 1), ++ z0 = svld4_vnum (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_f16_2: ++** incb x0, all, mul #2 ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_f16_2, svfloat16x4_t, float16_t, ++ z0 = svld4_vnum_f16 (p0, x0, 2), ++ z0 = svld4_vnum (p0, x0, 2)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_f16_3: ++** incb x0, all, mul #3 ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_f16_3, svfloat16x4_t, float16_t, ++ z0 = svld4_vnum_f16 (p0, x0, 3), ++ z0 = svld4_vnum (p0, x0, 3)) ++ ++/* ++** ld4_vnum_f16_4: ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #4, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_f16_4, svfloat16x4_t, float16_t, ++ z0 = svld4_vnum_f16 (p0, x0, 4), ++ z0 = svld4_vnum (p0, x0, 4)) ++ ++/* ++** ld4_vnum_f16_28: ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #28, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_f16_28, svfloat16x4_t, float16_t, ++ z0 = svld4_vnum_f16 (p0, x0, 28), ++ z0 = svld4_vnum (p0, x0, 28)) ++ ++/* ++** ld4_vnum_f16_32: ++** [^{]* ++** ld4h {z0\.h - z3\.h}, p0/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_f16_32, svfloat16x4_t, float16_t, ++ z0 = svld4_vnum_f16 (p0, x0, 32), ++ z0 = svld4_vnum (p0, x0, 32)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_f16_m1: ++** decb x0 ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_f16_m1, svfloat16x4_t, float16_t, ++ z0 = svld4_vnum_f16 (p0, x0, -1), ++ z0 = svld4_vnum (p0, x0, -1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_f16_m2: ++** decb x0, all, mul #2 ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_f16_m2, svfloat16x4_t, float16_t, ++ z0 = svld4_vnum_f16 (p0, x0, -2), ++ z0 = svld4_vnum (p0, x0, -2)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld4_vnum_f16_m3: ++** decb x0, all, mul #3 ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_f16_m3, svfloat16x4_t, float16_t, ++ z0 = svld4_vnum_f16 (p0, x0, -3), ++ z0 = svld4_vnum (p0, x0, -3)) ++ ++/* ++** ld4_vnum_f16_m4: ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #-4, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_f16_m4, svfloat16x4_t, float16_t, ++ z0 = svld4_vnum_f16 (p0, x0, -4), ++ z0 = svld4_vnum (p0, x0, -4)) ++ ++/* ++** ld4_vnum_f16_m32: ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #-32, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_f16_m32, svfloat16x4_t, float16_t, ++ z0 = svld4_vnum_f16 (p0, x0, -32), ++ z0 = svld4_vnum (p0, x0, -32)) ++ ++/* ++** ld4_vnum_f16_m36: ++** [^{]* ++** ld4h {z0\.h - z3\.h}, p0/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_f16_m36, svfloat16x4_t, float16_t, ++ z0 = svld4_vnum_f16 (p0, x0, -36), ++ z0 = svld4_vnum (p0, x0, -36)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ld4_vnum_f16_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld4h {z0\.h - z3\.h}, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_f16_x1, svfloat16x4_t, float16_t, ++ z0 = svld4_vnum_f16 (p0, x0, x1), ++ z0 = svld4_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_f32.c +new file mode 100644 +index 000000000..a433d1ffe +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_f32.c +@@ -0,0 +1,286 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld4_f32_base: ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_f32_base, svfloat32x4_t, float32_t, ++ z0 = svld4_f32 (p0, x0), ++ z0 = svld4 (p0, x0)) ++ ++/* ++** ld4_f32_index: ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_LOAD (ld4_f32_index, svfloat32x4_t, float32_t, ++ z0 = svld4_f32 (p0, x0 + x1), ++ z0 = svld4 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_f32_1: ++** incb x0 ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_f32_1, svfloat32x4_t, float32_t, ++ z0 = svld4_f32 (p0, x0 + svcntw ()), ++ z0 = svld4 (p0, x0 + svcntw ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_f32_2: ++** incb x0, all, mul #2 ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_f32_2, svfloat32x4_t, float32_t, ++ z0 = svld4_f32 (p0, x0 + svcntw () * 2), ++ z0 = svld4 (p0, x0 + svcntw () * 2)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld4_f32_3: ++** incb x0, all, mul #3 ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_f32_3, svfloat32x4_t, float32_t, ++ z0 = svld4_f32 (p0, x0 + svcntw () * 3), ++ z0 = svld4 (p0, x0 + svcntw () * 3)) ++ ++/* ++** ld4_f32_4: ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0, #4, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_f32_4, svfloat32x4_t, float32_t, ++ z0 = svld4_f32 (p0, x0 + svcntw () * 4), ++ z0 = svld4 (p0, x0 + svcntw () * 4)) ++ ++/* ++** ld4_f32_28: ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0, #28, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_f32_28, svfloat32x4_t, float32_t, ++ z0 = svld4_f32 (p0, x0 + svcntw () * 28), ++ z0 = svld4 (p0, x0 + svcntw () * 28)) ++ ++/* ++** ld4_f32_32: ++** [^{]* ++** ld4w {z0\.s - z3\.s}, p0/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_LOAD (ld4_f32_32, svfloat32x4_t, float32_t, ++ z0 = svld4_f32 (p0, x0 + svcntw () * 32), ++ z0 = svld4 (p0, x0 + svcntw () * 32)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_f32_m1: ++** decb x0 ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_f32_m1, svfloat32x4_t, float32_t, ++ z0 = svld4_f32 (p0, x0 - svcntw ()), ++ z0 = svld4 (p0, x0 - svcntw ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_f32_m2: ++** decb x0, all, mul #2 ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_f32_m2, svfloat32x4_t, float32_t, ++ z0 = svld4_f32 (p0, x0 - svcntw () * 2), ++ z0 = svld4 (p0, x0 - svcntw () * 2)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_f32_m3: ++** decb x0, all, mul #3 ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_f32_m3, svfloat32x4_t, float32_t, ++ z0 = svld4_f32 (p0, x0 - svcntw () * 3), ++ z0 = svld4 (p0, x0 - svcntw () * 3)) ++ ++/* ++** ld4_f32_m4: ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0, #-4, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_f32_m4, svfloat32x4_t, float32_t, ++ z0 = svld4_f32 (p0, x0 - svcntw () * 4), ++ z0 = svld4 (p0, x0 - svcntw () * 4)) ++ ++/* ++** ld4_f32_m32: ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0, #-32, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_f32_m32, svfloat32x4_t, float32_t, ++ z0 = svld4_f32 (p0, x0 - svcntw () * 32), ++ z0 = svld4 (p0, x0 - svcntw () * 32)) ++ ++/* ++** ld4_f32_m36: ++** [^{]* ++** ld4w {z0\.s - z3\.s}, p0/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_LOAD (ld4_f32_m36, svfloat32x4_t, float32_t, ++ z0 = svld4_f32 (p0, x0 - svcntw () * 36), ++ z0 = svld4 (p0, x0 - svcntw () * 36)) ++ ++/* ++** ld4_vnum_f32_0: ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_f32_0, svfloat32x4_t, float32_t, ++ z0 = svld4_vnum_f32 (p0, x0, 0), ++ z0 = svld4_vnum (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_f32_1: ++** incb x0 ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_f32_1, svfloat32x4_t, float32_t, ++ z0 = svld4_vnum_f32 (p0, x0, 1), ++ z0 = svld4_vnum (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_f32_2: ++** incb x0, all, mul #2 ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_f32_2, svfloat32x4_t, float32_t, ++ z0 = svld4_vnum_f32 (p0, x0, 2), ++ z0 = svld4_vnum (p0, x0, 2)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld4_vnum_f32_3: ++** incb x0, all, mul #3 ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_f32_3, svfloat32x4_t, float32_t, ++ z0 = svld4_vnum_f32 (p0, x0, 3), ++ z0 = svld4_vnum (p0, x0, 3)) ++ ++/* ++** ld4_vnum_f32_4: ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0, #4, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_f32_4, svfloat32x4_t, float32_t, ++ z0 = svld4_vnum_f32 (p0, x0, 4), ++ z0 = svld4_vnum (p0, x0, 4)) ++ ++/* ++** ld4_vnum_f32_28: ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0, #28, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_f32_28, svfloat32x4_t, float32_t, ++ z0 = svld4_vnum_f32 (p0, x0, 28), ++ z0 = svld4_vnum (p0, x0, 28)) ++ ++/* ++** ld4_vnum_f32_32: ++** [^{]* ++** ld4w {z0\.s - z3\.s}, p0/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_f32_32, svfloat32x4_t, float32_t, ++ z0 = svld4_vnum_f32 (p0, x0, 32), ++ z0 = svld4_vnum (p0, x0, 32)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_f32_m1: ++** decb x0 ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_f32_m1, svfloat32x4_t, float32_t, ++ z0 = svld4_vnum_f32 (p0, x0, -1), ++ z0 = svld4_vnum (p0, x0, -1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_f32_m2: ++** decb x0, all, mul #2 ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_f32_m2, svfloat32x4_t, float32_t, ++ z0 = svld4_vnum_f32 (p0, x0, -2), ++ z0 = svld4_vnum (p0, x0, -2)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_f32_m3: ++** decb x0, all, mul #3 ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_f32_m3, svfloat32x4_t, float32_t, ++ z0 = svld4_vnum_f32 (p0, x0, -3), ++ z0 = svld4_vnum (p0, x0, -3)) ++ ++/* ++** ld4_vnum_f32_m4: ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0, #-4, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_f32_m4, svfloat32x4_t, float32_t, ++ z0 = svld4_vnum_f32 (p0, x0, -4), ++ z0 = svld4_vnum (p0, x0, -4)) ++ ++/* ++** ld4_vnum_f32_m32: ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0, #-32, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_f32_m32, svfloat32x4_t, float32_t, ++ z0 = svld4_vnum_f32 (p0, x0, -32), ++ z0 = svld4_vnum (p0, x0, -32)) ++ ++/* ++** ld4_vnum_f32_m36: ++** [^{]* ++** ld4w {z0\.s - z3\.s}, p0/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_f32_m36, svfloat32x4_t, float32_t, ++ z0 = svld4_vnum_f32 (p0, x0, -36), ++ z0 = svld4_vnum (p0, x0, -36)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ld4_vnum_f32_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld4w {z0\.s - z3\.s}, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_f32_x1, svfloat32x4_t, float32_t, ++ z0 = svld4_vnum_f32 (p0, x0, x1), ++ z0 = svld4_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_f64.c +new file mode 100644 +index 000000000..bb18decec +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_f64.c +@@ -0,0 +1,286 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld4_f64_base: ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_f64_base, svfloat64x4_t, float64_t, ++ z0 = svld4_f64 (p0, x0), ++ z0 = svld4 (p0, x0)) ++ ++/* ++** ld4_f64_index: ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0, x1, lsl 3\] ++** ret ++*/ ++TEST_LOAD (ld4_f64_index, svfloat64x4_t, float64_t, ++ z0 = svld4_f64 (p0, x0 + x1), ++ z0 = svld4 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_f64_1: ++** incb x0 ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_f64_1, svfloat64x4_t, float64_t, ++ z0 = svld4_f64 (p0, x0 + svcntd ()), ++ z0 = svld4 (p0, x0 + svcntd ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_f64_2: ++** incb x0, all, mul #2 ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_f64_2, svfloat64x4_t, float64_t, ++ z0 = svld4_f64 (p0, x0 + svcntd () * 2), ++ z0 = svld4 (p0, x0 + svcntd () * 2)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_f64_3: ++** incb x0, all, mul #3 ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_f64_3, svfloat64x4_t, float64_t, ++ z0 = svld4_f64 (p0, x0 + svcntd () * 3), ++ z0 = svld4 (p0, x0 + svcntd () * 3)) ++ ++/* ++** ld4_f64_4: ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0, #4, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_f64_4, svfloat64x4_t, float64_t, ++ z0 = svld4_f64 (p0, x0 + svcntd () * 4), ++ z0 = svld4 (p0, x0 + svcntd () * 4)) ++ ++/* ++** ld4_f64_28: ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0, #28, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_f64_28, svfloat64x4_t, float64_t, ++ z0 = svld4_f64 (p0, x0 + svcntd () * 28), ++ z0 = svld4 (p0, x0 + svcntd () * 28)) ++ ++/* ++** ld4_f64_32: ++** [^{]* ++** ld4d {z0\.d - z3\.d}, p0/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_LOAD (ld4_f64_32, svfloat64x4_t, float64_t, ++ z0 = svld4_f64 (p0, x0 + svcntd () * 32), ++ z0 = svld4 (p0, x0 + svcntd () * 32)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_f64_m1: ++** decb x0 ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_f64_m1, svfloat64x4_t, float64_t, ++ z0 = svld4_f64 (p0, x0 - svcntd ()), ++ z0 = svld4 (p0, x0 - svcntd ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_f64_m2: ++** decb x0, all, mul #2 ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_f64_m2, svfloat64x4_t, float64_t, ++ z0 = svld4_f64 (p0, x0 - svcntd () * 2), ++ z0 = svld4 (p0, x0 - svcntd () * 2)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld4_f64_m3: ++** decb x0, all, mul #3 ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_f64_m3, svfloat64x4_t, float64_t, ++ z0 = svld4_f64 (p0, x0 - svcntd () * 3), ++ z0 = svld4 (p0, x0 - svcntd () * 3)) ++ ++/* ++** ld4_f64_m4: ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0, #-4, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_f64_m4, svfloat64x4_t, float64_t, ++ z0 = svld4_f64 (p0, x0 - svcntd () * 4), ++ z0 = svld4 (p0, x0 - svcntd () * 4)) ++ ++/* ++** ld4_f64_m32: ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0, #-32, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_f64_m32, svfloat64x4_t, float64_t, ++ z0 = svld4_f64 (p0, x0 - svcntd () * 32), ++ z0 = svld4 (p0, x0 - svcntd () * 32)) ++ ++/* ++** ld4_f64_m36: ++** [^{]* ++** ld4d {z0\.d - z3\.d}, p0/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_LOAD (ld4_f64_m36, svfloat64x4_t, float64_t, ++ z0 = svld4_f64 (p0, x0 - svcntd () * 36), ++ z0 = svld4 (p0, x0 - svcntd () * 36)) ++ ++/* ++** ld4_vnum_f64_0: ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_f64_0, svfloat64x4_t, float64_t, ++ z0 = svld4_vnum_f64 (p0, x0, 0), ++ z0 = svld4_vnum (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_f64_1: ++** incb x0 ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_f64_1, svfloat64x4_t, float64_t, ++ z0 = svld4_vnum_f64 (p0, x0, 1), ++ z0 = svld4_vnum (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_f64_2: ++** incb x0, all, mul #2 ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_f64_2, svfloat64x4_t, float64_t, ++ z0 = svld4_vnum_f64 (p0, x0, 2), ++ z0 = svld4_vnum (p0, x0, 2)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_f64_3: ++** incb x0, all, mul #3 ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_f64_3, svfloat64x4_t, float64_t, ++ z0 = svld4_vnum_f64 (p0, x0, 3), ++ z0 = svld4_vnum (p0, x0, 3)) ++ ++/* ++** ld4_vnum_f64_4: ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0, #4, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_f64_4, svfloat64x4_t, float64_t, ++ z0 = svld4_vnum_f64 (p0, x0, 4), ++ z0 = svld4_vnum (p0, x0, 4)) ++ ++/* ++** ld4_vnum_f64_28: ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0, #28, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_f64_28, svfloat64x4_t, float64_t, ++ z0 = svld4_vnum_f64 (p0, x0, 28), ++ z0 = svld4_vnum (p0, x0, 28)) ++ ++/* ++** ld4_vnum_f64_32: ++** [^{]* ++** ld4d {z0\.d - z3\.d}, p0/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_f64_32, svfloat64x4_t, float64_t, ++ z0 = svld4_vnum_f64 (p0, x0, 32), ++ z0 = svld4_vnum (p0, x0, 32)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_f64_m1: ++** decb x0 ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_f64_m1, svfloat64x4_t, float64_t, ++ z0 = svld4_vnum_f64 (p0, x0, -1), ++ z0 = svld4_vnum (p0, x0, -1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_f64_m2: ++** decb x0, all, mul #2 ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_f64_m2, svfloat64x4_t, float64_t, ++ z0 = svld4_vnum_f64 (p0, x0, -2), ++ z0 = svld4_vnum (p0, x0, -2)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld4_vnum_f64_m3: ++** decb x0, all, mul #3 ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_f64_m3, svfloat64x4_t, float64_t, ++ z0 = svld4_vnum_f64 (p0, x0, -3), ++ z0 = svld4_vnum (p0, x0, -3)) ++ ++/* ++** ld4_vnum_f64_m4: ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0, #-4, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_f64_m4, svfloat64x4_t, float64_t, ++ z0 = svld4_vnum_f64 (p0, x0, -4), ++ z0 = svld4_vnum (p0, x0, -4)) ++ ++/* ++** ld4_vnum_f64_m32: ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0, #-32, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_f64_m32, svfloat64x4_t, float64_t, ++ z0 = svld4_vnum_f64 (p0, x0, -32), ++ z0 = svld4_vnum (p0, x0, -32)) ++ ++/* ++** ld4_vnum_f64_m36: ++** [^{]* ++** ld4d {z0\.d - z3\.d}, p0/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_f64_m36, svfloat64x4_t, float64_t, ++ z0 = svld4_vnum_f64 (p0, x0, -36), ++ z0 = svld4_vnum (p0, x0, -36)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ld4_vnum_f64_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld4d {z0\.d - z3\.d}, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_f64_x1, svfloat64x4_t, float64_t, ++ z0 = svld4_vnum_f64 (p0, x0, x1), ++ z0 = svld4_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_s16.c +new file mode 100644 +index 000000000..15fb1b595 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_s16.c +@@ -0,0 +1,286 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld4_s16_base: ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_s16_base, svint16x4_t, int16_t, ++ z0 = svld4_s16 (p0, x0), ++ z0 = svld4 (p0, x0)) ++ ++/* ++** ld4_s16_index: ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_LOAD (ld4_s16_index, svint16x4_t, int16_t, ++ z0 = svld4_s16 (p0, x0 + x1), ++ z0 = svld4 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_s16_1: ++** incb x0 ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_s16_1, svint16x4_t, int16_t, ++ z0 = svld4_s16 (p0, x0 + svcnth ()), ++ z0 = svld4 (p0, x0 + svcnth ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_s16_2: ++** incb x0, all, mul #2 ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_s16_2, svint16x4_t, int16_t, ++ z0 = svld4_s16 (p0, x0 + svcnth () * 2), ++ z0 = svld4 (p0, x0 + svcnth () * 2)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld4_s16_3: ++** incb x0, all, mul #3 ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_s16_3, svint16x4_t, int16_t, ++ z0 = svld4_s16 (p0, x0 + svcnth () * 3), ++ z0 = svld4 (p0, x0 + svcnth () * 3)) ++ ++/* ++** ld4_s16_4: ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #4, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_s16_4, svint16x4_t, int16_t, ++ z0 = svld4_s16 (p0, x0 + svcnth () * 4), ++ z0 = svld4 (p0, x0 + svcnth () * 4)) ++ ++/* ++** ld4_s16_28: ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #28, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_s16_28, svint16x4_t, int16_t, ++ z0 = svld4_s16 (p0, x0 + svcnth () * 28), ++ z0 = svld4 (p0, x0 + svcnth () * 28)) ++ ++/* ++** ld4_s16_32: ++** [^{]* ++** ld4h {z0\.h - z3\.h}, p0/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_LOAD (ld4_s16_32, svint16x4_t, int16_t, ++ z0 = svld4_s16 (p0, x0 + svcnth () * 32), ++ z0 = svld4 (p0, x0 + svcnth () * 32)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_s16_m1: ++** decb x0 ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_s16_m1, svint16x4_t, int16_t, ++ z0 = svld4_s16 (p0, x0 - svcnth ()), ++ z0 = svld4 (p0, x0 - svcnth ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_s16_m2: ++** decb x0, all, mul #2 ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_s16_m2, svint16x4_t, int16_t, ++ z0 = svld4_s16 (p0, x0 - svcnth () * 2), ++ z0 = svld4 (p0, x0 - svcnth () * 2)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_s16_m3: ++** decb x0, all, mul #3 ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_s16_m3, svint16x4_t, int16_t, ++ z0 = svld4_s16 (p0, x0 - svcnth () * 3), ++ z0 = svld4 (p0, x0 - svcnth () * 3)) ++ ++/* ++** ld4_s16_m4: ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #-4, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_s16_m4, svint16x4_t, int16_t, ++ z0 = svld4_s16 (p0, x0 - svcnth () * 4), ++ z0 = svld4 (p0, x0 - svcnth () * 4)) ++ ++/* ++** ld4_s16_m32: ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #-32, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_s16_m32, svint16x4_t, int16_t, ++ z0 = svld4_s16 (p0, x0 - svcnth () * 32), ++ z0 = svld4 (p0, x0 - svcnth () * 32)) ++ ++/* ++** ld4_s16_m36: ++** [^{]* ++** ld4h {z0\.h - z3\.h}, p0/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_LOAD (ld4_s16_m36, svint16x4_t, int16_t, ++ z0 = svld4_s16 (p0, x0 - svcnth () * 36), ++ z0 = svld4 (p0, x0 - svcnth () * 36)) ++ ++/* ++** ld4_vnum_s16_0: ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s16_0, svint16x4_t, int16_t, ++ z0 = svld4_vnum_s16 (p0, x0, 0), ++ z0 = svld4_vnum (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_s16_1: ++** incb x0 ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s16_1, svint16x4_t, int16_t, ++ z0 = svld4_vnum_s16 (p0, x0, 1), ++ z0 = svld4_vnum (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_s16_2: ++** incb x0, all, mul #2 ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s16_2, svint16x4_t, int16_t, ++ z0 = svld4_vnum_s16 (p0, x0, 2), ++ z0 = svld4_vnum (p0, x0, 2)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld4_vnum_s16_3: ++** incb x0, all, mul #3 ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s16_3, svint16x4_t, int16_t, ++ z0 = svld4_vnum_s16 (p0, x0, 3), ++ z0 = svld4_vnum (p0, x0, 3)) ++ ++/* ++** ld4_vnum_s16_4: ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #4, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s16_4, svint16x4_t, int16_t, ++ z0 = svld4_vnum_s16 (p0, x0, 4), ++ z0 = svld4_vnum (p0, x0, 4)) ++ ++/* ++** ld4_vnum_s16_28: ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #28, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s16_28, svint16x4_t, int16_t, ++ z0 = svld4_vnum_s16 (p0, x0, 28), ++ z0 = svld4_vnum (p0, x0, 28)) ++ ++/* ++** ld4_vnum_s16_32: ++** [^{]* ++** ld4h {z0\.h - z3\.h}, p0/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s16_32, svint16x4_t, int16_t, ++ z0 = svld4_vnum_s16 (p0, x0, 32), ++ z0 = svld4_vnum (p0, x0, 32)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_s16_m1: ++** decb x0 ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s16_m1, svint16x4_t, int16_t, ++ z0 = svld4_vnum_s16 (p0, x0, -1), ++ z0 = svld4_vnum (p0, x0, -1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_s16_m2: ++** decb x0, all, mul #2 ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s16_m2, svint16x4_t, int16_t, ++ z0 = svld4_vnum_s16 (p0, x0, -2), ++ z0 = svld4_vnum (p0, x0, -2)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_s16_m3: ++** decb x0, all, mul #3 ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s16_m3, svint16x4_t, int16_t, ++ z0 = svld4_vnum_s16 (p0, x0, -3), ++ z0 = svld4_vnum (p0, x0, -3)) ++ ++/* ++** ld4_vnum_s16_m4: ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #-4, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s16_m4, svint16x4_t, int16_t, ++ z0 = svld4_vnum_s16 (p0, x0, -4), ++ z0 = svld4_vnum (p0, x0, -4)) ++ ++/* ++** ld4_vnum_s16_m32: ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #-32, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s16_m32, svint16x4_t, int16_t, ++ z0 = svld4_vnum_s16 (p0, x0, -32), ++ z0 = svld4_vnum (p0, x0, -32)) ++ ++/* ++** ld4_vnum_s16_m36: ++** [^{]* ++** ld4h {z0\.h - z3\.h}, p0/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s16_m36, svint16x4_t, int16_t, ++ z0 = svld4_vnum_s16 (p0, x0, -36), ++ z0 = svld4_vnum (p0, x0, -36)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ld4_vnum_s16_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld4h {z0\.h - z3\.h}, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s16_x1, svint16x4_t, int16_t, ++ z0 = svld4_vnum_s16 (p0, x0, x1), ++ z0 = svld4_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_s32.c +new file mode 100644 +index 000000000..81c67710f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_s32.c +@@ -0,0 +1,286 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld4_s32_base: ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_s32_base, svint32x4_t, int32_t, ++ z0 = svld4_s32 (p0, x0), ++ z0 = svld4 (p0, x0)) ++ ++/* ++** ld4_s32_index: ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_LOAD (ld4_s32_index, svint32x4_t, int32_t, ++ z0 = svld4_s32 (p0, x0 + x1), ++ z0 = svld4 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_s32_1: ++** incb x0 ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_s32_1, svint32x4_t, int32_t, ++ z0 = svld4_s32 (p0, x0 + svcntw ()), ++ z0 = svld4 (p0, x0 + svcntw ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_s32_2: ++** incb x0, all, mul #2 ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_s32_2, svint32x4_t, int32_t, ++ z0 = svld4_s32 (p0, x0 + svcntw () * 2), ++ z0 = svld4 (p0, x0 + svcntw () * 2)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_s32_3: ++** incb x0, all, mul #3 ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_s32_3, svint32x4_t, int32_t, ++ z0 = svld4_s32 (p0, x0 + svcntw () * 3), ++ z0 = svld4 (p0, x0 + svcntw () * 3)) ++ ++/* ++** ld4_s32_4: ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0, #4, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_s32_4, svint32x4_t, int32_t, ++ z0 = svld4_s32 (p0, x0 + svcntw () * 4), ++ z0 = svld4 (p0, x0 + svcntw () * 4)) ++ ++/* ++** ld4_s32_28: ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0, #28, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_s32_28, svint32x4_t, int32_t, ++ z0 = svld4_s32 (p0, x0 + svcntw () * 28), ++ z0 = svld4 (p0, x0 + svcntw () * 28)) ++ ++/* ++** ld4_s32_32: ++** [^{]* ++** ld4w {z0\.s - z3\.s}, p0/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_LOAD (ld4_s32_32, svint32x4_t, int32_t, ++ z0 = svld4_s32 (p0, x0 + svcntw () * 32), ++ z0 = svld4 (p0, x0 + svcntw () * 32)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_s32_m1: ++** decb x0 ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_s32_m1, svint32x4_t, int32_t, ++ z0 = svld4_s32 (p0, x0 - svcntw ()), ++ z0 = svld4 (p0, x0 - svcntw ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_s32_m2: ++** decb x0, all, mul #2 ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_s32_m2, svint32x4_t, int32_t, ++ z0 = svld4_s32 (p0, x0 - svcntw () * 2), ++ z0 = svld4 (p0, x0 - svcntw () * 2)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld4_s32_m3: ++** decb x0, all, mul #3 ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_s32_m3, svint32x4_t, int32_t, ++ z0 = svld4_s32 (p0, x0 - svcntw () * 3), ++ z0 = svld4 (p0, x0 - svcntw () * 3)) ++ ++/* ++** ld4_s32_m4: ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0, #-4, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_s32_m4, svint32x4_t, int32_t, ++ z0 = svld4_s32 (p0, x0 - svcntw () * 4), ++ z0 = svld4 (p0, x0 - svcntw () * 4)) ++ ++/* ++** ld4_s32_m32: ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0, #-32, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_s32_m32, svint32x4_t, int32_t, ++ z0 = svld4_s32 (p0, x0 - svcntw () * 32), ++ z0 = svld4 (p0, x0 - svcntw () * 32)) ++ ++/* ++** ld4_s32_m36: ++** [^{]* ++** ld4w {z0\.s - z3\.s}, p0/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_LOAD (ld4_s32_m36, svint32x4_t, int32_t, ++ z0 = svld4_s32 (p0, x0 - svcntw () * 36), ++ z0 = svld4 (p0, x0 - svcntw () * 36)) ++ ++/* ++** ld4_vnum_s32_0: ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s32_0, svint32x4_t, int32_t, ++ z0 = svld4_vnum_s32 (p0, x0, 0), ++ z0 = svld4_vnum (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_s32_1: ++** incb x0 ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s32_1, svint32x4_t, int32_t, ++ z0 = svld4_vnum_s32 (p0, x0, 1), ++ z0 = svld4_vnum (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_s32_2: ++** incb x0, all, mul #2 ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s32_2, svint32x4_t, int32_t, ++ z0 = svld4_vnum_s32 (p0, x0, 2), ++ z0 = svld4_vnum (p0, x0, 2)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_s32_3: ++** incb x0, all, mul #3 ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s32_3, svint32x4_t, int32_t, ++ z0 = svld4_vnum_s32 (p0, x0, 3), ++ z0 = svld4_vnum (p0, x0, 3)) ++ ++/* ++** ld4_vnum_s32_4: ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0, #4, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s32_4, svint32x4_t, int32_t, ++ z0 = svld4_vnum_s32 (p0, x0, 4), ++ z0 = svld4_vnum (p0, x0, 4)) ++ ++/* ++** ld4_vnum_s32_28: ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0, #28, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s32_28, svint32x4_t, int32_t, ++ z0 = svld4_vnum_s32 (p0, x0, 28), ++ z0 = svld4_vnum (p0, x0, 28)) ++ ++/* ++** ld4_vnum_s32_32: ++** [^{]* ++** ld4w {z0\.s - z3\.s}, p0/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s32_32, svint32x4_t, int32_t, ++ z0 = svld4_vnum_s32 (p0, x0, 32), ++ z0 = svld4_vnum (p0, x0, 32)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_s32_m1: ++** decb x0 ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s32_m1, svint32x4_t, int32_t, ++ z0 = svld4_vnum_s32 (p0, x0, -1), ++ z0 = svld4_vnum (p0, x0, -1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_s32_m2: ++** decb x0, all, mul #2 ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s32_m2, svint32x4_t, int32_t, ++ z0 = svld4_vnum_s32 (p0, x0, -2), ++ z0 = svld4_vnum (p0, x0, -2)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld4_vnum_s32_m3: ++** decb x0, all, mul #3 ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s32_m3, svint32x4_t, int32_t, ++ z0 = svld4_vnum_s32 (p0, x0, -3), ++ z0 = svld4_vnum (p0, x0, -3)) ++ ++/* ++** ld4_vnum_s32_m4: ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0, #-4, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s32_m4, svint32x4_t, int32_t, ++ z0 = svld4_vnum_s32 (p0, x0, -4), ++ z0 = svld4_vnum (p0, x0, -4)) ++ ++/* ++** ld4_vnum_s32_m32: ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0, #-32, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s32_m32, svint32x4_t, int32_t, ++ z0 = svld4_vnum_s32 (p0, x0, -32), ++ z0 = svld4_vnum (p0, x0, -32)) ++ ++/* ++** ld4_vnum_s32_m36: ++** [^{]* ++** ld4w {z0\.s - z3\.s}, p0/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s32_m36, svint32x4_t, int32_t, ++ z0 = svld4_vnum_s32 (p0, x0, -36), ++ z0 = svld4_vnum (p0, x0, -36)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ld4_vnum_s32_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld4w {z0\.s - z3\.s}, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s32_x1, svint32x4_t, int32_t, ++ z0 = svld4_vnum_s32 (p0, x0, x1), ++ z0 = svld4_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_s64.c +new file mode 100644 +index 000000000..d24c30dcf +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_s64.c +@@ -0,0 +1,286 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld4_s64_base: ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_s64_base, svint64x4_t, int64_t, ++ z0 = svld4_s64 (p0, x0), ++ z0 = svld4 (p0, x0)) ++ ++/* ++** ld4_s64_index: ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0, x1, lsl 3\] ++** ret ++*/ ++TEST_LOAD (ld4_s64_index, svint64x4_t, int64_t, ++ z0 = svld4_s64 (p0, x0 + x1), ++ z0 = svld4 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_s64_1: ++** incb x0 ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_s64_1, svint64x4_t, int64_t, ++ z0 = svld4_s64 (p0, x0 + svcntd ()), ++ z0 = svld4 (p0, x0 + svcntd ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_s64_2: ++** incb x0, all, mul #2 ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_s64_2, svint64x4_t, int64_t, ++ z0 = svld4_s64 (p0, x0 + svcntd () * 2), ++ z0 = svld4 (p0, x0 + svcntd () * 2)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld4_s64_3: ++** incb x0, all, mul #3 ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_s64_3, svint64x4_t, int64_t, ++ z0 = svld4_s64 (p0, x0 + svcntd () * 3), ++ z0 = svld4 (p0, x0 + svcntd () * 3)) ++ ++/* ++** ld4_s64_4: ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0, #4, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_s64_4, svint64x4_t, int64_t, ++ z0 = svld4_s64 (p0, x0 + svcntd () * 4), ++ z0 = svld4 (p0, x0 + svcntd () * 4)) ++ ++/* ++** ld4_s64_28: ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0, #28, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_s64_28, svint64x4_t, int64_t, ++ z0 = svld4_s64 (p0, x0 + svcntd () * 28), ++ z0 = svld4 (p0, x0 + svcntd () * 28)) ++ ++/* ++** ld4_s64_32: ++** [^{]* ++** ld4d {z0\.d - z3\.d}, p0/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_LOAD (ld4_s64_32, svint64x4_t, int64_t, ++ z0 = svld4_s64 (p0, x0 + svcntd () * 32), ++ z0 = svld4 (p0, x0 + svcntd () * 32)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_s64_m1: ++** decb x0 ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_s64_m1, svint64x4_t, int64_t, ++ z0 = svld4_s64 (p0, x0 - svcntd ()), ++ z0 = svld4 (p0, x0 - svcntd ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_s64_m2: ++** decb x0, all, mul #2 ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_s64_m2, svint64x4_t, int64_t, ++ z0 = svld4_s64 (p0, x0 - svcntd () * 2), ++ z0 = svld4 (p0, x0 - svcntd () * 2)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_s64_m3: ++** decb x0, all, mul #3 ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_s64_m3, svint64x4_t, int64_t, ++ z0 = svld4_s64 (p0, x0 - svcntd () * 3), ++ z0 = svld4 (p0, x0 - svcntd () * 3)) ++ ++/* ++** ld4_s64_m4: ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0, #-4, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_s64_m4, svint64x4_t, int64_t, ++ z0 = svld4_s64 (p0, x0 - svcntd () * 4), ++ z0 = svld4 (p0, x0 - svcntd () * 4)) ++ ++/* ++** ld4_s64_m32: ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0, #-32, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_s64_m32, svint64x4_t, int64_t, ++ z0 = svld4_s64 (p0, x0 - svcntd () * 32), ++ z0 = svld4 (p0, x0 - svcntd () * 32)) ++ ++/* ++** ld4_s64_m36: ++** [^{]* ++** ld4d {z0\.d - z3\.d}, p0/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_LOAD (ld4_s64_m36, svint64x4_t, int64_t, ++ z0 = svld4_s64 (p0, x0 - svcntd () * 36), ++ z0 = svld4 (p0, x0 - svcntd () * 36)) ++ ++/* ++** ld4_vnum_s64_0: ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s64_0, svint64x4_t, int64_t, ++ z0 = svld4_vnum_s64 (p0, x0, 0), ++ z0 = svld4_vnum (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_s64_1: ++** incb x0 ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s64_1, svint64x4_t, int64_t, ++ z0 = svld4_vnum_s64 (p0, x0, 1), ++ z0 = svld4_vnum (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_s64_2: ++** incb x0, all, mul #2 ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s64_2, svint64x4_t, int64_t, ++ z0 = svld4_vnum_s64 (p0, x0, 2), ++ z0 = svld4_vnum (p0, x0, 2)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld4_vnum_s64_3: ++** incb x0, all, mul #3 ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s64_3, svint64x4_t, int64_t, ++ z0 = svld4_vnum_s64 (p0, x0, 3), ++ z0 = svld4_vnum (p0, x0, 3)) ++ ++/* ++** ld4_vnum_s64_4: ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0, #4, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s64_4, svint64x4_t, int64_t, ++ z0 = svld4_vnum_s64 (p0, x0, 4), ++ z0 = svld4_vnum (p0, x0, 4)) ++ ++/* ++** ld4_vnum_s64_28: ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0, #28, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s64_28, svint64x4_t, int64_t, ++ z0 = svld4_vnum_s64 (p0, x0, 28), ++ z0 = svld4_vnum (p0, x0, 28)) ++ ++/* ++** ld4_vnum_s64_32: ++** [^{]* ++** ld4d {z0\.d - z3\.d}, p0/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s64_32, svint64x4_t, int64_t, ++ z0 = svld4_vnum_s64 (p0, x0, 32), ++ z0 = svld4_vnum (p0, x0, 32)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_s64_m1: ++** decb x0 ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s64_m1, svint64x4_t, int64_t, ++ z0 = svld4_vnum_s64 (p0, x0, -1), ++ z0 = svld4_vnum (p0, x0, -1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_s64_m2: ++** decb x0, all, mul #2 ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s64_m2, svint64x4_t, int64_t, ++ z0 = svld4_vnum_s64 (p0, x0, -2), ++ z0 = svld4_vnum (p0, x0, -2)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_s64_m3: ++** decb x0, all, mul #3 ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s64_m3, svint64x4_t, int64_t, ++ z0 = svld4_vnum_s64 (p0, x0, -3), ++ z0 = svld4_vnum (p0, x0, -3)) ++ ++/* ++** ld4_vnum_s64_m4: ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0, #-4, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s64_m4, svint64x4_t, int64_t, ++ z0 = svld4_vnum_s64 (p0, x0, -4), ++ z0 = svld4_vnum (p0, x0, -4)) ++ ++/* ++** ld4_vnum_s64_m32: ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0, #-32, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s64_m32, svint64x4_t, int64_t, ++ z0 = svld4_vnum_s64 (p0, x0, -32), ++ z0 = svld4_vnum (p0, x0, -32)) ++ ++/* ++** ld4_vnum_s64_m36: ++** [^{]* ++** ld4d {z0\.d - z3\.d}, p0/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s64_m36, svint64x4_t, int64_t, ++ z0 = svld4_vnum_s64 (p0, x0, -36), ++ z0 = svld4_vnum (p0, x0, -36)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ld4_vnum_s64_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld4d {z0\.d - z3\.d}, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s64_x1, svint64x4_t, int64_t, ++ z0 = svld4_vnum_s64 (p0, x0, x1), ++ z0 = svld4_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_s8.c +new file mode 100644 +index 000000000..d7a17e266 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_s8.c +@@ -0,0 +1,290 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld4_s8_base: ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_s8_base, svint8x4_t, int8_t, ++ z0 = svld4_s8 (p0, x0), ++ z0 = svld4 (p0, x0)) ++ ++/* ++** ld4_s8_index: ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0, x1\] ++** ret ++*/ ++TEST_LOAD (ld4_s8_index, svint8x4_t, int8_t, ++ z0 = svld4_s8 (p0, x0 + x1), ++ z0 = svld4 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_s8_1: ++** incb x0 ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_s8_1, svint8x4_t, int8_t, ++ z0 = svld4_s8 (p0, x0 + svcntb ()), ++ z0 = svld4 (p0, x0 + svcntb ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_s8_2: ++** incb x0, all, mul #2 ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_s8_2, svint8x4_t, int8_t, ++ z0 = svld4_s8 (p0, x0 + svcntb () * 2), ++ z0 = svld4 (p0, x0 + svcntb () * 2)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_s8_3: ++** incb x0, all, mul #3 ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_s8_3, svint8x4_t, int8_t, ++ z0 = svld4_s8 (p0, x0 + svcntb () * 3), ++ z0 = svld4 (p0, x0 + svcntb () * 3)) ++ ++/* ++** ld4_s8_4: ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0, #4, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_s8_4, svint8x4_t, int8_t, ++ z0 = svld4_s8 (p0, x0 + svcntb () * 4), ++ z0 = svld4 (p0, x0 + svcntb () * 4)) ++ ++/* ++** ld4_s8_28: ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0, #28, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_s8_28, svint8x4_t, int8_t, ++ z0 = svld4_s8 (p0, x0 + svcntb () * 28), ++ z0 = svld4 (p0, x0 + svcntb () * 28)) ++ ++/* ++** ld4_s8_32: ++** [^{]* ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0, x[0-9]+\] ++** ret ++*/ ++TEST_LOAD (ld4_s8_32, svint8x4_t, int8_t, ++ z0 = svld4_s8 (p0, x0 + svcntb () * 32), ++ z0 = svld4 (p0, x0 + svcntb () * 32)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_s8_m1: ++** decb x0 ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_s8_m1, svint8x4_t, int8_t, ++ z0 = svld4_s8 (p0, x0 - svcntb ()), ++ z0 = svld4 (p0, x0 - svcntb ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_s8_m2: ++** decb x0, all, mul #2 ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_s8_m2, svint8x4_t, int8_t, ++ z0 = svld4_s8 (p0, x0 - svcntb () * 2), ++ z0 = svld4 (p0, x0 - svcntb () * 2)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld4_s8_m3: ++** decb x0, all, mul #3 ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_s8_m3, svint8x4_t, int8_t, ++ z0 = svld4_s8 (p0, x0 - svcntb () * 3), ++ z0 = svld4 (p0, x0 - svcntb () * 3)) ++ ++/* ++** ld4_s8_m4: ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0, #-4, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_s8_m4, svint8x4_t, int8_t, ++ z0 = svld4_s8 (p0, x0 - svcntb () * 4), ++ z0 = svld4 (p0, x0 - svcntb () * 4)) ++ ++/* ++** ld4_s8_m32: ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0, #-32, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_s8_m32, svint8x4_t, int8_t, ++ z0 = svld4_s8 (p0, x0 - svcntb () * 32), ++ z0 = svld4 (p0, x0 - svcntb () * 32)) ++ ++/* ++** ld4_s8_m36: ++** [^{]* ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0, x[0-9]+\] ++** ret ++*/ ++TEST_LOAD (ld4_s8_m36, svint8x4_t, int8_t, ++ z0 = svld4_s8 (p0, x0 - svcntb () * 36), ++ z0 = svld4 (p0, x0 - svcntb () * 36)) ++ ++/* ++** ld4_vnum_s8_0: ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s8_0, svint8x4_t, int8_t, ++ z0 = svld4_vnum_s8 (p0, x0, 0), ++ z0 = svld4_vnum (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_s8_1: ++** incb x0 ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s8_1, svint8x4_t, int8_t, ++ z0 = svld4_vnum_s8 (p0, x0, 1), ++ z0 = svld4_vnum (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_s8_2: ++** incb x0, all, mul #2 ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s8_2, svint8x4_t, int8_t, ++ z0 = svld4_vnum_s8 (p0, x0, 2), ++ z0 = svld4_vnum (p0, x0, 2)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_s8_3: ++** incb x0, all, mul #3 ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s8_3, svint8x4_t, int8_t, ++ z0 = svld4_vnum_s8 (p0, x0, 3), ++ z0 = svld4_vnum (p0, x0, 3)) ++ ++/* ++** ld4_vnum_s8_4: ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0, #4, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s8_4, svint8x4_t, int8_t, ++ z0 = svld4_vnum_s8 (p0, x0, 4), ++ z0 = svld4_vnum (p0, x0, 4)) ++ ++/* ++** ld4_vnum_s8_28: ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0, #28, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s8_28, svint8x4_t, int8_t, ++ z0 = svld4_vnum_s8 (p0, x0, 28), ++ z0 = svld4_vnum (p0, x0, 28)) ++ ++/* ++** ld4_vnum_s8_32: ++** [^{]* ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0, x[0-9]+\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s8_32, svint8x4_t, int8_t, ++ z0 = svld4_vnum_s8 (p0, x0, 32), ++ z0 = svld4_vnum (p0, x0, 32)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_s8_m1: ++** decb x0 ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s8_m1, svint8x4_t, int8_t, ++ z0 = svld4_vnum_s8 (p0, x0, -1), ++ z0 = svld4_vnum (p0, x0, -1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_s8_m2: ++** decb x0, all, mul #2 ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s8_m2, svint8x4_t, int8_t, ++ z0 = svld4_vnum_s8 (p0, x0, -2), ++ z0 = svld4_vnum (p0, x0, -2)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld4_vnum_s8_m3: ++** decb x0, all, mul #3 ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s8_m3, svint8x4_t, int8_t, ++ z0 = svld4_vnum_s8 (p0, x0, -3), ++ z0 = svld4_vnum (p0, x0, -3)) ++ ++/* ++** ld4_vnum_s8_m4: ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0, #-4, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s8_m4, svint8x4_t, int8_t, ++ z0 = svld4_vnum_s8 (p0, x0, -4), ++ z0 = svld4_vnum (p0, x0, -4)) ++ ++/* ++** ld4_vnum_s8_m32: ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0, #-32, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s8_m32, svint8x4_t, int8_t, ++ z0 = svld4_vnum_s8 (p0, x0, -32), ++ z0 = svld4_vnum (p0, x0, -32)) ++ ++/* ++** ld4_vnum_s8_m36: ++** [^{]* ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0, x[0-9]+\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s8_m36, svint8x4_t, int8_t, ++ z0 = svld4_vnum_s8 (p0, x0, -36), ++ z0 = svld4_vnum (p0, x0, -36)) ++ ++/* ++** ld4_vnum_s8_x1: ++** cntb (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ld4b {z0\.b - z3\.b}, p0/z, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_LOAD (ld4_vnum_s8_x1, svint8x4_t, int8_t, ++ z0 = svld4_vnum_s8 (p0, x0, x1), ++ z0 = svld4_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_u16.c +new file mode 100644 +index 000000000..234593d10 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_u16.c +@@ -0,0 +1,286 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld4_u16_base: ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_u16_base, svuint16x4_t, uint16_t, ++ z0 = svld4_u16 (p0, x0), ++ z0 = svld4 (p0, x0)) ++ ++/* ++** ld4_u16_index: ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_LOAD (ld4_u16_index, svuint16x4_t, uint16_t, ++ z0 = svld4_u16 (p0, x0 + x1), ++ z0 = svld4 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_u16_1: ++** incb x0 ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_u16_1, svuint16x4_t, uint16_t, ++ z0 = svld4_u16 (p0, x0 + svcnth ()), ++ z0 = svld4 (p0, x0 + svcnth ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_u16_2: ++** incb x0, all, mul #2 ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_u16_2, svuint16x4_t, uint16_t, ++ z0 = svld4_u16 (p0, x0 + svcnth () * 2), ++ z0 = svld4 (p0, x0 + svcnth () * 2)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld4_u16_3: ++** incb x0, all, mul #3 ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_u16_3, svuint16x4_t, uint16_t, ++ z0 = svld4_u16 (p0, x0 + svcnth () * 3), ++ z0 = svld4 (p0, x0 + svcnth () * 3)) ++ ++/* ++** ld4_u16_4: ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #4, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_u16_4, svuint16x4_t, uint16_t, ++ z0 = svld4_u16 (p0, x0 + svcnth () * 4), ++ z0 = svld4 (p0, x0 + svcnth () * 4)) ++ ++/* ++** ld4_u16_28: ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #28, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_u16_28, svuint16x4_t, uint16_t, ++ z0 = svld4_u16 (p0, x0 + svcnth () * 28), ++ z0 = svld4 (p0, x0 + svcnth () * 28)) ++ ++/* ++** ld4_u16_32: ++** [^{]* ++** ld4h {z0\.h - z3\.h}, p0/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_LOAD (ld4_u16_32, svuint16x4_t, uint16_t, ++ z0 = svld4_u16 (p0, x0 + svcnth () * 32), ++ z0 = svld4 (p0, x0 + svcnth () * 32)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_u16_m1: ++** decb x0 ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_u16_m1, svuint16x4_t, uint16_t, ++ z0 = svld4_u16 (p0, x0 - svcnth ()), ++ z0 = svld4 (p0, x0 - svcnth ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_u16_m2: ++** decb x0, all, mul #2 ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_u16_m2, svuint16x4_t, uint16_t, ++ z0 = svld4_u16 (p0, x0 - svcnth () * 2), ++ z0 = svld4 (p0, x0 - svcnth () * 2)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_u16_m3: ++** decb x0, all, mul #3 ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_u16_m3, svuint16x4_t, uint16_t, ++ z0 = svld4_u16 (p0, x0 - svcnth () * 3), ++ z0 = svld4 (p0, x0 - svcnth () * 3)) ++ ++/* ++** ld4_u16_m4: ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #-4, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_u16_m4, svuint16x4_t, uint16_t, ++ z0 = svld4_u16 (p0, x0 - svcnth () * 4), ++ z0 = svld4 (p0, x0 - svcnth () * 4)) ++ ++/* ++** ld4_u16_m32: ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #-32, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_u16_m32, svuint16x4_t, uint16_t, ++ z0 = svld4_u16 (p0, x0 - svcnth () * 32), ++ z0 = svld4 (p0, x0 - svcnth () * 32)) ++ ++/* ++** ld4_u16_m36: ++** [^{]* ++** ld4h {z0\.h - z3\.h}, p0/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_LOAD (ld4_u16_m36, svuint16x4_t, uint16_t, ++ z0 = svld4_u16 (p0, x0 - svcnth () * 36), ++ z0 = svld4 (p0, x0 - svcnth () * 36)) ++ ++/* ++** ld4_vnum_u16_0: ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u16_0, svuint16x4_t, uint16_t, ++ z0 = svld4_vnum_u16 (p0, x0, 0), ++ z0 = svld4_vnum (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_u16_1: ++** incb x0 ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u16_1, svuint16x4_t, uint16_t, ++ z0 = svld4_vnum_u16 (p0, x0, 1), ++ z0 = svld4_vnum (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_u16_2: ++** incb x0, all, mul #2 ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u16_2, svuint16x4_t, uint16_t, ++ z0 = svld4_vnum_u16 (p0, x0, 2), ++ z0 = svld4_vnum (p0, x0, 2)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld4_vnum_u16_3: ++** incb x0, all, mul #3 ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u16_3, svuint16x4_t, uint16_t, ++ z0 = svld4_vnum_u16 (p0, x0, 3), ++ z0 = svld4_vnum (p0, x0, 3)) ++ ++/* ++** ld4_vnum_u16_4: ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #4, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u16_4, svuint16x4_t, uint16_t, ++ z0 = svld4_vnum_u16 (p0, x0, 4), ++ z0 = svld4_vnum (p0, x0, 4)) ++ ++/* ++** ld4_vnum_u16_28: ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #28, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u16_28, svuint16x4_t, uint16_t, ++ z0 = svld4_vnum_u16 (p0, x0, 28), ++ z0 = svld4_vnum (p0, x0, 28)) ++ ++/* ++** ld4_vnum_u16_32: ++** [^{]* ++** ld4h {z0\.h - z3\.h}, p0/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u16_32, svuint16x4_t, uint16_t, ++ z0 = svld4_vnum_u16 (p0, x0, 32), ++ z0 = svld4_vnum (p0, x0, 32)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_u16_m1: ++** decb x0 ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u16_m1, svuint16x4_t, uint16_t, ++ z0 = svld4_vnum_u16 (p0, x0, -1), ++ z0 = svld4_vnum (p0, x0, -1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_u16_m2: ++** decb x0, all, mul #2 ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u16_m2, svuint16x4_t, uint16_t, ++ z0 = svld4_vnum_u16 (p0, x0, -2), ++ z0 = svld4_vnum (p0, x0, -2)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_u16_m3: ++** decb x0, all, mul #3 ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u16_m3, svuint16x4_t, uint16_t, ++ z0 = svld4_vnum_u16 (p0, x0, -3), ++ z0 = svld4_vnum (p0, x0, -3)) ++ ++/* ++** ld4_vnum_u16_m4: ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #-4, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u16_m4, svuint16x4_t, uint16_t, ++ z0 = svld4_vnum_u16 (p0, x0, -4), ++ z0 = svld4_vnum (p0, x0, -4)) ++ ++/* ++** ld4_vnum_u16_m32: ++** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #-32, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u16_m32, svuint16x4_t, uint16_t, ++ z0 = svld4_vnum_u16 (p0, x0, -32), ++ z0 = svld4_vnum (p0, x0, -32)) ++ ++/* ++** ld4_vnum_u16_m36: ++** [^{]* ++** ld4h {z0\.h - z3\.h}, p0/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u16_m36, svuint16x4_t, uint16_t, ++ z0 = svld4_vnum_u16 (p0, x0, -36), ++ z0 = svld4_vnum (p0, x0, -36)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ld4_vnum_u16_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld4h {z0\.h - z3\.h}, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u16_x1, svuint16x4_t, uint16_t, ++ z0 = svld4_vnum_u16 (p0, x0, x1), ++ z0 = svld4_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_u32.c +new file mode 100644 +index 000000000..ad2627800 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_u32.c +@@ -0,0 +1,286 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld4_u32_base: ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_u32_base, svuint32x4_t, uint32_t, ++ z0 = svld4_u32 (p0, x0), ++ z0 = svld4 (p0, x0)) ++ ++/* ++** ld4_u32_index: ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_LOAD (ld4_u32_index, svuint32x4_t, uint32_t, ++ z0 = svld4_u32 (p0, x0 + x1), ++ z0 = svld4 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_u32_1: ++** incb x0 ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_u32_1, svuint32x4_t, uint32_t, ++ z0 = svld4_u32 (p0, x0 + svcntw ()), ++ z0 = svld4 (p0, x0 + svcntw ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_u32_2: ++** incb x0, all, mul #2 ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_u32_2, svuint32x4_t, uint32_t, ++ z0 = svld4_u32 (p0, x0 + svcntw () * 2), ++ z0 = svld4 (p0, x0 + svcntw () * 2)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_u32_3: ++** incb x0, all, mul #3 ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_u32_3, svuint32x4_t, uint32_t, ++ z0 = svld4_u32 (p0, x0 + svcntw () * 3), ++ z0 = svld4 (p0, x0 + svcntw () * 3)) ++ ++/* ++** ld4_u32_4: ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0, #4, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_u32_4, svuint32x4_t, uint32_t, ++ z0 = svld4_u32 (p0, x0 + svcntw () * 4), ++ z0 = svld4 (p0, x0 + svcntw () * 4)) ++ ++/* ++** ld4_u32_28: ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0, #28, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_u32_28, svuint32x4_t, uint32_t, ++ z0 = svld4_u32 (p0, x0 + svcntw () * 28), ++ z0 = svld4 (p0, x0 + svcntw () * 28)) ++ ++/* ++** ld4_u32_32: ++** [^{]* ++** ld4w {z0\.s - z3\.s}, p0/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_LOAD (ld4_u32_32, svuint32x4_t, uint32_t, ++ z0 = svld4_u32 (p0, x0 + svcntw () * 32), ++ z0 = svld4 (p0, x0 + svcntw () * 32)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_u32_m1: ++** decb x0 ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_u32_m1, svuint32x4_t, uint32_t, ++ z0 = svld4_u32 (p0, x0 - svcntw ()), ++ z0 = svld4 (p0, x0 - svcntw ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_u32_m2: ++** decb x0, all, mul #2 ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_u32_m2, svuint32x4_t, uint32_t, ++ z0 = svld4_u32 (p0, x0 - svcntw () * 2), ++ z0 = svld4 (p0, x0 - svcntw () * 2)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld4_u32_m3: ++** decb x0, all, mul #3 ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_u32_m3, svuint32x4_t, uint32_t, ++ z0 = svld4_u32 (p0, x0 - svcntw () * 3), ++ z0 = svld4 (p0, x0 - svcntw () * 3)) ++ ++/* ++** ld4_u32_m4: ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0, #-4, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_u32_m4, svuint32x4_t, uint32_t, ++ z0 = svld4_u32 (p0, x0 - svcntw () * 4), ++ z0 = svld4 (p0, x0 - svcntw () * 4)) ++ ++/* ++** ld4_u32_m32: ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0, #-32, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_u32_m32, svuint32x4_t, uint32_t, ++ z0 = svld4_u32 (p0, x0 - svcntw () * 32), ++ z0 = svld4 (p0, x0 - svcntw () * 32)) ++ ++/* ++** ld4_u32_m36: ++** [^{]* ++** ld4w {z0\.s - z3\.s}, p0/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_LOAD (ld4_u32_m36, svuint32x4_t, uint32_t, ++ z0 = svld4_u32 (p0, x0 - svcntw () * 36), ++ z0 = svld4 (p0, x0 - svcntw () * 36)) ++ ++/* ++** ld4_vnum_u32_0: ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u32_0, svuint32x4_t, uint32_t, ++ z0 = svld4_vnum_u32 (p0, x0, 0), ++ z0 = svld4_vnum (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_u32_1: ++** incb x0 ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u32_1, svuint32x4_t, uint32_t, ++ z0 = svld4_vnum_u32 (p0, x0, 1), ++ z0 = svld4_vnum (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_u32_2: ++** incb x0, all, mul #2 ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u32_2, svuint32x4_t, uint32_t, ++ z0 = svld4_vnum_u32 (p0, x0, 2), ++ z0 = svld4_vnum (p0, x0, 2)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_u32_3: ++** incb x0, all, mul #3 ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u32_3, svuint32x4_t, uint32_t, ++ z0 = svld4_vnum_u32 (p0, x0, 3), ++ z0 = svld4_vnum (p0, x0, 3)) ++ ++/* ++** ld4_vnum_u32_4: ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0, #4, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u32_4, svuint32x4_t, uint32_t, ++ z0 = svld4_vnum_u32 (p0, x0, 4), ++ z0 = svld4_vnum (p0, x0, 4)) ++ ++/* ++** ld4_vnum_u32_28: ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0, #28, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u32_28, svuint32x4_t, uint32_t, ++ z0 = svld4_vnum_u32 (p0, x0, 28), ++ z0 = svld4_vnum (p0, x0, 28)) ++ ++/* ++** ld4_vnum_u32_32: ++** [^{]* ++** ld4w {z0\.s - z3\.s}, p0/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u32_32, svuint32x4_t, uint32_t, ++ z0 = svld4_vnum_u32 (p0, x0, 32), ++ z0 = svld4_vnum (p0, x0, 32)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_u32_m1: ++** decb x0 ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u32_m1, svuint32x4_t, uint32_t, ++ z0 = svld4_vnum_u32 (p0, x0, -1), ++ z0 = svld4_vnum (p0, x0, -1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_u32_m2: ++** decb x0, all, mul #2 ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u32_m2, svuint32x4_t, uint32_t, ++ z0 = svld4_vnum_u32 (p0, x0, -2), ++ z0 = svld4_vnum (p0, x0, -2)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld4_vnum_u32_m3: ++** decb x0, all, mul #3 ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u32_m3, svuint32x4_t, uint32_t, ++ z0 = svld4_vnum_u32 (p0, x0, -3), ++ z0 = svld4_vnum (p0, x0, -3)) ++ ++/* ++** ld4_vnum_u32_m4: ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0, #-4, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u32_m4, svuint32x4_t, uint32_t, ++ z0 = svld4_vnum_u32 (p0, x0, -4), ++ z0 = svld4_vnum (p0, x0, -4)) ++ ++/* ++** ld4_vnum_u32_m32: ++** ld4w {z0\.s - z3\.s}, p0/z, \[x0, #-32, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u32_m32, svuint32x4_t, uint32_t, ++ z0 = svld4_vnum_u32 (p0, x0, -32), ++ z0 = svld4_vnum (p0, x0, -32)) ++ ++/* ++** ld4_vnum_u32_m36: ++** [^{]* ++** ld4w {z0\.s - z3\.s}, p0/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u32_m36, svuint32x4_t, uint32_t, ++ z0 = svld4_vnum_u32 (p0, x0, -36), ++ z0 = svld4_vnum (p0, x0, -36)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ld4_vnum_u32_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld4w {z0\.s - z3\.s}, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u32_x1, svuint32x4_t, uint32_t, ++ z0 = svld4_vnum_u32 (p0, x0, x1), ++ z0 = svld4_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_u64.c +new file mode 100644 +index 000000000..8772ba42d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_u64.c +@@ -0,0 +1,286 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld4_u64_base: ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_u64_base, svuint64x4_t, uint64_t, ++ z0 = svld4_u64 (p0, x0), ++ z0 = svld4 (p0, x0)) ++ ++/* ++** ld4_u64_index: ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0, x1, lsl 3\] ++** ret ++*/ ++TEST_LOAD (ld4_u64_index, svuint64x4_t, uint64_t, ++ z0 = svld4_u64 (p0, x0 + x1), ++ z0 = svld4 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_u64_1: ++** incb x0 ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_u64_1, svuint64x4_t, uint64_t, ++ z0 = svld4_u64 (p0, x0 + svcntd ()), ++ z0 = svld4 (p0, x0 + svcntd ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_u64_2: ++** incb x0, all, mul #2 ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_u64_2, svuint64x4_t, uint64_t, ++ z0 = svld4_u64 (p0, x0 + svcntd () * 2), ++ z0 = svld4 (p0, x0 + svcntd () * 2)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld4_u64_3: ++** incb x0, all, mul #3 ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_u64_3, svuint64x4_t, uint64_t, ++ z0 = svld4_u64 (p0, x0 + svcntd () * 3), ++ z0 = svld4 (p0, x0 + svcntd () * 3)) ++ ++/* ++** ld4_u64_4: ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0, #4, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_u64_4, svuint64x4_t, uint64_t, ++ z0 = svld4_u64 (p0, x0 + svcntd () * 4), ++ z0 = svld4 (p0, x0 + svcntd () * 4)) ++ ++/* ++** ld4_u64_28: ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0, #28, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_u64_28, svuint64x4_t, uint64_t, ++ z0 = svld4_u64 (p0, x0 + svcntd () * 28), ++ z0 = svld4 (p0, x0 + svcntd () * 28)) ++ ++/* ++** ld4_u64_32: ++** [^{]* ++** ld4d {z0\.d - z3\.d}, p0/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_LOAD (ld4_u64_32, svuint64x4_t, uint64_t, ++ z0 = svld4_u64 (p0, x0 + svcntd () * 32), ++ z0 = svld4 (p0, x0 + svcntd () * 32)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_u64_m1: ++** decb x0 ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_u64_m1, svuint64x4_t, uint64_t, ++ z0 = svld4_u64 (p0, x0 - svcntd ()), ++ z0 = svld4 (p0, x0 - svcntd ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_u64_m2: ++** decb x0, all, mul #2 ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_u64_m2, svuint64x4_t, uint64_t, ++ z0 = svld4_u64 (p0, x0 - svcntd () * 2), ++ z0 = svld4 (p0, x0 - svcntd () * 2)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_u64_m3: ++** decb x0, all, mul #3 ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_u64_m3, svuint64x4_t, uint64_t, ++ z0 = svld4_u64 (p0, x0 - svcntd () * 3), ++ z0 = svld4 (p0, x0 - svcntd () * 3)) ++ ++/* ++** ld4_u64_m4: ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0, #-4, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_u64_m4, svuint64x4_t, uint64_t, ++ z0 = svld4_u64 (p0, x0 - svcntd () * 4), ++ z0 = svld4 (p0, x0 - svcntd () * 4)) ++ ++/* ++** ld4_u64_m32: ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0, #-32, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_u64_m32, svuint64x4_t, uint64_t, ++ z0 = svld4_u64 (p0, x0 - svcntd () * 32), ++ z0 = svld4 (p0, x0 - svcntd () * 32)) ++ ++/* ++** ld4_u64_m36: ++** [^{]* ++** ld4d {z0\.d - z3\.d}, p0/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_LOAD (ld4_u64_m36, svuint64x4_t, uint64_t, ++ z0 = svld4_u64 (p0, x0 - svcntd () * 36), ++ z0 = svld4 (p0, x0 - svcntd () * 36)) ++ ++/* ++** ld4_vnum_u64_0: ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u64_0, svuint64x4_t, uint64_t, ++ z0 = svld4_vnum_u64 (p0, x0, 0), ++ z0 = svld4_vnum (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_u64_1: ++** incb x0 ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u64_1, svuint64x4_t, uint64_t, ++ z0 = svld4_vnum_u64 (p0, x0, 1), ++ z0 = svld4_vnum (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_u64_2: ++** incb x0, all, mul #2 ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u64_2, svuint64x4_t, uint64_t, ++ z0 = svld4_vnum_u64 (p0, x0, 2), ++ z0 = svld4_vnum (p0, x0, 2)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld4_vnum_u64_3: ++** incb x0, all, mul #3 ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u64_3, svuint64x4_t, uint64_t, ++ z0 = svld4_vnum_u64 (p0, x0, 3), ++ z0 = svld4_vnum (p0, x0, 3)) ++ ++/* ++** ld4_vnum_u64_4: ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0, #4, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u64_4, svuint64x4_t, uint64_t, ++ z0 = svld4_vnum_u64 (p0, x0, 4), ++ z0 = svld4_vnum (p0, x0, 4)) ++ ++/* ++** ld4_vnum_u64_28: ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0, #28, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u64_28, svuint64x4_t, uint64_t, ++ z0 = svld4_vnum_u64 (p0, x0, 28), ++ z0 = svld4_vnum (p0, x0, 28)) ++ ++/* ++** ld4_vnum_u64_32: ++** [^{]* ++** ld4d {z0\.d - z3\.d}, p0/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u64_32, svuint64x4_t, uint64_t, ++ z0 = svld4_vnum_u64 (p0, x0, 32), ++ z0 = svld4_vnum (p0, x0, 32)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_u64_m1: ++** decb x0 ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u64_m1, svuint64x4_t, uint64_t, ++ z0 = svld4_vnum_u64 (p0, x0, -1), ++ z0 = svld4_vnum (p0, x0, -1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_u64_m2: ++** decb x0, all, mul #2 ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u64_m2, svuint64x4_t, uint64_t, ++ z0 = svld4_vnum_u64 (p0, x0, -2), ++ z0 = svld4_vnum (p0, x0, -2)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_u64_m3: ++** decb x0, all, mul #3 ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u64_m3, svuint64x4_t, uint64_t, ++ z0 = svld4_vnum_u64 (p0, x0, -3), ++ z0 = svld4_vnum (p0, x0, -3)) ++ ++/* ++** ld4_vnum_u64_m4: ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0, #-4, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u64_m4, svuint64x4_t, uint64_t, ++ z0 = svld4_vnum_u64 (p0, x0, -4), ++ z0 = svld4_vnum (p0, x0, -4)) ++ ++/* ++** ld4_vnum_u64_m32: ++** ld4d {z0\.d - z3\.d}, p0/z, \[x0, #-32, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u64_m32, svuint64x4_t, uint64_t, ++ z0 = svld4_vnum_u64 (p0, x0, -32), ++ z0 = svld4_vnum (p0, x0, -32)) ++ ++/* ++** ld4_vnum_u64_m36: ++** [^{]* ++** ld4d {z0\.d - z3\.d}, p0/z, \[x[0-9]+\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u64_m36, svuint64x4_t, uint64_t, ++ z0 = svld4_vnum_u64 (p0, x0, -36), ++ z0 = svld4_vnum (p0, x0, -36)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ld4_vnum_u64_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ld4d {z0\.d - z3\.d}, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u64_x1, svuint64x4_t, uint64_t, ++ z0 = svld4_vnum_u64 (p0, x0, x1), ++ z0 = svld4_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_u8.c +new file mode 100644 +index 000000000..85b2987ce +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_u8.c +@@ -0,0 +1,290 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ld4_u8_base: ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_u8_base, svuint8x4_t, uint8_t, ++ z0 = svld4_u8 (p0, x0), ++ z0 = svld4 (p0, x0)) ++ ++/* ++** ld4_u8_index: ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0, x1\] ++** ret ++*/ ++TEST_LOAD (ld4_u8_index, svuint8x4_t, uint8_t, ++ z0 = svld4_u8 (p0, x0 + x1), ++ z0 = svld4 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_u8_1: ++** incb x0 ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_u8_1, svuint8x4_t, uint8_t, ++ z0 = svld4_u8 (p0, x0 + svcntb ()), ++ z0 = svld4 (p0, x0 + svcntb ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_u8_2: ++** incb x0, all, mul #2 ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_u8_2, svuint8x4_t, uint8_t, ++ z0 = svld4_u8 (p0, x0 + svcntb () * 2), ++ z0 = svld4 (p0, x0 + svcntb () * 2)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_u8_3: ++** incb x0, all, mul #3 ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_u8_3, svuint8x4_t, uint8_t, ++ z0 = svld4_u8 (p0, x0 + svcntb () * 3), ++ z0 = svld4 (p0, x0 + svcntb () * 3)) ++ ++/* ++** ld4_u8_4: ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0, #4, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_u8_4, svuint8x4_t, uint8_t, ++ z0 = svld4_u8 (p0, x0 + svcntb () * 4), ++ z0 = svld4 (p0, x0 + svcntb () * 4)) ++ ++/* ++** ld4_u8_28: ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0, #28, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_u8_28, svuint8x4_t, uint8_t, ++ z0 = svld4_u8 (p0, x0 + svcntb () * 28), ++ z0 = svld4 (p0, x0 + svcntb () * 28)) ++ ++/* ++** ld4_u8_32: ++** [^{]* ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0, x[0-9]+\] ++** ret ++*/ ++TEST_LOAD (ld4_u8_32, svuint8x4_t, uint8_t, ++ z0 = svld4_u8 (p0, x0 + svcntb () * 32), ++ z0 = svld4 (p0, x0 + svcntb () * 32)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_u8_m1: ++** decb x0 ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_u8_m1, svuint8x4_t, uint8_t, ++ z0 = svld4_u8 (p0, x0 - svcntb ()), ++ z0 = svld4 (p0, x0 - svcntb ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_u8_m2: ++** decb x0, all, mul #2 ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_u8_m2, svuint8x4_t, uint8_t, ++ z0 = svld4_u8 (p0, x0 - svcntb () * 2), ++ z0 = svld4 (p0, x0 - svcntb () * 2)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld4_u8_m3: ++** decb x0, all, mul #3 ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_u8_m3, svuint8x4_t, uint8_t, ++ z0 = svld4_u8 (p0, x0 - svcntb () * 3), ++ z0 = svld4 (p0, x0 - svcntb () * 3)) ++ ++/* ++** ld4_u8_m4: ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0, #-4, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_u8_m4, svuint8x4_t, uint8_t, ++ z0 = svld4_u8 (p0, x0 - svcntb () * 4), ++ z0 = svld4 (p0, x0 - svcntb () * 4)) ++ ++/* ++** ld4_u8_m32: ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0, #-32, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_u8_m32, svuint8x4_t, uint8_t, ++ z0 = svld4_u8 (p0, x0 - svcntb () * 32), ++ z0 = svld4 (p0, x0 - svcntb () * 32)) ++ ++/* ++** ld4_u8_m36: ++** [^{]* ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0, x[0-9]+\] ++** ret ++*/ ++TEST_LOAD (ld4_u8_m36, svuint8x4_t, uint8_t, ++ z0 = svld4_u8 (p0, x0 - svcntb () * 36), ++ z0 = svld4 (p0, x0 - svcntb () * 36)) ++ ++/* ++** ld4_vnum_u8_0: ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u8_0, svuint8x4_t, uint8_t, ++ z0 = svld4_vnum_u8 (p0, x0, 0), ++ z0 = svld4_vnum (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_u8_1: ++** incb x0 ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u8_1, svuint8x4_t, uint8_t, ++ z0 = svld4_vnum_u8 (p0, x0, 1), ++ z0 = svld4_vnum (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_u8_2: ++** incb x0, all, mul #2 ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u8_2, svuint8x4_t, uint8_t, ++ z0 = svld4_vnum_u8 (p0, x0, 2), ++ z0 = svld4_vnum (p0, x0, 2)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_u8_3: ++** incb x0, all, mul #3 ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u8_3, svuint8x4_t, uint8_t, ++ z0 = svld4_vnum_u8 (p0, x0, 3), ++ z0 = svld4_vnum (p0, x0, 3)) ++ ++/* ++** ld4_vnum_u8_4: ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0, #4, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u8_4, svuint8x4_t, uint8_t, ++ z0 = svld4_vnum_u8 (p0, x0, 4), ++ z0 = svld4_vnum (p0, x0, 4)) ++ ++/* ++** ld4_vnum_u8_28: ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0, #28, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u8_28, svuint8x4_t, uint8_t, ++ z0 = svld4_vnum_u8 (p0, x0, 28), ++ z0 = svld4_vnum (p0, x0, 28)) ++ ++/* ++** ld4_vnum_u8_32: ++** [^{]* ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0, x[0-9]+\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u8_32, svuint8x4_t, uint8_t, ++ z0 = svld4_vnum_u8 (p0, x0, 32), ++ z0 = svld4_vnum (p0, x0, 32)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_u8_m1: ++** decb x0 ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u8_m1, svuint8x4_t, uint8_t, ++ z0 = svld4_vnum_u8 (p0, x0, -1), ++ z0 = svld4_vnum (p0, x0, -1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ld4_vnum_u8_m2: ++** decb x0, all, mul #2 ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u8_m2, svuint8x4_t, uint8_t, ++ z0 = svld4_vnum_u8 (p0, x0, -2), ++ z0 = svld4_vnum (p0, x0, -2)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ld4_vnum_u8_m3: ++** decb x0, all, mul #3 ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u8_m3, svuint8x4_t, uint8_t, ++ z0 = svld4_vnum_u8 (p0, x0, -3), ++ z0 = svld4_vnum (p0, x0, -3)) ++ ++/* ++** ld4_vnum_u8_m4: ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0, #-4, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u8_m4, svuint8x4_t, uint8_t, ++ z0 = svld4_vnum_u8 (p0, x0, -4), ++ z0 = svld4_vnum (p0, x0, -4)) ++ ++/* ++** ld4_vnum_u8_m32: ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0, #-32, mul vl\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u8_m32, svuint8x4_t, uint8_t, ++ z0 = svld4_vnum_u8 (p0, x0, -32), ++ z0 = svld4_vnum (p0, x0, -32)) ++ ++/* ++** ld4_vnum_u8_m36: ++** [^{]* ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0, x[0-9]+\] ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u8_m36, svuint8x4_t, uint8_t, ++ z0 = svld4_vnum_u8 (p0, x0, -36), ++ z0 = svld4_vnum (p0, x0, -36)) ++ ++/* ++** ld4_vnum_u8_x1: ++** cntb (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ld4b {z0\.b - z3\.b}, p0/z, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** ld4b {z0\.b - z3\.b}, p0/z, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_LOAD (ld4_vnum_u8_x1, svuint8x4_t, uint8_t, ++ z0 = svld4_vnum_u8 (p0, x0, x1), ++ z0 = svld4_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_bf16.c +new file mode 100644 +index 000000000..80f646870 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_bf16.c +@@ -0,0 +1,86 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1_bf16_base: ++** ldff1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_bf16_base, svbfloat16_t, bfloat16_t, ++ z0 = svldff1_bf16 (p0, x0), ++ z0 = svldff1 (p0, x0)) ++ ++/* ++** ldff1_bf16_index: ++** ldff1h z0\.h, p0/z, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_LOAD (ldff1_bf16_index, svbfloat16_t, bfloat16_t, ++ z0 = svldff1_bf16 (p0, x0 + x1), ++ z0 = svldff1 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1_bf16_1: ++** incb x0 ++** ldff1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_bf16_1, svbfloat16_t, bfloat16_t, ++ z0 = svldff1_bf16 (p0, x0 + svcnth ()), ++ z0 = svldff1 (p0, x0 + svcnth ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1_bf16_m1: ++** decb x0 ++** ldff1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_bf16_m1, svbfloat16_t, bfloat16_t, ++ z0 = svldff1_bf16 (p0, x0 - svcnth ()), ++ z0 = svldff1 (p0, x0 - svcnth ())) ++ ++/* ++** ldff1_vnum_bf16_0: ++** ldff1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_vnum_bf16_0, svbfloat16_t, bfloat16_t, ++ z0 = svldff1_vnum_bf16 (p0, x0, 0), ++ z0 = svldff1_vnum (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1_vnum_bf16_1: ++** incb x0 ++** ldff1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_vnum_bf16_1, svbfloat16_t, bfloat16_t, ++ z0 = svldff1_vnum_bf16 (p0, x0, 1), ++ z0 = svldff1_vnum (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1_vnum_bf16_m1: ++** decb x0 ++** ldff1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_vnum_bf16_m1, svbfloat16_t, bfloat16_t, ++ z0 = svldff1_vnum_bf16 (p0, x0, -1), ++ z0 = svldff1_vnum (p0, x0, -1)) ++ ++/* Using MUL to calculate an index would also be OK. 
*/ ++/* ++** ldff1_vnum_bf16_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ldff1h z0\.h, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldff1_vnum_bf16_x1, svbfloat16_t, bfloat16_t, ++ z0 = svldff1_vnum_bf16 (p0, x0, x1), ++ z0 = svldff1_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_f16.c +new file mode 100644 +index 000000000..13ce863c9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_f16.c +@@ -0,0 +1,86 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1_f16_base: ++** ldff1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_f16_base, svfloat16_t, float16_t, ++ z0 = svldff1_f16 (p0, x0), ++ z0 = svldff1 (p0, x0)) ++ ++/* ++** ldff1_f16_index: ++** ldff1h z0\.h, p0/z, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_LOAD (ldff1_f16_index, svfloat16_t, float16_t, ++ z0 = svldff1_f16 (p0, x0 + x1), ++ z0 = svldff1 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1_f16_1: ++** incb x0 ++** ldff1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_f16_1, svfloat16_t, float16_t, ++ z0 = svldff1_f16 (p0, x0 + svcnth ()), ++ z0 = svldff1 (p0, x0 + svcnth ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1_f16_m1: ++** decb x0 ++** ldff1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_f16_m1, svfloat16_t, float16_t, ++ z0 = svldff1_f16 (p0, x0 - svcnth ()), ++ z0 = svldff1 (p0, x0 - svcnth ())) ++ ++/* ++** ldff1_vnum_f16_0: ++** ldff1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_vnum_f16_0, svfloat16_t, float16_t, ++ z0 = svldff1_vnum_f16 (p0, x0, 0), ++ z0 = svldff1_vnum (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1_vnum_f16_1: ++** incb x0 ++** ldff1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_vnum_f16_1, svfloat16_t, float16_t, ++ z0 = svldff1_vnum_f16 (p0, x0, 1), ++ z0 = svldff1_vnum (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1_vnum_f16_m1: ++** decb x0 ++** ldff1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_vnum_f16_m1, svfloat16_t, float16_t, ++ z0 = svldff1_vnum_f16 (p0, x0, -1), ++ z0 = svldff1_vnum (p0, x0, -1)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ldff1_vnum_f16_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ldff1h z0\.h, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldff1_vnum_f16_x1, svfloat16_t, float16_t, ++ z0 = svldff1_vnum_f16 (p0, x0, x1), ++ z0 = svldff1_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_f32.c +new file mode 100644 +index 000000000..2fcc63390 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_f32.c +@@ -0,0 +1,86 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1_f32_base: ++** ldff1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_f32_base, svfloat32_t, float32_t, ++ z0 = svldff1_f32 (p0, x0), ++ z0 = svldff1 (p0, x0)) ++ ++/* ++** ldff1_f32_index: ++** ldff1w z0\.s, p0/z, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_LOAD (ldff1_f32_index, svfloat32_t, float32_t, ++ z0 = svldff1_f32 (p0, x0 + x1), ++ z0 = svldff1 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1_f32_1: ++** incb x0 ++** ldff1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_f32_1, svfloat32_t, float32_t, ++ z0 = svldff1_f32 (p0, x0 + svcntw ()), ++ z0 = svldff1 (p0, x0 + svcntw ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1_f32_m1: ++** decb x0 ++** ldff1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_f32_m1, svfloat32_t, float32_t, ++ z0 = svldff1_f32 (p0, x0 - svcntw ()), ++ z0 = svldff1 (p0, x0 - svcntw ())) ++ ++/* ++** ldff1_vnum_f32_0: ++** ldff1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_vnum_f32_0, svfloat32_t, float32_t, ++ z0 = svldff1_vnum_f32 (p0, x0, 0), ++ z0 = svldff1_vnum (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1_vnum_f32_1: ++** incb x0 ++** ldff1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_vnum_f32_1, svfloat32_t, float32_t, ++ z0 = svldff1_vnum_f32 (p0, x0, 1), ++ z0 = svldff1_vnum (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1_vnum_f32_m1: ++** decb x0 ++** ldff1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_vnum_f32_m1, svfloat32_t, float32_t, ++ z0 = svldff1_vnum_f32 (p0, x0, -1), ++ z0 = svldff1_vnum (p0, x0, -1)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ldff1_vnum_f32_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ldff1w z0\.s, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldff1_vnum_f32_x1, svfloat32_t, float32_t, ++ z0 = svldff1_vnum_f32 (p0, x0, x1), ++ z0 = svldff1_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_f64.c +new file mode 100644 +index 000000000..cc15b927a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_f64.c +@@ -0,0 +1,86 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1_f64_base: ++** ldff1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_f64_base, svfloat64_t, float64_t, ++ z0 = svldff1_f64 (p0, x0), ++ z0 = svldff1 (p0, x0)) ++ ++/* ++** ldff1_f64_index: ++** ldff1d z0\.d, p0/z, \[x0, x1, lsl 3\] ++** ret ++*/ ++TEST_LOAD (ldff1_f64_index, svfloat64_t, float64_t, ++ z0 = svldff1_f64 (p0, x0 + x1), ++ z0 = svldff1 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1_f64_1: ++** incb x0 ++** ldff1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_f64_1, svfloat64_t, float64_t, ++ z0 = svldff1_f64 (p0, x0 + svcntd ()), ++ z0 = svldff1 (p0, x0 + svcntd ())) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ldff1_f64_m1: ++** decb x0 ++** ldff1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_f64_m1, svfloat64_t, float64_t, ++ z0 = svldff1_f64 (p0, x0 - svcntd ()), ++ z0 = svldff1 (p0, x0 - svcntd ())) ++ ++/* ++** ldff1_vnum_f64_0: ++** ldff1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_vnum_f64_0, svfloat64_t, float64_t, ++ z0 = svldff1_vnum_f64 (p0, x0, 0), ++ z0 = svldff1_vnum (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1_vnum_f64_1: ++** incb x0 ++** ldff1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_vnum_f64_1, svfloat64_t, float64_t, ++ z0 = svldff1_vnum_f64 (p0, x0, 1), ++ z0 = svldff1_vnum (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1_vnum_f64_m1: ++** decb x0 ++** ldff1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_vnum_f64_m1, svfloat64_t, float64_t, ++ z0 = svldff1_vnum_f64 (p0, x0, -1), ++ z0 = svldff1_vnum (p0, x0, -1)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ldff1_vnum_f64_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ldff1d z0\.d, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldff1_vnum_f64_x1, svfloat64_t, float64_t, ++ z0 = svldff1_vnum_f64 (p0, x0, x1), ++ z0 = svldff1_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_gather_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_gather_f32.c +new file mode 100644 +index 000000000..7e330c042 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_gather_f32.c +@@ -0,0 +1,272 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1_gather_f32_tied1: ++** ldff1w z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_f32_tied1, svfloat32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_f32 (p0, z0), ++ z0_res = svldff1_gather_f32 (p0, z0)) ++ ++/* ++** ldff1_gather_f32_untied: ++** ldff1w z0\.s, p0/z, \[z1\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_f32_untied, svfloat32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_f32 (p0, z1), ++ z0_res = svldff1_gather_f32 (p0, z1)) ++ ++/* ++** ldff1_gather_x0_f32_offset: ++** ldff1w z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_x0_f32_offset, svfloat32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_offset_f32 (p0, z0, x0), ++ z0_res = svldff1_gather_offset_f32 (p0, z0, x0)) ++ ++/* ++** ldff1_gather_m4_f32_offset: ++** mov (x[0-9]+), #?-4 ++** ldff1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_m4_f32_offset, svfloat32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_offset_f32 (p0, z0, -4), ++ z0_res = svldff1_gather_offset_f32 (p0, z0, -4)) ++ ++/* ++** ldff1_gather_0_f32_offset: ++** ldff1w z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_0_f32_offset, svfloat32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_offset_f32 (p0, z0, 0), ++ z0_res = svldff1_gather_offset_f32 (p0, z0, 0)) ++ ++/* ++** ldff1_gather_5_f32_offset: ++** mov (x[0-9]+), #?5 ++** ldff1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_5_f32_offset, svfloat32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_offset_f32 (p0, z0, 5), ++ z0_res = svldff1_gather_offset_f32 (p0, z0, 5)) ++ ++/* ++** ldff1_gather_6_f32_offset: ++** mov (x[0-9]+), #?6 ++** ldff1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ 
++TEST_LOAD_GATHER_ZS (ldff1_gather_6_f32_offset, svfloat32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_offset_f32 (p0, z0, 6), ++ z0_res = svldff1_gather_offset_f32 (p0, z0, 6)) ++ ++/* ++** ldff1_gather_7_f32_offset: ++** mov (x[0-9]+), #?7 ++** ldff1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_7_f32_offset, svfloat32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_offset_f32 (p0, z0, 7), ++ z0_res = svldff1_gather_offset_f32 (p0, z0, 7)) ++ ++/* ++** ldff1_gather_8_f32_offset: ++** ldff1w z0\.s, p0/z, \[z0\.s, #8\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_8_f32_offset, svfloat32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_offset_f32 (p0, z0, 8), ++ z0_res = svldff1_gather_offset_f32 (p0, z0, 8)) ++ ++/* ++** ldff1_gather_124_f32_offset: ++** ldff1w z0\.s, p0/z, \[z0\.s, #124\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_124_f32_offset, svfloat32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_offset_f32 (p0, z0, 124), ++ z0_res = svldff1_gather_offset_f32 (p0, z0, 124)) ++ ++/* ++** ldff1_gather_128_f32_offset: ++** mov (x[0-9]+), #?128 ++** ldff1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_128_f32_offset, svfloat32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_offset_f32 (p0, z0, 128), ++ z0_res = svldff1_gather_offset_f32 (p0, z0, 128)) ++ ++/* ++** ldff1_gather_x0_f32_index: ++** lsl (x[0-9]+), x0, #?2 ++** ldff1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_x0_f32_index, svfloat32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_index_f32 (p0, z0, x0), ++ z0_res = svldff1_gather_index_f32 (p0, z0, x0)) ++ ++/* ++** ldff1_gather_m1_f32_index: ++** mov (x[0-9]+), #?-4 ++** ldff1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_m1_f32_index, svfloat32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_index_f32 (p0, z0, -1), ++ z0_res = svldff1_gather_index_f32 (p0, z0, -1)) ++ ++/* ++** ldff1_gather_0_f32_index: ++** ldff1w z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_0_f32_index, svfloat32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_index_f32 (p0, z0, 0), ++ z0_res = svldff1_gather_index_f32 (p0, z0, 0)) ++ ++/* ++** ldff1_gather_5_f32_index: ++** ldff1w z0\.s, p0/z, \[z0\.s, #20\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_5_f32_index, svfloat32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_index_f32 (p0, z0, 5), ++ z0_res = svldff1_gather_index_f32 (p0, z0, 5)) ++ ++/* ++** ldff1_gather_31_f32_index: ++** ldff1w z0\.s, p0/z, \[z0\.s, #124\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_31_f32_index, svfloat32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_index_f32 (p0, z0, 31), ++ z0_res = svldff1_gather_index_f32 (p0, z0, 31)) ++ ++/* ++** ldff1_gather_32_f32_index: ++** mov (x[0-9]+), #?128 ++** ldff1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_32_f32_index, svfloat32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_index_f32 (p0, z0, 32), ++ z0_res = svldff1_gather_index_f32 (p0, z0, 32)) ++ ++/* ++** ldff1_gather_x0_f32_s32offset: ++** ldff1w z0\.s, p0/z, \[x0, z0\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_x0_f32_s32offset, svfloat32_t, float32_t, svint32_t, ++ z0_res = svldff1_gather_s32offset_f32 (p0, x0, z0), ++ z0_res = svldff1_gather_offset (p0, x0, z0)) ++ ++/* ++** ldff1_gather_tied1_f32_s32offset: ++** ldff1w z0\.s, p0/z, \[x0, z0\.s, sxtw\] ++** ret ++*/ 
++TEST_LOAD_GATHER_SZ (ldff1_gather_tied1_f32_s32offset, svfloat32_t, float32_t, svint32_t, ++ z0_res = svldff1_gather_s32offset_f32 (p0, x0, z0), ++ z0_res = svldff1_gather_offset (p0, x0, z0)) ++ ++/* ++** ldff1_gather_untied_f32_s32offset: ++** ldff1w z0\.s, p0/z, \[x0, z1\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_untied_f32_s32offset, svfloat32_t, float32_t, svint32_t, ++ z0_res = svldff1_gather_s32offset_f32 (p0, x0, z1), ++ z0_res = svldff1_gather_offset (p0, x0, z1)) ++ ++/* ++** ldff1_gather_x0_f32_u32offset: ++** ldff1w z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_x0_f32_u32offset, svfloat32_t, float32_t, svuint32_t, ++ z0_res = svldff1_gather_u32offset_f32 (p0, x0, z0), ++ z0_res = svldff1_gather_offset (p0, x0, z0)) ++ ++/* ++** ldff1_gather_tied1_f32_u32offset: ++** ldff1w z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_tied1_f32_u32offset, svfloat32_t, float32_t, svuint32_t, ++ z0_res = svldff1_gather_u32offset_f32 (p0, x0, z0), ++ z0_res = svldff1_gather_offset (p0, x0, z0)) ++ ++/* ++** ldff1_gather_untied_f32_u32offset: ++** ldff1w z0\.s, p0/z, \[x0, z1\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_untied_f32_u32offset, svfloat32_t, float32_t, svuint32_t, ++ z0_res = svldff1_gather_u32offset_f32 (p0, x0, z1), ++ z0_res = svldff1_gather_offset (p0, x0, z1)) ++ ++/* ++** ldff1_gather_x0_f32_s32index: ++** ldff1w z0\.s, p0/z, \[x0, z0\.s, sxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_x0_f32_s32index, svfloat32_t, float32_t, svint32_t, ++ z0_res = svldff1_gather_s32index_f32 (p0, x0, z0), ++ z0_res = svldff1_gather_index (p0, x0, z0)) ++ ++/* ++** ldff1_gather_tied1_f32_s32index: ++** ldff1w z0\.s, p0/z, \[x0, z0\.s, sxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_tied1_f32_s32index, svfloat32_t, float32_t, svint32_t, ++ z0_res = svldff1_gather_s32index_f32 (p0, x0, z0), ++ z0_res = svldff1_gather_index (p0, x0, z0)) ++ ++/* ++** ldff1_gather_untied_f32_s32index: ++** ldff1w z0\.s, p0/z, \[x0, z1\.s, sxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_untied_f32_s32index, svfloat32_t, float32_t, svint32_t, ++ z0_res = svldff1_gather_s32index_f32 (p0, x0, z1), ++ z0_res = svldff1_gather_index (p0, x0, z1)) ++ ++/* ++** ldff1_gather_x0_f32_u32index: ++** ldff1w z0\.s, p0/z, \[x0, z0\.s, uxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_x0_f32_u32index, svfloat32_t, float32_t, svuint32_t, ++ z0_res = svldff1_gather_u32index_f32 (p0, x0, z0), ++ z0_res = svldff1_gather_index (p0, x0, z0)) ++ ++/* ++** ldff1_gather_tied1_f32_u32index: ++** ldff1w z0\.s, p0/z, \[x0, z0\.s, uxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_tied1_f32_u32index, svfloat32_t, float32_t, svuint32_t, ++ z0_res = svldff1_gather_u32index_f32 (p0, x0, z0), ++ z0_res = svldff1_gather_index (p0, x0, z0)) ++ ++/* ++** ldff1_gather_untied_f32_u32index: ++** ldff1w z0\.s, p0/z, \[x0, z1\.s, uxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_untied_f32_u32index, svfloat32_t, float32_t, svuint32_t, ++ z0_res = svldff1_gather_u32index_f32 (p0, x0, z1), ++ z0_res = svldff1_gather_index (p0, x0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_gather_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_gather_f64.c +new file mode 100644 +index 000000000..d0e47f0bf +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_gather_f64.c +@@ -0,0 +1,348 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { 
target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1_gather_f64_tied1: ++** ldff1d z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_f64_tied1, svfloat64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_f64 (p0, z0), ++ z0_res = svldff1_gather_f64 (p0, z0)) ++ ++/* ++** ldff1_gather_f64_untied: ++** ldff1d z0\.d, p0/z, \[z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_f64_untied, svfloat64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_f64 (p0, z1), ++ z0_res = svldff1_gather_f64 (p0, z1)) ++ ++/* ++** ldff1_gather_x0_f64_offset: ++** ldff1d z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_x0_f64_offset, svfloat64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_offset_f64 (p0, z0, x0), ++ z0_res = svldff1_gather_offset_f64 (p0, z0, x0)) ++ ++/* ++** ldff1_gather_m8_f64_offset: ++** mov (x[0-9]+), #?-8 ++** ldff1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_m8_f64_offset, svfloat64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_offset_f64 (p0, z0, -8), ++ z0_res = svldff1_gather_offset_f64 (p0, z0, -8)) ++ ++/* ++** ldff1_gather_0_f64_offset: ++** ldff1d z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_0_f64_offset, svfloat64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_offset_f64 (p0, z0, 0), ++ z0_res = svldff1_gather_offset_f64 (p0, z0, 0)) ++ ++/* ++** ldff1_gather_9_f64_offset: ++** mov (x[0-9]+), #?9 ++** ldff1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_9_f64_offset, svfloat64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_offset_f64 (p0, z0, 9), ++ z0_res = svldff1_gather_offset_f64 (p0, z0, 9)) ++ ++/* ++** ldff1_gather_10_f64_offset: ++** mov (x[0-9]+), #?10 ++** ldff1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_10_f64_offset, svfloat64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_offset_f64 (p0, z0, 10), ++ z0_res = svldff1_gather_offset_f64 (p0, z0, 10)) ++ ++/* ++** ldff1_gather_11_f64_offset: ++** mov (x[0-9]+), #?11 ++** ldff1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_11_f64_offset, svfloat64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_offset_f64 (p0, z0, 11), ++ z0_res = svldff1_gather_offset_f64 (p0, z0, 11)) ++ ++/* ++** ldff1_gather_12_f64_offset: ++** mov (x[0-9]+), #?12 ++** ldff1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_12_f64_offset, svfloat64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_offset_f64 (p0, z0, 12), ++ z0_res = svldff1_gather_offset_f64 (p0, z0, 12)) ++ ++/* ++** ldff1_gather_13_f64_offset: ++** mov (x[0-9]+), #?13 ++** ldff1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_13_f64_offset, svfloat64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_offset_f64 (p0, z0, 13), ++ z0_res = svldff1_gather_offset_f64 (p0, z0, 13)) ++ ++/* ++** ldff1_gather_14_f64_offset: ++** mov (x[0-9]+), #?14 ++** ldff1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_14_f64_offset, svfloat64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_offset_f64 (p0, z0, 14), ++ z0_res = svldff1_gather_offset_f64 (p0, z0, 14)) ++ ++/* ++** ldff1_gather_15_f64_offset: ++** mov (x[0-9]+), #?15 ++** ldff1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_15_f64_offset, svfloat64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_offset_f64 (p0, z0, 15), ++ z0_res = svldff1_gather_offset_f64 
(p0, z0, 15)) ++ ++/* ++** ldff1_gather_16_f64_offset: ++** ldff1d z0\.d, p0/z, \[z0\.d, #16\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_16_f64_offset, svfloat64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_offset_f64 (p0, z0, 16), ++ z0_res = svldff1_gather_offset_f64 (p0, z0, 16)) ++ ++/* ++** ldff1_gather_248_f64_offset: ++** ldff1d z0\.d, p0/z, \[z0\.d, #248\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_248_f64_offset, svfloat64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_offset_f64 (p0, z0, 248), ++ z0_res = svldff1_gather_offset_f64 (p0, z0, 248)) ++ ++/* ++** ldff1_gather_256_f64_offset: ++** mov (x[0-9]+), #?256 ++** ldff1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_256_f64_offset, svfloat64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_offset_f64 (p0, z0, 256), ++ z0_res = svldff1_gather_offset_f64 (p0, z0, 256)) ++ ++/* ++** ldff1_gather_x0_f64_index: ++** lsl (x[0-9]+), x0, #?3 ++** ldff1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_x0_f64_index, svfloat64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_index_f64 (p0, z0, x0), ++ z0_res = svldff1_gather_index_f64 (p0, z0, x0)) ++ ++/* ++** ldff1_gather_m1_f64_index: ++** mov (x[0-9]+), #?-8 ++** ldff1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_m1_f64_index, svfloat64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_index_f64 (p0, z0, -1), ++ z0_res = svldff1_gather_index_f64 (p0, z0, -1)) ++ ++/* ++** ldff1_gather_0_f64_index: ++** ldff1d z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_0_f64_index, svfloat64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_index_f64 (p0, z0, 0), ++ z0_res = svldff1_gather_index_f64 (p0, z0, 0)) ++ ++/* ++** ldff1_gather_5_f64_index: ++** ldff1d z0\.d, p0/z, \[z0\.d, #40\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_5_f64_index, svfloat64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_index_f64 (p0, z0, 5), ++ z0_res = svldff1_gather_index_f64 (p0, z0, 5)) ++ ++/* ++** ldff1_gather_31_f64_index: ++** ldff1d z0\.d, p0/z, \[z0\.d, #248\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_31_f64_index, svfloat64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_index_f64 (p0, z0, 31), ++ z0_res = svldff1_gather_index_f64 (p0, z0, 31)) ++ ++/* ++** ldff1_gather_32_f64_index: ++** mov (x[0-9]+), #?256 ++** ldff1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_32_f64_index, svfloat64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_index_f64 (p0, z0, 32), ++ z0_res = svldff1_gather_index_f64 (p0, z0, 32)) ++ ++/* ++** ldff1_gather_x0_f64_s64offset: ++** ldff1d z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_x0_f64_s64offset, svfloat64_t, float64_t, svint64_t, ++ z0_res = svldff1_gather_s64offset_f64 (p0, x0, z0), ++ z0_res = svldff1_gather_offset (p0, x0, z0)) ++ ++/* ++** ldff1_gather_tied1_f64_s64offset: ++** ldff1d z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_tied1_f64_s64offset, svfloat64_t, float64_t, svint64_t, ++ z0_res = svldff1_gather_s64offset_f64 (p0, x0, z0), ++ z0_res = svldff1_gather_offset (p0, x0, z0)) ++ ++/* ++** ldff1_gather_untied_f64_s64offset: ++** ldff1d z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_untied_f64_s64offset, svfloat64_t, float64_t, svint64_t, ++ z0_res = svldff1_gather_s64offset_f64 (p0, x0, z1), ++ z0_res = svldff1_gather_offset (p0, x0, z1)) ++ ++/* ++** 
ldff1_gather_ext_f64_s64offset: ++** ldff1d z0\.d, p0/z, \[x0, z1\.d, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_ext_f64_s64offset, svfloat64_t, float64_t, svint64_t, ++ z0_res = svldff1_gather_s64offset_f64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svldff1_gather_offset (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ldff1_gather_x0_f64_u64offset: ++** ldff1d z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_x0_f64_u64offset, svfloat64_t, float64_t, svuint64_t, ++ z0_res = svldff1_gather_u64offset_f64 (p0, x0, z0), ++ z0_res = svldff1_gather_offset (p0, x0, z0)) ++ ++/* ++** ldff1_gather_tied1_f64_u64offset: ++** ldff1d z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_tied1_f64_u64offset, svfloat64_t, float64_t, svuint64_t, ++ z0_res = svldff1_gather_u64offset_f64 (p0, x0, z0), ++ z0_res = svldff1_gather_offset (p0, x0, z0)) ++ ++/* ++** ldff1_gather_untied_f64_u64offset: ++** ldff1d z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_untied_f64_u64offset, svfloat64_t, float64_t, svuint64_t, ++ z0_res = svldff1_gather_u64offset_f64 (p0, x0, z1), ++ z0_res = svldff1_gather_offset (p0, x0, z1)) ++ ++/* ++** ldff1_gather_ext_f64_u64offset: ++** ldff1d z0\.d, p0/z, \[x0, z1\.d, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_ext_f64_u64offset, svfloat64_t, float64_t, svuint64_t, ++ z0_res = svldff1_gather_u64offset_f64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svldff1_gather_offset (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ldff1_gather_x0_f64_s64index: ++** ldff1d z0\.d, p0/z, \[x0, z0\.d, lsl 3\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_x0_f64_s64index, svfloat64_t, float64_t, svint64_t, ++ z0_res = svldff1_gather_s64index_f64 (p0, x0, z0), ++ z0_res = svldff1_gather_index (p0, x0, z0)) ++ ++/* ++** ldff1_gather_tied1_f64_s64index: ++** ldff1d z0\.d, p0/z, \[x0, z0\.d, lsl 3\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_tied1_f64_s64index, svfloat64_t, float64_t, svint64_t, ++ z0_res = svldff1_gather_s64index_f64 (p0, x0, z0), ++ z0_res = svldff1_gather_index (p0, x0, z0)) ++ ++/* ++** ldff1_gather_untied_f64_s64index: ++** ldff1d z0\.d, p0/z, \[x0, z1\.d, lsl 3\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_untied_f64_s64index, svfloat64_t, float64_t, svint64_t, ++ z0_res = svldff1_gather_s64index_f64 (p0, x0, z1), ++ z0_res = svldff1_gather_index (p0, x0, z1)) ++ ++/* ++** ldff1_gather_ext_f64_s64index: ++** ldff1d z0\.d, p0/z, \[x0, z1\.d, sxtw 3\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_ext_f64_s64index, svfloat64_t, float64_t, svint64_t, ++ z0_res = svldff1_gather_s64index_f64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svldff1_gather_index (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ldff1_gather_x0_f64_u64index: ++** ldff1d z0\.d, p0/z, \[x0, z0\.d, lsl 3\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_x0_f64_u64index, svfloat64_t, float64_t, svuint64_t, ++ z0_res = svldff1_gather_u64index_f64 (p0, x0, z0), ++ z0_res = svldff1_gather_index (p0, x0, z0)) ++ ++/* ++** ldff1_gather_tied1_f64_u64index: ++** ldff1d z0\.d, p0/z, \[x0, z0\.d, lsl 3\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_tied1_f64_u64index, svfloat64_t, float64_t, svuint64_t, ++ z0_res = svldff1_gather_u64index_f64 (p0, x0, z0), ++ z0_res = svldff1_gather_index (p0, x0, z0)) ++ ++/* ++** ldff1_gather_untied_f64_u64index: ++** ldff1d z0\.d, p0/z, \[x0, z1\.d, lsl 3\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_untied_f64_u64index, svfloat64_t, float64_t, svuint64_t, ++ z0_res 
= svldff1_gather_u64index_f64 (p0, x0, z1), ++ z0_res = svldff1_gather_index (p0, x0, z1)) ++ ++/* ++** ldff1_gather_ext_f64_u64index: ++** ldff1d z0\.d, p0/z, \[x0, z1\.d, uxtw 3\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_ext_f64_u64index, svfloat64_t, float64_t, svuint64_t, ++ z0_res = svldff1_gather_u64index_f64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svldff1_gather_index (p0, x0, svextw_x (p0, z1))) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_gather_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_gather_s32.c +new file mode 100644 +index 000000000..66bf0f746 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_gather_s32.c +@@ -0,0 +1,272 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1_gather_s32_tied1: ++** ldff1w z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_s32_tied1, svint32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_s32 (p0, z0), ++ z0_res = svldff1_gather_s32 (p0, z0)) ++ ++/* ++** ldff1_gather_s32_untied: ++** ldff1w z0\.s, p0/z, \[z1\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_s32_untied, svint32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_s32 (p0, z1), ++ z0_res = svldff1_gather_s32 (p0, z1)) ++ ++/* ++** ldff1_gather_x0_s32_offset: ++** ldff1w z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_x0_s32_offset, svint32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_offset_s32 (p0, z0, x0), ++ z0_res = svldff1_gather_offset_s32 (p0, z0, x0)) ++ ++/* ++** ldff1_gather_m4_s32_offset: ++** mov (x[0-9]+), #?-4 ++** ldff1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_m4_s32_offset, svint32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_offset_s32 (p0, z0, -4), ++ z0_res = svldff1_gather_offset_s32 (p0, z0, -4)) ++ ++/* ++** ldff1_gather_0_s32_offset: ++** ldff1w z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_0_s32_offset, svint32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_offset_s32 (p0, z0, 0), ++ z0_res = svldff1_gather_offset_s32 (p0, z0, 0)) ++ ++/* ++** ldff1_gather_5_s32_offset: ++** mov (x[0-9]+), #?5 ++** ldff1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_5_s32_offset, svint32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_offset_s32 (p0, z0, 5), ++ z0_res = svldff1_gather_offset_s32 (p0, z0, 5)) ++ ++/* ++** ldff1_gather_6_s32_offset: ++** mov (x[0-9]+), #?6 ++** ldff1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_6_s32_offset, svint32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_offset_s32 (p0, z0, 6), ++ z0_res = svldff1_gather_offset_s32 (p0, z0, 6)) ++ ++/* ++** ldff1_gather_7_s32_offset: ++** mov (x[0-9]+), #?7 ++** ldff1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_7_s32_offset, svint32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_offset_s32 (p0, z0, 7), ++ z0_res = svldff1_gather_offset_s32 (p0, z0, 7)) ++ ++/* ++** ldff1_gather_8_s32_offset: ++** ldff1w z0\.s, p0/z, \[z0\.s, #8\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_8_s32_offset, svint32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_offset_s32 (p0, z0, 8), ++ z0_res = svldff1_gather_offset_s32 (p0, z0, 8)) ++ ++/* ++** ldff1_gather_124_s32_offset: ++** ldff1w z0\.s, p0/z, \[z0\.s, #124\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_124_s32_offset, 
svint32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_offset_s32 (p0, z0, 124), ++ z0_res = svldff1_gather_offset_s32 (p0, z0, 124)) ++ ++/* ++** ldff1_gather_128_s32_offset: ++** mov (x[0-9]+), #?128 ++** ldff1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_128_s32_offset, svint32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_offset_s32 (p0, z0, 128), ++ z0_res = svldff1_gather_offset_s32 (p0, z0, 128)) ++ ++/* ++** ldff1_gather_x0_s32_index: ++** lsl (x[0-9]+), x0, #?2 ++** ldff1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_x0_s32_index, svint32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_index_s32 (p0, z0, x0), ++ z0_res = svldff1_gather_index_s32 (p0, z0, x0)) ++ ++/* ++** ldff1_gather_m1_s32_index: ++** mov (x[0-9]+), #?-4 ++** ldff1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_m1_s32_index, svint32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_index_s32 (p0, z0, -1), ++ z0_res = svldff1_gather_index_s32 (p0, z0, -1)) ++ ++/* ++** ldff1_gather_0_s32_index: ++** ldff1w z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_0_s32_index, svint32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_index_s32 (p0, z0, 0), ++ z0_res = svldff1_gather_index_s32 (p0, z0, 0)) ++ ++/* ++** ldff1_gather_5_s32_index: ++** ldff1w z0\.s, p0/z, \[z0\.s, #20\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_5_s32_index, svint32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_index_s32 (p0, z0, 5), ++ z0_res = svldff1_gather_index_s32 (p0, z0, 5)) ++ ++/* ++** ldff1_gather_31_s32_index: ++** ldff1w z0\.s, p0/z, \[z0\.s, #124\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_31_s32_index, svint32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_index_s32 (p0, z0, 31), ++ z0_res = svldff1_gather_index_s32 (p0, z0, 31)) ++ ++/* ++** ldff1_gather_32_s32_index: ++** mov (x[0-9]+), #?128 ++** ldff1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_32_s32_index, svint32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_index_s32 (p0, z0, 32), ++ z0_res = svldff1_gather_index_s32 (p0, z0, 32)) ++ ++/* ++** ldff1_gather_x0_s32_s32offset: ++** ldff1w z0\.s, p0/z, \[x0, z0\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_x0_s32_s32offset, svint32_t, int32_t, svint32_t, ++ z0_res = svldff1_gather_s32offset_s32 (p0, x0, z0), ++ z0_res = svldff1_gather_offset (p0, x0, z0)) ++ ++/* ++** ldff1_gather_tied1_s32_s32offset: ++** ldff1w z0\.s, p0/z, \[x0, z0\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_tied1_s32_s32offset, svint32_t, int32_t, svint32_t, ++ z0_res = svldff1_gather_s32offset_s32 (p0, x0, z0), ++ z0_res = svldff1_gather_offset (p0, x0, z0)) ++ ++/* ++** ldff1_gather_untied_s32_s32offset: ++** ldff1w z0\.s, p0/z, \[x0, z1\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_untied_s32_s32offset, svint32_t, int32_t, svint32_t, ++ z0_res = svldff1_gather_s32offset_s32 (p0, x0, z1), ++ z0_res = svldff1_gather_offset (p0, x0, z1)) ++ ++/* ++** ldff1_gather_x0_s32_u32offset: ++** ldff1w z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_x0_s32_u32offset, svint32_t, int32_t, svuint32_t, ++ z0_res = svldff1_gather_u32offset_s32 (p0, x0, z0), ++ z0_res = svldff1_gather_offset (p0, x0, z0)) ++ ++/* ++** ldff1_gather_tied1_s32_u32offset: ++** ldff1w z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_tied1_s32_u32offset, svint32_t, 
int32_t, svuint32_t, ++ z0_res = svldff1_gather_u32offset_s32 (p0, x0, z0), ++ z0_res = svldff1_gather_offset (p0, x0, z0)) ++ ++/* ++** ldff1_gather_untied_s32_u32offset: ++** ldff1w z0\.s, p0/z, \[x0, z1\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_untied_s32_u32offset, svint32_t, int32_t, svuint32_t, ++ z0_res = svldff1_gather_u32offset_s32 (p0, x0, z1), ++ z0_res = svldff1_gather_offset (p0, x0, z1)) ++ ++/* ++** ldff1_gather_x0_s32_s32index: ++** ldff1w z0\.s, p0/z, \[x0, z0\.s, sxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_x0_s32_s32index, svint32_t, int32_t, svint32_t, ++ z0_res = svldff1_gather_s32index_s32 (p0, x0, z0), ++ z0_res = svldff1_gather_index (p0, x0, z0)) ++ ++/* ++** ldff1_gather_tied1_s32_s32index: ++** ldff1w z0\.s, p0/z, \[x0, z0\.s, sxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_tied1_s32_s32index, svint32_t, int32_t, svint32_t, ++ z0_res = svldff1_gather_s32index_s32 (p0, x0, z0), ++ z0_res = svldff1_gather_index (p0, x0, z0)) ++ ++/* ++** ldff1_gather_untied_s32_s32index: ++** ldff1w z0\.s, p0/z, \[x0, z1\.s, sxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_untied_s32_s32index, svint32_t, int32_t, svint32_t, ++ z0_res = svldff1_gather_s32index_s32 (p0, x0, z1), ++ z0_res = svldff1_gather_index (p0, x0, z1)) ++ ++/* ++** ldff1_gather_x0_s32_u32index: ++** ldff1w z0\.s, p0/z, \[x0, z0\.s, uxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_x0_s32_u32index, svint32_t, int32_t, svuint32_t, ++ z0_res = svldff1_gather_u32index_s32 (p0, x0, z0), ++ z0_res = svldff1_gather_index (p0, x0, z0)) ++ ++/* ++** ldff1_gather_tied1_s32_u32index: ++** ldff1w z0\.s, p0/z, \[x0, z0\.s, uxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_tied1_s32_u32index, svint32_t, int32_t, svuint32_t, ++ z0_res = svldff1_gather_u32index_s32 (p0, x0, z0), ++ z0_res = svldff1_gather_index (p0, x0, z0)) ++ ++/* ++** ldff1_gather_untied_s32_u32index: ++** ldff1w z0\.s, p0/z, \[x0, z1\.s, uxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_untied_s32_u32index, svint32_t, int32_t, svuint32_t, ++ z0_res = svldff1_gather_u32index_s32 (p0, x0, z1), ++ z0_res = svldff1_gather_index (p0, x0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_gather_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_gather_s64.c +new file mode 100644 +index 000000000..faf71bf9d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_gather_s64.c +@@ -0,0 +1,348 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1_gather_s64_tied1: ++** ldff1d z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_s64_tied1, svint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_s64 (p0, z0), ++ z0_res = svldff1_gather_s64 (p0, z0)) ++ ++/* ++** ldff1_gather_s64_untied: ++** ldff1d z0\.d, p0/z, \[z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_s64_untied, svint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_s64 (p0, z1), ++ z0_res = svldff1_gather_s64 (p0, z1)) ++ ++/* ++** ldff1_gather_x0_s64_offset: ++** ldff1d z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_x0_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_offset_s64 (p0, z0, x0), ++ z0_res = svldff1_gather_offset_s64 (p0, z0, x0)) ++ ++/* ++** ldff1_gather_m8_s64_offset: ++** mov (x[0-9]+), #?-8 ++** ldff1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_m8_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_offset_s64 (p0, z0, -8), ++ z0_res = svldff1_gather_offset_s64 (p0, z0, -8)) ++ ++/* ++** ldff1_gather_0_s64_offset: ++** ldff1d z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_0_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_offset_s64 (p0, z0, 0), ++ z0_res = svldff1_gather_offset_s64 (p0, z0, 0)) ++ ++/* ++** ldff1_gather_9_s64_offset: ++** mov (x[0-9]+), #?9 ++** ldff1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_9_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_offset_s64 (p0, z0, 9), ++ z0_res = svldff1_gather_offset_s64 (p0, z0, 9)) ++ ++/* ++** ldff1_gather_10_s64_offset: ++** mov (x[0-9]+), #?10 ++** ldff1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_10_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_offset_s64 (p0, z0, 10), ++ z0_res = svldff1_gather_offset_s64 (p0, z0, 10)) ++ ++/* ++** ldff1_gather_11_s64_offset: ++** mov (x[0-9]+), #?11 ++** ldff1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_11_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_offset_s64 (p0, z0, 11), ++ z0_res = svldff1_gather_offset_s64 (p0, z0, 11)) ++ ++/* ++** ldff1_gather_12_s64_offset: ++** mov (x[0-9]+), #?12 ++** ldff1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_12_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_offset_s64 (p0, z0, 12), ++ z0_res = svldff1_gather_offset_s64 (p0, z0, 12)) ++ ++/* ++** ldff1_gather_13_s64_offset: ++** mov (x[0-9]+), #?13 ++** ldff1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_13_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_offset_s64 (p0, z0, 13), ++ z0_res = svldff1_gather_offset_s64 (p0, z0, 13)) ++ ++/* ++** ldff1_gather_14_s64_offset: ++** mov (x[0-9]+), #?14 ++** ldff1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_14_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_offset_s64 (p0, z0, 14), ++ z0_res = svldff1_gather_offset_s64 (p0, z0, 14)) ++ ++/* ++** ldff1_gather_15_s64_offset: ++** mov (x[0-9]+), #?15 ++** ldff1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_15_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_offset_s64 (p0, z0, 15), ++ z0_res = svldff1_gather_offset_s64 (p0, z0, 15)) ++ ++/* ++** 
ldff1_gather_16_s64_offset: ++** ldff1d z0\.d, p0/z, \[z0\.d, #16\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_16_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_offset_s64 (p0, z0, 16), ++ z0_res = svldff1_gather_offset_s64 (p0, z0, 16)) ++ ++/* ++** ldff1_gather_248_s64_offset: ++** ldff1d z0\.d, p0/z, \[z0\.d, #248\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_248_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_offset_s64 (p0, z0, 248), ++ z0_res = svldff1_gather_offset_s64 (p0, z0, 248)) ++ ++/* ++** ldff1_gather_256_s64_offset: ++** mov (x[0-9]+), #?256 ++** ldff1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_256_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_offset_s64 (p0, z0, 256), ++ z0_res = svldff1_gather_offset_s64 (p0, z0, 256)) ++ ++/* ++** ldff1_gather_x0_s64_index: ++** lsl (x[0-9]+), x0, #?3 ++** ldff1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_x0_s64_index, svint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_index_s64 (p0, z0, x0), ++ z0_res = svldff1_gather_index_s64 (p0, z0, x0)) ++ ++/* ++** ldff1_gather_m1_s64_index: ++** mov (x[0-9]+), #?-8 ++** ldff1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_m1_s64_index, svint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_index_s64 (p0, z0, -1), ++ z0_res = svldff1_gather_index_s64 (p0, z0, -1)) ++ ++/* ++** ldff1_gather_0_s64_index: ++** ldff1d z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_0_s64_index, svint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_index_s64 (p0, z0, 0), ++ z0_res = svldff1_gather_index_s64 (p0, z0, 0)) ++ ++/* ++** ldff1_gather_5_s64_index: ++** ldff1d z0\.d, p0/z, \[z0\.d, #40\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_5_s64_index, svint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_index_s64 (p0, z0, 5), ++ z0_res = svldff1_gather_index_s64 (p0, z0, 5)) ++ ++/* ++** ldff1_gather_31_s64_index: ++** ldff1d z0\.d, p0/z, \[z0\.d, #248\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_31_s64_index, svint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_index_s64 (p0, z0, 31), ++ z0_res = svldff1_gather_index_s64 (p0, z0, 31)) ++ ++/* ++** ldff1_gather_32_s64_index: ++** mov (x[0-9]+), #?256 ++** ldff1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_32_s64_index, svint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_index_s64 (p0, z0, 32), ++ z0_res = svldff1_gather_index_s64 (p0, z0, 32)) ++ ++/* ++** ldff1_gather_x0_s64_s64offset: ++** ldff1d z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_x0_s64_s64offset, svint64_t, int64_t, svint64_t, ++ z0_res = svldff1_gather_s64offset_s64 (p0, x0, z0), ++ z0_res = svldff1_gather_offset (p0, x0, z0)) ++ ++/* ++** ldff1_gather_tied1_s64_s64offset: ++** ldff1d z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_tied1_s64_s64offset, svint64_t, int64_t, svint64_t, ++ z0_res = svldff1_gather_s64offset_s64 (p0, x0, z0), ++ z0_res = svldff1_gather_offset (p0, x0, z0)) ++ ++/* ++** ldff1_gather_untied_s64_s64offset: ++** ldff1d z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_untied_s64_s64offset, svint64_t, int64_t, svint64_t, ++ z0_res = svldff1_gather_s64offset_s64 (p0, x0, z1), ++ z0_res = svldff1_gather_offset (p0, x0, z1)) ++ ++/* ++** ldff1_gather_ext_s64_s64offset: ++** ldff1d z0\.d, p0/z, \[x0, z1\.d, sxtw\] ++** 
ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_ext_s64_s64offset, svint64_t, int64_t, svint64_t, ++ z0_res = svldff1_gather_s64offset_s64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svldff1_gather_offset (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ldff1_gather_x0_s64_u64offset: ++** ldff1d z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_x0_s64_u64offset, svint64_t, int64_t, svuint64_t, ++ z0_res = svldff1_gather_u64offset_s64 (p0, x0, z0), ++ z0_res = svldff1_gather_offset (p0, x0, z0)) ++ ++/* ++** ldff1_gather_tied1_s64_u64offset: ++** ldff1d z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_tied1_s64_u64offset, svint64_t, int64_t, svuint64_t, ++ z0_res = svldff1_gather_u64offset_s64 (p0, x0, z0), ++ z0_res = svldff1_gather_offset (p0, x0, z0)) ++ ++/* ++** ldff1_gather_untied_s64_u64offset: ++** ldff1d z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_untied_s64_u64offset, svint64_t, int64_t, svuint64_t, ++ z0_res = svldff1_gather_u64offset_s64 (p0, x0, z1), ++ z0_res = svldff1_gather_offset (p0, x0, z1)) ++ ++/* ++** ldff1_gather_ext_s64_u64offset: ++** ldff1d z0\.d, p0/z, \[x0, z1\.d, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_ext_s64_u64offset, svint64_t, int64_t, svuint64_t, ++ z0_res = svldff1_gather_u64offset_s64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svldff1_gather_offset (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ldff1_gather_x0_s64_s64index: ++** ldff1d z0\.d, p0/z, \[x0, z0\.d, lsl 3\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_x0_s64_s64index, svint64_t, int64_t, svint64_t, ++ z0_res = svldff1_gather_s64index_s64 (p0, x0, z0), ++ z0_res = svldff1_gather_index (p0, x0, z0)) ++ ++/* ++** ldff1_gather_tied1_s64_s64index: ++** ldff1d z0\.d, p0/z, \[x0, z0\.d, lsl 3\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_tied1_s64_s64index, svint64_t, int64_t, svint64_t, ++ z0_res = svldff1_gather_s64index_s64 (p0, x0, z0), ++ z0_res = svldff1_gather_index (p0, x0, z0)) ++ ++/* ++** ldff1_gather_untied_s64_s64index: ++** ldff1d z0\.d, p0/z, \[x0, z1\.d, lsl 3\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_untied_s64_s64index, svint64_t, int64_t, svint64_t, ++ z0_res = svldff1_gather_s64index_s64 (p0, x0, z1), ++ z0_res = svldff1_gather_index (p0, x0, z1)) ++ ++/* ++** ldff1_gather_ext_s64_s64index: ++** ldff1d z0\.d, p0/z, \[x0, z1\.d, sxtw 3\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_ext_s64_s64index, svint64_t, int64_t, svint64_t, ++ z0_res = svldff1_gather_s64index_s64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svldff1_gather_index (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ldff1_gather_x0_s64_u64index: ++** ldff1d z0\.d, p0/z, \[x0, z0\.d, lsl 3\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_x0_s64_u64index, svint64_t, int64_t, svuint64_t, ++ z0_res = svldff1_gather_u64index_s64 (p0, x0, z0), ++ z0_res = svldff1_gather_index (p0, x0, z0)) ++ ++/* ++** ldff1_gather_tied1_s64_u64index: ++** ldff1d z0\.d, p0/z, \[x0, z0\.d, lsl 3\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_tied1_s64_u64index, svint64_t, int64_t, svuint64_t, ++ z0_res = svldff1_gather_u64index_s64 (p0, x0, z0), ++ z0_res = svldff1_gather_index (p0, x0, z0)) ++ ++/* ++** ldff1_gather_untied_s64_u64index: ++** ldff1d z0\.d, p0/z, \[x0, z1\.d, lsl 3\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_untied_s64_u64index, svint64_t, int64_t, svuint64_t, ++ z0_res = svldff1_gather_u64index_s64 (p0, x0, z1), ++ z0_res = svldff1_gather_index (p0, x0, z1)) ++ ++/* ++** 
ldff1_gather_ext_s64_u64index: ++** ldff1d z0\.d, p0/z, \[x0, z1\.d, uxtw 3\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_ext_s64_u64index, svint64_t, int64_t, svuint64_t, ++ z0_res = svldff1_gather_u64index_s64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svldff1_gather_index (p0, x0, svextw_x (p0, z1))) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_gather_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_gather_u32.c +new file mode 100644 +index 000000000..41c7dc9cf +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_gather_u32.c +@@ -0,0 +1,272 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1_gather_u32_tied1: ++** ldff1w z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_u32_tied1, svuint32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_u32 (p0, z0), ++ z0_res = svldff1_gather_u32 (p0, z0)) ++ ++/* ++** ldff1_gather_u32_untied: ++** ldff1w z0\.s, p0/z, \[z1\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_u32_untied, svuint32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_u32 (p0, z1), ++ z0_res = svldff1_gather_u32 (p0, z1)) ++ ++/* ++** ldff1_gather_x0_u32_offset: ++** ldff1w z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_x0_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_offset_u32 (p0, z0, x0), ++ z0_res = svldff1_gather_offset_u32 (p0, z0, x0)) ++ ++/* ++** ldff1_gather_m4_u32_offset: ++** mov (x[0-9]+), #?-4 ++** ldff1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_m4_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_offset_u32 (p0, z0, -4), ++ z0_res = svldff1_gather_offset_u32 (p0, z0, -4)) ++ ++/* ++** ldff1_gather_0_u32_offset: ++** ldff1w z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_0_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_offset_u32 (p0, z0, 0), ++ z0_res = svldff1_gather_offset_u32 (p0, z0, 0)) ++ ++/* ++** ldff1_gather_5_u32_offset: ++** mov (x[0-9]+), #?5 ++** ldff1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_5_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_offset_u32 (p0, z0, 5), ++ z0_res = svldff1_gather_offset_u32 (p0, z0, 5)) ++ ++/* ++** ldff1_gather_6_u32_offset: ++** mov (x[0-9]+), #?6 ++** ldff1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_6_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_offset_u32 (p0, z0, 6), ++ z0_res = svldff1_gather_offset_u32 (p0, z0, 6)) ++ ++/* ++** ldff1_gather_7_u32_offset: ++** mov (x[0-9]+), #?7 ++** ldff1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_7_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_offset_u32 (p0, z0, 7), ++ z0_res = svldff1_gather_offset_u32 (p0, z0, 7)) ++ ++/* ++** ldff1_gather_8_u32_offset: ++** ldff1w z0\.s, p0/z, \[z0\.s, #8\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_8_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_offset_u32 (p0, z0, 8), ++ z0_res = svldff1_gather_offset_u32 (p0, z0, 8)) ++ ++/* ++** ldff1_gather_124_u32_offset: ++** ldff1w z0\.s, p0/z, \[z0\.s, #124\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_124_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_offset_u32 (p0, z0, 124), ++ z0_res = 
svldff1_gather_offset_u32 (p0, z0, 124)) ++ ++/* ++** ldff1_gather_128_u32_offset: ++** mov (x[0-9]+), #?128 ++** ldff1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_128_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_offset_u32 (p0, z0, 128), ++ z0_res = svldff1_gather_offset_u32 (p0, z0, 128)) ++ ++/* ++** ldff1_gather_x0_u32_index: ++** lsl (x[0-9]+), x0, #?2 ++** ldff1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_x0_u32_index, svuint32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_index_u32 (p0, z0, x0), ++ z0_res = svldff1_gather_index_u32 (p0, z0, x0)) ++ ++/* ++** ldff1_gather_m1_u32_index: ++** mov (x[0-9]+), #?-4 ++** ldff1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_m1_u32_index, svuint32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_index_u32 (p0, z0, -1), ++ z0_res = svldff1_gather_index_u32 (p0, z0, -1)) ++ ++/* ++** ldff1_gather_0_u32_index: ++** ldff1w z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_0_u32_index, svuint32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_index_u32 (p0, z0, 0), ++ z0_res = svldff1_gather_index_u32 (p0, z0, 0)) ++ ++/* ++** ldff1_gather_5_u32_index: ++** ldff1w z0\.s, p0/z, \[z0\.s, #20\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_5_u32_index, svuint32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_index_u32 (p0, z0, 5), ++ z0_res = svldff1_gather_index_u32 (p0, z0, 5)) ++ ++/* ++** ldff1_gather_31_u32_index: ++** ldff1w z0\.s, p0/z, \[z0\.s, #124\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_31_u32_index, svuint32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_index_u32 (p0, z0, 31), ++ z0_res = svldff1_gather_index_u32 (p0, z0, 31)) ++ ++/* ++** ldff1_gather_32_u32_index: ++** mov (x[0-9]+), #?128 ++** ldff1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_32_u32_index, svuint32_t, svuint32_t, ++ z0_res = svldff1_gather_u32base_index_u32 (p0, z0, 32), ++ z0_res = svldff1_gather_index_u32 (p0, z0, 32)) ++ ++/* ++** ldff1_gather_x0_u32_s32offset: ++** ldff1w z0\.s, p0/z, \[x0, z0\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_x0_u32_s32offset, svuint32_t, uint32_t, svint32_t, ++ z0_res = svldff1_gather_s32offset_u32 (p0, x0, z0), ++ z0_res = svldff1_gather_offset (p0, x0, z0)) ++ ++/* ++** ldff1_gather_tied1_u32_s32offset: ++** ldff1w z0\.s, p0/z, \[x0, z0\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_tied1_u32_s32offset, svuint32_t, uint32_t, svint32_t, ++ z0_res = svldff1_gather_s32offset_u32 (p0, x0, z0), ++ z0_res = svldff1_gather_offset (p0, x0, z0)) ++ ++/* ++** ldff1_gather_untied_u32_s32offset: ++** ldff1w z0\.s, p0/z, \[x0, z1\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_untied_u32_s32offset, svuint32_t, uint32_t, svint32_t, ++ z0_res = svldff1_gather_s32offset_u32 (p0, x0, z1), ++ z0_res = svldff1_gather_offset (p0, x0, z1)) ++ ++/* ++** ldff1_gather_x0_u32_u32offset: ++** ldff1w z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_x0_u32_u32offset, svuint32_t, uint32_t, svuint32_t, ++ z0_res = svldff1_gather_u32offset_u32 (p0, x0, z0), ++ z0_res = svldff1_gather_offset (p0, x0, z0)) ++ ++/* ++** ldff1_gather_tied1_u32_u32offset: ++** ldff1w z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_tied1_u32_u32offset, svuint32_t, uint32_t, svuint32_t, ++ z0_res = svldff1_gather_u32offset_u32 (p0, x0, z0), ++ 
z0_res = svldff1_gather_offset (p0, x0, z0)) ++ ++/* ++** ldff1_gather_untied_u32_u32offset: ++** ldff1w z0\.s, p0/z, \[x0, z1\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_untied_u32_u32offset, svuint32_t, uint32_t, svuint32_t, ++ z0_res = svldff1_gather_u32offset_u32 (p0, x0, z1), ++ z0_res = svldff1_gather_offset (p0, x0, z1)) ++ ++/* ++** ldff1_gather_x0_u32_s32index: ++** ldff1w z0\.s, p0/z, \[x0, z0\.s, sxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_x0_u32_s32index, svuint32_t, uint32_t, svint32_t, ++ z0_res = svldff1_gather_s32index_u32 (p0, x0, z0), ++ z0_res = svldff1_gather_index (p0, x0, z0)) ++ ++/* ++** ldff1_gather_tied1_u32_s32index: ++** ldff1w z0\.s, p0/z, \[x0, z0\.s, sxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_tied1_u32_s32index, svuint32_t, uint32_t, svint32_t, ++ z0_res = svldff1_gather_s32index_u32 (p0, x0, z0), ++ z0_res = svldff1_gather_index (p0, x0, z0)) ++ ++/* ++** ldff1_gather_untied_u32_s32index: ++** ldff1w z0\.s, p0/z, \[x0, z1\.s, sxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_untied_u32_s32index, svuint32_t, uint32_t, svint32_t, ++ z0_res = svldff1_gather_s32index_u32 (p0, x0, z1), ++ z0_res = svldff1_gather_index (p0, x0, z1)) ++ ++/* ++** ldff1_gather_x0_u32_u32index: ++** ldff1w z0\.s, p0/z, \[x0, z0\.s, uxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_x0_u32_u32index, svuint32_t, uint32_t, svuint32_t, ++ z0_res = svldff1_gather_u32index_u32 (p0, x0, z0), ++ z0_res = svldff1_gather_index (p0, x0, z0)) ++ ++/* ++** ldff1_gather_tied1_u32_u32index: ++** ldff1w z0\.s, p0/z, \[x0, z0\.s, uxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_tied1_u32_u32index, svuint32_t, uint32_t, svuint32_t, ++ z0_res = svldff1_gather_u32index_u32 (p0, x0, z0), ++ z0_res = svldff1_gather_index (p0, x0, z0)) ++ ++/* ++** ldff1_gather_untied_u32_u32index: ++** ldff1w z0\.s, p0/z, \[x0, z1\.s, uxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_untied_u32_u32index, svuint32_t, uint32_t, svuint32_t, ++ z0_res = svldff1_gather_u32index_u32 (p0, x0, z1), ++ z0_res = svldff1_gather_index (p0, x0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_gather_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_gather_u64.c +new file mode 100644 +index 000000000..8b53ce94f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_gather_u64.c +@@ -0,0 +1,348 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1_gather_u64_tied1: ++** ldff1d z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_u64_tied1, svuint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_u64 (p0, z0), ++ z0_res = svldff1_gather_u64 (p0, z0)) ++ ++/* ++** ldff1_gather_u64_untied: ++** ldff1d z0\.d, p0/z, \[z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_u64_untied, svuint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_u64 (p0, z1), ++ z0_res = svldff1_gather_u64 (p0, z1)) ++ ++/* ++** ldff1_gather_x0_u64_offset: ++** ldff1d z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_x0_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_offset_u64 (p0, z0, x0), ++ z0_res = svldff1_gather_offset_u64 (p0, z0, x0)) ++ ++/* ++** ldff1_gather_m8_u64_offset: ++** mov (x[0-9]+), #?-8 ++** ldff1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_m8_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_offset_u64 (p0, z0, -8), ++ z0_res = svldff1_gather_offset_u64 (p0, z0, -8)) ++ ++/* ++** ldff1_gather_0_u64_offset: ++** ldff1d z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_0_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_offset_u64 (p0, z0, 0), ++ z0_res = svldff1_gather_offset_u64 (p0, z0, 0)) ++ ++/* ++** ldff1_gather_9_u64_offset: ++** mov (x[0-9]+), #?9 ++** ldff1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_9_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_offset_u64 (p0, z0, 9), ++ z0_res = svldff1_gather_offset_u64 (p0, z0, 9)) ++ ++/* ++** ldff1_gather_10_u64_offset: ++** mov (x[0-9]+), #?10 ++** ldff1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_10_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_offset_u64 (p0, z0, 10), ++ z0_res = svldff1_gather_offset_u64 (p0, z0, 10)) ++ ++/* ++** ldff1_gather_11_u64_offset: ++** mov (x[0-9]+), #?11 ++** ldff1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_11_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_offset_u64 (p0, z0, 11), ++ z0_res = svldff1_gather_offset_u64 (p0, z0, 11)) ++ ++/* ++** ldff1_gather_12_u64_offset: ++** mov (x[0-9]+), #?12 ++** ldff1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_12_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_offset_u64 (p0, z0, 12), ++ z0_res = svldff1_gather_offset_u64 (p0, z0, 12)) ++ ++/* ++** ldff1_gather_13_u64_offset: ++** mov (x[0-9]+), #?13 ++** ldff1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_13_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_offset_u64 (p0, z0, 13), ++ z0_res = svldff1_gather_offset_u64 (p0, z0, 13)) ++ ++/* ++** ldff1_gather_14_u64_offset: ++** mov (x[0-9]+), #?14 ++** ldff1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_14_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_offset_u64 (p0, z0, 14), ++ z0_res = svldff1_gather_offset_u64 (p0, z0, 14)) ++ ++/* ++** ldff1_gather_15_u64_offset: ++** mov (x[0-9]+), #?15 ++** ldff1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_15_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_offset_u64 (p0, z0, 15), ++ z0_res = svldff1_gather_offset_u64 (p0, z0, 15)) ++ ++/* ++** 
ldff1_gather_16_u64_offset: ++** ldff1d z0\.d, p0/z, \[z0\.d, #16\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_16_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_offset_u64 (p0, z0, 16), ++ z0_res = svldff1_gather_offset_u64 (p0, z0, 16)) ++ ++/* ++** ldff1_gather_248_u64_offset: ++** ldff1d z0\.d, p0/z, \[z0\.d, #248\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_248_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_offset_u64 (p0, z0, 248), ++ z0_res = svldff1_gather_offset_u64 (p0, z0, 248)) ++ ++/* ++** ldff1_gather_256_u64_offset: ++** mov (x[0-9]+), #?256 ++** ldff1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_256_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_offset_u64 (p0, z0, 256), ++ z0_res = svldff1_gather_offset_u64 (p0, z0, 256)) ++ ++/* ++** ldff1_gather_x0_u64_index: ++** lsl (x[0-9]+), x0, #?3 ++** ldff1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_x0_u64_index, svuint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_index_u64 (p0, z0, x0), ++ z0_res = svldff1_gather_index_u64 (p0, z0, x0)) ++ ++/* ++** ldff1_gather_m1_u64_index: ++** mov (x[0-9]+), #?-8 ++** ldff1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_m1_u64_index, svuint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_index_u64 (p0, z0, -1), ++ z0_res = svldff1_gather_index_u64 (p0, z0, -1)) ++ ++/* ++** ldff1_gather_0_u64_index: ++** ldff1d z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_0_u64_index, svuint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_index_u64 (p0, z0, 0), ++ z0_res = svldff1_gather_index_u64 (p0, z0, 0)) ++ ++/* ++** ldff1_gather_5_u64_index: ++** ldff1d z0\.d, p0/z, \[z0\.d, #40\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_5_u64_index, svuint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_index_u64 (p0, z0, 5), ++ z0_res = svldff1_gather_index_u64 (p0, z0, 5)) ++ ++/* ++** ldff1_gather_31_u64_index: ++** ldff1d z0\.d, p0/z, \[z0\.d, #248\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_31_u64_index, svuint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_index_u64 (p0, z0, 31), ++ z0_res = svldff1_gather_index_u64 (p0, z0, 31)) ++ ++/* ++** ldff1_gather_32_u64_index: ++** mov (x[0-9]+), #?256 ++** ldff1d z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1_gather_32_u64_index, svuint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64base_index_u64 (p0, z0, 32), ++ z0_res = svldff1_gather_index_u64 (p0, z0, 32)) ++ ++/* ++** ldff1_gather_x0_u64_s64offset: ++** ldff1d z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_x0_u64_s64offset, svuint64_t, uint64_t, svint64_t, ++ z0_res = svldff1_gather_s64offset_u64 (p0, x0, z0), ++ z0_res = svldff1_gather_offset (p0, x0, z0)) ++ ++/* ++** ldff1_gather_tied1_u64_s64offset: ++** ldff1d z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_tied1_u64_s64offset, svuint64_t, uint64_t, svint64_t, ++ z0_res = svldff1_gather_s64offset_u64 (p0, x0, z0), ++ z0_res = svldff1_gather_offset (p0, x0, z0)) ++ ++/* ++** ldff1_gather_untied_u64_s64offset: ++** ldff1d z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_untied_u64_s64offset, svuint64_t, uint64_t, svint64_t, ++ z0_res = svldff1_gather_s64offset_u64 (p0, x0, z1), ++ z0_res = svldff1_gather_offset (p0, x0, z1)) ++ ++/* ++** ldff1_gather_ext_u64_s64offset: ++** ldff1d z0\.d, p0/z, \[x0, 
z1\.d, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_ext_u64_s64offset, svuint64_t, uint64_t, svint64_t, ++ z0_res = svldff1_gather_s64offset_u64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svldff1_gather_offset (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ldff1_gather_x0_u64_u64offset: ++** ldff1d z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_x0_u64_u64offset, svuint64_t, uint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64offset_u64 (p0, x0, z0), ++ z0_res = svldff1_gather_offset (p0, x0, z0)) ++ ++/* ++** ldff1_gather_tied1_u64_u64offset: ++** ldff1d z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_tied1_u64_u64offset, svuint64_t, uint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64offset_u64 (p0, x0, z0), ++ z0_res = svldff1_gather_offset (p0, x0, z0)) ++ ++/* ++** ldff1_gather_untied_u64_u64offset: ++** ldff1d z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_untied_u64_u64offset, svuint64_t, uint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64offset_u64 (p0, x0, z1), ++ z0_res = svldff1_gather_offset (p0, x0, z1)) ++ ++/* ++** ldff1_gather_ext_u64_u64offset: ++** ldff1d z0\.d, p0/z, \[x0, z1\.d, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_ext_u64_u64offset, svuint64_t, uint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64offset_u64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svldff1_gather_offset (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ldff1_gather_x0_u64_s64index: ++** ldff1d z0\.d, p0/z, \[x0, z0\.d, lsl 3\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_x0_u64_s64index, svuint64_t, uint64_t, svint64_t, ++ z0_res = svldff1_gather_s64index_u64 (p0, x0, z0), ++ z0_res = svldff1_gather_index (p0, x0, z0)) ++ ++/* ++** ldff1_gather_tied1_u64_s64index: ++** ldff1d z0\.d, p0/z, \[x0, z0\.d, lsl 3\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_tied1_u64_s64index, svuint64_t, uint64_t, svint64_t, ++ z0_res = svldff1_gather_s64index_u64 (p0, x0, z0), ++ z0_res = svldff1_gather_index (p0, x0, z0)) ++ ++/* ++** ldff1_gather_untied_u64_s64index: ++** ldff1d z0\.d, p0/z, \[x0, z1\.d, lsl 3\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_untied_u64_s64index, svuint64_t, uint64_t, svint64_t, ++ z0_res = svldff1_gather_s64index_u64 (p0, x0, z1), ++ z0_res = svldff1_gather_index (p0, x0, z1)) ++ ++/* ++** ldff1_gather_ext_u64_s64index: ++** ldff1d z0\.d, p0/z, \[x0, z1\.d, sxtw 3\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_ext_u64_s64index, svuint64_t, uint64_t, svint64_t, ++ z0_res = svldff1_gather_s64index_u64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svldff1_gather_index (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ldff1_gather_x0_u64_u64index: ++** ldff1d z0\.d, p0/z, \[x0, z0\.d, lsl 3\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_x0_u64_u64index, svuint64_t, uint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64index_u64 (p0, x0, z0), ++ z0_res = svldff1_gather_index (p0, x0, z0)) ++ ++/* ++** ldff1_gather_tied1_u64_u64index: ++** ldff1d z0\.d, p0/z, \[x0, z0\.d, lsl 3\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_tied1_u64_u64index, svuint64_t, uint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64index_u64 (p0, x0, z0), ++ z0_res = svldff1_gather_index (p0, x0, z0)) ++ ++/* ++** ldff1_gather_untied_u64_u64index: ++** ldff1d z0\.d, p0/z, \[x0, z1\.d, lsl 3\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_untied_u64_u64index, svuint64_t, uint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64index_u64 (p0, x0, z1), ++ z0_res = svldff1_gather_index (p0, x0, 
z1)) ++ ++/* ++** ldff1_gather_ext_u64_u64index: ++** ldff1d z0\.d, p0/z, \[x0, z1\.d, uxtw 3\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1_gather_ext_u64_u64index, svuint64_t, uint64_t, svuint64_t, ++ z0_res = svldff1_gather_u64index_u64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svldff1_gather_index (p0, x0, svextw_x (p0, z1))) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_s16.c +new file mode 100644 +index 000000000..1d5fde0e6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_s16.c +@@ -0,0 +1,86 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1_s16_base: ++** ldff1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_s16_base, svint16_t, int16_t, ++ z0 = svldff1_s16 (p0, x0), ++ z0 = svldff1 (p0, x0)) ++ ++/* ++** ldff1_s16_index: ++** ldff1h z0\.h, p0/z, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_LOAD (ldff1_s16_index, svint16_t, int16_t, ++ z0 = svldff1_s16 (p0, x0 + x1), ++ z0 = svldff1 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1_s16_1: ++** incb x0 ++** ldff1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_s16_1, svint16_t, int16_t, ++ z0 = svldff1_s16 (p0, x0 + svcnth ()), ++ z0 = svldff1 (p0, x0 + svcnth ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1_s16_m1: ++** decb x0 ++** ldff1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_s16_m1, svint16_t, int16_t, ++ z0 = svldff1_s16 (p0, x0 - svcnth ()), ++ z0 = svldff1 (p0, x0 - svcnth ())) ++ ++/* ++** ldff1_vnum_s16_0: ++** ldff1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_vnum_s16_0, svint16_t, int16_t, ++ z0 = svldff1_vnum_s16 (p0, x0, 0), ++ z0 = svldff1_vnum (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1_vnum_s16_1: ++** incb x0 ++** ldff1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_vnum_s16_1, svint16_t, int16_t, ++ z0 = svldff1_vnum_s16 (p0, x0, 1), ++ z0 = svldff1_vnum (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1_vnum_s16_m1: ++** decb x0 ++** ldff1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_vnum_s16_m1, svint16_t, int16_t, ++ z0 = svldff1_vnum_s16 (p0, x0, -1), ++ z0 = svldff1_vnum (p0, x0, -1)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ldff1_vnum_s16_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ldff1h z0\.h, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldff1_vnum_s16_x1, svint16_t, int16_t, ++ z0 = svldff1_vnum_s16 (p0, x0, x1), ++ z0 = svldff1_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_s32.c +new file mode 100644 +index 000000000..97a36e884 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_s32.c +@@ -0,0 +1,86 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1_s32_base: ++** ldff1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_s32_base, svint32_t, int32_t, ++ z0 = svldff1_s32 (p0, x0), ++ z0 = svldff1 (p0, x0)) ++ ++/* ++** ldff1_s32_index: ++** ldff1w z0\.s, p0/z, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_LOAD (ldff1_s32_index, svint32_t, int32_t, ++ z0 = svldff1_s32 (p0, x0 + x1), ++ z0 = svldff1 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1_s32_1: ++** incb x0 ++** ldff1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_s32_1, svint32_t, int32_t, ++ z0 = svldff1_s32 (p0, x0 + svcntw ()), ++ z0 = svldff1 (p0, x0 + svcntw ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1_s32_m1: ++** decb x0 ++** ldff1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_s32_m1, svint32_t, int32_t, ++ z0 = svldff1_s32 (p0, x0 - svcntw ()), ++ z0 = svldff1 (p0, x0 - svcntw ())) ++ ++/* ++** ldff1_vnum_s32_0: ++** ldff1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_vnum_s32_0, svint32_t, int32_t, ++ z0 = svldff1_vnum_s32 (p0, x0, 0), ++ z0 = svldff1_vnum (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1_vnum_s32_1: ++** incb x0 ++** ldff1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_vnum_s32_1, svint32_t, int32_t, ++ z0 = svldff1_vnum_s32 (p0, x0, 1), ++ z0 = svldff1_vnum (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1_vnum_s32_m1: ++** decb x0 ++** ldff1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_vnum_s32_m1, svint32_t, int32_t, ++ z0 = svldff1_vnum_s32 (p0, x0, -1), ++ z0 = svldff1_vnum (p0, x0, -1)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ldff1_vnum_s32_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ldff1w z0\.s, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldff1_vnum_s32_x1, svint32_t, int32_t, ++ z0 = svldff1_vnum_s32 (p0, x0, x1), ++ z0 = svldff1_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_s64.c +new file mode 100644 +index 000000000..c018a4c1c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_s64.c +@@ -0,0 +1,86 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1_s64_base: ++** ldff1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_s64_base, svint64_t, int64_t, ++ z0 = svldff1_s64 (p0, x0), ++ z0 = svldff1 (p0, x0)) ++ ++/* ++** ldff1_s64_index: ++** ldff1d z0\.d, p0/z, \[x0, x1, lsl 3\] ++** ret ++*/ ++TEST_LOAD (ldff1_s64_index, svint64_t, int64_t, ++ z0 = svldff1_s64 (p0, x0 + x1), ++ z0 = svldff1 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1_s64_1: ++** incb x0 ++** ldff1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_s64_1, svint64_t, int64_t, ++ z0 = svldff1_s64 (p0, x0 + svcntd ()), ++ z0 = svldff1 (p0, x0 + svcntd ())) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ldff1_s64_m1: ++** decb x0 ++** ldff1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_s64_m1, svint64_t, int64_t, ++ z0 = svldff1_s64 (p0, x0 - svcntd ()), ++ z0 = svldff1 (p0, x0 - svcntd ())) ++ ++/* ++** ldff1_vnum_s64_0: ++** ldff1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_vnum_s64_0, svint64_t, int64_t, ++ z0 = svldff1_vnum_s64 (p0, x0, 0), ++ z0 = svldff1_vnum (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1_vnum_s64_1: ++** incb x0 ++** ldff1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_vnum_s64_1, svint64_t, int64_t, ++ z0 = svldff1_vnum_s64 (p0, x0, 1), ++ z0 = svldff1_vnum (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1_vnum_s64_m1: ++** decb x0 ++** ldff1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_vnum_s64_m1, svint64_t, int64_t, ++ z0 = svldff1_vnum_s64 (p0, x0, -1), ++ z0 = svldff1_vnum (p0, x0, -1)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ldff1_vnum_s64_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ldff1d z0\.d, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldff1_vnum_s64_x1, svint64_t, int64_t, ++ z0 = svldff1_vnum_s64 (p0, x0, x1), ++ z0 = svldff1_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_s8.c +new file mode 100644 +index 000000000..cf620d1f4 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_s8.c +@@ -0,0 +1,90 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1_s8_base: ++** ldff1b z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_s8_base, svint8_t, int8_t, ++ z0 = svldff1_s8 (p0, x0), ++ z0 = svldff1 (p0, x0)) ++ ++/* ++** ldff1_s8_index: ++** ldff1b z0\.b, p0/z, \[x0, x1\] ++** ret ++*/ ++TEST_LOAD (ldff1_s8_index, svint8_t, int8_t, ++ z0 = svldff1_s8 (p0, x0 + x1), ++ z0 = svldff1 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1_s8_1: ++** incb x0 ++** ldff1b z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_s8_1, svint8_t, int8_t, ++ z0 = svldff1_s8 (p0, x0 + svcntb ()), ++ z0 = svldff1 (p0, x0 + svcntb ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1_s8_m1: ++** decb x0 ++** ldff1b z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_s8_m1, svint8_t, int8_t, ++ z0 = svldff1_s8 (p0, x0 - svcntb ()), ++ z0 = svldff1 (p0, x0 - svcntb ())) ++ ++/* ++** ldff1_vnum_s8_0: ++** ldff1b z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_vnum_s8_0, svint8_t, int8_t, ++ z0 = svldff1_vnum_s8 (p0, x0, 0), ++ z0 = svldff1_vnum (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1_vnum_s8_1: ++** incb x0 ++** ldff1b z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_vnum_s8_1, svint8_t, int8_t, ++ z0 = svldff1_vnum_s8 (p0, x0, 1), ++ z0 = svldff1_vnum (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ldff1_vnum_s8_m1: ++** decb x0 ++** ldff1b z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_vnum_s8_m1, svint8_t, int8_t, ++ z0 = svldff1_vnum_s8 (p0, x0, -1), ++ z0 = svldff1_vnum (p0, x0, -1)) ++ ++/* ++** ldff1_vnum_s8_x1: ++** cntb (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldff1b z0\.b, p0/z, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** ldff1b z0\.b, p0/z, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_LOAD (ldff1_vnum_s8_x1, svint8_t, int8_t, ++ z0 = svldff1_vnum_s8 (p0, x0, x1), ++ z0 = svldff1_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_u16.c +new file mode 100644 +index 000000000..1fa819296 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_u16.c +@@ -0,0 +1,86 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1_u16_base: ++** ldff1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_u16_base, svuint16_t, uint16_t, ++ z0 = svldff1_u16 (p0, x0), ++ z0 = svldff1 (p0, x0)) ++ ++/* ++** ldff1_u16_index: ++** ldff1h z0\.h, p0/z, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_LOAD (ldff1_u16_index, svuint16_t, uint16_t, ++ z0 = svldff1_u16 (p0, x0 + x1), ++ z0 = svldff1 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1_u16_1: ++** incb x0 ++** ldff1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_u16_1, svuint16_t, uint16_t, ++ z0 = svldff1_u16 (p0, x0 + svcnth ()), ++ z0 = svldff1 (p0, x0 + svcnth ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1_u16_m1: ++** decb x0 ++** ldff1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_u16_m1, svuint16_t, uint16_t, ++ z0 = svldff1_u16 (p0, x0 - svcnth ()), ++ z0 = svldff1 (p0, x0 - svcnth ())) ++ ++/* ++** ldff1_vnum_u16_0: ++** ldff1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_vnum_u16_0, svuint16_t, uint16_t, ++ z0 = svldff1_vnum_u16 (p0, x0, 0), ++ z0 = svldff1_vnum (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1_vnum_u16_1: ++** incb x0 ++** ldff1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_vnum_u16_1, svuint16_t, uint16_t, ++ z0 = svldff1_vnum_u16 (p0, x0, 1), ++ z0 = svldff1_vnum (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1_vnum_u16_m1: ++** decb x0 ++** ldff1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_vnum_u16_m1, svuint16_t, uint16_t, ++ z0 = svldff1_vnum_u16 (p0, x0, -1), ++ z0 = svldff1_vnum (p0, x0, -1)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ldff1_vnum_u16_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ldff1h z0\.h, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldff1_vnum_u16_x1, svuint16_t, uint16_t, ++ z0 = svldff1_vnum_u16 (p0, x0, x1), ++ z0 = svldff1_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_u32.c +new file mode 100644 +index 000000000..5224ec40a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_u32.c +@@ -0,0 +1,86 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1_u32_base: ++** ldff1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_u32_base, svuint32_t, uint32_t, ++ z0 = svldff1_u32 (p0, x0), ++ z0 = svldff1 (p0, x0)) ++ ++/* ++** ldff1_u32_index: ++** ldff1w z0\.s, p0/z, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_LOAD (ldff1_u32_index, svuint32_t, uint32_t, ++ z0 = svldff1_u32 (p0, x0 + x1), ++ z0 = svldff1 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1_u32_1: ++** incb x0 ++** ldff1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_u32_1, svuint32_t, uint32_t, ++ z0 = svldff1_u32 (p0, x0 + svcntw ()), ++ z0 = svldff1 (p0, x0 + svcntw ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1_u32_m1: ++** decb x0 ++** ldff1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_u32_m1, svuint32_t, uint32_t, ++ z0 = svldff1_u32 (p0, x0 - svcntw ()), ++ z0 = svldff1 (p0, x0 - svcntw ())) ++ ++/* ++** ldff1_vnum_u32_0: ++** ldff1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_vnum_u32_0, svuint32_t, uint32_t, ++ z0 = svldff1_vnum_u32 (p0, x0, 0), ++ z0 = svldff1_vnum (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1_vnum_u32_1: ++** incb x0 ++** ldff1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_vnum_u32_1, svuint32_t, uint32_t, ++ z0 = svldff1_vnum_u32 (p0, x0, 1), ++ z0 = svldff1_vnum (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1_vnum_u32_m1: ++** decb x0 ++** ldff1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_vnum_u32_m1, svuint32_t, uint32_t, ++ z0 = svldff1_vnum_u32 (p0, x0, -1), ++ z0 = svldff1_vnum (p0, x0, -1)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ldff1_vnum_u32_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ldff1w z0\.s, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldff1_vnum_u32_x1, svuint32_t, uint32_t, ++ z0 = svldff1_vnum_u32 (p0, x0, x1), ++ z0 = svldff1_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_u64.c +new file mode 100644 +index 000000000..18e87f2b8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_u64.c +@@ -0,0 +1,86 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1_u64_base: ++** ldff1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_u64_base, svuint64_t, uint64_t, ++ z0 = svldff1_u64 (p0, x0), ++ z0 = svldff1 (p0, x0)) ++ ++/* ++** ldff1_u64_index: ++** ldff1d z0\.d, p0/z, \[x0, x1, lsl 3\] ++** ret ++*/ ++TEST_LOAD (ldff1_u64_index, svuint64_t, uint64_t, ++ z0 = svldff1_u64 (p0, x0 + x1), ++ z0 = svldff1 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1_u64_1: ++** incb x0 ++** ldff1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_u64_1, svuint64_t, uint64_t, ++ z0 = svldff1_u64 (p0, x0 + svcntd ()), ++ z0 = svldff1 (p0, x0 + svcntd ())) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ldff1_u64_m1: ++** decb x0 ++** ldff1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_u64_m1, svuint64_t, uint64_t, ++ z0 = svldff1_u64 (p0, x0 - svcntd ()), ++ z0 = svldff1 (p0, x0 - svcntd ())) ++ ++/* ++** ldff1_vnum_u64_0: ++** ldff1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_vnum_u64_0, svuint64_t, uint64_t, ++ z0 = svldff1_vnum_u64 (p0, x0, 0), ++ z0 = svldff1_vnum (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1_vnum_u64_1: ++** incb x0 ++** ldff1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_vnum_u64_1, svuint64_t, uint64_t, ++ z0 = svldff1_vnum_u64 (p0, x0, 1), ++ z0 = svldff1_vnum (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1_vnum_u64_m1: ++** decb x0 ++** ldff1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_vnum_u64_m1, svuint64_t, uint64_t, ++ z0 = svldff1_vnum_u64 (p0, x0, -1), ++ z0 = svldff1_vnum (p0, x0, -1)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ldff1_vnum_u64_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ldff1d z0\.d, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldff1_vnum_u64_x1, svuint64_t, uint64_t, ++ z0 = svldff1_vnum_u64 (p0, x0, x1), ++ z0 = svldff1_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_u8.c +new file mode 100644 +index 000000000..83883fca4 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_u8.c +@@ -0,0 +1,90 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1_u8_base: ++** ldff1b z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_u8_base, svuint8_t, uint8_t, ++ z0 = svldff1_u8 (p0, x0), ++ z0 = svldff1 (p0, x0)) ++ ++/* ++** ldff1_u8_index: ++** ldff1b z0\.b, p0/z, \[x0, x1\] ++** ret ++*/ ++TEST_LOAD (ldff1_u8_index, svuint8_t, uint8_t, ++ z0 = svldff1_u8 (p0, x0 + x1), ++ z0 = svldff1 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1_u8_1: ++** incb x0 ++** ldff1b z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_u8_1, svuint8_t, uint8_t, ++ z0 = svldff1_u8 (p0, x0 + svcntb ()), ++ z0 = svldff1 (p0, x0 + svcntb ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1_u8_m1: ++** decb x0 ++** ldff1b z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_u8_m1, svuint8_t, uint8_t, ++ z0 = svldff1_u8 (p0, x0 - svcntb ()), ++ z0 = svldff1 (p0, x0 - svcntb ())) ++ ++/* ++** ldff1_vnum_u8_0: ++** ldff1b z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_vnum_u8_0, svuint8_t, uint8_t, ++ z0 = svldff1_vnum_u8 (p0, x0, 0), ++ z0 = svldff1_vnum (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1_vnum_u8_1: ++** incb x0 ++** ldff1b z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_vnum_u8_1, svuint8_t, uint8_t, ++ z0 = svldff1_vnum_u8 (p0, x0, 1), ++ z0 = svldff1_vnum (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ldff1_vnum_u8_m1: ++** decb x0 ++** ldff1b z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1_vnum_u8_m1, svuint8_t, uint8_t, ++ z0 = svldff1_vnum_u8 (p0, x0, -1), ++ z0 = svldff1_vnum (p0, x0, -1)) ++ ++/* ++** ldff1_vnum_u8_x1: ++** cntb (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldff1b z0\.b, p0/z, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** ldff1b z0\.b, p0/z, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_LOAD (ldff1_vnum_u8_x1, svuint8_t, uint8_t, ++ z0 = svldff1_vnum_u8 (p0, x0, x1), ++ z0 = svldff1_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_gather_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_gather_s32.c +new file mode 100644 +index 000000000..c2a676807 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_gather_s32.c +@@ -0,0 +1,131 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1sb_gather_s32_tied1: ++** ldff1sb z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sb_gather_s32_tied1, svint32_t, svuint32_t, ++ z0_res = svldff1sb_gather_u32base_s32 (p0, z0), ++ z0_res = svldff1sb_gather_s32 (p0, z0)) ++ ++/* ++** ldff1sb_gather_s32_untied: ++** ldff1sb z0\.s, p0/z, \[z1\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sb_gather_s32_untied, svint32_t, svuint32_t, ++ z0_res = svldff1sb_gather_u32base_s32 (p0, z1), ++ z0_res = svldff1sb_gather_s32 (p0, z1)) ++ ++/* ++** ldff1sb_gather_x0_s32_offset: ++** ldff1sb z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sb_gather_x0_s32_offset, svint32_t, svuint32_t, ++ z0_res = svldff1sb_gather_u32base_offset_s32 (p0, z0, x0), ++ z0_res = svldff1sb_gather_offset_s32 (p0, z0, x0)) ++ ++/* ++** ldff1sb_gather_m1_s32_offset: ++** mov (x[0-9]+), #?-1 ++** ldff1sb z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sb_gather_m1_s32_offset, svint32_t, svuint32_t, ++ z0_res = svldff1sb_gather_u32base_offset_s32 (p0, z0, -1), ++ z0_res = svldff1sb_gather_offset_s32 (p0, z0, -1)) ++ ++/* ++** ldff1sb_gather_0_s32_offset: ++** ldff1sb z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sb_gather_0_s32_offset, svint32_t, svuint32_t, ++ z0_res = svldff1sb_gather_u32base_offset_s32 (p0, z0, 0), ++ z0_res = svldff1sb_gather_offset_s32 (p0, z0, 0)) ++ ++/* ++** ldff1sb_gather_5_s32_offset: ++** ldff1sb z0\.s, p0/z, \[z0\.s, #5\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sb_gather_5_s32_offset, svint32_t, svuint32_t, ++ z0_res = svldff1sb_gather_u32base_offset_s32 (p0, z0, 5), ++ z0_res = svldff1sb_gather_offset_s32 (p0, z0, 5)) ++ ++/* ++** ldff1sb_gather_31_s32_offset: ++** ldff1sb z0\.s, p0/z, \[z0\.s, #31\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sb_gather_31_s32_offset, svint32_t, svuint32_t, ++ z0_res = svldff1sb_gather_u32base_offset_s32 (p0, z0, 31), ++ z0_res = svldff1sb_gather_offset_s32 (p0, z0, 31)) ++ ++/* ++** ldff1sb_gather_32_s32_offset: ++** mov (x[0-9]+), #?32 ++** ldff1sb z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sb_gather_32_s32_offset, svint32_t, svuint32_t, ++ z0_res = svldff1sb_gather_u32base_offset_s32 (p0, z0, 32), ++ z0_res = svldff1sb_gather_offset_s32 (p0, z0, 32)) ++ ++/* ++** ldff1sb_gather_x0_s32_s32offset: ++** ldff1sb z0\.s, p0/z, \[x0, z0\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sb_gather_x0_s32_s32offset, svint32_t, int8_t, svint32_t, ++ z0_res = svldff1sb_gather_s32offset_s32 (p0, x0, 
z0), ++ z0_res = svldff1sb_gather_offset_s32 (p0, x0, z0)) ++ ++/* ++** ldff1sb_gather_tied1_s32_s32offset: ++** ldff1sb z0\.s, p0/z, \[x0, z0\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sb_gather_tied1_s32_s32offset, svint32_t, int8_t, svint32_t, ++ z0_res = svldff1sb_gather_s32offset_s32 (p0, x0, z0), ++ z0_res = svldff1sb_gather_offset_s32 (p0, x0, z0)) ++ ++/* ++** ldff1sb_gather_untied_s32_s32offset: ++** ldff1sb z0\.s, p0/z, \[x0, z1\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sb_gather_untied_s32_s32offset, svint32_t, int8_t, svint32_t, ++ z0_res = svldff1sb_gather_s32offset_s32 (p0, x0, z1), ++ z0_res = svldff1sb_gather_offset_s32 (p0, x0, z1)) ++ ++/* ++** ldff1sb_gather_x0_s32_u32offset: ++** ldff1sb z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sb_gather_x0_s32_u32offset, svint32_t, int8_t, svuint32_t, ++ z0_res = svldff1sb_gather_u32offset_s32 (p0, x0, z0), ++ z0_res = svldff1sb_gather_offset_s32 (p0, x0, z0)) ++ ++/* ++** ldff1sb_gather_tied1_s32_u32offset: ++** ldff1sb z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sb_gather_tied1_s32_u32offset, svint32_t, int8_t, svuint32_t, ++ z0_res = svldff1sb_gather_u32offset_s32 (p0, x0, z0), ++ z0_res = svldff1sb_gather_offset_s32 (p0, x0, z0)) ++ ++/* ++** ldff1sb_gather_untied_s32_u32offset: ++** ldff1sb z0\.s, p0/z, \[x0, z1\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sb_gather_untied_s32_u32offset, svint32_t, int8_t, svuint32_t, ++ z0_res = svldff1sb_gather_u32offset_s32 (p0, x0, z1), ++ z0_res = svldff1sb_gather_offset_s32 (p0, x0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_gather_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_gather_s64.c +new file mode 100644 +index 000000000..2f2a04d24 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_gather_s64.c +@@ -0,0 +1,149 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1sb_gather_s64_tied1: ++** ldff1sb z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sb_gather_s64_tied1, svint64_t, svuint64_t, ++ z0_res = svldff1sb_gather_u64base_s64 (p0, z0), ++ z0_res = svldff1sb_gather_s64 (p0, z0)) ++ ++/* ++** ldff1sb_gather_s64_untied: ++** ldff1sb z0\.d, p0/z, \[z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sb_gather_s64_untied, svint64_t, svuint64_t, ++ z0_res = svldff1sb_gather_u64base_s64 (p0, z1), ++ z0_res = svldff1sb_gather_s64 (p0, z1)) ++ ++/* ++** ldff1sb_gather_x0_s64_offset: ++** ldff1sb z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sb_gather_x0_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1sb_gather_u64base_offset_s64 (p0, z0, x0), ++ z0_res = svldff1sb_gather_offset_s64 (p0, z0, x0)) ++ ++/* ++** ldff1sb_gather_m1_s64_offset: ++** mov (x[0-9]+), #?-1 ++** ldff1sb z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sb_gather_m1_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1sb_gather_u64base_offset_s64 (p0, z0, -1), ++ z0_res = svldff1sb_gather_offset_s64 (p0, z0, -1)) ++ ++/* ++** ldff1sb_gather_0_s64_offset: ++** ldff1sb z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sb_gather_0_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1sb_gather_u64base_offset_s64 (p0, z0, 0), ++ z0_res = svldff1sb_gather_offset_s64 (p0, z0, 0)) ++ ++/* ++** ldff1sb_gather_5_s64_offset: ++** ldff1sb z0\.d, p0/z, \[z0\.d, #5\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sb_gather_5_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1sb_gather_u64base_offset_s64 (p0, z0, 5), ++ z0_res = svldff1sb_gather_offset_s64 (p0, z0, 5)) ++ ++/* ++** ldff1sb_gather_31_s64_offset: ++** ldff1sb z0\.d, p0/z, \[z0\.d, #31\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sb_gather_31_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1sb_gather_u64base_offset_s64 (p0, z0, 31), ++ z0_res = svldff1sb_gather_offset_s64 (p0, z0, 31)) ++ ++/* ++** ldff1sb_gather_32_s64_offset: ++** mov (x[0-9]+), #?32 ++** ldff1sb z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sb_gather_32_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1sb_gather_u64base_offset_s64 (p0, z0, 32), ++ z0_res = svldff1sb_gather_offset_s64 (p0, z0, 32)) ++ ++/* ++** ldff1sb_gather_x0_s64_s64offset: ++** ldff1sb z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sb_gather_x0_s64_s64offset, svint64_t, int8_t, svint64_t, ++ z0_res = svldff1sb_gather_s64offset_s64 (p0, x0, z0), ++ z0_res = svldff1sb_gather_offset_s64 (p0, x0, z0)) ++ ++/* ++** ldff1sb_gather_tied1_s64_s64offset: ++** ldff1sb z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sb_gather_tied1_s64_s64offset, svint64_t, int8_t, svint64_t, ++ z0_res = svldff1sb_gather_s64offset_s64 (p0, x0, z0), ++ z0_res = svldff1sb_gather_offset_s64 (p0, x0, z0)) ++ ++/* ++** ldff1sb_gather_untied_s64_s64offset: ++** ldff1sb z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sb_gather_untied_s64_s64offset, svint64_t, int8_t, svint64_t, ++ z0_res = svldff1sb_gather_s64offset_s64 (p0, x0, z1), ++ z0_res = svldff1sb_gather_offset_s64 (p0, x0, z1)) ++ ++/* ++** ldff1sb_gather_ext_s64_s64offset: ++** ldff1sb z0\.d, p0/z, \[x0, z1\.d, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sb_gather_ext_s64_s64offset, svint64_t, int8_t, svint64_t, ++ z0_res = svldff1sb_gather_s64offset_s64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svldff1sb_gather_offset_s64 (p0, 
x0, svextw_x (p0, z1))) ++ ++/* ++** ldff1sb_gather_x0_s64_u64offset: ++** ldff1sb z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sb_gather_x0_s64_u64offset, svint64_t, int8_t, svuint64_t, ++ z0_res = svldff1sb_gather_u64offset_s64 (p0, x0, z0), ++ z0_res = svldff1sb_gather_offset_s64 (p0, x0, z0)) ++ ++/* ++** ldff1sb_gather_tied1_s64_u64offset: ++** ldff1sb z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sb_gather_tied1_s64_u64offset, svint64_t, int8_t, svuint64_t, ++ z0_res = svldff1sb_gather_u64offset_s64 (p0, x0, z0), ++ z0_res = svldff1sb_gather_offset_s64 (p0, x0, z0)) ++ ++/* ++** ldff1sb_gather_untied_s64_u64offset: ++** ldff1sb z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sb_gather_untied_s64_u64offset, svint64_t, int8_t, svuint64_t, ++ z0_res = svldff1sb_gather_u64offset_s64 (p0, x0, z1), ++ z0_res = svldff1sb_gather_offset_s64 (p0, x0, z1)) ++ ++/* ++** ldff1sb_gather_ext_s64_u64offset: ++** ldff1sb z0\.d, p0/z, \[x0, z1\.d, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sb_gather_ext_s64_u64offset, svint64_t, int8_t, svuint64_t, ++ z0_res = svldff1sb_gather_u64offset_s64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svldff1sb_gather_offset_s64 (p0, x0, svextw_x (p0, z1))) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_gather_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_gather_u32.c +new file mode 100644 +index 000000000..e3e83a205 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_gather_u32.c +@@ -0,0 +1,131 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1sb_gather_u32_tied1: ++** ldff1sb z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sb_gather_u32_tied1, svuint32_t, svuint32_t, ++ z0_res = svldff1sb_gather_u32base_u32 (p0, z0), ++ z0_res = svldff1sb_gather_u32 (p0, z0)) ++ ++/* ++** ldff1sb_gather_u32_untied: ++** ldff1sb z0\.s, p0/z, \[z1\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sb_gather_u32_untied, svuint32_t, svuint32_t, ++ z0_res = svldff1sb_gather_u32base_u32 (p0, z1), ++ z0_res = svldff1sb_gather_u32 (p0, z1)) ++ ++/* ++** ldff1sb_gather_x0_u32_offset: ++** ldff1sb z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sb_gather_x0_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svldff1sb_gather_u32base_offset_u32 (p0, z0, x0), ++ z0_res = svldff1sb_gather_offset_u32 (p0, z0, x0)) ++ ++/* ++** ldff1sb_gather_m1_u32_offset: ++** mov (x[0-9]+), #?-1 ++** ldff1sb z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sb_gather_m1_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svldff1sb_gather_u32base_offset_u32 (p0, z0, -1), ++ z0_res = svldff1sb_gather_offset_u32 (p0, z0, -1)) ++ ++/* ++** ldff1sb_gather_0_u32_offset: ++** ldff1sb z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sb_gather_0_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svldff1sb_gather_u32base_offset_u32 (p0, z0, 0), ++ z0_res = svldff1sb_gather_offset_u32 (p0, z0, 0)) ++ ++/* ++** ldff1sb_gather_5_u32_offset: ++** ldff1sb z0\.s, p0/z, \[z0\.s, #5\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sb_gather_5_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svldff1sb_gather_u32base_offset_u32 (p0, z0, 5), ++ z0_res = svldff1sb_gather_offset_u32 (p0, z0, 5)) ++ ++/* ++** ldff1sb_gather_31_u32_offset: ++** ldff1sb z0\.s, p0/z, \[z0\.s, #31\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sb_gather_31_u32_offset, 
svuint32_t, svuint32_t, ++ z0_res = svldff1sb_gather_u32base_offset_u32 (p0, z0, 31), ++ z0_res = svldff1sb_gather_offset_u32 (p0, z0, 31)) ++ ++/* ++** ldff1sb_gather_32_u32_offset: ++** mov (x[0-9]+), #?32 ++** ldff1sb z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sb_gather_32_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svldff1sb_gather_u32base_offset_u32 (p0, z0, 32), ++ z0_res = svldff1sb_gather_offset_u32 (p0, z0, 32)) ++ ++/* ++** ldff1sb_gather_x0_u32_s32offset: ++** ldff1sb z0\.s, p0/z, \[x0, z0\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sb_gather_x0_u32_s32offset, svuint32_t, int8_t, svint32_t, ++ z0_res = svldff1sb_gather_s32offset_u32 (p0, x0, z0), ++ z0_res = svldff1sb_gather_offset_u32 (p0, x0, z0)) ++ ++/* ++** ldff1sb_gather_tied1_u32_s32offset: ++** ldff1sb z0\.s, p0/z, \[x0, z0\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sb_gather_tied1_u32_s32offset, svuint32_t, int8_t, svint32_t, ++ z0_res = svldff1sb_gather_s32offset_u32 (p0, x0, z0), ++ z0_res = svldff1sb_gather_offset_u32 (p0, x0, z0)) ++ ++/* ++** ldff1sb_gather_untied_u32_s32offset: ++** ldff1sb z0\.s, p0/z, \[x0, z1\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sb_gather_untied_u32_s32offset, svuint32_t, int8_t, svint32_t, ++ z0_res = svldff1sb_gather_s32offset_u32 (p0, x0, z1), ++ z0_res = svldff1sb_gather_offset_u32 (p0, x0, z1)) ++ ++/* ++** ldff1sb_gather_x0_u32_u32offset: ++** ldff1sb z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sb_gather_x0_u32_u32offset, svuint32_t, int8_t, svuint32_t, ++ z0_res = svldff1sb_gather_u32offset_u32 (p0, x0, z0), ++ z0_res = svldff1sb_gather_offset_u32 (p0, x0, z0)) ++ ++/* ++** ldff1sb_gather_tied1_u32_u32offset: ++** ldff1sb z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sb_gather_tied1_u32_u32offset, svuint32_t, int8_t, svuint32_t, ++ z0_res = svldff1sb_gather_u32offset_u32 (p0, x0, z0), ++ z0_res = svldff1sb_gather_offset_u32 (p0, x0, z0)) ++ ++/* ++** ldff1sb_gather_untied_u32_u32offset: ++** ldff1sb z0\.s, p0/z, \[x0, z1\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sb_gather_untied_u32_u32offset, svuint32_t, int8_t, svuint32_t, ++ z0_res = svldff1sb_gather_u32offset_u32 (p0, x0, z1), ++ z0_res = svldff1sb_gather_offset_u32 (p0, x0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_gather_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_gather_u64.c +new file mode 100644 +index 000000000..769f2c266 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_gather_u64.c +@@ -0,0 +1,149 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1sb_gather_u64_tied1: ++** ldff1sb z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sb_gather_u64_tied1, svuint64_t, svuint64_t, ++ z0_res = svldff1sb_gather_u64base_u64 (p0, z0), ++ z0_res = svldff1sb_gather_u64 (p0, z0)) ++ ++/* ++** ldff1sb_gather_u64_untied: ++** ldff1sb z0\.d, p0/z, \[z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sb_gather_u64_untied, svuint64_t, svuint64_t, ++ z0_res = svldff1sb_gather_u64base_u64 (p0, z1), ++ z0_res = svldff1sb_gather_u64 (p0, z1)) ++ ++/* ++** ldff1sb_gather_x0_u64_offset: ++** ldff1sb z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sb_gather_x0_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1sb_gather_u64base_offset_u64 (p0, z0, x0), ++ z0_res = svldff1sb_gather_offset_u64 (p0, z0, x0)) ++ ++/* ++** ldff1sb_gather_m1_u64_offset: ++** mov (x[0-9]+), #?-1 ++** ldff1sb z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sb_gather_m1_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1sb_gather_u64base_offset_u64 (p0, z0, -1), ++ z0_res = svldff1sb_gather_offset_u64 (p0, z0, -1)) ++ ++/* ++** ldff1sb_gather_0_u64_offset: ++** ldff1sb z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sb_gather_0_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1sb_gather_u64base_offset_u64 (p0, z0, 0), ++ z0_res = svldff1sb_gather_offset_u64 (p0, z0, 0)) ++ ++/* ++** ldff1sb_gather_5_u64_offset: ++** ldff1sb z0\.d, p0/z, \[z0\.d, #5\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sb_gather_5_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1sb_gather_u64base_offset_u64 (p0, z0, 5), ++ z0_res = svldff1sb_gather_offset_u64 (p0, z0, 5)) ++ ++/* ++** ldff1sb_gather_31_u64_offset: ++** ldff1sb z0\.d, p0/z, \[z0\.d, #31\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sb_gather_31_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1sb_gather_u64base_offset_u64 (p0, z0, 31), ++ z0_res = svldff1sb_gather_offset_u64 (p0, z0, 31)) ++ ++/* ++** ldff1sb_gather_32_u64_offset: ++** mov (x[0-9]+), #?32 ++** ldff1sb z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sb_gather_32_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1sb_gather_u64base_offset_u64 (p0, z0, 32), ++ z0_res = svldff1sb_gather_offset_u64 (p0, z0, 32)) ++ ++/* ++** ldff1sb_gather_x0_u64_s64offset: ++** ldff1sb z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sb_gather_x0_u64_s64offset, svuint64_t, int8_t, svint64_t, ++ z0_res = svldff1sb_gather_s64offset_u64 (p0, x0, z0), ++ z0_res = svldff1sb_gather_offset_u64 (p0, x0, z0)) ++ ++/* ++** ldff1sb_gather_tied1_u64_s64offset: ++** ldff1sb z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sb_gather_tied1_u64_s64offset, svuint64_t, int8_t, svint64_t, ++ z0_res = svldff1sb_gather_s64offset_u64 (p0, x0, z0), ++ z0_res = svldff1sb_gather_offset_u64 (p0, x0, z0)) ++ ++/* ++** ldff1sb_gather_untied_u64_s64offset: ++** ldff1sb z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sb_gather_untied_u64_s64offset, svuint64_t, int8_t, svint64_t, ++ z0_res = svldff1sb_gather_s64offset_u64 (p0, x0, z1), ++ z0_res = svldff1sb_gather_offset_u64 (p0, x0, z1)) ++ ++/* ++** ldff1sb_gather_ext_u64_s64offset: ++** ldff1sb z0\.d, p0/z, \[x0, z1\.d, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sb_gather_ext_u64_s64offset, svuint64_t, int8_t, svint64_t, ++ z0_res = svldff1sb_gather_s64offset_u64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = 
svldff1sb_gather_offset_u64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ldff1sb_gather_x0_u64_u64offset: ++** ldff1sb z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sb_gather_x0_u64_u64offset, svuint64_t, int8_t, svuint64_t, ++ z0_res = svldff1sb_gather_u64offset_u64 (p0, x0, z0), ++ z0_res = svldff1sb_gather_offset_u64 (p0, x0, z0)) ++ ++/* ++** ldff1sb_gather_tied1_u64_u64offset: ++** ldff1sb z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sb_gather_tied1_u64_u64offset, svuint64_t, int8_t, svuint64_t, ++ z0_res = svldff1sb_gather_u64offset_u64 (p0, x0, z0), ++ z0_res = svldff1sb_gather_offset_u64 (p0, x0, z0)) ++ ++/* ++** ldff1sb_gather_untied_u64_u64offset: ++** ldff1sb z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sb_gather_untied_u64_u64offset, svuint64_t, int8_t, svuint64_t, ++ z0_res = svldff1sb_gather_u64offset_u64 (p0, x0, z1), ++ z0_res = svldff1sb_gather_offset_u64 (p0, x0, z1)) ++ ++/* ++** ldff1sb_gather_ext_u64_u64offset: ++** ldff1sb z0\.d, p0/z, \[x0, z1\.d, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sb_gather_ext_u64_u64offset, svuint64_t, int8_t, svuint64_t, ++ z0_res = svldff1sb_gather_u64offset_u64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svldff1sb_gather_offset_u64 (p0, x0, svextw_x (p0, z1))) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_s16.c +new file mode 100644 +index 000000000..e0a748c6a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_s16.c +@@ -0,0 +1,90 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1sb_s16_base: ++** ldff1sb z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sb_s16_base, svint16_t, int8_t, ++ z0 = svldff1sb_s16 (p0, x0), ++ z0 = svldff1sb_s16 (p0, x0)) ++ ++/* ++** ldff1sb_s16_index: ++** ldff1sb z0\.h, p0/z, \[x0, x1\] ++** ret ++*/ ++TEST_LOAD (ldff1sb_s16_index, svint16_t, int8_t, ++ z0 = svldff1sb_s16 (p0, x0 + x1), ++ z0 = svldff1sb_s16 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1sb_s16_1: ++** inch x0 ++** ldff1sb z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sb_s16_1, svint16_t, int8_t, ++ z0 = svldff1sb_s16 (p0, x0 + svcnth ()), ++ z0 = svldff1sb_s16 (p0, x0 + svcnth ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1sb_s16_m1: ++** dech x0 ++** ldff1sb z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sb_s16_m1, svint16_t, int8_t, ++ z0 = svldff1sb_s16 (p0, x0 - svcnth ()), ++ z0 = svldff1sb_s16 (p0, x0 - svcnth ())) ++ ++/* ++** ldff1sb_vnum_s16_0: ++** ldff1sb z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sb_vnum_s16_0, svint16_t, int8_t, ++ z0 = svldff1sb_vnum_s16 (p0, x0, 0), ++ z0 = svldff1sb_vnum_s16 (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1sb_vnum_s16_1: ++** inch x0 ++** ldff1sb z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sb_vnum_s16_1, svint16_t, int8_t, ++ z0 = svldff1sb_vnum_s16 (p0, x0, 1), ++ z0 = svldff1sb_vnum_s16 (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ldff1sb_vnum_s16_m1: ++** dech x0 ++** ldff1sb z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sb_vnum_s16_m1, svint16_t, int8_t, ++ z0 = svldff1sb_vnum_s16 (p0, x0, -1), ++ z0 = svldff1sb_vnum_s16 (p0, x0, -1)) ++ ++/* ++** ldff1sb_vnum_s16_x1: ++** cnth (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldff1sb z0\.h, p0/z, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** ldff1sb z0\.h, p0/z, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_LOAD (ldff1sb_vnum_s16_x1, svint16_t, int8_t, ++ z0 = svldff1sb_vnum_s16 (p0, x0, x1), ++ z0 = svldff1sb_vnum_s16 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_s32.c +new file mode 100644 +index 000000000..86716da9b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_s32.c +@@ -0,0 +1,90 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1sb_s32_base: ++** ldff1sb z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sb_s32_base, svint32_t, int8_t, ++ z0 = svldff1sb_s32 (p0, x0), ++ z0 = svldff1sb_s32 (p0, x0)) ++ ++/* ++** ldff1sb_s32_index: ++** ldff1sb z0\.s, p0/z, \[x0, x1\] ++** ret ++*/ ++TEST_LOAD (ldff1sb_s32_index, svint32_t, int8_t, ++ z0 = svldff1sb_s32 (p0, x0 + x1), ++ z0 = svldff1sb_s32 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1sb_s32_1: ++** incw x0 ++** ldff1sb z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sb_s32_1, svint32_t, int8_t, ++ z0 = svldff1sb_s32 (p0, x0 + svcntw ()), ++ z0 = svldff1sb_s32 (p0, x0 + svcntw ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1sb_s32_m1: ++** decw x0 ++** ldff1sb z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sb_s32_m1, svint32_t, int8_t, ++ z0 = svldff1sb_s32 (p0, x0 - svcntw ()), ++ z0 = svldff1sb_s32 (p0, x0 - svcntw ())) ++ ++/* ++** ldff1sb_vnum_s32_0: ++** ldff1sb z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sb_vnum_s32_0, svint32_t, int8_t, ++ z0 = svldff1sb_vnum_s32 (p0, x0, 0), ++ z0 = svldff1sb_vnum_s32 (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1sb_vnum_s32_1: ++** incw x0 ++** ldff1sb z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sb_vnum_s32_1, svint32_t, int8_t, ++ z0 = svldff1sb_vnum_s32 (p0, x0, 1), ++ z0 = svldff1sb_vnum_s32 (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1sb_vnum_s32_m1: ++** decw x0 ++** ldff1sb z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sb_vnum_s32_m1, svint32_t, int8_t, ++ z0 = svldff1sb_vnum_s32 (p0, x0, -1), ++ z0 = svldff1sb_vnum_s32 (p0, x0, -1)) ++ ++/* ++** ldff1sb_vnum_s32_x1: ++** cntw (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldff1sb z0\.s, p0/z, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** ldff1sb z0\.s, p0/z, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_LOAD (ldff1sb_vnum_s32_x1, svint32_t, int8_t, ++ z0 = svldff1sb_vnum_s32 (p0, x0, x1), ++ z0 = svldff1sb_vnum_s32 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_s64.c +new file mode 100644 +index 000000000..e7a4aa6e9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_s64.c +@@ -0,0 +1,90 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1sb_s64_base: ++** ldff1sb z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sb_s64_base, svint64_t, int8_t, ++ z0 = svldff1sb_s64 (p0, x0), ++ z0 = svldff1sb_s64 (p0, x0)) ++ ++/* ++** ldff1sb_s64_index: ++** ldff1sb z0\.d, p0/z, \[x0, x1\] ++** ret ++*/ ++TEST_LOAD (ldff1sb_s64_index, svint64_t, int8_t, ++ z0 = svldff1sb_s64 (p0, x0 + x1), ++ z0 = svldff1sb_s64 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1sb_s64_1: ++** incd x0 ++** ldff1sb z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sb_s64_1, svint64_t, int8_t, ++ z0 = svldff1sb_s64 (p0, x0 + svcntd ()), ++ z0 = svldff1sb_s64 (p0, x0 + svcntd ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1sb_s64_m1: ++** decd x0 ++** ldff1sb z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sb_s64_m1, svint64_t, int8_t, ++ z0 = svldff1sb_s64 (p0, x0 - svcntd ()), ++ z0 = svldff1sb_s64 (p0, x0 - svcntd ())) ++ ++/* ++** ldff1sb_vnum_s64_0: ++** ldff1sb z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sb_vnum_s64_0, svint64_t, int8_t, ++ z0 = svldff1sb_vnum_s64 (p0, x0, 0), ++ z0 = svldff1sb_vnum_s64 (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1sb_vnum_s64_1: ++** incd x0 ++** ldff1sb z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sb_vnum_s64_1, svint64_t, int8_t, ++ z0 = svldff1sb_vnum_s64 (p0, x0, 1), ++ z0 = svldff1sb_vnum_s64 (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1sb_vnum_s64_m1: ++** decd x0 ++** ldff1sb z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sb_vnum_s64_m1, svint64_t, int8_t, ++ z0 = svldff1sb_vnum_s64 (p0, x0, -1), ++ z0 = svldff1sb_vnum_s64 (p0, x0, -1)) ++ ++/* ++** ldff1sb_vnum_s64_x1: ++** cntd (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldff1sb z0\.d, p0/z, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** ldff1sb z0\.d, p0/z, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_LOAD (ldff1sb_vnum_s64_x1, svint64_t, int8_t, ++ z0 = svldff1sb_vnum_s64 (p0, x0, x1), ++ z0 = svldff1sb_vnum_s64 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_u16.c +new file mode 100644 +index 000000000..69ba96d52 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_u16.c +@@ -0,0 +1,90 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1sb_u16_base: ++** ldff1sb z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sb_u16_base, svuint16_t, int8_t, ++ z0 = svldff1sb_u16 (p0, x0), ++ z0 = svldff1sb_u16 (p0, x0)) ++ ++/* ++** ldff1sb_u16_index: ++** ldff1sb z0\.h, p0/z, \[x0, x1\] ++** ret ++*/ ++TEST_LOAD (ldff1sb_u16_index, svuint16_t, int8_t, ++ z0 = svldff1sb_u16 (p0, x0 + x1), ++ z0 = svldff1sb_u16 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1sb_u16_1: ++** inch x0 ++** ldff1sb z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sb_u16_1, svuint16_t, int8_t, ++ z0 = svldff1sb_u16 (p0, x0 + svcnth ()), ++ z0 = svldff1sb_u16 (p0, x0 + svcnth ())) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ldff1sb_u16_m1: ++** dech x0 ++** ldff1sb z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sb_u16_m1, svuint16_t, int8_t, ++ z0 = svldff1sb_u16 (p0, x0 - svcnth ()), ++ z0 = svldff1sb_u16 (p0, x0 - svcnth ())) ++ ++/* ++** ldff1sb_vnum_u16_0: ++** ldff1sb z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sb_vnum_u16_0, svuint16_t, int8_t, ++ z0 = svldff1sb_vnum_u16 (p0, x0, 0), ++ z0 = svldff1sb_vnum_u16 (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1sb_vnum_u16_1: ++** inch x0 ++** ldff1sb z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sb_vnum_u16_1, svuint16_t, int8_t, ++ z0 = svldff1sb_vnum_u16 (p0, x0, 1), ++ z0 = svldff1sb_vnum_u16 (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1sb_vnum_u16_m1: ++** dech x0 ++** ldff1sb z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sb_vnum_u16_m1, svuint16_t, int8_t, ++ z0 = svldff1sb_vnum_u16 (p0, x0, -1), ++ z0 = svldff1sb_vnum_u16 (p0, x0, -1)) ++ ++/* ++** ldff1sb_vnum_u16_x1: ++** cnth (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldff1sb z0\.h, p0/z, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** ldff1sb z0\.h, p0/z, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_LOAD (ldff1sb_vnum_u16_x1, svuint16_t, int8_t, ++ z0 = svldff1sb_vnum_u16 (p0, x0, x1), ++ z0 = svldff1sb_vnum_u16 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_u32.c +new file mode 100644 +index 000000000..e1a1873f0 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_u32.c +@@ -0,0 +1,90 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1sb_u32_base: ++** ldff1sb z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sb_u32_base, svuint32_t, int8_t, ++ z0 = svldff1sb_u32 (p0, x0), ++ z0 = svldff1sb_u32 (p0, x0)) ++ ++/* ++** ldff1sb_u32_index: ++** ldff1sb z0\.s, p0/z, \[x0, x1\] ++** ret ++*/ ++TEST_LOAD (ldff1sb_u32_index, svuint32_t, int8_t, ++ z0 = svldff1sb_u32 (p0, x0 + x1), ++ z0 = svldff1sb_u32 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1sb_u32_1: ++** incw x0 ++** ldff1sb z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sb_u32_1, svuint32_t, int8_t, ++ z0 = svldff1sb_u32 (p0, x0 + svcntw ()), ++ z0 = svldff1sb_u32 (p0, x0 + svcntw ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1sb_u32_m1: ++** decw x0 ++** ldff1sb z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sb_u32_m1, svuint32_t, int8_t, ++ z0 = svldff1sb_u32 (p0, x0 - svcntw ()), ++ z0 = svldff1sb_u32 (p0, x0 - svcntw ())) ++ ++/* ++** ldff1sb_vnum_u32_0: ++** ldff1sb z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sb_vnum_u32_0, svuint32_t, int8_t, ++ z0 = svldff1sb_vnum_u32 (p0, x0, 0), ++ z0 = svldff1sb_vnum_u32 (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1sb_vnum_u32_1: ++** incw x0 ++** ldff1sb z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sb_vnum_u32_1, svuint32_t, int8_t, ++ z0 = svldff1sb_vnum_u32 (p0, x0, 1), ++ z0 = svldff1sb_vnum_u32 (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ldff1sb_vnum_u32_m1: ++** decw x0 ++** ldff1sb z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sb_vnum_u32_m1, svuint32_t, int8_t, ++ z0 = svldff1sb_vnum_u32 (p0, x0, -1), ++ z0 = svldff1sb_vnum_u32 (p0, x0, -1)) ++ ++/* ++** ldff1sb_vnum_u32_x1: ++** cntw (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldff1sb z0\.s, p0/z, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** ldff1sb z0\.s, p0/z, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_LOAD (ldff1sb_vnum_u32_x1, svuint32_t, int8_t, ++ z0 = svldff1sb_vnum_u32 (p0, x0, x1), ++ z0 = svldff1sb_vnum_u32 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_u64.c +new file mode 100644 +index 000000000..0a49cbcc0 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_u64.c +@@ -0,0 +1,90 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1sb_u64_base: ++** ldff1sb z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sb_u64_base, svuint64_t, int8_t, ++ z0 = svldff1sb_u64 (p0, x0), ++ z0 = svldff1sb_u64 (p0, x0)) ++ ++/* ++** ldff1sb_u64_index: ++** ldff1sb z0\.d, p0/z, \[x0, x1\] ++** ret ++*/ ++TEST_LOAD (ldff1sb_u64_index, svuint64_t, int8_t, ++ z0 = svldff1sb_u64 (p0, x0 + x1), ++ z0 = svldff1sb_u64 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1sb_u64_1: ++** incd x0 ++** ldff1sb z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sb_u64_1, svuint64_t, int8_t, ++ z0 = svldff1sb_u64 (p0, x0 + svcntd ()), ++ z0 = svldff1sb_u64 (p0, x0 + svcntd ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1sb_u64_m1: ++** decd x0 ++** ldff1sb z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sb_u64_m1, svuint64_t, int8_t, ++ z0 = svldff1sb_u64 (p0, x0 - svcntd ()), ++ z0 = svldff1sb_u64 (p0, x0 - svcntd ())) ++ ++/* ++** ldff1sb_vnum_u64_0: ++** ldff1sb z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sb_vnum_u64_0, svuint64_t, int8_t, ++ z0 = svldff1sb_vnum_u64 (p0, x0, 0), ++ z0 = svldff1sb_vnum_u64 (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1sb_vnum_u64_1: ++** incd x0 ++** ldff1sb z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sb_vnum_u64_1, svuint64_t, int8_t, ++ z0 = svldff1sb_vnum_u64 (p0, x0, 1), ++ z0 = svldff1sb_vnum_u64 (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1sb_vnum_u64_m1: ++** decd x0 ++** ldff1sb z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sb_vnum_u64_m1, svuint64_t, int8_t, ++ z0 = svldff1sb_vnum_u64 (p0, x0, -1), ++ z0 = svldff1sb_vnum_u64 (p0, x0, -1)) ++ ++/* ++** ldff1sb_vnum_u64_x1: ++** cntd (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldff1sb z0\.d, p0/z, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** ldff1sb z0\.d, p0/z, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_LOAD (ldff1sb_vnum_u64_x1, svuint64_t, int8_t, ++ z0 = svldff1sb_vnum_u64 (p0, x0, x1), ++ z0 = svldff1sb_vnum_u64 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sh_gather_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sh_gather_s32.c +new file mode 100644 +index 000000000..b633335dc +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sh_gather_s32.c +@@ -0,0 +1,252 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1sh_gather_s32_tied1: ++** ldff1sh z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_s32_tied1, svint32_t, svuint32_t, ++ z0_res = svldff1sh_gather_u32base_s32 (p0, z0), ++ z0_res = svldff1sh_gather_s32 (p0, z0)) ++ ++/* ++** ldff1sh_gather_s32_untied: ++** ldff1sh z0\.s, p0/z, \[z1\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_s32_untied, svint32_t, svuint32_t, ++ z0_res = svldff1sh_gather_u32base_s32 (p0, z1), ++ z0_res = svldff1sh_gather_s32 (p0, z1)) ++ ++/* ++** ldff1sh_gather_x0_s32_offset: ++** ldff1sh z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_x0_s32_offset, svint32_t, svuint32_t, ++ z0_res = svldff1sh_gather_u32base_offset_s32 (p0, z0, x0), ++ z0_res = svldff1sh_gather_offset_s32 (p0, z0, x0)) ++ ++/* ++** ldff1sh_gather_m2_s32_offset: ++** mov (x[0-9]+), #?-2 ++** ldff1sh z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_m2_s32_offset, svint32_t, svuint32_t, ++ z0_res = svldff1sh_gather_u32base_offset_s32 (p0, z0, -2), ++ z0_res = svldff1sh_gather_offset_s32 (p0, z0, -2)) ++ ++/* ++** ldff1sh_gather_0_s32_offset: ++** ldff1sh z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_0_s32_offset, svint32_t, svuint32_t, ++ z0_res = svldff1sh_gather_u32base_offset_s32 (p0, z0, 0), ++ z0_res = svldff1sh_gather_offset_s32 (p0, z0, 0)) ++ ++/* ++** ldff1sh_gather_5_s32_offset: ++** mov (x[0-9]+), #?5 ++** ldff1sh z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_5_s32_offset, svint32_t, svuint32_t, ++ z0_res = svldff1sh_gather_u32base_offset_s32 (p0, z0, 5), ++ z0_res = svldff1sh_gather_offset_s32 (p0, z0, 5)) ++ ++/* ++** ldff1sh_gather_6_s32_offset: ++** ldff1sh z0\.s, p0/z, \[z0\.s, #6\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_6_s32_offset, svint32_t, svuint32_t, ++ z0_res = svldff1sh_gather_u32base_offset_s32 (p0, z0, 6), ++ z0_res = svldff1sh_gather_offset_s32 (p0, z0, 6)) ++ ++/* ++** ldff1sh_gather_62_s32_offset: ++** ldff1sh z0\.s, p0/z, \[z0\.s, #62\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_62_s32_offset, svint32_t, svuint32_t, ++ z0_res = svldff1sh_gather_u32base_offset_s32 (p0, z0, 62), ++ z0_res = svldff1sh_gather_offset_s32 (p0, z0, 62)) ++ ++/* ++** ldff1sh_gather_64_s32_offset: ++** mov (x[0-9]+), #?64 ++** ldff1sh z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_64_s32_offset, svint32_t, svuint32_t, ++ z0_res = svldff1sh_gather_u32base_offset_s32 (p0, z0, 64), ++ z0_res = svldff1sh_gather_offset_s32 (p0, z0, 64)) ++ ++/* ++** ldff1sh_gather_x0_s32_index: ++** lsl (x[0-9]+), x0, #?1 ++** ldff1sh z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_x0_s32_index, svint32_t, svuint32_t, ++ z0_res = svldff1sh_gather_u32base_index_s32 (p0, z0, x0), ++ z0_res = svldff1sh_gather_index_s32 (p0, z0, x0)) ++ ++/* ++** ldff1sh_gather_m1_s32_index: ++** mov (x[0-9]+), #?-2 ++** ldff1sh z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_m1_s32_index, svint32_t, svuint32_t, ++ z0_res = svldff1sh_gather_u32base_index_s32 (p0, z0, -1), ++ z0_res = svldff1sh_gather_index_s32 (p0, z0, -1)) ++ ++/* ++** ldff1sh_gather_0_s32_index: ++** ldff1sh z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_0_s32_index, svint32_t, svuint32_t, ++ z0_res = svldff1sh_gather_u32base_index_s32 (p0, z0, 0), ++ z0_res = 
svldff1sh_gather_index_s32 (p0, z0, 0)) ++ ++/* ++** ldff1sh_gather_5_s32_index: ++** ldff1sh z0\.s, p0/z, \[z0\.s, #10\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_5_s32_index, svint32_t, svuint32_t, ++ z0_res = svldff1sh_gather_u32base_index_s32 (p0, z0, 5), ++ z0_res = svldff1sh_gather_index_s32 (p0, z0, 5)) ++ ++/* ++** ldff1sh_gather_31_s32_index: ++** ldff1sh z0\.s, p0/z, \[z0\.s, #62\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_31_s32_index, svint32_t, svuint32_t, ++ z0_res = svldff1sh_gather_u32base_index_s32 (p0, z0, 31), ++ z0_res = svldff1sh_gather_index_s32 (p0, z0, 31)) ++ ++/* ++** ldff1sh_gather_32_s32_index: ++** mov (x[0-9]+), #?64 ++** ldff1sh z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_32_s32_index, svint32_t, svuint32_t, ++ z0_res = svldff1sh_gather_u32base_index_s32 (p0, z0, 32), ++ z0_res = svldff1sh_gather_index_s32 (p0, z0, 32)) ++ ++/* ++** ldff1sh_gather_x0_s32_s32offset: ++** ldff1sh z0\.s, p0/z, \[x0, z0\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_x0_s32_s32offset, svint32_t, int16_t, svint32_t, ++ z0_res = svldff1sh_gather_s32offset_s32 (p0, x0, z0), ++ z0_res = svldff1sh_gather_offset_s32 (p0, x0, z0)) ++ ++/* ++** ldff1sh_gather_tied1_s32_s32offset: ++** ldff1sh z0\.s, p0/z, \[x0, z0\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_tied1_s32_s32offset, svint32_t, int16_t, svint32_t, ++ z0_res = svldff1sh_gather_s32offset_s32 (p0, x0, z0), ++ z0_res = svldff1sh_gather_offset_s32 (p0, x0, z0)) ++ ++/* ++** ldff1sh_gather_untied_s32_s32offset: ++** ldff1sh z0\.s, p0/z, \[x0, z1\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_untied_s32_s32offset, svint32_t, int16_t, svint32_t, ++ z0_res = svldff1sh_gather_s32offset_s32 (p0, x0, z1), ++ z0_res = svldff1sh_gather_offset_s32 (p0, x0, z1)) ++ ++/* ++** ldff1sh_gather_x0_s32_u32offset: ++** ldff1sh z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_x0_s32_u32offset, svint32_t, int16_t, svuint32_t, ++ z0_res = svldff1sh_gather_u32offset_s32 (p0, x0, z0), ++ z0_res = svldff1sh_gather_offset_s32 (p0, x0, z0)) ++ ++/* ++** ldff1sh_gather_tied1_s32_u32offset: ++** ldff1sh z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_tied1_s32_u32offset, svint32_t, int16_t, svuint32_t, ++ z0_res = svldff1sh_gather_u32offset_s32 (p0, x0, z0), ++ z0_res = svldff1sh_gather_offset_s32 (p0, x0, z0)) ++ ++/* ++** ldff1sh_gather_untied_s32_u32offset: ++** ldff1sh z0\.s, p0/z, \[x0, z1\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_untied_s32_u32offset, svint32_t, int16_t, svuint32_t, ++ z0_res = svldff1sh_gather_u32offset_s32 (p0, x0, z1), ++ z0_res = svldff1sh_gather_offset_s32 (p0, x0, z1)) ++ ++/* ++** ldff1sh_gather_x0_s32_s32index: ++** ldff1sh z0\.s, p0/z, \[x0, z0\.s, sxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_x0_s32_s32index, svint32_t, int16_t, svint32_t, ++ z0_res = svldff1sh_gather_s32index_s32 (p0, x0, z0), ++ z0_res = svldff1sh_gather_index_s32 (p0, x0, z0)) ++ ++/* ++** ldff1sh_gather_tied1_s32_s32index: ++** ldff1sh z0\.s, p0/z, \[x0, z0\.s, sxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_tied1_s32_s32index, svint32_t, int16_t, svint32_t, ++ z0_res = svldff1sh_gather_s32index_s32 (p0, x0, z0), ++ z0_res = svldff1sh_gather_index_s32 (p0, x0, z0)) ++ ++/* ++** ldff1sh_gather_untied_s32_s32index: ++** ldff1sh z0\.s, p0/z, \[x0, z1\.s, sxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ 
(ldff1sh_gather_untied_s32_s32index, svint32_t, int16_t, svint32_t, ++ z0_res = svldff1sh_gather_s32index_s32 (p0, x0, z1), ++ z0_res = svldff1sh_gather_index_s32 (p0, x0, z1)) ++ ++/* ++** ldff1sh_gather_x0_s32_u32index: ++** ldff1sh z0\.s, p0/z, \[x0, z0\.s, uxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_x0_s32_u32index, svint32_t, int16_t, svuint32_t, ++ z0_res = svldff1sh_gather_u32index_s32 (p0, x0, z0), ++ z0_res = svldff1sh_gather_index_s32 (p0, x0, z0)) ++ ++/* ++** ldff1sh_gather_tied1_s32_u32index: ++** ldff1sh z0\.s, p0/z, \[x0, z0\.s, uxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_tied1_s32_u32index, svint32_t, int16_t, svuint32_t, ++ z0_res = svldff1sh_gather_u32index_s32 (p0, x0, z0), ++ z0_res = svldff1sh_gather_index_s32 (p0, x0, z0)) ++ ++/* ++** ldff1sh_gather_untied_s32_u32index: ++** ldff1sh z0\.s, p0/z, \[x0, z1\.s, uxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_untied_s32_u32index, svint32_t, int16_t, svuint32_t, ++ z0_res = svldff1sh_gather_u32index_s32 (p0, x0, z1), ++ z0_res = svldff1sh_gather_index_s32 (p0, x0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sh_gather_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sh_gather_s64.c +new file mode 100644 +index 000000000..32a4309b6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sh_gather_s64.c +@@ -0,0 +1,288 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1sh_gather_s64_tied1: ++** ldff1sh z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_s64_tied1, svint64_t, svuint64_t, ++ z0_res = svldff1sh_gather_u64base_s64 (p0, z0), ++ z0_res = svldff1sh_gather_s64 (p0, z0)) ++ ++/* ++** ldff1sh_gather_s64_untied: ++** ldff1sh z0\.d, p0/z, \[z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_s64_untied, svint64_t, svuint64_t, ++ z0_res = svldff1sh_gather_u64base_s64 (p0, z1), ++ z0_res = svldff1sh_gather_s64 (p0, z1)) ++ ++/* ++** ldff1sh_gather_x0_s64_offset: ++** ldff1sh z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_x0_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1sh_gather_u64base_offset_s64 (p0, z0, x0), ++ z0_res = svldff1sh_gather_offset_s64 (p0, z0, x0)) ++ ++/* ++** ldff1sh_gather_m2_s64_offset: ++** mov (x[0-9]+), #?-2 ++** ldff1sh z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_m2_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1sh_gather_u64base_offset_s64 (p0, z0, -2), ++ z0_res = svldff1sh_gather_offset_s64 (p0, z0, -2)) ++ ++/* ++** ldff1sh_gather_0_s64_offset: ++** ldff1sh z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_0_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1sh_gather_u64base_offset_s64 (p0, z0, 0), ++ z0_res = svldff1sh_gather_offset_s64 (p0, z0, 0)) ++ ++/* ++** ldff1sh_gather_5_s64_offset: ++** mov (x[0-9]+), #?5 ++** ldff1sh z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_5_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1sh_gather_u64base_offset_s64 (p0, z0, 5), ++ z0_res = svldff1sh_gather_offset_s64 (p0, z0, 5)) ++ ++/* ++** ldff1sh_gather_6_s64_offset: ++** ldff1sh z0\.d, p0/z, \[z0\.d, #6\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_6_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1sh_gather_u64base_offset_s64 (p0, z0, 6), ++ z0_res = svldff1sh_gather_offset_s64 (p0, z0, 6)) ++ ++/* ++** 
ldff1sh_gather_62_s64_offset: ++** ldff1sh z0\.d, p0/z, \[z0\.d, #62\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_62_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1sh_gather_u64base_offset_s64 (p0, z0, 62), ++ z0_res = svldff1sh_gather_offset_s64 (p0, z0, 62)) ++ ++/* ++** ldff1sh_gather_64_s64_offset: ++** mov (x[0-9]+), #?64 ++** ldff1sh z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_64_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1sh_gather_u64base_offset_s64 (p0, z0, 64), ++ z0_res = svldff1sh_gather_offset_s64 (p0, z0, 64)) ++ ++/* ++** ldff1sh_gather_x0_s64_index: ++** lsl (x[0-9]+), x0, #?1 ++** ldff1sh z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_x0_s64_index, svint64_t, svuint64_t, ++ z0_res = svldff1sh_gather_u64base_index_s64 (p0, z0, x0), ++ z0_res = svldff1sh_gather_index_s64 (p0, z0, x0)) ++ ++/* ++** ldff1sh_gather_m1_s64_index: ++** mov (x[0-9]+), #?-2 ++** ldff1sh z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_m1_s64_index, svint64_t, svuint64_t, ++ z0_res = svldff1sh_gather_u64base_index_s64 (p0, z0, -1), ++ z0_res = svldff1sh_gather_index_s64 (p0, z0, -1)) ++ ++/* ++** ldff1sh_gather_0_s64_index: ++** ldff1sh z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_0_s64_index, svint64_t, svuint64_t, ++ z0_res = svldff1sh_gather_u64base_index_s64 (p0, z0, 0), ++ z0_res = svldff1sh_gather_index_s64 (p0, z0, 0)) ++ ++/* ++** ldff1sh_gather_5_s64_index: ++** ldff1sh z0\.d, p0/z, \[z0\.d, #10\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_5_s64_index, svint64_t, svuint64_t, ++ z0_res = svldff1sh_gather_u64base_index_s64 (p0, z0, 5), ++ z0_res = svldff1sh_gather_index_s64 (p0, z0, 5)) ++ ++/* ++** ldff1sh_gather_31_s64_index: ++** ldff1sh z0\.d, p0/z, \[z0\.d, #62\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_31_s64_index, svint64_t, svuint64_t, ++ z0_res = svldff1sh_gather_u64base_index_s64 (p0, z0, 31), ++ z0_res = svldff1sh_gather_index_s64 (p0, z0, 31)) ++ ++/* ++** ldff1sh_gather_32_s64_index: ++** mov (x[0-9]+), #?64 ++** ldff1sh z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_32_s64_index, svint64_t, svuint64_t, ++ z0_res = svldff1sh_gather_u64base_index_s64 (p0, z0, 32), ++ z0_res = svldff1sh_gather_index_s64 (p0, z0, 32)) ++ ++/* ++** ldff1sh_gather_x0_s64_s64offset: ++** ldff1sh z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_x0_s64_s64offset, svint64_t, int16_t, svint64_t, ++ z0_res = svldff1sh_gather_s64offset_s64 (p0, x0, z0), ++ z0_res = svldff1sh_gather_offset_s64 (p0, x0, z0)) ++ ++/* ++** ldff1sh_gather_tied1_s64_s64offset: ++** ldff1sh z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_tied1_s64_s64offset, svint64_t, int16_t, svint64_t, ++ z0_res = svldff1sh_gather_s64offset_s64 (p0, x0, z0), ++ z0_res = svldff1sh_gather_offset_s64 (p0, x0, z0)) ++ ++/* ++** ldff1sh_gather_untied_s64_s64offset: ++** ldff1sh z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_untied_s64_s64offset, svint64_t, int16_t, svint64_t, ++ z0_res = svldff1sh_gather_s64offset_s64 (p0, x0, z1), ++ z0_res = svldff1sh_gather_offset_s64 (p0, x0, z1)) ++ ++/* ++** ldff1sh_gather_ext_s64_s64offset: ++** ldff1sh z0\.d, p0/z, \[x0, z1\.d, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_ext_s64_s64offset, svint64_t, int16_t, svint64_t, ++ z0_res = svldff1sh_gather_s64offset_s64 (p0, x0, svextw_s64_x (p0, z1)), ++ 
z0_res = svldff1sh_gather_offset_s64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ldff1sh_gather_x0_s64_u64offset: ++** ldff1sh z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_x0_s64_u64offset, svint64_t, int16_t, svuint64_t, ++ z0_res = svldff1sh_gather_u64offset_s64 (p0, x0, z0), ++ z0_res = svldff1sh_gather_offset_s64 (p0, x0, z0)) ++ ++/* ++** ldff1sh_gather_tied1_s64_u64offset: ++** ldff1sh z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_tied1_s64_u64offset, svint64_t, int16_t, svuint64_t, ++ z0_res = svldff1sh_gather_u64offset_s64 (p0, x0, z0), ++ z0_res = svldff1sh_gather_offset_s64 (p0, x0, z0)) ++ ++/* ++** ldff1sh_gather_untied_s64_u64offset: ++** ldff1sh z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_untied_s64_u64offset, svint64_t, int16_t, svuint64_t, ++ z0_res = svldff1sh_gather_u64offset_s64 (p0, x0, z1), ++ z0_res = svldff1sh_gather_offset_s64 (p0, x0, z1)) ++ ++/* ++** ldff1sh_gather_ext_s64_u64offset: ++** ldff1sh z0\.d, p0/z, \[x0, z1\.d, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_ext_s64_u64offset, svint64_t, int16_t, svuint64_t, ++ z0_res = svldff1sh_gather_u64offset_s64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svldff1sh_gather_offset_s64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ldff1sh_gather_x0_s64_s64index: ++** ldff1sh z0\.d, p0/z, \[x0, z0\.d, lsl 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_x0_s64_s64index, svint64_t, int16_t, svint64_t, ++ z0_res = svldff1sh_gather_s64index_s64 (p0, x0, z0), ++ z0_res = svldff1sh_gather_index_s64 (p0, x0, z0)) ++ ++/* ++** ldff1sh_gather_tied1_s64_s64index: ++** ldff1sh z0\.d, p0/z, \[x0, z0\.d, lsl 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_tied1_s64_s64index, svint64_t, int16_t, svint64_t, ++ z0_res = svldff1sh_gather_s64index_s64 (p0, x0, z0), ++ z0_res = svldff1sh_gather_index_s64 (p0, x0, z0)) ++ ++/* ++** ldff1sh_gather_untied_s64_s64index: ++** ldff1sh z0\.d, p0/z, \[x0, z1\.d, lsl 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_untied_s64_s64index, svint64_t, int16_t, svint64_t, ++ z0_res = svldff1sh_gather_s64index_s64 (p0, x0, z1), ++ z0_res = svldff1sh_gather_index_s64 (p0, x0, z1)) ++ ++/* ++** ldff1sh_gather_ext_s64_s64index: ++** ldff1sh z0\.d, p0/z, \[x0, z1\.d, sxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_ext_s64_s64index, svint64_t, int16_t, svint64_t, ++ z0_res = svldff1sh_gather_s64index_s64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svldff1sh_gather_index_s64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ldff1sh_gather_x0_s64_u64index: ++** ldff1sh z0\.d, p0/z, \[x0, z0\.d, lsl 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_x0_s64_u64index, svint64_t, int16_t, svuint64_t, ++ z0_res = svldff1sh_gather_u64index_s64 (p0, x0, z0), ++ z0_res = svldff1sh_gather_index_s64 (p0, x0, z0)) ++ ++/* ++** ldff1sh_gather_tied1_s64_u64index: ++** ldff1sh z0\.d, p0/z, \[x0, z0\.d, lsl 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_tied1_s64_u64index, svint64_t, int16_t, svuint64_t, ++ z0_res = svldff1sh_gather_u64index_s64 (p0, x0, z0), ++ z0_res = svldff1sh_gather_index_s64 (p0, x0, z0)) ++ ++/* ++** ldff1sh_gather_untied_s64_u64index: ++** ldff1sh z0\.d, p0/z, \[x0, z1\.d, lsl 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_untied_s64_u64index, svint64_t, int16_t, svuint64_t, ++ z0_res = svldff1sh_gather_u64index_s64 (p0, x0, z1), ++ z0_res = svldff1sh_gather_index_s64 (p0, x0, z1)) ++ ++/* ++** ldff1sh_gather_ext_s64_u64index: ++** ldff1sh 
z0\.d, p0/z, \[x0, z1\.d, uxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_ext_s64_u64index, svint64_t, int16_t, svuint64_t, ++ z0_res = svldff1sh_gather_u64index_s64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svldff1sh_gather_index_s64 (p0, x0, svextw_x (p0, z1))) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sh_gather_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sh_gather_u32.c +new file mode 100644 +index 000000000..73a9be892 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sh_gather_u32.c +@@ -0,0 +1,252 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1sh_gather_u32_tied1: ++** ldff1sh z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_u32_tied1, svuint32_t, svuint32_t, ++ z0_res = svldff1sh_gather_u32base_u32 (p0, z0), ++ z0_res = svldff1sh_gather_u32 (p0, z0)) ++ ++/* ++** ldff1sh_gather_u32_untied: ++** ldff1sh z0\.s, p0/z, \[z1\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_u32_untied, svuint32_t, svuint32_t, ++ z0_res = svldff1sh_gather_u32base_u32 (p0, z1), ++ z0_res = svldff1sh_gather_u32 (p0, z1)) ++ ++/* ++** ldff1sh_gather_x0_u32_offset: ++** ldff1sh z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_x0_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svldff1sh_gather_u32base_offset_u32 (p0, z0, x0), ++ z0_res = svldff1sh_gather_offset_u32 (p0, z0, x0)) ++ ++/* ++** ldff1sh_gather_m2_u32_offset: ++** mov (x[0-9]+), #?-2 ++** ldff1sh z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_m2_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svldff1sh_gather_u32base_offset_u32 (p0, z0, -2), ++ z0_res = svldff1sh_gather_offset_u32 (p0, z0, -2)) ++ ++/* ++** ldff1sh_gather_0_u32_offset: ++** ldff1sh z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_0_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svldff1sh_gather_u32base_offset_u32 (p0, z0, 0), ++ z0_res = svldff1sh_gather_offset_u32 (p0, z0, 0)) ++ ++/* ++** ldff1sh_gather_5_u32_offset: ++** mov (x[0-9]+), #?5 ++** ldff1sh z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_5_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svldff1sh_gather_u32base_offset_u32 (p0, z0, 5), ++ z0_res = svldff1sh_gather_offset_u32 (p0, z0, 5)) ++ ++/* ++** ldff1sh_gather_6_u32_offset: ++** ldff1sh z0\.s, p0/z, \[z0\.s, #6\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_6_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svldff1sh_gather_u32base_offset_u32 (p0, z0, 6), ++ z0_res = svldff1sh_gather_offset_u32 (p0, z0, 6)) ++ ++/* ++** ldff1sh_gather_62_u32_offset: ++** ldff1sh z0\.s, p0/z, \[z0\.s, #62\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_62_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svldff1sh_gather_u32base_offset_u32 (p0, z0, 62), ++ z0_res = svldff1sh_gather_offset_u32 (p0, z0, 62)) ++ ++/* ++** ldff1sh_gather_64_u32_offset: ++** mov (x[0-9]+), #?64 ++** ldff1sh z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_64_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svldff1sh_gather_u32base_offset_u32 (p0, z0, 64), ++ z0_res = svldff1sh_gather_offset_u32 (p0, z0, 64)) ++ ++/* ++** ldff1sh_gather_x0_u32_index: ++** lsl (x[0-9]+), x0, #?1 ++** ldff1sh z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_x0_u32_index, svuint32_t, svuint32_t, ++ z0_res 
= svldff1sh_gather_u32base_index_u32 (p0, z0, x0), ++ z0_res = svldff1sh_gather_index_u32 (p0, z0, x0)) ++ ++/* ++** ldff1sh_gather_m1_u32_index: ++** mov (x[0-9]+), #?-2 ++** ldff1sh z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_m1_u32_index, svuint32_t, svuint32_t, ++ z0_res = svldff1sh_gather_u32base_index_u32 (p0, z0, -1), ++ z0_res = svldff1sh_gather_index_u32 (p0, z0, -1)) ++ ++/* ++** ldff1sh_gather_0_u32_index: ++** ldff1sh z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_0_u32_index, svuint32_t, svuint32_t, ++ z0_res = svldff1sh_gather_u32base_index_u32 (p0, z0, 0), ++ z0_res = svldff1sh_gather_index_u32 (p0, z0, 0)) ++ ++/* ++** ldff1sh_gather_5_u32_index: ++** ldff1sh z0\.s, p0/z, \[z0\.s, #10\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_5_u32_index, svuint32_t, svuint32_t, ++ z0_res = svldff1sh_gather_u32base_index_u32 (p0, z0, 5), ++ z0_res = svldff1sh_gather_index_u32 (p0, z0, 5)) ++ ++/* ++** ldff1sh_gather_31_u32_index: ++** ldff1sh z0\.s, p0/z, \[z0\.s, #62\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_31_u32_index, svuint32_t, svuint32_t, ++ z0_res = svldff1sh_gather_u32base_index_u32 (p0, z0, 31), ++ z0_res = svldff1sh_gather_index_u32 (p0, z0, 31)) ++ ++/* ++** ldff1sh_gather_32_u32_index: ++** mov (x[0-9]+), #?64 ++** ldff1sh z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_32_u32_index, svuint32_t, svuint32_t, ++ z0_res = svldff1sh_gather_u32base_index_u32 (p0, z0, 32), ++ z0_res = svldff1sh_gather_index_u32 (p0, z0, 32)) ++ ++/* ++** ldff1sh_gather_x0_u32_s32offset: ++** ldff1sh z0\.s, p0/z, \[x0, z0\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_x0_u32_s32offset, svuint32_t, int16_t, svint32_t, ++ z0_res = svldff1sh_gather_s32offset_u32 (p0, x0, z0), ++ z0_res = svldff1sh_gather_offset_u32 (p0, x0, z0)) ++ ++/* ++** ldff1sh_gather_tied1_u32_s32offset: ++** ldff1sh z0\.s, p0/z, \[x0, z0\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_tied1_u32_s32offset, svuint32_t, int16_t, svint32_t, ++ z0_res = svldff1sh_gather_s32offset_u32 (p0, x0, z0), ++ z0_res = svldff1sh_gather_offset_u32 (p0, x0, z0)) ++ ++/* ++** ldff1sh_gather_untied_u32_s32offset: ++** ldff1sh z0\.s, p0/z, \[x0, z1\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_untied_u32_s32offset, svuint32_t, int16_t, svint32_t, ++ z0_res = svldff1sh_gather_s32offset_u32 (p0, x0, z1), ++ z0_res = svldff1sh_gather_offset_u32 (p0, x0, z1)) ++ ++/* ++** ldff1sh_gather_x0_u32_u32offset: ++** ldff1sh z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_x0_u32_u32offset, svuint32_t, int16_t, svuint32_t, ++ z0_res = svldff1sh_gather_u32offset_u32 (p0, x0, z0), ++ z0_res = svldff1sh_gather_offset_u32 (p0, x0, z0)) ++ ++/* ++** ldff1sh_gather_tied1_u32_u32offset: ++** ldff1sh z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_tied1_u32_u32offset, svuint32_t, int16_t, svuint32_t, ++ z0_res = svldff1sh_gather_u32offset_u32 (p0, x0, z0), ++ z0_res = svldff1sh_gather_offset_u32 (p0, x0, z0)) ++ ++/* ++** ldff1sh_gather_untied_u32_u32offset: ++** ldff1sh z0\.s, p0/z, \[x0, z1\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_untied_u32_u32offset, svuint32_t, int16_t, svuint32_t, ++ z0_res = svldff1sh_gather_u32offset_u32 (p0, x0, z1), ++ z0_res = svldff1sh_gather_offset_u32 (p0, x0, z1)) ++ ++/* ++** ldff1sh_gather_x0_u32_s32index: ++** ldff1sh z0\.s, p0/z, \[x0, z0\.s, sxtw 1\] ++** ret ++*/ 
++TEST_LOAD_GATHER_SZ (ldff1sh_gather_x0_u32_s32index, svuint32_t, int16_t, svint32_t, ++ z0_res = svldff1sh_gather_s32index_u32 (p0, x0, z0), ++ z0_res = svldff1sh_gather_index_u32 (p0, x0, z0)) ++ ++/* ++** ldff1sh_gather_tied1_u32_s32index: ++** ldff1sh z0\.s, p0/z, \[x0, z0\.s, sxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_tied1_u32_s32index, svuint32_t, int16_t, svint32_t, ++ z0_res = svldff1sh_gather_s32index_u32 (p0, x0, z0), ++ z0_res = svldff1sh_gather_index_u32 (p0, x0, z0)) ++ ++/* ++** ldff1sh_gather_untied_u32_s32index: ++** ldff1sh z0\.s, p0/z, \[x0, z1\.s, sxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_untied_u32_s32index, svuint32_t, int16_t, svint32_t, ++ z0_res = svldff1sh_gather_s32index_u32 (p0, x0, z1), ++ z0_res = svldff1sh_gather_index_u32 (p0, x0, z1)) ++ ++/* ++** ldff1sh_gather_x0_u32_u32index: ++** ldff1sh z0\.s, p0/z, \[x0, z0\.s, uxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_x0_u32_u32index, svuint32_t, int16_t, svuint32_t, ++ z0_res = svldff1sh_gather_u32index_u32 (p0, x0, z0), ++ z0_res = svldff1sh_gather_index_u32 (p0, x0, z0)) ++ ++/* ++** ldff1sh_gather_tied1_u32_u32index: ++** ldff1sh z0\.s, p0/z, \[x0, z0\.s, uxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_tied1_u32_u32index, svuint32_t, int16_t, svuint32_t, ++ z0_res = svldff1sh_gather_u32index_u32 (p0, x0, z0), ++ z0_res = svldff1sh_gather_index_u32 (p0, x0, z0)) ++ ++/* ++** ldff1sh_gather_untied_u32_u32index: ++** ldff1sh z0\.s, p0/z, \[x0, z1\.s, uxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_untied_u32_u32index, svuint32_t, int16_t, svuint32_t, ++ z0_res = svldff1sh_gather_u32index_u32 (p0, x0, z1), ++ z0_res = svldff1sh_gather_index_u32 (p0, x0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sh_gather_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sh_gather_u64.c +new file mode 100644 +index 000000000..94ea73b63 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sh_gather_u64.c +@@ -0,0 +1,288 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1sh_gather_u64_tied1: ++** ldff1sh z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_u64_tied1, svuint64_t, svuint64_t, ++ z0_res = svldff1sh_gather_u64base_u64 (p0, z0), ++ z0_res = svldff1sh_gather_u64 (p0, z0)) ++ ++/* ++** ldff1sh_gather_u64_untied: ++** ldff1sh z0\.d, p0/z, \[z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_u64_untied, svuint64_t, svuint64_t, ++ z0_res = svldff1sh_gather_u64base_u64 (p0, z1), ++ z0_res = svldff1sh_gather_u64 (p0, z1)) ++ ++/* ++** ldff1sh_gather_x0_u64_offset: ++** ldff1sh z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_x0_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1sh_gather_u64base_offset_u64 (p0, z0, x0), ++ z0_res = svldff1sh_gather_offset_u64 (p0, z0, x0)) ++ ++/* ++** ldff1sh_gather_m2_u64_offset: ++** mov (x[0-9]+), #?-2 ++** ldff1sh z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_m2_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1sh_gather_u64base_offset_u64 (p0, z0, -2), ++ z0_res = svldff1sh_gather_offset_u64 (p0, z0, -2)) ++ ++/* ++** ldff1sh_gather_0_u64_offset: ++** ldff1sh z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_0_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1sh_gather_u64base_offset_u64 (p0, z0, 0), ++ z0_res = svldff1sh_gather_offset_u64 (p0, z0, 0)) ++ ++/* ++** ldff1sh_gather_5_u64_offset: ++** mov (x[0-9]+), #?5 ++** ldff1sh z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_5_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1sh_gather_u64base_offset_u64 (p0, z0, 5), ++ z0_res = svldff1sh_gather_offset_u64 (p0, z0, 5)) ++ ++/* ++** ldff1sh_gather_6_u64_offset: ++** ldff1sh z0\.d, p0/z, \[z0\.d, #6\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_6_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1sh_gather_u64base_offset_u64 (p0, z0, 6), ++ z0_res = svldff1sh_gather_offset_u64 (p0, z0, 6)) ++ ++/* ++** ldff1sh_gather_62_u64_offset: ++** ldff1sh z0\.d, p0/z, \[z0\.d, #62\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_62_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1sh_gather_u64base_offset_u64 (p0, z0, 62), ++ z0_res = svldff1sh_gather_offset_u64 (p0, z0, 62)) ++ ++/* ++** ldff1sh_gather_64_u64_offset: ++** mov (x[0-9]+), #?64 ++** ldff1sh z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_64_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1sh_gather_u64base_offset_u64 (p0, z0, 64), ++ z0_res = svldff1sh_gather_offset_u64 (p0, z0, 64)) ++ ++/* ++** ldff1sh_gather_x0_u64_index: ++** lsl (x[0-9]+), x0, #?1 ++** ldff1sh z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_x0_u64_index, svuint64_t, svuint64_t, ++ z0_res = svldff1sh_gather_u64base_index_u64 (p0, z0, x0), ++ z0_res = svldff1sh_gather_index_u64 (p0, z0, x0)) ++ ++/* ++** ldff1sh_gather_m1_u64_index: ++** mov (x[0-9]+), #?-2 ++** ldff1sh z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_m1_u64_index, svuint64_t, svuint64_t, ++ z0_res = svldff1sh_gather_u64base_index_u64 (p0, z0, -1), ++ z0_res = svldff1sh_gather_index_u64 (p0, z0, -1)) ++ ++/* ++** ldff1sh_gather_0_u64_index: ++** ldff1sh z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_0_u64_index, svuint64_t, svuint64_t, ++ z0_res = svldff1sh_gather_u64base_index_u64 (p0, z0, 0), ++ z0_res = svldff1sh_gather_index_u64 (p0, z0, 0)) 
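/* Editorial illustration, not taken from the patch above: the tests in this
   file exercise the overloaded ACLE first-faulting gather form
   svldff1sh_gather_index_u64 shown below.  The helper name and the use of
   <arm_sve.h>, svsetffr, svptrue_b64 and svrdffr here are assumptions for
   this sketch only, not part of the testsuite being added.  */
#include <arm_sve.h>

/* Gather int16_t elements from base + 2*idx[i], sign-extending each loaded
   halfword to 64 bits.  A first-faulting load only guarantees the first
   active element; the first-fault register (FFR) records which later lanes
   actually loaded.  */
svuint64_t
gather_sh_index_u64 (const int16_t *base, svuint64_t idx)
{
  svsetffr ();                    /* reset FFR so all lanes start active */
  svbool_t pg = svptrue_b64 ();   /* predicate: every 64-bit lane active */
  svuint64_t res = svldff1sh_gather_index_u64 (pg, base, idx);
  /* svrdffr () could now be read to see which lanes loaded successfully
     before using the remaining elements of res.  */
  return res;
}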
++ ++/* ++** ldff1sh_gather_5_u64_index: ++** ldff1sh z0\.d, p0/z, \[z0\.d, #10\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_5_u64_index, svuint64_t, svuint64_t, ++ z0_res = svldff1sh_gather_u64base_index_u64 (p0, z0, 5), ++ z0_res = svldff1sh_gather_index_u64 (p0, z0, 5)) ++ ++/* ++** ldff1sh_gather_31_u64_index: ++** ldff1sh z0\.d, p0/z, \[z0\.d, #62\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_31_u64_index, svuint64_t, svuint64_t, ++ z0_res = svldff1sh_gather_u64base_index_u64 (p0, z0, 31), ++ z0_res = svldff1sh_gather_index_u64 (p0, z0, 31)) ++ ++/* ++** ldff1sh_gather_32_u64_index: ++** mov (x[0-9]+), #?64 ++** ldff1sh z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sh_gather_32_u64_index, svuint64_t, svuint64_t, ++ z0_res = svldff1sh_gather_u64base_index_u64 (p0, z0, 32), ++ z0_res = svldff1sh_gather_index_u64 (p0, z0, 32)) ++ ++/* ++** ldff1sh_gather_x0_u64_s64offset: ++** ldff1sh z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_x0_u64_s64offset, svuint64_t, int16_t, svint64_t, ++ z0_res = svldff1sh_gather_s64offset_u64 (p0, x0, z0), ++ z0_res = svldff1sh_gather_offset_u64 (p0, x0, z0)) ++ ++/* ++** ldff1sh_gather_tied1_u64_s64offset: ++** ldff1sh z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_tied1_u64_s64offset, svuint64_t, int16_t, svint64_t, ++ z0_res = svldff1sh_gather_s64offset_u64 (p0, x0, z0), ++ z0_res = svldff1sh_gather_offset_u64 (p0, x0, z0)) ++ ++/* ++** ldff1sh_gather_untied_u64_s64offset: ++** ldff1sh z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_untied_u64_s64offset, svuint64_t, int16_t, svint64_t, ++ z0_res = svldff1sh_gather_s64offset_u64 (p0, x0, z1), ++ z0_res = svldff1sh_gather_offset_u64 (p0, x0, z1)) ++ ++/* ++** ldff1sh_gather_ext_u64_s64offset: ++** ldff1sh z0\.d, p0/z, \[x0, z1\.d, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_ext_u64_s64offset, svuint64_t, int16_t, svint64_t, ++ z0_res = svldff1sh_gather_s64offset_u64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svldff1sh_gather_offset_u64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ldff1sh_gather_x0_u64_u64offset: ++** ldff1sh z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_x0_u64_u64offset, svuint64_t, int16_t, svuint64_t, ++ z0_res = svldff1sh_gather_u64offset_u64 (p0, x0, z0), ++ z0_res = svldff1sh_gather_offset_u64 (p0, x0, z0)) ++ ++/* ++** ldff1sh_gather_tied1_u64_u64offset: ++** ldff1sh z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_tied1_u64_u64offset, svuint64_t, int16_t, svuint64_t, ++ z0_res = svldff1sh_gather_u64offset_u64 (p0, x0, z0), ++ z0_res = svldff1sh_gather_offset_u64 (p0, x0, z0)) ++ ++/* ++** ldff1sh_gather_untied_u64_u64offset: ++** ldff1sh z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_untied_u64_u64offset, svuint64_t, int16_t, svuint64_t, ++ z0_res = svldff1sh_gather_u64offset_u64 (p0, x0, z1), ++ z0_res = svldff1sh_gather_offset_u64 (p0, x0, z1)) ++ ++/* ++** ldff1sh_gather_ext_u64_u64offset: ++** ldff1sh z0\.d, p0/z, \[x0, z1\.d, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_ext_u64_u64offset, svuint64_t, int16_t, svuint64_t, ++ z0_res = svldff1sh_gather_u64offset_u64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svldff1sh_gather_offset_u64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ldff1sh_gather_x0_u64_s64index: ++** ldff1sh z0\.d, p0/z, \[x0, z0\.d, lsl 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_x0_u64_s64index, 
svuint64_t, int16_t, svint64_t, ++ z0_res = svldff1sh_gather_s64index_u64 (p0, x0, z0), ++ z0_res = svldff1sh_gather_index_u64 (p0, x0, z0)) ++ ++/* ++** ldff1sh_gather_tied1_u64_s64index: ++** ldff1sh z0\.d, p0/z, \[x0, z0\.d, lsl 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_tied1_u64_s64index, svuint64_t, int16_t, svint64_t, ++ z0_res = svldff1sh_gather_s64index_u64 (p0, x0, z0), ++ z0_res = svldff1sh_gather_index_u64 (p0, x0, z0)) ++ ++/* ++** ldff1sh_gather_untied_u64_s64index: ++** ldff1sh z0\.d, p0/z, \[x0, z1\.d, lsl 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_untied_u64_s64index, svuint64_t, int16_t, svint64_t, ++ z0_res = svldff1sh_gather_s64index_u64 (p0, x0, z1), ++ z0_res = svldff1sh_gather_index_u64 (p0, x0, z1)) ++ ++/* ++** ldff1sh_gather_ext_u64_s64index: ++** ldff1sh z0\.d, p0/z, \[x0, z1\.d, sxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_ext_u64_s64index, svuint64_t, int16_t, svint64_t, ++ z0_res = svldff1sh_gather_s64index_u64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svldff1sh_gather_index_u64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ldff1sh_gather_x0_u64_u64index: ++** ldff1sh z0\.d, p0/z, \[x0, z0\.d, lsl 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_x0_u64_u64index, svuint64_t, int16_t, svuint64_t, ++ z0_res = svldff1sh_gather_u64index_u64 (p0, x0, z0), ++ z0_res = svldff1sh_gather_index_u64 (p0, x0, z0)) ++ ++/* ++** ldff1sh_gather_tied1_u64_u64index: ++** ldff1sh z0\.d, p0/z, \[x0, z0\.d, lsl 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_tied1_u64_u64index, svuint64_t, int16_t, svuint64_t, ++ z0_res = svldff1sh_gather_u64index_u64 (p0, x0, z0), ++ z0_res = svldff1sh_gather_index_u64 (p0, x0, z0)) ++ ++/* ++** ldff1sh_gather_untied_u64_u64index: ++** ldff1sh z0\.d, p0/z, \[x0, z1\.d, lsl 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_untied_u64_u64index, svuint64_t, int16_t, svuint64_t, ++ z0_res = svldff1sh_gather_u64index_u64 (p0, x0, z1), ++ z0_res = svldff1sh_gather_index_u64 (p0, x0, z1)) ++ ++/* ++** ldff1sh_gather_ext_u64_u64index: ++** ldff1sh z0\.d, p0/z, \[x0, z1\.d, uxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sh_gather_ext_u64_u64index, svuint64_t, int16_t, svuint64_t, ++ z0_res = svldff1sh_gather_u64index_u64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svldff1sh_gather_index_u64 (p0, x0, svextw_x (p0, z1))) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sh_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sh_s32.c +new file mode 100644 +index 000000000..81b64e836 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sh_s32.c +@@ -0,0 +1,86 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1sh_s32_base: ++** ldff1sh z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sh_s32_base, svint32_t, int16_t, ++ z0 = svldff1sh_s32 (p0, x0), ++ z0 = svldff1sh_s32 (p0, x0)) ++ ++/* ++** ldff1sh_s32_index: ++** ldff1sh z0\.s, p0/z, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_LOAD (ldff1sh_s32_index, svint32_t, int16_t, ++ z0 = svldff1sh_s32 (p0, x0 + x1), ++ z0 = svldff1sh_s32 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1sh_s32_1: ++** inch x0 ++** ldff1sh z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sh_s32_1, svint32_t, int16_t, ++ z0 = svldff1sh_s32 (p0, x0 + svcntw ()), ++ z0 = svldff1sh_s32 (p0, x0 + svcntw ())) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ldff1sh_s32_m1: ++** dech x0 ++** ldff1sh z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sh_s32_m1, svint32_t, int16_t, ++ z0 = svldff1sh_s32 (p0, x0 - svcntw ()), ++ z0 = svldff1sh_s32 (p0, x0 - svcntw ())) ++ ++/* ++** ldff1sh_vnum_s32_0: ++** ldff1sh z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sh_vnum_s32_0, svint32_t, int16_t, ++ z0 = svldff1sh_vnum_s32 (p0, x0, 0), ++ z0 = svldff1sh_vnum_s32 (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1sh_vnum_s32_1: ++** inch x0 ++** ldff1sh z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sh_vnum_s32_1, svint32_t, int16_t, ++ z0 = svldff1sh_vnum_s32 (p0, x0, 1), ++ z0 = svldff1sh_vnum_s32 (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1sh_vnum_s32_m1: ++** dech x0 ++** ldff1sh z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sh_vnum_s32_m1, svint32_t, int16_t, ++ z0 = svldff1sh_vnum_s32 (p0, x0, -1), ++ z0 = svldff1sh_vnum_s32 (p0, x0, -1)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ldff1sh_vnum_s32_x1: ++** cnth (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ldff1sh z0\.s, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldff1sh_vnum_s32_x1, svint32_t, int16_t, ++ z0 = svldff1sh_vnum_s32 (p0, x0, x1), ++ z0 = svldff1sh_vnum_s32 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sh_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sh_s64.c +new file mode 100644 +index 000000000..453b3ff24 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sh_s64.c +@@ -0,0 +1,86 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1sh_s64_base: ++** ldff1sh z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sh_s64_base, svint64_t, int16_t, ++ z0 = svldff1sh_s64 (p0, x0), ++ z0 = svldff1sh_s64 (p0, x0)) ++ ++/* ++** ldff1sh_s64_index: ++** ldff1sh z0\.d, p0/z, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_LOAD (ldff1sh_s64_index, svint64_t, int16_t, ++ z0 = svldff1sh_s64 (p0, x0 + x1), ++ z0 = svldff1sh_s64 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1sh_s64_1: ++** incw x0 ++** ldff1sh z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sh_s64_1, svint64_t, int16_t, ++ z0 = svldff1sh_s64 (p0, x0 + svcntd ()), ++ z0 = svldff1sh_s64 (p0, x0 + svcntd ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1sh_s64_m1: ++** decw x0 ++** ldff1sh z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sh_s64_m1, svint64_t, int16_t, ++ z0 = svldff1sh_s64 (p0, x0 - svcntd ()), ++ z0 = svldff1sh_s64 (p0, x0 - svcntd ())) ++ ++/* ++** ldff1sh_vnum_s64_0: ++** ldff1sh z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sh_vnum_s64_0, svint64_t, int16_t, ++ z0 = svldff1sh_vnum_s64 (p0, x0, 0), ++ z0 = svldff1sh_vnum_s64 (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1sh_vnum_s64_1: ++** incw x0 ++** ldff1sh z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sh_vnum_s64_1, svint64_t, int16_t, ++ z0 = svldff1sh_vnum_s64 (p0, x0, 1), ++ z0 = svldff1sh_vnum_s64 (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ldff1sh_vnum_s64_m1: ++** decw x0 ++** ldff1sh z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sh_vnum_s64_m1, svint64_t, int16_t, ++ z0 = svldff1sh_vnum_s64 (p0, x0, -1), ++ z0 = svldff1sh_vnum_s64 (p0, x0, -1)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ldff1sh_vnum_s64_x1: ++** cntw (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ldff1sh z0\.d, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldff1sh_vnum_s64_x1, svint64_t, int16_t, ++ z0 = svldff1sh_vnum_s64 (p0, x0, x1), ++ z0 = svldff1sh_vnum_s64 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sh_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sh_u32.c +new file mode 100644 +index 000000000..bbbed79dc +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sh_u32.c +@@ -0,0 +1,86 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1sh_u32_base: ++** ldff1sh z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sh_u32_base, svuint32_t, int16_t, ++ z0 = svldff1sh_u32 (p0, x0), ++ z0 = svldff1sh_u32 (p0, x0)) ++ ++/* ++** ldff1sh_u32_index: ++** ldff1sh z0\.s, p0/z, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_LOAD (ldff1sh_u32_index, svuint32_t, int16_t, ++ z0 = svldff1sh_u32 (p0, x0 + x1), ++ z0 = svldff1sh_u32 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1sh_u32_1: ++** inch x0 ++** ldff1sh z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sh_u32_1, svuint32_t, int16_t, ++ z0 = svldff1sh_u32 (p0, x0 + svcntw ()), ++ z0 = svldff1sh_u32 (p0, x0 + svcntw ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1sh_u32_m1: ++** dech x0 ++** ldff1sh z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sh_u32_m1, svuint32_t, int16_t, ++ z0 = svldff1sh_u32 (p0, x0 - svcntw ()), ++ z0 = svldff1sh_u32 (p0, x0 - svcntw ())) ++ ++/* ++** ldff1sh_vnum_u32_0: ++** ldff1sh z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sh_vnum_u32_0, svuint32_t, int16_t, ++ z0 = svldff1sh_vnum_u32 (p0, x0, 0), ++ z0 = svldff1sh_vnum_u32 (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1sh_vnum_u32_1: ++** inch x0 ++** ldff1sh z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sh_vnum_u32_1, svuint32_t, int16_t, ++ z0 = svldff1sh_vnum_u32 (p0, x0, 1), ++ z0 = svldff1sh_vnum_u32 (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1sh_vnum_u32_m1: ++** dech x0 ++** ldff1sh z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sh_vnum_u32_m1, svuint32_t, int16_t, ++ z0 = svldff1sh_vnum_u32 (p0, x0, -1), ++ z0 = svldff1sh_vnum_u32 (p0, x0, -1)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ldff1sh_vnum_u32_x1: ++** cnth (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ldff1sh z0\.s, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldff1sh_vnum_u32_x1, svuint32_t, int16_t, ++ z0 = svldff1sh_vnum_u32 (p0, x0, x1), ++ z0 = svldff1sh_vnum_u32 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sh_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sh_u64.c +new file mode 100644 +index 000000000..5430e256b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sh_u64.c +@@ -0,0 +1,86 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1sh_u64_base: ++** ldff1sh z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sh_u64_base, svuint64_t, int16_t, ++ z0 = svldff1sh_u64 (p0, x0), ++ z0 = svldff1sh_u64 (p0, x0)) ++ ++/* ++** ldff1sh_u64_index: ++** ldff1sh z0\.d, p0/z, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_LOAD (ldff1sh_u64_index, svuint64_t, int16_t, ++ z0 = svldff1sh_u64 (p0, x0 + x1), ++ z0 = svldff1sh_u64 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1sh_u64_1: ++** incw x0 ++** ldff1sh z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sh_u64_1, svuint64_t, int16_t, ++ z0 = svldff1sh_u64 (p0, x0 + svcntd ()), ++ z0 = svldff1sh_u64 (p0, x0 + svcntd ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1sh_u64_m1: ++** decw x0 ++** ldff1sh z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sh_u64_m1, svuint64_t, int16_t, ++ z0 = svldff1sh_u64 (p0, x0 - svcntd ()), ++ z0 = svldff1sh_u64 (p0, x0 - svcntd ())) ++ ++/* ++** ldff1sh_vnum_u64_0: ++** ldff1sh z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sh_vnum_u64_0, svuint64_t, int16_t, ++ z0 = svldff1sh_vnum_u64 (p0, x0, 0), ++ z0 = svldff1sh_vnum_u64 (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1sh_vnum_u64_1: ++** incw x0 ++** ldff1sh z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sh_vnum_u64_1, svuint64_t, int16_t, ++ z0 = svldff1sh_vnum_u64 (p0, x0, 1), ++ z0 = svldff1sh_vnum_u64 (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1sh_vnum_u64_m1: ++** decw x0 ++** ldff1sh z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sh_vnum_u64_m1, svuint64_t, int16_t, ++ z0 = svldff1sh_vnum_u64 (p0, x0, -1), ++ z0 = svldff1sh_vnum_u64 (p0, x0, -1)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ldff1sh_vnum_u64_x1: ++** cntw (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ldff1sh z0\.d, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldff1sh_vnum_u64_x1, svuint64_t, int16_t, ++ z0 = svldff1sh_vnum_u64 (p0, x0, x1), ++ z0 = svldff1sh_vnum_u64 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sw_gather_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sw_gather_s64.c +new file mode 100644 +index 000000000..e5da8a83d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sw_gather_s64.c +@@ -0,0 +1,308 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1sw_gather_s64_tied1: ++** ldff1sw z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sw_gather_s64_tied1, svint64_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64base_s64 (p0, z0), ++ z0_res = svldff1sw_gather_s64 (p0, z0)) ++ ++/* ++** ldff1sw_gather_s64_untied: ++** ldff1sw z0\.d, p0/z, \[z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sw_gather_s64_untied, svint64_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64base_s64 (p0, z1), ++ z0_res = svldff1sw_gather_s64 (p0, z1)) ++ ++/* ++** ldff1sw_gather_x0_s64_offset: ++** ldff1sw z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sw_gather_x0_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64base_offset_s64 (p0, z0, x0), ++ z0_res = svldff1sw_gather_offset_s64 (p0, z0, x0)) ++ ++/* ++** ldff1sw_gather_m4_s64_offset: ++** mov (x[0-9]+), #?-4 ++** ldff1sw z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sw_gather_m4_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64base_offset_s64 (p0, z0, -4), ++ z0_res = svldff1sw_gather_offset_s64 (p0, z0, -4)) ++ ++/* ++** ldff1sw_gather_0_s64_offset: ++** ldff1sw z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sw_gather_0_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64base_offset_s64 (p0, z0, 0), ++ z0_res = svldff1sw_gather_offset_s64 (p0, z0, 0)) ++ ++/* ++** ldff1sw_gather_5_s64_offset: ++** mov (x[0-9]+), #?5 ++** ldff1sw z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sw_gather_5_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64base_offset_s64 (p0, z0, 5), ++ z0_res = svldff1sw_gather_offset_s64 (p0, z0, 5)) ++ ++/* ++** ldff1sw_gather_6_s64_offset: ++** mov (x[0-9]+), #?6 ++** ldff1sw z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sw_gather_6_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64base_offset_s64 (p0, z0, 6), ++ z0_res = svldff1sw_gather_offset_s64 (p0, z0, 6)) ++ ++/* ++** ldff1sw_gather_7_s64_offset: ++** mov (x[0-9]+), #?7 ++** ldff1sw z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sw_gather_7_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64base_offset_s64 (p0, z0, 7), ++ z0_res = svldff1sw_gather_offset_s64 (p0, z0, 7)) ++ ++/* ++** ldff1sw_gather_8_s64_offset: ++** ldff1sw z0\.d, p0/z, \[z0\.d, #8\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sw_gather_8_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64base_offset_s64 (p0, z0, 8), ++ z0_res = svldff1sw_gather_offset_s64 (p0, z0, 8)) ++ ++/* ++** ldff1sw_gather_124_s64_offset: ++** ldff1sw z0\.d, p0/z, \[z0\.d, #124\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sw_gather_124_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64base_offset_s64 (p0, z0, 124), ++ z0_res = svldff1sw_gather_offset_s64 (p0, z0, 124)) ++ ++/* ++** ldff1sw_gather_128_s64_offset: ++** mov (x[0-9]+), #?128 ++** ldff1sw z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sw_gather_128_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64base_offset_s64 (p0, z0, 128), ++ z0_res = svldff1sw_gather_offset_s64 (p0, z0, 128)) ++ ++/* ++** ldff1sw_gather_x0_s64_index: ++** lsl (x[0-9]+), x0, #?2 ++** ldff1sw z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sw_gather_x0_s64_index, svint64_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64base_index_s64 (p0, z0, x0), ++ z0_res = 
svldff1sw_gather_index_s64 (p0, z0, x0)) ++ ++/* ++** ldff1sw_gather_m1_s64_index: ++** mov (x[0-9]+), #?-4 ++** ldff1sw z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sw_gather_m1_s64_index, svint64_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64base_index_s64 (p0, z0, -1), ++ z0_res = svldff1sw_gather_index_s64 (p0, z0, -1)) ++ ++/* ++** ldff1sw_gather_0_s64_index: ++** ldff1sw z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sw_gather_0_s64_index, svint64_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64base_index_s64 (p0, z0, 0), ++ z0_res = svldff1sw_gather_index_s64 (p0, z0, 0)) ++ ++/* ++** ldff1sw_gather_5_s64_index: ++** ldff1sw z0\.d, p0/z, \[z0\.d, #20\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sw_gather_5_s64_index, svint64_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64base_index_s64 (p0, z0, 5), ++ z0_res = svldff1sw_gather_index_s64 (p0, z0, 5)) ++ ++/* ++** ldff1sw_gather_31_s64_index: ++** ldff1sw z0\.d, p0/z, \[z0\.d, #124\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sw_gather_31_s64_index, svint64_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64base_index_s64 (p0, z0, 31), ++ z0_res = svldff1sw_gather_index_s64 (p0, z0, 31)) ++ ++/* ++** ldff1sw_gather_32_s64_index: ++** mov (x[0-9]+), #?128 ++** ldff1sw z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sw_gather_32_s64_index, svint64_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64base_index_s64 (p0, z0, 32), ++ z0_res = svldff1sw_gather_index_s64 (p0, z0, 32)) ++ ++/* ++** ldff1sw_gather_x0_s64_s64offset: ++** ldff1sw z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sw_gather_x0_s64_s64offset, svint64_t, int32_t, svint64_t, ++ z0_res = svldff1sw_gather_s64offset_s64 (p0, x0, z0), ++ z0_res = svldff1sw_gather_offset_s64 (p0, x0, z0)) ++ ++/* ++** ldff1sw_gather_tied1_s64_s64offset: ++** ldff1sw z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sw_gather_tied1_s64_s64offset, svint64_t, int32_t, svint64_t, ++ z0_res = svldff1sw_gather_s64offset_s64 (p0, x0, z0), ++ z0_res = svldff1sw_gather_offset_s64 (p0, x0, z0)) ++ ++/* ++** ldff1sw_gather_untied_s64_s64offset: ++** ldff1sw z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sw_gather_untied_s64_s64offset, svint64_t, int32_t, svint64_t, ++ z0_res = svldff1sw_gather_s64offset_s64 (p0, x0, z1), ++ z0_res = svldff1sw_gather_offset_s64 (p0, x0, z1)) ++ ++/* ++** ldff1sw_gather_ext_s64_s64offset: ++** ldff1sw z0\.d, p0/z, \[x0, z1\.d, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sw_gather_ext_s64_s64offset, svint64_t, int32_t, svint64_t, ++ z0_res = svldff1sw_gather_s64offset_s64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svldff1sw_gather_offset_s64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ldff1sw_gather_x0_s64_u64offset: ++** ldff1sw z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sw_gather_x0_s64_u64offset, svint64_t, int32_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64offset_s64 (p0, x0, z0), ++ z0_res = svldff1sw_gather_offset_s64 (p0, x0, z0)) ++ ++/* ++** ldff1sw_gather_tied1_s64_u64offset: ++** ldff1sw z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sw_gather_tied1_s64_u64offset, svint64_t, int32_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64offset_s64 (p0, x0, z0), ++ z0_res = svldff1sw_gather_offset_s64 (p0, x0, z0)) ++ ++/* ++** ldff1sw_gather_untied_s64_u64offset: ++** ldff1sw z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sw_gather_untied_s64_u64offset, svint64_t, int32_t, 
svuint64_t, ++ z0_res = svldff1sw_gather_u64offset_s64 (p0, x0, z1), ++ z0_res = svldff1sw_gather_offset_s64 (p0, x0, z1)) ++ ++/* ++** ldff1sw_gather_ext_s64_u64offset: ++** ldff1sw z0\.d, p0/z, \[x0, z1\.d, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sw_gather_ext_s64_u64offset, svint64_t, int32_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64offset_s64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svldff1sw_gather_offset_s64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ldff1sw_gather_x0_s64_s64index: ++** ldff1sw z0\.d, p0/z, \[x0, z0\.d, lsl 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sw_gather_x0_s64_s64index, svint64_t, int32_t, svint64_t, ++ z0_res = svldff1sw_gather_s64index_s64 (p0, x0, z0), ++ z0_res = svldff1sw_gather_index_s64 (p0, x0, z0)) ++ ++/* ++** ldff1sw_gather_tied1_s64_s64index: ++** ldff1sw z0\.d, p0/z, \[x0, z0\.d, lsl 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sw_gather_tied1_s64_s64index, svint64_t, int32_t, svint64_t, ++ z0_res = svldff1sw_gather_s64index_s64 (p0, x0, z0), ++ z0_res = svldff1sw_gather_index_s64 (p0, x0, z0)) ++ ++/* ++** ldff1sw_gather_untied_s64_s64index: ++** ldff1sw z0\.d, p0/z, \[x0, z1\.d, lsl 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sw_gather_untied_s64_s64index, svint64_t, int32_t, svint64_t, ++ z0_res = svldff1sw_gather_s64index_s64 (p0, x0, z1), ++ z0_res = svldff1sw_gather_index_s64 (p0, x0, z1)) ++ ++/* ++** ldff1sw_gather_ext_s64_s64index: ++** ldff1sw z0\.d, p0/z, \[x0, z1\.d, sxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sw_gather_ext_s64_s64index, svint64_t, int32_t, svint64_t, ++ z0_res = svldff1sw_gather_s64index_s64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svldff1sw_gather_index_s64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ldff1sw_gather_x0_s64_u64index: ++** ldff1sw z0\.d, p0/z, \[x0, z0\.d, lsl 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sw_gather_x0_s64_u64index, svint64_t, int32_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64index_s64 (p0, x0, z0), ++ z0_res = svldff1sw_gather_index_s64 (p0, x0, z0)) ++ ++/* ++** ldff1sw_gather_tied1_s64_u64index: ++** ldff1sw z0\.d, p0/z, \[x0, z0\.d, lsl 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sw_gather_tied1_s64_u64index, svint64_t, int32_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64index_s64 (p0, x0, z0), ++ z0_res = svldff1sw_gather_index_s64 (p0, x0, z0)) ++ ++/* ++** ldff1sw_gather_untied_s64_u64index: ++** ldff1sw z0\.d, p0/z, \[x0, z1\.d, lsl 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sw_gather_untied_s64_u64index, svint64_t, int32_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64index_s64 (p0, x0, z1), ++ z0_res = svldff1sw_gather_index_s64 (p0, x0, z1)) ++ ++/* ++** ldff1sw_gather_ext_s64_u64index: ++** ldff1sw z0\.d, p0/z, \[x0, z1\.d, uxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sw_gather_ext_s64_u64index, svint64_t, int32_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64index_s64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svldff1sw_gather_index_s64 (p0, x0, svextw_x (p0, z1))) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sw_gather_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sw_gather_u64.c +new file mode 100644 +index 000000000..411428756 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sw_gather_u64.c +@@ -0,0 +1,308 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1sw_gather_u64_tied1: ++** ldff1sw z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sw_gather_u64_tied1, svuint64_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64base_u64 (p0, z0), ++ z0_res = svldff1sw_gather_u64 (p0, z0)) ++ ++/* ++** ldff1sw_gather_u64_untied: ++** ldff1sw z0\.d, p0/z, \[z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sw_gather_u64_untied, svuint64_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64base_u64 (p0, z1), ++ z0_res = svldff1sw_gather_u64 (p0, z1)) ++ ++/* ++** ldff1sw_gather_x0_u64_offset: ++** ldff1sw z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sw_gather_x0_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64base_offset_u64 (p0, z0, x0), ++ z0_res = svldff1sw_gather_offset_u64 (p0, z0, x0)) ++ ++/* ++** ldff1sw_gather_m4_u64_offset: ++** mov (x[0-9]+), #?-4 ++** ldff1sw z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sw_gather_m4_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64base_offset_u64 (p0, z0, -4), ++ z0_res = svldff1sw_gather_offset_u64 (p0, z0, -4)) ++ ++/* ++** ldff1sw_gather_0_u64_offset: ++** ldff1sw z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sw_gather_0_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64base_offset_u64 (p0, z0, 0), ++ z0_res = svldff1sw_gather_offset_u64 (p0, z0, 0)) ++ ++/* ++** ldff1sw_gather_5_u64_offset: ++** mov (x[0-9]+), #?5 ++** ldff1sw z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sw_gather_5_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64base_offset_u64 (p0, z0, 5), ++ z0_res = svldff1sw_gather_offset_u64 (p0, z0, 5)) ++ ++/* ++** ldff1sw_gather_6_u64_offset: ++** mov (x[0-9]+), #?6 ++** ldff1sw z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sw_gather_6_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64base_offset_u64 (p0, z0, 6), ++ z0_res = svldff1sw_gather_offset_u64 (p0, z0, 6)) ++ ++/* ++** ldff1sw_gather_7_u64_offset: ++** mov (x[0-9]+), #?7 ++** ldff1sw z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sw_gather_7_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64base_offset_u64 (p0, z0, 7), ++ z0_res = svldff1sw_gather_offset_u64 (p0, z0, 7)) ++ ++/* ++** ldff1sw_gather_8_u64_offset: ++** ldff1sw z0\.d, p0/z, \[z0\.d, #8\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sw_gather_8_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64base_offset_u64 (p0, z0, 8), ++ z0_res = svldff1sw_gather_offset_u64 (p0, z0, 8)) ++ ++/* ++** ldff1sw_gather_124_u64_offset: ++** ldff1sw z0\.d, p0/z, \[z0\.d, #124\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sw_gather_124_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64base_offset_u64 (p0, z0, 124), ++ z0_res = svldff1sw_gather_offset_u64 (p0, z0, 124)) ++ ++/* ++** ldff1sw_gather_128_u64_offset: ++** mov (x[0-9]+), #?128 ++** ldff1sw z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sw_gather_128_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64base_offset_u64 (p0, z0, 128), ++ z0_res = svldff1sw_gather_offset_u64 (p0, z0, 128)) ++ ++/* ++** ldff1sw_gather_x0_u64_index: ++** lsl (x[0-9]+), x0, #?2 ++** ldff1sw z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sw_gather_x0_u64_index, svuint64_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64base_index_u64 (p0, z0, x0), ++ z0_res = 
svldff1sw_gather_index_u64 (p0, z0, x0)) ++ ++/* ++** ldff1sw_gather_m1_u64_index: ++** mov (x[0-9]+), #?-4 ++** ldff1sw z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sw_gather_m1_u64_index, svuint64_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64base_index_u64 (p0, z0, -1), ++ z0_res = svldff1sw_gather_index_u64 (p0, z0, -1)) ++ ++/* ++** ldff1sw_gather_0_u64_index: ++** ldff1sw z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sw_gather_0_u64_index, svuint64_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64base_index_u64 (p0, z0, 0), ++ z0_res = svldff1sw_gather_index_u64 (p0, z0, 0)) ++ ++/* ++** ldff1sw_gather_5_u64_index: ++** ldff1sw z0\.d, p0/z, \[z0\.d, #20\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sw_gather_5_u64_index, svuint64_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64base_index_u64 (p0, z0, 5), ++ z0_res = svldff1sw_gather_index_u64 (p0, z0, 5)) ++ ++/* ++** ldff1sw_gather_31_u64_index: ++** ldff1sw z0\.d, p0/z, \[z0\.d, #124\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sw_gather_31_u64_index, svuint64_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64base_index_u64 (p0, z0, 31), ++ z0_res = svldff1sw_gather_index_u64 (p0, z0, 31)) ++ ++/* ++** ldff1sw_gather_32_u64_index: ++** mov (x[0-9]+), #?128 ++** ldff1sw z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1sw_gather_32_u64_index, svuint64_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64base_index_u64 (p0, z0, 32), ++ z0_res = svldff1sw_gather_index_u64 (p0, z0, 32)) ++ ++/* ++** ldff1sw_gather_x0_u64_s64offset: ++** ldff1sw z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sw_gather_x0_u64_s64offset, svuint64_t, int32_t, svint64_t, ++ z0_res = svldff1sw_gather_s64offset_u64 (p0, x0, z0), ++ z0_res = svldff1sw_gather_offset_u64 (p0, x0, z0)) ++ ++/* ++** ldff1sw_gather_tied1_u64_s64offset: ++** ldff1sw z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sw_gather_tied1_u64_s64offset, svuint64_t, int32_t, svint64_t, ++ z0_res = svldff1sw_gather_s64offset_u64 (p0, x0, z0), ++ z0_res = svldff1sw_gather_offset_u64 (p0, x0, z0)) ++ ++/* ++** ldff1sw_gather_untied_u64_s64offset: ++** ldff1sw z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sw_gather_untied_u64_s64offset, svuint64_t, int32_t, svint64_t, ++ z0_res = svldff1sw_gather_s64offset_u64 (p0, x0, z1), ++ z0_res = svldff1sw_gather_offset_u64 (p0, x0, z1)) ++ ++/* ++** ldff1sw_gather_ext_u64_s64offset: ++** ldff1sw z0\.d, p0/z, \[x0, z1\.d, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sw_gather_ext_u64_s64offset, svuint64_t, int32_t, svint64_t, ++ z0_res = svldff1sw_gather_s64offset_u64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svldff1sw_gather_offset_u64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ldff1sw_gather_x0_u64_u64offset: ++** ldff1sw z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sw_gather_x0_u64_u64offset, svuint64_t, int32_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64offset_u64 (p0, x0, z0), ++ z0_res = svldff1sw_gather_offset_u64 (p0, x0, z0)) ++ ++/* ++** ldff1sw_gather_tied1_u64_u64offset: ++** ldff1sw z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sw_gather_tied1_u64_u64offset, svuint64_t, int32_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64offset_u64 (p0, x0, z0), ++ z0_res = svldff1sw_gather_offset_u64 (p0, x0, z0)) ++ ++/* ++** ldff1sw_gather_untied_u64_u64offset: ++** ldff1sw z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sw_gather_untied_u64_u64offset, svuint64_t, 
int32_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64offset_u64 (p0, x0, z1), ++ z0_res = svldff1sw_gather_offset_u64 (p0, x0, z1)) ++ ++/* ++** ldff1sw_gather_ext_u64_u64offset: ++** ldff1sw z0\.d, p0/z, \[x0, z1\.d, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sw_gather_ext_u64_u64offset, svuint64_t, int32_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64offset_u64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svldff1sw_gather_offset_u64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ldff1sw_gather_x0_u64_s64index: ++** ldff1sw z0\.d, p0/z, \[x0, z0\.d, lsl 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sw_gather_x0_u64_s64index, svuint64_t, int32_t, svint64_t, ++ z0_res = svldff1sw_gather_s64index_u64 (p0, x0, z0), ++ z0_res = svldff1sw_gather_index_u64 (p0, x0, z0)) ++ ++/* ++** ldff1sw_gather_tied1_u64_s64index: ++** ldff1sw z0\.d, p0/z, \[x0, z0\.d, lsl 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sw_gather_tied1_u64_s64index, svuint64_t, int32_t, svint64_t, ++ z0_res = svldff1sw_gather_s64index_u64 (p0, x0, z0), ++ z0_res = svldff1sw_gather_index_u64 (p0, x0, z0)) ++ ++/* ++** ldff1sw_gather_untied_u64_s64index: ++** ldff1sw z0\.d, p0/z, \[x0, z1\.d, lsl 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sw_gather_untied_u64_s64index, svuint64_t, int32_t, svint64_t, ++ z0_res = svldff1sw_gather_s64index_u64 (p0, x0, z1), ++ z0_res = svldff1sw_gather_index_u64 (p0, x0, z1)) ++ ++/* ++** ldff1sw_gather_ext_u64_s64index: ++** ldff1sw z0\.d, p0/z, \[x0, z1\.d, sxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sw_gather_ext_u64_s64index, svuint64_t, int32_t, svint64_t, ++ z0_res = svldff1sw_gather_s64index_u64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svldff1sw_gather_index_u64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ldff1sw_gather_x0_u64_u64index: ++** ldff1sw z0\.d, p0/z, \[x0, z0\.d, lsl 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sw_gather_x0_u64_u64index, svuint64_t, int32_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64index_u64 (p0, x0, z0), ++ z0_res = svldff1sw_gather_index_u64 (p0, x0, z0)) ++ ++/* ++** ldff1sw_gather_tied1_u64_u64index: ++** ldff1sw z0\.d, p0/z, \[x0, z0\.d, lsl 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sw_gather_tied1_u64_u64index, svuint64_t, int32_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64index_u64 (p0, x0, z0), ++ z0_res = svldff1sw_gather_index_u64 (p0, x0, z0)) ++ ++/* ++** ldff1sw_gather_untied_u64_u64index: ++** ldff1sw z0\.d, p0/z, \[x0, z1\.d, lsl 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sw_gather_untied_u64_u64index, svuint64_t, int32_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64index_u64 (p0, x0, z1), ++ z0_res = svldff1sw_gather_index_u64 (p0, x0, z1)) ++ ++/* ++** ldff1sw_gather_ext_u64_u64index: ++** ldff1sw z0\.d, p0/z, \[x0, z1\.d, uxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1sw_gather_ext_u64_u64index, svuint64_t, int32_t, svuint64_t, ++ z0_res = svldff1sw_gather_u64index_u64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svldff1sw_gather_index_u64 (p0, x0, svextw_x (p0, z1))) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sw_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sw_s64.c +new file mode 100644 +index 000000000..d795ace63 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sw_s64.c +@@ -0,0 +1,86 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1sw_s64_base: ++** ldff1sw z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sw_s64_base, svint64_t, int32_t, ++ z0 = svldff1sw_s64 (p0, x0), ++ z0 = svldff1sw_s64 (p0, x0)) ++ ++/* ++** ldff1sw_s64_index: ++** ldff1sw z0\.d, p0/z, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_LOAD (ldff1sw_s64_index, svint64_t, int32_t, ++ z0 = svldff1sw_s64 (p0, x0 + x1), ++ z0 = svldff1sw_s64 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1sw_s64_1: ++** inch x0 ++** ldff1sw z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sw_s64_1, svint64_t, int32_t, ++ z0 = svldff1sw_s64 (p0, x0 + svcntd ()), ++ z0 = svldff1sw_s64 (p0, x0 + svcntd ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1sw_s64_m1: ++** dech x0 ++** ldff1sw z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sw_s64_m1, svint64_t, int32_t, ++ z0 = svldff1sw_s64 (p0, x0 - svcntd ()), ++ z0 = svldff1sw_s64 (p0, x0 - svcntd ())) ++ ++/* ++** ldff1sw_vnum_s64_0: ++** ldff1sw z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sw_vnum_s64_0, svint64_t, int32_t, ++ z0 = svldff1sw_vnum_s64 (p0, x0, 0), ++ z0 = svldff1sw_vnum_s64 (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1sw_vnum_s64_1: ++** inch x0 ++** ldff1sw z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sw_vnum_s64_1, svint64_t, int32_t, ++ z0 = svldff1sw_vnum_s64 (p0, x0, 1), ++ z0 = svldff1sw_vnum_s64 (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1sw_vnum_s64_m1: ++** dech x0 ++** ldff1sw z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sw_vnum_s64_m1, svint64_t, int32_t, ++ z0 = svldff1sw_vnum_s64 (p0, x0, -1), ++ z0 = svldff1sw_vnum_s64 (p0, x0, -1)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ldff1sw_vnum_s64_x1: ++** cnth (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ldff1sw z0\.d, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldff1sw_vnum_s64_x1, svint64_t, int32_t, ++ z0 = svldff1sw_vnum_s64 (p0, x0, x1), ++ z0 = svldff1sw_vnum_s64 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sw_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sw_u64.c +new file mode 100644 +index 000000000..6caf2f504 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sw_u64.c +@@ -0,0 +1,86 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1sw_u64_base: ++** ldff1sw z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sw_u64_base, svuint64_t, int32_t, ++ z0 = svldff1sw_u64 (p0, x0), ++ z0 = svldff1sw_u64 (p0, x0)) ++ ++/* ++** ldff1sw_u64_index: ++** ldff1sw z0\.d, p0/z, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_LOAD (ldff1sw_u64_index, svuint64_t, int32_t, ++ z0 = svldff1sw_u64 (p0, x0 + x1), ++ z0 = svldff1sw_u64 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1sw_u64_1: ++** inch x0 ++** ldff1sw z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sw_u64_1, svuint64_t, int32_t, ++ z0 = svldff1sw_u64 (p0, x0 + svcntd ()), ++ z0 = svldff1sw_u64 (p0, x0 + svcntd ())) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ldff1sw_u64_m1: ++** dech x0 ++** ldff1sw z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sw_u64_m1, svuint64_t, int32_t, ++ z0 = svldff1sw_u64 (p0, x0 - svcntd ()), ++ z0 = svldff1sw_u64 (p0, x0 - svcntd ())) ++ ++/* ++** ldff1sw_vnum_u64_0: ++** ldff1sw z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sw_vnum_u64_0, svuint64_t, int32_t, ++ z0 = svldff1sw_vnum_u64 (p0, x0, 0), ++ z0 = svldff1sw_vnum_u64 (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1sw_vnum_u64_1: ++** inch x0 ++** ldff1sw z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sw_vnum_u64_1, svuint64_t, int32_t, ++ z0 = svldff1sw_vnum_u64 (p0, x0, 1), ++ z0 = svldff1sw_vnum_u64 (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1sw_vnum_u64_m1: ++** dech x0 ++** ldff1sw z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1sw_vnum_u64_m1, svuint64_t, int32_t, ++ z0 = svldff1sw_vnum_u64 (p0, x0, -1), ++ z0 = svldff1sw_vnum_u64 (p0, x0, -1)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ldff1sw_vnum_u64_x1: ++** cnth (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ldff1sw z0\.d, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldff1sw_vnum_u64_x1, svuint64_t, int32_t, ++ z0 = svldff1sw_vnum_u64 (p0, x0, x1), ++ z0 = svldff1sw_vnum_u64 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_gather_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_gather_s32.c +new file mode 100644 +index 000000000..af0be08d2 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_gather_s32.c +@@ -0,0 +1,131 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1ub_gather_s32_tied1: ++** ldff1b z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1ub_gather_s32_tied1, svint32_t, svuint32_t, ++ z0_res = svldff1ub_gather_u32base_s32 (p0, z0), ++ z0_res = svldff1ub_gather_s32 (p0, z0)) ++ ++/* ++** ldff1ub_gather_s32_untied: ++** ldff1b z0\.s, p0/z, \[z1\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1ub_gather_s32_untied, svint32_t, svuint32_t, ++ z0_res = svldff1ub_gather_u32base_s32 (p0, z1), ++ z0_res = svldff1ub_gather_s32 (p0, z1)) ++ ++/* ++** ldff1ub_gather_x0_s32_offset: ++** ldff1b z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1ub_gather_x0_s32_offset, svint32_t, svuint32_t, ++ z0_res = svldff1ub_gather_u32base_offset_s32 (p0, z0, x0), ++ z0_res = svldff1ub_gather_offset_s32 (p0, z0, x0)) ++ ++/* ++** ldff1ub_gather_m1_s32_offset: ++** mov (x[0-9]+), #?-1 ++** ldff1b z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1ub_gather_m1_s32_offset, svint32_t, svuint32_t, ++ z0_res = svldff1ub_gather_u32base_offset_s32 (p0, z0, -1), ++ z0_res = svldff1ub_gather_offset_s32 (p0, z0, -1)) ++ ++/* ++** ldff1ub_gather_0_s32_offset: ++** ldff1b z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1ub_gather_0_s32_offset, svint32_t, svuint32_t, ++ z0_res = svldff1ub_gather_u32base_offset_s32 (p0, z0, 0), ++ z0_res = svldff1ub_gather_offset_s32 (p0, z0, 0)) ++ ++/* ++** ldff1ub_gather_5_s32_offset: ++** ldff1b z0\.s, p0/z, \[z0\.s, #5\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1ub_gather_5_s32_offset, svint32_t, svuint32_t, ++ z0_res = svldff1ub_gather_u32base_offset_s32 (p0, z0, 5), ++ z0_res = svldff1ub_gather_offset_s32 (p0, z0, 5)) ++ ++/* ++** ldff1ub_gather_31_s32_offset: ++** ldff1b z0\.s, p0/z, 
\[z0\.s, #31\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1ub_gather_31_s32_offset, svint32_t, svuint32_t, ++ z0_res = svldff1ub_gather_u32base_offset_s32 (p0, z0, 31), ++ z0_res = svldff1ub_gather_offset_s32 (p0, z0, 31)) ++ ++/* ++** ldff1ub_gather_32_s32_offset: ++** mov (x[0-9]+), #?32 ++** ldff1b z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1ub_gather_32_s32_offset, svint32_t, svuint32_t, ++ z0_res = svldff1ub_gather_u32base_offset_s32 (p0, z0, 32), ++ z0_res = svldff1ub_gather_offset_s32 (p0, z0, 32)) ++ ++/* ++** ldff1ub_gather_x0_s32_s32offset: ++** ldff1b z0\.s, p0/z, \[x0, z0\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1ub_gather_x0_s32_s32offset, svint32_t, uint8_t, svint32_t, ++ z0_res = svldff1ub_gather_s32offset_s32 (p0, x0, z0), ++ z0_res = svldff1ub_gather_offset_s32 (p0, x0, z0)) ++ ++/* ++** ldff1ub_gather_tied1_s32_s32offset: ++** ldff1b z0\.s, p0/z, \[x0, z0\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1ub_gather_tied1_s32_s32offset, svint32_t, uint8_t, svint32_t, ++ z0_res = svldff1ub_gather_s32offset_s32 (p0, x0, z0), ++ z0_res = svldff1ub_gather_offset_s32 (p0, x0, z0)) ++ ++/* ++** ldff1ub_gather_untied_s32_s32offset: ++** ldff1b z0\.s, p0/z, \[x0, z1\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1ub_gather_untied_s32_s32offset, svint32_t, uint8_t, svint32_t, ++ z0_res = svldff1ub_gather_s32offset_s32 (p0, x0, z1), ++ z0_res = svldff1ub_gather_offset_s32 (p0, x0, z1)) ++ ++/* ++** ldff1ub_gather_x0_s32_u32offset: ++** ldff1b z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1ub_gather_x0_s32_u32offset, svint32_t, uint8_t, svuint32_t, ++ z0_res = svldff1ub_gather_u32offset_s32 (p0, x0, z0), ++ z0_res = svldff1ub_gather_offset_s32 (p0, x0, z0)) ++ ++/* ++** ldff1ub_gather_tied1_s32_u32offset: ++** ldff1b z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1ub_gather_tied1_s32_u32offset, svint32_t, uint8_t, svuint32_t, ++ z0_res = svldff1ub_gather_u32offset_s32 (p0, x0, z0), ++ z0_res = svldff1ub_gather_offset_s32 (p0, x0, z0)) ++ ++/* ++** ldff1ub_gather_untied_s32_u32offset: ++** ldff1b z0\.s, p0/z, \[x0, z1\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1ub_gather_untied_s32_u32offset, svint32_t, uint8_t, svuint32_t, ++ z0_res = svldff1ub_gather_u32offset_s32 (p0, x0, z1), ++ z0_res = svldff1ub_gather_offset_s32 (p0, x0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_gather_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_gather_s64.c +new file mode 100644 +index 000000000..43124dd89 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_gather_s64.c +@@ -0,0 +1,149 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1ub_gather_s64_tied1: ++** ldff1b z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1ub_gather_s64_tied1, svint64_t, svuint64_t, ++ z0_res = svldff1ub_gather_u64base_s64 (p0, z0), ++ z0_res = svldff1ub_gather_s64 (p0, z0)) ++ ++/* ++** ldff1ub_gather_s64_untied: ++** ldff1b z0\.d, p0/z, \[z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1ub_gather_s64_untied, svint64_t, svuint64_t, ++ z0_res = svldff1ub_gather_u64base_s64 (p0, z1), ++ z0_res = svldff1ub_gather_s64 (p0, z1)) ++ ++/* ++** ldff1ub_gather_x0_s64_offset: ++** ldff1b z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1ub_gather_x0_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1ub_gather_u64base_offset_s64 (p0, z0, x0), ++ z0_res = svldff1ub_gather_offset_s64 (p0, z0, x0)) ++ ++/* ++** ldff1ub_gather_m1_s64_offset: ++** mov (x[0-9]+), #?-1 ++** ldff1b z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1ub_gather_m1_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1ub_gather_u64base_offset_s64 (p0, z0, -1), ++ z0_res = svldff1ub_gather_offset_s64 (p0, z0, -1)) ++ ++/* ++** ldff1ub_gather_0_s64_offset: ++** ldff1b z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1ub_gather_0_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1ub_gather_u64base_offset_s64 (p0, z0, 0), ++ z0_res = svldff1ub_gather_offset_s64 (p0, z0, 0)) ++ ++/* ++** ldff1ub_gather_5_s64_offset: ++** ldff1b z0\.d, p0/z, \[z0\.d, #5\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1ub_gather_5_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1ub_gather_u64base_offset_s64 (p0, z0, 5), ++ z0_res = svldff1ub_gather_offset_s64 (p0, z0, 5)) ++ ++/* ++** ldff1ub_gather_31_s64_offset: ++** ldff1b z0\.d, p0/z, \[z0\.d, #31\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1ub_gather_31_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1ub_gather_u64base_offset_s64 (p0, z0, 31), ++ z0_res = svldff1ub_gather_offset_s64 (p0, z0, 31)) ++ ++/* ++** ldff1ub_gather_32_s64_offset: ++** mov (x[0-9]+), #?32 ++** ldff1b z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1ub_gather_32_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1ub_gather_u64base_offset_s64 (p0, z0, 32), ++ z0_res = svldff1ub_gather_offset_s64 (p0, z0, 32)) ++ ++/* ++** ldff1ub_gather_x0_s64_s64offset: ++** ldff1b z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1ub_gather_x0_s64_s64offset, svint64_t, uint8_t, svint64_t, ++ z0_res = svldff1ub_gather_s64offset_s64 (p0, x0, z0), ++ z0_res = svldff1ub_gather_offset_s64 (p0, x0, z0)) ++ ++/* ++** ldff1ub_gather_tied1_s64_s64offset: ++** ldff1b z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1ub_gather_tied1_s64_s64offset, svint64_t, uint8_t, svint64_t, ++ z0_res = svldff1ub_gather_s64offset_s64 (p0, x0, z0), ++ z0_res = svldff1ub_gather_offset_s64 (p0, x0, z0)) ++ ++/* ++** ldff1ub_gather_untied_s64_s64offset: ++** ldff1b z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1ub_gather_untied_s64_s64offset, svint64_t, uint8_t, svint64_t, ++ z0_res = svldff1ub_gather_s64offset_s64 (p0, x0, z1), ++ z0_res = svldff1ub_gather_offset_s64 (p0, x0, z1)) ++ ++/* ++** ldff1ub_gather_ext_s64_s64offset: ++** ldff1b z0\.d, p0/z, \[x0, z1\.d, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1ub_gather_ext_s64_s64offset, svint64_t, uint8_t, svint64_t, ++ z0_res = svldff1ub_gather_s64offset_s64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svldff1ub_gather_offset_s64 (p0, x0, 
svextw_x (p0, z1))) ++ ++/* ++** ldff1ub_gather_x0_s64_u64offset: ++** ldff1b z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1ub_gather_x0_s64_u64offset, svint64_t, uint8_t, svuint64_t, ++ z0_res = svldff1ub_gather_u64offset_s64 (p0, x0, z0), ++ z0_res = svldff1ub_gather_offset_s64 (p0, x0, z0)) ++ ++/* ++** ldff1ub_gather_tied1_s64_u64offset: ++** ldff1b z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1ub_gather_tied1_s64_u64offset, svint64_t, uint8_t, svuint64_t, ++ z0_res = svldff1ub_gather_u64offset_s64 (p0, x0, z0), ++ z0_res = svldff1ub_gather_offset_s64 (p0, x0, z0)) ++ ++/* ++** ldff1ub_gather_untied_s64_u64offset: ++** ldff1b z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1ub_gather_untied_s64_u64offset, svint64_t, uint8_t, svuint64_t, ++ z0_res = svldff1ub_gather_u64offset_s64 (p0, x0, z1), ++ z0_res = svldff1ub_gather_offset_s64 (p0, x0, z1)) ++ ++/* ++** ldff1ub_gather_ext_s64_u64offset: ++** ldff1b z0\.d, p0/z, \[x0, z1\.d, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1ub_gather_ext_s64_u64offset, svint64_t, uint8_t, svuint64_t, ++ z0_res = svldff1ub_gather_u64offset_s64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svldff1ub_gather_offset_s64 (p0, x0, svextw_x (p0, z1))) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_gather_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_gather_u32.c +new file mode 100644 +index 000000000..90c4e58a2 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_gather_u32.c +@@ -0,0 +1,131 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1ub_gather_u32_tied1: ++** ldff1b z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1ub_gather_u32_tied1, svuint32_t, svuint32_t, ++ z0_res = svldff1ub_gather_u32base_u32 (p0, z0), ++ z0_res = svldff1ub_gather_u32 (p0, z0)) ++ ++/* ++** ldff1ub_gather_u32_untied: ++** ldff1b z0\.s, p0/z, \[z1\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1ub_gather_u32_untied, svuint32_t, svuint32_t, ++ z0_res = svldff1ub_gather_u32base_u32 (p0, z1), ++ z0_res = svldff1ub_gather_u32 (p0, z1)) ++ ++/* ++** ldff1ub_gather_x0_u32_offset: ++** ldff1b z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1ub_gather_x0_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svldff1ub_gather_u32base_offset_u32 (p0, z0, x0), ++ z0_res = svldff1ub_gather_offset_u32 (p0, z0, x0)) ++ ++/* ++** ldff1ub_gather_m1_u32_offset: ++** mov (x[0-9]+), #?-1 ++** ldff1b z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1ub_gather_m1_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svldff1ub_gather_u32base_offset_u32 (p0, z0, -1), ++ z0_res = svldff1ub_gather_offset_u32 (p0, z0, -1)) ++ ++/* ++** ldff1ub_gather_0_u32_offset: ++** ldff1b z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1ub_gather_0_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svldff1ub_gather_u32base_offset_u32 (p0, z0, 0), ++ z0_res = svldff1ub_gather_offset_u32 (p0, z0, 0)) ++ ++/* ++** ldff1ub_gather_5_u32_offset: ++** ldff1b z0\.s, p0/z, \[z0\.s, #5\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1ub_gather_5_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svldff1ub_gather_u32base_offset_u32 (p0, z0, 5), ++ z0_res = svldff1ub_gather_offset_u32 (p0, z0, 5)) ++ ++/* ++** ldff1ub_gather_31_u32_offset: ++** ldff1b z0\.s, p0/z, \[z0\.s, #31\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1ub_gather_31_u32_offset, svuint32_t, 
svuint32_t, ++ z0_res = svldff1ub_gather_u32base_offset_u32 (p0, z0, 31), ++ z0_res = svldff1ub_gather_offset_u32 (p0, z0, 31)) ++ ++/* ++** ldff1ub_gather_32_u32_offset: ++** mov (x[0-9]+), #?32 ++** ldff1b z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1ub_gather_32_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svldff1ub_gather_u32base_offset_u32 (p0, z0, 32), ++ z0_res = svldff1ub_gather_offset_u32 (p0, z0, 32)) ++ ++/* ++** ldff1ub_gather_x0_u32_s32offset: ++** ldff1b z0\.s, p0/z, \[x0, z0\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1ub_gather_x0_u32_s32offset, svuint32_t, uint8_t, svint32_t, ++ z0_res = svldff1ub_gather_s32offset_u32 (p0, x0, z0), ++ z0_res = svldff1ub_gather_offset_u32 (p0, x0, z0)) ++ ++/* ++** ldff1ub_gather_tied1_u32_s32offset: ++** ldff1b z0\.s, p0/z, \[x0, z0\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1ub_gather_tied1_u32_s32offset, svuint32_t, uint8_t, svint32_t, ++ z0_res = svldff1ub_gather_s32offset_u32 (p0, x0, z0), ++ z0_res = svldff1ub_gather_offset_u32 (p0, x0, z0)) ++ ++/* ++** ldff1ub_gather_untied_u32_s32offset: ++** ldff1b z0\.s, p0/z, \[x0, z1\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1ub_gather_untied_u32_s32offset, svuint32_t, uint8_t, svint32_t, ++ z0_res = svldff1ub_gather_s32offset_u32 (p0, x0, z1), ++ z0_res = svldff1ub_gather_offset_u32 (p0, x0, z1)) ++ ++/* ++** ldff1ub_gather_x0_u32_u32offset: ++** ldff1b z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1ub_gather_x0_u32_u32offset, svuint32_t, uint8_t, svuint32_t, ++ z0_res = svldff1ub_gather_u32offset_u32 (p0, x0, z0), ++ z0_res = svldff1ub_gather_offset_u32 (p0, x0, z0)) ++ ++/* ++** ldff1ub_gather_tied1_u32_u32offset: ++** ldff1b z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1ub_gather_tied1_u32_u32offset, svuint32_t, uint8_t, svuint32_t, ++ z0_res = svldff1ub_gather_u32offset_u32 (p0, x0, z0), ++ z0_res = svldff1ub_gather_offset_u32 (p0, x0, z0)) ++ ++/* ++** ldff1ub_gather_untied_u32_u32offset: ++** ldff1b z0\.s, p0/z, \[x0, z1\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1ub_gather_untied_u32_u32offset, svuint32_t, uint8_t, svuint32_t, ++ z0_res = svldff1ub_gather_u32offset_u32 (p0, x0, z1), ++ z0_res = svldff1ub_gather_offset_u32 (p0, x0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_gather_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_gather_u64.c +new file mode 100644 +index 000000000..302623a40 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_gather_u64.c +@@ -0,0 +1,149 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1ub_gather_u64_tied1: ++** ldff1b z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1ub_gather_u64_tied1, svuint64_t, svuint64_t, ++ z0_res = svldff1ub_gather_u64base_u64 (p0, z0), ++ z0_res = svldff1ub_gather_u64 (p0, z0)) ++ ++/* ++** ldff1ub_gather_u64_untied: ++** ldff1b z0\.d, p0/z, \[z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1ub_gather_u64_untied, svuint64_t, svuint64_t, ++ z0_res = svldff1ub_gather_u64base_u64 (p0, z1), ++ z0_res = svldff1ub_gather_u64 (p0, z1)) ++ ++/* ++** ldff1ub_gather_x0_u64_offset: ++** ldff1b z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1ub_gather_x0_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1ub_gather_u64base_offset_u64 (p0, z0, x0), ++ z0_res = svldff1ub_gather_offset_u64 (p0, z0, x0)) ++ ++/* ++** ldff1ub_gather_m1_u64_offset: ++** mov (x[0-9]+), #?-1 ++** ldff1b z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1ub_gather_m1_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1ub_gather_u64base_offset_u64 (p0, z0, -1), ++ z0_res = svldff1ub_gather_offset_u64 (p0, z0, -1)) ++ ++/* ++** ldff1ub_gather_0_u64_offset: ++** ldff1b z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1ub_gather_0_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1ub_gather_u64base_offset_u64 (p0, z0, 0), ++ z0_res = svldff1ub_gather_offset_u64 (p0, z0, 0)) ++ ++/* ++** ldff1ub_gather_5_u64_offset: ++** ldff1b z0\.d, p0/z, \[z0\.d, #5\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1ub_gather_5_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1ub_gather_u64base_offset_u64 (p0, z0, 5), ++ z0_res = svldff1ub_gather_offset_u64 (p0, z0, 5)) ++ ++/* ++** ldff1ub_gather_31_u64_offset: ++** ldff1b z0\.d, p0/z, \[z0\.d, #31\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1ub_gather_31_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1ub_gather_u64base_offset_u64 (p0, z0, 31), ++ z0_res = svldff1ub_gather_offset_u64 (p0, z0, 31)) ++ ++/* ++** ldff1ub_gather_32_u64_offset: ++** mov (x[0-9]+), #?32 ++** ldff1b z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1ub_gather_32_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1ub_gather_u64base_offset_u64 (p0, z0, 32), ++ z0_res = svldff1ub_gather_offset_u64 (p0, z0, 32)) ++ ++/* ++** ldff1ub_gather_x0_u64_s64offset: ++** ldff1b z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1ub_gather_x0_u64_s64offset, svuint64_t, uint8_t, svint64_t, ++ z0_res = svldff1ub_gather_s64offset_u64 (p0, x0, z0), ++ z0_res = svldff1ub_gather_offset_u64 (p0, x0, z0)) ++ ++/* ++** ldff1ub_gather_tied1_u64_s64offset: ++** ldff1b z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1ub_gather_tied1_u64_s64offset, svuint64_t, uint8_t, svint64_t, ++ z0_res = svldff1ub_gather_s64offset_u64 (p0, x0, z0), ++ z0_res = svldff1ub_gather_offset_u64 (p0, x0, z0)) ++ ++/* ++** ldff1ub_gather_untied_u64_s64offset: ++** ldff1b z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1ub_gather_untied_u64_s64offset, svuint64_t, uint8_t, svint64_t, ++ z0_res = svldff1ub_gather_s64offset_u64 (p0, x0, z1), ++ z0_res = svldff1ub_gather_offset_u64 (p0, x0, z1)) ++ ++/* ++** ldff1ub_gather_ext_u64_s64offset: ++** ldff1b z0\.d, p0/z, \[x0, z1\.d, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1ub_gather_ext_u64_s64offset, svuint64_t, uint8_t, svint64_t, ++ z0_res = svldff1ub_gather_s64offset_u64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svldff1ub_gather_offset_u64 
(p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ldff1ub_gather_x0_u64_u64offset: ++** ldff1b z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1ub_gather_x0_u64_u64offset, svuint64_t, uint8_t, svuint64_t, ++ z0_res = svldff1ub_gather_u64offset_u64 (p0, x0, z0), ++ z0_res = svldff1ub_gather_offset_u64 (p0, x0, z0)) ++ ++/* ++** ldff1ub_gather_tied1_u64_u64offset: ++** ldff1b z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1ub_gather_tied1_u64_u64offset, svuint64_t, uint8_t, svuint64_t, ++ z0_res = svldff1ub_gather_u64offset_u64 (p0, x0, z0), ++ z0_res = svldff1ub_gather_offset_u64 (p0, x0, z0)) ++ ++/* ++** ldff1ub_gather_untied_u64_u64offset: ++** ldff1b z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1ub_gather_untied_u64_u64offset, svuint64_t, uint8_t, svuint64_t, ++ z0_res = svldff1ub_gather_u64offset_u64 (p0, x0, z1), ++ z0_res = svldff1ub_gather_offset_u64 (p0, x0, z1)) ++ ++/* ++** ldff1ub_gather_ext_u64_u64offset: ++** ldff1b z0\.d, p0/z, \[x0, z1\.d, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1ub_gather_ext_u64_u64offset, svuint64_t, uint8_t, svuint64_t, ++ z0_res = svldff1ub_gather_u64offset_u64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svldff1ub_gather_offset_u64 (p0, x0, svextw_x (p0, z1))) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_s16.c +new file mode 100644 +index 000000000..88ad2d1dc +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_s16.c +@@ -0,0 +1,90 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1ub_s16_base: ++** ldff1b z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1ub_s16_base, svint16_t, uint8_t, ++ z0 = svldff1ub_s16 (p0, x0), ++ z0 = svldff1ub_s16 (p0, x0)) ++ ++/* ++** ldff1ub_s16_index: ++** ldff1b z0\.h, p0/z, \[x0, x1\] ++** ret ++*/ ++TEST_LOAD (ldff1ub_s16_index, svint16_t, uint8_t, ++ z0 = svldff1ub_s16 (p0, x0 + x1), ++ z0 = svldff1ub_s16 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1ub_s16_1: ++** inch x0 ++** ldff1b z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1ub_s16_1, svint16_t, uint8_t, ++ z0 = svldff1ub_s16 (p0, x0 + svcnth ()), ++ z0 = svldff1ub_s16 (p0, x0 + svcnth ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1ub_s16_m1: ++** dech x0 ++** ldff1b z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1ub_s16_m1, svint16_t, uint8_t, ++ z0 = svldff1ub_s16 (p0, x0 - svcnth ()), ++ z0 = svldff1ub_s16 (p0, x0 - svcnth ())) ++ ++/* ++** ldff1ub_vnum_s16_0: ++** ldff1b z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1ub_vnum_s16_0, svint16_t, uint8_t, ++ z0 = svldff1ub_vnum_s16 (p0, x0, 0), ++ z0 = svldff1ub_vnum_s16 (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1ub_vnum_s16_1: ++** inch x0 ++** ldff1b z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1ub_vnum_s16_1, svint16_t, uint8_t, ++ z0 = svldff1ub_vnum_s16 (p0, x0, 1), ++ z0 = svldff1ub_vnum_s16 (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ldff1ub_vnum_s16_m1: ++** dech x0 ++** ldff1b z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1ub_vnum_s16_m1, svint16_t, uint8_t, ++ z0 = svldff1ub_vnum_s16 (p0, x0, -1), ++ z0 = svldff1ub_vnum_s16 (p0, x0, -1)) ++ ++/* ++** ldff1ub_vnum_s16_x1: ++** cnth (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldff1b z0\.h, p0/z, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** ldff1b z0\.h, p0/z, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_LOAD (ldff1ub_vnum_s16_x1, svint16_t, uint8_t, ++ z0 = svldff1ub_vnum_s16 (p0, x0, x1), ++ z0 = svldff1ub_vnum_s16 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_s32.c +new file mode 100644 +index 000000000..e8e06411f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_s32.c +@@ -0,0 +1,90 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1ub_s32_base: ++** ldff1b z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1ub_s32_base, svint32_t, uint8_t, ++ z0 = svldff1ub_s32 (p0, x0), ++ z0 = svldff1ub_s32 (p0, x0)) ++ ++/* ++** ldff1ub_s32_index: ++** ldff1b z0\.s, p0/z, \[x0, x1\] ++** ret ++*/ ++TEST_LOAD (ldff1ub_s32_index, svint32_t, uint8_t, ++ z0 = svldff1ub_s32 (p0, x0 + x1), ++ z0 = svldff1ub_s32 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1ub_s32_1: ++** incw x0 ++** ldff1b z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1ub_s32_1, svint32_t, uint8_t, ++ z0 = svldff1ub_s32 (p0, x0 + svcntw ()), ++ z0 = svldff1ub_s32 (p0, x0 + svcntw ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1ub_s32_m1: ++** decw x0 ++** ldff1b z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1ub_s32_m1, svint32_t, uint8_t, ++ z0 = svldff1ub_s32 (p0, x0 - svcntw ()), ++ z0 = svldff1ub_s32 (p0, x0 - svcntw ())) ++ ++/* ++** ldff1ub_vnum_s32_0: ++** ldff1b z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1ub_vnum_s32_0, svint32_t, uint8_t, ++ z0 = svldff1ub_vnum_s32 (p0, x0, 0), ++ z0 = svldff1ub_vnum_s32 (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1ub_vnum_s32_1: ++** incw x0 ++** ldff1b z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1ub_vnum_s32_1, svint32_t, uint8_t, ++ z0 = svldff1ub_vnum_s32 (p0, x0, 1), ++ z0 = svldff1ub_vnum_s32 (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1ub_vnum_s32_m1: ++** decw x0 ++** ldff1b z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1ub_vnum_s32_m1, svint32_t, uint8_t, ++ z0 = svldff1ub_vnum_s32 (p0, x0, -1), ++ z0 = svldff1ub_vnum_s32 (p0, x0, -1)) ++ ++/* ++** ldff1ub_vnum_s32_x1: ++** cntw (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldff1b z0\.s, p0/z, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** ldff1b z0\.s, p0/z, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_LOAD (ldff1ub_vnum_s32_x1, svint32_t, uint8_t, ++ z0 = svldff1ub_vnum_s32 (p0, x0, x1), ++ z0 = svldff1ub_vnum_s32 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_s64.c +new file mode 100644 +index 000000000..21d02ddb7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_s64.c +@@ -0,0 +1,90 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1ub_s64_base: ++** ldff1b z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1ub_s64_base, svint64_t, uint8_t, ++ z0 = svldff1ub_s64 (p0, x0), ++ z0 = svldff1ub_s64 (p0, x0)) ++ ++/* ++** ldff1ub_s64_index: ++** ldff1b z0\.d, p0/z, \[x0, x1\] ++** ret ++*/ ++TEST_LOAD (ldff1ub_s64_index, svint64_t, uint8_t, ++ z0 = svldff1ub_s64 (p0, x0 + x1), ++ z0 = svldff1ub_s64 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1ub_s64_1: ++** incd x0 ++** ldff1b z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1ub_s64_1, svint64_t, uint8_t, ++ z0 = svldff1ub_s64 (p0, x0 + svcntd ()), ++ z0 = svldff1ub_s64 (p0, x0 + svcntd ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1ub_s64_m1: ++** decd x0 ++** ldff1b z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1ub_s64_m1, svint64_t, uint8_t, ++ z0 = svldff1ub_s64 (p0, x0 - svcntd ()), ++ z0 = svldff1ub_s64 (p0, x0 - svcntd ())) ++ ++/* ++** ldff1ub_vnum_s64_0: ++** ldff1b z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1ub_vnum_s64_0, svint64_t, uint8_t, ++ z0 = svldff1ub_vnum_s64 (p0, x0, 0), ++ z0 = svldff1ub_vnum_s64 (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1ub_vnum_s64_1: ++** incd x0 ++** ldff1b z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1ub_vnum_s64_1, svint64_t, uint8_t, ++ z0 = svldff1ub_vnum_s64 (p0, x0, 1), ++ z0 = svldff1ub_vnum_s64 (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1ub_vnum_s64_m1: ++** decd x0 ++** ldff1b z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1ub_vnum_s64_m1, svint64_t, uint8_t, ++ z0 = svldff1ub_vnum_s64 (p0, x0, -1), ++ z0 = svldff1ub_vnum_s64 (p0, x0, -1)) ++ ++/* ++** ldff1ub_vnum_s64_x1: ++** cntd (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldff1b z0\.d, p0/z, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** ldff1b z0\.d, p0/z, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_LOAD (ldff1ub_vnum_s64_x1, svint64_t, uint8_t, ++ z0 = svldff1ub_vnum_s64 (p0, x0, x1), ++ z0 = svldff1ub_vnum_s64 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_u16.c +new file mode 100644 +index 000000000..904cb027e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_u16.c +@@ -0,0 +1,90 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1ub_u16_base: ++** ldff1b z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1ub_u16_base, svuint16_t, uint8_t, ++ z0 = svldff1ub_u16 (p0, x0), ++ z0 = svldff1ub_u16 (p0, x0)) ++ ++/* ++** ldff1ub_u16_index: ++** ldff1b z0\.h, p0/z, \[x0, x1\] ++** ret ++*/ ++TEST_LOAD (ldff1ub_u16_index, svuint16_t, uint8_t, ++ z0 = svldff1ub_u16 (p0, x0 + x1), ++ z0 = svldff1ub_u16 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1ub_u16_1: ++** inch x0 ++** ldff1b z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1ub_u16_1, svuint16_t, uint8_t, ++ z0 = svldff1ub_u16 (p0, x0 + svcnth ()), ++ z0 = svldff1ub_u16 (p0, x0 + svcnth ())) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ldff1ub_u16_m1: ++** dech x0 ++** ldff1b z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1ub_u16_m1, svuint16_t, uint8_t, ++ z0 = svldff1ub_u16 (p0, x0 - svcnth ()), ++ z0 = svldff1ub_u16 (p0, x0 - svcnth ())) ++ ++/* ++** ldff1ub_vnum_u16_0: ++** ldff1b z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1ub_vnum_u16_0, svuint16_t, uint8_t, ++ z0 = svldff1ub_vnum_u16 (p0, x0, 0), ++ z0 = svldff1ub_vnum_u16 (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1ub_vnum_u16_1: ++** inch x0 ++** ldff1b z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1ub_vnum_u16_1, svuint16_t, uint8_t, ++ z0 = svldff1ub_vnum_u16 (p0, x0, 1), ++ z0 = svldff1ub_vnum_u16 (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1ub_vnum_u16_m1: ++** dech x0 ++** ldff1b z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1ub_vnum_u16_m1, svuint16_t, uint8_t, ++ z0 = svldff1ub_vnum_u16 (p0, x0, -1), ++ z0 = svldff1ub_vnum_u16 (p0, x0, -1)) ++ ++/* ++** ldff1ub_vnum_u16_x1: ++** cnth (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldff1b z0\.h, p0/z, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** ldff1b z0\.h, p0/z, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_LOAD (ldff1ub_vnum_u16_x1, svuint16_t, uint8_t, ++ z0 = svldff1ub_vnum_u16 (p0, x0, x1), ++ z0 = svldff1ub_vnum_u16 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_u32.c +new file mode 100644 +index 000000000..a40012318 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_u32.c +@@ -0,0 +1,90 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1ub_u32_base: ++** ldff1b z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1ub_u32_base, svuint32_t, uint8_t, ++ z0 = svldff1ub_u32 (p0, x0), ++ z0 = svldff1ub_u32 (p0, x0)) ++ ++/* ++** ldff1ub_u32_index: ++** ldff1b z0\.s, p0/z, \[x0, x1\] ++** ret ++*/ ++TEST_LOAD (ldff1ub_u32_index, svuint32_t, uint8_t, ++ z0 = svldff1ub_u32 (p0, x0 + x1), ++ z0 = svldff1ub_u32 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1ub_u32_1: ++** incw x0 ++** ldff1b z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1ub_u32_1, svuint32_t, uint8_t, ++ z0 = svldff1ub_u32 (p0, x0 + svcntw ()), ++ z0 = svldff1ub_u32 (p0, x0 + svcntw ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1ub_u32_m1: ++** decw x0 ++** ldff1b z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1ub_u32_m1, svuint32_t, uint8_t, ++ z0 = svldff1ub_u32 (p0, x0 - svcntw ()), ++ z0 = svldff1ub_u32 (p0, x0 - svcntw ())) ++ ++/* ++** ldff1ub_vnum_u32_0: ++** ldff1b z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1ub_vnum_u32_0, svuint32_t, uint8_t, ++ z0 = svldff1ub_vnum_u32 (p0, x0, 0), ++ z0 = svldff1ub_vnum_u32 (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1ub_vnum_u32_1: ++** incw x0 ++** ldff1b z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1ub_vnum_u32_1, svuint32_t, uint8_t, ++ z0 = svldff1ub_vnum_u32 (p0, x0, 1), ++ z0 = svldff1ub_vnum_u32 (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ldff1ub_vnum_u32_m1: ++** decw x0 ++** ldff1b z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1ub_vnum_u32_m1, svuint32_t, uint8_t, ++ z0 = svldff1ub_vnum_u32 (p0, x0, -1), ++ z0 = svldff1ub_vnum_u32 (p0, x0, -1)) ++ ++/* ++** ldff1ub_vnum_u32_x1: ++** cntw (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldff1b z0\.s, p0/z, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** ldff1b z0\.s, p0/z, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_LOAD (ldff1ub_vnum_u32_x1, svuint32_t, uint8_t, ++ z0 = svldff1ub_vnum_u32 (p0, x0, x1), ++ z0 = svldff1ub_vnum_u32 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_u64.c +new file mode 100644 +index 000000000..a9a98a683 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_u64.c +@@ -0,0 +1,90 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1ub_u64_base: ++** ldff1b z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1ub_u64_base, svuint64_t, uint8_t, ++ z0 = svldff1ub_u64 (p0, x0), ++ z0 = svldff1ub_u64 (p0, x0)) ++ ++/* ++** ldff1ub_u64_index: ++** ldff1b z0\.d, p0/z, \[x0, x1\] ++** ret ++*/ ++TEST_LOAD (ldff1ub_u64_index, svuint64_t, uint8_t, ++ z0 = svldff1ub_u64 (p0, x0 + x1), ++ z0 = svldff1ub_u64 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1ub_u64_1: ++** incd x0 ++** ldff1b z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1ub_u64_1, svuint64_t, uint8_t, ++ z0 = svldff1ub_u64 (p0, x0 + svcntd ()), ++ z0 = svldff1ub_u64 (p0, x0 + svcntd ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1ub_u64_m1: ++** decd x0 ++** ldff1b z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1ub_u64_m1, svuint64_t, uint8_t, ++ z0 = svldff1ub_u64 (p0, x0 - svcntd ()), ++ z0 = svldff1ub_u64 (p0, x0 - svcntd ())) ++ ++/* ++** ldff1ub_vnum_u64_0: ++** ldff1b z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1ub_vnum_u64_0, svuint64_t, uint8_t, ++ z0 = svldff1ub_vnum_u64 (p0, x0, 0), ++ z0 = svldff1ub_vnum_u64 (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1ub_vnum_u64_1: ++** incd x0 ++** ldff1b z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1ub_vnum_u64_1, svuint64_t, uint8_t, ++ z0 = svldff1ub_vnum_u64 (p0, x0, 1), ++ z0 = svldff1ub_vnum_u64 (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1ub_vnum_u64_m1: ++** decd x0 ++** ldff1b z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1ub_vnum_u64_m1, svuint64_t, uint8_t, ++ z0 = svldff1ub_vnum_u64 (p0, x0, -1), ++ z0 = svldff1ub_vnum_u64 (p0, x0, -1)) ++ ++/* ++** ldff1ub_vnum_u64_x1: ++** cntd (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldff1b z0\.d, p0/z, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** ldff1b z0\.d, p0/z, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_LOAD (ldff1ub_vnum_u64_x1, svuint64_t, uint8_t, ++ z0 = svldff1ub_vnum_u64 (p0, x0, x1), ++ z0 = svldff1ub_vnum_u64 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uh_gather_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uh_gather_s32.c +new file mode 100644 +index 000000000..d02e44342 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uh_gather_s32.c +@@ -0,0 +1,252 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1uh_gather_s32_tied1: ++** ldff1h z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_s32_tied1, svint32_t, svuint32_t, ++ z0_res = svldff1uh_gather_u32base_s32 (p0, z0), ++ z0_res = svldff1uh_gather_s32 (p0, z0)) ++ ++/* ++** ldff1uh_gather_s32_untied: ++** ldff1h z0\.s, p0/z, \[z1\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_s32_untied, svint32_t, svuint32_t, ++ z0_res = svldff1uh_gather_u32base_s32 (p0, z1), ++ z0_res = svldff1uh_gather_s32 (p0, z1)) ++ ++/* ++** ldff1uh_gather_x0_s32_offset: ++** ldff1h z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_x0_s32_offset, svint32_t, svuint32_t, ++ z0_res = svldff1uh_gather_u32base_offset_s32 (p0, z0, x0), ++ z0_res = svldff1uh_gather_offset_s32 (p0, z0, x0)) ++ ++/* ++** ldff1uh_gather_m2_s32_offset: ++** mov (x[0-9]+), #?-2 ++** ldff1h z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_m2_s32_offset, svint32_t, svuint32_t, ++ z0_res = svldff1uh_gather_u32base_offset_s32 (p0, z0, -2), ++ z0_res = svldff1uh_gather_offset_s32 (p0, z0, -2)) ++ ++/* ++** ldff1uh_gather_0_s32_offset: ++** ldff1h z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_0_s32_offset, svint32_t, svuint32_t, ++ z0_res = svldff1uh_gather_u32base_offset_s32 (p0, z0, 0), ++ z0_res = svldff1uh_gather_offset_s32 (p0, z0, 0)) ++ ++/* ++** ldff1uh_gather_5_s32_offset: ++** mov (x[0-9]+), #?5 ++** ldff1h z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_5_s32_offset, svint32_t, svuint32_t, ++ z0_res = svldff1uh_gather_u32base_offset_s32 (p0, z0, 5), ++ z0_res = svldff1uh_gather_offset_s32 (p0, z0, 5)) ++ ++/* ++** ldff1uh_gather_6_s32_offset: ++** ldff1h z0\.s, p0/z, \[z0\.s, #6\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_6_s32_offset, svint32_t, svuint32_t, ++ z0_res = svldff1uh_gather_u32base_offset_s32 (p0, z0, 6), ++ z0_res = svldff1uh_gather_offset_s32 (p0, z0, 6)) ++ ++/* ++** ldff1uh_gather_62_s32_offset: ++** ldff1h z0\.s, p0/z, \[z0\.s, #62\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_62_s32_offset, svint32_t, svuint32_t, ++ z0_res = svldff1uh_gather_u32base_offset_s32 (p0, z0, 62), ++ z0_res = svldff1uh_gather_offset_s32 (p0, z0, 62)) ++ ++/* ++** ldff1uh_gather_64_s32_offset: ++** mov (x[0-9]+), #?64 ++** ldff1h z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_64_s32_offset, svint32_t, svuint32_t, ++ z0_res = svldff1uh_gather_u32base_offset_s32 (p0, z0, 64), ++ z0_res = svldff1uh_gather_offset_s32 (p0, z0, 64)) ++ ++/* ++** ldff1uh_gather_x0_s32_index: ++** lsl (x[0-9]+), x0, #?1 ++** ldff1h z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_x0_s32_index, svint32_t, svuint32_t, ++ z0_res = svldff1uh_gather_u32base_index_s32 (p0, z0, x0), ++ z0_res = svldff1uh_gather_index_s32 (p0, z0, x0)) ++ ++/* ++** ldff1uh_gather_m1_s32_index: ++** mov (x[0-9]+), #?-2 ++** ldff1h z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_m1_s32_index, svint32_t, svuint32_t, ++ z0_res = svldff1uh_gather_u32base_index_s32 (p0, z0, -1), ++ z0_res = svldff1uh_gather_index_s32 (p0, z0, -1)) ++ ++/* ++** ldff1uh_gather_0_s32_index: ++** ldff1h z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_0_s32_index, svint32_t, svuint32_t, ++ z0_res = svldff1uh_gather_u32base_index_s32 (p0, z0, 0), ++ z0_res = svldff1uh_gather_index_s32 
(p0, z0, 0)) ++ ++/* ++** ldff1uh_gather_5_s32_index: ++** ldff1h z0\.s, p0/z, \[z0\.s, #10\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_5_s32_index, svint32_t, svuint32_t, ++ z0_res = svldff1uh_gather_u32base_index_s32 (p0, z0, 5), ++ z0_res = svldff1uh_gather_index_s32 (p0, z0, 5)) ++ ++/* ++** ldff1uh_gather_31_s32_index: ++** ldff1h z0\.s, p0/z, \[z0\.s, #62\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_31_s32_index, svint32_t, svuint32_t, ++ z0_res = svldff1uh_gather_u32base_index_s32 (p0, z0, 31), ++ z0_res = svldff1uh_gather_index_s32 (p0, z0, 31)) ++ ++/* ++** ldff1uh_gather_32_s32_index: ++** mov (x[0-9]+), #?64 ++** ldff1h z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_32_s32_index, svint32_t, svuint32_t, ++ z0_res = svldff1uh_gather_u32base_index_s32 (p0, z0, 32), ++ z0_res = svldff1uh_gather_index_s32 (p0, z0, 32)) ++ ++/* ++** ldff1uh_gather_x0_s32_s32offset: ++** ldff1h z0\.s, p0/z, \[x0, z0\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_x0_s32_s32offset, svint32_t, uint16_t, svint32_t, ++ z0_res = svldff1uh_gather_s32offset_s32 (p0, x0, z0), ++ z0_res = svldff1uh_gather_offset_s32 (p0, x0, z0)) ++ ++/* ++** ldff1uh_gather_tied1_s32_s32offset: ++** ldff1h z0\.s, p0/z, \[x0, z0\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_tied1_s32_s32offset, svint32_t, uint16_t, svint32_t, ++ z0_res = svldff1uh_gather_s32offset_s32 (p0, x0, z0), ++ z0_res = svldff1uh_gather_offset_s32 (p0, x0, z0)) ++ ++/* ++** ldff1uh_gather_untied_s32_s32offset: ++** ldff1h z0\.s, p0/z, \[x0, z1\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_untied_s32_s32offset, svint32_t, uint16_t, svint32_t, ++ z0_res = svldff1uh_gather_s32offset_s32 (p0, x0, z1), ++ z0_res = svldff1uh_gather_offset_s32 (p0, x0, z1)) ++ ++/* ++** ldff1uh_gather_x0_s32_u32offset: ++** ldff1h z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_x0_s32_u32offset, svint32_t, uint16_t, svuint32_t, ++ z0_res = svldff1uh_gather_u32offset_s32 (p0, x0, z0), ++ z0_res = svldff1uh_gather_offset_s32 (p0, x0, z0)) ++ ++/* ++** ldff1uh_gather_tied1_s32_u32offset: ++** ldff1h z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_tied1_s32_u32offset, svint32_t, uint16_t, svuint32_t, ++ z0_res = svldff1uh_gather_u32offset_s32 (p0, x0, z0), ++ z0_res = svldff1uh_gather_offset_s32 (p0, x0, z0)) ++ ++/* ++** ldff1uh_gather_untied_s32_u32offset: ++** ldff1h z0\.s, p0/z, \[x0, z1\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_untied_s32_u32offset, svint32_t, uint16_t, svuint32_t, ++ z0_res = svldff1uh_gather_u32offset_s32 (p0, x0, z1), ++ z0_res = svldff1uh_gather_offset_s32 (p0, x0, z1)) ++ ++/* ++** ldff1uh_gather_x0_s32_s32index: ++** ldff1h z0\.s, p0/z, \[x0, z0\.s, sxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_x0_s32_s32index, svint32_t, uint16_t, svint32_t, ++ z0_res = svldff1uh_gather_s32index_s32 (p0, x0, z0), ++ z0_res = svldff1uh_gather_index_s32 (p0, x0, z0)) ++ ++/* ++** ldff1uh_gather_tied1_s32_s32index: ++** ldff1h z0\.s, p0/z, \[x0, z0\.s, sxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_tied1_s32_s32index, svint32_t, uint16_t, svint32_t, ++ z0_res = svldff1uh_gather_s32index_s32 (p0, x0, z0), ++ z0_res = svldff1uh_gather_index_s32 (p0, x0, z0)) ++ ++/* ++** ldff1uh_gather_untied_s32_s32index: ++** ldff1h z0\.s, p0/z, \[x0, z1\.s, sxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_untied_s32_s32index, svint32_t, uint16_t, 
svint32_t, ++ z0_res = svldff1uh_gather_s32index_s32 (p0, x0, z1), ++ z0_res = svldff1uh_gather_index_s32 (p0, x0, z1)) ++ ++/* ++** ldff1uh_gather_x0_s32_u32index: ++** ldff1h z0\.s, p0/z, \[x0, z0\.s, uxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_x0_s32_u32index, svint32_t, uint16_t, svuint32_t, ++ z0_res = svldff1uh_gather_u32index_s32 (p0, x0, z0), ++ z0_res = svldff1uh_gather_index_s32 (p0, x0, z0)) ++ ++/* ++** ldff1uh_gather_tied1_s32_u32index: ++** ldff1h z0\.s, p0/z, \[x0, z0\.s, uxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_tied1_s32_u32index, svint32_t, uint16_t, svuint32_t, ++ z0_res = svldff1uh_gather_u32index_s32 (p0, x0, z0), ++ z0_res = svldff1uh_gather_index_s32 (p0, x0, z0)) ++ ++/* ++** ldff1uh_gather_untied_s32_u32index: ++** ldff1h z0\.s, p0/z, \[x0, z1\.s, uxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_untied_s32_u32index, svint32_t, uint16_t, svuint32_t, ++ z0_res = svldff1uh_gather_u32index_s32 (p0, x0, z1), ++ z0_res = svldff1uh_gather_index_s32 (p0, x0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uh_gather_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uh_gather_s64.c +new file mode 100644 +index 000000000..663a73d27 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uh_gather_s64.c +@@ -0,0 +1,288 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1uh_gather_s64_tied1: ++** ldff1h z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_s64_tied1, svint64_t, svuint64_t, ++ z0_res = svldff1uh_gather_u64base_s64 (p0, z0), ++ z0_res = svldff1uh_gather_s64 (p0, z0)) ++ ++/* ++** ldff1uh_gather_s64_untied: ++** ldff1h z0\.d, p0/z, \[z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_s64_untied, svint64_t, svuint64_t, ++ z0_res = svldff1uh_gather_u64base_s64 (p0, z1), ++ z0_res = svldff1uh_gather_s64 (p0, z1)) ++ ++/* ++** ldff1uh_gather_x0_s64_offset: ++** ldff1h z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_x0_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1uh_gather_u64base_offset_s64 (p0, z0, x0), ++ z0_res = svldff1uh_gather_offset_s64 (p0, z0, x0)) ++ ++/* ++** ldff1uh_gather_m2_s64_offset: ++** mov (x[0-9]+), #?-2 ++** ldff1h z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_m2_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1uh_gather_u64base_offset_s64 (p0, z0, -2), ++ z0_res = svldff1uh_gather_offset_s64 (p0, z0, -2)) ++ ++/* ++** ldff1uh_gather_0_s64_offset: ++** ldff1h z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_0_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1uh_gather_u64base_offset_s64 (p0, z0, 0), ++ z0_res = svldff1uh_gather_offset_s64 (p0, z0, 0)) ++ ++/* ++** ldff1uh_gather_5_s64_offset: ++** mov (x[0-9]+), #?5 ++** ldff1h z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_5_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1uh_gather_u64base_offset_s64 (p0, z0, 5), ++ z0_res = svldff1uh_gather_offset_s64 (p0, z0, 5)) ++ ++/* ++** ldff1uh_gather_6_s64_offset: ++** ldff1h z0\.d, p0/z, \[z0\.d, #6\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_6_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1uh_gather_u64base_offset_s64 (p0, z0, 6), ++ z0_res = svldff1uh_gather_offset_s64 (p0, z0, 6)) ++ ++/* ++** ldff1uh_gather_62_s64_offset: ++** ldff1h z0\.d, p0/z, \[z0\.d, #62\] ++** ret ++*/ 
++TEST_LOAD_GATHER_ZS (ldff1uh_gather_62_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1uh_gather_u64base_offset_s64 (p0, z0, 62), ++ z0_res = svldff1uh_gather_offset_s64 (p0, z0, 62)) ++ ++/* ++** ldff1uh_gather_64_s64_offset: ++** mov (x[0-9]+), #?64 ++** ldff1h z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_64_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1uh_gather_u64base_offset_s64 (p0, z0, 64), ++ z0_res = svldff1uh_gather_offset_s64 (p0, z0, 64)) ++ ++/* ++** ldff1uh_gather_x0_s64_index: ++** lsl (x[0-9]+), x0, #?1 ++** ldff1h z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_x0_s64_index, svint64_t, svuint64_t, ++ z0_res = svldff1uh_gather_u64base_index_s64 (p0, z0, x0), ++ z0_res = svldff1uh_gather_index_s64 (p0, z0, x0)) ++ ++/* ++** ldff1uh_gather_m1_s64_index: ++** mov (x[0-9]+), #?-2 ++** ldff1h z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_m1_s64_index, svint64_t, svuint64_t, ++ z0_res = svldff1uh_gather_u64base_index_s64 (p0, z0, -1), ++ z0_res = svldff1uh_gather_index_s64 (p0, z0, -1)) ++ ++/* ++** ldff1uh_gather_0_s64_index: ++** ldff1h z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_0_s64_index, svint64_t, svuint64_t, ++ z0_res = svldff1uh_gather_u64base_index_s64 (p0, z0, 0), ++ z0_res = svldff1uh_gather_index_s64 (p0, z0, 0)) ++ ++/* ++** ldff1uh_gather_5_s64_index: ++** ldff1h z0\.d, p0/z, \[z0\.d, #10\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_5_s64_index, svint64_t, svuint64_t, ++ z0_res = svldff1uh_gather_u64base_index_s64 (p0, z0, 5), ++ z0_res = svldff1uh_gather_index_s64 (p0, z0, 5)) ++ ++/* ++** ldff1uh_gather_31_s64_index: ++** ldff1h z0\.d, p0/z, \[z0\.d, #62\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_31_s64_index, svint64_t, svuint64_t, ++ z0_res = svldff1uh_gather_u64base_index_s64 (p0, z0, 31), ++ z0_res = svldff1uh_gather_index_s64 (p0, z0, 31)) ++ ++/* ++** ldff1uh_gather_32_s64_index: ++** mov (x[0-9]+), #?64 ++** ldff1h z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_32_s64_index, svint64_t, svuint64_t, ++ z0_res = svldff1uh_gather_u64base_index_s64 (p0, z0, 32), ++ z0_res = svldff1uh_gather_index_s64 (p0, z0, 32)) ++ ++/* ++** ldff1uh_gather_x0_s64_s64offset: ++** ldff1h z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_x0_s64_s64offset, svint64_t, uint16_t, svint64_t, ++ z0_res = svldff1uh_gather_s64offset_s64 (p0, x0, z0), ++ z0_res = svldff1uh_gather_offset_s64 (p0, x0, z0)) ++ ++/* ++** ldff1uh_gather_tied1_s64_s64offset: ++** ldff1h z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_tied1_s64_s64offset, svint64_t, uint16_t, svint64_t, ++ z0_res = svldff1uh_gather_s64offset_s64 (p0, x0, z0), ++ z0_res = svldff1uh_gather_offset_s64 (p0, x0, z0)) ++ ++/* ++** ldff1uh_gather_untied_s64_s64offset: ++** ldff1h z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_untied_s64_s64offset, svint64_t, uint16_t, svint64_t, ++ z0_res = svldff1uh_gather_s64offset_s64 (p0, x0, z1), ++ z0_res = svldff1uh_gather_offset_s64 (p0, x0, z1)) ++ ++/* ++** ldff1uh_gather_ext_s64_s64offset: ++** ldff1h z0\.d, p0/z, \[x0, z1\.d, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_ext_s64_s64offset, svint64_t, uint16_t, svint64_t, ++ z0_res = svldff1uh_gather_s64offset_s64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svldff1uh_gather_offset_s64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** 
ldff1uh_gather_x0_s64_u64offset: ++** ldff1h z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_x0_s64_u64offset, svint64_t, uint16_t, svuint64_t, ++ z0_res = svldff1uh_gather_u64offset_s64 (p0, x0, z0), ++ z0_res = svldff1uh_gather_offset_s64 (p0, x0, z0)) ++ ++/* ++** ldff1uh_gather_tied1_s64_u64offset: ++** ldff1h z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_tied1_s64_u64offset, svint64_t, uint16_t, svuint64_t, ++ z0_res = svldff1uh_gather_u64offset_s64 (p0, x0, z0), ++ z0_res = svldff1uh_gather_offset_s64 (p0, x0, z0)) ++ ++/* ++** ldff1uh_gather_untied_s64_u64offset: ++** ldff1h z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_untied_s64_u64offset, svint64_t, uint16_t, svuint64_t, ++ z0_res = svldff1uh_gather_u64offset_s64 (p0, x0, z1), ++ z0_res = svldff1uh_gather_offset_s64 (p0, x0, z1)) ++ ++/* ++** ldff1uh_gather_ext_s64_u64offset: ++** ldff1h z0\.d, p0/z, \[x0, z1\.d, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_ext_s64_u64offset, svint64_t, uint16_t, svuint64_t, ++ z0_res = svldff1uh_gather_u64offset_s64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svldff1uh_gather_offset_s64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ldff1uh_gather_x0_s64_s64index: ++** ldff1h z0\.d, p0/z, \[x0, z0\.d, lsl 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_x0_s64_s64index, svint64_t, uint16_t, svint64_t, ++ z0_res = svldff1uh_gather_s64index_s64 (p0, x0, z0), ++ z0_res = svldff1uh_gather_index_s64 (p0, x0, z0)) ++ ++/* ++** ldff1uh_gather_tied1_s64_s64index: ++** ldff1h z0\.d, p0/z, \[x0, z0\.d, lsl 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_tied1_s64_s64index, svint64_t, uint16_t, svint64_t, ++ z0_res = svldff1uh_gather_s64index_s64 (p0, x0, z0), ++ z0_res = svldff1uh_gather_index_s64 (p0, x0, z0)) ++ ++/* ++** ldff1uh_gather_untied_s64_s64index: ++** ldff1h z0\.d, p0/z, \[x0, z1\.d, lsl 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_untied_s64_s64index, svint64_t, uint16_t, svint64_t, ++ z0_res = svldff1uh_gather_s64index_s64 (p0, x0, z1), ++ z0_res = svldff1uh_gather_index_s64 (p0, x0, z1)) ++ ++/* ++** ldff1uh_gather_ext_s64_s64index: ++** ldff1h z0\.d, p0/z, \[x0, z1\.d, sxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_ext_s64_s64index, svint64_t, uint16_t, svint64_t, ++ z0_res = svldff1uh_gather_s64index_s64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svldff1uh_gather_index_s64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ldff1uh_gather_x0_s64_u64index: ++** ldff1h z0\.d, p0/z, \[x0, z0\.d, lsl 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_x0_s64_u64index, svint64_t, uint16_t, svuint64_t, ++ z0_res = svldff1uh_gather_u64index_s64 (p0, x0, z0), ++ z0_res = svldff1uh_gather_index_s64 (p0, x0, z0)) ++ ++/* ++** ldff1uh_gather_tied1_s64_u64index: ++** ldff1h z0\.d, p0/z, \[x0, z0\.d, lsl 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_tied1_s64_u64index, svint64_t, uint16_t, svuint64_t, ++ z0_res = svldff1uh_gather_u64index_s64 (p0, x0, z0), ++ z0_res = svldff1uh_gather_index_s64 (p0, x0, z0)) ++ ++/* ++** ldff1uh_gather_untied_s64_u64index: ++** ldff1h z0\.d, p0/z, \[x0, z1\.d, lsl 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_untied_s64_u64index, svint64_t, uint16_t, svuint64_t, ++ z0_res = svldff1uh_gather_u64index_s64 (p0, x0, z1), ++ z0_res = svldff1uh_gather_index_s64 (p0, x0, z1)) ++ ++/* ++** ldff1uh_gather_ext_s64_u64index: ++** ldff1h z0\.d, p0/z, \[x0, z1\.d, uxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ 
(ldff1uh_gather_ext_s64_u64index, svint64_t, uint16_t, svuint64_t, ++ z0_res = svldff1uh_gather_u64index_s64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svldff1uh_gather_index_s64 (p0, x0, svextw_x (p0, z1))) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uh_gather_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uh_gather_u32.c +new file mode 100644 +index 000000000..5e0ef067f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uh_gather_u32.c +@@ -0,0 +1,252 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1uh_gather_u32_tied1: ++** ldff1h z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_u32_tied1, svuint32_t, svuint32_t, ++ z0_res = svldff1uh_gather_u32base_u32 (p0, z0), ++ z0_res = svldff1uh_gather_u32 (p0, z0)) ++ ++/* ++** ldff1uh_gather_u32_untied: ++** ldff1h z0\.s, p0/z, \[z1\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_u32_untied, svuint32_t, svuint32_t, ++ z0_res = svldff1uh_gather_u32base_u32 (p0, z1), ++ z0_res = svldff1uh_gather_u32 (p0, z1)) ++ ++/* ++** ldff1uh_gather_x0_u32_offset: ++** ldff1h z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_x0_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svldff1uh_gather_u32base_offset_u32 (p0, z0, x0), ++ z0_res = svldff1uh_gather_offset_u32 (p0, z0, x0)) ++ ++/* ++** ldff1uh_gather_m2_u32_offset: ++** mov (x[0-9]+), #?-2 ++** ldff1h z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_m2_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svldff1uh_gather_u32base_offset_u32 (p0, z0, -2), ++ z0_res = svldff1uh_gather_offset_u32 (p0, z0, -2)) ++ ++/* ++** ldff1uh_gather_0_u32_offset: ++** ldff1h z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_0_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svldff1uh_gather_u32base_offset_u32 (p0, z0, 0), ++ z0_res = svldff1uh_gather_offset_u32 (p0, z0, 0)) ++ ++/* ++** ldff1uh_gather_5_u32_offset: ++** mov (x[0-9]+), #?5 ++** ldff1h z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_5_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svldff1uh_gather_u32base_offset_u32 (p0, z0, 5), ++ z0_res = svldff1uh_gather_offset_u32 (p0, z0, 5)) ++ ++/* ++** ldff1uh_gather_6_u32_offset: ++** ldff1h z0\.s, p0/z, \[z0\.s, #6\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_6_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svldff1uh_gather_u32base_offset_u32 (p0, z0, 6), ++ z0_res = svldff1uh_gather_offset_u32 (p0, z0, 6)) ++ ++/* ++** ldff1uh_gather_62_u32_offset: ++** ldff1h z0\.s, p0/z, \[z0\.s, #62\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_62_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svldff1uh_gather_u32base_offset_u32 (p0, z0, 62), ++ z0_res = svldff1uh_gather_offset_u32 (p0, z0, 62)) ++ ++/* ++** ldff1uh_gather_64_u32_offset: ++** mov (x[0-9]+), #?64 ++** ldff1h z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_64_u32_offset, svuint32_t, svuint32_t, ++ z0_res = svldff1uh_gather_u32base_offset_u32 (p0, z0, 64), ++ z0_res = svldff1uh_gather_offset_u32 (p0, z0, 64)) ++ ++/* ++** ldff1uh_gather_x0_u32_index: ++** lsl (x[0-9]+), x0, #?1 ++** ldff1h z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_x0_u32_index, svuint32_t, svuint32_t, ++ z0_res = svldff1uh_gather_u32base_index_u32 (p0, z0, x0), ++ z0_res = 
svldff1uh_gather_index_u32 (p0, z0, x0)) ++ ++/* ++** ldff1uh_gather_m1_u32_index: ++** mov (x[0-9]+), #?-2 ++** ldff1h z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_m1_u32_index, svuint32_t, svuint32_t, ++ z0_res = svldff1uh_gather_u32base_index_u32 (p0, z0, -1), ++ z0_res = svldff1uh_gather_index_u32 (p0, z0, -1)) ++ ++/* ++** ldff1uh_gather_0_u32_index: ++** ldff1h z0\.s, p0/z, \[z0\.s\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_0_u32_index, svuint32_t, svuint32_t, ++ z0_res = svldff1uh_gather_u32base_index_u32 (p0, z0, 0), ++ z0_res = svldff1uh_gather_index_u32 (p0, z0, 0)) ++ ++/* ++** ldff1uh_gather_5_u32_index: ++** ldff1h z0\.s, p0/z, \[z0\.s, #10\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_5_u32_index, svuint32_t, svuint32_t, ++ z0_res = svldff1uh_gather_u32base_index_u32 (p0, z0, 5), ++ z0_res = svldff1uh_gather_index_u32 (p0, z0, 5)) ++ ++/* ++** ldff1uh_gather_31_u32_index: ++** ldff1h z0\.s, p0/z, \[z0\.s, #62\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_31_u32_index, svuint32_t, svuint32_t, ++ z0_res = svldff1uh_gather_u32base_index_u32 (p0, z0, 31), ++ z0_res = svldff1uh_gather_index_u32 (p0, z0, 31)) ++ ++/* ++** ldff1uh_gather_32_u32_index: ++** mov (x[0-9]+), #?64 ++** ldff1h z0\.s, p0/z, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_32_u32_index, svuint32_t, svuint32_t, ++ z0_res = svldff1uh_gather_u32base_index_u32 (p0, z0, 32), ++ z0_res = svldff1uh_gather_index_u32 (p0, z0, 32)) ++ ++/* ++** ldff1uh_gather_x0_u32_s32offset: ++** ldff1h z0\.s, p0/z, \[x0, z0\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_x0_u32_s32offset, svuint32_t, uint16_t, svint32_t, ++ z0_res = svldff1uh_gather_s32offset_u32 (p0, x0, z0), ++ z0_res = svldff1uh_gather_offset_u32 (p0, x0, z0)) ++ ++/* ++** ldff1uh_gather_tied1_u32_s32offset: ++** ldff1h z0\.s, p0/z, \[x0, z0\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_tied1_u32_s32offset, svuint32_t, uint16_t, svint32_t, ++ z0_res = svldff1uh_gather_s32offset_u32 (p0, x0, z0), ++ z0_res = svldff1uh_gather_offset_u32 (p0, x0, z0)) ++ ++/* ++** ldff1uh_gather_untied_u32_s32offset: ++** ldff1h z0\.s, p0/z, \[x0, z1\.s, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_untied_u32_s32offset, svuint32_t, uint16_t, svint32_t, ++ z0_res = svldff1uh_gather_s32offset_u32 (p0, x0, z1), ++ z0_res = svldff1uh_gather_offset_u32 (p0, x0, z1)) ++ ++/* ++** ldff1uh_gather_x0_u32_u32offset: ++** ldff1h z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_x0_u32_u32offset, svuint32_t, uint16_t, svuint32_t, ++ z0_res = svldff1uh_gather_u32offset_u32 (p0, x0, z0), ++ z0_res = svldff1uh_gather_offset_u32 (p0, x0, z0)) ++ ++/* ++** ldff1uh_gather_tied1_u32_u32offset: ++** ldff1h z0\.s, p0/z, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_tied1_u32_u32offset, svuint32_t, uint16_t, svuint32_t, ++ z0_res = svldff1uh_gather_u32offset_u32 (p0, x0, z0), ++ z0_res = svldff1uh_gather_offset_u32 (p0, x0, z0)) ++ ++/* ++** ldff1uh_gather_untied_u32_u32offset: ++** ldff1h z0\.s, p0/z, \[x0, z1\.s, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_untied_u32_u32offset, svuint32_t, uint16_t, svuint32_t, ++ z0_res = svldff1uh_gather_u32offset_u32 (p0, x0, z1), ++ z0_res = svldff1uh_gather_offset_u32 (p0, x0, z1)) ++ ++/* ++** ldff1uh_gather_x0_u32_s32index: ++** ldff1h z0\.s, p0/z, \[x0, z0\.s, sxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_x0_u32_s32index, svuint32_t, 
uint16_t, svint32_t, ++ z0_res = svldff1uh_gather_s32index_u32 (p0, x0, z0), ++ z0_res = svldff1uh_gather_index_u32 (p0, x0, z0)) ++ ++/* ++** ldff1uh_gather_tied1_u32_s32index: ++** ldff1h z0\.s, p0/z, \[x0, z0\.s, sxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_tied1_u32_s32index, svuint32_t, uint16_t, svint32_t, ++ z0_res = svldff1uh_gather_s32index_u32 (p0, x0, z0), ++ z0_res = svldff1uh_gather_index_u32 (p0, x0, z0)) ++ ++/* ++** ldff1uh_gather_untied_u32_s32index: ++** ldff1h z0\.s, p0/z, \[x0, z1\.s, sxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_untied_u32_s32index, svuint32_t, uint16_t, svint32_t, ++ z0_res = svldff1uh_gather_s32index_u32 (p0, x0, z1), ++ z0_res = svldff1uh_gather_index_u32 (p0, x0, z1)) ++ ++/* ++** ldff1uh_gather_x0_u32_u32index: ++** ldff1h z0\.s, p0/z, \[x0, z0\.s, uxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_x0_u32_u32index, svuint32_t, uint16_t, svuint32_t, ++ z0_res = svldff1uh_gather_u32index_u32 (p0, x0, z0), ++ z0_res = svldff1uh_gather_index_u32 (p0, x0, z0)) ++ ++/* ++** ldff1uh_gather_tied1_u32_u32index: ++** ldff1h z0\.s, p0/z, \[x0, z0\.s, uxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_tied1_u32_u32index, svuint32_t, uint16_t, svuint32_t, ++ z0_res = svldff1uh_gather_u32index_u32 (p0, x0, z0), ++ z0_res = svldff1uh_gather_index_u32 (p0, x0, z0)) ++ ++/* ++** ldff1uh_gather_untied_u32_u32index: ++** ldff1h z0\.s, p0/z, \[x0, z1\.s, uxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_untied_u32_u32index, svuint32_t, uint16_t, svuint32_t, ++ z0_res = svldff1uh_gather_u32index_u32 (p0, x0, z1), ++ z0_res = svldff1uh_gather_index_u32 (p0, x0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uh_gather_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uh_gather_u64.c +new file mode 100644 +index 000000000..1cfae1b95 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uh_gather_u64.c +@@ -0,0 +1,288 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1uh_gather_u64_tied1: ++** ldff1h z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_u64_tied1, svuint64_t, svuint64_t, ++ z0_res = svldff1uh_gather_u64base_u64 (p0, z0), ++ z0_res = svldff1uh_gather_u64 (p0, z0)) ++ ++/* ++** ldff1uh_gather_u64_untied: ++** ldff1h z0\.d, p0/z, \[z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_u64_untied, svuint64_t, svuint64_t, ++ z0_res = svldff1uh_gather_u64base_u64 (p0, z1), ++ z0_res = svldff1uh_gather_u64 (p0, z1)) ++ ++/* ++** ldff1uh_gather_x0_u64_offset: ++** ldff1h z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_x0_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1uh_gather_u64base_offset_u64 (p0, z0, x0), ++ z0_res = svldff1uh_gather_offset_u64 (p0, z0, x0)) ++ ++/* ++** ldff1uh_gather_m2_u64_offset: ++** mov (x[0-9]+), #?-2 ++** ldff1h z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_m2_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1uh_gather_u64base_offset_u64 (p0, z0, -2), ++ z0_res = svldff1uh_gather_offset_u64 (p0, z0, -2)) ++ ++/* ++** ldff1uh_gather_0_u64_offset: ++** ldff1h z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_0_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1uh_gather_u64base_offset_u64 (p0, z0, 0), ++ z0_res = svldff1uh_gather_offset_u64 (p0, z0, 0)) ++ ++/* ++** ldff1uh_gather_5_u64_offset: ++** mov (x[0-9]+), #?5 ++** ldff1h z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_5_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1uh_gather_u64base_offset_u64 (p0, z0, 5), ++ z0_res = svldff1uh_gather_offset_u64 (p0, z0, 5)) ++ ++/* ++** ldff1uh_gather_6_u64_offset: ++** ldff1h z0\.d, p0/z, \[z0\.d, #6\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_6_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1uh_gather_u64base_offset_u64 (p0, z0, 6), ++ z0_res = svldff1uh_gather_offset_u64 (p0, z0, 6)) ++ ++/* ++** ldff1uh_gather_62_u64_offset: ++** ldff1h z0\.d, p0/z, \[z0\.d, #62\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_62_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1uh_gather_u64base_offset_u64 (p0, z0, 62), ++ z0_res = svldff1uh_gather_offset_u64 (p0, z0, 62)) ++ ++/* ++** ldff1uh_gather_64_u64_offset: ++** mov (x[0-9]+), #?64 ++** ldff1h z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_64_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1uh_gather_u64base_offset_u64 (p0, z0, 64), ++ z0_res = svldff1uh_gather_offset_u64 (p0, z0, 64)) ++ ++/* ++** ldff1uh_gather_x0_u64_index: ++** lsl (x[0-9]+), x0, #?1 ++** ldff1h z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_x0_u64_index, svuint64_t, svuint64_t, ++ z0_res = svldff1uh_gather_u64base_index_u64 (p0, z0, x0), ++ z0_res = svldff1uh_gather_index_u64 (p0, z0, x0)) ++ ++/* ++** ldff1uh_gather_m1_u64_index: ++** mov (x[0-9]+), #?-2 ++** ldff1h z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_m1_u64_index, svuint64_t, svuint64_t, ++ z0_res = svldff1uh_gather_u64base_index_u64 (p0, z0, -1), ++ z0_res = svldff1uh_gather_index_u64 (p0, z0, -1)) ++ ++/* ++** ldff1uh_gather_0_u64_index: ++** ldff1h z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_0_u64_index, svuint64_t, svuint64_t, ++ z0_res = svldff1uh_gather_u64base_index_u64 (p0, z0, 0), ++ z0_res = svldff1uh_gather_index_u64 (p0, z0, 0)) ++ ++/* 
++** ldff1uh_gather_5_u64_index: ++** ldff1h z0\.d, p0/z, \[z0\.d, #10\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_5_u64_index, svuint64_t, svuint64_t, ++ z0_res = svldff1uh_gather_u64base_index_u64 (p0, z0, 5), ++ z0_res = svldff1uh_gather_index_u64 (p0, z0, 5)) ++ ++/* ++** ldff1uh_gather_31_u64_index: ++** ldff1h z0\.d, p0/z, \[z0\.d, #62\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_31_u64_index, svuint64_t, svuint64_t, ++ z0_res = svldff1uh_gather_u64base_index_u64 (p0, z0, 31), ++ z0_res = svldff1uh_gather_index_u64 (p0, z0, 31)) ++ ++/* ++** ldff1uh_gather_32_u64_index: ++** mov (x[0-9]+), #?64 ++** ldff1h z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uh_gather_32_u64_index, svuint64_t, svuint64_t, ++ z0_res = svldff1uh_gather_u64base_index_u64 (p0, z0, 32), ++ z0_res = svldff1uh_gather_index_u64 (p0, z0, 32)) ++ ++/* ++** ldff1uh_gather_x0_u64_s64offset: ++** ldff1h z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_x0_u64_s64offset, svuint64_t, uint16_t, svint64_t, ++ z0_res = svldff1uh_gather_s64offset_u64 (p0, x0, z0), ++ z0_res = svldff1uh_gather_offset_u64 (p0, x0, z0)) ++ ++/* ++** ldff1uh_gather_tied1_u64_s64offset: ++** ldff1h z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_tied1_u64_s64offset, svuint64_t, uint16_t, svint64_t, ++ z0_res = svldff1uh_gather_s64offset_u64 (p0, x0, z0), ++ z0_res = svldff1uh_gather_offset_u64 (p0, x0, z0)) ++ ++/* ++** ldff1uh_gather_untied_u64_s64offset: ++** ldff1h z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_untied_u64_s64offset, svuint64_t, uint16_t, svint64_t, ++ z0_res = svldff1uh_gather_s64offset_u64 (p0, x0, z1), ++ z0_res = svldff1uh_gather_offset_u64 (p0, x0, z1)) ++ ++/* ++** ldff1uh_gather_ext_u64_s64offset: ++** ldff1h z0\.d, p0/z, \[x0, z1\.d, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_ext_u64_s64offset, svuint64_t, uint16_t, svint64_t, ++ z0_res = svldff1uh_gather_s64offset_u64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svldff1uh_gather_offset_u64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ldff1uh_gather_x0_u64_u64offset: ++** ldff1h z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_x0_u64_u64offset, svuint64_t, uint16_t, svuint64_t, ++ z0_res = svldff1uh_gather_u64offset_u64 (p0, x0, z0), ++ z0_res = svldff1uh_gather_offset_u64 (p0, x0, z0)) ++ ++/* ++** ldff1uh_gather_tied1_u64_u64offset: ++** ldff1h z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_tied1_u64_u64offset, svuint64_t, uint16_t, svuint64_t, ++ z0_res = svldff1uh_gather_u64offset_u64 (p0, x0, z0), ++ z0_res = svldff1uh_gather_offset_u64 (p0, x0, z0)) ++ ++/* ++** ldff1uh_gather_untied_u64_u64offset: ++** ldff1h z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_untied_u64_u64offset, svuint64_t, uint16_t, svuint64_t, ++ z0_res = svldff1uh_gather_u64offset_u64 (p0, x0, z1), ++ z0_res = svldff1uh_gather_offset_u64 (p0, x0, z1)) ++ ++/* ++** ldff1uh_gather_ext_u64_u64offset: ++** ldff1h z0\.d, p0/z, \[x0, z1\.d, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_ext_u64_u64offset, svuint64_t, uint16_t, svuint64_t, ++ z0_res = svldff1uh_gather_u64offset_u64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svldff1uh_gather_offset_u64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ldff1uh_gather_x0_u64_s64index: ++** ldff1h z0\.d, p0/z, \[x0, z0\.d, lsl 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_x0_u64_s64index, svuint64_t, 
uint16_t, svint64_t, ++ z0_res = svldff1uh_gather_s64index_u64 (p0, x0, z0), ++ z0_res = svldff1uh_gather_index_u64 (p0, x0, z0)) ++ ++/* ++** ldff1uh_gather_tied1_u64_s64index: ++** ldff1h z0\.d, p0/z, \[x0, z0\.d, lsl 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_tied1_u64_s64index, svuint64_t, uint16_t, svint64_t, ++ z0_res = svldff1uh_gather_s64index_u64 (p0, x0, z0), ++ z0_res = svldff1uh_gather_index_u64 (p0, x0, z0)) ++ ++/* ++** ldff1uh_gather_untied_u64_s64index: ++** ldff1h z0\.d, p0/z, \[x0, z1\.d, lsl 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_untied_u64_s64index, svuint64_t, uint16_t, svint64_t, ++ z0_res = svldff1uh_gather_s64index_u64 (p0, x0, z1), ++ z0_res = svldff1uh_gather_index_u64 (p0, x0, z1)) ++ ++/* ++** ldff1uh_gather_ext_u64_s64index: ++** ldff1h z0\.d, p0/z, \[x0, z1\.d, sxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_ext_u64_s64index, svuint64_t, uint16_t, svint64_t, ++ z0_res = svldff1uh_gather_s64index_u64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svldff1uh_gather_index_u64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ldff1uh_gather_x0_u64_u64index: ++** ldff1h z0\.d, p0/z, \[x0, z0\.d, lsl 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_x0_u64_u64index, svuint64_t, uint16_t, svuint64_t, ++ z0_res = svldff1uh_gather_u64index_u64 (p0, x0, z0), ++ z0_res = svldff1uh_gather_index_u64 (p0, x0, z0)) ++ ++/* ++** ldff1uh_gather_tied1_u64_u64index: ++** ldff1h z0\.d, p0/z, \[x0, z0\.d, lsl 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_tied1_u64_u64index, svuint64_t, uint16_t, svuint64_t, ++ z0_res = svldff1uh_gather_u64index_u64 (p0, x0, z0), ++ z0_res = svldff1uh_gather_index_u64 (p0, x0, z0)) ++ ++/* ++** ldff1uh_gather_untied_u64_u64index: ++** ldff1h z0\.d, p0/z, \[x0, z1\.d, lsl 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_untied_u64_u64index, svuint64_t, uint16_t, svuint64_t, ++ z0_res = svldff1uh_gather_u64index_u64 (p0, x0, z1), ++ z0_res = svldff1uh_gather_index_u64 (p0, x0, z1)) ++ ++/* ++** ldff1uh_gather_ext_u64_u64index: ++** ldff1h z0\.d, p0/z, \[x0, z1\.d, uxtw 1\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uh_gather_ext_u64_u64index, svuint64_t, uint16_t, svuint64_t, ++ z0_res = svldff1uh_gather_u64index_u64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svldff1uh_gather_index_u64 (p0, x0, svextw_x (p0, z1))) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uh_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uh_s32.c +new file mode 100644 +index 000000000..abb3d769a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uh_s32.c +@@ -0,0 +1,86 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1uh_s32_base: ++** ldff1h z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1uh_s32_base, svint32_t, uint16_t, ++ z0 = svldff1uh_s32 (p0, x0), ++ z0 = svldff1uh_s32 (p0, x0)) ++ ++/* ++** ldff1uh_s32_index: ++** ldff1h z0\.s, p0/z, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_LOAD (ldff1uh_s32_index, svint32_t, uint16_t, ++ z0 = svldff1uh_s32 (p0, x0 + x1), ++ z0 = svldff1uh_s32 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1uh_s32_1: ++** inch x0 ++** ldff1h z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1uh_s32_1, svint32_t, uint16_t, ++ z0 = svldff1uh_s32 (p0, x0 + svcntw ()), ++ z0 = svldff1uh_s32 (p0, x0 + svcntw ())) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ldff1uh_s32_m1: ++** dech x0 ++** ldff1h z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1uh_s32_m1, svint32_t, uint16_t, ++ z0 = svldff1uh_s32 (p0, x0 - svcntw ()), ++ z0 = svldff1uh_s32 (p0, x0 - svcntw ())) ++ ++/* ++** ldff1uh_vnum_s32_0: ++** ldff1h z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1uh_vnum_s32_0, svint32_t, uint16_t, ++ z0 = svldff1uh_vnum_s32 (p0, x0, 0), ++ z0 = svldff1uh_vnum_s32 (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1uh_vnum_s32_1: ++** inch x0 ++** ldff1h z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1uh_vnum_s32_1, svint32_t, uint16_t, ++ z0 = svldff1uh_vnum_s32 (p0, x0, 1), ++ z0 = svldff1uh_vnum_s32 (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1uh_vnum_s32_m1: ++** dech x0 ++** ldff1h z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1uh_vnum_s32_m1, svint32_t, uint16_t, ++ z0 = svldff1uh_vnum_s32 (p0, x0, -1), ++ z0 = svldff1uh_vnum_s32 (p0, x0, -1)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ldff1uh_vnum_s32_x1: ++** cnth (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ldff1h z0\.s, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldff1uh_vnum_s32_x1, svint32_t, uint16_t, ++ z0 = svldff1uh_vnum_s32 (p0, x0, x1), ++ z0 = svldff1uh_vnum_s32 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uh_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uh_s64.c +new file mode 100644 +index 000000000..6e330e8e8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uh_s64.c +@@ -0,0 +1,86 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1uh_s64_base: ++** ldff1h z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1uh_s64_base, svint64_t, uint16_t, ++ z0 = svldff1uh_s64 (p0, x0), ++ z0 = svldff1uh_s64 (p0, x0)) ++ ++/* ++** ldff1uh_s64_index: ++** ldff1h z0\.d, p0/z, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_LOAD (ldff1uh_s64_index, svint64_t, uint16_t, ++ z0 = svldff1uh_s64 (p0, x0 + x1), ++ z0 = svldff1uh_s64 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1uh_s64_1: ++** incw x0 ++** ldff1h z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1uh_s64_1, svint64_t, uint16_t, ++ z0 = svldff1uh_s64 (p0, x0 + svcntd ()), ++ z0 = svldff1uh_s64 (p0, x0 + svcntd ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1uh_s64_m1: ++** decw x0 ++** ldff1h z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1uh_s64_m1, svint64_t, uint16_t, ++ z0 = svldff1uh_s64 (p0, x0 - svcntd ()), ++ z0 = svldff1uh_s64 (p0, x0 - svcntd ())) ++ ++/* ++** ldff1uh_vnum_s64_0: ++** ldff1h z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1uh_vnum_s64_0, svint64_t, uint16_t, ++ z0 = svldff1uh_vnum_s64 (p0, x0, 0), ++ z0 = svldff1uh_vnum_s64 (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1uh_vnum_s64_1: ++** incw x0 ++** ldff1h z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1uh_vnum_s64_1, svint64_t, uint16_t, ++ z0 = svldff1uh_vnum_s64 (p0, x0, 1), ++ z0 = svldff1uh_vnum_s64 (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ldff1uh_vnum_s64_m1: ++** decw x0 ++** ldff1h z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1uh_vnum_s64_m1, svint64_t, uint16_t, ++ z0 = svldff1uh_vnum_s64 (p0, x0, -1), ++ z0 = svldff1uh_vnum_s64 (p0, x0, -1)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ldff1uh_vnum_s64_x1: ++** cntw (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ldff1h z0\.d, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldff1uh_vnum_s64_x1, svint64_t, uint16_t, ++ z0 = svldff1uh_vnum_s64 (p0, x0, x1), ++ z0 = svldff1uh_vnum_s64 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uh_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uh_u32.c +new file mode 100644 +index 000000000..4eb5323e9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uh_u32.c +@@ -0,0 +1,86 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1uh_u32_base: ++** ldff1h z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1uh_u32_base, svuint32_t, uint16_t, ++ z0 = svldff1uh_u32 (p0, x0), ++ z0 = svldff1uh_u32 (p0, x0)) ++ ++/* ++** ldff1uh_u32_index: ++** ldff1h z0\.s, p0/z, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_LOAD (ldff1uh_u32_index, svuint32_t, uint16_t, ++ z0 = svldff1uh_u32 (p0, x0 + x1), ++ z0 = svldff1uh_u32 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1uh_u32_1: ++** inch x0 ++** ldff1h z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1uh_u32_1, svuint32_t, uint16_t, ++ z0 = svldff1uh_u32 (p0, x0 + svcntw ()), ++ z0 = svldff1uh_u32 (p0, x0 + svcntw ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1uh_u32_m1: ++** dech x0 ++** ldff1h z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1uh_u32_m1, svuint32_t, uint16_t, ++ z0 = svldff1uh_u32 (p0, x0 - svcntw ()), ++ z0 = svldff1uh_u32 (p0, x0 - svcntw ())) ++ ++/* ++** ldff1uh_vnum_u32_0: ++** ldff1h z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1uh_vnum_u32_0, svuint32_t, uint16_t, ++ z0 = svldff1uh_vnum_u32 (p0, x0, 0), ++ z0 = svldff1uh_vnum_u32 (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1uh_vnum_u32_1: ++** inch x0 ++** ldff1h z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1uh_vnum_u32_1, svuint32_t, uint16_t, ++ z0 = svldff1uh_vnum_u32 (p0, x0, 1), ++ z0 = svldff1uh_vnum_u32 (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1uh_vnum_u32_m1: ++** dech x0 ++** ldff1h z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1uh_vnum_u32_m1, svuint32_t, uint16_t, ++ z0 = svldff1uh_vnum_u32 (p0, x0, -1), ++ z0 = svldff1uh_vnum_u32 (p0, x0, -1)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ldff1uh_vnum_u32_x1: ++** cnth (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ldff1h z0\.s, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldff1uh_vnum_u32_x1, svuint32_t, uint16_t, ++ z0 = svldff1uh_vnum_u32 (p0, x0, x1), ++ z0 = svldff1uh_vnum_u32 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uh_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uh_u64.c +new file mode 100644 +index 000000000..ebac26e7d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uh_u64.c +@@ -0,0 +1,86 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1uh_u64_base: ++** ldff1h z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1uh_u64_base, svuint64_t, uint16_t, ++ z0 = svldff1uh_u64 (p0, x0), ++ z0 = svldff1uh_u64 (p0, x0)) ++ ++/* ++** ldff1uh_u64_index: ++** ldff1h z0\.d, p0/z, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_LOAD (ldff1uh_u64_index, svuint64_t, uint16_t, ++ z0 = svldff1uh_u64 (p0, x0 + x1), ++ z0 = svldff1uh_u64 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1uh_u64_1: ++** incw x0 ++** ldff1h z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1uh_u64_1, svuint64_t, uint16_t, ++ z0 = svldff1uh_u64 (p0, x0 + svcntd ()), ++ z0 = svldff1uh_u64 (p0, x0 + svcntd ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1uh_u64_m1: ++** decw x0 ++** ldff1h z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1uh_u64_m1, svuint64_t, uint16_t, ++ z0 = svldff1uh_u64 (p0, x0 - svcntd ()), ++ z0 = svldff1uh_u64 (p0, x0 - svcntd ())) ++ ++/* ++** ldff1uh_vnum_u64_0: ++** ldff1h z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1uh_vnum_u64_0, svuint64_t, uint16_t, ++ z0 = svldff1uh_vnum_u64 (p0, x0, 0), ++ z0 = svldff1uh_vnum_u64 (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1uh_vnum_u64_1: ++** incw x0 ++** ldff1h z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1uh_vnum_u64_1, svuint64_t, uint16_t, ++ z0 = svldff1uh_vnum_u64 (p0, x0, 1), ++ z0 = svldff1uh_vnum_u64 (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1uh_vnum_u64_m1: ++** decw x0 ++** ldff1h z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1uh_vnum_u64_m1, svuint64_t, uint16_t, ++ z0 = svldff1uh_vnum_u64 (p0, x0, -1), ++ z0 = svldff1uh_vnum_u64 (p0, x0, -1)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ldff1uh_vnum_u64_x1: ++** cntw (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ldff1h z0\.d, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldff1uh_vnum_u64_x1, svuint64_t, uint16_t, ++ z0 = svldff1uh_vnum_u64 (p0, x0, x1), ++ z0 = svldff1uh_vnum_u64 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uw_gather_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uw_gather_s64.c +new file mode 100644 +index 000000000..6c0daea52 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uw_gather_s64.c +@@ -0,0 +1,308 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1uw_gather_s64_tied1: ++** ldff1w z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uw_gather_s64_tied1, svint64_t, svuint64_t, ++ z0_res = svldff1uw_gather_u64base_s64 (p0, z0), ++ z0_res = svldff1uw_gather_s64 (p0, z0)) ++ ++/* ++** ldff1uw_gather_s64_untied: ++** ldff1w z0\.d, p0/z, \[z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uw_gather_s64_untied, svint64_t, svuint64_t, ++ z0_res = svldff1uw_gather_u64base_s64 (p0, z1), ++ z0_res = svldff1uw_gather_s64 (p0, z1)) ++ ++/* ++** ldff1uw_gather_x0_s64_offset: ++** ldff1w z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uw_gather_x0_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1uw_gather_u64base_offset_s64 (p0, z0, x0), ++ z0_res = svldff1uw_gather_offset_s64 (p0, z0, x0)) ++ ++/* ++** ldff1uw_gather_m4_s64_offset: ++** mov (x[0-9]+), #?-4 ++** ldff1w z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uw_gather_m4_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1uw_gather_u64base_offset_s64 (p0, z0, -4), ++ z0_res = svldff1uw_gather_offset_s64 (p0, z0, -4)) ++ ++/* ++** ldff1uw_gather_0_s64_offset: ++** ldff1w z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uw_gather_0_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1uw_gather_u64base_offset_s64 (p0, z0, 0), ++ z0_res = svldff1uw_gather_offset_s64 (p0, z0, 0)) ++ ++/* ++** ldff1uw_gather_5_s64_offset: ++** mov (x[0-9]+), #?5 ++** ldff1w z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uw_gather_5_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1uw_gather_u64base_offset_s64 (p0, z0, 5), ++ z0_res = svldff1uw_gather_offset_s64 (p0, z0, 5)) ++ ++/* ++** ldff1uw_gather_6_s64_offset: ++** mov (x[0-9]+), #?6 ++** ldff1w z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uw_gather_6_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1uw_gather_u64base_offset_s64 (p0, z0, 6), ++ z0_res = svldff1uw_gather_offset_s64 (p0, z0, 6)) ++ ++/* ++** ldff1uw_gather_7_s64_offset: ++** mov (x[0-9]+), #?7 ++** ldff1w z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uw_gather_7_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1uw_gather_u64base_offset_s64 (p0, z0, 7), ++ z0_res = svldff1uw_gather_offset_s64 (p0, z0, 7)) ++ ++/* ++** ldff1uw_gather_8_s64_offset: ++** ldff1w z0\.d, p0/z, \[z0\.d, #8\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uw_gather_8_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1uw_gather_u64base_offset_s64 (p0, z0, 8), ++ z0_res = svldff1uw_gather_offset_s64 (p0, z0, 8)) ++ ++/* ++** ldff1uw_gather_124_s64_offset: ++** ldff1w z0\.d, p0/z, \[z0\.d, #124\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uw_gather_124_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1uw_gather_u64base_offset_s64 (p0, z0, 124), ++ z0_res = svldff1uw_gather_offset_s64 (p0, z0, 124)) ++ ++/* ++** ldff1uw_gather_128_s64_offset: ++** mov (x[0-9]+), #?128 ++** ldff1w z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uw_gather_128_s64_offset, svint64_t, svuint64_t, ++ z0_res = svldff1uw_gather_u64base_offset_s64 (p0, z0, 128), ++ z0_res = svldff1uw_gather_offset_s64 (p0, z0, 128)) ++ ++/* ++** ldff1uw_gather_x0_s64_index: ++** lsl (x[0-9]+), x0, #?2 ++** ldff1w z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uw_gather_x0_s64_index, svint64_t, svuint64_t, ++ z0_res = svldff1uw_gather_u64base_index_s64 (p0, z0, x0), ++ z0_res = 
svldff1uw_gather_index_s64 (p0, z0, x0)) ++ ++/* ++** ldff1uw_gather_m1_s64_index: ++** mov (x[0-9]+), #?-4 ++** ldff1w z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uw_gather_m1_s64_index, svint64_t, svuint64_t, ++ z0_res = svldff1uw_gather_u64base_index_s64 (p0, z0, -1), ++ z0_res = svldff1uw_gather_index_s64 (p0, z0, -1)) ++ ++/* ++** ldff1uw_gather_0_s64_index: ++** ldff1w z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uw_gather_0_s64_index, svint64_t, svuint64_t, ++ z0_res = svldff1uw_gather_u64base_index_s64 (p0, z0, 0), ++ z0_res = svldff1uw_gather_index_s64 (p0, z0, 0)) ++ ++/* ++** ldff1uw_gather_5_s64_index: ++** ldff1w z0\.d, p0/z, \[z0\.d, #20\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uw_gather_5_s64_index, svint64_t, svuint64_t, ++ z0_res = svldff1uw_gather_u64base_index_s64 (p0, z0, 5), ++ z0_res = svldff1uw_gather_index_s64 (p0, z0, 5)) ++ ++/* ++** ldff1uw_gather_31_s64_index: ++** ldff1w z0\.d, p0/z, \[z0\.d, #124\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uw_gather_31_s64_index, svint64_t, svuint64_t, ++ z0_res = svldff1uw_gather_u64base_index_s64 (p0, z0, 31), ++ z0_res = svldff1uw_gather_index_s64 (p0, z0, 31)) ++ ++/* ++** ldff1uw_gather_32_s64_index: ++** mov (x[0-9]+), #?128 ++** ldff1w z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uw_gather_32_s64_index, svint64_t, svuint64_t, ++ z0_res = svldff1uw_gather_u64base_index_s64 (p0, z0, 32), ++ z0_res = svldff1uw_gather_index_s64 (p0, z0, 32)) ++ ++/* ++** ldff1uw_gather_x0_s64_s64offset: ++** ldff1w z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uw_gather_x0_s64_s64offset, svint64_t, uint32_t, svint64_t, ++ z0_res = svldff1uw_gather_s64offset_s64 (p0, x0, z0), ++ z0_res = svldff1uw_gather_offset_s64 (p0, x0, z0)) ++ ++/* ++** ldff1uw_gather_tied1_s64_s64offset: ++** ldff1w z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uw_gather_tied1_s64_s64offset, svint64_t, uint32_t, svint64_t, ++ z0_res = svldff1uw_gather_s64offset_s64 (p0, x0, z0), ++ z0_res = svldff1uw_gather_offset_s64 (p0, x0, z0)) ++ ++/* ++** ldff1uw_gather_untied_s64_s64offset: ++** ldff1w z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uw_gather_untied_s64_s64offset, svint64_t, uint32_t, svint64_t, ++ z0_res = svldff1uw_gather_s64offset_s64 (p0, x0, z1), ++ z0_res = svldff1uw_gather_offset_s64 (p0, x0, z1)) ++ ++/* ++** ldff1uw_gather_ext_s64_s64offset: ++** ldff1w z0\.d, p0/z, \[x0, z1\.d, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uw_gather_ext_s64_s64offset, svint64_t, uint32_t, svint64_t, ++ z0_res = svldff1uw_gather_s64offset_s64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svldff1uw_gather_offset_s64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ldff1uw_gather_x0_s64_u64offset: ++** ldff1w z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uw_gather_x0_s64_u64offset, svint64_t, uint32_t, svuint64_t, ++ z0_res = svldff1uw_gather_u64offset_s64 (p0, x0, z0), ++ z0_res = svldff1uw_gather_offset_s64 (p0, x0, z0)) ++ ++/* ++** ldff1uw_gather_tied1_s64_u64offset: ++** ldff1w z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uw_gather_tied1_s64_u64offset, svint64_t, uint32_t, svuint64_t, ++ z0_res = svldff1uw_gather_u64offset_s64 (p0, x0, z0), ++ z0_res = svldff1uw_gather_offset_s64 (p0, x0, z0)) ++ ++/* ++** ldff1uw_gather_untied_s64_u64offset: ++** ldff1w z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uw_gather_untied_s64_u64offset, svint64_t, uint32_t, svuint64_t, ++ 
z0_res = svldff1uw_gather_u64offset_s64 (p0, x0, z1), ++ z0_res = svldff1uw_gather_offset_s64 (p0, x0, z1)) ++ ++/* ++** ldff1uw_gather_ext_s64_u64offset: ++** ldff1w z0\.d, p0/z, \[x0, z1\.d, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uw_gather_ext_s64_u64offset, svint64_t, uint32_t, svuint64_t, ++ z0_res = svldff1uw_gather_u64offset_s64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svldff1uw_gather_offset_s64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ldff1uw_gather_x0_s64_s64index: ++** ldff1w z0\.d, p0/z, \[x0, z0\.d, lsl 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uw_gather_x0_s64_s64index, svint64_t, uint32_t, svint64_t, ++ z0_res = svldff1uw_gather_s64index_s64 (p0, x0, z0), ++ z0_res = svldff1uw_gather_index_s64 (p0, x0, z0)) ++ ++/* ++** ldff1uw_gather_tied1_s64_s64index: ++** ldff1w z0\.d, p0/z, \[x0, z0\.d, lsl 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uw_gather_tied1_s64_s64index, svint64_t, uint32_t, svint64_t, ++ z0_res = svldff1uw_gather_s64index_s64 (p0, x0, z0), ++ z0_res = svldff1uw_gather_index_s64 (p0, x0, z0)) ++ ++/* ++** ldff1uw_gather_untied_s64_s64index: ++** ldff1w z0\.d, p0/z, \[x0, z1\.d, lsl 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uw_gather_untied_s64_s64index, svint64_t, uint32_t, svint64_t, ++ z0_res = svldff1uw_gather_s64index_s64 (p0, x0, z1), ++ z0_res = svldff1uw_gather_index_s64 (p0, x0, z1)) ++ ++/* ++** ldff1uw_gather_ext_s64_s64index: ++** ldff1w z0\.d, p0/z, \[x0, z1\.d, sxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uw_gather_ext_s64_s64index, svint64_t, uint32_t, svint64_t, ++ z0_res = svldff1uw_gather_s64index_s64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svldff1uw_gather_index_s64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ldff1uw_gather_x0_s64_u64index: ++** ldff1w z0\.d, p0/z, \[x0, z0\.d, lsl 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uw_gather_x0_s64_u64index, svint64_t, uint32_t, svuint64_t, ++ z0_res = svldff1uw_gather_u64index_s64 (p0, x0, z0), ++ z0_res = svldff1uw_gather_index_s64 (p0, x0, z0)) ++ ++/* ++** ldff1uw_gather_tied1_s64_u64index: ++** ldff1w z0\.d, p0/z, \[x0, z0\.d, lsl 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uw_gather_tied1_s64_u64index, svint64_t, uint32_t, svuint64_t, ++ z0_res = svldff1uw_gather_u64index_s64 (p0, x0, z0), ++ z0_res = svldff1uw_gather_index_s64 (p0, x0, z0)) ++ ++/* ++** ldff1uw_gather_untied_s64_u64index: ++** ldff1w z0\.d, p0/z, \[x0, z1\.d, lsl 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uw_gather_untied_s64_u64index, svint64_t, uint32_t, svuint64_t, ++ z0_res = svldff1uw_gather_u64index_s64 (p0, x0, z1), ++ z0_res = svldff1uw_gather_index_s64 (p0, x0, z1)) ++ ++/* ++** ldff1uw_gather_ext_s64_u64index: ++** ldff1w z0\.d, p0/z, \[x0, z1\.d, uxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uw_gather_ext_s64_u64index, svint64_t, uint32_t, svuint64_t, ++ z0_res = svldff1uw_gather_u64index_s64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svldff1uw_gather_index_s64 (p0, x0, svextw_x (p0, z1))) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uw_gather_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uw_gather_u64.c +new file mode 100644 +index 000000000..0e400c679 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uw_gather_u64.c +@@ -0,0 +1,308 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1uw_gather_u64_tied1: ++** ldff1w z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uw_gather_u64_tied1, svuint64_t, svuint64_t, ++ z0_res = svldff1uw_gather_u64base_u64 (p0, z0), ++ z0_res = svldff1uw_gather_u64 (p0, z0)) ++ ++/* ++** ldff1uw_gather_u64_untied: ++** ldff1w z0\.d, p0/z, \[z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uw_gather_u64_untied, svuint64_t, svuint64_t, ++ z0_res = svldff1uw_gather_u64base_u64 (p0, z1), ++ z0_res = svldff1uw_gather_u64 (p0, z1)) ++ ++/* ++** ldff1uw_gather_x0_u64_offset: ++** ldff1w z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uw_gather_x0_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1uw_gather_u64base_offset_u64 (p0, z0, x0), ++ z0_res = svldff1uw_gather_offset_u64 (p0, z0, x0)) ++ ++/* ++** ldff1uw_gather_m4_u64_offset: ++** mov (x[0-9]+), #?-4 ++** ldff1w z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uw_gather_m4_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1uw_gather_u64base_offset_u64 (p0, z0, -4), ++ z0_res = svldff1uw_gather_offset_u64 (p0, z0, -4)) ++ ++/* ++** ldff1uw_gather_0_u64_offset: ++** ldff1w z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uw_gather_0_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1uw_gather_u64base_offset_u64 (p0, z0, 0), ++ z0_res = svldff1uw_gather_offset_u64 (p0, z0, 0)) ++ ++/* ++** ldff1uw_gather_5_u64_offset: ++** mov (x[0-9]+), #?5 ++** ldff1w z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uw_gather_5_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1uw_gather_u64base_offset_u64 (p0, z0, 5), ++ z0_res = svldff1uw_gather_offset_u64 (p0, z0, 5)) ++ ++/* ++** ldff1uw_gather_6_u64_offset: ++** mov (x[0-9]+), #?6 ++** ldff1w z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uw_gather_6_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1uw_gather_u64base_offset_u64 (p0, z0, 6), ++ z0_res = svldff1uw_gather_offset_u64 (p0, z0, 6)) ++ ++/* ++** ldff1uw_gather_7_u64_offset: ++** mov (x[0-9]+), #?7 ++** ldff1w z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uw_gather_7_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1uw_gather_u64base_offset_u64 (p0, z0, 7), ++ z0_res = svldff1uw_gather_offset_u64 (p0, z0, 7)) ++ ++/* ++** ldff1uw_gather_8_u64_offset: ++** ldff1w z0\.d, p0/z, \[z0\.d, #8\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uw_gather_8_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1uw_gather_u64base_offset_u64 (p0, z0, 8), ++ z0_res = svldff1uw_gather_offset_u64 (p0, z0, 8)) ++ ++/* ++** ldff1uw_gather_124_u64_offset: ++** ldff1w z0\.d, p0/z, \[z0\.d, #124\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uw_gather_124_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1uw_gather_u64base_offset_u64 (p0, z0, 124), ++ z0_res = svldff1uw_gather_offset_u64 (p0, z0, 124)) ++ ++/* ++** ldff1uw_gather_128_u64_offset: ++** mov (x[0-9]+), #?128 ++** ldff1w z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uw_gather_128_u64_offset, svuint64_t, svuint64_t, ++ z0_res = svldff1uw_gather_u64base_offset_u64 (p0, z0, 128), ++ z0_res = svldff1uw_gather_offset_u64 (p0, z0, 128)) ++ ++/* ++** ldff1uw_gather_x0_u64_index: ++** lsl (x[0-9]+), x0, #?2 ++** ldff1w z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uw_gather_x0_u64_index, svuint64_t, svuint64_t, ++ z0_res = svldff1uw_gather_u64base_index_u64 (p0, z0, x0), ++ z0_res = 
svldff1uw_gather_index_u64 (p0, z0, x0)) ++ ++/* ++** ldff1uw_gather_m1_u64_index: ++** mov (x[0-9]+), #?-4 ++** ldff1w z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uw_gather_m1_u64_index, svuint64_t, svuint64_t, ++ z0_res = svldff1uw_gather_u64base_index_u64 (p0, z0, -1), ++ z0_res = svldff1uw_gather_index_u64 (p0, z0, -1)) ++ ++/* ++** ldff1uw_gather_0_u64_index: ++** ldff1w z0\.d, p0/z, \[z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uw_gather_0_u64_index, svuint64_t, svuint64_t, ++ z0_res = svldff1uw_gather_u64base_index_u64 (p0, z0, 0), ++ z0_res = svldff1uw_gather_index_u64 (p0, z0, 0)) ++ ++/* ++** ldff1uw_gather_5_u64_index: ++** ldff1w z0\.d, p0/z, \[z0\.d, #20\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uw_gather_5_u64_index, svuint64_t, svuint64_t, ++ z0_res = svldff1uw_gather_u64base_index_u64 (p0, z0, 5), ++ z0_res = svldff1uw_gather_index_u64 (p0, z0, 5)) ++ ++/* ++** ldff1uw_gather_31_u64_index: ++** ldff1w z0\.d, p0/z, \[z0\.d, #124\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uw_gather_31_u64_index, svuint64_t, svuint64_t, ++ z0_res = svldff1uw_gather_u64base_index_u64 (p0, z0, 31), ++ z0_res = svldff1uw_gather_index_u64 (p0, z0, 31)) ++ ++/* ++** ldff1uw_gather_32_u64_index: ++** mov (x[0-9]+), #?128 ++** ldff1w z0\.d, p0/z, \[\1, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_ZS (ldff1uw_gather_32_u64_index, svuint64_t, svuint64_t, ++ z0_res = svldff1uw_gather_u64base_index_u64 (p0, z0, 32), ++ z0_res = svldff1uw_gather_index_u64 (p0, z0, 32)) ++ ++/* ++** ldff1uw_gather_x0_u64_s64offset: ++** ldff1w z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uw_gather_x0_u64_s64offset, svuint64_t, uint32_t, svint64_t, ++ z0_res = svldff1uw_gather_s64offset_u64 (p0, x0, z0), ++ z0_res = svldff1uw_gather_offset_u64 (p0, x0, z0)) ++ ++/* ++** ldff1uw_gather_tied1_u64_s64offset: ++** ldff1w z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uw_gather_tied1_u64_s64offset, svuint64_t, uint32_t, svint64_t, ++ z0_res = svldff1uw_gather_s64offset_u64 (p0, x0, z0), ++ z0_res = svldff1uw_gather_offset_u64 (p0, x0, z0)) ++ ++/* ++** ldff1uw_gather_untied_u64_s64offset: ++** ldff1w z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uw_gather_untied_u64_s64offset, svuint64_t, uint32_t, svint64_t, ++ z0_res = svldff1uw_gather_s64offset_u64 (p0, x0, z1), ++ z0_res = svldff1uw_gather_offset_u64 (p0, x0, z1)) ++ ++/* ++** ldff1uw_gather_ext_u64_s64offset: ++** ldff1w z0\.d, p0/z, \[x0, z1\.d, sxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uw_gather_ext_u64_s64offset, svuint64_t, uint32_t, svint64_t, ++ z0_res = svldff1uw_gather_s64offset_u64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svldff1uw_gather_offset_u64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ldff1uw_gather_x0_u64_u64offset: ++** ldff1w z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uw_gather_x0_u64_u64offset, svuint64_t, uint32_t, svuint64_t, ++ z0_res = svldff1uw_gather_u64offset_u64 (p0, x0, z0), ++ z0_res = svldff1uw_gather_offset_u64 (p0, x0, z0)) ++ ++/* ++** ldff1uw_gather_tied1_u64_u64offset: ++** ldff1w z0\.d, p0/z, \[x0, z0\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uw_gather_tied1_u64_u64offset, svuint64_t, uint32_t, svuint64_t, ++ z0_res = svldff1uw_gather_u64offset_u64 (p0, x0, z0), ++ z0_res = svldff1uw_gather_offset_u64 (p0, x0, z0)) ++ ++/* ++** ldff1uw_gather_untied_u64_u64offset: ++** ldff1w z0\.d, p0/z, \[x0, z1\.d\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uw_gather_untied_u64_u64offset, svuint64_t, uint32_t, 
svuint64_t, ++ z0_res = svldff1uw_gather_u64offset_u64 (p0, x0, z1), ++ z0_res = svldff1uw_gather_offset_u64 (p0, x0, z1)) ++ ++/* ++** ldff1uw_gather_ext_u64_u64offset: ++** ldff1w z0\.d, p0/z, \[x0, z1\.d, uxtw\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uw_gather_ext_u64_u64offset, svuint64_t, uint32_t, svuint64_t, ++ z0_res = svldff1uw_gather_u64offset_u64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svldff1uw_gather_offset_u64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ldff1uw_gather_x0_u64_s64index: ++** ldff1w z0\.d, p0/z, \[x0, z0\.d, lsl 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uw_gather_x0_u64_s64index, svuint64_t, uint32_t, svint64_t, ++ z0_res = svldff1uw_gather_s64index_u64 (p0, x0, z0), ++ z0_res = svldff1uw_gather_index_u64 (p0, x0, z0)) ++ ++/* ++** ldff1uw_gather_tied1_u64_s64index: ++** ldff1w z0\.d, p0/z, \[x0, z0\.d, lsl 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uw_gather_tied1_u64_s64index, svuint64_t, uint32_t, svint64_t, ++ z0_res = svldff1uw_gather_s64index_u64 (p0, x0, z0), ++ z0_res = svldff1uw_gather_index_u64 (p0, x0, z0)) ++ ++/* ++** ldff1uw_gather_untied_u64_s64index: ++** ldff1w z0\.d, p0/z, \[x0, z1\.d, lsl 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uw_gather_untied_u64_s64index, svuint64_t, uint32_t, svint64_t, ++ z0_res = svldff1uw_gather_s64index_u64 (p0, x0, z1), ++ z0_res = svldff1uw_gather_index_u64 (p0, x0, z1)) ++ ++/* ++** ldff1uw_gather_ext_u64_s64index: ++** ldff1w z0\.d, p0/z, \[x0, z1\.d, sxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uw_gather_ext_u64_s64index, svuint64_t, uint32_t, svint64_t, ++ z0_res = svldff1uw_gather_s64index_u64 (p0, x0, svextw_s64_x (p0, z1)), ++ z0_res = svldff1uw_gather_index_u64 (p0, x0, svextw_x (p0, z1))) ++ ++/* ++** ldff1uw_gather_x0_u64_u64index: ++** ldff1w z0\.d, p0/z, \[x0, z0\.d, lsl 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uw_gather_x0_u64_u64index, svuint64_t, uint32_t, svuint64_t, ++ z0_res = svldff1uw_gather_u64index_u64 (p0, x0, z0), ++ z0_res = svldff1uw_gather_index_u64 (p0, x0, z0)) ++ ++/* ++** ldff1uw_gather_tied1_u64_u64index: ++** ldff1w z0\.d, p0/z, \[x0, z0\.d, lsl 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uw_gather_tied1_u64_u64index, svuint64_t, uint32_t, svuint64_t, ++ z0_res = svldff1uw_gather_u64index_u64 (p0, x0, z0), ++ z0_res = svldff1uw_gather_index_u64 (p0, x0, z0)) ++ ++/* ++** ldff1uw_gather_untied_u64_u64index: ++** ldff1w z0\.d, p0/z, \[x0, z1\.d, lsl 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uw_gather_untied_u64_u64index, svuint64_t, uint32_t, svuint64_t, ++ z0_res = svldff1uw_gather_u64index_u64 (p0, x0, z1), ++ z0_res = svldff1uw_gather_index_u64 (p0, x0, z1)) ++ ++/* ++** ldff1uw_gather_ext_u64_u64index: ++** ldff1w z0\.d, p0/z, \[x0, z1\.d, uxtw 2\] ++** ret ++*/ ++TEST_LOAD_GATHER_SZ (ldff1uw_gather_ext_u64_u64index, svuint64_t, uint32_t, svuint64_t, ++ z0_res = svldff1uw_gather_u64index_u64 (p0, x0, svextw_u64_x (p0, z1)), ++ z0_res = svldff1uw_gather_index_u64 (p0, x0, svextw_x (p0, z1))) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uw_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uw_s64.c +new file mode 100644 +index 000000000..ac9779899 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uw_s64.c +@@ -0,0 +1,86 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1uw_s64_base: ++** ldff1w z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1uw_s64_base, svint64_t, uint32_t, ++ z0 = svldff1uw_s64 (p0, x0), ++ z0 = svldff1uw_s64 (p0, x0)) ++ ++/* ++** ldff1uw_s64_index: ++** ldff1w z0\.d, p0/z, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_LOAD (ldff1uw_s64_index, svint64_t, uint32_t, ++ z0 = svldff1uw_s64 (p0, x0 + x1), ++ z0 = svldff1uw_s64 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1uw_s64_1: ++** inch x0 ++** ldff1w z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1uw_s64_1, svint64_t, uint32_t, ++ z0 = svldff1uw_s64 (p0, x0 + svcntd ()), ++ z0 = svldff1uw_s64 (p0, x0 + svcntd ())) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1uw_s64_m1: ++** dech x0 ++** ldff1w z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1uw_s64_m1, svint64_t, uint32_t, ++ z0 = svldff1uw_s64 (p0, x0 - svcntd ()), ++ z0 = svldff1uw_s64 (p0, x0 - svcntd ())) ++ ++/* ++** ldff1uw_vnum_s64_0: ++** ldff1w z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1uw_vnum_s64_0, svint64_t, uint32_t, ++ z0 = svldff1uw_vnum_s64 (p0, x0, 0), ++ z0 = svldff1uw_vnum_s64 (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1uw_vnum_s64_1: ++** inch x0 ++** ldff1w z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1uw_vnum_s64_1, svint64_t, uint32_t, ++ z0 = svldff1uw_vnum_s64 (p0, x0, 1), ++ z0 = svldff1uw_vnum_s64 (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1uw_vnum_s64_m1: ++** dech x0 ++** ldff1w z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1uw_vnum_s64_m1, svint64_t, uint32_t, ++ z0 = svldff1uw_vnum_s64 (p0, x0, -1), ++ z0 = svldff1uw_vnum_s64 (p0, x0, -1)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ldff1uw_vnum_s64_x1: ++** cnth (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ldff1w z0\.d, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldff1uw_vnum_s64_x1, svint64_t, uint32_t, ++ z0 = svldff1uw_vnum_s64 (p0, x0, x1), ++ z0 = svldff1uw_vnum_s64 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uw_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uw_u64.c +new file mode 100644 +index 000000000..c7ab06171 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uw_u64.c +@@ -0,0 +1,86 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldff1uw_u64_base: ++** ldff1w z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1uw_u64_base, svuint64_t, uint32_t, ++ z0 = svldff1uw_u64 (p0, x0), ++ z0 = svldff1uw_u64 (p0, x0)) ++ ++/* ++** ldff1uw_u64_index: ++** ldff1w z0\.d, p0/z, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_LOAD (ldff1uw_u64_index, svuint64_t, uint32_t, ++ z0 = svldff1uw_u64 (p0, x0 + x1), ++ z0 = svldff1uw_u64 (p0, x0 + x1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1uw_u64_1: ++** inch x0 ++** ldff1w z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1uw_u64_1, svuint64_t, uint32_t, ++ z0 = svldff1uw_u64 (p0, x0 + svcntd ()), ++ z0 = svldff1uw_u64 (p0, x0 + svcntd ())) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ldff1uw_u64_m1: ++** dech x0 ++** ldff1w z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1uw_u64_m1, svuint64_t, uint32_t, ++ z0 = svldff1uw_u64 (p0, x0 - svcntd ()), ++ z0 = svldff1uw_u64 (p0, x0 - svcntd ())) ++ ++/* ++** ldff1uw_vnum_u64_0: ++** ldff1w z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1uw_vnum_u64_0, svuint64_t, uint32_t, ++ z0 = svldff1uw_vnum_u64 (p0, x0, 0), ++ z0 = svldff1uw_vnum_u64 (p0, x0, 0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1uw_vnum_u64_1: ++** inch x0 ++** ldff1w z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1uw_vnum_u64_1, svuint64_t, uint32_t, ++ z0 = svldff1uw_vnum_u64 (p0, x0, 1), ++ z0 = svldff1uw_vnum_u64 (p0, x0, 1)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldff1uw_vnum_u64_m1: ++** dech x0 ++** ldff1w z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldff1uw_vnum_u64_m1, svuint64_t, uint32_t, ++ z0 = svldff1uw_vnum_u64 (p0, x0, -1), ++ z0 = svldff1uw_vnum_u64 (p0, x0, -1)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ldff1uw_vnum_u64_x1: ++** cnth (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ldff1w z0\.d, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldff1uw_vnum_u64_x1, svuint64_t, uint32_t, ++ z0 = svldff1uw_vnum_u64 (p0, x0, x1), ++ z0 = svldff1uw_vnum_u64 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_bf16.c +new file mode 100644 +index 000000000..947a896e7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_bf16.c +@@ -0,0 +1,154 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldnf1_bf16_base: ++** ldnf1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_bf16_base, svbfloat16_t, bfloat16_t, ++ z0 = svldnf1_bf16 (p0, x0), ++ z0 = svldnf1 (p0, x0)) ++ ++/* ++** ldnf1_bf16_index: ++** add (x[0-9]+), x0, x1, lsl 1 ++** ldnf1h z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ldnf1_bf16_index, svbfloat16_t, bfloat16_t, ++ z0 = svldnf1_bf16 (p0, x0 + x1), ++ z0 = svldnf1 (p0, x0 + x1)) ++ ++/* ++** ldnf1_bf16_1: ++** ldnf1h z0\.h, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_bf16_1, svbfloat16_t, bfloat16_t, ++ z0 = svldnf1_bf16 (p0, x0 + svcnth ()), ++ z0 = svldnf1 (p0, x0 + svcnth ())) ++ ++/* ++** ldnf1_bf16_7: ++** ldnf1h z0\.h, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_bf16_7, svbfloat16_t, bfloat16_t, ++ z0 = svldnf1_bf16 (p0, x0 + svcnth () * 7), ++ z0 = svldnf1 (p0, x0 + svcnth () * 7)) ++ ++/* ++** ldnf1_bf16_8: ++** incb x0, all, mul #8 ++** ldnf1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_bf16_8, svbfloat16_t, bfloat16_t, ++ z0 = svldnf1_bf16 (p0, x0 + svcnth () * 8), ++ z0 = svldnf1 (p0, x0 + svcnth () * 8)) ++ ++/* ++** ldnf1_bf16_m1: ++** ldnf1h z0\.h, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_bf16_m1, svbfloat16_t, bfloat16_t, ++ z0 = svldnf1_bf16 (p0, x0 - svcnth ()), ++ z0 = svldnf1 (p0, x0 - svcnth ())) ++ ++/* ++** ldnf1_bf16_m8: ++** ldnf1h z0\.h, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_bf16_m8, svbfloat16_t, bfloat16_t, ++ z0 = svldnf1_bf16 (p0, x0 - svcnth () * 8), ++ z0 = svldnf1 (p0, x0 - svcnth () * 8)) ++ ++/* ++** ldnf1_bf16_m9: ++** decb x0, all, mul #9 ++** ldnf1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_bf16_m9, svbfloat16_t, bfloat16_t, ++ z0 = svldnf1_bf16 (p0, x0 - svcnth () * 9), ++ z0 = svldnf1 (p0, 
x0 - svcnth () * 9)) ++ ++/* ++** ldnf1_vnum_bf16_0: ++** ldnf1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_bf16_0, svbfloat16_t, bfloat16_t, ++ z0 = svldnf1_vnum_bf16 (p0, x0, 0), ++ z0 = svldnf1_vnum (p0, x0, 0)) ++ ++/* ++** ldnf1_vnum_bf16_1: ++** ldnf1h z0\.h, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_bf16_1, svbfloat16_t, bfloat16_t, ++ z0 = svldnf1_vnum_bf16 (p0, x0, 1), ++ z0 = svldnf1_vnum (p0, x0, 1)) ++ ++/* ++** ldnf1_vnum_bf16_7: ++** ldnf1h z0\.h, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_bf16_7, svbfloat16_t, bfloat16_t, ++ z0 = svldnf1_vnum_bf16 (p0, x0, 7), ++ z0 = svldnf1_vnum (p0, x0, 7)) ++ ++/* ++** ldnf1_vnum_bf16_8: ++** incb x0, all, mul #8 ++** ldnf1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_bf16_8, svbfloat16_t, bfloat16_t, ++ z0 = svldnf1_vnum_bf16 (p0, x0, 8), ++ z0 = svldnf1_vnum (p0, x0, 8)) ++ ++/* ++** ldnf1_vnum_bf16_m1: ++** ldnf1h z0\.h, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_bf16_m1, svbfloat16_t, bfloat16_t, ++ z0 = svldnf1_vnum_bf16 (p0, x0, -1), ++ z0 = svldnf1_vnum (p0, x0, -1)) ++ ++/* ++** ldnf1_vnum_bf16_m8: ++** ldnf1h z0\.h, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_bf16_m8, svbfloat16_t, bfloat16_t, ++ z0 = svldnf1_vnum_bf16 (p0, x0, -8), ++ z0 = svldnf1_vnum (p0, x0, -8)) ++ ++/* ++** ldnf1_vnum_bf16_m9: ++** decb x0, all, mul #9 ++** ldnf1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_bf16_m9, svbfloat16_t, bfloat16_t, ++ z0 = svldnf1_vnum_bf16 (p0, x0, -9), ++ z0 = svldnf1_vnum (p0, x0, -9)) ++ ++/* ++** ldnf1_vnum_bf16_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldnf1h z0\.h, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_bf16_x1, svbfloat16_t, bfloat16_t, ++ z0 = svldnf1_vnum_bf16 (p0, x0, x1), ++ z0 = svldnf1_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_f16.c +new file mode 100644 +index 000000000..cf0178688 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_f16.c +@@ -0,0 +1,154 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldnf1_f16_base: ++** ldnf1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_f16_base, svfloat16_t, float16_t, ++ z0 = svldnf1_f16 (p0, x0), ++ z0 = svldnf1 (p0, x0)) ++ ++/* ++** ldnf1_f16_index: ++** add (x[0-9]+), x0, x1, lsl 1 ++** ldnf1h z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ldnf1_f16_index, svfloat16_t, float16_t, ++ z0 = svldnf1_f16 (p0, x0 + x1), ++ z0 = svldnf1 (p0, x0 + x1)) ++ ++/* ++** ldnf1_f16_1: ++** ldnf1h z0\.h, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_f16_1, svfloat16_t, float16_t, ++ z0 = svldnf1_f16 (p0, x0 + svcnth ()), ++ z0 = svldnf1 (p0, x0 + svcnth ())) ++ ++/* ++** ldnf1_f16_7: ++** ldnf1h z0\.h, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_f16_7, svfloat16_t, float16_t, ++ z0 = svldnf1_f16 (p0, x0 + svcnth () * 7), ++ z0 = svldnf1 (p0, x0 + svcnth () * 7)) ++ ++/* ++** ldnf1_f16_8: ++** incb x0, all, mul #8 ++** ldnf1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_f16_8, svfloat16_t, float16_t, ++ z0 = svldnf1_f16 (p0, x0 + svcnth () * 8), ++ z0 = svldnf1 (p0, x0 + svcnth () * 8)) ++ ++/* ++** ldnf1_f16_m1: ++** ldnf1h z0\.h, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_f16_m1, svfloat16_t, float16_t, ++ z0 = svldnf1_f16 (p0, x0 - svcnth ()), ++ z0 = svldnf1 (p0, x0 - svcnth ())) ++ ++/* ++** ldnf1_f16_m8: ++** ldnf1h z0\.h, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_f16_m8, svfloat16_t, float16_t, ++ z0 = svldnf1_f16 (p0, x0 - svcnth () * 8), ++ z0 = svldnf1 (p0, x0 - svcnth () * 8)) ++ ++/* ++** ldnf1_f16_m9: ++** decb x0, all, mul #9 ++** ldnf1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_f16_m9, svfloat16_t, float16_t, ++ z0 = svldnf1_f16 (p0, x0 - svcnth () * 9), ++ z0 = svldnf1 (p0, x0 - svcnth () * 9)) ++ ++/* ++** ldnf1_vnum_f16_0: ++** ldnf1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_f16_0, svfloat16_t, float16_t, ++ z0 = svldnf1_vnum_f16 (p0, x0, 0), ++ z0 = svldnf1_vnum (p0, x0, 0)) ++ ++/* ++** ldnf1_vnum_f16_1: ++** ldnf1h z0\.h, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_f16_1, svfloat16_t, float16_t, ++ z0 = svldnf1_vnum_f16 (p0, x0, 1), ++ z0 = svldnf1_vnum (p0, x0, 1)) ++ ++/* ++** ldnf1_vnum_f16_7: ++** ldnf1h z0\.h, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_f16_7, svfloat16_t, float16_t, ++ z0 = svldnf1_vnum_f16 (p0, x0, 7), ++ z0 = svldnf1_vnum (p0, x0, 7)) ++ ++/* ++** ldnf1_vnum_f16_8: ++** incb x0, all, mul #8 ++** ldnf1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_f16_8, svfloat16_t, float16_t, ++ z0 = svldnf1_vnum_f16 (p0, x0, 8), ++ z0 = svldnf1_vnum (p0, x0, 8)) ++ ++/* ++** ldnf1_vnum_f16_m1: ++** ldnf1h z0\.h, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_f16_m1, svfloat16_t, float16_t, ++ z0 = svldnf1_vnum_f16 (p0, x0, -1), ++ z0 = svldnf1_vnum (p0, x0, -1)) ++ ++/* ++** ldnf1_vnum_f16_m8: ++** ldnf1h z0\.h, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_f16_m8, svfloat16_t, float16_t, ++ z0 = svldnf1_vnum_f16 (p0, x0, -8), ++ z0 = svldnf1_vnum (p0, x0, -8)) ++ ++/* ++** ldnf1_vnum_f16_m9: ++** decb x0, all, mul #9 ++** ldnf1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_f16_m9, svfloat16_t, float16_t, ++ z0 = svldnf1_vnum_f16 (p0, x0, -9), ++ z0 = svldnf1_vnum (p0, x0, -9)) ++ ++/* ++** ldnf1_vnum_f16_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldnf1h z0\.h, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_f16_x1, svfloat16_t, float16_t, 
++ z0 = svldnf1_vnum_f16 (p0, x0, x1), ++ z0 = svldnf1_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_f32.c +new file mode 100644 +index 000000000..83b73ec8e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_f32.c +@@ -0,0 +1,154 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldnf1_f32_base: ++** ldnf1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_f32_base, svfloat32_t, float32_t, ++ z0 = svldnf1_f32 (p0, x0), ++ z0 = svldnf1 (p0, x0)) ++ ++/* ++** ldnf1_f32_index: ++** add (x[0-9]+), x0, x1, lsl 2 ++** ldnf1w z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ldnf1_f32_index, svfloat32_t, float32_t, ++ z0 = svldnf1_f32 (p0, x0 + x1), ++ z0 = svldnf1 (p0, x0 + x1)) ++ ++/* ++** ldnf1_f32_1: ++** ldnf1w z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_f32_1, svfloat32_t, float32_t, ++ z0 = svldnf1_f32 (p0, x0 + svcntw ()), ++ z0 = svldnf1 (p0, x0 + svcntw ())) ++ ++/* ++** ldnf1_f32_7: ++** ldnf1w z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_f32_7, svfloat32_t, float32_t, ++ z0 = svldnf1_f32 (p0, x0 + svcntw () * 7), ++ z0 = svldnf1 (p0, x0 + svcntw () * 7)) ++ ++/* ++** ldnf1_f32_8: ++** incb x0, all, mul #8 ++** ldnf1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_f32_8, svfloat32_t, float32_t, ++ z0 = svldnf1_f32 (p0, x0 + svcntw () * 8), ++ z0 = svldnf1 (p0, x0 + svcntw () * 8)) ++ ++/* ++** ldnf1_f32_m1: ++** ldnf1w z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_f32_m1, svfloat32_t, float32_t, ++ z0 = svldnf1_f32 (p0, x0 - svcntw ()), ++ z0 = svldnf1 (p0, x0 - svcntw ())) ++ ++/* ++** ldnf1_f32_m8: ++** ldnf1w z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_f32_m8, svfloat32_t, float32_t, ++ z0 = svldnf1_f32 (p0, x0 - svcntw () * 8), ++ z0 = svldnf1 (p0, x0 - svcntw () * 8)) ++ ++/* ++** ldnf1_f32_m9: ++** decb x0, all, mul #9 ++** ldnf1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_f32_m9, svfloat32_t, float32_t, ++ z0 = svldnf1_f32 (p0, x0 - svcntw () * 9), ++ z0 = svldnf1 (p0, x0 - svcntw () * 9)) ++ ++/* ++** ldnf1_vnum_f32_0: ++** ldnf1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_f32_0, svfloat32_t, float32_t, ++ z0 = svldnf1_vnum_f32 (p0, x0, 0), ++ z0 = svldnf1_vnum (p0, x0, 0)) ++ ++/* ++** ldnf1_vnum_f32_1: ++** ldnf1w z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_f32_1, svfloat32_t, float32_t, ++ z0 = svldnf1_vnum_f32 (p0, x0, 1), ++ z0 = svldnf1_vnum (p0, x0, 1)) ++ ++/* ++** ldnf1_vnum_f32_7: ++** ldnf1w z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_f32_7, svfloat32_t, float32_t, ++ z0 = svldnf1_vnum_f32 (p0, x0, 7), ++ z0 = svldnf1_vnum (p0, x0, 7)) ++ ++/* ++** ldnf1_vnum_f32_8: ++** incb x0, all, mul #8 ++** ldnf1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_f32_8, svfloat32_t, float32_t, ++ z0 = svldnf1_vnum_f32 (p0, x0, 8), ++ z0 = svldnf1_vnum (p0, x0, 8)) ++ ++/* ++** ldnf1_vnum_f32_m1: ++** ldnf1w z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_f32_m1, svfloat32_t, float32_t, ++ z0 = svldnf1_vnum_f32 (p0, x0, -1), ++ z0 = svldnf1_vnum (p0, x0, -1)) ++ ++/* ++** ldnf1_vnum_f32_m8: ++** ldnf1w z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_f32_m8, svfloat32_t, float32_t, ++ z0 = svldnf1_vnum_f32 (p0, x0, -8), ++ z0 = svldnf1_vnum (p0, x0, -8)) 
++ ++/* ++** ldnf1_vnum_f32_m9: ++** decb x0, all, mul #9 ++** ldnf1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_f32_m9, svfloat32_t, float32_t, ++ z0 = svldnf1_vnum_f32 (p0, x0, -9), ++ z0 = svldnf1_vnum (p0, x0, -9)) ++ ++/* ++** ldnf1_vnum_f32_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldnf1w z0\.s, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_f32_x1, svfloat32_t, float32_t, ++ z0 = svldnf1_vnum_f32 (p0, x0, x1), ++ z0 = svldnf1_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_f64.c +new file mode 100644 +index 000000000..778096e82 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_f64.c +@@ -0,0 +1,154 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldnf1_f64_base: ++** ldnf1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_f64_base, svfloat64_t, float64_t, ++ z0 = svldnf1_f64 (p0, x0), ++ z0 = svldnf1 (p0, x0)) ++ ++/* ++** ldnf1_f64_index: ++** add (x[0-9]+), x0, x1, lsl 3 ++** ldnf1d z0\.d, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ldnf1_f64_index, svfloat64_t, float64_t, ++ z0 = svldnf1_f64 (p0, x0 + x1), ++ z0 = svldnf1 (p0, x0 + x1)) ++ ++/* ++** ldnf1_f64_1: ++** ldnf1d z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_f64_1, svfloat64_t, float64_t, ++ z0 = svldnf1_f64 (p0, x0 + svcntd ()), ++ z0 = svldnf1 (p0, x0 + svcntd ())) ++ ++/* ++** ldnf1_f64_7: ++** ldnf1d z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_f64_7, svfloat64_t, float64_t, ++ z0 = svldnf1_f64 (p0, x0 + svcntd () * 7), ++ z0 = svldnf1 (p0, x0 + svcntd () * 7)) ++ ++/* ++** ldnf1_f64_8: ++** incb x0, all, mul #8 ++** ldnf1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_f64_8, svfloat64_t, float64_t, ++ z0 = svldnf1_f64 (p0, x0 + svcntd () * 8), ++ z0 = svldnf1 (p0, x0 + svcntd () * 8)) ++ ++/* ++** ldnf1_f64_m1: ++** ldnf1d z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_f64_m1, svfloat64_t, float64_t, ++ z0 = svldnf1_f64 (p0, x0 - svcntd ()), ++ z0 = svldnf1 (p0, x0 - svcntd ())) ++ ++/* ++** ldnf1_f64_m8: ++** ldnf1d z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_f64_m8, svfloat64_t, float64_t, ++ z0 = svldnf1_f64 (p0, x0 - svcntd () * 8), ++ z0 = svldnf1 (p0, x0 - svcntd () * 8)) ++ ++/* ++** ldnf1_f64_m9: ++** decb x0, all, mul #9 ++** ldnf1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_f64_m9, svfloat64_t, float64_t, ++ z0 = svldnf1_f64 (p0, x0 - svcntd () * 9), ++ z0 = svldnf1 (p0, x0 - svcntd () * 9)) ++ ++/* ++** ldnf1_vnum_f64_0: ++** ldnf1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_f64_0, svfloat64_t, float64_t, ++ z0 = svldnf1_vnum_f64 (p0, x0, 0), ++ z0 = svldnf1_vnum (p0, x0, 0)) ++ ++/* ++** ldnf1_vnum_f64_1: ++** ldnf1d z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_f64_1, svfloat64_t, float64_t, ++ z0 = svldnf1_vnum_f64 (p0, x0, 1), ++ z0 = svldnf1_vnum (p0, x0, 1)) ++ ++/* ++** ldnf1_vnum_f64_7: ++** ldnf1d z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_f64_7, svfloat64_t, float64_t, ++ z0 = svldnf1_vnum_f64 (p0, x0, 7), ++ z0 = svldnf1_vnum (p0, x0, 7)) ++ ++/* ++** ldnf1_vnum_f64_8: ++** incb x0, all, mul #8 ++** ldnf1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_f64_8, svfloat64_t, float64_t, ++ z0 = svldnf1_vnum_f64 (p0, x0, 8), ++ z0 = svldnf1_vnum (p0, x0, 8)) ++ ++/* ++** 
ldnf1_vnum_f64_m1: ++** ldnf1d z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_f64_m1, svfloat64_t, float64_t, ++ z0 = svldnf1_vnum_f64 (p0, x0, -1), ++ z0 = svldnf1_vnum (p0, x0, -1)) ++ ++/* ++** ldnf1_vnum_f64_m8: ++** ldnf1d z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_f64_m8, svfloat64_t, float64_t, ++ z0 = svldnf1_vnum_f64 (p0, x0, -8), ++ z0 = svldnf1_vnum (p0, x0, -8)) ++ ++/* ++** ldnf1_vnum_f64_m9: ++** decb x0, all, mul #9 ++** ldnf1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_f64_m9, svfloat64_t, float64_t, ++ z0 = svldnf1_vnum_f64 (p0, x0, -9), ++ z0 = svldnf1_vnum (p0, x0, -9)) ++ ++/* ++** ldnf1_vnum_f64_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldnf1d z0\.d, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_f64_x1, svfloat64_t, float64_t, ++ z0 = svldnf1_vnum_f64 (p0, x0, x1), ++ z0 = svldnf1_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_s16.c +new file mode 100644 +index 000000000..592c8237d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_s16.c +@@ -0,0 +1,154 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldnf1_s16_base: ++** ldnf1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_s16_base, svint16_t, int16_t, ++ z0 = svldnf1_s16 (p0, x0), ++ z0 = svldnf1 (p0, x0)) ++ ++/* ++** ldnf1_s16_index: ++** add (x[0-9]+), x0, x1, lsl 1 ++** ldnf1h z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ldnf1_s16_index, svint16_t, int16_t, ++ z0 = svldnf1_s16 (p0, x0 + x1), ++ z0 = svldnf1 (p0, x0 + x1)) ++ ++/* ++** ldnf1_s16_1: ++** ldnf1h z0\.h, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_s16_1, svint16_t, int16_t, ++ z0 = svldnf1_s16 (p0, x0 + svcnth ()), ++ z0 = svldnf1 (p0, x0 + svcnth ())) ++ ++/* ++** ldnf1_s16_7: ++** ldnf1h z0\.h, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_s16_7, svint16_t, int16_t, ++ z0 = svldnf1_s16 (p0, x0 + svcnth () * 7), ++ z0 = svldnf1 (p0, x0 + svcnth () * 7)) ++ ++/* ++** ldnf1_s16_8: ++** incb x0, all, mul #8 ++** ldnf1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_s16_8, svint16_t, int16_t, ++ z0 = svldnf1_s16 (p0, x0 + svcnth () * 8), ++ z0 = svldnf1 (p0, x0 + svcnth () * 8)) ++ ++/* ++** ldnf1_s16_m1: ++** ldnf1h z0\.h, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_s16_m1, svint16_t, int16_t, ++ z0 = svldnf1_s16 (p0, x0 - svcnth ()), ++ z0 = svldnf1 (p0, x0 - svcnth ())) ++ ++/* ++** ldnf1_s16_m8: ++** ldnf1h z0\.h, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_s16_m8, svint16_t, int16_t, ++ z0 = svldnf1_s16 (p0, x0 - svcnth () * 8), ++ z0 = svldnf1 (p0, x0 - svcnth () * 8)) ++ ++/* ++** ldnf1_s16_m9: ++** decb x0, all, mul #9 ++** ldnf1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_s16_m9, svint16_t, int16_t, ++ z0 = svldnf1_s16 (p0, x0 - svcnth () * 9), ++ z0 = svldnf1 (p0, x0 - svcnth () * 9)) ++ ++/* ++** ldnf1_vnum_s16_0: ++** ldnf1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_s16_0, svint16_t, int16_t, ++ z0 = svldnf1_vnum_s16 (p0, x0, 0), ++ z0 = svldnf1_vnum (p0, x0, 0)) ++ ++/* ++** ldnf1_vnum_s16_1: ++** ldnf1h z0\.h, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_s16_1, svint16_t, int16_t, ++ z0 = svldnf1_vnum_s16 (p0, x0, 1), ++ z0 = svldnf1_vnum (p0, x0, 1)) ++ ++/* ++** ldnf1_vnum_s16_7: ++** ldnf1h z0\.h, p0/z, \[x0, #7, mul 
vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_s16_7, svint16_t, int16_t, ++ z0 = svldnf1_vnum_s16 (p0, x0, 7), ++ z0 = svldnf1_vnum (p0, x0, 7)) ++ ++/* ++** ldnf1_vnum_s16_8: ++** incb x0, all, mul #8 ++** ldnf1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_s16_8, svint16_t, int16_t, ++ z0 = svldnf1_vnum_s16 (p0, x0, 8), ++ z0 = svldnf1_vnum (p0, x0, 8)) ++ ++/* ++** ldnf1_vnum_s16_m1: ++** ldnf1h z0\.h, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_s16_m1, svint16_t, int16_t, ++ z0 = svldnf1_vnum_s16 (p0, x0, -1), ++ z0 = svldnf1_vnum (p0, x0, -1)) ++ ++/* ++** ldnf1_vnum_s16_m8: ++** ldnf1h z0\.h, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_s16_m8, svint16_t, int16_t, ++ z0 = svldnf1_vnum_s16 (p0, x0, -8), ++ z0 = svldnf1_vnum (p0, x0, -8)) ++ ++/* ++** ldnf1_vnum_s16_m9: ++** decb x0, all, mul #9 ++** ldnf1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_s16_m9, svint16_t, int16_t, ++ z0 = svldnf1_vnum_s16 (p0, x0, -9), ++ z0 = svldnf1_vnum (p0, x0, -9)) ++ ++/* ++** ldnf1_vnum_s16_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldnf1h z0\.h, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_s16_x1, svint16_t, int16_t, ++ z0 = svldnf1_vnum_s16 (p0, x0, x1), ++ z0 = svldnf1_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_s32.c +new file mode 100644 +index 000000000..634092af8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_s32.c +@@ -0,0 +1,154 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldnf1_s32_base: ++** ldnf1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_s32_base, svint32_t, int32_t, ++ z0 = svldnf1_s32 (p0, x0), ++ z0 = svldnf1 (p0, x0)) ++ ++/* ++** ldnf1_s32_index: ++** add (x[0-9]+), x0, x1, lsl 2 ++** ldnf1w z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ldnf1_s32_index, svint32_t, int32_t, ++ z0 = svldnf1_s32 (p0, x0 + x1), ++ z0 = svldnf1 (p0, x0 + x1)) ++ ++/* ++** ldnf1_s32_1: ++** ldnf1w z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_s32_1, svint32_t, int32_t, ++ z0 = svldnf1_s32 (p0, x0 + svcntw ()), ++ z0 = svldnf1 (p0, x0 + svcntw ())) ++ ++/* ++** ldnf1_s32_7: ++** ldnf1w z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_s32_7, svint32_t, int32_t, ++ z0 = svldnf1_s32 (p0, x0 + svcntw () * 7), ++ z0 = svldnf1 (p0, x0 + svcntw () * 7)) ++ ++/* ++** ldnf1_s32_8: ++** incb x0, all, mul #8 ++** ldnf1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_s32_8, svint32_t, int32_t, ++ z0 = svldnf1_s32 (p0, x0 + svcntw () * 8), ++ z0 = svldnf1 (p0, x0 + svcntw () * 8)) ++ ++/* ++** ldnf1_s32_m1: ++** ldnf1w z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_s32_m1, svint32_t, int32_t, ++ z0 = svldnf1_s32 (p0, x0 - svcntw ()), ++ z0 = svldnf1 (p0, x0 - svcntw ())) ++ ++/* ++** ldnf1_s32_m8: ++** ldnf1w z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_s32_m8, svint32_t, int32_t, ++ z0 = svldnf1_s32 (p0, x0 - svcntw () * 8), ++ z0 = svldnf1 (p0, x0 - svcntw () * 8)) ++ ++/* ++** ldnf1_s32_m9: ++** decb x0, all, mul #9 ++** ldnf1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_s32_m9, svint32_t, int32_t, ++ z0 = svldnf1_s32 (p0, x0 - svcntw () * 9), ++ z0 = svldnf1 (p0, x0 - svcntw () * 9)) ++ ++/* ++** ldnf1_vnum_s32_0: ++** ldnf1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_s32_0, 
svint32_t, int32_t, ++ z0 = svldnf1_vnum_s32 (p0, x0, 0), ++ z0 = svldnf1_vnum (p0, x0, 0)) ++ ++/* ++** ldnf1_vnum_s32_1: ++** ldnf1w z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_s32_1, svint32_t, int32_t, ++ z0 = svldnf1_vnum_s32 (p0, x0, 1), ++ z0 = svldnf1_vnum (p0, x0, 1)) ++ ++/* ++** ldnf1_vnum_s32_7: ++** ldnf1w z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_s32_7, svint32_t, int32_t, ++ z0 = svldnf1_vnum_s32 (p0, x0, 7), ++ z0 = svldnf1_vnum (p0, x0, 7)) ++ ++/* ++** ldnf1_vnum_s32_8: ++** incb x0, all, mul #8 ++** ldnf1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_s32_8, svint32_t, int32_t, ++ z0 = svldnf1_vnum_s32 (p0, x0, 8), ++ z0 = svldnf1_vnum (p0, x0, 8)) ++ ++/* ++** ldnf1_vnum_s32_m1: ++** ldnf1w z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_s32_m1, svint32_t, int32_t, ++ z0 = svldnf1_vnum_s32 (p0, x0, -1), ++ z0 = svldnf1_vnum (p0, x0, -1)) ++ ++/* ++** ldnf1_vnum_s32_m8: ++** ldnf1w z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_s32_m8, svint32_t, int32_t, ++ z0 = svldnf1_vnum_s32 (p0, x0, -8), ++ z0 = svldnf1_vnum (p0, x0, -8)) ++ ++/* ++** ldnf1_vnum_s32_m9: ++** decb x0, all, mul #9 ++** ldnf1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_s32_m9, svint32_t, int32_t, ++ z0 = svldnf1_vnum_s32 (p0, x0, -9), ++ z0 = svldnf1_vnum (p0, x0, -9)) ++ ++/* ++** ldnf1_vnum_s32_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldnf1w z0\.s, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_s32_x1, svint32_t, int32_t, ++ z0 = svldnf1_vnum_s32 (p0, x0, x1), ++ z0 = svldnf1_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_s64.c +new file mode 100644 +index 000000000..4a03f6676 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_s64.c +@@ -0,0 +1,154 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldnf1_s64_base: ++** ldnf1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_s64_base, svint64_t, int64_t, ++ z0 = svldnf1_s64 (p0, x0), ++ z0 = svldnf1 (p0, x0)) ++ ++/* ++** ldnf1_s64_index: ++** add (x[0-9]+), x0, x1, lsl 3 ++** ldnf1d z0\.d, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ldnf1_s64_index, svint64_t, int64_t, ++ z0 = svldnf1_s64 (p0, x0 + x1), ++ z0 = svldnf1 (p0, x0 + x1)) ++ ++/* ++** ldnf1_s64_1: ++** ldnf1d z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_s64_1, svint64_t, int64_t, ++ z0 = svldnf1_s64 (p0, x0 + svcntd ()), ++ z0 = svldnf1 (p0, x0 + svcntd ())) ++ ++/* ++** ldnf1_s64_7: ++** ldnf1d z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_s64_7, svint64_t, int64_t, ++ z0 = svldnf1_s64 (p0, x0 + svcntd () * 7), ++ z0 = svldnf1 (p0, x0 + svcntd () * 7)) ++ ++/* ++** ldnf1_s64_8: ++** incb x0, all, mul #8 ++** ldnf1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_s64_8, svint64_t, int64_t, ++ z0 = svldnf1_s64 (p0, x0 + svcntd () * 8), ++ z0 = svldnf1 (p0, x0 + svcntd () * 8)) ++ ++/* ++** ldnf1_s64_m1: ++** ldnf1d z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_s64_m1, svint64_t, int64_t, ++ z0 = svldnf1_s64 (p0, x0 - svcntd ()), ++ z0 = svldnf1 (p0, x0 - svcntd ())) ++ ++/* ++** ldnf1_s64_m8: ++** ldnf1d z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_s64_m8, svint64_t, int64_t, ++ z0 = svldnf1_s64 (p0, x0 - svcntd () * 8), ++ z0 = svldnf1 (p0, x0 - svcntd () * 8)) ++ ++/* ++** ldnf1_s64_m9: ++** decb x0, all, mul #9 ++** ldnf1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_s64_m9, svint64_t, int64_t, ++ z0 = svldnf1_s64 (p0, x0 - svcntd () * 9), ++ z0 = svldnf1 (p0, x0 - svcntd () * 9)) ++ ++/* ++** ldnf1_vnum_s64_0: ++** ldnf1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_s64_0, svint64_t, int64_t, ++ z0 = svldnf1_vnum_s64 (p0, x0, 0), ++ z0 = svldnf1_vnum (p0, x0, 0)) ++ ++/* ++** ldnf1_vnum_s64_1: ++** ldnf1d z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_s64_1, svint64_t, int64_t, ++ z0 = svldnf1_vnum_s64 (p0, x0, 1), ++ z0 = svldnf1_vnum (p0, x0, 1)) ++ ++/* ++** ldnf1_vnum_s64_7: ++** ldnf1d z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_s64_7, svint64_t, int64_t, ++ z0 = svldnf1_vnum_s64 (p0, x0, 7), ++ z0 = svldnf1_vnum (p0, x0, 7)) ++ ++/* ++** ldnf1_vnum_s64_8: ++** incb x0, all, mul #8 ++** ldnf1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_s64_8, svint64_t, int64_t, ++ z0 = svldnf1_vnum_s64 (p0, x0, 8), ++ z0 = svldnf1_vnum (p0, x0, 8)) ++ ++/* ++** ldnf1_vnum_s64_m1: ++** ldnf1d z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_s64_m1, svint64_t, int64_t, ++ z0 = svldnf1_vnum_s64 (p0, x0, -1), ++ z0 = svldnf1_vnum (p0, x0, -1)) ++ ++/* ++** ldnf1_vnum_s64_m8: ++** ldnf1d z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_s64_m8, svint64_t, int64_t, ++ z0 = svldnf1_vnum_s64 (p0, x0, -8), ++ z0 = svldnf1_vnum (p0, x0, -8)) ++ ++/* ++** ldnf1_vnum_s64_m9: ++** decb x0, all, mul #9 ++** ldnf1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_s64_m9, svint64_t, int64_t, ++ z0 = svldnf1_vnum_s64 (p0, x0, -9), ++ z0 = svldnf1_vnum (p0, x0, -9)) ++ ++/* ++** ldnf1_vnum_s64_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldnf1d z0\.d, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_s64_x1, svint64_t, int64_t, ++ z0 = svldnf1_vnum_s64 (p0, x0, x1), ++ z0 = svldnf1_vnum (p0, 
x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_s8.c +new file mode 100644 +index 000000000..162ee176a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_s8.c +@@ -0,0 +1,154 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldnf1_s8_base: ++** ldnf1b z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_s8_base, svint8_t, int8_t, ++ z0 = svldnf1_s8 (p0, x0), ++ z0 = svldnf1 (p0, x0)) ++ ++/* ++** ldnf1_s8_index: ++** add (x[0-9]+), x0, x1 ++** ldnf1b z0\.b, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ldnf1_s8_index, svint8_t, int8_t, ++ z0 = svldnf1_s8 (p0, x0 + x1), ++ z0 = svldnf1 (p0, x0 + x1)) ++ ++/* ++** ldnf1_s8_1: ++** ldnf1b z0\.b, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_s8_1, svint8_t, int8_t, ++ z0 = svldnf1_s8 (p0, x0 + svcntb ()), ++ z0 = svldnf1 (p0, x0 + svcntb ())) ++ ++/* ++** ldnf1_s8_7: ++** ldnf1b z0\.b, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_s8_7, svint8_t, int8_t, ++ z0 = svldnf1_s8 (p0, x0 + svcntb () * 7), ++ z0 = svldnf1 (p0, x0 + svcntb () * 7)) ++ ++/* ++** ldnf1_s8_8: ++** incb x0, all, mul #8 ++** ldnf1b z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_s8_8, svint8_t, int8_t, ++ z0 = svldnf1_s8 (p0, x0 + svcntb () * 8), ++ z0 = svldnf1 (p0, x0 + svcntb () * 8)) ++ ++/* ++** ldnf1_s8_m1: ++** ldnf1b z0\.b, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_s8_m1, svint8_t, int8_t, ++ z0 = svldnf1_s8 (p0, x0 - svcntb ()), ++ z0 = svldnf1 (p0, x0 - svcntb ())) ++ ++/* ++** ldnf1_s8_m8: ++** ldnf1b z0\.b, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_s8_m8, svint8_t, int8_t, ++ z0 = svldnf1_s8 (p0, x0 - svcntb () * 8), ++ z0 = svldnf1 (p0, x0 - svcntb () * 8)) ++ ++/* ++** ldnf1_s8_m9: ++** decb x0, all, mul #9 ++** ldnf1b z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_s8_m9, svint8_t, int8_t, ++ z0 = svldnf1_s8 (p0, x0 - svcntb () * 9), ++ z0 = svldnf1 (p0, x0 - svcntb () * 9)) ++ ++/* ++** ldnf1_vnum_s8_0: ++** ldnf1b z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_s8_0, svint8_t, int8_t, ++ z0 = svldnf1_vnum_s8 (p0, x0, 0), ++ z0 = svldnf1_vnum (p0, x0, 0)) ++ ++/* ++** ldnf1_vnum_s8_1: ++** ldnf1b z0\.b, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_s8_1, svint8_t, int8_t, ++ z0 = svldnf1_vnum_s8 (p0, x0, 1), ++ z0 = svldnf1_vnum (p0, x0, 1)) ++ ++/* ++** ldnf1_vnum_s8_7: ++** ldnf1b z0\.b, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_s8_7, svint8_t, int8_t, ++ z0 = svldnf1_vnum_s8 (p0, x0, 7), ++ z0 = svldnf1_vnum (p0, x0, 7)) ++ ++/* ++** ldnf1_vnum_s8_8: ++** incb x0, all, mul #8 ++** ldnf1b z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_s8_8, svint8_t, int8_t, ++ z0 = svldnf1_vnum_s8 (p0, x0, 8), ++ z0 = svldnf1_vnum (p0, x0, 8)) ++ ++/* ++** ldnf1_vnum_s8_m1: ++** ldnf1b z0\.b, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_s8_m1, svint8_t, int8_t, ++ z0 = svldnf1_vnum_s8 (p0, x0, -1), ++ z0 = svldnf1_vnum (p0, x0, -1)) ++ ++/* ++** ldnf1_vnum_s8_m8: ++** ldnf1b z0\.b, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_s8_m8, svint8_t, int8_t, ++ z0 = svldnf1_vnum_s8 (p0, x0, -8), ++ z0 = svldnf1_vnum (p0, x0, -8)) ++ ++/* ++** ldnf1_vnum_s8_m9: ++** decb x0, all, mul #9 ++** ldnf1b z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_s8_m9, svint8_t, int8_t, ++ z0 = svldnf1_vnum_s8 (p0, x0, -9), ++ z0 = 
svldnf1_vnum (p0, x0, -9)) ++ ++/* ++** ldnf1_vnum_s8_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldnf1b z0\.b, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_s8_x1, svint8_t, int8_t, ++ z0 = svldnf1_vnum_s8 (p0, x0, x1), ++ z0 = svldnf1_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_u16.c +new file mode 100644 +index 000000000..e920ac43b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_u16.c +@@ -0,0 +1,154 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldnf1_u16_base: ++** ldnf1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_u16_base, svuint16_t, uint16_t, ++ z0 = svldnf1_u16 (p0, x0), ++ z0 = svldnf1 (p0, x0)) ++ ++/* ++** ldnf1_u16_index: ++** add (x[0-9]+), x0, x1, lsl 1 ++** ldnf1h z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ldnf1_u16_index, svuint16_t, uint16_t, ++ z0 = svldnf1_u16 (p0, x0 + x1), ++ z0 = svldnf1 (p0, x0 + x1)) ++ ++/* ++** ldnf1_u16_1: ++** ldnf1h z0\.h, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_u16_1, svuint16_t, uint16_t, ++ z0 = svldnf1_u16 (p0, x0 + svcnth ()), ++ z0 = svldnf1 (p0, x0 + svcnth ())) ++ ++/* ++** ldnf1_u16_7: ++** ldnf1h z0\.h, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_u16_7, svuint16_t, uint16_t, ++ z0 = svldnf1_u16 (p0, x0 + svcnth () * 7), ++ z0 = svldnf1 (p0, x0 + svcnth () * 7)) ++ ++/* ++** ldnf1_u16_8: ++** incb x0, all, mul #8 ++** ldnf1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_u16_8, svuint16_t, uint16_t, ++ z0 = svldnf1_u16 (p0, x0 + svcnth () * 8), ++ z0 = svldnf1 (p0, x0 + svcnth () * 8)) ++ ++/* ++** ldnf1_u16_m1: ++** ldnf1h z0\.h, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_u16_m1, svuint16_t, uint16_t, ++ z0 = svldnf1_u16 (p0, x0 - svcnth ()), ++ z0 = svldnf1 (p0, x0 - svcnth ())) ++ ++/* ++** ldnf1_u16_m8: ++** ldnf1h z0\.h, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_u16_m8, svuint16_t, uint16_t, ++ z0 = svldnf1_u16 (p0, x0 - svcnth () * 8), ++ z0 = svldnf1 (p0, x0 - svcnth () * 8)) ++ ++/* ++** ldnf1_u16_m9: ++** decb x0, all, mul #9 ++** ldnf1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_u16_m9, svuint16_t, uint16_t, ++ z0 = svldnf1_u16 (p0, x0 - svcnth () * 9), ++ z0 = svldnf1 (p0, x0 - svcnth () * 9)) ++ ++/* ++** ldnf1_vnum_u16_0: ++** ldnf1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_u16_0, svuint16_t, uint16_t, ++ z0 = svldnf1_vnum_u16 (p0, x0, 0), ++ z0 = svldnf1_vnum (p0, x0, 0)) ++ ++/* ++** ldnf1_vnum_u16_1: ++** ldnf1h z0\.h, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_u16_1, svuint16_t, uint16_t, ++ z0 = svldnf1_vnum_u16 (p0, x0, 1), ++ z0 = svldnf1_vnum (p0, x0, 1)) ++ ++/* ++** ldnf1_vnum_u16_7: ++** ldnf1h z0\.h, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_u16_7, svuint16_t, uint16_t, ++ z0 = svldnf1_vnum_u16 (p0, x0, 7), ++ z0 = svldnf1_vnum (p0, x0, 7)) ++ ++/* ++** ldnf1_vnum_u16_8: ++** incb x0, all, mul #8 ++** ldnf1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_u16_8, svuint16_t, uint16_t, ++ z0 = svldnf1_vnum_u16 (p0, x0, 8), ++ z0 = svldnf1_vnum (p0, x0, 8)) ++ ++/* ++** ldnf1_vnum_u16_m1: ++** ldnf1h z0\.h, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_u16_m1, svuint16_t, uint16_t, ++ z0 = svldnf1_vnum_u16 (p0, x0, -1), ++ z0 = svldnf1_vnum (p0, x0, -1)) ++ ++/* ++** ldnf1_vnum_u16_m8: 
++** ldnf1h z0\.h, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_u16_m8, svuint16_t, uint16_t, ++ z0 = svldnf1_vnum_u16 (p0, x0, -8), ++ z0 = svldnf1_vnum (p0, x0, -8)) ++ ++/* ++** ldnf1_vnum_u16_m9: ++** decb x0, all, mul #9 ++** ldnf1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_u16_m9, svuint16_t, uint16_t, ++ z0 = svldnf1_vnum_u16 (p0, x0, -9), ++ z0 = svldnf1_vnum (p0, x0, -9)) ++ ++/* ++** ldnf1_vnum_u16_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldnf1h z0\.h, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_u16_x1, svuint16_t, uint16_t, ++ z0 = svldnf1_vnum_u16 (p0, x0, x1), ++ z0 = svldnf1_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_u32.c +new file mode 100644 +index 000000000..65e28c5c2 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_u32.c +@@ -0,0 +1,154 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldnf1_u32_base: ++** ldnf1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_u32_base, svuint32_t, uint32_t, ++ z0 = svldnf1_u32 (p0, x0), ++ z0 = svldnf1 (p0, x0)) ++ ++/* ++** ldnf1_u32_index: ++** add (x[0-9]+), x0, x1, lsl 2 ++** ldnf1w z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ldnf1_u32_index, svuint32_t, uint32_t, ++ z0 = svldnf1_u32 (p0, x0 + x1), ++ z0 = svldnf1 (p0, x0 + x1)) ++ ++/* ++** ldnf1_u32_1: ++** ldnf1w z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_u32_1, svuint32_t, uint32_t, ++ z0 = svldnf1_u32 (p0, x0 + svcntw ()), ++ z0 = svldnf1 (p0, x0 + svcntw ())) ++ ++/* ++** ldnf1_u32_7: ++** ldnf1w z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_u32_7, svuint32_t, uint32_t, ++ z0 = svldnf1_u32 (p0, x0 + svcntw () * 7), ++ z0 = svldnf1 (p0, x0 + svcntw () * 7)) ++ ++/* ++** ldnf1_u32_8: ++** incb x0, all, mul #8 ++** ldnf1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_u32_8, svuint32_t, uint32_t, ++ z0 = svldnf1_u32 (p0, x0 + svcntw () * 8), ++ z0 = svldnf1 (p0, x0 + svcntw () * 8)) ++ ++/* ++** ldnf1_u32_m1: ++** ldnf1w z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_u32_m1, svuint32_t, uint32_t, ++ z0 = svldnf1_u32 (p0, x0 - svcntw ()), ++ z0 = svldnf1 (p0, x0 - svcntw ())) ++ ++/* ++** ldnf1_u32_m8: ++** ldnf1w z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_u32_m8, svuint32_t, uint32_t, ++ z0 = svldnf1_u32 (p0, x0 - svcntw () * 8), ++ z0 = svldnf1 (p0, x0 - svcntw () * 8)) ++ ++/* ++** ldnf1_u32_m9: ++** decb x0, all, mul #9 ++** ldnf1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_u32_m9, svuint32_t, uint32_t, ++ z0 = svldnf1_u32 (p0, x0 - svcntw () * 9), ++ z0 = svldnf1 (p0, x0 - svcntw () * 9)) ++ ++/* ++** ldnf1_vnum_u32_0: ++** ldnf1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_u32_0, svuint32_t, uint32_t, ++ z0 = svldnf1_vnum_u32 (p0, x0, 0), ++ z0 = svldnf1_vnum (p0, x0, 0)) ++ ++/* ++** ldnf1_vnum_u32_1: ++** ldnf1w z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_u32_1, svuint32_t, uint32_t, ++ z0 = svldnf1_vnum_u32 (p0, x0, 1), ++ z0 = svldnf1_vnum (p0, x0, 1)) ++ ++/* ++** ldnf1_vnum_u32_7: ++** ldnf1w z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_u32_7, svuint32_t, uint32_t, ++ z0 = svldnf1_vnum_u32 (p0, x0, 7), ++ z0 = svldnf1_vnum (p0, x0, 7)) ++ ++/* ++** ldnf1_vnum_u32_8: ++** incb x0, all, mul #8 ++** ldnf1w z0\.s, p0/z, 
\[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_u32_8, svuint32_t, uint32_t, ++ z0 = svldnf1_vnum_u32 (p0, x0, 8), ++ z0 = svldnf1_vnum (p0, x0, 8)) ++ ++/* ++** ldnf1_vnum_u32_m1: ++** ldnf1w z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_u32_m1, svuint32_t, uint32_t, ++ z0 = svldnf1_vnum_u32 (p0, x0, -1), ++ z0 = svldnf1_vnum (p0, x0, -1)) ++ ++/* ++** ldnf1_vnum_u32_m8: ++** ldnf1w z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_u32_m8, svuint32_t, uint32_t, ++ z0 = svldnf1_vnum_u32 (p0, x0, -8), ++ z0 = svldnf1_vnum (p0, x0, -8)) ++ ++/* ++** ldnf1_vnum_u32_m9: ++** decb x0, all, mul #9 ++** ldnf1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_u32_m9, svuint32_t, uint32_t, ++ z0 = svldnf1_vnum_u32 (p0, x0, -9), ++ z0 = svldnf1_vnum (p0, x0, -9)) ++ ++/* ++** ldnf1_vnum_u32_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldnf1w z0\.s, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_u32_x1, svuint32_t, uint32_t, ++ z0 = svldnf1_vnum_u32 (p0, x0, x1), ++ z0 = svldnf1_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_u64.c +new file mode 100644 +index 000000000..70d3f27d8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_u64.c +@@ -0,0 +1,154 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldnf1_u64_base: ++** ldnf1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_u64_base, svuint64_t, uint64_t, ++ z0 = svldnf1_u64 (p0, x0), ++ z0 = svldnf1 (p0, x0)) ++ ++/* ++** ldnf1_u64_index: ++** add (x[0-9]+), x0, x1, lsl 3 ++** ldnf1d z0\.d, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ldnf1_u64_index, svuint64_t, uint64_t, ++ z0 = svldnf1_u64 (p0, x0 + x1), ++ z0 = svldnf1 (p0, x0 + x1)) ++ ++/* ++** ldnf1_u64_1: ++** ldnf1d z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_u64_1, svuint64_t, uint64_t, ++ z0 = svldnf1_u64 (p0, x0 + svcntd ()), ++ z0 = svldnf1 (p0, x0 + svcntd ())) ++ ++/* ++** ldnf1_u64_7: ++** ldnf1d z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_u64_7, svuint64_t, uint64_t, ++ z0 = svldnf1_u64 (p0, x0 + svcntd () * 7), ++ z0 = svldnf1 (p0, x0 + svcntd () * 7)) ++ ++/* ++** ldnf1_u64_8: ++** incb x0, all, mul #8 ++** ldnf1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_u64_8, svuint64_t, uint64_t, ++ z0 = svldnf1_u64 (p0, x0 + svcntd () * 8), ++ z0 = svldnf1 (p0, x0 + svcntd () * 8)) ++ ++/* ++** ldnf1_u64_m1: ++** ldnf1d z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_u64_m1, svuint64_t, uint64_t, ++ z0 = svldnf1_u64 (p0, x0 - svcntd ()), ++ z0 = svldnf1 (p0, x0 - svcntd ())) ++ ++/* ++** ldnf1_u64_m8: ++** ldnf1d z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_u64_m8, svuint64_t, uint64_t, ++ z0 = svldnf1_u64 (p0, x0 - svcntd () * 8), ++ z0 = svldnf1 (p0, x0 - svcntd () * 8)) ++ ++/* ++** ldnf1_u64_m9: ++** decb x0, all, mul #9 ++** ldnf1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_u64_m9, svuint64_t, uint64_t, ++ z0 = svldnf1_u64 (p0, x0 - svcntd () * 9), ++ z0 = svldnf1 (p0, x0 - svcntd () * 9)) ++ ++/* ++** ldnf1_vnum_u64_0: ++** ldnf1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_u64_0, svuint64_t, uint64_t, ++ z0 = svldnf1_vnum_u64 (p0, x0, 0), ++ z0 = svldnf1_vnum (p0, x0, 0)) ++ ++/* ++** ldnf1_vnum_u64_1: ++** ldnf1d z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD 
(ldnf1_vnum_u64_1, svuint64_t, uint64_t, ++ z0 = svldnf1_vnum_u64 (p0, x0, 1), ++ z0 = svldnf1_vnum (p0, x0, 1)) ++ ++/* ++** ldnf1_vnum_u64_7: ++** ldnf1d z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_u64_7, svuint64_t, uint64_t, ++ z0 = svldnf1_vnum_u64 (p0, x0, 7), ++ z0 = svldnf1_vnum (p0, x0, 7)) ++ ++/* ++** ldnf1_vnum_u64_8: ++** incb x0, all, mul #8 ++** ldnf1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_u64_8, svuint64_t, uint64_t, ++ z0 = svldnf1_vnum_u64 (p0, x0, 8), ++ z0 = svldnf1_vnum (p0, x0, 8)) ++ ++/* ++** ldnf1_vnum_u64_m1: ++** ldnf1d z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_u64_m1, svuint64_t, uint64_t, ++ z0 = svldnf1_vnum_u64 (p0, x0, -1), ++ z0 = svldnf1_vnum (p0, x0, -1)) ++ ++/* ++** ldnf1_vnum_u64_m8: ++** ldnf1d z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_u64_m8, svuint64_t, uint64_t, ++ z0 = svldnf1_vnum_u64 (p0, x0, -8), ++ z0 = svldnf1_vnum (p0, x0, -8)) ++ ++/* ++** ldnf1_vnum_u64_m9: ++** decb x0, all, mul #9 ++** ldnf1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_u64_m9, svuint64_t, uint64_t, ++ z0 = svldnf1_vnum_u64 (p0, x0, -9), ++ z0 = svldnf1_vnum (p0, x0, -9)) ++ ++/* ++** ldnf1_vnum_u64_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldnf1d z0\.d, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_u64_x1, svuint64_t, uint64_t, ++ z0 = svldnf1_vnum_u64 (p0, x0, x1), ++ z0 = svldnf1_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_u8.c +new file mode 100644 +index 000000000..5c29f1d19 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_u8.c +@@ -0,0 +1,154 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldnf1_u8_base: ++** ldnf1b z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_u8_base, svuint8_t, uint8_t, ++ z0 = svldnf1_u8 (p0, x0), ++ z0 = svldnf1 (p0, x0)) ++ ++/* ++** ldnf1_u8_index: ++** add (x[0-9]+), x0, x1 ++** ldnf1b z0\.b, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ldnf1_u8_index, svuint8_t, uint8_t, ++ z0 = svldnf1_u8 (p0, x0 + x1), ++ z0 = svldnf1 (p0, x0 + x1)) ++ ++/* ++** ldnf1_u8_1: ++** ldnf1b z0\.b, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_u8_1, svuint8_t, uint8_t, ++ z0 = svldnf1_u8 (p0, x0 + svcntb ()), ++ z0 = svldnf1 (p0, x0 + svcntb ())) ++ ++/* ++** ldnf1_u8_7: ++** ldnf1b z0\.b, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_u8_7, svuint8_t, uint8_t, ++ z0 = svldnf1_u8 (p0, x0 + svcntb () * 7), ++ z0 = svldnf1 (p0, x0 + svcntb () * 7)) ++ ++/* ++** ldnf1_u8_8: ++** incb x0, all, mul #8 ++** ldnf1b z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_u8_8, svuint8_t, uint8_t, ++ z0 = svldnf1_u8 (p0, x0 + svcntb () * 8), ++ z0 = svldnf1 (p0, x0 + svcntb () * 8)) ++ ++/* ++** ldnf1_u8_m1: ++** ldnf1b z0\.b, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_u8_m1, svuint8_t, uint8_t, ++ z0 = svldnf1_u8 (p0, x0 - svcntb ()), ++ z0 = svldnf1 (p0, x0 - svcntb ())) ++ ++/* ++** ldnf1_u8_m8: ++** ldnf1b z0\.b, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_u8_m8, svuint8_t, uint8_t, ++ z0 = svldnf1_u8 (p0, x0 - svcntb () * 8), ++ z0 = svldnf1 (p0, x0 - svcntb () * 8)) ++ ++/* ++** ldnf1_u8_m9: ++** decb x0, all, mul #9 ++** ldnf1b z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_u8_m9, svuint8_t, uint8_t, ++ z0 = svldnf1_u8 (p0, x0 - svcntb () * 9), ++ z0 = svldnf1 (p0, x0 - svcntb () * 9)) ++ ++/* ++** ldnf1_vnum_u8_0: ++** ldnf1b z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_u8_0, svuint8_t, uint8_t, ++ z0 = svldnf1_vnum_u8 (p0, x0, 0), ++ z0 = svldnf1_vnum (p0, x0, 0)) ++ ++/* ++** ldnf1_vnum_u8_1: ++** ldnf1b z0\.b, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_u8_1, svuint8_t, uint8_t, ++ z0 = svldnf1_vnum_u8 (p0, x0, 1), ++ z0 = svldnf1_vnum (p0, x0, 1)) ++ ++/* ++** ldnf1_vnum_u8_7: ++** ldnf1b z0\.b, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_u8_7, svuint8_t, uint8_t, ++ z0 = svldnf1_vnum_u8 (p0, x0, 7), ++ z0 = svldnf1_vnum (p0, x0, 7)) ++ ++/* ++** ldnf1_vnum_u8_8: ++** incb x0, all, mul #8 ++** ldnf1b z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_u8_8, svuint8_t, uint8_t, ++ z0 = svldnf1_vnum_u8 (p0, x0, 8), ++ z0 = svldnf1_vnum (p0, x0, 8)) ++ ++/* ++** ldnf1_vnum_u8_m1: ++** ldnf1b z0\.b, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_u8_m1, svuint8_t, uint8_t, ++ z0 = svldnf1_vnum_u8 (p0, x0, -1), ++ z0 = svldnf1_vnum (p0, x0, -1)) ++ ++/* ++** ldnf1_vnum_u8_m8: ++** ldnf1b z0\.b, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_u8_m8, svuint8_t, uint8_t, ++ z0 = svldnf1_vnum_u8 (p0, x0, -8), ++ z0 = svldnf1_vnum (p0, x0, -8)) ++ ++/* ++** ldnf1_vnum_u8_m9: ++** decb x0, all, mul #9 ++** ldnf1b z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_u8_m9, svuint8_t, uint8_t, ++ z0 = svldnf1_vnum_u8 (p0, x0, -9), ++ z0 = svldnf1_vnum (p0, x0, -9)) ++ ++/* ++** ldnf1_vnum_u8_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldnf1b z0\.b, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldnf1_vnum_u8_x1, svuint8_t, uint8_t, ++ z0 = svldnf1_vnum_u8 (p0, x0, x1), ++ z0 = svldnf1_vnum (p0, x0, x1)) +diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sb_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sb_s16.c +new file mode 100644 +index 000000000..e04b9a788 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sb_s16.c +@@ -0,0 +1,154 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldnf1sb_s16_base: ++** ldnf1sb z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_s16_base, svint16_t, int8_t, ++ z0 = svldnf1sb_s16 (p0, x0), ++ z0 = svldnf1sb_s16 (p0, x0)) ++ ++/* ++** ldnf1sb_s16_index: ++** add (x[0-9]+), x0, x1 ++** ldnf1sb z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_s16_index, svint16_t, int8_t, ++ z0 = svldnf1sb_s16 (p0, x0 + x1), ++ z0 = svldnf1sb_s16 (p0, x0 + x1)) ++ ++/* ++** ldnf1sb_s16_1: ++** ldnf1sb z0\.h, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_s16_1, svint16_t, int8_t, ++ z0 = svldnf1sb_s16 (p0, x0 + svcnth ()), ++ z0 = svldnf1sb_s16 (p0, x0 + svcnth ())) ++ ++/* ++** ldnf1sb_s16_7: ++** ldnf1sb z0\.h, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_s16_7, svint16_t, int8_t, ++ z0 = svldnf1sb_s16 (p0, x0 + svcnth () * 7), ++ z0 = svldnf1sb_s16 (p0, x0 + svcnth () * 7)) ++ ++/* ++** ldnf1sb_s16_8: ++** incb x0, all, mul #4 ++** ldnf1sb z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_s16_8, svint16_t, int8_t, ++ z0 = svldnf1sb_s16 (p0, x0 + svcnth () * 8), ++ z0 = svldnf1sb_s16 (p0, x0 + svcnth () * 8)) ++ ++/* ++** ldnf1sb_s16_m1: ++** ldnf1sb z0\.h, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_s16_m1, svint16_t, int8_t, ++ z0 = svldnf1sb_s16 (p0, x0 - svcnth ()), ++ z0 = svldnf1sb_s16 (p0, x0 - svcnth ())) ++ ++/* ++** ldnf1sb_s16_m8: ++** ldnf1sb z0\.h, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_s16_m8, svint16_t, int8_t, ++ z0 = svldnf1sb_s16 (p0, x0 - svcnth () * 8), ++ z0 = svldnf1sb_s16 (p0, x0 - svcnth () * 8)) ++ ++/* ++** ldnf1sb_s16_m9: ++** dech x0, all, mul #9 ++** ldnf1sb z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_s16_m9, svint16_t, int8_t, ++ z0 = svldnf1sb_s16 (p0, x0 - svcnth () * 9), ++ z0 = svldnf1sb_s16 (p0, x0 - svcnth () * 9)) ++ ++/* ++** ldnf1sb_vnum_s16_0: ++** ldnf1sb z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_vnum_s16_0, svint16_t, int8_t, ++ z0 = svldnf1sb_vnum_s16 (p0, x0, 0), ++ z0 = svldnf1sb_vnum_s16 (p0, x0, 0)) ++ ++/* ++** ldnf1sb_vnum_s16_1: ++** ldnf1sb z0\.h, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_vnum_s16_1, svint16_t, int8_t, ++ z0 = svldnf1sb_vnum_s16 (p0, x0, 1), ++ z0 = svldnf1sb_vnum_s16 (p0, x0, 1)) ++ ++/* ++** ldnf1sb_vnum_s16_7: ++** ldnf1sb z0\.h, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_vnum_s16_7, svint16_t, int8_t, ++ z0 = svldnf1sb_vnum_s16 (p0, x0, 7), ++ z0 = svldnf1sb_vnum_s16 (p0, x0, 7)) ++ ++/* ++** ldnf1sb_vnum_s16_8: ++** incb x0, all, mul #4 ++** ldnf1sb z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_vnum_s16_8, svint16_t, int8_t, ++ z0 = svldnf1sb_vnum_s16 (p0, x0, 8), ++ z0 = svldnf1sb_vnum_s16 (p0, x0, 8)) ++ ++/* ++** ldnf1sb_vnum_s16_m1: ++** ldnf1sb z0\.h, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_vnum_s16_m1, svint16_t, int8_t, ++ z0 = svldnf1sb_vnum_s16 (p0, x0, -1), ++ z0 = svldnf1sb_vnum_s16 (p0, x0, -1)) ++ ++/* ++** ldnf1sb_vnum_s16_m8: ++** ldnf1sb z0\.h, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_vnum_s16_m8, svint16_t, int8_t, ++ z0 = svldnf1sb_vnum_s16 (p0, x0, -8), ++ z0 = 
svldnf1sb_vnum_s16 (p0, x0, -8)) ++ ++/* ++** ldnf1sb_vnum_s16_m9: ++** dech x0, all, mul #9 ++** ldnf1sb z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_vnum_s16_m9, svint16_t, int8_t, ++ z0 = svldnf1sb_vnum_s16 (p0, x0, -9), ++ z0 = svldnf1sb_vnum_s16 (p0, x0, -9)) ++ ++/* ++** ldnf1sb_vnum_s16_x1: ++** cnth (x[0-9]+) ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldnf1sb z0\.h, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_vnum_s16_x1, svint16_t, int8_t, ++ z0 = svldnf1sb_vnum_s16 (p0, x0, x1), ++ z0 = svldnf1sb_vnum_s16 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sb_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sb_s32.c +new file mode 100644 +index 000000000..0553fc98d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sb_s32.c +@@ -0,0 +1,154 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldnf1sb_s32_base: ++** ldnf1sb z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_s32_base, svint32_t, int8_t, ++ z0 = svldnf1sb_s32 (p0, x0), ++ z0 = svldnf1sb_s32 (p0, x0)) ++ ++/* ++** ldnf1sb_s32_index: ++** add (x[0-9]+), x0, x1 ++** ldnf1sb z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_s32_index, svint32_t, int8_t, ++ z0 = svldnf1sb_s32 (p0, x0 + x1), ++ z0 = svldnf1sb_s32 (p0, x0 + x1)) ++ ++/* ++** ldnf1sb_s32_1: ++** ldnf1sb z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_s32_1, svint32_t, int8_t, ++ z0 = svldnf1sb_s32 (p0, x0 + svcntw ()), ++ z0 = svldnf1sb_s32 (p0, x0 + svcntw ())) ++ ++/* ++** ldnf1sb_s32_7: ++** ldnf1sb z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_s32_7, svint32_t, int8_t, ++ z0 = svldnf1sb_s32 (p0, x0 + svcntw () * 7), ++ z0 = svldnf1sb_s32 (p0, x0 + svcntw () * 7)) ++ ++/* ++** ldnf1sb_s32_8: ++** incb x0, all, mul #2 ++** ldnf1sb z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_s32_8, svint32_t, int8_t, ++ z0 = svldnf1sb_s32 (p0, x0 + svcntw () * 8), ++ z0 = svldnf1sb_s32 (p0, x0 + svcntw () * 8)) ++ ++/* ++** ldnf1sb_s32_m1: ++** ldnf1sb z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_s32_m1, svint32_t, int8_t, ++ z0 = svldnf1sb_s32 (p0, x0 - svcntw ()), ++ z0 = svldnf1sb_s32 (p0, x0 - svcntw ())) ++ ++/* ++** ldnf1sb_s32_m8: ++** ldnf1sb z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_s32_m8, svint32_t, int8_t, ++ z0 = svldnf1sb_s32 (p0, x0 - svcntw () * 8), ++ z0 = svldnf1sb_s32 (p0, x0 - svcntw () * 8)) ++ ++/* ++** ldnf1sb_s32_m9: ++** decw x0, all, mul #9 ++** ldnf1sb z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_s32_m9, svint32_t, int8_t, ++ z0 = svldnf1sb_s32 (p0, x0 - svcntw () * 9), ++ z0 = svldnf1sb_s32 (p0, x0 - svcntw () * 9)) ++ ++/* ++** ldnf1sb_vnum_s32_0: ++** ldnf1sb z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_vnum_s32_0, svint32_t, int8_t, ++ z0 = svldnf1sb_vnum_s32 (p0, x0, 0), ++ z0 = svldnf1sb_vnum_s32 (p0, x0, 0)) ++ ++/* ++** ldnf1sb_vnum_s32_1: ++** ldnf1sb z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_vnum_s32_1, svint32_t, int8_t, ++ z0 = svldnf1sb_vnum_s32 (p0, x0, 1), ++ z0 = svldnf1sb_vnum_s32 (p0, x0, 1)) ++ ++/* ++** ldnf1sb_vnum_s32_7: ++** ldnf1sb z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_vnum_s32_7, svint32_t, int8_t, ++ z0 = svldnf1sb_vnum_s32 (p0, x0, 7), ++ z0 = svldnf1sb_vnum_s32 (p0, x0, 7)) ++ ++/* ++** ldnf1sb_vnum_s32_8: ++** incb x0, all, mul #2 ++** ldnf1sb z0\.s, p0/z, \[x0\] ++** ret ++*/ 
++TEST_LOAD (ldnf1sb_vnum_s32_8, svint32_t, int8_t, ++ z0 = svldnf1sb_vnum_s32 (p0, x0, 8), ++ z0 = svldnf1sb_vnum_s32 (p0, x0, 8)) ++ ++/* ++** ldnf1sb_vnum_s32_m1: ++** ldnf1sb z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_vnum_s32_m1, svint32_t, int8_t, ++ z0 = svldnf1sb_vnum_s32 (p0, x0, -1), ++ z0 = svldnf1sb_vnum_s32 (p0, x0, -1)) ++ ++/* ++** ldnf1sb_vnum_s32_m8: ++** ldnf1sb z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_vnum_s32_m8, svint32_t, int8_t, ++ z0 = svldnf1sb_vnum_s32 (p0, x0, -8), ++ z0 = svldnf1sb_vnum_s32 (p0, x0, -8)) ++ ++/* ++** ldnf1sb_vnum_s32_m9: ++** decw x0, all, mul #9 ++** ldnf1sb z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_vnum_s32_m9, svint32_t, int8_t, ++ z0 = svldnf1sb_vnum_s32 (p0, x0, -9), ++ z0 = svldnf1sb_vnum_s32 (p0, x0, -9)) ++ ++/* ++** ldnf1sb_vnum_s32_x1: ++** cntw (x[0-9]+) ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldnf1sb z0\.s, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_vnum_s32_x1, svint32_t, int8_t, ++ z0 = svldnf1sb_vnum_s32 (p0, x0, x1), ++ z0 = svldnf1sb_vnum_s32 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sb_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sb_s64.c +new file mode 100644 +index 000000000..61a474fdf +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sb_s64.c +@@ -0,0 +1,154 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldnf1sb_s64_base: ++** ldnf1sb z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_s64_base, svint64_t, int8_t, ++ z0 = svldnf1sb_s64 (p0, x0), ++ z0 = svldnf1sb_s64 (p0, x0)) ++ ++/* ++** ldnf1sb_s64_index: ++** add (x[0-9]+), x0, x1 ++** ldnf1sb z0\.d, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_s64_index, svint64_t, int8_t, ++ z0 = svldnf1sb_s64 (p0, x0 + x1), ++ z0 = svldnf1sb_s64 (p0, x0 + x1)) ++ ++/* ++** ldnf1sb_s64_1: ++** ldnf1sb z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_s64_1, svint64_t, int8_t, ++ z0 = svldnf1sb_s64 (p0, x0 + svcntd ()), ++ z0 = svldnf1sb_s64 (p0, x0 + svcntd ())) ++ ++/* ++** ldnf1sb_s64_7: ++** ldnf1sb z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_s64_7, svint64_t, int8_t, ++ z0 = svldnf1sb_s64 (p0, x0 + svcntd () * 7), ++ z0 = svldnf1sb_s64 (p0, x0 + svcntd () * 7)) ++ ++/* ++** ldnf1sb_s64_8: ++** incb x0 ++** ldnf1sb z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_s64_8, svint64_t, int8_t, ++ z0 = svldnf1sb_s64 (p0, x0 + svcntd () * 8), ++ z0 = svldnf1sb_s64 (p0, x0 + svcntd () * 8)) ++ ++/* ++** ldnf1sb_s64_m1: ++** ldnf1sb z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_s64_m1, svint64_t, int8_t, ++ z0 = svldnf1sb_s64 (p0, x0 - svcntd ()), ++ z0 = svldnf1sb_s64 (p0, x0 - svcntd ())) ++ ++/* ++** ldnf1sb_s64_m8: ++** ldnf1sb z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_s64_m8, svint64_t, int8_t, ++ z0 = svldnf1sb_s64 (p0, x0 - svcntd () * 8), ++ z0 = svldnf1sb_s64 (p0, x0 - svcntd () * 8)) ++ ++/* ++** ldnf1sb_s64_m9: ++** decd x0, all, mul #9 ++** ldnf1sb z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_s64_m9, svint64_t, int8_t, ++ z0 = svldnf1sb_s64 (p0, x0 - svcntd () * 9), ++ z0 = svldnf1sb_s64 (p0, x0 - svcntd () * 9)) ++ ++/* ++** ldnf1sb_vnum_s64_0: ++** ldnf1sb z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_vnum_s64_0, svint64_t, int8_t, ++ z0 = svldnf1sb_vnum_s64 (p0, x0, 0), ++ z0 = svldnf1sb_vnum_s64 (p0, x0, 0)) ++ ++/* 
++** ldnf1sb_vnum_s64_1: ++** ldnf1sb z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_vnum_s64_1, svint64_t, int8_t, ++ z0 = svldnf1sb_vnum_s64 (p0, x0, 1), ++ z0 = svldnf1sb_vnum_s64 (p0, x0, 1)) ++ ++/* ++** ldnf1sb_vnum_s64_7: ++** ldnf1sb z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_vnum_s64_7, svint64_t, int8_t, ++ z0 = svldnf1sb_vnum_s64 (p0, x0, 7), ++ z0 = svldnf1sb_vnum_s64 (p0, x0, 7)) ++ ++/* ++** ldnf1sb_vnum_s64_8: ++** incb x0 ++** ldnf1sb z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_vnum_s64_8, svint64_t, int8_t, ++ z0 = svldnf1sb_vnum_s64 (p0, x0, 8), ++ z0 = svldnf1sb_vnum_s64 (p0, x0, 8)) ++ ++/* ++** ldnf1sb_vnum_s64_m1: ++** ldnf1sb z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_vnum_s64_m1, svint64_t, int8_t, ++ z0 = svldnf1sb_vnum_s64 (p0, x0, -1), ++ z0 = svldnf1sb_vnum_s64 (p0, x0, -1)) ++ ++/* ++** ldnf1sb_vnum_s64_m8: ++** ldnf1sb z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_vnum_s64_m8, svint64_t, int8_t, ++ z0 = svldnf1sb_vnum_s64 (p0, x0, -8), ++ z0 = svldnf1sb_vnum_s64 (p0, x0, -8)) ++ ++/* ++** ldnf1sb_vnum_s64_m9: ++** decd x0, all, mul #9 ++** ldnf1sb z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_vnum_s64_m9, svint64_t, int8_t, ++ z0 = svldnf1sb_vnum_s64 (p0, x0, -9), ++ z0 = svldnf1sb_vnum_s64 (p0, x0, -9)) ++ ++/* ++** ldnf1sb_vnum_s64_x1: ++** cntd (x[0-9]+) ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldnf1sb z0\.d, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_vnum_s64_x1, svint64_t, int8_t, ++ z0 = svldnf1sb_vnum_s64 (p0, x0, x1), ++ z0 = svldnf1sb_vnum_s64 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sb_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sb_u16.c +new file mode 100644 +index 000000000..be63d8bf9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sb_u16.c +@@ -0,0 +1,154 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldnf1sb_u16_base: ++** ldnf1sb z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_u16_base, svuint16_t, int8_t, ++ z0 = svldnf1sb_u16 (p0, x0), ++ z0 = svldnf1sb_u16 (p0, x0)) ++ ++/* ++** ldnf1sb_u16_index: ++** add (x[0-9]+), x0, x1 ++** ldnf1sb z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_u16_index, svuint16_t, int8_t, ++ z0 = svldnf1sb_u16 (p0, x0 + x1), ++ z0 = svldnf1sb_u16 (p0, x0 + x1)) ++ ++/* ++** ldnf1sb_u16_1: ++** ldnf1sb z0\.h, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_u16_1, svuint16_t, int8_t, ++ z0 = svldnf1sb_u16 (p0, x0 + svcnth ()), ++ z0 = svldnf1sb_u16 (p0, x0 + svcnth ())) ++ ++/* ++** ldnf1sb_u16_7: ++** ldnf1sb z0\.h, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_u16_7, svuint16_t, int8_t, ++ z0 = svldnf1sb_u16 (p0, x0 + svcnth () * 7), ++ z0 = svldnf1sb_u16 (p0, x0 + svcnth () * 7)) ++ ++/* ++** ldnf1sb_u16_8: ++** incb x0, all, mul #4 ++** ldnf1sb z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_u16_8, svuint16_t, int8_t, ++ z0 = svldnf1sb_u16 (p0, x0 + svcnth () * 8), ++ z0 = svldnf1sb_u16 (p0, x0 + svcnth () * 8)) ++ ++/* ++** ldnf1sb_u16_m1: ++** ldnf1sb z0\.h, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_u16_m1, svuint16_t, int8_t, ++ z0 = svldnf1sb_u16 (p0, x0 - svcnth ()), ++ z0 = svldnf1sb_u16 (p0, x0 - svcnth ())) ++ ++/* ++** ldnf1sb_u16_m8: ++** ldnf1sb z0\.h, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_u16_m8, svuint16_t, int8_t, ++ z0 = svldnf1sb_u16 (p0, x0 - svcnth () * 8), ++ z0 = svldnf1sb_u16 (p0, x0 - svcnth () * 8)) ++ ++/* ++** ldnf1sb_u16_m9: ++** dech x0, all, mul #9 ++** ldnf1sb z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_u16_m9, svuint16_t, int8_t, ++ z0 = svldnf1sb_u16 (p0, x0 - svcnth () * 9), ++ z0 = svldnf1sb_u16 (p0, x0 - svcnth () * 9)) ++ ++/* ++** ldnf1sb_vnum_u16_0: ++** ldnf1sb z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_vnum_u16_0, svuint16_t, int8_t, ++ z0 = svldnf1sb_vnum_u16 (p0, x0, 0), ++ z0 = svldnf1sb_vnum_u16 (p0, x0, 0)) ++ ++/* ++** ldnf1sb_vnum_u16_1: ++** ldnf1sb z0\.h, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_vnum_u16_1, svuint16_t, int8_t, ++ z0 = svldnf1sb_vnum_u16 (p0, x0, 1), ++ z0 = svldnf1sb_vnum_u16 (p0, x0, 1)) ++ ++/* ++** ldnf1sb_vnum_u16_7: ++** ldnf1sb z0\.h, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_vnum_u16_7, svuint16_t, int8_t, ++ z0 = svldnf1sb_vnum_u16 (p0, x0, 7), ++ z0 = svldnf1sb_vnum_u16 (p0, x0, 7)) ++ ++/* ++** ldnf1sb_vnum_u16_8: ++** incb x0, all, mul #4 ++** ldnf1sb z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_vnum_u16_8, svuint16_t, int8_t, ++ z0 = svldnf1sb_vnum_u16 (p0, x0, 8), ++ z0 = svldnf1sb_vnum_u16 (p0, x0, 8)) ++ ++/* ++** ldnf1sb_vnum_u16_m1: ++** ldnf1sb z0\.h, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_vnum_u16_m1, svuint16_t, int8_t, ++ z0 = svldnf1sb_vnum_u16 (p0, x0, -1), ++ z0 = svldnf1sb_vnum_u16 (p0, x0, -1)) ++ ++/* ++** ldnf1sb_vnum_u16_m8: ++** ldnf1sb z0\.h, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_vnum_u16_m8, svuint16_t, int8_t, ++ z0 = svldnf1sb_vnum_u16 (p0, x0, -8), ++ z0 = svldnf1sb_vnum_u16 (p0, x0, -8)) ++ ++/* ++** ldnf1sb_vnum_u16_m9: ++** dech x0, all, mul #9 ++** ldnf1sb z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_vnum_u16_m9, svuint16_t, int8_t, ++ z0 = svldnf1sb_vnum_u16 (p0, x0, -9), ++ z0 = svldnf1sb_vnum_u16 (p0, x0, -9)) ++ ++/* ++** ldnf1sb_vnum_u16_x1: ++** cnth (x[0-9]+) ++** madd 
(x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldnf1sb z0\.h, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_vnum_u16_x1, svuint16_t, int8_t, ++ z0 = svldnf1sb_vnum_u16 (p0, x0, x1), ++ z0 = svldnf1sb_vnum_u16 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sb_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sb_u32.c +new file mode 100644 +index 000000000..4f52490b4 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sb_u32.c +@@ -0,0 +1,154 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldnf1sb_u32_base: ++** ldnf1sb z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_u32_base, svuint32_t, int8_t, ++ z0 = svldnf1sb_u32 (p0, x0), ++ z0 = svldnf1sb_u32 (p0, x0)) ++ ++/* ++** ldnf1sb_u32_index: ++** add (x[0-9]+), x0, x1 ++** ldnf1sb z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_u32_index, svuint32_t, int8_t, ++ z0 = svldnf1sb_u32 (p0, x0 + x1), ++ z0 = svldnf1sb_u32 (p0, x0 + x1)) ++ ++/* ++** ldnf1sb_u32_1: ++** ldnf1sb z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_u32_1, svuint32_t, int8_t, ++ z0 = svldnf1sb_u32 (p0, x0 + svcntw ()), ++ z0 = svldnf1sb_u32 (p0, x0 + svcntw ())) ++ ++/* ++** ldnf1sb_u32_7: ++** ldnf1sb z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_u32_7, svuint32_t, int8_t, ++ z0 = svldnf1sb_u32 (p0, x0 + svcntw () * 7), ++ z0 = svldnf1sb_u32 (p0, x0 + svcntw () * 7)) ++ ++/* ++** ldnf1sb_u32_8: ++** incb x0, all, mul #2 ++** ldnf1sb z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_u32_8, svuint32_t, int8_t, ++ z0 = svldnf1sb_u32 (p0, x0 + svcntw () * 8), ++ z0 = svldnf1sb_u32 (p0, x0 + svcntw () * 8)) ++ ++/* ++** ldnf1sb_u32_m1: ++** ldnf1sb z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_u32_m1, svuint32_t, int8_t, ++ z0 = svldnf1sb_u32 (p0, x0 - svcntw ()), ++ z0 = svldnf1sb_u32 (p0, x0 - svcntw ())) ++ ++/* ++** ldnf1sb_u32_m8: ++** ldnf1sb z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_u32_m8, svuint32_t, int8_t, ++ z0 = svldnf1sb_u32 (p0, x0 - svcntw () * 8), ++ z0 = svldnf1sb_u32 (p0, x0 - svcntw () * 8)) ++ ++/* ++** ldnf1sb_u32_m9: ++** decw x0, all, mul #9 ++** ldnf1sb z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_u32_m9, svuint32_t, int8_t, ++ z0 = svldnf1sb_u32 (p0, x0 - svcntw () * 9), ++ z0 = svldnf1sb_u32 (p0, x0 - svcntw () * 9)) ++ ++/* ++** ldnf1sb_vnum_u32_0: ++** ldnf1sb z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_vnum_u32_0, svuint32_t, int8_t, ++ z0 = svldnf1sb_vnum_u32 (p0, x0, 0), ++ z0 = svldnf1sb_vnum_u32 (p0, x0, 0)) ++ ++/* ++** ldnf1sb_vnum_u32_1: ++** ldnf1sb z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_vnum_u32_1, svuint32_t, int8_t, ++ z0 = svldnf1sb_vnum_u32 (p0, x0, 1), ++ z0 = svldnf1sb_vnum_u32 (p0, x0, 1)) ++ ++/* ++** ldnf1sb_vnum_u32_7: ++** ldnf1sb z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_vnum_u32_7, svuint32_t, int8_t, ++ z0 = svldnf1sb_vnum_u32 (p0, x0, 7), ++ z0 = svldnf1sb_vnum_u32 (p0, x0, 7)) ++ ++/* ++** ldnf1sb_vnum_u32_8: ++** incb x0, all, mul #2 ++** ldnf1sb z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_vnum_u32_8, svuint32_t, int8_t, ++ z0 = svldnf1sb_vnum_u32 (p0, x0, 8), ++ z0 = svldnf1sb_vnum_u32 (p0, x0, 8)) ++ ++/* ++** ldnf1sb_vnum_u32_m1: ++** ldnf1sb z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_vnum_u32_m1, svuint32_t, int8_t, ++ z0 = svldnf1sb_vnum_u32 (p0, x0, -1), ++ z0 
= svldnf1sb_vnum_u32 (p0, x0, -1)) ++ ++/* ++** ldnf1sb_vnum_u32_m8: ++** ldnf1sb z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_vnum_u32_m8, svuint32_t, int8_t, ++ z0 = svldnf1sb_vnum_u32 (p0, x0, -8), ++ z0 = svldnf1sb_vnum_u32 (p0, x0, -8)) ++ ++/* ++** ldnf1sb_vnum_u32_m9: ++** decw x0, all, mul #9 ++** ldnf1sb z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_vnum_u32_m9, svuint32_t, int8_t, ++ z0 = svldnf1sb_vnum_u32 (p0, x0, -9), ++ z0 = svldnf1sb_vnum_u32 (p0, x0, -9)) ++ ++/* ++** ldnf1sb_vnum_u32_x1: ++** cntw (x[0-9]+) ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldnf1sb z0\.s, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_vnum_u32_x1, svuint32_t, int8_t, ++ z0 = svldnf1sb_vnum_u32 (p0, x0, x1), ++ z0 = svldnf1sb_vnum_u32 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sb_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sb_u64.c +new file mode 100644 +index 000000000..73f50d182 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sb_u64.c +@@ -0,0 +1,154 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldnf1sb_u64_base: ++** ldnf1sb z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_u64_base, svuint64_t, int8_t, ++ z0 = svldnf1sb_u64 (p0, x0), ++ z0 = svldnf1sb_u64 (p0, x0)) ++ ++/* ++** ldnf1sb_u64_index: ++** add (x[0-9]+), x0, x1 ++** ldnf1sb z0\.d, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_u64_index, svuint64_t, int8_t, ++ z0 = svldnf1sb_u64 (p0, x0 + x1), ++ z0 = svldnf1sb_u64 (p0, x0 + x1)) ++ ++/* ++** ldnf1sb_u64_1: ++** ldnf1sb z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_u64_1, svuint64_t, int8_t, ++ z0 = svldnf1sb_u64 (p0, x0 + svcntd ()), ++ z0 = svldnf1sb_u64 (p0, x0 + svcntd ())) ++ ++/* ++** ldnf1sb_u64_7: ++** ldnf1sb z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_u64_7, svuint64_t, int8_t, ++ z0 = svldnf1sb_u64 (p0, x0 + svcntd () * 7), ++ z0 = svldnf1sb_u64 (p0, x0 + svcntd () * 7)) ++ ++/* ++** ldnf1sb_u64_8: ++** incb x0 ++** ldnf1sb z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_u64_8, svuint64_t, int8_t, ++ z0 = svldnf1sb_u64 (p0, x0 + svcntd () * 8), ++ z0 = svldnf1sb_u64 (p0, x0 + svcntd () * 8)) ++ ++/* ++** ldnf1sb_u64_m1: ++** ldnf1sb z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_u64_m1, svuint64_t, int8_t, ++ z0 = svldnf1sb_u64 (p0, x0 - svcntd ()), ++ z0 = svldnf1sb_u64 (p0, x0 - svcntd ())) ++ ++/* ++** ldnf1sb_u64_m8: ++** ldnf1sb z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_u64_m8, svuint64_t, int8_t, ++ z0 = svldnf1sb_u64 (p0, x0 - svcntd () * 8), ++ z0 = svldnf1sb_u64 (p0, x0 - svcntd () * 8)) ++ ++/* ++** ldnf1sb_u64_m9: ++** decd x0, all, mul #9 ++** ldnf1sb z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_u64_m9, svuint64_t, int8_t, ++ z0 = svldnf1sb_u64 (p0, x0 - svcntd () * 9), ++ z0 = svldnf1sb_u64 (p0, x0 - svcntd () * 9)) ++ ++/* ++** ldnf1sb_vnum_u64_0: ++** ldnf1sb z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_vnum_u64_0, svuint64_t, int8_t, ++ z0 = svldnf1sb_vnum_u64 (p0, x0, 0), ++ z0 = svldnf1sb_vnum_u64 (p0, x0, 0)) ++ ++/* ++** ldnf1sb_vnum_u64_1: ++** ldnf1sb z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_vnum_u64_1, svuint64_t, int8_t, ++ z0 = svldnf1sb_vnum_u64 (p0, x0, 1), ++ z0 = svldnf1sb_vnum_u64 (p0, x0, 1)) ++ ++/* ++** ldnf1sb_vnum_u64_7: ++** ldnf1sb z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ 
++TEST_LOAD (ldnf1sb_vnum_u64_7, svuint64_t, int8_t, ++ z0 = svldnf1sb_vnum_u64 (p0, x0, 7), ++ z0 = svldnf1sb_vnum_u64 (p0, x0, 7)) ++ ++/* ++** ldnf1sb_vnum_u64_8: ++** incb x0 ++** ldnf1sb z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_vnum_u64_8, svuint64_t, int8_t, ++ z0 = svldnf1sb_vnum_u64 (p0, x0, 8), ++ z0 = svldnf1sb_vnum_u64 (p0, x0, 8)) ++ ++/* ++** ldnf1sb_vnum_u64_m1: ++** ldnf1sb z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_vnum_u64_m1, svuint64_t, int8_t, ++ z0 = svldnf1sb_vnum_u64 (p0, x0, -1), ++ z0 = svldnf1sb_vnum_u64 (p0, x0, -1)) ++ ++/* ++** ldnf1sb_vnum_u64_m8: ++** ldnf1sb z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_vnum_u64_m8, svuint64_t, int8_t, ++ z0 = svldnf1sb_vnum_u64 (p0, x0, -8), ++ z0 = svldnf1sb_vnum_u64 (p0, x0, -8)) ++ ++/* ++** ldnf1sb_vnum_u64_m9: ++** decd x0, all, mul #9 ++** ldnf1sb z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_vnum_u64_m9, svuint64_t, int8_t, ++ z0 = svldnf1sb_vnum_u64 (p0, x0, -9), ++ z0 = svldnf1sb_vnum_u64 (p0, x0, -9)) ++ ++/* ++** ldnf1sb_vnum_u64_x1: ++** cntd (x[0-9]+) ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldnf1sb z0\.d, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldnf1sb_vnum_u64_x1, svuint64_t, int8_t, ++ z0 = svldnf1sb_vnum_u64 (p0, x0, x1), ++ z0 = svldnf1sb_vnum_u64 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sh_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sh_s32.c +new file mode 100644 +index 000000000..08c7dc6dd +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sh_s32.c +@@ -0,0 +1,154 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldnf1sh_s32_base: ++** ldnf1sh z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_s32_base, svint32_t, int16_t, ++ z0 = svldnf1sh_s32 (p0, x0), ++ z0 = svldnf1sh_s32 (p0, x0)) ++ ++/* ++** ldnf1sh_s32_index: ++** add (x[0-9]+), x0, x1, lsl 1 ++** ldnf1sh z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_s32_index, svint32_t, int16_t, ++ z0 = svldnf1sh_s32 (p0, x0 + x1), ++ z0 = svldnf1sh_s32 (p0, x0 + x1)) ++ ++/* ++** ldnf1sh_s32_1: ++** ldnf1sh z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_s32_1, svint32_t, int16_t, ++ z0 = svldnf1sh_s32 (p0, x0 + svcntw ()), ++ z0 = svldnf1sh_s32 (p0, x0 + svcntw ())) ++ ++/* ++** ldnf1sh_s32_7: ++** ldnf1sh z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_s32_7, svint32_t, int16_t, ++ z0 = svldnf1sh_s32 (p0, x0 + svcntw () * 7), ++ z0 = svldnf1sh_s32 (p0, x0 + svcntw () * 7)) ++ ++/* ++** ldnf1sh_s32_8: ++** incb x0, all, mul #4 ++** ldnf1sh z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_s32_8, svint32_t, int16_t, ++ z0 = svldnf1sh_s32 (p0, x0 + svcntw () * 8), ++ z0 = svldnf1sh_s32 (p0, x0 + svcntw () * 8)) ++ ++/* ++** ldnf1sh_s32_m1: ++** ldnf1sh z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_s32_m1, svint32_t, int16_t, ++ z0 = svldnf1sh_s32 (p0, x0 - svcntw ()), ++ z0 = svldnf1sh_s32 (p0, x0 - svcntw ())) ++ ++/* ++** ldnf1sh_s32_m8: ++** ldnf1sh z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_s32_m8, svint32_t, int16_t, ++ z0 = svldnf1sh_s32 (p0, x0 - svcntw () * 8), ++ z0 = svldnf1sh_s32 (p0, x0 - svcntw () * 8)) ++ ++/* ++** ldnf1sh_s32_m9: ++** dech x0, all, mul #9 ++** ldnf1sh z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_s32_m9, svint32_t, int16_t, ++ z0 = svldnf1sh_s32 (p0, x0 - svcntw () * 9), ++ z0 = 
svldnf1sh_s32 (p0, x0 - svcntw () * 9)) ++ ++/* ++** ldnf1sh_vnum_s32_0: ++** ldnf1sh z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_vnum_s32_0, svint32_t, int16_t, ++ z0 = svldnf1sh_vnum_s32 (p0, x0, 0), ++ z0 = svldnf1sh_vnum_s32 (p0, x0, 0)) ++ ++/* ++** ldnf1sh_vnum_s32_1: ++** ldnf1sh z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_vnum_s32_1, svint32_t, int16_t, ++ z0 = svldnf1sh_vnum_s32 (p0, x0, 1), ++ z0 = svldnf1sh_vnum_s32 (p0, x0, 1)) ++ ++/* ++** ldnf1sh_vnum_s32_7: ++** ldnf1sh z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_vnum_s32_7, svint32_t, int16_t, ++ z0 = svldnf1sh_vnum_s32 (p0, x0, 7), ++ z0 = svldnf1sh_vnum_s32 (p0, x0, 7)) ++ ++/* ++** ldnf1sh_vnum_s32_8: ++** incb x0, all, mul #4 ++** ldnf1sh z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_vnum_s32_8, svint32_t, int16_t, ++ z0 = svldnf1sh_vnum_s32 (p0, x0, 8), ++ z0 = svldnf1sh_vnum_s32 (p0, x0, 8)) ++ ++/* ++** ldnf1sh_vnum_s32_m1: ++** ldnf1sh z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_vnum_s32_m1, svint32_t, int16_t, ++ z0 = svldnf1sh_vnum_s32 (p0, x0, -1), ++ z0 = svldnf1sh_vnum_s32 (p0, x0, -1)) ++ ++/* ++** ldnf1sh_vnum_s32_m8: ++** ldnf1sh z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_vnum_s32_m8, svint32_t, int16_t, ++ z0 = svldnf1sh_vnum_s32 (p0, x0, -8), ++ z0 = svldnf1sh_vnum_s32 (p0, x0, -8)) ++ ++/* ++** ldnf1sh_vnum_s32_m9: ++** dech x0, all, mul #9 ++** ldnf1sh z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_vnum_s32_m9, svint32_t, int16_t, ++ z0 = svldnf1sh_vnum_s32 (p0, x0, -9), ++ z0 = svldnf1sh_vnum_s32 (p0, x0, -9)) ++ ++/* ++** ldnf1sh_vnum_s32_x1: ++** cnth (x[0-9]+) ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldnf1sh z0\.s, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_vnum_s32_x1, svint32_t, int16_t, ++ z0 = svldnf1sh_vnum_s32 (p0, x0, x1), ++ z0 = svldnf1sh_vnum_s32 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sh_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sh_s64.c +new file mode 100644 +index 000000000..6a41bc26b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sh_s64.c +@@ -0,0 +1,154 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldnf1sh_s64_base: ++** ldnf1sh z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_s64_base, svint64_t, int16_t, ++ z0 = svldnf1sh_s64 (p0, x0), ++ z0 = svldnf1sh_s64 (p0, x0)) ++ ++/* ++** ldnf1sh_s64_index: ++** add (x[0-9]+), x0, x1, lsl 1 ++** ldnf1sh z0\.d, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_s64_index, svint64_t, int16_t, ++ z0 = svldnf1sh_s64 (p0, x0 + x1), ++ z0 = svldnf1sh_s64 (p0, x0 + x1)) ++ ++/* ++** ldnf1sh_s64_1: ++** ldnf1sh z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_s64_1, svint64_t, int16_t, ++ z0 = svldnf1sh_s64 (p0, x0 + svcntd ()), ++ z0 = svldnf1sh_s64 (p0, x0 + svcntd ())) ++ ++/* ++** ldnf1sh_s64_7: ++** ldnf1sh z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_s64_7, svint64_t, int16_t, ++ z0 = svldnf1sh_s64 (p0, x0 + svcntd () * 7), ++ z0 = svldnf1sh_s64 (p0, x0 + svcntd () * 7)) ++ ++/* ++** ldnf1sh_s64_8: ++** incb x0, all, mul #2 ++** ldnf1sh z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_s64_8, svint64_t, int16_t, ++ z0 = svldnf1sh_s64 (p0, x0 + svcntd () * 8), ++ z0 = svldnf1sh_s64 (p0, x0 + svcntd () * 8)) ++ ++/* ++** ldnf1sh_s64_m1: ++** ldnf1sh z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_s64_m1, svint64_t, int16_t, ++ z0 = svldnf1sh_s64 (p0, x0 - svcntd ()), ++ z0 = svldnf1sh_s64 (p0, x0 - svcntd ())) ++ ++/* ++** ldnf1sh_s64_m8: ++** ldnf1sh z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_s64_m8, svint64_t, int16_t, ++ z0 = svldnf1sh_s64 (p0, x0 - svcntd () * 8), ++ z0 = svldnf1sh_s64 (p0, x0 - svcntd () * 8)) ++ ++/* ++** ldnf1sh_s64_m9: ++** decw x0, all, mul #9 ++** ldnf1sh z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_s64_m9, svint64_t, int16_t, ++ z0 = svldnf1sh_s64 (p0, x0 - svcntd () * 9), ++ z0 = svldnf1sh_s64 (p0, x0 - svcntd () * 9)) ++ ++/* ++** ldnf1sh_vnum_s64_0: ++** ldnf1sh z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_vnum_s64_0, svint64_t, int16_t, ++ z0 = svldnf1sh_vnum_s64 (p0, x0, 0), ++ z0 = svldnf1sh_vnum_s64 (p0, x0, 0)) ++ ++/* ++** ldnf1sh_vnum_s64_1: ++** ldnf1sh z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_vnum_s64_1, svint64_t, int16_t, ++ z0 = svldnf1sh_vnum_s64 (p0, x0, 1), ++ z0 = svldnf1sh_vnum_s64 (p0, x0, 1)) ++ ++/* ++** ldnf1sh_vnum_s64_7: ++** ldnf1sh z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_vnum_s64_7, svint64_t, int16_t, ++ z0 = svldnf1sh_vnum_s64 (p0, x0, 7), ++ z0 = svldnf1sh_vnum_s64 (p0, x0, 7)) ++ ++/* ++** ldnf1sh_vnum_s64_8: ++** incb x0, all, mul #2 ++** ldnf1sh z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_vnum_s64_8, svint64_t, int16_t, ++ z0 = svldnf1sh_vnum_s64 (p0, x0, 8), ++ z0 = svldnf1sh_vnum_s64 (p0, x0, 8)) ++ ++/* ++** ldnf1sh_vnum_s64_m1: ++** ldnf1sh z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_vnum_s64_m1, svint64_t, int16_t, ++ z0 = svldnf1sh_vnum_s64 (p0, x0, -1), ++ z0 = svldnf1sh_vnum_s64 (p0, x0, -1)) ++ ++/* ++** ldnf1sh_vnum_s64_m8: ++** ldnf1sh z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_vnum_s64_m8, svint64_t, int16_t, ++ z0 = svldnf1sh_vnum_s64 (p0, x0, -8), ++ z0 = svldnf1sh_vnum_s64 (p0, x0, -8)) ++ ++/* ++** ldnf1sh_vnum_s64_m9: ++** decw x0, all, mul #9 ++** ldnf1sh z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_vnum_s64_m9, svint64_t, int16_t, ++ z0 = svldnf1sh_vnum_s64 (p0, x0, -9), ++ z0 = svldnf1sh_vnum_s64 (p0, x0, -9)) ++ ++/* ++** ldnf1sh_vnum_s64_x1: ++** cntw (x[0-9]+) ++** 
madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldnf1sh z0\.d, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_vnum_s64_x1, svint64_t, int16_t, ++ z0 = svldnf1sh_vnum_s64 (p0, x0, x1), ++ z0 = svldnf1sh_vnum_s64 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sh_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sh_u32.c +new file mode 100644 +index 000000000..2f7718730 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sh_u32.c +@@ -0,0 +1,154 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldnf1sh_u32_base: ++** ldnf1sh z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_u32_base, svuint32_t, int16_t, ++ z0 = svldnf1sh_u32 (p0, x0), ++ z0 = svldnf1sh_u32 (p0, x0)) ++ ++/* ++** ldnf1sh_u32_index: ++** add (x[0-9]+), x0, x1, lsl 1 ++** ldnf1sh z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_u32_index, svuint32_t, int16_t, ++ z0 = svldnf1sh_u32 (p0, x0 + x1), ++ z0 = svldnf1sh_u32 (p0, x0 + x1)) ++ ++/* ++** ldnf1sh_u32_1: ++** ldnf1sh z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_u32_1, svuint32_t, int16_t, ++ z0 = svldnf1sh_u32 (p0, x0 + svcntw ()), ++ z0 = svldnf1sh_u32 (p0, x0 + svcntw ())) ++ ++/* ++** ldnf1sh_u32_7: ++** ldnf1sh z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_u32_7, svuint32_t, int16_t, ++ z0 = svldnf1sh_u32 (p0, x0 + svcntw () * 7), ++ z0 = svldnf1sh_u32 (p0, x0 + svcntw () * 7)) ++ ++/* ++** ldnf1sh_u32_8: ++** incb x0, all, mul #4 ++** ldnf1sh z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_u32_8, svuint32_t, int16_t, ++ z0 = svldnf1sh_u32 (p0, x0 + svcntw () * 8), ++ z0 = svldnf1sh_u32 (p0, x0 + svcntw () * 8)) ++ ++/* ++** ldnf1sh_u32_m1: ++** ldnf1sh z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_u32_m1, svuint32_t, int16_t, ++ z0 = svldnf1sh_u32 (p0, x0 - svcntw ()), ++ z0 = svldnf1sh_u32 (p0, x0 - svcntw ())) ++ ++/* ++** ldnf1sh_u32_m8: ++** ldnf1sh z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_u32_m8, svuint32_t, int16_t, ++ z0 = svldnf1sh_u32 (p0, x0 - svcntw () * 8), ++ z0 = svldnf1sh_u32 (p0, x0 - svcntw () * 8)) ++ ++/* ++** ldnf1sh_u32_m9: ++** dech x0, all, mul #9 ++** ldnf1sh z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_u32_m9, svuint32_t, int16_t, ++ z0 = svldnf1sh_u32 (p0, x0 - svcntw () * 9), ++ z0 = svldnf1sh_u32 (p0, x0 - svcntw () * 9)) ++ ++/* ++** ldnf1sh_vnum_u32_0: ++** ldnf1sh z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_vnum_u32_0, svuint32_t, int16_t, ++ z0 = svldnf1sh_vnum_u32 (p0, x0, 0), ++ z0 = svldnf1sh_vnum_u32 (p0, x0, 0)) ++ ++/* ++** ldnf1sh_vnum_u32_1: ++** ldnf1sh z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_vnum_u32_1, svuint32_t, int16_t, ++ z0 = svldnf1sh_vnum_u32 (p0, x0, 1), ++ z0 = svldnf1sh_vnum_u32 (p0, x0, 1)) ++ ++/* ++** ldnf1sh_vnum_u32_7: ++** ldnf1sh z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_vnum_u32_7, svuint32_t, int16_t, ++ z0 = svldnf1sh_vnum_u32 (p0, x0, 7), ++ z0 = svldnf1sh_vnum_u32 (p0, x0, 7)) ++ ++/* ++** ldnf1sh_vnum_u32_8: ++** incb x0, all, mul #4 ++** ldnf1sh z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_vnum_u32_8, svuint32_t, int16_t, ++ z0 = svldnf1sh_vnum_u32 (p0, x0, 8), ++ z0 = svldnf1sh_vnum_u32 (p0, x0, 8)) ++ ++/* ++** ldnf1sh_vnum_u32_m1: ++** ldnf1sh z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_vnum_u32_m1, svuint32_t, int16_t, ++ z0 = 
svldnf1sh_vnum_u32 (p0, x0, -1), ++ z0 = svldnf1sh_vnum_u32 (p0, x0, -1)) ++ ++/* ++** ldnf1sh_vnum_u32_m8: ++** ldnf1sh z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_vnum_u32_m8, svuint32_t, int16_t, ++ z0 = svldnf1sh_vnum_u32 (p0, x0, -8), ++ z0 = svldnf1sh_vnum_u32 (p0, x0, -8)) ++ ++/* ++** ldnf1sh_vnum_u32_m9: ++** dech x0, all, mul #9 ++** ldnf1sh z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_vnum_u32_m9, svuint32_t, int16_t, ++ z0 = svldnf1sh_vnum_u32 (p0, x0, -9), ++ z0 = svldnf1sh_vnum_u32 (p0, x0, -9)) ++ ++/* ++** ldnf1sh_vnum_u32_x1: ++** cnth (x[0-9]+) ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldnf1sh z0\.s, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_vnum_u32_x1, svuint32_t, int16_t, ++ z0 = svldnf1sh_vnum_u32 (p0, x0, x1), ++ z0 = svldnf1sh_vnum_u32 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sh_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sh_u64.c +new file mode 100644 +index 000000000..d7f1a68a4 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sh_u64.c +@@ -0,0 +1,154 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldnf1sh_u64_base: ++** ldnf1sh z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_u64_base, svuint64_t, int16_t, ++ z0 = svldnf1sh_u64 (p0, x0), ++ z0 = svldnf1sh_u64 (p0, x0)) ++ ++/* ++** ldnf1sh_u64_index: ++** add (x[0-9]+), x0, x1, lsl 1 ++** ldnf1sh z0\.d, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_u64_index, svuint64_t, int16_t, ++ z0 = svldnf1sh_u64 (p0, x0 + x1), ++ z0 = svldnf1sh_u64 (p0, x0 + x1)) ++ ++/* ++** ldnf1sh_u64_1: ++** ldnf1sh z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_u64_1, svuint64_t, int16_t, ++ z0 = svldnf1sh_u64 (p0, x0 + svcntd ()), ++ z0 = svldnf1sh_u64 (p0, x0 + svcntd ())) ++ ++/* ++** ldnf1sh_u64_7: ++** ldnf1sh z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_u64_7, svuint64_t, int16_t, ++ z0 = svldnf1sh_u64 (p0, x0 + svcntd () * 7), ++ z0 = svldnf1sh_u64 (p0, x0 + svcntd () * 7)) ++ ++/* ++** ldnf1sh_u64_8: ++** incb x0, all, mul #2 ++** ldnf1sh z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_u64_8, svuint64_t, int16_t, ++ z0 = svldnf1sh_u64 (p0, x0 + svcntd () * 8), ++ z0 = svldnf1sh_u64 (p0, x0 + svcntd () * 8)) ++ ++/* ++** ldnf1sh_u64_m1: ++** ldnf1sh z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_u64_m1, svuint64_t, int16_t, ++ z0 = svldnf1sh_u64 (p0, x0 - svcntd ()), ++ z0 = svldnf1sh_u64 (p0, x0 - svcntd ())) ++ ++/* ++** ldnf1sh_u64_m8: ++** ldnf1sh z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_u64_m8, svuint64_t, int16_t, ++ z0 = svldnf1sh_u64 (p0, x0 - svcntd () * 8), ++ z0 = svldnf1sh_u64 (p0, x0 - svcntd () * 8)) ++ ++/* ++** ldnf1sh_u64_m9: ++** decw x0, all, mul #9 ++** ldnf1sh z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_u64_m9, svuint64_t, int16_t, ++ z0 = svldnf1sh_u64 (p0, x0 - svcntd () * 9), ++ z0 = svldnf1sh_u64 (p0, x0 - svcntd () * 9)) ++ ++/* ++** ldnf1sh_vnum_u64_0: ++** ldnf1sh z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_vnum_u64_0, svuint64_t, int16_t, ++ z0 = svldnf1sh_vnum_u64 (p0, x0, 0), ++ z0 = svldnf1sh_vnum_u64 (p0, x0, 0)) ++ ++/* ++** ldnf1sh_vnum_u64_1: ++** ldnf1sh z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_vnum_u64_1, svuint64_t, int16_t, ++ z0 = svldnf1sh_vnum_u64 (p0, x0, 1), ++ z0 = svldnf1sh_vnum_u64 (p0, x0, 1)) ++ ++/* ++** 
ldnf1sh_vnum_u64_7: ++** ldnf1sh z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_vnum_u64_7, svuint64_t, int16_t, ++ z0 = svldnf1sh_vnum_u64 (p0, x0, 7), ++ z0 = svldnf1sh_vnum_u64 (p0, x0, 7)) ++ ++/* ++** ldnf1sh_vnum_u64_8: ++** incb x0, all, mul #2 ++** ldnf1sh z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_vnum_u64_8, svuint64_t, int16_t, ++ z0 = svldnf1sh_vnum_u64 (p0, x0, 8), ++ z0 = svldnf1sh_vnum_u64 (p0, x0, 8)) ++ ++/* ++** ldnf1sh_vnum_u64_m1: ++** ldnf1sh z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_vnum_u64_m1, svuint64_t, int16_t, ++ z0 = svldnf1sh_vnum_u64 (p0, x0, -1), ++ z0 = svldnf1sh_vnum_u64 (p0, x0, -1)) ++ ++/* ++** ldnf1sh_vnum_u64_m8: ++** ldnf1sh z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_vnum_u64_m8, svuint64_t, int16_t, ++ z0 = svldnf1sh_vnum_u64 (p0, x0, -8), ++ z0 = svldnf1sh_vnum_u64 (p0, x0, -8)) ++ ++/* ++** ldnf1sh_vnum_u64_m9: ++** decw x0, all, mul #9 ++** ldnf1sh z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_vnum_u64_m9, svuint64_t, int16_t, ++ z0 = svldnf1sh_vnum_u64 (p0, x0, -9), ++ z0 = svldnf1sh_vnum_u64 (p0, x0, -9)) ++ ++/* ++** ldnf1sh_vnum_u64_x1: ++** cntw (x[0-9]+) ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldnf1sh z0\.d, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldnf1sh_vnum_u64_x1, svuint64_t, int16_t, ++ z0 = svldnf1sh_vnum_u64 (p0, x0, x1), ++ z0 = svldnf1sh_vnum_u64 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sw_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sw_s64.c +new file mode 100644 +index 000000000..5b483e4aa +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sw_s64.c +@@ -0,0 +1,154 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldnf1sw_s64_base: ++** ldnf1sw z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sw_s64_base, svint64_t, int32_t, ++ z0 = svldnf1sw_s64 (p0, x0), ++ z0 = svldnf1sw_s64 (p0, x0)) ++ ++/* ++** ldnf1sw_s64_index: ++** add (x[0-9]+), x0, x1, lsl 2 ++** ldnf1sw z0\.d, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ldnf1sw_s64_index, svint64_t, int32_t, ++ z0 = svldnf1sw_s64 (p0, x0 + x1), ++ z0 = svldnf1sw_s64 (p0, x0 + x1)) ++ ++/* ++** ldnf1sw_s64_1: ++** ldnf1sw z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sw_s64_1, svint64_t, int32_t, ++ z0 = svldnf1sw_s64 (p0, x0 + svcntd ()), ++ z0 = svldnf1sw_s64 (p0, x0 + svcntd ())) ++ ++/* ++** ldnf1sw_s64_7: ++** ldnf1sw z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sw_s64_7, svint64_t, int32_t, ++ z0 = svldnf1sw_s64 (p0, x0 + svcntd () * 7), ++ z0 = svldnf1sw_s64 (p0, x0 + svcntd () * 7)) ++ ++/* ++** ldnf1sw_s64_8: ++** incb x0, all, mul #4 ++** ldnf1sw z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sw_s64_8, svint64_t, int32_t, ++ z0 = svldnf1sw_s64 (p0, x0 + svcntd () * 8), ++ z0 = svldnf1sw_s64 (p0, x0 + svcntd () * 8)) ++ ++/* ++** ldnf1sw_s64_m1: ++** ldnf1sw z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sw_s64_m1, svint64_t, int32_t, ++ z0 = svldnf1sw_s64 (p0, x0 - svcntd ()), ++ z0 = svldnf1sw_s64 (p0, x0 - svcntd ())) ++ ++/* ++** ldnf1sw_s64_m8: ++** ldnf1sw z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sw_s64_m8, svint64_t, int32_t, ++ z0 = svldnf1sw_s64 (p0, x0 - svcntd () * 8), ++ z0 = svldnf1sw_s64 (p0, x0 - svcntd () * 8)) ++ ++/* ++** ldnf1sw_s64_m9: ++** dech x0, all, mul #9 ++** ldnf1sw z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sw_s64_m9, svint64_t, int32_t, ++ z0 = svldnf1sw_s64 (p0, x0 - svcntd () * 9), ++ z0 = svldnf1sw_s64 (p0, x0 - svcntd () * 9)) ++ ++/* ++** ldnf1sw_vnum_s64_0: ++** ldnf1sw z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sw_vnum_s64_0, svint64_t, int32_t, ++ z0 = svldnf1sw_vnum_s64 (p0, x0, 0), ++ z0 = svldnf1sw_vnum_s64 (p0, x0, 0)) ++ ++/* ++** ldnf1sw_vnum_s64_1: ++** ldnf1sw z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sw_vnum_s64_1, svint64_t, int32_t, ++ z0 = svldnf1sw_vnum_s64 (p0, x0, 1), ++ z0 = svldnf1sw_vnum_s64 (p0, x0, 1)) ++ ++/* ++** ldnf1sw_vnum_s64_7: ++** ldnf1sw z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sw_vnum_s64_7, svint64_t, int32_t, ++ z0 = svldnf1sw_vnum_s64 (p0, x0, 7), ++ z0 = svldnf1sw_vnum_s64 (p0, x0, 7)) ++ ++/* ++** ldnf1sw_vnum_s64_8: ++** incb x0, all, mul #4 ++** ldnf1sw z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sw_vnum_s64_8, svint64_t, int32_t, ++ z0 = svldnf1sw_vnum_s64 (p0, x0, 8), ++ z0 = svldnf1sw_vnum_s64 (p0, x0, 8)) ++ ++/* ++** ldnf1sw_vnum_s64_m1: ++** ldnf1sw z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sw_vnum_s64_m1, svint64_t, int32_t, ++ z0 = svldnf1sw_vnum_s64 (p0, x0, -1), ++ z0 = svldnf1sw_vnum_s64 (p0, x0, -1)) ++ ++/* ++** ldnf1sw_vnum_s64_m8: ++** ldnf1sw z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sw_vnum_s64_m8, svint64_t, int32_t, ++ z0 = svldnf1sw_vnum_s64 (p0, x0, -8), ++ z0 = svldnf1sw_vnum_s64 (p0, x0, -8)) ++ ++/* ++** ldnf1sw_vnum_s64_m9: ++** dech x0, all, mul #9 ++** ldnf1sw z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sw_vnum_s64_m9, svint64_t, int32_t, ++ z0 = svldnf1sw_vnum_s64 (p0, x0, -9), ++ z0 = svldnf1sw_vnum_s64 (p0, x0, -9)) ++ ++/* ++** ldnf1sw_vnum_s64_x1: ++** cnth (x[0-9]+) ++** 
madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldnf1sw z0\.d, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldnf1sw_vnum_s64_x1, svint64_t, int32_t, ++ z0 = svldnf1sw_vnum_s64 (p0, x0, x1), ++ z0 = svldnf1sw_vnum_s64 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sw_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sw_u64.c +new file mode 100644 +index 000000000..62121ce0a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sw_u64.c +@@ -0,0 +1,154 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldnf1sw_u64_base: ++** ldnf1sw z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sw_u64_base, svuint64_t, int32_t, ++ z0 = svldnf1sw_u64 (p0, x0), ++ z0 = svldnf1sw_u64 (p0, x0)) ++ ++/* ++** ldnf1sw_u64_index: ++** add (x[0-9]+), x0, x1, lsl 2 ++** ldnf1sw z0\.d, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ldnf1sw_u64_index, svuint64_t, int32_t, ++ z0 = svldnf1sw_u64 (p0, x0 + x1), ++ z0 = svldnf1sw_u64 (p0, x0 + x1)) ++ ++/* ++** ldnf1sw_u64_1: ++** ldnf1sw z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sw_u64_1, svuint64_t, int32_t, ++ z0 = svldnf1sw_u64 (p0, x0 + svcntd ()), ++ z0 = svldnf1sw_u64 (p0, x0 + svcntd ())) ++ ++/* ++** ldnf1sw_u64_7: ++** ldnf1sw z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sw_u64_7, svuint64_t, int32_t, ++ z0 = svldnf1sw_u64 (p0, x0 + svcntd () * 7), ++ z0 = svldnf1sw_u64 (p0, x0 + svcntd () * 7)) ++ ++/* ++** ldnf1sw_u64_8: ++** incb x0, all, mul #4 ++** ldnf1sw z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sw_u64_8, svuint64_t, int32_t, ++ z0 = svldnf1sw_u64 (p0, x0 + svcntd () * 8), ++ z0 = svldnf1sw_u64 (p0, x0 + svcntd () * 8)) ++ ++/* ++** ldnf1sw_u64_m1: ++** ldnf1sw z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sw_u64_m1, svuint64_t, int32_t, ++ z0 = svldnf1sw_u64 (p0, x0 - svcntd ()), ++ z0 = svldnf1sw_u64 (p0, x0 - svcntd ())) ++ ++/* ++** ldnf1sw_u64_m8: ++** ldnf1sw z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sw_u64_m8, svuint64_t, int32_t, ++ z0 = svldnf1sw_u64 (p0, x0 - svcntd () * 8), ++ z0 = svldnf1sw_u64 (p0, x0 - svcntd () * 8)) ++ ++/* ++** ldnf1sw_u64_m9: ++** dech x0, all, mul #9 ++** ldnf1sw z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sw_u64_m9, svuint64_t, int32_t, ++ z0 = svldnf1sw_u64 (p0, x0 - svcntd () * 9), ++ z0 = svldnf1sw_u64 (p0, x0 - svcntd () * 9)) ++ ++/* ++** ldnf1sw_vnum_u64_0: ++** ldnf1sw z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sw_vnum_u64_0, svuint64_t, int32_t, ++ z0 = svldnf1sw_vnum_u64 (p0, x0, 0), ++ z0 = svldnf1sw_vnum_u64 (p0, x0, 0)) ++ ++/* ++** ldnf1sw_vnum_u64_1: ++** ldnf1sw z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sw_vnum_u64_1, svuint64_t, int32_t, ++ z0 = svldnf1sw_vnum_u64 (p0, x0, 1), ++ z0 = svldnf1sw_vnum_u64 (p0, x0, 1)) ++ ++/* ++** ldnf1sw_vnum_u64_7: ++** ldnf1sw z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sw_vnum_u64_7, svuint64_t, int32_t, ++ z0 = svldnf1sw_vnum_u64 (p0, x0, 7), ++ z0 = svldnf1sw_vnum_u64 (p0, x0, 7)) ++ ++/* ++** ldnf1sw_vnum_u64_8: ++** incb x0, all, mul #4 ++** ldnf1sw z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sw_vnum_u64_8, svuint64_t, int32_t, ++ z0 = svldnf1sw_vnum_u64 (p0, x0, 8), ++ z0 = svldnf1sw_vnum_u64 (p0, x0, 8)) ++ ++/* ++** ldnf1sw_vnum_u64_m1: ++** ldnf1sw z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sw_vnum_u64_m1, svuint64_t, int32_t, ++ z0 = 
svldnf1sw_vnum_u64 (p0, x0, -1), ++ z0 = svldnf1sw_vnum_u64 (p0, x0, -1)) ++ ++/* ++** ldnf1sw_vnum_u64_m8: ++** ldnf1sw z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1sw_vnum_u64_m8, svuint64_t, int32_t, ++ z0 = svldnf1sw_vnum_u64 (p0, x0, -8), ++ z0 = svldnf1sw_vnum_u64 (p0, x0, -8)) ++ ++/* ++** ldnf1sw_vnum_u64_m9: ++** dech x0, all, mul #9 ++** ldnf1sw z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1sw_vnum_u64_m9, svuint64_t, int32_t, ++ z0 = svldnf1sw_vnum_u64 (p0, x0, -9), ++ z0 = svldnf1sw_vnum_u64 (p0, x0, -9)) ++ ++/* ++** ldnf1sw_vnum_u64_x1: ++** cnth (x[0-9]+) ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldnf1sw z0\.d, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldnf1sw_vnum_u64_x1, svuint64_t, int32_t, ++ z0 = svldnf1sw_vnum_u64 (p0, x0, x1), ++ z0 = svldnf1sw_vnum_u64 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1ub_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1ub_s16.c +new file mode 100644 +index 000000000..8fe13411f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1ub_s16.c +@@ -0,0 +1,154 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldnf1ub_s16_base: ++** ldnf1b z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_s16_base, svint16_t, uint8_t, ++ z0 = svldnf1ub_s16 (p0, x0), ++ z0 = svldnf1ub_s16 (p0, x0)) ++ ++/* ++** ldnf1ub_s16_index: ++** add (x[0-9]+), x0, x1 ++** ldnf1b z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_s16_index, svint16_t, uint8_t, ++ z0 = svldnf1ub_s16 (p0, x0 + x1), ++ z0 = svldnf1ub_s16 (p0, x0 + x1)) ++ ++/* ++** ldnf1ub_s16_1: ++** ldnf1b z0\.h, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_s16_1, svint16_t, uint8_t, ++ z0 = svldnf1ub_s16 (p0, x0 + svcnth ()), ++ z0 = svldnf1ub_s16 (p0, x0 + svcnth ())) ++ ++/* ++** ldnf1ub_s16_7: ++** ldnf1b z0\.h, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_s16_7, svint16_t, uint8_t, ++ z0 = svldnf1ub_s16 (p0, x0 + svcnth () * 7), ++ z0 = svldnf1ub_s16 (p0, x0 + svcnth () * 7)) ++ ++/* ++** ldnf1ub_s16_8: ++** incb x0, all, mul #4 ++** ldnf1b z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_s16_8, svint16_t, uint8_t, ++ z0 = svldnf1ub_s16 (p0, x0 + svcnth () * 8), ++ z0 = svldnf1ub_s16 (p0, x0 + svcnth () * 8)) ++ ++/* ++** ldnf1ub_s16_m1: ++** ldnf1b z0\.h, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_s16_m1, svint16_t, uint8_t, ++ z0 = svldnf1ub_s16 (p0, x0 - svcnth ()), ++ z0 = svldnf1ub_s16 (p0, x0 - svcnth ())) ++ ++/* ++** ldnf1ub_s16_m8: ++** ldnf1b z0\.h, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_s16_m8, svint16_t, uint8_t, ++ z0 = svldnf1ub_s16 (p0, x0 - svcnth () * 8), ++ z0 = svldnf1ub_s16 (p0, x0 - svcnth () * 8)) ++ ++/* ++** ldnf1ub_s16_m9: ++** dech x0, all, mul #9 ++** ldnf1b z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_s16_m9, svint16_t, uint8_t, ++ z0 = svldnf1ub_s16 (p0, x0 - svcnth () * 9), ++ z0 = svldnf1ub_s16 (p0, x0 - svcnth () * 9)) ++ ++/* ++** ldnf1ub_vnum_s16_0: ++** ldnf1b z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_vnum_s16_0, svint16_t, uint8_t, ++ z0 = svldnf1ub_vnum_s16 (p0, x0, 0), ++ z0 = svldnf1ub_vnum_s16 (p0, x0, 0)) ++ ++/* ++** ldnf1ub_vnum_s16_1: ++** ldnf1b z0\.h, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_vnum_s16_1, svint16_t, uint8_t, ++ z0 = svldnf1ub_vnum_s16 (p0, x0, 1), ++ z0 = svldnf1ub_vnum_s16 (p0, x0, 1)) ++ ++/* ++** ldnf1ub_vnum_s16_7: ++** ldnf1b z0\.h, 
p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_vnum_s16_7, svint16_t, uint8_t, ++ z0 = svldnf1ub_vnum_s16 (p0, x0, 7), ++ z0 = svldnf1ub_vnum_s16 (p0, x0, 7)) ++ ++/* ++** ldnf1ub_vnum_s16_8: ++** incb x0, all, mul #4 ++** ldnf1b z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_vnum_s16_8, svint16_t, uint8_t, ++ z0 = svldnf1ub_vnum_s16 (p0, x0, 8), ++ z0 = svldnf1ub_vnum_s16 (p0, x0, 8)) ++ ++/* ++** ldnf1ub_vnum_s16_m1: ++** ldnf1b z0\.h, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_vnum_s16_m1, svint16_t, uint8_t, ++ z0 = svldnf1ub_vnum_s16 (p0, x0, -1), ++ z0 = svldnf1ub_vnum_s16 (p0, x0, -1)) ++ ++/* ++** ldnf1ub_vnum_s16_m8: ++** ldnf1b z0\.h, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_vnum_s16_m8, svint16_t, uint8_t, ++ z0 = svldnf1ub_vnum_s16 (p0, x0, -8), ++ z0 = svldnf1ub_vnum_s16 (p0, x0, -8)) ++ ++/* ++** ldnf1ub_vnum_s16_m9: ++** dech x0, all, mul #9 ++** ldnf1b z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_vnum_s16_m9, svint16_t, uint8_t, ++ z0 = svldnf1ub_vnum_s16 (p0, x0, -9), ++ z0 = svldnf1ub_vnum_s16 (p0, x0, -9)) ++ ++/* ++** ldnf1ub_vnum_s16_x1: ++** cnth (x[0-9]+) ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldnf1b z0\.h, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_vnum_s16_x1, svint16_t, uint8_t, ++ z0 = svldnf1ub_vnum_s16 (p0, x0, x1), ++ z0 = svldnf1ub_vnum_s16 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1ub_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1ub_s32.c +new file mode 100644 +index 000000000..50122e3b7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1ub_s32.c +@@ -0,0 +1,154 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldnf1ub_s32_base: ++** ldnf1b z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_s32_base, svint32_t, uint8_t, ++ z0 = svldnf1ub_s32 (p0, x0), ++ z0 = svldnf1ub_s32 (p0, x0)) ++ ++/* ++** ldnf1ub_s32_index: ++** add (x[0-9]+), x0, x1 ++** ldnf1b z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_s32_index, svint32_t, uint8_t, ++ z0 = svldnf1ub_s32 (p0, x0 + x1), ++ z0 = svldnf1ub_s32 (p0, x0 + x1)) ++ ++/* ++** ldnf1ub_s32_1: ++** ldnf1b z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_s32_1, svint32_t, uint8_t, ++ z0 = svldnf1ub_s32 (p0, x0 + svcntw ()), ++ z0 = svldnf1ub_s32 (p0, x0 + svcntw ())) ++ ++/* ++** ldnf1ub_s32_7: ++** ldnf1b z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_s32_7, svint32_t, uint8_t, ++ z0 = svldnf1ub_s32 (p0, x0 + svcntw () * 7), ++ z0 = svldnf1ub_s32 (p0, x0 + svcntw () * 7)) ++ ++/* ++** ldnf1ub_s32_8: ++** incb x0, all, mul #2 ++** ldnf1b z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_s32_8, svint32_t, uint8_t, ++ z0 = svldnf1ub_s32 (p0, x0 + svcntw () * 8), ++ z0 = svldnf1ub_s32 (p0, x0 + svcntw () * 8)) ++ ++/* ++** ldnf1ub_s32_m1: ++** ldnf1b z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_s32_m1, svint32_t, uint8_t, ++ z0 = svldnf1ub_s32 (p0, x0 - svcntw ()), ++ z0 = svldnf1ub_s32 (p0, x0 - svcntw ())) ++ ++/* ++** ldnf1ub_s32_m8: ++** ldnf1b z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_s32_m8, svint32_t, uint8_t, ++ z0 = svldnf1ub_s32 (p0, x0 - svcntw () * 8), ++ z0 = svldnf1ub_s32 (p0, x0 - svcntw () * 8)) ++ ++/* ++** ldnf1ub_s32_m9: ++** decw x0, all, mul #9 ++** ldnf1b z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_s32_m9, svint32_t, uint8_t, ++ z0 = svldnf1ub_s32 (p0, 
x0 - svcntw () * 9), ++ z0 = svldnf1ub_s32 (p0, x0 - svcntw () * 9)) ++ ++/* ++** ldnf1ub_vnum_s32_0: ++** ldnf1b z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_vnum_s32_0, svint32_t, uint8_t, ++ z0 = svldnf1ub_vnum_s32 (p0, x0, 0), ++ z0 = svldnf1ub_vnum_s32 (p0, x0, 0)) ++ ++/* ++** ldnf1ub_vnum_s32_1: ++** ldnf1b z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_vnum_s32_1, svint32_t, uint8_t, ++ z0 = svldnf1ub_vnum_s32 (p0, x0, 1), ++ z0 = svldnf1ub_vnum_s32 (p0, x0, 1)) ++ ++/* ++** ldnf1ub_vnum_s32_7: ++** ldnf1b z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_vnum_s32_7, svint32_t, uint8_t, ++ z0 = svldnf1ub_vnum_s32 (p0, x0, 7), ++ z0 = svldnf1ub_vnum_s32 (p0, x0, 7)) ++ ++/* ++** ldnf1ub_vnum_s32_8: ++** incb x0, all, mul #2 ++** ldnf1b z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_vnum_s32_8, svint32_t, uint8_t, ++ z0 = svldnf1ub_vnum_s32 (p0, x0, 8), ++ z0 = svldnf1ub_vnum_s32 (p0, x0, 8)) ++ ++/* ++** ldnf1ub_vnum_s32_m1: ++** ldnf1b z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_vnum_s32_m1, svint32_t, uint8_t, ++ z0 = svldnf1ub_vnum_s32 (p0, x0, -1), ++ z0 = svldnf1ub_vnum_s32 (p0, x0, -1)) ++ ++/* ++** ldnf1ub_vnum_s32_m8: ++** ldnf1b z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_vnum_s32_m8, svint32_t, uint8_t, ++ z0 = svldnf1ub_vnum_s32 (p0, x0, -8), ++ z0 = svldnf1ub_vnum_s32 (p0, x0, -8)) ++ ++/* ++** ldnf1ub_vnum_s32_m9: ++** decw x0, all, mul #9 ++** ldnf1b z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_vnum_s32_m9, svint32_t, uint8_t, ++ z0 = svldnf1ub_vnum_s32 (p0, x0, -9), ++ z0 = svldnf1ub_vnum_s32 (p0, x0, -9)) ++ ++/* ++** ldnf1ub_vnum_s32_x1: ++** cntw (x[0-9]+) ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldnf1b z0\.s, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_vnum_s32_x1, svint32_t, uint8_t, ++ z0 = svldnf1ub_vnum_s32 (p0, x0, x1), ++ z0 = svldnf1ub_vnum_s32 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1ub_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1ub_s64.c +new file mode 100644 +index 000000000..d7cce11b6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1ub_s64.c +@@ -0,0 +1,154 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldnf1ub_s64_base: ++** ldnf1b z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_s64_base, svint64_t, uint8_t, ++ z0 = svldnf1ub_s64 (p0, x0), ++ z0 = svldnf1ub_s64 (p0, x0)) ++ ++/* ++** ldnf1ub_s64_index: ++** add (x[0-9]+), x0, x1 ++** ldnf1b z0\.d, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_s64_index, svint64_t, uint8_t, ++ z0 = svldnf1ub_s64 (p0, x0 + x1), ++ z0 = svldnf1ub_s64 (p0, x0 + x1)) ++ ++/* ++** ldnf1ub_s64_1: ++** ldnf1b z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_s64_1, svint64_t, uint8_t, ++ z0 = svldnf1ub_s64 (p0, x0 + svcntd ()), ++ z0 = svldnf1ub_s64 (p0, x0 + svcntd ())) ++ ++/* ++** ldnf1ub_s64_7: ++** ldnf1b z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_s64_7, svint64_t, uint8_t, ++ z0 = svldnf1ub_s64 (p0, x0 + svcntd () * 7), ++ z0 = svldnf1ub_s64 (p0, x0 + svcntd () * 7)) ++ ++/* ++** ldnf1ub_s64_8: ++** incb x0 ++** ldnf1b z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_s64_8, svint64_t, uint8_t, ++ z0 = svldnf1ub_s64 (p0, x0 + svcntd () * 8), ++ z0 = svldnf1ub_s64 (p0, x0 + svcntd () * 8)) ++ ++/* ++** ldnf1ub_s64_m1: ++** ldnf1b z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_s64_m1, svint64_t, uint8_t, ++ z0 = svldnf1ub_s64 (p0, x0 - svcntd ()), ++ z0 = svldnf1ub_s64 (p0, x0 - svcntd ())) ++ ++/* ++** ldnf1ub_s64_m8: ++** ldnf1b z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_s64_m8, svint64_t, uint8_t, ++ z0 = svldnf1ub_s64 (p0, x0 - svcntd () * 8), ++ z0 = svldnf1ub_s64 (p0, x0 - svcntd () * 8)) ++ ++/* ++** ldnf1ub_s64_m9: ++** decd x0, all, mul #9 ++** ldnf1b z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_s64_m9, svint64_t, uint8_t, ++ z0 = svldnf1ub_s64 (p0, x0 - svcntd () * 9), ++ z0 = svldnf1ub_s64 (p0, x0 - svcntd () * 9)) ++ ++/* ++** ldnf1ub_vnum_s64_0: ++** ldnf1b z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_vnum_s64_0, svint64_t, uint8_t, ++ z0 = svldnf1ub_vnum_s64 (p0, x0, 0), ++ z0 = svldnf1ub_vnum_s64 (p0, x0, 0)) ++ ++/* ++** ldnf1ub_vnum_s64_1: ++** ldnf1b z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_vnum_s64_1, svint64_t, uint8_t, ++ z0 = svldnf1ub_vnum_s64 (p0, x0, 1), ++ z0 = svldnf1ub_vnum_s64 (p0, x0, 1)) ++ ++/* ++** ldnf1ub_vnum_s64_7: ++** ldnf1b z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_vnum_s64_7, svint64_t, uint8_t, ++ z0 = svldnf1ub_vnum_s64 (p0, x0, 7), ++ z0 = svldnf1ub_vnum_s64 (p0, x0, 7)) ++ ++/* ++** ldnf1ub_vnum_s64_8: ++** incb x0 ++** ldnf1b z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_vnum_s64_8, svint64_t, uint8_t, ++ z0 = svldnf1ub_vnum_s64 (p0, x0, 8), ++ z0 = svldnf1ub_vnum_s64 (p0, x0, 8)) ++ ++/* ++** ldnf1ub_vnum_s64_m1: ++** ldnf1b z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_vnum_s64_m1, svint64_t, uint8_t, ++ z0 = svldnf1ub_vnum_s64 (p0, x0, -1), ++ z0 = svldnf1ub_vnum_s64 (p0, x0, -1)) ++ ++/* ++** ldnf1ub_vnum_s64_m8: ++** ldnf1b z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_vnum_s64_m8, svint64_t, uint8_t, ++ z0 = svldnf1ub_vnum_s64 (p0, x0, -8), ++ z0 = svldnf1ub_vnum_s64 (p0, x0, -8)) ++ ++/* ++** ldnf1ub_vnum_s64_m9: ++** decd x0, all, mul #9 ++** ldnf1b z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_vnum_s64_m9, svint64_t, uint8_t, ++ z0 = svldnf1ub_vnum_s64 (p0, x0, -9), ++ z0 = svldnf1ub_vnum_s64 (p0, x0, -9)) ++ ++/* ++** ldnf1ub_vnum_s64_x1: ++** cntd (x[0-9]+) ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldnf1b 
z0\.d, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_vnum_s64_x1, svint64_t, uint8_t, ++ z0 = svldnf1ub_vnum_s64 (p0, x0, x1), ++ z0 = svldnf1ub_vnum_s64 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1ub_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1ub_u16.c +new file mode 100644 +index 000000000..7bf82c3b6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1ub_u16.c +@@ -0,0 +1,154 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldnf1ub_u16_base: ++** ldnf1b z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_u16_base, svuint16_t, uint8_t, ++ z0 = svldnf1ub_u16 (p0, x0), ++ z0 = svldnf1ub_u16 (p0, x0)) ++ ++/* ++** ldnf1ub_u16_index: ++** add (x[0-9]+), x0, x1 ++** ldnf1b z0\.h, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_u16_index, svuint16_t, uint8_t, ++ z0 = svldnf1ub_u16 (p0, x0 + x1), ++ z0 = svldnf1ub_u16 (p0, x0 + x1)) ++ ++/* ++** ldnf1ub_u16_1: ++** ldnf1b z0\.h, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_u16_1, svuint16_t, uint8_t, ++ z0 = svldnf1ub_u16 (p0, x0 + svcnth ()), ++ z0 = svldnf1ub_u16 (p0, x0 + svcnth ())) ++ ++/* ++** ldnf1ub_u16_7: ++** ldnf1b z0\.h, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_u16_7, svuint16_t, uint8_t, ++ z0 = svldnf1ub_u16 (p0, x0 + svcnth () * 7), ++ z0 = svldnf1ub_u16 (p0, x0 + svcnth () * 7)) ++ ++/* ++** ldnf1ub_u16_8: ++** incb x0, all, mul #4 ++** ldnf1b z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_u16_8, svuint16_t, uint8_t, ++ z0 = svldnf1ub_u16 (p0, x0 + svcnth () * 8), ++ z0 = svldnf1ub_u16 (p0, x0 + svcnth () * 8)) ++ ++/* ++** ldnf1ub_u16_m1: ++** ldnf1b z0\.h, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_u16_m1, svuint16_t, uint8_t, ++ z0 = svldnf1ub_u16 (p0, x0 - svcnth ()), ++ z0 = svldnf1ub_u16 (p0, x0 - svcnth ())) ++ ++/* ++** ldnf1ub_u16_m8: ++** ldnf1b z0\.h, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_u16_m8, svuint16_t, uint8_t, ++ z0 = svldnf1ub_u16 (p0, x0 - svcnth () * 8), ++ z0 = svldnf1ub_u16 (p0, x0 - svcnth () * 8)) ++ ++/* ++** ldnf1ub_u16_m9: ++** dech x0, all, mul #9 ++** ldnf1b z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_u16_m9, svuint16_t, uint8_t, ++ z0 = svldnf1ub_u16 (p0, x0 - svcnth () * 9), ++ z0 = svldnf1ub_u16 (p0, x0 - svcnth () * 9)) ++ ++/* ++** ldnf1ub_vnum_u16_0: ++** ldnf1b z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_vnum_u16_0, svuint16_t, uint8_t, ++ z0 = svldnf1ub_vnum_u16 (p0, x0, 0), ++ z0 = svldnf1ub_vnum_u16 (p0, x0, 0)) ++ ++/* ++** ldnf1ub_vnum_u16_1: ++** ldnf1b z0\.h, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_vnum_u16_1, svuint16_t, uint8_t, ++ z0 = svldnf1ub_vnum_u16 (p0, x0, 1), ++ z0 = svldnf1ub_vnum_u16 (p0, x0, 1)) ++ ++/* ++** ldnf1ub_vnum_u16_7: ++** ldnf1b z0\.h, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_vnum_u16_7, svuint16_t, uint8_t, ++ z0 = svldnf1ub_vnum_u16 (p0, x0, 7), ++ z0 = svldnf1ub_vnum_u16 (p0, x0, 7)) ++ ++/* ++** ldnf1ub_vnum_u16_8: ++** incb x0, all, mul #4 ++** ldnf1b z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_vnum_u16_8, svuint16_t, uint8_t, ++ z0 = svldnf1ub_vnum_u16 (p0, x0, 8), ++ z0 = svldnf1ub_vnum_u16 (p0, x0, 8)) ++ ++/* ++** ldnf1ub_vnum_u16_m1: ++** ldnf1b z0\.h, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_vnum_u16_m1, svuint16_t, uint8_t, ++ z0 = svldnf1ub_vnum_u16 (p0, x0, -1), ++ z0 = svldnf1ub_vnum_u16 (p0, x0, -1)) ++ ++/* 
++** ldnf1ub_vnum_u16_m8: ++** ldnf1b z0\.h, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_vnum_u16_m8, svuint16_t, uint8_t, ++ z0 = svldnf1ub_vnum_u16 (p0, x0, -8), ++ z0 = svldnf1ub_vnum_u16 (p0, x0, -8)) ++ ++/* ++** ldnf1ub_vnum_u16_m9: ++** dech x0, all, mul #9 ++** ldnf1b z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_vnum_u16_m9, svuint16_t, uint8_t, ++ z0 = svldnf1ub_vnum_u16 (p0, x0, -9), ++ z0 = svldnf1ub_vnum_u16 (p0, x0, -9)) ++ ++/* ++** ldnf1ub_vnum_u16_x1: ++** cnth (x[0-9]+) ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldnf1b z0\.h, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_vnum_u16_x1, svuint16_t, uint8_t, ++ z0 = svldnf1ub_vnum_u16 (p0, x0, x1), ++ z0 = svldnf1ub_vnum_u16 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1ub_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1ub_u32.c +new file mode 100644 +index 000000000..e2fef064b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1ub_u32.c +@@ -0,0 +1,154 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldnf1ub_u32_base: ++** ldnf1b z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_u32_base, svuint32_t, uint8_t, ++ z0 = svldnf1ub_u32 (p0, x0), ++ z0 = svldnf1ub_u32 (p0, x0)) ++ ++/* ++** ldnf1ub_u32_index: ++** add (x[0-9]+), x0, x1 ++** ldnf1b z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_u32_index, svuint32_t, uint8_t, ++ z0 = svldnf1ub_u32 (p0, x0 + x1), ++ z0 = svldnf1ub_u32 (p0, x0 + x1)) ++ ++/* ++** ldnf1ub_u32_1: ++** ldnf1b z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_u32_1, svuint32_t, uint8_t, ++ z0 = svldnf1ub_u32 (p0, x0 + svcntw ()), ++ z0 = svldnf1ub_u32 (p0, x0 + svcntw ())) ++ ++/* ++** ldnf1ub_u32_7: ++** ldnf1b z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_u32_7, svuint32_t, uint8_t, ++ z0 = svldnf1ub_u32 (p0, x0 + svcntw () * 7), ++ z0 = svldnf1ub_u32 (p0, x0 + svcntw () * 7)) ++ ++/* ++** ldnf1ub_u32_8: ++** incb x0, all, mul #2 ++** ldnf1b z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_u32_8, svuint32_t, uint8_t, ++ z0 = svldnf1ub_u32 (p0, x0 + svcntw () * 8), ++ z0 = svldnf1ub_u32 (p0, x0 + svcntw () * 8)) ++ ++/* ++** ldnf1ub_u32_m1: ++** ldnf1b z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_u32_m1, svuint32_t, uint8_t, ++ z0 = svldnf1ub_u32 (p0, x0 - svcntw ()), ++ z0 = svldnf1ub_u32 (p0, x0 - svcntw ())) ++ ++/* ++** ldnf1ub_u32_m8: ++** ldnf1b z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_u32_m8, svuint32_t, uint8_t, ++ z0 = svldnf1ub_u32 (p0, x0 - svcntw () * 8), ++ z0 = svldnf1ub_u32 (p0, x0 - svcntw () * 8)) ++ ++/* ++** ldnf1ub_u32_m9: ++** decw x0, all, mul #9 ++** ldnf1b z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_u32_m9, svuint32_t, uint8_t, ++ z0 = svldnf1ub_u32 (p0, x0 - svcntw () * 9), ++ z0 = svldnf1ub_u32 (p0, x0 - svcntw () * 9)) ++ ++/* ++** ldnf1ub_vnum_u32_0: ++** ldnf1b z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_vnum_u32_0, svuint32_t, uint8_t, ++ z0 = svldnf1ub_vnum_u32 (p0, x0, 0), ++ z0 = svldnf1ub_vnum_u32 (p0, x0, 0)) ++ ++/* ++** ldnf1ub_vnum_u32_1: ++** ldnf1b z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_vnum_u32_1, svuint32_t, uint8_t, ++ z0 = svldnf1ub_vnum_u32 (p0, x0, 1), ++ z0 = svldnf1ub_vnum_u32 (p0, x0, 1)) ++ ++/* ++** ldnf1ub_vnum_u32_7: ++** ldnf1b z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_vnum_u32_7, 
svuint32_t, uint8_t, ++ z0 = svldnf1ub_vnum_u32 (p0, x0, 7), ++ z0 = svldnf1ub_vnum_u32 (p0, x0, 7)) ++ ++/* ++** ldnf1ub_vnum_u32_8: ++** incb x0, all, mul #2 ++** ldnf1b z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_vnum_u32_8, svuint32_t, uint8_t, ++ z0 = svldnf1ub_vnum_u32 (p0, x0, 8), ++ z0 = svldnf1ub_vnum_u32 (p0, x0, 8)) ++ ++/* ++** ldnf1ub_vnum_u32_m1: ++** ldnf1b z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_vnum_u32_m1, svuint32_t, uint8_t, ++ z0 = svldnf1ub_vnum_u32 (p0, x0, -1), ++ z0 = svldnf1ub_vnum_u32 (p0, x0, -1)) ++ ++/* ++** ldnf1ub_vnum_u32_m8: ++** ldnf1b z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_vnum_u32_m8, svuint32_t, uint8_t, ++ z0 = svldnf1ub_vnum_u32 (p0, x0, -8), ++ z0 = svldnf1ub_vnum_u32 (p0, x0, -8)) ++ ++/* ++** ldnf1ub_vnum_u32_m9: ++** decw x0, all, mul #9 ++** ldnf1b z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_vnum_u32_m9, svuint32_t, uint8_t, ++ z0 = svldnf1ub_vnum_u32 (p0, x0, -9), ++ z0 = svldnf1ub_vnum_u32 (p0, x0, -9)) ++ ++/* ++** ldnf1ub_vnum_u32_x1: ++** cntw (x[0-9]+) ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldnf1b z0\.s, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_vnum_u32_x1, svuint32_t, uint8_t, ++ z0 = svldnf1ub_vnum_u32 (p0, x0, x1), ++ z0 = svldnf1ub_vnum_u32 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1ub_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1ub_u64.c +new file mode 100644 +index 000000000..57c61e122 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1ub_u64.c +@@ -0,0 +1,154 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldnf1ub_u64_base: ++** ldnf1b z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_u64_base, svuint64_t, uint8_t, ++ z0 = svldnf1ub_u64 (p0, x0), ++ z0 = svldnf1ub_u64 (p0, x0)) ++ ++/* ++** ldnf1ub_u64_index: ++** add (x[0-9]+), x0, x1 ++** ldnf1b z0\.d, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_u64_index, svuint64_t, uint8_t, ++ z0 = svldnf1ub_u64 (p0, x0 + x1), ++ z0 = svldnf1ub_u64 (p0, x0 + x1)) ++ ++/* ++** ldnf1ub_u64_1: ++** ldnf1b z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_u64_1, svuint64_t, uint8_t, ++ z0 = svldnf1ub_u64 (p0, x0 + svcntd ()), ++ z0 = svldnf1ub_u64 (p0, x0 + svcntd ())) ++ ++/* ++** ldnf1ub_u64_7: ++** ldnf1b z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_u64_7, svuint64_t, uint8_t, ++ z0 = svldnf1ub_u64 (p0, x0 + svcntd () * 7), ++ z0 = svldnf1ub_u64 (p0, x0 + svcntd () * 7)) ++ ++/* ++** ldnf1ub_u64_8: ++** incb x0 ++** ldnf1b z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_u64_8, svuint64_t, uint8_t, ++ z0 = svldnf1ub_u64 (p0, x0 + svcntd () * 8), ++ z0 = svldnf1ub_u64 (p0, x0 + svcntd () * 8)) ++ ++/* ++** ldnf1ub_u64_m1: ++** ldnf1b z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_u64_m1, svuint64_t, uint8_t, ++ z0 = svldnf1ub_u64 (p0, x0 - svcntd ()), ++ z0 = svldnf1ub_u64 (p0, x0 - svcntd ())) ++ ++/* ++** ldnf1ub_u64_m8: ++** ldnf1b z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_u64_m8, svuint64_t, uint8_t, ++ z0 = svldnf1ub_u64 (p0, x0 - svcntd () * 8), ++ z0 = svldnf1ub_u64 (p0, x0 - svcntd () * 8)) ++ ++/* ++** ldnf1ub_u64_m9: ++** decd x0, all, mul #9 ++** ldnf1b z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_u64_m9, svuint64_t, uint8_t, ++ z0 = svldnf1ub_u64 (p0, x0 - svcntd () * 9), ++ z0 = svldnf1ub_u64 (p0, x0 - svcntd () * 9)) ++ 
++/* ++** ldnf1ub_vnum_u64_0: ++** ldnf1b z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_vnum_u64_0, svuint64_t, uint8_t, ++ z0 = svldnf1ub_vnum_u64 (p0, x0, 0), ++ z0 = svldnf1ub_vnum_u64 (p0, x0, 0)) ++ ++/* ++** ldnf1ub_vnum_u64_1: ++** ldnf1b z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_vnum_u64_1, svuint64_t, uint8_t, ++ z0 = svldnf1ub_vnum_u64 (p0, x0, 1), ++ z0 = svldnf1ub_vnum_u64 (p0, x0, 1)) ++ ++/* ++** ldnf1ub_vnum_u64_7: ++** ldnf1b z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_vnum_u64_7, svuint64_t, uint8_t, ++ z0 = svldnf1ub_vnum_u64 (p0, x0, 7), ++ z0 = svldnf1ub_vnum_u64 (p0, x0, 7)) ++ ++/* ++** ldnf1ub_vnum_u64_8: ++** incb x0 ++** ldnf1b z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_vnum_u64_8, svuint64_t, uint8_t, ++ z0 = svldnf1ub_vnum_u64 (p0, x0, 8), ++ z0 = svldnf1ub_vnum_u64 (p0, x0, 8)) ++ ++/* ++** ldnf1ub_vnum_u64_m1: ++** ldnf1b z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_vnum_u64_m1, svuint64_t, uint8_t, ++ z0 = svldnf1ub_vnum_u64 (p0, x0, -1), ++ z0 = svldnf1ub_vnum_u64 (p0, x0, -1)) ++ ++/* ++** ldnf1ub_vnum_u64_m8: ++** ldnf1b z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_vnum_u64_m8, svuint64_t, uint8_t, ++ z0 = svldnf1ub_vnum_u64 (p0, x0, -8), ++ z0 = svldnf1ub_vnum_u64 (p0, x0, -8)) ++ ++/* ++** ldnf1ub_vnum_u64_m9: ++** decd x0, all, mul #9 ++** ldnf1b z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_vnum_u64_m9, svuint64_t, uint8_t, ++ z0 = svldnf1ub_vnum_u64 (p0, x0, -9), ++ z0 = svldnf1ub_vnum_u64 (p0, x0, -9)) ++ ++/* ++** ldnf1ub_vnum_u64_x1: ++** cntd (x[0-9]+) ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldnf1b z0\.d, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldnf1ub_vnum_u64_x1, svuint64_t, uint8_t, ++ z0 = svldnf1ub_vnum_u64 (p0, x0, x1), ++ z0 = svldnf1ub_vnum_u64 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1uh_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1uh_s32.c +new file mode 100644 +index 000000000..ed9686c4e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1uh_s32.c +@@ -0,0 +1,154 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldnf1uh_s32_base: ++** ldnf1h z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_s32_base, svint32_t, uint16_t, ++ z0 = svldnf1uh_s32 (p0, x0), ++ z0 = svldnf1uh_s32 (p0, x0)) ++ ++/* ++** ldnf1uh_s32_index: ++** add (x[0-9]+), x0, x1, lsl 1 ++** ldnf1h z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_s32_index, svint32_t, uint16_t, ++ z0 = svldnf1uh_s32 (p0, x0 + x1), ++ z0 = svldnf1uh_s32 (p0, x0 + x1)) ++ ++/* ++** ldnf1uh_s32_1: ++** ldnf1h z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_s32_1, svint32_t, uint16_t, ++ z0 = svldnf1uh_s32 (p0, x0 + svcntw ()), ++ z0 = svldnf1uh_s32 (p0, x0 + svcntw ())) ++ ++/* ++** ldnf1uh_s32_7: ++** ldnf1h z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_s32_7, svint32_t, uint16_t, ++ z0 = svldnf1uh_s32 (p0, x0 + svcntw () * 7), ++ z0 = svldnf1uh_s32 (p0, x0 + svcntw () * 7)) ++ ++/* ++** ldnf1uh_s32_8: ++** incb x0, all, mul #4 ++** ldnf1h z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_s32_8, svint32_t, uint16_t, ++ z0 = svldnf1uh_s32 (p0, x0 + svcntw () * 8), ++ z0 = svldnf1uh_s32 (p0, x0 + svcntw () * 8)) ++ ++/* ++** ldnf1uh_s32_m1: ++** ldnf1h z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_s32_m1, svint32_t, uint16_t, ++ z0 = svldnf1uh_s32 (p0, x0 - svcntw ()), ++ z0 = svldnf1uh_s32 (p0, x0 - svcntw ())) ++ ++/* ++** ldnf1uh_s32_m8: ++** ldnf1h z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_s32_m8, svint32_t, uint16_t, ++ z0 = svldnf1uh_s32 (p0, x0 - svcntw () * 8), ++ z0 = svldnf1uh_s32 (p0, x0 - svcntw () * 8)) ++ ++/* ++** ldnf1uh_s32_m9: ++** dech x0, all, mul #9 ++** ldnf1h z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_s32_m9, svint32_t, uint16_t, ++ z0 = svldnf1uh_s32 (p0, x0 - svcntw () * 9), ++ z0 = svldnf1uh_s32 (p0, x0 - svcntw () * 9)) ++ ++/* ++** ldnf1uh_vnum_s32_0: ++** ldnf1h z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_vnum_s32_0, svint32_t, uint16_t, ++ z0 = svldnf1uh_vnum_s32 (p0, x0, 0), ++ z0 = svldnf1uh_vnum_s32 (p0, x0, 0)) ++ ++/* ++** ldnf1uh_vnum_s32_1: ++** ldnf1h z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_vnum_s32_1, svint32_t, uint16_t, ++ z0 = svldnf1uh_vnum_s32 (p0, x0, 1), ++ z0 = svldnf1uh_vnum_s32 (p0, x0, 1)) ++ ++/* ++** ldnf1uh_vnum_s32_7: ++** ldnf1h z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_vnum_s32_7, svint32_t, uint16_t, ++ z0 = svldnf1uh_vnum_s32 (p0, x0, 7), ++ z0 = svldnf1uh_vnum_s32 (p0, x0, 7)) ++ ++/* ++** ldnf1uh_vnum_s32_8: ++** incb x0, all, mul #4 ++** ldnf1h z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_vnum_s32_8, svint32_t, uint16_t, ++ z0 = svldnf1uh_vnum_s32 (p0, x0, 8), ++ z0 = svldnf1uh_vnum_s32 (p0, x0, 8)) ++ ++/* ++** ldnf1uh_vnum_s32_m1: ++** ldnf1h z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_vnum_s32_m1, svint32_t, uint16_t, ++ z0 = svldnf1uh_vnum_s32 (p0, x0, -1), ++ z0 = svldnf1uh_vnum_s32 (p0, x0, -1)) ++ ++/* ++** ldnf1uh_vnum_s32_m8: ++** ldnf1h z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_vnum_s32_m8, svint32_t, uint16_t, ++ z0 = svldnf1uh_vnum_s32 (p0, x0, -8), ++ z0 = svldnf1uh_vnum_s32 (p0, x0, -8)) ++ ++/* ++** ldnf1uh_vnum_s32_m9: ++** dech x0, all, mul #9 ++** ldnf1h z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_vnum_s32_m9, svint32_t, uint16_t, ++ z0 = svldnf1uh_vnum_s32 (p0, x0, -9), ++ z0 = svldnf1uh_vnum_s32 (p0, x0, -9)) ++ ++/* ++** ldnf1uh_vnum_s32_x1: ++** cnth (x[0-9]+) ++** 
madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldnf1h z0\.s, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_vnum_s32_x1, svint32_t, uint16_t, ++ z0 = svldnf1uh_vnum_s32 (p0, x0, x1), ++ z0 = svldnf1uh_vnum_s32 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1uh_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1uh_s64.c +new file mode 100644 +index 000000000..a3107f562 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1uh_s64.c +@@ -0,0 +1,154 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldnf1uh_s64_base: ++** ldnf1h z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_s64_base, svint64_t, uint16_t, ++ z0 = svldnf1uh_s64 (p0, x0), ++ z0 = svldnf1uh_s64 (p0, x0)) ++ ++/* ++** ldnf1uh_s64_index: ++** add (x[0-9]+), x0, x1, lsl 1 ++** ldnf1h z0\.d, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_s64_index, svint64_t, uint16_t, ++ z0 = svldnf1uh_s64 (p0, x0 + x1), ++ z0 = svldnf1uh_s64 (p0, x0 + x1)) ++ ++/* ++** ldnf1uh_s64_1: ++** ldnf1h z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_s64_1, svint64_t, uint16_t, ++ z0 = svldnf1uh_s64 (p0, x0 + svcntd ()), ++ z0 = svldnf1uh_s64 (p0, x0 + svcntd ())) ++ ++/* ++** ldnf1uh_s64_7: ++** ldnf1h z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_s64_7, svint64_t, uint16_t, ++ z0 = svldnf1uh_s64 (p0, x0 + svcntd () * 7), ++ z0 = svldnf1uh_s64 (p0, x0 + svcntd () * 7)) ++ ++/* ++** ldnf1uh_s64_8: ++** incb x0, all, mul #2 ++** ldnf1h z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_s64_8, svint64_t, uint16_t, ++ z0 = svldnf1uh_s64 (p0, x0 + svcntd () * 8), ++ z0 = svldnf1uh_s64 (p0, x0 + svcntd () * 8)) ++ ++/* ++** ldnf1uh_s64_m1: ++** ldnf1h z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_s64_m1, svint64_t, uint16_t, ++ z0 = svldnf1uh_s64 (p0, x0 - svcntd ()), ++ z0 = svldnf1uh_s64 (p0, x0 - svcntd ())) ++ ++/* ++** ldnf1uh_s64_m8: ++** ldnf1h z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_s64_m8, svint64_t, uint16_t, ++ z0 = svldnf1uh_s64 (p0, x0 - svcntd () * 8), ++ z0 = svldnf1uh_s64 (p0, x0 - svcntd () * 8)) ++ ++/* ++** ldnf1uh_s64_m9: ++** decw x0, all, mul #9 ++** ldnf1h z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_s64_m9, svint64_t, uint16_t, ++ z0 = svldnf1uh_s64 (p0, x0 - svcntd () * 9), ++ z0 = svldnf1uh_s64 (p0, x0 - svcntd () * 9)) ++ ++/* ++** ldnf1uh_vnum_s64_0: ++** ldnf1h z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_vnum_s64_0, svint64_t, uint16_t, ++ z0 = svldnf1uh_vnum_s64 (p0, x0, 0), ++ z0 = svldnf1uh_vnum_s64 (p0, x0, 0)) ++ ++/* ++** ldnf1uh_vnum_s64_1: ++** ldnf1h z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_vnum_s64_1, svint64_t, uint16_t, ++ z0 = svldnf1uh_vnum_s64 (p0, x0, 1), ++ z0 = svldnf1uh_vnum_s64 (p0, x0, 1)) ++ ++/* ++** ldnf1uh_vnum_s64_7: ++** ldnf1h z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_vnum_s64_7, svint64_t, uint16_t, ++ z0 = svldnf1uh_vnum_s64 (p0, x0, 7), ++ z0 = svldnf1uh_vnum_s64 (p0, x0, 7)) ++ ++/* ++** ldnf1uh_vnum_s64_8: ++** incb x0, all, mul #2 ++** ldnf1h z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_vnum_s64_8, svint64_t, uint16_t, ++ z0 = svldnf1uh_vnum_s64 (p0, x0, 8), ++ z0 = svldnf1uh_vnum_s64 (p0, x0, 8)) ++ ++/* ++** ldnf1uh_vnum_s64_m1: ++** ldnf1h z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_vnum_s64_m1, svint64_t, uint16_t, ++ z0 = svldnf1uh_vnum_s64 (p0, 
x0, -1), ++ z0 = svldnf1uh_vnum_s64 (p0, x0, -1)) ++ ++/* ++** ldnf1uh_vnum_s64_m8: ++** ldnf1h z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_vnum_s64_m8, svint64_t, uint16_t, ++ z0 = svldnf1uh_vnum_s64 (p0, x0, -8), ++ z0 = svldnf1uh_vnum_s64 (p0, x0, -8)) ++ ++/* ++** ldnf1uh_vnum_s64_m9: ++** decw x0, all, mul #9 ++** ldnf1h z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_vnum_s64_m9, svint64_t, uint16_t, ++ z0 = svldnf1uh_vnum_s64 (p0, x0, -9), ++ z0 = svldnf1uh_vnum_s64 (p0, x0, -9)) ++ ++/* ++** ldnf1uh_vnum_s64_x1: ++** cntw (x[0-9]+) ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldnf1h z0\.d, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_vnum_s64_x1, svint64_t, uint16_t, ++ z0 = svldnf1uh_vnum_s64 (p0, x0, x1), ++ z0 = svldnf1uh_vnum_s64 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1uh_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1uh_u32.c +new file mode 100644 +index 000000000..93d5abaf7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1uh_u32.c +@@ -0,0 +1,154 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldnf1uh_u32_base: ++** ldnf1h z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_u32_base, svuint32_t, uint16_t, ++ z0 = svldnf1uh_u32 (p0, x0), ++ z0 = svldnf1uh_u32 (p0, x0)) ++ ++/* ++** ldnf1uh_u32_index: ++** add (x[0-9]+), x0, x1, lsl 1 ++** ldnf1h z0\.s, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_u32_index, svuint32_t, uint16_t, ++ z0 = svldnf1uh_u32 (p0, x0 + x1), ++ z0 = svldnf1uh_u32 (p0, x0 + x1)) ++ ++/* ++** ldnf1uh_u32_1: ++** ldnf1h z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_u32_1, svuint32_t, uint16_t, ++ z0 = svldnf1uh_u32 (p0, x0 + svcntw ()), ++ z0 = svldnf1uh_u32 (p0, x0 + svcntw ())) ++ ++/* ++** ldnf1uh_u32_7: ++** ldnf1h z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_u32_7, svuint32_t, uint16_t, ++ z0 = svldnf1uh_u32 (p0, x0 + svcntw () * 7), ++ z0 = svldnf1uh_u32 (p0, x0 + svcntw () * 7)) ++ ++/* ++** ldnf1uh_u32_8: ++** incb x0, all, mul #4 ++** ldnf1h z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_u32_8, svuint32_t, uint16_t, ++ z0 = svldnf1uh_u32 (p0, x0 + svcntw () * 8), ++ z0 = svldnf1uh_u32 (p0, x0 + svcntw () * 8)) ++ ++/* ++** ldnf1uh_u32_m1: ++** ldnf1h z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_u32_m1, svuint32_t, uint16_t, ++ z0 = svldnf1uh_u32 (p0, x0 - svcntw ()), ++ z0 = svldnf1uh_u32 (p0, x0 - svcntw ())) ++ ++/* ++** ldnf1uh_u32_m8: ++** ldnf1h z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_u32_m8, svuint32_t, uint16_t, ++ z0 = svldnf1uh_u32 (p0, x0 - svcntw () * 8), ++ z0 = svldnf1uh_u32 (p0, x0 - svcntw () * 8)) ++ ++/* ++** ldnf1uh_u32_m9: ++** dech x0, all, mul #9 ++** ldnf1h z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_u32_m9, svuint32_t, uint16_t, ++ z0 = svldnf1uh_u32 (p0, x0 - svcntw () * 9), ++ z0 = svldnf1uh_u32 (p0, x0 - svcntw () * 9)) ++ ++/* ++** ldnf1uh_vnum_u32_0: ++** ldnf1h z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_vnum_u32_0, svuint32_t, uint16_t, ++ z0 = svldnf1uh_vnum_u32 (p0, x0, 0), ++ z0 = svldnf1uh_vnum_u32 (p0, x0, 0)) ++ ++/* ++** ldnf1uh_vnum_u32_1: ++** ldnf1h z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_vnum_u32_1, svuint32_t, uint16_t, ++ z0 = svldnf1uh_vnum_u32 (p0, x0, 1), ++ z0 = svldnf1uh_vnum_u32 (p0, x0, 1)) ++ ++/* ++** ldnf1uh_vnum_u32_7: ++** ldnf1h z0\.s, 
p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_vnum_u32_7, svuint32_t, uint16_t, ++ z0 = svldnf1uh_vnum_u32 (p0, x0, 7), ++ z0 = svldnf1uh_vnum_u32 (p0, x0, 7)) ++ ++/* ++** ldnf1uh_vnum_u32_8: ++** incb x0, all, mul #4 ++** ldnf1h z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_vnum_u32_8, svuint32_t, uint16_t, ++ z0 = svldnf1uh_vnum_u32 (p0, x0, 8), ++ z0 = svldnf1uh_vnum_u32 (p0, x0, 8)) ++ ++/* ++** ldnf1uh_vnum_u32_m1: ++** ldnf1h z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_vnum_u32_m1, svuint32_t, uint16_t, ++ z0 = svldnf1uh_vnum_u32 (p0, x0, -1), ++ z0 = svldnf1uh_vnum_u32 (p0, x0, -1)) ++ ++/* ++** ldnf1uh_vnum_u32_m8: ++** ldnf1h z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_vnum_u32_m8, svuint32_t, uint16_t, ++ z0 = svldnf1uh_vnum_u32 (p0, x0, -8), ++ z0 = svldnf1uh_vnum_u32 (p0, x0, -8)) ++ ++/* ++** ldnf1uh_vnum_u32_m9: ++** dech x0, all, mul #9 ++** ldnf1h z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_vnum_u32_m9, svuint32_t, uint16_t, ++ z0 = svldnf1uh_vnum_u32 (p0, x0, -9), ++ z0 = svldnf1uh_vnum_u32 (p0, x0, -9)) ++ ++/* ++** ldnf1uh_vnum_u32_x1: ++** cnth (x[0-9]+) ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldnf1h z0\.s, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_vnum_u32_x1, svuint32_t, uint16_t, ++ z0 = svldnf1uh_vnum_u32 (p0, x0, x1), ++ z0 = svldnf1uh_vnum_u32 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1uh_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1uh_u64.c +new file mode 100644 +index 000000000..32d36a84c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1uh_u64.c +@@ -0,0 +1,154 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldnf1uh_u64_base: ++** ldnf1h z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_u64_base, svuint64_t, uint16_t, ++ z0 = svldnf1uh_u64 (p0, x0), ++ z0 = svldnf1uh_u64 (p0, x0)) ++ ++/* ++** ldnf1uh_u64_index: ++** add (x[0-9]+), x0, x1, lsl 1 ++** ldnf1h z0\.d, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_u64_index, svuint64_t, uint16_t, ++ z0 = svldnf1uh_u64 (p0, x0 + x1), ++ z0 = svldnf1uh_u64 (p0, x0 + x1)) ++ ++/* ++** ldnf1uh_u64_1: ++** ldnf1h z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_u64_1, svuint64_t, uint16_t, ++ z0 = svldnf1uh_u64 (p0, x0 + svcntd ()), ++ z0 = svldnf1uh_u64 (p0, x0 + svcntd ())) ++ ++/* ++** ldnf1uh_u64_7: ++** ldnf1h z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_u64_7, svuint64_t, uint16_t, ++ z0 = svldnf1uh_u64 (p0, x0 + svcntd () * 7), ++ z0 = svldnf1uh_u64 (p0, x0 + svcntd () * 7)) ++ ++/* ++** ldnf1uh_u64_8: ++** incb x0, all, mul #2 ++** ldnf1h z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_u64_8, svuint64_t, uint16_t, ++ z0 = svldnf1uh_u64 (p0, x0 + svcntd () * 8), ++ z0 = svldnf1uh_u64 (p0, x0 + svcntd () * 8)) ++ ++/* ++** ldnf1uh_u64_m1: ++** ldnf1h z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_u64_m1, svuint64_t, uint16_t, ++ z0 = svldnf1uh_u64 (p0, x0 - svcntd ()), ++ z0 = svldnf1uh_u64 (p0, x0 - svcntd ())) ++ ++/* ++** ldnf1uh_u64_m8: ++** ldnf1h z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_u64_m8, svuint64_t, uint16_t, ++ z0 = svldnf1uh_u64 (p0, x0 - svcntd () * 8), ++ z0 = svldnf1uh_u64 (p0, x0 - svcntd () * 8)) ++ ++/* ++** ldnf1uh_u64_m9: ++** decw x0, all, mul #9 ++** ldnf1h z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_u64_m9, svuint64_t, 
uint16_t, ++ z0 = svldnf1uh_u64 (p0, x0 - svcntd () * 9), ++ z0 = svldnf1uh_u64 (p0, x0 - svcntd () * 9)) ++ ++/* ++** ldnf1uh_vnum_u64_0: ++** ldnf1h z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_vnum_u64_0, svuint64_t, uint16_t, ++ z0 = svldnf1uh_vnum_u64 (p0, x0, 0), ++ z0 = svldnf1uh_vnum_u64 (p0, x0, 0)) ++ ++/* ++** ldnf1uh_vnum_u64_1: ++** ldnf1h z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_vnum_u64_1, svuint64_t, uint16_t, ++ z0 = svldnf1uh_vnum_u64 (p0, x0, 1), ++ z0 = svldnf1uh_vnum_u64 (p0, x0, 1)) ++ ++/* ++** ldnf1uh_vnum_u64_7: ++** ldnf1h z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_vnum_u64_7, svuint64_t, uint16_t, ++ z0 = svldnf1uh_vnum_u64 (p0, x0, 7), ++ z0 = svldnf1uh_vnum_u64 (p0, x0, 7)) ++ ++/* ++** ldnf1uh_vnum_u64_8: ++** incb x0, all, mul #2 ++** ldnf1h z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_vnum_u64_8, svuint64_t, uint16_t, ++ z0 = svldnf1uh_vnum_u64 (p0, x0, 8), ++ z0 = svldnf1uh_vnum_u64 (p0, x0, 8)) ++ ++/* ++** ldnf1uh_vnum_u64_m1: ++** ldnf1h z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_vnum_u64_m1, svuint64_t, uint16_t, ++ z0 = svldnf1uh_vnum_u64 (p0, x0, -1), ++ z0 = svldnf1uh_vnum_u64 (p0, x0, -1)) ++ ++/* ++** ldnf1uh_vnum_u64_m8: ++** ldnf1h z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_vnum_u64_m8, svuint64_t, uint16_t, ++ z0 = svldnf1uh_vnum_u64 (p0, x0, -8), ++ z0 = svldnf1uh_vnum_u64 (p0, x0, -8)) ++ ++/* ++** ldnf1uh_vnum_u64_m9: ++** decw x0, all, mul #9 ++** ldnf1h z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_vnum_u64_m9, svuint64_t, uint16_t, ++ z0 = svldnf1uh_vnum_u64 (p0, x0, -9), ++ z0 = svldnf1uh_vnum_u64 (p0, x0, -9)) ++ ++/* ++** ldnf1uh_vnum_u64_x1: ++** cntw (x[0-9]+) ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldnf1h z0\.d, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldnf1uh_vnum_u64_x1, svuint64_t, uint16_t, ++ z0 = svldnf1uh_vnum_u64 (p0, x0, x1), ++ z0 = svldnf1uh_vnum_u64 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1uw_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1uw_s64.c +new file mode 100644 +index 000000000..373922791 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1uw_s64.c +@@ -0,0 +1,154 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldnf1uw_s64_base: ++** ldnf1w z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1uw_s64_base, svint64_t, uint32_t, ++ z0 = svldnf1uw_s64 (p0, x0), ++ z0 = svldnf1uw_s64 (p0, x0)) ++ ++/* ++** ldnf1uw_s64_index: ++** add (x[0-9]+), x0, x1, lsl 2 ++** ldnf1w z0\.d, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ldnf1uw_s64_index, svint64_t, uint32_t, ++ z0 = svldnf1uw_s64 (p0, x0 + x1), ++ z0 = svldnf1uw_s64 (p0, x0 + x1)) ++ ++/* ++** ldnf1uw_s64_1: ++** ldnf1w z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1uw_s64_1, svint64_t, uint32_t, ++ z0 = svldnf1uw_s64 (p0, x0 + svcntd ()), ++ z0 = svldnf1uw_s64 (p0, x0 + svcntd ())) ++ ++/* ++** ldnf1uw_s64_7: ++** ldnf1w z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1uw_s64_7, svint64_t, uint32_t, ++ z0 = svldnf1uw_s64 (p0, x0 + svcntd () * 7), ++ z0 = svldnf1uw_s64 (p0, x0 + svcntd () * 7)) ++ ++/* ++** ldnf1uw_s64_8: ++** incb x0, all, mul #4 ++** ldnf1w z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1uw_s64_8, svint64_t, uint32_t, ++ z0 = svldnf1uw_s64 (p0, x0 + svcntd () * 8), ++ z0 = svldnf1uw_s64 (p0, x0 + svcntd () * 8)) ++ ++/* ++** ldnf1uw_s64_m1: ++** ldnf1w z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1uw_s64_m1, svint64_t, uint32_t, ++ z0 = svldnf1uw_s64 (p0, x0 - svcntd ()), ++ z0 = svldnf1uw_s64 (p0, x0 - svcntd ())) ++ ++/* ++** ldnf1uw_s64_m8: ++** ldnf1w z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1uw_s64_m8, svint64_t, uint32_t, ++ z0 = svldnf1uw_s64 (p0, x0 - svcntd () * 8), ++ z0 = svldnf1uw_s64 (p0, x0 - svcntd () * 8)) ++ ++/* ++** ldnf1uw_s64_m9: ++** dech x0, all, mul #9 ++** ldnf1w z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1uw_s64_m9, svint64_t, uint32_t, ++ z0 = svldnf1uw_s64 (p0, x0 - svcntd () * 9), ++ z0 = svldnf1uw_s64 (p0, x0 - svcntd () * 9)) ++ ++/* ++** ldnf1uw_vnum_s64_0: ++** ldnf1w z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1uw_vnum_s64_0, svint64_t, uint32_t, ++ z0 = svldnf1uw_vnum_s64 (p0, x0, 0), ++ z0 = svldnf1uw_vnum_s64 (p0, x0, 0)) ++ ++/* ++** ldnf1uw_vnum_s64_1: ++** ldnf1w z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1uw_vnum_s64_1, svint64_t, uint32_t, ++ z0 = svldnf1uw_vnum_s64 (p0, x0, 1), ++ z0 = svldnf1uw_vnum_s64 (p0, x0, 1)) ++ ++/* ++** ldnf1uw_vnum_s64_7: ++** ldnf1w z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1uw_vnum_s64_7, svint64_t, uint32_t, ++ z0 = svldnf1uw_vnum_s64 (p0, x0, 7), ++ z0 = svldnf1uw_vnum_s64 (p0, x0, 7)) ++ ++/* ++** ldnf1uw_vnum_s64_8: ++** incb x0, all, mul #4 ++** ldnf1w z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1uw_vnum_s64_8, svint64_t, uint32_t, ++ z0 = svldnf1uw_vnum_s64 (p0, x0, 8), ++ z0 = svldnf1uw_vnum_s64 (p0, x0, 8)) ++ ++/* ++** ldnf1uw_vnum_s64_m1: ++** ldnf1w z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1uw_vnum_s64_m1, svint64_t, uint32_t, ++ z0 = svldnf1uw_vnum_s64 (p0, x0, -1), ++ z0 = svldnf1uw_vnum_s64 (p0, x0, -1)) ++ ++/* ++** ldnf1uw_vnum_s64_m8: ++** ldnf1w z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1uw_vnum_s64_m8, svint64_t, uint32_t, ++ z0 = svldnf1uw_vnum_s64 (p0, x0, -8), ++ z0 = svldnf1uw_vnum_s64 (p0, x0, -8)) ++ ++/* ++** ldnf1uw_vnum_s64_m9: ++** dech x0, all, mul #9 ++** ldnf1w z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1uw_vnum_s64_m9, svint64_t, uint32_t, ++ z0 = svldnf1uw_vnum_s64 (p0, x0, -9), ++ z0 = svldnf1uw_vnum_s64 (p0, x0, -9)) ++ ++/* ++** ldnf1uw_vnum_s64_x1: ++** cnth (x[0-9]+) ++** 
madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldnf1w z0\.d, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldnf1uw_vnum_s64_x1, svint64_t, uint32_t, ++ z0 = svldnf1uw_vnum_s64 (p0, x0, x1), ++ z0 = svldnf1uw_vnum_s64 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1uw_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1uw_u64.c +new file mode 100644 +index 000000000..b3c3be1d0 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1uw_u64.c +@@ -0,0 +1,154 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldnf1uw_u64_base: ++** ldnf1w z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1uw_u64_base, svuint64_t, uint32_t, ++ z0 = svldnf1uw_u64 (p0, x0), ++ z0 = svldnf1uw_u64 (p0, x0)) ++ ++/* ++** ldnf1uw_u64_index: ++** add (x[0-9]+), x0, x1, lsl 2 ++** ldnf1w z0\.d, p0/z, \[\1\] ++** ret ++*/ ++TEST_LOAD (ldnf1uw_u64_index, svuint64_t, uint32_t, ++ z0 = svldnf1uw_u64 (p0, x0 + x1), ++ z0 = svldnf1uw_u64 (p0, x0 + x1)) ++ ++/* ++** ldnf1uw_u64_1: ++** ldnf1w z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1uw_u64_1, svuint64_t, uint32_t, ++ z0 = svldnf1uw_u64 (p0, x0 + svcntd ()), ++ z0 = svldnf1uw_u64 (p0, x0 + svcntd ())) ++ ++/* ++** ldnf1uw_u64_7: ++** ldnf1w z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1uw_u64_7, svuint64_t, uint32_t, ++ z0 = svldnf1uw_u64 (p0, x0 + svcntd () * 7), ++ z0 = svldnf1uw_u64 (p0, x0 + svcntd () * 7)) ++ ++/* ++** ldnf1uw_u64_8: ++** incb x0, all, mul #4 ++** ldnf1w z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1uw_u64_8, svuint64_t, uint32_t, ++ z0 = svldnf1uw_u64 (p0, x0 + svcntd () * 8), ++ z0 = svldnf1uw_u64 (p0, x0 + svcntd () * 8)) ++ ++/* ++** ldnf1uw_u64_m1: ++** ldnf1w z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1uw_u64_m1, svuint64_t, uint32_t, ++ z0 = svldnf1uw_u64 (p0, x0 - svcntd ()), ++ z0 = svldnf1uw_u64 (p0, x0 - svcntd ())) ++ ++/* ++** ldnf1uw_u64_m8: ++** ldnf1w z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1uw_u64_m8, svuint64_t, uint32_t, ++ z0 = svldnf1uw_u64 (p0, x0 - svcntd () * 8), ++ z0 = svldnf1uw_u64 (p0, x0 - svcntd () * 8)) ++ ++/* ++** ldnf1uw_u64_m9: ++** dech x0, all, mul #9 ++** ldnf1w z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1uw_u64_m9, svuint64_t, uint32_t, ++ z0 = svldnf1uw_u64 (p0, x0 - svcntd () * 9), ++ z0 = svldnf1uw_u64 (p0, x0 - svcntd () * 9)) ++ ++/* ++** ldnf1uw_vnum_u64_0: ++** ldnf1w z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1uw_vnum_u64_0, svuint64_t, uint32_t, ++ z0 = svldnf1uw_vnum_u64 (p0, x0, 0), ++ z0 = svldnf1uw_vnum_u64 (p0, x0, 0)) ++ ++/* ++** ldnf1uw_vnum_u64_1: ++** ldnf1w z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1uw_vnum_u64_1, svuint64_t, uint32_t, ++ z0 = svldnf1uw_vnum_u64 (p0, x0, 1), ++ z0 = svldnf1uw_vnum_u64 (p0, x0, 1)) ++ ++/* ++** ldnf1uw_vnum_u64_7: ++** ldnf1w z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1uw_vnum_u64_7, svuint64_t, uint32_t, ++ z0 = svldnf1uw_vnum_u64 (p0, x0, 7), ++ z0 = svldnf1uw_vnum_u64 (p0, x0, 7)) ++ ++/* ++** ldnf1uw_vnum_u64_8: ++** incb x0, all, mul #4 ++** ldnf1w z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1uw_vnum_u64_8, svuint64_t, uint32_t, ++ z0 = svldnf1uw_vnum_u64 (p0, x0, 8), ++ z0 = svldnf1uw_vnum_u64 (p0, x0, 8)) ++ ++/* ++** ldnf1uw_vnum_u64_m1: ++** ldnf1w z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1uw_vnum_u64_m1, svuint64_t, uint32_t, ++ z0 = 
svldnf1uw_vnum_u64 (p0, x0, -1), ++ z0 = svldnf1uw_vnum_u64 (p0, x0, -1)) ++ ++/* ++** ldnf1uw_vnum_u64_m8: ++** ldnf1w z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnf1uw_vnum_u64_m8, svuint64_t, uint32_t, ++ z0 = svldnf1uw_vnum_u64 (p0, x0, -8), ++ z0 = svldnf1uw_vnum_u64 (p0, x0, -8)) ++ ++/* ++** ldnf1uw_vnum_u64_m9: ++** dech x0, all, mul #9 ++** ldnf1w z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnf1uw_vnum_u64_m9, svuint64_t, uint32_t, ++ z0 = svldnf1uw_vnum_u64 (p0, x0, -9), ++ z0 = svldnf1uw_vnum_u64 (p0, x0, -9)) ++ ++/* ++** ldnf1uw_vnum_u64_x1: ++** cnth (x[0-9]+) ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldnf1w z0\.d, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldnf1uw_vnum_u64_x1, svuint64_t, uint32_t, ++ z0 = svldnf1uw_vnum_u64 (p0, x0, x1), ++ z0 = svldnf1uw_vnum_u64 (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_bf16.c +new file mode 100644 +index 000000000..b083901fa +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_bf16.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldnt1_bf16_base: ++** ldnt1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_bf16_base, svbfloat16_t, bfloat16_t, ++ z0 = svldnt1_bf16 (p0, x0), ++ z0 = svldnt1 (p0, x0)) ++ ++/* ++** ldnt1_bf16_index: ++** ldnt1h z0\.h, p0/z, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_LOAD (ldnt1_bf16_index, svbfloat16_t, bfloat16_t, ++ z0 = svldnt1_bf16 (p0, x0 + x1), ++ z0 = svldnt1 (p0, x0 + x1)) ++ ++/* ++** ldnt1_bf16_1: ++** ldnt1h z0\.h, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_bf16_1, svbfloat16_t, bfloat16_t, ++ z0 = svldnt1_bf16 (p0, x0 + svcnth ()), ++ z0 = svldnt1 (p0, x0 + svcnth ())) ++ ++/* ++** ldnt1_bf16_7: ++** ldnt1h z0\.h, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_bf16_7, svbfloat16_t, bfloat16_t, ++ z0 = svldnt1_bf16 (p0, x0 + svcnth () * 7), ++ z0 = svldnt1 (p0, x0 + svcnth () * 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldnt1_bf16_8: ++** incb x0, all, mul #8 ++** ldnt1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_bf16_8, svbfloat16_t, bfloat16_t, ++ z0 = svldnt1_bf16 (p0, x0 + svcnth () * 8), ++ z0 = svldnt1 (p0, x0 + svcnth () * 8)) ++ ++/* ++** ldnt1_bf16_m1: ++** ldnt1h z0\.h, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_bf16_m1, svbfloat16_t, bfloat16_t, ++ z0 = svldnt1_bf16 (p0, x0 - svcnth ()), ++ z0 = svldnt1 (p0, x0 - svcnth ())) ++ ++/* ++** ldnt1_bf16_m8: ++** ldnt1h z0\.h, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_bf16_m8, svbfloat16_t, bfloat16_t, ++ z0 = svldnt1_bf16 (p0, x0 - svcnth () * 8), ++ z0 = svldnt1 (p0, x0 - svcnth () * 8)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ldnt1_bf16_m9: ++** decb x0, all, mul #9 ++** ldnt1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_bf16_m9, svbfloat16_t, bfloat16_t, ++ z0 = svldnt1_bf16 (p0, x0 - svcnth () * 9), ++ z0 = svldnt1 (p0, x0 - svcnth () * 9)) ++ ++/* ++** ldnt1_vnum_bf16_0: ++** ldnt1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_bf16_0, svbfloat16_t, bfloat16_t, ++ z0 = svldnt1_vnum_bf16 (p0, x0, 0), ++ z0 = svldnt1_vnum (p0, x0, 0)) ++ ++/* ++** ldnt1_vnum_bf16_1: ++** ldnt1h z0\.h, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_bf16_1, svbfloat16_t, bfloat16_t, ++ z0 = svldnt1_vnum_bf16 (p0, x0, 1), ++ z0 = svldnt1_vnum (p0, x0, 1)) ++ ++/* ++** ldnt1_vnum_bf16_7: ++** ldnt1h z0\.h, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_bf16_7, svbfloat16_t, bfloat16_t, ++ z0 = svldnt1_vnum_bf16 (p0, x0, 7), ++ z0 = svldnt1_vnum (p0, x0, 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldnt1_vnum_bf16_8: ++** incb x0, all, mul #8 ++** ldnt1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_bf16_8, svbfloat16_t, bfloat16_t, ++ z0 = svldnt1_vnum_bf16 (p0, x0, 8), ++ z0 = svldnt1_vnum (p0, x0, 8)) ++ ++/* ++** ldnt1_vnum_bf16_m1: ++** ldnt1h z0\.h, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_bf16_m1, svbfloat16_t, bfloat16_t, ++ z0 = svldnt1_vnum_bf16 (p0, x0, -1), ++ z0 = svldnt1_vnum (p0, x0, -1)) ++ ++/* ++** ldnt1_vnum_bf16_m8: ++** ldnt1h z0\.h, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_bf16_m8, svbfloat16_t, bfloat16_t, ++ z0 = svldnt1_vnum_bf16 (p0, x0, -8), ++ z0 = svldnt1_vnum (p0, x0, -8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldnt1_vnum_bf16_m9: ++** decb x0, all, mul #9 ++** ldnt1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_bf16_m9, svbfloat16_t, bfloat16_t, ++ z0 = svldnt1_vnum_bf16 (p0, x0, -9), ++ z0 = svldnt1_vnum (p0, x0, -9)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ldnt1_vnum_bf16_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ldnt1h z0\.h, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_bf16_x1, svbfloat16_t, bfloat16_t, ++ z0 = svldnt1_vnum_bf16 (p0, x0, x1), ++ z0 = svldnt1_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_f16.c +new file mode 100644 +index 000000000..c98ab2da4 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_f16.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldnt1_f16_base: ++** ldnt1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_f16_base, svfloat16_t, float16_t, ++ z0 = svldnt1_f16 (p0, x0), ++ z0 = svldnt1 (p0, x0)) ++ ++/* ++** ldnt1_f16_index: ++** ldnt1h z0\.h, p0/z, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_LOAD (ldnt1_f16_index, svfloat16_t, float16_t, ++ z0 = svldnt1_f16 (p0, x0 + x1), ++ z0 = svldnt1 (p0, x0 + x1)) ++ ++/* ++** ldnt1_f16_1: ++** ldnt1h z0\.h, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_f16_1, svfloat16_t, float16_t, ++ z0 = svldnt1_f16 (p0, x0 + svcnth ()), ++ z0 = svldnt1 (p0, x0 + svcnth ())) ++ ++/* ++** ldnt1_f16_7: ++** ldnt1h z0\.h, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_f16_7, svfloat16_t, float16_t, ++ z0 = svldnt1_f16 (p0, x0 + svcnth () * 7), ++ z0 = svldnt1 (p0, x0 + svcnth () * 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldnt1_f16_8: ++** incb x0, all, mul #8 ++** ldnt1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_f16_8, svfloat16_t, float16_t, ++ z0 = svldnt1_f16 (p0, x0 + svcnth () * 8), ++ z0 = svldnt1 (p0, x0 + svcnth () * 8)) ++ ++/* ++** ldnt1_f16_m1: ++** ldnt1h z0\.h, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_f16_m1, svfloat16_t, float16_t, ++ z0 = svldnt1_f16 (p0, x0 - svcnth ()), ++ z0 = svldnt1 (p0, x0 - svcnth ())) ++ ++/* ++** ldnt1_f16_m8: ++** ldnt1h z0\.h, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_f16_m8, svfloat16_t, float16_t, ++ z0 = svldnt1_f16 (p0, x0 - svcnth () * 8), ++ z0 = svldnt1 (p0, x0 - svcnth () * 8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldnt1_f16_m9: ++** decb x0, all, mul #9 ++** ldnt1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_f16_m9, svfloat16_t, float16_t, ++ z0 = svldnt1_f16 (p0, x0 - svcnth () * 9), ++ z0 = svldnt1 (p0, x0 - svcnth () * 9)) ++ ++/* ++** ldnt1_vnum_f16_0: ++** ldnt1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_f16_0, svfloat16_t, float16_t, ++ z0 = svldnt1_vnum_f16 (p0, x0, 0), ++ z0 = svldnt1_vnum (p0, x0, 0)) ++ ++/* ++** ldnt1_vnum_f16_1: ++** ldnt1h z0\.h, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_f16_1, svfloat16_t, float16_t, ++ z0 = svldnt1_vnum_f16 (p0, x0, 1), ++ z0 = svldnt1_vnum (p0, x0, 1)) ++ ++/* ++** ldnt1_vnum_f16_7: ++** ldnt1h z0\.h, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_f16_7, svfloat16_t, float16_t, ++ z0 = svldnt1_vnum_f16 (p0, x0, 7), ++ z0 = svldnt1_vnum (p0, x0, 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldnt1_vnum_f16_8: ++** incb x0, all, mul #8 ++** ldnt1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_f16_8, svfloat16_t, float16_t, ++ z0 = svldnt1_vnum_f16 (p0, x0, 8), ++ z0 = svldnt1_vnum (p0, x0, 8)) ++ ++/* ++** ldnt1_vnum_f16_m1: ++** ldnt1h z0\.h, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_f16_m1, svfloat16_t, float16_t, ++ z0 = svldnt1_vnum_f16 (p0, x0, -1), ++ z0 = svldnt1_vnum (p0, x0, -1)) ++ ++/* ++** ldnt1_vnum_f16_m8: ++** ldnt1h z0\.h, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_f16_m8, svfloat16_t, float16_t, ++ z0 = svldnt1_vnum_f16 (p0, x0, -8), ++ z0 = svldnt1_vnum (p0, x0, -8)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ldnt1_vnum_f16_m9: ++** decb x0, all, mul #9 ++** ldnt1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_f16_m9, svfloat16_t, float16_t, ++ z0 = svldnt1_vnum_f16 (p0, x0, -9), ++ z0 = svldnt1_vnum (p0, x0, -9)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ldnt1_vnum_f16_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ldnt1h z0\.h, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_f16_x1, svfloat16_t, float16_t, ++ z0 = svldnt1_vnum_f16 (p0, x0, x1), ++ z0 = svldnt1_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_f32.c +new file mode 100644 +index 000000000..fb09a8a6d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_f32.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldnt1_f32_base: ++** ldnt1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_f32_base, svfloat32_t, float32_t, ++ z0 = svldnt1_f32 (p0, x0), ++ z0 = svldnt1 (p0, x0)) ++ ++/* ++** ldnt1_f32_index: ++** ldnt1w z0\.s, p0/z, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_LOAD (ldnt1_f32_index, svfloat32_t, float32_t, ++ z0 = svldnt1_f32 (p0, x0 + x1), ++ z0 = svldnt1 (p0, x0 + x1)) ++ ++/* ++** ldnt1_f32_1: ++** ldnt1w z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_f32_1, svfloat32_t, float32_t, ++ z0 = svldnt1_f32 (p0, x0 + svcntw ()), ++ z0 = svldnt1 (p0, x0 + svcntw ())) ++ ++/* ++** ldnt1_f32_7: ++** ldnt1w z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_f32_7, svfloat32_t, float32_t, ++ z0 = svldnt1_f32 (p0, x0 + svcntw () * 7), ++ z0 = svldnt1 (p0, x0 + svcntw () * 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldnt1_f32_8: ++** incb x0, all, mul #8 ++** ldnt1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_f32_8, svfloat32_t, float32_t, ++ z0 = svldnt1_f32 (p0, x0 + svcntw () * 8), ++ z0 = svldnt1 (p0, x0 + svcntw () * 8)) ++ ++/* ++** ldnt1_f32_m1: ++** ldnt1w z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_f32_m1, svfloat32_t, float32_t, ++ z0 = svldnt1_f32 (p0, x0 - svcntw ()), ++ z0 = svldnt1 (p0, x0 - svcntw ())) ++ ++/* ++** ldnt1_f32_m8: ++** ldnt1w z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_f32_m8, svfloat32_t, float32_t, ++ z0 = svldnt1_f32 (p0, x0 - svcntw () * 8), ++ z0 = svldnt1 (p0, x0 - svcntw () * 8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldnt1_f32_m9: ++** decb x0, all, mul #9 ++** ldnt1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_f32_m9, svfloat32_t, float32_t, ++ z0 = svldnt1_f32 (p0, x0 - svcntw () * 9), ++ z0 = svldnt1 (p0, x0 - svcntw () * 9)) ++ ++/* ++** ldnt1_vnum_f32_0: ++** ldnt1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_f32_0, svfloat32_t, float32_t, ++ z0 = svldnt1_vnum_f32 (p0, x0, 0), ++ z0 = svldnt1_vnum (p0, x0, 0)) ++ ++/* ++** ldnt1_vnum_f32_1: ++** ldnt1w z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_f32_1, svfloat32_t, float32_t, ++ z0 = svldnt1_vnum_f32 (p0, x0, 1), ++ z0 = svldnt1_vnum (p0, x0, 1)) ++ ++/* ++** ldnt1_vnum_f32_7: ++** ldnt1w z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_f32_7, svfloat32_t, float32_t, ++ z0 = svldnt1_vnum_f32 (p0, x0, 7), ++ z0 = svldnt1_vnum (p0, x0, 7)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ldnt1_vnum_f32_8: ++** incb x0, all, mul #8 ++** ldnt1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_f32_8, svfloat32_t, float32_t, ++ z0 = svldnt1_vnum_f32 (p0, x0, 8), ++ z0 = svldnt1_vnum (p0, x0, 8)) ++ ++/* ++** ldnt1_vnum_f32_m1: ++** ldnt1w z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_f32_m1, svfloat32_t, float32_t, ++ z0 = svldnt1_vnum_f32 (p0, x0, -1), ++ z0 = svldnt1_vnum (p0, x0, -1)) ++ ++/* ++** ldnt1_vnum_f32_m8: ++** ldnt1w z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_f32_m8, svfloat32_t, float32_t, ++ z0 = svldnt1_vnum_f32 (p0, x0, -8), ++ z0 = svldnt1_vnum (p0, x0, -8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldnt1_vnum_f32_m9: ++** decb x0, all, mul #9 ++** ldnt1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_f32_m9, svfloat32_t, float32_t, ++ z0 = svldnt1_vnum_f32 (p0, x0, -9), ++ z0 = svldnt1_vnum (p0, x0, -9)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ldnt1_vnum_f32_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ldnt1w z0\.s, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_f32_x1, svfloat32_t, float32_t, ++ z0 = svldnt1_vnum_f32 (p0, x0, x1), ++ z0 = svldnt1_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_f64.c +new file mode 100644 +index 000000000..2a7863282 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_f64.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldnt1_f64_base: ++** ldnt1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_f64_base, svfloat64_t, float64_t, ++ z0 = svldnt1_f64 (p0, x0), ++ z0 = svldnt1 (p0, x0)) ++ ++/* ++** ldnt1_f64_index: ++** ldnt1d z0\.d, p0/z, \[x0, x1, lsl 3\] ++** ret ++*/ ++TEST_LOAD (ldnt1_f64_index, svfloat64_t, float64_t, ++ z0 = svldnt1_f64 (p0, x0 + x1), ++ z0 = svldnt1 (p0, x0 + x1)) ++ ++/* ++** ldnt1_f64_1: ++** ldnt1d z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_f64_1, svfloat64_t, float64_t, ++ z0 = svldnt1_f64 (p0, x0 + svcntd ()), ++ z0 = svldnt1 (p0, x0 + svcntd ())) ++ ++/* ++** ldnt1_f64_7: ++** ldnt1d z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_f64_7, svfloat64_t, float64_t, ++ z0 = svldnt1_f64 (p0, x0 + svcntd () * 7), ++ z0 = svldnt1 (p0, x0 + svcntd () * 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldnt1_f64_8: ++** incb x0, all, mul #8 ++** ldnt1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_f64_8, svfloat64_t, float64_t, ++ z0 = svldnt1_f64 (p0, x0 + svcntd () * 8), ++ z0 = svldnt1 (p0, x0 + svcntd () * 8)) ++ ++/* ++** ldnt1_f64_m1: ++** ldnt1d z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_f64_m1, svfloat64_t, float64_t, ++ z0 = svldnt1_f64 (p0, x0 - svcntd ()), ++ z0 = svldnt1 (p0, x0 - svcntd ())) ++ ++/* ++** ldnt1_f64_m8: ++** ldnt1d z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_f64_m8, svfloat64_t, float64_t, ++ z0 = svldnt1_f64 (p0, x0 - svcntd () * 8), ++ z0 = svldnt1 (p0, x0 - svcntd () * 8)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ldnt1_f64_m9: ++** decb x0, all, mul #9 ++** ldnt1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_f64_m9, svfloat64_t, float64_t, ++ z0 = svldnt1_f64 (p0, x0 - svcntd () * 9), ++ z0 = svldnt1 (p0, x0 - svcntd () * 9)) ++ ++/* ++** ldnt1_vnum_f64_0: ++** ldnt1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_f64_0, svfloat64_t, float64_t, ++ z0 = svldnt1_vnum_f64 (p0, x0, 0), ++ z0 = svldnt1_vnum (p0, x0, 0)) ++ ++/* ++** ldnt1_vnum_f64_1: ++** ldnt1d z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_f64_1, svfloat64_t, float64_t, ++ z0 = svldnt1_vnum_f64 (p0, x0, 1), ++ z0 = svldnt1_vnum (p0, x0, 1)) ++ ++/* ++** ldnt1_vnum_f64_7: ++** ldnt1d z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_f64_7, svfloat64_t, float64_t, ++ z0 = svldnt1_vnum_f64 (p0, x0, 7), ++ z0 = svldnt1_vnum (p0, x0, 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldnt1_vnum_f64_8: ++** incb x0, all, mul #8 ++** ldnt1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_f64_8, svfloat64_t, float64_t, ++ z0 = svldnt1_vnum_f64 (p0, x0, 8), ++ z0 = svldnt1_vnum (p0, x0, 8)) ++ ++/* ++** ldnt1_vnum_f64_m1: ++** ldnt1d z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_f64_m1, svfloat64_t, float64_t, ++ z0 = svldnt1_vnum_f64 (p0, x0, -1), ++ z0 = svldnt1_vnum (p0, x0, -1)) ++ ++/* ++** ldnt1_vnum_f64_m8: ++** ldnt1d z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_f64_m8, svfloat64_t, float64_t, ++ z0 = svldnt1_vnum_f64 (p0, x0, -8), ++ z0 = svldnt1_vnum (p0, x0, -8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldnt1_vnum_f64_m9: ++** decb x0, all, mul #9 ++** ldnt1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_f64_m9, svfloat64_t, float64_t, ++ z0 = svldnt1_vnum_f64 (p0, x0, -9), ++ z0 = svldnt1_vnum (p0, x0, -9)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ldnt1_vnum_f64_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ldnt1d z0\.d, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_f64_x1, svfloat64_t, float64_t, ++ z0 = svldnt1_vnum_f64 (p0, x0, x1), ++ z0 = svldnt1_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_s16.c +new file mode 100644 +index 000000000..c307ed51f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_s16.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldnt1_s16_base: ++** ldnt1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_s16_base, svint16_t, int16_t, ++ z0 = svldnt1_s16 (p0, x0), ++ z0 = svldnt1 (p0, x0)) ++ ++/* ++** ldnt1_s16_index: ++** ldnt1h z0\.h, p0/z, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_LOAD (ldnt1_s16_index, svint16_t, int16_t, ++ z0 = svldnt1_s16 (p0, x0 + x1), ++ z0 = svldnt1 (p0, x0 + x1)) ++ ++/* ++** ldnt1_s16_1: ++** ldnt1h z0\.h, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_s16_1, svint16_t, int16_t, ++ z0 = svldnt1_s16 (p0, x0 + svcnth ()), ++ z0 = svldnt1 (p0, x0 + svcnth ())) ++ ++/* ++** ldnt1_s16_7: ++** ldnt1h z0\.h, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_s16_7, svint16_t, int16_t, ++ z0 = svldnt1_s16 (p0, x0 + svcnth () * 7), ++ z0 = svldnt1 (p0, x0 + svcnth () * 7)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ldnt1_s16_8: ++** incb x0, all, mul #8 ++** ldnt1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_s16_8, svint16_t, int16_t, ++ z0 = svldnt1_s16 (p0, x0 + svcnth () * 8), ++ z0 = svldnt1 (p0, x0 + svcnth () * 8)) ++ ++/* ++** ldnt1_s16_m1: ++** ldnt1h z0\.h, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_s16_m1, svint16_t, int16_t, ++ z0 = svldnt1_s16 (p0, x0 - svcnth ()), ++ z0 = svldnt1 (p0, x0 - svcnth ())) ++ ++/* ++** ldnt1_s16_m8: ++** ldnt1h z0\.h, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_s16_m8, svint16_t, int16_t, ++ z0 = svldnt1_s16 (p0, x0 - svcnth () * 8), ++ z0 = svldnt1 (p0, x0 - svcnth () * 8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldnt1_s16_m9: ++** decb x0, all, mul #9 ++** ldnt1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_s16_m9, svint16_t, int16_t, ++ z0 = svldnt1_s16 (p0, x0 - svcnth () * 9), ++ z0 = svldnt1 (p0, x0 - svcnth () * 9)) ++ ++/* ++** ldnt1_vnum_s16_0: ++** ldnt1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_s16_0, svint16_t, int16_t, ++ z0 = svldnt1_vnum_s16 (p0, x0, 0), ++ z0 = svldnt1_vnum (p0, x0, 0)) ++ ++/* ++** ldnt1_vnum_s16_1: ++** ldnt1h z0\.h, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_s16_1, svint16_t, int16_t, ++ z0 = svldnt1_vnum_s16 (p0, x0, 1), ++ z0 = svldnt1_vnum (p0, x0, 1)) ++ ++/* ++** ldnt1_vnum_s16_7: ++** ldnt1h z0\.h, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_s16_7, svint16_t, int16_t, ++ z0 = svldnt1_vnum_s16 (p0, x0, 7), ++ z0 = svldnt1_vnum (p0, x0, 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldnt1_vnum_s16_8: ++** incb x0, all, mul #8 ++** ldnt1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_s16_8, svint16_t, int16_t, ++ z0 = svldnt1_vnum_s16 (p0, x0, 8), ++ z0 = svldnt1_vnum (p0, x0, 8)) ++ ++/* ++** ldnt1_vnum_s16_m1: ++** ldnt1h z0\.h, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_s16_m1, svint16_t, int16_t, ++ z0 = svldnt1_vnum_s16 (p0, x0, -1), ++ z0 = svldnt1_vnum (p0, x0, -1)) ++ ++/* ++** ldnt1_vnum_s16_m8: ++** ldnt1h z0\.h, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_s16_m8, svint16_t, int16_t, ++ z0 = svldnt1_vnum_s16 (p0, x0, -8), ++ z0 = svldnt1_vnum (p0, x0, -8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldnt1_vnum_s16_m9: ++** decb x0, all, mul #9 ++** ldnt1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_s16_m9, svint16_t, int16_t, ++ z0 = svldnt1_vnum_s16 (p0, x0, -9), ++ z0 = svldnt1_vnum (p0, x0, -9)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ldnt1_vnum_s16_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ldnt1h z0\.h, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_s16_x1, svint16_t, int16_t, ++ z0 = svldnt1_vnum_s16 (p0, x0, x1), ++ z0 = svldnt1_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_s32.c +new file mode 100644 +index 000000000..2b9df1781 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_s32.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldnt1_s32_base: ++** ldnt1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_s32_base, svint32_t, int32_t, ++ z0 = svldnt1_s32 (p0, x0), ++ z0 = svldnt1 (p0, x0)) ++ ++/* ++** ldnt1_s32_index: ++** ldnt1w z0\.s, p0/z, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_LOAD (ldnt1_s32_index, svint32_t, int32_t, ++ z0 = svldnt1_s32 (p0, x0 + x1), ++ z0 = svldnt1 (p0, x0 + x1)) ++ ++/* ++** ldnt1_s32_1: ++** ldnt1w z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_s32_1, svint32_t, int32_t, ++ z0 = svldnt1_s32 (p0, x0 + svcntw ()), ++ z0 = svldnt1 (p0, x0 + svcntw ())) ++ ++/* ++** ldnt1_s32_7: ++** ldnt1w z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_s32_7, svint32_t, int32_t, ++ z0 = svldnt1_s32 (p0, x0 + svcntw () * 7), ++ z0 = svldnt1 (p0, x0 + svcntw () * 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldnt1_s32_8: ++** incb x0, all, mul #8 ++** ldnt1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_s32_8, svint32_t, int32_t, ++ z0 = svldnt1_s32 (p0, x0 + svcntw () * 8), ++ z0 = svldnt1 (p0, x0 + svcntw () * 8)) ++ ++/* ++** ldnt1_s32_m1: ++** ldnt1w z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_s32_m1, svint32_t, int32_t, ++ z0 = svldnt1_s32 (p0, x0 - svcntw ()), ++ z0 = svldnt1 (p0, x0 - svcntw ())) ++ ++/* ++** ldnt1_s32_m8: ++** ldnt1w z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_s32_m8, svint32_t, int32_t, ++ z0 = svldnt1_s32 (p0, x0 - svcntw () * 8), ++ z0 = svldnt1 (p0, x0 - svcntw () * 8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldnt1_s32_m9: ++** decb x0, all, mul #9 ++** ldnt1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_s32_m9, svint32_t, int32_t, ++ z0 = svldnt1_s32 (p0, x0 - svcntw () * 9), ++ z0 = svldnt1 (p0, x0 - svcntw () * 9)) ++ ++/* ++** ldnt1_vnum_s32_0: ++** ldnt1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_s32_0, svint32_t, int32_t, ++ z0 = svldnt1_vnum_s32 (p0, x0, 0), ++ z0 = svldnt1_vnum (p0, x0, 0)) ++ ++/* ++** ldnt1_vnum_s32_1: ++** ldnt1w z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_s32_1, svint32_t, int32_t, ++ z0 = svldnt1_vnum_s32 (p0, x0, 1), ++ z0 = svldnt1_vnum (p0, x0, 1)) ++ ++/* ++** ldnt1_vnum_s32_7: ++** ldnt1w z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_s32_7, svint32_t, int32_t, ++ z0 = svldnt1_vnum_s32 (p0, x0, 7), ++ z0 = svldnt1_vnum (p0, x0, 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldnt1_vnum_s32_8: ++** incb x0, all, mul #8 ++** ldnt1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_s32_8, svint32_t, int32_t, ++ z0 = svldnt1_vnum_s32 (p0, x0, 8), ++ z0 = svldnt1_vnum (p0, x0, 8)) ++ ++/* ++** ldnt1_vnum_s32_m1: ++** ldnt1w z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_s32_m1, svint32_t, int32_t, ++ z0 = svldnt1_vnum_s32 (p0, x0, -1), ++ z0 = svldnt1_vnum (p0, x0, -1)) ++ ++/* ++** ldnt1_vnum_s32_m8: ++** ldnt1w z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_s32_m8, svint32_t, int32_t, ++ z0 = svldnt1_vnum_s32 (p0, x0, -8), ++ z0 = svldnt1_vnum (p0, x0, -8)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ldnt1_vnum_s32_m9: ++** decb x0, all, mul #9 ++** ldnt1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_s32_m9, svint32_t, int32_t, ++ z0 = svldnt1_vnum_s32 (p0, x0, -9), ++ z0 = svldnt1_vnum (p0, x0, -9)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ldnt1_vnum_s32_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ldnt1w z0\.s, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_s32_x1, svint32_t, int32_t, ++ z0 = svldnt1_vnum_s32 (p0, x0, x1), ++ z0 = svldnt1_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_s64.c +new file mode 100644 +index 000000000..5bc7ac6ed +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_s64.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldnt1_s64_base: ++** ldnt1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_s64_base, svint64_t, int64_t, ++ z0 = svldnt1_s64 (p0, x0), ++ z0 = svldnt1 (p0, x0)) ++ ++/* ++** ldnt1_s64_index: ++** ldnt1d z0\.d, p0/z, \[x0, x1, lsl 3\] ++** ret ++*/ ++TEST_LOAD (ldnt1_s64_index, svint64_t, int64_t, ++ z0 = svldnt1_s64 (p0, x0 + x1), ++ z0 = svldnt1 (p0, x0 + x1)) ++ ++/* ++** ldnt1_s64_1: ++** ldnt1d z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_s64_1, svint64_t, int64_t, ++ z0 = svldnt1_s64 (p0, x0 + svcntd ()), ++ z0 = svldnt1 (p0, x0 + svcntd ())) ++ ++/* ++** ldnt1_s64_7: ++** ldnt1d z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_s64_7, svint64_t, int64_t, ++ z0 = svldnt1_s64 (p0, x0 + svcntd () * 7), ++ z0 = svldnt1 (p0, x0 + svcntd () * 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldnt1_s64_8: ++** incb x0, all, mul #8 ++** ldnt1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_s64_8, svint64_t, int64_t, ++ z0 = svldnt1_s64 (p0, x0 + svcntd () * 8), ++ z0 = svldnt1 (p0, x0 + svcntd () * 8)) ++ ++/* ++** ldnt1_s64_m1: ++** ldnt1d z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_s64_m1, svint64_t, int64_t, ++ z0 = svldnt1_s64 (p0, x0 - svcntd ()), ++ z0 = svldnt1 (p0, x0 - svcntd ())) ++ ++/* ++** ldnt1_s64_m8: ++** ldnt1d z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_s64_m8, svint64_t, int64_t, ++ z0 = svldnt1_s64 (p0, x0 - svcntd () * 8), ++ z0 = svldnt1 (p0, x0 - svcntd () * 8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldnt1_s64_m9: ++** decb x0, all, mul #9 ++** ldnt1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_s64_m9, svint64_t, int64_t, ++ z0 = svldnt1_s64 (p0, x0 - svcntd () * 9), ++ z0 = svldnt1 (p0, x0 - svcntd () * 9)) ++ ++/* ++** ldnt1_vnum_s64_0: ++** ldnt1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_s64_0, svint64_t, int64_t, ++ z0 = svldnt1_vnum_s64 (p0, x0, 0), ++ z0 = svldnt1_vnum (p0, x0, 0)) ++ ++/* ++** ldnt1_vnum_s64_1: ++** ldnt1d z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_s64_1, svint64_t, int64_t, ++ z0 = svldnt1_vnum_s64 (p0, x0, 1), ++ z0 = svldnt1_vnum (p0, x0, 1)) ++ ++/* ++** ldnt1_vnum_s64_7: ++** ldnt1d z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_s64_7, svint64_t, int64_t, ++ z0 = svldnt1_vnum_s64 (p0, x0, 7), ++ z0 = svldnt1_vnum (p0, x0, 7)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ldnt1_vnum_s64_8: ++** incb x0, all, mul #8 ++** ldnt1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_s64_8, svint64_t, int64_t, ++ z0 = svldnt1_vnum_s64 (p0, x0, 8), ++ z0 = svldnt1_vnum (p0, x0, 8)) ++ ++/* ++** ldnt1_vnum_s64_m1: ++** ldnt1d z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_s64_m1, svint64_t, int64_t, ++ z0 = svldnt1_vnum_s64 (p0, x0, -1), ++ z0 = svldnt1_vnum (p0, x0, -1)) ++ ++/* ++** ldnt1_vnum_s64_m8: ++** ldnt1d z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_s64_m8, svint64_t, int64_t, ++ z0 = svldnt1_vnum_s64 (p0, x0, -8), ++ z0 = svldnt1_vnum (p0, x0, -8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldnt1_vnum_s64_m9: ++** decb x0, all, mul #9 ++** ldnt1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_s64_m9, svint64_t, int64_t, ++ z0 = svldnt1_vnum_s64 (p0, x0, -9), ++ z0 = svldnt1_vnum (p0, x0, -9)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ldnt1_vnum_s64_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ldnt1d z0\.d, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_s64_x1, svint64_t, int64_t, ++ z0 = svldnt1_vnum_s64 (p0, x0, x1), ++ z0 = svldnt1_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_s8.c +new file mode 100644 +index 000000000..eb8e2e548 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_s8.c +@@ -0,0 +1,162 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldnt1_s8_base: ++** ldnt1b z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_s8_base, svint8_t, int8_t, ++ z0 = svldnt1_s8 (p0, x0), ++ z0 = svldnt1 (p0, x0)) ++ ++/* ++** ldnt1_s8_index: ++** ldnt1b z0\.b, p0/z, \[x0, x1\] ++** ret ++*/ ++TEST_LOAD (ldnt1_s8_index, svint8_t, int8_t, ++ z0 = svldnt1_s8 (p0, x0 + x1), ++ z0 = svldnt1 (p0, x0 + x1)) ++ ++/* ++** ldnt1_s8_1: ++** ldnt1b z0\.b, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_s8_1, svint8_t, int8_t, ++ z0 = svldnt1_s8 (p0, x0 + svcntb ()), ++ z0 = svldnt1 (p0, x0 + svcntb ())) ++ ++/* ++** ldnt1_s8_7: ++** ldnt1b z0\.b, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_s8_7, svint8_t, int8_t, ++ z0 = svldnt1_s8 (p0, x0 + svcntb () * 7), ++ z0 = svldnt1 (p0, x0 + svcntb () * 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldnt1_s8_8: ++** incb x0, all, mul #8 ++** ldnt1b z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_s8_8, svint8_t, int8_t, ++ z0 = svldnt1_s8 (p0, x0 + svcntb () * 8), ++ z0 = svldnt1 (p0, x0 + svcntb () * 8)) ++ ++/* ++** ldnt1_s8_m1: ++** ldnt1b z0\.b, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_s8_m1, svint8_t, int8_t, ++ z0 = svldnt1_s8 (p0, x0 - svcntb ()), ++ z0 = svldnt1 (p0, x0 - svcntb ())) ++ ++/* ++** ldnt1_s8_m8: ++** ldnt1b z0\.b, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_s8_m8, svint8_t, int8_t, ++ z0 = svldnt1_s8 (p0, x0 - svcntb () * 8), ++ z0 = svldnt1 (p0, x0 - svcntb () * 8)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ldnt1_s8_m9: ++** decb x0, all, mul #9 ++** ldnt1b z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_s8_m9, svint8_t, int8_t, ++ z0 = svldnt1_s8 (p0, x0 - svcntb () * 9), ++ z0 = svldnt1 (p0, x0 - svcntb () * 9)) ++ ++/* ++** ldnt1_vnum_s8_0: ++** ldnt1b z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_s8_0, svint8_t, int8_t, ++ z0 = svldnt1_vnum_s8 (p0, x0, 0), ++ z0 = svldnt1_vnum (p0, x0, 0)) ++ ++/* ++** ldnt1_vnum_s8_1: ++** ldnt1b z0\.b, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_s8_1, svint8_t, int8_t, ++ z0 = svldnt1_vnum_s8 (p0, x0, 1), ++ z0 = svldnt1_vnum (p0, x0, 1)) ++ ++/* ++** ldnt1_vnum_s8_7: ++** ldnt1b z0\.b, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_s8_7, svint8_t, int8_t, ++ z0 = svldnt1_vnum_s8 (p0, x0, 7), ++ z0 = svldnt1_vnum (p0, x0, 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldnt1_vnum_s8_8: ++** incb x0, all, mul #8 ++** ldnt1b z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_s8_8, svint8_t, int8_t, ++ z0 = svldnt1_vnum_s8 (p0, x0, 8), ++ z0 = svldnt1_vnum (p0, x0, 8)) ++ ++/* ++** ldnt1_vnum_s8_m1: ++** ldnt1b z0\.b, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_s8_m1, svint8_t, int8_t, ++ z0 = svldnt1_vnum_s8 (p0, x0, -1), ++ z0 = svldnt1_vnum (p0, x0, -1)) ++ ++/* ++** ldnt1_vnum_s8_m8: ++** ldnt1b z0\.b, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_s8_m8, svint8_t, int8_t, ++ z0 = svldnt1_vnum_s8 (p0, x0, -8), ++ z0 = svldnt1_vnum (p0, x0, -8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldnt1_vnum_s8_m9: ++** decb x0, all, mul #9 ++** ldnt1b z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_s8_m9, svint8_t, int8_t, ++ z0 = svldnt1_vnum_s8 (p0, x0, -9), ++ z0 = svldnt1_vnum (p0, x0, -9)) ++ ++/* ++** ldnt1_vnum_s8_x1: ++** cntb (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldnt1b z0\.b, p0/z, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** ldnt1b z0\.b, p0/z, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_s8_x1, svint8_t, int8_t, ++ z0 = svldnt1_vnum_s8 (p0, x0, x1), ++ z0 = svldnt1_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_u16.c +new file mode 100644 +index 000000000..c032c3d93 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_u16.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldnt1_u16_base: ++** ldnt1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_u16_base, svuint16_t, uint16_t, ++ z0 = svldnt1_u16 (p0, x0), ++ z0 = svldnt1 (p0, x0)) ++ ++/* ++** ldnt1_u16_index: ++** ldnt1h z0\.h, p0/z, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_LOAD (ldnt1_u16_index, svuint16_t, uint16_t, ++ z0 = svldnt1_u16 (p0, x0 + x1), ++ z0 = svldnt1 (p0, x0 + x1)) ++ ++/* ++** ldnt1_u16_1: ++** ldnt1h z0\.h, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_u16_1, svuint16_t, uint16_t, ++ z0 = svldnt1_u16 (p0, x0 + svcnth ()), ++ z0 = svldnt1 (p0, x0 + svcnth ())) ++ ++/* ++** ldnt1_u16_7: ++** ldnt1h z0\.h, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_u16_7, svuint16_t, uint16_t, ++ z0 = svldnt1_u16 (p0, x0 + svcnth () * 7), ++ z0 = svldnt1 (p0, x0 + svcnth () * 7)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ldnt1_u16_8: ++** incb x0, all, mul #8 ++** ldnt1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_u16_8, svuint16_t, uint16_t, ++ z0 = svldnt1_u16 (p0, x0 + svcnth () * 8), ++ z0 = svldnt1 (p0, x0 + svcnth () * 8)) ++ ++/* ++** ldnt1_u16_m1: ++** ldnt1h z0\.h, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_u16_m1, svuint16_t, uint16_t, ++ z0 = svldnt1_u16 (p0, x0 - svcnth ()), ++ z0 = svldnt1 (p0, x0 - svcnth ())) ++ ++/* ++** ldnt1_u16_m8: ++** ldnt1h z0\.h, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_u16_m8, svuint16_t, uint16_t, ++ z0 = svldnt1_u16 (p0, x0 - svcnth () * 8), ++ z0 = svldnt1 (p0, x0 - svcnth () * 8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldnt1_u16_m9: ++** decb x0, all, mul #9 ++** ldnt1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_u16_m9, svuint16_t, uint16_t, ++ z0 = svldnt1_u16 (p0, x0 - svcnth () * 9), ++ z0 = svldnt1 (p0, x0 - svcnth () * 9)) ++ ++/* ++** ldnt1_vnum_u16_0: ++** ldnt1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_u16_0, svuint16_t, uint16_t, ++ z0 = svldnt1_vnum_u16 (p0, x0, 0), ++ z0 = svldnt1_vnum (p0, x0, 0)) ++ ++/* ++** ldnt1_vnum_u16_1: ++** ldnt1h z0\.h, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_u16_1, svuint16_t, uint16_t, ++ z0 = svldnt1_vnum_u16 (p0, x0, 1), ++ z0 = svldnt1_vnum (p0, x0, 1)) ++ ++/* ++** ldnt1_vnum_u16_7: ++** ldnt1h z0\.h, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_u16_7, svuint16_t, uint16_t, ++ z0 = svldnt1_vnum_u16 (p0, x0, 7), ++ z0 = svldnt1_vnum (p0, x0, 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldnt1_vnum_u16_8: ++** incb x0, all, mul #8 ++** ldnt1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_u16_8, svuint16_t, uint16_t, ++ z0 = svldnt1_vnum_u16 (p0, x0, 8), ++ z0 = svldnt1_vnum (p0, x0, 8)) ++ ++/* ++** ldnt1_vnum_u16_m1: ++** ldnt1h z0\.h, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_u16_m1, svuint16_t, uint16_t, ++ z0 = svldnt1_vnum_u16 (p0, x0, -1), ++ z0 = svldnt1_vnum (p0, x0, -1)) ++ ++/* ++** ldnt1_vnum_u16_m8: ++** ldnt1h z0\.h, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_u16_m8, svuint16_t, uint16_t, ++ z0 = svldnt1_vnum_u16 (p0, x0, -8), ++ z0 = svldnt1_vnum (p0, x0, -8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldnt1_vnum_u16_m9: ++** decb x0, all, mul #9 ++** ldnt1h z0\.h, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_u16_m9, svuint16_t, uint16_t, ++ z0 = svldnt1_vnum_u16 (p0, x0, -9), ++ z0 = svldnt1_vnum (p0, x0, -9)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ldnt1_vnum_u16_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ldnt1h z0\.h, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_u16_x1, svuint16_t, uint16_t, ++ z0 = svldnt1_vnum_u16 (p0, x0, x1), ++ z0 = svldnt1_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_u32.c +new file mode 100644 +index 000000000..278794459 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_u32.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldnt1_u32_base: ++** ldnt1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_u32_base, svuint32_t, uint32_t, ++ z0 = svldnt1_u32 (p0, x0), ++ z0 = svldnt1 (p0, x0)) ++ ++/* ++** ldnt1_u32_index: ++** ldnt1w z0\.s, p0/z, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_LOAD (ldnt1_u32_index, svuint32_t, uint32_t, ++ z0 = svldnt1_u32 (p0, x0 + x1), ++ z0 = svldnt1 (p0, x0 + x1)) ++ ++/* ++** ldnt1_u32_1: ++** ldnt1w z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_u32_1, svuint32_t, uint32_t, ++ z0 = svldnt1_u32 (p0, x0 + svcntw ()), ++ z0 = svldnt1 (p0, x0 + svcntw ())) ++ ++/* ++** ldnt1_u32_7: ++** ldnt1w z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_u32_7, svuint32_t, uint32_t, ++ z0 = svldnt1_u32 (p0, x0 + svcntw () * 7), ++ z0 = svldnt1 (p0, x0 + svcntw () * 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldnt1_u32_8: ++** incb x0, all, mul #8 ++** ldnt1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_u32_8, svuint32_t, uint32_t, ++ z0 = svldnt1_u32 (p0, x0 + svcntw () * 8), ++ z0 = svldnt1 (p0, x0 + svcntw () * 8)) ++ ++/* ++** ldnt1_u32_m1: ++** ldnt1w z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_u32_m1, svuint32_t, uint32_t, ++ z0 = svldnt1_u32 (p0, x0 - svcntw ()), ++ z0 = svldnt1 (p0, x0 - svcntw ())) ++ ++/* ++** ldnt1_u32_m8: ++** ldnt1w z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_u32_m8, svuint32_t, uint32_t, ++ z0 = svldnt1_u32 (p0, x0 - svcntw () * 8), ++ z0 = svldnt1 (p0, x0 - svcntw () * 8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldnt1_u32_m9: ++** decb x0, all, mul #9 ++** ldnt1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_u32_m9, svuint32_t, uint32_t, ++ z0 = svldnt1_u32 (p0, x0 - svcntw () * 9), ++ z0 = svldnt1 (p0, x0 - svcntw () * 9)) ++ ++/* ++** ldnt1_vnum_u32_0: ++** ldnt1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_u32_0, svuint32_t, uint32_t, ++ z0 = svldnt1_vnum_u32 (p0, x0, 0), ++ z0 = svldnt1_vnum (p0, x0, 0)) ++ ++/* ++** ldnt1_vnum_u32_1: ++** ldnt1w z0\.s, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_u32_1, svuint32_t, uint32_t, ++ z0 = svldnt1_vnum_u32 (p0, x0, 1), ++ z0 = svldnt1_vnum (p0, x0, 1)) ++ ++/* ++** ldnt1_vnum_u32_7: ++** ldnt1w z0\.s, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_u32_7, svuint32_t, uint32_t, ++ z0 = svldnt1_vnum_u32 (p0, x0, 7), ++ z0 = svldnt1_vnum (p0, x0, 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldnt1_vnum_u32_8: ++** incb x0, all, mul #8 ++** ldnt1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_u32_8, svuint32_t, uint32_t, ++ z0 = svldnt1_vnum_u32 (p0, x0, 8), ++ z0 = svldnt1_vnum (p0, x0, 8)) ++ ++/* ++** ldnt1_vnum_u32_m1: ++** ldnt1w z0\.s, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_u32_m1, svuint32_t, uint32_t, ++ z0 = svldnt1_vnum_u32 (p0, x0, -1), ++ z0 = svldnt1_vnum (p0, x0, -1)) ++ ++/* ++** ldnt1_vnum_u32_m8: ++** ldnt1w z0\.s, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_u32_m8, svuint32_t, uint32_t, ++ z0 = svldnt1_vnum_u32 (p0, x0, -8), ++ z0 = svldnt1_vnum (p0, x0, -8)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ldnt1_vnum_u32_m9: ++** decb x0, all, mul #9 ++** ldnt1w z0\.s, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_u32_m9, svuint32_t, uint32_t, ++ z0 = svldnt1_vnum_u32 (p0, x0, -9), ++ z0 = svldnt1_vnum (p0, x0, -9)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ldnt1_vnum_u32_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ldnt1w z0\.s, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_u32_x1, svuint32_t, uint32_t, ++ z0 = svldnt1_vnum_u32 (p0, x0, x1), ++ z0 = svldnt1_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_u64.c +new file mode 100644 +index 000000000..abafee6f7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_u64.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldnt1_u64_base: ++** ldnt1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_u64_base, svuint64_t, uint64_t, ++ z0 = svldnt1_u64 (p0, x0), ++ z0 = svldnt1 (p0, x0)) ++ ++/* ++** ldnt1_u64_index: ++** ldnt1d z0\.d, p0/z, \[x0, x1, lsl 3\] ++** ret ++*/ ++TEST_LOAD (ldnt1_u64_index, svuint64_t, uint64_t, ++ z0 = svldnt1_u64 (p0, x0 + x1), ++ z0 = svldnt1 (p0, x0 + x1)) ++ ++/* ++** ldnt1_u64_1: ++** ldnt1d z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_u64_1, svuint64_t, uint64_t, ++ z0 = svldnt1_u64 (p0, x0 + svcntd ()), ++ z0 = svldnt1 (p0, x0 + svcntd ())) ++ ++/* ++** ldnt1_u64_7: ++** ldnt1d z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_u64_7, svuint64_t, uint64_t, ++ z0 = svldnt1_u64 (p0, x0 + svcntd () * 7), ++ z0 = svldnt1 (p0, x0 + svcntd () * 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldnt1_u64_8: ++** incb x0, all, mul #8 ++** ldnt1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_u64_8, svuint64_t, uint64_t, ++ z0 = svldnt1_u64 (p0, x0 + svcntd () * 8), ++ z0 = svldnt1 (p0, x0 + svcntd () * 8)) ++ ++/* ++** ldnt1_u64_m1: ++** ldnt1d z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_u64_m1, svuint64_t, uint64_t, ++ z0 = svldnt1_u64 (p0, x0 - svcntd ()), ++ z0 = svldnt1 (p0, x0 - svcntd ())) ++ ++/* ++** ldnt1_u64_m8: ++** ldnt1d z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_u64_m8, svuint64_t, uint64_t, ++ z0 = svldnt1_u64 (p0, x0 - svcntd () * 8), ++ z0 = svldnt1 (p0, x0 - svcntd () * 8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldnt1_u64_m9: ++** decb x0, all, mul #9 ++** ldnt1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_u64_m9, svuint64_t, uint64_t, ++ z0 = svldnt1_u64 (p0, x0 - svcntd () * 9), ++ z0 = svldnt1 (p0, x0 - svcntd () * 9)) ++ ++/* ++** ldnt1_vnum_u64_0: ++** ldnt1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_u64_0, svuint64_t, uint64_t, ++ z0 = svldnt1_vnum_u64 (p0, x0, 0), ++ z0 = svldnt1_vnum (p0, x0, 0)) ++ ++/* ++** ldnt1_vnum_u64_1: ++** ldnt1d z0\.d, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_u64_1, svuint64_t, uint64_t, ++ z0 = svldnt1_vnum_u64 (p0, x0, 1), ++ z0 = svldnt1_vnum (p0, x0, 1)) ++ ++/* ++** ldnt1_vnum_u64_7: ++** ldnt1d z0\.d, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_u64_7, svuint64_t, uint64_t, ++ z0 = svldnt1_vnum_u64 (p0, x0, 7), ++ z0 = svldnt1_vnum (p0, x0, 7)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ldnt1_vnum_u64_8: ++** incb x0, all, mul #8 ++** ldnt1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_u64_8, svuint64_t, uint64_t, ++ z0 = svldnt1_vnum_u64 (p0, x0, 8), ++ z0 = svldnt1_vnum (p0, x0, 8)) ++ ++/* ++** ldnt1_vnum_u64_m1: ++** ldnt1d z0\.d, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_u64_m1, svuint64_t, uint64_t, ++ z0 = svldnt1_vnum_u64 (p0, x0, -1), ++ z0 = svldnt1_vnum (p0, x0, -1)) ++ ++/* ++** ldnt1_vnum_u64_m8: ++** ldnt1d z0\.d, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_u64_m8, svuint64_t, uint64_t, ++ z0 = svldnt1_vnum_u64 (p0, x0, -8), ++ z0 = svldnt1_vnum (p0, x0, -8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldnt1_vnum_u64_m9: ++** decb x0, all, mul #9 ++** ldnt1d z0\.d, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_u64_m9, svuint64_t, uint64_t, ++ z0 = svldnt1_vnum_u64 (p0, x0, -9), ++ z0 = svldnt1_vnum (p0, x0, -9)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** ldnt1_vnum_u64_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** ldnt1d z0\.d, p0/z, \[\2\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_u64_x1, svuint64_t, uint64_t, ++ z0 = svldnt1_vnum_u64 (p0, x0, x1), ++ z0 = svldnt1_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_u8.c +new file mode 100644 +index 000000000..7bf9acc26 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_u8.c +@@ -0,0 +1,162 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ldnt1_u8_base: ++** ldnt1b z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_u8_base, svuint8_t, uint8_t, ++ z0 = svldnt1_u8 (p0, x0), ++ z0 = svldnt1 (p0, x0)) ++ ++/* ++** ldnt1_u8_index: ++** ldnt1b z0\.b, p0/z, \[x0, x1\] ++** ret ++*/ ++TEST_LOAD (ldnt1_u8_index, svuint8_t, uint8_t, ++ z0 = svldnt1_u8 (p0, x0 + x1), ++ z0 = svldnt1 (p0, x0 + x1)) ++ ++/* ++** ldnt1_u8_1: ++** ldnt1b z0\.b, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_u8_1, svuint8_t, uint8_t, ++ z0 = svldnt1_u8 (p0, x0 + svcntb ()), ++ z0 = svldnt1 (p0, x0 + svcntb ())) ++ ++/* ++** ldnt1_u8_7: ++** ldnt1b z0\.b, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_u8_7, svuint8_t, uint8_t, ++ z0 = svldnt1_u8 (p0, x0 + svcntb () * 7), ++ z0 = svldnt1 (p0, x0 + svcntb () * 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldnt1_u8_8: ++** incb x0, all, mul #8 ++** ldnt1b z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_u8_8, svuint8_t, uint8_t, ++ z0 = svldnt1_u8 (p0, x0 + svcntb () * 8), ++ z0 = svldnt1 (p0, x0 + svcntb () * 8)) ++ ++/* ++** ldnt1_u8_m1: ++** ldnt1b z0\.b, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_u8_m1, svuint8_t, uint8_t, ++ z0 = svldnt1_u8 (p0, x0 - svcntb ()), ++ z0 = svldnt1 (p0, x0 - svcntb ())) ++ ++/* ++** ldnt1_u8_m8: ++** ldnt1b z0\.b, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_u8_m8, svuint8_t, uint8_t, ++ z0 = svldnt1_u8 (p0, x0 - svcntb () * 8), ++ z0 = svldnt1 (p0, x0 - svcntb () * 8)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** ldnt1_u8_m9: ++** decb x0, all, mul #9 ++** ldnt1b z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_u8_m9, svuint8_t, uint8_t, ++ z0 = svldnt1_u8 (p0, x0 - svcntb () * 9), ++ z0 = svldnt1 (p0, x0 - svcntb () * 9)) ++ ++/* ++** ldnt1_vnum_u8_0: ++** ldnt1b z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_u8_0, svuint8_t, uint8_t, ++ z0 = svldnt1_vnum_u8 (p0, x0, 0), ++ z0 = svldnt1_vnum (p0, x0, 0)) ++ ++/* ++** ldnt1_vnum_u8_1: ++** ldnt1b z0\.b, p0/z, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_u8_1, svuint8_t, uint8_t, ++ z0 = svldnt1_vnum_u8 (p0, x0, 1), ++ z0 = svldnt1_vnum (p0, x0, 1)) ++ ++/* ++** ldnt1_vnum_u8_7: ++** ldnt1b z0\.b, p0/z, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_u8_7, svuint8_t, uint8_t, ++ z0 = svldnt1_vnum_u8 (p0, x0, 7), ++ z0 = svldnt1_vnum (p0, x0, 7)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldnt1_vnum_u8_8: ++** incb x0, all, mul #8 ++** ldnt1b z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_u8_8, svuint8_t, uint8_t, ++ z0 = svldnt1_vnum_u8 (p0, x0, 8), ++ z0 = svldnt1_vnum (p0, x0, 8)) ++ ++/* ++** ldnt1_vnum_u8_m1: ++** ldnt1b z0\.b, p0/z, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_u8_m1, svuint8_t, uint8_t, ++ z0 = svldnt1_vnum_u8 (p0, x0, -1), ++ z0 = svldnt1_vnum (p0, x0, -1)) ++ ++/* ++** ldnt1_vnum_u8_m8: ++** ldnt1b z0\.b, p0/z, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_u8_m8, svuint8_t, uint8_t, ++ z0 = svldnt1_vnum_u8 (p0, x0, -8), ++ z0 = svldnt1_vnum (p0, x0, -8)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** ldnt1_vnum_u8_m9: ++** decb x0, all, mul #9 ++** ldnt1b z0\.b, p0/z, \[x0\] ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_u8_m9, svuint8_t, uint8_t, ++ z0 = svldnt1_vnum_u8 (p0, x0, -9), ++ z0 = svldnt1_vnum (p0, x0, -9)) ++ ++/* ++** ldnt1_vnum_u8_x1: ++** cntb (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** ldnt1b z0\.b, p0/z, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** ldnt1b z0\.b, p0/z, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_LOAD (ldnt1_vnum_u8_x1, svuint8_t, uint8_t, ++ z0 = svldnt1_vnum_u8 (p0, x0, x1), ++ z0 = svldnt1_vnum (p0, x0, x1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_bf16.c +new file mode 100644 +index 000000000..cd91ff48d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_bf16.c +@@ -0,0 +1,12 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** len_x0_bf16: ++** cnth x0 ++** ret ++*/ ++TEST_REDUCTION_X (len_x0_bf16, uint64_t, svbfloat16_t, ++ x0 = svlen_bf16 (z0), ++ x0 = svlen (z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_f16.c +new file mode 100644 +index 000000000..aa6d94bbc +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_f16.c +@@ -0,0 +1,12 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** len_x0_f16: ++** cnth x0 ++** ret ++*/ ++TEST_REDUCTION_X (len_x0_f16, uint64_t, svfloat16_t, ++ x0 = svlen_f16 (z0), ++ x0 = svlen (z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_f32.c +new file mode 100644 +index 000000000..1dd50cee0 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_f32.c +@@ -0,0 +1,12 @@ ++/* { 
dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** len_x0_f32: ++** cntw x0 ++** ret ++*/ ++TEST_REDUCTION_X (len_x0_f32, uint64_t, svfloat32_t, ++ x0 = svlen_f32 (z0), ++ x0 = svlen (z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_f64.c +new file mode 100644 +index 000000000..1f210653e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_f64.c +@@ -0,0 +1,12 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** len_x0_f64: ++** cntd x0 ++** ret ++*/ ++TEST_REDUCTION_X (len_x0_f64, uint64_t, svfloat64_t, ++ x0 = svlen_f64 (z0), ++ x0 = svlen (z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_s16.c +new file mode 100644 +index 000000000..f56796182 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_s16.c +@@ -0,0 +1,12 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** len_x0_s16: ++** cnth x0 ++** ret ++*/ ++TEST_REDUCTION_X (len_x0_s16, uint64_t, svint16_t, ++ x0 = svlen_s16 (z0), ++ x0 = svlen (z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_s32.c +new file mode 100644 +index 000000000..662fac177 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_s32.c +@@ -0,0 +1,12 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** len_x0_s32: ++** cntw x0 ++** ret ++*/ ++TEST_REDUCTION_X (len_x0_s32, uint64_t, svint32_t, ++ x0 = svlen_s32 (z0), ++ x0 = svlen (z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_s64.c +new file mode 100644 +index 000000000..f95770302 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_s64.c +@@ -0,0 +1,12 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** len_x0_s64: ++** cntd x0 ++** ret ++*/ ++TEST_REDUCTION_X (len_x0_s64, uint64_t, svint64_t, ++ x0 = svlen_s64 (z0), ++ x0 = svlen (z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_s8.c +new file mode 100644 +index 000000000..6ed8a7177 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_s8.c +@@ -0,0 +1,12 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** len_x0_s8: ++** cntb x0 ++** ret ++*/ ++TEST_REDUCTION_X (len_x0_s8, uint64_t, svint8_t, ++ x0 = svlen_s8 (z0), ++ x0 = svlen (z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_u16.c +new file mode 100644 +index 000000000..13692c927 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_u16.c +@@ -0,0 +1,12 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** len_x0_u16: ++** cnth x0 ++** ret ++*/ ++TEST_REDUCTION_X (len_x0_u16, uint64_t, svuint16_t, ++ x0 = svlen_u16 (z0), ++ x0 = svlen (z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_u32.c +new file mode 100644 +index 
000000000..b03146089 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_u32.c +@@ -0,0 +1,12 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** len_x0_u32: ++** cntw x0 ++** ret ++*/ ++TEST_REDUCTION_X (len_x0_u32, uint64_t, svuint32_t, ++ x0 = svlen_u32 (z0), ++ x0 = svlen (z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_u64.c +new file mode 100644 +index 000000000..11f2e4b81 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_u64.c +@@ -0,0 +1,12 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** len_x0_u64: ++** cntd x0 ++** ret ++*/ ++TEST_REDUCTION_X (len_x0_u64, uint64_t, svuint64_t, ++ x0 = svlen_u64 (z0), ++ x0 = svlen (z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_u8.c +new file mode 100644 +index 000000000..fbd39a432 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_u8.c +@@ -0,0 +1,12 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** len_x0_u8: ++** cntb x0 ++** ret ++*/ ++TEST_REDUCTION_X (len_x0_u8, uint64_t, svuint8_t, ++ x0 = svlen_u8 (z0), ++ x0 = svlen (z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_s16.c +new file mode 100644 +index 000000000..edaaca5f1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_s16.c +@@ -0,0 +1,351 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** lsl_s16_m_tied1: ++** lsl z0\.h, p0/m, z0\.h, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (lsl_s16_m_tied1, svint16_t, svuint16_t, ++ z0 = svlsl_s16_m (p0, z0, z4), ++ z0 = svlsl_m (p0, z0, z4)) ++ ++/* ++** lsl_s16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** lsl z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (lsl_s16_m_tied2, svint16_t, svuint16_t, ++ z0_res = svlsl_s16_m (p0, z4, z0), ++ z0_res = svlsl_m (p0, z4, z0)) ++ ++/* ++** lsl_s16_m_untied: ++** movprfx z0, z1 ++** lsl z0\.h, p0/m, z0\.h, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (lsl_s16_m_untied, svint16_t, svuint16_t, ++ z0 = svlsl_s16_m (p0, z1, z4), ++ z0 = svlsl_m (p0, z1, z4)) ++ ++/* ++** lsl_w0_s16_m_tied1: ++** mov (z[0-9]+\.h), w0 ++** lsl z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_w0_s16_m_tied1, svint16_t, uint16_t, ++ z0 = svlsl_n_s16_m (p0, z0, x0), ++ z0 = svlsl_m (p0, z0, x0)) ++ ++/* ++** lsl_w0_s16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), w0 ++** movprfx z0, z1 ++** lsl z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_w0_s16_m_untied, svint16_t, uint16_t, ++ z0 = svlsl_n_s16_m (p0, z1, x0), ++ z0 = svlsl_m (p0, z1, x0)) ++ ++/* ++** lsl_1_s16_m_tied1: ++** lsl z0\.h, p0/m, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_1_s16_m_tied1, svint16_t, ++ z0 = svlsl_n_s16_m (p0, z0, 1), ++ z0 = svlsl_m (p0, z0, 1)) ++ ++/* ++** lsl_1_s16_m_untied: ++** movprfx z0, z1 ++** lsl z0\.h, p0/m, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_1_s16_m_untied, svint16_t, ++ z0 = svlsl_n_s16_m (p0, z1, 1), ++ z0 = svlsl_m (p0, z1, 1)) ++ ++/* ++** lsl_15_s16_m_tied1: ++** lsl z0\.h, p0/m, z0\.h, #15 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_15_s16_m_tied1, svint16_t, ++ z0 = svlsl_n_s16_m (p0, z0, 15), ++ z0 = svlsl_m (p0, 
z0, 15)) ++ ++/* ++** lsl_15_s16_m_untied: ++** movprfx z0, z1 ++** lsl z0\.h, p0/m, z0\.h, #15 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_15_s16_m_untied, svint16_t, ++ z0 = svlsl_n_s16_m (p0, z1, 15), ++ z0 = svlsl_m (p0, z1, 15)) ++ ++/* ++** lsl_16_s16_m_tied1: ++** mov (z[0-9]+\.h), #16 ++** lsl z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_16_s16_m_tied1, svint16_t, ++ z0 = svlsl_n_s16_m (p0, z0, 16), ++ z0 = svlsl_m (p0, z0, 16)) ++ ++/* ++** lsl_16_s16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), #16 ++** movprfx z0, z1 ++** lsl z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_16_s16_m_untied, svint16_t, ++ z0 = svlsl_n_s16_m (p0, z1, 16), ++ z0 = svlsl_m (p0, z1, 16)) ++ ++/* ++** lsl_s16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** lsl z0\.h, p0/m, z0\.h, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (lsl_s16_z_tied1, svint16_t, svuint16_t, ++ z0 = svlsl_s16_z (p0, z0, z4), ++ z0 = svlsl_z (p0, z0, z4)) ++ ++/* ++** lsl_s16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** lslr z0\.h, p0/m, z0\.h, z4\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (lsl_s16_z_tied2, svint16_t, svuint16_t, ++ z0_res = svlsl_s16_z (p0, z4, z0), ++ z0_res = svlsl_z (p0, z4, z0)) ++ ++/* ++** lsl_s16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** lsl z0\.h, p0/m, z0\.h, z4\.h ++** | ++** movprfx z0\.h, p0/z, z4\.h ++** lslr z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_DUAL_Z (lsl_s16_z_untied, svint16_t, svuint16_t, ++ z0 = svlsl_s16_z (p0, z1, z4), ++ z0 = svlsl_z (p0, z1, z4)) ++ ++/* ++** lsl_w0_s16_z_tied1: ++** mov (z[0-9]+\.h), w0 ++** movprfx z0\.h, p0/z, z0\.h ++** lsl z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_w0_s16_z_tied1, svint16_t, uint16_t, ++ z0 = svlsl_n_s16_z (p0, z0, x0), ++ z0 = svlsl_z (p0, z0, x0)) ++ ++/* ++** lsl_w0_s16_z_untied: ++** mov (z[0-9]+\.h), w0 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** lsl z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** lslr z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_w0_s16_z_untied, svint16_t, uint16_t, ++ z0 = svlsl_n_s16_z (p0, z1, x0), ++ z0 = svlsl_z (p0, z1, x0)) ++ ++/* ++** lsl_1_s16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** lsl z0\.h, p0/m, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_1_s16_z_tied1, svint16_t, ++ z0 = svlsl_n_s16_z (p0, z0, 1), ++ z0 = svlsl_z (p0, z0, 1)) ++ ++/* ++** lsl_1_s16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** lsl z0\.h, p0/m, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_1_s16_z_untied, svint16_t, ++ z0 = svlsl_n_s16_z (p0, z1, 1), ++ z0 = svlsl_z (p0, z1, 1)) ++ ++/* ++** lsl_15_s16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** lsl z0\.h, p0/m, z0\.h, #15 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_15_s16_z_tied1, svint16_t, ++ z0 = svlsl_n_s16_z (p0, z0, 15), ++ z0 = svlsl_z (p0, z0, 15)) ++ ++/* ++** lsl_15_s16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** lsl z0\.h, p0/m, z0\.h, #15 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_15_s16_z_untied, svint16_t, ++ z0 = svlsl_n_s16_z (p0, z1, 15), ++ z0 = svlsl_z (p0, z1, 15)) ++ ++/* ++** lsl_16_s16_z_tied1: ++** mov (z[0-9]+\.h), #16 ++** movprfx z0\.h, p0/z, z0\.h ++** lsl z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_16_s16_z_tied1, svint16_t, ++ z0 = svlsl_n_s16_z (p0, z0, 16), ++ z0 = svlsl_z (p0, z0, 16)) ++ ++/* ++** lsl_16_s16_z_untied: ++** mov (z[0-9]+\.h), #16 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** lsl z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** lslr z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_16_s16_z_untied, svint16_t, ++ z0 = 
svlsl_n_s16_z (p0, z1, 16), ++ z0 = svlsl_z (p0, z1, 16)) ++ ++/* ++** lsl_s16_x_tied1: ++** lsl z0\.h, p0/m, z0\.h, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (lsl_s16_x_tied1, svint16_t, svuint16_t, ++ z0 = svlsl_s16_x (p0, z0, z4), ++ z0 = svlsl_x (p0, z0, z4)) ++ ++/* ++** lsl_s16_x_tied2: ++** lslr z0\.h, p0/m, z0\.h, z4\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (lsl_s16_x_tied2, svint16_t, svuint16_t, ++ z0_res = svlsl_s16_x (p0, z4, z0), ++ z0_res = svlsl_x (p0, z4, z0)) ++ ++/* ++** lsl_s16_x_untied: ++** ( ++** movprfx z0, z1 ++** lsl z0\.h, p0/m, z0\.h, z4\.h ++** | ++** movprfx z0, z4 ++** lslr z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_DUAL_Z (lsl_s16_x_untied, svint16_t, svuint16_t, ++ z0 = svlsl_s16_x (p0, z1, z4), ++ z0 = svlsl_x (p0, z1, z4)) ++ ++/* ++** lsl_w0_s16_x_tied1: ++** mov (z[0-9]+\.h), w0 ++** lsl z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_w0_s16_x_tied1, svint16_t, uint16_t, ++ z0 = svlsl_n_s16_x (p0, z0, x0), ++ z0 = svlsl_x (p0, z0, x0)) ++ ++/* ++** lsl_w0_s16_x_untied: ++** mov z0\.h, w0 ++** lslr z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_w0_s16_x_untied, svint16_t, uint16_t, ++ z0 = svlsl_n_s16_x (p0, z1, x0), ++ z0 = svlsl_x (p0, z1, x0)) ++ ++/* ++** lsl_1_s16_x_tied1: ++** lsl z0\.h, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_1_s16_x_tied1, svint16_t, ++ z0 = svlsl_n_s16_x (p0, z0, 1), ++ z0 = svlsl_x (p0, z0, 1)) ++ ++/* ++** lsl_1_s16_x_untied: ++** lsl z0\.h, z1\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_1_s16_x_untied, svint16_t, ++ z0 = svlsl_n_s16_x (p0, z1, 1), ++ z0 = svlsl_x (p0, z1, 1)) ++ ++/* ++** lsl_15_s16_x_tied1: ++** lsl z0\.h, z0\.h, #15 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_15_s16_x_tied1, svint16_t, ++ z0 = svlsl_n_s16_x (p0, z0, 15), ++ z0 = svlsl_x (p0, z0, 15)) ++ ++/* ++** lsl_15_s16_x_untied: ++** lsl z0\.h, z1\.h, #15 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_15_s16_x_untied, svint16_t, ++ z0 = svlsl_n_s16_x (p0, z1, 15), ++ z0 = svlsl_x (p0, z1, 15)) ++ ++/* ++** lsl_16_s16_x_tied1: ++** mov (z[0-9]+\.h), #16 ++** lsl z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_16_s16_x_tied1, svint16_t, ++ z0 = svlsl_n_s16_x (p0, z0, 16), ++ z0 = svlsl_x (p0, z0, 16)) ++ ++/* ++** lsl_16_s16_x_untied: ++** mov z0\.h, #16 ++** lslr z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_16_s16_x_untied, svint16_t, ++ z0 = svlsl_n_s16_x (p0, z1, 16), ++ z0 = svlsl_x (p0, z1, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_s32.c +new file mode 100644 +index 000000000..f98f1f94b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_s32.c +@@ -0,0 +1,351 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** lsl_s32_m_tied1: ++** lsl z0\.s, p0/m, z0\.s, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (lsl_s32_m_tied1, svint32_t, svuint32_t, ++ z0 = svlsl_s32_m (p0, z0, z4), ++ z0 = svlsl_m (p0, z0, z4)) ++ ++/* ++** lsl_s32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** lsl z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (lsl_s32_m_tied2, svint32_t, svuint32_t, ++ z0_res = svlsl_s32_m (p0, z4, z0), ++ z0_res = svlsl_m (p0, z4, z0)) ++ ++/* ++** lsl_s32_m_untied: ++** movprfx z0, z1 ++** lsl z0\.s, p0/m, z0\.s, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (lsl_s32_m_untied, svint32_t, svuint32_t, ++ z0 = svlsl_s32_m (p0, z1, z4), ++ z0 = svlsl_m (p0, z1, z4)) ++ ++/* ++** lsl_w0_s32_m_tied1: ++** mov (z[0-9]+\.s), w0 ++** lsl z0\.s, p0/m, 
z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_w0_s32_m_tied1, svint32_t, uint32_t, ++ z0 = svlsl_n_s32_m (p0, z0, x0), ++ z0 = svlsl_m (p0, z0, x0)) ++ ++/* ++** lsl_w0_s32_m_untied: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0, z1 ++** lsl z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_w0_s32_m_untied, svint32_t, uint32_t, ++ z0 = svlsl_n_s32_m (p0, z1, x0), ++ z0 = svlsl_m (p0, z1, x0)) ++ ++/* ++** lsl_1_s32_m_tied1: ++** lsl z0\.s, p0/m, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_1_s32_m_tied1, svint32_t, ++ z0 = svlsl_n_s32_m (p0, z0, 1), ++ z0 = svlsl_m (p0, z0, 1)) ++ ++/* ++** lsl_1_s32_m_untied: ++** movprfx z0, z1 ++** lsl z0\.s, p0/m, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_1_s32_m_untied, svint32_t, ++ z0 = svlsl_n_s32_m (p0, z1, 1), ++ z0 = svlsl_m (p0, z1, 1)) ++ ++/* ++** lsl_31_s32_m_tied1: ++** lsl z0\.s, p0/m, z0\.s, #31 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_31_s32_m_tied1, svint32_t, ++ z0 = svlsl_n_s32_m (p0, z0, 31), ++ z0 = svlsl_m (p0, z0, 31)) ++ ++/* ++** lsl_31_s32_m_untied: ++** movprfx z0, z1 ++** lsl z0\.s, p0/m, z0\.s, #31 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_31_s32_m_untied, svint32_t, ++ z0 = svlsl_n_s32_m (p0, z1, 31), ++ z0 = svlsl_m (p0, z1, 31)) ++ ++/* ++** lsl_32_s32_m_tied1: ++** mov (z[0-9]+\.s), #32 ++** lsl z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_32_s32_m_tied1, svint32_t, ++ z0 = svlsl_n_s32_m (p0, z0, 32), ++ z0 = svlsl_m (p0, z0, 32)) ++ ++/* ++** lsl_32_s32_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.s), #32 ++** movprfx z0, z1 ++** lsl z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_32_s32_m_untied, svint32_t, ++ z0 = svlsl_n_s32_m (p0, z1, 32), ++ z0 = svlsl_m (p0, z1, 32)) ++ ++/* ++** lsl_s32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** lsl z0\.s, p0/m, z0\.s, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (lsl_s32_z_tied1, svint32_t, svuint32_t, ++ z0 = svlsl_s32_z (p0, z0, z4), ++ z0 = svlsl_z (p0, z0, z4)) ++ ++/* ++** lsl_s32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** lslr z0\.s, p0/m, z0\.s, z4\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (lsl_s32_z_tied2, svint32_t, svuint32_t, ++ z0_res = svlsl_s32_z (p0, z4, z0), ++ z0_res = svlsl_z (p0, z4, z0)) ++ ++/* ++** lsl_s32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** lsl z0\.s, p0/m, z0\.s, z4\.s ++** | ++** movprfx z0\.s, p0/z, z4\.s ++** lslr z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_DUAL_Z (lsl_s32_z_untied, svint32_t, svuint32_t, ++ z0 = svlsl_s32_z (p0, z1, z4), ++ z0 = svlsl_z (p0, z1, z4)) ++ ++/* ++** lsl_w0_s32_z_tied1: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0\.s, p0/z, z0\.s ++** lsl z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_w0_s32_z_tied1, svint32_t, uint32_t, ++ z0 = svlsl_n_s32_z (p0, z0, x0), ++ z0 = svlsl_z (p0, z0, x0)) ++ ++/* ++** lsl_w0_s32_z_untied: ++** mov (z[0-9]+\.s), w0 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** lsl z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** lslr z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_w0_s32_z_untied, svint32_t, uint32_t, ++ z0 = svlsl_n_s32_z (p0, z1, x0), ++ z0 = svlsl_z (p0, z1, x0)) ++ ++/* ++** lsl_1_s32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** lsl z0\.s, p0/m, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_1_s32_z_tied1, svint32_t, ++ z0 = svlsl_n_s32_z (p0, z0, 1), ++ z0 = svlsl_z (p0, z0, 1)) ++ ++/* ++** lsl_1_s32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** lsl z0\.s, p0/m, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_1_s32_z_untied, svint32_t, ++ z0 = svlsl_n_s32_z (p0, z1, 1), ++ z0 = svlsl_z (p0, z1, 1)) ++ 
++/* ++** lsl_31_s32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** lsl z0\.s, p0/m, z0\.s, #31 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_31_s32_z_tied1, svint32_t, ++ z0 = svlsl_n_s32_z (p0, z0, 31), ++ z0 = svlsl_z (p0, z0, 31)) ++ ++/* ++** lsl_31_s32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** lsl z0\.s, p0/m, z0\.s, #31 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_31_s32_z_untied, svint32_t, ++ z0 = svlsl_n_s32_z (p0, z1, 31), ++ z0 = svlsl_z (p0, z1, 31)) ++ ++/* ++** lsl_32_s32_z_tied1: ++** mov (z[0-9]+\.s), #32 ++** movprfx z0\.s, p0/z, z0\.s ++** lsl z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_32_s32_z_tied1, svint32_t, ++ z0 = svlsl_n_s32_z (p0, z0, 32), ++ z0 = svlsl_z (p0, z0, 32)) ++ ++/* ++** lsl_32_s32_z_untied: ++** mov (z[0-9]+\.s), #32 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** lsl z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** lslr z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_32_s32_z_untied, svint32_t, ++ z0 = svlsl_n_s32_z (p0, z1, 32), ++ z0 = svlsl_z (p0, z1, 32)) ++ ++/* ++** lsl_s32_x_tied1: ++** lsl z0\.s, p0/m, z0\.s, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (lsl_s32_x_tied1, svint32_t, svuint32_t, ++ z0 = svlsl_s32_x (p0, z0, z4), ++ z0 = svlsl_x (p0, z0, z4)) ++ ++/* ++** lsl_s32_x_tied2: ++** lslr z0\.s, p0/m, z0\.s, z4\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (lsl_s32_x_tied2, svint32_t, svuint32_t, ++ z0_res = svlsl_s32_x (p0, z4, z0), ++ z0_res = svlsl_x (p0, z4, z0)) ++ ++/* ++** lsl_s32_x_untied: ++** ( ++** movprfx z0, z1 ++** lsl z0\.s, p0/m, z0\.s, z4\.s ++** | ++** movprfx z0, z4 ++** lslr z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_DUAL_Z (lsl_s32_x_untied, svint32_t, svuint32_t, ++ z0 = svlsl_s32_x (p0, z1, z4), ++ z0 = svlsl_x (p0, z1, z4)) ++ ++/* ++** lsl_w0_s32_x_tied1: ++** mov (z[0-9]+\.s), w0 ++** lsl z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_w0_s32_x_tied1, svint32_t, uint32_t, ++ z0 = svlsl_n_s32_x (p0, z0, x0), ++ z0 = svlsl_x (p0, z0, x0)) ++ ++/* ++** lsl_w0_s32_x_untied: ++** mov z0\.s, w0 ++** lslr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_w0_s32_x_untied, svint32_t, uint32_t, ++ z0 = svlsl_n_s32_x (p0, z1, x0), ++ z0 = svlsl_x (p0, z1, x0)) ++ ++/* ++** lsl_1_s32_x_tied1: ++** lsl z0\.s, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_1_s32_x_tied1, svint32_t, ++ z0 = svlsl_n_s32_x (p0, z0, 1), ++ z0 = svlsl_x (p0, z0, 1)) ++ ++/* ++** lsl_1_s32_x_untied: ++** lsl z0\.s, z1\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_1_s32_x_untied, svint32_t, ++ z0 = svlsl_n_s32_x (p0, z1, 1), ++ z0 = svlsl_x (p0, z1, 1)) ++ ++/* ++** lsl_31_s32_x_tied1: ++** lsl z0\.s, z0\.s, #31 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_31_s32_x_tied1, svint32_t, ++ z0 = svlsl_n_s32_x (p0, z0, 31), ++ z0 = svlsl_x (p0, z0, 31)) ++ ++/* ++** lsl_31_s32_x_untied: ++** lsl z0\.s, z1\.s, #31 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_31_s32_x_untied, svint32_t, ++ z0 = svlsl_n_s32_x (p0, z1, 31), ++ z0 = svlsl_x (p0, z1, 31)) ++ ++/* ++** lsl_32_s32_x_tied1: ++** mov (z[0-9]+\.s), #32 ++** lsl z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_32_s32_x_tied1, svint32_t, ++ z0 = svlsl_n_s32_x (p0, z0, 32), ++ z0 = svlsl_x (p0, z0, 32)) ++ ++/* ++** lsl_32_s32_x_untied: ++** mov z0\.s, #32 ++** lslr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_32_s32_x_untied, svint32_t, ++ z0 = svlsl_n_s32_x (p0, z1, 32), ++ z0 = svlsl_x (p0, z1, 32)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_s64.c +new file mode 100644 
+index 000000000..39753986b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_s64.c +@@ -0,0 +1,351 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** lsl_s64_m_tied1: ++** lsl z0\.d, p0/m, z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsl_s64_m_tied1, svint64_t, svuint64_t, ++ z0 = svlsl_s64_m (p0, z0, z4), ++ z0 = svlsl_m (p0, z0, z4)) ++ ++/* ++** lsl_s64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z4 ++** lsl z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (lsl_s64_m_tied2, svint64_t, svuint64_t, ++ z0_res = svlsl_s64_m (p0, z4, z0), ++ z0_res = svlsl_m (p0, z4, z0)) ++ ++/* ++** lsl_s64_m_untied: ++** movprfx z0, z1 ++** lsl z0\.d, p0/m, z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsl_s64_m_untied, svint64_t, svuint64_t, ++ z0 = svlsl_s64_m (p0, z1, z4), ++ z0 = svlsl_m (p0, z1, z4)) ++ ++/* ++** lsl_x0_s64_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** lsl z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_x0_s64_m_tied1, svint64_t, uint64_t, ++ z0 = svlsl_n_s64_m (p0, z0, x0), ++ z0 = svlsl_m (p0, z0, x0)) ++ ++/* ++** lsl_x0_s64_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** lsl z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_x0_s64_m_untied, svint64_t, uint64_t, ++ z0 = svlsl_n_s64_m (p0, z1, x0), ++ z0 = svlsl_m (p0, z1, x0)) ++ ++/* ++** lsl_1_s64_m_tied1: ++** lsl z0\.d, p0/m, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_1_s64_m_tied1, svint64_t, ++ z0 = svlsl_n_s64_m (p0, z0, 1), ++ z0 = svlsl_m (p0, z0, 1)) ++ ++/* ++** lsl_1_s64_m_untied: ++** movprfx z0, z1 ++** lsl z0\.d, p0/m, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_1_s64_m_untied, svint64_t, ++ z0 = svlsl_n_s64_m (p0, z1, 1), ++ z0 = svlsl_m (p0, z1, 1)) ++ ++/* ++** lsl_63_s64_m_tied1: ++** lsl z0\.d, p0/m, z0\.d, #63 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_63_s64_m_tied1, svint64_t, ++ z0 = svlsl_n_s64_m (p0, z0, 63), ++ z0 = svlsl_m (p0, z0, 63)) ++ ++/* ++** lsl_63_s64_m_untied: ++** movprfx z0, z1 ++** lsl z0\.d, p0/m, z0\.d, #63 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_63_s64_m_untied, svint64_t, ++ z0 = svlsl_n_s64_m (p0, z1, 63), ++ z0 = svlsl_m (p0, z1, 63)) ++ ++/* ++** lsl_64_s64_m_tied1: ++** mov (z[0-9]+\.d), #64 ++** lsl z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_64_s64_m_tied1, svint64_t, ++ z0 = svlsl_n_s64_m (p0, z0, 64), ++ z0 = svlsl_m (p0, z0, 64)) ++ ++/* ++** lsl_64_s64_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #64 ++** movprfx z0, z1 ++** lsl z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_64_s64_m_untied, svint64_t, ++ z0 = svlsl_n_s64_m (p0, z1, 64), ++ z0 = svlsl_m (p0, z1, 64)) ++ ++/* ++** lsl_s64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** lsl z0\.d, p0/m, z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsl_s64_z_tied1, svint64_t, svuint64_t, ++ z0 = svlsl_s64_z (p0, z0, z4), ++ z0 = svlsl_z (p0, z0, z4)) ++ ++/* ++** lsl_s64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** lslr z0\.d, p0/m, z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z_REV (lsl_s64_z_tied2, svint64_t, svuint64_t, ++ z0_res = svlsl_s64_z (p0, z4, z0), ++ z0_res = svlsl_z (p0, z4, z0)) ++ ++/* ++** lsl_s64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** lsl z0\.d, p0/m, z0\.d, z4\.d ++** | ++** movprfx z0\.d, p0/z, z4\.d ++** lslr z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_DUAL_Z (lsl_s64_z_untied, svint64_t, svuint64_t, ++ z0 = svlsl_s64_z (p0, z1, z4), ++ z0 = svlsl_z (p0, z1, z4)) ++ ++/* ++** lsl_x0_s64_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, 
p0/z, z0\.d ++** lsl z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_x0_s64_z_tied1, svint64_t, uint64_t, ++ z0 = svlsl_n_s64_z (p0, z0, x0), ++ z0 = svlsl_z (p0, z0, x0)) ++ ++/* ++** lsl_x0_s64_z_untied: ++** mov (z[0-9]+\.d), x0 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** lsl z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** lslr z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_x0_s64_z_untied, svint64_t, uint64_t, ++ z0 = svlsl_n_s64_z (p0, z1, x0), ++ z0 = svlsl_z (p0, z1, x0)) ++ ++/* ++** lsl_1_s64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** lsl z0\.d, p0/m, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_1_s64_z_tied1, svint64_t, ++ z0 = svlsl_n_s64_z (p0, z0, 1), ++ z0 = svlsl_z (p0, z0, 1)) ++ ++/* ++** lsl_1_s64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** lsl z0\.d, p0/m, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_1_s64_z_untied, svint64_t, ++ z0 = svlsl_n_s64_z (p0, z1, 1), ++ z0 = svlsl_z (p0, z1, 1)) ++ ++/* ++** lsl_63_s64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** lsl z0\.d, p0/m, z0\.d, #63 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_63_s64_z_tied1, svint64_t, ++ z0 = svlsl_n_s64_z (p0, z0, 63), ++ z0 = svlsl_z (p0, z0, 63)) ++ ++/* ++** lsl_63_s64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** lsl z0\.d, p0/m, z0\.d, #63 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_63_s64_z_untied, svint64_t, ++ z0 = svlsl_n_s64_z (p0, z1, 63), ++ z0 = svlsl_z (p0, z1, 63)) ++ ++/* ++** lsl_64_s64_z_tied1: ++** mov (z[0-9]+\.d), #64 ++** movprfx z0\.d, p0/z, z0\.d ++** lsl z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_64_s64_z_tied1, svint64_t, ++ z0 = svlsl_n_s64_z (p0, z0, 64), ++ z0 = svlsl_z (p0, z0, 64)) ++ ++/* ++** lsl_64_s64_z_untied: ++** mov (z[0-9]+\.d), #64 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** lsl z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** lslr z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_64_s64_z_untied, svint64_t, ++ z0 = svlsl_n_s64_z (p0, z1, 64), ++ z0 = svlsl_z (p0, z1, 64)) ++ ++/* ++** lsl_s64_x_tied1: ++** lsl z0\.d, p0/m, z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsl_s64_x_tied1, svint64_t, svuint64_t, ++ z0 = svlsl_s64_x (p0, z0, z4), ++ z0 = svlsl_x (p0, z0, z4)) ++ ++/* ++** lsl_s64_x_tied2: ++** lslr z0\.d, p0/m, z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z_REV (lsl_s64_x_tied2, svint64_t, svuint64_t, ++ z0_res = svlsl_s64_x (p0, z4, z0), ++ z0_res = svlsl_x (p0, z4, z0)) ++ ++/* ++** lsl_s64_x_untied: ++** ( ++** movprfx z0, z1 ++** lsl z0\.d, p0/m, z0\.d, z4\.d ++** | ++** movprfx z0, z4 ++** lslr z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_DUAL_Z (lsl_s64_x_untied, svint64_t, svuint64_t, ++ z0 = svlsl_s64_x (p0, z1, z4), ++ z0 = svlsl_x (p0, z1, z4)) ++ ++/* ++** lsl_x0_s64_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** lsl z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_x0_s64_x_tied1, svint64_t, uint64_t, ++ z0 = svlsl_n_s64_x (p0, z0, x0), ++ z0 = svlsl_x (p0, z0, x0)) ++ ++/* ++** lsl_x0_s64_x_untied: ++** mov z0\.d, x0 ++** lslr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_x0_s64_x_untied, svint64_t, uint64_t, ++ z0 = svlsl_n_s64_x (p0, z1, x0), ++ z0 = svlsl_x (p0, z1, x0)) ++ ++/* ++** lsl_1_s64_x_tied1: ++** lsl z0\.d, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_1_s64_x_tied1, svint64_t, ++ z0 = svlsl_n_s64_x (p0, z0, 1), ++ z0 = svlsl_x (p0, z0, 1)) ++ ++/* ++** lsl_1_s64_x_untied: ++** lsl z0\.d, z1\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_1_s64_x_untied, svint64_t, ++ z0 = svlsl_n_s64_x (p0, z1, 1), ++ z0 = svlsl_x 
(p0, z1, 1)) ++ ++/* ++** lsl_63_s64_x_tied1: ++** lsl z0\.d, z0\.d, #63 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_63_s64_x_tied1, svint64_t, ++ z0 = svlsl_n_s64_x (p0, z0, 63), ++ z0 = svlsl_x (p0, z0, 63)) ++ ++/* ++** lsl_63_s64_x_untied: ++** lsl z0\.d, z1\.d, #63 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_63_s64_x_untied, svint64_t, ++ z0 = svlsl_n_s64_x (p0, z1, 63), ++ z0 = svlsl_x (p0, z1, 63)) ++ ++/* ++** lsl_64_s64_x_tied1: ++** mov (z[0-9]+\.d), #64 ++** lsl z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_64_s64_x_tied1, svint64_t, ++ z0 = svlsl_n_s64_x (p0, z0, 64), ++ z0 = svlsl_x (p0, z0, 64)) ++ ++/* ++** lsl_64_s64_x_untied: ++** mov z0\.d, #64 ++** lslr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_64_s64_x_untied, svint64_t, ++ z0 = svlsl_n_s64_x (p0, z1, 64), ++ z0 = svlsl_x (p0, z1, 64)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_s8.c +new file mode 100644 +index 000000000..9a9cc959c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_s8.c +@@ -0,0 +1,351 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** lsl_s8_m_tied1: ++** lsl z0\.b, p0/m, z0\.b, z4\.b ++** ret ++*/ ++TEST_DUAL_Z (lsl_s8_m_tied1, svint8_t, svuint8_t, ++ z0 = svlsl_s8_m (p0, z0, z4), ++ z0 = svlsl_m (p0, z0, z4)) ++ ++/* ++** lsl_s8_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** lsl z0\.b, p0/m, z0\.b, \1\.b ++** ret ++*/ ++TEST_DUAL_Z_REV (lsl_s8_m_tied2, svint8_t, svuint8_t, ++ z0_res = svlsl_s8_m (p0, z4, z0), ++ z0_res = svlsl_m (p0, z4, z0)) ++ ++/* ++** lsl_s8_m_untied: ++** movprfx z0, z1 ++** lsl z0\.b, p0/m, z0\.b, z4\.b ++** ret ++*/ ++TEST_DUAL_Z (lsl_s8_m_untied, svint8_t, svuint8_t, ++ z0 = svlsl_s8_m (p0, z1, z4), ++ z0 = svlsl_m (p0, z1, z4)) ++ ++/* ++** lsl_w0_s8_m_tied1: ++** mov (z[0-9]+\.b), w0 ++** lsl z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_w0_s8_m_tied1, svint8_t, uint8_t, ++ z0 = svlsl_n_s8_m (p0, z0, x0), ++ z0 = svlsl_m (p0, z0, x0)) ++ ++/* ++** lsl_w0_s8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), w0 ++** movprfx z0, z1 ++** lsl z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_w0_s8_m_untied, svint8_t, uint8_t, ++ z0 = svlsl_n_s8_m (p0, z1, x0), ++ z0 = svlsl_m (p0, z1, x0)) ++ ++/* ++** lsl_1_s8_m_tied1: ++** lsl z0\.b, p0/m, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_1_s8_m_tied1, svint8_t, ++ z0 = svlsl_n_s8_m (p0, z0, 1), ++ z0 = svlsl_m (p0, z0, 1)) ++ ++/* ++** lsl_1_s8_m_untied: ++** movprfx z0, z1 ++** lsl z0\.b, p0/m, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_1_s8_m_untied, svint8_t, ++ z0 = svlsl_n_s8_m (p0, z1, 1), ++ z0 = svlsl_m (p0, z1, 1)) ++ ++/* ++** lsl_7_s8_m_tied1: ++** lsl z0\.b, p0/m, z0\.b, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_7_s8_m_tied1, svint8_t, ++ z0 = svlsl_n_s8_m (p0, z0, 7), ++ z0 = svlsl_m (p0, z0, 7)) ++ ++/* ++** lsl_7_s8_m_untied: ++** movprfx z0, z1 ++** lsl z0\.b, p0/m, z0\.b, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_7_s8_m_untied, svint8_t, ++ z0 = svlsl_n_s8_m (p0, z1, 7), ++ z0 = svlsl_m (p0, z1, 7)) ++ ++/* ++** lsl_8_s8_m_tied1: ++** mov (z[0-9]+\.b), #8 ++** lsl z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_8_s8_m_tied1, svint8_t, ++ z0 = svlsl_n_s8_m (p0, z0, 8), ++ z0 = svlsl_m (p0, z0, 8)) ++ ++/* ++** lsl_8_s8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), #8 ++** movprfx z0, z1 ++** lsl z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_8_s8_m_untied, svint8_t, ++ z0 = 
svlsl_n_s8_m (p0, z1, 8), ++ z0 = svlsl_m (p0, z1, 8)) ++ ++/* ++** lsl_s8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** lsl z0\.b, p0/m, z0\.b, z4\.b ++** ret ++*/ ++TEST_DUAL_Z (lsl_s8_z_tied1, svint8_t, svuint8_t, ++ z0 = svlsl_s8_z (p0, z0, z4), ++ z0 = svlsl_z (p0, z0, z4)) ++ ++/* ++** lsl_s8_z_tied2: ++** movprfx z0\.b, p0/z, z0\.b ++** lslr z0\.b, p0/m, z0\.b, z4\.b ++** ret ++*/ ++TEST_DUAL_Z_REV (lsl_s8_z_tied2, svint8_t, svuint8_t, ++ z0_res = svlsl_s8_z (p0, z4, z0), ++ z0_res = svlsl_z (p0, z4, z0)) ++ ++/* ++** lsl_s8_z_untied: ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** lsl z0\.b, p0/m, z0\.b, z4\.b ++** | ++** movprfx z0\.b, p0/z, z4\.b ++** lslr z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_DUAL_Z (lsl_s8_z_untied, svint8_t, svuint8_t, ++ z0 = svlsl_s8_z (p0, z1, z4), ++ z0 = svlsl_z (p0, z1, z4)) ++ ++/* ++** lsl_w0_s8_z_tied1: ++** mov (z[0-9]+\.b), w0 ++** movprfx z0\.b, p0/z, z0\.b ++** lsl z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_w0_s8_z_tied1, svint8_t, uint8_t, ++ z0 = svlsl_n_s8_z (p0, z0, x0), ++ z0 = svlsl_z (p0, z0, x0)) ++ ++/* ++** lsl_w0_s8_z_untied: ++** mov (z[0-9]+\.b), w0 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** lsl z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** lslr z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_w0_s8_z_untied, svint8_t, uint8_t, ++ z0 = svlsl_n_s8_z (p0, z1, x0), ++ z0 = svlsl_z (p0, z1, x0)) ++ ++/* ++** lsl_1_s8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** lsl z0\.b, p0/m, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_1_s8_z_tied1, svint8_t, ++ z0 = svlsl_n_s8_z (p0, z0, 1), ++ z0 = svlsl_z (p0, z0, 1)) ++ ++/* ++** lsl_1_s8_z_untied: ++** movprfx z0\.b, p0/z, z1\.b ++** lsl z0\.b, p0/m, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_1_s8_z_untied, svint8_t, ++ z0 = svlsl_n_s8_z (p0, z1, 1), ++ z0 = svlsl_z (p0, z1, 1)) ++ ++/* ++** lsl_7_s8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** lsl z0\.b, p0/m, z0\.b, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_7_s8_z_tied1, svint8_t, ++ z0 = svlsl_n_s8_z (p0, z0, 7), ++ z0 = svlsl_z (p0, z0, 7)) ++ ++/* ++** lsl_7_s8_z_untied: ++** movprfx z0\.b, p0/z, z1\.b ++** lsl z0\.b, p0/m, z0\.b, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_7_s8_z_untied, svint8_t, ++ z0 = svlsl_n_s8_z (p0, z1, 7), ++ z0 = svlsl_z (p0, z1, 7)) ++ ++/* ++** lsl_8_s8_z_tied1: ++** mov (z[0-9]+\.b), #8 ++** movprfx z0\.b, p0/z, z0\.b ++** lsl z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_8_s8_z_tied1, svint8_t, ++ z0 = svlsl_n_s8_z (p0, z0, 8), ++ z0 = svlsl_z (p0, z0, 8)) ++ ++/* ++** lsl_8_s8_z_untied: ++** mov (z[0-9]+\.b), #8 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** lsl z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** lslr z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_8_s8_z_untied, svint8_t, ++ z0 = svlsl_n_s8_z (p0, z1, 8), ++ z0 = svlsl_z (p0, z1, 8)) ++ ++/* ++** lsl_s8_x_tied1: ++** lsl z0\.b, p0/m, z0\.b, z4\.b ++** ret ++*/ ++TEST_DUAL_Z (lsl_s8_x_tied1, svint8_t, svuint8_t, ++ z0 = svlsl_s8_x (p0, z0, z4), ++ z0 = svlsl_x (p0, z0, z4)) ++ ++/* ++** lsl_s8_x_tied2: ++** lslr z0\.b, p0/m, z0\.b, z4\.b ++** ret ++*/ ++TEST_DUAL_Z_REV (lsl_s8_x_tied2, svint8_t, svuint8_t, ++ z0_res = svlsl_s8_x (p0, z4, z0), ++ z0_res = svlsl_x (p0, z4, z0)) ++ ++/* ++** lsl_s8_x_untied: ++** ( ++** movprfx z0, z1 ++** lsl z0\.b, p0/m, z0\.b, z4\.b ++** | ++** movprfx z0, z4 ++** lslr z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_DUAL_Z (lsl_s8_x_untied, svint8_t, svuint8_t, ++ z0 = svlsl_s8_x (p0, z1, z4), ++ 
z0 = svlsl_x (p0, z1, z4)) ++ ++/* ++** lsl_w0_s8_x_tied1: ++** mov (z[0-9]+\.b), w0 ++** lsl z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_w0_s8_x_tied1, svint8_t, uint8_t, ++ z0 = svlsl_n_s8_x (p0, z0, x0), ++ z0 = svlsl_x (p0, z0, x0)) ++ ++/* ++** lsl_w0_s8_x_untied: ++** mov z0\.b, w0 ++** lslr z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_w0_s8_x_untied, svint8_t, uint8_t, ++ z0 = svlsl_n_s8_x (p0, z1, x0), ++ z0 = svlsl_x (p0, z1, x0)) ++ ++/* ++** lsl_1_s8_x_tied1: ++** lsl z0\.b, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_1_s8_x_tied1, svint8_t, ++ z0 = svlsl_n_s8_x (p0, z0, 1), ++ z0 = svlsl_x (p0, z0, 1)) ++ ++/* ++** lsl_1_s8_x_untied: ++** lsl z0\.b, z1\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_1_s8_x_untied, svint8_t, ++ z0 = svlsl_n_s8_x (p0, z1, 1), ++ z0 = svlsl_x (p0, z1, 1)) ++ ++/* ++** lsl_7_s8_x_tied1: ++** lsl z0\.b, z0\.b, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_7_s8_x_tied1, svint8_t, ++ z0 = svlsl_n_s8_x (p0, z0, 7), ++ z0 = svlsl_x (p0, z0, 7)) ++ ++/* ++** lsl_7_s8_x_untied: ++** lsl z0\.b, z1\.b, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_7_s8_x_untied, svint8_t, ++ z0 = svlsl_n_s8_x (p0, z1, 7), ++ z0 = svlsl_x (p0, z1, 7)) ++ ++/* ++** lsl_8_s8_x_tied1: ++** mov (z[0-9]+\.b), #8 ++** lsl z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_8_s8_x_tied1, svint8_t, ++ z0 = svlsl_n_s8_x (p0, z0, 8), ++ z0 = svlsl_x (p0, z0, 8)) ++ ++/* ++** lsl_8_s8_x_untied: ++** mov z0\.b, #8 ++** lslr z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_8_s8_x_untied, svint8_t, ++ z0 = svlsl_n_s8_x (p0, z1, 8), ++ z0 = svlsl_x (p0, z1, 8)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_u16.c +new file mode 100644 +index 000000000..57db0fda6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_u16.c +@@ -0,0 +1,351 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** lsl_u16_m_tied1: ++** lsl z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_u16_m_tied1, svuint16_t, ++ z0 = svlsl_u16_m (p0, z0, z1), ++ z0 = svlsl_m (p0, z0, z1)) ++ ++/* ++** lsl_u16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** lsl z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_u16_m_tied2, svuint16_t, ++ z0 = svlsl_u16_m (p0, z1, z0), ++ z0 = svlsl_m (p0, z1, z0)) ++ ++/* ++** lsl_u16_m_untied: ++** movprfx z0, z1 ++** lsl z0\.h, p0/m, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_u16_m_untied, svuint16_t, ++ z0 = svlsl_u16_m (p0, z1, z2), ++ z0 = svlsl_m (p0, z1, z2)) ++ ++/* ++** lsl_w0_u16_m_tied1: ++** mov (z[0-9]+\.h), w0 ++** lsl z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_w0_u16_m_tied1, svuint16_t, uint16_t, ++ z0 = svlsl_n_u16_m (p0, z0, x0), ++ z0 = svlsl_m (p0, z0, x0)) ++ ++/* ++** lsl_w0_u16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), w0 ++** movprfx z0, z1 ++** lsl z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_w0_u16_m_untied, svuint16_t, uint16_t, ++ z0 = svlsl_n_u16_m (p0, z1, x0), ++ z0 = svlsl_m (p0, z1, x0)) ++ ++/* ++** lsl_1_u16_m_tied1: ++** lsl z0\.h, p0/m, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_1_u16_m_tied1, svuint16_t, ++ z0 = svlsl_n_u16_m (p0, z0, 1), ++ z0 = svlsl_m (p0, z0, 1)) ++ ++/* ++** lsl_1_u16_m_untied: ++** movprfx z0, z1 ++** lsl z0\.h, p0/m, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_1_u16_m_untied, svuint16_t, ++ z0 = svlsl_n_u16_m (p0, z1, 1), ++ z0 = svlsl_m (p0, z1, 1)) ++ 
++/* ++** lsl_15_u16_m_tied1: ++** lsl z0\.h, p0/m, z0\.h, #15 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_15_u16_m_tied1, svuint16_t, ++ z0 = svlsl_n_u16_m (p0, z0, 15), ++ z0 = svlsl_m (p0, z0, 15)) ++ ++/* ++** lsl_15_u16_m_untied: ++** movprfx z0, z1 ++** lsl z0\.h, p0/m, z0\.h, #15 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_15_u16_m_untied, svuint16_t, ++ z0 = svlsl_n_u16_m (p0, z1, 15), ++ z0 = svlsl_m (p0, z1, 15)) ++ ++/* ++** lsl_16_u16_m_tied1: ++** mov (z[0-9]+\.h), #16 ++** lsl z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_16_u16_m_tied1, svuint16_t, ++ z0 = svlsl_n_u16_m (p0, z0, 16), ++ z0 = svlsl_m (p0, z0, 16)) ++ ++/* ++** lsl_16_u16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), #16 ++** movprfx z0, z1 ++** lsl z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_16_u16_m_untied, svuint16_t, ++ z0 = svlsl_n_u16_m (p0, z1, 16), ++ z0 = svlsl_m (p0, z1, 16)) ++ ++/* ++** lsl_u16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** lsl z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_u16_z_tied1, svuint16_t, ++ z0 = svlsl_u16_z (p0, z0, z1), ++ z0 = svlsl_z (p0, z0, z1)) ++ ++/* ++** lsl_u16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** lslr z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_u16_z_tied2, svuint16_t, ++ z0 = svlsl_u16_z (p0, z1, z0), ++ z0 = svlsl_z (p0, z1, z0)) ++ ++/* ++** lsl_u16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** lsl z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** lslr z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_u16_z_untied, svuint16_t, ++ z0 = svlsl_u16_z (p0, z1, z2), ++ z0 = svlsl_z (p0, z1, z2)) ++ ++/* ++** lsl_w0_u16_z_tied1: ++** mov (z[0-9]+\.h), w0 ++** movprfx z0\.h, p0/z, z0\.h ++** lsl z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_w0_u16_z_tied1, svuint16_t, uint16_t, ++ z0 = svlsl_n_u16_z (p0, z0, x0), ++ z0 = svlsl_z (p0, z0, x0)) ++ ++/* ++** lsl_w0_u16_z_untied: ++** mov (z[0-9]+\.h), w0 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** lsl z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** lslr z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_w0_u16_z_untied, svuint16_t, uint16_t, ++ z0 = svlsl_n_u16_z (p0, z1, x0), ++ z0 = svlsl_z (p0, z1, x0)) ++ ++/* ++** lsl_1_u16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** lsl z0\.h, p0/m, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_1_u16_z_tied1, svuint16_t, ++ z0 = svlsl_n_u16_z (p0, z0, 1), ++ z0 = svlsl_z (p0, z0, 1)) ++ ++/* ++** lsl_1_u16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** lsl z0\.h, p0/m, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_1_u16_z_untied, svuint16_t, ++ z0 = svlsl_n_u16_z (p0, z1, 1), ++ z0 = svlsl_z (p0, z1, 1)) ++ ++/* ++** lsl_15_u16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** lsl z0\.h, p0/m, z0\.h, #15 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_15_u16_z_tied1, svuint16_t, ++ z0 = svlsl_n_u16_z (p0, z0, 15), ++ z0 = svlsl_z (p0, z0, 15)) ++ ++/* ++** lsl_15_u16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** lsl z0\.h, p0/m, z0\.h, #15 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_15_u16_z_untied, svuint16_t, ++ z0 = svlsl_n_u16_z (p0, z1, 15), ++ z0 = svlsl_z (p0, z1, 15)) ++ ++/* ++** lsl_16_u16_z_tied1: ++** mov (z[0-9]+\.h), #16 ++** movprfx z0\.h, p0/z, z0\.h ++** lsl z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_16_u16_z_tied1, svuint16_t, ++ z0 = svlsl_n_u16_z (p0, z0, 16), ++ z0 = svlsl_z (p0, z0, 16)) ++ ++/* ++** lsl_16_u16_z_untied: ++** mov (z[0-9]+\.h), #16 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** lsl z0\.h, p0/m, 
z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** lslr z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_16_u16_z_untied, svuint16_t, ++ z0 = svlsl_n_u16_z (p0, z1, 16), ++ z0 = svlsl_z (p0, z1, 16)) ++ ++/* ++** lsl_u16_x_tied1: ++** lsl z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_u16_x_tied1, svuint16_t, ++ z0 = svlsl_u16_x (p0, z0, z1), ++ z0 = svlsl_x (p0, z0, z1)) ++ ++/* ++** lsl_u16_x_tied2: ++** lslr z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_u16_x_tied2, svuint16_t, ++ z0 = svlsl_u16_x (p0, z1, z0), ++ z0 = svlsl_x (p0, z1, z0)) ++ ++/* ++** lsl_u16_x_untied: ++** ( ++** movprfx z0, z1 ++** lsl z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0, z2 ++** lslr z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_u16_x_untied, svuint16_t, ++ z0 = svlsl_u16_x (p0, z1, z2), ++ z0 = svlsl_x (p0, z1, z2)) ++ ++/* ++** lsl_w0_u16_x_tied1: ++** mov (z[0-9]+\.h), w0 ++** lsl z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_w0_u16_x_tied1, svuint16_t, uint16_t, ++ z0 = svlsl_n_u16_x (p0, z0, x0), ++ z0 = svlsl_x (p0, z0, x0)) ++ ++/* ++** lsl_w0_u16_x_untied: ++** mov z0\.h, w0 ++** lslr z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_w0_u16_x_untied, svuint16_t, uint16_t, ++ z0 = svlsl_n_u16_x (p0, z1, x0), ++ z0 = svlsl_x (p0, z1, x0)) ++ ++/* ++** lsl_1_u16_x_tied1: ++** lsl z0\.h, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_1_u16_x_tied1, svuint16_t, ++ z0 = svlsl_n_u16_x (p0, z0, 1), ++ z0 = svlsl_x (p0, z0, 1)) ++ ++/* ++** lsl_1_u16_x_untied: ++** lsl z0\.h, z1\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_1_u16_x_untied, svuint16_t, ++ z0 = svlsl_n_u16_x (p0, z1, 1), ++ z0 = svlsl_x (p0, z1, 1)) ++ ++/* ++** lsl_15_u16_x_tied1: ++** lsl z0\.h, z0\.h, #15 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_15_u16_x_tied1, svuint16_t, ++ z0 = svlsl_n_u16_x (p0, z0, 15), ++ z0 = svlsl_x (p0, z0, 15)) ++ ++/* ++** lsl_15_u16_x_untied: ++** lsl z0\.h, z1\.h, #15 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_15_u16_x_untied, svuint16_t, ++ z0 = svlsl_n_u16_x (p0, z1, 15), ++ z0 = svlsl_x (p0, z1, 15)) ++ ++/* ++** lsl_16_u16_x_tied1: ++** mov (z[0-9]+\.h), #16 ++** lsl z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_16_u16_x_tied1, svuint16_t, ++ z0 = svlsl_n_u16_x (p0, z0, 16), ++ z0 = svlsl_x (p0, z0, 16)) ++ ++/* ++** lsl_16_u16_x_untied: ++** mov z0\.h, #16 ++** lslr z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_16_u16_x_untied, svuint16_t, ++ z0 = svlsl_n_u16_x (p0, z1, 16), ++ z0 = svlsl_x (p0, z1, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_u32.c +new file mode 100644 +index 000000000..8773f15db +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_u32.c +@@ -0,0 +1,351 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** lsl_u32_m_tied1: ++** lsl z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_u32_m_tied1, svuint32_t, ++ z0 = svlsl_u32_m (p0, z0, z1), ++ z0 = svlsl_m (p0, z0, z1)) ++ ++/* ++** lsl_u32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** lsl z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_u32_m_tied2, svuint32_t, ++ z0 = svlsl_u32_m (p0, z1, z0), ++ z0 = svlsl_m (p0, z1, z0)) ++ ++/* ++** lsl_u32_m_untied: ++** movprfx z0, z1 ++** lsl z0\.s, p0/m, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_u32_m_untied, svuint32_t, ++ z0 = svlsl_u32_m (p0, z1, z2), ++ z0 = svlsl_m 
(p0, z1, z2)) ++ ++/* ++** lsl_w0_u32_m_tied1: ++** mov (z[0-9]+\.s), w0 ++** lsl z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_w0_u32_m_tied1, svuint32_t, uint32_t, ++ z0 = svlsl_n_u32_m (p0, z0, x0), ++ z0 = svlsl_m (p0, z0, x0)) ++ ++/* ++** lsl_w0_u32_m_untied: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0, z1 ++** lsl z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_w0_u32_m_untied, svuint32_t, uint32_t, ++ z0 = svlsl_n_u32_m (p0, z1, x0), ++ z0 = svlsl_m (p0, z1, x0)) ++ ++/* ++** lsl_1_u32_m_tied1: ++** lsl z0\.s, p0/m, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_1_u32_m_tied1, svuint32_t, ++ z0 = svlsl_n_u32_m (p0, z0, 1), ++ z0 = svlsl_m (p0, z0, 1)) ++ ++/* ++** lsl_1_u32_m_untied: ++** movprfx z0, z1 ++** lsl z0\.s, p0/m, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_1_u32_m_untied, svuint32_t, ++ z0 = svlsl_n_u32_m (p0, z1, 1), ++ z0 = svlsl_m (p0, z1, 1)) ++ ++/* ++** lsl_31_u32_m_tied1: ++** lsl z0\.s, p0/m, z0\.s, #31 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_31_u32_m_tied1, svuint32_t, ++ z0 = svlsl_n_u32_m (p0, z0, 31), ++ z0 = svlsl_m (p0, z0, 31)) ++ ++/* ++** lsl_31_u32_m_untied: ++** movprfx z0, z1 ++** lsl z0\.s, p0/m, z0\.s, #31 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_31_u32_m_untied, svuint32_t, ++ z0 = svlsl_n_u32_m (p0, z1, 31), ++ z0 = svlsl_m (p0, z1, 31)) ++ ++/* ++** lsl_32_u32_m_tied1: ++** mov (z[0-9]+\.s), #32 ++** lsl z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_32_u32_m_tied1, svuint32_t, ++ z0 = svlsl_n_u32_m (p0, z0, 32), ++ z0 = svlsl_m (p0, z0, 32)) ++ ++/* ++** lsl_32_u32_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.s), #32 ++** movprfx z0, z1 ++** lsl z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_32_u32_m_untied, svuint32_t, ++ z0 = svlsl_n_u32_m (p0, z1, 32), ++ z0 = svlsl_m (p0, z1, 32)) ++ ++/* ++** lsl_u32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** lsl z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_u32_z_tied1, svuint32_t, ++ z0 = svlsl_u32_z (p0, z0, z1), ++ z0 = svlsl_z (p0, z0, z1)) ++ ++/* ++** lsl_u32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** lslr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_u32_z_tied2, svuint32_t, ++ z0 = svlsl_u32_z (p0, z1, z0), ++ z0 = svlsl_z (p0, z1, z0)) ++ ++/* ++** lsl_u32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** lsl z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** lslr z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_u32_z_untied, svuint32_t, ++ z0 = svlsl_u32_z (p0, z1, z2), ++ z0 = svlsl_z (p0, z1, z2)) ++ ++/* ++** lsl_w0_u32_z_tied1: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0\.s, p0/z, z0\.s ++** lsl z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_w0_u32_z_tied1, svuint32_t, uint32_t, ++ z0 = svlsl_n_u32_z (p0, z0, x0), ++ z0 = svlsl_z (p0, z0, x0)) ++ ++/* ++** lsl_w0_u32_z_untied: ++** mov (z[0-9]+\.s), w0 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** lsl z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** lslr z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_w0_u32_z_untied, svuint32_t, uint32_t, ++ z0 = svlsl_n_u32_z (p0, z1, x0), ++ z0 = svlsl_z (p0, z1, x0)) ++ ++/* ++** lsl_1_u32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** lsl z0\.s, p0/m, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_1_u32_z_tied1, svuint32_t, ++ z0 = svlsl_n_u32_z (p0, z0, 1), ++ z0 = svlsl_z (p0, z0, 1)) ++ ++/* ++** lsl_1_u32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** lsl z0\.s, p0/m, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_1_u32_z_untied, 
svuint32_t, ++ z0 = svlsl_n_u32_z (p0, z1, 1), ++ z0 = svlsl_z (p0, z1, 1)) ++ ++/* ++** lsl_31_u32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** lsl z0\.s, p0/m, z0\.s, #31 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_31_u32_z_tied1, svuint32_t, ++ z0 = svlsl_n_u32_z (p0, z0, 31), ++ z0 = svlsl_z (p0, z0, 31)) ++ ++/* ++** lsl_31_u32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** lsl z0\.s, p0/m, z0\.s, #31 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_31_u32_z_untied, svuint32_t, ++ z0 = svlsl_n_u32_z (p0, z1, 31), ++ z0 = svlsl_z (p0, z1, 31)) ++ ++/* ++** lsl_32_u32_z_tied1: ++** mov (z[0-9]+\.s), #32 ++** movprfx z0\.s, p0/z, z0\.s ++** lsl z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_32_u32_z_tied1, svuint32_t, ++ z0 = svlsl_n_u32_z (p0, z0, 32), ++ z0 = svlsl_z (p0, z0, 32)) ++ ++/* ++** lsl_32_u32_z_untied: ++** mov (z[0-9]+\.s), #32 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** lsl z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** lslr z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_32_u32_z_untied, svuint32_t, ++ z0 = svlsl_n_u32_z (p0, z1, 32), ++ z0 = svlsl_z (p0, z1, 32)) ++ ++/* ++** lsl_u32_x_tied1: ++** lsl z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_u32_x_tied1, svuint32_t, ++ z0 = svlsl_u32_x (p0, z0, z1), ++ z0 = svlsl_x (p0, z0, z1)) ++ ++/* ++** lsl_u32_x_tied2: ++** lslr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_u32_x_tied2, svuint32_t, ++ z0 = svlsl_u32_x (p0, z1, z0), ++ z0 = svlsl_x (p0, z1, z0)) ++ ++/* ++** lsl_u32_x_untied: ++** ( ++** movprfx z0, z1 ++** lsl z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0, z2 ++** lslr z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_u32_x_untied, svuint32_t, ++ z0 = svlsl_u32_x (p0, z1, z2), ++ z0 = svlsl_x (p0, z1, z2)) ++ ++/* ++** lsl_w0_u32_x_tied1: ++** mov (z[0-9]+\.s), w0 ++** lsl z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_w0_u32_x_tied1, svuint32_t, uint32_t, ++ z0 = svlsl_n_u32_x (p0, z0, x0), ++ z0 = svlsl_x (p0, z0, x0)) ++ ++/* ++** lsl_w0_u32_x_untied: ++** mov z0\.s, w0 ++** lslr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_w0_u32_x_untied, svuint32_t, uint32_t, ++ z0 = svlsl_n_u32_x (p0, z1, x0), ++ z0 = svlsl_x (p0, z1, x0)) ++ ++/* ++** lsl_1_u32_x_tied1: ++** lsl z0\.s, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_1_u32_x_tied1, svuint32_t, ++ z0 = svlsl_n_u32_x (p0, z0, 1), ++ z0 = svlsl_x (p0, z0, 1)) ++ ++/* ++** lsl_1_u32_x_untied: ++** lsl z0\.s, z1\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_1_u32_x_untied, svuint32_t, ++ z0 = svlsl_n_u32_x (p0, z1, 1), ++ z0 = svlsl_x (p0, z1, 1)) ++ ++/* ++** lsl_31_u32_x_tied1: ++** lsl z0\.s, z0\.s, #31 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_31_u32_x_tied1, svuint32_t, ++ z0 = svlsl_n_u32_x (p0, z0, 31), ++ z0 = svlsl_x (p0, z0, 31)) ++ ++/* ++** lsl_31_u32_x_untied: ++** lsl z0\.s, z1\.s, #31 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_31_u32_x_untied, svuint32_t, ++ z0 = svlsl_n_u32_x (p0, z1, 31), ++ z0 = svlsl_x (p0, z1, 31)) ++ ++/* ++** lsl_32_u32_x_tied1: ++** mov (z[0-9]+\.s), #32 ++** lsl z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_32_u32_x_tied1, svuint32_t, ++ z0 = svlsl_n_u32_x (p0, z0, 32), ++ z0 = svlsl_x (p0, z0, 32)) ++ ++/* ++** lsl_32_u32_x_untied: ++** mov z0\.s, #32 ++** lslr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_32_u32_x_untied, svuint32_t, ++ z0 = svlsl_n_u32_x (p0, z1, 32), ++ z0 = svlsl_x (p0, z1, 32)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_u64.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_u64.c +new file mode 100644 +index 000000000..7b12bd43e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_u64.c +@@ -0,0 +1,351 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** lsl_u64_m_tied1: ++** lsl z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_u64_m_tied1, svuint64_t, ++ z0 = svlsl_u64_m (p0, z0, z1), ++ z0 = svlsl_m (p0, z0, z1)) ++ ++/* ++** lsl_u64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** lsl z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_u64_m_tied2, svuint64_t, ++ z0 = svlsl_u64_m (p0, z1, z0), ++ z0 = svlsl_m (p0, z1, z0)) ++ ++/* ++** lsl_u64_m_untied: ++** movprfx z0, z1 ++** lsl z0\.d, p0/m, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_u64_m_untied, svuint64_t, ++ z0 = svlsl_u64_m (p0, z1, z2), ++ z0 = svlsl_m (p0, z1, z2)) ++ ++/* ++** lsl_x0_u64_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** lsl z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_x0_u64_m_tied1, svuint64_t, uint64_t, ++ z0 = svlsl_n_u64_m (p0, z0, x0), ++ z0 = svlsl_m (p0, z0, x0)) ++ ++/* ++** lsl_x0_u64_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** lsl z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_x0_u64_m_untied, svuint64_t, uint64_t, ++ z0 = svlsl_n_u64_m (p0, z1, x0), ++ z0 = svlsl_m (p0, z1, x0)) ++ ++/* ++** lsl_1_u64_m_tied1: ++** lsl z0\.d, p0/m, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_1_u64_m_tied1, svuint64_t, ++ z0 = svlsl_n_u64_m (p0, z0, 1), ++ z0 = svlsl_m (p0, z0, 1)) ++ ++/* ++** lsl_1_u64_m_untied: ++** movprfx z0, z1 ++** lsl z0\.d, p0/m, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_1_u64_m_untied, svuint64_t, ++ z0 = svlsl_n_u64_m (p0, z1, 1), ++ z0 = svlsl_m (p0, z1, 1)) ++ ++/* ++** lsl_63_u64_m_tied1: ++** lsl z0\.d, p0/m, z0\.d, #63 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_63_u64_m_tied1, svuint64_t, ++ z0 = svlsl_n_u64_m (p0, z0, 63), ++ z0 = svlsl_m (p0, z0, 63)) ++ ++/* ++** lsl_63_u64_m_untied: ++** movprfx z0, z1 ++** lsl z0\.d, p0/m, z0\.d, #63 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_63_u64_m_untied, svuint64_t, ++ z0 = svlsl_n_u64_m (p0, z1, 63), ++ z0 = svlsl_m (p0, z1, 63)) ++ ++/* ++** lsl_64_u64_m_tied1: ++** mov (z[0-9]+\.d), #64 ++** lsl z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_64_u64_m_tied1, svuint64_t, ++ z0 = svlsl_n_u64_m (p0, z0, 64), ++ z0 = svlsl_m (p0, z0, 64)) ++ ++/* ++** lsl_64_u64_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #64 ++** movprfx z0, z1 ++** lsl z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_64_u64_m_untied, svuint64_t, ++ z0 = svlsl_n_u64_m (p0, z1, 64), ++ z0 = svlsl_m (p0, z1, 64)) ++ ++/* ++** lsl_u64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** lsl z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_u64_z_tied1, svuint64_t, ++ z0 = svlsl_u64_z (p0, z0, z1), ++ z0 = svlsl_z (p0, z0, z1)) ++ ++/* ++** lsl_u64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** lslr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_u64_z_tied2, svuint64_t, ++ z0 = svlsl_u64_z (p0, z1, z0), ++ z0 = svlsl_z (p0, z1, z0)) ++ ++/* ++** lsl_u64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** lsl z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** lslr z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_u64_z_untied, svuint64_t, ++ z0 = svlsl_u64_z (p0, z1, z2), ++ z0 = svlsl_z (p0, z1, z2)) ++ ++/* ++** lsl_x0_u64_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** 
movprfx z0\.d, p0/z, z0\.d ++** lsl z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_x0_u64_z_tied1, svuint64_t, uint64_t, ++ z0 = svlsl_n_u64_z (p0, z0, x0), ++ z0 = svlsl_z (p0, z0, x0)) ++ ++/* ++** lsl_x0_u64_z_untied: ++** mov (z[0-9]+\.d), x0 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** lsl z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** lslr z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_x0_u64_z_untied, svuint64_t, uint64_t, ++ z0 = svlsl_n_u64_z (p0, z1, x0), ++ z0 = svlsl_z (p0, z1, x0)) ++ ++/* ++** lsl_1_u64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** lsl z0\.d, p0/m, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_1_u64_z_tied1, svuint64_t, ++ z0 = svlsl_n_u64_z (p0, z0, 1), ++ z0 = svlsl_z (p0, z0, 1)) ++ ++/* ++** lsl_1_u64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** lsl z0\.d, p0/m, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_1_u64_z_untied, svuint64_t, ++ z0 = svlsl_n_u64_z (p0, z1, 1), ++ z0 = svlsl_z (p0, z1, 1)) ++ ++/* ++** lsl_63_u64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** lsl z0\.d, p0/m, z0\.d, #63 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_63_u64_z_tied1, svuint64_t, ++ z0 = svlsl_n_u64_z (p0, z0, 63), ++ z0 = svlsl_z (p0, z0, 63)) ++ ++/* ++** lsl_63_u64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** lsl z0\.d, p0/m, z0\.d, #63 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_63_u64_z_untied, svuint64_t, ++ z0 = svlsl_n_u64_z (p0, z1, 63), ++ z0 = svlsl_z (p0, z1, 63)) ++ ++/* ++** lsl_64_u64_z_tied1: ++** mov (z[0-9]+\.d), #64 ++** movprfx z0\.d, p0/z, z0\.d ++** lsl z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_64_u64_z_tied1, svuint64_t, ++ z0 = svlsl_n_u64_z (p0, z0, 64), ++ z0 = svlsl_z (p0, z0, 64)) ++ ++/* ++** lsl_64_u64_z_untied: ++** mov (z[0-9]+\.d), #64 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** lsl z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** lslr z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_64_u64_z_untied, svuint64_t, ++ z0 = svlsl_n_u64_z (p0, z1, 64), ++ z0 = svlsl_z (p0, z1, 64)) ++ ++/* ++** lsl_u64_x_tied1: ++** lsl z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_u64_x_tied1, svuint64_t, ++ z0 = svlsl_u64_x (p0, z0, z1), ++ z0 = svlsl_x (p0, z0, z1)) ++ ++/* ++** lsl_u64_x_tied2: ++** lslr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_u64_x_tied2, svuint64_t, ++ z0 = svlsl_u64_x (p0, z1, z0), ++ z0 = svlsl_x (p0, z1, z0)) ++ ++/* ++** lsl_u64_x_untied: ++** ( ++** movprfx z0, z1 ++** lsl z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0, z2 ++** lslr z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_u64_x_untied, svuint64_t, ++ z0 = svlsl_u64_x (p0, z1, z2), ++ z0 = svlsl_x (p0, z1, z2)) ++ ++/* ++** lsl_x0_u64_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** lsl z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_x0_u64_x_tied1, svuint64_t, uint64_t, ++ z0 = svlsl_n_u64_x (p0, z0, x0), ++ z0 = svlsl_x (p0, z0, x0)) ++ ++/* ++** lsl_x0_u64_x_untied: ++** mov z0\.d, x0 ++** lslr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_x0_u64_x_untied, svuint64_t, uint64_t, ++ z0 = svlsl_n_u64_x (p0, z1, x0), ++ z0 = svlsl_x (p0, z1, x0)) ++ ++/* ++** lsl_1_u64_x_tied1: ++** lsl z0\.d, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_1_u64_x_tied1, svuint64_t, ++ z0 = svlsl_n_u64_x (p0, z0, 1), ++ z0 = svlsl_x (p0, z0, 1)) ++ ++/* ++** lsl_1_u64_x_untied: ++** lsl z0\.d, z1\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_1_u64_x_untied, svuint64_t, ++ z0 = svlsl_n_u64_x (p0, z1, 1), ++ z0 = svlsl_x (p0, z1, 
1)) ++ ++/* ++** lsl_63_u64_x_tied1: ++** lsl z0\.d, z0\.d, #63 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_63_u64_x_tied1, svuint64_t, ++ z0 = svlsl_n_u64_x (p0, z0, 63), ++ z0 = svlsl_x (p0, z0, 63)) ++ ++/* ++** lsl_63_u64_x_untied: ++** lsl z0\.d, z1\.d, #63 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_63_u64_x_untied, svuint64_t, ++ z0 = svlsl_n_u64_x (p0, z1, 63), ++ z0 = svlsl_x (p0, z1, 63)) ++ ++/* ++** lsl_64_u64_x_tied1: ++** mov (z[0-9]+\.d), #64 ++** lsl z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_64_u64_x_tied1, svuint64_t, ++ z0 = svlsl_n_u64_x (p0, z0, 64), ++ z0 = svlsl_x (p0, z0, 64)) ++ ++/* ++** lsl_64_u64_x_untied: ++** mov z0\.d, #64 ++** lslr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_64_u64_x_untied, svuint64_t, ++ z0 = svlsl_n_u64_x (p0, z1, 64), ++ z0 = svlsl_x (p0, z1, 64)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_u8.c +new file mode 100644 +index 000000000..894b55138 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_u8.c +@@ -0,0 +1,351 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** lsl_u8_m_tied1: ++** lsl z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_u8_m_tied1, svuint8_t, ++ z0 = svlsl_u8_m (p0, z0, z1), ++ z0 = svlsl_m (p0, z0, z1)) ++ ++/* ++** lsl_u8_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** lsl z0\.b, p0/m, z0\.b, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_u8_m_tied2, svuint8_t, ++ z0 = svlsl_u8_m (p0, z1, z0), ++ z0 = svlsl_m (p0, z1, z0)) ++ ++/* ++** lsl_u8_m_untied: ++** movprfx z0, z1 ++** lsl z0\.b, p0/m, z0\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_u8_m_untied, svuint8_t, ++ z0 = svlsl_u8_m (p0, z1, z2), ++ z0 = svlsl_m (p0, z1, z2)) ++ ++/* ++** lsl_w0_u8_m_tied1: ++** mov (z[0-9]+\.b), w0 ++** lsl z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_w0_u8_m_tied1, svuint8_t, uint8_t, ++ z0 = svlsl_n_u8_m (p0, z0, x0), ++ z0 = svlsl_m (p0, z0, x0)) ++ ++/* ++** lsl_w0_u8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), w0 ++** movprfx z0, z1 ++** lsl z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_w0_u8_m_untied, svuint8_t, uint8_t, ++ z0 = svlsl_n_u8_m (p0, z1, x0), ++ z0 = svlsl_m (p0, z1, x0)) ++ ++/* ++** lsl_1_u8_m_tied1: ++** lsl z0\.b, p0/m, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_1_u8_m_tied1, svuint8_t, ++ z0 = svlsl_n_u8_m (p0, z0, 1), ++ z0 = svlsl_m (p0, z0, 1)) ++ ++/* ++** lsl_1_u8_m_untied: ++** movprfx z0, z1 ++** lsl z0\.b, p0/m, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_1_u8_m_untied, svuint8_t, ++ z0 = svlsl_n_u8_m (p0, z1, 1), ++ z0 = svlsl_m (p0, z1, 1)) ++ ++/* ++** lsl_7_u8_m_tied1: ++** lsl z0\.b, p0/m, z0\.b, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_7_u8_m_tied1, svuint8_t, ++ z0 = svlsl_n_u8_m (p0, z0, 7), ++ z0 = svlsl_m (p0, z0, 7)) ++ ++/* ++** lsl_7_u8_m_untied: ++** movprfx z0, z1 ++** lsl z0\.b, p0/m, z0\.b, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_7_u8_m_untied, svuint8_t, ++ z0 = svlsl_n_u8_m (p0, z1, 7), ++ z0 = svlsl_m (p0, z1, 7)) ++ ++/* ++** lsl_8_u8_m_tied1: ++** mov (z[0-9]+\.b), #8 ++** lsl z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_8_u8_m_tied1, svuint8_t, ++ z0 = svlsl_n_u8_m (p0, z0, 8), ++ z0 = svlsl_m (p0, z0, 8)) ++ ++/* ++** lsl_8_u8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), #8 ++** movprfx z0, z1 ++** lsl z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_8_u8_m_untied, svuint8_t, ++ z0 = svlsl_n_u8_m (p0, z1, 8), ++ 
z0 = svlsl_m (p0, z1, 8)) ++ ++/* ++** lsl_u8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** lsl z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_u8_z_tied1, svuint8_t, ++ z0 = svlsl_u8_z (p0, z0, z1), ++ z0 = svlsl_z (p0, z0, z1)) ++ ++/* ++** lsl_u8_z_tied2: ++** movprfx z0\.b, p0/z, z0\.b ++** lslr z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_u8_z_tied2, svuint8_t, ++ z0 = svlsl_u8_z (p0, z1, z0), ++ z0 = svlsl_z (p0, z1, z0)) ++ ++/* ++** lsl_u8_z_untied: ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** lsl z0\.b, p0/m, z0\.b, z2\.b ++** | ++** movprfx z0\.b, p0/z, z2\.b ++** lslr z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_u8_z_untied, svuint8_t, ++ z0 = svlsl_u8_z (p0, z1, z2), ++ z0 = svlsl_z (p0, z1, z2)) ++ ++/* ++** lsl_w0_u8_z_tied1: ++** mov (z[0-9]+\.b), w0 ++** movprfx z0\.b, p0/z, z0\.b ++** lsl z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_w0_u8_z_tied1, svuint8_t, uint8_t, ++ z0 = svlsl_n_u8_z (p0, z0, x0), ++ z0 = svlsl_z (p0, z0, x0)) ++ ++/* ++** lsl_w0_u8_z_untied: ++** mov (z[0-9]+\.b), w0 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** lsl z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** lslr z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_w0_u8_z_untied, svuint8_t, uint8_t, ++ z0 = svlsl_n_u8_z (p0, z1, x0), ++ z0 = svlsl_z (p0, z1, x0)) ++ ++/* ++** lsl_1_u8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** lsl z0\.b, p0/m, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_1_u8_z_tied1, svuint8_t, ++ z0 = svlsl_n_u8_z (p0, z0, 1), ++ z0 = svlsl_z (p0, z0, 1)) ++ ++/* ++** lsl_1_u8_z_untied: ++** movprfx z0\.b, p0/z, z1\.b ++** lsl z0\.b, p0/m, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_1_u8_z_untied, svuint8_t, ++ z0 = svlsl_n_u8_z (p0, z1, 1), ++ z0 = svlsl_z (p0, z1, 1)) ++ ++/* ++** lsl_7_u8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** lsl z0\.b, p0/m, z0\.b, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_7_u8_z_tied1, svuint8_t, ++ z0 = svlsl_n_u8_z (p0, z0, 7), ++ z0 = svlsl_z (p0, z0, 7)) ++ ++/* ++** lsl_7_u8_z_untied: ++** movprfx z0\.b, p0/z, z1\.b ++** lsl z0\.b, p0/m, z0\.b, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_7_u8_z_untied, svuint8_t, ++ z0 = svlsl_n_u8_z (p0, z1, 7), ++ z0 = svlsl_z (p0, z1, 7)) ++ ++/* ++** lsl_8_u8_z_tied1: ++** mov (z[0-9]+\.b), #8 ++** movprfx z0\.b, p0/z, z0\.b ++** lsl z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_8_u8_z_tied1, svuint8_t, ++ z0 = svlsl_n_u8_z (p0, z0, 8), ++ z0 = svlsl_z (p0, z0, 8)) ++ ++/* ++** lsl_8_u8_z_untied: ++** mov (z[0-9]+\.b), #8 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** lsl z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** lslr z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_8_u8_z_untied, svuint8_t, ++ z0 = svlsl_n_u8_z (p0, z1, 8), ++ z0 = svlsl_z (p0, z1, 8)) ++ ++/* ++** lsl_u8_x_tied1: ++** lsl z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_u8_x_tied1, svuint8_t, ++ z0 = svlsl_u8_x (p0, z0, z1), ++ z0 = svlsl_x (p0, z0, z1)) ++ ++/* ++** lsl_u8_x_tied2: ++** lslr z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_u8_x_tied2, svuint8_t, ++ z0 = svlsl_u8_x (p0, z1, z0), ++ z0 = svlsl_x (p0, z1, z0)) ++ ++/* ++** lsl_u8_x_untied: ++** ( ++** movprfx z0, z1 ++** lsl z0\.b, p0/m, z0\.b, z2\.b ++** | ++** movprfx z0, z2 ++** lslr z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_u8_x_untied, svuint8_t, ++ z0 = svlsl_u8_x (p0, z1, z2), ++ z0 = svlsl_x (p0, z1, z2)) ++ ++/* ++** lsl_w0_u8_x_tied1: ++** mov (z[0-9]+\.b), w0 
++** lsl z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_w0_u8_x_tied1, svuint8_t, uint8_t, ++ z0 = svlsl_n_u8_x (p0, z0, x0), ++ z0 = svlsl_x (p0, z0, x0)) ++ ++/* ++** lsl_w0_u8_x_untied: ++** mov z0\.b, w0 ++** lslr z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_w0_u8_x_untied, svuint8_t, uint8_t, ++ z0 = svlsl_n_u8_x (p0, z1, x0), ++ z0 = svlsl_x (p0, z1, x0)) ++ ++/* ++** lsl_1_u8_x_tied1: ++** lsl z0\.b, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_1_u8_x_tied1, svuint8_t, ++ z0 = svlsl_n_u8_x (p0, z0, 1), ++ z0 = svlsl_x (p0, z0, 1)) ++ ++/* ++** lsl_1_u8_x_untied: ++** lsl z0\.b, z1\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_1_u8_x_untied, svuint8_t, ++ z0 = svlsl_n_u8_x (p0, z1, 1), ++ z0 = svlsl_x (p0, z1, 1)) ++ ++/* ++** lsl_7_u8_x_tied1: ++** lsl z0\.b, z0\.b, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_7_u8_x_tied1, svuint8_t, ++ z0 = svlsl_n_u8_x (p0, z0, 7), ++ z0 = svlsl_x (p0, z0, 7)) ++ ++/* ++** lsl_7_u8_x_untied: ++** lsl z0\.b, z1\.b, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_7_u8_x_untied, svuint8_t, ++ z0 = svlsl_n_u8_x (p0, z1, 7), ++ z0 = svlsl_x (p0, z1, 7)) ++ ++/* ++** lsl_8_u8_x_tied1: ++** mov (z[0-9]+\.b), #8 ++** lsl z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_8_u8_x_tied1, svuint8_t, ++ z0 = svlsl_n_u8_x (p0, z0, 8), ++ z0 = svlsl_x (p0, z0, 8)) ++ ++/* ++** lsl_8_u8_x_untied: ++** mov z0\.b, #8 ++** lslr z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_8_u8_x_untied, svuint8_t, ++ z0 = svlsl_n_u8_x (p0, z1, 8), ++ z0 = svlsl_x (p0, z1, 8)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_wide_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_wide_s16.c +new file mode 100644 +index 000000000..8d63d3909 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_wide_s16.c +@@ -0,0 +1,331 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** lsl_wide_s16_m_tied1: ++** lsl z0\.h, p0/m, z0\.h, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsl_wide_s16_m_tied1, svint16_t, svuint64_t, ++ z0 = svlsl_wide_s16_m (p0, z0, z4), ++ z0 = svlsl_wide_m (p0, z0, z4)) ++ ++/* ++** lsl_wide_s16_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z4 ++** lsl z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (lsl_wide_s16_m_tied2, svint16_t, svuint64_t, ++ z0_res = svlsl_wide_s16_m (p0, z4, z0), ++ z0_res = svlsl_wide_m (p0, z4, z0)) ++ ++/* ++** lsl_wide_s16_m_untied: ++** movprfx z0, z1 ++** lsl z0\.h, p0/m, z0\.h, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsl_wide_s16_m_untied, svint16_t, svuint64_t, ++ z0 = svlsl_wide_s16_m (p0, z1, z4), ++ z0 = svlsl_wide_m (p0, z1, z4)) ++ ++/* ++** lsl_wide_x0_s16_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** lsl z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_wide_x0_s16_m_tied1, svint16_t, uint64_t, ++ z0 = svlsl_wide_n_s16_m (p0, z0, x0), ++ z0 = svlsl_wide_m (p0, z0, x0)) ++ ++/* ++** lsl_wide_x0_s16_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** lsl z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_wide_x0_s16_m_untied, svint16_t, uint64_t, ++ z0 = svlsl_wide_n_s16_m (p0, z1, x0), ++ z0 = svlsl_wide_m (p0, z1, x0)) ++ ++/* ++** lsl_wide_1_s16_m_tied1: ++** lsl z0\.h, p0/m, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_1_s16_m_tied1, svint16_t, ++ z0 = svlsl_wide_n_s16_m (p0, z0, 1), ++ z0 = svlsl_wide_m (p0, z0, 1)) ++ ++/* ++** lsl_wide_1_s16_m_untied: ++** movprfx z0, z1 ++** lsl z0\.h, p0/m, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_1_s16_m_untied, 
svint16_t, ++ z0 = svlsl_wide_n_s16_m (p0, z1, 1), ++ z0 = svlsl_wide_m (p0, z1, 1)) ++ ++/* ++** lsl_wide_15_s16_m_tied1: ++** lsl z0\.h, p0/m, z0\.h, #15 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_15_s16_m_tied1, svint16_t, ++ z0 = svlsl_wide_n_s16_m (p0, z0, 15), ++ z0 = svlsl_wide_m (p0, z0, 15)) ++ ++/* ++** lsl_wide_15_s16_m_untied: ++** movprfx z0, z1 ++** lsl z0\.h, p0/m, z0\.h, #15 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_15_s16_m_untied, svint16_t, ++ z0 = svlsl_wide_n_s16_m (p0, z1, 15), ++ z0 = svlsl_wide_m (p0, z1, 15)) ++ ++/* ++** lsl_wide_16_s16_m_tied1: ++** mov (z[0-9]+\.d), #16 ++** lsl z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_16_s16_m_tied1, svint16_t, ++ z0 = svlsl_wide_n_s16_m (p0, z0, 16), ++ z0 = svlsl_wide_m (p0, z0, 16)) ++ ++/* ++** lsl_wide_16_s16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #16 ++** movprfx z0, z1 ++** lsl z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_16_s16_m_untied, svint16_t, ++ z0 = svlsl_wide_n_s16_m (p0, z1, 16), ++ z0 = svlsl_wide_m (p0, z1, 16)) ++ ++/* ++** lsl_wide_s16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** lsl z0\.h, p0/m, z0\.h, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsl_wide_s16_z_tied1, svint16_t, svuint64_t, ++ z0 = svlsl_wide_s16_z (p0, z0, z4), ++ z0 = svlsl_wide_z (p0, z0, z4)) ++ ++/* ++** lsl_wide_s16_z_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.h, p0/z, z4\.h ++** lsl z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (lsl_wide_s16_z_tied2, svint16_t, svuint64_t, ++ z0_res = svlsl_wide_s16_z (p0, z4, z0), ++ z0_res = svlsl_wide_z (p0, z4, z0)) ++ ++/* ++** lsl_wide_s16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** lsl z0\.h, p0/m, z0\.h, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsl_wide_s16_z_untied, svint16_t, svuint64_t, ++ z0 = svlsl_wide_s16_z (p0, z1, z4), ++ z0 = svlsl_wide_z (p0, z1, z4)) ++ ++/* ++** lsl_wide_x0_s16_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.h, p0/z, z0\.h ++** lsl z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_wide_x0_s16_z_tied1, svint16_t, uint64_t, ++ z0 = svlsl_wide_n_s16_z (p0, z0, x0), ++ z0 = svlsl_wide_z (p0, z0, x0)) ++ ++/* ++** lsl_wide_x0_s16_z_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.h, p0/z, z1\.h ++** lsl z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_wide_x0_s16_z_untied, svint16_t, uint64_t, ++ z0 = svlsl_wide_n_s16_z (p0, z1, x0), ++ z0 = svlsl_wide_z (p0, z1, x0)) ++ ++/* ++** lsl_wide_1_s16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** lsl z0\.h, p0/m, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_1_s16_z_tied1, svint16_t, ++ z0 = svlsl_wide_n_s16_z (p0, z0, 1), ++ z0 = svlsl_wide_z (p0, z0, 1)) ++ ++/* ++** lsl_wide_1_s16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** lsl z0\.h, p0/m, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_1_s16_z_untied, svint16_t, ++ z0 = svlsl_wide_n_s16_z (p0, z1, 1), ++ z0 = svlsl_wide_z (p0, z1, 1)) ++ ++/* ++** lsl_wide_15_s16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** lsl z0\.h, p0/m, z0\.h, #15 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_15_s16_z_tied1, svint16_t, ++ z0 = svlsl_wide_n_s16_z (p0, z0, 15), ++ z0 = svlsl_wide_z (p0, z0, 15)) ++ ++/* ++** lsl_wide_15_s16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** lsl z0\.h, p0/m, z0\.h, #15 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_15_s16_z_untied, svint16_t, ++ z0 = svlsl_wide_n_s16_z (p0, z1, 15), ++ z0 = svlsl_wide_z (p0, z1, 15)) ++ ++/* ++** lsl_wide_16_s16_z_tied1: ++** mov (z[0-9]+\.d), #16 ++** movprfx z0\.h, p0/z, z0\.h ++** lsl z0\.h, p0/m, z0\.h, \1 ++** ret 
++*/ ++TEST_UNIFORM_Z (lsl_wide_16_s16_z_tied1, svint16_t, ++ z0 = svlsl_wide_n_s16_z (p0, z0, 16), ++ z0 = svlsl_wide_z (p0, z0, 16)) ++ ++/* ++** lsl_wide_16_s16_z_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #16 ++** movprfx z0\.h, p0/z, z1\.h ++** lsl z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_16_s16_z_untied, svint16_t, ++ z0 = svlsl_wide_n_s16_z (p0, z1, 16), ++ z0 = svlsl_wide_z (p0, z1, 16)) ++ ++/* ++** lsl_wide_s16_x_tied1: ++** lsl z0\.h, z0\.h, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsl_wide_s16_x_tied1, svint16_t, svuint64_t, ++ z0 = svlsl_wide_s16_x (p0, z0, z4), ++ z0 = svlsl_wide_x (p0, z0, z4)) ++ ++/* ++** lsl_wide_s16_x_tied2: ++** lsl z0\.h, z4\.h, z0\.d ++** ret ++*/ ++TEST_DUAL_Z_REV (lsl_wide_s16_x_tied2, svint16_t, svuint64_t, ++ z0_res = svlsl_wide_s16_x (p0, z4, z0), ++ z0_res = svlsl_wide_x (p0, z4, z0)) ++ ++/* ++** lsl_wide_s16_x_untied: ++** lsl z0\.h, z1\.h, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsl_wide_s16_x_untied, svint16_t, svuint64_t, ++ z0 = svlsl_wide_s16_x (p0, z1, z4), ++ z0 = svlsl_wide_x (p0, z1, z4)) ++ ++/* ++** lsl_wide_x0_s16_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** lsl z0\.h, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_wide_x0_s16_x_tied1, svint16_t, uint64_t, ++ z0 = svlsl_wide_n_s16_x (p0, z0, x0), ++ z0 = svlsl_wide_x (p0, z0, x0)) ++ ++/* ++** lsl_wide_x0_s16_x_untied: ++** mov (z[0-9]+\.d), x0 ++** lsl z0\.h, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_wide_x0_s16_x_untied, svint16_t, uint64_t, ++ z0 = svlsl_wide_n_s16_x (p0, z1, x0), ++ z0 = svlsl_wide_x (p0, z1, x0)) ++ ++/* ++** lsl_wide_1_s16_x_tied1: ++** lsl z0\.h, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_1_s16_x_tied1, svint16_t, ++ z0 = svlsl_wide_n_s16_x (p0, z0, 1), ++ z0 = svlsl_wide_x (p0, z0, 1)) ++ ++/* ++** lsl_wide_1_s16_x_untied: ++** lsl z0\.h, z1\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_1_s16_x_untied, svint16_t, ++ z0 = svlsl_wide_n_s16_x (p0, z1, 1), ++ z0 = svlsl_wide_x (p0, z1, 1)) ++ ++/* ++** lsl_wide_15_s16_x_tied1: ++** lsl z0\.h, z0\.h, #15 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_15_s16_x_tied1, svint16_t, ++ z0 = svlsl_wide_n_s16_x (p0, z0, 15), ++ z0 = svlsl_wide_x (p0, z0, 15)) ++ ++/* ++** lsl_wide_15_s16_x_untied: ++** lsl z0\.h, z1\.h, #15 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_15_s16_x_untied, svint16_t, ++ z0 = svlsl_wide_n_s16_x (p0, z1, 15), ++ z0 = svlsl_wide_x (p0, z1, 15)) ++ ++/* ++** lsl_wide_16_s16_x_tied1: ++** mov (z[0-9]+\.d), #16 ++** lsl z0\.h, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_16_s16_x_tied1, svint16_t, ++ z0 = svlsl_wide_n_s16_x (p0, z0, 16), ++ z0 = svlsl_wide_x (p0, z0, 16)) ++ ++/* ++** lsl_wide_16_s16_x_untied: ++** mov (z[0-9]+\.d), #16 ++** lsl z0\.h, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_16_s16_x_untied, svint16_t, ++ z0 = svlsl_wide_n_s16_x (p0, z1, 16), ++ z0 = svlsl_wide_x (p0, z1, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_wide_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_wide_s32.c +new file mode 100644 +index 000000000..acd813df3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_wide_s32.c +@@ -0,0 +1,331 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** lsl_wide_s32_m_tied1: ++** lsl z0\.s, p0/m, z0\.s, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsl_wide_s32_m_tied1, svint32_t, svuint64_t, ++ z0 = svlsl_wide_s32_m (p0, z0, z4), ++ z0 = svlsl_wide_m (p0, z0, z4)) ++ ++/* ++** lsl_wide_s32_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** 
movprfx z0, z4 ++** lsl z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (lsl_wide_s32_m_tied2, svint32_t, svuint64_t, ++ z0_res = svlsl_wide_s32_m (p0, z4, z0), ++ z0_res = svlsl_wide_m (p0, z4, z0)) ++ ++/* ++** lsl_wide_s32_m_untied: ++** movprfx z0, z1 ++** lsl z0\.s, p0/m, z0\.s, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsl_wide_s32_m_untied, svint32_t, svuint64_t, ++ z0 = svlsl_wide_s32_m (p0, z1, z4), ++ z0 = svlsl_wide_m (p0, z1, z4)) ++ ++/* ++** lsl_wide_x0_s32_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** lsl z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_wide_x0_s32_m_tied1, svint32_t, uint64_t, ++ z0 = svlsl_wide_n_s32_m (p0, z0, x0), ++ z0 = svlsl_wide_m (p0, z0, x0)) ++ ++/* ++** lsl_wide_x0_s32_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** lsl z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_wide_x0_s32_m_untied, svint32_t, uint64_t, ++ z0 = svlsl_wide_n_s32_m (p0, z1, x0), ++ z0 = svlsl_wide_m (p0, z1, x0)) ++ ++/* ++** lsl_wide_1_s32_m_tied1: ++** lsl z0\.s, p0/m, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_1_s32_m_tied1, svint32_t, ++ z0 = svlsl_wide_n_s32_m (p0, z0, 1), ++ z0 = svlsl_wide_m (p0, z0, 1)) ++ ++/* ++** lsl_wide_1_s32_m_untied: ++** movprfx z0, z1 ++** lsl z0\.s, p0/m, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_1_s32_m_untied, svint32_t, ++ z0 = svlsl_wide_n_s32_m (p0, z1, 1), ++ z0 = svlsl_wide_m (p0, z1, 1)) ++ ++/* ++** lsl_wide_31_s32_m_tied1: ++** lsl z0\.s, p0/m, z0\.s, #31 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_31_s32_m_tied1, svint32_t, ++ z0 = svlsl_wide_n_s32_m (p0, z0, 31), ++ z0 = svlsl_wide_m (p0, z0, 31)) ++ ++/* ++** lsl_wide_31_s32_m_untied: ++** movprfx z0, z1 ++** lsl z0\.s, p0/m, z0\.s, #31 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_31_s32_m_untied, svint32_t, ++ z0 = svlsl_wide_n_s32_m (p0, z1, 31), ++ z0 = svlsl_wide_m (p0, z1, 31)) ++ ++/* ++** lsl_wide_32_s32_m_tied1: ++** mov (z[0-9]+\.d), #32 ++** lsl z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_32_s32_m_tied1, svint32_t, ++ z0 = svlsl_wide_n_s32_m (p0, z0, 32), ++ z0 = svlsl_wide_m (p0, z0, 32)) ++ ++/* ++** lsl_wide_32_s32_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #32 ++** movprfx z0, z1 ++** lsl z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_32_s32_m_untied, svint32_t, ++ z0 = svlsl_wide_n_s32_m (p0, z1, 32), ++ z0 = svlsl_wide_m (p0, z1, 32)) ++ ++/* ++** lsl_wide_s32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** lsl z0\.s, p0/m, z0\.s, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsl_wide_s32_z_tied1, svint32_t, svuint64_t, ++ z0 = svlsl_wide_s32_z (p0, z0, z4), ++ z0 = svlsl_wide_z (p0, z0, z4)) ++ ++/* ++** lsl_wide_s32_z_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.s, p0/z, z4\.s ++** lsl z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (lsl_wide_s32_z_tied2, svint32_t, svuint64_t, ++ z0_res = svlsl_wide_s32_z (p0, z4, z0), ++ z0_res = svlsl_wide_z (p0, z4, z0)) ++ ++/* ++** lsl_wide_s32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** lsl z0\.s, p0/m, z0\.s, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsl_wide_s32_z_untied, svint32_t, svuint64_t, ++ z0 = svlsl_wide_s32_z (p0, z1, z4), ++ z0 = svlsl_wide_z (p0, z1, z4)) ++ ++/* ++** lsl_wide_x0_s32_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.s, p0/z, z0\.s ++** lsl z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_wide_x0_s32_z_tied1, svint32_t, uint64_t, ++ z0 = svlsl_wide_n_s32_z (p0, z0, x0), ++ z0 = svlsl_wide_z (p0, z0, x0)) ++ ++/* ++** lsl_wide_x0_s32_z_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), x0 ++** movprfx 
z0\.s, p0/z, z1\.s ++** lsl z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_wide_x0_s32_z_untied, svint32_t, uint64_t, ++ z0 = svlsl_wide_n_s32_z (p0, z1, x0), ++ z0 = svlsl_wide_z (p0, z1, x0)) ++ ++/* ++** lsl_wide_1_s32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** lsl z0\.s, p0/m, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_1_s32_z_tied1, svint32_t, ++ z0 = svlsl_wide_n_s32_z (p0, z0, 1), ++ z0 = svlsl_wide_z (p0, z0, 1)) ++ ++/* ++** lsl_wide_1_s32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** lsl z0\.s, p0/m, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_1_s32_z_untied, svint32_t, ++ z0 = svlsl_wide_n_s32_z (p0, z1, 1), ++ z0 = svlsl_wide_z (p0, z1, 1)) ++ ++/* ++** lsl_wide_31_s32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** lsl z0\.s, p0/m, z0\.s, #31 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_31_s32_z_tied1, svint32_t, ++ z0 = svlsl_wide_n_s32_z (p0, z0, 31), ++ z0 = svlsl_wide_z (p0, z0, 31)) ++ ++/* ++** lsl_wide_31_s32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** lsl z0\.s, p0/m, z0\.s, #31 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_31_s32_z_untied, svint32_t, ++ z0 = svlsl_wide_n_s32_z (p0, z1, 31), ++ z0 = svlsl_wide_z (p0, z1, 31)) ++ ++/* ++** lsl_wide_32_s32_z_tied1: ++** mov (z[0-9]+\.d), #32 ++** movprfx z0\.s, p0/z, z0\.s ++** lsl z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_32_s32_z_tied1, svint32_t, ++ z0 = svlsl_wide_n_s32_z (p0, z0, 32), ++ z0 = svlsl_wide_z (p0, z0, 32)) ++ ++/* ++** lsl_wide_32_s32_z_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #32 ++** movprfx z0\.s, p0/z, z1\.s ++** lsl z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_32_s32_z_untied, svint32_t, ++ z0 = svlsl_wide_n_s32_z (p0, z1, 32), ++ z0 = svlsl_wide_z (p0, z1, 32)) ++ ++/* ++** lsl_wide_s32_x_tied1: ++** lsl z0\.s, z0\.s, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsl_wide_s32_x_tied1, svint32_t, svuint64_t, ++ z0 = svlsl_wide_s32_x (p0, z0, z4), ++ z0 = svlsl_wide_x (p0, z0, z4)) ++ ++/* ++** lsl_wide_s32_x_tied2: ++** lsl z0\.s, z4\.s, z0\.d ++** ret ++*/ ++TEST_DUAL_Z_REV (lsl_wide_s32_x_tied2, svint32_t, svuint64_t, ++ z0_res = svlsl_wide_s32_x (p0, z4, z0), ++ z0_res = svlsl_wide_x (p0, z4, z0)) ++ ++/* ++** lsl_wide_s32_x_untied: ++** lsl z0\.s, z1\.s, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsl_wide_s32_x_untied, svint32_t, svuint64_t, ++ z0 = svlsl_wide_s32_x (p0, z1, z4), ++ z0 = svlsl_wide_x (p0, z1, z4)) ++ ++/* ++** lsl_wide_x0_s32_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** lsl z0\.s, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_wide_x0_s32_x_tied1, svint32_t, uint64_t, ++ z0 = svlsl_wide_n_s32_x (p0, z0, x0), ++ z0 = svlsl_wide_x (p0, z0, x0)) ++ ++/* ++** lsl_wide_x0_s32_x_untied: ++** mov (z[0-9]+\.d), x0 ++** lsl z0\.s, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_wide_x0_s32_x_untied, svint32_t, uint64_t, ++ z0 = svlsl_wide_n_s32_x (p0, z1, x0), ++ z0 = svlsl_wide_x (p0, z1, x0)) ++ ++/* ++** lsl_wide_1_s32_x_tied1: ++** lsl z0\.s, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_1_s32_x_tied1, svint32_t, ++ z0 = svlsl_wide_n_s32_x (p0, z0, 1), ++ z0 = svlsl_wide_x (p0, z0, 1)) ++ ++/* ++** lsl_wide_1_s32_x_untied: ++** lsl z0\.s, z1\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_1_s32_x_untied, svint32_t, ++ z0 = svlsl_wide_n_s32_x (p0, z1, 1), ++ z0 = svlsl_wide_x (p0, z1, 1)) ++ ++/* ++** lsl_wide_31_s32_x_tied1: ++** lsl z0\.s, z0\.s, #31 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_31_s32_x_tied1, svint32_t, ++ z0 = svlsl_wide_n_s32_x (p0, z0, 31), ++ z0 = svlsl_wide_x (p0, z0, 31)) ++ ++/* ++** lsl_wide_31_s32_x_untied: 
++** lsl z0\.s, z1\.s, #31 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_31_s32_x_untied, svint32_t, ++ z0 = svlsl_wide_n_s32_x (p0, z1, 31), ++ z0 = svlsl_wide_x (p0, z1, 31)) ++ ++/* ++** lsl_wide_32_s32_x_tied1: ++** mov (z[0-9]+\.d), #32 ++** lsl z0\.s, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_32_s32_x_tied1, svint32_t, ++ z0 = svlsl_wide_n_s32_x (p0, z0, 32), ++ z0 = svlsl_wide_x (p0, z0, 32)) ++ ++/* ++** lsl_wide_32_s32_x_untied: ++** mov (z[0-9]+\.d), #32 ++** lsl z0\.s, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_32_s32_x_untied, svint32_t, ++ z0 = svlsl_wide_n_s32_x (p0, z1, 32), ++ z0 = svlsl_wide_x (p0, z1, 32)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_wide_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_wide_s8.c +new file mode 100644 +index 000000000..17e8e8685 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_wide_s8.c +@@ -0,0 +1,331 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** lsl_wide_s8_m_tied1: ++** lsl z0\.b, p0/m, z0\.b, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsl_wide_s8_m_tied1, svint8_t, svuint64_t, ++ z0 = svlsl_wide_s8_m (p0, z0, z4), ++ z0 = svlsl_wide_m (p0, z0, z4)) ++ ++/* ++** lsl_wide_s8_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z4 ++** lsl z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (lsl_wide_s8_m_tied2, svint8_t, svuint64_t, ++ z0_res = svlsl_wide_s8_m (p0, z4, z0), ++ z0_res = svlsl_wide_m (p0, z4, z0)) ++ ++/* ++** lsl_wide_s8_m_untied: ++** movprfx z0, z1 ++** lsl z0\.b, p0/m, z0\.b, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsl_wide_s8_m_untied, svint8_t, svuint64_t, ++ z0 = svlsl_wide_s8_m (p0, z1, z4), ++ z0 = svlsl_wide_m (p0, z1, z4)) ++ ++/* ++** lsl_wide_x0_s8_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** lsl z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_wide_x0_s8_m_tied1, svint8_t, uint64_t, ++ z0 = svlsl_wide_n_s8_m (p0, z0, x0), ++ z0 = svlsl_wide_m (p0, z0, x0)) ++ ++/* ++** lsl_wide_x0_s8_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** lsl z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_wide_x0_s8_m_untied, svint8_t, uint64_t, ++ z0 = svlsl_wide_n_s8_m (p0, z1, x0), ++ z0 = svlsl_wide_m (p0, z1, x0)) ++ ++/* ++** lsl_wide_1_s8_m_tied1: ++** lsl z0\.b, p0/m, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_1_s8_m_tied1, svint8_t, ++ z0 = svlsl_wide_n_s8_m (p0, z0, 1), ++ z0 = svlsl_wide_m (p0, z0, 1)) ++ ++/* ++** lsl_wide_1_s8_m_untied: ++** movprfx z0, z1 ++** lsl z0\.b, p0/m, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_1_s8_m_untied, svint8_t, ++ z0 = svlsl_wide_n_s8_m (p0, z1, 1), ++ z0 = svlsl_wide_m (p0, z1, 1)) ++ ++/* ++** lsl_wide_7_s8_m_tied1: ++** lsl z0\.b, p0/m, z0\.b, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_7_s8_m_tied1, svint8_t, ++ z0 = svlsl_wide_n_s8_m (p0, z0, 7), ++ z0 = svlsl_wide_m (p0, z0, 7)) ++ ++/* ++** lsl_wide_7_s8_m_untied: ++** movprfx z0, z1 ++** lsl z0\.b, p0/m, z0\.b, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_7_s8_m_untied, svint8_t, ++ z0 = svlsl_wide_n_s8_m (p0, z1, 7), ++ z0 = svlsl_wide_m (p0, z1, 7)) ++ ++/* ++** lsl_wide_8_s8_m_tied1: ++** mov (z[0-9]+\.d), #8 ++** lsl z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_8_s8_m_tied1, svint8_t, ++ z0 = svlsl_wide_n_s8_m (p0, z0, 8), ++ z0 = svlsl_wide_m (p0, z0, 8)) ++ ++/* ++** lsl_wide_8_s8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #8 ++** movprfx z0, z1 ++** lsl z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z 
(lsl_wide_8_s8_m_untied, svint8_t, ++ z0 = svlsl_wide_n_s8_m (p0, z1, 8), ++ z0 = svlsl_wide_m (p0, z1, 8)) ++ ++/* ++** lsl_wide_s8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** lsl z0\.b, p0/m, z0\.b, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsl_wide_s8_z_tied1, svint8_t, svuint64_t, ++ z0 = svlsl_wide_s8_z (p0, z0, z4), ++ z0 = svlsl_wide_z (p0, z0, z4)) ++ ++/* ++** lsl_wide_s8_z_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.b, p0/z, z4\.b ++** lsl z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (lsl_wide_s8_z_tied2, svint8_t, svuint64_t, ++ z0_res = svlsl_wide_s8_z (p0, z4, z0), ++ z0_res = svlsl_wide_z (p0, z4, z0)) ++ ++/* ++** lsl_wide_s8_z_untied: ++** movprfx z0\.b, p0/z, z1\.b ++** lsl z0\.b, p0/m, z0\.b, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsl_wide_s8_z_untied, svint8_t, svuint64_t, ++ z0 = svlsl_wide_s8_z (p0, z1, z4), ++ z0 = svlsl_wide_z (p0, z1, z4)) ++ ++/* ++** lsl_wide_x0_s8_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.b, p0/z, z0\.b ++** lsl z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_wide_x0_s8_z_tied1, svint8_t, uint64_t, ++ z0 = svlsl_wide_n_s8_z (p0, z0, x0), ++ z0 = svlsl_wide_z (p0, z0, x0)) ++ ++/* ++** lsl_wide_x0_s8_z_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.b, p0/z, z1\.b ++** lsl z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_wide_x0_s8_z_untied, svint8_t, uint64_t, ++ z0 = svlsl_wide_n_s8_z (p0, z1, x0), ++ z0 = svlsl_wide_z (p0, z1, x0)) ++ ++/* ++** lsl_wide_1_s8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** lsl z0\.b, p0/m, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_1_s8_z_tied1, svint8_t, ++ z0 = svlsl_wide_n_s8_z (p0, z0, 1), ++ z0 = svlsl_wide_z (p0, z0, 1)) ++ ++/* ++** lsl_wide_1_s8_z_untied: ++** movprfx z0\.b, p0/z, z1\.b ++** lsl z0\.b, p0/m, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_1_s8_z_untied, svint8_t, ++ z0 = svlsl_wide_n_s8_z (p0, z1, 1), ++ z0 = svlsl_wide_z (p0, z1, 1)) ++ ++/* ++** lsl_wide_7_s8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** lsl z0\.b, p0/m, z0\.b, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_7_s8_z_tied1, svint8_t, ++ z0 = svlsl_wide_n_s8_z (p0, z0, 7), ++ z0 = svlsl_wide_z (p0, z0, 7)) ++ ++/* ++** lsl_wide_7_s8_z_untied: ++** movprfx z0\.b, p0/z, z1\.b ++** lsl z0\.b, p0/m, z0\.b, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_7_s8_z_untied, svint8_t, ++ z0 = svlsl_wide_n_s8_z (p0, z1, 7), ++ z0 = svlsl_wide_z (p0, z1, 7)) ++ ++/* ++** lsl_wide_8_s8_z_tied1: ++** mov (z[0-9]+\.d), #8 ++** movprfx z0\.b, p0/z, z0\.b ++** lsl z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_8_s8_z_tied1, svint8_t, ++ z0 = svlsl_wide_n_s8_z (p0, z0, 8), ++ z0 = svlsl_wide_z (p0, z0, 8)) ++ ++/* ++** lsl_wide_8_s8_z_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #8 ++** movprfx z0\.b, p0/z, z1\.b ++** lsl z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_8_s8_z_untied, svint8_t, ++ z0 = svlsl_wide_n_s8_z (p0, z1, 8), ++ z0 = svlsl_wide_z (p0, z1, 8)) ++ ++/* ++** lsl_wide_s8_x_tied1: ++** lsl z0\.b, z0\.b, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsl_wide_s8_x_tied1, svint8_t, svuint64_t, ++ z0 = svlsl_wide_s8_x (p0, z0, z4), ++ z0 = svlsl_wide_x (p0, z0, z4)) ++ ++/* ++** lsl_wide_s8_x_tied2: ++** lsl z0\.b, z4\.b, z0\.d ++** ret ++*/ ++TEST_DUAL_Z_REV (lsl_wide_s8_x_tied2, svint8_t, svuint64_t, ++ z0_res = svlsl_wide_s8_x (p0, z4, z0), ++ z0_res = svlsl_wide_x (p0, z4, z0)) ++ ++/* ++** lsl_wide_s8_x_untied: ++** lsl z0\.b, z1\.b, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsl_wide_s8_x_untied, svint8_t, svuint64_t, ++ z0 = svlsl_wide_s8_x 
(p0, z1, z4), ++ z0 = svlsl_wide_x (p0, z1, z4)) ++ ++/* ++** lsl_wide_x0_s8_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** lsl z0\.b, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_wide_x0_s8_x_tied1, svint8_t, uint64_t, ++ z0 = svlsl_wide_n_s8_x (p0, z0, x0), ++ z0 = svlsl_wide_x (p0, z0, x0)) ++ ++/* ++** lsl_wide_x0_s8_x_untied: ++** mov (z[0-9]+\.d), x0 ++** lsl z0\.b, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_wide_x0_s8_x_untied, svint8_t, uint64_t, ++ z0 = svlsl_wide_n_s8_x (p0, z1, x0), ++ z0 = svlsl_wide_x (p0, z1, x0)) ++ ++/* ++** lsl_wide_1_s8_x_tied1: ++** lsl z0\.b, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_1_s8_x_tied1, svint8_t, ++ z0 = svlsl_wide_n_s8_x (p0, z0, 1), ++ z0 = svlsl_wide_x (p0, z0, 1)) ++ ++/* ++** lsl_wide_1_s8_x_untied: ++** lsl z0\.b, z1\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_1_s8_x_untied, svint8_t, ++ z0 = svlsl_wide_n_s8_x (p0, z1, 1), ++ z0 = svlsl_wide_x (p0, z1, 1)) ++ ++/* ++** lsl_wide_7_s8_x_tied1: ++** lsl z0\.b, z0\.b, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_7_s8_x_tied1, svint8_t, ++ z0 = svlsl_wide_n_s8_x (p0, z0, 7), ++ z0 = svlsl_wide_x (p0, z0, 7)) ++ ++/* ++** lsl_wide_7_s8_x_untied: ++** lsl z0\.b, z1\.b, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_7_s8_x_untied, svint8_t, ++ z0 = svlsl_wide_n_s8_x (p0, z1, 7), ++ z0 = svlsl_wide_x (p0, z1, 7)) ++ ++/* ++** lsl_wide_8_s8_x_tied1: ++** mov (z[0-9]+\.d), #8 ++** lsl z0\.b, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_8_s8_x_tied1, svint8_t, ++ z0 = svlsl_wide_n_s8_x (p0, z0, 8), ++ z0 = svlsl_wide_x (p0, z0, 8)) ++ ++/* ++** lsl_wide_8_s8_x_untied: ++** mov (z[0-9]+\.d), #8 ++** lsl z0\.b, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_8_s8_x_untied, svint8_t, ++ z0 = svlsl_wide_n_s8_x (p0, z1, 8), ++ z0 = svlsl_wide_x (p0, z1, 8)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_wide_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_wide_u16.c +new file mode 100644 +index 000000000..cff24a850 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_wide_u16.c +@@ -0,0 +1,331 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** lsl_wide_u16_m_tied1: ++** lsl z0\.h, p0/m, z0\.h, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsl_wide_u16_m_tied1, svuint16_t, svuint64_t, ++ z0 = svlsl_wide_u16_m (p0, z0, z4), ++ z0 = svlsl_wide_m (p0, z0, z4)) ++ ++/* ++** lsl_wide_u16_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z4 ++** lsl z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (lsl_wide_u16_m_tied2, svuint16_t, svuint64_t, ++ z0_res = svlsl_wide_u16_m (p0, z4, z0), ++ z0_res = svlsl_wide_m (p0, z4, z0)) ++ ++/* ++** lsl_wide_u16_m_untied: ++** movprfx z0, z1 ++** lsl z0\.h, p0/m, z0\.h, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsl_wide_u16_m_untied, svuint16_t, svuint64_t, ++ z0 = svlsl_wide_u16_m (p0, z1, z4), ++ z0 = svlsl_wide_m (p0, z1, z4)) ++ ++/* ++** lsl_wide_x0_u16_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** lsl z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_wide_x0_u16_m_tied1, svuint16_t, uint64_t, ++ z0 = svlsl_wide_n_u16_m (p0, z0, x0), ++ z0 = svlsl_wide_m (p0, z0, x0)) ++ ++/* ++** lsl_wide_x0_u16_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** lsl z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_wide_x0_u16_m_untied, svuint16_t, uint64_t, ++ z0 = svlsl_wide_n_u16_m (p0, z1, x0), ++ z0 = svlsl_wide_m (p0, z1, x0)) ++ ++/* ++** lsl_wide_1_u16_m_tied1: ++** lsl z0\.h, p0/m, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z 
(lsl_wide_1_u16_m_tied1, svuint16_t, ++ z0 = svlsl_wide_n_u16_m (p0, z0, 1), ++ z0 = svlsl_wide_m (p0, z0, 1)) ++ ++/* ++** lsl_wide_1_u16_m_untied: ++** movprfx z0, z1 ++** lsl z0\.h, p0/m, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_1_u16_m_untied, svuint16_t, ++ z0 = svlsl_wide_n_u16_m (p0, z1, 1), ++ z0 = svlsl_wide_m (p0, z1, 1)) ++ ++/* ++** lsl_wide_15_u16_m_tied1: ++** lsl z0\.h, p0/m, z0\.h, #15 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_15_u16_m_tied1, svuint16_t, ++ z0 = svlsl_wide_n_u16_m (p0, z0, 15), ++ z0 = svlsl_wide_m (p0, z0, 15)) ++ ++/* ++** lsl_wide_15_u16_m_untied: ++** movprfx z0, z1 ++** lsl z0\.h, p0/m, z0\.h, #15 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_15_u16_m_untied, svuint16_t, ++ z0 = svlsl_wide_n_u16_m (p0, z1, 15), ++ z0 = svlsl_wide_m (p0, z1, 15)) ++ ++/* ++** lsl_wide_16_u16_m_tied1: ++** mov (z[0-9]+\.d), #16 ++** lsl z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_16_u16_m_tied1, svuint16_t, ++ z0 = svlsl_wide_n_u16_m (p0, z0, 16), ++ z0 = svlsl_wide_m (p0, z0, 16)) ++ ++/* ++** lsl_wide_16_u16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #16 ++** movprfx z0, z1 ++** lsl z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_16_u16_m_untied, svuint16_t, ++ z0 = svlsl_wide_n_u16_m (p0, z1, 16), ++ z0 = svlsl_wide_m (p0, z1, 16)) ++ ++/* ++** lsl_wide_u16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** lsl z0\.h, p0/m, z0\.h, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsl_wide_u16_z_tied1, svuint16_t, svuint64_t, ++ z0 = svlsl_wide_u16_z (p0, z0, z4), ++ z0 = svlsl_wide_z (p0, z0, z4)) ++ ++/* ++** lsl_wide_u16_z_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.h, p0/z, z4\.h ++** lsl z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (lsl_wide_u16_z_tied2, svuint16_t, svuint64_t, ++ z0_res = svlsl_wide_u16_z (p0, z4, z0), ++ z0_res = svlsl_wide_z (p0, z4, z0)) ++ ++/* ++** lsl_wide_u16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** lsl z0\.h, p0/m, z0\.h, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsl_wide_u16_z_untied, svuint16_t, svuint64_t, ++ z0 = svlsl_wide_u16_z (p0, z1, z4), ++ z0 = svlsl_wide_z (p0, z1, z4)) ++ ++/* ++** lsl_wide_x0_u16_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.h, p0/z, z0\.h ++** lsl z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_wide_x0_u16_z_tied1, svuint16_t, uint64_t, ++ z0 = svlsl_wide_n_u16_z (p0, z0, x0), ++ z0 = svlsl_wide_z (p0, z0, x0)) ++ ++/* ++** lsl_wide_x0_u16_z_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.h, p0/z, z1\.h ++** lsl z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_wide_x0_u16_z_untied, svuint16_t, uint64_t, ++ z0 = svlsl_wide_n_u16_z (p0, z1, x0), ++ z0 = svlsl_wide_z (p0, z1, x0)) ++ ++/* ++** lsl_wide_1_u16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** lsl z0\.h, p0/m, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_1_u16_z_tied1, svuint16_t, ++ z0 = svlsl_wide_n_u16_z (p0, z0, 1), ++ z0 = svlsl_wide_z (p0, z0, 1)) ++ ++/* ++** lsl_wide_1_u16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** lsl z0\.h, p0/m, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_1_u16_z_untied, svuint16_t, ++ z0 = svlsl_wide_n_u16_z (p0, z1, 1), ++ z0 = svlsl_wide_z (p0, z1, 1)) ++ ++/* ++** lsl_wide_15_u16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** lsl z0\.h, p0/m, z0\.h, #15 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_15_u16_z_tied1, svuint16_t, ++ z0 = svlsl_wide_n_u16_z (p0, z0, 15), ++ z0 = svlsl_wide_z (p0, z0, 15)) ++ ++/* ++** lsl_wide_15_u16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** lsl z0\.h, p0/m, z0\.h, #15 ++** ret ++*/ 
++TEST_UNIFORM_Z (lsl_wide_15_u16_z_untied, svuint16_t, ++ z0 = svlsl_wide_n_u16_z (p0, z1, 15), ++ z0 = svlsl_wide_z (p0, z1, 15)) ++ ++/* ++** lsl_wide_16_u16_z_tied1: ++** mov (z[0-9]+\.d), #16 ++** movprfx z0\.h, p0/z, z0\.h ++** lsl z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_16_u16_z_tied1, svuint16_t, ++ z0 = svlsl_wide_n_u16_z (p0, z0, 16), ++ z0 = svlsl_wide_z (p0, z0, 16)) ++ ++/* ++** lsl_wide_16_u16_z_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #16 ++** movprfx z0\.h, p0/z, z1\.h ++** lsl z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_16_u16_z_untied, svuint16_t, ++ z0 = svlsl_wide_n_u16_z (p0, z1, 16), ++ z0 = svlsl_wide_z (p0, z1, 16)) ++ ++/* ++** lsl_wide_u16_x_tied1: ++** lsl z0\.h, z0\.h, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsl_wide_u16_x_tied1, svuint16_t, svuint64_t, ++ z0 = svlsl_wide_u16_x (p0, z0, z4), ++ z0 = svlsl_wide_x (p0, z0, z4)) ++ ++/* ++** lsl_wide_u16_x_tied2: ++** lsl z0\.h, z4\.h, z0\.d ++** ret ++*/ ++TEST_DUAL_Z_REV (lsl_wide_u16_x_tied2, svuint16_t, svuint64_t, ++ z0_res = svlsl_wide_u16_x (p0, z4, z0), ++ z0_res = svlsl_wide_x (p0, z4, z0)) ++ ++/* ++** lsl_wide_u16_x_untied: ++** lsl z0\.h, z1\.h, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsl_wide_u16_x_untied, svuint16_t, svuint64_t, ++ z0 = svlsl_wide_u16_x (p0, z1, z4), ++ z0 = svlsl_wide_x (p0, z1, z4)) ++ ++/* ++** lsl_wide_x0_u16_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** lsl z0\.h, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_wide_x0_u16_x_tied1, svuint16_t, uint64_t, ++ z0 = svlsl_wide_n_u16_x (p0, z0, x0), ++ z0 = svlsl_wide_x (p0, z0, x0)) ++ ++/* ++** lsl_wide_x0_u16_x_untied: ++** mov (z[0-9]+\.d), x0 ++** lsl z0\.h, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_wide_x0_u16_x_untied, svuint16_t, uint64_t, ++ z0 = svlsl_wide_n_u16_x (p0, z1, x0), ++ z0 = svlsl_wide_x (p0, z1, x0)) ++ ++/* ++** lsl_wide_1_u16_x_tied1: ++** lsl z0\.h, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_1_u16_x_tied1, svuint16_t, ++ z0 = svlsl_wide_n_u16_x (p0, z0, 1), ++ z0 = svlsl_wide_x (p0, z0, 1)) ++ ++/* ++** lsl_wide_1_u16_x_untied: ++** lsl z0\.h, z1\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_1_u16_x_untied, svuint16_t, ++ z0 = svlsl_wide_n_u16_x (p0, z1, 1), ++ z0 = svlsl_wide_x (p0, z1, 1)) ++ ++/* ++** lsl_wide_15_u16_x_tied1: ++** lsl z0\.h, z0\.h, #15 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_15_u16_x_tied1, svuint16_t, ++ z0 = svlsl_wide_n_u16_x (p0, z0, 15), ++ z0 = svlsl_wide_x (p0, z0, 15)) ++ ++/* ++** lsl_wide_15_u16_x_untied: ++** lsl z0\.h, z1\.h, #15 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_15_u16_x_untied, svuint16_t, ++ z0 = svlsl_wide_n_u16_x (p0, z1, 15), ++ z0 = svlsl_wide_x (p0, z1, 15)) ++ ++/* ++** lsl_wide_16_u16_x_tied1: ++** mov (z[0-9]+\.d), #16 ++** lsl z0\.h, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_16_u16_x_tied1, svuint16_t, ++ z0 = svlsl_wide_n_u16_x (p0, z0, 16), ++ z0 = svlsl_wide_x (p0, z0, 16)) ++ ++/* ++** lsl_wide_16_u16_x_untied: ++** mov (z[0-9]+\.d), #16 ++** lsl z0\.h, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_16_u16_x_untied, svuint16_t, ++ z0 = svlsl_wide_n_u16_x (p0, z1, 16), ++ z0 = svlsl_wide_x (p0, z1, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_wide_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_wide_u32.c +new file mode 100644 +index 000000000..7b1afab49 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_wide_u32.c +@@ -0,0 +1,331 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* 
++** lsl_wide_u32_m_tied1: ++** lsl z0\.s, p0/m, z0\.s, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsl_wide_u32_m_tied1, svuint32_t, svuint64_t, ++ z0 = svlsl_wide_u32_m (p0, z0, z4), ++ z0 = svlsl_wide_m (p0, z0, z4)) ++ ++/* ++** lsl_wide_u32_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z4 ++** lsl z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (lsl_wide_u32_m_tied2, svuint32_t, svuint64_t, ++ z0_res = svlsl_wide_u32_m (p0, z4, z0), ++ z0_res = svlsl_wide_m (p0, z4, z0)) ++ ++/* ++** lsl_wide_u32_m_untied: ++** movprfx z0, z1 ++** lsl z0\.s, p0/m, z0\.s, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsl_wide_u32_m_untied, svuint32_t, svuint64_t, ++ z0 = svlsl_wide_u32_m (p0, z1, z4), ++ z0 = svlsl_wide_m (p0, z1, z4)) ++ ++/* ++** lsl_wide_x0_u32_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** lsl z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_wide_x0_u32_m_tied1, svuint32_t, uint64_t, ++ z0 = svlsl_wide_n_u32_m (p0, z0, x0), ++ z0 = svlsl_wide_m (p0, z0, x0)) ++ ++/* ++** lsl_wide_x0_u32_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** lsl z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_wide_x0_u32_m_untied, svuint32_t, uint64_t, ++ z0 = svlsl_wide_n_u32_m (p0, z1, x0), ++ z0 = svlsl_wide_m (p0, z1, x0)) ++ ++/* ++** lsl_wide_1_u32_m_tied1: ++** lsl z0\.s, p0/m, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_1_u32_m_tied1, svuint32_t, ++ z0 = svlsl_wide_n_u32_m (p0, z0, 1), ++ z0 = svlsl_wide_m (p0, z0, 1)) ++ ++/* ++** lsl_wide_1_u32_m_untied: ++** movprfx z0, z1 ++** lsl z0\.s, p0/m, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_1_u32_m_untied, svuint32_t, ++ z0 = svlsl_wide_n_u32_m (p0, z1, 1), ++ z0 = svlsl_wide_m (p0, z1, 1)) ++ ++/* ++** lsl_wide_31_u32_m_tied1: ++** lsl z0\.s, p0/m, z0\.s, #31 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_31_u32_m_tied1, svuint32_t, ++ z0 = svlsl_wide_n_u32_m (p0, z0, 31), ++ z0 = svlsl_wide_m (p0, z0, 31)) ++ ++/* ++** lsl_wide_31_u32_m_untied: ++** movprfx z0, z1 ++** lsl z0\.s, p0/m, z0\.s, #31 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_31_u32_m_untied, svuint32_t, ++ z0 = svlsl_wide_n_u32_m (p0, z1, 31), ++ z0 = svlsl_wide_m (p0, z1, 31)) ++ ++/* ++** lsl_wide_32_u32_m_tied1: ++** mov (z[0-9]+\.d), #32 ++** lsl z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_32_u32_m_tied1, svuint32_t, ++ z0 = svlsl_wide_n_u32_m (p0, z0, 32), ++ z0 = svlsl_wide_m (p0, z0, 32)) ++ ++/* ++** lsl_wide_32_u32_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #32 ++** movprfx z0, z1 ++** lsl z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_32_u32_m_untied, svuint32_t, ++ z0 = svlsl_wide_n_u32_m (p0, z1, 32), ++ z0 = svlsl_wide_m (p0, z1, 32)) ++ ++/* ++** lsl_wide_u32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** lsl z0\.s, p0/m, z0\.s, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsl_wide_u32_z_tied1, svuint32_t, svuint64_t, ++ z0 = svlsl_wide_u32_z (p0, z0, z4), ++ z0 = svlsl_wide_z (p0, z0, z4)) ++ ++/* ++** lsl_wide_u32_z_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.s, p0/z, z4\.s ++** lsl z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (lsl_wide_u32_z_tied2, svuint32_t, svuint64_t, ++ z0_res = svlsl_wide_u32_z (p0, z4, z0), ++ z0_res = svlsl_wide_z (p0, z4, z0)) ++ ++/* ++** lsl_wide_u32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** lsl z0\.s, p0/m, z0\.s, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsl_wide_u32_z_untied, svuint32_t, svuint64_t, ++ z0 = svlsl_wide_u32_z (p0, z1, z4), ++ z0 = svlsl_wide_z (p0, z1, z4)) ++ ++/* ++** lsl_wide_x0_u32_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.s, 
p0/z, z0\.s ++** lsl z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_wide_x0_u32_z_tied1, svuint32_t, uint64_t, ++ z0 = svlsl_wide_n_u32_z (p0, z0, x0), ++ z0 = svlsl_wide_z (p0, z0, x0)) ++ ++/* ++** lsl_wide_x0_u32_z_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.s, p0/z, z1\.s ++** lsl z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_wide_x0_u32_z_untied, svuint32_t, uint64_t, ++ z0 = svlsl_wide_n_u32_z (p0, z1, x0), ++ z0 = svlsl_wide_z (p0, z1, x0)) ++ ++/* ++** lsl_wide_1_u32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** lsl z0\.s, p0/m, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_1_u32_z_tied1, svuint32_t, ++ z0 = svlsl_wide_n_u32_z (p0, z0, 1), ++ z0 = svlsl_wide_z (p0, z0, 1)) ++ ++/* ++** lsl_wide_1_u32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** lsl z0\.s, p0/m, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_1_u32_z_untied, svuint32_t, ++ z0 = svlsl_wide_n_u32_z (p0, z1, 1), ++ z0 = svlsl_wide_z (p0, z1, 1)) ++ ++/* ++** lsl_wide_31_u32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** lsl z0\.s, p0/m, z0\.s, #31 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_31_u32_z_tied1, svuint32_t, ++ z0 = svlsl_wide_n_u32_z (p0, z0, 31), ++ z0 = svlsl_wide_z (p0, z0, 31)) ++ ++/* ++** lsl_wide_31_u32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** lsl z0\.s, p0/m, z0\.s, #31 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_31_u32_z_untied, svuint32_t, ++ z0 = svlsl_wide_n_u32_z (p0, z1, 31), ++ z0 = svlsl_wide_z (p0, z1, 31)) ++ ++/* ++** lsl_wide_32_u32_z_tied1: ++** mov (z[0-9]+\.d), #32 ++** movprfx z0\.s, p0/z, z0\.s ++** lsl z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_32_u32_z_tied1, svuint32_t, ++ z0 = svlsl_wide_n_u32_z (p0, z0, 32), ++ z0 = svlsl_wide_z (p0, z0, 32)) ++ ++/* ++** lsl_wide_32_u32_z_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #32 ++** movprfx z0\.s, p0/z, z1\.s ++** lsl z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_32_u32_z_untied, svuint32_t, ++ z0 = svlsl_wide_n_u32_z (p0, z1, 32), ++ z0 = svlsl_wide_z (p0, z1, 32)) ++ ++/* ++** lsl_wide_u32_x_tied1: ++** lsl z0\.s, z0\.s, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsl_wide_u32_x_tied1, svuint32_t, svuint64_t, ++ z0 = svlsl_wide_u32_x (p0, z0, z4), ++ z0 = svlsl_wide_x (p0, z0, z4)) ++ ++/* ++** lsl_wide_u32_x_tied2: ++** lsl z0\.s, z4\.s, z0\.d ++** ret ++*/ ++TEST_DUAL_Z_REV (lsl_wide_u32_x_tied2, svuint32_t, svuint64_t, ++ z0_res = svlsl_wide_u32_x (p0, z4, z0), ++ z0_res = svlsl_wide_x (p0, z4, z0)) ++ ++/* ++** lsl_wide_u32_x_untied: ++** lsl z0\.s, z1\.s, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsl_wide_u32_x_untied, svuint32_t, svuint64_t, ++ z0 = svlsl_wide_u32_x (p0, z1, z4), ++ z0 = svlsl_wide_x (p0, z1, z4)) ++ ++/* ++** lsl_wide_x0_u32_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** lsl z0\.s, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_wide_x0_u32_x_tied1, svuint32_t, uint64_t, ++ z0 = svlsl_wide_n_u32_x (p0, z0, x0), ++ z0 = svlsl_wide_x (p0, z0, x0)) ++ ++/* ++** lsl_wide_x0_u32_x_untied: ++** mov (z[0-9]+\.d), x0 ++** lsl z0\.s, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_wide_x0_u32_x_untied, svuint32_t, uint64_t, ++ z0 = svlsl_wide_n_u32_x (p0, z1, x0), ++ z0 = svlsl_wide_x (p0, z1, x0)) ++ ++/* ++** lsl_wide_1_u32_x_tied1: ++** lsl z0\.s, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_1_u32_x_tied1, svuint32_t, ++ z0 = svlsl_wide_n_u32_x (p0, z0, 1), ++ z0 = svlsl_wide_x (p0, z0, 1)) ++ ++/* ++** lsl_wide_1_u32_x_untied: ++** lsl z0\.s, z1\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_1_u32_x_untied, svuint32_t, ++ z0 = 
svlsl_wide_n_u32_x (p0, z1, 1), ++ z0 = svlsl_wide_x (p0, z1, 1)) ++ ++/* ++** lsl_wide_31_u32_x_tied1: ++** lsl z0\.s, z0\.s, #31 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_31_u32_x_tied1, svuint32_t, ++ z0 = svlsl_wide_n_u32_x (p0, z0, 31), ++ z0 = svlsl_wide_x (p0, z0, 31)) ++ ++/* ++** lsl_wide_31_u32_x_untied: ++** lsl z0\.s, z1\.s, #31 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_31_u32_x_untied, svuint32_t, ++ z0 = svlsl_wide_n_u32_x (p0, z1, 31), ++ z0 = svlsl_wide_x (p0, z1, 31)) ++ ++/* ++** lsl_wide_32_u32_x_tied1: ++** mov (z[0-9]+\.d), #32 ++** lsl z0\.s, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_32_u32_x_tied1, svuint32_t, ++ z0 = svlsl_wide_n_u32_x (p0, z0, 32), ++ z0 = svlsl_wide_x (p0, z0, 32)) ++ ++/* ++** lsl_wide_32_u32_x_untied: ++** mov (z[0-9]+\.d), #32 ++** lsl z0\.s, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_32_u32_x_untied, svuint32_t, ++ z0 = svlsl_wide_n_u32_x (p0, z1, 32), ++ z0 = svlsl_wide_x (p0, z1, 32)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_wide_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_wide_u8.c +new file mode 100644 +index 000000000..df8b1ec86 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_wide_u8.c +@@ -0,0 +1,331 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** lsl_wide_u8_m_tied1: ++** lsl z0\.b, p0/m, z0\.b, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsl_wide_u8_m_tied1, svuint8_t, svuint64_t, ++ z0 = svlsl_wide_u8_m (p0, z0, z4), ++ z0 = svlsl_wide_m (p0, z0, z4)) ++ ++/* ++** lsl_wide_u8_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z4 ++** lsl z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (lsl_wide_u8_m_tied2, svuint8_t, svuint64_t, ++ z0_res = svlsl_wide_u8_m (p0, z4, z0), ++ z0_res = svlsl_wide_m (p0, z4, z0)) ++ ++/* ++** lsl_wide_u8_m_untied: ++** movprfx z0, z1 ++** lsl z0\.b, p0/m, z0\.b, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsl_wide_u8_m_untied, svuint8_t, svuint64_t, ++ z0 = svlsl_wide_u8_m (p0, z1, z4), ++ z0 = svlsl_wide_m (p0, z1, z4)) ++ ++/* ++** lsl_wide_x0_u8_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** lsl z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_wide_x0_u8_m_tied1, svuint8_t, uint64_t, ++ z0 = svlsl_wide_n_u8_m (p0, z0, x0), ++ z0 = svlsl_wide_m (p0, z0, x0)) ++ ++/* ++** lsl_wide_x0_u8_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** lsl z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_wide_x0_u8_m_untied, svuint8_t, uint64_t, ++ z0 = svlsl_wide_n_u8_m (p0, z1, x0), ++ z0 = svlsl_wide_m (p0, z1, x0)) ++ ++/* ++** lsl_wide_1_u8_m_tied1: ++** lsl z0\.b, p0/m, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_1_u8_m_tied1, svuint8_t, ++ z0 = svlsl_wide_n_u8_m (p0, z0, 1), ++ z0 = svlsl_wide_m (p0, z0, 1)) ++ ++/* ++** lsl_wide_1_u8_m_untied: ++** movprfx z0, z1 ++** lsl z0\.b, p0/m, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_1_u8_m_untied, svuint8_t, ++ z0 = svlsl_wide_n_u8_m (p0, z1, 1), ++ z0 = svlsl_wide_m (p0, z1, 1)) ++ ++/* ++** lsl_wide_7_u8_m_tied1: ++** lsl z0\.b, p0/m, z0\.b, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_7_u8_m_tied1, svuint8_t, ++ z0 = svlsl_wide_n_u8_m (p0, z0, 7), ++ z0 = svlsl_wide_m (p0, z0, 7)) ++ ++/* ++** lsl_wide_7_u8_m_untied: ++** movprfx z0, z1 ++** lsl z0\.b, p0/m, z0\.b, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_7_u8_m_untied, svuint8_t, ++ z0 = svlsl_wide_n_u8_m (p0, z1, 7), ++ z0 = svlsl_wide_m (p0, z1, 7)) ++ ++/* ++** lsl_wide_8_u8_m_tied1: ++** mov (z[0-9]+\.d), #8 ++** lsl z0\.b, p0/m, 
z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_8_u8_m_tied1, svuint8_t, ++ z0 = svlsl_wide_n_u8_m (p0, z0, 8), ++ z0 = svlsl_wide_m (p0, z0, 8)) ++ ++/* ++** lsl_wide_8_u8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #8 ++** movprfx z0, z1 ++** lsl z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_8_u8_m_untied, svuint8_t, ++ z0 = svlsl_wide_n_u8_m (p0, z1, 8), ++ z0 = svlsl_wide_m (p0, z1, 8)) ++ ++/* ++** lsl_wide_u8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** lsl z0\.b, p0/m, z0\.b, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsl_wide_u8_z_tied1, svuint8_t, svuint64_t, ++ z0 = svlsl_wide_u8_z (p0, z0, z4), ++ z0 = svlsl_wide_z (p0, z0, z4)) ++ ++/* ++** lsl_wide_u8_z_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.b, p0/z, z4\.b ++** lsl z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (lsl_wide_u8_z_tied2, svuint8_t, svuint64_t, ++ z0_res = svlsl_wide_u8_z (p0, z4, z0), ++ z0_res = svlsl_wide_z (p0, z4, z0)) ++ ++/* ++** lsl_wide_u8_z_untied: ++** movprfx z0\.b, p0/z, z1\.b ++** lsl z0\.b, p0/m, z0\.b, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsl_wide_u8_z_untied, svuint8_t, svuint64_t, ++ z0 = svlsl_wide_u8_z (p0, z1, z4), ++ z0 = svlsl_wide_z (p0, z1, z4)) ++ ++/* ++** lsl_wide_x0_u8_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.b, p0/z, z0\.b ++** lsl z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_wide_x0_u8_z_tied1, svuint8_t, uint64_t, ++ z0 = svlsl_wide_n_u8_z (p0, z0, x0), ++ z0 = svlsl_wide_z (p0, z0, x0)) ++ ++/* ++** lsl_wide_x0_u8_z_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.b, p0/z, z1\.b ++** lsl z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_wide_x0_u8_z_untied, svuint8_t, uint64_t, ++ z0 = svlsl_wide_n_u8_z (p0, z1, x0), ++ z0 = svlsl_wide_z (p0, z1, x0)) ++ ++/* ++** lsl_wide_1_u8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** lsl z0\.b, p0/m, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_1_u8_z_tied1, svuint8_t, ++ z0 = svlsl_wide_n_u8_z (p0, z0, 1), ++ z0 = svlsl_wide_z (p0, z0, 1)) ++ ++/* ++** lsl_wide_1_u8_z_untied: ++** movprfx z0\.b, p0/z, z1\.b ++** lsl z0\.b, p0/m, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_1_u8_z_untied, svuint8_t, ++ z0 = svlsl_wide_n_u8_z (p0, z1, 1), ++ z0 = svlsl_wide_z (p0, z1, 1)) ++ ++/* ++** lsl_wide_7_u8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** lsl z0\.b, p0/m, z0\.b, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_7_u8_z_tied1, svuint8_t, ++ z0 = svlsl_wide_n_u8_z (p0, z0, 7), ++ z0 = svlsl_wide_z (p0, z0, 7)) ++ ++/* ++** lsl_wide_7_u8_z_untied: ++** movprfx z0\.b, p0/z, z1\.b ++** lsl z0\.b, p0/m, z0\.b, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_7_u8_z_untied, svuint8_t, ++ z0 = svlsl_wide_n_u8_z (p0, z1, 7), ++ z0 = svlsl_wide_z (p0, z1, 7)) ++ ++/* ++** lsl_wide_8_u8_z_tied1: ++** mov (z[0-9]+\.d), #8 ++** movprfx z0\.b, p0/z, z0\.b ++** lsl z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_8_u8_z_tied1, svuint8_t, ++ z0 = svlsl_wide_n_u8_z (p0, z0, 8), ++ z0 = svlsl_wide_z (p0, z0, 8)) ++ ++/* ++** lsl_wide_8_u8_z_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #8 ++** movprfx z0\.b, p0/z, z1\.b ++** lsl z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_8_u8_z_untied, svuint8_t, ++ z0 = svlsl_wide_n_u8_z (p0, z1, 8), ++ z0 = svlsl_wide_z (p0, z1, 8)) ++ ++/* ++** lsl_wide_u8_x_tied1: ++** lsl z0\.b, z0\.b, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsl_wide_u8_x_tied1, svuint8_t, svuint64_t, ++ z0 = svlsl_wide_u8_x (p0, z0, z4), ++ z0 = svlsl_wide_x (p0, z0, z4)) ++ ++/* ++** lsl_wide_u8_x_tied2: ++** lsl z0\.b, z4\.b, 
z0\.d ++** ret ++*/ ++TEST_DUAL_Z_REV (lsl_wide_u8_x_tied2, svuint8_t, svuint64_t, ++ z0_res = svlsl_wide_u8_x (p0, z4, z0), ++ z0_res = svlsl_wide_x (p0, z4, z0)) ++ ++/* ++** lsl_wide_u8_x_untied: ++** lsl z0\.b, z1\.b, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsl_wide_u8_x_untied, svuint8_t, svuint64_t, ++ z0 = svlsl_wide_u8_x (p0, z1, z4), ++ z0 = svlsl_wide_x (p0, z1, z4)) ++ ++/* ++** lsl_wide_x0_u8_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** lsl z0\.b, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_wide_x0_u8_x_tied1, svuint8_t, uint64_t, ++ z0 = svlsl_wide_n_u8_x (p0, z0, x0), ++ z0 = svlsl_wide_x (p0, z0, x0)) ++ ++/* ++** lsl_wide_x0_u8_x_untied: ++** mov (z[0-9]+\.d), x0 ++** lsl z0\.b, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsl_wide_x0_u8_x_untied, svuint8_t, uint64_t, ++ z0 = svlsl_wide_n_u8_x (p0, z1, x0), ++ z0 = svlsl_wide_x (p0, z1, x0)) ++ ++/* ++** lsl_wide_1_u8_x_tied1: ++** lsl z0\.b, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_1_u8_x_tied1, svuint8_t, ++ z0 = svlsl_wide_n_u8_x (p0, z0, 1), ++ z0 = svlsl_wide_x (p0, z0, 1)) ++ ++/* ++** lsl_wide_1_u8_x_untied: ++** lsl z0\.b, z1\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_1_u8_x_untied, svuint8_t, ++ z0 = svlsl_wide_n_u8_x (p0, z1, 1), ++ z0 = svlsl_wide_x (p0, z1, 1)) ++ ++/* ++** lsl_wide_7_u8_x_tied1: ++** lsl z0\.b, z0\.b, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_7_u8_x_tied1, svuint8_t, ++ z0 = svlsl_wide_n_u8_x (p0, z0, 7), ++ z0 = svlsl_wide_x (p0, z0, 7)) ++ ++/* ++** lsl_wide_7_u8_x_untied: ++** lsl z0\.b, z1\.b, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_7_u8_x_untied, svuint8_t, ++ z0 = svlsl_wide_n_u8_x (p0, z1, 7), ++ z0 = svlsl_wide_x (p0, z1, 7)) ++ ++/* ++** lsl_wide_8_u8_x_tied1: ++** mov (z[0-9]+\.d), #8 ++** lsl z0\.b, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_8_u8_x_tied1, svuint8_t, ++ z0 = svlsl_wide_n_u8_x (p0, z0, 8), ++ z0 = svlsl_wide_x (p0, z0, 8)) ++ ++/* ++** lsl_wide_8_u8_x_untied: ++** mov (z[0-9]+\.d), #8 ++** lsl z0\.b, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsl_wide_8_u8_x_untied, svuint8_t, ++ z0 = svlsl_wide_n_u8_x (p0, z1, 8), ++ z0 = svlsl_wide_x (p0, z1, 8)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsr_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsr_u16.c +new file mode 100644 +index 000000000..61575645f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsr_u16.c +@@ -0,0 +1,340 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** lsr_u16_m_tied1: ++** lsr z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_u16_m_tied1, svuint16_t, ++ z0 = svlsr_u16_m (p0, z0, z1), ++ z0 = svlsr_m (p0, z0, z1)) ++ ++/* ++** lsr_u16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** lsr z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_u16_m_tied2, svuint16_t, ++ z0 = svlsr_u16_m (p0, z1, z0), ++ z0 = svlsr_m (p0, z1, z0)) ++ ++/* ++** lsr_u16_m_untied: ++** movprfx z0, z1 ++** lsr z0\.h, p0/m, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_u16_m_untied, svuint16_t, ++ z0 = svlsr_u16_m (p0, z1, z2), ++ z0 = svlsr_m (p0, z1, z2)) ++ ++/* ++** lsr_w0_u16_m_tied1: ++** mov (z[0-9]+\.h), w0 ++** lsr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsr_w0_u16_m_tied1, svuint16_t, uint16_t, ++ z0 = svlsr_n_u16_m (p0, z0, x0), ++ z0 = svlsr_m (p0, z0, x0)) ++ ++/* ++** lsr_w0_u16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), w0 ++** movprfx z0, z1 ++** lsr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX 
(lsr_w0_u16_m_untied, svuint16_t, uint16_t, ++ z0 = svlsr_n_u16_m (p0, z1, x0), ++ z0 = svlsr_m (p0, z1, x0)) ++ ++/* ++** lsr_1_u16_m_tied1: ++** lsr z0\.h, p0/m, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_1_u16_m_tied1, svuint16_t, ++ z0 = svlsr_n_u16_m (p0, z0, 1), ++ z0 = svlsr_m (p0, z0, 1)) ++ ++/* ++** lsr_1_u16_m_untied: ++** movprfx z0, z1 ++** lsr z0\.h, p0/m, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_1_u16_m_untied, svuint16_t, ++ z0 = svlsr_n_u16_m (p0, z1, 1), ++ z0 = svlsr_m (p0, z1, 1)) ++ ++/* ++** lsr_15_u16_m_tied1: ++** lsr z0\.h, p0/m, z0\.h, #15 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_15_u16_m_tied1, svuint16_t, ++ z0 = svlsr_n_u16_m (p0, z0, 15), ++ z0 = svlsr_m (p0, z0, 15)) ++ ++/* ++** lsr_15_u16_m_untied: ++** movprfx z0, z1 ++** lsr z0\.h, p0/m, z0\.h, #15 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_15_u16_m_untied, svuint16_t, ++ z0 = svlsr_n_u16_m (p0, z1, 15), ++ z0 = svlsr_m (p0, z1, 15)) ++ ++/* ++** lsr_16_u16_m_tied1: ++** lsr z0\.h, p0/m, z0\.h, #16 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_16_u16_m_tied1, svuint16_t, ++ z0 = svlsr_n_u16_m (p0, z0, 16), ++ z0 = svlsr_m (p0, z0, 16)) ++ ++/* ++** lsr_16_u16_m_untied: ++** movprfx z0, z1 ++** lsr z0\.h, p0/m, z0\.h, #16 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_16_u16_m_untied, svuint16_t, ++ z0 = svlsr_n_u16_m (p0, z1, 16), ++ z0 = svlsr_m (p0, z1, 16)) ++ ++/* ++** lsr_u16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** lsr z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_u16_z_tied1, svuint16_t, ++ z0 = svlsr_u16_z (p0, z0, z1), ++ z0 = svlsr_z (p0, z0, z1)) ++ ++/* ++** lsr_u16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** lsrr z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_u16_z_tied2, svuint16_t, ++ z0 = svlsr_u16_z (p0, z1, z0), ++ z0 = svlsr_z (p0, z1, z0)) ++ ++/* ++** lsr_u16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** lsr z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** lsrr z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_u16_z_untied, svuint16_t, ++ z0 = svlsr_u16_z (p0, z1, z2), ++ z0 = svlsr_z (p0, z1, z2)) ++ ++/* ++** lsr_w0_u16_z_tied1: ++** mov (z[0-9]+\.h), w0 ++** movprfx z0\.h, p0/z, z0\.h ++** lsr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsr_w0_u16_z_tied1, svuint16_t, uint16_t, ++ z0 = svlsr_n_u16_z (p0, z0, x0), ++ z0 = svlsr_z (p0, z0, x0)) ++ ++/* ++** lsr_w0_u16_z_untied: ++** mov (z[0-9]+\.h), w0 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** lsr z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** lsrr z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (lsr_w0_u16_z_untied, svuint16_t, uint16_t, ++ z0 = svlsr_n_u16_z (p0, z1, x0), ++ z0 = svlsr_z (p0, z1, x0)) ++ ++/* ++** lsr_1_u16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** lsr z0\.h, p0/m, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_1_u16_z_tied1, svuint16_t, ++ z0 = svlsr_n_u16_z (p0, z0, 1), ++ z0 = svlsr_z (p0, z0, 1)) ++ ++/* ++** lsr_1_u16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** lsr z0\.h, p0/m, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_1_u16_z_untied, svuint16_t, ++ z0 = svlsr_n_u16_z (p0, z1, 1), ++ z0 = svlsr_z (p0, z1, 1)) ++ ++/* ++** lsr_15_u16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** lsr z0\.h, p0/m, z0\.h, #15 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_15_u16_z_tied1, svuint16_t, ++ z0 = svlsr_n_u16_z (p0, z0, 15), ++ z0 = svlsr_z (p0, z0, 15)) ++ ++/* ++** lsr_15_u16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** lsr z0\.h, p0/m, z0\.h, #15 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_15_u16_z_untied, 
svuint16_t, ++ z0 = svlsr_n_u16_z (p0, z1, 15), ++ z0 = svlsr_z (p0, z1, 15)) ++ ++/* ++** lsr_16_u16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** lsr z0\.h, p0/m, z0\.h, #16 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_16_u16_z_tied1, svuint16_t, ++ z0 = svlsr_n_u16_z (p0, z0, 16), ++ z0 = svlsr_z (p0, z0, 16)) ++ ++/* ++** lsr_16_u16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** lsr z0\.h, p0/m, z0\.h, #16 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_16_u16_z_untied, svuint16_t, ++ z0 = svlsr_n_u16_z (p0, z1, 16), ++ z0 = svlsr_z (p0, z1, 16)) ++ ++/* ++** lsr_u16_x_tied1: ++** lsr z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_u16_x_tied1, svuint16_t, ++ z0 = svlsr_u16_x (p0, z0, z1), ++ z0 = svlsr_x (p0, z0, z1)) ++ ++/* ++** lsr_u16_x_tied2: ++** lsrr z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_u16_x_tied2, svuint16_t, ++ z0 = svlsr_u16_x (p0, z1, z0), ++ z0 = svlsr_x (p0, z1, z0)) ++ ++/* ++** lsr_u16_x_untied: ++** ( ++** movprfx z0, z1 ++** lsr z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0, z2 ++** lsrr z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_u16_x_untied, svuint16_t, ++ z0 = svlsr_u16_x (p0, z1, z2), ++ z0 = svlsr_x (p0, z1, z2)) ++ ++/* ++** lsr_w0_u16_x_tied1: ++** mov (z[0-9]+\.h), w0 ++** lsr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsr_w0_u16_x_tied1, svuint16_t, uint16_t, ++ z0 = svlsr_n_u16_x (p0, z0, x0), ++ z0 = svlsr_x (p0, z0, x0)) ++ ++/* ++** lsr_w0_u16_x_untied: ++** mov z0\.h, w0 ++** lsrr z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZX (lsr_w0_u16_x_untied, svuint16_t, uint16_t, ++ z0 = svlsr_n_u16_x (p0, z1, x0), ++ z0 = svlsr_x (p0, z1, x0)) ++ ++/* ++** lsr_1_u16_x_tied1: ++** lsr z0\.h, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_1_u16_x_tied1, svuint16_t, ++ z0 = svlsr_n_u16_x (p0, z0, 1), ++ z0 = svlsr_x (p0, z0, 1)) ++ ++/* ++** lsr_1_u16_x_untied: ++** lsr z0\.h, z1\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_1_u16_x_untied, svuint16_t, ++ z0 = svlsr_n_u16_x (p0, z1, 1), ++ z0 = svlsr_x (p0, z1, 1)) ++ ++/* ++** lsr_15_u16_x_tied1: ++** lsr z0\.h, z0\.h, #15 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_15_u16_x_tied1, svuint16_t, ++ z0 = svlsr_n_u16_x (p0, z0, 15), ++ z0 = svlsr_x (p0, z0, 15)) ++ ++/* ++** lsr_15_u16_x_untied: ++** lsr z0\.h, z1\.h, #15 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_15_u16_x_untied, svuint16_t, ++ z0 = svlsr_n_u16_x (p0, z1, 15), ++ z0 = svlsr_x (p0, z1, 15)) ++ ++/* ++** lsr_16_u16_x_tied1: ++** lsr z0\.h, z0\.h, #16 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_16_u16_x_tied1, svuint16_t, ++ z0 = svlsr_n_u16_x (p0, z0, 16), ++ z0 = svlsr_x (p0, z0, 16)) ++ ++/* ++** lsr_16_u16_x_untied: ++** lsr z0\.h, z1\.h, #16 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_16_u16_x_untied, svuint16_t, ++ z0 = svlsr_n_u16_x (p0, z1, 16), ++ z0 = svlsr_x (p0, z1, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsr_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsr_u32.c +new file mode 100644 +index 000000000..796867ef8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsr_u32.c +@@ -0,0 +1,340 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** lsr_u32_m_tied1: ++** lsr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_u32_m_tied1, svuint32_t, ++ z0 = svlsr_u32_m (p0, z0, z1), ++ z0 = svlsr_m (p0, z0, z1)) ++ ++/* ++** lsr_u32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** lsr z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_u32_m_tied2, svuint32_t, ++ z0 = 
svlsr_u32_m (p0, z1, z0), ++ z0 = svlsr_m (p0, z1, z0)) ++ ++/* ++** lsr_u32_m_untied: ++** movprfx z0, z1 ++** lsr z0\.s, p0/m, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_u32_m_untied, svuint32_t, ++ z0 = svlsr_u32_m (p0, z1, z2), ++ z0 = svlsr_m (p0, z1, z2)) ++ ++/* ++** lsr_w0_u32_m_tied1: ++** mov (z[0-9]+\.s), w0 ++** lsr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsr_w0_u32_m_tied1, svuint32_t, uint32_t, ++ z0 = svlsr_n_u32_m (p0, z0, x0), ++ z0 = svlsr_m (p0, z0, x0)) ++ ++/* ++** lsr_w0_u32_m_untied: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0, z1 ++** lsr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsr_w0_u32_m_untied, svuint32_t, uint32_t, ++ z0 = svlsr_n_u32_m (p0, z1, x0), ++ z0 = svlsr_m (p0, z1, x0)) ++ ++/* ++** lsr_1_u32_m_tied1: ++** lsr z0\.s, p0/m, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_1_u32_m_tied1, svuint32_t, ++ z0 = svlsr_n_u32_m (p0, z0, 1), ++ z0 = svlsr_m (p0, z0, 1)) ++ ++/* ++** lsr_1_u32_m_untied: ++** movprfx z0, z1 ++** lsr z0\.s, p0/m, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_1_u32_m_untied, svuint32_t, ++ z0 = svlsr_n_u32_m (p0, z1, 1), ++ z0 = svlsr_m (p0, z1, 1)) ++ ++/* ++** lsr_31_u32_m_tied1: ++** lsr z0\.s, p0/m, z0\.s, #31 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_31_u32_m_tied1, svuint32_t, ++ z0 = svlsr_n_u32_m (p0, z0, 31), ++ z0 = svlsr_m (p0, z0, 31)) ++ ++/* ++** lsr_31_u32_m_untied: ++** movprfx z0, z1 ++** lsr z0\.s, p0/m, z0\.s, #31 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_31_u32_m_untied, svuint32_t, ++ z0 = svlsr_n_u32_m (p0, z1, 31), ++ z0 = svlsr_m (p0, z1, 31)) ++ ++/* ++** lsr_32_u32_m_tied1: ++** lsr z0\.s, p0/m, z0\.s, #32 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_32_u32_m_tied1, svuint32_t, ++ z0 = svlsr_n_u32_m (p0, z0, 32), ++ z0 = svlsr_m (p0, z0, 32)) ++ ++/* ++** lsr_32_u32_m_untied: ++** movprfx z0, z1 ++** lsr z0\.s, p0/m, z0\.s, #32 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_32_u32_m_untied, svuint32_t, ++ z0 = svlsr_n_u32_m (p0, z1, 32), ++ z0 = svlsr_m (p0, z1, 32)) ++ ++/* ++** lsr_u32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** lsr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_u32_z_tied1, svuint32_t, ++ z0 = svlsr_u32_z (p0, z0, z1), ++ z0 = svlsr_z (p0, z0, z1)) ++ ++/* ++** lsr_u32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** lsrr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_u32_z_tied2, svuint32_t, ++ z0 = svlsr_u32_z (p0, z1, z0), ++ z0 = svlsr_z (p0, z1, z0)) ++ ++/* ++** lsr_u32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** lsr z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** lsrr z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_u32_z_untied, svuint32_t, ++ z0 = svlsr_u32_z (p0, z1, z2), ++ z0 = svlsr_z (p0, z1, z2)) ++ ++/* ++** lsr_w0_u32_z_tied1: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0\.s, p0/z, z0\.s ++** lsr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsr_w0_u32_z_tied1, svuint32_t, uint32_t, ++ z0 = svlsr_n_u32_z (p0, z0, x0), ++ z0 = svlsr_z (p0, z0, x0)) ++ ++/* ++** lsr_w0_u32_z_untied: ++** mov (z[0-9]+\.s), w0 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** lsr z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** lsrr z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (lsr_w0_u32_z_untied, svuint32_t, uint32_t, ++ z0 = svlsr_n_u32_z (p0, z1, x0), ++ z0 = svlsr_z (p0, z1, x0)) ++ ++/* ++** lsr_1_u32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** lsr z0\.s, p0/m, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_1_u32_z_tied1, svuint32_t, ++ z0 = svlsr_n_u32_z (p0, z0, 1), 
++ z0 = svlsr_z (p0, z0, 1)) ++ ++/* ++** lsr_1_u32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** lsr z0\.s, p0/m, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_1_u32_z_untied, svuint32_t, ++ z0 = svlsr_n_u32_z (p0, z1, 1), ++ z0 = svlsr_z (p0, z1, 1)) ++ ++/* ++** lsr_31_u32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** lsr z0\.s, p0/m, z0\.s, #31 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_31_u32_z_tied1, svuint32_t, ++ z0 = svlsr_n_u32_z (p0, z0, 31), ++ z0 = svlsr_z (p0, z0, 31)) ++ ++/* ++** lsr_31_u32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** lsr z0\.s, p0/m, z0\.s, #31 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_31_u32_z_untied, svuint32_t, ++ z0 = svlsr_n_u32_z (p0, z1, 31), ++ z0 = svlsr_z (p0, z1, 31)) ++ ++/* ++** lsr_32_u32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** lsr z0\.s, p0/m, z0\.s, #32 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_32_u32_z_tied1, svuint32_t, ++ z0 = svlsr_n_u32_z (p0, z0, 32), ++ z0 = svlsr_z (p0, z0, 32)) ++ ++/* ++** lsr_32_u32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** lsr z0\.s, p0/m, z0\.s, #32 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_32_u32_z_untied, svuint32_t, ++ z0 = svlsr_n_u32_z (p0, z1, 32), ++ z0 = svlsr_z (p0, z1, 32)) ++ ++/* ++** lsr_u32_x_tied1: ++** lsr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_u32_x_tied1, svuint32_t, ++ z0 = svlsr_u32_x (p0, z0, z1), ++ z0 = svlsr_x (p0, z0, z1)) ++ ++/* ++** lsr_u32_x_tied2: ++** lsrr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_u32_x_tied2, svuint32_t, ++ z0 = svlsr_u32_x (p0, z1, z0), ++ z0 = svlsr_x (p0, z1, z0)) ++ ++/* ++** lsr_u32_x_untied: ++** ( ++** movprfx z0, z1 ++** lsr z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0, z2 ++** lsrr z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_u32_x_untied, svuint32_t, ++ z0 = svlsr_u32_x (p0, z1, z2), ++ z0 = svlsr_x (p0, z1, z2)) ++ ++/* ++** lsr_w0_u32_x_tied1: ++** mov (z[0-9]+\.s), w0 ++** lsr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsr_w0_u32_x_tied1, svuint32_t, uint32_t, ++ z0 = svlsr_n_u32_x (p0, z0, x0), ++ z0 = svlsr_x (p0, z0, x0)) ++ ++/* ++** lsr_w0_u32_x_untied: ++** mov z0\.s, w0 ++** lsrr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZX (lsr_w0_u32_x_untied, svuint32_t, uint32_t, ++ z0 = svlsr_n_u32_x (p0, z1, x0), ++ z0 = svlsr_x (p0, z1, x0)) ++ ++/* ++** lsr_1_u32_x_tied1: ++** lsr z0\.s, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_1_u32_x_tied1, svuint32_t, ++ z0 = svlsr_n_u32_x (p0, z0, 1), ++ z0 = svlsr_x (p0, z0, 1)) ++ ++/* ++** lsr_1_u32_x_untied: ++** lsr z0\.s, z1\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_1_u32_x_untied, svuint32_t, ++ z0 = svlsr_n_u32_x (p0, z1, 1), ++ z0 = svlsr_x (p0, z1, 1)) ++ ++/* ++** lsr_31_u32_x_tied1: ++** lsr z0\.s, z0\.s, #31 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_31_u32_x_tied1, svuint32_t, ++ z0 = svlsr_n_u32_x (p0, z0, 31), ++ z0 = svlsr_x (p0, z0, 31)) ++ ++/* ++** lsr_31_u32_x_untied: ++** lsr z0\.s, z1\.s, #31 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_31_u32_x_untied, svuint32_t, ++ z0 = svlsr_n_u32_x (p0, z1, 31), ++ z0 = svlsr_x (p0, z1, 31)) ++ ++/* ++** lsr_32_u32_x_tied1: ++** lsr z0\.s, z0\.s, #32 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_32_u32_x_tied1, svuint32_t, ++ z0 = svlsr_n_u32_x (p0, z0, 32), ++ z0 = svlsr_x (p0, z0, 32)) ++ ++/* ++** lsr_32_u32_x_untied: ++** lsr z0\.s, z1\.s, #32 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_32_u32_x_untied, svuint32_t, ++ z0 = svlsr_n_u32_x (p0, z1, 32), ++ z0 = svlsr_x (p0, z1, 32)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsr_u64.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsr_u64.c +new file mode 100644 +index 000000000..b50777f50 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsr_u64.c +@@ -0,0 +1,340 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** lsr_u64_m_tied1: ++** lsr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_u64_m_tied1, svuint64_t, ++ z0 = svlsr_u64_m (p0, z0, z1), ++ z0 = svlsr_m (p0, z0, z1)) ++ ++/* ++** lsr_u64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** lsr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_u64_m_tied2, svuint64_t, ++ z0 = svlsr_u64_m (p0, z1, z0), ++ z0 = svlsr_m (p0, z1, z0)) ++ ++/* ++** lsr_u64_m_untied: ++** movprfx z0, z1 ++** lsr z0\.d, p0/m, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_u64_m_untied, svuint64_t, ++ z0 = svlsr_u64_m (p0, z1, z2), ++ z0 = svlsr_m (p0, z1, z2)) ++ ++/* ++** lsr_x0_u64_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** lsr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsr_x0_u64_m_tied1, svuint64_t, uint64_t, ++ z0 = svlsr_n_u64_m (p0, z0, x0), ++ z0 = svlsr_m (p0, z0, x0)) ++ ++/* ++** lsr_x0_u64_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** lsr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsr_x0_u64_m_untied, svuint64_t, uint64_t, ++ z0 = svlsr_n_u64_m (p0, z1, x0), ++ z0 = svlsr_m (p0, z1, x0)) ++ ++/* ++** lsr_1_u64_m_tied1: ++** lsr z0\.d, p0/m, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_1_u64_m_tied1, svuint64_t, ++ z0 = svlsr_n_u64_m (p0, z0, 1), ++ z0 = svlsr_m (p0, z0, 1)) ++ ++/* ++** lsr_1_u64_m_untied: ++** movprfx z0, z1 ++** lsr z0\.d, p0/m, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_1_u64_m_untied, svuint64_t, ++ z0 = svlsr_n_u64_m (p0, z1, 1), ++ z0 = svlsr_m (p0, z1, 1)) ++ ++/* ++** lsr_63_u64_m_tied1: ++** lsr z0\.d, p0/m, z0\.d, #63 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_63_u64_m_tied1, svuint64_t, ++ z0 = svlsr_n_u64_m (p0, z0, 63), ++ z0 = svlsr_m (p0, z0, 63)) ++ ++/* ++** lsr_63_u64_m_untied: ++** movprfx z0, z1 ++** lsr z0\.d, p0/m, z0\.d, #63 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_63_u64_m_untied, svuint64_t, ++ z0 = svlsr_n_u64_m (p0, z1, 63), ++ z0 = svlsr_m (p0, z1, 63)) ++ ++/* ++** lsr_64_u64_m_tied1: ++** lsr z0\.d, p0/m, z0\.d, #64 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_64_u64_m_tied1, svuint64_t, ++ z0 = svlsr_n_u64_m (p0, z0, 64), ++ z0 = svlsr_m (p0, z0, 64)) ++ ++/* ++** lsr_64_u64_m_untied: ++** movprfx z0, z1 ++** lsr z0\.d, p0/m, z0\.d, #64 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_64_u64_m_untied, svuint64_t, ++ z0 = svlsr_n_u64_m (p0, z1, 64), ++ z0 = svlsr_m (p0, z1, 64)) ++ ++/* ++** lsr_u64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** lsr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_u64_z_tied1, svuint64_t, ++ z0 = svlsr_u64_z (p0, z0, z1), ++ z0 = svlsr_z (p0, z0, z1)) ++ ++/* ++** lsr_u64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** lsrr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_u64_z_tied2, svuint64_t, ++ z0 = svlsr_u64_z (p0, z1, z0), ++ z0 = svlsr_z (p0, z1, z0)) ++ ++/* ++** lsr_u64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** lsr z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** lsrr z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_u64_z_untied, svuint64_t, ++ z0 = svlsr_u64_z (p0, z1, z2), ++ z0 = svlsr_z (p0, z1, z2)) ++ ++/* ++** lsr_x0_u64_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z0\.d ++** lsr z0\.d, p0/m, z0\.d, \1 ++** ret 
++*/ ++TEST_UNIFORM_ZX (lsr_x0_u64_z_tied1, svuint64_t, uint64_t, ++ z0 = svlsr_n_u64_z (p0, z0, x0), ++ z0 = svlsr_z (p0, z0, x0)) ++ ++/* ++** lsr_x0_u64_z_untied: ++** mov (z[0-9]+\.d), x0 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** lsr z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** lsrr z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (lsr_x0_u64_z_untied, svuint64_t, uint64_t, ++ z0 = svlsr_n_u64_z (p0, z1, x0), ++ z0 = svlsr_z (p0, z1, x0)) ++ ++/* ++** lsr_1_u64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** lsr z0\.d, p0/m, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_1_u64_z_tied1, svuint64_t, ++ z0 = svlsr_n_u64_z (p0, z0, 1), ++ z0 = svlsr_z (p0, z0, 1)) ++ ++/* ++** lsr_1_u64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** lsr z0\.d, p0/m, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_1_u64_z_untied, svuint64_t, ++ z0 = svlsr_n_u64_z (p0, z1, 1), ++ z0 = svlsr_z (p0, z1, 1)) ++ ++/* ++** lsr_63_u64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** lsr z0\.d, p0/m, z0\.d, #63 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_63_u64_z_tied1, svuint64_t, ++ z0 = svlsr_n_u64_z (p0, z0, 63), ++ z0 = svlsr_z (p0, z0, 63)) ++ ++/* ++** lsr_63_u64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** lsr z0\.d, p0/m, z0\.d, #63 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_63_u64_z_untied, svuint64_t, ++ z0 = svlsr_n_u64_z (p0, z1, 63), ++ z0 = svlsr_z (p0, z1, 63)) ++ ++/* ++** lsr_64_u64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** lsr z0\.d, p0/m, z0\.d, #64 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_64_u64_z_tied1, svuint64_t, ++ z0 = svlsr_n_u64_z (p0, z0, 64), ++ z0 = svlsr_z (p0, z0, 64)) ++ ++/* ++** lsr_64_u64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** lsr z0\.d, p0/m, z0\.d, #64 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_64_u64_z_untied, svuint64_t, ++ z0 = svlsr_n_u64_z (p0, z1, 64), ++ z0 = svlsr_z (p0, z1, 64)) ++ ++/* ++** lsr_u64_x_tied1: ++** lsr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_u64_x_tied1, svuint64_t, ++ z0 = svlsr_u64_x (p0, z0, z1), ++ z0 = svlsr_x (p0, z0, z1)) ++ ++/* ++** lsr_u64_x_tied2: ++** lsrr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_u64_x_tied2, svuint64_t, ++ z0 = svlsr_u64_x (p0, z1, z0), ++ z0 = svlsr_x (p0, z1, z0)) ++ ++/* ++** lsr_u64_x_untied: ++** ( ++** movprfx z0, z1 ++** lsr z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0, z2 ++** lsrr z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_u64_x_untied, svuint64_t, ++ z0 = svlsr_u64_x (p0, z1, z2), ++ z0 = svlsr_x (p0, z1, z2)) ++ ++/* ++** lsr_x0_u64_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** lsr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsr_x0_u64_x_tied1, svuint64_t, uint64_t, ++ z0 = svlsr_n_u64_x (p0, z0, x0), ++ z0 = svlsr_x (p0, z0, x0)) ++ ++/* ++** lsr_x0_u64_x_untied: ++** mov z0\.d, x0 ++** lsrr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (lsr_x0_u64_x_untied, svuint64_t, uint64_t, ++ z0 = svlsr_n_u64_x (p0, z1, x0), ++ z0 = svlsr_x (p0, z1, x0)) ++ ++/* ++** lsr_1_u64_x_tied1: ++** lsr z0\.d, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_1_u64_x_tied1, svuint64_t, ++ z0 = svlsr_n_u64_x (p0, z0, 1), ++ z0 = svlsr_x (p0, z0, 1)) ++ ++/* ++** lsr_1_u64_x_untied: ++** lsr z0\.d, z1\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_1_u64_x_untied, svuint64_t, ++ z0 = svlsr_n_u64_x (p0, z1, 1), ++ z0 = svlsr_x (p0, z1, 1)) ++ ++/* ++** lsr_63_u64_x_tied1: ++** lsr z0\.d, z0\.d, #63 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_63_u64_x_tied1, svuint64_t, ++ z0 = svlsr_n_u64_x (p0, z0, 63), ++ z0 = svlsr_x (p0, z0, 63)) ++ ++/* 
++** lsr_63_u64_x_untied: ++** lsr z0\.d, z1\.d, #63 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_63_u64_x_untied, svuint64_t, ++ z0 = svlsr_n_u64_x (p0, z1, 63), ++ z0 = svlsr_x (p0, z1, 63)) ++ ++/* ++** lsr_64_u64_x_tied1: ++** lsr z0\.d, z0\.d, #64 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_64_u64_x_tied1, svuint64_t, ++ z0 = svlsr_n_u64_x (p0, z0, 64), ++ z0 = svlsr_x (p0, z0, 64)) ++ ++/* ++** lsr_64_u64_x_untied: ++** lsr z0\.d, z1\.d, #64 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_64_u64_x_untied, svuint64_t, ++ z0 = svlsr_n_u64_x (p0, z1, 64), ++ z0 = svlsr_x (p0, z1, 64)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsr_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsr_u8.c +new file mode 100644 +index 000000000..a049ca905 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsr_u8.c +@@ -0,0 +1,340 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** lsr_u8_m_tied1: ++** lsr z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_u8_m_tied1, svuint8_t, ++ z0 = svlsr_u8_m (p0, z0, z1), ++ z0 = svlsr_m (p0, z0, z1)) ++ ++/* ++** lsr_u8_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** lsr z0\.b, p0/m, z0\.b, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_u8_m_tied2, svuint8_t, ++ z0 = svlsr_u8_m (p0, z1, z0), ++ z0 = svlsr_m (p0, z1, z0)) ++ ++/* ++** lsr_u8_m_untied: ++** movprfx z0, z1 ++** lsr z0\.b, p0/m, z0\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_u8_m_untied, svuint8_t, ++ z0 = svlsr_u8_m (p0, z1, z2), ++ z0 = svlsr_m (p0, z1, z2)) ++ ++/* ++** lsr_w0_u8_m_tied1: ++** mov (z[0-9]+\.b), w0 ++** lsr z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsr_w0_u8_m_tied1, svuint8_t, uint8_t, ++ z0 = svlsr_n_u8_m (p0, z0, x0), ++ z0 = svlsr_m (p0, z0, x0)) ++ ++/* ++** lsr_w0_u8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), w0 ++** movprfx z0, z1 ++** lsr z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsr_w0_u8_m_untied, svuint8_t, uint8_t, ++ z0 = svlsr_n_u8_m (p0, z1, x0), ++ z0 = svlsr_m (p0, z1, x0)) ++ ++/* ++** lsr_1_u8_m_tied1: ++** lsr z0\.b, p0/m, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_1_u8_m_tied1, svuint8_t, ++ z0 = svlsr_n_u8_m (p0, z0, 1), ++ z0 = svlsr_m (p0, z0, 1)) ++ ++/* ++** lsr_1_u8_m_untied: ++** movprfx z0, z1 ++** lsr z0\.b, p0/m, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_1_u8_m_untied, svuint8_t, ++ z0 = svlsr_n_u8_m (p0, z1, 1), ++ z0 = svlsr_m (p0, z1, 1)) ++ ++/* ++** lsr_7_u8_m_tied1: ++** lsr z0\.b, p0/m, z0\.b, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_7_u8_m_tied1, svuint8_t, ++ z0 = svlsr_n_u8_m (p0, z0, 7), ++ z0 = svlsr_m (p0, z0, 7)) ++ ++/* ++** lsr_7_u8_m_untied: ++** movprfx z0, z1 ++** lsr z0\.b, p0/m, z0\.b, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_7_u8_m_untied, svuint8_t, ++ z0 = svlsr_n_u8_m (p0, z1, 7), ++ z0 = svlsr_m (p0, z1, 7)) ++ ++/* ++** lsr_8_u8_m_tied1: ++** lsr z0\.b, p0/m, z0\.b, #8 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_8_u8_m_tied1, svuint8_t, ++ z0 = svlsr_n_u8_m (p0, z0, 8), ++ z0 = svlsr_m (p0, z0, 8)) ++ ++/* ++** lsr_8_u8_m_untied: ++** movprfx z0, z1 ++** lsr z0\.b, p0/m, z0\.b, #8 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_8_u8_m_untied, svuint8_t, ++ z0 = svlsr_n_u8_m (p0, z1, 8), ++ z0 = svlsr_m (p0, z1, 8)) ++ ++/* ++** lsr_u8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** lsr z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_u8_z_tied1, svuint8_t, ++ z0 = svlsr_u8_z (p0, z0, z1), ++ z0 = svlsr_z (p0, z0, z1)) ++ ++/* ++** lsr_u8_z_tied2: ++** movprfx z0\.b, p0/z, z0\.b ++** lsrr z0\.b, p0/m, 
z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_u8_z_tied2, svuint8_t, ++ z0 = svlsr_u8_z (p0, z1, z0), ++ z0 = svlsr_z (p0, z1, z0)) ++ ++/* ++** lsr_u8_z_untied: ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** lsr z0\.b, p0/m, z0\.b, z2\.b ++** | ++** movprfx z0\.b, p0/z, z2\.b ++** lsrr z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_u8_z_untied, svuint8_t, ++ z0 = svlsr_u8_z (p0, z1, z2), ++ z0 = svlsr_z (p0, z1, z2)) ++ ++/* ++** lsr_w0_u8_z_tied1: ++** mov (z[0-9]+\.b), w0 ++** movprfx z0\.b, p0/z, z0\.b ++** lsr z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsr_w0_u8_z_tied1, svuint8_t, uint8_t, ++ z0 = svlsr_n_u8_z (p0, z0, x0), ++ z0 = svlsr_z (p0, z0, x0)) ++ ++/* ++** lsr_w0_u8_z_untied: ++** mov (z[0-9]+\.b), w0 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** lsr z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** lsrr z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (lsr_w0_u8_z_untied, svuint8_t, uint8_t, ++ z0 = svlsr_n_u8_z (p0, z1, x0), ++ z0 = svlsr_z (p0, z1, x0)) ++ ++/* ++** lsr_1_u8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** lsr z0\.b, p0/m, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_1_u8_z_tied1, svuint8_t, ++ z0 = svlsr_n_u8_z (p0, z0, 1), ++ z0 = svlsr_z (p0, z0, 1)) ++ ++/* ++** lsr_1_u8_z_untied: ++** movprfx z0\.b, p0/z, z1\.b ++** lsr z0\.b, p0/m, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_1_u8_z_untied, svuint8_t, ++ z0 = svlsr_n_u8_z (p0, z1, 1), ++ z0 = svlsr_z (p0, z1, 1)) ++ ++/* ++** lsr_7_u8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** lsr z0\.b, p0/m, z0\.b, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_7_u8_z_tied1, svuint8_t, ++ z0 = svlsr_n_u8_z (p0, z0, 7), ++ z0 = svlsr_z (p0, z0, 7)) ++ ++/* ++** lsr_7_u8_z_untied: ++** movprfx z0\.b, p0/z, z1\.b ++** lsr z0\.b, p0/m, z0\.b, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_7_u8_z_untied, svuint8_t, ++ z0 = svlsr_n_u8_z (p0, z1, 7), ++ z0 = svlsr_z (p0, z1, 7)) ++ ++/* ++** lsr_8_u8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** lsr z0\.b, p0/m, z0\.b, #8 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_8_u8_z_tied1, svuint8_t, ++ z0 = svlsr_n_u8_z (p0, z0, 8), ++ z0 = svlsr_z (p0, z0, 8)) ++ ++/* ++** lsr_8_u8_z_untied: ++** movprfx z0\.b, p0/z, z1\.b ++** lsr z0\.b, p0/m, z0\.b, #8 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_8_u8_z_untied, svuint8_t, ++ z0 = svlsr_n_u8_z (p0, z1, 8), ++ z0 = svlsr_z (p0, z1, 8)) ++ ++/* ++** lsr_u8_x_tied1: ++** lsr z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_u8_x_tied1, svuint8_t, ++ z0 = svlsr_u8_x (p0, z0, z1), ++ z0 = svlsr_x (p0, z0, z1)) ++ ++/* ++** lsr_u8_x_tied2: ++** lsrr z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_u8_x_tied2, svuint8_t, ++ z0 = svlsr_u8_x (p0, z1, z0), ++ z0 = svlsr_x (p0, z1, z0)) ++ ++/* ++** lsr_u8_x_untied: ++** ( ++** movprfx z0, z1 ++** lsr z0\.b, p0/m, z0\.b, z2\.b ++** | ++** movprfx z0, z2 ++** lsrr z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_u8_x_untied, svuint8_t, ++ z0 = svlsr_u8_x (p0, z1, z2), ++ z0 = svlsr_x (p0, z1, z2)) ++ ++/* ++** lsr_w0_u8_x_tied1: ++** mov (z[0-9]+\.b), w0 ++** lsr z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsr_w0_u8_x_tied1, svuint8_t, uint8_t, ++ z0 = svlsr_n_u8_x (p0, z0, x0), ++ z0 = svlsr_x (p0, z0, x0)) ++ ++/* ++** lsr_w0_u8_x_untied: ++** mov z0\.b, w0 ++** lsrr z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_ZX (lsr_w0_u8_x_untied, svuint8_t, uint8_t, ++ z0 = svlsr_n_u8_x (p0, z1, x0), ++ z0 = svlsr_x (p0, z1, x0)) ++ ++/* ++** lsr_1_u8_x_tied1: ++** lsr z0\.b, z0\.b, #1 ++** ret ++*/ 
++TEST_UNIFORM_Z (lsr_1_u8_x_tied1, svuint8_t, ++ z0 = svlsr_n_u8_x (p0, z0, 1), ++ z0 = svlsr_x (p0, z0, 1)) ++ ++/* ++** lsr_1_u8_x_untied: ++** lsr z0\.b, z1\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_1_u8_x_untied, svuint8_t, ++ z0 = svlsr_n_u8_x (p0, z1, 1), ++ z0 = svlsr_x (p0, z1, 1)) ++ ++/* ++** lsr_7_u8_x_tied1: ++** lsr z0\.b, z0\.b, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_7_u8_x_tied1, svuint8_t, ++ z0 = svlsr_n_u8_x (p0, z0, 7), ++ z0 = svlsr_x (p0, z0, 7)) ++ ++/* ++** lsr_7_u8_x_untied: ++** lsr z0\.b, z1\.b, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_7_u8_x_untied, svuint8_t, ++ z0 = svlsr_n_u8_x (p0, z1, 7), ++ z0 = svlsr_x (p0, z1, 7)) ++ ++/* ++** lsr_8_u8_x_tied1: ++** lsr z0\.b, z0\.b, #8 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_8_u8_x_tied1, svuint8_t, ++ z0 = svlsr_n_u8_x (p0, z0, 8), ++ z0 = svlsr_x (p0, z0, 8)) ++ ++/* ++** lsr_8_u8_x_untied: ++** lsr z0\.b, z1\.b, #8 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_8_u8_x_untied, svuint8_t, ++ z0 = svlsr_n_u8_x (p0, z1, 8), ++ z0 = svlsr_x (p0, z1, 8)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsr_wide_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsr_wide_u16.c +new file mode 100644 +index 000000000..863b51a2f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsr_wide_u16.c +@@ -0,0 +1,325 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** lsr_wide_u16_m_tied1: ++** lsr z0\.h, p0/m, z0\.h, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsr_wide_u16_m_tied1, svuint16_t, svuint64_t, ++ z0 = svlsr_wide_u16_m (p0, z0, z4), ++ z0 = svlsr_wide_m (p0, z0, z4)) ++ ++/* ++** lsr_wide_u16_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z4 ++** lsr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (lsr_wide_u16_m_tied2, svuint16_t, svuint64_t, ++ z0_res = svlsr_wide_u16_m (p0, z4, z0), ++ z0_res = svlsr_wide_m (p0, z4, z0)) ++ ++/* ++** lsr_wide_u16_m_untied: ++** movprfx z0, z1 ++** lsr z0\.h, p0/m, z0\.h, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsr_wide_u16_m_untied, svuint16_t, svuint64_t, ++ z0 = svlsr_wide_u16_m (p0, z1, z4), ++ z0 = svlsr_wide_m (p0, z1, z4)) ++ ++/* ++** lsr_wide_x0_u16_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** lsr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsr_wide_x0_u16_m_tied1, svuint16_t, uint64_t, ++ z0 = svlsr_wide_n_u16_m (p0, z0, x0), ++ z0 = svlsr_wide_m (p0, z0, x0)) ++ ++/* ++** lsr_wide_x0_u16_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** lsr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsr_wide_x0_u16_m_untied, svuint16_t, uint64_t, ++ z0 = svlsr_wide_n_u16_m (p0, z1, x0), ++ z0 = svlsr_wide_m (p0, z1, x0)) ++ ++/* ++** lsr_wide_1_u16_m_tied1: ++** lsr z0\.h, p0/m, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_1_u16_m_tied1, svuint16_t, ++ z0 = svlsr_wide_n_u16_m (p0, z0, 1), ++ z0 = svlsr_wide_m (p0, z0, 1)) ++ ++/* ++** lsr_wide_1_u16_m_untied: ++** movprfx z0, z1 ++** lsr z0\.h, p0/m, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_1_u16_m_untied, svuint16_t, ++ z0 = svlsr_wide_n_u16_m (p0, z1, 1), ++ z0 = svlsr_wide_m (p0, z1, 1)) ++ ++/* ++** lsr_wide_15_u16_m_tied1: ++** lsr z0\.h, p0/m, z0\.h, #15 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_15_u16_m_tied1, svuint16_t, ++ z0 = svlsr_wide_n_u16_m (p0, z0, 15), ++ z0 = svlsr_wide_m (p0, z0, 15)) ++ ++/* ++** lsr_wide_15_u16_m_untied: ++** movprfx z0, z1 ++** lsr z0\.h, p0/m, z0\.h, #15 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_15_u16_m_untied, svuint16_t, ++ z0 = svlsr_wide_n_u16_m (p0, z1, 15), ++ z0 = 
svlsr_wide_m (p0, z1, 15)) ++ ++/* ++** lsr_wide_16_u16_m_tied1: ++** lsr z0\.h, p0/m, z0\.h, #16 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_16_u16_m_tied1, svuint16_t, ++ z0 = svlsr_wide_n_u16_m (p0, z0, 16), ++ z0 = svlsr_wide_m (p0, z0, 16)) ++ ++/* ++** lsr_wide_16_u16_m_untied: ++** movprfx z0, z1 ++** lsr z0\.h, p0/m, z0\.h, #16 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_16_u16_m_untied, svuint16_t, ++ z0 = svlsr_wide_n_u16_m (p0, z1, 16), ++ z0 = svlsr_wide_m (p0, z1, 16)) ++ ++/* ++** lsr_wide_u16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** lsr z0\.h, p0/m, z0\.h, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsr_wide_u16_z_tied1, svuint16_t, svuint64_t, ++ z0 = svlsr_wide_u16_z (p0, z0, z4), ++ z0 = svlsr_wide_z (p0, z0, z4)) ++ ++/* ++** lsr_wide_u16_z_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.h, p0/z, z4\.h ++** lsr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (lsr_wide_u16_z_tied2, svuint16_t, svuint64_t, ++ z0_res = svlsr_wide_u16_z (p0, z4, z0), ++ z0_res = svlsr_wide_z (p0, z4, z0)) ++ ++/* ++** lsr_wide_u16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** lsr z0\.h, p0/m, z0\.h, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsr_wide_u16_z_untied, svuint16_t, svuint64_t, ++ z0 = svlsr_wide_u16_z (p0, z1, z4), ++ z0 = svlsr_wide_z (p0, z1, z4)) ++ ++/* ++** lsr_wide_x0_u16_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.h, p0/z, z0\.h ++** lsr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsr_wide_x0_u16_z_tied1, svuint16_t, uint64_t, ++ z0 = svlsr_wide_n_u16_z (p0, z0, x0), ++ z0 = svlsr_wide_z (p0, z0, x0)) ++ ++/* ++** lsr_wide_x0_u16_z_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.h, p0/z, z1\.h ++** lsr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsr_wide_x0_u16_z_untied, svuint16_t, uint64_t, ++ z0 = svlsr_wide_n_u16_z (p0, z1, x0), ++ z0 = svlsr_wide_z (p0, z1, x0)) ++ ++/* ++** lsr_wide_1_u16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** lsr z0\.h, p0/m, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_1_u16_z_tied1, svuint16_t, ++ z0 = svlsr_wide_n_u16_z (p0, z0, 1), ++ z0 = svlsr_wide_z (p0, z0, 1)) ++ ++/* ++** lsr_wide_1_u16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** lsr z0\.h, p0/m, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_1_u16_z_untied, svuint16_t, ++ z0 = svlsr_wide_n_u16_z (p0, z1, 1), ++ z0 = svlsr_wide_z (p0, z1, 1)) ++ ++/* ++** lsr_wide_15_u16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** lsr z0\.h, p0/m, z0\.h, #15 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_15_u16_z_tied1, svuint16_t, ++ z0 = svlsr_wide_n_u16_z (p0, z0, 15), ++ z0 = svlsr_wide_z (p0, z0, 15)) ++ ++/* ++** lsr_wide_15_u16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** lsr z0\.h, p0/m, z0\.h, #15 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_15_u16_z_untied, svuint16_t, ++ z0 = svlsr_wide_n_u16_z (p0, z1, 15), ++ z0 = svlsr_wide_z (p0, z1, 15)) ++ ++/* ++** lsr_wide_16_u16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** lsr z0\.h, p0/m, z0\.h, #16 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_16_u16_z_tied1, svuint16_t, ++ z0 = svlsr_wide_n_u16_z (p0, z0, 16), ++ z0 = svlsr_wide_z (p0, z0, 16)) ++ ++/* ++** lsr_wide_16_u16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** lsr z0\.h, p0/m, z0\.h, #16 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_16_u16_z_untied, svuint16_t, ++ z0 = svlsr_wide_n_u16_z (p0, z1, 16), ++ z0 = svlsr_wide_z (p0, z1, 16)) ++ ++/* ++** lsr_wide_u16_x_tied1: ++** lsr z0\.h, z0\.h, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsr_wide_u16_x_tied1, svuint16_t, svuint64_t, ++ z0 = svlsr_wide_u16_x (p0, z0, z4), ++ z0 = svlsr_wide_x (p0, z0, 
z4)) ++ ++/* ++** lsr_wide_u16_x_tied2: ++** lsr z0\.h, z4\.h, z0\.d ++** ret ++*/ ++TEST_DUAL_Z_REV (lsr_wide_u16_x_tied2, svuint16_t, svuint64_t, ++ z0_res = svlsr_wide_u16_x (p0, z4, z0), ++ z0_res = svlsr_wide_x (p0, z4, z0)) ++ ++/* ++** lsr_wide_u16_x_untied: ++** lsr z0\.h, z1\.h, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsr_wide_u16_x_untied, svuint16_t, svuint64_t, ++ z0 = svlsr_wide_u16_x (p0, z1, z4), ++ z0 = svlsr_wide_x (p0, z1, z4)) ++ ++/* ++** lsr_wide_x0_u16_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** lsr z0\.h, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsr_wide_x0_u16_x_tied1, svuint16_t, uint64_t, ++ z0 = svlsr_wide_n_u16_x (p0, z0, x0), ++ z0 = svlsr_wide_x (p0, z0, x0)) ++ ++/* ++** lsr_wide_x0_u16_x_untied: ++** mov (z[0-9]+\.d), x0 ++** lsr z0\.h, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsr_wide_x0_u16_x_untied, svuint16_t, uint64_t, ++ z0 = svlsr_wide_n_u16_x (p0, z1, x0), ++ z0 = svlsr_wide_x (p0, z1, x0)) ++ ++/* ++** lsr_wide_1_u16_x_tied1: ++** lsr z0\.h, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_1_u16_x_tied1, svuint16_t, ++ z0 = svlsr_wide_n_u16_x (p0, z0, 1), ++ z0 = svlsr_wide_x (p0, z0, 1)) ++ ++/* ++** lsr_wide_1_u16_x_untied: ++** lsr z0\.h, z1\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_1_u16_x_untied, svuint16_t, ++ z0 = svlsr_wide_n_u16_x (p0, z1, 1), ++ z0 = svlsr_wide_x (p0, z1, 1)) ++ ++/* ++** lsr_wide_15_u16_x_tied1: ++** lsr z0\.h, z0\.h, #15 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_15_u16_x_tied1, svuint16_t, ++ z0 = svlsr_wide_n_u16_x (p0, z0, 15), ++ z0 = svlsr_wide_x (p0, z0, 15)) ++ ++/* ++** lsr_wide_15_u16_x_untied: ++** lsr z0\.h, z1\.h, #15 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_15_u16_x_untied, svuint16_t, ++ z0 = svlsr_wide_n_u16_x (p0, z1, 15), ++ z0 = svlsr_wide_x (p0, z1, 15)) ++ ++/* ++** lsr_wide_16_u16_x_tied1: ++** lsr z0\.h, z0\.h, #16 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_16_u16_x_tied1, svuint16_t, ++ z0 = svlsr_wide_n_u16_x (p0, z0, 16), ++ z0 = svlsr_wide_x (p0, z0, 16)) ++ ++/* ++** lsr_wide_16_u16_x_untied: ++** lsr z0\.h, z1\.h, #16 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_16_u16_x_untied, svuint16_t, ++ z0 = svlsr_wide_n_u16_x (p0, z1, 16), ++ z0 = svlsr_wide_x (p0, z1, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsr_wide_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsr_wide_u32.c +new file mode 100644 +index 000000000..73c2cf86e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsr_wide_u32.c +@@ -0,0 +1,325 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** lsr_wide_u32_m_tied1: ++** lsr z0\.s, p0/m, z0\.s, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsr_wide_u32_m_tied1, svuint32_t, svuint64_t, ++ z0 = svlsr_wide_u32_m (p0, z0, z4), ++ z0 = svlsr_wide_m (p0, z0, z4)) ++ ++/* ++** lsr_wide_u32_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z4 ++** lsr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (lsr_wide_u32_m_tied2, svuint32_t, svuint64_t, ++ z0_res = svlsr_wide_u32_m (p0, z4, z0), ++ z0_res = svlsr_wide_m (p0, z4, z0)) ++ ++/* ++** lsr_wide_u32_m_untied: ++** movprfx z0, z1 ++** lsr z0\.s, p0/m, z0\.s, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsr_wide_u32_m_untied, svuint32_t, svuint64_t, ++ z0 = svlsr_wide_u32_m (p0, z1, z4), ++ z0 = svlsr_wide_m (p0, z1, z4)) ++ ++/* ++** lsr_wide_x0_u32_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** lsr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsr_wide_x0_u32_m_tied1, svuint32_t, uint64_t, ++ z0 = svlsr_wide_n_u32_m (p0, z0, x0), ++ z0 = 
svlsr_wide_m (p0, z0, x0)) ++ ++/* ++** lsr_wide_x0_u32_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** lsr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsr_wide_x0_u32_m_untied, svuint32_t, uint64_t, ++ z0 = svlsr_wide_n_u32_m (p0, z1, x0), ++ z0 = svlsr_wide_m (p0, z1, x0)) ++ ++/* ++** lsr_wide_1_u32_m_tied1: ++** lsr z0\.s, p0/m, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_1_u32_m_tied1, svuint32_t, ++ z0 = svlsr_wide_n_u32_m (p0, z0, 1), ++ z0 = svlsr_wide_m (p0, z0, 1)) ++ ++/* ++** lsr_wide_1_u32_m_untied: ++** movprfx z0, z1 ++** lsr z0\.s, p0/m, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_1_u32_m_untied, svuint32_t, ++ z0 = svlsr_wide_n_u32_m (p0, z1, 1), ++ z0 = svlsr_wide_m (p0, z1, 1)) ++ ++/* ++** lsr_wide_31_u32_m_tied1: ++** lsr z0\.s, p0/m, z0\.s, #31 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_31_u32_m_tied1, svuint32_t, ++ z0 = svlsr_wide_n_u32_m (p0, z0, 31), ++ z0 = svlsr_wide_m (p0, z0, 31)) ++ ++/* ++** lsr_wide_31_u32_m_untied: ++** movprfx z0, z1 ++** lsr z0\.s, p0/m, z0\.s, #31 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_31_u32_m_untied, svuint32_t, ++ z0 = svlsr_wide_n_u32_m (p0, z1, 31), ++ z0 = svlsr_wide_m (p0, z1, 31)) ++ ++/* ++** lsr_wide_32_u32_m_tied1: ++** lsr z0\.s, p0/m, z0\.s, #32 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_32_u32_m_tied1, svuint32_t, ++ z0 = svlsr_wide_n_u32_m (p0, z0, 32), ++ z0 = svlsr_wide_m (p0, z0, 32)) ++ ++/* ++** lsr_wide_32_u32_m_untied: ++** movprfx z0, z1 ++** lsr z0\.s, p0/m, z0\.s, #32 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_32_u32_m_untied, svuint32_t, ++ z0 = svlsr_wide_n_u32_m (p0, z1, 32), ++ z0 = svlsr_wide_m (p0, z1, 32)) ++ ++/* ++** lsr_wide_u32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** lsr z0\.s, p0/m, z0\.s, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsr_wide_u32_z_tied1, svuint32_t, svuint64_t, ++ z0 = svlsr_wide_u32_z (p0, z0, z4), ++ z0 = svlsr_wide_z (p0, z0, z4)) ++ ++/* ++** lsr_wide_u32_z_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.s, p0/z, z4\.s ++** lsr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (lsr_wide_u32_z_tied2, svuint32_t, svuint64_t, ++ z0_res = svlsr_wide_u32_z (p0, z4, z0), ++ z0_res = svlsr_wide_z (p0, z4, z0)) ++ ++/* ++** lsr_wide_u32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** lsr z0\.s, p0/m, z0\.s, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsr_wide_u32_z_untied, svuint32_t, svuint64_t, ++ z0 = svlsr_wide_u32_z (p0, z1, z4), ++ z0 = svlsr_wide_z (p0, z1, z4)) ++ ++/* ++** lsr_wide_x0_u32_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.s, p0/z, z0\.s ++** lsr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsr_wide_x0_u32_z_tied1, svuint32_t, uint64_t, ++ z0 = svlsr_wide_n_u32_z (p0, z0, x0), ++ z0 = svlsr_wide_z (p0, z0, x0)) ++ ++/* ++** lsr_wide_x0_u32_z_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.s, p0/z, z1\.s ++** lsr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsr_wide_x0_u32_z_untied, svuint32_t, uint64_t, ++ z0 = svlsr_wide_n_u32_z (p0, z1, x0), ++ z0 = svlsr_wide_z (p0, z1, x0)) ++ ++/* ++** lsr_wide_1_u32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** lsr z0\.s, p0/m, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_1_u32_z_tied1, svuint32_t, ++ z0 = svlsr_wide_n_u32_z (p0, z0, 1), ++ z0 = svlsr_wide_z (p0, z0, 1)) ++ ++/* ++** lsr_wide_1_u32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** lsr z0\.s, p0/m, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_1_u32_z_untied, svuint32_t, ++ z0 = svlsr_wide_n_u32_z (p0, z1, 1), ++ z0 = svlsr_wide_z (p0, z1, 1)) ++ ++/* ++** 
lsr_wide_31_u32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** lsr z0\.s, p0/m, z0\.s, #31 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_31_u32_z_tied1, svuint32_t, ++ z0 = svlsr_wide_n_u32_z (p0, z0, 31), ++ z0 = svlsr_wide_z (p0, z0, 31)) ++ ++/* ++** lsr_wide_31_u32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** lsr z0\.s, p0/m, z0\.s, #31 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_31_u32_z_untied, svuint32_t, ++ z0 = svlsr_wide_n_u32_z (p0, z1, 31), ++ z0 = svlsr_wide_z (p0, z1, 31)) ++ ++/* ++** lsr_wide_32_u32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** lsr z0\.s, p0/m, z0\.s, #32 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_32_u32_z_tied1, svuint32_t, ++ z0 = svlsr_wide_n_u32_z (p0, z0, 32), ++ z0 = svlsr_wide_z (p0, z0, 32)) ++ ++/* ++** lsr_wide_32_u32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** lsr z0\.s, p0/m, z0\.s, #32 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_32_u32_z_untied, svuint32_t, ++ z0 = svlsr_wide_n_u32_z (p0, z1, 32), ++ z0 = svlsr_wide_z (p0, z1, 32)) ++ ++/* ++** lsr_wide_u32_x_tied1: ++** lsr z0\.s, z0\.s, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsr_wide_u32_x_tied1, svuint32_t, svuint64_t, ++ z0 = svlsr_wide_u32_x (p0, z0, z4), ++ z0 = svlsr_wide_x (p0, z0, z4)) ++ ++/* ++** lsr_wide_u32_x_tied2: ++** lsr z0\.s, z4\.s, z0\.d ++** ret ++*/ ++TEST_DUAL_Z_REV (lsr_wide_u32_x_tied2, svuint32_t, svuint64_t, ++ z0_res = svlsr_wide_u32_x (p0, z4, z0), ++ z0_res = svlsr_wide_x (p0, z4, z0)) ++ ++/* ++** lsr_wide_u32_x_untied: ++** lsr z0\.s, z1\.s, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsr_wide_u32_x_untied, svuint32_t, svuint64_t, ++ z0 = svlsr_wide_u32_x (p0, z1, z4), ++ z0 = svlsr_wide_x (p0, z1, z4)) ++ ++/* ++** lsr_wide_x0_u32_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** lsr z0\.s, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsr_wide_x0_u32_x_tied1, svuint32_t, uint64_t, ++ z0 = svlsr_wide_n_u32_x (p0, z0, x0), ++ z0 = svlsr_wide_x (p0, z0, x0)) ++ ++/* ++** lsr_wide_x0_u32_x_untied: ++** mov (z[0-9]+\.d), x0 ++** lsr z0\.s, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsr_wide_x0_u32_x_untied, svuint32_t, uint64_t, ++ z0 = svlsr_wide_n_u32_x (p0, z1, x0), ++ z0 = svlsr_wide_x (p0, z1, x0)) ++ ++/* ++** lsr_wide_1_u32_x_tied1: ++** lsr z0\.s, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_1_u32_x_tied1, svuint32_t, ++ z0 = svlsr_wide_n_u32_x (p0, z0, 1), ++ z0 = svlsr_wide_x (p0, z0, 1)) ++ ++/* ++** lsr_wide_1_u32_x_untied: ++** lsr z0\.s, z1\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_1_u32_x_untied, svuint32_t, ++ z0 = svlsr_wide_n_u32_x (p0, z1, 1), ++ z0 = svlsr_wide_x (p0, z1, 1)) ++ ++/* ++** lsr_wide_31_u32_x_tied1: ++** lsr z0\.s, z0\.s, #31 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_31_u32_x_tied1, svuint32_t, ++ z0 = svlsr_wide_n_u32_x (p0, z0, 31), ++ z0 = svlsr_wide_x (p0, z0, 31)) ++ ++/* ++** lsr_wide_31_u32_x_untied: ++** lsr z0\.s, z1\.s, #31 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_31_u32_x_untied, svuint32_t, ++ z0 = svlsr_wide_n_u32_x (p0, z1, 31), ++ z0 = svlsr_wide_x (p0, z1, 31)) ++ ++/* ++** lsr_wide_32_u32_x_tied1: ++** lsr z0\.s, z0\.s, #32 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_32_u32_x_tied1, svuint32_t, ++ z0 = svlsr_wide_n_u32_x (p0, z0, 32), ++ z0 = svlsr_wide_x (p0, z0, 32)) ++ ++/* ++** lsr_wide_32_u32_x_untied: ++** lsr z0\.s, z1\.s, #32 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_32_u32_x_untied, svuint32_t, ++ z0 = svlsr_wide_n_u32_x (p0, z1, 32), ++ z0 = svlsr_wide_x (p0, z1, 32)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsr_wide_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsr_wide_u8.c +new file mode 100644 +index 
000000000..fe44eabda +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsr_wide_u8.c +@@ -0,0 +1,325 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** lsr_wide_u8_m_tied1: ++** lsr z0\.b, p0/m, z0\.b, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsr_wide_u8_m_tied1, svuint8_t, svuint64_t, ++ z0 = svlsr_wide_u8_m (p0, z0, z4), ++ z0 = svlsr_wide_m (p0, z0, z4)) ++ ++/* ++** lsr_wide_u8_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z4 ++** lsr z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (lsr_wide_u8_m_tied2, svuint8_t, svuint64_t, ++ z0_res = svlsr_wide_u8_m (p0, z4, z0), ++ z0_res = svlsr_wide_m (p0, z4, z0)) ++ ++/* ++** lsr_wide_u8_m_untied: ++** movprfx z0, z1 ++** lsr z0\.b, p0/m, z0\.b, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsr_wide_u8_m_untied, svuint8_t, svuint64_t, ++ z0 = svlsr_wide_u8_m (p0, z1, z4), ++ z0 = svlsr_wide_m (p0, z1, z4)) ++ ++/* ++** lsr_wide_x0_u8_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** lsr z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsr_wide_x0_u8_m_tied1, svuint8_t, uint64_t, ++ z0 = svlsr_wide_n_u8_m (p0, z0, x0), ++ z0 = svlsr_wide_m (p0, z0, x0)) ++ ++/* ++** lsr_wide_x0_u8_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** lsr z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsr_wide_x0_u8_m_untied, svuint8_t, uint64_t, ++ z0 = svlsr_wide_n_u8_m (p0, z1, x0), ++ z0 = svlsr_wide_m (p0, z1, x0)) ++ ++/* ++** lsr_wide_1_u8_m_tied1: ++** lsr z0\.b, p0/m, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_1_u8_m_tied1, svuint8_t, ++ z0 = svlsr_wide_n_u8_m (p0, z0, 1), ++ z0 = svlsr_wide_m (p0, z0, 1)) ++ ++/* ++** lsr_wide_1_u8_m_untied: ++** movprfx z0, z1 ++** lsr z0\.b, p0/m, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_1_u8_m_untied, svuint8_t, ++ z0 = svlsr_wide_n_u8_m (p0, z1, 1), ++ z0 = svlsr_wide_m (p0, z1, 1)) ++ ++/* ++** lsr_wide_7_u8_m_tied1: ++** lsr z0\.b, p0/m, z0\.b, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_7_u8_m_tied1, svuint8_t, ++ z0 = svlsr_wide_n_u8_m (p0, z0, 7), ++ z0 = svlsr_wide_m (p0, z0, 7)) ++ ++/* ++** lsr_wide_7_u8_m_untied: ++** movprfx z0, z1 ++** lsr z0\.b, p0/m, z0\.b, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_7_u8_m_untied, svuint8_t, ++ z0 = svlsr_wide_n_u8_m (p0, z1, 7), ++ z0 = svlsr_wide_m (p0, z1, 7)) ++ ++/* ++** lsr_wide_8_u8_m_tied1: ++** lsr z0\.b, p0/m, z0\.b, #8 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_8_u8_m_tied1, svuint8_t, ++ z0 = svlsr_wide_n_u8_m (p0, z0, 8), ++ z0 = svlsr_wide_m (p0, z0, 8)) ++ ++/* ++** lsr_wide_8_u8_m_untied: ++** movprfx z0, z1 ++** lsr z0\.b, p0/m, z0\.b, #8 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_8_u8_m_untied, svuint8_t, ++ z0 = svlsr_wide_n_u8_m (p0, z1, 8), ++ z0 = svlsr_wide_m (p0, z1, 8)) ++ ++/* ++** lsr_wide_u8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** lsr z0\.b, p0/m, z0\.b, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsr_wide_u8_z_tied1, svuint8_t, svuint64_t, ++ z0 = svlsr_wide_u8_z (p0, z0, z4), ++ z0 = svlsr_wide_z (p0, z0, z4)) ++ ++/* ++** lsr_wide_u8_z_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.b, p0/z, z4\.b ++** lsr z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (lsr_wide_u8_z_tied2, svuint8_t, svuint64_t, ++ z0_res = svlsr_wide_u8_z (p0, z4, z0), ++ z0_res = svlsr_wide_z (p0, z4, z0)) ++ ++/* ++** lsr_wide_u8_z_untied: ++** movprfx z0\.b, p0/z, z1\.b ++** lsr z0\.b, p0/m, z0\.b, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsr_wide_u8_z_untied, svuint8_t, svuint64_t, ++ z0 = svlsr_wide_u8_z (p0, z1, z4), ++ z0 = svlsr_wide_z (p0, z1, z4)) 
++ ++/* ++** lsr_wide_x0_u8_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.b, p0/z, z0\.b ++** lsr z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsr_wide_x0_u8_z_tied1, svuint8_t, uint64_t, ++ z0 = svlsr_wide_n_u8_z (p0, z0, x0), ++ z0 = svlsr_wide_z (p0, z0, x0)) ++ ++/* ++** lsr_wide_x0_u8_z_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.b, p0/z, z1\.b ++** lsr z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsr_wide_x0_u8_z_untied, svuint8_t, uint64_t, ++ z0 = svlsr_wide_n_u8_z (p0, z1, x0), ++ z0 = svlsr_wide_z (p0, z1, x0)) ++ ++/* ++** lsr_wide_1_u8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** lsr z0\.b, p0/m, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_1_u8_z_tied1, svuint8_t, ++ z0 = svlsr_wide_n_u8_z (p0, z0, 1), ++ z0 = svlsr_wide_z (p0, z0, 1)) ++ ++/* ++** lsr_wide_1_u8_z_untied: ++** movprfx z0\.b, p0/z, z1\.b ++** lsr z0\.b, p0/m, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_1_u8_z_untied, svuint8_t, ++ z0 = svlsr_wide_n_u8_z (p0, z1, 1), ++ z0 = svlsr_wide_z (p0, z1, 1)) ++ ++/* ++** lsr_wide_7_u8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** lsr z0\.b, p0/m, z0\.b, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_7_u8_z_tied1, svuint8_t, ++ z0 = svlsr_wide_n_u8_z (p0, z0, 7), ++ z0 = svlsr_wide_z (p0, z0, 7)) ++ ++/* ++** lsr_wide_7_u8_z_untied: ++** movprfx z0\.b, p0/z, z1\.b ++** lsr z0\.b, p0/m, z0\.b, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_7_u8_z_untied, svuint8_t, ++ z0 = svlsr_wide_n_u8_z (p0, z1, 7), ++ z0 = svlsr_wide_z (p0, z1, 7)) ++ ++/* ++** lsr_wide_8_u8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** lsr z0\.b, p0/m, z0\.b, #8 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_8_u8_z_tied1, svuint8_t, ++ z0 = svlsr_wide_n_u8_z (p0, z0, 8), ++ z0 = svlsr_wide_z (p0, z0, 8)) ++ ++/* ++** lsr_wide_8_u8_z_untied: ++** movprfx z0\.b, p0/z, z1\.b ++** lsr z0\.b, p0/m, z0\.b, #8 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_8_u8_z_untied, svuint8_t, ++ z0 = svlsr_wide_n_u8_z (p0, z1, 8), ++ z0 = svlsr_wide_z (p0, z1, 8)) ++ ++/* ++** lsr_wide_u8_x_tied1: ++** lsr z0\.b, z0\.b, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsr_wide_u8_x_tied1, svuint8_t, svuint64_t, ++ z0 = svlsr_wide_u8_x (p0, z0, z4), ++ z0 = svlsr_wide_x (p0, z0, z4)) ++ ++/* ++** lsr_wide_u8_x_tied2: ++** lsr z0\.b, z4\.b, z0\.d ++** ret ++*/ ++TEST_DUAL_Z_REV (lsr_wide_u8_x_tied2, svuint8_t, svuint64_t, ++ z0_res = svlsr_wide_u8_x (p0, z4, z0), ++ z0_res = svlsr_wide_x (p0, z4, z0)) ++ ++/* ++** lsr_wide_u8_x_untied: ++** lsr z0\.b, z1\.b, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (lsr_wide_u8_x_untied, svuint8_t, svuint64_t, ++ z0 = svlsr_wide_u8_x (p0, z1, z4), ++ z0 = svlsr_wide_x (p0, z1, z4)) ++ ++/* ++** lsr_wide_x0_u8_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** lsr z0\.b, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsr_wide_x0_u8_x_tied1, svuint8_t, uint64_t, ++ z0 = svlsr_wide_n_u8_x (p0, z0, x0), ++ z0 = svlsr_wide_x (p0, z0, x0)) ++ ++/* ++** lsr_wide_x0_u8_x_untied: ++** mov (z[0-9]+\.d), x0 ++** lsr z0\.b, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (lsr_wide_x0_u8_x_untied, svuint8_t, uint64_t, ++ z0 = svlsr_wide_n_u8_x (p0, z1, x0), ++ z0 = svlsr_wide_x (p0, z1, x0)) ++ ++/* ++** lsr_wide_1_u8_x_tied1: ++** lsr z0\.b, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_1_u8_x_tied1, svuint8_t, ++ z0 = svlsr_wide_n_u8_x (p0, z0, 1), ++ z0 = svlsr_wide_x (p0, z0, 1)) ++ ++/* ++** lsr_wide_1_u8_x_untied: ++** lsr z0\.b, z1\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_1_u8_x_untied, svuint8_t, ++ z0 = svlsr_wide_n_u8_x (p0, z1, 1), ++ z0 = svlsr_wide_x (p0, z1, 1)) ++ 
++/* ++** lsr_wide_7_u8_x_tied1: ++** lsr z0\.b, z0\.b, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_7_u8_x_tied1, svuint8_t, ++ z0 = svlsr_wide_n_u8_x (p0, z0, 7), ++ z0 = svlsr_wide_x (p0, z0, 7)) ++ ++/* ++** lsr_wide_7_u8_x_untied: ++** lsr z0\.b, z1\.b, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_7_u8_x_untied, svuint8_t, ++ z0 = svlsr_wide_n_u8_x (p0, z1, 7), ++ z0 = svlsr_wide_x (p0, z1, 7)) ++ ++/* ++** lsr_wide_8_u8_x_tied1: ++** lsr z0\.b, z0\.b, #8 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_8_u8_x_tied1, svuint8_t, ++ z0 = svlsr_wide_n_u8_x (p0, z0, 8), ++ z0 = svlsr_wide_x (p0, z0, 8)) ++ ++/* ++** lsr_wide_8_u8_x_untied: ++** lsr z0\.b, z1\.b, #8 ++** ret ++*/ ++TEST_UNIFORM_Z (lsr_wide_8_u8_x_untied, svuint8_t, ++ z0 = svlsr_wide_n_u8_x (p0, z1, 8), ++ z0 = svlsr_wide_x (p0, z1, 8)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_f16.c +new file mode 100644 +index 000000000..7656f9e54 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_f16.c +@@ -0,0 +1,398 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mad_f16_m_tied1: ++** fmad z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mad_f16_m_tied1, svfloat16_t, ++ z0 = svmad_f16_m (p0, z0, z1, z2), ++ z0 = svmad_m (p0, z0, z1, z2)) ++ ++/* ++** mad_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fmad z0\.h, p0/m, \1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mad_f16_m_tied2, svfloat16_t, ++ z0 = svmad_f16_m (p0, z1, z0, z2), ++ z0 = svmad_m (p0, z1, z0, z2)) ++ ++/* ++** mad_f16_m_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fmad z0\.h, p0/m, z2\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mad_f16_m_tied3, svfloat16_t, ++ z0 = svmad_f16_m (p0, z1, z2, z0), ++ z0 = svmad_m (p0, z1, z2, z0)) ++ ++/* ++** mad_f16_m_untied: ++** movprfx z0, z1 ++** fmad z0\.h, p0/m, z2\.h, z3\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mad_f16_m_untied, svfloat16_t, ++ z0 = svmad_f16_m (p0, z1, z2, z3), ++ z0 = svmad_m (p0, z1, z2, z3)) ++ ++/* ++** mad_h4_f16_m_tied1: ++** mov (z[0-9]+\.h), h4 ++** fmad z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mad_h4_f16_m_tied1, svfloat16_t, __fp16, ++ z0 = svmad_n_f16_m (p0, z0, z1, d4), ++ z0 = svmad_m (p0, z0, z1, d4)) ++ ++/* ++** mad_h4_f16_m_untied: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0, z1 ++** fmad z0\.h, p0/m, z2\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mad_h4_f16_m_untied, svfloat16_t, __fp16, ++ z0 = svmad_n_f16_m (p0, z1, z2, d4), ++ z0 = svmad_m (p0, z1, z2, d4)) ++ ++/* ++** mad_2_f16_m_tied1: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** fmad z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_2_f16_m_tied1, svfloat16_t, ++ z0 = svmad_n_f16_m (p0, z0, z1, 2), ++ z0 = svmad_m (p0, z0, z1, 2)) ++ ++/* ++** mad_2_f16_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? 
++** movprfx z0, z1 ++** fmad z0\.h, p0/m, z2\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_2_f16_m_untied, svfloat16_t, ++ z0 = svmad_n_f16_m (p0, z1, z2, 2), ++ z0 = svmad_m (p0, z1, z2, 2)) ++ ++/* ++** mad_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fmad z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mad_f16_z_tied1, svfloat16_t, ++ z0 = svmad_f16_z (p0, z0, z1, z2), ++ z0 = svmad_z (p0, z0, z1, z2)) ++ ++/* ++** mad_f16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** fmad z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mad_f16_z_tied2, svfloat16_t, ++ z0 = svmad_f16_z (p0, z1, z0, z2), ++ z0 = svmad_z (p0, z1, z0, z2)) ++ ++/* ++** mad_f16_z_tied3: ++** movprfx z0\.h, p0/z, z0\.h ++** fmla z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mad_f16_z_tied3, svfloat16_t, ++ z0 = svmad_f16_z (p0, z1, z2, z0), ++ z0 = svmad_z (p0, z1, z2, z0)) ++ ++/* ++** mad_f16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fmad z0\.h, p0/m, z2\.h, z3\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** fmad z0\.h, p0/m, z1\.h, z3\.h ++** | ++** movprfx z0\.h, p0/z, z3\.h ++** fmla z0\.h, p0/m, z1\.h, z2\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mad_f16_z_untied, svfloat16_t, ++ z0 = svmad_f16_z (p0, z1, z2, z3), ++ z0 = svmad_z (p0, z1, z2, z3)) ++ ++/* ++** mad_h4_f16_z_tied1: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0\.h, p0/z, z0\.h ++** fmad z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mad_h4_f16_z_tied1, svfloat16_t, __fp16, ++ z0 = svmad_n_f16_z (p0, z0, z1, d4), ++ z0 = svmad_z (p0, z0, z1, d4)) ++ ++/* ++** mad_h4_f16_z_tied2: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0\.h, p0/z, z0\.h ++** fmad z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mad_h4_f16_z_tied2, svfloat16_t, __fp16, ++ z0 = svmad_n_f16_z (p0, z1, z0, d4), ++ z0 = svmad_z (p0, z1, z0, d4)) ++ ++/* ++** mad_h4_f16_z_untied: ++** mov (z[0-9]+\.h), h4 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fmad z0\.h, p0/m, z2\.h, \1 ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** fmad z0\.h, p0/m, z1\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** fmla z0\.h, p0/m, z1\.h, z2\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (mad_h4_f16_z_untied, svfloat16_t, __fp16, ++ z0 = svmad_n_f16_z (p0, z1, z2, d4), ++ z0 = svmad_z (p0, z1, z2, d4)) ++ ++/* ++** mad_2_f16_z_tied1: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** movprfx z0\.h, p0/z, z0\.h ++** fmad z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_2_f16_z_tied1, svfloat16_t, ++ z0 = svmad_n_f16_z (p0, z0, z1, 2), ++ z0 = svmad_z (p0, z0, z1, 2)) ++ ++/* ++** mad_2_f16_z_tied2: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** movprfx z0\.h, p0/z, z0\.h ++** fmad z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_2_f16_z_tied2, svfloat16_t, ++ z0 = svmad_n_f16_z (p0, z1, z0, 2), ++ z0 = svmad_z (p0, z1, z0, 2)) ++ ++/* ++** mad_2_f16_z_untied: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? 
++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fmad z0\.h, p0/m, z2\.h, \1 ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** fmad z0\.h, p0/m, z1\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** fmla z0\.h, p0/m, z1\.h, z2\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mad_2_f16_z_untied, svfloat16_t, ++ z0 = svmad_n_f16_z (p0, z1, z2, 2), ++ z0 = svmad_z (p0, z1, z2, 2)) ++ ++/* ++** mad_f16_x_tied1: ++** fmad z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mad_f16_x_tied1, svfloat16_t, ++ z0 = svmad_f16_x (p0, z0, z1, z2), ++ z0 = svmad_x (p0, z0, z1, z2)) ++ ++/* ++** mad_f16_x_tied2: ++** fmad z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mad_f16_x_tied2, svfloat16_t, ++ z0 = svmad_f16_x (p0, z1, z0, z2), ++ z0 = svmad_x (p0, z1, z0, z2)) ++ ++/* ++** mad_f16_x_tied3: ++** fmla z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mad_f16_x_tied3, svfloat16_t, ++ z0 = svmad_f16_x (p0, z1, z2, z0), ++ z0 = svmad_x (p0, z1, z2, z0)) ++ ++/* ++** mad_f16_x_untied: ++** ( ++** movprfx z0, z1 ++** fmad z0\.h, p0/m, z2\.h, z3\.h ++** | ++** movprfx z0, z2 ++** fmad z0\.h, p0/m, z1\.h, z3\.h ++** | ++** movprfx z0, z3 ++** fmla z0\.h, p0/m, z1\.h, z2\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mad_f16_x_untied, svfloat16_t, ++ z0 = svmad_f16_x (p0, z1, z2, z3), ++ z0 = svmad_x (p0, z1, z2, z3)) ++ ++/* ++** mad_h4_f16_x_tied1: ++** mov (z[0-9]+\.h), h4 ++** fmad z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mad_h4_f16_x_tied1, svfloat16_t, __fp16, ++ z0 = svmad_n_f16_x (p0, z0, z1, d4), ++ z0 = svmad_x (p0, z0, z1, d4)) ++ ++/* ++** mad_h4_f16_x_tied2: ++** mov (z[0-9]+\.h), h4 ++** fmad z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mad_h4_f16_x_tied2, svfloat16_t, __fp16, ++ z0 = svmad_n_f16_x (p0, z1, z0, d4), ++ z0 = svmad_x (p0, z1, z0, d4)) ++ ++/* ++** mad_h4_f16_x_untied: { xfail *-*-* } ++** mov z0\.h, h4 ++** fmla z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_ZD (mad_h4_f16_x_untied, svfloat16_t, __fp16, ++ z0 = svmad_n_f16_x (p0, z1, z2, d4), ++ z0 = svmad_x (p0, z1, z2, d4)) ++ ++/* ++** mad_2_f16_x_tied1: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** fmad z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_2_f16_x_tied1, svfloat16_t, ++ z0 = svmad_n_f16_x (p0, z0, z1, 2), ++ z0 = svmad_x (p0, z0, z1, 2)) ++ ++/* ++** mad_2_f16_x_tied2: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** fmad z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_2_f16_x_tied2, svfloat16_t, ++ z0 = svmad_n_f16_x (p0, z1, z0, 2), ++ z0 = svmad_x (p0, z1, z0, 2)) ++ ++/* ++** mad_2_f16_x_untied: ++** fmov z0\.h, #2\.0(?:e\+0)? ++** fmla z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mad_2_f16_x_untied, svfloat16_t, ++ z0 = svmad_n_f16_x (p0, z1, z2, 2), ++ z0 = svmad_x (p0, z1, z2, 2)) ++ ++/* ++** ptrue_mad_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mad_f16_x_tied1, svfloat16_t, ++ z0 = svmad_f16_x (svptrue_b16 (), z0, z1, z2), ++ z0 = svmad_x (svptrue_b16 (), z0, z1, z2)) ++ ++/* ++** ptrue_mad_f16_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mad_f16_x_tied2, svfloat16_t, ++ z0 = svmad_f16_x (svptrue_b16 (), z1, z0, z2), ++ z0 = svmad_x (svptrue_b16 (), z1, z0, z2)) ++ ++/* ++** ptrue_mad_f16_x_tied3: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mad_f16_x_tied3, svfloat16_t, ++ z0 = svmad_f16_x (svptrue_b16 (), z1, z2, z0), ++ z0 = svmad_x (svptrue_b16 (), z1, z2, z0)) ++ ++/* ++** ptrue_mad_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mad_f16_x_untied, svfloat16_t, ++ z0 = svmad_f16_x (svptrue_b16 (), z1, z2, z3), ++ z0 = svmad_x (svptrue_b16 (), z1, z2, z3)) ++ ++/* ++** ptrue_mad_2_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mad_2_f16_x_tied1, svfloat16_t, ++ z0 = svmad_n_f16_x (svptrue_b16 (), z0, z1, 2), ++ z0 = svmad_x (svptrue_b16 (), z0, z1, 2)) ++ ++/* ++** ptrue_mad_2_f16_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mad_2_f16_x_tied2, svfloat16_t, ++ z0 = svmad_n_f16_x (svptrue_b16 (), z1, z0, 2), ++ z0 = svmad_x (svptrue_b16 (), z1, z0, 2)) ++ ++/* ++** ptrue_mad_2_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mad_2_f16_x_untied, svfloat16_t, ++ z0 = svmad_n_f16_x (svptrue_b16 (), z1, z2, 2), ++ z0 = svmad_x (svptrue_b16 (), z1, z2, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_f32.c +new file mode 100644 +index 000000000..dbdd2b9d1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_f32.c +@@ -0,0 +1,398 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mad_f32_m_tied1: ++** fmad z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mad_f32_m_tied1, svfloat32_t, ++ z0 = svmad_f32_m (p0, z0, z1, z2), ++ z0 = svmad_m (p0, z0, z1, z2)) ++ ++/* ++** mad_f32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fmad z0\.s, p0/m, \1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mad_f32_m_tied2, svfloat32_t, ++ z0 = svmad_f32_m (p0, z1, z0, z2), ++ z0 = svmad_m (p0, z1, z0, z2)) ++ ++/* ++** mad_f32_m_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fmad z0\.s, p0/m, z2\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mad_f32_m_tied3, svfloat32_t, ++ z0 = svmad_f32_m (p0, z1, z2, z0), ++ z0 = svmad_m (p0, z1, z2, z0)) ++ ++/* ++** mad_f32_m_untied: ++** movprfx z0, z1 ++** fmad z0\.s, p0/m, z2\.s, z3\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mad_f32_m_untied, svfloat32_t, ++ z0 = svmad_f32_m (p0, z1, z2, z3), ++ z0 = svmad_m (p0, z1, z2, z3)) ++ ++/* ++** mad_s4_f32_m_tied1: ++** mov (z[0-9]+\.s), s4 ++** fmad z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mad_s4_f32_m_tied1, svfloat32_t, float, ++ z0 = svmad_n_f32_m (p0, z0, z1, d4), ++ z0 = svmad_m (p0, z0, z1, d4)) ++ ++/* ++** mad_s4_f32_m_untied: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0, z1 ++** fmad z0\.s, p0/m, z2\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mad_s4_f32_m_untied, svfloat32_t, float, ++ z0 = svmad_n_f32_m (p0, z1, z2, d4), ++ z0 = svmad_m (p0, z1, z2, d4)) ++ ++/* ++** mad_2_f32_m_tied1: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** fmad z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_2_f32_m_tied1, svfloat32_t, ++ z0 = svmad_n_f32_m (p0, z0, z1, 2), ++ z0 = svmad_m (p0, z0, z1, 2)) ++ ++/* ++** mad_2_f32_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
++** movprfx z0, z1 ++** fmad z0\.s, p0/m, z2\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_2_f32_m_untied, svfloat32_t, ++ z0 = svmad_n_f32_m (p0, z1, z2, 2), ++ z0 = svmad_m (p0, z1, z2, 2)) ++ ++/* ++** mad_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fmad z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mad_f32_z_tied1, svfloat32_t, ++ z0 = svmad_f32_z (p0, z0, z1, z2), ++ z0 = svmad_z (p0, z0, z1, z2)) ++ ++/* ++** mad_f32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** fmad z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mad_f32_z_tied2, svfloat32_t, ++ z0 = svmad_f32_z (p0, z1, z0, z2), ++ z0 = svmad_z (p0, z1, z0, z2)) ++ ++/* ++** mad_f32_z_tied3: ++** movprfx z0\.s, p0/z, z0\.s ++** fmla z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mad_f32_z_tied3, svfloat32_t, ++ z0 = svmad_f32_z (p0, z1, z2, z0), ++ z0 = svmad_z (p0, z1, z2, z0)) ++ ++/* ++** mad_f32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fmad z0\.s, p0/m, z2\.s, z3\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** fmad z0\.s, p0/m, z1\.s, z3\.s ++** | ++** movprfx z0\.s, p0/z, z3\.s ++** fmla z0\.s, p0/m, z1\.s, z2\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mad_f32_z_untied, svfloat32_t, ++ z0 = svmad_f32_z (p0, z1, z2, z3), ++ z0 = svmad_z (p0, z1, z2, z3)) ++ ++/* ++** mad_s4_f32_z_tied1: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0\.s, p0/z, z0\.s ++** fmad z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mad_s4_f32_z_tied1, svfloat32_t, float, ++ z0 = svmad_n_f32_z (p0, z0, z1, d4), ++ z0 = svmad_z (p0, z0, z1, d4)) ++ ++/* ++** mad_s4_f32_z_tied2: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0\.s, p0/z, z0\.s ++** fmad z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mad_s4_f32_z_tied2, svfloat32_t, float, ++ z0 = svmad_n_f32_z (p0, z1, z0, d4), ++ z0 = svmad_z (p0, z1, z0, d4)) ++ ++/* ++** mad_s4_f32_z_untied: ++** mov (z[0-9]+\.s), s4 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fmad z0\.s, p0/m, z2\.s, \1 ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** fmad z0\.s, p0/m, z1\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** fmla z0\.s, p0/m, z1\.s, z2\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (mad_s4_f32_z_untied, svfloat32_t, float, ++ z0 = svmad_n_f32_z (p0, z1, z2, d4), ++ z0 = svmad_z (p0, z1, z2, d4)) ++ ++/* ++** mad_2_f32_z_tied1: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** movprfx z0\.s, p0/z, z0\.s ++** fmad z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_2_f32_z_tied1, svfloat32_t, ++ z0 = svmad_n_f32_z (p0, z0, z1, 2), ++ z0 = svmad_z (p0, z0, z1, 2)) ++ ++/* ++** mad_2_f32_z_tied2: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** movprfx z0\.s, p0/z, z0\.s ++** fmad z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_2_f32_z_tied2, svfloat32_t, ++ z0 = svmad_n_f32_z (p0, z1, z0, 2), ++ z0 = svmad_z (p0, z1, z0, 2)) ++ ++/* ++** mad_2_f32_z_untied: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fmad z0\.s, p0/m, z2\.s, \1 ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** fmad z0\.s, p0/m, z1\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** fmla z0\.s, p0/m, z1\.s, z2\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mad_2_f32_z_untied, svfloat32_t, ++ z0 = svmad_n_f32_z (p0, z1, z2, 2), ++ z0 = svmad_z (p0, z1, z2, 2)) ++ ++/* ++** mad_f32_x_tied1: ++** fmad z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mad_f32_x_tied1, svfloat32_t, ++ z0 = svmad_f32_x (p0, z0, z1, z2), ++ z0 = svmad_x (p0, z0, z1, z2)) ++ ++/* ++** mad_f32_x_tied2: ++** fmad z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mad_f32_x_tied2, svfloat32_t, ++ z0 = svmad_f32_x (p0, z1, z0, z2), ++ z0 = svmad_x (p0, z1, z0, z2)) ++ ++/* ++** mad_f32_x_tied3: ++** fmla z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mad_f32_x_tied3, svfloat32_t, ++ z0 = svmad_f32_x (p0, z1, z2, z0), ++ z0 = svmad_x (p0, z1, z2, z0)) ++ ++/* ++** mad_f32_x_untied: ++** ( ++** movprfx z0, z1 ++** fmad z0\.s, p0/m, z2\.s, z3\.s ++** | ++** movprfx z0, z2 ++** fmad z0\.s, p0/m, z1\.s, z3\.s ++** | ++** movprfx z0, z3 ++** fmla z0\.s, p0/m, z1\.s, z2\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mad_f32_x_untied, svfloat32_t, ++ z0 = svmad_f32_x (p0, z1, z2, z3), ++ z0 = svmad_x (p0, z1, z2, z3)) ++ ++/* ++** mad_s4_f32_x_tied1: ++** mov (z[0-9]+\.s), s4 ++** fmad z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mad_s4_f32_x_tied1, svfloat32_t, float, ++ z0 = svmad_n_f32_x (p0, z0, z1, d4), ++ z0 = svmad_x (p0, z0, z1, d4)) ++ ++/* ++** mad_s4_f32_x_tied2: ++** mov (z[0-9]+\.s), s4 ++** fmad z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mad_s4_f32_x_tied2, svfloat32_t, float, ++ z0 = svmad_n_f32_x (p0, z1, z0, d4), ++ z0 = svmad_x (p0, z1, z0, d4)) ++ ++/* ++** mad_s4_f32_x_untied: { xfail *-*-* } ++** mov z0\.s, s4 ++** fmla z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_ZD (mad_s4_f32_x_untied, svfloat32_t, float, ++ z0 = svmad_n_f32_x (p0, z1, z2, d4), ++ z0 = svmad_x (p0, z1, z2, d4)) ++ ++/* ++** mad_2_f32_x_tied1: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** fmad z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_2_f32_x_tied1, svfloat32_t, ++ z0 = svmad_n_f32_x (p0, z0, z1, 2), ++ z0 = svmad_x (p0, z0, z1, 2)) ++ ++/* ++** mad_2_f32_x_tied2: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** fmad z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_2_f32_x_tied2, svfloat32_t, ++ z0 = svmad_n_f32_x (p0, z1, z0, 2), ++ z0 = svmad_x (p0, z1, z0, 2)) ++ ++/* ++** mad_2_f32_x_untied: ++** fmov z0\.s, #2\.0(?:e\+0)? ++** fmla z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mad_2_f32_x_untied, svfloat32_t, ++ z0 = svmad_n_f32_x (p0, z1, z2, 2), ++ z0 = svmad_x (p0, z1, z2, 2)) ++ ++/* ++** ptrue_mad_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mad_f32_x_tied1, svfloat32_t, ++ z0 = svmad_f32_x (svptrue_b32 (), z0, z1, z2), ++ z0 = svmad_x (svptrue_b32 (), z0, z1, z2)) ++ ++/* ++** ptrue_mad_f32_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mad_f32_x_tied2, svfloat32_t, ++ z0 = svmad_f32_x (svptrue_b32 (), z1, z0, z2), ++ z0 = svmad_x (svptrue_b32 (), z1, z0, z2)) ++ ++/* ++** ptrue_mad_f32_x_tied3: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mad_f32_x_tied3, svfloat32_t, ++ z0 = svmad_f32_x (svptrue_b32 (), z1, z2, z0), ++ z0 = svmad_x (svptrue_b32 (), z1, z2, z0)) ++ ++/* ++** ptrue_mad_f32_x_untied: ++** ... 
++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mad_f32_x_untied, svfloat32_t, ++ z0 = svmad_f32_x (svptrue_b32 (), z1, z2, z3), ++ z0 = svmad_x (svptrue_b32 (), z1, z2, z3)) ++ ++/* ++** ptrue_mad_2_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mad_2_f32_x_tied1, svfloat32_t, ++ z0 = svmad_n_f32_x (svptrue_b32 (), z0, z1, 2), ++ z0 = svmad_x (svptrue_b32 (), z0, z1, 2)) ++ ++/* ++** ptrue_mad_2_f32_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mad_2_f32_x_tied2, svfloat32_t, ++ z0 = svmad_n_f32_x (svptrue_b32 (), z1, z0, 2), ++ z0 = svmad_x (svptrue_b32 (), z1, z0, 2)) ++ ++/* ++** ptrue_mad_2_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mad_2_f32_x_untied, svfloat32_t, ++ z0 = svmad_n_f32_x (svptrue_b32 (), z1, z2, 2), ++ z0 = svmad_x (svptrue_b32 (), z1, z2, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_f64.c +new file mode 100644 +index 000000000..978281295 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_f64.c +@@ -0,0 +1,398 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mad_f64_m_tied1: ++** fmad z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mad_f64_m_tied1, svfloat64_t, ++ z0 = svmad_f64_m (p0, z0, z1, z2), ++ z0 = svmad_m (p0, z0, z1, z2)) ++ ++/* ++** mad_f64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fmad z0\.d, p0/m, \1, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mad_f64_m_tied2, svfloat64_t, ++ z0 = svmad_f64_m (p0, z1, z0, z2), ++ z0 = svmad_m (p0, z1, z0, z2)) ++ ++/* ++** mad_f64_m_tied3: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fmad z0\.d, p0/m, z2\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_f64_m_tied3, svfloat64_t, ++ z0 = svmad_f64_m (p0, z1, z2, z0), ++ z0 = svmad_m (p0, z1, z2, z0)) ++ ++/* ++** mad_f64_m_untied: ++** movprfx z0, z1 ++** fmad z0\.d, p0/m, z2\.d, z3\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mad_f64_m_untied, svfloat64_t, ++ z0 = svmad_f64_m (p0, z1, z2, z3), ++ z0 = svmad_m (p0, z1, z2, z3)) ++ ++/* ++** mad_d4_f64_m_tied1: ++** mov (z[0-9]+\.d), d4 ++** fmad z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mad_d4_f64_m_tied1, svfloat64_t, double, ++ z0 = svmad_n_f64_m (p0, z0, z1, d4), ++ z0 = svmad_m (p0, z0, z1, d4)) ++ ++/* ++** mad_d4_f64_m_untied: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0, z1 ++** fmad z0\.d, p0/m, z2\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mad_d4_f64_m_untied, svfloat64_t, double, ++ z0 = svmad_n_f64_m (p0, z1, z2, d4), ++ z0 = svmad_m (p0, z1, z2, d4)) ++ ++/* ++** mad_2_f64_m_tied1: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** fmad z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_2_f64_m_tied1, svfloat64_t, ++ z0 = svmad_n_f64_m (p0, z0, z1, 2), ++ z0 = svmad_m (p0, z0, z1, 2)) ++ ++/* ++** mad_2_f64_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
++** movprfx z0, z1 ++** fmad z0\.d, p0/m, z2\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_2_f64_m_untied, svfloat64_t, ++ z0 = svmad_n_f64_m (p0, z1, z2, 2), ++ z0 = svmad_m (p0, z1, z2, 2)) ++ ++/* ++** mad_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fmad z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mad_f64_z_tied1, svfloat64_t, ++ z0 = svmad_f64_z (p0, z0, z1, z2), ++ z0 = svmad_z (p0, z0, z1, z2)) ++ ++/* ++** mad_f64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** fmad z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mad_f64_z_tied2, svfloat64_t, ++ z0 = svmad_f64_z (p0, z1, z0, z2), ++ z0 = svmad_z (p0, z1, z0, z2)) ++ ++/* ++** mad_f64_z_tied3: ++** movprfx z0\.d, p0/z, z0\.d ++** fmla z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mad_f64_z_tied3, svfloat64_t, ++ z0 = svmad_f64_z (p0, z1, z2, z0), ++ z0 = svmad_z (p0, z1, z2, z0)) ++ ++/* ++** mad_f64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fmad z0\.d, p0/m, z2\.d, z3\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** fmad z0\.d, p0/m, z1\.d, z3\.d ++** | ++** movprfx z0\.d, p0/z, z3\.d ++** fmla z0\.d, p0/m, z1\.d, z2\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mad_f64_z_untied, svfloat64_t, ++ z0 = svmad_f64_z (p0, z1, z2, z3), ++ z0 = svmad_z (p0, z1, z2, z3)) ++ ++/* ++** mad_d4_f64_z_tied1: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0\.d, p0/z, z0\.d ++** fmad z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mad_d4_f64_z_tied1, svfloat64_t, double, ++ z0 = svmad_n_f64_z (p0, z0, z1, d4), ++ z0 = svmad_z (p0, z0, z1, d4)) ++ ++/* ++** mad_d4_f64_z_tied2: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0\.d, p0/z, z0\.d ++** fmad z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mad_d4_f64_z_tied2, svfloat64_t, double, ++ z0 = svmad_n_f64_z (p0, z1, z0, d4), ++ z0 = svmad_z (p0, z1, z0, d4)) ++ ++/* ++** mad_d4_f64_z_untied: ++** mov (z[0-9]+\.d), d4 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fmad z0\.d, p0/m, z2\.d, \1 ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** fmad z0\.d, p0/m, z1\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** fmla z0\.d, p0/m, z1\.d, z2\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (mad_d4_f64_z_untied, svfloat64_t, double, ++ z0 = svmad_n_f64_z (p0, z1, z2, d4), ++ z0 = svmad_z (p0, z1, z2, d4)) ++ ++/* ++** mad_2_f64_z_tied1: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** movprfx z0\.d, p0/z, z0\.d ++** fmad z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_2_f64_z_tied1, svfloat64_t, ++ z0 = svmad_n_f64_z (p0, z0, z1, 2), ++ z0 = svmad_z (p0, z0, z1, 2)) ++ ++/* ++** mad_2_f64_z_tied2: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** movprfx z0\.d, p0/z, z0\.d ++** fmad z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_2_f64_z_tied2, svfloat64_t, ++ z0 = svmad_n_f64_z (p0, z1, z0, 2), ++ z0 = svmad_z (p0, z1, z0, 2)) ++ ++/* ++** mad_2_f64_z_untied: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fmad z0\.d, p0/m, z2\.d, \1 ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** fmad z0\.d, p0/m, z1\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** fmla z0\.d, p0/m, z1\.d, z2\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mad_2_f64_z_untied, svfloat64_t, ++ z0 = svmad_n_f64_z (p0, z1, z2, 2), ++ z0 = svmad_z (p0, z1, z2, 2)) ++ ++/* ++** mad_f64_x_tied1: ++** fmad z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mad_f64_x_tied1, svfloat64_t, ++ z0 = svmad_f64_x (p0, z0, z1, z2), ++ z0 = svmad_x (p0, z0, z1, z2)) ++ ++/* ++** mad_f64_x_tied2: ++** fmad z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mad_f64_x_tied2, svfloat64_t, ++ z0 = svmad_f64_x (p0, z1, z0, z2), ++ z0 = svmad_x (p0, z1, z0, z2)) ++ ++/* ++** mad_f64_x_tied3: ++** fmla z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mad_f64_x_tied3, svfloat64_t, ++ z0 = svmad_f64_x (p0, z1, z2, z0), ++ z0 = svmad_x (p0, z1, z2, z0)) ++ ++/* ++** mad_f64_x_untied: ++** ( ++** movprfx z0, z1 ++** fmad z0\.d, p0/m, z2\.d, z3\.d ++** | ++** movprfx z0, z2 ++** fmad z0\.d, p0/m, z1\.d, z3\.d ++** | ++** movprfx z0, z3 ++** fmla z0\.d, p0/m, z1\.d, z2\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mad_f64_x_untied, svfloat64_t, ++ z0 = svmad_f64_x (p0, z1, z2, z3), ++ z0 = svmad_x (p0, z1, z2, z3)) ++ ++/* ++** mad_d4_f64_x_tied1: ++** mov (z[0-9]+\.d), d4 ++** fmad z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mad_d4_f64_x_tied1, svfloat64_t, double, ++ z0 = svmad_n_f64_x (p0, z0, z1, d4), ++ z0 = svmad_x (p0, z0, z1, d4)) ++ ++/* ++** mad_d4_f64_x_tied2: ++** mov (z[0-9]+\.d), d4 ++** fmad z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mad_d4_f64_x_tied2, svfloat64_t, double, ++ z0 = svmad_n_f64_x (p0, z1, z0, d4), ++ z0 = svmad_x (p0, z1, z0, d4)) ++ ++/* ++** mad_d4_f64_x_untied: { xfail *-*-* } ++** mov z0\.d, d4 ++** fmla z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_ZD (mad_d4_f64_x_untied, svfloat64_t, double, ++ z0 = svmad_n_f64_x (p0, z1, z2, d4), ++ z0 = svmad_x (p0, z1, z2, d4)) ++ ++/* ++** mad_2_f64_x_tied1: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** fmad z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_2_f64_x_tied1, svfloat64_t, ++ z0 = svmad_n_f64_x (p0, z0, z1, 2), ++ z0 = svmad_x (p0, z0, z1, 2)) ++ ++/* ++** mad_2_f64_x_tied2: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** fmad z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_2_f64_x_tied2, svfloat64_t, ++ z0 = svmad_n_f64_x (p0, z1, z0, 2), ++ z0 = svmad_x (p0, z1, z0, 2)) ++ ++/* ++** mad_2_f64_x_untied: ++** fmov z0\.d, #2\.0(?:e\+0)? ++** fmla z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mad_2_f64_x_untied, svfloat64_t, ++ z0 = svmad_n_f64_x (p0, z1, z2, 2), ++ z0 = svmad_x (p0, z1, z2, 2)) ++ ++/* ++** ptrue_mad_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mad_f64_x_tied1, svfloat64_t, ++ z0 = svmad_f64_x (svptrue_b64 (), z0, z1, z2), ++ z0 = svmad_x (svptrue_b64 (), z0, z1, z2)) ++ ++/* ++** ptrue_mad_f64_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mad_f64_x_tied2, svfloat64_t, ++ z0 = svmad_f64_x (svptrue_b64 (), z1, z0, z2), ++ z0 = svmad_x (svptrue_b64 (), z1, z0, z2)) ++ ++/* ++** ptrue_mad_f64_x_tied3: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mad_f64_x_tied3, svfloat64_t, ++ z0 = svmad_f64_x (svptrue_b64 (), z1, z2, z0), ++ z0 = svmad_x (svptrue_b64 (), z1, z2, z0)) ++ ++/* ++** ptrue_mad_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mad_f64_x_untied, svfloat64_t, ++ z0 = svmad_f64_x (svptrue_b64 (), z1, z2, z3), ++ z0 = svmad_x (svptrue_b64 (), z1, z2, z3)) ++ ++/* ++** ptrue_mad_2_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mad_2_f64_x_tied1, svfloat64_t, ++ z0 = svmad_n_f64_x (svptrue_b64 (), z0, z1, 2), ++ z0 = svmad_x (svptrue_b64 (), z0, z1, 2)) ++ ++/* ++** ptrue_mad_2_f64_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mad_2_f64_x_tied2, svfloat64_t, ++ z0 = svmad_n_f64_x (svptrue_b64 (), z1, z0, 2), ++ z0 = svmad_x (svptrue_b64 (), z1, z0, 2)) ++ ++/* ++** ptrue_mad_2_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mad_2_f64_x_untied, svfloat64_t, ++ z0 = svmad_n_f64_x (svptrue_b64 (), z1, z2, 2), ++ z0 = svmad_x (svptrue_b64 (), z1, z2, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_s16.c +new file mode 100644 +index 000000000..02a6d4588 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_s16.c +@@ -0,0 +1,321 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mad_s16_m_tied1: ++** mad z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mad_s16_m_tied1, svint16_t, ++ z0 = svmad_s16_m (p0, z0, z1, z2), ++ z0 = svmad_m (p0, z0, z1, z2)) ++ ++/* ++** mad_s16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** mad z0\.h, p0/m, \1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mad_s16_m_tied2, svint16_t, ++ z0 = svmad_s16_m (p0, z1, z0, z2), ++ z0 = svmad_m (p0, z1, z0, z2)) ++ ++/* ++** mad_s16_m_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** mad z0\.h, p0/m, z2\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mad_s16_m_tied3, svint16_t, ++ z0 = svmad_s16_m (p0, z1, z2, z0), ++ z0 = svmad_m (p0, z1, z2, z0)) ++ ++/* ++** mad_s16_m_untied: ++** movprfx z0, z1 ++** mad z0\.h, p0/m, z2\.h, z3\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mad_s16_m_untied, svint16_t, ++ z0 = svmad_s16_m (p0, z1, z2, z3), ++ z0 = svmad_m (p0, z1, z2, z3)) ++ ++/* ++** mad_w0_s16_m_tied1: ++** mov (z[0-9]+\.h), w0 ++** mad z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_w0_s16_m_tied1, svint16_t, int16_t, ++ z0 = svmad_n_s16_m (p0, z0, z1, x0), ++ z0 = svmad_m (p0, z0, z1, x0)) ++ ++/* ++** mad_w0_s16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), w0 ++** movprfx z0, z1 ++** mad z0\.h, p0/m, z2\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_w0_s16_m_untied, svint16_t, int16_t, ++ z0 = svmad_n_s16_m (p0, z1, z2, x0), ++ z0 = svmad_m (p0, z1, z2, x0)) ++ ++/* ++** mad_11_s16_m_tied1: ++** mov (z[0-9]+\.h), #11 ++** mad z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_s16_m_tied1, svint16_t, ++ z0 = svmad_n_s16_m (p0, z0, z1, 11), ++ z0 = svmad_m (p0, z0, z1, 11)) ++ ++/* ++** mad_11_s16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), #11 ++** movprfx z0, z1 ++** mad z0\.h, p0/m, z2\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_s16_m_untied, svint16_t, ++ z0 = svmad_n_s16_m (p0, z1, z2, 11), ++ z0 = svmad_m (p0, z1, z2, 11)) ++ ++/* ++** mad_s16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** mad z0\.h, p0/m, z1\.h, z2\.h 
++** ret ++*/ ++TEST_UNIFORM_Z (mad_s16_z_tied1, svint16_t, ++ z0 = svmad_s16_z (p0, z0, z1, z2), ++ z0 = svmad_z (p0, z0, z1, z2)) ++ ++/* ++** mad_s16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** mad z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mad_s16_z_tied2, svint16_t, ++ z0 = svmad_s16_z (p0, z1, z0, z2), ++ z0 = svmad_z (p0, z1, z0, z2)) ++ ++/* ++** mad_s16_z_tied3: ++** movprfx z0\.h, p0/z, z0\.h ++** mla z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mad_s16_z_tied3, svint16_t, ++ z0 = svmad_s16_z (p0, z1, z2, z0), ++ z0 = svmad_z (p0, z1, z2, z0)) ++ ++/* ++** mad_s16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** mad z0\.h, p0/m, z2\.h, z3\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** mad z0\.h, p0/m, z1\.h, z3\.h ++** | ++** movprfx z0\.h, p0/z, z3\.h ++** mla z0\.h, p0/m, z1\.h, z2\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mad_s16_z_untied, svint16_t, ++ z0 = svmad_s16_z (p0, z1, z2, z3), ++ z0 = svmad_z (p0, z1, z2, z3)) ++ ++/* ++** mad_w0_s16_z_tied1: ++** mov (z[0-9]+\.h), w0 ++** movprfx z0\.h, p0/z, z0\.h ++** mad z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_w0_s16_z_tied1, svint16_t, int16_t, ++ z0 = svmad_n_s16_z (p0, z0, z1, x0), ++ z0 = svmad_z (p0, z0, z1, x0)) ++ ++/* ++** mad_w0_s16_z_tied2: ++** mov (z[0-9]+\.h), w0 ++** movprfx z0\.h, p0/z, z0\.h ++** mad z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_w0_s16_z_tied2, svint16_t, int16_t, ++ z0 = svmad_n_s16_z (p0, z1, z0, x0), ++ z0 = svmad_z (p0, z1, z0, x0)) ++ ++/* ++** mad_w0_s16_z_untied: ++** mov (z[0-9]+\.h), w0 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** mad z0\.h, p0/m, z2\.h, \1 ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** mad z0\.h, p0/m, z1\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** mla z0\.h, p0/m, z1\.h, z2\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_w0_s16_z_untied, svint16_t, int16_t, ++ z0 = svmad_n_s16_z (p0, z1, z2, x0), ++ z0 = svmad_z (p0, z1, z2, x0)) ++ ++/* ++** mad_11_s16_z_tied1: ++** mov (z[0-9]+\.h), #11 ++** movprfx z0\.h, p0/z, z0\.h ++** mad z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_s16_z_tied1, svint16_t, ++ z0 = svmad_n_s16_z (p0, z0, z1, 11), ++ z0 = svmad_z (p0, z0, z1, 11)) ++ ++/* ++** mad_11_s16_z_tied2: ++** mov (z[0-9]+\.h), #11 ++** movprfx z0\.h, p0/z, z0\.h ++** mad z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_s16_z_tied2, svint16_t, ++ z0 = svmad_n_s16_z (p0, z1, z0, 11), ++ z0 = svmad_z (p0, z1, z0, 11)) ++ ++/* ++** mad_11_s16_z_untied: ++** mov (z[0-9]+\.h), #11 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** mad z0\.h, p0/m, z2\.h, \1 ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** mad z0\.h, p0/m, z1\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** mla z0\.h, p0/m, z1\.h, z2\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_s16_z_untied, svint16_t, ++ z0 = svmad_n_s16_z (p0, z1, z2, 11), ++ z0 = svmad_z (p0, z1, z2, 11)) ++ ++/* ++** mad_s16_x_tied1: ++** mad z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mad_s16_x_tied1, svint16_t, ++ z0 = svmad_s16_x (p0, z0, z1, z2), ++ z0 = svmad_x (p0, z0, z1, z2)) ++ ++/* ++** mad_s16_x_tied2: ++** mad z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mad_s16_x_tied2, svint16_t, ++ z0 = svmad_s16_x (p0, z1, z0, z2), ++ z0 = svmad_x (p0, z1, z0, z2)) ++ ++/* ++** mad_s16_x_tied3: ++** mla z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mad_s16_x_tied3, svint16_t, ++ z0 = svmad_s16_x (p0, z1, z2, z0), ++ z0 = svmad_x (p0, z1, z2, z0)) ++ ++/* ++** mad_s16_x_untied: ++** ( ++** movprfx z0, z1 ++** mad 
z0\.h, p0/m, z2\.h, z3\.h ++** | ++** movprfx z0, z2 ++** mad z0\.h, p0/m, z1\.h, z3\.h ++** | ++** movprfx z0, z3 ++** mla z0\.h, p0/m, z1\.h, z2\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mad_s16_x_untied, svint16_t, ++ z0 = svmad_s16_x (p0, z1, z2, z3), ++ z0 = svmad_x (p0, z1, z2, z3)) ++ ++/* ++** mad_w0_s16_x_tied1: ++** mov (z[0-9]+\.h), w0 ++** mad z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_w0_s16_x_tied1, svint16_t, int16_t, ++ z0 = svmad_n_s16_x (p0, z0, z1, x0), ++ z0 = svmad_x (p0, z0, z1, x0)) ++ ++/* ++** mad_w0_s16_x_tied2: ++** mov (z[0-9]+\.h), w0 ++** mad z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_w0_s16_x_tied2, svint16_t, int16_t, ++ z0 = svmad_n_s16_x (p0, z1, z0, x0), ++ z0 = svmad_x (p0, z1, z0, x0)) ++ ++/* ++** mad_w0_s16_x_untied: ++** mov z0\.h, w0 ++** mla z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_w0_s16_x_untied, svint16_t, int16_t, ++ z0 = svmad_n_s16_x (p0, z1, z2, x0), ++ z0 = svmad_x (p0, z1, z2, x0)) ++ ++/* ++** mad_11_s16_x_tied1: ++** mov (z[0-9]+\.h), #11 ++** mad z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_s16_x_tied1, svint16_t, ++ z0 = svmad_n_s16_x (p0, z0, z1, 11), ++ z0 = svmad_x (p0, z0, z1, 11)) ++ ++/* ++** mad_11_s16_x_tied2: ++** mov (z[0-9]+\.h), #11 ++** mad z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_s16_x_tied2, svint16_t, ++ z0 = svmad_n_s16_x (p0, z1, z0, 11), ++ z0 = svmad_x (p0, z1, z0, 11)) ++ ++/* ++** mad_11_s16_x_untied: ++** mov z0\.h, #11 ++** mla z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_s16_x_untied, svint16_t, ++ z0 = svmad_n_s16_x (p0, z1, z2, 11), ++ z0 = svmad_x (p0, z1, z2, 11)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_s32.c +new file mode 100644 +index 000000000..d676a0c11 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_s32.c +@@ -0,0 +1,321 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mad_s32_m_tied1: ++** mad z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mad_s32_m_tied1, svint32_t, ++ z0 = svmad_s32_m (p0, z0, z1, z2), ++ z0 = svmad_m (p0, z0, z1, z2)) ++ ++/* ++** mad_s32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** mad z0\.s, p0/m, \1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mad_s32_m_tied2, svint32_t, ++ z0 = svmad_s32_m (p0, z1, z0, z2), ++ z0 = svmad_m (p0, z1, z0, z2)) ++ ++/* ++** mad_s32_m_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** mad z0\.s, p0/m, z2\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mad_s32_m_tied3, svint32_t, ++ z0 = svmad_s32_m (p0, z1, z2, z0), ++ z0 = svmad_m (p0, z1, z2, z0)) ++ ++/* ++** mad_s32_m_untied: ++** movprfx z0, z1 ++** mad z0\.s, p0/m, z2\.s, z3\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mad_s32_m_untied, svint32_t, ++ z0 = svmad_s32_m (p0, z1, z2, z3), ++ z0 = svmad_m (p0, z1, z2, z3)) ++ ++/* ++** mad_w0_s32_m_tied1: ++** mov (z[0-9]+\.s), w0 ++** mad z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_w0_s32_m_tied1, svint32_t, int32_t, ++ z0 = svmad_n_s32_m (p0, z0, z1, x0), ++ z0 = svmad_m (p0, z0, z1, x0)) ++ ++/* ++** mad_w0_s32_m_untied: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0, z1 ++** mad z0\.s, p0/m, z2\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_w0_s32_m_untied, svint32_t, int32_t, ++ z0 = svmad_n_s32_m (p0, z1, z2, x0), ++ z0 = svmad_m (p0, z1, z2, x0)) ++ ++/* ++** mad_11_s32_m_tied1: ++** mov (z[0-9]+\.s), #11 ++** mad z0\.s, p0/m, 
z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_s32_m_tied1, svint32_t, ++ z0 = svmad_n_s32_m (p0, z0, z1, 11), ++ z0 = svmad_m (p0, z0, z1, 11)) ++ ++/* ++** mad_11_s32_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.s), #11 ++** movprfx z0, z1 ++** mad z0\.s, p0/m, z2\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_s32_m_untied, svint32_t, ++ z0 = svmad_n_s32_m (p0, z1, z2, 11), ++ z0 = svmad_m (p0, z1, z2, 11)) ++ ++/* ++** mad_s32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** mad z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mad_s32_z_tied1, svint32_t, ++ z0 = svmad_s32_z (p0, z0, z1, z2), ++ z0 = svmad_z (p0, z0, z1, z2)) ++ ++/* ++** mad_s32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** mad z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mad_s32_z_tied2, svint32_t, ++ z0 = svmad_s32_z (p0, z1, z0, z2), ++ z0 = svmad_z (p0, z1, z0, z2)) ++ ++/* ++** mad_s32_z_tied3: ++** movprfx z0\.s, p0/z, z0\.s ++** mla z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mad_s32_z_tied3, svint32_t, ++ z0 = svmad_s32_z (p0, z1, z2, z0), ++ z0 = svmad_z (p0, z1, z2, z0)) ++ ++/* ++** mad_s32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** mad z0\.s, p0/m, z2\.s, z3\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** mad z0\.s, p0/m, z1\.s, z3\.s ++** | ++** movprfx z0\.s, p0/z, z3\.s ++** mla z0\.s, p0/m, z1\.s, z2\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mad_s32_z_untied, svint32_t, ++ z0 = svmad_s32_z (p0, z1, z2, z3), ++ z0 = svmad_z (p0, z1, z2, z3)) ++ ++/* ++** mad_w0_s32_z_tied1: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0\.s, p0/z, z0\.s ++** mad z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_w0_s32_z_tied1, svint32_t, int32_t, ++ z0 = svmad_n_s32_z (p0, z0, z1, x0), ++ z0 = svmad_z (p0, z0, z1, x0)) ++ ++/* ++** mad_w0_s32_z_tied2: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0\.s, p0/z, z0\.s ++** mad z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_w0_s32_z_tied2, svint32_t, int32_t, ++ z0 = svmad_n_s32_z (p0, z1, z0, x0), ++ z0 = svmad_z (p0, z1, z0, x0)) ++ ++/* ++** mad_w0_s32_z_untied: ++** mov (z[0-9]+\.s), w0 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** mad z0\.s, p0/m, z2\.s, \1 ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** mad z0\.s, p0/m, z1\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** mla z0\.s, p0/m, z1\.s, z2\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_w0_s32_z_untied, svint32_t, int32_t, ++ z0 = svmad_n_s32_z (p0, z1, z2, x0), ++ z0 = svmad_z (p0, z1, z2, x0)) ++ ++/* ++** mad_11_s32_z_tied1: ++** mov (z[0-9]+\.s), #11 ++** movprfx z0\.s, p0/z, z0\.s ++** mad z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_s32_z_tied1, svint32_t, ++ z0 = svmad_n_s32_z (p0, z0, z1, 11), ++ z0 = svmad_z (p0, z0, z1, 11)) ++ ++/* ++** mad_11_s32_z_tied2: ++** mov (z[0-9]+\.s), #11 ++** movprfx z0\.s, p0/z, z0\.s ++** mad z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_s32_z_tied2, svint32_t, ++ z0 = svmad_n_s32_z (p0, z1, z0, 11), ++ z0 = svmad_z (p0, z1, z0, 11)) ++ ++/* ++** mad_11_s32_z_untied: ++** mov (z[0-9]+\.s), #11 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** mad z0\.s, p0/m, z2\.s, \1 ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** mad z0\.s, p0/m, z1\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** mla z0\.s, p0/m, z1\.s, z2\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_s32_z_untied, svint32_t, ++ z0 = svmad_n_s32_z (p0, z1, z2, 11), ++ z0 = svmad_z (p0, z1, z2, 11)) ++ ++/* ++** mad_s32_x_tied1: ++** mad z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mad_s32_x_tied1, svint32_t, ++ z0 = svmad_s32_x (p0, 
z0, z1, z2), ++ z0 = svmad_x (p0, z0, z1, z2)) ++ ++/* ++** mad_s32_x_tied2: ++** mad z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mad_s32_x_tied2, svint32_t, ++ z0 = svmad_s32_x (p0, z1, z0, z2), ++ z0 = svmad_x (p0, z1, z0, z2)) ++ ++/* ++** mad_s32_x_tied3: ++** mla z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mad_s32_x_tied3, svint32_t, ++ z0 = svmad_s32_x (p0, z1, z2, z0), ++ z0 = svmad_x (p0, z1, z2, z0)) ++ ++/* ++** mad_s32_x_untied: ++** ( ++** movprfx z0, z1 ++** mad z0\.s, p0/m, z2\.s, z3\.s ++** | ++** movprfx z0, z2 ++** mad z0\.s, p0/m, z1\.s, z3\.s ++** | ++** movprfx z0, z3 ++** mla z0\.s, p0/m, z1\.s, z2\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mad_s32_x_untied, svint32_t, ++ z0 = svmad_s32_x (p0, z1, z2, z3), ++ z0 = svmad_x (p0, z1, z2, z3)) ++ ++/* ++** mad_w0_s32_x_tied1: ++** mov (z[0-9]+\.s), w0 ++** mad z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_w0_s32_x_tied1, svint32_t, int32_t, ++ z0 = svmad_n_s32_x (p0, z0, z1, x0), ++ z0 = svmad_x (p0, z0, z1, x0)) ++ ++/* ++** mad_w0_s32_x_tied2: ++** mov (z[0-9]+\.s), w0 ++** mad z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_w0_s32_x_tied2, svint32_t, int32_t, ++ z0 = svmad_n_s32_x (p0, z1, z0, x0), ++ z0 = svmad_x (p0, z1, z0, x0)) ++ ++/* ++** mad_w0_s32_x_untied: ++** mov z0\.s, w0 ++** mla z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_w0_s32_x_untied, svint32_t, int32_t, ++ z0 = svmad_n_s32_x (p0, z1, z2, x0), ++ z0 = svmad_x (p0, z1, z2, x0)) ++ ++/* ++** mad_11_s32_x_tied1: ++** mov (z[0-9]+\.s), #11 ++** mad z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_s32_x_tied1, svint32_t, ++ z0 = svmad_n_s32_x (p0, z0, z1, 11), ++ z0 = svmad_x (p0, z0, z1, 11)) ++ ++/* ++** mad_11_s32_x_tied2: ++** mov (z[0-9]+\.s), #11 ++** mad z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_s32_x_tied2, svint32_t, ++ z0 = svmad_n_s32_x (p0, z1, z0, 11), ++ z0 = svmad_x (p0, z1, z0, 11)) ++ ++/* ++** mad_11_s32_x_untied: ++** mov z0\.s, #11 ++** mla z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_s32_x_untied, svint32_t, ++ z0 = svmad_n_s32_x (p0, z1, z2, 11), ++ z0 = svmad_x (p0, z1, z2, 11)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_s64.c +new file mode 100644 +index 000000000..7aa017536 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_s64.c +@@ -0,0 +1,321 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mad_s64_m_tied1: ++** mad z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mad_s64_m_tied1, svint64_t, ++ z0 = svmad_s64_m (p0, z0, z1, z2), ++ z0 = svmad_m (p0, z0, z1, z2)) ++ ++/* ++** mad_s64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** mad z0\.d, p0/m, \1, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mad_s64_m_tied2, svint64_t, ++ z0 = svmad_s64_m (p0, z1, z0, z2), ++ z0 = svmad_m (p0, z1, z0, z2)) ++ ++/* ++** mad_s64_m_tied3: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** mad z0\.d, p0/m, z2\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_s64_m_tied3, svint64_t, ++ z0 = svmad_s64_m (p0, z1, z2, z0), ++ z0 = svmad_m (p0, z1, z2, z0)) ++ ++/* ++** mad_s64_m_untied: ++** movprfx z0, z1 ++** mad z0\.d, p0/m, z2\.d, z3\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mad_s64_m_untied, svint64_t, ++ z0 = svmad_s64_m (p0, z1, z2, z3), ++ z0 = svmad_m (p0, z1, z2, z3)) ++ ++/* ++** mad_x0_s64_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** mad z0\.d, p0/m, 
z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_x0_s64_m_tied1, svint64_t, int64_t, ++ z0 = svmad_n_s64_m (p0, z0, z1, x0), ++ z0 = svmad_m (p0, z0, z1, x0)) ++ ++/* ++** mad_x0_s64_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** mad z0\.d, p0/m, z2\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_x0_s64_m_untied, svint64_t, int64_t, ++ z0 = svmad_n_s64_m (p0, z1, z2, x0), ++ z0 = svmad_m (p0, z1, z2, x0)) ++ ++/* ++** mad_11_s64_m_tied1: ++** mov (z[0-9]+\.d), #11 ++** mad z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_s64_m_tied1, svint64_t, ++ z0 = svmad_n_s64_m (p0, z0, z1, 11), ++ z0 = svmad_m (p0, z0, z1, 11)) ++ ++/* ++** mad_11_s64_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #11 ++** movprfx z0, z1 ++** mad z0\.d, p0/m, z2\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_s64_m_untied, svint64_t, ++ z0 = svmad_n_s64_m (p0, z1, z2, 11), ++ z0 = svmad_m (p0, z1, z2, 11)) ++ ++/* ++** mad_s64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** mad z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mad_s64_z_tied1, svint64_t, ++ z0 = svmad_s64_z (p0, z0, z1, z2), ++ z0 = svmad_z (p0, z0, z1, z2)) ++ ++/* ++** mad_s64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** mad z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mad_s64_z_tied2, svint64_t, ++ z0 = svmad_s64_z (p0, z1, z0, z2), ++ z0 = svmad_z (p0, z1, z0, z2)) ++ ++/* ++** mad_s64_z_tied3: ++** movprfx z0\.d, p0/z, z0\.d ++** mla z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mad_s64_z_tied3, svint64_t, ++ z0 = svmad_s64_z (p0, z1, z2, z0), ++ z0 = svmad_z (p0, z1, z2, z0)) ++ ++/* ++** mad_s64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** mad z0\.d, p0/m, z2\.d, z3\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** mad z0\.d, p0/m, z1\.d, z3\.d ++** | ++** movprfx z0\.d, p0/z, z3\.d ++** mla z0\.d, p0/m, z1\.d, z2\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mad_s64_z_untied, svint64_t, ++ z0 = svmad_s64_z (p0, z1, z2, z3), ++ z0 = svmad_z (p0, z1, z2, z3)) ++ ++/* ++** mad_x0_s64_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z0\.d ++** mad z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_x0_s64_z_tied1, svint64_t, int64_t, ++ z0 = svmad_n_s64_z (p0, z0, z1, x0), ++ z0 = svmad_z (p0, z0, z1, x0)) ++ ++/* ++** mad_x0_s64_z_tied2: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z0\.d ++** mad z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_x0_s64_z_tied2, svint64_t, int64_t, ++ z0 = svmad_n_s64_z (p0, z1, z0, x0), ++ z0 = svmad_z (p0, z1, z0, x0)) ++ ++/* ++** mad_x0_s64_z_untied: ++** mov (z[0-9]+\.d), x0 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** mad z0\.d, p0/m, z2\.d, \1 ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** mad z0\.d, p0/m, z1\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** mla z0\.d, p0/m, z1\.d, z2\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_x0_s64_z_untied, svint64_t, int64_t, ++ z0 = svmad_n_s64_z (p0, z1, z2, x0), ++ z0 = svmad_z (p0, z1, z2, x0)) ++ ++/* ++** mad_11_s64_z_tied1: ++** mov (z[0-9]+\.d), #11 ++** movprfx z0\.d, p0/z, z0\.d ++** mad z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_s64_z_tied1, svint64_t, ++ z0 = svmad_n_s64_z (p0, z0, z1, 11), ++ z0 = svmad_z (p0, z0, z1, 11)) ++ ++/* ++** mad_11_s64_z_tied2: ++** mov (z[0-9]+\.d), #11 ++** movprfx z0\.d, p0/z, z0\.d ++** mad z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_s64_z_tied2, svint64_t, ++ z0 = svmad_n_s64_z (p0, z1, z0, 11), ++ z0 = svmad_z (p0, z1, z0, 11)) ++ ++/* ++** mad_11_s64_z_untied: ++** mov (z[0-9]+\.d), #11 ++** ( 
++** movprfx z0\.d, p0/z, z1\.d ++** mad z0\.d, p0/m, z2\.d, \1 ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** mad z0\.d, p0/m, z1\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** mla z0\.d, p0/m, z1\.d, z2\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_s64_z_untied, svint64_t, ++ z0 = svmad_n_s64_z (p0, z1, z2, 11), ++ z0 = svmad_z (p0, z1, z2, 11)) ++ ++/* ++** mad_s64_x_tied1: ++** mad z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mad_s64_x_tied1, svint64_t, ++ z0 = svmad_s64_x (p0, z0, z1, z2), ++ z0 = svmad_x (p0, z0, z1, z2)) ++ ++/* ++** mad_s64_x_tied2: ++** mad z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mad_s64_x_tied2, svint64_t, ++ z0 = svmad_s64_x (p0, z1, z0, z2), ++ z0 = svmad_x (p0, z1, z0, z2)) ++ ++/* ++** mad_s64_x_tied3: ++** mla z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mad_s64_x_tied3, svint64_t, ++ z0 = svmad_s64_x (p0, z1, z2, z0), ++ z0 = svmad_x (p0, z1, z2, z0)) ++ ++/* ++** mad_s64_x_untied: ++** ( ++** movprfx z0, z1 ++** mad z0\.d, p0/m, z2\.d, z3\.d ++** | ++** movprfx z0, z2 ++** mad z0\.d, p0/m, z1\.d, z3\.d ++** | ++** movprfx z0, z3 ++** mla z0\.d, p0/m, z1\.d, z2\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mad_s64_x_untied, svint64_t, ++ z0 = svmad_s64_x (p0, z1, z2, z3), ++ z0 = svmad_x (p0, z1, z2, z3)) ++ ++/* ++** mad_x0_s64_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** mad z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_x0_s64_x_tied1, svint64_t, int64_t, ++ z0 = svmad_n_s64_x (p0, z0, z1, x0), ++ z0 = svmad_x (p0, z0, z1, x0)) ++ ++/* ++** mad_x0_s64_x_tied2: ++** mov (z[0-9]+\.d), x0 ++** mad z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_x0_s64_x_tied2, svint64_t, int64_t, ++ z0 = svmad_n_s64_x (p0, z1, z0, x0), ++ z0 = svmad_x (p0, z1, z0, x0)) ++ ++/* ++** mad_x0_s64_x_untied: ++** mov z0\.d, x0 ++** mla z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_x0_s64_x_untied, svint64_t, int64_t, ++ z0 = svmad_n_s64_x (p0, z1, z2, x0), ++ z0 = svmad_x (p0, z1, z2, x0)) ++ ++/* ++** mad_11_s64_x_tied1: ++** mov (z[0-9]+\.d), #11 ++** mad z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_s64_x_tied1, svint64_t, ++ z0 = svmad_n_s64_x (p0, z0, z1, 11), ++ z0 = svmad_x (p0, z0, z1, 11)) ++ ++/* ++** mad_11_s64_x_tied2: ++** mov (z[0-9]+\.d), #11 ++** mad z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_s64_x_tied2, svint64_t, ++ z0 = svmad_n_s64_x (p0, z1, z0, 11), ++ z0 = svmad_x (p0, z1, z0, 11)) ++ ++/* ++** mad_11_s64_x_untied: ++** mov z0\.d, #11 ++** mla z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_s64_x_untied, svint64_t, ++ z0 = svmad_n_s64_x (p0, z1, z2, 11), ++ z0 = svmad_x (p0, z1, z2, 11)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_s8.c +new file mode 100644 +index 000000000..90d712686 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_s8.c +@@ -0,0 +1,321 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mad_s8_m_tied1: ++** mad z0\.b, p0/m, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mad_s8_m_tied1, svint8_t, ++ z0 = svmad_s8_m (p0, z0, z1, z2), ++ z0 = svmad_m (p0, z0, z1, z2)) ++ ++/* ++** mad_s8_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** mad z0\.b, p0/m, \1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mad_s8_m_tied2, svint8_t, ++ z0 = svmad_s8_m (p0, z1, z0, z2), ++ z0 = svmad_m (p0, z1, z0, z2)) ++ ++/* ++** mad_s8_m_tied3: ++** mov 
(z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** mad z0\.b, p0/m, z2\.b, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mad_s8_m_tied3, svint8_t, ++ z0 = svmad_s8_m (p0, z1, z2, z0), ++ z0 = svmad_m (p0, z1, z2, z0)) ++ ++/* ++** mad_s8_m_untied: ++** movprfx z0, z1 ++** mad z0\.b, p0/m, z2\.b, z3\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mad_s8_m_untied, svint8_t, ++ z0 = svmad_s8_m (p0, z1, z2, z3), ++ z0 = svmad_m (p0, z1, z2, z3)) ++ ++/* ++** mad_w0_s8_m_tied1: ++** mov (z[0-9]+\.b), w0 ++** mad z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_w0_s8_m_tied1, svint8_t, int8_t, ++ z0 = svmad_n_s8_m (p0, z0, z1, x0), ++ z0 = svmad_m (p0, z0, z1, x0)) ++ ++/* ++** mad_w0_s8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), w0 ++** movprfx z0, z1 ++** mad z0\.b, p0/m, z2\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_w0_s8_m_untied, svint8_t, int8_t, ++ z0 = svmad_n_s8_m (p0, z1, z2, x0), ++ z0 = svmad_m (p0, z1, z2, x0)) ++ ++/* ++** mad_11_s8_m_tied1: ++** mov (z[0-9]+\.b), #11 ++** mad z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_s8_m_tied1, svint8_t, ++ z0 = svmad_n_s8_m (p0, z0, z1, 11), ++ z0 = svmad_m (p0, z0, z1, 11)) ++ ++/* ++** mad_11_s8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), #11 ++** movprfx z0, z1 ++** mad z0\.b, p0/m, z2\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_s8_m_untied, svint8_t, ++ z0 = svmad_n_s8_m (p0, z1, z2, 11), ++ z0 = svmad_m (p0, z1, z2, 11)) ++ ++/* ++** mad_s8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** mad z0\.b, p0/m, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mad_s8_z_tied1, svint8_t, ++ z0 = svmad_s8_z (p0, z0, z1, z2), ++ z0 = svmad_z (p0, z0, z1, z2)) ++ ++/* ++** mad_s8_z_tied2: ++** movprfx z0\.b, p0/z, z0\.b ++** mad z0\.b, p0/m, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mad_s8_z_tied2, svint8_t, ++ z0 = svmad_s8_z (p0, z1, z0, z2), ++ z0 = svmad_z (p0, z1, z0, z2)) ++ ++/* ++** mad_s8_z_tied3: ++** movprfx z0\.b, p0/z, z0\.b ++** mla z0\.b, p0/m, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mad_s8_z_tied3, svint8_t, ++ z0 = svmad_s8_z (p0, z1, z2, z0), ++ z0 = svmad_z (p0, z1, z2, z0)) ++ ++/* ++** mad_s8_z_untied: ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** mad z0\.b, p0/m, z2\.b, z3\.b ++** | ++** movprfx z0\.b, p0/z, z2\.b ++** mad z0\.b, p0/m, z1\.b, z3\.b ++** | ++** movprfx z0\.b, p0/z, z3\.b ++** mla z0\.b, p0/m, z1\.b, z2\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mad_s8_z_untied, svint8_t, ++ z0 = svmad_s8_z (p0, z1, z2, z3), ++ z0 = svmad_z (p0, z1, z2, z3)) ++ ++/* ++** mad_w0_s8_z_tied1: ++** mov (z[0-9]+\.b), w0 ++** movprfx z0\.b, p0/z, z0\.b ++** mad z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_w0_s8_z_tied1, svint8_t, int8_t, ++ z0 = svmad_n_s8_z (p0, z0, z1, x0), ++ z0 = svmad_z (p0, z0, z1, x0)) ++ ++/* ++** mad_w0_s8_z_tied2: ++** mov (z[0-9]+\.b), w0 ++** movprfx z0\.b, p0/z, z0\.b ++** mad z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_w0_s8_z_tied2, svint8_t, int8_t, ++ z0 = svmad_n_s8_z (p0, z1, z0, x0), ++ z0 = svmad_z (p0, z1, z0, x0)) ++ ++/* ++** mad_w0_s8_z_untied: ++** mov (z[0-9]+\.b), w0 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** mad z0\.b, p0/m, z2\.b, \1 ++** | ++** movprfx z0\.b, p0/z, z2\.b ++** mad z0\.b, p0/m, z1\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** mla z0\.b, p0/m, z1\.b, z2\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_w0_s8_z_untied, svint8_t, int8_t, ++ z0 = svmad_n_s8_z (p0, z1, z2, x0), ++ z0 = svmad_z (p0, z1, z2, x0)) ++ ++/* ++** mad_11_s8_z_tied1: ++** mov (z[0-9]+\.b), #11 ++** movprfx z0\.b, p0/z, z0\.b ++** mad z0\.b, p0/m, z1\.b, \1 
++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_s8_z_tied1, svint8_t, ++ z0 = svmad_n_s8_z (p0, z0, z1, 11), ++ z0 = svmad_z (p0, z0, z1, 11)) ++ ++/* ++** mad_11_s8_z_tied2: ++** mov (z[0-9]+\.b), #11 ++** movprfx z0\.b, p0/z, z0\.b ++** mad z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_s8_z_tied2, svint8_t, ++ z0 = svmad_n_s8_z (p0, z1, z0, 11), ++ z0 = svmad_z (p0, z1, z0, 11)) ++ ++/* ++** mad_11_s8_z_untied: ++** mov (z[0-9]+\.b), #11 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** mad z0\.b, p0/m, z2\.b, \1 ++** | ++** movprfx z0\.b, p0/z, z2\.b ++** mad z0\.b, p0/m, z1\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** mla z0\.b, p0/m, z1\.b, z2\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_s8_z_untied, svint8_t, ++ z0 = svmad_n_s8_z (p0, z1, z2, 11), ++ z0 = svmad_z (p0, z1, z2, 11)) ++ ++/* ++** mad_s8_x_tied1: ++** mad z0\.b, p0/m, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mad_s8_x_tied1, svint8_t, ++ z0 = svmad_s8_x (p0, z0, z1, z2), ++ z0 = svmad_x (p0, z0, z1, z2)) ++ ++/* ++** mad_s8_x_tied2: ++** mad z0\.b, p0/m, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mad_s8_x_tied2, svint8_t, ++ z0 = svmad_s8_x (p0, z1, z0, z2), ++ z0 = svmad_x (p0, z1, z0, z2)) ++ ++/* ++** mad_s8_x_tied3: ++** mla z0\.b, p0/m, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mad_s8_x_tied3, svint8_t, ++ z0 = svmad_s8_x (p0, z1, z2, z0), ++ z0 = svmad_x (p0, z1, z2, z0)) ++ ++/* ++** mad_s8_x_untied: ++** ( ++** movprfx z0, z1 ++** mad z0\.b, p0/m, z2\.b, z3\.b ++** | ++** movprfx z0, z2 ++** mad z0\.b, p0/m, z1\.b, z3\.b ++** | ++** movprfx z0, z3 ++** mla z0\.b, p0/m, z1\.b, z2\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mad_s8_x_untied, svint8_t, ++ z0 = svmad_s8_x (p0, z1, z2, z3), ++ z0 = svmad_x (p0, z1, z2, z3)) ++ ++/* ++** mad_w0_s8_x_tied1: ++** mov (z[0-9]+\.b), w0 ++** mad z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_w0_s8_x_tied1, svint8_t, int8_t, ++ z0 = svmad_n_s8_x (p0, z0, z1, x0), ++ z0 = svmad_x (p0, z0, z1, x0)) ++ ++/* ++** mad_w0_s8_x_tied2: ++** mov (z[0-9]+\.b), w0 ++** mad z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_w0_s8_x_tied2, svint8_t, int8_t, ++ z0 = svmad_n_s8_x (p0, z1, z0, x0), ++ z0 = svmad_x (p0, z1, z0, x0)) ++ ++/* ++** mad_w0_s8_x_untied: ++** mov z0\.b, w0 ++** mla z0\.b, p0/m, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_w0_s8_x_untied, svint8_t, int8_t, ++ z0 = svmad_n_s8_x (p0, z1, z2, x0), ++ z0 = svmad_x (p0, z1, z2, x0)) ++ ++/* ++** mad_11_s8_x_tied1: ++** mov (z[0-9]+\.b), #11 ++** mad z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_s8_x_tied1, svint8_t, ++ z0 = svmad_n_s8_x (p0, z0, z1, 11), ++ z0 = svmad_x (p0, z0, z1, 11)) ++ ++/* ++** mad_11_s8_x_tied2: ++** mov (z[0-9]+\.b), #11 ++** mad z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_s8_x_tied2, svint8_t, ++ z0 = svmad_n_s8_x (p0, z1, z0, 11), ++ z0 = svmad_x (p0, z1, z0, 11)) ++ ++/* ++** mad_11_s8_x_untied: ++** mov z0\.b, #11 ++** mla z0\.b, p0/m, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_s8_x_untied, svint8_t, ++ z0 = svmad_n_s8_x (p0, z1, z2, 11), ++ z0 = svmad_x (p0, z1, z2, 11)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_u16.c +new file mode 100644 +index 000000000..1d2ad9c5f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_u16.c +@@ -0,0 +1,321 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mad_u16_m_tied1: ++** mad z0\.h, p0/m, z1\.h, z2\.h ++** 
ret ++*/ ++TEST_UNIFORM_Z (mad_u16_m_tied1, svuint16_t, ++ z0 = svmad_u16_m (p0, z0, z1, z2), ++ z0 = svmad_m (p0, z0, z1, z2)) ++ ++/* ++** mad_u16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** mad z0\.h, p0/m, \1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mad_u16_m_tied2, svuint16_t, ++ z0 = svmad_u16_m (p0, z1, z0, z2), ++ z0 = svmad_m (p0, z1, z0, z2)) ++ ++/* ++** mad_u16_m_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** mad z0\.h, p0/m, z2\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mad_u16_m_tied3, svuint16_t, ++ z0 = svmad_u16_m (p0, z1, z2, z0), ++ z0 = svmad_m (p0, z1, z2, z0)) ++ ++/* ++** mad_u16_m_untied: ++** movprfx z0, z1 ++** mad z0\.h, p0/m, z2\.h, z3\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mad_u16_m_untied, svuint16_t, ++ z0 = svmad_u16_m (p0, z1, z2, z3), ++ z0 = svmad_m (p0, z1, z2, z3)) ++ ++/* ++** mad_w0_u16_m_tied1: ++** mov (z[0-9]+\.h), w0 ++** mad z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_w0_u16_m_tied1, svuint16_t, uint16_t, ++ z0 = svmad_n_u16_m (p0, z0, z1, x0), ++ z0 = svmad_m (p0, z0, z1, x0)) ++ ++/* ++** mad_w0_u16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), w0 ++** movprfx z0, z1 ++** mad z0\.h, p0/m, z2\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_w0_u16_m_untied, svuint16_t, uint16_t, ++ z0 = svmad_n_u16_m (p0, z1, z2, x0), ++ z0 = svmad_m (p0, z1, z2, x0)) ++ ++/* ++** mad_11_u16_m_tied1: ++** mov (z[0-9]+\.h), #11 ++** mad z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_u16_m_tied1, svuint16_t, ++ z0 = svmad_n_u16_m (p0, z0, z1, 11), ++ z0 = svmad_m (p0, z0, z1, 11)) ++ ++/* ++** mad_11_u16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), #11 ++** movprfx z0, z1 ++** mad z0\.h, p0/m, z2\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_u16_m_untied, svuint16_t, ++ z0 = svmad_n_u16_m (p0, z1, z2, 11), ++ z0 = svmad_m (p0, z1, z2, 11)) ++ ++/* ++** mad_u16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** mad z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mad_u16_z_tied1, svuint16_t, ++ z0 = svmad_u16_z (p0, z0, z1, z2), ++ z0 = svmad_z (p0, z0, z1, z2)) ++ ++/* ++** mad_u16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** mad z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mad_u16_z_tied2, svuint16_t, ++ z0 = svmad_u16_z (p0, z1, z0, z2), ++ z0 = svmad_z (p0, z1, z0, z2)) ++ ++/* ++** mad_u16_z_tied3: ++** movprfx z0\.h, p0/z, z0\.h ++** mla z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mad_u16_z_tied3, svuint16_t, ++ z0 = svmad_u16_z (p0, z1, z2, z0), ++ z0 = svmad_z (p0, z1, z2, z0)) ++ ++/* ++** mad_u16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** mad z0\.h, p0/m, z2\.h, z3\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** mad z0\.h, p0/m, z1\.h, z3\.h ++** | ++** movprfx z0\.h, p0/z, z3\.h ++** mla z0\.h, p0/m, z1\.h, z2\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mad_u16_z_untied, svuint16_t, ++ z0 = svmad_u16_z (p0, z1, z2, z3), ++ z0 = svmad_z (p0, z1, z2, z3)) ++ ++/* ++** mad_w0_u16_z_tied1: ++** mov (z[0-9]+\.h), w0 ++** movprfx z0\.h, p0/z, z0\.h ++** mad z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_w0_u16_z_tied1, svuint16_t, uint16_t, ++ z0 = svmad_n_u16_z (p0, z0, z1, x0), ++ z0 = svmad_z (p0, z0, z1, x0)) ++ ++/* ++** mad_w0_u16_z_tied2: ++** mov (z[0-9]+\.h), w0 ++** movprfx z0\.h, p0/z, z0\.h ++** mad z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_w0_u16_z_tied2, svuint16_t, uint16_t, ++ z0 = svmad_n_u16_z (p0, z1, z0, x0), ++ z0 = svmad_z (p0, z1, z0, x0)) ++ ++/* ++** mad_w0_u16_z_untied: ++** mov (z[0-9]+\.h), w0 ++** ( 
++** movprfx z0\.h, p0/z, z1\.h ++** mad z0\.h, p0/m, z2\.h, \1 ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** mad z0\.h, p0/m, z1\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** mla z0\.h, p0/m, z1\.h, z2\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_w0_u16_z_untied, svuint16_t, uint16_t, ++ z0 = svmad_n_u16_z (p0, z1, z2, x0), ++ z0 = svmad_z (p0, z1, z2, x0)) ++ ++/* ++** mad_11_u16_z_tied1: ++** mov (z[0-9]+\.h), #11 ++** movprfx z0\.h, p0/z, z0\.h ++** mad z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_u16_z_tied1, svuint16_t, ++ z0 = svmad_n_u16_z (p0, z0, z1, 11), ++ z0 = svmad_z (p0, z0, z1, 11)) ++ ++/* ++** mad_11_u16_z_tied2: ++** mov (z[0-9]+\.h), #11 ++** movprfx z0\.h, p0/z, z0\.h ++** mad z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_u16_z_tied2, svuint16_t, ++ z0 = svmad_n_u16_z (p0, z1, z0, 11), ++ z0 = svmad_z (p0, z1, z0, 11)) ++ ++/* ++** mad_11_u16_z_untied: ++** mov (z[0-9]+\.h), #11 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** mad z0\.h, p0/m, z2\.h, \1 ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** mad z0\.h, p0/m, z1\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** mla z0\.h, p0/m, z1\.h, z2\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_u16_z_untied, svuint16_t, ++ z0 = svmad_n_u16_z (p0, z1, z2, 11), ++ z0 = svmad_z (p0, z1, z2, 11)) ++ ++/* ++** mad_u16_x_tied1: ++** mad z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mad_u16_x_tied1, svuint16_t, ++ z0 = svmad_u16_x (p0, z0, z1, z2), ++ z0 = svmad_x (p0, z0, z1, z2)) ++ ++/* ++** mad_u16_x_tied2: ++** mad z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mad_u16_x_tied2, svuint16_t, ++ z0 = svmad_u16_x (p0, z1, z0, z2), ++ z0 = svmad_x (p0, z1, z0, z2)) ++ ++/* ++** mad_u16_x_tied3: ++** mla z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mad_u16_x_tied3, svuint16_t, ++ z0 = svmad_u16_x (p0, z1, z2, z0), ++ z0 = svmad_x (p0, z1, z2, z0)) ++ ++/* ++** mad_u16_x_untied: ++** ( ++** movprfx z0, z1 ++** mad z0\.h, p0/m, z2\.h, z3\.h ++** | ++** movprfx z0, z2 ++** mad z0\.h, p0/m, z1\.h, z3\.h ++** | ++** movprfx z0, z3 ++** mla z0\.h, p0/m, z1\.h, z2\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mad_u16_x_untied, svuint16_t, ++ z0 = svmad_u16_x (p0, z1, z2, z3), ++ z0 = svmad_x (p0, z1, z2, z3)) ++ ++/* ++** mad_w0_u16_x_tied1: ++** mov (z[0-9]+\.h), w0 ++** mad z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_w0_u16_x_tied1, svuint16_t, uint16_t, ++ z0 = svmad_n_u16_x (p0, z0, z1, x0), ++ z0 = svmad_x (p0, z0, z1, x0)) ++ ++/* ++** mad_w0_u16_x_tied2: ++** mov (z[0-9]+\.h), w0 ++** mad z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_w0_u16_x_tied2, svuint16_t, uint16_t, ++ z0 = svmad_n_u16_x (p0, z1, z0, x0), ++ z0 = svmad_x (p0, z1, z0, x0)) ++ ++/* ++** mad_w0_u16_x_untied: ++** mov z0\.h, w0 ++** mla z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_w0_u16_x_untied, svuint16_t, uint16_t, ++ z0 = svmad_n_u16_x (p0, z1, z2, x0), ++ z0 = svmad_x (p0, z1, z2, x0)) ++ ++/* ++** mad_11_u16_x_tied1: ++** mov (z[0-9]+\.h), #11 ++** mad z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_u16_x_tied1, svuint16_t, ++ z0 = svmad_n_u16_x (p0, z0, z1, 11), ++ z0 = svmad_x (p0, z0, z1, 11)) ++ ++/* ++** mad_11_u16_x_tied2: ++** mov (z[0-9]+\.h), #11 ++** mad z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_u16_x_tied2, svuint16_t, ++ z0 = svmad_n_u16_x (p0, z1, z0, 11), ++ z0 = svmad_x (p0, z1, z0, 11)) ++ ++/* ++** mad_11_u16_x_untied: ++** mov z0\.h, #11 ++** mla z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ 
++TEST_UNIFORM_Z (mad_11_u16_x_untied, svuint16_t, ++ z0 = svmad_n_u16_x (p0, z1, z2, 11), ++ z0 = svmad_x (p0, z1, z2, 11)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_u32.c +new file mode 100644 +index 000000000..4b51958b1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_u32.c +@@ -0,0 +1,321 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mad_u32_m_tied1: ++** mad z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mad_u32_m_tied1, svuint32_t, ++ z0 = svmad_u32_m (p0, z0, z1, z2), ++ z0 = svmad_m (p0, z0, z1, z2)) ++ ++/* ++** mad_u32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** mad z0\.s, p0/m, \1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mad_u32_m_tied2, svuint32_t, ++ z0 = svmad_u32_m (p0, z1, z0, z2), ++ z0 = svmad_m (p0, z1, z0, z2)) ++ ++/* ++** mad_u32_m_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** mad z0\.s, p0/m, z2\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mad_u32_m_tied3, svuint32_t, ++ z0 = svmad_u32_m (p0, z1, z2, z0), ++ z0 = svmad_m (p0, z1, z2, z0)) ++ ++/* ++** mad_u32_m_untied: ++** movprfx z0, z1 ++** mad z0\.s, p0/m, z2\.s, z3\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mad_u32_m_untied, svuint32_t, ++ z0 = svmad_u32_m (p0, z1, z2, z3), ++ z0 = svmad_m (p0, z1, z2, z3)) ++ ++/* ++** mad_w0_u32_m_tied1: ++** mov (z[0-9]+\.s), w0 ++** mad z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_w0_u32_m_tied1, svuint32_t, uint32_t, ++ z0 = svmad_n_u32_m (p0, z0, z1, x0), ++ z0 = svmad_m (p0, z0, z1, x0)) ++ ++/* ++** mad_w0_u32_m_untied: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0, z1 ++** mad z0\.s, p0/m, z2\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_w0_u32_m_untied, svuint32_t, uint32_t, ++ z0 = svmad_n_u32_m (p0, z1, z2, x0), ++ z0 = svmad_m (p0, z1, z2, x0)) ++ ++/* ++** mad_11_u32_m_tied1: ++** mov (z[0-9]+\.s), #11 ++** mad z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_u32_m_tied1, svuint32_t, ++ z0 = svmad_n_u32_m (p0, z0, z1, 11), ++ z0 = svmad_m (p0, z0, z1, 11)) ++ ++/* ++** mad_11_u32_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.s), #11 ++** movprfx z0, z1 ++** mad z0\.s, p0/m, z2\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_u32_m_untied, svuint32_t, ++ z0 = svmad_n_u32_m (p0, z1, z2, 11), ++ z0 = svmad_m (p0, z1, z2, 11)) ++ ++/* ++** mad_u32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** mad z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mad_u32_z_tied1, svuint32_t, ++ z0 = svmad_u32_z (p0, z0, z1, z2), ++ z0 = svmad_z (p0, z0, z1, z2)) ++ ++/* ++** mad_u32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** mad z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mad_u32_z_tied2, svuint32_t, ++ z0 = svmad_u32_z (p0, z1, z0, z2), ++ z0 = svmad_z (p0, z1, z0, z2)) ++ ++/* ++** mad_u32_z_tied3: ++** movprfx z0\.s, p0/z, z0\.s ++** mla z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mad_u32_z_tied3, svuint32_t, ++ z0 = svmad_u32_z (p0, z1, z2, z0), ++ z0 = svmad_z (p0, z1, z2, z0)) ++ ++/* ++** mad_u32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** mad z0\.s, p0/m, z2\.s, z3\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** mad z0\.s, p0/m, z1\.s, z3\.s ++** | ++** movprfx z0\.s, p0/z, z3\.s ++** mla z0\.s, p0/m, z1\.s, z2\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mad_u32_z_untied, svuint32_t, ++ z0 = svmad_u32_z (p0, z1, z2, z3), ++ z0 = svmad_z (p0, z1, z2, z3)) ++ ++/* ++** mad_w0_u32_z_tied1: ++** mov (z[0-9]+\.s), 
w0 ++** movprfx z0\.s, p0/z, z0\.s ++** mad z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_w0_u32_z_tied1, svuint32_t, uint32_t, ++ z0 = svmad_n_u32_z (p0, z0, z1, x0), ++ z0 = svmad_z (p0, z0, z1, x0)) ++ ++/* ++** mad_w0_u32_z_tied2: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0\.s, p0/z, z0\.s ++** mad z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_w0_u32_z_tied2, svuint32_t, uint32_t, ++ z0 = svmad_n_u32_z (p0, z1, z0, x0), ++ z0 = svmad_z (p0, z1, z0, x0)) ++ ++/* ++** mad_w0_u32_z_untied: ++** mov (z[0-9]+\.s), w0 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** mad z0\.s, p0/m, z2\.s, \1 ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** mad z0\.s, p0/m, z1\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** mla z0\.s, p0/m, z1\.s, z2\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_w0_u32_z_untied, svuint32_t, uint32_t, ++ z0 = svmad_n_u32_z (p0, z1, z2, x0), ++ z0 = svmad_z (p0, z1, z2, x0)) ++ ++/* ++** mad_11_u32_z_tied1: ++** mov (z[0-9]+\.s), #11 ++** movprfx z0\.s, p0/z, z0\.s ++** mad z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_u32_z_tied1, svuint32_t, ++ z0 = svmad_n_u32_z (p0, z0, z1, 11), ++ z0 = svmad_z (p0, z0, z1, 11)) ++ ++/* ++** mad_11_u32_z_tied2: ++** mov (z[0-9]+\.s), #11 ++** movprfx z0\.s, p0/z, z0\.s ++** mad z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_u32_z_tied2, svuint32_t, ++ z0 = svmad_n_u32_z (p0, z1, z0, 11), ++ z0 = svmad_z (p0, z1, z0, 11)) ++ ++/* ++** mad_11_u32_z_untied: ++** mov (z[0-9]+\.s), #11 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** mad z0\.s, p0/m, z2\.s, \1 ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** mad z0\.s, p0/m, z1\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** mla z0\.s, p0/m, z1\.s, z2\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_u32_z_untied, svuint32_t, ++ z0 = svmad_n_u32_z (p0, z1, z2, 11), ++ z0 = svmad_z (p0, z1, z2, 11)) ++ ++/* ++** mad_u32_x_tied1: ++** mad z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mad_u32_x_tied1, svuint32_t, ++ z0 = svmad_u32_x (p0, z0, z1, z2), ++ z0 = svmad_x (p0, z0, z1, z2)) ++ ++/* ++** mad_u32_x_tied2: ++** mad z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mad_u32_x_tied2, svuint32_t, ++ z0 = svmad_u32_x (p0, z1, z0, z2), ++ z0 = svmad_x (p0, z1, z0, z2)) ++ ++/* ++** mad_u32_x_tied3: ++** mla z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mad_u32_x_tied3, svuint32_t, ++ z0 = svmad_u32_x (p0, z1, z2, z0), ++ z0 = svmad_x (p0, z1, z2, z0)) ++ ++/* ++** mad_u32_x_untied: ++** ( ++** movprfx z0, z1 ++** mad z0\.s, p0/m, z2\.s, z3\.s ++** | ++** movprfx z0, z2 ++** mad z0\.s, p0/m, z1\.s, z3\.s ++** | ++** movprfx z0, z3 ++** mla z0\.s, p0/m, z1\.s, z2\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mad_u32_x_untied, svuint32_t, ++ z0 = svmad_u32_x (p0, z1, z2, z3), ++ z0 = svmad_x (p0, z1, z2, z3)) ++ ++/* ++** mad_w0_u32_x_tied1: ++** mov (z[0-9]+\.s), w0 ++** mad z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_w0_u32_x_tied1, svuint32_t, uint32_t, ++ z0 = svmad_n_u32_x (p0, z0, z1, x0), ++ z0 = svmad_x (p0, z0, z1, x0)) ++ ++/* ++** mad_w0_u32_x_tied2: ++** mov (z[0-9]+\.s), w0 ++** mad z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_w0_u32_x_tied2, svuint32_t, uint32_t, ++ z0 = svmad_n_u32_x (p0, z1, z0, x0), ++ z0 = svmad_x (p0, z1, z0, x0)) ++ ++/* ++** mad_w0_u32_x_untied: ++** mov z0\.s, w0 ++** mla z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_w0_u32_x_untied, svuint32_t, uint32_t, ++ z0 = svmad_n_u32_x (p0, z1, z2, x0), ++ z0 = svmad_x (p0, z1, z2, x0)) ++ ++/* ++** 
mad_11_u32_x_tied1: ++** mov (z[0-9]+\.s), #11 ++** mad z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_u32_x_tied1, svuint32_t, ++ z0 = svmad_n_u32_x (p0, z0, z1, 11), ++ z0 = svmad_x (p0, z0, z1, 11)) ++ ++/* ++** mad_11_u32_x_tied2: ++** mov (z[0-9]+\.s), #11 ++** mad z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_u32_x_tied2, svuint32_t, ++ z0 = svmad_n_u32_x (p0, z1, z0, 11), ++ z0 = svmad_x (p0, z1, z0, 11)) ++ ++/* ++** mad_11_u32_x_untied: ++** mov z0\.s, #11 ++** mla z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_u32_x_untied, svuint32_t, ++ z0 = svmad_n_u32_x (p0, z1, z2, 11), ++ z0 = svmad_x (p0, z1, z2, 11)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_u64.c +new file mode 100644 +index 000000000..c4939093e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_u64.c +@@ -0,0 +1,321 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mad_u64_m_tied1: ++** mad z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mad_u64_m_tied1, svuint64_t, ++ z0 = svmad_u64_m (p0, z0, z1, z2), ++ z0 = svmad_m (p0, z0, z1, z2)) ++ ++/* ++** mad_u64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** mad z0\.d, p0/m, \1, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mad_u64_m_tied2, svuint64_t, ++ z0 = svmad_u64_m (p0, z1, z0, z2), ++ z0 = svmad_m (p0, z1, z0, z2)) ++ ++/* ++** mad_u64_m_tied3: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** mad z0\.d, p0/m, z2\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_u64_m_tied3, svuint64_t, ++ z0 = svmad_u64_m (p0, z1, z2, z0), ++ z0 = svmad_m (p0, z1, z2, z0)) ++ ++/* ++** mad_u64_m_untied: ++** movprfx z0, z1 ++** mad z0\.d, p0/m, z2\.d, z3\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mad_u64_m_untied, svuint64_t, ++ z0 = svmad_u64_m (p0, z1, z2, z3), ++ z0 = svmad_m (p0, z1, z2, z3)) ++ ++/* ++** mad_x0_u64_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** mad z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_x0_u64_m_tied1, svuint64_t, uint64_t, ++ z0 = svmad_n_u64_m (p0, z0, z1, x0), ++ z0 = svmad_m (p0, z0, z1, x0)) ++ ++/* ++** mad_x0_u64_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** mad z0\.d, p0/m, z2\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_x0_u64_m_untied, svuint64_t, uint64_t, ++ z0 = svmad_n_u64_m (p0, z1, z2, x0), ++ z0 = svmad_m (p0, z1, z2, x0)) ++ ++/* ++** mad_11_u64_m_tied1: ++** mov (z[0-9]+\.d), #11 ++** mad z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_u64_m_tied1, svuint64_t, ++ z0 = svmad_n_u64_m (p0, z0, z1, 11), ++ z0 = svmad_m (p0, z0, z1, 11)) ++ ++/* ++** mad_11_u64_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #11 ++** movprfx z0, z1 ++** mad z0\.d, p0/m, z2\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_u64_m_untied, svuint64_t, ++ z0 = svmad_n_u64_m (p0, z1, z2, 11), ++ z0 = svmad_m (p0, z1, z2, 11)) ++ ++/* ++** mad_u64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** mad z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mad_u64_z_tied1, svuint64_t, ++ z0 = svmad_u64_z (p0, z0, z1, z2), ++ z0 = svmad_z (p0, z0, z1, z2)) ++ ++/* ++** mad_u64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** mad z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mad_u64_z_tied2, svuint64_t, ++ z0 = svmad_u64_z (p0, z1, z0, z2), ++ z0 = svmad_z (p0, z1, z0, z2)) ++ ++/* ++** mad_u64_z_tied3: ++** movprfx z0\.d, p0/z, z0\.d ++** mla z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z 
(mad_u64_z_tied3, svuint64_t, ++ z0 = svmad_u64_z (p0, z1, z2, z0), ++ z0 = svmad_z (p0, z1, z2, z0)) ++ ++/* ++** mad_u64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** mad z0\.d, p0/m, z2\.d, z3\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** mad z0\.d, p0/m, z1\.d, z3\.d ++** | ++** movprfx z0\.d, p0/z, z3\.d ++** mla z0\.d, p0/m, z1\.d, z2\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mad_u64_z_untied, svuint64_t, ++ z0 = svmad_u64_z (p0, z1, z2, z3), ++ z0 = svmad_z (p0, z1, z2, z3)) ++ ++/* ++** mad_x0_u64_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z0\.d ++** mad z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_x0_u64_z_tied1, svuint64_t, uint64_t, ++ z0 = svmad_n_u64_z (p0, z0, z1, x0), ++ z0 = svmad_z (p0, z0, z1, x0)) ++ ++/* ++** mad_x0_u64_z_tied2: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z0\.d ++** mad z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_x0_u64_z_tied2, svuint64_t, uint64_t, ++ z0 = svmad_n_u64_z (p0, z1, z0, x0), ++ z0 = svmad_z (p0, z1, z0, x0)) ++ ++/* ++** mad_x0_u64_z_untied: ++** mov (z[0-9]+\.d), x0 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** mad z0\.d, p0/m, z2\.d, \1 ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** mad z0\.d, p0/m, z1\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** mla z0\.d, p0/m, z1\.d, z2\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_x0_u64_z_untied, svuint64_t, uint64_t, ++ z0 = svmad_n_u64_z (p0, z1, z2, x0), ++ z0 = svmad_z (p0, z1, z2, x0)) ++ ++/* ++** mad_11_u64_z_tied1: ++** mov (z[0-9]+\.d), #11 ++** movprfx z0\.d, p0/z, z0\.d ++** mad z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_u64_z_tied1, svuint64_t, ++ z0 = svmad_n_u64_z (p0, z0, z1, 11), ++ z0 = svmad_z (p0, z0, z1, 11)) ++ ++/* ++** mad_11_u64_z_tied2: ++** mov (z[0-9]+\.d), #11 ++** movprfx z0\.d, p0/z, z0\.d ++** mad z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_u64_z_tied2, svuint64_t, ++ z0 = svmad_n_u64_z (p0, z1, z0, 11), ++ z0 = svmad_z (p0, z1, z0, 11)) ++ ++/* ++** mad_11_u64_z_untied: ++** mov (z[0-9]+\.d), #11 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** mad z0\.d, p0/m, z2\.d, \1 ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** mad z0\.d, p0/m, z1\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** mla z0\.d, p0/m, z1\.d, z2\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_u64_z_untied, svuint64_t, ++ z0 = svmad_n_u64_z (p0, z1, z2, 11), ++ z0 = svmad_z (p0, z1, z2, 11)) ++ ++/* ++** mad_u64_x_tied1: ++** mad z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mad_u64_x_tied1, svuint64_t, ++ z0 = svmad_u64_x (p0, z0, z1, z2), ++ z0 = svmad_x (p0, z0, z1, z2)) ++ ++/* ++** mad_u64_x_tied2: ++** mad z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mad_u64_x_tied2, svuint64_t, ++ z0 = svmad_u64_x (p0, z1, z0, z2), ++ z0 = svmad_x (p0, z1, z0, z2)) ++ ++/* ++** mad_u64_x_tied3: ++** mla z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mad_u64_x_tied3, svuint64_t, ++ z0 = svmad_u64_x (p0, z1, z2, z0), ++ z0 = svmad_x (p0, z1, z2, z0)) ++ ++/* ++** mad_u64_x_untied: ++** ( ++** movprfx z0, z1 ++** mad z0\.d, p0/m, z2\.d, z3\.d ++** | ++** movprfx z0, z2 ++** mad z0\.d, p0/m, z1\.d, z3\.d ++** | ++** movprfx z0, z3 ++** mla z0\.d, p0/m, z1\.d, z2\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mad_u64_x_untied, svuint64_t, ++ z0 = svmad_u64_x (p0, z1, z2, z3), ++ z0 = svmad_x (p0, z1, z2, z3)) ++ ++/* ++** mad_x0_u64_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** mad z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_x0_u64_x_tied1, svuint64_t, uint64_t, ++ z0 = svmad_n_u64_x 
(p0, z0, z1, x0), ++ z0 = svmad_x (p0, z0, z1, x0)) ++ ++/* ++** mad_x0_u64_x_tied2: ++** mov (z[0-9]+\.d), x0 ++** mad z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_x0_u64_x_tied2, svuint64_t, uint64_t, ++ z0 = svmad_n_u64_x (p0, z1, z0, x0), ++ z0 = svmad_x (p0, z1, z0, x0)) ++ ++/* ++** mad_x0_u64_x_untied: ++** mov z0\.d, x0 ++** mla z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_x0_u64_x_untied, svuint64_t, uint64_t, ++ z0 = svmad_n_u64_x (p0, z1, z2, x0), ++ z0 = svmad_x (p0, z1, z2, x0)) ++ ++/* ++** mad_11_u64_x_tied1: ++** mov (z[0-9]+\.d), #11 ++** mad z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_u64_x_tied1, svuint64_t, ++ z0 = svmad_n_u64_x (p0, z0, z1, 11), ++ z0 = svmad_x (p0, z0, z1, 11)) ++ ++/* ++** mad_11_u64_x_tied2: ++** mov (z[0-9]+\.d), #11 ++** mad z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_u64_x_tied2, svuint64_t, ++ z0 = svmad_n_u64_x (p0, z1, z0, 11), ++ z0 = svmad_x (p0, z1, z0, 11)) ++ ++/* ++** mad_11_u64_x_untied: ++** mov z0\.d, #11 ++** mla z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_u64_x_untied, svuint64_t, ++ z0 = svmad_n_u64_x (p0, z1, z2, 11), ++ z0 = svmad_x (p0, z1, z2, 11)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_u8.c +new file mode 100644 +index 000000000..0b4b1b8cf +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_u8.c +@@ -0,0 +1,321 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mad_u8_m_tied1: ++** mad z0\.b, p0/m, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mad_u8_m_tied1, svuint8_t, ++ z0 = svmad_u8_m (p0, z0, z1, z2), ++ z0 = svmad_m (p0, z0, z1, z2)) ++ ++/* ++** mad_u8_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** mad z0\.b, p0/m, \1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mad_u8_m_tied2, svuint8_t, ++ z0 = svmad_u8_m (p0, z1, z0, z2), ++ z0 = svmad_m (p0, z1, z0, z2)) ++ ++/* ++** mad_u8_m_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** mad z0\.b, p0/m, z2\.b, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mad_u8_m_tied3, svuint8_t, ++ z0 = svmad_u8_m (p0, z1, z2, z0), ++ z0 = svmad_m (p0, z1, z2, z0)) ++ ++/* ++** mad_u8_m_untied: ++** movprfx z0, z1 ++** mad z0\.b, p0/m, z2\.b, z3\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mad_u8_m_untied, svuint8_t, ++ z0 = svmad_u8_m (p0, z1, z2, z3), ++ z0 = svmad_m (p0, z1, z2, z3)) ++ ++/* ++** mad_w0_u8_m_tied1: ++** mov (z[0-9]+\.b), w0 ++** mad z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_w0_u8_m_tied1, svuint8_t, uint8_t, ++ z0 = svmad_n_u8_m (p0, z0, z1, x0), ++ z0 = svmad_m (p0, z0, z1, x0)) ++ ++/* ++** mad_w0_u8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), w0 ++** movprfx z0, z1 ++** mad z0\.b, p0/m, z2\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_w0_u8_m_untied, svuint8_t, uint8_t, ++ z0 = svmad_n_u8_m (p0, z1, z2, x0), ++ z0 = svmad_m (p0, z1, z2, x0)) ++ ++/* ++** mad_11_u8_m_tied1: ++** mov (z[0-9]+\.b), #11 ++** mad z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_u8_m_tied1, svuint8_t, ++ z0 = svmad_n_u8_m (p0, z0, z1, 11), ++ z0 = svmad_m (p0, z0, z1, 11)) ++ ++/* ++** mad_11_u8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), #11 ++** movprfx z0, z1 ++** mad z0\.b, p0/m, z2\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_u8_m_untied, svuint8_t, ++ z0 = svmad_n_u8_m (p0, z1, z2, 11), ++ z0 = svmad_m (p0, z1, z2, 11)) ++ ++/* ++** mad_u8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** mad 
z0\.b, p0/m, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mad_u8_z_tied1, svuint8_t, ++ z0 = svmad_u8_z (p0, z0, z1, z2), ++ z0 = svmad_z (p0, z0, z1, z2)) ++ ++/* ++** mad_u8_z_tied2: ++** movprfx z0\.b, p0/z, z0\.b ++** mad z0\.b, p0/m, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mad_u8_z_tied2, svuint8_t, ++ z0 = svmad_u8_z (p0, z1, z0, z2), ++ z0 = svmad_z (p0, z1, z0, z2)) ++ ++/* ++** mad_u8_z_tied3: ++** movprfx z0\.b, p0/z, z0\.b ++** mla z0\.b, p0/m, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mad_u8_z_tied3, svuint8_t, ++ z0 = svmad_u8_z (p0, z1, z2, z0), ++ z0 = svmad_z (p0, z1, z2, z0)) ++ ++/* ++** mad_u8_z_untied: ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** mad z0\.b, p0/m, z2\.b, z3\.b ++** | ++** movprfx z0\.b, p0/z, z2\.b ++** mad z0\.b, p0/m, z1\.b, z3\.b ++** | ++** movprfx z0\.b, p0/z, z3\.b ++** mla z0\.b, p0/m, z1\.b, z2\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mad_u8_z_untied, svuint8_t, ++ z0 = svmad_u8_z (p0, z1, z2, z3), ++ z0 = svmad_z (p0, z1, z2, z3)) ++ ++/* ++** mad_w0_u8_z_tied1: ++** mov (z[0-9]+\.b), w0 ++** movprfx z0\.b, p0/z, z0\.b ++** mad z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_w0_u8_z_tied1, svuint8_t, uint8_t, ++ z0 = svmad_n_u8_z (p0, z0, z1, x0), ++ z0 = svmad_z (p0, z0, z1, x0)) ++ ++/* ++** mad_w0_u8_z_tied2: ++** mov (z[0-9]+\.b), w0 ++** movprfx z0\.b, p0/z, z0\.b ++** mad z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_w0_u8_z_tied2, svuint8_t, uint8_t, ++ z0 = svmad_n_u8_z (p0, z1, z0, x0), ++ z0 = svmad_z (p0, z1, z0, x0)) ++ ++/* ++** mad_w0_u8_z_untied: ++** mov (z[0-9]+\.b), w0 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** mad z0\.b, p0/m, z2\.b, \1 ++** | ++** movprfx z0\.b, p0/z, z2\.b ++** mad z0\.b, p0/m, z1\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** mla z0\.b, p0/m, z1\.b, z2\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_w0_u8_z_untied, svuint8_t, uint8_t, ++ z0 = svmad_n_u8_z (p0, z1, z2, x0), ++ z0 = svmad_z (p0, z1, z2, x0)) ++ ++/* ++** mad_11_u8_z_tied1: ++** mov (z[0-9]+\.b), #11 ++** movprfx z0\.b, p0/z, z0\.b ++** mad z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_u8_z_tied1, svuint8_t, ++ z0 = svmad_n_u8_z (p0, z0, z1, 11), ++ z0 = svmad_z (p0, z0, z1, 11)) ++ ++/* ++** mad_11_u8_z_tied2: ++** mov (z[0-9]+\.b), #11 ++** movprfx z0\.b, p0/z, z0\.b ++** mad z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_u8_z_tied2, svuint8_t, ++ z0 = svmad_n_u8_z (p0, z1, z0, 11), ++ z0 = svmad_z (p0, z1, z0, 11)) ++ ++/* ++** mad_11_u8_z_untied: ++** mov (z[0-9]+\.b), #11 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** mad z0\.b, p0/m, z2\.b, \1 ++** | ++** movprfx z0\.b, p0/z, z2\.b ++** mad z0\.b, p0/m, z1\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** mla z0\.b, p0/m, z1\.b, z2\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_u8_z_untied, svuint8_t, ++ z0 = svmad_n_u8_z (p0, z1, z2, 11), ++ z0 = svmad_z (p0, z1, z2, 11)) ++ ++/* ++** mad_u8_x_tied1: ++** mad z0\.b, p0/m, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mad_u8_x_tied1, svuint8_t, ++ z0 = svmad_u8_x (p0, z0, z1, z2), ++ z0 = svmad_x (p0, z0, z1, z2)) ++ ++/* ++** mad_u8_x_tied2: ++** mad z0\.b, p0/m, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mad_u8_x_tied2, svuint8_t, ++ z0 = svmad_u8_x (p0, z1, z0, z2), ++ z0 = svmad_x (p0, z1, z0, z2)) ++ ++/* ++** mad_u8_x_tied3: ++** mla z0\.b, p0/m, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mad_u8_x_tied3, svuint8_t, ++ z0 = svmad_u8_x (p0, z1, z2, z0), ++ z0 = svmad_x (p0, z1, z2, z0)) ++ ++/* ++** mad_u8_x_untied: ++** ( ++** movprfx z0, z1 ++** mad z0\.b, p0/m, 
z2\.b, z3\.b ++** | ++** movprfx z0, z2 ++** mad z0\.b, p0/m, z1\.b, z3\.b ++** | ++** movprfx z0, z3 ++** mla z0\.b, p0/m, z1\.b, z2\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mad_u8_x_untied, svuint8_t, ++ z0 = svmad_u8_x (p0, z1, z2, z3), ++ z0 = svmad_x (p0, z1, z2, z3)) ++ ++/* ++** mad_w0_u8_x_tied1: ++** mov (z[0-9]+\.b), w0 ++** mad z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_w0_u8_x_tied1, svuint8_t, uint8_t, ++ z0 = svmad_n_u8_x (p0, z0, z1, x0), ++ z0 = svmad_x (p0, z0, z1, x0)) ++ ++/* ++** mad_w0_u8_x_tied2: ++** mov (z[0-9]+\.b), w0 ++** mad z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_w0_u8_x_tied2, svuint8_t, uint8_t, ++ z0 = svmad_n_u8_x (p0, z1, z0, x0), ++ z0 = svmad_x (p0, z1, z0, x0)) ++ ++/* ++** mad_w0_u8_x_untied: ++** mov z0\.b, w0 ++** mla z0\.b, p0/m, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_ZX (mad_w0_u8_x_untied, svuint8_t, uint8_t, ++ z0 = svmad_n_u8_x (p0, z1, z2, x0), ++ z0 = svmad_x (p0, z1, z2, x0)) ++ ++/* ++** mad_11_u8_x_tied1: ++** mov (z[0-9]+\.b), #11 ++** mad z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_u8_x_tied1, svuint8_t, ++ z0 = svmad_n_u8_x (p0, z0, z1, 11), ++ z0 = svmad_x (p0, z0, z1, 11)) ++ ++/* ++** mad_11_u8_x_tied2: ++** mov (z[0-9]+\.b), #11 ++** mad z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_u8_x_tied2, svuint8_t, ++ z0 = svmad_n_u8_x (p0, z1, z0, 11), ++ z0 = svmad_x (p0, z1, z0, 11)) ++ ++/* ++** mad_11_u8_x_untied: ++** mov z0\.b, #11 ++** mla z0\.b, p0/m, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mad_11_u8_x_untied, svuint8_t, ++ z0 = svmad_n_u8_x (p0, z1, z2, 11), ++ z0 = svmad_x (p0, z1, z2, 11)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_f16.c +new file mode 100644 +index 000000000..f21099a24 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_f16.c +@@ -0,0 +1,425 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** max_f16_m_tied1: ++** fmax z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (max_f16_m_tied1, svfloat16_t, ++ z0 = svmax_f16_m (p0, z0, z1), ++ z0 = svmax_m (p0, z0, z1)) ++ ++/* ++** max_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fmax z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (max_f16_m_tied2, svfloat16_t, ++ z0 = svmax_f16_m (p0, z1, z0), ++ z0 = svmax_m (p0, z1, z0)) ++ ++/* ++** max_f16_m_untied: ++** movprfx z0, z1 ++** fmax z0\.h, p0/m, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (max_f16_m_untied, svfloat16_t, ++ z0 = svmax_f16_m (p0, z1, z2), ++ z0 = svmax_m (p0, z1, z2)) ++ ++/* ++** max_h4_f16_m_tied1: ++** mov (z[0-9]+\.h), h4 ++** fmax z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (max_h4_f16_m_tied1, svfloat16_t, __fp16, ++ z0 = svmax_n_f16_m (p0, z0, d4), ++ z0 = svmax_m (p0, z0, d4)) ++ ++/* ++** max_h4_f16_m_untied: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0, z1 ++** fmax z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (max_h4_f16_m_untied, svfloat16_t, __fp16, ++ z0 = svmax_n_f16_m (p0, z1, d4), ++ z0 = svmax_m (p0, z1, d4)) ++ ++/* ++** max_0_f16_m_tied1: ++** fmax z0\.h, p0/m, z0\.h, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (max_0_f16_m_tied1, svfloat16_t, ++ z0 = svmax_n_f16_m (p0, z0, 0), ++ z0 = svmax_m (p0, z0, 0)) ++ ++/* ++** max_0_f16_m_untied: ++** movprfx z0, z1 ++** fmax z0\.h, p0/m, z0\.h, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (max_0_f16_m_untied, svfloat16_t, ++ z0 = svmax_n_f16_m (p0, z1, 0), ++ z0 
= svmax_m (p0, z1, 0)) ++ ++/* ++** max_1_f16_m_tied1: ++** fmax z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_f16_m_tied1, svfloat16_t, ++ z0 = svmax_n_f16_m (p0, z0, 1), ++ z0 = svmax_m (p0, z0, 1)) ++ ++/* ++** max_1_f16_m_untied: ++** movprfx z0, z1 ++** fmax z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_f16_m_untied, svfloat16_t, ++ z0 = svmax_n_f16_m (p0, z1, 1), ++ z0 = svmax_m (p0, z1, 1)) ++ ++/* ++** max_2_f16_m: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** fmax z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_2_f16_m, svfloat16_t, ++ z0 = svmax_n_f16_m (p0, z0, 2), ++ z0 = svmax_m (p0, z0, 2)) ++ ++/* ++** max_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fmax z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (max_f16_z_tied1, svfloat16_t, ++ z0 = svmax_f16_z (p0, z0, z1), ++ z0 = svmax_z (p0, z0, z1)) ++ ++/* ++** max_f16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** fmax z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (max_f16_z_tied2, svfloat16_t, ++ z0 = svmax_f16_z (p0, z1, z0), ++ z0 = svmax_z (p0, z1, z0)) ++ ++/* ++** max_f16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fmax z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** fmax z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (max_f16_z_untied, svfloat16_t, ++ z0 = svmax_f16_z (p0, z1, z2), ++ z0 = svmax_z (p0, z1, z2)) ++ ++/* ++** max_h4_f16_z_tied1: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0\.h, p0/z, z0\.h ++** fmax z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (max_h4_f16_z_tied1, svfloat16_t, __fp16, ++ z0 = svmax_n_f16_z (p0, z0, d4), ++ z0 = svmax_z (p0, z0, d4)) ++ ++/* ++** max_h4_f16_z_untied: ++** mov (z[0-9]+\.h), h4 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fmax z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** fmax z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (max_h4_f16_z_untied, svfloat16_t, __fp16, ++ z0 = svmax_n_f16_z (p0, z1, d4), ++ z0 = svmax_z (p0, z1, d4)) ++ ++/* ++** max_0_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fmax z0\.h, p0/m, z0\.h, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (max_0_f16_z_tied1, svfloat16_t, ++ z0 = svmax_n_f16_z (p0, z0, 0), ++ z0 = svmax_z (p0, z0, 0)) ++ ++/* ++** max_0_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** fmax z0\.h, p0/m, z0\.h, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (max_0_f16_z_untied, svfloat16_t, ++ z0 = svmax_n_f16_z (p0, z1, 0), ++ z0 = svmax_z (p0, z1, 0)) ++ ++/* ++** max_1_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fmax z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_f16_z_tied1, svfloat16_t, ++ z0 = svmax_n_f16_z (p0, z0, 1), ++ z0 = svmax_z (p0, z0, 1)) ++ ++/* ++** max_1_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** fmax z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_f16_z_untied, svfloat16_t, ++ z0 = svmax_n_f16_z (p0, z1, 1), ++ z0 = svmax_z (p0, z1, 1)) ++ ++/* ++** max_2_f16_z: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? 
++** movprfx z0\.h, p0/z, z0\.h ++** fmax z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_2_f16_z, svfloat16_t, ++ z0 = svmax_n_f16_z (p0, z0, 2), ++ z0 = svmax_z (p0, z0, 2)) ++ ++/* ++** max_f16_x_tied1: ++** fmax z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (max_f16_x_tied1, svfloat16_t, ++ z0 = svmax_f16_x (p0, z0, z1), ++ z0 = svmax_x (p0, z0, z1)) ++ ++/* ++** max_f16_x_tied2: ++** fmax z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (max_f16_x_tied2, svfloat16_t, ++ z0 = svmax_f16_x (p0, z1, z0), ++ z0 = svmax_x (p0, z1, z0)) ++ ++/* ++** max_f16_x_untied: ++** ( ++** movprfx z0, z1 ++** fmax z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0, z2 ++** fmax z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (max_f16_x_untied, svfloat16_t, ++ z0 = svmax_f16_x (p0, z1, z2), ++ z0 = svmax_x (p0, z1, z2)) ++ ++/* ++** max_h4_f16_x_tied1: ++** mov (z[0-9]+\.h), h4 ++** fmax z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (max_h4_f16_x_tied1, svfloat16_t, __fp16, ++ z0 = svmax_n_f16_x (p0, z0, d4), ++ z0 = svmax_x (p0, z0, d4)) ++ ++/* ++** max_h4_f16_x_untied: ++** mov z0\.h, h4 ++** fmax z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZD (max_h4_f16_x_untied, svfloat16_t, __fp16, ++ z0 = svmax_n_f16_x (p0, z1, d4), ++ z0 = svmax_x (p0, z1, d4)) ++ ++/* ++** max_0_f16_x_tied1: ++** fmax z0\.h, p0/m, z0\.h, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (max_0_f16_x_tied1, svfloat16_t, ++ z0 = svmax_n_f16_x (p0, z0, 0), ++ z0 = svmax_x (p0, z0, 0)) ++ ++/* ++** max_0_f16_x_untied: ++** movprfx z0, z1 ++** fmax z0\.h, p0/m, z0\.h, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (max_0_f16_x_untied, svfloat16_t, ++ z0 = svmax_n_f16_x (p0, z1, 0), ++ z0 = svmax_x (p0, z1, 0)) ++ ++/* ++** max_1_f16_x_tied1: ++** fmax z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_f16_x_tied1, svfloat16_t, ++ z0 = svmax_n_f16_x (p0, z0, 1), ++ z0 = svmax_x (p0, z0, 1)) ++ ++/* ++** max_1_f16_x_untied: ++** movprfx z0, z1 ++** fmax z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_f16_x_untied, svfloat16_t, ++ z0 = svmax_n_f16_x (p0, z1, 1), ++ z0 = svmax_x (p0, z1, 1)) ++ ++/* ++** max_2_f16_x_tied1: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** fmax z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_2_f16_x_tied1, svfloat16_t, ++ z0 = svmax_n_f16_x (p0, z0, 2), ++ z0 = svmax_x (p0, z0, 2)) ++ ++/* ++** max_2_f16_x_untied: ++** fmov z0\.h, #2\.0(?:e\+0)? ++** fmax z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (max_2_f16_x_untied, svfloat16_t, ++ z0 = svmax_n_f16_x (p0, z1, 2), ++ z0 = svmax_x (p0, z1, 2)) ++ ++/* ++** ptrue_max_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_max_f16_x_tied1, svfloat16_t, ++ z0 = svmax_f16_x (svptrue_b16 (), z0, z1), ++ z0 = svmax_x (svptrue_b16 (), z0, z1)) ++ ++/* ++** ptrue_max_f16_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_max_f16_x_tied2, svfloat16_t, ++ z0 = svmax_f16_x (svptrue_b16 (), z1, z0), ++ z0 = svmax_x (svptrue_b16 (), z1, z0)) ++ ++/* ++** ptrue_max_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_max_f16_x_untied, svfloat16_t, ++ z0 = svmax_f16_x (svptrue_b16 (), z1, z2), ++ z0 = svmax_x (svptrue_b16 (), z1, z2)) ++ ++/* ++** ptrue_max_0_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_max_0_f16_x_tied1, svfloat16_t, ++ z0 = svmax_n_f16_x (svptrue_b16 (), z0, 0), ++ z0 = svmax_x (svptrue_b16 (), z0, 0)) ++ ++/* ++** ptrue_max_0_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_max_0_f16_x_untied, svfloat16_t, ++ z0 = svmax_n_f16_x (svptrue_b16 (), z1, 0), ++ z0 = svmax_x (svptrue_b16 (), z1, 0)) ++ ++/* ++** ptrue_max_1_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_max_1_f16_x_tied1, svfloat16_t, ++ z0 = svmax_n_f16_x (svptrue_b16 (), z0, 1), ++ z0 = svmax_x (svptrue_b16 (), z0, 1)) ++ ++/* ++** ptrue_max_1_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_max_1_f16_x_untied, svfloat16_t, ++ z0 = svmax_n_f16_x (svptrue_b16 (), z1, 1), ++ z0 = svmax_x (svptrue_b16 (), z1, 1)) ++ ++/* ++** ptrue_max_2_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_max_2_f16_x_tied1, svfloat16_t, ++ z0 = svmax_n_f16_x (svptrue_b16 (), z0, 2), ++ z0 = svmax_x (svptrue_b16 (), z0, 2)) ++ ++/* ++** ptrue_max_2_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_max_2_f16_x_untied, svfloat16_t, ++ z0 = svmax_n_f16_x (svptrue_b16 (), z1, 2), ++ z0 = svmax_x (svptrue_b16 (), z1, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_f32.c +new file mode 100644 +index 000000000..6f5c92c9f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_f32.c +@@ -0,0 +1,425 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** max_f32_m_tied1: ++** fmax z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (max_f32_m_tied1, svfloat32_t, ++ z0 = svmax_f32_m (p0, z0, z1), ++ z0 = svmax_m (p0, z0, z1)) ++ ++/* ++** max_f32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fmax z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (max_f32_m_tied2, svfloat32_t, ++ z0 = svmax_f32_m (p0, z1, z0), ++ z0 = svmax_m (p0, z1, z0)) ++ ++/* ++** max_f32_m_untied: ++** movprfx z0, z1 ++** fmax z0\.s, p0/m, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (max_f32_m_untied, svfloat32_t, ++ z0 = svmax_f32_m (p0, z1, z2), ++ z0 = svmax_m (p0, z1, z2)) ++ ++/* ++** max_s4_f32_m_tied1: ++** mov (z[0-9]+\.s), s4 ++** fmax z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (max_s4_f32_m_tied1, svfloat32_t, float, ++ z0 = svmax_n_f32_m (p0, z0, d4), ++ z0 = svmax_m (p0, z0, d4)) ++ ++/* ++** max_s4_f32_m_untied: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0, z1 ++** fmax z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (max_s4_f32_m_untied, svfloat32_t, float, ++ z0 = svmax_n_f32_m (p0, z1, d4), ++ z0 = svmax_m (p0, z1, d4)) ++ ++/* ++** max_0_f32_m_tied1: ++** fmax z0\.s, p0/m, z0\.s, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (max_0_f32_m_tied1, svfloat32_t, ++ z0 = svmax_n_f32_m (p0, z0, 0), ++ z0 = svmax_m (p0, z0, 0)) ++ ++/* ++** max_0_f32_m_untied: ++** movprfx z0, z1 ++** fmax z0\.s, p0/m, z0\.s, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (max_0_f32_m_untied, svfloat32_t, ++ z0 = svmax_n_f32_m (p0, z1, 0), ++ z0 = svmax_m (p0, z1, 0)) ++ ++/* ++** max_1_f32_m_tied1: ++** fmax z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_f32_m_tied1, svfloat32_t, ++ z0 = svmax_n_f32_m (p0, z0, 1), ++ z0 = svmax_m (p0, z0, 1)) ++ ++/* ++** max_1_f32_m_untied: ++** movprfx z0, z1 ++** fmax 
z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_f32_m_untied, svfloat32_t, ++ z0 = svmax_n_f32_m (p0, z1, 1), ++ z0 = svmax_m (p0, z1, 1)) ++ ++/* ++** max_2_f32_m: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** fmax z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_2_f32_m, svfloat32_t, ++ z0 = svmax_n_f32_m (p0, z0, 2), ++ z0 = svmax_m (p0, z0, 2)) ++ ++/* ++** max_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fmax z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (max_f32_z_tied1, svfloat32_t, ++ z0 = svmax_f32_z (p0, z0, z1), ++ z0 = svmax_z (p0, z0, z1)) ++ ++/* ++** max_f32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** fmax z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (max_f32_z_tied2, svfloat32_t, ++ z0 = svmax_f32_z (p0, z1, z0), ++ z0 = svmax_z (p0, z1, z0)) ++ ++/* ++** max_f32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fmax z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** fmax z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (max_f32_z_untied, svfloat32_t, ++ z0 = svmax_f32_z (p0, z1, z2), ++ z0 = svmax_z (p0, z1, z2)) ++ ++/* ++** max_s4_f32_z_tied1: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0\.s, p0/z, z0\.s ++** fmax z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (max_s4_f32_z_tied1, svfloat32_t, float, ++ z0 = svmax_n_f32_z (p0, z0, d4), ++ z0 = svmax_z (p0, z0, d4)) ++ ++/* ++** max_s4_f32_z_untied: ++** mov (z[0-9]+\.s), s4 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fmax z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** fmax z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (max_s4_f32_z_untied, svfloat32_t, float, ++ z0 = svmax_n_f32_z (p0, z1, d4), ++ z0 = svmax_z (p0, z1, d4)) ++ ++/* ++** max_0_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fmax z0\.s, p0/m, z0\.s, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (max_0_f32_z_tied1, svfloat32_t, ++ z0 = svmax_n_f32_z (p0, z0, 0), ++ z0 = svmax_z (p0, z0, 0)) ++ ++/* ++** max_0_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** fmax z0\.s, p0/m, z0\.s, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (max_0_f32_z_untied, svfloat32_t, ++ z0 = svmax_n_f32_z (p0, z1, 0), ++ z0 = svmax_z (p0, z1, 0)) ++ ++/* ++** max_1_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fmax z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_f32_z_tied1, svfloat32_t, ++ z0 = svmax_n_f32_z (p0, z0, 1), ++ z0 = svmax_z (p0, z0, 1)) ++ ++/* ++** max_1_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** fmax z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_f32_z_untied, svfloat32_t, ++ z0 = svmax_n_f32_z (p0, z1, 1), ++ z0 = svmax_z (p0, z1, 1)) ++ ++/* ++** max_2_f32_z: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
++** movprfx z0\.s, p0/z, z0\.s ++** fmax z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_2_f32_z, svfloat32_t, ++ z0 = svmax_n_f32_z (p0, z0, 2), ++ z0 = svmax_z (p0, z0, 2)) ++ ++/* ++** max_f32_x_tied1: ++** fmax z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (max_f32_x_tied1, svfloat32_t, ++ z0 = svmax_f32_x (p0, z0, z1), ++ z0 = svmax_x (p0, z0, z1)) ++ ++/* ++** max_f32_x_tied2: ++** fmax z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (max_f32_x_tied2, svfloat32_t, ++ z0 = svmax_f32_x (p0, z1, z0), ++ z0 = svmax_x (p0, z1, z0)) ++ ++/* ++** max_f32_x_untied: ++** ( ++** movprfx z0, z1 ++** fmax z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0, z2 ++** fmax z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (max_f32_x_untied, svfloat32_t, ++ z0 = svmax_f32_x (p0, z1, z2), ++ z0 = svmax_x (p0, z1, z2)) ++ ++/* ++** max_s4_f32_x_tied1: ++** mov (z[0-9]+\.s), s4 ++** fmax z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (max_s4_f32_x_tied1, svfloat32_t, float, ++ z0 = svmax_n_f32_x (p0, z0, d4), ++ z0 = svmax_x (p0, z0, d4)) ++ ++/* ++** max_s4_f32_x_untied: ++** mov z0\.s, s4 ++** fmax z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZD (max_s4_f32_x_untied, svfloat32_t, float, ++ z0 = svmax_n_f32_x (p0, z1, d4), ++ z0 = svmax_x (p0, z1, d4)) ++ ++/* ++** max_0_f32_x_tied1: ++** fmax z0\.s, p0/m, z0\.s, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (max_0_f32_x_tied1, svfloat32_t, ++ z0 = svmax_n_f32_x (p0, z0, 0), ++ z0 = svmax_x (p0, z0, 0)) ++ ++/* ++** max_0_f32_x_untied: ++** movprfx z0, z1 ++** fmax z0\.s, p0/m, z0\.s, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (max_0_f32_x_untied, svfloat32_t, ++ z0 = svmax_n_f32_x (p0, z1, 0), ++ z0 = svmax_x (p0, z1, 0)) ++ ++/* ++** max_1_f32_x_tied1: ++** fmax z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_f32_x_tied1, svfloat32_t, ++ z0 = svmax_n_f32_x (p0, z0, 1), ++ z0 = svmax_x (p0, z0, 1)) ++ ++/* ++** max_1_f32_x_untied: ++** movprfx z0, z1 ++** fmax z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_f32_x_untied, svfloat32_t, ++ z0 = svmax_n_f32_x (p0, z1, 1), ++ z0 = svmax_x (p0, z1, 1)) ++ ++/* ++** max_2_f32_x_tied1: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** fmax z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_2_f32_x_tied1, svfloat32_t, ++ z0 = svmax_n_f32_x (p0, z0, 2), ++ z0 = svmax_x (p0, z0, 2)) ++ ++/* ++** max_2_f32_x_untied: ++** fmov z0\.s, #2\.0(?:e\+0)? ++** fmax z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (max_2_f32_x_untied, svfloat32_t, ++ z0 = svmax_n_f32_x (p0, z1, 2), ++ z0 = svmax_x (p0, z1, 2)) ++ ++/* ++** ptrue_max_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_max_f32_x_tied1, svfloat32_t, ++ z0 = svmax_f32_x (svptrue_b32 (), z0, z1), ++ z0 = svmax_x (svptrue_b32 (), z0, z1)) ++ ++/* ++** ptrue_max_f32_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_max_f32_x_tied2, svfloat32_t, ++ z0 = svmax_f32_x (svptrue_b32 (), z1, z0), ++ z0 = svmax_x (svptrue_b32 (), z1, z0)) ++ ++/* ++** ptrue_max_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_max_f32_x_untied, svfloat32_t, ++ z0 = svmax_f32_x (svptrue_b32 (), z1, z2), ++ z0 = svmax_x (svptrue_b32 (), z1, z2)) ++ ++/* ++** ptrue_max_0_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_max_0_f32_x_tied1, svfloat32_t, ++ z0 = svmax_n_f32_x (svptrue_b32 (), z0, 0), ++ z0 = svmax_x (svptrue_b32 (), z0, 0)) ++ ++/* ++** ptrue_max_0_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_max_0_f32_x_untied, svfloat32_t, ++ z0 = svmax_n_f32_x (svptrue_b32 (), z1, 0), ++ z0 = svmax_x (svptrue_b32 (), z1, 0)) ++ ++/* ++** ptrue_max_1_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_max_1_f32_x_tied1, svfloat32_t, ++ z0 = svmax_n_f32_x (svptrue_b32 (), z0, 1), ++ z0 = svmax_x (svptrue_b32 (), z0, 1)) ++ ++/* ++** ptrue_max_1_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_max_1_f32_x_untied, svfloat32_t, ++ z0 = svmax_n_f32_x (svptrue_b32 (), z1, 1), ++ z0 = svmax_x (svptrue_b32 (), z1, 1)) ++ ++/* ++** ptrue_max_2_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_max_2_f32_x_tied1, svfloat32_t, ++ z0 = svmax_n_f32_x (svptrue_b32 (), z0, 2), ++ z0 = svmax_x (svptrue_b32 (), z0, 2)) ++ ++/* ++** ptrue_max_2_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_max_2_f32_x_untied, svfloat32_t, ++ z0 = svmax_n_f32_x (svptrue_b32 (), z1, 2), ++ z0 = svmax_x (svptrue_b32 (), z1, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_f64.c +new file mode 100644 +index 000000000..8ac6cca75 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_f64.c +@@ -0,0 +1,425 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** max_f64_m_tied1: ++** fmax z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (max_f64_m_tied1, svfloat64_t, ++ z0 = svmax_f64_m (p0, z0, z1), ++ z0 = svmax_m (p0, z0, z1)) ++ ++/* ++** max_f64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fmax z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_f64_m_tied2, svfloat64_t, ++ z0 = svmax_f64_m (p0, z1, z0), ++ z0 = svmax_m (p0, z1, z0)) ++ ++/* ++** max_f64_m_untied: ++** movprfx z0, z1 ++** fmax z0\.d, p0/m, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (max_f64_m_untied, svfloat64_t, ++ z0 = svmax_f64_m (p0, z1, z2), ++ z0 = svmax_m (p0, z1, z2)) ++ ++/* ++** max_d4_f64_m_tied1: ++** mov (z[0-9]+\.d), d4 ++** fmax z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (max_d4_f64_m_tied1, svfloat64_t, double, ++ z0 = svmax_n_f64_m (p0, z0, d4), ++ z0 = svmax_m (p0, z0, d4)) ++ ++/* ++** max_d4_f64_m_untied: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0, z1 ++** fmax z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (max_d4_f64_m_untied, svfloat64_t, double, ++ z0 = svmax_n_f64_m (p0, z1, d4), ++ z0 = svmax_m (p0, z1, d4)) ++ ++/* ++** max_0_f64_m_tied1: ++** fmax z0\.d, p0/m, z0\.d, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (max_0_f64_m_tied1, svfloat64_t, ++ z0 = svmax_n_f64_m (p0, z0, 0), ++ z0 = svmax_m (p0, z0, 0)) ++ ++/* ++** max_0_f64_m_untied: ++** movprfx z0, z1 ++** fmax z0\.d, p0/m, z0\.d, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (max_0_f64_m_untied, svfloat64_t, ++ z0 = svmax_n_f64_m (p0, z1, 0), ++ z0 = svmax_m (p0, z1, 0)) ++ ++/* ++** max_1_f64_m_tied1: ++** fmax z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_f64_m_tied1, svfloat64_t, ++ z0 = svmax_n_f64_m (p0, z0, 1), ++ z0 = svmax_m (p0, z0, 1)) ++ ++/* ++** max_1_f64_m_untied: ++** movprfx z0, z1 ++** fmax 
z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_f64_m_untied, svfloat64_t, ++ z0 = svmax_n_f64_m (p0, z1, 1), ++ z0 = svmax_m (p0, z1, 1)) ++ ++/* ++** max_2_f64_m: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** fmax z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_2_f64_m, svfloat64_t, ++ z0 = svmax_n_f64_m (p0, z0, 2), ++ z0 = svmax_m (p0, z0, 2)) ++ ++/* ++** max_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fmax z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (max_f64_z_tied1, svfloat64_t, ++ z0 = svmax_f64_z (p0, z0, z1), ++ z0 = svmax_z (p0, z0, z1)) ++ ++/* ++** max_f64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** fmax z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (max_f64_z_tied2, svfloat64_t, ++ z0 = svmax_f64_z (p0, z1, z0), ++ z0 = svmax_z (p0, z1, z0)) ++ ++/* ++** max_f64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fmax z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** fmax z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (max_f64_z_untied, svfloat64_t, ++ z0 = svmax_f64_z (p0, z1, z2), ++ z0 = svmax_z (p0, z1, z2)) ++ ++/* ++** max_d4_f64_z_tied1: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0\.d, p0/z, z0\.d ++** fmax z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (max_d4_f64_z_tied1, svfloat64_t, double, ++ z0 = svmax_n_f64_z (p0, z0, d4), ++ z0 = svmax_z (p0, z0, d4)) ++ ++/* ++** max_d4_f64_z_untied: ++** mov (z[0-9]+\.d), d4 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fmax z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** fmax z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (max_d4_f64_z_untied, svfloat64_t, double, ++ z0 = svmax_n_f64_z (p0, z1, d4), ++ z0 = svmax_z (p0, z1, d4)) ++ ++/* ++** max_0_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fmax z0\.d, p0/m, z0\.d, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (max_0_f64_z_tied1, svfloat64_t, ++ z0 = svmax_n_f64_z (p0, z0, 0), ++ z0 = svmax_z (p0, z0, 0)) ++ ++/* ++** max_0_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** fmax z0\.d, p0/m, z0\.d, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (max_0_f64_z_untied, svfloat64_t, ++ z0 = svmax_n_f64_z (p0, z1, 0), ++ z0 = svmax_z (p0, z1, 0)) ++ ++/* ++** max_1_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fmax z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_f64_z_tied1, svfloat64_t, ++ z0 = svmax_n_f64_z (p0, z0, 1), ++ z0 = svmax_z (p0, z0, 1)) ++ ++/* ++** max_1_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** fmax z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_f64_z_untied, svfloat64_t, ++ z0 = svmax_n_f64_z (p0, z1, 1), ++ z0 = svmax_z (p0, z1, 1)) ++ ++/* ++** max_2_f64_z: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
++** movprfx z0\.d, p0/z, z0\.d ++** fmax z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_2_f64_z, svfloat64_t, ++ z0 = svmax_n_f64_z (p0, z0, 2), ++ z0 = svmax_z (p0, z0, 2)) ++ ++/* ++** max_f64_x_tied1: ++** fmax z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (max_f64_x_tied1, svfloat64_t, ++ z0 = svmax_f64_x (p0, z0, z1), ++ z0 = svmax_x (p0, z0, z1)) ++ ++/* ++** max_f64_x_tied2: ++** fmax z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (max_f64_x_tied2, svfloat64_t, ++ z0 = svmax_f64_x (p0, z1, z0), ++ z0 = svmax_x (p0, z1, z0)) ++ ++/* ++** max_f64_x_untied: ++** ( ++** movprfx z0, z1 ++** fmax z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0, z2 ++** fmax z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (max_f64_x_untied, svfloat64_t, ++ z0 = svmax_f64_x (p0, z1, z2), ++ z0 = svmax_x (p0, z1, z2)) ++ ++/* ++** max_d4_f64_x_tied1: ++** mov (z[0-9]+\.d), d4 ++** fmax z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (max_d4_f64_x_tied1, svfloat64_t, double, ++ z0 = svmax_n_f64_x (p0, z0, d4), ++ z0 = svmax_x (p0, z0, d4)) ++ ++/* ++** max_d4_f64_x_untied: ++** mov z0\.d, d4 ++** fmax z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZD (max_d4_f64_x_untied, svfloat64_t, double, ++ z0 = svmax_n_f64_x (p0, z1, d4), ++ z0 = svmax_x (p0, z1, d4)) ++ ++/* ++** max_0_f64_x_tied1: ++** fmax z0\.d, p0/m, z0\.d, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (max_0_f64_x_tied1, svfloat64_t, ++ z0 = svmax_n_f64_x (p0, z0, 0), ++ z0 = svmax_x (p0, z0, 0)) ++ ++/* ++** max_0_f64_x_untied: ++** movprfx z0, z1 ++** fmax z0\.d, p0/m, z0\.d, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (max_0_f64_x_untied, svfloat64_t, ++ z0 = svmax_n_f64_x (p0, z1, 0), ++ z0 = svmax_x (p0, z1, 0)) ++ ++/* ++** max_1_f64_x_tied1: ++** fmax z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_f64_x_tied1, svfloat64_t, ++ z0 = svmax_n_f64_x (p0, z0, 1), ++ z0 = svmax_x (p0, z0, 1)) ++ ++/* ++** max_1_f64_x_untied: ++** movprfx z0, z1 ++** fmax z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_f64_x_untied, svfloat64_t, ++ z0 = svmax_n_f64_x (p0, z1, 1), ++ z0 = svmax_x (p0, z1, 1)) ++ ++/* ++** max_2_f64_x_tied1: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** fmax z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_2_f64_x_tied1, svfloat64_t, ++ z0 = svmax_n_f64_x (p0, z0, 2), ++ z0 = svmax_x (p0, z0, 2)) ++ ++/* ++** max_2_f64_x_untied: ++** fmov z0\.d, #2\.0(?:e\+0)? ++** fmax z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (max_2_f64_x_untied, svfloat64_t, ++ z0 = svmax_n_f64_x (p0, z1, 2), ++ z0 = svmax_x (p0, z1, 2)) ++ ++/* ++** ptrue_max_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_max_f64_x_tied1, svfloat64_t, ++ z0 = svmax_f64_x (svptrue_b64 (), z0, z1), ++ z0 = svmax_x (svptrue_b64 (), z0, z1)) ++ ++/* ++** ptrue_max_f64_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_max_f64_x_tied2, svfloat64_t, ++ z0 = svmax_f64_x (svptrue_b64 (), z1, z0), ++ z0 = svmax_x (svptrue_b64 (), z1, z0)) ++ ++/* ++** ptrue_max_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_max_f64_x_untied, svfloat64_t, ++ z0 = svmax_f64_x (svptrue_b64 (), z1, z2), ++ z0 = svmax_x (svptrue_b64 (), z1, z2)) ++ ++/* ++** ptrue_max_0_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_max_0_f64_x_tied1, svfloat64_t, ++ z0 = svmax_n_f64_x (svptrue_b64 (), z0, 0), ++ z0 = svmax_x (svptrue_b64 (), z0, 0)) ++ ++/* ++** ptrue_max_0_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_max_0_f64_x_untied, svfloat64_t, ++ z0 = svmax_n_f64_x (svptrue_b64 (), z1, 0), ++ z0 = svmax_x (svptrue_b64 (), z1, 0)) ++ ++/* ++** ptrue_max_1_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_max_1_f64_x_tied1, svfloat64_t, ++ z0 = svmax_n_f64_x (svptrue_b64 (), z0, 1), ++ z0 = svmax_x (svptrue_b64 (), z0, 1)) ++ ++/* ++** ptrue_max_1_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_max_1_f64_x_untied, svfloat64_t, ++ z0 = svmax_n_f64_x (svptrue_b64 (), z1, 1), ++ z0 = svmax_x (svptrue_b64 (), z1, 1)) ++ ++/* ++** ptrue_max_2_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_max_2_f64_x_tied1, svfloat64_t, ++ z0 = svmax_n_f64_x (svptrue_b64 (), z0, 2), ++ z0 = svmax_x (svptrue_b64 (), z0, 2)) ++ ++/* ++** ptrue_max_2_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_max_2_f64_x_untied, svfloat64_t, ++ z0 = svmax_n_f64_x (svptrue_b64 (), z1, 2), ++ z0 = svmax_x (svptrue_b64 (), z1, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_s16.c +new file mode 100644 +index 000000000..6a2167522 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_s16.c +@@ -0,0 +1,293 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** max_s16_m_tied1: ++** smax z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (max_s16_m_tied1, svint16_t, ++ z0 = svmax_s16_m (p0, z0, z1), ++ z0 = svmax_m (p0, z0, z1)) ++ ++/* ++** max_s16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** smax z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (max_s16_m_tied2, svint16_t, ++ z0 = svmax_s16_m (p0, z1, z0), ++ z0 = svmax_m (p0, z1, z0)) ++ ++/* ++** max_s16_m_untied: ++** movprfx z0, z1 ++** smax z0\.h, p0/m, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (max_s16_m_untied, svint16_t, ++ z0 = svmax_s16_m (p0, z1, z2), ++ z0 = svmax_m (p0, z1, z2)) ++ ++/* ++** max_w0_s16_m_tied1: ++** mov (z[0-9]+\.h), w0 ++** smax z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (max_w0_s16_m_tied1, svint16_t, int16_t, ++ z0 = svmax_n_s16_m (p0, z0, x0), ++ z0 = svmax_m (p0, z0, x0)) ++ ++/* ++** max_w0_s16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), w0 ++** movprfx z0, z1 ++** smax z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (max_w0_s16_m_untied, svint16_t, int16_t, ++ z0 = svmax_n_s16_m (p0, z1, x0), ++ z0 = svmax_m (p0, z1, x0)) ++ ++/* ++** max_1_s16_m_tied1: ++** mov (z[0-9]+\.h), #1 ++** smax z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_s16_m_tied1, svint16_t, ++ z0 = svmax_n_s16_m (p0, z0, 1), ++ z0 = svmax_m (p0, z0, 1)) ++ ++/* ++** max_1_s16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), #1 ++** movprfx z0, z1 ++** smax z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_s16_m_untied, svint16_t, ++ z0 = svmax_n_s16_m (p0, z1, 1), ++ z0 = svmax_m (p0, z1, 1)) ++ ++/* ++** max_m1_s16_m: ++** mov (z[0-9]+)\.b, #-1 ++** smax z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (max_m1_s16_m, svint16_t, ++ z0 = svmax_n_s16_m (p0, z0, -1), ++ z0 = 
svmax_m (p0, z0, -1)) ++ ++/* ++** max_s16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** smax z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (max_s16_z_tied1, svint16_t, ++ z0 = svmax_s16_z (p0, z0, z1), ++ z0 = svmax_z (p0, z0, z1)) ++ ++/* ++** max_s16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** smax z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (max_s16_z_tied2, svint16_t, ++ z0 = svmax_s16_z (p0, z1, z0), ++ z0 = svmax_z (p0, z1, z0)) ++ ++/* ++** max_s16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** smax z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** smax z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (max_s16_z_untied, svint16_t, ++ z0 = svmax_s16_z (p0, z1, z2), ++ z0 = svmax_z (p0, z1, z2)) ++ ++/* ++** max_w0_s16_z_tied1: ++** mov (z[0-9]+\.h), w0 ++** movprfx z0\.h, p0/z, z0\.h ++** smax z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (max_w0_s16_z_tied1, svint16_t, int16_t, ++ z0 = svmax_n_s16_z (p0, z0, x0), ++ z0 = svmax_z (p0, z0, x0)) ++ ++/* ++** max_w0_s16_z_untied: ++** mov (z[0-9]+\.h), w0 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** smax z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** smax z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (max_w0_s16_z_untied, svint16_t, int16_t, ++ z0 = svmax_n_s16_z (p0, z1, x0), ++ z0 = svmax_z (p0, z1, x0)) ++ ++/* ++** max_1_s16_z_tied1: ++** mov (z[0-9]+\.h), #1 ++** movprfx z0\.h, p0/z, z0\.h ++** smax z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_s16_z_tied1, svint16_t, ++ z0 = svmax_n_s16_z (p0, z0, 1), ++ z0 = svmax_z (p0, z0, 1)) ++ ++/* ++** max_1_s16_z_untied: ++** mov (z[0-9]+\.h), #1 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** smax z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** smax z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_s16_z_untied, svint16_t, ++ z0 = svmax_n_s16_z (p0, z1, 1), ++ z0 = svmax_z (p0, z1, 1)) ++ ++/* ++** max_s16_x_tied1: ++** smax z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (max_s16_x_tied1, svint16_t, ++ z0 = svmax_s16_x (p0, z0, z1), ++ z0 = svmax_x (p0, z0, z1)) ++ ++/* ++** max_s16_x_tied2: ++** smax z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (max_s16_x_tied2, svint16_t, ++ z0 = svmax_s16_x (p0, z1, z0), ++ z0 = svmax_x (p0, z1, z0)) ++ ++/* ++** max_s16_x_untied: ++** ( ++** movprfx z0, z1 ++** smax z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0, z2 ++** smax z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (max_s16_x_untied, svint16_t, ++ z0 = svmax_s16_x (p0, z1, z2), ++ z0 = svmax_x (p0, z1, z2)) ++ ++/* ++** max_w0_s16_x_tied1: ++** mov (z[0-9]+\.h), w0 ++** smax z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (max_w0_s16_x_tied1, svint16_t, int16_t, ++ z0 = svmax_n_s16_x (p0, z0, x0), ++ z0 = svmax_x (p0, z0, x0)) ++ ++/* ++** max_w0_s16_x_untied: ++** mov z0\.h, w0 ++** smax z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZX (max_w0_s16_x_untied, svint16_t, int16_t, ++ z0 = svmax_n_s16_x (p0, z1, x0), ++ z0 = svmax_x (p0, z1, x0)) ++ ++/* ++** max_1_s16_x_tied1: ++** smax z0\.h, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_s16_x_tied1, svint16_t, ++ z0 = svmax_n_s16_x (p0, z0, 1), ++ z0 = svmax_x (p0, z0, 1)) ++ ++/* ++** max_1_s16_x_untied: ++** movprfx z0, z1 ++** smax z0\.h, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_s16_x_untied, svint16_t, ++ z0 = svmax_n_s16_x (p0, z1, 1), ++ z0 = svmax_x (p0, z1, 1)) ++ ++/* ++** max_127_s16_x: ++** smax z0\.h, z0\.h, 
#127 ++** ret ++*/ ++TEST_UNIFORM_Z (max_127_s16_x, svint16_t, ++ z0 = svmax_n_s16_x (p0, z0, 127), ++ z0 = svmax_x (p0, z0, 127)) ++ ++/* ++** max_128_s16_x: ++** mov (z[0-9]+\.h), #128 ++** smax z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_128_s16_x, svint16_t, ++ z0 = svmax_n_s16_x (p0, z0, 128), ++ z0 = svmax_x (p0, z0, 128)) ++ ++/* ++** max_m1_s16_x: ++** smax z0\.h, z0\.h, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_m1_s16_x, svint16_t, ++ z0 = svmax_n_s16_x (p0, z0, -1), ++ z0 = svmax_x (p0, z0, -1)) ++ ++/* ++** max_m128_s16_x: ++** smax z0\.h, z0\.h, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (max_m128_s16_x, svint16_t, ++ z0 = svmax_n_s16_x (p0, z0, -128), ++ z0 = svmax_x (p0, z0, -128)) ++ ++/* ++** max_m129_s16_x: ++** mov (z[0-9]+\.h), #-129 ++** smax z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_m129_s16_x, svint16_t, ++ z0 = svmax_n_s16_x (p0, z0, -129), ++ z0 = svmax_x (p0, z0, -129)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_s32.c +new file mode 100644 +index 000000000..07402c7a9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_s32.c +@@ -0,0 +1,293 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** max_s32_m_tied1: ++** smax z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (max_s32_m_tied1, svint32_t, ++ z0 = svmax_s32_m (p0, z0, z1), ++ z0 = svmax_m (p0, z0, z1)) ++ ++/* ++** max_s32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** smax z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (max_s32_m_tied2, svint32_t, ++ z0 = svmax_s32_m (p0, z1, z0), ++ z0 = svmax_m (p0, z1, z0)) ++ ++/* ++** max_s32_m_untied: ++** movprfx z0, z1 ++** smax z0\.s, p0/m, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (max_s32_m_untied, svint32_t, ++ z0 = svmax_s32_m (p0, z1, z2), ++ z0 = svmax_m (p0, z1, z2)) ++ ++/* ++** max_w0_s32_m_tied1: ++** mov (z[0-9]+\.s), w0 ++** smax z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (max_w0_s32_m_tied1, svint32_t, int32_t, ++ z0 = svmax_n_s32_m (p0, z0, x0), ++ z0 = svmax_m (p0, z0, x0)) ++ ++/* ++** max_w0_s32_m_untied: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0, z1 ++** smax z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (max_w0_s32_m_untied, svint32_t, int32_t, ++ z0 = svmax_n_s32_m (p0, z1, x0), ++ z0 = svmax_m (p0, z1, x0)) ++ ++/* ++** max_1_s32_m_tied1: ++** mov (z[0-9]+\.s), #1 ++** smax z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_s32_m_tied1, svint32_t, ++ z0 = svmax_n_s32_m (p0, z0, 1), ++ z0 = svmax_m (p0, z0, 1)) ++ ++/* ++** max_1_s32_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.s), #1 ++** movprfx z0, z1 ++** smax z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_s32_m_untied, svint32_t, ++ z0 = svmax_n_s32_m (p0, z1, 1), ++ z0 = svmax_m (p0, z1, 1)) ++ ++/* ++** max_m1_s32_m: ++** mov (z[0-9]+)\.b, #-1 ++** smax z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (max_m1_s32_m, svint32_t, ++ z0 = svmax_n_s32_m (p0, z0, -1), ++ z0 = svmax_m (p0, z0, -1)) ++ ++/* ++** max_s32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** smax z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (max_s32_z_tied1, svint32_t, ++ z0 = svmax_s32_z (p0, z0, z1), ++ z0 = svmax_z (p0, z0, z1)) ++ ++/* ++** max_s32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** smax z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (max_s32_z_tied2, svint32_t, ++ z0 = svmax_s32_z (p0, z1, z0), ++ z0 = svmax_z (p0, 
z1, z0)) ++ ++/* ++** max_s32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** smax z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** smax z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (max_s32_z_untied, svint32_t, ++ z0 = svmax_s32_z (p0, z1, z2), ++ z0 = svmax_z (p0, z1, z2)) ++ ++/* ++** max_w0_s32_z_tied1: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0\.s, p0/z, z0\.s ++** smax z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (max_w0_s32_z_tied1, svint32_t, int32_t, ++ z0 = svmax_n_s32_z (p0, z0, x0), ++ z0 = svmax_z (p0, z0, x0)) ++ ++/* ++** max_w0_s32_z_untied: ++** mov (z[0-9]+\.s), w0 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** smax z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** smax z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (max_w0_s32_z_untied, svint32_t, int32_t, ++ z0 = svmax_n_s32_z (p0, z1, x0), ++ z0 = svmax_z (p0, z1, x0)) ++ ++/* ++** max_1_s32_z_tied1: ++** mov (z[0-9]+\.s), #1 ++** movprfx z0\.s, p0/z, z0\.s ++** smax z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_s32_z_tied1, svint32_t, ++ z0 = svmax_n_s32_z (p0, z0, 1), ++ z0 = svmax_z (p0, z0, 1)) ++ ++/* ++** max_1_s32_z_untied: ++** mov (z[0-9]+\.s), #1 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** smax z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** smax z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_s32_z_untied, svint32_t, ++ z0 = svmax_n_s32_z (p0, z1, 1), ++ z0 = svmax_z (p0, z1, 1)) ++ ++/* ++** max_s32_x_tied1: ++** smax z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (max_s32_x_tied1, svint32_t, ++ z0 = svmax_s32_x (p0, z0, z1), ++ z0 = svmax_x (p0, z0, z1)) ++ ++/* ++** max_s32_x_tied2: ++** smax z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (max_s32_x_tied2, svint32_t, ++ z0 = svmax_s32_x (p0, z1, z0), ++ z0 = svmax_x (p0, z1, z0)) ++ ++/* ++** max_s32_x_untied: ++** ( ++** movprfx z0, z1 ++** smax z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0, z2 ++** smax z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (max_s32_x_untied, svint32_t, ++ z0 = svmax_s32_x (p0, z1, z2), ++ z0 = svmax_x (p0, z1, z2)) ++ ++/* ++** max_w0_s32_x_tied1: ++** mov (z[0-9]+\.s), w0 ++** smax z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (max_w0_s32_x_tied1, svint32_t, int32_t, ++ z0 = svmax_n_s32_x (p0, z0, x0), ++ z0 = svmax_x (p0, z0, x0)) ++ ++/* ++** max_w0_s32_x_untied: ++** mov z0\.s, w0 ++** smax z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZX (max_w0_s32_x_untied, svint32_t, int32_t, ++ z0 = svmax_n_s32_x (p0, z1, x0), ++ z0 = svmax_x (p0, z1, x0)) ++ ++/* ++** max_1_s32_x_tied1: ++** smax z0\.s, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_s32_x_tied1, svint32_t, ++ z0 = svmax_n_s32_x (p0, z0, 1), ++ z0 = svmax_x (p0, z0, 1)) ++ ++/* ++** max_1_s32_x_untied: ++** movprfx z0, z1 ++** smax z0\.s, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_s32_x_untied, svint32_t, ++ z0 = svmax_n_s32_x (p0, z1, 1), ++ z0 = svmax_x (p0, z1, 1)) ++ ++/* ++** max_127_s32_x: ++** smax z0\.s, z0\.s, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (max_127_s32_x, svint32_t, ++ z0 = svmax_n_s32_x (p0, z0, 127), ++ z0 = svmax_x (p0, z0, 127)) ++ ++/* ++** max_128_s32_x: ++** mov (z[0-9]+\.s), #128 ++** smax z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_128_s32_x, svint32_t, ++ z0 = svmax_n_s32_x (p0, z0, 128), ++ z0 = svmax_x (p0, z0, 128)) ++ ++/* ++** max_m1_s32_x: ++** smax z0\.s, z0\.s, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_m1_s32_x, svint32_t, 
++ z0 = svmax_n_s32_x (p0, z0, -1), ++ z0 = svmax_x (p0, z0, -1)) ++ ++/* ++** max_m128_s32_x: ++** smax z0\.s, z0\.s, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (max_m128_s32_x, svint32_t, ++ z0 = svmax_n_s32_x (p0, z0, -128), ++ z0 = svmax_x (p0, z0, -128)) ++ ++/* ++** max_m129_s32_x: ++** mov (z[0-9]+\.s), #-129 ++** smax z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_m129_s32_x, svint32_t, ++ z0 = svmax_n_s32_x (p0, z0, -129), ++ z0 = svmax_x (p0, z0, -129)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_s64.c +new file mode 100644 +index 000000000..66f00fdf1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_s64.c +@@ -0,0 +1,293 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** max_s64_m_tied1: ++** smax z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (max_s64_m_tied1, svint64_t, ++ z0 = svmax_s64_m (p0, z0, z1), ++ z0 = svmax_m (p0, z0, z1)) ++ ++/* ++** max_s64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** smax z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_s64_m_tied2, svint64_t, ++ z0 = svmax_s64_m (p0, z1, z0), ++ z0 = svmax_m (p0, z1, z0)) ++ ++/* ++** max_s64_m_untied: ++** movprfx z0, z1 ++** smax z0\.d, p0/m, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (max_s64_m_untied, svint64_t, ++ z0 = svmax_s64_m (p0, z1, z2), ++ z0 = svmax_m (p0, z1, z2)) ++ ++/* ++** max_x0_s64_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** smax z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (max_x0_s64_m_tied1, svint64_t, int64_t, ++ z0 = svmax_n_s64_m (p0, z0, x0), ++ z0 = svmax_m (p0, z0, x0)) ++ ++/* ++** max_x0_s64_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** smax z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (max_x0_s64_m_untied, svint64_t, int64_t, ++ z0 = svmax_n_s64_m (p0, z1, x0), ++ z0 = svmax_m (p0, z1, x0)) ++ ++/* ++** max_1_s64_m_tied1: ++** mov (z[0-9]+\.d), #1 ++** smax z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_s64_m_tied1, svint64_t, ++ z0 = svmax_n_s64_m (p0, z0, 1), ++ z0 = svmax_m (p0, z0, 1)) ++ ++/* ++** max_1_s64_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #1 ++** movprfx z0, z1 ++** smax z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_s64_m_untied, svint64_t, ++ z0 = svmax_n_s64_m (p0, z1, 1), ++ z0 = svmax_m (p0, z1, 1)) ++ ++/* ++** max_m1_s64_m: ++** mov (z[0-9]+)\.b, #-1 ++** smax z0\.d, p0/m, z0\.d, \1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (max_m1_s64_m, svint64_t, ++ z0 = svmax_n_s64_m (p0, z0, -1), ++ z0 = svmax_m (p0, z0, -1)) ++ ++/* ++** max_s64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** smax z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (max_s64_z_tied1, svint64_t, ++ z0 = svmax_s64_z (p0, z0, z1), ++ z0 = svmax_z (p0, z0, z1)) ++ ++/* ++** max_s64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** smax z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (max_s64_z_tied2, svint64_t, ++ z0 = svmax_s64_z (p0, z1, z0), ++ z0 = svmax_z (p0, z1, z0)) ++ ++/* ++** max_s64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** smax z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** smax z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (max_s64_z_untied, svint64_t, ++ z0 = svmax_s64_z (p0, z1, z2), ++ z0 = svmax_z (p0, z1, z2)) ++ ++/* ++** max_x0_s64_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z0\.d ++** smax z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ 
++TEST_UNIFORM_ZX (max_x0_s64_z_tied1, svint64_t, int64_t, ++ z0 = svmax_n_s64_z (p0, z0, x0), ++ z0 = svmax_z (p0, z0, x0)) ++ ++/* ++** max_x0_s64_z_untied: ++** mov (z[0-9]+\.d), x0 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** smax z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** smax z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (max_x0_s64_z_untied, svint64_t, int64_t, ++ z0 = svmax_n_s64_z (p0, z1, x0), ++ z0 = svmax_z (p0, z1, x0)) ++ ++/* ++** max_1_s64_z_tied1: ++** mov (z[0-9]+\.d), #1 ++** movprfx z0\.d, p0/z, z0\.d ++** smax z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_s64_z_tied1, svint64_t, ++ z0 = svmax_n_s64_z (p0, z0, 1), ++ z0 = svmax_z (p0, z0, 1)) ++ ++/* ++** max_1_s64_z_untied: ++** mov (z[0-9]+\.d), #1 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** smax z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** smax z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_s64_z_untied, svint64_t, ++ z0 = svmax_n_s64_z (p0, z1, 1), ++ z0 = svmax_z (p0, z1, 1)) ++ ++/* ++** max_s64_x_tied1: ++** smax z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (max_s64_x_tied1, svint64_t, ++ z0 = svmax_s64_x (p0, z0, z1), ++ z0 = svmax_x (p0, z0, z1)) ++ ++/* ++** max_s64_x_tied2: ++** smax z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (max_s64_x_tied2, svint64_t, ++ z0 = svmax_s64_x (p0, z1, z0), ++ z0 = svmax_x (p0, z1, z0)) ++ ++/* ++** max_s64_x_untied: ++** ( ++** movprfx z0, z1 ++** smax z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0, z2 ++** smax z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (max_s64_x_untied, svint64_t, ++ z0 = svmax_s64_x (p0, z1, z2), ++ z0 = svmax_x (p0, z1, z2)) ++ ++/* ++** max_x0_s64_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** smax z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (max_x0_s64_x_tied1, svint64_t, int64_t, ++ z0 = svmax_n_s64_x (p0, z0, x0), ++ z0 = svmax_x (p0, z0, x0)) ++ ++/* ++** max_x0_s64_x_untied: ++** mov z0\.d, x0 ++** smax z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (max_x0_s64_x_untied, svint64_t, int64_t, ++ z0 = svmax_n_s64_x (p0, z1, x0), ++ z0 = svmax_x (p0, z1, x0)) ++ ++/* ++** max_1_s64_x_tied1: ++** smax z0\.d, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_s64_x_tied1, svint64_t, ++ z0 = svmax_n_s64_x (p0, z0, 1), ++ z0 = svmax_x (p0, z0, 1)) ++ ++/* ++** max_1_s64_x_untied: ++** movprfx z0, z1 ++** smax z0\.d, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_s64_x_untied, svint64_t, ++ z0 = svmax_n_s64_x (p0, z1, 1), ++ z0 = svmax_x (p0, z1, 1)) ++ ++/* ++** max_127_s64_x: ++** smax z0\.d, z0\.d, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (max_127_s64_x, svint64_t, ++ z0 = svmax_n_s64_x (p0, z0, 127), ++ z0 = svmax_x (p0, z0, 127)) ++ ++/* ++** max_128_s64_x: ++** mov (z[0-9]+\.d), #128 ++** smax z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_128_s64_x, svint64_t, ++ z0 = svmax_n_s64_x (p0, z0, 128), ++ z0 = svmax_x (p0, z0, 128)) ++ ++/* ++** max_m1_s64_x: ++** smax z0\.d, z0\.d, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_m1_s64_x, svint64_t, ++ z0 = svmax_n_s64_x (p0, z0, -1), ++ z0 = svmax_x (p0, z0, -1)) ++ ++/* ++** max_m128_s64_x: ++** smax z0\.d, z0\.d, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (max_m128_s64_x, svint64_t, ++ z0 = svmax_n_s64_x (p0, z0, -128), ++ z0 = svmax_x (p0, z0, -128)) ++ ++/* ++** max_m129_s64_x: ++** mov (z[0-9]+\.d), #-129 ++** smax z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_m129_s64_x, svint64_t, ++ z0 = svmax_n_s64_x (p0, z0, -129), ++ z0 = svmax_x 
(p0, z0, -129)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_s8.c +new file mode 100644 +index 000000000..c651a26f0 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_s8.c +@@ -0,0 +1,273 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** max_s8_m_tied1: ++** smax z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (max_s8_m_tied1, svint8_t, ++ z0 = svmax_s8_m (p0, z0, z1), ++ z0 = svmax_m (p0, z0, z1)) ++ ++/* ++** max_s8_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** smax z0\.b, p0/m, z0\.b, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (max_s8_m_tied2, svint8_t, ++ z0 = svmax_s8_m (p0, z1, z0), ++ z0 = svmax_m (p0, z1, z0)) ++ ++/* ++** max_s8_m_untied: ++** movprfx z0, z1 ++** smax z0\.b, p0/m, z0\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (max_s8_m_untied, svint8_t, ++ z0 = svmax_s8_m (p0, z1, z2), ++ z0 = svmax_m (p0, z1, z2)) ++ ++/* ++** max_w0_s8_m_tied1: ++** mov (z[0-9]+\.b), w0 ++** smax z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (max_w0_s8_m_tied1, svint8_t, int8_t, ++ z0 = svmax_n_s8_m (p0, z0, x0), ++ z0 = svmax_m (p0, z0, x0)) ++ ++/* ++** max_w0_s8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), w0 ++** movprfx z0, z1 ++** smax z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (max_w0_s8_m_untied, svint8_t, int8_t, ++ z0 = svmax_n_s8_m (p0, z1, x0), ++ z0 = svmax_m (p0, z1, x0)) ++ ++/* ++** max_1_s8_m_tied1: ++** mov (z[0-9]+\.b), #1 ++** smax z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_s8_m_tied1, svint8_t, ++ z0 = svmax_n_s8_m (p0, z0, 1), ++ z0 = svmax_m (p0, z0, 1)) ++ ++/* ++** max_1_s8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), #1 ++** movprfx z0, z1 ++** smax z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_s8_m_untied, svint8_t, ++ z0 = svmax_n_s8_m (p0, z1, 1), ++ z0 = svmax_m (p0, z1, 1)) ++ ++/* ++** max_m1_s8_m: ++** mov (z[0-9]+\.b), #-1 ++** smax z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_m1_s8_m, svint8_t, ++ z0 = svmax_n_s8_m (p0, z0, -1), ++ z0 = svmax_m (p0, z0, -1)) ++ ++/* ++** max_s8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** smax z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (max_s8_z_tied1, svint8_t, ++ z0 = svmax_s8_z (p0, z0, z1), ++ z0 = svmax_z (p0, z0, z1)) ++ ++/* ++** max_s8_z_tied2: ++** movprfx z0\.b, p0/z, z0\.b ++** smax z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (max_s8_z_tied2, svint8_t, ++ z0 = svmax_s8_z (p0, z1, z0), ++ z0 = svmax_z (p0, z1, z0)) ++ ++/* ++** max_s8_z_untied: ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** smax z0\.b, p0/m, z0\.b, z2\.b ++** | ++** movprfx z0\.b, p0/z, z2\.b ++** smax z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (max_s8_z_untied, svint8_t, ++ z0 = svmax_s8_z (p0, z1, z2), ++ z0 = svmax_z (p0, z1, z2)) ++ ++/* ++** max_w0_s8_z_tied1: ++** mov (z[0-9]+\.b), w0 ++** movprfx z0\.b, p0/z, z0\.b ++** smax z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (max_w0_s8_z_tied1, svint8_t, int8_t, ++ z0 = svmax_n_s8_z (p0, z0, x0), ++ z0 = svmax_z (p0, z0, x0)) ++ ++/* ++** max_w0_s8_z_untied: ++** mov (z[0-9]+\.b), w0 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** smax z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** smax z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (max_w0_s8_z_untied, svint8_t, int8_t, ++ z0 = svmax_n_s8_z (p0, z1, x0), ++ z0 = svmax_z (p0, z1, x0)) ++ ++/* ++** 
max_1_s8_z_tied1: ++** mov (z[0-9]+\.b), #1 ++** movprfx z0\.b, p0/z, z0\.b ++** smax z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_s8_z_tied1, svint8_t, ++ z0 = svmax_n_s8_z (p0, z0, 1), ++ z0 = svmax_z (p0, z0, 1)) ++ ++/* ++** max_1_s8_z_untied: ++** mov (z[0-9]+\.b), #1 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** smax z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** smax z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_s8_z_untied, svint8_t, ++ z0 = svmax_n_s8_z (p0, z1, 1), ++ z0 = svmax_z (p0, z1, 1)) ++ ++/* ++** max_s8_x_tied1: ++** smax z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (max_s8_x_tied1, svint8_t, ++ z0 = svmax_s8_x (p0, z0, z1), ++ z0 = svmax_x (p0, z0, z1)) ++ ++/* ++** max_s8_x_tied2: ++** smax z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (max_s8_x_tied2, svint8_t, ++ z0 = svmax_s8_x (p0, z1, z0), ++ z0 = svmax_x (p0, z1, z0)) ++ ++/* ++** max_s8_x_untied: ++** ( ++** movprfx z0, z1 ++** smax z0\.b, p0/m, z0\.b, z2\.b ++** | ++** movprfx z0, z2 ++** smax z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (max_s8_x_untied, svint8_t, ++ z0 = svmax_s8_x (p0, z1, z2), ++ z0 = svmax_x (p0, z1, z2)) ++ ++/* ++** max_w0_s8_x_tied1: ++** mov (z[0-9]+\.b), w0 ++** smax z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (max_w0_s8_x_tied1, svint8_t, int8_t, ++ z0 = svmax_n_s8_x (p0, z0, x0), ++ z0 = svmax_x (p0, z0, x0)) ++ ++/* ++** max_w0_s8_x_untied: ++** mov z0\.b, w0 ++** smax z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_ZX (max_w0_s8_x_untied, svint8_t, int8_t, ++ z0 = svmax_n_s8_x (p0, z1, x0), ++ z0 = svmax_x (p0, z1, x0)) ++ ++/* ++** max_1_s8_x_tied1: ++** smax z0\.b, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_s8_x_tied1, svint8_t, ++ z0 = svmax_n_s8_x (p0, z0, 1), ++ z0 = svmax_x (p0, z0, 1)) ++ ++/* ++** max_1_s8_x_untied: ++** movprfx z0, z1 ++** smax z0\.b, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_s8_x_untied, svint8_t, ++ z0 = svmax_n_s8_x (p0, z1, 1), ++ z0 = svmax_x (p0, z1, 1)) ++ ++/* ++** max_127_s8_x: ++** smax z0\.b, z0\.b, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (max_127_s8_x, svint8_t, ++ z0 = svmax_n_s8_x (p0, z0, 127), ++ z0 = svmax_x (p0, z0, 127)) ++ ++/* ++** max_m1_s8_x: ++** smax z0\.b, z0\.b, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_m1_s8_x, svint8_t, ++ z0 = svmax_n_s8_x (p0, z0, -1), ++ z0 = svmax_x (p0, z0, -1)) ++ ++/* ++** max_m127_s8_x: ++** smax z0\.b, z0\.b, #-127 ++** ret ++*/ ++TEST_UNIFORM_Z (max_m127_s8_x, svint8_t, ++ z0 = svmax_n_s8_x (p0, z0, -127), ++ z0 = svmax_x (p0, z0, -127)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_u16.c +new file mode 100644 +index 000000000..9a0b95431 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_u16.c +@@ -0,0 +1,293 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** max_u16_m_tied1: ++** umax z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (max_u16_m_tied1, svuint16_t, ++ z0 = svmax_u16_m (p0, z0, z1), ++ z0 = svmax_m (p0, z0, z1)) ++ ++/* ++** max_u16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** umax z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (max_u16_m_tied2, svuint16_t, ++ z0 = svmax_u16_m (p0, z1, z0), ++ z0 = svmax_m (p0, z1, z0)) ++ ++/* ++** max_u16_m_untied: ++** movprfx z0, z1 ++** umax z0\.h, p0/m, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (max_u16_m_untied, svuint16_t, ++ z0 
= svmax_u16_m (p0, z1, z2), ++ z0 = svmax_m (p0, z1, z2)) ++ ++/* ++** max_w0_u16_m_tied1: ++** mov (z[0-9]+\.h), w0 ++** umax z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (max_w0_u16_m_tied1, svuint16_t, uint16_t, ++ z0 = svmax_n_u16_m (p0, z0, x0), ++ z0 = svmax_m (p0, z0, x0)) ++ ++/* ++** max_w0_u16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), w0 ++** movprfx z0, z1 ++** umax z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (max_w0_u16_m_untied, svuint16_t, uint16_t, ++ z0 = svmax_n_u16_m (p0, z1, x0), ++ z0 = svmax_m (p0, z1, x0)) ++ ++/* ++** max_1_u16_m_tied1: ++** mov (z[0-9]+\.h), #1 ++** umax z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_u16_m_tied1, svuint16_t, ++ z0 = svmax_n_u16_m (p0, z0, 1), ++ z0 = svmax_m (p0, z0, 1)) ++ ++/* ++** max_1_u16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), #1 ++** movprfx z0, z1 ++** umax z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_u16_m_untied, svuint16_t, ++ z0 = svmax_n_u16_m (p0, z1, 1), ++ z0 = svmax_m (p0, z1, 1)) ++ ++/* ++** max_m1_u16_m: ++** mov (z[0-9]+)\.b, #-1 ++** umax z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (max_m1_u16_m, svuint16_t, ++ z0 = svmax_n_u16_m (p0, z0, -1), ++ z0 = svmax_m (p0, z0, -1)) ++ ++/* ++** max_u16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** umax z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (max_u16_z_tied1, svuint16_t, ++ z0 = svmax_u16_z (p0, z0, z1), ++ z0 = svmax_z (p0, z0, z1)) ++ ++/* ++** max_u16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** umax z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (max_u16_z_tied2, svuint16_t, ++ z0 = svmax_u16_z (p0, z1, z0), ++ z0 = svmax_z (p0, z1, z0)) ++ ++/* ++** max_u16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** umax z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** umax z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (max_u16_z_untied, svuint16_t, ++ z0 = svmax_u16_z (p0, z1, z2), ++ z0 = svmax_z (p0, z1, z2)) ++ ++/* ++** max_w0_u16_z_tied1: ++** mov (z[0-9]+\.h), w0 ++** movprfx z0\.h, p0/z, z0\.h ++** umax z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (max_w0_u16_z_tied1, svuint16_t, uint16_t, ++ z0 = svmax_n_u16_z (p0, z0, x0), ++ z0 = svmax_z (p0, z0, x0)) ++ ++/* ++** max_w0_u16_z_untied: ++** mov (z[0-9]+\.h), w0 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** umax z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** umax z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (max_w0_u16_z_untied, svuint16_t, uint16_t, ++ z0 = svmax_n_u16_z (p0, z1, x0), ++ z0 = svmax_z (p0, z1, x0)) ++ ++/* ++** max_1_u16_z_tied1: ++** mov (z[0-9]+\.h), #1 ++** movprfx z0\.h, p0/z, z0\.h ++** umax z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_u16_z_tied1, svuint16_t, ++ z0 = svmax_n_u16_z (p0, z0, 1), ++ z0 = svmax_z (p0, z0, 1)) ++ ++/* ++** max_1_u16_z_untied: ++** mov (z[0-9]+\.h), #1 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** umax z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** umax z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_u16_z_untied, svuint16_t, ++ z0 = svmax_n_u16_z (p0, z1, 1), ++ z0 = svmax_z (p0, z1, 1)) ++ ++/* ++** max_u16_x_tied1: ++** umax z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (max_u16_x_tied1, svuint16_t, ++ z0 = svmax_u16_x (p0, z0, z1), ++ z0 = svmax_x (p0, z0, z1)) ++ ++/* ++** max_u16_x_tied2: ++** umax z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (max_u16_x_tied2, svuint16_t, ++ z0 = svmax_u16_x 
(p0, z1, z0), ++ z0 = svmax_x (p0, z1, z0)) ++ ++/* ++** max_u16_x_untied: ++** ( ++** movprfx z0, z1 ++** umax z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0, z2 ++** umax z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (max_u16_x_untied, svuint16_t, ++ z0 = svmax_u16_x (p0, z1, z2), ++ z0 = svmax_x (p0, z1, z2)) ++ ++/* ++** max_w0_u16_x_tied1: ++** mov (z[0-9]+\.h), w0 ++** umax z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (max_w0_u16_x_tied1, svuint16_t, uint16_t, ++ z0 = svmax_n_u16_x (p0, z0, x0), ++ z0 = svmax_x (p0, z0, x0)) ++ ++/* ++** max_w0_u16_x_untied: ++** mov z0\.h, w0 ++** umax z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZX (max_w0_u16_x_untied, svuint16_t, uint16_t, ++ z0 = svmax_n_u16_x (p0, z1, x0), ++ z0 = svmax_x (p0, z1, x0)) ++ ++/* ++** max_1_u16_x_tied1: ++** umax z0\.h, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_u16_x_tied1, svuint16_t, ++ z0 = svmax_n_u16_x (p0, z0, 1), ++ z0 = svmax_x (p0, z0, 1)) ++ ++/* ++** max_1_u16_x_untied: ++** movprfx z0, z1 ++** umax z0\.h, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_u16_x_untied, svuint16_t, ++ z0 = svmax_n_u16_x (p0, z1, 1), ++ z0 = svmax_x (p0, z1, 1)) ++ ++/* ++** max_127_u16_x: ++** umax z0\.h, z0\.h, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (max_127_u16_x, svuint16_t, ++ z0 = svmax_n_u16_x (p0, z0, 127), ++ z0 = svmax_x (p0, z0, 127)) ++ ++/* ++** max_128_u16_x: ++** umax z0\.h, z0\.h, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (max_128_u16_x, svuint16_t, ++ z0 = svmax_n_u16_x (p0, z0, 128), ++ z0 = svmax_x (p0, z0, 128)) ++ ++/* ++** max_255_u16_x: ++** umax z0\.h, z0\.h, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (max_255_u16_x, svuint16_t, ++ z0 = svmax_n_u16_x (p0, z0, 255), ++ z0 = svmax_x (p0, z0, 255)) ++ ++/* ++** max_256_u16_x: ++** mov (z[0-9]+\.h), #256 ++** umax z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_256_u16_x, svuint16_t, ++ z0 = svmax_n_u16_x (p0, z0, 256), ++ z0 = svmax_x (p0, z0, 256)) ++ ++/* ++** max_m2_u16_x: ++** mov (z[0-9]+\.h), #-2 ++** umax z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_m2_u16_x, svuint16_t, ++ z0 = svmax_n_u16_x (p0, z0, -2), ++ z0 = svmax_x (p0, z0, -2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_u32.c +new file mode 100644 +index 000000000..91eba25c1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_u32.c +@@ -0,0 +1,293 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** max_u32_m_tied1: ++** umax z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (max_u32_m_tied1, svuint32_t, ++ z0 = svmax_u32_m (p0, z0, z1), ++ z0 = svmax_m (p0, z0, z1)) ++ ++/* ++** max_u32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** umax z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (max_u32_m_tied2, svuint32_t, ++ z0 = svmax_u32_m (p0, z1, z0), ++ z0 = svmax_m (p0, z1, z0)) ++ ++/* ++** max_u32_m_untied: ++** movprfx z0, z1 ++** umax z0\.s, p0/m, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (max_u32_m_untied, svuint32_t, ++ z0 = svmax_u32_m (p0, z1, z2), ++ z0 = svmax_m (p0, z1, z2)) ++ ++/* ++** max_w0_u32_m_tied1: ++** mov (z[0-9]+\.s), w0 ++** umax z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (max_w0_u32_m_tied1, svuint32_t, uint32_t, ++ z0 = svmax_n_u32_m (p0, z0, x0), ++ z0 = svmax_m (p0, z0, x0)) ++ ++/* ++** max_w0_u32_m_untied: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0, z1 ++** umax z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ 
++TEST_UNIFORM_ZX (max_w0_u32_m_untied, svuint32_t, uint32_t, ++ z0 = svmax_n_u32_m (p0, z1, x0), ++ z0 = svmax_m (p0, z1, x0)) ++ ++/* ++** max_1_u32_m_tied1: ++** mov (z[0-9]+\.s), #1 ++** umax z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_u32_m_tied1, svuint32_t, ++ z0 = svmax_n_u32_m (p0, z0, 1), ++ z0 = svmax_m (p0, z0, 1)) ++ ++/* ++** max_1_u32_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.s), #1 ++** movprfx z0, z1 ++** umax z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_u32_m_untied, svuint32_t, ++ z0 = svmax_n_u32_m (p0, z1, 1), ++ z0 = svmax_m (p0, z1, 1)) ++ ++/* ++** max_m1_u32_m: ++** mov (z[0-9]+)\.b, #-1 ++** umax z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (max_m1_u32_m, svuint32_t, ++ z0 = svmax_n_u32_m (p0, z0, -1), ++ z0 = svmax_m (p0, z0, -1)) ++ ++/* ++** max_u32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** umax z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (max_u32_z_tied1, svuint32_t, ++ z0 = svmax_u32_z (p0, z0, z1), ++ z0 = svmax_z (p0, z0, z1)) ++ ++/* ++** max_u32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** umax z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (max_u32_z_tied2, svuint32_t, ++ z0 = svmax_u32_z (p0, z1, z0), ++ z0 = svmax_z (p0, z1, z0)) ++ ++/* ++** max_u32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** umax z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** umax z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (max_u32_z_untied, svuint32_t, ++ z0 = svmax_u32_z (p0, z1, z2), ++ z0 = svmax_z (p0, z1, z2)) ++ ++/* ++** max_w0_u32_z_tied1: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0\.s, p0/z, z0\.s ++** umax z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (max_w0_u32_z_tied1, svuint32_t, uint32_t, ++ z0 = svmax_n_u32_z (p0, z0, x0), ++ z0 = svmax_z (p0, z0, x0)) ++ ++/* ++** max_w0_u32_z_untied: ++** mov (z[0-9]+\.s), w0 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** umax z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** umax z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (max_w0_u32_z_untied, svuint32_t, uint32_t, ++ z0 = svmax_n_u32_z (p0, z1, x0), ++ z0 = svmax_z (p0, z1, x0)) ++ ++/* ++** max_1_u32_z_tied1: ++** mov (z[0-9]+\.s), #1 ++** movprfx z0\.s, p0/z, z0\.s ++** umax z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_u32_z_tied1, svuint32_t, ++ z0 = svmax_n_u32_z (p0, z0, 1), ++ z0 = svmax_z (p0, z0, 1)) ++ ++/* ++** max_1_u32_z_untied: ++** mov (z[0-9]+\.s), #1 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** umax z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** umax z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_u32_z_untied, svuint32_t, ++ z0 = svmax_n_u32_z (p0, z1, 1), ++ z0 = svmax_z (p0, z1, 1)) ++ ++/* ++** max_u32_x_tied1: ++** umax z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (max_u32_x_tied1, svuint32_t, ++ z0 = svmax_u32_x (p0, z0, z1), ++ z0 = svmax_x (p0, z0, z1)) ++ ++/* ++** max_u32_x_tied2: ++** umax z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (max_u32_x_tied2, svuint32_t, ++ z0 = svmax_u32_x (p0, z1, z0), ++ z0 = svmax_x (p0, z1, z0)) ++ ++/* ++** max_u32_x_untied: ++** ( ++** movprfx z0, z1 ++** umax z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0, z2 ++** umax z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (max_u32_x_untied, svuint32_t, ++ z0 = svmax_u32_x (p0, z1, z2), ++ z0 = svmax_x (p0, z1, z2)) ++ ++/* ++** max_w0_u32_x_tied1: ++** mov (z[0-9]+\.s), w0 ++** umax z0\.s, p0/m, z0\.s, \1 ++** ret 
++*/ ++TEST_UNIFORM_ZX (max_w0_u32_x_tied1, svuint32_t, uint32_t, ++ z0 = svmax_n_u32_x (p0, z0, x0), ++ z0 = svmax_x (p0, z0, x0)) ++ ++/* ++** max_w0_u32_x_untied: ++** mov z0\.s, w0 ++** umax z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZX (max_w0_u32_x_untied, svuint32_t, uint32_t, ++ z0 = svmax_n_u32_x (p0, z1, x0), ++ z0 = svmax_x (p0, z1, x0)) ++ ++/* ++** max_1_u32_x_tied1: ++** umax z0\.s, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_u32_x_tied1, svuint32_t, ++ z0 = svmax_n_u32_x (p0, z0, 1), ++ z0 = svmax_x (p0, z0, 1)) ++ ++/* ++** max_1_u32_x_untied: ++** movprfx z0, z1 ++** umax z0\.s, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_u32_x_untied, svuint32_t, ++ z0 = svmax_n_u32_x (p0, z1, 1), ++ z0 = svmax_x (p0, z1, 1)) ++ ++/* ++** max_127_u32_x: ++** umax z0\.s, z0\.s, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (max_127_u32_x, svuint32_t, ++ z0 = svmax_n_u32_x (p0, z0, 127), ++ z0 = svmax_x (p0, z0, 127)) ++ ++/* ++** max_128_u32_x: ++** umax z0\.s, z0\.s, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (max_128_u32_x, svuint32_t, ++ z0 = svmax_n_u32_x (p0, z0, 128), ++ z0 = svmax_x (p0, z0, 128)) ++ ++/* ++** max_255_u32_x: ++** umax z0\.s, z0\.s, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (max_255_u32_x, svuint32_t, ++ z0 = svmax_n_u32_x (p0, z0, 255), ++ z0 = svmax_x (p0, z0, 255)) ++ ++/* ++** max_256_u32_x: ++** mov (z[0-9]+\.s), #256 ++** umax z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_256_u32_x, svuint32_t, ++ z0 = svmax_n_u32_x (p0, z0, 256), ++ z0 = svmax_x (p0, z0, 256)) ++ ++/* ++** max_m2_u32_x: ++** mov (z[0-9]+\.s), #-2 ++** umax z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_m2_u32_x, svuint32_t, ++ z0 = svmax_n_u32_x (p0, z0, -2), ++ z0 = svmax_x (p0, z0, -2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_u64.c +new file mode 100644 +index 000000000..5be4c9fb7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_u64.c +@@ -0,0 +1,293 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** max_u64_m_tied1: ++** umax z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (max_u64_m_tied1, svuint64_t, ++ z0 = svmax_u64_m (p0, z0, z1), ++ z0 = svmax_m (p0, z0, z1)) ++ ++/* ++** max_u64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** umax z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_u64_m_tied2, svuint64_t, ++ z0 = svmax_u64_m (p0, z1, z0), ++ z0 = svmax_m (p0, z1, z0)) ++ ++/* ++** max_u64_m_untied: ++** movprfx z0, z1 ++** umax z0\.d, p0/m, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (max_u64_m_untied, svuint64_t, ++ z0 = svmax_u64_m (p0, z1, z2), ++ z0 = svmax_m (p0, z1, z2)) ++ ++/* ++** max_x0_u64_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** umax z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (max_x0_u64_m_tied1, svuint64_t, uint64_t, ++ z0 = svmax_n_u64_m (p0, z0, x0), ++ z0 = svmax_m (p0, z0, x0)) ++ ++/* ++** max_x0_u64_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** umax z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (max_x0_u64_m_untied, svuint64_t, uint64_t, ++ z0 = svmax_n_u64_m (p0, z1, x0), ++ z0 = svmax_m (p0, z1, x0)) ++ ++/* ++** max_1_u64_m_tied1: ++** mov (z[0-9]+\.d), #1 ++** umax z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_u64_m_tied1, svuint64_t, ++ z0 = svmax_n_u64_m (p0, z0, 1), ++ z0 = svmax_m (p0, z0, 1)) ++ ++/* ++** max_1_u64_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #1 ++** movprfx z0, z1 
++** umax z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_u64_m_untied, svuint64_t, ++ z0 = svmax_n_u64_m (p0, z1, 1), ++ z0 = svmax_m (p0, z1, 1)) ++ ++/* ++** max_m1_u64_m: ++** mov (z[0-9]+)\.b, #-1 ++** umax z0\.d, p0/m, z0\.d, \1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (max_m1_u64_m, svuint64_t, ++ z0 = svmax_n_u64_m (p0, z0, -1), ++ z0 = svmax_m (p0, z0, -1)) ++ ++/* ++** max_u64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** umax z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (max_u64_z_tied1, svuint64_t, ++ z0 = svmax_u64_z (p0, z0, z1), ++ z0 = svmax_z (p0, z0, z1)) ++ ++/* ++** max_u64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** umax z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (max_u64_z_tied2, svuint64_t, ++ z0 = svmax_u64_z (p0, z1, z0), ++ z0 = svmax_z (p0, z1, z0)) ++ ++/* ++** max_u64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** umax z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** umax z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (max_u64_z_untied, svuint64_t, ++ z0 = svmax_u64_z (p0, z1, z2), ++ z0 = svmax_z (p0, z1, z2)) ++ ++/* ++** max_x0_u64_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z0\.d ++** umax z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (max_x0_u64_z_tied1, svuint64_t, uint64_t, ++ z0 = svmax_n_u64_z (p0, z0, x0), ++ z0 = svmax_z (p0, z0, x0)) ++ ++/* ++** max_x0_u64_z_untied: ++** mov (z[0-9]+\.d), x0 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** umax z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** umax z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (max_x0_u64_z_untied, svuint64_t, uint64_t, ++ z0 = svmax_n_u64_z (p0, z1, x0), ++ z0 = svmax_z (p0, z1, x0)) ++ ++/* ++** max_1_u64_z_tied1: ++** mov (z[0-9]+\.d), #1 ++** movprfx z0\.d, p0/z, z0\.d ++** umax z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_u64_z_tied1, svuint64_t, ++ z0 = svmax_n_u64_z (p0, z0, 1), ++ z0 = svmax_z (p0, z0, 1)) ++ ++/* ++** max_1_u64_z_untied: ++** mov (z[0-9]+\.d), #1 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** umax z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** umax z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_u64_z_untied, svuint64_t, ++ z0 = svmax_n_u64_z (p0, z1, 1), ++ z0 = svmax_z (p0, z1, 1)) ++ ++/* ++** max_u64_x_tied1: ++** umax z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (max_u64_x_tied1, svuint64_t, ++ z0 = svmax_u64_x (p0, z0, z1), ++ z0 = svmax_x (p0, z0, z1)) ++ ++/* ++** max_u64_x_tied2: ++** umax z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (max_u64_x_tied2, svuint64_t, ++ z0 = svmax_u64_x (p0, z1, z0), ++ z0 = svmax_x (p0, z1, z0)) ++ ++/* ++** max_u64_x_untied: ++** ( ++** movprfx z0, z1 ++** umax z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0, z2 ++** umax z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (max_u64_x_untied, svuint64_t, ++ z0 = svmax_u64_x (p0, z1, z2), ++ z0 = svmax_x (p0, z1, z2)) ++ ++/* ++** max_x0_u64_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** umax z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (max_x0_u64_x_tied1, svuint64_t, uint64_t, ++ z0 = svmax_n_u64_x (p0, z0, x0), ++ z0 = svmax_x (p0, z0, x0)) ++ ++/* ++** max_x0_u64_x_untied: ++** mov z0\.d, x0 ++** umax z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (max_x0_u64_x_untied, svuint64_t, uint64_t, ++ z0 = svmax_n_u64_x (p0, z1, x0), ++ z0 = svmax_x (p0, z1, x0)) ++ ++/* ++** max_1_u64_x_tied1: ++** umax z0\.d, z0\.d, #1 ++** ret ++*/ 
++TEST_UNIFORM_Z (max_1_u64_x_tied1, svuint64_t, ++ z0 = svmax_n_u64_x (p0, z0, 1), ++ z0 = svmax_x (p0, z0, 1)) ++ ++/* ++** max_1_u64_x_untied: ++** movprfx z0, z1 ++** umax z0\.d, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_u64_x_untied, svuint64_t, ++ z0 = svmax_n_u64_x (p0, z1, 1), ++ z0 = svmax_x (p0, z1, 1)) ++ ++/* ++** max_127_u64_x: ++** umax z0\.d, z0\.d, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (max_127_u64_x, svuint64_t, ++ z0 = svmax_n_u64_x (p0, z0, 127), ++ z0 = svmax_x (p0, z0, 127)) ++ ++/* ++** max_128_u64_x: ++** umax z0\.d, z0\.d, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (max_128_u64_x, svuint64_t, ++ z0 = svmax_n_u64_x (p0, z0, 128), ++ z0 = svmax_x (p0, z0, 128)) ++ ++/* ++** max_255_u64_x: ++** umax z0\.d, z0\.d, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (max_255_u64_x, svuint64_t, ++ z0 = svmax_n_u64_x (p0, z0, 255), ++ z0 = svmax_x (p0, z0, 255)) ++ ++/* ++** max_256_u64_x: ++** mov (z[0-9]+\.d), #256 ++** umax z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_256_u64_x, svuint64_t, ++ z0 = svmax_n_u64_x (p0, z0, 256), ++ z0 = svmax_x (p0, z0, 256)) ++ ++/* ++** max_m2_u64_x: ++** mov (z[0-9]+\.d), #-2 ++** umax z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_m2_u64_x, svuint64_t, ++ z0 = svmax_n_u64_x (p0, z0, -2), ++ z0 = svmax_x (p0, z0, -2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_u8.c +new file mode 100644 +index 000000000..04c9ddb36 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_u8.c +@@ -0,0 +1,273 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** max_u8_m_tied1: ++** umax z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (max_u8_m_tied1, svuint8_t, ++ z0 = svmax_u8_m (p0, z0, z1), ++ z0 = svmax_m (p0, z0, z1)) ++ ++/* ++** max_u8_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** umax z0\.b, p0/m, z0\.b, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (max_u8_m_tied2, svuint8_t, ++ z0 = svmax_u8_m (p0, z1, z0), ++ z0 = svmax_m (p0, z1, z0)) ++ ++/* ++** max_u8_m_untied: ++** movprfx z0, z1 ++** umax z0\.b, p0/m, z0\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (max_u8_m_untied, svuint8_t, ++ z0 = svmax_u8_m (p0, z1, z2), ++ z0 = svmax_m (p0, z1, z2)) ++ ++/* ++** max_w0_u8_m_tied1: ++** mov (z[0-9]+\.b), w0 ++** umax z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (max_w0_u8_m_tied1, svuint8_t, uint8_t, ++ z0 = svmax_n_u8_m (p0, z0, x0), ++ z0 = svmax_m (p0, z0, x0)) ++ ++/* ++** max_w0_u8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), w0 ++** movprfx z0, z1 ++** umax z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (max_w0_u8_m_untied, svuint8_t, uint8_t, ++ z0 = svmax_n_u8_m (p0, z1, x0), ++ z0 = svmax_m (p0, z1, x0)) ++ ++/* ++** max_1_u8_m_tied1: ++** mov (z[0-9]+\.b), #1 ++** umax z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_u8_m_tied1, svuint8_t, ++ z0 = svmax_n_u8_m (p0, z0, 1), ++ z0 = svmax_m (p0, z0, 1)) ++ ++/* ++** max_1_u8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), #1 ++** movprfx z0, z1 ++** umax z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_u8_m_untied, svuint8_t, ++ z0 = svmax_n_u8_m (p0, z1, 1), ++ z0 = svmax_m (p0, z1, 1)) ++ ++/* ++** max_m1_u8_m: ++** mov (z[0-9]+\.b), #-1 ++** umax z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_m1_u8_m, svuint8_t, ++ z0 = svmax_n_u8_m (p0, z0, -1), ++ z0 = svmax_m (p0, z0, -1)) ++ ++/* ++** max_u8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** umax z0\.b, 
p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (max_u8_z_tied1, svuint8_t, ++ z0 = svmax_u8_z (p0, z0, z1), ++ z0 = svmax_z (p0, z0, z1)) ++ ++/* ++** max_u8_z_tied2: ++** movprfx z0\.b, p0/z, z0\.b ++** umax z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (max_u8_z_tied2, svuint8_t, ++ z0 = svmax_u8_z (p0, z1, z0), ++ z0 = svmax_z (p0, z1, z0)) ++ ++/* ++** max_u8_z_untied: ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** umax z0\.b, p0/m, z0\.b, z2\.b ++** | ++** movprfx z0\.b, p0/z, z2\.b ++** umax z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (max_u8_z_untied, svuint8_t, ++ z0 = svmax_u8_z (p0, z1, z2), ++ z0 = svmax_z (p0, z1, z2)) ++ ++/* ++** max_w0_u8_z_tied1: ++** mov (z[0-9]+\.b), w0 ++** movprfx z0\.b, p0/z, z0\.b ++** umax z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (max_w0_u8_z_tied1, svuint8_t, uint8_t, ++ z0 = svmax_n_u8_z (p0, z0, x0), ++ z0 = svmax_z (p0, z0, x0)) ++ ++/* ++** max_w0_u8_z_untied: ++** mov (z[0-9]+\.b), w0 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** umax z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** umax z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (max_w0_u8_z_untied, svuint8_t, uint8_t, ++ z0 = svmax_n_u8_z (p0, z1, x0), ++ z0 = svmax_z (p0, z1, x0)) ++ ++/* ++** max_1_u8_z_tied1: ++** mov (z[0-9]+\.b), #1 ++** movprfx z0\.b, p0/z, z0\.b ++** umax z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_u8_z_tied1, svuint8_t, ++ z0 = svmax_n_u8_z (p0, z0, 1), ++ z0 = svmax_z (p0, z0, 1)) ++ ++/* ++** max_1_u8_z_untied: ++** mov (z[0-9]+\.b), #1 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** umax z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** umax z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_u8_z_untied, svuint8_t, ++ z0 = svmax_n_u8_z (p0, z1, 1), ++ z0 = svmax_z (p0, z1, 1)) ++ ++/* ++** max_u8_x_tied1: ++** umax z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (max_u8_x_tied1, svuint8_t, ++ z0 = svmax_u8_x (p0, z0, z1), ++ z0 = svmax_x (p0, z0, z1)) ++ ++/* ++** max_u8_x_tied2: ++** umax z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (max_u8_x_tied2, svuint8_t, ++ z0 = svmax_u8_x (p0, z1, z0), ++ z0 = svmax_x (p0, z1, z0)) ++ ++/* ++** max_u8_x_untied: ++** ( ++** movprfx z0, z1 ++** umax z0\.b, p0/m, z0\.b, z2\.b ++** | ++** movprfx z0, z2 ++** umax z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (max_u8_x_untied, svuint8_t, ++ z0 = svmax_u8_x (p0, z1, z2), ++ z0 = svmax_x (p0, z1, z2)) ++ ++/* ++** max_w0_u8_x_tied1: ++** mov (z[0-9]+\.b), w0 ++** umax z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (max_w0_u8_x_tied1, svuint8_t, uint8_t, ++ z0 = svmax_n_u8_x (p0, z0, x0), ++ z0 = svmax_x (p0, z0, x0)) ++ ++/* ++** max_w0_u8_x_untied: ++** mov z0\.b, w0 ++** umax z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_ZX (max_w0_u8_x_untied, svuint8_t, uint8_t, ++ z0 = svmax_n_u8_x (p0, z1, x0), ++ z0 = svmax_x (p0, z1, x0)) ++ ++/* ++** max_1_u8_x_tied1: ++** umax z0\.b, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_u8_x_tied1, svuint8_t, ++ z0 = svmax_n_u8_x (p0, z0, 1), ++ z0 = svmax_x (p0, z0, 1)) ++ ++/* ++** max_1_u8_x_untied: ++** movprfx z0, z1 ++** umax z0\.b, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (max_1_u8_x_untied, svuint8_t, ++ z0 = svmax_n_u8_x (p0, z1, 1), ++ z0 = svmax_x (p0, z1, 1)) ++ ++/* ++** max_127_u8_x: ++** umax z0\.b, z0\.b, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (max_127_u8_x, svuint8_t, ++ z0 = svmax_n_u8_x (p0, z0, 127), ++ z0 = svmax_x (p0, z0, 127)) ++ ++/* ++** 
max_128_u8_x: ++** umax z0\.b, z0\.b, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (max_128_u8_x, svuint8_t, ++ z0 = svmax_n_u8_x (p0, z0, 128), ++ z0 = svmax_x (p0, z0, 128)) ++ ++/* ++** max_254_u8_x: ++** umax z0\.b, z0\.b, #254 ++** ret ++*/ ++TEST_UNIFORM_Z (max_254_u8_x, svuint8_t, ++ z0 = svmax_n_u8_x (p0, z0, 254), ++ z0 = svmax_x (p0, z0, 254)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxnm_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxnm_f16.c +new file mode 100644 +index 000000000..a9da710d0 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxnm_f16.c +@@ -0,0 +1,425 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** maxnm_f16_m_tied1: ++** fmaxnm z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_f16_m_tied1, svfloat16_t, ++ z0 = svmaxnm_f16_m (p0, z0, z1), ++ z0 = svmaxnm_m (p0, z0, z1)) ++ ++/* ++** maxnm_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fmaxnm z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_f16_m_tied2, svfloat16_t, ++ z0 = svmaxnm_f16_m (p0, z1, z0), ++ z0 = svmaxnm_m (p0, z1, z0)) ++ ++/* ++** maxnm_f16_m_untied: ++** movprfx z0, z1 ++** fmaxnm z0\.h, p0/m, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_f16_m_untied, svfloat16_t, ++ z0 = svmaxnm_f16_m (p0, z1, z2), ++ z0 = svmaxnm_m (p0, z1, z2)) ++ ++/* ++** maxnm_h4_f16_m_tied1: ++** mov (z[0-9]+\.h), h4 ++** fmaxnm z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (maxnm_h4_f16_m_tied1, svfloat16_t, __fp16, ++ z0 = svmaxnm_n_f16_m (p0, z0, d4), ++ z0 = svmaxnm_m (p0, z0, d4)) ++ ++/* ++** maxnm_h4_f16_m_untied: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0, z1 ++** fmaxnm z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (maxnm_h4_f16_m_untied, svfloat16_t, __fp16, ++ z0 = svmaxnm_n_f16_m (p0, z1, d4), ++ z0 = svmaxnm_m (p0, z1, d4)) ++ ++/* ++** maxnm_0_f16_m_tied1: ++** fmaxnm z0\.h, p0/m, z0\.h, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_0_f16_m_tied1, svfloat16_t, ++ z0 = svmaxnm_n_f16_m (p0, z0, 0), ++ z0 = svmaxnm_m (p0, z0, 0)) ++ ++/* ++** maxnm_0_f16_m_untied: ++** movprfx z0, z1 ++** fmaxnm z0\.h, p0/m, z0\.h, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_0_f16_m_untied, svfloat16_t, ++ z0 = svmaxnm_n_f16_m (p0, z1, 0), ++ z0 = svmaxnm_m (p0, z1, 0)) ++ ++/* ++** maxnm_1_f16_m_tied1: ++** fmaxnm z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_1_f16_m_tied1, svfloat16_t, ++ z0 = svmaxnm_n_f16_m (p0, z0, 1), ++ z0 = svmaxnm_m (p0, z0, 1)) ++ ++/* ++** maxnm_1_f16_m_untied: ++** movprfx z0, z1 ++** fmaxnm z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_1_f16_m_untied, svfloat16_t, ++ z0 = svmaxnm_n_f16_m (p0, z1, 1), ++ z0 = svmaxnm_m (p0, z1, 1)) ++ ++/* ++** maxnm_2_f16_m: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? 
++** fmaxnm z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_2_f16_m, svfloat16_t, ++ z0 = svmaxnm_n_f16_m (p0, z0, 2), ++ z0 = svmaxnm_m (p0, z0, 2)) ++ ++/* ++** maxnm_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fmaxnm z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_f16_z_tied1, svfloat16_t, ++ z0 = svmaxnm_f16_z (p0, z0, z1), ++ z0 = svmaxnm_z (p0, z0, z1)) ++ ++/* ++** maxnm_f16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** fmaxnm z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_f16_z_tied2, svfloat16_t, ++ z0 = svmaxnm_f16_z (p0, z1, z0), ++ z0 = svmaxnm_z (p0, z1, z0)) ++ ++/* ++** maxnm_f16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fmaxnm z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** fmaxnm z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_f16_z_untied, svfloat16_t, ++ z0 = svmaxnm_f16_z (p0, z1, z2), ++ z0 = svmaxnm_z (p0, z1, z2)) ++ ++/* ++** maxnm_h4_f16_z_tied1: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0\.h, p0/z, z0\.h ++** fmaxnm z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (maxnm_h4_f16_z_tied1, svfloat16_t, __fp16, ++ z0 = svmaxnm_n_f16_z (p0, z0, d4), ++ z0 = svmaxnm_z (p0, z0, d4)) ++ ++/* ++** maxnm_h4_f16_z_untied: ++** mov (z[0-9]+\.h), h4 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fmaxnm z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** fmaxnm z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (maxnm_h4_f16_z_untied, svfloat16_t, __fp16, ++ z0 = svmaxnm_n_f16_z (p0, z1, d4), ++ z0 = svmaxnm_z (p0, z1, d4)) ++ ++/* ++** maxnm_0_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fmaxnm z0\.h, p0/m, z0\.h, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_0_f16_z_tied1, svfloat16_t, ++ z0 = svmaxnm_n_f16_z (p0, z0, 0), ++ z0 = svmaxnm_z (p0, z0, 0)) ++ ++/* ++** maxnm_0_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** fmaxnm z0\.h, p0/m, z0\.h, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_0_f16_z_untied, svfloat16_t, ++ z0 = svmaxnm_n_f16_z (p0, z1, 0), ++ z0 = svmaxnm_z (p0, z1, 0)) ++ ++/* ++** maxnm_1_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fmaxnm z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_1_f16_z_tied1, svfloat16_t, ++ z0 = svmaxnm_n_f16_z (p0, z0, 1), ++ z0 = svmaxnm_z (p0, z0, 1)) ++ ++/* ++** maxnm_1_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** fmaxnm z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_1_f16_z_untied, svfloat16_t, ++ z0 = svmaxnm_n_f16_z (p0, z1, 1), ++ z0 = svmaxnm_z (p0, z1, 1)) ++ ++/* ++** maxnm_2_f16_z: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? 
++** movprfx z0\.h, p0/z, z0\.h ++** fmaxnm z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_2_f16_z, svfloat16_t, ++ z0 = svmaxnm_n_f16_z (p0, z0, 2), ++ z0 = svmaxnm_z (p0, z0, 2)) ++ ++/* ++** maxnm_f16_x_tied1: ++** fmaxnm z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_f16_x_tied1, svfloat16_t, ++ z0 = svmaxnm_f16_x (p0, z0, z1), ++ z0 = svmaxnm_x (p0, z0, z1)) ++ ++/* ++** maxnm_f16_x_tied2: ++** fmaxnm z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_f16_x_tied2, svfloat16_t, ++ z0 = svmaxnm_f16_x (p0, z1, z0), ++ z0 = svmaxnm_x (p0, z1, z0)) ++ ++/* ++** maxnm_f16_x_untied: ++** ( ++** movprfx z0, z1 ++** fmaxnm z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0, z2 ++** fmaxnm z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_f16_x_untied, svfloat16_t, ++ z0 = svmaxnm_f16_x (p0, z1, z2), ++ z0 = svmaxnm_x (p0, z1, z2)) ++ ++/* ++** maxnm_h4_f16_x_tied1: ++** mov (z[0-9]+\.h), h4 ++** fmaxnm z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (maxnm_h4_f16_x_tied1, svfloat16_t, __fp16, ++ z0 = svmaxnm_n_f16_x (p0, z0, d4), ++ z0 = svmaxnm_x (p0, z0, d4)) ++ ++/* ++** maxnm_h4_f16_x_untied: ++** mov z0\.h, h4 ++** fmaxnm z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZD (maxnm_h4_f16_x_untied, svfloat16_t, __fp16, ++ z0 = svmaxnm_n_f16_x (p0, z1, d4), ++ z0 = svmaxnm_x (p0, z1, d4)) ++ ++/* ++** maxnm_0_f16_x_tied1: ++** fmaxnm z0\.h, p0/m, z0\.h, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_0_f16_x_tied1, svfloat16_t, ++ z0 = svmaxnm_n_f16_x (p0, z0, 0), ++ z0 = svmaxnm_x (p0, z0, 0)) ++ ++/* ++** maxnm_0_f16_x_untied: ++** movprfx z0, z1 ++** fmaxnm z0\.h, p0/m, z0\.h, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_0_f16_x_untied, svfloat16_t, ++ z0 = svmaxnm_n_f16_x (p0, z1, 0), ++ z0 = svmaxnm_x (p0, z1, 0)) ++ ++/* ++** maxnm_1_f16_x_tied1: ++** fmaxnm z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_1_f16_x_tied1, svfloat16_t, ++ z0 = svmaxnm_n_f16_x (p0, z0, 1), ++ z0 = svmaxnm_x (p0, z0, 1)) ++ ++/* ++** maxnm_1_f16_x_untied: ++** movprfx z0, z1 ++** fmaxnm z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_1_f16_x_untied, svfloat16_t, ++ z0 = svmaxnm_n_f16_x (p0, z1, 1), ++ z0 = svmaxnm_x (p0, z1, 1)) ++ ++/* ++** maxnm_2_f16_x_tied1: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** fmaxnm z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_2_f16_x_tied1, svfloat16_t, ++ z0 = svmaxnm_n_f16_x (p0, z0, 2), ++ z0 = svmaxnm_x (p0, z0, 2)) ++ ++/* ++** maxnm_2_f16_x_untied: ++** fmov z0\.h, #2\.0(?:e\+0)? ++** fmaxnm z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_2_f16_x_untied, svfloat16_t, ++ z0 = svmaxnm_n_f16_x (p0, z1, 2), ++ z0 = svmaxnm_x (p0, z1, 2)) ++ ++/* ++** ptrue_maxnm_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_maxnm_f16_x_tied1, svfloat16_t, ++ z0 = svmaxnm_f16_x (svptrue_b16 (), z0, z1), ++ z0 = svmaxnm_x (svptrue_b16 (), z0, z1)) ++ ++/* ++** ptrue_maxnm_f16_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_maxnm_f16_x_tied2, svfloat16_t, ++ z0 = svmaxnm_f16_x (svptrue_b16 (), z1, z0), ++ z0 = svmaxnm_x (svptrue_b16 (), z1, z0)) ++ ++/* ++** ptrue_maxnm_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_maxnm_f16_x_untied, svfloat16_t, ++ z0 = svmaxnm_f16_x (svptrue_b16 (), z1, z2), ++ z0 = svmaxnm_x (svptrue_b16 (), z1, z2)) ++ ++/* ++** ptrue_maxnm_0_f16_x_tied1: ++** ... 
++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_maxnm_0_f16_x_tied1, svfloat16_t, ++ z0 = svmaxnm_n_f16_x (svptrue_b16 (), z0, 0), ++ z0 = svmaxnm_x (svptrue_b16 (), z0, 0)) ++ ++/* ++** ptrue_maxnm_0_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_maxnm_0_f16_x_untied, svfloat16_t, ++ z0 = svmaxnm_n_f16_x (svptrue_b16 (), z1, 0), ++ z0 = svmaxnm_x (svptrue_b16 (), z1, 0)) ++ ++/* ++** ptrue_maxnm_1_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_maxnm_1_f16_x_tied1, svfloat16_t, ++ z0 = svmaxnm_n_f16_x (svptrue_b16 (), z0, 1), ++ z0 = svmaxnm_x (svptrue_b16 (), z0, 1)) ++ ++/* ++** ptrue_maxnm_1_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_maxnm_1_f16_x_untied, svfloat16_t, ++ z0 = svmaxnm_n_f16_x (svptrue_b16 (), z1, 1), ++ z0 = svmaxnm_x (svptrue_b16 (), z1, 1)) ++ ++/* ++** ptrue_maxnm_2_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_maxnm_2_f16_x_tied1, svfloat16_t, ++ z0 = svmaxnm_n_f16_x (svptrue_b16 (), z0, 2), ++ z0 = svmaxnm_x (svptrue_b16 (), z0, 2)) ++ ++/* ++** ptrue_maxnm_2_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_maxnm_2_f16_x_untied, svfloat16_t, ++ z0 = svmaxnm_n_f16_x (svptrue_b16 (), z1, 2), ++ z0 = svmaxnm_x (svptrue_b16 (), z1, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxnm_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxnm_f32.c +new file mode 100644 +index 000000000..4657d57c0 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxnm_f32.c +@@ -0,0 +1,425 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** maxnm_f32_m_tied1: ++** fmaxnm z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_f32_m_tied1, svfloat32_t, ++ z0 = svmaxnm_f32_m (p0, z0, z1), ++ z0 = svmaxnm_m (p0, z0, z1)) ++ ++/* ++** maxnm_f32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fmaxnm z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_f32_m_tied2, svfloat32_t, ++ z0 = svmaxnm_f32_m (p0, z1, z0), ++ z0 = svmaxnm_m (p0, z1, z0)) ++ ++/* ++** maxnm_f32_m_untied: ++** movprfx z0, z1 ++** fmaxnm z0\.s, p0/m, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_f32_m_untied, svfloat32_t, ++ z0 = svmaxnm_f32_m (p0, z1, z2), ++ z0 = svmaxnm_m (p0, z1, z2)) ++ ++/* ++** maxnm_s4_f32_m_tied1: ++** mov (z[0-9]+\.s), s4 ++** fmaxnm z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (maxnm_s4_f32_m_tied1, svfloat32_t, float, ++ z0 = svmaxnm_n_f32_m (p0, z0, d4), ++ z0 = svmaxnm_m (p0, z0, d4)) ++ ++/* ++** maxnm_s4_f32_m_untied: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0, z1 ++** fmaxnm z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (maxnm_s4_f32_m_untied, svfloat32_t, float, ++ z0 = svmaxnm_n_f32_m (p0, z1, d4), ++ z0 = svmaxnm_m (p0, z1, d4)) ++ ++/* ++** maxnm_0_f32_m_tied1: ++** fmaxnm z0\.s, p0/m, z0\.s, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_0_f32_m_tied1, svfloat32_t, ++ z0 = svmaxnm_n_f32_m (p0, z0, 0), ++ z0 = svmaxnm_m (p0, z0, 0)) ++ ++/* ++** maxnm_0_f32_m_untied: ++** movprfx z0, z1 ++** fmaxnm z0\.s, p0/m, z0\.s, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_0_f32_m_untied, svfloat32_t, ++ z0 = svmaxnm_n_f32_m (p0, z1, 0), ++ z0 = svmaxnm_m (p0, z1, 0)) ++ ++/* ++** maxnm_1_f32_m_tied1: ++** fmaxnm z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ 
++TEST_UNIFORM_Z (maxnm_1_f32_m_tied1, svfloat32_t, ++ z0 = svmaxnm_n_f32_m (p0, z0, 1), ++ z0 = svmaxnm_m (p0, z0, 1)) ++ ++/* ++** maxnm_1_f32_m_untied: ++** movprfx z0, z1 ++** fmaxnm z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_1_f32_m_untied, svfloat32_t, ++ z0 = svmaxnm_n_f32_m (p0, z1, 1), ++ z0 = svmaxnm_m (p0, z1, 1)) ++ ++/* ++** maxnm_2_f32_m: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** fmaxnm z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_2_f32_m, svfloat32_t, ++ z0 = svmaxnm_n_f32_m (p0, z0, 2), ++ z0 = svmaxnm_m (p0, z0, 2)) ++ ++/* ++** maxnm_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fmaxnm z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_f32_z_tied1, svfloat32_t, ++ z0 = svmaxnm_f32_z (p0, z0, z1), ++ z0 = svmaxnm_z (p0, z0, z1)) ++ ++/* ++** maxnm_f32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** fmaxnm z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_f32_z_tied2, svfloat32_t, ++ z0 = svmaxnm_f32_z (p0, z1, z0), ++ z0 = svmaxnm_z (p0, z1, z0)) ++ ++/* ++** maxnm_f32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fmaxnm z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** fmaxnm z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_f32_z_untied, svfloat32_t, ++ z0 = svmaxnm_f32_z (p0, z1, z2), ++ z0 = svmaxnm_z (p0, z1, z2)) ++ ++/* ++** maxnm_s4_f32_z_tied1: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0\.s, p0/z, z0\.s ++** fmaxnm z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (maxnm_s4_f32_z_tied1, svfloat32_t, float, ++ z0 = svmaxnm_n_f32_z (p0, z0, d4), ++ z0 = svmaxnm_z (p0, z0, d4)) ++ ++/* ++** maxnm_s4_f32_z_untied: ++** mov (z[0-9]+\.s), s4 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fmaxnm z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** fmaxnm z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (maxnm_s4_f32_z_untied, svfloat32_t, float, ++ z0 = svmaxnm_n_f32_z (p0, z1, d4), ++ z0 = svmaxnm_z (p0, z1, d4)) ++ ++/* ++** maxnm_0_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fmaxnm z0\.s, p0/m, z0\.s, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_0_f32_z_tied1, svfloat32_t, ++ z0 = svmaxnm_n_f32_z (p0, z0, 0), ++ z0 = svmaxnm_z (p0, z0, 0)) ++ ++/* ++** maxnm_0_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** fmaxnm z0\.s, p0/m, z0\.s, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_0_f32_z_untied, svfloat32_t, ++ z0 = svmaxnm_n_f32_z (p0, z1, 0), ++ z0 = svmaxnm_z (p0, z1, 0)) ++ ++/* ++** maxnm_1_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fmaxnm z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_1_f32_z_tied1, svfloat32_t, ++ z0 = svmaxnm_n_f32_z (p0, z0, 1), ++ z0 = svmaxnm_z (p0, z0, 1)) ++ ++/* ++** maxnm_1_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** fmaxnm z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_1_f32_z_untied, svfloat32_t, ++ z0 = svmaxnm_n_f32_z (p0, z1, 1), ++ z0 = svmaxnm_z (p0, z1, 1)) ++ ++/* ++** maxnm_2_f32_z: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
++** movprfx z0\.s, p0/z, z0\.s ++** fmaxnm z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_2_f32_z, svfloat32_t, ++ z0 = svmaxnm_n_f32_z (p0, z0, 2), ++ z0 = svmaxnm_z (p0, z0, 2)) ++ ++/* ++** maxnm_f32_x_tied1: ++** fmaxnm z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_f32_x_tied1, svfloat32_t, ++ z0 = svmaxnm_f32_x (p0, z0, z1), ++ z0 = svmaxnm_x (p0, z0, z1)) ++ ++/* ++** maxnm_f32_x_tied2: ++** fmaxnm z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_f32_x_tied2, svfloat32_t, ++ z0 = svmaxnm_f32_x (p0, z1, z0), ++ z0 = svmaxnm_x (p0, z1, z0)) ++ ++/* ++** maxnm_f32_x_untied: ++** ( ++** movprfx z0, z1 ++** fmaxnm z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0, z2 ++** fmaxnm z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_f32_x_untied, svfloat32_t, ++ z0 = svmaxnm_f32_x (p0, z1, z2), ++ z0 = svmaxnm_x (p0, z1, z2)) ++ ++/* ++** maxnm_s4_f32_x_tied1: ++** mov (z[0-9]+\.s), s4 ++** fmaxnm z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (maxnm_s4_f32_x_tied1, svfloat32_t, float, ++ z0 = svmaxnm_n_f32_x (p0, z0, d4), ++ z0 = svmaxnm_x (p0, z0, d4)) ++ ++/* ++** maxnm_s4_f32_x_untied: ++** mov z0\.s, s4 ++** fmaxnm z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZD (maxnm_s4_f32_x_untied, svfloat32_t, float, ++ z0 = svmaxnm_n_f32_x (p0, z1, d4), ++ z0 = svmaxnm_x (p0, z1, d4)) ++ ++/* ++** maxnm_0_f32_x_tied1: ++** fmaxnm z0\.s, p0/m, z0\.s, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_0_f32_x_tied1, svfloat32_t, ++ z0 = svmaxnm_n_f32_x (p0, z0, 0), ++ z0 = svmaxnm_x (p0, z0, 0)) ++ ++/* ++** maxnm_0_f32_x_untied: ++** movprfx z0, z1 ++** fmaxnm z0\.s, p0/m, z0\.s, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_0_f32_x_untied, svfloat32_t, ++ z0 = svmaxnm_n_f32_x (p0, z1, 0), ++ z0 = svmaxnm_x (p0, z1, 0)) ++ ++/* ++** maxnm_1_f32_x_tied1: ++** fmaxnm z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_1_f32_x_tied1, svfloat32_t, ++ z0 = svmaxnm_n_f32_x (p0, z0, 1), ++ z0 = svmaxnm_x (p0, z0, 1)) ++ ++/* ++** maxnm_1_f32_x_untied: ++** movprfx z0, z1 ++** fmaxnm z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_1_f32_x_untied, svfloat32_t, ++ z0 = svmaxnm_n_f32_x (p0, z1, 1), ++ z0 = svmaxnm_x (p0, z1, 1)) ++ ++/* ++** maxnm_2_f32_x_tied1: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** fmaxnm z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_2_f32_x_tied1, svfloat32_t, ++ z0 = svmaxnm_n_f32_x (p0, z0, 2), ++ z0 = svmaxnm_x (p0, z0, 2)) ++ ++/* ++** maxnm_2_f32_x_untied: ++** fmov z0\.s, #2\.0(?:e\+0)? ++** fmaxnm z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_2_f32_x_untied, svfloat32_t, ++ z0 = svmaxnm_n_f32_x (p0, z1, 2), ++ z0 = svmaxnm_x (p0, z1, 2)) ++ ++/* ++** ptrue_maxnm_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_maxnm_f32_x_tied1, svfloat32_t, ++ z0 = svmaxnm_f32_x (svptrue_b32 (), z0, z1), ++ z0 = svmaxnm_x (svptrue_b32 (), z0, z1)) ++ ++/* ++** ptrue_maxnm_f32_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_maxnm_f32_x_tied2, svfloat32_t, ++ z0 = svmaxnm_f32_x (svptrue_b32 (), z1, z0), ++ z0 = svmaxnm_x (svptrue_b32 (), z1, z0)) ++ ++/* ++** ptrue_maxnm_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_maxnm_f32_x_untied, svfloat32_t, ++ z0 = svmaxnm_f32_x (svptrue_b32 (), z1, z2), ++ z0 = svmaxnm_x (svptrue_b32 (), z1, z2)) ++ ++/* ++** ptrue_maxnm_0_f32_x_tied1: ++** ... 
++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_maxnm_0_f32_x_tied1, svfloat32_t, ++ z0 = svmaxnm_n_f32_x (svptrue_b32 (), z0, 0), ++ z0 = svmaxnm_x (svptrue_b32 (), z0, 0)) ++ ++/* ++** ptrue_maxnm_0_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_maxnm_0_f32_x_untied, svfloat32_t, ++ z0 = svmaxnm_n_f32_x (svptrue_b32 (), z1, 0), ++ z0 = svmaxnm_x (svptrue_b32 (), z1, 0)) ++ ++/* ++** ptrue_maxnm_1_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_maxnm_1_f32_x_tied1, svfloat32_t, ++ z0 = svmaxnm_n_f32_x (svptrue_b32 (), z0, 1), ++ z0 = svmaxnm_x (svptrue_b32 (), z0, 1)) ++ ++/* ++** ptrue_maxnm_1_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_maxnm_1_f32_x_untied, svfloat32_t, ++ z0 = svmaxnm_n_f32_x (svptrue_b32 (), z1, 1), ++ z0 = svmaxnm_x (svptrue_b32 (), z1, 1)) ++ ++/* ++** ptrue_maxnm_2_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_maxnm_2_f32_x_tied1, svfloat32_t, ++ z0 = svmaxnm_n_f32_x (svptrue_b32 (), z0, 2), ++ z0 = svmaxnm_x (svptrue_b32 (), z0, 2)) ++ ++/* ++** ptrue_maxnm_2_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_maxnm_2_f32_x_untied, svfloat32_t, ++ z0 = svmaxnm_n_f32_x (svptrue_b32 (), z1, 2), ++ z0 = svmaxnm_x (svptrue_b32 (), z1, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxnm_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxnm_f64.c +new file mode 100644 +index 000000000..07d88e6c1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxnm_f64.c +@@ -0,0 +1,425 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** maxnm_f64_m_tied1: ++** fmaxnm z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_f64_m_tied1, svfloat64_t, ++ z0 = svmaxnm_f64_m (p0, z0, z1), ++ z0 = svmaxnm_m (p0, z0, z1)) ++ ++/* ++** maxnm_f64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fmaxnm z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_f64_m_tied2, svfloat64_t, ++ z0 = svmaxnm_f64_m (p0, z1, z0), ++ z0 = svmaxnm_m (p0, z1, z0)) ++ ++/* ++** maxnm_f64_m_untied: ++** movprfx z0, z1 ++** fmaxnm z0\.d, p0/m, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_f64_m_untied, svfloat64_t, ++ z0 = svmaxnm_f64_m (p0, z1, z2), ++ z0 = svmaxnm_m (p0, z1, z2)) ++ ++/* ++** maxnm_d4_f64_m_tied1: ++** mov (z[0-9]+\.d), d4 ++** fmaxnm z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (maxnm_d4_f64_m_tied1, svfloat64_t, double, ++ z0 = svmaxnm_n_f64_m (p0, z0, d4), ++ z0 = svmaxnm_m (p0, z0, d4)) ++ ++/* ++** maxnm_d4_f64_m_untied: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0, z1 ++** fmaxnm z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (maxnm_d4_f64_m_untied, svfloat64_t, double, ++ z0 = svmaxnm_n_f64_m (p0, z1, d4), ++ z0 = svmaxnm_m (p0, z1, d4)) ++ ++/* ++** maxnm_0_f64_m_tied1: ++** fmaxnm z0\.d, p0/m, z0\.d, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_0_f64_m_tied1, svfloat64_t, ++ z0 = svmaxnm_n_f64_m (p0, z0, 0), ++ z0 = svmaxnm_m (p0, z0, 0)) ++ ++/* ++** maxnm_0_f64_m_untied: ++** movprfx z0, z1 ++** fmaxnm z0\.d, p0/m, z0\.d, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_0_f64_m_untied, svfloat64_t, ++ z0 = svmaxnm_n_f64_m (p0, z1, 0), ++ z0 = svmaxnm_m (p0, z1, 0)) ++ ++/* ++** maxnm_1_f64_m_tied1: ++** fmaxnm z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ 
++TEST_UNIFORM_Z (maxnm_1_f64_m_tied1, svfloat64_t, ++ z0 = svmaxnm_n_f64_m (p0, z0, 1), ++ z0 = svmaxnm_m (p0, z0, 1)) ++ ++/* ++** maxnm_1_f64_m_untied: ++** movprfx z0, z1 ++** fmaxnm z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_1_f64_m_untied, svfloat64_t, ++ z0 = svmaxnm_n_f64_m (p0, z1, 1), ++ z0 = svmaxnm_m (p0, z1, 1)) ++ ++/* ++** maxnm_2_f64_m: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** fmaxnm z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_2_f64_m, svfloat64_t, ++ z0 = svmaxnm_n_f64_m (p0, z0, 2), ++ z0 = svmaxnm_m (p0, z0, 2)) ++ ++/* ++** maxnm_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fmaxnm z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_f64_z_tied1, svfloat64_t, ++ z0 = svmaxnm_f64_z (p0, z0, z1), ++ z0 = svmaxnm_z (p0, z0, z1)) ++ ++/* ++** maxnm_f64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** fmaxnm z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_f64_z_tied2, svfloat64_t, ++ z0 = svmaxnm_f64_z (p0, z1, z0), ++ z0 = svmaxnm_z (p0, z1, z0)) ++ ++/* ++** maxnm_f64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fmaxnm z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** fmaxnm z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_f64_z_untied, svfloat64_t, ++ z0 = svmaxnm_f64_z (p0, z1, z2), ++ z0 = svmaxnm_z (p0, z1, z2)) ++ ++/* ++** maxnm_d4_f64_z_tied1: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0\.d, p0/z, z0\.d ++** fmaxnm z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (maxnm_d4_f64_z_tied1, svfloat64_t, double, ++ z0 = svmaxnm_n_f64_z (p0, z0, d4), ++ z0 = svmaxnm_z (p0, z0, d4)) ++ ++/* ++** maxnm_d4_f64_z_untied: ++** mov (z[0-9]+\.d), d4 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fmaxnm z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** fmaxnm z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (maxnm_d4_f64_z_untied, svfloat64_t, double, ++ z0 = svmaxnm_n_f64_z (p0, z1, d4), ++ z0 = svmaxnm_z (p0, z1, d4)) ++ ++/* ++** maxnm_0_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fmaxnm z0\.d, p0/m, z0\.d, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_0_f64_z_tied1, svfloat64_t, ++ z0 = svmaxnm_n_f64_z (p0, z0, 0), ++ z0 = svmaxnm_z (p0, z0, 0)) ++ ++/* ++** maxnm_0_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** fmaxnm z0\.d, p0/m, z0\.d, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_0_f64_z_untied, svfloat64_t, ++ z0 = svmaxnm_n_f64_z (p0, z1, 0), ++ z0 = svmaxnm_z (p0, z1, 0)) ++ ++/* ++** maxnm_1_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fmaxnm z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_1_f64_z_tied1, svfloat64_t, ++ z0 = svmaxnm_n_f64_z (p0, z0, 1), ++ z0 = svmaxnm_z (p0, z0, 1)) ++ ++/* ++** maxnm_1_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** fmaxnm z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_1_f64_z_untied, svfloat64_t, ++ z0 = svmaxnm_n_f64_z (p0, z1, 1), ++ z0 = svmaxnm_z (p0, z1, 1)) ++ ++/* ++** maxnm_2_f64_z: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
++** movprfx z0\.d, p0/z, z0\.d ++** fmaxnm z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_2_f64_z, svfloat64_t, ++ z0 = svmaxnm_n_f64_z (p0, z0, 2), ++ z0 = svmaxnm_z (p0, z0, 2)) ++ ++/* ++** maxnm_f64_x_tied1: ++** fmaxnm z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_f64_x_tied1, svfloat64_t, ++ z0 = svmaxnm_f64_x (p0, z0, z1), ++ z0 = svmaxnm_x (p0, z0, z1)) ++ ++/* ++** maxnm_f64_x_tied2: ++** fmaxnm z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_f64_x_tied2, svfloat64_t, ++ z0 = svmaxnm_f64_x (p0, z1, z0), ++ z0 = svmaxnm_x (p0, z1, z0)) ++ ++/* ++** maxnm_f64_x_untied: ++** ( ++** movprfx z0, z1 ++** fmaxnm z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0, z2 ++** fmaxnm z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_f64_x_untied, svfloat64_t, ++ z0 = svmaxnm_f64_x (p0, z1, z2), ++ z0 = svmaxnm_x (p0, z1, z2)) ++ ++/* ++** maxnm_d4_f64_x_tied1: ++** mov (z[0-9]+\.d), d4 ++** fmaxnm z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (maxnm_d4_f64_x_tied1, svfloat64_t, double, ++ z0 = svmaxnm_n_f64_x (p0, z0, d4), ++ z0 = svmaxnm_x (p0, z0, d4)) ++ ++/* ++** maxnm_d4_f64_x_untied: ++** mov z0\.d, d4 ++** fmaxnm z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZD (maxnm_d4_f64_x_untied, svfloat64_t, double, ++ z0 = svmaxnm_n_f64_x (p0, z1, d4), ++ z0 = svmaxnm_x (p0, z1, d4)) ++ ++/* ++** maxnm_0_f64_x_tied1: ++** fmaxnm z0\.d, p0/m, z0\.d, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_0_f64_x_tied1, svfloat64_t, ++ z0 = svmaxnm_n_f64_x (p0, z0, 0), ++ z0 = svmaxnm_x (p0, z0, 0)) ++ ++/* ++** maxnm_0_f64_x_untied: ++** movprfx z0, z1 ++** fmaxnm z0\.d, p0/m, z0\.d, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_0_f64_x_untied, svfloat64_t, ++ z0 = svmaxnm_n_f64_x (p0, z1, 0), ++ z0 = svmaxnm_x (p0, z1, 0)) ++ ++/* ++** maxnm_1_f64_x_tied1: ++** fmaxnm z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_1_f64_x_tied1, svfloat64_t, ++ z0 = svmaxnm_n_f64_x (p0, z0, 1), ++ z0 = svmaxnm_x (p0, z0, 1)) ++ ++/* ++** maxnm_1_f64_x_untied: ++** movprfx z0, z1 ++** fmaxnm z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_1_f64_x_untied, svfloat64_t, ++ z0 = svmaxnm_n_f64_x (p0, z1, 1), ++ z0 = svmaxnm_x (p0, z1, 1)) ++ ++/* ++** maxnm_2_f64_x_tied1: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** fmaxnm z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_2_f64_x_tied1, svfloat64_t, ++ z0 = svmaxnm_n_f64_x (p0, z0, 2), ++ z0 = svmaxnm_x (p0, z0, 2)) ++ ++/* ++** maxnm_2_f64_x_untied: ++** fmov z0\.d, #2\.0(?:e\+0)? ++** fmaxnm z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (maxnm_2_f64_x_untied, svfloat64_t, ++ z0 = svmaxnm_n_f64_x (p0, z1, 2), ++ z0 = svmaxnm_x (p0, z1, 2)) ++ ++/* ++** ptrue_maxnm_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_maxnm_f64_x_tied1, svfloat64_t, ++ z0 = svmaxnm_f64_x (svptrue_b64 (), z0, z1), ++ z0 = svmaxnm_x (svptrue_b64 (), z0, z1)) ++ ++/* ++** ptrue_maxnm_f64_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_maxnm_f64_x_tied2, svfloat64_t, ++ z0 = svmaxnm_f64_x (svptrue_b64 (), z1, z0), ++ z0 = svmaxnm_x (svptrue_b64 (), z1, z0)) ++ ++/* ++** ptrue_maxnm_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_maxnm_f64_x_untied, svfloat64_t, ++ z0 = svmaxnm_f64_x (svptrue_b64 (), z1, z2), ++ z0 = svmaxnm_x (svptrue_b64 (), z1, z2)) ++ ++/* ++** ptrue_maxnm_0_f64_x_tied1: ++** ... 
++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_maxnm_0_f64_x_tied1, svfloat64_t, ++ z0 = svmaxnm_n_f64_x (svptrue_b64 (), z0, 0), ++ z0 = svmaxnm_x (svptrue_b64 (), z0, 0)) ++ ++/* ++** ptrue_maxnm_0_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_maxnm_0_f64_x_untied, svfloat64_t, ++ z0 = svmaxnm_n_f64_x (svptrue_b64 (), z1, 0), ++ z0 = svmaxnm_x (svptrue_b64 (), z1, 0)) ++ ++/* ++** ptrue_maxnm_1_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_maxnm_1_f64_x_tied1, svfloat64_t, ++ z0 = svmaxnm_n_f64_x (svptrue_b64 (), z0, 1), ++ z0 = svmaxnm_x (svptrue_b64 (), z0, 1)) ++ ++/* ++** ptrue_maxnm_1_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_maxnm_1_f64_x_untied, svfloat64_t, ++ z0 = svmaxnm_n_f64_x (svptrue_b64 (), z1, 1), ++ z0 = svmaxnm_x (svptrue_b64 (), z1, 1)) ++ ++/* ++** ptrue_maxnm_2_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_maxnm_2_f64_x_tied1, svfloat64_t, ++ z0 = svmaxnm_n_f64_x (svptrue_b64 (), z0, 2), ++ z0 = svmaxnm_x (svptrue_b64 (), z0, 2)) ++ ++/* ++** ptrue_maxnm_2_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_maxnm_2_f64_x_untied, svfloat64_t, ++ z0 = svmaxnm_n_f64_x (svptrue_b64 (), z1, 2), ++ z0 = svmaxnm_x (svptrue_b64 (), z1, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxnmv_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxnmv_f16.c +new file mode 100644 +index 000000000..086bcf974 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxnmv_f16.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** maxnmv_d0_f16_tied: ++** fmaxnmv h0, p0, z0\.h ++** ret ++*/ ++TEST_REDUCTION_D (maxnmv_d0_f16_tied, float16_t, svfloat16_t, ++ d0 = svmaxnmv_f16 (p0, z0), ++ d0 = svmaxnmv (p0, z0)) ++ ++/* ++** maxnmv_d0_f16_untied: ++** fmaxnmv h0, p0, z1\.h ++** ret ++*/ ++TEST_REDUCTION_D (maxnmv_d0_f16_untied, float16_t, svfloat16_t, ++ d0 = svmaxnmv_f16 (p0, z1), ++ d0 = svmaxnmv (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxnmv_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxnmv_f32.c +new file mode 100644 +index 000000000..7fca8bc9e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxnmv_f32.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** maxnmv_d0_f32_tied: ++** fmaxnmv s0, p0, z0\.s ++** ret ++*/ ++TEST_REDUCTION_D (maxnmv_d0_f32_tied, float32_t, svfloat32_t, ++ d0 = svmaxnmv_f32 (p0, z0), ++ d0 = svmaxnmv (p0, z0)) ++ ++/* ++** maxnmv_d0_f32_untied: ++** fmaxnmv s0, p0, z1\.s ++** ret ++*/ ++TEST_REDUCTION_D (maxnmv_d0_f32_untied, float32_t, svfloat32_t, ++ d0 = svmaxnmv_f32 (p0, z1), ++ d0 = svmaxnmv (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxnmv_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxnmv_f64.c +new file mode 100644 +index 000000000..8b0884479 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxnmv_f64.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** maxnmv_d0_f64_tied: ++** fmaxnmv d0, p0, z0\.d ++** ret ++*/ ++TEST_REDUCTION_D (maxnmv_d0_f64_tied, float64_t, svfloat64_t, ++ d0 = 
svmaxnmv_f64 (p0, z0), ++ d0 = svmaxnmv (p0, z0)) ++ ++/* ++** maxnmv_d0_f64_untied: ++** fmaxnmv d0, p0, z1\.d ++** ret ++*/ ++TEST_REDUCTION_D (maxnmv_d0_f64_untied, float64_t, svfloat64_t, ++ d0 = svmaxnmv_f64 (p0, z1), ++ d0 = svmaxnmv (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_f16.c +new file mode 100644 +index 000000000..a16823987 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_f16.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** maxv_d0_f16_tied: ++** fmaxv h0, p0, z0\.h ++** ret ++*/ ++TEST_REDUCTION_D (maxv_d0_f16_tied, float16_t, svfloat16_t, ++ d0 = svmaxv_f16 (p0, z0), ++ d0 = svmaxv (p0, z0)) ++ ++/* ++** maxv_d0_f16_untied: ++** fmaxv h0, p0, z1\.h ++** ret ++*/ ++TEST_REDUCTION_D (maxv_d0_f16_untied, float16_t, svfloat16_t, ++ d0 = svmaxv_f16 (p0, z1), ++ d0 = svmaxv (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_f32.c +new file mode 100644 +index 000000000..64e5edfef +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_f32.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** maxv_d0_f32_tied: ++** fmaxv s0, p0, z0\.s ++** ret ++*/ ++TEST_REDUCTION_D (maxv_d0_f32_tied, float32_t, svfloat32_t, ++ d0 = svmaxv_f32 (p0, z0), ++ d0 = svmaxv (p0, z0)) ++ ++/* ++** maxv_d0_f32_untied: ++** fmaxv s0, p0, z1\.s ++** ret ++*/ ++TEST_REDUCTION_D (maxv_d0_f32_untied, float32_t, svfloat32_t, ++ d0 = svmaxv_f32 (p0, z1), ++ d0 = svmaxv (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_f64.c +new file mode 100644 +index 000000000..837d6dfdc +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_f64.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** maxv_d0_f64_tied: ++** fmaxv d0, p0, z0\.d ++** ret ++*/ ++TEST_REDUCTION_D (maxv_d0_f64_tied, float64_t, svfloat64_t, ++ d0 = svmaxv_f64 (p0, z0), ++ d0 = svmaxv (p0, z0)) ++ ++/* ++** maxv_d0_f64_untied: ++** fmaxv d0, p0, z1\.d ++** ret ++*/ ++TEST_REDUCTION_D (maxv_d0_f64_untied, float64_t, svfloat64_t, ++ d0 = svmaxv_f64 (p0, z1), ++ d0 = svmaxv (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_s16.c +new file mode 100644 +index 000000000..bbf36a110 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_s16.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** maxv_x0_s16: ++** smaxv h([0-9]+), p0, z0\.h ++** umov w0, v\1\.h\[0\] ++** ret ++*/ ++TEST_REDUCTION_X (maxv_x0_s16, int16_t, svint16_t, ++ x0 = svmaxv_s16 (p0, z0), ++ x0 = svmaxv (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_s32.c +new file mode 100644 +index 000000000..645169ee8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_s32.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** maxv_x0_s32: ++** smaxv (s[0-9]+), p0, z0\.s ++** fmov w0, \1 ++** 
ret ++*/ ++TEST_REDUCTION_X (maxv_x0_s32, int32_t, svint32_t, ++ x0 = svmaxv_s32 (p0, z0), ++ x0 = svmaxv (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_s64.c +new file mode 100644 +index 000000000..009c1e9e2 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_s64.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** maxv_x0_s64: ++** smaxv (d[0-9]+), p0, z0\.d ++** fmov x0, \1 ++** ret ++*/ ++TEST_REDUCTION_X (maxv_x0_s64, int64_t, svint64_t, ++ x0 = svmaxv_s64 (p0, z0), ++ x0 = svmaxv (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_s8.c +new file mode 100644 +index 000000000..2c1f1b9b3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_s8.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** maxv_x0_s8: ++** smaxv b([0-9]+), p0, z0\.b ++** umov w0, v\1\.b\[0\] ++** ret ++*/ ++TEST_REDUCTION_X (maxv_x0_s8, int8_t, svint8_t, ++ x0 = svmaxv_s8 (p0, z0), ++ x0 = svmaxv (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_u16.c +new file mode 100644 +index 000000000..978b8251a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_u16.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** maxv_x0_u16: ++** umaxv h([0-9]+), p0, z0\.h ++** umov w0, v\1\.h\[0\] ++** ret ++*/ ++TEST_REDUCTION_X (maxv_x0_u16, uint16_t, svuint16_t, ++ x0 = svmaxv_u16 (p0, z0), ++ x0 = svmaxv (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_u32.c +new file mode 100644 +index 000000000..85853b4b0 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_u32.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** maxv_x0_u32: ++** umaxv (s[0-9]+), p0, z0\.s ++** fmov w0, \1 ++** ret ++*/ ++TEST_REDUCTION_X (maxv_x0_u32, uint32_t, svuint32_t, ++ x0 = svmaxv_u32 (p0, z0), ++ x0 = svmaxv (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_u64.c +new file mode 100644 +index 000000000..95980ed34 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_u64.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** maxv_x0_u64: ++** umaxv (d[0-9]+), p0, z0\.d ++** fmov x0, \1 ++** ret ++*/ ++TEST_REDUCTION_X (maxv_x0_u64, uint64_t, svuint64_t, ++ x0 = svmaxv_u64 (p0, z0), ++ x0 = svmaxv (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_u8.c +new file mode 100644 +index 000000000..a0b23d242 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_u8.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** maxv_x0_u8: ++** umaxv b([0-9]+), p0, z0\.b ++** umov w0, v\1\.b\[0\] ++** ret ++*/ ++TEST_REDUCTION_X (maxv_x0_u8, uint8_t, svuint8_t, ++ x0 = svmaxv_u8 (p0, z0), 
++ x0 = svmaxv (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_f16.c +new file mode 100644 +index 000000000..721ee7389 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_f16.c +@@ -0,0 +1,425 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** min_f16_m_tied1: ++** fmin z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (min_f16_m_tied1, svfloat16_t, ++ z0 = svmin_f16_m (p0, z0, z1), ++ z0 = svmin_m (p0, z0, z1)) ++ ++/* ++** min_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fmin z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (min_f16_m_tied2, svfloat16_t, ++ z0 = svmin_f16_m (p0, z1, z0), ++ z0 = svmin_m (p0, z1, z0)) ++ ++/* ++** min_f16_m_untied: ++** movprfx z0, z1 ++** fmin z0\.h, p0/m, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (min_f16_m_untied, svfloat16_t, ++ z0 = svmin_f16_m (p0, z1, z2), ++ z0 = svmin_m (p0, z1, z2)) ++ ++/* ++** min_h4_f16_m_tied1: ++** mov (z[0-9]+\.h), h4 ++** fmin z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (min_h4_f16_m_tied1, svfloat16_t, __fp16, ++ z0 = svmin_n_f16_m (p0, z0, d4), ++ z0 = svmin_m (p0, z0, d4)) ++ ++/* ++** min_h4_f16_m_untied: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0, z1 ++** fmin z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (min_h4_f16_m_untied, svfloat16_t, __fp16, ++ z0 = svmin_n_f16_m (p0, z1, d4), ++ z0 = svmin_m (p0, z1, d4)) ++ ++/* ++** min_0_f16_m_tied1: ++** fmin z0\.h, p0/m, z0\.h, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (min_0_f16_m_tied1, svfloat16_t, ++ z0 = svmin_n_f16_m (p0, z0, 0), ++ z0 = svmin_m (p0, z0, 0)) ++ ++/* ++** min_0_f16_m_untied: ++** movprfx z0, z1 ++** fmin z0\.h, p0/m, z0\.h, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (min_0_f16_m_untied, svfloat16_t, ++ z0 = svmin_n_f16_m (p0, z1, 0), ++ z0 = svmin_m (p0, z1, 0)) ++ ++/* ++** min_1_f16_m_tied1: ++** fmin z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_f16_m_tied1, svfloat16_t, ++ z0 = svmin_n_f16_m (p0, z0, 1), ++ z0 = svmin_m (p0, z0, 1)) ++ ++/* ++** min_1_f16_m_untied: ++** movprfx z0, z1 ++** fmin z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_f16_m_untied, svfloat16_t, ++ z0 = svmin_n_f16_m (p0, z1, 1), ++ z0 = svmin_m (p0, z1, 1)) ++ ++/* ++** min_2_f16_m: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? 
++** fmin z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_2_f16_m, svfloat16_t, ++ z0 = svmin_n_f16_m (p0, z0, 2), ++ z0 = svmin_m (p0, z0, 2)) ++ ++/* ++** min_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fmin z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (min_f16_z_tied1, svfloat16_t, ++ z0 = svmin_f16_z (p0, z0, z1), ++ z0 = svmin_z (p0, z0, z1)) ++ ++/* ++** min_f16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** fmin z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (min_f16_z_tied2, svfloat16_t, ++ z0 = svmin_f16_z (p0, z1, z0), ++ z0 = svmin_z (p0, z1, z0)) ++ ++/* ++** min_f16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fmin z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** fmin z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (min_f16_z_untied, svfloat16_t, ++ z0 = svmin_f16_z (p0, z1, z2), ++ z0 = svmin_z (p0, z1, z2)) ++ ++/* ++** min_h4_f16_z_tied1: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0\.h, p0/z, z0\.h ++** fmin z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (min_h4_f16_z_tied1, svfloat16_t, __fp16, ++ z0 = svmin_n_f16_z (p0, z0, d4), ++ z0 = svmin_z (p0, z0, d4)) ++ ++/* ++** min_h4_f16_z_untied: ++** mov (z[0-9]+\.h), h4 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fmin z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** fmin z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (min_h4_f16_z_untied, svfloat16_t, __fp16, ++ z0 = svmin_n_f16_z (p0, z1, d4), ++ z0 = svmin_z (p0, z1, d4)) ++ ++/* ++** min_0_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fmin z0\.h, p0/m, z0\.h, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (min_0_f16_z_tied1, svfloat16_t, ++ z0 = svmin_n_f16_z (p0, z0, 0), ++ z0 = svmin_z (p0, z0, 0)) ++ ++/* ++** min_0_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** fmin z0\.h, p0/m, z0\.h, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (min_0_f16_z_untied, svfloat16_t, ++ z0 = svmin_n_f16_z (p0, z1, 0), ++ z0 = svmin_z (p0, z1, 0)) ++ ++/* ++** min_1_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fmin z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_f16_z_tied1, svfloat16_t, ++ z0 = svmin_n_f16_z (p0, z0, 1), ++ z0 = svmin_z (p0, z0, 1)) ++ ++/* ++** min_1_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** fmin z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_f16_z_untied, svfloat16_t, ++ z0 = svmin_n_f16_z (p0, z1, 1), ++ z0 = svmin_z (p0, z1, 1)) ++ ++/* ++** min_2_f16_z: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? 
++** movprfx z0\.h, p0/z, z0\.h ++** fmin z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_2_f16_z, svfloat16_t, ++ z0 = svmin_n_f16_z (p0, z0, 2), ++ z0 = svmin_z (p0, z0, 2)) ++ ++/* ++** min_f16_x_tied1: ++** fmin z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (min_f16_x_tied1, svfloat16_t, ++ z0 = svmin_f16_x (p0, z0, z1), ++ z0 = svmin_x (p0, z0, z1)) ++ ++/* ++** min_f16_x_tied2: ++** fmin z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (min_f16_x_tied2, svfloat16_t, ++ z0 = svmin_f16_x (p0, z1, z0), ++ z0 = svmin_x (p0, z1, z0)) ++ ++/* ++** min_f16_x_untied: ++** ( ++** movprfx z0, z1 ++** fmin z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0, z2 ++** fmin z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (min_f16_x_untied, svfloat16_t, ++ z0 = svmin_f16_x (p0, z1, z2), ++ z0 = svmin_x (p0, z1, z2)) ++ ++/* ++** min_h4_f16_x_tied1: ++** mov (z[0-9]+\.h), h4 ++** fmin z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (min_h4_f16_x_tied1, svfloat16_t, __fp16, ++ z0 = svmin_n_f16_x (p0, z0, d4), ++ z0 = svmin_x (p0, z0, d4)) ++ ++/* ++** min_h4_f16_x_untied: ++** mov z0\.h, h4 ++** fmin z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZD (min_h4_f16_x_untied, svfloat16_t, __fp16, ++ z0 = svmin_n_f16_x (p0, z1, d4), ++ z0 = svmin_x (p0, z1, d4)) ++ ++/* ++** min_0_f16_x_tied1: ++** fmin z0\.h, p0/m, z0\.h, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (min_0_f16_x_tied1, svfloat16_t, ++ z0 = svmin_n_f16_x (p0, z0, 0), ++ z0 = svmin_x (p0, z0, 0)) ++ ++/* ++** min_0_f16_x_untied: ++** movprfx z0, z1 ++** fmin z0\.h, p0/m, z0\.h, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (min_0_f16_x_untied, svfloat16_t, ++ z0 = svmin_n_f16_x (p0, z1, 0), ++ z0 = svmin_x (p0, z1, 0)) ++ ++/* ++** min_1_f16_x_tied1: ++** fmin z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_f16_x_tied1, svfloat16_t, ++ z0 = svmin_n_f16_x (p0, z0, 1), ++ z0 = svmin_x (p0, z0, 1)) ++ ++/* ++** min_1_f16_x_untied: ++** movprfx z0, z1 ++** fmin z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_f16_x_untied, svfloat16_t, ++ z0 = svmin_n_f16_x (p0, z1, 1), ++ z0 = svmin_x (p0, z1, 1)) ++ ++/* ++** min_2_f16_x_tied1: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** fmin z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_2_f16_x_tied1, svfloat16_t, ++ z0 = svmin_n_f16_x (p0, z0, 2), ++ z0 = svmin_x (p0, z0, 2)) ++ ++/* ++** min_2_f16_x_untied: ++** fmov z0\.h, #2\.0(?:e\+0)? ++** fmin z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (min_2_f16_x_untied, svfloat16_t, ++ z0 = svmin_n_f16_x (p0, z1, 2), ++ z0 = svmin_x (p0, z1, 2)) ++ ++/* ++** ptrue_min_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_min_f16_x_tied1, svfloat16_t, ++ z0 = svmin_f16_x (svptrue_b16 (), z0, z1), ++ z0 = svmin_x (svptrue_b16 (), z0, z1)) ++ ++/* ++** ptrue_min_f16_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_min_f16_x_tied2, svfloat16_t, ++ z0 = svmin_f16_x (svptrue_b16 (), z1, z0), ++ z0 = svmin_x (svptrue_b16 (), z1, z0)) ++ ++/* ++** ptrue_min_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_min_f16_x_untied, svfloat16_t, ++ z0 = svmin_f16_x (svptrue_b16 (), z1, z2), ++ z0 = svmin_x (svptrue_b16 (), z1, z2)) ++ ++/* ++** ptrue_min_0_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_min_0_f16_x_tied1, svfloat16_t, ++ z0 = svmin_n_f16_x (svptrue_b16 (), z0, 0), ++ z0 = svmin_x (svptrue_b16 (), z0, 0)) ++ ++/* ++** ptrue_min_0_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_min_0_f16_x_untied, svfloat16_t, ++ z0 = svmin_n_f16_x (svptrue_b16 (), z1, 0), ++ z0 = svmin_x (svptrue_b16 (), z1, 0)) ++ ++/* ++** ptrue_min_1_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_min_1_f16_x_tied1, svfloat16_t, ++ z0 = svmin_n_f16_x (svptrue_b16 (), z0, 1), ++ z0 = svmin_x (svptrue_b16 (), z0, 1)) ++ ++/* ++** ptrue_min_1_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_min_1_f16_x_untied, svfloat16_t, ++ z0 = svmin_n_f16_x (svptrue_b16 (), z1, 1), ++ z0 = svmin_x (svptrue_b16 (), z1, 1)) ++ ++/* ++** ptrue_min_2_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_min_2_f16_x_tied1, svfloat16_t, ++ z0 = svmin_n_f16_x (svptrue_b16 (), z0, 2), ++ z0 = svmin_x (svptrue_b16 (), z0, 2)) ++ ++/* ++** ptrue_min_2_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_min_2_f16_x_untied, svfloat16_t, ++ z0 = svmin_n_f16_x (svptrue_b16 (), z1, 2), ++ z0 = svmin_x (svptrue_b16 (), z1, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_f32.c +new file mode 100644 +index 000000000..a3b1cf5c5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_f32.c +@@ -0,0 +1,425 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** min_f32_m_tied1: ++** fmin z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (min_f32_m_tied1, svfloat32_t, ++ z0 = svmin_f32_m (p0, z0, z1), ++ z0 = svmin_m (p0, z0, z1)) ++ ++/* ++** min_f32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fmin z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (min_f32_m_tied2, svfloat32_t, ++ z0 = svmin_f32_m (p0, z1, z0), ++ z0 = svmin_m (p0, z1, z0)) ++ ++/* ++** min_f32_m_untied: ++** movprfx z0, z1 ++** fmin z0\.s, p0/m, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (min_f32_m_untied, svfloat32_t, ++ z0 = svmin_f32_m (p0, z1, z2), ++ z0 = svmin_m (p0, z1, z2)) ++ ++/* ++** min_s4_f32_m_tied1: ++** mov (z[0-9]+\.s), s4 ++** fmin z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (min_s4_f32_m_tied1, svfloat32_t, float, ++ z0 = svmin_n_f32_m (p0, z0, d4), ++ z0 = svmin_m (p0, z0, d4)) ++ ++/* ++** min_s4_f32_m_untied: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0, z1 ++** fmin z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (min_s4_f32_m_untied, svfloat32_t, float, ++ z0 = svmin_n_f32_m (p0, z1, d4), ++ z0 = svmin_m (p0, z1, d4)) ++ ++/* ++** min_0_f32_m_tied1: ++** fmin z0\.s, p0/m, z0\.s, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (min_0_f32_m_tied1, svfloat32_t, ++ z0 = svmin_n_f32_m (p0, z0, 0), ++ z0 = svmin_m (p0, z0, 0)) ++ ++/* ++** min_0_f32_m_untied: ++** movprfx z0, z1 ++** fmin z0\.s, p0/m, z0\.s, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (min_0_f32_m_untied, svfloat32_t, ++ z0 = svmin_n_f32_m (p0, z1, 0), ++ z0 = svmin_m (p0, z1, 0)) ++ ++/* ++** min_1_f32_m_tied1: ++** fmin z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_f32_m_tied1, svfloat32_t, ++ z0 = svmin_n_f32_m (p0, z0, 1), ++ z0 = svmin_m (p0, z0, 1)) ++ ++/* ++** min_1_f32_m_untied: ++** movprfx z0, z1 ++** fmin 
z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_f32_m_untied, svfloat32_t, ++ z0 = svmin_n_f32_m (p0, z1, 1), ++ z0 = svmin_m (p0, z1, 1)) ++ ++/* ++** min_2_f32_m: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** fmin z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_2_f32_m, svfloat32_t, ++ z0 = svmin_n_f32_m (p0, z0, 2), ++ z0 = svmin_m (p0, z0, 2)) ++ ++/* ++** min_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fmin z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (min_f32_z_tied1, svfloat32_t, ++ z0 = svmin_f32_z (p0, z0, z1), ++ z0 = svmin_z (p0, z0, z1)) ++ ++/* ++** min_f32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** fmin z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (min_f32_z_tied2, svfloat32_t, ++ z0 = svmin_f32_z (p0, z1, z0), ++ z0 = svmin_z (p0, z1, z0)) ++ ++/* ++** min_f32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fmin z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** fmin z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (min_f32_z_untied, svfloat32_t, ++ z0 = svmin_f32_z (p0, z1, z2), ++ z0 = svmin_z (p0, z1, z2)) ++ ++/* ++** min_s4_f32_z_tied1: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0\.s, p0/z, z0\.s ++** fmin z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (min_s4_f32_z_tied1, svfloat32_t, float, ++ z0 = svmin_n_f32_z (p0, z0, d4), ++ z0 = svmin_z (p0, z0, d4)) ++ ++/* ++** min_s4_f32_z_untied: ++** mov (z[0-9]+\.s), s4 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fmin z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** fmin z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (min_s4_f32_z_untied, svfloat32_t, float, ++ z0 = svmin_n_f32_z (p0, z1, d4), ++ z0 = svmin_z (p0, z1, d4)) ++ ++/* ++** min_0_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fmin z0\.s, p0/m, z0\.s, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (min_0_f32_z_tied1, svfloat32_t, ++ z0 = svmin_n_f32_z (p0, z0, 0), ++ z0 = svmin_z (p0, z0, 0)) ++ ++/* ++** min_0_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** fmin z0\.s, p0/m, z0\.s, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (min_0_f32_z_untied, svfloat32_t, ++ z0 = svmin_n_f32_z (p0, z1, 0), ++ z0 = svmin_z (p0, z1, 0)) ++ ++/* ++** min_1_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fmin z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_f32_z_tied1, svfloat32_t, ++ z0 = svmin_n_f32_z (p0, z0, 1), ++ z0 = svmin_z (p0, z0, 1)) ++ ++/* ++** min_1_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** fmin z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_f32_z_untied, svfloat32_t, ++ z0 = svmin_n_f32_z (p0, z1, 1), ++ z0 = svmin_z (p0, z1, 1)) ++ ++/* ++** min_2_f32_z: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
++** movprfx z0\.s, p0/z, z0\.s ++** fmin z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_2_f32_z, svfloat32_t, ++ z0 = svmin_n_f32_z (p0, z0, 2), ++ z0 = svmin_z (p0, z0, 2)) ++ ++/* ++** min_f32_x_tied1: ++** fmin z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (min_f32_x_tied1, svfloat32_t, ++ z0 = svmin_f32_x (p0, z0, z1), ++ z0 = svmin_x (p0, z0, z1)) ++ ++/* ++** min_f32_x_tied2: ++** fmin z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (min_f32_x_tied2, svfloat32_t, ++ z0 = svmin_f32_x (p0, z1, z0), ++ z0 = svmin_x (p0, z1, z0)) ++ ++/* ++** min_f32_x_untied: ++** ( ++** movprfx z0, z1 ++** fmin z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0, z2 ++** fmin z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (min_f32_x_untied, svfloat32_t, ++ z0 = svmin_f32_x (p0, z1, z2), ++ z0 = svmin_x (p0, z1, z2)) ++ ++/* ++** min_s4_f32_x_tied1: ++** mov (z[0-9]+\.s), s4 ++** fmin z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (min_s4_f32_x_tied1, svfloat32_t, float, ++ z0 = svmin_n_f32_x (p0, z0, d4), ++ z0 = svmin_x (p0, z0, d4)) ++ ++/* ++** min_s4_f32_x_untied: ++** mov z0\.s, s4 ++** fmin z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZD (min_s4_f32_x_untied, svfloat32_t, float, ++ z0 = svmin_n_f32_x (p0, z1, d4), ++ z0 = svmin_x (p0, z1, d4)) ++ ++/* ++** min_0_f32_x_tied1: ++** fmin z0\.s, p0/m, z0\.s, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (min_0_f32_x_tied1, svfloat32_t, ++ z0 = svmin_n_f32_x (p0, z0, 0), ++ z0 = svmin_x (p0, z0, 0)) ++ ++/* ++** min_0_f32_x_untied: ++** movprfx z0, z1 ++** fmin z0\.s, p0/m, z0\.s, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (min_0_f32_x_untied, svfloat32_t, ++ z0 = svmin_n_f32_x (p0, z1, 0), ++ z0 = svmin_x (p0, z1, 0)) ++ ++/* ++** min_1_f32_x_tied1: ++** fmin z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_f32_x_tied1, svfloat32_t, ++ z0 = svmin_n_f32_x (p0, z0, 1), ++ z0 = svmin_x (p0, z0, 1)) ++ ++/* ++** min_1_f32_x_untied: ++** movprfx z0, z1 ++** fmin z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_f32_x_untied, svfloat32_t, ++ z0 = svmin_n_f32_x (p0, z1, 1), ++ z0 = svmin_x (p0, z1, 1)) ++ ++/* ++** min_2_f32_x_tied1: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** fmin z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_2_f32_x_tied1, svfloat32_t, ++ z0 = svmin_n_f32_x (p0, z0, 2), ++ z0 = svmin_x (p0, z0, 2)) ++ ++/* ++** min_2_f32_x_untied: ++** fmov z0\.s, #2\.0(?:e\+0)? ++** fmin z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (min_2_f32_x_untied, svfloat32_t, ++ z0 = svmin_n_f32_x (p0, z1, 2), ++ z0 = svmin_x (p0, z1, 2)) ++ ++/* ++** ptrue_min_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_min_f32_x_tied1, svfloat32_t, ++ z0 = svmin_f32_x (svptrue_b32 (), z0, z1), ++ z0 = svmin_x (svptrue_b32 (), z0, z1)) ++ ++/* ++** ptrue_min_f32_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_min_f32_x_tied2, svfloat32_t, ++ z0 = svmin_f32_x (svptrue_b32 (), z1, z0), ++ z0 = svmin_x (svptrue_b32 (), z1, z0)) ++ ++/* ++** ptrue_min_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_min_f32_x_untied, svfloat32_t, ++ z0 = svmin_f32_x (svptrue_b32 (), z1, z2), ++ z0 = svmin_x (svptrue_b32 (), z1, z2)) ++ ++/* ++** ptrue_min_0_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_min_0_f32_x_tied1, svfloat32_t, ++ z0 = svmin_n_f32_x (svptrue_b32 (), z0, 0), ++ z0 = svmin_x (svptrue_b32 (), z0, 0)) ++ ++/* ++** ptrue_min_0_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_min_0_f32_x_untied, svfloat32_t, ++ z0 = svmin_n_f32_x (svptrue_b32 (), z1, 0), ++ z0 = svmin_x (svptrue_b32 (), z1, 0)) ++ ++/* ++** ptrue_min_1_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_min_1_f32_x_tied1, svfloat32_t, ++ z0 = svmin_n_f32_x (svptrue_b32 (), z0, 1), ++ z0 = svmin_x (svptrue_b32 (), z0, 1)) ++ ++/* ++** ptrue_min_1_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_min_1_f32_x_untied, svfloat32_t, ++ z0 = svmin_n_f32_x (svptrue_b32 (), z1, 1), ++ z0 = svmin_x (svptrue_b32 (), z1, 1)) ++ ++/* ++** ptrue_min_2_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_min_2_f32_x_tied1, svfloat32_t, ++ z0 = svmin_n_f32_x (svptrue_b32 (), z0, 2), ++ z0 = svmin_x (svptrue_b32 (), z0, 2)) ++ ++/* ++** ptrue_min_2_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_min_2_f32_x_untied, svfloat32_t, ++ z0 = svmin_n_f32_x (svptrue_b32 (), z1, 2), ++ z0 = svmin_x (svptrue_b32 (), z1, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_f64.c +new file mode 100644 +index 000000000..bb31102e2 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_f64.c +@@ -0,0 +1,425 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** min_f64_m_tied1: ++** fmin z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (min_f64_m_tied1, svfloat64_t, ++ z0 = svmin_f64_m (p0, z0, z1), ++ z0 = svmin_m (p0, z0, z1)) ++ ++/* ++** min_f64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fmin z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_f64_m_tied2, svfloat64_t, ++ z0 = svmin_f64_m (p0, z1, z0), ++ z0 = svmin_m (p0, z1, z0)) ++ ++/* ++** min_f64_m_untied: ++** movprfx z0, z1 ++** fmin z0\.d, p0/m, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (min_f64_m_untied, svfloat64_t, ++ z0 = svmin_f64_m (p0, z1, z2), ++ z0 = svmin_m (p0, z1, z2)) ++ ++/* ++** min_d4_f64_m_tied1: ++** mov (z[0-9]+\.d), d4 ++** fmin z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (min_d4_f64_m_tied1, svfloat64_t, double, ++ z0 = svmin_n_f64_m (p0, z0, d4), ++ z0 = svmin_m (p0, z0, d4)) ++ ++/* ++** min_d4_f64_m_untied: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0, z1 ++** fmin z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (min_d4_f64_m_untied, svfloat64_t, double, ++ z0 = svmin_n_f64_m (p0, z1, d4), ++ z0 = svmin_m (p0, z1, d4)) ++ ++/* ++** min_0_f64_m_tied1: ++** fmin z0\.d, p0/m, z0\.d, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (min_0_f64_m_tied1, svfloat64_t, ++ z0 = svmin_n_f64_m (p0, z0, 0), ++ z0 = svmin_m (p0, z0, 0)) ++ ++/* ++** min_0_f64_m_untied: ++** movprfx z0, z1 ++** fmin z0\.d, p0/m, z0\.d, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (min_0_f64_m_untied, svfloat64_t, ++ z0 = svmin_n_f64_m (p0, z1, 0), ++ z0 = svmin_m (p0, z1, 0)) ++ ++/* ++** min_1_f64_m_tied1: ++** fmin z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_f64_m_tied1, svfloat64_t, ++ z0 = svmin_n_f64_m (p0, z0, 1), ++ z0 = svmin_m (p0, z0, 1)) ++ ++/* ++** min_1_f64_m_untied: ++** movprfx z0, z1 ++** fmin 
z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_f64_m_untied, svfloat64_t, ++ z0 = svmin_n_f64_m (p0, z1, 1), ++ z0 = svmin_m (p0, z1, 1)) ++ ++/* ++** min_2_f64_m: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** fmin z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_2_f64_m, svfloat64_t, ++ z0 = svmin_n_f64_m (p0, z0, 2), ++ z0 = svmin_m (p0, z0, 2)) ++ ++/* ++** min_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fmin z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (min_f64_z_tied1, svfloat64_t, ++ z0 = svmin_f64_z (p0, z0, z1), ++ z0 = svmin_z (p0, z0, z1)) ++ ++/* ++** min_f64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** fmin z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (min_f64_z_tied2, svfloat64_t, ++ z0 = svmin_f64_z (p0, z1, z0), ++ z0 = svmin_z (p0, z1, z0)) ++ ++/* ++** min_f64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fmin z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** fmin z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (min_f64_z_untied, svfloat64_t, ++ z0 = svmin_f64_z (p0, z1, z2), ++ z0 = svmin_z (p0, z1, z2)) ++ ++/* ++** min_d4_f64_z_tied1: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0\.d, p0/z, z0\.d ++** fmin z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (min_d4_f64_z_tied1, svfloat64_t, double, ++ z0 = svmin_n_f64_z (p0, z0, d4), ++ z0 = svmin_z (p0, z0, d4)) ++ ++/* ++** min_d4_f64_z_untied: ++** mov (z[0-9]+\.d), d4 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fmin z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** fmin z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (min_d4_f64_z_untied, svfloat64_t, double, ++ z0 = svmin_n_f64_z (p0, z1, d4), ++ z0 = svmin_z (p0, z1, d4)) ++ ++/* ++** min_0_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fmin z0\.d, p0/m, z0\.d, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (min_0_f64_z_tied1, svfloat64_t, ++ z0 = svmin_n_f64_z (p0, z0, 0), ++ z0 = svmin_z (p0, z0, 0)) ++ ++/* ++** min_0_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** fmin z0\.d, p0/m, z0\.d, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (min_0_f64_z_untied, svfloat64_t, ++ z0 = svmin_n_f64_z (p0, z1, 0), ++ z0 = svmin_z (p0, z1, 0)) ++ ++/* ++** min_1_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fmin z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_f64_z_tied1, svfloat64_t, ++ z0 = svmin_n_f64_z (p0, z0, 1), ++ z0 = svmin_z (p0, z0, 1)) ++ ++/* ++** min_1_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** fmin z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_f64_z_untied, svfloat64_t, ++ z0 = svmin_n_f64_z (p0, z1, 1), ++ z0 = svmin_z (p0, z1, 1)) ++ ++/* ++** min_2_f64_z: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
++** movprfx z0\.d, p0/z, z0\.d ++** fmin z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_2_f64_z, svfloat64_t, ++ z0 = svmin_n_f64_z (p0, z0, 2), ++ z0 = svmin_z (p0, z0, 2)) ++ ++/* ++** min_f64_x_tied1: ++** fmin z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (min_f64_x_tied1, svfloat64_t, ++ z0 = svmin_f64_x (p0, z0, z1), ++ z0 = svmin_x (p0, z0, z1)) ++ ++/* ++** min_f64_x_tied2: ++** fmin z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (min_f64_x_tied2, svfloat64_t, ++ z0 = svmin_f64_x (p0, z1, z0), ++ z0 = svmin_x (p0, z1, z0)) ++ ++/* ++** min_f64_x_untied: ++** ( ++** movprfx z0, z1 ++** fmin z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0, z2 ++** fmin z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (min_f64_x_untied, svfloat64_t, ++ z0 = svmin_f64_x (p0, z1, z2), ++ z0 = svmin_x (p0, z1, z2)) ++ ++/* ++** min_d4_f64_x_tied1: ++** mov (z[0-9]+\.d), d4 ++** fmin z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (min_d4_f64_x_tied1, svfloat64_t, double, ++ z0 = svmin_n_f64_x (p0, z0, d4), ++ z0 = svmin_x (p0, z0, d4)) ++ ++/* ++** min_d4_f64_x_untied: ++** mov z0\.d, d4 ++** fmin z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZD (min_d4_f64_x_untied, svfloat64_t, double, ++ z0 = svmin_n_f64_x (p0, z1, d4), ++ z0 = svmin_x (p0, z1, d4)) ++ ++/* ++** min_0_f64_x_tied1: ++** fmin z0\.d, p0/m, z0\.d, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (min_0_f64_x_tied1, svfloat64_t, ++ z0 = svmin_n_f64_x (p0, z0, 0), ++ z0 = svmin_x (p0, z0, 0)) ++ ++/* ++** min_0_f64_x_untied: ++** movprfx z0, z1 ++** fmin z0\.d, p0/m, z0\.d, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (min_0_f64_x_untied, svfloat64_t, ++ z0 = svmin_n_f64_x (p0, z1, 0), ++ z0 = svmin_x (p0, z1, 0)) ++ ++/* ++** min_1_f64_x_tied1: ++** fmin z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_f64_x_tied1, svfloat64_t, ++ z0 = svmin_n_f64_x (p0, z0, 1), ++ z0 = svmin_x (p0, z0, 1)) ++ ++/* ++** min_1_f64_x_untied: ++** movprfx z0, z1 ++** fmin z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_f64_x_untied, svfloat64_t, ++ z0 = svmin_n_f64_x (p0, z1, 1), ++ z0 = svmin_x (p0, z1, 1)) ++ ++/* ++** min_2_f64_x_tied1: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** fmin z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_2_f64_x_tied1, svfloat64_t, ++ z0 = svmin_n_f64_x (p0, z0, 2), ++ z0 = svmin_x (p0, z0, 2)) ++ ++/* ++** min_2_f64_x_untied: ++** fmov z0\.d, #2\.0(?:e\+0)? ++** fmin z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (min_2_f64_x_untied, svfloat64_t, ++ z0 = svmin_n_f64_x (p0, z1, 2), ++ z0 = svmin_x (p0, z1, 2)) ++ ++/* ++** ptrue_min_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_min_f64_x_tied1, svfloat64_t, ++ z0 = svmin_f64_x (svptrue_b64 (), z0, z1), ++ z0 = svmin_x (svptrue_b64 (), z0, z1)) ++ ++/* ++** ptrue_min_f64_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_min_f64_x_tied2, svfloat64_t, ++ z0 = svmin_f64_x (svptrue_b64 (), z1, z0), ++ z0 = svmin_x (svptrue_b64 (), z1, z0)) ++ ++/* ++** ptrue_min_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_min_f64_x_untied, svfloat64_t, ++ z0 = svmin_f64_x (svptrue_b64 (), z1, z2), ++ z0 = svmin_x (svptrue_b64 (), z1, z2)) ++ ++/* ++** ptrue_min_0_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_min_0_f64_x_tied1, svfloat64_t, ++ z0 = svmin_n_f64_x (svptrue_b64 (), z0, 0), ++ z0 = svmin_x (svptrue_b64 (), z0, 0)) ++ ++/* ++** ptrue_min_0_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_min_0_f64_x_untied, svfloat64_t, ++ z0 = svmin_n_f64_x (svptrue_b64 (), z1, 0), ++ z0 = svmin_x (svptrue_b64 (), z1, 0)) ++ ++/* ++** ptrue_min_1_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_min_1_f64_x_tied1, svfloat64_t, ++ z0 = svmin_n_f64_x (svptrue_b64 (), z0, 1), ++ z0 = svmin_x (svptrue_b64 (), z0, 1)) ++ ++/* ++** ptrue_min_1_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_min_1_f64_x_untied, svfloat64_t, ++ z0 = svmin_n_f64_x (svptrue_b64 (), z1, 1), ++ z0 = svmin_x (svptrue_b64 (), z1, 1)) ++ ++/* ++** ptrue_min_2_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_min_2_f64_x_tied1, svfloat64_t, ++ z0 = svmin_n_f64_x (svptrue_b64 (), z0, 2), ++ z0 = svmin_x (svptrue_b64 (), z0, 2)) ++ ++/* ++** ptrue_min_2_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_min_2_f64_x_untied, svfloat64_t, ++ z0 = svmin_n_f64_x (svptrue_b64 (), z1, 2), ++ z0 = svmin_x (svptrue_b64 (), z1, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_s16.c +new file mode 100644 +index 000000000..14dfcc4c3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_s16.c +@@ -0,0 +1,293 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** min_s16_m_tied1: ++** smin z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (min_s16_m_tied1, svint16_t, ++ z0 = svmin_s16_m (p0, z0, z1), ++ z0 = svmin_m (p0, z0, z1)) ++ ++/* ++** min_s16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** smin z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (min_s16_m_tied2, svint16_t, ++ z0 = svmin_s16_m (p0, z1, z0), ++ z0 = svmin_m (p0, z1, z0)) ++ ++/* ++** min_s16_m_untied: ++** movprfx z0, z1 ++** smin z0\.h, p0/m, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (min_s16_m_untied, svint16_t, ++ z0 = svmin_s16_m (p0, z1, z2), ++ z0 = svmin_m (p0, z1, z2)) ++ ++/* ++** min_w0_s16_m_tied1: ++** mov (z[0-9]+\.h), w0 ++** smin z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (min_w0_s16_m_tied1, svint16_t, int16_t, ++ z0 = svmin_n_s16_m (p0, z0, x0), ++ z0 = svmin_m (p0, z0, x0)) ++ ++/* ++** min_w0_s16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), w0 ++** movprfx z0, z1 ++** smin z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (min_w0_s16_m_untied, svint16_t, int16_t, ++ z0 = svmin_n_s16_m (p0, z1, x0), ++ z0 = svmin_m (p0, z1, x0)) ++ ++/* ++** min_1_s16_m_tied1: ++** mov (z[0-9]+\.h), #1 ++** smin z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_s16_m_tied1, svint16_t, ++ z0 = svmin_n_s16_m (p0, z0, 1), ++ z0 = svmin_m (p0, z0, 1)) ++ ++/* ++** min_1_s16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), #1 ++** movprfx z0, z1 ++** smin z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_s16_m_untied, svint16_t, ++ z0 = svmin_n_s16_m (p0, z1, 1), ++ z0 = svmin_m (p0, z1, 1)) ++ ++/* ++** min_m1_s16_m: ++** mov (z[0-9]+)\.b, #-1 ++** smin z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (min_m1_s16_m, svint16_t, ++ z0 = svmin_n_s16_m (p0, z0, -1), ++ z0 = 
svmin_m (p0, z0, -1)) ++ ++/* ++** min_s16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** smin z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (min_s16_z_tied1, svint16_t, ++ z0 = svmin_s16_z (p0, z0, z1), ++ z0 = svmin_z (p0, z0, z1)) ++ ++/* ++** min_s16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** smin z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (min_s16_z_tied2, svint16_t, ++ z0 = svmin_s16_z (p0, z1, z0), ++ z0 = svmin_z (p0, z1, z0)) ++ ++/* ++** min_s16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** smin z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** smin z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (min_s16_z_untied, svint16_t, ++ z0 = svmin_s16_z (p0, z1, z2), ++ z0 = svmin_z (p0, z1, z2)) ++ ++/* ++** min_w0_s16_z_tied1: ++** mov (z[0-9]+\.h), w0 ++** movprfx z0\.h, p0/z, z0\.h ++** smin z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (min_w0_s16_z_tied1, svint16_t, int16_t, ++ z0 = svmin_n_s16_z (p0, z0, x0), ++ z0 = svmin_z (p0, z0, x0)) ++ ++/* ++** min_w0_s16_z_untied: ++** mov (z[0-9]+\.h), w0 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** smin z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** smin z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (min_w0_s16_z_untied, svint16_t, int16_t, ++ z0 = svmin_n_s16_z (p0, z1, x0), ++ z0 = svmin_z (p0, z1, x0)) ++ ++/* ++** min_1_s16_z_tied1: ++** mov (z[0-9]+\.h), #1 ++** movprfx z0\.h, p0/z, z0\.h ++** smin z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_s16_z_tied1, svint16_t, ++ z0 = svmin_n_s16_z (p0, z0, 1), ++ z0 = svmin_z (p0, z0, 1)) ++ ++/* ++** min_1_s16_z_untied: ++** mov (z[0-9]+\.h), #1 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** smin z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** smin z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_s16_z_untied, svint16_t, ++ z0 = svmin_n_s16_z (p0, z1, 1), ++ z0 = svmin_z (p0, z1, 1)) ++ ++/* ++** min_s16_x_tied1: ++** smin z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (min_s16_x_tied1, svint16_t, ++ z0 = svmin_s16_x (p0, z0, z1), ++ z0 = svmin_x (p0, z0, z1)) ++ ++/* ++** min_s16_x_tied2: ++** smin z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (min_s16_x_tied2, svint16_t, ++ z0 = svmin_s16_x (p0, z1, z0), ++ z0 = svmin_x (p0, z1, z0)) ++ ++/* ++** min_s16_x_untied: ++** ( ++** movprfx z0, z1 ++** smin z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0, z2 ++** smin z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (min_s16_x_untied, svint16_t, ++ z0 = svmin_s16_x (p0, z1, z2), ++ z0 = svmin_x (p0, z1, z2)) ++ ++/* ++** min_w0_s16_x_tied1: ++** mov (z[0-9]+\.h), w0 ++** smin z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (min_w0_s16_x_tied1, svint16_t, int16_t, ++ z0 = svmin_n_s16_x (p0, z0, x0), ++ z0 = svmin_x (p0, z0, x0)) ++ ++/* ++** min_w0_s16_x_untied: ++** mov z0\.h, w0 ++** smin z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZX (min_w0_s16_x_untied, svint16_t, int16_t, ++ z0 = svmin_n_s16_x (p0, z1, x0), ++ z0 = svmin_x (p0, z1, x0)) ++ ++/* ++** min_1_s16_x_tied1: ++** smin z0\.h, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_s16_x_tied1, svint16_t, ++ z0 = svmin_n_s16_x (p0, z0, 1), ++ z0 = svmin_x (p0, z0, 1)) ++ ++/* ++** min_1_s16_x_untied: ++** movprfx z0, z1 ++** smin z0\.h, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_s16_x_untied, svint16_t, ++ z0 = svmin_n_s16_x (p0, z1, 1), ++ z0 = svmin_x (p0, z1, 1)) ++ ++/* ++** min_127_s16_x: ++** smin z0\.h, z0\.h, 
#127 ++** ret ++*/ ++TEST_UNIFORM_Z (min_127_s16_x, svint16_t, ++ z0 = svmin_n_s16_x (p0, z0, 127), ++ z0 = svmin_x (p0, z0, 127)) ++ ++/* ++** min_128_s16_x: ++** mov (z[0-9]+\.h), #128 ++** smin z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_128_s16_x, svint16_t, ++ z0 = svmin_n_s16_x (p0, z0, 128), ++ z0 = svmin_x (p0, z0, 128)) ++ ++/* ++** min_m1_s16_x: ++** smin z0\.h, z0\.h, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_m1_s16_x, svint16_t, ++ z0 = svmin_n_s16_x (p0, z0, -1), ++ z0 = svmin_x (p0, z0, -1)) ++ ++/* ++** min_m128_s16_x: ++** smin z0\.h, z0\.h, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (min_m128_s16_x, svint16_t, ++ z0 = svmin_n_s16_x (p0, z0, -128), ++ z0 = svmin_x (p0, z0, -128)) ++ ++/* ++** min_m129_s16_x: ++** mov (z[0-9]+\.h), #-129 ++** smin z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_m129_s16_x, svint16_t, ++ z0 = svmin_n_s16_x (p0, z0, -129), ++ z0 = svmin_x (p0, z0, -129)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_s32.c +new file mode 100644 +index 000000000..cee2b649d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_s32.c +@@ -0,0 +1,293 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** min_s32_m_tied1: ++** smin z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (min_s32_m_tied1, svint32_t, ++ z0 = svmin_s32_m (p0, z0, z1), ++ z0 = svmin_m (p0, z0, z1)) ++ ++/* ++** min_s32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** smin z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (min_s32_m_tied2, svint32_t, ++ z0 = svmin_s32_m (p0, z1, z0), ++ z0 = svmin_m (p0, z1, z0)) ++ ++/* ++** min_s32_m_untied: ++** movprfx z0, z1 ++** smin z0\.s, p0/m, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (min_s32_m_untied, svint32_t, ++ z0 = svmin_s32_m (p0, z1, z2), ++ z0 = svmin_m (p0, z1, z2)) ++ ++/* ++** min_w0_s32_m_tied1: ++** mov (z[0-9]+\.s), w0 ++** smin z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (min_w0_s32_m_tied1, svint32_t, int32_t, ++ z0 = svmin_n_s32_m (p0, z0, x0), ++ z0 = svmin_m (p0, z0, x0)) ++ ++/* ++** min_w0_s32_m_untied: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0, z1 ++** smin z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (min_w0_s32_m_untied, svint32_t, int32_t, ++ z0 = svmin_n_s32_m (p0, z1, x0), ++ z0 = svmin_m (p0, z1, x0)) ++ ++/* ++** min_1_s32_m_tied1: ++** mov (z[0-9]+\.s), #1 ++** smin z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_s32_m_tied1, svint32_t, ++ z0 = svmin_n_s32_m (p0, z0, 1), ++ z0 = svmin_m (p0, z0, 1)) ++ ++/* ++** min_1_s32_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.s), #1 ++** movprfx z0, z1 ++** smin z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_s32_m_untied, svint32_t, ++ z0 = svmin_n_s32_m (p0, z1, 1), ++ z0 = svmin_m (p0, z1, 1)) ++ ++/* ++** min_m1_s32_m: ++** mov (z[0-9]+)\.b, #-1 ++** smin z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (min_m1_s32_m, svint32_t, ++ z0 = svmin_n_s32_m (p0, z0, -1), ++ z0 = svmin_m (p0, z0, -1)) ++ ++/* ++** min_s32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** smin z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (min_s32_z_tied1, svint32_t, ++ z0 = svmin_s32_z (p0, z0, z1), ++ z0 = svmin_z (p0, z0, z1)) ++ ++/* ++** min_s32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** smin z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (min_s32_z_tied2, svint32_t, ++ z0 = svmin_s32_z (p0, z1, z0), ++ z0 = svmin_z (p0, 
z1, z0)) ++ ++/* ++** min_s32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** smin z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** smin z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (min_s32_z_untied, svint32_t, ++ z0 = svmin_s32_z (p0, z1, z2), ++ z0 = svmin_z (p0, z1, z2)) ++ ++/* ++** min_w0_s32_z_tied1: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0\.s, p0/z, z0\.s ++** smin z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (min_w0_s32_z_tied1, svint32_t, int32_t, ++ z0 = svmin_n_s32_z (p0, z0, x0), ++ z0 = svmin_z (p0, z0, x0)) ++ ++/* ++** min_w0_s32_z_untied: ++** mov (z[0-9]+\.s), w0 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** smin z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** smin z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (min_w0_s32_z_untied, svint32_t, int32_t, ++ z0 = svmin_n_s32_z (p0, z1, x0), ++ z0 = svmin_z (p0, z1, x0)) ++ ++/* ++** min_1_s32_z_tied1: ++** mov (z[0-9]+\.s), #1 ++** movprfx z0\.s, p0/z, z0\.s ++** smin z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_s32_z_tied1, svint32_t, ++ z0 = svmin_n_s32_z (p0, z0, 1), ++ z0 = svmin_z (p0, z0, 1)) ++ ++/* ++** min_1_s32_z_untied: ++** mov (z[0-9]+\.s), #1 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** smin z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** smin z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_s32_z_untied, svint32_t, ++ z0 = svmin_n_s32_z (p0, z1, 1), ++ z0 = svmin_z (p0, z1, 1)) ++ ++/* ++** min_s32_x_tied1: ++** smin z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (min_s32_x_tied1, svint32_t, ++ z0 = svmin_s32_x (p0, z0, z1), ++ z0 = svmin_x (p0, z0, z1)) ++ ++/* ++** min_s32_x_tied2: ++** smin z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (min_s32_x_tied2, svint32_t, ++ z0 = svmin_s32_x (p0, z1, z0), ++ z0 = svmin_x (p0, z1, z0)) ++ ++/* ++** min_s32_x_untied: ++** ( ++** movprfx z0, z1 ++** smin z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0, z2 ++** smin z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (min_s32_x_untied, svint32_t, ++ z0 = svmin_s32_x (p0, z1, z2), ++ z0 = svmin_x (p0, z1, z2)) ++ ++/* ++** min_w0_s32_x_tied1: ++** mov (z[0-9]+\.s), w0 ++** smin z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (min_w0_s32_x_tied1, svint32_t, int32_t, ++ z0 = svmin_n_s32_x (p0, z0, x0), ++ z0 = svmin_x (p0, z0, x0)) ++ ++/* ++** min_w0_s32_x_untied: ++** mov z0\.s, w0 ++** smin z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZX (min_w0_s32_x_untied, svint32_t, int32_t, ++ z0 = svmin_n_s32_x (p0, z1, x0), ++ z0 = svmin_x (p0, z1, x0)) ++ ++/* ++** min_1_s32_x_tied1: ++** smin z0\.s, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_s32_x_tied1, svint32_t, ++ z0 = svmin_n_s32_x (p0, z0, 1), ++ z0 = svmin_x (p0, z0, 1)) ++ ++/* ++** min_1_s32_x_untied: ++** movprfx z0, z1 ++** smin z0\.s, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_s32_x_untied, svint32_t, ++ z0 = svmin_n_s32_x (p0, z1, 1), ++ z0 = svmin_x (p0, z1, 1)) ++ ++/* ++** min_127_s32_x: ++** smin z0\.s, z0\.s, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (min_127_s32_x, svint32_t, ++ z0 = svmin_n_s32_x (p0, z0, 127), ++ z0 = svmin_x (p0, z0, 127)) ++ ++/* ++** min_128_s32_x: ++** mov (z[0-9]+\.s), #128 ++** smin z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_128_s32_x, svint32_t, ++ z0 = svmin_n_s32_x (p0, z0, 128), ++ z0 = svmin_x (p0, z0, 128)) ++ ++/* ++** min_m1_s32_x: ++** smin z0\.s, z0\.s, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_m1_s32_x, svint32_t, 
++ z0 = svmin_n_s32_x (p0, z0, -1), ++ z0 = svmin_x (p0, z0, -1)) ++ ++/* ++** min_m128_s32_x: ++** smin z0\.s, z0\.s, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (min_m128_s32_x, svint32_t, ++ z0 = svmin_n_s32_x (p0, z0, -128), ++ z0 = svmin_x (p0, z0, -128)) ++ ++/* ++** min_m129_s32_x: ++** mov (z[0-9]+\.s), #-129 ++** smin z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_m129_s32_x, svint32_t, ++ z0 = svmin_n_s32_x (p0, z0, -129), ++ z0 = svmin_x (p0, z0, -129)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_s64.c +new file mode 100644 +index 000000000..0d20bd0b2 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_s64.c +@@ -0,0 +1,293 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** min_s64_m_tied1: ++** smin z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (min_s64_m_tied1, svint64_t, ++ z0 = svmin_s64_m (p0, z0, z1), ++ z0 = svmin_m (p0, z0, z1)) ++ ++/* ++** min_s64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** smin z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_s64_m_tied2, svint64_t, ++ z0 = svmin_s64_m (p0, z1, z0), ++ z0 = svmin_m (p0, z1, z0)) ++ ++/* ++** min_s64_m_untied: ++** movprfx z0, z1 ++** smin z0\.d, p0/m, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (min_s64_m_untied, svint64_t, ++ z0 = svmin_s64_m (p0, z1, z2), ++ z0 = svmin_m (p0, z1, z2)) ++ ++/* ++** min_x0_s64_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** smin z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (min_x0_s64_m_tied1, svint64_t, int64_t, ++ z0 = svmin_n_s64_m (p0, z0, x0), ++ z0 = svmin_m (p0, z0, x0)) ++ ++/* ++** min_x0_s64_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** smin z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (min_x0_s64_m_untied, svint64_t, int64_t, ++ z0 = svmin_n_s64_m (p0, z1, x0), ++ z0 = svmin_m (p0, z1, x0)) ++ ++/* ++** min_1_s64_m_tied1: ++** mov (z[0-9]+\.d), #1 ++** smin z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_s64_m_tied1, svint64_t, ++ z0 = svmin_n_s64_m (p0, z0, 1), ++ z0 = svmin_m (p0, z0, 1)) ++ ++/* ++** min_1_s64_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #1 ++** movprfx z0, z1 ++** smin z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_s64_m_untied, svint64_t, ++ z0 = svmin_n_s64_m (p0, z1, 1), ++ z0 = svmin_m (p0, z1, 1)) ++ ++/* ++** min_m1_s64_m: ++** mov (z[0-9]+)\.b, #-1 ++** smin z0\.d, p0/m, z0\.d, \1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (min_m1_s64_m, svint64_t, ++ z0 = svmin_n_s64_m (p0, z0, -1), ++ z0 = svmin_m (p0, z0, -1)) ++ ++/* ++** min_s64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** smin z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (min_s64_z_tied1, svint64_t, ++ z0 = svmin_s64_z (p0, z0, z1), ++ z0 = svmin_z (p0, z0, z1)) ++ ++/* ++** min_s64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** smin z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (min_s64_z_tied2, svint64_t, ++ z0 = svmin_s64_z (p0, z1, z0), ++ z0 = svmin_z (p0, z1, z0)) ++ ++/* ++** min_s64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** smin z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** smin z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (min_s64_z_untied, svint64_t, ++ z0 = svmin_s64_z (p0, z1, z2), ++ z0 = svmin_z (p0, z1, z2)) ++ ++/* ++** min_x0_s64_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z0\.d ++** smin z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ 
++TEST_UNIFORM_ZX (min_x0_s64_z_tied1, svint64_t, int64_t, ++ z0 = svmin_n_s64_z (p0, z0, x0), ++ z0 = svmin_z (p0, z0, x0)) ++ ++/* ++** min_x0_s64_z_untied: ++** mov (z[0-9]+\.d), x0 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** smin z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** smin z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (min_x0_s64_z_untied, svint64_t, int64_t, ++ z0 = svmin_n_s64_z (p0, z1, x0), ++ z0 = svmin_z (p0, z1, x0)) ++ ++/* ++** min_1_s64_z_tied1: ++** mov (z[0-9]+\.d), #1 ++** movprfx z0\.d, p0/z, z0\.d ++** smin z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_s64_z_tied1, svint64_t, ++ z0 = svmin_n_s64_z (p0, z0, 1), ++ z0 = svmin_z (p0, z0, 1)) ++ ++/* ++** min_1_s64_z_untied: ++** mov (z[0-9]+\.d), #1 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** smin z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** smin z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_s64_z_untied, svint64_t, ++ z0 = svmin_n_s64_z (p0, z1, 1), ++ z0 = svmin_z (p0, z1, 1)) ++ ++/* ++** min_s64_x_tied1: ++** smin z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (min_s64_x_tied1, svint64_t, ++ z0 = svmin_s64_x (p0, z0, z1), ++ z0 = svmin_x (p0, z0, z1)) ++ ++/* ++** min_s64_x_tied2: ++** smin z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (min_s64_x_tied2, svint64_t, ++ z0 = svmin_s64_x (p0, z1, z0), ++ z0 = svmin_x (p0, z1, z0)) ++ ++/* ++** min_s64_x_untied: ++** ( ++** movprfx z0, z1 ++** smin z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0, z2 ++** smin z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (min_s64_x_untied, svint64_t, ++ z0 = svmin_s64_x (p0, z1, z2), ++ z0 = svmin_x (p0, z1, z2)) ++ ++/* ++** min_x0_s64_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** smin z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (min_x0_s64_x_tied1, svint64_t, int64_t, ++ z0 = svmin_n_s64_x (p0, z0, x0), ++ z0 = svmin_x (p0, z0, x0)) ++ ++/* ++** min_x0_s64_x_untied: ++** mov z0\.d, x0 ++** smin z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (min_x0_s64_x_untied, svint64_t, int64_t, ++ z0 = svmin_n_s64_x (p0, z1, x0), ++ z0 = svmin_x (p0, z1, x0)) ++ ++/* ++** min_1_s64_x_tied1: ++** smin z0\.d, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_s64_x_tied1, svint64_t, ++ z0 = svmin_n_s64_x (p0, z0, 1), ++ z0 = svmin_x (p0, z0, 1)) ++ ++/* ++** min_1_s64_x_untied: ++** movprfx z0, z1 ++** smin z0\.d, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_s64_x_untied, svint64_t, ++ z0 = svmin_n_s64_x (p0, z1, 1), ++ z0 = svmin_x (p0, z1, 1)) ++ ++/* ++** min_127_s64_x: ++** smin z0\.d, z0\.d, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (min_127_s64_x, svint64_t, ++ z0 = svmin_n_s64_x (p0, z0, 127), ++ z0 = svmin_x (p0, z0, 127)) ++ ++/* ++** min_128_s64_x: ++** mov (z[0-9]+\.d), #128 ++** smin z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_128_s64_x, svint64_t, ++ z0 = svmin_n_s64_x (p0, z0, 128), ++ z0 = svmin_x (p0, z0, 128)) ++ ++/* ++** min_m1_s64_x: ++** smin z0\.d, z0\.d, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_m1_s64_x, svint64_t, ++ z0 = svmin_n_s64_x (p0, z0, -1), ++ z0 = svmin_x (p0, z0, -1)) ++ ++/* ++** min_m128_s64_x: ++** smin z0\.d, z0\.d, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (min_m128_s64_x, svint64_t, ++ z0 = svmin_n_s64_x (p0, z0, -128), ++ z0 = svmin_x (p0, z0, -128)) ++ ++/* ++** min_m129_s64_x: ++** mov (z[0-9]+\.d), #-129 ++** smin z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_m129_s64_x, svint64_t, ++ z0 = svmin_n_s64_x (p0, z0, -129), ++ z0 = svmin_x 
(p0, z0, -129)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_s8.c +new file mode 100644 +index 000000000..714b1576d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_s8.c +@@ -0,0 +1,273 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** min_s8_m_tied1: ++** smin z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (min_s8_m_tied1, svint8_t, ++ z0 = svmin_s8_m (p0, z0, z1), ++ z0 = svmin_m (p0, z0, z1)) ++ ++/* ++** min_s8_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** smin z0\.b, p0/m, z0\.b, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (min_s8_m_tied2, svint8_t, ++ z0 = svmin_s8_m (p0, z1, z0), ++ z0 = svmin_m (p0, z1, z0)) ++ ++/* ++** min_s8_m_untied: ++** movprfx z0, z1 ++** smin z0\.b, p0/m, z0\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (min_s8_m_untied, svint8_t, ++ z0 = svmin_s8_m (p0, z1, z2), ++ z0 = svmin_m (p0, z1, z2)) ++ ++/* ++** min_w0_s8_m_tied1: ++** mov (z[0-9]+\.b), w0 ++** smin z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (min_w0_s8_m_tied1, svint8_t, int8_t, ++ z0 = svmin_n_s8_m (p0, z0, x0), ++ z0 = svmin_m (p0, z0, x0)) ++ ++/* ++** min_w0_s8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), w0 ++** movprfx z0, z1 ++** smin z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (min_w0_s8_m_untied, svint8_t, int8_t, ++ z0 = svmin_n_s8_m (p0, z1, x0), ++ z0 = svmin_m (p0, z1, x0)) ++ ++/* ++** min_1_s8_m_tied1: ++** mov (z[0-9]+\.b), #1 ++** smin z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_s8_m_tied1, svint8_t, ++ z0 = svmin_n_s8_m (p0, z0, 1), ++ z0 = svmin_m (p0, z0, 1)) ++ ++/* ++** min_1_s8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), #1 ++** movprfx z0, z1 ++** smin z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_s8_m_untied, svint8_t, ++ z0 = svmin_n_s8_m (p0, z1, 1), ++ z0 = svmin_m (p0, z1, 1)) ++ ++/* ++** min_m1_s8_m: ++** mov (z[0-9]+\.b), #-1 ++** smin z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_m1_s8_m, svint8_t, ++ z0 = svmin_n_s8_m (p0, z0, -1), ++ z0 = svmin_m (p0, z0, -1)) ++ ++/* ++** min_s8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** smin z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (min_s8_z_tied1, svint8_t, ++ z0 = svmin_s8_z (p0, z0, z1), ++ z0 = svmin_z (p0, z0, z1)) ++ ++/* ++** min_s8_z_tied2: ++** movprfx z0\.b, p0/z, z0\.b ++** smin z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (min_s8_z_tied2, svint8_t, ++ z0 = svmin_s8_z (p0, z1, z0), ++ z0 = svmin_z (p0, z1, z0)) ++ ++/* ++** min_s8_z_untied: ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** smin z0\.b, p0/m, z0\.b, z2\.b ++** | ++** movprfx z0\.b, p0/z, z2\.b ++** smin z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (min_s8_z_untied, svint8_t, ++ z0 = svmin_s8_z (p0, z1, z2), ++ z0 = svmin_z (p0, z1, z2)) ++ ++/* ++** min_w0_s8_z_tied1: ++** mov (z[0-9]+\.b), w0 ++** movprfx z0\.b, p0/z, z0\.b ++** smin z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (min_w0_s8_z_tied1, svint8_t, int8_t, ++ z0 = svmin_n_s8_z (p0, z0, x0), ++ z0 = svmin_z (p0, z0, x0)) ++ ++/* ++** min_w0_s8_z_untied: ++** mov (z[0-9]+\.b), w0 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** smin z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** smin z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (min_w0_s8_z_untied, svint8_t, int8_t, ++ z0 = svmin_n_s8_z (p0, z1, x0), ++ z0 = svmin_z (p0, z1, x0)) ++ ++/* ++** 
min_1_s8_z_tied1: ++** mov (z[0-9]+\.b), #1 ++** movprfx z0\.b, p0/z, z0\.b ++** smin z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_s8_z_tied1, svint8_t, ++ z0 = svmin_n_s8_z (p0, z0, 1), ++ z0 = svmin_z (p0, z0, 1)) ++ ++/* ++** min_1_s8_z_untied: ++** mov (z[0-9]+\.b), #1 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** smin z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** smin z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_s8_z_untied, svint8_t, ++ z0 = svmin_n_s8_z (p0, z1, 1), ++ z0 = svmin_z (p0, z1, 1)) ++ ++/* ++** min_s8_x_tied1: ++** smin z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (min_s8_x_tied1, svint8_t, ++ z0 = svmin_s8_x (p0, z0, z1), ++ z0 = svmin_x (p0, z0, z1)) ++ ++/* ++** min_s8_x_tied2: ++** smin z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (min_s8_x_tied2, svint8_t, ++ z0 = svmin_s8_x (p0, z1, z0), ++ z0 = svmin_x (p0, z1, z0)) ++ ++/* ++** min_s8_x_untied: ++** ( ++** movprfx z0, z1 ++** smin z0\.b, p0/m, z0\.b, z2\.b ++** | ++** movprfx z0, z2 ++** smin z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (min_s8_x_untied, svint8_t, ++ z0 = svmin_s8_x (p0, z1, z2), ++ z0 = svmin_x (p0, z1, z2)) ++ ++/* ++** min_w0_s8_x_tied1: ++** mov (z[0-9]+\.b), w0 ++** smin z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (min_w0_s8_x_tied1, svint8_t, int8_t, ++ z0 = svmin_n_s8_x (p0, z0, x0), ++ z0 = svmin_x (p0, z0, x0)) ++ ++/* ++** min_w0_s8_x_untied: ++** mov z0\.b, w0 ++** smin z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_ZX (min_w0_s8_x_untied, svint8_t, int8_t, ++ z0 = svmin_n_s8_x (p0, z1, x0), ++ z0 = svmin_x (p0, z1, x0)) ++ ++/* ++** min_1_s8_x_tied1: ++** smin z0\.b, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_s8_x_tied1, svint8_t, ++ z0 = svmin_n_s8_x (p0, z0, 1), ++ z0 = svmin_x (p0, z0, 1)) ++ ++/* ++** min_1_s8_x_untied: ++** movprfx z0, z1 ++** smin z0\.b, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_s8_x_untied, svint8_t, ++ z0 = svmin_n_s8_x (p0, z1, 1), ++ z0 = svmin_x (p0, z1, 1)) ++ ++/* ++** min_127_s8_x: ++** smin z0\.b, z0\.b, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (min_127_s8_x, svint8_t, ++ z0 = svmin_n_s8_x (p0, z0, 127), ++ z0 = svmin_x (p0, z0, 127)) ++ ++/* ++** min_m1_s8_x: ++** smin z0\.b, z0\.b, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_m1_s8_x, svint8_t, ++ z0 = svmin_n_s8_x (p0, z0, -1), ++ z0 = svmin_x (p0, z0, -1)) ++ ++/* ++** min_m127_s8_x: ++** smin z0\.b, z0\.b, #-127 ++** ret ++*/ ++TEST_UNIFORM_Z (min_m127_s8_x, svint8_t, ++ z0 = svmin_n_s8_x (p0, z0, -127), ++ z0 = svmin_x (p0, z0, -127)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_u16.c +new file mode 100644 +index 000000000..df35cf113 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_u16.c +@@ -0,0 +1,293 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** min_u16_m_tied1: ++** umin z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (min_u16_m_tied1, svuint16_t, ++ z0 = svmin_u16_m (p0, z0, z1), ++ z0 = svmin_m (p0, z0, z1)) ++ ++/* ++** min_u16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** umin z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (min_u16_m_tied2, svuint16_t, ++ z0 = svmin_u16_m (p0, z1, z0), ++ z0 = svmin_m (p0, z1, z0)) ++ ++/* ++** min_u16_m_untied: ++** movprfx z0, z1 ++** umin z0\.h, p0/m, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (min_u16_m_untied, svuint16_t, ++ z0 
= svmin_u16_m (p0, z1, z2), ++ z0 = svmin_m (p0, z1, z2)) ++ ++/* ++** min_w0_u16_m_tied1: ++** mov (z[0-9]+\.h), w0 ++** umin z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (min_w0_u16_m_tied1, svuint16_t, uint16_t, ++ z0 = svmin_n_u16_m (p0, z0, x0), ++ z0 = svmin_m (p0, z0, x0)) ++ ++/* ++** min_w0_u16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), w0 ++** movprfx z0, z1 ++** umin z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (min_w0_u16_m_untied, svuint16_t, uint16_t, ++ z0 = svmin_n_u16_m (p0, z1, x0), ++ z0 = svmin_m (p0, z1, x0)) ++ ++/* ++** min_1_u16_m_tied1: ++** mov (z[0-9]+\.h), #1 ++** umin z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_u16_m_tied1, svuint16_t, ++ z0 = svmin_n_u16_m (p0, z0, 1), ++ z0 = svmin_m (p0, z0, 1)) ++ ++/* ++** min_1_u16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), #1 ++** movprfx z0, z1 ++** umin z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_u16_m_untied, svuint16_t, ++ z0 = svmin_n_u16_m (p0, z1, 1), ++ z0 = svmin_m (p0, z1, 1)) ++ ++/* ++** min_m1_u16_m: ++** mov (z[0-9]+)\.b, #-1 ++** umin z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (min_m1_u16_m, svuint16_t, ++ z0 = svmin_n_u16_m (p0, z0, -1), ++ z0 = svmin_m (p0, z0, -1)) ++ ++/* ++** min_u16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** umin z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (min_u16_z_tied1, svuint16_t, ++ z0 = svmin_u16_z (p0, z0, z1), ++ z0 = svmin_z (p0, z0, z1)) ++ ++/* ++** min_u16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** umin z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (min_u16_z_tied2, svuint16_t, ++ z0 = svmin_u16_z (p0, z1, z0), ++ z0 = svmin_z (p0, z1, z0)) ++ ++/* ++** min_u16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** umin z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** umin z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (min_u16_z_untied, svuint16_t, ++ z0 = svmin_u16_z (p0, z1, z2), ++ z0 = svmin_z (p0, z1, z2)) ++ ++/* ++** min_w0_u16_z_tied1: ++** mov (z[0-9]+\.h), w0 ++** movprfx z0\.h, p0/z, z0\.h ++** umin z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (min_w0_u16_z_tied1, svuint16_t, uint16_t, ++ z0 = svmin_n_u16_z (p0, z0, x0), ++ z0 = svmin_z (p0, z0, x0)) ++ ++/* ++** min_w0_u16_z_untied: ++** mov (z[0-9]+\.h), w0 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** umin z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** umin z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (min_w0_u16_z_untied, svuint16_t, uint16_t, ++ z0 = svmin_n_u16_z (p0, z1, x0), ++ z0 = svmin_z (p0, z1, x0)) ++ ++/* ++** min_1_u16_z_tied1: ++** mov (z[0-9]+\.h), #1 ++** movprfx z0\.h, p0/z, z0\.h ++** umin z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_u16_z_tied1, svuint16_t, ++ z0 = svmin_n_u16_z (p0, z0, 1), ++ z0 = svmin_z (p0, z0, 1)) ++ ++/* ++** min_1_u16_z_untied: ++** mov (z[0-9]+\.h), #1 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** umin z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** umin z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_u16_z_untied, svuint16_t, ++ z0 = svmin_n_u16_z (p0, z1, 1), ++ z0 = svmin_z (p0, z1, 1)) ++ ++/* ++** min_u16_x_tied1: ++** umin z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (min_u16_x_tied1, svuint16_t, ++ z0 = svmin_u16_x (p0, z0, z1), ++ z0 = svmin_x (p0, z0, z1)) ++ ++/* ++** min_u16_x_tied2: ++** umin z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (min_u16_x_tied2, svuint16_t, ++ z0 = svmin_u16_x 
(p0, z1, z0), ++ z0 = svmin_x (p0, z1, z0)) ++ ++/* ++** min_u16_x_untied: ++** ( ++** movprfx z0, z1 ++** umin z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0, z2 ++** umin z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (min_u16_x_untied, svuint16_t, ++ z0 = svmin_u16_x (p0, z1, z2), ++ z0 = svmin_x (p0, z1, z2)) ++ ++/* ++** min_w0_u16_x_tied1: ++** mov (z[0-9]+\.h), w0 ++** umin z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (min_w0_u16_x_tied1, svuint16_t, uint16_t, ++ z0 = svmin_n_u16_x (p0, z0, x0), ++ z0 = svmin_x (p0, z0, x0)) ++ ++/* ++** min_w0_u16_x_untied: ++** mov z0\.h, w0 ++** umin z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZX (min_w0_u16_x_untied, svuint16_t, uint16_t, ++ z0 = svmin_n_u16_x (p0, z1, x0), ++ z0 = svmin_x (p0, z1, x0)) ++ ++/* ++** min_1_u16_x_tied1: ++** umin z0\.h, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_u16_x_tied1, svuint16_t, ++ z0 = svmin_n_u16_x (p0, z0, 1), ++ z0 = svmin_x (p0, z0, 1)) ++ ++/* ++** min_1_u16_x_untied: ++** movprfx z0, z1 ++** umin z0\.h, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_u16_x_untied, svuint16_t, ++ z0 = svmin_n_u16_x (p0, z1, 1), ++ z0 = svmin_x (p0, z1, 1)) ++ ++/* ++** min_127_u16_x: ++** umin z0\.h, z0\.h, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (min_127_u16_x, svuint16_t, ++ z0 = svmin_n_u16_x (p0, z0, 127), ++ z0 = svmin_x (p0, z0, 127)) ++ ++/* ++** min_128_u16_x: ++** umin z0\.h, z0\.h, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (min_128_u16_x, svuint16_t, ++ z0 = svmin_n_u16_x (p0, z0, 128), ++ z0 = svmin_x (p0, z0, 128)) ++ ++/* ++** min_255_u16_x: ++** umin z0\.h, z0\.h, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (min_255_u16_x, svuint16_t, ++ z0 = svmin_n_u16_x (p0, z0, 255), ++ z0 = svmin_x (p0, z0, 255)) ++ ++/* ++** min_256_u16_x: ++** mov (z[0-9]+\.h), #256 ++** umin z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_256_u16_x, svuint16_t, ++ z0 = svmin_n_u16_x (p0, z0, 256), ++ z0 = svmin_x (p0, z0, 256)) ++ ++/* ++** min_m2_u16_x: ++** mov (z[0-9]+\.h), #-2 ++** umin z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_m2_u16_x, svuint16_t, ++ z0 = svmin_n_u16_x (p0, z0, -2), ++ z0 = svmin_x (p0, z0, -2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_u32.c +new file mode 100644 +index 000000000..7f84d099d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_u32.c +@@ -0,0 +1,293 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** min_u32_m_tied1: ++** umin z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (min_u32_m_tied1, svuint32_t, ++ z0 = svmin_u32_m (p0, z0, z1), ++ z0 = svmin_m (p0, z0, z1)) ++ ++/* ++** min_u32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** umin z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (min_u32_m_tied2, svuint32_t, ++ z0 = svmin_u32_m (p0, z1, z0), ++ z0 = svmin_m (p0, z1, z0)) ++ ++/* ++** min_u32_m_untied: ++** movprfx z0, z1 ++** umin z0\.s, p0/m, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (min_u32_m_untied, svuint32_t, ++ z0 = svmin_u32_m (p0, z1, z2), ++ z0 = svmin_m (p0, z1, z2)) ++ ++/* ++** min_w0_u32_m_tied1: ++** mov (z[0-9]+\.s), w0 ++** umin z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (min_w0_u32_m_tied1, svuint32_t, uint32_t, ++ z0 = svmin_n_u32_m (p0, z0, x0), ++ z0 = svmin_m (p0, z0, x0)) ++ ++/* ++** min_w0_u32_m_untied: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0, z1 ++** umin z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ 
++TEST_UNIFORM_ZX (min_w0_u32_m_untied, svuint32_t, uint32_t, ++ z0 = svmin_n_u32_m (p0, z1, x0), ++ z0 = svmin_m (p0, z1, x0)) ++ ++/* ++** min_1_u32_m_tied1: ++** mov (z[0-9]+\.s), #1 ++** umin z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_u32_m_tied1, svuint32_t, ++ z0 = svmin_n_u32_m (p0, z0, 1), ++ z0 = svmin_m (p0, z0, 1)) ++ ++/* ++** min_1_u32_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.s), #1 ++** movprfx z0, z1 ++** umin z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_u32_m_untied, svuint32_t, ++ z0 = svmin_n_u32_m (p0, z1, 1), ++ z0 = svmin_m (p0, z1, 1)) ++ ++/* ++** min_m1_u32_m: ++** mov (z[0-9]+)\.b, #-1 ++** umin z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (min_m1_u32_m, svuint32_t, ++ z0 = svmin_n_u32_m (p0, z0, -1), ++ z0 = svmin_m (p0, z0, -1)) ++ ++/* ++** min_u32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** umin z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (min_u32_z_tied1, svuint32_t, ++ z0 = svmin_u32_z (p0, z0, z1), ++ z0 = svmin_z (p0, z0, z1)) ++ ++/* ++** min_u32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** umin z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (min_u32_z_tied2, svuint32_t, ++ z0 = svmin_u32_z (p0, z1, z0), ++ z0 = svmin_z (p0, z1, z0)) ++ ++/* ++** min_u32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** umin z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** umin z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (min_u32_z_untied, svuint32_t, ++ z0 = svmin_u32_z (p0, z1, z2), ++ z0 = svmin_z (p0, z1, z2)) ++ ++/* ++** min_w0_u32_z_tied1: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0\.s, p0/z, z0\.s ++** umin z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (min_w0_u32_z_tied1, svuint32_t, uint32_t, ++ z0 = svmin_n_u32_z (p0, z0, x0), ++ z0 = svmin_z (p0, z0, x0)) ++ ++/* ++** min_w0_u32_z_untied: ++** mov (z[0-9]+\.s), w0 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** umin z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** umin z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (min_w0_u32_z_untied, svuint32_t, uint32_t, ++ z0 = svmin_n_u32_z (p0, z1, x0), ++ z0 = svmin_z (p0, z1, x0)) ++ ++/* ++** min_1_u32_z_tied1: ++** mov (z[0-9]+\.s), #1 ++** movprfx z0\.s, p0/z, z0\.s ++** umin z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_u32_z_tied1, svuint32_t, ++ z0 = svmin_n_u32_z (p0, z0, 1), ++ z0 = svmin_z (p0, z0, 1)) ++ ++/* ++** min_1_u32_z_untied: ++** mov (z[0-9]+\.s), #1 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** umin z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** umin z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_u32_z_untied, svuint32_t, ++ z0 = svmin_n_u32_z (p0, z1, 1), ++ z0 = svmin_z (p0, z1, 1)) ++ ++/* ++** min_u32_x_tied1: ++** umin z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (min_u32_x_tied1, svuint32_t, ++ z0 = svmin_u32_x (p0, z0, z1), ++ z0 = svmin_x (p0, z0, z1)) ++ ++/* ++** min_u32_x_tied2: ++** umin z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (min_u32_x_tied2, svuint32_t, ++ z0 = svmin_u32_x (p0, z1, z0), ++ z0 = svmin_x (p0, z1, z0)) ++ ++/* ++** min_u32_x_untied: ++** ( ++** movprfx z0, z1 ++** umin z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0, z2 ++** umin z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (min_u32_x_untied, svuint32_t, ++ z0 = svmin_u32_x (p0, z1, z2), ++ z0 = svmin_x (p0, z1, z2)) ++ ++/* ++** min_w0_u32_x_tied1: ++** mov (z[0-9]+\.s), w0 ++** umin z0\.s, p0/m, z0\.s, \1 ++** ret 
++*/ ++TEST_UNIFORM_ZX (min_w0_u32_x_tied1, svuint32_t, uint32_t, ++ z0 = svmin_n_u32_x (p0, z0, x0), ++ z0 = svmin_x (p0, z0, x0)) ++ ++/* ++** min_w0_u32_x_untied: ++** mov z0\.s, w0 ++** umin z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZX (min_w0_u32_x_untied, svuint32_t, uint32_t, ++ z0 = svmin_n_u32_x (p0, z1, x0), ++ z0 = svmin_x (p0, z1, x0)) ++ ++/* ++** min_1_u32_x_tied1: ++** umin z0\.s, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_u32_x_tied1, svuint32_t, ++ z0 = svmin_n_u32_x (p0, z0, 1), ++ z0 = svmin_x (p0, z0, 1)) ++ ++/* ++** min_1_u32_x_untied: ++** movprfx z0, z1 ++** umin z0\.s, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_u32_x_untied, svuint32_t, ++ z0 = svmin_n_u32_x (p0, z1, 1), ++ z0 = svmin_x (p0, z1, 1)) ++ ++/* ++** min_127_u32_x: ++** umin z0\.s, z0\.s, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (min_127_u32_x, svuint32_t, ++ z0 = svmin_n_u32_x (p0, z0, 127), ++ z0 = svmin_x (p0, z0, 127)) ++ ++/* ++** min_128_u32_x: ++** umin z0\.s, z0\.s, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (min_128_u32_x, svuint32_t, ++ z0 = svmin_n_u32_x (p0, z0, 128), ++ z0 = svmin_x (p0, z0, 128)) ++ ++/* ++** min_255_u32_x: ++** umin z0\.s, z0\.s, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (min_255_u32_x, svuint32_t, ++ z0 = svmin_n_u32_x (p0, z0, 255), ++ z0 = svmin_x (p0, z0, 255)) ++ ++/* ++** min_256_u32_x: ++** mov (z[0-9]+\.s), #256 ++** umin z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_256_u32_x, svuint32_t, ++ z0 = svmin_n_u32_x (p0, z0, 256), ++ z0 = svmin_x (p0, z0, 256)) ++ ++/* ++** min_m2_u32_x: ++** mov (z[0-9]+\.s), #-2 ++** umin z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_m2_u32_x, svuint32_t, ++ z0 = svmin_n_u32_x (p0, z0, -2), ++ z0 = svmin_x (p0, z0, -2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_u64.c +new file mode 100644 +index 000000000..06e6e5099 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_u64.c +@@ -0,0 +1,293 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** min_u64_m_tied1: ++** umin z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (min_u64_m_tied1, svuint64_t, ++ z0 = svmin_u64_m (p0, z0, z1), ++ z0 = svmin_m (p0, z0, z1)) ++ ++/* ++** min_u64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** umin z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_u64_m_tied2, svuint64_t, ++ z0 = svmin_u64_m (p0, z1, z0), ++ z0 = svmin_m (p0, z1, z0)) ++ ++/* ++** min_u64_m_untied: ++** movprfx z0, z1 ++** umin z0\.d, p0/m, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (min_u64_m_untied, svuint64_t, ++ z0 = svmin_u64_m (p0, z1, z2), ++ z0 = svmin_m (p0, z1, z2)) ++ ++/* ++** min_x0_u64_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** umin z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (min_x0_u64_m_tied1, svuint64_t, uint64_t, ++ z0 = svmin_n_u64_m (p0, z0, x0), ++ z0 = svmin_m (p0, z0, x0)) ++ ++/* ++** min_x0_u64_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** umin z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (min_x0_u64_m_untied, svuint64_t, uint64_t, ++ z0 = svmin_n_u64_m (p0, z1, x0), ++ z0 = svmin_m (p0, z1, x0)) ++ ++/* ++** min_1_u64_m_tied1: ++** mov (z[0-9]+\.d), #1 ++** umin z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_u64_m_tied1, svuint64_t, ++ z0 = svmin_n_u64_m (p0, z0, 1), ++ z0 = svmin_m (p0, z0, 1)) ++ ++/* ++** min_1_u64_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #1 ++** movprfx z0, z1 
++** umin z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_u64_m_untied, svuint64_t, ++ z0 = svmin_n_u64_m (p0, z1, 1), ++ z0 = svmin_m (p0, z1, 1)) ++ ++/* ++** min_m1_u64_m: ++** mov (z[0-9]+)\.b, #-1 ++** umin z0\.d, p0/m, z0\.d, \1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (min_m1_u64_m, svuint64_t, ++ z0 = svmin_n_u64_m (p0, z0, -1), ++ z0 = svmin_m (p0, z0, -1)) ++ ++/* ++** min_u64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** umin z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (min_u64_z_tied1, svuint64_t, ++ z0 = svmin_u64_z (p0, z0, z1), ++ z0 = svmin_z (p0, z0, z1)) ++ ++/* ++** min_u64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** umin z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (min_u64_z_tied2, svuint64_t, ++ z0 = svmin_u64_z (p0, z1, z0), ++ z0 = svmin_z (p0, z1, z0)) ++ ++/* ++** min_u64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** umin z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** umin z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (min_u64_z_untied, svuint64_t, ++ z0 = svmin_u64_z (p0, z1, z2), ++ z0 = svmin_z (p0, z1, z2)) ++ ++/* ++** min_x0_u64_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z0\.d ++** umin z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (min_x0_u64_z_tied1, svuint64_t, uint64_t, ++ z0 = svmin_n_u64_z (p0, z0, x0), ++ z0 = svmin_z (p0, z0, x0)) ++ ++/* ++** min_x0_u64_z_untied: ++** mov (z[0-9]+\.d), x0 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** umin z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** umin z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (min_x0_u64_z_untied, svuint64_t, uint64_t, ++ z0 = svmin_n_u64_z (p0, z1, x0), ++ z0 = svmin_z (p0, z1, x0)) ++ ++/* ++** min_1_u64_z_tied1: ++** mov (z[0-9]+\.d), #1 ++** movprfx z0\.d, p0/z, z0\.d ++** umin z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_u64_z_tied1, svuint64_t, ++ z0 = svmin_n_u64_z (p0, z0, 1), ++ z0 = svmin_z (p0, z0, 1)) ++ ++/* ++** min_1_u64_z_untied: ++** mov (z[0-9]+\.d), #1 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** umin z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** umin z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_u64_z_untied, svuint64_t, ++ z0 = svmin_n_u64_z (p0, z1, 1), ++ z0 = svmin_z (p0, z1, 1)) ++ ++/* ++** min_u64_x_tied1: ++** umin z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (min_u64_x_tied1, svuint64_t, ++ z0 = svmin_u64_x (p0, z0, z1), ++ z0 = svmin_x (p0, z0, z1)) ++ ++/* ++** min_u64_x_tied2: ++** umin z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (min_u64_x_tied2, svuint64_t, ++ z0 = svmin_u64_x (p0, z1, z0), ++ z0 = svmin_x (p0, z1, z0)) ++ ++/* ++** min_u64_x_untied: ++** ( ++** movprfx z0, z1 ++** umin z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0, z2 ++** umin z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (min_u64_x_untied, svuint64_t, ++ z0 = svmin_u64_x (p0, z1, z2), ++ z0 = svmin_x (p0, z1, z2)) ++ ++/* ++** min_x0_u64_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** umin z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (min_x0_u64_x_tied1, svuint64_t, uint64_t, ++ z0 = svmin_n_u64_x (p0, z0, x0), ++ z0 = svmin_x (p0, z0, x0)) ++ ++/* ++** min_x0_u64_x_untied: ++** mov z0\.d, x0 ++** umin z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (min_x0_u64_x_untied, svuint64_t, uint64_t, ++ z0 = svmin_n_u64_x (p0, z1, x0), ++ z0 = svmin_x (p0, z1, x0)) ++ ++/* ++** min_1_u64_x_tied1: ++** umin z0\.d, z0\.d, #1 ++** ret ++*/ 
++TEST_UNIFORM_Z (min_1_u64_x_tied1, svuint64_t, ++ z0 = svmin_n_u64_x (p0, z0, 1), ++ z0 = svmin_x (p0, z0, 1)) ++ ++/* ++** min_1_u64_x_untied: ++** movprfx z0, z1 ++** umin z0\.d, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_u64_x_untied, svuint64_t, ++ z0 = svmin_n_u64_x (p0, z1, 1), ++ z0 = svmin_x (p0, z1, 1)) ++ ++/* ++** min_127_u64_x: ++** umin z0\.d, z0\.d, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (min_127_u64_x, svuint64_t, ++ z0 = svmin_n_u64_x (p0, z0, 127), ++ z0 = svmin_x (p0, z0, 127)) ++ ++/* ++** min_128_u64_x: ++** umin z0\.d, z0\.d, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (min_128_u64_x, svuint64_t, ++ z0 = svmin_n_u64_x (p0, z0, 128), ++ z0 = svmin_x (p0, z0, 128)) ++ ++/* ++** min_255_u64_x: ++** umin z0\.d, z0\.d, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (min_255_u64_x, svuint64_t, ++ z0 = svmin_n_u64_x (p0, z0, 255), ++ z0 = svmin_x (p0, z0, 255)) ++ ++/* ++** min_256_u64_x: ++** mov (z[0-9]+\.d), #256 ++** umin z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_256_u64_x, svuint64_t, ++ z0 = svmin_n_u64_x (p0, z0, 256), ++ z0 = svmin_x (p0, z0, 256)) ++ ++/* ++** min_m2_u64_x: ++** mov (z[0-9]+\.d), #-2 ++** umin z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_m2_u64_x, svuint64_t, ++ z0 = svmin_n_u64_x (p0, z0, -2), ++ z0 = svmin_x (p0, z0, -2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_u8.c +new file mode 100644 +index 000000000..2ca274278 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_u8.c +@@ -0,0 +1,273 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** min_u8_m_tied1: ++** umin z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (min_u8_m_tied1, svuint8_t, ++ z0 = svmin_u8_m (p0, z0, z1), ++ z0 = svmin_m (p0, z0, z1)) ++ ++/* ++** min_u8_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** umin z0\.b, p0/m, z0\.b, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (min_u8_m_tied2, svuint8_t, ++ z0 = svmin_u8_m (p0, z1, z0), ++ z0 = svmin_m (p0, z1, z0)) ++ ++/* ++** min_u8_m_untied: ++** movprfx z0, z1 ++** umin z0\.b, p0/m, z0\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (min_u8_m_untied, svuint8_t, ++ z0 = svmin_u8_m (p0, z1, z2), ++ z0 = svmin_m (p0, z1, z2)) ++ ++/* ++** min_w0_u8_m_tied1: ++** mov (z[0-9]+\.b), w0 ++** umin z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (min_w0_u8_m_tied1, svuint8_t, uint8_t, ++ z0 = svmin_n_u8_m (p0, z0, x0), ++ z0 = svmin_m (p0, z0, x0)) ++ ++/* ++** min_w0_u8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), w0 ++** movprfx z0, z1 ++** umin z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (min_w0_u8_m_untied, svuint8_t, uint8_t, ++ z0 = svmin_n_u8_m (p0, z1, x0), ++ z0 = svmin_m (p0, z1, x0)) ++ ++/* ++** min_1_u8_m_tied1: ++** mov (z[0-9]+\.b), #1 ++** umin z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_u8_m_tied1, svuint8_t, ++ z0 = svmin_n_u8_m (p0, z0, 1), ++ z0 = svmin_m (p0, z0, 1)) ++ ++/* ++** min_1_u8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), #1 ++** movprfx z0, z1 ++** umin z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_u8_m_untied, svuint8_t, ++ z0 = svmin_n_u8_m (p0, z1, 1), ++ z0 = svmin_m (p0, z1, 1)) ++ ++/* ++** min_m1_u8_m: ++** mov (z[0-9]+\.b), #-1 ++** umin z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_m1_u8_m, svuint8_t, ++ z0 = svmin_n_u8_m (p0, z0, -1), ++ z0 = svmin_m (p0, z0, -1)) ++ ++/* ++** min_u8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** umin z0\.b, 
p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (min_u8_z_tied1, svuint8_t, ++ z0 = svmin_u8_z (p0, z0, z1), ++ z0 = svmin_z (p0, z0, z1)) ++ ++/* ++** min_u8_z_tied2: ++** movprfx z0\.b, p0/z, z0\.b ++** umin z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (min_u8_z_tied2, svuint8_t, ++ z0 = svmin_u8_z (p0, z1, z0), ++ z0 = svmin_z (p0, z1, z0)) ++ ++/* ++** min_u8_z_untied: ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** umin z0\.b, p0/m, z0\.b, z2\.b ++** | ++** movprfx z0\.b, p0/z, z2\.b ++** umin z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (min_u8_z_untied, svuint8_t, ++ z0 = svmin_u8_z (p0, z1, z2), ++ z0 = svmin_z (p0, z1, z2)) ++ ++/* ++** min_w0_u8_z_tied1: ++** mov (z[0-9]+\.b), w0 ++** movprfx z0\.b, p0/z, z0\.b ++** umin z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (min_w0_u8_z_tied1, svuint8_t, uint8_t, ++ z0 = svmin_n_u8_z (p0, z0, x0), ++ z0 = svmin_z (p0, z0, x0)) ++ ++/* ++** min_w0_u8_z_untied: ++** mov (z[0-9]+\.b), w0 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** umin z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** umin z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (min_w0_u8_z_untied, svuint8_t, uint8_t, ++ z0 = svmin_n_u8_z (p0, z1, x0), ++ z0 = svmin_z (p0, z1, x0)) ++ ++/* ++** min_1_u8_z_tied1: ++** mov (z[0-9]+\.b), #1 ++** movprfx z0\.b, p0/z, z0\.b ++** umin z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_u8_z_tied1, svuint8_t, ++ z0 = svmin_n_u8_z (p0, z0, 1), ++ z0 = svmin_z (p0, z0, 1)) ++ ++/* ++** min_1_u8_z_untied: ++** mov (z[0-9]+\.b), #1 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** umin z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** umin z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_u8_z_untied, svuint8_t, ++ z0 = svmin_n_u8_z (p0, z1, 1), ++ z0 = svmin_z (p0, z1, 1)) ++ ++/* ++** min_u8_x_tied1: ++** umin z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (min_u8_x_tied1, svuint8_t, ++ z0 = svmin_u8_x (p0, z0, z1), ++ z0 = svmin_x (p0, z0, z1)) ++ ++/* ++** min_u8_x_tied2: ++** umin z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (min_u8_x_tied2, svuint8_t, ++ z0 = svmin_u8_x (p0, z1, z0), ++ z0 = svmin_x (p0, z1, z0)) ++ ++/* ++** min_u8_x_untied: ++** ( ++** movprfx z0, z1 ++** umin z0\.b, p0/m, z0\.b, z2\.b ++** | ++** movprfx z0, z2 ++** umin z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (min_u8_x_untied, svuint8_t, ++ z0 = svmin_u8_x (p0, z1, z2), ++ z0 = svmin_x (p0, z1, z2)) ++ ++/* ++** min_w0_u8_x_tied1: ++** mov (z[0-9]+\.b), w0 ++** umin z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (min_w0_u8_x_tied1, svuint8_t, uint8_t, ++ z0 = svmin_n_u8_x (p0, z0, x0), ++ z0 = svmin_x (p0, z0, x0)) ++ ++/* ++** min_w0_u8_x_untied: ++** mov z0\.b, w0 ++** umin z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_ZX (min_w0_u8_x_untied, svuint8_t, uint8_t, ++ z0 = svmin_n_u8_x (p0, z1, x0), ++ z0 = svmin_x (p0, z1, x0)) ++ ++/* ++** min_1_u8_x_tied1: ++** umin z0\.b, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_u8_x_tied1, svuint8_t, ++ z0 = svmin_n_u8_x (p0, z0, 1), ++ z0 = svmin_x (p0, z0, 1)) ++ ++/* ++** min_1_u8_x_untied: ++** movprfx z0, z1 ++** umin z0\.b, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (min_1_u8_x_untied, svuint8_t, ++ z0 = svmin_n_u8_x (p0, z1, 1), ++ z0 = svmin_x (p0, z1, 1)) ++ ++/* ++** min_127_u8_x: ++** umin z0\.b, z0\.b, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (min_127_u8_x, svuint8_t, ++ z0 = svmin_n_u8_x (p0, z0, 127), ++ z0 = svmin_x (p0, z0, 127)) ++ ++/* ++** 
min_128_u8_x: ++** umin z0\.b, z0\.b, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (min_128_u8_x, svuint8_t, ++ z0 = svmin_n_u8_x (p0, z0, 128), ++ z0 = svmin_x (p0, z0, 128)) ++ ++/* ++** min_254_u8_x: ++** umin z0\.b, z0\.b, #254 ++** ret ++*/ ++TEST_UNIFORM_Z (min_254_u8_x, svuint8_t, ++ z0 = svmin_n_u8_x (p0, z0, 254), ++ z0 = svmin_x (p0, z0, 254)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minnm_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minnm_f16.c +new file mode 100644 +index 000000000..43caaa14e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minnm_f16.c +@@ -0,0 +1,425 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** minnm_f16_m_tied1: ++** fminnm z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_f16_m_tied1, svfloat16_t, ++ z0 = svminnm_f16_m (p0, z0, z1), ++ z0 = svminnm_m (p0, z0, z1)) ++ ++/* ++** minnm_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fminnm z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_f16_m_tied2, svfloat16_t, ++ z0 = svminnm_f16_m (p0, z1, z0), ++ z0 = svminnm_m (p0, z1, z0)) ++ ++/* ++** minnm_f16_m_untied: ++** movprfx z0, z1 ++** fminnm z0\.h, p0/m, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_f16_m_untied, svfloat16_t, ++ z0 = svminnm_f16_m (p0, z1, z2), ++ z0 = svminnm_m (p0, z1, z2)) ++ ++/* ++** minnm_h4_f16_m_tied1: ++** mov (z[0-9]+\.h), h4 ++** fminnm z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (minnm_h4_f16_m_tied1, svfloat16_t, __fp16, ++ z0 = svminnm_n_f16_m (p0, z0, d4), ++ z0 = svminnm_m (p0, z0, d4)) ++ ++/* ++** minnm_h4_f16_m_untied: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0, z1 ++** fminnm z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (minnm_h4_f16_m_untied, svfloat16_t, __fp16, ++ z0 = svminnm_n_f16_m (p0, z1, d4), ++ z0 = svminnm_m (p0, z1, d4)) ++ ++/* ++** minnm_0_f16_m_tied1: ++** fminnm z0\.h, p0/m, z0\.h, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_0_f16_m_tied1, svfloat16_t, ++ z0 = svminnm_n_f16_m (p0, z0, 0), ++ z0 = svminnm_m (p0, z0, 0)) ++ ++/* ++** minnm_0_f16_m_untied: ++** movprfx z0, z1 ++** fminnm z0\.h, p0/m, z0\.h, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_0_f16_m_untied, svfloat16_t, ++ z0 = svminnm_n_f16_m (p0, z1, 0), ++ z0 = svminnm_m (p0, z1, 0)) ++ ++/* ++** minnm_1_f16_m_tied1: ++** fminnm z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_1_f16_m_tied1, svfloat16_t, ++ z0 = svminnm_n_f16_m (p0, z0, 1), ++ z0 = svminnm_m (p0, z0, 1)) ++ ++/* ++** minnm_1_f16_m_untied: ++** movprfx z0, z1 ++** fminnm z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_1_f16_m_untied, svfloat16_t, ++ z0 = svminnm_n_f16_m (p0, z1, 1), ++ z0 = svminnm_m (p0, z1, 1)) ++ ++/* ++** minnm_2_f16_m: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? 
++** fminnm z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_2_f16_m, svfloat16_t, ++ z0 = svminnm_n_f16_m (p0, z0, 2), ++ z0 = svminnm_m (p0, z0, 2)) ++ ++/* ++** minnm_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fminnm z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_f16_z_tied1, svfloat16_t, ++ z0 = svminnm_f16_z (p0, z0, z1), ++ z0 = svminnm_z (p0, z0, z1)) ++ ++/* ++** minnm_f16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** fminnm z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_f16_z_tied2, svfloat16_t, ++ z0 = svminnm_f16_z (p0, z1, z0), ++ z0 = svminnm_z (p0, z1, z0)) ++ ++/* ++** minnm_f16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fminnm z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** fminnm z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_f16_z_untied, svfloat16_t, ++ z0 = svminnm_f16_z (p0, z1, z2), ++ z0 = svminnm_z (p0, z1, z2)) ++ ++/* ++** minnm_h4_f16_z_tied1: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0\.h, p0/z, z0\.h ++** fminnm z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (minnm_h4_f16_z_tied1, svfloat16_t, __fp16, ++ z0 = svminnm_n_f16_z (p0, z0, d4), ++ z0 = svminnm_z (p0, z0, d4)) ++ ++/* ++** minnm_h4_f16_z_untied: ++** mov (z[0-9]+\.h), h4 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fminnm z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** fminnm z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (minnm_h4_f16_z_untied, svfloat16_t, __fp16, ++ z0 = svminnm_n_f16_z (p0, z1, d4), ++ z0 = svminnm_z (p0, z1, d4)) ++ ++/* ++** minnm_0_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fminnm z0\.h, p0/m, z0\.h, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_0_f16_z_tied1, svfloat16_t, ++ z0 = svminnm_n_f16_z (p0, z0, 0), ++ z0 = svminnm_z (p0, z0, 0)) ++ ++/* ++** minnm_0_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** fminnm z0\.h, p0/m, z0\.h, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_0_f16_z_untied, svfloat16_t, ++ z0 = svminnm_n_f16_z (p0, z1, 0), ++ z0 = svminnm_z (p0, z1, 0)) ++ ++/* ++** minnm_1_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fminnm z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_1_f16_z_tied1, svfloat16_t, ++ z0 = svminnm_n_f16_z (p0, z0, 1), ++ z0 = svminnm_z (p0, z0, 1)) ++ ++/* ++** minnm_1_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** fminnm z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_1_f16_z_untied, svfloat16_t, ++ z0 = svminnm_n_f16_z (p0, z1, 1), ++ z0 = svminnm_z (p0, z1, 1)) ++ ++/* ++** minnm_2_f16_z: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? 
++** movprfx z0\.h, p0/z, z0\.h ++** fminnm z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_2_f16_z, svfloat16_t, ++ z0 = svminnm_n_f16_z (p0, z0, 2), ++ z0 = svminnm_z (p0, z0, 2)) ++ ++/* ++** minnm_f16_x_tied1: ++** fminnm z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_f16_x_tied1, svfloat16_t, ++ z0 = svminnm_f16_x (p0, z0, z1), ++ z0 = svminnm_x (p0, z0, z1)) ++ ++/* ++** minnm_f16_x_tied2: ++** fminnm z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_f16_x_tied2, svfloat16_t, ++ z0 = svminnm_f16_x (p0, z1, z0), ++ z0 = svminnm_x (p0, z1, z0)) ++ ++/* ++** minnm_f16_x_untied: ++** ( ++** movprfx z0, z1 ++** fminnm z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0, z2 ++** fminnm z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_f16_x_untied, svfloat16_t, ++ z0 = svminnm_f16_x (p0, z1, z2), ++ z0 = svminnm_x (p0, z1, z2)) ++ ++/* ++** minnm_h4_f16_x_tied1: ++** mov (z[0-9]+\.h), h4 ++** fminnm z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (minnm_h4_f16_x_tied1, svfloat16_t, __fp16, ++ z0 = svminnm_n_f16_x (p0, z0, d4), ++ z0 = svminnm_x (p0, z0, d4)) ++ ++/* ++** minnm_h4_f16_x_untied: ++** mov z0\.h, h4 ++** fminnm z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZD (minnm_h4_f16_x_untied, svfloat16_t, __fp16, ++ z0 = svminnm_n_f16_x (p0, z1, d4), ++ z0 = svminnm_x (p0, z1, d4)) ++ ++/* ++** minnm_0_f16_x_tied1: ++** fminnm z0\.h, p0/m, z0\.h, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_0_f16_x_tied1, svfloat16_t, ++ z0 = svminnm_n_f16_x (p0, z0, 0), ++ z0 = svminnm_x (p0, z0, 0)) ++ ++/* ++** minnm_0_f16_x_untied: ++** movprfx z0, z1 ++** fminnm z0\.h, p0/m, z0\.h, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_0_f16_x_untied, svfloat16_t, ++ z0 = svminnm_n_f16_x (p0, z1, 0), ++ z0 = svminnm_x (p0, z1, 0)) ++ ++/* ++** minnm_1_f16_x_tied1: ++** fminnm z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_1_f16_x_tied1, svfloat16_t, ++ z0 = svminnm_n_f16_x (p0, z0, 1), ++ z0 = svminnm_x (p0, z0, 1)) ++ ++/* ++** minnm_1_f16_x_untied: ++** movprfx z0, z1 ++** fminnm z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_1_f16_x_untied, svfloat16_t, ++ z0 = svminnm_n_f16_x (p0, z1, 1), ++ z0 = svminnm_x (p0, z1, 1)) ++ ++/* ++** minnm_2_f16_x_tied1: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** fminnm z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_2_f16_x_tied1, svfloat16_t, ++ z0 = svminnm_n_f16_x (p0, z0, 2), ++ z0 = svminnm_x (p0, z0, 2)) ++ ++/* ++** minnm_2_f16_x_untied: ++** fmov z0\.h, #2\.0(?:e\+0)? ++** fminnm z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_2_f16_x_untied, svfloat16_t, ++ z0 = svminnm_n_f16_x (p0, z1, 2), ++ z0 = svminnm_x (p0, z1, 2)) ++ ++/* ++** ptrue_minnm_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_minnm_f16_x_tied1, svfloat16_t, ++ z0 = svminnm_f16_x (svptrue_b16 (), z0, z1), ++ z0 = svminnm_x (svptrue_b16 (), z0, z1)) ++ ++/* ++** ptrue_minnm_f16_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_minnm_f16_x_tied2, svfloat16_t, ++ z0 = svminnm_f16_x (svptrue_b16 (), z1, z0), ++ z0 = svminnm_x (svptrue_b16 (), z1, z0)) ++ ++/* ++** ptrue_minnm_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_minnm_f16_x_untied, svfloat16_t, ++ z0 = svminnm_f16_x (svptrue_b16 (), z1, z2), ++ z0 = svminnm_x (svptrue_b16 (), z1, z2)) ++ ++/* ++** ptrue_minnm_0_f16_x_tied1: ++** ... 
++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_minnm_0_f16_x_tied1, svfloat16_t, ++ z0 = svminnm_n_f16_x (svptrue_b16 (), z0, 0), ++ z0 = svminnm_x (svptrue_b16 (), z0, 0)) ++ ++/* ++** ptrue_minnm_0_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_minnm_0_f16_x_untied, svfloat16_t, ++ z0 = svminnm_n_f16_x (svptrue_b16 (), z1, 0), ++ z0 = svminnm_x (svptrue_b16 (), z1, 0)) ++ ++/* ++** ptrue_minnm_1_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_minnm_1_f16_x_tied1, svfloat16_t, ++ z0 = svminnm_n_f16_x (svptrue_b16 (), z0, 1), ++ z0 = svminnm_x (svptrue_b16 (), z0, 1)) ++ ++/* ++** ptrue_minnm_1_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_minnm_1_f16_x_untied, svfloat16_t, ++ z0 = svminnm_n_f16_x (svptrue_b16 (), z1, 1), ++ z0 = svminnm_x (svptrue_b16 (), z1, 1)) ++ ++/* ++** ptrue_minnm_2_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_minnm_2_f16_x_tied1, svfloat16_t, ++ z0 = svminnm_n_f16_x (svptrue_b16 (), z0, 2), ++ z0 = svminnm_x (svptrue_b16 (), z0, 2)) ++ ++/* ++** ptrue_minnm_2_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_minnm_2_f16_x_untied, svfloat16_t, ++ z0 = svminnm_n_f16_x (svptrue_b16 (), z1, 2), ++ z0 = svminnm_x (svptrue_b16 (), z1, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minnm_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minnm_f32.c +new file mode 100644 +index 000000000..4fac8e8ad +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minnm_f32.c +@@ -0,0 +1,425 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** minnm_f32_m_tied1: ++** fminnm z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_f32_m_tied1, svfloat32_t, ++ z0 = svminnm_f32_m (p0, z0, z1), ++ z0 = svminnm_m (p0, z0, z1)) ++ ++/* ++** minnm_f32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fminnm z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_f32_m_tied2, svfloat32_t, ++ z0 = svminnm_f32_m (p0, z1, z0), ++ z0 = svminnm_m (p0, z1, z0)) ++ ++/* ++** minnm_f32_m_untied: ++** movprfx z0, z1 ++** fminnm z0\.s, p0/m, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_f32_m_untied, svfloat32_t, ++ z0 = svminnm_f32_m (p0, z1, z2), ++ z0 = svminnm_m (p0, z1, z2)) ++ ++/* ++** minnm_s4_f32_m_tied1: ++** mov (z[0-9]+\.s), s4 ++** fminnm z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (minnm_s4_f32_m_tied1, svfloat32_t, float, ++ z0 = svminnm_n_f32_m (p0, z0, d4), ++ z0 = svminnm_m (p0, z0, d4)) ++ ++/* ++** minnm_s4_f32_m_untied: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0, z1 ++** fminnm z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (minnm_s4_f32_m_untied, svfloat32_t, float, ++ z0 = svminnm_n_f32_m (p0, z1, d4), ++ z0 = svminnm_m (p0, z1, d4)) ++ ++/* ++** minnm_0_f32_m_tied1: ++** fminnm z0\.s, p0/m, z0\.s, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_0_f32_m_tied1, svfloat32_t, ++ z0 = svminnm_n_f32_m (p0, z0, 0), ++ z0 = svminnm_m (p0, z0, 0)) ++ ++/* ++** minnm_0_f32_m_untied: ++** movprfx z0, z1 ++** fminnm z0\.s, p0/m, z0\.s, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_0_f32_m_untied, svfloat32_t, ++ z0 = svminnm_n_f32_m (p0, z1, 0), ++ z0 = svminnm_m (p0, z1, 0)) ++ ++/* ++** minnm_1_f32_m_tied1: ++** fminnm z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ 
++TEST_UNIFORM_Z (minnm_1_f32_m_tied1, svfloat32_t, ++ z0 = svminnm_n_f32_m (p0, z0, 1), ++ z0 = svminnm_m (p0, z0, 1)) ++ ++/* ++** minnm_1_f32_m_untied: ++** movprfx z0, z1 ++** fminnm z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_1_f32_m_untied, svfloat32_t, ++ z0 = svminnm_n_f32_m (p0, z1, 1), ++ z0 = svminnm_m (p0, z1, 1)) ++ ++/* ++** minnm_2_f32_m: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** fminnm z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_2_f32_m, svfloat32_t, ++ z0 = svminnm_n_f32_m (p0, z0, 2), ++ z0 = svminnm_m (p0, z0, 2)) ++ ++/* ++** minnm_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fminnm z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_f32_z_tied1, svfloat32_t, ++ z0 = svminnm_f32_z (p0, z0, z1), ++ z0 = svminnm_z (p0, z0, z1)) ++ ++/* ++** minnm_f32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** fminnm z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_f32_z_tied2, svfloat32_t, ++ z0 = svminnm_f32_z (p0, z1, z0), ++ z0 = svminnm_z (p0, z1, z0)) ++ ++/* ++** minnm_f32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fminnm z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** fminnm z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_f32_z_untied, svfloat32_t, ++ z0 = svminnm_f32_z (p0, z1, z2), ++ z0 = svminnm_z (p0, z1, z2)) ++ ++/* ++** minnm_s4_f32_z_tied1: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0\.s, p0/z, z0\.s ++** fminnm z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (minnm_s4_f32_z_tied1, svfloat32_t, float, ++ z0 = svminnm_n_f32_z (p0, z0, d4), ++ z0 = svminnm_z (p0, z0, d4)) ++ ++/* ++** minnm_s4_f32_z_untied: ++** mov (z[0-9]+\.s), s4 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fminnm z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** fminnm z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (minnm_s4_f32_z_untied, svfloat32_t, float, ++ z0 = svminnm_n_f32_z (p0, z1, d4), ++ z0 = svminnm_z (p0, z1, d4)) ++ ++/* ++** minnm_0_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fminnm z0\.s, p0/m, z0\.s, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_0_f32_z_tied1, svfloat32_t, ++ z0 = svminnm_n_f32_z (p0, z0, 0), ++ z0 = svminnm_z (p0, z0, 0)) ++ ++/* ++** minnm_0_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** fminnm z0\.s, p0/m, z0\.s, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_0_f32_z_untied, svfloat32_t, ++ z0 = svminnm_n_f32_z (p0, z1, 0), ++ z0 = svminnm_z (p0, z1, 0)) ++ ++/* ++** minnm_1_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fminnm z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_1_f32_z_tied1, svfloat32_t, ++ z0 = svminnm_n_f32_z (p0, z0, 1), ++ z0 = svminnm_z (p0, z0, 1)) ++ ++/* ++** minnm_1_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** fminnm z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_1_f32_z_untied, svfloat32_t, ++ z0 = svminnm_n_f32_z (p0, z1, 1), ++ z0 = svminnm_z (p0, z1, 1)) ++ ++/* ++** minnm_2_f32_z: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
++** movprfx z0\.s, p0/z, z0\.s ++** fminnm z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_2_f32_z, svfloat32_t, ++ z0 = svminnm_n_f32_z (p0, z0, 2), ++ z0 = svminnm_z (p0, z0, 2)) ++ ++/* ++** minnm_f32_x_tied1: ++** fminnm z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_f32_x_tied1, svfloat32_t, ++ z0 = svminnm_f32_x (p0, z0, z1), ++ z0 = svminnm_x (p0, z0, z1)) ++ ++/* ++** minnm_f32_x_tied2: ++** fminnm z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_f32_x_tied2, svfloat32_t, ++ z0 = svminnm_f32_x (p0, z1, z0), ++ z0 = svminnm_x (p0, z1, z0)) ++ ++/* ++** minnm_f32_x_untied: ++** ( ++** movprfx z0, z1 ++** fminnm z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0, z2 ++** fminnm z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_f32_x_untied, svfloat32_t, ++ z0 = svminnm_f32_x (p0, z1, z2), ++ z0 = svminnm_x (p0, z1, z2)) ++ ++/* ++** minnm_s4_f32_x_tied1: ++** mov (z[0-9]+\.s), s4 ++** fminnm z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (minnm_s4_f32_x_tied1, svfloat32_t, float, ++ z0 = svminnm_n_f32_x (p0, z0, d4), ++ z0 = svminnm_x (p0, z0, d4)) ++ ++/* ++** minnm_s4_f32_x_untied: ++** mov z0\.s, s4 ++** fminnm z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZD (minnm_s4_f32_x_untied, svfloat32_t, float, ++ z0 = svminnm_n_f32_x (p0, z1, d4), ++ z0 = svminnm_x (p0, z1, d4)) ++ ++/* ++** minnm_0_f32_x_tied1: ++** fminnm z0\.s, p0/m, z0\.s, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_0_f32_x_tied1, svfloat32_t, ++ z0 = svminnm_n_f32_x (p0, z0, 0), ++ z0 = svminnm_x (p0, z0, 0)) ++ ++/* ++** minnm_0_f32_x_untied: ++** movprfx z0, z1 ++** fminnm z0\.s, p0/m, z0\.s, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_0_f32_x_untied, svfloat32_t, ++ z0 = svminnm_n_f32_x (p0, z1, 0), ++ z0 = svminnm_x (p0, z1, 0)) ++ ++/* ++** minnm_1_f32_x_tied1: ++** fminnm z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_1_f32_x_tied1, svfloat32_t, ++ z0 = svminnm_n_f32_x (p0, z0, 1), ++ z0 = svminnm_x (p0, z0, 1)) ++ ++/* ++** minnm_1_f32_x_untied: ++** movprfx z0, z1 ++** fminnm z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_1_f32_x_untied, svfloat32_t, ++ z0 = svminnm_n_f32_x (p0, z1, 1), ++ z0 = svminnm_x (p0, z1, 1)) ++ ++/* ++** minnm_2_f32_x_tied1: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** fminnm z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_2_f32_x_tied1, svfloat32_t, ++ z0 = svminnm_n_f32_x (p0, z0, 2), ++ z0 = svminnm_x (p0, z0, 2)) ++ ++/* ++** minnm_2_f32_x_untied: ++** fmov z0\.s, #2\.0(?:e\+0)? ++** fminnm z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_2_f32_x_untied, svfloat32_t, ++ z0 = svminnm_n_f32_x (p0, z1, 2), ++ z0 = svminnm_x (p0, z1, 2)) ++ ++/* ++** ptrue_minnm_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_minnm_f32_x_tied1, svfloat32_t, ++ z0 = svminnm_f32_x (svptrue_b32 (), z0, z1), ++ z0 = svminnm_x (svptrue_b32 (), z0, z1)) ++ ++/* ++** ptrue_minnm_f32_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_minnm_f32_x_tied2, svfloat32_t, ++ z0 = svminnm_f32_x (svptrue_b32 (), z1, z0), ++ z0 = svminnm_x (svptrue_b32 (), z1, z0)) ++ ++/* ++** ptrue_minnm_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_minnm_f32_x_untied, svfloat32_t, ++ z0 = svminnm_f32_x (svptrue_b32 (), z1, z2), ++ z0 = svminnm_x (svptrue_b32 (), z1, z2)) ++ ++/* ++** ptrue_minnm_0_f32_x_tied1: ++** ... 
++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_minnm_0_f32_x_tied1, svfloat32_t, ++ z0 = svminnm_n_f32_x (svptrue_b32 (), z0, 0), ++ z0 = svminnm_x (svptrue_b32 (), z0, 0)) ++ ++/* ++** ptrue_minnm_0_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_minnm_0_f32_x_untied, svfloat32_t, ++ z0 = svminnm_n_f32_x (svptrue_b32 (), z1, 0), ++ z0 = svminnm_x (svptrue_b32 (), z1, 0)) ++ ++/* ++** ptrue_minnm_1_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_minnm_1_f32_x_tied1, svfloat32_t, ++ z0 = svminnm_n_f32_x (svptrue_b32 (), z0, 1), ++ z0 = svminnm_x (svptrue_b32 (), z0, 1)) ++ ++/* ++** ptrue_minnm_1_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_minnm_1_f32_x_untied, svfloat32_t, ++ z0 = svminnm_n_f32_x (svptrue_b32 (), z1, 1), ++ z0 = svminnm_x (svptrue_b32 (), z1, 1)) ++ ++/* ++** ptrue_minnm_2_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_minnm_2_f32_x_tied1, svfloat32_t, ++ z0 = svminnm_n_f32_x (svptrue_b32 (), z0, 2), ++ z0 = svminnm_x (svptrue_b32 (), z0, 2)) ++ ++/* ++** ptrue_minnm_2_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_minnm_2_f32_x_untied, svfloat32_t, ++ z0 = svminnm_n_f32_x (svptrue_b32 (), z1, 2), ++ z0 = svminnm_x (svptrue_b32 (), z1, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minnm_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minnm_f64.c +new file mode 100644 +index 000000000..67993928f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minnm_f64.c +@@ -0,0 +1,425 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** minnm_f64_m_tied1: ++** fminnm z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_f64_m_tied1, svfloat64_t, ++ z0 = svminnm_f64_m (p0, z0, z1), ++ z0 = svminnm_m (p0, z0, z1)) ++ ++/* ++** minnm_f64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fminnm z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_f64_m_tied2, svfloat64_t, ++ z0 = svminnm_f64_m (p0, z1, z0), ++ z0 = svminnm_m (p0, z1, z0)) ++ ++/* ++** minnm_f64_m_untied: ++** movprfx z0, z1 ++** fminnm z0\.d, p0/m, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_f64_m_untied, svfloat64_t, ++ z0 = svminnm_f64_m (p0, z1, z2), ++ z0 = svminnm_m (p0, z1, z2)) ++ ++/* ++** minnm_d4_f64_m_tied1: ++** mov (z[0-9]+\.d), d4 ++** fminnm z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (minnm_d4_f64_m_tied1, svfloat64_t, double, ++ z0 = svminnm_n_f64_m (p0, z0, d4), ++ z0 = svminnm_m (p0, z0, d4)) ++ ++/* ++** minnm_d4_f64_m_untied: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0, z1 ++** fminnm z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (minnm_d4_f64_m_untied, svfloat64_t, double, ++ z0 = svminnm_n_f64_m (p0, z1, d4), ++ z0 = svminnm_m (p0, z1, d4)) ++ ++/* ++** minnm_0_f64_m_tied1: ++** fminnm z0\.d, p0/m, z0\.d, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_0_f64_m_tied1, svfloat64_t, ++ z0 = svminnm_n_f64_m (p0, z0, 0), ++ z0 = svminnm_m (p0, z0, 0)) ++ ++/* ++** minnm_0_f64_m_untied: ++** movprfx z0, z1 ++** fminnm z0\.d, p0/m, z0\.d, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_0_f64_m_untied, svfloat64_t, ++ z0 = svminnm_n_f64_m (p0, z1, 0), ++ z0 = svminnm_m (p0, z1, 0)) ++ ++/* ++** minnm_1_f64_m_tied1: ++** fminnm z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ 
++TEST_UNIFORM_Z (minnm_1_f64_m_tied1, svfloat64_t, ++ z0 = svminnm_n_f64_m (p0, z0, 1), ++ z0 = svminnm_m (p0, z0, 1)) ++ ++/* ++** minnm_1_f64_m_untied: ++** movprfx z0, z1 ++** fminnm z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_1_f64_m_untied, svfloat64_t, ++ z0 = svminnm_n_f64_m (p0, z1, 1), ++ z0 = svminnm_m (p0, z1, 1)) ++ ++/* ++** minnm_2_f64_m: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** fminnm z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_2_f64_m, svfloat64_t, ++ z0 = svminnm_n_f64_m (p0, z0, 2), ++ z0 = svminnm_m (p0, z0, 2)) ++ ++/* ++** minnm_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fminnm z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_f64_z_tied1, svfloat64_t, ++ z0 = svminnm_f64_z (p0, z0, z1), ++ z0 = svminnm_z (p0, z0, z1)) ++ ++/* ++** minnm_f64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** fminnm z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_f64_z_tied2, svfloat64_t, ++ z0 = svminnm_f64_z (p0, z1, z0), ++ z0 = svminnm_z (p0, z1, z0)) ++ ++/* ++** minnm_f64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fminnm z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** fminnm z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_f64_z_untied, svfloat64_t, ++ z0 = svminnm_f64_z (p0, z1, z2), ++ z0 = svminnm_z (p0, z1, z2)) ++ ++/* ++** minnm_d4_f64_z_tied1: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0\.d, p0/z, z0\.d ++** fminnm z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (minnm_d4_f64_z_tied1, svfloat64_t, double, ++ z0 = svminnm_n_f64_z (p0, z0, d4), ++ z0 = svminnm_z (p0, z0, d4)) ++ ++/* ++** minnm_d4_f64_z_untied: ++** mov (z[0-9]+\.d), d4 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fminnm z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** fminnm z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (minnm_d4_f64_z_untied, svfloat64_t, double, ++ z0 = svminnm_n_f64_z (p0, z1, d4), ++ z0 = svminnm_z (p0, z1, d4)) ++ ++/* ++** minnm_0_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fminnm z0\.d, p0/m, z0\.d, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_0_f64_z_tied1, svfloat64_t, ++ z0 = svminnm_n_f64_z (p0, z0, 0), ++ z0 = svminnm_z (p0, z0, 0)) ++ ++/* ++** minnm_0_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** fminnm z0\.d, p0/m, z0\.d, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_0_f64_z_untied, svfloat64_t, ++ z0 = svminnm_n_f64_z (p0, z1, 0), ++ z0 = svminnm_z (p0, z1, 0)) ++ ++/* ++** minnm_1_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fminnm z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_1_f64_z_tied1, svfloat64_t, ++ z0 = svminnm_n_f64_z (p0, z0, 1), ++ z0 = svminnm_z (p0, z0, 1)) ++ ++/* ++** minnm_1_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** fminnm z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_1_f64_z_untied, svfloat64_t, ++ z0 = svminnm_n_f64_z (p0, z1, 1), ++ z0 = svminnm_z (p0, z1, 1)) ++ ++/* ++** minnm_2_f64_z: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
++** movprfx z0\.d, p0/z, z0\.d ++** fminnm z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_2_f64_z, svfloat64_t, ++ z0 = svminnm_n_f64_z (p0, z0, 2), ++ z0 = svminnm_z (p0, z0, 2)) ++ ++/* ++** minnm_f64_x_tied1: ++** fminnm z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_f64_x_tied1, svfloat64_t, ++ z0 = svminnm_f64_x (p0, z0, z1), ++ z0 = svminnm_x (p0, z0, z1)) ++ ++/* ++** minnm_f64_x_tied2: ++** fminnm z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_f64_x_tied2, svfloat64_t, ++ z0 = svminnm_f64_x (p0, z1, z0), ++ z0 = svminnm_x (p0, z1, z0)) ++ ++/* ++** minnm_f64_x_untied: ++** ( ++** movprfx z0, z1 ++** fminnm z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0, z2 ++** fminnm z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_f64_x_untied, svfloat64_t, ++ z0 = svminnm_f64_x (p0, z1, z2), ++ z0 = svminnm_x (p0, z1, z2)) ++ ++/* ++** minnm_d4_f64_x_tied1: ++** mov (z[0-9]+\.d), d4 ++** fminnm z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (minnm_d4_f64_x_tied1, svfloat64_t, double, ++ z0 = svminnm_n_f64_x (p0, z0, d4), ++ z0 = svminnm_x (p0, z0, d4)) ++ ++/* ++** minnm_d4_f64_x_untied: ++** mov z0\.d, d4 ++** fminnm z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZD (minnm_d4_f64_x_untied, svfloat64_t, double, ++ z0 = svminnm_n_f64_x (p0, z1, d4), ++ z0 = svminnm_x (p0, z1, d4)) ++ ++/* ++** minnm_0_f64_x_tied1: ++** fminnm z0\.d, p0/m, z0\.d, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_0_f64_x_tied1, svfloat64_t, ++ z0 = svminnm_n_f64_x (p0, z0, 0), ++ z0 = svminnm_x (p0, z0, 0)) ++ ++/* ++** minnm_0_f64_x_untied: ++** movprfx z0, z1 ++** fminnm z0\.d, p0/m, z0\.d, #0\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_0_f64_x_untied, svfloat64_t, ++ z0 = svminnm_n_f64_x (p0, z1, 0), ++ z0 = svminnm_x (p0, z1, 0)) ++ ++/* ++** minnm_1_f64_x_tied1: ++** fminnm z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_1_f64_x_tied1, svfloat64_t, ++ z0 = svminnm_n_f64_x (p0, z0, 1), ++ z0 = svminnm_x (p0, z0, 1)) ++ ++/* ++** minnm_1_f64_x_untied: ++** movprfx z0, z1 ++** fminnm z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_1_f64_x_untied, svfloat64_t, ++ z0 = svminnm_n_f64_x (p0, z1, 1), ++ z0 = svminnm_x (p0, z1, 1)) ++ ++/* ++** minnm_2_f64_x_tied1: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** fminnm z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_2_f64_x_tied1, svfloat64_t, ++ z0 = svminnm_n_f64_x (p0, z0, 2), ++ z0 = svminnm_x (p0, z0, 2)) ++ ++/* ++** minnm_2_f64_x_untied: ++** fmov z0\.d, #2\.0(?:e\+0)? ++** fminnm z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (minnm_2_f64_x_untied, svfloat64_t, ++ z0 = svminnm_n_f64_x (p0, z1, 2), ++ z0 = svminnm_x (p0, z1, 2)) ++ ++/* ++** ptrue_minnm_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_minnm_f64_x_tied1, svfloat64_t, ++ z0 = svminnm_f64_x (svptrue_b64 (), z0, z1), ++ z0 = svminnm_x (svptrue_b64 (), z0, z1)) ++ ++/* ++** ptrue_minnm_f64_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_minnm_f64_x_tied2, svfloat64_t, ++ z0 = svminnm_f64_x (svptrue_b64 (), z1, z0), ++ z0 = svminnm_x (svptrue_b64 (), z1, z0)) ++ ++/* ++** ptrue_minnm_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_minnm_f64_x_untied, svfloat64_t, ++ z0 = svminnm_f64_x (svptrue_b64 (), z1, z2), ++ z0 = svminnm_x (svptrue_b64 (), z1, z2)) ++ ++/* ++** ptrue_minnm_0_f64_x_tied1: ++** ... 
++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_minnm_0_f64_x_tied1, svfloat64_t, ++ z0 = svminnm_n_f64_x (svptrue_b64 (), z0, 0), ++ z0 = svminnm_x (svptrue_b64 (), z0, 0)) ++ ++/* ++** ptrue_minnm_0_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_minnm_0_f64_x_untied, svfloat64_t, ++ z0 = svminnm_n_f64_x (svptrue_b64 (), z1, 0), ++ z0 = svminnm_x (svptrue_b64 (), z1, 0)) ++ ++/* ++** ptrue_minnm_1_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_minnm_1_f64_x_tied1, svfloat64_t, ++ z0 = svminnm_n_f64_x (svptrue_b64 (), z0, 1), ++ z0 = svminnm_x (svptrue_b64 (), z0, 1)) ++ ++/* ++** ptrue_minnm_1_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_minnm_1_f64_x_untied, svfloat64_t, ++ z0 = svminnm_n_f64_x (svptrue_b64 (), z1, 1), ++ z0 = svminnm_x (svptrue_b64 (), z1, 1)) ++ ++/* ++** ptrue_minnm_2_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_minnm_2_f64_x_tied1, svfloat64_t, ++ z0 = svminnm_n_f64_x (svptrue_b64 (), z0, 2), ++ z0 = svminnm_x (svptrue_b64 (), z0, 2)) ++ ++/* ++** ptrue_minnm_2_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_minnm_2_f64_x_untied, svfloat64_t, ++ z0 = svminnm_n_f64_x (svptrue_b64 (), z1, 2), ++ z0 = svminnm_x (svptrue_b64 (), z1, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minnmv_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minnmv_f16.c +new file mode 100644 +index 000000000..827f41bfe +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minnmv_f16.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** minnmv_d0_f16_tied: ++** fminnmv h0, p0, z0\.h ++** ret ++*/ ++TEST_REDUCTION_D (minnmv_d0_f16_tied, float16_t, svfloat16_t, ++ d0 = svminnmv_f16 (p0, z0), ++ d0 = svminnmv (p0, z0)) ++ ++/* ++** minnmv_d0_f16_untied: ++** fminnmv h0, p0, z1\.h ++** ret ++*/ ++TEST_REDUCTION_D (minnmv_d0_f16_untied, float16_t, svfloat16_t, ++ d0 = svminnmv_f16 (p0, z1), ++ d0 = svminnmv (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minnmv_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minnmv_f32.c +new file mode 100644 +index 000000000..2352ec2a3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minnmv_f32.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** minnmv_d0_f32_tied: ++** fminnmv s0, p0, z0\.s ++** ret ++*/ ++TEST_REDUCTION_D (minnmv_d0_f32_tied, float32_t, svfloat32_t, ++ d0 = svminnmv_f32 (p0, z0), ++ d0 = svminnmv (p0, z0)) ++ ++/* ++** minnmv_d0_f32_untied: ++** fminnmv s0, p0, z1\.s ++** ret ++*/ ++TEST_REDUCTION_D (minnmv_d0_f32_untied, float32_t, svfloat32_t, ++ d0 = svminnmv_f32 (p0, z1), ++ d0 = svminnmv (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minnmv_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minnmv_f64.c +new file mode 100644 +index 000000000..3d769a3d5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minnmv_f64.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** minnmv_d0_f64_tied: ++** fminnmv d0, p0, z0\.d ++** ret ++*/ ++TEST_REDUCTION_D (minnmv_d0_f64_tied, float64_t, svfloat64_t, ++ d0 = 
svminnmv_f64 (p0, z0), ++ d0 = svminnmv (p0, z0)) ++ ++/* ++** minnmv_d0_f64_untied: ++** fminnmv d0, p0, z1\.d ++** ret ++*/ ++TEST_REDUCTION_D (minnmv_d0_f64_untied, float64_t, svfloat64_t, ++ d0 = svminnmv_f64 (p0, z1), ++ d0 = svminnmv (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_f16.c +new file mode 100644 +index 000000000..190aa16e1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_f16.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** minv_d0_f16_tied: ++** fminv h0, p0, z0\.h ++** ret ++*/ ++TEST_REDUCTION_D (minv_d0_f16_tied, float16_t, svfloat16_t, ++ d0 = svminv_f16 (p0, z0), ++ d0 = svminv (p0, z0)) ++ ++/* ++** minv_d0_f16_untied: ++** fminv h0, p0, z1\.h ++** ret ++*/ ++TEST_REDUCTION_D (minv_d0_f16_untied, float16_t, svfloat16_t, ++ d0 = svminv_f16 (p0, z1), ++ d0 = svminv (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_f32.c +new file mode 100644 +index 000000000..07871b893 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_f32.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** minv_d0_f32_tied: ++** fminv s0, p0, z0\.s ++** ret ++*/ ++TEST_REDUCTION_D (minv_d0_f32_tied, float32_t, svfloat32_t, ++ d0 = svminv_f32 (p0, z0), ++ d0 = svminv (p0, z0)) ++ ++/* ++** minv_d0_f32_untied: ++** fminv s0, p0, z1\.s ++** ret ++*/ ++TEST_REDUCTION_D (minv_d0_f32_untied, float32_t, svfloat32_t, ++ d0 = svminv_f32 (p0, z1), ++ d0 = svminv (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_f64.c +new file mode 100644 +index 000000000..7435f306f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_f64.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** minv_d0_f64_tied: ++** fminv d0, p0, z0\.d ++** ret ++*/ ++TEST_REDUCTION_D (minv_d0_f64_tied, float64_t, svfloat64_t, ++ d0 = svminv_f64 (p0, z0), ++ d0 = svminv (p0, z0)) ++ ++/* ++** minv_d0_f64_untied: ++** fminv d0, p0, z1\.d ++** ret ++*/ ++TEST_REDUCTION_D (minv_d0_f64_untied, float64_t, svfloat64_t, ++ d0 = svminv_f64 (p0, z1), ++ d0 = svminv (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_s16.c +new file mode 100644 +index 000000000..dfb66a9f7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_s16.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** minv_x0_s16: ++** sminv h([0-9]+), p0, z0\.h ++** umov w0, v\1\.h\[0\] ++** ret ++*/ ++TEST_REDUCTION_X (minv_x0_s16, int16_t, svint16_t, ++ x0 = svminv_s16 (p0, z0), ++ x0 = svminv (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_s32.c +new file mode 100644 +index 000000000..c02df5dd3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_s32.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** minv_x0_s32: ++** sminv (s[0-9]+), p0, z0\.s ++** fmov w0, \1 ++** 
ret ++*/ ++TEST_REDUCTION_X (minv_x0_s32, int32_t, svint32_t, ++ x0 = svminv_s32 (p0, z0), ++ x0 = svminv (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_s64.c +new file mode 100644 +index 000000000..784973231 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_s64.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** minv_x0_s64: ++** sminv (d[0-9]+), p0, z0\.d ++** fmov x0, \1 ++** ret ++*/ ++TEST_REDUCTION_X (minv_x0_s64, int64_t, svint64_t, ++ x0 = svminv_s64 (p0, z0), ++ x0 = svminv (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_s8.c +new file mode 100644 +index 000000000..0b1bce5de +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_s8.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** minv_x0_s8: ++** sminv b([0-9]+), p0, z0\.b ++** umov w0, v\1\.b\[0\] ++** ret ++*/ ++TEST_REDUCTION_X (minv_x0_s8, int8_t, svint8_t, ++ x0 = svminv_s8 (p0, z0), ++ x0 = svminv (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_u16.c +new file mode 100644 +index 000000000..b499de33e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_u16.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** minv_x0_u16: ++** uminv h([0-9]+), p0, z0\.h ++** umov w0, v\1\.h\[0\] ++** ret ++*/ ++TEST_REDUCTION_X (minv_x0_u16, uint16_t, svuint16_t, ++ x0 = svminv_u16 (p0, z0), ++ x0 = svminv (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_u32.c +new file mode 100644 +index 000000000..18c9d8c6d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_u32.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** minv_x0_u32: ++** uminv (s[0-9]+), p0, z0\.s ++** fmov w0, \1 ++** ret ++*/ ++TEST_REDUCTION_X (minv_x0_u32, uint32_t, svuint32_t, ++ x0 = svminv_u32 (p0, z0), ++ x0 = svminv (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_u64.c +new file mode 100644 +index 000000000..374d5e426 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_u64.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** minv_x0_u64: ++** uminv (d[0-9]+), p0, z0\.d ++** fmov x0, \1 ++** ret ++*/ ++TEST_REDUCTION_X (minv_x0_u64, uint64_t, svuint64_t, ++ x0 = svminv_u64 (p0, z0), ++ x0 = svminv (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_u8.c +new file mode 100644 +index 000000000..d9f6f5835 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_u8.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** minv_x0_u8: ++** uminv b([0-9]+), p0, z0\.b ++** umov w0, v\1\.b\[0\] ++** ret ++*/ ++TEST_REDUCTION_X (minv_x0_u8, uint8_t, svuint8_t, ++ x0 = svminv_u8 (p0, z0), 
++ x0 = svminv (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_f16.c +new file mode 100644 +index 000000000..f22a582ef +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_f16.c +@@ -0,0 +1,398 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mla_f16_m_tied1: ++** fmla z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mla_f16_m_tied1, svfloat16_t, ++ z0 = svmla_f16_m (p0, z0, z1, z2), ++ z0 = svmla_m (p0, z0, z1, z2)) ++ ++/* ++** mla_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fmla z0\.h, p0/m, \1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mla_f16_m_tied2, svfloat16_t, ++ z0 = svmla_f16_m (p0, z1, z0, z2), ++ z0 = svmla_m (p0, z1, z0, z2)) ++ ++/* ++** mla_f16_m_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fmla z0\.h, p0/m, z2\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mla_f16_m_tied3, svfloat16_t, ++ z0 = svmla_f16_m (p0, z1, z2, z0), ++ z0 = svmla_m (p0, z1, z2, z0)) ++ ++/* ++** mla_f16_m_untied: ++** movprfx z0, z1 ++** fmla z0\.h, p0/m, z2\.h, z3\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mla_f16_m_untied, svfloat16_t, ++ z0 = svmla_f16_m (p0, z1, z2, z3), ++ z0 = svmla_m (p0, z1, z2, z3)) ++ ++/* ++** mla_h4_f16_m_tied1: ++** mov (z[0-9]+\.h), h4 ++** fmla z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mla_h4_f16_m_tied1, svfloat16_t, __fp16, ++ z0 = svmla_n_f16_m (p0, z0, z1, d4), ++ z0 = svmla_m (p0, z0, z1, d4)) ++ ++/* ++** mla_h4_f16_m_untied: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0, z1 ++** fmla z0\.h, p0/m, z2\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mla_h4_f16_m_untied, svfloat16_t, __fp16, ++ z0 = svmla_n_f16_m (p0, z1, z2, d4), ++ z0 = svmla_m (p0, z1, z2, d4)) ++ ++/* ++** mla_2_f16_m_tied1: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** fmla z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mla_2_f16_m_tied1, svfloat16_t, ++ z0 = svmla_n_f16_m (p0, z0, z1, 2), ++ z0 = svmla_m (p0, z0, z1, 2)) ++ ++/* ++** mla_2_f16_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? 
++** movprfx z0, z1 ++** fmla z0\.h, p0/m, z2\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mla_2_f16_m_untied, svfloat16_t, ++ z0 = svmla_n_f16_m (p0, z1, z2, 2), ++ z0 = svmla_m (p0, z1, z2, 2)) ++ ++/* ++** mla_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fmla z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mla_f16_z_tied1, svfloat16_t, ++ z0 = svmla_f16_z (p0, z0, z1, z2), ++ z0 = svmla_z (p0, z0, z1, z2)) ++ ++/* ++** mla_f16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** fmad z0\.h, p0/m, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mla_f16_z_tied2, svfloat16_t, ++ z0 = svmla_f16_z (p0, z1, z0, z2), ++ z0 = svmla_z (p0, z1, z0, z2)) ++ ++/* ++** mla_f16_z_tied3: ++** movprfx z0\.h, p0/z, z0\.h ++** fmad z0\.h, p0/m, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mla_f16_z_tied3, svfloat16_t, ++ z0 = svmla_f16_z (p0, z1, z2, z0), ++ z0 = svmla_z (p0, z1, z2, z0)) ++ ++/* ++** mla_f16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fmla z0\.h, p0/m, z2\.h, z3\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** fmad z0\.h, p0/m, z3\.h, z1\.h ++** | ++** movprfx z0\.h, p0/z, z3\.h ++** fmad z0\.h, p0/m, z2\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mla_f16_z_untied, svfloat16_t, ++ z0 = svmla_f16_z (p0, z1, z2, z3), ++ z0 = svmla_z (p0, z1, z2, z3)) ++ ++/* ++** mla_h4_f16_z_tied1: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0\.h, p0/z, z0\.h ++** fmla z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mla_h4_f16_z_tied1, svfloat16_t, __fp16, ++ z0 = svmla_n_f16_z (p0, z0, z1, d4), ++ z0 = svmla_z (p0, z0, z1, d4)) ++ ++/* ++** mla_h4_f16_z_tied2: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0\.h, p0/z, z0\.h ++** fmad z0\.h, p0/m, \1, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZD (mla_h4_f16_z_tied2, svfloat16_t, __fp16, ++ z0 = svmla_n_f16_z (p0, z1, z0, d4), ++ z0 = svmla_z (p0, z1, z0, d4)) ++ ++/* ++** mla_h4_f16_z_untied: ++** mov (z[0-9]+\.h), h4 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fmla z0\.h, p0/m, z2\.h, \1 ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** fmad z0\.h, p0/m, \1, z1\.h ++** | ++** movprfx z0\.h, p0/z, \1 ++** fmad z0\.h, p0/m, z2\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (mla_h4_f16_z_untied, svfloat16_t, __fp16, ++ z0 = svmla_n_f16_z (p0, z1, z2, d4), ++ z0 = svmla_z (p0, z1, z2, d4)) ++ ++/* ++** mla_2_f16_z_tied1: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** movprfx z0\.h, p0/z, z0\.h ++** fmla z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mla_2_f16_z_tied1, svfloat16_t, ++ z0 = svmla_n_f16_z (p0, z0, z1, 2), ++ z0 = svmla_z (p0, z0, z1, 2)) ++ ++/* ++** mla_2_f16_z_tied2: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** movprfx z0\.h, p0/z, z0\.h ++** fmad z0\.h, p0/m, \1, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mla_2_f16_z_tied2, svfloat16_t, ++ z0 = svmla_n_f16_z (p0, z1, z0, 2), ++ z0 = svmla_z (p0, z1, z0, 2)) ++ ++/* ++** mla_2_f16_z_untied: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? 
++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fmla z0\.h, p0/m, z2\.h, \1 ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** fmad z0\.h, p0/m, \1, z1\.h ++** | ++** movprfx z0\.h, p0/z, \1 ++** fmad z0\.h, p0/m, z2\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mla_2_f16_z_untied, svfloat16_t, ++ z0 = svmla_n_f16_z (p0, z1, z2, 2), ++ z0 = svmla_z (p0, z1, z2, 2)) ++ ++/* ++** mla_f16_x_tied1: ++** fmla z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mla_f16_x_tied1, svfloat16_t, ++ z0 = svmla_f16_x (p0, z0, z1, z2), ++ z0 = svmla_x (p0, z0, z1, z2)) ++ ++/* ++** mla_f16_x_tied2: ++** fmad z0\.h, p0/m, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mla_f16_x_tied2, svfloat16_t, ++ z0 = svmla_f16_x (p0, z1, z0, z2), ++ z0 = svmla_x (p0, z1, z0, z2)) ++ ++/* ++** mla_f16_x_tied3: ++** fmad z0\.h, p0/m, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mla_f16_x_tied3, svfloat16_t, ++ z0 = svmla_f16_x (p0, z1, z2, z0), ++ z0 = svmla_x (p0, z1, z2, z0)) ++ ++/* ++** mla_f16_x_untied: ++** ( ++** movprfx z0, z1 ++** fmla z0\.h, p0/m, z2\.h, z3\.h ++** | ++** movprfx z0, z2 ++** fmad z0\.h, p0/m, z3\.h, z1\.h ++** | ++** movprfx z0, z3 ++** fmad z0\.h, p0/m, z2\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mla_f16_x_untied, svfloat16_t, ++ z0 = svmla_f16_x (p0, z1, z2, z3), ++ z0 = svmla_x (p0, z1, z2, z3)) ++ ++/* ++** mla_h4_f16_x_tied1: ++** mov (z[0-9]+\.h), h4 ++** fmla z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mla_h4_f16_x_tied1, svfloat16_t, __fp16, ++ z0 = svmla_n_f16_x (p0, z0, z1, d4), ++ z0 = svmla_x (p0, z0, z1, d4)) ++ ++/* ++** mla_h4_f16_x_tied2: ++** mov (z[0-9]+\.h), h4 ++** fmad z0\.h, p0/m, \1, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZD (mla_h4_f16_x_tied2, svfloat16_t, __fp16, ++ z0 = svmla_n_f16_x (p0, z1, z0, d4), ++ z0 = svmla_x (p0, z1, z0, d4)) ++ ++/* ++** mla_h4_f16_x_untied: { xfail *-*-* } ++** mov z0\.h, h4 ++** fmad z0\.h, p0/m, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZD (mla_h4_f16_x_untied, svfloat16_t, __fp16, ++ z0 = svmla_n_f16_x (p0, z1, z2, d4), ++ z0 = svmla_x (p0, z1, z2, d4)) ++ ++/* ++** mla_2_f16_x_tied1: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** fmla z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mla_2_f16_x_tied1, svfloat16_t, ++ z0 = svmla_n_f16_x (p0, z0, z1, 2), ++ z0 = svmla_x (p0, z0, z1, 2)) ++ ++/* ++** mla_2_f16_x_tied2: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** fmad z0\.h, p0/m, \1, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mla_2_f16_x_tied2, svfloat16_t, ++ z0 = svmla_n_f16_x (p0, z1, z0, 2), ++ z0 = svmla_x (p0, z1, z0, 2)) ++ ++/* ++** mla_2_f16_x_untied: ++** fmov z0\.h, #2\.0(?:e\+0)? ++** fmad z0\.h, p0/m, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mla_2_f16_x_untied, svfloat16_t, ++ z0 = svmla_n_f16_x (p0, z1, z2, 2), ++ z0 = svmla_x (p0, z1, z2, 2)) ++ ++/* ++** ptrue_mla_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mla_f16_x_tied1, svfloat16_t, ++ z0 = svmla_f16_x (svptrue_b16 (), z0, z1, z2), ++ z0 = svmla_x (svptrue_b16 (), z0, z1, z2)) ++ ++/* ++** ptrue_mla_f16_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mla_f16_x_tied2, svfloat16_t, ++ z0 = svmla_f16_x (svptrue_b16 (), z1, z0, z2), ++ z0 = svmla_x (svptrue_b16 (), z1, z0, z2)) ++ ++/* ++** ptrue_mla_f16_x_tied3: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mla_f16_x_tied3, svfloat16_t, ++ z0 = svmla_f16_x (svptrue_b16 (), z1, z2, z0), ++ z0 = svmla_x (svptrue_b16 (), z1, z2, z0)) ++ ++/* ++** ptrue_mla_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mla_f16_x_untied, svfloat16_t, ++ z0 = svmla_f16_x (svptrue_b16 (), z1, z2, z3), ++ z0 = svmla_x (svptrue_b16 (), z1, z2, z3)) ++ ++/* ++** ptrue_mla_2_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mla_2_f16_x_tied1, svfloat16_t, ++ z0 = svmla_n_f16_x (svptrue_b16 (), z0, z1, 2), ++ z0 = svmla_x (svptrue_b16 (), z0, z1, 2)) ++ ++/* ++** ptrue_mla_2_f16_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mla_2_f16_x_tied2, svfloat16_t, ++ z0 = svmla_n_f16_x (svptrue_b16 (), z1, z0, 2), ++ z0 = svmla_x (svptrue_b16 (), z1, z0, 2)) ++ ++/* ++** ptrue_mla_2_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mla_2_f16_x_untied, svfloat16_t, ++ z0 = svmla_n_f16_x (svptrue_b16 (), z1, z2, 2), ++ z0 = svmla_x (svptrue_b16 (), z1, z2, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_f32.c +new file mode 100644 +index 000000000..1d95eb0a7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_f32.c +@@ -0,0 +1,398 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mla_f32_m_tied1: ++** fmla z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mla_f32_m_tied1, svfloat32_t, ++ z0 = svmla_f32_m (p0, z0, z1, z2), ++ z0 = svmla_m (p0, z0, z1, z2)) ++ ++/* ++** mla_f32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fmla z0\.s, p0/m, \1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mla_f32_m_tied2, svfloat32_t, ++ z0 = svmla_f32_m (p0, z1, z0, z2), ++ z0 = svmla_m (p0, z1, z0, z2)) ++ ++/* ++** mla_f32_m_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fmla z0\.s, p0/m, z2\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mla_f32_m_tied3, svfloat32_t, ++ z0 = svmla_f32_m (p0, z1, z2, z0), ++ z0 = svmla_m (p0, z1, z2, z0)) ++ ++/* ++** mla_f32_m_untied: ++** movprfx z0, z1 ++** fmla z0\.s, p0/m, z2\.s, z3\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mla_f32_m_untied, svfloat32_t, ++ z0 = svmla_f32_m (p0, z1, z2, z3), ++ z0 = svmla_m (p0, z1, z2, z3)) ++ ++/* ++** mla_s4_f32_m_tied1: ++** mov (z[0-9]+\.s), s4 ++** fmla z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mla_s4_f32_m_tied1, svfloat32_t, float, ++ z0 = svmla_n_f32_m (p0, z0, z1, d4), ++ z0 = svmla_m (p0, z0, z1, d4)) ++ ++/* ++** mla_s4_f32_m_untied: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0, z1 ++** fmla z0\.s, p0/m, z2\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mla_s4_f32_m_untied, svfloat32_t, float, ++ z0 = svmla_n_f32_m (p0, z1, z2, d4), ++ z0 = svmla_m (p0, z1, z2, d4)) ++ ++/* ++** mla_2_f32_m_tied1: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** fmla z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mla_2_f32_m_tied1, svfloat32_t, ++ z0 = svmla_n_f32_m (p0, z0, z1, 2), ++ z0 = svmla_m (p0, z0, z1, 2)) ++ ++/* ++** mla_2_f32_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
++** movprfx z0, z1 ++** fmla z0\.s, p0/m, z2\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mla_2_f32_m_untied, svfloat32_t, ++ z0 = svmla_n_f32_m (p0, z1, z2, 2), ++ z0 = svmla_m (p0, z1, z2, 2)) ++ ++/* ++** mla_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fmla z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mla_f32_z_tied1, svfloat32_t, ++ z0 = svmla_f32_z (p0, z0, z1, z2), ++ z0 = svmla_z (p0, z0, z1, z2)) ++ ++/* ++** mla_f32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** fmad z0\.s, p0/m, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mla_f32_z_tied2, svfloat32_t, ++ z0 = svmla_f32_z (p0, z1, z0, z2), ++ z0 = svmla_z (p0, z1, z0, z2)) ++ ++/* ++** mla_f32_z_tied3: ++** movprfx z0\.s, p0/z, z0\.s ++** fmad z0\.s, p0/m, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mla_f32_z_tied3, svfloat32_t, ++ z0 = svmla_f32_z (p0, z1, z2, z0), ++ z0 = svmla_z (p0, z1, z2, z0)) ++ ++/* ++** mla_f32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fmla z0\.s, p0/m, z2\.s, z3\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** fmad z0\.s, p0/m, z3\.s, z1\.s ++** | ++** movprfx z0\.s, p0/z, z3\.s ++** fmad z0\.s, p0/m, z2\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mla_f32_z_untied, svfloat32_t, ++ z0 = svmla_f32_z (p0, z1, z2, z3), ++ z0 = svmla_z (p0, z1, z2, z3)) ++ ++/* ++** mla_s4_f32_z_tied1: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0\.s, p0/z, z0\.s ++** fmla z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mla_s4_f32_z_tied1, svfloat32_t, float, ++ z0 = svmla_n_f32_z (p0, z0, z1, d4), ++ z0 = svmla_z (p0, z0, z1, d4)) ++ ++/* ++** mla_s4_f32_z_tied2: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0\.s, p0/z, z0\.s ++** fmad z0\.s, p0/m, \1, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZD (mla_s4_f32_z_tied2, svfloat32_t, float, ++ z0 = svmla_n_f32_z (p0, z1, z0, d4), ++ z0 = svmla_z (p0, z1, z0, d4)) ++ ++/* ++** mla_s4_f32_z_untied: ++** mov (z[0-9]+\.s), s4 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fmla z0\.s, p0/m, z2\.s, \1 ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** fmad z0\.s, p0/m, \1, z1\.s ++** | ++** movprfx z0\.s, p0/z, \1 ++** fmad z0\.s, p0/m, z2\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (mla_s4_f32_z_untied, svfloat32_t, float, ++ z0 = svmla_n_f32_z (p0, z1, z2, d4), ++ z0 = svmla_z (p0, z1, z2, d4)) ++ ++/* ++** mla_2_f32_z_tied1: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** movprfx z0\.s, p0/z, z0\.s ++** fmla z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mla_2_f32_z_tied1, svfloat32_t, ++ z0 = svmla_n_f32_z (p0, z0, z1, 2), ++ z0 = svmla_z (p0, z0, z1, 2)) ++ ++/* ++** mla_2_f32_z_tied2: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** movprfx z0\.s, p0/z, z0\.s ++** fmad z0\.s, p0/m, \1, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mla_2_f32_z_tied2, svfloat32_t, ++ z0 = svmla_n_f32_z (p0, z1, z0, 2), ++ z0 = svmla_z (p0, z1, z0, 2)) ++ ++/* ++** mla_2_f32_z_untied: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fmla z0\.s, p0/m, z2\.s, \1 ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** fmad z0\.s, p0/m, \1, z1\.s ++** | ++** movprfx z0\.s, p0/z, \1 ++** fmad z0\.s, p0/m, z2\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mla_2_f32_z_untied, svfloat32_t, ++ z0 = svmla_n_f32_z (p0, z1, z2, 2), ++ z0 = svmla_z (p0, z1, z2, 2)) ++ ++/* ++** mla_f32_x_tied1: ++** fmla z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mla_f32_x_tied1, svfloat32_t, ++ z0 = svmla_f32_x (p0, z0, z1, z2), ++ z0 = svmla_x (p0, z0, z1, z2)) ++ ++/* ++** mla_f32_x_tied2: ++** fmad z0\.s, p0/m, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mla_f32_x_tied2, svfloat32_t, ++ z0 = svmla_f32_x (p0, z1, z0, z2), ++ z0 = svmla_x (p0, z1, z0, z2)) ++ ++/* ++** mla_f32_x_tied3: ++** fmad z0\.s, p0/m, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mla_f32_x_tied3, svfloat32_t, ++ z0 = svmla_f32_x (p0, z1, z2, z0), ++ z0 = svmla_x (p0, z1, z2, z0)) ++ ++/* ++** mla_f32_x_untied: ++** ( ++** movprfx z0, z1 ++** fmla z0\.s, p0/m, z2\.s, z3\.s ++** | ++** movprfx z0, z2 ++** fmad z0\.s, p0/m, z3\.s, z1\.s ++** | ++** movprfx z0, z3 ++** fmad z0\.s, p0/m, z2\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mla_f32_x_untied, svfloat32_t, ++ z0 = svmla_f32_x (p0, z1, z2, z3), ++ z0 = svmla_x (p0, z1, z2, z3)) ++ ++/* ++** mla_s4_f32_x_tied1: ++** mov (z[0-9]+\.s), s4 ++** fmla z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mla_s4_f32_x_tied1, svfloat32_t, float, ++ z0 = svmla_n_f32_x (p0, z0, z1, d4), ++ z0 = svmla_x (p0, z0, z1, d4)) ++ ++/* ++** mla_s4_f32_x_tied2: ++** mov (z[0-9]+\.s), s4 ++** fmad z0\.s, p0/m, \1, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZD (mla_s4_f32_x_tied2, svfloat32_t, float, ++ z0 = svmla_n_f32_x (p0, z1, z0, d4), ++ z0 = svmla_x (p0, z1, z0, d4)) ++ ++/* ++** mla_s4_f32_x_untied: { xfail *-*-* } ++** mov z0\.s, s4 ++** fmad z0\.s, p0/m, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZD (mla_s4_f32_x_untied, svfloat32_t, float, ++ z0 = svmla_n_f32_x (p0, z1, z2, d4), ++ z0 = svmla_x (p0, z1, z2, d4)) ++ ++/* ++** mla_2_f32_x_tied1: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** fmla z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mla_2_f32_x_tied1, svfloat32_t, ++ z0 = svmla_n_f32_x (p0, z0, z1, 2), ++ z0 = svmla_x (p0, z0, z1, 2)) ++ ++/* ++** mla_2_f32_x_tied2: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** fmad z0\.s, p0/m, \1, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mla_2_f32_x_tied2, svfloat32_t, ++ z0 = svmla_n_f32_x (p0, z1, z0, 2), ++ z0 = svmla_x (p0, z1, z0, 2)) ++ ++/* ++** mla_2_f32_x_untied: ++** fmov z0\.s, #2\.0(?:e\+0)? ++** fmad z0\.s, p0/m, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mla_2_f32_x_untied, svfloat32_t, ++ z0 = svmla_n_f32_x (p0, z1, z2, 2), ++ z0 = svmla_x (p0, z1, z2, 2)) ++ ++/* ++** ptrue_mla_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mla_f32_x_tied1, svfloat32_t, ++ z0 = svmla_f32_x (svptrue_b32 (), z0, z1, z2), ++ z0 = svmla_x (svptrue_b32 (), z0, z1, z2)) ++ ++/* ++** ptrue_mla_f32_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mla_f32_x_tied2, svfloat32_t, ++ z0 = svmla_f32_x (svptrue_b32 (), z1, z0, z2), ++ z0 = svmla_x (svptrue_b32 (), z1, z0, z2)) ++ ++/* ++** ptrue_mla_f32_x_tied3: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mla_f32_x_tied3, svfloat32_t, ++ z0 = svmla_f32_x (svptrue_b32 (), z1, z2, z0), ++ z0 = svmla_x (svptrue_b32 (), z1, z2, z0)) ++ ++/* ++** ptrue_mla_f32_x_untied: ++** ... 
++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mla_f32_x_untied, svfloat32_t, ++ z0 = svmla_f32_x (svptrue_b32 (), z1, z2, z3), ++ z0 = svmla_x (svptrue_b32 (), z1, z2, z3)) ++ ++/* ++** ptrue_mla_2_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mla_2_f32_x_tied1, svfloat32_t, ++ z0 = svmla_n_f32_x (svptrue_b32 (), z0, z1, 2), ++ z0 = svmla_x (svptrue_b32 (), z0, z1, 2)) ++ ++/* ++** ptrue_mla_2_f32_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mla_2_f32_x_tied2, svfloat32_t, ++ z0 = svmla_n_f32_x (svptrue_b32 (), z1, z0, 2), ++ z0 = svmla_x (svptrue_b32 (), z1, z0, 2)) ++ ++/* ++** ptrue_mla_2_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mla_2_f32_x_untied, svfloat32_t, ++ z0 = svmla_n_f32_x (svptrue_b32 (), z1, z2, 2), ++ z0 = svmla_x (svptrue_b32 (), z1, z2, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_f64.c +new file mode 100644 +index 000000000..74fd29267 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_f64.c +@@ -0,0 +1,398 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mla_f64_m_tied1: ++** fmla z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mla_f64_m_tied1, svfloat64_t, ++ z0 = svmla_f64_m (p0, z0, z1, z2), ++ z0 = svmla_m (p0, z0, z1, z2)) ++ ++/* ++** mla_f64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fmla z0\.d, p0/m, \1, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mla_f64_m_tied2, svfloat64_t, ++ z0 = svmla_f64_m (p0, z1, z0, z2), ++ z0 = svmla_m (p0, z1, z0, z2)) ++ ++/* ++** mla_f64_m_tied3: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fmla z0\.d, p0/m, z2\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mla_f64_m_tied3, svfloat64_t, ++ z0 = svmla_f64_m (p0, z1, z2, z0), ++ z0 = svmla_m (p0, z1, z2, z0)) ++ ++/* ++** mla_f64_m_untied: ++** movprfx z0, z1 ++** fmla z0\.d, p0/m, z2\.d, z3\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mla_f64_m_untied, svfloat64_t, ++ z0 = svmla_f64_m (p0, z1, z2, z3), ++ z0 = svmla_m (p0, z1, z2, z3)) ++ ++/* ++** mla_d4_f64_m_tied1: ++** mov (z[0-9]+\.d), d4 ++** fmla z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mla_d4_f64_m_tied1, svfloat64_t, double, ++ z0 = svmla_n_f64_m (p0, z0, z1, d4), ++ z0 = svmla_m (p0, z0, z1, d4)) ++ ++/* ++** mla_d4_f64_m_untied: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0, z1 ++** fmla z0\.d, p0/m, z2\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mla_d4_f64_m_untied, svfloat64_t, double, ++ z0 = svmla_n_f64_m (p0, z1, z2, d4), ++ z0 = svmla_m (p0, z1, z2, d4)) ++ ++/* ++** mla_2_f64_m_tied1: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** fmla z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mla_2_f64_m_tied1, svfloat64_t, ++ z0 = svmla_n_f64_m (p0, z0, z1, 2), ++ z0 = svmla_m (p0, z0, z1, 2)) ++ ++/* ++** mla_2_f64_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
++** movprfx z0, z1 ++** fmla z0\.d, p0/m, z2\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mla_2_f64_m_untied, svfloat64_t, ++ z0 = svmla_n_f64_m (p0, z1, z2, 2), ++ z0 = svmla_m (p0, z1, z2, 2)) ++ ++/* ++** mla_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fmla z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mla_f64_z_tied1, svfloat64_t, ++ z0 = svmla_f64_z (p0, z0, z1, z2), ++ z0 = svmla_z (p0, z0, z1, z2)) ++ ++/* ++** mla_f64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** fmad z0\.d, p0/m, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mla_f64_z_tied2, svfloat64_t, ++ z0 = svmla_f64_z (p0, z1, z0, z2), ++ z0 = svmla_z (p0, z1, z0, z2)) ++ ++/* ++** mla_f64_z_tied3: ++** movprfx z0\.d, p0/z, z0\.d ++** fmad z0\.d, p0/m, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mla_f64_z_tied3, svfloat64_t, ++ z0 = svmla_f64_z (p0, z1, z2, z0), ++ z0 = svmla_z (p0, z1, z2, z0)) ++ ++/* ++** mla_f64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fmla z0\.d, p0/m, z2\.d, z3\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** fmad z0\.d, p0/m, z3\.d, z1\.d ++** | ++** movprfx z0\.d, p0/z, z3\.d ++** fmad z0\.d, p0/m, z2\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mla_f64_z_untied, svfloat64_t, ++ z0 = svmla_f64_z (p0, z1, z2, z3), ++ z0 = svmla_z (p0, z1, z2, z3)) ++ ++/* ++** mla_d4_f64_z_tied1: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0\.d, p0/z, z0\.d ++** fmla z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mla_d4_f64_z_tied1, svfloat64_t, double, ++ z0 = svmla_n_f64_z (p0, z0, z1, d4), ++ z0 = svmla_z (p0, z0, z1, d4)) ++ ++/* ++** mla_d4_f64_z_tied2: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0\.d, p0/z, z0\.d ++** fmad z0\.d, p0/m, \1, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZD (mla_d4_f64_z_tied2, svfloat64_t, double, ++ z0 = svmla_n_f64_z (p0, z1, z0, d4), ++ z0 = svmla_z (p0, z1, z0, d4)) ++ ++/* ++** mla_d4_f64_z_untied: ++** mov (z[0-9]+\.d), d4 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fmla z0\.d, p0/m, z2\.d, \1 ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** fmad z0\.d, p0/m, \1, z1\.d ++** | ++** movprfx z0\.d, p0/z, \1 ++** fmad z0\.d, p0/m, z2\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (mla_d4_f64_z_untied, svfloat64_t, double, ++ z0 = svmla_n_f64_z (p0, z1, z2, d4), ++ z0 = svmla_z (p0, z1, z2, d4)) ++ ++/* ++** mla_2_f64_z_tied1: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** movprfx z0\.d, p0/z, z0\.d ++** fmla z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mla_2_f64_z_tied1, svfloat64_t, ++ z0 = svmla_n_f64_z (p0, z0, z1, 2), ++ z0 = svmla_z (p0, z0, z1, 2)) ++ ++/* ++** mla_2_f64_z_tied2: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** movprfx z0\.d, p0/z, z0\.d ++** fmad z0\.d, p0/m, \1, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mla_2_f64_z_tied2, svfloat64_t, ++ z0 = svmla_n_f64_z (p0, z1, z0, 2), ++ z0 = svmla_z (p0, z1, z0, 2)) ++ ++/* ++** mla_2_f64_z_untied: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fmla z0\.d, p0/m, z2\.d, \1 ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** fmad z0\.d, p0/m, \1, z1\.d ++** | ++** movprfx z0\.d, p0/z, \1 ++** fmad z0\.d, p0/m, z2\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mla_2_f64_z_untied, svfloat64_t, ++ z0 = svmla_n_f64_z (p0, z1, z2, 2), ++ z0 = svmla_z (p0, z1, z2, 2)) ++ ++/* ++** mla_f64_x_tied1: ++** fmla z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mla_f64_x_tied1, svfloat64_t, ++ z0 = svmla_f64_x (p0, z0, z1, z2), ++ z0 = svmla_x (p0, z0, z1, z2)) ++ ++/* ++** mla_f64_x_tied2: ++** fmad z0\.d, p0/m, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mla_f64_x_tied2, svfloat64_t, ++ z0 = svmla_f64_x (p0, z1, z0, z2), ++ z0 = svmla_x (p0, z1, z0, z2)) ++ ++/* ++** mla_f64_x_tied3: ++** fmad z0\.d, p0/m, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mla_f64_x_tied3, svfloat64_t, ++ z0 = svmla_f64_x (p0, z1, z2, z0), ++ z0 = svmla_x (p0, z1, z2, z0)) ++ ++/* ++** mla_f64_x_untied: ++** ( ++** movprfx z0, z1 ++** fmla z0\.d, p0/m, z2\.d, z3\.d ++** | ++** movprfx z0, z2 ++** fmad z0\.d, p0/m, z3\.d, z1\.d ++** | ++** movprfx z0, z3 ++** fmad z0\.d, p0/m, z2\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mla_f64_x_untied, svfloat64_t, ++ z0 = svmla_f64_x (p0, z1, z2, z3), ++ z0 = svmla_x (p0, z1, z2, z3)) ++ ++/* ++** mla_d4_f64_x_tied1: ++** mov (z[0-9]+\.d), d4 ++** fmla z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mla_d4_f64_x_tied1, svfloat64_t, double, ++ z0 = svmla_n_f64_x (p0, z0, z1, d4), ++ z0 = svmla_x (p0, z0, z1, d4)) ++ ++/* ++** mla_d4_f64_x_tied2: ++** mov (z[0-9]+\.d), d4 ++** fmad z0\.d, p0/m, \1, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZD (mla_d4_f64_x_tied2, svfloat64_t, double, ++ z0 = svmla_n_f64_x (p0, z1, z0, d4), ++ z0 = svmla_x (p0, z1, z0, d4)) ++ ++/* ++** mla_d4_f64_x_untied: { xfail *-*-* } ++** mov z0\.d, d4 ++** fmad z0\.d, p0/m, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZD (mla_d4_f64_x_untied, svfloat64_t, double, ++ z0 = svmla_n_f64_x (p0, z1, z2, d4), ++ z0 = svmla_x (p0, z1, z2, d4)) ++ ++/* ++** mla_2_f64_x_tied1: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** fmla z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mla_2_f64_x_tied1, svfloat64_t, ++ z0 = svmla_n_f64_x (p0, z0, z1, 2), ++ z0 = svmla_x (p0, z0, z1, 2)) ++ ++/* ++** mla_2_f64_x_tied2: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** fmad z0\.d, p0/m, \1, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mla_2_f64_x_tied2, svfloat64_t, ++ z0 = svmla_n_f64_x (p0, z1, z0, 2), ++ z0 = svmla_x (p0, z1, z0, 2)) ++ ++/* ++** mla_2_f64_x_untied: ++** fmov z0\.d, #2\.0(?:e\+0)? ++** fmad z0\.d, p0/m, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mla_2_f64_x_untied, svfloat64_t, ++ z0 = svmla_n_f64_x (p0, z1, z2, 2), ++ z0 = svmla_x (p0, z1, z2, 2)) ++ ++/* ++** ptrue_mla_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mla_f64_x_tied1, svfloat64_t, ++ z0 = svmla_f64_x (svptrue_b64 (), z0, z1, z2), ++ z0 = svmla_x (svptrue_b64 (), z0, z1, z2)) ++ ++/* ++** ptrue_mla_f64_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mla_f64_x_tied2, svfloat64_t, ++ z0 = svmla_f64_x (svptrue_b64 (), z1, z0, z2), ++ z0 = svmla_x (svptrue_b64 (), z1, z0, z2)) ++ ++/* ++** ptrue_mla_f64_x_tied3: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mla_f64_x_tied3, svfloat64_t, ++ z0 = svmla_f64_x (svptrue_b64 (), z1, z2, z0), ++ z0 = svmla_x (svptrue_b64 (), z1, z2, z0)) ++ ++/* ++** ptrue_mla_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mla_f64_x_untied, svfloat64_t, ++ z0 = svmla_f64_x (svptrue_b64 (), z1, z2, z3), ++ z0 = svmla_x (svptrue_b64 (), z1, z2, z3)) ++ ++/* ++** ptrue_mla_2_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mla_2_f64_x_tied1, svfloat64_t, ++ z0 = svmla_n_f64_x (svptrue_b64 (), z0, z1, 2), ++ z0 = svmla_x (svptrue_b64 (), z0, z1, 2)) ++ ++/* ++** ptrue_mla_2_f64_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mla_2_f64_x_tied2, svfloat64_t, ++ z0 = svmla_n_f64_x (svptrue_b64 (), z1, z0, 2), ++ z0 = svmla_x (svptrue_b64 (), z1, z0, 2)) ++ ++/* ++** ptrue_mla_2_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mla_2_f64_x_untied, svfloat64_t, ++ z0 = svmla_n_f64_x (svptrue_b64 (), z1, z2, 2), ++ z0 = svmla_x (svptrue_b64 (), z1, z2, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_lane_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_lane_f16.c +new file mode 100644 +index 000000000..949e3bb47 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_lane_f16.c +@@ -0,0 +1,128 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mla_lane_0_f16_tied1: ++** fmla z0\.h, z1\.h, z2\.h\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (mla_lane_0_f16_tied1, svfloat16_t, ++ z0 = svmla_lane_f16 (z0, z1, z2, 0), ++ z0 = svmla_lane (z0, z1, z2, 0)) ++ ++/* ++** mla_lane_0_f16_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fmla z0\.h, \1\.h, z2\.h\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (mla_lane_0_f16_tied2, svfloat16_t, ++ z0 = svmla_lane_f16 (z1, z0, z2, 0), ++ z0 = svmla_lane (z1, z0, z2, 0)) ++ ++/* ++** mla_lane_0_f16_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fmla z0\.h, z2\.h, \1\.h\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (mla_lane_0_f16_tied3, svfloat16_t, ++ z0 = svmla_lane_f16 (z1, z2, z0, 0), ++ z0 = svmla_lane (z1, z2, z0, 0)) ++ ++/* ++** mla_lane_0_f16_untied: ++** movprfx z0, z1 ++** fmla z0\.h, z2\.h, z3\.h\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (mla_lane_0_f16_untied, svfloat16_t, ++ z0 = svmla_lane_f16 (z1, z2, z3, 0), ++ z0 = svmla_lane (z1, z2, z3, 0)) ++ ++/* ++** mla_lane_1_f16: ++** fmla z0\.h, z1\.h, z2\.h\[1\] ++** ret ++*/ ++TEST_UNIFORM_Z (mla_lane_1_f16, svfloat16_t, ++ z0 = svmla_lane_f16 (z0, z1, z2, 1), ++ z0 = svmla_lane (z0, z1, z2, 1)) ++ ++/* ++** mla_lane_2_f16: ++** fmla z0\.h, z1\.h, z2\.h\[2\] ++** ret ++*/ ++TEST_UNIFORM_Z (mla_lane_2_f16, svfloat16_t, ++ z0 = svmla_lane_f16 (z0, z1, z2, 2), ++ z0 = svmla_lane (z0, z1, z2, 2)) ++ ++/* ++** mla_lane_3_f16: ++** fmla z0\.h, z1\.h, z2\.h\[3\] ++** ret ++*/ ++TEST_UNIFORM_Z (mla_lane_3_f16, svfloat16_t, ++ z0 = svmla_lane_f16 (z0, z1, z2, 3), ++ z0 = svmla_lane (z0, z1, z2, 3)) ++ ++/* ++** mla_lane_4_f16: ++** fmla z0\.h, z1\.h, z2\.h\[4\] ++** ret ++*/ ++TEST_UNIFORM_Z (mla_lane_4_f16, svfloat16_t, ++ z0 = svmla_lane_f16 (z0, z1, z2, 4), ++ z0 = svmla_lane (z0, z1, z2, 4)) ++ ++/* ++** mla_lane_5_f16: ++** fmla z0\.h, z1\.h, z2\.h\[5\] ++** ret ++*/ ++TEST_UNIFORM_Z (mla_lane_5_f16, svfloat16_t, ++ z0 = svmla_lane_f16 (z0, z1, z2, 5), ++ z0 = svmla_lane (z0, z1, z2, 5)) ++ ++/* ++** 
mla_lane_6_f16: ++** fmla z0\.h, z1\.h, z2\.h\[6\] ++** ret ++*/ ++TEST_UNIFORM_Z (mla_lane_6_f16, svfloat16_t, ++ z0 = svmla_lane_f16 (z0, z1, z2, 6), ++ z0 = svmla_lane (z0, z1, z2, 6)) ++ ++/* ++** mla_lane_7_f16: ++** fmla z0\.h, z1\.h, z2\.h\[7\] ++** ret ++*/ ++TEST_UNIFORM_Z (mla_lane_7_f16, svfloat16_t, ++ z0 = svmla_lane_f16 (z0, z1, z2, 7), ++ z0 = svmla_lane (z0, z1, z2, 7)) ++ ++/* ++** mla_lane_z7_f16: ++** fmla z0\.h, z1\.h, z7\.h\[7\] ++** ret ++*/ ++TEST_DUAL_Z (mla_lane_z7_f16, svfloat16_t, svfloat16_t, ++ z0 = svmla_lane_f16 (z0, z1, z7, 7), ++ z0 = svmla_lane (z0, z1, z7, 7)) ++ ++/* ++** mla_lane_z8_f16: ++** str d8, \[sp, -16\]! ++** mov (z[0-7])\.d, z8\.d ++** fmla z0\.h, z1\.h, \1\.h\[7\] ++** ldr d8, \[sp\], 16 ++** ret ++*/ ++TEST_DUAL_LANE_REG (mla_lane_z8_f16, svfloat16_t, svfloat16_t, z8, ++ z0 = svmla_lane_f16 (z0, z1, z8, 7), ++ z0 = svmla_lane (z0, z1, z8, 7)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_lane_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_lane_f32.c +new file mode 100644 +index 000000000..d376532d6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_lane_f32.c +@@ -0,0 +1,92 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mla_lane_0_f32_tied1: ++** fmla z0\.s, z1\.s, z2\.s\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (mla_lane_0_f32_tied1, svfloat32_t, ++ z0 = svmla_lane_f32 (z0, z1, z2, 0), ++ z0 = svmla_lane (z0, z1, z2, 0)) ++ ++/* ++** mla_lane_0_f32_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fmla z0\.s, \1\.s, z2\.s\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (mla_lane_0_f32_tied2, svfloat32_t, ++ z0 = svmla_lane_f32 (z1, z0, z2, 0), ++ z0 = svmla_lane (z1, z0, z2, 0)) ++ ++/* ++** mla_lane_0_f32_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fmla z0\.s, z2\.s, \1\.s\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (mla_lane_0_f32_tied3, svfloat32_t, ++ z0 = svmla_lane_f32 (z1, z2, z0, 0), ++ z0 = svmla_lane (z1, z2, z0, 0)) ++ ++/* ++** mla_lane_0_f32_untied: ++** movprfx z0, z1 ++** fmla z0\.s, z2\.s, z3\.s\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (mla_lane_0_f32_untied, svfloat32_t, ++ z0 = svmla_lane_f32 (z1, z2, z3, 0), ++ z0 = svmla_lane (z1, z2, z3, 0)) ++ ++/* ++** mla_lane_1_f32: ++** fmla z0\.s, z1\.s, z2\.s\[1\] ++** ret ++*/ ++TEST_UNIFORM_Z (mla_lane_1_f32, svfloat32_t, ++ z0 = svmla_lane_f32 (z0, z1, z2, 1), ++ z0 = svmla_lane (z0, z1, z2, 1)) ++ ++/* ++** mla_lane_2_f32: ++** fmla z0\.s, z1\.s, z2\.s\[2\] ++** ret ++*/ ++TEST_UNIFORM_Z (mla_lane_2_f32, svfloat32_t, ++ z0 = svmla_lane_f32 (z0, z1, z2, 2), ++ z0 = svmla_lane (z0, z1, z2, 2)) ++ ++/* ++** mla_lane_3_f32: ++** fmla z0\.s, z1\.s, z2\.s\[3\] ++** ret ++*/ ++TEST_UNIFORM_Z (mla_lane_3_f32, svfloat32_t, ++ z0 = svmla_lane_f32 (z0, z1, z2, 3), ++ z0 = svmla_lane (z0, z1, z2, 3)) ++ ++/* ++** mla_lane_z7_f32: ++** fmla z0\.s, z1\.s, z7\.s\[3\] ++** ret ++*/ ++TEST_DUAL_Z (mla_lane_z7_f32, svfloat32_t, svfloat32_t, ++ z0 = svmla_lane_f32 (z0, z1, z7, 3), ++ z0 = svmla_lane (z0, z1, z7, 3)) ++ ++/* ++** mla_lane_z8_f32: ++** str d8, \[sp, -16\]! 
++** mov (z[0-7])\.d, z8\.d ++** fmla z0\.s, z1\.s, \1\.s\[3\] ++** ldr d8, \[sp\], 16 ++** ret ++*/ ++TEST_DUAL_LANE_REG (mla_lane_z8_f32, svfloat32_t, svfloat32_t, z8, ++ z0 = svmla_lane_f32 (z0, z1, z8, 3), ++ z0 = svmla_lane (z0, z1, z8, 3)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_lane_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_lane_f64.c +new file mode 100644 +index 000000000..7c58a8a57 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_lane_f64.c +@@ -0,0 +1,83 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mla_lane_0_f64_tied1: ++** fmla z0\.d, z1\.d, z2\.d\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (mla_lane_0_f64_tied1, svfloat64_t, ++ z0 = svmla_lane_f64 (z0, z1, z2, 0), ++ z0 = svmla_lane (z0, z1, z2, 0)) ++ ++/* ++** mla_lane_0_f64_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fmla z0\.d, \1, z2\.d\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (mla_lane_0_f64_tied2, svfloat64_t, ++ z0 = svmla_lane_f64 (z1, z0, z2, 0), ++ z0 = svmla_lane (z1, z0, z2, 0)) ++ ++/* ++** mla_lane_0_f64_tied3: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fmla z0\.d, z2\.d, \1\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (mla_lane_0_f64_tied3, svfloat64_t, ++ z0 = svmla_lane_f64 (z1, z2, z0, 0), ++ z0 = svmla_lane (z1, z2, z0, 0)) ++ ++/* ++** mla_lane_0_f64_untied: ++** movprfx z0, z1 ++** fmla z0\.d, z2\.d, z3\.d\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (mla_lane_0_f64_untied, svfloat64_t, ++ z0 = svmla_lane_f64 (z1, z2, z3, 0), ++ z0 = svmla_lane (z1, z2, z3, 0)) ++ ++/* ++** mla_lane_1_f64: ++** fmla z0\.d, z1\.d, z2\.d\[1\] ++** ret ++*/ ++TEST_UNIFORM_Z (mla_lane_1_f64, svfloat64_t, ++ z0 = svmla_lane_f64 (z0, z1, z2, 1), ++ z0 = svmla_lane (z0, z1, z2, 1)) ++ ++/* ++** mla_lane_z7_f64: ++** fmla z0\.d, z1\.d, z7\.d\[1\] ++** ret ++*/ ++TEST_DUAL_Z (mla_lane_z7_f64, svfloat64_t, svfloat64_t, ++ z0 = svmla_lane_f64 (z0, z1, z7, 1), ++ z0 = svmla_lane (z0, z1, z7, 1)) ++ ++/* ++** mla_lane_z15_f64: ++** str d15, \[sp, -16\]! 
++** fmla z0\.d, z1\.d, z15\.d\[1\] ++** ldr d15, \[sp\], 16 ++** ret ++*/ ++TEST_DUAL_LANE_REG (mla_lane_z15_f64, svfloat64_t, svfloat64_t, z15, ++ z0 = svmla_lane_f64 (z0, z1, z15, 1), ++ z0 = svmla_lane (z0, z1, z15, 1)) ++ ++/* ++** mla_lane_z16_f64: ++** mov (z[0-9]|z1[0-5])\.d, z16\.d ++** fmla z0\.d, z1\.d, \1\.d\[1\] ++** ret ++*/ ++TEST_DUAL_LANE_REG (mla_lane_z16_f64, svfloat64_t, svfloat64_t, z16, ++ z0 = svmla_lane_f64 (z0, z1, z16, 1), ++ z0 = svmla_lane (z0, z1, z16, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_s16.c +new file mode 100644 +index 000000000..f3ed191db +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_s16.c +@@ -0,0 +1,321 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mla_s16_m_tied1: ++** mla z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mla_s16_m_tied1, svint16_t, ++ z0 = svmla_s16_m (p0, z0, z1, z2), ++ z0 = svmla_m (p0, z0, z1, z2)) ++ ++/* ++** mla_s16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** mla z0\.h, p0/m, \1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mla_s16_m_tied2, svint16_t, ++ z0 = svmla_s16_m (p0, z1, z0, z2), ++ z0 = svmla_m (p0, z1, z0, z2)) ++ ++/* ++** mla_s16_m_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** mla z0\.h, p0/m, z2\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mla_s16_m_tied3, svint16_t, ++ z0 = svmla_s16_m (p0, z1, z2, z0), ++ z0 = svmla_m (p0, z1, z2, z0)) ++ ++/* ++** mla_s16_m_untied: ++** movprfx z0, z1 ++** mla z0\.h, p0/m, z2\.h, z3\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mla_s16_m_untied, svint16_t, ++ z0 = svmla_s16_m (p0, z1, z2, z3), ++ z0 = svmla_m (p0, z1, z2, z3)) ++ ++/* ++** mla_w0_s16_m_tied1: ++** mov (z[0-9]+\.h), w0 ++** mla z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_w0_s16_m_tied1, svint16_t, int16_t, ++ z0 = svmla_n_s16_m (p0, z0, z1, x0), ++ z0 = svmla_m (p0, z0, z1, x0)) ++ ++/* ++** mla_w0_s16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), w0 ++** movprfx z0, z1 ++** mla z0\.h, p0/m, z2\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_w0_s16_m_untied, svint16_t, int16_t, ++ z0 = svmla_n_s16_m (p0, z1, z2, x0), ++ z0 = svmla_m (p0, z1, z2, x0)) ++ ++/* ++** mla_11_s16_m_tied1: ++** mov (z[0-9]+\.h), #11 ++** mla z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_s16_m_tied1, svint16_t, ++ z0 = svmla_n_s16_m (p0, z0, z1, 11), ++ z0 = svmla_m (p0, z0, z1, 11)) ++ ++/* ++** mla_11_s16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), #11 ++** movprfx z0, z1 ++** mla z0\.h, p0/m, z2\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_s16_m_untied, svint16_t, ++ z0 = svmla_n_s16_m (p0, z1, z2, 11), ++ z0 = svmla_m (p0, z1, z2, 11)) ++ ++/* ++** mla_s16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** mla z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mla_s16_z_tied1, svint16_t, ++ z0 = svmla_s16_z (p0, z0, z1, z2), ++ z0 = svmla_z (p0, z0, z1, z2)) ++ ++/* ++** mla_s16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** mad z0\.h, p0/m, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mla_s16_z_tied2, svint16_t, ++ z0 = svmla_s16_z (p0, z1, z0, z2), ++ z0 = svmla_z (p0, z1, z0, z2)) ++ ++/* ++** mla_s16_z_tied3: ++** movprfx z0\.h, p0/z, z0\.h ++** mad z0\.h, p0/m, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mla_s16_z_tied3, svint16_t, ++ z0 = svmla_s16_z (p0, z1, z2, z0), ++ z0 = svmla_z (p0, z1, z2, z0)) ++ ++/* ++** mla_s16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** mla 
z0\.h, p0/m, z2\.h, z3\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** mad z0\.h, p0/m, z3\.h, z1\.h ++** | ++** movprfx z0\.h, p0/z, z3\.h ++** mad z0\.h, p0/m, z2\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mla_s16_z_untied, svint16_t, ++ z0 = svmla_s16_z (p0, z1, z2, z3), ++ z0 = svmla_z (p0, z1, z2, z3)) ++ ++/* ++** mla_w0_s16_z_tied1: ++** mov (z[0-9]+\.h), w0 ++** movprfx z0\.h, p0/z, z0\.h ++** mla z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_w0_s16_z_tied1, svint16_t, int16_t, ++ z0 = svmla_n_s16_z (p0, z0, z1, x0), ++ z0 = svmla_z (p0, z0, z1, x0)) ++ ++/* ++** mla_w0_s16_z_tied2: ++** mov (z[0-9]+\.h), w0 ++** movprfx z0\.h, p0/z, z0\.h ++** mad z0\.h, p0/m, \1, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_w0_s16_z_tied2, svint16_t, int16_t, ++ z0 = svmla_n_s16_z (p0, z1, z0, x0), ++ z0 = svmla_z (p0, z1, z0, x0)) ++ ++/* ++** mla_w0_s16_z_untied: ++** mov (z[0-9]+\.h), w0 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** mla z0\.h, p0/m, z2\.h, \1 ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** mad z0\.h, p0/m, \1, z1\.h ++** | ++** movprfx z0\.h, p0/z, \1 ++** mad z0\.h, p0/m, z2\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_w0_s16_z_untied, svint16_t, int16_t, ++ z0 = svmla_n_s16_z (p0, z1, z2, x0), ++ z0 = svmla_z (p0, z1, z2, x0)) ++ ++/* ++** mla_11_s16_z_tied1: ++** mov (z[0-9]+\.h), #11 ++** movprfx z0\.h, p0/z, z0\.h ++** mla z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_s16_z_tied1, svint16_t, ++ z0 = svmla_n_s16_z (p0, z0, z1, 11), ++ z0 = svmla_z (p0, z0, z1, 11)) ++ ++/* ++** mla_11_s16_z_tied2: ++** mov (z[0-9]+\.h), #11 ++** movprfx z0\.h, p0/z, z0\.h ++** mad z0\.h, p0/m, \1, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_s16_z_tied2, svint16_t, ++ z0 = svmla_n_s16_z (p0, z1, z0, 11), ++ z0 = svmla_z (p0, z1, z0, 11)) ++ ++/* ++** mla_11_s16_z_untied: ++** mov (z[0-9]+\.h), #11 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** mla z0\.h, p0/m, z2\.h, \1 ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** mad z0\.h, p0/m, \1, z1\.h ++** | ++** movprfx z0\.h, p0/z, \1 ++** mad z0\.h, p0/m, z2\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_s16_z_untied, svint16_t, ++ z0 = svmla_n_s16_z (p0, z1, z2, 11), ++ z0 = svmla_z (p0, z1, z2, 11)) ++ ++/* ++** mla_s16_x_tied1: ++** mla z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mla_s16_x_tied1, svint16_t, ++ z0 = svmla_s16_x (p0, z0, z1, z2), ++ z0 = svmla_x (p0, z0, z1, z2)) ++ ++/* ++** mla_s16_x_tied2: ++** mad z0\.h, p0/m, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mla_s16_x_tied2, svint16_t, ++ z0 = svmla_s16_x (p0, z1, z0, z2), ++ z0 = svmla_x (p0, z1, z0, z2)) ++ ++/* ++** mla_s16_x_tied3: ++** mad z0\.h, p0/m, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mla_s16_x_tied3, svint16_t, ++ z0 = svmla_s16_x (p0, z1, z2, z0), ++ z0 = svmla_x (p0, z1, z2, z0)) ++ ++/* ++** mla_s16_x_untied: ++** ( ++** movprfx z0, z1 ++** mla z0\.h, p0/m, z2\.h, z3\.h ++** | ++** movprfx z0, z2 ++** mad z0\.h, p0/m, z3\.h, z1\.h ++** | ++** movprfx z0, z3 ++** mad z0\.h, p0/m, z2\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mla_s16_x_untied, svint16_t, ++ z0 = svmla_s16_x (p0, z1, z2, z3), ++ z0 = svmla_x (p0, z1, z2, z3)) ++ ++/* ++** mla_w0_s16_x_tied1: ++** mov (z[0-9]+\.h), w0 ++** mla z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_w0_s16_x_tied1, svint16_t, int16_t, ++ z0 = svmla_n_s16_x (p0, z0, z1, x0), ++ z0 = svmla_x (p0, z0, z1, x0)) ++ ++/* ++** mla_w0_s16_x_tied2: ++** mov (z[0-9]+\.h), w0 ++** mad z0\.h, p0/m, \1, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_w0_s16_x_tied2, 
svint16_t, int16_t, ++ z0 = svmla_n_s16_x (p0, z1, z0, x0), ++ z0 = svmla_x (p0, z1, z0, x0)) ++ ++/* ++** mla_w0_s16_x_untied: ++** mov z0\.h, w0 ++** mad z0\.h, p0/m, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_w0_s16_x_untied, svint16_t, int16_t, ++ z0 = svmla_n_s16_x (p0, z1, z2, x0), ++ z0 = svmla_x (p0, z1, z2, x0)) ++ ++/* ++** mla_11_s16_x_tied1: ++** mov (z[0-9]+\.h), #11 ++** mla z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_s16_x_tied1, svint16_t, ++ z0 = svmla_n_s16_x (p0, z0, z1, 11), ++ z0 = svmla_x (p0, z0, z1, 11)) ++ ++/* ++** mla_11_s16_x_tied2: ++** mov (z[0-9]+\.h), #11 ++** mad z0\.h, p0/m, \1, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_s16_x_tied2, svint16_t, ++ z0 = svmla_n_s16_x (p0, z1, z0, 11), ++ z0 = svmla_x (p0, z1, z0, 11)) ++ ++/* ++** mla_11_s16_x_untied: ++** mov z0\.h, #11 ++** mad z0\.h, p0/m, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_s16_x_untied, svint16_t, ++ z0 = svmla_n_s16_x (p0, z1, z2, 11), ++ z0 = svmla_x (p0, z1, z2, 11)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_s32.c +new file mode 100644 +index 000000000..5e8001a71 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_s32.c +@@ -0,0 +1,321 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mla_s32_m_tied1: ++** mla z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mla_s32_m_tied1, svint32_t, ++ z0 = svmla_s32_m (p0, z0, z1, z2), ++ z0 = svmla_m (p0, z0, z1, z2)) ++ ++/* ++** mla_s32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** mla z0\.s, p0/m, \1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mla_s32_m_tied2, svint32_t, ++ z0 = svmla_s32_m (p0, z1, z0, z2), ++ z0 = svmla_m (p0, z1, z0, z2)) ++ ++/* ++** mla_s32_m_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** mla z0\.s, p0/m, z2\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mla_s32_m_tied3, svint32_t, ++ z0 = svmla_s32_m (p0, z1, z2, z0), ++ z0 = svmla_m (p0, z1, z2, z0)) ++ ++/* ++** mla_s32_m_untied: ++** movprfx z0, z1 ++** mla z0\.s, p0/m, z2\.s, z3\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mla_s32_m_untied, svint32_t, ++ z0 = svmla_s32_m (p0, z1, z2, z3), ++ z0 = svmla_m (p0, z1, z2, z3)) ++ ++/* ++** mla_w0_s32_m_tied1: ++** mov (z[0-9]+\.s), w0 ++** mla z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_w0_s32_m_tied1, svint32_t, int32_t, ++ z0 = svmla_n_s32_m (p0, z0, z1, x0), ++ z0 = svmla_m (p0, z0, z1, x0)) ++ ++/* ++** mla_w0_s32_m_untied: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0, z1 ++** mla z0\.s, p0/m, z2\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_w0_s32_m_untied, svint32_t, int32_t, ++ z0 = svmla_n_s32_m (p0, z1, z2, x0), ++ z0 = svmla_m (p0, z1, z2, x0)) ++ ++/* ++** mla_11_s32_m_tied1: ++** mov (z[0-9]+\.s), #11 ++** mla z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_s32_m_tied1, svint32_t, ++ z0 = svmla_n_s32_m (p0, z0, z1, 11), ++ z0 = svmla_m (p0, z0, z1, 11)) ++ ++/* ++** mla_11_s32_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.s), #11 ++** movprfx z0, z1 ++** mla z0\.s, p0/m, z2\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_s32_m_untied, svint32_t, ++ z0 = svmla_n_s32_m (p0, z1, z2, 11), ++ z0 = svmla_m (p0, z1, z2, 11)) ++ ++/* ++** mla_s32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** mla z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mla_s32_z_tied1, svint32_t, ++ z0 = svmla_s32_z (p0, z0, z1, z2), ++ z0 = svmla_z (p0, z0, z1, z2)) ++ ++/* ++** mla_s32_z_tied2: 
++** movprfx z0\.s, p0/z, z0\.s ++** mad z0\.s, p0/m, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mla_s32_z_tied2, svint32_t, ++ z0 = svmla_s32_z (p0, z1, z0, z2), ++ z0 = svmla_z (p0, z1, z0, z2)) ++ ++/* ++** mla_s32_z_tied3: ++** movprfx z0\.s, p0/z, z0\.s ++** mad z0\.s, p0/m, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mla_s32_z_tied3, svint32_t, ++ z0 = svmla_s32_z (p0, z1, z2, z0), ++ z0 = svmla_z (p0, z1, z2, z0)) ++ ++/* ++** mla_s32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** mla z0\.s, p0/m, z2\.s, z3\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** mad z0\.s, p0/m, z3\.s, z1\.s ++** | ++** movprfx z0\.s, p0/z, z3\.s ++** mad z0\.s, p0/m, z2\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mla_s32_z_untied, svint32_t, ++ z0 = svmla_s32_z (p0, z1, z2, z3), ++ z0 = svmla_z (p0, z1, z2, z3)) ++ ++/* ++** mla_w0_s32_z_tied1: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0\.s, p0/z, z0\.s ++** mla z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_w0_s32_z_tied1, svint32_t, int32_t, ++ z0 = svmla_n_s32_z (p0, z0, z1, x0), ++ z0 = svmla_z (p0, z0, z1, x0)) ++ ++/* ++** mla_w0_s32_z_tied2: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0\.s, p0/z, z0\.s ++** mad z0\.s, p0/m, \1, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_w0_s32_z_tied2, svint32_t, int32_t, ++ z0 = svmla_n_s32_z (p0, z1, z0, x0), ++ z0 = svmla_z (p0, z1, z0, x0)) ++ ++/* ++** mla_w0_s32_z_untied: ++** mov (z[0-9]+\.s), w0 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** mla z0\.s, p0/m, z2\.s, \1 ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** mad z0\.s, p0/m, \1, z1\.s ++** | ++** movprfx z0\.s, p0/z, \1 ++** mad z0\.s, p0/m, z2\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_w0_s32_z_untied, svint32_t, int32_t, ++ z0 = svmla_n_s32_z (p0, z1, z2, x0), ++ z0 = svmla_z (p0, z1, z2, x0)) ++ ++/* ++** mla_11_s32_z_tied1: ++** mov (z[0-9]+\.s), #11 ++** movprfx z0\.s, p0/z, z0\.s ++** mla z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_s32_z_tied1, svint32_t, ++ z0 = svmla_n_s32_z (p0, z0, z1, 11), ++ z0 = svmla_z (p0, z0, z1, 11)) ++ ++/* ++** mla_11_s32_z_tied2: ++** mov (z[0-9]+\.s), #11 ++** movprfx z0\.s, p0/z, z0\.s ++** mad z0\.s, p0/m, \1, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_s32_z_tied2, svint32_t, ++ z0 = svmla_n_s32_z (p0, z1, z0, 11), ++ z0 = svmla_z (p0, z1, z0, 11)) ++ ++/* ++** mla_11_s32_z_untied: ++** mov (z[0-9]+\.s), #11 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** mla z0\.s, p0/m, z2\.s, \1 ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** mad z0\.s, p0/m, \1, z1\.s ++** | ++** movprfx z0\.s, p0/z, \1 ++** mad z0\.s, p0/m, z2\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_s32_z_untied, svint32_t, ++ z0 = svmla_n_s32_z (p0, z1, z2, 11), ++ z0 = svmla_z (p0, z1, z2, 11)) ++ ++/* ++** mla_s32_x_tied1: ++** mla z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mla_s32_x_tied1, svint32_t, ++ z0 = svmla_s32_x (p0, z0, z1, z2), ++ z0 = svmla_x (p0, z0, z1, z2)) ++ ++/* ++** mla_s32_x_tied2: ++** mad z0\.s, p0/m, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mla_s32_x_tied2, svint32_t, ++ z0 = svmla_s32_x (p0, z1, z0, z2), ++ z0 = svmla_x (p0, z1, z0, z2)) ++ ++/* ++** mla_s32_x_tied3: ++** mad z0\.s, p0/m, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mla_s32_x_tied3, svint32_t, ++ z0 = svmla_s32_x (p0, z1, z2, z0), ++ z0 = svmla_x (p0, z1, z2, z0)) ++ ++/* ++** mla_s32_x_untied: ++** ( ++** movprfx z0, z1 ++** mla z0\.s, p0/m, z2\.s, z3\.s ++** | ++** movprfx z0, z2 ++** mad z0\.s, p0/m, z3\.s, z1\.s ++** | ++** movprfx z0, z3 ++** mad z0\.s, p0/m, z2\.s, z1\.s ++** ) ++** ret 
++*/ ++TEST_UNIFORM_Z (mla_s32_x_untied, svint32_t, ++ z0 = svmla_s32_x (p0, z1, z2, z3), ++ z0 = svmla_x (p0, z1, z2, z3)) ++ ++/* ++** mla_w0_s32_x_tied1: ++** mov (z[0-9]+\.s), w0 ++** mla z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_w0_s32_x_tied1, svint32_t, int32_t, ++ z0 = svmla_n_s32_x (p0, z0, z1, x0), ++ z0 = svmla_x (p0, z0, z1, x0)) ++ ++/* ++** mla_w0_s32_x_tied2: ++** mov (z[0-9]+\.s), w0 ++** mad z0\.s, p0/m, \1, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_w0_s32_x_tied2, svint32_t, int32_t, ++ z0 = svmla_n_s32_x (p0, z1, z0, x0), ++ z0 = svmla_x (p0, z1, z0, x0)) ++ ++/* ++** mla_w0_s32_x_untied: ++** mov z0\.s, w0 ++** mad z0\.s, p0/m, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_w0_s32_x_untied, svint32_t, int32_t, ++ z0 = svmla_n_s32_x (p0, z1, z2, x0), ++ z0 = svmla_x (p0, z1, z2, x0)) ++ ++/* ++** mla_11_s32_x_tied1: ++** mov (z[0-9]+\.s), #11 ++** mla z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_s32_x_tied1, svint32_t, ++ z0 = svmla_n_s32_x (p0, z0, z1, 11), ++ z0 = svmla_x (p0, z0, z1, 11)) ++ ++/* ++** mla_11_s32_x_tied2: ++** mov (z[0-9]+\.s), #11 ++** mad z0\.s, p0/m, \1, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_s32_x_tied2, svint32_t, ++ z0 = svmla_n_s32_x (p0, z1, z0, 11), ++ z0 = svmla_x (p0, z1, z0, 11)) ++ ++/* ++** mla_11_s32_x_untied: ++** mov z0\.s, #11 ++** mad z0\.s, p0/m, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_s32_x_untied, svint32_t, ++ z0 = svmla_n_s32_x (p0, z1, z2, 11), ++ z0 = svmla_x (p0, z1, z2, 11)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_s64.c +new file mode 100644 +index 000000000..7b619e521 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_s64.c +@@ -0,0 +1,321 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mla_s64_m_tied1: ++** mla z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mla_s64_m_tied1, svint64_t, ++ z0 = svmla_s64_m (p0, z0, z1, z2), ++ z0 = svmla_m (p0, z0, z1, z2)) ++ ++/* ++** mla_s64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** mla z0\.d, p0/m, \1, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mla_s64_m_tied2, svint64_t, ++ z0 = svmla_s64_m (p0, z1, z0, z2), ++ z0 = svmla_m (p0, z1, z0, z2)) ++ ++/* ++** mla_s64_m_tied3: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** mla z0\.d, p0/m, z2\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mla_s64_m_tied3, svint64_t, ++ z0 = svmla_s64_m (p0, z1, z2, z0), ++ z0 = svmla_m (p0, z1, z2, z0)) ++ ++/* ++** mla_s64_m_untied: ++** movprfx z0, z1 ++** mla z0\.d, p0/m, z2\.d, z3\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mla_s64_m_untied, svint64_t, ++ z0 = svmla_s64_m (p0, z1, z2, z3), ++ z0 = svmla_m (p0, z1, z2, z3)) ++ ++/* ++** mla_x0_s64_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** mla z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_x0_s64_m_tied1, svint64_t, int64_t, ++ z0 = svmla_n_s64_m (p0, z0, z1, x0), ++ z0 = svmla_m (p0, z0, z1, x0)) ++ ++/* ++** mla_x0_s64_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** mla z0\.d, p0/m, z2\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_x0_s64_m_untied, svint64_t, int64_t, ++ z0 = svmla_n_s64_m (p0, z1, z2, x0), ++ z0 = svmla_m (p0, z1, z2, x0)) ++ ++/* ++** mla_11_s64_m_tied1: ++** mov (z[0-9]+\.d), #11 ++** mla z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_s64_m_tied1, svint64_t, ++ z0 = svmla_n_s64_m (p0, z0, z1, 11), ++ z0 = svmla_m (p0, z0, z1, 11)) ++ ++/* ++** 
mla_11_s64_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #11 ++** movprfx z0, z1 ++** mla z0\.d, p0/m, z2\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_s64_m_untied, svint64_t, ++ z0 = svmla_n_s64_m (p0, z1, z2, 11), ++ z0 = svmla_m (p0, z1, z2, 11)) ++ ++/* ++** mla_s64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** mla z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mla_s64_z_tied1, svint64_t, ++ z0 = svmla_s64_z (p0, z0, z1, z2), ++ z0 = svmla_z (p0, z0, z1, z2)) ++ ++/* ++** mla_s64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** mad z0\.d, p0/m, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mla_s64_z_tied2, svint64_t, ++ z0 = svmla_s64_z (p0, z1, z0, z2), ++ z0 = svmla_z (p0, z1, z0, z2)) ++ ++/* ++** mla_s64_z_tied3: ++** movprfx z0\.d, p0/z, z0\.d ++** mad z0\.d, p0/m, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mla_s64_z_tied3, svint64_t, ++ z0 = svmla_s64_z (p0, z1, z2, z0), ++ z0 = svmla_z (p0, z1, z2, z0)) ++ ++/* ++** mla_s64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** mla z0\.d, p0/m, z2\.d, z3\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** mad z0\.d, p0/m, z3\.d, z1\.d ++** | ++** movprfx z0\.d, p0/z, z3\.d ++** mad z0\.d, p0/m, z2\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mla_s64_z_untied, svint64_t, ++ z0 = svmla_s64_z (p0, z1, z2, z3), ++ z0 = svmla_z (p0, z1, z2, z3)) ++ ++/* ++** mla_x0_s64_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z0\.d ++** mla z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_x0_s64_z_tied1, svint64_t, int64_t, ++ z0 = svmla_n_s64_z (p0, z0, z1, x0), ++ z0 = svmla_z (p0, z0, z1, x0)) ++ ++/* ++** mla_x0_s64_z_tied2: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z0\.d ++** mad z0\.d, p0/m, \1, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_x0_s64_z_tied2, svint64_t, int64_t, ++ z0 = svmla_n_s64_z (p0, z1, z0, x0), ++ z0 = svmla_z (p0, z1, z0, x0)) ++ ++/* ++** mla_x0_s64_z_untied: ++** mov (z[0-9]+\.d), x0 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** mla z0\.d, p0/m, z2\.d, \1 ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** mad z0\.d, p0/m, \1, z1\.d ++** | ++** movprfx z0\.d, p0/z, \1 ++** mad z0\.d, p0/m, z2\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_x0_s64_z_untied, svint64_t, int64_t, ++ z0 = svmla_n_s64_z (p0, z1, z2, x0), ++ z0 = svmla_z (p0, z1, z2, x0)) ++ ++/* ++** mla_11_s64_z_tied1: ++** mov (z[0-9]+\.d), #11 ++** movprfx z0\.d, p0/z, z0\.d ++** mla z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_s64_z_tied1, svint64_t, ++ z0 = svmla_n_s64_z (p0, z0, z1, 11), ++ z0 = svmla_z (p0, z0, z1, 11)) ++ ++/* ++** mla_11_s64_z_tied2: ++** mov (z[0-9]+\.d), #11 ++** movprfx z0\.d, p0/z, z0\.d ++** mad z0\.d, p0/m, \1, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_s64_z_tied2, svint64_t, ++ z0 = svmla_n_s64_z (p0, z1, z0, 11), ++ z0 = svmla_z (p0, z1, z0, 11)) ++ ++/* ++** mla_11_s64_z_untied: ++** mov (z[0-9]+\.d), #11 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** mla z0\.d, p0/m, z2\.d, \1 ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** mad z0\.d, p0/m, \1, z1\.d ++** | ++** movprfx z0\.d, p0/z, \1 ++** mad z0\.d, p0/m, z2\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_s64_z_untied, svint64_t, ++ z0 = svmla_n_s64_z (p0, z1, z2, 11), ++ z0 = svmla_z (p0, z1, z2, 11)) ++ ++/* ++** mla_s64_x_tied1: ++** mla z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mla_s64_x_tied1, svint64_t, ++ z0 = svmla_s64_x (p0, z0, z1, z2), ++ z0 = svmla_x (p0, z0, z1, z2)) ++ ++/* ++** mla_s64_x_tied2: ++** mad z0\.d, p0/m, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mla_s64_x_tied2, 
svint64_t, ++ z0 = svmla_s64_x (p0, z1, z0, z2), ++ z0 = svmla_x (p0, z1, z0, z2)) ++ ++/* ++** mla_s64_x_tied3: ++** mad z0\.d, p0/m, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mla_s64_x_tied3, svint64_t, ++ z0 = svmla_s64_x (p0, z1, z2, z0), ++ z0 = svmla_x (p0, z1, z2, z0)) ++ ++/* ++** mla_s64_x_untied: ++** ( ++** movprfx z0, z1 ++** mla z0\.d, p0/m, z2\.d, z3\.d ++** | ++** movprfx z0, z2 ++** mad z0\.d, p0/m, z3\.d, z1\.d ++** | ++** movprfx z0, z3 ++** mad z0\.d, p0/m, z2\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mla_s64_x_untied, svint64_t, ++ z0 = svmla_s64_x (p0, z1, z2, z3), ++ z0 = svmla_x (p0, z1, z2, z3)) ++ ++/* ++** mla_x0_s64_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** mla z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_x0_s64_x_tied1, svint64_t, int64_t, ++ z0 = svmla_n_s64_x (p0, z0, z1, x0), ++ z0 = svmla_x (p0, z0, z1, x0)) ++ ++/* ++** mla_x0_s64_x_tied2: ++** mov (z[0-9]+\.d), x0 ++** mad z0\.d, p0/m, \1, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_x0_s64_x_tied2, svint64_t, int64_t, ++ z0 = svmla_n_s64_x (p0, z1, z0, x0), ++ z0 = svmla_x (p0, z1, z0, x0)) ++ ++/* ++** mla_x0_s64_x_untied: ++** mov z0\.d, x0 ++** mad z0\.d, p0/m, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_x0_s64_x_untied, svint64_t, int64_t, ++ z0 = svmla_n_s64_x (p0, z1, z2, x0), ++ z0 = svmla_x (p0, z1, z2, x0)) ++ ++/* ++** mla_11_s64_x_tied1: ++** mov (z[0-9]+\.d), #11 ++** mla z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_s64_x_tied1, svint64_t, ++ z0 = svmla_n_s64_x (p0, z0, z1, 11), ++ z0 = svmla_x (p0, z0, z1, 11)) ++ ++/* ++** mla_11_s64_x_tied2: ++** mov (z[0-9]+\.d), #11 ++** mad z0\.d, p0/m, \1, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_s64_x_tied2, svint64_t, ++ z0 = svmla_n_s64_x (p0, z1, z0, 11), ++ z0 = svmla_x (p0, z1, z0, 11)) ++ ++/* ++** mla_11_s64_x_untied: ++** mov z0\.d, #11 ++** mad z0\.d, p0/m, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_s64_x_untied, svint64_t, ++ z0 = svmla_n_s64_x (p0, z1, z2, 11), ++ z0 = svmla_x (p0, z1, z2, 11)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_s8.c +new file mode 100644 +index 000000000..47468947d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_s8.c +@@ -0,0 +1,321 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mla_s8_m_tied1: ++** mla z0\.b, p0/m, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mla_s8_m_tied1, svint8_t, ++ z0 = svmla_s8_m (p0, z0, z1, z2), ++ z0 = svmla_m (p0, z0, z1, z2)) ++ ++/* ++** mla_s8_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** mla z0\.b, p0/m, \1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mla_s8_m_tied2, svint8_t, ++ z0 = svmla_s8_m (p0, z1, z0, z2), ++ z0 = svmla_m (p0, z1, z0, z2)) ++ ++/* ++** mla_s8_m_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** mla z0\.b, p0/m, z2\.b, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mla_s8_m_tied3, svint8_t, ++ z0 = svmla_s8_m (p0, z1, z2, z0), ++ z0 = svmla_m (p0, z1, z2, z0)) ++ ++/* ++** mla_s8_m_untied: ++** movprfx z0, z1 ++** mla z0\.b, p0/m, z2\.b, z3\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mla_s8_m_untied, svint8_t, ++ z0 = svmla_s8_m (p0, z1, z2, z3), ++ z0 = svmla_m (p0, z1, z2, z3)) ++ ++/* ++** mla_w0_s8_m_tied1: ++** mov (z[0-9]+\.b), w0 ++** mla z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_w0_s8_m_tied1, svint8_t, int8_t, ++ z0 = svmla_n_s8_m (p0, z0, z1, x0), ++ z0 = svmla_m (p0, z0, z1, x0)) ++ ++/* ++** 
mla_w0_s8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), w0 ++** movprfx z0, z1 ++** mla z0\.b, p0/m, z2\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_w0_s8_m_untied, svint8_t, int8_t, ++ z0 = svmla_n_s8_m (p0, z1, z2, x0), ++ z0 = svmla_m (p0, z1, z2, x0)) ++ ++/* ++** mla_11_s8_m_tied1: ++** mov (z[0-9]+\.b), #11 ++** mla z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_s8_m_tied1, svint8_t, ++ z0 = svmla_n_s8_m (p0, z0, z1, 11), ++ z0 = svmla_m (p0, z0, z1, 11)) ++ ++/* ++** mla_11_s8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), #11 ++** movprfx z0, z1 ++** mla z0\.b, p0/m, z2\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_s8_m_untied, svint8_t, ++ z0 = svmla_n_s8_m (p0, z1, z2, 11), ++ z0 = svmla_m (p0, z1, z2, 11)) ++ ++/* ++** mla_s8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** mla z0\.b, p0/m, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mla_s8_z_tied1, svint8_t, ++ z0 = svmla_s8_z (p0, z0, z1, z2), ++ z0 = svmla_z (p0, z0, z1, z2)) ++ ++/* ++** mla_s8_z_tied2: ++** movprfx z0\.b, p0/z, z0\.b ++** mad z0\.b, p0/m, z2\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mla_s8_z_tied2, svint8_t, ++ z0 = svmla_s8_z (p0, z1, z0, z2), ++ z0 = svmla_z (p0, z1, z0, z2)) ++ ++/* ++** mla_s8_z_tied3: ++** movprfx z0\.b, p0/z, z0\.b ++** mad z0\.b, p0/m, z2\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mla_s8_z_tied3, svint8_t, ++ z0 = svmla_s8_z (p0, z1, z2, z0), ++ z0 = svmla_z (p0, z1, z2, z0)) ++ ++/* ++** mla_s8_z_untied: ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** mla z0\.b, p0/m, z2\.b, z3\.b ++** | ++** movprfx z0\.b, p0/z, z2\.b ++** mad z0\.b, p0/m, z3\.b, z1\.b ++** | ++** movprfx z0\.b, p0/z, z3\.b ++** mad z0\.b, p0/m, z2\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mla_s8_z_untied, svint8_t, ++ z0 = svmla_s8_z (p0, z1, z2, z3), ++ z0 = svmla_z (p0, z1, z2, z3)) ++ ++/* ++** mla_w0_s8_z_tied1: ++** mov (z[0-9]+\.b), w0 ++** movprfx z0\.b, p0/z, z0\.b ++** mla z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_w0_s8_z_tied1, svint8_t, int8_t, ++ z0 = svmla_n_s8_z (p0, z0, z1, x0), ++ z0 = svmla_z (p0, z0, z1, x0)) ++ ++/* ++** mla_w0_s8_z_tied2: ++** mov (z[0-9]+\.b), w0 ++** movprfx z0\.b, p0/z, z0\.b ++** mad z0\.b, p0/m, \1, z1\.b ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_w0_s8_z_tied2, svint8_t, int8_t, ++ z0 = svmla_n_s8_z (p0, z1, z0, x0), ++ z0 = svmla_z (p0, z1, z0, x0)) ++ ++/* ++** mla_w0_s8_z_untied: ++** mov (z[0-9]+\.b), w0 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** mla z0\.b, p0/m, z2\.b, \1 ++** | ++** movprfx z0\.b, p0/z, z2\.b ++** mad z0\.b, p0/m, \1, z1\.b ++** | ++** movprfx z0\.b, p0/z, \1 ++** mad z0\.b, p0/m, z2\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_w0_s8_z_untied, svint8_t, int8_t, ++ z0 = svmla_n_s8_z (p0, z1, z2, x0), ++ z0 = svmla_z (p0, z1, z2, x0)) ++ ++/* ++** mla_11_s8_z_tied1: ++** mov (z[0-9]+\.b), #11 ++** movprfx z0\.b, p0/z, z0\.b ++** mla z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_s8_z_tied1, svint8_t, ++ z0 = svmla_n_s8_z (p0, z0, z1, 11), ++ z0 = svmla_z (p0, z0, z1, 11)) ++ ++/* ++** mla_11_s8_z_tied2: ++** mov (z[0-9]+\.b), #11 ++** movprfx z0\.b, p0/z, z0\.b ++** mad z0\.b, p0/m, \1, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_s8_z_tied2, svint8_t, ++ z0 = svmla_n_s8_z (p0, z1, z0, 11), ++ z0 = svmla_z (p0, z1, z0, 11)) ++ ++/* ++** mla_11_s8_z_untied: ++** mov (z[0-9]+\.b), #11 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** mla z0\.b, p0/m, z2\.b, \1 ++** | ++** movprfx z0\.b, p0/z, z2\.b ++** mad z0\.b, p0/m, \1, z1\.b ++** | ++** movprfx z0\.b, p0/z, \1 ++** mad z0\.b, p0/m, z2\.b, z1\.b ++** 
) ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_s8_z_untied, svint8_t, ++ z0 = svmla_n_s8_z (p0, z1, z2, 11), ++ z0 = svmla_z (p0, z1, z2, 11)) ++ ++/* ++** mla_s8_x_tied1: ++** mla z0\.b, p0/m, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mla_s8_x_tied1, svint8_t, ++ z0 = svmla_s8_x (p0, z0, z1, z2), ++ z0 = svmla_x (p0, z0, z1, z2)) ++ ++/* ++** mla_s8_x_tied2: ++** mad z0\.b, p0/m, z2\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mla_s8_x_tied2, svint8_t, ++ z0 = svmla_s8_x (p0, z1, z0, z2), ++ z0 = svmla_x (p0, z1, z0, z2)) ++ ++/* ++** mla_s8_x_tied3: ++** mad z0\.b, p0/m, z2\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mla_s8_x_tied3, svint8_t, ++ z0 = svmla_s8_x (p0, z1, z2, z0), ++ z0 = svmla_x (p0, z1, z2, z0)) ++ ++/* ++** mla_s8_x_untied: ++** ( ++** movprfx z0, z1 ++** mla z0\.b, p0/m, z2\.b, z3\.b ++** | ++** movprfx z0, z2 ++** mad z0\.b, p0/m, z3\.b, z1\.b ++** | ++** movprfx z0, z3 ++** mad z0\.b, p0/m, z2\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mla_s8_x_untied, svint8_t, ++ z0 = svmla_s8_x (p0, z1, z2, z3), ++ z0 = svmla_x (p0, z1, z2, z3)) ++ ++/* ++** mla_w0_s8_x_tied1: ++** mov (z[0-9]+\.b), w0 ++** mla z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_w0_s8_x_tied1, svint8_t, int8_t, ++ z0 = svmla_n_s8_x (p0, z0, z1, x0), ++ z0 = svmla_x (p0, z0, z1, x0)) ++ ++/* ++** mla_w0_s8_x_tied2: ++** mov (z[0-9]+\.b), w0 ++** mad z0\.b, p0/m, \1, z1\.b ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_w0_s8_x_tied2, svint8_t, int8_t, ++ z0 = svmla_n_s8_x (p0, z1, z0, x0), ++ z0 = svmla_x (p0, z1, z0, x0)) ++ ++/* ++** mla_w0_s8_x_untied: ++** mov z0\.b, w0 ++** mad z0\.b, p0/m, z2\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_w0_s8_x_untied, svint8_t, int8_t, ++ z0 = svmla_n_s8_x (p0, z1, z2, x0), ++ z0 = svmla_x (p0, z1, z2, x0)) ++ ++/* ++** mla_11_s8_x_tied1: ++** mov (z[0-9]+\.b), #11 ++** mla z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_s8_x_tied1, svint8_t, ++ z0 = svmla_n_s8_x (p0, z0, z1, 11), ++ z0 = svmla_x (p0, z0, z1, 11)) ++ ++/* ++** mla_11_s8_x_tied2: ++** mov (z[0-9]+\.b), #11 ++** mad z0\.b, p0/m, \1, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_s8_x_tied2, svint8_t, ++ z0 = svmla_n_s8_x (p0, z1, z0, 11), ++ z0 = svmla_x (p0, z1, z0, 11)) ++ ++/* ++** mla_11_s8_x_untied: ++** mov z0\.b, #11 ++** mad z0\.b, p0/m, z2\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_s8_x_untied, svint8_t, ++ z0 = svmla_n_s8_x (p0, z1, z2, 11), ++ z0 = svmla_x (p0, z1, z2, 11)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_u16.c +new file mode 100644 +index 000000000..7238e428f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_u16.c +@@ -0,0 +1,321 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mla_u16_m_tied1: ++** mla z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mla_u16_m_tied1, svuint16_t, ++ z0 = svmla_u16_m (p0, z0, z1, z2), ++ z0 = svmla_m (p0, z0, z1, z2)) ++ ++/* ++** mla_u16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** mla z0\.h, p0/m, \1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mla_u16_m_tied2, svuint16_t, ++ z0 = svmla_u16_m (p0, z1, z0, z2), ++ z0 = svmla_m (p0, z1, z0, z2)) ++ ++/* ++** mla_u16_m_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** mla z0\.h, p0/m, z2\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mla_u16_m_tied3, svuint16_t, ++ z0 = svmla_u16_m (p0, z1, z2, z0), ++ z0 = svmla_m (p0, z1, z2, z0)) ++ ++/* ++** mla_u16_m_untied: ++** movprfx z0, 
z1 ++** mla z0\.h, p0/m, z2\.h, z3\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mla_u16_m_untied, svuint16_t, ++ z0 = svmla_u16_m (p0, z1, z2, z3), ++ z0 = svmla_m (p0, z1, z2, z3)) ++ ++/* ++** mla_w0_u16_m_tied1: ++** mov (z[0-9]+\.h), w0 ++** mla z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_w0_u16_m_tied1, svuint16_t, uint16_t, ++ z0 = svmla_n_u16_m (p0, z0, z1, x0), ++ z0 = svmla_m (p0, z0, z1, x0)) ++ ++/* ++** mla_w0_u16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), w0 ++** movprfx z0, z1 ++** mla z0\.h, p0/m, z2\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_w0_u16_m_untied, svuint16_t, uint16_t, ++ z0 = svmla_n_u16_m (p0, z1, z2, x0), ++ z0 = svmla_m (p0, z1, z2, x0)) ++ ++/* ++** mla_11_u16_m_tied1: ++** mov (z[0-9]+\.h), #11 ++** mla z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_u16_m_tied1, svuint16_t, ++ z0 = svmla_n_u16_m (p0, z0, z1, 11), ++ z0 = svmla_m (p0, z0, z1, 11)) ++ ++/* ++** mla_11_u16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), #11 ++** movprfx z0, z1 ++** mla z0\.h, p0/m, z2\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_u16_m_untied, svuint16_t, ++ z0 = svmla_n_u16_m (p0, z1, z2, 11), ++ z0 = svmla_m (p0, z1, z2, 11)) ++ ++/* ++** mla_u16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** mla z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mla_u16_z_tied1, svuint16_t, ++ z0 = svmla_u16_z (p0, z0, z1, z2), ++ z0 = svmla_z (p0, z0, z1, z2)) ++ ++/* ++** mla_u16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** mad z0\.h, p0/m, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mla_u16_z_tied2, svuint16_t, ++ z0 = svmla_u16_z (p0, z1, z0, z2), ++ z0 = svmla_z (p0, z1, z0, z2)) ++ ++/* ++** mla_u16_z_tied3: ++** movprfx z0\.h, p0/z, z0\.h ++** mad z0\.h, p0/m, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mla_u16_z_tied3, svuint16_t, ++ z0 = svmla_u16_z (p0, z1, z2, z0), ++ z0 = svmla_z (p0, z1, z2, z0)) ++ ++/* ++** mla_u16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** mla z0\.h, p0/m, z2\.h, z3\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** mad z0\.h, p0/m, z3\.h, z1\.h ++** | ++** movprfx z0\.h, p0/z, z3\.h ++** mad z0\.h, p0/m, z2\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mla_u16_z_untied, svuint16_t, ++ z0 = svmla_u16_z (p0, z1, z2, z3), ++ z0 = svmla_z (p0, z1, z2, z3)) ++ ++/* ++** mla_w0_u16_z_tied1: ++** mov (z[0-9]+\.h), w0 ++** movprfx z0\.h, p0/z, z0\.h ++** mla z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_w0_u16_z_tied1, svuint16_t, uint16_t, ++ z0 = svmla_n_u16_z (p0, z0, z1, x0), ++ z0 = svmla_z (p0, z0, z1, x0)) ++ ++/* ++** mla_w0_u16_z_tied2: ++** mov (z[0-9]+\.h), w0 ++** movprfx z0\.h, p0/z, z0\.h ++** mad z0\.h, p0/m, \1, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_w0_u16_z_tied2, svuint16_t, uint16_t, ++ z0 = svmla_n_u16_z (p0, z1, z0, x0), ++ z0 = svmla_z (p0, z1, z0, x0)) ++ ++/* ++** mla_w0_u16_z_untied: ++** mov (z[0-9]+\.h), w0 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** mla z0\.h, p0/m, z2\.h, \1 ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** mad z0\.h, p0/m, \1, z1\.h ++** | ++** movprfx z0\.h, p0/z, \1 ++** mad z0\.h, p0/m, z2\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_w0_u16_z_untied, svuint16_t, uint16_t, ++ z0 = svmla_n_u16_z (p0, z1, z2, x0), ++ z0 = svmla_z (p0, z1, z2, x0)) ++ ++/* ++** mla_11_u16_z_tied1: ++** mov (z[0-9]+\.h), #11 ++** movprfx z0\.h, p0/z, z0\.h ++** mla z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_u16_z_tied1, svuint16_t, ++ z0 = svmla_n_u16_z (p0, z0, z1, 11), ++ z0 = svmla_z (p0, z0, z1, 11)) ++ ++/* ++** mla_11_u16_z_tied2: ++** mov 
(z[0-9]+\.h), #11 ++** movprfx z0\.h, p0/z, z0\.h ++** mad z0\.h, p0/m, \1, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_u16_z_tied2, svuint16_t, ++ z0 = svmla_n_u16_z (p0, z1, z0, 11), ++ z0 = svmla_z (p0, z1, z0, 11)) ++ ++/* ++** mla_11_u16_z_untied: ++** mov (z[0-9]+\.h), #11 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** mla z0\.h, p0/m, z2\.h, \1 ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** mad z0\.h, p0/m, \1, z1\.h ++** | ++** movprfx z0\.h, p0/z, \1 ++** mad z0\.h, p0/m, z2\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_u16_z_untied, svuint16_t, ++ z0 = svmla_n_u16_z (p0, z1, z2, 11), ++ z0 = svmla_z (p0, z1, z2, 11)) ++ ++/* ++** mla_u16_x_tied1: ++** mla z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mla_u16_x_tied1, svuint16_t, ++ z0 = svmla_u16_x (p0, z0, z1, z2), ++ z0 = svmla_x (p0, z0, z1, z2)) ++ ++/* ++** mla_u16_x_tied2: ++** mad z0\.h, p0/m, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mla_u16_x_tied2, svuint16_t, ++ z0 = svmla_u16_x (p0, z1, z0, z2), ++ z0 = svmla_x (p0, z1, z0, z2)) ++ ++/* ++** mla_u16_x_tied3: ++** mad z0\.h, p0/m, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mla_u16_x_tied3, svuint16_t, ++ z0 = svmla_u16_x (p0, z1, z2, z0), ++ z0 = svmla_x (p0, z1, z2, z0)) ++ ++/* ++** mla_u16_x_untied: ++** ( ++** movprfx z0, z1 ++** mla z0\.h, p0/m, z2\.h, z3\.h ++** | ++** movprfx z0, z2 ++** mad z0\.h, p0/m, z3\.h, z1\.h ++** | ++** movprfx z0, z3 ++** mad z0\.h, p0/m, z2\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mla_u16_x_untied, svuint16_t, ++ z0 = svmla_u16_x (p0, z1, z2, z3), ++ z0 = svmla_x (p0, z1, z2, z3)) ++ ++/* ++** mla_w0_u16_x_tied1: ++** mov (z[0-9]+\.h), w0 ++** mla z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_w0_u16_x_tied1, svuint16_t, uint16_t, ++ z0 = svmla_n_u16_x (p0, z0, z1, x0), ++ z0 = svmla_x (p0, z0, z1, x0)) ++ ++/* ++** mla_w0_u16_x_tied2: ++** mov (z[0-9]+\.h), w0 ++** mad z0\.h, p0/m, \1, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_w0_u16_x_tied2, svuint16_t, uint16_t, ++ z0 = svmla_n_u16_x (p0, z1, z0, x0), ++ z0 = svmla_x (p0, z1, z0, x0)) ++ ++/* ++** mla_w0_u16_x_untied: ++** mov z0\.h, w0 ++** mad z0\.h, p0/m, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_w0_u16_x_untied, svuint16_t, uint16_t, ++ z0 = svmla_n_u16_x (p0, z1, z2, x0), ++ z0 = svmla_x (p0, z1, z2, x0)) ++ ++/* ++** mla_11_u16_x_tied1: ++** mov (z[0-9]+\.h), #11 ++** mla z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_u16_x_tied1, svuint16_t, ++ z0 = svmla_n_u16_x (p0, z0, z1, 11), ++ z0 = svmla_x (p0, z0, z1, 11)) ++ ++/* ++** mla_11_u16_x_tied2: ++** mov (z[0-9]+\.h), #11 ++** mad z0\.h, p0/m, \1, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_u16_x_tied2, svuint16_t, ++ z0 = svmla_n_u16_x (p0, z1, z0, 11), ++ z0 = svmla_x (p0, z1, z0, 11)) ++ ++/* ++** mla_11_u16_x_untied: ++** mov z0\.h, #11 ++** mad z0\.h, p0/m, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_u16_x_untied, svuint16_t, ++ z0 = svmla_n_u16_x (p0, z1, z2, 11), ++ z0 = svmla_x (p0, z1, z2, 11)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_u32.c +new file mode 100644 +index 000000000..7a68bce3d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_u32.c +@@ -0,0 +1,321 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mla_u32_m_tied1: ++** mla z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mla_u32_m_tied1, svuint32_t, ++ z0 = svmla_u32_m (p0, z0, z1, z2), ++ z0 = svmla_m 
(p0, z0, z1, z2)) ++ ++/* ++** mla_u32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** mla z0\.s, p0/m, \1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mla_u32_m_tied2, svuint32_t, ++ z0 = svmla_u32_m (p0, z1, z0, z2), ++ z0 = svmla_m (p0, z1, z0, z2)) ++ ++/* ++** mla_u32_m_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** mla z0\.s, p0/m, z2\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mla_u32_m_tied3, svuint32_t, ++ z0 = svmla_u32_m (p0, z1, z2, z0), ++ z0 = svmla_m (p0, z1, z2, z0)) ++ ++/* ++** mla_u32_m_untied: ++** movprfx z0, z1 ++** mla z0\.s, p0/m, z2\.s, z3\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mla_u32_m_untied, svuint32_t, ++ z0 = svmla_u32_m (p0, z1, z2, z3), ++ z0 = svmla_m (p0, z1, z2, z3)) ++ ++/* ++** mla_w0_u32_m_tied1: ++** mov (z[0-9]+\.s), w0 ++** mla z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_w0_u32_m_tied1, svuint32_t, uint32_t, ++ z0 = svmla_n_u32_m (p0, z0, z1, x0), ++ z0 = svmla_m (p0, z0, z1, x0)) ++ ++/* ++** mla_w0_u32_m_untied: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0, z1 ++** mla z0\.s, p0/m, z2\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_w0_u32_m_untied, svuint32_t, uint32_t, ++ z0 = svmla_n_u32_m (p0, z1, z2, x0), ++ z0 = svmla_m (p0, z1, z2, x0)) ++ ++/* ++** mla_11_u32_m_tied1: ++** mov (z[0-9]+\.s), #11 ++** mla z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_u32_m_tied1, svuint32_t, ++ z0 = svmla_n_u32_m (p0, z0, z1, 11), ++ z0 = svmla_m (p0, z0, z1, 11)) ++ ++/* ++** mla_11_u32_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.s), #11 ++** movprfx z0, z1 ++** mla z0\.s, p0/m, z2\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_u32_m_untied, svuint32_t, ++ z0 = svmla_n_u32_m (p0, z1, z2, 11), ++ z0 = svmla_m (p0, z1, z2, 11)) ++ ++/* ++** mla_u32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** mla z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mla_u32_z_tied1, svuint32_t, ++ z0 = svmla_u32_z (p0, z0, z1, z2), ++ z0 = svmla_z (p0, z0, z1, z2)) ++ ++/* ++** mla_u32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** mad z0\.s, p0/m, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mla_u32_z_tied2, svuint32_t, ++ z0 = svmla_u32_z (p0, z1, z0, z2), ++ z0 = svmla_z (p0, z1, z0, z2)) ++ ++/* ++** mla_u32_z_tied3: ++** movprfx z0\.s, p0/z, z0\.s ++** mad z0\.s, p0/m, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mla_u32_z_tied3, svuint32_t, ++ z0 = svmla_u32_z (p0, z1, z2, z0), ++ z0 = svmla_z (p0, z1, z2, z0)) ++ ++/* ++** mla_u32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** mla z0\.s, p0/m, z2\.s, z3\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** mad z0\.s, p0/m, z3\.s, z1\.s ++** | ++** movprfx z0\.s, p0/z, z3\.s ++** mad z0\.s, p0/m, z2\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mla_u32_z_untied, svuint32_t, ++ z0 = svmla_u32_z (p0, z1, z2, z3), ++ z0 = svmla_z (p0, z1, z2, z3)) ++ ++/* ++** mla_w0_u32_z_tied1: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0\.s, p0/z, z0\.s ++** mla z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_w0_u32_z_tied1, svuint32_t, uint32_t, ++ z0 = svmla_n_u32_z (p0, z0, z1, x0), ++ z0 = svmla_z (p0, z0, z1, x0)) ++ ++/* ++** mla_w0_u32_z_tied2: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0\.s, p0/z, z0\.s ++** mad z0\.s, p0/m, \1, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_w0_u32_z_tied2, svuint32_t, uint32_t, ++ z0 = svmla_n_u32_z (p0, z1, z0, x0), ++ z0 = svmla_z (p0, z1, z0, x0)) ++ ++/* ++** mla_w0_u32_z_untied: ++** mov (z[0-9]+\.s), w0 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** mla z0\.s, p0/m, z2\.s, \1 ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** mad z0\.s, p0/m, 
\1, z1\.s ++** | ++** movprfx z0\.s, p0/z, \1 ++** mad z0\.s, p0/m, z2\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_w0_u32_z_untied, svuint32_t, uint32_t, ++ z0 = svmla_n_u32_z (p0, z1, z2, x0), ++ z0 = svmla_z (p0, z1, z2, x0)) ++ ++/* ++** mla_11_u32_z_tied1: ++** mov (z[0-9]+\.s), #11 ++** movprfx z0\.s, p0/z, z0\.s ++** mla z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_u32_z_tied1, svuint32_t, ++ z0 = svmla_n_u32_z (p0, z0, z1, 11), ++ z0 = svmla_z (p0, z0, z1, 11)) ++ ++/* ++** mla_11_u32_z_tied2: ++** mov (z[0-9]+\.s), #11 ++** movprfx z0\.s, p0/z, z0\.s ++** mad z0\.s, p0/m, \1, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_u32_z_tied2, svuint32_t, ++ z0 = svmla_n_u32_z (p0, z1, z0, 11), ++ z0 = svmla_z (p0, z1, z0, 11)) ++ ++/* ++** mla_11_u32_z_untied: ++** mov (z[0-9]+\.s), #11 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** mla z0\.s, p0/m, z2\.s, \1 ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** mad z0\.s, p0/m, \1, z1\.s ++** | ++** movprfx z0\.s, p0/z, \1 ++** mad z0\.s, p0/m, z2\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_u32_z_untied, svuint32_t, ++ z0 = svmla_n_u32_z (p0, z1, z2, 11), ++ z0 = svmla_z (p0, z1, z2, 11)) ++ ++/* ++** mla_u32_x_tied1: ++** mla z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mla_u32_x_tied1, svuint32_t, ++ z0 = svmla_u32_x (p0, z0, z1, z2), ++ z0 = svmla_x (p0, z0, z1, z2)) ++ ++/* ++** mla_u32_x_tied2: ++** mad z0\.s, p0/m, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mla_u32_x_tied2, svuint32_t, ++ z0 = svmla_u32_x (p0, z1, z0, z2), ++ z0 = svmla_x (p0, z1, z0, z2)) ++ ++/* ++** mla_u32_x_tied3: ++** mad z0\.s, p0/m, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mla_u32_x_tied3, svuint32_t, ++ z0 = svmla_u32_x (p0, z1, z2, z0), ++ z0 = svmla_x (p0, z1, z2, z0)) ++ ++/* ++** mla_u32_x_untied: ++** ( ++** movprfx z0, z1 ++** mla z0\.s, p0/m, z2\.s, z3\.s ++** | ++** movprfx z0, z2 ++** mad z0\.s, p0/m, z3\.s, z1\.s ++** | ++** movprfx z0, z3 ++** mad z0\.s, p0/m, z2\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mla_u32_x_untied, svuint32_t, ++ z0 = svmla_u32_x (p0, z1, z2, z3), ++ z0 = svmla_x (p0, z1, z2, z3)) ++ ++/* ++** mla_w0_u32_x_tied1: ++** mov (z[0-9]+\.s), w0 ++** mla z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_w0_u32_x_tied1, svuint32_t, uint32_t, ++ z0 = svmla_n_u32_x (p0, z0, z1, x0), ++ z0 = svmla_x (p0, z0, z1, x0)) ++ ++/* ++** mla_w0_u32_x_tied2: ++** mov (z[0-9]+\.s), w0 ++** mad z0\.s, p0/m, \1, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_w0_u32_x_tied2, svuint32_t, uint32_t, ++ z0 = svmla_n_u32_x (p0, z1, z0, x0), ++ z0 = svmla_x (p0, z1, z0, x0)) ++ ++/* ++** mla_w0_u32_x_untied: ++** mov z0\.s, w0 ++** mad z0\.s, p0/m, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_w0_u32_x_untied, svuint32_t, uint32_t, ++ z0 = svmla_n_u32_x (p0, z1, z2, x0), ++ z0 = svmla_x (p0, z1, z2, x0)) ++ ++/* ++** mla_11_u32_x_tied1: ++** mov (z[0-9]+\.s), #11 ++** mla z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_u32_x_tied1, svuint32_t, ++ z0 = svmla_n_u32_x (p0, z0, z1, 11), ++ z0 = svmla_x (p0, z0, z1, 11)) ++ ++/* ++** mla_11_u32_x_tied2: ++** mov (z[0-9]+\.s), #11 ++** mad z0\.s, p0/m, \1, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_u32_x_tied2, svuint32_t, ++ z0 = svmla_n_u32_x (p0, z1, z0, 11), ++ z0 = svmla_x (p0, z1, z0, 11)) ++ ++/* ++** mla_11_u32_x_untied: ++** mov z0\.s, #11 ++** mad z0\.s, p0/m, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_u32_x_untied, svuint32_t, ++ z0 = svmla_n_u32_x (p0, z1, z2, 11), ++ z0 = svmla_x (p0, z1, z2, 11)) +diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_u64.c +new file mode 100644 +index 000000000..6233265c8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_u64.c +@@ -0,0 +1,321 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mla_u64_m_tied1: ++** mla z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mla_u64_m_tied1, svuint64_t, ++ z0 = svmla_u64_m (p0, z0, z1, z2), ++ z0 = svmla_m (p0, z0, z1, z2)) ++ ++/* ++** mla_u64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** mla z0\.d, p0/m, \1, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mla_u64_m_tied2, svuint64_t, ++ z0 = svmla_u64_m (p0, z1, z0, z2), ++ z0 = svmla_m (p0, z1, z0, z2)) ++ ++/* ++** mla_u64_m_tied3: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** mla z0\.d, p0/m, z2\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mla_u64_m_tied3, svuint64_t, ++ z0 = svmla_u64_m (p0, z1, z2, z0), ++ z0 = svmla_m (p0, z1, z2, z0)) ++ ++/* ++** mla_u64_m_untied: ++** movprfx z0, z1 ++** mla z0\.d, p0/m, z2\.d, z3\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mla_u64_m_untied, svuint64_t, ++ z0 = svmla_u64_m (p0, z1, z2, z3), ++ z0 = svmla_m (p0, z1, z2, z3)) ++ ++/* ++** mla_x0_u64_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** mla z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_x0_u64_m_tied1, svuint64_t, uint64_t, ++ z0 = svmla_n_u64_m (p0, z0, z1, x0), ++ z0 = svmla_m (p0, z0, z1, x0)) ++ ++/* ++** mla_x0_u64_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** mla z0\.d, p0/m, z2\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_x0_u64_m_untied, svuint64_t, uint64_t, ++ z0 = svmla_n_u64_m (p0, z1, z2, x0), ++ z0 = svmla_m (p0, z1, z2, x0)) ++ ++/* ++** mla_11_u64_m_tied1: ++** mov (z[0-9]+\.d), #11 ++** mla z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_u64_m_tied1, svuint64_t, ++ z0 = svmla_n_u64_m (p0, z0, z1, 11), ++ z0 = svmla_m (p0, z0, z1, 11)) ++ ++/* ++** mla_11_u64_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #11 ++** movprfx z0, z1 ++** mla z0\.d, p0/m, z2\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_u64_m_untied, svuint64_t, ++ z0 = svmla_n_u64_m (p0, z1, z2, 11), ++ z0 = svmla_m (p0, z1, z2, 11)) ++ ++/* ++** mla_u64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** mla z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mla_u64_z_tied1, svuint64_t, ++ z0 = svmla_u64_z (p0, z0, z1, z2), ++ z0 = svmla_z (p0, z0, z1, z2)) ++ ++/* ++** mla_u64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** mad z0\.d, p0/m, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mla_u64_z_tied2, svuint64_t, ++ z0 = svmla_u64_z (p0, z1, z0, z2), ++ z0 = svmla_z (p0, z1, z0, z2)) ++ ++/* ++** mla_u64_z_tied3: ++** movprfx z0\.d, p0/z, z0\.d ++** mad z0\.d, p0/m, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mla_u64_z_tied3, svuint64_t, ++ z0 = svmla_u64_z (p0, z1, z2, z0), ++ z0 = svmla_z (p0, z1, z2, z0)) ++ ++/* ++** mla_u64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** mla z0\.d, p0/m, z2\.d, z3\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** mad z0\.d, p0/m, z3\.d, z1\.d ++** | ++** movprfx z0\.d, p0/z, z3\.d ++** mad z0\.d, p0/m, z2\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mla_u64_z_untied, svuint64_t, ++ z0 = svmla_u64_z (p0, z1, z2, z3), ++ z0 = svmla_z (p0, z1, z2, z3)) ++ ++/* ++** mla_x0_u64_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z0\.d ++** mla z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_x0_u64_z_tied1, svuint64_t, uint64_t, ++ 
z0 = svmla_n_u64_z (p0, z0, z1, x0), ++ z0 = svmla_z (p0, z0, z1, x0)) ++ ++/* ++** mla_x0_u64_z_tied2: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z0\.d ++** mad z0\.d, p0/m, \1, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_x0_u64_z_tied2, svuint64_t, uint64_t, ++ z0 = svmla_n_u64_z (p0, z1, z0, x0), ++ z0 = svmla_z (p0, z1, z0, x0)) ++ ++/* ++** mla_x0_u64_z_untied: ++** mov (z[0-9]+\.d), x0 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** mla z0\.d, p0/m, z2\.d, \1 ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** mad z0\.d, p0/m, \1, z1\.d ++** | ++** movprfx z0\.d, p0/z, \1 ++** mad z0\.d, p0/m, z2\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_x0_u64_z_untied, svuint64_t, uint64_t, ++ z0 = svmla_n_u64_z (p0, z1, z2, x0), ++ z0 = svmla_z (p0, z1, z2, x0)) ++ ++/* ++** mla_11_u64_z_tied1: ++** mov (z[0-9]+\.d), #11 ++** movprfx z0\.d, p0/z, z0\.d ++** mla z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_u64_z_tied1, svuint64_t, ++ z0 = svmla_n_u64_z (p0, z0, z1, 11), ++ z0 = svmla_z (p0, z0, z1, 11)) ++ ++/* ++** mla_11_u64_z_tied2: ++** mov (z[0-9]+\.d), #11 ++** movprfx z0\.d, p0/z, z0\.d ++** mad z0\.d, p0/m, \1, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_u64_z_tied2, svuint64_t, ++ z0 = svmla_n_u64_z (p0, z1, z0, 11), ++ z0 = svmla_z (p0, z1, z0, 11)) ++ ++/* ++** mla_11_u64_z_untied: ++** mov (z[0-9]+\.d), #11 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** mla z0\.d, p0/m, z2\.d, \1 ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** mad z0\.d, p0/m, \1, z1\.d ++** | ++** movprfx z0\.d, p0/z, \1 ++** mad z0\.d, p0/m, z2\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_u64_z_untied, svuint64_t, ++ z0 = svmla_n_u64_z (p0, z1, z2, 11), ++ z0 = svmla_z (p0, z1, z2, 11)) ++ ++/* ++** mla_u64_x_tied1: ++** mla z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mla_u64_x_tied1, svuint64_t, ++ z0 = svmla_u64_x (p0, z0, z1, z2), ++ z0 = svmla_x (p0, z0, z1, z2)) ++ ++/* ++** mla_u64_x_tied2: ++** mad z0\.d, p0/m, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mla_u64_x_tied2, svuint64_t, ++ z0 = svmla_u64_x (p0, z1, z0, z2), ++ z0 = svmla_x (p0, z1, z0, z2)) ++ ++/* ++** mla_u64_x_tied3: ++** mad z0\.d, p0/m, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mla_u64_x_tied3, svuint64_t, ++ z0 = svmla_u64_x (p0, z1, z2, z0), ++ z0 = svmla_x (p0, z1, z2, z0)) ++ ++/* ++** mla_u64_x_untied: ++** ( ++** movprfx z0, z1 ++** mla z0\.d, p0/m, z2\.d, z3\.d ++** | ++** movprfx z0, z2 ++** mad z0\.d, p0/m, z3\.d, z1\.d ++** | ++** movprfx z0, z3 ++** mad z0\.d, p0/m, z2\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mla_u64_x_untied, svuint64_t, ++ z0 = svmla_u64_x (p0, z1, z2, z3), ++ z0 = svmla_x (p0, z1, z2, z3)) ++ ++/* ++** mla_x0_u64_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** mla z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_x0_u64_x_tied1, svuint64_t, uint64_t, ++ z0 = svmla_n_u64_x (p0, z0, z1, x0), ++ z0 = svmla_x (p0, z0, z1, x0)) ++ ++/* ++** mla_x0_u64_x_tied2: ++** mov (z[0-9]+\.d), x0 ++** mad z0\.d, p0/m, \1, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_x0_u64_x_tied2, svuint64_t, uint64_t, ++ z0 = svmla_n_u64_x (p0, z1, z0, x0), ++ z0 = svmla_x (p0, z1, z0, x0)) ++ ++/* ++** mla_x0_u64_x_untied: ++** mov z0\.d, x0 ++** mad z0\.d, p0/m, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_x0_u64_x_untied, svuint64_t, uint64_t, ++ z0 = svmla_n_u64_x (p0, z1, z2, x0), ++ z0 = svmla_x (p0, z1, z2, x0)) ++ ++/* ++** mla_11_u64_x_tied1: ++** mov (z[0-9]+\.d), #11 ++** mla z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_u64_x_tied1, svuint64_t, ++ z0 = 
svmla_n_u64_x (p0, z0, z1, 11), ++ z0 = svmla_x (p0, z0, z1, 11)) ++ ++/* ++** mla_11_u64_x_tied2: ++** mov (z[0-9]+\.d), #11 ++** mad z0\.d, p0/m, \1, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_u64_x_tied2, svuint64_t, ++ z0 = svmla_n_u64_x (p0, z1, z0, 11), ++ z0 = svmla_x (p0, z1, z0, 11)) ++ ++/* ++** mla_11_u64_x_untied: ++** mov z0\.d, #11 ++** mad z0\.d, p0/m, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_u64_x_untied, svuint64_t, ++ z0 = svmla_n_u64_x (p0, z1, z2, 11), ++ z0 = svmla_x (p0, z1, z2, 11)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_u8.c +new file mode 100644 +index 000000000..832ed4141 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_u8.c +@@ -0,0 +1,321 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mla_u8_m_tied1: ++** mla z0\.b, p0/m, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mla_u8_m_tied1, svuint8_t, ++ z0 = svmla_u8_m (p0, z0, z1, z2), ++ z0 = svmla_m (p0, z0, z1, z2)) ++ ++/* ++** mla_u8_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** mla z0\.b, p0/m, \1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mla_u8_m_tied2, svuint8_t, ++ z0 = svmla_u8_m (p0, z1, z0, z2), ++ z0 = svmla_m (p0, z1, z0, z2)) ++ ++/* ++** mla_u8_m_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** mla z0\.b, p0/m, z2\.b, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mla_u8_m_tied3, svuint8_t, ++ z0 = svmla_u8_m (p0, z1, z2, z0), ++ z0 = svmla_m (p0, z1, z2, z0)) ++ ++/* ++** mla_u8_m_untied: ++** movprfx z0, z1 ++** mla z0\.b, p0/m, z2\.b, z3\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mla_u8_m_untied, svuint8_t, ++ z0 = svmla_u8_m (p0, z1, z2, z3), ++ z0 = svmla_m (p0, z1, z2, z3)) ++ ++/* ++** mla_w0_u8_m_tied1: ++** mov (z[0-9]+\.b), w0 ++** mla z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_w0_u8_m_tied1, svuint8_t, uint8_t, ++ z0 = svmla_n_u8_m (p0, z0, z1, x0), ++ z0 = svmla_m (p0, z0, z1, x0)) ++ ++/* ++** mla_w0_u8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), w0 ++** movprfx z0, z1 ++** mla z0\.b, p0/m, z2\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_w0_u8_m_untied, svuint8_t, uint8_t, ++ z0 = svmla_n_u8_m (p0, z1, z2, x0), ++ z0 = svmla_m (p0, z1, z2, x0)) ++ ++/* ++** mla_11_u8_m_tied1: ++** mov (z[0-9]+\.b), #11 ++** mla z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_u8_m_tied1, svuint8_t, ++ z0 = svmla_n_u8_m (p0, z0, z1, 11), ++ z0 = svmla_m (p0, z0, z1, 11)) ++ ++/* ++** mla_11_u8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), #11 ++** movprfx z0, z1 ++** mla z0\.b, p0/m, z2\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_u8_m_untied, svuint8_t, ++ z0 = svmla_n_u8_m (p0, z1, z2, 11), ++ z0 = svmla_m (p0, z1, z2, 11)) ++ ++/* ++** mla_u8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** mla z0\.b, p0/m, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mla_u8_z_tied1, svuint8_t, ++ z0 = svmla_u8_z (p0, z0, z1, z2), ++ z0 = svmla_z (p0, z0, z1, z2)) ++ ++/* ++** mla_u8_z_tied2: ++** movprfx z0\.b, p0/z, z0\.b ++** mad z0\.b, p0/m, z2\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mla_u8_z_tied2, svuint8_t, ++ z0 = svmla_u8_z (p0, z1, z0, z2), ++ z0 = svmla_z (p0, z1, z0, z2)) ++ ++/* ++** mla_u8_z_tied3: ++** movprfx z0\.b, p0/z, z0\.b ++** mad z0\.b, p0/m, z2\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mla_u8_z_tied3, svuint8_t, ++ z0 = svmla_u8_z (p0, z1, z2, z0), ++ z0 = svmla_z (p0, z1, z2, z0)) ++ ++/* ++** mla_u8_z_untied: ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** mla z0\.b, 
p0/m, z2\.b, z3\.b ++** | ++** movprfx z0\.b, p0/z, z2\.b ++** mad z0\.b, p0/m, z3\.b, z1\.b ++** | ++** movprfx z0\.b, p0/z, z3\.b ++** mad z0\.b, p0/m, z2\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mla_u8_z_untied, svuint8_t, ++ z0 = svmla_u8_z (p0, z1, z2, z3), ++ z0 = svmla_z (p0, z1, z2, z3)) ++ ++/* ++** mla_w0_u8_z_tied1: ++** mov (z[0-9]+\.b), w0 ++** movprfx z0\.b, p0/z, z0\.b ++** mla z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_w0_u8_z_tied1, svuint8_t, uint8_t, ++ z0 = svmla_n_u8_z (p0, z0, z1, x0), ++ z0 = svmla_z (p0, z0, z1, x0)) ++ ++/* ++** mla_w0_u8_z_tied2: ++** mov (z[0-9]+\.b), w0 ++** movprfx z0\.b, p0/z, z0\.b ++** mad z0\.b, p0/m, \1, z1\.b ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_w0_u8_z_tied2, svuint8_t, uint8_t, ++ z0 = svmla_n_u8_z (p0, z1, z0, x0), ++ z0 = svmla_z (p0, z1, z0, x0)) ++ ++/* ++** mla_w0_u8_z_untied: ++** mov (z[0-9]+\.b), w0 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** mla z0\.b, p0/m, z2\.b, \1 ++** | ++** movprfx z0\.b, p0/z, z2\.b ++** mad z0\.b, p0/m, \1, z1\.b ++** | ++** movprfx z0\.b, p0/z, \1 ++** mad z0\.b, p0/m, z2\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_w0_u8_z_untied, svuint8_t, uint8_t, ++ z0 = svmla_n_u8_z (p0, z1, z2, x0), ++ z0 = svmla_z (p0, z1, z2, x0)) ++ ++/* ++** mla_11_u8_z_tied1: ++** mov (z[0-9]+\.b), #11 ++** movprfx z0\.b, p0/z, z0\.b ++** mla z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_u8_z_tied1, svuint8_t, ++ z0 = svmla_n_u8_z (p0, z0, z1, 11), ++ z0 = svmla_z (p0, z0, z1, 11)) ++ ++/* ++** mla_11_u8_z_tied2: ++** mov (z[0-9]+\.b), #11 ++** movprfx z0\.b, p0/z, z0\.b ++** mad z0\.b, p0/m, \1, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_u8_z_tied2, svuint8_t, ++ z0 = svmla_n_u8_z (p0, z1, z0, 11), ++ z0 = svmla_z (p0, z1, z0, 11)) ++ ++/* ++** mla_11_u8_z_untied: ++** mov (z[0-9]+\.b), #11 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** mla z0\.b, p0/m, z2\.b, \1 ++** | ++** movprfx z0\.b, p0/z, z2\.b ++** mad z0\.b, p0/m, \1, z1\.b ++** | ++** movprfx z0\.b, p0/z, \1 ++** mad z0\.b, p0/m, z2\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_u8_z_untied, svuint8_t, ++ z0 = svmla_n_u8_z (p0, z1, z2, 11), ++ z0 = svmla_z (p0, z1, z2, 11)) ++ ++/* ++** mla_u8_x_tied1: ++** mla z0\.b, p0/m, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mla_u8_x_tied1, svuint8_t, ++ z0 = svmla_u8_x (p0, z0, z1, z2), ++ z0 = svmla_x (p0, z0, z1, z2)) ++ ++/* ++** mla_u8_x_tied2: ++** mad z0\.b, p0/m, z2\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mla_u8_x_tied2, svuint8_t, ++ z0 = svmla_u8_x (p0, z1, z0, z2), ++ z0 = svmla_x (p0, z1, z0, z2)) ++ ++/* ++** mla_u8_x_tied3: ++** mad z0\.b, p0/m, z2\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mla_u8_x_tied3, svuint8_t, ++ z0 = svmla_u8_x (p0, z1, z2, z0), ++ z0 = svmla_x (p0, z1, z2, z0)) ++ ++/* ++** mla_u8_x_untied: ++** ( ++** movprfx z0, z1 ++** mla z0\.b, p0/m, z2\.b, z3\.b ++** | ++** movprfx z0, z2 ++** mad z0\.b, p0/m, z3\.b, z1\.b ++** | ++** movprfx z0, z3 ++** mad z0\.b, p0/m, z2\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mla_u8_x_untied, svuint8_t, ++ z0 = svmla_u8_x (p0, z1, z2, z3), ++ z0 = svmla_x (p0, z1, z2, z3)) ++ ++/* ++** mla_w0_u8_x_tied1: ++** mov (z[0-9]+\.b), w0 ++** mla z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_w0_u8_x_tied1, svuint8_t, uint8_t, ++ z0 = svmla_n_u8_x (p0, z0, z1, x0), ++ z0 = svmla_x (p0, z0, z1, x0)) ++ ++/* ++** mla_w0_u8_x_tied2: ++** mov (z[0-9]+\.b), w0 ++** mad z0\.b, p0/m, \1, z1\.b ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_w0_u8_x_tied2, svuint8_t, uint8_t, ++ z0 = svmla_n_u8_x (p0, 
z1, z0, x0), ++ z0 = svmla_x (p0, z1, z0, x0)) ++ ++/* ++** mla_w0_u8_x_untied: ++** mov z0\.b, w0 ++** mad z0\.b, p0/m, z2\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_ZX (mla_w0_u8_x_untied, svuint8_t, uint8_t, ++ z0 = svmla_n_u8_x (p0, z1, z2, x0), ++ z0 = svmla_x (p0, z1, z2, x0)) ++ ++/* ++** mla_11_u8_x_tied1: ++** mov (z[0-9]+\.b), #11 ++** mla z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_u8_x_tied1, svuint8_t, ++ z0 = svmla_n_u8_x (p0, z0, z1, 11), ++ z0 = svmla_x (p0, z0, z1, 11)) ++ ++/* ++** mla_11_u8_x_tied2: ++** mov (z[0-9]+\.b), #11 ++** mad z0\.b, p0/m, \1, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_u8_x_tied2, svuint8_t, ++ z0 = svmla_n_u8_x (p0, z1, z0, 11), ++ z0 = svmla_x (p0, z1, z0, 11)) ++ ++/* ++** mla_11_u8_x_untied: ++** mov z0\.b, #11 ++** mad z0\.b, p0/m, z2\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mla_11_u8_x_untied, svuint8_t, ++ z0 = svmla_n_u8_x (p0, z1, z2, 11), ++ z0 = svmla_x (p0, z1, z2, 11)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_f16.c +new file mode 100644 +index 000000000..87fba3da7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_f16.c +@@ -0,0 +1,398 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mls_f16_m_tied1: ++** fmls z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mls_f16_m_tied1, svfloat16_t, ++ z0 = svmls_f16_m (p0, z0, z1, z2), ++ z0 = svmls_m (p0, z0, z1, z2)) ++ ++/* ++** mls_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fmls z0\.h, p0/m, \1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mls_f16_m_tied2, svfloat16_t, ++ z0 = svmls_f16_m (p0, z1, z0, z2), ++ z0 = svmls_m (p0, z1, z0, z2)) ++ ++/* ++** mls_f16_m_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fmls z0\.h, p0/m, z2\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mls_f16_m_tied3, svfloat16_t, ++ z0 = svmls_f16_m (p0, z1, z2, z0), ++ z0 = svmls_m (p0, z1, z2, z0)) ++ ++/* ++** mls_f16_m_untied: ++** movprfx z0, z1 ++** fmls z0\.h, p0/m, z2\.h, z3\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mls_f16_m_untied, svfloat16_t, ++ z0 = svmls_f16_m (p0, z1, z2, z3), ++ z0 = svmls_m (p0, z1, z2, z3)) ++ ++/* ++** mls_h4_f16_m_tied1: ++** mov (z[0-9]+\.h), h4 ++** fmls z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mls_h4_f16_m_tied1, svfloat16_t, __fp16, ++ z0 = svmls_n_f16_m (p0, z0, z1, d4), ++ z0 = svmls_m (p0, z0, z1, d4)) ++ ++/* ++** mls_h4_f16_m_untied: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0, z1 ++** fmls z0\.h, p0/m, z2\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mls_h4_f16_m_untied, svfloat16_t, __fp16, ++ z0 = svmls_n_f16_m (p0, z1, z2, d4), ++ z0 = svmls_m (p0, z1, z2, d4)) ++ ++/* ++** mls_2_f16_m_tied1: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** fmls z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mls_2_f16_m_tied1, svfloat16_t, ++ z0 = svmls_n_f16_m (p0, z0, z1, 2), ++ z0 = svmls_m (p0, z0, z1, 2)) ++ ++/* ++** mls_2_f16_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? 
++** movprfx z0, z1 ++** fmls z0\.h, p0/m, z2\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mls_2_f16_m_untied, svfloat16_t, ++ z0 = svmls_n_f16_m (p0, z1, z2, 2), ++ z0 = svmls_m (p0, z1, z2, 2)) ++ ++/* ++** mls_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fmls z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mls_f16_z_tied1, svfloat16_t, ++ z0 = svmls_f16_z (p0, z0, z1, z2), ++ z0 = svmls_z (p0, z0, z1, z2)) ++ ++/* ++** mls_f16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** fmsb z0\.h, p0/m, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mls_f16_z_tied2, svfloat16_t, ++ z0 = svmls_f16_z (p0, z1, z0, z2), ++ z0 = svmls_z (p0, z1, z0, z2)) ++ ++/* ++** mls_f16_z_tied3: ++** movprfx z0\.h, p0/z, z0\.h ++** fmsb z0\.h, p0/m, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mls_f16_z_tied3, svfloat16_t, ++ z0 = svmls_f16_z (p0, z1, z2, z0), ++ z0 = svmls_z (p0, z1, z2, z0)) ++ ++/* ++** mls_f16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fmls z0\.h, p0/m, z2\.h, z3\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** fmsb z0\.h, p0/m, z3\.h, z1\.h ++** | ++** movprfx z0\.h, p0/z, z3\.h ++** fmsb z0\.h, p0/m, z2\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mls_f16_z_untied, svfloat16_t, ++ z0 = svmls_f16_z (p0, z1, z2, z3), ++ z0 = svmls_z (p0, z1, z2, z3)) ++ ++/* ++** mls_h4_f16_z_tied1: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0\.h, p0/z, z0\.h ++** fmls z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mls_h4_f16_z_tied1, svfloat16_t, __fp16, ++ z0 = svmls_n_f16_z (p0, z0, z1, d4), ++ z0 = svmls_z (p0, z0, z1, d4)) ++ ++/* ++** mls_h4_f16_z_tied2: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0\.h, p0/z, z0\.h ++** fmsb z0\.h, p0/m, \1, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZD (mls_h4_f16_z_tied2, svfloat16_t, __fp16, ++ z0 = svmls_n_f16_z (p0, z1, z0, d4), ++ z0 = svmls_z (p0, z1, z0, d4)) ++ ++/* ++** mls_h4_f16_z_untied: ++** mov (z[0-9]+\.h), h4 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fmls z0\.h, p0/m, z2\.h, \1 ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** fmsb z0\.h, p0/m, \1, z1\.h ++** | ++** movprfx z0\.h, p0/z, \1 ++** fmsb z0\.h, p0/m, z2\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (mls_h4_f16_z_untied, svfloat16_t, __fp16, ++ z0 = svmls_n_f16_z (p0, z1, z2, d4), ++ z0 = svmls_z (p0, z1, z2, d4)) ++ ++/* ++** mls_2_f16_z_tied1: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** movprfx z0\.h, p0/z, z0\.h ++** fmls z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mls_2_f16_z_tied1, svfloat16_t, ++ z0 = svmls_n_f16_z (p0, z0, z1, 2), ++ z0 = svmls_z (p0, z0, z1, 2)) ++ ++/* ++** mls_2_f16_z_tied2: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** movprfx z0\.h, p0/z, z0\.h ++** fmsb z0\.h, p0/m, \1, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mls_2_f16_z_tied2, svfloat16_t, ++ z0 = svmls_n_f16_z (p0, z1, z0, 2), ++ z0 = svmls_z (p0, z1, z0, 2)) ++ ++/* ++** mls_2_f16_z_untied: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? 
++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fmls z0\.h, p0/m, z2\.h, \1 ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** fmsb z0\.h, p0/m, \1, z1\.h ++** | ++** movprfx z0\.h, p0/z, \1 ++** fmsb z0\.h, p0/m, z2\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mls_2_f16_z_untied, svfloat16_t, ++ z0 = svmls_n_f16_z (p0, z1, z2, 2), ++ z0 = svmls_z (p0, z1, z2, 2)) ++ ++/* ++** mls_f16_x_tied1: ++** fmls z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mls_f16_x_tied1, svfloat16_t, ++ z0 = svmls_f16_x (p0, z0, z1, z2), ++ z0 = svmls_x (p0, z0, z1, z2)) ++ ++/* ++** mls_f16_x_tied2: ++** fmsb z0\.h, p0/m, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mls_f16_x_tied2, svfloat16_t, ++ z0 = svmls_f16_x (p0, z1, z0, z2), ++ z0 = svmls_x (p0, z1, z0, z2)) ++ ++/* ++** mls_f16_x_tied3: ++** fmsb z0\.h, p0/m, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mls_f16_x_tied3, svfloat16_t, ++ z0 = svmls_f16_x (p0, z1, z2, z0), ++ z0 = svmls_x (p0, z1, z2, z0)) ++ ++/* ++** mls_f16_x_untied: ++** ( ++** movprfx z0, z1 ++** fmls z0\.h, p0/m, z2\.h, z3\.h ++** | ++** movprfx z0, z2 ++** fmsb z0\.h, p0/m, z3\.h, z1\.h ++** | ++** movprfx z0, z3 ++** fmsb z0\.h, p0/m, z2\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mls_f16_x_untied, svfloat16_t, ++ z0 = svmls_f16_x (p0, z1, z2, z3), ++ z0 = svmls_x (p0, z1, z2, z3)) ++ ++/* ++** mls_h4_f16_x_tied1: ++** mov (z[0-9]+\.h), h4 ++** fmls z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mls_h4_f16_x_tied1, svfloat16_t, __fp16, ++ z0 = svmls_n_f16_x (p0, z0, z1, d4), ++ z0 = svmls_x (p0, z0, z1, d4)) ++ ++/* ++** mls_h4_f16_x_tied2: ++** mov (z[0-9]+\.h), h4 ++** fmsb z0\.h, p0/m, \1, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZD (mls_h4_f16_x_tied2, svfloat16_t, __fp16, ++ z0 = svmls_n_f16_x (p0, z1, z0, d4), ++ z0 = svmls_x (p0, z1, z0, d4)) ++ ++/* ++** mls_h4_f16_x_untied: { xfail *-*-* } ++** mov z0\.h, h4 ++** fmsb z0\.h, p0/m, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZD (mls_h4_f16_x_untied, svfloat16_t, __fp16, ++ z0 = svmls_n_f16_x (p0, z1, z2, d4), ++ z0 = svmls_x (p0, z1, z2, d4)) ++ ++/* ++** mls_2_f16_x_tied1: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** fmls z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mls_2_f16_x_tied1, svfloat16_t, ++ z0 = svmls_n_f16_x (p0, z0, z1, 2), ++ z0 = svmls_x (p0, z0, z1, 2)) ++ ++/* ++** mls_2_f16_x_tied2: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** fmsb z0\.h, p0/m, \1, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mls_2_f16_x_tied2, svfloat16_t, ++ z0 = svmls_n_f16_x (p0, z1, z0, 2), ++ z0 = svmls_x (p0, z1, z0, 2)) ++ ++/* ++** mls_2_f16_x_untied: ++** fmov z0\.h, #2\.0(?:e\+0)? ++** fmsb z0\.h, p0/m, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mls_2_f16_x_untied, svfloat16_t, ++ z0 = svmls_n_f16_x (p0, z1, z2, 2), ++ z0 = svmls_x (p0, z1, z2, 2)) ++ ++/* ++** ptrue_mls_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mls_f16_x_tied1, svfloat16_t, ++ z0 = svmls_f16_x (svptrue_b16 (), z0, z1, z2), ++ z0 = svmls_x (svptrue_b16 (), z0, z1, z2)) ++ ++/* ++** ptrue_mls_f16_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mls_f16_x_tied2, svfloat16_t, ++ z0 = svmls_f16_x (svptrue_b16 (), z1, z0, z2), ++ z0 = svmls_x (svptrue_b16 (), z1, z0, z2)) ++ ++/* ++** ptrue_mls_f16_x_tied3: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mls_f16_x_tied3, svfloat16_t, ++ z0 = svmls_f16_x (svptrue_b16 (), z1, z2, z0), ++ z0 = svmls_x (svptrue_b16 (), z1, z2, z0)) ++ ++/* ++** ptrue_mls_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mls_f16_x_untied, svfloat16_t, ++ z0 = svmls_f16_x (svptrue_b16 (), z1, z2, z3), ++ z0 = svmls_x (svptrue_b16 (), z1, z2, z3)) ++ ++/* ++** ptrue_mls_2_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mls_2_f16_x_tied1, svfloat16_t, ++ z0 = svmls_n_f16_x (svptrue_b16 (), z0, z1, 2), ++ z0 = svmls_x (svptrue_b16 (), z0, z1, 2)) ++ ++/* ++** ptrue_mls_2_f16_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mls_2_f16_x_tied2, svfloat16_t, ++ z0 = svmls_n_f16_x (svptrue_b16 (), z1, z0, 2), ++ z0 = svmls_x (svptrue_b16 (), z1, z0, 2)) ++ ++/* ++** ptrue_mls_2_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mls_2_f16_x_untied, svfloat16_t, ++ z0 = svmls_n_f16_x (svptrue_b16 (), z1, z2, 2), ++ z0 = svmls_x (svptrue_b16 (), z1, z2, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_f32.c +new file mode 100644 +index 000000000..04ce1ec46 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_f32.c +@@ -0,0 +1,398 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mls_f32_m_tied1: ++** fmls z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mls_f32_m_tied1, svfloat32_t, ++ z0 = svmls_f32_m (p0, z0, z1, z2), ++ z0 = svmls_m (p0, z0, z1, z2)) ++ ++/* ++** mls_f32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fmls z0\.s, p0/m, \1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mls_f32_m_tied2, svfloat32_t, ++ z0 = svmls_f32_m (p0, z1, z0, z2), ++ z0 = svmls_m (p0, z1, z0, z2)) ++ ++/* ++** mls_f32_m_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fmls z0\.s, p0/m, z2\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mls_f32_m_tied3, svfloat32_t, ++ z0 = svmls_f32_m (p0, z1, z2, z0), ++ z0 = svmls_m (p0, z1, z2, z0)) ++ ++/* ++** mls_f32_m_untied: ++** movprfx z0, z1 ++** fmls z0\.s, p0/m, z2\.s, z3\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mls_f32_m_untied, svfloat32_t, ++ z0 = svmls_f32_m (p0, z1, z2, z3), ++ z0 = svmls_m (p0, z1, z2, z3)) ++ ++/* ++** mls_s4_f32_m_tied1: ++** mov (z[0-9]+\.s), s4 ++** fmls z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mls_s4_f32_m_tied1, svfloat32_t, float, ++ z0 = svmls_n_f32_m (p0, z0, z1, d4), ++ z0 = svmls_m (p0, z0, z1, d4)) ++ ++/* ++** mls_s4_f32_m_untied: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0, z1 ++** fmls z0\.s, p0/m, z2\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mls_s4_f32_m_untied, svfloat32_t, float, ++ z0 = svmls_n_f32_m (p0, z1, z2, d4), ++ z0 = svmls_m (p0, z1, z2, d4)) ++ ++/* ++** mls_2_f32_m_tied1: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** fmls z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mls_2_f32_m_tied1, svfloat32_t, ++ z0 = svmls_n_f32_m (p0, z0, z1, 2), ++ z0 = svmls_m (p0, z0, z1, 2)) ++ ++/* ++** mls_2_f32_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
++** movprfx z0, z1 ++** fmls z0\.s, p0/m, z2\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mls_2_f32_m_untied, svfloat32_t, ++ z0 = svmls_n_f32_m (p0, z1, z2, 2), ++ z0 = svmls_m (p0, z1, z2, 2)) ++ ++/* ++** mls_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fmls z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mls_f32_z_tied1, svfloat32_t, ++ z0 = svmls_f32_z (p0, z0, z1, z2), ++ z0 = svmls_z (p0, z0, z1, z2)) ++ ++/* ++** mls_f32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** fmsb z0\.s, p0/m, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mls_f32_z_tied2, svfloat32_t, ++ z0 = svmls_f32_z (p0, z1, z0, z2), ++ z0 = svmls_z (p0, z1, z0, z2)) ++ ++/* ++** mls_f32_z_tied3: ++** movprfx z0\.s, p0/z, z0\.s ++** fmsb z0\.s, p0/m, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mls_f32_z_tied3, svfloat32_t, ++ z0 = svmls_f32_z (p0, z1, z2, z0), ++ z0 = svmls_z (p0, z1, z2, z0)) ++ ++/* ++** mls_f32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fmls z0\.s, p0/m, z2\.s, z3\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** fmsb z0\.s, p0/m, z3\.s, z1\.s ++** | ++** movprfx z0\.s, p0/z, z3\.s ++** fmsb z0\.s, p0/m, z2\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mls_f32_z_untied, svfloat32_t, ++ z0 = svmls_f32_z (p0, z1, z2, z3), ++ z0 = svmls_z (p0, z1, z2, z3)) ++ ++/* ++** mls_s4_f32_z_tied1: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0\.s, p0/z, z0\.s ++** fmls z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mls_s4_f32_z_tied1, svfloat32_t, float, ++ z0 = svmls_n_f32_z (p0, z0, z1, d4), ++ z0 = svmls_z (p0, z0, z1, d4)) ++ ++/* ++** mls_s4_f32_z_tied2: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0\.s, p0/z, z0\.s ++** fmsb z0\.s, p0/m, \1, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZD (mls_s4_f32_z_tied2, svfloat32_t, float, ++ z0 = svmls_n_f32_z (p0, z1, z0, d4), ++ z0 = svmls_z (p0, z1, z0, d4)) ++ ++/* ++** mls_s4_f32_z_untied: ++** mov (z[0-9]+\.s), s4 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fmls z0\.s, p0/m, z2\.s, \1 ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** fmsb z0\.s, p0/m, \1, z1\.s ++** | ++** movprfx z0\.s, p0/z, \1 ++** fmsb z0\.s, p0/m, z2\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (mls_s4_f32_z_untied, svfloat32_t, float, ++ z0 = svmls_n_f32_z (p0, z1, z2, d4), ++ z0 = svmls_z (p0, z1, z2, d4)) ++ ++/* ++** mls_2_f32_z_tied1: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** movprfx z0\.s, p0/z, z0\.s ++** fmls z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mls_2_f32_z_tied1, svfloat32_t, ++ z0 = svmls_n_f32_z (p0, z0, z1, 2), ++ z0 = svmls_z (p0, z0, z1, 2)) ++ ++/* ++** mls_2_f32_z_tied2: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** movprfx z0\.s, p0/z, z0\.s ++** fmsb z0\.s, p0/m, \1, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mls_2_f32_z_tied2, svfloat32_t, ++ z0 = svmls_n_f32_z (p0, z1, z0, 2), ++ z0 = svmls_z (p0, z1, z0, 2)) ++ ++/* ++** mls_2_f32_z_untied: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fmls z0\.s, p0/m, z2\.s, \1 ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** fmsb z0\.s, p0/m, \1, z1\.s ++** | ++** movprfx z0\.s, p0/z, \1 ++** fmsb z0\.s, p0/m, z2\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mls_2_f32_z_untied, svfloat32_t, ++ z0 = svmls_n_f32_z (p0, z1, z2, 2), ++ z0 = svmls_z (p0, z1, z2, 2)) ++ ++/* ++** mls_f32_x_tied1: ++** fmls z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mls_f32_x_tied1, svfloat32_t, ++ z0 = svmls_f32_x (p0, z0, z1, z2), ++ z0 = svmls_x (p0, z0, z1, z2)) ++ ++/* ++** mls_f32_x_tied2: ++** fmsb z0\.s, p0/m, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mls_f32_x_tied2, svfloat32_t, ++ z0 = svmls_f32_x (p0, z1, z0, z2), ++ z0 = svmls_x (p0, z1, z0, z2)) ++ ++/* ++** mls_f32_x_tied3: ++** fmsb z0\.s, p0/m, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mls_f32_x_tied3, svfloat32_t, ++ z0 = svmls_f32_x (p0, z1, z2, z0), ++ z0 = svmls_x (p0, z1, z2, z0)) ++ ++/* ++** mls_f32_x_untied: ++** ( ++** movprfx z0, z1 ++** fmls z0\.s, p0/m, z2\.s, z3\.s ++** | ++** movprfx z0, z2 ++** fmsb z0\.s, p0/m, z3\.s, z1\.s ++** | ++** movprfx z0, z3 ++** fmsb z0\.s, p0/m, z2\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mls_f32_x_untied, svfloat32_t, ++ z0 = svmls_f32_x (p0, z1, z2, z3), ++ z0 = svmls_x (p0, z1, z2, z3)) ++ ++/* ++** mls_s4_f32_x_tied1: ++** mov (z[0-9]+\.s), s4 ++** fmls z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mls_s4_f32_x_tied1, svfloat32_t, float, ++ z0 = svmls_n_f32_x (p0, z0, z1, d4), ++ z0 = svmls_x (p0, z0, z1, d4)) ++ ++/* ++** mls_s4_f32_x_tied2: ++** mov (z[0-9]+\.s), s4 ++** fmsb z0\.s, p0/m, \1, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZD (mls_s4_f32_x_tied2, svfloat32_t, float, ++ z0 = svmls_n_f32_x (p0, z1, z0, d4), ++ z0 = svmls_x (p0, z1, z0, d4)) ++ ++/* ++** mls_s4_f32_x_untied: { xfail *-*-* } ++** mov z0\.s, s4 ++** fmsb z0\.s, p0/m, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZD (mls_s4_f32_x_untied, svfloat32_t, float, ++ z0 = svmls_n_f32_x (p0, z1, z2, d4), ++ z0 = svmls_x (p0, z1, z2, d4)) ++ ++/* ++** mls_2_f32_x_tied1: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** fmls z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mls_2_f32_x_tied1, svfloat32_t, ++ z0 = svmls_n_f32_x (p0, z0, z1, 2), ++ z0 = svmls_x (p0, z0, z1, 2)) ++ ++/* ++** mls_2_f32_x_tied2: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** fmsb z0\.s, p0/m, \1, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mls_2_f32_x_tied2, svfloat32_t, ++ z0 = svmls_n_f32_x (p0, z1, z0, 2), ++ z0 = svmls_x (p0, z1, z0, 2)) ++ ++/* ++** mls_2_f32_x_untied: ++** fmov z0\.s, #2\.0(?:e\+0)? ++** fmsb z0\.s, p0/m, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mls_2_f32_x_untied, svfloat32_t, ++ z0 = svmls_n_f32_x (p0, z1, z2, 2), ++ z0 = svmls_x (p0, z1, z2, 2)) ++ ++/* ++** ptrue_mls_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mls_f32_x_tied1, svfloat32_t, ++ z0 = svmls_f32_x (svptrue_b32 (), z0, z1, z2), ++ z0 = svmls_x (svptrue_b32 (), z0, z1, z2)) ++ ++/* ++** ptrue_mls_f32_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mls_f32_x_tied2, svfloat32_t, ++ z0 = svmls_f32_x (svptrue_b32 (), z1, z0, z2), ++ z0 = svmls_x (svptrue_b32 (), z1, z0, z2)) ++ ++/* ++** ptrue_mls_f32_x_tied3: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mls_f32_x_tied3, svfloat32_t, ++ z0 = svmls_f32_x (svptrue_b32 (), z1, z2, z0), ++ z0 = svmls_x (svptrue_b32 (), z1, z2, z0)) ++ ++/* ++** ptrue_mls_f32_x_untied: ++** ... 
++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mls_f32_x_untied, svfloat32_t, ++ z0 = svmls_f32_x (svptrue_b32 (), z1, z2, z3), ++ z0 = svmls_x (svptrue_b32 (), z1, z2, z3)) ++ ++/* ++** ptrue_mls_2_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mls_2_f32_x_tied1, svfloat32_t, ++ z0 = svmls_n_f32_x (svptrue_b32 (), z0, z1, 2), ++ z0 = svmls_x (svptrue_b32 (), z0, z1, 2)) ++ ++/* ++** ptrue_mls_2_f32_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mls_2_f32_x_tied2, svfloat32_t, ++ z0 = svmls_n_f32_x (svptrue_b32 (), z1, z0, 2), ++ z0 = svmls_x (svptrue_b32 (), z1, z0, 2)) ++ ++/* ++** ptrue_mls_2_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mls_2_f32_x_untied, svfloat32_t, ++ z0 = svmls_n_f32_x (svptrue_b32 (), z1, z2, 2), ++ z0 = svmls_x (svptrue_b32 (), z1, z2, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_f64.c +new file mode 100644 +index 000000000..1e2108af6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_f64.c +@@ -0,0 +1,398 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mls_f64_m_tied1: ++** fmls z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mls_f64_m_tied1, svfloat64_t, ++ z0 = svmls_f64_m (p0, z0, z1, z2), ++ z0 = svmls_m (p0, z0, z1, z2)) ++ ++/* ++** mls_f64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fmls z0\.d, p0/m, \1, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mls_f64_m_tied2, svfloat64_t, ++ z0 = svmls_f64_m (p0, z1, z0, z2), ++ z0 = svmls_m (p0, z1, z0, z2)) ++ ++/* ++** mls_f64_m_tied3: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fmls z0\.d, p0/m, z2\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mls_f64_m_tied3, svfloat64_t, ++ z0 = svmls_f64_m (p0, z1, z2, z0), ++ z0 = svmls_m (p0, z1, z2, z0)) ++ ++/* ++** mls_f64_m_untied: ++** movprfx z0, z1 ++** fmls z0\.d, p0/m, z2\.d, z3\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mls_f64_m_untied, svfloat64_t, ++ z0 = svmls_f64_m (p0, z1, z2, z3), ++ z0 = svmls_m (p0, z1, z2, z3)) ++ ++/* ++** mls_d4_f64_m_tied1: ++** mov (z[0-9]+\.d), d4 ++** fmls z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mls_d4_f64_m_tied1, svfloat64_t, double, ++ z0 = svmls_n_f64_m (p0, z0, z1, d4), ++ z0 = svmls_m (p0, z0, z1, d4)) ++ ++/* ++** mls_d4_f64_m_untied: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0, z1 ++** fmls z0\.d, p0/m, z2\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mls_d4_f64_m_untied, svfloat64_t, double, ++ z0 = svmls_n_f64_m (p0, z1, z2, d4), ++ z0 = svmls_m (p0, z1, z2, d4)) ++ ++/* ++** mls_2_f64_m_tied1: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** fmls z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mls_2_f64_m_tied1, svfloat64_t, ++ z0 = svmls_n_f64_m (p0, z0, z1, 2), ++ z0 = svmls_m (p0, z0, z1, 2)) ++ ++/* ++** mls_2_f64_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
++** movprfx z0, z1 ++** fmls z0\.d, p0/m, z2\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mls_2_f64_m_untied, svfloat64_t, ++ z0 = svmls_n_f64_m (p0, z1, z2, 2), ++ z0 = svmls_m (p0, z1, z2, 2)) ++ ++/* ++** mls_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fmls z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mls_f64_z_tied1, svfloat64_t, ++ z0 = svmls_f64_z (p0, z0, z1, z2), ++ z0 = svmls_z (p0, z0, z1, z2)) ++ ++/* ++** mls_f64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** fmsb z0\.d, p0/m, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mls_f64_z_tied2, svfloat64_t, ++ z0 = svmls_f64_z (p0, z1, z0, z2), ++ z0 = svmls_z (p0, z1, z0, z2)) ++ ++/* ++** mls_f64_z_tied3: ++** movprfx z0\.d, p0/z, z0\.d ++** fmsb z0\.d, p0/m, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mls_f64_z_tied3, svfloat64_t, ++ z0 = svmls_f64_z (p0, z1, z2, z0), ++ z0 = svmls_z (p0, z1, z2, z0)) ++ ++/* ++** mls_f64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fmls z0\.d, p0/m, z2\.d, z3\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** fmsb z0\.d, p0/m, z3\.d, z1\.d ++** | ++** movprfx z0\.d, p0/z, z3\.d ++** fmsb z0\.d, p0/m, z2\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mls_f64_z_untied, svfloat64_t, ++ z0 = svmls_f64_z (p0, z1, z2, z3), ++ z0 = svmls_z (p0, z1, z2, z3)) ++ ++/* ++** mls_d4_f64_z_tied1: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0\.d, p0/z, z0\.d ++** fmls z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mls_d4_f64_z_tied1, svfloat64_t, double, ++ z0 = svmls_n_f64_z (p0, z0, z1, d4), ++ z0 = svmls_z (p0, z0, z1, d4)) ++ ++/* ++** mls_d4_f64_z_tied2: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0\.d, p0/z, z0\.d ++** fmsb z0\.d, p0/m, \1, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZD (mls_d4_f64_z_tied2, svfloat64_t, double, ++ z0 = svmls_n_f64_z (p0, z1, z0, d4), ++ z0 = svmls_z (p0, z1, z0, d4)) ++ ++/* ++** mls_d4_f64_z_untied: ++** mov (z[0-9]+\.d), d4 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fmls z0\.d, p0/m, z2\.d, \1 ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** fmsb z0\.d, p0/m, \1, z1\.d ++** | ++** movprfx z0\.d, p0/z, \1 ++** fmsb z0\.d, p0/m, z2\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (mls_d4_f64_z_untied, svfloat64_t, double, ++ z0 = svmls_n_f64_z (p0, z1, z2, d4), ++ z0 = svmls_z (p0, z1, z2, d4)) ++ ++/* ++** mls_2_f64_z_tied1: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** movprfx z0\.d, p0/z, z0\.d ++** fmls z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mls_2_f64_z_tied1, svfloat64_t, ++ z0 = svmls_n_f64_z (p0, z0, z1, 2), ++ z0 = svmls_z (p0, z0, z1, 2)) ++ ++/* ++** mls_2_f64_z_tied2: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** movprfx z0\.d, p0/z, z0\.d ++** fmsb z0\.d, p0/m, \1, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mls_2_f64_z_tied2, svfloat64_t, ++ z0 = svmls_n_f64_z (p0, z1, z0, 2), ++ z0 = svmls_z (p0, z1, z0, 2)) ++ ++/* ++** mls_2_f64_z_untied: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fmls z0\.d, p0/m, z2\.d, \1 ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** fmsb z0\.d, p0/m, \1, z1\.d ++** | ++** movprfx z0\.d, p0/z, \1 ++** fmsb z0\.d, p0/m, z2\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mls_2_f64_z_untied, svfloat64_t, ++ z0 = svmls_n_f64_z (p0, z1, z2, 2), ++ z0 = svmls_z (p0, z1, z2, 2)) ++ ++/* ++** mls_f64_x_tied1: ++** fmls z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mls_f64_x_tied1, svfloat64_t, ++ z0 = svmls_f64_x (p0, z0, z1, z2), ++ z0 = svmls_x (p0, z0, z1, z2)) ++ ++/* ++** mls_f64_x_tied2: ++** fmsb z0\.d, p0/m, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mls_f64_x_tied2, svfloat64_t, ++ z0 = svmls_f64_x (p0, z1, z0, z2), ++ z0 = svmls_x (p0, z1, z0, z2)) ++ ++/* ++** mls_f64_x_tied3: ++** fmsb z0\.d, p0/m, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mls_f64_x_tied3, svfloat64_t, ++ z0 = svmls_f64_x (p0, z1, z2, z0), ++ z0 = svmls_x (p0, z1, z2, z0)) ++ ++/* ++** mls_f64_x_untied: ++** ( ++** movprfx z0, z1 ++** fmls z0\.d, p0/m, z2\.d, z3\.d ++** | ++** movprfx z0, z2 ++** fmsb z0\.d, p0/m, z3\.d, z1\.d ++** | ++** movprfx z0, z3 ++** fmsb z0\.d, p0/m, z2\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mls_f64_x_untied, svfloat64_t, ++ z0 = svmls_f64_x (p0, z1, z2, z3), ++ z0 = svmls_x (p0, z1, z2, z3)) ++ ++/* ++** mls_d4_f64_x_tied1: ++** mov (z[0-9]+\.d), d4 ++** fmls z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mls_d4_f64_x_tied1, svfloat64_t, double, ++ z0 = svmls_n_f64_x (p0, z0, z1, d4), ++ z0 = svmls_x (p0, z0, z1, d4)) ++ ++/* ++** mls_d4_f64_x_tied2: ++** mov (z[0-9]+\.d), d4 ++** fmsb z0\.d, p0/m, \1, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZD (mls_d4_f64_x_tied2, svfloat64_t, double, ++ z0 = svmls_n_f64_x (p0, z1, z0, d4), ++ z0 = svmls_x (p0, z1, z0, d4)) ++ ++/* ++** mls_d4_f64_x_untied: { xfail *-*-* } ++** mov z0\.d, d4 ++** fmsb z0\.d, p0/m, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZD (mls_d4_f64_x_untied, svfloat64_t, double, ++ z0 = svmls_n_f64_x (p0, z1, z2, d4), ++ z0 = svmls_x (p0, z1, z2, d4)) ++ ++/* ++** mls_2_f64_x_tied1: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** fmls z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mls_2_f64_x_tied1, svfloat64_t, ++ z0 = svmls_n_f64_x (p0, z0, z1, 2), ++ z0 = svmls_x (p0, z0, z1, 2)) ++ ++/* ++** mls_2_f64_x_tied2: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** fmsb z0\.d, p0/m, \1, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mls_2_f64_x_tied2, svfloat64_t, ++ z0 = svmls_n_f64_x (p0, z1, z0, 2), ++ z0 = svmls_x (p0, z1, z0, 2)) ++ ++/* ++** mls_2_f64_x_untied: ++** fmov z0\.d, #2\.0(?:e\+0)? ++** fmsb z0\.d, p0/m, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mls_2_f64_x_untied, svfloat64_t, ++ z0 = svmls_n_f64_x (p0, z1, z2, 2), ++ z0 = svmls_x (p0, z1, z2, 2)) ++ ++/* ++** ptrue_mls_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mls_f64_x_tied1, svfloat64_t, ++ z0 = svmls_f64_x (svptrue_b64 (), z0, z1, z2), ++ z0 = svmls_x (svptrue_b64 (), z0, z1, z2)) ++ ++/* ++** ptrue_mls_f64_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mls_f64_x_tied2, svfloat64_t, ++ z0 = svmls_f64_x (svptrue_b64 (), z1, z0, z2), ++ z0 = svmls_x (svptrue_b64 (), z1, z0, z2)) ++ ++/* ++** ptrue_mls_f64_x_tied3: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mls_f64_x_tied3, svfloat64_t, ++ z0 = svmls_f64_x (svptrue_b64 (), z1, z2, z0), ++ z0 = svmls_x (svptrue_b64 (), z1, z2, z0)) ++ ++/* ++** ptrue_mls_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mls_f64_x_untied, svfloat64_t, ++ z0 = svmls_f64_x (svptrue_b64 (), z1, z2, z3), ++ z0 = svmls_x (svptrue_b64 (), z1, z2, z3)) ++ ++/* ++** ptrue_mls_2_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mls_2_f64_x_tied1, svfloat64_t, ++ z0 = svmls_n_f64_x (svptrue_b64 (), z0, z1, 2), ++ z0 = svmls_x (svptrue_b64 (), z0, z1, 2)) ++ ++/* ++** ptrue_mls_2_f64_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mls_2_f64_x_tied2, svfloat64_t, ++ z0 = svmls_n_f64_x (svptrue_b64 (), z1, z0, 2), ++ z0 = svmls_x (svptrue_b64 (), z1, z0, 2)) ++ ++/* ++** ptrue_mls_2_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mls_2_f64_x_untied, svfloat64_t, ++ z0 = svmls_n_f64_x (svptrue_b64 (), z1, z2, 2), ++ z0 = svmls_x (svptrue_b64 (), z1, z2, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_lane_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_lane_f16.c +new file mode 100644 +index 000000000..832376d0b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_lane_f16.c +@@ -0,0 +1,128 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mls_lane_0_f16_tied1: ++** fmls z0\.h, z1\.h, z2\.h\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (mls_lane_0_f16_tied1, svfloat16_t, ++ z0 = svmls_lane_f16 (z0, z1, z2, 0), ++ z0 = svmls_lane (z0, z1, z2, 0)) ++ ++/* ++** mls_lane_0_f16_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fmls z0\.h, \1\.h, z2\.h\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (mls_lane_0_f16_tied2, svfloat16_t, ++ z0 = svmls_lane_f16 (z1, z0, z2, 0), ++ z0 = svmls_lane (z1, z0, z2, 0)) ++ ++/* ++** mls_lane_0_f16_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fmls z0\.h, z2\.h, \1\.h\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (mls_lane_0_f16_tied3, svfloat16_t, ++ z0 = svmls_lane_f16 (z1, z2, z0, 0), ++ z0 = svmls_lane (z1, z2, z0, 0)) ++ ++/* ++** mls_lane_0_f16_untied: ++** movprfx z0, z1 ++** fmls z0\.h, z2\.h, z3\.h\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (mls_lane_0_f16_untied, svfloat16_t, ++ z0 = svmls_lane_f16 (z1, z2, z3, 0), ++ z0 = svmls_lane (z1, z2, z3, 0)) ++ ++/* ++** mls_lane_1_f16: ++** fmls z0\.h, z1\.h, z2\.h\[1\] ++** ret ++*/ ++TEST_UNIFORM_Z (mls_lane_1_f16, svfloat16_t, ++ z0 = svmls_lane_f16 (z0, z1, z2, 1), ++ z0 = svmls_lane (z0, z1, z2, 1)) ++ ++/* ++** mls_lane_2_f16: ++** fmls z0\.h, z1\.h, z2\.h\[2\] ++** ret ++*/ ++TEST_UNIFORM_Z (mls_lane_2_f16, svfloat16_t, ++ z0 = svmls_lane_f16 (z0, z1, z2, 2), ++ z0 = svmls_lane (z0, z1, z2, 2)) ++ ++/* ++** mls_lane_3_f16: ++** fmls z0\.h, z1\.h, z2\.h\[3\] ++** ret ++*/ ++TEST_UNIFORM_Z (mls_lane_3_f16, svfloat16_t, ++ z0 = svmls_lane_f16 (z0, z1, z2, 3), ++ z0 = svmls_lane (z0, z1, z2, 3)) ++ ++/* ++** mls_lane_4_f16: ++** fmls z0\.h, z1\.h, z2\.h\[4\] ++** ret ++*/ ++TEST_UNIFORM_Z (mls_lane_4_f16, svfloat16_t, ++ z0 = svmls_lane_f16 (z0, z1, z2, 4), ++ z0 = svmls_lane (z0, z1, z2, 4)) ++ ++/* ++** mls_lane_5_f16: ++** fmls z0\.h, z1\.h, z2\.h\[5\] ++** ret ++*/ ++TEST_UNIFORM_Z (mls_lane_5_f16, svfloat16_t, ++ z0 = svmls_lane_f16 (z0, z1, z2, 5), ++ z0 = svmls_lane (z0, z1, z2, 5)) ++ ++/* ++** 
mls_lane_6_f16: ++** fmls z0\.h, z1\.h, z2\.h\[6\] ++** ret ++*/ ++TEST_UNIFORM_Z (mls_lane_6_f16, svfloat16_t, ++ z0 = svmls_lane_f16 (z0, z1, z2, 6), ++ z0 = svmls_lane (z0, z1, z2, 6)) ++ ++/* ++** mls_lane_7_f16: ++** fmls z0\.h, z1\.h, z2\.h\[7\] ++** ret ++*/ ++TEST_UNIFORM_Z (mls_lane_7_f16, svfloat16_t, ++ z0 = svmls_lane_f16 (z0, z1, z2, 7), ++ z0 = svmls_lane (z0, z1, z2, 7)) ++ ++/* ++** mls_lane_z7_f16: ++** fmls z0\.h, z1\.h, z7\.h\[7\] ++** ret ++*/ ++TEST_DUAL_Z (mls_lane_z7_f16, svfloat16_t, svfloat16_t, ++ z0 = svmls_lane_f16 (z0, z1, z7, 7), ++ z0 = svmls_lane (z0, z1, z7, 7)) ++ ++/* ++** mls_lane_z8_f16: ++** str d8, \[sp, -16\]! ++** mov (z[0-7])\.d, z8\.d ++** fmls z0\.h, z1\.h, \1\.h\[7\] ++** ldr d8, \[sp\], 16 ++** ret ++*/ ++TEST_DUAL_LANE_REG (mls_lane_z8_f16, svfloat16_t, svfloat16_t, z8, ++ z0 = svmls_lane_f16 (z0, z1, z8, 7), ++ z0 = svmls_lane (z0, z1, z8, 7)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_lane_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_lane_f32.c +new file mode 100644 +index 000000000..3244b972f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_lane_f32.c +@@ -0,0 +1,92 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mls_lane_0_f32_tied1: ++** fmls z0\.s, z1\.s, z2\.s\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (mls_lane_0_f32_tied1, svfloat32_t, ++ z0 = svmls_lane_f32 (z0, z1, z2, 0), ++ z0 = svmls_lane (z0, z1, z2, 0)) ++ ++/* ++** mls_lane_0_f32_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fmls z0\.s, \1\.s, z2\.s\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (mls_lane_0_f32_tied2, svfloat32_t, ++ z0 = svmls_lane_f32 (z1, z0, z2, 0), ++ z0 = svmls_lane (z1, z0, z2, 0)) ++ ++/* ++** mls_lane_0_f32_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fmls z0\.s, z2\.s, \1\.s\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (mls_lane_0_f32_tied3, svfloat32_t, ++ z0 = svmls_lane_f32 (z1, z2, z0, 0), ++ z0 = svmls_lane (z1, z2, z0, 0)) ++ ++/* ++** mls_lane_0_f32_untied: ++** movprfx z0, z1 ++** fmls z0\.s, z2\.s, z3\.s\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (mls_lane_0_f32_untied, svfloat32_t, ++ z0 = svmls_lane_f32 (z1, z2, z3, 0), ++ z0 = svmls_lane (z1, z2, z3, 0)) ++ ++/* ++** mls_lane_1_f32: ++** fmls z0\.s, z1\.s, z2\.s\[1\] ++** ret ++*/ ++TEST_UNIFORM_Z (mls_lane_1_f32, svfloat32_t, ++ z0 = svmls_lane_f32 (z0, z1, z2, 1), ++ z0 = svmls_lane (z0, z1, z2, 1)) ++ ++/* ++** mls_lane_2_f32: ++** fmls z0\.s, z1\.s, z2\.s\[2\] ++** ret ++*/ ++TEST_UNIFORM_Z (mls_lane_2_f32, svfloat32_t, ++ z0 = svmls_lane_f32 (z0, z1, z2, 2), ++ z0 = svmls_lane (z0, z1, z2, 2)) ++ ++/* ++** mls_lane_3_f32: ++** fmls z0\.s, z1\.s, z2\.s\[3\] ++** ret ++*/ ++TEST_UNIFORM_Z (mls_lane_3_f32, svfloat32_t, ++ z0 = svmls_lane_f32 (z0, z1, z2, 3), ++ z0 = svmls_lane (z0, z1, z2, 3)) ++ ++/* ++** mls_lane_z7_f32: ++** fmls z0\.s, z1\.s, z7\.s\[3\] ++** ret ++*/ ++TEST_DUAL_Z (mls_lane_z7_f32, svfloat32_t, svfloat32_t, ++ z0 = svmls_lane_f32 (z0, z1, z7, 3), ++ z0 = svmls_lane (z0, z1, z7, 3)) ++ ++/* ++** mls_lane_z8_f32: ++** str d8, \[sp, -16\]! 
++** mov (z[0-7])\.d, z8\.d ++** fmls z0\.s, z1\.s, \1\.s\[3\] ++** ldr d8, \[sp\], 16 ++** ret ++*/ ++TEST_DUAL_LANE_REG (mls_lane_z8_f32, svfloat32_t, svfloat32_t, z8, ++ z0 = svmls_lane_f32 (z0, z1, z8, 3), ++ z0 = svmls_lane (z0, z1, z8, 3)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_lane_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_lane_f64.c +new file mode 100644 +index 000000000..16f20ca53 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_lane_f64.c +@@ -0,0 +1,83 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mls_lane_0_f64_tied1: ++** fmls z0\.d, z1\.d, z2\.d\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (mls_lane_0_f64_tied1, svfloat64_t, ++ z0 = svmls_lane_f64 (z0, z1, z2, 0), ++ z0 = svmls_lane (z0, z1, z2, 0)) ++ ++/* ++** mls_lane_0_f64_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fmls z0\.d, \1, z2\.d\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (mls_lane_0_f64_tied2, svfloat64_t, ++ z0 = svmls_lane_f64 (z1, z0, z2, 0), ++ z0 = svmls_lane (z1, z0, z2, 0)) ++ ++/* ++** mls_lane_0_f64_tied3: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fmls z0\.d, z2\.d, \1\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (mls_lane_0_f64_tied3, svfloat64_t, ++ z0 = svmls_lane_f64 (z1, z2, z0, 0), ++ z0 = svmls_lane (z1, z2, z0, 0)) ++ ++/* ++** mls_lane_0_f64_untied: ++** movprfx z0, z1 ++** fmls z0\.d, z2\.d, z3\.d\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (mls_lane_0_f64_untied, svfloat64_t, ++ z0 = svmls_lane_f64 (z1, z2, z3, 0), ++ z0 = svmls_lane (z1, z2, z3, 0)) ++ ++/* ++** mls_lane_1_f64: ++** fmls z0\.d, z1\.d, z2\.d\[1\] ++** ret ++*/ ++TEST_UNIFORM_Z (mls_lane_1_f64, svfloat64_t, ++ z0 = svmls_lane_f64 (z0, z1, z2, 1), ++ z0 = svmls_lane (z0, z1, z2, 1)) ++ ++/* ++** mls_lane_z7_f64: ++** fmls z0\.d, z1\.d, z7\.d\[1\] ++** ret ++*/ ++TEST_DUAL_Z (mls_lane_z7_f64, svfloat64_t, svfloat64_t, ++ z0 = svmls_lane_f64 (z0, z1, z7, 1), ++ z0 = svmls_lane (z0, z1, z7, 1)) ++ ++/* ++** mls_lane_z15_f64: ++** str d15, \[sp, -16\]! 
++** fmls z0\.d, z1\.d, z15\.d\[1\] ++** ldr d15, \[sp\], 16 ++** ret ++*/ ++TEST_DUAL_LANE_REG (mls_lane_z15_f64, svfloat64_t, svfloat64_t, z15, ++ z0 = svmls_lane_f64 (z0, z1, z15, 1), ++ z0 = svmls_lane (z0, z1, z15, 1)) ++ ++/* ++** mls_lane_z16_f64: ++** mov (z[0-9]|z1[0-5])\.d, z16\.d ++** fmls z0\.d, z1\.d, \1\.d\[1\] ++** ret ++*/ ++TEST_DUAL_LANE_REG (mls_lane_z16_f64, svfloat64_t, svfloat64_t, z16, ++ z0 = svmls_lane_f64 (z0, z1, z16, 1), ++ z0 = svmls_lane (z0, z1, z16, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_s16.c +new file mode 100644 +index 000000000..e199829c4 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_s16.c +@@ -0,0 +1,321 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mls_s16_m_tied1: ++** mls z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mls_s16_m_tied1, svint16_t, ++ z0 = svmls_s16_m (p0, z0, z1, z2), ++ z0 = svmls_m (p0, z0, z1, z2)) ++ ++/* ++** mls_s16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** mls z0\.h, p0/m, \1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mls_s16_m_tied2, svint16_t, ++ z0 = svmls_s16_m (p0, z1, z0, z2), ++ z0 = svmls_m (p0, z1, z0, z2)) ++ ++/* ++** mls_s16_m_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** mls z0\.h, p0/m, z2\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mls_s16_m_tied3, svint16_t, ++ z0 = svmls_s16_m (p0, z1, z2, z0), ++ z0 = svmls_m (p0, z1, z2, z0)) ++ ++/* ++** mls_s16_m_untied: ++** movprfx z0, z1 ++** mls z0\.h, p0/m, z2\.h, z3\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mls_s16_m_untied, svint16_t, ++ z0 = svmls_s16_m (p0, z1, z2, z3), ++ z0 = svmls_m (p0, z1, z2, z3)) ++ ++/* ++** mls_w0_s16_m_tied1: ++** mov (z[0-9]+\.h), w0 ++** mls z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_w0_s16_m_tied1, svint16_t, int16_t, ++ z0 = svmls_n_s16_m (p0, z0, z1, x0), ++ z0 = svmls_m (p0, z0, z1, x0)) ++ ++/* ++** mls_w0_s16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), w0 ++** movprfx z0, z1 ++** mls z0\.h, p0/m, z2\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_w0_s16_m_untied, svint16_t, int16_t, ++ z0 = svmls_n_s16_m (p0, z1, z2, x0), ++ z0 = svmls_m (p0, z1, z2, x0)) ++ ++/* ++** mls_11_s16_m_tied1: ++** mov (z[0-9]+\.h), #11 ++** mls z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_s16_m_tied1, svint16_t, ++ z0 = svmls_n_s16_m (p0, z0, z1, 11), ++ z0 = svmls_m (p0, z0, z1, 11)) ++ ++/* ++** mls_11_s16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), #11 ++** movprfx z0, z1 ++** mls z0\.h, p0/m, z2\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_s16_m_untied, svint16_t, ++ z0 = svmls_n_s16_m (p0, z1, z2, 11), ++ z0 = svmls_m (p0, z1, z2, 11)) ++ ++/* ++** mls_s16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** mls z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mls_s16_z_tied1, svint16_t, ++ z0 = svmls_s16_z (p0, z0, z1, z2), ++ z0 = svmls_z (p0, z0, z1, z2)) ++ ++/* ++** mls_s16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** msb z0\.h, p0/m, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mls_s16_z_tied2, svint16_t, ++ z0 = svmls_s16_z (p0, z1, z0, z2), ++ z0 = svmls_z (p0, z1, z0, z2)) ++ ++/* ++** mls_s16_z_tied3: ++** movprfx z0\.h, p0/z, z0\.h ++** msb z0\.h, p0/m, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mls_s16_z_tied3, svint16_t, ++ z0 = svmls_s16_z (p0, z1, z2, z0), ++ z0 = svmls_z (p0, z1, z2, z0)) ++ ++/* ++** mls_s16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** mls 
z0\.h, p0/m, z2\.h, z3\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** msb z0\.h, p0/m, z3\.h, z1\.h ++** | ++** movprfx z0\.h, p0/z, z3\.h ++** msb z0\.h, p0/m, z2\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mls_s16_z_untied, svint16_t, ++ z0 = svmls_s16_z (p0, z1, z2, z3), ++ z0 = svmls_z (p0, z1, z2, z3)) ++ ++/* ++** mls_w0_s16_z_tied1: ++** mov (z[0-9]+\.h), w0 ++** movprfx z0\.h, p0/z, z0\.h ++** mls z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_w0_s16_z_tied1, svint16_t, int16_t, ++ z0 = svmls_n_s16_z (p0, z0, z1, x0), ++ z0 = svmls_z (p0, z0, z1, x0)) ++ ++/* ++** mls_w0_s16_z_tied2: ++** mov (z[0-9]+\.h), w0 ++** movprfx z0\.h, p0/z, z0\.h ++** msb z0\.h, p0/m, \1, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_w0_s16_z_tied2, svint16_t, int16_t, ++ z0 = svmls_n_s16_z (p0, z1, z0, x0), ++ z0 = svmls_z (p0, z1, z0, x0)) ++ ++/* ++** mls_w0_s16_z_untied: ++** mov (z[0-9]+\.h), w0 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** mls z0\.h, p0/m, z2\.h, \1 ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** msb z0\.h, p0/m, \1, z1\.h ++** | ++** movprfx z0\.h, p0/z, \1 ++** msb z0\.h, p0/m, z2\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_w0_s16_z_untied, svint16_t, int16_t, ++ z0 = svmls_n_s16_z (p0, z1, z2, x0), ++ z0 = svmls_z (p0, z1, z2, x0)) ++ ++/* ++** mls_11_s16_z_tied1: ++** mov (z[0-9]+\.h), #11 ++** movprfx z0\.h, p0/z, z0\.h ++** mls z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_s16_z_tied1, svint16_t, ++ z0 = svmls_n_s16_z (p0, z0, z1, 11), ++ z0 = svmls_z (p0, z0, z1, 11)) ++ ++/* ++** mls_11_s16_z_tied2: ++** mov (z[0-9]+\.h), #11 ++** movprfx z0\.h, p0/z, z0\.h ++** msb z0\.h, p0/m, \1, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_s16_z_tied2, svint16_t, ++ z0 = svmls_n_s16_z (p0, z1, z0, 11), ++ z0 = svmls_z (p0, z1, z0, 11)) ++ ++/* ++** mls_11_s16_z_untied: ++** mov (z[0-9]+\.h), #11 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** mls z0\.h, p0/m, z2\.h, \1 ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** msb z0\.h, p0/m, \1, z1\.h ++** | ++** movprfx z0\.h, p0/z, \1 ++** msb z0\.h, p0/m, z2\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_s16_z_untied, svint16_t, ++ z0 = svmls_n_s16_z (p0, z1, z2, 11), ++ z0 = svmls_z (p0, z1, z2, 11)) ++ ++/* ++** mls_s16_x_tied1: ++** mls z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mls_s16_x_tied1, svint16_t, ++ z0 = svmls_s16_x (p0, z0, z1, z2), ++ z0 = svmls_x (p0, z0, z1, z2)) ++ ++/* ++** mls_s16_x_tied2: ++** msb z0\.h, p0/m, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mls_s16_x_tied2, svint16_t, ++ z0 = svmls_s16_x (p0, z1, z0, z2), ++ z0 = svmls_x (p0, z1, z0, z2)) ++ ++/* ++** mls_s16_x_tied3: ++** msb z0\.h, p0/m, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mls_s16_x_tied3, svint16_t, ++ z0 = svmls_s16_x (p0, z1, z2, z0), ++ z0 = svmls_x (p0, z1, z2, z0)) ++ ++/* ++** mls_s16_x_untied: ++** ( ++** movprfx z0, z1 ++** mls z0\.h, p0/m, z2\.h, z3\.h ++** | ++** movprfx z0, z2 ++** msb z0\.h, p0/m, z3\.h, z1\.h ++** | ++** movprfx z0, z3 ++** msb z0\.h, p0/m, z2\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mls_s16_x_untied, svint16_t, ++ z0 = svmls_s16_x (p0, z1, z2, z3), ++ z0 = svmls_x (p0, z1, z2, z3)) ++ ++/* ++** mls_w0_s16_x_tied1: ++** mov (z[0-9]+\.h), w0 ++** mls z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_w0_s16_x_tied1, svint16_t, int16_t, ++ z0 = svmls_n_s16_x (p0, z0, z1, x0), ++ z0 = svmls_x (p0, z0, z1, x0)) ++ ++/* ++** mls_w0_s16_x_tied2: ++** mov (z[0-9]+\.h), w0 ++** msb z0\.h, p0/m, \1, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_w0_s16_x_tied2, 
svint16_t, int16_t, ++ z0 = svmls_n_s16_x (p0, z1, z0, x0), ++ z0 = svmls_x (p0, z1, z0, x0)) ++ ++/* ++** mls_w0_s16_x_untied: ++** mov z0\.h, w0 ++** msb z0\.h, p0/m, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_w0_s16_x_untied, svint16_t, int16_t, ++ z0 = svmls_n_s16_x (p0, z1, z2, x0), ++ z0 = svmls_x (p0, z1, z2, x0)) ++ ++/* ++** mls_11_s16_x_tied1: ++** mov (z[0-9]+\.h), #11 ++** mls z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_s16_x_tied1, svint16_t, ++ z0 = svmls_n_s16_x (p0, z0, z1, 11), ++ z0 = svmls_x (p0, z0, z1, 11)) ++ ++/* ++** mls_11_s16_x_tied2: ++** mov (z[0-9]+\.h), #11 ++** msb z0\.h, p0/m, \1, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_s16_x_tied2, svint16_t, ++ z0 = svmls_n_s16_x (p0, z1, z0, 11), ++ z0 = svmls_x (p0, z1, z0, 11)) ++ ++/* ++** mls_11_s16_x_untied: ++** mov z0\.h, #11 ++** msb z0\.h, p0/m, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_s16_x_untied, svint16_t, ++ z0 = svmls_n_s16_x (p0, z1, z2, 11), ++ z0 = svmls_x (p0, z1, z2, 11)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_s32.c +new file mode 100644 +index 000000000..fe386d01c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_s32.c +@@ -0,0 +1,321 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mls_s32_m_tied1: ++** mls z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mls_s32_m_tied1, svint32_t, ++ z0 = svmls_s32_m (p0, z0, z1, z2), ++ z0 = svmls_m (p0, z0, z1, z2)) ++ ++/* ++** mls_s32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** mls z0\.s, p0/m, \1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mls_s32_m_tied2, svint32_t, ++ z0 = svmls_s32_m (p0, z1, z0, z2), ++ z0 = svmls_m (p0, z1, z0, z2)) ++ ++/* ++** mls_s32_m_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** mls z0\.s, p0/m, z2\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mls_s32_m_tied3, svint32_t, ++ z0 = svmls_s32_m (p0, z1, z2, z0), ++ z0 = svmls_m (p0, z1, z2, z0)) ++ ++/* ++** mls_s32_m_untied: ++** movprfx z0, z1 ++** mls z0\.s, p0/m, z2\.s, z3\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mls_s32_m_untied, svint32_t, ++ z0 = svmls_s32_m (p0, z1, z2, z3), ++ z0 = svmls_m (p0, z1, z2, z3)) ++ ++/* ++** mls_w0_s32_m_tied1: ++** mov (z[0-9]+\.s), w0 ++** mls z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_w0_s32_m_tied1, svint32_t, int32_t, ++ z0 = svmls_n_s32_m (p0, z0, z1, x0), ++ z0 = svmls_m (p0, z0, z1, x0)) ++ ++/* ++** mls_w0_s32_m_untied: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0, z1 ++** mls z0\.s, p0/m, z2\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_w0_s32_m_untied, svint32_t, int32_t, ++ z0 = svmls_n_s32_m (p0, z1, z2, x0), ++ z0 = svmls_m (p0, z1, z2, x0)) ++ ++/* ++** mls_11_s32_m_tied1: ++** mov (z[0-9]+\.s), #11 ++** mls z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_s32_m_tied1, svint32_t, ++ z0 = svmls_n_s32_m (p0, z0, z1, 11), ++ z0 = svmls_m (p0, z0, z1, 11)) ++ ++/* ++** mls_11_s32_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.s), #11 ++** movprfx z0, z1 ++** mls z0\.s, p0/m, z2\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_s32_m_untied, svint32_t, ++ z0 = svmls_n_s32_m (p0, z1, z2, 11), ++ z0 = svmls_m (p0, z1, z2, 11)) ++ ++/* ++** mls_s32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** mls z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mls_s32_z_tied1, svint32_t, ++ z0 = svmls_s32_z (p0, z0, z1, z2), ++ z0 = svmls_z (p0, z0, z1, z2)) ++ ++/* ++** mls_s32_z_tied2: 
++** movprfx z0\.s, p0/z, z0\.s ++** msb z0\.s, p0/m, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mls_s32_z_tied2, svint32_t, ++ z0 = svmls_s32_z (p0, z1, z0, z2), ++ z0 = svmls_z (p0, z1, z0, z2)) ++ ++/* ++** mls_s32_z_tied3: ++** movprfx z0\.s, p0/z, z0\.s ++** msb z0\.s, p0/m, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mls_s32_z_tied3, svint32_t, ++ z0 = svmls_s32_z (p0, z1, z2, z0), ++ z0 = svmls_z (p0, z1, z2, z0)) ++ ++/* ++** mls_s32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** mls z0\.s, p0/m, z2\.s, z3\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** msb z0\.s, p0/m, z3\.s, z1\.s ++** | ++** movprfx z0\.s, p0/z, z3\.s ++** msb z0\.s, p0/m, z2\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mls_s32_z_untied, svint32_t, ++ z0 = svmls_s32_z (p0, z1, z2, z3), ++ z0 = svmls_z (p0, z1, z2, z3)) ++ ++/* ++** mls_w0_s32_z_tied1: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0\.s, p0/z, z0\.s ++** mls z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_w0_s32_z_tied1, svint32_t, int32_t, ++ z0 = svmls_n_s32_z (p0, z0, z1, x0), ++ z0 = svmls_z (p0, z0, z1, x0)) ++ ++/* ++** mls_w0_s32_z_tied2: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0\.s, p0/z, z0\.s ++** msb z0\.s, p0/m, \1, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_w0_s32_z_tied2, svint32_t, int32_t, ++ z0 = svmls_n_s32_z (p0, z1, z0, x0), ++ z0 = svmls_z (p0, z1, z0, x0)) ++ ++/* ++** mls_w0_s32_z_untied: ++** mov (z[0-9]+\.s), w0 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** mls z0\.s, p0/m, z2\.s, \1 ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** msb z0\.s, p0/m, \1, z1\.s ++** | ++** movprfx z0\.s, p0/z, \1 ++** msb z0\.s, p0/m, z2\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_w0_s32_z_untied, svint32_t, int32_t, ++ z0 = svmls_n_s32_z (p0, z1, z2, x0), ++ z0 = svmls_z (p0, z1, z2, x0)) ++ ++/* ++** mls_11_s32_z_tied1: ++** mov (z[0-9]+\.s), #11 ++** movprfx z0\.s, p0/z, z0\.s ++** mls z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_s32_z_tied1, svint32_t, ++ z0 = svmls_n_s32_z (p0, z0, z1, 11), ++ z0 = svmls_z (p0, z0, z1, 11)) ++ ++/* ++** mls_11_s32_z_tied2: ++** mov (z[0-9]+\.s), #11 ++** movprfx z0\.s, p0/z, z0\.s ++** msb z0\.s, p0/m, \1, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_s32_z_tied2, svint32_t, ++ z0 = svmls_n_s32_z (p0, z1, z0, 11), ++ z0 = svmls_z (p0, z1, z0, 11)) ++ ++/* ++** mls_11_s32_z_untied: ++** mov (z[0-9]+\.s), #11 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** mls z0\.s, p0/m, z2\.s, \1 ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** msb z0\.s, p0/m, \1, z1\.s ++** | ++** movprfx z0\.s, p0/z, \1 ++** msb z0\.s, p0/m, z2\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_s32_z_untied, svint32_t, ++ z0 = svmls_n_s32_z (p0, z1, z2, 11), ++ z0 = svmls_z (p0, z1, z2, 11)) ++ ++/* ++** mls_s32_x_tied1: ++** mls z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mls_s32_x_tied1, svint32_t, ++ z0 = svmls_s32_x (p0, z0, z1, z2), ++ z0 = svmls_x (p0, z0, z1, z2)) ++ ++/* ++** mls_s32_x_tied2: ++** msb z0\.s, p0/m, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mls_s32_x_tied2, svint32_t, ++ z0 = svmls_s32_x (p0, z1, z0, z2), ++ z0 = svmls_x (p0, z1, z0, z2)) ++ ++/* ++** mls_s32_x_tied3: ++** msb z0\.s, p0/m, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mls_s32_x_tied3, svint32_t, ++ z0 = svmls_s32_x (p0, z1, z2, z0), ++ z0 = svmls_x (p0, z1, z2, z0)) ++ ++/* ++** mls_s32_x_untied: ++** ( ++** movprfx z0, z1 ++** mls z0\.s, p0/m, z2\.s, z3\.s ++** | ++** movprfx z0, z2 ++** msb z0\.s, p0/m, z3\.s, z1\.s ++** | ++** movprfx z0, z3 ++** msb z0\.s, p0/m, z2\.s, z1\.s ++** ) ++** ret 
++*/ ++TEST_UNIFORM_Z (mls_s32_x_untied, svint32_t, ++ z0 = svmls_s32_x (p0, z1, z2, z3), ++ z0 = svmls_x (p0, z1, z2, z3)) ++ ++/* ++** mls_w0_s32_x_tied1: ++** mov (z[0-9]+\.s), w0 ++** mls z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_w0_s32_x_tied1, svint32_t, int32_t, ++ z0 = svmls_n_s32_x (p0, z0, z1, x0), ++ z0 = svmls_x (p0, z0, z1, x0)) ++ ++/* ++** mls_w0_s32_x_tied2: ++** mov (z[0-9]+\.s), w0 ++** msb z0\.s, p0/m, \1, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_w0_s32_x_tied2, svint32_t, int32_t, ++ z0 = svmls_n_s32_x (p0, z1, z0, x0), ++ z0 = svmls_x (p0, z1, z0, x0)) ++ ++/* ++** mls_w0_s32_x_untied: ++** mov z0\.s, w0 ++** msb z0\.s, p0/m, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_w0_s32_x_untied, svint32_t, int32_t, ++ z0 = svmls_n_s32_x (p0, z1, z2, x0), ++ z0 = svmls_x (p0, z1, z2, x0)) ++ ++/* ++** mls_11_s32_x_tied1: ++** mov (z[0-9]+\.s), #11 ++** mls z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_s32_x_tied1, svint32_t, ++ z0 = svmls_n_s32_x (p0, z0, z1, 11), ++ z0 = svmls_x (p0, z0, z1, 11)) ++ ++/* ++** mls_11_s32_x_tied2: ++** mov (z[0-9]+\.s), #11 ++** msb z0\.s, p0/m, \1, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_s32_x_tied2, svint32_t, ++ z0 = svmls_n_s32_x (p0, z1, z0, 11), ++ z0 = svmls_x (p0, z1, z0, 11)) ++ ++/* ++** mls_11_s32_x_untied: ++** mov z0\.s, #11 ++** msb z0\.s, p0/m, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_s32_x_untied, svint32_t, ++ z0 = svmls_n_s32_x (p0, z1, z2, 11), ++ z0 = svmls_x (p0, z1, z2, 11)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_s64.c +new file mode 100644 +index 000000000..2998d733f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_s64.c +@@ -0,0 +1,321 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mls_s64_m_tied1: ++** mls z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mls_s64_m_tied1, svint64_t, ++ z0 = svmls_s64_m (p0, z0, z1, z2), ++ z0 = svmls_m (p0, z0, z1, z2)) ++ ++/* ++** mls_s64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** mls z0\.d, p0/m, \1, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mls_s64_m_tied2, svint64_t, ++ z0 = svmls_s64_m (p0, z1, z0, z2), ++ z0 = svmls_m (p0, z1, z0, z2)) ++ ++/* ++** mls_s64_m_tied3: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** mls z0\.d, p0/m, z2\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mls_s64_m_tied3, svint64_t, ++ z0 = svmls_s64_m (p0, z1, z2, z0), ++ z0 = svmls_m (p0, z1, z2, z0)) ++ ++/* ++** mls_s64_m_untied: ++** movprfx z0, z1 ++** mls z0\.d, p0/m, z2\.d, z3\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mls_s64_m_untied, svint64_t, ++ z0 = svmls_s64_m (p0, z1, z2, z3), ++ z0 = svmls_m (p0, z1, z2, z3)) ++ ++/* ++** mls_x0_s64_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** mls z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_x0_s64_m_tied1, svint64_t, int64_t, ++ z0 = svmls_n_s64_m (p0, z0, z1, x0), ++ z0 = svmls_m (p0, z0, z1, x0)) ++ ++/* ++** mls_x0_s64_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** mls z0\.d, p0/m, z2\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_x0_s64_m_untied, svint64_t, int64_t, ++ z0 = svmls_n_s64_m (p0, z1, z2, x0), ++ z0 = svmls_m (p0, z1, z2, x0)) ++ ++/* ++** mls_11_s64_m_tied1: ++** mov (z[0-9]+\.d), #11 ++** mls z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_s64_m_tied1, svint64_t, ++ z0 = svmls_n_s64_m (p0, z0, z1, 11), ++ z0 = svmls_m (p0, z0, z1, 11)) ++ ++/* ++** 
mls_11_s64_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #11 ++** movprfx z0, z1 ++** mls z0\.d, p0/m, z2\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_s64_m_untied, svint64_t, ++ z0 = svmls_n_s64_m (p0, z1, z2, 11), ++ z0 = svmls_m (p0, z1, z2, 11)) ++ ++/* ++** mls_s64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** mls z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mls_s64_z_tied1, svint64_t, ++ z0 = svmls_s64_z (p0, z0, z1, z2), ++ z0 = svmls_z (p0, z0, z1, z2)) ++ ++/* ++** mls_s64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** msb z0\.d, p0/m, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mls_s64_z_tied2, svint64_t, ++ z0 = svmls_s64_z (p0, z1, z0, z2), ++ z0 = svmls_z (p0, z1, z0, z2)) ++ ++/* ++** mls_s64_z_tied3: ++** movprfx z0\.d, p0/z, z0\.d ++** msb z0\.d, p0/m, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mls_s64_z_tied3, svint64_t, ++ z0 = svmls_s64_z (p0, z1, z2, z0), ++ z0 = svmls_z (p0, z1, z2, z0)) ++ ++/* ++** mls_s64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** mls z0\.d, p0/m, z2\.d, z3\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** msb z0\.d, p0/m, z3\.d, z1\.d ++** | ++** movprfx z0\.d, p0/z, z3\.d ++** msb z0\.d, p0/m, z2\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mls_s64_z_untied, svint64_t, ++ z0 = svmls_s64_z (p0, z1, z2, z3), ++ z0 = svmls_z (p0, z1, z2, z3)) ++ ++/* ++** mls_x0_s64_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z0\.d ++** mls z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_x0_s64_z_tied1, svint64_t, int64_t, ++ z0 = svmls_n_s64_z (p0, z0, z1, x0), ++ z0 = svmls_z (p0, z0, z1, x0)) ++ ++/* ++** mls_x0_s64_z_tied2: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z0\.d ++** msb z0\.d, p0/m, \1, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_x0_s64_z_tied2, svint64_t, int64_t, ++ z0 = svmls_n_s64_z (p0, z1, z0, x0), ++ z0 = svmls_z (p0, z1, z0, x0)) ++ ++/* ++** mls_x0_s64_z_untied: ++** mov (z[0-9]+\.d), x0 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** mls z0\.d, p0/m, z2\.d, \1 ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** msb z0\.d, p0/m, \1, z1\.d ++** | ++** movprfx z0\.d, p0/z, \1 ++** msb z0\.d, p0/m, z2\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_x0_s64_z_untied, svint64_t, int64_t, ++ z0 = svmls_n_s64_z (p0, z1, z2, x0), ++ z0 = svmls_z (p0, z1, z2, x0)) ++ ++/* ++** mls_11_s64_z_tied1: ++** mov (z[0-9]+\.d), #11 ++** movprfx z0\.d, p0/z, z0\.d ++** mls z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_s64_z_tied1, svint64_t, ++ z0 = svmls_n_s64_z (p0, z0, z1, 11), ++ z0 = svmls_z (p0, z0, z1, 11)) ++ ++/* ++** mls_11_s64_z_tied2: ++** mov (z[0-9]+\.d), #11 ++** movprfx z0\.d, p0/z, z0\.d ++** msb z0\.d, p0/m, \1, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_s64_z_tied2, svint64_t, ++ z0 = svmls_n_s64_z (p0, z1, z0, 11), ++ z0 = svmls_z (p0, z1, z0, 11)) ++ ++/* ++** mls_11_s64_z_untied: ++** mov (z[0-9]+\.d), #11 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** mls z0\.d, p0/m, z2\.d, \1 ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** msb z0\.d, p0/m, \1, z1\.d ++** | ++** movprfx z0\.d, p0/z, \1 ++** msb z0\.d, p0/m, z2\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_s64_z_untied, svint64_t, ++ z0 = svmls_n_s64_z (p0, z1, z2, 11), ++ z0 = svmls_z (p0, z1, z2, 11)) ++ ++/* ++** mls_s64_x_tied1: ++** mls z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mls_s64_x_tied1, svint64_t, ++ z0 = svmls_s64_x (p0, z0, z1, z2), ++ z0 = svmls_x (p0, z0, z1, z2)) ++ ++/* ++** mls_s64_x_tied2: ++** msb z0\.d, p0/m, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mls_s64_x_tied2, 
svint64_t, ++ z0 = svmls_s64_x (p0, z1, z0, z2), ++ z0 = svmls_x (p0, z1, z0, z2)) ++ ++/* ++** mls_s64_x_tied3: ++** msb z0\.d, p0/m, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mls_s64_x_tied3, svint64_t, ++ z0 = svmls_s64_x (p0, z1, z2, z0), ++ z0 = svmls_x (p0, z1, z2, z0)) ++ ++/* ++** mls_s64_x_untied: ++** ( ++** movprfx z0, z1 ++** mls z0\.d, p0/m, z2\.d, z3\.d ++** | ++** movprfx z0, z2 ++** msb z0\.d, p0/m, z3\.d, z1\.d ++** | ++** movprfx z0, z3 ++** msb z0\.d, p0/m, z2\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mls_s64_x_untied, svint64_t, ++ z0 = svmls_s64_x (p0, z1, z2, z3), ++ z0 = svmls_x (p0, z1, z2, z3)) ++ ++/* ++** mls_x0_s64_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** mls z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_x0_s64_x_tied1, svint64_t, int64_t, ++ z0 = svmls_n_s64_x (p0, z0, z1, x0), ++ z0 = svmls_x (p0, z0, z1, x0)) ++ ++/* ++** mls_x0_s64_x_tied2: ++** mov (z[0-9]+\.d), x0 ++** msb z0\.d, p0/m, \1, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_x0_s64_x_tied2, svint64_t, int64_t, ++ z0 = svmls_n_s64_x (p0, z1, z0, x0), ++ z0 = svmls_x (p0, z1, z0, x0)) ++ ++/* ++** mls_x0_s64_x_untied: ++** mov z0\.d, x0 ++** msb z0\.d, p0/m, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_x0_s64_x_untied, svint64_t, int64_t, ++ z0 = svmls_n_s64_x (p0, z1, z2, x0), ++ z0 = svmls_x (p0, z1, z2, x0)) ++ ++/* ++** mls_11_s64_x_tied1: ++** mov (z[0-9]+\.d), #11 ++** mls z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_s64_x_tied1, svint64_t, ++ z0 = svmls_n_s64_x (p0, z0, z1, 11), ++ z0 = svmls_x (p0, z0, z1, 11)) ++ ++/* ++** mls_11_s64_x_tied2: ++** mov (z[0-9]+\.d), #11 ++** msb z0\.d, p0/m, \1, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_s64_x_tied2, svint64_t, ++ z0 = svmls_n_s64_x (p0, z1, z0, 11), ++ z0 = svmls_x (p0, z1, z0, 11)) ++ ++/* ++** mls_11_s64_x_untied: ++** mov z0\.d, #11 ++** msb z0\.d, p0/m, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_s64_x_untied, svint64_t, ++ z0 = svmls_n_s64_x (p0, z1, z2, 11), ++ z0 = svmls_x (p0, z1, z2, 11)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_s8.c +new file mode 100644 +index 000000000..c60c43145 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_s8.c +@@ -0,0 +1,321 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mls_s8_m_tied1: ++** mls z0\.b, p0/m, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mls_s8_m_tied1, svint8_t, ++ z0 = svmls_s8_m (p0, z0, z1, z2), ++ z0 = svmls_m (p0, z0, z1, z2)) ++ ++/* ++** mls_s8_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** mls z0\.b, p0/m, \1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mls_s8_m_tied2, svint8_t, ++ z0 = svmls_s8_m (p0, z1, z0, z2), ++ z0 = svmls_m (p0, z1, z0, z2)) ++ ++/* ++** mls_s8_m_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** mls z0\.b, p0/m, z2\.b, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mls_s8_m_tied3, svint8_t, ++ z0 = svmls_s8_m (p0, z1, z2, z0), ++ z0 = svmls_m (p0, z1, z2, z0)) ++ ++/* ++** mls_s8_m_untied: ++** movprfx z0, z1 ++** mls z0\.b, p0/m, z2\.b, z3\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mls_s8_m_untied, svint8_t, ++ z0 = svmls_s8_m (p0, z1, z2, z3), ++ z0 = svmls_m (p0, z1, z2, z3)) ++ ++/* ++** mls_w0_s8_m_tied1: ++** mov (z[0-9]+\.b), w0 ++** mls z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_w0_s8_m_tied1, svint8_t, int8_t, ++ z0 = svmls_n_s8_m (p0, z0, z1, x0), ++ z0 = svmls_m (p0, z0, z1, x0)) ++ ++/* ++** 
mls_w0_s8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), w0 ++** movprfx z0, z1 ++** mls z0\.b, p0/m, z2\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_w0_s8_m_untied, svint8_t, int8_t, ++ z0 = svmls_n_s8_m (p0, z1, z2, x0), ++ z0 = svmls_m (p0, z1, z2, x0)) ++ ++/* ++** mls_11_s8_m_tied1: ++** mov (z[0-9]+\.b), #11 ++** mls z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_s8_m_tied1, svint8_t, ++ z0 = svmls_n_s8_m (p0, z0, z1, 11), ++ z0 = svmls_m (p0, z0, z1, 11)) ++ ++/* ++** mls_11_s8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), #11 ++** movprfx z0, z1 ++** mls z0\.b, p0/m, z2\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_s8_m_untied, svint8_t, ++ z0 = svmls_n_s8_m (p0, z1, z2, 11), ++ z0 = svmls_m (p0, z1, z2, 11)) ++ ++/* ++** mls_s8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** mls z0\.b, p0/m, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mls_s8_z_tied1, svint8_t, ++ z0 = svmls_s8_z (p0, z0, z1, z2), ++ z0 = svmls_z (p0, z0, z1, z2)) ++ ++/* ++** mls_s8_z_tied2: ++** movprfx z0\.b, p0/z, z0\.b ++** msb z0\.b, p0/m, z2\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mls_s8_z_tied2, svint8_t, ++ z0 = svmls_s8_z (p0, z1, z0, z2), ++ z0 = svmls_z (p0, z1, z0, z2)) ++ ++/* ++** mls_s8_z_tied3: ++** movprfx z0\.b, p0/z, z0\.b ++** msb z0\.b, p0/m, z2\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mls_s8_z_tied3, svint8_t, ++ z0 = svmls_s8_z (p0, z1, z2, z0), ++ z0 = svmls_z (p0, z1, z2, z0)) ++ ++/* ++** mls_s8_z_untied: ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** mls z0\.b, p0/m, z2\.b, z3\.b ++** | ++** movprfx z0\.b, p0/z, z2\.b ++** msb z0\.b, p0/m, z3\.b, z1\.b ++** | ++** movprfx z0\.b, p0/z, z3\.b ++** msb z0\.b, p0/m, z2\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mls_s8_z_untied, svint8_t, ++ z0 = svmls_s8_z (p0, z1, z2, z3), ++ z0 = svmls_z (p0, z1, z2, z3)) ++ ++/* ++** mls_w0_s8_z_tied1: ++** mov (z[0-9]+\.b), w0 ++** movprfx z0\.b, p0/z, z0\.b ++** mls z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_w0_s8_z_tied1, svint8_t, int8_t, ++ z0 = svmls_n_s8_z (p0, z0, z1, x0), ++ z0 = svmls_z (p0, z0, z1, x0)) ++ ++/* ++** mls_w0_s8_z_tied2: ++** mov (z[0-9]+\.b), w0 ++** movprfx z0\.b, p0/z, z0\.b ++** msb z0\.b, p0/m, \1, z1\.b ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_w0_s8_z_tied2, svint8_t, int8_t, ++ z0 = svmls_n_s8_z (p0, z1, z0, x0), ++ z0 = svmls_z (p0, z1, z0, x0)) ++ ++/* ++** mls_w0_s8_z_untied: ++** mov (z[0-9]+\.b), w0 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** mls z0\.b, p0/m, z2\.b, \1 ++** | ++** movprfx z0\.b, p0/z, z2\.b ++** msb z0\.b, p0/m, \1, z1\.b ++** | ++** movprfx z0\.b, p0/z, \1 ++** msb z0\.b, p0/m, z2\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_w0_s8_z_untied, svint8_t, int8_t, ++ z0 = svmls_n_s8_z (p0, z1, z2, x0), ++ z0 = svmls_z (p0, z1, z2, x0)) ++ ++/* ++** mls_11_s8_z_tied1: ++** mov (z[0-9]+\.b), #11 ++** movprfx z0\.b, p0/z, z0\.b ++** mls z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_s8_z_tied1, svint8_t, ++ z0 = svmls_n_s8_z (p0, z0, z1, 11), ++ z0 = svmls_z (p0, z0, z1, 11)) ++ ++/* ++** mls_11_s8_z_tied2: ++** mov (z[0-9]+\.b), #11 ++** movprfx z0\.b, p0/z, z0\.b ++** msb z0\.b, p0/m, \1, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_s8_z_tied2, svint8_t, ++ z0 = svmls_n_s8_z (p0, z1, z0, 11), ++ z0 = svmls_z (p0, z1, z0, 11)) ++ ++/* ++** mls_11_s8_z_untied: ++** mov (z[0-9]+\.b), #11 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** mls z0\.b, p0/m, z2\.b, \1 ++** | ++** movprfx z0\.b, p0/z, z2\.b ++** msb z0\.b, p0/m, \1, z1\.b ++** | ++** movprfx z0\.b, p0/z, \1 ++** msb z0\.b, p0/m, z2\.b, z1\.b ++** 
) ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_s8_z_untied, svint8_t, ++ z0 = svmls_n_s8_z (p0, z1, z2, 11), ++ z0 = svmls_z (p0, z1, z2, 11)) ++ ++/* ++** mls_s8_x_tied1: ++** mls z0\.b, p0/m, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mls_s8_x_tied1, svint8_t, ++ z0 = svmls_s8_x (p0, z0, z1, z2), ++ z0 = svmls_x (p0, z0, z1, z2)) ++ ++/* ++** mls_s8_x_tied2: ++** msb z0\.b, p0/m, z2\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mls_s8_x_tied2, svint8_t, ++ z0 = svmls_s8_x (p0, z1, z0, z2), ++ z0 = svmls_x (p0, z1, z0, z2)) ++ ++/* ++** mls_s8_x_tied3: ++** msb z0\.b, p0/m, z2\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mls_s8_x_tied3, svint8_t, ++ z0 = svmls_s8_x (p0, z1, z2, z0), ++ z0 = svmls_x (p0, z1, z2, z0)) ++ ++/* ++** mls_s8_x_untied: ++** ( ++** movprfx z0, z1 ++** mls z0\.b, p0/m, z2\.b, z3\.b ++** | ++** movprfx z0, z2 ++** msb z0\.b, p0/m, z3\.b, z1\.b ++** | ++** movprfx z0, z3 ++** msb z0\.b, p0/m, z2\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mls_s8_x_untied, svint8_t, ++ z0 = svmls_s8_x (p0, z1, z2, z3), ++ z0 = svmls_x (p0, z1, z2, z3)) ++ ++/* ++** mls_w0_s8_x_tied1: ++** mov (z[0-9]+\.b), w0 ++** mls z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_w0_s8_x_tied1, svint8_t, int8_t, ++ z0 = svmls_n_s8_x (p0, z0, z1, x0), ++ z0 = svmls_x (p0, z0, z1, x0)) ++ ++/* ++** mls_w0_s8_x_tied2: ++** mov (z[0-9]+\.b), w0 ++** msb z0\.b, p0/m, \1, z1\.b ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_w0_s8_x_tied2, svint8_t, int8_t, ++ z0 = svmls_n_s8_x (p0, z1, z0, x0), ++ z0 = svmls_x (p0, z1, z0, x0)) ++ ++/* ++** mls_w0_s8_x_untied: ++** mov z0\.b, w0 ++** msb z0\.b, p0/m, z2\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_w0_s8_x_untied, svint8_t, int8_t, ++ z0 = svmls_n_s8_x (p0, z1, z2, x0), ++ z0 = svmls_x (p0, z1, z2, x0)) ++ ++/* ++** mls_11_s8_x_tied1: ++** mov (z[0-9]+\.b), #11 ++** mls z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_s8_x_tied1, svint8_t, ++ z0 = svmls_n_s8_x (p0, z0, z1, 11), ++ z0 = svmls_x (p0, z0, z1, 11)) ++ ++/* ++** mls_11_s8_x_tied2: ++** mov (z[0-9]+\.b), #11 ++** msb z0\.b, p0/m, \1, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_s8_x_tied2, svint8_t, ++ z0 = svmls_n_s8_x (p0, z1, z0, 11), ++ z0 = svmls_x (p0, z1, z0, 11)) ++ ++/* ++** mls_11_s8_x_untied: ++** mov z0\.b, #11 ++** msb z0\.b, p0/m, z2\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_s8_x_untied, svint8_t, ++ z0 = svmls_n_s8_x (p0, z1, z2, 11), ++ z0 = svmls_x (p0, z1, z2, 11)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_u16.c +new file mode 100644 +index 000000000..e8a9f5cd9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_u16.c +@@ -0,0 +1,321 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mls_u16_m_tied1: ++** mls z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mls_u16_m_tied1, svuint16_t, ++ z0 = svmls_u16_m (p0, z0, z1, z2), ++ z0 = svmls_m (p0, z0, z1, z2)) ++ ++/* ++** mls_u16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** mls z0\.h, p0/m, \1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mls_u16_m_tied2, svuint16_t, ++ z0 = svmls_u16_m (p0, z1, z0, z2), ++ z0 = svmls_m (p0, z1, z0, z2)) ++ ++/* ++** mls_u16_m_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** mls z0\.h, p0/m, z2\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mls_u16_m_tied3, svuint16_t, ++ z0 = svmls_u16_m (p0, z1, z2, z0), ++ z0 = svmls_m (p0, z1, z2, z0)) ++ ++/* ++** mls_u16_m_untied: ++** movprfx z0, 
z1 ++** mls z0\.h, p0/m, z2\.h, z3\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mls_u16_m_untied, svuint16_t, ++ z0 = svmls_u16_m (p0, z1, z2, z3), ++ z0 = svmls_m (p0, z1, z2, z3)) ++ ++/* ++** mls_w0_u16_m_tied1: ++** mov (z[0-9]+\.h), w0 ++** mls z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_w0_u16_m_tied1, svuint16_t, uint16_t, ++ z0 = svmls_n_u16_m (p0, z0, z1, x0), ++ z0 = svmls_m (p0, z0, z1, x0)) ++ ++/* ++** mls_w0_u16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), w0 ++** movprfx z0, z1 ++** mls z0\.h, p0/m, z2\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_w0_u16_m_untied, svuint16_t, uint16_t, ++ z0 = svmls_n_u16_m (p0, z1, z2, x0), ++ z0 = svmls_m (p0, z1, z2, x0)) ++ ++/* ++** mls_11_u16_m_tied1: ++** mov (z[0-9]+\.h), #11 ++** mls z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_u16_m_tied1, svuint16_t, ++ z0 = svmls_n_u16_m (p0, z0, z1, 11), ++ z0 = svmls_m (p0, z0, z1, 11)) ++ ++/* ++** mls_11_u16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), #11 ++** movprfx z0, z1 ++** mls z0\.h, p0/m, z2\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_u16_m_untied, svuint16_t, ++ z0 = svmls_n_u16_m (p0, z1, z2, 11), ++ z0 = svmls_m (p0, z1, z2, 11)) ++ ++/* ++** mls_u16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** mls z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mls_u16_z_tied1, svuint16_t, ++ z0 = svmls_u16_z (p0, z0, z1, z2), ++ z0 = svmls_z (p0, z0, z1, z2)) ++ ++/* ++** mls_u16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** msb z0\.h, p0/m, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mls_u16_z_tied2, svuint16_t, ++ z0 = svmls_u16_z (p0, z1, z0, z2), ++ z0 = svmls_z (p0, z1, z0, z2)) ++ ++/* ++** mls_u16_z_tied3: ++** movprfx z0\.h, p0/z, z0\.h ++** msb z0\.h, p0/m, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mls_u16_z_tied3, svuint16_t, ++ z0 = svmls_u16_z (p0, z1, z2, z0), ++ z0 = svmls_z (p0, z1, z2, z0)) ++ ++/* ++** mls_u16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** mls z0\.h, p0/m, z2\.h, z3\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** msb z0\.h, p0/m, z3\.h, z1\.h ++** | ++** movprfx z0\.h, p0/z, z3\.h ++** msb z0\.h, p0/m, z2\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mls_u16_z_untied, svuint16_t, ++ z0 = svmls_u16_z (p0, z1, z2, z3), ++ z0 = svmls_z (p0, z1, z2, z3)) ++ ++/* ++** mls_w0_u16_z_tied1: ++** mov (z[0-9]+\.h), w0 ++** movprfx z0\.h, p0/z, z0\.h ++** mls z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_w0_u16_z_tied1, svuint16_t, uint16_t, ++ z0 = svmls_n_u16_z (p0, z0, z1, x0), ++ z0 = svmls_z (p0, z0, z1, x0)) ++ ++/* ++** mls_w0_u16_z_tied2: ++** mov (z[0-9]+\.h), w0 ++** movprfx z0\.h, p0/z, z0\.h ++** msb z0\.h, p0/m, \1, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_w0_u16_z_tied2, svuint16_t, uint16_t, ++ z0 = svmls_n_u16_z (p0, z1, z0, x0), ++ z0 = svmls_z (p0, z1, z0, x0)) ++ ++/* ++** mls_w0_u16_z_untied: ++** mov (z[0-9]+\.h), w0 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** mls z0\.h, p0/m, z2\.h, \1 ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** msb z0\.h, p0/m, \1, z1\.h ++** | ++** movprfx z0\.h, p0/z, \1 ++** msb z0\.h, p0/m, z2\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_w0_u16_z_untied, svuint16_t, uint16_t, ++ z0 = svmls_n_u16_z (p0, z1, z2, x0), ++ z0 = svmls_z (p0, z1, z2, x0)) ++ ++/* ++** mls_11_u16_z_tied1: ++** mov (z[0-9]+\.h), #11 ++** movprfx z0\.h, p0/z, z0\.h ++** mls z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_u16_z_tied1, svuint16_t, ++ z0 = svmls_n_u16_z (p0, z0, z1, 11), ++ z0 = svmls_z (p0, z0, z1, 11)) ++ ++/* ++** mls_11_u16_z_tied2: ++** mov 
(z[0-9]+\.h), #11 ++** movprfx z0\.h, p0/z, z0\.h ++** msb z0\.h, p0/m, \1, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_u16_z_tied2, svuint16_t, ++ z0 = svmls_n_u16_z (p0, z1, z0, 11), ++ z0 = svmls_z (p0, z1, z0, 11)) ++ ++/* ++** mls_11_u16_z_untied: ++** mov (z[0-9]+\.h), #11 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** mls z0\.h, p0/m, z2\.h, \1 ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** msb z0\.h, p0/m, \1, z1\.h ++** | ++** movprfx z0\.h, p0/z, \1 ++** msb z0\.h, p0/m, z2\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_u16_z_untied, svuint16_t, ++ z0 = svmls_n_u16_z (p0, z1, z2, 11), ++ z0 = svmls_z (p0, z1, z2, 11)) ++ ++/* ++** mls_u16_x_tied1: ++** mls z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mls_u16_x_tied1, svuint16_t, ++ z0 = svmls_u16_x (p0, z0, z1, z2), ++ z0 = svmls_x (p0, z0, z1, z2)) ++ ++/* ++** mls_u16_x_tied2: ++** msb z0\.h, p0/m, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mls_u16_x_tied2, svuint16_t, ++ z0 = svmls_u16_x (p0, z1, z0, z2), ++ z0 = svmls_x (p0, z1, z0, z2)) ++ ++/* ++** mls_u16_x_tied3: ++** msb z0\.h, p0/m, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mls_u16_x_tied3, svuint16_t, ++ z0 = svmls_u16_x (p0, z1, z2, z0), ++ z0 = svmls_x (p0, z1, z2, z0)) ++ ++/* ++** mls_u16_x_untied: ++** ( ++** movprfx z0, z1 ++** mls z0\.h, p0/m, z2\.h, z3\.h ++** | ++** movprfx z0, z2 ++** msb z0\.h, p0/m, z3\.h, z1\.h ++** | ++** movprfx z0, z3 ++** msb z0\.h, p0/m, z2\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mls_u16_x_untied, svuint16_t, ++ z0 = svmls_u16_x (p0, z1, z2, z3), ++ z0 = svmls_x (p0, z1, z2, z3)) ++ ++/* ++** mls_w0_u16_x_tied1: ++** mov (z[0-9]+\.h), w0 ++** mls z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_w0_u16_x_tied1, svuint16_t, uint16_t, ++ z0 = svmls_n_u16_x (p0, z0, z1, x0), ++ z0 = svmls_x (p0, z0, z1, x0)) ++ ++/* ++** mls_w0_u16_x_tied2: ++** mov (z[0-9]+\.h), w0 ++** msb z0\.h, p0/m, \1, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_w0_u16_x_tied2, svuint16_t, uint16_t, ++ z0 = svmls_n_u16_x (p0, z1, z0, x0), ++ z0 = svmls_x (p0, z1, z0, x0)) ++ ++/* ++** mls_w0_u16_x_untied: ++** mov z0\.h, w0 ++** msb z0\.h, p0/m, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_w0_u16_x_untied, svuint16_t, uint16_t, ++ z0 = svmls_n_u16_x (p0, z1, z2, x0), ++ z0 = svmls_x (p0, z1, z2, x0)) ++ ++/* ++** mls_11_u16_x_tied1: ++** mov (z[0-9]+\.h), #11 ++** mls z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_u16_x_tied1, svuint16_t, ++ z0 = svmls_n_u16_x (p0, z0, z1, 11), ++ z0 = svmls_x (p0, z0, z1, 11)) ++ ++/* ++** mls_11_u16_x_tied2: ++** mov (z[0-9]+\.h), #11 ++** msb z0\.h, p0/m, \1, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_u16_x_tied2, svuint16_t, ++ z0 = svmls_n_u16_x (p0, z1, z0, 11), ++ z0 = svmls_x (p0, z1, z0, 11)) ++ ++/* ++** mls_11_u16_x_untied: ++** mov z0\.h, #11 ++** msb z0\.h, p0/m, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_u16_x_untied, svuint16_t, ++ z0 = svmls_n_u16_x (p0, z1, z2, 11), ++ z0 = svmls_x (p0, z1, z2, 11)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_u32.c +new file mode 100644 +index 000000000..47e885012 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_u32.c +@@ -0,0 +1,321 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mls_u32_m_tied1: ++** mls z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mls_u32_m_tied1, svuint32_t, ++ z0 = svmls_u32_m (p0, z0, z1, z2), ++ z0 = svmls_m 
(p0, z0, z1, z2)) ++ ++/* ++** mls_u32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** mls z0\.s, p0/m, \1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mls_u32_m_tied2, svuint32_t, ++ z0 = svmls_u32_m (p0, z1, z0, z2), ++ z0 = svmls_m (p0, z1, z0, z2)) ++ ++/* ++** mls_u32_m_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** mls z0\.s, p0/m, z2\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mls_u32_m_tied3, svuint32_t, ++ z0 = svmls_u32_m (p0, z1, z2, z0), ++ z0 = svmls_m (p0, z1, z2, z0)) ++ ++/* ++** mls_u32_m_untied: ++** movprfx z0, z1 ++** mls z0\.s, p0/m, z2\.s, z3\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mls_u32_m_untied, svuint32_t, ++ z0 = svmls_u32_m (p0, z1, z2, z3), ++ z0 = svmls_m (p0, z1, z2, z3)) ++ ++/* ++** mls_w0_u32_m_tied1: ++** mov (z[0-9]+\.s), w0 ++** mls z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_w0_u32_m_tied1, svuint32_t, uint32_t, ++ z0 = svmls_n_u32_m (p0, z0, z1, x0), ++ z0 = svmls_m (p0, z0, z1, x0)) ++ ++/* ++** mls_w0_u32_m_untied: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0, z1 ++** mls z0\.s, p0/m, z2\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_w0_u32_m_untied, svuint32_t, uint32_t, ++ z0 = svmls_n_u32_m (p0, z1, z2, x0), ++ z0 = svmls_m (p0, z1, z2, x0)) ++ ++/* ++** mls_11_u32_m_tied1: ++** mov (z[0-9]+\.s), #11 ++** mls z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_u32_m_tied1, svuint32_t, ++ z0 = svmls_n_u32_m (p0, z0, z1, 11), ++ z0 = svmls_m (p0, z0, z1, 11)) ++ ++/* ++** mls_11_u32_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.s), #11 ++** movprfx z0, z1 ++** mls z0\.s, p0/m, z2\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_u32_m_untied, svuint32_t, ++ z0 = svmls_n_u32_m (p0, z1, z2, 11), ++ z0 = svmls_m (p0, z1, z2, 11)) ++ ++/* ++** mls_u32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** mls z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mls_u32_z_tied1, svuint32_t, ++ z0 = svmls_u32_z (p0, z0, z1, z2), ++ z0 = svmls_z (p0, z0, z1, z2)) ++ ++/* ++** mls_u32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** msb z0\.s, p0/m, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mls_u32_z_tied2, svuint32_t, ++ z0 = svmls_u32_z (p0, z1, z0, z2), ++ z0 = svmls_z (p0, z1, z0, z2)) ++ ++/* ++** mls_u32_z_tied3: ++** movprfx z0\.s, p0/z, z0\.s ++** msb z0\.s, p0/m, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mls_u32_z_tied3, svuint32_t, ++ z0 = svmls_u32_z (p0, z1, z2, z0), ++ z0 = svmls_z (p0, z1, z2, z0)) ++ ++/* ++** mls_u32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** mls z0\.s, p0/m, z2\.s, z3\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** msb z0\.s, p0/m, z3\.s, z1\.s ++** | ++** movprfx z0\.s, p0/z, z3\.s ++** msb z0\.s, p0/m, z2\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mls_u32_z_untied, svuint32_t, ++ z0 = svmls_u32_z (p0, z1, z2, z3), ++ z0 = svmls_z (p0, z1, z2, z3)) ++ ++/* ++** mls_w0_u32_z_tied1: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0\.s, p0/z, z0\.s ++** mls z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_w0_u32_z_tied1, svuint32_t, uint32_t, ++ z0 = svmls_n_u32_z (p0, z0, z1, x0), ++ z0 = svmls_z (p0, z0, z1, x0)) ++ ++/* ++** mls_w0_u32_z_tied2: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0\.s, p0/z, z0\.s ++** msb z0\.s, p0/m, \1, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_w0_u32_z_tied2, svuint32_t, uint32_t, ++ z0 = svmls_n_u32_z (p0, z1, z0, x0), ++ z0 = svmls_z (p0, z1, z0, x0)) ++ ++/* ++** mls_w0_u32_z_untied: ++** mov (z[0-9]+\.s), w0 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** mls z0\.s, p0/m, z2\.s, \1 ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** msb z0\.s, p0/m, 
\1, z1\.s ++** | ++** movprfx z0\.s, p0/z, \1 ++** msb z0\.s, p0/m, z2\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_w0_u32_z_untied, svuint32_t, uint32_t, ++ z0 = svmls_n_u32_z (p0, z1, z2, x0), ++ z0 = svmls_z (p0, z1, z2, x0)) ++ ++/* ++** mls_11_u32_z_tied1: ++** mov (z[0-9]+\.s), #11 ++** movprfx z0\.s, p0/z, z0\.s ++** mls z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_u32_z_tied1, svuint32_t, ++ z0 = svmls_n_u32_z (p0, z0, z1, 11), ++ z0 = svmls_z (p0, z0, z1, 11)) ++ ++/* ++** mls_11_u32_z_tied2: ++** mov (z[0-9]+\.s), #11 ++** movprfx z0\.s, p0/z, z0\.s ++** msb z0\.s, p0/m, \1, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_u32_z_tied2, svuint32_t, ++ z0 = svmls_n_u32_z (p0, z1, z0, 11), ++ z0 = svmls_z (p0, z1, z0, 11)) ++ ++/* ++** mls_11_u32_z_untied: ++** mov (z[0-9]+\.s), #11 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** mls z0\.s, p0/m, z2\.s, \1 ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** msb z0\.s, p0/m, \1, z1\.s ++** | ++** movprfx z0\.s, p0/z, \1 ++** msb z0\.s, p0/m, z2\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_u32_z_untied, svuint32_t, ++ z0 = svmls_n_u32_z (p0, z1, z2, 11), ++ z0 = svmls_z (p0, z1, z2, 11)) ++ ++/* ++** mls_u32_x_tied1: ++** mls z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mls_u32_x_tied1, svuint32_t, ++ z0 = svmls_u32_x (p0, z0, z1, z2), ++ z0 = svmls_x (p0, z0, z1, z2)) ++ ++/* ++** mls_u32_x_tied2: ++** msb z0\.s, p0/m, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mls_u32_x_tied2, svuint32_t, ++ z0 = svmls_u32_x (p0, z1, z0, z2), ++ z0 = svmls_x (p0, z1, z0, z2)) ++ ++/* ++** mls_u32_x_tied3: ++** msb z0\.s, p0/m, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mls_u32_x_tied3, svuint32_t, ++ z0 = svmls_u32_x (p0, z1, z2, z0), ++ z0 = svmls_x (p0, z1, z2, z0)) ++ ++/* ++** mls_u32_x_untied: ++** ( ++** movprfx z0, z1 ++** mls z0\.s, p0/m, z2\.s, z3\.s ++** | ++** movprfx z0, z2 ++** msb z0\.s, p0/m, z3\.s, z1\.s ++** | ++** movprfx z0, z3 ++** msb z0\.s, p0/m, z2\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mls_u32_x_untied, svuint32_t, ++ z0 = svmls_u32_x (p0, z1, z2, z3), ++ z0 = svmls_x (p0, z1, z2, z3)) ++ ++/* ++** mls_w0_u32_x_tied1: ++** mov (z[0-9]+\.s), w0 ++** mls z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_w0_u32_x_tied1, svuint32_t, uint32_t, ++ z0 = svmls_n_u32_x (p0, z0, z1, x0), ++ z0 = svmls_x (p0, z0, z1, x0)) ++ ++/* ++** mls_w0_u32_x_tied2: ++** mov (z[0-9]+\.s), w0 ++** msb z0\.s, p0/m, \1, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_w0_u32_x_tied2, svuint32_t, uint32_t, ++ z0 = svmls_n_u32_x (p0, z1, z0, x0), ++ z0 = svmls_x (p0, z1, z0, x0)) ++ ++/* ++** mls_w0_u32_x_untied: ++** mov z0\.s, w0 ++** msb z0\.s, p0/m, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_w0_u32_x_untied, svuint32_t, uint32_t, ++ z0 = svmls_n_u32_x (p0, z1, z2, x0), ++ z0 = svmls_x (p0, z1, z2, x0)) ++ ++/* ++** mls_11_u32_x_tied1: ++** mov (z[0-9]+\.s), #11 ++** mls z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_u32_x_tied1, svuint32_t, ++ z0 = svmls_n_u32_x (p0, z0, z1, 11), ++ z0 = svmls_x (p0, z0, z1, 11)) ++ ++/* ++** mls_11_u32_x_tied2: ++** mov (z[0-9]+\.s), #11 ++** msb z0\.s, p0/m, \1, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_u32_x_tied2, svuint32_t, ++ z0 = svmls_n_u32_x (p0, z1, z0, 11), ++ z0 = svmls_x (p0, z1, z0, 11)) ++ ++/* ++** mls_11_u32_x_untied: ++** mov z0\.s, #11 ++** msb z0\.s, p0/m, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_u32_x_untied, svuint32_t, ++ z0 = svmls_n_u32_x (p0, z1, z2, 11), ++ z0 = svmls_x (p0, z1, z2, 11)) +diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_u64.c +new file mode 100644 +index 000000000..4d441b759 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_u64.c +@@ -0,0 +1,321 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mls_u64_m_tied1: ++** mls z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mls_u64_m_tied1, svuint64_t, ++ z0 = svmls_u64_m (p0, z0, z1, z2), ++ z0 = svmls_m (p0, z0, z1, z2)) ++ ++/* ++** mls_u64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** mls z0\.d, p0/m, \1, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mls_u64_m_tied2, svuint64_t, ++ z0 = svmls_u64_m (p0, z1, z0, z2), ++ z0 = svmls_m (p0, z1, z0, z2)) ++ ++/* ++** mls_u64_m_tied3: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** mls z0\.d, p0/m, z2\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mls_u64_m_tied3, svuint64_t, ++ z0 = svmls_u64_m (p0, z1, z2, z0), ++ z0 = svmls_m (p0, z1, z2, z0)) ++ ++/* ++** mls_u64_m_untied: ++** movprfx z0, z1 ++** mls z0\.d, p0/m, z2\.d, z3\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mls_u64_m_untied, svuint64_t, ++ z0 = svmls_u64_m (p0, z1, z2, z3), ++ z0 = svmls_m (p0, z1, z2, z3)) ++ ++/* ++** mls_x0_u64_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** mls z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_x0_u64_m_tied1, svuint64_t, uint64_t, ++ z0 = svmls_n_u64_m (p0, z0, z1, x0), ++ z0 = svmls_m (p0, z0, z1, x0)) ++ ++/* ++** mls_x0_u64_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** mls z0\.d, p0/m, z2\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_x0_u64_m_untied, svuint64_t, uint64_t, ++ z0 = svmls_n_u64_m (p0, z1, z2, x0), ++ z0 = svmls_m (p0, z1, z2, x0)) ++ ++/* ++** mls_11_u64_m_tied1: ++** mov (z[0-9]+\.d), #11 ++** mls z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_u64_m_tied1, svuint64_t, ++ z0 = svmls_n_u64_m (p0, z0, z1, 11), ++ z0 = svmls_m (p0, z0, z1, 11)) ++ ++/* ++** mls_11_u64_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #11 ++** movprfx z0, z1 ++** mls z0\.d, p0/m, z2\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_u64_m_untied, svuint64_t, ++ z0 = svmls_n_u64_m (p0, z1, z2, 11), ++ z0 = svmls_m (p0, z1, z2, 11)) ++ ++/* ++** mls_u64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** mls z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mls_u64_z_tied1, svuint64_t, ++ z0 = svmls_u64_z (p0, z0, z1, z2), ++ z0 = svmls_z (p0, z0, z1, z2)) ++ ++/* ++** mls_u64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** msb z0\.d, p0/m, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mls_u64_z_tied2, svuint64_t, ++ z0 = svmls_u64_z (p0, z1, z0, z2), ++ z0 = svmls_z (p0, z1, z0, z2)) ++ ++/* ++** mls_u64_z_tied3: ++** movprfx z0\.d, p0/z, z0\.d ++** msb z0\.d, p0/m, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mls_u64_z_tied3, svuint64_t, ++ z0 = svmls_u64_z (p0, z1, z2, z0), ++ z0 = svmls_z (p0, z1, z2, z0)) ++ ++/* ++** mls_u64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** mls z0\.d, p0/m, z2\.d, z3\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** msb z0\.d, p0/m, z3\.d, z1\.d ++** | ++** movprfx z0\.d, p0/z, z3\.d ++** msb z0\.d, p0/m, z2\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mls_u64_z_untied, svuint64_t, ++ z0 = svmls_u64_z (p0, z1, z2, z3), ++ z0 = svmls_z (p0, z1, z2, z3)) ++ ++/* ++** mls_x0_u64_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z0\.d ++** mls z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_x0_u64_z_tied1, svuint64_t, uint64_t, ++ 
z0 = svmls_n_u64_z (p0, z0, z1, x0), ++ z0 = svmls_z (p0, z0, z1, x0)) ++ ++/* ++** mls_x0_u64_z_tied2: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z0\.d ++** msb z0\.d, p0/m, \1, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_x0_u64_z_tied2, svuint64_t, uint64_t, ++ z0 = svmls_n_u64_z (p0, z1, z0, x0), ++ z0 = svmls_z (p0, z1, z0, x0)) ++ ++/* ++** mls_x0_u64_z_untied: ++** mov (z[0-9]+\.d), x0 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** mls z0\.d, p0/m, z2\.d, \1 ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** msb z0\.d, p0/m, \1, z1\.d ++** | ++** movprfx z0\.d, p0/z, \1 ++** msb z0\.d, p0/m, z2\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_x0_u64_z_untied, svuint64_t, uint64_t, ++ z0 = svmls_n_u64_z (p0, z1, z2, x0), ++ z0 = svmls_z (p0, z1, z2, x0)) ++ ++/* ++** mls_11_u64_z_tied1: ++** mov (z[0-9]+\.d), #11 ++** movprfx z0\.d, p0/z, z0\.d ++** mls z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_u64_z_tied1, svuint64_t, ++ z0 = svmls_n_u64_z (p0, z0, z1, 11), ++ z0 = svmls_z (p0, z0, z1, 11)) ++ ++/* ++** mls_11_u64_z_tied2: ++** mov (z[0-9]+\.d), #11 ++** movprfx z0\.d, p0/z, z0\.d ++** msb z0\.d, p0/m, \1, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_u64_z_tied2, svuint64_t, ++ z0 = svmls_n_u64_z (p0, z1, z0, 11), ++ z0 = svmls_z (p0, z1, z0, 11)) ++ ++/* ++** mls_11_u64_z_untied: ++** mov (z[0-9]+\.d), #11 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** mls z0\.d, p0/m, z2\.d, \1 ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** msb z0\.d, p0/m, \1, z1\.d ++** | ++** movprfx z0\.d, p0/z, \1 ++** msb z0\.d, p0/m, z2\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_u64_z_untied, svuint64_t, ++ z0 = svmls_n_u64_z (p0, z1, z2, 11), ++ z0 = svmls_z (p0, z1, z2, 11)) ++ ++/* ++** mls_u64_x_tied1: ++** mls z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mls_u64_x_tied1, svuint64_t, ++ z0 = svmls_u64_x (p0, z0, z1, z2), ++ z0 = svmls_x (p0, z0, z1, z2)) ++ ++/* ++** mls_u64_x_tied2: ++** msb z0\.d, p0/m, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mls_u64_x_tied2, svuint64_t, ++ z0 = svmls_u64_x (p0, z1, z0, z2), ++ z0 = svmls_x (p0, z1, z0, z2)) ++ ++/* ++** mls_u64_x_tied3: ++** msb z0\.d, p0/m, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mls_u64_x_tied3, svuint64_t, ++ z0 = svmls_u64_x (p0, z1, z2, z0), ++ z0 = svmls_x (p0, z1, z2, z0)) ++ ++/* ++** mls_u64_x_untied: ++** ( ++** movprfx z0, z1 ++** mls z0\.d, p0/m, z2\.d, z3\.d ++** | ++** movprfx z0, z2 ++** msb z0\.d, p0/m, z3\.d, z1\.d ++** | ++** movprfx z0, z3 ++** msb z0\.d, p0/m, z2\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mls_u64_x_untied, svuint64_t, ++ z0 = svmls_u64_x (p0, z1, z2, z3), ++ z0 = svmls_x (p0, z1, z2, z3)) ++ ++/* ++** mls_x0_u64_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** mls z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_x0_u64_x_tied1, svuint64_t, uint64_t, ++ z0 = svmls_n_u64_x (p0, z0, z1, x0), ++ z0 = svmls_x (p0, z0, z1, x0)) ++ ++/* ++** mls_x0_u64_x_tied2: ++** mov (z[0-9]+\.d), x0 ++** msb z0\.d, p0/m, \1, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_x0_u64_x_tied2, svuint64_t, uint64_t, ++ z0 = svmls_n_u64_x (p0, z1, z0, x0), ++ z0 = svmls_x (p0, z1, z0, x0)) ++ ++/* ++** mls_x0_u64_x_untied: ++** mov z0\.d, x0 ++** msb z0\.d, p0/m, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_x0_u64_x_untied, svuint64_t, uint64_t, ++ z0 = svmls_n_u64_x (p0, z1, z2, x0), ++ z0 = svmls_x (p0, z1, z2, x0)) ++ ++/* ++** mls_11_u64_x_tied1: ++** mov (z[0-9]+\.d), #11 ++** mls z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_u64_x_tied1, svuint64_t, ++ z0 = 
svmls_n_u64_x (p0, z0, z1, 11), ++ z0 = svmls_x (p0, z0, z1, 11)) ++ ++/* ++** mls_11_u64_x_tied2: ++** mov (z[0-9]+\.d), #11 ++** msb z0\.d, p0/m, \1, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_u64_x_tied2, svuint64_t, ++ z0 = svmls_n_u64_x (p0, z1, z0, 11), ++ z0 = svmls_x (p0, z1, z0, 11)) ++ ++/* ++** mls_11_u64_x_untied: ++** mov z0\.d, #11 ++** msb z0\.d, p0/m, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_u64_x_untied, svuint64_t, ++ z0 = svmls_n_u64_x (p0, z1, z2, 11), ++ z0 = svmls_x (p0, z1, z2, 11)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_u8.c +new file mode 100644 +index 000000000..0489aaa7c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_u8.c +@@ -0,0 +1,321 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mls_u8_m_tied1: ++** mls z0\.b, p0/m, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mls_u8_m_tied1, svuint8_t, ++ z0 = svmls_u8_m (p0, z0, z1, z2), ++ z0 = svmls_m (p0, z0, z1, z2)) ++ ++/* ++** mls_u8_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** mls z0\.b, p0/m, \1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mls_u8_m_tied2, svuint8_t, ++ z0 = svmls_u8_m (p0, z1, z0, z2), ++ z0 = svmls_m (p0, z1, z0, z2)) ++ ++/* ++** mls_u8_m_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** mls z0\.b, p0/m, z2\.b, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mls_u8_m_tied3, svuint8_t, ++ z0 = svmls_u8_m (p0, z1, z2, z0), ++ z0 = svmls_m (p0, z1, z2, z0)) ++ ++/* ++** mls_u8_m_untied: ++** movprfx z0, z1 ++** mls z0\.b, p0/m, z2\.b, z3\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mls_u8_m_untied, svuint8_t, ++ z0 = svmls_u8_m (p0, z1, z2, z3), ++ z0 = svmls_m (p0, z1, z2, z3)) ++ ++/* ++** mls_w0_u8_m_tied1: ++** mov (z[0-9]+\.b), w0 ++** mls z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_w0_u8_m_tied1, svuint8_t, uint8_t, ++ z0 = svmls_n_u8_m (p0, z0, z1, x0), ++ z0 = svmls_m (p0, z0, z1, x0)) ++ ++/* ++** mls_w0_u8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), w0 ++** movprfx z0, z1 ++** mls z0\.b, p0/m, z2\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_w0_u8_m_untied, svuint8_t, uint8_t, ++ z0 = svmls_n_u8_m (p0, z1, z2, x0), ++ z0 = svmls_m (p0, z1, z2, x0)) ++ ++/* ++** mls_11_u8_m_tied1: ++** mov (z[0-9]+\.b), #11 ++** mls z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_u8_m_tied1, svuint8_t, ++ z0 = svmls_n_u8_m (p0, z0, z1, 11), ++ z0 = svmls_m (p0, z0, z1, 11)) ++ ++/* ++** mls_11_u8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), #11 ++** movprfx z0, z1 ++** mls z0\.b, p0/m, z2\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_u8_m_untied, svuint8_t, ++ z0 = svmls_n_u8_m (p0, z1, z2, 11), ++ z0 = svmls_m (p0, z1, z2, 11)) ++ ++/* ++** mls_u8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** mls z0\.b, p0/m, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mls_u8_z_tied1, svuint8_t, ++ z0 = svmls_u8_z (p0, z0, z1, z2), ++ z0 = svmls_z (p0, z0, z1, z2)) ++ ++/* ++** mls_u8_z_tied2: ++** movprfx z0\.b, p0/z, z0\.b ++** msb z0\.b, p0/m, z2\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mls_u8_z_tied2, svuint8_t, ++ z0 = svmls_u8_z (p0, z1, z0, z2), ++ z0 = svmls_z (p0, z1, z0, z2)) ++ ++/* ++** mls_u8_z_tied3: ++** movprfx z0\.b, p0/z, z0\.b ++** msb z0\.b, p0/m, z2\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mls_u8_z_tied3, svuint8_t, ++ z0 = svmls_u8_z (p0, z1, z2, z0), ++ z0 = svmls_z (p0, z1, z2, z0)) ++ ++/* ++** mls_u8_z_untied: ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** mls z0\.b, 
p0/m, z2\.b, z3\.b ++** | ++** movprfx z0\.b, p0/z, z2\.b ++** msb z0\.b, p0/m, z3\.b, z1\.b ++** | ++** movprfx z0\.b, p0/z, z3\.b ++** msb z0\.b, p0/m, z2\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mls_u8_z_untied, svuint8_t, ++ z0 = svmls_u8_z (p0, z1, z2, z3), ++ z0 = svmls_z (p0, z1, z2, z3)) ++ ++/* ++** mls_w0_u8_z_tied1: ++** mov (z[0-9]+\.b), w0 ++** movprfx z0\.b, p0/z, z0\.b ++** mls z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_w0_u8_z_tied1, svuint8_t, uint8_t, ++ z0 = svmls_n_u8_z (p0, z0, z1, x0), ++ z0 = svmls_z (p0, z0, z1, x0)) ++ ++/* ++** mls_w0_u8_z_tied2: ++** mov (z[0-9]+\.b), w0 ++** movprfx z0\.b, p0/z, z0\.b ++** msb z0\.b, p0/m, \1, z1\.b ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_w0_u8_z_tied2, svuint8_t, uint8_t, ++ z0 = svmls_n_u8_z (p0, z1, z0, x0), ++ z0 = svmls_z (p0, z1, z0, x0)) ++ ++/* ++** mls_w0_u8_z_untied: ++** mov (z[0-9]+\.b), w0 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** mls z0\.b, p0/m, z2\.b, \1 ++** | ++** movprfx z0\.b, p0/z, z2\.b ++** msb z0\.b, p0/m, \1, z1\.b ++** | ++** movprfx z0\.b, p0/z, \1 ++** msb z0\.b, p0/m, z2\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_w0_u8_z_untied, svuint8_t, uint8_t, ++ z0 = svmls_n_u8_z (p0, z1, z2, x0), ++ z0 = svmls_z (p0, z1, z2, x0)) ++ ++/* ++** mls_11_u8_z_tied1: ++** mov (z[0-9]+\.b), #11 ++** movprfx z0\.b, p0/z, z0\.b ++** mls z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_u8_z_tied1, svuint8_t, ++ z0 = svmls_n_u8_z (p0, z0, z1, 11), ++ z0 = svmls_z (p0, z0, z1, 11)) ++ ++/* ++** mls_11_u8_z_tied2: ++** mov (z[0-9]+\.b), #11 ++** movprfx z0\.b, p0/z, z0\.b ++** msb z0\.b, p0/m, \1, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_u8_z_tied2, svuint8_t, ++ z0 = svmls_n_u8_z (p0, z1, z0, 11), ++ z0 = svmls_z (p0, z1, z0, 11)) ++ ++/* ++** mls_11_u8_z_untied: ++** mov (z[0-9]+\.b), #11 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** mls z0\.b, p0/m, z2\.b, \1 ++** | ++** movprfx z0\.b, p0/z, z2\.b ++** msb z0\.b, p0/m, \1, z1\.b ++** | ++** movprfx z0\.b, p0/z, \1 ++** msb z0\.b, p0/m, z2\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_u8_z_untied, svuint8_t, ++ z0 = svmls_n_u8_z (p0, z1, z2, 11), ++ z0 = svmls_z (p0, z1, z2, 11)) ++ ++/* ++** mls_u8_x_tied1: ++** mls z0\.b, p0/m, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mls_u8_x_tied1, svuint8_t, ++ z0 = svmls_u8_x (p0, z0, z1, z2), ++ z0 = svmls_x (p0, z0, z1, z2)) ++ ++/* ++** mls_u8_x_tied2: ++** msb z0\.b, p0/m, z2\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mls_u8_x_tied2, svuint8_t, ++ z0 = svmls_u8_x (p0, z1, z0, z2), ++ z0 = svmls_x (p0, z1, z0, z2)) ++ ++/* ++** mls_u8_x_tied3: ++** msb z0\.b, p0/m, z2\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mls_u8_x_tied3, svuint8_t, ++ z0 = svmls_u8_x (p0, z1, z2, z0), ++ z0 = svmls_x (p0, z1, z2, z0)) ++ ++/* ++** mls_u8_x_untied: ++** ( ++** movprfx z0, z1 ++** mls z0\.b, p0/m, z2\.b, z3\.b ++** | ++** movprfx z0, z2 ++** msb z0\.b, p0/m, z3\.b, z1\.b ++** | ++** movprfx z0, z3 ++** msb z0\.b, p0/m, z2\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mls_u8_x_untied, svuint8_t, ++ z0 = svmls_u8_x (p0, z1, z2, z3), ++ z0 = svmls_x (p0, z1, z2, z3)) ++ ++/* ++** mls_w0_u8_x_tied1: ++** mov (z[0-9]+\.b), w0 ++** mls z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_w0_u8_x_tied1, svuint8_t, uint8_t, ++ z0 = svmls_n_u8_x (p0, z0, z1, x0), ++ z0 = svmls_x (p0, z0, z1, x0)) ++ ++/* ++** mls_w0_u8_x_tied2: ++** mov (z[0-9]+\.b), w0 ++** msb z0\.b, p0/m, \1, z1\.b ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_w0_u8_x_tied2, svuint8_t, uint8_t, ++ z0 = svmls_n_u8_x (p0, 
z1, z0, x0), ++ z0 = svmls_x (p0, z1, z0, x0)) ++ ++/* ++** mls_w0_u8_x_untied: ++** mov z0\.b, w0 ++** msb z0\.b, p0/m, z2\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_ZX (mls_w0_u8_x_untied, svuint8_t, uint8_t, ++ z0 = svmls_n_u8_x (p0, z1, z2, x0), ++ z0 = svmls_x (p0, z1, z2, x0)) ++ ++/* ++** mls_11_u8_x_tied1: ++** mov (z[0-9]+\.b), #11 ++** mls z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_u8_x_tied1, svuint8_t, ++ z0 = svmls_n_u8_x (p0, z0, z1, 11), ++ z0 = svmls_x (p0, z0, z1, 11)) ++ ++/* ++** mls_11_u8_x_tied2: ++** mov (z[0-9]+\.b), #11 ++** msb z0\.b, p0/m, \1, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_u8_x_tied2, svuint8_t, ++ z0 = svmls_n_u8_x (p0, z1, z0, 11), ++ z0 = svmls_x (p0, z1, z0, 11)) ++ ++/* ++** mls_11_u8_x_untied: ++** mov z0\.b, #11 ++** msb z0\.b, p0/m, z2\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mls_11_u8_x_untied, svuint8_t, ++ z0 = svmls_n_u8_x (p0, z1, z2, 11), ++ z0 = svmls_x (p0, z1, z2, 11)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mmla_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mmla_f32.c +new file mode 100644 +index 000000000..f66dbf397 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mmla_f32.c +@@ -0,0 +1,46 @@ ++/* { dg-require-effective-target aarch64_asm_f32mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f32mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mmla_f32_tied1: ++** fmmla z0\.s, z4\.s, z5\.s ++** ret ++*/ ++TEST_DUAL_Z (mmla_f32_tied1, svfloat32_t, svfloat32_t, ++ z0 = svmmla_f32 (z0, z4, z5), ++ z0 = svmmla (z0, z4, z5)) ++ ++/* ++** mmla_f32_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** fmmla z0\.s, \1\.s, z1\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (mmla_f32_tied2, svfloat32_t, svfloat32_t, ++ z0_res = svmmla_f32 (z4, z0, z1), ++ z0_res = svmmla (z4, z0, z1)) ++ ++/* ++** mmla_f32_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** fmmla z0\.s, z1\.s, \1\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (mmla_f32_tied3, svfloat32_t, svfloat32_t, ++ z0_res = svmmla_f32 (z4, z1, z0), ++ z0_res = svmmla (z4, z1, z0)) ++ ++/* ++** mmla_f32_untied: ++** movprfx z0, z1 ++** fmmla z0\.s, z4\.s, z5\.s ++** ret ++*/ ++TEST_DUAL_Z (mmla_f32_untied, svfloat32_t, svfloat32_t, ++ z0 = svmmla_f32 (z1, z4, z5), ++ z0 = svmmla (z1, z4, z5)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mmla_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mmla_f64.c +new file mode 100644 +index 000000000..49dc0607c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mmla_f64.c +@@ -0,0 +1,46 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mmla_f64_tied1: ++** fmmla z0\.d, z4\.d, z5\.d ++** ret ++*/ ++TEST_DUAL_Z (mmla_f64_tied1, svfloat64_t, svfloat64_t, ++ z0 = svmmla_f64 (z0, z4, z5), ++ z0 = svmmla (z0, z4, z5)) ++ ++/* ++** mmla_f64_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z4 ++** fmmla z0\.d, \1, z1\.d ++** ret ++*/ ++TEST_DUAL_Z_REV (mmla_f64_tied2, svfloat64_t, svfloat64_t, ++ z0_res = svmmla_f64 (z4, z0, z1), ++ z0_res = svmmla (z4, z0, z1)) ++ ++/* ++** mmla_f64_tied3: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z4 ++** fmmla z0\.d, z1\.d, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (mmla_f64_tied3, svfloat64_t, svfloat64_t, ++ z0_res = svmmla_f64 (z4, z1, z0), ++ z0_res = svmmla (z4, 
z1, z0)) ++ ++/* ++** mmla_f64_untied: ++** movprfx z0, z1 ++** fmmla z0\.d, z4\.d, z5\.d ++** ret ++*/ ++TEST_DUAL_Z (mmla_f64_untied, svfloat64_t, svfloat64_t, ++ z0 = svmmla_f64 (z1, z4, z5), ++ z0 = svmmla (z1, z4, z5)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mmla_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mmla_s32.c +new file mode 100644 +index 000000000..e7ce009ac +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mmla_s32.c +@@ -0,0 +1,46 @@ ++/* { dg-require-effective-target aarch64_asm_i8mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+sve+i8mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mmla_s32_tied1: ++** smmla z0\.s, z4\.b, z5\.b ++** ret ++*/ ++TEST_DUAL_Z (mmla_s32_tied1, svint32_t, svint8_t, ++ z0 = svmmla_s32 (z0, z4, z5), ++ z0 = svmmla (z0, z4, z5)) ++ ++/* ++** mmla_s32_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** smmla z0\.s, \1\.b, z1\.b ++** ret ++*/ ++TEST_DUAL_Z_REV (mmla_s32_tied2, svint32_t, svint8_t, ++ z0_res = svmmla_s32 (z4, z0, z1), ++ z0_res = svmmla (z4, z0, z1)) ++ ++/* ++** mmla_s32_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** smmla z0\.s, z1\.b, \1\.b ++** ret ++*/ ++TEST_DUAL_Z_REV (mmla_s32_tied3, svint32_t, svint8_t, ++ z0_res = svmmla_s32 (z4, z1, z0), ++ z0_res = svmmla (z4, z1, z0)) ++ ++/* ++** mmla_s32_untied: ++** movprfx z0, z1 ++** smmla z0\.s, z4\.b, z5\.b ++** ret ++*/ ++TEST_DUAL_Z (mmla_s32_untied, svint32_t, svint8_t, ++ z0 = svmmla_s32 (z1, z4, z5), ++ z0 = svmmla (z1, z4, z5)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mmla_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mmla_u32.c +new file mode 100644 +index 000000000..81f5166fb +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mmla_u32.c +@@ -0,0 +1,46 @@ ++/* { dg-require-effective-target aarch64_asm_i8mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+sve+i8mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mmla_u32_tied1: ++** ummla z0\.s, z4\.b, z5\.b ++** ret ++*/ ++TEST_DUAL_Z (mmla_u32_tied1, svuint32_t, svuint8_t, ++ z0 = svmmla_u32 (z0, z4, z5), ++ z0 = svmmla (z0, z4, z5)) ++ ++/* ++** mmla_u32_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** ummla z0\.s, \1\.b, z1\.b ++** ret ++*/ ++TEST_DUAL_Z_REV (mmla_u32_tied2, svuint32_t, svuint8_t, ++ z0_res = svmmla_u32 (z4, z0, z1), ++ z0_res = svmmla (z4, z0, z1)) ++ ++/* ++** mmla_u32_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** ummla z0\.s, z1\.b, \1\.b ++** ret ++*/ ++TEST_DUAL_Z_REV (mmla_u32_tied3, svuint32_t, svuint8_t, ++ z0_res = svmmla_u32 (z4, z1, z0), ++ z0_res = svmmla (z4, z1, z0)) ++ ++/* ++** mmla_u32_untied: ++** movprfx z0, z1 ++** ummla z0\.s, z4\.b, z5\.b ++** ret ++*/ ++TEST_DUAL_Z (mmla_u32_untied, svuint32_t, svuint8_t, ++ z0 = svmmla_u32 (z1, z4, z5), ++ z0 = svmmla (z1, z4, z5)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mov_b.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mov_b.c +new file mode 100644 +index 000000000..6b78f348f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mov_b.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mov_b_z_tied1: ++** and p0\.b, (?:p3/z, p0\.b, p0\.b|p0/z, p3\.b, p3\.b) ++** ret ++*/ ++TEST_UNIFORM_P (mov_b_z_tied1, ++ p0 = svmov_b_z (p3, p0), 
++ p0 = svmov_z (p3, p0)) ++ ++/* ++** mov_b_z_untied: ++** and p0\.b, (?:p3/z, p1\.b, p1\.b|p1/z, p3\.b, p3\.b) ++** ret ++*/ ++TEST_UNIFORM_P (mov_b_z_untied, ++ p0 = svmov_b_z (p3, p1), ++ p0 = svmov_z (p3, p1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_f16.c +new file mode 100644 +index 000000000..fe11457c4 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_f16.c +@@ -0,0 +1,398 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** msb_f16_m_tied1: ++** fmsb z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (msb_f16_m_tied1, svfloat16_t, ++ z0 = svmsb_f16_m (p0, z0, z1, z2), ++ z0 = svmsb_m (p0, z0, z1, z2)) ++ ++/* ++** msb_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fmsb z0\.h, p0/m, \1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (msb_f16_m_tied2, svfloat16_t, ++ z0 = svmsb_f16_m (p0, z1, z0, z2), ++ z0 = svmsb_m (p0, z1, z0, z2)) ++ ++/* ++** msb_f16_m_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fmsb z0\.h, p0/m, z2\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (msb_f16_m_tied3, svfloat16_t, ++ z0 = svmsb_f16_m (p0, z1, z2, z0), ++ z0 = svmsb_m (p0, z1, z2, z0)) ++ ++/* ++** msb_f16_m_untied: ++** movprfx z0, z1 ++** fmsb z0\.h, p0/m, z2\.h, z3\.h ++** ret ++*/ ++TEST_UNIFORM_Z (msb_f16_m_untied, svfloat16_t, ++ z0 = svmsb_f16_m (p0, z1, z2, z3), ++ z0 = svmsb_m (p0, z1, z2, z3)) ++ ++/* ++** msb_h4_f16_m_tied1: ++** mov (z[0-9]+\.h), h4 ++** fmsb z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (msb_h4_f16_m_tied1, svfloat16_t, __fp16, ++ z0 = svmsb_n_f16_m (p0, z0, z1, d4), ++ z0 = svmsb_m (p0, z0, z1, d4)) ++ ++/* ++** msb_h4_f16_m_untied: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0, z1 ++** fmsb z0\.h, p0/m, z2\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (msb_h4_f16_m_untied, svfloat16_t, __fp16, ++ z0 = svmsb_n_f16_m (p0, z1, z2, d4), ++ z0 = svmsb_m (p0, z1, z2, d4)) ++ ++/* ++** msb_2_f16_m_tied1: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** fmsb z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_2_f16_m_tied1, svfloat16_t, ++ z0 = svmsb_n_f16_m (p0, z0, z1, 2), ++ z0 = svmsb_m (p0, z0, z1, 2)) ++ ++/* ++** msb_2_f16_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? 
++** movprfx z0, z1 ++** fmsb z0\.h, p0/m, z2\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_2_f16_m_untied, svfloat16_t, ++ z0 = svmsb_n_f16_m (p0, z1, z2, 2), ++ z0 = svmsb_m (p0, z1, z2, 2)) ++ ++/* ++** msb_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fmsb z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (msb_f16_z_tied1, svfloat16_t, ++ z0 = svmsb_f16_z (p0, z0, z1, z2), ++ z0 = svmsb_z (p0, z0, z1, z2)) ++ ++/* ++** msb_f16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** fmsb z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (msb_f16_z_tied2, svfloat16_t, ++ z0 = svmsb_f16_z (p0, z1, z0, z2), ++ z0 = svmsb_z (p0, z1, z0, z2)) ++ ++/* ++** msb_f16_z_tied3: ++** movprfx z0\.h, p0/z, z0\.h ++** fmls z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (msb_f16_z_tied3, svfloat16_t, ++ z0 = svmsb_f16_z (p0, z1, z2, z0), ++ z0 = svmsb_z (p0, z1, z2, z0)) ++ ++/* ++** msb_f16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fmsb z0\.h, p0/m, z2\.h, z3\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** fmsb z0\.h, p0/m, z1\.h, z3\.h ++** | ++** movprfx z0\.h, p0/z, z3\.h ++** fmls z0\.h, p0/m, z1\.h, z2\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (msb_f16_z_untied, svfloat16_t, ++ z0 = svmsb_f16_z (p0, z1, z2, z3), ++ z0 = svmsb_z (p0, z1, z2, z3)) ++ ++/* ++** msb_h4_f16_z_tied1: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0\.h, p0/z, z0\.h ++** fmsb z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (msb_h4_f16_z_tied1, svfloat16_t, __fp16, ++ z0 = svmsb_n_f16_z (p0, z0, z1, d4), ++ z0 = svmsb_z (p0, z0, z1, d4)) ++ ++/* ++** msb_h4_f16_z_tied2: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0\.h, p0/z, z0\.h ++** fmsb z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (msb_h4_f16_z_tied2, svfloat16_t, __fp16, ++ z0 = svmsb_n_f16_z (p0, z1, z0, d4), ++ z0 = svmsb_z (p0, z1, z0, d4)) ++ ++/* ++** msb_h4_f16_z_untied: ++** mov (z[0-9]+\.h), h4 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fmsb z0\.h, p0/m, z2\.h, \1 ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** fmsb z0\.h, p0/m, z1\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** fmls z0\.h, p0/m, z1\.h, z2\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (msb_h4_f16_z_untied, svfloat16_t, __fp16, ++ z0 = svmsb_n_f16_z (p0, z1, z2, d4), ++ z0 = svmsb_z (p0, z1, z2, d4)) ++ ++/* ++** msb_2_f16_z_tied1: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** movprfx z0\.h, p0/z, z0\.h ++** fmsb z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_2_f16_z_tied1, svfloat16_t, ++ z0 = svmsb_n_f16_z (p0, z0, z1, 2), ++ z0 = svmsb_z (p0, z0, z1, 2)) ++ ++/* ++** msb_2_f16_z_tied2: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** movprfx z0\.h, p0/z, z0\.h ++** fmsb z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_2_f16_z_tied2, svfloat16_t, ++ z0 = svmsb_n_f16_z (p0, z1, z0, 2), ++ z0 = svmsb_z (p0, z1, z0, 2)) ++ ++/* ++** msb_2_f16_z_untied: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? 
++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fmsb z0\.h, p0/m, z2\.h, \1 ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** fmsb z0\.h, p0/m, z1\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** fmls z0\.h, p0/m, z1\.h, z2\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (msb_2_f16_z_untied, svfloat16_t, ++ z0 = svmsb_n_f16_z (p0, z1, z2, 2), ++ z0 = svmsb_z (p0, z1, z2, 2)) ++ ++/* ++** msb_f16_x_tied1: ++** fmsb z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (msb_f16_x_tied1, svfloat16_t, ++ z0 = svmsb_f16_x (p0, z0, z1, z2), ++ z0 = svmsb_x (p0, z0, z1, z2)) ++ ++/* ++** msb_f16_x_tied2: ++** fmsb z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (msb_f16_x_tied2, svfloat16_t, ++ z0 = svmsb_f16_x (p0, z1, z0, z2), ++ z0 = svmsb_x (p0, z1, z0, z2)) ++ ++/* ++** msb_f16_x_tied3: ++** fmls z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (msb_f16_x_tied3, svfloat16_t, ++ z0 = svmsb_f16_x (p0, z1, z2, z0), ++ z0 = svmsb_x (p0, z1, z2, z0)) ++ ++/* ++** msb_f16_x_untied: ++** ( ++** movprfx z0, z1 ++** fmsb z0\.h, p0/m, z2\.h, z3\.h ++** | ++** movprfx z0, z2 ++** fmsb z0\.h, p0/m, z1\.h, z3\.h ++** | ++** movprfx z0, z3 ++** fmls z0\.h, p0/m, z1\.h, z2\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (msb_f16_x_untied, svfloat16_t, ++ z0 = svmsb_f16_x (p0, z1, z2, z3), ++ z0 = svmsb_x (p0, z1, z2, z3)) ++ ++/* ++** msb_h4_f16_x_tied1: ++** mov (z[0-9]+\.h), h4 ++** fmsb z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (msb_h4_f16_x_tied1, svfloat16_t, __fp16, ++ z0 = svmsb_n_f16_x (p0, z0, z1, d4), ++ z0 = svmsb_x (p0, z0, z1, d4)) ++ ++/* ++** msb_h4_f16_x_tied2: ++** mov (z[0-9]+\.h), h4 ++** fmsb z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (msb_h4_f16_x_tied2, svfloat16_t, __fp16, ++ z0 = svmsb_n_f16_x (p0, z1, z0, d4), ++ z0 = svmsb_x (p0, z1, z0, d4)) ++ ++/* ++** msb_h4_f16_x_untied: { xfail *-*-* } ++** mov z0\.h, h4 ++** fmls z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_ZD (msb_h4_f16_x_untied, svfloat16_t, __fp16, ++ z0 = svmsb_n_f16_x (p0, z1, z2, d4), ++ z0 = svmsb_x (p0, z1, z2, d4)) ++ ++/* ++** msb_2_f16_x_tied1: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** fmsb z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_2_f16_x_tied1, svfloat16_t, ++ z0 = svmsb_n_f16_x (p0, z0, z1, 2), ++ z0 = svmsb_x (p0, z0, z1, 2)) ++ ++/* ++** msb_2_f16_x_tied2: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** fmsb z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_2_f16_x_tied2, svfloat16_t, ++ z0 = svmsb_n_f16_x (p0, z1, z0, 2), ++ z0 = svmsb_x (p0, z1, z0, 2)) ++ ++/* ++** msb_2_f16_x_untied: ++** fmov z0\.h, #2\.0(?:e\+0)? ++** fmls z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (msb_2_f16_x_untied, svfloat16_t, ++ z0 = svmsb_n_f16_x (p0, z1, z2, 2), ++ z0 = svmsb_x (p0, z1, z2, 2)) ++ ++/* ++** ptrue_msb_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_msb_f16_x_tied1, svfloat16_t, ++ z0 = svmsb_f16_x (svptrue_b16 (), z0, z1, z2), ++ z0 = svmsb_x (svptrue_b16 (), z0, z1, z2)) ++ ++/* ++** ptrue_msb_f16_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_msb_f16_x_tied2, svfloat16_t, ++ z0 = svmsb_f16_x (svptrue_b16 (), z1, z0, z2), ++ z0 = svmsb_x (svptrue_b16 (), z1, z0, z2)) ++ ++/* ++** ptrue_msb_f16_x_tied3: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_msb_f16_x_tied3, svfloat16_t, ++ z0 = svmsb_f16_x (svptrue_b16 (), z1, z2, z0), ++ z0 = svmsb_x (svptrue_b16 (), z1, z2, z0)) ++ ++/* ++** ptrue_msb_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_msb_f16_x_untied, svfloat16_t, ++ z0 = svmsb_f16_x (svptrue_b16 (), z1, z2, z3), ++ z0 = svmsb_x (svptrue_b16 (), z1, z2, z3)) ++ ++/* ++** ptrue_msb_2_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_msb_2_f16_x_tied1, svfloat16_t, ++ z0 = svmsb_n_f16_x (svptrue_b16 (), z0, z1, 2), ++ z0 = svmsb_x (svptrue_b16 (), z0, z1, 2)) ++ ++/* ++** ptrue_msb_2_f16_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_msb_2_f16_x_tied2, svfloat16_t, ++ z0 = svmsb_n_f16_x (svptrue_b16 (), z1, z0, 2), ++ z0 = svmsb_x (svptrue_b16 (), z1, z0, 2)) ++ ++/* ++** ptrue_msb_2_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_msb_2_f16_x_untied, svfloat16_t, ++ z0 = svmsb_n_f16_x (svptrue_b16 (), z1, z2, 2), ++ z0 = svmsb_x (svptrue_b16 (), z1, z2, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_f32.c +new file mode 100644 +index 000000000..f7a9f2767 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_f32.c +@@ -0,0 +1,398 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** msb_f32_m_tied1: ++** fmsb z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (msb_f32_m_tied1, svfloat32_t, ++ z0 = svmsb_f32_m (p0, z0, z1, z2), ++ z0 = svmsb_m (p0, z0, z1, z2)) ++ ++/* ++** msb_f32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fmsb z0\.s, p0/m, \1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (msb_f32_m_tied2, svfloat32_t, ++ z0 = svmsb_f32_m (p0, z1, z0, z2), ++ z0 = svmsb_m (p0, z1, z0, z2)) ++ ++/* ++** msb_f32_m_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fmsb z0\.s, p0/m, z2\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (msb_f32_m_tied3, svfloat32_t, ++ z0 = svmsb_f32_m (p0, z1, z2, z0), ++ z0 = svmsb_m (p0, z1, z2, z0)) ++ ++/* ++** msb_f32_m_untied: ++** movprfx z0, z1 ++** fmsb z0\.s, p0/m, z2\.s, z3\.s ++** ret ++*/ ++TEST_UNIFORM_Z (msb_f32_m_untied, svfloat32_t, ++ z0 = svmsb_f32_m (p0, z1, z2, z3), ++ z0 = svmsb_m (p0, z1, z2, z3)) ++ ++/* ++** msb_s4_f32_m_tied1: ++** mov (z[0-9]+\.s), s4 ++** fmsb z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (msb_s4_f32_m_tied1, svfloat32_t, float, ++ z0 = svmsb_n_f32_m (p0, z0, z1, d4), ++ z0 = svmsb_m (p0, z0, z1, d4)) ++ ++/* ++** msb_s4_f32_m_untied: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0, z1 ++** fmsb z0\.s, p0/m, z2\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (msb_s4_f32_m_untied, svfloat32_t, float, ++ z0 = svmsb_n_f32_m (p0, z1, z2, d4), ++ z0 = svmsb_m (p0, z1, z2, d4)) ++ ++/* ++** msb_2_f32_m_tied1: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** fmsb z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_2_f32_m_tied1, svfloat32_t, ++ z0 = svmsb_n_f32_m (p0, z0, z1, 2), ++ z0 = svmsb_m (p0, z0, z1, 2)) ++ ++/* ++** msb_2_f32_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
++** movprfx z0, z1 ++** fmsb z0\.s, p0/m, z2\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_2_f32_m_untied, svfloat32_t, ++ z0 = svmsb_n_f32_m (p0, z1, z2, 2), ++ z0 = svmsb_m (p0, z1, z2, 2)) ++ ++/* ++** msb_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fmsb z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (msb_f32_z_tied1, svfloat32_t, ++ z0 = svmsb_f32_z (p0, z0, z1, z2), ++ z0 = svmsb_z (p0, z0, z1, z2)) ++ ++/* ++** msb_f32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** fmsb z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (msb_f32_z_tied2, svfloat32_t, ++ z0 = svmsb_f32_z (p0, z1, z0, z2), ++ z0 = svmsb_z (p0, z1, z0, z2)) ++ ++/* ++** msb_f32_z_tied3: ++** movprfx z0\.s, p0/z, z0\.s ++** fmls z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (msb_f32_z_tied3, svfloat32_t, ++ z0 = svmsb_f32_z (p0, z1, z2, z0), ++ z0 = svmsb_z (p0, z1, z2, z0)) ++ ++/* ++** msb_f32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fmsb z0\.s, p0/m, z2\.s, z3\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** fmsb z0\.s, p0/m, z1\.s, z3\.s ++** | ++** movprfx z0\.s, p0/z, z3\.s ++** fmls z0\.s, p0/m, z1\.s, z2\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (msb_f32_z_untied, svfloat32_t, ++ z0 = svmsb_f32_z (p0, z1, z2, z3), ++ z0 = svmsb_z (p0, z1, z2, z3)) ++ ++/* ++** msb_s4_f32_z_tied1: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0\.s, p0/z, z0\.s ++** fmsb z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (msb_s4_f32_z_tied1, svfloat32_t, float, ++ z0 = svmsb_n_f32_z (p0, z0, z1, d4), ++ z0 = svmsb_z (p0, z0, z1, d4)) ++ ++/* ++** msb_s4_f32_z_tied2: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0\.s, p0/z, z0\.s ++** fmsb z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (msb_s4_f32_z_tied2, svfloat32_t, float, ++ z0 = svmsb_n_f32_z (p0, z1, z0, d4), ++ z0 = svmsb_z (p0, z1, z0, d4)) ++ ++/* ++** msb_s4_f32_z_untied: ++** mov (z[0-9]+\.s), s4 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fmsb z0\.s, p0/m, z2\.s, \1 ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** fmsb z0\.s, p0/m, z1\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** fmls z0\.s, p0/m, z1\.s, z2\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (msb_s4_f32_z_untied, svfloat32_t, float, ++ z0 = svmsb_n_f32_z (p0, z1, z2, d4), ++ z0 = svmsb_z (p0, z1, z2, d4)) ++ ++/* ++** msb_2_f32_z_tied1: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** movprfx z0\.s, p0/z, z0\.s ++** fmsb z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_2_f32_z_tied1, svfloat32_t, ++ z0 = svmsb_n_f32_z (p0, z0, z1, 2), ++ z0 = svmsb_z (p0, z0, z1, 2)) ++ ++/* ++** msb_2_f32_z_tied2: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** movprfx z0\.s, p0/z, z0\.s ++** fmsb z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_2_f32_z_tied2, svfloat32_t, ++ z0 = svmsb_n_f32_z (p0, z1, z0, 2), ++ z0 = svmsb_z (p0, z1, z0, 2)) ++ ++/* ++** msb_2_f32_z_untied: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fmsb z0\.s, p0/m, z2\.s, \1 ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** fmsb z0\.s, p0/m, z1\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** fmls z0\.s, p0/m, z1\.s, z2\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (msb_2_f32_z_untied, svfloat32_t, ++ z0 = svmsb_n_f32_z (p0, z1, z2, 2), ++ z0 = svmsb_z (p0, z1, z2, 2)) ++ ++/* ++** msb_f32_x_tied1: ++** fmsb z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (msb_f32_x_tied1, svfloat32_t, ++ z0 = svmsb_f32_x (p0, z0, z1, z2), ++ z0 = svmsb_x (p0, z0, z1, z2)) ++ ++/* ++** msb_f32_x_tied2: ++** fmsb z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (msb_f32_x_tied2, svfloat32_t, ++ z0 = svmsb_f32_x (p0, z1, z0, z2), ++ z0 = svmsb_x (p0, z1, z0, z2)) ++ ++/* ++** msb_f32_x_tied3: ++** fmls z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (msb_f32_x_tied3, svfloat32_t, ++ z0 = svmsb_f32_x (p0, z1, z2, z0), ++ z0 = svmsb_x (p0, z1, z2, z0)) ++ ++/* ++** msb_f32_x_untied: ++** ( ++** movprfx z0, z1 ++** fmsb z0\.s, p0/m, z2\.s, z3\.s ++** | ++** movprfx z0, z2 ++** fmsb z0\.s, p0/m, z1\.s, z3\.s ++** | ++** movprfx z0, z3 ++** fmls z0\.s, p0/m, z1\.s, z2\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (msb_f32_x_untied, svfloat32_t, ++ z0 = svmsb_f32_x (p0, z1, z2, z3), ++ z0 = svmsb_x (p0, z1, z2, z3)) ++ ++/* ++** msb_s4_f32_x_tied1: ++** mov (z[0-9]+\.s), s4 ++** fmsb z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (msb_s4_f32_x_tied1, svfloat32_t, float, ++ z0 = svmsb_n_f32_x (p0, z0, z1, d4), ++ z0 = svmsb_x (p0, z0, z1, d4)) ++ ++/* ++** msb_s4_f32_x_tied2: ++** mov (z[0-9]+\.s), s4 ++** fmsb z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (msb_s4_f32_x_tied2, svfloat32_t, float, ++ z0 = svmsb_n_f32_x (p0, z1, z0, d4), ++ z0 = svmsb_x (p0, z1, z0, d4)) ++ ++/* ++** msb_s4_f32_x_untied: { xfail *-*-* } ++** mov z0\.s, s4 ++** fmls z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_ZD (msb_s4_f32_x_untied, svfloat32_t, float, ++ z0 = svmsb_n_f32_x (p0, z1, z2, d4), ++ z0 = svmsb_x (p0, z1, z2, d4)) ++ ++/* ++** msb_2_f32_x_tied1: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** fmsb z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_2_f32_x_tied1, svfloat32_t, ++ z0 = svmsb_n_f32_x (p0, z0, z1, 2), ++ z0 = svmsb_x (p0, z0, z1, 2)) ++ ++/* ++** msb_2_f32_x_tied2: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** fmsb z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_2_f32_x_tied2, svfloat32_t, ++ z0 = svmsb_n_f32_x (p0, z1, z0, 2), ++ z0 = svmsb_x (p0, z1, z0, 2)) ++ ++/* ++** msb_2_f32_x_untied: ++** fmov z0\.s, #2\.0(?:e\+0)? ++** fmls z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (msb_2_f32_x_untied, svfloat32_t, ++ z0 = svmsb_n_f32_x (p0, z1, z2, 2), ++ z0 = svmsb_x (p0, z1, z2, 2)) ++ ++/* ++** ptrue_msb_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_msb_f32_x_tied1, svfloat32_t, ++ z0 = svmsb_f32_x (svptrue_b32 (), z0, z1, z2), ++ z0 = svmsb_x (svptrue_b32 (), z0, z1, z2)) ++ ++/* ++** ptrue_msb_f32_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_msb_f32_x_tied2, svfloat32_t, ++ z0 = svmsb_f32_x (svptrue_b32 (), z1, z0, z2), ++ z0 = svmsb_x (svptrue_b32 (), z1, z0, z2)) ++ ++/* ++** ptrue_msb_f32_x_tied3: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_msb_f32_x_tied3, svfloat32_t, ++ z0 = svmsb_f32_x (svptrue_b32 (), z1, z2, z0), ++ z0 = svmsb_x (svptrue_b32 (), z1, z2, z0)) ++ ++/* ++** ptrue_msb_f32_x_untied: ++** ... 
++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_msb_f32_x_untied, svfloat32_t, ++ z0 = svmsb_f32_x (svptrue_b32 (), z1, z2, z3), ++ z0 = svmsb_x (svptrue_b32 (), z1, z2, z3)) ++ ++/* ++** ptrue_msb_2_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_msb_2_f32_x_tied1, svfloat32_t, ++ z0 = svmsb_n_f32_x (svptrue_b32 (), z0, z1, 2), ++ z0 = svmsb_x (svptrue_b32 (), z0, z1, 2)) ++ ++/* ++** ptrue_msb_2_f32_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_msb_2_f32_x_tied2, svfloat32_t, ++ z0 = svmsb_n_f32_x (svptrue_b32 (), z1, z0, 2), ++ z0 = svmsb_x (svptrue_b32 (), z1, z0, 2)) ++ ++/* ++** ptrue_msb_2_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_msb_2_f32_x_untied, svfloat32_t, ++ z0 = svmsb_n_f32_x (svptrue_b32 (), z1, z2, 2), ++ z0 = svmsb_x (svptrue_b32 (), z1, z2, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_f64.c +new file mode 100644 +index 000000000..e3ff414d8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_f64.c +@@ -0,0 +1,398 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** msb_f64_m_tied1: ++** fmsb z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (msb_f64_m_tied1, svfloat64_t, ++ z0 = svmsb_f64_m (p0, z0, z1, z2), ++ z0 = svmsb_m (p0, z0, z1, z2)) ++ ++/* ++** msb_f64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fmsb z0\.d, p0/m, \1, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (msb_f64_m_tied2, svfloat64_t, ++ z0 = svmsb_f64_m (p0, z1, z0, z2), ++ z0 = svmsb_m (p0, z1, z0, z2)) ++ ++/* ++** msb_f64_m_tied3: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fmsb z0\.d, p0/m, z2\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_f64_m_tied3, svfloat64_t, ++ z0 = svmsb_f64_m (p0, z1, z2, z0), ++ z0 = svmsb_m (p0, z1, z2, z0)) ++ ++/* ++** msb_f64_m_untied: ++** movprfx z0, z1 ++** fmsb z0\.d, p0/m, z2\.d, z3\.d ++** ret ++*/ ++TEST_UNIFORM_Z (msb_f64_m_untied, svfloat64_t, ++ z0 = svmsb_f64_m (p0, z1, z2, z3), ++ z0 = svmsb_m (p0, z1, z2, z3)) ++ ++/* ++** msb_d4_f64_m_tied1: ++** mov (z[0-9]+\.d), d4 ++** fmsb z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (msb_d4_f64_m_tied1, svfloat64_t, double, ++ z0 = svmsb_n_f64_m (p0, z0, z1, d4), ++ z0 = svmsb_m (p0, z0, z1, d4)) ++ ++/* ++** msb_d4_f64_m_untied: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0, z1 ++** fmsb z0\.d, p0/m, z2\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (msb_d4_f64_m_untied, svfloat64_t, double, ++ z0 = svmsb_n_f64_m (p0, z1, z2, d4), ++ z0 = svmsb_m (p0, z1, z2, d4)) ++ ++/* ++** msb_2_f64_m_tied1: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** fmsb z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_2_f64_m_tied1, svfloat64_t, ++ z0 = svmsb_n_f64_m (p0, z0, z1, 2), ++ z0 = svmsb_m (p0, z0, z1, 2)) ++ ++/* ++** msb_2_f64_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
++** movprfx z0, z1 ++** fmsb z0\.d, p0/m, z2\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_2_f64_m_untied, svfloat64_t, ++ z0 = svmsb_n_f64_m (p0, z1, z2, 2), ++ z0 = svmsb_m (p0, z1, z2, 2)) ++ ++/* ++** msb_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fmsb z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (msb_f64_z_tied1, svfloat64_t, ++ z0 = svmsb_f64_z (p0, z0, z1, z2), ++ z0 = svmsb_z (p0, z0, z1, z2)) ++ ++/* ++** msb_f64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** fmsb z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (msb_f64_z_tied2, svfloat64_t, ++ z0 = svmsb_f64_z (p0, z1, z0, z2), ++ z0 = svmsb_z (p0, z1, z0, z2)) ++ ++/* ++** msb_f64_z_tied3: ++** movprfx z0\.d, p0/z, z0\.d ++** fmls z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (msb_f64_z_tied3, svfloat64_t, ++ z0 = svmsb_f64_z (p0, z1, z2, z0), ++ z0 = svmsb_z (p0, z1, z2, z0)) ++ ++/* ++** msb_f64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fmsb z0\.d, p0/m, z2\.d, z3\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** fmsb z0\.d, p0/m, z1\.d, z3\.d ++** | ++** movprfx z0\.d, p0/z, z3\.d ++** fmls z0\.d, p0/m, z1\.d, z2\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (msb_f64_z_untied, svfloat64_t, ++ z0 = svmsb_f64_z (p0, z1, z2, z3), ++ z0 = svmsb_z (p0, z1, z2, z3)) ++ ++/* ++** msb_d4_f64_z_tied1: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0\.d, p0/z, z0\.d ++** fmsb z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (msb_d4_f64_z_tied1, svfloat64_t, double, ++ z0 = svmsb_n_f64_z (p0, z0, z1, d4), ++ z0 = svmsb_z (p0, z0, z1, d4)) ++ ++/* ++** msb_d4_f64_z_tied2: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0\.d, p0/z, z0\.d ++** fmsb z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (msb_d4_f64_z_tied2, svfloat64_t, double, ++ z0 = svmsb_n_f64_z (p0, z1, z0, d4), ++ z0 = svmsb_z (p0, z1, z0, d4)) ++ ++/* ++** msb_d4_f64_z_untied: ++** mov (z[0-9]+\.d), d4 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fmsb z0\.d, p0/m, z2\.d, \1 ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** fmsb z0\.d, p0/m, z1\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** fmls z0\.d, p0/m, z1\.d, z2\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (msb_d4_f64_z_untied, svfloat64_t, double, ++ z0 = svmsb_n_f64_z (p0, z1, z2, d4), ++ z0 = svmsb_z (p0, z1, z2, d4)) ++ ++/* ++** msb_2_f64_z_tied1: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** movprfx z0\.d, p0/z, z0\.d ++** fmsb z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_2_f64_z_tied1, svfloat64_t, ++ z0 = svmsb_n_f64_z (p0, z0, z1, 2), ++ z0 = svmsb_z (p0, z0, z1, 2)) ++ ++/* ++** msb_2_f64_z_tied2: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** movprfx z0\.d, p0/z, z0\.d ++** fmsb z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_2_f64_z_tied2, svfloat64_t, ++ z0 = svmsb_n_f64_z (p0, z1, z0, 2), ++ z0 = svmsb_z (p0, z1, z0, 2)) ++ ++/* ++** msb_2_f64_z_untied: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fmsb z0\.d, p0/m, z2\.d, \1 ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** fmsb z0\.d, p0/m, z1\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** fmls z0\.d, p0/m, z1\.d, z2\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (msb_2_f64_z_untied, svfloat64_t, ++ z0 = svmsb_n_f64_z (p0, z1, z2, 2), ++ z0 = svmsb_z (p0, z1, z2, 2)) ++ ++/* ++** msb_f64_x_tied1: ++** fmsb z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (msb_f64_x_tied1, svfloat64_t, ++ z0 = svmsb_f64_x (p0, z0, z1, z2), ++ z0 = svmsb_x (p0, z0, z1, z2)) ++ ++/* ++** msb_f64_x_tied2: ++** fmsb z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (msb_f64_x_tied2, svfloat64_t, ++ z0 = svmsb_f64_x (p0, z1, z0, z2), ++ z0 = svmsb_x (p0, z1, z0, z2)) ++ ++/* ++** msb_f64_x_tied3: ++** fmls z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (msb_f64_x_tied3, svfloat64_t, ++ z0 = svmsb_f64_x (p0, z1, z2, z0), ++ z0 = svmsb_x (p0, z1, z2, z0)) ++ ++/* ++** msb_f64_x_untied: ++** ( ++** movprfx z0, z1 ++** fmsb z0\.d, p0/m, z2\.d, z3\.d ++** | ++** movprfx z0, z2 ++** fmsb z0\.d, p0/m, z1\.d, z3\.d ++** | ++** movprfx z0, z3 ++** fmls z0\.d, p0/m, z1\.d, z2\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (msb_f64_x_untied, svfloat64_t, ++ z0 = svmsb_f64_x (p0, z1, z2, z3), ++ z0 = svmsb_x (p0, z1, z2, z3)) ++ ++/* ++** msb_d4_f64_x_tied1: ++** mov (z[0-9]+\.d), d4 ++** fmsb z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (msb_d4_f64_x_tied1, svfloat64_t, double, ++ z0 = svmsb_n_f64_x (p0, z0, z1, d4), ++ z0 = svmsb_x (p0, z0, z1, d4)) ++ ++/* ++** msb_d4_f64_x_tied2: ++** mov (z[0-9]+\.d), d4 ++** fmsb z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (msb_d4_f64_x_tied2, svfloat64_t, double, ++ z0 = svmsb_n_f64_x (p0, z1, z0, d4), ++ z0 = svmsb_x (p0, z1, z0, d4)) ++ ++/* ++** msb_d4_f64_x_untied: { xfail *-*-* } ++** mov z0\.d, d4 ++** fmls z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_ZD (msb_d4_f64_x_untied, svfloat64_t, double, ++ z0 = svmsb_n_f64_x (p0, z1, z2, d4), ++ z0 = svmsb_x (p0, z1, z2, d4)) ++ ++/* ++** msb_2_f64_x_tied1: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** fmsb z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_2_f64_x_tied1, svfloat64_t, ++ z0 = svmsb_n_f64_x (p0, z0, z1, 2), ++ z0 = svmsb_x (p0, z0, z1, 2)) ++ ++/* ++** msb_2_f64_x_tied2: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** fmsb z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_2_f64_x_tied2, svfloat64_t, ++ z0 = svmsb_n_f64_x (p0, z1, z0, 2), ++ z0 = svmsb_x (p0, z1, z0, 2)) ++ ++/* ++** msb_2_f64_x_untied: ++** fmov z0\.d, #2\.0(?:e\+0)? ++** fmls z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (msb_2_f64_x_untied, svfloat64_t, ++ z0 = svmsb_n_f64_x (p0, z1, z2, 2), ++ z0 = svmsb_x (p0, z1, z2, 2)) ++ ++/* ++** ptrue_msb_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_msb_f64_x_tied1, svfloat64_t, ++ z0 = svmsb_f64_x (svptrue_b64 (), z0, z1, z2), ++ z0 = svmsb_x (svptrue_b64 (), z0, z1, z2)) ++ ++/* ++** ptrue_msb_f64_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_msb_f64_x_tied2, svfloat64_t, ++ z0 = svmsb_f64_x (svptrue_b64 (), z1, z0, z2), ++ z0 = svmsb_x (svptrue_b64 (), z1, z0, z2)) ++ ++/* ++** ptrue_msb_f64_x_tied3: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_msb_f64_x_tied3, svfloat64_t, ++ z0 = svmsb_f64_x (svptrue_b64 (), z1, z2, z0), ++ z0 = svmsb_x (svptrue_b64 (), z1, z2, z0)) ++ ++/* ++** ptrue_msb_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_msb_f64_x_untied, svfloat64_t, ++ z0 = svmsb_f64_x (svptrue_b64 (), z1, z2, z3), ++ z0 = svmsb_x (svptrue_b64 (), z1, z2, z3)) ++ ++/* ++** ptrue_msb_2_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_msb_2_f64_x_tied1, svfloat64_t, ++ z0 = svmsb_n_f64_x (svptrue_b64 (), z0, z1, 2), ++ z0 = svmsb_x (svptrue_b64 (), z0, z1, 2)) ++ ++/* ++** ptrue_msb_2_f64_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_msb_2_f64_x_tied2, svfloat64_t, ++ z0 = svmsb_n_f64_x (svptrue_b64 (), z1, z0, 2), ++ z0 = svmsb_x (svptrue_b64 (), z1, z0, 2)) ++ ++/* ++** ptrue_msb_2_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_msb_2_f64_x_untied, svfloat64_t, ++ z0 = svmsb_n_f64_x (svptrue_b64 (), z1, z2, 2), ++ z0 = svmsb_x (svptrue_b64 (), z1, z2, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_s16.c +new file mode 100644 +index 000000000..56347cfb9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_s16.c +@@ -0,0 +1,321 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** msb_s16_m_tied1: ++** msb z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (msb_s16_m_tied1, svint16_t, ++ z0 = svmsb_s16_m (p0, z0, z1, z2), ++ z0 = svmsb_m (p0, z0, z1, z2)) ++ ++/* ++** msb_s16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** msb z0\.h, p0/m, \1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (msb_s16_m_tied2, svint16_t, ++ z0 = svmsb_s16_m (p0, z1, z0, z2), ++ z0 = svmsb_m (p0, z1, z0, z2)) ++ ++/* ++** msb_s16_m_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** msb z0\.h, p0/m, z2\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (msb_s16_m_tied3, svint16_t, ++ z0 = svmsb_s16_m (p0, z1, z2, z0), ++ z0 = svmsb_m (p0, z1, z2, z0)) ++ ++/* ++** msb_s16_m_untied: ++** movprfx z0, z1 ++** msb z0\.h, p0/m, z2\.h, z3\.h ++** ret ++*/ ++TEST_UNIFORM_Z (msb_s16_m_untied, svint16_t, ++ z0 = svmsb_s16_m (p0, z1, z2, z3), ++ z0 = svmsb_m (p0, z1, z2, z3)) ++ ++/* ++** msb_w0_s16_m_tied1: ++** mov (z[0-9]+\.h), w0 ++** msb z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_w0_s16_m_tied1, svint16_t, int16_t, ++ z0 = svmsb_n_s16_m (p0, z0, z1, x0), ++ z0 = svmsb_m (p0, z0, z1, x0)) ++ ++/* ++** msb_w0_s16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), w0 ++** movprfx z0, z1 ++** msb z0\.h, p0/m, z2\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_w0_s16_m_untied, svint16_t, int16_t, ++ z0 = svmsb_n_s16_m (p0, z1, z2, x0), ++ z0 = svmsb_m (p0, z1, z2, x0)) ++ ++/* ++** msb_11_s16_m_tied1: ++** mov (z[0-9]+\.h), #11 ++** msb z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_s16_m_tied1, svint16_t, ++ z0 = svmsb_n_s16_m (p0, z0, z1, 11), ++ z0 = svmsb_m (p0, z0, z1, 11)) ++ ++/* ++** msb_11_s16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), #11 ++** movprfx z0, z1 ++** msb z0\.h, p0/m, z2\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_s16_m_untied, svint16_t, ++ z0 = svmsb_n_s16_m (p0, z1, z2, 11), ++ z0 = svmsb_m (p0, z1, z2, 11)) ++ ++/* ++** msb_s16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** msb z0\.h, p0/m, z1\.h, z2\.h 
++** ret ++*/ ++TEST_UNIFORM_Z (msb_s16_z_tied1, svint16_t, ++ z0 = svmsb_s16_z (p0, z0, z1, z2), ++ z0 = svmsb_z (p0, z0, z1, z2)) ++ ++/* ++** msb_s16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** msb z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (msb_s16_z_tied2, svint16_t, ++ z0 = svmsb_s16_z (p0, z1, z0, z2), ++ z0 = svmsb_z (p0, z1, z0, z2)) ++ ++/* ++** msb_s16_z_tied3: ++** movprfx z0\.h, p0/z, z0\.h ++** mls z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (msb_s16_z_tied3, svint16_t, ++ z0 = svmsb_s16_z (p0, z1, z2, z0), ++ z0 = svmsb_z (p0, z1, z2, z0)) ++ ++/* ++** msb_s16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** msb z0\.h, p0/m, z2\.h, z3\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** msb z0\.h, p0/m, z1\.h, z3\.h ++** | ++** movprfx z0\.h, p0/z, z3\.h ++** mls z0\.h, p0/m, z1\.h, z2\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (msb_s16_z_untied, svint16_t, ++ z0 = svmsb_s16_z (p0, z1, z2, z3), ++ z0 = svmsb_z (p0, z1, z2, z3)) ++ ++/* ++** msb_w0_s16_z_tied1: ++** mov (z[0-9]+\.h), w0 ++** movprfx z0\.h, p0/z, z0\.h ++** msb z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_w0_s16_z_tied1, svint16_t, int16_t, ++ z0 = svmsb_n_s16_z (p0, z0, z1, x0), ++ z0 = svmsb_z (p0, z0, z1, x0)) ++ ++/* ++** msb_w0_s16_z_tied2: ++** mov (z[0-9]+\.h), w0 ++** movprfx z0\.h, p0/z, z0\.h ++** msb z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_w0_s16_z_tied2, svint16_t, int16_t, ++ z0 = svmsb_n_s16_z (p0, z1, z0, x0), ++ z0 = svmsb_z (p0, z1, z0, x0)) ++ ++/* ++** msb_w0_s16_z_untied: ++** mov (z[0-9]+\.h), w0 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** msb z0\.h, p0/m, z2\.h, \1 ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** msb z0\.h, p0/m, z1\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** mls z0\.h, p0/m, z1\.h, z2\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_w0_s16_z_untied, svint16_t, int16_t, ++ z0 = svmsb_n_s16_z (p0, z1, z2, x0), ++ z0 = svmsb_z (p0, z1, z2, x0)) ++ ++/* ++** msb_11_s16_z_tied1: ++** mov (z[0-9]+\.h), #11 ++** movprfx z0\.h, p0/z, z0\.h ++** msb z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_s16_z_tied1, svint16_t, ++ z0 = svmsb_n_s16_z (p0, z0, z1, 11), ++ z0 = svmsb_z (p0, z0, z1, 11)) ++ ++/* ++** msb_11_s16_z_tied2: ++** mov (z[0-9]+\.h), #11 ++** movprfx z0\.h, p0/z, z0\.h ++** msb z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_s16_z_tied2, svint16_t, ++ z0 = svmsb_n_s16_z (p0, z1, z0, 11), ++ z0 = svmsb_z (p0, z1, z0, 11)) ++ ++/* ++** msb_11_s16_z_untied: ++** mov (z[0-9]+\.h), #11 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** msb z0\.h, p0/m, z2\.h, \1 ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** msb z0\.h, p0/m, z1\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** mls z0\.h, p0/m, z1\.h, z2\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_s16_z_untied, svint16_t, ++ z0 = svmsb_n_s16_z (p0, z1, z2, 11), ++ z0 = svmsb_z (p0, z1, z2, 11)) ++ ++/* ++** msb_s16_x_tied1: ++** msb z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (msb_s16_x_tied1, svint16_t, ++ z0 = svmsb_s16_x (p0, z0, z1, z2), ++ z0 = svmsb_x (p0, z0, z1, z2)) ++ ++/* ++** msb_s16_x_tied2: ++** msb z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (msb_s16_x_tied2, svint16_t, ++ z0 = svmsb_s16_x (p0, z1, z0, z2), ++ z0 = svmsb_x (p0, z1, z0, z2)) ++ ++/* ++** msb_s16_x_tied3: ++** mls z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (msb_s16_x_tied3, svint16_t, ++ z0 = svmsb_s16_x (p0, z1, z2, z0), ++ z0 = svmsb_x (p0, z1, z2, z0)) ++ ++/* ++** msb_s16_x_untied: ++** ( ++** movprfx z0, z1 ++** msb 
z0\.h, p0/m, z2\.h, z3\.h ++** | ++** movprfx z0, z2 ++** msb z0\.h, p0/m, z1\.h, z3\.h ++** | ++** movprfx z0, z3 ++** mls z0\.h, p0/m, z1\.h, z2\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (msb_s16_x_untied, svint16_t, ++ z0 = svmsb_s16_x (p0, z1, z2, z3), ++ z0 = svmsb_x (p0, z1, z2, z3)) ++ ++/* ++** msb_w0_s16_x_tied1: ++** mov (z[0-9]+\.h), w0 ++** msb z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_w0_s16_x_tied1, svint16_t, int16_t, ++ z0 = svmsb_n_s16_x (p0, z0, z1, x0), ++ z0 = svmsb_x (p0, z0, z1, x0)) ++ ++/* ++** msb_w0_s16_x_tied2: ++** mov (z[0-9]+\.h), w0 ++** msb z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_w0_s16_x_tied2, svint16_t, int16_t, ++ z0 = svmsb_n_s16_x (p0, z1, z0, x0), ++ z0 = svmsb_x (p0, z1, z0, x0)) ++ ++/* ++** msb_w0_s16_x_untied: ++** mov z0\.h, w0 ++** mls z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_w0_s16_x_untied, svint16_t, int16_t, ++ z0 = svmsb_n_s16_x (p0, z1, z2, x0), ++ z0 = svmsb_x (p0, z1, z2, x0)) ++ ++/* ++** msb_11_s16_x_tied1: ++** mov (z[0-9]+\.h), #11 ++** msb z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_s16_x_tied1, svint16_t, ++ z0 = svmsb_n_s16_x (p0, z0, z1, 11), ++ z0 = svmsb_x (p0, z0, z1, 11)) ++ ++/* ++** msb_11_s16_x_tied2: ++** mov (z[0-9]+\.h), #11 ++** msb z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_s16_x_tied2, svint16_t, ++ z0 = svmsb_n_s16_x (p0, z1, z0, 11), ++ z0 = svmsb_x (p0, z1, z0, 11)) ++ ++/* ++** msb_11_s16_x_untied: ++** mov z0\.h, #11 ++** mls z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_s16_x_untied, svint16_t, ++ z0 = svmsb_n_s16_x (p0, z1, z2, 11), ++ z0 = svmsb_x (p0, z1, z2, 11)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_s32.c +new file mode 100644 +index 000000000..fb7a7815b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_s32.c +@@ -0,0 +1,321 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** msb_s32_m_tied1: ++** msb z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (msb_s32_m_tied1, svint32_t, ++ z0 = svmsb_s32_m (p0, z0, z1, z2), ++ z0 = svmsb_m (p0, z0, z1, z2)) ++ ++/* ++** msb_s32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** msb z0\.s, p0/m, \1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (msb_s32_m_tied2, svint32_t, ++ z0 = svmsb_s32_m (p0, z1, z0, z2), ++ z0 = svmsb_m (p0, z1, z0, z2)) ++ ++/* ++** msb_s32_m_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** msb z0\.s, p0/m, z2\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (msb_s32_m_tied3, svint32_t, ++ z0 = svmsb_s32_m (p0, z1, z2, z0), ++ z0 = svmsb_m (p0, z1, z2, z0)) ++ ++/* ++** msb_s32_m_untied: ++** movprfx z0, z1 ++** msb z0\.s, p0/m, z2\.s, z3\.s ++** ret ++*/ ++TEST_UNIFORM_Z (msb_s32_m_untied, svint32_t, ++ z0 = svmsb_s32_m (p0, z1, z2, z3), ++ z0 = svmsb_m (p0, z1, z2, z3)) ++ ++/* ++** msb_w0_s32_m_tied1: ++** mov (z[0-9]+\.s), w0 ++** msb z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_w0_s32_m_tied1, svint32_t, int32_t, ++ z0 = svmsb_n_s32_m (p0, z0, z1, x0), ++ z0 = svmsb_m (p0, z0, z1, x0)) ++ ++/* ++** msb_w0_s32_m_untied: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0, z1 ++** msb z0\.s, p0/m, z2\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_w0_s32_m_untied, svint32_t, int32_t, ++ z0 = svmsb_n_s32_m (p0, z1, z2, x0), ++ z0 = svmsb_m (p0, z1, z2, x0)) ++ ++/* ++** msb_11_s32_m_tied1: ++** mov (z[0-9]+\.s), #11 ++** msb z0\.s, p0/m, 
z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_s32_m_tied1, svint32_t, ++ z0 = svmsb_n_s32_m (p0, z0, z1, 11), ++ z0 = svmsb_m (p0, z0, z1, 11)) ++ ++/* ++** msb_11_s32_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.s), #11 ++** movprfx z0, z1 ++** msb z0\.s, p0/m, z2\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_s32_m_untied, svint32_t, ++ z0 = svmsb_n_s32_m (p0, z1, z2, 11), ++ z0 = svmsb_m (p0, z1, z2, 11)) ++ ++/* ++** msb_s32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** msb z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (msb_s32_z_tied1, svint32_t, ++ z0 = svmsb_s32_z (p0, z0, z1, z2), ++ z0 = svmsb_z (p0, z0, z1, z2)) ++ ++/* ++** msb_s32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** msb z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (msb_s32_z_tied2, svint32_t, ++ z0 = svmsb_s32_z (p0, z1, z0, z2), ++ z0 = svmsb_z (p0, z1, z0, z2)) ++ ++/* ++** msb_s32_z_tied3: ++** movprfx z0\.s, p0/z, z0\.s ++** mls z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (msb_s32_z_tied3, svint32_t, ++ z0 = svmsb_s32_z (p0, z1, z2, z0), ++ z0 = svmsb_z (p0, z1, z2, z0)) ++ ++/* ++** msb_s32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** msb z0\.s, p0/m, z2\.s, z3\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** msb z0\.s, p0/m, z1\.s, z3\.s ++** | ++** movprfx z0\.s, p0/z, z3\.s ++** mls z0\.s, p0/m, z1\.s, z2\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (msb_s32_z_untied, svint32_t, ++ z0 = svmsb_s32_z (p0, z1, z2, z3), ++ z0 = svmsb_z (p0, z1, z2, z3)) ++ ++/* ++** msb_w0_s32_z_tied1: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0\.s, p0/z, z0\.s ++** msb z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_w0_s32_z_tied1, svint32_t, int32_t, ++ z0 = svmsb_n_s32_z (p0, z0, z1, x0), ++ z0 = svmsb_z (p0, z0, z1, x0)) ++ ++/* ++** msb_w0_s32_z_tied2: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0\.s, p0/z, z0\.s ++** msb z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_w0_s32_z_tied2, svint32_t, int32_t, ++ z0 = svmsb_n_s32_z (p0, z1, z0, x0), ++ z0 = svmsb_z (p0, z1, z0, x0)) ++ ++/* ++** msb_w0_s32_z_untied: ++** mov (z[0-9]+\.s), w0 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** msb z0\.s, p0/m, z2\.s, \1 ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** msb z0\.s, p0/m, z1\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** mls z0\.s, p0/m, z1\.s, z2\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_w0_s32_z_untied, svint32_t, int32_t, ++ z0 = svmsb_n_s32_z (p0, z1, z2, x0), ++ z0 = svmsb_z (p0, z1, z2, x0)) ++ ++/* ++** msb_11_s32_z_tied1: ++** mov (z[0-9]+\.s), #11 ++** movprfx z0\.s, p0/z, z0\.s ++** msb z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_s32_z_tied1, svint32_t, ++ z0 = svmsb_n_s32_z (p0, z0, z1, 11), ++ z0 = svmsb_z (p0, z0, z1, 11)) ++ ++/* ++** msb_11_s32_z_tied2: ++** mov (z[0-9]+\.s), #11 ++** movprfx z0\.s, p0/z, z0\.s ++** msb z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_s32_z_tied2, svint32_t, ++ z0 = svmsb_n_s32_z (p0, z1, z0, 11), ++ z0 = svmsb_z (p0, z1, z0, 11)) ++ ++/* ++** msb_11_s32_z_untied: ++** mov (z[0-9]+\.s), #11 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** msb z0\.s, p0/m, z2\.s, \1 ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** msb z0\.s, p0/m, z1\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** mls z0\.s, p0/m, z1\.s, z2\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_s32_z_untied, svint32_t, ++ z0 = svmsb_n_s32_z (p0, z1, z2, 11), ++ z0 = svmsb_z (p0, z1, z2, 11)) ++ ++/* ++** msb_s32_x_tied1: ++** msb z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (msb_s32_x_tied1, svint32_t, ++ z0 = svmsb_s32_x (p0, 
z0, z1, z2), ++ z0 = svmsb_x (p0, z0, z1, z2)) ++ ++/* ++** msb_s32_x_tied2: ++** msb z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (msb_s32_x_tied2, svint32_t, ++ z0 = svmsb_s32_x (p0, z1, z0, z2), ++ z0 = svmsb_x (p0, z1, z0, z2)) ++ ++/* ++** msb_s32_x_tied3: ++** mls z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (msb_s32_x_tied3, svint32_t, ++ z0 = svmsb_s32_x (p0, z1, z2, z0), ++ z0 = svmsb_x (p0, z1, z2, z0)) ++ ++/* ++** msb_s32_x_untied: ++** ( ++** movprfx z0, z1 ++** msb z0\.s, p0/m, z2\.s, z3\.s ++** | ++** movprfx z0, z2 ++** msb z0\.s, p0/m, z1\.s, z3\.s ++** | ++** movprfx z0, z3 ++** mls z0\.s, p0/m, z1\.s, z2\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (msb_s32_x_untied, svint32_t, ++ z0 = svmsb_s32_x (p0, z1, z2, z3), ++ z0 = svmsb_x (p0, z1, z2, z3)) ++ ++/* ++** msb_w0_s32_x_tied1: ++** mov (z[0-9]+\.s), w0 ++** msb z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_w0_s32_x_tied1, svint32_t, int32_t, ++ z0 = svmsb_n_s32_x (p0, z0, z1, x0), ++ z0 = svmsb_x (p0, z0, z1, x0)) ++ ++/* ++** msb_w0_s32_x_tied2: ++** mov (z[0-9]+\.s), w0 ++** msb z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_w0_s32_x_tied2, svint32_t, int32_t, ++ z0 = svmsb_n_s32_x (p0, z1, z0, x0), ++ z0 = svmsb_x (p0, z1, z0, x0)) ++ ++/* ++** msb_w0_s32_x_untied: ++** mov z0\.s, w0 ++** mls z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_w0_s32_x_untied, svint32_t, int32_t, ++ z0 = svmsb_n_s32_x (p0, z1, z2, x0), ++ z0 = svmsb_x (p0, z1, z2, x0)) ++ ++/* ++** msb_11_s32_x_tied1: ++** mov (z[0-9]+\.s), #11 ++** msb z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_s32_x_tied1, svint32_t, ++ z0 = svmsb_n_s32_x (p0, z0, z1, 11), ++ z0 = svmsb_x (p0, z0, z1, 11)) ++ ++/* ++** msb_11_s32_x_tied2: ++** mov (z[0-9]+\.s), #11 ++** msb z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_s32_x_tied2, svint32_t, ++ z0 = svmsb_n_s32_x (p0, z1, z0, 11), ++ z0 = svmsb_x (p0, z1, z0, 11)) ++ ++/* ++** msb_11_s32_x_untied: ++** mov z0\.s, #11 ++** mls z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_s32_x_untied, svint32_t, ++ z0 = svmsb_n_s32_x (p0, z1, z2, 11), ++ z0 = svmsb_x (p0, z1, z2, 11)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_s64.c +new file mode 100644 +index 000000000..6829fab36 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_s64.c +@@ -0,0 +1,321 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** msb_s64_m_tied1: ++** msb z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (msb_s64_m_tied1, svint64_t, ++ z0 = svmsb_s64_m (p0, z0, z1, z2), ++ z0 = svmsb_m (p0, z0, z1, z2)) ++ ++/* ++** msb_s64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** msb z0\.d, p0/m, \1, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (msb_s64_m_tied2, svint64_t, ++ z0 = svmsb_s64_m (p0, z1, z0, z2), ++ z0 = svmsb_m (p0, z1, z0, z2)) ++ ++/* ++** msb_s64_m_tied3: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** msb z0\.d, p0/m, z2\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_s64_m_tied3, svint64_t, ++ z0 = svmsb_s64_m (p0, z1, z2, z0), ++ z0 = svmsb_m (p0, z1, z2, z0)) ++ ++/* ++** msb_s64_m_untied: ++** movprfx z0, z1 ++** msb z0\.d, p0/m, z2\.d, z3\.d ++** ret ++*/ ++TEST_UNIFORM_Z (msb_s64_m_untied, svint64_t, ++ z0 = svmsb_s64_m (p0, z1, z2, z3), ++ z0 = svmsb_m (p0, z1, z2, z3)) ++ ++/* ++** msb_x0_s64_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** msb z0\.d, p0/m, 
z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_x0_s64_m_tied1, svint64_t, int64_t, ++ z0 = svmsb_n_s64_m (p0, z0, z1, x0), ++ z0 = svmsb_m (p0, z0, z1, x0)) ++ ++/* ++** msb_x0_s64_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** msb z0\.d, p0/m, z2\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_x0_s64_m_untied, svint64_t, int64_t, ++ z0 = svmsb_n_s64_m (p0, z1, z2, x0), ++ z0 = svmsb_m (p0, z1, z2, x0)) ++ ++/* ++** msb_11_s64_m_tied1: ++** mov (z[0-9]+\.d), #11 ++** msb z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_s64_m_tied1, svint64_t, ++ z0 = svmsb_n_s64_m (p0, z0, z1, 11), ++ z0 = svmsb_m (p0, z0, z1, 11)) ++ ++/* ++** msb_11_s64_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #11 ++** movprfx z0, z1 ++** msb z0\.d, p0/m, z2\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_s64_m_untied, svint64_t, ++ z0 = svmsb_n_s64_m (p0, z1, z2, 11), ++ z0 = svmsb_m (p0, z1, z2, 11)) ++ ++/* ++** msb_s64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** msb z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (msb_s64_z_tied1, svint64_t, ++ z0 = svmsb_s64_z (p0, z0, z1, z2), ++ z0 = svmsb_z (p0, z0, z1, z2)) ++ ++/* ++** msb_s64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** msb z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (msb_s64_z_tied2, svint64_t, ++ z0 = svmsb_s64_z (p0, z1, z0, z2), ++ z0 = svmsb_z (p0, z1, z0, z2)) ++ ++/* ++** msb_s64_z_tied3: ++** movprfx z0\.d, p0/z, z0\.d ++** mls z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (msb_s64_z_tied3, svint64_t, ++ z0 = svmsb_s64_z (p0, z1, z2, z0), ++ z0 = svmsb_z (p0, z1, z2, z0)) ++ ++/* ++** msb_s64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** msb z0\.d, p0/m, z2\.d, z3\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** msb z0\.d, p0/m, z1\.d, z3\.d ++** | ++** movprfx z0\.d, p0/z, z3\.d ++** mls z0\.d, p0/m, z1\.d, z2\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (msb_s64_z_untied, svint64_t, ++ z0 = svmsb_s64_z (p0, z1, z2, z3), ++ z0 = svmsb_z (p0, z1, z2, z3)) ++ ++/* ++** msb_x0_s64_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z0\.d ++** msb z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_x0_s64_z_tied1, svint64_t, int64_t, ++ z0 = svmsb_n_s64_z (p0, z0, z1, x0), ++ z0 = svmsb_z (p0, z0, z1, x0)) ++ ++/* ++** msb_x0_s64_z_tied2: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z0\.d ++** msb z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_x0_s64_z_tied2, svint64_t, int64_t, ++ z0 = svmsb_n_s64_z (p0, z1, z0, x0), ++ z0 = svmsb_z (p0, z1, z0, x0)) ++ ++/* ++** msb_x0_s64_z_untied: ++** mov (z[0-9]+\.d), x0 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** msb z0\.d, p0/m, z2\.d, \1 ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** msb z0\.d, p0/m, z1\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** mls z0\.d, p0/m, z1\.d, z2\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_x0_s64_z_untied, svint64_t, int64_t, ++ z0 = svmsb_n_s64_z (p0, z1, z2, x0), ++ z0 = svmsb_z (p0, z1, z2, x0)) ++ ++/* ++** msb_11_s64_z_tied1: ++** mov (z[0-9]+\.d), #11 ++** movprfx z0\.d, p0/z, z0\.d ++** msb z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_s64_z_tied1, svint64_t, ++ z0 = svmsb_n_s64_z (p0, z0, z1, 11), ++ z0 = svmsb_z (p0, z0, z1, 11)) ++ ++/* ++** msb_11_s64_z_tied2: ++** mov (z[0-9]+\.d), #11 ++** movprfx z0\.d, p0/z, z0\.d ++** msb z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_s64_z_tied2, svint64_t, ++ z0 = svmsb_n_s64_z (p0, z1, z0, 11), ++ z0 = svmsb_z (p0, z1, z0, 11)) ++ ++/* ++** msb_11_s64_z_untied: ++** mov (z[0-9]+\.d), #11 ++** ( 
++** movprfx z0\.d, p0/z, z1\.d ++** msb z0\.d, p0/m, z2\.d, \1 ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** msb z0\.d, p0/m, z1\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** mls z0\.d, p0/m, z1\.d, z2\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_s64_z_untied, svint64_t, ++ z0 = svmsb_n_s64_z (p0, z1, z2, 11), ++ z0 = svmsb_z (p0, z1, z2, 11)) ++ ++/* ++** msb_s64_x_tied1: ++** msb z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (msb_s64_x_tied1, svint64_t, ++ z0 = svmsb_s64_x (p0, z0, z1, z2), ++ z0 = svmsb_x (p0, z0, z1, z2)) ++ ++/* ++** msb_s64_x_tied2: ++** msb z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (msb_s64_x_tied2, svint64_t, ++ z0 = svmsb_s64_x (p0, z1, z0, z2), ++ z0 = svmsb_x (p0, z1, z0, z2)) ++ ++/* ++** msb_s64_x_tied3: ++** mls z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (msb_s64_x_tied3, svint64_t, ++ z0 = svmsb_s64_x (p0, z1, z2, z0), ++ z0 = svmsb_x (p0, z1, z2, z0)) ++ ++/* ++** msb_s64_x_untied: ++** ( ++** movprfx z0, z1 ++** msb z0\.d, p0/m, z2\.d, z3\.d ++** | ++** movprfx z0, z2 ++** msb z0\.d, p0/m, z1\.d, z3\.d ++** | ++** movprfx z0, z3 ++** mls z0\.d, p0/m, z1\.d, z2\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (msb_s64_x_untied, svint64_t, ++ z0 = svmsb_s64_x (p0, z1, z2, z3), ++ z0 = svmsb_x (p0, z1, z2, z3)) ++ ++/* ++** msb_x0_s64_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** msb z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_x0_s64_x_tied1, svint64_t, int64_t, ++ z0 = svmsb_n_s64_x (p0, z0, z1, x0), ++ z0 = svmsb_x (p0, z0, z1, x0)) ++ ++/* ++** msb_x0_s64_x_tied2: ++** mov (z[0-9]+\.d), x0 ++** msb z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_x0_s64_x_tied2, svint64_t, int64_t, ++ z0 = svmsb_n_s64_x (p0, z1, z0, x0), ++ z0 = svmsb_x (p0, z1, z0, x0)) ++ ++/* ++** msb_x0_s64_x_untied: ++** mov z0\.d, x0 ++** mls z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_x0_s64_x_untied, svint64_t, int64_t, ++ z0 = svmsb_n_s64_x (p0, z1, z2, x0), ++ z0 = svmsb_x (p0, z1, z2, x0)) ++ ++/* ++** msb_11_s64_x_tied1: ++** mov (z[0-9]+\.d), #11 ++** msb z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_s64_x_tied1, svint64_t, ++ z0 = svmsb_n_s64_x (p0, z0, z1, 11), ++ z0 = svmsb_x (p0, z0, z1, 11)) ++ ++/* ++** msb_11_s64_x_tied2: ++** mov (z[0-9]+\.d), #11 ++** msb z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_s64_x_tied2, svint64_t, ++ z0 = svmsb_n_s64_x (p0, z1, z0, 11), ++ z0 = svmsb_x (p0, z1, z0, 11)) ++ ++/* ++** msb_11_s64_x_untied: ++** mov z0\.d, #11 ++** mls z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_s64_x_untied, svint64_t, ++ z0 = svmsb_n_s64_x (p0, z1, z2, 11), ++ z0 = svmsb_x (p0, z1, z2, 11)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_s8.c +new file mode 100644 +index 000000000..d7fcafdd0 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_s8.c +@@ -0,0 +1,321 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** msb_s8_m_tied1: ++** msb z0\.b, p0/m, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (msb_s8_m_tied1, svint8_t, ++ z0 = svmsb_s8_m (p0, z0, z1, z2), ++ z0 = svmsb_m (p0, z0, z1, z2)) ++ ++/* ++** msb_s8_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** msb z0\.b, p0/m, \1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (msb_s8_m_tied2, svint8_t, ++ z0 = svmsb_s8_m (p0, z1, z0, z2), ++ z0 = svmsb_m (p0, z1, z0, z2)) ++ ++/* ++** msb_s8_m_tied3: ++** mov 
(z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** msb z0\.b, p0/m, z2\.b, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (msb_s8_m_tied3, svint8_t, ++ z0 = svmsb_s8_m (p0, z1, z2, z0), ++ z0 = svmsb_m (p0, z1, z2, z0)) ++ ++/* ++** msb_s8_m_untied: ++** movprfx z0, z1 ++** msb z0\.b, p0/m, z2\.b, z3\.b ++** ret ++*/ ++TEST_UNIFORM_Z (msb_s8_m_untied, svint8_t, ++ z0 = svmsb_s8_m (p0, z1, z2, z3), ++ z0 = svmsb_m (p0, z1, z2, z3)) ++ ++/* ++** msb_w0_s8_m_tied1: ++** mov (z[0-9]+\.b), w0 ++** msb z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_w0_s8_m_tied1, svint8_t, int8_t, ++ z0 = svmsb_n_s8_m (p0, z0, z1, x0), ++ z0 = svmsb_m (p0, z0, z1, x0)) ++ ++/* ++** msb_w0_s8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), w0 ++** movprfx z0, z1 ++** msb z0\.b, p0/m, z2\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_w0_s8_m_untied, svint8_t, int8_t, ++ z0 = svmsb_n_s8_m (p0, z1, z2, x0), ++ z0 = svmsb_m (p0, z1, z2, x0)) ++ ++/* ++** msb_11_s8_m_tied1: ++** mov (z[0-9]+\.b), #11 ++** msb z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_s8_m_tied1, svint8_t, ++ z0 = svmsb_n_s8_m (p0, z0, z1, 11), ++ z0 = svmsb_m (p0, z0, z1, 11)) ++ ++/* ++** msb_11_s8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), #11 ++** movprfx z0, z1 ++** msb z0\.b, p0/m, z2\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_s8_m_untied, svint8_t, ++ z0 = svmsb_n_s8_m (p0, z1, z2, 11), ++ z0 = svmsb_m (p0, z1, z2, 11)) ++ ++/* ++** msb_s8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** msb z0\.b, p0/m, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (msb_s8_z_tied1, svint8_t, ++ z0 = svmsb_s8_z (p0, z0, z1, z2), ++ z0 = svmsb_z (p0, z0, z1, z2)) ++ ++/* ++** msb_s8_z_tied2: ++** movprfx z0\.b, p0/z, z0\.b ++** msb z0\.b, p0/m, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (msb_s8_z_tied2, svint8_t, ++ z0 = svmsb_s8_z (p0, z1, z0, z2), ++ z0 = svmsb_z (p0, z1, z0, z2)) ++ ++/* ++** msb_s8_z_tied3: ++** movprfx z0\.b, p0/z, z0\.b ++** mls z0\.b, p0/m, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (msb_s8_z_tied3, svint8_t, ++ z0 = svmsb_s8_z (p0, z1, z2, z0), ++ z0 = svmsb_z (p0, z1, z2, z0)) ++ ++/* ++** msb_s8_z_untied: ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** msb z0\.b, p0/m, z2\.b, z3\.b ++** | ++** movprfx z0\.b, p0/z, z2\.b ++** msb z0\.b, p0/m, z1\.b, z3\.b ++** | ++** movprfx z0\.b, p0/z, z3\.b ++** mls z0\.b, p0/m, z1\.b, z2\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (msb_s8_z_untied, svint8_t, ++ z0 = svmsb_s8_z (p0, z1, z2, z3), ++ z0 = svmsb_z (p0, z1, z2, z3)) ++ ++/* ++** msb_w0_s8_z_tied1: ++** mov (z[0-9]+\.b), w0 ++** movprfx z0\.b, p0/z, z0\.b ++** msb z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_w0_s8_z_tied1, svint8_t, int8_t, ++ z0 = svmsb_n_s8_z (p0, z0, z1, x0), ++ z0 = svmsb_z (p0, z0, z1, x0)) ++ ++/* ++** msb_w0_s8_z_tied2: ++** mov (z[0-9]+\.b), w0 ++** movprfx z0\.b, p0/z, z0\.b ++** msb z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_w0_s8_z_tied2, svint8_t, int8_t, ++ z0 = svmsb_n_s8_z (p0, z1, z0, x0), ++ z0 = svmsb_z (p0, z1, z0, x0)) ++ ++/* ++** msb_w0_s8_z_untied: ++** mov (z[0-9]+\.b), w0 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** msb z0\.b, p0/m, z2\.b, \1 ++** | ++** movprfx z0\.b, p0/z, z2\.b ++** msb z0\.b, p0/m, z1\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** mls z0\.b, p0/m, z1\.b, z2\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_w0_s8_z_untied, svint8_t, int8_t, ++ z0 = svmsb_n_s8_z (p0, z1, z2, x0), ++ z0 = svmsb_z (p0, z1, z2, x0)) ++ ++/* ++** msb_11_s8_z_tied1: ++** mov (z[0-9]+\.b), #11 ++** movprfx z0\.b, p0/z, z0\.b ++** msb z0\.b, p0/m, z1\.b, \1 
++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_s8_z_tied1, svint8_t, ++ z0 = svmsb_n_s8_z (p0, z0, z1, 11), ++ z0 = svmsb_z (p0, z0, z1, 11)) ++ ++/* ++** msb_11_s8_z_tied2: ++** mov (z[0-9]+\.b), #11 ++** movprfx z0\.b, p0/z, z0\.b ++** msb z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_s8_z_tied2, svint8_t, ++ z0 = svmsb_n_s8_z (p0, z1, z0, 11), ++ z0 = svmsb_z (p0, z1, z0, 11)) ++ ++/* ++** msb_11_s8_z_untied: ++** mov (z[0-9]+\.b), #11 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** msb z0\.b, p0/m, z2\.b, \1 ++** | ++** movprfx z0\.b, p0/z, z2\.b ++** msb z0\.b, p0/m, z1\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** mls z0\.b, p0/m, z1\.b, z2\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_s8_z_untied, svint8_t, ++ z0 = svmsb_n_s8_z (p0, z1, z2, 11), ++ z0 = svmsb_z (p0, z1, z2, 11)) ++ ++/* ++** msb_s8_x_tied1: ++** msb z0\.b, p0/m, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (msb_s8_x_tied1, svint8_t, ++ z0 = svmsb_s8_x (p0, z0, z1, z2), ++ z0 = svmsb_x (p0, z0, z1, z2)) ++ ++/* ++** msb_s8_x_tied2: ++** msb z0\.b, p0/m, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (msb_s8_x_tied2, svint8_t, ++ z0 = svmsb_s8_x (p0, z1, z0, z2), ++ z0 = svmsb_x (p0, z1, z0, z2)) ++ ++/* ++** msb_s8_x_tied3: ++** mls z0\.b, p0/m, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (msb_s8_x_tied3, svint8_t, ++ z0 = svmsb_s8_x (p0, z1, z2, z0), ++ z0 = svmsb_x (p0, z1, z2, z0)) ++ ++/* ++** msb_s8_x_untied: ++** ( ++** movprfx z0, z1 ++** msb z0\.b, p0/m, z2\.b, z3\.b ++** | ++** movprfx z0, z2 ++** msb z0\.b, p0/m, z1\.b, z3\.b ++** | ++** movprfx z0, z3 ++** mls z0\.b, p0/m, z1\.b, z2\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (msb_s8_x_untied, svint8_t, ++ z0 = svmsb_s8_x (p0, z1, z2, z3), ++ z0 = svmsb_x (p0, z1, z2, z3)) ++ ++/* ++** msb_w0_s8_x_tied1: ++** mov (z[0-9]+\.b), w0 ++** msb z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_w0_s8_x_tied1, svint8_t, int8_t, ++ z0 = svmsb_n_s8_x (p0, z0, z1, x0), ++ z0 = svmsb_x (p0, z0, z1, x0)) ++ ++/* ++** msb_w0_s8_x_tied2: ++** mov (z[0-9]+\.b), w0 ++** msb z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_w0_s8_x_tied2, svint8_t, int8_t, ++ z0 = svmsb_n_s8_x (p0, z1, z0, x0), ++ z0 = svmsb_x (p0, z1, z0, x0)) ++ ++/* ++** msb_w0_s8_x_untied: ++** mov z0\.b, w0 ++** mls z0\.b, p0/m, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_w0_s8_x_untied, svint8_t, int8_t, ++ z0 = svmsb_n_s8_x (p0, z1, z2, x0), ++ z0 = svmsb_x (p0, z1, z2, x0)) ++ ++/* ++** msb_11_s8_x_tied1: ++** mov (z[0-9]+\.b), #11 ++** msb z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_s8_x_tied1, svint8_t, ++ z0 = svmsb_n_s8_x (p0, z0, z1, 11), ++ z0 = svmsb_x (p0, z0, z1, 11)) ++ ++/* ++** msb_11_s8_x_tied2: ++** mov (z[0-9]+\.b), #11 ++** msb z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_s8_x_tied2, svint8_t, ++ z0 = svmsb_n_s8_x (p0, z1, z0, 11), ++ z0 = svmsb_x (p0, z1, z0, 11)) ++ ++/* ++** msb_11_s8_x_untied: ++** mov z0\.b, #11 ++** mls z0\.b, p0/m, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_s8_x_untied, svint8_t, ++ z0 = svmsb_n_s8_x (p0, z1, z2, 11), ++ z0 = svmsb_x (p0, z1, z2, 11)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_u16.c +new file mode 100644 +index 000000000..437a96040 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_u16.c +@@ -0,0 +1,321 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** msb_u16_m_tied1: ++** msb z0\.h, p0/m, z1\.h, z2\.h ++** 
ret ++*/ ++TEST_UNIFORM_Z (msb_u16_m_tied1, svuint16_t, ++ z0 = svmsb_u16_m (p0, z0, z1, z2), ++ z0 = svmsb_m (p0, z0, z1, z2)) ++ ++/* ++** msb_u16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** msb z0\.h, p0/m, \1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (msb_u16_m_tied2, svuint16_t, ++ z0 = svmsb_u16_m (p0, z1, z0, z2), ++ z0 = svmsb_m (p0, z1, z0, z2)) ++ ++/* ++** msb_u16_m_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** msb z0\.h, p0/m, z2\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (msb_u16_m_tied3, svuint16_t, ++ z0 = svmsb_u16_m (p0, z1, z2, z0), ++ z0 = svmsb_m (p0, z1, z2, z0)) ++ ++/* ++** msb_u16_m_untied: ++** movprfx z0, z1 ++** msb z0\.h, p0/m, z2\.h, z3\.h ++** ret ++*/ ++TEST_UNIFORM_Z (msb_u16_m_untied, svuint16_t, ++ z0 = svmsb_u16_m (p0, z1, z2, z3), ++ z0 = svmsb_m (p0, z1, z2, z3)) ++ ++/* ++** msb_w0_u16_m_tied1: ++** mov (z[0-9]+\.h), w0 ++** msb z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_w0_u16_m_tied1, svuint16_t, uint16_t, ++ z0 = svmsb_n_u16_m (p0, z0, z1, x0), ++ z0 = svmsb_m (p0, z0, z1, x0)) ++ ++/* ++** msb_w0_u16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), w0 ++** movprfx z0, z1 ++** msb z0\.h, p0/m, z2\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_w0_u16_m_untied, svuint16_t, uint16_t, ++ z0 = svmsb_n_u16_m (p0, z1, z2, x0), ++ z0 = svmsb_m (p0, z1, z2, x0)) ++ ++/* ++** msb_11_u16_m_tied1: ++** mov (z[0-9]+\.h), #11 ++** msb z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_u16_m_tied1, svuint16_t, ++ z0 = svmsb_n_u16_m (p0, z0, z1, 11), ++ z0 = svmsb_m (p0, z0, z1, 11)) ++ ++/* ++** msb_11_u16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), #11 ++** movprfx z0, z1 ++** msb z0\.h, p0/m, z2\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_u16_m_untied, svuint16_t, ++ z0 = svmsb_n_u16_m (p0, z1, z2, 11), ++ z0 = svmsb_m (p0, z1, z2, 11)) ++ ++/* ++** msb_u16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** msb z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (msb_u16_z_tied1, svuint16_t, ++ z0 = svmsb_u16_z (p0, z0, z1, z2), ++ z0 = svmsb_z (p0, z0, z1, z2)) ++ ++/* ++** msb_u16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** msb z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (msb_u16_z_tied2, svuint16_t, ++ z0 = svmsb_u16_z (p0, z1, z0, z2), ++ z0 = svmsb_z (p0, z1, z0, z2)) ++ ++/* ++** msb_u16_z_tied3: ++** movprfx z0\.h, p0/z, z0\.h ++** mls z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (msb_u16_z_tied3, svuint16_t, ++ z0 = svmsb_u16_z (p0, z1, z2, z0), ++ z0 = svmsb_z (p0, z1, z2, z0)) ++ ++/* ++** msb_u16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** msb z0\.h, p0/m, z2\.h, z3\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** msb z0\.h, p0/m, z1\.h, z3\.h ++** | ++** movprfx z0\.h, p0/z, z3\.h ++** mls z0\.h, p0/m, z1\.h, z2\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (msb_u16_z_untied, svuint16_t, ++ z0 = svmsb_u16_z (p0, z1, z2, z3), ++ z0 = svmsb_z (p0, z1, z2, z3)) ++ ++/* ++** msb_w0_u16_z_tied1: ++** mov (z[0-9]+\.h), w0 ++** movprfx z0\.h, p0/z, z0\.h ++** msb z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_w0_u16_z_tied1, svuint16_t, uint16_t, ++ z0 = svmsb_n_u16_z (p0, z0, z1, x0), ++ z0 = svmsb_z (p0, z0, z1, x0)) ++ ++/* ++** msb_w0_u16_z_tied2: ++** mov (z[0-9]+\.h), w0 ++** movprfx z0\.h, p0/z, z0\.h ++** msb z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_w0_u16_z_tied2, svuint16_t, uint16_t, ++ z0 = svmsb_n_u16_z (p0, z1, z0, x0), ++ z0 = svmsb_z (p0, z1, z0, x0)) ++ ++/* ++** msb_w0_u16_z_untied: ++** mov (z[0-9]+\.h), w0 ++** ( 
++** movprfx z0\.h, p0/z, z1\.h ++** msb z0\.h, p0/m, z2\.h, \1 ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** msb z0\.h, p0/m, z1\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** mls z0\.h, p0/m, z1\.h, z2\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_w0_u16_z_untied, svuint16_t, uint16_t, ++ z0 = svmsb_n_u16_z (p0, z1, z2, x0), ++ z0 = svmsb_z (p0, z1, z2, x0)) ++ ++/* ++** msb_11_u16_z_tied1: ++** mov (z[0-9]+\.h), #11 ++** movprfx z0\.h, p0/z, z0\.h ++** msb z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_u16_z_tied1, svuint16_t, ++ z0 = svmsb_n_u16_z (p0, z0, z1, 11), ++ z0 = svmsb_z (p0, z0, z1, 11)) ++ ++/* ++** msb_11_u16_z_tied2: ++** mov (z[0-9]+\.h), #11 ++** movprfx z0\.h, p0/z, z0\.h ++** msb z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_u16_z_tied2, svuint16_t, ++ z0 = svmsb_n_u16_z (p0, z1, z0, 11), ++ z0 = svmsb_z (p0, z1, z0, 11)) ++ ++/* ++** msb_11_u16_z_untied: ++** mov (z[0-9]+\.h), #11 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** msb z0\.h, p0/m, z2\.h, \1 ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** msb z0\.h, p0/m, z1\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** mls z0\.h, p0/m, z1\.h, z2\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_u16_z_untied, svuint16_t, ++ z0 = svmsb_n_u16_z (p0, z1, z2, 11), ++ z0 = svmsb_z (p0, z1, z2, 11)) ++ ++/* ++** msb_u16_x_tied1: ++** msb z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (msb_u16_x_tied1, svuint16_t, ++ z0 = svmsb_u16_x (p0, z0, z1, z2), ++ z0 = svmsb_x (p0, z0, z1, z2)) ++ ++/* ++** msb_u16_x_tied2: ++** msb z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (msb_u16_x_tied2, svuint16_t, ++ z0 = svmsb_u16_x (p0, z1, z0, z2), ++ z0 = svmsb_x (p0, z1, z0, z2)) ++ ++/* ++** msb_u16_x_tied3: ++** mls z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (msb_u16_x_tied3, svuint16_t, ++ z0 = svmsb_u16_x (p0, z1, z2, z0), ++ z0 = svmsb_x (p0, z1, z2, z0)) ++ ++/* ++** msb_u16_x_untied: ++** ( ++** movprfx z0, z1 ++** msb z0\.h, p0/m, z2\.h, z3\.h ++** | ++** movprfx z0, z2 ++** msb z0\.h, p0/m, z1\.h, z3\.h ++** | ++** movprfx z0, z3 ++** mls z0\.h, p0/m, z1\.h, z2\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (msb_u16_x_untied, svuint16_t, ++ z0 = svmsb_u16_x (p0, z1, z2, z3), ++ z0 = svmsb_x (p0, z1, z2, z3)) ++ ++/* ++** msb_w0_u16_x_tied1: ++** mov (z[0-9]+\.h), w0 ++** msb z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_w0_u16_x_tied1, svuint16_t, uint16_t, ++ z0 = svmsb_n_u16_x (p0, z0, z1, x0), ++ z0 = svmsb_x (p0, z0, z1, x0)) ++ ++/* ++** msb_w0_u16_x_tied2: ++** mov (z[0-9]+\.h), w0 ++** msb z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_w0_u16_x_tied2, svuint16_t, uint16_t, ++ z0 = svmsb_n_u16_x (p0, z1, z0, x0), ++ z0 = svmsb_x (p0, z1, z0, x0)) ++ ++/* ++** msb_w0_u16_x_untied: ++** mov z0\.h, w0 ++** mls z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_w0_u16_x_untied, svuint16_t, uint16_t, ++ z0 = svmsb_n_u16_x (p0, z1, z2, x0), ++ z0 = svmsb_x (p0, z1, z2, x0)) ++ ++/* ++** msb_11_u16_x_tied1: ++** mov (z[0-9]+\.h), #11 ++** msb z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_u16_x_tied1, svuint16_t, ++ z0 = svmsb_n_u16_x (p0, z0, z1, 11), ++ z0 = svmsb_x (p0, z0, z1, 11)) ++ ++/* ++** msb_11_u16_x_tied2: ++** mov (z[0-9]+\.h), #11 ++** msb z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_u16_x_tied2, svuint16_t, ++ z0 = svmsb_n_u16_x (p0, z1, z0, 11), ++ z0 = svmsb_x (p0, z1, z0, 11)) ++ ++/* ++** msb_11_u16_x_untied: ++** mov z0\.h, #11 ++** mls z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ 
++TEST_UNIFORM_Z (msb_11_u16_x_untied, svuint16_t, ++ z0 = svmsb_n_u16_x (p0, z1, z2, 11), ++ z0 = svmsb_x (p0, z1, z2, 11)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_u32.c +new file mode 100644 +index 000000000..aaaf0344a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_u32.c +@@ -0,0 +1,321 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** msb_u32_m_tied1: ++** msb z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (msb_u32_m_tied1, svuint32_t, ++ z0 = svmsb_u32_m (p0, z0, z1, z2), ++ z0 = svmsb_m (p0, z0, z1, z2)) ++ ++/* ++** msb_u32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** msb z0\.s, p0/m, \1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (msb_u32_m_tied2, svuint32_t, ++ z0 = svmsb_u32_m (p0, z1, z0, z2), ++ z0 = svmsb_m (p0, z1, z0, z2)) ++ ++/* ++** msb_u32_m_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** msb z0\.s, p0/m, z2\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (msb_u32_m_tied3, svuint32_t, ++ z0 = svmsb_u32_m (p0, z1, z2, z0), ++ z0 = svmsb_m (p0, z1, z2, z0)) ++ ++/* ++** msb_u32_m_untied: ++** movprfx z0, z1 ++** msb z0\.s, p0/m, z2\.s, z3\.s ++** ret ++*/ ++TEST_UNIFORM_Z (msb_u32_m_untied, svuint32_t, ++ z0 = svmsb_u32_m (p0, z1, z2, z3), ++ z0 = svmsb_m (p0, z1, z2, z3)) ++ ++/* ++** msb_w0_u32_m_tied1: ++** mov (z[0-9]+\.s), w0 ++** msb z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_w0_u32_m_tied1, svuint32_t, uint32_t, ++ z0 = svmsb_n_u32_m (p0, z0, z1, x0), ++ z0 = svmsb_m (p0, z0, z1, x0)) ++ ++/* ++** msb_w0_u32_m_untied: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0, z1 ++** msb z0\.s, p0/m, z2\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_w0_u32_m_untied, svuint32_t, uint32_t, ++ z0 = svmsb_n_u32_m (p0, z1, z2, x0), ++ z0 = svmsb_m (p0, z1, z2, x0)) ++ ++/* ++** msb_11_u32_m_tied1: ++** mov (z[0-9]+\.s), #11 ++** msb z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_u32_m_tied1, svuint32_t, ++ z0 = svmsb_n_u32_m (p0, z0, z1, 11), ++ z0 = svmsb_m (p0, z0, z1, 11)) ++ ++/* ++** msb_11_u32_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.s), #11 ++** movprfx z0, z1 ++** msb z0\.s, p0/m, z2\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_u32_m_untied, svuint32_t, ++ z0 = svmsb_n_u32_m (p0, z1, z2, 11), ++ z0 = svmsb_m (p0, z1, z2, 11)) ++ ++/* ++** msb_u32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** msb z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (msb_u32_z_tied1, svuint32_t, ++ z0 = svmsb_u32_z (p0, z0, z1, z2), ++ z0 = svmsb_z (p0, z0, z1, z2)) ++ ++/* ++** msb_u32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** msb z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (msb_u32_z_tied2, svuint32_t, ++ z0 = svmsb_u32_z (p0, z1, z0, z2), ++ z0 = svmsb_z (p0, z1, z0, z2)) ++ ++/* ++** msb_u32_z_tied3: ++** movprfx z0\.s, p0/z, z0\.s ++** mls z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (msb_u32_z_tied3, svuint32_t, ++ z0 = svmsb_u32_z (p0, z1, z2, z0), ++ z0 = svmsb_z (p0, z1, z2, z0)) ++ ++/* ++** msb_u32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** msb z0\.s, p0/m, z2\.s, z3\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** msb z0\.s, p0/m, z1\.s, z3\.s ++** | ++** movprfx z0\.s, p0/z, z3\.s ++** mls z0\.s, p0/m, z1\.s, z2\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (msb_u32_z_untied, svuint32_t, ++ z0 = svmsb_u32_z (p0, z1, z2, z3), ++ z0 = svmsb_z (p0, z1, z2, z3)) ++ ++/* ++** msb_w0_u32_z_tied1: ++** mov (z[0-9]+\.s), 
w0 ++** movprfx z0\.s, p0/z, z0\.s ++** msb z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_w0_u32_z_tied1, svuint32_t, uint32_t, ++ z0 = svmsb_n_u32_z (p0, z0, z1, x0), ++ z0 = svmsb_z (p0, z0, z1, x0)) ++ ++/* ++** msb_w0_u32_z_tied2: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0\.s, p0/z, z0\.s ++** msb z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_w0_u32_z_tied2, svuint32_t, uint32_t, ++ z0 = svmsb_n_u32_z (p0, z1, z0, x0), ++ z0 = svmsb_z (p0, z1, z0, x0)) ++ ++/* ++** msb_w0_u32_z_untied: ++** mov (z[0-9]+\.s), w0 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** msb z0\.s, p0/m, z2\.s, \1 ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** msb z0\.s, p0/m, z1\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** mls z0\.s, p0/m, z1\.s, z2\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_w0_u32_z_untied, svuint32_t, uint32_t, ++ z0 = svmsb_n_u32_z (p0, z1, z2, x0), ++ z0 = svmsb_z (p0, z1, z2, x0)) ++ ++/* ++** msb_11_u32_z_tied1: ++** mov (z[0-9]+\.s), #11 ++** movprfx z0\.s, p0/z, z0\.s ++** msb z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_u32_z_tied1, svuint32_t, ++ z0 = svmsb_n_u32_z (p0, z0, z1, 11), ++ z0 = svmsb_z (p0, z0, z1, 11)) ++ ++/* ++** msb_11_u32_z_tied2: ++** mov (z[0-9]+\.s), #11 ++** movprfx z0\.s, p0/z, z0\.s ++** msb z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_u32_z_tied2, svuint32_t, ++ z0 = svmsb_n_u32_z (p0, z1, z0, 11), ++ z0 = svmsb_z (p0, z1, z0, 11)) ++ ++/* ++** msb_11_u32_z_untied: ++** mov (z[0-9]+\.s), #11 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** msb z0\.s, p0/m, z2\.s, \1 ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** msb z0\.s, p0/m, z1\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** mls z0\.s, p0/m, z1\.s, z2\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_u32_z_untied, svuint32_t, ++ z0 = svmsb_n_u32_z (p0, z1, z2, 11), ++ z0 = svmsb_z (p0, z1, z2, 11)) ++ ++/* ++** msb_u32_x_tied1: ++** msb z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (msb_u32_x_tied1, svuint32_t, ++ z0 = svmsb_u32_x (p0, z0, z1, z2), ++ z0 = svmsb_x (p0, z0, z1, z2)) ++ ++/* ++** msb_u32_x_tied2: ++** msb z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (msb_u32_x_tied2, svuint32_t, ++ z0 = svmsb_u32_x (p0, z1, z0, z2), ++ z0 = svmsb_x (p0, z1, z0, z2)) ++ ++/* ++** msb_u32_x_tied3: ++** mls z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (msb_u32_x_tied3, svuint32_t, ++ z0 = svmsb_u32_x (p0, z1, z2, z0), ++ z0 = svmsb_x (p0, z1, z2, z0)) ++ ++/* ++** msb_u32_x_untied: ++** ( ++** movprfx z0, z1 ++** msb z0\.s, p0/m, z2\.s, z3\.s ++** | ++** movprfx z0, z2 ++** msb z0\.s, p0/m, z1\.s, z3\.s ++** | ++** movprfx z0, z3 ++** mls z0\.s, p0/m, z1\.s, z2\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (msb_u32_x_untied, svuint32_t, ++ z0 = svmsb_u32_x (p0, z1, z2, z3), ++ z0 = svmsb_x (p0, z1, z2, z3)) ++ ++/* ++** msb_w0_u32_x_tied1: ++** mov (z[0-9]+\.s), w0 ++** msb z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_w0_u32_x_tied1, svuint32_t, uint32_t, ++ z0 = svmsb_n_u32_x (p0, z0, z1, x0), ++ z0 = svmsb_x (p0, z0, z1, x0)) ++ ++/* ++** msb_w0_u32_x_tied2: ++** mov (z[0-9]+\.s), w0 ++** msb z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_w0_u32_x_tied2, svuint32_t, uint32_t, ++ z0 = svmsb_n_u32_x (p0, z1, z0, x0), ++ z0 = svmsb_x (p0, z1, z0, x0)) ++ ++/* ++** msb_w0_u32_x_untied: ++** mov z0\.s, w0 ++** mls z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_w0_u32_x_untied, svuint32_t, uint32_t, ++ z0 = svmsb_n_u32_x (p0, z1, z2, x0), ++ z0 = svmsb_x (p0, z1, z2, x0)) ++ ++/* ++** 
msb_11_u32_x_tied1: ++** mov (z[0-9]+\.s), #11 ++** msb z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_u32_x_tied1, svuint32_t, ++ z0 = svmsb_n_u32_x (p0, z0, z1, 11), ++ z0 = svmsb_x (p0, z0, z1, 11)) ++ ++/* ++** msb_11_u32_x_tied2: ++** mov (z[0-9]+\.s), #11 ++** msb z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_u32_x_tied2, svuint32_t, ++ z0 = svmsb_n_u32_x (p0, z1, z0, 11), ++ z0 = svmsb_x (p0, z1, z0, 11)) ++ ++/* ++** msb_11_u32_x_untied: ++** mov z0\.s, #11 ++** mls z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_u32_x_untied, svuint32_t, ++ z0 = svmsb_n_u32_x (p0, z1, z2, 11), ++ z0 = svmsb_x (p0, z1, z2, 11)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_u64.c +new file mode 100644 +index 000000000..5c5d33073 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_u64.c +@@ -0,0 +1,321 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** msb_u64_m_tied1: ++** msb z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (msb_u64_m_tied1, svuint64_t, ++ z0 = svmsb_u64_m (p0, z0, z1, z2), ++ z0 = svmsb_m (p0, z0, z1, z2)) ++ ++/* ++** msb_u64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** msb z0\.d, p0/m, \1, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (msb_u64_m_tied2, svuint64_t, ++ z0 = svmsb_u64_m (p0, z1, z0, z2), ++ z0 = svmsb_m (p0, z1, z0, z2)) ++ ++/* ++** msb_u64_m_tied3: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** msb z0\.d, p0/m, z2\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_u64_m_tied3, svuint64_t, ++ z0 = svmsb_u64_m (p0, z1, z2, z0), ++ z0 = svmsb_m (p0, z1, z2, z0)) ++ ++/* ++** msb_u64_m_untied: ++** movprfx z0, z1 ++** msb z0\.d, p0/m, z2\.d, z3\.d ++** ret ++*/ ++TEST_UNIFORM_Z (msb_u64_m_untied, svuint64_t, ++ z0 = svmsb_u64_m (p0, z1, z2, z3), ++ z0 = svmsb_m (p0, z1, z2, z3)) ++ ++/* ++** msb_x0_u64_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** msb z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_x0_u64_m_tied1, svuint64_t, uint64_t, ++ z0 = svmsb_n_u64_m (p0, z0, z1, x0), ++ z0 = svmsb_m (p0, z0, z1, x0)) ++ ++/* ++** msb_x0_u64_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** msb z0\.d, p0/m, z2\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_x0_u64_m_untied, svuint64_t, uint64_t, ++ z0 = svmsb_n_u64_m (p0, z1, z2, x0), ++ z0 = svmsb_m (p0, z1, z2, x0)) ++ ++/* ++** msb_11_u64_m_tied1: ++** mov (z[0-9]+\.d), #11 ++** msb z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_u64_m_tied1, svuint64_t, ++ z0 = svmsb_n_u64_m (p0, z0, z1, 11), ++ z0 = svmsb_m (p0, z0, z1, 11)) ++ ++/* ++** msb_11_u64_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #11 ++** movprfx z0, z1 ++** msb z0\.d, p0/m, z2\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_u64_m_untied, svuint64_t, ++ z0 = svmsb_n_u64_m (p0, z1, z2, 11), ++ z0 = svmsb_m (p0, z1, z2, 11)) ++ ++/* ++** msb_u64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** msb z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (msb_u64_z_tied1, svuint64_t, ++ z0 = svmsb_u64_z (p0, z0, z1, z2), ++ z0 = svmsb_z (p0, z0, z1, z2)) ++ ++/* ++** msb_u64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** msb z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (msb_u64_z_tied2, svuint64_t, ++ z0 = svmsb_u64_z (p0, z1, z0, z2), ++ z0 = svmsb_z (p0, z1, z0, z2)) ++ ++/* ++** msb_u64_z_tied3: ++** movprfx z0\.d, p0/z, z0\.d ++** mls z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z 
(msb_u64_z_tied3, svuint64_t, ++ z0 = svmsb_u64_z (p0, z1, z2, z0), ++ z0 = svmsb_z (p0, z1, z2, z0)) ++ ++/* ++** msb_u64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** msb z0\.d, p0/m, z2\.d, z3\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** msb z0\.d, p0/m, z1\.d, z3\.d ++** | ++** movprfx z0\.d, p0/z, z3\.d ++** mls z0\.d, p0/m, z1\.d, z2\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (msb_u64_z_untied, svuint64_t, ++ z0 = svmsb_u64_z (p0, z1, z2, z3), ++ z0 = svmsb_z (p0, z1, z2, z3)) ++ ++/* ++** msb_x0_u64_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z0\.d ++** msb z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_x0_u64_z_tied1, svuint64_t, uint64_t, ++ z0 = svmsb_n_u64_z (p0, z0, z1, x0), ++ z0 = svmsb_z (p0, z0, z1, x0)) ++ ++/* ++** msb_x0_u64_z_tied2: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z0\.d ++** msb z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_x0_u64_z_tied2, svuint64_t, uint64_t, ++ z0 = svmsb_n_u64_z (p0, z1, z0, x0), ++ z0 = svmsb_z (p0, z1, z0, x0)) ++ ++/* ++** msb_x0_u64_z_untied: ++** mov (z[0-9]+\.d), x0 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** msb z0\.d, p0/m, z2\.d, \1 ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** msb z0\.d, p0/m, z1\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** mls z0\.d, p0/m, z1\.d, z2\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_x0_u64_z_untied, svuint64_t, uint64_t, ++ z0 = svmsb_n_u64_z (p0, z1, z2, x0), ++ z0 = svmsb_z (p0, z1, z2, x0)) ++ ++/* ++** msb_11_u64_z_tied1: ++** mov (z[0-9]+\.d), #11 ++** movprfx z0\.d, p0/z, z0\.d ++** msb z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_u64_z_tied1, svuint64_t, ++ z0 = svmsb_n_u64_z (p0, z0, z1, 11), ++ z0 = svmsb_z (p0, z0, z1, 11)) ++ ++/* ++** msb_11_u64_z_tied2: ++** mov (z[0-9]+\.d), #11 ++** movprfx z0\.d, p0/z, z0\.d ++** msb z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_u64_z_tied2, svuint64_t, ++ z0 = svmsb_n_u64_z (p0, z1, z0, 11), ++ z0 = svmsb_z (p0, z1, z0, 11)) ++ ++/* ++** msb_11_u64_z_untied: ++** mov (z[0-9]+\.d), #11 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** msb z0\.d, p0/m, z2\.d, \1 ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** msb z0\.d, p0/m, z1\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** mls z0\.d, p0/m, z1\.d, z2\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_u64_z_untied, svuint64_t, ++ z0 = svmsb_n_u64_z (p0, z1, z2, 11), ++ z0 = svmsb_z (p0, z1, z2, 11)) ++ ++/* ++** msb_u64_x_tied1: ++** msb z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (msb_u64_x_tied1, svuint64_t, ++ z0 = svmsb_u64_x (p0, z0, z1, z2), ++ z0 = svmsb_x (p0, z0, z1, z2)) ++ ++/* ++** msb_u64_x_tied2: ++** msb z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (msb_u64_x_tied2, svuint64_t, ++ z0 = svmsb_u64_x (p0, z1, z0, z2), ++ z0 = svmsb_x (p0, z1, z0, z2)) ++ ++/* ++** msb_u64_x_tied3: ++** mls z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (msb_u64_x_tied3, svuint64_t, ++ z0 = svmsb_u64_x (p0, z1, z2, z0), ++ z0 = svmsb_x (p0, z1, z2, z0)) ++ ++/* ++** msb_u64_x_untied: ++** ( ++** movprfx z0, z1 ++** msb z0\.d, p0/m, z2\.d, z3\.d ++** | ++** movprfx z0, z2 ++** msb z0\.d, p0/m, z1\.d, z3\.d ++** | ++** movprfx z0, z3 ++** mls z0\.d, p0/m, z1\.d, z2\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (msb_u64_x_untied, svuint64_t, ++ z0 = svmsb_u64_x (p0, z1, z2, z3), ++ z0 = svmsb_x (p0, z1, z2, z3)) ++ ++/* ++** msb_x0_u64_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** msb z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_x0_u64_x_tied1, svuint64_t, uint64_t, ++ z0 = svmsb_n_u64_x 
(p0, z0, z1, x0), ++ z0 = svmsb_x (p0, z0, z1, x0)) ++ ++/* ++** msb_x0_u64_x_tied2: ++** mov (z[0-9]+\.d), x0 ++** msb z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_x0_u64_x_tied2, svuint64_t, uint64_t, ++ z0 = svmsb_n_u64_x (p0, z1, z0, x0), ++ z0 = svmsb_x (p0, z1, z0, x0)) ++ ++/* ++** msb_x0_u64_x_untied: ++** mov z0\.d, x0 ++** mls z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_x0_u64_x_untied, svuint64_t, uint64_t, ++ z0 = svmsb_n_u64_x (p0, z1, z2, x0), ++ z0 = svmsb_x (p0, z1, z2, x0)) ++ ++/* ++** msb_11_u64_x_tied1: ++** mov (z[0-9]+\.d), #11 ++** msb z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_u64_x_tied1, svuint64_t, ++ z0 = svmsb_n_u64_x (p0, z0, z1, 11), ++ z0 = svmsb_x (p0, z0, z1, 11)) ++ ++/* ++** msb_11_u64_x_tied2: ++** mov (z[0-9]+\.d), #11 ++** msb z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_u64_x_tied2, svuint64_t, ++ z0 = svmsb_n_u64_x (p0, z1, z0, 11), ++ z0 = svmsb_x (p0, z1, z0, 11)) ++ ++/* ++** msb_11_u64_x_untied: ++** mov z0\.d, #11 ++** mls z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_u64_x_untied, svuint64_t, ++ z0 = svmsb_n_u64_x (p0, z1, z2, 11), ++ z0 = svmsb_x (p0, z1, z2, 11)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_u8.c +new file mode 100644 +index 000000000..5665ec9e3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_u8.c +@@ -0,0 +1,321 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** msb_u8_m_tied1: ++** msb z0\.b, p0/m, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (msb_u8_m_tied1, svuint8_t, ++ z0 = svmsb_u8_m (p0, z0, z1, z2), ++ z0 = svmsb_m (p0, z0, z1, z2)) ++ ++/* ++** msb_u8_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** msb z0\.b, p0/m, \1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (msb_u8_m_tied2, svuint8_t, ++ z0 = svmsb_u8_m (p0, z1, z0, z2), ++ z0 = svmsb_m (p0, z1, z0, z2)) ++ ++/* ++** msb_u8_m_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** msb z0\.b, p0/m, z2\.b, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (msb_u8_m_tied3, svuint8_t, ++ z0 = svmsb_u8_m (p0, z1, z2, z0), ++ z0 = svmsb_m (p0, z1, z2, z0)) ++ ++/* ++** msb_u8_m_untied: ++** movprfx z0, z1 ++** msb z0\.b, p0/m, z2\.b, z3\.b ++** ret ++*/ ++TEST_UNIFORM_Z (msb_u8_m_untied, svuint8_t, ++ z0 = svmsb_u8_m (p0, z1, z2, z3), ++ z0 = svmsb_m (p0, z1, z2, z3)) ++ ++/* ++** msb_w0_u8_m_tied1: ++** mov (z[0-9]+\.b), w0 ++** msb z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_w0_u8_m_tied1, svuint8_t, uint8_t, ++ z0 = svmsb_n_u8_m (p0, z0, z1, x0), ++ z0 = svmsb_m (p0, z0, z1, x0)) ++ ++/* ++** msb_w0_u8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), w0 ++** movprfx z0, z1 ++** msb z0\.b, p0/m, z2\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_w0_u8_m_untied, svuint8_t, uint8_t, ++ z0 = svmsb_n_u8_m (p0, z1, z2, x0), ++ z0 = svmsb_m (p0, z1, z2, x0)) ++ ++/* ++** msb_11_u8_m_tied1: ++** mov (z[0-9]+\.b), #11 ++** msb z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_u8_m_tied1, svuint8_t, ++ z0 = svmsb_n_u8_m (p0, z0, z1, 11), ++ z0 = svmsb_m (p0, z0, z1, 11)) ++ ++/* ++** msb_11_u8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), #11 ++** movprfx z0, z1 ++** msb z0\.b, p0/m, z2\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_u8_m_untied, svuint8_t, ++ z0 = svmsb_n_u8_m (p0, z1, z2, 11), ++ z0 = svmsb_m (p0, z1, z2, 11)) ++ ++/* ++** msb_u8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** msb 
z0\.b, p0/m, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (msb_u8_z_tied1, svuint8_t, ++ z0 = svmsb_u8_z (p0, z0, z1, z2), ++ z0 = svmsb_z (p0, z0, z1, z2)) ++ ++/* ++** msb_u8_z_tied2: ++** movprfx z0\.b, p0/z, z0\.b ++** msb z0\.b, p0/m, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (msb_u8_z_tied2, svuint8_t, ++ z0 = svmsb_u8_z (p0, z1, z0, z2), ++ z0 = svmsb_z (p0, z1, z0, z2)) ++ ++/* ++** msb_u8_z_tied3: ++** movprfx z0\.b, p0/z, z0\.b ++** mls z0\.b, p0/m, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (msb_u8_z_tied3, svuint8_t, ++ z0 = svmsb_u8_z (p0, z1, z2, z0), ++ z0 = svmsb_z (p0, z1, z2, z0)) ++ ++/* ++** msb_u8_z_untied: ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** msb z0\.b, p0/m, z2\.b, z3\.b ++** | ++** movprfx z0\.b, p0/z, z2\.b ++** msb z0\.b, p0/m, z1\.b, z3\.b ++** | ++** movprfx z0\.b, p0/z, z3\.b ++** mls z0\.b, p0/m, z1\.b, z2\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (msb_u8_z_untied, svuint8_t, ++ z0 = svmsb_u8_z (p0, z1, z2, z3), ++ z0 = svmsb_z (p0, z1, z2, z3)) ++ ++/* ++** msb_w0_u8_z_tied1: ++** mov (z[0-9]+\.b), w0 ++** movprfx z0\.b, p0/z, z0\.b ++** msb z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_w0_u8_z_tied1, svuint8_t, uint8_t, ++ z0 = svmsb_n_u8_z (p0, z0, z1, x0), ++ z0 = svmsb_z (p0, z0, z1, x0)) ++ ++/* ++** msb_w0_u8_z_tied2: ++** mov (z[0-9]+\.b), w0 ++** movprfx z0\.b, p0/z, z0\.b ++** msb z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_w0_u8_z_tied2, svuint8_t, uint8_t, ++ z0 = svmsb_n_u8_z (p0, z1, z0, x0), ++ z0 = svmsb_z (p0, z1, z0, x0)) ++ ++/* ++** msb_w0_u8_z_untied: ++** mov (z[0-9]+\.b), w0 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** msb z0\.b, p0/m, z2\.b, \1 ++** | ++** movprfx z0\.b, p0/z, z2\.b ++** msb z0\.b, p0/m, z1\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** mls z0\.b, p0/m, z1\.b, z2\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_w0_u8_z_untied, svuint8_t, uint8_t, ++ z0 = svmsb_n_u8_z (p0, z1, z2, x0), ++ z0 = svmsb_z (p0, z1, z2, x0)) ++ ++/* ++** msb_11_u8_z_tied1: ++** mov (z[0-9]+\.b), #11 ++** movprfx z0\.b, p0/z, z0\.b ++** msb z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_u8_z_tied1, svuint8_t, ++ z0 = svmsb_n_u8_z (p0, z0, z1, 11), ++ z0 = svmsb_z (p0, z0, z1, 11)) ++ ++/* ++** msb_11_u8_z_tied2: ++** mov (z[0-9]+\.b), #11 ++** movprfx z0\.b, p0/z, z0\.b ++** msb z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_u8_z_tied2, svuint8_t, ++ z0 = svmsb_n_u8_z (p0, z1, z0, 11), ++ z0 = svmsb_z (p0, z1, z0, 11)) ++ ++/* ++** msb_11_u8_z_untied: ++** mov (z[0-9]+\.b), #11 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** msb z0\.b, p0/m, z2\.b, \1 ++** | ++** movprfx z0\.b, p0/z, z2\.b ++** msb z0\.b, p0/m, z1\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** mls z0\.b, p0/m, z1\.b, z2\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_u8_z_untied, svuint8_t, ++ z0 = svmsb_n_u8_z (p0, z1, z2, 11), ++ z0 = svmsb_z (p0, z1, z2, 11)) ++ ++/* ++** msb_u8_x_tied1: ++** msb z0\.b, p0/m, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (msb_u8_x_tied1, svuint8_t, ++ z0 = svmsb_u8_x (p0, z0, z1, z2), ++ z0 = svmsb_x (p0, z0, z1, z2)) ++ ++/* ++** msb_u8_x_tied2: ++** msb z0\.b, p0/m, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (msb_u8_x_tied2, svuint8_t, ++ z0 = svmsb_u8_x (p0, z1, z0, z2), ++ z0 = svmsb_x (p0, z1, z0, z2)) ++ ++/* ++** msb_u8_x_tied3: ++** mls z0\.b, p0/m, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (msb_u8_x_tied3, svuint8_t, ++ z0 = svmsb_u8_x (p0, z1, z2, z0), ++ z0 = svmsb_x (p0, z1, z2, z0)) ++ ++/* ++** msb_u8_x_untied: ++** ( ++** movprfx z0, z1 ++** msb z0\.b, p0/m, 
z2\.b, z3\.b ++** | ++** movprfx z0, z2 ++** msb z0\.b, p0/m, z1\.b, z3\.b ++** | ++** movprfx z0, z3 ++** mls z0\.b, p0/m, z1\.b, z2\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (msb_u8_x_untied, svuint8_t, ++ z0 = svmsb_u8_x (p0, z1, z2, z3), ++ z0 = svmsb_x (p0, z1, z2, z3)) ++ ++/* ++** msb_w0_u8_x_tied1: ++** mov (z[0-9]+\.b), w0 ++** msb z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_w0_u8_x_tied1, svuint8_t, uint8_t, ++ z0 = svmsb_n_u8_x (p0, z0, z1, x0), ++ z0 = svmsb_x (p0, z0, z1, x0)) ++ ++/* ++** msb_w0_u8_x_tied2: ++** mov (z[0-9]+\.b), w0 ++** msb z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_w0_u8_x_tied2, svuint8_t, uint8_t, ++ z0 = svmsb_n_u8_x (p0, z1, z0, x0), ++ z0 = svmsb_x (p0, z1, z0, x0)) ++ ++/* ++** msb_w0_u8_x_untied: ++** mov z0\.b, w0 ++** mls z0\.b, p0/m, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_ZX (msb_w0_u8_x_untied, svuint8_t, uint8_t, ++ z0 = svmsb_n_u8_x (p0, z1, z2, x0), ++ z0 = svmsb_x (p0, z1, z2, x0)) ++ ++/* ++** msb_11_u8_x_tied1: ++** mov (z[0-9]+\.b), #11 ++** msb z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_u8_x_tied1, svuint8_t, ++ z0 = svmsb_n_u8_x (p0, z0, z1, 11), ++ z0 = svmsb_x (p0, z0, z1, 11)) ++ ++/* ++** msb_11_u8_x_tied2: ++** mov (z[0-9]+\.b), #11 ++** msb z0\.b, p0/m, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_u8_x_tied2, svuint8_t, ++ z0 = svmsb_n_u8_x (p0, z1, z0, 11), ++ z0 = svmsb_x (p0, z1, z0, 11)) ++ ++/* ++** msb_11_u8_x_untied: ++** mov z0\.b, #11 ++** mls z0\.b, p0/m, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (msb_11_u8_x_untied, svuint8_t, ++ z0 = svmsb_n_u8_x (p0, z1, z2, 11), ++ z0 = svmsb_x (p0, z1, z2, 11)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_f16.c +new file mode 100644 +index 000000000..ef3de0c59 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_f16.c +@@ -0,0 +1,444 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mul_f16_m_tied1: ++** fmul z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f16_m_tied1, svfloat16_t, ++ z0 = svmul_f16_m (p0, z0, z1), ++ z0 = svmul_m (p0, z0, z1)) ++ ++/* ++** mul_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fmul z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f16_m_tied2, svfloat16_t, ++ z0 = svmul_f16_m (p0, z1, z0), ++ z0 = svmul_m (p0, z1, z0)) ++ ++/* ++** mul_f16_m_untied: ++** movprfx z0, z1 ++** fmul z0\.h, p0/m, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f16_m_untied, svfloat16_t, ++ z0 = svmul_f16_m (p0, z1, z2), ++ z0 = svmul_m (p0, z1, z2)) ++ ++/* ++** mul_h4_f16_m_tied1: ++** mov (z[0-9]+\.h), h4 ++** fmul z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mul_h4_f16_m_tied1, svfloat16_t, __fp16, ++ z0 = svmul_n_f16_m (p0, z0, d4), ++ z0 = svmul_m (p0, z0, d4)) ++ ++/* ++** mul_h4_f16_m_untied: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0, z1 ++** fmul z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mul_h4_f16_m_untied, svfloat16_t, __fp16, ++ z0 = svmul_n_f16_m (p0, z1, d4), ++ z0 = svmul_m (p0, z1, d4)) ++ ++/* ++** mul_1_f16_m_tied1: ++** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? ++** fmul z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_1_f16_m_tied1, svfloat16_t, ++ z0 = svmul_n_f16_m (p0, z0, 1), ++ z0 = svmul_m (p0, z0, 1)) ++ ++/* ++** mul_1_f16_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? 
++** movprfx z0, z1 ++** fmul z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_1_f16_m_untied, svfloat16_t, ++ z0 = svmul_n_f16_m (p0, z1, 1), ++ z0 = svmul_m (p0, z1, 1)) ++ ++/* ++** mul_0p5_f16_m_tied1: ++** fmul z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_0p5_f16_m_tied1, svfloat16_t, ++ z0 = svmul_n_f16_m (p0, z0, 0.5), ++ z0 = svmul_m (p0, z0, 0.5)) ++ ++/* ++** mul_0p5_f16_m_untied: ++** movprfx z0, z1 ++** fmul z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_0p5_f16_m_untied, svfloat16_t, ++ z0 = svmul_n_f16_m (p0, z1, 0.5), ++ z0 = svmul_m (p0, z1, 0.5)) ++ ++/* ++** mul_2_f16_m_tied1: ++** fmul z0\.h, p0/m, z0\.h, #2\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_f16_m_tied1, svfloat16_t, ++ z0 = svmul_n_f16_m (p0, z0, 2), ++ z0 = svmul_m (p0, z0, 2)) ++ ++/* ++** mul_2_f16_m_untied: ++** movprfx z0, z1 ++** fmul z0\.h, p0/m, z0\.h, #2\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_f16_m_untied, svfloat16_t, ++ z0 = svmul_n_f16_m (p0, z1, 2), ++ z0 = svmul_m (p0, z1, 2)) ++ ++/* ++** mul_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fmul z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f16_z_tied1, svfloat16_t, ++ z0 = svmul_f16_z (p0, z0, z1), ++ z0 = svmul_z (p0, z0, z1)) ++ ++/* ++** mul_f16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** fmul z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f16_z_tied2, svfloat16_t, ++ z0 = svmul_f16_z (p0, z1, z0), ++ z0 = svmul_z (p0, z1, z0)) ++ ++/* ++** mul_f16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fmul z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** fmul z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f16_z_untied, svfloat16_t, ++ z0 = svmul_f16_z (p0, z1, z2), ++ z0 = svmul_z (p0, z1, z2)) ++ ++/* ++** mul_h4_f16_z_tied1: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0\.h, p0/z, z0\.h ++** fmul z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mul_h4_f16_z_tied1, svfloat16_t, __fp16, ++ z0 = svmul_n_f16_z (p0, z0, d4), ++ z0 = svmul_z (p0, z0, d4)) ++ ++/* ++** mul_h4_f16_z_untied: ++** mov (z[0-9]+\.h), h4 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fmul z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** fmul z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (mul_h4_f16_z_untied, svfloat16_t, __fp16, ++ z0 = svmul_n_f16_z (p0, z1, d4), ++ z0 = svmul_z (p0, z1, d4)) ++ ++/* ++** mul_1_f16_z_tied1: ++** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? ++** movprfx z0\.h, p0/z, z0\.h ++** fmul z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_1_f16_z_tied1, svfloat16_t, ++ z0 = svmul_n_f16_z (p0, z0, 1), ++ z0 = svmul_z (p0, z0, 1)) ++ ++/* ++** mul_1_f16_z_untied: ++** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? 
++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fmul z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** fmul z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_1_f16_z_untied, svfloat16_t, ++ z0 = svmul_n_f16_z (p0, z1, 1), ++ z0 = svmul_z (p0, z1, 1)) ++ ++/* ++** mul_0p5_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fmul z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_0p5_f16_z_tied1, svfloat16_t, ++ z0 = svmul_n_f16_z (p0, z0, 0.5), ++ z0 = svmul_z (p0, z0, 0.5)) ++ ++/* ++** mul_0p5_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** fmul z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_0p5_f16_z_untied, svfloat16_t, ++ z0 = svmul_n_f16_z (p0, z1, 0.5), ++ z0 = svmul_z (p0, z1, 0.5)) ++ ++/* ++** mul_2_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fmul z0\.h, p0/m, z0\.h, #2\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_f16_z_tied1, svfloat16_t, ++ z0 = svmul_n_f16_z (p0, z0, 2), ++ z0 = svmul_z (p0, z0, 2)) ++ ++/* ++** mul_2_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** fmul z0\.h, p0/m, z0\.h, #2\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_f16_z_untied, svfloat16_t, ++ z0 = svmul_n_f16_z (p0, z1, 2), ++ z0 = svmul_z (p0, z1, 2)) ++ ++/* ++** mul_f16_x_tied1: ++** fmul z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f16_x_tied1, svfloat16_t, ++ z0 = svmul_f16_x (p0, z0, z1), ++ z0 = svmul_x (p0, z0, z1)) ++ ++/* ++** mul_f16_x_tied2: ++** fmul z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f16_x_tied2, svfloat16_t, ++ z0 = svmul_f16_x (p0, z1, z0), ++ z0 = svmul_x (p0, z1, z0)) ++ ++/* ++** mul_f16_x_untied: ++** ( ++** movprfx z0, z1 ++** fmul z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0, z2 ++** fmul z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f16_x_untied, svfloat16_t, ++ z0 = svmul_f16_x (p0, z1, z2), ++ z0 = svmul_x (p0, z1, z2)) ++ ++/* ++** mul_h4_f16_x_tied1: ++** mov (z[0-9]+\.h), h4 ++** fmul z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mul_h4_f16_x_tied1, svfloat16_t, __fp16, ++ z0 = svmul_n_f16_x (p0, z0, d4), ++ z0 = svmul_x (p0, z0, d4)) ++ ++/* ++** mul_h4_f16_x_untied: ++** mov z0\.h, h4 ++** fmul z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZD (mul_h4_f16_x_untied, svfloat16_t, __fp16, ++ z0 = svmul_n_f16_x (p0, z1, d4), ++ z0 = svmul_x (p0, z1, d4)) ++ ++/* ++** mul_1_f16_x_tied1: ++** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? ++** fmul z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_1_f16_x_tied1, svfloat16_t, ++ z0 = svmul_n_f16_x (p0, z0, 1), ++ z0 = svmul_x (p0, z0, 1)) ++ ++/* ++** mul_1_f16_x_untied: ++** fmov z0\.h, #1\.0(?:e\+0)? 
++** fmul z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mul_1_f16_x_untied, svfloat16_t, ++ z0 = svmul_n_f16_x (p0, z1, 1), ++ z0 = svmul_x (p0, z1, 1)) ++ ++/* ++** mul_0p5_f16_x_tied1: ++** fmul z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_0p5_f16_x_tied1, svfloat16_t, ++ z0 = svmul_n_f16_x (p0, z0, 0.5), ++ z0 = svmul_x (p0, z0, 0.5)) ++ ++/* ++** mul_0p5_f16_x_untied: ++** movprfx z0, z1 ++** fmul z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_0p5_f16_x_untied, svfloat16_t, ++ z0 = svmul_n_f16_x (p0, z1, 0.5), ++ z0 = svmul_x (p0, z1, 0.5)) ++ ++/* ++** mul_2_f16_x_tied1: ++** fmul z0\.h, p0/m, z0\.h, #2\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_f16_x_tied1, svfloat16_t, ++ z0 = svmul_n_f16_x (p0, z0, 2), ++ z0 = svmul_x (p0, z0, 2)) ++ ++/* ++** mul_2_f16_x_untied: ++** movprfx z0, z1 ++** fmul z0\.h, p0/m, z0\.h, #2\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_f16_x_untied, svfloat16_t, ++ z0 = svmul_n_f16_x (p0, z1, 2), ++ z0 = svmul_x (p0, z1, 2)) ++ ++/* ++** ptrue_mul_f16_x_tied1: ++** fmul z0\.h, (z0\.h, z1\.h|z1\.h, z0\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_f16_x_tied1, svfloat16_t, ++ z0 = svmul_f16_x (svptrue_b16 (), z0, z1), ++ z0 = svmul_x (svptrue_b16 (), z0, z1)) ++ ++/* ++** ptrue_mul_f16_x_tied2: ++** fmul z0\.h, (z0\.h, z1\.h|z1\.h, z0\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_f16_x_tied2, svfloat16_t, ++ z0 = svmul_f16_x (svptrue_b16 (), z1, z0), ++ z0 = svmul_x (svptrue_b16 (), z1, z0)) ++ ++/* ++** ptrue_mul_f16_x_untied: ++** fmul z0\.h, (z1\.h, z2\.h|z2\.h, z1\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_f16_x_untied, svfloat16_t, ++ z0 = svmul_f16_x (svptrue_b16 (), z1, z2), ++ z0 = svmul_x (svptrue_b16 (), z1, z2)) ++ ++/* ++** ptrue_mul_1_f16_x_tied1: ++** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? ++** fmul z0\.h, (z0\.h, \1|\1, z0\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_1_f16_x_tied1, svfloat16_t, ++ z0 = svmul_n_f16_x (svptrue_b16 (), z0, 1), ++ z0 = svmul_x (svptrue_b16 (), z0, 1)) ++ ++/* ++** ptrue_mul_1_f16_x_untied: ++** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? ++** fmul z0\.h, (z1\.h, \1|\1, z1\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_1_f16_x_untied, svfloat16_t, ++ z0 = svmul_n_f16_x (svptrue_b16 (), z1, 1), ++ z0 = svmul_x (svptrue_b16 (), z1, 1)) ++ ++/* ++** ptrue_mul_0p5_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_0p5_f16_x_tied1, svfloat16_t, ++ z0 = svmul_n_f16_x (svptrue_b16 (), z0, 0.5), ++ z0 = svmul_x (svptrue_b16 (), z0, 0.5)) ++ ++/* ++** ptrue_mul_0p5_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_0p5_f16_x_untied, svfloat16_t, ++ z0 = svmul_n_f16_x (svptrue_b16 (), z1, 0.5), ++ z0 = svmul_x (svptrue_b16 (), z1, 0.5)) ++ ++/* ++** ptrue_mul_2_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_2_f16_x_tied1, svfloat16_t, ++ z0 = svmul_n_f16_x (svptrue_b16 (), z0, 2), ++ z0 = svmul_x (svptrue_b16 (), z0, 2)) ++ ++/* ++** ptrue_mul_2_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_2_f16_x_untied, svfloat16_t, ++ z0 = svmul_n_f16_x (svptrue_b16 (), z1, 2), ++ z0 = svmul_x (svptrue_b16 (), z1, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_f16_notrap.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_f16_notrap.c +new file mode 100644 +index 000000000..481fe999c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_f16_notrap.c +@@ -0,0 +1,439 @@ ++/* { dg-additional-options "-fno-trapping-math" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mul_f16_m_tied1: ++** fmul z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f16_m_tied1, svfloat16_t, ++ z0 = svmul_f16_m (p0, z0, z1), ++ z0 = svmul_m (p0, z0, z1)) ++ ++/* ++** mul_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fmul z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f16_m_tied2, svfloat16_t, ++ z0 = svmul_f16_m (p0, z1, z0), ++ z0 = svmul_m (p0, z1, z0)) ++ ++/* ++** mul_f16_m_untied: ++** movprfx z0, z1 ++** fmul z0\.h, p0/m, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f16_m_untied, svfloat16_t, ++ z0 = svmul_f16_m (p0, z1, z2), ++ z0 = svmul_m (p0, z1, z2)) ++ ++/* ++** mul_h4_f16_m_tied1: ++** mov (z[0-9]+\.h), h4 ++** fmul z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mul_h4_f16_m_tied1, svfloat16_t, __fp16, ++ z0 = svmul_n_f16_m (p0, z0, d4), ++ z0 = svmul_m (p0, z0, d4)) ++ ++/* ++** mul_h4_f16_m_untied: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0, z1 ++** fmul z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mul_h4_f16_m_untied, svfloat16_t, __fp16, ++ z0 = svmul_n_f16_m (p0, z1, d4), ++ z0 = svmul_m (p0, z1, d4)) ++ ++/* ++** mul_1_f16_m_tied1: ++** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? ++** fmul z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_1_f16_m_tied1, svfloat16_t, ++ z0 = svmul_n_f16_m (p0, z0, 1), ++ z0 = svmul_m (p0, z0, 1)) ++ ++/* ++** mul_1_f16_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? 
++** movprfx z0, z1 ++** fmul z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_1_f16_m_untied, svfloat16_t, ++ z0 = svmul_n_f16_m (p0, z1, 1), ++ z0 = svmul_m (p0, z1, 1)) ++ ++/* ++** mul_0p5_f16_m_tied1: ++** fmul z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_0p5_f16_m_tied1, svfloat16_t, ++ z0 = svmul_n_f16_m (p0, z0, 0.5), ++ z0 = svmul_m (p0, z0, 0.5)) ++ ++/* ++** mul_0p5_f16_m_untied: ++** movprfx z0, z1 ++** fmul z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_0p5_f16_m_untied, svfloat16_t, ++ z0 = svmul_n_f16_m (p0, z1, 0.5), ++ z0 = svmul_m (p0, z1, 0.5)) ++ ++/* ++** mul_2_f16_m_tied1: ++** fmul z0\.h, p0/m, z0\.h, #2\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_f16_m_tied1, svfloat16_t, ++ z0 = svmul_n_f16_m (p0, z0, 2), ++ z0 = svmul_m (p0, z0, 2)) ++ ++/* ++** mul_2_f16_m_untied: ++** movprfx z0, z1 ++** fmul z0\.h, p0/m, z0\.h, #2\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_f16_m_untied, svfloat16_t, ++ z0 = svmul_n_f16_m (p0, z1, 2), ++ z0 = svmul_m (p0, z1, 2)) ++ ++/* ++** mul_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fmul z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f16_z_tied1, svfloat16_t, ++ z0 = svmul_f16_z (p0, z0, z1), ++ z0 = svmul_z (p0, z0, z1)) ++ ++/* ++** mul_f16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** fmul z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f16_z_tied2, svfloat16_t, ++ z0 = svmul_f16_z (p0, z1, z0), ++ z0 = svmul_z (p0, z1, z0)) ++ ++/* ++** mul_f16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fmul z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** fmul z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f16_z_untied, svfloat16_t, ++ z0 = svmul_f16_z (p0, z1, z2), ++ z0 = svmul_z (p0, z1, z2)) ++ ++/* ++** mul_h4_f16_z_tied1: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0\.h, p0/z, z0\.h ++** fmul z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mul_h4_f16_z_tied1, svfloat16_t, __fp16, ++ z0 = svmul_n_f16_z (p0, z0, d4), ++ z0 = svmul_z (p0, z0, d4)) ++ ++/* ++** mul_h4_f16_z_untied: ++** mov (z[0-9]+\.h), h4 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fmul z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** fmul z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (mul_h4_f16_z_untied, svfloat16_t, __fp16, ++ z0 = svmul_n_f16_z (p0, z1, d4), ++ z0 = svmul_z (p0, z1, d4)) ++ ++/* ++** mul_1_f16_z_tied1: ++** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? ++** movprfx z0\.h, p0/z, z0\.h ++** fmul z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_1_f16_z_tied1, svfloat16_t, ++ z0 = svmul_n_f16_z (p0, z0, 1), ++ z0 = svmul_z (p0, z0, 1)) ++ ++/* ++** mul_1_f16_z_untied: ++** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? 
++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fmul z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** fmul z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_1_f16_z_untied, svfloat16_t, ++ z0 = svmul_n_f16_z (p0, z1, 1), ++ z0 = svmul_z (p0, z1, 1)) ++ ++/* ++** mul_0p5_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fmul z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_0p5_f16_z_tied1, svfloat16_t, ++ z0 = svmul_n_f16_z (p0, z0, 0.5), ++ z0 = svmul_z (p0, z0, 0.5)) ++ ++/* ++** mul_0p5_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** fmul z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_0p5_f16_z_untied, svfloat16_t, ++ z0 = svmul_n_f16_z (p0, z1, 0.5), ++ z0 = svmul_z (p0, z1, 0.5)) ++ ++/* ++** mul_2_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fmul z0\.h, p0/m, z0\.h, #2\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_f16_z_tied1, svfloat16_t, ++ z0 = svmul_n_f16_z (p0, z0, 2), ++ z0 = svmul_z (p0, z0, 2)) ++ ++/* ++** mul_2_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** fmul z0\.h, p0/m, z0\.h, #2\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_f16_z_untied, svfloat16_t, ++ z0 = svmul_n_f16_z (p0, z1, 2), ++ z0 = svmul_z (p0, z1, 2)) ++ ++/* ++** mul_f16_x_tied1: ++** fmul z0\.h, (z0\.h, z1\.h|z1\.h, z0\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f16_x_tied1, svfloat16_t, ++ z0 = svmul_f16_x (p0, z0, z1), ++ z0 = svmul_x (p0, z0, z1)) ++ ++/* ++** mul_f16_x_tied2: ++** fmul z0\.h, (z0\.h, z1\.h|z1\.h, z0\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f16_x_tied2, svfloat16_t, ++ z0 = svmul_f16_x (p0, z1, z0), ++ z0 = svmul_x (p0, z1, z0)) ++ ++/* ++** mul_f16_x_untied: ++** fmul z0\.h, (z1\.h, z2\.h|z2\.h, z1\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f16_x_untied, svfloat16_t, ++ z0 = svmul_f16_x (p0, z1, z2), ++ z0 = svmul_x (p0, z1, z2)) ++ ++/* ++** mul_h4_f16_x_tied1: ++** mov (z[0-9]+\.h), h4 ++** fmul z0\.h, (z0\.h, \1|\1, z0\.h) ++** ret ++*/ ++TEST_UNIFORM_ZD (mul_h4_f16_x_tied1, svfloat16_t, __fp16, ++ z0 = svmul_n_f16_x (p0, z0, d4), ++ z0 = svmul_x (p0, z0, d4)) ++ ++/* ++** mul_h4_f16_x_untied: ++** mov (z[0-9]+\.h), h4 ++** fmul z0\.h, (z1\.h, \1|\1, z1\.h) ++** ret ++*/ ++TEST_UNIFORM_ZD (mul_h4_f16_x_untied, svfloat16_t, __fp16, ++ z0 = svmul_n_f16_x (p0, z1, d4), ++ z0 = svmul_x (p0, z1, d4)) ++ ++/* ++** mul_1_f16_x_tied1: ++** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? ++** fmul z0\.h, (z0\.h, \1|\1, z0\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_1_f16_x_tied1, svfloat16_t, ++ z0 = svmul_n_f16_x (p0, z0, 1), ++ z0 = svmul_x (p0, z0, 1)) ++ ++/* ++** mul_1_f16_x_untied: ++** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? 
++** fmul z0\.h, (z1\.h, \1|\1, z1\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_1_f16_x_untied, svfloat16_t, ++ z0 = svmul_n_f16_x (p0, z1, 1), ++ z0 = svmul_x (p0, z1, 1)) ++ ++/* ++** mul_0p5_f16_x_tied1: ++** fmul z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_0p5_f16_x_tied1, svfloat16_t, ++ z0 = svmul_n_f16_x (p0, z0, 0.5), ++ z0 = svmul_x (p0, z0, 0.5)) ++ ++/* ++** mul_0p5_f16_x_untied: ++** movprfx z0, z1 ++** fmul z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_0p5_f16_x_untied, svfloat16_t, ++ z0 = svmul_n_f16_x (p0, z1, 0.5), ++ z0 = svmul_x (p0, z1, 0.5)) ++ ++/* ++** mul_2_f16_x_tied1: ++** fmul z0\.h, p0/m, z0\.h, #2\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_f16_x_tied1, svfloat16_t, ++ z0 = svmul_n_f16_x (p0, z0, 2), ++ z0 = svmul_x (p0, z0, 2)) ++ ++/* ++** mul_2_f16_x_untied: ++** movprfx z0, z1 ++** fmul z0\.h, p0/m, z0\.h, #2\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_f16_x_untied, svfloat16_t, ++ z0 = svmul_n_f16_x (p0, z1, 2), ++ z0 = svmul_x (p0, z1, 2)) ++ ++/* ++** ptrue_mul_f16_x_tied1: ++** fmul z0\.h, (z0\.h, z1\.h|z1\.h, z0\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_f16_x_tied1, svfloat16_t, ++ z0 = svmul_f16_x (svptrue_b16 (), z0, z1), ++ z0 = svmul_x (svptrue_b16 (), z0, z1)) ++ ++/* ++** ptrue_mul_f16_x_tied2: ++** fmul z0\.h, (z0\.h, z1\.h|z1\.h, z0\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_f16_x_tied2, svfloat16_t, ++ z0 = svmul_f16_x (svptrue_b16 (), z1, z0), ++ z0 = svmul_x (svptrue_b16 (), z1, z0)) ++ ++/* ++** ptrue_mul_f16_x_untied: ++** fmul z0\.h, (z1\.h, z2\.h|z2\.h, z1\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_f16_x_untied, svfloat16_t, ++ z0 = svmul_f16_x (svptrue_b16 (), z1, z2), ++ z0 = svmul_x (svptrue_b16 (), z1, z2)) ++ ++/* ++** ptrue_mul_1_f16_x_tied1: ++** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? ++** fmul z0\.h, (z0\.h, \1|\1, z0\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_1_f16_x_tied1, svfloat16_t, ++ z0 = svmul_n_f16_x (svptrue_b16 (), z0, 1), ++ z0 = svmul_x (svptrue_b16 (), z0, 1)) ++ ++/* ++** ptrue_mul_1_f16_x_untied: ++** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? ++** fmul z0\.h, (z1\.h, \1|\1, z1\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_1_f16_x_untied, svfloat16_t, ++ z0 = svmul_n_f16_x (svptrue_b16 (), z1, 1), ++ z0 = svmul_x (svptrue_b16 (), z1, 1)) ++ ++/* ++** ptrue_mul_0p5_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_0p5_f16_x_tied1, svfloat16_t, ++ z0 = svmul_n_f16_x (svptrue_b16 (), z0, 0.5), ++ z0 = svmul_x (svptrue_b16 (), z0, 0.5)) ++ ++/* ++** ptrue_mul_0p5_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_0p5_f16_x_untied, svfloat16_t, ++ z0 = svmul_n_f16_x (svptrue_b16 (), z1, 0.5), ++ z0 = svmul_x (svptrue_b16 (), z1, 0.5)) ++ ++/* ++** ptrue_mul_2_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_2_f16_x_tied1, svfloat16_t, ++ z0 = svmul_n_f16_x (svptrue_b16 (), z0, 2), ++ z0 = svmul_x (svptrue_b16 (), z0, 2)) ++ ++/* ++** ptrue_mul_2_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_2_f16_x_untied, svfloat16_t, ++ z0 = svmul_n_f16_x (svptrue_b16 (), z1, 2), ++ z0 = svmul_x (svptrue_b16 (), z1, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_f32.c +new file mode 100644 +index 000000000..5b3df6fde +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_f32.c +@@ -0,0 +1,444 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mul_f32_m_tied1: ++** fmul z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f32_m_tied1, svfloat32_t, ++ z0 = svmul_f32_m (p0, z0, z1), ++ z0 = svmul_m (p0, z0, z1)) ++ ++/* ++** mul_f32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fmul z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f32_m_tied2, svfloat32_t, ++ z0 = svmul_f32_m (p0, z1, z0), ++ z0 = svmul_m (p0, z1, z0)) ++ ++/* ++** mul_f32_m_untied: ++** movprfx z0, z1 ++** fmul z0\.s, p0/m, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f32_m_untied, svfloat32_t, ++ z0 = svmul_f32_m (p0, z1, z2), ++ z0 = svmul_m (p0, z1, z2)) ++ ++/* ++** mul_s4_f32_m_tied1: ++** mov (z[0-9]+\.s), s4 ++** fmul z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mul_s4_f32_m_tied1, svfloat32_t, float, ++ z0 = svmul_n_f32_m (p0, z0, d4), ++ z0 = svmul_m (p0, z0, d4)) ++ ++/* ++** mul_s4_f32_m_untied: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0, z1 ++** fmul z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mul_s4_f32_m_untied, svfloat32_t, float, ++ z0 = svmul_n_f32_m (p0, z1, d4), ++ z0 = svmul_m (p0, z1, d4)) ++ ++/* ++** mul_1_f32_m_tied1: ++** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? ++** fmul z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_1_f32_m_tied1, svfloat32_t, ++ z0 = svmul_n_f32_m (p0, z0, 1), ++ z0 = svmul_m (p0, z0, 1)) ++ ++/* ++** mul_1_f32_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? 
++** movprfx z0, z1 ++** fmul z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_1_f32_m_untied, svfloat32_t, ++ z0 = svmul_n_f32_m (p0, z1, 1), ++ z0 = svmul_m (p0, z1, 1)) ++ ++/* ++** mul_0p5_f32_m_tied1: ++** fmul z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_0p5_f32_m_tied1, svfloat32_t, ++ z0 = svmul_n_f32_m (p0, z0, 0.5), ++ z0 = svmul_m (p0, z0, 0.5)) ++ ++/* ++** mul_0p5_f32_m_untied: ++** movprfx z0, z1 ++** fmul z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_0p5_f32_m_untied, svfloat32_t, ++ z0 = svmul_n_f32_m (p0, z1, 0.5), ++ z0 = svmul_m (p0, z1, 0.5)) ++ ++/* ++** mul_2_f32_m_tied1: ++** fmul z0\.s, p0/m, z0\.s, #2\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_f32_m_tied1, svfloat32_t, ++ z0 = svmul_n_f32_m (p0, z0, 2), ++ z0 = svmul_m (p0, z0, 2)) ++ ++/* ++** mul_2_f32_m_untied: ++** movprfx z0, z1 ++** fmul z0\.s, p0/m, z0\.s, #2\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_f32_m_untied, svfloat32_t, ++ z0 = svmul_n_f32_m (p0, z1, 2), ++ z0 = svmul_m (p0, z1, 2)) ++ ++/* ++** mul_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fmul z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f32_z_tied1, svfloat32_t, ++ z0 = svmul_f32_z (p0, z0, z1), ++ z0 = svmul_z (p0, z0, z1)) ++ ++/* ++** mul_f32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** fmul z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f32_z_tied2, svfloat32_t, ++ z0 = svmul_f32_z (p0, z1, z0), ++ z0 = svmul_z (p0, z1, z0)) ++ ++/* ++** mul_f32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fmul z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** fmul z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f32_z_untied, svfloat32_t, ++ z0 = svmul_f32_z (p0, z1, z2), ++ z0 = svmul_z (p0, z1, z2)) ++ ++/* ++** mul_s4_f32_z_tied1: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0\.s, p0/z, z0\.s ++** fmul z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mul_s4_f32_z_tied1, svfloat32_t, float, ++ z0 = svmul_n_f32_z (p0, z0, d4), ++ z0 = svmul_z (p0, z0, d4)) ++ ++/* ++** mul_s4_f32_z_untied: ++** mov (z[0-9]+\.s), s4 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fmul z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** fmul z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (mul_s4_f32_z_untied, svfloat32_t, float, ++ z0 = svmul_n_f32_z (p0, z1, d4), ++ z0 = svmul_z (p0, z1, d4)) ++ ++/* ++** mul_1_f32_z_tied1: ++** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? ++** movprfx z0\.s, p0/z, z0\.s ++** fmul z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_1_f32_z_tied1, svfloat32_t, ++ z0 = svmul_n_f32_z (p0, z0, 1), ++ z0 = svmul_z (p0, z0, 1)) ++ ++/* ++** mul_1_f32_z_untied: ++** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? 
++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fmul z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** fmul z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_1_f32_z_untied, svfloat32_t, ++ z0 = svmul_n_f32_z (p0, z1, 1), ++ z0 = svmul_z (p0, z1, 1)) ++ ++/* ++** mul_0p5_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fmul z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_0p5_f32_z_tied1, svfloat32_t, ++ z0 = svmul_n_f32_z (p0, z0, 0.5), ++ z0 = svmul_z (p0, z0, 0.5)) ++ ++/* ++** mul_0p5_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** fmul z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_0p5_f32_z_untied, svfloat32_t, ++ z0 = svmul_n_f32_z (p0, z1, 0.5), ++ z0 = svmul_z (p0, z1, 0.5)) ++ ++/* ++** mul_2_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fmul z0\.s, p0/m, z0\.s, #2\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_f32_z_tied1, svfloat32_t, ++ z0 = svmul_n_f32_z (p0, z0, 2), ++ z0 = svmul_z (p0, z0, 2)) ++ ++/* ++** mul_2_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** fmul z0\.s, p0/m, z0\.s, #2\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_f32_z_untied, svfloat32_t, ++ z0 = svmul_n_f32_z (p0, z1, 2), ++ z0 = svmul_z (p0, z1, 2)) ++ ++/* ++** mul_f32_x_tied1: ++** fmul z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f32_x_tied1, svfloat32_t, ++ z0 = svmul_f32_x (p0, z0, z1), ++ z0 = svmul_x (p0, z0, z1)) ++ ++/* ++** mul_f32_x_tied2: ++** fmul z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f32_x_tied2, svfloat32_t, ++ z0 = svmul_f32_x (p0, z1, z0), ++ z0 = svmul_x (p0, z1, z0)) ++ ++/* ++** mul_f32_x_untied: ++** ( ++** movprfx z0, z1 ++** fmul z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0, z2 ++** fmul z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f32_x_untied, svfloat32_t, ++ z0 = svmul_f32_x (p0, z1, z2), ++ z0 = svmul_x (p0, z1, z2)) ++ ++/* ++** mul_s4_f32_x_tied1: ++** mov (z[0-9]+\.s), s4 ++** fmul z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mul_s4_f32_x_tied1, svfloat32_t, float, ++ z0 = svmul_n_f32_x (p0, z0, d4), ++ z0 = svmul_x (p0, z0, d4)) ++ ++/* ++** mul_s4_f32_x_untied: ++** mov z0\.s, s4 ++** fmul z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZD (mul_s4_f32_x_untied, svfloat32_t, float, ++ z0 = svmul_n_f32_x (p0, z1, d4), ++ z0 = svmul_x (p0, z1, d4)) ++ ++/* ++** mul_1_f32_x_tied1: ++** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? ++** fmul z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_1_f32_x_tied1, svfloat32_t, ++ z0 = svmul_n_f32_x (p0, z0, 1), ++ z0 = svmul_x (p0, z0, 1)) ++ ++/* ++** mul_1_f32_x_untied: ++** fmov z0\.s, #1\.0(?:e\+0)? 
++** fmul z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mul_1_f32_x_untied, svfloat32_t, ++ z0 = svmul_n_f32_x (p0, z1, 1), ++ z0 = svmul_x (p0, z1, 1)) ++ ++/* ++** mul_0p5_f32_x_tied1: ++** fmul z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_0p5_f32_x_tied1, svfloat32_t, ++ z0 = svmul_n_f32_x (p0, z0, 0.5), ++ z0 = svmul_x (p0, z0, 0.5)) ++ ++/* ++** mul_0p5_f32_x_untied: ++** movprfx z0, z1 ++** fmul z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_0p5_f32_x_untied, svfloat32_t, ++ z0 = svmul_n_f32_x (p0, z1, 0.5), ++ z0 = svmul_x (p0, z1, 0.5)) ++ ++/* ++** mul_2_f32_x_tied1: ++** fmul z0\.s, p0/m, z0\.s, #2\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_f32_x_tied1, svfloat32_t, ++ z0 = svmul_n_f32_x (p0, z0, 2), ++ z0 = svmul_x (p0, z0, 2)) ++ ++/* ++** mul_2_f32_x_untied: ++** movprfx z0, z1 ++** fmul z0\.s, p0/m, z0\.s, #2\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_f32_x_untied, svfloat32_t, ++ z0 = svmul_n_f32_x (p0, z1, 2), ++ z0 = svmul_x (p0, z1, 2)) ++ ++/* ++** ptrue_mul_f32_x_tied1: ++** fmul z0\.s, (z0\.s, z1\.s|z1\.s, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_f32_x_tied1, svfloat32_t, ++ z0 = svmul_f32_x (svptrue_b32 (), z0, z1), ++ z0 = svmul_x (svptrue_b32 (), z0, z1)) ++ ++/* ++** ptrue_mul_f32_x_tied2: ++** fmul z0\.s, (z0\.s, z1\.s|z1\.s, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_f32_x_tied2, svfloat32_t, ++ z0 = svmul_f32_x (svptrue_b32 (), z1, z0), ++ z0 = svmul_x (svptrue_b32 (), z1, z0)) ++ ++/* ++** ptrue_mul_f32_x_untied: ++** fmul z0\.s, (z1\.s, z2\.s|z2\.s, z1\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_f32_x_untied, svfloat32_t, ++ z0 = svmul_f32_x (svptrue_b32 (), z1, z2), ++ z0 = svmul_x (svptrue_b32 (), z1, z2)) ++ ++/* ++** ptrue_mul_1_f32_x_tied1: ++** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? ++** fmul z0\.s, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_1_f32_x_tied1, svfloat32_t, ++ z0 = svmul_n_f32_x (svptrue_b32 (), z0, 1), ++ z0 = svmul_x (svptrue_b32 (), z0, 1)) ++ ++/* ++** ptrue_mul_1_f32_x_untied: ++** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? ++** fmul z0\.s, (z1\.s, \1|\1, z1\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_1_f32_x_untied, svfloat32_t, ++ z0 = svmul_n_f32_x (svptrue_b32 (), z1, 1), ++ z0 = svmul_x (svptrue_b32 (), z1, 1)) ++ ++/* ++** ptrue_mul_0p5_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_0p5_f32_x_tied1, svfloat32_t, ++ z0 = svmul_n_f32_x (svptrue_b32 (), z0, 0.5), ++ z0 = svmul_x (svptrue_b32 (), z0, 0.5)) ++ ++/* ++** ptrue_mul_0p5_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_0p5_f32_x_untied, svfloat32_t, ++ z0 = svmul_n_f32_x (svptrue_b32 (), z1, 0.5), ++ z0 = svmul_x (svptrue_b32 (), z1, 0.5)) ++ ++/* ++** ptrue_mul_2_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_2_f32_x_tied1, svfloat32_t, ++ z0 = svmul_n_f32_x (svptrue_b32 (), z0, 2), ++ z0 = svmul_x (svptrue_b32 (), z0, 2)) ++ ++/* ++** ptrue_mul_2_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_2_f32_x_untied, svfloat32_t, ++ z0 = svmul_n_f32_x (svptrue_b32 (), z1, 2), ++ z0 = svmul_x (svptrue_b32 (), z1, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_f32_notrap.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_f32_notrap.c +new file mode 100644 +index 000000000..eb2d240ef +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_f32_notrap.c +@@ -0,0 +1,439 @@ ++/* { dg-additional-options "-fno-trapping-math" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mul_f32_m_tied1: ++** fmul z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f32_m_tied1, svfloat32_t, ++ z0 = svmul_f32_m (p0, z0, z1), ++ z0 = svmul_m (p0, z0, z1)) ++ ++/* ++** mul_f32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fmul z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f32_m_tied2, svfloat32_t, ++ z0 = svmul_f32_m (p0, z1, z0), ++ z0 = svmul_m (p0, z1, z0)) ++ ++/* ++** mul_f32_m_untied: ++** movprfx z0, z1 ++** fmul z0\.s, p0/m, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f32_m_untied, svfloat32_t, ++ z0 = svmul_f32_m (p0, z1, z2), ++ z0 = svmul_m (p0, z1, z2)) ++ ++/* ++** mul_s4_f32_m_tied1: ++** mov (z[0-9]+\.s), s4 ++** fmul z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mul_s4_f32_m_tied1, svfloat32_t, float, ++ z0 = svmul_n_f32_m (p0, z0, d4), ++ z0 = svmul_m (p0, z0, d4)) ++ ++/* ++** mul_s4_f32_m_untied: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0, z1 ++** fmul z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mul_s4_f32_m_untied, svfloat32_t, float, ++ z0 = svmul_n_f32_m (p0, z1, d4), ++ z0 = svmul_m (p0, z1, d4)) ++ ++/* ++** mul_1_f32_m_tied1: ++** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? ++** fmul z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_1_f32_m_tied1, svfloat32_t, ++ z0 = svmul_n_f32_m (p0, z0, 1), ++ z0 = svmul_m (p0, z0, 1)) ++ ++/* ++** mul_1_f32_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? 
++** movprfx z0, z1 ++** fmul z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_1_f32_m_untied, svfloat32_t, ++ z0 = svmul_n_f32_m (p0, z1, 1), ++ z0 = svmul_m (p0, z1, 1)) ++ ++/* ++** mul_0p5_f32_m_tied1: ++** fmul z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_0p5_f32_m_tied1, svfloat32_t, ++ z0 = svmul_n_f32_m (p0, z0, 0.5), ++ z0 = svmul_m (p0, z0, 0.5)) ++ ++/* ++** mul_0p5_f32_m_untied: ++** movprfx z0, z1 ++** fmul z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_0p5_f32_m_untied, svfloat32_t, ++ z0 = svmul_n_f32_m (p0, z1, 0.5), ++ z0 = svmul_m (p0, z1, 0.5)) ++ ++/* ++** mul_2_f32_m_tied1: ++** fmul z0\.s, p0/m, z0\.s, #2\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_f32_m_tied1, svfloat32_t, ++ z0 = svmul_n_f32_m (p0, z0, 2), ++ z0 = svmul_m (p0, z0, 2)) ++ ++/* ++** mul_2_f32_m_untied: ++** movprfx z0, z1 ++** fmul z0\.s, p0/m, z0\.s, #2\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_f32_m_untied, svfloat32_t, ++ z0 = svmul_n_f32_m (p0, z1, 2), ++ z0 = svmul_m (p0, z1, 2)) ++ ++/* ++** mul_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fmul z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f32_z_tied1, svfloat32_t, ++ z0 = svmul_f32_z (p0, z0, z1), ++ z0 = svmul_z (p0, z0, z1)) ++ ++/* ++** mul_f32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** fmul z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f32_z_tied2, svfloat32_t, ++ z0 = svmul_f32_z (p0, z1, z0), ++ z0 = svmul_z (p0, z1, z0)) ++ ++/* ++** mul_f32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fmul z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** fmul z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f32_z_untied, svfloat32_t, ++ z0 = svmul_f32_z (p0, z1, z2), ++ z0 = svmul_z (p0, z1, z2)) ++ ++/* ++** mul_s4_f32_z_tied1: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0\.s, p0/z, z0\.s ++** fmul z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mul_s4_f32_z_tied1, svfloat32_t, float, ++ z0 = svmul_n_f32_z (p0, z0, d4), ++ z0 = svmul_z (p0, z0, d4)) ++ ++/* ++** mul_s4_f32_z_untied: ++** mov (z[0-9]+\.s), s4 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fmul z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** fmul z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (mul_s4_f32_z_untied, svfloat32_t, float, ++ z0 = svmul_n_f32_z (p0, z1, d4), ++ z0 = svmul_z (p0, z1, d4)) ++ ++/* ++** mul_1_f32_z_tied1: ++** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? ++** movprfx z0\.s, p0/z, z0\.s ++** fmul z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_1_f32_z_tied1, svfloat32_t, ++ z0 = svmul_n_f32_z (p0, z0, 1), ++ z0 = svmul_z (p0, z0, 1)) ++ ++/* ++** mul_1_f32_z_untied: ++** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? 
++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fmul z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** fmul z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_1_f32_z_untied, svfloat32_t, ++ z0 = svmul_n_f32_z (p0, z1, 1), ++ z0 = svmul_z (p0, z1, 1)) ++ ++/* ++** mul_0p5_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fmul z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_0p5_f32_z_tied1, svfloat32_t, ++ z0 = svmul_n_f32_z (p0, z0, 0.5), ++ z0 = svmul_z (p0, z0, 0.5)) ++ ++/* ++** mul_0p5_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** fmul z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_0p5_f32_z_untied, svfloat32_t, ++ z0 = svmul_n_f32_z (p0, z1, 0.5), ++ z0 = svmul_z (p0, z1, 0.5)) ++ ++/* ++** mul_2_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fmul z0\.s, p0/m, z0\.s, #2\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_f32_z_tied1, svfloat32_t, ++ z0 = svmul_n_f32_z (p0, z0, 2), ++ z0 = svmul_z (p0, z0, 2)) ++ ++/* ++** mul_2_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** fmul z0\.s, p0/m, z0\.s, #2\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_f32_z_untied, svfloat32_t, ++ z0 = svmul_n_f32_z (p0, z1, 2), ++ z0 = svmul_z (p0, z1, 2)) ++ ++/* ++** mul_f32_x_tied1: ++** fmul z0\.s, (z0\.s, z1\.s|z1\.s, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f32_x_tied1, svfloat32_t, ++ z0 = svmul_f32_x (p0, z0, z1), ++ z0 = svmul_x (p0, z0, z1)) ++ ++/* ++** mul_f32_x_tied2: ++** fmul z0\.s, (z0\.s, z1\.s|z1\.s, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f32_x_tied2, svfloat32_t, ++ z0 = svmul_f32_x (p0, z1, z0), ++ z0 = svmul_x (p0, z1, z0)) ++ ++/* ++** mul_f32_x_untied: ++** fmul z0\.s, (z1\.s, z2\.s|z2\.s, z1\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f32_x_untied, svfloat32_t, ++ z0 = svmul_f32_x (p0, z1, z2), ++ z0 = svmul_x (p0, z1, z2)) ++ ++/* ++** mul_s4_f32_x_tied1: ++** mov (z[0-9]+\.s), s4 ++** fmul z0\.s, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_ZD (mul_s4_f32_x_tied1, svfloat32_t, float, ++ z0 = svmul_n_f32_x (p0, z0, d4), ++ z0 = svmul_x (p0, z0, d4)) ++ ++/* ++** mul_s4_f32_x_untied: ++** mov (z[0-9]+\.s), s4 ++** fmul z0\.s, (z1\.s, \1|\1, z1\.s) ++** ret ++*/ ++TEST_UNIFORM_ZD (mul_s4_f32_x_untied, svfloat32_t, float, ++ z0 = svmul_n_f32_x (p0, z1, d4), ++ z0 = svmul_x (p0, z1, d4)) ++ ++/* ++** mul_1_f32_x_tied1: ++** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? ++** fmul z0\.s, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_1_f32_x_tied1, svfloat32_t, ++ z0 = svmul_n_f32_x (p0, z0, 1), ++ z0 = svmul_x (p0, z0, 1)) ++ ++/* ++** mul_1_f32_x_untied: ++** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? 
++** fmul z0\.s, (z1\.s, \1|\1, z1\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_1_f32_x_untied, svfloat32_t, ++ z0 = svmul_n_f32_x (p0, z1, 1), ++ z0 = svmul_x (p0, z1, 1)) ++ ++/* ++** mul_0p5_f32_x_tied1: ++** fmul z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_0p5_f32_x_tied1, svfloat32_t, ++ z0 = svmul_n_f32_x (p0, z0, 0.5), ++ z0 = svmul_x (p0, z0, 0.5)) ++ ++/* ++** mul_0p5_f32_x_untied: ++** movprfx z0, z1 ++** fmul z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_0p5_f32_x_untied, svfloat32_t, ++ z0 = svmul_n_f32_x (p0, z1, 0.5), ++ z0 = svmul_x (p0, z1, 0.5)) ++ ++/* ++** mul_2_f32_x_tied1: ++** fmul z0\.s, p0/m, z0\.s, #2\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_f32_x_tied1, svfloat32_t, ++ z0 = svmul_n_f32_x (p0, z0, 2), ++ z0 = svmul_x (p0, z0, 2)) ++ ++/* ++** mul_2_f32_x_untied: ++** movprfx z0, z1 ++** fmul z0\.s, p0/m, z0\.s, #2\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_f32_x_untied, svfloat32_t, ++ z0 = svmul_n_f32_x (p0, z1, 2), ++ z0 = svmul_x (p0, z1, 2)) ++ ++/* ++** ptrue_mul_f32_x_tied1: ++** fmul z0\.s, (z0\.s, z1\.s|z1\.s, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_f32_x_tied1, svfloat32_t, ++ z0 = svmul_f32_x (svptrue_b32 (), z0, z1), ++ z0 = svmul_x (svptrue_b32 (), z0, z1)) ++ ++/* ++** ptrue_mul_f32_x_tied2: ++** fmul z0\.s, (z0\.s, z1\.s|z1\.s, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_f32_x_tied2, svfloat32_t, ++ z0 = svmul_f32_x (svptrue_b32 (), z1, z0), ++ z0 = svmul_x (svptrue_b32 (), z1, z0)) ++ ++/* ++** ptrue_mul_f32_x_untied: ++** fmul z0\.s, (z1\.s, z2\.s|z2\.s, z1\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_f32_x_untied, svfloat32_t, ++ z0 = svmul_f32_x (svptrue_b32 (), z1, z2), ++ z0 = svmul_x (svptrue_b32 (), z1, z2)) ++ ++/* ++** ptrue_mul_1_f32_x_tied1: ++** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? ++** fmul z0\.s, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_1_f32_x_tied1, svfloat32_t, ++ z0 = svmul_n_f32_x (svptrue_b32 (), z0, 1), ++ z0 = svmul_x (svptrue_b32 (), z0, 1)) ++ ++/* ++** ptrue_mul_1_f32_x_untied: ++** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? ++** fmul z0\.s, (z1\.s, \1|\1, z1\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_1_f32_x_untied, svfloat32_t, ++ z0 = svmul_n_f32_x (svptrue_b32 (), z1, 1), ++ z0 = svmul_x (svptrue_b32 (), z1, 1)) ++ ++/* ++** ptrue_mul_0p5_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_0p5_f32_x_tied1, svfloat32_t, ++ z0 = svmul_n_f32_x (svptrue_b32 (), z0, 0.5), ++ z0 = svmul_x (svptrue_b32 (), z0, 0.5)) ++ ++/* ++** ptrue_mul_0p5_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_0p5_f32_x_untied, svfloat32_t, ++ z0 = svmul_n_f32_x (svptrue_b32 (), z1, 0.5), ++ z0 = svmul_x (svptrue_b32 (), z1, 0.5)) ++ ++/* ++** ptrue_mul_2_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_2_f32_x_tied1, svfloat32_t, ++ z0 = svmul_n_f32_x (svptrue_b32 (), z0, 2), ++ z0 = svmul_x (svptrue_b32 (), z0, 2)) ++ ++/* ++** ptrue_mul_2_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_2_f32_x_untied, svfloat32_t, ++ z0 = svmul_n_f32_x (svptrue_b32 (), z1, 2), ++ z0 = svmul_x (svptrue_b32 (), z1, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_f64.c +new file mode 100644 +index 000000000..f5654a9f1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_f64.c +@@ -0,0 +1,444 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mul_f64_m_tied1: ++** fmul z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f64_m_tied1, svfloat64_t, ++ z0 = svmul_f64_m (p0, z0, z1), ++ z0 = svmul_m (p0, z0, z1)) ++ ++/* ++** mul_f64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fmul z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f64_m_tied2, svfloat64_t, ++ z0 = svmul_f64_m (p0, z1, z0), ++ z0 = svmul_m (p0, z1, z0)) ++ ++/* ++** mul_f64_m_untied: ++** movprfx z0, z1 ++** fmul z0\.d, p0/m, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f64_m_untied, svfloat64_t, ++ z0 = svmul_f64_m (p0, z1, z2), ++ z0 = svmul_m (p0, z1, z2)) ++ ++/* ++** mul_d4_f64_m_tied1: ++** mov (z[0-9]+\.d), d4 ++** fmul z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mul_d4_f64_m_tied1, svfloat64_t, double, ++ z0 = svmul_n_f64_m (p0, z0, d4), ++ z0 = svmul_m (p0, z0, d4)) ++ ++/* ++** mul_d4_f64_m_untied: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0, z1 ++** fmul z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mul_d4_f64_m_untied, svfloat64_t, double, ++ z0 = svmul_n_f64_m (p0, z1, d4), ++ z0 = svmul_m (p0, z1, d4)) ++ ++/* ++** mul_1_f64_m_tied1: ++** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? ++** fmul z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_1_f64_m_tied1, svfloat64_t, ++ z0 = svmul_n_f64_m (p0, z0, 1), ++ z0 = svmul_m (p0, z0, 1)) ++ ++/* ++** mul_1_f64_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? 
++** movprfx z0, z1 ++** fmul z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_1_f64_m_untied, svfloat64_t, ++ z0 = svmul_n_f64_m (p0, z1, 1), ++ z0 = svmul_m (p0, z1, 1)) ++ ++/* ++** mul_0p5_f64_m_tied1: ++** fmul z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_0p5_f64_m_tied1, svfloat64_t, ++ z0 = svmul_n_f64_m (p0, z0, 0.5), ++ z0 = svmul_m (p0, z0, 0.5)) ++ ++/* ++** mul_0p5_f64_m_untied: ++** movprfx z0, z1 ++** fmul z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_0p5_f64_m_untied, svfloat64_t, ++ z0 = svmul_n_f64_m (p0, z1, 0.5), ++ z0 = svmul_m (p0, z1, 0.5)) ++ ++/* ++** mul_2_f64_m_tied1: ++** fmul z0\.d, p0/m, z0\.d, #2\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_f64_m_tied1, svfloat64_t, ++ z0 = svmul_n_f64_m (p0, z0, 2), ++ z0 = svmul_m (p0, z0, 2)) ++ ++/* ++** mul_2_f64_m_untied: ++** movprfx z0, z1 ++** fmul z0\.d, p0/m, z0\.d, #2\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_f64_m_untied, svfloat64_t, ++ z0 = svmul_n_f64_m (p0, z1, 2), ++ z0 = svmul_m (p0, z1, 2)) ++ ++/* ++** mul_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fmul z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f64_z_tied1, svfloat64_t, ++ z0 = svmul_f64_z (p0, z0, z1), ++ z0 = svmul_z (p0, z0, z1)) ++ ++/* ++** mul_f64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** fmul z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f64_z_tied2, svfloat64_t, ++ z0 = svmul_f64_z (p0, z1, z0), ++ z0 = svmul_z (p0, z1, z0)) ++ ++/* ++** mul_f64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fmul z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** fmul z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f64_z_untied, svfloat64_t, ++ z0 = svmul_f64_z (p0, z1, z2), ++ z0 = svmul_z (p0, z1, z2)) ++ ++/* ++** mul_d4_f64_z_tied1: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0\.d, p0/z, z0\.d ++** fmul z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mul_d4_f64_z_tied1, svfloat64_t, double, ++ z0 = svmul_n_f64_z (p0, z0, d4), ++ z0 = svmul_z (p0, z0, d4)) ++ ++/* ++** mul_d4_f64_z_untied: ++** mov (z[0-9]+\.d), d4 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fmul z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** fmul z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (mul_d4_f64_z_untied, svfloat64_t, double, ++ z0 = svmul_n_f64_z (p0, z1, d4), ++ z0 = svmul_z (p0, z1, d4)) ++ ++/* ++** mul_1_f64_z_tied1: ++** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? ++** movprfx z0\.d, p0/z, z0\.d ++** fmul z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_1_f64_z_tied1, svfloat64_t, ++ z0 = svmul_n_f64_z (p0, z0, 1), ++ z0 = svmul_z (p0, z0, 1)) ++ ++/* ++** mul_1_f64_z_untied: ++** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? 
++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fmul z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** fmul z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_1_f64_z_untied, svfloat64_t, ++ z0 = svmul_n_f64_z (p0, z1, 1), ++ z0 = svmul_z (p0, z1, 1)) ++ ++/* ++** mul_0p5_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fmul z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_0p5_f64_z_tied1, svfloat64_t, ++ z0 = svmul_n_f64_z (p0, z0, 0.5), ++ z0 = svmul_z (p0, z0, 0.5)) ++ ++/* ++** mul_0p5_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** fmul z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_0p5_f64_z_untied, svfloat64_t, ++ z0 = svmul_n_f64_z (p0, z1, 0.5), ++ z0 = svmul_z (p0, z1, 0.5)) ++ ++/* ++** mul_2_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fmul z0\.d, p0/m, z0\.d, #2\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_f64_z_tied1, svfloat64_t, ++ z0 = svmul_n_f64_z (p0, z0, 2), ++ z0 = svmul_z (p0, z0, 2)) ++ ++/* ++** mul_2_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** fmul z0\.d, p0/m, z0\.d, #2\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_f64_z_untied, svfloat64_t, ++ z0 = svmul_n_f64_z (p0, z1, 2), ++ z0 = svmul_z (p0, z1, 2)) ++ ++/* ++** mul_f64_x_tied1: ++** fmul z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f64_x_tied1, svfloat64_t, ++ z0 = svmul_f64_x (p0, z0, z1), ++ z0 = svmul_x (p0, z0, z1)) ++ ++/* ++** mul_f64_x_tied2: ++** fmul z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f64_x_tied2, svfloat64_t, ++ z0 = svmul_f64_x (p0, z1, z0), ++ z0 = svmul_x (p0, z1, z0)) ++ ++/* ++** mul_f64_x_untied: ++** ( ++** movprfx z0, z1 ++** fmul z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0, z2 ++** fmul z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f64_x_untied, svfloat64_t, ++ z0 = svmul_f64_x (p0, z1, z2), ++ z0 = svmul_x (p0, z1, z2)) ++ ++/* ++** mul_d4_f64_x_tied1: ++** mov (z[0-9]+\.d), d4 ++** fmul z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mul_d4_f64_x_tied1, svfloat64_t, double, ++ z0 = svmul_n_f64_x (p0, z0, d4), ++ z0 = svmul_x (p0, z0, d4)) ++ ++/* ++** mul_d4_f64_x_untied: ++** mov z0\.d, d4 ++** fmul z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZD (mul_d4_f64_x_untied, svfloat64_t, double, ++ z0 = svmul_n_f64_x (p0, z1, d4), ++ z0 = svmul_x (p0, z1, d4)) ++ ++/* ++** mul_1_f64_x_tied1: ++** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? ++** fmul z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_1_f64_x_tied1, svfloat64_t, ++ z0 = svmul_n_f64_x (p0, z0, 1), ++ z0 = svmul_x (p0, z0, 1)) ++ ++/* ++** mul_1_f64_x_untied: ++** fmov z0\.d, #1\.0(?:e\+0)? 
++** fmul z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mul_1_f64_x_untied, svfloat64_t, ++ z0 = svmul_n_f64_x (p0, z1, 1), ++ z0 = svmul_x (p0, z1, 1)) ++ ++/* ++** mul_0p5_f64_x_tied1: ++** fmul z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_0p5_f64_x_tied1, svfloat64_t, ++ z0 = svmul_n_f64_x (p0, z0, 0.5), ++ z0 = svmul_x (p0, z0, 0.5)) ++ ++/* ++** mul_0p5_f64_x_untied: ++** movprfx z0, z1 ++** fmul z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_0p5_f64_x_untied, svfloat64_t, ++ z0 = svmul_n_f64_x (p0, z1, 0.5), ++ z0 = svmul_x (p0, z1, 0.5)) ++ ++/* ++** mul_2_f64_x_tied1: ++** fmul z0\.d, p0/m, z0\.d, #2\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_f64_x_tied1, svfloat64_t, ++ z0 = svmul_n_f64_x (p0, z0, 2), ++ z0 = svmul_x (p0, z0, 2)) ++ ++/* ++** mul_2_f64_x_untied: ++** movprfx z0, z1 ++** fmul z0\.d, p0/m, z0\.d, #2\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_f64_x_untied, svfloat64_t, ++ z0 = svmul_n_f64_x (p0, z1, 2), ++ z0 = svmul_x (p0, z1, 2)) ++ ++/* ++** ptrue_mul_f64_x_tied1: ++** fmul z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_f64_x_tied1, svfloat64_t, ++ z0 = svmul_f64_x (svptrue_b64 (), z0, z1), ++ z0 = svmul_x (svptrue_b64 (), z0, z1)) ++ ++/* ++** ptrue_mul_f64_x_tied2: ++** fmul z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_f64_x_tied2, svfloat64_t, ++ z0 = svmul_f64_x (svptrue_b64 (), z1, z0), ++ z0 = svmul_x (svptrue_b64 (), z1, z0)) ++ ++/* ++** ptrue_mul_f64_x_untied: ++** fmul z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_f64_x_untied, svfloat64_t, ++ z0 = svmul_f64_x (svptrue_b64 (), z1, z2), ++ z0 = svmul_x (svptrue_b64 (), z1, z2)) ++ ++/* ++** ptrue_mul_1_f64_x_tied1: ++** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? ++** fmul z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_1_f64_x_tied1, svfloat64_t, ++ z0 = svmul_n_f64_x (svptrue_b64 (), z0, 1), ++ z0 = svmul_x (svptrue_b64 (), z0, 1)) ++ ++/* ++** ptrue_mul_1_f64_x_untied: ++** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? ++** fmul z0\.d, (z1\.d, \1|\1, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_1_f64_x_untied, svfloat64_t, ++ z0 = svmul_n_f64_x (svptrue_b64 (), z1, 1), ++ z0 = svmul_x (svptrue_b64 (), z1, 1)) ++ ++/* ++** ptrue_mul_0p5_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_0p5_f64_x_tied1, svfloat64_t, ++ z0 = svmul_n_f64_x (svptrue_b64 (), z0, 0.5), ++ z0 = svmul_x (svptrue_b64 (), z0, 0.5)) ++ ++/* ++** ptrue_mul_0p5_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_0p5_f64_x_untied, svfloat64_t, ++ z0 = svmul_n_f64_x (svptrue_b64 (), z1, 0.5), ++ z0 = svmul_x (svptrue_b64 (), z1, 0.5)) ++ ++/* ++** ptrue_mul_2_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_2_f64_x_tied1, svfloat64_t, ++ z0 = svmul_n_f64_x (svptrue_b64 (), z0, 2), ++ z0 = svmul_x (svptrue_b64 (), z0, 2)) ++ ++/* ++** ptrue_mul_2_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_2_f64_x_untied, svfloat64_t, ++ z0 = svmul_n_f64_x (svptrue_b64 (), z1, 2), ++ z0 = svmul_x (svptrue_b64 (), z1, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_f64_notrap.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_f64_notrap.c +new file mode 100644 +index 000000000..d865618d4 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_f64_notrap.c +@@ -0,0 +1,439 @@ ++/* { dg-additional-options "-fno-trapping-math" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mul_f64_m_tied1: ++** fmul z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f64_m_tied1, svfloat64_t, ++ z0 = svmul_f64_m (p0, z0, z1), ++ z0 = svmul_m (p0, z0, z1)) ++ ++/* ++** mul_f64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fmul z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f64_m_tied2, svfloat64_t, ++ z0 = svmul_f64_m (p0, z1, z0), ++ z0 = svmul_m (p0, z1, z0)) ++ ++/* ++** mul_f64_m_untied: ++** movprfx z0, z1 ++** fmul z0\.d, p0/m, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f64_m_untied, svfloat64_t, ++ z0 = svmul_f64_m (p0, z1, z2), ++ z0 = svmul_m (p0, z1, z2)) ++ ++/* ++** mul_d4_f64_m_tied1: ++** mov (z[0-9]+\.d), d4 ++** fmul z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mul_d4_f64_m_tied1, svfloat64_t, double, ++ z0 = svmul_n_f64_m (p0, z0, d4), ++ z0 = svmul_m (p0, z0, d4)) ++ ++/* ++** mul_d4_f64_m_untied: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0, z1 ++** fmul z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mul_d4_f64_m_untied, svfloat64_t, double, ++ z0 = svmul_n_f64_m (p0, z1, d4), ++ z0 = svmul_m (p0, z1, d4)) ++ ++/* ++** mul_1_f64_m_tied1: ++** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? ++** fmul z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_1_f64_m_tied1, svfloat64_t, ++ z0 = svmul_n_f64_m (p0, z0, 1), ++ z0 = svmul_m (p0, z0, 1)) ++ ++/* ++** mul_1_f64_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? 
++** movprfx z0, z1 ++** fmul z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_1_f64_m_untied, svfloat64_t, ++ z0 = svmul_n_f64_m (p0, z1, 1), ++ z0 = svmul_m (p0, z1, 1)) ++ ++/* ++** mul_0p5_f64_m_tied1: ++** fmul z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_0p5_f64_m_tied1, svfloat64_t, ++ z0 = svmul_n_f64_m (p0, z0, 0.5), ++ z0 = svmul_m (p0, z0, 0.5)) ++ ++/* ++** mul_0p5_f64_m_untied: ++** movprfx z0, z1 ++** fmul z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_0p5_f64_m_untied, svfloat64_t, ++ z0 = svmul_n_f64_m (p0, z1, 0.5), ++ z0 = svmul_m (p0, z1, 0.5)) ++ ++/* ++** mul_2_f64_m_tied1: ++** fmul z0\.d, p0/m, z0\.d, #2\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_f64_m_tied1, svfloat64_t, ++ z0 = svmul_n_f64_m (p0, z0, 2), ++ z0 = svmul_m (p0, z0, 2)) ++ ++/* ++** mul_2_f64_m_untied: ++** movprfx z0, z1 ++** fmul z0\.d, p0/m, z0\.d, #2\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_f64_m_untied, svfloat64_t, ++ z0 = svmul_n_f64_m (p0, z1, 2), ++ z0 = svmul_m (p0, z1, 2)) ++ ++/* ++** mul_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fmul z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f64_z_tied1, svfloat64_t, ++ z0 = svmul_f64_z (p0, z0, z1), ++ z0 = svmul_z (p0, z0, z1)) ++ ++/* ++** mul_f64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** fmul z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f64_z_tied2, svfloat64_t, ++ z0 = svmul_f64_z (p0, z1, z0), ++ z0 = svmul_z (p0, z1, z0)) ++ ++/* ++** mul_f64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fmul z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** fmul z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f64_z_untied, svfloat64_t, ++ z0 = svmul_f64_z (p0, z1, z2), ++ z0 = svmul_z (p0, z1, z2)) ++ ++/* ++** mul_d4_f64_z_tied1: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0\.d, p0/z, z0\.d ++** fmul z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mul_d4_f64_z_tied1, svfloat64_t, double, ++ z0 = svmul_n_f64_z (p0, z0, d4), ++ z0 = svmul_z (p0, z0, d4)) ++ ++/* ++** mul_d4_f64_z_untied: ++** mov (z[0-9]+\.d), d4 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fmul z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** fmul z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (mul_d4_f64_z_untied, svfloat64_t, double, ++ z0 = svmul_n_f64_z (p0, z1, d4), ++ z0 = svmul_z (p0, z1, d4)) ++ ++/* ++** mul_1_f64_z_tied1: ++** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? ++** movprfx z0\.d, p0/z, z0\.d ++** fmul z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_1_f64_z_tied1, svfloat64_t, ++ z0 = svmul_n_f64_z (p0, z0, 1), ++ z0 = svmul_z (p0, z0, 1)) ++ ++/* ++** mul_1_f64_z_untied: ++** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? 
++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fmul z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** fmul z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_1_f64_z_untied, svfloat64_t, ++ z0 = svmul_n_f64_z (p0, z1, 1), ++ z0 = svmul_z (p0, z1, 1)) ++ ++/* ++** mul_0p5_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fmul z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_0p5_f64_z_tied1, svfloat64_t, ++ z0 = svmul_n_f64_z (p0, z0, 0.5), ++ z0 = svmul_z (p0, z0, 0.5)) ++ ++/* ++** mul_0p5_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** fmul z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_0p5_f64_z_untied, svfloat64_t, ++ z0 = svmul_n_f64_z (p0, z1, 0.5), ++ z0 = svmul_z (p0, z1, 0.5)) ++ ++/* ++** mul_2_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fmul z0\.d, p0/m, z0\.d, #2\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_f64_z_tied1, svfloat64_t, ++ z0 = svmul_n_f64_z (p0, z0, 2), ++ z0 = svmul_z (p0, z0, 2)) ++ ++/* ++** mul_2_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** fmul z0\.d, p0/m, z0\.d, #2\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_f64_z_untied, svfloat64_t, ++ z0 = svmul_n_f64_z (p0, z1, 2), ++ z0 = svmul_z (p0, z1, 2)) ++ ++/* ++** mul_f64_x_tied1: ++** fmul z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f64_x_tied1, svfloat64_t, ++ z0 = svmul_f64_x (p0, z0, z1), ++ z0 = svmul_x (p0, z0, z1)) ++ ++/* ++** mul_f64_x_tied2: ++** fmul z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f64_x_tied2, svfloat64_t, ++ z0 = svmul_f64_x (p0, z1, z0), ++ z0 = svmul_x (p0, z1, z0)) ++ ++/* ++** mul_f64_x_untied: ++** fmul z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_f64_x_untied, svfloat64_t, ++ z0 = svmul_f64_x (p0, z1, z2), ++ z0 = svmul_x (p0, z1, z2)) ++ ++/* ++** mul_d4_f64_x_tied1: ++** mov (z[0-9]+\.d), d4 ++** fmul z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_ZD (mul_d4_f64_x_tied1, svfloat64_t, double, ++ z0 = svmul_n_f64_x (p0, z0, d4), ++ z0 = svmul_x (p0, z0, d4)) ++ ++/* ++** mul_d4_f64_x_untied: ++** mov (z[0-9]+\.d), d4 ++** fmul z0\.d, (z1\.d, \1|\1, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_ZD (mul_d4_f64_x_untied, svfloat64_t, double, ++ z0 = svmul_n_f64_x (p0, z1, d4), ++ z0 = svmul_x (p0, z1, d4)) ++ ++/* ++** mul_1_f64_x_tied1: ++** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? ++** fmul z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_1_f64_x_tied1, svfloat64_t, ++ z0 = svmul_n_f64_x (p0, z0, 1), ++ z0 = svmul_x (p0, z0, 1)) ++ ++/* ++** mul_1_f64_x_untied: ++** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? 
++** fmul z0\.d, (z1\.d, \1|\1, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_1_f64_x_untied, svfloat64_t, ++ z0 = svmul_n_f64_x (p0, z1, 1), ++ z0 = svmul_x (p0, z1, 1)) ++ ++/* ++** mul_0p5_f64_x_tied1: ++** fmul z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_0p5_f64_x_tied1, svfloat64_t, ++ z0 = svmul_n_f64_x (p0, z0, 0.5), ++ z0 = svmul_x (p0, z0, 0.5)) ++ ++/* ++** mul_0p5_f64_x_untied: ++** movprfx z0, z1 ++** fmul z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_0p5_f64_x_untied, svfloat64_t, ++ z0 = svmul_n_f64_x (p0, z1, 0.5), ++ z0 = svmul_x (p0, z1, 0.5)) ++ ++/* ++** mul_2_f64_x_tied1: ++** fmul z0\.d, p0/m, z0\.d, #2\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_f64_x_tied1, svfloat64_t, ++ z0 = svmul_n_f64_x (p0, z0, 2), ++ z0 = svmul_x (p0, z0, 2)) ++ ++/* ++** mul_2_f64_x_untied: ++** movprfx z0, z1 ++** fmul z0\.d, p0/m, z0\.d, #2\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_f64_x_untied, svfloat64_t, ++ z0 = svmul_n_f64_x (p0, z1, 2), ++ z0 = svmul_x (p0, z1, 2)) ++ ++/* ++** ptrue_mul_f64_x_tied1: ++** fmul z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_f64_x_tied1, svfloat64_t, ++ z0 = svmul_f64_x (svptrue_b64 (), z0, z1), ++ z0 = svmul_x (svptrue_b64 (), z0, z1)) ++ ++/* ++** ptrue_mul_f64_x_tied2: ++** fmul z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_f64_x_tied2, svfloat64_t, ++ z0 = svmul_f64_x (svptrue_b64 (), z1, z0), ++ z0 = svmul_x (svptrue_b64 (), z1, z0)) ++ ++/* ++** ptrue_mul_f64_x_untied: ++** fmul z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_f64_x_untied, svfloat64_t, ++ z0 = svmul_f64_x (svptrue_b64 (), z1, z2), ++ z0 = svmul_x (svptrue_b64 (), z1, z2)) ++ ++/* ++** ptrue_mul_1_f64_x_tied1: ++** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? ++** fmul z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_1_f64_x_tied1, svfloat64_t, ++ z0 = svmul_n_f64_x (svptrue_b64 (), z0, 1), ++ z0 = svmul_x (svptrue_b64 (), z0, 1)) ++ ++/* ++** ptrue_mul_1_f64_x_untied: ++** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? ++** fmul z0\.d, (z1\.d, \1|\1, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_1_f64_x_untied, svfloat64_t, ++ z0 = svmul_n_f64_x (svptrue_b64 (), z1, 1), ++ z0 = svmul_x (svptrue_b64 (), z1, 1)) ++ ++/* ++** ptrue_mul_0p5_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_0p5_f64_x_tied1, svfloat64_t, ++ z0 = svmul_n_f64_x (svptrue_b64 (), z0, 0.5), ++ z0 = svmul_x (svptrue_b64 (), z0, 0.5)) ++ ++/* ++** ptrue_mul_0p5_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_0p5_f64_x_untied, svfloat64_t, ++ z0 = svmul_n_f64_x (svptrue_b64 (), z1, 0.5), ++ z0 = svmul_x (svptrue_b64 (), z1, 0.5)) ++ ++/* ++** ptrue_mul_2_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_2_f64_x_tied1, svfloat64_t, ++ z0 = svmul_n_f64_x (svptrue_b64 (), z0, 2), ++ z0 = svmul_x (svptrue_b64 (), z0, 2)) ++ ++/* ++** ptrue_mul_2_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mul_2_f64_x_untied, svfloat64_t, ++ z0 = svmul_n_f64_x (svptrue_b64 (), z1, 2), ++ z0 = svmul_x (svptrue_b64 (), z1, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_lane_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_lane_f16.c +new file mode 100644 +index 000000000..1c7503bfd +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_lane_f16.c +@@ -0,0 +1,114 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mul_lane_0_f16_tied1: ++** fmul z0\.h, z0\.h, z1\.h\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (mul_lane_0_f16_tied1, svfloat16_t, ++ z0 = svmul_lane_f16 (z0, z1, 0), ++ z0 = svmul_lane (z0, z1, 0)) ++ ++/* ++** mul_lane_0_f16_tied2: ++** fmul z0\.h, z1\.h, z0\.h\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (mul_lane_0_f16_tied2, svfloat16_t, ++ z0 = svmul_lane_f16 (z1, z0, 0), ++ z0 = svmul_lane (z1, z0, 0)) ++ ++/* ++** mul_lane_0_f16_untied: ++** fmul z0\.h, z1\.h, z2\.h\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (mul_lane_0_f16_untied, svfloat16_t, ++ z0 = svmul_lane_f16 (z1, z2, 0), ++ z0 = svmul_lane (z1, z2, 0)) ++ ++/* ++** mul_lane_1_f16: ++** fmul z0\.h, z1\.h, z2\.h\[1\] ++** ret ++*/ ++TEST_UNIFORM_Z (mul_lane_1_f16, svfloat16_t, ++ z0 = svmul_lane_f16 (z1, z2, 1), ++ z0 = svmul_lane (z1, z2, 1)) ++ ++/* ++** mul_lane_2_f16: ++** fmul z0\.h, z1\.h, z2\.h\[2\] ++** ret ++*/ ++TEST_UNIFORM_Z (mul_lane_2_f16, svfloat16_t, ++ z0 = svmul_lane_f16 (z1, z2, 2), ++ z0 = svmul_lane (z1, z2, 2)) ++ ++/* ++** mul_lane_3_f16: ++** fmul z0\.h, z1\.h, z2\.h\[3\] ++** ret ++*/ ++TEST_UNIFORM_Z (mul_lane_3_f16, svfloat16_t, ++ z0 = svmul_lane_f16 (z1, z2, 3), ++ z0 = svmul_lane (z1, z2, 3)) ++ ++/* ++** mul_lane_4_f16: ++** fmul z0\.h, z1\.h, z2\.h\[4\] ++** ret ++*/ ++TEST_UNIFORM_Z (mul_lane_4_f16, svfloat16_t, ++ z0 = svmul_lane_f16 (z1, z2, 4), ++ z0 = svmul_lane (z1, z2, 4)) ++ ++/* ++** mul_lane_5_f16: ++** fmul z0\.h, z1\.h, z2\.h\[5\] ++** ret ++*/ ++TEST_UNIFORM_Z (mul_lane_5_f16, svfloat16_t, ++ z0 = svmul_lane_f16 (z1, z2, 5), ++ z0 = svmul_lane (z1, z2, 5)) ++ ++/* ++** mul_lane_6_f16: ++** fmul z0\.h, z1\.h, z2\.h\[6\] ++** ret ++*/ ++TEST_UNIFORM_Z (mul_lane_6_f16, svfloat16_t, ++ z0 = svmul_lane_f16 (z1, z2, 6), ++ z0 = svmul_lane (z1, z2, 6)) ++ ++/* ++** mul_lane_7_f16: ++** fmul z0\.h, z1\.h, z2\.h\[7\] ++** ret ++*/ ++TEST_UNIFORM_Z (mul_lane_7_f16, svfloat16_t, ++ z0 = svmul_lane_f16 (z1, z2, 7), ++ z0 = svmul_lane (z1, z2, 7)) ++ ++/* ++** mul_lane_z7_f16: ++** fmul z0\.h, z1\.h, z7\.h\[7\] ++** ret ++*/ ++TEST_DUAL_Z (mul_lane_z7_f16, svfloat16_t, svfloat16_t, ++ z0 = svmul_lane_f16 (z1, z7, 7), ++ z0 = svmul_lane (z1, z7, 7)) ++ ++/* ++** mul_lane_z8_f16: ++** str d8, \[sp, -16\]! 
++** mov (z[0-7])\.d, z8\.d ++** fmul z0\.h, z1\.h, \1\.h\[7\] ++** ldr d8, \[sp\], 16 ++** ret ++*/ ++TEST_DUAL_LANE_REG (mul_lane_z8_f16, svfloat16_t, svfloat16_t, z8, ++ z0 = svmul_lane_f16 (z1, z8, 7), ++ z0 = svmul_lane (z1, z8, 7)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_lane_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_lane_f32.c +new file mode 100644 +index 000000000..5355e7e0b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_lane_f32.c +@@ -0,0 +1,78 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mul_lane_0_f32_tied1: ++** fmul z0\.s, z0\.s, z1\.s\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (mul_lane_0_f32_tied1, svfloat32_t, ++ z0 = svmul_lane_f32 (z0, z1, 0), ++ z0 = svmul_lane (z0, z1, 0)) ++ ++/* ++** mul_lane_0_f32_tied2: ++** fmul z0\.s, z1\.s, z0\.s\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (mul_lane_0_f32_tied2, svfloat32_t, ++ z0 = svmul_lane_f32 (z1, z0, 0), ++ z0 = svmul_lane (z1, z0, 0)) ++ ++/* ++** mul_lane_0_f32_untied: ++** fmul z0\.s, z1\.s, z2\.s\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (mul_lane_0_f32_untied, svfloat32_t, ++ z0 = svmul_lane_f32 (z1, z2, 0), ++ z0 = svmul_lane (z1, z2, 0)) ++ ++/* ++** mul_lane_1_f32: ++** fmul z0\.s, z1\.s, z2\.s\[1\] ++** ret ++*/ ++TEST_UNIFORM_Z (mul_lane_1_f32, svfloat32_t, ++ z0 = svmul_lane_f32 (z1, z2, 1), ++ z0 = svmul_lane (z1, z2, 1)) ++ ++/* ++** mul_lane_2_f32: ++** fmul z0\.s, z1\.s, z2\.s\[2\] ++** ret ++*/ ++TEST_UNIFORM_Z (mul_lane_2_f32, svfloat32_t, ++ z0 = svmul_lane_f32 (z1, z2, 2), ++ z0 = svmul_lane (z1, z2, 2)) ++ ++/* ++** mul_lane_3_f32: ++** fmul z0\.s, z1\.s, z2\.s\[3\] ++** ret ++*/ ++TEST_UNIFORM_Z (mul_lane_3_f32, svfloat32_t, ++ z0 = svmul_lane_f32 (z1, z2, 3), ++ z0 = svmul_lane (z1, z2, 3)) ++ ++/* ++** mul_lane_z7_f32: ++** fmul z0\.s, z1\.s, z7\.s\[3\] ++** ret ++*/ ++TEST_DUAL_Z (mul_lane_z7_f32, svfloat32_t, svfloat32_t, ++ z0 = svmul_lane_f32 (z1, z7, 3), ++ z0 = svmul_lane (z1, z7, 3)) ++ ++/* ++** mul_lane_z8_f32: ++** str d8, \[sp, -16\]! 
++** mov (z[0-7])\.d, z8\.d ++** fmul z0\.s, z1\.s, \1\.s\[3\] ++** ldr d8, \[sp\], 16 ++** ret ++*/ ++TEST_DUAL_LANE_REG (mul_lane_z8_f32, svfloat32_t, svfloat32_t, z8, ++ z0 = svmul_lane_f32 (z1, z8, 3), ++ z0 = svmul_lane (z1, z8, 3)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_lane_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_lane_f64.c +new file mode 100644 +index 000000000..a53a013c5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_lane_f64.c +@@ -0,0 +1,69 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mul_lane_0_f64_tied1: ++** fmul z0\.d, z0\.d, z1\.d\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (mul_lane_0_f64_tied1, svfloat64_t, ++ z0 = svmul_lane_f64 (z0, z1, 0), ++ z0 = svmul_lane (z0, z1, 0)) ++ ++/* ++** mul_lane_0_f64_tied2: ++** fmul z0\.d, z1\.d, z0\.d\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (mul_lane_0_f64_tied2, svfloat64_t, ++ z0 = svmul_lane_f64 (z1, z0, 0), ++ z0 = svmul_lane (z1, z0, 0)) ++ ++/* ++** mul_lane_0_f64_untied: ++** fmul z0\.d, z1\.d, z2\.d\[0\] ++** ret ++*/ ++TEST_UNIFORM_Z (mul_lane_0_f64_untied, svfloat64_t, ++ z0 = svmul_lane_f64 (z1, z2, 0), ++ z0 = svmul_lane (z1, z2, 0)) ++ ++/* ++** mul_lane_1_f64: ++** fmul z0\.d, z1\.d, z2\.d\[1\] ++** ret ++*/ ++TEST_UNIFORM_Z (mul_lane_1_f64, svfloat64_t, ++ z0 = svmul_lane_f64 (z1, z2, 1), ++ z0 = svmul_lane (z1, z2, 1)) ++ ++/* ++** mul_lane_z7_f64: ++** fmul z0\.d, z1\.d, z7\.d\[1\] ++** ret ++*/ ++TEST_DUAL_Z (mul_lane_z7_f64, svfloat64_t, svfloat64_t, ++ z0 = svmul_lane_f64 (z1, z7, 1), ++ z0 = svmul_lane (z1, z7, 1)) ++ ++/* ++** mul_lane_z15_f64: ++** str d15, \[sp, -16\]! ++** fmul z0\.d, z1\.d, z15\.d\[1\] ++** ldr d15, \[sp\], 16 ++** ret ++*/ ++TEST_DUAL_LANE_REG (mul_lane_z15_f64, svfloat64_t, svfloat64_t, z15, ++ z0 = svmul_lane_f64 (z1, z15, 1), ++ z0 = svmul_lane (z1, z15, 1)) ++ ++/* ++** mul_lane_z16_f64: ++** mov (z[0-9]|z1[0-5])\.d, z16\.d ++** fmul z0\.d, z1\.d, \1\.d\[1\] ++** ret ++*/ ++TEST_DUAL_LANE_REG (mul_lane_z16_f64, svfloat64_t, svfloat64_t, z16, ++ z0 = svmul_lane_f64 (z1, z16, 1), ++ z0 = svmul_lane (z1, z16, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s16.c +new file mode 100644 +index 000000000..aa08bc274 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s16.c +@@ -0,0 +1,302 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mul_s16_m_tied1: ++** mul z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mul_s16_m_tied1, svint16_t, ++ z0 = svmul_s16_m (p0, z0, z1), ++ z0 = svmul_m (p0, z0, z1)) ++ ++/* ++** mul_s16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** mul z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mul_s16_m_tied2, svint16_t, ++ z0 = svmul_s16_m (p0, z1, z0), ++ z0 = svmul_m (p0, z1, z0)) ++ ++/* ++** mul_s16_m_untied: ++** movprfx z0, z1 ++** mul z0\.h, p0/m, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mul_s16_m_untied, svint16_t, ++ z0 = svmul_s16_m (p0, z1, z2), ++ z0 = svmul_m (p0, z1, z2)) ++ ++/* ++** mul_w0_s16_m_tied1: ++** mov (z[0-9]+\.h), w0 ++** mul z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mul_w0_s16_m_tied1, svint16_t, int16_t, ++ z0 = svmul_n_s16_m (p0, z0, x0), ++ z0 = svmul_m (p0, z0, x0)) ++ ++/* ++** mul_w0_s16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), w0 ++** movprfx z0, z1 ++** mul z0\.h, p0/m, z0\.h, \1 ++** 
ret ++*/ ++TEST_UNIFORM_ZX (mul_w0_s16_m_untied, svint16_t, int16_t, ++ z0 = svmul_n_s16_m (p0, z1, x0), ++ z0 = svmul_m (p0, z1, x0)) ++ ++/* ++** mul_2_s16_m_tied1: ++** mov (z[0-9]+\.h), #2 ++** mul z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_s16_m_tied1, svint16_t, ++ z0 = svmul_n_s16_m (p0, z0, 2), ++ z0 = svmul_m (p0, z0, 2)) ++ ++/* ++** mul_2_s16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), #2 ++** movprfx z0, z1 ++** mul z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_s16_m_untied, svint16_t, ++ z0 = svmul_n_s16_m (p0, z1, 2), ++ z0 = svmul_m (p0, z1, 2)) ++ ++/* ++** mul_m1_s16_m: ++** mov (z[0-9]+)\.b, #-1 ++** mul z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mul_m1_s16_m, svint16_t, ++ z0 = svmul_n_s16_m (p0, z0, -1), ++ z0 = svmul_m (p0, z0, -1)) ++ ++/* ++** mul_s16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** mul z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mul_s16_z_tied1, svint16_t, ++ z0 = svmul_s16_z (p0, z0, z1), ++ z0 = svmul_z (p0, z0, z1)) ++ ++/* ++** mul_s16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** mul z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mul_s16_z_tied2, svint16_t, ++ z0 = svmul_s16_z (p0, z1, z0), ++ z0 = svmul_z (p0, z1, z0)) ++ ++/* ++** mul_s16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** mul z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** mul z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_s16_z_untied, svint16_t, ++ z0 = svmul_s16_z (p0, z1, z2), ++ z0 = svmul_z (p0, z1, z2)) ++ ++/* ++** mul_w0_s16_z_tied1: ++** mov (z[0-9]+\.h), w0 ++** movprfx z0\.h, p0/z, z0\.h ++** mul z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mul_w0_s16_z_tied1, svint16_t, int16_t, ++ z0 = svmul_n_s16_z (p0, z0, x0), ++ z0 = svmul_z (p0, z0, x0)) ++ ++/* ++** mul_w0_s16_z_untied: ++** mov (z[0-9]+\.h), w0 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** mul z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** mul z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (mul_w0_s16_z_untied, svint16_t, int16_t, ++ z0 = svmul_n_s16_z (p0, z1, x0), ++ z0 = svmul_z (p0, z1, x0)) ++ ++/* ++** mul_2_s16_z_tied1: ++** mov (z[0-9]+\.h), #2 ++** movprfx z0\.h, p0/z, z0\.h ++** mul z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_s16_z_tied1, svint16_t, ++ z0 = svmul_n_s16_z (p0, z0, 2), ++ z0 = svmul_z (p0, z0, 2)) ++ ++/* ++** mul_2_s16_z_untied: ++** mov (z[0-9]+\.h), #2 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** mul z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** mul z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_s16_z_untied, svint16_t, ++ z0 = svmul_n_s16_z (p0, z1, 2), ++ z0 = svmul_z (p0, z1, 2)) ++ ++/* ++** mul_s16_x_tied1: ++** mul z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mul_s16_x_tied1, svint16_t, ++ z0 = svmul_s16_x (p0, z0, z1), ++ z0 = svmul_x (p0, z0, z1)) ++ ++/* ++** mul_s16_x_tied2: ++** mul z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mul_s16_x_tied2, svint16_t, ++ z0 = svmul_s16_x (p0, z1, z0), ++ z0 = svmul_x (p0, z1, z0)) ++ ++/* ++** mul_s16_x_untied: ++** ( ++** movprfx z0, z1 ++** mul z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0, z2 ++** mul z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_s16_x_untied, svint16_t, ++ z0 = svmul_s16_x (p0, z1, z2), ++ z0 = svmul_x (p0, z1, z2)) ++ ++/* ++** mul_w0_s16_x_tied1: ++** mov (z[0-9]+\.h), w0 ++** mul z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX 
(mul_w0_s16_x_tied1, svint16_t, int16_t, ++ z0 = svmul_n_s16_x (p0, z0, x0), ++ z0 = svmul_x (p0, z0, x0)) ++ ++/* ++** mul_w0_s16_x_untied: ++** mov z0\.h, w0 ++** mul z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZX (mul_w0_s16_x_untied, svint16_t, int16_t, ++ z0 = svmul_n_s16_x (p0, z1, x0), ++ z0 = svmul_x (p0, z1, x0)) ++ ++/* ++** mul_2_s16_x_tied1: ++** mul z0\.h, z0\.h, #2 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_s16_x_tied1, svint16_t, ++ z0 = svmul_n_s16_x (p0, z0, 2), ++ z0 = svmul_x (p0, z0, 2)) ++ ++/* ++** mul_2_s16_x_untied: ++** movprfx z0, z1 ++** mul z0\.h, z0\.h, #2 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_s16_x_untied, svint16_t, ++ z0 = svmul_n_s16_x (p0, z1, 2), ++ z0 = svmul_x (p0, z1, 2)) ++ ++/* ++** mul_127_s16_x: ++** mul z0\.h, z0\.h, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_127_s16_x, svint16_t, ++ z0 = svmul_n_s16_x (p0, z0, 127), ++ z0 = svmul_x (p0, z0, 127)) ++ ++/* ++** mul_128_s16_x: ++** mov (z[0-9]+\.h), #128 ++** mul z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_128_s16_x, svint16_t, ++ z0 = svmul_n_s16_x (p0, z0, 128), ++ z0 = svmul_x (p0, z0, 128)) ++ ++/* ++** mul_255_s16_x: ++** mov (z[0-9]+\.h), #255 ++** mul z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_255_s16_x, svint16_t, ++ z0 = svmul_n_s16_x (p0, z0, 255), ++ z0 = svmul_x (p0, z0, 255)) ++ ++/* ++** mul_m1_s16_x: ++** mul z0\.h, z0\.h, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_m1_s16_x, svint16_t, ++ z0 = svmul_n_s16_x (p0, z0, -1), ++ z0 = svmul_x (p0, z0, -1)) ++ ++/* ++** mul_m127_s16_x: ++** mul z0\.h, z0\.h, #-127 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_m127_s16_x, svint16_t, ++ z0 = svmul_n_s16_x (p0, z0, -127), ++ z0 = svmul_x (p0, z0, -127)) ++ ++/* ++** mul_m128_s16_x: ++** mul z0\.h, z0\.h, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_m128_s16_x, svint16_t, ++ z0 = svmul_n_s16_x (p0, z0, -128), ++ z0 = svmul_x (p0, z0, -128)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s32.c +new file mode 100644 +index 000000000..7acf77fdb +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s32.c +@@ -0,0 +1,302 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mul_s32_m_tied1: ++** mul z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mul_s32_m_tied1, svint32_t, ++ z0 = svmul_s32_m (p0, z0, z1), ++ z0 = svmul_m (p0, z0, z1)) ++ ++/* ++** mul_s32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** mul z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mul_s32_m_tied2, svint32_t, ++ z0 = svmul_s32_m (p0, z1, z0), ++ z0 = svmul_m (p0, z1, z0)) ++ ++/* ++** mul_s32_m_untied: ++** movprfx z0, z1 ++** mul z0\.s, p0/m, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mul_s32_m_untied, svint32_t, ++ z0 = svmul_s32_m (p0, z1, z2), ++ z0 = svmul_m (p0, z1, z2)) ++ ++/* ++** mul_w0_s32_m_tied1: ++** mov (z[0-9]+\.s), w0 ++** mul z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mul_w0_s32_m_tied1, svint32_t, int32_t, ++ z0 = svmul_n_s32_m (p0, z0, x0), ++ z0 = svmul_m (p0, z0, x0)) ++ ++/* ++** mul_w0_s32_m_untied: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0, z1 ++** mul z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mul_w0_s32_m_untied, svint32_t, int32_t, ++ z0 = svmul_n_s32_m (p0, z1, x0), ++ z0 = svmul_m (p0, z1, x0)) ++ ++/* ++** mul_2_s32_m_tied1: ++** mov (z[0-9]+\.s), #2 ++** mul z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_s32_m_tied1, svint32_t, ++ z0 = 
svmul_n_s32_m (p0, z0, 2), ++ z0 = svmul_m (p0, z0, 2)) ++ ++/* ++** mul_2_s32_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.s), #2 ++** movprfx z0, z1 ++** mul z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_s32_m_untied, svint32_t, ++ z0 = svmul_n_s32_m (p0, z1, 2), ++ z0 = svmul_m (p0, z1, 2)) ++ ++/* ++** mul_m1_s32_m: ++** mov (z[0-9]+)\.b, #-1 ++** mul z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mul_m1_s32_m, svint32_t, ++ z0 = svmul_n_s32_m (p0, z0, -1), ++ z0 = svmul_m (p0, z0, -1)) ++ ++/* ++** mul_s32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** mul z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mul_s32_z_tied1, svint32_t, ++ z0 = svmul_s32_z (p0, z0, z1), ++ z0 = svmul_z (p0, z0, z1)) ++ ++/* ++** mul_s32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** mul z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mul_s32_z_tied2, svint32_t, ++ z0 = svmul_s32_z (p0, z1, z0), ++ z0 = svmul_z (p0, z1, z0)) ++ ++/* ++** mul_s32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** mul z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** mul z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_s32_z_untied, svint32_t, ++ z0 = svmul_s32_z (p0, z1, z2), ++ z0 = svmul_z (p0, z1, z2)) ++ ++/* ++** mul_w0_s32_z_tied1: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0\.s, p0/z, z0\.s ++** mul z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mul_w0_s32_z_tied1, svint32_t, int32_t, ++ z0 = svmul_n_s32_z (p0, z0, x0), ++ z0 = svmul_z (p0, z0, x0)) ++ ++/* ++** mul_w0_s32_z_untied: ++** mov (z[0-9]+\.s), w0 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** mul z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** mul z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (mul_w0_s32_z_untied, svint32_t, int32_t, ++ z0 = svmul_n_s32_z (p0, z1, x0), ++ z0 = svmul_z (p0, z1, x0)) ++ ++/* ++** mul_2_s32_z_tied1: ++** mov (z[0-9]+\.s), #2 ++** movprfx z0\.s, p0/z, z0\.s ++** mul z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_s32_z_tied1, svint32_t, ++ z0 = svmul_n_s32_z (p0, z0, 2), ++ z0 = svmul_z (p0, z0, 2)) ++ ++/* ++** mul_2_s32_z_untied: ++** mov (z[0-9]+\.s), #2 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** mul z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** mul z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_s32_z_untied, svint32_t, ++ z0 = svmul_n_s32_z (p0, z1, 2), ++ z0 = svmul_z (p0, z1, 2)) ++ ++/* ++** mul_s32_x_tied1: ++** mul z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mul_s32_x_tied1, svint32_t, ++ z0 = svmul_s32_x (p0, z0, z1), ++ z0 = svmul_x (p0, z0, z1)) ++ ++/* ++** mul_s32_x_tied2: ++** mul z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mul_s32_x_tied2, svint32_t, ++ z0 = svmul_s32_x (p0, z1, z0), ++ z0 = svmul_x (p0, z1, z0)) ++ ++/* ++** mul_s32_x_untied: ++** ( ++** movprfx z0, z1 ++** mul z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0, z2 ++** mul z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_s32_x_untied, svint32_t, ++ z0 = svmul_s32_x (p0, z1, z2), ++ z0 = svmul_x (p0, z1, z2)) ++ ++/* ++** mul_w0_s32_x_tied1: ++** mov (z[0-9]+\.s), w0 ++** mul z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mul_w0_s32_x_tied1, svint32_t, int32_t, ++ z0 = svmul_n_s32_x (p0, z0, x0), ++ z0 = svmul_x (p0, z0, x0)) ++ ++/* ++** mul_w0_s32_x_untied: ++** mov z0\.s, w0 ++** mul z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZX (mul_w0_s32_x_untied, svint32_t, int32_t, ++ z0 = svmul_n_s32_x (p0, 
z1, x0), ++ z0 = svmul_x (p0, z1, x0)) ++ ++/* ++** mul_2_s32_x_tied1: ++** mul z0\.s, z0\.s, #2 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_s32_x_tied1, svint32_t, ++ z0 = svmul_n_s32_x (p0, z0, 2), ++ z0 = svmul_x (p0, z0, 2)) ++ ++/* ++** mul_2_s32_x_untied: ++** movprfx z0, z1 ++** mul z0\.s, z0\.s, #2 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_s32_x_untied, svint32_t, ++ z0 = svmul_n_s32_x (p0, z1, 2), ++ z0 = svmul_x (p0, z1, 2)) ++ ++/* ++** mul_127_s32_x: ++** mul z0\.s, z0\.s, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_127_s32_x, svint32_t, ++ z0 = svmul_n_s32_x (p0, z0, 127), ++ z0 = svmul_x (p0, z0, 127)) ++ ++/* ++** mul_128_s32_x: ++** mov (z[0-9]+\.s), #128 ++** mul z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_128_s32_x, svint32_t, ++ z0 = svmul_n_s32_x (p0, z0, 128), ++ z0 = svmul_x (p0, z0, 128)) ++ ++/* ++** mul_255_s32_x: ++** mov (z[0-9]+\.s), #255 ++** mul z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_255_s32_x, svint32_t, ++ z0 = svmul_n_s32_x (p0, z0, 255), ++ z0 = svmul_x (p0, z0, 255)) ++ ++/* ++** mul_m1_s32_x: ++** mul z0\.s, z0\.s, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_m1_s32_x, svint32_t, ++ z0 = svmul_n_s32_x (p0, z0, -1), ++ z0 = svmul_x (p0, z0, -1)) ++ ++/* ++** mul_m127_s32_x: ++** mul z0\.s, z0\.s, #-127 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_m127_s32_x, svint32_t, ++ z0 = svmul_n_s32_x (p0, z0, -127), ++ z0 = svmul_x (p0, z0, -127)) ++ ++/* ++** mul_m128_s32_x: ++** mul z0\.s, z0\.s, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_m128_s32_x, svint32_t, ++ z0 = svmul_n_s32_x (p0, z0, -128), ++ z0 = svmul_x (p0, z0, -128)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s64.c +new file mode 100644 +index 000000000..549105f1e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s64.c +@@ -0,0 +1,302 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mul_s64_m_tied1: ++** mul z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mul_s64_m_tied1, svint64_t, ++ z0 = svmul_s64_m (p0, z0, z1), ++ z0 = svmul_m (p0, z0, z1)) ++ ++/* ++** mul_s64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** mul z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_s64_m_tied2, svint64_t, ++ z0 = svmul_s64_m (p0, z1, z0), ++ z0 = svmul_m (p0, z1, z0)) ++ ++/* ++** mul_s64_m_untied: ++** movprfx z0, z1 ++** mul z0\.d, p0/m, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mul_s64_m_untied, svint64_t, ++ z0 = svmul_s64_m (p0, z1, z2), ++ z0 = svmul_m (p0, z1, z2)) ++ ++/* ++** mul_x0_s64_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** mul z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mul_x0_s64_m_tied1, svint64_t, int64_t, ++ z0 = svmul_n_s64_m (p0, z0, x0), ++ z0 = svmul_m (p0, z0, x0)) ++ ++/* ++** mul_x0_s64_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** mul z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mul_x0_s64_m_untied, svint64_t, int64_t, ++ z0 = svmul_n_s64_m (p0, z1, x0), ++ z0 = svmul_m (p0, z1, x0)) ++ ++/* ++** mul_2_s64_m_tied1: ++** mov (z[0-9]+\.d), #2 ++** mul z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_s64_m_tied1, svint64_t, ++ z0 = svmul_n_s64_m (p0, z0, 2), ++ z0 = svmul_m (p0, z0, 2)) ++ ++/* ++** mul_2_s64_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #2 ++** movprfx z0, z1 ++** mul z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_s64_m_untied, svint64_t, ++ z0 = svmul_n_s64_m (p0, z1, 2), ++ z0 = svmul_m (p0, z1, 2)) 
++ ++/* ++** mul_m1_s64_m: ++** mov (z[0-9]+)\.b, #-1 ++** mul z0\.d, p0/m, z0\.d, \1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mul_m1_s64_m, svint64_t, ++ z0 = svmul_n_s64_m (p0, z0, -1), ++ z0 = svmul_m (p0, z0, -1)) ++ ++/* ++** mul_s64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** mul z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mul_s64_z_tied1, svint64_t, ++ z0 = svmul_s64_z (p0, z0, z1), ++ z0 = svmul_z (p0, z0, z1)) ++ ++/* ++** mul_s64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** mul z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mul_s64_z_tied2, svint64_t, ++ z0 = svmul_s64_z (p0, z1, z0), ++ z0 = svmul_z (p0, z1, z0)) ++ ++/* ++** mul_s64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** mul z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** mul z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_s64_z_untied, svint64_t, ++ z0 = svmul_s64_z (p0, z1, z2), ++ z0 = svmul_z (p0, z1, z2)) ++ ++/* ++** mul_x0_s64_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z0\.d ++** mul z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mul_x0_s64_z_tied1, svint64_t, int64_t, ++ z0 = svmul_n_s64_z (p0, z0, x0), ++ z0 = svmul_z (p0, z0, x0)) ++ ++/* ++** mul_x0_s64_z_untied: ++** mov (z[0-9]+\.d), x0 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** mul z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** mul z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (mul_x0_s64_z_untied, svint64_t, int64_t, ++ z0 = svmul_n_s64_z (p0, z1, x0), ++ z0 = svmul_z (p0, z1, x0)) ++ ++/* ++** mul_2_s64_z_tied1: ++** mov (z[0-9]+\.d), #2 ++** movprfx z0\.d, p0/z, z0\.d ++** mul z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_s64_z_tied1, svint64_t, ++ z0 = svmul_n_s64_z (p0, z0, 2), ++ z0 = svmul_z (p0, z0, 2)) ++ ++/* ++** mul_2_s64_z_untied: ++** mov (z[0-9]+\.d), #2 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** mul z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** mul z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_s64_z_untied, svint64_t, ++ z0 = svmul_n_s64_z (p0, z1, 2), ++ z0 = svmul_z (p0, z1, 2)) ++ ++/* ++** mul_s64_x_tied1: ++** mul z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mul_s64_x_tied1, svint64_t, ++ z0 = svmul_s64_x (p0, z0, z1), ++ z0 = svmul_x (p0, z0, z1)) ++ ++/* ++** mul_s64_x_tied2: ++** mul z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mul_s64_x_tied2, svint64_t, ++ z0 = svmul_s64_x (p0, z1, z0), ++ z0 = svmul_x (p0, z1, z0)) ++ ++/* ++** mul_s64_x_untied: ++** ( ++** movprfx z0, z1 ++** mul z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0, z2 ++** mul z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_s64_x_untied, svint64_t, ++ z0 = svmul_s64_x (p0, z1, z2), ++ z0 = svmul_x (p0, z1, z2)) ++ ++/* ++** mul_x0_s64_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** mul z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mul_x0_s64_x_tied1, svint64_t, int64_t, ++ z0 = svmul_n_s64_x (p0, z0, x0), ++ z0 = svmul_x (p0, z0, x0)) ++ ++/* ++** mul_x0_s64_x_untied: ++** mov z0\.d, x0 ++** mul z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (mul_x0_s64_x_untied, svint64_t, int64_t, ++ z0 = svmul_n_s64_x (p0, z1, x0), ++ z0 = svmul_x (p0, z1, x0)) ++ ++/* ++** mul_2_s64_x_tied1: ++** mul z0\.d, z0\.d, #2 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_s64_x_tied1, svint64_t, ++ z0 = svmul_n_s64_x (p0, z0, 2), ++ z0 = svmul_x (p0, z0, 2)) ++ ++/* ++** mul_2_s64_x_untied: ++** movprfx z0, z1 ++** mul z0\.d, z0\.d, #2 ++** ret 
++*/ ++TEST_UNIFORM_Z (mul_2_s64_x_untied, svint64_t, ++ z0 = svmul_n_s64_x (p0, z1, 2), ++ z0 = svmul_x (p0, z1, 2)) ++ ++/* ++** mul_127_s64_x: ++** mul z0\.d, z0\.d, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_127_s64_x, svint64_t, ++ z0 = svmul_n_s64_x (p0, z0, 127), ++ z0 = svmul_x (p0, z0, 127)) ++ ++/* ++** mul_128_s64_x: ++** mov (z[0-9]+\.d), #128 ++** mul z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_128_s64_x, svint64_t, ++ z0 = svmul_n_s64_x (p0, z0, 128), ++ z0 = svmul_x (p0, z0, 128)) ++ ++/* ++** mul_255_s64_x: ++** mov (z[0-9]+\.d), #255 ++** mul z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_255_s64_x, svint64_t, ++ z0 = svmul_n_s64_x (p0, z0, 255), ++ z0 = svmul_x (p0, z0, 255)) ++ ++/* ++** mul_m1_s64_x: ++** mul z0\.d, z0\.d, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_m1_s64_x, svint64_t, ++ z0 = svmul_n_s64_x (p0, z0, -1), ++ z0 = svmul_x (p0, z0, -1)) ++ ++/* ++** mul_m127_s64_x: ++** mul z0\.d, z0\.d, #-127 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_m127_s64_x, svint64_t, ++ z0 = svmul_n_s64_x (p0, z0, -127), ++ z0 = svmul_x (p0, z0, -127)) ++ ++/* ++** mul_m128_s64_x: ++** mul z0\.d, z0\.d, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_m128_s64_x, svint64_t, ++ z0 = svmul_n_s64_x (p0, z0, -128), ++ z0 = svmul_x (p0, z0, -128)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s8.c +new file mode 100644 +index 000000000..012e6f250 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s8.c +@@ -0,0 +1,300 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mul_s8_m_tied1: ++** mul z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mul_s8_m_tied1, svint8_t, ++ z0 = svmul_s8_m (p0, z0, z1), ++ z0 = svmul_m (p0, z0, z1)) ++ ++/* ++** mul_s8_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** mul z0\.b, p0/m, z0\.b, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mul_s8_m_tied2, svint8_t, ++ z0 = svmul_s8_m (p0, z1, z0), ++ z0 = svmul_m (p0, z1, z0)) ++ ++/* ++** mul_s8_m_untied: ++** movprfx z0, z1 ++** mul z0\.b, p0/m, z0\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mul_s8_m_untied, svint8_t, ++ z0 = svmul_s8_m (p0, z1, z2), ++ z0 = svmul_m (p0, z1, z2)) ++ ++/* ++** mul_w0_s8_m_tied1: ++** mov (z[0-9]+\.b), w0 ++** mul z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mul_w0_s8_m_tied1, svint8_t, int8_t, ++ z0 = svmul_n_s8_m (p0, z0, x0), ++ z0 = svmul_m (p0, z0, x0)) ++ ++/* ++** mul_w0_s8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), w0 ++** movprfx z0, z1 ++** mul z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mul_w0_s8_m_untied, svint8_t, int8_t, ++ z0 = svmul_n_s8_m (p0, z1, x0), ++ z0 = svmul_m (p0, z1, x0)) ++ ++/* ++** mul_2_s8_m_tied1: ++** mov (z[0-9]+\.b), #2 ++** mul z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_s8_m_tied1, svint8_t, ++ z0 = svmul_n_s8_m (p0, z0, 2), ++ z0 = svmul_m (p0, z0, 2)) ++ ++/* ++** mul_2_s8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), #2 ++** movprfx z0, z1 ++** mul z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_s8_m_untied, svint8_t, ++ z0 = svmul_n_s8_m (p0, z1, 2), ++ z0 = svmul_m (p0, z1, 2)) ++ ++/* ++** mul_m1_s8_m: ++** mov (z[0-9]+\.b), #-1 ++** mul z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_m1_s8_m, svint8_t, ++ z0 = svmul_n_s8_m (p0, z0, -1), ++ z0 = svmul_m (p0, z0, -1)) ++ ++/* ++** mul_s8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** mul z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ 
++TEST_UNIFORM_Z (mul_s8_z_tied1, svint8_t, ++ z0 = svmul_s8_z (p0, z0, z1), ++ z0 = svmul_z (p0, z0, z1)) ++ ++/* ++** mul_s8_z_tied2: ++** movprfx z0\.b, p0/z, z0\.b ++** mul z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mul_s8_z_tied2, svint8_t, ++ z0 = svmul_s8_z (p0, z1, z0), ++ z0 = svmul_z (p0, z1, z0)) ++ ++/* ++** mul_s8_z_untied: ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** mul z0\.b, p0/m, z0\.b, z2\.b ++** | ++** movprfx z0\.b, p0/z, z2\.b ++** mul z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_s8_z_untied, svint8_t, ++ z0 = svmul_s8_z (p0, z1, z2), ++ z0 = svmul_z (p0, z1, z2)) ++ ++/* ++** mul_w0_s8_z_tied1: ++** mov (z[0-9]+\.b), w0 ++** movprfx z0\.b, p0/z, z0\.b ++** mul z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mul_w0_s8_z_tied1, svint8_t, int8_t, ++ z0 = svmul_n_s8_z (p0, z0, x0), ++ z0 = svmul_z (p0, z0, x0)) ++ ++/* ++** mul_w0_s8_z_untied: ++** mov (z[0-9]+\.b), w0 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** mul z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** mul z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (mul_w0_s8_z_untied, svint8_t, int8_t, ++ z0 = svmul_n_s8_z (p0, z1, x0), ++ z0 = svmul_z (p0, z1, x0)) ++ ++/* ++** mul_2_s8_z_tied1: ++** mov (z[0-9]+\.b), #2 ++** movprfx z0\.b, p0/z, z0\.b ++** mul z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_s8_z_tied1, svint8_t, ++ z0 = svmul_n_s8_z (p0, z0, 2), ++ z0 = svmul_z (p0, z0, 2)) ++ ++/* ++** mul_2_s8_z_untied: ++** mov (z[0-9]+\.b), #2 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** mul z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** mul z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_s8_z_untied, svint8_t, ++ z0 = svmul_n_s8_z (p0, z1, 2), ++ z0 = svmul_z (p0, z1, 2)) ++ ++/* ++** mul_s8_x_tied1: ++** mul z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mul_s8_x_tied1, svint8_t, ++ z0 = svmul_s8_x (p0, z0, z1), ++ z0 = svmul_x (p0, z0, z1)) ++ ++/* ++** mul_s8_x_tied2: ++** mul z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mul_s8_x_tied2, svint8_t, ++ z0 = svmul_s8_x (p0, z1, z0), ++ z0 = svmul_x (p0, z1, z0)) ++ ++/* ++** mul_s8_x_untied: ++** ( ++** movprfx z0, z1 ++** mul z0\.b, p0/m, z0\.b, z2\.b ++** | ++** movprfx z0, z2 ++** mul z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_s8_x_untied, svint8_t, ++ z0 = svmul_s8_x (p0, z1, z2), ++ z0 = svmul_x (p0, z1, z2)) ++ ++/* ++** mul_w0_s8_x_tied1: ++** mov (z[0-9]+\.b), w0 ++** mul z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mul_w0_s8_x_tied1, svint8_t, int8_t, ++ z0 = svmul_n_s8_x (p0, z0, x0), ++ z0 = svmul_x (p0, z0, x0)) ++ ++/* ++** mul_w0_s8_x_untied: ++** mov z0\.b, w0 ++** mul z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_ZX (mul_w0_s8_x_untied, svint8_t, int8_t, ++ z0 = svmul_n_s8_x (p0, z1, x0), ++ z0 = svmul_x (p0, z1, x0)) ++ ++/* ++** mul_2_s8_x_tied1: ++** mul z0\.b, z0\.b, #2 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_s8_x_tied1, svint8_t, ++ z0 = svmul_n_s8_x (p0, z0, 2), ++ z0 = svmul_x (p0, z0, 2)) ++ ++/* ++** mul_2_s8_x_untied: ++** movprfx z0, z1 ++** mul z0\.b, z0\.b, #2 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_s8_x_untied, svint8_t, ++ z0 = svmul_n_s8_x (p0, z1, 2), ++ z0 = svmul_x (p0, z1, 2)) ++ ++/* ++** mul_127_s8_x: ++** mul z0\.b, z0\.b, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_127_s8_x, svint8_t, ++ z0 = svmul_n_s8_x (p0, z0, 127), ++ z0 = svmul_x (p0, z0, 127)) ++ ++/* ++** mul_128_s8_x: ++** mul z0\.b, z0\.b, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z 
(mul_128_s8_x, svint8_t, ++ z0 = svmul_n_s8_x (p0, z0, 128), ++ z0 = svmul_x (p0, z0, 128)) ++ ++/* ++** mul_255_s8_x: ++** mul z0\.b, z0\.b, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_255_s8_x, svint8_t, ++ z0 = svmul_n_s8_x (p0, z0, 255), ++ z0 = svmul_x (p0, z0, 255)) ++ ++/* ++** mul_m1_s8_x: ++** mul z0\.b, z0\.b, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_m1_s8_x, svint8_t, ++ z0 = svmul_n_s8_x (p0, z0, -1), ++ z0 = svmul_x (p0, z0, -1)) ++ ++/* ++** mul_m127_s8_x: ++** mul z0\.b, z0\.b, #-127 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_m127_s8_x, svint8_t, ++ z0 = svmul_n_s8_x (p0, z0, -127), ++ z0 = svmul_x (p0, z0, -127)) ++ ++/* ++** mul_m128_s8_x: ++** mul z0\.b, z0\.b, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_m128_s8_x, svint8_t, ++ z0 = svmul_n_s8_x (p0, z0, -128), ++ z0 = svmul_x (p0, z0, -128)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u16.c +new file mode 100644 +index 000000000..300987eb6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u16.c +@@ -0,0 +1,302 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mul_u16_m_tied1: ++** mul z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mul_u16_m_tied1, svuint16_t, ++ z0 = svmul_u16_m (p0, z0, z1), ++ z0 = svmul_m (p0, z0, z1)) ++ ++/* ++** mul_u16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** mul z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mul_u16_m_tied2, svuint16_t, ++ z0 = svmul_u16_m (p0, z1, z0), ++ z0 = svmul_m (p0, z1, z0)) ++ ++/* ++** mul_u16_m_untied: ++** movprfx z0, z1 ++** mul z0\.h, p0/m, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mul_u16_m_untied, svuint16_t, ++ z0 = svmul_u16_m (p0, z1, z2), ++ z0 = svmul_m (p0, z1, z2)) ++ ++/* ++** mul_w0_u16_m_tied1: ++** mov (z[0-9]+\.h), w0 ++** mul z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mul_w0_u16_m_tied1, svuint16_t, uint16_t, ++ z0 = svmul_n_u16_m (p0, z0, x0), ++ z0 = svmul_m (p0, z0, x0)) ++ ++/* ++** mul_w0_u16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), w0 ++** movprfx z0, z1 ++** mul z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mul_w0_u16_m_untied, svuint16_t, uint16_t, ++ z0 = svmul_n_u16_m (p0, z1, x0), ++ z0 = svmul_m (p0, z1, x0)) ++ ++/* ++** mul_2_u16_m_tied1: ++** mov (z[0-9]+\.h), #2 ++** mul z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_u16_m_tied1, svuint16_t, ++ z0 = svmul_n_u16_m (p0, z0, 2), ++ z0 = svmul_m (p0, z0, 2)) ++ ++/* ++** mul_2_u16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), #2 ++** movprfx z0, z1 ++** mul z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_u16_m_untied, svuint16_t, ++ z0 = svmul_n_u16_m (p0, z1, 2), ++ z0 = svmul_m (p0, z1, 2)) ++ ++/* ++** mul_m1_u16_m: ++** mov (z[0-9]+)\.b, #-1 ++** mul z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mul_m1_u16_m, svuint16_t, ++ z0 = svmul_n_u16_m (p0, z0, -1), ++ z0 = svmul_m (p0, z0, -1)) ++ ++/* ++** mul_u16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** mul z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mul_u16_z_tied1, svuint16_t, ++ z0 = svmul_u16_z (p0, z0, z1), ++ z0 = svmul_z (p0, z0, z1)) ++ ++/* ++** mul_u16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** mul z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mul_u16_z_tied2, svuint16_t, ++ z0 = svmul_u16_z (p0, z1, z0), ++ z0 = svmul_z (p0, z1, z0)) ++ ++/* ++** mul_u16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** mul z0\.h, p0/m, z0\.h, 
z2\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** mul z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_u16_z_untied, svuint16_t, ++ z0 = svmul_u16_z (p0, z1, z2), ++ z0 = svmul_z (p0, z1, z2)) ++ ++/* ++** mul_w0_u16_z_tied1: ++** mov (z[0-9]+\.h), w0 ++** movprfx z0\.h, p0/z, z0\.h ++** mul z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mul_w0_u16_z_tied1, svuint16_t, uint16_t, ++ z0 = svmul_n_u16_z (p0, z0, x0), ++ z0 = svmul_z (p0, z0, x0)) ++ ++/* ++** mul_w0_u16_z_untied: ++** mov (z[0-9]+\.h), w0 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** mul z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** mul z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (mul_w0_u16_z_untied, svuint16_t, uint16_t, ++ z0 = svmul_n_u16_z (p0, z1, x0), ++ z0 = svmul_z (p0, z1, x0)) ++ ++/* ++** mul_2_u16_z_tied1: ++** mov (z[0-9]+\.h), #2 ++** movprfx z0\.h, p0/z, z0\.h ++** mul z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_u16_z_tied1, svuint16_t, ++ z0 = svmul_n_u16_z (p0, z0, 2), ++ z0 = svmul_z (p0, z0, 2)) ++ ++/* ++** mul_2_u16_z_untied: ++** mov (z[0-9]+\.h), #2 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** mul z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** mul z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_u16_z_untied, svuint16_t, ++ z0 = svmul_n_u16_z (p0, z1, 2), ++ z0 = svmul_z (p0, z1, 2)) ++ ++/* ++** mul_u16_x_tied1: ++** mul z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mul_u16_x_tied1, svuint16_t, ++ z0 = svmul_u16_x (p0, z0, z1), ++ z0 = svmul_x (p0, z0, z1)) ++ ++/* ++** mul_u16_x_tied2: ++** mul z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mul_u16_x_tied2, svuint16_t, ++ z0 = svmul_u16_x (p0, z1, z0), ++ z0 = svmul_x (p0, z1, z0)) ++ ++/* ++** mul_u16_x_untied: ++** ( ++** movprfx z0, z1 ++** mul z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0, z2 ++** mul z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_u16_x_untied, svuint16_t, ++ z0 = svmul_u16_x (p0, z1, z2), ++ z0 = svmul_x (p0, z1, z2)) ++ ++/* ++** mul_w0_u16_x_tied1: ++** mov (z[0-9]+\.h), w0 ++** mul z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mul_w0_u16_x_tied1, svuint16_t, uint16_t, ++ z0 = svmul_n_u16_x (p0, z0, x0), ++ z0 = svmul_x (p0, z0, x0)) ++ ++/* ++** mul_w0_u16_x_untied: ++** mov z0\.h, w0 ++** mul z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZX (mul_w0_u16_x_untied, svuint16_t, uint16_t, ++ z0 = svmul_n_u16_x (p0, z1, x0), ++ z0 = svmul_x (p0, z1, x0)) ++ ++/* ++** mul_2_u16_x_tied1: ++** mul z0\.h, z0\.h, #2 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_u16_x_tied1, svuint16_t, ++ z0 = svmul_n_u16_x (p0, z0, 2), ++ z0 = svmul_x (p0, z0, 2)) ++ ++/* ++** mul_2_u16_x_untied: ++** movprfx z0, z1 ++** mul z0\.h, z0\.h, #2 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_u16_x_untied, svuint16_t, ++ z0 = svmul_n_u16_x (p0, z1, 2), ++ z0 = svmul_x (p0, z1, 2)) ++ ++/* ++** mul_127_u16_x: ++** mul z0\.h, z0\.h, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_127_u16_x, svuint16_t, ++ z0 = svmul_n_u16_x (p0, z0, 127), ++ z0 = svmul_x (p0, z0, 127)) ++ ++/* ++** mul_128_u16_x: ++** mov (z[0-9]+\.h), #128 ++** mul z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_128_u16_x, svuint16_t, ++ z0 = svmul_n_u16_x (p0, z0, 128), ++ z0 = svmul_x (p0, z0, 128)) ++ ++/* ++** mul_255_u16_x: ++** mov (z[0-9]+\.h), #255 ++** mul z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_255_u16_x, svuint16_t, ++ z0 = svmul_n_u16_x (p0, z0, 255), ++ z0 = svmul_x (p0, z0, 255)) ++ 
++/* ++** mul_m1_u16_x: ++** mul z0\.h, z0\.h, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_m1_u16_x, svuint16_t, ++ z0 = svmul_n_u16_x (p0, z0, -1), ++ z0 = svmul_x (p0, z0, -1)) ++ ++/* ++** mul_m127_u16_x: ++** mul z0\.h, z0\.h, #-127 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_m127_u16_x, svuint16_t, ++ z0 = svmul_n_u16_x (p0, z0, -127), ++ z0 = svmul_x (p0, z0, -127)) ++ ++/* ++** mul_m128_u16_x: ++** mul z0\.h, z0\.h, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_m128_u16_x, svuint16_t, ++ z0 = svmul_n_u16_x (p0, z0, -128), ++ z0 = svmul_x (p0, z0, -128)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u32.c +new file mode 100644 +index 000000000..288d17b16 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u32.c +@@ -0,0 +1,302 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mul_u32_m_tied1: ++** mul z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mul_u32_m_tied1, svuint32_t, ++ z0 = svmul_u32_m (p0, z0, z1), ++ z0 = svmul_m (p0, z0, z1)) ++ ++/* ++** mul_u32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** mul z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mul_u32_m_tied2, svuint32_t, ++ z0 = svmul_u32_m (p0, z1, z0), ++ z0 = svmul_m (p0, z1, z0)) ++ ++/* ++** mul_u32_m_untied: ++** movprfx z0, z1 ++** mul z0\.s, p0/m, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mul_u32_m_untied, svuint32_t, ++ z0 = svmul_u32_m (p0, z1, z2), ++ z0 = svmul_m (p0, z1, z2)) ++ ++/* ++** mul_w0_u32_m_tied1: ++** mov (z[0-9]+\.s), w0 ++** mul z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mul_w0_u32_m_tied1, svuint32_t, uint32_t, ++ z0 = svmul_n_u32_m (p0, z0, x0), ++ z0 = svmul_m (p0, z0, x0)) ++ ++/* ++** mul_w0_u32_m_untied: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0, z1 ++** mul z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mul_w0_u32_m_untied, svuint32_t, uint32_t, ++ z0 = svmul_n_u32_m (p0, z1, x0), ++ z0 = svmul_m (p0, z1, x0)) ++ ++/* ++** mul_2_u32_m_tied1: ++** mov (z[0-9]+\.s), #2 ++** mul z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_u32_m_tied1, svuint32_t, ++ z0 = svmul_n_u32_m (p0, z0, 2), ++ z0 = svmul_m (p0, z0, 2)) ++ ++/* ++** mul_2_u32_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.s), #2 ++** movprfx z0, z1 ++** mul z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_u32_m_untied, svuint32_t, ++ z0 = svmul_n_u32_m (p0, z1, 2), ++ z0 = svmul_m (p0, z1, 2)) ++ ++/* ++** mul_m1_u32_m: ++** mov (z[0-9]+)\.b, #-1 ++** mul z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mul_m1_u32_m, svuint32_t, ++ z0 = svmul_n_u32_m (p0, z0, -1), ++ z0 = svmul_m (p0, z0, -1)) ++ ++/* ++** mul_u32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** mul z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mul_u32_z_tied1, svuint32_t, ++ z0 = svmul_u32_z (p0, z0, z1), ++ z0 = svmul_z (p0, z0, z1)) ++ ++/* ++** mul_u32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** mul z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mul_u32_z_tied2, svuint32_t, ++ z0 = svmul_u32_z (p0, z1, z0), ++ z0 = svmul_z (p0, z1, z0)) ++ ++/* ++** mul_u32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** mul z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** mul z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_u32_z_untied, svuint32_t, ++ z0 = svmul_u32_z (p0, z1, z2), ++ z0 = svmul_z (p0, z1, z2)) ++ ++/* ++** mul_w0_u32_z_tied1: ++** mov (z[0-9]+\.s), w0 
++** movprfx z0\.s, p0/z, z0\.s ++** mul z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mul_w0_u32_z_tied1, svuint32_t, uint32_t, ++ z0 = svmul_n_u32_z (p0, z0, x0), ++ z0 = svmul_z (p0, z0, x0)) ++ ++/* ++** mul_w0_u32_z_untied: ++** mov (z[0-9]+\.s), w0 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** mul z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** mul z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (mul_w0_u32_z_untied, svuint32_t, uint32_t, ++ z0 = svmul_n_u32_z (p0, z1, x0), ++ z0 = svmul_z (p0, z1, x0)) ++ ++/* ++** mul_2_u32_z_tied1: ++** mov (z[0-9]+\.s), #2 ++** movprfx z0\.s, p0/z, z0\.s ++** mul z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_u32_z_tied1, svuint32_t, ++ z0 = svmul_n_u32_z (p0, z0, 2), ++ z0 = svmul_z (p0, z0, 2)) ++ ++/* ++** mul_2_u32_z_untied: ++** mov (z[0-9]+\.s), #2 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** mul z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** mul z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_u32_z_untied, svuint32_t, ++ z0 = svmul_n_u32_z (p0, z1, 2), ++ z0 = svmul_z (p0, z1, 2)) ++ ++/* ++** mul_u32_x_tied1: ++** mul z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mul_u32_x_tied1, svuint32_t, ++ z0 = svmul_u32_x (p0, z0, z1), ++ z0 = svmul_x (p0, z0, z1)) ++ ++/* ++** mul_u32_x_tied2: ++** mul z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mul_u32_x_tied2, svuint32_t, ++ z0 = svmul_u32_x (p0, z1, z0), ++ z0 = svmul_x (p0, z1, z0)) ++ ++/* ++** mul_u32_x_untied: ++** ( ++** movprfx z0, z1 ++** mul z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0, z2 ++** mul z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_u32_x_untied, svuint32_t, ++ z0 = svmul_u32_x (p0, z1, z2), ++ z0 = svmul_x (p0, z1, z2)) ++ ++/* ++** mul_w0_u32_x_tied1: ++** mov (z[0-9]+\.s), w0 ++** mul z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mul_w0_u32_x_tied1, svuint32_t, uint32_t, ++ z0 = svmul_n_u32_x (p0, z0, x0), ++ z0 = svmul_x (p0, z0, x0)) ++ ++/* ++** mul_w0_u32_x_untied: ++** mov z0\.s, w0 ++** mul z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZX (mul_w0_u32_x_untied, svuint32_t, uint32_t, ++ z0 = svmul_n_u32_x (p0, z1, x0), ++ z0 = svmul_x (p0, z1, x0)) ++ ++/* ++** mul_2_u32_x_tied1: ++** mul z0\.s, z0\.s, #2 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_u32_x_tied1, svuint32_t, ++ z0 = svmul_n_u32_x (p0, z0, 2), ++ z0 = svmul_x (p0, z0, 2)) ++ ++/* ++** mul_2_u32_x_untied: ++** movprfx z0, z1 ++** mul z0\.s, z0\.s, #2 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_u32_x_untied, svuint32_t, ++ z0 = svmul_n_u32_x (p0, z1, 2), ++ z0 = svmul_x (p0, z1, 2)) ++ ++/* ++** mul_127_u32_x: ++** mul z0\.s, z0\.s, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_127_u32_x, svuint32_t, ++ z0 = svmul_n_u32_x (p0, z0, 127), ++ z0 = svmul_x (p0, z0, 127)) ++ ++/* ++** mul_128_u32_x: ++** mov (z[0-9]+\.s), #128 ++** mul z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_128_u32_x, svuint32_t, ++ z0 = svmul_n_u32_x (p0, z0, 128), ++ z0 = svmul_x (p0, z0, 128)) ++ ++/* ++** mul_255_u32_x: ++** mov (z[0-9]+\.s), #255 ++** mul z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_255_u32_x, svuint32_t, ++ z0 = svmul_n_u32_x (p0, z0, 255), ++ z0 = svmul_x (p0, z0, 255)) ++ ++/* ++** mul_m1_u32_x: ++** mul z0\.s, z0\.s, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_m1_u32_x, svuint32_t, ++ z0 = svmul_n_u32_x (p0, z0, -1), ++ z0 = svmul_x (p0, z0, -1)) ++ ++/* ++** mul_m127_u32_x: ++** mul z0\.s, z0\.s, #-127 ++** ret ++*/ ++TEST_UNIFORM_Z 
(mul_m127_u32_x, svuint32_t, ++ z0 = svmul_n_u32_x (p0, z0, -127), ++ z0 = svmul_x (p0, z0, -127)) ++ ++/* ++** mul_m128_u32_x: ++** mul z0\.s, z0\.s, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_m128_u32_x, svuint32_t, ++ z0 = svmul_n_u32_x (p0, z0, -128), ++ z0 = svmul_x (p0, z0, -128)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u64.c +new file mode 100644 +index 000000000..f6959dbc7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u64.c +@@ -0,0 +1,302 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mul_u64_m_tied1: ++** mul z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mul_u64_m_tied1, svuint64_t, ++ z0 = svmul_u64_m (p0, z0, z1), ++ z0 = svmul_m (p0, z0, z1)) ++ ++/* ++** mul_u64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** mul z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_u64_m_tied2, svuint64_t, ++ z0 = svmul_u64_m (p0, z1, z0), ++ z0 = svmul_m (p0, z1, z0)) ++ ++/* ++** mul_u64_m_untied: ++** movprfx z0, z1 ++** mul z0\.d, p0/m, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mul_u64_m_untied, svuint64_t, ++ z0 = svmul_u64_m (p0, z1, z2), ++ z0 = svmul_m (p0, z1, z2)) ++ ++/* ++** mul_x0_u64_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** mul z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mul_x0_u64_m_tied1, svuint64_t, uint64_t, ++ z0 = svmul_n_u64_m (p0, z0, x0), ++ z0 = svmul_m (p0, z0, x0)) ++ ++/* ++** mul_x0_u64_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** mul z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mul_x0_u64_m_untied, svuint64_t, uint64_t, ++ z0 = svmul_n_u64_m (p0, z1, x0), ++ z0 = svmul_m (p0, z1, x0)) ++ ++/* ++** mul_2_u64_m_tied1: ++** mov (z[0-9]+\.d), #2 ++** mul z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_u64_m_tied1, svuint64_t, ++ z0 = svmul_n_u64_m (p0, z0, 2), ++ z0 = svmul_m (p0, z0, 2)) ++ ++/* ++** mul_2_u64_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #2 ++** movprfx z0, z1 ++** mul z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_u64_m_untied, svuint64_t, ++ z0 = svmul_n_u64_m (p0, z1, 2), ++ z0 = svmul_m (p0, z1, 2)) ++ ++/* ++** mul_m1_u64_m: ++** mov (z[0-9]+)\.b, #-1 ++** mul z0\.d, p0/m, z0\.d, \1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mul_m1_u64_m, svuint64_t, ++ z0 = svmul_n_u64_m (p0, z0, -1), ++ z0 = svmul_m (p0, z0, -1)) ++ ++/* ++** mul_u64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** mul z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mul_u64_z_tied1, svuint64_t, ++ z0 = svmul_u64_z (p0, z0, z1), ++ z0 = svmul_z (p0, z0, z1)) ++ ++/* ++** mul_u64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** mul z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mul_u64_z_tied2, svuint64_t, ++ z0 = svmul_u64_z (p0, z1, z0), ++ z0 = svmul_z (p0, z1, z0)) ++ ++/* ++** mul_u64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** mul z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** mul z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_u64_z_untied, svuint64_t, ++ z0 = svmul_u64_z (p0, z1, z2), ++ z0 = svmul_z (p0, z1, z2)) ++ ++/* ++** mul_x0_u64_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z0\.d ++** mul z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mul_x0_u64_z_tied1, svuint64_t, uint64_t, ++ z0 = svmul_n_u64_z (p0, z0, x0), ++ z0 = svmul_z (p0, z0, x0)) ++ ++/* ++** mul_x0_u64_z_untied: ++** mov (z[0-9]+\.d), x0 
++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** mul z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** mul z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (mul_x0_u64_z_untied, svuint64_t, uint64_t, ++ z0 = svmul_n_u64_z (p0, z1, x0), ++ z0 = svmul_z (p0, z1, x0)) ++ ++/* ++** mul_2_u64_z_tied1: ++** mov (z[0-9]+\.d), #2 ++** movprfx z0\.d, p0/z, z0\.d ++** mul z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_u64_z_tied1, svuint64_t, ++ z0 = svmul_n_u64_z (p0, z0, 2), ++ z0 = svmul_z (p0, z0, 2)) ++ ++/* ++** mul_2_u64_z_untied: ++** mov (z[0-9]+\.d), #2 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** mul z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** mul z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_u64_z_untied, svuint64_t, ++ z0 = svmul_n_u64_z (p0, z1, 2), ++ z0 = svmul_z (p0, z1, 2)) ++ ++/* ++** mul_u64_x_tied1: ++** mul z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mul_u64_x_tied1, svuint64_t, ++ z0 = svmul_u64_x (p0, z0, z1), ++ z0 = svmul_x (p0, z0, z1)) ++ ++/* ++** mul_u64_x_tied2: ++** mul z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mul_u64_x_tied2, svuint64_t, ++ z0 = svmul_u64_x (p0, z1, z0), ++ z0 = svmul_x (p0, z1, z0)) ++ ++/* ++** mul_u64_x_untied: ++** ( ++** movprfx z0, z1 ++** mul z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0, z2 ++** mul z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_u64_x_untied, svuint64_t, ++ z0 = svmul_u64_x (p0, z1, z2), ++ z0 = svmul_x (p0, z1, z2)) ++ ++/* ++** mul_x0_u64_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** mul z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mul_x0_u64_x_tied1, svuint64_t, uint64_t, ++ z0 = svmul_n_u64_x (p0, z0, x0), ++ z0 = svmul_x (p0, z0, x0)) ++ ++/* ++** mul_x0_u64_x_untied: ++** mov z0\.d, x0 ++** mul z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (mul_x0_u64_x_untied, svuint64_t, uint64_t, ++ z0 = svmul_n_u64_x (p0, z1, x0), ++ z0 = svmul_x (p0, z1, x0)) ++ ++/* ++** mul_2_u64_x_tied1: ++** mul z0\.d, z0\.d, #2 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_u64_x_tied1, svuint64_t, ++ z0 = svmul_n_u64_x (p0, z0, 2), ++ z0 = svmul_x (p0, z0, 2)) ++ ++/* ++** mul_2_u64_x_untied: ++** movprfx z0, z1 ++** mul z0\.d, z0\.d, #2 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_u64_x_untied, svuint64_t, ++ z0 = svmul_n_u64_x (p0, z1, 2), ++ z0 = svmul_x (p0, z1, 2)) ++ ++/* ++** mul_127_u64_x: ++** mul z0\.d, z0\.d, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_127_u64_x, svuint64_t, ++ z0 = svmul_n_u64_x (p0, z0, 127), ++ z0 = svmul_x (p0, z0, 127)) ++ ++/* ++** mul_128_u64_x: ++** mov (z[0-9]+\.d), #128 ++** mul z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_128_u64_x, svuint64_t, ++ z0 = svmul_n_u64_x (p0, z0, 128), ++ z0 = svmul_x (p0, z0, 128)) ++ ++/* ++** mul_255_u64_x: ++** mov (z[0-9]+\.d), #255 ++** mul z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_255_u64_x, svuint64_t, ++ z0 = svmul_n_u64_x (p0, z0, 255), ++ z0 = svmul_x (p0, z0, 255)) ++ ++/* ++** mul_m1_u64_x: ++** mul z0\.d, z0\.d, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_m1_u64_x, svuint64_t, ++ z0 = svmul_n_u64_x (p0, z0, -1), ++ z0 = svmul_x (p0, z0, -1)) ++ ++/* ++** mul_m127_u64_x: ++** mul z0\.d, z0\.d, #-127 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_m127_u64_x, svuint64_t, ++ z0 = svmul_n_u64_x (p0, z0, -127), ++ z0 = svmul_x (p0, z0, -127)) ++ ++/* ++** mul_m128_u64_x: ++** mul z0\.d, z0\.d, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_m128_u64_x, svuint64_t, ++ z0 = svmul_n_u64_x (p0, z0, -128), ++ z0 = svmul_x (p0, 
z0, -128)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u8.c +new file mode 100644 +index 000000000..b2745a48f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u8.c +@@ -0,0 +1,300 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mul_u8_m_tied1: ++** mul z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mul_u8_m_tied1, svuint8_t, ++ z0 = svmul_u8_m (p0, z0, z1), ++ z0 = svmul_m (p0, z0, z1)) ++ ++/* ++** mul_u8_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** mul z0\.b, p0/m, z0\.b, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mul_u8_m_tied2, svuint8_t, ++ z0 = svmul_u8_m (p0, z1, z0), ++ z0 = svmul_m (p0, z1, z0)) ++ ++/* ++** mul_u8_m_untied: ++** movprfx z0, z1 ++** mul z0\.b, p0/m, z0\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mul_u8_m_untied, svuint8_t, ++ z0 = svmul_u8_m (p0, z1, z2), ++ z0 = svmul_m (p0, z1, z2)) ++ ++/* ++** mul_w0_u8_m_tied1: ++** mov (z[0-9]+\.b), w0 ++** mul z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mul_w0_u8_m_tied1, svuint8_t, uint8_t, ++ z0 = svmul_n_u8_m (p0, z0, x0), ++ z0 = svmul_m (p0, z0, x0)) ++ ++/* ++** mul_w0_u8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), w0 ++** movprfx z0, z1 ++** mul z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mul_w0_u8_m_untied, svuint8_t, uint8_t, ++ z0 = svmul_n_u8_m (p0, z1, x0), ++ z0 = svmul_m (p0, z1, x0)) ++ ++/* ++** mul_2_u8_m_tied1: ++** mov (z[0-9]+\.b), #2 ++** mul z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_u8_m_tied1, svuint8_t, ++ z0 = svmul_n_u8_m (p0, z0, 2), ++ z0 = svmul_m (p0, z0, 2)) ++ ++/* ++** mul_2_u8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), #2 ++** movprfx z0, z1 ++** mul z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_u8_m_untied, svuint8_t, ++ z0 = svmul_n_u8_m (p0, z1, 2), ++ z0 = svmul_m (p0, z1, 2)) ++ ++/* ++** mul_m1_u8_m: ++** mov (z[0-9]+\.b), #-1 ++** mul z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_m1_u8_m, svuint8_t, ++ z0 = svmul_n_u8_m (p0, z0, -1), ++ z0 = svmul_m (p0, z0, -1)) ++ ++/* ++** mul_u8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** mul z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mul_u8_z_tied1, svuint8_t, ++ z0 = svmul_u8_z (p0, z0, z1), ++ z0 = svmul_z (p0, z0, z1)) ++ ++/* ++** mul_u8_z_tied2: ++** movprfx z0\.b, p0/z, z0\.b ++** mul z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mul_u8_z_tied2, svuint8_t, ++ z0 = svmul_u8_z (p0, z1, z0), ++ z0 = svmul_z (p0, z1, z0)) ++ ++/* ++** mul_u8_z_untied: ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** mul z0\.b, p0/m, z0\.b, z2\.b ++** | ++** movprfx z0\.b, p0/z, z2\.b ++** mul z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_u8_z_untied, svuint8_t, ++ z0 = svmul_u8_z (p0, z1, z2), ++ z0 = svmul_z (p0, z1, z2)) ++ ++/* ++** mul_w0_u8_z_tied1: ++** mov (z[0-9]+\.b), w0 ++** movprfx z0\.b, p0/z, z0\.b ++** mul z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mul_w0_u8_z_tied1, svuint8_t, uint8_t, ++ z0 = svmul_n_u8_z (p0, z0, x0), ++ z0 = svmul_z (p0, z0, x0)) ++ ++/* ++** mul_w0_u8_z_untied: ++** mov (z[0-9]+\.b), w0 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** mul z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** mul z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (mul_w0_u8_z_untied, svuint8_t, uint8_t, ++ z0 = svmul_n_u8_z (p0, z1, x0), ++ z0 = svmul_z (p0, z1, x0)) ++ ++/* ++** mul_2_u8_z_tied1: 
++** mov (z[0-9]+\.b), #2 ++** movprfx z0\.b, p0/z, z0\.b ++** mul z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_u8_z_tied1, svuint8_t, ++ z0 = svmul_n_u8_z (p0, z0, 2), ++ z0 = svmul_z (p0, z0, 2)) ++ ++/* ++** mul_2_u8_z_untied: ++** mov (z[0-9]+\.b), #2 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** mul z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** mul z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_u8_z_untied, svuint8_t, ++ z0 = svmul_n_u8_z (p0, z1, 2), ++ z0 = svmul_z (p0, z1, 2)) ++ ++/* ++** mul_u8_x_tied1: ++** mul z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mul_u8_x_tied1, svuint8_t, ++ z0 = svmul_u8_x (p0, z0, z1), ++ z0 = svmul_x (p0, z0, z1)) ++ ++/* ++** mul_u8_x_tied2: ++** mul z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mul_u8_x_tied2, svuint8_t, ++ z0 = svmul_u8_x (p0, z1, z0), ++ z0 = svmul_x (p0, z1, z0)) ++ ++/* ++** mul_u8_x_untied: ++** ( ++** movprfx z0, z1 ++** mul z0\.b, p0/m, z0\.b, z2\.b ++** | ++** movprfx z0, z2 ++** mul z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mul_u8_x_untied, svuint8_t, ++ z0 = svmul_u8_x (p0, z1, z2), ++ z0 = svmul_x (p0, z1, z2)) ++ ++/* ++** mul_w0_u8_x_tied1: ++** mov (z[0-9]+\.b), w0 ++** mul z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mul_w0_u8_x_tied1, svuint8_t, uint8_t, ++ z0 = svmul_n_u8_x (p0, z0, x0), ++ z0 = svmul_x (p0, z0, x0)) ++ ++/* ++** mul_w0_u8_x_untied: ++** mov z0\.b, w0 ++** mul z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_ZX (mul_w0_u8_x_untied, svuint8_t, uint8_t, ++ z0 = svmul_n_u8_x (p0, z1, x0), ++ z0 = svmul_x (p0, z1, x0)) ++ ++/* ++** mul_2_u8_x_tied1: ++** mul z0\.b, z0\.b, #2 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_u8_x_tied1, svuint8_t, ++ z0 = svmul_n_u8_x (p0, z0, 2), ++ z0 = svmul_x (p0, z0, 2)) ++ ++/* ++** mul_2_u8_x_untied: ++** movprfx z0, z1 ++** mul z0\.b, z0\.b, #2 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_2_u8_x_untied, svuint8_t, ++ z0 = svmul_n_u8_x (p0, z1, 2), ++ z0 = svmul_x (p0, z1, 2)) ++ ++/* ++** mul_127_u8_x: ++** mul z0\.b, z0\.b, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_127_u8_x, svuint8_t, ++ z0 = svmul_n_u8_x (p0, z0, 127), ++ z0 = svmul_x (p0, z0, 127)) ++ ++/* ++** mul_128_u8_x: ++** mul z0\.b, z0\.b, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_128_u8_x, svuint8_t, ++ z0 = svmul_n_u8_x (p0, z0, 128), ++ z0 = svmul_x (p0, z0, 128)) ++ ++/* ++** mul_255_u8_x: ++** mul z0\.b, z0\.b, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_255_u8_x, svuint8_t, ++ z0 = svmul_n_u8_x (p0, z0, 255), ++ z0 = svmul_x (p0, z0, 255)) ++ ++/* ++** mul_m1_u8_x: ++** mul z0\.b, z0\.b, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_m1_u8_x, svuint8_t, ++ z0 = svmul_n_u8_x (p0, z0, -1), ++ z0 = svmul_x (p0, z0, -1)) ++ ++/* ++** mul_m127_u8_x: ++** mul z0\.b, z0\.b, #-127 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_m127_u8_x, svuint8_t, ++ z0 = svmul_n_u8_x (p0, z0, -127), ++ z0 = svmul_x (p0, z0, -127)) ++ ++/* ++** mul_m128_u8_x: ++** mul z0\.b, z0\.b, #-128 ++** ret ++*/ ++TEST_UNIFORM_Z (mul_m128_u8_x, svuint8_t, ++ z0 = svmul_n_u8_x (p0, z0, -128), ++ z0 = svmul_x (p0, z0, -128)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulh_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulh_s16.c +new file mode 100644 +index 000000000..a81532f5d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulh_s16.c +@@ -0,0 +1,237 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mulh_s16_m_tied1: ++** smulh z0\.h, p0/m, 
z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_s16_m_tied1, svint16_t, ++ z0 = svmulh_s16_m (p0, z0, z1), ++ z0 = svmulh_m (p0, z0, z1)) ++ ++/* ++** mulh_s16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** smulh z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_s16_m_tied2, svint16_t, ++ z0 = svmulh_s16_m (p0, z1, z0), ++ z0 = svmulh_m (p0, z1, z0)) ++ ++/* ++** mulh_s16_m_untied: ++** movprfx z0, z1 ++** smulh z0\.h, p0/m, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_s16_m_untied, svint16_t, ++ z0 = svmulh_s16_m (p0, z1, z2), ++ z0 = svmulh_m (p0, z1, z2)) ++ ++/* ++** mulh_w0_s16_m_tied1: ++** mov (z[0-9]+\.h), w0 ++** smulh z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mulh_w0_s16_m_tied1, svint16_t, int16_t, ++ z0 = svmulh_n_s16_m (p0, z0, x0), ++ z0 = svmulh_m (p0, z0, x0)) ++ ++/* ++** mulh_w0_s16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), w0 ++** movprfx z0, z1 ++** smulh z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mulh_w0_s16_m_untied, svint16_t, int16_t, ++ z0 = svmulh_n_s16_m (p0, z1, x0), ++ z0 = svmulh_m (p0, z1, x0)) ++ ++/* ++** mulh_11_s16_m_tied1: ++** mov (z[0-9]+\.h), #11 ++** smulh z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_11_s16_m_tied1, svint16_t, ++ z0 = svmulh_n_s16_m (p0, z0, 11), ++ z0 = svmulh_m (p0, z0, 11)) ++ ++/* ++** mulh_11_s16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), #11 ++** movprfx z0, z1 ++** smulh z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_11_s16_m_untied, svint16_t, ++ z0 = svmulh_n_s16_m (p0, z1, 11), ++ z0 = svmulh_m (p0, z1, 11)) ++ ++/* ++** mulh_s16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** smulh z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_s16_z_tied1, svint16_t, ++ z0 = svmulh_s16_z (p0, z0, z1), ++ z0 = svmulh_z (p0, z0, z1)) ++ ++/* ++** mulh_s16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** smulh z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_s16_z_tied2, svint16_t, ++ z0 = svmulh_s16_z (p0, z1, z0), ++ z0 = svmulh_z (p0, z1, z0)) ++ ++/* ++** mulh_s16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** smulh z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** smulh z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_s16_z_untied, svint16_t, ++ z0 = svmulh_s16_z (p0, z1, z2), ++ z0 = svmulh_z (p0, z1, z2)) ++ ++/* ++** mulh_w0_s16_z_tied1: ++** mov (z[0-9]+\.h), w0 ++** movprfx z0\.h, p0/z, z0\.h ++** smulh z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mulh_w0_s16_z_tied1, svint16_t, int16_t, ++ z0 = svmulh_n_s16_z (p0, z0, x0), ++ z0 = svmulh_z (p0, z0, x0)) ++ ++/* ++** mulh_w0_s16_z_untied: ++** mov (z[0-9]+\.h), w0 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** smulh z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** smulh z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (mulh_w0_s16_z_untied, svint16_t, int16_t, ++ z0 = svmulh_n_s16_z (p0, z1, x0), ++ z0 = svmulh_z (p0, z1, x0)) ++ ++/* ++** mulh_11_s16_z_tied1: ++** mov (z[0-9]+\.h), #11 ++** movprfx z0\.h, p0/z, z0\.h ++** smulh z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_11_s16_z_tied1, svint16_t, ++ z0 = svmulh_n_s16_z (p0, z0, 11), ++ z0 = svmulh_z (p0, z0, 11)) ++ ++/* ++** mulh_11_s16_z_untied: ++** mov (z[0-9]+\.h), #11 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** smulh z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** smulh z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_11_s16_z_untied, svint16_t, ++ z0 = 
svmulh_n_s16_z (p0, z1, 11), ++ z0 = svmulh_z (p0, z1, 11)) ++ ++/* ++** mulh_s16_x_tied1: ++** smulh z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_s16_x_tied1, svint16_t, ++ z0 = svmulh_s16_x (p0, z0, z1), ++ z0 = svmulh_x (p0, z0, z1)) ++ ++/* ++** mulh_s16_x_tied2: ++** smulh z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_s16_x_tied2, svint16_t, ++ z0 = svmulh_s16_x (p0, z1, z0), ++ z0 = svmulh_x (p0, z1, z0)) ++ ++/* ++** mulh_s16_x_untied: ++** ( ++** movprfx z0, z1 ++** smulh z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0, z2 ++** smulh z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_s16_x_untied, svint16_t, ++ z0 = svmulh_s16_x (p0, z1, z2), ++ z0 = svmulh_x (p0, z1, z2)) ++ ++/* ++** mulh_w0_s16_x_tied1: ++** mov (z[0-9]+\.h), w0 ++** smulh z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mulh_w0_s16_x_tied1, svint16_t, int16_t, ++ z0 = svmulh_n_s16_x (p0, z0, x0), ++ z0 = svmulh_x (p0, z0, x0)) ++ ++/* ++** mulh_w0_s16_x_untied: ++** mov z0\.h, w0 ++** smulh z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZX (mulh_w0_s16_x_untied, svint16_t, int16_t, ++ z0 = svmulh_n_s16_x (p0, z1, x0), ++ z0 = svmulh_x (p0, z1, x0)) ++ ++/* ++** mulh_11_s16_x_tied1: ++** mov (z[0-9]+\.h), #11 ++** smulh z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_11_s16_x_tied1, svint16_t, ++ z0 = svmulh_n_s16_x (p0, z0, 11), ++ z0 = svmulh_x (p0, z0, 11)) ++ ++/* ++** mulh_11_s16_x_untied: ++** mov z0\.h, #11 ++** smulh z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_11_s16_x_untied, svint16_t, ++ z0 = svmulh_n_s16_x (p0, z1, 11), ++ z0 = svmulh_x (p0, z1, 11)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulh_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulh_s32.c +new file mode 100644 +index 000000000..078feeb6a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulh_s32.c +@@ -0,0 +1,237 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mulh_s32_m_tied1: ++** smulh z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_s32_m_tied1, svint32_t, ++ z0 = svmulh_s32_m (p0, z0, z1), ++ z0 = svmulh_m (p0, z0, z1)) ++ ++/* ++** mulh_s32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** smulh z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_s32_m_tied2, svint32_t, ++ z0 = svmulh_s32_m (p0, z1, z0), ++ z0 = svmulh_m (p0, z1, z0)) ++ ++/* ++** mulh_s32_m_untied: ++** movprfx z0, z1 ++** smulh z0\.s, p0/m, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_s32_m_untied, svint32_t, ++ z0 = svmulh_s32_m (p0, z1, z2), ++ z0 = svmulh_m (p0, z1, z2)) ++ ++/* ++** mulh_w0_s32_m_tied1: ++** mov (z[0-9]+\.s), w0 ++** smulh z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mulh_w0_s32_m_tied1, svint32_t, int32_t, ++ z0 = svmulh_n_s32_m (p0, z0, x0), ++ z0 = svmulh_m (p0, z0, x0)) ++ ++/* ++** mulh_w0_s32_m_untied: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0, z1 ++** smulh z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mulh_w0_s32_m_untied, svint32_t, int32_t, ++ z0 = svmulh_n_s32_m (p0, z1, x0), ++ z0 = svmulh_m (p0, z1, x0)) ++ ++/* ++** mulh_11_s32_m_tied1: ++** mov (z[0-9]+\.s), #11 ++** smulh z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_11_s32_m_tied1, svint32_t, ++ z0 = svmulh_n_s32_m (p0, z0, 11), ++ z0 = svmulh_m (p0, z0, 11)) ++ ++/* ++** mulh_11_s32_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.s), #11 ++** movprfx z0, z1 ++** smulh z0\.s, p0/m, z0\.s, \1 
++** ret ++*/ ++TEST_UNIFORM_Z (mulh_11_s32_m_untied, svint32_t, ++ z0 = svmulh_n_s32_m (p0, z1, 11), ++ z0 = svmulh_m (p0, z1, 11)) ++ ++/* ++** mulh_s32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** smulh z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_s32_z_tied1, svint32_t, ++ z0 = svmulh_s32_z (p0, z0, z1), ++ z0 = svmulh_z (p0, z0, z1)) ++ ++/* ++** mulh_s32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** smulh z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_s32_z_tied2, svint32_t, ++ z0 = svmulh_s32_z (p0, z1, z0), ++ z0 = svmulh_z (p0, z1, z0)) ++ ++/* ++** mulh_s32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** smulh z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** smulh z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_s32_z_untied, svint32_t, ++ z0 = svmulh_s32_z (p0, z1, z2), ++ z0 = svmulh_z (p0, z1, z2)) ++ ++/* ++** mulh_w0_s32_z_tied1: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0\.s, p0/z, z0\.s ++** smulh z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mulh_w0_s32_z_tied1, svint32_t, int32_t, ++ z0 = svmulh_n_s32_z (p0, z0, x0), ++ z0 = svmulh_z (p0, z0, x0)) ++ ++/* ++** mulh_w0_s32_z_untied: ++** mov (z[0-9]+\.s), w0 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** smulh z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** smulh z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (mulh_w0_s32_z_untied, svint32_t, int32_t, ++ z0 = svmulh_n_s32_z (p0, z1, x0), ++ z0 = svmulh_z (p0, z1, x0)) ++ ++/* ++** mulh_11_s32_z_tied1: ++** mov (z[0-9]+\.s), #11 ++** movprfx z0\.s, p0/z, z0\.s ++** smulh z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_11_s32_z_tied1, svint32_t, ++ z0 = svmulh_n_s32_z (p0, z0, 11), ++ z0 = svmulh_z (p0, z0, 11)) ++ ++/* ++** mulh_11_s32_z_untied: ++** mov (z[0-9]+\.s), #11 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** smulh z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** smulh z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_11_s32_z_untied, svint32_t, ++ z0 = svmulh_n_s32_z (p0, z1, 11), ++ z0 = svmulh_z (p0, z1, 11)) ++ ++/* ++** mulh_s32_x_tied1: ++** smulh z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_s32_x_tied1, svint32_t, ++ z0 = svmulh_s32_x (p0, z0, z1), ++ z0 = svmulh_x (p0, z0, z1)) ++ ++/* ++** mulh_s32_x_tied2: ++** smulh z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_s32_x_tied2, svint32_t, ++ z0 = svmulh_s32_x (p0, z1, z0), ++ z0 = svmulh_x (p0, z1, z0)) ++ ++/* ++** mulh_s32_x_untied: ++** ( ++** movprfx z0, z1 ++** smulh z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0, z2 ++** smulh z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_s32_x_untied, svint32_t, ++ z0 = svmulh_s32_x (p0, z1, z2), ++ z0 = svmulh_x (p0, z1, z2)) ++ ++/* ++** mulh_w0_s32_x_tied1: ++** mov (z[0-9]+\.s), w0 ++** smulh z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mulh_w0_s32_x_tied1, svint32_t, int32_t, ++ z0 = svmulh_n_s32_x (p0, z0, x0), ++ z0 = svmulh_x (p0, z0, x0)) ++ ++/* ++** mulh_w0_s32_x_untied: ++** mov z0\.s, w0 ++** smulh z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZX (mulh_w0_s32_x_untied, svint32_t, int32_t, ++ z0 = svmulh_n_s32_x (p0, z1, x0), ++ z0 = svmulh_x (p0, z1, x0)) ++ ++/* ++** mulh_11_s32_x_tied1: ++** mov (z[0-9]+\.s), #11 ++** smulh z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_11_s32_x_tied1, svint32_t, ++ z0 = svmulh_n_s32_x (p0, z0, 11), ++ z0 = svmulh_x (p0, z0, 11)) ++ ++/* ++** mulh_11_s32_x_untied: 
++** mov z0\.s, #11 ++** smulh z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_11_s32_x_untied, svint32_t, ++ z0 = svmulh_n_s32_x (p0, z1, 11), ++ z0 = svmulh_x (p0, z1, 11)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulh_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulh_s64.c +new file mode 100644 +index 000000000..a87d4d5ce +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulh_s64.c +@@ -0,0 +1,237 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mulh_s64_m_tied1: ++** smulh z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_s64_m_tied1, svint64_t, ++ z0 = svmulh_s64_m (p0, z0, z1), ++ z0 = svmulh_m (p0, z0, z1)) ++ ++/* ++** mulh_s64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** smulh z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_s64_m_tied2, svint64_t, ++ z0 = svmulh_s64_m (p0, z1, z0), ++ z0 = svmulh_m (p0, z1, z0)) ++ ++/* ++** mulh_s64_m_untied: ++** movprfx z0, z1 ++** smulh z0\.d, p0/m, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_s64_m_untied, svint64_t, ++ z0 = svmulh_s64_m (p0, z1, z2), ++ z0 = svmulh_m (p0, z1, z2)) ++ ++/* ++** mulh_x0_s64_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** smulh z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mulh_x0_s64_m_tied1, svint64_t, int64_t, ++ z0 = svmulh_n_s64_m (p0, z0, x0), ++ z0 = svmulh_m (p0, z0, x0)) ++ ++/* ++** mulh_x0_s64_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** smulh z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mulh_x0_s64_m_untied, svint64_t, int64_t, ++ z0 = svmulh_n_s64_m (p0, z1, x0), ++ z0 = svmulh_m (p0, z1, x0)) ++ ++/* ++** mulh_11_s64_m_tied1: ++** mov (z[0-9]+\.d), #11 ++** smulh z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_11_s64_m_tied1, svint64_t, ++ z0 = svmulh_n_s64_m (p0, z0, 11), ++ z0 = svmulh_m (p0, z0, 11)) ++ ++/* ++** mulh_11_s64_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #11 ++** movprfx z0, z1 ++** smulh z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_11_s64_m_untied, svint64_t, ++ z0 = svmulh_n_s64_m (p0, z1, 11), ++ z0 = svmulh_m (p0, z1, 11)) ++ ++/* ++** mulh_s64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** smulh z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_s64_z_tied1, svint64_t, ++ z0 = svmulh_s64_z (p0, z0, z1), ++ z0 = svmulh_z (p0, z0, z1)) ++ ++/* ++** mulh_s64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** smulh z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_s64_z_tied2, svint64_t, ++ z0 = svmulh_s64_z (p0, z1, z0), ++ z0 = svmulh_z (p0, z1, z0)) ++ ++/* ++** mulh_s64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** smulh z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** smulh z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_s64_z_untied, svint64_t, ++ z0 = svmulh_s64_z (p0, z1, z2), ++ z0 = svmulh_z (p0, z1, z2)) ++ ++/* ++** mulh_x0_s64_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z0\.d ++** smulh z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mulh_x0_s64_z_tied1, svint64_t, int64_t, ++ z0 = svmulh_n_s64_z (p0, z0, x0), ++ z0 = svmulh_z (p0, z0, x0)) ++ ++/* ++** mulh_x0_s64_z_untied: ++** mov (z[0-9]+\.d), x0 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** smulh z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** smulh z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (mulh_x0_s64_z_untied, svint64_t, int64_t, ++ z0 = 
svmulh_n_s64_z (p0, z1, x0), ++ z0 = svmulh_z (p0, z1, x0)) ++ ++/* ++** mulh_11_s64_z_tied1: ++** mov (z[0-9]+\.d), #11 ++** movprfx z0\.d, p0/z, z0\.d ++** smulh z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_11_s64_z_tied1, svint64_t, ++ z0 = svmulh_n_s64_z (p0, z0, 11), ++ z0 = svmulh_z (p0, z0, 11)) ++ ++/* ++** mulh_11_s64_z_untied: ++** mov (z[0-9]+\.d), #11 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** smulh z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** smulh z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_11_s64_z_untied, svint64_t, ++ z0 = svmulh_n_s64_z (p0, z1, 11), ++ z0 = svmulh_z (p0, z1, 11)) ++ ++/* ++** mulh_s64_x_tied1: ++** smulh z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_s64_x_tied1, svint64_t, ++ z0 = svmulh_s64_x (p0, z0, z1), ++ z0 = svmulh_x (p0, z0, z1)) ++ ++/* ++** mulh_s64_x_tied2: ++** smulh z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_s64_x_tied2, svint64_t, ++ z0 = svmulh_s64_x (p0, z1, z0), ++ z0 = svmulh_x (p0, z1, z0)) ++ ++/* ++** mulh_s64_x_untied: ++** ( ++** movprfx z0, z1 ++** smulh z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0, z2 ++** smulh z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_s64_x_untied, svint64_t, ++ z0 = svmulh_s64_x (p0, z1, z2), ++ z0 = svmulh_x (p0, z1, z2)) ++ ++/* ++** mulh_x0_s64_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** smulh z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mulh_x0_s64_x_tied1, svint64_t, int64_t, ++ z0 = svmulh_n_s64_x (p0, z0, x0), ++ z0 = svmulh_x (p0, z0, x0)) ++ ++/* ++** mulh_x0_s64_x_untied: ++** mov z0\.d, x0 ++** smulh z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (mulh_x0_s64_x_untied, svint64_t, int64_t, ++ z0 = svmulh_n_s64_x (p0, z1, x0), ++ z0 = svmulh_x (p0, z1, x0)) ++ ++/* ++** mulh_11_s64_x_tied1: ++** mov (z[0-9]+\.d), #11 ++** smulh z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_11_s64_x_tied1, svint64_t, ++ z0 = svmulh_n_s64_x (p0, z0, 11), ++ z0 = svmulh_x (p0, z0, 11)) ++ ++/* ++** mulh_11_s64_x_untied: ++** mov z0\.d, #11 ++** smulh z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_11_s64_x_untied, svint64_t, ++ z0 = svmulh_n_s64_x (p0, z1, 11), ++ z0 = svmulh_x (p0, z1, 11)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulh_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulh_s8.c +new file mode 100644 +index 000000000..f9cd01afd +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulh_s8.c +@@ -0,0 +1,237 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mulh_s8_m_tied1: ++** smulh z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_s8_m_tied1, svint8_t, ++ z0 = svmulh_s8_m (p0, z0, z1), ++ z0 = svmulh_m (p0, z0, z1)) ++ ++/* ++** mulh_s8_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** smulh z0\.b, p0/m, z0\.b, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_s8_m_tied2, svint8_t, ++ z0 = svmulh_s8_m (p0, z1, z0), ++ z0 = svmulh_m (p0, z1, z0)) ++ ++/* ++** mulh_s8_m_untied: ++** movprfx z0, z1 ++** smulh z0\.b, p0/m, z0\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_s8_m_untied, svint8_t, ++ z0 = svmulh_s8_m (p0, z1, z2), ++ z0 = svmulh_m (p0, z1, z2)) ++ ++/* ++** mulh_w0_s8_m_tied1: ++** mov (z[0-9]+\.b), w0 ++** smulh z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mulh_w0_s8_m_tied1, svint8_t, int8_t, ++ z0 = svmulh_n_s8_m (p0, z0, x0), ++ z0 = svmulh_m (p0, z0, x0)) ++ ++/* ++** 
mulh_w0_s8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), w0 ++** movprfx z0, z1 ++** smulh z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mulh_w0_s8_m_untied, svint8_t, int8_t, ++ z0 = svmulh_n_s8_m (p0, z1, x0), ++ z0 = svmulh_m (p0, z1, x0)) ++ ++/* ++** mulh_11_s8_m_tied1: ++** mov (z[0-9]+\.b), #11 ++** smulh z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_11_s8_m_tied1, svint8_t, ++ z0 = svmulh_n_s8_m (p0, z0, 11), ++ z0 = svmulh_m (p0, z0, 11)) ++ ++/* ++** mulh_11_s8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), #11 ++** movprfx z0, z1 ++** smulh z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_11_s8_m_untied, svint8_t, ++ z0 = svmulh_n_s8_m (p0, z1, 11), ++ z0 = svmulh_m (p0, z1, 11)) ++ ++/* ++** mulh_s8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** smulh z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_s8_z_tied1, svint8_t, ++ z0 = svmulh_s8_z (p0, z0, z1), ++ z0 = svmulh_z (p0, z0, z1)) ++ ++/* ++** mulh_s8_z_tied2: ++** movprfx z0\.b, p0/z, z0\.b ++** smulh z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_s8_z_tied2, svint8_t, ++ z0 = svmulh_s8_z (p0, z1, z0), ++ z0 = svmulh_z (p0, z1, z0)) ++ ++/* ++** mulh_s8_z_untied: ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** smulh z0\.b, p0/m, z0\.b, z2\.b ++** | ++** movprfx z0\.b, p0/z, z2\.b ++** smulh z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_s8_z_untied, svint8_t, ++ z0 = svmulh_s8_z (p0, z1, z2), ++ z0 = svmulh_z (p0, z1, z2)) ++ ++/* ++** mulh_w0_s8_z_tied1: ++** mov (z[0-9]+\.b), w0 ++** movprfx z0\.b, p0/z, z0\.b ++** smulh z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mulh_w0_s8_z_tied1, svint8_t, int8_t, ++ z0 = svmulh_n_s8_z (p0, z0, x0), ++ z0 = svmulh_z (p0, z0, x0)) ++ ++/* ++** mulh_w0_s8_z_untied: ++** mov (z[0-9]+\.b), w0 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** smulh z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** smulh z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (mulh_w0_s8_z_untied, svint8_t, int8_t, ++ z0 = svmulh_n_s8_z (p0, z1, x0), ++ z0 = svmulh_z (p0, z1, x0)) ++ ++/* ++** mulh_11_s8_z_tied1: ++** mov (z[0-9]+\.b), #11 ++** movprfx z0\.b, p0/z, z0\.b ++** smulh z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_11_s8_z_tied1, svint8_t, ++ z0 = svmulh_n_s8_z (p0, z0, 11), ++ z0 = svmulh_z (p0, z0, 11)) ++ ++/* ++** mulh_11_s8_z_untied: ++** mov (z[0-9]+\.b), #11 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** smulh z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** smulh z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_11_s8_z_untied, svint8_t, ++ z0 = svmulh_n_s8_z (p0, z1, 11), ++ z0 = svmulh_z (p0, z1, 11)) ++ ++/* ++** mulh_s8_x_tied1: ++** smulh z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_s8_x_tied1, svint8_t, ++ z0 = svmulh_s8_x (p0, z0, z1), ++ z0 = svmulh_x (p0, z0, z1)) ++ ++/* ++** mulh_s8_x_tied2: ++** smulh z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_s8_x_tied2, svint8_t, ++ z0 = svmulh_s8_x (p0, z1, z0), ++ z0 = svmulh_x (p0, z1, z0)) ++ ++/* ++** mulh_s8_x_untied: ++** ( ++** movprfx z0, z1 ++** smulh z0\.b, p0/m, z0\.b, z2\.b ++** | ++** movprfx z0, z2 ++** smulh z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_s8_x_untied, svint8_t, ++ z0 = svmulh_s8_x (p0, z1, z2), ++ z0 = svmulh_x (p0, z1, z2)) ++ ++/* ++** mulh_w0_s8_x_tied1: ++** mov (z[0-9]+\.b), w0 ++** smulh z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mulh_w0_s8_x_tied1, svint8_t, int8_t, ++ 
z0 = svmulh_n_s8_x (p0, z0, x0), ++ z0 = svmulh_x (p0, z0, x0)) ++ ++/* ++** mulh_w0_s8_x_untied: ++** mov z0\.b, w0 ++** smulh z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_ZX (mulh_w0_s8_x_untied, svint8_t, int8_t, ++ z0 = svmulh_n_s8_x (p0, z1, x0), ++ z0 = svmulh_x (p0, z1, x0)) ++ ++/* ++** mulh_11_s8_x_tied1: ++** mov (z[0-9]+\.b), #11 ++** smulh z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_11_s8_x_tied1, svint8_t, ++ z0 = svmulh_n_s8_x (p0, z0, 11), ++ z0 = svmulh_x (p0, z0, 11)) ++ ++/* ++** mulh_11_s8_x_untied: ++** mov z0\.b, #11 ++** smulh z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_11_s8_x_untied, svint8_t, ++ z0 = svmulh_n_s8_x (p0, z1, 11), ++ z0 = svmulh_x (p0, z1, 11)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulh_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulh_u16.c +new file mode 100644 +index 000000000..e9173eb24 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulh_u16.c +@@ -0,0 +1,237 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mulh_u16_m_tied1: ++** umulh z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_u16_m_tied1, svuint16_t, ++ z0 = svmulh_u16_m (p0, z0, z1), ++ z0 = svmulh_m (p0, z0, z1)) ++ ++/* ++** mulh_u16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** umulh z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_u16_m_tied2, svuint16_t, ++ z0 = svmulh_u16_m (p0, z1, z0), ++ z0 = svmulh_m (p0, z1, z0)) ++ ++/* ++** mulh_u16_m_untied: ++** movprfx z0, z1 ++** umulh z0\.h, p0/m, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_u16_m_untied, svuint16_t, ++ z0 = svmulh_u16_m (p0, z1, z2), ++ z0 = svmulh_m (p0, z1, z2)) ++ ++/* ++** mulh_w0_u16_m_tied1: ++** mov (z[0-9]+\.h), w0 ++** umulh z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mulh_w0_u16_m_tied1, svuint16_t, uint16_t, ++ z0 = svmulh_n_u16_m (p0, z0, x0), ++ z0 = svmulh_m (p0, z0, x0)) ++ ++/* ++** mulh_w0_u16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), w0 ++** movprfx z0, z1 ++** umulh z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mulh_w0_u16_m_untied, svuint16_t, uint16_t, ++ z0 = svmulh_n_u16_m (p0, z1, x0), ++ z0 = svmulh_m (p0, z1, x0)) ++ ++/* ++** mulh_11_u16_m_tied1: ++** mov (z[0-9]+\.h), #11 ++** umulh z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_11_u16_m_tied1, svuint16_t, ++ z0 = svmulh_n_u16_m (p0, z0, 11), ++ z0 = svmulh_m (p0, z0, 11)) ++ ++/* ++** mulh_11_u16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), #11 ++** movprfx z0, z1 ++** umulh z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_11_u16_m_untied, svuint16_t, ++ z0 = svmulh_n_u16_m (p0, z1, 11), ++ z0 = svmulh_m (p0, z1, 11)) ++ ++/* ++** mulh_u16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** umulh z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_u16_z_tied1, svuint16_t, ++ z0 = svmulh_u16_z (p0, z0, z1), ++ z0 = svmulh_z (p0, z0, z1)) ++ ++/* ++** mulh_u16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** umulh z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_u16_z_tied2, svuint16_t, ++ z0 = svmulh_u16_z (p0, z1, z0), ++ z0 = svmulh_z (p0, z1, z0)) ++ ++/* ++** mulh_u16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** umulh z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** umulh z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_u16_z_untied, svuint16_t, ++ z0 = svmulh_u16_z (p0, z1, z2), ++ z0 = svmulh_z (p0, z1, 
z2)) ++ ++/* ++** mulh_w0_u16_z_tied1: ++** mov (z[0-9]+\.h), w0 ++** movprfx z0\.h, p0/z, z0\.h ++** umulh z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mulh_w0_u16_z_tied1, svuint16_t, uint16_t, ++ z0 = svmulh_n_u16_z (p0, z0, x0), ++ z0 = svmulh_z (p0, z0, x0)) ++ ++/* ++** mulh_w0_u16_z_untied: ++** mov (z[0-9]+\.h), w0 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** umulh z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** umulh z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (mulh_w0_u16_z_untied, svuint16_t, uint16_t, ++ z0 = svmulh_n_u16_z (p0, z1, x0), ++ z0 = svmulh_z (p0, z1, x0)) ++ ++/* ++** mulh_11_u16_z_tied1: ++** mov (z[0-9]+\.h), #11 ++** movprfx z0\.h, p0/z, z0\.h ++** umulh z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_11_u16_z_tied1, svuint16_t, ++ z0 = svmulh_n_u16_z (p0, z0, 11), ++ z0 = svmulh_z (p0, z0, 11)) ++ ++/* ++** mulh_11_u16_z_untied: ++** mov (z[0-9]+\.h), #11 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** umulh z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** umulh z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_11_u16_z_untied, svuint16_t, ++ z0 = svmulh_n_u16_z (p0, z1, 11), ++ z0 = svmulh_z (p0, z1, 11)) ++ ++/* ++** mulh_u16_x_tied1: ++** umulh z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_u16_x_tied1, svuint16_t, ++ z0 = svmulh_u16_x (p0, z0, z1), ++ z0 = svmulh_x (p0, z0, z1)) ++ ++/* ++** mulh_u16_x_tied2: ++** umulh z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_u16_x_tied2, svuint16_t, ++ z0 = svmulh_u16_x (p0, z1, z0), ++ z0 = svmulh_x (p0, z1, z0)) ++ ++/* ++** mulh_u16_x_untied: ++** ( ++** movprfx z0, z1 ++** umulh z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0, z2 ++** umulh z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_u16_x_untied, svuint16_t, ++ z0 = svmulh_u16_x (p0, z1, z2), ++ z0 = svmulh_x (p0, z1, z2)) ++ ++/* ++** mulh_w0_u16_x_tied1: ++** mov (z[0-9]+\.h), w0 ++** umulh z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mulh_w0_u16_x_tied1, svuint16_t, uint16_t, ++ z0 = svmulh_n_u16_x (p0, z0, x0), ++ z0 = svmulh_x (p0, z0, x0)) ++ ++/* ++** mulh_w0_u16_x_untied: ++** mov z0\.h, w0 ++** umulh z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZX (mulh_w0_u16_x_untied, svuint16_t, uint16_t, ++ z0 = svmulh_n_u16_x (p0, z1, x0), ++ z0 = svmulh_x (p0, z1, x0)) ++ ++/* ++** mulh_11_u16_x_tied1: ++** mov (z[0-9]+\.h), #11 ++** umulh z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_11_u16_x_tied1, svuint16_t, ++ z0 = svmulh_n_u16_x (p0, z0, 11), ++ z0 = svmulh_x (p0, z0, 11)) ++ ++/* ++** mulh_11_u16_x_untied: ++** mov z0\.h, #11 ++** umulh z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_11_u16_x_untied, svuint16_t, ++ z0 = svmulh_n_u16_x (p0, z1, 11), ++ z0 = svmulh_x (p0, z1, 11)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulh_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulh_u32.c +new file mode 100644 +index 000000000..de1f24f09 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulh_u32.c +@@ -0,0 +1,237 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mulh_u32_m_tied1: ++** umulh z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_u32_m_tied1, svuint32_t, ++ z0 = svmulh_u32_m (p0, z0, z1), ++ z0 = svmulh_m (p0, z0, z1)) ++ ++/* ++** mulh_u32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** umulh z0\.s, p0/m, z0\.s, \1\.s 
++** ret ++*/ ++TEST_UNIFORM_Z (mulh_u32_m_tied2, svuint32_t, ++ z0 = svmulh_u32_m (p0, z1, z0), ++ z0 = svmulh_m (p0, z1, z0)) ++ ++/* ++** mulh_u32_m_untied: ++** movprfx z0, z1 ++** umulh z0\.s, p0/m, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_u32_m_untied, svuint32_t, ++ z0 = svmulh_u32_m (p0, z1, z2), ++ z0 = svmulh_m (p0, z1, z2)) ++ ++/* ++** mulh_w0_u32_m_tied1: ++** mov (z[0-9]+\.s), w0 ++** umulh z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mulh_w0_u32_m_tied1, svuint32_t, uint32_t, ++ z0 = svmulh_n_u32_m (p0, z0, x0), ++ z0 = svmulh_m (p0, z0, x0)) ++ ++/* ++** mulh_w0_u32_m_untied: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0, z1 ++** umulh z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mulh_w0_u32_m_untied, svuint32_t, uint32_t, ++ z0 = svmulh_n_u32_m (p0, z1, x0), ++ z0 = svmulh_m (p0, z1, x0)) ++ ++/* ++** mulh_11_u32_m_tied1: ++** mov (z[0-9]+\.s), #11 ++** umulh z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_11_u32_m_tied1, svuint32_t, ++ z0 = svmulh_n_u32_m (p0, z0, 11), ++ z0 = svmulh_m (p0, z0, 11)) ++ ++/* ++** mulh_11_u32_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.s), #11 ++** movprfx z0, z1 ++** umulh z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_11_u32_m_untied, svuint32_t, ++ z0 = svmulh_n_u32_m (p0, z1, 11), ++ z0 = svmulh_m (p0, z1, 11)) ++ ++/* ++** mulh_u32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** umulh z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_u32_z_tied1, svuint32_t, ++ z0 = svmulh_u32_z (p0, z0, z1), ++ z0 = svmulh_z (p0, z0, z1)) ++ ++/* ++** mulh_u32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** umulh z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_u32_z_tied2, svuint32_t, ++ z0 = svmulh_u32_z (p0, z1, z0), ++ z0 = svmulh_z (p0, z1, z0)) ++ ++/* ++** mulh_u32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** umulh z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** umulh z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_u32_z_untied, svuint32_t, ++ z0 = svmulh_u32_z (p0, z1, z2), ++ z0 = svmulh_z (p0, z1, z2)) ++ ++/* ++** mulh_w0_u32_z_tied1: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0\.s, p0/z, z0\.s ++** umulh z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mulh_w0_u32_z_tied1, svuint32_t, uint32_t, ++ z0 = svmulh_n_u32_z (p0, z0, x0), ++ z0 = svmulh_z (p0, z0, x0)) ++ ++/* ++** mulh_w0_u32_z_untied: ++** mov (z[0-9]+\.s), w0 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** umulh z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** umulh z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (mulh_w0_u32_z_untied, svuint32_t, uint32_t, ++ z0 = svmulh_n_u32_z (p0, z1, x0), ++ z0 = svmulh_z (p0, z1, x0)) ++ ++/* ++** mulh_11_u32_z_tied1: ++** mov (z[0-9]+\.s), #11 ++** movprfx z0\.s, p0/z, z0\.s ++** umulh z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_11_u32_z_tied1, svuint32_t, ++ z0 = svmulh_n_u32_z (p0, z0, 11), ++ z0 = svmulh_z (p0, z0, 11)) ++ ++/* ++** mulh_11_u32_z_untied: ++** mov (z[0-9]+\.s), #11 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** umulh z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** umulh z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_11_u32_z_untied, svuint32_t, ++ z0 = svmulh_n_u32_z (p0, z1, 11), ++ z0 = svmulh_z (p0, z1, 11)) ++ ++/* ++** mulh_u32_x_tied1: ++** umulh z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_u32_x_tied1, svuint32_t, ++ z0 = svmulh_u32_x (p0, z0, z1), ++ z0 = svmulh_x (p0, z0, z1)) ++ 
++/* ++** mulh_u32_x_tied2: ++** umulh z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_u32_x_tied2, svuint32_t, ++ z0 = svmulh_u32_x (p0, z1, z0), ++ z0 = svmulh_x (p0, z1, z0)) ++ ++/* ++** mulh_u32_x_untied: ++** ( ++** movprfx z0, z1 ++** umulh z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0, z2 ++** umulh z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_u32_x_untied, svuint32_t, ++ z0 = svmulh_u32_x (p0, z1, z2), ++ z0 = svmulh_x (p0, z1, z2)) ++ ++/* ++** mulh_w0_u32_x_tied1: ++** mov (z[0-9]+\.s), w0 ++** umulh z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mulh_w0_u32_x_tied1, svuint32_t, uint32_t, ++ z0 = svmulh_n_u32_x (p0, z0, x0), ++ z0 = svmulh_x (p0, z0, x0)) ++ ++/* ++** mulh_w0_u32_x_untied: ++** mov z0\.s, w0 ++** umulh z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZX (mulh_w0_u32_x_untied, svuint32_t, uint32_t, ++ z0 = svmulh_n_u32_x (p0, z1, x0), ++ z0 = svmulh_x (p0, z1, x0)) ++ ++/* ++** mulh_11_u32_x_tied1: ++** mov (z[0-9]+\.s), #11 ++** umulh z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_11_u32_x_tied1, svuint32_t, ++ z0 = svmulh_n_u32_x (p0, z0, 11), ++ z0 = svmulh_x (p0, z0, 11)) ++ ++/* ++** mulh_11_u32_x_untied: ++** mov z0\.s, #11 ++** umulh z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_11_u32_x_untied, svuint32_t, ++ z0 = svmulh_n_u32_x (p0, z1, 11), ++ z0 = svmulh_x (p0, z1, 11)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulh_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulh_u64.c +new file mode 100644 +index 000000000..0d7e12a7c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulh_u64.c +@@ -0,0 +1,237 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mulh_u64_m_tied1: ++** umulh z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_u64_m_tied1, svuint64_t, ++ z0 = svmulh_u64_m (p0, z0, z1), ++ z0 = svmulh_m (p0, z0, z1)) ++ ++/* ++** mulh_u64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** umulh z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_u64_m_tied2, svuint64_t, ++ z0 = svmulh_u64_m (p0, z1, z0), ++ z0 = svmulh_m (p0, z1, z0)) ++ ++/* ++** mulh_u64_m_untied: ++** movprfx z0, z1 ++** umulh z0\.d, p0/m, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_u64_m_untied, svuint64_t, ++ z0 = svmulh_u64_m (p0, z1, z2), ++ z0 = svmulh_m (p0, z1, z2)) ++ ++/* ++** mulh_x0_u64_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** umulh z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mulh_x0_u64_m_tied1, svuint64_t, uint64_t, ++ z0 = svmulh_n_u64_m (p0, z0, x0), ++ z0 = svmulh_m (p0, z0, x0)) ++ ++/* ++** mulh_x0_u64_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** umulh z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mulh_x0_u64_m_untied, svuint64_t, uint64_t, ++ z0 = svmulh_n_u64_m (p0, z1, x0), ++ z0 = svmulh_m (p0, z1, x0)) ++ ++/* ++** mulh_11_u64_m_tied1: ++** mov (z[0-9]+\.d), #11 ++** umulh z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_11_u64_m_tied1, svuint64_t, ++ z0 = svmulh_n_u64_m (p0, z0, 11), ++ z0 = svmulh_m (p0, z0, 11)) ++ ++/* ++** mulh_11_u64_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #11 ++** movprfx z0, z1 ++** umulh z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_11_u64_m_untied, svuint64_t, ++ z0 = svmulh_n_u64_m (p0, z1, 11), ++ z0 = svmulh_m (p0, z1, 11)) ++ ++/* ++** mulh_u64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** umulh z0\.d, p0/m, z0\.d, z1\.d ++** ret 
++*/ ++TEST_UNIFORM_Z (mulh_u64_z_tied1, svuint64_t, ++ z0 = svmulh_u64_z (p0, z0, z1), ++ z0 = svmulh_z (p0, z0, z1)) ++ ++/* ++** mulh_u64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** umulh z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_u64_z_tied2, svuint64_t, ++ z0 = svmulh_u64_z (p0, z1, z0), ++ z0 = svmulh_z (p0, z1, z0)) ++ ++/* ++** mulh_u64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** umulh z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** umulh z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_u64_z_untied, svuint64_t, ++ z0 = svmulh_u64_z (p0, z1, z2), ++ z0 = svmulh_z (p0, z1, z2)) ++ ++/* ++** mulh_x0_u64_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z0\.d ++** umulh z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mulh_x0_u64_z_tied1, svuint64_t, uint64_t, ++ z0 = svmulh_n_u64_z (p0, z0, x0), ++ z0 = svmulh_z (p0, z0, x0)) ++ ++/* ++** mulh_x0_u64_z_untied: ++** mov (z[0-9]+\.d), x0 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** umulh z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** umulh z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (mulh_x0_u64_z_untied, svuint64_t, uint64_t, ++ z0 = svmulh_n_u64_z (p0, z1, x0), ++ z0 = svmulh_z (p0, z1, x0)) ++ ++/* ++** mulh_11_u64_z_tied1: ++** mov (z[0-9]+\.d), #11 ++** movprfx z0\.d, p0/z, z0\.d ++** umulh z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_11_u64_z_tied1, svuint64_t, ++ z0 = svmulh_n_u64_z (p0, z0, 11), ++ z0 = svmulh_z (p0, z0, 11)) ++ ++/* ++** mulh_11_u64_z_untied: ++** mov (z[0-9]+\.d), #11 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** umulh z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** umulh z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_11_u64_z_untied, svuint64_t, ++ z0 = svmulh_n_u64_z (p0, z1, 11), ++ z0 = svmulh_z (p0, z1, 11)) ++ ++/* ++** mulh_u64_x_tied1: ++** umulh z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_u64_x_tied1, svuint64_t, ++ z0 = svmulh_u64_x (p0, z0, z1), ++ z0 = svmulh_x (p0, z0, z1)) ++ ++/* ++** mulh_u64_x_tied2: ++** umulh z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_u64_x_tied2, svuint64_t, ++ z0 = svmulh_u64_x (p0, z1, z0), ++ z0 = svmulh_x (p0, z1, z0)) ++ ++/* ++** mulh_u64_x_untied: ++** ( ++** movprfx z0, z1 ++** umulh z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0, z2 ++** umulh z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_u64_x_untied, svuint64_t, ++ z0 = svmulh_u64_x (p0, z1, z2), ++ z0 = svmulh_x (p0, z1, z2)) ++ ++/* ++** mulh_x0_u64_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** umulh z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mulh_x0_u64_x_tied1, svuint64_t, uint64_t, ++ z0 = svmulh_n_u64_x (p0, z0, x0), ++ z0 = svmulh_x (p0, z0, x0)) ++ ++/* ++** mulh_x0_u64_x_untied: ++** mov z0\.d, x0 ++** umulh z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (mulh_x0_u64_x_untied, svuint64_t, uint64_t, ++ z0 = svmulh_n_u64_x (p0, z1, x0), ++ z0 = svmulh_x (p0, z1, x0)) ++ ++/* ++** mulh_11_u64_x_tied1: ++** mov (z[0-9]+\.d), #11 ++** umulh z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_11_u64_x_tied1, svuint64_t, ++ z0 = svmulh_n_u64_x (p0, z0, 11), ++ z0 = svmulh_x (p0, z0, 11)) ++ ++/* ++** mulh_11_u64_x_untied: ++** mov z0\.d, #11 ++** umulh z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_11_u64_x_untied, svuint64_t, ++ z0 = svmulh_n_u64_x (p0, z1, 11), ++ z0 = svmulh_x (p0, z1, 11)) +diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulh_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulh_u8.c +new file mode 100644 +index 000000000..db7b1be1b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulh_u8.c +@@ -0,0 +1,237 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mulh_u8_m_tied1: ++** umulh z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_u8_m_tied1, svuint8_t, ++ z0 = svmulh_u8_m (p0, z0, z1), ++ z0 = svmulh_m (p0, z0, z1)) ++ ++/* ++** mulh_u8_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** umulh z0\.b, p0/m, z0\.b, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_u8_m_tied2, svuint8_t, ++ z0 = svmulh_u8_m (p0, z1, z0), ++ z0 = svmulh_m (p0, z1, z0)) ++ ++/* ++** mulh_u8_m_untied: ++** movprfx z0, z1 ++** umulh z0\.b, p0/m, z0\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_u8_m_untied, svuint8_t, ++ z0 = svmulh_u8_m (p0, z1, z2), ++ z0 = svmulh_m (p0, z1, z2)) ++ ++/* ++** mulh_w0_u8_m_tied1: ++** mov (z[0-9]+\.b), w0 ++** umulh z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mulh_w0_u8_m_tied1, svuint8_t, uint8_t, ++ z0 = svmulh_n_u8_m (p0, z0, x0), ++ z0 = svmulh_m (p0, z0, x0)) ++ ++/* ++** mulh_w0_u8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), w0 ++** movprfx z0, z1 ++** umulh z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mulh_w0_u8_m_untied, svuint8_t, uint8_t, ++ z0 = svmulh_n_u8_m (p0, z1, x0), ++ z0 = svmulh_m (p0, z1, x0)) ++ ++/* ++** mulh_11_u8_m_tied1: ++** mov (z[0-9]+\.b), #11 ++** umulh z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_11_u8_m_tied1, svuint8_t, ++ z0 = svmulh_n_u8_m (p0, z0, 11), ++ z0 = svmulh_m (p0, z0, 11)) ++ ++/* ++** mulh_11_u8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), #11 ++** movprfx z0, z1 ++** umulh z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_11_u8_m_untied, svuint8_t, ++ z0 = svmulh_n_u8_m (p0, z1, 11), ++ z0 = svmulh_m (p0, z1, 11)) ++ ++/* ++** mulh_u8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** umulh z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_u8_z_tied1, svuint8_t, ++ z0 = svmulh_u8_z (p0, z0, z1), ++ z0 = svmulh_z (p0, z0, z1)) ++ ++/* ++** mulh_u8_z_tied2: ++** movprfx z0\.b, p0/z, z0\.b ++** umulh z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_u8_z_tied2, svuint8_t, ++ z0 = svmulh_u8_z (p0, z1, z0), ++ z0 = svmulh_z (p0, z1, z0)) ++ ++/* ++** mulh_u8_z_untied: ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** umulh z0\.b, p0/m, z0\.b, z2\.b ++** | ++** movprfx z0\.b, p0/z, z2\.b ++** umulh z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_u8_z_untied, svuint8_t, ++ z0 = svmulh_u8_z (p0, z1, z2), ++ z0 = svmulh_z (p0, z1, z2)) ++ ++/* ++** mulh_w0_u8_z_tied1: ++** mov (z[0-9]+\.b), w0 ++** movprfx z0\.b, p0/z, z0\.b ++** umulh z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mulh_w0_u8_z_tied1, svuint8_t, uint8_t, ++ z0 = svmulh_n_u8_z (p0, z0, x0), ++ z0 = svmulh_z (p0, z0, x0)) ++ ++/* ++** mulh_w0_u8_z_untied: ++** mov (z[0-9]+\.b), w0 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** umulh z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** umulh z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (mulh_w0_u8_z_untied, svuint8_t, uint8_t, ++ z0 = svmulh_n_u8_z (p0, z1, x0), ++ z0 = svmulh_z (p0, z1, x0)) ++ ++/* ++** mulh_11_u8_z_tied1: ++** mov (z[0-9]+\.b), #11 ++** movprfx z0\.b, p0/z, z0\.b ++** umulh z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z 
(mulh_11_u8_z_tied1, svuint8_t, ++ z0 = svmulh_n_u8_z (p0, z0, 11), ++ z0 = svmulh_z (p0, z0, 11)) ++ ++/* ++** mulh_11_u8_z_untied: ++** mov (z[0-9]+\.b), #11 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** umulh z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** umulh z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_11_u8_z_untied, svuint8_t, ++ z0 = svmulh_n_u8_z (p0, z1, 11), ++ z0 = svmulh_z (p0, z1, 11)) ++ ++/* ++** mulh_u8_x_tied1: ++** umulh z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_u8_x_tied1, svuint8_t, ++ z0 = svmulh_u8_x (p0, z0, z1), ++ z0 = svmulh_x (p0, z0, z1)) ++ ++/* ++** mulh_u8_x_tied2: ++** umulh z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_u8_x_tied2, svuint8_t, ++ z0 = svmulh_u8_x (p0, z1, z0), ++ z0 = svmulh_x (p0, z1, z0)) ++ ++/* ++** mulh_u8_x_untied: ++** ( ++** movprfx z0, z1 ++** umulh z0\.b, p0/m, z0\.b, z2\.b ++** | ++** movprfx z0, z2 ++** umulh z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_u8_x_untied, svuint8_t, ++ z0 = svmulh_u8_x (p0, z1, z2), ++ z0 = svmulh_x (p0, z1, z2)) ++ ++/* ++** mulh_w0_u8_x_tied1: ++** mov (z[0-9]+\.b), w0 ++** umulh z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (mulh_w0_u8_x_tied1, svuint8_t, uint8_t, ++ z0 = svmulh_n_u8_x (p0, z0, x0), ++ z0 = svmulh_x (p0, z0, x0)) ++ ++/* ++** mulh_w0_u8_x_untied: ++** mov z0\.b, w0 ++** umulh z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_ZX (mulh_w0_u8_x_untied, svuint8_t, uint8_t, ++ z0 = svmulh_n_u8_x (p0, z1, x0), ++ z0 = svmulh_x (p0, z1, x0)) ++ ++/* ++** mulh_11_u8_x_tied1: ++** mov (z[0-9]+\.b), #11 ++** umulh z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_11_u8_x_tied1, svuint8_t, ++ z0 = svmulh_n_u8_x (p0, z0, 11), ++ z0 = svmulh_x (p0, z0, 11)) ++ ++/* ++** mulh_11_u8_x_untied: ++** mov z0\.b, #11 ++** umulh z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (mulh_11_u8_x_untied, svuint8_t, ++ z0 = svmulh_n_u8_x (p0, z1, 11), ++ z0 = svmulh_x (p0, z1, 11)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulx_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulx_f16.c +new file mode 100644 +index 000000000..ce02c3caa +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulx_f16.c +@@ -0,0 +1,472 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mulx_f16_m_tied1: ++** fmulx z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_f16_m_tied1, svfloat16_t, ++ z0 = svmulx_f16_m (p0, z0, z1), ++ z0 = svmulx_m (p0, z0, z1)) ++ ++/* ++** mulx_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fmulx z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_f16_m_tied2, svfloat16_t, ++ z0 = svmulx_f16_m (p0, z1, z0), ++ z0 = svmulx_m (p0, z1, z0)) ++ ++/* ++** mulx_f16_m_untied: ++** movprfx z0, z1 ++** fmulx z0\.h, p0/m, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_f16_m_untied, svfloat16_t, ++ z0 = svmulx_f16_m (p0, z1, z2), ++ z0 = svmulx_m (p0, z1, z2)) ++ ++/* ++** mulx_h4_f16_m_tied1: ++** mov (z[0-9]+\.h), h4 ++** fmulx z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mulx_h4_f16_m_tied1, svfloat16_t, __fp16, ++ z0 = svmulx_n_f16_m (p0, z0, d4), ++ z0 = svmulx_m (p0, z0, d4)) ++ ++/* ++** mulx_h4_f16_m_untied: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0, z1 ++** fmulx z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mulx_h4_f16_m_untied, svfloat16_t, __fp16, ++ z0 = svmulx_n_f16_m (p0, z1, d4), ++ z0 = svmulx_m 
(p0, z1, d4)) ++ ++/* ++** mulx_1_f16_m_tied1: ++** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? ++** fmulx z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_1_f16_m_tied1, svfloat16_t, ++ z0 = svmulx_n_f16_m (p0, z0, 1), ++ z0 = svmulx_m (p0, z0, 1)) ++ ++/* ++** mulx_1_f16_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? ++** movprfx z0, z1 ++** fmulx z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_1_f16_m_untied, svfloat16_t, ++ z0 = svmulx_n_f16_m (p0, z1, 1), ++ z0 = svmulx_m (p0, z1, 1)) ++ ++/* ++** mulx_0p5_f16_m_tied1: ++** fmov (z[0-9]+\.h), #(?:0\.5|5\.0e-1) ++** fmulx z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_0p5_f16_m_tied1, svfloat16_t, ++ z0 = svmulx_n_f16_m (p0, z0, 0.5), ++ z0 = svmulx_m (p0, z0, 0.5)) ++ ++/* ++** mulx_0p5_f16_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.h), #(?:0\.5|5\.0e-1) ++** movprfx z0, z1 ++** fmulx z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_0p5_f16_m_untied, svfloat16_t, ++ z0 = svmulx_n_f16_m (p0, z1, 0.5), ++ z0 = svmulx_m (p0, z1, 0.5)) ++ ++/* ++** mulx_2_f16_m_tied1: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** fmulx z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_2_f16_m_tied1, svfloat16_t, ++ z0 = svmulx_n_f16_m (p0, z0, 2), ++ z0 = svmulx_m (p0, z0, 2)) ++ ++/* ++** mulx_2_f16_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** movprfx z0, z1 ++** fmulx z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_2_f16_m_untied, svfloat16_t, ++ z0 = svmulx_n_f16_m (p0, z1, 2), ++ z0 = svmulx_m (p0, z1, 2)) ++ ++/* ++** mulx_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fmulx z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_f16_z_tied1, svfloat16_t, ++ z0 = svmulx_f16_z (p0, z0, z1), ++ z0 = svmulx_z (p0, z0, z1)) ++ ++/* ++** mulx_f16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** fmulx z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_f16_z_tied2, svfloat16_t, ++ z0 = svmulx_f16_z (p0, z1, z0), ++ z0 = svmulx_z (p0, z1, z0)) ++ ++/* ++** mulx_f16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fmulx z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** fmulx z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_f16_z_untied, svfloat16_t, ++ z0 = svmulx_f16_z (p0, z1, z2), ++ z0 = svmulx_z (p0, z1, z2)) ++ ++/* ++** mulx_h4_f16_z_tied1: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0\.h, p0/z, z0\.h ++** fmulx z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mulx_h4_f16_z_tied1, svfloat16_t, __fp16, ++ z0 = svmulx_n_f16_z (p0, z0, d4), ++ z0 = svmulx_z (p0, z0, d4)) ++ ++/* ++** mulx_h4_f16_z_untied: ++** mov (z[0-9]+\.h), h4 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fmulx z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** fmulx z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (mulx_h4_f16_z_untied, svfloat16_t, __fp16, ++ z0 = svmulx_n_f16_z (p0, z1, d4), ++ z0 = svmulx_z (p0, z1, d4)) ++ ++/* ++** mulx_1_f16_z_tied1: ++** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? ++** movprfx z0\.h, p0/z, z0\.h ++** fmulx z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_1_f16_z_tied1, svfloat16_t, ++ z0 = svmulx_n_f16_z (p0, z0, 1), ++ z0 = svmulx_z (p0, z0, 1)) ++ ++/* ++** mulx_1_f16_z_untied: ++** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? 
++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fmulx z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** fmulx z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_1_f16_z_untied, svfloat16_t, ++ z0 = svmulx_n_f16_z (p0, z1, 1), ++ z0 = svmulx_z (p0, z1, 1)) ++ ++/* ++** mulx_0p5_f16_z_tied1: ++** fmov (z[0-9]+\.h), #(?:0\.5|5\.0e-1) ++** movprfx z0\.h, p0/z, z0\.h ++** fmulx z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_0p5_f16_z_tied1, svfloat16_t, ++ z0 = svmulx_n_f16_z (p0, z0, 0.5), ++ z0 = svmulx_z (p0, z0, 0.5)) ++ ++/* ++** mulx_0p5_f16_z_untied: ++** fmov (z[0-9]+\.h), #(?:0\.5|5\.0e-1) ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fmulx z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** fmulx z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_0p5_f16_z_untied, svfloat16_t, ++ z0 = svmulx_n_f16_z (p0, z1, 0.5), ++ z0 = svmulx_z (p0, z1, 0.5)) ++ ++/* ++** mulx_2_f16_z_tied1: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** movprfx z0\.h, p0/z, z0\.h ++** fmulx z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_2_f16_z_tied1, svfloat16_t, ++ z0 = svmulx_n_f16_z (p0, z0, 2), ++ z0 = svmulx_z (p0, z0, 2)) ++ ++/* ++** mulx_2_f16_z_untied: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fmulx z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** fmulx z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_2_f16_z_untied, svfloat16_t, ++ z0 = svmulx_n_f16_z (p0, z1, 2), ++ z0 = svmulx_z (p0, z1, 2)) ++ ++/* ++** mulx_f16_x_tied1: ++** fmulx z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_f16_x_tied1, svfloat16_t, ++ z0 = svmulx_f16_x (p0, z0, z1), ++ z0 = svmulx_x (p0, z0, z1)) ++ ++/* ++** mulx_f16_x_tied2: ++** fmulx z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_f16_x_tied2, svfloat16_t, ++ z0 = svmulx_f16_x (p0, z1, z0), ++ z0 = svmulx_x (p0, z1, z0)) ++ ++/* ++** mulx_f16_x_untied: ++** ( ++** movprfx z0, z1 ++** fmulx z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0, z2 ++** fmulx z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_f16_x_untied, svfloat16_t, ++ z0 = svmulx_f16_x (p0, z1, z2), ++ z0 = svmulx_x (p0, z1, z2)) ++ ++/* ++** mulx_h4_f16_x_tied1: ++** mov (z[0-9]+\.h), h4 ++** fmulx z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mulx_h4_f16_x_tied1, svfloat16_t, __fp16, ++ z0 = svmulx_n_f16_x (p0, z0, d4), ++ z0 = svmulx_x (p0, z0, d4)) ++ ++/* ++** mulx_h4_f16_x_untied: { xfail *-*-* } ++** mov z0\.h, h4 ++** fmulx z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZD (mulx_h4_f16_x_untied, svfloat16_t, __fp16, ++ z0 = svmulx_n_f16_x (p0, z1, d4), ++ z0 = svmulx_x (p0, z1, d4)) ++ ++/* ++** mulx_1_f16_x_tied1: ++** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? ++** fmulx z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_1_f16_x_tied1, svfloat16_t, ++ z0 = svmulx_n_f16_x (p0, z0, 1), ++ z0 = svmulx_x (p0, z0, 1)) ++ ++/* ++** mulx_1_f16_x_untied: ++** fmov z0\.h, #1\.0(?:e\+0)? 
++** fmulx z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_1_f16_x_untied, svfloat16_t, ++ z0 = svmulx_n_f16_x (p0, z1, 1), ++ z0 = svmulx_x (p0, z1, 1)) ++ ++/* ++** mulx_0p5_f16_x_tied1: ++** fmov (z[0-9]+\.h), #(?:0\.5|5\.0e-1) ++** fmulx z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_0p5_f16_x_tied1, svfloat16_t, ++ z0 = svmulx_n_f16_x (p0, z0, 0.5), ++ z0 = svmulx_x (p0, z0, 0.5)) ++ ++/* ++** mulx_0p5_f16_x_untied: ++** fmov z0\.h, #(?:0\.5|5\.0e-1) ++** fmulx z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_0p5_f16_x_untied, svfloat16_t, ++ z0 = svmulx_n_f16_x (p0, z1, 0.5), ++ z0 = svmulx_x (p0, z1, 0.5)) ++ ++/* ++** mulx_2_f16_x_tied1: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** fmulx z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_2_f16_x_tied1, svfloat16_t, ++ z0 = svmulx_n_f16_x (p0, z0, 2), ++ z0 = svmulx_x (p0, z0, 2)) ++ ++/* ++** mulx_2_f16_x_untied: ++** fmov z0\.h, #2\.0(?:e\+0)? ++** fmulx z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_2_f16_x_untied, svfloat16_t, ++ z0 = svmulx_n_f16_x (p0, z1, 2), ++ z0 = svmulx_x (p0, z1, 2)) ++ ++/* ++** ptrue_mulx_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mulx_f16_x_tied1, svfloat16_t, ++ z0 = svmulx_f16_x (svptrue_b16 (), z0, z1), ++ z0 = svmulx_x (svptrue_b16 (), z0, z1)) ++ ++/* ++** ptrue_mulx_f16_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mulx_f16_x_tied2, svfloat16_t, ++ z0 = svmulx_f16_x (svptrue_b16 (), z1, z0), ++ z0 = svmulx_x (svptrue_b16 (), z1, z0)) ++ ++/* ++** ptrue_mulx_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mulx_f16_x_untied, svfloat16_t, ++ z0 = svmulx_f16_x (svptrue_b16 (), z1, z2), ++ z0 = svmulx_x (svptrue_b16 (), z1, z2)) ++ ++/* ++** ptrue_mulx_1_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mulx_1_f16_x_tied1, svfloat16_t, ++ z0 = svmulx_n_f16_x (svptrue_b16 (), z0, 1), ++ z0 = svmulx_x (svptrue_b16 (), z0, 1)) ++ ++/* ++** ptrue_mulx_1_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mulx_1_f16_x_untied, svfloat16_t, ++ z0 = svmulx_n_f16_x (svptrue_b16 (), z1, 1), ++ z0 = svmulx_x (svptrue_b16 (), z1, 1)) ++ ++/* ++** ptrue_mulx_0p5_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mulx_0p5_f16_x_tied1, svfloat16_t, ++ z0 = svmulx_n_f16_x (svptrue_b16 (), z0, 0.5), ++ z0 = svmulx_x (svptrue_b16 (), z0, 0.5)) ++ ++/* ++** ptrue_mulx_0p5_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mulx_0p5_f16_x_untied, svfloat16_t, ++ z0 = svmulx_n_f16_x (svptrue_b16 (), z1, 0.5), ++ z0 = svmulx_x (svptrue_b16 (), z1, 0.5)) ++ ++/* ++** ptrue_mulx_2_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mulx_2_f16_x_tied1, svfloat16_t, ++ z0 = svmulx_n_f16_x (svptrue_b16 (), z0, 2), ++ z0 = svmulx_x (svptrue_b16 (), z0, 2)) ++ ++/* ++** ptrue_mulx_2_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mulx_2_f16_x_untied, svfloat16_t, ++ z0 = svmulx_n_f16_x (svptrue_b16 (), z1, 2), ++ z0 = svmulx_x (svptrue_b16 (), z1, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulx_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulx_f32.c +new file mode 100644 +index 000000000..e0d369593 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulx_f32.c +@@ -0,0 +1,472 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mulx_f32_m_tied1: ++** fmulx z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_f32_m_tied1, svfloat32_t, ++ z0 = svmulx_f32_m (p0, z0, z1), ++ z0 = svmulx_m (p0, z0, z1)) ++ ++/* ++** mulx_f32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fmulx z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_f32_m_tied2, svfloat32_t, ++ z0 = svmulx_f32_m (p0, z1, z0), ++ z0 = svmulx_m (p0, z1, z0)) ++ ++/* ++** mulx_f32_m_untied: ++** movprfx z0, z1 ++** fmulx z0\.s, p0/m, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_f32_m_untied, svfloat32_t, ++ z0 = svmulx_f32_m (p0, z1, z2), ++ z0 = svmulx_m (p0, z1, z2)) ++ ++/* ++** mulx_s4_f32_m_tied1: ++** mov (z[0-9]+\.s), s4 ++** fmulx z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mulx_s4_f32_m_tied1, svfloat32_t, float, ++ z0 = svmulx_n_f32_m (p0, z0, d4), ++ z0 = svmulx_m (p0, z0, d4)) ++ ++/* ++** mulx_s4_f32_m_untied: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0, z1 ++** fmulx z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mulx_s4_f32_m_untied, svfloat32_t, float, ++ z0 = svmulx_n_f32_m (p0, z1, d4), ++ z0 = svmulx_m (p0, z1, d4)) ++ ++/* ++** mulx_1_f32_m_tied1: ++** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? ++** fmulx z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_1_f32_m_tied1, svfloat32_t, ++ z0 = svmulx_n_f32_m (p0, z0, 1), ++ z0 = svmulx_m (p0, z0, 1)) ++ ++/* ++** mulx_1_f32_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? ++** movprfx z0, z1 ++** fmulx z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_1_f32_m_untied, svfloat32_t, ++ z0 = svmulx_n_f32_m (p0, z1, 1), ++ z0 = svmulx_m (p0, z1, 1)) ++ ++/* ++** mulx_0p5_f32_m_tied1: ++** fmov (z[0-9]+\.s), #(?:0\.5|5\.0e-1) ++** fmulx z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_0p5_f32_m_tied1, svfloat32_t, ++ z0 = svmulx_n_f32_m (p0, z0, 0.5), ++ z0 = svmulx_m (p0, z0, 0.5)) ++ ++/* ++** mulx_0p5_f32_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.s), #(?:0\.5|5\.0e-1) ++** movprfx z0, z1 ++** fmulx z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_0p5_f32_m_untied, svfloat32_t, ++ z0 = svmulx_n_f32_m (p0, z1, 0.5), ++ z0 = svmulx_m (p0, z1, 0.5)) ++ ++/* ++** mulx_2_f32_m_tied1: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** fmulx z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_2_f32_m_tied1, svfloat32_t, ++ z0 = svmulx_n_f32_m (p0, z0, 2), ++ z0 = svmulx_m (p0, z0, 2)) ++ ++/* ++** mulx_2_f32_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
++** movprfx z0, z1 ++** fmulx z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_2_f32_m_untied, svfloat32_t, ++ z0 = svmulx_n_f32_m (p0, z1, 2), ++ z0 = svmulx_m (p0, z1, 2)) ++ ++/* ++** mulx_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fmulx z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_f32_z_tied1, svfloat32_t, ++ z0 = svmulx_f32_z (p0, z0, z1), ++ z0 = svmulx_z (p0, z0, z1)) ++ ++/* ++** mulx_f32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** fmulx z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_f32_z_tied2, svfloat32_t, ++ z0 = svmulx_f32_z (p0, z1, z0), ++ z0 = svmulx_z (p0, z1, z0)) ++ ++/* ++** mulx_f32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fmulx z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** fmulx z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_f32_z_untied, svfloat32_t, ++ z0 = svmulx_f32_z (p0, z1, z2), ++ z0 = svmulx_z (p0, z1, z2)) ++ ++/* ++** mulx_s4_f32_z_tied1: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0\.s, p0/z, z0\.s ++** fmulx z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mulx_s4_f32_z_tied1, svfloat32_t, float, ++ z0 = svmulx_n_f32_z (p0, z0, d4), ++ z0 = svmulx_z (p0, z0, d4)) ++ ++/* ++** mulx_s4_f32_z_untied: ++** mov (z[0-9]+\.s), s4 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fmulx z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** fmulx z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (mulx_s4_f32_z_untied, svfloat32_t, float, ++ z0 = svmulx_n_f32_z (p0, z1, d4), ++ z0 = svmulx_z (p0, z1, d4)) ++ ++/* ++** mulx_1_f32_z_tied1: ++** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? ++** movprfx z0\.s, p0/z, z0\.s ++** fmulx z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_1_f32_z_tied1, svfloat32_t, ++ z0 = svmulx_n_f32_z (p0, z0, 1), ++ z0 = svmulx_z (p0, z0, 1)) ++ ++/* ++** mulx_1_f32_z_untied: ++** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fmulx z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** fmulx z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_1_f32_z_untied, svfloat32_t, ++ z0 = svmulx_n_f32_z (p0, z1, 1), ++ z0 = svmulx_z (p0, z1, 1)) ++ ++/* ++** mulx_0p5_f32_z_tied1: ++** fmov (z[0-9]+\.s), #(?:0\.5|5\.0e-1) ++** movprfx z0\.s, p0/z, z0\.s ++** fmulx z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_0p5_f32_z_tied1, svfloat32_t, ++ z0 = svmulx_n_f32_z (p0, z0, 0.5), ++ z0 = svmulx_z (p0, z0, 0.5)) ++ ++/* ++** mulx_0p5_f32_z_untied: ++** fmov (z[0-9]+\.s), #(?:0\.5|5\.0e-1) ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fmulx z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** fmulx z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_0p5_f32_z_untied, svfloat32_t, ++ z0 = svmulx_n_f32_z (p0, z1, 0.5), ++ z0 = svmulx_z (p0, z1, 0.5)) ++ ++/* ++** mulx_2_f32_z_tied1: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** movprfx z0\.s, p0/z, z0\.s ++** fmulx z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_2_f32_z_tied1, svfloat32_t, ++ z0 = svmulx_n_f32_z (p0, z0, 2), ++ z0 = svmulx_z (p0, z0, 2)) ++ ++/* ++** mulx_2_f32_z_untied: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fmulx z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** fmulx z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_2_f32_z_untied, svfloat32_t, ++ z0 = svmulx_n_f32_z (p0, z1, 2), ++ z0 = svmulx_z (p0, z1, 2)) ++ ++/* ++** mulx_f32_x_tied1: ++** fmulx z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_f32_x_tied1, svfloat32_t, ++ z0 = svmulx_f32_x (p0, z0, z1), ++ z0 = svmulx_x (p0, z0, z1)) ++ ++/* ++** mulx_f32_x_tied2: ++** fmulx z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_f32_x_tied2, svfloat32_t, ++ z0 = svmulx_f32_x (p0, z1, z0), ++ z0 = svmulx_x (p0, z1, z0)) ++ ++/* ++** mulx_f32_x_untied: ++** ( ++** movprfx z0, z1 ++** fmulx z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0, z2 ++** fmulx z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_f32_x_untied, svfloat32_t, ++ z0 = svmulx_f32_x (p0, z1, z2), ++ z0 = svmulx_x (p0, z1, z2)) ++ ++/* ++** mulx_s4_f32_x_tied1: ++** mov (z[0-9]+\.s), s4 ++** fmulx z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mulx_s4_f32_x_tied1, svfloat32_t, float, ++ z0 = svmulx_n_f32_x (p0, z0, d4), ++ z0 = svmulx_x (p0, z0, d4)) ++ ++/* ++** mulx_s4_f32_x_untied: { xfail *-*-* } ++** mov z0\.s, s4 ++** fmulx z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZD (mulx_s4_f32_x_untied, svfloat32_t, float, ++ z0 = svmulx_n_f32_x (p0, z1, d4), ++ z0 = svmulx_x (p0, z1, d4)) ++ ++/* ++** mulx_1_f32_x_tied1: ++** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? ++** fmulx z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_1_f32_x_tied1, svfloat32_t, ++ z0 = svmulx_n_f32_x (p0, z0, 1), ++ z0 = svmulx_x (p0, z0, 1)) ++ ++/* ++** mulx_1_f32_x_untied: ++** fmov z0\.s, #1\.0(?:e\+0)? ++** fmulx z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_1_f32_x_untied, svfloat32_t, ++ z0 = svmulx_n_f32_x (p0, z1, 1), ++ z0 = svmulx_x (p0, z1, 1)) ++ ++/* ++** mulx_0p5_f32_x_tied1: ++** fmov (z[0-9]+\.s), #(?:0\.5|5\.0e-1) ++** fmulx z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_0p5_f32_x_tied1, svfloat32_t, ++ z0 = svmulx_n_f32_x (p0, z0, 0.5), ++ z0 = svmulx_x (p0, z0, 0.5)) ++ ++/* ++** mulx_0p5_f32_x_untied: ++** fmov z0\.s, #(?:0\.5|5\.0e-1) ++** fmulx z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_0p5_f32_x_untied, svfloat32_t, ++ z0 = svmulx_n_f32_x (p0, z1, 0.5), ++ z0 = svmulx_x (p0, z1, 0.5)) ++ ++/* ++** mulx_2_f32_x_tied1: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** fmulx z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_2_f32_x_tied1, svfloat32_t, ++ z0 = svmulx_n_f32_x (p0, z0, 2), ++ z0 = svmulx_x (p0, z0, 2)) ++ ++/* ++** mulx_2_f32_x_untied: ++** fmov z0\.s, #2\.0(?:e\+0)? ++** fmulx z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_2_f32_x_untied, svfloat32_t, ++ z0 = svmulx_n_f32_x (p0, z1, 2), ++ z0 = svmulx_x (p0, z1, 2)) ++ ++/* ++** ptrue_mulx_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mulx_f32_x_tied1, svfloat32_t, ++ z0 = svmulx_f32_x (svptrue_b32 (), z0, z1), ++ z0 = svmulx_x (svptrue_b32 (), z0, z1)) ++ ++/* ++** ptrue_mulx_f32_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mulx_f32_x_tied2, svfloat32_t, ++ z0 = svmulx_f32_x (svptrue_b32 (), z1, z0), ++ z0 = svmulx_x (svptrue_b32 (), z1, z0)) ++ ++/* ++** ptrue_mulx_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mulx_f32_x_untied, svfloat32_t, ++ z0 = svmulx_f32_x (svptrue_b32 (), z1, z2), ++ z0 = svmulx_x (svptrue_b32 (), z1, z2)) ++ ++/* ++** ptrue_mulx_1_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mulx_1_f32_x_tied1, svfloat32_t, ++ z0 = svmulx_n_f32_x (svptrue_b32 (), z0, 1), ++ z0 = svmulx_x (svptrue_b32 (), z0, 1)) ++ ++/* ++** ptrue_mulx_1_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mulx_1_f32_x_untied, svfloat32_t, ++ z0 = svmulx_n_f32_x (svptrue_b32 (), z1, 1), ++ z0 = svmulx_x (svptrue_b32 (), z1, 1)) ++ ++/* ++** ptrue_mulx_0p5_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mulx_0p5_f32_x_tied1, svfloat32_t, ++ z0 = svmulx_n_f32_x (svptrue_b32 (), z0, 0.5), ++ z0 = svmulx_x (svptrue_b32 (), z0, 0.5)) ++ ++/* ++** ptrue_mulx_0p5_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mulx_0p5_f32_x_untied, svfloat32_t, ++ z0 = svmulx_n_f32_x (svptrue_b32 (), z1, 0.5), ++ z0 = svmulx_x (svptrue_b32 (), z1, 0.5)) ++ ++/* ++** ptrue_mulx_2_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mulx_2_f32_x_tied1, svfloat32_t, ++ z0 = svmulx_n_f32_x (svptrue_b32 (), z0, 2), ++ z0 = svmulx_x (svptrue_b32 (), z0, 2)) ++ ++/* ++** ptrue_mulx_2_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mulx_2_f32_x_untied, svfloat32_t, ++ z0 = svmulx_n_f32_x (svptrue_b32 (), z1, 2), ++ z0 = svmulx_x (svptrue_b32 (), z1, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulx_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulx_f64.c +new file mode 100644 +index 000000000..6af5703ff +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulx_f64.c +@@ -0,0 +1,472 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** mulx_f64_m_tied1: ++** fmulx z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_f64_m_tied1, svfloat64_t, ++ z0 = svmulx_f64_m (p0, z0, z1), ++ z0 = svmulx_m (p0, z0, z1)) ++ ++/* ++** mulx_f64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fmulx z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_f64_m_tied2, svfloat64_t, ++ z0 = svmulx_f64_m (p0, z1, z0), ++ z0 = svmulx_m (p0, z1, z0)) ++ ++/* ++** mulx_f64_m_untied: ++** movprfx z0, z1 ++** fmulx z0\.d, p0/m, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_f64_m_untied, svfloat64_t, ++ z0 = svmulx_f64_m (p0, z1, z2), ++ z0 = svmulx_m (p0, z1, z2)) ++ ++/* ++** mulx_d4_f64_m_tied1: ++** mov (z[0-9]+\.d), d4 ++** fmulx z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mulx_d4_f64_m_tied1, svfloat64_t, double, ++ z0 = svmulx_n_f64_m (p0, z0, d4), ++ z0 = svmulx_m (p0, z0, d4)) ++ ++/* ++** mulx_d4_f64_m_untied: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0, z1 ++** fmulx z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mulx_d4_f64_m_untied, svfloat64_t, double, ++ z0 = svmulx_n_f64_m (p0, z1, d4), ++ z0 = svmulx_m (p0, z1, d4)) ++ ++/* ++** mulx_1_f64_m_tied1: ++** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? ++** fmulx z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_1_f64_m_tied1, svfloat64_t, ++ z0 = svmulx_n_f64_m (p0, z0, 1), ++ z0 = svmulx_m (p0, z0, 1)) ++ ++/* ++** mulx_1_f64_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? 
++** movprfx z0, z1 ++** fmulx z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_1_f64_m_untied, svfloat64_t, ++ z0 = svmulx_n_f64_m (p0, z1, 1), ++ z0 = svmulx_m (p0, z1, 1)) ++ ++/* ++** mulx_0p5_f64_m_tied1: ++** fmov (z[0-9]+\.d), #(?:0\.5|5\.0e-1) ++** fmulx z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_0p5_f64_m_tied1, svfloat64_t, ++ z0 = svmulx_n_f64_m (p0, z0, 0.5), ++ z0 = svmulx_m (p0, z0, 0.5)) ++ ++/* ++** mulx_0p5_f64_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.d), #(?:0\.5|5\.0e-1) ++** movprfx z0, z1 ++** fmulx z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_0p5_f64_m_untied, svfloat64_t, ++ z0 = svmulx_n_f64_m (p0, z1, 0.5), ++ z0 = svmulx_m (p0, z1, 0.5)) ++ ++/* ++** mulx_2_f64_m_tied1: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** fmulx z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_2_f64_m_tied1, svfloat64_t, ++ z0 = svmulx_n_f64_m (p0, z0, 2), ++ z0 = svmulx_m (p0, z0, 2)) ++ ++/* ++** mulx_2_f64_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** movprfx z0, z1 ++** fmulx z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_2_f64_m_untied, svfloat64_t, ++ z0 = svmulx_n_f64_m (p0, z1, 2), ++ z0 = svmulx_m (p0, z1, 2)) ++ ++/* ++** mulx_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fmulx z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_f64_z_tied1, svfloat64_t, ++ z0 = svmulx_f64_z (p0, z0, z1), ++ z0 = svmulx_z (p0, z0, z1)) ++ ++/* ++** mulx_f64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** fmulx z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_f64_z_tied2, svfloat64_t, ++ z0 = svmulx_f64_z (p0, z1, z0), ++ z0 = svmulx_z (p0, z1, z0)) ++ ++/* ++** mulx_f64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fmulx z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** fmulx z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_f64_z_untied, svfloat64_t, ++ z0 = svmulx_f64_z (p0, z1, z2), ++ z0 = svmulx_z (p0, z1, z2)) ++ ++/* ++** mulx_d4_f64_z_tied1: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0\.d, p0/z, z0\.d ++** fmulx z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mulx_d4_f64_z_tied1, svfloat64_t, double, ++ z0 = svmulx_n_f64_z (p0, z0, d4), ++ z0 = svmulx_z (p0, z0, d4)) ++ ++/* ++** mulx_d4_f64_z_untied: ++** mov (z[0-9]+\.d), d4 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fmulx z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** fmulx z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (mulx_d4_f64_z_untied, svfloat64_t, double, ++ z0 = svmulx_n_f64_z (p0, z1, d4), ++ z0 = svmulx_z (p0, z1, d4)) ++ ++/* ++** mulx_1_f64_z_tied1: ++** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? ++** movprfx z0\.d, p0/z, z0\.d ++** fmulx z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_1_f64_z_tied1, svfloat64_t, ++ z0 = svmulx_n_f64_z (p0, z0, 1), ++ z0 = svmulx_z (p0, z0, 1)) ++ ++/* ++** mulx_1_f64_z_untied: ++** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? 
++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fmulx z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** fmulx z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_1_f64_z_untied, svfloat64_t, ++ z0 = svmulx_n_f64_z (p0, z1, 1), ++ z0 = svmulx_z (p0, z1, 1)) ++ ++/* ++** mulx_0p5_f64_z_tied1: ++** fmov (z[0-9]+\.d), #(?:0\.5|5\.0e-1) ++** movprfx z0\.d, p0/z, z0\.d ++** fmulx z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_0p5_f64_z_tied1, svfloat64_t, ++ z0 = svmulx_n_f64_z (p0, z0, 0.5), ++ z0 = svmulx_z (p0, z0, 0.5)) ++ ++/* ++** mulx_0p5_f64_z_untied: ++** fmov (z[0-9]+\.d), #(?:0\.5|5\.0e-1) ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fmulx z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** fmulx z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_0p5_f64_z_untied, svfloat64_t, ++ z0 = svmulx_n_f64_z (p0, z1, 0.5), ++ z0 = svmulx_z (p0, z1, 0.5)) ++ ++/* ++** mulx_2_f64_z_tied1: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** movprfx z0\.d, p0/z, z0\.d ++** fmulx z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_2_f64_z_tied1, svfloat64_t, ++ z0 = svmulx_n_f64_z (p0, z0, 2), ++ z0 = svmulx_z (p0, z0, 2)) ++ ++/* ++** mulx_2_f64_z_untied: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fmulx z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** fmulx z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_2_f64_z_untied, svfloat64_t, ++ z0 = svmulx_n_f64_z (p0, z1, 2), ++ z0 = svmulx_z (p0, z1, 2)) ++ ++/* ++** mulx_f64_x_tied1: ++** fmulx z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_f64_x_tied1, svfloat64_t, ++ z0 = svmulx_f64_x (p0, z0, z1), ++ z0 = svmulx_x (p0, z0, z1)) ++ ++/* ++** mulx_f64_x_tied2: ++** fmulx z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_f64_x_tied2, svfloat64_t, ++ z0 = svmulx_f64_x (p0, z1, z0), ++ z0 = svmulx_x (p0, z1, z0)) ++ ++/* ++** mulx_f64_x_untied: ++** ( ++** movprfx z0, z1 ++** fmulx z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0, z2 ++** fmulx z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_f64_x_untied, svfloat64_t, ++ z0 = svmulx_f64_x (p0, z1, z2), ++ z0 = svmulx_x (p0, z1, z2)) ++ ++/* ++** mulx_d4_f64_x_tied1: ++** mov (z[0-9]+\.d), d4 ++** fmulx z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (mulx_d4_f64_x_tied1, svfloat64_t, double, ++ z0 = svmulx_n_f64_x (p0, z0, d4), ++ z0 = svmulx_x (p0, z0, d4)) ++ ++/* ++** mulx_d4_f64_x_untied: { xfail *-*-* } ++** mov z0\.d, d4 ++** fmulx z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZD (mulx_d4_f64_x_untied, svfloat64_t, double, ++ z0 = svmulx_n_f64_x (p0, z1, d4), ++ z0 = svmulx_x (p0, z1, d4)) ++ ++/* ++** mulx_1_f64_x_tied1: ++** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? ++** fmulx z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_1_f64_x_tied1, svfloat64_t, ++ z0 = svmulx_n_f64_x (p0, z0, 1), ++ z0 = svmulx_x (p0, z0, 1)) ++ ++/* ++** mulx_1_f64_x_untied: ++** fmov z0\.d, #1\.0(?:e\+0)? 
++** fmulx z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_1_f64_x_untied, svfloat64_t, ++ z0 = svmulx_n_f64_x (p0, z1, 1), ++ z0 = svmulx_x (p0, z1, 1)) ++ ++/* ++** mulx_0p5_f64_x_tied1: ++** fmov (z[0-9]+\.d), #(?:0\.5|5\.0e-1) ++** fmulx z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_0p5_f64_x_tied1, svfloat64_t, ++ z0 = svmulx_n_f64_x (p0, z0, 0.5), ++ z0 = svmulx_x (p0, z0, 0.5)) ++ ++/* ++** mulx_0p5_f64_x_untied: ++** fmov z0\.d, #(?:0\.5|5\.0e-1) ++** fmulx z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_0p5_f64_x_untied, svfloat64_t, ++ z0 = svmulx_n_f64_x (p0, z1, 0.5), ++ z0 = svmulx_x (p0, z1, 0.5)) ++ ++/* ++** mulx_2_f64_x_tied1: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** fmulx z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_2_f64_x_tied1, svfloat64_t, ++ z0 = svmulx_n_f64_x (p0, z0, 2), ++ z0 = svmulx_x (p0, z0, 2)) ++ ++/* ++** mulx_2_f64_x_untied: ++** fmov z0\.d, #2\.0(?:e\+0)? ++** fmulx z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (mulx_2_f64_x_untied, svfloat64_t, ++ z0 = svmulx_n_f64_x (p0, z1, 2), ++ z0 = svmulx_x (p0, z1, 2)) ++ ++/* ++** ptrue_mulx_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mulx_f64_x_tied1, svfloat64_t, ++ z0 = svmulx_f64_x (svptrue_b64 (), z0, z1), ++ z0 = svmulx_x (svptrue_b64 (), z0, z1)) ++ ++/* ++** ptrue_mulx_f64_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mulx_f64_x_tied2, svfloat64_t, ++ z0 = svmulx_f64_x (svptrue_b64 (), z1, z0), ++ z0 = svmulx_x (svptrue_b64 (), z1, z0)) ++ ++/* ++** ptrue_mulx_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mulx_f64_x_untied, svfloat64_t, ++ z0 = svmulx_f64_x (svptrue_b64 (), z1, z2), ++ z0 = svmulx_x (svptrue_b64 (), z1, z2)) ++ ++/* ++** ptrue_mulx_1_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mulx_1_f64_x_tied1, svfloat64_t, ++ z0 = svmulx_n_f64_x (svptrue_b64 (), z0, 1), ++ z0 = svmulx_x (svptrue_b64 (), z0, 1)) ++ ++/* ++** ptrue_mulx_1_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mulx_1_f64_x_untied, svfloat64_t, ++ z0 = svmulx_n_f64_x (svptrue_b64 (), z1, 1), ++ z0 = svmulx_x (svptrue_b64 (), z1, 1)) ++ ++/* ++** ptrue_mulx_0p5_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mulx_0p5_f64_x_tied1, svfloat64_t, ++ z0 = svmulx_n_f64_x (svptrue_b64 (), z0, 0.5), ++ z0 = svmulx_x (svptrue_b64 (), z0, 0.5)) ++ ++/* ++** ptrue_mulx_0p5_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mulx_0p5_f64_x_untied, svfloat64_t, ++ z0 = svmulx_n_f64_x (svptrue_b64 (), z1, 0.5), ++ z0 = svmulx_x (svptrue_b64 (), z1, 0.5)) ++ ++/* ++** ptrue_mulx_2_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mulx_2_f64_x_tied1, svfloat64_t, ++ z0 = svmulx_n_f64_x (svptrue_b64 (), z0, 2), ++ z0 = svmulx_x (svptrue_b64 (), z0, 2)) ++ ++/* ++** ptrue_mulx_2_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_mulx_2_f64_x_untied, svfloat64_t, ++ z0 = svmulx_n_f64_x (svptrue_b64 (), z1, 2), ++ z0 = svmulx_x (svptrue_b64 (), z1, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nand_b.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nand_b.c +new file mode 100644 +index 000000000..c306b80c7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nand_b.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** nand_b_z_tied1: ++** nand p0\.b, p3/z, p0\.b, p1\.b ++** ret ++*/ ++TEST_UNIFORM_P (nand_b_z_tied1, ++ p0 = svnand_b_z (p3, p0, p1), ++ p0 = svnand_z (p3, p0, p1)) ++ ++/* ++** nand_b_z_tied2: ++** nand p0\.b, p3/z, p1\.b, p0\.b ++** ret ++*/ ++TEST_UNIFORM_P (nand_b_z_tied2, ++ p0 = svnand_b_z (p3, p1, p0), ++ p0 = svnand_z (p3, p1, p0)) ++ ++/* ++** nand_b_z_untied: ++** nand p0\.b, p3/z, p1\.b, p2\.b ++** ret ++*/ ++TEST_UNIFORM_P (nand_b_z_untied, ++ p0 = svnand_b_z (p3, p1, p2), ++ p0 = svnand_z (p3, p1, p2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/neg_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/neg_f16.c +new file mode 100644 +index 000000000..c31eba922 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/neg_f16.c +@@ -0,0 +1,103 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** neg_f16_m_tied12: ++** fneg z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (neg_f16_m_tied12, svfloat16_t, ++ z0 = svneg_f16_m (z0, p0, z0), ++ z0 = svneg_m (z0, p0, z0)) ++ ++/* ++** neg_f16_m_tied1: ++** fneg z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (neg_f16_m_tied1, svfloat16_t, ++ z0 = svneg_f16_m (z0, p0, z1), ++ z0 = svneg_m (z0, p0, z1)) ++ ++/* ++** neg_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fneg z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (neg_f16_m_tied2, svfloat16_t, ++ z0 = svneg_f16_m (z1, p0, z0), ++ z0 = svneg_m (z1, p0, z0)) ++ ++/* ++** neg_f16_m_untied: ++** movprfx z0, z2 ++** fneg z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (neg_f16_m_untied, svfloat16_t, ++ z0 = svneg_f16_m (z2, p0, z1), ++ z0 = svneg_m (z2, p0, z1)) ++ ++/* ++** neg_f16_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.h, p0/z, \1\.h ++** fneg z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (neg_f16_z_tied1, svfloat16_t, ++ z0 = svneg_f16_z (p0, z0), ++ z0 = svneg_z (p0, z0)) ++ ++/* ++** neg_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** fneg z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (neg_f16_z_untied, svfloat16_t, ++ z0 = svneg_f16_z (p0, z1), ++ z0 = svneg_z (p0, z1)) ++ ++/* ++** neg_f16_x_tied1: ++** fneg z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (neg_f16_x_tied1, svfloat16_t, ++ z0 = svneg_f16_x (p0, z0), ++ z0 = svneg_x (p0, z0)) ++ ++/* ++** neg_f16_x_untied: ++** fneg z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (neg_f16_x_untied, svfloat16_t, ++ z0 = svneg_f16_x (p0, z1), ++ z0 = svneg_x (p0, z1)) ++ ++/* ++** ptrue_neg_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_neg_f16_x_tied1, svfloat16_t, ++ z0 = svneg_f16_x (svptrue_b16 (), z0), ++ z0 = svneg_x (svptrue_b16 (), z0)) ++ ++/* ++** ptrue_neg_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_neg_f16_x_untied, svfloat16_t, ++ z0 = svneg_f16_x (svptrue_b16 (), z1), ++ z0 = svneg_x (svptrue_b16 (), z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/neg_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/neg_f32.c +new file mode 100644 +index 000000000..a57d264ad +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/neg_f32.c +@@ -0,0 +1,103 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** neg_f32_m_tied12: ++** fneg z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (neg_f32_m_tied12, svfloat32_t, ++ z0 = svneg_f32_m (z0, p0, z0), ++ z0 = svneg_m (z0, p0, z0)) ++ ++/* ++** neg_f32_m_tied1: ++** fneg z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (neg_f32_m_tied1, svfloat32_t, ++ z0 = svneg_f32_m (z0, p0, z1), ++ z0 = svneg_m (z0, p0, z1)) ++ ++/* ++** neg_f32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fneg z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (neg_f32_m_tied2, svfloat32_t, ++ z0 = svneg_f32_m (z1, p0, z0), ++ z0 = svneg_m (z1, p0, z0)) ++ ++/* ++** neg_f32_m_untied: ++** movprfx z0, z2 ++** fneg z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (neg_f32_m_untied, svfloat32_t, ++ z0 = svneg_f32_m (z2, p0, z1), ++ z0 = svneg_m (z2, p0, z1)) ++ ++/* ++** neg_f32_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, \1\.s ++** fneg z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (neg_f32_z_tied1, svfloat32_t, ++ z0 = svneg_f32_z (p0, z0), ++ z0 = svneg_z (p0, z0)) ++ ++/* ++** neg_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** fneg z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (neg_f32_z_untied, svfloat32_t, ++ z0 = svneg_f32_z (p0, z1), ++ z0 = svneg_z (p0, z1)) ++ ++/* ++** neg_f32_x_tied1: ++** fneg z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (neg_f32_x_tied1, svfloat32_t, ++ z0 = svneg_f32_x (p0, z0), ++ z0 = svneg_x (p0, z0)) ++ ++/* ++** neg_f32_x_untied: ++** fneg z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (neg_f32_x_untied, svfloat32_t, ++ z0 = svneg_f32_x (p0, z1), ++ z0 = svneg_x (p0, z1)) ++ ++/* ++** ptrue_neg_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_neg_f32_x_tied1, svfloat32_t, ++ z0 = svneg_f32_x (svptrue_b32 (), z0), ++ z0 = svneg_x (svptrue_b32 (), z0)) ++ ++/* ++** ptrue_neg_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_neg_f32_x_untied, svfloat32_t, ++ z0 = svneg_f32_x (svptrue_b32 (), z1), ++ z0 = svneg_x (svptrue_b32 (), z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/neg_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/neg_f64.c +new file mode 100644 +index 000000000..90cadd4f9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/neg_f64.c +@@ -0,0 +1,103 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** neg_f64_m_tied12: ++** fneg z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (neg_f64_m_tied12, svfloat64_t, ++ z0 = svneg_f64_m (z0, p0, z0), ++ z0 = svneg_m (z0, p0, z0)) ++ ++/* ++** neg_f64_m_tied1: ++** fneg z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (neg_f64_m_tied1, svfloat64_t, ++ z0 = svneg_f64_m (z0, p0, z1), ++ z0 = svneg_m (z0, p0, z1)) ++ ++/* ++** neg_f64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fneg z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (neg_f64_m_tied2, svfloat64_t, ++ z0 = svneg_f64_m (z1, p0, z0), ++ z0 = svneg_m (z1, p0, z0)) ++ ++/* ++** neg_f64_m_untied: ++** movprfx z0, z2 ++** fneg z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (neg_f64_m_untied, svfloat64_t, ++ z0 = svneg_f64_m (z2, p0, z1), ++ z0 = svneg_m (z2, p0, z1)) ++ ++/* ++** neg_f64_z_tied1: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** fneg z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (neg_f64_z_tied1, svfloat64_t, ++ z0 = svneg_f64_z (p0, z0), ++ z0 = svneg_z (p0, z0)) ++ ++/* ++** neg_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** fneg z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (neg_f64_z_untied, svfloat64_t, ++ z0 = svneg_f64_z (p0, z1), ++ z0 = svneg_z (p0, z1)) ++ ++/* ++** neg_f64_x_tied1: ++** fneg z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (neg_f64_x_tied1, svfloat64_t, ++ z0 = svneg_f64_x (p0, z0), ++ z0 = svneg_x (p0, z0)) ++ ++/* ++** neg_f64_x_untied: ++** fneg z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (neg_f64_x_untied, svfloat64_t, ++ z0 = svneg_f64_x (p0, z1), ++ z0 = svneg_x (p0, z1)) ++ ++/* ++** ptrue_neg_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_neg_f64_x_tied1, svfloat64_t, ++ z0 = svneg_f64_x (svptrue_b64 (), z0), ++ z0 = svneg_x (svptrue_b64 (), z0)) ++ ++/* ++** ptrue_neg_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_neg_f64_x_untied, svfloat64_t, ++ z0 = svneg_f64_x (svptrue_b64 (), z1), ++ z0 = svneg_x (svptrue_b64 (), z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/neg_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/neg_s16.c +new file mode 100644 +index 000000000..80b2ee0f7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/neg_s16.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** neg_s16_m_tied12: ++** neg z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (neg_s16_m_tied12, svint16_t, ++ z0 = svneg_s16_m (z0, p0, z0), ++ z0 = svneg_m (z0, p0, z0)) ++ ++/* ++** neg_s16_m_tied1: ++** neg z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (neg_s16_m_tied1, svint16_t, ++ z0 = svneg_s16_m (z0, p0, z1), ++ z0 = svneg_m (z0, p0, z1)) ++ ++/* ++** neg_s16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** neg z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (neg_s16_m_tied2, svint16_t, ++ z0 = svneg_s16_m (z1, p0, z0), ++ z0 = svneg_m (z1, p0, z0)) ++ ++/* ++** neg_s16_m_untied: ++** movprfx z0, z2 ++** neg z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (neg_s16_m_untied, svint16_t, ++ z0 = svneg_s16_m (z2, p0, z1), ++ z0 = svneg_m (z2, p0, z1)) ++ ++/* ++** neg_s16_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.h, p0/z, \1\.h ++** neg z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (neg_s16_z_tied1, svint16_t, ++ z0 = svneg_s16_z (p0, z0), ++ z0 = svneg_z (p0, z0)) ++ ++/* ++** neg_s16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** neg z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (neg_s16_z_untied, svint16_t, ++ z0 = svneg_s16_z (p0, z1), ++ z0 = svneg_z (p0, z1)) ++ ++/* ++** neg_s16_x_tied1: ++** neg z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (neg_s16_x_tied1, svint16_t, ++ z0 = svneg_s16_x (p0, z0), ++ z0 = svneg_x (p0, z0)) ++ ++/* ++** neg_s16_x_untied: ++** neg z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (neg_s16_x_untied, svint16_t, ++ z0 = svneg_s16_x (p0, z1), ++ z0 = svneg_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/neg_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/neg_s32.c +new file mode 100644 +index 000000000..b8805034e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/neg_s32.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** neg_s32_m_tied12: ++** neg z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (neg_s32_m_tied12, svint32_t, ++ z0 = svneg_s32_m (z0, p0, z0), ++ z0 = svneg_m (z0, p0, z0)) ++ ++/* ++** neg_s32_m_tied1: ++** neg z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (neg_s32_m_tied1, svint32_t, ++ z0 = svneg_s32_m (z0, p0, z1), ++ z0 = svneg_m (z0, p0, z1)) ++ ++/* ++** neg_s32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** neg z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (neg_s32_m_tied2, svint32_t, ++ z0 = svneg_s32_m (z1, p0, z0), ++ z0 = svneg_m (z1, p0, z0)) ++ ++/* ++** neg_s32_m_untied: ++** movprfx z0, z2 ++** neg z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (neg_s32_m_untied, svint32_t, ++ z0 = svneg_s32_m (z2, p0, z1), ++ z0 = svneg_m (z2, p0, z1)) ++ ++/* ++** neg_s32_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, \1\.s ++** neg z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (neg_s32_z_tied1, svint32_t, ++ z0 = svneg_s32_z (p0, z0), ++ z0 = svneg_z (p0, z0)) ++ ++/* ++** 
neg_s32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** neg z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (neg_s32_z_untied, svint32_t, ++ z0 = svneg_s32_z (p0, z1), ++ z0 = svneg_z (p0, z1)) ++ ++/* ++** neg_s32_x_tied1: ++** neg z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (neg_s32_x_tied1, svint32_t, ++ z0 = svneg_s32_x (p0, z0), ++ z0 = svneg_x (p0, z0)) ++ ++/* ++** neg_s32_x_untied: ++** neg z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (neg_s32_x_untied, svint32_t, ++ z0 = svneg_s32_x (p0, z1), ++ z0 = svneg_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/neg_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/neg_s64.c +new file mode 100644 +index 000000000..82abe6723 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/neg_s64.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** neg_s64_m_tied12: ++** neg z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (neg_s64_m_tied12, svint64_t, ++ z0 = svneg_s64_m (z0, p0, z0), ++ z0 = svneg_m (z0, p0, z0)) ++ ++/* ++** neg_s64_m_tied1: ++** neg z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (neg_s64_m_tied1, svint64_t, ++ z0 = svneg_s64_m (z0, p0, z1), ++ z0 = svneg_m (z0, p0, z1)) ++ ++/* ++** neg_s64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** neg z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (neg_s64_m_tied2, svint64_t, ++ z0 = svneg_s64_m (z1, p0, z0), ++ z0 = svneg_m (z1, p0, z0)) ++ ++/* ++** neg_s64_m_untied: ++** movprfx z0, z2 ++** neg z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (neg_s64_m_untied, svint64_t, ++ z0 = svneg_s64_m (z2, p0, z1), ++ z0 = svneg_m (z2, p0, z1)) ++ ++/* ++** neg_s64_z_tied1: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** neg z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (neg_s64_z_tied1, svint64_t, ++ z0 = svneg_s64_z (p0, z0), ++ z0 = svneg_z (p0, z0)) ++ ++/* ++** neg_s64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** neg z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (neg_s64_z_untied, svint64_t, ++ z0 = svneg_s64_z (p0, z1), ++ z0 = svneg_z (p0, z1)) ++ ++/* ++** neg_s64_x_tied1: ++** neg z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (neg_s64_x_tied1, svint64_t, ++ z0 = svneg_s64_x (p0, z0), ++ z0 = svneg_x (p0, z0)) ++ ++/* ++** neg_s64_x_untied: ++** neg z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (neg_s64_x_untied, svint64_t, ++ z0 = svneg_s64_x (p0, z1), ++ z0 = svneg_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/neg_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/neg_s8.c +new file mode 100644 +index 000000000..b7c9949ad +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/neg_s8.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** neg_s8_m_tied12: ++** neg z0\.b, p0/m, z0\.b ++** ret ++*/ ++TEST_UNIFORM_Z (neg_s8_m_tied12, svint8_t, ++ z0 = svneg_s8_m (z0, p0, z0), ++ z0 = svneg_m (z0, p0, z0)) ++ ++/* ++** neg_s8_m_tied1: ++** neg z0\.b, p0/m, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (neg_s8_m_tied1, svint8_t, ++ z0 = svneg_s8_m (z0, p0, z1), ++ z0 = svneg_m (z0, p0, z1)) ++ ++/* ++** neg_s8_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** neg z0\.b, p0/m, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (neg_s8_m_tied2, svint8_t, ++ z0 = svneg_s8_m (z1, p0, z0), ++ z0 = svneg_m (z1, p0, z0)) ++ ++/* ++** neg_s8_m_untied: ++** movprfx z0, z2 ++** neg z0\.b, p0/m, z1\.b ++** ret ++*/ 
++TEST_UNIFORM_Z (neg_s8_m_untied, svint8_t, ++ z0 = svneg_s8_m (z2, p0, z1), ++ z0 = svneg_m (z2, p0, z1)) ++ ++/* ++** neg_s8_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.b, p0/z, \1\.b ++** neg z0\.b, p0/m, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (neg_s8_z_tied1, svint8_t, ++ z0 = svneg_s8_z (p0, z0), ++ z0 = svneg_z (p0, z0)) ++ ++/* ++** neg_s8_z_untied: ++** movprfx z0\.b, p0/z, z1\.b ++** neg z0\.b, p0/m, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (neg_s8_z_untied, svint8_t, ++ z0 = svneg_s8_z (p0, z1), ++ z0 = svneg_z (p0, z1)) ++ ++/* ++** neg_s8_x_tied1: ++** neg z0\.b, p0/m, z0\.b ++** ret ++*/ ++TEST_UNIFORM_Z (neg_s8_x_tied1, svint8_t, ++ z0 = svneg_s8_x (p0, z0), ++ z0 = svneg_x (p0, z0)) ++ ++/* ++** neg_s8_x_untied: ++** neg z0\.b, p0/m, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (neg_s8_x_untied, svint8_t, ++ z0 = svneg_s8_x (p0, z1), ++ z0 = svneg_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmad_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmad_f16.c +new file mode 100644 +index 000000000..abfe0a0c0 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmad_f16.c +@@ -0,0 +1,398 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** nmad_f16_m_tied1: ++** fnmad z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_f16_m_tied1, svfloat16_t, ++ z0 = svnmad_f16_m (p0, z0, z1, z2), ++ z0 = svnmad_m (p0, z0, z1, z2)) ++ ++/* ++** nmad_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fnmad z0\.h, p0/m, \1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_f16_m_tied2, svfloat16_t, ++ z0 = svnmad_f16_m (p0, z1, z0, z2), ++ z0 = svnmad_m (p0, z1, z0, z2)) ++ ++/* ++** nmad_f16_m_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fnmad z0\.h, p0/m, z2\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_f16_m_tied3, svfloat16_t, ++ z0 = svnmad_f16_m (p0, z1, z2, z0), ++ z0 = svnmad_m (p0, z1, z2, z0)) ++ ++/* ++** nmad_f16_m_untied: ++** movprfx z0, z1 ++** fnmad z0\.h, p0/m, z2\.h, z3\.h ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_f16_m_untied, svfloat16_t, ++ z0 = svnmad_f16_m (p0, z1, z2, z3), ++ z0 = svnmad_m (p0, z1, z2, z3)) ++ ++/* ++** nmad_h4_f16_m_tied1: ++** mov (z[0-9]+\.h), h4 ++** fnmad z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmad_h4_f16_m_tied1, svfloat16_t, __fp16, ++ z0 = svnmad_n_f16_m (p0, z0, z1, d4), ++ z0 = svnmad_m (p0, z0, z1, d4)) ++ ++/* ++** nmad_h4_f16_m_untied: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0, z1 ++** fnmad z0\.h, p0/m, z2\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmad_h4_f16_m_untied, svfloat16_t, __fp16, ++ z0 = svnmad_n_f16_m (p0, z1, z2, d4), ++ z0 = svnmad_m (p0, z1, z2, d4)) ++ ++/* ++** nmad_2_f16_m_tied1: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** fnmad z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_2_f16_m_tied1, svfloat16_t, ++ z0 = svnmad_n_f16_m (p0, z0, z1, 2), ++ z0 = svnmad_m (p0, z0, z1, 2)) ++ ++/* ++** nmad_2_f16_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? 
++** movprfx z0, z1 ++** fnmad z0\.h, p0/m, z2\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_2_f16_m_untied, svfloat16_t, ++ z0 = svnmad_n_f16_m (p0, z1, z2, 2), ++ z0 = svnmad_m (p0, z1, z2, 2)) ++ ++/* ++** nmad_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fnmad z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_f16_z_tied1, svfloat16_t, ++ z0 = svnmad_f16_z (p0, z0, z1, z2), ++ z0 = svnmad_z (p0, z0, z1, z2)) ++ ++/* ++** nmad_f16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** fnmad z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_f16_z_tied2, svfloat16_t, ++ z0 = svnmad_f16_z (p0, z1, z0, z2), ++ z0 = svnmad_z (p0, z1, z0, z2)) ++ ++/* ++** nmad_f16_z_tied3: ++** movprfx z0\.h, p0/z, z0\.h ++** fnmla z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_f16_z_tied3, svfloat16_t, ++ z0 = svnmad_f16_z (p0, z1, z2, z0), ++ z0 = svnmad_z (p0, z1, z2, z0)) ++ ++/* ++** nmad_f16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fnmad z0\.h, p0/m, z2\.h, z3\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** fnmad z0\.h, p0/m, z1\.h, z3\.h ++** | ++** movprfx z0\.h, p0/z, z3\.h ++** fnmla z0\.h, p0/m, z1\.h, z2\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_f16_z_untied, svfloat16_t, ++ z0 = svnmad_f16_z (p0, z1, z2, z3), ++ z0 = svnmad_z (p0, z1, z2, z3)) ++ ++/* ++** nmad_h4_f16_z_tied1: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0\.h, p0/z, z0\.h ++** fnmad z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmad_h4_f16_z_tied1, svfloat16_t, __fp16, ++ z0 = svnmad_n_f16_z (p0, z0, z1, d4), ++ z0 = svnmad_z (p0, z0, z1, d4)) ++ ++/* ++** nmad_h4_f16_z_tied2: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0\.h, p0/z, z0\.h ++** fnmad z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmad_h4_f16_z_tied2, svfloat16_t, __fp16, ++ z0 = svnmad_n_f16_z (p0, z1, z0, d4), ++ z0 = svnmad_z (p0, z1, z0, d4)) ++ ++/* ++** nmad_h4_f16_z_untied: ++** mov (z[0-9]+\.h), h4 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fnmad z0\.h, p0/m, z2\.h, \1 ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** fnmad z0\.h, p0/m, z1\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** fnmla z0\.h, p0/m, z1\.h, z2\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (nmad_h4_f16_z_untied, svfloat16_t, __fp16, ++ z0 = svnmad_n_f16_z (p0, z1, z2, d4), ++ z0 = svnmad_z (p0, z1, z2, d4)) ++ ++/* ++** nmad_2_f16_z_tied1: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** movprfx z0\.h, p0/z, z0\.h ++** fnmad z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_2_f16_z_tied1, svfloat16_t, ++ z0 = svnmad_n_f16_z (p0, z0, z1, 2), ++ z0 = svnmad_z (p0, z0, z1, 2)) ++ ++/* ++** nmad_2_f16_z_tied2: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** movprfx z0\.h, p0/z, z0\.h ++** fnmad z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_2_f16_z_tied2, svfloat16_t, ++ z0 = svnmad_n_f16_z (p0, z1, z0, 2), ++ z0 = svnmad_z (p0, z1, z0, 2)) ++ ++/* ++** nmad_2_f16_z_untied: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? 
++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fnmad z0\.h, p0/m, z2\.h, \1 ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** fnmad z0\.h, p0/m, z1\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** fnmla z0\.h, p0/m, z1\.h, z2\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_2_f16_z_untied, svfloat16_t, ++ z0 = svnmad_n_f16_z (p0, z1, z2, 2), ++ z0 = svnmad_z (p0, z1, z2, 2)) ++ ++/* ++** nmad_f16_x_tied1: ++** fnmad z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_f16_x_tied1, svfloat16_t, ++ z0 = svnmad_f16_x (p0, z0, z1, z2), ++ z0 = svnmad_x (p0, z0, z1, z2)) ++ ++/* ++** nmad_f16_x_tied2: ++** fnmad z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_f16_x_tied2, svfloat16_t, ++ z0 = svnmad_f16_x (p0, z1, z0, z2), ++ z0 = svnmad_x (p0, z1, z0, z2)) ++ ++/* ++** nmad_f16_x_tied3: ++** fnmla z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_f16_x_tied3, svfloat16_t, ++ z0 = svnmad_f16_x (p0, z1, z2, z0), ++ z0 = svnmad_x (p0, z1, z2, z0)) ++ ++/* ++** nmad_f16_x_untied: ++** ( ++** movprfx z0, z1 ++** fnmad z0\.h, p0/m, z2\.h, z3\.h ++** | ++** movprfx z0, z2 ++** fnmad z0\.h, p0/m, z1\.h, z3\.h ++** | ++** movprfx z0, z3 ++** fnmla z0\.h, p0/m, z1\.h, z2\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_f16_x_untied, svfloat16_t, ++ z0 = svnmad_f16_x (p0, z1, z2, z3), ++ z0 = svnmad_x (p0, z1, z2, z3)) ++ ++/* ++** nmad_h4_f16_x_tied1: ++** mov (z[0-9]+\.h), h4 ++** fnmad z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmad_h4_f16_x_tied1, svfloat16_t, __fp16, ++ z0 = svnmad_n_f16_x (p0, z0, z1, d4), ++ z0 = svnmad_x (p0, z0, z1, d4)) ++ ++/* ++** nmad_h4_f16_x_tied2: ++** mov (z[0-9]+\.h), h4 ++** fnmad z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmad_h4_f16_x_tied2, svfloat16_t, __fp16, ++ z0 = svnmad_n_f16_x (p0, z1, z0, d4), ++ z0 = svnmad_x (p0, z1, z0, d4)) ++ ++/* ++** nmad_h4_f16_x_untied: { xfail *-*-* } ++** mov z0\.h, h4 ++** fnmla z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_ZD (nmad_h4_f16_x_untied, svfloat16_t, __fp16, ++ z0 = svnmad_n_f16_x (p0, z1, z2, d4), ++ z0 = svnmad_x (p0, z1, z2, d4)) ++ ++/* ++** nmad_2_f16_x_tied1: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** fnmad z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_2_f16_x_tied1, svfloat16_t, ++ z0 = svnmad_n_f16_x (p0, z0, z1, 2), ++ z0 = svnmad_x (p0, z0, z1, 2)) ++ ++/* ++** nmad_2_f16_x_tied2: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** fnmad z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_2_f16_x_tied2, svfloat16_t, ++ z0 = svnmad_n_f16_x (p0, z1, z0, 2), ++ z0 = svnmad_x (p0, z1, z0, 2)) ++ ++/* ++** nmad_2_f16_x_untied: ++** fmov z0\.h, #2\.0(?:e\+0)? ++** fnmla z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_2_f16_x_untied, svfloat16_t, ++ z0 = svnmad_n_f16_x (p0, z1, z2, 2), ++ z0 = svnmad_x (p0, z1, z2, 2)) ++ ++/* ++** ptrue_nmad_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmad_f16_x_tied1, svfloat16_t, ++ z0 = svnmad_f16_x (svptrue_b16 (), z0, z1, z2), ++ z0 = svnmad_x (svptrue_b16 (), z0, z1, z2)) ++ ++/* ++** ptrue_nmad_f16_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmad_f16_x_tied2, svfloat16_t, ++ z0 = svnmad_f16_x (svptrue_b16 (), z1, z0, z2), ++ z0 = svnmad_x (svptrue_b16 (), z1, z0, z2)) ++ ++/* ++** ptrue_nmad_f16_x_tied3: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmad_f16_x_tied3, svfloat16_t, ++ z0 = svnmad_f16_x (svptrue_b16 (), z1, z2, z0), ++ z0 = svnmad_x (svptrue_b16 (), z1, z2, z0)) ++ ++/* ++** ptrue_nmad_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmad_f16_x_untied, svfloat16_t, ++ z0 = svnmad_f16_x (svptrue_b16 (), z1, z2, z3), ++ z0 = svnmad_x (svptrue_b16 (), z1, z2, z3)) ++ ++/* ++** ptrue_nmad_2_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmad_2_f16_x_tied1, svfloat16_t, ++ z0 = svnmad_n_f16_x (svptrue_b16 (), z0, z1, 2), ++ z0 = svnmad_x (svptrue_b16 (), z0, z1, 2)) ++ ++/* ++** ptrue_nmad_2_f16_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmad_2_f16_x_tied2, svfloat16_t, ++ z0 = svnmad_n_f16_x (svptrue_b16 (), z1, z0, 2), ++ z0 = svnmad_x (svptrue_b16 (), z1, z0, 2)) ++ ++/* ++** ptrue_nmad_2_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmad_2_f16_x_untied, svfloat16_t, ++ z0 = svnmad_n_f16_x (svptrue_b16 (), z1, z2, 2), ++ z0 = svnmad_x (svptrue_b16 (), z1, z2, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmad_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmad_f32.c +new file mode 100644 +index 000000000..ab86385c3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmad_f32.c +@@ -0,0 +1,398 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** nmad_f32_m_tied1: ++** fnmad z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_f32_m_tied1, svfloat32_t, ++ z0 = svnmad_f32_m (p0, z0, z1, z2), ++ z0 = svnmad_m (p0, z0, z1, z2)) ++ ++/* ++** nmad_f32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fnmad z0\.s, p0/m, \1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_f32_m_tied2, svfloat32_t, ++ z0 = svnmad_f32_m (p0, z1, z0, z2), ++ z0 = svnmad_m (p0, z1, z0, z2)) ++ ++/* ++** nmad_f32_m_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fnmad z0\.s, p0/m, z2\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_f32_m_tied3, svfloat32_t, ++ z0 = svnmad_f32_m (p0, z1, z2, z0), ++ z0 = svnmad_m (p0, z1, z2, z0)) ++ ++/* ++** nmad_f32_m_untied: ++** movprfx z0, z1 ++** fnmad z0\.s, p0/m, z2\.s, z3\.s ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_f32_m_untied, svfloat32_t, ++ z0 = svnmad_f32_m (p0, z1, z2, z3), ++ z0 = svnmad_m (p0, z1, z2, z3)) ++ ++/* ++** nmad_s4_f32_m_tied1: ++** mov (z[0-9]+\.s), s4 ++** fnmad z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmad_s4_f32_m_tied1, svfloat32_t, float, ++ z0 = svnmad_n_f32_m (p0, z0, z1, d4), ++ z0 = svnmad_m (p0, z0, z1, d4)) ++ ++/* ++** nmad_s4_f32_m_untied: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0, z1 ++** fnmad z0\.s, p0/m, z2\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmad_s4_f32_m_untied, svfloat32_t, float, ++ z0 = svnmad_n_f32_m (p0, z1, z2, d4), ++ z0 = svnmad_m (p0, z1, z2, d4)) ++ ++/* ++** nmad_2_f32_m_tied1: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** fnmad z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_2_f32_m_tied1, svfloat32_t, ++ z0 = svnmad_n_f32_m (p0, z0, z1, 2), ++ z0 = svnmad_m (p0, z0, z1, 2)) ++ ++/* ++** nmad_2_f32_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
++** movprfx z0, z1 ++** fnmad z0\.s, p0/m, z2\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_2_f32_m_untied, svfloat32_t, ++ z0 = svnmad_n_f32_m (p0, z1, z2, 2), ++ z0 = svnmad_m (p0, z1, z2, 2)) ++ ++/* ++** nmad_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fnmad z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_f32_z_tied1, svfloat32_t, ++ z0 = svnmad_f32_z (p0, z0, z1, z2), ++ z0 = svnmad_z (p0, z0, z1, z2)) ++ ++/* ++** nmad_f32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** fnmad z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_f32_z_tied2, svfloat32_t, ++ z0 = svnmad_f32_z (p0, z1, z0, z2), ++ z0 = svnmad_z (p0, z1, z0, z2)) ++ ++/* ++** nmad_f32_z_tied3: ++** movprfx z0\.s, p0/z, z0\.s ++** fnmla z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_f32_z_tied3, svfloat32_t, ++ z0 = svnmad_f32_z (p0, z1, z2, z0), ++ z0 = svnmad_z (p0, z1, z2, z0)) ++ ++/* ++** nmad_f32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fnmad z0\.s, p0/m, z2\.s, z3\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** fnmad z0\.s, p0/m, z1\.s, z3\.s ++** | ++** movprfx z0\.s, p0/z, z3\.s ++** fnmla z0\.s, p0/m, z1\.s, z2\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_f32_z_untied, svfloat32_t, ++ z0 = svnmad_f32_z (p0, z1, z2, z3), ++ z0 = svnmad_z (p0, z1, z2, z3)) ++ ++/* ++** nmad_s4_f32_z_tied1: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0\.s, p0/z, z0\.s ++** fnmad z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmad_s4_f32_z_tied1, svfloat32_t, float, ++ z0 = svnmad_n_f32_z (p0, z0, z1, d4), ++ z0 = svnmad_z (p0, z0, z1, d4)) ++ ++/* ++** nmad_s4_f32_z_tied2: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0\.s, p0/z, z0\.s ++** fnmad z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmad_s4_f32_z_tied2, svfloat32_t, float, ++ z0 = svnmad_n_f32_z (p0, z1, z0, d4), ++ z0 = svnmad_z (p0, z1, z0, d4)) ++ ++/* ++** nmad_s4_f32_z_untied: ++** mov (z[0-9]+\.s), s4 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fnmad z0\.s, p0/m, z2\.s, \1 ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** fnmad z0\.s, p0/m, z1\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** fnmla z0\.s, p0/m, z1\.s, z2\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (nmad_s4_f32_z_untied, svfloat32_t, float, ++ z0 = svnmad_n_f32_z (p0, z1, z2, d4), ++ z0 = svnmad_z (p0, z1, z2, d4)) ++ ++/* ++** nmad_2_f32_z_tied1: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** movprfx z0\.s, p0/z, z0\.s ++** fnmad z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_2_f32_z_tied1, svfloat32_t, ++ z0 = svnmad_n_f32_z (p0, z0, z1, 2), ++ z0 = svnmad_z (p0, z0, z1, 2)) ++ ++/* ++** nmad_2_f32_z_tied2: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** movprfx z0\.s, p0/z, z0\.s ++** fnmad z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_2_f32_z_tied2, svfloat32_t, ++ z0 = svnmad_n_f32_z (p0, z1, z0, 2), ++ z0 = svnmad_z (p0, z1, z0, 2)) ++ ++/* ++** nmad_2_f32_z_untied: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fnmad z0\.s, p0/m, z2\.s, \1 ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** fnmad z0\.s, p0/m, z1\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** fnmla z0\.s, p0/m, z1\.s, z2\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_2_f32_z_untied, svfloat32_t, ++ z0 = svnmad_n_f32_z (p0, z1, z2, 2), ++ z0 = svnmad_z (p0, z1, z2, 2)) ++ ++/* ++** nmad_f32_x_tied1: ++** fnmad z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_f32_x_tied1, svfloat32_t, ++ z0 = svnmad_f32_x (p0, z0, z1, z2), ++ z0 = svnmad_x (p0, z0, z1, z2)) ++ ++/* ++** nmad_f32_x_tied2: ++** fnmad z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_f32_x_tied2, svfloat32_t, ++ z0 = svnmad_f32_x (p0, z1, z0, z2), ++ z0 = svnmad_x (p0, z1, z0, z2)) ++ ++/* ++** nmad_f32_x_tied3: ++** fnmla z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_f32_x_tied3, svfloat32_t, ++ z0 = svnmad_f32_x (p0, z1, z2, z0), ++ z0 = svnmad_x (p0, z1, z2, z0)) ++ ++/* ++** nmad_f32_x_untied: ++** ( ++** movprfx z0, z1 ++** fnmad z0\.s, p0/m, z2\.s, z3\.s ++** | ++** movprfx z0, z2 ++** fnmad z0\.s, p0/m, z1\.s, z3\.s ++** | ++** movprfx z0, z3 ++** fnmla z0\.s, p0/m, z1\.s, z2\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_f32_x_untied, svfloat32_t, ++ z0 = svnmad_f32_x (p0, z1, z2, z3), ++ z0 = svnmad_x (p0, z1, z2, z3)) ++ ++/* ++** nmad_s4_f32_x_tied1: ++** mov (z[0-9]+\.s), s4 ++** fnmad z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmad_s4_f32_x_tied1, svfloat32_t, float, ++ z0 = svnmad_n_f32_x (p0, z0, z1, d4), ++ z0 = svnmad_x (p0, z0, z1, d4)) ++ ++/* ++** nmad_s4_f32_x_tied2: ++** mov (z[0-9]+\.s), s4 ++** fnmad z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmad_s4_f32_x_tied2, svfloat32_t, float, ++ z0 = svnmad_n_f32_x (p0, z1, z0, d4), ++ z0 = svnmad_x (p0, z1, z0, d4)) ++ ++/* ++** nmad_s4_f32_x_untied: { xfail *-*-* } ++** mov z0\.s, s4 ++** fnmla z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_ZD (nmad_s4_f32_x_untied, svfloat32_t, float, ++ z0 = svnmad_n_f32_x (p0, z1, z2, d4), ++ z0 = svnmad_x (p0, z1, z2, d4)) ++ ++/* ++** nmad_2_f32_x_tied1: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** fnmad z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_2_f32_x_tied1, svfloat32_t, ++ z0 = svnmad_n_f32_x (p0, z0, z1, 2), ++ z0 = svnmad_x (p0, z0, z1, 2)) ++ ++/* ++** nmad_2_f32_x_tied2: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** fnmad z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_2_f32_x_tied2, svfloat32_t, ++ z0 = svnmad_n_f32_x (p0, z1, z0, 2), ++ z0 = svnmad_x (p0, z1, z0, 2)) ++ ++/* ++** nmad_2_f32_x_untied: ++** fmov z0\.s, #2\.0(?:e\+0)? ++** fnmla z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_2_f32_x_untied, svfloat32_t, ++ z0 = svnmad_n_f32_x (p0, z1, z2, 2), ++ z0 = svnmad_x (p0, z1, z2, 2)) ++ ++/* ++** ptrue_nmad_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmad_f32_x_tied1, svfloat32_t, ++ z0 = svnmad_f32_x (svptrue_b32 (), z0, z1, z2), ++ z0 = svnmad_x (svptrue_b32 (), z0, z1, z2)) ++ ++/* ++** ptrue_nmad_f32_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmad_f32_x_tied2, svfloat32_t, ++ z0 = svnmad_f32_x (svptrue_b32 (), z1, z0, z2), ++ z0 = svnmad_x (svptrue_b32 (), z1, z0, z2)) ++ ++/* ++** ptrue_nmad_f32_x_tied3: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmad_f32_x_tied3, svfloat32_t, ++ z0 = svnmad_f32_x (svptrue_b32 (), z1, z2, z0), ++ z0 = svnmad_x (svptrue_b32 (), z1, z2, z0)) ++ ++/* ++** ptrue_nmad_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmad_f32_x_untied, svfloat32_t, ++ z0 = svnmad_f32_x (svptrue_b32 (), z1, z2, z3), ++ z0 = svnmad_x (svptrue_b32 (), z1, z2, z3)) ++ ++/* ++** ptrue_nmad_2_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmad_2_f32_x_tied1, svfloat32_t, ++ z0 = svnmad_n_f32_x (svptrue_b32 (), z0, z1, 2), ++ z0 = svnmad_x (svptrue_b32 (), z0, z1, 2)) ++ ++/* ++** ptrue_nmad_2_f32_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmad_2_f32_x_tied2, svfloat32_t, ++ z0 = svnmad_n_f32_x (svptrue_b32 (), z1, z0, 2), ++ z0 = svnmad_x (svptrue_b32 (), z1, z0, 2)) ++ ++/* ++** ptrue_nmad_2_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmad_2_f32_x_untied, svfloat32_t, ++ z0 = svnmad_n_f32_x (svptrue_b32 (), z1, z2, 2), ++ z0 = svnmad_x (svptrue_b32 (), z1, z2, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmad_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmad_f64.c +new file mode 100644 +index 000000000..c236ff5a1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmad_f64.c +@@ -0,0 +1,398 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** nmad_f64_m_tied1: ++** fnmad z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_f64_m_tied1, svfloat64_t, ++ z0 = svnmad_f64_m (p0, z0, z1, z2), ++ z0 = svnmad_m (p0, z0, z1, z2)) ++ ++/* ++** nmad_f64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fnmad z0\.d, p0/m, \1, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_f64_m_tied2, svfloat64_t, ++ z0 = svnmad_f64_m (p0, z1, z0, z2), ++ z0 = svnmad_m (p0, z1, z0, z2)) ++ ++/* ++** nmad_f64_m_tied3: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fnmad z0\.d, p0/m, z2\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_f64_m_tied3, svfloat64_t, ++ z0 = svnmad_f64_m (p0, z1, z2, z0), ++ z0 = svnmad_m (p0, z1, z2, z0)) ++ ++/* ++** nmad_f64_m_untied: ++** movprfx z0, z1 ++** fnmad z0\.d, p0/m, z2\.d, z3\.d ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_f64_m_untied, svfloat64_t, ++ z0 = svnmad_f64_m (p0, z1, z2, z3), ++ z0 = svnmad_m (p0, z1, z2, z3)) ++ ++/* ++** nmad_d4_f64_m_tied1: ++** mov (z[0-9]+\.d), d4 ++** fnmad z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmad_d4_f64_m_tied1, svfloat64_t, double, ++ z0 = svnmad_n_f64_m (p0, z0, z1, d4), ++ z0 = svnmad_m (p0, z0, z1, d4)) ++ ++/* ++** nmad_d4_f64_m_untied: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0, z1 ++** fnmad z0\.d, p0/m, z2\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmad_d4_f64_m_untied, svfloat64_t, double, ++ z0 = svnmad_n_f64_m (p0, z1, z2, d4), ++ z0 = svnmad_m (p0, z1, z2, d4)) ++ ++/* ++** nmad_2_f64_m_tied1: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** fnmad z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_2_f64_m_tied1, svfloat64_t, ++ z0 = svnmad_n_f64_m (p0, z0, z1, 2), ++ z0 = svnmad_m (p0, z0, z1, 2)) ++ ++/* ++** nmad_2_f64_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
++** movprfx z0, z1 ++** fnmad z0\.d, p0/m, z2\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_2_f64_m_untied, svfloat64_t, ++ z0 = svnmad_n_f64_m (p0, z1, z2, 2), ++ z0 = svnmad_m (p0, z1, z2, 2)) ++ ++/* ++** nmad_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fnmad z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_f64_z_tied1, svfloat64_t, ++ z0 = svnmad_f64_z (p0, z0, z1, z2), ++ z0 = svnmad_z (p0, z0, z1, z2)) ++ ++/* ++** nmad_f64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** fnmad z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_f64_z_tied2, svfloat64_t, ++ z0 = svnmad_f64_z (p0, z1, z0, z2), ++ z0 = svnmad_z (p0, z1, z0, z2)) ++ ++/* ++** nmad_f64_z_tied3: ++** movprfx z0\.d, p0/z, z0\.d ++** fnmla z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_f64_z_tied3, svfloat64_t, ++ z0 = svnmad_f64_z (p0, z1, z2, z0), ++ z0 = svnmad_z (p0, z1, z2, z0)) ++ ++/* ++** nmad_f64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fnmad z0\.d, p0/m, z2\.d, z3\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** fnmad z0\.d, p0/m, z1\.d, z3\.d ++** | ++** movprfx z0\.d, p0/z, z3\.d ++** fnmla z0\.d, p0/m, z1\.d, z2\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_f64_z_untied, svfloat64_t, ++ z0 = svnmad_f64_z (p0, z1, z2, z3), ++ z0 = svnmad_z (p0, z1, z2, z3)) ++ ++/* ++** nmad_d4_f64_z_tied1: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0\.d, p0/z, z0\.d ++** fnmad z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmad_d4_f64_z_tied1, svfloat64_t, double, ++ z0 = svnmad_n_f64_z (p0, z0, z1, d4), ++ z0 = svnmad_z (p0, z0, z1, d4)) ++ ++/* ++** nmad_d4_f64_z_tied2: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0\.d, p0/z, z0\.d ++** fnmad z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmad_d4_f64_z_tied2, svfloat64_t, double, ++ z0 = svnmad_n_f64_z (p0, z1, z0, d4), ++ z0 = svnmad_z (p0, z1, z0, d4)) ++ ++/* ++** nmad_d4_f64_z_untied: ++** mov (z[0-9]+\.d), d4 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fnmad z0\.d, p0/m, z2\.d, \1 ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** fnmad z0\.d, p0/m, z1\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** fnmla z0\.d, p0/m, z1\.d, z2\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (nmad_d4_f64_z_untied, svfloat64_t, double, ++ z0 = svnmad_n_f64_z (p0, z1, z2, d4), ++ z0 = svnmad_z (p0, z1, z2, d4)) ++ ++/* ++** nmad_2_f64_z_tied1: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** movprfx z0\.d, p0/z, z0\.d ++** fnmad z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_2_f64_z_tied1, svfloat64_t, ++ z0 = svnmad_n_f64_z (p0, z0, z1, 2), ++ z0 = svnmad_z (p0, z0, z1, 2)) ++ ++/* ++** nmad_2_f64_z_tied2: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** movprfx z0\.d, p0/z, z0\.d ++** fnmad z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_2_f64_z_tied2, svfloat64_t, ++ z0 = svnmad_n_f64_z (p0, z1, z0, 2), ++ z0 = svnmad_z (p0, z1, z0, 2)) ++ ++/* ++** nmad_2_f64_z_untied: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fnmad z0\.d, p0/m, z2\.d, \1 ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** fnmad z0\.d, p0/m, z1\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** fnmla z0\.d, p0/m, z1\.d, z2\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_2_f64_z_untied, svfloat64_t, ++ z0 = svnmad_n_f64_z (p0, z1, z2, 2), ++ z0 = svnmad_z (p0, z1, z2, 2)) ++ ++/* ++** nmad_f64_x_tied1: ++** fnmad z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_f64_x_tied1, svfloat64_t, ++ z0 = svnmad_f64_x (p0, z0, z1, z2), ++ z0 = svnmad_x (p0, z0, z1, z2)) ++ ++/* ++** nmad_f64_x_tied2: ++** fnmad z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_f64_x_tied2, svfloat64_t, ++ z0 = svnmad_f64_x (p0, z1, z0, z2), ++ z0 = svnmad_x (p0, z1, z0, z2)) ++ ++/* ++** nmad_f64_x_tied3: ++** fnmla z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_f64_x_tied3, svfloat64_t, ++ z0 = svnmad_f64_x (p0, z1, z2, z0), ++ z0 = svnmad_x (p0, z1, z2, z0)) ++ ++/* ++** nmad_f64_x_untied: ++** ( ++** movprfx z0, z1 ++** fnmad z0\.d, p0/m, z2\.d, z3\.d ++** | ++** movprfx z0, z2 ++** fnmad z0\.d, p0/m, z1\.d, z3\.d ++** | ++** movprfx z0, z3 ++** fnmla z0\.d, p0/m, z1\.d, z2\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_f64_x_untied, svfloat64_t, ++ z0 = svnmad_f64_x (p0, z1, z2, z3), ++ z0 = svnmad_x (p0, z1, z2, z3)) ++ ++/* ++** nmad_d4_f64_x_tied1: ++** mov (z[0-9]+\.d), d4 ++** fnmad z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmad_d4_f64_x_tied1, svfloat64_t, double, ++ z0 = svnmad_n_f64_x (p0, z0, z1, d4), ++ z0 = svnmad_x (p0, z0, z1, d4)) ++ ++/* ++** nmad_d4_f64_x_tied2: ++** mov (z[0-9]+\.d), d4 ++** fnmad z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmad_d4_f64_x_tied2, svfloat64_t, double, ++ z0 = svnmad_n_f64_x (p0, z1, z0, d4), ++ z0 = svnmad_x (p0, z1, z0, d4)) ++ ++/* ++** nmad_d4_f64_x_untied: { xfail *-*-* } ++** mov z0\.d, d4 ++** fnmla z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_ZD (nmad_d4_f64_x_untied, svfloat64_t, double, ++ z0 = svnmad_n_f64_x (p0, z1, z2, d4), ++ z0 = svnmad_x (p0, z1, z2, d4)) ++ ++/* ++** nmad_2_f64_x_tied1: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** fnmad z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_2_f64_x_tied1, svfloat64_t, ++ z0 = svnmad_n_f64_x (p0, z0, z1, 2), ++ z0 = svnmad_x (p0, z0, z1, 2)) ++ ++/* ++** nmad_2_f64_x_tied2: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** fnmad z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_2_f64_x_tied2, svfloat64_t, ++ z0 = svnmad_n_f64_x (p0, z1, z0, 2), ++ z0 = svnmad_x (p0, z1, z0, 2)) ++ ++/* ++** nmad_2_f64_x_untied: ++** fmov z0\.d, #2\.0(?:e\+0)? ++** fnmla z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (nmad_2_f64_x_untied, svfloat64_t, ++ z0 = svnmad_n_f64_x (p0, z1, z2, 2), ++ z0 = svnmad_x (p0, z1, z2, 2)) ++ ++/* ++** ptrue_nmad_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmad_f64_x_tied1, svfloat64_t, ++ z0 = svnmad_f64_x (svptrue_b64 (), z0, z1, z2), ++ z0 = svnmad_x (svptrue_b64 (), z0, z1, z2)) ++ ++/* ++** ptrue_nmad_f64_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmad_f64_x_tied2, svfloat64_t, ++ z0 = svnmad_f64_x (svptrue_b64 (), z1, z0, z2), ++ z0 = svnmad_x (svptrue_b64 (), z1, z0, z2)) ++ ++/* ++** ptrue_nmad_f64_x_tied3: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmad_f64_x_tied3, svfloat64_t, ++ z0 = svnmad_f64_x (svptrue_b64 (), z1, z2, z0), ++ z0 = svnmad_x (svptrue_b64 (), z1, z2, z0)) ++ ++/* ++** ptrue_nmad_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmad_f64_x_untied, svfloat64_t, ++ z0 = svnmad_f64_x (svptrue_b64 (), z1, z2, z3), ++ z0 = svnmad_x (svptrue_b64 (), z1, z2, z3)) ++ ++/* ++** ptrue_nmad_2_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmad_2_f64_x_tied1, svfloat64_t, ++ z0 = svnmad_n_f64_x (svptrue_b64 (), z0, z1, 2), ++ z0 = svnmad_x (svptrue_b64 (), z0, z1, 2)) ++ ++/* ++** ptrue_nmad_2_f64_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmad_2_f64_x_tied2, svfloat64_t, ++ z0 = svnmad_n_f64_x (svptrue_b64 (), z1, z0, 2), ++ z0 = svnmad_x (svptrue_b64 (), z1, z0, 2)) ++ ++/* ++** ptrue_nmad_2_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmad_2_f64_x_untied, svfloat64_t, ++ z0 = svnmad_n_f64_x (svptrue_b64 (), z1, z2, 2), ++ z0 = svnmad_x (svptrue_b64 (), z1, z2, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmla_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmla_f16.c +new file mode 100644 +index 000000000..f7ac377fd +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmla_f16.c +@@ -0,0 +1,398 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** nmla_f16_m_tied1: ++** fnmla z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_f16_m_tied1, svfloat16_t, ++ z0 = svnmla_f16_m (p0, z0, z1, z2), ++ z0 = svnmla_m (p0, z0, z1, z2)) ++ ++/* ++** nmla_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fnmla z0\.h, p0/m, \1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_f16_m_tied2, svfloat16_t, ++ z0 = svnmla_f16_m (p0, z1, z0, z2), ++ z0 = svnmla_m (p0, z1, z0, z2)) ++ ++/* ++** nmla_f16_m_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fnmla z0\.h, p0/m, z2\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_f16_m_tied3, svfloat16_t, ++ z0 = svnmla_f16_m (p0, z1, z2, z0), ++ z0 = svnmla_m (p0, z1, z2, z0)) ++ ++/* ++** nmla_f16_m_untied: ++** movprfx z0, z1 ++** fnmla z0\.h, p0/m, z2\.h, z3\.h ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_f16_m_untied, svfloat16_t, ++ z0 = svnmla_f16_m (p0, z1, z2, z3), ++ z0 = svnmla_m (p0, z1, z2, z3)) ++ ++/* ++** nmla_h4_f16_m_tied1: ++** mov (z[0-9]+\.h), h4 ++** fnmla z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmla_h4_f16_m_tied1, svfloat16_t, __fp16, ++ z0 = svnmla_n_f16_m (p0, z0, z1, d4), ++ z0 = svnmla_m (p0, z0, z1, d4)) ++ ++/* ++** nmla_h4_f16_m_untied: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0, z1 ++** fnmla z0\.h, p0/m, z2\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmla_h4_f16_m_untied, svfloat16_t, __fp16, ++ z0 = svnmla_n_f16_m (p0, z1, z2, d4), ++ z0 = svnmla_m (p0, z1, z2, d4)) ++ ++/* ++** nmla_2_f16_m_tied1: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** fnmla z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_2_f16_m_tied1, svfloat16_t, ++ z0 = svnmla_n_f16_m (p0, z0, z1, 2), ++ z0 = svnmla_m (p0, z0, z1, 2)) ++ ++/* ++** nmla_2_f16_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? 
++** movprfx z0, z1 ++** fnmla z0\.h, p0/m, z2\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_2_f16_m_untied, svfloat16_t, ++ z0 = svnmla_n_f16_m (p0, z1, z2, 2), ++ z0 = svnmla_m (p0, z1, z2, 2)) ++ ++/* ++** nmla_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fnmla z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_f16_z_tied1, svfloat16_t, ++ z0 = svnmla_f16_z (p0, z0, z1, z2), ++ z0 = svnmla_z (p0, z0, z1, z2)) ++ ++/* ++** nmla_f16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** fnmad z0\.h, p0/m, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_f16_z_tied2, svfloat16_t, ++ z0 = svnmla_f16_z (p0, z1, z0, z2), ++ z0 = svnmla_z (p0, z1, z0, z2)) ++ ++/* ++** nmla_f16_z_tied3: ++** movprfx z0\.h, p0/z, z0\.h ++** fnmad z0\.h, p0/m, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_f16_z_tied3, svfloat16_t, ++ z0 = svnmla_f16_z (p0, z1, z2, z0), ++ z0 = svnmla_z (p0, z1, z2, z0)) ++ ++/* ++** nmla_f16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fnmla z0\.h, p0/m, z2\.h, z3\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** fnmad z0\.h, p0/m, z3\.h, z1\.h ++** | ++** movprfx z0\.h, p0/z, z3\.h ++** fnmad z0\.h, p0/m, z2\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_f16_z_untied, svfloat16_t, ++ z0 = svnmla_f16_z (p0, z1, z2, z3), ++ z0 = svnmla_z (p0, z1, z2, z3)) ++ ++/* ++** nmla_h4_f16_z_tied1: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0\.h, p0/z, z0\.h ++** fnmla z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmla_h4_f16_z_tied1, svfloat16_t, __fp16, ++ z0 = svnmla_n_f16_z (p0, z0, z1, d4), ++ z0 = svnmla_z (p0, z0, z1, d4)) ++ ++/* ++** nmla_h4_f16_z_tied2: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0\.h, p0/z, z0\.h ++** fnmad z0\.h, p0/m, \1, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZD (nmla_h4_f16_z_tied2, svfloat16_t, __fp16, ++ z0 = svnmla_n_f16_z (p0, z1, z0, d4), ++ z0 = svnmla_z (p0, z1, z0, d4)) ++ ++/* ++** nmla_h4_f16_z_untied: ++** mov (z[0-9]+\.h), h4 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fnmla z0\.h, p0/m, z2\.h, \1 ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** fnmad z0\.h, p0/m, \1, z1\.h ++** | ++** movprfx z0\.h, p0/z, \1 ++** fnmad z0\.h, p0/m, z2\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (nmla_h4_f16_z_untied, svfloat16_t, __fp16, ++ z0 = svnmla_n_f16_z (p0, z1, z2, d4), ++ z0 = svnmla_z (p0, z1, z2, d4)) ++ ++/* ++** nmla_2_f16_z_tied1: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** movprfx z0\.h, p0/z, z0\.h ++** fnmla z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_2_f16_z_tied1, svfloat16_t, ++ z0 = svnmla_n_f16_z (p0, z0, z1, 2), ++ z0 = svnmla_z (p0, z0, z1, 2)) ++ ++/* ++** nmla_2_f16_z_tied2: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** movprfx z0\.h, p0/z, z0\.h ++** fnmad z0\.h, p0/m, \1, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_2_f16_z_tied2, svfloat16_t, ++ z0 = svnmla_n_f16_z (p0, z1, z0, 2), ++ z0 = svnmla_z (p0, z1, z0, 2)) ++ ++/* ++** nmla_2_f16_z_untied: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? 
++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fnmla z0\.h, p0/m, z2\.h, \1 ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** fnmad z0\.h, p0/m, \1, z1\.h ++** | ++** movprfx z0\.h, p0/z, \1 ++** fnmad z0\.h, p0/m, z2\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_2_f16_z_untied, svfloat16_t, ++ z0 = svnmla_n_f16_z (p0, z1, z2, 2), ++ z0 = svnmla_z (p0, z1, z2, 2)) ++ ++/* ++** nmla_f16_x_tied1: ++** fnmla z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_f16_x_tied1, svfloat16_t, ++ z0 = svnmla_f16_x (p0, z0, z1, z2), ++ z0 = svnmla_x (p0, z0, z1, z2)) ++ ++/* ++** nmla_f16_x_tied2: ++** fnmad z0\.h, p0/m, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_f16_x_tied2, svfloat16_t, ++ z0 = svnmla_f16_x (p0, z1, z0, z2), ++ z0 = svnmla_x (p0, z1, z0, z2)) ++ ++/* ++** nmla_f16_x_tied3: ++** fnmad z0\.h, p0/m, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_f16_x_tied3, svfloat16_t, ++ z0 = svnmla_f16_x (p0, z1, z2, z0), ++ z0 = svnmla_x (p0, z1, z2, z0)) ++ ++/* ++** nmla_f16_x_untied: ++** ( ++** movprfx z0, z1 ++** fnmla z0\.h, p0/m, z2\.h, z3\.h ++** | ++** movprfx z0, z2 ++** fnmad z0\.h, p0/m, z3\.h, z1\.h ++** | ++** movprfx z0, z3 ++** fnmad z0\.h, p0/m, z2\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_f16_x_untied, svfloat16_t, ++ z0 = svnmla_f16_x (p0, z1, z2, z3), ++ z0 = svnmla_x (p0, z1, z2, z3)) ++ ++/* ++** nmla_h4_f16_x_tied1: ++** mov (z[0-9]+\.h), h4 ++** fnmla z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmla_h4_f16_x_tied1, svfloat16_t, __fp16, ++ z0 = svnmla_n_f16_x (p0, z0, z1, d4), ++ z0 = svnmla_x (p0, z0, z1, d4)) ++ ++/* ++** nmla_h4_f16_x_tied2: ++** mov (z[0-9]+\.h), h4 ++** fnmad z0\.h, p0/m, \1, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZD (nmla_h4_f16_x_tied2, svfloat16_t, __fp16, ++ z0 = svnmla_n_f16_x (p0, z1, z0, d4), ++ z0 = svnmla_x (p0, z1, z0, d4)) ++ ++/* ++** nmla_h4_f16_x_untied: { xfail *-*-* } ++** mov z0\.h, h4 ++** fnmad z0\.h, p0/m, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZD (nmla_h4_f16_x_untied, svfloat16_t, __fp16, ++ z0 = svnmla_n_f16_x (p0, z1, z2, d4), ++ z0 = svnmla_x (p0, z1, z2, d4)) ++ ++/* ++** nmla_2_f16_x_tied1: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** fnmla z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_2_f16_x_tied1, svfloat16_t, ++ z0 = svnmla_n_f16_x (p0, z0, z1, 2), ++ z0 = svnmla_x (p0, z0, z1, 2)) ++ ++/* ++** nmla_2_f16_x_tied2: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** fnmad z0\.h, p0/m, \1, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_2_f16_x_tied2, svfloat16_t, ++ z0 = svnmla_n_f16_x (p0, z1, z0, 2), ++ z0 = svnmla_x (p0, z1, z0, 2)) ++ ++/* ++** nmla_2_f16_x_untied: ++** fmov z0\.h, #2\.0(?:e\+0)? ++** fnmad z0\.h, p0/m, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_2_f16_x_untied, svfloat16_t, ++ z0 = svnmla_n_f16_x (p0, z1, z2, 2), ++ z0 = svnmla_x (p0, z1, z2, 2)) ++ ++/* ++** ptrue_nmla_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmla_f16_x_tied1, svfloat16_t, ++ z0 = svnmla_f16_x (svptrue_b16 (), z0, z1, z2), ++ z0 = svnmla_x (svptrue_b16 (), z0, z1, z2)) ++ ++/* ++** ptrue_nmla_f16_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmla_f16_x_tied2, svfloat16_t, ++ z0 = svnmla_f16_x (svptrue_b16 (), z1, z0, z2), ++ z0 = svnmla_x (svptrue_b16 (), z1, z0, z2)) ++ ++/* ++** ptrue_nmla_f16_x_tied3: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmla_f16_x_tied3, svfloat16_t, ++ z0 = svnmla_f16_x (svptrue_b16 (), z1, z2, z0), ++ z0 = svnmla_x (svptrue_b16 (), z1, z2, z0)) ++ ++/* ++** ptrue_nmla_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmla_f16_x_untied, svfloat16_t, ++ z0 = svnmla_f16_x (svptrue_b16 (), z1, z2, z3), ++ z0 = svnmla_x (svptrue_b16 (), z1, z2, z3)) ++ ++/* ++** ptrue_nmla_2_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmla_2_f16_x_tied1, svfloat16_t, ++ z0 = svnmla_n_f16_x (svptrue_b16 (), z0, z1, 2), ++ z0 = svnmla_x (svptrue_b16 (), z0, z1, 2)) ++ ++/* ++** ptrue_nmla_2_f16_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmla_2_f16_x_tied2, svfloat16_t, ++ z0 = svnmla_n_f16_x (svptrue_b16 (), z1, z0, 2), ++ z0 = svnmla_x (svptrue_b16 (), z1, z0, 2)) ++ ++/* ++** ptrue_nmla_2_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmla_2_f16_x_untied, svfloat16_t, ++ z0 = svnmla_n_f16_x (svptrue_b16 (), z1, z2, 2), ++ z0 = svnmla_x (svptrue_b16 (), z1, z2, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmla_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmla_f32.c +new file mode 100644 +index 000000000..ef9542d74 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmla_f32.c +@@ -0,0 +1,398 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** nmla_f32_m_tied1: ++** fnmla z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_f32_m_tied1, svfloat32_t, ++ z0 = svnmla_f32_m (p0, z0, z1, z2), ++ z0 = svnmla_m (p0, z0, z1, z2)) ++ ++/* ++** nmla_f32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fnmla z0\.s, p0/m, \1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_f32_m_tied2, svfloat32_t, ++ z0 = svnmla_f32_m (p0, z1, z0, z2), ++ z0 = svnmla_m (p0, z1, z0, z2)) ++ ++/* ++** nmla_f32_m_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fnmla z0\.s, p0/m, z2\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_f32_m_tied3, svfloat32_t, ++ z0 = svnmla_f32_m (p0, z1, z2, z0), ++ z0 = svnmla_m (p0, z1, z2, z0)) ++ ++/* ++** nmla_f32_m_untied: ++** movprfx z0, z1 ++** fnmla z0\.s, p0/m, z2\.s, z3\.s ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_f32_m_untied, svfloat32_t, ++ z0 = svnmla_f32_m (p0, z1, z2, z3), ++ z0 = svnmla_m (p0, z1, z2, z3)) ++ ++/* ++** nmla_s4_f32_m_tied1: ++** mov (z[0-9]+\.s), s4 ++** fnmla z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmla_s4_f32_m_tied1, svfloat32_t, float, ++ z0 = svnmla_n_f32_m (p0, z0, z1, d4), ++ z0 = svnmla_m (p0, z0, z1, d4)) ++ ++/* ++** nmla_s4_f32_m_untied: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0, z1 ++** fnmla z0\.s, p0/m, z2\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmla_s4_f32_m_untied, svfloat32_t, float, ++ z0 = svnmla_n_f32_m (p0, z1, z2, d4), ++ z0 = svnmla_m (p0, z1, z2, d4)) ++ ++/* ++** nmla_2_f32_m_tied1: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** fnmla z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_2_f32_m_tied1, svfloat32_t, ++ z0 = svnmla_n_f32_m (p0, z0, z1, 2), ++ z0 = svnmla_m (p0, z0, z1, 2)) ++ ++/* ++** nmla_2_f32_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
++** movprfx z0, z1 ++** fnmla z0\.s, p0/m, z2\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_2_f32_m_untied, svfloat32_t, ++ z0 = svnmla_n_f32_m (p0, z1, z2, 2), ++ z0 = svnmla_m (p0, z1, z2, 2)) ++ ++/* ++** nmla_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fnmla z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_f32_z_tied1, svfloat32_t, ++ z0 = svnmla_f32_z (p0, z0, z1, z2), ++ z0 = svnmla_z (p0, z0, z1, z2)) ++ ++/* ++** nmla_f32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** fnmad z0\.s, p0/m, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_f32_z_tied2, svfloat32_t, ++ z0 = svnmla_f32_z (p0, z1, z0, z2), ++ z0 = svnmla_z (p0, z1, z0, z2)) ++ ++/* ++** nmla_f32_z_tied3: ++** movprfx z0\.s, p0/z, z0\.s ++** fnmad z0\.s, p0/m, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_f32_z_tied3, svfloat32_t, ++ z0 = svnmla_f32_z (p0, z1, z2, z0), ++ z0 = svnmla_z (p0, z1, z2, z0)) ++ ++/* ++** nmla_f32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fnmla z0\.s, p0/m, z2\.s, z3\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** fnmad z0\.s, p0/m, z3\.s, z1\.s ++** | ++** movprfx z0\.s, p0/z, z3\.s ++** fnmad z0\.s, p0/m, z2\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_f32_z_untied, svfloat32_t, ++ z0 = svnmla_f32_z (p0, z1, z2, z3), ++ z0 = svnmla_z (p0, z1, z2, z3)) ++ ++/* ++** nmla_s4_f32_z_tied1: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0\.s, p0/z, z0\.s ++** fnmla z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmla_s4_f32_z_tied1, svfloat32_t, float, ++ z0 = svnmla_n_f32_z (p0, z0, z1, d4), ++ z0 = svnmla_z (p0, z0, z1, d4)) ++ ++/* ++** nmla_s4_f32_z_tied2: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0\.s, p0/z, z0\.s ++** fnmad z0\.s, p0/m, \1, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZD (nmla_s4_f32_z_tied2, svfloat32_t, float, ++ z0 = svnmla_n_f32_z (p0, z1, z0, d4), ++ z0 = svnmla_z (p0, z1, z0, d4)) ++ ++/* ++** nmla_s4_f32_z_untied: ++** mov (z[0-9]+\.s), s4 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fnmla z0\.s, p0/m, z2\.s, \1 ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** fnmad z0\.s, p0/m, \1, z1\.s ++** | ++** movprfx z0\.s, p0/z, \1 ++** fnmad z0\.s, p0/m, z2\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (nmla_s4_f32_z_untied, svfloat32_t, float, ++ z0 = svnmla_n_f32_z (p0, z1, z2, d4), ++ z0 = svnmla_z (p0, z1, z2, d4)) ++ ++/* ++** nmla_2_f32_z_tied1: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** movprfx z0\.s, p0/z, z0\.s ++** fnmla z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_2_f32_z_tied1, svfloat32_t, ++ z0 = svnmla_n_f32_z (p0, z0, z1, 2), ++ z0 = svnmla_z (p0, z0, z1, 2)) ++ ++/* ++** nmla_2_f32_z_tied2: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** movprfx z0\.s, p0/z, z0\.s ++** fnmad z0\.s, p0/m, \1, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_2_f32_z_tied2, svfloat32_t, ++ z0 = svnmla_n_f32_z (p0, z1, z0, 2), ++ z0 = svnmla_z (p0, z1, z0, 2)) ++ ++/* ++** nmla_2_f32_z_untied: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fnmla z0\.s, p0/m, z2\.s, \1 ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** fnmad z0\.s, p0/m, \1, z1\.s ++** | ++** movprfx z0\.s, p0/z, \1 ++** fnmad z0\.s, p0/m, z2\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_2_f32_z_untied, svfloat32_t, ++ z0 = svnmla_n_f32_z (p0, z1, z2, 2), ++ z0 = svnmla_z (p0, z1, z2, 2)) ++ ++/* ++** nmla_f32_x_tied1: ++** fnmla z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_f32_x_tied1, svfloat32_t, ++ z0 = svnmla_f32_x (p0, z0, z1, z2), ++ z0 = svnmla_x (p0, z0, z1, z2)) ++ ++/* ++** nmla_f32_x_tied2: ++** fnmad z0\.s, p0/m, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_f32_x_tied2, svfloat32_t, ++ z0 = svnmla_f32_x (p0, z1, z0, z2), ++ z0 = svnmla_x (p0, z1, z0, z2)) ++ ++/* ++** nmla_f32_x_tied3: ++** fnmad z0\.s, p0/m, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_f32_x_tied3, svfloat32_t, ++ z0 = svnmla_f32_x (p0, z1, z2, z0), ++ z0 = svnmla_x (p0, z1, z2, z0)) ++ ++/* ++** nmla_f32_x_untied: ++** ( ++** movprfx z0, z1 ++** fnmla z0\.s, p0/m, z2\.s, z3\.s ++** | ++** movprfx z0, z2 ++** fnmad z0\.s, p0/m, z3\.s, z1\.s ++** | ++** movprfx z0, z3 ++** fnmad z0\.s, p0/m, z2\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_f32_x_untied, svfloat32_t, ++ z0 = svnmla_f32_x (p0, z1, z2, z3), ++ z0 = svnmla_x (p0, z1, z2, z3)) ++ ++/* ++** nmla_s4_f32_x_tied1: ++** mov (z[0-9]+\.s), s4 ++** fnmla z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmla_s4_f32_x_tied1, svfloat32_t, float, ++ z0 = svnmla_n_f32_x (p0, z0, z1, d4), ++ z0 = svnmla_x (p0, z0, z1, d4)) ++ ++/* ++** nmla_s4_f32_x_tied2: ++** mov (z[0-9]+\.s), s4 ++** fnmad z0\.s, p0/m, \1, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZD (nmla_s4_f32_x_tied2, svfloat32_t, float, ++ z0 = svnmla_n_f32_x (p0, z1, z0, d4), ++ z0 = svnmla_x (p0, z1, z0, d4)) ++ ++/* ++** nmla_s4_f32_x_untied: { xfail *-*-* } ++** mov z0\.s, s4 ++** fnmad z0\.s, p0/m, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZD (nmla_s4_f32_x_untied, svfloat32_t, float, ++ z0 = svnmla_n_f32_x (p0, z1, z2, d4), ++ z0 = svnmla_x (p0, z1, z2, d4)) ++ ++/* ++** nmla_2_f32_x_tied1: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** fnmla z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_2_f32_x_tied1, svfloat32_t, ++ z0 = svnmla_n_f32_x (p0, z0, z1, 2), ++ z0 = svnmla_x (p0, z0, z1, 2)) ++ ++/* ++** nmla_2_f32_x_tied2: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** fnmad z0\.s, p0/m, \1, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_2_f32_x_tied2, svfloat32_t, ++ z0 = svnmla_n_f32_x (p0, z1, z0, 2), ++ z0 = svnmla_x (p0, z1, z0, 2)) ++ ++/* ++** nmla_2_f32_x_untied: ++** fmov z0\.s, #2\.0(?:e\+0)? ++** fnmad z0\.s, p0/m, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_2_f32_x_untied, svfloat32_t, ++ z0 = svnmla_n_f32_x (p0, z1, z2, 2), ++ z0 = svnmla_x (p0, z1, z2, 2)) ++ ++/* ++** ptrue_nmla_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmla_f32_x_tied1, svfloat32_t, ++ z0 = svnmla_f32_x (svptrue_b32 (), z0, z1, z2), ++ z0 = svnmla_x (svptrue_b32 (), z0, z1, z2)) ++ ++/* ++** ptrue_nmla_f32_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmla_f32_x_tied2, svfloat32_t, ++ z0 = svnmla_f32_x (svptrue_b32 (), z1, z0, z2), ++ z0 = svnmla_x (svptrue_b32 (), z1, z0, z2)) ++ ++/* ++** ptrue_nmla_f32_x_tied3: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmla_f32_x_tied3, svfloat32_t, ++ z0 = svnmla_f32_x (svptrue_b32 (), z1, z2, z0), ++ z0 = svnmla_x (svptrue_b32 (), z1, z2, z0)) ++ ++/* ++** ptrue_nmla_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmla_f32_x_untied, svfloat32_t, ++ z0 = svnmla_f32_x (svptrue_b32 (), z1, z2, z3), ++ z0 = svnmla_x (svptrue_b32 (), z1, z2, z3)) ++ ++/* ++** ptrue_nmla_2_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmla_2_f32_x_tied1, svfloat32_t, ++ z0 = svnmla_n_f32_x (svptrue_b32 (), z0, z1, 2), ++ z0 = svnmla_x (svptrue_b32 (), z0, z1, 2)) ++ ++/* ++** ptrue_nmla_2_f32_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmla_2_f32_x_tied2, svfloat32_t, ++ z0 = svnmla_n_f32_x (svptrue_b32 (), z1, z0, 2), ++ z0 = svnmla_x (svptrue_b32 (), z1, z0, 2)) ++ ++/* ++** ptrue_nmla_2_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmla_2_f32_x_untied, svfloat32_t, ++ z0 = svnmla_n_f32_x (svptrue_b32 (), z1, z2, 2), ++ z0 = svnmla_x (svptrue_b32 (), z1, z2, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmla_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmla_f64.c +new file mode 100644 +index 000000000..441821f60 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmla_f64.c +@@ -0,0 +1,398 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** nmla_f64_m_tied1: ++** fnmla z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_f64_m_tied1, svfloat64_t, ++ z0 = svnmla_f64_m (p0, z0, z1, z2), ++ z0 = svnmla_m (p0, z0, z1, z2)) ++ ++/* ++** nmla_f64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fnmla z0\.d, p0/m, \1, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_f64_m_tied2, svfloat64_t, ++ z0 = svnmla_f64_m (p0, z1, z0, z2), ++ z0 = svnmla_m (p0, z1, z0, z2)) ++ ++/* ++** nmla_f64_m_tied3: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fnmla z0\.d, p0/m, z2\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_f64_m_tied3, svfloat64_t, ++ z0 = svnmla_f64_m (p0, z1, z2, z0), ++ z0 = svnmla_m (p0, z1, z2, z0)) ++ ++/* ++** nmla_f64_m_untied: ++** movprfx z0, z1 ++** fnmla z0\.d, p0/m, z2\.d, z3\.d ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_f64_m_untied, svfloat64_t, ++ z0 = svnmla_f64_m (p0, z1, z2, z3), ++ z0 = svnmla_m (p0, z1, z2, z3)) ++ ++/* ++** nmla_d4_f64_m_tied1: ++** mov (z[0-9]+\.d), d4 ++** fnmla z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmla_d4_f64_m_tied1, svfloat64_t, double, ++ z0 = svnmla_n_f64_m (p0, z0, z1, d4), ++ z0 = svnmla_m (p0, z0, z1, d4)) ++ ++/* ++** nmla_d4_f64_m_untied: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0, z1 ++** fnmla z0\.d, p0/m, z2\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmla_d4_f64_m_untied, svfloat64_t, double, ++ z0 = svnmla_n_f64_m (p0, z1, z2, d4), ++ z0 = svnmla_m (p0, z1, z2, d4)) ++ ++/* ++** nmla_2_f64_m_tied1: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** fnmla z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_2_f64_m_tied1, svfloat64_t, ++ z0 = svnmla_n_f64_m (p0, z0, z1, 2), ++ z0 = svnmla_m (p0, z0, z1, 2)) ++ ++/* ++** nmla_2_f64_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
++** movprfx z0, z1 ++** fnmla z0\.d, p0/m, z2\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_2_f64_m_untied, svfloat64_t, ++ z0 = svnmla_n_f64_m (p0, z1, z2, 2), ++ z0 = svnmla_m (p0, z1, z2, 2)) ++ ++/* ++** nmla_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fnmla z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_f64_z_tied1, svfloat64_t, ++ z0 = svnmla_f64_z (p0, z0, z1, z2), ++ z0 = svnmla_z (p0, z0, z1, z2)) ++ ++/* ++** nmla_f64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** fnmad z0\.d, p0/m, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_f64_z_tied2, svfloat64_t, ++ z0 = svnmla_f64_z (p0, z1, z0, z2), ++ z0 = svnmla_z (p0, z1, z0, z2)) ++ ++/* ++** nmla_f64_z_tied3: ++** movprfx z0\.d, p0/z, z0\.d ++** fnmad z0\.d, p0/m, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_f64_z_tied3, svfloat64_t, ++ z0 = svnmla_f64_z (p0, z1, z2, z0), ++ z0 = svnmla_z (p0, z1, z2, z0)) ++ ++/* ++** nmla_f64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fnmla z0\.d, p0/m, z2\.d, z3\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** fnmad z0\.d, p0/m, z3\.d, z1\.d ++** | ++** movprfx z0\.d, p0/z, z3\.d ++** fnmad z0\.d, p0/m, z2\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_f64_z_untied, svfloat64_t, ++ z0 = svnmla_f64_z (p0, z1, z2, z3), ++ z0 = svnmla_z (p0, z1, z2, z3)) ++ ++/* ++** nmla_d4_f64_z_tied1: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0\.d, p0/z, z0\.d ++** fnmla z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmla_d4_f64_z_tied1, svfloat64_t, double, ++ z0 = svnmla_n_f64_z (p0, z0, z1, d4), ++ z0 = svnmla_z (p0, z0, z1, d4)) ++ ++/* ++** nmla_d4_f64_z_tied2: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0\.d, p0/z, z0\.d ++** fnmad z0\.d, p0/m, \1, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZD (nmla_d4_f64_z_tied2, svfloat64_t, double, ++ z0 = svnmla_n_f64_z (p0, z1, z0, d4), ++ z0 = svnmla_z (p0, z1, z0, d4)) ++ ++/* ++** nmla_d4_f64_z_untied: ++** mov (z[0-9]+\.d), d4 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fnmla z0\.d, p0/m, z2\.d, \1 ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** fnmad z0\.d, p0/m, \1, z1\.d ++** | ++** movprfx z0\.d, p0/z, \1 ++** fnmad z0\.d, p0/m, z2\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (nmla_d4_f64_z_untied, svfloat64_t, double, ++ z0 = svnmla_n_f64_z (p0, z1, z2, d4), ++ z0 = svnmla_z (p0, z1, z2, d4)) ++ ++/* ++** nmla_2_f64_z_tied1: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** movprfx z0\.d, p0/z, z0\.d ++** fnmla z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_2_f64_z_tied1, svfloat64_t, ++ z0 = svnmla_n_f64_z (p0, z0, z1, 2), ++ z0 = svnmla_z (p0, z0, z1, 2)) ++ ++/* ++** nmla_2_f64_z_tied2: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** movprfx z0\.d, p0/z, z0\.d ++** fnmad z0\.d, p0/m, \1, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_2_f64_z_tied2, svfloat64_t, ++ z0 = svnmla_n_f64_z (p0, z1, z0, 2), ++ z0 = svnmla_z (p0, z1, z0, 2)) ++ ++/* ++** nmla_2_f64_z_untied: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fnmla z0\.d, p0/m, z2\.d, \1 ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** fnmad z0\.d, p0/m, \1, z1\.d ++** | ++** movprfx z0\.d, p0/z, \1 ++** fnmad z0\.d, p0/m, z2\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_2_f64_z_untied, svfloat64_t, ++ z0 = svnmla_n_f64_z (p0, z1, z2, 2), ++ z0 = svnmla_z (p0, z1, z2, 2)) ++ ++/* ++** nmla_f64_x_tied1: ++** fnmla z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_f64_x_tied1, svfloat64_t, ++ z0 = svnmla_f64_x (p0, z0, z1, z2), ++ z0 = svnmla_x (p0, z0, z1, z2)) ++ ++/* ++** nmla_f64_x_tied2: ++** fnmad z0\.d, p0/m, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_f64_x_tied2, svfloat64_t, ++ z0 = svnmla_f64_x (p0, z1, z0, z2), ++ z0 = svnmla_x (p0, z1, z0, z2)) ++ ++/* ++** nmla_f64_x_tied3: ++** fnmad z0\.d, p0/m, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_f64_x_tied3, svfloat64_t, ++ z0 = svnmla_f64_x (p0, z1, z2, z0), ++ z0 = svnmla_x (p0, z1, z2, z0)) ++ ++/* ++** nmla_f64_x_untied: ++** ( ++** movprfx z0, z1 ++** fnmla z0\.d, p0/m, z2\.d, z3\.d ++** | ++** movprfx z0, z2 ++** fnmad z0\.d, p0/m, z3\.d, z1\.d ++** | ++** movprfx z0, z3 ++** fnmad z0\.d, p0/m, z2\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_f64_x_untied, svfloat64_t, ++ z0 = svnmla_f64_x (p0, z1, z2, z3), ++ z0 = svnmla_x (p0, z1, z2, z3)) ++ ++/* ++** nmla_d4_f64_x_tied1: ++** mov (z[0-9]+\.d), d4 ++** fnmla z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmla_d4_f64_x_tied1, svfloat64_t, double, ++ z0 = svnmla_n_f64_x (p0, z0, z1, d4), ++ z0 = svnmla_x (p0, z0, z1, d4)) ++ ++/* ++** nmla_d4_f64_x_tied2: ++** mov (z[0-9]+\.d), d4 ++** fnmad z0\.d, p0/m, \1, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZD (nmla_d4_f64_x_tied2, svfloat64_t, double, ++ z0 = svnmla_n_f64_x (p0, z1, z0, d4), ++ z0 = svnmla_x (p0, z1, z0, d4)) ++ ++/* ++** nmla_d4_f64_x_untied: { xfail *-*-* } ++** mov z0\.d, d4 ++** fnmad z0\.d, p0/m, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZD (nmla_d4_f64_x_untied, svfloat64_t, double, ++ z0 = svnmla_n_f64_x (p0, z1, z2, d4), ++ z0 = svnmla_x (p0, z1, z2, d4)) ++ ++/* ++** nmla_2_f64_x_tied1: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** fnmla z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_2_f64_x_tied1, svfloat64_t, ++ z0 = svnmla_n_f64_x (p0, z0, z1, 2), ++ z0 = svnmla_x (p0, z0, z1, 2)) ++ ++/* ++** nmla_2_f64_x_tied2: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** fnmad z0\.d, p0/m, \1, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_2_f64_x_tied2, svfloat64_t, ++ z0 = svnmla_n_f64_x (p0, z1, z0, 2), ++ z0 = svnmla_x (p0, z1, z0, 2)) ++ ++/* ++** nmla_2_f64_x_untied: ++** fmov z0\.d, #2\.0(?:e\+0)? ++** fnmad z0\.d, p0/m, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (nmla_2_f64_x_untied, svfloat64_t, ++ z0 = svnmla_n_f64_x (p0, z1, z2, 2), ++ z0 = svnmla_x (p0, z1, z2, 2)) ++ ++/* ++** ptrue_nmla_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmla_f64_x_tied1, svfloat64_t, ++ z0 = svnmla_f64_x (svptrue_b64 (), z0, z1, z2), ++ z0 = svnmla_x (svptrue_b64 (), z0, z1, z2)) ++ ++/* ++** ptrue_nmla_f64_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmla_f64_x_tied2, svfloat64_t, ++ z0 = svnmla_f64_x (svptrue_b64 (), z1, z0, z2), ++ z0 = svnmla_x (svptrue_b64 (), z1, z0, z2)) ++ ++/* ++** ptrue_nmla_f64_x_tied3: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmla_f64_x_tied3, svfloat64_t, ++ z0 = svnmla_f64_x (svptrue_b64 (), z1, z2, z0), ++ z0 = svnmla_x (svptrue_b64 (), z1, z2, z0)) ++ ++/* ++** ptrue_nmla_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmla_f64_x_untied, svfloat64_t, ++ z0 = svnmla_f64_x (svptrue_b64 (), z1, z2, z3), ++ z0 = svnmla_x (svptrue_b64 (), z1, z2, z3)) ++ ++/* ++** ptrue_nmla_2_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmla_2_f64_x_tied1, svfloat64_t, ++ z0 = svnmla_n_f64_x (svptrue_b64 (), z0, z1, 2), ++ z0 = svnmla_x (svptrue_b64 (), z0, z1, 2)) ++ ++/* ++** ptrue_nmla_2_f64_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmla_2_f64_x_tied2, svfloat64_t, ++ z0 = svnmla_n_f64_x (svptrue_b64 (), z1, z0, 2), ++ z0 = svnmla_x (svptrue_b64 (), z1, z0, 2)) ++ ++/* ++** ptrue_nmla_2_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmla_2_f64_x_untied, svfloat64_t, ++ z0 = svnmla_n_f64_x (svptrue_b64 (), z1, z2, 2), ++ z0 = svnmla_x (svptrue_b64 (), z1, z2, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmls_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmls_f16.c +new file mode 100644 +index 000000000..8aa6c7509 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmls_f16.c +@@ -0,0 +1,398 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** nmls_f16_m_tied1: ++** fnmls z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_f16_m_tied1, svfloat16_t, ++ z0 = svnmls_f16_m (p0, z0, z1, z2), ++ z0 = svnmls_m (p0, z0, z1, z2)) ++ ++/* ++** nmls_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fnmls z0\.h, p0/m, \1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_f16_m_tied2, svfloat16_t, ++ z0 = svnmls_f16_m (p0, z1, z0, z2), ++ z0 = svnmls_m (p0, z1, z0, z2)) ++ ++/* ++** nmls_f16_m_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fnmls z0\.h, p0/m, z2\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_f16_m_tied3, svfloat16_t, ++ z0 = svnmls_f16_m (p0, z1, z2, z0), ++ z0 = svnmls_m (p0, z1, z2, z0)) ++ ++/* ++** nmls_f16_m_untied: ++** movprfx z0, z1 ++** fnmls z0\.h, p0/m, z2\.h, z3\.h ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_f16_m_untied, svfloat16_t, ++ z0 = svnmls_f16_m (p0, z1, z2, z3), ++ z0 = svnmls_m (p0, z1, z2, z3)) ++ ++/* ++** nmls_h4_f16_m_tied1: ++** mov (z[0-9]+\.h), h4 ++** fnmls z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmls_h4_f16_m_tied1, svfloat16_t, __fp16, ++ z0 = svnmls_n_f16_m (p0, z0, z1, d4), ++ z0 = svnmls_m (p0, z0, z1, d4)) ++ ++/* ++** nmls_h4_f16_m_untied: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0, z1 ++** fnmls z0\.h, p0/m, z2\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmls_h4_f16_m_untied, svfloat16_t, __fp16, ++ z0 = svnmls_n_f16_m (p0, z1, z2, d4), ++ z0 = svnmls_m (p0, z1, z2, d4)) ++ ++/* ++** nmls_2_f16_m_tied1: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** fnmls z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_2_f16_m_tied1, svfloat16_t, ++ z0 = svnmls_n_f16_m (p0, z0, z1, 2), ++ z0 = svnmls_m (p0, z0, z1, 2)) ++ ++/* ++** nmls_2_f16_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? 
++** movprfx z0, z1 ++** fnmls z0\.h, p0/m, z2\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_2_f16_m_untied, svfloat16_t, ++ z0 = svnmls_n_f16_m (p0, z1, z2, 2), ++ z0 = svnmls_m (p0, z1, z2, 2)) ++ ++/* ++** nmls_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fnmls z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_f16_z_tied1, svfloat16_t, ++ z0 = svnmls_f16_z (p0, z0, z1, z2), ++ z0 = svnmls_z (p0, z0, z1, z2)) ++ ++/* ++** nmls_f16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** fnmsb z0\.h, p0/m, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_f16_z_tied2, svfloat16_t, ++ z0 = svnmls_f16_z (p0, z1, z0, z2), ++ z0 = svnmls_z (p0, z1, z0, z2)) ++ ++/* ++** nmls_f16_z_tied3: ++** movprfx z0\.h, p0/z, z0\.h ++** fnmsb z0\.h, p0/m, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_f16_z_tied3, svfloat16_t, ++ z0 = svnmls_f16_z (p0, z1, z2, z0), ++ z0 = svnmls_z (p0, z1, z2, z0)) ++ ++/* ++** nmls_f16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fnmls z0\.h, p0/m, z2\.h, z3\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** fnmsb z0\.h, p0/m, z3\.h, z1\.h ++** | ++** movprfx z0\.h, p0/z, z3\.h ++** fnmsb z0\.h, p0/m, z2\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_f16_z_untied, svfloat16_t, ++ z0 = svnmls_f16_z (p0, z1, z2, z3), ++ z0 = svnmls_z (p0, z1, z2, z3)) ++ ++/* ++** nmls_h4_f16_z_tied1: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0\.h, p0/z, z0\.h ++** fnmls z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmls_h4_f16_z_tied1, svfloat16_t, __fp16, ++ z0 = svnmls_n_f16_z (p0, z0, z1, d4), ++ z0 = svnmls_z (p0, z0, z1, d4)) ++ ++/* ++** nmls_h4_f16_z_tied2: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0\.h, p0/z, z0\.h ++** fnmsb z0\.h, p0/m, \1, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZD (nmls_h4_f16_z_tied2, svfloat16_t, __fp16, ++ z0 = svnmls_n_f16_z (p0, z1, z0, d4), ++ z0 = svnmls_z (p0, z1, z0, d4)) ++ ++/* ++** nmls_h4_f16_z_untied: ++** mov (z[0-9]+\.h), h4 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fnmls z0\.h, p0/m, z2\.h, \1 ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** fnmsb z0\.h, p0/m, \1, z1\.h ++** | ++** movprfx z0\.h, p0/z, \1 ++** fnmsb z0\.h, p0/m, z2\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (nmls_h4_f16_z_untied, svfloat16_t, __fp16, ++ z0 = svnmls_n_f16_z (p0, z1, z2, d4), ++ z0 = svnmls_z (p0, z1, z2, d4)) ++ ++/* ++** nmls_2_f16_z_tied1: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** movprfx z0\.h, p0/z, z0\.h ++** fnmls z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_2_f16_z_tied1, svfloat16_t, ++ z0 = svnmls_n_f16_z (p0, z0, z1, 2), ++ z0 = svnmls_z (p0, z0, z1, 2)) ++ ++/* ++** nmls_2_f16_z_tied2: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** movprfx z0\.h, p0/z, z0\.h ++** fnmsb z0\.h, p0/m, \1, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_2_f16_z_tied2, svfloat16_t, ++ z0 = svnmls_n_f16_z (p0, z1, z0, 2), ++ z0 = svnmls_z (p0, z1, z0, 2)) ++ ++/* ++** nmls_2_f16_z_untied: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? 
++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fnmls z0\.h, p0/m, z2\.h, \1 ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** fnmsb z0\.h, p0/m, \1, z1\.h ++** | ++** movprfx z0\.h, p0/z, \1 ++** fnmsb z0\.h, p0/m, z2\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_2_f16_z_untied, svfloat16_t, ++ z0 = svnmls_n_f16_z (p0, z1, z2, 2), ++ z0 = svnmls_z (p0, z1, z2, 2)) ++ ++/* ++** nmls_f16_x_tied1: ++** fnmls z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_f16_x_tied1, svfloat16_t, ++ z0 = svnmls_f16_x (p0, z0, z1, z2), ++ z0 = svnmls_x (p0, z0, z1, z2)) ++ ++/* ++** nmls_f16_x_tied2: ++** fnmsb z0\.h, p0/m, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_f16_x_tied2, svfloat16_t, ++ z0 = svnmls_f16_x (p0, z1, z0, z2), ++ z0 = svnmls_x (p0, z1, z0, z2)) ++ ++/* ++** nmls_f16_x_tied3: ++** fnmsb z0\.h, p0/m, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_f16_x_tied3, svfloat16_t, ++ z0 = svnmls_f16_x (p0, z1, z2, z0), ++ z0 = svnmls_x (p0, z1, z2, z0)) ++ ++/* ++** nmls_f16_x_untied: ++** ( ++** movprfx z0, z1 ++** fnmls z0\.h, p0/m, z2\.h, z3\.h ++** | ++** movprfx z0, z2 ++** fnmsb z0\.h, p0/m, z3\.h, z1\.h ++** | ++** movprfx z0, z3 ++** fnmsb z0\.h, p0/m, z2\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_f16_x_untied, svfloat16_t, ++ z0 = svnmls_f16_x (p0, z1, z2, z3), ++ z0 = svnmls_x (p0, z1, z2, z3)) ++ ++/* ++** nmls_h4_f16_x_tied1: ++** mov (z[0-9]+\.h), h4 ++** fnmls z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmls_h4_f16_x_tied1, svfloat16_t, __fp16, ++ z0 = svnmls_n_f16_x (p0, z0, z1, d4), ++ z0 = svnmls_x (p0, z0, z1, d4)) ++ ++/* ++** nmls_h4_f16_x_tied2: ++** mov (z[0-9]+\.h), h4 ++** fnmsb z0\.h, p0/m, \1, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZD (nmls_h4_f16_x_tied2, svfloat16_t, __fp16, ++ z0 = svnmls_n_f16_x (p0, z1, z0, d4), ++ z0 = svnmls_x (p0, z1, z0, d4)) ++ ++/* ++** nmls_h4_f16_x_untied: { xfail *-*-* } ++** mov z0\.h, h4 ++** fnmsb z0\.h, p0/m, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZD (nmls_h4_f16_x_untied, svfloat16_t, __fp16, ++ z0 = svnmls_n_f16_x (p0, z1, z2, d4), ++ z0 = svnmls_x (p0, z1, z2, d4)) ++ ++/* ++** nmls_2_f16_x_tied1: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** fnmls z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_2_f16_x_tied1, svfloat16_t, ++ z0 = svnmls_n_f16_x (p0, z0, z1, 2), ++ z0 = svnmls_x (p0, z0, z1, 2)) ++ ++/* ++** nmls_2_f16_x_tied2: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** fnmsb z0\.h, p0/m, \1, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_2_f16_x_tied2, svfloat16_t, ++ z0 = svnmls_n_f16_x (p0, z1, z0, 2), ++ z0 = svnmls_x (p0, z1, z0, 2)) ++ ++/* ++** nmls_2_f16_x_untied: ++** fmov z0\.h, #2\.0(?:e\+0)? ++** fnmsb z0\.h, p0/m, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_2_f16_x_untied, svfloat16_t, ++ z0 = svnmls_n_f16_x (p0, z1, z2, 2), ++ z0 = svnmls_x (p0, z1, z2, 2)) ++ ++/* ++** ptrue_nmls_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmls_f16_x_tied1, svfloat16_t, ++ z0 = svnmls_f16_x (svptrue_b16 (), z0, z1, z2), ++ z0 = svnmls_x (svptrue_b16 (), z0, z1, z2)) ++ ++/* ++** ptrue_nmls_f16_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmls_f16_x_tied2, svfloat16_t, ++ z0 = svnmls_f16_x (svptrue_b16 (), z1, z0, z2), ++ z0 = svnmls_x (svptrue_b16 (), z1, z0, z2)) ++ ++/* ++** ptrue_nmls_f16_x_tied3: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmls_f16_x_tied3, svfloat16_t, ++ z0 = svnmls_f16_x (svptrue_b16 (), z1, z2, z0), ++ z0 = svnmls_x (svptrue_b16 (), z1, z2, z0)) ++ ++/* ++** ptrue_nmls_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmls_f16_x_untied, svfloat16_t, ++ z0 = svnmls_f16_x (svptrue_b16 (), z1, z2, z3), ++ z0 = svnmls_x (svptrue_b16 (), z1, z2, z3)) ++ ++/* ++** ptrue_nmls_2_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmls_2_f16_x_tied1, svfloat16_t, ++ z0 = svnmls_n_f16_x (svptrue_b16 (), z0, z1, 2), ++ z0 = svnmls_x (svptrue_b16 (), z0, z1, 2)) ++ ++/* ++** ptrue_nmls_2_f16_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmls_2_f16_x_tied2, svfloat16_t, ++ z0 = svnmls_n_f16_x (svptrue_b16 (), z1, z0, 2), ++ z0 = svnmls_x (svptrue_b16 (), z1, z0, 2)) ++ ++/* ++** ptrue_nmls_2_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmls_2_f16_x_untied, svfloat16_t, ++ z0 = svnmls_n_f16_x (svptrue_b16 (), z1, z2, 2), ++ z0 = svnmls_x (svptrue_b16 (), z1, z2, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmls_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmls_f32.c +new file mode 100644 +index 000000000..42ea13fac +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmls_f32.c +@@ -0,0 +1,398 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** nmls_f32_m_tied1: ++** fnmls z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_f32_m_tied1, svfloat32_t, ++ z0 = svnmls_f32_m (p0, z0, z1, z2), ++ z0 = svnmls_m (p0, z0, z1, z2)) ++ ++/* ++** nmls_f32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fnmls z0\.s, p0/m, \1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_f32_m_tied2, svfloat32_t, ++ z0 = svnmls_f32_m (p0, z1, z0, z2), ++ z0 = svnmls_m (p0, z1, z0, z2)) ++ ++/* ++** nmls_f32_m_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fnmls z0\.s, p0/m, z2\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_f32_m_tied3, svfloat32_t, ++ z0 = svnmls_f32_m (p0, z1, z2, z0), ++ z0 = svnmls_m (p0, z1, z2, z0)) ++ ++/* ++** nmls_f32_m_untied: ++** movprfx z0, z1 ++** fnmls z0\.s, p0/m, z2\.s, z3\.s ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_f32_m_untied, svfloat32_t, ++ z0 = svnmls_f32_m (p0, z1, z2, z3), ++ z0 = svnmls_m (p0, z1, z2, z3)) ++ ++/* ++** nmls_s4_f32_m_tied1: ++** mov (z[0-9]+\.s), s4 ++** fnmls z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmls_s4_f32_m_tied1, svfloat32_t, float, ++ z0 = svnmls_n_f32_m (p0, z0, z1, d4), ++ z0 = svnmls_m (p0, z0, z1, d4)) ++ ++/* ++** nmls_s4_f32_m_untied: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0, z1 ++** fnmls z0\.s, p0/m, z2\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmls_s4_f32_m_untied, svfloat32_t, float, ++ z0 = svnmls_n_f32_m (p0, z1, z2, d4), ++ z0 = svnmls_m (p0, z1, z2, d4)) ++ ++/* ++** nmls_2_f32_m_tied1: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** fnmls z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_2_f32_m_tied1, svfloat32_t, ++ z0 = svnmls_n_f32_m (p0, z0, z1, 2), ++ z0 = svnmls_m (p0, z0, z1, 2)) ++ ++/* ++** nmls_2_f32_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
++** movprfx z0, z1 ++** fnmls z0\.s, p0/m, z2\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_2_f32_m_untied, svfloat32_t, ++ z0 = svnmls_n_f32_m (p0, z1, z2, 2), ++ z0 = svnmls_m (p0, z1, z2, 2)) ++ ++/* ++** nmls_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fnmls z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_f32_z_tied1, svfloat32_t, ++ z0 = svnmls_f32_z (p0, z0, z1, z2), ++ z0 = svnmls_z (p0, z0, z1, z2)) ++ ++/* ++** nmls_f32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** fnmsb z0\.s, p0/m, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_f32_z_tied2, svfloat32_t, ++ z0 = svnmls_f32_z (p0, z1, z0, z2), ++ z0 = svnmls_z (p0, z1, z0, z2)) ++ ++/* ++** nmls_f32_z_tied3: ++** movprfx z0\.s, p0/z, z0\.s ++** fnmsb z0\.s, p0/m, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_f32_z_tied3, svfloat32_t, ++ z0 = svnmls_f32_z (p0, z1, z2, z0), ++ z0 = svnmls_z (p0, z1, z2, z0)) ++ ++/* ++** nmls_f32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fnmls z0\.s, p0/m, z2\.s, z3\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** fnmsb z0\.s, p0/m, z3\.s, z1\.s ++** | ++** movprfx z0\.s, p0/z, z3\.s ++** fnmsb z0\.s, p0/m, z2\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_f32_z_untied, svfloat32_t, ++ z0 = svnmls_f32_z (p0, z1, z2, z3), ++ z0 = svnmls_z (p0, z1, z2, z3)) ++ ++/* ++** nmls_s4_f32_z_tied1: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0\.s, p0/z, z0\.s ++** fnmls z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmls_s4_f32_z_tied1, svfloat32_t, float, ++ z0 = svnmls_n_f32_z (p0, z0, z1, d4), ++ z0 = svnmls_z (p0, z0, z1, d4)) ++ ++/* ++** nmls_s4_f32_z_tied2: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0\.s, p0/z, z0\.s ++** fnmsb z0\.s, p0/m, \1, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZD (nmls_s4_f32_z_tied2, svfloat32_t, float, ++ z0 = svnmls_n_f32_z (p0, z1, z0, d4), ++ z0 = svnmls_z (p0, z1, z0, d4)) ++ ++/* ++** nmls_s4_f32_z_untied: ++** mov (z[0-9]+\.s), s4 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fnmls z0\.s, p0/m, z2\.s, \1 ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** fnmsb z0\.s, p0/m, \1, z1\.s ++** | ++** movprfx z0\.s, p0/z, \1 ++** fnmsb z0\.s, p0/m, z2\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (nmls_s4_f32_z_untied, svfloat32_t, float, ++ z0 = svnmls_n_f32_z (p0, z1, z2, d4), ++ z0 = svnmls_z (p0, z1, z2, d4)) ++ ++/* ++** nmls_2_f32_z_tied1: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** movprfx z0\.s, p0/z, z0\.s ++** fnmls z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_2_f32_z_tied1, svfloat32_t, ++ z0 = svnmls_n_f32_z (p0, z0, z1, 2), ++ z0 = svnmls_z (p0, z0, z1, 2)) ++ ++/* ++** nmls_2_f32_z_tied2: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** movprfx z0\.s, p0/z, z0\.s ++** fnmsb z0\.s, p0/m, \1, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_2_f32_z_tied2, svfloat32_t, ++ z0 = svnmls_n_f32_z (p0, z1, z0, 2), ++ z0 = svnmls_z (p0, z1, z0, 2)) ++ ++/* ++** nmls_2_f32_z_untied: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fnmls z0\.s, p0/m, z2\.s, \1 ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** fnmsb z0\.s, p0/m, \1, z1\.s ++** | ++** movprfx z0\.s, p0/z, \1 ++** fnmsb z0\.s, p0/m, z2\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_2_f32_z_untied, svfloat32_t, ++ z0 = svnmls_n_f32_z (p0, z1, z2, 2), ++ z0 = svnmls_z (p0, z1, z2, 2)) ++ ++/* ++** nmls_f32_x_tied1: ++** fnmls z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_f32_x_tied1, svfloat32_t, ++ z0 = svnmls_f32_x (p0, z0, z1, z2), ++ z0 = svnmls_x (p0, z0, z1, z2)) ++ ++/* ++** nmls_f32_x_tied2: ++** fnmsb z0\.s, p0/m, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_f32_x_tied2, svfloat32_t, ++ z0 = svnmls_f32_x (p0, z1, z0, z2), ++ z0 = svnmls_x (p0, z1, z0, z2)) ++ ++/* ++** nmls_f32_x_tied3: ++** fnmsb z0\.s, p0/m, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_f32_x_tied3, svfloat32_t, ++ z0 = svnmls_f32_x (p0, z1, z2, z0), ++ z0 = svnmls_x (p0, z1, z2, z0)) ++ ++/* ++** nmls_f32_x_untied: ++** ( ++** movprfx z0, z1 ++** fnmls z0\.s, p0/m, z2\.s, z3\.s ++** | ++** movprfx z0, z2 ++** fnmsb z0\.s, p0/m, z3\.s, z1\.s ++** | ++** movprfx z0, z3 ++** fnmsb z0\.s, p0/m, z2\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_f32_x_untied, svfloat32_t, ++ z0 = svnmls_f32_x (p0, z1, z2, z3), ++ z0 = svnmls_x (p0, z1, z2, z3)) ++ ++/* ++** nmls_s4_f32_x_tied1: ++** mov (z[0-9]+\.s), s4 ++** fnmls z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmls_s4_f32_x_tied1, svfloat32_t, float, ++ z0 = svnmls_n_f32_x (p0, z0, z1, d4), ++ z0 = svnmls_x (p0, z0, z1, d4)) ++ ++/* ++** nmls_s4_f32_x_tied2: ++** mov (z[0-9]+\.s), s4 ++** fnmsb z0\.s, p0/m, \1, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZD (nmls_s4_f32_x_tied2, svfloat32_t, float, ++ z0 = svnmls_n_f32_x (p0, z1, z0, d4), ++ z0 = svnmls_x (p0, z1, z0, d4)) ++ ++/* ++** nmls_s4_f32_x_untied: { xfail *-*-* } ++** mov z0\.s, s4 ++** fnmsb z0\.s, p0/m, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZD (nmls_s4_f32_x_untied, svfloat32_t, float, ++ z0 = svnmls_n_f32_x (p0, z1, z2, d4), ++ z0 = svnmls_x (p0, z1, z2, d4)) ++ ++/* ++** nmls_2_f32_x_tied1: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** fnmls z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_2_f32_x_tied1, svfloat32_t, ++ z0 = svnmls_n_f32_x (p0, z0, z1, 2), ++ z0 = svnmls_x (p0, z0, z1, 2)) ++ ++/* ++** nmls_2_f32_x_tied2: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** fnmsb z0\.s, p0/m, \1, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_2_f32_x_tied2, svfloat32_t, ++ z0 = svnmls_n_f32_x (p0, z1, z0, 2), ++ z0 = svnmls_x (p0, z1, z0, 2)) ++ ++/* ++** nmls_2_f32_x_untied: ++** fmov z0\.s, #2\.0(?:e\+0)? ++** fnmsb z0\.s, p0/m, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_2_f32_x_untied, svfloat32_t, ++ z0 = svnmls_n_f32_x (p0, z1, z2, 2), ++ z0 = svnmls_x (p0, z1, z2, 2)) ++ ++/* ++** ptrue_nmls_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmls_f32_x_tied1, svfloat32_t, ++ z0 = svnmls_f32_x (svptrue_b32 (), z0, z1, z2), ++ z0 = svnmls_x (svptrue_b32 (), z0, z1, z2)) ++ ++/* ++** ptrue_nmls_f32_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmls_f32_x_tied2, svfloat32_t, ++ z0 = svnmls_f32_x (svptrue_b32 (), z1, z0, z2), ++ z0 = svnmls_x (svptrue_b32 (), z1, z0, z2)) ++ ++/* ++** ptrue_nmls_f32_x_tied3: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmls_f32_x_tied3, svfloat32_t, ++ z0 = svnmls_f32_x (svptrue_b32 (), z1, z2, z0), ++ z0 = svnmls_x (svptrue_b32 (), z1, z2, z0)) ++ ++/* ++** ptrue_nmls_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmls_f32_x_untied, svfloat32_t, ++ z0 = svnmls_f32_x (svptrue_b32 (), z1, z2, z3), ++ z0 = svnmls_x (svptrue_b32 (), z1, z2, z3)) ++ ++/* ++** ptrue_nmls_2_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmls_2_f32_x_tied1, svfloat32_t, ++ z0 = svnmls_n_f32_x (svptrue_b32 (), z0, z1, 2), ++ z0 = svnmls_x (svptrue_b32 (), z0, z1, 2)) ++ ++/* ++** ptrue_nmls_2_f32_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmls_2_f32_x_tied2, svfloat32_t, ++ z0 = svnmls_n_f32_x (svptrue_b32 (), z1, z0, 2), ++ z0 = svnmls_x (svptrue_b32 (), z1, z0, 2)) ++ ++/* ++** ptrue_nmls_2_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmls_2_f32_x_untied, svfloat32_t, ++ z0 = svnmls_n_f32_x (svptrue_b32 (), z1, z2, 2), ++ z0 = svnmls_x (svptrue_b32 (), z1, z2, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmls_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmls_f64.c +new file mode 100644 +index 000000000..994c2a74e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmls_f64.c +@@ -0,0 +1,398 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** nmls_f64_m_tied1: ++** fnmls z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_f64_m_tied1, svfloat64_t, ++ z0 = svnmls_f64_m (p0, z0, z1, z2), ++ z0 = svnmls_m (p0, z0, z1, z2)) ++ ++/* ++** nmls_f64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fnmls z0\.d, p0/m, \1, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_f64_m_tied2, svfloat64_t, ++ z0 = svnmls_f64_m (p0, z1, z0, z2), ++ z0 = svnmls_m (p0, z1, z0, z2)) ++ ++/* ++** nmls_f64_m_tied3: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fnmls z0\.d, p0/m, z2\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_f64_m_tied3, svfloat64_t, ++ z0 = svnmls_f64_m (p0, z1, z2, z0), ++ z0 = svnmls_m (p0, z1, z2, z0)) ++ ++/* ++** nmls_f64_m_untied: ++** movprfx z0, z1 ++** fnmls z0\.d, p0/m, z2\.d, z3\.d ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_f64_m_untied, svfloat64_t, ++ z0 = svnmls_f64_m (p0, z1, z2, z3), ++ z0 = svnmls_m (p0, z1, z2, z3)) ++ ++/* ++** nmls_d4_f64_m_tied1: ++** mov (z[0-9]+\.d), d4 ++** fnmls z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmls_d4_f64_m_tied1, svfloat64_t, double, ++ z0 = svnmls_n_f64_m (p0, z0, z1, d4), ++ z0 = svnmls_m (p0, z0, z1, d4)) ++ ++/* ++** nmls_d4_f64_m_untied: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0, z1 ++** fnmls z0\.d, p0/m, z2\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmls_d4_f64_m_untied, svfloat64_t, double, ++ z0 = svnmls_n_f64_m (p0, z1, z2, d4), ++ z0 = svnmls_m (p0, z1, z2, d4)) ++ ++/* ++** nmls_2_f64_m_tied1: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** fnmls z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_2_f64_m_tied1, svfloat64_t, ++ z0 = svnmls_n_f64_m (p0, z0, z1, 2), ++ z0 = svnmls_m (p0, z0, z1, 2)) ++ ++/* ++** nmls_2_f64_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
++** movprfx z0, z1 ++** fnmls z0\.d, p0/m, z2\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_2_f64_m_untied, svfloat64_t, ++ z0 = svnmls_n_f64_m (p0, z1, z2, 2), ++ z0 = svnmls_m (p0, z1, z2, 2)) ++ ++/* ++** nmls_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fnmls z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_f64_z_tied1, svfloat64_t, ++ z0 = svnmls_f64_z (p0, z0, z1, z2), ++ z0 = svnmls_z (p0, z0, z1, z2)) ++ ++/* ++** nmls_f64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** fnmsb z0\.d, p0/m, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_f64_z_tied2, svfloat64_t, ++ z0 = svnmls_f64_z (p0, z1, z0, z2), ++ z0 = svnmls_z (p0, z1, z0, z2)) ++ ++/* ++** nmls_f64_z_tied3: ++** movprfx z0\.d, p0/z, z0\.d ++** fnmsb z0\.d, p0/m, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_f64_z_tied3, svfloat64_t, ++ z0 = svnmls_f64_z (p0, z1, z2, z0), ++ z0 = svnmls_z (p0, z1, z2, z0)) ++ ++/* ++** nmls_f64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fnmls z0\.d, p0/m, z2\.d, z3\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** fnmsb z0\.d, p0/m, z3\.d, z1\.d ++** | ++** movprfx z0\.d, p0/z, z3\.d ++** fnmsb z0\.d, p0/m, z2\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_f64_z_untied, svfloat64_t, ++ z0 = svnmls_f64_z (p0, z1, z2, z3), ++ z0 = svnmls_z (p0, z1, z2, z3)) ++ ++/* ++** nmls_d4_f64_z_tied1: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0\.d, p0/z, z0\.d ++** fnmls z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmls_d4_f64_z_tied1, svfloat64_t, double, ++ z0 = svnmls_n_f64_z (p0, z0, z1, d4), ++ z0 = svnmls_z (p0, z0, z1, d4)) ++ ++/* ++** nmls_d4_f64_z_tied2: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0\.d, p0/z, z0\.d ++** fnmsb z0\.d, p0/m, \1, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZD (nmls_d4_f64_z_tied2, svfloat64_t, double, ++ z0 = svnmls_n_f64_z (p0, z1, z0, d4), ++ z0 = svnmls_z (p0, z1, z0, d4)) ++ ++/* ++** nmls_d4_f64_z_untied: ++** mov (z[0-9]+\.d), d4 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fnmls z0\.d, p0/m, z2\.d, \1 ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** fnmsb z0\.d, p0/m, \1, z1\.d ++** | ++** movprfx z0\.d, p0/z, \1 ++** fnmsb z0\.d, p0/m, z2\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (nmls_d4_f64_z_untied, svfloat64_t, double, ++ z0 = svnmls_n_f64_z (p0, z1, z2, d4), ++ z0 = svnmls_z (p0, z1, z2, d4)) ++ ++/* ++** nmls_2_f64_z_tied1: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** movprfx z0\.d, p0/z, z0\.d ++** fnmls z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_2_f64_z_tied1, svfloat64_t, ++ z0 = svnmls_n_f64_z (p0, z0, z1, 2), ++ z0 = svnmls_z (p0, z0, z1, 2)) ++ ++/* ++** nmls_2_f64_z_tied2: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** movprfx z0\.d, p0/z, z0\.d ++** fnmsb z0\.d, p0/m, \1, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_2_f64_z_tied2, svfloat64_t, ++ z0 = svnmls_n_f64_z (p0, z1, z0, 2), ++ z0 = svnmls_z (p0, z1, z0, 2)) ++ ++/* ++** nmls_2_f64_z_untied: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fnmls z0\.d, p0/m, z2\.d, \1 ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** fnmsb z0\.d, p0/m, \1, z1\.d ++** | ++** movprfx z0\.d, p0/z, \1 ++** fnmsb z0\.d, p0/m, z2\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_2_f64_z_untied, svfloat64_t, ++ z0 = svnmls_n_f64_z (p0, z1, z2, 2), ++ z0 = svnmls_z (p0, z1, z2, 2)) ++ ++/* ++** nmls_f64_x_tied1: ++** fnmls z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_f64_x_tied1, svfloat64_t, ++ z0 = svnmls_f64_x (p0, z0, z1, z2), ++ z0 = svnmls_x (p0, z0, z1, z2)) ++ ++/* ++** nmls_f64_x_tied2: ++** fnmsb z0\.d, p0/m, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_f64_x_tied2, svfloat64_t, ++ z0 = svnmls_f64_x (p0, z1, z0, z2), ++ z0 = svnmls_x (p0, z1, z0, z2)) ++ ++/* ++** nmls_f64_x_tied3: ++** fnmsb z0\.d, p0/m, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_f64_x_tied3, svfloat64_t, ++ z0 = svnmls_f64_x (p0, z1, z2, z0), ++ z0 = svnmls_x (p0, z1, z2, z0)) ++ ++/* ++** nmls_f64_x_untied: ++** ( ++** movprfx z0, z1 ++** fnmls z0\.d, p0/m, z2\.d, z3\.d ++** | ++** movprfx z0, z2 ++** fnmsb z0\.d, p0/m, z3\.d, z1\.d ++** | ++** movprfx z0, z3 ++** fnmsb z0\.d, p0/m, z2\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_f64_x_untied, svfloat64_t, ++ z0 = svnmls_f64_x (p0, z1, z2, z3), ++ z0 = svnmls_x (p0, z1, z2, z3)) ++ ++/* ++** nmls_d4_f64_x_tied1: ++** mov (z[0-9]+\.d), d4 ++** fnmls z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmls_d4_f64_x_tied1, svfloat64_t, double, ++ z0 = svnmls_n_f64_x (p0, z0, z1, d4), ++ z0 = svnmls_x (p0, z0, z1, d4)) ++ ++/* ++** nmls_d4_f64_x_tied2: ++** mov (z[0-9]+\.d), d4 ++** fnmsb z0\.d, p0/m, \1, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZD (nmls_d4_f64_x_tied2, svfloat64_t, double, ++ z0 = svnmls_n_f64_x (p0, z1, z0, d4), ++ z0 = svnmls_x (p0, z1, z0, d4)) ++ ++/* ++** nmls_d4_f64_x_untied: { xfail *-*-* } ++** mov z0\.d, d4 ++** fnmsb z0\.d, p0/m, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZD (nmls_d4_f64_x_untied, svfloat64_t, double, ++ z0 = svnmls_n_f64_x (p0, z1, z2, d4), ++ z0 = svnmls_x (p0, z1, z2, d4)) ++ ++/* ++** nmls_2_f64_x_tied1: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** fnmls z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_2_f64_x_tied1, svfloat64_t, ++ z0 = svnmls_n_f64_x (p0, z0, z1, 2), ++ z0 = svnmls_x (p0, z0, z1, 2)) ++ ++/* ++** nmls_2_f64_x_tied2: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** fnmsb z0\.d, p0/m, \1, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_2_f64_x_tied2, svfloat64_t, ++ z0 = svnmls_n_f64_x (p0, z1, z0, 2), ++ z0 = svnmls_x (p0, z1, z0, 2)) ++ ++/* ++** nmls_2_f64_x_untied: ++** fmov z0\.d, #2\.0(?:e\+0)? ++** fnmsb z0\.d, p0/m, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (nmls_2_f64_x_untied, svfloat64_t, ++ z0 = svnmls_n_f64_x (p0, z1, z2, 2), ++ z0 = svnmls_x (p0, z1, z2, 2)) ++ ++/* ++** ptrue_nmls_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmls_f64_x_tied1, svfloat64_t, ++ z0 = svnmls_f64_x (svptrue_b64 (), z0, z1, z2), ++ z0 = svnmls_x (svptrue_b64 (), z0, z1, z2)) ++ ++/* ++** ptrue_nmls_f64_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmls_f64_x_tied2, svfloat64_t, ++ z0 = svnmls_f64_x (svptrue_b64 (), z1, z0, z2), ++ z0 = svnmls_x (svptrue_b64 (), z1, z0, z2)) ++ ++/* ++** ptrue_nmls_f64_x_tied3: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmls_f64_x_tied3, svfloat64_t, ++ z0 = svnmls_f64_x (svptrue_b64 (), z1, z2, z0), ++ z0 = svnmls_x (svptrue_b64 (), z1, z2, z0)) ++ ++/* ++** ptrue_nmls_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmls_f64_x_untied, svfloat64_t, ++ z0 = svnmls_f64_x (svptrue_b64 (), z1, z2, z3), ++ z0 = svnmls_x (svptrue_b64 (), z1, z2, z3)) ++ ++/* ++** ptrue_nmls_2_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmls_2_f64_x_tied1, svfloat64_t, ++ z0 = svnmls_n_f64_x (svptrue_b64 (), z0, z1, 2), ++ z0 = svnmls_x (svptrue_b64 (), z0, z1, 2)) ++ ++/* ++** ptrue_nmls_2_f64_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmls_2_f64_x_tied2, svfloat64_t, ++ z0 = svnmls_n_f64_x (svptrue_b64 (), z1, z0, 2), ++ z0 = svnmls_x (svptrue_b64 (), z1, z0, 2)) ++ ++/* ++** ptrue_nmls_2_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmls_2_f64_x_untied, svfloat64_t, ++ z0 = svnmls_n_f64_x (svptrue_b64 (), z1, z2, 2), ++ z0 = svnmls_x (svptrue_b64 (), z1, z2, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmsb_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmsb_f16.c +new file mode 100644 +index 000000000..c11401485 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmsb_f16.c +@@ -0,0 +1,398 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** nmsb_f16_m_tied1: ++** fnmsb z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_f16_m_tied1, svfloat16_t, ++ z0 = svnmsb_f16_m (p0, z0, z1, z2), ++ z0 = svnmsb_m (p0, z0, z1, z2)) ++ ++/* ++** nmsb_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fnmsb z0\.h, p0/m, \1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_f16_m_tied2, svfloat16_t, ++ z0 = svnmsb_f16_m (p0, z1, z0, z2), ++ z0 = svnmsb_m (p0, z1, z0, z2)) ++ ++/* ++** nmsb_f16_m_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fnmsb z0\.h, p0/m, z2\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_f16_m_tied3, svfloat16_t, ++ z0 = svnmsb_f16_m (p0, z1, z2, z0), ++ z0 = svnmsb_m (p0, z1, z2, z0)) ++ ++/* ++** nmsb_f16_m_untied: ++** movprfx z0, z1 ++** fnmsb z0\.h, p0/m, z2\.h, z3\.h ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_f16_m_untied, svfloat16_t, ++ z0 = svnmsb_f16_m (p0, z1, z2, z3), ++ z0 = svnmsb_m (p0, z1, z2, z3)) ++ ++/* ++** nmsb_h4_f16_m_tied1: ++** mov (z[0-9]+\.h), h4 ++** fnmsb z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmsb_h4_f16_m_tied1, svfloat16_t, __fp16, ++ z0 = svnmsb_n_f16_m (p0, z0, z1, d4), ++ z0 = svnmsb_m (p0, z0, z1, d4)) ++ ++/* ++** nmsb_h4_f16_m_untied: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0, z1 ++** fnmsb z0\.h, p0/m, z2\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmsb_h4_f16_m_untied, svfloat16_t, __fp16, ++ z0 = svnmsb_n_f16_m (p0, z1, z2, d4), ++ z0 = svnmsb_m (p0, z1, z2, d4)) ++ ++/* ++** nmsb_2_f16_m_tied1: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** fnmsb z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_2_f16_m_tied1, svfloat16_t, ++ z0 = svnmsb_n_f16_m (p0, z0, z1, 2), ++ z0 = svnmsb_m (p0, z0, z1, 2)) ++ ++/* ++** nmsb_2_f16_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? 
++** movprfx z0, z1 ++** fnmsb z0\.h, p0/m, z2\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_2_f16_m_untied, svfloat16_t, ++ z0 = svnmsb_n_f16_m (p0, z1, z2, 2), ++ z0 = svnmsb_m (p0, z1, z2, 2)) ++ ++/* ++** nmsb_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fnmsb z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_f16_z_tied1, svfloat16_t, ++ z0 = svnmsb_f16_z (p0, z0, z1, z2), ++ z0 = svnmsb_z (p0, z0, z1, z2)) ++ ++/* ++** nmsb_f16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** fnmsb z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_f16_z_tied2, svfloat16_t, ++ z0 = svnmsb_f16_z (p0, z1, z0, z2), ++ z0 = svnmsb_z (p0, z1, z0, z2)) ++ ++/* ++** nmsb_f16_z_tied3: ++** movprfx z0\.h, p0/z, z0\.h ++** fnmls z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_f16_z_tied3, svfloat16_t, ++ z0 = svnmsb_f16_z (p0, z1, z2, z0), ++ z0 = svnmsb_z (p0, z1, z2, z0)) ++ ++/* ++** nmsb_f16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fnmsb z0\.h, p0/m, z2\.h, z3\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** fnmsb z0\.h, p0/m, z1\.h, z3\.h ++** | ++** movprfx z0\.h, p0/z, z3\.h ++** fnmls z0\.h, p0/m, z1\.h, z2\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_f16_z_untied, svfloat16_t, ++ z0 = svnmsb_f16_z (p0, z1, z2, z3), ++ z0 = svnmsb_z (p0, z1, z2, z3)) ++ ++/* ++** nmsb_h4_f16_z_tied1: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0\.h, p0/z, z0\.h ++** fnmsb z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmsb_h4_f16_z_tied1, svfloat16_t, __fp16, ++ z0 = svnmsb_n_f16_z (p0, z0, z1, d4), ++ z0 = svnmsb_z (p0, z0, z1, d4)) ++ ++/* ++** nmsb_h4_f16_z_tied2: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0\.h, p0/z, z0\.h ++** fnmsb z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmsb_h4_f16_z_tied2, svfloat16_t, __fp16, ++ z0 = svnmsb_n_f16_z (p0, z1, z0, d4), ++ z0 = svnmsb_z (p0, z1, z0, d4)) ++ ++/* ++** nmsb_h4_f16_z_untied: ++** mov (z[0-9]+\.h), h4 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fnmsb z0\.h, p0/m, z2\.h, \1 ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** fnmsb z0\.h, p0/m, z1\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** fnmls z0\.h, p0/m, z1\.h, z2\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (nmsb_h4_f16_z_untied, svfloat16_t, __fp16, ++ z0 = svnmsb_n_f16_z (p0, z1, z2, d4), ++ z0 = svnmsb_z (p0, z1, z2, d4)) ++ ++/* ++** nmsb_2_f16_z_tied1: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** movprfx z0\.h, p0/z, z0\.h ++** fnmsb z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_2_f16_z_tied1, svfloat16_t, ++ z0 = svnmsb_n_f16_z (p0, z0, z1, 2), ++ z0 = svnmsb_z (p0, z0, z1, 2)) ++ ++/* ++** nmsb_2_f16_z_tied2: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** movprfx z0\.h, p0/z, z0\.h ++** fnmsb z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_2_f16_z_tied2, svfloat16_t, ++ z0 = svnmsb_n_f16_z (p0, z1, z0, 2), ++ z0 = svnmsb_z (p0, z1, z0, 2)) ++ ++/* ++** nmsb_2_f16_z_untied: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? 
++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fnmsb z0\.h, p0/m, z2\.h, \1 ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** fnmsb z0\.h, p0/m, z1\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** fnmls z0\.h, p0/m, z1\.h, z2\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_2_f16_z_untied, svfloat16_t, ++ z0 = svnmsb_n_f16_z (p0, z1, z2, 2), ++ z0 = svnmsb_z (p0, z1, z2, 2)) ++ ++/* ++** nmsb_f16_x_tied1: ++** fnmsb z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_f16_x_tied1, svfloat16_t, ++ z0 = svnmsb_f16_x (p0, z0, z1, z2), ++ z0 = svnmsb_x (p0, z0, z1, z2)) ++ ++/* ++** nmsb_f16_x_tied2: ++** fnmsb z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_f16_x_tied2, svfloat16_t, ++ z0 = svnmsb_f16_x (p0, z1, z0, z2), ++ z0 = svnmsb_x (p0, z1, z0, z2)) ++ ++/* ++** nmsb_f16_x_tied3: ++** fnmls z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_f16_x_tied3, svfloat16_t, ++ z0 = svnmsb_f16_x (p0, z1, z2, z0), ++ z0 = svnmsb_x (p0, z1, z2, z0)) ++ ++/* ++** nmsb_f16_x_untied: ++** ( ++** movprfx z0, z1 ++** fnmsb z0\.h, p0/m, z2\.h, z3\.h ++** | ++** movprfx z0, z2 ++** fnmsb z0\.h, p0/m, z1\.h, z3\.h ++** | ++** movprfx z0, z3 ++** fnmls z0\.h, p0/m, z1\.h, z2\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_f16_x_untied, svfloat16_t, ++ z0 = svnmsb_f16_x (p0, z1, z2, z3), ++ z0 = svnmsb_x (p0, z1, z2, z3)) ++ ++/* ++** nmsb_h4_f16_x_tied1: ++** mov (z[0-9]+\.h), h4 ++** fnmsb z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmsb_h4_f16_x_tied1, svfloat16_t, __fp16, ++ z0 = svnmsb_n_f16_x (p0, z0, z1, d4), ++ z0 = svnmsb_x (p0, z0, z1, d4)) ++ ++/* ++** nmsb_h4_f16_x_tied2: ++** mov (z[0-9]+\.h), h4 ++** fnmsb z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmsb_h4_f16_x_tied2, svfloat16_t, __fp16, ++ z0 = svnmsb_n_f16_x (p0, z1, z0, d4), ++ z0 = svnmsb_x (p0, z1, z0, d4)) ++ ++/* ++** nmsb_h4_f16_x_untied: { xfail *-*-* } ++** mov z0\.h, h4 ++** fnmls z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_ZD (nmsb_h4_f16_x_untied, svfloat16_t, __fp16, ++ z0 = svnmsb_n_f16_x (p0, z1, z2, d4), ++ z0 = svnmsb_x (p0, z1, z2, d4)) ++ ++/* ++** nmsb_2_f16_x_tied1: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** fnmsb z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_2_f16_x_tied1, svfloat16_t, ++ z0 = svnmsb_n_f16_x (p0, z0, z1, 2), ++ z0 = svnmsb_x (p0, z0, z1, 2)) ++ ++/* ++** nmsb_2_f16_x_tied2: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** fnmsb z0\.h, p0/m, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_2_f16_x_tied2, svfloat16_t, ++ z0 = svnmsb_n_f16_x (p0, z1, z0, 2), ++ z0 = svnmsb_x (p0, z1, z0, 2)) ++ ++/* ++** nmsb_2_f16_x_untied: ++** fmov z0\.h, #2\.0(?:e\+0)? ++** fnmls z0\.h, p0/m, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_2_f16_x_untied, svfloat16_t, ++ z0 = svnmsb_n_f16_x (p0, z1, z2, 2), ++ z0 = svnmsb_x (p0, z1, z2, 2)) ++ ++/* ++** ptrue_nmsb_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmsb_f16_x_tied1, svfloat16_t, ++ z0 = svnmsb_f16_x (svptrue_b16 (), z0, z1, z2), ++ z0 = svnmsb_x (svptrue_b16 (), z0, z1, z2)) ++ ++/* ++** ptrue_nmsb_f16_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmsb_f16_x_tied2, svfloat16_t, ++ z0 = svnmsb_f16_x (svptrue_b16 (), z1, z0, z2), ++ z0 = svnmsb_x (svptrue_b16 (), z1, z0, z2)) ++ ++/* ++** ptrue_nmsb_f16_x_tied3: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmsb_f16_x_tied3, svfloat16_t, ++ z0 = svnmsb_f16_x (svptrue_b16 (), z1, z2, z0), ++ z0 = svnmsb_x (svptrue_b16 (), z1, z2, z0)) ++ ++/* ++** ptrue_nmsb_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmsb_f16_x_untied, svfloat16_t, ++ z0 = svnmsb_f16_x (svptrue_b16 (), z1, z2, z3), ++ z0 = svnmsb_x (svptrue_b16 (), z1, z2, z3)) ++ ++/* ++** ptrue_nmsb_2_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmsb_2_f16_x_tied1, svfloat16_t, ++ z0 = svnmsb_n_f16_x (svptrue_b16 (), z0, z1, 2), ++ z0 = svnmsb_x (svptrue_b16 (), z0, z1, 2)) ++ ++/* ++** ptrue_nmsb_2_f16_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmsb_2_f16_x_tied2, svfloat16_t, ++ z0 = svnmsb_n_f16_x (svptrue_b16 (), z1, z0, 2), ++ z0 = svnmsb_x (svptrue_b16 (), z1, z0, 2)) ++ ++/* ++** ptrue_nmsb_2_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmsb_2_f16_x_untied, svfloat16_t, ++ z0 = svnmsb_n_f16_x (svptrue_b16 (), z1, z2, 2), ++ z0 = svnmsb_x (svptrue_b16 (), z1, z2, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmsb_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmsb_f32.c +new file mode 100644 +index 000000000..c2204e040 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmsb_f32.c +@@ -0,0 +1,398 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** nmsb_f32_m_tied1: ++** fnmsb z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_f32_m_tied1, svfloat32_t, ++ z0 = svnmsb_f32_m (p0, z0, z1, z2), ++ z0 = svnmsb_m (p0, z0, z1, z2)) ++ ++/* ++** nmsb_f32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fnmsb z0\.s, p0/m, \1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_f32_m_tied2, svfloat32_t, ++ z0 = svnmsb_f32_m (p0, z1, z0, z2), ++ z0 = svnmsb_m (p0, z1, z0, z2)) ++ ++/* ++** nmsb_f32_m_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fnmsb z0\.s, p0/m, z2\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_f32_m_tied3, svfloat32_t, ++ z0 = svnmsb_f32_m (p0, z1, z2, z0), ++ z0 = svnmsb_m (p0, z1, z2, z0)) ++ ++/* ++** nmsb_f32_m_untied: ++** movprfx z0, z1 ++** fnmsb z0\.s, p0/m, z2\.s, z3\.s ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_f32_m_untied, svfloat32_t, ++ z0 = svnmsb_f32_m (p0, z1, z2, z3), ++ z0 = svnmsb_m (p0, z1, z2, z3)) ++ ++/* ++** nmsb_s4_f32_m_tied1: ++** mov (z[0-9]+\.s), s4 ++** fnmsb z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmsb_s4_f32_m_tied1, svfloat32_t, float, ++ z0 = svnmsb_n_f32_m (p0, z0, z1, d4), ++ z0 = svnmsb_m (p0, z0, z1, d4)) ++ ++/* ++** nmsb_s4_f32_m_untied: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0, z1 ++** fnmsb z0\.s, p0/m, z2\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmsb_s4_f32_m_untied, svfloat32_t, float, ++ z0 = svnmsb_n_f32_m (p0, z1, z2, d4), ++ z0 = svnmsb_m (p0, z1, z2, d4)) ++ ++/* ++** nmsb_2_f32_m_tied1: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** fnmsb z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_2_f32_m_tied1, svfloat32_t, ++ z0 = svnmsb_n_f32_m (p0, z0, z1, 2), ++ z0 = svnmsb_m (p0, z0, z1, 2)) ++ ++/* ++** nmsb_2_f32_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
++** movprfx z0, z1 ++** fnmsb z0\.s, p0/m, z2\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_2_f32_m_untied, svfloat32_t, ++ z0 = svnmsb_n_f32_m (p0, z1, z2, 2), ++ z0 = svnmsb_m (p0, z1, z2, 2)) ++ ++/* ++** nmsb_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fnmsb z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_f32_z_tied1, svfloat32_t, ++ z0 = svnmsb_f32_z (p0, z0, z1, z2), ++ z0 = svnmsb_z (p0, z0, z1, z2)) ++ ++/* ++** nmsb_f32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** fnmsb z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_f32_z_tied2, svfloat32_t, ++ z0 = svnmsb_f32_z (p0, z1, z0, z2), ++ z0 = svnmsb_z (p0, z1, z0, z2)) ++ ++/* ++** nmsb_f32_z_tied3: ++** movprfx z0\.s, p0/z, z0\.s ++** fnmls z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_f32_z_tied3, svfloat32_t, ++ z0 = svnmsb_f32_z (p0, z1, z2, z0), ++ z0 = svnmsb_z (p0, z1, z2, z0)) ++ ++/* ++** nmsb_f32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fnmsb z0\.s, p0/m, z2\.s, z3\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** fnmsb z0\.s, p0/m, z1\.s, z3\.s ++** | ++** movprfx z0\.s, p0/z, z3\.s ++** fnmls z0\.s, p0/m, z1\.s, z2\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_f32_z_untied, svfloat32_t, ++ z0 = svnmsb_f32_z (p0, z1, z2, z3), ++ z0 = svnmsb_z (p0, z1, z2, z3)) ++ ++/* ++** nmsb_s4_f32_z_tied1: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0\.s, p0/z, z0\.s ++** fnmsb z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmsb_s4_f32_z_tied1, svfloat32_t, float, ++ z0 = svnmsb_n_f32_z (p0, z0, z1, d4), ++ z0 = svnmsb_z (p0, z0, z1, d4)) ++ ++/* ++** nmsb_s4_f32_z_tied2: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0\.s, p0/z, z0\.s ++** fnmsb z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmsb_s4_f32_z_tied2, svfloat32_t, float, ++ z0 = svnmsb_n_f32_z (p0, z1, z0, d4), ++ z0 = svnmsb_z (p0, z1, z0, d4)) ++ ++/* ++** nmsb_s4_f32_z_untied: ++** mov (z[0-9]+\.s), s4 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fnmsb z0\.s, p0/m, z2\.s, \1 ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** fnmsb z0\.s, p0/m, z1\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** fnmls z0\.s, p0/m, z1\.s, z2\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (nmsb_s4_f32_z_untied, svfloat32_t, float, ++ z0 = svnmsb_n_f32_z (p0, z1, z2, d4), ++ z0 = svnmsb_z (p0, z1, z2, d4)) ++ ++/* ++** nmsb_2_f32_z_tied1: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** movprfx z0\.s, p0/z, z0\.s ++** fnmsb z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_2_f32_z_tied1, svfloat32_t, ++ z0 = svnmsb_n_f32_z (p0, z0, z1, 2), ++ z0 = svnmsb_z (p0, z0, z1, 2)) ++ ++/* ++** nmsb_2_f32_z_tied2: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** movprfx z0\.s, p0/z, z0\.s ++** fnmsb z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_2_f32_z_tied2, svfloat32_t, ++ z0 = svnmsb_n_f32_z (p0, z1, z0, 2), ++ z0 = svnmsb_z (p0, z1, z0, 2)) ++ ++/* ++** nmsb_2_f32_z_untied: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fnmsb z0\.s, p0/m, z2\.s, \1 ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** fnmsb z0\.s, p0/m, z1\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** fnmls z0\.s, p0/m, z1\.s, z2\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_2_f32_z_untied, svfloat32_t, ++ z0 = svnmsb_n_f32_z (p0, z1, z2, 2), ++ z0 = svnmsb_z (p0, z1, z2, 2)) ++ ++/* ++** nmsb_f32_x_tied1: ++** fnmsb z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_f32_x_tied1, svfloat32_t, ++ z0 = svnmsb_f32_x (p0, z0, z1, z2), ++ z0 = svnmsb_x (p0, z0, z1, z2)) ++ ++/* ++** nmsb_f32_x_tied2: ++** fnmsb z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_f32_x_tied2, svfloat32_t, ++ z0 = svnmsb_f32_x (p0, z1, z0, z2), ++ z0 = svnmsb_x (p0, z1, z0, z2)) ++ ++/* ++** nmsb_f32_x_tied3: ++** fnmls z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_f32_x_tied3, svfloat32_t, ++ z0 = svnmsb_f32_x (p0, z1, z2, z0), ++ z0 = svnmsb_x (p0, z1, z2, z0)) ++ ++/* ++** nmsb_f32_x_untied: ++** ( ++** movprfx z0, z1 ++** fnmsb z0\.s, p0/m, z2\.s, z3\.s ++** | ++** movprfx z0, z2 ++** fnmsb z0\.s, p0/m, z1\.s, z3\.s ++** | ++** movprfx z0, z3 ++** fnmls z0\.s, p0/m, z1\.s, z2\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_f32_x_untied, svfloat32_t, ++ z0 = svnmsb_f32_x (p0, z1, z2, z3), ++ z0 = svnmsb_x (p0, z1, z2, z3)) ++ ++/* ++** nmsb_s4_f32_x_tied1: ++** mov (z[0-9]+\.s), s4 ++** fnmsb z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmsb_s4_f32_x_tied1, svfloat32_t, float, ++ z0 = svnmsb_n_f32_x (p0, z0, z1, d4), ++ z0 = svnmsb_x (p0, z0, z1, d4)) ++ ++/* ++** nmsb_s4_f32_x_tied2: ++** mov (z[0-9]+\.s), s4 ++** fnmsb z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmsb_s4_f32_x_tied2, svfloat32_t, float, ++ z0 = svnmsb_n_f32_x (p0, z1, z0, d4), ++ z0 = svnmsb_x (p0, z1, z0, d4)) ++ ++/* ++** nmsb_s4_f32_x_untied: { xfail *-*-* } ++** mov z0\.s, s4 ++** fnmls z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_ZD (nmsb_s4_f32_x_untied, svfloat32_t, float, ++ z0 = svnmsb_n_f32_x (p0, z1, z2, d4), ++ z0 = svnmsb_x (p0, z1, z2, d4)) ++ ++/* ++** nmsb_2_f32_x_tied1: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** fnmsb z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_2_f32_x_tied1, svfloat32_t, ++ z0 = svnmsb_n_f32_x (p0, z0, z1, 2), ++ z0 = svnmsb_x (p0, z0, z1, 2)) ++ ++/* ++** nmsb_2_f32_x_tied2: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? ++** fnmsb z0\.s, p0/m, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_2_f32_x_tied2, svfloat32_t, ++ z0 = svnmsb_n_f32_x (p0, z1, z0, 2), ++ z0 = svnmsb_x (p0, z1, z0, 2)) ++ ++/* ++** nmsb_2_f32_x_untied: ++** fmov z0\.s, #2\.0(?:e\+0)? ++** fnmls z0\.s, p0/m, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_2_f32_x_untied, svfloat32_t, ++ z0 = svnmsb_n_f32_x (p0, z1, z2, 2), ++ z0 = svnmsb_x (p0, z1, z2, 2)) ++ ++/* ++** ptrue_nmsb_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmsb_f32_x_tied1, svfloat32_t, ++ z0 = svnmsb_f32_x (svptrue_b32 (), z0, z1, z2), ++ z0 = svnmsb_x (svptrue_b32 (), z0, z1, z2)) ++ ++/* ++** ptrue_nmsb_f32_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmsb_f32_x_tied2, svfloat32_t, ++ z0 = svnmsb_f32_x (svptrue_b32 (), z1, z0, z2), ++ z0 = svnmsb_x (svptrue_b32 (), z1, z0, z2)) ++ ++/* ++** ptrue_nmsb_f32_x_tied3: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmsb_f32_x_tied3, svfloat32_t, ++ z0 = svnmsb_f32_x (svptrue_b32 (), z1, z2, z0), ++ z0 = svnmsb_x (svptrue_b32 (), z1, z2, z0)) ++ ++/* ++** ptrue_nmsb_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmsb_f32_x_untied, svfloat32_t, ++ z0 = svnmsb_f32_x (svptrue_b32 (), z1, z2, z3), ++ z0 = svnmsb_x (svptrue_b32 (), z1, z2, z3)) ++ ++/* ++** ptrue_nmsb_2_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmsb_2_f32_x_tied1, svfloat32_t, ++ z0 = svnmsb_n_f32_x (svptrue_b32 (), z0, z1, 2), ++ z0 = svnmsb_x (svptrue_b32 (), z0, z1, 2)) ++ ++/* ++** ptrue_nmsb_2_f32_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmsb_2_f32_x_tied2, svfloat32_t, ++ z0 = svnmsb_n_f32_x (svptrue_b32 (), z1, z0, 2), ++ z0 = svnmsb_x (svptrue_b32 (), z1, z0, 2)) ++ ++/* ++** ptrue_nmsb_2_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmsb_2_f32_x_untied, svfloat32_t, ++ z0 = svnmsb_n_f32_x (svptrue_b32 (), z1, z2, 2), ++ z0 = svnmsb_x (svptrue_b32 (), z1, z2, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmsb_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmsb_f64.c +new file mode 100644 +index 000000000..56592d3ae +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmsb_f64.c +@@ -0,0 +1,398 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** nmsb_f64_m_tied1: ++** fnmsb z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_f64_m_tied1, svfloat64_t, ++ z0 = svnmsb_f64_m (p0, z0, z1, z2), ++ z0 = svnmsb_m (p0, z0, z1, z2)) ++ ++/* ++** nmsb_f64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fnmsb z0\.d, p0/m, \1, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_f64_m_tied2, svfloat64_t, ++ z0 = svnmsb_f64_m (p0, z1, z0, z2), ++ z0 = svnmsb_m (p0, z1, z0, z2)) ++ ++/* ++** nmsb_f64_m_tied3: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fnmsb z0\.d, p0/m, z2\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_f64_m_tied3, svfloat64_t, ++ z0 = svnmsb_f64_m (p0, z1, z2, z0), ++ z0 = svnmsb_m (p0, z1, z2, z0)) ++ ++/* ++** nmsb_f64_m_untied: ++** movprfx z0, z1 ++** fnmsb z0\.d, p0/m, z2\.d, z3\.d ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_f64_m_untied, svfloat64_t, ++ z0 = svnmsb_f64_m (p0, z1, z2, z3), ++ z0 = svnmsb_m (p0, z1, z2, z3)) ++ ++/* ++** nmsb_d4_f64_m_tied1: ++** mov (z[0-9]+\.d), d4 ++** fnmsb z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmsb_d4_f64_m_tied1, svfloat64_t, double, ++ z0 = svnmsb_n_f64_m (p0, z0, z1, d4), ++ z0 = svnmsb_m (p0, z0, z1, d4)) ++ ++/* ++** nmsb_d4_f64_m_untied: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0, z1 ++** fnmsb z0\.d, p0/m, z2\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmsb_d4_f64_m_untied, svfloat64_t, double, ++ z0 = svnmsb_n_f64_m (p0, z1, z2, d4), ++ z0 = svnmsb_m (p0, z1, z2, d4)) ++ ++/* ++** nmsb_2_f64_m_tied1: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** fnmsb z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_2_f64_m_tied1, svfloat64_t, ++ z0 = svnmsb_n_f64_m (p0, z0, z1, 2), ++ z0 = svnmsb_m (p0, z0, z1, 2)) ++ ++/* ++** nmsb_2_f64_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
++** movprfx z0, z1 ++** fnmsb z0\.d, p0/m, z2\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_2_f64_m_untied, svfloat64_t, ++ z0 = svnmsb_n_f64_m (p0, z1, z2, 2), ++ z0 = svnmsb_m (p0, z1, z2, 2)) ++ ++/* ++** nmsb_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fnmsb z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_f64_z_tied1, svfloat64_t, ++ z0 = svnmsb_f64_z (p0, z0, z1, z2), ++ z0 = svnmsb_z (p0, z0, z1, z2)) ++ ++/* ++** nmsb_f64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** fnmsb z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_f64_z_tied2, svfloat64_t, ++ z0 = svnmsb_f64_z (p0, z1, z0, z2), ++ z0 = svnmsb_z (p0, z1, z0, z2)) ++ ++/* ++** nmsb_f64_z_tied3: ++** movprfx z0\.d, p0/z, z0\.d ++** fnmls z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_f64_z_tied3, svfloat64_t, ++ z0 = svnmsb_f64_z (p0, z1, z2, z0), ++ z0 = svnmsb_z (p0, z1, z2, z0)) ++ ++/* ++** nmsb_f64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fnmsb z0\.d, p0/m, z2\.d, z3\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** fnmsb z0\.d, p0/m, z1\.d, z3\.d ++** | ++** movprfx z0\.d, p0/z, z3\.d ++** fnmls z0\.d, p0/m, z1\.d, z2\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_f64_z_untied, svfloat64_t, ++ z0 = svnmsb_f64_z (p0, z1, z2, z3), ++ z0 = svnmsb_z (p0, z1, z2, z3)) ++ ++/* ++** nmsb_d4_f64_z_tied1: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0\.d, p0/z, z0\.d ++** fnmsb z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmsb_d4_f64_z_tied1, svfloat64_t, double, ++ z0 = svnmsb_n_f64_z (p0, z0, z1, d4), ++ z0 = svnmsb_z (p0, z0, z1, d4)) ++ ++/* ++** nmsb_d4_f64_z_tied2: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0\.d, p0/z, z0\.d ++** fnmsb z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmsb_d4_f64_z_tied2, svfloat64_t, double, ++ z0 = svnmsb_n_f64_z (p0, z1, z0, d4), ++ z0 = svnmsb_z (p0, z1, z0, d4)) ++ ++/* ++** nmsb_d4_f64_z_untied: ++** mov (z[0-9]+\.d), d4 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fnmsb z0\.d, p0/m, z2\.d, \1 ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** fnmsb z0\.d, p0/m, z1\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** fnmls z0\.d, p0/m, z1\.d, z2\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (nmsb_d4_f64_z_untied, svfloat64_t, double, ++ z0 = svnmsb_n_f64_z (p0, z1, z2, d4), ++ z0 = svnmsb_z (p0, z1, z2, d4)) ++ ++/* ++** nmsb_2_f64_z_tied1: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** movprfx z0\.d, p0/z, z0\.d ++** fnmsb z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_2_f64_z_tied1, svfloat64_t, ++ z0 = svnmsb_n_f64_z (p0, z0, z1, 2), ++ z0 = svnmsb_z (p0, z0, z1, 2)) ++ ++/* ++** nmsb_2_f64_z_tied2: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** movprfx z0\.d, p0/z, z0\.d ++** fnmsb z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_2_f64_z_tied2, svfloat64_t, ++ z0 = svnmsb_n_f64_z (p0, z1, z0, 2), ++ z0 = svnmsb_z (p0, z1, z0, 2)) ++ ++/* ++** nmsb_2_f64_z_untied: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fnmsb z0\.d, p0/m, z2\.d, \1 ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** fnmsb z0\.d, p0/m, z1\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** fnmls z0\.d, p0/m, z1\.d, z2\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_2_f64_z_untied, svfloat64_t, ++ z0 = svnmsb_n_f64_z (p0, z1, z2, 2), ++ z0 = svnmsb_z (p0, z1, z2, 2)) ++ ++/* ++** nmsb_f64_x_tied1: ++** fnmsb z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_f64_x_tied1, svfloat64_t, ++ z0 = svnmsb_f64_x (p0, z0, z1, z2), ++ z0 = svnmsb_x (p0, z0, z1, z2)) ++ ++/* ++** nmsb_f64_x_tied2: ++** fnmsb z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_f64_x_tied2, svfloat64_t, ++ z0 = svnmsb_f64_x (p0, z1, z0, z2), ++ z0 = svnmsb_x (p0, z1, z0, z2)) ++ ++/* ++** nmsb_f64_x_tied3: ++** fnmls z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_f64_x_tied3, svfloat64_t, ++ z0 = svnmsb_f64_x (p0, z1, z2, z0), ++ z0 = svnmsb_x (p0, z1, z2, z0)) ++ ++/* ++** nmsb_f64_x_untied: ++** ( ++** movprfx z0, z1 ++** fnmsb z0\.d, p0/m, z2\.d, z3\.d ++** | ++** movprfx z0, z2 ++** fnmsb z0\.d, p0/m, z1\.d, z3\.d ++** | ++** movprfx z0, z3 ++** fnmls z0\.d, p0/m, z1\.d, z2\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_f64_x_untied, svfloat64_t, ++ z0 = svnmsb_f64_x (p0, z1, z2, z3), ++ z0 = svnmsb_x (p0, z1, z2, z3)) ++ ++/* ++** nmsb_d4_f64_x_tied1: ++** mov (z[0-9]+\.d), d4 ++** fnmsb z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmsb_d4_f64_x_tied1, svfloat64_t, double, ++ z0 = svnmsb_n_f64_x (p0, z0, z1, d4), ++ z0 = svnmsb_x (p0, z0, z1, d4)) ++ ++/* ++** nmsb_d4_f64_x_tied2: ++** mov (z[0-9]+\.d), d4 ++** fnmsb z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (nmsb_d4_f64_x_tied2, svfloat64_t, double, ++ z0 = svnmsb_n_f64_x (p0, z1, z0, d4), ++ z0 = svnmsb_x (p0, z1, z0, d4)) ++ ++/* ++** nmsb_d4_f64_x_untied: { xfail *-*-* } ++** mov z0\.d, d4 ++** fnmls z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_ZD (nmsb_d4_f64_x_untied, svfloat64_t, double, ++ z0 = svnmsb_n_f64_x (p0, z1, z2, d4), ++ z0 = svnmsb_x (p0, z1, z2, d4)) ++ ++/* ++** nmsb_2_f64_x_tied1: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** fnmsb z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_2_f64_x_tied1, svfloat64_t, ++ z0 = svnmsb_n_f64_x (p0, z0, z1, 2), ++ z0 = svnmsb_x (p0, z0, z1, 2)) ++ ++/* ++** nmsb_2_f64_x_tied2: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? ++** fnmsb z0\.d, p0/m, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_2_f64_x_tied2, svfloat64_t, ++ z0 = svnmsb_n_f64_x (p0, z1, z0, 2), ++ z0 = svnmsb_x (p0, z1, z0, 2)) ++ ++/* ++** nmsb_2_f64_x_untied: ++** fmov z0\.d, #2\.0(?:e\+0)? ++** fnmls z0\.d, p0/m, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (nmsb_2_f64_x_untied, svfloat64_t, ++ z0 = svnmsb_n_f64_x (p0, z1, z2, 2), ++ z0 = svnmsb_x (p0, z1, z2, 2)) ++ ++/* ++** ptrue_nmsb_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmsb_f64_x_tied1, svfloat64_t, ++ z0 = svnmsb_f64_x (svptrue_b64 (), z0, z1, z2), ++ z0 = svnmsb_x (svptrue_b64 (), z0, z1, z2)) ++ ++/* ++** ptrue_nmsb_f64_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmsb_f64_x_tied2, svfloat64_t, ++ z0 = svnmsb_f64_x (svptrue_b64 (), z1, z0, z2), ++ z0 = svnmsb_x (svptrue_b64 (), z1, z0, z2)) ++ ++/* ++** ptrue_nmsb_f64_x_tied3: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmsb_f64_x_tied3, svfloat64_t, ++ z0 = svnmsb_f64_x (svptrue_b64 (), z1, z2, z0), ++ z0 = svnmsb_x (svptrue_b64 (), z1, z2, z0)) ++ ++/* ++** ptrue_nmsb_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmsb_f64_x_untied, svfloat64_t, ++ z0 = svnmsb_f64_x (svptrue_b64 (), z1, z2, z3), ++ z0 = svnmsb_x (svptrue_b64 (), z1, z2, z3)) ++ ++/* ++** ptrue_nmsb_2_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmsb_2_f64_x_tied1, svfloat64_t, ++ z0 = svnmsb_n_f64_x (svptrue_b64 (), z0, z1, 2), ++ z0 = svnmsb_x (svptrue_b64 (), z0, z1, 2)) ++ ++/* ++** ptrue_nmsb_2_f64_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmsb_2_f64_x_tied2, svfloat64_t, ++ z0 = svnmsb_n_f64_x (svptrue_b64 (), z1, z0, 2), ++ z0 = svnmsb_x (svptrue_b64 (), z1, z0, 2)) ++ ++/* ++** ptrue_nmsb_2_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_nmsb_2_f64_x_untied, svfloat64_t, ++ z0 = svnmsb_n_f64_x (svptrue_b64 (), z1, z2, 2), ++ z0 = svnmsb_x (svptrue_b64 (), z1, z2, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nor_b.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nor_b.c +new file mode 100644 +index 000000000..997e34537 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nor_b.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** nor_b_z_tied1: ++** nor p0\.b, p3/z, p0\.b, p1\.b ++** ret ++*/ ++TEST_UNIFORM_P (nor_b_z_tied1, ++ p0 = svnor_b_z (p3, p0, p1), ++ p0 = svnor_z (p3, p0, p1)) ++ ++/* ++** nor_b_z_tied2: ++** nor p0\.b, p3/z, p1\.b, p0\.b ++** ret ++*/ ++TEST_UNIFORM_P (nor_b_z_tied2, ++ p0 = svnor_b_z (p3, p1, p0), ++ p0 = svnor_z (p3, p1, p0)) ++ ++/* ++** nor_b_z_untied: ++** nor p0\.b, p3/z, p1\.b, p2\.b ++** ret ++*/ ++TEST_UNIFORM_P (nor_b_z_untied, ++ p0 = svnor_b_z (p3, p1, p2), ++ p0 = svnor_z (p3, p1, p2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_b.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_b.c +new file mode 100644 +index 000000000..23a3a6aae +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_b.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** not_b_z_tied1: ++** not p0\.b, p3/z, p0\.b ++** ret ++*/ ++TEST_UNIFORM_P (not_b_z_tied1, ++ p0 = svnot_b_z (p3, p0), ++ p0 = svnot_z (p3, p0)) ++ ++/* ++** not_b_z_untied: ++** not p0\.b, p3/z, p1\.b ++** ret ++*/ ++TEST_UNIFORM_P (not_b_z_untied, ++ p0 = svnot_b_z (p3, p1), ++ p0 = svnot_z (p3, p1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_s16.c +new file mode 100644 +index 000000000..bacd6b12c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_s16.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** not_s16_m_tied12: ++** not z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (not_s16_m_tied12, svint16_t, ++ z0 = svnot_s16_m (z0, p0, z0), ++ z0 = svnot_m (z0, p0, z0)) ++ ++/* ++** not_s16_m_tied1: ++** not z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (not_s16_m_tied1, svint16_t, ++ z0 = svnot_s16_m (z0, p0, z1), ++ z0 = svnot_m (z0, p0, z1)) ++ ++/* ++** not_s16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d 
++** movprfx z0, z1 ++** not z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (not_s16_m_tied2, svint16_t, ++ z0 = svnot_s16_m (z1, p0, z0), ++ z0 = svnot_m (z1, p0, z0)) ++ ++/* ++** not_s16_m_untied: ++** movprfx z0, z2 ++** not z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (not_s16_m_untied, svint16_t, ++ z0 = svnot_s16_m (z2, p0, z1), ++ z0 = svnot_m (z2, p0, z1)) ++ ++/* ++** not_s16_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.h, p0/z, \1\.h ++** not z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (not_s16_z_tied1, svint16_t, ++ z0 = svnot_s16_z (p0, z0), ++ z0 = svnot_z (p0, z0)) ++ ++/* ++** not_s16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** not z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (not_s16_z_untied, svint16_t, ++ z0 = svnot_s16_z (p0, z1), ++ z0 = svnot_z (p0, z1)) ++ ++/* ++** not_s16_x_tied1: ++** not z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (not_s16_x_tied1, svint16_t, ++ z0 = svnot_s16_x (p0, z0), ++ z0 = svnot_x (p0, z0)) ++ ++/* ++** not_s16_x_untied: ++** not z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (not_s16_x_untied, svint16_t, ++ z0 = svnot_s16_x (p0, z1), ++ z0 = svnot_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_s32.c +new file mode 100644 +index 000000000..8b15d6e91 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_s32.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** not_s32_m_tied12: ++** not z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (not_s32_m_tied12, svint32_t, ++ z0 = svnot_s32_m (z0, p0, z0), ++ z0 = svnot_m (z0, p0, z0)) ++ ++/* ++** not_s32_m_tied1: ++** not z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (not_s32_m_tied1, svint32_t, ++ z0 = svnot_s32_m (z0, p0, z1), ++ z0 = svnot_m (z0, p0, z1)) ++ ++/* ++** not_s32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** not z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (not_s32_m_tied2, svint32_t, ++ z0 = svnot_s32_m (z1, p0, z0), ++ z0 = svnot_m (z1, p0, z0)) ++ ++/* ++** not_s32_m_untied: ++** movprfx z0, z2 ++** not z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (not_s32_m_untied, svint32_t, ++ z0 = svnot_s32_m (z2, p0, z1), ++ z0 = svnot_m (z2, p0, z1)) ++ ++/* ++** not_s32_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, \1\.s ++** not z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (not_s32_z_tied1, svint32_t, ++ z0 = svnot_s32_z (p0, z0), ++ z0 = svnot_z (p0, z0)) ++ ++/* ++** not_s32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** not z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (not_s32_z_untied, svint32_t, ++ z0 = svnot_s32_z (p0, z1), ++ z0 = svnot_z (p0, z1)) ++ ++/* ++** not_s32_x_tied1: ++** not z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (not_s32_x_tied1, svint32_t, ++ z0 = svnot_s32_x (p0, z0), ++ z0 = svnot_x (p0, z0)) ++ ++/* ++** not_s32_x_untied: ++** not z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (not_s32_x_untied, svint32_t, ++ z0 = svnot_s32_x (p0, z1), ++ z0 = svnot_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_s64.c +new file mode 100644 +index 000000000..8e7f7b9e8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_s64.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** not_s64_m_tied12: ++** not 
z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (not_s64_m_tied12, svint64_t, ++ z0 = svnot_s64_m (z0, p0, z0), ++ z0 = svnot_m (z0, p0, z0)) ++ ++/* ++** not_s64_m_tied1: ++** not z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (not_s64_m_tied1, svint64_t, ++ z0 = svnot_s64_m (z0, p0, z1), ++ z0 = svnot_m (z0, p0, z1)) ++ ++/* ++** not_s64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** not z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (not_s64_m_tied2, svint64_t, ++ z0 = svnot_s64_m (z1, p0, z0), ++ z0 = svnot_m (z1, p0, z0)) ++ ++/* ++** not_s64_m_untied: ++** movprfx z0, z2 ++** not z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (not_s64_m_untied, svint64_t, ++ z0 = svnot_s64_m (z2, p0, z1), ++ z0 = svnot_m (z2, p0, z1)) ++ ++/* ++** not_s64_z_tied1: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** not z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (not_s64_z_tied1, svint64_t, ++ z0 = svnot_s64_z (p0, z0), ++ z0 = svnot_z (p0, z0)) ++ ++/* ++** not_s64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** not z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (not_s64_z_untied, svint64_t, ++ z0 = svnot_s64_z (p0, z1), ++ z0 = svnot_z (p0, z1)) ++ ++/* ++** not_s64_x_tied1: ++** not z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (not_s64_x_tied1, svint64_t, ++ z0 = svnot_s64_x (p0, z0), ++ z0 = svnot_x (p0, z0)) ++ ++/* ++** not_s64_x_untied: ++** not z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (not_s64_x_untied, svint64_t, ++ z0 = svnot_s64_x (p0, z1), ++ z0 = svnot_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_s8.c +new file mode 100644 +index 000000000..e807f08f8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_s8.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** not_s8_m_tied12: ++** not z0\.b, p0/m, z0\.b ++** ret ++*/ ++TEST_UNIFORM_Z (not_s8_m_tied12, svint8_t, ++ z0 = svnot_s8_m (z0, p0, z0), ++ z0 = svnot_m (z0, p0, z0)) ++ ++/* ++** not_s8_m_tied1: ++** not z0\.b, p0/m, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (not_s8_m_tied1, svint8_t, ++ z0 = svnot_s8_m (z0, p0, z1), ++ z0 = svnot_m (z0, p0, z1)) ++ ++/* ++** not_s8_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** not z0\.b, p0/m, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (not_s8_m_tied2, svint8_t, ++ z0 = svnot_s8_m (z1, p0, z0), ++ z0 = svnot_m (z1, p0, z0)) ++ ++/* ++** not_s8_m_untied: ++** movprfx z0, z2 ++** not z0\.b, p0/m, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (not_s8_m_untied, svint8_t, ++ z0 = svnot_s8_m (z2, p0, z1), ++ z0 = svnot_m (z2, p0, z1)) ++ ++/* ++** not_s8_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.b, p0/z, \1\.b ++** not z0\.b, p0/m, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (not_s8_z_tied1, svint8_t, ++ z0 = svnot_s8_z (p0, z0), ++ z0 = svnot_z (p0, z0)) ++ ++/* ++** not_s8_z_untied: ++** movprfx z0\.b, p0/z, z1\.b ++** not z0\.b, p0/m, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (not_s8_z_untied, svint8_t, ++ z0 = svnot_s8_z (p0, z1), ++ z0 = svnot_z (p0, z1)) ++ ++/* ++** not_s8_x_tied1: ++** not z0\.b, p0/m, z0\.b ++** ret ++*/ ++TEST_UNIFORM_Z (not_s8_x_tied1, svint8_t, ++ z0 = svnot_s8_x (p0, z0), ++ z0 = svnot_x (p0, z0)) ++ ++/* ++** not_s8_x_untied: ++** not z0\.b, p0/m, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (not_s8_x_untied, svint8_t, ++ z0 = svnot_s8_x (p0, z1), ++ z0 = svnot_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_u16.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_u16.c +new file mode 100644 +index 000000000..c812005f1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_u16.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** not_u16_m_tied12: ++** not z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (not_u16_m_tied12, svuint16_t, ++ z0 = svnot_u16_m (z0, p0, z0), ++ z0 = svnot_m (z0, p0, z0)) ++ ++/* ++** not_u16_m_tied1: ++** not z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (not_u16_m_tied1, svuint16_t, ++ z0 = svnot_u16_m (z0, p0, z1), ++ z0 = svnot_m (z0, p0, z1)) ++ ++/* ++** not_u16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** not z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (not_u16_m_tied2, svuint16_t, ++ z0 = svnot_u16_m (z1, p0, z0), ++ z0 = svnot_m (z1, p0, z0)) ++ ++/* ++** not_u16_m_untied: ++** movprfx z0, z2 ++** not z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (not_u16_m_untied, svuint16_t, ++ z0 = svnot_u16_m (z2, p0, z1), ++ z0 = svnot_m (z2, p0, z1)) ++ ++/* ++** not_u16_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.h, p0/z, \1\.h ++** not z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (not_u16_z_tied1, svuint16_t, ++ z0 = svnot_u16_z (p0, z0), ++ z0 = svnot_z (p0, z0)) ++ ++/* ++** not_u16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** not z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (not_u16_z_untied, svuint16_t, ++ z0 = svnot_u16_z (p0, z1), ++ z0 = svnot_z (p0, z1)) ++ ++/* ++** not_u16_x_tied1: ++** not z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (not_u16_x_tied1, svuint16_t, ++ z0 = svnot_u16_x (p0, z0), ++ z0 = svnot_x (p0, z0)) ++ ++/* ++** not_u16_x_untied: ++** not z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (not_u16_x_untied, svuint16_t, ++ z0 = svnot_u16_x (p0, z1), ++ z0 = svnot_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_u32.c +new file mode 100644 +index 000000000..7b7e9ca21 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_u32.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** not_u32_m_tied12: ++** not z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (not_u32_m_tied12, svuint32_t, ++ z0 = svnot_u32_m (z0, p0, z0), ++ z0 = svnot_m (z0, p0, z0)) ++ ++/* ++** not_u32_m_tied1: ++** not z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (not_u32_m_tied1, svuint32_t, ++ z0 = svnot_u32_m (z0, p0, z1), ++ z0 = svnot_m (z0, p0, z1)) ++ ++/* ++** not_u32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** not z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (not_u32_m_tied2, svuint32_t, ++ z0 = svnot_u32_m (z1, p0, z0), ++ z0 = svnot_m (z1, p0, z0)) ++ ++/* ++** not_u32_m_untied: ++** movprfx z0, z2 ++** not z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (not_u32_m_untied, svuint32_t, ++ z0 = svnot_u32_m (z2, p0, z1), ++ z0 = svnot_m (z2, p0, z1)) ++ ++/* ++** not_u32_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, \1\.s ++** not z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (not_u32_z_tied1, svuint32_t, ++ z0 = svnot_u32_z (p0, z0), ++ z0 = svnot_z (p0, z0)) ++ ++/* ++** not_u32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** not z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (not_u32_z_untied, svuint32_t, ++ z0 = svnot_u32_z (p0, z1), ++ z0 = svnot_z (p0, z1)) ++ ++/* ++** 
not_u32_x_tied1: ++** not z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (not_u32_x_tied1, svuint32_t, ++ z0 = svnot_u32_x (p0, z0), ++ z0 = svnot_x (p0, z0)) ++ ++/* ++** not_u32_x_untied: ++** not z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (not_u32_x_untied, svuint32_t, ++ z0 = svnot_u32_x (p0, z1), ++ z0 = svnot_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_u64.c +new file mode 100644 +index 000000000..27b92ad84 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_u64.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** not_u64_m_tied12: ++** not z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (not_u64_m_tied12, svuint64_t, ++ z0 = svnot_u64_m (z0, p0, z0), ++ z0 = svnot_m (z0, p0, z0)) ++ ++/* ++** not_u64_m_tied1: ++** not z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (not_u64_m_tied1, svuint64_t, ++ z0 = svnot_u64_m (z0, p0, z1), ++ z0 = svnot_m (z0, p0, z1)) ++ ++/* ++** not_u64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** not z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (not_u64_m_tied2, svuint64_t, ++ z0 = svnot_u64_m (z1, p0, z0), ++ z0 = svnot_m (z1, p0, z0)) ++ ++/* ++** not_u64_m_untied: ++** movprfx z0, z2 ++** not z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (not_u64_m_untied, svuint64_t, ++ z0 = svnot_u64_m (z2, p0, z1), ++ z0 = svnot_m (z2, p0, z1)) ++ ++/* ++** not_u64_z_tied1: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** not z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (not_u64_z_tied1, svuint64_t, ++ z0 = svnot_u64_z (p0, z0), ++ z0 = svnot_z (p0, z0)) ++ ++/* ++** not_u64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** not z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (not_u64_z_untied, svuint64_t, ++ z0 = svnot_u64_z (p0, z1), ++ z0 = svnot_z (p0, z1)) ++ ++/* ++** not_u64_x_tied1: ++** not z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (not_u64_x_tied1, svuint64_t, ++ z0 = svnot_u64_x (p0, z0), ++ z0 = svnot_x (p0, z0)) ++ ++/* ++** not_u64_x_untied: ++** not z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (not_u64_x_untied, svuint64_t, ++ z0 = svnot_u64_x (p0, z1), ++ z0 = svnot_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_u8.c +new file mode 100644 +index 000000000..bd2f36cad +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_u8.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** not_u8_m_tied12: ++** not z0\.b, p0/m, z0\.b ++** ret ++*/ ++TEST_UNIFORM_Z (not_u8_m_tied12, svuint8_t, ++ z0 = svnot_u8_m (z0, p0, z0), ++ z0 = svnot_m (z0, p0, z0)) ++ ++/* ++** not_u8_m_tied1: ++** not z0\.b, p0/m, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (not_u8_m_tied1, svuint8_t, ++ z0 = svnot_u8_m (z0, p0, z1), ++ z0 = svnot_m (z0, p0, z1)) ++ ++/* ++** not_u8_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** not z0\.b, p0/m, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (not_u8_m_tied2, svuint8_t, ++ z0 = svnot_u8_m (z1, p0, z0), ++ z0 = svnot_m (z1, p0, z0)) ++ ++/* ++** not_u8_m_untied: ++** movprfx z0, z2 ++** not z0\.b, p0/m, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (not_u8_m_untied, svuint8_t, ++ z0 = svnot_u8_m (z2, p0, z1), ++ z0 = svnot_m (z2, p0, z1)) ++ ++/* ++** not_u8_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.b, p0/z, 
\1\.b ++** not z0\.b, p0/m, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (not_u8_z_tied1, svuint8_t, ++ z0 = svnot_u8_z (p0, z0), ++ z0 = svnot_z (p0, z0)) ++ ++/* ++** not_u8_z_untied: ++** movprfx z0\.b, p0/z, z1\.b ++** not z0\.b, p0/m, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (not_u8_z_untied, svuint8_t, ++ z0 = svnot_u8_z (p0, z1), ++ z0 = svnot_z (p0, z1)) ++ ++/* ++** not_u8_x_tied1: ++** not z0\.b, p0/m, z0\.b ++** ret ++*/ ++TEST_UNIFORM_Z (not_u8_x_tied1, svuint8_t, ++ z0 = svnot_u8_x (p0, z0), ++ z0 = svnot_x (p0, z0)) ++ ++/* ++** not_u8_x_untied: ++** not z0\.b, p0/m, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (not_u8_x_untied, svuint8_t, ++ z0 = svnot_u8_x (p0, z1), ++ z0 = svnot_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orn_b.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orn_b.c +new file mode 100644 +index 000000000..423a18bc7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orn_b.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** orn_b_z_tied1: ++** orn p0\.b, p3/z, p0\.b, p1\.b ++** ret ++*/ ++TEST_UNIFORM_P (orn_b_z_tied1, ++ p0 = svorn_b_z (p3, p0, p1), ++ p0 = svorn_z (p3, p0, p1)) ++ ++/* ++** orn_b_z_tied2: ++** orn p0\.b, p3/z, p1\.b, p0\.b ++** ret ++*/ ++TEST_UNIFORM_P (orn_b_z_tied2, ++ p0 = svorn_b_z (p3, p1, p0), ++ p0 = svorn_z (p3, p1, p0)) ++ ++/* ++** orn_b_z_untied: ++** orn p0\.b, p3/z, p1\.b, p2\.b ++** ret ++*/ ++TEST_UNIFORM_P (orn_b_z_untied, ++ p0 = svorn_b_z (p3, p1, p2), ++ p0 = svorn_z (p3, p1, p2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_b.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_b.c +new file mode 100644 +index 000000000..fba9ba7df +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_b.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** orr_b_z_tied1: ++** orr p0\.b, p3/z, (p0\.b, p1\.b|p1\.b, p0\.b) ++** ret ++*/ ++TEST_UNIFORM_P (orr_b_z_tied1, ++ p0 = svorr_b_z (p3, p0, p1), ++ p0 = svorr_z (p3, p0, p1)) ++ ++/* ++** orr_b_z_tied2: ++** orr p0\.b, p3/z, (p0\.b, p1\.b|p1\.b, p0\.b) ++** ret ++*/ ++TEST_UNIFORM_P (orr_b_z_tied2, ++ p0 = svorr_b_z (p3, p1, p0), ++ p0 = svorr_z (p3, p1, p0)) ++ ++/* ++** orr_b_z_untied: ++** orr p0\.b, p3/z, (p1\.b, p2\.b|p2\.b, p1\.b) ++** ret ++*/ ++TEST_UNIFORM_P (orr_b_z_untied, ++ p0 = svorr_b_z (p3, p1, p2), ++ p0 = svorr_z (p3, p1, p2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_s16.c +new file mode 100644 +index 000000000..62b707a9c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_s16.c +@@ -0,0 +1,376 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** orr_s16_m_tied1: ++** orr z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (orr_s16_m_tied1, svint16_t, ++ z0 = svorr_s16_m (p0, z0, z1), ++ z0 = svorr_m (p0, z0, z1)) ++ ++/* ++** orr_s16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** orr z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (orr_s16_m_tied2, svint16_t, ++ z0 = svorr_s16_m (p0, z1, z0), ++ z0 = svorr_m (p0, z1, z0)) ++ ++/* ++** orr_s16_m_untied: ++** movprfx z0, z1 ++** orr z0\.h, p0/m, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (orr_s16_m_untied, svint16_t, ++ z0 = svorr_s16_m (p0, z1, z2), ++ z0 = svorr_m (p0, z1, z2)) ++ ++/* ++** 
orr_w0_s16_m_tied1: ++** mov (z[0-9]+\.h), w0 ++** orr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (orr_w0_s16_m_tied1, svint16_t, int16_t, ++ z0 = svorr_n_s16_m (p0, z0, x0), ++ z0 = svorr_m (p0, z0, x0)) ++ ++/* ++** orr_w0_s16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), w0 ++** movprfx z0, z1 ++** orr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (orr_w0_s16_m_untied, svint16_t, int16_t, ++ z0 = svorr_n_s16_m (p0, z1, x0), ++ z0 = svorr_m (p0, z1, x0)) ++ ++/* ++** orr_1_s16_m_tied1: ++** mov (z[0-9]+\.h), #1 ++** orr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_1_s16_m_tied1, svint16_t, ++ z0 = svorr_n_s16_m (p0, z0, 1), ++ z0 = svorr_m (p0, z0, 1)) ++ ++/* ++** orr_1_s16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), #1 ++** movprfx z0, z1 ++** orr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_1_s16_m_untied, svint16_t, ++ z0 = svorr_n_s16_m (p0, z1, 1), ++ z0 = svorr_m (p0, z1, 1)) ++ ++/* ++** orr_m2_s16_m: ++** mov (z[0-9]+\.h), #-2 ++** orr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m2_s16_m, svint16_t, ++ z0 = svorr_n_s16_m (p0, z0, -2), ++ z0 = svorr_m (p0, z0, -2)) ++ ++/* ++** orr_s16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** orr z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (orr_s16_z_tied1, svint16_t, ++ z0 = svorr_s16_z (p0, z0, z1), ++ z0 = svorr_z (p0, z0, z1)) ++ ++/* ++** orr_s16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** orr z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (orr_s16_z_tied2, svint16_t, ++ z0 = svorr_s16_z (p0, z1, z0), ++ z0 = svorr_z (p0, z1, z0)) ++ ++/* ++** orr_s16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** orr z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** orr z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (orr_s16_z_untied, svint16_t, ++ z0 = svorr_s16_z (p0, z1, z2), ++ z0 = svorr_z (p0, z1, z2)) ++ ++/* ++** orr_w0_s16_z_tied1: ++** mov (z[0-9]+\.h), w0 ++** movprfx z0\.h, p0/z, z0\.h ++** orr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (orr_w0_s16_z_tied1, svint16_t, int16_t, ++ z0 = svorr_n_s16_z (p0, z0, x0), ++ z0 = svorr_z (p0, z0, x0)) ++ ++/* ++** orr_w0_s16_z_untied: ++** mov (z[0-9]+\.h), w0 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** orr z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** orr z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (orr_w0_s16_z_untied, svint16_t, int16_t, ++ z0 = svorr_n_s16_z (p0, z1, x0), ++ z0 = svorr_z (p0, z1, x0)) ++ ++/* ++** orr_1_s16_z_tied1: ++** mov (z[0-9]+\.h), #1 ++** movprfx z0\.h, p0/z, z0\.h ++** orr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_1_s16_z_tied1, svint16_t, ++ z0 = svorr_n_s16_z (p0, z0, 1), ++ z0 = svorr_z (p0, z0, 1)) ++ ++/* ++** orr_1_s16_z_untied: ++** mov (z[0-9]+\.h), #1 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** orr z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** orr z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (orr_1_s16_z_untied, svint16_t, ++ z0 = svorr_n_s16_z (p0, z1, 1), ++ z0 = svorr_z (p0, z1, 1)) ++ ++/* ++** orr_s16_x_tied1: ++** orr z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (orr_s16_x_tied1, svint16_t, ++ z0 = svorr_s16_x (p0, z0, z1), ++ z0 = svorr_x (p0, z0, z1)) ++ ++/* ++** orr_s16_x_tied2: ++** orr z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (orr_s16_x_tied2, svint16_t, ++ z0 = svorr_s16_x (p0, z1, z0), ++ z0 = svorr_x (p0, z1, z0)) ++ ++/* ++** orr_s16_x_untied: ++** orr z0\.d, 
(z1\.d, z2\.d|z2\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (orr_s16_x_untied, svint16_t, ++ z0 = svorr_s16_x (p0, z1, z2), ++ z0 = svorr_x (p0, z1, z2)) ++ ++/* ++** orr_w0_s16_x_tied1: ++** mov (z[0-9]+)\.h, w0 ++** orr z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (orr_w0_s16_x_tied1, svint16_t, int16_t, ++ z0 = svorr_n_s16_x (p0, z0, x0), ++ z0 = svorr_x (p0, z0, x0)) ++ ++/* ++** orr_w0_s16_x_untied: ++** mov (z[0-9]+)\.h, w0 ++** orr z0\.d, (z1\.d, \1\.d|\1\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (orr_w0_s16_x_untied, svint16_t, int16_t, ++ z0 = svorr_n_s16_x (p0, z1, x0), ++ z0 = svorr_x (p0, z1, x0)) ++ ++/* ++** orr_1_s16_x_tied1: ++** orr z0\.h, z0\.h, #0x1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_1_s16_x_tied1, svint16_t, ++ z0 = svorr_n_s16_x (p0, z0, 1), ++ z0 = svorr_x (p0, z0, 1)) ++ ++/* ++** orr_1_s16_x_untied: ++** movprfx z0, z1 ++** orr z0\.h, z0\.h, #0x1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_1_s16_x_untied, svint16_t, ++ z0 = svorr_n_s16_x (p0, z1, 1), ++ z0 = svorr_x (p0, z1, 1)) ++ ++/* ++** orr_127_s16_x: ++** orr z0\.h, z0\.h, #0x7f ++** ret ++*/ ++TEST_UNIFORM_Z (orr_127_s16_x, svint16_t, ++ z0 = svorr_n_s16_x (p0, z0, 127), ++ z0 = svorr_x (p0, z0, 127)) ++ ++/* ++** orr_128_s16_x: ++** orr z0\.h, z0\.h, #0x80 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_128_s16_x, svint16_t, ++ z0 = svorr_n_s16_x (p0, z0, 128), ++ z0 = svorr_x (p0, z0, 128)) ++ ++/* ++** orr_255_s16_x: ++** orr z0\.h, z0\.h, #0xff ++** ret ++*/ ++TEST_UNIFORM_Z (orr_255_s16_x, svint16_t, ++ z0 = svorr_n_s16_x (p0, z0, 255), ++ z0 = svorr_x (p0, z0, 255)) ++ ++/* ++** orr_256_s16_x: ++** orr z0\.h, z0\.h, #0x100 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_256_s16_x, svint16_t, ++ z0 = svorr_n_s16_x (p0, z0, 256), ++ z0 = svorr_x (p0, z0, 256)) ++ ++/* ++** orr_257_s16_x: ++** orr z0\.h, z0\.h, #0x101 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_257_s16_x, svint16_t, ++ z0 = svorr_n_s16_x (p0, z0, 257), ++ z0 = svorr_x (p0, z0, 257)) ++ ++/* ++** orr_512_s16_x: ++** orr z0\.h, z0\.h, #0x200 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_512_s16_x, svint16_t, ++ z0 = svorr_n_s16_x (p0, z0, 512), ++ z0 = svorr_x (p0, z0, 512)) ++ ++/* ++** orr_65280_s16_x: ++** orr z0\.h, z0\.h, #0xff00 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_65280_s16_x, svint16_t, ++ z0 = svorr_n_s16_x (p0, z0, 0xff00), ++ z0 = svorr_x (p0, z0, 0xff00)) ++ ++/* ++** orr_m127_s16_x: ++** orr z0\.h, z0\.h, #0xff81 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m127_s16_x, svint16_t, ++ z0 = svorr_n_s16_x (p0, z0, -127), ++ z0 = svorr_x (p0, z0, -127)) ++ ++/* ++** orr_m128_s16_x: ++** orr z0\.h, z0\.h, #0xff80 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m128_s16_x, svint16_t, ++ z0 = svorr_n_s16_x (p0, z0, -128), ++ z0 = svorr_x (p0, z0, -128)) ++ ++/* ++** orr_m255_s16_x: ++** orr z0\.h, z0\.h, #0xff01 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m255_s16_x, svint16_t, ++ z0 = svorr_n_s16_x (p0, z0, -255), ++ z0 = svorr_x (p0, z0, -255)) ++ ++/* ++** orr_m256_s16_x: ++** orr z0\.h, z0\.h, #0xff00 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m256_s16_x, svint16_t, ++ z0 = svorr_n_s16_x (p0, z0, -256), ++ z0 = svorr_x (p0, z0, -256)) ++ ++/* ++** orr_m257_s16_x: ++** orr z0\.h, z0\.h, #0xfeff ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m257_s16_x, svint16_t, ++ z0 = svorr_n_s16_x (p0, z0, -257), ++ z0 = svorr_x (p0, z0, -257)) ++ ++/* ++** orr_m512_s16_x: ++** orr z0\.h, z0\.h, #0xfe00 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m512_s16_x, svint16_t, ++ z0 = svorr_n_s16_x (p0, z0, -512), ++ z0 = svorr_x (p0, z0, -512)) ++ ++/* ++** orr_m32768_s16_x: ++** orr z0\.h, z0\.h, #0x8000 ++** ret ++*/ 
++TEST_UNIFORM_Z (orr_m32768_s16_x, svint16_t, ++ z0 = svorr_n_s16_x (p0, z0, -0x8000), ++ z0 = svorr_x (p0, z0, -0x8000)) ++ ++/* ++** orr_5_s16_x: ++** mov (z[0-9]+)\.h, #5 ++** orr z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (orr_5_s16_x, svint16_t, ++ z0 = svorr_n_s16_x (p0, z0, 5), ++ z0 = svorr_x (p0, z0, 5)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_s32.c +new file mode 100644 +index 000000000..2e0e1e888 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_s32.c +@@ -0,0 +1,372 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** orr_s32_m_tied1: ++** orr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (orr_s32_m_tied1, svint32_t, ++ z0 = svorr_s32_m (p0, z0, z1), ++ z0 = svorr_m (p0, z0, z1)) ++ ++/* ++** orr_s32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** orr z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (orr_s32_m_tied2, svint32_t, ++ z0 = svorr_s32_m (p0, z1, z0), ++ z0 = svorr_m (p0, z1, z0)) ++ ++/* ++** orr_s32_m_untied: ++** movprfx z0, z1 ++** orr z0\.s, p0/m, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (orr_s32_m_untied, svint32_t, ++ z0 = svorr_s32_m (p0, z1, z2), ++ z0 = svorr_m (p0, z1, z2)) ++ ++/* ++** orr_w0_s32_m_tied1: ++** mov (z[0-9]+\.s), w0 ++** orr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (orr_w0_s32_m_tied1, svint32_t, int32_t, ++ z0 = svorr_n_s32_m (p0, z0, x0), ++ z0 = svorr_m (p0, z0, x0)) ++ ++/* ++** orr_w0_s32_m_untied: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0, z1 ++** orr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (orr_w0_s32_m_untied, svint32_t, int32_t, ++ z0 = svorr_n_s32_m (p0, z1, x0), ++ z0 = svorr_m (p0, z1, x0)) ++ ++/* ++** orr_1_s32_m_tied1: ++** mov (z[0-9]+\.s), #1 ++** orr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_1_s32_m_tied1, svint32_t, ++ z0 = svorr_n_s32_m (p0, z0, 1), ++ z0 = svorr_m (p0, z0, 1)) ++ ++/* ++** orr_1_s32_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.s), #1 ++** movprfx z0, z1 ++** orr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_1_s32_m_untied, svint32_t, ++ z0 = svorr_n_s32_m (p0, z1, 1), ++ z0 = svorr_m (p0, z1, 1)) ++ ++/* ++** orr_m2_s32_m: ++** mov (z[0-9]+\.s), #-2 ++** orr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m2_s32_m, svint32_t, ++ z0 = svorr_n_s32_m (p0, z0, -2), ++ z0 = svorr_m (p0, z0, -2)) ++ ++/* ++** orr_s32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** orr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (orr_s32_z_tied1, svint32_t, ++ z0 = svorr_s32_z (p0, z0, z1), ++ z0 = svorr_z (p0, z0, z1)) ++ ++/* ++** orr_s32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** orr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (orr_s32_z_tied2, svint32_t, ++ z0 = svorr_s32_z (p0, z1, z0), ++ z0 = svorr_z (p0, z1, z0)) ++ ++/* ++** orr_s32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** orr z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** orr z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (orr_s32_z_untied, svint32_t, ++ z0 = svorr_s32_z (p0, z1, z2), ++ z0 = svorr_z (p0, z1, z2)) ++ ++/* ++** orr_w0_s32_z_tied1: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0\.s, p0/z, z0\.s ++** orr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (orr_w0_s32_z_tied1, svint32_t, int32_t, ++ z0 = svorr_n_s32_z (p0, z0, x0), ++ z0 = svorr_z (p0, z0, x0)) ++ ++/* ++** 
orr_w0_s32_z_untied: ++** mov (z[0-9]+\.s), w0 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** orr z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** orr z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (orr_w0_s32_z_untied, svint32_t, int32_t, ++ z0 = svorr_n_s32_z (p0, z1, x0), ++ z0 = svorr_z (p0, z1, x0)) ++ ++/* ++** orr_1_s32_z_tied1: ++** mov (z[0-9]+\.s), #1 ++** movprfx z0\.s, p0/z, z0\.s ++** orr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_1_s32_z_tied1, svint32_t, ++ z0 = svorr_n_s32_z (p0, z0, 1), ++ z0 = svorr_z (p0, z0, 1)) ++ ++/* ++** orr_1_s32_z_untied: ++** mov (z[0-9]+\.s), #1 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** orr z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** orr z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (orr_1_s32_z_untied, svint32_t, ++ z0 = svorr_n_s32_z (p0, z1, 1), ++ z0 = svorr_z (p0, z1, 1)) ++ ++/* ++** orr_s32_x_tied1: ++** orr z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (orr_s32_x_tied1, svint32_t, ++ z0 = svorr_s32_x (p0, z0, z1), ++ z0 = svorr_x (p0, z0, z1)) ++ ++/* ++** orr_s32_x_tied2: ++** orr z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (orr_s32_x_tied2, svint32_t, ++ z0 = svorr_s32_x (p0, z1, z0), ++ z0 = svorr_x (p0, z1, z0)) ++ ++/* ++** orr_s32_x_untied: ++** orr z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (orr_s32_x_untied, svint32_t, ++ z0 = svorr_s32_x (p0, z1, z2), ++ z0 = svorr_x (p0, z1, z2)) ++ ++/* ++** orr_w0_s32_x_tied1: ++** mov (z[0-9]+)\.s, w0 ++** orr z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (orr_w0_s32_x_tied1, svint32_t, int32_t, ++ z0 = svorr_n_s32_x (p0, z0, x0), ++ z0 = svorr_x (p0, z0, x0)) ++ ++/* ++** orr_w0_s32_x_untied: ++** mov (z[0-9]+)\.s, w0 ++** orr z0\.d, (z1\.d, \1\.d|\1\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (orr_w0_s32_x_untied, svint32_t, int32_t, ++ z0 = svorr_n_s32_x (p0, z1, x0), ++ z0 = svorr_x (p0, z1, x0)) ++ ++/* ++** orr_1_s32_x_tied1: ++** orr z0\.s, z0\.s, #0x1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_1_s32_x_tied1, svint32_t, ++ z0 = svorr_n_s32_x (p0, z0, 1), ++ z0 = svorr_x (p0, z0, 1)) ++ ++/* ++** orr_1_s32_x_untied: ++** movprfx z0, z1 ++** orr z0\.s, z0\.s, #0x1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_1_s32_x_untied, svint32_t, ++ z0 = svorr_n_s32_x (p0, z1, 1), ++ z0 = svorr_x (p0, z1, 1)) ++ ++/* ++** orr_127_s32_x: ++** orr z0\.s, z0\.s, #0x7f ++** ret ++*/ ++TEST_UNIFORM_Z (orr_127_s32_x, svint32_t, ++ z0 = svorr_n_s32_x (p0, z0, 127), ++ z0 = svorr_x (p0, z0, 127)) ++ ++/* ++** orr_128_s32_x: ++** orr z0\.s, z0\.s, #0x80 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_128_s32_x, svint32_t, ++ z0 = svorr_n_s32_x (p0, z0, 128), ++ z0 = svorr_x (p0, z0, 128)) ++ ++/* ++** orr_255_s32_x: ++** orr z0\.s, z0\.s, #0xff ++** ret ++*/ ++TEST_UNIFORM_Z (orr_255_s32_x, svint32_t, ++ z0 = svorr_n_s32_x (p0, z0, 255), ++ z0 = svorr_x (p0, z0, 255)) ++ ++/* ++** orr_256_s32_x: ++** orr z0\.s, z0\.s, #0x100 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_256_s32_x, svint32_t, ++ z0 = svorr_n_s32_x (p0, z0, 256), ++ z0 = svorr_x (p0, z0, 256)) ++ ++/* TODO: Bad code and needs fixing. 
*/ ++TEST_UNIFORM_Z (orr_257_s32_x, svint32_t, ++ z0 = svorr_n_s32_x (p0, z0, 257), ++ z0 = svorr_x (p0, z0, 257)) ++ ++/* ++** orr_512_s32_x: ++** orr z0\.s, z0\.s, #0x200 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_512_s32_x, svint32_t, ++ z0 = svorr_n_s32_x (p0, z0, 512), ++ z0 = svorr_x (p0, z0, 512)) ++ ++/* ++** orr_65280_s32_x: ++** orr z0\.s, z0\.s, #0xff00 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_65280_s32_x, svint32_t, ++ z0 = svorr_n_s32_x (p0, z0, 0xff00), ++ z0 = svorr_x (p0, z0, 0xff00)) ++ ++/* ++** orr_m127_s32_x: ++** orr z0\.s, z0\.s, #0xffffff81 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m127_s32_x, svint32_t, ++ z0 = svorr_n_s32_x (p0, z0, -127), ++ z0 = svorr_x (p0, z0, -127)) ++ ++/* ++** orr_m128_s32_x: ++** orr z0\.s, z0\.s, #0xffffff80 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m128_s32_x, svint32_t, ++ z0 = svorr_n_s32_x (p0, z0, -128), ++ z0 = svorr_x (p0, z0, -128)) ++ ++/* ++** orr_m255_s32_x: ++** orr z0\.s, z0\.s, #0xffffff01 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m255_s32_x, svint32_t, ++ z0 = svorr_n_s32_x (p0, z0, -255), ++ z0 = svorr_x (p0, z0, -255)) ++ ++/* ++** orr_m256_s32_x: ++** orr z0\.s, z0\.s, #0xffffff00 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m256_s32_x, svint32_t, ++ z0 = svorr_n_s32_x (p0, z0, -256), ++ z0 = svorr_x (p0, z0, -256)) ++ ++/* ++** orr_m257_s32_x: ++** orr z0\.s, z0\.s, #0xfffffeff ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m257_s32_x, svint32_t, ++ z0 = svorr_n_s32_x (p0, z0, -257), ++ z0 = svorr_x (p0, z0, -257)) ++ ++/* ++** orr_m512_s32_x: ++** orr z0\.s, z0\.s, #0xfffffe00 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m512_s32_x, svint32_t, ++ z0 = svorr_n_s32_x (p0, z0, -512), ++ z0 = svorr_x (p0, z0, -512)) ++ ++/* ++** orr_m32768_s32_x: ++** orr z0\.s, z0\.s, #0xffff8000 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m32768_s32_x, svint32_t, ++ z0 = svorr_n_s32_x (p0, z0, -0x8000), ++ z0 = svorr_x (p0, z0, -0x8000)) ++ ++/* ++** orr_5_s32_x: ++** mov (z[0-9]+)\.s, #5 ++** orr z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (orr_5_s32_x, svint32_t, ++ z0 = svorr_n_s32_x (p0, z0, 5), ++ z0 = svorr_x (p0, z0, 5)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_s64.c +new file mode 100644 +index 000000000..1538fdd14 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_s64.c +@@ -0,0 +1,372 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** orr_s64_m_tied1: ++** orr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (orr_s64_m_tied1, svint64_t, ++ z0 = svorr_s64_m (p0, z0, z1), ++ z0 = svorr_m (p0, z0, z1)) ++ ++/* ++** orr_s64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** orr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_s64_m_tied2, svint64_t, ++ z0 = svorr_s64_m (p0, z1, z0), ++ z0 = svorr_m (p0, z1, z0)) ++ ++/* ++** orr_s64_m_untied: ++** movprfx z0, z1 ++** orr z0\.d, p0/m, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (orr_s64_m_untied, svint64_t, ++ z0 = svorr_s64_m (p0, z1, z2), ++ z0 = svorr_m (p0, z1, z2)) ++ ++/* ++** orr_x0_s64_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** orr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (orr_x0_s64_m_tied1, svint64_t, int64_t, ++ z0 = svorr_n_s64_m (p0, z0, x0), ++ z0 = svorr_m (p0, z0, x0)) ++ ++/* ++** orr_x0_s64_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** orr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (orr_x0_s64_m_untied, svint64_t, int64_t, ++ z0 = svorr_n_s64_m (p0, z1, x0), ++ z0 = svorr_m (p0, 
z1, x0)) ++ ++/* ++** orr_1_s64_m_tied1: ++** mov (z[0-9]+\.d), #1 ++** orr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_1_s64_m_tied1, svint64_t, ++ z0 = svorr_n_s64_m (p0, z0, 1), ++ z0 = svorr_m (p0, z0, 1)) ++ ++/* ++** orr_1_s64_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #1 ++** movprfx z0, z1 ++** orr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_1_s64_m_untied, svint64_t, ++ z0 = svorr_n_s64_m (p0, z1, 1), ++ z0 = svorr_m (p0, z1, 1)) ++ ++/* ++** orr_m2_s64_m: ++** mov (z[0-9]+\.d), #-2 ++** orr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m2_s64_m, svint64_t, ++ z0 = svorr_n_s64_m (p0, z0, -2), ++ z0 = svorr_m (p0, z0, -2)) ++ ++/* ++** orr_s64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** orr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (orr_s64_z_tied1, svint64_t, ++ z0 = svorr_s64_z (p0, z0, z1), ++ z0 = svorr_z (p0, z0, z1)) ++ ++/* ++** orr_s64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** orr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (orr_s64_z_tied2, svint64_t, ++ z0 = svorr_s64_z (p0, z1, z0), ++ z0 = svorr_z (p0, z1, z0)) ++ ++/* ++** orr_s64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** orr z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** orr z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (orr_s64_z_untied, svint64_t, ++ z0 = svorr_s64_z (p0, z1, z2), ++ z0 = svorr_z (p0, z1, z2)) ++ ++/* ++** orr_x0_s64_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z0\.d ++** orr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (orr_x0_s64_z_tied1, svint64_t, int64_t, ++ z0 = svorr_n_s64_z (p0, z0, x0), ++ z0 = svorr_z (p0, z0, x0)) ++ ++/* ++** orr_x0_s64_z_untied: ++** mov (z[0-9]+\.d), x0 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** orr z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** orr z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (orr_x0_s64_z_untied, svint64_t, int64_t, ++ z0 = svorr_n_s64_z (p0, z1, x0), ++ z0 = svorr_z (p0, z1, x0)) ++ ++/* ++** orr_1_s64_z_tied1: ++** mov (z[0-9]+\.d), #1 ++** movprfx z0\.d, p0/z, z0\.d ++** orr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_1_s64_z_tied1, svint64_t, ++ z0 = svorr_n_s64_z (p0, z0, 1), ++ z0 = svorr_z (p0, z0, 1)) ++ ++/* ++** orr_1_s64_z_untied: ++** mov (z[0-9]+\.d), #1 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** orr z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** orr z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (orr_1_s64_z_untied, svint64_t, ++ z0 = svorr_n_s64_z (p0, z1, 1), ++ z0 = svorr_z (p0, z1, 1)) ++ ++/* ++** orr_s64_x_tied1: ++** orr z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (orr_s64_x_tied1, svint64_t, ++ z0 = svorr_s64_x (p0, z0, z1), ++ z0 = svorr_x (p0, z0, z1)) ++ ++/* ++** orr_s64_x_tied2: ++** orr z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (orr_s64_x_tied2, svint64_t, ++ z0 = svorr_s64_x (p0, z1, z0), ++ z0 = svorr_x (p0, z1, z0)) ++ ++/* ++** orr_s64_x_untied: ++** orr z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (orr_s64_x_untied, svint64_t, ++ z0 = svorr_s64_x (p0, z1, z2), ++ z0 = svorr_x (p0, z1, z2)) ++ ++/* ++** orr_x0_s64_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** orr z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (orr_x0_s64_x_tied1, svint64_t, int64_t, ++ z0 = svorr_n_s64_x (p0, z0, x0), ++ z0 = svorr_x (p0, z0, x0)) ++ ++/* ++** orr_x0_s64_x_untied: ++** mov (z[0-9]+\.d), x0 ++** orr z0\.d, (z1\.d, 
\1|\1, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (orr_x0_s64_x_untied, svint64_t, int64_t, ++ z0 = svorr_n_s64_x (p0, z1, x0), ++ z0 = svorr_x (p0, z1, x0)) ++ ++/* ++** orr_1_s64_x_tied1: ++** orr z0\.d, z0\.d, #0x1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_1_s64_x_tied1, svint64_t, ++ z0 = svorr_n_s64_x (p0, z0, 1), ++ z0 = svorr_x (p0, z0, 1)) ++ ++/* ++** orr_1_s64_x_untied: ++** movprfx z0, z1 ++** orr z0\.d, z0\.d, #0x1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_1_s64_x_untied, svint64_t, ++ z0 = svorr_n_s64_x (p0, z1, 1), ++ z0 = svorr_x (p0, z1, 1)) ++ ++/* ++** orr_127_s64_x: ++** orr z0\.d, z0\.d, #0x7f ++** ret ++*/ ++TEST_UNIFORM_Z (orr_127_s64_x, svint64_t, ++ z0 = svorr_n_s64_x (p0, z0, 127), ++ z0 = svorr_x (p0, z0, 127)) ++ ++/* ++** orr_128_s64_x: ++** orr z0\.d, z0\.d, #0x80 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_128_s64_x, svint64_t, ++ z0 = svorr_n_s64_x (p0, z0, 128), ++ z0 = svorr_x (p0, z0, 128)) ++ ++/* ++** orr_255_s64_x: ++** orr z0\.d, z0\.d, #0xff ++** ret ++*/ ++TEST_UNIFORM_Z (orr_255_s64_x, svint64_t, ++ z0 = svorr_n_s64_x (p0, z0, 255), ++ z0 = svorr_x (p0, z0, 255)) ++ ++/* ++** orr_256_s64_x: ++** orr z0\.d, z0\.d, #0x100 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_256_s64_x, svint64_t, ++ z0 = svorr_n_s64_x (p0, z0, 256), ++ z0 = svorr_x (p0, z0, 256)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (orr_257_s64_x, svint64_t, ++ z0 = svorr_n_s64_x (p0, z0, 257), ++ z0 = svorr_x (p0, z0, 257)) ++ ++/* ++** orr_512_s64_x: ++** orr z0\.d, z0\.d, #0x200 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_512_s64_x, svint64_t, ++ z0 = svorr_n_s64_x (p0, z0, 512), ++ z0 = svorr_x (p0, z0, 512)) ++ ++/* ++** orr_65280_s64_x: ++** orr z0\.d, z0\.d, #0xff00 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_65280_s64_x, svint64_t, ++ z0 = svorr_n_s64_x (p0, z0, 0xff00), ++ z0 = svorr_x (p0, z0, 0xff00)) ++ ++/* ++** orr_m127_s64_x: ++** orr z0\.d, z0\.d, #0xffffffffffffff81 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m127_s64_x, svint64_t, ++ z0 = svorr_n_s64_x (p0, z0, -127), ++ z0 = svorr_x (p0, z0, -127)) ++ ++/* ++** orr_m128_s64_x: ++** orr z0\.d, z0\.d, #0xffffffffffffff80 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m128_s64_x, svint64_t, ++ z0 = svorr_n_s64_x (p0, z0, -128), ++ z0 = svorr_x (p0, z0, -128)) ++ ++/* ++** orr_m255_s64_x: ++** orr z0\.d, z0\.d, #0xffffffffffffff01 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m255_s64_x, svint64_t, ++ z0 = svorr_n_s64_x (p0, z0, -255), ++ z0 = svorr_x (p0, z0, -255)) ++ ++/* ++** orr_m256_s64_x: ++** orr z0\.d, z0\.d, #0xffffffffffffff00 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m256_s64_x, svint64_t, ++ z0 = svorr_n_s64_x (p0, z0, -256), ++ z0 = svorr_x (p0, z0, -256)) ++ ++/* ++** orr_m257_s64_x: ++** orr z0\.d, z0\.d, #0xfffffffffffffeff ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m257_s64_x, svint64_t, ++ z0 = svorr_n_s64_x (p0, z0, -257), ++ z0 = svorr_x (p0, z0, -257)) ++ ++/* ++** orr_m512_s64_x: ++** orr z0\.d, z0\.d, #0xfffffffffffffe00 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m512_s64_x, svint64_t, ++ z0 = svorr_n_s64_x (p0, z0, -512), ++ z0 = svorr_x (p0, z0, -512)) ++ ++/* ++** orr_m32768_s64_x: ++** orr z0\.d, z0\.d, #0xffffffffffff8000 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m32768_s64_x, svint64_t, ++ z0 = svorr_n_s64_x (p0, z0, -0x8000), ++ z0 = svorr_x (p0, z0, -0x8000)) ++ ++/* ++** orr_5_s64_x: ++** mov (z[0-9]+\.d), #5 ++** orr z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (orr_5_s64_x, svint64_t, ++ z0 = svorr_n_s64_x (p0, z0, 5), ++ z0 = svorr_x (p0, z0, 5)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_s8.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_s8.c +new file mode 100644 +index 000000000..b6483b6e7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_s8.c +@@ -0,0 +1,295 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** orr_s8_m_tied1: ++** orr z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (orr_s8_m_tied1, svint8_t, ++ z0 = svorr_s8_m (p0, z0, z1), ++ z0 = svorr_m (p0, z0, z1)) ++ ++/* ++** orr_s8_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** orr z0\.b, p0/m, z0\.b, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (orr_s8_m_tied2, svint8_t, ++ z0 = svorr_s8_m (p0, z1, z0), ++ z0 = svorr_m (p0, z1, z0)) ++ ++/* ++** orr_s8_m_untied: ++** movprfx z0, z1 ++** orr z0\.b, p0/m, z0\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (orr_s8_m_untied, svint8_t, ++ z0 = svorr_s8_m (p0, z1, z2), ++ z0 = svorr_m (p0, z1, z2)) ++ ++/* ++** orr_w0_s8_m_tied1: ++** mov (z[0-9]+\.b), w0 ++** orr z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (orr_w0_s8_m_tied1, svint8_t, int8_t, ++ z0 = svorr_n_s8_m (p0, z0, x0), ++ z0 = svorr_m (p0, z0, x0)) ++ ++/* ++** orr_w0_s8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), w0 ++** movprfx z0, z1 ++** orr z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (orr_w0_s8_m_untied, svint8_t, int8_t, ++ z0 = svorr_n_s8_m (p0, z1, x0), ++ z0 = svorr_m (p0, z1, x0)) ++ ++/* ++** orr_1_s8_m_tied1: ++** mov (z[0-9]+\.b), #1 ++** orr z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_1_s8_m_tied1, svint8_t, ++ z0 = svorr_n_s8_m (p0, z0, 1), ++ z0 = svorr_m (p0, z0, 1)) ++ ++/* ++** orr_1_s8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), #1 ++** movprfx z0, z1 ++** orr z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_1_s8_m_untied, svint8_t, ++ z0 = svorr_n_s8_m (p0, z1, 1), ++ z0 = svorr_m (p0, z1, 1)) ++ ++/* ++** orr_m2_s8_m: ++** mov (z[0-9]+\.b), #-2 ++** orr z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m2_s8_m, svint8_t, ++ z0 = svorr_n_s8_m (p0, z0, -2), ++ z0 = svorr_m (p0, z0, -2)) ++ ++/* ++** orr_s8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** orr z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (orr_s8_z_tied1, svint8_t, ++ z0 = svorr_s8_z (p0, z0, z1), ++ z0 = svorr_z (p0, z0, z1)) ++ ++/* ++** orr_s8_z_tied2: ++** movprfx z0\.b, p0/z, z0\.b ++** orr z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (orr_s8_z_tied2, svint8_t, ++ z0 = svorr_s8_z (p0, z1, z0), ++ z0 = svorr_z (p0, z1, z0)) ++ ++/* ++** orr_s8_z_untied: ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** orr z0\.b, p0/m, z0\.b, z2\.b ++** | ++** movprfx z0\.b, p0/z, z2\.b ++** orr z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (orr_s8_z_untied, svint8_t, ++ z0 = svorr_s8_z (p0, z1, z2), ++ z0 = svorr_z (p0, z1, z2)) ++ ++/* ++** orr_w0_s8_z_tied1: ++** mov (z[0-9]+\.b), w0 ++** movprfx z0\.b, p0/z, z0\.b ++** orr z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (orr_w0_s8_z_tied1, svint8_t, int8_t, ++ z0 = svorr_n_s8_z (p0, z0, x0), ++ z0 = svorr_z (p0, z0, x0)) ++ ++/* ++** orr_w0_s8_z_untied: ++** mov (z[0-9]+\.b), w0 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** orr z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** orr z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (orr_w0_s8_z_untied, svint8_t, int8_t, ++ z0 = svorr_n_s8_z (p0, z1, x0), ++ z0 = svorr_z (p0, z1, x0)) ++ ++/* ++** orr_1_s8_z_tied1: ++** mov (z[0-9]+\.b), #1 ++** movprfx z0\.b, p0/z, z0\.b ++** orr z0\.b, p0/m, z0\.b, \1 ++** 
ret ++*/ ++TEST_UNIFORM_Z (orr_1_s8_z_tied1, svint8_t, ++ z0 = svorr_n_s8_z (p0, z0, 1), ++ z0 = svorr_z (p0, z0, 1)) ++ ++/* ++** orr_1_s8_z_untied: ++** mov (z[0-9]+\.b), #1 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** orr z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** orr z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (orr_1_s8_z_untied, svint8_t, ++ z0 = svorr_n_s8_z (p0, z1, 1), ++ z0 = svorr_z (p0, z1, 1)) ++ ++/* ++** orr_s8_x_tied1: ++** orr z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (orr_s8_x_tied1, svint8_t, ++ z0 = svorr_s8_x (p0, z0, z1), ++ z0 = svorr_x (p0, z0, z1)) ++ ++/* ++** orr_s8_x_tied2: ++** orr z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (orr_s8_x_tied2, svint8_t, ++ z0 = svorr_s8_x (p0, z1, z0), ++ z0 = svorr_x (p0, z1, z0)) ++ ++/* ++** orr_s8_x_untied: ++** orr z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (orr_s8_x_untied, svint8_t, ++ z0 = svorr_s8_x (p0, z1, z2), ++ z0 = svorr_x (p0, z1, z2)) ++ ++/* ++** orr_w0_s8_x_tied1: ++** mov (z[0-9]+)\.b, w0 ++** orr z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (orr_w0_s8_x_tied1, svint8_t, int8_t, ++ z0 = svorr_n_s8_x (p0, z0, x0), ++ z0 = svorr_x (p0, z0, x0)) ++ ++/* ++** orr_w0_s8_x_untied: ++** mov (z[0-9]+)\.b, w0 ++** orr z0\.d, (z1\.d, \1\.d|\1\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (orr_w0_s8_x_untied, svint8_t, int8_t, ++ z0 = svorr_n_s8_x (p0, z1, x0), ++ z0 = svorr_x (p0, z1, x0)) ++ ++/* ++** orr_1_s8_x_tied1: ++** orr z0\.b, z0\.b, #0x1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_1_s8_x_tied1, svint8_t, ++ z0 = svorr_n_s8_x (p0, z0, 1), ++ z0 = svorr_x (p0, z0, 1)) ++ ++/* ++** orr_1_s8_x_untied: ++** movprfx z0, z1 ++** orr z0\.b, z0\.b, #0x1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_1_s8_x_untied, svint8_t, ++ z0 = svorr_n_s8_x (p0, z1, 1), ++ z0 = svorr_x (p0, z1, 1)) ++ ++/* ++** orr_127_s8_x: ++** orr z0\.b, z0\.b, #0x7f ++** ret ++*/ ++TEST_UNIFORM_Z (orr_127_s8_x, svint8_t, ++ z0 = svorr_n_s8_x (p0, z0, 127), ++ z0 = svorr_x (p0, z0, 127)) ++ ++/* ++** orr_128_s8_x: ++** orr z0\.b, z0\.b, #0x80 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_128_s8_x, svint8_t, ++ z0 = svorr_n_s8_x (p0, z0, 128), ++ z0 = svorr_x (p0, z0, 128)) ++ ++/* ++** orr_255_s8_x: ++** mov z0\.b, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_255_s8_x, svint8_t, ++ z0 = svorr_n_s8_x (p0, z0, 255), ++ z0 = svorr_x (p0, z0, 255)) ++ ++/* ++** orr_m127_s8_x: ++** orr z0\.b, z0\.b, #0x81 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m127_s8_x, svint8_t, ++ z0 = svorr_n_s8_x (p0, z0, -127), ++ z0 = svorr_x (p0, z0, -127)) ++ ++/* ++** orr_m128_s8_x: ++** orr z0\.b, z0\.b, #0x80 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m128_s8_x, svint8_t, ++ z0 = svorr_n_s8_x (p0, z0, -128), ++ z0 = svorr_x (p0, z0, -128)) ++ ++/* ++** orr_5_s8_x: ++** mov (z[0-9]+)\.b, #5 ++** orr z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (orr_5_s8_x, svint8_t, ++ z0 = svorr_n_s8_x (p0, z0, 5), ++ z0 = svorr_x (p0, z0, 5)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_u16.c +new file mode 100644 +index 000000000..000a0444c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_u16.c +@@ -0,0 +1,376 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** orr_u16_m_tied1: ++** orr z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (orr_u16_m_tied1, svuint16_t, ++ z0 = svorr_u16_m (p0, z0, z1), ++ z0 = svorr_m 
(p0, z0, z1)) ++ ++/* ++** orr_u16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** orr z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (orr_u16_m_tied2, svuint16_t, ++ z0 = svorr_u16_m (p0, z1, z0), ++ z0 = svorr_m (p0, z1, z0)) ++ ++/* ++** orr_u16_m_untied: ++** movprfx z0, z1 ++** orr z0\.h, p0/m, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (orr_u16_m_untied, svuint16_t, ++ z0 = svorr_u16_m (p0, z1, z2), ++ z0 = svorr_m (p0, z1, z2)) ++ ++/* ++** orr_w0_u16_m_tied1: ++** mov (z[0-9]+\.h), w0 ++** orr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (orr_w0_u16_m_tied1, svuint16_t, uint16_t, ++ z0 = svorr_n_u16_m (p0, z0, x0), ++ z0 = svorr_m (p0, z0, x0)) ++ ++/* ++** orr_w0_u16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), w0 ++** movprfx z0, z1 ++** orr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (orr_w0_u16_m_untied, svuint16_t, uint16_t, ++ z0 = svorr_n_u16_m (p0, z1, x0), ++ z0 = svorr_m (p0, z1, x0)) ++ ++/* ++** orr_1_u16_m_tied1: ++** mov (z[0-9]+\.h), #1 ++** orr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_1_u16_m_tied1, svuint16_t, ++ z0 = svorr_n_u16_m (p0, z0, 1), ++ z0 = svorr_m (p0, z0, 1)) ++ ++/* ++** orr_1_u16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), #1 ++** movprfx z0, z1 ++** orr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_1_u16_m_untied, svuint16_t, ++ z0 = svorr_n_u16_m (p0, z1, 1), ++ z0 = svorr_m (p0, z1, 1)) ++ ++/* ++** orr_m2_u16_m: ++** mov (z[0-9]+\.h), #-2 ++** orr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m2_u16_m, svuint16_t, ++ z0 = svorr_n_u16_m (p0, z0, -2), ++ z0 = svorr_m (p0, z0, -2)) ++ ++/* ++** orr_u16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** orr z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (orr_u16_z_tied1, svuint16_t, ++ z0 = svorr_u16_z (p0, z0, z1), ++ z0 = svorr_z (p0, z0, z1)) ++ ++/* ++** orr_u16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** orr z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (orr_u16_z_tied2, svuint16_t, ++ z0 = svorr_u16_z (p0, z1, z0), ++ z0 = svorr_z (p0, z1, z0)) ++ ++/* ++** orr_u16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** orr z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** orr z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (orr_u16_z_untied, svuint16_t, ++ z0 = svorr_u16_z (p0, z1, z2), ++ z0 = svorr_z (p0, z1, z2)) ++ ++/* ++** orr_w0_u16_z_tied1: ++** mov (z[0-9]+\.h), w0 ++** movprfx z0\.h, p0/z, z0\.h ++** orr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (orr_w0_u16_z_tied1, svuint16_t, uint16_t, ++ z0 = svorr_n_u16_z (p0, z0, x0), ++ z0 = svorr_z (p0, z0, x0)) ++ ++/* ++** orr_w0_u16_z_untied: ++** mov (z[0-9]+\.h), w0 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** orr z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** orr z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (orr_w0_u16_z_untied, svuint16_t, uint16_t, ++ z0 = svorr_n_u16_z (p0, z1, x0), ++ z0 = svorr_z (p0, z1, x0)) ++ ++/* ++** orr_1_u16_z_tied1: ++** mov (z[0-9]+\.h), #1 ++** movprfx z0\.h, p0/z, z0\.h ++** orr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_1_u16_z_tied1, svuint16_t, ++ z0 = svorr_n_u16_z (p0, z0, 1), ++ z0 = svorr_z (p0, z0, 1)) ++ ++/* ++** orr_1_u16_z_untied: ++** mov (z[0-9]+\.h), #1 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** orr z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** orr z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (orr_1_u16_z_untied, svuint16_t, ++ z0 = 
svorr_n_u16_z (p0, z1, 1), ++ z0 = svorr_z (p0, z1, 1)) ++ ++/* ++** orr_u16_x_tied1: ++** orr z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (orr_u16_x_tied1, svuint16_t, ++ z0 = svorr_u16_x (p0, z0, z1), ++ z0 = svorr_x (p0, z0, z1)) ++ ++/* ++** orr_u16_x_tied2: ++** orr z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (orr_u16_x_tied2, svuint16_t, ++ z0 = svorr_u16_x (p0, z1, z0), ++ z0 = svorr_x (p0, z1, z0)) ++ ++/* ++** orr_u16_x_untied: ++** orr z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (orr_u16_x_untied, svuint16_t, ++ z0 = svorr_u16_x (p0, z1, z2), ++ z0 = svorr_x (p0, z1, z2)) ++ ++/* ++** orr_w0_u16_x_tied1: ++** mov (z[0-9]+)\.h, w0 ++** orr z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (orr_w0_u16_x_tied1, svuint16_t, uint16_t, ++ z0 = svorr_n_u16_x (p0, z0, x0), ++ z0 = svorr_x (p0, z0, x0)) ++ ++/* ++** orr_w0_u16_x_untied: ++** mov (z[0-9]+)\.h, w0 ++** orr z0\.d, (z1\.d, \1\.d|\1\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (orr_w0_u16_x_untied, svuint16_t, uint16_t, ++ z0 = svorr_n_u16_x (p0, z1, x0), ++ z0 = svorr_x (p0, z1, x0)) ++ ++/* ++** orr_1_u16_x_tied1: ++** orr z0\.h, z0\.h, #0x1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_1_u16_x_tied1, svuint16_t, ++ z0 = svorr_n_u16_x (p0, z0, 1), ++ z0 = svorr_x (p0, z0, 1)) ++ ++/* ++** orr_1_u16_x_untied: ++** movprfx z0, z1 ++** orr z0\.h, z0\.h, #0x1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_1_u16_x_untied, svuint16_t, ++ z0 = svorr_n_u16_x (p0, z1, 1), ++ z0 = svorr_x (p0, z1, 1)) ++ ++/* ++** orr_127_u16_x: ++** orr z0\.h, z0\.h, #0x7f ++** ret ++*/ ++TEST_UNIFORM_Z (orr_127_u16_x, svuint16_t, ++ z0 = svorr_n_u16_x (p0, z0, 127), ++ z0 = svorr_x (p0, z0, 127)) ++ ++/* ++** orr_128_u16_x: ++** orr z0\.h, z0\.h, #0x80 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_128_u16_x, svuint16_t, ++ z0 = svorr_n_u16_x (p0, z0, 128), ++ z0 = svorr_x (p0, z0, 128)) ++ ++/* ++** orr_255_u16_x: ++** orr z0\.h, z0\.h, #0xff ++** ret ++*/ ++TEST_UNIFORM_Z (orr_255_u16_x, svuint16_t, ++ z0 = svorr_n_u16_x (p0, z0, 255), ++ z0 = svorr_x (p0, z0, 255)) ++ ++/* ++** orr_256_u16_x: ++** orr z0\.h, z0\.h, #0x100 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_256_u16_x, svuint16_t, ++ z0 = svorr_n_u16_x (p0, z0, 256), ++ z0 = svorr_x (p0, z0, 256)) ++ ++/* ++** orr_257_u16_x: ++** orr z0\.h, z0\.h, #0x101 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_257_u16_x, svuint16_t, ++ z0 = svorr_n_u16_x (p0, z0, 257), ++ z0 = svorr_x (p0, z0, 257)) ++ ++/* ++** orr_512_u16_x: ++** orr z0\.h, z0\.h, #0x200 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_512_u16_x, svuint16_t, ++ z0 = svorr_n_u16_x (p0, z0, 512), ++ z0 = svorr_x (p0, z0, 512)) ++ ++/* ++** orr_65280_u16_x: ++** orr z0\.h, z0\.h, #0xff00 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_65280_u16_x, svuint16_t, ++ z0 = svorr_n_u16_x (p0, z0, 0xff00), ++ z0 = svorr_x (p0, z0, 0xff00)) ++ ++/* ++** orr_m127_u16_x: ++** orr z0\.h, z0\.h, #0xff81 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m127_u16_x, svuint16_t, ++ z0 = svorr_n_u16_x (p0, z0, -127), ++ z0 = svorr_x (p0, z0, -127)) ++ ++/* ++** orr_m128_u16_x: ++** orr z0\.h, z0\.h, #0xff80 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m128_u16_x, svuint16_t, ++ z0 = svorr_n_u16_x (p0, z0, -128), ++ z0 = svorr_x (p0, z0, -128)) ++ ++/* ++** orr_m255_u16_x: ++** orr z0\.h, z0\.h, #0xff01 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m255_u16_x, svuint16_t, ++ z0 = svorr_n_u16_x (p0, z0, -255), ++ z0 = svorr_x (p0, z0, -255)) ++ ++/* ++** orr_m256_u16_x: ++** orr z0\.h, z0\.h, #0xff00 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m256_u16_x, svuint16_t, ++ z0 = 
svorr_n_u16_x (p0, z0, -256), ++ z0 = svorr_x (p0, z0, -256)) ++ ++/* ++** orr_m257_u16_x: ++** orr z0\.h, z0\.h, #0xfeff ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m257_u16_x, svuint16_t, ++ z0 = svorr_n_u16_x (p0, z0, -257), ++ z0 = svorr_x (p0, z0, -257)) ++ ++/* ++** orr_m512_u16_x: ++** orr z0\.h, z0\.h, #0xfe00 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m512_u16_x, svuint16_t, ++ z0 = svorr_n_u16_x (p0, z0, -512), ++ z0 = svorr_x (p0, z0, -512)) ++ ++/* ++** orr_m32768_u16_x: ++** orr z0\.h, z0\.h, #0x8000 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m32768_u16_x, svuint16_t, ++ z0 = svorr_n_u16_x (p0, z0, -0x8000), ++ z0 = svorr_x (p0, z0, -0x8000)) ++ ++/* ++** orr_5_u16_x: ++** mov (z[0-9]+)\.h, #5 ++** orr z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (orr_5_u16_x, svuint16_t, ++ z0 = svorr_n_u16_x (p0, z0, 5), ++ z0 = svorr_x (p0, z0, 5)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_u32.c +new file mode 100644 +index 000000000..8e2351d16 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_u32.c +@@ -0,0 +1,372 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** orr_u32_m_tied1: ++** orr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (orr_u32_m_tied1, svuint32_t, ++ z0 = svorr_u32_m (p0, z0, z1), ++ z0 = svorr_m (p0, z0, z1)) ++ ++/* ++** orr_u32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** orr z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (orr_u32_m_tied2, svuint32_t, ++ z0 = svorr_u32_m (p0, z1, z0), ++ z0 = svorr_m (p0, z1, z0)) ++ ++/* ++** orr_u32_m_untied: ++** movprfx z0, z1 ++** orr z0\.s, p0/m, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (orr_u32_m_untied, svuint32_t, ++ z0 = svorr_u32_m (p0, z1, z2), ++ z0 = svorr_m (p0, z1, z2)) ++ ++/* ++** orr_w0_u32_m_tied1: ++** mov (z[0-9]+\.s), w0 ++** orr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (orr_w0_u32_m_tied1, svuint32_t, uint32_t, ++ z0 = svorr_n_u32_m (p0, z0, x0), ++ z0 = svorr_m (p0, z0, x0)) ++ ++/* ++** orr_w0_u32_m_untied: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0, z1 ++** orr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (orr_w0_u32_m_untied, svuint32_t, uint32_t, ++ z0 = svorr_n_u32_m (p0, z1, x0), ++ z0 = svorr_m (p0, z1, x0)) ++ ++/* ++** orr_1_u32_m_tied1: ++** mov (z[0-9]+\.s), #1 ++** orr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_1_u32_m_tied1, svuint32_t, ++ z0 = svorr_n_u32_m (p0, z0, 1), ++ z0 = svorr_m (p0, z0, 1)) ++ ++/* ++** orr_1_u32_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.s), #1 ++** movprfx z0, z1 ++** orr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_1_u32_m_untied, svuint32_t, ++ z0 = svorr_n_u32_m (p0, z1, 1), ++ z0 = svorr_m (p0, z1, 1)) ++ ++/* ++** orr_m2_u32_m: ++** mov (z[0-9]+\.s), #-2 ++** orr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m2_u32_m, svuint32_t, ++ z0 = svorr_n_u32_m (p0, z0, -2), ++ z0 = svorr_m (p0, z0, -2)) ++ ++/* ++** orr_u32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** orr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (orr_u32_z_tied1, svuint32_t, ++ z0 = svorr_u32_z (p0, z0, z1), ++ z0 = svorr_z (p0, z0, z1)) ++ ++/* ++** orr_u32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** orr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (orr_u32_z_tied2, svuint32_t, ++ z0 = svorr_u32_z (p0, z1, z0), ++ z0 = svorr_z (p0, z1, z0)) ++ ++/* ++** orr_u32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, 
z1\.s ++** orr z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** orr z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (orr_u32_z_untied, svuint32_t, ++ z0 = svorr_u32_z (p0, z1, z2), ++ z0 = svorr_z (p0, z1, z2)) ++ ++/* ++** orr_w0_u32_z_tied1: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0\.s, p0/z, z0\.s ++** orr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (orr_w0_u32_z_tied1, svuint32_t, uint32_t, ++ z0 = svorr_n_u32_z (p0, z0, x0), ++ z0 = svorr_z (p0, z0, x0)) ++ ++/* ++** orr_w0_u32_z_untied: ++** mov (z[0-9]+\.s), w0 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** orr z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** orr z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (orr_w0_u32_z_untied, svuint32_t, uint32_t, ++ z0 = svorr_n_u32_z (p0, z1, x0), ++ z0 = svorr_z (p0, z1, x0)) ++ ++/* ++** orr_1_u32_z_tied1: ++** mov (z[0-9]+\.s), #1 ++** movprfx z0\.s, p0/z, z0\.s ++** orr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_1_u32_z_tied1, svuint32_t, ++ z0 = svorr_n_u32_z (p0, z0, 1), ++ z0 = svorr_z (p0, z0, 1)) ++ ++/* ++** orr_1_u32_z_untied: ++** mov (z[0-9]+\.s), #1 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** orr z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** orr z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (orr_1_u32_z_untied, svuint32_t, ++ z0 = svorr_n_u32_z (p0, z1, 1), ++ z0 = svorr_z (p0, z1, 1)) ++ ++/* ++** orr_u32_x_tied1: ++** orr z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (orr_u32_x_tied1, svuint32_t, ++ z0 = svorr_u32_x (p0, z0, z1), ++ z0 = svorr_x (p0, z0, z1)) ++ ++/* ++** orr_u32_x_tied2: ++** orr z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (orr_u32_x_tied2, svuint32_t, ++ z0 = svorr_u32_x (p0, z1, z0), ++ z0 = svorr_x (p0, z1, z0)) ++ ++/* ++** orr_u32_x_untied: ++** orr z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (orr_u32_x_untied, svuint32_t, ++ z0 = svorr_u32_x (p0, z1, z2), ++ z0 = svorr_x (p0, z1, z2)) ++ ++/* ++** orr_w0_u32_x_tied1: ++** mov (z[0-9]+)\.s, w0 ++** orr z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (orr_w0_u32_x_tied1, svuint32_t, uint32_t, ++ z0 = svorr_n_u32_x (p0, z0, x0), ++ z0 = svorr_x (p0, z0, x0)) ++ ++/* ++** orr_w0_u32_x_untied: ++** mov (z[0-9]+)\.s, w0 ++** orr z0\.d, (z1\.d, \1\.d|\1\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (orr_w0_u32_x_untied, svuint32_t, uint32_t, ++ z0 = svorr_n_u32_x (p0, z1, x0), ++ z0 = svorr_x (p0, z1, x0)) ++ ++/* ++** orr_1_u32_x_tied1: ++** orr z0\.s, z0\.s, #0x1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_1_u32_x_tied1, svuint32_t, ++ z0 = svorr_n_u32_x (p0, z0, 1), ++ z0 = svorr_x (p0, z0, 1)) ++ ++/* ++** orr_1_u32_x_untied: ++** movprfx z0, z1 ++** orr z0\.s, z0\.s, #0x1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_1_u32_x_untied, svuint32_t, ++ z0 = svorr_n_u32_x (p0, z1, 1), ++ z0 = svorr_x (p0, z1, 1)) ++ ++/* ++** orr_127_u32_x: ++** orr z0\.s, z0\.s, #0x7f ++** ret ++*/ ++TEST_UNIFORM_Z (orr_127_u32_x, svuint32_t, ++ z0 = svorr_n_u32_x (p0, z0, 127), ++ z0 = svorr_x (p0, z0, 127)) ++ ++/* ++** orr_128_u32_x: ++** orr z0\.s, z0\.s, #0x80 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_128_u32_x, svuint32_t, ++ z0 = svorr_n_u32_x (p0, z0, 128), ++ z0 = svorr_x (p0, z0, 128)) ++ ++/* ++** orr_255_u32_x: ++** orr z0\.s, z0\.s, #0xff ++** ret ++*/ ++TEST_UNIFORM_Z (orr_255_u32_x, svuint32_t, ++ z0 = svorr_n_u32_x (p0, z0, 255), ++ z0 = svorr_x (p0, z0, 255)) ++ ++/* ++** orr_256_u32_x: ++** orr z0\.s, z0\.s, #0x100 ++** ret 
++*/ ++TEST_UNIFORM_Z (orr_256_u32_x, svuint32_t, ++ z0 = svorr_n_u32_x (p0, z0, 256), ++ z0 = svorr_x (p0, z0, 256)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (orr_257_u32_x, svuint32_t, ++ z0 = svorr_n_u32_x (p0, z0, 257), ++ z0 = svorr_x (p0, z0, 257)) ++ ++/* ++** orr_512_u32_x: ++** orr z0\.s, z0\.s, #0x200 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_512_u32_x, svuint32_t, ++ z0 = svorr_n_u32_x (p0, z0, 512), ++ z0 = svorr_x (p0, z0, 512)) ++ ++/* ++** orr_65280_u32_x: ++** orr z0\.s, z0\.s, #0xff00 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_65280_u32_x, svuint32_t, ++ z0 = svorr_n_u32_x (p0, z0, 0xff00), ++ z0 = svorr_x (p0, z0, 0xff00)) ++ ++/* ++** orr_m127_u32_x: ++** orr z0\.s, z0\.s, #0xffffff81 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m127_u32_x, svuint32_t, ++ z0 = svorr_n_u32_x (p0, z0, -127), ++ z0 = svorr_x (p0, z0, -127)) ++ ++/* ++** orr_m128_u32_x: ++** orr z0\.s, z0\.s, #0xffffff80 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m128_u32_x, svuint32_t, ++ z0 = svorr_n_u32_x (p0, z0, -128), ++ z0 = svorr_x (p0, z0, -128)) ++ ++/* ++** orr_m255_u32_x: ++** orr z0\.s, z0\.s, #0xffffff01 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m255_u32_x, svuint32_t, ++ z0 = svorr_n_u32_x (p0, z0, -255), ++ z0 = svorr_x (p0, z0, -255)) ++ ++/* ++** orr_m256_u32_x: ++** orr z0\.s, z0\.s, #0xffffff00 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m256_u32_x, svuint32_t, ++ z0 = svorr_n_u32_x (p0, z0, -256), ++ z0 = svorr_x (p0, z0, -256)) ++ ++/* ++** orr_m257_u32_x: ++** orr z0\.s, z0\.s, #0xfffffeff ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m257_u32_x, svuint32_t, ++ z0 = svorr_n_u32_x (p0, z0, -257), ++ z0 = svorr_x (p0, z0, -257)) ++ ++/* ++** orr_m512_u32_x: ++** orr z0\.s, z0\.s, #0xfffffe00 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m512_u32_x, svuint32_t, ++ z0 = svorr_n_u32_x (p0, z0, -512), ++ z0 = svorr_x (p0, z0, -512)) ++ ++/* ++** orr_m32768_u32_x: ++** orr z0\.s, z0\.s, #0xffff8000 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m32768_u32_x, svuint32_t, ++ z0 = svorr_n_u32_x (p0, z0, -0x8000), ++ z0 = svorr_x (p0, z0, -0x8000)) ++ ++/* ++** orr_5_u32_x: ++** mov (z[0-9]+)\.s, #5 ++** orr z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (orr_5_u32_x, svuint32_t, ++ z0 = svorr_n_u32_x (p0, z0, 5), ++ z0 = svorr_x (p0, z0, 5)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_u64.c +new file mode 100644 +index 000000000..323e2101e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_u64.c +@@ -0,0 +1,372 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** orr_u64_m_tied1: ++** orr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (orr_u64_m_tied1, svuint64_t, ++ z0 = svorr_u64_m (p0, z0, z1), ++ z0 = svorr_m (p0, z0, z1)) ++ ++/* ++** orr_u64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** orr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_u64_m_tied2, svuint64_t, ++ z0 = svorr_u64_m (p0, z1, z0), ++ z0 = svorr_m (p0, z1, z0)) ++ ++/* ++** orr_u64_m_untied: ++** movprfx z0, z1 ++** orr z0\.d, p0/m, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (orr_u64_m_untied, svuint64_t, ++ z0 = svorr_u64_m (p0, z1, z2), ++ z0 = svorr_m (p0, z1, z2)) ++ ++/* ++** orr_x0_u64_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** orr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (orr_x0_u64_m_tied1, svuint64_t, uint64_t, ++ z0 = svorr_n_u64_m (p0, z0, x0), ++ z0 = svorr_m (p0, z0, x0)) ++ ++/* ++** orr_x0_u64_m_untied: ++** mov (z[0-9]+\.d), x0 ++** 
movprfx z0, z1 ++** orr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (orr_x0_u64_m_untied, svuint64_t, uint64_t, ++ z0 = svorr_n_u64_m (p0, z1, x0), ++ z0 = svorr_m (p0, z1, x0)) ++ ++/* ++** orr_1_u64_m_tied1: ++** mov (z[0-9]+\.d), #1 ++** orr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_1_u64_m_tied1, svuint64_t, ++ z0 = svorr_n_u64_m (p0, z0, 1), ++ z0 = svorr_m (p0, z0, 1)) ++ ++/* ++** orr_1_u64_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #1 ++** movprfx z0, z1 ++** orr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_1_u64_m_untied, svuint64_t, ++ z0 = svorr_n_u64_m (p0, z1, 1), ++ z0 = svorr_m (p0, z1, 1)) ++ ++/* ++** orr_m2_u64_m: ++** mov (z[0-9]+\.d), #-2 ++** orr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m2_u64_m, svuint64_t, ++ z0 = svorr_n_u64_m (p0, z0, -2), ++ z0 = svorr_m (p0, z0, -2)) ++ ++/* ++** orr_u64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** orr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (orr_u64_z_tied1, svuint64_t, ++ z0 = svorr_u64_z (p0, z0, z1), ++ z0 = svorr_z (p0, z0, z1)) ++ ++/* ++** orr_u64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** orr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (orr_u64_z_tied2, svuint64_t, ++ z0 = svorr_u64_z (p0, z1, z0), ++ z0 = svorr_z (p0, z1, z0)) ++ ++/* ++** orr_u64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** orr z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** orr z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (orr_u64_z_untied, svuint64_t, ++ z0 = svorr_u64_z (p0, z1, z2), ++ z0 = svorr_z (p0, z1, z2)) ++ ++/* ++** orr_x0_u64_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z0\.d ++** orr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (orr_x0_u64_z_tied1, svuint64_t, uint64_t, ++ z0 = svorr_n_u64_z (p0, z0, x0), ++ z0 = svorr_z (p0, z0, x0)) ++ ++/* ++** orr_x0_u64_z_untied: ++** mov (z[0-9]+\.d), x0 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** orr z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** orr z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (orr_x0_u64_z_untied, svuint64_t, uint64_t, ++ z0 = svorr_n_u64_z (p0, z1, x0), ++ z0 = svorr_z (p0, z1, x0)) ++ ++/* ++** orr_1_u64_z_tied1: ++** mov (z[0-9]+\.d), #1 ++** movprfx z0\.d, p0/z, z0\.d ++** orr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_1_u64_z_tied1, svuint64_t, ++ z0 = svorr_n_u64_z (p0, z0, 1), ++ z0 = svorr_z (p0, z0, 1)) ++ ++/* ++** orr_1_u64_z_untied: ++** mov (z[0-9]+\.d), #1 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** orr z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** orr z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (orr_1_u64_z_untied, svuint64_t, ++ z0 = svorr_n_u64_z (p0, z1, 1), ++ z0 = svorr_z (p0, z1, 1)) ++ ++/* ++** orr_u64_x_tied1: ++** orr z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (orr_u64_x_tied1, svuint64_t, ++ z0 = svorr_u64_x (p0, z0, z1), ++ z0 = svorr_x (p0, z0, z1)) ++ ++/* ++** orr_u64_x_tied2: ++** orr z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (orr_u64_x_tied2, svuint64_t, ++ z0 = svorr_u64_x (p0, z1, z0), ++ z0 = svorr_x (p0, z1, z0)) ++ ++/* ++** orr_u64_x_untied: ++** orr z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (orr_u64_x_untied, svuint64_t, ++ z0 = svorr_u64_x (p0, z1, z2), ++ z0 = svorr_x (p0, z1, z2)) ++ ++/* ++** orr_x0_u64_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** orr z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX 
(orr_x0_u64_x_tied1, svuint64_t, uint64_t, ++ z0 = svorr_n_u64_x (p0, z0, x0), ++ z0 = svorr_x (p0, z0, x0)) ++ ++/* ++** orr_x0_u64_x_untied: ++** mov (z[0-9]+\.d), x0 ++** orr z0\.d, (z1\.d, \1|\1, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (orr_x0_u64_x_untied, svuint64_t, uint64_t, ++ z0 = svorr_n_u64_x (p0, z1, x0), ++ z0 = svorr_x (p0, z1, x0)) ++ ++/* ++** orr_1_u64_x_tied1: ++** orr z0\.d, z0\.d, #0x1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_1_u64_x_tied1, svuint64_t, ++ z0 = svorr_n_u64_x (p0, z0, 1), ++ z0 = svorr_x (p0, z0, 1)) ++ ++/* ++** orr_1_u64_x_untied: ++** movprfx z0, z1 ++** orr z0\.d, z0\.d, #0x1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_1_u64_x_untied, svuint64_t, ++ z0 = svorr_n_u64_x (p0, z1, 1), ++ z0 = svorr_x (p0, z1, 1)) ++ ++/* ++** orr_127_u64_x: ++** orr z0\.d, z0\.d, #0x7f ++** ret ++*/ ++TEST_UNIFORM_Z (orr_127_u64_x, svuint64_t, ++ z0 = svorr_n_u64_x (p0, z0, 127), ++ z0 = svorr_x (p0, z0, 127)) ++ ++/* ++** orr_128_u64_x: ++** orr z0\.d, z0\.d, #0x80 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_128_u64_x, svuint64_t, ++ z0 = svorr_n_u64_x (p0, z0, 128), ++ z0 = svorr_x (p0, z0, 128)) ++ ++/* ++** orr_255_u64_x: ++** orr z0\.d, z0\.d, #0xff ++** ret ++*/ ++TEST_UNIFORM_Z (orr_255_u64_x, svuint64_t, ++ z0 = svorr_n_u64_x (p0, z0, 255), ++ z0 = svorr_x (p0, z0, 255)) ++ ++/* ++** orr_256_u64_x: ++** orr z0\.d, z0\.d, #0x100 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_256_u64_x, svuint64_t, ++ z0 = svorr_n_u64_x (p0, z0, 256), ++ z0 = svorr_x (p0, z0, 256)) ++ ++/* TODO: Bad code and needs fixing. */ ++TEST_UNIFORM_Z (orr_257_u64_x, svuint64_t, ++ z0 = svorr_n_u64_x (p0, z0, 257), ++ z0 = svorr_x (p0, z0, 257)) ++ ++/* ++** orr_512_u64_x: ++** orr z0\.d, z0\.d, #0x200 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_512_u64_x, svuint64_t, ++ z0 = svorr_n_u64_x (p0, z0, 512), ++ z0 = svorr_x (p0, z0, 512)) ++ ++/* ++** orr_65280_u64_x: ++** orr z0\.d, z0\.d, #0xff00 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_65280_u64_x, svuint64_t, ++ z0 = svorr_n_u64_x (p0, z0, 0xff00), ++ z0 = svorr_x (p0, z0, 0xff00)) ++ ++/* ++** orr_m127_u64_x: ++** orr z0\.d, z0\.d, #0xffffffffffffff81 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m127_u64_x, svuint64_t, ++ z0 = svorr_n_u64_x (p0, z0, -127), ++ z0 = svorr_x (p0, z0, -127)) ++ ++/* ++** orr_m128_u64_x: ++** orr z0\.d, z0\.d, #0xffffffffffffff80 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m128_u64_x, svuint64_t, ++ z0 = svorr_n_u64_x (p0, z0, -128), ++ z0 = svorr_x (p0, z0, -128)) ++ ++/* ++** orr_m255_u64_x: ++** orr z0\.d, z0\.d, #0xffffffffffffff01 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m255_u64_x, svuint64_t, ++ z0 = svorr_n_u64_x (p0, z0, -255), ++ z0 = svorr_x (p0, z0, -255)) ++ ++/* ++** orr_m256_u64_x: ++** orr z0\.d, z0\.d, #0xffffffffffffff00 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m256_u64_x, svuint64_t, ++ z0 = svorr_n_u64_x (p0, z0, -256), ++ z0 = svorr_x (p0, z0, -256)) ++ ++/* ++** orr_m257_u64_x: ++** orr z0\.d, z0\.d, #0xfffffffffffffeff ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m257_u64_x, svuint64_t, ++ z0 = svorr_n_u64_x (p0, z0, -257), ++ z0 = svorr_x (p0, z0, -257)) ++ ++/* ++** orr_m512_u64_x: ++** orr z0\.d, z0\.d, #0xfffffffffffffe00 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m512_u64_x, svuint64_t, ++ z0 = svorr_n_u64_x (p0, z0, -512), ++ z0 = svorr_x (p0, z0, -512)) ++ ++/* ++** orr_m32768_u64_x: ++** orr z0\.d, z0\.d, #0xffffffffffff8000 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m32768_u64_x, svuint64_t, ++ z0 = svorr_n_u64_x (p0, z0, -0x8000), ++ z0 = svorr_x (p0, z0, -0x8000)) ++ ++/* ++** orr_5_u64_x: ++** mov (z[0-9]+\.d), #5 ++** orr z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ 
++TEST_UNIFORM_Z (orr_5_u64_x, svuint64_t, ++ z0 = svorr_n_u64_x (p0, z0, 5), ++ z0 = svorr_x (p0, z0, 5)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_u8.c +new file mode 100644 +index 000000000..efe5591b4 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_u8.c +@@ -0,0 +1,295 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** orr_u8_m_tied1: ++** orr z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (orr_u8_m_tied1, svuint8_t, ++ z0 = svorr_u8_m (p0, z0, z1), ++ z0 = svorr_m (p0, z0, z1)) ++ ++/* ++** orr_u8_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** orr z0\.b, p0/m, z0\.b, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (orr_u8_m_tied2, svuint8_t, ++ z0 = svorr_u8_m (p0, z1, z0), ++ z0 = svorr_m (p0, z1, z0)) ++ ++/* ++** orr_u8_m_untied: ++** movprfx z0, z1 ++** orr z0\.b, p0/m, z0\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (orr_u8_m_untied, svuint8_t, ++ z0 = svorr_u8_m (p0, z1, z2), ++ z0 = svorr_m (p0, z1, z2)) ++ ++/* ++** orr_w0_u8_m_tied1: ++** mov (z[0-9]+\.b), w0 ++** orr z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (orr_w0_u8_m_tied1, svuint8_t, uint8_t, ++ z0 = svorr_n_u8_m (p0, z0, x0), ++ z0 = svorr_m (p0, z0, x0)) ++ ++/* ++** orr_w0_u8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), w0 ++** movprfx z0, z1 ++** orr z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (orr_w0_u8_m_untied, svuint8_t, uint8_t, ++ z0 = svorr_n_u8_m (p0, z1, x0), ++ z0 = svorr_m (p0, z1, x0)) ++ ++/* ++** orr_1_u8_m_tied1: ++** mov (z[0-9]+\.b), #1 ++** orr z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_1_u8_m_tied1, svuint8_t, ++ z0 = svorr_n_u8_m (p0, z0, 1), ++ z0 = svorr_m (p0, z0, 1)) ++ ++/* ++** orr_1_u8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), #1 ++** movprfx z0, z1 ++** orr z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_1_u8_m_untied, svuint8_t, ++ z0 = svorr_n_u8_m (p0, z1, 1), ++ z0 = svorr_m (p0, z1, 1)) ++ ++/* ++** orr_m2_u8_m: ++** mov (z[0-9]+\.b), #-2 ++** orr z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m2_u8_m, svuint8_t, ++ z0 = svorr_n_u8_m (p0, z0, -2), ++ z0 = svorr_m (p0, z0, -2)) ++ ++/* ++** orr_u8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** orr z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (orr_u8_z_tied1, svuint8_t, ++ z0 = svorr_u8_z (p0, z0, z1), ++ z0 = svorr_z (p0, z0, z1)) ++ ++/* ++** orr_u8_z_tied2: ++** movprfx z0\.b, p0/z, z0\.b ++** orr z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (orr_u8_z_tied2, svuint8_t, ++ z0 = svorr_u8_z (p0, z1, z0), ++ z0 = svorr_z (p0, z1, z0)) ++ ++/* ++** orr_u8_z_untied: ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** orr z0\.b, p0/m, z0\.b, z2\.b ++** | ++** movprfx z0\.b, p0/z, z2\.b ++** orr z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (orr_u8_z_untied, svuint8_t, ++ z0 = svorr_u8_z (p0, z1, z2), ++ z0 = svorr_z (p0, z1, z2)) ++ ++/* ++** orr_w0_u8_z_tied1: ++** mov (z[0-9]+\.b), w0 ++** movprfx z0\.b, p0/z, z0\.b ++** orr z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (orr_w0_u8_z_tied1, svuint8_t, uint8_t, ++ z0 = svorr_n_u8_z (p0, z0, x0), ++ z0 = svorr_z (p0, z0, x0)) ++ ++/* ++** orr_w0_u8_z_untied: ++** mov (z[0-9]+\.b), w0 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** orr z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** orr z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (orr_w0_u8_z_untied, svuint8_t, uint8_t, 
++ z0 = svorr_n_u8_z (p0, z1, x0), ++ z0 = svorr_z (p0, z1, x0)) ++ ++/* ++** orr_1_u8_z_tied1: ++** mov (z[0-9]+\.b), #1 ++** movprfx z0\.b, p0/z, z0\.b ++** orr z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_1_u8_z_tied1, svuint8_t, ++ z0 = svorr_n_u8_z (p0, z0, 1), ++ z0 = svorr_z (p0, z0, 1)) ++ ++/* ++** orr_1_u8_z_untied: ++** mov (z[0-9]+\.b), #1 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** orr z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** orr z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (orr_1_u8_z_untied, svuint8_t, ++ z0 = svorr_n_u8_z (p0, z1, 1), ++ z0 = svorr_z (p0, z1, 1)) ++ ++/* ++** orr_u8_x_tied1: ++** orr z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (orr_u8_x_tied1, svuint8_t, ++ z0 = svorr_u8_x (p0, z0, z1), ++ z0 = svorr_x (p0, z0, z1)) ++ ++/* ++** orr_u8_x_tied2: ++** orr z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (orr_u8_x_tied2, svuint8_t, ++ z0 = svorr_u8_x (p0, z1, z0), ++ z0 = svorr_x (p0, z1, z0)) ++ ++/* ++** orr_u8_x_untied: ++** orr z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (orr_u8_x_untied, svuint8_t, ++ z0 = svorr_u8_x (p0, z1, z2), ++ z0 = svorr_x (p0, z1, z2)) ++ ++/* ++** orr_w0_u8_x_tied1: ++** mov (z[0-9]+)\.b, w0 ++** orr z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (orr_w0_u8_x_tied1, svuint8_t, uint8_t, ++ z0 = svorr_n_u8_x (p0, z0, x0), ++ z0 = svorr_x (p0, z0, x0)) ++ ++/* ++** orr_w0_u8_x_untied: ++** mov (z[0-9]+)\.b, w0 ++** orr z0\.d, (z1\.d, \1\.d|\1\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (orr_w0_u8_x_untied, svuint8_t, uint8_t, ++ z0 = svorr_n_u8_x (p0, z1, x0), ++ z0 = svorr_x (p0, z1, x0)) ++ ++/* ++** orr_1_u8_x_tied1: ++** orr z0\.b, z0\.b, #0x1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_1_u8_x_tied1, svuint8_t, ++ z0 = svorr_n_u8_x (p0, z0, 1), ++ z0 = svorr_x (p0, z0, 1)) ++ ++/* ++** orr_1_u8_x_untied: ++** movprfx z0, z1 ++** orr z0\.b, z0\.b, #0x1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_1_u8_x_untied, svuint8_t, ++ z0 = svorr_n_u8_x (p0, z1, 1), ++ z0 = svorr_x (p0, z1, 1)) ++ ++/* ++** orr_127_u8_x: ++** orr z0\.b, z0\.b, #0x7f ++** ret ++*/ ++TEST_UNIFORM_Z (orr_127_u8_x, svuint8_t, ++ z0 = svorr_n_u8_x (p0, z0, 127), ++ z0 = svorr_x (p0, z0, 127)) ++ ++/* ++** orr_128_u8_x: ++** orr z0\.b, z0\.b, #0x80 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_128_u8_x, svuint8_t, ++ z0 = svorr_n_u8_x (p0, z0, 128), ++ z0 = svorr_x (p0, z0, 128)) ++ ++/* ++** orr_255_u8_x: ++** mov z0\.b, #-1 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_255_u8_x, svuint8_t, ++ z0 = svorr_n_u8_x (p0, z0, 255), ++ z0 = svorr_x (p0, z0, 255)) ++ ++/* ++** orr_m127_u8_x: ++** orr z0\.b, z0\.b, #0x81 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m127_u8_x, svuint8_t, ++ z0 = svorr_n_u8_x (p0, z0, -127), ++ z0 = svorr_x (p0, z0, -127)) ++ ++/* ++** orr_m128_u8_x: ++** orr z0\.b, z0\.b, #0x80 ++** ret ++*/ ++TEST_UNIFORM_Z (orr_m128_u8_x, svuint8_t, ++ z0 = svorr_n_u8_x (p0, z0, -128), ++ z0 = svorr_x (p0, z0, -128)) ++ ++/* ++** orr_5_u8_x: ++** mov (z[0-9]+)\.b, #5 ++** orr z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (orr_5_u8_x, svuint8_t, ++ z0 = svorr_n_u8_x (p0, z0, 5), ++ z0 = svorr_x (p0, z0, 5)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orv_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orv_s16.c +new file mode 100644 +index 000000000..c9b268d3d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orv_s16.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ 
++#include "test_sve_acle.h" ++ ++/* ++** orv_x0_s16: ++** orv h([0-9]+), p0, z0\.h ++** umov w0, v\1\.h\[0\] ++** ret ++*/ ++TEST_REDUCTION_X (orv_x0_s16, int16_t, svint16_t, ++ x0 = svorv_s16 (p0, z0), ++ x0 = svorv (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orv_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orv_s32.c +new file mode 100644 +index 000000000..df4025f54 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orv_s32.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** orv_x0_s32: ++** orv (s[0-9]+), p0, z0\.s ++** fmov w0, \1 ++** ret ++*/ ++TEST_REDUCTION_X (orv_x0_s32, int32_t, svint32_t, ++ x0 = svorv_s32 (p0, z0), ++ x0 = svorv (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orv_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orv_s64.c +new file mode 100644 +index 000000000..76a835ce3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orv_s64.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** orv_x0_s64: ++** orv (d[0-9]+), p0, z0\.d ++** fmov x0, \1 ++** ret ++*/ ++TEST_REDUCTION_X (orv_x0_s64, int64_t, svint64_t, ++ x0 = svorv_s64 (p0, z0), ++ x0 = svorv (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orv_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orv_s8.c +new file mode 100644 +index 000000000..3f2031d9c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orv_s8.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** orv_x0_s8: ++** orv b([0-9]+), p0, z0\.b ++** umov w0, v\1\.b\[0\] ++** ret ++*/ ++TEST_REDUCTION_X (orv_x0_s8, int8_t, svint8_t, ++ x0 = svorv_s8 (p0, z0), ++ x0 = svorv (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orv_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orv_u16.c +new file mode 100644 +index 000000000..28bfbecb0 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orv_u16.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** orv_x0_u16: ++** orv h([0-9]+), p0, z0\.h ++** umov w0, v\1\.h\[0\] ++** ret ++*/ ++TEST_REDUCTION_X (orv_x0_u16, uint16_t, svuint16_t, ++ x0 = svorv_u16 (p0, z0), ++ x0 = svorv (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orv_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orv_u32.c +new file mode 100644 +index 000000000..1988d5623 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orv_u32.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** orv_x0_u32: ++** orv (s[0-9]+), p0, z0\.s ++** fmov w0, \1 ++** ret ++*/ ++TEST_REDUCTION_X (orv_x0_u32, uint32_t, svuint32_t, ++ x0 = svorv_u32 (p0, z0), ++ x0 = svorv (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orv_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orv_u64.c +new file mode 100644 +index 000000000..c8a8429a7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orv_u64.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** orv_x0_u64: ++** orv (d[0-9]+), p0, z0\.d ++** fmov x0, \1 ++** ret ++*/ ++TEST_REDUCTION_X 
(orv_x0_u64, uint64_t, svuint64_t, ++ x0 = svorv_u64 (p0, z0), ++ x0 = svorv (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orv_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orv_u8.c +new file mode 100644 +index 000000000..bcab32d8b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orv_u8.c +@@ -0,0 +1,13 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** orv_x0_u8: ++** orv b([0-9]+), p0, z0\.b ++** umov w0, v\1\.b\[0\] ++** ret ++*/ ++TEST_REDUCTION_X (orv_x0_u8, uint8_t, svuint8_t, ++ x0 = svorv_u8 (p0, z0), ++ x0 = svorv (p0, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/pfalse.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/pfalse.c +new file mode 100644 +index 000000000..a74a59283 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/pfalse.c +@@ -0,0 +1,13 @@ ++/* { dg-additional-options "-msve-vector-bits=scalable" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** pfalse_b: ++** pfalse p0\.b ++** ret ++*/ ++TEST_P (pfalse_b, ++ p0 = svpfalse_b (), ++ p0 = svpfalse ()); +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/pfirst_b.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/pfirst_b.c +new file mode 100644 +index 000000000..a32099656 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/pfirst_b.c +@@ -0,0 +1,22 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** pfirst_b_tied1: ++** pfirst p0\.b, p3, p0\.b ++** ret ++*/ ++TEST_UNIFORM_P (pfirst_b_tied1, ++ p0 = svpfirst_b (p3, p0), ++ p0 = svpfirst (p3, p0)) ++ ++/* ++** pfirst_b_untied: ++** mov p0\.b, p1\.b ++** pfirst p0\.b, p3, p0\.b ++** ret ++*/ ++TEST_UNIFORM_P (pfirst_b_untied, ++ p0 = svpfirst_b (p3, p1), ++ p0 = svpfirst (p3, p1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/pnext_b16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/pnext_b16.c +new file mode 100644 +index 000000000..ad0efe5e7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/pnext_b16.c +@@ -0,0 +1,22 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** pnext_b16_tied1: ++** pnext p0\.h, p3, p0\.h ++** ret ++*/ ++TEST_UNIFORM_P (pnext_b16_tied1, ++ p0 = svpnext_b16 (p3, p0), ++ p0 = svpnext_b16 (p3, p0)) ++ ++/* ++** pnext_b16_untied: ++** mov p0\.b, p1\.b ++** pnext p0\.h, p3, p0\.h ++** ret ++*/ ++TEST_UNIFORM_P (pnext_b16_untied, ++ p0 = svpnext_b16 (p3, p1), ++ p0 = svpnext_b16 (p3, p1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/pnext_b32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/pnext_b32.c +new file mode 100644 +index 000000000..a0030fae1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/pnext_b32.c +@@ -0,0 +1,22 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** pnext_b32_tied1: ++** pnext p0\.s, p3, p0\.s ++** ret ++*/ ++TEST_UNIFORM_P (pnext_b32_tied1, ++ p0 = svpnext_b32 (p3, p0), ++ p0 = svpnext_b32 (p3, p0)) ++ ++/* ++** pnext_b32_untied: ++** mov p0\.b, p1\.b ++** pnext p0\.s, p3, p0\.s ++** ret ++*/ ++TEST_UNIFORM_P (pnext_b32_untied, ++ p0 = svpnext_b32 (p3, p1), ++ p0 = svpnext_b32 (p3, p1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/pnext_b64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/pnext_b64.c +new 
file mode 100644 +index 000000000..59db2f04f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/pnext_b64.c +@@ -0,0 +1,22 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** pnext_b64_tied1: ++** pnext p0\.d, p3, p0\.d ++** ret ++*/ ++TEST_UNIFORM_P (pnext_b64_tied1, ++ p0 = svpnext_b64 (p3, p0), ++ p0 = svpnext_b64 (p3, p0)) ++ ++/* ++** pnext_b64_untied: ++** mov p0\.b, p1\.b ++** pnext p0\.d, p3, p0\.d ++** ret ++*/ ++TEST_UNIFORM_P (pnext_b64_untied, ++ p0 = svpnext_b64 (p3, p1), ++ p0 = svpnext_b64 (p3, p1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/pnext_b8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/pnext_b8.c +new file mode 100644 +index 000000000..cfc2e907c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/pnext_b8.c +@@ -0,0 +1,22 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** pnext_b8_tied1: ++** pnext p0\.b, p3, p0\.b ++** ret ++*/ ++TEST_UNIFORM_P (pnext_b8_tied1, ++ p0 = svpnext_b8 (p3, p0), ++ p0 = svpnext_b8 (p3, p0)) ++ ++/* ++** pnext_b8_untied: ++** mov p0\.b, p1\.b ++** pnext p0\.b, p3, p0\.b ++** ret ++*/ ++TEST_UNIFORM_P (pnext_b8_untied, ++ p0 = svpnext_b8 (p3, p1), ++ p0 = svpnext_b8 (p3, p1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfb.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfb.c +new file mode 100644 +index 000000000..d2b2777e6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfb.c +@@ -0,0 +1,245 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** prfb_base: ++** prfb pldl1keep, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfb_base, uint8_t, ++ svprfb (p0, x0, SV_PLDL1KEEP), ++ svprfb (p0, x0, SV_PLDL1KEEP)) ++ ++/* ++** prfb_u8_index: ++** prfb pldl1keep, p0, \[x0, x1\] ++** ret ++*/ ++TEST_PREFETCH (prfb_u8_index, uint8_t, ++ svprfb (p0, x0 + x1, SV_PLDL1KEEP), ++ svprfb (p0, x0 + x1, SV_PLDL1KEEP)) ++ ++/* ++** prfb_u8_1: ++** add (x[0-9+]), x0, #?1 ++** prfb pldl1keep, p0, \[\1\] ++** ret ++*/ ++TEST_PREFETCH (prfb_u8_1, uint8_t, ++ svprfb (p0, x0 + 1, SV_PLDL1KEEP), ++ svprfb (p0, x0 + 1, SV_PLDL1KEEP)) ++ ++/* ++** prfb_u16_index: ++** add (x[0-9+]), x0, x1, lsl #?1 ++** prfb pldl1keep, p0, \[\1\] ++** ret ++*/ ++TEST_PREFETCH (prfb_u16_index, uint16_t, ++ svprfb (p0, x0 + x1, SV_PLDL1KEEP), ++ svprfb (p0, x0 + x1, SV_PLDL1KEEP)) ++ ++/* ++** prfb_u16_1: ++** add (x[0-9+]), x0, #?2 ++** prfb pldl1keep, p0, \[\1\] ++** ret ++*/ ++TEST_PREFETCH (prfb_u16_1, uint16_t, ++ svprfb (p0, x0 + 1, SV_PLDL1KEEP), ++ svprfb (p0, x0 + 1, SV_PLDL1KEEP)) ++ ++/* ++** prfb_u32_index: ++** add (x[0-9+]), x0, x1, lsl #?2 ++** prfb pldl1keep, p0, \[\1\] ++** ret ++*/ ++TEST_PREFETCH (prfb_u32_index, uint32_t, ++ svprfb (p0, x0 + x1, SV_PLDL1KEEP), ++ svprfb (p0, x0 + x1, SV_PLDL1KEEP)) ++ ++/* ++** prfb_u32_1: ++** add (x[0-9+]), x0, #?4 ++** prfb pldl1keep, p0, \[\1\] ++** ret ++*/ ++TEST_PREFETCH (prfb_u32_1, uint32_t, ++ svprfb (p0, x0 + 1, SV_PLDL1KEEP), ++ svprfb (p0, x0 + 1, SV_PLDL1KEEP)) ++ ++/* ++** prfb_u64_index: ++** add (x[0-9+]), x0, x1, lsl #?3 ++** prfb pldl1keep, p0, \[\1\] ++** ret ++*/ ++TEST_PREFETCH (prfb_u64_index, uint64_t, ++ svprfb (p0, x0 + x1, SV_PLDL1KEEP), ++ svprfb (p0, x0 + x1, SV_PLDL1KEEP)) ++ ++/* ++** prfb_u64_1: ++** add (x[0-9+]), x0, #?8 ++** prfb pldl1keep, p0, \[\1\] ++** ret ++*/ ++TEST_PREFETCH (prfb_u64_1, 
uint64_t, ++ svprfb (p0, x0 + 1, SV_PLDL1KEEP), ++ svprfb (p0, x0 + 1, SV_PLDL1KEEP)) ++ ++/* ++** prfb_pldl1strm: ++** prfb pldl1strm, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfb_pldl1strm, uint8_t, ++ svprfb (p0, x0, SV_PLDL1STRM), ++ svprfb (p0, x0, SV_PLDL1STRM)) ++ ++/* ++** prfb_pldl2keep: ++** prfb pldl2keep, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfb_pldl2keep, uint8_t, ++ svprfb (p0, x0, SV_PLDL2KEEP), ++ svprfb (p0, x0, SV_PLDL2KEEP)) ++ ++/* ++** prfb_pldl2strm: ++** prfb pldl2strm, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfb_pldl2strm, uint8_t, ++ svprfb (p0, x0, SV_PLDL2STRM), ++ svprfb (p0, x0, SV_PLDL2STRM)) ++ ++/* ++** prfb_pldl3keep: ++** prfb pldl3keep, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfb_pldl3keep, uint8_t, ++ svprfb (p0, x0, SV_PLDL3KEEP), ++ svprfb (p0, x0, SV_PLDL3KEEP)) ++ ++/* ++** prfb_pldl3strm: ++** prfb pldl3strm, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfb_pldl3strm, uint8_t, ++ svprfb (p0, x0, SV_PLDL3STRM), ++ svprfb (p0, x0, SV_PLDL3STRM)) ++ ++/* ++** prfb_pstl1keep: ++** prfb pstl1keep, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfb_pstl1keep, uint8_t, ++ svprfb (p0, x0, SV_PSTL1KEEP), ++ svprfb (p0, x0, SV_PSTL1KEEP)) ++ ++/* ++** prfb_pstl1strm: ++** prfb pstl1strm, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfb_pstl1strm, uint8_t, ++ svprfb (p0, x0, SV_PSTL1STRM), ++ svprfb (p0, x0, SV_PSTL1STRM)) ++ ++/* ++** prfb_pstl2keep: ++** prfb pstl2keep, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfb_pstl2keep, uint8_t, ++ svprfb (p0, x0, SV_PSTL2KEEP), ++ svprfb (p0, x0, SV_PSTL2KEEP)) ++ ++/* ++** prfb_pstl2strm: ++** prfb pstl2strm, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfb_pstl2strm, uint8_t, ++ svprfb (p0, x0, SV_PSTL2STRM), ++ svprfb (p0, x0, SV_PSTL2STRM)) ++ ++/* ++** prfb_pstl3keep: ++** prfb pstl3keep, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfb_pstl3keep, uint8_t, ++ svprfb (p0, x0, SV_PSTL3KEEP), ++ svprfb (p0, x0, SV_PSTL3KEEP)) ++ ++/* ++** prfb_pstl3strm: ++** prfb pstl3strm, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfb_pstl3strm, uint8_t, ++ svprfb (p0, x0, SV_PSTL3STRM), ++ svprfb (p0, x0, SV_PSTL3STRM)) ++ ++/* ++** prfb_vnum_0: ++** prfb pldl1keep, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfb_vnum_0, uint8_t, ++ svprfb_vnum (p0, x0, 0, SV_PLDL1KEEP), ++ svprfb_vnum (p0, x0, 0, SV_PLDL1KEEP)) ++ ++/* ++** prfb_vnum_1: ++** incb x0 ++** prfb pldl1keep, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfb_vnum_1, uint16_t, ++ svprfb_vnum (p0, x0, 1, SV_PLDL1KEEP), ++ svprfb_vnum (p0, x0, 1, SV_PLDL1KEEP)) ++ ++/* ++** prfb_vnum_2: ++** incb x0, all, mul #2 ++** prfb pldl1keep, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfb_vnum_2, uint32_t, ++ svprfb_vnum (p0, x0, 2, SV_PLDL1KEEP), ++ svprfb_vnum (p0, x0, 2, SV_PLDL1KEEP)) ++ ++/* ++** prfb_vnum_3: ++** incb x0, all, mul #3 ++** prfb pldl1keep, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfb_vnum_3, uint64_t, ++ svprfb_vnum (p0, x0, 3, SV_PLDL1KEEP), ++ svprfb_vnum (p0, x0, 3, SV_PLDL1KEEP)) ++ ++/* ++** prfb_vnum_x1: ++** cntb (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** prfb pldl1keep, p0, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** prfb zldl1keep, p0, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_PREFETCH (prfb_vnum_x1, uint64_t, ++ svprfb_vnum (p0, x0, x1, SV_PLDL1KEEP), ++ svprfb_vnum (p0, x0, x1, SV_PLDL1KEEP)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfb_gather.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfb_gather.c +new file mode 100644 +index 000000000..c4bfbbbf7 +--- /dev/null ++++ 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfb_gather.c +@@ -0,0 +1,223 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** prfb_gather_u32base: ++** prfb pldl1keep, p0, \[z0\.s\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfb_gather_u32base, svuint32_t, ++ svprfb_gather_u32base (p0, z0, SV_PLDL1KEEP), ++ svprfb_gather (p0, z0, SV_PLDL1KEEP)) ++ ++/* ++** prfb_gather_u64base: ++** prfb pldl1strm, p0, \[z0\.d\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfb_gather_u64base, svuint64_t, ++ svprfb_gather_u64base (p0, z0, SV_PLDL1STRM), ++ svprfb_gather (p0, z0, SV_PLDL1STRM)) ++ ++/* ++** prfb_gather_x0_u32base_offset: ++** prfb pldl2keep, p0, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfb_gather_x0_u32base_offset, svuint32_t, ++ svprfb_gather_u32base_offset (p0, z0, x0, SV_PLDL2KEEP), ++ svprfb_gather_offset (p0, z0, x0, SV_PLDL2KEEP)) ++ ++/* ++** prfb_gather_m1_u32base_offset: ++** mov (x[0-9]+), #?-1 ++** prfb pldl2strm, p0, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfb_gather_m1_u32base_offset, svuint32_t, ++ svprfb_gather_u32base_offset (p0, z0, -1, SV_PLDL2STRM), ++ svprfb_gather_offset (p0, z0, -1, SV_PLDL2STRM)) ++ ++/* ++** prfb_gather_0_u32base_offset: ++** prfb pldl3keep, p0, \[z0\.s\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfb_gather_0_u32base_offset, svuint32_t, ++ svprfb_gather_u32base_offset (p0, z0, 0, SV_PLDL3KEEP), ++ svprfb_gather_offset (p0, z0, 0, SV_PLDL3KEEP)) ++ ++/* ++** prfb_gather_5_u32base_offset: ++** prfb pldl3strm, p0, \[z0\.s, #5\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfb_gather_5_u32base_offset, svuint32_t, ++ svprfb_gather_u32base_offset (p0, z0, 5, SV_PLDL3STRM), ++ svprfb_gather_offset (p0, z0, 5, SV_PLDL3STRM)) ++ ++/* ++** prfb_gather_31_u32base_offset: ++** prfb pstl1keep, p0, \[z0\.s, #31\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfb_gather_31_u32base_offset, svuint32_t, ++ svprfb_gather_u32base_offset (p0, z0, 31, SV_PSTL1KEEP), ++ svprfb_gather_offset (p0, z0, 31, SV_PSTL1KEEP)) ++ ++/* ++** prfb_gather_32_u32base_offset: ++** mov (x[0-9]+), #?32 ++** prfb pstl1strm, p0, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfb_gather_32_u32base_offset, svuint32_t, ++ svprfb_gather_u32base_offset (p0, z0, 32, SV_PSTL1STRM), ++ svprfb_gather_offset (p0, z0, 32, SV_PSTL1STRM)) ++ ++/* ++** prfb_gather_x0_u64base_offset: ++** prfb pstl2keep, p0, \[x0, z0\.d\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfb_gather_x0_u64base_offset, svuint64_t, ++ svprfb_gather_u64base_offset (p0, z0, x0, SV_PSTL2KEEP), ++ svprfb_gather_offset (p0, z0, x0, SV_PSTL2KEEP)) ++ ++/* ++** prfb_gather_m1_u64base_offset: ++** mov (x[0-9]+), #?-1 ++** prfb pstl2strm, p0, \[\1, z0\.d\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfb_gather_m1_u64base_offset, svuint64_t, ++ svprfb_gather_u64base_offset (p0, z0, -1, SV_PSTL2STRM), ++ svprfb_gather_offset (p0, z0, -1, SV_PSTL2STRM)) ++ ++/* ++** prfb_gather_0_u64base_offset: ++** prfb pstl3keep, p0, \[z0\.d\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfb_gather_0_u64base_offset, svuint64_t, ++ svprfb_gather_u64base_offset (p0, z0, 0, SV_PSTL3KEEP), ++ svprfb_gather_offset (p0, z0, 0, SV_PSTL3KEEP)) ++ ++/* ++** prfb_gather_5_u64base_offset: ++** prfb pstl3strm, p0, \[z0\.d, #5\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfb_gather_5_u64base_offset, svuint64_t, ++ svprfb_gather_u64base_offset (p0, z0, 5, SV_PSTL3STRM), ++ svprfb_gather_offset (p0, z0, 5, SV_PSTL3STRM)) ++ ++/* ++** 
prfb_gather_31_u64base_offset: ++** prfb pldl1keep, p0, \[z0\.d, #31\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfb_gather_31_u64base_offset, svuint64_t, ++ svprfb_gather_u64base_offset (p0, z0, 31, SV_PLDL1KEEP), ++ svprfb_gather_offset (p0, z0, 31, SV_PLDL1KEEP)) ++ ++/* ++** prfb_gather_32_u64base_offset: ++** mov (x[0-9]+), #?32 ++** prfb pldl1strm, p0, \[\1, z0\.d\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfb_gather_32_u64base_offset, svuint64_t, ++ svprfb_gather_u64base_offset (p0, z0, 32, SV_PLDL1STRM), ++ svprfb_gather_offset (p0, z0, 32, SV_PLDL1STRM)) ++ ++/* ++** prfb_gather_x0_s32offset: ++** prfb pldl2keep, p0, \[x0, z0\.s, sxtw\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_SZ (prfb_gather_x0_s32offset, svint32_t, ++ svprfb_gather_s32offset (p0, x0, z0, SV_PLDL2KEEP), ++ svprfb_gather_offset (p0, x0, z0, SV_PLDL2KEEP)) ++ ++/* ++** prfb_gather_s32offset: ++** prfb pldl2strm, p0, \[x0, z1\.s, sxtw\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_SZ (prfb_gather_s32offset, svint32_t, ++ svprfb_gather_s32offset (p0, x0, z1, SV_PLDL2STRM), ++ svprfb_gather_offset (p0, x0, z1, SV_PLDL2STRM)) ++ ++/* ++** prfb_gather_x0_u32offset: ++** prfb pldl3keep, p0, \[x0, z0\.s, uxtw\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_SZ (prfb_gather_x0_u32offset, svuint32_t, ++ svprfb_gather_u32offset (p0, x0, z0, SV_PLDL3KEEP), ++ svprfb_gather_offset (p0, x0, z0, SV_PLDL3KEEP)) ++ ++/* ++** prfb_gather_u32offset: ++** prfb pldl3strm, p0, \[x0, z1\.s, uxtw\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_SZ (prfb_gather_u32offset, svuint32_t, ++ svprfb_gather_u32offset (p0, x0, z1, SV_PLDL3STRM), ++ svprfb_gather_offset (p0, x0, z1, SV_PLDL3STRM)) ++ ++/* ++** prfb_gather_x0_s64offset: ++** prfb pstl1keep, p0, \[x0, z0\.d\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_SZ (prfb_gather_x0_s64offset, svint64_t, ++ svprfb_gather_s64offset (p0, x0, z0, SV_PSTL1KEEP), ++ svprfb_gather_offset (p0, x0, z0, SV_PSTL1KEEP)) ++ ++/* ++** prfb_gather_s64offset: ++** prfb pstl1strm, p0, \[x0, z1\.d\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_SZ (prfb_gather_s64offset, svint64_t, ++ svprfb_gather_s64offset (p0, x0, z1, SV_PSTL1STRM), ++ svprfb_gather_offset (p0, x0, z1, SV_PSTL1STRM)) ++ ++/* ++** prfb_gather_ext_s64offset: ++** prfb pstl1strm, p0, \[x0, z1\.d, sxtw\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_SZ (prfb_gather_ext_s64offset, svint64_t, ++ svprfb_gather_s64offset (p0, x0, svextw_s64_x (p0, z1), SV_PSTL1STRM), ++ svprfb_gather_offset (p0, x0, svextw_x (p0, z1), SV_PSTL1STRM)) ++ ++/* ++** prfb_gather_x0_u64offset: ++** prfb pstl2keep, p0, \[x0, z0\.d\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_SZ (prfb_gather_x0_u64offset, svuint64_t, ++ svprfb_gather_u64offset (p0, x0, z0, SV_PSTL2KEEP), ++ svprfb_gather_offset (p0, x0, z0, SV_PSTL2KEEP)) ++ ++/* ++** prfb_gather_u64offset: ++** prfb pstl2strm, p0, \[x0, z1\.d\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_SZ (prfb_gather_u64offset, svuint64_t, ++ svprfb_gather_u64offset (p0, x0, z1, SV_PSTL2STRM), ++ svprfb_gather_offset (p0, x0, z1, SV_PSTL2STRM)) ++ ++/* ++** prfb_gather_ext_u64offset: ++** prfb pstl2strm, p0, \[x0, z1\.d, uxtw\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_SZ (prfb_gather_ext_u64offset, svuint64_t, ++ svprfb_gather_u64offset (p0, x0, svextw_u64_x (p0, z1), SV_PSTL2STRM), ++ svprfb_gather_offset (p0, x0, svextw_x (p0, z1), SV_PSTL2STRM)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfd.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfd.c +new file mode 100644 +index 000000000..72b2e6415 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfd.c +@@ -0,0 +1,245 
@@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** prfd_base: ++** prfd pldl1keep, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfd_base, uint8_t, ++ svprfd (p0, x0, SV_PLDL1KEEP), ++ svprfd (p0, x0, SV_PLDL1KEEP)) ++ ++/* ++** prfd_u8_index: ++** add (x[0-9+]), (x0, x1|x1, x0) ++** prfd pldl1keep, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfd_u8_index, uint8_t, ++ svprfd (p0, x0 + x1, SV_PLDL1KEEP), ++ svprfd (p0, x0 + x1, SV_PLDL1KEEP)) ++ ++/* ++** prfd_u8_1: ++** add (x[0-9+]), x0, #?1 ++** prfd pldl1keep, p0, \[\1\] ++** ret ++*/ ++TEST_PREFETCH (prfd_u8_1, uint8_t, ++ svprfd (p0, x0 + 1, SV_PLDL1KEEP), ++ svprfd (p0, x0 + 1, SV_PLDL1KEEP)) ++ ++/* ++** prfd_u16_index: ++** add (x[0-9+]), x0, x1, lsl #?1 ++** prfd pldl1keep, p0, \[\1\] ++** ret ++*/ ++TEST_PREFETCH (prfd_u16_index, uint16_t, ++ svprfd (p0, x0 + x1, SV_PLDL1KEEP), ++ svprfd (p0, x0 + x1, SV_PLDL1KEEP)) ++ ++/* ++** prfd_u16_1: ++** add (x[0-9+]), x0, #?2 ++** prfd pldl1keep, p0, \[\1\] ++** ret ++*/ ++TEST_PREFETCH (prfd_u16_1, uint16_t, ++ svprfd (p0, x0 + 1, SV_PLDL1KEEP), ++ svprfd (p0, x0 + 1, SV_PLDL1KEEP)) ++ ++/* ++** prfd_u32_index: ++** add (x[0-9+]), x0, x1, lsl #?2 ++** prfd pldl1keep, p0, \[\1\] ++** ret ++*/ ++TEST_PREFETCH (prfd_u32_index, uint32_t, ++ svprfd (p0, x0 + x1, SV_PLDL1KEEP), ++ svprfd (p0, x0 + x1, SV_PLDL1KEEP)) ++ ++/* ++** prfd_u32_1: ++** add (x[0-9+]), x0, #?4 ++** prfd pldl1keep, p0, \[\1\] ++** ret ++*/ ++TEST_PREFETCH (prfd_u32_1, uint32_t, ++ svprfd (p0, x0 + 1, SV_PLDL1KEEP), ++ svprfd (p0, x0 + 1, SV_PLDL1KEEP)) ++ ++/* ++** prfd_u64_index: ++** prfd pldl1keep, p0, \[x0, x1, lsl #?3\] ++** ret ++*/ ++TEST_PREFETCH (prfd_u64_index, uint64_t, ++ svprfd (p0, x0 + x1, SV_PLDL1KEEP), ++ svprfd (p0, x0 + x1, SV_PLDL1KEEP)) ++ ++/* ++** prfd_u64_1: ++** add (x[0-9+]), x0, #?8 ++** prfd pldl1keep, p0, \[\1\] ++** ret ++*/ ++TEST_PREFETCH (prfd_u64_1, uint64_t, ++ svprfd (p0, x0 + 1, SV_PLDL1KEEP), ++ svprfd (p0, x0 + 1, SV_PLDL1KEEP)) ++ ++/* ++** prfd_pldl1strm: ++** prfd pldl1strm, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfd_pldl1strm, uint8_t, ++ svprfd (p0, x0, SV_PLDL1STRM), ++ svprfd (p0, x0, SV_PLDL1STRM)) ++ ++/* ++** prfd_pldl2keep: ++** prfd pldl2keep, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfd_pldl2keep, uint8_t, ++ svprfd (p0, x0, SV_PLDL2KEEP), ++ svprfd (p0, x0, SV_PLDL2KEEP)) ++ ++/* ++** prfd_pldl2strm: ++** prfd pldl2strm, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfd_pldl2strm, uint8_t, ++ svprfd (p0, x0, SV_PLDL2STRM), ++ svprfd (p0, x0, SV_PLDL2STRM)) ++ ++/* ++** prfd_pldl3keep: ++** prfd pldl3keep, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfd_pldl3keep, uint8_t, ++ svprfd (p0, x0, SV_PLDL3KEEP), ++ svprfd (p0, x0, SV_PLDL3KEEP)) ++ ++/* ++** prfd_pldl3strm: ++** prfd pldl3strm, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfd_pldl3strm, uint8_t, ++ svprfd (p0, x0, SV_PLDL3STRM), ++ svprfd (p0, x0, SV_PLDL3STRM)) ++ ++/* ++** prfd_pstl1keep: ++** prfd pstl1keep, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfd_pstl1keep, uint8_t, ++ svprfd (p0, x0, SV_PSTL1KEEP), ++ svprfd (p0, x0, SV_PSTL1KEEP)) ++ ++/* ++** prfd_pstl1strm: ++** prfd pstl1strm, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfd_pstl1strm, uint8_t, ++ svprfd (p0, x0, SV_PSTL1STRM), ++ svprfd (p0, x0, SV_PSTL1STRM)) ++ ++/* ++** prfd_pstl2keep: ++** prfd pstl2keep, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfd_pstl2keep, uint8_t, ++ svprfd (p0, x0, SV_PSTL2KEEP), ++ svprfd (p0, x0, SV_PSTL2KEEP)) ++ ++/* ++** 
prfd_pstl2strm: ++** prfd pstl2strm, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfd_pstl2strm, uint8_t, ++ svprfd (p0, x0, SV_PSTL2STRM), ++ svprfd (p0, x0, SV_PSTL2STRM)) ++ ++/* ++** prfd_pstl3keep: ++** prfd pstl3keep, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfd_pstl3keep, uint8_t, ++ svprfd (p0, x0, SV_PSTL3KEEP), ++ svprfd (p0, x0, SV_PSTL3KEEP)) ++ ++/* ++** prfd_pstl3strm: ++** prfd pstl3strm, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfd_pstl3strm, uint8_t, ++ svprfd (p0, x0, SV_PSTL3STRM), ++ svprfd (p0, x0, SV_PSTL3STRM)) ++ ++/* ++** prfd_vnum_0: ++** prfd pldl1keep, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfd_vnum_0, uint8_t, ++ svprfd_vnum (p0, x0, 0, SV_PLDL1KEEP), ++ svprfd_vnum (p0, x0, 0, SV_PLDL1KEEP)) ++ ++/* ++** prfd_vnum_1: ++** incb x0 ++** prfd pldl1keep, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfd_vnum_1, uint16_t, ++ svprfd_vnum (p0, x0, 1, SV_PLDL1KEEP), ++ svprfd_vnum (p0, x0, 1, SV_PLDL1KEEP)) ++ ++/* ++** prfd_vnum_2: ++** incb x0, all, mul #2 ++** prfd pldl1keep, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfd_vnum_2, uint32_t, ++ svprfd_vnum (p0, x0, 2, SV_PLDL1KEEP), ++ svprfd_vnum (p0, x0, 2, SV_PLDL1KEEP)) ++ ++/* ++** prfd_vnum_3: ++** incb x0, all, mul #3 ++** prfd pldl1keep, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfd_vnum_3, uint64_t, ++ svprfd_vnum (p0, x0, 3, SV_PLDL1KEEP), ++ svprfd_vnum (p0, x0, 3, SV_PLDL1KEEP)) ++ ++/* ++** prfd_vnum_x1: ++** cntb (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** prfd pldl1keep, p0, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** prfd zldl1keep, p0, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_PREFETCH (prfd_vnum_x1, uint64_t, ++ svprfd_vnum (p0, x0, x1, SV_PLDL1KEEP), ++ svprfd_vnum (p0, x0, x1, SV_PLDL1KEEP)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfd_gather.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfd_gather.c +new file mode 100644 +index 000000000..a84acb1a1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfd_gather.c +@@ -0,0 +1,225 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** prfd_gather_u32base: ++** prfd pldl1keep, p0, \[z0\.s\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfd_gather_u32base, svuint32_t, ++ svprfd_gather_u32base (p0, z0, SV_PLDL1KEEP), ++ svprfd_gather (p0, z0, SV_PLDL1KEEP)) ++ ++/* ++** prfd_gather_u64base: ++** prfd pldl1strm, p0, \[z0\.d\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfd_gather_u64base, svuint64_t, ++ svprfd_gather_u64base (p0, z0, SV_PLDL1STRM), ++ svprfd_gather (p0, z0, SV_PLDL1STRM)) ++ ++/* ++** prfd_gather_x0_u32base_index: ++** lsl (x[0-9]+), x0, #?3 ++** prfb pldl2keep, p0, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfd_gather_x0_u32base_index, svuint32_t, ++ svprfd_gather_u32base_index (p0, z0, x0, SV_PLDL2KEEP), ++ svprfd_gather_index (p0, z0, x0, SV_PLDL2KEEP)) ++ ++/* ++** prfd_gather_m1_u32base_index: ++** mov (x[0-9]+), #?-8 ++** prfb pldl2strm, p0, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfd_gather_m1_u32base_index, svuint32_t, ++ svprfd_gather_u32base_index (p0, z0, -1, SV_PLDL2STRM), ++ svprfd_gather_index (p0, z0, -1, SV_PLDL2STRM)) ++ ++/* ++** prfd_gather_0_u32base_index: ++** prfd pldl3keep, p0, \[z0\.s\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfd_gather_0_u32base_index, svuint32_t, ++ svprfd_gather_u32base_index (p0, z0, 0, SV_PLDL3KEEP), ++ svprfd_gather_index (p0, z0, 0, SV_PLDL3KEEP)) ++ ++/* ++** prfd_gather_5_u32base_index: ++** prfd pldl3strm, p0, \[z0\.s, #40\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfd_gather_5_u32base_index, svuint32_t, ++ svprfd_gather_u32base_index (p0, z0, 5, SV_PLDL3STRM), ++ svprfd_gather_index (p0, z0, 5, SV_PLDL3STRM)) ++ ++/* ++** prfd_gather_31_u32base_index: ++** prfd pstl1keep, p0, \[z0\.s, #248\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfd_gather_31_u32base_index, svuint32_t, ++ svprfd_gather_u32base_index (p0, z0, 31, SV_PSTL1KEEP), ++ svprfd_gather_index (p0, z0, 31, SV_PSTL1KEEP)) ++ ++/* ++** prfd_gather_32_u32base_index: ++** mov (x[0-9]+), #?256 ++** prfb pstl1strm, p0, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfd_gather_32_u32base_index, svuint32_t, ++ svprfd_gather_u32base_index (p0, z0, 32, SV_PSTL1STRM), ++ svprfd_gather_index (p0, z0, 32, SV_PSTL1STRM)) ++ ++/* ++** prfd_gather_x0_u64base_index: ++** lsl (x[0-9]+), x0, #?3 ++** prfb pstl2keep, p0, \[\1, z0\.d\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfd_gather_x0_u64base_index, svuint64_t, ++ svprfd_gather_u64base_index (p0, z0, x0, SV_PSTL2KEEP), ++ svprfd_gather_index (p0, z0, x0, SV_PSTL2KEEP)) ++ ++/* ++** prfd_gather_m1_u64base_index: ++** mov (x[0-9]+), #?-8 ++** prfb pstl2strm, p0, \[\1, z0\.d\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfd_gather_m1_u64base_index, svuint64_t, ++ svprfd_gather_u64base_index (p0, z0, -1, SV_PSTL2STRM), ++ svprfd_gather_index (p0, z0, -1, SV_PSTL2STRM)) ++ ++/* ++** prfd_gather_0_u64base_index: ++** prfd pstl3keep, p0, \[z0\.d\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfd_gather_0_u64base_index, svuint64_t, ++ svprfd_gather_u64base_index (p0, z0, 0, SV_PSTL3KEEP), ++ svprfd_gather_index (p0, z0, 0, SV_PSTL3KEEP)) ++ ++/* ++** prfd_gather_5_u64base_index: ++** prfd pstl3strm, p0, \[z0\.d, #40\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfd_gather_5_u64base_index, svuint64_t, ++ svprfd_gather_u64base_index (p0, z0, 5, SV_PSTL3STRM), ++ svprfd_gather_index (p0, z0, 5, SV_PSTL3STRM)) ++ ++/* ++** prfd_gather_31_u64base_index: ++** prfd pldl1keep, p0, \[z0\.d, #248\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfd_gather_31_u64base_index, 
svuint64_t, ++ svprfd_gather_u64base_index (p0, z0, 31, SV_PLDL1KEEP), ++ svprfd_gather_index (p0, z0, 31, SV_PLDL1KEEP)) ++ ++/* ++** prfd_gather_32_u64base_index: ++** mov (x[0-9]+), #?256 ++** prfb pldl1strm, p0, \[\1, z0\.d\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfd_gather_32_u64base_index, svuint64_t, ++ svprfd_gather_u64base_index (p0, z0, 32, SV_PLDL1STRM), ++ svprfd_gather_index (p0, z0, 32, SV_PLDL1STRM)) ++ ++/* ++** prfd_gather_x0_s32index: ++** prfd pldl2keep, p0, \[x0, z0\.s, sxtw 3\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_SZ (prfd_gather_x0_s32index, svint32_t, ++ svprfd_gather_s32index (p0, x0, z0, SV_PLDL2KEEP), ++ svprfd_gather_index (p0, x0, z0, SV_PLDL2KEEP)) ++ ++/* ++** prfd_gather_s32index: ++** prfd pldl2strm, p0, \[x0, z1\.s, sxtw 3\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_SZ (prfd_gather_s32index, svint32_t, ++ svprfd_gather_s32index (p0, x0, z1, SV_PLDL2STRM), ++ svprfd_gather_index (p0, x0, z1, SV_PLDL2STRM)) ++ ++/* ++** prfd_gather_x0_u32index: ++** prfd pldl3keep, p0, \[x0, z0\.s, uxtw 3\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_SZ (prfd_gather_x0_u32index, svuint32_t, ++ svprfd_gather_u32index (p0, x0, z0, SV_PLDL3KEEP), ++ svprfd_gather_index (p0, x0, z0, SV_PLDL3KEEP)) ++ ++/* ++** prfd_gather_u32index: ++** prfd pldl3strm, p0, \[x0, z1\.s, uxtw 3\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_SZ (prfd_gather_u32index, svuint32_t, ++ svprfd_gather_u32index (p0, x0, z1, SV_PLDL3STRM), ++ svprfd_gather_index (p0, x0, z1, SV_PLDL3STRM)) ++ ++/* ++** prfd_gather_x0_s64index: ++** prfd pstl1keep, p0, \[x0, z0\.d, lsl 3\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_SZ (prfd_gather_x0_s64index, svint64_t, ++ svprfd_gather_s64index (p0, x0, z0, SV_PSTL1KEEP), ++ svprfd_gather_index (p0, x0, z0, SV_PSTL1KEEP)) ++ ++/* ++** prfd_gather_s64index: ++** prfd pstl1strm, p0, \[x0, z1\.d, lsl 3\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_SZ (prfd_gather_s64index, svint64_t, ++ svprfd_gather_s64index (p0, x0, z1, SV_PSTL1STRM), ++ svprfd_gather_index (p0, x0, z1, SV_PSTL1STRM)) ++ ++/* ++** prfd_gather_ext_s64index: ++** prfd pstl1strm, p0, \[x0, z1\.d, sxtw 3\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_SZ (prfd_gather_ext_s64index, svint64_t, ++ svprfd_gather_s64index (p0, x0, svextw_s64_x (p0, z1), SV_PSTL1STRM), ++ svprfd_gather_index (p0, x0, svextw_x (p0, z1), SV_PSTL1STRM)) ++ ++/* ++** prfd_gather_x0_u64index: ++** prfd pstl2keep, p0, \[x0, z0\.d, lsl 3\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_SZ (prfd_gather_x0_u64index, svuint64_t, ++ svprfd_gather_u64index (p0, x0, z0, SV_PSTL2KEEP), ++ svprfd_gather_index (p0, x0, z0, SV_PSTL2KEEP)) ++ ++/* ++** prfd_gather_u64index: ++** prfd pstl2strm, p0, \[x0, z1\.d, lsl 3\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_SZ (prfd_gather_u64index, svuint64_t, ++ svprfd_gather_u64index (p0, x0, z1, SV_PSTL2STRM), ++ svprfd_gather_index (p0, x0, z1, SV_PSTL2STRM)) ++ ++/* ++** prfd_gather_ext_u64index: ++** prfd pstl2strm, p0, \[x0, z1\.d, uxtw 3\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_SZ (prfd_gather_ext_u64index, svuint64_t, ++ svprfd_gather_u64index (p0, x0, svextw_u64_x (p0, z1), SV_PSTL2STRM), ++ svprfd_gather_index (p0, x0, svextw_x (p0, z1), SV_PSTL2STRM)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfh.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfh.c +new file mode 100644 +index 000000000..89069f9b7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfh.c +@@ -0,0 +1,245 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** prfh_base: ++** prfh pldl1keep, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfh_base, uint8_t, ++ svprfh (p0, x0, SV_PLDL1KEEP), ++ svprfh (p0, x0, SV_PLDL1KEEP)) ++ ++/* ++** prfh_u8_index: ++** add (x[0-9+]), (x0, x1|x1, x0) ++** prfh pldl1keep, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfh_u8_index, uint8_t, ++ svprfh (p0, x0 + x1, SV_PLDL1KEEP), ++ svprfh (p0, x0 + x1, SV_PLDL1KEEP)) ++ ++/* ++** prfh_u8_1: ++** add (x[0-9+]), x0, #?1 ++** prfh pldl1keep, p0, \[\1\] ++** ret ++*/ ++TEST_PREFETCH (prfh_u8_1, uint8_t, ++ svprfh (p0, x0 + 1, SV_PLDL1KEEP), ++ svprfh (p0, x0 + 1, SV_PLDL1KEEP)) ++ ++/* ++** prfh_u16_index: ++** prfh pldl1keep, p0, \[x0, x1, lsl #?1\] ++** ret ++*/ ++TEST_PREFETCH (prfh_u16_index, uint16_t, ++ svprfh (p0, x0 + x1, SV_PLDL1KEEP), ++ svprfh (p0, x0 + x1, SV_PLDL1KEEP)) ++ ++/* ++** prfh_u16_1: ++** add (x[0-9+]), x0, #?2 ++** prfh pldl1keep, p0, \[\1\] ++** ret ++*/ ++TEST_PREFETCH (prfh_u16_1, uint16_t, ++ svprfh (p0, x0 + 1, SV_PLDL1KEEP), ++ svprfh (p0, x0 + 1, SV_PLDL1KEEP)) ++ ++/* ++** prfh_u32_index: ++** add (x[0-9+]), x0, x1, lsl #?2 ++** prfh pldl1keep, p0, \[\1\] ++** ret ++*/ ++TEST_PREFETCH (prfh_u32_index, uint32_t, ++ svprfh (p0, x0 + x1, SV_PLDL1KEEP), ++ svprfh (p0, x0 + x1, SV_PLDL1KEEP)) ++ ++/* ++** prfh_u32_1: ++** add (x[0-9+]), x0, #?4 ++** prfh pldl1keep, p0, \[\1\] ++** ret ++*/ ++TEST_PREFETCH (prfh_u32_1, uint32_t, ++ svprfh (p0, x0 + 1, SV_PLDL1KEEP), ++ svprfh (p0, x0 + 1, SV_PLDL1KEEP)) ++ ++/* ++** prfh_u64_index: ++** add (x[0-9+]), x0, x1, lsl #?3 ++** prfh pldl1keep, p0, \[\1\] ++** ret ++*/ ++TEST_PREFETCH (prfh_u64_index, uint64_t, ++ svprfh (p0, x0 + x1, SV_PLDL1KEEP), ++ svprfh (p0, x0 + x1, SV_PLDL1KEEP)) ++ ++/* ++** prfh_u64_1: ++** add (x[0-9+]), x0, #?8 ++** prfh pldl1keep, p0, \[\1\] ++** ret ++*/ ++TEST_PREFETCH (prfh_u64_1, uint64_t, ++ svprfh (p0, x0 + 1, SV_PLDL1KEEP), ++ svprfh (p0, x0 + 1, SV_PLDL1KEEP)) ++ ++/* ++** prfh_pldl1strm: ++** prfh pldl1strm, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfh_pldl1strm, uint8_t, ++ svprfh (p0, x0, SV_PLDL1STRM), ++ svprfh (p0, x0, SV_PLDL1STRM)) ++ ++/* ++** prfh_pldl2keep: ++** prfh pldl2keep, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfh_pldl2keep, uint8_t, ++ svprfh (p0, x0, SV_PLDL2KEEP), ++ svprfh (p0, x0, SV_PLDL2KEEP)) ++ ++/* ++** prfh_pldl2strm: ++** prfh pldl2strm, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfh_pldl2strm, uint8_t, ++ svprfh (p0, x0, SV_PLDL2STRM), ++ svprfh (p0, x0, SV_PLDL2STRM)) ++ ++/* ++** prfh_pldl3keep: ++** prfh pldl3keep, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfh_pldl3keep, uint8_t, ++ svprfh (p0, x0, SV_PLDL3KEEP), ++ svprfh (p0, x0, SV_PLDL3KEEP)) ++ ++/* ++** prfh_pldl3strm: ++** prfh pldl3strm, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfh_pldl3strm, uint8_t, ++ svprfh (p0, x0, SV_PLDL3STRM), ++ svprfh (p0, x0, SV_PLDL3STRM)) ++ ++/* ++** prfh_pstl1keep: ++** prfh pstl1keep, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfh_pstl1keep, uint8_t, ++ svprfh (p0, x0, SV_PSTL1KEEP), ++ svprfh (p0, x0, SV_PSTL1KEEP)) ++ ++/* ++** prfh_pstl1strm: ++** prfh pstl1strm, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfh_pstl1strm, uint8_t, ++ svprfh (p0, x0, SV_PSTL1STRM), ++ svprfh (p0, x0, SV_PSTL1STRM)) ++ ++/* ++** prfh_pstl2keep: ++** prfh pstl2keep, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfh_pstl2keep, uint8_t, ++ svprfh (p0, x0, SV_PSTL2KEEP), ++ svprfh (p0, x0, SV_PSTL2KEEP)) ++ ++/* ++** prfh_pstl2strm: ++** prfh pstl2strm, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH 
(prfh_pstl2strm, uint8_t, ++ svprfh (p0, x0, SV_PSTL2STRM), ++ svprfh (p0, x0, SV_PSTL2STRM)) ++ ++/* ++** prfh_pstl3keep: ++** prfh pstl3keep, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfh_pstl3keep, uint8_t, ++ svprfh (p0, x0, SV_PSTL3KEEP), ++ svprfh (p0, x0, SV_PSTL3KEEP)) ++ ++/* ++** prfh_pstl3strm: ++** prfh pstl3strm, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfh_pstl3strm, uint8_t, ++ svprfh (p0, x0, SV_PSTL3STRM), ++ svprfh (p0, x0, SV_PSTL3STRM)) ++ ++/* ++** prfh_vnum_0: ++** prfh pldl1keep, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfh_vnum_0, uint8_t, ++ svprfh_vnum (p0, x0, 0, SV_PLDL1KEEP), ++ svprfh_vnum (p0, x0, 0, SV_PLDL1KEEP)) ++ ++/* ++** prfh_vnum_1: ++** incb x0 ++** prfh pldl1keep, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfh_vnum_1, uint16_t, ++ svprfh_vnum (p0, x0, 1, SV_PLDL1KEEP), ++ svprfh_vnum (p0, x0, 1, SV_PLDL1KEEP)) ++ ++/* ++** prfh_vnum_2: ++** incb x0, all, mul #2 ++** prfh pldl1keep, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfh_vnum_2, uint32_t, ++ svprfh_vnum (p0, x0, 2, SV_PLDL1KEEP), ++ svprfh_vnum (p0, x0, 2, SV_PLDL1KEEP)) ++ ++/* ++** prfh_vnum_3: ++** incb x0, all, mul #3 ++** prfh pldl1keep, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfh_vnum_3, uint64_t, ++ svprfh_vnum (p0, x0, 3, SV_PLDL1KEEP), ++ svprfh_vnum (p0, x0, 3, SV_PLDL1KEEP)) ++ ++/* ++** prfh_vnum_x1: ++** cntb (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** prfh pldl1keep, p0, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** prfh zldl1keep, p0, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_PREFETCH (prfh_vnum_x1, uint64_t, ++ svprfh_vnum (p0, x0, x1, SV_PLDL1KEEP), ++ svprfh_vnum (p0, x0, x1, SV_PLDL1KEEP)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfh_gather.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfh_gather.c +new file mode 100644 +index 000000000..04b7a1575 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfh_gather.c +@@ -0,0 +1,225 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** prfh_gather_u32base: ++** prfh pldl1keep, p0, \[z0\.s\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfh_gather_u32base, svuint32_t, ++ svprfh_gather_u32base (p0, z0, SV_PLDL1KEEP), ++ svprfh_gather (p0, z0, SV_PLDL1KEEP)) ++ ++/* ++** prfh_gather_u64base: ++** prfh pldl1strm, p0, \[z0\.d\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfh_gather_u64base, svuint64_t, ++ svprfh_gather_u64base (p0, z0, SV_PLDL1STRM), ++ svprfh_gather (p0, z0, SV_PLDL1STRM)) ++ ++/* ++** prfh_gather_x0_u32base_index: ++** lsl (x[0-9]+), x0, #?1 ++** prfb pldl2keep, p0, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfh_gather_x0_u32base_index, svuint32_t, ++ svprfh_gather_u32base_index (p0, z0, x0, SV_PLDL2KEEP), ++ svprfh_gather_index (p0, z0, x0, SV_PLDL2KEEP)) ++ ++/* ++** prfh_gather_m1_u32base_index: ++** mov (x[0-9]+), #?-2 ++** prfb pldl2strm, p0, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfh_gather_m1_u32base_index, svuint32_t, ++ svprfh_gather_u32base_index (p0, z0, -1, SV_PLDL2STRM), ++ svprfh_gather_index (p0, z0, -1, SV_PLDL2STRM)) ++ ++/* ++** prfh_gather_0_u32base_index: ++** prfh pldl3keep, p0, \[z0\.s\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfh_gather_0_u32base_index, svuint32_t, ++ svprfh_gather_u32base_index (p0, z0, 0, SV_PLDL3KEEP), ++ svprfh_gather_index (p0, z0, 0, SV_PLDL3KEEP)) ++ ++/* ++** prfh_gather_5_u32base_index: ++** prfh pldl3strm, p0, \[z0\.s, #10\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfh_gather_5_u32base_index, svuint32_t, ++ svprfh_gather_u32base_index (p0, z0, 5, SV_PLDL3STRM), ++ svprfh_gather_index (p0, z0, 5, SV_PLDL3STRM)) ++ ++/* ++** prfh_gather_31_u32base_index: ++** prfh pstl1keep, p0, \[z0\.s, #62\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfh_gather_31_u32base_index, svuint32_t, ++ svprfh_gather_u32base_index (p0, z0, 31, SV_PSTL1KEEP), ++ svprfh_gather_index (p0, z0, 31, SV_PSTL1KEEP)) ++ ++/* ++** prfh_gather_32_u32base_index: ++** mov (x[0-9]+), #?64 ++** prfb pstl1strm, p0, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfh_gather_32_u32base_index, svuint32_t, ++ svprfh_gather_u32base_index (p0, z0, 32, SV_PSTL1STRM), ++ svprfh_gather_index (p0, z0, 32, SV_PSTL1STRM)) ++ ++/* ++** prfh_gather_x0_u64base_index: ++** lsl (x[0-9]+), x0, #?1 ++** prfb pstl2keep, p0, \[\1, z0\.d\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfh_gather_x0_u64base_index, svuint64_t, ++ svprfh_gather_u64base_index (p0, z0, x0, SV_PSTL2KEEP), ++ svprfh_gather_index (p0, z0, x0, SV_PSTL2KEEP)) ++ ++/* ++** prfh_gather_m1_u64base_index: ++** mov (x[0-9]+), #?-2 ++** prfb pstl2strm, p0, \[\1, z0\.d\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfh_gather_m1_u64base_index, svuint64_t, ++ svprfh_gather_u64base_index (p0, z0, -1, SV_PSTL2STRM), ++ svprfh_gather_index (p0, z0, -1, SV_PSTL2STRM)) ++ ++/* ++** prfh_gather_0_u64base_index: ++** prfh pstl3keep, p0, \[z0\.d\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfh_gather_0_u64base_index, svuint64_t, ++ svprfh_gather_u64base_index (p0, z0, 0, SV_PSTL3KEEP), ++ svprfh_gather_index (p0, z0, 0, SV_PSTL3KEEP)) ++ ++/* ++** prfh_gather_5_u64base_index: ++** prfh pstl3strm, p0, \[z0\.d, #10\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfh_gather_5_u64base_index, svuint64_t, ++ svprfh_gather_u64base_index (p0, z0, 5, SV_PSTL3STRM), ++ svprfh_gather_index (p0, z0, 5, SV_PSTL3STRM)) ++ ++/* ++** prfh_gather_31_u64base_index: ++** prfh pldl1keep, p0, \[z0\.d, #62\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfh_gather_31_u64base_index, 
svuint64_t, ++ svprfh_gather_u64base_index (p0, z0, 31, SV_PLDL1KEEP), ++ svprfh_gather_index (p0, z0, 31, SV_PLDL1KEEP)) ++ ++/* ++** prfh_gather_32_u64base_index: ++** mov (x[0-9]+), #?64 ++** prfb pldl1strm, p0, \[\1, z0\.d\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfh_gather_32_u64base_index, svuint64_t, ++ svprfh_gather_u64base_index (p0, z0, 32, SV_PLDL1STRM), ++ svprfh_gather_index (p0, z0, 32, SV_PLDL1STRM)) ++ ++/* ++** prfh_gather_x0_s32index: ++** prfh pldl2keep, p0, \[x0, z0\.s, sxtw 1\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_SZ (prfh_gather_x0_s32index, svint32_t, ++ svprfh_gather_s32index (p0, x0, z0, SV_PLDL2KEEP), ++ svprfh_gather_index (p0, x0, z0, SV_PLDL2KEEP)) ++ ++/* ++** prfh_gather_s32index: ++** prfh pldl2strm, p0, \[x0, z1\.s, sxtw 1\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_SZ (prfh_gather_s32index, svint32_t, ++ svprfh_gather_s32index (p0, x0, z1, SV_PLDL2STRM), ++ svprfh_gather_index (p0, x0, z1, SV_PLDL2STRM)) ++ ++/* ++** prfh_gather_x0_u32index: ++** prfh pldl3keep, p0, \[x0, z0\.s, uxtw 1\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_SZ (prfh_gather_x0_u32index, svuint32_t, ++ svprfh_gather_u32index (p0, x0, z0, SV_PLDL3KEEP), ++ svprfh_gather_index (p0, x0, z0, SV_PLDL3KEEP)) ++ ++/* ++** prfh_gather_u32index: ++** prfh pldl3strm, p0, \[x0, z1\.s, uxtw 1\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_SZ (prfh_gather_u32index, svuint32_t, ++ svprfh_gather_u32index (p0, x0, z1, SV_PLDL3STRM), ++ svprfh_gather_index (p0, x0, z1, SV_PLDL3STRM)) ++ ++/* ++** prfh_gather_x0_s64index: ++** prfh pstl1keep, p0, \[x0, z0\.d, lsl 1\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_SZ (prfh_gather_x0_s64index, svint64_t, ++ svprfh_gather_s64index (p0, x0, z0, SV_PSTL1KEEP), ++ svprfh_gather_index (p0, x0, z0, SV_PSTL1KEEP)) ++ ++/* ++** prfh_gather_s64index: ++** prfh pstl1strm, p0, \[x0, z1\.d, lsl 1\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_SZ (prfh_gather_s64index, svint64_t, ++ svprfh_gather_s64index (p0, x0, z1, SV_PSTL1STRM), ++ svprfh_gather_index (p0, x0, z1, SV_PSTL1STRM)) ++ ++/* ++** prfh_gather_ext_s64index: ++** prfh pstl1strm, p0, \[x0, z1\.d, sxtw 1\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_SZ (prfh_gather_ext_s64index, svint64_t, ++ svprfh_gather_s64index (p0, x0, svextw_s64_x (p0, z1), SV_PSTL1STRM), ++ svprfh_gather_index (p0, x0, svextw_x (p0, z1), SV_PSTL1STRM)) ++ ++/* ++** prfh_gather_x0_u64index: ++** prfh pstl2keep, p0, \[x0, z0\.d, lsl 1\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_SZ (prfh_gather_x0_u64index, svuint64_t, ++ svprfh_gather_u64index (p0, x0, z0, SV_PSTL2KEEP), ++ svprfh_gather_index (p0, x0, z0, SV_PSTL2KEEP)) ++ ++/* ++** prfh_gather_u64index: ++** prfh pstl2strm, p0, \[x0, z1\.d, lsl 1\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_SZ (prfh_gather_u64index, svuint64_t, ++ svprfh_gather_u64index (p0, x0, z1, SV_PSTL2STRM), ++ svprfh_gather_index (p0, x0, z1, SV_PSTL2STRM)) ++ ++/* ++** prfh_gather_ext_u64index: ++** prfh pstl2strm, p0, \[x0, z1\.d, uxtw 1\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_SZ (prfh_gather_ext_u64index, svuint64_t, ++ svprfh_gather_u64index (p0, x0, svextw_u64_x (p0, z1), SV_PSTL2STRM), ++ svprfh_gather_index (p0, x0, svextw_x (p0, z1), SV_PSTL2STRM)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfw.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfw.c +new file mode 100644 +index 000000000..bbf6a45c9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfw.c +@@ -0,0 +1,245 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** prfw_base: ++** prfw pldl1keep, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfw_base, uint8_t, ++ svprfw (p0, x0, SV_PLDL1KEEP), ++ svprfw (p0, x0, SV_PLDL1KEEP)) ++ ++/* ++** prfw_u8_index: ++** add (x[0-9+]), (x0, x1|x1, x0) ++** prfw pldl1keep, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfw_u8_index, uint8_t, ++ svprfw (p0, x0 + x1, SV_PLDL1KEEP), ++ svprfw (p0, x0 + x1, SV_PLDL1KEEP)) ++ ++/* ++** prfw_u8_1: ++** add (x[0-9+]), x0, #?1 ++** prfw pldl1keep, p0, \[\1\] ++** ret ++*/ ++TEST_PREFETCH (prfw_u8_1, uint8_t, ++ svprfw (p0, x0 + 1, SV_PLDL1KEEP), ++ svprfw (p0, x0 + 1, SV_PLDL1KEEP)) ++ ++/* ++** prfw_u16_index: ++** add (x[0-9+]), x0, x1, lsl #?1 ++** prfw pldl1keep, p0, \[\1\] ++** ret ++*/ ++TEST_PREFETCH (prfw_u16_index, uint16_t, ++ svprfw (p0, x0 + x1, SV_PLDL1KEEP), ++ svprfw (p0, x0 + x1, SV_PLDL1KEEP)) ++ ++/* ++** prfw_u16_1: ++** add (x[0-9+]), x0, #?2 ++** prfw pldl1keep, p0, \[\1\] ++** ret ++*/ ++TEST_PREFETCH (prfw_u16_1, uint16_t, ++ svprfw (p0, x0 + 1, SV_PLDL1KEEP), ++ svprfw (p0, x0 + 1, SV_PLDL1KEEP)) ++ ++/* ++** prfw_u32_index: ++** prfw pldl1keep, p0, \[x0, x1, lsl #?2\] ++** ret ++*/ ++TEST_PREFETCH (prfw_u32_index, uint32_t, ++ svprfw (p0, x0 + x1, SV_PLDL1KEEP), ++ svprfw (p0, x0 + x1, SV_PLDL1KEEP)) ++ ++/* ++** prfw_u32_1: ++** add (x[0-9+]), x0, #?4 ++** prfw pldl1keep, p0, \[\1\] ++** ret ++*/ ++TEST_PREFETCH (prfw_u32_1, uint32_t, ++ svprfw (p0, x0 + 1, SV_PLDL1KEEP), ++ svprfw (p0, x0 + 1, SV_PLDL1KEEP)) ++ ++/* ++** prfw_u64_index: ++** add (x[0-9+]), x0, x1, lsl #?3 ++** prfw pldl1keep, p0, \[\1\] ++** ret ++*/ ++TEST_PREFETCH (prfw_u64_index, uint64_t, ++ svprfw (p0, x0 + x1, SV_PLDL1KEEP), ++ svprfw (p0, x0 + x1, SV_PLDL1KEEP)) ++ ++/* ++** prfw_u64_1: ++** add (x[0-9+]), x0, #?8 ++** prfw pldl1keep, p0, \[\1\] ++** ret ++*/ ++TEST_PREFETCH (prfw_u64_1, uint64_t, ++ svprfw (p0, x0 + 1, SV_PLDL1KEEP), ++ svprfw (p0, x0 + 1, SV_PLDL1KEEP)) ++ ++/* ++** prfw_pldl1strm: ++** prfw pldl1strm, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfw_pldl1strm, uint8_t, ++ svprfw (p0, x0, SV_PLDL1STRM), ++ svprfw (p0, x0, SV_PLDL1STRM)) ++ ++/* ++** prfw_pldl2keep: ++** prfw pldl2keep, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfw_pldl2keep, uint8_t, ++ svprfw (p0, x0, SV_PLDL2KEEP), ++ svprfw (p0, x0, SV_PLDL2KEEP)) ++ ++/* ++** prfw_pldl2strm: ++** prfw pldl2strm, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfw_pldl2strm, uint8_t, ++ svprfw (p0, x0, SV_PLDL2STRM), ++ svprfw (p0, x0, SV_PLDL2STRM)) ++ ++/* ++** prfw_pldl3keep: ++** prfw pldl3keep, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfw_pldl3keep, uint8_t, ++ svprfw (p0, x0, SV_PLDL3KEEP), ++ svprfw (p0, x0, SV_PLDL3KEEP)) ++ ++/* ++** prfw_pldl3strm: ++** prfw pldl3strm, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfw_pldl3strm, uint8_t, ++ svprfw (p0, x0, SV_PLDL3STRM), ++ svprfw (p0, x0, SV_PLDL3STRM)) ++ ++/* ++** prfw_pstl1keep: ++** prfw pstl1keep, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfw_pstl1keep, uint8_t, ++ svprfw (p0, x0, SV_PSTL1KEEP), ++ svprfw (p0, x0, SV_PSTL1KEEP)) ++ ++/* ++** prfw_pstl1strm: ++** prfw pstl1strm, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfw_pstl1strm, uint8_t, ++ svprfw (p0, x0, SV_PSTL1STRM), ++ svprfw (p0, x0, SV_PSTL1STRM)) ++ ++/* ++** prfw_pstl2keep: ++** prfw pstl2keep, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfw_pstl2keep, uint8_t, ++ svprfw (p0, x0, SV_PSTL2KEEP), ++ svprfw (p0, x0, SV_PSTL2KEEP)) ++ ++/* ++** prfw_pstl2strm: ++** prfw pstl2strm, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH 
(prfw_pstl2strm, uint8_t, ++ svprfw (p0, x0, SV_PSTL2STRM), ++ svprfw (p0, x0, SV_PSTL2STRM)) ++ ++/* ++** prfw_pstl3keep: ++** prfw pstl3keep, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfw_pstl3keep, uint8_t, ++ svprfw (p0, x0, SV_PSTL3KEEP), ++ svprfw (p0, x0, SV_PSTL3KEEP)) ++ ++/* ++** prfw_pstl3strm: ++** prfw pstl3strm, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfw_pstl3strm, uint8_t, ++ svprfw (p0, x0, SV_PSTL3STRM), ++ svprfw (p0, x0, SV_PSTL3STRM)) ++ ++/* ++** prfw_vnum_0: ++** prfw pldl1keep, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfw_vnum_0, uint8_t, ++ svprfw_vnum (p0, x0, 0, SV_PLDL1KEEP), ++ svprfw_vnum (p0, x0, 0, SV_PLDL1KEEP)) ++ ++/* ++** prfw_vnum_1: ++** incb x0 ++** prfw pldl1keep, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfw_vnum_1, uint16_t, ++ svprfw_vnum (p0, x0, 1, SV_PLDL1KEEP), ++ svprfw_vnum (p0, x0, 1, SV_PLDL1KEEP)) ++ ++/* ++** prfw_vnum_2: ++** incb x0, all, mul #2 ++** prfw pldl1keep, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfw_vnum_2, uint32_t, ++ svprfw_vnum (p0, x0, 2, SV_PLDL1KEEP), ++ svprfw_vnum (p0, x0, 2, SV_PLDL1KEEP)) ++ ++/* ++** prfw_vnum_3: ++** incb x0, all, mul #3 ++** prfw pldl1keep, p0, \[x0\] ++** ret ++*/ ++TEST_PREFETCH (prfw_vnum_3, uint64_t, ++ svprfw_vnum (p0, x0, 3, SV_PLDL1KEEP), ++ svprfw_vnum (p0, x0, 3, SV_PLDL1KEEP)) ++ ++/* ++** prfw_vnum_x1: ++** cntb (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** prfw pldl1keep, p0, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** prfw zldl1keep, p0, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_PREFETCH (prfw_vnum_x1, uint64_t, ++ svprfw_vnum (p0, x0, x1, SV_PLDL1KEEP), ++ svprfw_vnum (p0, x0, x1, SV_PLDL1KEEP)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfw_gather.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfw_gather.c +new file mode 100644 +index 000000000..2bbae1b9e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfw_gather.c +@@ -0,0 +1,225 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** prfw_gather_u32base: ++** prfw pldl1keep, p0, \[z0\.s\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfw_gather_u32base, svuint32_t, ++ svprfw_gather_u32base (p0, z0, SV_PLDL1KEEP), ++ svprfw_gather (p0, z0, SV_PLDL1KEEP)) ++ ++/* ++** prfw_gather_u64base: ++** prfw pldl1strm, p0, \[z0\.d\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfw_gather_u64base, svuint64_t, ++ svprfw_gather_u64base (p0, z0, SV_PLDL1STRM), ++ svprfw_gather (p0, z0, SV_PLDL1STRM)) ++ ++/* ++** prfw_gather_x0_u32base_index: ++** lsl (x[0-9]+), x0, #?2 ++** prfb pldl2keep, p0, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfw_gather_x0_u32base_index, svuint32_t, ++ svprfw_gather_u32base_index (p0, z0, x0, SV_PLDL2KEEP), ++ svprfw_gather_index (p0, z0, x0, SV_PLDL2KEEP)) ++ ++/* ++** prfw_gather_m1_u32base_index: ++** mov (x[0-9]+), #?-4 ++** prfb pldl2strm, p0, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfw_gather_m1_u32base_index, svuint32_t, ++ svprfw_gather_u32base_index (p0, z0, -1, SV_PLDL2STRM), ++ svprfw_gather_index (p0, z0, -1, SV_PLDL2STRM)) ++ ++/* ++** prfw_gather_0_u32base_index: ++** prfw pldl3keep, p0, \[z0\.s\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfw_gather_0_u32base_index, svuint32_t, ++ svprfw_gather_u32base_index (p0, z0, 0, SV_PLDL3KEEP), ++ svprfw_gather_index (p0, z0, 0, SV_PLDL3KEEP)) ++ ++/* ++** prfw_gather_5_u32base_index: ++** prfw pldl3strm, p0, \[z0\.s, #20\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfw_gather_5_u32base_index, svuint32_t, ++ svprfw_gather_u32base_index (p0, z0, 5, SV_PLDL3STRM), ++ svprfw_gather_index (p0, z0, 5, SV_PLDL3STRM)) ++ ++/* ++** prfw_gather_31_u32base_index: ++** prfw pstl1keep, p0, \[z0\.s, #124\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfw_gather_31_u32base_index, svuint32_t, ++ svprfw_gather_u32base_index (p0, z0, 31, SV_PSTL1KEEP), ++ svprfw_gather_index (p0, z0, 31, SV_PSTL1KEEP)) ++ ++/* ++** prfw_gather_32_u32base_index: ++** mov (x[0-9]+), #?128 ++** prfb pstl1strm, p0, \[\1, z0\.s, uxtw\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfw_gather_32_u32base_index, svuint32_t, ++ svprfw_gather_u32base_index (p0, z0, 32, SV_PSTL1STRM), ++ svprfw_gather_index (p0, z0, 32, SV_PSTL1STRM)) ++ ++/* ++** prfw_gather_x0_u64base_index: ++** lsl (x[0-9]+), x0, #?2 ++** prfb pstl2keep, p0, \[\1, z0\.d\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfw_gather_x0_u64base_index, svuint64_t, ++ svprfw_gather_u64base_index (p0, z0, x0, SV_PSTL2KEEP), ++ svprfw_gather_index (p0, z0, x0, SV_PSTL2KEEP)) ++ ++/* ++** prfw_gather_m1_u64base_index: ++** mov (x[0-9]+), #?-4 ++** prfb pstl2strm, p0, \[\1, z0\.d\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfw_gather_m1_u64base_index, svuint64_t, ++ svprfw_gather_u64base_index (p0, z0, -1, SV_PSTL2STRM), ++ svprfw_gather_index (p0, z0, -1, SV_PSTL2STRM)) ++ ++/* ++** prfw_gather_0_u64base_index: ++** prfw pstl3keep, p0, \[z0\.d\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfw_gather_0_u64base_index, svuint64_t, ++ svprfw_gather_u64base_index (p0, z0, 0, SV_PSTL3KEEP), ++ svprfw_gather_index (p0, z0, 0, SV_PSTL3KEEP)) ++ ++/* ++** prfw_gather_5_u64base_index: ++** prfw pstl3strm, p0, \[z0\.d, #20\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfw_gather_5_u64base_index, svuint64_t, ++ svprfw_gather_u64base_index (p0, z0, 5, SV_PSTL3STRM), ++ svprfw_gather_index (p0, z0, 5, SV_PSTL3STRM)) ++ ++/* ++** prfw_gather_31_u64base_index: ++** prfw pldl1keep, p0, \[z0\.d, #124\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfw_gather_31_u64base_index, 
svuint64_t, ++ svprfw_gather_u64base_index (p0, z0, 31, SV_PLDL1KEEP), ++ svprfw_gather_index (p0, z0, 31, SV_PLDL1KEEP)) ++ ++/* ++** prfw_gather_32_u64base_index: ++** mov (x[0-9]+), #?128 ++** prfb pldl1strm, p0, \[\1, z0\.d\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_ZS (prfw_gather_32_u64base_index, svuint64_t, ++ svprfw_gather_u64base_index (p0, z0, 32, SV_PLDL1STRM), ++ svprfw_gather_index (p0, z0, 32, SV_PLDL1STRM)) ++ ++/* ++** prfw_gather_x0_s32index: ++** prfw pldl2keep, p0, \[x0, z0\.s, sxtw 2\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_SZ (prfw_gather_x0_s32index, svint32_t, ++ svprfw_gather_s32index (p0, x0, z0, SV_PLDL2KEEP), ++ svprfw_gather_index (p0, x0, z0, SV_PLDL2KEEP)) ++ ++/* ++** prfw_gather_s32index: ++** prfw pldl2strm, p0, \[x0, z1\.s, sxtw 2\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_SZ (prfw_gather_s32index, svint32_t, ++ svprfw_gather_s32index (p0, x0, z1, SV_PLDL2STRM), ++ svprfw_gather_index (p0, x0, z1, SV_PLDL2STRM)) ++ ++/* ++** prfw_gather_x0_u32index: ++** prfw pldl3keep, p0, \[x0, z0\.s, uxtw 2\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_SZ (prfw_gather_x0_u32index, svuint32_t, ++ svprfw_gather_u32index (p0, x0, z0, SV_PLDL3KEEP), ++ svprfw_gather_index (p0, x0, z0, SV_PLDL3KEEP)) ++ ++/* ++** prfw_gather_u32index: ++** prfw pldl3strm, p0, \[x0, z1\.s, uxtw 2\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_SZ (prfw_gather_u32index, svuint32_t, ++ svprfw_gather_u32index (p0, x0, z1, SV_PLDL3STRM), ++ svprfw_gather_index (p0, x0, z1, SV_PLDL3STRM)) ++ ++/* ++** prfw_gather_x0_s64index: ++** prfw pstl1keep, p0, \[x0, z0\.d, lsl 2\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_SZ (prfw_gather_x0_s64index, svint64_t, ++ svprfw_gather_s64index (p0, x0, z0, SV_PSTL1KEEP), ++ svprfw_gather_index (p0, x0, z0, SV_PSTL1KEEP)) ++ ++/* ++** prfw_gather_s64index: ++** prfw pstl1strm, p0, \[x0, z1\.d, lsl 2\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_SZ (prfw_gather_s64index, svint64_t, ++ svprfw_gather_s64index (p0, x0, z1, SV_PSTL1STRM), ++ svprfw_gather_index (p0, x0, z1, SV_PSTL1STRM)) ++ ++/* ++** prfw_gather_ext_s64index: ++** prfw pstl1strm, p0, \[x0, z1\.d, sxtw 2\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_SZ (prfw_gather_ext_s64index, svint64_t, ++ svprfw_gather_s64index (p0, x0, svextw_s64_x (p0, z1), SV_PSTL1STRM), ++ svprfw_gather_index (p0, x0, svextw_x (p0, z1), SV_PSTL1STRM)) ++ ++/* ++** prfw_gather_x0_u64index: ++** prfw pstl2keep, p0, \[x0, z0\.d, lsl 2\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_SZ (prfw_gather_x0_u64index, svuint64_t, ++ svprfw_gather_u64index (p0, x0, z0, SV_PSTL2KEEP), ++ svprfw_gather_index (p0, x0, z0, SV_PSTL2KEEP)) ++ ++/* ++** prfw_gather_u64index: ++** prfw pstl2strm, p0, \[x0, z1\.d, lsl 2\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_SZ (prfw_gather_u64index, svuint64_t, ++ svprfw_gather_u64index (p0, x0, z1, SV_PSTL2STRM), ++ svprfw_gather_index (p0, x0, z1, SV_PSTL2STRM)) ++ ++/* ++** prfw_gather_ext_u64index: ++** prfw pstl2strm, p0, \[x0, z1\.d, uxtw 2\] ++** ret ++*/ ++TEST_PREFETCH_GATHER_SZ (prfw_gather_ext_u64index, svuint64_t, ++ svprfw_gather_u64index (p0, x0, svextw_u64_x (p0, z1), SV_PSTL2STRM), ++ svprfw_gather_index (p0, x0, svextw_x (p0, z1), SV_PSTL2STRM)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ptest_any.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ptest_any.c +new file mode 100644 +index 000000000..33280d388 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ptest_any.c +@@ -0,0 +1,77 @@ ++/* { dg-additional-options "-msve-vector-bits=scalable" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ 
++ ++#include "test_sve_acle.h" ++#include ++ ++/* ++** test_bool_any: ++** ptest p0, p1\.b ++** cset [wx]0, any ++** ret ++*/ ++TEST_PTEST (test_bool_any, bool, ++ x0 = svptest_any (p0, p1)); ++ ++/* ++** test_bool_none: ++** ptest p0, p1\.b ++** cset [wx]0, none ++** ret ++*/ ++TEST_PTEST (test_bool_none, bool, ++ x0 = !svptest_any (p0, p1)); ++ ++/* ++** test_int_any: ++** ptest p0, p1\.b ++** cset [wx]0, any ++** ret ++*/ ++TEST_PTEST (test_int_any, int, ++ x0 = svptest_any (p0, p1)); ++ ++/* ++** test_int_none: ++** ptest p0, p1\.b ++** cset [wx]0, none ++** ret ++*/ ++TEST_PTEST (test_int_none, int, ++ x0 = !svptest_any (p0, p1)); ++ ++/* ++** test_int64_t_any: ++** ptest p0, p1\.b ++** cset [wx]0, any ++** ret ++*/ ++TEST_PTEST (test_int64_t_any, int64_t, ++ x0 = svptest_any (p0, p1)); ++ ++/* ++** test_int64_t_none: ++** ptest p0, p1\.b ++** cset [wx]0, none ++** ret ++*/ ++TEST_PTEST (test_int64_t_none, int64_t, ++ x0 = !svptest_any (p0, p1)); ++ ++/* ++** sel_any: ++** ptest p0, p1\.b ++** csel x0, (x0, x1, any|x1, x0, none) ++** ret ++*/ ++TEST_PTEST (sel_any, int64_t, ++ x0 = svptest_any (p0, p1) ? x0 : x1); ++ ++/* ++** sel_none: ++** ptest p0, p1\.b ++** csel x0, (x0, x1, none|x1, x0, any) ++** ret ++*/ ++TEST_PTEST (sel_none, int64_t, ++ x0 = !svptest_any (p0, p1) ? x0 : x1); +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ptest_first.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ptest_first.c +new file mode 100644 +index 000000000..991dabd3d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ptest_first.c +@@ -0,0 +1,77 @@ ++/* { dg-additional-options "-msve-vector-bits=scalable" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++#include ++ ++/* ++** test_bool_first: ++** ptest p0, p1\.b ++** cset [wx]0, first ++** ret ++*/ ++TEST_PTEST (test_bool_first, bool, ++ x0 = svptest_first (p0, p1)); ++ ++/* ++** test_bool_nfrst: ++** ptest p0, p1\.b ++** cset [wx]0, nfrst ++** ret ++*/ ++TEST_PTEST (test_bool_nfrst, bool, ++ x0 = !svptest_first (p0, p1)); ++ ++/* ++** test_int_first: ++** ptest p0, p1\.b ++** cset [wx]0, first ++** ret ++*/ ++TEST_PTEST (test_int_first, int, ++ x0 = svptest_first (p0, p1)); ++ ++/* ++** test_int_nfrst: ++** ptest p0, p1\.b ++** cset [wx]0, nfrst ++** ret ++*/ ++TEST_PTEST (test_int_nfrst, int, ++ x0 = !svptest_first (p0, p1)); ++ ++/* ++** test_int64_t_first: ++** ptest p0, p1\.b ++** cset [wx]0, first ++** ret ++*/ ++TEST_PTEST (test_int64_t_first, int64_t, ++ x0 = svptest_first (p0, p1)); ++ ++/* ++** test_int64_t_nfrst: ++** ptest p0, p1\.b ++** cset [wx]0, nfrst ++** ret ++*/ ++TEST_PTEST (test_int64_t_nfrst, int64_t, ++ x0 = !svptest_first (p0, p1)); ++ ++/* ++** sel_first: ++** ptest p0, p1\.b ++** csel x0, (x0, x1, first|x1, x0, nfrst) ++** ret ++*/ ++TEST_PTEST (sel_first, int64_t, ++ x0 = svptest_first (p0, p1) ? x0 : x1); ++ ++/* ++** sel_nfrst: ++** ptest p0, p1\.b ++** csel x0, (x0, x1, nfrst|x1, x0, first) ++** ret ++*/ ++TEST_PTEST (sel_nfrst, int64_t, ++ x0 = !svptest_first (p0, p1) ? 
x0 : x1); +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ptest_last.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ptest_last.c +new file mode 100644 +index 000000000..b952a4149 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ptest_last.c +@@ -0,0 +1,77 @@ ++/* { dg-additional-options "-msve-vector-bits=scalable" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++#include ++ ++/* ++** test_bool_last: ++** ptest p0, p1\.b ++** cset [wx]0, last ++** ret ++*/ ++TEST_PTEST (test_bool_last, bool, ++ x0 = svptest_last (p0, p1)); ++ ++/* ++** test_bool_nlast: ++** ptest p0, p1\.b ++** cset [wx]0, nlast ++** ret ++*/ ++TEST_PTEST (test_bool_nlast, bool, ++ x0 = !svptest_last (p0, p1)); ++ ++/* ++** test_int_last: ++** ptest p0, p1\.b ++** cset [wx]0, last ++** ret ++*/ ++TEST_PTEST (test_int_last, int, ++ x0 = svptest_last (p0, p1)); ++ ++/* ++** test_int_nlast: ++** ptest p0, p1\.b ++** cset [wx]0, nlast ++** ret ++*/ ++TEST_PTEST (test_int_nlast, int, ++ x0 = !svptest_last (p0, p1)); ++ ++/* ++** test_int64_t_last: ++** ptest p0, p1\.b ++** cset [wx]0, last ++** ret ++*/ ++TEST_PTEST (test_int64_t_last, int64_t, ++ x0 = svptest_last (p0, p1)); ++ ++/* ++** test_int64_t_nlast: ++** ptest p0, p1\.b ++** cset [wx]0, nlast ++** ret ++*/ ++TEST_PTEST (test_int64_t_nlast, int64_t, ++ x0 = !svptest_last (p0, p1)); ++ ++/* ++** sel_last: ++** ptest p0, p1\.b ++** csel x0, (x0, x1, last|x1, x0, nlast) ++** ret ++*/ ++TEST_PTEST (sel_last, int64_t, ++ x0 = svptest_last (p0, p1) ? x0 : x1); ++ ++/* ++** sel_nlast: ++** ptest p0, p1\.b ++** csel x0, (x0, x1, nlast|x1, x0, last) ++** ret ++*/ ++TEST_PTEST (sel_nlast, int64_t, ++ x0 = !svptest_last (p0, p1) ? x0 : x1); +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ptrue.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ptrue.c +new file mode 100644 +index 000000000..9c86170cb +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ptrue.c +@@ -0,0 +1,40 @@ ++/* { dg-additional-options "-msve-vector-bits=scalable" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ptrue_b8: ++** ptrue p0\.b, all ++** ret ++*/ ++TEST_P (ptrue_b8, ++ p0 = svptrue_b8 (), ++ p0 = svptrue_b8 ()); ++ ++/* ++** ptrue_b16: ++** ptrue p0\.h, all ++** ret ++*/ ++TEST_P (ptrue_b16, ++ p0 = svptrue_b16 (), ++ p0 = svptrue_b16 ()); ++ ++/* ++** ptrue_b32: ++** ptrue p0\.s, all ++** ret ++*/ ++TEST_P (ptrue_b32, ++ p0 = svptrue_b32 (), ++ p0 = svptrue_b32 ()); ++ ++/* ++** ptrue_b64: ++** ptrue p0\.d, all ++** ret ++*/ ++TEST_P (ptrue_b64, ++ p0 = svptrue_b64 (), ++ p0 = svptrue_b64 ()); +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ptrue_pat_b16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ptrue_pat_b16.c +new file mode 100644 +index 000000000..d7f83f5c6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ptrue_pat_b16.c +@@ -0,0 +1,156 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ptrue_pat_pow2_b16: ++** ptrue p0\.h, pow2 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_pow2_b16, ++ p0 = svptrue_pat_b16 (SV_POW2), ++ p0 = svptrue_pat_b16 (SV_POW2)) ++ ++/* ++** ptrue_pat_vl1_b16: ++** ptrue p0\.[bhsd], vl1 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl1_b16, ++ p0 = svptrue_pat_b16 (SV_VL1), ++ p0 = svptrue_pat_b16 (SV_VL1)) ++ ++/* ++** ptrue_pat_vl2_b16: ++** ptrue p0\.h, vl2 ++** ret ++*/ ++TEST_UNIFORM_P 
(ptrue_pat_vl2_b16, ++ p0 = svptrue_pat_b16 (SV_VL2), ++ p0 = svptrue_pat_b16 (SV_VL2)) ++ ++/* ++** ptrue_pat_vl3_b16: ++** ptrue p0\.h, vl3 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl3_b16, ++ p0 = svptrue_pat_b16 (SV_VL3), ++ p0 = svptrue_pat_b16 (SV_VL3)) ++ ++/* ++** ptrue_pat_vl4_b16: ++** ptrue p0\.h, vl4 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl4_b16, ++ p0 = svptrue_pat_b16 (SV_VL4), ++ p0 = svptrue_pat_b16 (SV_VL4)) ++ ++/* ++** ptrue_pat_vl5_b16: ++** ptrue p0\.h, vl5 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl5_b16, ++ p0 = svptrue_pat_b16 (SV_VL5), ++ p0 = svptrue_pat_b16 (SV_VL5)) ++ ++/* ++** ptrue_pat_vl6_b16: ++** ptrue p0\.h, vl6 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl6_b16, ++ p0 = svptrue_pat_b16 (SV_VL6), ++ p0 = svptrue_pat_b16 (SV_VL6)) ++ ++/* ++** ptrue_pat_vl7_b16: ++** ptrue p0\.h, vl7 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl7_b16, ++ p0 = svptrue_pat_b16 (SV_VL7), ++ p0 = svptrue_pat_b16 (SV_VL7)) ++ ++/* ++** ptrue_pat_vl8_b16: ++** ptrue p0\.h, vl8 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl8_b16, ++ p0 = svptrue_pat_b16 (SV_VL8), ++ p0 = svptrue_pat_b16 (SV_VL8)) ++ ++/* ++** ptrue_pat_vl16_b16: ++** ptrue p0\.[bhsd], vl16 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl16_b16, ++ p0 = svptrue_pat_b16 (SV_VL16), ++ p0 = svptrue_pat_b16 (SV_VL16)) ++ ++/* ++** ptrue_pat_vl32_b16: ++** ptrue p0\.h, vl32 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl32_b16, ++ p0 = svptrue_pat_b16 (SV_VL32), ++ p0 = svptrue_pat_b16 (SV_VL32)) ++ ++/* ++** ptrue_pat_vl64_b16: ++** ptrue p0\.h, vl64 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl64_b16, ++ p0 = svptrue_pat_b16 (SV_VL64), ++ p0 = svptrue_pat_b16 (SV_VL64)) ++ ++/* ++** ptrue_pat_vl128_b16: ++** ptrue p0\.[bhsd], vl128 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl128_b16, ++ p0 = svptrue_pat_b16 (SV_VL128), ++ p0 = svptrue_pat_b16 (SV_VL128)) ++ ++/* ++** ptrue_pat_vl256_b16: ++** ptrue p0\.h, vl256 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl256_b16, ++ p0 = svptrue_pat_b16 (SV_VL256), ++ p0 = svptrue_pat_b16 (SV_VL256)) ++ ++/* ++** ptrue_pat_mul4_b16: ++** ptrue p0\.h, mul4 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_mul4_b16, ++ p0 = svptrue_pat_b16 (SV_MUL4), ++ p0 = svptrue_pat_b16 (SV_MUL4)) ++ ++/* ++** ptrue_pat_mul3_b16: ++** ptrue p0\.h, mul3 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_mul3_b16, ++ p0 = svptrue_pat_b16 (SV_MUL3), ++ p0 = svptrue_pat_b16 (SV_MUL3)) ++ ++/* ++** ptrue_pat_all_b16: ++** ptrue p0\.h[^\n]* ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_all_b16, ++ p0 = svptrue_pat_b16 (SV_ALL), ++ p0 = svptrue_pat_b16 (SV_ALL)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ptrue_pat_b32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ptrue_pat_b32.c +new file mode 100644 +index 000000000..11cf5aebb +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ptrue_pat_b32.c +@@ -0,0 +1,156 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ptrue_pat_pow2_b32: ++** ptrue p0\.s, pow2 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_pow2_b32, ++ p0 = svptrue_pat_b32 (SV_POW2), ++ p0 = svptrue_pat_b32 (SV_POW2)) ++ ++/* ++** ptrue_pat_vl1_b32: ++** ptrue p0\.[bhsd], vl1 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl1_b32, ++ p0 = svptrue_pat_b32 (SV_VL1), ++ p0 = svptrue_pat_b32 (SV_VL1)) ++ ++/* ++** ptrue_pat_vl2_b32: ++** ptrue p0\.s, vl2 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl2_b32, ++ p0 = svptrue_pat_b32 (SV_VL2), ++ p0 = svptrue_pat_b32 (SV_VL2)) ++ ++/* ++** ptrue_pat_vl3_b32: ++** ptrue p0\.s, vl3 ++** ret 
++*/ ++TEST_UNIFORM_P (ptrue_pat_vl3_b32, ++ p0 = svptrue_pat_b32 (SV_VL3), ++ p0 = svptrue_pat_b32 (SV_VL3)) ++ ++/* ++** ptrue_pat_vl4_b32: ++** ptrue p0\.s, vl4 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl4_b32, ++ p0 = svptrue_pat_b32 (SV_VL4), ++ p0 = svptrue_pat_b32 (SV_VL4)) ++ ++/* ++** ptrue_pat_vl5_b32: ++** ptrue p0\.s, vl5 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl5_b32, ++ p0 = svptrue_pat_b32 (SV_VL5), ++ p0 = svptrue_pat_b32 (SV_VL5)) ++ ++/* ++** ptrue_pat_vl6_b32: ++** ptrue p0\.s, vl6 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl6_b32, ++ p0 = svptrue_pat_b32 (SV_VL6), ++ p0 = svptrue_pat_b32 (SV_VL6)) ++ ++/* ++** ptrue_pat_vl7_b32: ++** ptrue p0\.s, vl7 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl7_b32, ++ p0 = svptrue_pat_b32 (SV_VL7), ++ p0 = svptrue_pat_b32 (SV_VL7)) ++ ++/* ++** ptrue_pat_vl8_b32: ++** ptrue p0\.s, vl8 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl8_b32, ++ p0 = svptrue_pat_b32 (SV_VL8), ++ p0 = svptrue_pat_b32 (SV_VL8)) ++ ++/* ++** ptrue_pat_vl16_b32: ++** ptrue p0\.[bhsd], vl16 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl16_b32, ++ p0 = svptrue_pat_b32 (SV_VL16), ++ p0 = svptrue_pat_b32 (SV_VL16)) ++ ++/* ++** ptrue_pat_vl32_b32: ++** ptrue p0\.s, vl32 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl32_b32, ++ p0 = svptrue_pat_b32 (SV_VL32), ++ p0 = svptrue_pat_b32 (SV_VL32)) ++ ++/* ++** ptrue_pat_vl64_b32: ++** ptrue p0\.s, vl64 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl64_b32, ++ p0 = svptrue_pat_b32 (SV_VL64), ++ p0 = svptrue_pat_b32 (SV_VL64)) ++ ++/* ++** ptrue_pat_vl128_b32: ++** ptrue p0\.[bhsd], vl128 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl128_b32, ++ p0 = svptrue_pat_b32 (SV_VL128), ++ p0 = svptrue_pat_b32 (SV_VL128)) ++ ++/* ++** ptrue_pat_vl256_b32: ++** ptrue p0\.s, vl256 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl256_b32, ++ p0 = svptrue_pat_b32 (SV_VL256), ++ p0 = svptrue_pat_b32 (SV_VL256)) ++ ++/* ++** ptrue_pat_mul4_b32: ++** ptrue p0\.s, mul4 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_mul4_b32, ++ p0 = svptrue_pat_b32 (SV_MUL4), ++ p0 = svptrue_pat_b32 (SV_MUL4)) ++ ++/* ++** ptrue_pat_mul3_b32: ++** ptrue p0\.s, mul3 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_mul3_b32, ++ p0 = svptrue_pat_b32 (SV_MUL3), ++ p0 = svptrue_pat_b32 (SV_MUL3)) ++ ++/* ++** ptrue_pat_all_b32: ++** ptrue p0\.s[^\n]* ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_all_b32, ++ p0 = svptrue_pat_b32 (SV_ALL), ++ p0 = svptrue_pat_b32 (SV_ALL)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ptrue_pat_b64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ptrue_pat_b64.c +new file mode 100644 +index 000000000..4c4202bb3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ptrue_pat_b64.c +@@ -0,0 +1,156 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ptrue_pat_pow2_b64: ++** ptrue p0\.d, pow2 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_pow2_b64, ++ p0 = svptrue_pat_b64 (SV_POW2), ++ p0 = svptrue_pat_b64 (SV_POW2)) ++ ++/* ++** ptrue_pat_vl1_b64: ++** ptrue p0\.[bhsd], vl1 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl1_b64, ++ p0 = svptrue_pat_b64 (SV_VL1), ++ p0 = svptrue_pat_b64 (SV_VL1)) ++ ++/* ++** ptrue_pat_vl2_b64: ++** ptrue p0\.d, vl2 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl2_b64, ++ p0 = svptrue_pat_b64 (SV_VL2), ++ p0 = svptrue_pat_b64 (SV_VL2)) ++ ++/* ++** ptrue_pat_vl3_b64: ++** ptrue p0\.d, vl3 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl3_b64, ++ p0 = svptrue_pat_b64 (SV_VL3), ++ p0 = svptrue_pat_b64 (SV_VL3)) ++ ++/* ++** ptrue_pat_vl4_b64: ++** ptrue 
p0\.d, vl4 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl4_b64, ++ p0 = svptrue_pat_b64 (SV_VL4), ++ p0 = svptrue_pat_b64 (SV_VL4)) ++ ++/* ++** ptrue_pat_vl5_b64: ++** ptrue p0\.d, vl5 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl5_b64, ++ p0 = svptrue_pat_b64 (SV_VL5), ++ p0 = svptrue_pat_b64 (SV_VL5)) ++ ++/* ++** ptrue_pat_vl6_b64: ++** ptrue p0\.d, vl6 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl6_b64, ++ p0 = svptrue_pat_b64 (SV_VL6), ++ p0 = svptrue_pat_b64 (SV_VL6)) ++ ++/* ++** ptrue_pat_vl7_b64: ++** ptrue p0\.d, vl7 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl7_b64, ++ p0 = svptrue_pat_b64 (SV_VL7), ++ p0 = svptrue_pat_b64 (SV_VL7)) ++ ++/* ++** ptrue_pat_vl8_b64: ++** ptrue p0\.d, vl8 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl8_b64, ++ p0 = svptrue_pat_b64 (SV_VL8), ++ p0 = svptrue_pat_b64 (SV_VL8)) ++ ++/* ++** ptrue_pat_vl16_b64: ++** ptrue p0\.[bhsd], vl16 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl16_b64, ++ p0 = svptrue_pat_b64 (SV_VL16), ++ p0 = svptrue_pat_b64 (SV_VL16)) ++ ++/* ++** ptrue_pat_vl32_b64: ++** ptrue p0\.d, vl32 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl32_b64, ++ p0 = svptrue_pat_b64 (SV_VL32), ++ p0 = svptrue_pat_b64 (SV_VL32)) ++ ++/* ++** ptrue_pat_vl64_b64: ++** ptrue p0\.d, vl64 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl64_b64, ++ p0 = svptrue_pat_b64 (SV_VL64), ++ p0 = svptrue_pat_b64 (SV_VL64)) ++ ++/* ++** ptrue_pat_vl128_b64: ++** ptrue p0\.[bhsd], vl128 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl128_b64, ++ p0 = svptrue_pat_b64 (SV_VL128), ++ p0 = svptrue_pat_b64 (SV_VL128)) ++ ++/* ++** ptrue_pat_vl256_b64: ++** ptrue p0\.d, vl256 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl256_b64, ++ p0 = svptrue_pat_b64 (SV_VL256), ++ p0 = svptrue_pat_b64 (SV_VL256)) ++ ++/* ++** ptrue_pat_mul4_b64: ++** ptrue p0\.d, mul4 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_mul4_b64, ++ p0 = svptrue_pat_b64 (SV_MUL4), ++ p0 = svptrue_pat_b64 (SV_MUL4)) ++ ++/* ++** ptrue_pat_mul3_b64: ++** ptrue p0\.d, mul3 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_mul3_b64, ++ p0 = svptrue_pat_b64 (SV_MUL3), ++ p0 = svptrue_pat_b64 (SV_MUL3)) ++ ++/* ++** ptrue_pat_all_b64: ++** ptrue p0\.d[^\n]* ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_all_b64, ++ p0 = svptrue_pat_b64 (SV_ALL), ++ p0 = svptrue_pat_b64 (SV_ALL)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ptrue_pat_b8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ptrue_pat_b8.c +new file mode 100644 +index 000000000..49fb8c555 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ptrue_pat_b8.c +@@ -0,0 +1,156 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** ptrue_pat_pow2_b8: ++** ptrue p0\.b, pow2 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_pow2_b8, ++ p0 = svptrue_pat_b8 (SV_POW2), ++ p0 = svptrue_pat_b8 (SV_POW2)) ++ ++/* ++** ptrue_pat_vl1_b8: ++** ptrue p0\.[bhsd], vl1 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl1_b8, ++ p0 = svptrue_pat_b8 (SV_VL1), ++ p0 = svptrue_pat_b8 (SV_VL1)) ++ ++/* ++** ptrue_pat_vl2_b8: ++** ptrue p0\.b, vl2 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl2_b8, ++ p0 = svptrue_pat_b8 (SV_VL2), ++ p0 = svptrue_pat_b8 (SV_VL2)) ++ ++/* ++** ptrue_pat_vl3_b8: ++** ptrue p0\.b, vl3 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl3_b8, ++ p0 = svptrue_pat_b8 (SV_VL3), ++ p0 = svptrue_pat_b8 (SV_VL3)) ++ ++/* ++** ptrue_pat_vl4_b8: ++** ptrue p0\.b, vl4 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl4_b8, ++ p0 = svptrue_pat_b8 (SV_VL4), ++ p0 = svptrue_pat_b8 (SV_VL4)) ++ ++/* ++** ptrue_pat_vl5_b8: ++** ptrue 
p0\.b, vl5 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl5_b8, ++ p0 = svptrue_pat_b8 (SV_VL5), ++ p0 = svptrue_pat_b8 (SV_VL5)) ++ ++/* ++** ptrue_pat_vl6_b8: ++** ptrue p0\.b, vl6 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl6_b8, ++ p0 = svptrue_pat_b8 (SV_VL6), ++ p0 = svptrue_pat_b8 (SV_VL6)) ++ ++/* ++** ptrue_pat_vl7_b8: ++** ptrue p0\.b, vl7 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl7_b8, ++ p0 = svptrue_pat_b8 (SV_VL7), ++ p0 = svptrue_pat_b8 (SV_VL7)) ++ ++/* ++** ptrue_pat_vl8_b8: ++** ptrue p0\.b, vl8 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl8_b8, ++ p0 = svptrue_pat_b8 (SV_VL8), ++ p0 = svptrue_pat_b8 (SV_VL8)) ++ ++/* ++** ptrue_pat_vl16_b8: ++** ptrue p0\.[bhsd], vl16 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl16_b8, ++ p0 = svptrue_pat_b8 (SV_VL16), ++ p0 = svptrue_pat_b8 (SV_VL16)) ++ ++/* ++** ptrue_pat_vl32_b8: ++** ptrue p0\.b, vl32 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl32_b8, ++ p0 = svptrue_pat_b8 (SV_VL32), ++ p0 = svptrue_pat_b8 (SV_VL32)) ++ ++/* ++** ptrue_pat_vl64_b8: ++** ptrue p0\.b, vl64 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl64_b8, ++ p0 = svptrue_pat_b8 (SV_VL64), ++ p0 = svptrue_pat_b8 (SV_VL64)) ++ ++/* ++** ptrue_pat_vl128_b8: ++** ptrue p0\.[bhsd], vl128 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl128_b8, ++ p0 = svptrue_pat_b8 (SV_VL128), ++ p0 = svptrue_pat_b8 (SV_VL128)) ++ ++/* ++** ptrue_pat_vl256_b8: ++** ptrue p0\.b, vl256 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_vl256_b8, ++ p0 = svptrue_pat_b8 (SV_VL256), ++ p0 = svptrue_pat_b8 (SV_VL256)) ++ ++/* ++** ptrue_pat_mul4_b8: ++** ptrue p0\.b, mul4 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_mul4_b8, ++ p0 = svptrue_pat_b8 (SV_MUL4), ++ p0 = svptrue_pat_b8 (SV_MUL4)) ++ ++/* ++** ptrue_pat_mul3_b8: ++** ptrue p0\.b, mul3 ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_mul3_b8, ++ p0 = svptrue_pat_b8 (SV_MUL3), ++ p0 = svptrue_pat_b8 (SV_MUL3)) ++ ++/* ++** ptrue_pat_all_b8: ++** ptrue p0\.b[^\n]* ++** ret ++*/ ++TEST_UNIFORM_P (ptrue_pat_all_b8, ++ p0 = svptrue_pat_b8 (SV_ALL), ++ p0 = svptrue_pat_b8 (SV_ALL)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qadd_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qadd_s16.c +new file mode 100644 +index 000000000..03255c41c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qadd_s16.c +@@ -0,0 +1,123 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qadd_s16_tied1: ++** sqadd z0\.h, (z0\.h, z1\.h|z1\.h, z0\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_s16_tied1, svint16_t, ++ z0 = svqadd_s16 (z0, z1), ++ z0 = svqadd (z0, z1)) ++ ++/* ++** qadd_s16_tied2: ++** sqadd z0\.h, (z0\.h, z1\.h|z1\.h, z0\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_s16_tied2, svint16_t, ++ z0 = svqadd_s16 (z1, z0), ++ z0 = svqadd (z1, z0)) ++ ++/* ++** qadd_s16_untied: ++** sqadd z0\.h, (z1\.h, z2\.h|z2\.h, z1\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_s16_untied, svint16_t, ++ z0 = svqadd_s16 (z1, z2), ++ z0 = svqadd (z1, z2)) ++ ++/* ++** qadd_w0_s16_tied1: ++** mov (z[0-9]+\.h), w0 ++** sqadd z0\.h, (z0\.h, \1|\1, z0\.h) ++** ret ++*/ ++TEST_UNIFORM_ZX (qadd_w0_s16_tied1, svint16_t, int16_t, ++ z0 = svqadd_n_s16 (z0, x0), ++ z0 = svqadd (z0, x0)) ++ ++/* ++** qadd_w0_s16_untied: ++** mov (z[0-9]+\.h), w0 ++** sqadd z0\.h, (z1\.h, \1|\1, z1\.h) ++** ret ++*/ ++TEST_UNIFORM_ZX (qadd_w0_s16_untied, svint16_t, int16_t, ++ z0 = svqadd_n_s16 (z1, x0), ++ z0 = svqadd (z1, x0)) ++ ++/* ++** qadd_1_s16_tied1: ++** sqadd z0\.h, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_1_s16_tied1, 
svint16_t, ++ z0 = svqadd_n_s16 (z0, 1), ++ z0 = svqadd (z0, 1)) ++ ++/* ++** qadd_1_s16_untied: ++** movprfx z0, z1 ++** sqadd z0\.h, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_1_s16_untied, svint16_t, ++ z0 = svqadd_n_s16 (z1, 1), ++ z0 = svqadd (z1, 1)) ++ ++/* ++** qadd_127_s16: ++** sqadd z0\.h, z0\.h, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_127_s16, svint16_t, ++ z0 = svqadd_n_s16 (z0, 127), ++ z0 = svqadd (z0, 127)) ++ ++/* ++** qadd_128_s16: ++** sqadd z0\.h, z0\.h, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_128_s16, svint16_t, ++ z0 = svqadd_n_s16 (z0, 128), ++ z0 = svqadd (z0, 128)) ++ ++/* ++** qadd_255_s16: ++** sqadd z0\.h, z0\.h, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_255_s16, svint16_t, ++ z0 = svqadd_n_s16 (z0, 255), ++ z0 = svqadd (z0, 255)) ++ ++/* ++** qadd_m1_s16: ++** sqsub z0\.h, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_m1_s16, svint16_t, ++ z0 = svqadd_n_s16 (z0, -1), ++ z0 = svqadd (z0, -1)) ++ ++/* ++** qadd_m127_s16: ++** sqsub z0\.h, z0\.h, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_m127_s16, svint16_t, ++ z0 = svqadd_n_s16 (z0, -127), ++ z0 = svqadd (z0, -127)) ++ ++/* ++** qadd_m128_s16: ++** sqsub z0\.h, z0\.h, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_m128_s16, svint16_t, ++ z0 = svqadd_n_s16 (z0, -128), ++ z0 = svqadd (z0, -128)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qadd_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qadd_s32.c +new file mode 100644 +index 000000000..197cc3840 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qadd_s32.c +@@ -0,0 +1,123 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qadd_s32_tied1: ++** sqadd z0\.s, (z0\.s, z1\.s|z1\.s, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_s32_tied1, svint32_t, ++ z0 = svqadd_s32 (z0, z1), ++ z0 = svqadd (z0, z1)) ++ ++/* ++** qadd_s32_tied2: ++** sqadd z0\.s, (z0\.s, z1\.s|z1\.s, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_s32_tied2, svint32_t, ++ z0 = svqadd_s32 (z1, z0), ++ z0 = svqadd (z1, z0)) ++ ++/* ++** qadd_s32_untied: ++** sqadd z0\.s, (z1\.s, z2\.s|z2\.s, z1\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_s32_untied, svint32_t, ++ z0 = svqadd_s32 (z1, z2), ++ z0 = svqadd (z1, z2)) ++ ++/* ++** qadd_w0_s32_tied1: ++** mov (z[0-9]+\.s), w0 ++** sqadd z0\.s, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_ZX (qadd_w0_s32_tied1, svint32_t, int32_t, ++ z0 = svqadd_n_s32 (z0, x0), ++ z0 = svqadd (z0, x0)) ++ ++/* ++** qadd_w0_s32_untied: ++** mov (z[0-9]+\.s), w0 ++** sqadd z0\.s, (z1\.s, \1|\1, z1\.s) ++** ret ++*/ ++TEST_UNIFORM_ZX (qadd_w0_s32_untied, svint32_t, int32_t, ++ z0 = svqadd_n_s32 (z1, x0), ++ z0 = svqadd (z1, x0)) ++ ++/* ++** qadd_1_s32_tied1: ++** sqadd z0\.s, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_1_s32_tied1, svint32_t, ++ z0 = svqadd_n_s32 (z0, 1), ++ z0 = svqadd (z0, 1)) ++ ++/* ++** qadd_1_s32_untied: ++** movprfx z0, z1 ++** sqadd z0\.s, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_1_s32_untied, svint32_t, ++ z0 = svqadd_n_s32 (z1, 1), ++ z0 = svqadd (z1, 1)) ++ ++/* ++** qadd_127_s32: ++** sqadd z0\.s, z0\.s, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_127_s32, svint32_t, ++ z0 = svqadd_n_s32 (z0, 127), ++ z0 = svqadd (z0, 127)) ++ ++/* ++** qadd_128_s32: ++** sqadd z0\.s, z0\.s, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_128_s32, svint32_t, ++ z0 = svqadd_n_s32 (z0, 128), ++ z0 = svqadd (z0, 128)) ++ ++/* ++** qadd_255_s32: ++** sqadd z0\.s, z0\.s, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_255_s32, svint32_t, ++ z0 = svqadd_n_s32 (z0, 255), 
++ z0 = svqadd (z0, 255)) ++ ++/* ++** qadd_m1_s32: ++** sqsub z0\.s, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_m1_s32, svint32_t, ++ z0 = svqadd_n_s32 (z0, -1), ++ z0 = svqadd (z0, -1)) ++ ++/* ++** qadd_m127_s32: ++** sqsub z0\.s, z0\.s, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_m127_s32, svint32_t, ++ z0 = svqadd_n_s32 (z0, -127), ++ z0 = svqadd (z0, -127)) ++ ++/* ++** qadd_m128_s32: ++** sqsub z0\.s, z0\.s, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_m128_s32, svint32_t, ++ z0 = svqadd_n_s32 (z0, -128), ++ z0 = svqadd (z0, -128)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qadd_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qadd_s64.c +new file mode 100644 +index 000000000..0218866ea +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qadd_s64.c +@@ -0,0 +1,123 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qadd_s64_tied1: ++** sqadd z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_s64_tied1, svint64_t, ++ z0 = svqadd_s64 (z0, z1), ++ z0 = svqadd (z0, z1)) ++ ++/* ++** qadd_s64_tied2: ++** sqadd z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_s64_tied2, svint64_t, ++ z0 = svqadd_s64 (z1, z0), ++ z0 = svqadd (z1, z0)) ++ ++/* ++** qadd_s64_untied: ++** sqadd z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_s64_untied, svint64_t, ++ z0 = svqadd_s64 (z1, z2), ++ z0 = svqadd (z1, z2)) ++ ++/* ++** qadd_x0_s64_tied1: ++** mov (z[0-9]+\.d), x0 ++** sqadd z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (qadd_x0_s64_tied1, svint64_t, int64_t, ++ z0 = svqadd_n_s64 (z0, x0), ++ z0 = svqadd (z0, x0)) ++ ++/* ++** qadd_x0_s64_untied: ++** mov (z[0-9]+\.d), x0 ++** sqadd z0\.d, (z1\.d, \1|\1, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (qadd_x0_s64_untied, svint64_t, int64_t, ++ z0 = svqadd_n_s64 (z1, x0), ++ z0 = svqadd (z1, x0)) ++ ++/* ++** qadd_1_s64_tied1: ++** sqadd z0\.d, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_1_s64_tied1, svint64_t, ++ z0 = svqadd_n_s64 (z0, 1), ++ z0 = svqadd (z0, 1)) ++ ++/* ++** qadd_1_s64_untied: ++** movprfx z0, z1 ++** sqadd z0\.d, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_1_s64_untied, svint64_t, ++ z0 = svqadd_n_s64 (z1, 1), ++ z0 = svqadd (z1, 1)) ++ ++/* ++** qadd_127_s64: ++** sqadd z0\.d, z0\.d, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_127_s64, svint64_t, ++ z0 = svqadd_n_s64 (z0, 127), ++ z0 = svqadd (z0, 127)) ++ ++/* ++** qadd_128_s64: ++** sqadd z0\.d, z0\.d, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_128_s64, svint64_t, ++ z0 = svqadd_n_s64 (z0, 128), ++ z0 = svqadd (z0, 128)) ++ ++/* ++** qadd_255_s64: ++** sqadd z0\.d, z0\.d, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_255_s64, svint64_t, ++ z0 = svqadd_n_s64 (z0, 255), ++ z0 = svqadd (z0, 255)) ++ ++/* ++** qadd_m1_s64: ++** sqsub z0\.d, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_m1_s64, svint64_t, ++ z0 = svqadd_n_s64 (z0, -1), ++ z0 = svqadd (z0, -1)) ++ ++/* ++** qadd_m127_s64: ++** sqsub z0\.d, z0\.d, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_m127_s64, svint64_t, ++ z0 = svqadd_n_s64 (z0, -127), ++ z0 = svqadd (z0, -127)) ++ ++/* ++** qadd_m128_s64: ++** sqsub z0\.d, z0\.d, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_m128_s64, svint64_t, ++ z0 = svqadd_n_s64 (z0, -128), ++ z0 = svqadd (z0, -128)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qadd_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qadd_s8.c +new file mode 100644 +index 000000000..c8b88fa82 +--- /dev/null ++++ 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qadd_s8.c +@@ -0,0 +1,123 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qadd_s8_tied1: ++** sqadd z0\.b, (z0\.b, z1\.b|z1\.b, z0\.b) ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_s8_tied1, svint8_t, ++ z0 = svqadd_s8 (z0, z1), ++ z0 = svqadd (z0, z1)) ++ ++/* ++** qadd_s8_tied2: ++** sqadd z0\.b, (z0\.b, z1\.b|z1\.b, z0\.b) ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_s8_tied2, svint8_t, ++ z0 = svqadd_s8 (z1, z0), ++ z0 = svqadd (z1, z0)) ++ ++/* ++** qadd_s8_untied: ++** sqadd z0\.b, (z1\.b, z2\.b|z2\.b, z1\.b) ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_s8_untied, svint8_t, ++ z0 = svqadd_s8 (z1, z2), ++ z0 = svqadd (z1, z2)) ++ ++/* ++** qadd_w0_s8_tied1: ++** mov (z[0-9]+\.b), w0 ++** sqadd z0\.b, (z0\.b, \1|\1, z0\.b) ++** ret ++*/ ++TEST_UNIFORM_ZX (qadd_w0_s8_tied1, svint8_t, int8_t, ++ z0 = svqadd_n_s8 (z0, x0), ++ z0 = svqadd (z0, x0)) ++ ++/* ++** qadd_w0_s8_untied: ++** mov (z[0-9]+\.b), w0 ++** sqadd z0\.b, (z1\.b, \1|\1, z1\.b) ++** ret ++*/ ++TEST_UNIFORM_ZX (qadd_w0_s8_untied, svint8_t, int8_t, ++ z0 = svqadd_n_s8 (z1, x0), ++ z0 = svqadd (z1, x0)) ++ ++/* ++** qadd_1_s8_tied1: ++** sqadd z0\.b, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_1_s8_tied1, svint8_t, ++ z0 = svqadd_n_s8 (z0, 1), ++ z0 = svqadd (z0, 1)) ++ ++/* ++** qadd_1_s8_untied: ++** movprfx z0, z1 ++** sqadd z0\.b, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_1_s8_untied, svint8_t, ++ z0 = svqadd_n_s8 (z1, 1), ++ z0 = svqadd (z1, 1)) ++ ++/* ++** qadd_127_s8: ++** sqadd z0\.b, z0\.b, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_127_s8, svint8_t, ++ z0 = svqadd_n_s8 (z0, 127), ++ z0 = svqadd (z0, 127)) ++ ++/* ++** qadd_128_s8: ++** sqsub z0\.b, z0\.b, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_128_s8, svint8_t, ++ z0 = svqadd_n_s8 (z0, 128), ++ z0 = svqadd (z0, 128)) ++ ++/* ++** qadd_255_s8: ++** sqsub z0\.b, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_255_s8, svint8_t, ++ z0 = svqadd_n_s8 (z0, 255), ++ z0 = svqadd (z0, 255)) ++ ++/* ++** qadd_m1_s8: ++** sqsub z0\.b, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_m1_s8, svint8_t, ++ z0 = svqadd_n_s8 (z0, -1), ++ z0 = svqadd (z0, -1)) ++ ++/* ++** qadd_m127_s8: ++** sqsub z0\.b, z0\.b, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_m127_s8, svint8_t, ++ z0 = svqadd_n_s8 (z0, -127), ++ z0 = svqadd (z0, -127)) ++ ++/* ++** qadd_m128_s8: ++** sqsub z0\.b, z0\.b, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_m128_s8, svint8_t, ++ z0 = svqadd_n_s8 (z0, -128), ++ z0 = svqadd (z0, -128)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qadd_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qadd_u16.c +new file mode 100644 +index 000000000..dd7bc5b6a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qadd_u16.c +@@ -0,0 +1,126 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qadd_u16_tied1: ++** uqadd z0\.h, (z0\.h, z1\.h|z1\.h, z0\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_u16_tied1, svuint16_t, ++ z0 = svqadd_u16 (z0, z1), ++ z0 = svqadd (z0, z1)) ++ ++/* ++** qadd_u16_tied2: ++** uqadd z0\.h, (z0\.h, z1\.h|z1\.h, z0\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_u16_tied2, svuint16_t, ++ z0 = svqadd_u16 (z1, z0), ++ z0 = svqadd (z1, z0)) ++ ++/* ++** qadd_u16_untied: ++** uqadd z0\.h, (z1\.h, z2\.h|z2\.h, z1\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_u16_untied, svuint16_t, ++ z0 = svqadd_u16 (z1, z2), ++ z0 = svqadd (z1, z2)) ++ ++/* ++** qadd_w0_u16_tied1: ++** mov (z[0-9]+\.h), w0 
++** uqadd z0\.h, (z0\.h, \1|\1, z0\.h) ++** ret ++*/ ++TEST_UNIFORM_ZX (qadd_w0_u16_tied1, svuint16_t, uint16_t, ++ z0 = svqadd_n_u16 (z0, x0), ++ z0 = svqadd (z0, x0)) ++ ++/* ++** qadd_w0_u16_untied: ++** mov (z[0-9]+\.h), w0 ++** uqadd z0\.h, (z1\.h, \1|\1, z1\.h) ++** ret ++*/ ++TEST_UNIFORM_ZX (qadd_w0_u16_untied, svuint16_t, uint16_t, ++ z0 = svqadd_n_u16 (z1, x0), ++ z0 = svqadd (z1, x0)) ++ ++/* ++** qadd_1_u16_tied1: ++** uqadd z0\.h, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_1_u16_tied1, svuint16_t, ++ z0 = svqadd_n_u16 (z0, 1), ++ z0 = svqadd (z0, 1)) ++ ++/* ++** qadd_1_u16_untied: ++** movprfx z0, z1 ++** uqadd z0\.h, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_1_u16_untied, svuint16_t, ++ z0 = svqadd_n_u16 (z1, 1), ++ z0 = svqadd (z1, 1)) ++ ++/* ++** qadd_127_u16: ++** uqadd z0\.h, z0\.h, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_127_u16, svuint16_t, ++ z0 = svqadd_n_u16 (z0, 127), ++ z0 = svqadd (z0, 127)) ++ ++/* ++** qadd_128_u16: ++** uqadd z0\.h, z0\.h, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_128_u16, svuint16_t, ++ z0 = svqadd_n_u16 (z0, 128), ++ z0 = svqadd (z0, 128)) ++ ++/* ++** qadd_255_u16: ++** uqadd z0\.h, z0\.h, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_255_u16, svuint16_t, ++ z0 = svqadd_n_u16 (z0, 255), ++ z0 = svqadd (z0, 255)) ++ ++/* ++** qadd_m1_u16: ++** mov (z[0-9]+)\.b, #-1 ++** uqadd z0\.h, (z0\.h, \1\.h|\1\.h, z0\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_m1_u16, svuint16_t, ++ z0 = svqadd_n_u16 (z0, -1), ++ z0 = svqadd (z0, -1)) ++ ++/* ++** qadd_m127_u16: ++** mov (z[0-9]+\.h), #-127 ++** uqadd z0\.h, (z0\.h, \1|\1, z0\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_m127_u16, svuint16_t, ++ z0 = svqadd_n_u16 (z0, -127), ++ z0 = svqadd (z0, -127)) ++ ++/* ++** qadd_m128_u16: ++** mov (z[0-9]+\.h), #-128 ++** uqadd z0\.h, (z0\.h, \1|\1, z0\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_m128_u16, svuint16_t, ++ z0 = svqadd_n_u16 (z0, -128), ++ z0 = svqadd (z0, -128)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qadd_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qadd_u32.c +new file mode 100644 +index 000000000..0f846e44e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qadd_u32.c +@@ -0,0 +1,126 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qadd_u32_tied1: ++** uqadd z0\.s, (z0\.s, z1\.s|z1\.s, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_u32_tied1, svuint32_t, ++ z0 = svqadd_u32 (z0, z1), ++ z0 = svqadd (z0, z1)) ++ ++/* ++** qadd_u32_tied2: ++** uqadd z0\.s, (z0\.s, z1\.s|z1\.s, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_u32_tied2, svuint32_t, ++ z0 = svqadd_u32 (z1, z0), ++ z0 = svqadd (z1, z0)) ++ ++/* ++** qadd_u32_untied: ++** uqadd z0\.s, (z1\.s, z2\.s|z2\.s, z1\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_u32_untied, svuint32_t, ++ z0 = svqadd_u32 (z1, z2), ++ z0 = svqadd (z1, z2)) ++ ++/* ++** qadd_w0_u32_tied1: ++** mov (z[0-9]+\.s), w0 ++** uqadd z0\.s, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_ZX (qadd_w0_u32_tied1, svuint32_t, uint32_t, ++ z0 = svqadd_n_u32 (z0, x0), ++ z0 = svqadd (z0, x0)) ++ ++/* ++** qadd_w0_u32_untied: ++** mov (z[0-9]+\.s), w0 ++** uqadd z0\.s, (z1\.s, \1|\1, z1\.s) ++** ret ++*/ ++TEST_UNIFORM_ZX (qadd_w0_u32_untied, svuint32_t, uint32_t, ++ z0 = svqadd_n_u32 (z1, x0), ++ z0 = svqadd (z1, x0)) ++ ++/* ++** qadd_1_u32_tied1: ++** uqadd z0\.s, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_1_u32_tied1, svuint32_t, ++ z0 = svqadd_n_u32 (z0, 1), ++ z0 = svqadd (z0, 1)) ++ ++/* ++** qadd_1_u32_untied: ++** 
movprfx z0, z1 ++** uqadd z0\.s, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_1_u32_untied, svuint32_t, ++ z0 = svqadd_n_u32 (z1, 1), ++ z0 = svqadd (z1, 1)) ++ ++/* ++** qadd_127_u32: ++** uqadd z0\.s, z0\.s, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_127_u32, svuint32_t, ++ z0 = svqadd_n_u32 (z0, 127), ++ z0 = svqadd (z0, 127)) ++ ++/* ++** qadd_128_u32: ++** uqadd z0\.s, z0\.s, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_128_u32, svuint32_t, ++ z0 = svqadd_n_u32 (z0, 128), ++ z0 = svqadd (z0, 128)) ++ ++/* ++** qadd_255_u32: ++** uqadd z0\.s, z0\.s, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_255_u32, svuint32_t, ++ z0 = svqadd_n_u32 (z0, 255), ++ z0 = svqadd (z0, 255)) ++ ++/* ++** qadd_m1_u32: ++** mov (z[0-9]+)\.b, #-1 ++** uqadd z0\.s, (z0\.s, \1\.s|\1\.s, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_m1_u32, svuint32_t, ++ z0 = svqadd_n_u32 (z0, -1), ++ z0 = svqadd (z0, -1)) ++ ++/* ++** qadd_m127_u32: ++** mov (z[0-9]+\.s), #-127 ++** uqadd z0\.s, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_m127_u32, svuint32_t, ++ z0 = svqadd_n_u32 (z0, -127), ++ z0 = svqadd (z0, -127)) ++ ++/* ++** qadd_m128_u32: ++** mov (z[0-9]+\.s), #-128 ++** uqadd z0\.s, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_m128_u32, svuint32_t, ++ z0 = svqadd_n_u32 (z0, -128), ++ z0 = svqadd (z0, -128)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qadd_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qadd_u64.c +new file mode 100644 +index 000000000..454fb1d63 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qadd_u64.c +@@ -0,0 +1,126 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qadd_u64_tied1: ++** uqadd z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_u64_tied1, svuint64_t, ++ z0 = svqadd_u64 (z0, z1), ++ z0 = svqadd (z0, z1)) ++ ++/* ++** qadd_u64_tied2: ++** uqadd z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_u64_tied2, svuint64_t, ++ z0 = svqadd_u64 (z1, z0), ++ z0 = svqadd (z1, z0)) ++ ++/* ++** qadd_u64_untied: ++** uqadd z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_u64_untied, svuint64_t, ++ z0 = svqadd_u64 (z1, z2), ++ z0 = svqadd (z1, z2)) ++ ++/* ++** qadd_x0_u64_tied1: ++** mov (z[0-9]+\.d), x0 ++** uqadd z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (qadd_x0_u64_tied1, svuint64_t, uint64_t, ++ z0 = svqadd_n_u64 (z0, x0), ++ z0 = svqadd (z0, x0)) ++ ++/* ++** qadd_x0_u64_untied: ++** mov (z[0-9]+\.d), x0 ++** uqadd z0\.d, (z1\.d, \1|\1, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_ZX (qadd_x0_u64_untied, svuint64_t, uint64_t, ++ z0 = svqadd_n_u64 (z1, x0), ++ z0 = svqadd (z1, x0)) ++ ++/* ++** qadd_1_u64_tied1: ++** uqadd z0\.d, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_1_u64_tied1, svuint64_t, ++ z0 = svqadd_n_u64 (z0, 1), ++ z0 = svqadd (z0, 1)) ++ ++/* ++** qadd_1_u64_untied: ++** movprfx z0, z1 ++** uqadd z0\.d, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_1_u64_untied, svuint64_t, ++ z0 = svqadd_n_u64 (z1, 1), ++ z0 = svqadd (z1, 1)) ++ ++/* ++** qadd_127_u64: ++** uqadd z0\.d, z0\.d, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_127_u64, svuint64_t, ++ z0 = svqadd_n_u64 (z0, 127), ++ z0 = svqadd (z0, 127)) ++ ++/* ++** qadd_128_u64: ++** uqadd z0\.d, z0\.d, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_128_u64, svuint64_t, ++ z0 = svqadd_n_u64 (z0, 128), ++ z0 = svqadd (z0, 128)) ++ ++/* ++** qadd_255_u64: ++** uqadd z0\.d, z0\.d, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_255_u64, 
svuint64_t, ++ z0 = svqadd_n_u64 (z0, 255), ++ z0 = svqadd (z0, 255)) ++ ++/* ++** qadd_m1_u64: ++** mov (z[0-9]+)\.b, #-1 ++** uqadd z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_m1_u64, svuint64_t, ++ z0 = svqadd_n_u64 (z0, -1), ++ z0 = svqadd (z0, -1)) ++ ++/* ++** qadd_m127_u64: ++** mov (z[0-9]+\.d), #-127 ++** uqadd z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_m127_u64, svuint64_t, ++ z0 = svqadd_n_u64 (z0, -127), ++ z0 = svqadd (z0, -127)) ++ ++/* ++** qadd_m128_u64: ++** mov (z[0-9]+\.d), #-128 ++** uqadd z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_m128_u64, svuint64_t, ++ z0 = svqadd_n_u64 (z0, -128), ++ z0 = svqadd (z0, -128)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qadd_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qadd_u8.c +new file mode 100644 +index 000000000..e86b8988c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qadd_u8.c +@@ -0,0 +1,123 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qadd_u8_tied1: ++** uqadd z0\.b, (z0\.b, z1\.b|z1\.b, z0\.b) ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_u8_tied1, svuint8_t, ++ z0 = svqadd_u8 (z0, z1), ++ z0 = svqadd (z0, z1)) ++ ++/* ++** qadd_u8_tied2: ++** uqadd z0\.b, (z0\.b, z1\.b|z1\.b, z0\.b) ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_u8_tied2, svuint8_t, ++ z0 = svqadd_u8 (z1, z0), ++ z0 = svqadd (z1, z0)) ++ ++/* ++** qadd_u8_untied: ++** uqadd z0\.b, (z1\.b, z2\.b|z2\.b, z1\.b) ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_u8_untied, svuint8_t, ++ z0 = svqadd_u8 (z1, z2), ++ z0 = svqadd (z1, z2)) ++ ++/* ++** qadd_w0_u8_tied1: ++** mov (z[0-9]+\.b), w0 ++** uqadd z0\.b, (z0\.b, \1|\1, z0\.b) ++** ret ++*/ ++TEST_UNIFORM_ZX (qadd_w0_u8_tied1, svuint8_t, uint8_t, ++ z0 = svqadd_n_u8 (z0, x0), ++ z0 = svqadd (z0, x0)) ++ ++/* ++** qadd_w0_u8_untied: ++** mov (z[0-9]+\.b), w0 ++** uqadd z0\.b, (z1\.b, \1|\1, z1\.b) ++** ret ++*/ ++TEST_UNIFORM_ZX (qadd_w0_u8_untied, svuint8_t, uint8_t, ++ z0 = svqadd_n_u8 (z1, x0), ++ z0 = svqadd (z1, x0)) ++ ++/* ++** qadd_1_u8_tied1: ++** uqadd z0\.b, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_1_u8_tied1, svuint8_t, ++ z0 = svqadd_n_u8 (z0, 1), ++ z0 = svqadd (z0, 1)) ++ ++/* ++** qadd_1_u8_untied: ++** movprfx z0, z1 ++** uqadd z0\.b, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_1_u8_untied, svuint8_t, ++ z0 = svqadd_n_u8 (z1, 1), ++ z0 = svqadd (z1, 1)) ++ ++/* ++** qadd_127_u8: ++** uqadd z0\.b, z0\.b, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_127_u8, svuint8_t, ++ z0 = svqadd_n_u8 (z0, 127), ++ z0 = svqadd (z0, 127)) ++ ++/* ++** qadd_128_u8: ++** uqadd z0\.b, z0\.b, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_128_u8, svuint8_t, ++ z0 = svqadd_n_u8 (z0, 128), ++ z0 = svqadd (z0, 128)) ++ ++/* ++** qadd_255_u8: ++** uqadd z0\.b, z0\.b, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_255_u8, svuint8_t, ++ z0 = svqadd_n_u8 (z0, 255), ++ z0 = svqadd (z0, 255)) ++ ++/* ++** qadd_m1_u8: ++** uqadd z0\.b, z0\.b, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_m1_u8, svuint8_t, ++ z0 = svqadd_n_u8 (z0, -1), ++ z0 = svqadd (z0, -1)) ++ ++/* ++** qadd_m127_u8: ++** uqadd z0\.b, z0\.b, #129 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_m127_u8, svuint8_t, ++ z0 = svqadd_n_u8 (z0, -127), ++ z0 = svqadd (z0, -127)) ++ ++/* ++** qadd_m128_u8: ++** uqadd z0\.b, z0\.b, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (qadd_m128_u8, svuint8_t, ++ z0 = svqadd_n_u8 (z0, -128), ++ z0 = svqadd (z0, -128)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecb_pat_s32.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecb_pat_s32.c +new file mode 100644 +index 000000000..22b3afef7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecb_pat_s32.c +@@ -0,0 +1,202 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qdecb_pat_n_1_s32_tied: ++** sqdecb x0, w0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_1_s32_tied, int32_t, ++ x0 = svqdecb_pat_n_s32 (x0, SV_POW2, 1), ++ x0 = svqdecb_pat (x0, SV_POW2, 1)) ++ ++/* ++** qdecb_pat_n_1_s32_untied: ++** mov w0, w1 ++** sqdecb x0, w0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_1_s32_untied, int32_t, ++ x0 = svqdecb_pat_n_s32 (x1, SV_POW2, 1), ++ x0 = svqdecb_pat (x1, SV_POW2, 1)) ++ ++/* ++** qdecb_pat_n_2_s32: ++** sqdecb x0, w0, pow2, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_2_s32, int32_t, ++ x0 = svqdecb_pat_n_s32 (x0, SV_POW2, 2), ++ x0 = svqdecb_pat (x0, SV_POW2, 2)) ++ ++/* ++** qdecb_pat_n_7_s32: ++** sqdecb x0, w0, pow2, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_7_s32, int32_t, ++ x0 = svqdecb_pat_n_s32 (x0, SV_POW2, 7), ++ x0 = svqdecb_pat (x0, SV_POW2, 7)) ++ ++/* ++** qdecb_pat_n_15_s32: ++** sqdecb x0, w0, pow2, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_15_s32, int32_t, ++ x0 = svqdecb_pat_n_s32 (x0, SV_POW2, 15), ++ x0 = svqdecb_pat (x0, SV_POW2, 15)) ++ ++/* ++** qdecb_pat_n_16_s32: ++** sqdecb x0, w0, pow2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_16_s32, int32_t, ++ x0 = svqdecb_pat_n_s32 (x0, SV_POW2, 16), ++ x0 = svqdecb_pat (x0, SV_POW2, 16)) ++ ++/* ++** qdecb_pat_n_vl1_s32: ++** sqdecb x0, w0, vl1, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl1_s32, int32_t, ++ x0 = svqdecb_pat_n_s32 (x0, SV_VL1, 16), ++ x0 = svqdecb_pat (x0, SV_VL1, 16)) ++ ++/* ++** qdecb_pat_n_vl2_s32: ++** sqdecb x0, w0, vl2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl2_s32, int32_t, ++ x0 = svqdecb_pat_n_s32 (x0, SV_VL2, 16), ++ x0 = svqdecb_pat (x0, SV_VL2, 16)) ++ ++/* ++** qdecb_pat_n_vl3_s32: ++** sqdecb x0, w0, vl3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl3_s32, int32_t, ++ x0 = svqdecb_pat_n_s32 (x0, SV_VL3, 16), ++ x0 = svqdecb_pat (x0, SV_VL3, 16)) ++ ++/* ++** qdecb_pat_n_vl4_s32: ++** sqdecb x0, w0, vl4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl4_s32, int32_t, ++ x0 = svqdecb_pat_n_s32 (x0, SV_VL4, 16), ++ x0 = svqdecb_pat (x0, SV_VL4, 16)) ++ ++/* ++** qdecb_pat_n_vl5_s32: ++** sqdecb x0, w0, vl5, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl5_s32, int32_t, ++ x0 = svqdecb_pat_n_s32 (x0, SV_VL5, 16), ++ x0 = svqdecb_pat (x0, SV_VL5, 16)) ++ ++/* ++** qdecb_pat_n_vl6_s32: ++** sqdecb x0, w0, vl6, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl6_s32, int32_t, ++ x0 = svqdecb_pat_n_s32 (x0, SV_VL6, 16), ++ x0 = svqdecb_pat (x0, SV_VL6, 16)) ++ ++/* ++** qdecb_pat_n_vl7_s32: ++** sqdecb x0, w0, vl7, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl7_s32, int32_t, ++ x0 = svqdecb_pat_n_s32 (x0, SV_VL7, 16), ++ x0 = svqdecb_pat (x0, SV_VL7, 16)) ++ ++/* ++** qdecb_pat_n_vl8_s32: ++** sqdecb x0, w0, vl8, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl8_s32, int32_t, ++ x0 = svqdecb_pat_n_s32 (x0, SV_VL8, 16), ++ x0 = svqdecb_pat (x0, SV_VL8, 16)) ++ ++/* ++** qdecb_pat_n_vl16_s32: ++** sqdecb x0, w0, vl16, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl16_s32, int32_t, ++ x0 = svqdecb_pat_n_s32 (x0, SV_VL16, 16), ++ x0 = svqdecb_pat (x0, SV_VL16, 16)) ++ ++/* ++** qdecb_pat_n_vl32_s32: ++** sqdecb x0, w0, 
vl32, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl32_s32, int32_t, ++ x0 = svqdecb_pat_n_s32 (x0, SV_VL32, 16), ++ x0 = svqdecb_pat (x0, SV_VL32, 16)) ++ ++/* ++** qdecb_pat_n_vl64_s32: ++** sqdecb x0, w0, vl64, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl64_s32, int32_t, ++ x0 = svqdecb_pat_n_s32 (x0, SV_VL64, 16), ++ x0 = svqdecb_pat (x0, SV_VL64, 16)) ++ ++/* ++** qdecb_pat_n_vl128_s32: ++** sqdecb x0, w0, vl128, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl128_s32, int32_t, ++ x0 = svqdecb_pat_n_s32 (x0, SV_VL128, 16), ++ x0 = svqdecb_pat (x0, SV_VL128, 16)) ++ ++/* ++** qdecb_pat_n_vl256_s32: ++** sqdecb x0, w0, vl256, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl256_s32, int32_t, ++ x0 = svqdecb_pat_n_s32 (x0, SV_VL256, 16), ++ x0 = svqdecb_pat (x0, SV_VL256, 16)) ++ ++/* ++** qdecb_pat_n_mul4_s32: ++** sqdecb x0, w0, mul4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_mul4_s32, int32_t, ++ x0 = svqdecb_pat_n_s32 (x0, SV_MUL4, 16), ++ x0 = svqdecb_pat (x0, SV_MUL4, 16)) ++ ++/* ++** qdecb_pat_n_mul3_s32: ++** sqdecb x0, w0, mul3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_mul3_s32, int32_t, ++ x0 = svqdecb_pat_n_s32 (x0, SV_MUL3, 16), ++ x0 = svqdecb_pat (x0, SV_MUL3, 16)) ++ ++/* ++** qdecb_pat_n_all_s32: ++** sqdecb x0, w0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_all_s32, int32_t, ++ x0 = svqdecb_pat_n_s32 (x0, SV_ALL, 16), ++ x0 = svqdecb_pat (x0, SV_ALL, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecb_pat_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecb_pat_s64.c +new file mode 100644 +index 000000000..1380e6c8e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecb_pat_s64.c +@@ -0,0 +1,202 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qdecb_pat_n_1_s64_tied: ++** sqdecb x0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_1_s64_tied, int64_t, ++ x0 = svqdecb_pat_n_s64 (x0, SV_POW2, 1), ++ x0 = svqdecb_pat (x0, SV_POW2, 1)) ++ ++/* ++** qdecb_pat_n_1_s64_untied: ++** mov x0, x1 ++** sqdecb x0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_1_s64_untied, int64_t, ++ x0 = svqdecb_pat_n_s64 (x1, SV_POW2, 1), ++ x0 = svqdecb_pat (x1, SV_POW2, 1)) ++ ++/* ++** qdecb_pat_n_2_s64: ++** sqdecb x0, pow2, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_2_s64, int64_t, ++ x0 = svqdecb_pat_n_s64 (x0, SV_POW2, 2), ++ x0 = svqdecb_pat (x0, SV_POW2, 2)) ++ ++/* ++** qdecb_pat_n_7_s64: ++** sqdecb x0, pow2, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_7_s64, int64_t, ++ x0 = svqdecb_pat_n_s64 (x0, SV_POW2, 7), ++ x0 = svqdecb_pat (x0, SV_POW2, 7)) ++ ++/* ++** qdecb_pat_n_15_s64: ++** sqdecb x0, pow2, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_15_s64, int64_t, ++ x0 = svqdecb_pat_n_s64 (x0, SV_POW2, 15), ++ x0 = svqdecb_pat (x0, SV_POW2, 15)) ++ ++/* ++** qdecb_pat_n_16_s64: ++** sqdecb x0, pow2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_16_s64, int64_t, ++ x0 = svqdecb_pat_n_s64 (x0, SV_POW2, 16), ++ x0 = svqdecb_pat (x0, SV_POW2, 16)) ++ ++/* ++** qdecb_pat_n_vl1_s64: ++** sqdecb x0, vl1, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl1_s64, int64_t, ++ x0 = svqdecb_pat_n_s64 (x0, SV_VL1, 16), ++ x0 = svqdecb_pat (x0, SV_VL1, 16)) ++ ++/* ++** qdecb_pat_n_vl2_s64: ++** sqdecb x0, vl2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl2_s64, int64_t, ++ x0 = svqdecb_pat_n_s64 (x0, SV_VL2, 16), ++ x0 = svqdecb_pat (x0, SV_VL2, 16)) ++ ++/* ++** 
qdecb_pat_n_vl3_s64: ++** sqdecb x0, vl3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl3_s64, int64_t, ++ x0 = svqdecb_pat_n_s64 (x0, SV_VL3, 16), ++ x0 = svqdecb_pat (x0, SV_VL3, 16)) ++ ++/* ++** qdecb_pat_n_vl4_s64: ++** sqdecb x0, vl4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl4_s64, int64_t, ++ x0 = svqdecb_pat_n_s64 (x0, SV_VL4, 16), ++ x0 = svqdecb_pat (x0, SV_VL4, 16)) ++ ++/* ++** qdecb_pat_n_vl5_s64: ++** sqdecb x0, vl5, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl5_s64, int64_t, ++ x0 = svqdecb_pat_n_s64 (x0, SV_VL5, 16), ++ x0 = svqdecb_pat (x0, SV_VL5, 16)) ++ ++/* ++** qdecb_pat_n_vl6_s64: ++** sqdecb x0, vl6, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl6_s64, int64_t, ++ x0 = svqdecb_pat_n_s64 (x0, SV_VL6, 16), ++ x0 = svqdecb_pat (x0, SV_VL6, 16)) ++ ++/* ++** qdecb_pat_n_vl7_s64: ++** sqdecb x0, vl7, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl7_s64, int64_t, ++ x0 = svqdecb_pat_n_s64 (x0, SV_VL7, 16), ++ x0 = svqdecb_pat (x0, SV_VL7, 16)) ++ ++/* ++** qdecb_pat_n_vl8_s64: ++** sqdecb x0, vl8, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl8_s64, int64_t, ++ x0 = svqdecb_pat_n_s64 (x0, SV_VL8, 16), ++ x0 = svqdecb_pat (x0, SV_VL8, 16)) ++ ++/* ++** qdecb_pat_n_vl16_s64: ++** sqdecb x0, vl16, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl16_s64, int64_t, ++ x0 = svqdecb_pat_n_s64 (x0, SV_VL16, 16), ++ x0 = svqdecb_pat (x0, SV_VL16, 16)) ++ ++/* ++** qdecb_pat_n_vl32_s64: ++** sqdecb x0, vl32, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl32_s64, int64_t, ++ x0 = svqdecb_pat_n_s64 (x0, SV_VL32, 16), ++ x0 = svqdecb_pat (x0, SV_VL32, 16)) ++ ++/* ++** qdecb_pat_n_vl64_s64: ++** sqdecb x0, vl64, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl64_s64, int64_t, ++ x0 = svqdecb_pat_n_s64 (x0, SV_VL64, 16), ++ x0 = svqdecb_pat (x0, SV_VL64, 16)) ++ ++/* ++** qdecb_pat_n_vl128_s64: ++** sqdecb x0, vl128, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl128_s64, int64_t, ++ x0 = svqdecb_pat_n_s64 (x0, SV_VL128, 16), ++ x0 = svqdecb_pat (x0, SV_VL128, 16)) ++ ++/* ++** qdecb_pat_n_vl256_s64: ++** sqdecb x0, vl256, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl256_s64, int64_t, ++ x0 = svqdecb_pat_n_s64 (x0, SV_VL256, 16), ++ x0 = svqdecb_pat (x0, SV_VL256, 16)) ++ ++/* ++** qdecb_pat_n_mul4_s64: ++** sqdecb x0, mul4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_mul4_s64, int64_t, ++ x0 = svqdecb_pat_n_s64 (x0, SV_MUL4, 16), ++ x0 = svqdecb_pat (x0, SV_MUL4, 16)) ++ ++/* ++** qdecb_pat_n_mul3_s64: ++** sqdecb x0, mul3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_mul3_s64, int64_t, ++ x0 = svqdecb_pat_n_s64 (x0, SV_MUL3, 16), ++ x0 = svqdecb_pat (x0, SV_MUL3, 16)) ++ ++/* ++** qdecb_pat_n_all_s64: ++** sqdecb x0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_all_s64, int64_t, ++ x0 = svqdecb_pat_n_s64 (x0, SV_ALL, 16), ++ x0 = svqdecb_pat (x0, SV_ALL, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecb_pat_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecb_pat_u32.c +new file mode 100644 +index 000000000..3db3da866 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecb_pat_u32.c +@@ -0,0 +1,202 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qdecb_pat_n_1_u32_tied: ++** uqdecb w0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_1_u32_tied, uint32_t, ++ x0 = svqdecb_pat_n_u32 (x0, SV_POW2, 1), ++ x0 = svqdecb_pat (x0, SV_POW2, 1)) ++ ++/* ++** 
qdecb_pat_n_1_u32_untied: ++** mov w0, w1 ++** uqdecb w0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_1_u32_untied, uint32_t, ++ x0 = svqdecb_pat_n_u32 (x1, SV_POW2, 1), ++ x0 = svqdecb_pat (x1, SV_POW2, 1)) ++ ++/* ++** qdecb_pat_n_2_u32: ++** uqdecb w0, pow2, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_2_u32, uint32_t, ++ x0 = svqdecb_pat_n_u32 (x0, SV_POW2, 2), ++ x0 = svqdecb_pat (x0, SV_POW2, 2)) ++ ++/* ++** qdecb_pat_n_7_u32: ++** uqdecb w0, pow2, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_7_u32, uint32_t, ++ x0 = svqdecb_pat_n_u32 (x0, SV_POW2, 7), ++ x0 = svqdecb_pat (x0, SV_POW2, 7)) ++ ++/* ++** qdecb_pat_n_15_u32: ++** uqdecb w0, pow2, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_15_u32, uint32_t, ++ x0 = svqdecb_pat_n_u32 (x0, SV_POW2, 15), ++ x0 = svqdecb_pat (x0, SV_POW2, 15)) ++ ++/* ++** qdecb_pat_n_16_u32: ++** uqdecb w0, pow2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_16_u32, uint32_t, ++ x0 = svqdecb_pat_n_u32 (x0, SV_POW2, 16), ++ x0 = svqdecb_pat (x0, SV_POW2, 16)) ++ ++/* ++** qdecb_pat_n_vl1_u32: ++** uqdecb w0, vl1, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl1_u32, uint32_t, ++ x0 = svqdecb_pat_n_u32 (x0, SV_VL1, 16), ++ x0 = svqdecb_pat (x0, SV_VL1, 16)) ++ ++/* ++** qdecb_pat_n_vl2_u32: ++** uqdecb w0, vl2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl2_u32, uint32_t, ++ x0 = svqdecb_pat_n_u32 (x0, SV_VL2, 16), ++ x0 = svqdecb_pat (x0, SV_VL2, 16)) ++ ++/* ++** qdecb_pat_n_vl3_u32: ++** uqdecb w0, vl3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl3_u32, uint32_t, ++ x0 = svqdecb_pat_n_u32 (x0, SV_VL3, 16), ++ x0 = svqdecb_pat (x0, SV_VL3, 16)) ++ ++/* ++** qdecb_pat_n_vl4_u32: ++** uqdecb w0, vl4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl4_u32, uint32_t, ++ x0 = svqdecb_pat_n_u32 (x0, SV_VL4, 16), ++ x0 = svqdecb_pat (x0, SV_VL4, 16)) ++ ++/* ++** qdecb_pat_n_vl5_u32: ++** uqdecb w0, vl5, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl5_u32, uint32_t, ++ x0 = svqdecb_pat_n_u32 (x0, SV_VL5, 16), ++ x0 = svqdecb_pat (x0, SV_VL5, 16)) ++ ++/* ++** qdecb_pat_n_vl6_u32: ++** uqdecb w0, vl6, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl6_u32, uint32_t, ++ x0 = svqdecb_pat_n_u32 (x0, SV_VL6, 16), ++ x0 = svqdecb_pat (x0, SV_VL6, 16)) ++ ++/* ++** qdecb_pat_n_vl7_u32: ++** uqdecb w0, vl7, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl7_u32, uint32_t, ++ x0 = svqdecb_pat_n_u32 (x0, SV_VL7, 16), ++ x0 = svqdecb_pat (x0, SV_VL7, 16)) ++ ++/* ++** qdecb_pat_n_vl8_u32: ++** uqdecb w0, vl8, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl8_u32, uint32_t, ++ x0 = svqdecb_pat_n_u32 (x0, SV_VL8, 16), ++ x0 = svqdecb_pat (x0, SV_VL8, 16)) ++ ++/* ++** qdecb_pat_n_vl16_u32: ++** uqdecb w0, vl16, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl16_u32, uint32_t, ++ x0 = svqdecb_pat_n_u32 (x0, SV_VL16, 16), ++ x0 = svqdecb_pat (x0, SV_VL16, 16)) ++ ++/* ++** qdecb_pat_n_vl32_u32: ++** uqdecb w0, vl32, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl32_u32, uint32_t, ++ x0 = svqdecb_pat_n_u32 (x0, SV_VL32, 16), ++ x0 = svqdecb_pat (x0, SV_VL32, 16)) ++ ++/* ++** qdecb_pat_n_vl64_u32: ++** uqdecb w0, vl64, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl64_u32, uint32_t, ++ x0 = svqdecb_pat_n_u32 (x0, SV_VL64, 16), ++ x0 = svqdecb_pat (x0, SV_VL64, 16)) ++ ++/* ++** qdecb_pat_n_vl128_u32: ++** uqdecb w0, vl128, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl128_u32, uint32_t, ++ x0 = svqdecb_pat_n_u32 (x0, SV_VL128, 16), ++ x0 = svqdecb_pat (x0, SV_VL128, 
16)) ++ ++/* ++** qdecb_pat_n_vl256_u32: ++** uqdecb w0, vl256, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl256_u32, uint32_t, ++ x0 = svqdecb_pat_n_u32 (x0, SV_VL256, 16), ++ x0 = svqdecb_pat (x0, SV_VL256, 16)) ++ ++/* ++** qdecb_pat_n_mul4_u32: ++** uqdecb w0, mul4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_mul4_u32, uint32_t, ++ x0 = svqdecb_pat_n_u32 (x0, SV_MUL4, 16), ++ x0 = svqdecb_pat (x0, SV_MUL4, 16)) ++ ++/* ++** qdecb_pat_n_mul3_u32: ++** uqdecb w0, mul3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_mul3_u32, uint32_t, ++ x0 = svqdecb_pat_n_u32 (x0, SV_MUL3, 16), ++ x0 = svqdecb_pat (x0, SV_MUL3, 16)) ++ ++/* ++** qdecb_pat_n_all_u32: ++** uqdecb w0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_all_u32, uint32_t, ++ x0 = svqdecb_pat_n_u32 (x0, SV_ALL, 16), ++ x0 = svqdecb_pat (x0, SV_ALL, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecb_pat_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecb_pat_u64.c +new file mode 100644 +index 000000000..2f4c3c7aa +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecb_pat_u64.c +@@ -0,0 +1,202 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qdecb_pat_n_1_u64_tied: ++** uqdecb x0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_1_u64_tied, uint64_t, ++ x0 = svqdecb_pat_n_u64 (x0, SV_POW2, 1), ++ x0 = svqdecb_pat (x0, SV_POW2, 1)) ++ ++/* ++** qdecb_pat_n_1_u64_untied: ++** mov x0, x1 ++** uqdecb x0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_1_u64_untied, uint64_t, ++ x0 = svqdecb_pat_n_u64 (x1, SV_POW2, 1), ++ x0 = svqdecb_pat (x1, SV_POW2, 1)) ++ ++/* ++** qdecb_pat_n_2_u64: ++** uqdecb x0, pow2, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_2_u64, uint64_t, ++ x0 = svqdecb_pat_n_u64 (x0, SV_POW2, 2), ++ x0 = svqdecb_pat (x0, SV_POW2, 2)) ++ ++/* ++** qdecb_pat_n_7_u64: ++** uqdecb x0, pow2, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_7_u64, uint64_t, ++ x0 = svqdecb_pat_n_u64 (x0, SV_POW2, 7), ++ x0 = svqdecb_pat (x0, SV_POW2, 7)) ++ ++/* ++** qdecb_pat_n_15_u64: ++** uqdecb x0, pow2, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_15_u64, uint64_t, ++ x0 = svqdecb_pat_n_u64 (x0, SV_POW2, 15), ++ x0 = svqdecb_pat (x0, SV_POW2, 15)) ++ ++/* ++** qdecb_pat_n_16_u64: ++** uqdecb x0, pow2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_16_u64, uint64_t, ++ x0 = svqdecb_pat_n_u64 (x0, SV_POW2, 16), ++ x0 = svqdecb_pat (x0, SV_POW2, 16)) ++ ++/* ++** qdecb_pat_n_vl1_u64: ++** uqdecb x0, vl1, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl1_u64, uint64_t, ++ x0 = svqdecb_pat_n_u64 (x0, SV_VL1, 16), ++ x0 = svqdecb_pat (x0, SV_VL1, 16)) ++ ++/* ++** qdecb_pat_n_vl2_u64: ++** uqdecb x0, vl2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl2_u64, uint64_t, ++ x0 = svqdecb_pat_n_u64 (x0, SV_VL2, 16), ++ x0 = svqdecb_pat (x0, SV_VL2, 16)) ++ ++/* ++** qdecb_pat_n_vl3_u64: ++** uqdecb x0, vl3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl3_u64, uint64_t, ++ x0 = svqdecb_pat_n_u64 (x0, SV_VL3, 16), ++ x0 = svqdecb_pat (x0, SV_VL3, 16)) ++ ++/* ++** qdecb_pat_n_vl4_u64: ++** uqdecb x0, vl4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl4_u64, uint64_t, ++ x0 = svqdecb_pat_n_u64 (x0, SV_VL4, 16), ++ x0 = svqdecb_pat (x0, SV_VL4, 16)) ++ ++/* ++** qdecb_pat_n_vl5_u64: ++** uqdecb x0, vl5, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl5_u64, uint64_t, ++ x0 = svqdecb_pat_n_u64 (x0, SV_VL5, 16), ++ x0 = svqdecb_pat (x0, SV_VL5, 16)) ++ 
++/* ++** qdecb_pat_n_vl6_u64: ++** uqdecb x0, vl6, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl6_u64, uint64_t, ++ x0 = svqdecb_pat_n_u64 (x0, SV_VL6, 16), ++ x0 = svqdecb_pat (x0, SV_VL6, 16)) ++ ++/* ++** qdecb_pat_n_vl7_u64: ++** uqdecb x0, vl7, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl7_u64, uint64_t, ++ x0 = svqdecb_pat_n_u64 (x0, SV_VL7, 16), ++ x0 = svqdecb_pat (x0, SV_VL7, 16)) ++ ++/* ++** qdecb_pat_n_vl8_u64: ++** uqdecb x0, vl8, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl8_u64, uint64_t, ++ x0 = svqdecb_pat_n_u64 (x0, SV_VL8, 16), ++ x0 = svqdecb_pat (x0, SV_VL8, 16)) ++ ++/* ++** qdecb_pat_n_vl16_u64: ++** uqdecb x0, vl16, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl16_u64, uint64_t, ++ x0 = svqdecb_pat_n_u64 (x0, SV_VL16, 16), ++ x0 = svqdecb_pat (x0, SV_VL16, 16)) ++ ++/* ++** qdecb_pat_n_vl32_u64: ++** uqdecb x0, vl32, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl32_u64, uint64_t, ++ x0 = svqdecb_pat_n_u64 (x0, SV_VL32, 16), ++ x0 = svqdecb_pat (x0, SV_VL32, 16)) ++ ++/* ++** qdecb_pat_n_vl64_u64: ++** uqdecb x0, vl64, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl64_u64, uint64_t, ++ x0 = svqdecb_pat_n_u64 (x0, SV_VL64, 16), ++ x0 = svqdecb_pat (x0, SV_VL64, 16)) ++ ++/* ++** qdecb_pat_n_vl128_u64: ++** uqdecb x0, vl128, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl128_u64, uint64_t, ++ x0 = svqdecb_pat_n_u64 (x0, SV_VL128, 16), ++ x0 = svqdecb_pat (x0, SV_VL128, 16)) ++ ++/* ++** qdecb_pat_n_vl256_u64: ++** uqdecb x0, vl256, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_vl256_u64, uint64_t, ++ x0 = svqdecb_pat_n_u64 (x0, SV_VL256, 16), ++ x0 = svqdecb_pat (x0, SV_VL256, 16)) ++ ++/* ++** qdecb_pat_n_mul4_u64: ++** uqdecb x0, mul4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_mul4_u64, uint64_t, ++ x0 = svqdecb_pat_n_u64 (x0, SV_MUL4, 16), ++ x0 = svqdecb_pat (x0, SV_MUL4, 16)) ++ ++/* ++** qdecb_pat_n_mul3_u64: ++** uqdecb x0, mul3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_mul3_u64, uint64_t, ++ x0 = svqdecb_pat_n_u64 (x0, SV_MUL3, 16), ++ x0 = svqdecb_pat (x0, SV_MUL3, 16)) ++ ++/* ++** qdecb_pat_n_all_u64: ++** uqdecb x0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_pat_n_all_u64, uint64_t, ++ x0 = svqdecb_pat_n_u64 (x0, SV_ALL, 16), ++ x0 = svqdecb_pat (x0, SV_ALL, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecb_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecb_s32.c +new file mode 100644 +index 000000000..11180654e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecb_s32.c +@@ -0,0 +1,58 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qdecb_n_1_s32_tied: ++** sqdecb x0, w0 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_n_1_s32_tied, int32_t, ++ x0 = svqdecb_n_s32 (x0, 1), ++ x0 = svqdecb (x0, 1)) ++ ++/* ++** qdecb_n_1_s32_untied: ++** mov w0, w1 ++** sqdecb x0, w0 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_n_1_s32_untied, int32_t, ++ x0 = svqdecb_n_s32 (x1, 1), ++ x0 = svqdecb (x1, 1)) ++ ++/* ++** qdecb_n_2_s32: ++** sqdecb x0, w0, all, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_n_2_s32, int32_t, ++ x0 = svqdecb_n_s32 (x0, 2), ++ x0 = svqdecb (x0, 2)) ++ ++/* ++** qdecb_n_7_s32: ++** sqdecb x0, w0, all, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_n_7_s32, int32_t, ++ x0 = svqdecb_n_s32 (x0, 7), ++ x0 = svqdecb (x0, 7)) ++ ++/* ++** qdecb_n_15_s32: ++** sqdecb x0, w0, all, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_n_15_s32, int32_t, ++ x0 = svqdecb_n_s32 
(x0, 15), ++ x0 = svqdecb (x0, 15)) ++ ++/* ++** qdecb_n_16_s32: ++** sqdecb x0, w0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_n_16_s32, int32_t, ++ x0 = svqdecb_n_s32 (x0, 16), ++ x0 = svqdecb (x0, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecb_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecb_s64.c +new file mode 100644 +index 000000000..17b765655 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecb_s64.c +@@ -0,0 +1,58 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qdecb_n_1_s64_tied: ++** sqdecb x0 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_n_1_s64_tied, int64_t, ++ x0 = svqdecb_n_s64 (x0, 1), ++ x0 = svqdecb (x0, 1)) ++ ++/* ++** qdecb_n_1_s64_untied: ++** mov x0, x1 ++** sqdecb x0 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_n_1_s64_untied, int64_t, ++ x0 = svqdecb_n_s64 (x1, 1), ++ x0 = svqdecb (x1, 1)) ++ ++/* ++** qdecb_n_2_s64: ++** sqdecb x0, all, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_n_2_s64, int64_t, ++ x0 = svqdecb_n_s64 (x0, 2), ++ x0 = svqdecb (x0, 2)) ++ ++/* ++** qdecb_n_7_s64: ++** sqdecb x0, all, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_n_7_s64, int64_t, ++ x0 = svqdecb_n_s64 (x0, 7), ++ x0 = svqdecb (x0, 7)) ++ ++/* ++** qdecb_n_15_s64: ++** sqdecb x0, all, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_n_15_s64, int64_t, ++ x0 = svqdecb_n_s64 (x0, 15), ++ x0 = svqdecb (x0, 15)) ++ ++/* ++** qdecb_n_16_s64: ++** sqdecb x0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_n_16_s64, int64_t, ++ x0 = svqdecb_n_s64 (x0, 16), ++ x0 = svqdecb (x0, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecb_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecb_u32.c +new file mode 100644 +index 000000000..b31e04de5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecb_u32.c +@@ -0,0 +1,58 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qdecb_n_1_u32_tied: ++** uqdecb w0 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_n_1_u32_tied, uint32_t, ++ x0 = svqdecb_n_u32 (x0, 1), ++ x0 = svqdecb (x0, 1)) ++ ++/* ++** qdecb_n_1_u32_untied: ++** mov w0, w1 ++** uqdecb w0 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_n_1_u32_untied, uint32_t, ++ x0 = svqdecb_n_u32 (x1, 1), ++ x0 = svqdecb (x1, 1)) ++ ++/* ++** qdecb_n_2_u32: ++** uqdecb w0, all, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_n_2_u32, uint32_t, ++ x0 = svqdecb_n_u32 (x0, 2), ++ x0 = svqdecb (x0, 2)) ++ ++/* ++** qdecb_n_7_u32: ++** uqdecb w0, all, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_n_7_u32, uint32_t, ++ x0 = svqdecb_n_u32 (x0, 7), ++ x0 = svqdecb (x0, 7)) ++ ++/* ++** qdecb_n_15_u32: ++** uqdecb w0, all, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_n_15_u32, uint32_t, ++ x0 = svqdecb_n_u32 (x0, 15), ++ x0 = svqdecb (x0, 15)) ++ ++/* ++** qdecb_n_16_u32: ++** uqdecb w0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_n_16_u32, uint32_t, ++ x0 = svqdecb_n_u32 (x0, 16), ++ x0 = svqdecb (x0, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecb_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecb_u64.c +new file mode 100644 +index 000000000..aab6faba9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecb_u64.c +@@ -0,0 +1,58 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qdecb_n_1_u64_tied: ++** uqdecb x0 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_n_1_u64_tied, uint64_t, ++ x0 = 
svqdecb_n_u64 (x0, 1), ++ x0 = svqdecb (x0, 1)) ++ ++/* ++** qdecb_n_1_u64_untied: ++** mov x0, x1 ++** uqdecb x0 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_n_1_u64_untied, uint64_t, ++ x0 = svqdecb_n_u64 (x1, 1), ++ x0 = svqdecb (x1, 1)) ++ ++/* ++** qdecb_n_2_u64: ++** uqdecb x0, all, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_n_2_u64, uint64_t, ++ x0 = svqdecb_n_u64 (x0, 2), ++ x0 = svqdecb (x0, 2)) ++ ++/* ++** qdecb_n_7_u64: ++** uqdecb x0, all, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_n_7_u64, uint64_t, ++ x0 = svqdecb_n_u64 (x0, 7), ++ x0 = svqdecb (x0, 7)) ++ ++/* ++** qdecb_n_15_u64: ++** uqdecb x0, all, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_n_15_u64, uint64_t, ++ x0 = svqdecb_n_u64 (x0, 15), ++ x0 = svqdecb (x0, 15)) ++ ++/* ++** qdecb_n_16_u64: ++** uqdecb x0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecb_n_16_u64, uint64_t, ++ x0 = svqdecb_n_u64 (x0, 16), ++ x0 = svqdecb (x0, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecd_pat_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecd_pat_s32.c +new file mode 100644 +index 000000000..bc491d397 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecd_pat_s32.c +@@ -0,0 +1,202 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qdecd_pat_n_1_s32_tied: ++** sqdecd x0, w0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_1_s32_tied, int32_t, ++ x0 = svqdecd_pat_n_s32 (x0, SV_POW2, 1), ++ x0 = svqdecd_pat (x0, SV_POW2, 1)) ++ ++/* ++** qdecd_pat_n_1_s32_untied: ++** mov w0, w1 ++** sqdecd x0, w0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_1_s32_untied, int32_t, ++ x0 = svqdecd_pat_n_s32 (x1, SV_POW2, 1), ++ x0 = svqdecd_pat (x1, SV_POW2, 1)) ++ ++/* ++** qdecd_pat_n_2_s32: ++** sqdecd x0, w0, pow2, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_2_s32, int32_t, ++ x0 = svqdecd_pat_n_s32 (x0, SV_POW2, 2), ++ x0 = svqdecd_pat (x0, SV_POW2, 2)) ++ ++/* ++** qdecd_pat_n_7_s32: ++** sqdecd x0, w0, pow2, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_7_s32, int32_t, ++ x0 = svqdecd_pat_n_s32 (x0, SV_POW2, 7), ++ x0 = svqdecd_pat (x0, SV_POW2, 7)) ++ ++/* ++** qdecd_pat_n_15_s32: ++** sqdecd x0, w0, pow2, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_15_s32, int32_t, ++ x0 = svqdecd_pat_n_s32 (x0, SV_POW2, 15), ++ x0 = svqdecd_pat (x0, SV_POW2, 15)) ++ ++/* ++** qdecd_pat_n_16_s32: ++** sqdecd x0, w0, pow2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_16_s32, int32_t, ++ x0 = svqdecd_pat_n_s32 (x0, SV_POW2, 16), ++ x0 = svqdecd_pat (x0, SV_POW2, 16)) ++ ++/* ++** qdecd_pat_n_vl1_s32: ++** sqdecd x0, w0, vl1, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl1_s32, int32_t, ++ x0 = svqdecd_pat_n_s32 (x0, SV_VL1, 16), ++ x0 = svqdecd_pat (x0, SV_VL1, 16)) ++ ++/* ++** qdecd_pat_n_vl2_s32: ++** sqdecd x0, w0, vl2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl2_s32, int32_t, ++ x0 = svqdecd_pat_n_s32 (x0, SV_VL2, 16), ++ x0 = svqdecd_pat (x0, SV_VL2, 16)) ++ ++/* ++** qdecd_pat_n_vl3_s32: ++** sqdecd x0, w0, vl3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl3_s32, int32_t, ++ x0 = svqdecd_pat_n_s32 (x0, SV_VL3, 16), ++ x0 = svqdecd_pat (x0, SV_VL3, 16)) ++ ++/* ++** qdecd_pat_n_vl4_s32: ++** sqdecd x0, w0, vl4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl4_s32, int32_t, ++ x0 = svqdecd_pat_n_s32 (x0, SV_VL4, 16), ++ x0 = svqdecd_pat (x0, SV_VL4, 16)) ++ ++/* ++** qdecd_pat_n_vl5_s32: ++** sqdecd x0, w0, vl5, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl5_s32, 
int32_t, ++ x0 = svqdecd_pat_n_s32 (x0, SV_VL5, 16), ++ x0 = svqdecd_pat (x0, SV_VL5, 16)) ++ ++/* ++** qdecd_pat_n_vl6_s32: ++** sqdecd x0, w0, vl6, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl6_s32, int32_t, ++ x0 = svqdecd_pat_n_s32 (x0, SV_VL6, 16), ++ x0 = svqdecd_pat (x0, SV_VL6, 16)) ++ ++/* ++** qdecd_pat_n_vl7_s32: ++** sqdecd x0, w0, vl7, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl7_s32, int32_t, ++ x0 = svqdecd_pat_n_s32 (x0, SV_VL7, 16), ++ x0 = svqdecd_pat (x0, SV_VL7, 16)) ++ ++/* ++** qdecd_pat_n_vl8_s32: ++** sqdecd x0, w0, vl8, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl8_s32, int32_t, ++ x0 = svqdecd_pat_n_s32 (x0, SV_VL8, 16), ++ x0 = svqdecd_pat (x0, SV_VL8, 16)) ++ ++/* ++** qdecd_pat_n_vl16_s32: ++** sqdecd x0, w0, vl16, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl16_s32, int32_t, ++ x0 = svqdecd_pat_n_s32 (x0, SV_VL16, 16), ++ x0 = svqdecd_pat (x0, SV_VL16, 16)) ++ ++/* ++** qdecd_pat_n_vl32_s32: ++** sqdecd x0, w0, vl32, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl32_s32, int32_t, ++ x0 = svqdecd_pat_n_s32 (x0, SV_VL32, 16), ++ x0 = svqdecd_pat (x0, SV_VL32, 16)) ++ ++/* ++** qdecd_pat_n_vl64_s32: ++** sqdecd x0, w0, vl64, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl64_s32, int32_t, ++ x0 = svqdecd_pat_n_s32 (x0, SV_VL64, 16), ++ x0 = svqdecd_pat (x0, SV_VL64, 16)) ++ ++/* ++** qdecd_pat_n_vl128_s32: ++** sqdecd x0, w0, vl128, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl128_s32, int32_t, ++ x0 = svqdecd_pat_n_s32 (x0, SV_VL128, 16), ++ x0 = svqdecd_pat (x0, SV_VL128, 16)) ++ ++/* ++** qdecd_pat_n_vl256_s32: ++** sqdecd x0, w0, vl256, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl256_s32, int32_t, ++ x0 = svqdecd_pat_n_s32 (x0, SV_VL256, 16), ++ x0 = svqdecd_pat (x0, SV_VL256, 16)) ++ ++/* ++** qdecd_pat_n_mul4_s32: ++** sqdecd x0, w0, mul4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_mul4_s32, int32_t, ++ x0 = svqdecd_pat_n_s32 (x0, SV_MUL4, 16), ++ x0 = svqdecd_pat (x0, SV_MUL4, 16)) ++ ++/* ++** qdecd_pat_n_mul3_s32: ++** sqdecd x0, w0, mul3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_mul3_s32, int32_t, ++ x0 = svqdecd_pat_n_s32 (x0, SV_MUL3, 16), ++ x0 = svqdecd_pat (x0, SV_MUL3, 16)) ++ ++/* ++** qdecd_pat_n_all_s32: ++** sqdecd x0, w0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_all_s32, int32_t, ++ x0 = svqdecd_pat_n_s32 (x0, SV_ALL, 16), ++ x0 = svqdecd_pat (x0, SV_ALL, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecd_pat_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecd_pat_s64.c +new file mode 100644 +index 000000000..3970ff058 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecd_pat_s64.c +@@ -0,0 +1,401 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qdecd_pat_1_s64_tied: ++** sqdecd z0\.d, pow2 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_pat_1_s64_tied, svint64_t, ++ z0 = svqdecd_pat_s64 (z0, SV_POW2, 1), ++ z0 = svqdecd_pat (z0, SV_POW2, 1)) ++ ++/* ++** qdecd_pat_1_s64_untied: ++** movprfx z0, z1 ++** sqdecd z0\.d, pow2 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_pat_1_s64_untied, svint64_t, ++ z0 = svqdecd_pat_s64 (z1, SV_POW2, 1), ++ z0 = svqdecd_pat (z1, SV_POW2, 1)) ++ ++/* ++** qdecd_pat_2_s64: ++** sqdecd z0\.d, pow2, mul #2 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_pat_2_s64, svint64_t, ++ z0 = svqdecd_pat_s64 (z0, SV_POW2, 2), ++ z0 = svqdecd_pat (z0, SV_POW2, 2)) ++ ++/* ++** qdecd_pat_7_s64: ++** sqdecd z0\.d, pow2, mul #7 ++** 
ret ++*/ ++TEST_UNIFORM_Z (qdecd_pat_7_s64, svint64_t, ++ z0 = svqdecd_pat_s64 (z0, SV_POW2, 7), ++ z0 = svqdecd_pat (z0, SV_POW2, 7)) ++ ++/* ++** qdecd_pat_15_s64: ++** sqdecd z0\.d, pow2, mul #15 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_pat_15_s64, svint64_t, ++ z0 = svqdecd_pat_s64 (z0, SV_POW2, 15), ++ z0 = svqdecd_pat (z0, SV_POW2, 15)) ++ ++/* ++** qdecd_pat_16_s64: ++** sqdecd z0\.d, pow2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_pat_16_s64, svint64_t, ++ z0 = svqdecd_pat_s64 (z0, SV_POW2, 16), ++ z0 = svqdecd_pat (z0, SV_POW2, 16)) ++ ++/* ++** qdecd_pat_vl1_s64: ++** sqdecd z0\.d, vl1, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_pat_vl1_s64, svint64_t, ++ z0 = svqdecd_pat_s64 (z0, SV_VL1, 16), ++ z0 = svqdecd_pat (z0, SV_VL1, 16)) ++ ++/* ++** qdecd_pat_vl2_s64: ++** sqdecd z0\.d, vl2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_pat_vl2_s64, svint64_t, ++ z0 = svqdecd_pat_s64 (z0, SV_VL2, 16), ++ z0 = svqdecd_pat (z0, SV_VL2, 16)) ++ ++/* ++** qdecd_pat_vl3_s64: ++** sqdecd z0\.d, vl3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_pat_vl3_s64, svint64_t, ++ z0 = svqdecd_pat_s64 (z0, SV_VL3, 16), ++ z0 = svqdecd_pat (z0, SV_VL3, 16)) ++ ++/* ++** qdecd_pat_vl4_s64: ++** sqdecd z0\.d, vl4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_pat_vl4_s64, svint64_t, ++ z0 = svqdecd_pat_s64 (z0, SV_VL4, 16), ++ z0 = svqdecd_pat (z0, SV_VL4, 16)) ++ ++/* ++** qdecd_pat_vl5_s64: ++** sqdecd z0\.d, vl5, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_pat_vl5_s64, svint64_t, ++ z0 = svqdecd_pat_s64 (z0, SV_VL5, 16), ++ z0 = svqdecd_pat (z0, SV_VL5, 16)) ++ ++/* ++** qdecd_pat_vl6_s64: ++** sqdecd z0\.d, vl6, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_pat_vl6_s64, svint64_t, ++ z0 = svqdecd_pat_s64 (z0, SV_VL6, 16), ++ z0 = svqdecd_pat (z0, SV_VL6, 16)) ++ ++/* ++** qdecd_pat_vl7_s64: ++** sqdecd z0\.d, vl7, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_pat_vl7_s64, svint64_t, ++ z0 = svqdecd_pat_s64 (z0, SV_VL7, 16), ++ z0 = svqdecd_pat (z0, SV_VL7, 16)) ++ ++/* ++** qdecd_pat_vl8_s64: ++** sqdecd z0\.d, vl8, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_pat_vl8_s64, svint64_t, ++ z0 = svqdecd_pat_s64 (z0, SV_VL8, 16), ++ z0 = svqdecd_pat (z0, SV_VL8, 16)) ++ ++/* ++** qdecd_pat_vl16_s64: ++** sqdecd z0\.d, vl16, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_pat_vl16_s64, svint64_t, ++ z0 = svqdecd_pat_s64 (z0, SV_VL16, 16), ++ z0 = svqdecd_pat (z0, SV_VL16, 16)) ++ ++/* ++** qdecd_pat_vl32_s64: ++** sqdecd z0\.d, vl32, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_pat_vl32_s64, svint64_t, ++ z0 = svqdecd_pat_s64 (z0, SV_VL32, 16), ++ z0 = svqdecd_pat (z0, SV_VL32, 16)) ++ ++/* ++** qdecd_pat_vl64_s64: ++** sqdecd z0\.d, vl64, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_pat_vl64_s64, svint64_t, ++ z0 = svqdecd_pat_s64 (z0, SV_VL64, 16), ++ z0 = svqdecd_pat (z0, SV_VL64, 16)) ++ ++/* ++** qdecd_pat_vl128_s64: ++** sqdecd z0\.d, vl128, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_pat_vl128_s64, svint64_t, ++ z0 = svqdecd_pat_s64 (z0, SV_VL128, 16), ++ z0 = svqdecd_pat (z0, SV_VL128, 16)) ++ ++/* ++** qdecd_pat_vl256_s64: ++** sqdecd z0\.d, vl256, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_pat_vl256_s64, svint64_t, ++ z0 = svqdecd_pat_s64 (z0, SV_VL256, 16), ++ z0 = svqdecd_pat (z0, SV_VL256, 16)) ++ ++/* ++** qdecd_pat_mul4_s64: ++** sqdecd z0\.d, mul4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_pat_mul4_s64, svint64_t, ++ z0 = svqdecd_pat_s64 (z0, SV_MUL4, 16), ++ z0 = svqdecd_pat (z0, SV_MUL4, 16)) ++ ++/* ++** qdecd_pat_mul3_s64: ++** sqdecd z0\.d, mul3, mul #16 ++** ret ++*/ 
++TEST_UNIFORM_Z (qdecd_pat_mul3_s64, svint64_t, ++ z0 = svqdecd_pat_s64 (z0, SV_MUL3, 16), ++ z0 = svqdecd_pat (z0, SV_MUL3, 16)) ++ ++/* ++** qdecd_pat_all_s64: ++** sqdecd z0\.d, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_pat_all_s64, svint64_t, ++ z0 = svqdecd_pat_s64 (z0, SV_ALL, 16), ++ z0 = svqdecd_pat (z0, SV_ALL, 16)) ++ ++/* ++** qdecd_pat_n_1_s64_tied: ++** sqdecd x0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_1_s64_tied, int64_t, ++ x0 = svqdecd_pat_n_s64 (x0, SV_POW2, 1), ++ x0 = svqdecd_pat (x0, SV_POW2, 1)) ++ ++/* ++** qdecd_pat_n_1_s64_untied: ++** mov x0, x1 ++** sqdecd x0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_1_s64_untied, int64_t, ++ x0 = svqdecd_pat_n_s64 (x1, SV_POW2, 1), ++ x0 = svqdecd_pat (x1, SV_POW2, 1)) ++ ++/* ++** qdecd_pat_n_2_s64: ++** sqdecd x0, pow2, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_2_s64, int64_t, ++ x0 = svqdecd_pat_n_s64 (x0, SV_POW2, 2), ++ x0 = svqdecd_pat (x0, SV_POW2, 2)) ++ ++/* ++** qdecd_pat_n_7_s64: ++** sqdecd x0, pow2, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_7_s64, int64_t, ++ x0 = svqdecd_pat_n_s64 (x0, SV_POW2, 7), ++ x0 = svqdecd_pat (x0, SV_POW2, 7)) ++ ++/* ++** qdecd_pat_n_15_s64: ++** sqdecd x0, pow2, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_15_s64, int64_t, ++ x0 = svqdecd_pat_n_s64 (x0, SV_POW2, 15), ++ x0 = svqdecd_pat (x0, SV_POW2, 15)) ++ ++/* ++** qdecd_pat_n_16_s64: ++** sqdecd x0, pow2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_16_s64, int64_t, ++ x0 = svqdecd_pat_n_s64 (x0, SV_POW2, 16), ++ x0 = svqdecd_pat (x0, SV_POW2, 16)) ++ ++/* ++** qdecd_pat_n_vl1_s64: ++** sqdecd x0, vl1, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl1_s64, int64_t, ++ x0 = svqdecd_pat_n_s64 (x0, SV_VL1, 16), ++ x0 = svqdecd_pat (x0, SV_VL1, 16)) ++ ++/* ++** qdecd_pat_n_vl2_s64: ++** sqdecd x0, vl2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl2_s64, int64_t, ++ x0 = svqdecd_pat_n_s64 (x0, SV_VL2, 16), ++ x0 = svqdecd_pat (x0, SV_VL2, 16)) ++ ++/* ++** qdecd_pat_n_vl3_s64: ++** sqdecd x0, vl3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl3_s64, int64_t, ++ x0 = svqdecd_pat_n_s64 (x0, SV_VL3, 16), ++ x0 = svqdecd_pat (x0, SV_VL3, 16)) ++ ++/* ++** qdecd_pat_n_vl4_s64: ++** sqdecd x0, vl4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl4_s64, int64_t, ++ x0 = svqdecd_pat_n_s64 (x0, SV_VL4, 16), ++ x0 = svqdecd_pat (x0, SV_VL4, 16)) ++ ++/* ++** qdecd_pat_n_vl5_s64: ++** sqdecd x0, vl5, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl5_s64, int64_t, ++ x0 = svqdecd_pat_n_s64 (x0, SV_VL5, 16), ++ x0 = svqdecd_pat (x0, SV_VL5, 16)) ++ ++/* ++** qdecd_pat_n_vl6_s64: ++** sqdecd x0, vl6, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl6_s64, int64_t, ++ x0 = svqdecd_pat_n_s64 (x0, SV_VL6, 16), ++ x0 = svqdecd_pat (x0, SV_VL6, 16)) ++ ++/* ++** qdecd_pat_n_vl7_s64: ++** sqdecd x0, vl7, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl7_s64, int64_t, ++ x0 = svqdecd_pat_n_s64 (x0, SV_VL7, 16), ++ x0 = svqdecd_pat (x0, SV_VL7, 16)) ++ ++/* ++** qdecd_pat_n_vl8_s64: ++** sqdecd x0, vl8, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl8_s64, int64_t, ++ x0 = svqdecd_pat_n_s64 (x0, SV_VL8, 16), ++ x0 = svqdecd_pat (x0, SV_VL8, 16)) ++ ++/* ++** qdecd_pat_n_vl16_s64: ++** sqdecd x0, vl16, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl16_s64, int64_t, ++ x0 = svqdecd_pat_n_s64 (x0, SV_VL16, 16), ++ x0 = svqdecd_pat (x0, SV_VL16, 16)) ++ ++/* ++** qdecd_pat_n_vl32_s64: ++** sqdecd x0, vl32, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S 
(qdecd_pat_n_vl32_s64, int64_t, ++ x0 = svqdecd_pat_n_s64 (x0, SV_VL32, 16), ++ x0 = svqdecd_pat (x0, SV_VL32, 16)) ++ ++/* ++** qdecd_pat_n_vl64_s64: ++** sqdecd x0, vl64, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl64_s64, int64_t, ++ x0 = svqdecd_pat_n_s64 (x0, SV_VL64, 16), ++ x0 = svqdecd_pat (x0, SV_VL64, 16)) ++ ++/* ++** qdecd_pat_n_vl128_s64: ++** sqdecd x0, vl128, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl128_s64, int64_t, ++ x0 = svqdecd_pat_n_s64 (x0, SV_VL128, 16), ++ x0 = svqdecd_pat (x0, SV_VL128, 16)) ++ ++/* ++** qdecd_pat_n_vl256_s64: ++** sqdecd x0, vl256, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl256_s64, int64_t, ++ x0 = svqdecd_pat_n_s64 (x0, SV_VL256, 16), ++ x0 = svqdecd_pat (x0, SV_VL256, 16)) ++ ++/* ++** qdecd_pat_n_mul4_s64: ++** sqdecd x0, mul4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_mul4_s64, int64_t, ++ x0 = svqdecd_pat_n_s64 (x0, SV_MUL4, 16), ++ x0 = svqdecd_pat (x0, SV_MUL4, 16)) ++ ++/* ++** qdecd_pat_n_mul3_s64: ++** sqdecd x0, mul3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_mul3_s64, int64_t, ++ x0 = svqdecd_pat_n_s64 (x0, SV_MUL3, 16), ++ x0 = svqdecd_pat (x0, SV_MUL3, 16)) ++ ++/* ++** qdecd_pat_n_all_s64: ++** sqdecd x0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_all_s64, int64_t, ++ x0 = svqdecd_pat_n_s64 (x0, SV_ALL, 16), ++ x0 = svqdecd_pat (x0, SV_ALL, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecd_pat_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecd_pat_u32.c +new file mode 100644 +index 000000000..b33e402f2 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecd_pat_u32.c +@@ -0,0 +1,202 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qdecd_pat_n_1_u32_tied: ++** uqdecd w0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_1_u32_tied, uint32_t, ++ x0 = svqdecd_pat_n_u32 (x0, SV_POW2, 1), ++ x0 = svqdecd_pat (x0, SV_POW2, 1)) ++ ++/* ++** qdecd_pat_n_1_u32_untied: ++** mov w0, w1 ++** uqdecd w0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_1_u32_untied, uint32_t, ++ x0 = svqdecd_pat_n_u32 (x1, SV_POW2, 1), ++ x0 = svqdecd_pat (x1, SV_POW2, 1)) ++ ++/* ++** qdecd_pat_n_2_u32: ++** uqdecd w0, pow2, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_2_u32, uint32_t, ++ x0 = svqdecd_pat_n_u32 (x0, SV_POW2, 2), ++ x0 = svqdecd_pat (x0, SV_POW2, 2)) ++ ++/* ++** qdecd_pat_n_7_u32: ++** uqdecd w0, pow2, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_7_u32, uint32_t, ++ x0 = svqdecd_pat_n_u32 (x0, SV_POW2, 7), ++ x0 = svqdecd_pat (x0, SV_POW2, 7)) ++ ++/* ++** qdecd_pat_n_15_u32: ++** uqdecd w0, pow2, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_15_u32, uint32_t, ++ x0 = svqdecd_pat_n_u32 (x0, SV_POW2, 15), ++ x0 = svqdecd_pat (x0, SV_POW2, 15)) ++ ++/* ++** qdecd_pat_n_16_u32: ++** uqdecd w0, pow2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_16_u32, uint32_t, ++ x0 = svqdecd_pat_n_u32 (x0, SV_POW2, 16), ++ x0 = svqdecd_pat (x0, SV_POW2, 16)) ++ ++/* ++** qdecd_pat_n_vl1_u32: ++** uqdecd w0, vl1, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl1_u32, uint32_t, ++ x0 = svqdecd_pat_n_u32 (x0, SV_VL1, 16), ++ x0 = svqdecd_pat (x0, SV_VL1, 16)) ++ ++/* ++** qdecd_pat_n_vl2_u32: ++** uqdecd w0, vl2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl2_u32, uint32_t, ++ x0 = svqdecd_pat_n_u32 (x0, SV_VL2, 16), ++ x0 = svqdecd_pat (x0, SV_VL2, 16)) ++ ++/* ++** qdecd_pat_n_vl3_u32: ++** uqdecd w0, vl3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S 
(qdecd_pat_n_vl3_u32, uint32_t, ++ x0 = svqdecd_pat_n_u32 (x0, SV_VL3, 16), ++ x0 = svqdecd_pat (x0, SV_VL3, 16)) ++ ++/* ++** qdecd_pat_n_vl4_u32: ++** uqdecd w0, vl4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl4_u32, uint32_t, ++ x0 = svqdecd_pat_n_u32 (x0, SV_VL4, 16), ++ x0 = svqdecd_pat (x0, SV_VL4, 16)) ++ ++/* ++** qdecd_pat_n_vl5_u32: ++** uqdecd w0, vl5, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl5_u32, uint32_t, ++ x0 = svqdecd_pat_n_u32 (x0, SV_VL5, 16), ++ x0 = svqdecd_pat (x0, SV_VL5, 16)) ++ ++/* ++** qdecd_pat_n_vl6_u32: ++** uqdecd w0, vl6, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl6_u32, uint32_t, ++ x0 = svqdecd_pat_n_u32 (x0, SV_VL6, 16), ++ x0 = svqdecd_pat (x0, SV_VL6, 16)) ++ ++/* ++** qdecd_pat_n_vl7_u32: ++** uqdecd w0, vl7, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl7_u32, uint32_t, ++ x0 = svqdecd_pat_n_u32 (x0, SV_VL7, 16), ++ x0 = svqdecd_pat (x0, SV_VL7, 16)) ++ ++/* ++** qdecd_pat_n_vl8_u32: ++** uqdecd w0, vl8, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl8_u32, uint32_t, ++ x0 = svqdecd_pat_n_u32 (x0, SV_VL8, 16), ++ x0 = svqdecd_pat (x0, SV_VL8, 16)) ++ ++/* ++** qdecd_pat_n_vl16_u32: ++** uqdecd w0, vl16, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl16_u32, uint32_t, ++ x0 = svqdecd_pat_n_u32 (x0, SV_VL16, 16), ++ x0 = svqdecd_pat (x0, SV_VL16, 16)) ++ ++/* ++** qdecd_pat_n_vl32_u32: ++** uqdecd w0, vl32, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl32_u32, uint32_t, ++ x0 = svqdecd_pat_n_u32 (x0, SV_VL32, 16), ++ x0 = svqdecd_pat (x0, SV_VL32, 16)) ++ ++/* ++** qdecd_pat_n_vl64_u32: ++** uqdecd w0, vl64, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl64_u32, uint32_t, ++ x0 = svqdecd_pat_n_u32 (x0, SV_VL64, 16), ++ x0 = svqdecd_pat (x0, SV_VL64, 16)) ++ ++/* ++** qdecd_pat_n_vl128_u32: ++** uqdecd w0, vl128, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl128_u32, uint32_t, ++ x0 = svqdecd_pat_n_u32 (x0, SV_VL128, 16), ++ x0 = svqdecd_pat (x0, SV_VL128, 16)) ++ ++/* ++** qdecd_pat_n_vl256_u32: ++** uqdecd w0, vl256, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl256_u32, uint32_t, ++ x0 = svqdecd_pat_n_u32 (x0, SV_VL256, 16), ++ x0 = svqdecd_pat (x0, SV_VL256, 16)) ++ ++/* ++** qdecd_pat_n_mul4_u32: ++** uqdecd w0, mul4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_mul4_u32, uint32_t, ++ x0 = svqdecd_pat_n_u32 (x0, SV_MUL4, 16), ++ x0 = svqdecd_pat (x0, SV_MUL4, 16)) ++ ++/* ++** qdecd_pat_n_mul3_u32: ++** uqdecd w0, mul3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_mul3_u32, uint32_t, ++ x0 = svqdecd_pat_n_u32 (x0, SV_MUL3, 16), ++ x0 = svqdecd_pat (x0, SV_MUL3, 16)) ++ ++/* ++** qdecd_pat_n_all_u32: ++** uqdecd w0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_all_u32, uint32_t, ++ x0 = svqdecd_pat_n_u32 (x0, SV_ALL, 16), ++ x0 = svqdecd_pat (x0, SV_ALL, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecd_pat_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecd_pat_u64.c +new file mode 100644 +index 000000000..f0d1bd357 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecd_pat_u64.c +@@ -0,0 +1,401 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qdecd_pat_1_u64_tied: ++** uqdecd z0\.d, pow2 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_pat_1_u64_tied, svuint64_t, ++ z0 = svqdecd_pat_u64 (z0, SV_POW2, 1), ++ z0 = svqdecd_pat (z0, SV_POW2, 1)) ++ ++/* ++** qdecd_pat_1_u64_untied: ++** movprfx z0, z1 ++** uqdecd z0\.d, pow2 ++** ret 
++*/ ++TEST_UNIFORM_Z (qdecd_pat_1_u64_untied, svuint64_t, ++ z0 = svqdecd_pat_u64 (z1, SV_POW2, 1), ++ z0 = svqdecd_pat (z1, SV_POW2, 1)) ++ ++/* ++** qdecd_pat_2_u64: ++** uqdecd z0\.d, pow2, mul #2 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_pat_2_u64, svuint64_t, ++ z0 = svqdecd_pat_u64 (z0, SV_POW2, 2), ++ z0 = svqdecd_pat (z0, SV_POW2, 2)) ++ ++/* ++** qdecd_pat_7_u64: ++** uqdecd z0\.d, pow2, mul #7 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_pat_7_u64, svuint64_t, ++ z0 = svqdecd_pat_u64 (z0, SV_POW2, 7), ++ z0 = svqdecd_pat (z0, SV_POW2, 7)) ++ ++/* ++** qdecd_pat_15_u64: ++** uqdecd z0\.d, pow2, mul #15 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_pat_15_u64, svuint64_t, ++ z0 = svqdecd_pat_u64 (z0, SV_POW2, 15), ++ z0 = svqdecd_pat (z0, SV_POW2, 15)) ++ ++/* ++** qdecd_pat_16_u64: ++** uqdecd z0\.d, pow2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_pat_16_u64, svuint64_t, ++ z0 = svqdecd_pat_u64 (z0, SV_POW2, 16), ++ z0 = svqdecd_pat (z0, SV_POW2, 16)) ++ ++/* ++** qdecd_pat_vl1_u64: ++** uqdecd z0\.d, vl1, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_pat_vl1_u64, svuint64_t, ++ z0 = svqdecd_pat_u64 (z0, SV_VL1, 16), ++ z0 = svqdecd_pat (z0, SV_VL1, 16)) ++ ++/* ++** qdecd_pat_vl2_u64: ++** uqdecd z0\.d, vl2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_pat_vl2_u64, svuint64_t, ++ z0 = svqdecd_pat_u64 (z0, SV_VL2, 16), ++ z0 = svqdecd_pat (z0, SV_VL2, 16)) ++ ++/* ++** qdecd_pat_vl3_u64: ++** uqdecd z0\.d, vl3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_pat_vl3_u64, svuint64_t, ++ z0 = svqdecd_pat_u64 (z0, SV_VL3, 16), ++ z0 = svqdecd_pat (z0, SV_VL3, 16)) ++ ++/* ++** qdecd_pat_vl4_u64: ++** uqdecd z0\.d, vl4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_pat_vl4_u64, svuint64_t, ++ z0 = svqdecd_pat_u64 (z0, SV_VL4, 16), ++ z0 = svqdecd_pat (z0, SV_VL4, 16)) ++ ++/* ++** qdecd_pat_vl5_u64: ++** uqdecd z0\.d, vl5, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_pat_vl5_u64, svuint64_t, ++ z0 = svqdecd_pat_u64 (z0, SV_VL5, 16), ++ z0 = svqdecd_pat (z0, SV_VL5, 16)) ++ ++/* ++** qdecd_pat_vl6_u64: ++** uqdecd z0\.d, vl6, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_pat_vl6_u64, svuint64_t, ++ z0 = svqdecd_pat_u64 (z0, SV_VL6, 16), ++ z0 = svqdecd_pat (z0, SV_VL6, 16)) ++ ++/* ++** qdecd_pat_vl7_u64: ++** uqdecd z0\.d, vl7, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_pat_vl7_u64, svuint64_t, ++ z0 = svqdecd_pat_u64 (z0, SV_VL7, 16), ++ z0 = svqdecd_pat (z0, SV_VL7, 16)) ++ ++/* ++** qdecd_pat_vl8_u64: ++** uqdecd z0\.d, vl8, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_pat_vl8_u64, svuint64_t, ++ z0 = svqdecd_pat_u64 (z0, SV_VL8, 16), ++ z0 = svqdecd_pat (z0, SV_VL8, 16)) ++ ++/* ++** qdecd_pat_vl16_u64: ++** uqdecd z0\.d, vl16, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_pat_vl16_u64, svuint64_t, ++ z0 = svqdecd_pat_u64 (z0, SV_VL16, 16), ++ z0 = svqdecd_pat (z0, SV_VL16, 16)) ++ ++/* ++** qdecd_pat_vl32_u64: ++** uqdecd z0\.d, vl32, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_pat_vl32_u64, svuint64_t, ++ z0 = svqdecd_pat_u64 (z0, SV_VL32, 16), ++ z0 = svqdecd_pat (z0, SV_VL32, 16)) ++ ++/* ++** qdecd_pat_vl64_u64: ++** uqdecd z0\.d, vl64, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_pat_vl64_u64, svuint64_t, ++ z0 = svqdecd_pat_u64 (z0, SV_VL64, 16), ++ z0 = svqdecd_pat (z0, SV_VL64, 16)) ++ ++/* ++** qdecd_pat_vl128_u64: ++** uqdecd z0\.d, vl128, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_pat_vl128_u64, svuint64_t, ++ z0 = svqdecd_pat_u64 (z0, SV_VL128, 16), ++ z0 = svqdecd_pat (z0, SV_VL128, 16)) ++ ++/* ++** qdecd_pat_vl256_u64: ++** uqdecd z0\.d, vl256, mul #16 ++** ret ++*/ 
++TEST_UNIFORM_Z (qdecd_pat_vl256_u64, svuint64_t, ++ z0 = svqdecd_pat_u64 (z0, SV_VL256, 16), ++ z0 = svqdecd_pat (z0, SV_VL256, 16)) ++ ++/* ++** qdecd_pat_mul4_u64: ++** uqdecd z0\.d, mul4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_pat_mul4_u64, svuint64_t, ++ z0 = svqdecd_pat_u64 (z0, SV_MUL4, 16), ++ z0 = svqdecd_pat (z0, SV_MUL4, 16)) ++ ++/* ++** qdecd_pat_mul3_u64: ++** uqdecd z0\.d, mul3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_pat_mul3_u64, svuint64_t, ++ z0 = svqdecd_pat_u64 (z0, SV_MUL3, 16), ++ z0 = svqdecd_pat (z0, SV_MUL3, 16)) ++ ++/* ++** qdecd_pat_all_u64: ++** uqdecd z0\.d, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_pat_all_u64, svuint64_t, ++ z0 = svqdecd_pat_u64 (z0, SV_ALL, 16), ++ z0 = svqdecd_pat (z0, SV_ALL, 16)) ++ ++/* ++** qdecd_pat_n_1_u64_tied: ++** uqdecd x0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_1_u64_tied, uint64_t, ++ x0 = svqdecd_pat_n_u64 (x0, SV_POW2, 1), ++ x0 = svqdecd_pat (x0, SV_POW2, 1)) ++ ++/* ++** qdecd_pat_n_1_u64_untied: ++** mov x0, x1 ++** uqdecd x0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_1_u64_untied, uint64_t, ++ x0 = svqdecd_pat_n_u64 (x1, SV_POW2, 1), ++ x0 = svqdecd_pat (x1, SV_POW2, 1)) ++ ++/* ++** qdecd_pat_n_2_u64: ++** uqdecd x0, pow2, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_2_u64, uint64_t, ++ x0 = svqdecd_pat_n_u64 (x0, SV_POW2, 2), ++ x0 = svqdecd_pat (x0, SV_POW2, 2)) ++ ++/* ++** qdecd_pat_n_7_u64: ++** uqdecd x0, pow2, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_7_u64, uint64_t, ++ x0 = svqdecd_pat_n_u64 (x0, SV_POW2, 7), ++ x0 = svqdecd_pat (x0, SV_POW2, 7)) ++ ++/* ++** qdecd_pat_n_15_u64: ++** uqdecd x0, pow2, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_15_u64, uint64_t, ++ x0 = svqdecd_pat_n_u64 (x0, SV_POW2, 15), ++ x0 = svqdecd_pat (x0, SV_POW2, 15)) ++ ++/* ++** qdecd_pat_n_16_u64: ++** uqdecd x0, pow2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_16_u64, uint64_t, ++ x0 = svqdecd_pat_n_u64 (x0, SV_POW2, 16), ++ x0 = svqdecd_pat (x0, SV_POW2, 16)) ++ ++/* ++** qdecd_pat_n_vl1_u64: ++** uqdecd x0, vl1, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl1_u64, uint64_t, ++ x0 = svqdecd_pat_n_u64 (x0, SV_VL1, 16), ++ x0 = svqdecd_pat (x0, SV_VL1, 16)) ++ ++/* ++** qdecd_pat_n_vl2_u64: ++** uqdecd x0, vl2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl2_u64, uint64_t, ++ x0 = svqdecd_pat_n_u64 (x0, SV_VL2, 16), ++ x0 = svqdecd_pat (x0, SV_VL2, 16)) ++ ++/* ++** qdecd_pat_n_vl3_u64: ++** uqdecd x0, vl3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl3_u64, uint64_t, ++ x0 = svqdecd_pat_n_u64 (x0, SV_VL3, 16), ++ x0 = svqdecd_pat (x0, SV_VL3, 16)) ++ ++/* ++** qdecd_pat_n_vl4_u64: ++** uqdecd x0, vl4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl4_u64, uint64_t, ++ x0 = svqdecd_pat_n_u64 (x0, SV_VL4, 16), ++ x0 = svqdecd_pat (x0, SV_VL4, 16)) ++ ++/* ++** qdecd_pat_n_vl5_u64: ++** uqdecd x0, vl5, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl5_u64, uint64_t, ++ x0 = svqdecd_pat_n_u64 (x0, SV_VL5, 16), ++ x0 = svqdecd_pat (x0, SV_VL5, 16)) ++ ++/* ++** qdecd_pat_n_vl6_u64: ++** uqdecd x0, vl6, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl6_u64, uint64_t, ++ x0 = svqdecd_pat_n_u64 (x0, SV_VL6, 16), ++ x0 = svqdecd_pat (x0, SV_VL6, 16)) ++ ++/* ++** qdecd_pat_n_vl7_u64: ++** uqdecd x0, vl7, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl7_u64, uint64_t, ++ x0 = svqdecd_pat_n_u64 (x0, SV_VL7, 16), ++ x0 = svqdecd_pat (x0, SV_VL7, 16)) ++ ++/* ++** qdecd_pat_n_vl8_u64: ++** uqdecd x0, vl8, mul #16 ++** ret 
++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl8_u64, uint64_t, ++ x0 = svqdecd_pat_n_u64 (x0, SV_VL8, 16), ++ x0 = svqdecd_pat (x0, SV_VL8, 16)) ++ ++/* ++** qdecd_pat_n_vl16_u64: ++** uqdecd x0, vl16, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl16_u64, uint64_t, ++ x0 = svqdecd_pat_n_u64 (x0, SV_VL16, 16), ++ x0 = svqdecd_pat (x0, SV_VL16, 16)) ++ ++/* ++** qdecd_pat_n_vl32_u64: ++** uqdecd x0, vl32, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl32_u64, uint64_t, ++ x0 = svqdecd_pat_n_u64 (x0, SV_VL32, 16), ++ x0 = svqdecd_pat (x0, SV_VL32, 16)) ++ ++/* ++** qdecd_pat_n_vl64_u64: ++** uqdecd x0, vl64, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl64_u64, uint64_t, ++ x0 = svqdecd_pat_n_u64 (x0, SV_VL64, 16), ++ x0 = svqdecd_pat (x0, SV_VL64, 16)) ++ ++/* ++** qdecd_pat_n_vl128_u64: ++** uqdecd x0, vl128, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl128_u64, uint64_t, ++ x0 = svqdecd_pat_n_u64 (x0, SV_VL128, 16), ++ x0 = svqdecd_pat (x0, SV_VL128, 16)) ++ ++/* ++** qdecd_pat_n_vl256_u64: ++** uqdecd x0, vl256, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_vl256_u64, uint64_t, ++ x0 = svqdecd_pat_n_u64 (x0, SV_VL256, 16), ++ x0 = svqdecd_pat (x0, SV_VL256, 16)) ++ ++/* ++** qdecd_pat_n_mul4_u64: ++** uqdecd x0, mul4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_mul4_u64, uint64_t, ++ x0 = svqdecd_pat_n_u64 (x0, SV_MUL4, 16), ++ x0 = svqdecd_pat (x0, SV_MUL4, 16)) ++ ++/* ++** qdecd_pat_n_mul3_u64: ++** uqdecd x0, mul3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_mul3_u64, uint64_t, ++ x0 = svqdecd_pat_n_u64 (x0, SV_MUL3, 16), ++ x0 = svqdecd_pat (x0, SV_MUL3, 16)) ++ ++/* ++** qdecd_pat_n_all_u64: ++** uqdecd x0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_pat_n_all_u64, uint64_t, ++ x0 = svqdecd_pat_n_u64 (x0, SV_ALL, 16), ++ x0 = svqdecd_pat (x0, SV_ALL, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecd_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecd_s32.c +new file mode 100644 +index 000000000..1912ed53f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecd_s32.c +@@ -0,0 +1,58 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qdecd_n_1_s32_tied: ++** sqdecd x0, w0 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_n_1_s32_tied, int32_t, ++ x0 = svqdecd_n_s32 (x0, 1), ++ x0 = svqdecd (x0, 1)) ++ ++/* ++** qdecd_n_1_s32_untied: ++** mov w0, w1 ++** sqdecd x0, w0 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_n_1_s32_untied, int32_t, ++ x0 = svqdecd_n_s32 (x1, 1), ++ x0 = svqdecd (x1, 1)) ++ ++/* ++** qdecd_n_2_s32: ++** sqdecd x0, w0, all, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_n_2_s32, int32_t, ++ x0 = svqdecd_n_s32 (x0, 2), ++ x0 = svqdecd (x0, 2)) ++ ++/* ++** qdecd_n_7_s32: ++** sqdecd x0, w0, all, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_n_7_s32, int32_t, ++ x0 = svqdecd_n_s32 (x0, 7), ++ x0 = svqdecd (x0, 7)) ++ ++/* ++** qdecd_n_15_s32: ++** sqdecd x0, w0, all, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_n_15_s32, int32_t, ++ x0 = svqdecd_n_s32 (x0, 15), ++ x0 = svqdecd (x0, 15)) ++ ++/* ++** qdecd_n_16_s32: ++** sqdecd x0, w0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_n_16_s32, int32_t, ++ x0 = svqdecd_n_s32 (x0, 16), ++ x0 = svqdecd (x0, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecd_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecd_s64.c +new file mode 100644 +index 000000000..bd113fc66 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecd_s64.c +@@ -0,0 
+1,113 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qdecd_1_s64_tied: ++** sqdecd z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_1_s64_tied, svint64_t, ++ z0 = svqdecd_s64 (z0, 1), ++ z0 = svqdecd (z0, 1)) ++ ++/* ++** qdecd_1_s64_untied: ++** movprfx z0, z1 ++** sqdecd z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_1_s64_untied, svint64_t, ++ z0 = svqdecd_s64 (z1, 1), ++ z0 = svqdecd (z1, 1)) ++ ++/* ++** qdecd_2_s64: ++** sqdecd z0\.d, all, mul #2 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_2_s64, svint64_t, ++ z0 = svqdecd_s64 (z0, 2), ++ z0 = svqdecd (z0, 2)) ++ ++/* ++** qdecd_7_s64: ++** sqdecd z0\.d, all, mul #7 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_7_s64, svint64_t, ++ z0 = svqdecd_s64 (z0, 7), ++ z0 = svqdecd (z0, 7)) ++ ++/* ++** qdecd_15_s64: ++** sqdecd z0\.d, all, mul #15 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_15_s64, svint64_t, ++ z0 = svqdecd_s64 (z0, 15), ++ z0 = svqdecd (z0, 15)) ++ ++/* ++** qdecd_16_s64: ++** sqdecd z0\.d, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_16_s64, svint64_t, ++ z0 = svqdecd_s64 (z0, 16), ++ z0 = svqdecd (z0, 16)) ++ ++/* ++** qdecd_n_1_s64_tied: ++** sqdecd x0 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_n_1_s64_tied, int64_t, ++ x0 = svqdecd_n_s64 (x0, 1), ++ x0 = svqdecd (x0, 1)) ++ ++/* ++** qdecd_n_1_s64_untied: ++** mov x0, x1 ++** sqdecd x0 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_n_1_s64_untied, int64_t, ++ x0 = svqdecd_n_s64 (x1, 1), ++ x0 = svqdecd (x1, 1)) ++ ++/* ++** qdecd_n_2_s64: ++** sqdecd x0, all, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_n_2_s64, int64_t, ++ x0 = svqdecd_n_s64 (x0, 2), ++ x0 = svqdecd (x0, 2)) ++ ++/* ++** qdecd_n_7_s64: ++** sqdecd x0, all, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_n_7_s64, int64_t, ++ x0 = svqdecd_n_s64 (x0, 7), ++ x0 = svqdecd (x0, 7)) ++ ++/* ++** qdecd_n_15_s64: ++** sqdecd x0, all, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_n_15_s64, int64_t, ++ x0 = svqdecd_n_s64 (x0, 15), ++ x0 = svqdecd (x0, 15)) ++ ++/* ++** qdecd_n_16_s64: ++** sqdecd x0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_n_16_s64, int64_t, ++ x0 = svqdecd_n_s64 (x0, 16), ++ x0 = svqdecd (x0, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecd_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecd_u32.c +new file mode 100644 +index 000000000..a672dc215 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecd_u32.c +@@ -0,0 +1,58 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qdecd_n_1_u32_tied: ++** uqdecd w0 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_n_1_u32_tied, uint32_t, ++ x0 = svqdecd_n_u32 (x0, 1), ++ x0 = svqdecd (x0, 1)) ++ ++/* ++** qdecd_n_1_u32_untied: ++** mov w0, w1 ++** uqdecd w0 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_n_1_u32_untied, uint32_t, ++ x0 = svqdecd_n_u32 (x1, 1), ++ x0 = svqdecd (x1, 1)) ++ ++/* ++** qdecd_n_2_u32: ++** uqdecd w0, all, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_n_2_u32, uint32_t, ++ x0 = svqdecd_n_u32 (x0, 2), ++ x0 = svqdecd (x0, 2)) ++ ++/* ++** qdecd_n_7_u32: ++** uqdecd w0, all, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_n_7_u32, uint32_t, ++ x0 = svqdecd_n_u32 (x0, 7), ++ x0 = svqdecd (x0, 7)) ++ ++/* ++** qdecd_n_15_u32: ++** uqdecd w0, all, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_n_15_u32, uint32_t, ++ x0 = svqdecd_n_u32 (x0, 15), ++ x0 = svqdecd (x0, 15)) ++ ++/* ++** qdecd_n_16_u32: ++** uqdecd w0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_n_16_u32, uint32_t, ++ x0 = 
svqdecd_n_u32 (x0, 16), ++ x0 = svqdecd (x0, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecd_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecd_u64.c +new file mode 100644 +index 000000000..fca8868f3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecd_u64.c +@@ -0,0 +1,113 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qdecd_1_u64_tied: ++** uqdecd z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_1_u64_tied, svuint64_t, ++ z0 = svqdecd_u64 (z0, 1), ++ z0 = svqdecd (z0, 1)) ++ ++/* ++** qdecd_1_u64_untied: ++** movprfx z0, z1 ++** uqdecd z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_1_u64_untied, svuint64_t, ++ z0 = svqdecd_u64 (z1, 1), ++ z0 = svqdecd (z1, 1)) ++ ++/* ++** qdecd_2_u64: ++** uqdecd z0\.d, all, mul #2 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_2_u64, svuint64_t, ++ z0 = svqdecd_u64 (z0, 2), ++ z0 = svqdecd (z0, 2)) ++ ++/* ++** qdecd_7_u64: ++** uqdecd z0\.d, all, mul #7 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_7_u64, svuint64_t, ++ z0 = svqdecd_u64 (z0, 7), ++ z0 = svqdecd (z0, 7)) ++ ++/* ++** qdecd_15_u64: ++** uqdecd z0\.d, all, mul #15 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_15_u64, svuint64_t, ++ z0 = svqdecd_u64 (z0, 15), ++ z0 = svqdecd (z0, 15)) ++ ++/* ++** qdecd_16_u64: ++** uqdecd z0\.d, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecd_16_u64, svuint64_t, ++ z0 = svqdecd_u64 (z0, 16), ++ z0 = svqdecd (z0, 16)) ++ ++/* ++** qdecd_n_1_u64_tied: ++** uqdecd x0 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_n_1_u64_tied, uint64_t, ++ x0 = svqdecd_n_u64 (x0, 1), ++ x0 = svqdecd (x0, 1)) ++ ++/* ++** qdecd_n_1_u64_untied: ++** mov x0, x1 ++** uqdecd x0 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_n_1_u64_untied, uint64_t, ++ x0 = svqdecd_n_u64 (x1, 1), ++ x0 = svqdecd (x1, 1)) ++ ++/* ++** qdecd_n_2_u64: ++** uqdecd x0, all, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_n_2_u64, uint64_t, ++ x0 = svqdecd_n_u64 (x0, 2), ++ x0 = svqdecd (x0, 2)) ++ ++/* ++** qdecd_n_7_u64: ++** uqdecd x0, all, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_n_7_u64, uint64_t, ++ x0 = svqdecd_n_u64 (x0, 7), ++ x0 = svqdecd (x0, 7)) ++ ++/* ++** qdecd_n_15_u64: ++** uqdecd x0, all, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_n_15_u64, uint64_t, ++ x0 = svqdecd_n_u64 (x0, 15), ++ x0 = svqdecd (x0, 15)) ++ ++/* ++** qdecd_n_16_u64: ++** uqdecd x0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecd_n_16_u64, uint64_t, ++ x0 = svqdecd_n_u64 (x0, 16), ++ x0 = svqdecd (x0, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_pat_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_pat_s16.c +new file mode 100644 +index 000000000..c084043f1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_pat_s16.c +@@ -0,0 +1,202 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qdech_pat_1_s16_tied: ++** sqdech z0\.h, pow2 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_pat_1_s16_tied, svint16_t, ++ z0 = svqdech_pat_s16 (z0, SV_POW2, 1), ++ z0 = svqdech_pat (z0, SV_POW2, 1)) ++ ++/* ++** qdech_pat_1_s16_untied: ++** movprfx z0, z1 ++** sqdech z0\.h, pow2 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_pat_1_s16_untied, svint16_t, ++ z0 = svqdech_pat_s16 (z1, SV_POW2, 1), ++ z0 = svqdech_pat (z1, SV_POW2, 1)) ++ ++/* ++** qdech_pat_2_s16: ++** sqdech z0\.h, pow2, mul #2 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_pat_2_s16, svint16_t, ++ z0 = svqdech_pat_s16 (z0, SV_POW2, 2), ++ z0 = svqdech_pat (z0, SV_POW2, 2)) ++ ++/* ++** 
qdech_pat_7_s16: ++** sqdech z0\.h, pow2, mul #7 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_pat_7_s16, svint16_t, ++ z0 = svqdech_pat_s16 (z0, SV_POW2, 7), ++ z0 = svqdech_pat (z0, SV_POW2, 7)) ++ ++/* ++** qdech_pat_15_s16: ++** sqdech z0\.h, pow2, mul #15 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_pat_15_s16, svint16_t, ++ z0 = svqdech_pat_s16 (z0, SV_POW2, 15), ++ z0 = svqdech_pat (z0, SV_POW2, 15)) ++ ++/* ++** qdech_pat_16_s16: ++** sqdech z0\.h, pow2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_pat_16_s16, svint16_t, ++ z0 = svqdech_pat_s16 (z0, SV_POW2, 16), ++ z0 = svqdech_pat (z0, SV_POW2, 16)) ++ ++/* ++** qdech_pat_vl1_s16: ++** sqdech z0\.h, vl1, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_pat_vl1_s16, svint16_t, ++ z0 = svqdech_pat_s16 (z0, SV_VL1, 16), ++ z0 = svqdech_pat (z0, SV_VL1, 16)) ++ ++/* ++** qdech_pat_vl2_s16: ++** sqdech z0\.h, vl2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_pat_vl2_s16, svint16_t, ++ z0 = svqdech_pat_s16 (z0, SV_VL2, 16), ++ z0 = svqdech_pat (z0, SV_VL2, 16)) ++ ++/* ++** qdech_pat_vl3_s16: ++** sqdech z0\.h, vl3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_pat_vl3_s16, svint16_t, ++ z0 = svqdech_pat_s16 (z0, SV_VL3, 16), ++ z0 = svqdech_pat (z0, SV_VL3, 16)) ++ ++/* ++** qdech_pat_vl4_s16: ++** sqdech z0\.h, vl4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_pat_vl4_s16, svint16_t, ++ z0 = svqdech_pat_s16 (z0, SV_VL4, 16), ++ z0 = svqdech_pat (z0, SV_VL4, 16)) ++ ++/* ++** qdech_pat_vl5_s16: ++** sqdech z0\.h, vl5, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_pat_vl5_s16, svint16_t, ++ z0 = svqdech_pat_s16 (z0, SV_VL5, 16), ++ z0 = svqdech_pat (z0, SV_VL5, 16)) ++ ++/* ++** qdech_pat_vl6_s16: ++** sqdech z0\.h, vl6, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_pat_vl6_s16, svint16_t, ++ z0 = svqdech_pat_s16 (z0, SV_VL6, 16), ++ z0 = svqdech_pat (z0, SV_VL6, 16)) ++ ++/* ++** qdech_pat_vl7_s16: ++** sqdech z0\.h, vl7, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_pat_vl7_s16, svint16_t, ++ z0 = svqdech_pat_s16 (z0, SV_VL7, 16), ++ z0 = svqdech_pat (z0, SV_VL7, 16)) ++ ++/* ++** qdech_pat_vl8_s16: ++** sqdech z0\.h, vl8, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_pat_vl8_s16, svint16_t, ++ z0 = svqdech_pat_s16 (z0, SV_VL8, 16), ++ z0 = svqdech_pat (z0, SV_VL8, 16)) ++ ++/* ++** qdech_pat_vl16_s16: ++** sqdech z0\.h, vl16, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_pat_vl16_s16, svint16_t, ++ z0 = svqdech_pat_s16 (z0, SV_VL16, 16), ++ z0 = svqdech_pat (z0, SV_VL16, 16)) ++ ++/* ++** qdech_pat_vl32_s16: ++** sqdech z0\.h, vl32, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_pat_vl32_s16, svint16_t, ++ z0 = svqdech_pat_s16 (z0, SV_VL32, 16), ++ z0 = svqdech_pat (z0, SV_VL32, 16)) ++ ++/* ++** qdech_pat_vl64_s16: ++** sqdech z0\.h, vl64, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_pat_vl64_s16, svint16_t, ++ z0 = svqdech_pat_s16 (z0, SV_VL64, 16), ++ z0 = svqdech_pat (z0, SV_VL64, 16)) ++ ++/* ++** qdech_pat_vl128_s16: ++** sqdech z0\.h, vl128, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_pat_vl128_s16, svint16_t, ++ z0 = svqdech_pat_s16 (z0, SV_VL128, 16), ++ z0 = svqdech_pat (z0, SV_VL128, 16)) ++ ++/* ++** qdech_pat_vl256_s16: ++** sqdech z0\.h, vl256, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_pat_vl256_s16, svint16_t, ++ z0 = svqdech_pat_s16 (z0, SV_VL256, 16), ++ z0 = svqdech_pat (z0, SV_VL256, 16)) ++ ++/* ++** qdech_pat_mul4_s16: ++** sqdech z0\.h, mul4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_pat_mul4_s16, svint16_t, ++ z0 = svqdech_pat_s16 (z0, SV_MUL4, 16), ++ z0 = svqdech_pat (z0, SV_MUL4, 16)) ++ ++/* ++** 
qdech_pat_mul3_s16: ++** sqdech z0\.h, mul3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_pat_mul3_s16, svint16_t, ++ z0 = svqdech_pat_s16 (z0, SV_MUL3, 16), ++ z0 = svqdech_pat (z0, SV_MUL3, 16)) ++ ++/* ++** qdech_pat_all_s16: ++** sqdech z0\.h, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_pat_all_s16, svint16_t, ++ z0 = svqdech_pat_s16 (z0, SV_ALL, 16), ++ z0 = svqdech_pat (z0, SV_ALL, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_pat_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_pat_s32.c +new file mode 100644 +index 000000000..b56306db7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_pat_s32.c +@@ -0,0 +1,202 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qdech_pat_n_1_s32_tied: ++** sqdech x0, w0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_1_s32_tied, int32_t, ++ x0 = svqdech_pat_n_s32 (x0, SV_POW2, 1), ++ x0 = svqdech_pat (x0, SV_POW2, 1)) ++ ++/* ++** qdech_pat_n_1_s32_untied: ++** mov w0, w1 ++** sqdech x0, w0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_1_s32_untied, int32_t, ++ x0 = svqdech_pat_n_s32 (x1, SV_POW2, 1), ++ x0 = svqdech_pat (x1, SV_POW2, 1)) ++ ++/* ++** qdech_pat_n_2_s32: ++** sqdech x0, w0, pow2, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_2_s32, int32_t, ++ x0 = svqdech_pat_n_s32 (x0, SV_POW2, 2), ++ x0 = svqdech_pat (x0, SV_POW2, 2)) ++ ++/* ++** qdech_pat_n_7_s32: ++** sqdech x0, w0, pow2, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_7_s32, int32_t, ++ x0 = svqdech_pat_n_s32 (x0, SV_POW2, 7), ++ x0 = svqdech_pat (x0, SV_POW2, 7)) ++ ++/* ++** qdech_pat_n_15_s32: ++** sqdech x0, w0, pow2, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_15_s32, int32_t, ++ x0 = svqdech_pat_n_s32 (x0, SV_POW2, 15), ++ x0 = svqdech_pat (x0, SV_POW2, 15)) ++ ++/* ++** qdech_pat_n_16_s32: ++** sqdech x0, w0, pow2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_16_s32, int32_t, ++ x0 = svqdech_pat_n_s32 (x0, SV_POW2, 16), ++ x0 = svqdech_pat (x0, SV_POW2, 16)) ++ ++/* ++** qdech_pat_n_vl1_s32: ++** sqdech x0, w0, vl1, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl1_s32, int32_t, ++ x0 = svqdech_pat_n_s32 (x0, SV_VL1, 16), ++ x0 = svqdech_pat (x0, SV_VL1, 16)) ++ ++/* ++** qdech_pat_n_vl2_s32: ++** sqdech x0, w0, vl2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl2_s32, int32_t, ++ x0 = svqdech_pat_n_s32 (x0, SV_VL2, 16), ++ x0 = svqdech_pat (x0, SV_VL2, 16)) ++ ++/* ++** qdech_pat_n_vl3_s32: ++** sqdech x0, w0, vl3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl3_s32, int32_t, ++ x0 = svqdech_pat_n_s32 (x0, SV_VL3, 16), ++ x0 = svqdech_pat (x0, SV_VL3, 16)) ++ ++/* ++** qdech_pat_n_vl4_s32: ++** sqdech x0, w0, vl4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl4_s32, int32_t, ++ x0 = svqdech_pat_n_s32 (x0, SV_VL4, 16), ++ x0 = svqdech_pat (x0, SV_VL4, 16)) ++ ++/* ++** qdech_pat_n_vl5_s32: ++** sqdech x0, w0, vl5, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl5_s32, int32_t, ++ x0 = svqdech_pat_n_s32 (x0, SV_VL5, 16), ++ x0 = svqdech_pat (x0, SV_VL5, 16)) ++ ++/* ++** qdech_pat_n_vl6_s32: ++** sqdech x0, w0, vl6, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl6_s32, int32_t, ++ x0 = svqdech_pat_n_s32 (x0, SV_VL6, 16), ++ x0 = svqdech_pat (x0, SV_VL6, 16)) ++ ++/* ++** qdech_pat_n_vl7_s32: ++** sqdech x0, w0, vl7, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl7_s32, int32_t, ++ x0 = svqdech_pat_n_s32 (x0, SV_VL7, 16), ++ x0 = svqdech_pat (x0, SV_VL7, 16)) ++ 
++/* ++** qdech_pat_n_vl8_s32: ++** sqdech x0, w0, vl8, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl8_s32, int32_t, ++ x0 = svqdech_pat_n_s32 (x0, SV_VL8, 16), ++ x0 = svqdech_pat (x0, SV_VL8, 16)) ++ ++/* ++** qdech_pat_n_vl16_s32: ++** sqdech x0, w0, vl16, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl16_s32, int32_t, ++ x0 = svqdech_pat_n_s32 (x0, SV_VL16, 16), ++ x0 = svqdech_pat (x0, SV_VL16, 16)) ++ ++/* ++** qdech_pat_n_vl32_s32: ++** sqdech x0, w0, vl32, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl32_s32, int32_t, ++ x0 = svqdech_pat_n_s32 (x0, SV_VL32, 16), ++ x0 = svqdech_pat (x0, SV_VL32, 16)) ++ ++/* ++** qdech_pat_n_vl64_s32: ++** sqdech x0, w0, vl64, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl64_s32, int32_t, ++ x0 = svqdech_pat_n_s32 (x0, SV_VL64, 16), ++ x0 = svqdech_pat (x0, SV_VL64, 16)) ++ ++/* ++** qdech_pat_n_vl128_s32: ++** sqdech x0, w0, vl128, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl128_s32, int32_t, ++ x0 = svqdech_pat_n_s32 (x0, SV_VL128, 16), ++ x0 = svqdech_pat (x0, SV_VL128, 16)) ++ ++/* ++** qdech_pat_n_vl256_s32: ++** sqdech x0, w0, vl256, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl256_s32, int32_t, ++ x0 = svqdech_pat_n_s32 (x0, SV_VL256, 16), ++ x0 = svqdech_pat (x0, SV_VL256, 16)) ++ ++/* ++** qdech_pat_n_mul4_s32: ++** sqdech x0, w0, mul4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_mul4_s32, int32_t, ++ x0 = svqdech_pat_n_s32 (x0, SV_MUL4, 16), ++ x0 = svqdech_pat (x0, SV_MUL4, 16)) ++ ++/* ++** qdech_pat_n_mul3_s32: ++** sqdech x0, w0, mul3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_mul3_s32, int32_t, ++ x0 = svqdech_pat_n_s32 (x0, SV_MUL3, 16), ++ x0 = svqdech_pat (x0, SV_MUL3, 16)) ++ ++/* ++** qdech_pat_n_all_s32: ++** sqdech x0, w0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_all_s32, int32_t, ++ x0 = svqdech_pat_n_s32 (x0, SV_ALL, 16), ++ x0 = svqdech_pat (x0, SV_ALL, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_pat_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_pat_s64.c +new file mode 100644 +index 000000000..591658f54 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_pat_s64.c +@@ -0,0 +1,202 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qdech_pat_n_1_s64_tied: ++** sqdech x0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_1_s64_tied, int64_t, ++ x0 = svqdech_pat_n_s64 (x0, SV_POW2, 1), ++ x0 = svqdech_pat (x0, SV_POW2, 1)) ++ ++/* ++** qdech_pat_n_1_s64_untied: ++** mov x0, x1 ++** sqdech x0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_1_s64_untied, int64_t, ++ x0 = svqdech_pat_n_s64 (x1, SV_POW2, 1), ++ x0 = svqdech_pat (x1, SV_POW2, 1)) ++ ++/* ++** qdech_pat_n_2_s64: ++** sqdech x0, pow2, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_2_s64, int64_t, ++ x0 = svqdech_pat_n_s64 (x0, SV_POW2, 2), ++ x0 = svqdech_pat (x0, SV_POW2, 2)) ++ ++/* ++** qdech_pat_n_7_s64: ++** sqdech x0, pow2, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_7_s64, int64_t, ++ x0 = svqdech_pat_n_s64 (x0, SV_POW2, 7), ++ x0 = svqdech_pat (x0, SV_POW2, 7)) ++ ++/* ++** qdech_pat_n_15_s64: ++** sqdech x0, pow2, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_15_s64, int64_t, ++ x0 = svqdech_pat_n_s64 (x0, SV_POW2, 15), ++ x0 = svqdech_pat (x0, SV_POW2, 15)) ++ ++/* ++** qdech_pat_n_16_s64: ++** sqdech x0, pow2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_16_s64, int64_t, ++ x0 = svqdech_pat_n_s64 (x0, SV_POW2, 16), ++ x0 
= svqdech_pat (x0, SV_POW2, 16)) ++ ++/* ++** qdech_pat_n_vl1_s64: ++** sqdech x0, vl1, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl1_s64, int64_t, ++ x0 = svqdech_pat_n_s64 (x0, SV_VL1, 16), ++ x0 = svqdech_pat (x0, SV_VL1, 16)) ++ ++/* ++** qdech_pat_n_vl2_s64: ++** sqdech x0, vl2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl2_s64, int64_t, ++ x0 = svqdech_pat_n_s64 (x0, SV_VL2, 16), ++ x0 = svqdech_pat (x0, SV_VL2, 16)) ++ ++/* ++** qdech_pat_n_vl3_s64: ++** sqdech x0, vl3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl3_s64, int64_t, ++ x0 = svqdech_pat_n_s64 (x0, SV_VL3, 16), ++ x0 = svqdech_pat (x0, SV_VL3, 16)) ++ ++/* ++** qdech_pat_n_vl4_s64: ++** sqdech x0, vl4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl4_s64, int64_t, ++ x0 = svqdech_pat_n_s64 (x0, SV_VL4, 16), ++ x0 = svqdech_pat (x0, SV_VL4, 16)) ++ ++/* ++** qdech_pat_n_vl5_s64: ++** sqdech x0, vl5, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl5_s64, int64_t, ++ x0 = svqdech_pat_n_s64 (x0, SV_VL5, 16), ++ x0 = svqdech_pat (x0, SV_VL5, 16)) ++ ++/* ++** qdech_pat_n_vl6_s64: ++** sqdech x0, vl6, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl6_s64, int64_t, ++ x0 = svqdech_pat_n_s64 (x0, SV_VL6, 16), ++ x0 = svqdech_pat (x0, SV_VL6, 16)) ++ ++/* ++** qdech_pat_n_vl7_s64: ++** sqdech x0, vl7, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl7_s64, int64_t, ++ x0 = svqdech_pat_n_s64 (x0, SV_VL7, 16), ++ x0 = svqdech_pat (x0, SV_VL7, 16)) ++ ++/* ++** qdech_pat_n_vl8_s64: ++** sqdech x0, vl8, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl8_s64, int64_t, ++ x0 = svqdech_pat_n_s64 (x0, SV_VL8, 16), ++ x0 = svqdech_pat (x0, SV_VL8, 16)) ++ ++/* ++** qdech_pat_n_vl16_s64: ++** sqdech x0, vl16, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl16_s64, int64_t, ++ x0 = svqdech_pat_n_s64 (x0, SV_VL16, 16), ++ x0 = svqdech_pat (x0, SV_VL16, 16)) ++ ++/* ++** qdech_pat_n_vl32_s64: ++** sqdech x0, vl32, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl32_s64, int64_t, ++ x0 = svqdech_pat_n_s64 (x0, SV_VL32, 16), ++ x0 = svqdech_pat (x0, SV_VL32, 16)) ++ ++/* ++** qdech_pat_n_vl64_s64: ++** sqdech x0, vl64, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl64_s64, int64_t, ++ x0 = svqdech_pat_n_s64 (x0, SV_VL64, 16), ++ x0 = svqdech_pat (x0, SV_VL64, 16)) ++ ++/* ++** qdech_pat_n_vl128_s64: ++** sqdech x0, vl128, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl128_s64, int64_t, ++ x0 = svqdech_pat_n_s64 (x0, SV_VL128, 16), ++ x0 = svqdech_pat (x0, SV_VL128, 16)) ++ ++/* ++** qdech_pat_n_vl256_s64: ++** sqdech x0, vl256, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl256_s64, int64_t, ++ x0 = svqdech_pat_n_s64 (x0, SV_VL256, 16), ++ x0 = svqdech_pat (x0, SV_VL256, 16)) ++ ++/* ++** qdech_pat_n_mul4_s64: ++** sqdech x0, mul4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_mul4_s64, int64_t, ++ x0 = svqdech_pat_n_s64 (x0, SV_MUL4, 16), ++ x0 = svqdech_pat (x0, SV_MUL4, 16)) ++ ++/* ++** qdech_pat_n_mul3_s64: ++** sqdech x0, mul3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_mul3_s64, int64_t, ++ x0 = svqdech_pat_n_s64 (x0, SV_MUL3, 16), ++ x0 = svqdech_pat (x0, SV_MUL3, 16)) ++ ++/* ++** qdech_pat_n_all_s64: ++** sqdech x0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_all_s64, int64_t, ++ x0 = svqdech_pat_n_s64 (x0, SV_ALL, 16), ++ x0 = svqdech_pat (x0, SV_ALL, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_pat_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_pat_u16.c +new file mode 100644 
+index 000000000..ce0b5f3e8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_pat_u16.c +@@ -0,0 +1,202 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qdech_pat_1_u16_tied: ++** uqdech z0\.h, pow2 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_pat_1_u16_tied, svuint16_t, ++ z0 = svqdech_pat_u16 (z0, SV_POW2, 1), ++ z0 = svqdech_pat (z0, SV_POW2, 1)) ++ ++/* ++** qdech_pat_1_u16_untied: ++** movprfx z0, z1 ++** uqdech z0\.h, pow2 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_pat_1_u16_untied, svuint16_t, ++ z0 = svqdech_pat_u16 (z1, SV_POW2, 1), ++ z0 = svqdech_pat (z1, SV_POW2, 1)) ++ ++/* ++** qdech_pat_2_u16: ++** uqdech z0\.h, pow2, mul #2 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_pat_2_u16, svuint16_t, ++ z0 = svqdech_pat_u16 (z0, SV_POW2, 2), ++ z0 = svqdech_pat (z0, SV_POW2, 2)) ++ ++/* ++** qdech_pat_7_u16: ++** uqdech z0\.h, pow2, mul #7 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_pat_7_u16, svuint16_t, ++ z0 = svqdech_pat_u16 (z0, SV_POW2, 7), ++ z0 = svqdech_pat (z0, SV_POW2, 7)) ++ ++/* ++** qdech_pat_15_u16: ++** uqdech z0\.h, pow2, mul #15 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_pat_15_u16, svuint16_t, ++ z0 = svqdech_pat_u16 (z0, SV_POW2, 15), ++ z0 = svqdech_pat (z0, SV_POW2, 15)) ++ ++/* ++** qdech_pat_16_u16: ++** uqdech z0\.h, pow2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_pat_16_u16, svuint16_t, ++ z0 = svqdech_pat_u16 (z0, SV_POW2, 16), ++ z0 = svqdech_pat (z0, SV_POW2, 16)) ++ ++/* ++** qdech_pat_vl1_u16: ++** uqdech z0\.h, vl1, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_pat_vl1_u16, svuint16_t, ++ z0 = svqdech_pat_u16 (z0, SV_VL1, 16), ++ z0 = svqdech_pat (z0, SV_VL1, 16)) ++ ++/* ++** qdech_pat_vl2_u16: ++** uqdech z0\.h, vl2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_pat_vl2_u16, svuint16_t, ++ z0 = svqdech_pat_u16 (z0, SV_VL2, 16), ++ z0 = svqdech_pat (z0, SV_VL2, 16)) ++ ++/* ++** qdech_pat_vl3_u16: ++** uqdech z0\.h, vl3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_pat_vl3_u16, svuint16_t, ++ z0 = svqdech_pat_u16 (z0, SV_VL3, 16), ++ z0 = svqdech_pat (z0, SV_VL3, 16)) ++ ++/* ++** qdech_pat_vl4_u16: ++** uqdech z0\.h, vl4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_pat_vl4_u16, svuint16_t, ++ z0 = svqdech_pat_u16 (z0, SV_VL4, 16), ++ z0 = svqdech_pat (z0, SV_VL4, 16)) ++ ++/* ++** qdech_pat_vl5_u16: ++** uqdech z0\.h, vl5, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_pat_vl5_u16, svuint16_t, ++ z0 = svqdech_pat_u16 (z0, SV_VL5, 16), ++ z0 = svqdech_pat (z0, SV_VL5, 16)) ++ ++/* ++** qdech_pat_vl6_u16: ++** uqdech z0\.h, vl6, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_pat_vl6_u16, svuint16_t, ++ z0 = svqdech_pat_u16 (z0, SV_VL6, 16), ++ z0 = svqdech_pat (z0, SV_VL6, 16)) ++ ++/* ++** qdech_pat_vl7_u16: ++** uqdech z0\.h, vl7, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_pat_vl7_u16, svuint16_t, ++ z0 = svqdech_pat_u16 (z0, SV_VL7, 16), ++ z0 = svqdech_pat (z0, SV_VL7, 16)) ++ ++/* ++** qdech_pat_vl8_u16: ++** uqdech z0\.h, vl8, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_pat_vl8_u16, svuint16_t, ++ z0 = svqdech_pat_u16 (z0, SV_VL8, 16), ++ z0 = svqdech_pat (z0, SV_VL8, 16)) ++ ++/* ++** qdech_pat_vl16_u16: ++** uqdech z0\.h, vl16, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_pat_vl16_u16, svuint16_t, ++ z0 = svqdech_pat_u16 (z0, SV_VL16, 16), ++ z0 = svqdech_pat (z0, SV_VL16, 16)) ++ ++/* ++** qdech_pat_vl32_u16: ++** uqdech z0\.h, vl32, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_pat_vl32_u16, svuint16_t, ++ z0 = svqdech_pat_u16 (z0, SV_VL32, 16), ++ z0 = svqdech_pat (z0, 
SV_VL32, 16)) ++ ++/* ++** qdech_pat_vl64_u16: ++** uqdech z0\.h, vl64, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_pat_vl64_u16, svuint16_t, ++ z0 = svqdech_pat_u16 (z0, SV_VL64, 16), ++ z0 = svqdech_pat (z0, SV_VL64, 16)) ++ ++/* ++** qdech_pat_vl128_u16: ++** uqdech z0\.h, vl128, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_pat_vl128_u16, svuint16_t, ++ z0 = svqdech_pat_u16 (z0, SV_VL128, 16), ++ z0 = svqdech_pat (z0, SV_VL128, 16)) ++ ++/* ++** qdech_pat_vl256_u16: ++** uqdech z0\.h, vl256, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_pat_vl256_u16, svuint16_t, ++ z0 = svqdech_pat_u16 (z0, SV_VL256, 16), ++ z0 = svqdech_pat (z0, SV_VL256, 16)) ++ ++/* ++** qdech_pat_mul4_u16: ++** uqdech z0\.h, mul4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_pat_mul4_u16, svuint16_t, ++ z0 = svqdech_pat_u16 (z0, SV_MUL4, 16), ++ z0 = svqdech_pat (z0, SV_MUL4, 16)) ++ ++/* ++** qdech_pat_mul3_u16: ++** uqdech z0\.h, mul3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_pat_mul3_u16, svuint16_t, ++ z0 = svqdech_pat_u16 (z0, SV_MUL3, 16), ++ z0 = svqdech_pat (z0, SV_MUL3, 16)) ++ ++/* ++** qdech_pat_all_u16: ++** uqdech z0\.h, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_pat_all_u16, svuint16_t, ++ z0 = svqdech_pat_u16 (z0, SV_ALL, 16), ++ z0 = svqdech_pat (z0, SV_ALL, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_pat_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_pat_u32.c +new file mode 100644 +index 000000000..177f32ec7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_pat_u32.c +@@ -0,0 +1,202 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qdech_pat_n_1_u32_tied: ++** uqdech w0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_1_u32_tied, uint32_t, ++ x0 = svqdech_pat_n_u32 (x0, SV_POW2, 1), ++ x0 = svqdech_pat (x0, SV_POW2, 1)) ++ ++/* ++** qdech_pat_n_1_u32_untied: ++** mov w0, w1 ++** uqdech w0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_1_u32_untied, uint32_t, ++ x0 = svqdech_pat_n_u32 (x1, SV_POW2, 1), ++ x0 = svqdech_pat (x1, SV_POW2, 1)) ++ ++/* ++** qdech_pat_n_2_u32: ++** uqdech w0, pow2, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_2_u32, uint32_t, ++ x0 = svqdech_pat_n_u32 (x0, SV_POW2, 2), ++ x0 = svqdech_pat (x0, SV_POW2, 2)) ++ ++/* ++** qdech_pat_n_7_u32: ++** uqdech w0, pow2, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_7_u32, uint32_t, ++ x0 = svqdech_pat_n_u32 (x0, SV_POW2, 7), ++ x0 = svqdech_pat (x0, SV_POW2, 7)) ++ ++/* ++** qdech_pat_n_15_u32: ++** uqdech w0, pow2, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_15_u32, uint32_t, ++ x0 = svqdech_pat_n_u32 (x0, SV_POW2, 15), ++ x0 = svqdech_pat (x0, SV_POW2, 15)) ++ ++/* ++** qdech_pat_n_16_u32: ++** uqdech w0, pow2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_16_u32, uint32_t, ++ x0 = svqdech_pat_n_u32 (x0, SV_POW2, 16), ++ x0 = svqdech_pat (x0, SV_POW2, 16)) ++ ++/* ++** qdech_pat_n_vl1_u32: ++** uqdech w0, vl1, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl1_u32, uint32_t, ++ x0 = svqdech_pat_n_u32 (x0, SV_VL1, 16), ++ x0 = svqdech_pat (x0, SV_VL1, 16)) ++ ++/* ++** qdech_pat_n_vl2_u32: ++** uqdech w0, vl2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl2_u32, uint32_t, ++ x0 = svqdech_pat_n_u32 (x0, SV_VL2, 16), ++ x0 = svqdech_pat (x0, SV_VL2, 16)) ++ ++/* ++** qdech_pat_n_vl3_u32: ++** uqdech w0, vl3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl3_u32, uint32_t, ++ x0 = svqdech_pat_n_u32 (x0, SV_VL3, 16), ++ x0 = svqdech_pat (x0, 
SV_VL3, 16)) ++ ++/* ++** qdech_pat_n_vl4_u32: ++** uqdech w0, vl4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl4_u32, uint32_t, ++ x0 = svqdech_pat_n_u32 (x0, SV_VL4, 16), ++ x0 = svqdech_pat (x0, SV_VL4, 16)) ++ ++/* ++** qdech_pat_n_vl5_u32: ++** uqdech w0, vl5, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl5_u32, uint32_t, ++ x0 = svqdech_pat_n_u32 (x0, SV_VL5, 16), ++ x0 = svqdech_pat (x0, SV_VL5, 16)) ++ ++/* ++** qdech_pat_n_vl6_u32: ++** uqdech w0, vl6, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl6_u32, uint32_t, ++ x0 = svqdech_pat_n_u32 (x0, SV_VL6, 16), ++ x0 = svqdech_pat (x0, SV_VL6, 16)) ++ ++/* ++** qdech_pat_n_vl7_u32: ++** uqdech w0, vl7, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl7_u32, uint32_t, ++ x0 = svqdech_pat_n_u32 (x0, SV_VL7, 16), ++ x0 = svqdech_pat (x0, SV_VL7, 16)) ++ ++/* ++** qdech_pat_n_vl8_u32: ++** uqdech w0, vl8, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl8_u32, uint32_t, ++ x0 = svqdech_pat_n_u32 (x0, SV_VL8, 16), ++ x0 = svqdech_pat (x0, SV_VL8, 16)) ++ ++/* ++** qdech_pat_n_vl16_u32: ++** uqdech w0, vl16, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl16_u32, uint32_t, ++ x0 = svqdech_pat_n_u32 (x0, SV_VL16, 16), ++ x0 = svqdech_pat (x0, SV_VL16, 16)) ++ ++/* ++** qdech_pat_n_vl32_u32: ++** uqdech w0, vl32, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl32_u32, uint32_t, ++ x0 = svqdech_pat_n_u32 (x0, SV_VL32, 16), ++ x0 = svqdech_pat (x0, SV_VL32, 16)) ++ ++/* ++** qdech_pat_n_vl64_u32: ++** uqdech w0, vl64, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl64_u32, uint32_t, ++ x0 = svqdech_pat_n_u32 (x0, SV_VL64, 16), ++ x0 = svqdech_pat (x0, SV_VL64, 16)) ++ ++/* ++** qdech_pat_n_vl128_u32: ++** uqdech w0, vl128, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl128_u32, uint32_t, ++ x0 = svqdech_pat_n_u32 (x0, SV_VL128, 16), ++ x0 = svqdech_pat (x0, SV_VL128, 16)) ++ ++/* ++** qdech_pat_n_vl256_u32: ++** uqdech w0, vl256, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl256_u32, uint32_t, ++ x0 = svqdech_pat_n_u32 (x0, SV_VL256, 16), ++ x0 = svqdech_pat (x0, SV_VL256, 16)) ++ ++/* ++** qdech_pat_n_mul4_u32: ++** uqdech w0, mul4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_mul4_u32, uint32_t, ++ x0 = svqdech_pat_n_u32 (x0, SV_MUL4, 16), ++ x0 = svqdech_pat (x0, SV_MUL4, 16)) ++ ++/* ++** qdech_pat_n_mul3_u32: ++** uqdech w0, mul3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_mul3_u32, uint32_t, ++ x0 = svqdech_pat_n_u32 (x0, SV_MUL3, 16), ++ x0 = svqdech_pat (x0, SV_MUL3, 16)) ++ ++/* ++** qdech_pat_n_all_u32: ++** uqdech w0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_all_u32, uint32_t, ++ x0 = svqdech_pat_n_u32 (x0, SV_ALL, 16), ++ x0 = svqdech_pat (x0, SV_ALL, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_pat_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_pat_u64.c +new file mode 100644 +index 000000000..7092127f2 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_pat_u64.c +@@ -0,0 +1,202 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qdech_pat_n_1_u64_tied: ++** uqdech x0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_1_u64_tied, uint64_t, ++ x0 = svqdech_pat_n_u64 (x0, SV_POW2, 1), ++ x0 = svqdech_pat (x0, SV_POW2, 1)) ++ ++/* ++** qdech_pat_n_1_u64_untied: ++** mov x0, x1 ++** uqdech x0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_1_u64_untied, uint64_t, ++ x0 = svqdech_pat_n_u64 (x1, SV_POW2, 1), ++ 
x0 = svqdech_pat (x1, SV_POW2, 1)) ++ ++/* ++** qdech_pat_n_2_u64: ++** uqdech x0, pow2, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_2_u64, uint64_t, ++ x0 = svqdech_pat_n_u64 (x0, SV_POW2, 2), ++ x0 = svqdech_pat (x0, SV_POW2, 2)) ++ ++/* ++** qdech_pat_n_7_u64: ++** uqdech x0, pow2, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_7_u64, uint64_t, ++ x0 = svqdech_pat_n_u64 (x0, SV_POW2, 7), ++ x0 = svqdech_pat (x0, SV_POW2, 7)) ++ ++/* ++** qdech_pat_n_15_u64: ++** uqdech x0, pow2, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_15_u64, uint64_t, ++ x0 = svqdech_pat_n_u64 (x0, SV_POW2, 15), ++ x0 = svqdech_pat (x0, SV_POW2, 15)) ++ ++/* ++** qdech_pat_n_16_u64: ++** uqdech x0, pow2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_16_u64, uint64_t, ++ x0 = svqdech_pat_n_u64 (x0, SV_POW2, 16), ++ x0 = svqdech_pat (x0, SV_POW2, 16)) ++ ++/* ++** qdech_pat_n_vl1_u64: ++** uqdech x0, vl1, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl1_u64, uint64_t, ++ x0 = svqdech_pat_n_u64 (x0, SV_VL1, 16), ++ x0 = svqdech_pat (x0, SV_VL1, 16)) ++ ++/* ++** qdech_pat_n_vl2_u64: ++** uqdech x0, vl2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl2_u64, uint64_t, ++ x0 = svqdech_pat_n_u64 (x0, SV_VL2, 16), ++ x0 = svqdech_pat (x0, SV_VL2, 16)) ++ ++/* ++** qdech_pat_n_vl3_u64: ++** uqdech x0, vl3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl3_u64, uint64_t, ++ x0 = svqdech_pat_n_u64 (x0, SV_VL3, 16), ++ x0 = svqdech_pat (x0, SV_VL3, 16)) ++ ++/* ++** qdech_pat_n_vl4_u64: ++** uqdech x0, vl4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl4_u64, uint64_t, ++ x0 = svqdech_pat_n_u64 (x0, SV_VL4, 16), ++ x0 = svqdech_pat (x0, SV_VL4, 16)) ++ ++/* ++** qdech_pat_n_vl5_u64: ++** uqdech x0, vl5, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl5_u64, uint64_t, ++ x0 = svqdech_pat_n_u64 (x0, SV_VL5, 16), ++ x0 = svqdech_pat (x0, SV_VL5, 16)) ++ ++/* ++** qdech_pat_n_vl6_u64: ++** uqdech x0, vl6, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl6_u64, uint64_t, ++ x0 = svqdech_pat_n_u64 (x0, SV_VL6, 16), ++ x0 = svqdech_pat (x0, SV_VL6, 16)) ++ ++/* ++** qdech_pat_n_vl7_u64: ++** uqdech x0, vl7, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl7_u64, uint64_t, ++ x0 = svqdech_pat_n_u64 (x0, SV_VL7, 16), ++ x0 = svqdech_pat (x0, SV_VL7, 16)) ++ ++/* ++** qdech_pat_n_vl8_u64: ++** uqdech x0, vl8, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl8_u64, uint64_t, ++ x0 = svqdech_pat_n_u64 (x0, SV_VL8, 16), ++ x0 = svqdech_pat (x0, SV_VL8, 16)) ++ ++/* ++** qdech_pat_n_vl16_u64: ++** uqdech x0, vl16, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl16_u64, uint64_t, ++ x0 = svqdech_pat_n_u64 (x0, SV_VL16, 16), ++ x0 = svqdech_pat (x0, SV_VL16, 16)) ++ ++/* ++** qdech_pat_n_vl32_u64: ++** uqdech x0, vl32, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl32_u64, uint64_t, ++ x0 = svqdech_pat_n_u64 (x0, SV_VL32, 16), ++ x0 = svqdech_pat (x0, SV_VL32, 16)) ++ ++/* ++** qdech_pat_n_vl64_u64: ++** uqdech x0, vl64, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl64_u64, uint64_t, ++ x0 = svqdech_pat_n_u64 (x0, SV_VL64, 16), ++ x0 = svqdech_pat (x0, SV_VL64, 16)) ++ ++/* ++** qdech_pat_n_vl128_u64: ++** uqdech x0, vl128, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl128_u64, uint64_t, ++ x0 = svqdech_pat_n_u64 (x0, SV_VL128, 16), ++ x0 = svqdech_pat (x0, SV_VL128, 16)) ++ ++/* ++** qdech_pat_n_vl256_u64: ++** uqdech x0, vl256, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_vl256_u64, uint64_t, ++ x0 = svqdech_pat_n_u64 (x0, SV_VL256, 
16), ++ x0 = svqdech_pat (x0, SV_VL256, 16)) ++ ++/* ++** qdech_pat_n_mul4_u64: ++** uqdech x0, mul4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_mul4_u64, uint64_t, ++ x0 = svqdech_pat_n_u64 (x0, SV_MUL4, 16), ++ x0 = svqdech_pat (x0, SV_MUL4, 16)) ++ ++/* ++** qdech_pat_n_mul3_u64: ++** uqdech x0, mul3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_mul3_u64, uint64_t, ++ x0 = svqdech_pat_n_u64 (x0, SV_MUL3, 16), ++ x0 = svqdech_pat (x0, SV_MUL3, 16)) ++ ++/* ++** qdech_pat_n_all_u64: ++** uqdech x0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_pat_n_all_u64, uint64_t, ++ x0 = svqdech_pat_n_u64 (x0, SV_ALL, 16), ++ x0 = svqdech_pat (x0, SV_ALL, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_s16.c +new file mode 100644 +index 000000000..2a7a8f7a6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_s16.c +@@ -0,0 +1,58 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qdech_1_s16_tied: ++** sqdech z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_1_s16_tied, svint16_t, ++ z0 = svqdech_s16 (z0, 1), ++ z0 = svqdech (z0, 1)) ++ ++/* ++** qdech_1_s16_untied: ++** movprfx z0, z1 ++** sqdech z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_1_s16_untied, svint16_t, ++ z0 = svqdech_s16 (z1, 1), ++ z0 = svqdech (z1, 1)) ++ ++/* ++** qdech_2_s16: ++** sqdech z0\.h, all, mul #2 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_2_s16, svint16_t, ++ z0 = svqdech_s16 (z0, 2), ++ z0 = svqdech (z0, 2)) ++ ++/* ++** qdech_7_s16: ++** sqdech z0\.h, all, mul #7 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_7_s16, svint16_t, ++ z0 = svqdech_s16 (z0, 7), ++ z0 = svqdech (z0, 7)) ++ ++/* ++** qdech_15_s16: ++** sqdech z0\.h, all, mul #15 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_15_s16, svint16_t, ++ z0 = svqdech_s16 (z0, 15), ++ z0 = svqdech (z0, 15)) ++ ++/* ++** qdech_16_s16: ++** sqdech z0\.h, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_16_s16, svint16_t, ++ z0 = svqdech_s16 (z0, 16), ++ z0 = svqdech (z0, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_s32.c +new file mode 100644 +index 000000000..7fd57d85a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_s32.c +@@ -0,0 +1,58 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qdech_n_1_s32_tied: ++** sqdech x0, w0 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_n_1_s32_tied, int32_t, ++ x0 = svqdech_n_s32 (x0, 1), ++ x0 = svqdech (x0, 1)) ++ ++/* ++** qdech_n_1_s32_untied: ++** mov w0, w1 ++** sqdech x0, w0 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_n_1_s32_untied, int32_t, ++ x0 = svqdech_n_s32 (x1, 1), ++ x0 = svqdech (x1, 1)) ++ ++/* ++** qdech_n_2_s32: ++** sqdech x0, w0, all, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_n_2_s32, int32_t, ++ x0 = svqdech_n_s32 (x0, 2), ++ x0 = svqdech (x0, 2)) ++ ++/* ++** qdech_n_7_s32: ++** sqdech x0, w0, all, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_n_7_s32, int32_t, ++ x0 = svqdech_n_s32 (x0, 7), ++ x0 = svqdech (x0, 7)) ++ ++/* ++** qdech_n_15_s32: ++** sqdech x0, w0, all, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_n_15_s32, int32_t, ++ x0 = svqdech_n_s32 (x0, 15), ++ x0 = svqdech (x0, 15)) ++ ++/* ++** qdech_n_16_s32: ++** sqdech x0, w0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_n_16_s32, int32_t, ++ x0 = svqdech_n_s32 (x0, 16), ++ x0 = svqdech (x0, 16)) +diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_s64.c +new file mode 100644 +index 000000000..61989f8d6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_s64.c +@@ -0,0 +1,58 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qdech_n_1_s64_tied: ++** sqdech x0 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_n_1_s64_tied, int64_t, ++ x0 = svqdech_n_s64 (x0, 1), ++ x0 = svqdech (x0, 1)) ++ ++/* ++** qdech_n_1_s64_untied: ++** mov x0, x1 ++** sqdech x0 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_n_1_s64_untied, int64_t, ++ x0 = svqdech_n_s64 (x1, 1), ++ x0 = svqdech (x1, 1)) ++ ++/* ++** qdech_n_2_s64: ++** sqdech x0, all, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_n_2_s64, int64_t, ++ x0 = svqdech_n_s64 (x0, 2), ++ x0 = svqdech (x0, 2)) ++ ++/* ++** qdech_n_7_s64: ++** sqdech x0, all, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_n_7_s64, int64_t, ++ x0 = svqdech_n_s64 (x0, 7), ++ x0 = svqdech (x0, 7)) ++ ++/* ++** qdech_n_15_s64: ++** sqdech x0, all, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_n_15_s64, int64_t, ++ x0 = svqdech_n_s64 (x0, 15), ++ x0 = svqdech (x0, 15)) ++ ++/* ++** qdech_n_16_s64: ++** sqdech x0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_n_16_s64, int64_t, ++ x0 = svqdech_n_s64 (x0, 16), ++ x0 = svqdech (x0, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_u16.c +new file mode 100644 +index 000000000..0d6587851 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_u16.c +@@ -0,0 +1,58 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qdech_1_u16_tied: ++** uqdech z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_1_u16_tied, svuint16_t, ++ z0 = svqdech_u16 (z0, 1), ++ z0 = svqdech (z0, 1)) ++ ++/* ++** qdech_1_u16_untied: ++** movprfx z0, z1 ++** uqdech z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_1_u16_untied, svuint16_t, ++ z0 = svqdech_u16 (z1, 1), ++ z0 = svqdech (z1, 1)) ++ ++/* ++** qdech_2_u16: ++** uqdech z0\.h, all, mul #2 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_2_u16, svuint16_t, ++ z0 = svqdech_u16 (z0, 2), ++ z0 = svqdech (z0, 2)) ++ ++/* ++** qdech_7_u16: ++** uqdech z0\.h, all, mul #7 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_7_u16, svuint16_t, ++ z0 = svqdech_u16 (z0, 7), ++ z0 = svqdech (z0, 7)) ++ ++/* ++** qdech_15_u16: ++** uqdech z0\.h, all, mul #15 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_15_u16, svuint16_t, ++ z0 = svqdech_u16 (z0, 15), ++ z0 = svqdech (z0, 15)) ++ ++/* ++** qdech_16_u16: ++** uqdech z0\.h, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdech_16_u16, svuint16_t, ++ z0 = svqdech_u16 (z0, 16), ++ z0 = svqdech (z0, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_u32.c +new file mode 100644 +index 000000000..179d67953 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_u32.c +@@ -0,0 +1,58 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qdech_n_1_u32_tied: ++** uqdech w0 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_n_1_u32_tied, uint32_t, ++ x0 = svqdech_n_u32 (x0, 1), ++ x0 = svqdech (x0, 1)) ++ ++/* ++** qdech_n_1_u32_untied: ++** mov w0, w1 ++** uqdech w0 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_n_1_u32_untied, uint32_t, ++ x0 = svqdech_n_u32 (x1, 1), ++ x0 = svqdech (x1, 1)) ++ 
++/* ++** qdech_n_2_u32: ++** uqdech w0, all, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_n_2_u32, uint32_t, ++ x0 = svqdech_n_u32 (x0, 2), ++ x0 = svqdech (x0, 2)) ++ ++/* ++** qdech_n_7_u32: ++** uqdech w0, all, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_n_7_u32, uint32_t, ++ x0 = svqdech_n_u32 (x0, 7), ++ x0 = svqdech (x0, 7)) ++ ++/* ++** qdech_n_15_u32: ++** uqdech w0, all, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_n_15_u32, uint32_t, ++ x0 = svqdech_n_u32 (x0, 15), ++ x0 = svqdech (x0, 15)) ++ ++/* ++** qdech_n_16_u32: ++** uqdech w0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_n_16_u32, uint32_t, ++ x0 = svqdech_n_u32 (x0, 16), ++ x0 = svqdech (x0, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_u64.c +new file mode 100644 +index 000000000..da2f051af +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_u64.c +@@ -0,0 +1,58 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qdech_n_1_u64_tied: ++** uqdech x0 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_n_1_u64_tied, uint64_t, ++ x0 = svqdech_n_u64 (x0, 1), ++ x0 = svqdech (x0, 1)) ++ ++/* ++** qdech_n_1_u64_untied: ++** mov x0, x1 ++** uqdech x0 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_n_1_u64_untied, uint64_t, ++ x0 = svqdech_n_u64 (x1, 1), ++ x0 = svqdech (x1, 1)) ++ ++/* ++** qdech_n_2_u64: ++** uqdech x0, all, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_n_2_u64, uint64_t, ++ x0 = svqdech_n_u64 (x0, 2), ++ x0 = svqdech (x0, 2)) ++ ++/* ++** qdech_n_7_u64: ++** uqdech x0, all, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_n_7_u64, uint64_t, ++ x0 = svqdech_n_u64 (x0, 7), ++ x0 = svqdech (x0, 7)) ++ ++/* ++** qdech_n_15_u64: ++** uqdech x0, all, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_n_15_u64, uint64_t, ++ x0 = svqdech_n_u64 (x0, 15), ++ x0 = svqdech (x0, 15)) ++ ++/* ++** qdech_n_16_u64: ++** uqdech x0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdech_n_16_u64, uint64_t, ++ x0 = svqdech_n_u64 (x0, 16), ++ x0 = svqdech (x0, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecp_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecp_s16.c +new file mode 100644 +index 000000000..71b40c152 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecp_s16.c +@@ -0,0 +1,22 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qdecp_s16_tied: ++** sqdecp z0\.h, p0 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecp_s16_tied, svint16_t, ++ z0 = svqdecp_s16 (z0, p0), ++ z0 = svqdecp (z0, p0)) ++ ++/* ++** qdecp_s16_untied: ++** movprfx z0, z1 ++** sqdecp z0\.h, p0 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecp_s16_untied, svint16_t, ++ z0 = svqdecp_s16 (z1, p0), ++ z0 = svqdecp (z1, p0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecp_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecp_s32.c +new file mode 100644 +index 000000000..55e4067d1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecp_s32.c +@@ -0,0 +1,98 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qdecp_s32_tied: ++** sqdecp z0\.s, p0 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecp_s32_tied, svint32_t, ++ z0 = svqdecp_s32 (z0, p0), ++ z0 = svqdecp (z0, p0)) ++ ++/* ++** qdecp_s32_untied: ++** movprfx z0, z1 ++** sqdecp z0\.s, p0 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecp_s32_untied, svint32_t, ++ z0 = svqdecp_s32 (z1, p0), ++ 
z0 = svqdecp (z1, p0)) ++ ++/* ++** qdecp_n_s32_b8_tied: ++** sqdecp x0, p0\.b, w0 ++** ret ++*/ ++TEST_UNIFORM_S (qdecp_n_s32_b8_tied, int32_t, ++ x0 = svqdecp_n_s32_b8 (x0, p0), ++ x0 = svqdecp_b8 (x0, p0)) ++ ++/* ++** qdecp_n_s32_b8_untied: ++** mov w0, w1 ++** sqdecp x0, p0\.b, w0 ++** ret ++*/ ++TEST_UNIFORM_S (qdecp_n_s32_b8_untied, int32_t, ++ x0 = svqdecp_n_s32_b8 (x1, p0), ++ x0 = svqdecp_b8 (x1, p0)) ++ ++/* ++** qdecp_n_s32_b16_tied: ++** sqdecp x0, p0\.h, w0 ++** ret ++*/ ++TEST_UNIFORM_S (qdecp_n_s32_b16_tied, int32_t, ++ x0 = svqdecp_n_s32_b16 (x0, p0), ++ x0 = svqdecp_b16 (x0, p0)) ++ ++/* ++** qdecp_n_s32_b16_untied: ++** mov w0, w1 ++** sqdecp x0, p0\.h, w0 ++** ret ++*/ ++TEST_UNIFORM_S (qdecp_n_s32_b16_untied, int32_t, ++ x0 = svqdecp_n_s32_b16 (x1, p0), ++ x0 = svqdecp_b16 (x1, p0)) ++ ++/* ++** qdecp_n_s32_b32_tied: ++** sqdecp x0, p0\.s, w0 ++** ret ++*/ ++TEST_UNIFORM_S (qdecp_n_s32_b32_tied, int32_t, ++ x0 = svqdecp_n_s32_b32 (x0, p0), ++ x0 = svqdecp_b32 (x0, p0)) ++ ++/* ++** qdecp_n_s32_b32_untied: ++** mov w0, w1 ++** sqdecp x0, p0\.s, w0 ++** ret ++*/ ++TEST_UNIFORM_S (qdecp_n_s32_b32_untied, int32_t, ++ x0 = svqdecp_n_s32_b32 (x1, p0), ++ x0 = svqdecp_b32 (x1, p0)) ++ ++/* ++** qdecp_n_s32_b64_tied: ++** sqdecp x0, p0\.d, w0 ++** ret ++*/ ++TEST_UNIFORM_S (qdecp_n_s32_b64_tied, int32_t, ++ x0 = svqdecp_n_s32_b64 (x0, p0), ++ x0 = svqdecp_b64 (x0, p0)) ++ ++/* ++** qdecp_n_s32_b64_untied: ++** mov w0, w1 ++** sqdecp x0, p0\.d, w0 ++** ret ++*/ ++TEST_UNIFORM_S (qdecp_n_s32_b64_untied, int32_t, ++ x0 = svqdecp_n_s32_b64 (x1, p0), ++ x0 = svqdecp_b64 (x1, p0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecp_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecp_s64.c +new file mode 100644 +index 000000000..9527999c8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecp_s64.c +@@ -0,0 +1,98 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qdecp_s64_tied: ++** sqdecp z0\.d, p0 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecp_s64_tied, svint64_t, ++ z0 = svqdecp_s64 (z0, p0), ++ z0 = svqdecp (z0, p0)) ++ ++/* ++** qdecp_s64_untied: ++** movprfx z0, z1 ++** sqdecp z0\.d, p0 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecp_s64_untied, svint64_t, ++ z0 = svqdecp_s64 (z1, p0), ++ z0 = svqdecp (z1, p0)) ++ ++/* ++** qdecp_n_s64_b8_tied: ++** sqdecp x0, p0\.b ++** ret ++*/ ++TEST_UNIFORM_S (qdecp_n_s64_b8_tied, int64_t, ++ x0 = svqdecp_n_s64_b8 (x0, p0), ++ x0 = svqdecp_b8 (x0, p0)) ++ ++/* ++** qdecp_n_s64_b8_untied: ++** mov x0, x1 ++** sqdecp x0, p0\.b ++** ret ++*/ ++TEST_UNIFORM_S (qdecp_n_s64_b8_untied, int64_t, ++ x0 = svqdecp_n_s64_b8 (x1, p0), ++ x0 = svqdecp_b8 (x1, p0)) ++ ++/* ++** qdecp_n_s64_b16_tied: ++** sqdecp x0, p0\.h ++** ret ++*/ ++TEST_UNIFORM_S (qdecp_n_s64_b16_tied, int64_t, ++ x0 = svqdecp_n_s64_b16 (x0, p0), ++ x0 = svqdecp_b16 (x0, p0)) ++ ++/* ++** qdecp_n_s64_b16_untied: ++** mov x0, x1 ++** sqdecp x0, p0\.h ++** ret ++*/ ++TEST_UNIFORM_S (qdecp_n_s64_b16_untied, int64_t, ++ x0 = svqdecp_n_s64_b16 (x1, p0), ++ x0 = svqdecp_b16 (x1, p0)) ++ ++/* ++** qdecp_n_s64_b32_tied: ++** sqdecp x0, p0\.s ++** ret ++*/ ++TEST_UNIFORM_S (qdecp_n_s64_b32_tied, int64_t, ++ x0 = svqdecp_n_s64_b32 (x0, p0), ++ x0 = svqdecp_b32 (x0, p0)) ++ ++/* ++** qdecp_n_s64_b32_untied: ++** mov x0, x1 ++** sqdecp x0, p0\.s ++** ret ++*/ ++TEST_UNIFORM_S (qdecp_n_s64_b32_untied, int64_t, ++ x0 = svqdecp_n_s64_b32 (x1, p0), ++ x0 = svqdecp_b32 (x1, p0)) ++ ++/* ++** qdecp_n_s64_b64_tied: 
++** sqdecp x0, p0\.d ++** ret ++*/ ++TEST_UNIFORM_S (qdecp_n_s64_b64_tied, int64_t, ++ x0 = svqdecp_n_s64_b64 (x0, p0), ++ x0 = svqdecp_b64 (x0, p0)) ++ ++/* ++** qdecp_n_s64_b64_untied: ++** mov x0, x1 ++** sqdecp x0, p0\.d ++** ret ++*/ ++TEST_UNIFORM_S (qdecp_n_s64_b64_untied, int64_t, ++ x0 = svqdecp_n_s64_b64 (x1, p0), ++ x0 = svqdecp_b64 (x1, p0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecp_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecp_u16.c +new file mode 100644 +index 000000000..33357ada4 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecp_u16.c +@@ -0,0 +1,22 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qdecp_u16_tied: ++** uqdecp z0\.h, p0 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecp_u16_tied, svuint16_t, ++ z0 = svqdecp_u16 (z0, p0), ++ z0 = svqdecp (z0, p0)) ++ ++/* ++** qdecp_u16_untied: ++** movprfx z0, z1 ++** uqdecp z0\.h, p0 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecp_u16_untied, svuint16_t, ++ z0 = svqdecp_u16 (z1, p0), ++ z0 = svqdecp (z1, p0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecp_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecp_u32.c +new file mode 100644 +index 000000000..58e9a642e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecp_u32.c +@@ -0,0 +1,98 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qdecp_u32_tied: ++** uqdecp z0\.s, p0 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecp_u32_tied, svuint32_t, ++ z0 = svqdecp_u32 (z0, p0), ++ z0 = svqdecp (z0, p0)) ++ ++/* ++** qdecp_u32_untied: ++** movprfx z0, z1 ++** uqdecp z0\.s, p0 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecp_u32_untied, svuint32_t, ++ z0 = svqdecp_u32 (z1, p0), ++ z0 = svqdecp (z1, p0)) ++ ++/* ++** qdecp_n_u32_b8_tied: ++** uqdecp w0, p0\.b ++** ret ++*/ ++TEST_UNIFORM_S (qdecp_n_u32_b8_tied, uint32_t, ++ x0 = svqdecp_n_u32_b8 (x0, p0), ++ x0 = svqdecp_b8 (x0, p0)) ++ ++/* ++** qdecp_n_u32_b8_untied: ++** mov w0, w1 ++** uqdecp w0, p0\.b ++** ret ++*/ ++TEST_UNIFORM_S (qdecp_n_u32_b8_untied, uint32_t, ++ x0 = svqdecp_n_u32_b8 (x1, p0), ++ x0 = svqdecp_b8 (x1, p0)) ++ ++/* ++** qdecp_n_u32_b16_tied: ++** uqdecp w0, p0\.h ++** ret ++*/ ++TEST_UNIFORM_S (qdecp_n_u32_b16_tied, uint32_t, ++ x0 = svqdecp_n_u32_b16 (x0, p0), ++ x0 = svqdecp_b16 (x0, p0)) ++ ++/* ++** qdecp_n_u32_b16_untied: ++** mov w0, w1 ++** uqdecp w0, p0\.h ++** ret ++*/ ++TEST_UNIFORM_S (qdecp_n_u32_b16_untied, uint32_t, ++ x0 = svqdecp_n_u32_b16 (x1, p0), ++ x0 = svqdecp_b16 (x1, p0)) ++ ++/* ++** qdecp_n_u32_b32_tied: ++** uqdecp w0, p0\.s ++** ret ++*/ ++TEST_UNIFORM_S (qdecp_n_u32_b32_tied, uint32_t, ++ x0 = svqdecp_n_u32_b32 (x0, p0), ++ x0 = svqdecp_b32 (x0, p0)) ++ ++/* ++** qdecp_n_u32_b32_untied: ++** mov w0, w1 ++** uqdecp w0, p0\.s ++** ret ++*/ ++TEST_UNIFORM_S (qdecp_n_u32_b32_untied, uint32_t, ++ x0 = svqdecp_n_u32_b32 (x1, p0), ++ x0 = svqdecp_b32 (x1, p0)) ++ ++/* ++** qdecp_n_u32_b64_tied: ++** uqdecp w0, p0\.d ++** ret ++*/ ++TEST_UNIFORM_S (qdecp_n_u32_b64_tied, uint32_t, ++ x0 = svqdecp_n_u32_b64 (x0, p0), ++ x0 = svqdecp_b64 (x0, p0)) ++ ++/* ++** qdecp_n_u32_b64_untied: ++** mov w0, w1 ++** uqdecp w0, p0\.d ++** ret ++*/ ++TEST_UNIFORM_S (qdecp_n_u32_b64_untied, uint32_t, ++ x0 = svqdecp_n_u32_b64 (x1, p0), ++ x0 = svqdecp_b64 (x1, p0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecp_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecp_u64.c +new file mode 
100644 +index 000000000..e2091d8ae +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecp_u64.c +@@ -0,0 +1,98 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qdecp_u64_tied: ++** uqdecp z0\.d, p0 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecp_u64_tied, svuint64_t, ++ z0 = svqdecp_u64 (z0, p0), ++ z0 = svqdecp (z0, p0)) ++ ++/* ++** qdecp_u64_untied: ++** movprfx z0, z1 ++** uqdecp z0\.d, p0 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecp_u64_untied, svuint64_t, ++ z0 = svqdecp_u64 (z1, p0), ++ z0 = svqdecp (z1, p0)) ++ ++/* ++** qdecp_n_u64_b8_tied: ++** uqdecp x0, p0\.b ++** ret ++*/ ++TEST_UNIFORM_S (qdecp_n_u64_b8_tied, uint64_t, ++ x0 = svqdecp_n_u64_b8 (x0, p0), ++ x0 = svqdecp_b8 (x0, p0)) ++ ++/* ++** qdecp_n_u64_b8_untied: ++** mov x0, x1 ++** uqdecp x0, p0\.b ++** ret ++*/ ++TEST_UNIFORM_S (qdecp_n_u64_b8_untied, uint64_t, ++ x0 = svqdecp_n_u64_b8 (x1, p0), ++ x0 = svqdecp_b8 (x1, p0)) ++ ++/* ++** qdecp_n_u64_b16_tied: ++** uqdecp x0, p0\.h ++** ret ++*/ ++TEST_UNIFORM_S (qdecp_n_u64_b16_tied, uint64_t, ++ x0 = svqdecp_n_u64_b16 (x0, p0), ++ x0 = svqdecp_b16 (x0, p0)) ++ ++/* ++** qdecp_n_u64_b16_untied: ++** mov x0, x1 ++** uqdecp x0, p0\.h ++** ret ++*/ ++TEST_UNIFORM_S (qdecp_n_u64_b16_untied, uint64_t, ++ x0 = svqdecp_n_u64_b16 (x1, p0), ++ x0 = svqdecp_b16 (x1, p0)) ++ ++/* ++** qdecp_n_u64_b32_tied: ++** uqdecp x0, p0\.s ++** ret ++*/ ++TEST_UNIFORM_S (qdecp_n_u64_b32_tied, uint64_t, ++ x0 = svqdecp_n_u64_b32 (x0, p0), ++ x0 = svqdecp_b32 (x0, p0)) ++ ++/* ++** qdecp_n_u64_b32_untied: ++** mov x0, x1 ++** uqdecp x0, p0\.s ++** ret ++*/ ++TEST_UNIFORM_S (qdecp_n_u64_b32_untied, uint64_t, ++ x0 = svqdecp_n_u64_b32 (x1, p0), ++ x0 = svqdecp_b32 (x1, p0)) ++ ++/* ++** qdecp_n_u64_b64_tied: ++** uqdecp x0, p0\.d ++** ret ++*/ ++TEST_UNIFORM_S (qdecp_n_u64_b64_tied, uint64_t, ++ x0 = svqdecp_n_u64_b64 (x0, p0), ++ x0 = svqdecp_b64 (x0, p0)) ++ ++/* ++** qdecp_n_u64_b64_untied: ++** mov x0, x1 ++** uqdecp x0, p0\.d ++** ret ++*/ ++TEST_UNIFORM_S (qdecp_n_u64_b64_untied, uint64_t, ++ x0 = svqdecp_n_u64_b64 (x1, p0), ++ x0 = svqdecp_b64 (x1, p0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecw_pat_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecw_pat_s32.c +new file mode 100644 +index 000000000..d80f7be4d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecw_pat_s32.c +@@ -0,0 +1,401 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qdecw_pat_1_s32_tied: ++** sqdecw z0\.s, pow2 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_pat_1_s32_tied, svint32_t, ++ z0 = svqdecw_pat_s32 (z0, SV_POW2, 1), ++ z0 = svqdecw_pat (z0, SV_POW2, 1)) ++ ++/* ++** qdecw_pat_1_s32_untied: ++** movprfx z0, z1 ++** sqdecw z0\.s, pow2 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_pat_1_s32_untied, svint32_t, ++ z0 = svqdecw_pat_s32 (z1, SV_POW2, 1), ++ z0 = svqdecw_pat (z1, SV_POW2, 1)) ++ ++/* ++** qdecw_pat_2_s32: ++** sqdecw z0\.s, pow2, mul #2 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_pat_2_s32, svint32_t, ++ z0 = svqdecw_pat_s32 (z0, SV_POW2, 2), ++ z0 = svqdecw_pat (z0, SV_POW2, 2)) ++ ++/* ++** qdecw_pat_7_s32: ++** sqdecw z0\.s, pow2, mul #7 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_pat_7_s32, svint32_t, ++ z0 = svqdecw_pat_s32 (z0, SV_POW2, 7), ++ z0 = svqdecw_pat (z0, SV_POW2, 7)) ++ ++/* ++** qdecw_pat_15_s32: ++** sqdecw z0\.s, pow2, mul #15 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_pat_15_s32, svint32_t, ++ z0 = svqdecw_pat_s32 (z0, SV_POW2, 
15), ++ z0 = svqdecw_pat (z0, SV_POW2, 15)) ++ ++/* ++** qdecw_pat_16_s32: ++** sqdecw z0\.s, pow2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_pat_16_s32, svint32_t, ++ z0 = svqdecw_pat_s32 (z0, SV_POW2, 16), ++ z0 = svqdecw_pat (z0, SV_POW2, 16)) ++ ++/* ++** qdecw_pat_vl1_s32: ++** sqdecw z0\.s, vl1, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_pat_vl1_s32, svint32_t, ++ z0 = svqdecw_pat_s32 (z0, SV_VL1, 16), ++ z0 = svqdecw_pat (z0, SV_VL1, 16)) ++ ++/* ++** qdecw_pat_vl2_s32: ++** sqdecw z0\.s, vl2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_pat_vl2_s32, svint32_t, ++ z0 = svqdecw_pat_s32 (z0, SV_VL2, 16), ++ z0 = svqdecw_pat (z0, SV_VL2, 16)) ++ ++/* ++** qdecw_pat_vl3_s32: ++** sqdecw z0\.s, vl3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_pat_vl3_s32, svint32_t, ++ z0 = svqdecw_pat_s32 (z0, SV_VL3, 16), ++ z0 = svqdecw_pat (z0, SV_VL3, 16)) ++ ++/* ++** qdecw_pat_vl4_s32: ++** sqdecw z0\.s, vl4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_pat_vl4_s32, svint32_t, ++ z0 = svqdecw_pat_s32 (z0, SV_VL4, 16), ++ z0 = svqdecw_pat (z0, SV_VL4, 16)) ++ ++/* ++** qdecw_pat_vl5_s32: ++** sqdecw z0\.s, vl5, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_pat_vl5_s32, svint32_t, ++ z0 = svqdecw_pat_s32 (z0, SV_VL5, 16), ++ z0 = svqdecw_pat (z0, SV_VL5, 16)) ++ ++/* ++** qdecw_pat_vl6_s32: ++** sqdecw z0\.s, vl6, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_pat_vl6_s32, svint32_t, ++ z0 = svqdecw_pat_s32 (z0, SV_VL6, 16), ++ z0 = svqdecw_pat (z0, SV_VL6, 16)) ++ ++/* ++** qdecw_pat_vl7_s32: ++** sqdecw z0\.s, vl7, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_pat_vl7_s32, svint32_t, ++ z0 = svqdecw_pat_s32 (z0, SV_VL7, 16), ++ z0 = svqdecw_pat (z0, SV_VL7, 16)) ++ ++/* ++** qdecw_pat_vl8_s32: ++** sqdecw z0\.s, vl8, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_pat_vl8_s32, svint32_t, ++ z0 = svqdecw_pat_s32 (z0, SV_VL8, 16), ++ z0 = svqdecw_pat (z0, SV_VL8, 16)) ++ ++/* ++** qdecw_pat_vl16_s32: ++** sqdecw z0\.s, vl16, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_pat_vl16_s32, svint32_t, ++ z0 = svqdecw_pat_s32 (z0, SV_VL16, 16), ++ z0 = svqdecw_pat (z0, SV_VL16, 16)) ++ ++/* ++** qdecw_pat_vl32_s32: ++** sqdecw z0\.s, vl32, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_pat_vl32_s32, svint32_t, ++ z0 = svqdecw_pat_s32 (z0, SV_VL32, 16), ++ z0 = svqdecw_pat (z0, SV_VL32, 16)) ++ ++/* ++** qdecw_pat_vl64_s32: ++** sqdecw z0\.s, vl64, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_pat_vl64_s32, svint32_t, ++ z0 = svqdecw_pat_s32 (z0, SV_VL64, 16), ++ z0 = svqdecw_pat (z0, SV_VL64, 16)) ++ ++/* ++** qdecw_pat_vl128_s32: ++** sqdecw z0\.s, vl128, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_pat_vl128_s32, svint32_t, ++ z0 = svqdecw_pat_s32 (z0, SV_VL128, 16), ++ z0 = svqdecw_pat (z0, SV_VL128, 16)) ++ ++/* ++** qdecw_pat_vl256_s32: ++** sqdecw z0\.s, vl256, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_pat_vl256_s32, svint32_t, ++ z0 = svqdecw_pat_s32 (z0, SV_VL256, 16), ++ z0 = svqdecw_pat (z0, SV_VL256, 16)) ++ ++/* ++** qdecw_pat_mul4_s32: ++** sqdecw z0\.s, mul4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_pat_mul4_s32, svint32_t, ++ z0 = svqdecw_pat_s32 (z0, SV_MUL4, 16), ++ z0 = svqdecw_pat (z0, SV_MUL4, 16)) ++ ++/* ++** qdecw_pat_mul3_s32: ++** sqdecw z0\.s, mul3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_pat_mul3_s32, svint32_t, ++ z0 = svqdecw_pat_s32 (z0, SV_MUL3, 16), ++ z0 = svqdecw_pat (z0, SV_MUL3, 16)) ++ ++/* ++** qdecw_pat_all_s32: ++** sqdecw z0\.s, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_pat_all_s32, svint32_t, ++ z0 = svqdecw_pat_s32 (z0, SV_ALL, 16), 
++ z0 = svqdecw_pat (z0, SV_ALL, 16)) ++ ++/* ++** qdecw_pat_n_1_s32_tied: ++** sqdecw x0, w0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_1_s32_tied, int32_t, ++ x0 = svqdecw_pat_n_s32 (x0, SV_POW2, 1), ++ x0 = svqdecw_pat (x0, SV_POW2, 1)) ++ ++/* ++** qdecw_pat_n_1_s32_untied: ++** mov w0, w1 ++** sqdecw x0, w0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_1_s32_untied, int32_t, ++ x0 = svqdecw_pat_n_s32 (x1, SV_POW2, 1), ++ x0 = svqdecw_pat (x1, SV_POW2, 1)) ++ ++/* ++** qdecw_pat_n_2_s32: ++** sqdecw x0, w0, pow2, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_2_s32, int32_t, ++ x0 = svqdecw_pat_n_s32 (x0, SV_POW2, 2), ++ x0 = svqdecw_pat (x0, SV_POW2, 2)) ++ ++/* ++** qdecw_pat_n_7_s32: ++** sqdecw x0, w0, pow2, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_7_s32, int32_t, ++ x0 = svqdecw_pat_n_s32 (x0, SV_POW2, 7), ++ x0 = svqdecw_pat (x0, SV_POW2, 7)) ++ ++/* ++** qdecw_pat_n_15_s32: ++** sqdecw x0, w0, pow2, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_15_s32, int32_t, ++ x0 = svqdecw_pat_n_s32 (x0, SV_POW2, 15), ++ x0 = svqdecw_pat (x0, SV_POW2, 15)) ++ ++/* ++** qdecw_pat_n_16_s32: ++** sqdecw x0, w0, pow2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_16_s32, int32_t, ++ x0 = svqdecw_pat_n_s32 (x0, SV_POW2, 16), ++ x0 = svqdecw_pat (x0, SV_POW2, 16)) ++ ++/* ++** qdecw_pat_n_vl1_s32: ++** sqdecw x0, w0, vl1, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl1_s32, int32_t, ++ x0 = svqdecw_pat_n_s32 (x0, SV_VL1, 16), ++ x0 = svqdecw_pat (x0, SV_VL1, 16)) ++ ++/* ++** qdecw_pat_n_vl2_s32: ++** sqdecw x0, w0, vl2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl2_s32, int32_t, ++ x0 = svqdecw_pat_n_s32 (x0, SV_VL2, 16), ++ x0 = svqdecw_pat (x0, SV_VL2, 16)) ++ ++/* ++** qdecw_pat_n_vl3_s32: ++** sqdecw x0, w0, vl3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl3_s32, int32_t, ++ x0 = svqdecw_pat_n_s32 (x0, SV_VL3, 16), ++ x0 = svqdecw_pat (x0, SV_VL3, 16)) ++ ++/* ++** qdecw_pat_n_vl4_s32: ++** sqdecw x0, w0, vl4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl4_s32, int32_t, ++ x0 = svqdecw_pat_n_s32 (x0, SV_VL4, 16), ++ x0 = svqdecw_pat (x0, SV_VL4, 16)) ++ ++/* ++** qdecw_pat_n_vl5_s32: ++** sqdecw x0, w0, vl5, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl5_s32, int32_t, ++ x0 = svqdecw_pat_n_s32 (x0, SV_VL5, 16), ++ x0 = svqdecw_pat (x0, SV_VL5, 16)) ++ ++/* ++** qdecw_pat_n_vl6_s32: ++** sqdecw x0, w0, vl6, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl6_s32, int32_t, ++ x0 = svqdecw_pat_n_s32 (x0, SV_VL6, 16), ++ x0 = svqdecw_pat (x0, SV_VL6, 16)) ++ ++/* ++** qdecw_pat_n_vl7_s32: ++** sqdecw x0, w0, vl7, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl7_s32, int32_t, ++ x0 = svqdecw_pat_n_s32 (x0, SV_VL7, 16), ++ x0 = svqdecw_pat (x0, SV_VL7, 16)) ++ ++/* ++** qdecw_pat_n_vl8_s32: ++** sqdecw x0, w0, vl8, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl8_s32, int32_t, ++ x0 = svqdecw_pat_n_s32 (x0, SV_VL8, 16), ++ x0 = svqdecw_pat (x0, SV_VL8, 16)) ++ ++/* ++** qdecw_pat_n_vl16_s32: ++** sqdecw x0, w0, vl16, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl16_s32, int32_t, ++ x0 = svqdecw_pat_n_s32 (x0, SV_VL16, 16), ++ x0 = svqdecw_pat (x0, SV_VL16, 16)) ++ ++/* ++** qdecw_pat_n_vl32_s32: ++** sqdecw x0, w0, vl32, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl32_s32, int32_t, ++ x0 = svqdecw_pat_n_s32 (x0, SV_VL32, 16), ++ x0 = svqdecw_pat (x0, SV_VL32, 16)) ++ ++/* ++** qdecw_pat_n_vl64_s32: ++** sqdecw x0, w0, vl64, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S 
(qdecw_pat_n_vl64_s32, int32_t, ++ x0 = svqdecw_pat_n_s32 (x0, SV_VL64, 16), ++ x0 = svqdecw_pat (x0, SV_VL64, 16)) ++ ++/* ++** qdecw_pat_n_vl128_s32: ++** sqdecw x0, w0, vl128, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl128_s32, int32_t, ++ x0 = svqdecw_pat_n_s32 (x0, SV_VL128, 16), ++ x0 = svqdecw_pat (x0, SV_VL128, 16)) ++ ++/* ++** qdecw_pat_n_vl256_s32: ++** sqdecw x0, w0, vl256, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl256_s32, int32_t, ++ x0 = svqdecw_pat_n_s32 (x0, SV_VL256, 16), ++ x0 = svqdecw_pat (x0, SV_VL256, 16)) ++ ++/* ++** qdecw_pat_n_mul4_s32: ++** sqdecw x0, w0, mul4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_mul4_s32, int32_t, ++ x0 = svqdecw_pat_n_s32 (x0, SV_MUL4, 16), ++ x0 = svqdecw_pat (x0, SV_MUL4, 16)) ++ ++/* ++** qdecw_pat_n_mul3_s32: ++** sqdecw x0, w0, mul3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_mul3_s32, int32_t, ++ x0 = svqdecw_pat_n_s32 (x0, SV_MUL3, 16), ++ x0 = svqdecw_pat (x0, SV_MUL3, 16)) ++ ++/* ++** qdecw_pat_n_all_s32: ++** sqdecw x0, w0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_all_s32, int32_t, ++ x0 = svqdecw_pat_n_s32 (x0, SV_ALL, 16), ++ x0 = svqdecw_pat (x0, SV_ALL, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecw_pat_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecw_pat_s64.c +new file mode 100644 +index 000000000..9c684a7c7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecw_pat_s64.c +@@ -0,0 +1,202 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qdecw_pat_n_1_s64_tied: ++** sqdecw x0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_1_s64_tied, int64_t, ++ x0 = svqdecw_pat_n_s64 (x0, SV_POW2, 1), ++ x0 = svqdecw_pat (x0, SV_POW2, 1)) ++ ++/* ++** qdecw_pat_n_1_s64_untied: ++** mov x0, x1 ++** sqdecw x0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_1_s64_untied, int64_t, ++ x0 = svqdecw_pat_n_s64 (x1, SV_POW2, 1), ++ x0 = svqdecw_pat (x1, SV_POW2, 1)) ++ ++/* ++** qdecw_pat_n_2_s64: ++** sqdecw x0, pow2, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_2_s64, int64_t, ++ x0 = svqdecw_pat_n_s64 (x0, SV_POW2, 2), ++ x0 = svqdecw_pat (x0, SV_POW2, 2)) ++ ++/* ++** qdecw_pat_n_7_s64: ++** sqdecw x0, pow2, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_7_s64, int64_t, ++ x0 = svqdecw_pat_n_s64 (x0, SV_POW2, 7), ++ x0 = svqdecw_pat (x0, SV_POW2, 7)) ++ ++/* ++** qdecw_pat_n_15_s64: ++** sqdecw x0, pow2, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_15_s64, int64_t, ++ x0 = svqdecw_pat_n_s64 (x0, SV_POW2, 15), ++ x0 = svqdecw_pat (x0, SV_POW2, 15)) ++ ++/* ++** qdecw_pat_n_16_s64: ++** sqdecw x0, pow2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_16_s64, int64_t, ++ x0 = svqdecw_pat_n_s64 (x0, SV_POW2, 16), ++ x0 = svqdecw_pat (x0, SV_POW2, 16)) ++ ++/* ++** qdecw_pat_n_vl1_s64: ++** sqdecw x0, vl1, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl1_s64, int64_t, ++ x0 = svqdecw_pat_n_s64 (x0, SV_VL1, 16), ++ x0 = svqdecw_pat (x0, SV_VL1, 16)) ++ ++/* ++** qdecw_pat_n_vl2_s64: ++** sqdecw x0, vl2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl2_s64, int64_t, ++ x0 = svqdecw_pat_n_s64 (x0, SV_VL2, 16), ++ x0 = svqdecw_pat (x0, SV_VL2, 16)) ++ ++/* ++** qdecw_pat_n_vl3_s64: ++** sqdecw x0, vl3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl3_s64, int64_t, ++ x0 = svqdecw_pat_n_s64 (x0, SV_VL3, 16), ++ x0 = svqdecw_pat (x0, SV_VL3, 16)) ++ ++/* ++** qdecw_pat_n_vl4_s64: ++** sqdecw x0, vl4, mul #16 ++** ret ++*/ 
++TEST_UNIFORM_S (qdecw_pat_n_vl4_s64, int64_t, ++ x0 = svqdecw_pat_n_s64 (x0, SV_VL4, 16), ++ x0 = svqdecw_pat (x0, SV_VL4, 16)) ++ ++/* ++** qdecw_pat_n_vl5_s64: ++** sqdecw x0, vl5, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl5_s64, int64_t, ++ x0 = svqdecw_pat_n_s64 (x0, SV_VL5, 16), ++ x0 = svqdecw_pat (x0, SV_VL5, 16)) ++ ++/* ++** qdecw_pat_n_vl6_s64: ++** sqdecw x0, vl6, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl6_s64, int64_t, ++ x0 = svqdecw_pat_n_s64 (x0, SV_VL6, 16), ++ x0 = svqdecw_pat (x0, SV_VL6, 16)) ++ ++/* ++** qdecw_pat_n_vl7_s64: ++** sqdecw x0, vl7, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl7_s64, int64_t, ++ x0 = svqdecw_pat_n_s64 (x0, SV_VL7, 16), ++ x0 = svqdecw_pat (x0, SV_VL7, 16)) ++ ++/* ++** qdecw_pat_n_vl8_s64: ++** sqdecw x0, vl8, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl8_s64, int64_t, ++ x0 = svqdecw_pat_n_s64 (x0, SV_VL8, 16), ++ x0 = svqdecw_pat (x0, SV_VL8, 16)) ++ ++/* ++** qdecw_pat_n_vl16_s64: ++** sqdecw x0, vl16, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl16_s64, int64_t, ++ x0 = svqdecw_pat_n_s64 (x0, SV_VL16, 16), ++ x0 = svqdecw_pat (x0, SV_VL16, 16)) ++ ++/* ++** qdecw_pat_n_vl32_s64: ++** sqdecw x0, vl32, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl32_s64, int64_t, ++ x0 = svqdecw_pat_n_s64 (x0, SV_VL32, 16), ++ x0 = svqdecw_pat (x0, SV_VL32, 16)) ++ ++/* ++** qdecw_pat_n_vl64_s64: ++** sqdecw x0, vl64, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl64_s64, int64_t, ++ x0 = svqdecw_pat_n_s64 (x0, SV_VL64, 16), ++ x0 = svqdecw_pat (x0, SV_VL64, 16)) ++ ++/* ++** qdecw_pat_n_vl128_s64: ++** sqdecw x0, vl128, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl128_s64, int64_t, ++ x0 = svqdecw_pat_n_s64 (x0, SV_VL128, 16), ++ x0 = svqdecw_pat (x0, SV_VL128, 16)) ++ ++/* ++** qdecw_pat_n_vl256_s64: ++** sqdecw x0, vl256, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl256_s64, int64_t, ++ x0 = svqdecw_pat_n_s64 (x0, SV_VL256, 16), ++ x0 = svqdecw_pat (x0, SV_VL256, 16)) ++ ++/* ++** qdecw_pat_n_mul4_s64: ++** sqdecw x0, mul4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_mul4_s64, int64_t, ++ x0 = svqdecw_pat_n_s64 (x0, SV_MUL4, 16), ++ x0 = svqdecw_pat (x0, SV_MUL4, 16)) ++ ++/* ++** qdecw_pat_n_mul3_s64: ++** sqdecw x0, mul3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_mul3_s64, int64_t, ++ x0 = svqdecw_pat_n_s64 (x0, SV_MUL3, 16), ++ x0 = svqdecw_pat (x0, SV_MUL3, 16)) ++ ++/* ++** qdecw_pat_n_all_s64: ++** sqdecw x0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_all_s64, int64_t, ++ x0 = svqdecw_pat_n_s64 (x0, SV_ALL, 16), ++ x0 = svqdecw_pat (x0, SV_ALL, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecw_pat_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecw_pat_u32.c +new file mode 100644 +index 000000000..8d3fcb473 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecw_pat_u32.c +@@ -0,0 +1,401 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qdecw_pat_1_u32_tied: ++** uqdecw z0\.s, pow2 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_pat_1_u32_tied, svuint32_t, ++ z0 = svqdecw_pat_u32 (z0, SV_POW2, 1), ++ z0 = svqdecw_pat (z0, SV_POW2, 1)) ++ ++/* ++** qdecw_pat_1_u32_untied: ++** movprfx z0, z1 ++** uqdecw z0\.s, pow2 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_pat_1_u32_untied, svuint32_t, ++ z0 = svqdecw_pat_u32 (z1, SV_POW2, 1), ++ z0 = svqdecw_pat (z1, SV_POW2, 1)) ++ ++/* ++** qdecw_pat_2_u32: ++** uqdecw z0\.s, pow2, mul #2 ++** 
ret ++*/ ++TEST_UNIFORM_Z (qdecw_pat_2_u32, svuint32_t, ++ z0 = svqdecw_pat_u32 (z0, SV_POW2, 2), ++ z0 = svqdecw_pat (z0, SV_POW2, 2)) ++ ++/* ++** qdecw_pat_7_u32: ++** uqdecw z0\.s, pow2, mul #7 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_pat_7_u32, svuint32_t, ++ z0 = svqdecw_pat_u32 (z0, SV_POW2, 7), ++ z0 = svqdecw_pat (z0, SV_POW2, 7)) ++ ++/* ++** qdecw_pat_15_u32: ++** uqdecw z0\.s, pow2, mul #15 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_pat_15_u32, svuint32_t, ++ z0 = svqdecw_pat_u32 (z0, SV_POW2, 15), ++ z0 = svqdecw_pat (z0, SV_POW2, 15)) ++ ++/* ++** qdecw_pat_16_u32: ++** uqdecw z0\.s, pow2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_pat_16_u32, svuint32_t, ++ z0 = svqdecw_pat_u32 (z0, SV_POW2, 16), ++ z0 = svqdecw_pat (z0, SV_POW2, 16)) ++ ++/* ++** qdecw_pat_vl1_u32: ++** uqdecw z0\.s, vl1, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_pat_vl1_u32, svuint32_t, ++ z0 = svqdecw_pat_u32 (z0, SV_VL1, 16), ++ z0 = svqdecw_pat (z0, SV_VL1, 16)) ++ ++/* ++** qdecw_pat_vl2_u32: ++** uqdecw z0\.s, vl2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_pat_vl2_u32, svuint32_t, ++ z0 = svqdecw_pat_u32 (z0, SV_VL2, 16), ++ z0 = svqdecw_pat (z0, SV_VL2, 16)) ++ ++/* ++** qdecw_pat_vl3_u32: ++** uqdecw z0\.s, vl3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_pat_vl3_u32, svuint32_t, ++ z0 = svqdecw_pat_u32 (z0, SV_VL3, 16), ++ z0 = svqdecw_pat (z0, SV_VL3, 16)) ++ ++/* ++** qdecw_pat_vl4_u32: ++** uqdecw z0\.s, vl4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_pat_vl4_u32, svuint32_t, ++ z0 = svqdecw_pat_u32 (z0, SV_VL4, 16), ++ z0 = svqdecw_pat (z0, SV_VL4, 16)) ++ ++/* ++** qdecw_pat_vl5_u32: ++** uqdecw z0\.s, vl5, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_pat_vl5_u32, svuint32_t, ++ z0 = svqdecw_pat_u32 (z0, SV_VL5, 16), ++ z0 = svqdecw_pat (z0, SV_VL5, 16)) ++ ++/* ++** qdecw_pat_vl6_u32: ++** uqdecw z0\.s, vl6, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_pat_vl6_u32, svuint32_t, ++ z0 = svqdecw_pat_u32 (z0, SV_VL6, 16), ++ z0 = svqdecw_pat (z0, SV_VL6, 16)) ++ ++/* ++** qdecw_pat_vl7_u32: ++** uqdecw z0\.s, vl7, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_pat_vl7_u32, svuint32_t, ++ z0 = svqdecw_pat_u32 (z0, SV_VL7, 16), ++ z0 = svqdecw_pat (z0, SV_VL7, 16)) ++ ++/* ++** qdecw_pat_vl8_u32: ++** uqdecw z0\.s, vl8, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_pat_vl8_u32, svuint32_t, ++ z0 = svqdecw_pat_u32 (z0, SV_VL8, 16), ++ z0 = svqdecw_pat (z0, SV_VL8, 16)) ++ ++/* ++** qdecw_pat_vl16_u32: ++** uqdecw z0\.s, vl16, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_pat_vl16_u32, svuint32_t, ++ z0 = svqdecw_pat_u32 (z0, SV_VL16, 16), ++ z0 = svqdecw_pat (z0, SV_VL16, 16)) ++ ++/* ++** qdecw_pat_vl32_u32: ++** uqdecw z0\.s, vl32, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_pat_vl32_u32, svuint32_t, ++ z0 = svqdecw_pat_u32 (z0, SV_VL32, 16), ++ z0 = svqdecw_pat (z0, SV_VL32, 16)) ++ ++/* ++** qdecw_pat_vl64_u32: ++** uqdecw z0\.s, vl64, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_pat_vl64_u32, svuint32_t, ++ z0 = svqdecw_pat_u32 (z0, SV_VL64, 16), ++ z0 = svqdecw_pat (z0, SV_VL64, 16)) ++ ++/* ++** qdecw_pat_vl128_u32: ++** uqdecw z0\.s, vl128, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_pat_vl128_u32, svuint32_t, ++ z0 = svqdecw_pat_u32 (z0, SV_VL128, 16), ++ z0 = svqdecw_pat (z0, SV_VL128, 16)) ++ ++/* ++** qdecw_pat_vl256_u32: ++** uqdecw z0\.s, vl256, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_pat_vl256_u32, svuint32_t, ++ z0 = svqdecw_pat_u32 (z0, SV_VL256, 16), ++ z0 = svqdecw_pat (z0, SV_VL256, 16)) ++ ++/* ++** qdecw_pat_mul4_u32: ++** uqdecw z0\.s, mul4, mul #16 ++** 
ret ++*/ ++TEST_UNIFORM_Z (qdecw_pat_mul4_u32, svuint32_t, ++ z0 = svqdecw_pat_u32 (z0, SV_MUL4, 16), ++ z0 = svqdecw_pat (z0, SV_MUL4, 16)) ++ ++/* ++** qdecw_pat_mul3_u32: ++** uqdecw z0\.s, mul3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_pat_mul3_u32, svuint32_t, ++ z0 = svqdecw_pat_u32 (z0, SV_MUL3, 16), ++ z0 = svqdecw_pat (z0, SV_MUL3, 16)) ++ ++/* ++** qdecw_pat_all_u32: ++** uqdecw z0\.s, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_pat_all_u32, svuint32_t, ++ z0 = svqdecw_pat_u32 (z0, SV_ALL, 16), ++ z0 = svqdecw_pat (z0, SV_ALL, 16)) ++ ++/* ++** qdecw_pat_n_1_u32_tied: ++** uqdecw w0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_1_u32_tied, uint32_t, ++ x0 = svqdecw_pat_n_u32 (x0, SV_POW2, 1), ++ x0 = svqdecw_pat (x0, SV_POW2, 1)) ++ ++/* ++** qdecw_pat_n_1_u32_untied: ++** mov w0, w1 ++** uqdecw w0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_1_u32_untied, uint32_t, ++ x0 = svqdecw_pat_n_u32 (x1, SV_POW2, 1), ++ x0 = svqdecw_pat (x1, SV_POW2, 1)) ++ ++/* ++** qdecw_pat_n_2_u32: ++** uqdecw w0, pow2, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_2_u32, uint32_t, ++ x0 = svqdecw_pat_n_u32 (x0, SV_POW2, 2), ++ x0 = svqdecw_pat (x0, SV_POW2, 2)) ++ ++/* ++** qdecw_pat_n_7_u32: ++** uqdecw w0, pow2, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_7_u32, uint32_t, ++ x0 = svqdecw_pat_n_u32 (x0, SV_POW2, 7), ++ x0 = svqdecw_pat (x0, SV_POW2, 7)) ++ ++/* ++** qdecw_pat_n_15_u32: ++** uqdecw w0, pow2, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_15_u32, uint32_t, ++ x0 = svqdecw_pat_n_u32 (x0, SV_POW2, 15), ++ x0 = svqdecw_pat (x0, SV_POW2, 15)) ++ ++/* ++** qdecw_pat_n_16_u32: ++** uqdecw w0, pow2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_16_u32, uint32_t, ++ x0 = svqdecw_pat_n_u32 (x0, SV_POW2, 16), ++ x0 = svqdecw_pat (x0, SV_POW2, 16)) ++ ++/* ++** qdecw_pat_n_vl1_u32: ++** uqdecw w0, vl1, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl1_u32, uint32_t, ++ x0 = svqdecw_pat_n_u32 (x0, SV_VL1, 16), ++ x0 = svqdecw_pat (x0, SV_VL1, 16)) ++ ++/* ++** qdecw_pat_n_vl2_u32: ++** uqdecw w0, vl2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl2_u32, uint32_t, ++ x0 = svqdecw_pat_n_u32 (x0, SV_VL2, 16), ++ x0 = svqdecw_pat (x0, SV_VL2, 16)) ++ ++/* ++** qdecw_pat_n_vl3_u32: ++** uqdecw w0, vl3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl3_u32, uint32_t, ++ x0 = svqdecw_pat_n_u32 (x0, SV_VL3, 16), ++ x0 = svqdecw_pat (x0, SV_VL3, 16)) ++ ++/* ++** qdecw_pat_n_vl4_u32: ++** uqdecw w0, vl4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl4_u32, uint32_t, ++ x0 = svqdecw_pat_n_u32 (x0, SV_VL4, 16), ++ x0 = svqdecw_pat (x0, SV_VL4, 16)) ++ ++/* ++** qdecw_pat_n_vl5_u32: ++** uqdecw w0, vl5, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl5_u32, uint32_t, ++ x0 = svqdecw_pat_n_u32 (x0, SV_VL5, 16), ++ x0 = svqdecw_pat (x0, SV_VL5, 16)) ++ ++/* ++** qdecw_pat_n_vl6_u32: ++** uqdecw w0, vl6, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl6_u32, uint32_t, ++ x0 = svqdecw_pat_n_u32 (x0, SV_VL6, 16), ++ x0 = svqdecw_pat (x0, SV_VL6, 16)) ++ ++/* ++** qdecw_pat_n_vl7_u32: ++** uqdecw w0, vl7, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl7_u32, uint32_t, ++ x0 = svqdecw_pat_n_u32 (x0, SV_VL7, 16), ++ x0 = svqdecw_pat (x0, SV_VL7, 16)) ++ ++/* ++** qdecw_pat_n_vl8_u32: ++** uqdecw w0, vl8, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl8_u32, uint32_t, ++ x0 = svqdecw_pat_n_u32 (x0, SV_VL8, 16), ++ x0 = svqdecw_pat (x0, SV_VL8, 16)) ++ ++/* ++** qdecw_pat_n_vl16_u32: ++** uqdecw w0, vl16, mul #16 ++** 
ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl16_u32, uint32_t, ++ x0 = svqdecw_pat_n_u32 (x0, SV_VL16, 16), ++ x0 = svqdecw_pat (x0, SV_VL16, 16)) ++ ++/* ++** qdecw_pat_n_vl32_u32: ++** uqdecw w0, vl32, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl32_u32, uint32_t, ++ x0 = svqdecw_pat_n_u32 (x0, SV_VL32, 16), ++ x0 = svqdecw_pat (x0, SV_VL32, 16)) ++ ++/* ++** qdecw_pat_n_vl64_u32: ++** uqdecw w0, vl64, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl64_u32, uint32_t, ++ x0 = svqdecw_pat_n_u32 (x0, SV_VL64, 16), ++ x0 = svqdecw_pat (x0, SV_VL64, 16)) ++ ++/* ++** qdecw_pat_n_vl128_u32: ++** uqdecw w0, vl128, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl128_u32, uint32_t, ++ x0 = svqdecw_pat_n_u32 (x0, SV_VL128, 16), ++ x0 = svqdecw_pat (x0, SV_VL128, 16)) ++ ++/* ++** qdecw_pat_n_vl256_u32: ++** uqdecw w0, vl256, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl256_u32, uint32_t, ++ x0 = svqdecw_pat_n_u32 (x0, SV_VL256, 16), ++ x0 = svqdecw_pat (x0, SV_VL256, 16)) ++ ++/* ++** qdecw_pat_n_mul4_u32: ++** uqdecw w0, mul4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_mul4_u32, uint32_t, ++ x0 = svqdecw_pat_n_u32 (x0, SV_MUL4, 16), ++ x0 = svqdecw_pat (x0, SV_MUL4, 16)) ++ ++/* ++** qdecw_pat_n_mul3_u32: ++** uqdecw w0, mul3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_mul3_u32, uint32_t, ++ x0 = svqdecw_pat_n_u32 (x0, SV_MUL3, 16), ++ x0 = svqdecw_pat (x0, SV_MUL3, 16)) ++ ++/* ++** qdecw_pat_n_all_u32: ++** uqdecw w0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_all_u32, uint32_t, ++ x0 = svqdecw_pat_n_u32 (x0, SV_ALL, 16), ++ x0 = svqdecw_pat (x0, SV_ALL, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecw_pat_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecw_pat_u64.c +new file mode 100644 +index 000000000..015775b17 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecw_pat_u64.c +@@ -0,0 +1,202 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qdecw_pat_n_1_u64_tied: ++** uqdecw x0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_1_u64_tied, uint64_t, ++ x0 = svqdecw_pat_n_u64 (x0, SV_POW2, 1), ++ x0 = svqdecw_pat (x0, SV_POW2, 1)) ++ ++/* ++** qdecw_pat_n_1_u64_untied: ++** mov x0, x1 ++** uqdecw x0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_1_u64_untied, uint64_t, ++ x0 = svqdecw_pat_n_u64 (x1, SV_POW2, 1), ++ x0 = svqdecw_pat (x1, SV_POW2, 1)) ++ ++/* ++** qdecw_pat_n_2_u64: ++** uqdecw x0, pow2, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_2_u64, uint64_t, ++ x0 = svqdecw_pat_n_u64 (x0, SV_POW2, 2), ++ x0 = svqdecw_pat (x0, SV_POW2, 2)) ++ ++/* ++** qdecw_pat_n_7_u64: ++** uqdecw x0, pow2, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_7_u64, uint64_t, ++ x0 = svqdecw_pat_n_u64 (x0, SV_POW2, 7), ++ x0 = svqdecw_pat (x0, SV_POW2, 7)) ++ ++/* ++** qdecw_pat_n_15_u64: ++** uqdecw x0, pow2, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_15_u64, uint64_t, ++ x0 = svqdecw_pat_n_u64 (x0, SV_POW2, 15), ++ x0 = svqdecw_pat (x0, SV_POW2, 15)) ++ ++/* ++** qdecw_pat_n_16_u64: ++** uqdecw x0, pow2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_16_u64, uint64_t, ++ x0 = svqdecw_pat_n_u64 (x0, SV_POW2, 16), ++ x0 = svqdecw_pat (x0, SV_POW2, 16)) ++ ++/* ++** qdecw_pat_n_vl1_u64: ++** uqdecw x0, vl1, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl1_u64, uint64_t, ++ x0 = svqdecw_pat_n_u64 (x0, SV_VL1, 16), ++ x0 = svqdecw_pat (x0, SV_VL1, 16)) ++ ++/* ++** qdecw_pat_n_vl2_u64: ++** uqdecw x0, vl2, 
mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl2_u64, uint64_t, ++ x0 = svqdecw_pat_n_u64 (x0, SV_VL2, 16), ++ x0 = svqdecw_pat (x0, SV_VL2, 16)) ++ ++/* ++** qdecw_pat_n_vl3_u64: ++** uqdecw x0, vl3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl3_u64, uint64_t, ++ x0 = svqdecw_pat_n_u64 (x0, SV_VL3, 16), ++ x0 = svqdecw_pat (x0, SV_VL3, 16)) ++ ++/* ++** qdecw_pat_n_vl4_u64: ++** uqdecw x0, vl4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl4_u64, uint64_t, ++ x0 = svqdecw_pat_n_u64 (x0, SV_VL4, 16), ++ x0 = svqdecw_pat (x0, SV_VL4, 16)) ++ ++/* ++** qdecw_pat_n_vl5_u64: ++** uqdecw x0, vl5, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl5_u64, uint64_t, ++ x0 = svqdecw_pat_n_u64 (x0, SV_VL5, 16), ++ x0 = svqdecw_pat (x0, SV_VL5, 16)) ++ ++/* ++** qdecw_pat_n_vl6_u64: ++** uqdecw x0, vl6, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl6_u64, uint64_t, ++ x0 = svqdecw_pat_n_u64 (x0, SV_VL6, 16), ++ x0 = svqdecw_pat (x0, SV_VL6, 16)) ++ ++/* ++** qdecw_pat_n_vl7_u64: ++** uqdecw x0, vl7, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl7_u64, uint64_t, ++ x0 = svqdecw_pat_n_u64 (x0, SV_VL7, 16), ++ x0 = svqdecw_pat (x0, SV_VL7, 16)) ++ ++/* ++** qdecw_pat_n_vl8_u64: ++** uqdecw x0, vl8, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl8_u64, uint64_t, ++ x0 = svqdecw_pat_n_u64 (x0, SV_VL8, 16), ++ x0 = svqdecw_pat (x0, SV_VL8, 16)) ++ ++/* ++** qdecw_pat_n_vl16_u64: ++** uqdecw x0, vl16, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl16_u64, uint64_t, ++ x0 = svqdecw_pat_n_u64 (x0, SV_VL16, 16), ++ x0 = svqdecw_pat (x0, SV_VL16, 16)) ++ ++/* ++** qdecw_pat_n_vl32_u64: ++** uqdecw x0, vl32, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl32_u64, uint64_t, ++ x0 = svqdecw_pat_n_u64 (x0, SV_VL32, 16), ++ x0 = svqdecw_pat (x0, SV_VL32, 16)) ++ ++/* ++** qdecw_pat_n_vl64_u64: ++** uqdecw x0, vl64, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl64_u64, uint64_t, ++ x0 = svqdecw_pat_n_u64 (x0, SV_VL64, 16), ++ x0 = svqdecw_pat (x0, SV_VL64, 16)) ++ ++/* ++** qdecw_pat_n_vl128_u64: ++** uqdecw x0, vl128, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl128_u64, uint64_t, ++ x0 = svqdecw_pat_n_u64 (x0, SV_VL128, 16), ++ x0 = svqdecw_pat (x0, SV_VL128, 16)) ++ ++/* ++** qdecw_pat_n_vl256_u64: ++** uqdecw x0, vl256, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_vl256_u64, uint64_t, ++ x0 = svqdecw_pat_n_u64 (x0, SV_VL256, 16), ++ x0 = svqdecw_pat (x0, SV_VL256, 16)) ++ ++/* ++** qdecw_pat_n_mul4_u64: ++** uqdecw x0, mul4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_mul4_u64, uint64_t, ++ x0 = svqdecw_pat_n_u64 (x0, SV_MUL4, 16), ++ x0 = svqdecw_pat (x0, SV_MUL4, 16)) ++ ++/* ++** qdecw_pat_n_mul3_u64: ++** uqdecw x0, mul3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_mul3_u64, uint64_t, ++ x0 = svqdecw_pat_n_u64 (x0, SV_MUL3, 16), ++ x0 = svqdecw_pat (x0, SV_MUL3, 16)) ++ ++/* ++** qdecw_pat_n_all_u64: ++** uqdecw x0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_pat_n_all_u64, uint64_t, ++ x0 = svqdecw_pat_n_u64 (x0, SV_ALL, 16), ++ x0 = svqdecw_pat (x0, SV_ALL, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecw_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecw_s32.c +new file mode 100644 +index 000000000..8dfe8a177 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecw_s32.c +@@ -0,0 +1,113 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qdecw_1_s32_tied: ++** sqdecw z0\.s ++** ret ++*/ 
++TEST_UNIFORM_Z (qdecw_1_s32_tied, svint32_t, ++ z0 = svqdecw_s32 (z0, 1), ++ z0 = svqdecw (z0, 1)) ++ ++/* ++** qdecw_1_s32_untied: ++** movprfx z0, z1 ++** sqdecw z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_1_s32_untied, svint32_t, ++ z0 = svqdecw_s32 (z1, 1), ++ z0 = svqdecw (z1, 1)) ++ ++/* ++** qdecw_2_s32: ++** sqdecw z0\.s, all, mul #2 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_2_s32, svint32_t, ++ z0 = svqdecw_s32 (z0, 2), ++ z0 = svqdecw (z0, 2)) ++ ++/* ++** qdecw_7_s32: ++** sqdecw z0\.s, all, mul #7 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_7_s32, svint32_t, ++ z0 = svqdecw_s32 (z0, 7), ++ z0 = svqdecw (z0, 7)) ++ ++/* ++** qdecw_15_s32: ++** sqdecw z0\.s, all, mul #15 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_15_s32, svint32_t, ++ z0 = svqdecw_s32 (z0, 15), ++ z0 = svqdecw (z0, 15)) ++ ++/* ++** qdecw_16_s32: ++** sqdecw z0\.s, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_16_s32, svint32_t, ++ z0 = svqdecw_s32 (z0, 16), ++ z0 = svqdecw (z0, 16)) ++ ++/* ++** qdecw_n_1_s32_tied: ++** sqdecw x0, w0 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_n_1_s32_tied, int32_t, ++ x0 = svqdecw_n_s32 (x0, 1), ++ x0 = svqdecw (x0, 1)) ++ ++/* ++** qdecw_n_1_s32_untied: ++** mov w0, w1 ++** sqdecw x0, w0 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_n_1_s32_untied, int32_t, ++ x0 = svqdecw_n_s32 (x1, 1), ++ x0 = svqdecw (x1, 1)) ++ ++/* ++** qdecw_n_2_s32: ++** sqdecw x0, w0, all, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_n_2_s32, int32_t, ++ x0 = svqdecw_n_s32 (x0, 2), ++ x0 = svqdecw (x0, 2)) ++ ++/* ++** qdecw_n_7_s32: ++** sqdecw x0, w0, all, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_n_7_s32, int32_t, ++ x0 = svqdecw_n_s32 (x0, 7), ++ x0 = svqdecw (x0, 7)) ++ ++/* ++** qdecw_n_15_s32: ++** sqdecw x0, w0, all, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_n_15_s32, int32_t, ++ x0 = svqdecw_n_s32 (x0, 15), ++ x0 = svqdecw (x0, 15)) ++ ++/* ++** qdecw_n_16_s32: ++** sqdecw x0, w0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_n_16_s32, int32_t, ++ x0 = svqdecw_n_s32 (x0, 16), ++ x0 = svqdecw (x0, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecw_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecw_s64.c +new file mode 100644 +index 000000000..b0841a8b2 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecw_s64.c +@@ -0,0 +1,58 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qdecw_n_1_s64_tied: ++** sqdecw x0 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_n_1_s64_tied, int64_t, ++ x0 = svqdecw_n_s64 (x0, 1), ++ x0 = svqdecw (x0, 1)) ++ ++/* ++** qdecw_n_1_s64_untied: ++** mov x0, x1 ++** sqdecw x0 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_n_1_s64_untied, int64_t, ++ x0 = svqdecw_n_s64 (x1, 1), ++ x0 = svqdecw (x1, 1)) ++ ++/* ++** qdecw_n_2_s64: ++** sqdecw x0, all, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_n_2_s64, int64_t, ++ x0 = svqdecw_n_s64 (x0, 2), ++ x0 = svqdecw (x0, 2)) ++ ++/* ++** qdecw_n_7_s64: ++** sqdecw x0, all, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_n_7_s64, int64_t, ++ x0 = svqdecw_n_s64 (x0, 7), ++ x0 = svqdecw (x0, 7)) ++ ++/* ++** qdecw_n_15_s64: ++** sqdecw x0, all, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_n_15_s64, int64_t, ++ x0 = svqdecw_n_s64 (x0, 15), ++ x0 = svqdecw (x0, 15)) ++ ++/* ++** qdecw_n_16_s64: ++** sqdecw x0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_n_16_s64, int64_t, ++ x0 = svqdecw_n_s64 (x0, 16), ++ x0 = svqdecw (x0, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecw_u32.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecw_u32.c +new file mode 100644 +index 000000000..22e8a8d69 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecw_u32.c +@@ -0,0 +1,113 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qdecw_1_u32_tied: ++** uqdecw z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_1_u32_tied, svuint32_t, ++ z0 = svqdecw_u32 (z0, 1), ++ z0 = svqdecw (z0, 1)) ++ ++/* ++** qdecw_1_u32_untied: ++** movprfx z0, z1 ++** uqdecw z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_1_u32_untied, svuint32_t, ++ z0 = svqdecw_u32 (z1, 1), ++ z0 = svqdecw (z1, 1)) ++ ++/* ++** qdecw_2_u32: ++** uqdecw z0\.s, all, mul #2 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_2_u32, svuint32_t, ++ z0 = svqdecw_u32 (z0, 2), ++ z0 = svqdecw (z0, 2)) ++ ++/* ++** qdecw_7_u32: ++** uqdecw z0\.s, all, mul #7 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_7_u32, svuint32_t, ++ z0 = svqdecw_u32 (z0, 7), ++ z0 = svqdecw (z0, 7)) ++ ++/* ++** qdecw_15_u32: ++** uqdecw z0\.s, all, mul #15 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_15_u32, svuint32_t, ++ z0 = svqdecw_u32 (z0, 15), ++ z0 = svqdecw (z0, 15)) ++ ++/* ++** qdecw_16_u32: ++** uqdecw z0\.s, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qdecw_16_u32, svuint32_t, ++ z0 = svqdecw_u32 (z0, 16), ++ z0 = svqdecw (z0, 16)) ++ ++/* ++** qdecw_n_1_u32_tied: ++** uqdecw w0 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_n_1_u32_tied, uint32_t, ++ x0 = svqdecw_n_u32 (x0, 1), ++ x0 = svqdecw (x0, 1)) ++ ++/* ++** qdecw_n_1_u32_untied: ++** mov w0, w1 ++** uqdecw w0 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_n_1_u32_untied, uint32_t, ++ x0 = svqdecw_n_u32 (x1, 1), ++ x0 = svqdecw (x1, 1)) ++ ++/* ++** qdecw_n_2_u32: ++** uqdecw w0, all, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_n_2_u32, uint32_t, ++ x0 = svqdecw_n_u32 (x0, 2), ++ x0 = svqdecw (x0, 2)) ++ ++/* ++** qdecw_n_7_u32: ++** uqdecw w0, all, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_n_7_u32, uint32_t, ++ x0 = svqdecw_n_u32 (x0, 7), ++ x0 = svqdecw (x0, 7)) ++ ++/* ++** qdecw_n_15_u32: ++** uqdecw w0, all, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_n_15_u32, uint32_t, ++ x0 = svqdecw_n_u32 (x0, 15), ++ x0 = svqdecw (x0, 15)) ++ ++/* ++** qdecw_n_16_u32: ++** uqdecw w0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_n_16_u32, uint32_t, ++ x0 = svqdecw_n_u32 (x0, 16), ++ x0 = svqdecw (x0, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecw_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecw_u64.c +new file mode 100644 +index 000000000..88c484e8b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecw_u64.c +@@ -0,0 +1,58 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qdecw_n_1_u64_tied: ++** uqdecw x0 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_n_1_u64_tied, uint64_t, ++ x0 = svqdecw_n_u64 (x0, 1), ++ x0 = svqdecw (x0, 1)) ++ ++/* ++** qdecw_n_1_u64_untied: ++** mov x0, x1 ++** uqdecw x0 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_n_1_u64_untied, uint64_t, ++ x0 = svqdecw_n_u64 (x1, 1), ++ x0 = svqdecw (x1, 1)) ++ ++/* ++** qdecw_n_2_u64: ++** uqdecw x0, all, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_n_2_u64, uint64_t, ++ x0 = svqdecw_n_u64 (x0, 2), ++ x0 = svqdecw (x0, 2)) ++ ++/* ++** qdecw_n_7_u64: ++** uqdecw x0, all, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_n_7_u64, uint64_t, ++ x0 = svqdecw_n_u64 (x0, 7), ++ x0 = svqdecw (x0, 7)) ++ ++/* ++** qdecw_n_15_u64: ++** uqdecw x0, all, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S 
(qdecw_n_15_u64, uint64_t, ++ x0 = svqdecw_n_u64 (x0, 15), ++ x0 = svqdecw (x0, 15)) ++ ++/* ++** qdecw_n_16_u64: ++** uqdecw x0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qdecw_n_16_u64, uint64_t, ++ x0 = svqdecw_n_u64 (x0, 16), ++ x0 = svqdecw (x0, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincb_pat_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincb_pat_s32.c +new file mode 100644 +index 000000000..16a8d8e9a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincb_pat_s32.c +@@ -0,0 +1,202 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qincb_pat_n_1_s32_tied: ++** sqincb x0, w0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_1_s32_tied, int32_t, ++ x0 = svqincb_pat_n_s32 (x0, SV_POW2, 1), ++ x0 = svqincb_pat (x0, SV_POW2, 1)) ++ ++/* ++** qincb_pat_n_1_s32_untied: ++** mov w0, w1 ++** sqincb x0, w0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_1_s32_untied, int32_t, ++ x0 = svqincb_pat_n_s32 (x1, SV_POW2, 1), ++ x0 = svqincb_pat (x1, SV_POW2, 1)) ++ ++/* ++** qincb_pat_n_2_s32: ++** sqincb x0, w0, pow2, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_2_s32, int32_t, ++ x0 = svqincb_pat_n_s32 (x0, SV_POW2, 2), ++ x0 = svqincb_pat (x0, SV_POW2, 2)) ++ ++/* ++** qincb_pat_n_7_s32: ++** sqincb x0, w0, pow2, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_7_s32, int32_t, ++ x0 = svqincb_pat_n_s32 (x0, SV_POW2, 7), ++ x0 = svqincb_pat (x0, SV_POW2, 7)) ++ ++/* ++** qincb_pat_n_15_s32: ++** sqincb x0, w0, pow2, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_15_s32, int32_t, ++ x0 = svqincb_pat_n_s32 (x0, SV_POW2, 15), ++ x0 = svqincb_pat (x0, SV_POW2, 15)) ++ ++/* ++** qincb_pat_n_16_s32: ++** sqincb x0, w0, pow2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_16_s32, int32_t, ++ x0 = svqincb_pat_n_s32 (x0, SV_POW2, 16), ++ x0 = svqincb_pat (x0, SV_POW2, 16)) ++ ++/* ++** qincb_pat_n_vl1_s32: ++** sqincb x0, w0, vl1, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl1_s32, int32_t, ++ x0 = svqincb_pat_n_s32 (x0, SV_VL1, 16), ++ x0 = svqincb_pat (x0, SV_VL1, 16)) ++ ++/* ++** qincb_pat_n_vl2_s32: ++** sqincb x0, w0, vl2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl2_s32, int32_t, ++ x0 = svqincb_pat_n_s32 (x0, SV_VL2, 16), ++ x0 = svqincb_pat (x0, SV_VL2, 16)) ++ ++/* ++** qincb_pat_n_vl3_s32: ++** sqincb x0, w0, vl3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl3_s32, int32_t, ++ x0 = svqincb_pat_n_s32 (x0, SV_VL3, 16), ++ x0 = svqincb_pat (x0, SV_VL3, 16)) ++ ++/* ++** qincb_pat_n_vl4_s32: ++** sqincb x0, w0, vl4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl4_s32, int32_t, ++ x0 = svqincb_pat_n_s32 (x0, SV_VL4, 16), ++ x0 = svqincb_pat (x0, SV_VL4, 16)) ++ ++/* ++** qincb_pat_n_vl5_s32: ++** sqincb x0, w0, vl5, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl5_s32, int32_t, ++ x0 = svqincb_pat_n_s32 (x0, SV_VL5, 16), ++ x0 = svqincb_pat (x0, SV_VL5, 16)) ++ ++/* ++** qincb_pat_n_vl6_s32: ++** sqincb x0, w0, vl6, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl6_s32, int32_t, ++ x0 = svqincb_pat_n_s32 (x0, SV_VL6, 16), ++ x0 = svqincb_pat (x0, SV_VL6, 16)) ++ ++/* ++** qincb_pat_n_vl7_s32: ++** sqincb x0, w0, vl7, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl7_s32, int32_t, ++ x0 = svqincb_pat_n_s32 (x0, SV_VL7, 16), ++ x0 = svqincb_pat (x0, SV_VL7, 16)) ++ ++/* ++** qincb_pat_n_vl8_s32: ++** sqincb x0, w0, vl8, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl8_s32, int32_t, ++ x0 = 
svqincb_pat_n_s32 (x0, SV_VL8, 16), ++ x0 = svqincb_pat (x0, SV_VL8, 16)) ++ ++/* ++** qincb_pat_n_vl16_s32: ++** sqincb x0, w0, vl16, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl16_s32, int32_t, ++ x0 = svqincb_pat_n_s32 (x0, SV_VL16, 16), ++ x0 = svqincb_pat (x0, SV_VL16, 16)) ++ ++/* ++** qincb_pat_n_vl32_s32: ++** sqincb x0, w0, vl32, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl32_s32, int32_t, ++ x0 = svqincb_pat_n_s32 (x0, SV_VL32, 16), ++ x0 = svqincb_pat (x0, SV_VL32, 16)) ++ ++/* ++** qincb_pat_n_vl64_s32: ++** sqincb x0, w0, vl64, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl64_s32, int32_t, ++ x0 = svqincb_pat_n_s32 (x0, SV_VL64, 16), ++ x0 = svqincb_pat (x0, SV_VL64, 16)) ++ ++/* ++** qincb_pat_n_vl128_s32: ++** sqincb x0, w0, vl128, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl128_s32, int32_t, ++ x0 = svqincb_pat_n_s32 (x0, SV_VL128, 16), ++ x0 = svqincb_pat (x0, SV_VL128, 16)) ++ ++/* ++** qincb_pat_n_vl256_s32: ++** sqincb x0, w0, vl256, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl256_s32, int32_t, ++ x0 = svqincb_pat_n_s32 (x0, SV_VL256, 16), ++ x0 = svqincb_pat (x0, SV_VL256, 16)) ++ ++/* ++** qincb_pat_n_mul4_s32: ++** sqincb x0, w0, mul4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_mul4_s32, int32_t, ++ x0 = svqincb_pat_n_s32 (x0, SV_MUL4, 16), ++ x0 = svqincb_pat (x0, SV_MUL4, 16)) ++ ++/* ++** qincb_pat_n_mul3_s32: ++** sqincb x0, w0, mul3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_mul3_s32, int32_t, ++ x0 = svqincb_pat_n_s32 (x0, SV_MUL3, 16), ++ x0 = svqincb_pat (x0, SV_MUL3, 16)) ++ ++/* ++** qincb_pat_n_all_s32: ++** sqincb x0, w0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_all_s32, int32_t, ++ x0 = svqincb_pat_n_s32 (x0, SV_ALL, 16), ++ x0 = svqincb_pat (x0, SV_ALL, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincb_pat_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincb_pat_s64.c +new file mode 100644 +index 000000000..79ed73ba7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincb_pat_s64.c +@@ -0,0 +1,202 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qincb_pat_n_1_s64_tied: ++** sqincb x0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_1_s64_tied, int64_t, ++ x0 = svqincb_pat_n_s64 (x0, SV_POW2, 1), ++ x0 = svqincb_pat (x0, SV_POW2, 1)) ++ ++/* ++** qincb_pat_n_1_s64_untied: ++** mov x0, x1 ++** sqincb x0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_1_s64_untied, int64_t, ++ x0 = svqincb_pat_n_s64 (x1, SV_POW2, 1), ++ x0 = svqincb_pat (x1, SV_POW2, 1)) ++ ++/* ++** qincb_pat_n_2_s64: ++** sqincb x0, pow2, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_2_s64, int64_t, ++ x0 = svqincb_pat_n_s64 (x0, SV_POW2, 2), ++ x0 = svqincb_pat (x0, SV_POW2, 2)) ++ ++/* ++** qincb_pat_n_7_s64: ++** sqincb x0, pow2, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_7_s64, int64_t, ++ x0 = svqincb_pat_n_s64 (x0, SV_POW2, 7), ++ x0 = svqincb_pat (x0, SV_POW2, 7)) ++ ++/* ++** qincb_pat_n_15_s64: ++** sqincb x0, pow2, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_15_s64, int64_t, ++ x0 = svqincb_pat_n_s64 (x0, SV_POW2, 15), ++ x0 = svqincb_pat (x0, SV_POW2, 15)) ++ ++/* ++** qincb_pat_n_16_s64: ++** sqincb x0, pow2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_16_s64, int64_t, ++ x0 = svqincb_pat_n_s64 (x0, SV_POW2, 16), ++ x0 = svqincb_pat (x0, SV_POW2, 16)) ++ ++/* ++** qincb_pat_n_vl1_s64: ++** sqincb x0, vl1, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S 
(qincb_pat_n_vl1_s64, int64_t, ++ x0 = svqincb_pat_n_s64 (x0, SV_VL1, 16), ++ x0 = svqincb_pat (x0, SV_VL1, 16)) ++ ++/* ++** qincb_pat_n_vl2_s64: ++** sqincb x0, vl2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl2_s64, int64_t, ++ x0 = svqincb_pat_n_s64 (x0, SV_VL2, 16), ++ x0 = svqincb_pat (x0, SV_VL2, 16)) ++ ++/* ++** qincb_pat_n_vl3_s64: ++** sqincb x0, vl3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl3_s64, int64_t, ++ x0 = svqincb_pat_n_s64 (x0, SV_VL3, 16), ++ x0 = svqincb_pat (x0, SV_VL3, 16)) ++ ++/* ++** qincb_pat_n_vl4_s64: ++** sqincb x0, vl4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl4_s64, int64_t, ++ x0 = svqincb_pat_n_s64 (x0, SV_VL4, 16), ++ x0 = svqincb_pat (x0, SV_VL4, 16)) ++ ++/* ++** qincb_pat_n_vl5_s64: ++** sqincb x0, vl5, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl5_s64, int64_t, ++ x0 = svqincb_pat_n_s64 (x0, SV_VL5, 16), ++ x0 = svqincb_pat (x0, SV_VL5, 16)) ++ ++/* ++** qincb_pat_n_vl6_s64: ++** sqincb x0, vl6, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl6_s64, int64_t, ++ x0 = svqincb_pat_n_s64 (x0, SV_VL6, 16), ++ x0 = svqincb_pat (x0, SV_VL6, 16)) ++ ++/* ++** qincb_pat_n_vl7_s64: ++** sqincb x0, vl7, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl7_s64, int64_t, ++ x0 = svqincb_pat_n_s64 (x0, SV_VL7, 16), ++ x0 = svqincb_pat (x0, SV_VL7, 16)) ++ ++/* ++** qincb_pat_n_vl8_s64: ++** sqincb x0, vl8, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl8_s64, int64_t, ++ x0 = svqincb_pat_n_s64 (x0, SV_VL8, 16), ++ x0 = svqincb_pat (x0, SV_VL8, 16)) ++ ++/* ++** qincb_pat_n_vl16_s64: ++** sqincb x0, vl16, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl16_s64, int64_t, ++ x0 = svqincb_pat_n_s64 (x0, SV_VL16, 16), ++ x0 = svqincb_pat (x0, SV_VL16, 16)) ++ ++/* ++** qincb_pat_n_vl32_s64: ++** sqincb x0, vl32, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl32_s64, int64_t, ++ x0 = svqincb_pat_n_s64 (x0, SV_VL32, 16), ++ x0 = svqincb_pat (x0, SV_VL32, 16)) ++ ++/* ++** qincb_pat_n_vl64_s64: ++** sqincb x0, vl64, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl64_s64, int64_t, ++ x0 = svqincb_pat_n_s64 (x0, SV_VL64, 16), ++ x0 = svqincb_pat (x0, SV_VL64, 16)) ++ ++/* ++** qincb_pat_n_vl128_s64: ++** sqincb x0, vl128, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl128_s64, int64_t, ++ x0 = svqincb_pat_n_s64 (x0, SV_VL128, 16), ++ x0 = svqincb_pat (x0, SV_VL128, 16)) ++ ++/* ++** qincb_pat_n_vl256_s64: ++** sqincb x0, vl256, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl256_s64, int64_t, ++ x0 = svqincb_pat_n_s64 (x0, SV_VL256, 16), ++ x0 = svqincb_pat (x0, SV_VL256, 16)) ++ ++/* ++** qincb_pat_n_mul4_s64: ++** sqincb x0, mul4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_mul4_s64, int64_t, ++ x0 = svqincb_pat_n_s64 (x0, SV_MUL4, 16), ++ x0 = svqincb_pat (x0, SV_MUL4, 16)) ++ ++/* ++** qincb_pat_n_mul3_s64: ++** sqincb x0, mul3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_mul3_s64, int64_t, ++ x0 = svqincb_pat_n_s64 (x0, SV_MUL3, 16), ++ x0 = svqincb_pat (x0, SV_MUL3, 16)) ++ ++/* ++** qincb_pat_n_all_s64: ++** sqincb x0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_all_s64, int64_t, ++ x0 = svqincb_pat_n_s64 (x0, SV_ALL, 16), ++ x0 = svqincb_pat (x0, SV_ALL, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincb_pat_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincb_pat_u32.c +new file mode 100644 +index 000000000..30e5f28ee +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincb_pat_u32.c +@@ -0,0 +1,202 
@@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qincb_pat_n_1_u32_tied: ++** uqincb w0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_1_u32_tied, uint32_t, ++ x0 = svqincb_pat_n_u32 (x0, SV_POW2, 1), ++ x0 = svqincb_pat (x0, SV_POW2, 1)) ++ ++/* ++** qincb_pat_n_1_u32_untied: ++** mov w0, w1 ++** uqincb w0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_1_u32_untied, uint32_t, ++ x0 = svqincb_pat_n_u32 (x1, SV_POW2, 1), ++ x0 = svqincb_pat (x1, SV_POW2, 1)) ++ ++/* ++** qincb_pat_n_2_u32: ++** uqincb w0, pow2, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_2_u32, uint32_t, ++ x0 = svqincb_pat_n_u32 (x0, SV_POW2, 2), ++ x0 = svqincb_pat (x0, SV_POW2, 2)) ++ ++/* ++** qincb_pat_n_7_u32: ++** uqincb w0, pow2, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_7_u32, uint32_t, ++ x0 = svqincb_pat_n_u32 (x0, SV_POW2, 7), ++ x0 = svqincb_pat (x0, SV_POW2, 7)) ++ ++/* ++** qincb_pat_n_15_u32: ++** uqincb w0, pow2, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_15_u32, uint32_t, ++ x0 = svqincb_pat_n_u32 (x0, SV_POW2, 15), ++ x0 = svqincb_pat (x0, SV_POW2, 15)) ++ ++/* ++** qincb_pat_n_16_u32: ++** uqincb w0, pow2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_16_u32, uint32_t, ++ x0 = svqincb_pat_n_u32 (x0, SV_POW2, 16), ++ x0 = svqincb_pat (x0, SV_POW2, 16)) ++ ++/* ++** qincb_pat_n_vl1_u32: ++** uqincb w0, vl1, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl1_u32, uint32_t, ++ x0 = svqincb_pat_n_u32 (x0, SV_VL1, 16), ++ x0 = svqincb_pat (x0, SV_VL1, 16)) ++ ++/* ++** qincb_pat_n_vl2_u32: ++** uqincb w0, vl2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl2_u32, uint32_t, ++ x0 = svqincb_pat_n_u32 (x0, SV_VL2, 16), ++ x0 = svqincb_pat (x0, SV_VL2, 16)) ++ ++/* ++** qincb_pat_n_vl3_u32: ++** uqincb w0, vl3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl3_u32, uint32_t, ++ x0 = svqincb_pat_n_u32 (x0, SV_VL3, 16), ++ x0 = svqincb_pat (x0, SV_VL3, 16)) ++ ++/* ++** qincb_pat_n_vl4_u32: ++** uqincb w0, vl4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl4_u32, uint32_t, ++ x0 = svqincb_pat_n_u32 (x0, SV_VL4, 16), ++ x0 = svqincb_pat (x0, SV_VL4, 16)) ++ ++/* ++** qincb_pat_n_vl5_u32: ++** uqincb w0, vl5, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl5_u32, uint32_t, ++ x0 = svqincb_pat_n_u32 (x0, SV_VL5, 16), ++ x0 = svqincb_pat (x0, SV_VL5, 16)) ++ ++/* ++** qincb_pat_n_vl6_u32: ++** uqincb w0, vl6, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl6_u32, uint32_t, ++ x0 = svqincb_pat_n_u32 (x0, SV_VL6, 16), ++ x0 = svqincb_pat (x0, SV_VL6, 16)) ++ ++/* ++** qincb_pat_n_vl7_u32: ++** uqincb w0, vl7, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl7_u32, uint32_t, ++ x0 = svqincb_pat_n_u32 (x0, SV_VL7, 16), ++ x0 = svqincb_pat (x0, SV_VL7, 16)) ++ ++/* ++** qincb_pat_n_vl8_u32: ++** uqincb w0, vl8, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl8_u32, uint32_t, ++ x0 = svqincb_pat_n_u32 (x0, SV_VL8, 16), ++ x0 = svqincb_pat (x0, SV_VL8, 16)) ++ ++/* ++** qincb_pat_n_vl16_u32: ++** uqincb w0, vl16, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl16_u32, uint32_t, ++ x0 = svqincb_pat_n_u32 (x0, SV_VL16, 16), ++ x0 = svqincb_pat (x0, SV_VL16, 16)) ++ ++/* ++** qincb_pat_n_vl32_u32: ++** uqincb w0, vl32, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl32_u32, uint32_t, ++ x0 = svqincb_pat_n_u32 (x0, SV_VL32, 16), ++ x0 = svqincb_pat (x0, SV_VL32, 16)) ++ ++/* ++** qincb_pat_n_vl64_u32: ++** uqincb w0, vl64, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S 
(qincb_pat_n_vl64_u32, uint32_t, ++ x0 = svqincb_pat_n_u32 (x0, SV_VL64, 16), ++ x0 = svqincb_pat (x0, SV_VL64, 16)) ++ ++/* ++** qincb_pat_n_vl128_u32: ++** uqincb w0, vl128, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl128_u32, uint32_t, ++ x0 = svqincb_pat_n_u32 (x0, SV_VL128, 16), ++ x0 = svqincb_pat (x0, SV_VL128, 16)) ++ ++/* ++** qincb_pat_n_vl256_u32: ++** uqincb w0, vl256, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl256_u32, uint32_t, ++ x0 = svqincb_pat_n_u32 (x0, SV_VL256, 16), ++ x0 = svqincb_pat (x0, SV_VL256, 16)) ++ ++/* ++** qincb_pat_n_mul4_u32: ++** uqincb w0, mul4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_mul4_u32, uint32_t, ++ x0 = svqincb_pat_n_u32 (x0, SV_MUL4, 16), ++ x0 = svqincb_pat (x0, SV_MUL4, 16)) ++ ++/* ++** qincb_pat_n_mul3_u32: ++** uqincb w0, mul3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_mul3_u32, uint32_t, ++ x0 = svqincb_pat_n_u32 (x0, SV_MUL3, 16), ++ x0 = svqincb_pat (x0, SV_MUL3, 16)) ++ ++/* ++** qincb_pat_n_all_u32: ++** uqincb w0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_all_u32, uint32_t, ++ x0 = svqincb_pat_n_u32 (x0, SV_ALL, 16), ++ x0 = svqincb_pat (x0, SV_ALL, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincb_pat_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincb_pat_u64.c +new file mode 100644 +index 000000000..038b1edb6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincb_pat_u64.c +@@ -0,0 +1,202 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qincb_pat_n_1_u64_tied: ++** uqincb x0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_1_u64_tied, uint64_t, ++ x0 = svqincb_pat_n_u64 (x0, SV_POW2, 1), ++ x0 = svqincb_pat (x0, SV_POW2, 1)) ++ ++/* ++** qincb_pat_n_1_u64_untied: ++** mov x0, x1 ++** uqincb x0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_1_u64_untied, uint64_t, ++ x0 = svqincb_pat_n_u64 (x1, SV_POW2, 1), ++ x0 = svqincb_pat (x1, SV_POW2, 1)) ++ ++/* ++** qincb_pat_n_2_u64: ++** uqincb x0, pow2, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_2_u64, uint64_t, ++ x0 = svqincb_pat_n_u64 (x0, SV_POW2, 2), ++ x0 = svqincb_pat (x0, SV_POW2, 2)) ++ ++/* ++** qincb_pat_n_7_u64: ++** uqincb x0, pow2, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_7_u64, uint64_t, ++ x0 = svqincb_pat_n_u64 (x0, SV_POW2, 7), ++ x0 = svqincb_pat (x0, SV_POW2, 7)) ++ ++/* ++** qincb_pat_n_15_u64: ++** uqincb x0, pow2, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_15_u64, uint64_t, ++ x0 = svqincb_pat_n_u64 (x0, SV_POW2, 15), ++ x0 = svqincb_pat (x0, SV_POW2, 15)) ++ ++/* ++** qincb_pat_n_16_u64: ++** uqincb x0, pow2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_16_u64, uint64_t, ++ x0 = svqincb_pat_n_u64 (x0, SV_POW2, 16), ++ x0 = svqincb_pat (x0, SV_POW2, 16)) ++ ++/* ++** qincb_pat_n_vl1_u64: ++** uqincb x0, vl1, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl1_u64, uint64_t, ++ x0 = svqincb_pat_n_u64 (x0, SV_VL1, 16), ++ x0 = svqincb_pat (x0, SV_VL1, 16)) ++ ++/* ++** qincb_pat_n_vl2_u64: ++** uqincb x0, vl2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl2_u64, uint64_t, ++ x0 = svqincb_pat_n_u64 (x0, SV_VL2, 16), ++ x0 = svqincb_pat (x0, SV_VL2, 16)) ++ ++/* ++** qincb_pat_n_vl3_u64: ++** uqincb x0, vl3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl3_u64, uint64_t, ++ x0 = svqincb_pat_n_u64 (x0, SV_VL3, 16), ++ x0 = svqincb_pat (x0, SV_VL3, 16)) ++ ++/* ++** qincb_pat_n_vl4_u64: ++** uqincb x0, vl4, mul #16 ++** ret ++*/ 
++TEST_UNIFORM_S (qincb_pat_n_vl4_u64, uint64_t, ++ x0 = svqincb_pat_n_u64 (x0, SV_VL4, 16), ++ x0 = svqincb_pat (x0, SV_VL4, 16)) ++ ++/* ++** qincb_pat_n_vl5_u64: ++** uqincb x0, vl5, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl5_u64, uint64_t, ++ x0 = svqincb_pat_n_u64 (x0, SV_VL5, 16), ++ x0 = svqincb_pat (x0, SV_VL5, 16)) ++ ++/* ++** qincb_pat_n_vl6_u64: ++** uqincb x0, vl6, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl6_u64, uint64_t, ++ x0 = svqincb_pat_n_u64 (x0, SV_VL6, 16), ++ x0 = svqincb_pat (x0, SV_VL6, 16)) ++ ++/* ++** qincb_pat_n_vl7_u64: ++** uqincb x0, vl7, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl7_u64, uint64_t, ++ x0 = svqincb_pat_n_u64 (x0, SV_VL7, 16), ++ x0 = svqincb_pat (x0, SV_VL7, 16)) ++ ++/* ++** qincb_pat_n_vl8_u64: ++** uqincb x0, vl8, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl8_u64, uint64_t, ++ x0 = svqincb_pat_n_u64 (x0, SV_VL8, 16), ++ x0 = svqincb_pat (x0, SV_VL8, 16)) ++ ++/* ++** qincb_pat_n_vl16_u64: ++** uqincb x0, vl16, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl16_u64, uint64_t, ++ x0 = svqincb_pat_n_u64 (x0, SV_VL16, 16), ++ x0 = svqincb_pat (x0, SV_VL16, 16)) ++ ++/* ++** qincb_pat_n_vl32_u64: ++** uqincb x0, vl32, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl32_u64, uint64_t, ++ x0 = svqincb_pat_n_u64 (x0, SV_VL32, 16), ++ x0 = svqincb_pat (x0, SV_VL32, 16)) ++ ++/* ++** qincb_pat_n_vl64_u64: ++** uqincb x0, vl64, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl64_u64, uint64_t, ++ x0 = svqincb_pat_n_u64 (x0, SV_VL64, 16), ++ x0 = svqincb_pat (x0, SV_VL64, 16)) ++ ++/* ++** qincb_pat_n_vl128_u64: ++** uqincb x0, vl128, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl128_u64, uint64_t, ++ x0 = svqincb_pat_n_u64 (x0, SV_VL128, 16), ++ x0 = svqincb_pat (x0, SV_VL128, 16)) ++ ++/* ++** qincb_pat_n_vl256_u64: ++** uqincb x0, vl256, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_vl256_u64, uint64_t, ++ x0 = svqincb_pat_n_u64 (x0, SV_VL256, 16), ++ x0 = svqincb_pat (x0, SV_VL256, 16)) ++ ++/* ++** qincb_pat_n_mul4_u64: ++** uqincb x0, mul4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_mul4_u64, uint64_t, ++ x0 = svqincb_pat_n_u64 (x0, SV_MUL4, 16), ++ x0 = svqincb_pat (x0, SV_MUL4, 16)) ++ ++/* ++** qincb_pat_n_mul3_u64: ++** uqincb x0, mul3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_mul3_u64, uint64_t, ++ x0 = svqincb_pat_n_u64 (x0, SV_MUL3, 16), ++ x0 = svqincb_pat (x0, SV_MUL3, 16)) ++ ++/* ++** qincb_pat_n_all_u64: ++** uqincb x0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_pat_n_all_u64, uint64_t, ++ x0 = svqincb_pat_n_u64 (x0, SV_ALL, 16), ++ x0 = svqincb_pat (x0, SV_ALL, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincb_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincb_s32.c +new file mode 100644 +index 000000000..8e74073de +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincb_s32.c +@@ -0,0 +1,58 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qincb_n_1_s32_tied: ++** sqincb x0, w0 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_n_1_s32_tied, int32_t, ++ x0 = svqincb_n_s32 (x0, 1), ++ x0 = svqincb (x0, 1)) ++ ++/* ++** qincb_n_1_s32_untied: ++** mov w0, w1 ++** sqincb x0, w0 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_n_1_s32_untied, int32_t, ++ x0 = svqincb_n_s32 (x1, 1), ++ x0 = svqincb (x1, 1)) ++ ++/* ++** qincb_n_2_s32: ++** sqincb x0, w0, all, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_n_2_s32, int32_t, ++ x0 = svqincb_n_s32 (x0, 
2), ++ x0 = svqincb (x0, 2)) ++ ++/* ++** qincb_n_7_s32: ++** sqincb x0, w0, all, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_n_7_s32, int32_t, ++ x0 = svqincb_n_s32 (x0, 7), ++ x0 = svqincb (x0, 7)) ++ ++/* ++** qincb_n_15_s32: ++** sqincb x0, w0, all, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_n_15_s32, int32_t, ++ x0 = svqincb_n_s32 (x0, 15), ++ x0 = svqincb (x0, 15)) ++ ++/* ++** qincb_n_16_s32: ++** sqincb x0, w0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_n_16_s32, int32_t, ++ x0 = svqincb_n_s32 (x0, 16), ++ x0 = svqincb (x0, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincb_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincb_s64.c +new file mode 100644 +index 000000000..b064c1264 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincb_s64.c +@@ -0,0 +1,58 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qincb_n_1_s64_tied: ++** sqincb x0 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_n_1_s64_tied, int64_t, ++ x0 = svqincb_n_s64 (x0, 1), ++ x0 = svqincb (x0, 1)) ++ ++/* ++** qincb_n_1_s64_untied: ++** mov x0, x1 ++** sqincb x0 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_n_1_s64_untied, int64_t, ++ x0 = svqincb_n_s64 (x1, 1), ++ x0 = svqincb (x1, 1)) ++ ++/* ++** qincb_n_2_s64: ++** sqincb x0, all, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_n_2_s64, int64_t, ++ x0 = svqincb_n_s64 (x0, 2), ++ x0 = svqincb (x0, 2)) ++ ++/* ++** qincb_n_7_s64: ++** sqincb x0, all, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_n_7_s64, int64_t, ++ x0 = svqincb_n_s64 (x0, 7), ++ x0 = svqincb (x0, 7)) ++ ++/* ++** qincb_n_15_s64: ++** sqincb x0, all, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_n_15_s64, int64_t, ++ x0 = svqincb_n_s64 (x0, 15), ++ x0 = svqincb (x0, 15)) ++ ++/* ++** qincb_n_16_s64: ++** sqincb x0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_n_16_s64, int64_t, ++ x0 = svqincb_n_s64 (x0, 16), ++ x0 = svqincb (x0, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincb_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincb_u32.c +new file mode 100644 +index 000000000..df3add73e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincb_u32.c +@@ -0,0 +1,58 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qincb_n_1_u32_tied: ++** uqincb w0 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_n_1_u32_tied, uint32_t, ++ x0 = svqincb_n_u32 (x0, 1), ++ x0 = svqincb (x0, 1)) ++ ++/* ++** qincb_n_1_u32_untied: ++** mov w0, w1 ++** uqincb w0 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_n_1_u32_untied, uint32_t, ++ x0 = svqincb_n_u32 (x1, 1), ++ x0 = svqincb (x1, 1)) ++ ++/* ++** qincb_n_2_u32: ++** uqincb w0, all, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_n_2_u32, uint32_t, ++ x0 = svqincb_n_u32 (x0, 2), ++ x0 = svqincb (x0, 2)) ++ ++/* ++** qincb_n_7_u32: ++** uqincb w0, all, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_n_7_u32, uint32_t, ++ x0 = svqincb_n_u32 (x0, 7), ++ x0 = svqincb (x0, 7)) ++ ++/* ++** qincb_n_15_u32: ++** uqincb w0, all, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_n_15_u32, uint32_t, ++ x0 = svqincb_n_u32 (x0, 15), ++ x0 = svqincb (x0, 15)) ++ ++/* ++** qincb_n_16_u32: ++** uqincb w0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_n_16_u32, uint32_t, ++ x0 = svqincb_n_u32 (x0, 16), ++ x0 = svqincb (x0, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincb_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincb_u64.c +new file mode 100644 +index 
000000000..d9a08c865 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincb_u64.c +@@ -0,0 +1,58 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qincb_n_1_u64_tied: ++** uqincb x0 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_n_1_u64_tied, uint64_t, ++ x0 = svqincb_n_u64 (x0, 1), ++ x0 = svqincb (x0, 1)) ++ ++/* ++** qincb_n_1_u64_untied: ++** mov x0, x1 ++** uqincb x0 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_n_1_u64_untied, uint64_t, ++ x0 = svqincb_n_u64 (x1, 1), ++ x0 = svqincb (x1, 1)) ++ ++/* ++** qincb_n_2_u64: ++** uqincb x0, all, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_n_2_u64, uint64_t, ++ x0 = svqincb_n_u64 (x0, 2), ++ x0 = svqincb (x0, 2)) ++ ++/* ++** qincb_n_7_u64: ++** uqincb x0, all, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_n_7_u64, uint64_t, ++ x0 = svqincb_n_u64 (x0, 7), ++ x0 = svqincb (x0, 7)) ++ ++/* ++** qincb_n_15_u64: ++** uqincb x0, all, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_n_15_u64, uint64_t, ++ x0 = svqincb_n_u64 (x0, 15), ++ x0 = svqincb (x0, 15)) ++ ++/* ++** qincb_n_16_u64: ++** uqincb x0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincb_n_16_u64, uint64_t, ++ x0 = svqincb_n_u64 (x0, 16), ++ x0 = svqincb (x0, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincd_pat_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincd_pat_s32.c +new file mode 100644 +index 000000000..061f88314 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincd_pat_s32.c +@@ -0,0 +1,202 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qincd_pat_n_1_s32_tied: ++** sqincd x0, w0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_1_s32_tied, int32_t, ++ x0 = svqincd_pat_n_s32 (x0, SV_POW2, 1), ++ x0 = svqincd_pat (x0, SV_POW2, 1)) ++ ++/* ++** qincd_pat_n_1_s32_untied: ++** mov w0, w1 ++** sqincd x0, w0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_1_s32_untied, int32_t, ++ x0 = svqincd_pat_n_s32 (x1, SV_POW2, 1), ++ x0 = svqincd_pat (x1, SV_POW2, 1)) ++ ++/* ++** qincd_pat_n_2_s32: ++** sqincd x0, w0, pow2, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_2_s32, int32_t, ++ x0 = svqincd_pat_n_s32 (x0, SV_POW2, 2), ++ x0 = svqincd_pat (x0, SV_POW2, 2)) ++ ++/* ++** qincd_pat_n_7_s32: ++** sqincd x0, w0, pow2, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_7_s32, int32_t, ++ x0 = svqincd_pat_n_s32 (x0, SV_POW2, 7), ++ x0 = svqincd_pat (x0, SV_POW2, 7)) ++ ++/* ++** qincd_pat_n_15_s32: ++** sqincd x0, w0, pow2, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_15_s32, int32_t, ++ x0 = svqincd_pat_n_s32 (x0, SV_POW2, 15), ++ x0 = svqincd_pat (x0, SV_POW2, 15)) ++ ++/* ++** qincd_pat_n_16_s32: ++** sqincd x0, w0, pow2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_16_s32, int32_t, ++ x0 = svqincd_pat_n_s32 (x0, SV_POW2, 16), ++ x0 = svqincd_pat (x0, SV_POW2, 16)) ++ ++/* ++** qincd_pat_n_vl1_s32: ++** sqincd x0, w0, vl1, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl1_s32, int32_t, ++ x0 = svqincd_pat_n_s32 (x0, SV_VL1, 16), ++ x0 = svqincd_pat (x0, SV_VL1, 16)) ++ ++/* ++** qincd_pat_n_vl2_s32: ++** sqincd x0, w0, vl2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl2_s32, int32_t, ++ x0 = svqincd_pat_n_s32 (x0, SV_VL2, 16), ++ x0 = svqincd_pat (x0, SV_VL2, 16)) ++ ++/* ++** qincd_pat_n_vl3_s32: ++** sqincd x0, w0, vl3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl3_s32, int32_t, ++ x0 = svqincd_pat_n_s32 (x0, SV_VL3, 16), ++ x0 = svqincd_pat (x0, 
SV_VL3, 16)) ++ ++/* ++** qincd_pat_n_vl4_s32: ++** sqincd x0, w0, vl4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl4_s32, int32_t, ++ x0 = svqincd_pat_n_s32 (x0, SV_VL4, 16), ++ x0 = svqincd_pat (x0, SV_VL4, 16)) ++ ++/* ++** qincd_pat_n_vl5_s32: ++** sqincd x0, w0, vl5, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl5_s32, int32_t, ++ x0 = svqincd_pat_n_s32 (x0, SV_VL5, 16), ++ x0 = svqincd_pat (x0, SV_VL5, 16)) ++ ++/* ++** qincd_pat_n_vl6_s32: ++** sqincd x0, w0, vl6, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl6_s32, int32_t, ++ x0 = svqincd_pat_n_s32 (x0, SV_VL6, 16), ++ x0 = svqincd_pat (x0, SV_VL6, 16)) ++ ++/* ++** qincd_pat_n_vl7_s32: ++** sqincd x0, w0, vl7, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl7_s32, int32_t, ++ x0 = svqincd_pat_n_s32 (x0, SV_VL7, 16), ++ x0 = svqincd_pat (x0, SV_VL7, 16)) ++ ++/* ++** qincd_pat_n_vl8_s32: ++** sqincd x0, w0, vl8, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl8_s32, int32_t, ++ x0 = svqincd_pat_n_s32 (x0, SV_VL8, 16), ++ x0 = svqincd_pat (x0, SV_VL8, 16)) ++ ++/* ++** qincd_pat_n_vl16_s32: ++** sqincd x0, w0, vl16, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl16_s32, int32_t, ++ x0 = svqincd_pat_n_s32 (x0, SV_VL16, 16), ++ x0 = svqincd_pat (x0, SV_VL16, 16)) ++ ++/* ++** qincd_pat_n_vl32_s32: ++** sqincd x0, w0, vl32, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl32_s32, int32_t, ++ x0 = svqincd_pat_n_s32 (x0, SV_VL32, 16), ++ x0 = svqincd_pat (x0, SV_VL32, 16)) ++ ++/* ++** qincd_pat_n_vl64_s32: ++** sqincd x0, w0, vl64, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl64_s32, int32_t, ++ x0 = svqincd_pat_n_s32 (x0, SV_VL64, 16), ++ x0 = svqincd_pat (x0, SV_VL64, 16)) ++ ++/* ++** qincd_pat_n_vl128_s32: ++** sqincd x0, w0, vl128, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl128_s32, int32_t, ++ x0 = svqincd_pat_n_s32 (x0, SV_VL128, 16), ++ x0 = svqincd_pat (x0, SV_VL128, 16)) ++ ++/* ++** qincd_pat_n_vl256_s32: ++** sqincd x0, w0, vl256, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl256_s32, int32_t, ++ x0 = svqincd_pat_n_s32 (x0, SV_VL256, 16), ++ x0 = svqincd_pat (x0, SV_VL256, 16)) ++ ++/* ++** qincd_pat_n_mul4_s32: ++** sqincd x0, w0, mul4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_mul4_s32, int32_t, ++ x0 = svqincd_pat_n_s32 (x0, SV_MUL4, 16), ++ x0 = svqincd_pat (x0, SV_MUL4, 16)) ++ ++/* ++** qincd_pat_n_mul3_s32: ++** sqincd x0, w0, mul3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_mul3_s32, int32_t, ++ x0 = svqincd_pat_n_s32 (x0, SV_MUL3, 16), ++ x0 = svqincd_pat (x0, SV_MUL3, 16)) ++ ++/* ++** qincd_pat_n_all_s32: ++** sqincd x0, w0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_all_s32, int32_t, ++ x0 = svqincd_pat_n_s32 (x0, SV_ALL, 16), ++ x0 = svqincd_pat (x0, SV_ALL, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincd_pat_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincd_pat_s64.c +new file mode 100644 +index 000000000..02b53e1bc +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincd_pat_s64.c +@@ -0,0 +1,401 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qincd_pat_1_s64_tied: ++** sqincd z0\.d, pow2 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_pat_1_s64_tied, svint64_t, ++ z0 = svqincd_pat_s64 (z0, SV_POW2, 1), ++ z0 = svqincd_pat (z0, SV_POW2, 1)) ++ ++/* ++** qincd_pat_1_s64_untied: ++** movprfx z0, z1 ++** sqincd z0\.d, pow2 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_pat_1_s64_untied, svint64_t, ++ z0 = 
svqincd_pat_s64 (z1, SV_POW2, 1), ++ z0 = svqincd_pat (z1, SV_POW2, 1)) ++ ++/* ++** qincd_pat_2_s64: ++** sqincd z0\.d, pow2, mul #2 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_pat_2_s64, svint64_t, ++ z0 = svqincd_pat_s64 (z0, SV_POW2, 2), ++ z0 = svqincd_pat (z0, SV_POW2, 2)) ++ ++/* ++** qincd_pat_7_s64: ++** sqincd z0\.d, pow2, mul #7 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_pat_7_s64, svint64_t, ++ z0 = svqincd_pat_s64 (z0, SV_POW2, 7), ++ z0 = svqincd_pat (z0, SV_POW2, 7)) ++ ++/* ++** qincd_pat_15_s64: ++** sqincd z0\.d, pow2, mul #15 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_pat_15_s64, svint64_t, ++ z0 = svqincd_pat_s64 (z0, SV_POW2, 15), ++ z0 = svqincd_pat (z0, SV_POW2, 15)) ++ ++/* ++** qincd_pat_16_s64: ++** sqincd z0\.d, pow2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_pat_16_s64, svint64_t, ++ z0 = svqincd_pat_s64 (z0, SV_POW2, 16), ++ z0 = svqincd_pat (z0, SV_POW2, 16)) ++ ++/* ++** qincd_pat_vl1_s64: ++** sqincd z0\.d, vl1, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_pat_vl1_s64, svint64_t, ++ z0 = svqincd_pat_s64 (z0, SV_VL1, 16), ++ z0 = svqincd_pat (z0, SV_VL1, 16)) ++ ++/* ++** qincd_pat_vl2_s64: ++** sqincd z0\.d, vl2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_pat_vl2_s64, svint64_t, ++ z0 = svqincd_pat_s64 (z0, SV_VL2, 16), ++ z0 = svqincd_pat (z0, SV_VL2, 16)) ++ ++/* ++** qincd_pat_vl3_s64: ++** sqincd z0\.d, vl3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_pat_vl3_s64, svint64_t, ++ z0 = svqincd_pat_s64 (z0, SV_VL3, 16), ++ z0 = svqincd_pat (z0, SV_VL3, 16)) ++ ++/* ++** qincd_pat_vl4_s64: ++** sqincd z0\.d, vl4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_pat_vl4_s64, svint64_t, ++ z0 = svqincd_pat_s64 (z0, SV_VL4, 16), ++ z0 = svqincd_pat (z0, SV_VL4, 16)) ++ ++/* ++** qincd_pat_vl5_s64: ++** sqincd z0\.d, vl5, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_pat_vl5_s64, svint64_t, ++ z0 = svqincd_pat_s64 (z0, SV_VL5, 16), ++ z0 = svqincd_pat (z0, SV_VL5, 16)) ++ ++/* ++** qincd_pat_vl6_s64: ++** sqincd z0\.d, vl6, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_pat_vl6_s64, svint64_t, ++ z0 = svqincd_pat_s64 (z0, SV_VL6, 16), ++ z0 = svqincd_pat (z0, SV_VL6, 16)) ++ ++/* ++** qincd_pat_vl7_s64: ++** sqincd z0\.d, vl7, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_pat_vl7_s64, svint64_t, ++ z0 = svqincd_pat_s64 (z0, SV_VL7, 16), ++ z0 = svqincd_pat (z0, SV_VL7, 16)) ++ ++/* ++** qincd_pat_vl8_s64: ++** sqincd z0\.d, vl8, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_pat_vl8_s64, svint64_t, ++ z0 = svqincd_pat_s64 (z0, SV_VL8, 16), ++ z0 = svqincd_pat (z0, SV_VL8, 16)) ++ ++/* ++** qincd_pat_vl16_s64: ++** sqincd z0\.d, vl16, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_pat_vl16_s64, svint64_t, ++ z0 = svqincd_pat_s64 (z0, SV_VL16, 16), ++ z0 = svqincd_pat (z0, SV_VL16, 16)) ++ ++/* ++** qincd_pat_vl32_s64: ++** sqincd z0\.d, vl32, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_pat_vl32_s64, svint64_t, ++ z0 = svqincd_pat_s64 (z0, SV_VL32, 16), ++ z0 = svqincd_pat (z0, SV_VL32, 16)) ++ ++/* ++** qincd_pat_vl64_s64: ++** sqincd z0\.d, vl64, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_pat_vl64_s64, svint64_t, ++ z0 = svqincd_pat_s64 (z0, SV_VL64, 16), ++ z0 = svqincd_pat (z0, SV_VL64, 16)) ++ ++/* ++** qincd_pat_vl128_s64: ++** sqincd z0\.d, vl128, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_pat_vl128_s64, svint64_t, ++ z0 = svqincd_pat_s64 (z0, SV_VL128, 16), ++ z0 = svqincd_pat (z0, SV_VL128, 16)) ++ ++/* ++** qincd_pat_vl256_s64: ++** sqincd z0\.d, vl256, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_pat_vl256_s64, svint64_t, ++ z0 = svqincd_pat_s64 (z0, 
SV_VL256, 16), ++ z0 = svqincd_pat (z0, SV_VL256, 16)) ++ ++/* ++** qincd_pat_mul4_s64: ++** sqincd z0\.d, mul4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_pat_mul4_s64, svint64_t, ++ z0 = svqincd_pat_s64 (z0, SV_MUL4, 16), ++ z0 = svqincd_pat (z0, SV_MUL4, 16)) ++ ++/* ++** qincd_pat_mul3_s64: ++** sqincd z0\.d, mul3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_pat_mul3_s64, svint64_t, ++ z0 = svqincd_pat_s64 (z0, SV_MUL3, 16), ++ z0 = svqincd_pat (z0, SV_MUL3, 16)) ++ ++/* ++** qincd_pat_all_s64: ++** sqincd z0\.d, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_pat_all_s64, svint64_t, ++ z0 = svqincd_pat_s64 (z0, SV_ALL, 16), ++ z0 = svqincd_pat (z0, SV_ALL, 16)) ++ ++/* ++** qincd_pat_n_1_s64_tied: ++** sqincd x0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_1_s64_tied, int64_t, ++ x0 = svqincd_pat_n_s64 (x0, SV_POW2, 1), ++ x0 = svqincd_pat (x0, SV_POW2, 1)) ++ ++/* ++** qincd_pat_n_1_s64_untied: ++** mov x0, x1 ++** sqincd x0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_1_s64_untied, int64_t, ++ x0 = svqincd_pat_n_s64 (x1, SV_POW2, 1), ++ x0 = svqincd_pat (x1, SV_POW2, 1)) ++ ++/* ++** qincd_pat_n_2_s64: ++** sqincd x0, pow2, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_2_s64, int64_t, ++ x0 = svqincd_pat_n_s64 (x0, SV_POW2, 2), ++ x0 = svqincd_pat (x0, SV_POW2, 2)) ++ ++/* ++** qincd_pat_n_7_s64: ++** sqincd x0, pow2, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_7_s64, int64_t, ++ x0 = svqincd_pat_n_s64 (x0, SV_POW2, 7), ++ x0 = svqincd_pat (x0, SV_POW2, 7)) ++ ++/* ++** qincd_pat_n_15_s64: ++** sqincd x0, pow2, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_15_s64, int64_t, ++ x0 = svqincd_pat_n_s64 (x0, SV_POW2, 15), ++ x0 = svqincd_pat (x0, SV_POW2, 15)) ++ ++/* ++** qincd_pat_n_16_s64: ++** sqincd x0, pow2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_16_s64, int64_t, ++ x0 = svqincd_pat_n_s64 (x0, SV_POW2, 16), ++ x0 = svqincd_pat (x0, SV_POW2, 16)) ++ ++/* ++** qincd_pat_n_vl1_s64: ++** sqincd x0, vl1, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl1_s64, int64_t, ++ x0 = svqincd_pat_n_s64 (x0, SV_VL1, 16), ++ x0 = svqincd_pat (x0, SV_VL1, 16)) ++ ++/* ++** qincd_pat_n_vl2_s64: ++** sqincd x0, vl2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl2_s64, int64_t, ++ x0 = svqincd_pat_n_s64 (x0, SV_VL2, 16), ++ x0 = svqincd_pat (x0, SV_VL2, 16)) ++ ++/* ++** qincd_pat_n_vl3_s64: ++** sqincd x0, vl3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl3_s64, int64_t, ++ x0 = svqincd_pat_n_s64 (x0, SV_VL3, 16), ++ x0 = svqincd_pat (x0, SV_VL3, 16)) ++ ++/* ++** qincd_pat_n_vl4_s64: ++** sqincd x0, vl4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl4_s64, int64_t, ++ x0 = svqincd_pat_n_s64 (x0, SV_VL4, 16), ++ x0 = svqincd_pat (x0, SV_VL4, 16)) ++ ++/* ++** qincd_pat_n_vl5_s64: ++** sqincd x0, vl5, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl5_s64, int64_t, ++ x0 = svqincd_pat_n_s64 (x0, SV_VL5, 16), ++ x0 = svqincd_pat (x0, SV_VL5, 16)) ++ ++/* ++** qincd_pat_n_vl6_s64: ++** sqincd x0, vl6, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl6_s64, int64_t, ++ x0 = svqincd_pat_n_s64 (x0, SV_VL6, 16), ++ x0 = svqincd_pat (x0, SV_VL6, 16)) ++ ++/* ++** qincd_pat_n_vl7_s64: ++** sqincd x0, vl7, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl7_s64, int64_t, ++ x0 = svqincd_pat_n_s64 (x0, SV_VL7, 16), ++ x0 = svqincd_pat (x0, SV_VL7, 16)) ++ ++/* ++** qincd_pat_n_vl8_s64: ++** sqincd x0, vl8, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl8_s64, int64_t, ++ x0 = svqincd_pat_n_s64 (x0, SV_VL8, 16), 
++ x0 = svqincd_pat (x0, SV_VL8, 16)) ++ ++/* ++** qincd_pat_n_vl16_s64: ++** sqincd x0, vl16, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl16_s64, int64_t, ++ x0 = svqincd_pat_n_s64 (x0, SV_VL16, 16), ++ x0 = svqincd_pat (x0, SV_VL16, 16)) ++ ++/* ++** qincd_pat_n_vl32_s64: ++** sqincd x0, vl32, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl32_s64, int64_t, ++ x0 = svqincd_pat_n_s64 (x0, SV_VL32, 16), ++ x0 = svqincd_pat (x0, SV_VL32, 16)) ++ ++/* ++** qincd_pat_n_vl64_s64: ++** sqincd x0, vl64, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl64_s64, int64_t, ++ x0 = svqincd_pat_n_s64 (x0, SV_VL64, 16), ++ x0 = svqincd_pat (x0, SV_VL64, 16)) ++ ++/* ++** qincd_pat_n_vl128_s64: ++** sqincd x0, vl128, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl128_s64, int64_t, ++ x0 = svqincd_pat_n_s64 (x0, SV_VL128, 16), ++ x0 = svqincd_pat (x0, SV_VL128, 16)) ++ ++/* ++** qincd_pat_n_vl256_s64: ++** sqincd x0, vl256, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl256_s64, int64_t, ++ x0 = svqincd_pat_n_s64 (x0, SV_VL256, 16), ++ x0 = svqincd_pat (x0, SV_VL256, 16)) ++ ++/* ++** qincd_pat_n_mul4_s64: ++** sqincd x0, mul4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_mul4_s64, int64_t, ++ x0 = svqincd_pat_n_s64 (x0, SV_MUL4, 16), ++ x0 = svqincd_pat (x0, SV_MUL4, 16)) ++ ++/* ++** qincd_pat_n_mul3_s64: ++** sqincd x0, mul3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_mul3_s64, int64_t, ++ x0 = svqincd_pat_n_s64 (x0, SV_MUL3, 16), ++ x0 = svqincd_pat (x0, SV_MUL3, 16)) ++ ++/* ++** qincd_pat_n_all_s64: ++** sqincd x0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_all_s64, int64_t, ++ x0 = svqincd_pat_n_s64 (x0, SV_ALL, 16), ++ x0 = svqincd_pat (x0, SV_ALL, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincd_pat_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincd_pat_u32.c +new file mode 100644 +index 000000000..0e3cbdb54 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincd_pat_u32.c +@@ -0,0 +1,202 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qincd_pat_n_1_u32_tied: ++** uqincd w0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_1_u32_tied, uint32_t, ++ x0 = svqincd_pat_n_u32 (x0, SV_POW2, 1), ++ x0 = svqincd_pat (x0, SV_POW2, 1)) ++ ++/* ++** qincd_pat_n_1_u32_untied: ++** mov w0, w1 ++** uqincd w0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_1_u32_untied, uint32_t, ++ x0 = svqincd_pat_n_u32 (x1, SV_POW2, 1), ++ x0 = svqincd_pat (x1, SV_POW2, 1)) ++ ++/* ++** qincd_pat_n_2_u32: ++** uqincd w0, pow2, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_2_u32, uint32_t, ++ x0 = svqincd_pat_n_u32 (x0, SV_POW2, 2), ++ x0 = svqincd_pat (x0, SV_POW2, 2)) ++ ++/* ++** qincd_pat_n_7_u32: ++** uqincd w0, pow2, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_7_u32, uint32_t, ++ x0 = svqincd_pat_n_u32 (x0, SV_POW2, 7), ++ x0 = svqincd_pat (x0, SV_POW2, 7)) ++ ++/* ++** qincd_pat_n_15_u32: ++** uqincd w0, pow2, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_15_u32, uint32_t, ++ x0 = svqincd_pat_n_u32 (x0, SV_POW2, 15), ++ x0 = svqincd_pat (x0, SV_POW2, 15)) ++ ++/* ++** qincd_pat_n_16_u32: ++** uqincd w0, pow2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_16_u32, uint32_t, ++ x0 = svqincd_pat_n_u32 (x0, SV_POW2, 16), ++ x0 = svqincd_pat (x0, SV_POW2, 16)) ++ ++/* ++** qincd_pat_n_vl1_u32: ++** uqincd w0, vl1, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl1_u32, uint32_t, ++ x0 = svqincd_pat_n_u32 (x0, 
SV_VL1, 16), ++ x0 = svqincd_pat (x0, SV_VL1, 16)) ++ ++/* ++** qincd_pat_n_vl2_u32: ++** uqincd w0, vl2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl2_u32, uint32_t, ++ x0 = svqincd_pat_n_u32 (x0, SV_VL2, 16), ++ x0 = svqincd_pat (x0, SV_VL2, 16)) ++ ++/* ++** qincd_pat_n_vl3_u32: ++** uqincd w0, vl3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl3_u32, uint32_t, ++ x0 = svqincd_pat_n_u32 (x0, SV_VL3, 16), ++ x0 = svqincd_pat (x0, SV_VL3, 16)) ++ ++/* ++** qincd_pat_n_vl4_u32: ++** uqincd w0, vl4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl4_u32, uint32_t, ++ x0 = svqincd_pat_n_u32 (x0, SV_VL4, 16), ++ x0 = svqincd_pat (x0, SV_VL4, 16)) ++ ++/* ++** qincd_pat_n_vl5_u32: ++** uqincd w0, vl5, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl5_u32, uint32_t, ++ x0 = svqincd_pat_n_u32 (x0, SV_VL5, 16), ++ x0 = svqincd_pat (x0, SV_VL5, 16)) ++ ++/* ++** qincd_pat_n_vl6_u32: ++** uqincd w0, vl6, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl6_u32, uint32_t, ++ x0 = svqincd_pat_n_u32 (x0, SV_VL6, 16), ++ x0 = svqincd_pat (x0, SV_VL6, 16)) ++ ++/* ++** qincd_pat_n_vl7_u32: ++** uqincd w0, vl7, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl7_u32, uint32_t, ++ x0 = svqincd_pat_n_u32 (x0, SV_VL7, 16), ++ x0 = svqincd_pat (x0, SV_VL7, 16)) ++ ++/* ++** qincd_pat_n_vl8_u32: ++** uqincd w0, vl8, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl8_u32, uint32_t, ++ x0 = svqincd_pat_n_u32 (x0, SV_VL8, 16), ++ x0 = svqincd_pat (x0, SV_VL8, 16)) ++ ++/* ++** qincd_pat_n_vl16_u32: ++** uqincd w0, vl16, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl16_u32, uint32_t, ++ x0 = svqincd_pat_n_u32 (x0, SV_VL16, 16), ++ x0 = svqincd_pat (x0, SV_VL16, 16)) ++ ++/* ++** qincd_pat_n_vl32_u32: ++** uqincd w0, vl32, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl32_u32, uint32_t, ++ x0 = svqincd_pat_n_u32 (x0, SV_VL32, 16), ++ x0 = svqincd_pat (x0, SV_VL32, 16)) ++ ++/* ++** qincd_pat_n_vl64_u32: ++** uqincd w0, vl64, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl64_u32, uint32_t, ++ x0 = svqincd_pat_n_u32 (x0, SV_VL64, 16), ++ x0 = svqincd_pat (x0, SV_VL64, 16)) ++ ++/* ++** qincd_pat_n_vl128_u32: ++** uqincd w0, vl128, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl128_u32, uint32_t, ++ x0 = svqincd_pat_n_u32 (x0, SV_VL128, 16), ++ x0 = svqincd_pat (x0, SV_VL128, 16)) ++ ++/* ++** qincd_pat_n_vl256_u32: ++** uqincd w0, vl256, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl256_u32, uint32_t, ++ x0 = svqincd_pat_n_u32 (x0, SV_VL256, 16), ++ x0 = svqincd_pat (x0, SV_VL256, 16)) ++ ++/* ++** qincd_pat_n_mul4_u32: ++** uqincd w0, mul4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_mul4_u32, uint32_t, ++ x0 = svqincd_pat_n_u32 (x0, SV_MUL4, 16), ++ x0 = svqincd_pat (x0, SV_MUL4, 16)) ++ ++/* ++** qincd_pat_n_mul3_u32: ++** uqincd w0, mul3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_mul3_u32, uint32_t, ++ x0 = svqincd_pat_n_u32 (x0, SV_MUL3, 16), ++ x0 = svqincd_pat (x0, SV_MUL3, 16)) ++ ++/* ++** qincd_pat_n_all_u32: ++** uqincd w0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_all_u32, uint32_t, ++ x0 = svqincd_pat_n_u32 (x0, SV_ALL, 16), ++ x0 = svqincd_pat (x0, SV_ALL, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincd_pat_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincd_pat_u64.c +new file mode 100644 +index 000000000..49dc350df +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincd_pat_u64.c +@@ -0,0 +1,401 @@ ++/* { dg-final { check-function-bodies "**" 
"" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qincd_pat_1_u64_tied: ++** uqincd z0\.d, pow2 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_pat_1_u64_tied, svuint64_t, ++ z0 = svqincd_pat_u64 (z0, SV_POW2, 1), ++ z0 = svqincd_pat (z0, SV_POW2, 1)) ++ ++/* ++** qincd_pat_1_u64_untied: ++** movprfx z0, z1 ++** uqincd z0\.d, pow2 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_pat_1_u64_untied, svuint64_t, ++ z0 = svqincd_pat_u64 (z1, SV_POW2, 1), ++ z0 = svqincd_pat (z1, SV_POW2, 1)) ++ ++/* ++** qincd_pat_2_u64: ++** uqincd z0\.d, pow2, mul #2 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_pat_2_u64, svuint64_t, ++ z0 = svqincd_pat_u64 (z0, SV_POW2, 2), ++ z0 = svqincd_pat (z0, SV_POW2, 2)) ++ ++/* ++** qincd_pat_7_u64: ++** uqincd z0\.d, pow2, mul #7 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_pat_7_u64, svuint64_t, ++ z0 = svqincd_pat_u64 (z0, SV_POW2, 7), ++ z0 = svqincd_pat (z0, SV_POW2, 7)) ++ ++/* ++** qincd_pat_15_u64: ++** uqincd z0\.d, pow2, mul #15 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_pat_15_u64, svuint64_t, ++ z0 = svqincd_pat_u64 (z0, SV_POW2, 15), ++ z0 = svqincd_pat (z0, SV_POW2, 15)) ++ ++/* ++** qincd_pat_16_u64: ++** uqincd z0\.d, pow2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_pat_16_u64, svuint64_t, ++ z0 = svqincd_pat_u64 (z0, SV_POW2, 16), ++ z0 = svqincd_pat (z0, SV_POW2, 16)) ++ ++/* ++** qincd_pat_vl1_u64: ++** uqincd z0\.d, vl1, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_pat_vl1_u64, svuint64_t, ++ z0 = svqincd_pat_u64 (z0, SV_VL1, 16), ++ z0 = svqincd_pat (z0, SV_VL1, 16)) ++ ++/* ++** qincd_pat_vl2_u64: ++** uqincd z0\.d, vl2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_pat_vl2_u64, svuint64_t, ++ z0 = svqincd_pat_u64 (z0, SV_VL2, 16), ++ z0 = svqincd_pat (z0, SV_VL2, 16)) ++ ++/* ++** qincd_pat_vl3_u64: ++** uqincd z0\.d, vl3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_pat_vl3_u64, svuint64_t, ++ z0 = svqincd_pat_u64 (z0, SV_VL3, 16), ++ z0 = svqincd_pat (z0, SV_VL3, 16)) ++ ++/* ++** qincd_pat_vl4_u64: ++** uqincd z0\.d, vl4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_pat_vl4_u64, svuint64_t, ++ z0 = svqincd_pat_u64 (z0, SV_VL4, 16), ++ z0 = svqincd_pat (z0, SV_VL4, 16)) ++ ++/* ++** qincd_pat_vl5_u64: ++** uqincd z0\.d, vl5, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_pat_vl5_u64, svuint64_t, ++ z0 = svqincd_pat_u64 (z0, SV_VL5, 16), ++ z0 = svqincd_pat (z0, SV_VL5, 16)) ++ ++/* ++** qincd_pat_vl6_u64: ++** uqincd z0\.d, vl6, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_pat_vl6_u64, svuint64_t, ++ z0 = svqincd_pat_u64 (z0, SV_VL6, 16), ++ z0 = svqincd_pat (z0, SV_VL6, 16)) ++ ++/* ++** qincd_pat_vl7_u64: ++** uqincd z0\.d, vl7, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_pat_vl7_u64, svuint64_t, ++ z0 = svqincd_pat_u64 (z0, SV_VL7, 16), ++ z0 = svqincd_pat (z0, SV_VL7, 16)) ++ ++/* ++** qincd_pat_vl8_u64: ++** uqincd z0\.d, vl8, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_pat_vl8_u64, svuint64_t, ++ z0 = svqincd_pat_u64 (z0, SV_VL8, 16), ++ z0 = svqincd_pat (z0, SV_VL8, 16)) ++ ++/* ++** qincd_pat_vl16_u64: ++** uqincd z0\.d, vl16, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_pat_vl16_u64, svuint64_t, ++ z0 = svqincd_pat_u64 (z0, SV_VL16, 16), ++ z0 = svqincd_pat (z0, SV_VL16, 16)) ++ ++/* ++** qincd_pat_vl32_u64: ++** uqincd z0\.d, vl32, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_pat_vl32_u64, svuint64_t, ++ z0 = svqincd_pat_u64 (z0, SV_VL32, 16), ++ z0 = svqincd_pat (z0, SV_VL32, 16)) ++ ++/* ++** qincd_pat_vl64_u64: ++** uqincd z0\.d, vl64, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_pat_vl64_u64, svuint64_t, ++ z0 = svqincd_pat_u64 (z0, 
SV_VL64, 16), ++ z0 = svqincd_pat (z0, SV_VL64, 16)) ++ ++/* ++** qincd_pat_vl128_u64: ++** uqincd z0\.d, vl128, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_pat_vl128_u64, svuint64_t, ++ z0 = svqincd_pat_u64 (z0, SV_VL128, 16), ++ z0 = svqincd_pat (z0, SV_VL128, 16)) ++ ++/* ++** qincd_pat_vl256_u64: ++** uqincd z0\.d, vl256, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_pat_vl256_u64, svuint64_t, ++ z0 = svqincd_pat_u64 (z0, SV_VL256, 16), ++ z0 = svqincd_pat (z0, SV_VL256, 16)) ++ ++/* ++** qincd_pat_mul4_u64: ++** uqincd z0\.d, mul4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_pat_mul4_u64, svuint64_t, ++ z0 = svqincd_pat_u64 (z0, SV_MUL4, 16), ++ z0 = svqincd_pat (z0, SV_MUL4, 16)) ++ ++/* ++** qincd_pat_mul3_u64: ++** uqincd z0\.d, mul3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_pat_mul3_u64, svuint64_t, ++ z0 = svqincd_pat_u64 (z0, SV_MUL3, 16), ++ z0 = svqincd_pat (z0, SV_MUL3, 16)) ++ ++/* ++** qincd_pat_all_u64: ++** uqincd z0\.d, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_pat_all_u64, svuint64_t, ++ z0 = svqincd_pat_u64 (z0, SV_ALL, 16), ++ z0 = svqincd_pat (z0, SV_ALL, 16)) ++ ++/* ++** qincd_pat_n_1_u64_tied: ++** uqincd x0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_1_u64_tied, uint64_t, ++ x0 = svqincd_pat_n_u64 (x0, SV_POW2, 1), ++ x0 = svqincd_pat (x0, SV_POW2, 1)) ++ ++/* ++** qincd_pat_n_1_u64_untied: ++** mov x0, x1 ++** uqincd x0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_1_u64_untied, uint64_t, ++ x0 = svqincd_pat_n_u64 (x1, SV_POW2, 1), ++ x0 = svqincd_pat (x1, SV_POW2, 1)) ++ ++/* ++** qincd_pat_n_2_u64: ++** uqincd x0, pow2, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_2_u64, uint64_t, ++ x0 = svqincd_pat_n_u64 (x0, SV_POW2, 2), ++ x0 = svqincd_pat (x0, SV_POW2, 2)) ++ ++/* ++** qincd_pat_n_7_u64: ++** uqincd x0, pow2, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_7_u64, uint64_t, ++ x0 = svqincd_pat_n_u64 (x0, SV_POW2, 7), ++ x0 = svqincd_pat (x0, SV_POW2, 7)) ++ ++/* ++** qincd_pat_n_15_u64: ++** uqincd x0, pow2, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_15_u64, uint64_t, ++ x0 = svqincd_pat_n_u64 (x0, SV_POW2, 15), ++ x0 = svqincd_pat (x0, SV_POW2, 15)) ++ ++/* ++** qincd_pat_n_16_u64: ++** uqincd x0, pow2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_16_u64, uint64_t, ++ x0 = svqincd_pat_n_u64 (x0, SV_POW2, 16), ++ x0 = svqincd_pat (x0, SV_POW2, 16)) ++ ++/* ++** qincd_pat_n_vl1_u64: ++** uqincd x0, vl1, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl1_u64, uint64_t, ++ x0 = svqincd_pat_n_u64 (x0, SV_VL1, 16), ++ x0 = svqincd_pat (x0, SV_VL1, 16)) ++ ++/* ++** qincd_pat_n_vl2_u64: ++** uqincd x0, vl2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl2_u64, uint64_t, ++ x0 = svqincd_pat_n_u64 (x0, SV_VL2, 16), ++ x0 = svqincd_pat (x0, SV_VL2, 16)) ++ ++/* ++** qincd_pat_n_vl3_u64: ++** uqincd x0, vl3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl3_u64, uint64_t, ++ x0 = svqincd_pat_n_u64 (x0, SV_VL3, 16), ++ x0 = svqincd_pat (x0, SV_VL3, 16)) ++ ++/* ++** qincd_pat_n_vl4_u64: ++** uqincd x0, vl4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl4_u64, uint64_t, ++ x0 = svqincd_pat_n_u64 (x0, SV_VL4, 16), ++ x0 = svqincd_pat (x0, SV_VL4, 16)) ++ ++/* ++** qincd_pat_n_vl5_u64: ++** uqincd x0, vl5, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl5_u64, uint64_t, ++ x0 = svqincd_pat_n_u64 (x0, SV_VL5, 16), ++ x0 = svqincd_pat (x0, SV_VL5, 16)) ++ ++/* ++** qincd_pat_n_vl6_u64: ++** uqincd x0, vl6, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl6_u64, uint64_t, ++ x0 = 
svqincd_pat_n_u64 (x0, SV_VL6, 16), ++ x0 = svqincd_pat (x0, SV_VL6, 16)) ++ ++/* ++** qincd_pat_n_vl7_u64: ++** uqincd x0, vl7, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl7_u64, uint64_t, ++ x0 = svqincd_pat_n_u64 (x0, SV_VL7, 16), ++ x0 = svqincd_pat (x0, SV_VL7, 16)) ++ ++/* ++** qincd_pat_n_vl8_u64: ++** uqincd x0, vl8, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl8_u64, uint64_t, ++ x0 = svqincd_pat_n_u64 (x0, SV_VL8, 16), ++ x0 = svqincd_pat (x0, SV_VL8, 16)) ++ ++/* ++** qincd_pat_n_vl16_u64: ++** uqincd x0, vl16, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl16_u64, uint64_t, ++ x0 = svqincd_pat_n_u64 (x0, SV_VL16, 16), ++ x0 = svqincd_pat (x0, SV_VL16, 16)) ++ ++/* ++** qincd_pat_n_vl32_u64: ++** uqincd x0, vl32, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl32_u64, uint64_t, ++ x0 = svqincd_pat_n_u64 (x0, SV_VL32, 16), ++ x0 = svqincd_pat (x0, SV_VL32, 16)) ++ ++/* ++** qincd_pat_n_vl64_u64: ++** uqincd x0, vl64, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl64_u64, uint64_t, ++ x0 = svqincd_pat_n_u64 (x0, SV_VL64, 16), ++ x0 = svqincd_pat (x0, SV_VL64, 16)) ++ ++/* ++** qincd_pat_n_vl128_u64: ++** uqincd x0, vl128, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl128_u64, uint64_t, ++ x0 = svqincd_pat_n_u64 (x0, SV_VL128, 16), ++ x0 = svqincd_pat (x0, SV_VL128, 16)) ++ ++/* ++** qincd_pat_n_vl256_u64: ++** uqincd x0, vl256, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_vl256_u64, uint64_t, ++ x0 = svqincd_pat_n_u64 (x0, SV_VL256, 16), ++ x0 = svqincd_pat (x0, SV_VL256, 16)) ++ ++/* ++** qincd_pat_n_mul4_u64: ++** uqincd x0, mul4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_mul4_u64, uint64_t, ++ x0 = svqincd_pat_n_u64 (x0, SV_MUL4, 16), ++ x0 = svqincd_pat (x0, SV_MUL4, 16)) ++ ++/* ++** qincd_pat_n_mul3_u64: ++** uqincd x0, mul3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_mul3_u64, uint64_t, ++ x0 = svqincd_pat_n_u64 (x0, SV_MUL3, 16), ++ x0 = svqincd_pat (x0, SV_MUL3, 16)) ++ ++/* ++** qincd_pat_n_all_u64: ++** uqincd x0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_pat_n_all_u64, uint64_t, ++ x0 = svqincd_pat_n_u64 (x0, SV_ALL, 16), ++ x0 = svqincd_pat (x0, SV_ALL, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincd_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincd_s32.c +new file mode 100644 +index 000000000..2fa0438a3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincd_s32.c +@@ -0,0 +1,58 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qincd_n_1_s32_tied: ++** sqincd x0, w0 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_n_1_s32_tied, int32_t, ++ x0 = svqincd_n_s32 (x0, 1), ++ x0 = svqincd (x0, 1)) ++ ++/* ++** qincd_n_1_s32_untied: ++** mov w0, w1 ++** sqincd x0, w0 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_n_1_s32_untied, int32_t, ++ x0 = svqincd_n_s32 (x1, 1), ++ x0 = svqincd (x1, 1)) ++ ++/* ++** qincd_n_2_s32: ++** sqincd x0, w0, all, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_n_2_s32, int32_t, ++ x0 = svqincd_n_s32 (x0, 2), ++ x0 = svqincd (x0, 2)) ++ ++/* ++** qincd_n_7_s32: ++** sqincd x0, w0, all, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_n_7_s32, int32_t, ++ x0 = svqincd_n_s32 (x0, 7), ++ x0 = svqincd (x0, 7)) ++ ++/* ++** qincd_n_15_s32: ++** sqincd x0, w0, all, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_n_15_s32, int32_t, ++ x0 = svqincd_n_s32 (x0, 15), ++ x0 = svqincd (x0, 15)) ++ ++/* ++** qincd_n_16_s32: ++** sqincd x0, w0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S 
(qincd_n_16_s32, int32_t, ++ x0 = svqincd_n_s32 (x0, 16), ++ x0 = svqincd (x0, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincd_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincd_s64.c +new file mode 100644 +index 000000000..0920ac2ec +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincd_s64.c +@@ -0,0 +1,113 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qincd_1_s64_tied: ++** sqincd z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_1_s64_tied, svint64_t, ++ z0 = svqincd_s64 (z0, 1), ++ z0 = svqincd (z0, 1)) ++ ++/* ++** qincd_1_s64_untied: ++** movprfx z0, z1 ++** sqincd z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_1_s64_untied, svint64_t, ++ z0 = svqincd_s64 (z1, 1), ++ z0 = svqincd (z1, 1)) ++ ++/* ++** qincd_2_s64: ++** sqincd z0\.d, all, mul #2 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_2_s64, svint64_t, ++ z0 = svqincd_s64 (z0, 2), ++ z0 = svqincd (z0, 2)) ++ ++/* ++** qincd_7_s64: ++** sqincd z0\.d, all, mul #7 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_7_s64, svint64_t, ++ z0 = svqincd_s64 (z0, 7), ++ z0 = svqincd (z0, 7)) ++ ++/* ++** qincd_15_s64: ++** sqincd z0\.d, all, mul #15 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_15_s64, svint64_t, ++ z0 = svqincd_s64 (z0, 15), ++ z0 = svqincd (z0, 15)) ++ ++/* ++** qincd_16_s64: ++** sqincd z0\.d, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_16_s64, svint64_t, ++ z0 = svqincd_s64 (z0, 16), ++ z0 = svqincd (z0, 16)) ++ ++/* ++** qincd_n_1_s64_tied: ++** sqincd x0 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_n_1_s64_tied, int64_t, ++ x0 = svqincd_n_s64 (x0, 1), ++ x0 = svqincd (x0, 1)) ++ ++/* ++** qincd_n_1_s64_untied: ++** mov x0, x1 ++** sqincd x0 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_n_1_s64_untied, int64_t, ++ x0 = svqincd_n_s64 (x1, 1), ++ x0 = svqincd (x1, 1)) ++ ++/* ++** qincd_n_2_s64: ++** sqincd x0, all, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_n_2_s64, int64_t, ++ x0 = svqincd_n_s64 (x0, 2), ++ x0 = svqincd (x0, 2)) ++ ++/* ++** qincd_n_7_s64: ++** sqincd x0, all, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_n_7_s64, int64_t, ++ x0 = svqincd_n_s64 (x0, 7), ++ x0 = svqincd (x0, 7)) ++ ++/* ++** qincd_n_15_s64: ++** sqincd x0, all, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_n_15_s64, int64_t, ++ x0 = svqincd_n_s64 (x0, 15), ++ x0 = svqincd (x0, 15)) ++ ++/* ++** qincd_n_16_s64: ++** sqincd x0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_n_16_s64, int64_t, ++ x0 = svqincd_n_s64 (x0, 16), ++ x0 = svqincd (x0, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincd_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincd_u32.c +new file mode 100644 +index 000000000..33dc12cb1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincd_u32.c +@@ -0,0 +1,58 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qincd_n_1_u32_tied: ++** uqincd w0 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_n_1_u32_tied, uint32_t, ++ x0 = svqincd_n_u32 (x0, 1), ++ x0 = svqincd (x0, 1)) ++ ++/* ++** qincd_n_1_u32_untied: ++** mov w0, w1 ++** uqincd w0 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_n_1_u32_untied, uint32_t, ++ x0 = svqincd_n_u32 (x1, 1), ++ x0 = svqincd (x1, 1)) ++ ++/* ++** qincd_n_2_u32: ++** uqincd w0, all, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_n_2_u32, uint32_t, ++ x0 = svqincd_n_u32 (x0, 2), ++ x0 = svqincd (x0, 2)) ++ ++/* ++** qincd_n_7_u32: ++** uqincd w0, all, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_n_7_u32, uint32_t, ++ x0 = 
svqincd_n_u32 (x0, 7), ++ x0 = svqincd (x0, 7)) ++ ++/* ++** qincd_n_15_u32: ++** uqincd w0, all, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_n_15_u32, uint32_t, ++ x0 = svqincd_n_u32 (x0, 15), ++ x0 = svqincd (x0, 15)) ++ ++/* ++** qincd_n_16_u32: ++** uqincd w0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_n_16_u32, uint32_t, ++ x0 = svqincd_n_u32 (x0, 16), ++ x0 = svqincd (x0, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincd_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincd_u64.c +new file mode 100644 +index 000000000..28c611a8f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincd_u64.c +@@ -0,0 +1,113 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qincd_1_u64_tied: ++** uqincd z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_1_u64_tied, svuint64_t, ++ z0 = svqincd_u64 (z0, 1), ++ z0 = svqincd (z0, 1)) ++ ++/* ++** qincd_1_u64_untied: ++** movprfx z0, z1 ++** uqincd z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_1_u64_untied, svuint64_t, ++ z0 = svqincd_u64 (z1, 1), ++ z0 = svqincd (z1, 1)) ++ ++/* ++** qincd_2_u64: ++** uqincd z0\.d, all, mul #2 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_2_u64, svuint64_t, ++ z0 = svqincd_u64 (z0, 2), ++ z0 = svqincd (z0, 2)) ++ ++/* ++** qincd_7_u64: ++** uqincd z0\.d, all, mul #7 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_7_u64, svuint64_t, ++ z0 = svqincd_u64 (z0, 7), ++ z0 = svqincd (z0, 7)) ++ ++/* ++** qincd_15_u64: ++** uqincd z0\.d, all, mul #15 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_15_u64, svuint64_t, ++ z0 = svqincd_u64 (z0, 15), ++ z0 = svqincd (z0, 15)) ++ ++/* ++** qincd_16_u64: ++** uqincd z0\.d, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincd_16_u64, svuint64_t, ++ z0 = svqincd_u64 (z0, 16), ++ z0 = svqincd (z0, 16)) ++ ++/* ++** qincd_n_1_u64_tied: ++** uqincd x0 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_n_1_u64_tied, uint64_t, ++ x0 = svqincd_n_u64 (x0, 1), ++ x0 = svqincd (x0, 1)) ++ ++/* ++** qincd_n_1_u64_untied: ++** mov x0, x1 ++** uqincd x0 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_n_1_u64_untied, uint64_t, ++ x0 = svqincd_n_u64 (x1, 1), ++ x0 = svqincd (x1, 1)) ++ ++/* ++** qincd_n_2_u64: ++** uqincd x0, all, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_n_2_u64, uint64_t, ++ x0 = svqincd_n_u64 (x0, 2), ++ x0 = svqincd (x0, 2)) ++ ++/* ++** qincd_n_7_u64: ++** uqincd x0, all, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_n_7_u64, uint64_t, ++ x0 = svqincd_n_u64 (x0, 7), ++ x0 = svqincd (x0, 7)) ++ ++/* ++** qincd_n_15_u64: ++** uqincd x0, all, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_n_15_u64, uint64_t, ++ x0 = svqincd_n_u64 (x0, 15), ++ x0 = svqincd (x0, 15)) ++ ++/* ++** qincd_n_16_u64: ++** uqincd x0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincd_n_16_u64, uint64_t, ++ x0 = svqincd_n_u64 (x0, 16), ++ x0 = svqincd (x0, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_pat_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_pat_s16.c +new file mode 100644 +index 000000000..708d635c5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_pat_s16.c +@@ -0,0 +1,202 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qinch_pat_1_s16_tied: ++** sqinch z0\.h, pow2 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_pat_1_s16_tied, svint16_t, ++ z0 = svqinch_pat_s16 (z0, SV_POW2, 1), ++ z0 = svqinch_pat (z0, SV_POW2, 1)) ++ ++/* ++** qinch_pat_1_s16_untied: ++** movprfx z0, z1 ++** sqinch z0\.h, pow2 ++** ret ++*/ 
++TEST_UNIFORM_Z (qinch_pat_1_s16_untied, svint16_t, ++ z0 = svqinch_pat_s16 (z1, SV_POW2, 1), ++ z0 = svqinch_pat (z1, SV_POW2, 1)) ++ ++/* ++** qinch_pat_2_s16: ++** sqinch z0\.h, pow2, mul #2 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_pat_2_s16, svint16_t, ++ z0 = svqinch_pat_s16 (z0, SV_POW2, 2), ++ z0 = svqinch_pat (z0, SV_POW2, 2)) ++ ++/* ++** qinch_pat_7_s16: ++** sqinch z0\.h, pow2, mul #7 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_pat_7_s16, svint16_t, ++ z0 = svqinch_pat_s16 (z0, SV_POW2, 7), ++ z0 = svqinch_pat (z0, SV_POW2, 7)) ++ ++/* ++** qinch_pat_15_s16: ++** sqinch z0\.h, pow2, mul #15 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_pat_15_s16, svint16_t, ++ z0 = svqinch_pat_s16 (z0, SV_POW2, 15), ++ z0 = svqinch_pat (z0, SV_POW2, 15)) ++ ++/* ++** qinch_pat_16_s16: ++** sqinch z0\.h, pow2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_pat_16_s16, svint16_t, ++ z0 = svqinch_pat_s16 (z0, SV_POW2, 16), ++ z0 = svqinch_pat (z0, SV_POW2, 16)) ++ ++/* ++** qinch_pat_vl1_s16: ++** sqinch z0\.h, vl1, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_pat_vl1_s16, svint16_t, ++ z0 = svqinch_pat_s16 (z0, SV_VL1, 16), ++ z0 = svqinch_pat (z0, SV_VL1, 16)) ++ ++/* ++** qinch_pat_vl2_s16: ++** sqinch z0\.h, vl2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_pat_vl2_s16, svint16_t, ++ z0 = svqinch_pat_s16 (z0, SV_VL2, 16), ++ z0 = svqinch_pat (z0, SV_VL2, 16)) ++ ++/* ++** qinch_pat_vl3_s16: ++** sqinch z0\.h, vl3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_pat_vl3_s16, svint16_t, ++ z0 = svqinch_pat_s16 (z0, SV_VL3, 16), ++ z0 = svqinch_pat (z0, SV_VL3, 16)) ++ ++/* ++** qinch_pat_vl4_s16: ++** sqinch z0\.h, vl4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_pat_vl4_s16, svint16_t, ++ z0 = svqinch_pat_s16 (z0, SV_VL4, 16), ++ z0 = svqinch_pat (z0, SV_VL4, 16)) ++ ++/* ++** qinch_pat_vl5_s16: ++** sqinch z0\.h, vl5, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_pat_vl5_s16, svint16_t, ++ z0 = svqinch_pat_s16 (z0, SV_VL5, 16), ++ z0 = svqinch_pat (z0, SV_VL5, 16)) ++ ++/* ++** qinch_pat_vl6_s16: ++** sqinch z0\.h, vl6, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_pat_vl6_s16, svint16_t, ++ z0 = svqinch_pat_s16 (z0, SV_VL6, 16), ++ z0 = svqinch_pat (z0, SV_VL6, 16)) ++ ++/* ++** qinch_pat_vl7_s16: ++** sqinch z0\.h, vl7, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_pat_vl7_s16, svint16_t, ++ z0 = svqinch_pat_s16 (z0, SV_VL7, 16), ++ z0 = svqinch_pat (z0, SV_VL7, 16)) ++ ++/* ++** qinch_pat_vl8_s16: ++** sqinch z0\.h, vl8, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_pat_vl8_s16, svint16_t, ++ z0 = svqinch_pat_s16 (z0, SV_VL8, 16), ++ z0 = svqinch_pat (z0, SV_VL8, 16)) ++ ++/* ++** qinch_pat_vl16_s16: ++** sqinch z0\.h, vl16, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_pat_vl16_s16, svint16_t, ++ z0 = svqinch_pat_s16 (z0, SV_VL16, 16), ++ z0 = svqinch_pat (z0, SV_VL16, 16)) ++ ++/* ++** qinch_pat_vl32_s16: ++** sqinch z0\.h, vl32, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_pat_vl32_s16, svint16_t, ++ z0 = svqinch_pat_s16 (z0, SV_VL32, 16), ++ z0 = svqinch_pat (z0, SV_VL32, 16)) ++ ++/* ++** qinch_pat_vl64_s16: ++** sqinch z0\.h, vl64, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_pat_vl64_s16, svint16_t, ++ z0 = svqinch_pat_s16 (z0, SV_VL64, 16), ++ z0 = svqinch_pat (z0, SV_VL64, 16)) ++ ++/* ++** qinch_pat_vl128_s16: ++** sqinch z0\.h, vl128, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_pat_vl128_s16, svint16_t, ++ z0 = svqinch_pat_s16 (z0, SV_VL128, 16), ++ z0 = svqinch_pat (z0, SV_VL128, 16)) ++ ++/* ++** qinch_pat_vl256_s16: ++** sqinch z0\.h, vl256, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z 
(qinch_pat_vl256_s16, svint16_t, ++ z0 = svqinch_pat_s16 (z0, SV_VL256, 16), ++ z0 = svqinch_pat (z0, SV_VL256, 16)) ++ ++/* ++** qinch_pat_mul4_s16: ++** sqinch z0\.h, mul4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_pat_mul4_s16, svint16_t, ++ z0 = svqinch_pat_s16 (z0, SV_MUL4, 16), ++ z0 = svqinch_pat (z0, SV_MUL4, 16)) ++ ++/* ++** qinch_pat_mul3_s16: ++** sqinch z0\.h, mul3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_pat_mul3_s16, svint16_t, ++ z0 = svqinch_pat_s16 (z0, SV_MUL3, 16), ++ z0 = svqinch_pat (z0, SV_MUL3, 16)) ++ ++/* ++** qinch_pat_all_s16: ++** sqinch z0\.h, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_pat_all_s16, svint16_t, ++ z0 = svqinch_pat_s16 (z0, SV_ALL, 16), ++ z0 = svqinch_pat (z0, SV_ALL, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_pat_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_pat_s32.c +new file mode 100644 +index 000000000..7c91c6202 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_pat_s32.c +@@ -0,0 +1,202 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qinch_pat_n_1_s32_tied: ++** sqinch x0, w0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_1_s32_tied, int32_t, ++ x0 = svqinch_pat_n_s32 (x0, SV_POW2, 1), ++ x0 = svqinch_pat (x0, SV_POW2, 1)) ++ ++/* ++** qinch_pat_n_1_s32_untied: ++** mov w0, w1 ++** sqinch x0, w0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_1_s32_untied, int32_t, ++ x0 = svqinch_pat_n_s32 (x1, SV_POW2, 1), ++ x0 = svqinch_pat (x1, SV_POW2, 1)) ++ ++/* ++** qinch_pat_n_2_s32: ++** sqinch x0, w0, pow2, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_2_s32, int32_t, ++ x0 = svqinch_pat_n_s32 (x0, SV_POW2, 2), ++ x0 = svqinch_pat (x0, SV_POW2, 2)) ++ ++/* ++** qinch_pat_n_7_s32: ++** sqinch x0, w0, pow2, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_7_s32, int32_t, ++ x0 = svqinch_pat_n_s32 (x0, SV_POW2, 7), ++ x0 = svqinch_pat (x0, SV_POW2, 7)) ++ ++/* ++** qinch_pat_n_15_s32: ++** sqinch x0, w0, pow2, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_15_s32, int32_t, ++ x0 = svqinch_pat_n_s32 (x0, SV_POW2, 15), ++ x0 = svqinch_pat (x0, SV_POW2, 15)) ++ ++/* ++** qinch_pat_n_16_s32: ++** sqinch x0, w0, pow2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_16_s32, int32_t, ++ x0 = svqinch_pat_n_s32 (x0, SV_POW2, 16), ++ x0 = svqinch_pat (x0, SV_POW2, 16)) ++ ++/* ++** qinch_pat_n_vl1_s32: ++** sqinch x0, w0, vl1, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl1_s32, int32_t, ++ x0 = svqinch_pat_n_s32 (x0, SV_VL1, 16), ++ x0 = svqinch_pat (x0, SV_VL1, 16)) ++ ++/* ++** qinch_pat_n_vl2_s32: ++** sqinch x0, w0, vl2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl2_s32, int32_t, ++ x0 = svqinch_pat_n_s32 (x0, SV_VL2, 16), ++ x0 = svqinch_pat (x0, SV_VL2, 16)) ++ ++/* ++** qinch_pat_n_vl3_s32: ++** sqinch x0, w0, vl3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl3_s32, int32_t, ++ x0 = svqinch_pat_n_s32 (x0, SV_VL3, 16), ++ x0 = svqinch_pat (x0, SV_VL3, 16)) ++ ++/* ++** qinch_pat_n_vl4_s32: ++** sqinch x0, w0, vl4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl4_s32, int32_t, ++ x0 = svqinch_pat_n_s32 (x0, SV_VL4, 16), ++ x0 = svqinch_pat (x0, SV_VL4, 16)) ++ ++/* ++** qinch_pat_n_vl5_s32: ++** sqinch x0, w0, vl5, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl5_s32, int32_t, ++ x0 = svqinch_pat_n_s32 (x0, SV_VL5, 16), ++ x0 = svqinch_pat (x0, SV_VL5, 16)) ++ ++/* ++** qinch_pat_n_vl6_s32: ++** sqinch x0, w0, vl6, mul #16 ++** ret ++*/ 
++TEST_UNIFORM_S (qinch_pat_n_vl6_s32, int32_t, ++ x0 = svqinch_pat_n_s32 (x0, SV_VL6, 16), ++ x0 = svqinch_pat (x0, SV_VL6, 16)) ++ ++/* ++** qinch_pat_n_vl7_s32: ++** sqinch x0, w0, vl7, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl7_s32, int32_t, ++ x0 = svqinch_pat_n_s32 (x0, SV_VL7, 16), ++ x0 = svqinch_pat (x0, SV_VL7, 16)) ++ ++/* ++** qinch_pat_n_vl8_s32: ++** sqinch x0, w0, vl8, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl8_s32, int32_t, ++ x0 = svqinch_pat_n_s32 (x0, SV_VL8, 16), ++ x0 = svqinch_pat (x0, SV_VL8, 16)) ++ ++/* ++** qinch_pat_n_vl16_s32: ++** sqinch x0, w0, vl16, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl16_s32, int32_t, ++ x0 = svqinch_pat_n_s32 (x0, SV_VL16, 16), ++ x0 = svqinch_pat (x0, SV_VL16, 16)) ++ ++/* ++** qinch_pat_n_vl32_s32: ++** sqinch x0, w0, vl32, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl32_s32, int32_t, ++ x0 = svqinch_pat_n_s32 (x0, SV_VL32, 16), ++ x0 = svqinch_pat (x0, SV_VL32, 16)) ++ ++/* ++** qinch_pat_n_vl64_s32: ++** sqinch x0, w0, vl64, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl64_s32, int32_t, ++ x0 = svqinch_pat_n_s32 (x0, SV_VL64, 16), ++ x0 = svqinch_pat (x0, SV_VL64, 16)) ++ ++/* ++** qinch_pat_n_vl128_s32: ++** sqinch x0, w0, vl128, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl128_s32, int32_t, ++ x0 = svqinch_pat_n_s32 (x0, SV_VL128, 16), ++ x0 = svqinch_pat (x0, SV_VL128, 16)) ++ ++/* ++** qinch_pat_n_vl256_s32: ++** sqinch x0, w0, vl256, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl256_s32, int32_t, ++ x0 = svqinch_pat_n_s32 (x0, SV_VL256, 16), ++ x0 = svqinch_pat (x0, SV_VL256, 16)) ++ ++/* ++** qinch_pat_n_mul4_s32: ++** sqinch x0, w0, mul4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_mul4_s32, int32_t, ++ x0 = svqinch_pat_n_s32 (x0, SV_MUL4, 16), ++ x0 = svqinch_pat (x0, SV_MUL4, 16)) ++ ++/* ++** qinch_pat_n_mul3_s32: ++** sqinch x0, w0, mul3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_mul3_s32, int32_t, ++ x0 = svqinch_pat_n_s32 (x0, SV_MUL3, 16), ++ x0 = svqinch_pat (x0, SV_MUL3, 16)) ++ ++/* ++** qinch_pat_n_all_s32: ++** sqinch x0, w0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_all_s32, int32_t, ++ x0 = svqinch_pat_n_s32 (x0, SV_ALL, 16), ++ x0 = svqinch_pat (x0, SV_ALL, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_pat_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_pat_s64.c +new file mode 100644 +index 000000000..2cde6482f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_pat_s64.c +@@ -0,0 +1,202 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qinch_pat_n_1_s64_tied: ++** sqinch x0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_1_s64_tied, int64_t, ++ x0 = svqinch_pat_n_s64 (x0, SV_POW2, 1), ++ x0 = svqinch_pat (x0, SV_POW2, 1)) ++ ++/* ++** qinch_pat_n_1_s64_untied: ++** mov x0, x1 ++** sqinch x0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_1_s64_untied, int64_t, ++ x0 = svqinch_pat_n_s64 (x1, SV_POW2, 1), ++ x0 = svqinch_pat (x1, SV_POW2, 1)) ++ ++/* ++** qinch_pat_n_2_s64: ++** sqinch x0, pow2, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_2_s64, int64_t, ++ x0 = svqinch_pat_n_s64 (x0, SV_POW2, 2), ++ x0 = svqinch_pat (x0, SV_POW2, 2)) ++ ++/* ++** qinch_pat_n_7_s64: ++** sqinch x0, pow2, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_7_s64, int64_t, ++ x0 = svqinch_pat_n_s64 (x0, SV_POW2, 7), ++ x0 = svqinch_pat (x0, SV_POW2, 7)) ++ ++/* ++** qinch_pat_n_15_s64: ++** 
sqinch x0, pow2, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_15_s64, int64_t, ++ x0 = svqinch_pat_n_s64 (x0, SV_POW2, 15), ++ x0 = svqinch_pat (x0, SV_POW2, 15)) ++ ++/* ++** qinch_pat_n_16_s64: ++** sqinch x0, pow2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_16_s64, int64_t, ++ x0 = svqinch_pat_n_s64 (x0, SV_POW2, 16), ++ x0 = svqinch_pat (x0, SV_POW2, 16)) ++ ++/* ++** qinch_pat_n_vl1_s64: ++** sqinch x0, vl1, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl1_s64, int64_t, ++ x0 = svqinch_pat_n_s64 (x0, SV_VL1, 16), ++ x0 = svqinch_pat (x0, SV_VL1, 16)) ++ ++/* ++** qinch_pat_n_vl2_s64: ++** sqinch x0, vl2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl2_s64, int64_t, ++ x0 = svqinch_pat_n_s64 (x0, SV_VL2, 16), ++ x0 = svqinch_pat (x0, SV_VL2, 16)) ++ ++/* ++** qinch_pat_n_vl3_s64: ++** sqinch x0, vl3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl3_s64, int64_t, ++ x0 = svqinch_pat_n_s64 (x0, SV_VL3, 16), ++ x0 = svqinch_pat (x0, SV_VL3, 16)) ++ ++/* ++** qinch_pat_n_vl4_s64: ++** sqinch x0, vl4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl4_s64, int64_t, ++ x0 = svqinch_pat_n_s64 (x0, SV_VL4, 16), ++ x0 = svqinch_pat (x0, SV_VL4, 16)) ++ ++/* ++** qinch_pat_n_vl5_s64: ++** sqinch x0, vl5, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl5_s64, int64_t, ++ x0 = svqinch_pat_n_s64 (x0, SV_VL5, 16), ++ x0 = svqinch_pat (x0, SV_VL5, 16)) ++ ++/* ++** qinch_pat_n_vl6_s64: ++** sqinch x0, vl6, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl6_s64, int64_t, ++ x0 = svqinch_pat_n_s64 (x0, SV_VL6, 16), ++ x0 = svqinch_pat (x0, SV_VL6, 16)) ++ ++/* ++** qinch_pat_n_vl7_s64: ++** sqinch x0, vl7, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl7_s64, int64_t, ++ x0 = svqinch_pat_n_s64 (x0, SV_VL7, 16), ++ x0 = svqinch_pat (x0, SV_VL7, 16)) ++ ++/* ++** qinch_pat_n_vl8_s64: ++** sqinch x0, vl8, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl8_s64, int64_t, ++ x0 = svqinch_pat_n_s64 (x0, SV_VL8, 16), ++ x0 = svqinch_pat (x0, SV_VL8, 16)) ++ ++/* ++** qinch_pat_n_vl16_s64: ++** sqinch x0, vl16, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl16_s64, int64_t, ++ x0 = svqinch_pat_n_s64 (x0, SV_VL16, 16), ++ x0 = svqinch_pat (x0, SV_VL16, 16)) ++ ++/* ++** qinch_pat_n_vl32_s64: ++** sqinch x0, vl32, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl32_s64, int64_t, ++ x0 = svqinch_pat_n_s64 (x0, SV_VL32, 16), ++ x0 = svqinch_pat (x0, SV_VL32, 16)) ++ ++/* ++** qinch_pat_n_vl64_s64: ++** sqinch x0, vl64, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl64_s64, int64_t, ++ x0 = svqinch_pat_n_s64 (x0, SV_VL64, 16), ++ x0 = svqinch_pat (x0, SV_VL64, 16)) ++ ++/* ++** qinch_pat_n_vl128_s64: ++** sqinch x0, vl128, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl128_s64, int64_t, ++ x0 = svqinch_pat_n_s64 (x0, SV_VL128, 16), ++ x0 = svqinch_pat (x0, SV_VL128, 16)) ++ ++/* ++** qinch_pat_n_vl256_s64: ++** sqinch x0, vl256, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl256_s64, int64_t, ++ x0 = svqinch_pat_n_s64 (x0, SV_VL256, 16), ++ x0 = svqinch_pat (x0, SV_VL256, 16)) ++ ++/* ++** qinch_pat_n_mul4_s64: ++** sqinch x0, mul4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_mul4_s64, int64_t, ++ x0 = svqinch_pat_n_s64 (x0, SV_MUL4, 16), ++ x0 = svqinch_pat (x0, SV_MUL4, 16)) ++ ++/* ++** qinch_pat_n_mul3_s64: ++** sqinch x0, mul3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_mul3_s64, int64_t, ++ x0 = svqinch_pat_n_s64 (x0, SV_MUL3, 16), ++ x0 = svqinch_pat (x0, SV_MUL3, 16)) ++ ++/* ++** 
qinch_pat_n_all_s64: ++** sqinch x0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_all_s64, int64_t, ++ x0 = svqinch_pat_n_s64 (x0, SV_ALL, 16), ++ x0 = svqinch_pat (x0, SV_ALL, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_pat_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_pat_u16.c +new file mode 100644 +index 000000000..5a1a846a0 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_pat_u16.c +@@ -0,0 +1,202 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qinch_pat_1_u16_tied: ++** uqinch z0\.h, pow2 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_pat_1_u16_tied, svuint16_t, ++ z0 = svqinch_pat_u16 (z0, SV_POW2, 1), ++ z0 = svqinch_pat (z0, SV_POW2, 1)) ++ ++/* ++** qinch_pat_1_u16_untied: ++** movprfx z0, z1 ++** uqinch z0\.h, pow2 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_pat_1_u16_untied, svuint16_t, ++ z0 = svqinch_pat_u16 (z1, SV_POW2, 1), ++ z0 = svqinch_pat (z1, SV_POW2, 1)) ++ ++/* ++** qinch_pat_2_u16: ++** uqinch z0\.h, pow2, mul #2 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_pat_2_u16, svuint16_t, ++ z0 = svqinch_pat_u16 (z0, SV_POW2, 2), ++ z0 = svqinch_pat (z0, SV_POW2, 2)) ++ ++/* ++** qinch_pat_7_u16: ++** uqinch z0\.h, pow2, mul #7 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_pat_7_u16, svuint16_t, ++ z0 = svqinch_pat_u16 (z0, SV_POW2, 7), ++ z0 = svqinch_pat (z0, SV_POW2, 7)) ++ ++/* ++** qinch_pat_15_u16: ++** uqinch z0\.h, pow2, mul #15 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_pat_15_u16, svuint16_t, ++ z0 = svqinch_pat_u16 (z0, SV_POW2, 15), ++ z0 = svqinch_pat (z0, SV_POW2, 15)) ++ ++/* ++** qinch_pat_16_u16: ++** uqinch z0\.h, pow2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_pat_16_u16, svuint16_t, ++ z0 = svqinch_pat_u16 (z0, SV_POW2, 16), ++ z0 = svqinch_pat (z0, SV_POW2, 16)) ++ ++/* ++** qinch_pat_vl1_u16: ++** uqinch z0\.h, vl1, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_pat_vl1_u16, svuint16_t, ++ z0 = svqinch_pat_u16 (z0, SV_VL1, 16), ++ z0 = svqinch_pat (z0, SV_VL1, 16)) ++ ++/* ++** qinch_pat_vl2_u16: ++** uqinch z0\.h, vl2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_pat_vl2_u16, svuint16_t, ++ z0 = svqinch_pat_u16 (z0, SV_VL2, 16), ++ z0 = svqinch_pat (z0, SV_VL2, 16)) ++ ++/* ++** qinch_pat_vl3_u16: ++** uqinch z0\.h, vl3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_pat_vl3_u16, svuint16_t, ++ z0 = svqinch_pat_u16 (z0, SV_VL3, 16), ++ z0 = svqinch_pat (z0, SV_VL3, 16)) ++ ++/* ++** qinch_pat_vl4_u16: ++** uqinch z0\.h, vl4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_pat_vl4_u16, svuint16_t, ++ z0 = svqinch_pat_u16 (z0, SV_VL4, 16), ++ z0 = svqinch_pat (z0, SV_VL4, 16)) ++ ++/* ++** qinch_pat_vl5_u16: ++** uqinch z0\.h, vl5, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_pat_vl5_u16, svuint16_t, ++ z0 = svqinch_pat_u16 (z0, SV_VL5, 16), ++ z0 = svqinch_pat (z0, SV_VL5, 16)) ++ ++/* ++** qinch_pat_vl6_u16: ++** uqinch z0\.h, vl6, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_pat_vl6_u16, svuint16_t, ++ z0 = svqinch_pat_u16 (z0, SV_VL6, 16), ++ z0 = svqinch_pat (z0, SV_VL6, 16)) ++ ++/* ++** qinch_pat_vl7_u16: ++** uqinch z0\.h, vl7, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_pat_vl7_u16, svuint16_t, ++ z0 = svqinch_pat_u16 (z0, SV_VL7, 16), ++ z0 = svqinch_pat (z0, SV_VL7, 16)) ++ ++/* ++** qinch_pat_vl8_u16: ++** uqinch z0\.h, vl8, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_pat_vl8_u16, svuint16_t, ++ z0 = svqinch_pat_u16 (z0, SV_VL8, 16), ++ z0 = svqinch_pat (z0, SV_VL8, 16)) ++ ++/* ++** qinch_pat_vl16_u16: ++** uqinch z0\.h, 
vl16, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_pat_vl16_u16, svuint16_t, ++ z0 = svqinch_pat_u16 (z0, SV_VL16, 16), ++ z0 = svqinch_pat (z0, SV_VL16, 16)) ++ ++/* ++** qinch_pat_vl32_u16: ++** uqinch z0\.h, vl32, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_pat_vl32_u16, svuint16_t, ++ z0 = svqinch_pat_u16 (z0, SV_VL32, 16), ++ z0 = svqinch_pat (z0, SV_VL32, 16)) ++ ++/* ++** qinch_pat_vl64_u16: ++** uqinch z0\.h, vl64, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_pat_vl64_u16, svuint16_t, ++ z0 = svqinch_pat_u16 (z0, SV_VL64, 16), ++ z0 = svqinch_pat (z0, SV_VL64, 16)) ++ ++/* ++** qinch_pat_vl128_u16: ++** uqinch z0\.h, vl128, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_pat_vl128_u16, svuint16_t, ++ z0 = svqinch_pat_u16 (z0, SV_VL128, 16), ++ z0 = svqinch_pat (z0, SV_VL128, 16)) ++ ++/* ++** qinch_pat_vl256_u16: ++** uqinch z0\.h, vl256, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_pat_vl256_u16, svuint16_t, ++ z0 = svqinch_pat_u16 (z0, SV_VL256, 16), ++ z0 = svqinch_pat (z0, SV_VL256, 16)) ++ ++/* ++** qinch_pat_mul4_u16: ++** uqinch z0\.h, mul4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_pat_mul4_u16, svuint16_t, ++ z0 = svqinch_pat_u16 (z0, SV_MUL4, 16), ++ z0 = svqinch_pat (z0, SV_MUL4, 16)) ++ ++/* ++** qinch_pat_mul3_u16: ++** uqinch z0\.h, mul3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_pat_mul3_u16, svuint16_t, ++ z0 = svqinch_pat_u16 (z0, SV_MUL3, 16), ++ z0 = svqinch_pat (z0, SV_MUL3, 16)) ++ ++/* ++** qinch_pat_all_u16: ++** uqinch z0\.h, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_pat_all_u16, svuint16_t, ++ z0 = svqinch_pat_u16 (z0, SV_ALL, 16), ++ z0 = svqinch_pat (z0, SV_ALL, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_pat_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_pat_u32.c +new file mode 100644 +index 000000000..8398c5689 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_pat_u32.c +@@ -0,0 +1,202 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qinch_pat_n_1_u32_tied: ++** uqinch w0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_1_u32_tied, uint32_t, ++ x0 = svqinch_pat_n_u32 (x0, SV_POW2, 1), ++ x0 = svqinch_pat (x0, SV_POW2, 1)) ++ ++/* ++** qinch_pat_n_1_u32_untied: ++** mov w0, w1 ++** uqinch w0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_1_u32_untied, uint32_t, ++ x0 = svqinch_pat_n_u32 (x1, SV_POW2, 1), ++ x0 = svqinch_pat (x1, SV_POW2, 1)) ++ ++/* ++** qinch_pat_n_2_u32: ++** uqinch w0, pow2, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_2_u32, uint32_t, ++ x0 = svqinch_pat_n_u32 (x0, SV_POW2, 2), ++ x0 = svqinch_pat (x0, SV_POW2, 2)) ++ ++/* ++** qinch_pat_n_7_u32: ++** uqinch w0, pow2, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_7_u32, uint32_t, ++ x0 = svqinch_pat_n_u32 (x0, SV_POW2, 7), ++ x0 = svqinch_pat (x0, SV_POW2, 7)) ++ ++/* ++** qinch_pat_n_15_u32: ++** uqinch w0, pow2, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_15_u32, uint32_t, ++ x0 = svqinch_pat_n_u32 (x0, SV_POW2, 15), ++ x0 = svqinch_pat (x0, SV_POW2, 15)) ++ ++/* ++** qinch_pat_n_16_u32: ++** uqinch w0, pow2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_16_u32, uint32_t, ++ x0 = svqinch_pat_n_u32 (x0, SV_POW2, 16), ++ x0 = svqinch_pat (x0, SV_POW2, 16)) ++ ++/* ++** qinch_pat_n_vl1_u32: ++** uqinch w0, vl1, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl1_u32, uint32_t, ++ x0 = svqinch_pat_n_u32 (x0, SV_VL1, 16), ++ x0 = svqinch_pat (x0, SV_VL1, 16)) ++ ++/* ++** qinch_pat_n_vl2_u32: ++** uqinch 
w0, vl2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl2_u32, uint32_t, ++ x0 = svqinch_pat_n_u32 (x0, SV_VL2, 16), ++ x0 = svqinch_pat (x0, SV_VL2, 16)) ++ ++/* ++** qinch_pat_n_vl3_u32: ++** uqinch w0, vl3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl3_u32, uint32_t, ++ x0 = svqinch_pat_n_u32 (x0, SV_VL3, 16), ++ x0 = svqinch_pat (x0, SV_VL3, 16)) ++ ++/* ++** qinch_pat_n_vl4_u32: ++** uqinch w0, vl4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl4_u32, uint32_t, ++ x0 = svqinch_pat_n_u32 (x0, SV_VL4, 16), ++ x0 = svqinch_pat (x0, SV_VL4, 16)) ++ ++/* ++** qinch_pat_n_vl5_u32: ++** uqinch w0, vl5, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl5_u32, uint32_t, ++ x0 = svqinch_pat_n_u32 (x0, SV_VL5, 16), ++ x0 = svqinch_pat (x0, SV_VL5, 16)) ++ ++/* ++** qinch_pat_n_vl6_u32: ++** uqinch w0, vl6, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl6_u32, uint32_t, ++ x0 = svqinch_pat_n_u32 (x0, SV_VL6, 16), ++ x0 = svqinch_pat (x0, SV_VL6, 16)) ++ ++/* ++** qinch_pat_n_vl7_u32: ++** uqinch w0, vl7, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl7_u32, uint32_t, ++ x0 = svqinch_pat_n_u32 (x0, SV_VL7, 16), ++ x0 = svqinch_pat (x0, SV_VL7, 16)) ++ ++/* ++** qinch_pat_n_vl8_u32: ++** uqinch w0, vl8, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl8_u32, uint32_t, ++ x0 = svqinch_pat_n_u32 (x0, SV_VL8, 16), ++ x0 = svqinch_pat (x0, SV_VL8, 16)) ++ ++/* ++** qinch_pat_n_vl16_u32: ++** uqinch w0, vl16, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl16_u32, uint32_t, ++ x0 = svqinch_pat_n_u32 (x0, SV_VL16, 16), ++ x0 = svqinch_pat (x0, SV_VL16, 16)) ++ ++/* ++** qinch_pat_n_vl32_u32: ++** uqinch w0, vl32, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl32_u32, uint32_t, ++ x0 = svqinch_pat_n_u32 (x0, SV_VL32, 16), ++ x0 = svqinch_pat (x0, SV_VL32, 16)) ++ ++/* ++** qinch_pat_n_vl64_u32: ++** uqinch w0, vl64, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl64_u32, uint32_t, ++ x0 = svqinch_pat_n_u32 (x0, SV_VL64, 16), ++ x0 = svqinch_pat (x0, SV_VL64, 16)) ++ ++/* ++** qinch_pat_n_vl128_u32: ++** uqinch w0, vl128, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl128_u32, uint32_t, ++ x0 = svqinch_pat_n_u32 (x0, SV_VL128, 16), ++ x0 = svqinch_pat (x0, SV_VL128, 16)) ++ ++/* ++** qinch_pat_n_vl256_u32: ++** uqinch w0, vl256, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl256_u32, uint32_t, ++ x0 = svqinch_pat_n_u32 (x0, SV_VL256, 16), ++ x0 = svqinch_pat (x0, SV_VL256, 16)) ++ ++/* ++** qinch_pat_n_mul4_u32: ++** uqinch w0, mul4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_mul4_u32, uint32_t, ++ x0 = svqinch_pat_n_u32 (x0, SV_MUL4, 16), ++ x0 = svqinch_pat (x0, SV_MUL4, 16)) ++ ++/* ++** qinch_pat_n_mul3_u32: ++** uqinch w0, mul3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_mul3_u32, uint32_t, ++ x0 = svqinch_pat_n_u32 (x0, SV_MUL3, 16), ++ x0 = svqinch_pat (x0, SV_MUL3, 16)) ++ ++/* ++** qinch_pat_n_all_u32: ++** uqinch w0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_all_u32, uint32_t, ++ x0 = svqinch_pat_n_u32 (x0, SV_ALL, 16), ++ x0 = svqinch_pat (x0, SV_ALL, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_pat_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_pat_u64.c +new file mode 100644 +index 000000000..51722646d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_pat_u64.c +@@ -0,0 +1,202 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qinch_pat_n_1_u64_tied: ++** 
uqinch x0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_1_u64_tied, uint64_t, ++ x0 = svqinch_pat_n_u64 (x0, SV_POW2, 1), ++ x0 = svqinch_pat (x0, SV_POW2, 1)) ++ ++/* ++** qinch_pat_n_1_u64_untied: ++** mov x0, x1 ++** uqinch x0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_1_u64_untied, uint64_t, ++ x0 = svqinch_pat_n_u64 (x1, SV_POW2, 1), ++ x0 = svqinch_pat (x1, SV_POW2, 1)) ++ ++/* ++** qinch_pat_n_2_u64: ++** uqinch x0, pow2, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_2_u64, uint64_t, ++ x0 = svqinch_pat_n_u64 (x0, SV_POW2, 2), ++ x0 = svqinch_pat (x0, SV_POW2, 2)) ++ ++/* ++** qinch_pat_n_7_u64: ++** uqinch x0, pow2, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_7_u64, uint64_t, ++ x0 = svqinch_pat_n_u64 (x0, SV_POW2, 7), ++ x0 = svqinch_pat (x0, SV_POW2, 7)) ++ ++/* ++** qinch_pat_n_15_u64: ++** uqinch x0, pow2, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_15_u64, uint64_t, ++ x0 = svqinch_pat_n_u64 (x0, SV_POW2, 15), ++ x0 = svqinch_pat (x0, SV_POW2, 15)) ++ ++/* ++** qinch_pat_n_16_u64: ++** uqinch x0, pow2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_16_u64, uint64_t, ++ x0 = svqinch_pat_n_u64 (x0, SV_POW2, 16), ++ x0 = svqinch_pat (x0, SV_POW2, 16)) ++ ++/* ++** qinch_pat_n_vl1_u64: ++** uqinch x0, vl1, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl1_u64, uint64_t, ++ x0 = svqinch_pat_n_u64 (x0, SV_VL1, 16), ++ x0 = svqinch_pat (x0, SV_VL1, 16)) ++ ++/* ++** qinch_pat_n_vl2_u64: ++** uqinch x0, vl2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl2_u64, uint64_t, ++ x0 = svqinch_pat_n_u64 (x0, SV_VL2, 16), ++ x0 = svqinch_pat (x0, SV_VL2, 16)) ++ ++/* ++** qinch_pat_n_vl3_u64: ++** uqinch x0, vl3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl3_u64, uint64_t, ++ x0 = svqinch_pat_n_u64 (x0, SV_VL3, 16), ++ x0 = svqinch_pat (x0, SV_VL3, 16)) ++ ++/* ++** qinch_pat_n_vl4_u64: ++** uqinch x0, vl4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl4_u64, uint64_t, ++ x0 = svqinch_pat_n_u64 (x0, SV_VL4, 16), ++ x0 = svqinch_pat (x0, SV_VL4, 16)) ++ ++/* ++** qinch_pat_n_vl5_u64: ++** uqinch x0, vl5, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl5_u64, uint64_t, ++ x0 = svqinch_pat_n_u64 (x0, SV_VL5, 16), ++ x0 = svqinch_pat (x0, SV_VL5, 16)) ++ ++/* ++** qinch_pat_n_vl6_u64: ++** uqinch x0, vl6, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl6_u64, uint64_t, ++ x0 = svqinch_pat_n_u64 (x0, SV_VL6, 16), ++ x0 = svqinch_pat (x0, SV_VL6, 16)) ++ ++/* ++** qinch_pat_n_vl7_u64: ++** uqinch x0, vl7, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl7_u64, uint64_t, ++ x0 = svqinch_pat_n_u64 (x0, SV_VL7, 16), ++ x0 = svqinch_pat (x0, SV_VL7, 16)) ++ ++/* ++** qinch_pat_n_vl8_u64: ++** uqinch x0, vl8, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl8_u64, uint64_t, ++ x0 = svqinch_pat_n_u64 (x0, SV_VL8, 16), ++ x0 = svqinch_pat (x0, SV_VL8, 16)) ++ ++/* ++** qinch_pat_n_vl16_u64: ++** uqinch x0, vl16, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl16_u64, uint64_t, ++ x0 = svqinch_pat_n_u64 (x0, SV_VL16, 16), ++ x0 = svqinch_pat (x0, SV_VL16, 16)) ++ ++/* ++** qinch_pat_n_vl32_u64: ++** uqinch x0, vl32, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl32_u64, uint64_t, ++ x0 = svqinch_pat_n_u64 (x0, SV_VL32, 16), ++ x0 = svqinch_pat (x0, SV_VL32, 16)) ++ ++/* ++** qinch_pat_n_vl64_u64: ++** uqinch x0, vl64, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl64_u64, uint64_t, ++ x0 = svqinch_pat_n_u64 (x0, SV_VL64, 16), ++ x0 = svqinch_pat (x0, SV_VL64, 16)) ++ ++/* ++** qinch_pat_n_vl128_u64: 
++** uqinch x0, vl128, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl128_u64, uint64_t, ++ x0 = svqinch_pat_n_u64 (x0, SV_VL128, 16), ++ x0 = svqinch_pat (x0, SV_VL128, 16)) ++ ++/* ++** qinch_pat_n_vl256_u64: ++** uqinch x0, vl256, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_vl256_u64, uint64_t, ++ x0 = svqinch_pat_n_u64 (x0, SV_VL256, 16), ++ x0 = svqinch_pat (x0, SV_VL256, 16)) ++ ++/* ++** qinch_pat_n_mul4_u64: ++** uqinch x0, mul4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_mul4_u64, uint64_t, ++ x0 = svqinch_pat_n_u64 (x0, SV_MUL4, 16), ++ x0 = svqinch_pat (x0, SV_MUL4, 16)) ++ ++/* ++** qinch_pat_n_mul3_u64: ++** uqinch x0, mul3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_mul3_u64, uint64_t, ++ x0 = svqinch_pat_n_u64 (x0, SV_MUL3, 16), ++ x0 = svqinch_pat (x0, SV_MUL3, 16)) ++ ++/* ++** qinch_pat_n_all_u64: ++** uqinch x0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_pat_n_all_u64, uint64_t, ++ x0 = svqinch_pat_n_u64 (x0, SV_ALL, 16), ++ x0 = svqinch_pat (x0, SV_ALL, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_s16.c +new file mode 100644 +index 000000000..1f460db8e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_s16.c +@@ -0,0 +1,58 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qinch_1_s16_tied: ++** sqinch z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_1_s16_tied, svint16_t, ++ z0 = svqinch_s16 (z0, 1), ++ z0 = svqinch (z0, 1)) ++ ++/* ++** qinch_1_s16_untied: ++** movprfx z0, z1 ++** sqinch z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_1_s16_untied, svint16_t, ++ z0 = svqinch_s16 (z1, 1), ++ z0 = svqinch (z1, 1)) ++ ++/* ++** qinch_2_s16: ++** sqinch z0\.h, all, mul #2 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_2_s16, svint16_t, ++ z0 = svqinch_s16 (z0, 2), ++ z0 = svqinch (z0, 2)) ++ ++/* ++** qinch_7_s16: ++** sqinch z0\.h, all, mul #7 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_7_s16, svint16_t, ++ z0 = svqinch_s16 (z0, 7), ++ z0 = svqinch (z0, 7)) ++ ++/* ++** qinch_15_s16: ++** sqinch z0\.h, all, mul #15 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_15_s16, svint16_t, ++ z0 = svqinch_s16 (z0, 15), ++ z0 = svqinch (z0, 15)) ++ ++/* ++** qinch_16_s16: ++** sqinch z0\.h, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_16_s16, svint16_t, ++ z0 = svqinch_s16 (z0, 16), ++ z0 = svqinch (z0, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_s32.c +new file mode 100644 +index 000000000..a7b1aac80 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_s32.c +@@ -0,0 +1,58 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qinch_n_1_s32_tied: ++** sqinch x0, w0 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_n_1_s32_tied, int32_t, ++ x0 = svqinch_n_s32 (x0, 1), ++ x0 = svqinch (x0, 1)) ++ ++/* ++** qinch_n_1_s32_untied: ++** mov w0, w1 ++** sqinch x0, w0 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_n_1_s32_untied, int32_t, ++ x0 = svqinch_n_s32 (x1, 1), ++ x0 = svqinch (x1, 1)) ++ ++/* ++** qinch_n_2_s32: ++** sqinch x0, w0, all, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_n_2_s32, int32_t, ++ x0 = svqinch_n_s32 (x0, 2), ++ x0 = svqinch (x0, 2)) ++ ++/* ++** qinch_n_7_s32: ++** sqinch x0, w0, all, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_n_7_s32, int32_t, ++ x0 = svqinch_n_s32 (x0, 7), ++ x0 = svqinch (x0, 7)) ++ ++/* ++** 
qinch_n_15_s32: ++** sqinch x0, w0, all, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_n_15_s32, int32_t, ++ x0 = svqinch_n_s32 (x0, 15), ++ x0 = svqinch (x0, 15)) ++ ++/* ++** qinch_n_16_s32: ++** sqinch x0, w0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_n_16_s32, int32_t, ++ x0 = svqinch_n_s32 (x0, 16), ++ x0 = svqinch (x0, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_s64.c +new file mode 100644 +index 000000000..74ac6a3df +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_s64.c +@@ -0,0 +1,58 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qinch_n_1_s64_tied: ++** sqinch x0 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_n_1_s64_tied, int64_t, ++ x0 = svqinch_n_s64 (x0, 1), ++ x0 = svqinch (x0, 1)) ++ ++/* ++** qinch_n_1_s64_untied: ++** mov x0, x1 ++** sqinch x0 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_n_1_s64_untied, int64_t, ++ x0 = svqinch_n_s64 (x1, 1), ++ x0 = svqinch (x1, 1)) ++ ++/* ++** qinch_n_2_s64: ++** sqinch x0, all, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_n_2_s64, int64_t, ++ x0 = svqinch_n_s64 (x0, 2), ++ x0 = svqinch (x0, 2)) ++ ++/* ++** qinch_n_7_s64: ++** sqinch x0, all, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_n_7_s64, int64_t, ++ x0 = svqinch_n_s64 (x0, 7), ++ x0 = svqinch (x0, 7)) ++ ++/* ++** qinch_n_15_s64: ++** sqinch x0, all, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_n_15_s64, int64_t, ++ x0 = svqinch_n_s64 (x0, 15), ++ x0 = svqinch (x0, 15)) ++ ++/* ++** qinch_n_16_s64: ++** sqinch x0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_n_16_s64, int64_t, ++ x0 = svqinch_n_s64 (x0, 16), ++ x0 = svqinch (x0, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_u16.c +new file mode 100644 +index 000000000..aa9905897 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_u16.c +@@ -0,0 +1,58 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qinch_1_u16_tied: ++** uqinch z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_1_u16_tied, svuint16_t, ++ z0 = svqinch_u16 (z0, 1), ++ z0 = svqinch (z0, 1)) ++ ++/* ++** qinch_1_u16_untied: ++** movprfx z0, z1 ++** uqinch z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_1_u16_untied, svuint16_t, ++ z0 = svqinch_u16 (z1, 1), ++ z0 = svqinch (z1, 1)) ++ ++/* ++** qinch_2_u16: ++** uqinch z0\.h, all, mul #2 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_2_u16, svuint16_t, ++ z0 = svqinch_u16 (z0, 2), ++ z0 = svqinch (z0, 2)) ++ ++/* ++** qinch_7_u16: ++** uqinch z0\.h, all, mul #7 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_7_u16, svuint16_t, ++ z0 = svqinch_u16 (z0, 7), ++ z0 = svqinch (z0, 7)) ++ ++/* ++** qinch_15_u16: ++** uqinch z0\.h, all, mul #15 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_15_u16, svuint16_t, ++ z0 = svqinch_u16 (z0, 15), ++ z0 = svqinch (z0, 15)) ++ ++/* ++** qinch_16_u16: ++** uqinch z0\.h, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qinch_16_u16, svuint16_t, ++ z0 = svqinch_u16 (z0, 16), ++ z0 = svqinch (z0, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_u32.c +new file mode 100644 +index 000000000..396f95b2a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_u32.c +@@ -0,0 +1,58 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ 
++/* ++** qinch_n_1_u32_tied: ++** uqinch w0 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_n_1_u32_tied, uint32_t, ++ x0 = svqinch_n_u32 (x0, 1), ++ x0 = svqinch (x0, 1)) ++ ++/* ++** qinch_n_1_u32_untied: ++** mov w0, w1 ++** uqinch w0 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_n_1_u32_untied, uint32_t, ++ x0 = svqinch_n_u32 (x1, 1), ++ x0 = svqinch (x1, 1)) ++ ++/* ++** qinch_n_2_u32: ++** uqinch w0, all, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_n_2_u32, uint32_t, ++ x0 = svqinch_n_u32 (x0, 2), ++ x0 = svqinch (x0, 2)) ++ ++/* ++** qinch_n_7_u32: ++** uqinch w0, all, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_n_7_u32, uint32_t, ++ x0 = svqinch_n_u32 (x0, 7), ++ x0 = svqinch (x0, 7)) ++ ++/* ++** qinch_n_15_u32: ++** uqinch w0, all, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_n_15_u32, uint32_t, ++ x0 = svqinch_n_u32 (x0, 15), ++ x0 = svqinch (x0, 15)) ++ ++/* ++** qinch_n_16_u32: ++** uqinch w0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_n_16_u32, uint32_t, ++ x0 = svqinch_n_u32 (x0, 16), ++ x0 = svqinch (x0, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_u64.c +new file mode 100644 +index 000000000..5a9231722 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_u64.c +@@ -0,0 +1,58 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qinch_n_1_u64_tied: ++** uqinch x0 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_n_1_u64_tied, uint64_t, ++ x0 = svqinch_n_u64 (x0, 1), ++ x0 = svqinch (x0, 1)) ++ ++/* ++** qinch_n_1_u64_untied: ++** mov x0, x1 ++** uqinch x0 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_n_1_u64_untied, uint64_t, ++ x0 = svqinch_n_u64 (x1, 1), ++ x0 = svqinch (x1, 1)) ++ ++/* ++** qinch_n_2_u64: ++** uqinch x0, all, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_n_2_u64, uint64_t, ++ x0 = svqinch_n_u64 (x0, 2), ++ x0 = svqinch (x0, 2)) ++ ++/* ++** qinch_n_7_u64: ++** uqinch x0, all, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_n_7_u64, uint64_t, ++ x0 = svqinch_n_u64 (x0, 7), ++ x0 = svqinch (x0, 7)) ++ ++/* ++** qinch_n_15_u64: ++** uqinch x0, all, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_n_15_u64, uint64_t, ++ x0 = svqinch_n_u64 (x0, 15), ++ x0 = svqinch (x0, 15)) ++ ++/* ++** qinch_n_16_u64: ++** uqinch x0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qinch_n_16_u64, uint64_t, ++ x0 = svqinch_n_u64 (x0, 16), ++ x0 = svqinch (x0, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincp_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincp_s16.c +new file mode 100644 +index 000000000..979b57476 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincp_s16.c +@@ -0,0 +1,22 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qincp_s16_tied: ++** sqincp z0\.h, p0 ++** ret ++*/ ++TEST_UNIFORM_Z (qincp_s16_tied, svint16_t, ++ z0 = svqincp_s16 (z0, p0), ++ z0 = svqincp (z0, p0)) ++ ++/* ++** qincp_s16_untied: ++** movprfx z0, z1 ++** sqincp z0\.h, p0 ++** ret ++*/ ++TEST_UNIFORM_Z (qincp_s16_untied, svint16_t, ++ z0 = svqincp_s16 (z1, p0), ++ z0 = svqincp (z1, p0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincp_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincp_s32.c +new file mode 100644 +index 000000000..46ad51b01 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincp_s32.c +@@ -0,0 +1,98 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include 
"test_sve_acle.h" ++ ++/* ++** qincp_s32_tied: ++** sqincp z0\.s, p0 ++** ret ++*/ ++TEST_UNIFORM_Z (qincp_s32_tied, svint32_t, ++ z0 = svqincp_s32 (z0, p0), ++ z0 = svqincp (z0, p0)) ++ ++/* ++** qincp_s32_untied: ++** movprfx z0, z1 ++** sqincp z0\.s, p0 ++** ret ++*/ ++TEST_UNIFORM_Z (qincp_s32_untied, svint32_t, ++ z0 = svqincp_s32 (z1, p0), ++ z0 = svqincp (z1, p0)) ++ ++/* ++** qincp_n_s32_b8_tied: ++** sqincp x0, p0\.b, w0 ++** ret ++*/ ++TEST_UNIFORM_S (qincp_n_s32_b8_tied, int32_t, ++ x0 = svqincp_n_s32_b8 (x0, p0), ++ x0 = svqincp_b8 (x0, p0)) ++ ++/* ++** qincp_n_s32_b8_untied: ++** mov w0, w1 ++** sqincp x0, p0\.b, w0 ++** ret ++*/ ++TEST_UNIFORM_S (qincp_n_s32_b8_untied, int32_t, ++ x0 = svqincp_n_s32_b8 (x1, p0), ++ x0 = svqincp_b8 (x1, p0)) ++ ++/* ++** qincp_n_s32_b16_tied: ++** sqincp x0, p0\.h, w0 ++** ret ++*/ ++TEST_UNIFORM_S (qincp_n_s32_b16_tied, int32_t, ++ x0 = svqincp_n_s32_b16 (x0, p0), ++ x0 = svqincp_b16 (x0, p0)) ++ ++/* ++** qincp_n_s32_b16_untied: ++** mov w0, w1 ++** sqincp x0, p0\.h, w0 ++** ret ++*/ ++TEST_UNIFORM_S (qincp_n_s32_b16_untied, int32_t, ++ x0 = svqincp_n_s32_b16 (x1, p0), ++ x0 = svqincp_b16 (x1, p0)) ++ ++/* ++** qincp_n_s32_b32_tied: ++** sqincp x0, p0\.s, w0 ++** ret ++*/ ++TEST_UNIFORM_S (qincp_n_s32_b32_tied, int32_t, ++ x0 = svqincp_n_s32_b32 (x0, p0), ++ x0 = svqincp_b32 (x0, p0)) ++ ++/* ++** qincp_n_s32_b32_untied: ++** mov w0, w1 ++** sqincp x0, p0\.s, w0 ++** ret ++*/ ++TEST_UNIFORM_S (qincp_n_s32_b32_untied, int32_t, ++ x0 = svqincp_n_s32_b32 (x1, p0), ++ x0 = svqincp_b32 (x1, p0)) ++ ++/* ++** qincp_n_s32_b64_tied: ++** sqincp x0, p0\.d, w0 ++** ret ++*/ ++TEST_UNIFORM_S (qincp_n_s32_b64_tied, int32_t, ++ x0 = svqincp_n_s32_b64 (x0, p0), ++ x0 = svqincp_b64 (x0, p0)) ++ ++/* ++** qincp_n_s32_b64_untied: ++** mov w0, w1 ++** sqincp x0, p0\.d, w0 ++** ret ++*/ ++TEST_UNIFORM_S (qincp_n_s32_b64_untied, int32_t, ++ x0 = svqincp_n_s32_b64 (x1, p0), ++ x0 = svqincp_b64 (x1, p0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincp_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincp_s64.c +new file mode 100644 +index 000000000..226502328 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincp_s64.c +@@ -0,0 +1,98 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qincp_s64_tied: ++** sqincp z0\.d, p0 ++** ret ++*/ ++TEST_UNIFORM_Z (qincp_s64_tied, svint64_t, ++ z0 = svqincp_s64 (z0, p0), ++ z0 = svqincp (z0, p0)) ++ ++/* ++** qincp_s64_untied: ++** movprfx z0, z1 ++** sqincp z0\.d, p0 ++** ret ++*/ ++TEST_UNIFORM_Z (qincp_s64_untied, svint64_t, ++ z0 = svqincp_s64 (z1, p0), ++ z0 = svqincp (z1, p0)) ++ ++/* ++** qincp_n_s64_b8_tied: ++** sqincp x0, p0\.b ++** ret ++*/ ++TEST_UNIFORM_S (qincp_n_s64_b8_tied, int64_t, ++ x0 = svqincp_n_s64_b8 (x0, p0), ++ x0 = svqincp_b8 (x0, p0)) ++ ++/* ++** qincp_n_s64_b8_untied: ++** mov x0, x1 ++** sqincp x0, p0\.b ++** ret ++*/ ++TEST_UNIFORM_S (qincp_n_s64_b8_untied, int64_t, ++ x0 = svqincp_n_s64_b8 (x1, p0), ++ x0 = svqincp_b8 (x1, p0)) ++ ++/* ++** qincp_n_s64_b16_tied: ++** sqincp x0, p0\.h ++** ret ++*/ ++TEST_UNIFORM_S (qincp_n_s64_b16_tied, int64_t, ++ x0 = svqincp_n_s64_b16 (x0, p0), ++ x0 = svqincp_b16 (x0, p0)) ++ ++/* ++** qincp_n_s64_b16_untied: ++** mov x0, x1 ++** sqincp x0, p0\.h ++** ret ++*/ ++TEST_UNIFORM_S (qincp_n_s64_b16_untied, int64_t, ++ x0 = svqincp_n_s64_b16 (x1, p0), ++ x0 = svqincp_b16 (x1, p0)) ++ ++/* ++** qincp_n_s64_b32_tied: ++** sqincp x0, p0\.s ++** ret ++*/ 
++TEST_UNIFORM_S (qincp_n_s64_b32_tied, int64_t, ++ x0 = svqincp_n_s64_b32 (x0, p0), ++ x0 = svqincp_b32 (x0, p0)) ++ ++/* ++** qincp_n_s64_b32_untied: ++** mov x0, x1 ++** sqincp x0, p0\.s ++** ret ++*/ ++TEST_UNIFORM_S (qincp_n_s64_b32_untied, int64_t, ++ x0 = svqincp_n_s64_b32 (x1, p0), ++ x0 = svqincp_b32 (x1, p0)) ++ ++/* ++** qincp_n_s64_b64_tied: ++** sqincp x0, p0\.d ++** ret ++*/ ++TEST_UNIFORM_S (qincp_n_s64_b64_tied, int64_t, ++ x0 = svqincp_n_s64_b64 (x0, p0), ++ x0 = svqincp_b64 (x0, p0)) ++ ++/* ++** qincp_n_s64_b64_untied: ++** mov x0, x1 ++** sqincp x0, p0\.d ++** ret ++*/ ++TEST_UNIFORM_S (qincp_n_s64_b64_untied, int64_t, ++ x0 = svqincp_n_s64_b64 (x1, p0), ++ x0 = svqincp_b64 (x1, p0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincp_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincp_u16.c +new file mode 100644 +index 000000000..ecd84470c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincp_u16.c +@@ -0,0 +1,22 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qincp_u16_tied: ++** uqincp z0\.h, p0 ++** ret ++*/ ++TEST_UNIFORM_Z (qincp_u16_tied, svuint16_t, ++ z0 = svqincp_u16 (z0, p0), ++ z0 = svqincp (z0, p0)) ++ ++/* ++** qincp_u16_untied: ++** movprfx z0, z1 ++** uqincp z0\.h, p0 ++** ret ++*/ ++TEST_UNIFORM_Z (qincp_u16_untied, svuint16_t, ++ z0 = svqincp_u16 (z1, p0), ++ z0 = svqincp (z1, p0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincp_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincp_u32.c +new file mode 100644 +index 000000000..011a26253 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincp_u32.c +@@ -0,0 +1,98 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qincp_u32_tied: ++** uqincp z0\.s, p0 ++** ret ++*/ ++TEST_UNIFORM_Z (qincp_u32_tied, svuint32_t, ++ z0 = svqincp_u32 (z0, p0), ++ z0 = svqincp (z0, p0)) ++ ++/* ++** qincp_u32_untied: ++** movprfx z0, z1 ++** uqincp z0\.s, p0 ++** ret ++*/ ++TEST_UNIFORM_Z (qincp_u32_untied, svuint32_t, ++ z0 = svqincp_u32 (z1, p0), ++ z0 = svqincp (z1, p0)) ++ ++/* ++** qincp_n_u32_b8_tied: ++** uqincp w0, p0\.b ++** ret ++*/ ++TEST_UNIFORM_S (qincp_n_u32_b8_tied, uint32_t, ++ x0 = svqincp_n_u32_b8 (x0, p0), ++ x0 = svqincp_b8 (x0, p0)) ++ ++/* ++** qincp_n_u32_b8_untied: ++** mov w0, w1 ++** uqincp w0, p0\.b ++** ret ++*/ ++TEST_UNIFORM_S (qincp_n_u32_b8_untied, uint32_t, ++ x0 = svqincp_n_u32_b8 (x1, p0), ++ x0 = svqincp_b8 (x1, p0)) ++ ++/* ++** qincp_n_u32_b16_tied: ++** uqincp w0, p0\.h ++** ret ++*/ ++TEST_UNIFORM_S (qincp_n_u32_b16_tied, uint32_t, ++ x0 = svqincp_n_u32_b16 (x0, p0), ++ x0 = svqincp_b16 (x0, p0)) ++ ++/* ++** qincp_n_u32_b16_untied: ++** mov w0, w1 ++** uqincp w0, p0\.h ++** ret ++*/ ++TEST_UNIFORM_S (qincp_n_u32_b16_untied, uint32_t, ++ x0 = svqincp_n_u32_b16 (x1, p0), ++ x0 = svqincp_b16 (x1, p0)) ++ ++/* ++** qincp_n_u32_b32_tied: ++** uqincp w0, p0\.s ++** ret ++*/ ++TEST_UNIFORM_S (qincp_n_u32_b32_tied, uint32_t, ++ x0 = svqincp_n_u32_b32 (x0, p0), ++ x0 = svqincp_b32 (x0, p0)) ++ ++/* ++** qincp_n_u32_b32_untied: ++** mov w0, w1 ++** uqincp w0, p0\.s ++** ret ++*/ ++TEST_UNIFORM_S (qincp_n_u32_b32_untied, uint32_t, ++ x0 = svqincp_n_u32_b32 (x1, p0), ++ x0 = svqincp_b32 (x1, p0)) ++ ++/* ++** qincp_n_u32_b64_tied: ++** uqincp w0, p0\.d ++** ret ++*/ ++TEST_UNIFORM_S (qincp_n_u32_b64_tied, uint32_t, ++ x0 = svqincp_n_u32_b64 (x0, p0), ++ x0 = svqincp_b64 (x0, p0)) ++ 
++/* ++** qincp_n_u32_b64_untied: ++** mov w0, w1 ++** uqincp w0, p0\.d ++** ret ++*/ ++TEST_UNIFORM_S (qincp_n_u32_b64_untied, uint32_t, ++ x0 = svqincp_n_u32_b64 (x1, p0), ++ x0 = svqincp_b64 (x1, p0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincp_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincp_u64.c +new file mode 100644 +index 000000000..761ac553a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincp_u64.c +@@ -0,0 +1,98 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qincp_u64_tied: ++** uqincp z0\.d, p0 ++** ret ++*/ ++TEST_UNIFORM_Z (qincp_u64_tied, svuint64_t, ++ z0 = svqincp_u64 (z0, p0), ++ z0 = svqincp (z0, p0)) ++ ++/* ++** qincp_u64_untied: ++** movprfx z0, z1 ++** uqincp z0\.d, p0 ++** ret ++*/ ++TEST_UNIFORM_Z (qincp_u64_untied, svuint64_t, ++ z0 = svqincp_u64 (z1, p0), ++ z0 = svqincp (z1, p0)) ++ ++/* ++** qincp_n_u64_b8_tied: ++** uqincp x0, p0\.b ++** ret ++*/ ++TEST_UNIFORM_S (qincp_n_u64_b8_tied, uint64_t, ++ x0 = svqincp_n_u64_b8 (x0, p0), ++ x0 = svqincp_b8 (x0, p0)) ++ ++/* ++** qincp_n_u64_b8_untied: ++** mov x0, x1 ++** uqincp x0, p0\.b ++** ret ++*/ ++TEST_UNIFORM_S (qincp_n_u64_b8_untied, uint64_t, ++ x0 = svqincp_n_u64_b8 (x1, p0), ++ x0 = svqincp_b8 (x1, p0)) ++ ++/* ++** qincp_n_u64_b16_tied: ++** uqincp x0, p0\.h ++** ret ++*/ ++TEST_UNIFORM_S (qincp_n_u64_b16_tied, uint64_t, ++ x0 = svqincp_n_u64_b16 (x0, p0), ++ x0 = svqincp_b16 (x0, p0)) ++ ++/* ++** qincp_n_u64_b16_untied: ++** mov x0, x1 ++** uqincp x0, p0\.h ++** ret ++*/ ++TEST_UNIFORM_S (qincp_n_u64_b16_untied, uint64_t, ++ x0 = svqincp_n_u64_b16 (x1, p0), ++ x0 = svqincp_b16 (x1, p0)) ++ ++/* ++** qincp_n_u64_b32_tied: ++** uqincp x0, p0\.s ++** ret ++*/ ++TEST_UNIFORM_S (qincp_n_u64_b32_tied, uint64_t, ++ x0 = svqincp_n_u64_b32 (x0, p0), ++ x0 = svqincp_b32 (x0, p0)) ++ ++/* ++** qincp_n_u64_b32_untied: ++** mov x0, x1 ++** uqincp x0, p0\.s ++** ret ++*/ ++TEST_UNIFORM_S (qincp_n_u64_b32_untied, uint64_t, ++ x0 = svqincp_n_u64_b32 (x1, p0), ++ x0 = svqincp_b32 (x1, p0)) ++ ++/* ++** qincp_n_u64_b64_tied: ++** uqincp x0, p0\.d ++** ret ++*/ ++TEST_UNIFORM_S (qincp_n_u64_b64_tied, uint64_t, ++ x0 = svqincp_n_u64_b64 (x0, p0), ++ x0 = svqincp_b64 (x0, p0)) ++ ++/* ++** qincp_n_u64_b64_untied: ++** mov x0, x1 ++** uqincp x0, p0\.d ++** ret ++*/ ++TEST_UNIFORM_S (qincp_n_u64_b64_untied, uint64_t, ++ x0 = svqincp_n_u64_b64 (x1, p0), ++ x0 = svqincp_b64 (x1, p0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincw_pat_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincw_pat_s32.c +new file mode 100644 +index 000000000..6ceb003ab +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincw_pat_s32.c +@@ -0,0 +1,401 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qincw_pat_1_s32_tied: ++** sqincw z0\.s, pow2 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_pat_1_s32_tied, svint32_t, ++ z0 = svqincw_pat_s32 (z0, SV_POW2, 1), ++ z0 = svqincw_pat (z0, SV_POW2, 1)) ++ ++/* ++** qincw_pat_1_s32_untied: ++** movprfx z0, z1 ++** sqincw z0\.s, pow2 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_pat_1_s32_untied, svint32_t, ++ z0 = svqincw_pat_s32 (z1, SV_POW2, 1), ++ z0 = svqincw_pat (z1, SV_POW2, 1)) ++ ++/* ++** qincw_pat_2_s32: ++** sqincw z0\.s, pow2, mul #2 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_pat_2_s32, svint32_t, ++ z0 = svqincw_pat_s32 (z0, SV_POW2, 2), ++ z0 = svqincw_pat (z0, SV_POW2, 2)) ++ ++/* ++** 
qincw_pat_7_s32: ++** sqincw z0\.s, pow2, mul #7 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_pat_7_s32, svint32_t, ++ z0 = svqincw_pat_s32 (z0, SV_POW2, 7), ++ z0 = svqincw_pat (z0, SV_POW2, 7)) ++ ++/* ++** qincw_pat_15_s32: ++** sqincw z0\.s, pow2, mul #15 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_pat_15_s32, svint32_t, ++ z0 = svqincw_pat_s32 (z0, SV_POW2, 15), ++ z0 = svqincw_pat (z0, SV_POW2, 15)) ++ ++/* ++** qincw_pat_16_s32: ++** sqincw z0\.s, pow2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_pat_16_s32, svint32_t, ++ z0 = svqincw_pat_s32 (z0, SV_POW2, 16), ++ z0 = svqincw_pat (z0, SV_POW2, 16)) ++ ++/* ++** qincw_pat_vl1_s32: ++** sqincw z0\.s, vl1, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_pat_vl1_s32, svint32_t, ++ z0 = svqincw_pat_s32 (z0, SV_VL1, 16), ++ z0 = svqincw_pat (z0, SV_VL1, 16)) ++ ++/* ++** qincw_pat_vl2_s32: ++** sqincw z0\.s, vl2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_pat_vl2_s32, svint32_t, ++ z0 = svqincw_pat_s32 (z0, SV_VL2, 16), ++ z0 = svqincw_pat (z0, SV_VL2, 16)) ++ ++/* ++** qincw_pat_vl3_s32: ++** sqincw z0\.s, vl3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_pat_vl3_s32, svint32_t, ++ z0 = svqincw_pat_s32 (z0, SV_VL3, 16), ++ z0 = svqincw_pat (z0, SV_VL3, 16)) ++ ++/* ++** qincw_pat_vl4_s32: ++** sqincw z0\.s, vl4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_pat_vl4_s32, svint32_t, ++ z0 = svqincw_pat_s32 (z0, SV_VL4, 16), ++ z0 = svqincw_pat (z0, SV_VL4, 16)) ++ ++/* ++** qincw_pat_vl5_s32: ++** sqincw z0\.s, vl5, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_pat_vl5_s32, svint32_t, ++ z0 = svqincw_pat_s32 (z0, SV_VL5, 16), ++ z0 = svqincw_pat (z0, SV_VL5, 16)) ++ ++/* ++** qincw_pat_vl6_s32: ++** sqincw z0\.s, vl6, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_pat_vl6_s32, svint32_t, ++ z0 = svqincw_pat_s32 (z0, SV_VL6, 16), ++ z0 = svqincw_pat (z0, SV_VL6, 16)) ++ ++/* ++** qincw_pat_vl7_s32: ++** sqincw z0\.s, vl7, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_pat_vl7_s32, svint32_t, ++ z0 = svqincw_pat_s32 (z0, SV_VL7, 16), ++ z0 = svqincw_pat (z0, SV_VL7, 16)) ++ ++/* ++** qincw_pat_vl8_s32: ++** sqincw z0\.s, vl8, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_pat_vl8_s32, svint32_t, ++ z0 = svqincw_pat_s32 (z0, SV_VL8, 16), ++ z0 = svqincw_pat (z0, SV_VL8, 16)) ++ ++/* ++** qincw_pat_vl16_s32: ++** sqincw z0\.s, vl16, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_pat_vl16_s32, svint32_t, ++ z0 = svqincw_pat_s32 (z0, SV_VL16, 16), ++ z0 = svqincw_pat (z0, SV_VL16, 16)) ++ ++/* ++** qincw_pat_vl32_s32: ++** sqincw z0\.s, vl32, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_pat_vl32_s32, svint32_t, ++ z0 = svqincw_pat_s32 (z0, SV_VL32, 16), ++ z0 = svqincw_pat (z0, SV_VL32, 16)) ++ ++/* ++** qincw_pat_vl64_s32: ++** sqincw z0\.s, vl64, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_pat_vl64_s32, svint32_t, ++ z0 = svqincw_pat_s32 (z0, SV_VL64, 16), ++ z0 = svqincw_pat (z0, SV_VL64, 16)) ++ ++/* ++** qincw_pat_vl128_s32: ++** sqincw z0\.s, vl128, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_pat_vl128_s32, svint32_t, ++ z0 = svqincw_pat_s32 (z0, SV_VL128, 16), ++ z0 = svqincw_pat (z0, SV_VL128, 16)) ++ ++/* ++** qincw_pat_vl256_s32: ++** sqincw z0\.s, vl256, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_pat_vl256_s32, svint32_t, ++ z0 = svqincw_pat_s32 (z0, SV_VL256, 16), ++ z0 = svqincw_pat (z0, SV_VL256, 16)) ++ ++/* ++** qincw_pat_mul4_s32: ++** sqincw z0\.s, mul4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_pat_mul4_s32, svint32_t, ++ z0 = svqincw_pat_s32 (z0, SV_MUL4, 16), ++ z0 = svqincw_pat (z0, SV_MUL4, 16)) ++ ++/* ++** 
qincw_pat_mul3_s32: ++** sqincw z0\.s, mul3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_pat_mul3_s32, svint32_t, ++ z0 = svqincw_pat_s32 (z0, SV_MUL3, 16), ++ z0 = svqincw_pat (z0, SV_MUL3, 16)) ++ ++/* ++** qincw_pat_all_s32: ++** sqincw z0\.s, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_pat_all_s32, svint32_t, ++ z0 = svqincw_pat_s32 (z0, SV_ALL, 16), ++ z0 = svqincw_pat (z0, SV_ALL, 16)) ++ ++/* ++** qincw_pat_n_1_s32_tied: ++** sqincw x0, w0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_1_s32_tied, int32_t, ++ x0 = svqincw_pat_n_s32 (x0, SV_POW2, 1), ++ x0 = svqincw_pat (x0, SV_POW2, 1)) ++ ++/* ++** qincw_pat_n_1_s32_untied: ++** mov w0, w1 ++** sqincw x0, w0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_1_s32_untied, int32_t, ++ x0 = svqincw_pat_n_s32 (x1, SV_POW2, 1), ++ x0 = svqincw_pat (x1, SV_POW2, 1)) ++ ++/* ++** qincw_pat_n_2_s32: ++** sqincw x0, w0, pow2, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_2_s32, int32_t, ++ x0 = svqincw_pat_n_s32 (x0, SV_POW2, 2), ++ x0 = svqincw_pat (x0, SV_POW2, 2)) ++ ++/* ++** qincw_pat_n_7_s32: ++** sqincw x0, w0, pow2, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_7_s32, int32_t, ++ x0 = svqincw_pat_n_s32 (x0, SV_POW2, 7), ++ x0 = svqincw_pat (x0, SV_POW2, 7)) ++ ++/* ++** qincw_pat_n_15_s32: ++** sqincw x0, w0, pow2, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_15_s32, int32_t, ++ x0 = svqincw_pat_n_s32 (x0, SV_POW2, 15), ++ x0 = svqincw_pat (x0, SV_POW2, 15)) ++ ++/* ++** qincw_pat_n_16_s32: ++** sqincw x0, w0, pow2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_16_s32, int32_t, ++ x0 = svqincw_pat_n_s32 (x0, SV_POW2, 16), ++ x0 = svqincw_pat (x0, SV_POW2, 16)) ++ ++/* ++** qincw_pat_n_vl1_s32: ++** sqincw x0, w0, vl1, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl1_s32, int32_t, ++ x0 = svqincw_pat_n_s32 (x0, SV_VL1, 16), ++ x0 = svqincw_pat (x0, SV_VL1, 16)) ++ ++/* ++** qincw_pat_n_vl2_s32: ++** sqincw x0, w0, vl2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl2_s32, int32_t, ++ x0 = svqincw_pat_n_s32 (x0, SV_VL2, 16), ++ x0 = svqincw_pat (x0, SV_VL2, 16)) ++ ++/* ++** qincw_pat_n_vl3_s32: ++** sqincw x0, w0, vl3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl3_s32, int32_t, ++ x0 = svqincw_pat_n_s32 (x0, SV_VL3, 16), ++ x0 = svqincw_pat (x0, SV_VL3, 16)) ++ ++/* ++** qincw_pat_n_vl4_s32: ++** sqincw x0, w0, vl4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl4_s32, int32_t, ++ x0 = svqincw_pat_n_s32 (x0, SV_VL4, 16), ++ x0 = svqincw_pat (x0, SV_VL4, 16)) ++ ++/* ++** qincw_pat_n_vl5_s32: ++** sqincw x0, w0, vl5, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl5_s32, int32_t, ++ x0 = svqincw_pat_n_s32 (x0, SV_VL5, 16), ++ x0 = svqincw_pat (x0, SV_VL5, 16)) ++ ++/* ++** qincw_pat_n_vl6_s32: ++** sqincw x0, w0, vl6, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl6_s32, int32_t, ++ x0 = svqincw_pat_n_s32 (x0, SV_VL6, 16), ++ x0 = svqincw_pat (x0, SV_VL6, 16)) ++ ++/* ++** qincw_pat_n_vl7_s32: ++** sqincw x0, w0, vl7, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl7_s32, int32_t, ++ x0 = svqincw_pat_n_s32 (x0, SV_VL7, 16), ++ x0 = svqincw_pat (x0, SV_VL7, 16)) ++ ++/* ++** qincw_pat_n_vl8_s32: ++** sqincw x0, w0, vl8, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl8_s32, int32_t, ++ x0 = svqincw_pat_n_s32 (x0, SV_VL8, 16), ++ x0 = svqincw_pat (x0, SV_VL8, 16)) ++ ++/* ++** qincw_pat_n_vl16_s32: ++** sqincw x0, w0, vl16, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl16_s32, int32_t, ++ x0 = svqincw_pat_n_s32 (x0, SV_VL16, 16), ++ x0 = 
svqincw_pat (x0, SV_VL16, 16)) ++ ++/* ++** qincw_pat_n_vl32_s32: ++** sqincw x0, w0, vl32, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl32_s32, int32_t, ++ x0 = svqincw_pat_n_s32 (x0, SV_VL32, 16), ++ x0 = svqincw_pat (x0, SV_VL32, 16)) ++ ++/* ++** qincw_pat_n_vl64_s32: ++** sqincw x0, w0, vl64, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl64_s32, int32_t, ++ x0 = svqincw_pat_n_s32 (x0, SV_VL64, 16), ++ x0 = svqincw_pat (x0, SV_VL64, 16)) ++ ++/* ++** qincw_pat_n_vl128_s32: ++** sqincw x0, w0, vl128, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl128_s32, int32_t, ++ x0 = svqincw_pat_n_s32 (x0, SV_VL128, 16), ++ x0 = svqincw_pat (x0, SV_VL128, 16)) ++ ++/* ++** qincw_pat_n_vl256_s32: ++** sqincw x0, w0, vl256, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl256_s32, int32_t, ++ x0 = svqincw_pat_n_s32 (x0, SV_VL256, 16), ++ x0 = svqincw_pat (x0, SV_VL256, 16)) ++ ++/* ++** qincw_pat_n_mul4_s32: ++** sqincw x0, w0, mul4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_mul4_s32, int32_t, ++ x0 = svqincw_pat_n_s32 (x0, SV_MUL4, 16), ++ x0 = svqincw_pat (x0, SV_MUL4, 16)) ++ ++/* ++** qincw_pat_n_mul3_s32: ++** sqincw x0, w0, mul3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_mul3_s32, int32_t, ++ x0 = svqincw_pat_n_s32 (x0, SV_MUL3, 16), ++ x0 = svqincw_pat (x0, SV_MUL3, 16)) ++ ++/* ++** qincw_pat_n_all_s32: ++** sqincw x0, w0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_all_s32, int32_t, ++ x0 = svqincw_pat_n_s32 (x0, SV_ALL, 16), ++ x0 = svqincw_pat (x0, SV_ALL, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincw_pat_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincw_pat_s64.c +new file mode 100644 +index 000000000..feebc25cc +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincw_pat_s64.c +@@ -0,0 +1,202 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qincw_pat_n_1_s64_tied: ++** sqincw x0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_1_s64_tied, int64_t, ++ x0 = svqincw_pat_n_s64 (x0, SV_POW2, 1), ++ x0 = svqincw_pat (x0, SV_POW2, 1)) ++ ++/* ++** qincw_pat_n_1_s64_untied: ++** mov x0, x1 ++** sqincw x0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_1_s64_untied, int64_t, ++ x0 = svqincw_pat_n_s64 (x1, SV_POW2, 1), ++ x0 = svqincw_pat (x1, SV_POW2, 1)) ++ ++/* ++** qincw_pat_n_2_s64: ++** sqincw x0, pow2, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_2_s64, int64_t, ++ x0 = svqincw_pat_n_s64 (x0, SV_POW2, 2), ++ x0 = svqincw_pat (x0, SV_POW2, 2)) ++ ++/* ++** qincw_pat_n_7_s64: ++** sqincw x0, pow2, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_7_s64, int64_t, ++ x0 = svqincw_pat_n_s64 (x0, SV_POW2, 7), ++ x0 = svqincw_pat (x0, SV_POW2, 7)) ++ ++/* ++** qincw_pat_n_15_s64: ++** sqincw x0, pow2, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_15_s64, int64_t, ++ x0 = svqincw_pat_n_s64 (x0, SV_POW2, 15), ++ x0 = svqincw_pat (x0, SV_POW2, 15)) ++ ++/* ++** qincw_pat_n_16_s64: ++** sqincw x0, pow2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_16_s64, int64_t, ++ x0 = svqincw_pat_n_s64 (x0, SV_POW2, 16), ++ x0 = svqincw_pat (x0, SV_POW2, 16)) ++ ++/* ++** qincw_pat_n_vl1_s64: ++** sqincw x0, vl1, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl1_s64, int64_t, ++ x0 = svqincw_pat_n_s64 (x0, SV_VL1, 16), ++ x0 = svqincw_pat (x0, SV_VL1, 16)) ++ ++/* ++** qincw_pat_n_vl2_s64: ++** sqincw x0, vl2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl2_s64, int64_t, ++ x0 = svqincw_pat_n_s64 
(x0, SV_VL2, 16), ++ x0 = svqincw_pat (x0, SV_VL2, 16)) ++ ++/* ++** qincw_pat_n_vl3_s64: ++** sqincw x0, vl3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl3_s64, int64_t, ++ x0 = svqincw_pat_n_s64 (x0, SV_VL3, 16), ++ x0 = svqincw_pat (x0, SV_VL3, 16)) ++ ++/* ++** qincw_pat_n_vl4_s64: ++** sqincw x0, vl4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl4_s64, int64_t, ++ x0 = svqincw_pat_n_s64 (x0, SV_VL4, 16), ++ x0 = svqincw_pat (x0, SV_VL4, 16)) ++ ++/* ++** qincw_pat_n_vl5_s64: ++** sqincw x0, vl5, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl5_s64, int64_t, ++ x0 = svqincw_pat_n_s64 (x0, SV_VL5, 16), ++ x0 = svqincw_pat (x0, SV_VL5, 16)) ++ ++/* ++** qincw_pat_n_vl6_s64: ++** sqincw x0, vl6, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl6_s64, int64_t, ++ x0 = svqincw_pat_n_s64 (x0, SV_VL6, 16), ++ x0 = svqincw_pat (x0, SV_VL6, 16)) ++ ++/* ++** qincw_pat_n_vl7_s64: ++** sqincw x0, vl7, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl7_s64, int64_t, ++ x0 = svqincw_pat_n_s64 (x0, SV_VL7, 16), ++ x0 = svqincw_pat (x0, SV_VL7, 16)) ++ ++/* ++** qincw_pat_n_vl8_s64: ++** sqincw x0, vl8, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl8_s64, int64_t, ++ x0 = svqincw_pat_n_s64 (x0, SV_VL8, 16), ++ x0 = svqincw_pat (x0, SV_VL8, 16)) ++ ++/* ++** qincw_pat_n_vl16_s64: ++** sqincw x0, vl16, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl16_s64, int64_t, ++ x0 = svqincw_pat_n_s64 (x0, SV_VL16, 16), ++ x0 = svqincw_pat (x0, SV_VL16, 16)) ++ ++/* ++** qincw_pat_n_vl32_s64: ++** sqincw x0, vl32, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl32_s64, int64_t, ++ x0 = svqincw_pat_n_s64 (x0, SV_VL32, 16), ++ x0 = svqincw_pat (x0, SV_VL32, 16)) ++ ++/* ++** qincw_pat_n_vl64_s64: ++** sqincw x0, vl64, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl64_s64, int64_t, ++ x0 = svqincw_pat_n_s64 (x0, SV_VL64, 16), ++ x0 = svqincw_pat (x0, SV_VL64, 16)) ++ ++/* ++** qincw_pat_n_vl128_s64: ++** sqincw x0, vl128, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl128_s64, int64_t, ++ x0 = svqincw_pat_n_s64 (x0, SV_VL128, 16), ++ x0 = svqincw_pat (x0, SV_VL128, 16)) ++ ++/* ++** qincw_pat_n_vl256_s64: ++** sqincw x0, vl256, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl256_s64, int64_t, ++ x0 = svqincw_pat_n_s64 (x0, SV_VL256, 16), ++ x0 = svqincw_pat (x0, SV_VL256, 16)) ++ ++/* ++** qincw_pat_n_mul4_s64: ++** sqincw x0, mul4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_mul4_s64, int64_t, ++ x0 = svqincw_pat_n_s64 (x0, SV_MUL4, 16), ++ x0 = svqincw_pat (x0, SV_MUL4, 16)) ++ ++/* ++** qincw_pat_n_mul3_s64: ++** sqincw x0, mul3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_mul3_s64, int64_t, ++ x0 = svqincw_pat_n_s64 (x0, SV_MUL3, 16), ++ x0 = svqincw_pat (x0, SV_MUL3, 16)) ++ ++/* ++** qincw_pat_n_all_s64: ++** sqincw x0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_all_s64, int64_t, ++ x0 = svqincw_pat_n_s64 (x0, SV_ALL, 16), ++ x0 = svqincw_pat (x0, SV_ALL, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincw_pat_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincw_pat_u32.c +new file mode 100644 +index 000000000..e08e91d09 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincw_pat_u32.c +@@ -0,0 +1,401 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qincw_pat_1_u32_tied: ++** uqincw z0\.s, pow2 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_pat_1_u32_tied, svuint32_t, ++ z0 = svqincw_pat_u32 (z0, 
SV_POW2, 1), ++ z0 = svqincw_pat (z0, SV_POW2, 1)) ++ ++/* ++** qincw_pat_1_u32_untied: ++** movprfx z0, z1 ++** uqincw z0\.s, pow2 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_pat_1_u32_untied, svuint32_t, ++ z0 = svqincw_pat_u32 (z1, SV_POW2, 1), ++ z0 = svqincw_pat (z1, SV_POW2, 1)) ++ ++/* ++** qincw_pat_2_u32: ++** uqincw z0\.s, pow2, mul #2 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_pat_2_u32, svuint32_t, ++ z0 = svqincw_pat_u32 (z0, SV_POW2, 2), ++ z0 = svqincw_pat (z0, SV_POW2, 2)) ++ ++/* ++** qincw_pat_7_u32: ++** uqincw z0\.s, pow2, mul #7 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_pat_7_u32, svuint32_t, ++ z0 = svqincw_pat_u32 (z0, SV_POW2, 7), ++ z0 = svqincw_pat (z0, SV_POW2, 7)) ++ ++/* ++** qincw_pat_15_u32: ++** uqincw z0\.s, pow2, mul #15 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_pat_15_u32, svuint32_t, ++ z0 = svqincw_pat_u32 (z0, SV_POW2, 15), ++ z0 = svqincw_pat (z0, SV_POW2, 15)) ++ ++/* ++** qincw_pat_16_u32: ++** uqincw z0\.s, pow2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_pat_16_u32, svuint32_t, ++ z0 = svqincw_pat_u32 (z0, SV_POW2, 16), ++ z0 = svqincw_pat (z0, SV_POW2, 16)) ++ ++/* ++** qincw_pat_vl1_u32: ++** uqincw z0\.s, vl1, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_pat_vl1_u32, svuint32_t, ++ z0 = svqincw_pat_u32 (z0, SV_VL1, 16), ++ z0 = svqincw_pat (z0, SV_VL1, 16)) ++ ++/* ++** qincw_pat_vl2_u32: ++** uqincw z0\.s, vl2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_pat_vl2_u32, svuint32_t, ++ z0 = svqincw_pat_u32 (z0, SV_VL2, 16), ++ z0 = svqincw_pat (z0, SV_VL2, 16)) ++ ++/* ++** qincw_pat_vl3_u32: ++** uqincw z0\.s, vl3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_pat_vl3_u32, svuint32_t, ++ z0 = svqincw_pat_u32 (z0, SV_VL3, 16), ++ z0 = svqincw_pat (z0, SV_VL3, 16)) ++ ++/* ++** qincw_pat_vl4_u32: ++** uqincw z0\.s, vl4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_pat_vl4_u32, svuint32_t, ++ z0 = svqincw_pat_u32 (z0, SV_VL4, 16), ++ z0 = svqincw_pat (z0, SV_VL4, 16)) ++ ++/* ++** qincw_pat_vl5_u32: ++** uqincw z0\.s, vl5, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_pat_vl5_u32, svuint32_t, ++ z0 = svqincw_pat_u32 (z0, SV_VL5, 16), ++ z0 = svqincw_pat (z0, SV_VL5, 16)) ++ ++/* ++** qincw_pat_vl6_u32: ++** uqincw z0\.s, vl6, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_pat_vl6_u32, svuint32_t, ++ z0 = svqincw_pat_u32 (z0, SV_VL6, 16), ++ z0 = svqincw_pat (z0, SV_VL6, 16)) ++ ++/* ++** qincw_pat_vl7_u32: ++** uqincw z0\.s, vl7, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_pat_vl7_u32, svuint32_t, ++ z0 = svqincw_pat_u32 (z0, SV_VL7, 16), ++ z0 = svqincw_pat (z0, SV_VL7, 16)) ++ ++/* ++** qincw_pat_vl8_u32: ++** uqincw z0\.s, vl8, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_pat_vl8_u32, svuint32_t, ++ z0 = svqincw_pat_u32 (z0, SV_VL8, 16), ++ z0 = svqincw_pat (z0, SV_VL8, 16)) ++ ++/* ++** qincw_pat_vl16_u32: ++** uqincw z0\.s, vl16, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_pat_vl16_u32, svuint32_t, ++ z0 = svqincw_pat_u32 (z0, SV_VL16, 16), ++ z0 = svqincw_pat (z0, SV_VL16, 16)) ++ ++/* ++** qincw_pat_vl32_u32: ++** uqincw z0\.s, vl32, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_pat_vl32_u32, svuint32_t, ++ z0 = svqincw_pat_u32 (z0, SV_VL32, 16), ++ z0 = svqincw_pat (z0, SV_VL32, 16)) ++ ++/* ++** qincw_pat_vl64_u32: ++** uqincw z0\.s, vl64, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_pat_vl64_u32, svuint32_t, ++ z0 = svqincw_pat_u32 (z0, SV_VL64, 16), ++ z0 = svqincw_pat (z0, SV_VL64, 16)) ++ ++/* ++** qincw_pat_vl128_u32: ++** uqincw z0\.s, vl128, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_pat_vl128_u32, svuint32_t, ++ z0 = svqincw_pat_u32 
(z0, SV_VL128, 16), ++ z0 = svqincw_pat (z0, SV_VL128, 16)) ++ ++/* ++** qincw_pat_vl256_u32: ++** uqincw z0\.s, vl256, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_pat_vl256_u32, svuint32_t, ++ z0 = svqincw_pat_u32 (z0, SV_VL256, 16), ++ z0 = svqincw_pat (z0, SV_VL256, 16)) ++ ++/* ++** qincw_pat_mul4_u32: ++** uqincw z0\.s, mul4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_pat_mul4_u32, svuint32_t, ++ z0 = svqincw_pat_u32 (z0, SV_MUL4, 16), ++ z0 = svqincw_pat (z0, SV_MUL4, 16)) ++ ++/* ++** qincw_pat_mul3_u32: ++** uqincw z0\.s, mul3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_pat_mul3_u32, svuint32_t, ++ z0 = svqincw_pat_u32 (z0, SV_MUL3, 16), ++ z0 = svqincw_pat (z0, SV_MUL3, 16)) ++ ++/* ++** qincw_pat_all_u32: ++** uqincw z0\.s, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_pat_all_u32, svuint32_t, ++ z0 = svqincw_pat_u32 (z0, SV_ALL, 16), ++ z0 = svqincw_pat (z0, SV_ALL, 16)) ++ ++/* ++** qincw_pat_n_1_u32_tied: ++** uqincw w0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_1_u32_tied, uint32_t, ++ x0 = svqincw_pat_n_u32 (x0, SV_POW2, 1), ++ x0 = svqincw_pat (x0, SV_POW2, 1)) ++ ++/* ++** qincw_pat_n_1_u32_untied: ++** mov w0, w1 ++** uqincw w0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_1_u32_untied, uint32_t, ++ x0 = svqincw_pat_n_u32 (x1, SV_POW2, 1), ++ x0 = svqincw_pat (x1, SV_POW2, 1)) ++ ++/* ++** qincw_pat_n_2_u32: ++** uqincw w0, pow2, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_2_u32, uint32_t, ++ x0 = svqincw_pat_n_u32 (x0, SV_POW2, 2), ++ x0 = svqincw_pat (x0, SV_POW2, 2)) ++ ++/* ++** qincw_pat_n_7_u32: ++** uqincw w0, pow2, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_7_u32, uint32_t, ++ x0 = svqincw_pat_n_u32 (x0, SV_POW2, 7), ++ x0 = svqincw_pat (x0, SV_POW2, 7)) ++ ++/* ++** qincw_pat_n_15_u32: ++** uqincw w0, pow2, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_15_u32, uint32_t, ++ x0 = svqincw_pat_n_u32 (x0, SV_POW2, 15), ++ x0 = svqincw_pat (x0, SV_POW2, 15)) ++ ++/* ++** qincw_pat_n_16_u32: ++** uqincw w0, pow2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_16_u32, uint32_t, ++ x0 = svqincw_pat_n_u32 (x0, SV_POW2, 16), ++ x0 = svqincw_pat (x0, SV_POW2, 16)) ++ ++/* ++** qincw_pat_n_vl1_u32: ++** uqincw w0, vl1, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl1_u32, uint32_t, ++ x0 = svqincw_pat_n_u32 (x0, SV_VL1, 16), ++ x0 = svqincw_pat (x0, SV_VL1, 16)) ++ ++/* ++** qincw_pat_n_vl2_u32: ++** uqincw w0, vl2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl2_u32, uint32_t, ++ x0 = svqincw_pat_n_u32 (x0, SV_VL2, 16), ++ x0 = svqincw_pat (x0, SV_VL2, 16)) ++ ++/* ++** qincw_pat_n_vl3_u32: ++** uqincw w0, vl3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl3_u32, uint32_t, ++ x0 = svqincw_pat_n_u32 (x0, SV_VL3, 16), ++ x0 = svqincw_pat (x0, SV_VL3, 16)) ++ ++/* ++** qincw_pat_n_vl4_u32: ++** uqincw w0, vl4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl4_u32, uint32_t, ++ x0 = svqincw_pat_n_u32 (x0, SV_VL4, 16), ++ x0 = svqincw_pat (x0, SV_VL4, 16)) ++ ++/* ++** qincw_pat_n_vl5_u32: ++** uqincw w0, vl5, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl5_u32, uint32_t, ++ x0 = svqincw_pat_n_u32 (x0, SV_VL5, 16), ++ x0 = svqincw_pat (x0, SV_VL5, 16)) ++ ++/* ++** qincw_pat_n_vl6_u32: ++** uqincw w0, vl6, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl6_u32, uint32_t, ++ x0 = svqincw_pat_n_u32 (x0, SV_VL6, 16), ++ x0 = svqincw_pat (x0, SV_VL6, 16)) ++ ++/* ++** qincw_pat_n_vl7_u32: ++** uqincw w0, vl7, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl7_u32, uint32_t, ++ x0 = 
svqincw_pat_n_u32 (x0, SV_VL7, 16), ++ x0 = svqincw_pat (x0, SV_VL7, 16)) ++ ++/* ++** qincw_pat_n_vl8_u32: ++** uqincw w0, vl8, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl8_u32, uint32_t, ++ x0 = svqincw_pat_n_u32 (x0, SV_VL8, 16), ++ x0 = svqincw_pat (x0, SV_VL8, 16)) ++ ++/* ++** qincw_pat_n_vl16_u32: ++** uqincw w0, vl16, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl16_u32, uint32_t, ++ x0 = svqincw_pat_n_u32 (x0, SV_VL16, 16), ++ x0 = svqincw_pat (x0, SV_VL16, 16)) ++ ++/* ++** qincw_pat_n_vl32_u32: ++** uqincw w0, vl32, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl32_u32, uint32_t, ++ x0 = svqincw_pat_n_u32 (x0, SV_VL32, 16), ++ x0 = svqincw_pat (x0, SV_VL32, 16)) ++ ++/* ++** qincw_pat_n_vl64_u32: ++** uqincw w0, vl64, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl64_u32, uint32_t, ++ x0 = svqincw_pat_n_u32 (x0, SV_VL64, 16), ++ x0 = svqincw_pat (x0, SV_VL64, 16)) ++ ++/* ++** qincw_pat_n_vl128_u32: ++** uqincw w0, vl128, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl128_u32, uint32_t, ++ x0 = svqincw_pat_n_u32 (x0, SV_VL128, 16), ++ x0 = svqincw_pat (x0, SV_VL128, 16)) ++ ++/* ++** qincw_pat_n_vl256_u32: ++** uqincw w0, vl256, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl256_u32, uint32_t, ++ x0 = svqincw_pat_n_u32 (x0, SV_VL256, 16), ++ x0 = svqincw_pat (x0, SV_VL256, 16)) ++ ++/* ++** qincw_pat_n_mul4_u32: ++** uqincw w0, mul4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_mul4_u32, uint32_t, ++ x0 = svqincw_pat_n_u32 (x0, SV_MUL4, 16), ++ x0 = svqincw_pat (x0, SV_MUL4, 16)) ++ ++/* ++** qincw_pat_n_mul3_u32: ++** uqincw w0, mul3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_mul3_u32, uint32_t, ++ x0 = svqincw_pat_n_u32 (x0, SV_MUL3, 16), ++ x0 = svqincw_pat (x0, SV_MUL3, 16)) ++ ++/* ++** qincw_pat_n_all_u32: ++** uqincw w0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_all_u32, uint32_t, ++ x0 = svqincw_pat_n_u32 (x0, SV_ALL, 16), ++ x0 = svqincw_pat (x0, SV_ALL, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincw_pat_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincw_pat_u64.c +new file mode 100644 +index 000000000..a2ac9ee72 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincw_pat_u64.c +@@ -0,0 +1,202 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qincw_pat_n_1_u64_tied: ++** uqincw x0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_1_u64_tied, uint64_t, ++ x0 = svqincw_pat_n_u64 (x0, SV_POW2, 1), ++ x0 = svqincw_pat (x0, SV_POW2, 1)) ++ ++/* ++** qincw_pat_n_1_u64_untied: ++** mov x0, x1 ++** uqincw x0, pow2 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_1_u64_untied, uint64_t, ++ x0 = svqincw_pat_n_u64 (x1, SV_POW2, 1), ++ x0 = svqincw_pat (x1, SV_POW2, 1)) ++ ++/* ++** qincw_pat_n_2_u64: ++** uqincw x0, pow2, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_2_u64, uint64_t, ++ x0 = svqincw_pat_n_u64 (x0, SV_POW2, 2), ++ x0 = svqincw_pat (x0, SV_POW2, 2)) ++ ++/* ++** qincw_pat_n_7_u64: ++** uqincw x0, pow2, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_7_u64, uint64_t, ++ x0 = svqincw_pat_n_u64 (x0, SV_POW2, 7), ++ x0 = svqincw_pat (x0, SV_POW2, 7)) ++ ++/* ++** qincw_pat_n_15_u64: ++** uqincw x0, pow2, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_15_u64, uint64_t, ++ x0 = svqincw_pat_n_u64 (x0, SV_POW2, 15), ++ x0 = svqincw_pat (x0, SV_POW2, 15)) ++ ++/* ++** qincw_pat_n_16_u64: ++** uqincw x0, pow2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_16_u64, 
uint64_t, ++ x0 = svqincw_pat_n_u64 (x0, SV_POW2, 16), ++ x0 = svqincw_pat (x0, SV_POW2, 16)) ++ ++/* ++** qincw_pat_n_vl1_u64: ++** uqincw x0, vl1, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl1_u64, uint64_t, ++ x0 = svqincw_pat_n_u64 (x0, SV_VL1, 16), ++ x0 = svqincw_pat (x0, SV_VL1, 16)) ++ ++/* ++** qincw_pat_n_vl2_u64: ++** uqincw x0, vl2, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl2_u64, uint64_t, ++ x0 = svqincw_pat_n_u64 (x0, SV_VL2, 16), ++ x0 = svqincw_pat (x0, SV_VL2, 16)) ++ ++/* ++** qincw_pat_n_vl3_u64: ++** uqincw x0, vl3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl3_u64, uint64_t, ++ x0 = svqincw_pat_n_u64 (x0, SV_VL3, 16), ++ x0 = svqincw_pat (x0, SV_VL3, 16)) ++ ++/* ++** qincw_pat_n_vl4_u64: ++** uqincw x0, vl4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl4_u64, uint64_t, ++ x0 = svqincw_pat_n_u64 (x0, SV_VL4, 16), ++ x0 = svqincw_pat (x0, SV_VL4, 16)) ++ ++/* ++** qincw_pat_n_vl5_u64: ++** uqincw x0, vl5, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl5_u64, uint64_t, ++ x0 = svqincw_pat_n_u64 (x0, SV_VL5, 16), ++ x0 = svqincw_pat (x0, SV_VL5, 16)) ++ ++/* ++** qincw_pat_n_vl6_u64: ++** uqincw x0, vl6, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl6_u64, uint64_t, ++ x0 = svqincw_pat_n_u64 (x0, SV_VL6, 16), ++ x0 = svqincw_pat (x0, SV_VL6, 16)) ++ ++/* ++** qincw_pat_n_vl7_u64: ++** uqincw x0, vl7, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl7_u64, uint64_t, ++ x0 = svqincw_pat_n_u64 (x0, SV_VL7, 16), ++ x0 = svqincw_pat (x0, SV_VL7, 16)) ++ ++/* ++** qincw_pat_n_vl8_u64: ++** uqincw x0, vl8, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl8_u64, uint64_t, ++ x0 = svqincw_pat_n_u64 (x0, SV_VL8, 16), ++ x0 = svqincw_pat (x0, SV_VL8, 16)) ++ ++/* ++** qincw_pat_n_vl16_u64: ++** uqincw x0, vl16, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl16_u64, uint64_t, ++ x0 = svqincw_pat_n_u64 (x0, SV_VL16, 16), ++ x0 = svqincw_pat (x0, SV_VL16, 16)) ++ ++/* ++** qincw_pat_n_vl32_u64: ++** uqincw x0, vl32, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl32_u64, uint64_t, ++ x0 = svqincw_pat_n_u64 (x0, SV_VL32, 16), ++ x0 = svqincw_pat (x0, SV_VL32, 16)) ++ ++/* ++** qincw_pat_n_vl64_u64: ++** uqincw x0, vl64, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl64_u64, uint64_t, ++ x0 = svqincw_pat_n_u64 (x0, SV_VL64, 16), ++ x0 = svqincw_pat (x0, SV_VL64, 16)) ++ ++/* ++** qincw_pat_n_vl128_u64: ++** uqincw x0, vl128, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl128_u64, uint64_t, ++ x0 = svqincw_pat_n_u64 (x0, SV_VL128, 16), ++ x0 = svqincw_pat (x0, SV_VL128, 16)) ++ ++/* ++** qincw_pat_n_vl256_u64: ++** uqincw x0, vl256, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_vl256_u64, uint64_t, ++ x0 = svqincw_pat_n_u64 (x0, SV_VL256, 16), ++ x0 = svqincw_pat (x0, SV_VL256, 16)) ++ ++/* ++** qincw_pat_n_mul4_u64: ++** uqincw x0, mul4, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_mul4_u64, uint64_t, ++ x0 = svqincw_pat_n_u64 (x0, SV_MUL4, 16), ++ x0 = svqincw_pat (x0, SV_MUL4, 16)) ++ ++/* ++** qincw_pat_n_mul3_u64: ++** uqincw x0, mul3, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_mul3_u64, uint64_t, ++ x0 = svqincw_pat_n_u64 (x0, SV_MUL3, 16), ++ x0 = svqincw_pat (x0, SV_MUL3, 16)) ++ ++/* ++** qincw_pat_n_all_u64: ++** uqincw x0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_pat_n_all_u64, uint64_t, ++ x0 = svqincw_pat_n_u64 (x0, SV_ALL, 16), ++ x0 = svqincw_pat (x0, SV_ALL, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincw_s32.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincw_s32.c +new file mode 100644 +index 000000000..031824acf +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincw_s32.c +@@ -0,0 +1,113 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qincw_1_s32_tied: ++** sqincw z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_1_s32_tied, svint32_t, ++ z0 = svqincw_s32 (z0, 1), ++ z0 = svqincw (z0, 1)) ++ ++/* ++** qincw_1_s32_untied: ++** movprfx z0, z1 ++** sqincw z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_1_s32_untied, svint32_t, ++ z0 = svqincw_s32 (z1, 1), ++ z0 = svqincw (z1, 1)) ++ ++/* ++** qincw_2_s32: ++** sqincw z0\.s, all, mul #2 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_2_s32, svint32_t, ++ z0 = svqincw_s32 (z0, 2), ++ z0 = svqincw (z0, 2)) ++ ++/* ++** qincw_7_s32: ++** sqincw z0\.s, all, mul #7 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_7_s32, svint32_t, ++ z0 = svqincw_s32 (z0, 7), ++ z0 = svqincw (z0, 7)) ++ ++/* ++** qincw_15_s32: ++** sqincw z0\.s, all, mul #15 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_15_s32, svint32_t, ++ z0 = svqincw_s32 (z0, 15), ++ z0 = svqincw (z0, 15)) ++ ++/* ++** qincw_16_s32: ++** sqincw z0\.s, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_16_s32, svint32_t, ++ z0 = svqincw_s32 (z0, 16), ++ z0 = svqincw (z0, 16)) ++ ++/* ++** qincw_n_1_s32_tied: ++** sqincw x0, w0 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_n_1_s32_tied, int32_t, ++ x0 = svqincw_n_s32 (x0, 1), ++ x0 = svqincw (x0, 1)) ++ ++/* ++** qincw_n_1_s32_untied: ++** mov w0, w1 ++** sqincw x0, w0 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_n_1_s32_untied, int32_t, ++ x0 = svqincw_n_s32 (x1, 1), ++ x0 = svqincw (x1, 1)) ++ ++/* ++** qincw_n_2_s32: ++** sqincw x0, w0, all, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_n_2_s32, int32_t, ++ x0 = svqincw_n_s32 (x0, 2), ++ x0 = svqincw (x0, 2)) ++ ++/* ++** qincw_n_7_s32: ++** sqincw x0, w0, all, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_n_7_s32, int32_t, ++ x0 = svqincw_n_s32 (x0, 7), ++ x0 = svqincw (x0, 7)) ++ ++/* ++** qincw_n_15_s32: ++** sqincw x0, w0, all, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_n_15_s32, int32_t, ++ x0 = svqincw_n_s32 (x0, 15), ++ x0 = svqincw (x0, 15)) ++ ++/* ++** qincw_n_16_s32: ++** sqincw x0, w0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_n_16_s32, int32_t, ++ x0 = svqincw_n_s32 (x0, 16), ++ x0 = svqincw (x0, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincw_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincw_s64.c +new file mode 100644 +index 000000000..df61f909f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincw_s64.c +@@ -0,0 +1,58 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qincw_n_1_s64_tied: ++** sqincw x0 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_n_1_s64_tied, int64_t, ++ x0 = svqincw_n_s64 (x0, 1), ++ x0 = svqincw (x0, 1)) ++ ++/* ++** qincw_n_1_s64_untied: ++** mov x0, x1 ++** sqincw x0 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_n_1_s64_untied, int64_t, ++ x0 = svqincw_n_s64 (x1, 1), ++ x0 = svqincw (x1, 1)) ++ ++/* ++** qincw_n_2_s64: ++** sqincw x0, all, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_n_2_s64, int64_t, ++ x0 = svqincw_n_s64 (x0, 2), ++ x0 = svqincw (x0, 2)) ++ ++/* ++** qincw_n_7_s64: ++** sqincw x0, all, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_n_7_s64, int64_t, ++ x0 = svqincw_n_s64 (x0, 7), ++ x0 = svqincw (x0, 7)) ++ ++/* ++** qincw_n_15_s64: ++** sqincw x0, all, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S 
(qincw_n_15_s64, int64_t, ++ x0 = svqincw_n_s64 (x0, 15), ++ x0 = svqincw (x0, 15)) ++ ++/* ++** qincw_n_16_s64: ++** sqincw x0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_n_16_s64, int64_t, ++ x0 = svqincw_n_s64 (x0, 16), ++ x0 = svqincw (x0, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincw_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincw_u32.c +new file mode 100644 +index 000000000..65a446ab6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincw_u32.c +@@ -0,0 +1,113 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qincw_1_u32_tied: ++** uqincw z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_1_u32_tied, svuint32_t, ++ z0 = svqincw_u32 (z0, 1), ++ z0 = svqincw (z0, 1)) ++ ++/* ++** qincw_1_u32_untied: ++** movprfx z0, z1 ++** uqincw z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_1_u32_untied, svuint32_t, ++ z0 = svqincw_u32 (z1, 1), ++ z0 = svqincw (z1, 1)) ++ ++/* ++** qincw_2_u32: ++** uqincw z0\.s, all, mul #2 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_2_u32, svuint32_t, ++ z0 = svqincw_u32 (z0, 2), ++ z0 = svqincw (z0, 2)) ++ ++/* ++** qincw_7_u32: ++** uqincw z0\.s, all, mul #7 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_7_u32, svuint32_t, ++ z0 = svqincw_u32 (z0, 7), ++ z0 = svqincw (z0, 7)) ++ ++/* ++** qincw_15_u32: ++** uqincw z0\.s, all, mul #15 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_15_u32, svuint32_t, ++ z0 = svqincw_u32 (z0, 15), ++ z0 = svqincw (z0, 15)) ++ ++/* ++** qincw_16_u32: ++** uqincw z0\.s, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_Z (qincw_16_u32, svuint32_t, ++ z0 = svqincw_u32 (z0, 16), ++ z0 = svqincw (z0, 16)) ++ ++/* ++** qincw_n_1_u32_tied: ++** uqincw w0 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_n_1_u32_tied, uint32_t, ++ x0 = svqincw_n_u32 (x0, 1), ++ x0 = svqincw (x0, 1)) ++ ++/* ++** qincw_n_1_u32_untied: ++** mov w0, w1 ++** uqincw w0 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_n_1_u32_untied, uint32_t, ++ x0 = svqincw_n_u32 (x1, 1), ++ x0 = svqincw (x1, 1)) ++ ++/* ++** qincw_n_2_u32: ++** uqincw w0, all, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_n_2_u32, uint32_t, ++ x0 = svqincw_n_u32 (x0, 2), ++ x0 = svqincw (x0, 2)) ++ ++/* ++** qincw_n_7_u32: ++** uqincw w0, all, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_n_7_u32, uint32_t, ++ x0 = svqincw_n_u32 (x0, 7), ++ x0 = svqincw (x0, 7)) ++ ++/* ++** qincw_n_15_u32: ++** uqincw w0, all, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_n_15_u32, uint32_t, ++ x0 = svqincw_n_u32 (x0, 15), ++ x0 = svqincw (x0, 15)) ++ ++/* ++** qincw_n_16_u32: ++** uqincw w0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_n_16_u32, uint32_t, ++ x0 = svqincw_n_u32 (x0, 16), ++ x0 = svqincw (x0, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincw_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincw_u64.c +new file mode 100644 +index 000000000..806a79945 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincw_u64.c +@@ -0,0 +1,58 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qincw_n_1_u64_tied: ++** uqincw x0 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_n_1_u64_tied, uint64_t, ++ x0 = svqincw_n_u64 (x0, 1), ++ x0 = svqincw (x0, 1)) ++ ++/* ++** qincw_n_1_u64_untied: ++** mov x0, x1 ++** uqincw x0 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_n_1_u64_untied, uint64_t, ++ x0 = svqincw_n_u64 (x1, 1), ++ x0 = svqincw (x1, 1)) ++ ++/* ++** qincw_n_2_u64: ++** uqincw x0, all, mul #2 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_n_2_u64, 
uint64_t, ++ x0 = svqincw_n_u64 (x0, 2), ++ x0 = svqincw (x0, 2)) ++ ++/* ++** qincw_n_7_u64: ++** uqincw x0, all, mul #7 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_n_7_u64, uint64_t, ++ x0 = svqincw_n_u64 (x0, 7), ++ x0 = svqincw (x0, 7)) ++ ++/* ++** qincw_n_15_u64: ++** uqincw x0, all, mul #15 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_n_15_u64, uint64_t, ++ x0 = svqincw_n_u64 (x0, 15), ++ x0 = svqincw (x0, 15)) ++ ++/* ++** qincw_n_16_u64: ++** uqincw x0, all, mul #16 ++** ret ++*/ ++TEST_UNIFORM_S (qincw_n_16_u64, uint64_t, ++ x0 = svqincw_n_u64 (x0, 16), ++ x0 = svqincw (x0, 16)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qsub_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qsub_s16.c +new file mode 100644 +index 000000000..8dd8381dc +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qsub_s16.c +@@ -0,0 +1,123 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qsub_s16_tied1: ++** sqsub z0\.h, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_s16_tied1, svint16_t, ++ z0 = svqsub_s16 (z0, z1), ++ z0 = svqsub (z0, z1)) ++ ++/* ++** qsub_s16_tied2: ++** sqsub z0\.h, z1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_s16_tied2, svint16_t, ++ z0 = svqsub_s16 (z1, z0), ++ z0 = svqsub (z1, z0)) ++ ++/* ++** qsub_s16_untied: ++** sqsub z0\.h, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_s16_untied, svint16_t, ++ z0 = svqsub_s16 (z1, z2), ++ z0 = svqsub (z1, z2)) ++ ++/* ++** qsub_w0_s16_tied1: ++** mov (z[0-9]+\.h), w0 ++** sqsub z0\.h, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (qsub_w0_s16_tied1, svint16_t, int16_t, ++ z0 = svqsub_n_s16 (z0, x0), ++ z0 = svqsub (z0, x0)) ++ ++/* ++** qsub_w0_s16_untied: ++** mov (z[0-9]+\.h), w0 ++** sqsub z0\.h, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (qsub_w0_s16_untied, svint16_t, int16_t, ++ z0 = svqsub_n_s16 (z1, x0), ++ z0 = svqsub (z1, x0)) ++ ++/* ++** qsub_1_s16_tied1: ++** sqsub z0\.h, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_1_s16_tied1, svint16_t, ++ z0 = svqsub_n_s16 (z0, 1), ++ z0 = svqsub (z0, 1)) ++ ++/* ++** qsub_1_s16_untied: ++** movprfx z0, z1 ++** sqsub z0\.h, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_1_s16_untied, svint16_t, ++ z0 = svqsub_n_s16 (z1, 1), ++ z0 = svqsub (z1, 1)) ++ ++/* ++** qsub_127_s16: ++** sqsub z0\.h, z0\.h, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_127_s16, svint16_t, ++ z0 = svqsub_n_s16 (z0, 127), ++ z0 = svqsub (z0, 127)) ++ ++/* ++** qsub_128_s16: ++** sqsub z0\.h, z0\.h, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_128_s16, svint16_t, ++ z0 = svqsub_n_s16 (z0, 128), ++ z0 = svqsub (z0, 128)) ++ ++/* ++** qsub_255_s16: ++** sqsub z0\.h, z0\.h, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_255_s16, svint16_t, ++ z0 = svqsub_n_s16 (z0, 255), ++ z0 = svqsub (z0, 255)) ++ ++/* ++** qsub_m1_s16: ++** sqadd z0\.h, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_m1_s16, svint16_t, ++ z0 = svqsub_n_s16 (z0, -1), ++ z0 = svqsub (z0, -1)) ++ ++/* ++** qsub_m127_s16: ++** sqadd z0\.h, z0\.h, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_m127_s16, svint16_t, ++ z0 = svqsub_n_s16 (z0, -127), ++ z0 = svqsub (z0, -127)) ++ ++/* ++** qsub_m128_s16: ++** sqadd z0\.h, z0\.h, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_m128_s16, svint16_t, ++ z0 = svqsub_n_s16 (z0, -128), ++ z0 = svqsub (z0, -128)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qsub_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qsub_s32.c +new file mode 100644 +index 000000000..920736aec +--- /dev/null ++++ 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qsub_s32.c +@@ -0,0 +1,123 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qsub_s32_tied1: ++** sqsub z0\.s, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_s32_tied1, svint32_t, ++ z0 = svqsub_s32 (z0, z1), ++ z0 = svqsub (z0, z1)) ++ ++/* ++** qsub_s32_tied2: ++** sqsub z0\.s, z1\.s, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_s32_tied2, svint32_t, ++ z0 = svqsub_s32 (z1, z0), ++ z0 = svqsub (z1, z0)) ++ ++/* ++** qsub_s32_untied: ++** sqsub z0\.s, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_s32_untied, svint32_t, ++ z0 = svqsub_s32 (z1, z2), ++ z0 = svqsub (z1, z2)) ++ ++/* ++** qsub_w0_s32_tied1: ++** mov (z[0-9]+\.s), w0 ++** sqsub z0\.s, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (qsub_w0_s32_tied1, svint32_t, int32_t, ++ z0 = svqsub_n_s32 (z0, x0), ++ z0 = svqsub (z0, x0)) ++ ++/* ++** qsub_w0_s32_untied: ++** mov (z[0-9]+\.s), w0 ++** sqsub z0\.s, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (qsub_w0_s32_untied, svint32_t, int32_t, ++ z0 = svqsub_n_s32 (z1, x0), ++ z0 = svqsub (z1, x0)) ++ ++/* ++** qsub_1_s32_tied1: ++** sqsub z0\.s, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_1_s32_tied1, svint32_t, ++ z0 = svqsub_n_s32 (z0, 1), ++ z0 = svqsub (z0, 1)) ++ ++/* ++** qsub_1_s32_untied: ++** movprfx z0, z1 ++** sqsub z0\.s, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_1_s32_untied, svint32_t, ++ z0 = svqsub_n_s32 (z1, 1), ++ z0 = svqsub (z1, 1)) ++ ++/* ++** qsub_127_s32: ++** sqsub z0\.s, z0\.s, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_127_s32, svint32_t, ++ z0 = svqsub_n_s32 (z0, 127), ++ z0 = svqsub (z0, 127)) ++ ++/* ++** qsub_128_s32: ++** sqsub z0\.s, z0\.s, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_128_s32, svint32_t, ++ z0 = svqsub_n_s32 (z0, 128), ++ z0 = svqsub (z0, 128)) ++ ++/* ++** qsub_255_s32: ++** sqsub z0\.s, z0\.s, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_255_s32, svint32_t, ++ z0 = svqsub_n_s32 (z0, 255), ++ z0 = svqsub (z0, 255)) ++ ++/* ++** qsub_m1_s32: ++** sqadd z0\.s, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_m1_s32, svint32_t, ++ z0 = svqsub_n_s32 (z0, -1), ++ z0 = svqsub (z0, -1)) ++ ++/* ++** qsub_m127_s32: ++** sqadd z0\.s, z0\.s, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_m127_s32, svint32_t, ++ z0 = svqsub_n_s32 (z0, -127), ++ z0 = svqsub (z0, -127)) ++ ++/* ++** qsub_m128_s32: ++** sqadd z0\.s, z0\.s, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_m128_s32, svint32_t, ++ z0 = svqsub_n_s32 (z0, -128), ++ z0 = svqsub (z0, -128)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qsub_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qsub_s64.c +new file mode 100644 +index 000000000..3d0fc2bcc +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qsub_s64.c +@@ -0,0 +1,123 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qsub_s64_tied1: ++** sqsub z0\.d, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_s64_tied1, svint64_t, ++ z0 = svqsub_s64 (z0, z1), ++ z0 = svqsub (z0, z1)) ++ ++/* ++** qsub_s64_tied2: ++** sqsub z0\.d, z1\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_s64_tied2, svint64_t, ++ z0 = svqsub_s64 (z1, z0), ++ z0 = svqsub (z1, z0)) ++ ++/* ++** qsub_s64_untied: ++** sqsub z0\.d, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_s64_untied, svint64_t, ++ z0 = svqsub_s64 (z1, z2), ++ z0 = svqsub (z1, z2)) ++ ++/* ++** qsub_x0_s64_tied1: ++** mov (z[0-9]+\.d), x0 ++** sqsub z0\.d, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX 
(qsub_x0_s64_tied1, svint64_t, int64_t, ++ z0 = svqsub_n_s64 (z0, x0), ++ z0 = svqsub (z0, x0)) ++ ++/* ++** qsub_x0_s64_untied: ++** mov (z[0-9]+\.d), x0 ++** sqsub z0\.d, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (qsub_x0_s64_untied, svint64_t, int64_t, ++ z0 = svqsub_n_s64 (z1, x0), ++ z0 = svqsub (z1, x0)) ++ ++/* ++** qsub_1_s64_tied1: ++** sqsub z0\.d, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_1_s64_tied1, svint64_t, ++ z0 = svqsub_n_s64 (z0, 1), ++ z0 = svqsub (z0, 1)) ++ ++/* ++** qsub_1_s64_untied: ++** movprfx z0, z1 ++** sqsub z0\.d, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_1_s64_untied, svint64_t, ++ z0 = svqsub_n_s64 (z1, 1), ++ z0 = svqsub (z1, 1)) ++ ++/* ++** qsub_127_s64: ++** sqsub z0\.d, z0\.d, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_127_s64, svint64_t, ++ z0 = svqsub_n_s64 (z0, 127), ++ z0 = svqsub (z0, 127)) ++ ++/* ++** qsub_128_s64: ++** sqsub z0\.d, z0\.d, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_128_s64, svint64_t, ++ z0 = svqsub_n_s64 (z0, 128), ++ z0 = svqsub (z0, 128)) ++ ++/* ++** qsub_255_s64: ++** sqsub z0\.d, z0\.d, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_255_s64, svint64_t, ++ z0 = svqsub_n_s64 (z0, 255), ++ z0 = svqsub (z0, 255)) ++ ++/* ++** qsub_m1_s64: ++** sqadd z0\.d, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_m1_s64, svint64_t, ++ z0 = svqsub_n_s64 (z0, -1), ++ z0 = svqsub (z0, -1)) ++ ++/* ++** qsub_m127_s64: ++** sqadd z0\.d, z0\.d, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_m127_s64, svint64_t, ++ z0 = svqsub_n_s64 (z0, -127), ++ z0 = svqsub (z0, -127)) ++ ++/* ++** qsub_m128_s64: ++** sqadd z0\.d, z0\.d, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_m128_s64, svint64_t, ++ z0 = svqsub_n_s64 (z0, -128), ++ z0 = svqsub (z0, -128)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qsub_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qsub_s8.c +new file mode 100644 +index 000000000..3e7e84c77 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qsub_s8.c +@@ -0,0 +1,123 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qsub_s8_tied1: ++** sqsub z0\.b, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_s8_tied1, svint8_t, ++ z0 = svqsub_s8 (z0, z1), ++ z0 = svqsub (z0, z1)) ++ ++/* ++** qsub_s8_tied2: ++** sqsub z0\.b, z1\.b, z0\.b ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_s8_tied2, svint8_t, ++ z0 = svqsub_s8 (z1, z0), ++ z0 = svqsub (z1, z0)) ++ ++/* ++** qsub_s8_untied: ++** sqsub z0\.b, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_s8_untied, svint8_t, ++ z0 = svqsub_s8 (z1, z2), ++ z0 = svqsub (z1, z2)) ++ ++/* ++** qsub_w0_s8_tied1: ++** mov (z[0-9]+\.b), w0 ++** sqsub z0\.b, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (qsub_w0_s8_tied1, svint8_t, int8_t, ++ z0 = svqsub_n_s8 (z0, x0), ++ z0 = svqsub (z0, x0)) ++ ++/* ++** qsub_w0_s8_untied: ++** mov (z[0-9]+\.b), w0 ++** sqsub z0\.b, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (qsub_w0_s8_untied, svint8_t, int8_t, ++ z0 = svqsub_n_s8 (z1, x0), ++ z0 = svqsub (z1, x0)) ++ ++/* ++** qsub_1_s8_tied1: ++** sqsub z0\.b, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_1_s8_tied1, svint8_t, ++ z0 = svqsub_n_s8 (z0, 1), ++ z0 = svqsub (z0, 1)) ++ ++/* ++** qsub_1_s8_untied: ++** movprfx z0, z1 ++** sqsub z0\.b, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_1_s8_untied, svint8_t, ++ z0 = svqsub_n_s8 (z1, 1), ++ z0 = svqsub (z1, 1)) ++ ++/* ++** qsub_127_s8: ++** sqsub z0\.b, z0\.b, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_127_s8, svint8_t, ++ z0 = svqsub_n_s8 (z0, 127), ++ z0 = svqsub (z0, 127)) ++ 
++/* ++** qsub_128_s8: ++** sqadd z0\.b, z0\.b, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_128_s8, svint8_t, ++ z0 = svqsub_n_s8 (z0, 128), ++ z0 = svqsub (z0, 128)) ++ ++/* ++** qsub_255_s8: ++** sqadd z0\.b, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_255_s8, svint8_t, ++ z0 = svqsub_n_s8 (z0, 255), ++ z0 = svqsub (z0, 255)) ++ ++/* ++** qsub_m1_s8: ++** sqadd z0\.b, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_m1_s8, svint8_t, ++ z0 = svqsub_n_s8 (z0, -1), ++ z0 = svqsub (z0, -1)) ++ ++/* ++** qsub_m127_s8: ++** sqadd z0\.b, z0\.b, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_m127_s8, svint8_t, ++ z0 = svqsub_n_s8 (z0, -127), ++ z0 = svqsub (z0, -127)) ++ ++/* ++** qsub_m128_s8: ++** sqadd z0\.b, z0\.b, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_m128_s8, svint8_t, ++ z0 = svqsub_n_s8 (z0, -128), ++ z0 = svqsub (z0, -128)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qsub_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qsub_u16.c +new file mode 100644 +index 000000000..6d4d68e20 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qsub_u16.c +@@ -0,0 +1,126 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qsub_u16_tied1: ++** uqsub z0\.h, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_u16_tied1, svuint16_t, ++ z0 = svqsub_u16 (z0, z1), ++ z0 = svqsub (z0, z1)) ++ ++/* ++** qsub_u16_tied2: ++** uqsub z0\.h, z1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_u16_tied2, svuint16_t, ++ z0 = svqsub_u16 (z1, z0), ++ z0 = svqsub (z1, z0)) ++ ++/* ++** qsub_u16_untied: ++** uqsub z0\.h, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_u16_untied, svuint16_t, ++ z0 = svqsub_u16 (z1, z2), ++ z0 = svqsub (z1, z2)) ++ ++/* ++** qsub_w0_u16_tied1: ++** mov (z[0-9]+\.h), w0 ++** uqsub z0\.h, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (qsub_w0_u16_tied1, svuint16_t, uint16_t, ++ z0 = svqsub_n_u16 (z0, x0), ++ z0 = svqsub (z0, x0)) ++ ++/* ++** qsub_w0_u16_untied: ++** mov (z[0-9]+\.h), w0 ++** uqsub z0\.h, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (qsub_w0_u16_untied, svuint16_t, uint16_t, ++ z0 = svqsub_n_u16 (z1, x0), ++ z0 = svqsub (z1, x0)) ++ ++/* ++** qsub_1_u16_tied1: ++** uqsub z0\.h, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_1_u16_tied1, svuint16_t, ++ z0 = svqsub_n_u16 (z0, 1), ++ z0 = svqsub (z0, 1)) ++ ++/* ++** qsub_1_u16_untied: ++** movprfx z0, z1 ++** uqsub z0\.h, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_1_u16_untied, svuint16_t, ++ z0 = svqsub_n_u16 (z1, 1), ++ z0 = svqsub (z1, 1)) ++ ++/* ++** qsub_127_u16: ++** uqsub z0\.h, z0\.h, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_127_u16, svuint16_t, ++ z0 = svqsub_n_u16 (z0, 127), ++ z0 = svqsub (z0, 127)) ++ ++/* ++** qsub_128_u16: ++** uqsub z0\.h, z0\.h, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_128_u16, svuint16_t, ++ z0 = svqsub_n_u16 (z0, 128), ++ z0 = svqsub (z0, 128)) ++ ++/* ++** qsub_255_u16: ++** uqsub z0\.h, z0\.h, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_255_u16, svuint16_t, ++ z0 = svqsub_n_u16 (z0, 255), ++ z0 = svqsub (z0, 255)) ++ ++/* ++** qsub_m1_u16: ++** mov (z[0-9]+)\.b, #-1 ++** uqsub z0\.h, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_m1_u16, svuint16_t, ++ z0 = svqsub_n_u16 (z0, -1), ++ z0 = svqsub (z0, -1)) ++ ++/* ++** qsub_m127_u16: ++** mov (z[0-9]+\.h), #-127 ++** uqsub z0\.h, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_m127_u16, svuint16_t, ++ z0 = svqsub_n_u16 (z0, -127), ++ z0 = svqsub (z0, -127)) ++ ++/* ++** qsub_m128_u16: ++** mov (z[0-9]+\.h), #-128 ++** uqsub z0\.h, z0\.h, \1 
++** ret ++*/ ++TEST_UNIFORM_Z (qsub_m128_u16, svuint16_t, ++ z0 = svqsub_n_u16 (z0, -128), ++ z0 = svqsub (z0, -128)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qsub_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qsub_u32.c +new file mode 100644 +index 000000000..9c93cfc45 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qsub_u32.c +@@ -0,0 +1,126 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qsub_u32_tied1: ++** uqsub z0\.s, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_u32_tied1, svuint32_t, ++ z0 = svqsub_u32 (z0, z1), ++ z0 = svqsub (z0, z1)) ++ ++/* ++** qsub_u32_tied2: ++** uqsub z0\.s, z1\.s, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_u32_tied2, svuint32_t, ++ z0 = svqsub_u32 (z1, z0), ++ z0 = svqsub (z1, z0)) ++ ++/* ++** qsub_u32_untied: ++** uqsub z0\.s, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_u32_untied, svuint32_t, ++ z0 = svqsub_u32 (z1, z2), ++ z0 = svqsub (z1, z2)) ++ ++/* ++** qsub_w0_u32_tied1: ++** mov (z[0-9]+\.s), w0 ++** uqsub z0\.s, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (qsub_w0_u32_tied1, svuint32_t, uint32_t, ++ z0 = svqsub_n_u32 (z0, x0), ++ z0 = svqsub (z0, x0)) ++ ++/* ++** qsub_w0_u32_untied: ++** mov (z[0-9]+\.s), w0 ++** uqsub z0\.s, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (qsub_w0_u32_untied, svuint32_t, uint32_t, ++ z0 = svqsub_n_u32 (z1, x0), ++ z0 = svqsub (z1, x0)) ++ ++/* ++** qsub_1_u32_tied1: ++** uqsub z0\.s, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_1_u32_tied1, svuint32_t, ++ z0 = svqsub_n_u32 (z0, 1), ++ z0 = svqsub (z0, 1)) ++ ++/* ++** qsub_1_u32_untied: ++** movprfx z0, z1 ++** uqsub z0\.s, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_1_u32_untied, svuint32_t, ++ z0 = svqsub_n_u32 (z1, 1), ++ z0 = svqsub (z1, 1)) ++ ++/* ++** qsub_127_u32: ++** uqsub z0\.s, z0\.s, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_127_u32, svuint32_t, ++ z0 = svqsub_n_u32 (z0, 127), ++ z0 = svqsub (z0, 127)) ++ ++/* ++** qsub_128_u32: ++** uqsub z0\.s, z0\.s, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_128_u32, svuint32_t, ++ z0 = svqsub_n_u32 (z0, 128), ++ z0 = svqsub (z0, 128)) ++ ++/* ++** qsub_255_u32: ++** uqsub z0\.s, z0\.s, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_255_u32, svuint32_t, ++ z0 = svqsub_n_u32 (z0, 255), ++ z0 = svqsub (z0, 255)) ++ ++/* ++** qsub_m1_u32: ++** mov (z[0-9]+)\.b, #-1 ++** uqsub z0\.s, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_m1_u32, svuint32_t, ++ z0 = svqsub_n_u32 (z0, -1), ++ z0 = svqsub (z0, -1)) ++ ++/* ++** qsub_m127_u32: ++** mov (z[0-9]+\.s), #-127 ++** uqsub z0\.s, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_m127_u32, svuint32_t, ++ z0 = svqsub_n_u32 (z0, -127), ++ z0 = svqsub (z0, -127)) ++ ++/* ++** qsub_m128_u32: ++** mov (z[0-9]+\.s), #-128 ++** uqsub z0\.s, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_m128_u32, svuint32_t, ++ z0 = svqsub_n_u32 (z0, -128), ++ z0 = svqsub (z0, -128)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qsub_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qsub_u64.c +new file mode 100644 +index 000000000..6109b5f29 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qsub_u64.c +@@ -0,0 +1,126 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qsub_u64_tied1: ++** uqsub z0\.d, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_u64_tied1, svuint64_t, ++ z0 = svqsub_u64 (z0, z1), ++ z0 = svqsub (z0, z1)) ++ ++/* ++** qsub_u64_tied2: ++** uqsub z0\.d, 
z1\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_u64_tied2, svuint64_t, ++ z0 = svqsub_u64 (z1, z0), ++ z0 = svqsub (z1, z0)) ++ ++/* ++** qsub_u64_untied: ++** uqsub z0\.d, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_u64_untied, svuint64_t, ++ z0 = svqsub_u64 (z1, z2), ++ z0 = svqsub (z1, z2)) ++ ++/* ++** qsub_x0_u64_tied1: ++** mov (z[0-9]+\.d), x0 ++** uqsub z0\.d, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (qsub_x0_u64_tied1, svuint64_t, uint64_t, ++ z0 = svqsub_n_u64 (z0, x0), ++ z0 = svqsub (z0, x0)) ++ ++/* ++** qsub_x0_u64_untied: ++** mov (z[0-9]+\.d), x0 ++** uqsub z0\.d, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (qsub_x0_u64_untied, svuint64_t, uint64_t, ++ z0 = svqsub_n_u64 (z1, x0), ++ z0 = svqsub (z1, x0)) ++ ++/* ++** qsub_1_u64_tied1: ++** uqsub z0\.d, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_1_u64_tied1, svuint64_t, ++ z0 = svqsub_n_u64 (z0, 1), ++ z0 = svqsub (z0, 1)) ++ ++/* ++** qsub_1_u64_untied: ++** movprfx z0, z1 ++** uqsub z0\.d, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_1_u64_untied, svuint64_t, ++ z0 = svqsub_n_u64 (z1, 1), ++ z0 = svqsub (z1, 1)) ++ ++/* ++** qsub_127_u64: ++** uqsub z0\.d, z0\.d, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_127_u64, svuint64_t, ++ z0 = svqsub_n_u64 (z0, 127), ++ z0 = svqsub (z0, 127)) ++ ++/* ++** qsub_128_u64: ++** uqsub z0\.d, z0\.d, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_128_u64, svuint64_t, ++ z0 = svqsub_n_u64 (z0, 128), ++ z0 = svqsub (z0, 128)) ++ ++/* ++** qsub_255_u64: ++** uqsub z0\.d, z0\.d, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_255_u64, svuint64_t, ++ z0 = svqsub_n_u64 (z0, 255), ++ z0 = svqsub (z0, 255)) ++ ++/* ++** qsub_m1_u64: ++** mov (z[0-9]+)\.b, #-1 ++** uqsub z0\.d, z0\.d, \1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_m1_u64, svuint64_t, ++ z0 = svqsub_n_u64 (z0, -1), ++ z0 = svqsub (z0, -1)) ++ ++/* ++** qsub_m127_u64: ++** mov (z[0-9]+\.d), #-127 ++** uqsub z0\.d, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_m127_u64, svuint64_t, ++ z0 = svqsub_n_u64 (z0, -127), ++ z0 = svqsub (z0, -127)) ++ ++/* ++** qsub_m128_u64: ++** mov (z[0-9]+\.d), #-128 ++** uqsub z0\.d, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_m128_u64, svuint64_t, ++ z0 = svqsub_n_u64 (z0, -128), ++ z0 = svqsub (z0, -128)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qsub_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qsub_u8.c +new file mode 100644 +index 000000000..40aa74e8d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qsub_u8.c +@@ -0,0 +1,123 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** qsub_u8_tied1: ++** uqsub z0\.b, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_u8_tied1, svuint8_t, ++ z0 = svqsub_u8 (z0, z1), ++ z0 = svqsub (z0, z1)) ++ ++/* ++** qsub_u8_tied2: ++** uqsub z0\.b, z1\.b, z0\.b ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_u8_tied2, svuint8_t, ++ z0 = svqsub_u8 (z1, z0), ++ z0 = svqsub (z1, z0)) ++ ++/* ++** qsub_u8_untied: ++** uqsub z0\.b, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_u8_untied, svuint8_t, ++ z0 = svqsub_u8 (z1, z2), ++ z0 = svqsub (z1, z2)) ++ ++/* ++** qsub_w0_u8_tied1: ++** mov (z[0-9]+\.b), w0 ++** uqsub z0\.b, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (qsub_w0_u8_tied1, svuint8_t, uint8_t, ++ z0 = svqsub_n_u8 (z0, x0), ++ z0 = svqsub (z0, x0)) ++ ++/* ++** qsub_w0_u8_untied: ++** mov (z[0-9]+\.b), w0 ++** uqsub z0\.b, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (qsub_w0_u8_untied, svuint8_t, uint8_t, ++ z0 = svqsub_n_u8 (z1, x0), ++ z0 = svqsub (z1, x0)) ++ ++/* 
++** qsub_1_u8_tied1: ++** uqsub z0\.b, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_1_u8_tied1, svuint8_t, ++ z0 = svqsub_n_u8 (z0, 1), ++ z0 = svqsub (z0, 1)) ++ ++/* ++** qsub_1_u8_untied: ++** movprfx z0, z1 ++** uqsub z0\.b, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_1_u8_untied, svuint8_t, ++ z0 = svqsub_n_u8 (z1, 1), ++ z0 = svqsub (z1, 1)) ++ ++/* ++** qsub_127_u8: ++** uqsub z0\.b, z0\.b, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_127_u8, svuint8_t, ++ z0 = svqsub_n_u8 (z0, 127), ++ z0 = svqsub (z0, 127)) ++ ++/* ++** qsub_128_u8: ++** uqsub z0\.b, z0\.b, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_128_u8, svuint8_t, ++ z0 = svqsub_n_u8 (z0, 128), ++ z0 = svqsub (z0, 128)) ++ ++/* ++** qsub_255_u8: ++** uqsub z0\.b, z0\.b, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_255_u8, svuint8_t, ++ z0 = svqsub_n_u8 (z0, 255), ++ z0 = svqsub (z0, 255)) ++ ++/* ++** qsub_m1_u8: ++** uqsub z0\.b, z0\.b, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_m1_u8, svuint8_t, ++ z0 = svqsub_n_u8 (z0, -1), ++ z0 = svqsub (z0, -1)) ++ ++/* ++** qsub_m127_u8: ++** uqsub z0\.b, z0\.b, #129 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_m127_u8, svuint8_t, ++ z0 = svqsub_n_u8 (z0, -127), ++ z0 = svqsub (z0, -127)) ++ ++/* ++** qsub_m128_u8: ++** uqsub z0\.b, z0\.b, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (qsub_m128_u8, svuint8_t, ++ z0 = svqsub_n_u8 (z0, -128), ++ z0 = svqsub (z0, -128)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rbit_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rbit_s16.c +new file mode 100644 +index 000000000..4f794f600 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rbit_s16.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rbit_s16_m_tied12: ++** rbit z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_s16_m_tied12, svint16_t, ++ z0 = svrbit_s16_m (z0, p0, z0), ++ z0 = svrbit_m (z0, p0, z0)) ++ ++/* ++** rbit_s16_m_tied1: ++** rbit z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_s16_m_tied1, svint16_t, ++ z0 = svrbit_s16_m (z0, p0, z1), ++ z0 = svrbit_m (z0, p0, z1)) ++ ++/* ++** rbit_s16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** rbit z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_s16_m_tied2, svint16_t, ++ z0 = svrbit_s16_m (z1, p0, z0), ++ z0 = svrbit_m (z1, p0, z0)) ++ ++/* ++** rbit_s16_m_untied: ++** movprfx z0, z2 ++** rbit z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_s16_m_untied, svint16_t, ++ z0 = svrbit_s16_m (z2, p0, z1), ++ z0 = svrbit_m (z2, p0, z1)) ++ ++/* ++** rbit_s16_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.h, p0/z, \1\.h ++** rbit z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_s16_z_tied1, svint16_t, ++ z0 = svrbit_s16_z (p0, z0), ++ z0 = svrbit_z (p0, z0)) ++ ++/* ++** rbit_s16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** rbit z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_s16_z_untied, svint16_t, ++ z0 = svrbit_s16_z (p0, z1), ++ z0 = svrbit_z (p0, z1)) ++ ++/* ++** rbit_s16_x_tied1: ++** rbit z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_s16_x_tied1, svint16_t, ++ z0 = svrbit_s16_x (p0, z0), ++ z0 = svrbit_x (p0, z0)) ++ ++/* ++** rbit_s16_x_untied: ++** rbit z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_s16_x_untied, svint16_t, ++ z0 = svrbit_s16_x (p0, z1), ++ z0 = svrbit_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rbit_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rbit_s32.c +new file mode 100644 +index 
000000000..8b5e1a463 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rbit_s32.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rbit_s32_m_tied12: ++** rbit z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_s32_m_tied12, svint32_t, ++ z0 = svrbit_s32_m (z0, p0, z0), ++ z0 = svrbit_m (z0, p0, z0)) ++ ++/* ++** rbit_s32_m_tied1: ++** rbit z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_s32_m_tied1, svint32_t, ++ z0 = svrbit_s32_m (z0, p0, z1), ++ z0 = svrbit_m (z0, p0, z1)) ++ ++/* ++** rbit_s32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** rbit z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_s32_m_tied2, svint32_t, ++ z0 = svrbit_s32_m (z1, p0, z0), ++ z0 = svrbit_m (z1, p0, z0)) ++ ++/* ++** rbit_s32_m_untied: ++** movprfx z0, z2 ++** rbit z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_s32_m_untied, svint32_t, ++ z0 = svrbit_s32_m (z2, p0, z1), ++ z0 = svrbit_m (z2, p0, z1)) ++ ++/* ++** rbit_s32_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, \1\.s ++** rbit z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_s32_z_tied1, svint32_t, ++ z0 = svrbit_s32_z (p0, z0), ++ z0 = svrbit_z (p0, z0)) ++ ++/* ++** rbit_s32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** rbit z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_s32_z_untied, svint32_t, ++ z0 = svrbit_s32_z (p0, z1), ++ z0 = svrbit_z (p0, z1)) ++ ++/* ++** rbit_s32_x_tied1: ++** rbit z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_s32_x_tied1, svint32_t, ++ z0 = svrbit_s32_x (p0, z0), ++ z0 = svrbit_x (p0, z0)) ++ ++/* ++** rbit_s32_x_untied: ++** rbit z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_s32_x_untied, svint32_t, ++ z0 = svrbit_s32_x (p0, z1), ++ z0 = svrbit_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rbit_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rbit_s64.c +new file mode 100644 +index 000000000..cec27a421 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rbit_s64.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rbit_s64_m_tied12: ++** rbit z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_s64_m_tied12, svint64_t, ++ z0 = svrbit_s64_m (z0, p0, z0), ++ z0 = svrbit_m (z0, p0, z0)) ++ ++/* ++** rbit_s64_m_tied1: ++** rbit z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_s64_m_tied1, svint64_t, ++ z0 = svrbit_s64_m (z0, p0, z1), ++ z0 = svrbit_m (z0, p0, z1)) ++ ++/* ++** rbit_s64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** rbit z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_s64_m_tied2, svint64_t, ++ z0 = svrbit_s64_m (z1, p0, z0), ++ z0 = svrbit_m (z1, p0, z0)) ++ ++/* ++** rbit_s64_m_untied: ++** movprfx z0, z2 ++** rbit z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_s64_m_untied, svint64_t, ++ z0 = svrbit_s64_m (z2, p0, z1), ++ z0 = svrbit_m (z2, p0, z1)) ++ ++/* ++** rbit_s64_z_tied1: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** rbit z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_s64_z_tied1, svint64_t, ++ z0 = svrbit_s64_z (p0, z0), ++ z0 = svrbit_z (p0, z0)) ++ ++/* ++** rbit_s64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** rbit z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_s64_z_untied, svint64_t, ++ z0 = svrbit_s64_z (p0, z1), ++ z0 = svrbit_z (p0, z1)) ++ ++/* ++** rbit_s64_x_tied1: ++** rbit z0\.d, p0/m, 
z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_s64_x_tied1, svint64_t, ++ z0 = svrbit_s64_x (p0, z0), ++ z0 = svrbit_x (p0, z0)) ++ ++/* ++** rbit_s64_x_untied: ++** rbit z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_s64_x_untied, svint64_t, ++ z0 = svrbit_s64_x (p0, z1), ++ z0 = svrbit_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rbit_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rbit_s8.c +new file mode 100644 +index 000000000..9c152116a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rbit_s8.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rbit_s8_m_tied12: ++** rbit z0\.b, p0/m, z0\.b ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_s8_m_tied12, svint8_t, ++ z0 = svrbit_s8_m (z0, p0, z0), ++ z0 = svrbit_m (z0, p0, z0)) ++ ++/* ++** rbit_s8_m_tied1: ++** rbit z0\.b, p0/m, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_s8_m_tied1, svint8_t, ++ z0 = svrbit_s8_m (z0, p0, z1), ++ z0 = svrbit_m (z0, p0, z1)) ++ ++/* ++** rbit_s8_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** rbit z0\.b, p0/m, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_s8_m_tied2, svint8_t, ++ z0 = svrbit_s8_m (z1, p0, z0), ++ z0 = svrbit_m (z1, p0, z0)) ++ ++/* ++** rbit_s8_m_untied: ++** movprfx z0, z2 ++** rbit z0\.b, p0/m, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_s8_m_untied, svint8_t, ++ z0 = svrbit_s8_m (z2, p0, z1), ++ z0 = svrbit_m (z2, p0, z1)) ++ ++/* ++** rbit_s8_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.b, p0/z, \1\.b ++** rbit z0\.b, p0/m, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_s8_z_tied1, svint8_t, ++ z0 = svrbit_s8_z (p0, z0), ++ z0 = svrbit_z (p0, z0)) ++ ++/* ++** rbit_s8_z_untied: ++** movprfx z0\.b, p0/z, z1\.b ++** rbit z0\.b, p0/m, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_s8_z_untied, svint8_t, ++ z0 = svrbit_s8_z (p0, z1), ++ z0 = svrbit_z (p0, z1)) ++ ++/* ++** rbit_s8_x_tied1: ++** rbit z0\.b, p0/m, z0\.b ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_s8_x_tied1, svint8_t, ++ z0 = svrbit_s8_x (p0, z0), ++ z0 = svrbit_x (p0, z0)) ++ ++/* ++** rbit_s8_x_untied: ++** rbit z0\.b, p0/m, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_s8_x_untied, svint8_t, ++ z0 = svrbit_s8_x (p0, z1), ++ z0 = svrbit_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rbit_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rbit_u16.c +new file mode 100644 +index 000000000..001ef2bf0 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rbit_u16.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rbit_u16_m_tied12: ++** rbit z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_u16_m_tied12, svuint16_t, ++ z0 = svrbit_u16_m (z0, p0, z0), ++ z0 = svrbit_m (z0, p0, z0)) ++ ++/* ++** rbit_u16_m_tied1: ++** rbit z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_u16_m_tied1, svuint16_t, ++ z0 = svrbit_u16_m (z0, p0, z1), ++ z0 = svrbit_m (z0, p0, z1)) ++ ++/* ++** rbit_u16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** rbit z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_u16_m_tied2, svuint16_t, ++ z0 = svrbit_u16_m (z1, p0, z0), ++ z0 = svrbit_m (z1, p0, z0)) ++ ++/* ++** rbit_u16_m_untied: ++** movprfx z0, z2 ++** rbit z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_u16_m_untied, svuint16_t, ++ z0 = svrbit_u16_m (z2, p0, z1), ++ z0 = svrbit_m (z2, p0, z1)) ++ ++/* ++** rbit_u16_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** 
movprfx z0\.h, p0/z, \1\.h ++** rbit z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_u16_z_tied1, svuint16_t, ++ z0 = svrbit_u16_z (p0, z0), ++ z0 = svrbit_z (p0, z0)) ++ ++/* ++** rbit_u16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** rbit z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_u16_z_untied, svuint16_t, ++ z0 = svrbit_u16_z (p0, z1), ++ z0 = svrbit_z (p0, z1)) ++ ++/* ++** rbit_u16_x_tied1: ++** rbit z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_u16_x_tied1, svuint16_t, ++ z0 = svrbit_u16_x (p0, z0), ++ z0 = svrbit_x (p0, z0)) ++ ++/* ++** rbit_u16_x_untied: ++** rbit z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_u16_x_untied, svuint16_t, ++ z0 = svrbit_u16_x (p0, z1), ++ z0 = svrbit_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rbit_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rbit_u32.c +new file mode 100644 +index 000000000..4d91e954d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rbit_u32.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rbit_u32_m_tied12: ++** rbit z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_u32_m_tied12, svuint32_t, ++ z0 = svrbit_u32_m (z0, p0, z0), ++ z0 = svrbit_m (z0, p0, z0)) ++ ++/* ++** rbit_u32_m_tied1: ++** rbit z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_u32_m_tied1, svuint32_t, ++ z0 = svrbit_u32_m (z0, p0, z1), ++ z0 = svrbit_m (z0, p0, z1)) ++ ++/* ++** rbit_u32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** rbit z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_u32_m_tied2, svuint32_t, ++ z0 = svrbit_u32_m (z1, p0, z0), ++ z0 = svrbit_m (z1, p0, z0)) ++ ++/* ++** rbit_u32_m_untied: ++** movprfx z0, z2 ++** rbit z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_u32_m_untied, svuint32_t, ++ z0 = svrbit_u32_m (z2, p0, z1), ++ z0 = svrbit_m (z2, p0, z1)) ++ ++/* ++** rbit_u32_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, \1\.s ++** rbit z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_u32_z_tied1, svuint32_t, ++ z0 = svrbit_u32_z (p0, z0), ++ z0 = svrbit_z (p0, z0)) ++ ++/* ++** rbit_u32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** rbit z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_u32_z_untied, svuint32_t, ++ z0 = svrbit_u32_z (p0, z1), ++ z0 = svrbit_z (p0, z1)) ++ ++/* ++** rbit_u32_x_tied1: ++** rbit z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_u32_x_tied1, svuint32_t, ++ z0 = svrbit_u32_x (p0, z0), ++ z0 = svrbit_x (p0, z0)) ++ ++/* ++** rbit_u32_x_untied: ++** rbit z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_u32_x_untied, svuint32_t, ++ z0 = svrbit_u32_x (p0, z1), ++ z0 = svrbit_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rbit_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rbit_u64.c +new file mode 100644 +index 000000000..77f88d116 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rbit_u64.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rbit_u64_m_tied12: ++** rbit z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_u64_m_tied12, svuint64_t, ++ z0 = svrbit_u64_m (z0, p0, z0), ++ z0 = svrbit_m (z0, p0, z0)) ++ ++/* ++** rbit_u64_m_tied1: ++** rbit z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_u64_m_tied1, svuint64_t, ++ z0 = svrbit_u64_m (z0, p0, z1), ++ z0 = svrbit_m (z0, p0, z1)) ++ ++/* ++** 
rbit_u64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** rbit z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_u64_m_tied2, svuint64_t, ++ z0 = svrbit_u64_m (z1, p0, z0), ++ z0 = svrbit_m (z1, p0, z0)) ++ ++/* ++** rbit_u64_m_untied: ++** movprfx z0, z2 ++** rbit z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_u64_m_untied, svuint64_t, ++ z0 = svrbit_u64_m (z2, p0, z1), ++ z0 = svrbit_m (z2, p0, z1)) ++ ++/* ++** rbit_u64_z_tied1: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** rbit z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_u64_z_tied1, svuint64_t, ++ z0 = svrbit_u64_z (p0, z0), ++ z0 = svrbit_z (p0, z0)) ++ ++/* ++** rbit_u64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** rbit z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_u64_z_untied, svuint64_t, ++ z0 = svrbit_u64_z (p0, z1), ++ z0 = svrbit_z (p0, z1)) ++ ++/* ++** rbit_u64_x_tied1: ++** rbit z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_u64_x_tied1, svuint64_t, ++ z0 = svrbit_u64_x (p0, z0), ++ z0 = svrbit_x (p0, z0)) ++ ++/* ++** rbit_u64_x_untied: ++** rbit z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_u64_x_untied, svuint64_t, ++ z0 = svrbit_u64_x (p0, z1), ++ z0 = svrbit_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rbit_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rbit_u8.c +new file mode 100644 +index 000000000..fa347e4c7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rbit_u8.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rbit_u8_m_tied12: ++** rbit z0\.b, p0/m, z0\.b ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_u8_m_tied12, svuint8_t, ++ z0 = svrbit_u8_m (z0, p0, z0), ++ z0 = svrbit_m (z0, p0, z0)) ++ ++/* ++** rbit_u8_m_tied1: ++** rbit z0\.b, p0/m, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_u8_m_tied1, svuint8_t, ++ z0 = svrbit_u8_m (z0, p0, z1), ++ z0 = svrbit_m (z0, p0, z1)) ++ ++/* ++** rbit_u8_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** rbit z0\.b, p0/m, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_u8_m_tied2, svuint8_t, ++ z0 = svrbit_u8_m (z1, p0, z0), ++ z0 = svrbit_m (z1, p0, z0)) ++ ++/* ++** rbit_u8_m_untied: ++** movprfx z0, z2 ++** rbit z0\.b, p0/m, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_u8_m_untied, svuint8_t, ++ z0 = svrbit_u8_m (z2, p0, z1), ++ z0 = svrbit_m (z2, p0, z1)) ++ ++/* ++** rbit_u8_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.b, p0/z, \1\.b ++** rbit z0\.b, p0/m, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_u8_z_tied1, svuint8_t, ++ z0 = svrbit_u8_z (p0, z0), ++ z0 = svrbit_z (p0, z0)) ++ ++/* ++** rbit_u8_z_untied: ++** movprfx z0\.b, p0/z, z1\.b ++** rbit z0\.b, p0/m, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_u8_z_untied, svuint8_t, ++ z0 = svrbit_u8_z (p0, z1), ++ z0 = svrbit_z (p0, z1)) ++ ++/* ++** rbit_u8_x_tied1: ++** rbit z0\.b, p0/m, z0\.b ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_u8_x_tied1, svuint8_t, ++ z0 = svrbit_u8_x (p0, z0), ++ z0 = svrbit_x (p0, z0)) ++ ++/* ++** rbit_u8_x_untied: ++** rbit z0\.b, p0/m, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (rbit_u8_x_untied, svuint8_t, ++ z0 = svrbit_u8_x (p0, z1), ++ z0 = svrbit_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rdffr_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rdffr_1.c +new file mode 100644 +index 000000000..5564e967f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rdffr_1.c +@@ -0,0 +1,59 @@ ++/* { dg-final { check-function-bodies "**" "" 
"-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** setffr_rdffr_1: ++** ptrue p0\.b, all ++** ret ++*/ ++TEST_UNIFORM_P_SINGLE (setffr_rdffr_1, ++ svsetffr (); ++ p0 = svrdffr ()); ++ ++/* ++** setffr_rdffr_2: ++** ret ++*/ ++TEST_UNIFORM_P_SINGLE (setffr_rdffr_2, ++ svsetffr (); ++ svrdffr ()); ++ ++/* ++** setffr_rdffr_3: ++** ptrue p0\.b, all ++** ret ++*/ ++TEST_UNIFORM_P_SINGLE (setffr_rdffr_3, ++ svsetffr (); ++ svsetffr (); ++ svrdffr (); ++ p0 = svrdffr ()); ++ ++/* ++** wrffr_rdffr_1: ++** mov p0\.b, p1\.b ++** ret ++*/ ++TEST_UNIFORM_P_SINGLE (wrffr_rdffr_1, ++ svwrffr (p1); ++ p0 = svrdffr ()); ++ ++/* ++** wrffr_rdffr_2: ++** ret ++*/ ++TEST_UNIFORM_P_SINGLE (wrffr_rdffr_2, ++ svwrffr (p1); ++ svrdffr ()); ++ ++/* ++** wrffr_rdffr_3: ++** mov p0\.b, p2\.b ++** ret ++*/ ++TEST_UNIFORM_P_SINGLE (wrffr_rdffr_3, ++ svwrffr (p1); ++ svwrffr (p2); ++ svrdffr (); ++ p0 = svrdffr ()); +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recpe_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recpe_f16.c +new file mode 100644 +index 000000000..d0cd8281a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recpe_f16.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** recpe_f16_tied1: ++** frecpe z0\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (recpe_f16_tied1, svfloat16_t, ++ z0 = svrecpe_f16 (z0), ++ z0 = svrecpe (z0)) ++ ++/* ++** recpe_f16_untied: ++** frecpe z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (recpe_f16_untied, svfloat16_t, ++ z0 = svrecpe_f16 (z1), ++ z0 = svrecpe (z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recpe_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recpe_f32.c +new file mode 100644 +index 000000000..013ed8c43 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recpe_f32.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** recpe_f32_tied1: ++** frecpe z0\.s, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (recpe_f32_tied1, svfloat32_t, ++ z0 = svrecpe_f32 (z0), ++ z0 = svrecpe (z0)) ++ ++/* ++** recpe_f32_untied: ++** frecpe z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (recpe_f32_untied, svfloat32_t, ++ z0 = svrecpe_f32 (z1), ++ z0 = svrecpe (z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recpe_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recpe_f64.c +new file mode 100644 +index 000000000..40b3df292 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recpe_f64.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** recpe_f64_tied1: ++** frecpe z0\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (recpe_f64_tied1, svfloat64_t, ++ z0 = svrecpe_f64 (z0), ++ z0 = svrecpe (z0)) ++ ++/* ++** recpe_f64_untied: ++** frecpe z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (recpe_f64_untied, svfloat64_t, ++ z0 = svrecpe_f64 (z1), ++ z0 = svrecpe (z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recps_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recps_f16.c +new file mode 100644 +index 000000000..e35c5c545 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recps_f16.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** recps_f16_tied1: ++** frecps z0\.h, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (recps_f16_tied1, 
svfloat16_t, ++ z0 = svrecps_f16 (z0, z1), ++ z0 = svrecps (z0, z1)) ++ ++/* ++** recps_f16_tied2: ++** frecps z0\.h, z1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (recps_f16_tied2, svfloat16_t, ++ z0 = svrecps_f16 (z1, z0), ++ z0 = svrecps (z1, z0)) ++ ++/* ++** recps_f16_untied: ++** frecps z0\.h, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (recps_f16_untied, svfloat16_t, ++ z0 = svrecps_f16 (z1, z2), ++ z0 = svrecps (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recps_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recps_f32.c +new file mode 100644 +index 000000000..3f3aa203e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recps_f32.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** recps_f32_tied1: ++** frecps z0\.s, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (recps_f32_tied1, svfloat32_t, ++ z0 = svrecps_f32 (z0, z1), ++ z0 = svrecps (z0, z1)) ++ ++/* ++** recps_f32_tied2: ++** frecps z0\.s, z1\.s, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (recps_f32_tied2, svfloat32_t, ++ z0 = svrecps_f32 (z1, z0), ++ z0 = svrecps (z1, z0)) ++ ++/* ++** recps_f32_untied: ++** frecps z0\.s, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (recps_f32_untied, svfloat32_t, ++ z0 = svrecps_f32 (z1, z2), ++ z0 = svrecps (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recps_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recps_f64.c +new file mode 100644 +index 000000000..eca421d5e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recps_f64.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** recps_f64_tied1: ++** frecps z0\.d, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (recps_f64_tied1, svfloat64_t, ++ z0 = svrecps_f64 (z0, z1), ++ z0 = svrecps (z0, z1)) ++ ++/* ++** recps_f64_tied2: ++** frecps z0\.d, z1\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (recps_f64_tied2, svfloat64_t, ++ z0 = svrecps_f64 (z1, z0), ++ z0 = svrecps (z1, z0)) ++ ++/* ++** recps_f64_untied: ++** frecps z0\.d, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (recps_f64_untied, svfloat64_t, ++ z0 = svrecps_f64 (z1, z2), ++ z0 = svrecps (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recpx_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recpx_f16.c +new file mode 100644 +index 000000000..2dd7ada2c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recpx_f16.c +@@ -0,0 +1,103 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** recpx_f16_m_tied12: ++** frecpx z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (recpx_f16_m_tied12, svfloat16_t, ++ z0 = svrecpx_f16_m (z0, p0, z0), ++ z0 = svrecpx_m (z0, p0, z0)) ++ ++/* ++** recpx_f16_m_tied1: ++** frecpx z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (recpx_f16_m_tied1, svfloat16_t, ++ z0 = svrecpx_f16_m (z0, p0, z1), ++ z0 = svrecpx_m (z0, p0, z1)) ++ ++/* ++** recpx_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** frecpx z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (recpx_f16_m_tied2, svfloat16_t, ++ z0 = svrecpx_f16_m (z1, p0, z0), ++ z0 = svrecpx_m (z1, p0, z0)) ++ ++/* ++** recpx_f16_m_untied: ++** movprfx z0, z2 ++** frecpx z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (recpx_f16_m_untied, svfloat16_t, ++ z0 = svrecpx_f16_m (z2, p0, z1), ++ z0 = svrecpx_m (z2, p0, z1)) ++ ++/* ++** recpx_f16_z_tied1: ++** mov 
(z[0-9]+)\.d, z0\.d ++** movprfx z0\.h, p0/z, \1\.h ++** frecpx z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (recpx_f16_z_tied1, svfloat16_t, ++ z0 = svrecpx_f16_z (p0, z0), ++ z0 = svrecpx_z (p0, z0)) ++ ++/* ++** recpx_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** frecpx z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (recpx_f16_z_untied, svfloat16_t, ++ z0 = svrecpx_f16_z (p0, z1), ++ z0 = svrecpx_z (p0, z1)) ++ ++/* ++** recpx_f16_x_tied1: ++** frecpx z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (recpx_f16_x_tied1, svfloat16_t, ++ z0 = svrecpx_f16_x (p0, z0), ++ z0 = svrecpx_x (p0, z0)) ++ ++/* ++** recpx_f16_x_untied: ++** frecpx z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (recpx_f16_x_untied, svfloat16_t, ++ z0 = svrecpx_f16_x (p0, z1), ++ z0 = svrecpx_x (p0, z1)) ++ ++/* ++** ptrue_recpx_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_recpx_f16_x_tied1, svfloat16_t, ++ z0 = svrecpx_f16_x (svptrue_b16 (), z0), ++ z0 = svrecpx_x (svptrue_b16 (), z0)) ++ ++/* ++** ptrue_recpx_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_recpx_f16_x_untied, svfloat16_t, ++ z0 = svrecpx_f16_x (svptrue_b16 (), z1), ++ z0 = svrecpx_x (svptrue_b16 (), z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recpx_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recpx_f32.c +new file mode 100644 +index 000000000..6364fb83b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recpx_f32.c +@@ -0,0 +1,103 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** recpx_f32_m_tied12: ++** frecpx z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (recpx_f32_m_tied12, svfloat32_t, ++ z0 = svrecpx_f32_m (z0, p0, z0), ++ z0 = svrecpx_m (z0, p0, z0)) ++ ++/* ++** recpx_f32_m_tied1: ++** frecpx z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (recpx_f32_m_tied1, svfloat32_t, ++ z0 = svrecpx_f32_m (z0, p0, z1), ++ z0 = svrecpx_m (z0, p0, z1)) ++ ++/* ++** recpx_f32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** frecpx z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (recpx_f32_m_tied2, svfloat32_t, ++ z0 = svrecpx_f32_m (z1, p0, z0), ++ z0 = svrecpx_m (z1, p0, z0)) ++ ++/* ++** recpx_f32_m_untied: ++** movprfx z0, z2 ++** frecpx z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (recpx_f32_m_untied, svfloat32_t, ++ z0 = svrecpx_f32_m (z2, p0, z1), ++ z0 = svrecpx_m (z2, p0, z1)) ++ ++/* ++** recpx_f32_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, \1\.s ++** frecpx z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (recpx_f32_z_tied1, svfloat32_t, ++ z0 = svrecpx_f32_z (p0, z0), ++ z0 = svrecpx_z (p0, z0)) ++ ++/* ++** recpx_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** frecpx z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (recpx_f32_z_untied, svfloat32_t, ++ z0 = svrecpx_f32_z (p0, z1), ++ z0 = svrecpx_z (p0, z1)) ++ ++/* ++** recpx_f32_x_tied1: ++** frecpx z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (recpx_f32_x_tied1, svfloat32_t, ++ z0 = svrecpx_f32_x (p0, z0), ++ z0 = svrecpx_x (p0, z0)) ++ ++/* ++** recpx_f32_x_untied: ++** frecpx z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (recpx_f32_x_untied, svfloat32_t, ++ z0 = svrecpx_f32_x (p0, z1), ++ z0 = svrecpx_x (p0, z1)) ++ ++/* ++** ptrue_recpx_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_recpx_f32_x_tied1, svfloat32_t, ++ z0 = svrecpx_f32_x (svptrue_b32 (), z0), ++ z0 = svrecpx_x (svptrue_b32 (), z0)) ++ ++/* ++** ptrue_recpx_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_recpx_f32_x_untied, svfloat32_t, ++ z0 = svrecpx_f32_x (svptrue_b32 (), z1), ++ z0 = svrecpx_x (svptrue_b32 (), z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recpx_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recpx_f64.c +new file mode 100644 +index 000000000..ca5232331 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recpx_f64.c +@@ -0,0 +1,103 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** recpx_f64_m_tied12: ++** frecpx z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (recpx_f64_m_tied12, svfloat64_t, ++ z0 = svrecpx_f64_m (z0, p0, z0), ++ z0 = svrecpx_m (z0, p0, z0)) ++ ++/* ++** recpx_f64_m_tied1: ++** frecpx z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (recpx_f64_m_tied1, svfloat64_t, ++ z0 = svrecpx_f64_m (z0, p0, z1), ++ z0 = svrecpx_m (z0, p0, z1)) ++ ++/* ++** recpx_f64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** frecpx z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (recpx_f64_m_tied2, svfloat64_t, ++ z0 = svrecpx_f64_m (z1, p0, z0), ++ z0 = svrecpx_m (z1, p0, z0)) ++ ++/* ++** recpx_f64_m_untied: ++** movprfx z0, z2 ++** frecpx z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (recpx_f64_m_untied, svfloat64_t, ++ z0 = svrecpx_f64_m (z2, p0, z1), ++ z0 = svrecpx_m (z2, p0, z1)) ++ ++/* ++** recpx_f64_z_tied1: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** frecpx z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (recpx_f64_z_tied1, svfloat64_t, ++ z0 = svrecpx_f64_z (p0, z0), ++ z0 = svrecpx_z (p0, z0)) ++ ++/* ++** recpx_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** frecpx z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (recpx_f64_z_untied, svfloat64_t, ++ z0 = svrecpx_f64_z (p0, z1), ++ z0 = svrecpx_z (p0, z1)) ++ ++/* ++** recpx_f64_x_tied1: ++** frecpx z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (recpx_f64_x_tied1, svfloat64_t, ++ z0 = svrecpx_f64_x (p0, z0), ++ z0 = svrecpx_x (p0, z0)) ++ ++/* ++** recpx_f64_x_untied: ++** frecpx z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (recpx_f64_x_untied, svfloat64_t, ++ z0 = svrecpx_f64_x (p0, z1), ++ z0 = svrecpx_x (p0, z1)) ++ ++/* ++** ptrue_recpx_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_recpx_f64_x_tied1, svfloat64_t, ++ z0 = svrecpx_f64_x (svptrue_b64 (), z0), ++ z0 = svrecpx_x (svptrue_b64 (), z0)) ++ ++/* ++** ptrue_recpx_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_recpx_f64_x_untied, svfloat64_t, ++ z0 = svrecpx_f64_x (svptrue_b64 (), z1), ++ z0 = svrecpx_x (svptrue_b64 (), z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_bf16.c +new file mode 100644 +index 000000000..2d2c2a714 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_bf16.c +@@ -0,0 +1,207 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** reinterpret_bf16_bf16_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_bf16_bf16_tied1, svbfloat16_t, svbfloat16_t, ++ z0_res = svreinterpret_bf16_bf16 (z0), ++ z0_res = svreinterpret_bf16 (z0)) ++ ++/* ++** reinterpret_bf16_bf16_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_bf16_bf16_untied, svbfloat16_t, svbfloat16_t, ++ z0 = svreinterpret_bf16_bf16 (z4), ++ z0 = svreinterpret_bf16 (z4)) ++ ++/* ++** reinterpret_bf16_f16_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_bf16_f16_tied1, svbfloat16_t, svfloat16_t, ++ z0_res = svreinterpret_bf16_f16 (z0), ++ z0_res = svreinterpret_bf16 (z0)) ++ ++/* ++** reinterpret_bf16_f16_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_bf16_f16_untied, svbfloat16_t, svfloat16_t, ++ z0 = svreinterpret_bf16_f16 (z4), ++ z0 = svreinterpret_bf16 (z4)) ++ ++/* ++** reinterpret_bf16_f32_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_bf16_f32_tied1, svbfloat16_t, svfloat32_t, ++ z0_res = svreinterpret_bf16_f32 (z0), ++ z0_res = svreinterpret_bf16 (z0)) ++ ++/* ++** reinterpret_bf16_f32_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_bf16_f32_untied, svbfloat16_t, svfloat32_t, ++ z0 = svreinterpret_bf16_f32 (z4), ++ z0 = svreinterpret_bf16 (z4)) ++ ++/* ++** reinterpret_bf16_f64_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_bf16_f64_tied1, svbfloat16_t, svfloat64_t, ++ z0_res = svreinterpret_bf16_f64 (z0), ++ z0_res = svreinterpret_bf16 (z0)) ++ ++/* ++** reinterpret_bf16_f64_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_bf16_f64_untied, svbfloat16_t, svfloat64_t, ++ z0 = svreinterpret_bf16_f64 (z4), ++ z0 = svreinterpret_bf16 (z4)) ++ ++/* ++** reinterpret_bf16_s8_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_bf16_s8_tied1, svbfloat16_t, svint8_t, ++ z0_res = svreinterpret_bf16_s8 (z0), ++ z0_res = svreinterpret_bf16 (z0)) ++ ++/* ++** reinterpret_bf16_s8_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_bf16_s8_untied, svbfloat16_t, svint8_t, ++ z0 = svreinterpret_bf16_s8 (z4), ++ z0 = svreinterpret_bf16 (z4)) ++ ++/* ++** reinterpret_bf16_s16_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_bf16_s16_tied1, svbfloat16_t, svint16_t, ++ z0_res = svreinterpret_bf16_s16 (z0), ++ z0_res = svreinterpret_bf16 (z0)) ++ ++/* ++** reinterpret_bf16_s16_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_bf16_s16_untied, svbfloat16_t, svint16_t, ++ z0 = svreinterpret_bf16_s16 (z4), ++ z0 = svreinterpret_bf16 (z4)) ++ ++/* ++** reinterpret_bf16_s32_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_bf16_s32_tied1, svbfloat16_t, svint32_t, ++ z0_res = svreinterpret_bf16_s32 (z0), ++ z0_res = svreinterpret_bf16 (z0)) ++ ++/* ++** reinterpret_bf16_s32_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_bf16_s32_untied, svbfloat16_t, svint32_t, ++ z0 = svreinterpret_bf16_s32 (z4), ++ z0 = svreinterpret_bf16 (z4)) ++ ++/* ++** 
reinterpret_bf16_s64_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_bf16_s64_tied1, svbfloat16_t, svint64_t, ++ z0_res = svreinterpret_bf16_s64 (z0), ++ z0_res = svreinterpret_bf16 (z0)) ++ ++/* ++** reinterpret_bf16_s64_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_bf16_s64_untied, svbfloat16_t, svint64_t, ++ z0 = svreinterpret_bf16_s64 (z4), ++ z0 = svreinterpret_bf16 (z4)) ++ ++/* ++** reinterpret_bf16_u8_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_bf16_u8_tied1, svbfloat16_t, svuint8_t, ++ z0_res = svreinterpret_bf16_u8 (z0), ++ z0_res = svreinterpret_bf16 (z0)) ++ ++/* ++** reinterpret_bf16_u8_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_bf16_u8_untied, svbfloat16_t, svuint8_t, ++ z0 = svreinterpret_bf16_u8 (z4), ++ z0 = svreinterpret_bf16 (z4)) ++ ++/* ++** reinterpret_bf16_u16_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_bf16_u16_tied1, svbfloat16_t, svuint16_t, ++ z0_res = svreinterpret_bf16_u16 (z0), ++ z0_res = svreinterpret_bf16 (z0)) ++ ++/* ++** reinterpret_bf16_u16_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_bf16_u16_untied, svbfloat16_t, svuint16_t, ++ z0 = svreinterpret_bf16_u16 (z4), ++ z0 = svreinterpret_bf16 (z4)) ++ ++/* ++** reinterpret_bf16_u32_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_bf16_u32_tied1, svbfloat16_t, svuint32_t, ++ z0_res = svreinterpret_bf16_u32 (z0), ++ z0_res = svreinterpret_bf16 (z0)) ++ ++/* ++** reinterpret_bf16_u32_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_bf16_u32_untied, svbfloat16_t, svuint32_t, ++ z0 = svreinterpret_bf16_u32 (z4), ++ z0 = svreinterpret_bf16 (z4)) ++ ++/* ++** reinterpret_bf16_u64_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_bf16_u64_tied1, svbfloat16_t, svuint64_t, ++ z0_res = svreinterpret_bf16_u64 (z0), ++ z0_res = svreinterpret_bf16 (z0)) ++ ++/* ++** reinterpret_bf16_u64_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_bf16_u64_untied, svbfloat16_t, svuint64_t, ++ z0 = svreinterpret_bf16_u64 (z4), ++ z0 = svreinterpret_bf16 (z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_f16.c +new file mode 100644 +index 000000000..60705e628 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_f16.c +@@ -0,0 +1,207 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** reinterpret_f16_bf16_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_f16_bf16_tied1, svfloat16_t, svbfloat16_t, ++ z0_res = svreinterpret_f16_bf16 (z0), ++ z0_res = svreinterpret_f16 (z0)) ++ ++/* ++** reinterpret_f16_bf16_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_f16_bf16_untied, svfloat16_t, svbfloat16_t, ++ z0 = svreinterpret_f16_bf16 (z4), ++ z0 = svreinterpret_f16 (z4)) ++ ++/* ++** reinterpret_f16_f16_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_f16_f16_tied1, svfloat16_t, svfloat16_t, ++ z0_res = svreinterpret_f16_f16 (z0), ++ z0_res = svreinterpret_f16 (z0)) ++ ++/* ++** reinterpret_f16_f16_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_f16_f16_untied, svfloat16_t, svfloat16_t, ++ z0 = svreinterpret_f16_f16 (z4), ++ z0 = svreinterpret_f16 (z4)) ++ ++/* ++** reinterpret_f16_f32_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_f16_f32_tied1, svfloat16_t, svfloat32_t, ++ z0_res = svreinterpret_f16_f32 (z0), ++ z0_res = svreinterpret_f16 (z0)) ++ ++/* ++** 
reinterpret_f16_f32_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_f16_f32_untied, svfloat16_t, svfloat32_t, ++ z0 = svreinterpret_f16_f32 (z4), ++ z0 = svreinterpret_f16 (z4)) ++ ++/* ++** reinterpret_f16_f64_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_f16_f64_tied1, svfloat16_t, svfloat64_t, ++ z0_res = svreinterpret_f16_f64 (z0), ++ z0_res = svreinterpret_f16 (z0)) ++ ++/* ++** reinterpret_f16_f64_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_f16_f64_untied, svfloat16_t, svfloat64_t, ++ z0 = svreinterpret_f16_f64 (z4), ++ z0 = svreinterpret_f16 (z4)) ++ ++/* ++** reinterpret_f16_s8_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_f16_s8_tied1, svfloat16_t, svint8_t, ++ z0_res = svreinterpret_f16_s8 (z0), ++ z0_res = svreinterpret_f16 (z0)) ++ ++/* ++** reinterpret_f16_s8_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_f16_s8_untied, svfloat16_t, svint8_t, ++ z0 = svreinterpret_f16_s8 (z4), ++ z0 = svreinterpret_f16 (z4)) ++ ++/* ++** reinterpret_f16_s16_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_f16_s16_tied1, svfloat16_t, svint16_t, ++ z0_res = svreinterpret_f16_s16 (z0), ++ z0_res = svreinterpret_f16 (z0)) ++ ++/* ++** reinterpret_f16_s16_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_f16_s16_untied, svfloat16_t, svint16_t, ++ z0 = svreinterpret_f16_s16 (z4), ++ z0 = svreinterpret_f16 (z4)) ++ ++/* ++** reinterpret_f16_s32_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_f16_s32_tied1, svfloat16_t, svint32_t, ++ z0_res = svreinterpret_f16_s32 (z0), ++ z0_res = svreinterpret_f16 (z0)) ++ ++/* ++** reinterpret_f16_s32_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_f16_s32_untied, svfloat16_t, svint32_t, ++ z0 = svreinterpret_f16_s32 (z4), ++ z0 = svreinterpret_f16 (z4)) ++ ++/* ++** reinterpret_f16_s64_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_f16_s64_tied1, svfloat16_t, svint64_t, ++ z0_res = svreinterpret_f16_s64 (z0), ++ z0_res = svreinterpret_f16 (z0)) ++ ++/* ++** reinterpret_f16_s64_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_f16_s64_untied, svfloat16_t, svint64_t, ++ z0 = svreinterpret_f16_s64 (z4), ++ z0 = svreinterpret_f16 (z4)) ++ ++/* ++** reinterpret_f16_u8_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_f16_u8_tied1, svfloat16_t, svuint8_t, ++ z0_res = svreinterpret_f16_u8 (z0), ++ z0_res = svreinterpret_f16 (z0)) ++ ++/* ++** reinterpret_f16_u8_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_f16_u8_untied, svfloat16_t, svuint8_t, ++ z0 = svreinterpret_f16_u8 (z4), ++ z0 = svreinterpret_f16 (z4)) ++ ++/* ++** reinterpret_f16_u16_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_f16_u16_tied1, svfloat16_t, svuint16_t, ++ z0_res = svreinterpret_f16_u16 (z0), ++ z0_res = svreinterpret_f16 (z0)) ++ ++/* ++** reinterpret_f16_u16_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_f16_u16_untied, svfloat16_t, svuint16_t, ++ z0 = svreinterpret_f16_u16 (z4), ++ z0 = svreinterpret_f16 (z4)) ++ ++/* ++** reinterpret_f16_u32_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_f16_u32_tied1, svfloat16_t, svuint32_t, ++ z0_res = svreinterpret_f16_u32 (z0), ++ z0_res = svreinterpret_f16 (z0)) ++ ++/* ++** reinterpret_f16_u32_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_f16_u32_untied, svfloat16_t, svuint32_t, ++ z0 = svreinterpret_f16_u32 (z4), ++ z0 = svreinterpret_f16 (z4)) ++ ++/* ++** reinterpret_f16_u64_tied1: ++** ret ++*/ 
++TEST_DUAL_Z_REV (reinterpret_f16_u64_tied1, svfloat16_t, svuint64_t, ++ z0_res = svreinterpret_f16_u64 (z0), ++ z0_res = svreinterpret_f16 (z0)) ++ ++/* ++** reinterpret_f16_u64_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_f16_u64_untied, svfloat16_t, svuint64_t, ++ z0 = svreinterpret_f16_u64 (z4), ++ z0 = svreinterpret_f16 (z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_f32.c +new file mode 100644 +index 000000000..06fc46f25 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_f32.c +@@ -0,0 +1,207 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** reinterpret_f32_bf16_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_f32_bf16_tied1, svfloat32_t, svbfloat16_t, ++ z0_res = svreinterpret_f32_bf16 (z0), ++ z0_res = svreinterpret_f32 (z0)) ++ ++/* ++** reinterpret_f32_bf16_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_f32_bf16_untied, svfloat32_t, svbfloat16_t, ++ z0 = svreinterpret_f32_bf16 (z4), ++ z0 = svreinterpret_f32 (z4)) ++ ++/* ++** reinterpret_f32_f16_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_f32_f16_tied1, svfloat32_t, svfloat16_t, ++ z0_res = svreinterpret_f32_f16 (z0), ++ z0_res = svreinterpret_f32 (z0)) ++ ++/* ++** reinterpret_f32_f16_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_f32_f16_untied, svfloat32_t, svfloat16_t, ++ z0 = svreinterpret_f32_f16 (z4), ++ z0 = svreinterpret_f32 (z4)) ++ ++/* ++** reinterpret_f32_f32_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_f32_f32_tied1, svfloat32_t, svfloat32_t, ++ z0_res = svreinterpret_f32_f32 (z0), ++ z0_res = svreinterpret_f32 (z0)) ++ ++/* ++** reinterpret_f32_f32_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_f32_f32_untied, svfloat32_t, svfloat32_t, ++ z0 = svreinterpret_f32_f32 (z4), ++ z0 = svreinterpret_f32 (z4)) ++ ++/* ++** reinterpret_f32_f64_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_f32_f64_tied1, svfloat32_t, svfloat64_t, ++ z0_res = svreinterpret_f32_f64 (z0), ++ z0_res = svreinterpret_f32 (z0)) ++ ++/* ++** reinterpret_f32_f64_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_f32_f64_untied, svfloat32_t, svfloat64_t, ++ z0 = svreinterpret_f32_f64 (z4), ++ z0 = svreinterpret_f32 (z4)) ++ ++/* ++** reinterpret_f32_s8_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_f32_s8_tied1, svfloat32_t, svint8_t, ++ z0_res = svreinterpret_f32_s8 (z0), ++ z0_res = svreinterpret_f32 (z0)) ++ ++/* ++** reinterpret_f32_s8_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_f32_s8_untied, svfloat32_t, svint8_t, ++ z0 = svreinterpret_f32_s8 (z4), ++ z0 = svreinterpret_f32 (z4)) ++ ++/* ++** reinterpret_f32_s16_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_f32_s16_tied1, svfloat32_t, svint16_t, ++ z0_res = svreinterpret_f32_s16 (z0), ++ z0_res = svreinterpret_f32 (z0)) ++ ++/* ++** reinterpret_f32_s16_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_f32_s16_untied, svfloat32_t, svint16_t, ++ z0 = svreinterpret_f32_s16 (z4), ++ z0 = svreinterpret_f32 (z4)) ++ ++/* ++** reinterpret_f32_s32_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_f32_s32_tied1, svfloat32_t, svint32_t, ++ z0_res = svreinterpret_f32_s32 (z0), ++ z0_res = svreinterpret_f32 (z0)) ++ ++/* ++** reinterpret_f32_s32_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z 
(reinterpret_f32_s32_untied, svfloat32_t, svint32_t, ++ z0 = svreinterpret_f32_s32 (z4), ++ z0 = svreinterpret_f32 (z4)) ++ ++/* ++** reinterpret_f32_s64_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_f32_s64_tied1, svfloat32_t, svint64_t, ++ z0_res = svreinterpret_f32_s64 (z0), ++ z0_res = svreinterpret_f32 (z0)) ++ ++/* ++** reinterpret_f32_s64_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_f32_s64_untied, svfloat32_t, svint64_t, ++ z0 = svreinterpret_f32_s64 (z4), ++ z0 = svreinterpret_f32 (z4)) ++ ++/* ++** reinterpret_f32_u8_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_f32_u8_tied1, svfloat32_t, svuint8_t, ++ z0_res = svreinterpret_f32_u8 (z0), ++ z0_res = svreinterpret_f32 (z0)) ++ ++/* ++** reinterpret_f32_u8_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_f32_u8_untied, svfloat32_t, svuint8_t, ++ z0 = svreinterpret_f32_u8 (z4), ++ z0 = svreinterpret_f32 (z4)) ++ ++/* ++** reinterpret_f32_u16_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_f32_u16_tied1, svfloat32_t, svuint16_t, ++ z0_res = svreinterpret_f32_u16 (z0), ++ z0_res = svreinterpret_f32 (z0)) ++ ++/* ++** reinterpret_f32_u16_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_f32_u16_untied, svfloat32_t, svuint16_t, ++ z0 = svreinterpret_f32_u16 (z4), ++ z0 = svreinterpret_f32 (z4)) ++ ++/* ++** reinterpret_f32_u32_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_f32_u32_tied1, svfloat32_t, svuint32_t, ++ z0_res = svreinterpret_f32_u32 (z0), ++ z0_res = svreinterpret_f32 (z0)) ++ ++/* ++** reinterpret_f32_u32_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_f32_u32_untied, svfloat32_t, svuint32_t, ++ z0 = svreinterpret_f32_u32 (z4), ++ z0 = svreinterpret_f32 (z4)) ++ ++/* ++** reinterpret_f32_u64_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_f32_u64_tied1, svfloat32_t, svuint64_t, ++ z0_res = svreinterpret_f32_u64 (z0), ++ z0_res = svreinterpret_f32 (z0)) ++ ++/* ++** reinterpret_f32_u64_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_f32_u64_untied, svfloat32_t, svuint64_t, ++ z0 = svreinterpret_f32_u64 (z4), ++ z0 = svreinterpret_f32 (z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_f64.c +new file mode 100644 +index 000000000..003ee3fe2 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_f64.c +@@ -0,0 +1,207 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** reinterpret_f64_bf16_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_f64_bf16_tied1, svfloat64_t, svbfloat16_t, ++ z0_res = svreinterpret_f64_bf16 (z0), ++ z0_res = svreinterpret_f64 (z0)) ++ ++/* ++** reinterpret_f64_bf16_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_f64_bf16_untied, svfloat64_t, svbfloat16_t, ++ z0 = svreinterpret_f64_bf16 (z4), ++ z0 = svreinterpret_f64 (z4)) ++ ++/* ++** reinterpret_f64_f16_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_f64_f16_tied1, svfloat64_t, svfloat16_t, ++ z0_res = svreinterpret_f64_f16 (z0), ++ z0_res = svreinterpret_f64 (z0)) ++ ++/* ++** reinterpret_f64_f16_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_f64_f16_untied, svfloat64_t, svfloat16_t, ++ z0 = svreinterpret_f64_f16 (z4), ++ z0 = svreinterpret_f64 (z4)) ++ ++/* ++** reinterpret_f64_f32_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_f64_f32_tied1, svfloat64_t, svfloat32_t, ++ 
z0_res = svreinterpret_f64_f32 (z0), ++ z0_res = svreinterpret_f64 (z0)) ++ ++/* ++** reinterpret_f64_f32_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_f64_f32_untied, svfloat64_t, svfloat32_t, ++ z0 = svreinterpret_f64_f32 (z4), ++ z0 = svreinterpret_f64 (z4)) ++ ++/* ++** reinterpret_f64_f64_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_f64_f64_tied1, svfloat64_t, svfloat64_t, ++ z0_res = svreinterpret_f64_f64 (z0), ++ z0_res = svreinterpret_f64 (z0)) ++ ++/* ++** reinterpret_f64_f64_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_f64_f64_untied, svfloat64_t, svfloat64_t, ++ z0 = svreinterpret_f64_f64 (z4), ++ z0 = svreinterpret_f64 (z4)) ++ ++/* ++** reinterpret_f64_s8_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_f64_s8_tied1, svfloat64_t, svint8_t, ++ z0_res = svreinterpret_f64_s8 (z0), ++ z0_res = svreinterpret_f64 (z0)) ++ ++/* ++** reinterpret_f64_s8_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_f64_s8_untied, svfloat64_t, svint8_t, ++ z0 = svreinterpret_f64_s8 (z4), ++ z0 = svreinterpret_f64 (z4)) ++ ++/* ++** reinterpret_f64_s16_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_f64_s16_tied1, svfloat64_t, svint16_t, ++ z0_res = svreinterpret_f64_s16 (z0), ++ z0_res = svreinterpret_f64 (z0)) ++ ++/* ++** reinterpret_f64_s16_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_f64_s16_untied, svfloat64_t, svint16_t, ++ z0 = svreinterpret_f64_s16 (z4), ++ z0 = svreinterpret_f64 (z4)) ++ ++/* ++** reinterpret_f64_s32_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_f64_s32_tied1, svfloat64_t, svint32_t, ++ z0_res = svreinterpret_f64_s32 (z0), ++ z0_res = svreinterpret_f64 (z0)) ++ ++/* ++** reinterpret_f64_s32_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_f64_s32_untied, svfloat64_t, svint32_t, ++ z0 = svreinterpret_f64_s32 (z4), ++ z0 = svreinterpret_f64 (z4)) ++ ++/* ++** reinterpret_f64_s64_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_f64_s64_tied1, svfloat64_t, svint64_t, ++ z0_res = svreinterpret_f64_s64 (z0), ++ z0_res = svreinterpret_f64 (z0)) ++ ++/* ++** reinterpret_f64_s64_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_f64_s64_untied, svfloat64_t, svint64_t, ++ z0 = svreinterpret_f64_s64 (z4), ++ z0 = svreinterpret_f64 (z4)) ++ ++/* ++** reinterpret_f64_u8_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_f64_u8_tied1, svfloat64_t, svuint8_t, ++ z0_res = svreinterpret_f64_u8 (z0), ++ z0_res = svreinterpret_f64 (z0)) ++ ++/* ++** reinterpret_f64_u8_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_f64_u8_untied, svfloat64_t, svuint8_t, ++ z0 = svreinterpret_f64_u8 (z4), ++ z0 = svreinterpret_f64 (z4)) ++ ++/* ++** reinterpret_f64_u16_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_f64_u16_tied1, svfloat64_t, svuint16_t, ++ z0_res = svreinterpret_f64_u16 (z0), ++ z0_res = svreinterpret_f64 (z0)) ++ ++/* ++** reinterpret_f64_u16_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_f64_u16_untied, svfloat64_t, svuint16_t, ++ z0 = svreinterpret_f64_u16 (z4), ++ z0 = svreinterpret_f64 (z4)) ++ ++/* ++** reinterpret_f64_u32_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_f64_u32_tied1, svfloat64_t, svuint32_t, ++ z0_res = svreinterpret_f64_u32 (z0), ++ z0_res = svreinterpret_f64 (z0)) ++ ++/* ++** reinterpret_f64_u32_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_f64_u32_untied, svfloat64_t, svuint32_t, ++ z0 = svreinterpret_f64_u32 (z4), ++ z0 = 
svreinterpret_f64 (z4)) ++ ++/* ++** reinterpret_f64_u64_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_f64_u64_tied1, svfloat64_t, svuint64_t, ++ z0_res = svreinterpret_f64_u64 (z0), ++ z0_res = svreinterpret_f64 (z0)) ++ ++/* ++** reinterpret_f64_u64_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_f64_u64_untied, svfloat64_t, svuint64_t, ++ z0 = svreinterpret_f64_u64 (z4), ++ z0 = svreinterpret_f64 (z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_s16.c +new file mode 100644 +index 000000000..d62817c2c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_s16.c +@@ -0,0 +1,207 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** reinterpret_s16_bf16_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_s16_bf16_tied1, svint16_t, svbfloat16_t, ++ z0_res = svreinterpret_s16_bf16 (z0), ++ z0_res = svreinterpret_s16 (z0)) ++ ++/* ++** reinterpret_s16_bf16_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_s16_bf16_untied, svint16_t, svbfloat16_t, ++ z0 = svreinterpret_s16_bf16 (z4), ++ z0 = svreinterpret_s16 (z4)) ++ ++/* ++** reinterpret_s16_f16_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_s16_f16_tied1, svint16_t, svfloat16_t, ++ z0_res = svreinterpret_s16_f16 (z0), ++ z0_res = svreinterpret_s16 (z0)) ++ ++/* ++** reinterpret_s16_f16_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_s16_f16_untied, svint16_t, svfloat16_t, ++ z0 = svreinterpret_s16_f16 (z4), ++ z0 = svreinterpret_s16 (z4)) ++ ++/* ++** reinterpret_s16_f32_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_s16_f32_tied1, svint16_t, svfloat32_t, ++ z0_res = svreinterpret_s16_f32 (z0), ++ z0_res = svreinterpret_s16 (z0)) ++ ++/* ++** reinterpret_s16_f32_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_s16_f32_untied, svint16_t, svfloat32_t, ++ z0 = svreinterpret_s16_f32 (z4), ++ z0 = svreinterpret_s16 (z4)) ++ ++/* ++** reinterpret_s16_f64_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_s16_f64_tied1, svint16_t, svfloat64_t, ++ z0_res = svreinterpret_s16_f64 (z0), ++ z0_res = svreinterpret_s16 (z0)) ++ ++/* ++** reinterpret_s16_f64_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_s16_f64_untied, svint16_t, svfloat64_t, ++ z0 = svreinterpret_s16_f64 (z4), ++ z0 = svreinterpret_s16 (z4)) ++ ++/* ++** reinterpret_s16_s8_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_s16_s8_tied1, svint16_t, svint8_t, ++ z0_res = svreinterpret_s16_s8 (z0), ++ z0_res = svreinterpret_s16 (z0)) ++ ++/* ++** reinterpret_s16_s8_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_s16_s8_untied, svint16_t, svint8_t, ++ z0 = svreinterpret_s16_s8 (z4), ++ z0 = svreinterpret_s16 (z4)) ++ ++/* ++** reinterpret_s16_s16_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_s16_s16_tied1, svint16_t, svint16_t, ++ z0_res = svreinterpret_s16_s16 (z0), ++ z0_res = svreinterpret_s16 (z0)) ++ ++/* ++** reinterpret_s16_s16_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_s16_s16_untied, svint16_t, svint16_t, ++ z0 = svreinterpret_s16_s16 (z4), ++ z0 = svreinterpret_s16 (z4)) ++ ++/* ++** reinterpret_s16_s32_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_s16_s32_tied1, svint16_t, svint32_t, ++ z0_res = svreinterpret_s16_s32 (z0), ++ z0_res = svreinterpret_s16 (z0)) ++ ++/* ++** reinterpret_s16_s32_untied: ++** mov 
z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_s16_s32_untied, svint16_t, svint32_t, ++ z0 = svreinterpret_s16_s32 (z4), ++ z0 = svreinterpret_s16 (z4)) ++ ++/* ++** reinterpret_s16_s64_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_s16_s64_tied1, svint16_t, svint64_t, ++ z0_res = svreinterpret_s16_s64 (z0), ++ z0_res = svreinterpret_s16 (z0)) ++ ++/* ++** reinterpret_s16_s64_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_s16_s64_untied, svint16_t, svint64_t, ++ z0 = svreinterpret_s16_s64 (z4), ++ z0 = svreinterpret_s16 (z4)) ++ ++/* ++** reinterpret_s16_u8_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_s16_u8_tied1, svint16_t, svuint8_t, ++ z0_res = svreinterpret_s16_u8 (z0), ++ z0_res = svreinterpret_s16 (z0)) ++ ++/* ++** reinterpret_s16_u8_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_s16_u8_untied, svint16_t, svuint8_t, ++ z0 = svreinterpret_s16_u8 (z4), ++ z0 = svreinterpret_s16 (z4)) ++ ++/* ++** reinterpret_s16_u16_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_s16_u16_tied1, svint16_t, svuint16_t, ++ z0_res = svreinterpret_s16_u16 (z0), ++ z0_res = svreinterpret_s16 (z0)) ++ ++/* ++** reinterpret_s16_u16_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_s16_u16_untied, svint16_t, svuint16_t, ++ z0 = svreinterpret_s16_u16 (z4), ++ z0 = svreinterpret_s16 (z4)) ++ ++/* ++** reinterpret_s16_u32_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_s16_u32_tied1, svint16_t, svuint32_t, ++ z0_res = svreinterpret_s16_u32 (z0), ++ z0_res = svreinterpret_s16 (z0)) ++ ++/* ++** reinterpret_s16_u32_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_s16_u32_untied, svint16_t, svuint32_t, ++ z0 = svreinterpret_s16_u32 (z4), ++ z0 = svreinterpret_s16 (z4)) ++ ++/* ++** reinterpret_s16_u64_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_s16_u64_tied1, svint16_t, svuint64_t, ++ z0_res = svreinterpret_s16_u64 (z0), ++ z0_res = svreinterpret_s16 (z0)) ++ ++/* ++** reinterpret_s16_u64_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_s16_u64_untied, svint16_t, svuint64_t, ++ z0 = svreinterpret_s16_u64 (z4), ++ z0 = svreinterpret_s16 (z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_s32.c +new file mode 100644 +index 000000000..e1068f244 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_s32.c +@@ -0,0 +1,207 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** reinterpret_s32_bf16_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_s32_bf16_tied1, svint32_t, svbfloat16_t, ++ z0_res = svreinterpret_s32_bf16 (z0), ++ z0_res = svreinterpret_s32 (z0)) ++ ++/* ++** reinterpret_s32_bf16_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_s32_bf16_untied, svint32_t, svbfloat16_t, ++ z0 = svreinterpret_s32_bf16 (z4), ++ z0 = svreinterpret_s32 (z4)) ++ ++/* ++** reinterpret_s32_f16_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_s32_f16_tied1, svint32_t, svfloat16_t, ++ z0_res = svreinterpret_s32_f16 (z0), ++ z0_res = svreinterpret_s32 (z0)) ++ ++/* ++** reinterpret_s32_f16_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_s32_f16_untied, svint32_t, svfloat16_t, ++ z0 = svreinterpret_s32_f16 (z4), ++ z0 = svreinterpret_s32 (z4)) ++ ++/* ++** reinterpret_s32_f32_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_s32_f32_tied1, svint32_t, 
svfloat32_t, ++ z0_res = svreinterpret_s32_f32 (z0), ++ z0_res = svreinterpret_s32 (z0)) ++ ++/* ++** reinterpret_s32_f32_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_s32_f32_untied, svint32_t, svfloat32_t, ++ z0 = svreinterpret_s32_f32 (z4), ++ z0 = svreinterpret_s32 (z4)) ++ ++/* ++** reinterpret_s32_f64_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_s32_f64_tied1, svint32_t, svfloat64_t, ++ z0_res = svreinterpret_s32_f64 (z0), ++ z0_res = svreinterpret_s32 (z0)) ++ ++/* ++** reinterpret_s32_f64_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_s32_f64_untied, svint32_t, svfloat64_t, ++ z0 = svreinterpret_s32_f64 (z4), ++ z0 = svreinterpret_s32 (z4)) ++ ++/* ++** reinterpret_s32_s8_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_s32_s8_tied1, svint32_t, svint8_t, ++ z0_res = svreinterpret_s32_s8 (z0), ++ z0_res = svreinterpret_s32 (z0)) ++ ++/* ++** reinterpret_s32_s8_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_s32_s8_untied, svint32_t, svint8_t, ++ z0 = svreinterpret_s32_s8 (z4), ++ z0 = svreinterpret_s32 (z4)) ++ ++/* ++** reinterpret_s32_s16_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_s32_s16_tied1, svint32_t, svint16_t, ++ z0_res = svreinterpret_s32_s16 (z0), ++ z0_res = svreinterpret_s32 (z0)) ++ ++/* ++** reinterpret_s32_s16_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_s32_s16_untied, svint32_t, svint16_t, ++ z0 = svreinterpret_s32_s16 (z4), ++ z0 = svreinterpret_s32 (z4)) ++ ++/* ++** reinterpret_s32_s32_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_s32_s32_tied1, svint32_t, svint32_t, ++ z0_res = svreinterpret_s32_s32 (z0), ++ z0_res = svreinterpret_s32 (z0)) ++ ++/* ++** reinterpret_s32_s32_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_s32_s32_untied, svint32_t, svint32_t, ++ z0 = svreinterpret_s32_s32 (z4), ++ z0 = svreinterpret_s32 (z4)) ++ ++/* ++** reinterpret_s32_s64_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_s32_s64_tied1, svint32_t, svint64_t, ++ z0_res = svreinterpret_s32_s64 (z0), ++ z0_res = svreinterpret_s32 (z0)) ++ ++/* ++** reinterpret_s32_s64_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_s32_s64_untied, svint32_t, svint64_t, ++ z0 = svreinterpret_s32_s64 (z4), ++ z0 = svreinterpret_s32 (z4)) ++ ++/* ++** reinterpret_s32_u8_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_s32_u8_tied1, svint32_t, svuint8_t, ++ z0_res = svreinterpret_s32_u8 (z0), ++ z0_res = svreinterpret_s32 (z0)) ++ ++/* ++** reinterpret_s32_u8_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_s32_u8_untied, svint32_t, svuint8_t, ++ z0 = svreinterpret_s32_u8 (z4), ++ z0 = svreinterpret_s32 (z4)) ++ ++/* ++** reinterpret_s32_u16_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_s32_u16_tied1, svint32_t, svuint16_t, ++ z0_res = svreinterpret_s32_u16 (z0), ++ z0_res = svreinterpret_s32 (z0)) ++ ++/* ++** reinterpret_s32_u16_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_s32_u16_untied, svint32_t, svuint16_t, ++ z0 = svreinterpret_s32_u16 (z4), ++ z0 = svreinterpret_s32 (z4)) ++ ++/* ++** reinterpret_s32_u32_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_s32_u32_tied1, svint32_t, svuint32_t, ++ z0_res = svreinterpret_s32_u32 (z0), ++ z0_res = svreinterpret_s32 (z0)) ++ ++/* ++** reinterpret_s32_u32_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_s32_u32_untied, svint32_t, svuint32_t, ++ z0 = svreinterpret_s32_u32 (z4), ++ z0 = svreinterpret_s32 
(z4)) ++ ++/* ++** reinterpret_s32_u64_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_s32_u64_tied1, svint32_t, svuint64_t, ++ z0_res = svreinterpret_s32_u64 (z0), ++ z0_res = svreinterpret_s32 (z0)) ++ ++/* ++** reinterpret_s32_u64_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_s32_u64_untied, svint32_t, svuint64_t, ++ z0 = svreinterpret_s32_u64 (z4), ++ z0 = svreinterpret_s32 (z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_s64.c +new file mode 100644 +index 000000000..cada7533c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_s64.c +@@ -0,0 +1,207 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** reinterpret_s64_bf16_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_s64_bf16_tied1, svint64_t, svbfloat16_t, ++ z0_res = svreinterpret_s64_bf16 (z0), ++ z0_res = svreinterpret_s64 (z0)) ++ ++/* ++** reinterpret_s64_bf16_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_s64_bf16_untied, svint64_t, svbfloat16_t, ++ z0 = svreinterpret_s64_bf16 (z4), ++ z0 = svreinterpret_s64 (z4)) ++ ++/* ++** reinterpret_s64_f16_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_s64_f16_tied1, svint64_t, svfloat16_t, ++ z0_res = svreinterpret_s64_f16 (z0), ++ z0_res = svreinterpret_s64 (z0)) ++ ++/* ++** reinterpret_s64_f16_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_s64_f16_untied, svint64_t, svfloat16_t, ++ z0 = svreinterpret_s64_f16 (z4), ++ z0 = svreinterpret_s64 (z4)) ++ ++/* ++** reinterpret_s64_f32_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_s64_f32_tied1, svint64_t, svfloat32_t, ++ z0_res = svreinterpret_s64_f32 (z0), ++ z0_res = svreinterpret_s64 (z0)) ++ ++/* ++** reinterpret_s64_f32_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_s64_f32_untied, svint64_t, svfloat32_t, ++ z0 = svreinterpret_s64_f32 (z4), ++ z0 = svreinterpret_s64 (z4)) ++ ++/* ++** reinterpret_s64_f64_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_s64_f64_tied1, svint64_t, svfloat64_t, ++ z0_res = svreinterpret_s64_f64 (z0), ++ z0_res = svreinterpret_s64 (z0)) ++ ++/* ++** reinterpret_s64_f64_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_s64_f64_untied, svint64_t, svfloat64_t, ++ z0 = svreinterpret_s64_f64 (z4), ++ z0 = svreinterpret_s64 (z4)) ++ ++/* ++** reinterpret_s64_s8_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_s64_s8_tied1, svint64_t, svint8_t, ++ z0_res = svreinterpret_s64_s8 (z0), ++ z0_res = svreinterpret_s64 (z0)) ++ ++/* ++** reinterpret_s64_s8_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_s64_s8_untied, svint64_t, svint8_t, ++ z0 = svreinterpret_s64_s8 (z4), ++ z0 = svreinterpret_s64 (z4)) ++ ++/* ++** reinterpret_s64_s16_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_s64_s16_tied1, svint64_t, svint16_t, ++ z0_res = svreinterpret_s64_s16 (z0), ++ z0_res = svreinterpret_s64 (z0)) ++ ++/* ++** reinterpret_s64_s16_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_s64_s16_untied, svint64_t, svint16_t, ++ z0 = svreinterpret_s64_s16 (z4), ++ z0 = svreinterpret_s64 (z4)) ++ ++/* ++** reinterpret_s64_s32_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_s64_s32_tied1, svint64_t, svint32_t, ++ z0_res = svreinterpret_s64_s32 (z0), ++ z0_res = svreinterpret_s64 (z0)) ++ ++/* ++** reinterpret_s64_s32_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ 
++TEST_DUAL_Z (reinterpret_s64_s32_untied, svint64_t, svint32_t, ++ z0 = svreinterpret_s64_s32 (z4), ++ z0 = svreinterpret_s64 (z4)) ++ ++/* ++** reinterpret_s64_s64_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_s64_s64_tied1, svint64_t, svint64_t, ++ z0_res = svreinterpret_s64_s64 (z0), ++ z0_res = svreinterpret_s64 (z0)) ++ ++/* ++** reinterpret_s64_s64_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_s64_s64_untied, svint64_t, svint64_t, ++ z0 = svreinterpret_s64_s64 (z4), ++ z0 = svreinterpret_s64 (z4)) ++ ++/* ++** reinterpret_s64_u8_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_s64_u8_tied1, svint64_t, svuint8_t, ++ z0_res = svreinterpret_s64_u8 (z0), ++ z0_res = svreinterpret_s64 (z0)) ++ ++/* ++** reinterpret_s64_u8_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_s64_u8_untied, svint64_t, svuint8_t, ++ z0 = svreinterpret_s64_u8 (z4), ++ z0 = svreinterpret_s64 (z4)) ++ ++/* ++** reinterpret_s64_u16_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_s64_u16_tied1, svint64_t, svuint16_t, ++ z0_res = svreinterpret_s64_u16 (z0), ++ z0_res = svreinterpret_s64 (z0)) ++ ++/* ++** reinterpret_s64_u16_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_s64_u16_untied, svint64_t, svuint16_t, ++ z0 = svreinterpret_s64_u16 (z4), ++ z0 = svreinterpret_s64 (z4)) ++ ++/* ++** reinterpret_s64_u32_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_s64_u32_tied1, svint64_t, svuint32_t, ++ z0_res = svreinterpret_s64_u32 (z0), ++ z0_res = svreinterpret_s64 (z0)) ++ ++/* ++** reinterpret_s64_u32_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_s64_u32_untied, svint64_t, svuint32_t, ++ z0 = svreinterpret_s64_u32 (z4), ++ z0 = svreinterpret_s64 (z4)) ++ ++/* ++** reinterpret_s64_u64_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_s64_u64_tied1, svint64_t, svuint64_t, ++ z0_res = svreinterpret_s64_u64 (z0), ++ z0_res = svreinterpret_s64 (z0)) ++ ++/* ++** reinterpret_s64_u64_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_s64_u64_untied, svint64_t, svuint64_t, ++ z0 = svreinterpret_s64_u64 (z4), ++ z0 = svreinterpret_s64 (z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_s8.c +new file mode 100644 +index 000000000..23a40d0ba +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_s8.c +@@ -0,0 +1,207 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** reinterpret_s8_bf16_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_s8_bf16_tied1, svint8_t, svbfloat16_t, ++ z0_res = svreinterpret_s8_bf16 (z0), ++ z0_res = svreinterpret_s8 (z0)) ++ ++/* ++** reinterpret_s8_bf16_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_s8_bf16_untied, svint8_t, svbfloat16_t, ++ z0 = svreinterpret_s8_bf16 (z4), ++ z0 = svreinterpret_s8 (z4)) ++ ++/* ++** reinterpret_s8_f16_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_s8_f16_tied1, svint8_t, svfloat16_t, ++ z0_res = svreinterpret_s8_f16 (z0), ++ z0_res = svreinterpret_s8 (z0)) ++ ++/* ++** reinterpret_s8_f16_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_s8_f16_untied, svint8_t, svfloat16_t, ++ z0 = svreinterpret_s8_f16 (z4), ++ z0 = svreinterpret_s8 (z4)) ++ ++/* ++** reinterpret_s8_f32_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_s8_f32_tied1, svint8_t, svfloat32_t, ++ z0_res = svreinterpret_s8_f32 (z0), ++ z0_res = 
svreinterpret_s8 (z0)) ++ ++/* ++** reinterpret_s8_f32_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_s8_f32_untied, svint8_t, svfloat32_t, ++ z0 = svreinterpret_s8_f32 (z4), ++ z0 = svreinterpret_s8 (z4)) ++ ++/* ++** reinterpret_s8_f64_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_s8_f64_tied1, svint8_t, svfloat64_t, ++ z0_res = svreinterpret_s8_f64 (z0), ++ z0_res = svreinterpret_s8 (z0)) ++ ++/* ++** reinterpret_s8_f64_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_s8_f64_untied, svint8_t, svfloat64_t, ++ z0 = svreinterpret_s8_f64 (z4), ++ z0 = svreinterpret_s8 (z4)) ++ ++/* ++** reinterpret_s8_s8_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_s8_s8_tied1, svint8_t, svint8_t, ++ z0_res = svreinterpret_s8_s8 (z0), ++ z0_res = svreinterpret_s8 (z0)) ++ ++/* ++** reinterpret_s8_s8_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_s8_s8_untied, svint8_t, svint8_t, ++ z0 = svreinterpret_s8_s8 (z4), ++ z0 = svreinterpret_s8 (z4)) ++ ++/* ++** reinterpret_s8_s16_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_s8_s16_tied1, svint8_t, svint16_t, ++ z0_res = svreinterpret_s8_s16 (z0), ++ z0_res = svreinterpret_s8 (z0)) ++ ++/* ++** reinterpret_s8_s16_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_s8_s16_untied, svint8_t, svint16_t, ++ z0 = svreinterpret_s8_s16 (z4), ++ z0 = svreinterpret_s8 (z4)) ++ ++/* ++** reinterpret_s8_s32_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_s8_s32_tied1, svint8_t, svint32_t, ++ z0_res = svreinterpret_s8_s32 (z0), ++ z0_res = svreinterpret_s8 (z0)) ++ ++/* ++** reinterpret_s8_s32_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_s8_s32_untied, svint8_t, svint32_t, ++ z0 = svreinterpret_s8_s32 (z4), ++ z0 = svreinterpret_s8 (z4)) ++ ++/* ++** reinterpret_s8_s64_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_s8_s64_tied1, svint8_t, svint64_t, ++ z0_res = svreinterpret_s8_s64 (z0), ++ z0_res = svreinterpret_s8 (z0)) ++ ++/* ++** reinterpret_s8_s64_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_s8_s64_untied, svint8_t, svint64_t, ++ z0 = svreinterpret_s8_s64 (z4), ++ z0 = svreinterpret_s8 (z4)) ++ ++/* ++** reinterpret_s8_u8_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_s8_u8_tied1, svint8_t, svuint8_t, ++ z0_res = svreinterpret_s8_u8 (z0), ++ z0_res = svreinterpret_s8 (z0)) ++ ++/* ++** reinterpret_s8_u8_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_s8_u8_untied, svint8_t, svuint8_t, ++ z0 = svreinterpret_s8_u8 (z4), ++ z0 = svreinterpret_s8 (z4)) ++ ++/* ++** reinterpret_s8_u16_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_s8_u16_tied1, svint8_t, svuint16_t, ++ z0_res = svreinterpret_s8_u16 (z0), ++ z0_res = svreinterpret_s8 (z0)) ++ ++/* ++** reinterpret_s8_u16_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_s8_u16_untied, svint8_t, svuint16_t, ++ z0 = svreinterpret_s8_u16 (z4), ++ z0 = svreinterpret_s8 (z4)) ++ ++/* ++** reinterpret_s8_u32_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_s8_u32_tied1, svint8_t, svuint32_t, ++ z0_res = svreinterpret_s8_u32 (z0), ++ z0_res = svreinterpret_s8 (z0)) ++ ++/* ++** reinterpret_s8_u32_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_s8_u32_untied, svint8_t, svuint32_t, ++ z0 = svreinterpret_s8_u32 (z4), ++ z0 = svreinterpret_s8 (z4)) ++ ++/* ++** reinterpret_s8_u64_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_s8_u64_tied1, svint8_t, svuint64_t, ++ z0_res = 
svreinterpret_s8_u64 (z0), ++ z0_res = svreinterpret_s8 (z0)) ++ ++/* ++** reinterpret_s8_u64_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_s8_u64_untied, svint8_t, svuint64_t, ++ z0 = svreinterpret_s8_u64 (z4), ++ z0 = svreinterpret_s8 (z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_u16.c +new file mode 100644 +index 000000000..48e8ecaff +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_u16.c +@@ -0,0 +1,207 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** reinterpret_u16_bf16_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_u16_bf16_tied1, svuint16_t, svbfloat16_t, ++ z0_res = svreinterpret_u16_bf16 (z0), ++ z0_res = svreinterpret_u16 (z0)) ++ ++/* ++** reinterpret_u16_bf16_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_u16_bf16_untied, svuint16_t, svbfloat16_t, ++ z0 = svreinterpret_u16_bf16 (z4), ++ z0 = svreinterpret_u16 (z4)) ++ ++/* ++** reinterpret_u16_f16_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_u16_f16_tied1, svuint16_t, svfloat16_t, ++ z0_res = svreinterpret_u16_f16 (z0), ++ z0_res = svreinterpret_u16 (z0)) ++ ++/* ++** reinterpret_u16_f16_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_u16_f16_untied, svuint16_t, svfloat16_t, ++ z0 = svreinterpret_u16_f16 (z4), ++ z0 = svreinterpret_u16 (z4)) ++ ++/* ++** reinterpret_u16_f32_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_u16_f32_tied1, svuint16_t, svfloat32_t, ++ z0_res = svreinterpret_u16_f32 (z0), ++ z0_res = svreinterpret_u16 (z0)) ++ ++/* ++** reinterpret_u16_f32_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_u16_f32_untied, svuint16_t, svfloat32_t, ++ z0 = svreinterpret_u16_f32 (z4), ++ z0 = svreinterpret_u16 (z4)) ++ ++/* ++** reinterpret_u16_f64_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_u16_f64_tied1, svuint16_t, svfloat64_t, ++ z0_res = svreinterpret_u16_f64 (z0), ++ z0_res = svreinterpret_u16 (z0)) ++ ++/* ++** reinterpret_u16_f64_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_u16_f64_untied, svuint16_t, svfloat64_t, ++ z0 = svreinterpret_u16_f64 (z4), ++ z0 = svreinterpret_u16 (z4)) ++ ++/* ++** reinterpret_u16_s8_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_u16_s8_tied1, svuint16_t, svint8_t, ++ z0_res = svreinterpret_u16_s8 (z0), ++ z0_res = svreinterpret_u16 (z0)) ++ ++/* ++** reinterpret_u16_s8_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_u16_s8_untied, svuint16_t, svint8_t, ++ z0 = svreinterpret_u16_s8 (z4), ++ z0 = svreinterpret_u16 (z4)) ++ ++/* ++** reinterpret_u16_s16_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_u16_s16_tied1, svuint16_t, svint16_t, ++ z0_res = svreinterpret_u16_s16 (z0), ++ z0_res = svreinterpret_u16 (z0)) ++ ++/* ++** reinterpret_u16_s16_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_u16_s16_untied, svuint16_t, svint16_t, ++ z0 = svreinterpret_u16_s16 (z4), ++ z0 = svreinterpret_u16 (z4)) ++ ++/* ++** reinterpret_u16_s32_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_u16_s32_tied1, svuint16_t, svint32_t, ++ z0_res = svreinterpret_u16_s32 (z0), ++ z0_res = svreinterpret_u16 (z0)) ++ ++/* ++** reinterpret_u16_s32_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_u16_s32_untied, svuint16_t, svint32_t, ++ z0 = svreinterpret_u16_s32 (z4), ++ z0 = svreinterpret_u16 (z4)) 
++ ++/* ++** reinterpret_u16_s64_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_u16_s64_tied1, svuint16_t, svint64_t, ++ z0_res = svreinterpret_u16_s64 (z0), ++ z0_res = svreinterpret_u16 (z0)) ++ ++/* ++** reinterpret_u16_s64_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_u16_s64_untied, svuint16_t, svint64_t, ++ z0 = svreinterpret_u16_s64 (z4), ++ z0 = svreinterpret_u16 (z4)) ++ ++/* ++** reinterpret_u16_u8_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_u16_u8_tied1, svuint16_t, svuint8_t, ++ z0_res = svreinterpret_u16_u8 (z0), ++ z0_res = svreinterpret_u16 (z0)) ++ ++/* ++** reinterpret_u16_u8_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_u16_u8_untied, svuint16_t, svuint8_t, ++ z0 = svreinterpret_u16_u8 (z4), ++ z0 = svreinterpret_u16 (z4)) ++ ++/* ++** reinterpret_u16_u16_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_u16_u16_tied1, svuint16_t, svuint16_t, ++ z0_res = svreinterpret_u16_u16 (z0), ++ z0_res = svreinterpret_u16 (z0)) ++ ++/* ++** reinterpret_u16_u16_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_u16_u16_untied, svuint16_t, svuint16_t, ++ z0 = svreinterpret_u16_u16 (z4), ++ z0 = svreinterpret_u16 (z4)) ++ ++/* ++** reinterpret_u16_u32_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_u16_u32_tied1, svuint16_t, svuint32_t, ++ z0_res = svreinterpret_u16_u32 (z0), ++ z0_res = svreinterpret_u16 (z0)) ++ ++/* ++** reinterpret_u16_u32_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_u16_u32_untied, svuint16_t, svuint32_t, ++ z0 = svreinterpret_u16_u32 (z4), ++ z0 = svreinterpret_u16 (z4)) ++ ++/* ++** reinterpret_u16_u64_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_u16_u64_tied1, svuint16_t, svuint64_t, ++ z0_res = svreinterpret_u16_u64 (z0), ++ z0_res = svreinterpret_u16 (z0)) ++ ++/* ++** reinterpret_u16_u64_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_u16_u64_untied, svuint16_t, svuint64_t, ++ z0 = svreinterpret_u16_u64 (z4), ++ z0 = svreinterpret_u16 (z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_u32.c +new file mode 100644 +index 000000000..1d4e85712 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_u32.c +@@ -0,0 +1,207 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** reinterpret_u32_bf16_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_u32_bf16_tied1, svuint32_t, svbfloat16_t, ++ z0_res = svreinterpret_u32_bf16 (z0), ++ z0_res = svreinterpret_u32 (z0)) ++ ++/* ++** reinterpret_u32_bf16_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_u32_bf16_untied, svuint32_t, svbfloat16_t, ++ z0 = svreinterpret_u32_bf16 (z4), ++ z0 = svreinterpret_u32 (z4)) ++ ++/* ++** reinterpret_u32_f16_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_u32_f16_tied1, svuint32_t, svfloat16_t, ++ z0_res = svreinterpret_u32_f16 (z0), ++ z0_res = svreinterpret_u32 (z0)) ++ ++/* ++** reinterpret_u32_f16_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_u32_f16_untied, svuint32_t, svfloat16_t, ++ z0 = svreinterpret_u32_f16 (z4), ++ z0 = svreinterpret_u32 (z4)) ++ ++/* ++** reinterpret_u32_f32_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_u32_f32_tied1, svuint32_t, svfloat32_t, ++ z0_res = svreinterpret_u32_f32 (z0), ++ z0_res = svreinterpret_u32 (z0)) ++ ++/* ++** reinterpret_u32_f32_untied: ++** mov z0\.d, z4\.d ++** 
ret ++*/ ++TEST_DUAL_Z (reinterpret_u32_f32_untied, svuint32_t, svfloat32_t, ++ z0 = svreinterpret_u32_f32 (z4), ++ z0 = svreinterpret_u32 (z4)) ++ ++/* ++** reinterpret_u32_f64_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_u32_f64_tied1, svuint32_t, svfloat64_t, ++ z0_res = svreinterpret_u32_f64 (z0), ++ z0_res = svreinterpret_u32 (z0)) ++ ++/* ++** reinterpret_u32_f64_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_u32_f64_untied, svuint32_t, svfloat64_t, ++ z0 = svreinterpret_u32_f64 (z4), ++ z0 = svreinterpret_u32 (z4)) ++ ++/* ++** reinterpret_u32_s8_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_u32_s8_tied1, svuint32_t, svint8_t, ++ z0_res = svreinterpret_u32_s8 (z0), ++ z0_res = svreinterpret_u32 (z0)) ++ ++/* ++** reinterpret_u32_s8_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_u32_s8_untied, svuint32_t, svint8_t, ++ z0 = svreinterpret_u32_s8 (z4), ++ z0 = svreinterpret_u32 (z4)) ++ ++/* ++** reinterpret_u32_s16_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_u32_s16_tied1, svuint32_t, svint16_t, ++ z0_res = svreinterpret_u32_s16 (z0), ++ z0_res = svreinterpret_u32 (z0)) ++ ++/* ++** reinterpret_u32_s16_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_u32_s16_untied, svuint32_t, svint16_t, ++ z0 = svreinterpret_u32_s16 (z4), ++ z0 = svreinterpret_u32 (z4)) ++ ++/* ++** reinterpret_u32_s32_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_u32_s32_tied1, svuint32_t, svint32_t, ++ z0_res = svreinterpret_u32_s32 (z0), ++ z0_res = svreinterpret_u32 (z0)) ++ ++/* ++** reinterpret_u32_s32_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_u32_s32_untied, svuint32_t, svint32_t, ++ z0 = svreinterpret_u32_s32 (z4), ++ z0 = svreinterpret_u32 (z4)) ++ ++/* ++** reinterpret_u32_s64_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_u32_s64_tied1, svuint32_t, svint64_t, ++ z0_res = svreinterpret_u32_s64 (z0), ++ z0_res = svreinterpret_u32 (z0)) ++ ++/* ++** reinterpret_u32_s64_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_u32_s64_untied, svuint32_t, svint64_t, ++ z0 = svreinterpret_u32_s64 (z4), ++ z0 = svreinterpret_u32 (z4)) ++ ++/* ++** reinterpret_u32_u8_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_u32_u8_tied1, svuint32_t, svuint8_t, ++ z0_res = svreinterpret_u32_u8 (z0), ++ z0_res = svreinterpret_u32 (z0)) ++ ++/* ++** reinterpret_u32_u8_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_u32_u8_untied, svuint32_t, svuint8_t, ++ z0 = svreinterpret_u32_u8 (z4), ++ z0 = svreinterpret_u32 (z4)) ++ ++/* ++** reinterpret_u32_u16_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_u32_u16_tied1, svuint32_t, svuint16_t, ++ z0_res = svreinterpret_u32_u16 (z0), ++ z0_res = svreinterpret_u32 (z0)) ++ ++/* ++** reinterpret_u32_u16_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_u32_u16_untied, svuint32_t, svuint16_t, ++ z0 = svreinterpret_u32_u16 (z4), ++ z0 = svreinterpret_u32 (z4)) ++ ++/* ++** reinterpret_u32_u32_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_u32_u32_tied1, svuint32_t, svuint32_t, ++ z0_res = svreinterpret_u32_u32 (z0), ++ z0_res = svreinterpret_u32 (z0)) ++ ++/* ++** reinterpret_u32_u32_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_u32_u32_untied, svuint32_t, svuint32_t, ++ z0 = svreinterpret_u32_u32 (z4), ++ z0 = svreinterpret_u32 (z4)) ++ ++/* ++** reinterpret_u32_u64_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_u32_u64_tied1, svuint32_t, svuint64_t, ++ z0_res = 
svreinterpret_u32_u64 (z0), ++ z0_res = svreinterpret_u32 (z0)) ++ ++/* ++** reinterpret_u32_u64_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_u32_u64_untied, svuint32_t, svuint64_t, ++ z0 = svreinterpret_u32_u64 (z4), ++ z0 = svreinterpret_u32 (z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_u64.c +new file mode 100644 +index 000000000..07af69dce +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_u64.c +@@ -0,0 +1,207 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** reinterpret_u64_bf16_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_u64_bf16_tied1, svuint64_t, svbfloat16_t, ++ z0_res = svreinterpret_u64_bf16 (z0), ++ z0_res = svreinterpret_u64 (z0)) ++ ++/* ++** reinterpret_u64_bf16_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_u64_bf16_untied, svuint64_t, svbfloat16_t, ++ z0 = svreinterpret_u64_bf16 (z4), ++ z0 = svreinterpret_u64 (z4)) ++ ++/* ++** reinterpret_u64_f16_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_u64_f16_tied1, svuint64_t, svfloat16_t, ++ z0_res = svreinterpret_u64_f16 (z0), ++ z0_res = svreinterpret_u64 (z0)) ++ ++/* ++** reinterpret_u64_f16_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_u64_f16_untied, svuint64_t, svfloat16_t, ++ z0 = svreinterpret_u64_f16 (z4), ++ z0 = svreinterpret_u64 (z4)) ++ ++/* ++** reinterpret_u64_f32_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_u64_f32_tied1, svuint64_t, svfloat32_t, ++ z0_res = svreinterpret_u64_f32 (z0), ++ z0_res = svreinterpret_u64 (z0)) ++ ++/* ++** reinterpret_u64_f32_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_u64_f32_untied, svuint64_t, svfloat32_t, ++ z0 = svreinterpret_u64_f32 (z4), ++ z0 = svreinterpret_u64 (z4)) ++ ++/* ++** reinterpret_u64_f64_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_u64_f64_tied1, svuint64_t, svfloat64_t, ++ z0_res = svreinterpret_u64_f64 (z0), ++ z0_res = svreinterpret_u64 (z0)) ++ ++/* ++** reinterpret_u64_f64_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_u64_f64_untied, svuint64_t, svfloat64_t, ++ z0 = svreinterpret_u64_f64 (z4), ++ z0 = svreinterpret_u64 (z4)) ++ ++/* ++** reinterpret_u64_s8_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_u64_s8_tied1, svuint64_t, svint8_t, ++ z0_res = svreinterpret_u64_s8 (z0), ++ z0_res = svreinterpret_u64 (z0)) ++ ++/* ++** reinterpret_u64_s8_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_u64_s8_untied, svuint64_t, svint8_t, ++ z0 = svreinterpret_u64_s8 (z4), ++ z0 = svreinterpret_u64 (z4)) ++ ++/* ++** reinterpret_u64_s16_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_u64_s16_tied1, svuint64_t, svint16_t, ++ z0_res = svreinterpret_u64_s16 (z0), ++ z0_res = svreinterpret_u64 (z0)) ++ ++/* ++** reinterpret_u64_s16_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_u64_s16_untied, svuint64_t, svint16_t, ++ z0 = svreinterpret_u64_s16 (z4), ++ z0 = svreinterpret_u64 (z4)) ++ ++/* ++** reinterpret_u64_s32_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_u64_s32_tied1, svuint64_t, svint32_t, ++ z0_res = svreinterpret_u64_s32 (z0), ++ z0_res = svreinterpret_u64 (z0)) ++ ++/* ++** reinterpret_u64_s32_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_u64_s32_untied, svuint64_t, svint32_t, ++ z0 = svreinterpret_u64_s32 (z4), ++ z0 = 
svreinterpret_u64 (z4)) ++ ++/* ++** reinterpret_u64_s64_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_u64_s64_tied1, svuint64_t, svint64_t, ++ z0_res = svreinterpret_u64_s64 (z0), ++ z0_res = svreinterpret_u64 (z0)) ++ ++/* ++** reinterpret_u64_s64_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_u64_s64_untied, svuint64_t, svint64_t, ++ z0 = svreinterpret_u64_s64 (z4), ++ z0 = svreinterpret_u64 (z4)) ++ ++/* ++** reinterpret_u64_u8_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_u64_u8_tied1, svuint64_t, svuint8_t, ++ z0_res = svreinterpret_u64_u8 (z0), ++ z0_res = svreinterpret_u64 (z0)) ++ ++/* ++** reinterpret_u64_u8_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_u64_u8_untied, svuint64_t, svuint8_t, ++ z0 = svreinterpret_u64_u8 (z4), ++ z0 = svreinterpret_u64 (z4)) ++ ++/* ++** reinterpret_u64_u16_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_u64_u16_tied1, svuint64_t, svuint16_t, ++ z0_res = svreinterpret_u64_u16 (z0), ++ z0_res = svreinterpret_u64 (z0)) ++ ++/* ++** reinterpret_u64_u16_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_u64_u16_untied, svuint64_t, svuint16_t, ++ z0 = svreinterpret_u64_u16 (z4), ++ z0 = svreinterpret_u64 (z4)) ++ ++/* ++** reinterpret_u64_u32_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_u64_u32_tied1, svuint64_t, svuint32_t, ++ z0_res = svreinterpret_u64_u32 (z0), ++ z0_res = svreinterpret_u64 (z0)) ++ ++/* ++** reinterpret_u64_u32_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_u64_u32_untied, svuint64_t, svuint32_t, ++ z0 = svreinterpret_u64_u32 (z4), ++ z0 = svreinterpret_u64 (z4)) ++ ++/* ++** reinterpret_u64_u64_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_u64_u64_tied1, svuint64_t, svuint64_t, ++ z0_res = svreinterpret_u64_u64 (z0), ++ z0_res = svreinterpret_u64 (z0)) ++ ++/* ++** reinterpret_u64_u64_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_u64_u64_untied, svuint64_t, svuint64_t, ++ z0 = svreinterpret_u64_u64 (z4), ++ z0 = svreinterpret_u64 (z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_u8.c +new file mode 100644 +index 000000000..a4c7f4c8d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_u8.c +@@ -0,0 +1,207 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** reinterpret_u8_bf16_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_u8_bf16_tied1, svuint8_t, svbfloat16_t, ++ z0_res = svreinterpret_u8_bf16 (z0), ++ z0_res = svreinterpret_u8 (z0)) ++ ++/* ++** reinterpret_u8_bf16_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_u8_bf16_untied, svuint8_t, svbfloat16_t, ++ z0 = svreinterpret_u8_bf16 (z4), ++ z0 = svreinterpret_u8 (z4)) ++ ++/* ++** reinterpret_u8_f16_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_u8_f16_tied1, svuint8_t, svfloat16_t, ++ z0_res = svreinterpret_u8_f16 (z0), ++ z0_res = svreinterpret_u8 (z0)) ++ ++/* ++** reinterpret_u8_f16_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_u8_f16_untied, svuint8_t, svfloat16_t, ++ z0 = svreinterpret_u8_f16 (z4), ++ z0 = svreinterpret_u8 (z4)) ++ ++/* ++** reinterpret_u8_f32_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_u8_f32_tied1, svuint8_t, svfloat32_t, ++ z0_res = svreinterpret_u8_f32 (z0), ++ z0_res = svreinterpret_u8 (z0)) ++ ++/* ++** reinterpret_u8_f32_untied: ++** mov z0\.d, z4\.d ++** ret 
++*/ ++TEST_DUAL_Z (reinterpret_u8_f32_untied, svuint8_t, svfloat32_t, ++ z0 = svreinterpret_u8_f32 (z4), ++ z0 = svreinterpret_u8 (z4)) ++ ++/* ++** reinterpret_u8_f64_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_u8_f64_tied1, svuint8_t, svfloat64_t, ++ z0_res = svreinterpret_u8_f64 (z0), ++ z0_res = svreinterpret_u8 (z0)) ++ ++/* ++** reinterpret_u8_f64_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_u8_f64_untied, svuint8_t, svfloat64_t, ++ z0 = svreinterpret_u8_f64 (z4), ++ z0 = svreinterpret_u8 (z4)) ++ ++/* ++** reinterpret_u8_s8_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_u8_s8_tied1, svuint8_t, svint8_t, ++ z0_res = svreinterpret_u8_s8 (z0), ++ z0_res = svreinterpret_u8 (z0)) ++ ++/* ++** reinterpret_u8_s8_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_u8_s8_untied, svuint8_t, svint8_t, ++ z0 = svreinterpret_u8_s8 (z4), ++ z0 = svreinterpret_u8 (z4)) ++ ++/* ++** reinterpret_u8_s16_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_u8_s16_tied1, svuint8_t, svint16_t, ++ z0_res = svreinterpret_u8_s16 (z0), ++ z0_res = svreinterpret_u8 (z0)) ++ ++/* ++** reinterpret_u8_s16_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_u8_s16_untied, svuint8_t, svint16_t, ++ z0 = svreinterpret_u8_s16 (z4), ++ z0 = svreinterpret_u8 (z4)) ++ ++/* ++** reinterpret_u8_s32_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_u8_s32_tied1, svuint8_t, svint32_t, ++ z0_res = svreinterpret_u8_s32 (z0), ++ z0_res = svreinterpret_u8 (z0)) ++ ++/* ++** reinterpret_u8_s32_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_u8_s32_untied, svuint8_t, svint32_t, ++ z0 = svreinterpret_u8_s32 (z4), ++ z0 = svreinterpret_u8 (z4)) ++ ++/* ++** reinterpret_u8_s64_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_u8_s64_tied1, svuint8_t, svint64_t, ++ z0_res = svreinterpret_u8_s64 (z0), ++ z0_res = svreinterpret_u8 (z0)) ++ ++/* ++** reinterpret_u8_s64_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_u8_s64_untied, svuint8_t, svint64_t, ++ z0 = svreinterpret_u8_s64 (z4), ++ z0 = svreinterpret_u8 (z4)) ++ ++/* ++** reinterpret_u8_u8_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_u8_u8_tied1, svuint8_t, svuint8_t, ++ z0_res = svreinterpret_u8_u8 (z0), ++ z0_res = svreinterpret_u8 (z0)) ++ ++/* ++** reinterpret_u8_u8_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_u8_u8_untied, svuint8_t, svuint8_t, ++ z0 = svreinterpret_u8_u8 (z4), ++ z0 = svreinterpret_u8 (z4)) ++ ++/* ++** reinterpret_u8_u16_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_u8_u16_tied1, svuint8_t, svuint16_t, ++ z0_res = svreinterpret_u8_u16 (z0), ++ z0_res = svreinterpret_u8 (z0)) ++ ++/* ++** reinterpret_u8_u16_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_u8_u16_untied, svuint8_t, svuint16_t, ++ z0 = svreinterpret_u8_u16 (z4), ++ z0 = svreinterpret_u8 (z4)) ++ ++/* ++** reinterpret_u8_u32_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_u8_u32_tied1, svuint8_t, svuint32_t, ++ z0_res = svreinterpret_u8_u32 (z0), ++ z0_res = svreinterpret_u8 (z0)) ++ ++/* ++** reinterpret_u8_u32_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_u8_u32_untied, svuint8_t, svuint32_t, ++ z0 = svreinterpret_u8_u32 (z4), ++ z0 = svreinterpret_u8 (z4)) ++ ++/* ++** reinterpret_u8_u64_tied1: ++** ret ++*/ ++TEST_DUAL_Z_REV (reinterpret_u8_u64_tied1, svuint8_t, svuint64_t, ++ z0_res = svreinterpret_u8_u64 (z0), ++ z0_res = svreinterpret_u8 (z0)) ++ ++/* ++** 
reinterpret_u8_u64_untied: ++** mov z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (reinterpret_u8_u64_untied, svuint8_t, svuint64_t, ++ z0 = svreinterpret_u8_u64 (z4), ++ z0 = svreinterpret_u8 (z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_b16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_b16.c +new file mode 100644 +index 000000000..7d5c67d5c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_b16.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rev_b16_tied1: ++** rev p0\.h, p0\.h ++** ret ++*/ ++TEST_UNIFORM_P (rev_b16_tied1, ++ p0 = svrev_b16 (p0), ++ p0 = svrev_b16 (p0)) ++ ++/* ++** rev_b16_untied: ++** rev p0\.h, p1\.h ++** ret ++*/ ++TEST_UNIFORM_P (rev_b16_untied, ++ p0 = svrev_b16 (p1), ++ p0 = svrev_b16 (p1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_b32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_b32.c +new file mode 100644 +index 000000000..3f8c810c0 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_b32.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rev_b32_tied1: ++** rev p0\.s, p0\.s ++** ret ++*/ ++TEST_UNIFORM_P (rev_b32_tied1, ++ p0 = svrev_b32 (p0), ++ p0 = svrev_b32 (p0)) ++ ++/* ++** rev_b32_untied: ++** rev p0\.s, p1\.s ++** ret ++*/ ++TEST_UNIFORM_P (rev_b32_untied, ++ p0 = svrev_b32 (p1), ++ p0 = svrev_b32 (p1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_b64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_b64.c +new file mode 100644 +index 000000000..fe937ecc6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_b64.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rev_b64_tied1: ++** rev p0\.d, p0\.d ++** ret ++*/ ++TEST_UNIFORM_P (rev_b64_tied1, ++ p0 = svrev_b64 (p0), ++ p0 = svrev_b64 (p0)) ++ ++/* ++** rev_b64_untied: ++** rev p0\.d, p1\.d ++** ret ++*/ ++TEST_UNIFORM_P (rev_b64_untied, ++ p0 = svrev_b64 (p1), ++ p0 = svrev_b64 (p1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_b8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_b8.c +new file mode 100644 +index 000000000..d23e50407 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_b8.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rev_b8_tied1: ++** rev p0\.b, p0\.b ++** ret ++*/ ++TEST_UNIFORM_P (rev_b8_tied1, ++ p0 = svrev_b8 (p0), ++ p0 = svrev_b8 (p0)) ++ ++/* ++** rev_b8_untied: ++** rev p0\.b, p1\.b ++** ret ++*/ ++TEST_UNIFORM_P (rev_b8_untied, ++ p0 = svrev_b8 (p1), ++ p0 = svrev_b8 (p1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_bf16.c +new file mode 100644 +index 000000000..fe587d42c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_bf16.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rev_bf16_tied1: ++** rev z0\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rev_bf16_tied1, svbfloat16_t, ++ z0 = svrev_bf16 (z0), ++ z0 = svrev (z0)) ++ ++/* ++** rev_bf16_untied: ++** rev z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rev_bf16_untied, svbfloat16_t, ++ z0 = svrev_bf16 (z1), ++ z0 = svrev (z1)) +diff 
--git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_f16.c +new file mode 100644 +index 000000000..321e2f900 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_f16.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rev_f16_tied1: ++** rev z0\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rev_f16_tied1, svfloat16_t, ++ z0 = svrev_f16 (z0), ++ z0 = svrev (z0)) ++ ++/* ++** rev_f16_untied: ++** rev z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rev_f16_untied, svfloat16_t, ++ z0 = svrev_f16 (z1), ++ z0 = svrev (z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_f32.c +new file mode 100644 +index 000000000..6f31928b5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_f32.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rev_f32_tied1: ++** rev z0\.s, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rev_f32_tied1, svfloat32_t, ++ z0 = svrev_f32 (z0), ++ z0 = svrev (z0)) ++ ++/* ++** rev_f32_untied: ++** rev z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rev_f32_untied, svfloat32_t, ++ z0 = svrev_f32 (z1), ++ z0 = svrev (z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_f64.c +new file mode 100644 +index 000000000..6f14078a7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_f64.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rev_f64_tied1: ++** rev z0\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rev_f64_tied1, svfloat64_t, ++ z0 = svrev_f64 (z0), ++ z0 = svrev (z0)) ++ ++/* ++** rev_f64_untied: ++** rev z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rev_f64_untied, svfloat64_t, ++ z0 = svrev_f64 (z1), ++ z0 = svrev (z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_s16.c +new file mode 100644 +index 000000000..63f6ea73c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_s16.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rev_s16_tied1: ++** rev z0\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rev_s16_tied1, svint16_t, ++ z0 = svrev_s16 (z0), ++ z0 = svrev (z0)) ++ ++/* ++** rev_s16_untied: ++** rev z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rev_s16_untied, svint16_t, ++ z0 = svrev_s16 (z1), ++ z0 = svrev (z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_s32.c +new file mode 100644 +index 000000000..38240b7ec +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_s32.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rev_s32_tied1: ++** rev z0\.s, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rev_s32_tied1, svint32_t, ++ z0 = svrev_s32 (z0), ++ z0 = svrev (z0)) ++ ++/* ++** rev_s32_untied: ++** rev z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rev_s32_untied, svint32_t, ++ z0 = svrev_s32 (z1), ++ z0 = svrev (z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_s64.c +new file mode 
100644 +index 000000000..0004e4586 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_s64.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rev_s64_tied1: ++** rev z0\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rev_s64_tied1, svint64_t, ++ z0 = svrev_s64 (z0), ++ z0 = svrev (z0)) ++ ++/* ++** rev_s64_untied: ++** rev z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rev_s64_untied, svint64_t, ++ z0 = svrev_s64 (z1), ++ z0 = svrev (z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_s8.c +new file mode 100644 +index 000000000..44b874c92 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_s8.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rev_s8_tied1: ++** rev z0\.b, z0\.b ++** ret ++*/ ++TEST_UNIFORM_Z (rev_s8_tied1, svint8_t, ++ z0 = svrev_s8 (z0), ++ z0 = svrev (z0)) ++ ++/* ++** rev_s8_untied: ++** rev z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (rev_s8_untied, svint8_t, ++ z0 = svrev_s8 (z1), ++ z0 = svrev (z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_u16.c +new file mode 100644 +index 000000000..2b4c88854 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_u16.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rev_u16_tied1: ++** rev z0\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rev_u16_tied1, svuint16_t, ++ z0 = svrev_u16 (z0), ++ z0 = svrev (z0)) ++ ++/* ++** rev_u16_untied: ++** rev z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rev_u16_untied, svuint16_t, ++ z0 = svrev_u16 (z1), ++ z0 = svrev (z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_u32.c +new file mode 100644 +index 000000000..e14351f30 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_u32.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rev_u32_tied1: ++** rev z0\.s, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rev_u32_tied1, svuint32_t, ++ z0 = svrev_u32 (z0), ++ z0 = svrev (z0)) ++ ++/* ++** rev_u32_untied: ++** rev z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rev_u32_untied, svuint32_t, ++ z0 = svrev_u32 (z1), ++ z0 = svrev (z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_u64.c +new file mode 100644 +index 000000000..5fc987475 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_u64.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rev_u64_tied1: ++** rev z0\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rev_u64_tied1, svuint64_t, ++ z0 = svrev_u64 (z0), ++ z0 = svrev (z0)) ++ ++/* ++** rev_u64_untied: ++** rev z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rev_u64_untied, svuint64_t, ++ z0 = svrev_u64 (z1), ++ z0 = svrev (z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_u8.c +new file mode 100644 +index 000000000..9dd4f440b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_u8.c +@@ -0,0 +1,21 @@ ++/* { dg-final { 
check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rev_u8_tied1: ++** rev z0\.b, z0\.b ++** ret ++*/ ++TEST_UNIFORM_Z (rev_u8_tied1, svuint8_t, ++ z0 = svrev_u8 (z0), ++ z0 = svrev (z0)) ++ ++/* ++** rev_u8_untied: ++** rev z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (rev_u8_untied, svuint8_t, ++ z0 = svrev_u8 (z1), ++ z0 = svrev (z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revb_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revb_s16.c +new file mode 100644 +index 000000000..ecfabe668 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revb_s16.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** revb_s16_m_tied12: ++** revb z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (revb_s16_m_tied12, svint16_t, ++ z0 = svrevb_s16_m (z0, p0, z0), ++ z0 = svrevb_m (z0, p0, z0)) ++ ++/* ++** revb_s16_m_tied1: ++** revb z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (revb_s16_m_tied1, svint16_t, ++ z0 = svrevb_s16_m (z0, p0, z1), ++ z0 = svrevb_m (z0, p0, z1)) ++ ++/* ++** revb_s16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** revb z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (revb_s16_m_tied2, svint16_t, ++ z0 = svrevb_s16_m (z1, p0, z0), ++ z0 = svrevb_m (z1, p0, z0)) ++ ++/* ++** revb_s16_m_untied: ++** movprfx z0, z2 ++** revb z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (revb_s16_m_untied, svint16_t, ++ z0 = svrevb_s16_m (z2, p0, z1), ++ z0 = svrevb_m (z2, p0, z1)) ++ ++/* ++** revb_s16_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.h, p0/z, \1\.h ++** revb z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (revb_s16_z_tied1, svint16_t, ++ z0 = svrevb_s16_z (p0, z0), ++ z0 = svrevb_z (p0, z0)) ++ ++/* ++** revb_s16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** revb z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (revb_s16_z_untied, svint16_t, ++ z0 = svrevb_s16_z (p0, z1), ++ z0 = svrevb_z (p0, z1)) ++ ++/* ++** revb_s16_x_tied1: ++** revb z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (revb_s16_x_tied1, svint16_t, ++ z0 = svrevb_s16_x (p0, z0), ++ z0 = svrevb_x (p0, z0)) ++ ++/* ++** revb_s16_x_untied: ++** revb z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (revb_s16_x_untied, svint16_t, ++ z0 = svrevb_s16_x (p0, z1), ++ z0 = svrevb_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revb_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revb_s32.c +new file mode 100644 +index 000000000..a46a81973 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revb_s32.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** revb_s32_m_tied12: ++** revb z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (revb_s32_m_tied12, svint32_t, ++ z0 = svrevb_s32_m (z0, p0, z0), ++ z0 = svrevb_m (z0, p0, z0)) ++ ++/* ++** revb_s32_m_tied1: ++** revb z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (revb_s32_m_tied1, svint32_t, ++ z0 = svrevb_s32_m (z0, p0, z1), ++ z0 = svrevb_m (z0, p0, z1)) ++ ++/* ++** revb_s32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** revb z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (revb_s32_m_tied2, svint32_t, ++ z0 = svrevb_s32_m (z1, p0, z0), ++ z0 = svrevb_m (z1, p0, z0)) ++ ++/* ++** revb_s32_m_untied: ++** movprfx z0, z2 ++** revb z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (revb_s32_m_untied, svint32_t, ++ z0 = svrevb_s32_m (z2, 
p0, z1), ++ z0 = svrevb_m (z2, p0, z1)) ++ ++/* ++** revb_s32_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, \1\.s ++** revb z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (revb_s32_z_tied1, svint32_t, ++ z0 = svrevb_s32_z (p0, z0), ++ z0 = svrevb_z (p0, z0)) ++ ++/* ++** revb_s32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** revb z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (revb_s32_z_untied, svint32_t, ++ z0 = svrevb_s32_z (p0, z1), ++ z0 = svrevb_z (p0, z1)) ++ ++/* ++** revb_s32_x_tied1: ++** revb z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (revb_s32_x_tied1, svint32_t, ++ z0 = svrevb_s32_x (p0, z0), ++ z0 = svrevb_x (p0, z0)) ++ ++/* ++** revb_s32_x_untied: ++** revb z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (revb_s32_x_untied, svint32_t, ++ z0 = svrevb_s32_x (p0, z1), ++ z0 = svrevb_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revb_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revb_s64.c +new file mode 100644 +index 000000000..21547238c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revb_s64.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** revb_s64_m_tied12: ++** revb z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (revb_s64_m_tied12, svint64_t, ++ z0 = svrevb_s64_m (z0, p0, z0), ++ z0 = svrevb_m (z0, p0, z0)) ++ ++/* ++** revb_s64_m_tied1: ++** revb z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (revb_s64_m_tied1, svint64_t, ++ z0 = svrevb_s64_m (z0, p0, z1), ++ z0 = svrevb_m (z0, p0, z1)) ++ ++/* ++** revb_s64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** revb z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (revb_s64_m_tied2, svint64_t, ++ z0 = svrevb_s64_m (z1, p0, z0), ++ z0 = svrevb_m (z1, p0, z0)) ++ ++/* ++** revb_s64_m_untied: ++** movprfx z0, z2 ++** revb z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (revb_s64_m_untied, svint64_t, ++ z0 = svrevb_s64_m (z2, p0, z1), ++ z0 = svrevb_m (z2, p0, z1)) ++ ++/* ++** revb_s64_z_tied1: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** revb z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (revb_s64_z_tied1, svint64_t, ++ z0 = svrevb_s64_z (p0, z0), ++ z0 = svrevb_z (p0, z0)) ++ ++/* ++** revb_s64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** revb z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (revb_s64_z_untied, svint64_t, ++ z0 = svrevb_s64_z (p0, z1), ++ z0 = svrevb_z (p0, z1)) ++ ++/* ++** revb_s64_x_tied1: ++** revb z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (revb_s64_x_tied1, svint64_t, ++ z0 = svrevb_s64_x (p0, z0), ++ z0 = svrevb_x (p0, z0)) ++ ++/* ++** revb_s64_x_untied: ++** revb z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (revb_s64_x_untied, svint64_t, ++ z0 = svrevb_s64_x (p0, z1), ++ z0 = svrevb_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revb_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revb_u16.c +new file mode 100644 +index 000000000..d58bd3d74 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revb_u16.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** revb_u16_m_tied12: ++** revb z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (revb_u16_m_tied12, svuint16_t, ++ z0 = svrevb_u16_m (z0, p0, z0), ++ z0 = svrevb_m (z0, p0, z0)) ++ ++/* ++** revb_u16_m_tied1: ++** revb z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (revb_u16_m_tied1, svuint16_t, ++ z0 = 
svrevb_u16_m (z0, p0, z1), ++ z0 = svrevb_m (z0, p0, z1)) ++ ++/* ++** revb_u16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** revb z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (revb_u16_m_tied2, svuint16_t, ++ z0 = svrevb_u16_m (z1, p0, z0), ++ z0 = svrevb_m (z1, p0, z0)) ++ ++/* ++** revb_u16_m_untied: ++** movprfx z0, z2 ++** revb z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (revb_u16_m_untied, svuint16_t, ++ z0 = svrevb_u16_m (z2, p0, z1), ++ z0 = svrevb_m (z2, p0, z1)) ++ ++/* ++** revb_u16_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.h, p0/z, \1\.h ++** revb z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (revb_u16_z_tied1, svuint16_t, ++ z0 = svrevb_u16_z (p0, z0), ++ z0 = svrevb_z (p0, z0)) ++ ++/* ++** revb_u16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** revb z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (revb_u16_z_untied, svuint16_t, ++ z0 = svrevb_u16_z (p0, z1), ++ z0 = svrevb_z (p0, z1)) ++ ++/* ++** revb_u16_x_tied1: ++** revb z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (revb_u16_x_tied1, svuint16_t, ++ z0 = svrevb_u16_x (p0, z0), ++ z0 = svrevb_x (p0, z0)) ++ ++/* ++** revb_u16_x_untied: ++** revb z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (revb_u16_x_untied, svuint16_t, ++ z0 = svrevb_u16_x (p0, z1), ++ z0 = svrevb_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revb_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revb_u32.c +new file mode 100644 +index 000000000..33df990d5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revb_u32.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** revb_u32_m_tied12: ++** revb z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (revb_u32_m_tied12, svuint32_t, ++ z0 = svrevb_u32_m (z0, p0, z0), ++ z0 = svrevb_m (z0, p0, z0)) ++ ++/* ++** revb_u32_m_tied1: ++** revb z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (revb_u32_m_tied1, svuint32_t, ++ z0 = svrevb_u32_m (z0, p0, z1), ++ z0 = svrevb_m (z0, p0, z1)) ++ ++/* ++** revb_u32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** revb z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (revb_u32_m_tied2, svuint32_t, ++ z0 = svrevb_u32_m (z1, p0, z0), ++ z0 = svrevb_m (z1, p0, z0)) ++ ++/* ++** revb_u32_m_untied: ++** movprfx z0, z2 ++** revb z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (revb_u32_m_untied, svuint32_t, ++ z0 = svrevb_u32_m (z2, p0, z1), ++ z0 = svrevb_m (z2, p0, z1)) ++ ++/* ++** revb_u32_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, \1\.s ++** revb z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (revb_u32_z_tied1, svuint32_t, ++ z0 = svrevb_u32_z (p0, z0), ++ z0 = svrevb_z (p0, z0)) ++ ++/* ++** revb_u32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** revb z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (revb_u32_z_untied, svuint32_t, ++ z0 = svrevb_u32_z (p0, z1), ++ z0 = svrevb_z (p0, z1)) ++ ++/* ++** revb_u32_x_tied1: ++** revb z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (revb_u32_x_tied1, svuint32_t, ++ z0 = svrevb_u32_x (p0, z0), ++ z0 = svrevb_x (p0, z0)) ++ ++/* ++** revb_u32_x_untied: ++** revb z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (revb_u32_x_untied, svuint32_t, ++ z0 = svrevb_u32_x (p0, z1), ++ z0 = svrevb_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revb_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revb_u64.c +new file mode 100644 +index 000000000..50ad618cc +--- /dev/null ++++ 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revb_u64.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** revb_u64_m_tied12: ++** revb z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (revb_u64_m_tied12, svuint64_t, ++ z0 = svrevb_u64_m (z0, p0, z0), ++ z0 = svrevb_m (z0, p0, z0)) ++ ++/* ++** revb_u64_m_tied1: ++** revb z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (revb_u64_m_tied1, svuint64_t, ++ z0 = svrevb_u64_m (z0, p0, z1), ++ z0 = svrevb_m (z0, p0, z1)) ++ ++/* ++** revb_u64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** revb z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (revb_u64_m_tied2, svuint64_t, ++ z0 = svrevb_u64_m (z1, p0, z0), ++ z0 = svrevb_m (z1, p0, z0)) ++ ++/* ++** revb_u64_m_untied: ++** movprfx z0, z2 ++** revb z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (revb_u64_m_untied, svuint64_t, ++ z0 = svrevb_u64_m (z2, p0, z1), ++ z0 = svrevb_m (z2, p0, z1)) ++ ++/* ++** revb_u64_z_tied1: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** revb z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (revb_u64_z_tied1, svuint64_t, ++ z0 = svrevb_u64_z (p0, z0), ++ z0 = svrevb_z (p0, z0)) ++ ++/* ++** revb_u64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** revb z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (revb_u64_z_untied, svuint64_t, ++ z0 = svrevb_u64_z (p0, z1), ++ z0 = svrevb_z (p0, z1)) ++ ++/* ++** revb_u64_x_tied1: ++** revb z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (revb_u64_x_tied1, svuint64_t, ++ z0 = svrevb_u64_x (p0, z0), ++ z0 = svrevb_x (p0, z0)) ++ ++/* ++** revb_u64_x_untied: ++** revb z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (revb_u64_x_untied, svuint64_t, ++ z0 = svrevb_u64_x (p0, z1), ++ z0 = svrevb_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revh_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revh_s32.c +new file mode 100644 +index 000000000..07d512ddb +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revh_s32.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** revh_s32_m_tied12: ++** revh z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (revh_s32_m_tied12, svint32_t, ++ z0 = svrevh_s32_m (z0, p0, z0), ++ z0 = svrevh_m (z0, p0, z0)) ++ ++/* ++** revh_s32_m_tied1: ++** revh z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (revh_s32_m_tied1, svint32_t, ++ z0 = svrevh_s32_m (z0, p0, z1), ++ z0 = svrevh_m (z0, p0, z1)) ++ ++/* ++** revh_s32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** revh z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (revh_s32_m_tied2, svint32_t, ++ z0 = svrevh_s32_m (z1, p0, z0), ++ z0 = svrevh_m (z1, p0, z0)) ++ ++/* ++** revh_s32_m_untied: ++** movprfx z0, z2 ++** revh z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (revh_s32_m_untied, svint32_t, ++ z0 = svrevh_s32_m (z2, p0, z1), ++ z0 = svrevh_m (z2, p0, z1)) ++ ++/* ++** revh_s32_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, \1\.s ++** revh z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (revh_s32_z_tied1, svint32_t, ++ z0 = svrevh_s32_z (p0, z0), ++ z0 = svrevh_z (p0, z0)) ++ ++/* ++** revh_s32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** revh z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (revh_s32_z_untied, svint32_t, ++ z0 = svrevh_s32_z (p0, z1), ++ z0 = svrevh_z (p0, z1)) ++ ++/* ++** revh_s32_x_tied1: ++** revh z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z 
(revh_s32_x_tied1, svint32_t, ++ z0 = svrevh_s32_x (p0, z0), ++ z0 = svrevh_x (p0, z0)) ++ ++/* ++** revh_s32_x_untied: ++** revh z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (revh_s32_x_untied, svint32_t, ++ z0 = svrevh_s32_x (p0, z1), ++ z0 = svrevh_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revh_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revh_s64.c +new file mode 100644 +index 000000000..b1446347c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revh_s64.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** revh_s64_m_tied12: ++** revh z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (revh_s64_m_tied12, svint64_t, ++ z0 = svrevh_s64_m (z0, p0, z0), ++ z0 = svrevh_m (z0, p0, z0)) ++ ++/* ++** revh_s64_m_tied1: ++** revh z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (revh_s64_m_tied1, svint64_t, ++ z0 = svrevh_s64_m (z0, p0, z1), ++ z0 = svrevh_m (z0, p0, z1)) ++ ++/* ++** revh_s64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** revh z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (revh_s64_m_tied2, svint64_t, ++ z0 = svrevh_s64_m (z1, p0, z0), ++ z0 = svrevh_m (z1, p0, z0)) ++ ++/* ++** revh_s64_m_untied: ++** movprfx z0, z2 ++** revh z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (revh_s64_m_untied, svint64_t, ++ z0 = svrevh_s64_m (z2, p0, z1), ++ z0 = svrevh_m (z2, p0, z1)) ++ ++/* ++** revh_s64_z_tied1: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** revh z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (revh_s64_z_tied1, svint64_t, ++ z0 = svrevh_s64_z (p0, z0), ++ z0 = svrevh_z (p0, z0)) ++ ++/* ++** revh_s64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** revh z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (revh_s64_z_untied, svint64_t, ++ z0 = svrevh_s64_z (p0, z1), ++ z0 = svrevh_z (p0, z1)) ++ ++/* ++** revh_s64_x_tied1: ++** revh z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (revh_s64_x_tied1, svint64_t, ++ z0 = svrevh_s64_x (p0, z0), ++ z0 = svrevh_x (p0, z0)) ++ ++/* ++** revh_s64_x_untied: ++** revh z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (revh_s64_x_untied, svint64_t, ++ z0 = svrevh_s64_x (p0, z1), ++ z0 = svrevh_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revh_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revh_u32.c +new file mode 100644 +index 000000000..9ea51884d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revh_u32.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** revh_u32_m_tied12: ++** revh z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (revh_u32_m_tied12, svuint32_t, ++ z0 = svrevh_u32_m (z0, p0, z0), ++ z0 = svrevh_m (z0, p0, z0)) ++ ++/* ++** revh_u32_m_tied1: ++** revh z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (revh_u32_m_tied1, svuint32_t, ++ z0 = svrevh_u32_m (z0, p0, z1), ++ z0 = svrevh_m (z0, p0, z1)) ++ ++/* ++** revh_u32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** revh z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (revh_u32_m_tied2, svuint32_t, ++ z0 = svrevh_u32_m (z1, p0, z0), ++ z0 = svrevh_m (z1, p0, z0)) ++ ++/* ++** revh_u32_m_untied: ++** movprfx z0, z2 ++** revh z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (revh_u32_m_untied, svuint32_t, ++ z0 = svrevh_u32_m (z2, p0, z1), ++ z0 = svrevh_m (z2, p0, z1)) ++ ++/* ++** revh_u32_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, 
p0/z, \1\.s ++** revh z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (revh_u32_z_tied1, svuint32_t, ++ z0 = svrevh_u32_z (p0, z0), ++ z0 = svrevh_z (p0, z0)) ++ ++/* ++** revh_u32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** revh z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (revh_u32_z_untied, svuint32_t, ++ z0 = svrevh_u32_z (p0, z1), ++ z0 = svrevh_z (p0, z1)) ++ ++/* ++** revh_u32_x_tied1: ++** revh z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (revh_u32_x_tied1, svuint32_t, ++ z0 = svrevh_u32_x (p0, z0), ++ z0 = svrevh_x (p0, z0)) ++ ++/* ++** revh_u32_x_untied: ++** revh z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (revh_u32_x_untied, svuint32_t, ++ z0 = svrevh_u32_x (p0, z1), ++ z0 = svrevh_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revh_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revh_u64.c +new file mode 100644 +index 000000000..7b2da2701 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revh_u64.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** revh_u64_m_tied12: ++** revh z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (revh_u64_m_tied12, svuint64_t, ++ z0 = svrevh_u64_m (z0, p0, z0), ++ z0 = svrevh_m (z0, p0, z0)) ++ ++/* ++** revh_u64_m_tied1: ++** revh z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (revh_u64_m_tied1, svuint64_t, ++ z0 = svrevh_u64_m (z0, p0, z1), ++ z0 = svrevh_m (z0, p0, z1)) ++ ++/* ++** revh_u64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** revh z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (revh_u64_m_tied2, svuint64_t, ++ z0 = svrevh_u64_m (z1, p0, z0), ++ z0 = svrevh_m (z1, p0, z0)) ++ ++/* ++** revh_u64_m_untied: ++** movprfx z0, z2 ++** revh z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (revh_u64_m_untied, svuint64_t, ++ z0 = svrevh_u64_m (z2, p0, z1), ++ z0 = svrevh_m (z2, p0, z1)) ++ ++/* ++** revh_u64_z_tied1: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** revh z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (revh_u64_z_tied1, svuint64_t, ++ z0 = svrevh_u64_z (p0, z0), ++ z0 = svrevh_z (p0, z0)) ++ ++/* ++** revh_u64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** revh z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (revh_u64_z_untied, svuint64_t, ++ z0 = svrevh_u64_z (p0, z1), ++ z0 = svrevh_z (p0, z1)) ++ ++/* ++** revh_u64_x_tied1: ++** revh z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (revh_u64_x_tied1, svuint64_t, ++ z0 = svrevh_u64_x (p0, z0), ++ z0 = svrevh_x (p0, z0)) ++ ++/* ++** revh_u64_x_untied: ++** revh z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (revh_u64_x_untied, svuint64_t, ++ z0 = svrevh_u64_x (p0, z1), ++ z0 = svrevh_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revw_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revw_s64.c +new file mode 100644 +index 000000000..26ca0f0bd +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revw_s64.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** revw_s64_m_tied12: ++** revw z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (revw_s64_m_tied12, svint64_t, ++ z0 = svrevw_s64_m (z0, p0, z0), ++ z0 = svrevw_m (z0, p0, z0)) ++ ++/* ++** revw_s64_m_tied1: ++** revw z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (revw_s64_m_tied1, svint64_t, ++ z0 = svrevw_s64_m (z0, p0, z1), ++ z0 = svrevw_m (z0, p0, z1)) ++ ++/* ++** revw_s64_m_tied2: ++** mov 
(z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** revw z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (revw_s64_m_tied2, svint64_t, ++ z0 = svrevw_s64_m (z1, p0, z0), ++ z0 = svrevw_m (z1, p0, z0)) ++ ++/* ++** revw_s64_m_untied: ++** movprfx z0, z2 ++** revw z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (revw_s64_m_untied, svint64_t, ++ z0 = svrevw_s64_m (z2, p0, z1), ++ z0 = svrevw_m (z2, p0, z1)) ++ ++/* ++** revw_s64_z_tied1: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** revw z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (revw_s64_z_tied1, svint64_t, ++ z0 = svrevw_s64_z (p0, z0), ++ z0 = svrevw_z (p0, z0)) ++ ++/* ++** revw_s64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** revw z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (revw_s64_z_untied, svint64_t, ++ z0 = svrevw_s64_z (p0, z1), ++ z0 = svrevw_z (p0, z1)) ++ ++/* ++** revw_s64_x_tied1: ++** revw z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (revw_s64_x_tied1, svint64_t, ++ z0 = svrevw_s64_x (p0, z0), ++ z0 = svrevw_x (p0, z0)) ++ ++/* ++** revw_s64_x_untied: ++** revw z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (revw_s64_x_untied, svint64_t, ++ z0 = svrevw_s64_x (p0, z1), ++ z0 = svrevw_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revw_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revw_u64.c +new file mode 100644 +index 000000000..c70cdb428 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revw_u64.c +@@ -0,0 +1,81 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** revw_u64_m_tied12: ++** revw z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (revw_u64_m_tied12, svuint64_t, ++ z0 = svrevw_u64_m (z0, p0, z0), ++ z0 = svrevw_m (z0, p0, z0)) ++ ++/* ++** revw_u64_m_tied1: ++** revw z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (revw_u64_m_tied1, svuint64_t, ++ z0 = svrevw_u64_m (z0, p0, z1), ++ z0 = svrevw_m (z0, p0, z1)) ++ ++/* ++** revw_u64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** revw z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (revw_u64_m_tied2, svuint64_t, ++ z0 = svrevw_u64_m (z1, p0, z0), ++ z0 = svrevw_m (z1, p0, z0)) ++ ++/* ++** revw_u64_m_untied: ++** movprfx z0, z2 ++** revw z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (revw_u64_m_untied, svuint64_t, ++ z0 = svrevw_u64_m (z2, p0, z1), ++ z0 = svrevw_m (z2, p0, z1)) ++ ++/* ++** revw_u64_z_tied1: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** revw z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (revw_u64_z_tied1, svuint64_t, ++ z0 = svrevw_u64_z (p0, z0), ++ z0 = svrevw_z (p0, z0)) ++ ++/* ++** revw_u64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** revw z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (revw_u64_z_untied, svuint64_t, ++ z0 = svrevw_u64_z (p0, z1), ++ z0 = svrevw_z (p0, z1)) ++ ++/* ++** revw_u64_x_tied1: ++** revw z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (revw_u64_x_tied1, svuint64_t, ++ z0 = svrevw_u64_x (p0, z0), ++ z0 = svrevw_x (p0, z0)) ++ ++/* ++** revw_u64_x_untied: ++** revw z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (revw_u64_x_untied, svuint64_t, ++ z0 = svrevw_u64_x (p0, z1), ++ z0 = svrevw_x (p0, z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rinta_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rinta_f16.c +new file mode 100644 +index 000000000..99a604209 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rinta_f16.c +@@ -0,0 +1,103 @@ ++/* { dg-final { check-function-bodies "**" "" 
"-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rinta_f16_m_tied12: ++** frinta z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rinta_f16_m_tied12, svfloat16_t, ++ z0 = svrinta_f16_m (z0, p0, z0), ++ z0 = svrinta_m (z0, p0, z0)) ++ ++/* ++** rinta_f16_m_tied1: ++** frinta z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rinta_f16_m_tied1, svfloat16_t, ++ z0 = svrinta_f16_m (z0, p0, z1), ++ z0 = svrinta_m (z0, p0, z1)) ++ ++/* ++** rinta_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** frinta z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rinta_f16_m_tied2, svfloat16_t, ++ z0 = svrinta_f16_m (z1, p0, z0), ++ z0 = svrinta_m (z1, p0, z0)) ++ ++/* ++** rinta_f16_m_untied: ++** movprfx z0, z2 ++** frinta z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rinta_f16_m_untied, svfloat16_t, ++ z0 = svrinta_f16_m (z2, p0, z1), ++ z0 = svrinta_m (z2, p0, z1)) ++ ++/* ++** rinta_f16_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.h, p0/z, \1\.h ++** frinta z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rinta_f16_z_tied1, svfloat16_t, ++ z0 = svrinta_f16_z (p0, z0), ++ z0 = svrinta_z (p0, z0)) ++ ++/* ++** rinta_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** frinta z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rinta_f16_z_untied, svfloat16_t, ++ z0 = svrinta_f16_z (p0, z1), ++ z0 = svrinta_z (p0, z1)) ++ ++/* ++** rinta_f16_x_tied1: ++** frinta z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rinta_f16_x_tied1, svfloat16_t, ++ z0 = svrinta_f16_x (p0, z0), ++ z0 = svrinta_x (p0, z0)) ++ ++/* ++** rinta_f16_x_untied: ++** frinta z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rinta_f16_x_untied, svfloat16_t, ++ z0 = svrinta_f16_x (p0, z1), ++ z0 = svrinta_x (p0, z1)) ++ ++/* ++** ptrue_rinta_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_rinta_f16_x_tied1, svfloat16_t, ++ z0 = svrinta_f16_x (svptrue_b16 (), z0), ++ z0 = svrinta_x (svptrue_b16 (), z0)) ++ ++/* ++** ptrue_rinta_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_rinta_f16_x_untied, svfloat16_t, ++ z0 = svrinta_f16_x (svptrue_b16 (), z1), ++ z0 = svrinta_x (svptrue_b16 (), z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rinta_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rinta_f32.c +new file mode 100644 +index 000000000..b4e3714bc +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rinta_f32.c +@@ -0,0 +1,103 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rinta_f32_m_tied12: ++** frinta z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rinta_f32_m_tied12, svfloat32_t, ++ z0 = svrinta_f32_m (z0, p0, z0), ++ z0 = svrinta_m (z0, p0, z0)) ++ ++/* ++** rinta_f32_m_tied1: ++** frinta z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rinta_f32_m_tied1, svfloat32_t, ++ z0 = svrinta_f32_m (z0, p0, z1), ++ z0 = svrinta_m (z0, p0, z1)) ++ ++/* ++** rinta_f32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** frinta z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rinta_f32_m_tied2, svfloat32_t, ++ z0 = svrinta_f32_m (z1, p0, z0), ++ z0 = svrinta_m (z1, p0, z0)) ++ ++/* ++** rinta_f32_m_untied: ++** movprfx z0, z2 ++** frinta z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rinta_f32_m_untied, svfloat32_t, ++ z0 = svrinta_f32_m (z2, p0, z1), ++ z0 = svrinta_m (z2, p0, z1)) ++ ++/* ++** rinta_f32_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, \1\.s ++** frinta z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rinta_f32_z_tied1, svfloat32_t, ++ z0 = svrinta_f32_z (p0, z0), ++ z0 = svrinta_z (p0, z0)) ++ ++/* ++** rinta_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** frinta z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rinta_f32_z_untied, svfloat32_t, ++ z0 = svrinta_f32_z (p0, z1), ++ z0 = svrinta_z (p0, z1)) ++ ++/* ++** rinta_f32_x_tied1: ++** frinta z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rinta_f32_x_tied1, svfloat32_t, ++ z0 = svrinta_f32_x (p0, z0), ++ z0 = svrinta_x (p0, z0)) ++ ++/* ++** rinta_f32_x_untied: ++** frinta z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rinta_f32_x_untied, svfloat32_t, ++ z0 = svrinta_f32_x (p0, z1), ++ z0 = svrinta_x (p0, z1)) ++ ++/* ++** ptrue_rinta_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_rinta_f32_x_tied1, svfloat32_t, ++ z0 = svrinta_f32_x (svptrue_b32 (), z0), ++ z0 = svrinta_x (svptrue_b32 (), z0)) ++ ++/* ++** ptrue_rinta_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_rinta_f32_x_untied, svfloat32_t, ++ z0 = svrinta_f32_x (svptrue_b32 (), z1), ++ z0 = svrinta_x (svptrue_b32 (), z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rinta_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rinta_f64.c +new file mode 100644 +index 000000000..24d6b7dc8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rinta_f64.c +@@ -0,0 +1,103 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rinta_f64_m_tied12: ++** frinta z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rinta_f64_m_tied12, svfloat64_t, ++ z0 = svrinta_f64_m (z0, p0, z0), ++ z0 = svrinta_m (z0, p0, z0)) ++ ++/* ++** rinta_f64_m_tied1: ++** frinta z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rinta_f64_m_tied1, svfloat64_t, ++ z0 = svrinta_f64_m (z0, p0, z1), ++ z0 = svrinta_m (z0, p0, z1)) ++ ++/* ++** rinta_f64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** frinta z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (rinta_f64_m_tied2, svfloat64_t, ++ z0 = svrinta_f64_m (z1, p0, z0), ++ z0 = svrinta_m (z1, p0, z0)) ++ ++/* ++** rinta_f64_m_untied: ++** movprfx z0, z2 ++** frinta z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rinta_f64_m_untied, svfloat64_t, ++ z0 = svrinta_f64_m (z2, p0, z1), ++ z0 = svrinta_m (z2, p0, z1)) ++ ++/* ++** rinta_f64_z_tied1: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** frinta z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (rinta_f64_z_tied1, svfloat64_t, ++ z0 = svrinta_f64_z (p0, z0), ++ z0 = svrinta_z (p0, z0)) ++ ++/* ++** rinta_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** frinta z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rinta_f64_z_untied, svfloat64_t, ++ z0 = svrinta_f64_z (p0, z1), ++ z0 = svrinta_z (p0, z1)) ++ ++/* ++** rinta_f64_x_tied1: ++** frinta z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rinta_f64_x_tied1, svfloat64_t, ++ z0 = svrinta_f64_x (p0, z0), ++ z0 = svrinta_x (p0, z0)) ++ ++/* ++** rinta_f64_x_untied: ++** frinta z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rinta_f64_x_untied, svfloat64_t, ++ z0 = svrinta_f64_x (p0, z1), ++ z0 = svrinta_x (p0, z1)) ++ ++/* ++** ptrue_rinta_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_rinta_f64_x_tied1, svfloat64_t, ++ z0 = svrinta_f64_x (svptrue_b64 (), z0), ++ z0 = svrinta_x (svptrue_b64 (), z0)) ++ ++/* ++** ptrue_rinta_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_rinta_f64_x_untied, svfloat64_t, ++ z0 = svrinta_f64_x (svptrue_b64 (), z1), ++ z0 = svrinta_x (svptrue_b64 (), z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rinti_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rinti_f16.c +new file mode 100644 +index 000000000..1f0ac85e3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rinti_f16.c +@@ -0,0 +1,103 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rinti_f16_m_tied12: ++** frinti z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rinti_f16_m_tied12, svfloat16_t, ++ z0 = svrinti_f16_m (z0, p0, z0), ++ z0 = svrinti_m (z0, p0, z0)) ++ ++/* ++** rinti_f16_m_tied1: ++** frinti z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rinti_f16_m_tied1, svfloat16_t, ++ z0 = svrinti_f16_m (z0, p0, z1), ++ z0 = svrinti_m (z0, p0, z1)) ++ ++/* ++** rinti_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** frinti z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rinti_f16_m_tied2, svfloat16_t, ++ z0 = svrinti_f16_m (z1, p0, z0), ++ z0 = svrinti_m (z1, p0, z0)) ++ ++/* ++** rinti_f16_m_untied: ++** movprfx z0, z2 ++** frinti z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rinti_f16_m_untied, svfloat16_t, ++ z0 = svrinti_f16_m (z2, p0, z1), ++ z0 = svrinti_m (z2, p0, z1)) ++ ++/* ++** rinti_f16_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.h, p0/z, \1\.h ++** frinti z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rinti_f16_z_tied1, svfloat16_t, ++ z0 = svrinti_f16_z (p0, z0), ++ z0 = svrinti_z (p0, z0)) ++ ++/* ++** rinti_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** frinti z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rinti_f16_z_untied, svfloat16_t, ++ z0 = svrinti_f16_z (p0, z1), ++ z0 = svrinti_z (p0, z1)) ++ ++/* ++** rinti_f16_x_tied1: ++** frinti z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rinti_f16_x_tied1, svfloat16_t, ++ z0 = svrinti_f16_x (p0, z0), ++ z0 = svrinti_x (p0, z0)) ++ ++/* ++** rinti_f16_x_untied: ++** frinti z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rinti_f16_x_untied, svfloat16_t, ++ z0 = svrinti_f16_x (p0, z1), ++ z0 = svrinti_x (p0, z1)) ++ ++/* ++** ptrue_rinti_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_rinti_f16_x_tied1, svfloat16_t, ++ z0 = svrinti_f16_x (svptrue_b16 (), z0), ++ z0 = svrinti_x (svptrue_b16 (), z0)) ++ ++/* ++** ptrue_rinti_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_rinti_f16_x_untied, svfloat16_t, ++ z0 = svrinti_f16_x (svptrue_b16 (), z1), ++ z0 = svrinti_x (svptrue_b16 (), z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rinti_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rinti_f32.c +new file mode 100644 +index 000000000..cf54fde5c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rinti_f32.c +@@ -0,0 +1,103 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rinti_f32_m_tied12: ++** frinti z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rinti_f32_m_tied12, svfloat32_t, ++ z0 = svrinti_f32_m (z0, p0, z0), ++ z0 = svrinti_m (z0, p0, z0)) ++ ++/* ++** rinti_f32_m_tied1: ++** frinti z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rinti_f32_m_tied1, svfloat32_t, ++ z0 = svrinti_f32_m (z0, p0, z1), ++ z0 = svrinti_m (z0, p0, z1)) ++ ++/* ++** rinti_f32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** frinti z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rinti_f32_m_tied2, svfloat32_t, ++ z0 = svrinti_f32_m (z1, p0, z0), ++ z0 = svrinti_m (z1, p0, z0)) ++ ++/* ++** rinti_f32_m_untied: ++** movprfx z0, z2 ++** frinti z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rinti_f32_m_untied, svfloat32_t, ++ z0 = svrinti_f32_m (z2, p0, z1), ++ z0 = svrinti_m (z2, p0, z1)) ++ ++/* ++** rinti_f32_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, \1\.s ++** frinti z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rinti_f32_z_tied1, svfloat32_t, ++ z0 = svrinti_f32_z (p0, z0), ++ z0 = svrinti_z (p0, z0)) ++ ++/* ++** rinti_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** frinti z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rinti_f32_z_untied, svfloat32_t, ++ z0 = svrinti_f32_z (p0, z1), ++ z0 = svrinti_z (p0, z1)) ++ ++/* ++** rinti_f32_x_tied1: ++** frinti z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rinti_f32_x_tied1, svfloat32_t, ++ z0 = svrinti_f32_x (p0, z0), ++ z0 = svrinti_x (p0, z0)) ++ ++/* ++** rinti_f32_x_untied: ++** frinti z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rinti_f32_x_untied, svfloat32_t, ++ z0 = svrinti_f32_x (p0, z1), ++ z0 = svrinti_x (p0, z1)) ++ ++/* ++** ptrue_rinti_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_rinti_f32_x_tied1, svfloat32_t, ++ z0 = svrinti_f32_x (svptrue_b32 (), z0), ++ z0 = svrinti_x (svptrue_b32 (), z0)) ++ ++/* ++** ptrue_rinti_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_rinti_f32_x_untied, svfloat32_t, ++ z0 = svrinti_f32_x (svptrue_b32 (), z1), ++ z0 = svrinti_x (svptrue_b32 (), z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rinti_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rinti_f64.c +new file mode 100644 +index 000000000..08b861caa +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rinti_f64.c +@@ -0,0 +1,103 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rinti_f64_m_tied12: ++** frinti z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rinti_f64_m_tied12, svfloat64_t, ++ z0 = svrinti_f64_m (z0, p0, z0), ++ z0 = svrinti_m (z0, p0, z0)) ++ ++/* ++** rinti_f64_m_tied1: ++** frinti z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rinti_f64_m_tied1, svfloat64_t, ++ z0 = svrinti_f64_m (z0, p0, z1), ++ z0 = svrinti_m (z0, p0, z1)) ++ ++/* ++** rinti_f64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** frinti z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (rinti_f64_m_tied2, svfloat64_t, ++ z0 = svrinti_f64_m (z1, p0, z0), ++ z0 = svrinti_m (z1, p0, z0)) ++ ++/* ++** rinti_f64_m_untied: ++** movprfx z0, z2 ++** frinti z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rinti_f64_m_untied, svfloat64_t, ++ z0 = svrinti_f64_m (z2, p0, z1), ++ z0 = svrinti_m (z2, p0, z1)) ++ ++/* ++** rinti_f64_z_tied1: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** frinti z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (rinti_f64_z_tied1, svfloat64_t, ++ z0 = svrinti_f64_z (p0, z0), ++ z0 = svrinti_z (p0, z0)) ++ ++/* ++** rinti_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** frinti z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rinti_f64_z_untied, svfloat64_t, ++ z0 = svrinti_f64_z (p0, z1), ++ z0 = svrinti_z (p0, z1)) ++ ++/* ++** rinti_f64_x_tied1: ++** frinti z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rinti_f64_x_tied1, svfloat64_t, ++ z0 = svrinti_f64_x (p0, z0), ++ z0 = svrinti_x (p0, z0)) ++ ++/* ++** rinti_f64_x_untied: ++** frinti z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rinti_f64_x_untied, svfloat64_t, ++ z0 = svrinti_f64_x (p0, z1), ++ z0 = svrinti_x (p0, z1)) ++ ++/* ++** ptrue_rinti_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_rinti_f64_x_tied1, svfloat64_t, ++ z0 = svrinti_f64_x (svptrue_b64 (), z0), ++ z0 = svrinti_x (svptrue_b64 (), z0)) ++ ++/* ++** ptrue_rinti_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_rinti_f64_x_untied, svfloat64_t, ++ z0 = svrinti_f64_x (svptrue_b64 (), z1), ++ z0 = svrinti_x (svptrue_b64 (), z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintm_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintm_f16.c +new file mode 100644 +index 000000000..194d01cbd +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintm_f16.c +@@ -0,0 +1,103 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rintm_f16_m_tied12: ++** frintm z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rintm_f16_m_tied12, svfloat16_t, ++ z0 = svrintm_f16_m (z0, p0, z0), ++ z0 = svrintm_m (z0, p0, z0)) ++ ++/* ++** rintm_f16_m_tied1: ++** frintm z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rintm_f16_m_tied1, svfloat16_t, ++ z0 = svrintm_f16_m (z0, p0, z1), ++ z0 = svrintm_m (z0, p0, z1)) ++ ++/* ++** rintm_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** frintm z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rintm_f16_m_tied2, svfloat16_t, ++ z0 = svrintm_f16_m (z1, p0, z0), ++ z0 = svrintm_m (z1, p0, z0)) ++ ++/* ++** rintm_f16_m_untied: ++** movprfx z0, z2 ++** frintm z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rintm_f16_m_untied, svfloat16_t, ++ z0 = svrintm_f16_m (z2, p0, z1), ++ z0 = svrintm_m (z2, p0, z1)) ++ ++/* ++** rintm_f16_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.h, p0/z, \1\.h ++** frintm z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rintm_f16_z_tied1, svfloat16_t, ++ z0 = svrintm_f16_z (p0, z0), ++ z0 = svrintm_z (p0, z0)) ++ ++/* ++** rintm_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** frintm z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rintm_f16_z_untied, svfloat16_t, ++ z0 = svrintm_f16_z (p0, z1), ++ z0 = svrintm_z (p0, z1)) ++ ++/* ++** rintm_f16_x_tied1: ++** frintm z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rintm_f16_x_tied1, svfloat16_t, ++ z0 = svrintm_f16_x (p0, z0), ++ z0 = svrintm_x (p0, z0)) ++ ++/* ++** rintm_f16_x_untied: ++** frintm z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rintm_f16_x_untied, svfloat16_t, ++ z0 = svrintm_f16_x (p0, z1), ++ z0 = svrintm_x (p0, z1)) ++ ++/* ++** ptrue_rintm_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_rintm_f16_x_tied1, svfloat16_t, ++ z0 = svrintm_f16_x (svptrue_b16 (), z0), ++ z0 = svrintm_x (svptrue_b16 (), z0)) ++ ++/* ++** ptrue_rintm_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_rintm_f16_x_untied, svfloat16_t, ++ z0 = svrintm_f16_x (svptrue_b16 (), z1), ++ z0 = svrintm_x (svptrue_b16 (), z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintm_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintm_f32.c +new file mode 100644 +index 000000000..6c3297aa1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintm_f32.c +@@ -0,0 +1,103 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rintm_f32_m_tied12: ++** frintm z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rintm_f32_m_tied12, svfloat32_t, ++ z0 = svrintm_f32_m (z0, p0, z0), ++ z0 = svrintm_m (z0, p0, z0)) ++ ++/* ++** rintm_f32_m_tied1: ++** frintm z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rintm_f32_m_tied1, svfloat32_t, ++ z0 = svrintm_f32_m (z0, p0, z1), ++ z0 = svrintm_m (z0, p0, z1)) ++ ++/* ++** rintm_f32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** frintm z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rintm_f32_m_tied2, svfloat32_t, ++ z0 = svrintm_f32_m (z1, p0, z0), ++ z0 = svrintm_m (z1, p0, z0)) ++ ++/* ++** rintm_f32_m_untied: ++** movprfx z0, z2 ++** frintm z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rintm_f32_m_untied, svfloat32_t, ++ z0 = svrintm_f32_m (z2, p0, z1), ++ z0 = svrintm_m (z2, p0, z1)) ++ ++/* ++** rintm_f32_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, \1\.s ++** frintm z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rintm_f32_z_tied1, svfloat32_t, ++ z0 = svrintm_f32_z (p0, z0), ++ z0 = svrintm_z (p0, z0)) ++ ++/* ++** rintm_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** frintm z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rintm_f32_z_untied, svfloat32_t, ++ z0 = svrintm_f32_z (p0, z1), ++ z0 = svrintm_z (p0, z1)) ++ ++/* ++** rintm_f32_x_tied1: ++** frintm z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rintm_f32_x_tied1, svfloat32_t, ++ z0 = svrintm_f32_x (p0, z0), ++ z0 = svrintm_x (p0, z0)) ++ ++/* ++** rintm_f32_x_untied: ++** frintm z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rintm_f32_x_untied, svfloat32_t, ++ z0 = svrintm_f32_x (p0, z1), ++ z0 = svrintm_x (p0, z1)) ++ ++/* ++** ptrue_rintm_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_rintm_f32_x_tied1, svfloat32_t, ++ z0 = svrintm_f32_x (svptrue_b32 (), z0), ++ z0 = svrintm_x (svptrue_b32 (), z0)) ++ ++/* ++** ptrue_rintm_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_rintm_f32_x_untied, svfloat32_t, ++ z0 = svrintm_f32_x (svptrue_b32 (), z1), ++ z0 = svrintm_x (svptrue_b32 (), z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintm_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintm_f64.c +new file mode 100644 +index 000000000..ecbb24447 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintm_f64.c +@@ -0,0 +1,103 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rintm_f64_m_tied12: ++** frintm z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rintm_f64_m_tied12, svfloat64_t, ++ z0 = svrintm_f64_m (z0, p0, z0), ++ z0 = svrintm_m (z0, p0, z0)) ++ ++/* ++** rintm_f64_m_tied1: ++** frintm z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rintm_f64_m_tied1, svfloat64_t, ++ z0 = svrintm_f64_m (z0, p0, z1), ++ z0 = svrintm_m (z0, p0, z1)) ++ ++/* ++** rintm_f64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** frintm z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (rintm_f64_m_tied2, svfloat64_t, ++ z0 = svrintm_f64_m (z1, p0, z0), ++ z0 = svrintm_m (z1, p0, z0)) ++ ++/* ++** rintm_f64_m_untied: ++** movprfx z0, z2 ++** frintm z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rintm_f64_m_untied, svfloat64_t, ++ z0 = svrintm_f64_m (z2, p0, z1), ++ z0 = svrintm_m (z2, p0, z1)) ++ ++/* ++** rintm_f64_z_tied1: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** frintm z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (rintm_f64_z_tied1, svfloat64_t, ++ z0 = svrintm_f64_z (p0, z0), ++ z0 = svrintm_z (p0, z0)) ++ ++/* ++** rintm_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** frintm z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rintm_f64_z_untied, svfloat64_t, ++ z0 = svrintm_f64_z (p0, z1), ++ z0 = svrintm_z (p0, z1)) ++ ++/* ++** rintm_f64_x_tied1: ++** frintm z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rintm_f64_x_tied1, svfloat64_t, ++ z0 = svrintm_f64_x (p0, z0), ++ z0 = svrintm_x (p0, z0)) ++ ++/* ++** rintm_f64_x_untied: ++** frintm z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rintm_f64_x_untied, svfloat64_t, ++ z0 = svrintm_f64_x (p0, z1), ++ z0 = svrintm_x (p0, z1)) ++ ++/* ++** ptrue_rintm_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_rintm_f64_x_tied1, svfloat64_t, ++ z0 = svrintm_f64_x (svptrue_b64 (), z0), ++ z0 = svrintm_x (svptrue_b64 (), z0)) ++ ++/* ++** ptrue_rintm_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_rintm_f64_x_untied, svfloat64_t, ++ z0 = svrintm_f64_x (svptrue_b64 (), z1), ++ z0 = svrintm_x (svptrue_b64 (), z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintn_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintn_f16.c +new file mode 100644 +index 000000000..273307ef1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintn_f16.c +@@ -0,0 +1,103 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rintn_f16_m_tied12: ++** frintn z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rintn_f16_m_tied12, svfloat16_t, ++ z0 = svrintn_f16_m (z0, p0, z0), ++ z0 = svrintn_m (z0, p0, z0)) ++ ++/* ++** rintn_f16_m_tied1: ++** frintn z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rintn_f16_m_tied1, svfloat16_t, ++ z0 = svrintn_f16_m (z0, p0, z1), ++ z0 = svrintn_m (z0, p0, z1)) ++ ++/* ++** rintn_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** frintn z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rintn_f16_m_tied2, svfloat16_t, ++ z0 = svrintn_f16_m (z1, p0, z0), ++ z0 = svrintn_m (z1, p0, z0)) ++ ++/* ++** rintn_f16_m_untied: ++** movprfx z0, z2 ++** frintn z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rintn_f16_m_untied, svfloat16_t, ++ z0 = svrintn_f16_m (z2, p0, z1), ++ z0 = svrintn_m (z2, p0, z1)) ++ ++/* ++** rintn_f16_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.h, p0/z, \1\.h ++** frintn z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rintn_f16_z_tied1, svfloat16_t, ++ z0 = svrintn_f16_z (p0, z0), ++ z0 = svrintn_z (p0, z0)) ++ ++/* ++** rintn_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** frintn z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rintn_f16_z_untied, svfloat16_t, ++ z0 = svrintn_f16_z (p0, z1), ++ z0 = svrintn_z (p0, z1)) ++ ++/* ++** rintn_f16_x_tied1: ++** frintn z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rintn_f16_x_tied1, svfloat16_t, ++ z0 = svrintn_f16_x (p0, z0), ++ z0 = svrintn_x (p0, z0)) ++ ++/* ++** rintn_f16_x_untied: ++** frintn z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rintn_f16_x_untied, svfloat16_t, ++ z0 = svrintn_f16_x (p0, z1), ++ z0 = svrintn_x (p0, z1)) ++ ++/* ++** ptrue_rintn_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_rintn_f16_x_tied1, svfloat16_t, ++ z0 = svrintn_f16_x (svptrue_b16 (), z0), ++ z0 = svrintn_x (svptrue_b16 (), z0)) ++ ++/* ++** ptrue_rintn_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_rintn_f16_x_untied, svfloat16_t, ++ z0 = svrintn_f16_x (svptrue_b16 (), z1), ++ z0 = svrintn_x (svptrue_b16 (), z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintn_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintn_f32.c +new file mode 100644 +index 000000000..bafd43106 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintn_f32.c +@@ -0,0 +1,103 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rintn_f32_m_tied12: ++** frintn z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rintn_f32_m_tied12, svfloat32_t, ++ z0 = svrintn_f32_m (z0, p0, z0), ++ z0 = svrintn_m (z0, p0, z0)) ++ ++/* ++** rintn_f32_m_tied1: ++** frintn z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rintn_f32_m_tied1, svfloat32_t, ++ z0 = svrintn_f32_m (z0, p0, z1), ++ z0 = svrintn_m (z0, p0, z1)) ++ ++/* ++** rintn_f32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** frintn z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rintn_f32_m_tied2, svfloat32_t, ++ z0 = svrintn_f32_m (z1, p0, z0), ++ z0 = svrintn_m (z1, p0, z0)) ++ ++/* ++** rintn_f32_m_untied: ++** movprfx z0, z2 ++** frintn z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rintn_f32_m_untied, svfloat32_t, ++ z0 = svrintn_f32_m (z2, p0, z1), ++ z0 = svrintn_m (z2, p0, z1)) ++ ++/* ++** rintn_f32_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, \1\.s ++** frintn z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rintn_f32_z_tied1, svfloat32_t, ++ z0 = svrintn_f32_z (p0, z0), ++ z0 = svrintn_z (p0, z0)) ++ ++/* ++** rintn_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** frintn z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rintn_f32_z_untied, svfloat32_t, ++ z0 = svrintn_f32_z (p0, z1), ++ z0 = svrintn_z (p0, z1)) ++ ++/* ++** rintn_f32_x_tied1: ++** frintn z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rintn_f32_x_tied1, svfloat32_t, ++ z0 = svrintn_f32_x (p0, z0), ++ z0 = svrintn_x (p0, z0)) ++ ++/* ++** rintn_f32_x_untied: ++** frintn z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rintn_f32_x_untied, svfloat32_t, ++ z0 = svrintn_f32_x (p0, z1), ++ z0 = svrintn_x (p0, z1)) ++ ++/* ++** ptrue_rintn_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_rintn_f32_x_tied1, svfloat32_t, ++ z0 = svrintn_f32_x (svptrue_b32 (), z0), ++ z0 = svrintn_x (svptrue_b32 (), z0)) ++ ++/* ++** ptrue_rintn_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_rintn_f32_x_untied, svfloat32_t, ++ z0 = svrintn_f32_x (svptrue_b32 (), z1), ++ z0 = svrintn_x (svptrue_b32 (), z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintn_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintn_f64.c +new file mode 100644 +index 000000000..0142315e6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintn_f64.c +@@ -0,0 +1,103 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rintn_f64_m_tied12: ++** frintn z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rintn_f64_m_tied12, svfloat64_t, ++ z0 = svrintn_f64_m (z0, p0, z0), ++ z0 = svrintn_m (z0, p0, z0)) ++ ++/* ++** rintn_f64_m_tied1: ++** frintn z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rintn_f64_m_tied1, svfloat64_t, ++ z0 = svrintn_f64_m (z0, p0, z1), ++ z0 = svrintn_m (z0, p0, z1)) ++ ++/* ++** rintn_f64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** frintn z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (rintn_f64_m_tied2, svfloat64_t, ++ z0 = svrintn_f64_m (z1, p0, z0), ++ z0 = svrintn_m (z1, p0, z0)) ++ ++/* ++** rintn_f64_m_untied: ++** movprfx z0, z2 ++** frintn z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rintn_f64_m_untied, svfloat64_t, ++ z0 = svrintn_f64_m (z2, p0, z1), ++ z0 = svrintn_m (z2, p0, z1)) ++ ++/* ++** rintn_f64_z_tied1: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** frintn z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (rintn_f64_z_tied1, svfloat64_t, ++ z0 = svrintn_f64_z (p0, z0), ++ z0 = svrintn_z (p0, z0)) ++ ++/* ++** rintn_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** frintn z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rintn_f64_z_untied, svfloat64_t, ++ z0 = svrintn_f64_z (p0, z1), ++ z0 = svrintn_z (p0, z1)) ++ ++/* ++** rintn_f64_x_tied1: ++** frintn z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rintn_f64_x_tied1, svfloat64_t, ++ z0 = svrintn_f64_x (p0, z0), ++ z0 = svrintn_x (p0, z0)) ++ ++/* ++** rintn_f64_x_untied: ++** frintn z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rintn_f64_x_untied, svfloat64_t, ++ z0 = svrintn_f64_x (p0, z1), ++ z0 = svrintn_x (p0, z1)) ++ ++/* ++** ptrue_rintn_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_rintn_f64_x_tied1, svfloat64_t, ++ z0 = svrintn_f64_x (svptrue_b64 (), z0), ++ z0 = svrintn_x (svptrue_b64 (), z0)) ++ ++/* ++** ptrue_rintn_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_rintn_f64_x_untied, svfloat64_t, ++ z0 = svrintn_f64_x (svptrue_b64 (), z1), ++ z0 = svrintn_x (svptrue_b64 (), z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintp_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintp_f16.c +new file mode 100644 +index 000000000..0e85c3448 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintp_f16.c +@@ -0,0 +1,103 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rintp_f16_m_tied12: ++** frintp z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rintp_f16_m_tied12, svfloat16_t, ++ z0 = svrintp_f16_m (z0, p0, z0), ++ z0 = svrintp_m (z0, p0, z0)) ++ ++/* ++** rintp_f16_m_tied1: ++** frintp z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rintp_f16_m_tied1, svfloat16_t, ++ z0 = svrintp_f16_m (z0, p0, z1), ++ z0 = svrintp_m (z0, p0, z1)) ++ ++/* ++** rintp_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** frintp z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rintp_f16_m_tied2, svfloat16_t, ++ z0 = svrintp_f16_m (z1, p0, z0), ++ z0 = svrintp_m (z1, p0, z0)) ++ ++/* ++** rintp_f16_m_untied: ++** movprfx z0, z2 ++** frintp z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rintp_f16_m_untied, svfloat16_t, ++ z0 = svrintp_f16_m (z2, p0, z1), ++ z0 = svrintp_m (z2, p0, z1)) ++ ++/* ++** rintp_f16_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.h, p0/z, \1\.h ++** frintp z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rintp_f16_z_tied1, svfloat16_t, ++ z0 = svrintp_f16_z (p0, z0), ++ z0 = svrintp_z (p0, z0)) ++ ++/* ++** rintp_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** frintp z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rintp_f16_z_untied, svfloat16_t, ++ z0 = svrintp_f16_z (p0, z1), ++ z0 = svrintp_z (p0, z1)) ++ ++/* ++** rintp_f16_x_tied1: ++** frintp z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rintp_f16_x_tied1, svfloat16_t, ++ z0 = svrintp_f16_x (p0, z0), ++ z0 = svrintp_x (p0, z0)) ++ ++/* ++** rintp_f16_x_untied: ++** frintp z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rintp_f16_x_untied, svfloat16_t, ++ z0 = svrintp_f16_x (p0, z1), ++ z0 = svrintp_x (p0, z1)) ++ ++/* ++** ptrue_rintp_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_rintp_f16_x_tied1, svfloat16_t, ++ z0 = svrintp_f16_x (svptrue_b16 (), z0), ++ z0 = svrintp_x (svptrue_b16 (), z0)) ++ ++/* ++** ptrue_rintp_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_rintp_f16_x_untied, svfloat16_t, ++ z0 = svrintp_f16_x (svptrue_b16 (), z1), ++ z0 = svrintp_x (svptrue_b16 (), z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintp_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintp_f32.c +new file mode 100644 +index 000000000..cec360d7c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintp_f32.c +@@ -0,0 +1,103 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rintp_f32_m_tied12: ++** frintp z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rintp_f32_m_tied12, svfloat32_t, ++ z0 = svrintp_f32_m (z0, p0, z0), ++ z0 = svrintp_m (z0, p0, z0)) ++ ++/* ++** rintp_f32_m_tied1: ++** frintp z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rintp_f32_m_tied1, svfloat32_t, ++ z0 = svrintp_f32_m (z0, p0, z1), ++ z0 = svrintp_m (z0, p0, z1)) ++ ++/* ++** rintp_f32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** frintp z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rintp_f32_m_tied2, svfloat32_t, ++ z0 = svrintp_f32_m (z1, p0, z0), ++ z0 = svrintp_m (z1, p0, z0)) ++ ++/* ++** rintp_f32_m_untied: ++** movprfx z0, z2 ++** frintp z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rintp_f32_m_untied, svfloat32_t, ++ z0 = svrintp_f32_m (z2, p0, z1), ++ z0 = svrintp_m (z2, p0, z1)) ++ ++/* ++** rintp_f32_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, \1\.s ++** frintp z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rintp_f32_z_tied1, svfloat32_t, ++ z0 = svrintp_f32_z (p0, z0), ++ z0 = svrintp_z (p0, z0)) ++ ++/* ++** rintp_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** frintp z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rintp_f32_z_untied, svfloat32_t, ++ z0 = svrintp_f32_z (p0, z1), ++ z0 = svrintp_z (p0, z1)) ++ ++/* ++** rintp_f32_x_tied1: ++** frintp z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rintp_f32_x_tied1, svfloat32_t, ++ z0 = svrintp_f32_x (p0, z0), ++ z0 = svrintp_x (p0, z0)) ++ ++/* ++** rintp_f32_x_untied: ++** frintp z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rintp_f32_x_untied, svfloat32_t, ++ z0 = svrintp_f32_x (p0, z1), ++ z0 = svrintp_x (p0, z1)) ++ ++/* ++** ptrue_rintp_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_rintp_f32_x_tied1, svfloat32_t, ++ z0 = svrintp_f32_x (svptrue_b32 (), z0), ++ z0 = svrintp_x (svptrue_b32 (), z0)) ++ ++/* ++** ptrue_rintp_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_rintp_f32_x_untied, svfloat32_t, ++ z0 = svrintp_f32_x (svptrue_b32 (), z1), ++ z0 = svrintp_x (svptrue_b32 (), z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintp_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintp_f64.c +new file mode 100644 +index 000000000..1305fb682 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintp_f64.c +@@ -0,0 +1,103 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rintp_f64_m_tied12: ++** frintp z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rintp_f64_m_tied12, svfloat64_t, ++ z0 = svrintp_f64_m (z0, p0, z0), ++ z0 = svrintp_m (z0, p0, z0)) ++ ++/* ++** rintp_f64_m_tied1: ++** frintp z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rintp_f64_m_tied1, svfloat64_t, ++ z0 = svrintp_f64_m (z0, p0, z1), ++ z0 = svrintp_m (z0, p0, z1)) ++ ++/* ++** rintp_f64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** frintp z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (rintp_f64_m_tied2, svfloat64_t, ++ z0 = svrintp_f64_m (z1, p0, z0), ++ z0 = svrintp_m (z1, p0, z0)) ++ ++/* ++** rintp_f64_m_untied: ++** movprfx z0, z2 ++** frintp z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rintp_f64_m_untied, svfloat64_t, ++ z0 = svrintp_f64_m (z2, p0, z1), ++ z0 = svrintp_m (z2, p0, z1)) ++ ++/* ++** rintp_f64_z_tied1: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** frintp z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (rintp_f64_z_tied1, svfloat64_t, ++ z0 = svrintp_f64_z (p0, z0), ++ z0 = svrintp_z (p0, z0)) ++ ++/* ++** rintp_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** frintp z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rintp_f64_z_untied, svfloat64_t, ++ z0 = svrintp_f64_z (p0, z1), ++ z0 = svrintp_z (p0, z1)) ++ ++/* ++** rintp_f64_x_tied1: ++** frintp z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rintp_f64_x_tied1, svfloat64_t, ++ z0 = svrintp_f64_x (p0, z0), ++ z0 = svrintp_x (p0, z0)) ++ ++/* ++** rintp_f64_x_untied: ++** frintp z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rintp_f64_x_untied, svfloat64_t, ++ z0 = svrintp_f64_x (p0, z1), ++ z0 = svrintp_x (p0, z1)) ++ ++/* ++** ptrue_rintp_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_rintp_f64_x_tied1, svfloat64_t, ++ z0 = svrintp_f64_x (svptrue_b64 (), z0), ++ z0 = svrintp_x (svptrue_b64 (), z0)) ++ ++/* ++** ptrue_rintp_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_rintp_f64_x_untied, svfloat64_t, ++ z0 = svrintp_f64_x (svptrue_b64 (), z1), ++ z0 = svrintp_x (svptrue_b64 (), z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintx_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintx_f16.c +new file mode 100644 +index 000000000..96f7f2c72 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintx_f16.c +@@ -0,0 +1,103 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rintx_f16_m_tied12: ++** frintx z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rintx_f16_m_tied12, svfloat16_t, ++ z0 = svrintx_f16_m (z0, p0, z0), ++ z0 = svrintx_m (z0, p0, z0)) ++ ++/* ++** rintx_f16_m_tied1: ++** frintx z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rintx_f16_m_tied1, svfloat16_t, ++ z0 = svrintx_f16_m (z0, p0, z1), ++ z0 = svrintx_m (z0, p0, z1)) ++ ++/* ++** rintx_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** frintx z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rintx_f16_m_tied2, svfloat16_t, ++ z0 = svrintx_f16_m (z1, p0, z0), ++ z0 = svrintx_m (z1, p0, z0)) ++ ++/* ++** rintx_f16_m_untied: ++** movprfx z0, z2 ++** frintx z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rintx_f16_m_untied, svfloat16_t, ++ z0 = svrintx_f16_m (z2, p0, z1), ++ z0 = svrintx_m (z2, p0, z1)) ++ ++/* ++** rintx_f16_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.h, p0/z, \1\.h ++** frintx z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rintx_f16_z_tied1, svfloat16_t, ++ z0 = svrintx_f16_z (p0, z0), ++ z0 = svrintx_z (p0, z0)) ++ ++/* ++** rintx_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** frintx z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rintx_f16_z_untied, svfloat16_t, ++ z0 = svrintx_f16_z (p0, z1), ++ z0 = svrintx_z (p0, z1)) ++ ++/* ++** rintx_f16_x_tied1: ++** frintx z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rintx_f16_x_tied1, svfloat16_t, ++ z0 = svrintx_f16_x (p0, z0), ++ z0 = svrintx_x (p0, z0)) ++ ++/* ++** rintx_f16_x_untied: ++** frintx z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rintx_f16_x_untied, svfloat16_t, ++ z0 = svrintx_f16_x (p0, z1), ++ z0 = svrintx_x (p0, z1)) ++ ++/* ++** ptrue_rintx_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_rintx_f16_x_tied1, svfloat16_t, ++ z0 = svrintx_f16_x (svptrue_b16 (), z0), ++ z0 = svrintx_x (svptrue_b16 (), z0)) ++ ++/* ++** ptrue_rintx_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_rintx_f16_x_untied, svfloat16_t, ++ z0 = svrintx_f16_x (svptrue_b16 (), z1), ++ z0 = svrintx_x (svptrue_b16 (), z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintx_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintx_f32.c +new file mode 100644 +index 000000000..1c42d2a94 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintx_f32.c +@@ -0,0 +1,103 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rintx_f32_m_tied12: ++** frintx z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rintx_f32_m_tied12, svfloat32_t, ++ z0 = svrintx_f32_m (z0, p0, z0), ++ z0 = svrintx_m (z0, p0, z0)) ++ ++/* ++** rintx_f32_m_tied1: ++** frintx z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rintx_f32_m_tied1, svfloat32_t, ++ z0 = svrintx_f32_m (z0, p0, z1), ++ z0 = svrintx_m (z0, p0, z1)) ++ ++/* ++** rintx_f32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** frintx z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rintx_f32_m_tied2, svfloat32_t, ++ z0 = svrintx_f32_m (z1, p0, z0), ++ z0 = svrintx_m (z1, p0, z0)) ++ ++/* ++** rintx_f32_m_untied: ++** movprfx z0, z2 ++** frintx z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rintx_f32_m_untied, svfloat32_t, ++ z0 = svrintx_f32_m (z2, p0, z1), ++ z0 = svrintx_m (z2, p0, z1)) ++ ++/* ++** rintx_f32_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, \1\.s ++** frintx z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rintx_f32_z_tied1, svfloat32_t, ++ z0 = svrintx_f32_z (p0, z0), ++ z0 = svrintx_z (p0, z0)) ++ ++/* ++** rintx_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** frintx z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rintx_f32_z_untied, svfloat32_t, ++ z0 = svrintx_f32_z (p0, z1), ++ z0 = svrintx_z (p0, z1)) ++ ++/* ++** rintx_f32_x_tied1: ++** frintx z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rintx_f32_x_tied1, svfloat32_t, ++ z0 = svrintx_f32_x (p0, z0), ++ z0 = svrintx_x (p0, z0)) ++ ++/* ++** rintx_f32_x_untied: ++** frintx z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rintx_f32_x_untied, svfloat32_t, ++ z0 = svrintx_f32_x (p0, z1), ++ z0 = svrintx_x (p0, z1)) ++ ++/* ++** ptrue_rintx_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_rintx_f32_x_tied1, svfloat32_t, ++ z0 = svrintx_f32_x (svptrue_b32 (), z0), ++ z0 = svrintx_x (svptrue_b32 (), z0)) ++ ++/* ++** ptrue_rintx_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_rintx_f32_x_untied, svfloat32_t, ++ z0 = svrintx_f32_x (svptrue_b32 (), z1), ++ z0 = svrintx_x (svptrue_b32 (), z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintx_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintx_f64.c +new file mode 100644 +index 000000000..bee806b3b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintx_f64.c +@@ -0,0 +1,103 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rintx_f64_m_tied12: ++** frintx z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rintx_f64_m_tied12, svfloat64_t, ++ z0 = svrintx_f64_m (z0, p0, z0), ++ z0 = svrintx_m (z0, p0, z0)) ++ ++/* ++** rintx_f64_m_tied1: ++** frintx z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rintx_f64_m_tied1, svfloat64_t, ++ z0 = svrintx_f64_m (z0, p0, z1), ++ z0 = svrintx_m (z0, p0, z1)) ++ ++/* ++** rintx_f64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** frintx z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (rintx_f64_m_tied2, svfloat64_t, ++ z0 = svrintx_f64_m (z1, p0, z0), ++ z0 = svrintx_m (z1, p0, z0)) ++ ++/* ++** rintx_f64_m_untied: ++** movprfx z0, z2 ++** frintx z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rintx_f64_m_untied, svfloat64_t, ++ z0 = svrintx_f64_m (z2, p0, z1), ++ z0 = svrintx_m (z2, p0, z1)) ++ ++/* ++** rintx_f64_z_tied1: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** frintx z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (rintx_f64_z_tied1, svfloat64_t, ++ z0 = svrintx_f64_z (p0, z0), ++ z0 = svrintx_z (p0, z0)) ++ ++/* ++** rintx_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** frintx z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rintx_f64_z_untied, svfloat64_t, ++ z0 = svrintx_f64_z (p0, z1), ++ z0 = svrintx_z (p0, z1)) ++ ++/* ++** rintx_f64_x_tied1: ++** frintx z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rintx_f64_x_tied1, svfloat64_t, ++ z0 = svrintx_f64_x (p0, z0), ++ z0 = svrintx_x (p0, z0)) ++ ++/* ++** rintx_f64_x_untied: ++** frintx z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rintx_f64_x_untied, svfloat64_t, ++ z0 = svrintx_f64_x (p0, z1), ++ z0 = svrintx_x (p0, z1)) ++ ++/* ++** ptrue_rintx_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_rintx_f64_x_tied1, svfloat64_t, ++ z0 = svrintx_f64_x (svptrue_b64 (), z0), ++ z0 = svrintx_x (svptrue_b64 (), z0)) ++ ++/* ++** ptrue_rintx_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_rintx_f64_x_untied, svfloat64_t, ++ z0 = svrintx_f64_x (svptrue_b64 (), z1), ++ z0 = svrintx_x (svptrue_b64 (), z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintz_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintz_f16.c +new file mode 100644 +index 000000000..be13d82b4 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintz_f16.c +@@ -0,0 +1,103 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rintz_f16_m_tied12: ++** frintz z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rintz_f16_m_tied12, svfloat16_t, ++ z0 = svrintz_f16_m (z0, p0, z0), ++ z0 = svrintz_m (z0, p0, z0)) ++ ++/* ++** rintz_f16_m_tied1: ++** frintz z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rintz_f16_m_tied1, svfloat16_t, ++ z0 = svrintz_f16_m (z0, p0, z1), ++ z0 = svrintz_m (z0, p0, z1)) ++ ++/* ++** rintz_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** frintz z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rintz_f16_m_tied2, svfloat16_t, ++ z0 = svrintz_f16_m (z1, p0, z0), ++ z0 = svrintz_m (z1, p0, z0)) ++ ++/* ++** rintz_f16_m_untied: ++** movprfx z0, z2 ++** frintz z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rintz_f16_m_untied, svfloat16_t, ++ z0 = svrintz_f16_m (z2, p0, z1), ++ z0 = svrintz_m (z2, p0, z1)) ++ ++/* ++** rintz_f16_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.h, p0/z, \1\.h ++** frintz z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rintz_f16_z_tied1, svfloat16_t, ++ z0 = svrintz_f16_z (p0, z0), ++ z0 = svrintz_z (p0, z0)) ++ ++/* ++** rintz_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** frintz z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rintz_f16_z_untied, svfloat16_t, ++ z0 = svrintz_f16_z (p0, z1), ++ z0 = svrintz_z (p0, z1)) ++ ++/* ++** rintz_f16_x_tied1: ++** frintz z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rintz_f16_x_tied1, svfloat16_t, ++ z0 = svrintz_f16_x (p0, z0), ++ z0 = svrintz_x (p0, z0)) ++ ++/* ++** rintz_f16_x_untied: ++** frintz z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rintz_f16_x_untied, svfloat16_t, ++ z0 = svrintz_f16_x (p0, z1), ++ z0 = svrintz_x (p0, z1)) ++ ++/* ++** ptrue_rintz_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_rintz_f16_x_tied1, svfloat16_t, ++ z0 = svrintz_f16_x (svptrue_b16 (), z0), ++ z0 = svrintz_x (svptrue_b16 (), z0)) ++ ++/* ++** ptrue_rintz_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_rintz_f16_x_untied, svfloat16_t, ++ z0 = svrintz_f16_x (svptrue_b16 (), z1), ++ z0 = svrintz_x (svptrue_b16 (), z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintz_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintz_f32.c +new file mode 100644 +index 000000000..873c0d468 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintz_f32.c +@@ -0,0 +1,103 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rintz_f32_m_tied12: ++** frintz z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rintz_f32_m_tied12, svfloat32_t, ++ z0 = svrintz_f32_m (z0, p0, z0), ++ z0 = svrintz_m (z0, p0, z0)) ++ ++/* ++** rintz_f32_m_tied1: ++** frintz z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rintz_f32_m_tied1, svfloat32_t, ++ z0 = svrintz_f32_m (z0, p0, z1), ++ z0 = svrintz_m (z0, p0, z1)) ++ ++/* ++** rintz_f32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** frintz z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rintz_f32_m_tied2, svfloat32_t, ++ z0 = svrintz_f32_m (z1, p0, z0), ++ z0 = svrintz_m (z1, p0, z0)) ++ ++/* ++** rintz_f32_m_untied: ++** movprfx z0, z2 ++** frintz z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rintz_f32_m_untied, svfloat32_t, ++ z0 = svrintz_f32_m (z2, p0, z1), ++ z0 = svrintz_m (z2, p0, z1)) ++ ++/* ++** rintz_f32_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, \1\.s ++** frintz z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rintz_f32_z_tied1, svfloat32_t, ++ z0 = svrintz_f32_z (p0, z0), ++ z0 = svrintz_z (p0, z0)) ++ ++/* ++** rintz_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** frintz z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rintz_f32_z_untied, svfloat32_t, ++ z0 = svrintz_f32_z (p0, z1), ++ z0 = svrintz_z (p0, z1)) ++ ++/* ++** rintz_f32_x_tied1: ++** frintz z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rintz_f32_x_tied1, svfloat32_t, ++ z0 = svrintz_f32_x (p0, z0), ++ z0 = svrintz_x (p0, z0)) ++ ++/* ++** rintz_f32_x_untied: ++** frintz z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rintz_f32_x_untied, svfloat32_t, ++ z0 = svrintz_f32_x (p0, z1), ++ z0 = svrintz_x (p0, z1)) ++ ++/* ++** ptrue_rintz_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_rintz_f32_x_tied1, svfloat32_t, ++ z0 = svrintz_f32_x (svptrue_b32 (), z0), ++ z0 = svrintz_x (svptrue_b32 (), z0)) ++ ++/* ++** ptrue_rintz_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_rintz_f32_x_untied, svfloat32_t, ++ z0 = svrintz_f32_x (svptrue_b32 (), z1), ++ z0 = svrintz_x (svptrue_b32 (), z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintz_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintz_f64.c +new file mode 100644 +index 000000000..e6c9d1fc8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintz_f64.c +@@ -0,0 +1,103 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rintz_f64_m_tied12: ++** frintz z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rintz_f64_m_tied12, svfloat64_t, ++ z0 = svrintz_f64_m (z0, p0, z0), ++ z0 = svrintz_m (z0, p0, z0)) ++ ++/* ++** rintz_f64_m_tied1: ++** frintz z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rintz_f64_m_tied1, svfloat64_t, ++ z0 = svrintz_f64_m (z0, p0, z1), ++ z0 = svrintz_m (z0, p0, z1)) ++ ++/* ++** rintz_f64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** frintz z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (rintz_f64_m_tied2, svfloat64_t, ++ z0 = svrintz_f64_m (z1, p0, z0), ++ z0 = svrintz_m (z1, p0, z0)) ++ ++/* ++** rintz_f64_m_untied: ++** movprfx z0, z2 ++** frintz z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rintz_f64_m_untied, svfloat64_t, ++ z0 = svrintz_f64_m (z2, p0, z1), ++ z0 = svrintz_m (z2, p0, z1)) ++ ++/* ++** rintz_f64_z_tied1: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** frintz z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (rintz_f64_z_tied1, svfloat64_t, ++ z0 = svrintz_f64_z (p0, z0), ++ z0 = svrintz_z (p0, z0)) ++ ++/* ++** rintz_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** frintz z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rintz_f64_z_untied, svfloat64_t, ++ z0 = svrintz_f64_z (p0, z1), ++ z0 = svrintz_z (p0, z1)) ++ ++/* ++** rintz_f64_x_tied1: ++** frintz z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rintz_f64_x_tied1, svfloat64_t, ++ z0 = svrintz_f64_x (p0, z0), ++ z0 = svrintz_x (p0, z0)) ++ ++/* ++** rintz_f64_x_untied: ++** frintz z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rintz_f64_x_untied, svfloat64_t, ++ z0 = svrintz_f64_x (p0, z1), ++ z0 = svrintz_x (p0, z1)) ++ ++/* ++** ptrue_rintz_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_rintz_f64_x_tied1, svfloat64_t, ++ z0 = svrintz_f64_x (svptrue_b64 (), z0), ++ z0 = svrintz_x (svptrue_b64 (), z0)) ++ ++/* ++** ptrue_rintz_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_rintz_f64_x_untied, svfloat64_t, ++ z0 = svrintz_f64_x (svptrue_b64 (), z1), ++ z0 = svrintz_x (svptrue_b64 (), z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rsqrte_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rsqrte_f16.c +new file mode 100644 +index 000000000..adfdc2b9c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rsqrte_f16.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rsqrte_f16_tied1: ++** frsqrte z0\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rsqrte_f16_tied1, svfloat16_t, ++ z0 = svrsqrte_f16 (z0), ++ z0 = svrsqrte (z0)) ++ ++/* ++** rsqrte_f16_untied: ++** frsqrte z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rsqrte_f16_untied, svfloat16_t, ++ z0 = svrsqrte_f16 (z1), ++ z0 = svrsqrte (z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rsqrte_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rsqrte_f32.c +new file mode 100644 +index 000000000..fd938ebdf +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rsqrte_f32.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rsqrte_f32_tied1: ++** frsqrte z0\.s, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rsqrte_f32_tied1, svfloat32_t, ++ z0 = svrsqrte_f32 (z0), ++ z0 = svrsqrte (z0)) ++ ++/* ++** rsqrte_f32_untied: ++** frsqrte z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rsqrte_f32_untied, svfloat32_t, ++ z0 = svrsqrte_f32 (z1), ++ z0 = svrsqrte (z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rsqrte_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rsqrte_f64.c +new file mode 100644 +index 000000000..3ac0f4053 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rsqrte_f64.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rsqrte_f64_tied1: ++** frsqrte z0\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rsqrte_f64_tied1, svfloat64_t, ++ z0 = svrsqrte_f64 (z0), ++ z0 = svrsqrte (z0)) ++ ++/* ++** rsqrte_f64_untied: ++** frsqrte z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rsqrte_f64_untied, svfloat64_t, ++ z0 = svrsqrte_f64 (z1), ++ z0 = svrsqrte (z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rsqrts_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rsqrts_f16.c +new file mode 100644 +index 000000000..2d88be3d6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rsqrts_f16.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rsqrts_f16_tied1: ++** frsqrts z0\.h, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rsqrts_f16_tied1, svfloat16_t, ++ z0 = svrsqrts_f16 (z0, z1), ++ z0 = svrsqrts (z0, z1)) ++ ++/* ++** rsqrts_f16_tied2: ++** frsqrts z0\.h, z1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rsqrts_f16_tied2, svfloat16_t, ++ z0 = svrsqrts_f16 (z1, z0), ++ z0 = svrsqrts (z1, z0)) ++ ++/* ++** rsqrts_f16_untied: ++** frsqrts z0\.h, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (rsqrts_f16_untied, svfloat16_t, ++ z0 = svrsqrts_f16 (z1, z2), ++ z0 = svrsqrts (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rsqrts_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rsqrts_f32.c +new file mode 100644 +index 000000000..cd76aef4d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rsqrts_f32.c +@@ -0,0 +1,30 
@@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rsqrts_f32_tied1: ++** frsqrts z0\.s, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rsqrts_f32_tied1, svfloat32_t, ++ z0 = svrsqrts_f32 (z0, z1), ++ z0 = svrsqrts (z0, z1)) ++ ++/* ++** rsqrts_f32_tied2: ++** frsqrts z0\.s, z1\.s, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rsqrts_f32_tied2, svfloat32_t, ++ z0 = svrsqrts_f32 (z1, z0), ++ z0 = svrsqrts (z1, z0)) ++ ++/* ++** rsqrts_f32_untied: ++** frsqrts z0\.s, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (rsqrts_f32_untied, svfloat32_t, ++ z0 = svrsqrts_f32 (z1, z2), ++ z0 = svrsqrts (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rsqrts_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rsqrts_f64.c +new file mode 100644 +index 000000000..e72a82fcb +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rsqrts_f64.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** rsqrts_f64_tied1: ++** frsqrts z0\.d, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rsqrts_f64_tied1, svfloat64_t, ++ z0 = svrsqrts_f64 (z0, z1), ++ z0 = svrsqrts (z0, z1)) ++ ++/* ++** rsqrts_f64_tied2: ++** frsqrts z0\.d, z1\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rsqrts_f64_tied2, svfloat64_t, ++ z0 = svrsqrts_f64 (z1, z0), ++ z0 = svrsqrts (z1, z0)) ++ ++/* ++** rsqrts_f64_untied: ++** frsqrts z0\.d, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (rsqrts_f64_untied, svfloat64_t, ++ z0 = svrsqrts_f64 (z1, z2), ++ z0 = svrsqrts (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/scale_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/scale_f16.c +new file mode 100644 +index 000000000..9c554255b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/scale_f16.c +@@ -0,0 +1,330 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** scale_f16_m_tied1: ++** fscale z0\.h, p0/m, z0\.h, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (scale_f16_m_tied1, svfloat16_t, svint16_t, ++ z0 = svscale_f16_m (p0, z0, z4), ++ z0 = svscale_m (p0, z0, z4)) ++ ++/* ++** scale_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** fscale z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (scale_f16_m_tied2, svfloat16_t, svint16_t, ++ z0_res = svscale_f16_m (p0, z4, z0), ++ z0_res = svscale_m (p0, z4, z0)) ++ ++/* ++** scale_f16_m_untied: ++** movprfx z0, z1 ++** fscale z0\.h, p0/m, z0\.h, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (scale_f16_m_untied, svfloat16_t, svint16_t, ++ z0 = svscale_f16_m (p0, z1, z4), ++ z0 = svscale_m (p0, z1, z4)) ++ ++/* ++** scale_w0_f16_m_tied1: ++** mov (z[0-9]+\.h), w0 ++** fscale z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (scale_w0_f16_m_tied1, svfloat16_t, int16_t, ++ z0 = svscale_n_f16_m (p0, z0, x0), ++ z0 = svscale_m (p0, z0, x0)) ++ ++/* ++** scale_w0_f16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), w0 ++** movprfx z0, z1 ++** fscale z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (scale_w0_f16_m_untied, svfloat16_t, int16_t, ++ z0 = svscale_n_f16_m (p0, z1, x0), ++ z0 = svscale_m (p0, z1, x0)) ++ ++/* ++** scale_3_f16_m_tied1: ++** mov (z[0-9]+\.h), #3 ++** fscale z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (scale_3_f16_m_tied1, svfloat16_t, ++ z0 = svscale_n_f16_m (p0, z0, 3), ++ z0 = svscale_m (p0, z0, 3)) ++ ++/* ++** scale_3_f16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), #3 ++** movprfx z0, z1 ++** 
fscale z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (scale_3_f16_m_untied, svfloat16_t, ++ z0 = svscale_n_f16_m (p0, z1, 3), ++ z0 = svscale_m (p0, z1, 3)) ++ ++/* ++** scale_m3_f16_m: ++** mov (z[0-9]+\.h), #-3 ++** fscale z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (scale_m3_f16_m, svfloat16_t, ++ z0 = svscale_n_f16_m (p0, z0, -3), ++ z0 = svscale_m (p0, z0, -3)) ++ ++/* ++** scale_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fscale z0\.h, p0/m, z0\.h, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (scale_f16_z_tied1, svfloat16_t, svint16_t, ++ z0 = svscale_f16_z (p0, z0, z4), ++ z0 = svscale_z (p0, z0, z4)) ++ ++/* ++** scale_f16_z_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.h, p0/z, z4\.h ++** fscale z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (scale_f16_z_tied2, svfloat16_t, svint16_t, ++ z0_res = svscale_f16_z (p0, z4, z0), ++ z0_res = svscale_z (p0, z4, z0)) ++ ++/* ++** scale_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** fscale z0\.h, p0/m, z0\.h, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (scale_f16_z_untied, svfloat16_t, svint16_t, ++ z0 = svscale_f16_z (p0, z1, z4), ++ z0 = svscale_z (p0, z1, z4)) ++ ++/* ++** scale_w0_f16_z_tied1: ++** mov (z[0-9]+\.h), w0 ++** movprfx z0\.h, p0/z, z0\.h ++** fscale z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (scale_w0_f16_z_tied1, svfloat16_t, int16_t, ++ z0 = svscale_n_f16_z (p0, z0, x0), ++ z0 = svscale_z (p0, z0, x0)) ++ ++/* ++** scale_w0_f16_z_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), w0 ++** movprfx z0\.h, p0/z, z1\.h ++** fscale z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (scale_w0_f16_z_untied, svfloat16_t, int16_t, ++ z0 = svscale_n_f16_z (p0, z1, x0), ++ z0 = svscale_z (p0, z1, x0)) ++ ++/* ++** scale_3_f16_z_tied1: ++** mov (z[0-9]+\.h), #3 ++** movprfx z0\.h, p0/z, z0\.h ++** fscale z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (scale_3_f16_z_tied1, svfloat16_t, ++ z0 = svscale_n_f16_z (p0, z0, 3), ++ z0 = svscale_z (p0, z0, 3)) ++ ++/* ++** scale_3_f16_z_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), #3 ++** movprfx z0\.h, p0/z, z1\.h ++** fscale z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (scale_3_f16_z_untied, svfloat16_t, ++ z0 = svscale_n_f16_z (p0, z1, 3), ++ z0 = svscale_z (p0, z1, 3)) ++ ++/* ++** scale_m3_f16_z: ++** mov (z[0-9]+\.h), #-3 ++** movprfx z0\.h, p0/z, z0\.h ++** fscale z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (scale_m3_f16_z, svfloat16_t, ++ z0 = svscale_n_f16_z (p0, z0, -3), ++ z0 = svscale_z (p0, z0, -3)) ++ ++/* ++** scale_f16_x_tied1: ++** fscale z0\.h, p0/m, z0\.h, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (scale_f16_x_tied1, svfloat16_t, svint16_t, ++ z0 = svscale_f16_x (p0, z0, z4), ++ z0 = svscale_x (p0, z0, z4)) ++ ++/* ++** scale_f16_x_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** fscale z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (scale_f16_x_tied2, svfloat16_t, svint16_t, ++ z0_res = svscale_f16_x (p0, z4, z0), ++ z0_res = svscale_x (p0, z4, z0)) ++ ++/* ++** scale_f16_x_untied: ++** movprfx z0, z1 ++** fscale z0\.h, p0/m, z0\.h, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (scale_f16_x_untied, svfloat16_t, svint16_t, ++ z0 = svscale_f16_x (p0, z1, z4), ++ z0 = svscale_x (p0, z1, z4)) ++ ++/* ++** scale_w0_f16_x_tied1: ++** mov (z[0-9]+\.h), w0 ++** fscale z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (scale_w0_f16_x_tied1, svfloat16_t, int16_t, ++ z0 = svscale_n_f16_x (p0, z0, x0), ++ z0 = svscale_x (p0, z0, x0)) ++ ++/* ++** scale_w0_f16_x_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), w0 
++** movprfx z0, z1 ++** fscale z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (scale_w0_f16_x_untied, svfloat16_t, int16_t, ++ z0 = svscale_n_f16_x (p0, z1, x0), ++ z0 = svscale_x (p0, z1, x0)) ++ ++/* ++** scale_3_f16_x_tied1: ++** mov (z[0-9]+\.h), #3 ++** fscale z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (scale_3_f16_x_tied1, svfloat16_t, ++ z0 = svscale_n_f16_x (p0, z0, 3), ++ z0 = svscale_x (p0, z0, 3)) ++ ++/* ++** scale_3_f16_x_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), #3 ++** movprfx z0, z1 ++** fscale z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (scale_3_f16_x_untied, svfloat16_t, ++ z0 = svscale_n_f16_x (p0, z1, 3), ++ z0 = svscale_x (p0, z1, 3)) ++ ++/* ++** scale_m3_f16_x: ++** mov (z[0-9]+\.h), #-3 ++** fscale z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (scale_m3_f16_x, svfloat16_t, ++ z0 = svscale_n_f16_x (p0, z0, -3), ++ z0 = svscale_x (p0, z0, -3)) ++ ++/* ++** ptrue_scale_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z (ptrue_scale_f16_x_tied1, svfloat16_t, svint16_t, ++ z0 = svscale_f16_x (svptrue_b16 (), z0, z4), ++ z0 = svscale_x (svptrue_b16 (), z0, z4)) ++ ++/* ++** ptrue_scale_f16_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z_REV (ptrue_scale_f16_x_tied2, svfloat16_t, svint16_t, ++ z0_res = svscale_f16_x (svptrue_b16 (), z4, z0), ++ z0_res = svscale_x (svptrue_b16 (), z4, z0)) ++ ++/* ++** ptrue_scale_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z (ptrue_scale_f16_x_untied, svfloat16_t, svint16_t, ++ z0 = svscale_f16_x (svptrue_b16 (), z1, z4), ++ z0 = svscale_x (svptrue_b16 (), z1, z4)) ++ ++/* ++** ptrue_scale_3_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_scale_3_f16_x_tied1, svfloat16_t, ++ z0 = svscale_n_f16_x (svptrue_b16 (), z0, 3), ++ z0 = svscale_x (svptrue_b16 (), z0, 3)) ++ ++/* ++** ptrue_scale_3_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_scale_3_f16_x_untied, svfloat16_t, ++ z0 = svscale_n_f16_x (svptrue_b16 (), z1, 3), ++ z0 = svscale_x (svptrue_b16 (), z1, 3)) ++ ++/* ++** ptrue_scale_m3_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_scale_m3_f16_x_tied1, svfloat16_t, ++ z0 = svscale_n_f16_x (svptrue_b16 (), z0, -3), ++ z0 = svscale_x (svptrue_b16 (), z0, -3)) ++ ++/* ++** ptrue_scale_m3_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_scale_m3_f16_x_untied, svfloat16_t, ++ z0 = svscale_n_f16_x (svptrue_b16 (), z1, -3), ++ z0 = svscale_x (svptrue_b16 (), z1, -3)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/scale_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/scale_f32.c +new file mode 100644 +index 000000000..747f8a639 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/scale_f32.c +@@ -0,0 +1,330 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** scale_f32_m_tied1: ++** fscale z0\.s, p0/m, z0\.s, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (scale_f32_m_tied1, svfloat32_t, svint32_t, ++ z0 = svscale_f32_m (p0, z0, z4), ++ z0 = svscale_m (p0, z0, z4)) ++ ++/* ++** scale_f32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** fscale z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (scale_f32_m_tied2, svfloat32_t, svint32_t, ++ z0_res = svscale_f32_m (p0, z4, z0), ++ z0_res = svscale_m (p0, z4, z0)) ++ ++/* ++** scale_f32_m_untied: ++** movprfx z0, z1 ++** fscale z0\.s, p0/m, z0\.s, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (scale_f32_m_untied, svfloat32_t, svint32_t, ++ z0 = svscale_f32_m (p0, z1, z4), ++ z0 = svscale_m (p0, z1, z4)) ++ ++/* ++** scale_w0_f32_m_tied1: ++** mov (z[0-9]+\.s), w0 ++** fscale z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (scale_w0_f32_m_tied1, svfloat32_t, int32_t, ++ z0 = svscale_n_f32_m (p0, z0, x0), ++ z0 = svscale_m (p0, z0, x0)) ++ ++/* ++** scale_w0_f32_m_untied: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0, z1 ++** fscale z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (scale_w0_f32_m_untied, svfloat32_t, int32_t, ++ z0 = svscale_n_f32_m (p0, z1, x0), ++ z0 = svscale_m (p0, z1, x0)) ++ ++/* ++** scale_3_f32_m_tied1: ++** mov (z[0-9]+\.s), #3 ++** fscale z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (scale_3_f32_m_tied1, svfloat32_t, ++ z0 = svscale_n_f32_m (p0, z0, 3), ++ z0 = svscale_m (p0, z0, 3)) ++ ++/* ++** scale_3_f32_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.s), #3 ++** movprfx z0, z1 ++** fscale z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (scale_3_f32_m_untied, svfloat32_t, ++ z0 = svscale_n_f32_m (p0, z1, 3), ++ z0 = svscale_m (p0, z1, 3)) ++ ++/* ++** scale_m3_f32_m: ++** mov (z[0-9]+\.s), #-3 ++** fscale z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (scale_m3_f32_m, svfloat32_t, ++ z0 = svscale_n_f32_m (p0, z0, -3), ++ z0 = svscale_m (p0, z0, -3)) ++ ++/* ++** scale_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fscale z0\.s, p0/m, z0\.s, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (scale_f32_z_tied1, svfloat32_t, svint32_t, ++ z0 = svscale_f32_z (p0, z0, z4), ++ z0 = svscale_z (p0, z0, z4)) ++ ++/* ++** scale_f32_z_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, z4\.s ++** fscale z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (scale_f32_z_tied2, svfloat32_t, svint32_t, ++ z0_res = svscale_f32_z (p0, z4, z0), ++ z0_res = svscale_z (p0, z4, z0)) ++ ++/* ++** scale_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** fscale z0\.s, p0/m, z0\.s, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (scale_f32_z_untied, svfloat32_t, svint32_t, ++ z0 = svscale_f32_z (p0, z1, z4), ++ z0 = svscale_z (p0, z1, z4)) ++ ++/* ++** scale_w0_f32_z_tied1: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0\.s, p0/z, z0\.s ++** fscale z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (scale_w0_f32_z_tied1, svfloat32_t, int32_t, ++ z0 = svscale_n_f32_z (p0, z0, x0), ++ z0 = svscale_z (p0, z0, x0)) ++ ++/* ++** 
scale_w0_f32_z_untied: { xfail *-*-* } ++** mov (z[0-9]+\.s), w0 ++** movprfx z0\.s, p0/z, z1\.s ++** fscale z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (scale_w0_f32_z_untied, svfloat32_t, int32_t, ++ z0 = svscale_n_f32_z (p0, z1, x0), ++ z0 = svscale_z (p0, z1, x0)) ++ ++/* ++** scale_3_f32_z_tied1: ++** mov (z[0-9]+\.s), #3 ++** movprfx z0\.s, p0/z, z0\.s ++** fscale z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (scale_3_f32_z_tied1, svfloat32_t, ++ z0 = svscale_n_f32_z (p0, z0, 3), ++ z0 = svscale_z (p0, z0, 3)) ++ ++/* ++** scale_3_f32_z_untied: { xfail *-*-* } ++** mov (z[0-9]+\.s), #3 ++** movprfx z0\.s, p0/z, z1\.s ++** fscale z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (scale_3_f32_z_untied, svfloat32_t, ++ z0 = svscale_n_f32_z (p0, z1, 3), ++ z0 = svscale_z (p0, z1, 3)) ++ ++/* ++** scale_m3_f32_z: ++** mov (z[0-9]+\.s), #-3 ++** movprfx z0\.s, p0/z, z0\.s ++** fscale z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (scale_m3_f32_z, svfloat32_t, ++ z0 = svscale_n_f32_z (p0, z0, -3), ++ z0 = svscale_z (p0, z0, -3)) ++ ++/* ++** scale_f32_x_tied1: ++** fscale z0\.s, p0/m, z0\.s, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (scale_f32_x_tied1, svfloat32_t, svint32_t, ++ z0 = svscale_f32_x (p0, z0, z4), ++ z0 = svscale_x (p0, z0, z4)) ++ ++/* ++** scale_f32_x_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** fscale z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (scale_f32_x_tied2, svfloat32_t, svint32_t, ++ z0_res = svscale_f32_x (p0, z4, z0), ++ z0_res = svscale_x (p0, z4, z0)) ++ ++/* ++** scale_f32_x_untied: ++** movprfx z0, z1 ++** fscale z0\.s, p0/m, z0\.s, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (scale_f32_x_untied, svfloat32_t, svint32_t, ++ z0 = svscale_f32_x (p0, z1, z4), ++ z0 = svscale_x (p0, z1, z4)) ++ ++/* ++** scale_w0_f32_x_tied1: ++** mov (z[0-9]+\.s), w0 ++** fscale z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (scale_w0_f32_x_tied1, svfloat32_t, int32_t, ++ z0 = svscale_n_f32_x (p0, z0, x0), ++ z0 = svscale_x (p0, z0, x0)) ++ ++/* ++** scale_w0_f32_x_untied: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0, z1 ++** fscale z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (scale_w0_f32_x_untied, svfloat32_t, int32_t, ++ z0 = svscale_n_f32_x (p0, z1, x0), ++ z0 = svscale_x (p0, z1, x0)) ++ ++/* ++** scale_3_f32_x_tied1: ++** mov (z[0-9]+\.s), #3 ++** fscale z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (scale_3_f32_x_tied1, svfloat32_t, ++ z0 = svscale_n_f32_x (p0, z0, 3), ++ z0 = svscale_x (p0, z0, 3)) ++ ++/* ++** scale_3_f32_x_untied: { xfail *-*-* } ++** mov (z[0-9]+\.s), #3 ++** movprfx z0, z1 ++** fscale z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (scale_3_f32_x_untied, svfloat32_t, ++ z0 = svscale_n_f32_x (p0, z1, 3), ++ z0 = svscale_x (p0, z1, 3)) ++ ++/* ++** scale_m3_f32_x: ++** mov (z[0-9]+\.s), #-3 ++** fscale z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (scale_m3_f32_x, svfloat32_t, ++ z0 = svscale_n_f32_x (p0, z0, -3), ++ z0 = svscale_x (p0, z0, -3)) ++ ++/* ++** ptrue_scale_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z (ptrue_scale_f32_x_tied1, svfloat32_t, svint32_t, ++ z0 = svscale_f32_x (svptrue_b32 (), z0, z4), ++ z0 = svscale_x (svptrue_b32 (), z0, z4)) ++ ++/* ++** ptrue_scale_f32_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_DUAL_Z_REV (ptrue_scale_f32_x_tied2, svfloat32_t, svint32_t, ++ z0_res = svscale_f32_x (svptrue_b32 (), z4, z0), ++ z0_res = svscale_x (svptrue_b32 (), z4, z0)) ++ ++/* ++** ptrue_scale_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z (ptrue_scale_f32_x_untied, svfloat32_t, svint32_t, ++ z0 = svscale_f32_x (svptrue_b32 (), z1, z4), ++ z0 = svscale_x (svptrue_b32 (), z1, z4)) ++ ++/* ++** ptrue_scale_3_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_scale_3_f32_x_tied1, svfloat32_t, ++ z0 = svscale_n_f32_x (svptrue_b32 (), z0, 3), ++ z0 = svscale_x (svptrue_b32 (), z0, 3)) ++ ++/* ++** ptrue_scale_3_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_scale_3_f32_x_untied, svfloat32_t, ++ z0 = svscale_n_f32_x (svptrue_b32 (), z1, 3), ++ z0 = svscale_x (svptrue_b32 (), z1, 3)) ++ ++/* ++** ptrue_scale_m3_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_scale_m3_f32_x_tied1, svfloat32_t, ++ z0 = svscale_n_f32_x (svptrue_b32 (), z0, -3), ++ z0 = svscale_x (svptrue_b32 (), z0, -3)) ++ ++/* ++** ptrue_scale_m3_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_scale_m3_f32_x_untied, svfloat32_t, ++ z0 = svscale_n_f32_x (svptrue_b32 (), z1, -3), ++ z0 = svscale_x (svptrue_b32 (), z1, -3)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/scale_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/scale_f64.c +new file mode 100644 +index 000000000..004cbfa3e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/scale_f64.c +@@ -0,0 +1,330 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** scale_f64_m_tied1: ++** fscale z0\.d, p0/m, z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (scale_f64_m_tied1, svfloat64_t, svint64_t, ++ z0 = svscale_f64_m (p0, z0, z4), ++ z0 = svscale_m (p0, z0, z4)) ++ ++/* ++** scale_f64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z4 ++** fscale z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (scale_f64_m_tied2, svfloat64_t, svint64_t, ++ z0_res = svscale_f64_m (p0, z4, z0), ++ z0_res = svscale_m (p0, z4, z0)) ++ ++/* ++** scale_f64_m_untied: ++** movprfx z0, z1 ++** fscale z0\.d, p0/m, z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (scale_f64_m_untied, svfloat64_t, svint64_t, ++ z0 = svscale_f64_m (p0, z1, z4), ++ z0 = svscale_m (p0, z1, z4)) ++ ++/* ++** scale_x0_f64_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** fscale z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (scale_x0_f64_m_tied1, svfloat64_t, int64_t, ++ z0 = svscale_n_f64_m (p0, z0, x0), ++ z0 = svscale_m (p0, z0, x0)) ++ ++/* ++** scale_x0_f64_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** fscale z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (scale_x0_f64_m_untied, svfloat64_t, int64_t, ++ z0 = svscale_n_f64_m (p0, z1, x0), ++ z0 = svscale_m (p0, z1, x0)) ++ ++/* ++** scale_3_f64_m_tied1: ++** mov (z[0-9]+\.d), #3 ++** fscale z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (scale_3_f64_m_tied1, svfloat64_t, ++ z0 = svscale_n_f64_m (p0, z0, 3), ++ z0 = svscale_m (p0, z0, 3)) ++ ++/* ++** scale_3_f64_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #3 ++** movprfx z0, z1 ++** fscale z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (scale_3_f64_m_untied, svfloat64_t, ++ z0 = svscale_n_f64_m (p0, z1, 3), ++ z0 = svscale_m (p0, z1, 3)) ++ ++/* 
++** scale_m3_f64_m: ++** mov (z[0-9]+\.d), #-3 ++** fscale z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (scale_m3_f64_m, svfloat64_t, ++ z0 = svscale_n_f64_m (p0, z0, -3), ++ z0 = svscale_m (p0, z0, -3)) ++ ++/* ++** scale_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fscale z0\.d, p0/m, z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (scale_f64_z_tied1, svfloat64_t, svint64_t, ++ z0 = svscale_f64_z (p0, z0, z4), ++ z0 = svscale_z (p0, z0, z4)) ++ ++/* ++** scale_f64_z_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, z4\.d ++** fscale z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (scale_f64_z_tied2, svfloat64_t, svint64_t, ++ z0_res = svscale_f64_z (p0, z4, z0), ++ z0_res = svscale_z (p0, z4, z0)) ++ ++/* ++** scale_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** fscale z0\.d, p0/m, z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (scale_f64_z_untied, svfloat64_t, svint64_t, ++ z0 = svscale_f64_z (p0, z1, z4), ++ z0 = svscale_z (p0, z1, z4)) ++ ++/* ++** scale_x0_f64_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z0\.d ++** fscale z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (scale_x0_f64_z_tied1, svfloat64_t, int64_t, ++ z0 = svscale_n_f64_z (p0, z0, x0), ++ z0 = svscale_z (p0, z0, x0)) ++ ++/* ++** scale_x0_f64_z_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z1\.d ++** fscale z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (scale_x0_f64_z_untied, svfloat64_t, int64_t, ++ z0 = svscale_n_f64_z (p0, z1, x0), ++ z0 = svscale_z (p0, z1, x0)) ++ ++/* ++** scale_3_f64_z_tied1: ++** mov (z[0-9]+\.d), #3 ++** movprfx z0\.d, p0/z, z0\.d ++** fscale z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (scale_3_f64_z_tied1, svfloat64_t, ++ z0 = svscale_n_f64_z (p0, z0, 3), ++ z0 = svscale_z (p0, z0, 3)) ++ ++/* ++** scale_3_f64_z_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #3 ++** movprfx z0\.d, p0/z, z1\.d ++** fscale z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (scale_3_f64_z_untied, svfloat64_t, ++ z0 = svscale_n_f64_z (p0, z1, 3), ++ z0 = svscale_z (p0, z1, 3)) ++ ++/* ++** scale_m3_f64_z: ++** mov (z[0-9]+\.d), #-3 ++** movprfx z0\.d, p0/z, z0\.d ++** fscale z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (scale_m3_f64_z, svfloat64_t, ++ z0 = svscale_n_f64_z (p0, z0, -3), ++ z0 = svscale_z (p0, z0, -3)) ++ ++/* ++** scale_f64_x_tied1: ++** fscale z0\.d, p0/m, z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (scale_f64_x_tied1, svfloat64_t, svint64_t, ++ z0 = svscale_f64_x (p0, z0, z4), ++ z0 = svscale_x (p0, z0, z4)) ++ ++/* ++** scale_f64_x_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z4 ++** fscale z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_DUAL_Z_REV (scale_f64_x_tied2, svfloat64_t, svint64_t, ++ z0_res = svscale_f64_x (p0, z4, z0), ++ z0_res = svscale_x (p0, z4, z0)) ++ ++/* ++** scale_f64_x_untied: ++** movprfx z0, z1 ++** fscale z0\.d, p0/m, z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (scale_f64_x_untied, svfloat64_t, svint64_t, ++ z0 = svscale_f64_x (p0, z1, z4), ++ z0 = svscale_x (p0, z1, z4)) ++ ++/* ++** scale_x0_f64_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** fscale z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (scale_x0_f64_x_tied1, svfloat64_t, int64_t, ++ z0 = svscale_n_f64_x (p0, z0, x0), ++ z0 = svscale_x (p0, z0, x0)) ++ ++/* ++** scale_x0_f64_x_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** fscale z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (scale_x0_f64_x_untied, svfloat64_t, int64_t, ++ z0 = svscale_n_f64_x (p0, z1, x0), ++ z0 = svscale_x (p0, z1, 
x0)) ++ ++/* ++** scale_3_f64_x_tied1: ++** mov (z[0-9]+\.d), #3 ++** fscale z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (scale_3_f64_x_tied1, svfloat64_t, ++ z0 = svscale_n_f64_x (p0, z0, 3), ++ z0 = svscale_x (p0, z0, 3)) ++ ++/* ++** scale_3_f64_x_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #3 ++** movprfx z0, z1 ++** fscale z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (scale_3_f64_x_untied, svfloat64_t, ++ z0 = svscale_n_f64_x (p0, z1, 3), ++ z0 = svscale_x (p0, z1, 3)) ++ ++/* ++** scale_m3_f64_x: ++** mov (z[0-9]+\.d), #-3 ++** fscale z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (scale_m3_f64_x, svfloat64_t, ++ z0 = svscale_n_f64_x (p0, z0, -3), ++ z0 = svscale_x (p0, z0, -3)) ++ ++/* ++** ptrue_scale_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z (ptrue_scale_f64_x_tied1, svfloat64_t, svint64_t, ++ z0 = svscale_f64_x (svptrue_b64 (), z0, z4), ++ z0 = svscale_x (svptrue_b64 (), z0, z4)) ++ ++/* ++** ptrue_scale_f64_x_tied2: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z_REV (ptrue_scale_f64_x_tied2, svfloat64_t, svint64_t, ++ z0_res = svscale_f64_x (svptrue_b64 (), z4, z0), ++ z0_res = svscale_x (svptrue_b64 (), z4, z0)) ++ ++/* ++** ptrue_scale_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_DUAL_Z (ptrue_scale_f64_x_untied, svfloat64_t, svint64_t, ++ z0 = svscale_f64_x (svptrue_b64 (), z1, z4), ++ z0 = svscale_x (svptrue_b64 (), z1, z4)) ++ ++/* ++** ptrue_scale_3_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_scale_3_f64_x_tied1, svfloat64_t, ++ z0 = svscale_n_f64_x (svptrue_b64 (), z0, 3), ++ z0 = svscale_x (svptrue_b64 (), z0, 3)) ++ ++/* ++** ptrue_scale_3_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_scale_3_f64_x_untied, svfloat64_t, ++ z0 = svscale_n_f64_x (svptrue_b64 (), z1, 3), ++ z0 = svscale_x (svptrue_b64 (), z1, 3)) ++ ++/* ++** ptrue_scale_m3_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_scale_m3_f64_x_tied1, svfloat64_t, ++ z0 = svscale_n_f64_x (svptrue_b64 (), z0, -3), ++ z0 = svscale_x (svptrue_b64 (), z0, -3)) ++ ++/* ++** ptrue_scale_m3_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_scale_m3_f64_x_untied, svfloat64_t, ++ z0 = svscale_n_f64_x (svptrue_b64 (), z1, -3), ++ z0 = svscale_x (svptrue_b64 (), z1, -3)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_b.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_b.c +new file mode 100644 +index 000000000..a135e9c99 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_b.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** sel_b_tied1: ++** sel p0\.b, p3, p0\.b, p1\.b ++** ret ++*/ ++TEST_UNIFORM_P (sel_b_tied1, ++ p0 = svsel_b (p3, p0, p1), ++ p0 = svsel (p3, p0, p1)) ++ ++/* ++** sel_b_tied2: ++** sel p0\.b, p3, p1\.b, p0\.b ++** ret ++*/ ++TEST_UNIFORM_P (sel_b_tied2, ++ p0 = svsel_b (p3, p1, p0), ++ p0 = svsel (p3, p1, p0)) ++ ++/* ++** sel_b_untied: ++** sel p0\.b, p3, p1\.b, p2\.b ++** ret ++*/ ++TEST_UNIFORM_P (sel_b_untied, ++ p0 = svsel_b (p3, p1, p2), ++ p0 = svsel (p3, p1, p2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_bf16.c +new file mode 100644 +index 000000000..44636d8f8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_bf16.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** sel_bf16_tied1: ++** sel z0\.h, p0, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sel_bf16_tied1, svbfloat16_t, ++ z0 = svsel_bf16 (p0, z0, z1), ++ z0 = svsel (p0, z0, z1)) ++ ++/* ++** sel_bf16_tied2: ++** sel z0\.h, p0, z1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sel_bf16_tied2, svbfloat16_t, ++ z0 = svsel_bf16 (p0, z1, z0), ++ z0 = svsel (p0, z1, z0)) ++ ++/* ++** sel_bf16_untied: ++** sel z0\.h, p0, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sel_bf16_untied, svbfloat16_t, ++ z0 = svsel_bf16 (p0, z1, z2), ++ z0 = svsel (p0, z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_f16.c +new file mode 100644 +index 000000000..35750ea81 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_f16.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** sel_f16_tied1: ++** sel z0\.h, p0, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sel_f16_tied1, svfloat16_t, ++ z0 = svsel_f16 (p0, z0, z1), ++ z0 = svsel (p0, z0, z1)) ++ ++/* ++** sel_f16_tied2: ++** sel z0\.h, p0, z1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sel_f16_tied2, svfloat16_t, ++ z0 = svsel_f16 (p0, z1, z0), ++ z0 = svsel (p0, z1, z0)) ++ ++/* ++** sel_f16_untied: ++** sel z0\.h, p0, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sel_f16_untied, svfloat16_t, ++ z0 = svsel_f16 (p0, z1, z2), ++ z0 = svsel (p0, z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_f32.c +new file mode 100644 +index 000000000..639a84724 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_f32.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** sel_f32_tied1: ++** sel z0\.s, p0, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sel_f32_tied1, svfloat32_t, ++ z0 = svsel_f32 (p0, z0, z1), ++ z0 = svsel (p0, z0, z1)) ++ ++/* ++** sel_f32_tied2: ++** sel z0\.s, p0, z1\.s, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sel_f32_tied2, 
svfloat32_t, ++ z0 = svsel_f32 (p0, z1, z0), ++ z0 = svsel (p0, z1, z0)) ++ ++/* ++** sel_f32_untied: ++** sel z0\.s, p0, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sel_f32_untied, svfloat32_t, ++ z0 = svsel_f32 (p0, z1, z2), ++ z0 = svsel (p0, z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_f64.c +new file mode 100644 +index 000000000..048d6e52a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_f64.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** sel_f64_tied1: ++** sel z0\.d, p0, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (sel_f64_tied1, svfloat64_t, ++ z0 = svsel_f64 (p0, z0, z1), ++ z0 = svsel (p0, z0, z1)) ++ ++/* ++** sel_f64_tied2: ++** sel z0\.d, p0, z1\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (sel_f64_tied2, svfloat64_t, ++ z0 = svsel_f64 (p0, z1, z0), ++ z0 = svsel (p0, z1, z0)) ++ ++/* ++** sel_f64_untied: ++** sel z0\.d, p0, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (sel_f64_untied, svfloat64_t, ++ z0 = svsel_f64 (p0, z1, z2), ++ z0 = svsel (p0, z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_s16.c +new file mode 100644 +index 000000000..e162da499 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_s16.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** sel_s16_tied1: ++** sel z0\.h, p0, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sel_s16_tied1, svint16_t, ++ z0 = svsel_s16 (p0, z0, z1), ++ z0 = svsel (p0, z0, z1)) ++ ++/* ++** sel_s16_tied2: ++** sel z0\.h, p0, z1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sel_s16_tied2, svint16_t, ++ z0 = svsel_s16 (p0, z1, z0), ++ z0 = svsel (p0, z1, z0)) ++ ++/* ++** sel_s16_untied: ++** sel z0\.h, p0, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sel_s16_untied, svint16_t, ++ z0 = svsel_s16 (p0, z1, z2), ++ z0 = svsel (p0, z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_s32.c +new file mode 100644 +index 000000000..80839d803 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_s32.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** sel_s32_tied1: ++** sel z0\.s, p0, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sel_s32_tied1, svint32_t, ++ z0 = svsel_s32 (p0, z0, z1), ++ z0 = svsel (p0, z0, z1)) ++ ++/* ++** sel_s32_tied2: ++** sel z0\.s, p0, z1\.s, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sel_s32_tied2, svint32_t, ++ z0 = svsel_s32 (p0, z1, z0), ++ z0 = svsel (p0, z1, z0)) ++ ++/* ++** sel_s32_untied: ++** sel z0\.s, p0, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sel_s32_untied, svint32_t, ++ z0 = svsel_s32 (p0, z1, z2), ++ z0 = svsel (p0, z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_s64.c +new file mode 100644 +index 000000000..85a77eafb +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_s64.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** sel_s64_tied1: ++** sel z0\.d, p0, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (sel_s64_tied1, svint64_t, ++ z0 = svsel_s64 (p0, z0, z1), ++ z0 = svsel (p0, z0, 
z1)) ++ ++/* ++** sel_s64_tied2: ++** sel z0\.d, p0, z1\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (sel_s64_tied2, svint64_t, ++ z0 = svsel_s64 (p0, z1, z0), ++ z0 = svsel (p0, z1, z0)) ++ ++/* ++** sel_s64_untied: ++** sel z0\.d, p0, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (sel_s64_untied, svint64_t, ++ z0 = svsel_s64 (p0, z1, z2), ++ z0 = svsel (p0, z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_s8.c +new file mode 100644 +index 000000000..28c43f627 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_s8.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** sel_s8_tied1: ++** sel z0\.b, p0, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (sel_s8_tied1, svint8_t, ++ z0 = svsel_s8 (p0, z0, z1), ++ z0 = svsel (p0, z0, z1)) ++ ++/* ++** sel_s8_tied2: ++** sel z0\.b, p0, z1\.b, z0\.b ++** ret ++*/ ++TEST_UNIFORM_Z (sel_s8_tied2, svint8_t, ++ z0 = svsel_s8 (p0, z1, z0), ++ z0 = svsel (p0, z1, z0)) ++ ++/* ++** sel_s8_untied: ++** sel z0\.b, p0, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (sel_s8_untied, svint8_t, ++ z0 = svsel_s8 (p0, z1, z2), ++ z0 = svsel (p0, z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_u16.c +new file mode 100644 +index 000000000..b85ede803 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_u16.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** sel_u16_tied1: ++** sel z0\.h, p0, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sel_u16_tied1, svuint16_t, ++ z0 = svsel_u16 (p0, z0, z1), ++ z0 = svsel (p0, z0, z1)) ++ ++/* ++** sel_u16_tied2: ++** sel z0\.h, p0, z1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sel_u16_tied2, svuint16_t, ++ z0 = svsel_u16 (p0, z1, z0), ++ z0 = svsel (p0, z1, z0)) ++ ++/* ++** sel_u16_untied: ++** sel z0\.h, p0, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sel_u16_untied, svuint16_t, ++ z0 = svsel_u16 (p0, z1, z2), ++ z0 = svsel (p0, z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_u32.c +new file mode 100644 +index 000000000..636cf8790 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_u32.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** sel_u32_tied1: ++** sel z0\.s, p0, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sel_u32_tied1, svuint32_t, ++ z0 = svsel_u32 (p0, z0, z1), ++ z0 = svsel (p0, z0, z1)) ++ ++/* ++** sel_u32_tied2: ++** sel z0\.s, p0, z1\.s, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sel_u32_tied2, svuint32_t, ++ z0 = svsel_u32 (p0, z1, z0), ++ z0 = svsel (p0, z1, z0)) ++ ++/* ++** sel_u32_untied: ++** sel z0\.s, p0, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sel_u32_untied, svuint32_t, ++ z0 = svsel_u32 (p0, z1, z2), ++ z0 = svsel (p0, z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_u64.c +new file mode 100644 +index 000000000..6325ca56f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_u64.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** sel_u64_tied1: ++** sel z0\.d, p0, z0\.d, z1\.d ++** ret ++*/ 
++TEST_UNIFORM_Z (sel_u64_tied1, svuint64_t, ++ z0 = svsel_u64 (p0, z0, z1), ++ z0 = svsel (p0, z0, z1)) ++ ++/* ++** sel_u64_tied2: ++** sel z0\.d, p0, z1\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (sel_u64_tied2, svuint64_t, ++ z0 = svsel_u64 (p0, z1, z0), ++ z0 = svsel (p0, z1, z0)) ++ ++/* ++** sel_u64_untied: ++** sel z0\.d, p0, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (sel_u64_untied, svuint64_t, ++ z0 = svsel_u64 (p0, z1, z2), ++ z0 = svsel (p0, z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_u8.c +new file mode 100644 +index 000000000..5af53dccd +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_u8.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** sel_u8_tied1: ++** sel z0\.b, p0, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (sel_u8_tied1, svuint8_t, ++ z0 = svsel_u8 (p0, z0, z1), ++ z0 = svsel (p0, z0, z1)) ++ ++/* ++** sel_u8_tied2: ++** sel z0\.b, p0, z1\.b, z0\.b ++** ret ++*/ ++TEST_UNIFORM_Z (sel_u8_tied2, svuint8_t, ++ z0 = svsel_u8 (p0, z1, z0), ++ z0 = svsel (p0, z1, z0)) ++ ++/* ++** sel_u8_untied: ++** sel z0\.b, p0, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (sel_u8_untied, svuint8_t, ++ z0 = svsel_u8 (p0, z1, z2), ++ z0 = svsel (p0, z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_bf16.c +new file mode 100644 +index 000000000..b160a2517 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_bf16.c +@@ -0,0 +1,41 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** set2_bf16_z24_0: ++** mov z25\.d, z5\.d ++** mov z24\.d, z0\.d ++** ret ++*/ ++TEST_SET (set2_bf16_z24_0, svbfloat16x2_t, svbfloat16_t, ++ z24 = svset2_bf16 (z4, 0, z0), ++ z24 = svset2 (z4, 0, z0)) ++ ++/* ++** set2_bf16_z24_1: ++** mov z24\.d, z4\.d ++** mov z25\.d, z0\.d ++** ret ++*/ ++TEST_SET (set2_bf16_z24_1, svbfloat16x2_t, svbfloat16_t, ++ z24 = svset2_bf16 (z4, 1, z0), ++ z24 = svset2 (z4, 1, z0)) ++ ++/* ++** set2_bf16_z4_0: ++** mov z4\.d, z0\.d ++** ret ++*/ ++TEST_SET (set2_bf16_z4_0, svbfloat16x2_t, svbfloat16_t, ++ z4 = svset2_bf16 (z4, 0, z0), ++ z4 = svset2 (z4, 0, z0)) ++ ++/* ++** set2_bf16_z4_1: ++** mov z5\.d, z0\.d ++** ret ++*/ ++TEST_SET (set2_bf16_z4_1, svbfloat16x2_t, svbfloat16_t, ++ z4 = svset2_bf16 (z4, 1, z0), ++ z4 = svset2 (z4, 1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_f16.c +new file mode 100644 +index 000000000..859600698 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_f16.c +@@ -0,0 +1,41 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** set2_f16_z24_0: ++** mov z25\.d, z5\.d ++** mov z24\.d, z0\.d ++** ret ++*/ ++TEST_SET (set2_f16_z24_0, svfloat16x2_t, svfloat16_t, ++ z24 = svset2_f16 (z4, 0, z0), ++ z24 = svset2 (z4, 0, z0)) ++ ++/* ++** set2_f16_z24_1: ++** mov z24\.d, z4\.d ++** mov z25\.d, z0\.d ++** ret ++*/ ++TEST_SET (set2_f16_z24_1, svfloat16x2_t, svfloat16_t, ++ z24 = svset2_f16 (z4, 1, z0), ++ z24 = svset2 (z4, 1, z0)) ++ ++/* ++** set2_f16_z4_0: ++** mov z4\.d, z0\.d ++** ret ++*/ ++TEST_SET (set2_f16_z4_0, svfloat16x2_t, svfloat16_t, ++ z4 = svset2_f16 (z4, 0, z0), ++ z4 = svset2 (z4, 0, z0)) ++ ++/* ++** set2_f16_z4_1: ++** mov 
z5\.d, z0\.d ++** ret ++*/ ++TEST_SET (set2_f16_z4_1, svfloat16x2_t, svfloat16_t, ++ z4 = svset2_f16 (z4, 1, z0), ++ z4 = svset2 (z4, 1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_f32.c +new file mode 100644 +index 000000000..a95ff2fc5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_f32.c +@@ -0,0 +1,41 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** set2_f32_z24_0: ++** mov z25\.d, z5\.d ++** mov z24\.d, z0\.d ++** ret ++*/ ++TEST_SET (set2_f32_z24_0, svfloat32x2_t, svfloat32_t, ++ z24 = svset2_f32 (z4, 0, z0), ++ z24 = svset2 (z4, 0, z0)) ++ ++/* ++** set2_f32_z24_1: ++** mov z24\.d, z4\.d ++** mov z25\.d, z0\.d ++** ret ++*/ ++TEST_SET (set2_f32_z24_1, svfloat32x2_t, svfloat32_t, ++ z24 = svset2_f32 (z4, 1, z0), ++ z24 = svset2 (z4, 1, z0)) ++ ++/* ++** set2_f32_z4_0: ++** mov z4\.d, z0\.d ++** ret ++*/ ++TEST_SET (set2_f32_z4_0, svfloat32x2_t, svfloat32_t, ++ z4 = svset2_f32 (z4, 0, z0), ++ z4 = svset2 (z4, 0, z0)) ++ ++/* ++** set2_f32_z4_1: ++** mov z5\.d, z0\.d ++** ret ++*/ ++TEST_SET (set2_f32_z4_1, svfloat32x2_t, svfloat32_t, ++ z4 = svset2_f32 (z4, 1, z0), ++ z4 = svset2 (z4, 1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_f64.c +new file mode 100644 +index 000000000..77837b7d8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_f64.c +@@ -0,0 +1,41 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** set2_f64_z24_0: ++** mov z25\.d, z5\.d ++** mov z24\.d, z0\.d ++** ret ++*/ ++TEST_SET (set2_f64_z24_0, svfloat64x2_t, svfloat64_t, ++ z24 = svset2_f64 (z4, 0, z0), ++ z24 = svset2 (z4, 0, z0)) ++ ++/* ++** set2_f64_z24_1: ++** mov z24\.d, z4\.d ++** mov z25\.d, z0\.d ++** ret ++*/ ++TEST_SET (set2_f64_z24_1, svfloat64x2_t, svfloat64_t, ++ z24 = svset2_f64 (z4, 1, z0), ++ z24 = svset2 (z4, 1, z0)) ++ ++/* ++** set2_f64_z4_0: ++** mov z4\.d, z0\.d ++** ret ++*/ ++TEST_SET (set2_f64_z4_0, svfloat64x2_t, svfloat64_t, ++ z4 = svset2_f64 (z4, 0, z0), ++ z4 = svset2 (z4, 0, z0)) ++ ++/* ++** set2_f64_z4_1: ++** mov z5\.d, z0\.d ++** ret ++*/ ++TEST_SET (set2_f64_z4_1, svfloat64x2_t, svfloat64_t, ++ z4 = svset2_f64 (z4, 1, z0), ++ z4 = svset2 (z4, 1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_s16.c +new file mode 100644 +index 000000000..aa2e70fd1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_s16.c +@@ -0,0 +1,41 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** set2_s16_z24_0: ++** mov z25\.d, z5\.d ++** mov z24\.d, z0\.d ++** ret ++*/ ++TEST_SET (set2_s16_z24_0, svint16x2_t, svint16_t, ++ z24 = svset2_s16 (z4, 0, z0), ++ z24 = svset2 (z4, 0, z0)) ++ ++/* ++** set2_s16_z24_1: ++** mov z24\.d, z4\.d ++** mov z25\.d, z0\.d ++** ret ++*/ ++TEST_SET (set2_s16_z24_1, svint16x2_t, svint16_t, ++ z24 = svset2_s16 (z4, 1, z0), ++ z24 = svset2 (z4, 1, z0)) ++ ++/* ++** set2_s16_z4_0: ++** mov z4\.d, z0\.d ++** ret ++*/ ++TEST_SET (set2_s16_z4_0, svint16x2_t, svint16_t, ++ z4 = svset2_s16 (z4, 0, z0), ++ z4 = svset2 (z4, 0, z0)) ++ ++/* ++** set2_s16_z4_1: ++** mov z5\.d, z0\.d ++** ret ++*/ ++TEST_SET (set2_s16_z4_1, svint16x2_t, svint16_t, ++ z4 = svset2_s16 (z4, 1, z0), ++ z4 
= svset2 (z4, 1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_s32.c +new file mode 100644 +index 000000000..3a7c289aa +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_s32.c +@@ -0,0 +1,41 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** set2_s32_z24_0: ++** mov z25\.d, z5\.d ++** mov z24\.d, z0\.d ++** ret ++*/ ++TEST_SET (set2_s32_z24_0, svint32x2_t, svint32_t, ++ z24 = svset2_s32 (z4, 0, z0), ++ z24 = svset2 (z4, 0, z0)) ++ ++/* ++** set2_s32_z24_1: ++** mov z24\.d, z4\.d ++** mov z25\.d, z0\.d ++** ret ++*/ ++TEST_SET (set2_s32_z24_1, svint32x2_t, svint32_t, ++ z24 = svset2_s32 (z4, 1, z0), ++ z24 = svset2 (z4, 1, z0)) ++ ++/* ++** set2_s32_z4_0: ++** mov z4\.d, z0\.d ++** ret ++*/ ++TEST_SET (set2_s32_z4_0, svint32x2_t, svint32_t, ++ z4 = svset2_s32 (z4, 0, z0), ++ z4 = svset2 (z4, 0, z0)) ++ ++/* ++** set2_s32_z4_1: ++** mov z5\.d, z0\.d ++** ret ++*/ ++TEST_SET (set2_s32_z4_1, svint32x2_t, svint32_t, ++ z4 = svset2_s32 (z4, 1, z0), ++ z4 = svset2 (z4, 1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_s64.c +new file mode 100644 +index 000000000..ca6df54d9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_s64.c +@@ -0,0 +1,41 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** set2_s64_z24_0: ++** mov z25\.d, z5\.d ++** mov z24\.d, z0\.d ++** ret ++*/ ++TEST_SET (set2_s64_z24_0, svint64x2_t, svint64_t, ++ z24 = svset2_s64 (z4, 0, z0), ++ z24 = svset2 (z4, 0, z0)) ++ ++/* ++** set2_s64_z24_1: ++** mov z24\.d, z4\.d ++** mov z25\.d, z0\.d ++** ret ++*/ ++TEST_SET (set2_s64_z24_1, svint64x2_t, svint64_t, ++ z24 = svset2_s64 (z4, 1, z0), ++ z24 = svset2 (z4, 1, z0)) ++ ++/* ++** set2_s64_z4_0: ++** mov z4\.d, z0\.d ++** ret ++*/ ++TEST_SET (set2_s64_z4_0, svint64x2_t, svint64_t, ++ z4 = svset2_s64 (z4, 0, z0), ++ z4 = svset2 (z4, 0, z0)) ++ ++/* ++** set2_s64_z4_1: ++** mov z5\.d, z0\.d ++** ret ++*/ ++TEST_SET (set2_s64_z4_1, svint64x2_t, svint64_t, ++ z4 = svset2_s64 (z4, 1, z0), ++ z4 = svset2 (z4, 1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_s8.c +new file mode 100644 +index 000000000..e143128a4 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_s8.c +@@ -0,0 +1,41 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** set2_s8_z24_0: ++** mov z25\.d, z5\.d ++** mov z24\.d, z0\.d ++** ret ++*/ ++TEST_SET (set2_s8_z24_0, svint8x2_t, svint8_t, ++ z24 = svset2_s8 (z4, 0, z0), ++ z24 = svset2 (z4, 0, z0)) ++ ++/* ++** set2_s8_z24_1: ++** mov z24\.d, z4\.d ++** mov z25\.d, z0\.d ++** ret ++*/ ++TEST_SET (set2_s8_z24_1, svint8x2_t, svint8_t, ++ z24 = svset2_s8 (z4, 1, z0), ++ z24 = svset2 (z4, 1, z0)) ++ ++/* ++** set2_s8_z4_0: ++** mov z4\.d, z0\.d ++** ret ++*/ ++TEST_SET (set2_s8_z4_0, svint8x2_t, svint8_t, ++ z4 = svset2_s8 (z4, 0, z0), ++ z4 = svset2 (z4, 0, z0)) ++ ++/* ++** set2_s8_z4_1: ++** mov z5\.d, z0\.d ++** ret ++*/ ++TEST_SET (set2_s8_z4_1, svint8x2_t, svint8_t, ++ z4 = svset2_s8 (z4, 1, z0), ++ z4 = svset2 (z4, 1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_u16.c +new file mode 100644 
+index 000000000..53da08398 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_u16.c +@@ -0,0 +1,41 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** set2_u16_z24_0: ++** mov z25\.d, z5\.d ++** mov z24\.d, z0\.d ++** ret ++*/ ++TEST_SET (set2_u16_z24_0, svuint16x2_t, svuint16_t, ++ z24 = svset2_u16 (z4, 0, z0), ++ z24 = svset2 (z4, 0, z0)) ++ ++/* ++** set2_u16_z24_1: ++** mov z24\.d, z4\.d ++** mov z25\.d, z0\.d ++** ret ++*/ ++TEST_SET (set2_u16_z24_1, svuint16x2_t, svuint16_t, ++ z24 = svset2_u16 (z4, 1, z0), ++ z24 = svset2 (z4, 1, z0)) ++ ++/* ++** set2_u16_z4_0: ++** mov z4\.d, z0\.d ++** ret ++*/ ++TEST_SET (set2_u16_z4_0, svuint16x2_t, svuint16_t, ++ z4 = svset2_u16 (z4, 0, z0), ++ z4 = svset2 (z4, 0, z0)) ++ ++/* ++** set2_u16_z4_1: ++** mov z5\.d, z0\.d ++** ret ++*/ ++TEST_SET (set2_u16_z4_1, svuint16x2_t, svuint16_t, ++ z4 = svset2_u16 (z4, 1, z0), ++ z4 = svset2 (z4, 1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_u32.c +new file mode 100644 +index 000000000..5266a62d8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_u32.c +@@ -0,0 +1,41 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** set2_u32_z24_0: ++** mov z25\.d, z5\.d ++** mov z24\.d, z0\.d ++** ret ++*/ ++TEST_SET (set2_u32_z24_0, svuint32x2_t, svuint32_t, ++ z24 = svset2_u32 (z4, 0, z0), ++ z24 = svset2 (z4, 0, z0)) ++ ++/* ++** set2_u32_z24_1: ++** mov z24\.d, z4\.d ++** mov z25\.d, z0\.d ++** ret ++*/ ++TEST_SET (set2_u32_z24_1, svuint32x2_t, svuint32_t, ++ z24 = svset2_u32 (z4, 1, z0), ++ z24 = svset2 (z4, 1, z0)) ++ ++/* ++** set2_u32_z4_0: ++** mov z4\.d, z0\.d ++** ret ++*/ ++TEST_SET (set2_u32_z4_0, svuint32x2_t, svuint32_t, ++ z4 = svset2_u32 (z4, 0, z0), ++ z4 = svset2 (z4, 0, z0)) ++ ++/* ++** set2_u32_z4_1: ++** mov z5\.d, z0\.d ++** ret ++*/ ++TEST_SET (set2_u32_z4_1, svuint32x2_t, svuint32_t, ++ z4 = svset2_u32 (z4, 1, z0), ++ z4 = svset2 (z4, 1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_u64.c +new file mode 100644 +index 000000000..f7d2a1807 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_u64.c +@@ -0,0 +1,41 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** set2_u64_z24_0: ++** mov z25\.d, z5\.d ++** mov z24\.d, z0\.d ++** ret ++*/ ++TEST_SET (set2_u64_z24_0, svuint64x2_t, svuint64_t, ++ z24 = svset2_u64 (z4, 0, z0), ++ z24 = svset2 (z4, 0, z0)) ++ ++/* ++** set2_u64_z24_1: ++** mov z24\.d, z4\.d ++** mov z25\.d, z0\.d ++** ret ++*/ ++TEST_SET (set2_u64_z24_1, svuint64x2_t, svuint64_t, ++ z24 = svset2_u64 (z4, 1, z0), ++ z24 = svset2 (z4, 1, z0)) ++ ++/* ++** set2_u64_z4_0: ++** mov z4\.d, z0\.d ++** ret ++*/ ++TEST_SET (set2_u64_z4_0, svuint64x2_t, svuint64_t, ++ z4 = svset2_u64 (z4, 0, z0), ++ z4 = svset2 (z4, 0, z0)) ++ ++/* ++** set2_u64_z4_1: ++** mov z5\.d, z0\.d ++** ret ++*/ ++TEST_SET (set2_u64_z4_1, svuint64x2_t, svuint64_t, ++ z4 = svset2_u64 (z4, 1, z0), ++ z4 = svset2 (z4, 1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_u8.c +new file mode 100644 +index 000000000..9494a0e54 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_u8.c +@@ -0,0 +1,41 @@ ++/* { 
dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** set2_u8_z24_0: ++** mov z25\.d, z5\.d ++** mov z24\.d, z0\.d ++** ret ++*/ ++TEST_SET (set2_u8_z24_0, svuint8x2_t, svuint8_t, ++ z24 = svset2_u8 (z4, 0, z0), ++ z24 = svset2 (z4, 0, z0)) ++ ++/* ++** set2_u8_z24_1: ++** mov z24\.d, z4\.d ++** mov z25\.d, z0\.d ++** ret ++*/ ++TEST_SET (set2_u8_z24_1, svuint8x2_t, svuint8_t, ++ z24 = svset2_u8 (z4, 1, z0), ++ z24 = svset2 (z4, 1, z0)) ++ ++/* ++** set2_u8_z4_0: ++** mov z4\.d, z0\.d ++** ret ++*/ ++TEST_SET (set2_u8_z4_0, svuint8x2_t, svuint8_t, ++ z4 = svset2_u8 (z4, 0, z0), ++ z4 = svset2 (z4, 0, z0)) ++ ++/* ++** set2_u8_z4_1: ++** mov z5\.d, z0\.d ++** ret ++*/ ++TEST_SET (set2_u8_z4_1, svuint8x2_t, svuint8_t, ++ z4 = svset2_u8 (z4, 1, z0), ++ z4 = svset2 (z4, 1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_bf16.c +new file mode 100644 +index 000000000..4e0707d09 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_bf16.c +@@ -0,0 +1,63 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** set3_bf16_z24_0: ++** mov z25\.d, z5\.d ++** mov z26\.d, z6\.d ++** mov z24\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_bf16_z24_0, svbfloat16x3_t, svbfloat16_t, ++ z24 = svset3_bf16 (z4, 0, z0), ++ z24 = svset3 (z4, 0, z0)) ++ ++/* ++** set3_bf16_z24_1: ++** mov z24\.d, z4\.d ++** mov z26\.d, z6\.d ++** mov z25\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_bf16_z24_1, svbfloat16x3_t, svbfloat16_t, ++ z24 = svset3_bf16 (z4, 1, z0), ++ z24 = svset3 (z4, 1, z0)) ++ ++/* ++** set3_bf16_z24_2: ++** mov z24\.d, z4\.d ++** mov z25\.d, z5\.d ++** mov z26\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_bf16_z24_2, svbfloat16x3_t, svbfloat16_t, ++ z24 = svset3_bf16 (z4, 2, z0), ++ z24 = svset3 (z4, 2, z0)) ++ ++/* ++** set3_bf16_z4_0: ++** mov z4\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_bf16_z4_0, svbfloat16x3_t, svbfloat16_t, ++ z4 = svset3_bf16 (z4, 0, z0), ++ z4 = svset3 (z4, 0, z0)) ++ ++/* ++** set3_bf16_z4_1: ++** mov z5\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_bf16_z4_1, svbfloat16x3_t, svbfloat16_t, ++ z4 = svset3_bf16 (z4, 1, z0), ++ z4 = svset3 (z4, 1, z0)) ++ ++/* ++** set3_bf16_z4_2: ++** mov z6\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_bf16_z4_2, svbfloat16x3_t, svbfloat16_t, ++ z4 = svset3_bf16 (z4, 2, z0), ++ z4 = svset3 (z4, 2, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_f16.c +new file mode 100644 +index 000000000..b6bb3a2bf +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_f16.c +@@ -0,0 +1,63 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** set3_f16_z24_0: ++** mov z25\.d, z5\.d ++** mov z26\.d, z6\.d ++** mov z24\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_f16_z24_0, svfloat16x3_t, svfloat16_t, ++ z24 = svset3_f16 (z4, 0, z0), ++ z24 = svset3 (z4, 0, z0)) ++ ++/* ++** set3_f16_z24_1: ++** mov z24\.d, z4\.d ++** mov z26\.d, z6\.d ++** mov z25\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_f16_z24_1, svfloat16x3_t, svfloat16_t, ++ z24 = svset3_f16 (z4, 1, z0), ++ z24 = svset3 (z4, 1, z0)) ++ ++/* ++** set3_f16_z24_2: ++** mov z24\.d, z4\.d ++** mov z25\.d, z5\.d ++** mov z26\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_f16_z24_2, svfloat16x3_t, svfloat16_t, ++ z24 = svset3_f16 (z4, 2, z0), ++ z24 = svset3 (z4, 2, z0)) ++ ++/* 
++** set3_f16_z4_0: ++** mov z4\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_f16_z4_0, svfloat16x3_t, svfloat16_t, ++ z4 = svset3_f16 (z4, 0, z0), ++ z4 = svset3 (z4, 0, z0)) ++ ++/* ++** set3_f16_z4_1: ++** mov z5\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_f16_z4_1, svfloat16x3_t, svfloat16_t, ++ z4 = svset3_f16 (z4, 1, z0), ++ z4 = svset3 (z4, 1, z0)) ++ ++/* ++** set3_f16_z4_2: ++** mov z6\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_f16_z4_2, svfloat16x3_t, svfloat16_t, ++ z4 = svset3_f16 (z4, 2, z0), ++ z4 = svset3 (z4, 2, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_f32.c +new file mode 100644 +index 000000000..659bc713f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_f32.c +@@ -0,0 +1,63 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** set3_f32_z24_0: ++** mov z25\.d, z5\.d ++** mov z26\.d, z6\.d ++** mov z24\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_f32_z24_0, svfloat32x3_t, svfloat32_t, ++ z24 = svset3_f32 (z4, 0, z0), ++ z24 = svset3 (z4, 0, z0)) ++ ++/* ++** set3_f32_z24_1: ++** mov z24\.d, z4\.d ++** mov z26\.d, z6\.d ++** mov z25\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_f32_z24_1, svfloat32x3_t, svfloat32_t, ++ z24 = svset3_f32 (z4, 1, z0), ++ z24 = svset3 (z4, 1, z0)) ++ ++/* ++** set3_f32_z24_2: ++** mov z24\.d, z4\.d ++** mov z25\.d, z5\.d ++** mov z26\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_f32_z24_2, svfloat32x3_t, svfloat32_t, ++ z24 = svset3_f32 (z4, 2, z0), ++ z24 = svset3 (z4, 2, z0)) ++ ++/* ++** set3_f32_z4_0: ++** mov z4\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_f32_z4_0, svfloat32x3_t, svfloat32_t, ++ z4 = svset3_f32 (z4, 0, z0), ++ z4 = svset3 (z4, 0, z0)) ++ ++/* ++** set3_f32_z4_1: ++** mov z5\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_f32_z4_1, svfloat32x3_t, svfloat32_t, ++ z4 = svset3_f32 (z4, 1, z0), ++ z4 = svset3 (z4, 1, z0)) ++ ++/* ++** set3_f32_z4_2: ++** mov z6\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_f32_z4_2, svfloat32x3_t, svfloat32_t, ++ z4 = svset3_f32 (z4, 2, z0), ++ z4 = svset3 (z4, 2, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_f64.c +new file mode 100644 +index 000000000..2cf3b6015 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_f64.c +@@ -0,0 +1,63 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** set3_f64_z24_0: ++** mov z25\.d, z5\.d ++** mov z26\.d, z6\.d ++** mov z24\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_f64_z24_0, svfloat64x3_t, svfloat64_t, ++ z24 = svset3_f64 (z4, 0, z0), ++ z24 = svset3 (z4, 0, z0)) ++ ++/* ++** set3_f64_z24_1: ++** mov z24\.d, z4\.d ++** mov z26\.d, z6\.d ++** mov z25\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_f64_z24_1, svfloat64x3_t, svfloat64_t, ++ z24 = svset3_f64 (z4, 1, z0), ++ z24 = svset3 (z4, 1, z0)) ++ ++/* ++** set3_f64_z24_2: ++** mov z24\.d, z4\.d ++** mov z25\.d, z5\.d ++** mov z26\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_f64_z24_2, svfloat64x3_t, svfloat64_t, ++ z24 = svset3_f64 (z4, 2, z0), ++ z24 = svset3 (z4, 2, z0)) ++ ++/* ++** set3_f64_z4_0: ++** mov z4\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_f64_z4_0, svfloat64x3_t, svfloat64_t, ++ z4 = svset3_f64 (z4, 0, z0), ++ z4 = svset3 (z4, 0, z0)) ++ ++/* ++** set3_f64_z4_1: ++** mov z5\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_f64_z4_1, svfloat64x3_t, svfloat64_t, ++ z4 = svset3_f64 (z4, 1, z0), ++ z4 = svset3 (z4, 1, 
z0)) ++ ++/* ++** set3_f64_z4_2: ++** mov z6\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_f64_z4_2, svfloat64x3_t, svfloat64_t, ++ z4 = svset3_f64 (z4, 2, z0), ++ z4 = svset3 (z4, 2, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_s16.c +new file mode 100644 +index 000000000..907ae9894 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_s16.c +@@ -0,0 +1,63 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** set3_s16_z24_0: ++** mov z25\.d, z5\.d ++** mov z26\.d, z6\.d ++** mov z24\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_s16_z24_0, svint16x3_t, svint16_t, ++ z24 = svset3_s16 (z4, 0, z0), ++ z24 = svset3 (z4, 0, z0)) ++ ++/* ++** set3_s16_z24_1: ++** mov z24\.d, z4\.d ++** mov z26\.d, z6\.d ++** mov z25\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_s16_z24_1, svint16x3_t, svint16_t, ++ z24 = svset3_s16 (z4, 1, z0), ++ z24 = svset3 (z4, 1, z0)) ++ ++/* ++** set3_s16_z24_2: ++** mov z24\.d, z4\.d ++** mov z25\.d, z5\.d ++** mov z26\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_s16_z24_2, svint16x3_t, svint16_t, ++ z24 = svset3_s16 (z4, 2, z0), ++ z24 = svset3 (z4, 2, z0)) ++ ++/* ++** set3_s16_z4_0: ++** mov z4\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_s16_z4_0, svint16x3_t, svint16_t, ++ z4 = svset3_s16 (z4, 0, z0), ++ z4 = svset3 (z4, 0, z0)) ++ ++/* ++** set3_s16_z4_1: ++** mov z5\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_s16_z4_1, svint16x3_t, svint16_t, ++ z4 = svset3_s16 (z4, 1, z0), ++ z4 = svset3 (z4, 1, z0)) ++ ++/* ++** set3_s16_z4_2: ++** mov z6\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_s16_z4_2, svint16x3_t, svint16_t, ++ z4 = svset3_s16 (z4, 2, z0), ++ z4 = svset3 (z4, 2, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_s32.c +new file mode 100644 +index 000000000..0baa33c3a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_s32.c +@@ -0,0 +1,63 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** set3_s32_z24_0: ++** mov z25\.d, z5\.d ++** mov z26\.d, z6\.d ++** mov z24\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_s32_z24_0, svint32x3_t, svint32_t, ++ z24 = svset3_s32 (z4, 0, z0), ++ z24 = svset3 (z4, 0, z0)) ++ ++/* ++** set3_s32_z24_1: ++** mov z24\.d, z4\.d ++** mov z26\.d, z6\.d ++** mov z25\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_s32_z24_1, svint32x3_t, svint32_t, ++ z24 = svset3_s32 (z4, 1, z0), ++ z24 = svset3 (z4, 1, z0)) ++ ++/* ++** set3_s32_z24_2: ++** mov z24\.d, z4\.d ++** mov z25\.d, z5\.d ++** mov z26\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_s32_z24_2, svint32x3_t, svint32_t, ++ z24 = svset3_s32 (z4, 2, z0), ++ z24 = svset3 (z4, 2, z0)) ++ ++/* ++** set3_s32_z4_0: ++** mov z4\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_s32_z4_0, svint32x3_t, svint32_t, ++ z4 = svset3_s32 (z4, 0, z0), ++ z4 = svset3 (z4, 0, z0)) ++ ++/* ++** set3_s32_z4_1: ++** mov z5\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_s32_z4_1, svint32x3_t, svint32_t, ++ z4 = svset3_s32 (z4, 1, z0), ++ z4 = svset3 (z4, 1, z0)) ++ ++/* ++** set3_s32_z4_2: ++** mov z6\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_s32_z4_2, svint32x3_t, svint32_t, ++ z4 = svset3_s32 (z4, 2, z0), ++ z4 = svset3 (z4, 2, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_s64.c +new file mode 100644 +index 000000000..d1d142c71 +--- /dev/null ++++ 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_s64.c +@@ -0,0 +1,63 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** set3_s64_z24_0: ++** mov z25\.d, z5\.d ++** mov z26\.d, z6\.d ++** mov z24\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_s64_z24_0, svint64x3_t, svint64_t, ++ z24 = svset3_s64 (z4, 0, z0), ++ z24 = svset3 (z4, 0, z0)) ++ ++/* ++** set3_s64_z24_1: ++** mov z24\.d, z4\.d ++** mov z26\.d, z6\.d ++** mov z25\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_s64_z24_1, svint64x3_t, svint64_t, ++ z24 = svset3_s64 (z4, 1, z0), ++ z24 = svset3 (z4, 1, z0)) ++ ++/* ++** set3_s64_z24_2: ++** mov z24\.d, z4\.d ++** mov z25\.d, z5\.d ++** mov z26\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_s64_z24_2, svint64x3_t, svint64_t, ++ z24 = svset3_s64 (z4, 2, z0), ++ z24 = svset3 (z4, 2, z0)) ++ ++/* ++** set3_s64_z4_0: ++** mov z4\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_s64_z4_0, svint64x3_t, svint64_t, ++ z4 = svset3_s64 (z4, 0, z0), ++ z4 = svset3 (z4, 0, z0)) ++ ++/* ++** set3_s64_z4_1: ++** mov z5\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_s64_z4_1, svint64x3_t, svint64_t, ++ z4 = svset3_s64 (z4, 1, z0), ++ z4 = svset3 (z4, 1, z0)) ++ ++/* ++** set3_s64_z4_2: ++** mov z6\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_s64_z4_2, svint64x3_t, svint64_t, ++ z4 = svset3_s64 (z4, 2, z0), ++ z4 = svset3 (z4, 2, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_s8.c +new file mode 100644 +index 000000000..8badf4b1d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_s8.c +@@ -0,0 +1,63 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** set3_s8_z24_0: ++** mov z25\.d, z5\.d ++** mov z26\.d, z6\.d ++** mov z24\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_s8_z24_0, svint8x3_t, svint8_t, ++ z24 = svset3_s8 (z4, 0, z0), ++ z24 = svset3 (z4, 0, z0)) ++ ++/* ++** set3_s8_z24_1: ++** mov z24\.d, z4\.d ++** mov z26\.d, z6\.d ++** mov z25\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_s8_z24_1, svint8x3_t, svint8_t, ++ z24 = svset3_s8 (z4, 1, z0), ++ z24 = svset3 (z4, 1, z0)) ++ ++/* ++** set3_s8_z24_2: ++** mov z24\.d, z4\.d ++** mov z25\.d, z5\.d ++** mov z26\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_s8_z24_2, svint8x3_t, svint8_t, ++ z24 = svset3_s8 (z4, 2, z0), ++ z24 = svset3 (z4, 2, z0)) ++ ++/* ++** set3_s8_z4_0: ++** mov z4\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_s8_z4_0, svint8x3_t, svint8_t, ++ z4 = svset3_s8 (z4, 0, z0), ++ z4 = svset3 (z4, 0, z0)) ++ ++/* ++** set3_s8_z4_1: ++** mov z5\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_s8_z4_1, svint8x3_t, svint8_t, ++ z4 = svset3_s8 (z4, 1, z0), ++ z4 = svset3 (z4, 1, z0)) ++ ++/* ++** set3_s8_z4_2: ++** mov z6\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_s8_z4_2, svint8x3_t, svint8_t, ++ z4 = svset3_s8 (z4, 2, z0), ++ z4 = svset3 (z4, 2, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_u16.c +new file mode 100644 +index 000000000..df7ce88d8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_u16.c +@@ -0,0 +1,63 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** set3_u16_z24_0: ++** mov z25\.d, z5\.d ++** mov z26\.d, z6\.d ++** mov z24\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_u16_z24_0, svuint16x3_t, svuint16_t, ++ z24 = svset3_u16 (z4, 0, z0), ++ z24 = svset3 (z4, 0, z0)) ++ ++/* ++** 
set3_u16_z24_1: ++** mov z24\.d, z4\.d ++** mov z26\.d, z6\.d ++** mov z25\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_u16_z24_1, svuint16x3_t, svuint16_t, ++ z24 = svset3_u16 (z4, 1, z0), ++ z24 = svset3 (z4, 1, z0)) ++ ++/* ++** set3_u16_z24_2: ++** mov z24\.d, z4\.d ++** mov z25\.d, z5\.d ++** mov z26\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_u16_z24_2, svuint16x3_t, svuint16_t, ++ z24 = svset3_u16 (z4, 2, z0), ++ z24 = svset3 (z4, 2, z0)) ++ ++/* ++** set3_u16_z4_0: ++** mov z4\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_u16_z4_0, svuint16x3_t, svuint16_t, ++ z4 = svset3_u16 (z4, 0, z0), ++ z4 = svset3 (z4, 0, z0)) ++ ++/* ++** set3_u16_z4_1: ++** mov z5\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_u16_z4_1, svuint16x3_t, svuint16_t, ++ z4 = svset3_u16 (z4, 1, z0), ++ z4 = svset3 (z4, 1, z0)) ++ ++/* ++** set3_u16_z4_2: ++** mov z6\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_u16_z4_2, svuint16x3_t, svuint16_t, ++ z4 = svset3_u16 (z4, 2, z0), ++ z4 = svset3 (z4, 2, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_u32.c +new file mode 100644 +index 000000000..703a68f5c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_u32.c +@@ -0,0 +1,63 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** set3_u32_z24_0: ++** mov z25\.d, z5\.d ++** mov z26\.d, z6\.d ++** mov z24\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_u32_z24_0, svuint32x3_t, svuint32_t, ++ z24 = svset3_u32 (z4, 0, z0), ++ z24 = svset3 (z4, 0, z0)) ++ ++/* ++** set3_u32_z24_1: ++** mov z24\.d, z4\.d ++** mov z26\.d, z6\.d ++** mov z25\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_u32_z24_1, svuint32x3_t, svuint32_t, ++ z24 = svset3_u32 (z4, 1, z0), ++ z24 = svset3 (z4, 1, z0)) ++ ++/* ++** set3_u32_z24_2: ++** mov z24\.d, z4\.d ++** mov z25\.d, z5\.d ++** mov z26\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_u32_z24_2, svuint32x3_t, svuint32_t, ++ z24 = svset3_u32 (z4, 2, z0), ++ z24 = svset3 (z4, 2, z0)) ++ ++/* ++** set3_u32_z4_0: ++** mov z4\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_u32_z4_0, svuint32x3_t, svuint32_t, ++ z4 = svset3_u32 (z4, 0, z0), ++ z4 = svset3 (z4, 0, z0)) ++ ++/* ++** set3_u32_z4_1: ++** mov z5\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_u32_z4_1, svuint32x3_t, svuint32_t, ++ z4 = svset3_u32 (z4, 1, z0), ++ z4 = svset3 (z4, 1, z0)) ++ ++/* ++** set3_u32_z4_2: ++** mov z6\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_u32_z4_2, svuint32x3_t, svuint32_t, ++ z4 = svset3_u32 (z4, 2, z0), ++ z4 = svset3 (z4, 2, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_u64.c +new file mode 100644 +index 000000000..bff5b3539 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_u64.c +@@ -0,0 +1,63 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** set3_u64_z24_0: ++** mov z25\.d, z5\.d ++** mov z26\.d, z6\.d ++** mov z24\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_u64_z24_0, svuint64x3_t, svuint64_t, ++ z24 = svset3_u64 (z4, 0, z0), ++ z24 = svset3 (z4, 0, z0)) ++ ++/* ++** set3_u64_z24_1: ++** mov z24\.d, z4\.d ++** mov z26\.d, z6\.d ++** mov z25\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_u64_z24_1, svuint64x3_t, svuint64_t, ++ z24 = svset3_u64 (z4, 1, z0), ++ z24 = svset3 (z4, 1, z0)) ++ ++/* ++** set3_u64_z24_2: ++** mov z24\.d, z4\.d ++** mov z25\.d, z5\.d ++** mov z26\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_u64_z24_2, svuint64x3_t, 
svuint64_t, ++ z24 = svset3_u64 (z4, 2, z0), ++ z24 = svset3 (z4, 2, z0)) ++ ++/* ++** set3_u64_z4_0: ++** mov z4\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_u64_z4_0, svuint64x3_t, svuint64_t, ++ z4 = svset3_u64 (z4, 0, z0), ++ z4 = svset3 (z4, 0, z0)) ++ ++/* ++** set3_u64_z4_1: ++** mov z5\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_u64_z4_1, svuint64x3_t, svuint64_t, ++ z4 = svset3_u64 (z4, 1, z0), ++ z4 = svset3 (z4, 1, z0)) ++ ++/* ++** set3_u64_z4_2: ++** mov z6\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_u64_z4_2, svuint64x3_t, svuint64_t, ++ z4 = svset3_u64 (z4, 2, z0), ++ z4 = svset3 (z4, 2, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_u8.c +new file mode 100644 +index 000000000..9f40001c4 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_u8.c +@@ -0,0 +1,63 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** set3_u8_z24_0: ++** mov z25\.d, z5\.d ++** mov z26\.d, z6\.d ++** mov z24\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_u8_z24_0, svuint8x3_t, svuint8_t, ++ z24 = svset3_u8 (z4, 0, z0), ++ z24 = svset3 (z4, 0, z0)) ++ ++/* ++** set3_u8_z24_1: ++** mov z24\.d, z4\.d ++** mov z26\.d, z6\.d ++** mov z25\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_u8_z24_1, svuint8x3_t, svuint8_t, ++ z24 = svset3_u8 (z4, 1, z0), ++ z24 = svset3 (z4, 1, z0)) ++ ++/* ++** set3_u8_z24_2: ++** mov z24\.d, z4\.d ++** mov z25\.d, z5\.d ++** mov z26\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_u8_z24_2, svuint8x3_t, svuint8_t, ++ z24 = svset3_u8 (z4, 2, z0), ++ z24 = svset3 (z4, 2, z0)) ++ ++/* ++** set3_u8_z4_0: ++** mov z4\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_u8_z4_0, svuint8x3_t, svuint8_t, ++ z4 = svset3_u8 (z4, 0, z0), ++ z4 = svset3 (z4, 0, z0)) ++ ++/* ++** set3_u8_z4_1: ++** mov z5\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_u8_z4_1, svuint8x3_t, svuint8_t, ++ z4 = svset3_u8 (z4, 1, z0), ++ z4 = svset3 (z4, 1, z0)) ++ ++/* ++** set3_u8_z4_2: ++** mov z6\.d, z0\.d ++** ret ++*/ ++TEST_SET (set3_u8_z4_2, svuint8x3_t, svuint8_t, ++ z4 = svset3_u8 (z4, 2, z0), ++ z4 = svset3 (z4, 2, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_bf16.c +new file mode 100644 +index 000000000..4e26c1117 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_bf16.c +@@ -0,0 +1,87 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** set4_bf16_z24_0: ++** mov z25\.d, z5\.d ++** mov z26\.d, z6\.d ++** mov z27\.d, z7\.d ++** mov z24\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_bf16_z24_0, svbfloat16x4_t, svbfloat16_t, ++ z24 = svset4_bf16 (z4, 0, z0), ++ z24 = svset4 (z4, 0, z0)) ++ ++/* ++** set4_bf16_z24_1: ++** mov z24\.d, z4\.d ++** mov z26\.d, z6\.d ++** mov z27\.d, z7\.d ++** mov z25\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_bf16_z24_1, svbfloat16x4_t, svbfloat16_t, ++ z24 = svset4_bf16 (z4, 1, z0), ++ z24 = svset4 (z4, 1, z0)) ++ ++/* ++** set4_bf16_z24_2: ++** mov z24\.d, z4\.d ++** mov z25\.d, z5\.d ++** mov z27\.d, z7\.d ++** mov z26\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_bf16_z24_2, svbfloat16x4_t, svbfloat16_t, ++ z24 = svset4_bf16 (z4, 2, z0), ++ z24 = svset4 (z4, 2, z0)) ++ ++/* ++** set4_bf16_z24_3: ++** mov z24\.d, z4\.d ++** mov z25\.d, z5\.d ++** mov z26\.d, z6\.d ++** mov z27\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_bf16_z24_3, svbfloat16x4_t, svbfloat16_t, ++ z24 = svset4_bf16 (z4, 3, z0), ++ 
z24 = svset4 (z4, 3, z0)) ++ ++/* ++** set4_bf16_z4_0: ++** mov z4\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_bf16_z4_0, svbfloat16x4_t, svbfloat16_t, ++ z4 = svset4_bf16 (z4, 0, z0), ++ z4 = svset4 (z4, 0, z0)) ++ ++/* ++** set4_bf16_z4_1: ++** mov z5\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_bf16_z4_1, svbfloat16x4_t, svbfloat16_t, ++ z4 = svset4_bf16 (z4, 1, z0), ++ z4 = svset4 (z4, 1, z0)) ++ ++/* ++** set4_bf16_z4_2: ++** mov z6\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_bf16_z4_2, svbfloat16x4_t, svbfloat16_t, ++ z4 = svset4_bf16 (z4, 2, z0), ++ z4 = svset4 (z4, 2, z0)) ++ ++/* ++** set4_bf16_z4_3: ++** mov z7\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_bf16_z4_3, svbfloat16x4_t, svbfloat16_t, ++ z4 = svset4_bf16 (z4, 3, z0), ++ z4 = svset4 (z4, 3, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_f16.c +new file mode 100644 +index 000000000..a28ff9ca6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_f16.c +@@ -0,0 +1,87 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** set4_f16_z24_0: ++** mov z25\.d, z5\.d ++** mov z26\.d, z6\.d ++** mov z27\.d, z7\.d ++** mov z24\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_f16_z24_0, svfloat16x4_t, svfloat16_t, ++ z24 = svset4_f16 (z4, 0, z0), ++ z24 = svset4 (z4, 0, z0)) ++ ++/* ++** set4_f16_z24_1: ++** mov z24\.d, z4\.d ++** mov z26\.d, z6\.d ++** mov z27\.d, z7\.d ++** mov z25\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_f16_z24_1, svfloat16x4_t, svfloat16_t, ++ z24 = svset4_f16 (z4, 1, z0), ++ z24 = svset4 (z4, 1, z0)) ++ ++/* ++** set4_f16_z24_2: ++** mov z24\.d, z4\.d ++** mov z25\.d, z5\.d ++** mov z27\.d, z7\.d ++** mov z26\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_f16_z24_2, svfloat16x4_t, svfloat16_t, ++ z24 = svset4_f16 (z4, 2, z0), ++ z24 = svset4 (z4, 2, z0)) ++ ++/* ++** set4_f16_z24_3: ++** mov z24\.d, z4\.d ++** mov z25\.d, z5\.d ++** mov z26\.d, z6\.d ++** mov z27\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_f16_z24_3, svfloat16x4_t, svfloat16_t, ++ z24 = svset4_f16 (z4, 3, z0), ++ z24 = svset4 (z4, 3, z0)) ++ ++/* ++** set4_f16_z4_0: ++** mov z4\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_f16_z4_0, svfloat16x4_t, svfloat16_t, ++ z4 = svset4_f16 (z4, 0, z0), ++ z4 = svset4 (z4, 0, z0)) ++ ++/* ++** set4_f16_z4_1: ++** mov z5\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_f16_z4_1, svfloat16x4_t, svfloat16_t, ++ z4 = svset4_f16 (z4, 1, z0), ++ z4 = svset4 (z4, 1, z0)) ++ ++/* ++** set4_f16_z4_2: ++** mov z6\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_f16_z4_2, svfloat16x4_t, svfloat16_t, ++ z4 = svset4_f16 (z4, 2, z0), ++ z4 = svset4 (z4, 2, z0)) ++ ++/* ++** set4_f16_z4_3: ++** mov z7\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_f16_z4_3, svfloat16x4_t, svfloat16_t, ++ z4 = svset4_f16 (z4, 3, z0), ++ z4 = svset4 (z4, 3, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_f32.c +new file mode 100644 +index 000000000..e6e3f5ebd +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_f32.c +@@ -0,0 +1,87 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** set4_f32_z24_0: ++** mov z25\.d, z5\.d ++** mov z26\.d, z6\.d ++** mov z27\.d, z7\.d ++** mov z24\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_f32_z24_0, svfloat32x4_t, svfloat32_t, ++ z24 = svset4_f32 (z4, 0, z0), ++ z24 = svset4 (z4, 0, z0)) ++ ++/* ++** set4_f32_z24_1: ++** mov z24\.d, z4\.d 
++** mov z26\.d, z6\.d ++** mov z27\.d, z7\.d ++** mov z25\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_f32_z24_1, svfloat32x4_t, svfloat32_t, ++ z24 = svset4_f32 (z4, 1, z0), ++ z24 = svset4 (z4, 1, z0)) ++ ++/* ++** set4_f32_z24_2: ++** mov z24\.d, z4\.d ++** mov z25\.d, z5\.d ++** mov z27\.d, z7\.d ++** mov z26\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_f32_z24_2, svfloat32x4_t, svfloat32_t, ++ z24 = svset4_f32 (z4, 2, z0), ++ z24 = svset4 (z4, 2, z0)) ++ ++/* ++** set4_f32_z24_3: ++** mov z24\.d, z4\.d ++** mov z25\.d, z5\.d ++** mov z26\.d, z6\.d ++** mov z27\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_f32_z24_3, svfloat32x4_t, svfloat32_t, ++ z24 = svset4_f32 (z4, 3, z0), ++ z24 = svset4 (z4, 3, z0)) ++ ++/* ++** set4_f32_z4_0: ++** mov z4\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_f32_z4_0, svfloat32x4_t, svfloat32_t, ++ z4 = svset4_f32 (z4, 0, z0), ++ z4 = svset4 (z4, 0, z0)) ++ ++/* ++** set4_f32_z4_1: ++** mov z5\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_f32_z4_1, svfloat32x4_t, svfloat32_t, ++ z4 = svset4_f32 (z4, 1, z0), ++ z4 = svset4 (z4, 1, z0)) ++ ++/* ++** set4_f32_z4_2: ++** mov z6\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_f32_z4_2, svfloat32x4_t, svfloat32_t, ++ z4 = svset4_f32 (z4, 2, z0), ++ z4 = svset4 (z4, 2, z0)) ++ ++/* ++** set4_f32_z4_3: ++** mov z7\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_f32_z4_3, svfloat32x4_t, svfloat32_t, ++ z4 = svset4_f32 (z4, 3, z0), ++ z4 = svset4 (z4, 3, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_f64.c +new file mode 100644 +index 000000000..3ceaa459a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_f64.c +@@ -0,0 +1,87 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** set4_f64_z24_0: ++** mov z25\.d, z5\.d ++** mov z26\.d, z6\.d ++** mov z27\.d, z7\.d ++** mov z24\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_f64_z24_0, svfloat64x4_t, svfloat64_t, ++ z24 = svset4_f64 (z4, 0, z0), ++ z24 = svset4 (z4, 0, z0)) ++ ++/* ++** set4_f64_z24_1: ++** mov z24\.d, z4\.d ++** mov z26\.d, z6\.d ++** mov z27\.d, z7\.d ++** mov z25\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_f64_z24_1, svfloat64x4_t, svfloat64_t, ++ z24 = svset4_f64 (z4, 1, z0), ++ z24 = svset4 (z4, 1, z0)) ++ ++/* ++** set4_f64_z24_2: ++** mov z24\.d, z4\.d ++** mov z25\.d, z5\.d ++** mov z27\.d, z7\.d ++** mov z26\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_f64_z24_2, svfloat64x4_t, svfloat64_t, ++ z24 = svset4_f64 (z4, 2, z0), ++ z24 = svset4 (z4, 2, z0)) ++ ++/* ++** set4_f64_z24_3: ++** mov z24\.d, z4\.d ++** mov z25\.d, z5\.d ++** mov z26\.d, z6\.d ++** mov z27\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_f64_z24_3, svfloat64x4_t, svfloat64_t, ++ z24 = svset4_f64 (z4, 3, z0), ++ z24 = svset4 (z4, 3, z0)) ++ ++/* ++** set4_f64_z4_0: ++** mov z4\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_f64_z4_0, svfloat64x4_t, svfloat64_t, ++ z4 = svset4_f64 (z4, 0, z0), ++ z4 = svset4 (z4, 0, z0)) ++ ++/* ++** set4_f64_z4_1: ++** mov z5\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_f64_z4_1, svfloat64x4_t, svfloat64_t, ++ z4 = svset4_f64 (z4, 1, z0), ++ z4 = svset4 (z4, 1, z0)) ++ ++/* ++** set4_f64_z4_2: ++** mov z6\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_f64_z4_2, svfloat64x4_t, svfloat64_t, ++ z4 = svset4_f64 (z4, 2, z0), ++ z4 = svset4 (z4, 2, z0)) ++ ++/* ++** set4_f64_z4_3: ++** mov z7\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_f64_z4_3, svfloat64x4_t, svfloat64_t, ++ z4 = svset4_f64 (z4, 3, z0), ++ z4 = svset4 (z4, 3, z0)) +diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_s16.c +new file mode 100644 +index 000000000..3cef6ebe8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_s16.c +@@ -0,0 +1,87 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** set4_s16_z24_0: ++** mov z25\.d, z5\.d ++** mov z26\.d, z6\.d ++** mov z27\.d, z7\.d ++** mov z24\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_s16_z24_0, svint16x4_t, svint16_t, ++ z24 = svset4_s16 (z4, 0, z0), ++ z24 = svset4 (z4, 0, z0)) ++ ++/* ++** set4_s16_z24_1: ++** mov z24\.d, z4\.d ++** mov z26\.d, z6\.d ++** mov z27\.d, z7\.d ++** mov z25\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_s16_z24_1, svint16x4_t, svint16_t, ++ z24 = svset4_s16 (z4, 1, z0), ++ z24 = svset4 (z4, 1, z0)) ++ ++/* ++** set4_s16_z24_2: ++** mov z24\.d, z4\.d ++** mov z25\.d, z5\.d ++** mov z27\.d, z7\.d ++** mov z26\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_s16_z24_2, svint16x4_t, svint16_t, ++ z24 = svset4_s16 (z4, 2, z0), ++ z24 = svset4 (z4, 2, z0)) ++ ++/* ++** set4_s16_z24_3: ++** mov z24\.d, z4\.d ++** mov z25\.d, z5\.d ++** mov z26\.d, z6\.d ++** mov z27\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_s16_z24_3, svint16x4_t, svint16_t, ++ z24 = svset4_s16 (z4, 3, z0), ++ z24 = svset4 (z4, 3, z0)) ++ ++/* ++** set4_s16_z4_0: ++** mov z4\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_s16_z4_0, svint16x4_t, svint16_t, ++ z4 = svset4_s16 (z4, 0, z0), ++ z4 = svset4 (z4, 0, z0)) ++ ++/* ++** set4_s16_z4_1: ++** mov z5\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_s16_z4_1, svint16x4_t, svint16_t, ++ z4 = svset4_s16 (z4, 1, z0), ++ z4 = svset4 (z4, 1, z0)) ++ ++/* ++** set4_s16_z4_2: ++** mov z6\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_s16_z4_2, svint16x4_t, svint16_t, ++ z4 = svset4_s16 (z4, 2, z0), ++ z4 = svset4 (z4, 2, z0)) ++ ++/* ++** set4_s16_z4_3: ++** mov z7\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_s16_z4_3, svint16x4_t, svint16_t, ++ z4 = svset4_s16 (z4, 3, z0), ++ z4 = svset4 (z4, 3, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_s32.c +new file mode 100644 +index 000000000..49f646e8d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_s32.c +@@ -0,0 +1,87 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** set4_s32_z24_0: ++** mov z25\.d, z5\.d ++** mov z26\.d, z6\.d ++** mov z27\.d, z7\.d ++** mov z24\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_s32_z24_0, svint32x4_t, svint32_t, ++ z24 = svset4_s32 (z4, 0, z0), ++ z24 = svset4 (z4, 0, z0)) ++ ++/* ++** set4_s32_z24_1: ++** mov z24\.d, z4\.d ++** mov z26\.d, z6\.d ++** mov z27\.d, z7\.d ++** mov z25\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_s32_z24_1, svint32x4_t, svint32_t, ++ z24 = svset4_s32 (z4, 1, z0), ++ z24 = svset4 (z4, 1, z0)) ++ ++/* ++** set4_s32_z24_2: ++** mov z24\.d, z4\.d ++** mov z25\.d, z5\.d ++** mov z27\.d, z7\.d ++** mov z26\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_s32_z24_2, svint32x4_t, svint32_t, ++ z24 = svset4_s32 (z4, 2, z0), ++ z24 = svset4 (z4, 2, z0)) ++ ++/* ++** set4_s32_z24_3: ++** mov z24\.d, z4\.d ++** mov z25\.d, z5\.d ++** mov z26\.d, z6\.d ++** mov z27\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_s32_z24_3, svint32x4_t, svint32_t, ++ z24 = svset4_s32 (z4, 3, z0), ++ z24 = svset4 (z4, 3, z0)) ++ ++/* ++** set4_s32_z4_0: ++** mov z4\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_s32_z4_0, svint32x4_t, svint32_t, ++ 
z4 = svset4_s32 (z4, 0, z0), ++ z4 = svset4 (z4, 0, z0)) ++ ++/* ++** set4_s32_z4_1: ++** mov z5\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_s32_z4_1, svint32x4_t, svint32_t, ++ z4 = svset4_s32 (z4, 1, z0), ++ z4 = svset4 (z4, 1, z0)) ++ ++/* ++** set4_s32_z4_2: ++** mov z6\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_s32_z4_2, svint32x4_t, svint32_t, ++ z4 = svset4_s32 (z4, 2, z0), ++ z4 = svset4 (z4, 2, z0)) ++ ++/* ++** set4_s32_z4_3: ++** mov z7\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_s32_z4_3, svint32x4_t, svint32_t, ++ z4 = svset4_s32 (z4, 3, z0), ++ z4 = svset4 (z4, 3, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_s64.c +new file mode 100644 +index 000000000..7544e25a2 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_s64.c +@@ -0,0 +1,87 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** set4_s64_z24_0: ++** mov z25\.d, z5\.d ++** mov z26\.d, z6\.d ++** mov z27\.d, z7\.d ++** mov z24\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_s64_z24_0, svint64x4_t, svint64_t, ++ z24 = svset4_s64 (z4, 0, z0), ++ z24 = svset4 (z4, 0, z0)) ++ ++/* ++** set4_s64_z24_1: ++** mov z24\.d, z4\.d ++** mov z26\.d, z6\.d ++** mov z27\.d, z7\.d ++** mov z25\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_s64_z24_1, svint64x4_t, svint64_t, ++ z24 = svset4_s64 (z4, 1, z0), ++ z24 = svset4 (z4, 1, z0)) ++ ++/* ++** set4_s64_z24_2: ++** mov z24\.d, z4\.d ++** mov z25\.d, z5\.d ++** mov z27\.d, z7\.d ++** mov z26\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_s64_z24_2, svint64x4_t, svint64_t, ++ z24 = svset4_s64 (z4, 2, z0), ++ z24 = svset4 (z4, 2, z0)) ++ ++/* ++** set4_s64_z24_3: ++** mov z24\.d, z4\.d ++** mov z25\.d, z5\.d ++** mov z26\.d, z6\.d ++** mov z27\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_s64_z24_3, svint64x4_t, svint64_t, ++ z24 = svset4_s64 (z4, 3, z0), ++ z24 = svset4 (z4, 3, z0)) ++ ++/* ++** set4_s64_z4_0: ++** mov z4\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_s64_z4_0, svint64x4_t, svint64_t, ++ z4 = svset4_s64 (z4, 0, z0), ++ z4 = svset4 (z4, 0, z0)) ++ ++/* ++** set4_s64_z4_1: ++** mov z5\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_s64_z4_1, svint64x4_t, svint64_t, ++ z4 = svset4_s64 (z4, 1, z0), ++ z4 = svset4 (z4, 1, z0)) ++ ++/* ++** set4_s64_z4_2: ++** mov z6\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_s64_z4_2, svint64x4_t, svint64_t, ++ z4 = svset4_s64 (z4, 2, z0), ++ z4 = svset4 (z4, 2, z0)) ++ ++/* ++** set4_s64_z4_3: ++** mov z7\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_s64_z4_3, svint64x4_t, svint64_t, ++ z4 = svset4_s64 (z4, 3, z0), ++ z4 = svset4 (z4, 3, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_s8.c +new file mode 100644 +index 000000000..2ec9ff059 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_s8.c +@@ -0,0 +1,87 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** set4_s8_z24_0: ++** mov z25\.d, z5\.d ++** mov z26\.d, z6\.d ++** mov z27\.d, z7\.d ++** mov z24\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_s8_z24_0, svint8x4_t, svint8_t, ++ z24 = svset4_s8 (z4, 0, z0), ++ z24 = svset4 (z4, 0, z0)) ++ ++/* ++** set4_s8_z24_1: ++** mov z24\.d, z4\.d ++** mov z26\.d, z6\.d ++** mov z27\.d, z7\.d ++** mov z25\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_s8_z24_1, svint8x4_t, svint8_t, ++ z24 = svset4_s8 (z4, 1, z0), ++ z24 = svset4 (z4, 1, z0)) ++ ++/* ++** set4_s8_z24_2: ++** 
mov z24\.d, z4\.d ++** mov z25\.d, z5\.d ++** mov z27\.d, z7\.d ++** mov z26\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_s8_z24_2, svint8x4_t, svint8_t, ++ z24 = svset4_s8 (z4, 2, z0), ++ z24 = svset4 (z4, 2, z0)) ++ ++/* ++** set4_s8_z24_3: ++** mov z24\.d, z4\.d ++** mov z25\.d, z5\.d ++** mov z26\.d, z6\.d ++** mov z27\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_s8_z24_3, svint8x4_t, svint8_t, ++ z24 = svset4_s8 (z4, 3, z0), ++ z24 = svset4 (z4, 3, z0)) ++ ++/* ++** set4_s8_z4_0: ++** mov z4\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_s8_z4_0, svint8x4_t, svint8_t, ++ z4 = svset4_s8 (z4, 0, z0), ++ z4 = svset4 (z4, 0, z0)) ++ ++/* ++** set4_s8_z4_1: ++** mov z5\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_s8_z4_1, svint8x4_t, svint8_t, ++ z4 = svset4_s8 (z4, 1, z0), ++ z4 = svset4 (z4, 1, z0)) ++ ++/* ++** set4_s8_z4_2: ++** mov z6\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_s8_z4_2, svint8x4_t, svint8_t, ++ z4 = svset4_s8 (z4, 2, z0), ++ z4 = svset4 (z4, 2, z0)) ++ ++/* ++** set4_s8_z4_3: ++** mov z7\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_s8_z4_3, svint8x4_t, svint8_t, ++ z4 = svset4_s8 (z4, 3, z0), ++ z4 = svset4 (z4, 3, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_u16.c +new file mode 100644 +index 000000000..c9499b044 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_u16.c +@@ -0,0 +1,87 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** set4_u16_z24_0: ++** mov z25\.d, z5\.d ++** mov z26\.d, z6\.d ++** mov z27\.d, z7\.d ++** mov z24\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_u16_z24_0, svuint16x4_t, svuint16_t, ++ z24 = svset4_u16 (z4, 0, z0), ++ z24 = svset4 (z4, 0, z0)) ++ ++/* ++** set4_u16_z24_1: ++** mov z24\.d, z4\.d ++** mov z26\.d, z6\.d ++** mov z27\.d, z7\.d ++** mov z25\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_u16_z24_1, svuint16x4_t, svuint16_t, ++ z24 = svset4_u16 (z4, 1, z0), ++ z24 = svset4 (z4, 1, z0)) ++ ++/* ++** set4_u16_z24_2: ++** mov z24\.d, z4\.d ++** mov z25\.d, z5\.d ++** mov z27\.d, z7\.d ++** mov z26\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_u16_z24_2, svuint16x4_t, svuint16_t, ++ z24 = svset4_u16 (z4, 2, z0), ++ z24 = svset4 (z4, 2, z0)) ++ ++/* ++** set4_u16_z24_3: ++** mov z24\.d, z4\.d ++** mov z25\.d, z5\.d ++** mov z26\.d, z6\.d ++** mov z27\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_u16_z24_3, svuint16x4_t, svuint16_t, ++ z24 = svset4_u16 (z4, 3, z0), ++ z24 = svset4 (z4, 3, z0)) ++ ++/* ++** set4_u16_z4_0: ++** mov z4\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_u16_z4_0, svuint16x4_t, svuint16_t, ++ z4 = svset4_u16 (z4, 0, z0), ++ z4 = svset4 (z4, 0, z0)) ++ ++/* ++** set4_u16_z4_1: ++** mov z5\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_u16_z4_1, svuint16x4_t, svuint16_t, ++ z4 = svset4_u16 (z4, 1, z0), ++ z4 = svset4 (z4, 1, z0)) ++ ++/* ++** set4_u16_z4_2: ++** mov z6\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_u16_z4_2, svuint16x4_t, svuint16_t, ++ z4 = svset4_u16 (z4, 2, z0), ++ z4 = svset4 (z4, 2, z0)) ++ ++/* ++** set4_u16_z4_3: ++** mov z7\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_u16_z4_3, svuint16x4_t, svuint16_t, ++ z4 = svset4_u16 (z4, 3, z0), ++ z4 = svset4 (z4, 3, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_u32.c +new file mode 100644 +index 000000000..00b3dc513 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_u32.c +@@ -0,0 +1,87 @@ ++/* { dg-final { check-function-bodies "**" "" 
"-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** set4_u32_z24_0: ++** mov z25\.d, z5\.d ++** mov z26\.d, z6\.d ++** mov z27\.d, z7\.d ++** mov z24\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_u32_z24_0, svuint32x4_t, svuint32_t, ++ z24 = svset4_u32 (z4, 0, z0), ++ z24 = svset4 (z4, 0, z0)) ++ ++/* ++** set4_u32_z24_1: ++** mov z24\.d, z4\.d ++** mov z26\.d, z6\.d ++** mov z27\.d, z7\.d ++** mov z25\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_u32_z24_1, svuint32x4_t, svuint32_t, ++ z24 = svset4_u32 (z4, 1, z0), ++ z24 = svset4 (z4, 1, z0)) ++ ++/* ++** set4_u32_z24_2: ++** mov z24\.d, z4\.d ++** mov z25\.d, z5\.d ++** mov z27\.d, z7\.d ++** mov z26\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_u32_z24_2, svuint32x4_t, svuint32_t, ++ z24 = svset4_u32 (z4, 2, z0), ++ z24 = svset4 (z4, 2, z0)) ++ ++/* ++** set4_u32_z24_3: ++** mov z24\.d, z4\.d ++** mov z25\.d, z5\.d ++** mov z26\.d, z6\.d ++** mov z27\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_u32_z24_3, svuint32x4_t, svuint32_t, ++ z24 = svset4_u32 (z4, 3, z0), ++ z24 = svset4 (z4, 3, z0)) ++ ++/* ++** set4_u32_z4_0: ++** mov z4\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_u32_z4_0, svuint32x4_t, svuint32_t, ++ z4 = svset4_u32 (z4, 0, z0), ++ z4 = svset4 (z4, 0, z0)) ++ ++/* ++** set4_u32_z4_1: ++** mov z5\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_u32_z4_1, svuint32x4_t, svuint32_t, ++ z4 = svset4_u32 (z4, 1, z0), ++ z4 = svset4 (z4, 1, z0)) ++ ++/* ++** set4_u32_z4_2: ++** mov z6\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_u32_z4_2, svuint32x4_t, svuint32_t, ++ z4 = svset4_u32 (z4, 2, z0), ++ z4 = svset4 (z4, 2, z0)) ++ ++/* ++** set4_u32_z4_3: ++** mov z7\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_u32_z4_3, svuint32x4_t, svuint32_t, ++ z4 = svset4_u32 (z4, 3, z0), ++ z4 = svset4 (z4, 3, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_u64.c +new file mode 100644 +index 000000000..d2f048b82 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_u64.c +@@ -0,0 +1,87 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** set4_u64_z24_0: ++** mov z25\.d, z5\.d ++** mov z26\.d, z6\.d ++** mov z27\.d, z7\.d ++** mov z24\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_u64_z24_0, svuint64x4_t, svuint64_t, ++ z24 = svset4_u64 (z4, 0, z0), ++ z24 = svset4 (z4, 0, z0)) ++ ++/* ++** set4_u64_z24_1: ++** mov z24\.d, z4\.d ++** mov z26\.d, z6\.d ++** mov z27\.d, z7\.d ++** mov z25\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_u64_z24_1, svuint64x4_t, svuint64_t, ++ z24 = svset4_u64 (z4, 1, z0), ++ z24 = svset4 (z4, 1, z0)) ++ ++/* ++** set4_u64_z24_2: ++** mov z24\.d, z4\.d ++** mov z25\.d, z5\.d ++** mov z27\.d, z7\.d ++** mov z26\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_u64_z24_2, svuint64x4_t, svuint64_t, ++ z24 = svset4_u64 (z4, 2, z0), ++ z24 = svset4 (z4, 2, z0)) ++ ++/* ++** set4_u64_z24_3: ++** mov z24\.d, z4\.d ++** mov z25\.d, z5\.d ++** mov z26\.d, z6\.d ++** mov z27\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_u64_z24_3, svuint64x4_t, svuint64_t, ++ z24 = svset4_u64 (z4, 3, z0), ++ z24 = svset4 (z4, 3, z0)) ++ ++/* ++** set4_u64_z4_0: ++** mov z4\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_u64_z4_0, svuint64x4_t, svuint64_t, ++ z4 = svset4_u64 (z4, 0, z0), ++ z4 = svset4 (z4, 0, z0)) ++ ++/* ++** set4_u64_z4_1: ++** mov z5\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_u64_z4_1, svuint64x4_t, svuint64_t, ++ z4 = svset4_u64 (z4, 1, z0), ++ z4 = svset4 (z4, 1, z0)) ++ ++/* ++** set4_u64_z4_2: ++** mov z6\.d, z0\.d 
++** ret ++*/ ++TEST_SET (set4_u64_z4_2, svuint64x4_t, svuint64_t, ++ z4 = svset4_u64 (z4, 2, z0), ++ z4 = svset4 (z4, 2, z0)) ++ ++/* ++** set4_u64_z4_3: ++** mov z7\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_u64_z4_3, svuint64x4_t, svuint64_t, ++ z4 = svset4_u64 (z4, 3, z0), ++ z4 = svset4 (z4, 3, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_u8.c +new file mode 100644 +index 000000000..b4f27c6f1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_u8.c +@@ -0,0 +1,87 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** set4_u8_z24_0: ++** mov z25\.d, z5\.d ++** mov z26\.d, z6\.d ++** mov z27\.d, z7\.d ++** mov z24\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_u8_z24_0, svuint8x4_t, svuint8_t, ++ z24 = svset4_u8 (z4, 0, z0), ++ z24 = svset4 (z4, 0, z0)) ++ ++/* ++** set4_u8_z24_1: ++** mov z24\.d, z4\.d ++** mov z26\.d, z6\.d ++** mov z27\.d, z7\.d ++** mov z25\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_u8_z24_1, svuint8x4_t, svuint8_t, ++ z24 = svset4_u8 (z4, 1, z0), ++ z24 = svset4 (z4, 1, z0)) ++ ++/* ++** set4_u8_z24_2: ++** mov z24\.d, z4\.d ++** mov z25\.d, z5\.d ++** mov z27\.d, z7\.d ++** mov z26\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_u8_z24_2, svuint8x4_t, svuint8_t, ++ z24 = svset4_u8 (z4, 2, z0), ++ z24 = svset4 (z4, 2, z0)) ++ ++/* ++** set4_u8_z24_3: ++** mov z24\.d, z4\.d ++** mov z25\.d, z5\.d ++** mov z26\.d, z6\.d ++** mov z27\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_u8_z24_3, svuint8x4_t, svuint8_t, ++ z24 = svset4_u8 (z4, 3, z0), ++ z24 = svset4 (z4, 3, z0)) ++ ++/* ++** set4_u8_z4_0: ++** mov z4\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_u8_z4_0, svuint8x4_t, svuint8_t, ++ z4 = svset4_u8 (z4, 0, z0), ++ z4 = svset4 (z4, 0, z0)) ++ ++/* ++** set4_u8_z4_1: ++** mov z5\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_u8_z4_1, svuint8x4_t, svuint8_t, ++ z4 = svset4_u8 (z4, 1, z0), ++ z4 = svset4 (z4, 1, z0)) ++ ++/* ++** set4_u8_z4_2: ++** mov z6\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_u8_z4_2, svuint8x4_t, svuint8_t, ++ z4 = svset4_u8 (z4, 2, z0), ++ z4 = svset4 (z4, 2, z0)) ++ ++/* ++** set4_u8_z4_3: ++** mov z7\.d, z0\.d ++** ret ++*/ ++TEST_SET (set4_u8_z4_3, svuint8x4_t, svuint8_t, ++ z4 = svset4_u8 (z4, 3, z0), ++ z4 = svset4 (z4, 3, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_bf16.c +new file mode 100644 +index 000000000..3d2dbf20d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_bf16.c +@@ -0,0 +1,33 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** splice_bf16_tied1: ++** splice z0\.h, p0, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (splice_bf16_tied1, svbfloat16_t, ++ z0 = svsplice_bf16 (p0, z0, z1), ++ z0 = svsplice (p0, z0, z1)) ++ ++/* ++** splice_bf16_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** splice z0\.h, p0, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (splice_bf16_tied2, svbfloat16_t, ++ z0 = svsplice_bf16 (p0, z1, z0), ++ z0 = svsplice (p0, z1, z0)) ++ ++/* ++** splice_bf16_untied: ++** movprfx z0, z1 ++** splice z0\.h, p0, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (splice_bf16_untied, svbfloat16_t, ++ z0 = svsplice_bf16 (p0, z1, z2), ++ z0 = svsplice (p0, z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_f16.c +new file 
mode 100644 +index 000000000..b796eaf3d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_f16.c +@@ -0,0 +1,33 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** splice_f16_tied1: ++** splice z0\.h, p0, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (splice_f16_tied1, svfloat16_t, ++ z0 = svsplice_f16 (p0, z0, z1), ++ z0 = svsplice (p0, z0, z1)) ++ ++/* ++** splice_f16_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** splice z0\.h, p0, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (splice_f16_tied2, svfloat16_t, ++ z0 = svsplice_f16 (p0, z1, z0), ++ z0 = svsplice (p0, z1, z0)) ++ ++/* ++** splice_f16_untied: ++** movprfx z0, z1 ++** splice z0\.h, p0, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (splice_f16_untied, svfloat16_t, ++ z0 = svsplice_f16 (p0, z1, z2), ++ z0 = svsplice (p0, z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_f32.c +new file mode 100644 +index 000000000..1fc552bc3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_f32.c +@@ -0,0 +1,33 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** splice_f32_tied1: ++** splice z0\.s, p0, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (splice_f32_tied1, svfloat32_t, ++ z0 = svsplice_f32 (p0, z0, z1), ++ z0 = svsplice (p0, z0, z1)) ++ ++/* ++** splice_f32_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** splice z0\.s, p0, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (splice_f32_tied2, svfloat32_t, ++ z0 = svsplice_f32 (p0, z1, z0), ++ z0 = svsplice (p0, z1, z0)) ++ ++/* ++** splice_f32_untied: ++** movprfx z0, z1 ++** splice z0\.s, p0, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (splice_f32_untied, svfloat32_t, ++ z0 = svsplice_f32 (p0, z1, z2), ++ z0 = svsplice (p0, z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_f64.c +new file mode 100644 +index 000000000..26b523520 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_f64.c +@@ -0,0 +1,33 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** splice_f64_tied1: ++** splice z0\.d, p0, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (splice_f64_tied1, svfloat64_t, ++ z0 = svsplice_f64 (p0, z0, z1), ++ z0 = svsplice (p0, z0, z1)) ++ ++/* ++** splice_f64_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** splice z0\.d, p0, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (splice_f64_tied2, svfloat64_t, ++ z0 = svsplice_f64 (p0, z1, z0), ++ z0 = svsplice (p0, z1, z0)) ++ ++/* ++** splice_f64_untied: ++** movprfx z0, z1 ++** splice z0\.d, p0, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (splice_f64_untied, svfloat64_t, ++ z0 = svsplice_f64 (p0, z1, z2), ++ z0 = svsplice (p0, z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_s16.c +new file mode 100644 +index 000000000..8796c6ecd +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_s16.c +@@ -0,0 +1,33 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** splice_s16_tied1: ++** splice z0\.h, p0, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (splice_s16_tied1, svint16_t, ++ z0 = svsplice_s16 (p0, z0, z1), ++ z0 = svsplice (p0, z0, 
z1)) ++ ++/* ++** splice_s16_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** splice z0\.h, p0, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (splice_s16_tied2, svint16_t, ++ z0 = svsplice_s16 (p0, z1, z0), ++ z0 = svsplice (p0, z1, z0)) ++ ++/* ++** splice_s16_untied: ++** movprfx z0, z1 ++** splice z0\.h, p0, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (splice_s16_untied, svint16_t, ++ z0 = svsplice_s16 (p0, z1, z2), ++ z0 = svsplice (p0, z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_s32.c +new file mode 100644 +index 000000000..5f2798e06 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_s32.c +@@ -0,0 +1,33 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** splice_s32_tied1: ++** splice z0\.s, p0, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (splice_s32_tied1, svint32_t, ++ z0 = svsplice_s32 (p0, z0, z1), ++ z0 = svsplice (p0, z0, z1)) ++ ++/* ++** splice_s32_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** splice z0\.s, p0, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (splice_s32_tied2, svint32_t, ++ z0 = svsplice_s32 (p0, z1, z0), ++ z0 = svsplice (p0, z1, z0)) ++ ++/* ++** splice_s32_untied: ++** movprfx z0, z1 ++** splice z0\.s, p0, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (splice_s32_untied, svint32_t, ++ z0 = svsplice_s32 (p0, z1, z2), ++ z0 = svsplice (p0, z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_s64.c +new file mode 100644 +index 000000000..024bfa479 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_s64.c +@@ -0,0 +1,33 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** splice_s64_tied1: ++** splice z0\.d, p0, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (splice_s64_tied1, svint64_t, ++ z0 = svsplice_s64 (p0, z0, z1), ++ z0 = svsplice (p0, z0, z1)) ++ ++/* ++** splice_s64_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** splice z0\.d, p0, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (splice_s64_tied2, svint64_t, ++ z0 = svsplice_s64 (p0, z1, z0), ++ z0 = svsplice (p0, z1, z0)) ++ ++/* ++** splice_s64_untied: ++** movprfx z0, z1 ++** splice z0\.d, p0, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (splice_s64_untied, svint64_t, ++ z0 = svsplice_s64 (p0, z1, z2), ++ z0 = svsplice (p0, z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_s8.c +new file mode 100644 +index 000000000..cd91ee245 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_s8.c +@@ -0,0 +1,33 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** splice_s8_tied1: ++** splice z0\.b, p0, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (splice_s8_tied1, svint8_t, ++ z0 = svsplice_s8 (p0, z0, z1), ++ z0 = svsplice (p0, z0, z1)) ++ ++/* ++** splice_s8_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** splice z0\.b, p0, z0\.b, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (splice_s8_tied2, svint8_t, ++ z0 = svsplice_s8 (p0, z1, z0), ++ z0 = svsplice (p0, z1, z0)) ++ ++/* ++** splice_s8_untied: ++** movprfx z0, z1 ++** splice z0\.b, p0, z0\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (splice_s8_untied, svint8_t, ++ z0 = svsplice_s8 (p0, z1, z2), ++ z0 = svsplice (p0, z1, 
z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_u16.c +new file mode 100644 +index 000000000..821ebaee6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_u16.c +@@ -0,0 +1,33 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** splice_u16_tied1: ++** splice z0\.h, p0, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (splice_u16_tied1, svuint16_t, ++ z0 = svsplice_u16 (p0, z0, z1), ++ z0 = svsplice (p0, z0, z1)) ++ ++/* ++** splice_u16_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** splice z0\.h, p0, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (splice_u16_tied2, svuint16_t, ++ z0 = svsplice_u16 (p0, z1, z0), ++ z0 = svsplice (p0, z1, z0)) ++ ++/* ++** splice_u16_untied: ++** movprfx z0, z1 ++** splice z0\.h, p0, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (splice_u16_untied, svuint16_t, ++ z0 = svsplice_u16 (p0, z1, z2), ++ z0 = svsplice (p0, z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_u32.c +new file mode 100644 +index 000000000..200364f20 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_u32.c +@@ -0,0 +1,33 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** splice_u32_tied1: ++** splice z0\.s, p0, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (splice_u32_tied1, svuint32_t, ++ z0 = svsplice_u32 (p0, z0, z1), ++ z0 = svsplice (p0, z0, z1)) ++ ++/* ++** splice_u32_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** splice z0\.s, p0, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (splice_u32_tied2, svuint32_t, ++ z0 = svsplice_u32 (p0, z1, z0), ++ z0 = svsplice (p0, z1, z0)) ++ ++/* ++** splice_u32_untied: ++** movprfx z0, z1 ++** splice z0\.s, p0, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (splice_u32_untied, svuint32_t, ++ z0 = svsplice_u32 (p0, z1, z2), ++ z0 = svsplice (p0, z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_u64.c +new file mode 100644 +index 000000000..352bcdeed +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_u64.c +@@ -0,0 +1,33 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** splice_u64_tied1: ++** splice z0\.d, p0, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (splice_u64_tied1, svuint64_t, ++ z0 = svsplice_u64 (p0, z0, z1), ++ z0 = svsplice (p0, z0, z1)) ++ ++/* ++** splice_u64_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** splice z0\.d, p0, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (splice_u64_tied2, svuint64_t, ++ z0 = svsplice_u64 (p0, z1, z0), ++ z0 = svsplice (p0, z1, z0)) ++ ++/* ++** splice_u64_untied: ++** movprfx z0, z1 ++** splice z0\.d, p0, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (splice_u64_untied, svuint64_t, ++ z0 = svsplice_u64 (p0, z1, z2), ++ z0 = svsplice (p0, z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_u8.c +new file mode 100644 +index 000000000..6c24fe64d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_u8.c +@@ -0,0 +1,33 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** splice_u8_tied1: ++** splice z0\.b, 
p0, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (splice_u8_tied1, svuint8_t, ++ z0 = svsplice_u8 (p0, z0, z1), ++ z0 = svsplice (p0, z0, z1)) ++ ++/* ++** splice_u8_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** splice z0\.b, p0, z0\.b, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (splice_u8_tied2, svuint8_t, ++ z0 = svsplice_u8 (p0, z1, z0), ++ z0 = svsplice (p0, z1, z0)) ++ ++/* ++** splice_u8_untied: ++** movprfx z0, z1 ++** splice z0\.b, p0, z0\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (splice_u8_untied, svuint8_t, ++ z0 = svsplice_u8 (p0, z1, z2), ++ z0 = svsplice (p0, z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sqrt_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sqrt_f16.c +new file mode 100644 +index 000000000..6dc5940fb +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sqrt_f16.c +@@ -0,0 +1,103 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** sqrt_f16_m_tied12: ++** fsqrt z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sqrt_f16_m_tied12, svfloat16_t, ++ z0 = svsqrt_f16_m (z0, p0, z0), ++ z0 = svsqrt_m (z0, p0, z0)) ++ ++/* ++** sqrt_f16_m_tied1: ++** fsqrt z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sqrt_f16_m_tied1, svfloat16_t, ++ z0 = svsqrt_f16_m (z0, p0, z1), ++ z0 = svsqrt_m (z0, p0, z1)) ++ ++/* ++** sqrt_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fsqrt z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sqrt_f16_m_tied2, svfloat16_t, ++ z0 = svsqrt_f16_m (z1, p0, z0), ++ z0 = svsqrt_m (z1, p0, z0)) ++ ++/* ++** sqrt_f16_m_untied: ++** movprfx z0, z2 ++** fsqrt z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sqrt_f16_m_untied, svfloat16_t, ++ z0 = svsqrt_f16_m (z2, p0, z1), ++ z0 = svsqrt_m (z2, p0, z1)) ++ ++/* ++** sqrt_f16_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.h, p0/z, \1\.h ++** fsqrt z0\.h, p0/m, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sqrt_f16_z_tied1, svfloat16_t, ++ z0 = svsqrt_f16_z (p0, z0), ++ z0 = svsqrt_z (p0, z0)) ++ ++/* ++** sqrt_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** fsqrt z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sqrt_f16_z_untied, svfloat16_t, ++ z0 = svsqrt_f16_z (p0, z1), ++ z0 = svsqrt_z (p0, z1)) ++ ++/* ++** sqrt_f16_x_tied1: ++** fsqrt z0\.h, p0/m, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sqrt_f16_x_tied1, svfloat16_t, ++ z0 = svsqrt_f16_x (p0, z0), ++ z0 = svsqrt_x (p0, z0)) ++ ++/* ++** sqrt_f16_x_untied: ++** fsqrt z0\.h, p0/m, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sqrt_f16_x_untied, svfloat16_t, ++ z0 = svsqrt_f16_x (p0, z1), ++ z0 = svsqrt_x (p0, z1)) ++ ++/* ++** ptrue_sqrt_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sqrt_f16_x_tied1, svfloat16_t, ++ z0 = svsqrt_f16_x (svptrue_b16 (), z0), ++ z0 = svsqrt_x (svptrue_b16 (), z0)) ++ ++/* ++** ptrue_sqrt_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sqrt_f16_x_untied, svfloat16_t, ++ z0 = svsqrt_f16_x (svptrue_b16 (), z1), ++ z0 = svsqrt_x (svptrue_b16 (), z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sqrt_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sqrt_f32.c +new file mode 100644 +index 000000000..71d1f8f74 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sqrt_f32.c +@@ -0,0 +1,103 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** sqrt_f32_m_tied12: ++** fsqrt z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sqrt_f32_m_tied12, svfloat32_t, ++ z0 = svsqrt_f32_m (z0, p0, z0), ++ z0 = svsqrt_m (z0, p0, z0)) ++ ++/* ++** sqrt_f32_m_tied1: ++** fsqrt z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sqrt_f32_m_tied1, svfloat32_t, ++ z0 = svsqrt_f32_m (z0, p0, z1), ++ z0 = svsqrt_m (z0, p0, z1)) ++ ++/* ++** sqrt_f32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fsqrt z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sqrt_f32_m_tied2, svfloat32_t, ++ z0 = svsqrt_f32_m (z1, p0, z0), ++ z0 = svsqrt_m (z1, p0, z0)) ++ ++/* ++** sqrt_f32_m_untied: ++** movprfx z0, z2 ++** fsqrt z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sqrt_f32_m_untied, svfloat32_t, ++ z0 = svsqrt_f32_m (z2, p0, z1), ++ z0 = svsqrt_m (z2, p0, z1)) ++ ++/* ++** sqrt_f32_z_tied1: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0\.s, p0/z, \1\.s ++** fsqrt z0\.s, p0/m, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sqrt_f32_z_tied1, svfloat32_t, ++ z0 = svsqrt_f32_z (p0, z0), ++ z0 = svsqrt_z (p0, z0)) ++ ++/* ++** sqrt_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** fsqrt z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sqrt_f32_z_untied, svfloat32_t, ++ z0 = svsqrt_f32_z (p0, z1), ++ z0 = svsqrt_z (p0, z1)) ++ ++/* ++** sqrt_f32_x_tied1: ++** fsqrt z0\.s, p0/m, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sqrt_f32_x_tied1, svfloat32_t, ++ z0 = svsqrt_f32_x (p0, z0), ++ z0 = svsqrt_x (p0, z0)) ++ ++/* ++** sqrt_f32_x_untied: ++** fsqrt z0\.s, p0/m, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sqrt_f32_x_untied, svfloat32_t, ++ z0 = svsqrt_f32_x (p0, z1), ++ z0 = svsqrt_x (p0, z1)) ++ ++/* ++** ptrue_sqrt_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sqrt_f32_x_tied1, svfloat32_t, ++ z0 = svsqrt_f32_x (svptrue_b32 (), z0), ++ z0 = svsqrt_x (svptrue_b32 (), z0)) ++ ++/* ++** ptrue_sqrt_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sqrt_f32_x_untied, svfloat32_t, ++ z0 = svsqrt_f32_x (svptrue_b32 (), z1), ++ z0 = svsqrt_x (svptrue_b32 (), z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sqrt_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sqrt_f64.c +new file mode 100644 +index 000000000..7771df545 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sqrt_f64.c +@@ -0,0 +1,103 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** sqrt_f64_m_tied12: ++** fsqrt z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (sqrt_f64_m_tied12, svfloat64_t, ++ z0 = svsqrt_f64_m (z0, p0, z0), ++ z0 = svsqrt_m (z0, p0, z0)) ++ ++/* ++** sqrt_f64_m_tied1: ++** fsqrt z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (sqrt_f64_m_tied1, svfloat64_t, ++ z0 = svsqrt_f64_m (z0, p0, z1), ++ z0 = svsqrt_m (z0, p0, z1)) ++ ++/* ++** sqrt_f64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fsqrt z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (sqrt_f64_m_tied2, svfloat64_t, ++ z0 = svsqrt_f64_m (z1, p0, z0), ++ z0 = svsqrt_m (z1, p0, z0)) ++ ++/* ++** sqrt_f64_m_untied: ++** movprfx z0, z2 ++** fsqrt z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (sqrt_f64_m_untied, svfloat64_t, ++ z0 = svsqrt_f64_m (z2, p0, z1), ++ z0 = svsqrt_m (z2, p0, z1)) ++ ++/* ++** sqrt_f64_z_tied1: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0\.d, p0/z, \1 ++** fsqrt z0\.d, p0/m, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (sqrt_f64_z_tied1, svfloat64_t, ++ z0 = svsqrt_f64_z (p0, z0), ++ z0 = svsqrt_z (p0, z0)) ++ ++/* ++** sqrt_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** fsqrt z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (sqrt_f64_z_untied, svfloat64_t, ++ z0 = svsqrt_f64_z (p0, z1), ++ z0 = svsqrt_z (p0, z1)) ++ ++/* ++** sqrt_f64_x_tied1: ++** fsqrt z0\.d, p0/m, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (sqrt_f64_x_tied1, svfloat64_t, ++ z0 = svsqrt_f64_x (p0, z0), ++ z0 = svsqrt_x (p0, z0)) ++ ++/* ++** sqrt_f64_x_untied: ++** fsqrt z0\.d, p0/m, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (sqrt_f64_x_untied, svfloat64_t, ++ z0 = svsqrt_f64_x (p0, z1), ++ z0 = svsqrt_x (p0, z1)) ++ ++/* ++** ptrue_sqrt_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sqrt_f64_x_tied1, svfloat64_t, ++ z0 = svsqrt_f64_x (svptrue_b64 (), z0), ++ z0 = svsqrt_x (svptrue_b64 (), z0)) ++ ++/* ++** ptrue_sqrt_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sqrt_f64_x_untied, svfloat64_t, ++ z0 = svsqrt_f64_x (svptrue_b64 (), z1), ++ z0 = svsqrt_x (svptrue_b64 (), z1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_bf16.c +new file mode 100644 +index 000000000..ec3dbe318 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_bf16.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st1_bf16_base: ++** st1h z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_bf16_base, svbfloat16_t, bfloat16_t, ++ svst1_bf16 (p0, x0, z0), ++ svst1 (p0, x0, z0)) ++ ++/* ++** st1_bf16_index: ++** st1h z0\.h, p0, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_STORE (st1_bf16_index, svbfloat16_t, bfloat16_t, ++ svst1_bf16 (p0, x0 + x1, z0), ++ svst1 (p0, x0 + x1, z0)) ++ ++/* ++** st1_bf16_1: ++** st1h z0\.h, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_bf16_1, svbfloat16_t, bfloat16_t, ++ svst1_bf16 (p0, x0 + svcnth (), z0), ++ svst1 (p0, x0 + svcnth (), z0)) ++ ++/* ++** st1_bf16_7: ++** st1h z0\.h, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_bf16_7, svbfloat16_t, bfloat16_t, ++ svst1_bf16 (p0, x0 + svcnth () * 7, z0), ++ svst1 (p0, x0 + svcnth () * 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1_bf16_8: ++** incb x0, all, mul #8 ++** st1h z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_bf16_8, svbfloat16_t, bfloat16_t, ++ svst1_bf16 (p0, x0 + svcnth () * 8, z0), ++ svst1 (p0, x0 + svcnth () * 8, z0)) ++ ++/* ++** st1_bf16_m1: ++** st1h z0\.h, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_bf16_m1, svbfloat16_t, bfloat16_t, ++ svst1_bf16 (p0, x0 - svcnth (), z0), ++ svst1 (p0, x0 - svcnth (), z0)) ++ ++/* ++** st1_bf16_m8: ++** st1h z0\.h, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_bf16_m8, svbfloat16_t, bfloat16_t, ++ svst1_bf16 (p0, x0 - svcnth () * 8, z0), ++ svst1 (p0, x0 - svcnth () * 8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1_bf16_m9: ++** decb x0, all, mul #9 ++** st1h z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_bf16_m9, svbfloat16_t, bfloat16_t, ++ svst1_bf16 (p0, x0 - svcnth () * 9, z0), ++ svst1 (p0, x0 - svcnth () * 9, z0)) ++ ++/* ++** st1_vnum_bf16_0: ++** st1h z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_vnum_bf16_0, svbfloat16_t, bfloat16_t, ++ svst1_vnum_bf16 (p0, x0, 0, z0), ++ svst1_vnum (p0, x0, 0, z0)) ++ ++/* ++** st1_vnum_bf16_1: ++** st1h z0\.h, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_vnum_bf16_1, svbfloat16_t, bfloat16_t, ++ svst1_vnum_bf16 (p0, x0, 1, z0), ++ svst1_vnum (p0, x0, 1, z0)) ++ ++/* ++** st1_vnum_bf16_7: ++** st1h z0\.h, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_vnum_bf16_7, svbfloat16_t, bfloat16_t, ++ svst1_vnum_bf16 (p0, x0, 7, z0), ++ svst1_vnum (p0, x0, 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1_vnum_bf16_8: ++** incb x0, all, mul #8 ++** st1h z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_vnum_bf16_8, svbfloat16_t, bfloat16_t, ++ svst1_vnum_bf16 (p0, x0, 8, z0), ++ svst1_vnum (p0, x0, 8, z0)) ++ ++/* ++** st1_vnum_bf16_m1: ++** st1h z0\.h, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_vnum_bf16_m1, svbfloat16_t, bfloat16_t, ++ svst1_vnum_bf16 (p0, x0, -1, z0), ++ svst1_vnum (p0, x0, -1, z0)) ++ ++/* ++** st1_vnum_bf16_m8: ++** st1h z0\.h, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_vnum_bf16_m8, svbfloat16_t, bfloat16_t, ++ svst1_vnum_bf16 (p0, x0, -8, z0), ++ svst1_vnum (p0, x0, -8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1_vnum_bf16_m9: ++** decb x0, all, mul #9 ++** st1h z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_vnum_bf16_m9, svbfloat16_t, bfloat16_t, ++ svst1_vnum_bf16 (p0, x0, -9, z0), ++ svst1_vnum (p0, x0, -9, z0)) ++ ++/* Using MUL to calculate an index would also be OK. 
*/ ++/* ++** st1_vnum_bf16_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** st1h z0\.h, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (st1_vnum_bf16_x1, svbfloat16_t, bfloat16_t, ++ svst1_vnum_bf16 (p0, x0, x1, z0), ++ svst1_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_f16.c +new file mode 100644 +index 000000000..2406cfd97 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_f16.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st1_f16_base: ++** st1h z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_f16_base, svfloat16_t, float16_t, ++ svst1_f16 (p0, x0, z0), ++ svst1 (p0, x0, z0)) ++ ++/* ++** st1_f16_index: ++** st1h z0\.h, p0, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_STORE (st1_f16_index, svfloat16_t, float16_t, ++ svst1_f16 (p0, x0 + x1, z0), ++ svst1 (p0, x0 + x1, z0)) ++ ++/* ++** st1_f16_1: ++** st1h z0\.h, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_f16_1, svfloat16_t, float16_t, ++ svst1_f16 (p0, x0 + svcnth (), z0), ++ svst1 (p0, x0 + svcnth (), z0)) ++ ++/* ++** st1_f16_7: ++** st1h z0\.h, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_f16_7, svfloat16_t, float16_t, ++ svst1_f16 (p0, x0 + svcnth () * 7, z0), ++ svst1 (p0, x0 + svcnth () * 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1_f16_8: ++** incb x0, all, mul #8 ++** st1h z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_f16_8, svfloat16_t, float16_t, ++ svst1_f16 (p0, x0 + svcnth () * 8, z0), ++ svst1 (p0, x0 + svcnth () * 8, z0)) ++ ++/* ++** st1_f16_m1: ++** st1h z0\.h, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_f16_m1, svfloat16_t, float16_t, ++ svst1_f16 (p0, x0 - svcnth (), z0), ++ svst1 (p0, x0 - svcnth (), z0)) ++ ++/* ++** st1_f16_m8: ++** st1h z0\.h, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_f16_m8, svfloat16_t, float16_t, ++ svst1_f16 (p0, x0 - svcnth () * 8, z0), ++ svst1 (p0, x0 - svcnth () * 8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1_f16_m9: ++** decb x0, all, mul #9 ++** st1h z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_f16_m9, svfloat16_t, float16_t, ++ svst1_f16 (p0, x0 - svcnth () * 9, z0), ++ svst1 (p0, x0 - svcnth () * 9, z0)) ++ ++/* ++** st1_vnum_f16_0: ++** st1h z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_vnum_f16_0, svfloat16_t, float16_t, ++ svst1_vnum_f16 (p0, x0, 0, z0), ++ svst1_vnum (p0, x0, 0, z0)) ++ ++/* ++** st1_vnum_f16_1: ++** st1h z0\.h, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_vnum_f16_1, svfloat16_t, float16_t, ++ svst1_vnum_f16 (p0, x0, 1, z0), ++ svst1_vnum (p0, x0, 1, z0)) ++ ++/* ++** st1_vnum_f16_7: ++** st1h z0\.h, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_vnum_f16_7, svfloat16_t, float16_t, ++ svst1_vnum_f16 (p0, x0, 7, z0), ++ svst1_vnum (p0, x0, 7, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st1_vnum_f16_8: ++** incb x0, all, mul #8 ++** st1h z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_vnum_f16_8, svfloat16_t, float16_t, ++ svst1_vnum_f16 (p0, x0, 8, z0), ++ svst1_vnum (p0, x0, 8, z0)) ++ ++/* ++** st1_vnum_f16_m1: ++** st1h z0\.h, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_vnum_f16_m1, svfloat16_t, float16_t, ++ svst1_vnum_f16 (p0, x0, -1, z0), ++ svst1_vnum (p0, x0, -1, z0)) ++ ++/* ++** st1_vnum_f16_m8: ++** st1h z0\.h, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_vnum_f16_m8, svfloat16_t, float16_t, ++ svst1_vnum_f16 (p0, x0, -8, z0), ++ svst1_vnum (p0, x0, -8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1_vnum_f16_m9: ++** decb x0, all, mul #9 ++** st1h z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_vnum_f16_m9, svfloat16_t, float16_t, ++ svst1_vnum_f16 (p0, x0, -9, z0), ++ svst1_vnum (p0, x0, -9, z0)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** st1_vnum_f16_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** st1h z0\.h, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (st1_vnum_f16_x1, svfloat16_t, float16_t, ++ svst1_vnum_f16 (p0, x0, x1, z0), ++ svst1_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_f32.c +new file mode 100644 +index 000000000..5fad7f06f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_f32.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st1_f32_base: ++** st1w z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_f32_base, svfloat32_t, float32_t, ++ svst1_f32 (p0, x0, z0), ++ svst1 (p0, x0, z0)) ++ ++/* ++** st1_f32_index: ++** st1w z0\.s, p0, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_STORE (st1_f32_index, svfloat32_t, float32_t, ++ svst1_f32 (p0, x0 + x1, z0), ++ svst1 (p0, x0 + x1, z0)) ++ ++/* ++** st1_f32_1: ++** st1w z0\.s, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_f32_1, svfloat32_t, float32_t, ++ svst1_f32 (p0, x0 + svcntw (), z0), ++ svst1 (p0, x0 + svcntw (), z0)) ++ ++/* ++** st1_f32_7: ++** st1w z0\.s, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_f32_7, svfloat32_t, float32_t, ++ svst1_f32 (p0, x0 + svcntw () * 7, z0), ++ svst1 (p0, x0 + svcntw () * 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1_f32_8: ++** incb x0, all, mul #8 ++** st1w z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_f32_8, svfloat32_t, float32_t, ++ svst1_f32 (p0, x0 + svcntw () * 8, z0), ++ svst1 (p0, x0 + svcntw () * 8, z0)) ++ ++/* ++** st1_f32_m1: ++** st1w z0\.s, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_f32_m1, svfloat32_t, float32_t, ++ svst1_f32 (p0, x0 - svcntw (), z0), ++ svst1 (p0, x0 - svcntw (), z0)) ++ ++/* ++** st1_f32_m8: ++** st1w z0\.s, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_f32_m8, svfloat32_t, float32_t, ++ svst1_f32 (p0, x0 - svcntw () * 8, z0), ++ svst1 (p0, x0 - svcntw () * 8, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st1_f32_m9: ++** decb x0, all, mul #9 ++** st1w z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_f32_m9, svfloat32_t, float32_t, ++ svst1_f32 (p0, x0 - svcntw () * 9, z0), ++ svst1 (p0, x0 - svcntw () * 9, z0)) ++ ++/* ++** st1_vnum_f32_0: ++** st1w z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_vnum_f32_0, svfloat32_t, float32_t, ++ svst1_vnum_f32 (p0, x0, 0, z0), ++ svst1_vnum (p0, x0, 0, z0)) ++ ++/* ++** st1_vnum_f32_1: ++** st1w z0\.s, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_vnum_f32_1, svfloat32_t, float32_t, ++ svst1_vnum_f32 (p0, x0, 1, z0), ++ svst1_vnum (p0, x0, 1, z0)) ++ ++/* ++** st1_vnum_f32_7: ++** st1w z0\.s, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_vnum_f32_7, svfloat32_t, float32_t, ++ svst1_vnum_f32 (p0, x0, 7, z0), ++ svst1_vnum (p0, x0, 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1_vnum_f32_8: ++** incb x0, all, mul #8 ++** st1w z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_vnum_f32_8, svfloat32_t, float32_t, ++ svst1_vnum_f32 (p0, x0, 8, z0), ++ svst1_vnum (p0, x0, 8, z0)) ++ ++/* ++** st1_vnum_f32_m1: ++** st1w z0\.s, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_vnum_f32_m1, svfloat32_t, float32_t, ++ svst1_vnum_f32 (p0, x0, -1, z0), ++ svst1_vnum (p0, x0, -1, z0)) ++ ++/* ++** st1_vnum_f32_m8: ++** st1w z0\.s, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_vnum_f32_m8, svfloat32_t, float32_t, ++ svst1_vnum_f32 (p0, x0, -8, z0), ++ svst1_vnum (p0, x0, -8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1_vnum_f32_m9: ++** decb x0, all, mul #9 ++** st1w z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_vnum_f32_m9, svfloat32_t, float32_t, ++ svst1_vnum_f32 (p0, x0, -9, z0), ++ svst1_vnum (p0, x0, -9, z0)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** st1_vnum_f32_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** st1w z0\.s, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (st1_vnum_f32_x1, svfloat32_t, float32_t, ++ svst1_vnum_f32 (p0, x0, x1, z0), ++ svst1_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_f64.c +new file mode 100644 +index 000000000..486f92beb +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_f64.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st1_f64_base: ++** st1d z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_f64_base, svfloat64_t, float64_t, ++ svst1_f64 (p0, x0, z0), ++ svst1 (p0, x0, z0)) ++ ++/* ++** st1_f64_index: ++** st1d z0\.d, p0, \[x0, x1, lsl 3\] ++** ret ++*/ ++TEST_STORE (st1_f64_index, svfloat64_t, float64_t, ++ svst1_f64 (p0, x0 + x1, z0), ++ svst1 (p0, x0 + x1, z0)) ++ ++/* ++** st1_f64_1: ++** st1d z0\.d, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_f64_1, svfloat64_t, float64_t, ++ svst1_f64 (p0, x0 + svcntd (), z0), ++ svst1 (p0, x0 + svcntd (), z0)) ++ ++/* ++** st1_f64_7: ++** st1d z0\.d, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_f64_7, svfloat64_t, float64_t, ++ svst1_f64 (p0, x0 + svcntd () * 7, z0), ++ svst1 (p0, x0 + svcntd () * 7, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st1_f64_8: ++** incb x0, all, mul #8 ++** st1d z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_f64_8, svfloat64_t, float64_t, ++ svst1_f64 (p0, x0 + svcntd () * 8, z0), ++ svst1 (p0, x0 + svcntd () * 8, z0)) ++ ++/* ++** st1_f64_m1: ++** st1d z0\.d, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_f64_m1, svfloat64_t, float64_t, ++ svst1_f64 (p0, x0 - svcntd (), z0), ++ svst1 (p0, x0 - svcntd (), z0)) ++ ++/* ++** st1_f64_m8: ++** st1d z0\.d, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_f64_m8, svfloat64_t, float64_t, ++ svst1_f64 (p0, x0 - svcntd () * 8, z0), ++ svst1 (p0, x0 - svcntd () * 8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1_f64_m9: ++** decb x0, all, mul #9 ++** st1d z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_f64_m9, svfloat64_t, float64_t, ++ svst1_f64 (p0, x0 - svcntd () * 9, z0), ++ svst1 (p0, x0 - svcntd () * 9, z0)) ++ ++/* ++** st1_vnum_f64_0: ++** st1d z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_vnum_f64_0, svfloat64_t, float64_t, ++ svst1_vnum_f64 (p0, x0, 0, z0), ++ svst1_vnum (p0, x0, 0, z0)) ++ ++/* ++** st1_vnum_f64_1: ++** st1d z0\.d, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_vnum_f64_1, svfloat64_t, float64_t, ++ svst1_vnum_f64 (p0, x0, 1, z0), ++ svst1_vnum (p0, x0, 1, z0)) ++ ++/* ++** st1_vnum_f64_7: ++** st1d z0\.d, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_vnum_f64_7, svfloat64_t, float64_t, ++ svst1_vnum_f64 (p0, x0, 7, z0), ++ svst1_vnum (p0, x0, 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1_vnum_f64_8: ++** incb x0, all, mul #8 ++** st1d z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_vnum_f64_8, svfloat64_t, float64_t, ++ svst1_vnum_f64 (p0, x0, 8, z0), ++ svst1_vnum (p0, x0, 8, z0)) ++ ++/* ++** st1_vnum_f64_m1: ++** st1d z0\.d, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_vnum_f64_m1, svfloat64_t, float64_t, ++ svst1_vnum_f64 (p0, x0, -1, z0), ++ svst1_vnum (p0, x0, -1, z0)) ++ ++/* ++** st1_vnum_f64_m8: ++** st1d z0\.d, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_vnum_f64_m8, svfloat64_t, float64_t, ++ svst1_vnum_f64 (p0, x0, -8, z0), ++ svst1_vnum (p0, x0, -8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1_vnum_f64_m9: ++** decb x0, all, mul #9 ++** st1d z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_vnum_f64_m9, svfloat64_t, float64_t, ++ svst1_vnum_f64 (p0, x0, -9, z0), ++ svst1_vnum (p0, x0, -9, z0)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** st1_vnum_f64_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** st1d z0\.d, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (st1_vnum_f64_x1, svfloat64_t, float64_t, ++ svst1_vnum_f64 (p0, x0, x1, z0), ++ svst1_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_s16.c +new file mode 100644 +index 000000000..7d4ac25d2 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_s16.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st1_s16_base: ++** st1h z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_s16_base, svint16_t, int16_t, ++ svst1_s16 (p0, x0, z0), ++ svst1 (p0, x0, z0)) ++ ++/* ++** st1_s16_index: ++** st1h z0\.h, p0, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_STORE (st1_s16_index, svint16_t, int16_t, ++ svst1_s16 (p0, x0 + x1, z0), ++ svst1 (p0, x0 + x1, z0)) ++ ++/* ++** st1_s16_1: ++** st1h z0\.h, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_s16_1, svint16_t, int16_t, ++ svst1_s16 (p0, x0 + svcnth (), z0), ++ svst1 (p0, x0 + svcnth (), z0)) ++ ++/* ++** st1_s16_7: ++** st1h z0\.h, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_s16_7, svint16_t, int16_t, ++ svst1_s16 (p0, x0 + svcnth () * 7, z0), ++ svst1 (p0, x0 + svcnth () * 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1_s16_8: ++** incb x0, all, mul #8 ++** st1h z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_s16_8, svint16_t, int16_t, ++ svst1_s16 (p0, x0 + svcnth () * 8, z0), ++ svst1 (p0, x0 + svcnth () * 8, z0)) ++ ++/* ++** st1_s16_m1: ++** st1h z0\.h, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_s16_m1, svint16_t, int16_t, ++ svst1_s16 (p0, x0 - svcnth (), z0), ++ svst1 (p0, x0 - svcnth (), z0)) ++ ++/* ++** st1_s16_m8: ++** st1h z0\.h, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_s16_m8, svint16_t, int16_t, ++ svst1_s16 (p0, x0 - svcnth () * 8, z0), ++ svst1 (p0, x0 - svcnth () * 8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1_s16_m9: ++** decb x0, all, mul #9 ++** st1h z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_s16_m9, svint16_t, int16_t, ++ svst1_s16 (p0, x0 - svcnth () * 9, z0), ++ svst1 (p0, x0 - svcnth () * 9, z0)) ++ ++/* ++** st1_vnum_s16_0: ++** st1h z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_vnum_s16_0, svint16_t, int16_t, ++ svst1_vnum_s16 (p0, x0, 0, z0), ++ svst1_vnum (p0, x0, 0, z0)) ++ ++/* ++** st1_vnum_s16_1: ++** st1h z0\.h, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_vnum_s16_1, svint16_t, int16_t, ++ svst1_vnum_s16 (p0, x0, 1, z0), ++ svst1_vnum (p0, x0, 1, z0)) ++ ++/* ++** st1_vnum_s16_7: ++** st1h z0\.h, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_vnum_s16_7, svint16_t, int16_t, ++ svst1_vnum_s16 (p0, x0, 7, z0), ++ svst1_vnum (p0, x0, 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1_vnum_s16_8: ++** incb x0, all, mul #8 ++** st1h z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_vnum_s16_8, svint16_t, int16_t, ++ svst1_vnum_s16 (p0, x0, 8, z0), ++ svst1_vnum (p0, x0, 8, z0)) ++ ++/* ++** st1_vnum_s16_m1: ++** st1h z0\.h, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_vnum_s16_m1, svint16_t, int16_t, ++ svst1_vnum_s16 (p0, x0, -1, z0), ++ svst1_vnum (p0, x0, -1, z0)) ++ ++/* ++** st1_vnum_s16_m8: ++** st1h z0\.h, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_vnum_s16_m8, svint16_t, int16_t, ++ svst1_vnum_s16 (p0, x0, -8, z0), ++ svst1_vnum (p0, x0, -8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1_vnum_s16_m9: ++** decb x0, all, mul #9 ++** st1h z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_vnum_s16_m9, svint16_t, int16_t, ++ svst1_vnum_s16 (p0, x0, -9, z0), ++ svst1_vnum (p0, x0, -9, z0)) ++ ++/* Using MUL to calculate an index would also be OK. 
*/ ++/* ++** st1_vnum_s16_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** st1h z0\.h, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (st1_vnum_s16_x1, svint16_t, int16_t, ++ svst1_vnum_s16 (p0, x0, x1, z0), ++ svst1_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_s32.c +new file mode 100644 +index 000000000..e2bcc3403 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_s32.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st1_s32_base: ++** st1w z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_s32_base, svint32_t, int32_t, ++ svst1_s32 (p0, x0, z0), ++ svst1 (p0, x0, z0)) ++ ++/* ++** st1_s32_index: ++** st1w z0\.s, p0, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_STORE (st1_s32_index, svint32_t, int32_t, ++ svst1_s32 (p0, x0 + x1, z0), ++ svst1 (p0, x0 + x1, z0)) ++ ++/* ++** st1_s32_1: ++** st1w z0\.s, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_s32_1, svint32_t, int32_t, ++ svst1_s32 (p0, x0 + svcntw (), z0), ++ svst1 (p0, x0 + svcntw (), z0)) ++ ++/* ++** st1_s32_7: ++** st1w z0\.s, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_s32_7, svint32_t, int32_t, ++ svst1_s32 (p0, x0 + svcntw () * 7, z0), ++ svst1 (p0, x0 + svcntw () * 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1_s32_8: ++** incb x0, all, mul #8 ++** st1w z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_s32_8, svint32_t, int32_t, ++ svst1_s32 (p0, x0 + svcntw () * 8, z0), ++ svst1 (p0, x0 + svcntw () * 8, z0)) ++ ++/* ++** st1_s32_m1: ++** st1w z0\.s, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_s32_m1, svint32_t, int32_t, ++ svst1_s32 (p0, x0 - svcntw (), z0), ++ svst1 (p0, x0 - svcntw (), z0)) ++ ++/* ++** st1_s32_m8: ++** st1w z0\.s, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_s32_m8, svint32_t, int32_t, ++ svst1_s32 (p0, x0 - svcntw () * 8, z0), ++ svst1 (p0, x0 - svcntw () * 8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1_s32_m9: ++** decb x0, all, mul #9 ++** st1w z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_s32_m9, svint32_t, int32_t, ++ svst1_s32 (p0, x0 - svcntw () * 9, z0), ++ svst1 (p0, x0 - svcntw () * 9, z0)) ++ ++/* ++** st1_vnum_s32_0: ++** st1w z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_vnum_s32_0, svint32_t, int32_t, ++ svst1_vnum_s32 (p0, x0, 0, z0), ++ svst1_vnum (p0, x0, 0, z0)) ++ ++/* ++** st1_vnum_s32_1: ++** st1w z0\.s, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_vnum_s32_1, svint32_t, int32_t, ++ svst1_vnum_s32 (p0, x0, 1, z0), ++ svst1_vnum (p0, x0, 1, z0)) ++ ++/* ++** st1_vnum_s32_7: ++** st1w z0\.s, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_vnum_s32_7, svint32_t, int32_t, ++ svst1_vnum_s32 (p0, x0, 7, z0), ++ svst1_vnum (p0, x0, 7, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st1_vnum_s32_8: ++** incb x0, all, mul #8 ++** st1w z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_vnum_s32_8, svint32_t, int32_t, ++ svst1_vnum_s32 (p0, x0, 8, z0), ++ svst1_vnum (p0, x0, 8, z0)) ++ ++/* ++** st1_vnum_s32_m1: ++** st1w z0\.s, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_vnum_s32_m1, svint32_t, int32_t, ++ svst1_vnum_s32 (p0, x0, -1, z0), ++ svst1_vnum (p0, x0, -1, z0)) ++ ++/* ++** st1_vnum_s32_m8: ++** st1w z0\.s, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_vnum_s32_m8, svint32_t, int32_t, ++ svst1_vnum_s32 (p0, x0, -8, z0), ++ svst1_vnum (p0, x0, -8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1_vnum_s32_m9: ++** decb x0, all, mul #9 ++** st1w z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_vnum_s32_m9, svint32_t, int32_t, ++ svst1_vnum_s32 (p0, x0, -9, z0), ++ svst1_vnum (p0, x0, -9, z0)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** st1_vnum_s32_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** st1w z0\.s, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (st1_vnum_s32_x1, svint32_t, int32_t, ++ svst1_vnum_s32 (p0, x0, x1, z0), ++ svst1_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_s64.c +new file mode 100644 +index 000000000..8e0b69f73 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_s64.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st1_s64_base: ++** st1d z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_s64_base, svint64_t, int64_t, ++ svst1_s64 (p0, x0, z0), ++ svst1 (p0, x0, z0)) ++ ++/* ++** st1_s64_index: ++** st1d z0\.d, p0, \[x0, x1, lsl 3\] ++** ret ++*/ ++TEST_STORE (st1_s64_index, svint64_t, int64_t, ++ svst1_s64 (p0, x0 + x1, z0), ++ svst1 (p0, x0 + x1, z0)) ++ ++/* ++** st1_s64_1: ++** st1d z0\.d, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_s64_1, svint64_t, int64_t, ++ svst1_s64 (p0, x0 + svcntd (), z0), ++ svst1 (p0, x0 + svcntd (), z0)) ++ ++/* ++** st1_s64_7: ++** st1d z0\.d, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_s64_7, svint64_t, int64_t, ++ svst1_s64 (p0, x0 + svcntd () * 7, z0), ++ svst1 (p0, x0 + svcntd () * 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1_s64_8: ++** incb x0, all, mul #8 ++** st1d z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_s64_8, svint64_t, int64_t, ++ svst1_s64 (p0, x0 + svcntd () * 8, z0), ++ svst1 (p0, x0 + svcntd () * 8, z0)) ++ ++/* ++** st1_s64_m1: ++** st1d z0\.d, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_s64_m1, svint64_t, int64_t, ++ svst1_s64 (p0, x0 - svcntd (), z0), ++ svst1 (p0, x0 - svcntd (), z0)) ++ ++/* ++** st1_s64_m8: ++** st1d z0\.d, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_s64_m8, svint64_t, int64_t, ++ svst1_s64 (p0, x0 - svcntd () * 8, z0), ++ svst1 (p0, x0 - svcntd () * 8, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st1_s64_m9: ++** decb x0, all, mul #9 ++** st1d z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_s64_m9, svint64_t, int64_t, ++ svst1_s64 (p0, x0 - svcntd () * 9, z0), ++ svst1 (p0, x0 - svcntd () * 9, z0)) ++ ++/* ++** st1_vnum_s64_0: ++** st1d z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_vnum_s64_0, svint64_t, int64_t, ++ svst1_vnum_s64 (p0, x0, 0, z0), ++ svst1_vnum (p0, x0, 0, z0)) ++ ++/* ++** st1_vnum_s64_1: ++** st1d z0\.d, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_vnum_s64_1, svint64_t, int64_t, ++ svst1_vnum_s64 (p0, x0, 1, z0), ++ svst1_vnum (p0, x0, 1, z0)) ++ ++/* ++** st1_vnum_s64_7: ++** st1d z0\.d, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_vnum_s64_7, svint64_t, int64_t, ++ svst1_vnum_s64 (p0, x0, 7, z0), ++ svst1_vnum (p0, x0, 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1_vnum_s64_8: ++** incb x0, all, mul #8 ++** st1d z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_vnum_s64_8, svint64_t, int64_t, ++ svst1_vnum_s64 (p0, x0, 8, z0), ++ svst1_vnum (p0, x0, 8, z0)) ++ ++/* ++** st1_vnum_s64_m1: ++** st1d z0\.d, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_vnum_s64_m1, svint64_t, int64_t, ++ svst1_vnum_s64 (p0, x0, -1, z0), ++ svst1_vnum (p0, x0, -1, z0)) ++ ++/* ++** st1_vnum_s64_m8: ++** st1d z0\.d, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_vnum_s64_m8, svint64_t, int64_t, ++ svst1_vnum_s64 (p0, x0, -8, z0), ++ svst1_vnum (p0, x0, -8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1_vnum_s64_m9: ++** decb x0, all, mul #9 ++** st1d z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_vnum_s64_m9, svint64_t, int64_t, ++ svst1_vnum_s64 (p0, x0, -9, z0), ++ svst1_vnum (p0, x0, -9, z0)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** st1_vnum_s64_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** st1d z0\.d, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (st1_vnum_s64_x1, svint64_t, int64_t, ++ svst1_vnum_s64 (p0, x0, x1, z0), ++ svst1_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_s8.c +new file mode 100644 +index 000000000..4155683ab +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_s8.c +@@ -0,0 +1,162 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st1_s8_base: ++** st1b z0\.b, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_s8_base, svint8_t, int8_t, ++ svst1_s8 (p0, x0, z0), ++ svst1 (p0, x0, z0)) ++ ++/* ++** st1_s8_index: ++** st1b z0\.b, p0, \[x0, x1\] ++** ret ++*/ ++TEST_STORE (st1_s8_index, svint8_t, int8_t, ++ svst1_s8 (p0, x0 + x1, z0), ++ svst1 (p0, x0 + x1, z0)) ++ ++/* ++** st1_s8_1: ++** st1b z0\.b, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_s8_1, svint8_t, int8_t, ++ svst1_s8 (p0, x0 + svcntb (), z0), ++ svst1 (p0, x0 + svcntb (), z0)) ++ ++/* ++** st1_s8_7: ++** st1b z0\.b, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_s8_7, svint8_t, int8_t, ++ svst1_s8 (p0, x0 + svcntb () * 7, z0), ++ svst1 (p0, x0 + svcntb () * 7, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st1_s8_8: ++** incb x0, all, mul #8 ++** st1b z0\.b, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_s8_8, svint8_t, int8_t, ++ svst1_s8 (p0, x0 + svcntb () * 8, z0), ++ svst1 (p0, x0 + svcntb () * 8, z0)) ++ ++/* ++** st1_s8_m1: ++** st1b z0\.b, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_s8_m1, svint8_t, int8_t, ++ svst1_s8 (p0, x0 - svcntb (), z0), ++ svst1 (p0, x0 - svcntb (), z0)) ++ ++/* ++** st1_s8_m8: ++** st1b z0\.b, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_s8_m8, svint8_t, int8_t, ++ svst1_s8 (p0, x0 - svcntb () * 8, z0), ++ svst1 (p0, x0 - svcntb () * 8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1_s8_m9: ++** decb x0, all, mul #9 ++** st1b z0\.b, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_s8_m9, svint8_t, int8_t, ++ svst1_s8 (p0, x0 - svcntb () * 9, z0), ++ svst1 (p0, x0 - svcntb () * 9, z0)) ++ ++/* ++** st1_vnum_s8_0: ++** st1b z0\.b, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_vnum_s8_0, svint8_t, int8_t, ++ svst1_vnum_s8 (p0, x0, 0, z0), ++ svst1_vnum (p0, x0, 0, z0)) ++ ++/* ++** st1_vnum_s8_1: ++** st1b z0\.b, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_vnum_s8_1, svint8_t, int8_t, ++ svst1_vnum_s8 (p0, x0, 1, z0), ++ svst1_vnum (p0, x0, 1, z0)) ++ ++/* ++** st1_vnum_s8_7: ++** st1b z0\.b, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_vnum_s8_7, svint8_t, int8_t, ++ svst1_vnum_s8 (p0, x0, 7, z0), ++ svst1_vnum (p0, x0, 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1_vnum_s8_8: ++** incb x0, all, mul #8 ++** st1b z0\.b, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_vnum_s8_8, svint8_t, int8_t, ++ svst1_vnum_s8 (p0, x0, 8, z0), ++ svst1_vnum (p0, x0, 8, z0)) ++ ++/* ++** st1_vnum_s8_m1: ++** st1b z0\.b, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_vnum_s8_m1, svint8_t, int8_t, ++ svst1_vnum_s8 (p0, x0, -1, z0), ++ svst1_vnum (p0, x0, -1, z0)) ++ ++/* ++** st1_vnum_s8_m8: ++** st1b z0\.b, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_vnum_s8_m8, svint8_t, int8_t, ++ svst1_vnum_s8 (p0, x0, -8, z0), ++ svst1_vnum (p0, x0, -8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1_vnum_s8_m9: ++** decb x0, all, mul #9 ++** st1b z0\.b, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_vnum_s8_m9, svint8_t, int8_t, ++ svst1_vnum_s8 (p0, x0, -9, z0), ++ svst1_vnum (p0, x0, -9, z0)) ++ ++/* ++** st1_vnum_s8_x1: ++** cntb (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** st1b z0\.b, p0, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** st1b z0\.b, p0, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_STORE (st1_vnum_s8_x1, svint8_t, int8_t, ++ svst1_vnum_s8 (p0, x0, x1, z0), ++ svst1_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_scatter_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_scatter_f32.c +new file mode 100644 +index 000000000..cb6774ad0 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_scatter_f32.c +@@ -0,0 +1,227 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st1_scatter_f32: ++** st1w z0\.s, p0, \[z1\.s\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_f32, svfloat32_t, svuint32_t, ++ svst1_scatter_u32base_f32 (p0, z1, z0), ++ svst1_scatter (p0, z1, z0)) ++ ++/* ++** st1_scatter_x0_f32_offset: ++** st1w z0\.s, p0, \[x0, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_x0_f32_offset, svfloat32_t, svuint32_t, ++ svst1_scatter_u32base_offset_f32 (p0, z1, x0, z0), ++ svst1_scatter_offset (p0, z1, x0, z0)) ++ ++/* ++** st1_scatter_m4_f32_offset: ++** mov (x[0-9]+), #?-4 ++** st1w z0\.s, p0, \[\1, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_m4_f32_offset, svfloat32_t, svuint32_t, ++ svst1_scatter_u32base_offset_f32 (p0, z1, -4, z0), ++ svst1_scatter_offset (p0, z1, -4, z0)) ++ ++/* ++** st1_scatter_0_f32_offset: ++** st1w z0\.s, p0, \[z1\.s\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_0_f32_offset, svfloat32_t, svuint32_t, ++ svst1_scatter_u32base_offset_f32 (p0, z1, 0, z0), ++ svst1_scatter_offset (p0, z1, 0, z0)) ++ ++/* ++** st1_scatter_5_f32_offset: ++** mov (x[0-9]+), #?5 ++** st1w z0\.s, p0, \[\1, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_5_f32_offset, svfloat32_t, svuint32_t, ++ svst1_scatter_u32base_offset_f32 (p0, z1, 5, z0), ++ svst1_scatter_offset (p0, z1, 5, z0)) ++ ++/* ++** st1_scatter_6_f32_offset: ++** mov (x[0-9]+), #?6 ++** st1w z0\.s, p0, \[\1, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_6_f32_offset, svfloat32_t, svuint32_t, ++ svst1_scatter_u32base_offset_f32 (p0, z1, 6, z0), ++ svst1_scatter_offset (p0, z1, 6, z0)) ++ ++/* ++** st1_scatter_7_f32_offset: ++** mov (x[0-9]+), #?7 ++** st1w z0\.s, p0, \[\1, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_7_f32_offset, svfloat32_t, svuint32_t, ++ svst1_scatter_u32base_offset_f32 (p0, z1, 7, z0), ++ svst1_scatter_offset (p0, z1, 7, z0)) ++ ++/* ++** st1_scatter_8_f32_offset: ++** st1w z0\.s, p0, \[z1\.s, #8\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_8_f32_offset, svfloat32_t, svuint32_t, ++ svst1_scatter_u32base_offset_f32 (p0, z1, 8, z0), ++ svst1_scatter_offset (p0, z1, 8, z0)) ++ ++/* ++** st1_scatter_124_f32_offset: ++** st1w z0\.s, p0, \[z1\.s, #124\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_124_f32_offset, svfloat32_t, svuint32_t, ++ svst1_scatter_u32base_offset_f32 (p0, z1, 124, z0), ++ svst1_scatter_offset (p0, z1, 124, z0)) ++ ++/* ++** st1_scatter_128_f32_offset: ++** mov (x[0-9]+), #?128 ++** st1w z0\.s, p0, \[\1, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_128_f32_offset, svfloat32_t, svuint32_t, ++ svst1_scatter_u32base_offset_f32 (p0, z1, 128, z0), ++ svst1_scatter_offset (p0, z1, 128, z0)) ++ ++/* ++** st1_scatter_x0_f32_index: ++** lsl (x[0-9]+), x0, #?2 ++** st1w z0\.s, p0, \[\1, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_x0_f32_index, svfloat32_t, svuint32_t, ++ svst1_scatter_u32base_index_f32 (p0, z1, x0, z0), ++ svst1_scatter_index (p0, z1, x0, z0)) ++ ++/* ++** st1_scatter_m1_f32_index: ++** mov (x[0-9]+), #?-4 ++** st1w z0\.s, p0, \[\1, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_m1_f32_index, svfloat32_t, svuint32_t, ++ svst1_scatter_u32base_index_f32 (p0, z1, -1, z0), ++ svst1_scatter_index (p0, z1, -1, z0)) ++ ++/* ++** st1_scatter_0_f32_index: ++** st1w z0\.s, p0, \[z1\.s\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_0_f32_index, svfloat32_t, svuint32_t, ++ svst1_scatter_u32base_index_f32 (p0, z1, 0, 
z0), ++ svst1_scatter_index (p0, z1, 0, z0)) ++ ++/* ++** st1_scatter_5_f32_index: ++** st1w z0\.s, p0, \[z1\.s, #20\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_5_f32_index, svfloat32_t, svuint32_t, ++ svst1_scatter_u32base_index_f32 (p0, z1, 5, z0), ++ svst1_scatter_index (p0, z1, 5, z0)) ++ ++/* ++** st1_scatter_31_f32_index: ++** st1w z0\.s, p0, \[z1\.s, #124\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_31_f32_index, svfloat32_t, svuint32_t, ++ svst1_scatter_u32base_index_f32 (p0, z1, 31, z0), ++ svst1_scatter_index (p0, z1, 31, z0)) ++ ++/* ++** st1_scatter_32_f32_index: ++** mov (x[0-9]+), #?128 ++** st1w z0\.s, p0, \[\1, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_32_f32_index, svfloat32_t, svuint32_t, ++ svst1_scatter_u32base_index_f32 (p0, z1, 32, z0), ++ svst1_scatter_index (p0, z1, 32, z0)) ++ ++/* ++** st1_scatter_x0_f32_s32offset: ++** st1w z0\.s, p0, \[x0, z1\.s, sxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_x0_f32_s32offset, svfloat32_t, float32_t, svint32_t, ++ svst1_scatter_s32offset_f32 (p0, x0, z1, z0), ++ svst1_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1_scatter_f32_s32offset: ++** st1w z0\.s, p0, \[x0, z1\.s, sxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_f32_s32offset, svfloat32_t, float32_t, svint32_t, ++ svst1_scatter_s32offset_f32 (p0, x0, z1, z0), ++ svst1_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1_scatter_x0_f32_u32offset: ++** st1w z0\.s, p0, \[x0, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_x0_f32_u32offset, svfloat32_t, float32_t, svuint32_t, ++ svst1_scatter_u32offset_f32 (p0, x0, z1, z0), ++ svst1_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1_scatter_f32_u32offset: ++** st1w z0\.s, p0, \[x0, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_f32_u32offset, svfloat32_t, float32_t, svuint32_t, ++ svst1_scatter_u32offset_f32 (p0, x0, z1, z0), ++ svst1_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1_scatter_x0_f32_s32index: ++** st1w z0\.s, p0, \[x0, z1\.s, sxtw 2\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_x0_f32_s32index, svfloat32_t, float32_t, svint32_t, ++ svst1_scatter_s32index_f32 (p0, x0, z1, z0), ++ svst1_scatter_index (p0, x0, z1, z0)) ++ ++/* ++** st1_scatter_f32_s32index: ++** st1w z0\.s, p0, \[x0, z1\.s, sxtw 2\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_f32_s32index, svfloat32_t, float32_t, svint32_t, ++ svst1_scatter_s32index_f32 (p0, x0, z1, z0), ++ svst1_scatter_index (p0, x0, z1, z0)) ++ ++/* ++** st1_scatter_x0_f32_u32index: ++** st1w z0\.s, p0, \[x0, z1\.s, uxtw 2\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_x0_f32_u32index, svfloat32_t, float32_t, svuint32_t, ++ svst1_scatter_u32index_f32 (p0, x0, z1, z0), ++ svst1_scatter_index (p0, x0, z1, z0)) ++ ++/* ++** st1_scatter_f32_u32index: ++** st1w z0\.s, p0, \[x0, z1\.s, uxtw 2\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_f32_u32index, svfloat32_t, float32_t, svuint32_t, ++ svst1_scatter_u32index_f32 (p0, x0, z1, z0), ++ svst1_scatter_index (p0, x0, z1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_scatter_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_scatter_f64.c +new file mode 100644 +index 000000000..fe978bbe5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_scatter_f64.c +@@ -0,0 +1,303 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st1_scatter_f64: ++** st1d z0\.d, p0, \[z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_f64, svfloat64_t, svuint64_t, ++ svst1_scatter_u64base_f64 (p0, z1, z0), ++ svst1_scatter (p0, z1, z0)) ++ ++/* ++** st1_scatter_x0_f64_offset: ++** st1d z0\.d, p0, \[x0, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_x0_f64_offset, svfloat64_t, svuint64_t, ++ svst1_scatter_u64base_offset_f64 (p0, z1, x0, z0), ++ svst1_scatter_offset (p0, z1, x0, z0)) ++ ++/* ++** st1_scatter_m8_f64_offset: ++** mov (x[0-9]+), #?-8 ++** st1d z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_m8_f64_offset, svfloat64_t, svuint64_t, ++ svst1_scatter_u64base_offset_f64 (p0, z1, -8, z0), ++ svst1_scatter_offset (p0, z1, -8, z0)) ++ ++/* ++** st1_scatter_0_f64_offset: ++** st1d z0\.d, p0, \[z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_0_f64_offset, svfloat64_t, svuint64_t, ++ svst1_scatter_u64base_offset_f64 (p0, z1, 0, z0), ++ svst1_scatter_offset (p0, z1, 0, z0)) ++ ++/* ++** st1_scatter_9_f64_offset: ++** mov (x[0-9]+), #?9 ++** st1d z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_9_f64_offset, svfloat64_t, svuint64_t, ++ svst1_scatter_u64base_offset_f64 (p0, z1, 9, z0), ++ svst1_scatter_offset (p0, z1, 9, z0)) ++ ++/* ++** st1_scatter_10_f64_offset: ++** mov (x[0-9]+), #?10 ++** st1d z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_10_f64_offset, svfloat64_t, svuint64_t, ++ svst1_scatter_u64base_offset_f64 (p0, z1, 10, z0), ++ svst1_scatter_offset (p0, z1, 10, z0)) ++ ++/* ++** st1_scatter_11_f64_offset: ++** mov (x[0-9]+), #?11 ++** st1d z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_11_f64_offset, svfloat64_t, svuint64_t, ++ svst1_scatter_u64base_offset_f64 (p0, z1, 11, z0), ++ svst1_scatter_offset (p0, z1, 11, z0)) ++ ++/* ++** st1_scatter_12_f64_offset: ++** mov (x[0-9]+), #?12 ++** st1d z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_12_f64_offset, svfloat64_t, svuint64_t, ++ svst1_scatter_u64base_offset_f64 (p0, z1, 12, z0), ++ svst1_scatter_offset (p0, z1, 12, z0)) ++ ++/* ++** st1_scatter_13_f64_offset: ++** mov (x[0-9]+), #?13 ++** st1d z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_13_f64_offset, svfloat64_t, svuint64_t, ++ svst1_scatter_u64base_offset_f64 (p0, z1, 13, z0), ++ svst1_scatter_offset (p0, z1, 13, z0)) ++ ++/* ++** st1_scatter_14_f64_offset: ++** mov (x[0-9]+), #?14 ++** st1d z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_14_f64_offset, svfloat64_t, svuint64_t, ++ svst1_scatter_u64base_offset_f64 (p0, z1, 14, z0), ++ svst1_scatter_offset (p0, z1, 14, z0)) ++ ++/* ++** st1_scatter_15_f64_offset: ++** mov (x[0-9]+), #?15 ++** st1d z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_15_f64_offset, svfloat64_t, svuint64_t, ++ svst1_scatter_u64base_offset_f64 (p0, z1, 15, z0), ++ svst1_scatter_offset (p0, z1, 15, z0)) ++ ++/* ++** st1_scatter_16_f64_offset: ++** st1d z0\.d, p0, \[z1\.d, #16\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_16_f64_offset, svfloat64_t, svuint64_t, ++ svst1_scatter_u64base_offset_f64 (p0, z1, 16, z0), ++ svst1_scatter_offset (p0, z1, 16, z0)) ++ ++/* ++** st1_scatter_248_f64_offset: ++** st1d z0\.d, p0, \[z1\.d, #248\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_248_f64_offset, svfloat64_t, svuint64_t, ++ svst1_scatter_u64base_offset_f64 (p0, z1, 248, 
z0), ++ svst1_scatter_offset (p0, z1, 248, z0)) ++ ++/* ++** st1_scatter_256_f64_offset: ++** mov (x[0-9]+), #?256 ++** st1d z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_256_f64_offset, svfloat64_t, svuint64_t, ++ svst1_scatter_u64base_offset_f64 (p0, z1, 256, z0), ++ svst1_scatter_offset (p0, z1, 256, z0)) ++ ++/* ++** st1_scatter_x0_f64_index: ++** lsl (x[0-9]+), x0, #?3 ++** st1d z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_x0_f64_index, svfloat64_t, svuint64_t, ++ svst1_scatter_u64base_index_f64 (p0, z1, x0, z0), ++ svst1_scatter_index (p0, z1, x0, z0)) ++ ++/* ++** st1_scatter_m1_f64_index: ++** mov (x[0-9]+), #?-8 ++** st1d z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_m1_f64_index, svfloat64_t, svuint64_t, ++ svst1_scatter_u64base_index_f64 (p0, z1, -1, z0), ++ svst1_scatter_index (p0, z1, -1, z0)) ++ ++/* ++** st1_scatter_0_f64_index: ++** st1d z0\.d, p0, \[z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_0_f64_index, svfloat64_t, svuint64_t, ++ svst1_scatter_u64base_index_f64 (p0, z1, 0, z0), ++ svst1_scatter_index (p0, z1, 0, z0)) ++ ++/* ++** st1_scatter_5_f64_index: ++** st1d z0\.d, p0, \[z1\.d, #40\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_5_f64_index, svfloat64_t, svuint64_t, ++ svst1_scatter_u64base_index_f64 (p0, z1, 5, z0), ++ svst1_scatter_index (p0, z1, 5, z0)) ++ ++/* ++** st1_scatter_31_f64_index: ++** st1d z0\.d, p0, \[z1\.d, #248\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_31_f64_index, svfloat64_t, svuint64_t, ++ svst1_scatter_u64base_index_f64 (p0, z1, 31, z0), ++ svst1_scatter_index (p0, z1, 31, z0)) ++ ++/* ++** st1_scatter_32_f64_index: ++** mov (x[0-9]+), #?256 ++** st1d z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_32_f64_index, svfloat64_t, svuint64_t, ++ svst1_scatter_u64base_index_f64 (p0, z1, 32, z0), ++ svst1_scatter_index (p0, z1, 32, z0)) ++ ++/* ++** st1_scatter_x0_f64_s64offset: ++** st1d z0\.d, p0, \[x0, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_x0_f64_s64offset, svfloat64_t, float64_t, svint64_t, ++ svst1_scatter_s64offset_f64 (p0, x0, z1, z0), ++ svst1_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1_scatter_f64_s64offset: ++** st1d z0\.d, p0, \[x0, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_f64_s64offset, svfloat64_t, float64_t, svint64_t, ++ svst1_scatter_s64offset_f64 (p0, x0, z1, z0), ++ svst1_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1_scatter_ext_f64_s64offset: ++** st1d z0\.d, p0, \[x0, z1\.d, sxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_ext_f64_s64offset, svfloat64_t, float64_t, svint64_t, ++ svst1_scatter_s64offset_f64 (p0, x0, svextw_s64_x (p0, z1), z0), ++ svst1_scatter_offset (p0, x0, svextw_x (p0, z1), z0)) ++ ++/* ++** st1_scatter_x0_f64_u64offset: ++** st1d z0\.d, p0, \[x0, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_x0_f64_u64offset, svfloat64_t, float64_t, svuint64_t, ++ svst1_scatter_u64offset_f64 (p0, x0, z1, z0), ++ svst1_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1_scatter_f64_u64offset: ++** st1d z0\.d, p0, \[x0, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_f64_u64offset, svfloat64_t, float64_t, svuint64_t, ++ svst1_scatter_u64offset_f64 (p0, x0, z1, z0), ++ svst1_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1_scatter_ext_f64_u64offset: ++** st1d z0\.d, p0, \[x0, z1\.d, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_ext_f64_u64offset, svfloat64_t, float64_t, svuint64_t, ++ 
svst1_scatter_u64offset_f64 (p0, x0, svextw_u64_x (p0, z1), z0), ++ svst1_scatter_offset (p0, x0, svextw_x (p0, z1), z0)) ++ ++/* ++** st1_scatter_x0_f64_s64index: ++** st1d z0\.d, p0, \[x0, z1\.d, lsl 3\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_x0_f64_s64index, svfloat64_t, float64_t, svint64_t, ++ svst1_scatter_s64index_f64 (p0, x0, z1, z0), ++ svst1_scatter_index (p0, x0, z1, z0)) ++ ++/* ++** st1_scatter_f64_s64index: ++** st1d z0\.d, p0, \[x0, z1\.d, lsl 3\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_f64_s64index, svfloat64_t, float64_t, svint64_t, ++ svst1_scatter_s64index_f64 (p0, x0, z1, z0), ++ svst1_scatter_index (p0, x0, z1, z0)) ++ ++/* ++** st1_scatter_ext_f64_s64index: ++** st1d z0\.d, p0, \[x0, z1\.d, sxtw 3\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_ext_f64_s64index, svfloat64_t, float64_t, svint64_t, ++ svst1_scatter_s64index_f64 (p0, x0, svextw_s64_x (p0, z1), z0), ++ svst1_scatter_index (p0, x0, svextw_x (p0, z1), z0)) ++ ++/* ++** st1_scatter_x0_f64_u64index: ++** st1d z0\.d, p0, \[x0, z1\.d, lsl 3\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_x0_f64_u64index, svfloat64_t, float64_t, svuint64_t, ++ svst1_scatter_u64index_f64 (p0, x0, z1, z0), ++ svst1_scatter_index (p0, x0, z1, z0)) ++ ++/* ++** st1_scatter_f64_u64index: ++** st1d z0\.d, p0, \[x0, z1\.d, lsl 3\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_f64_u64index, svfloat64_t, float64_t, svuint64_t, ++ svst1_scatter_u64index_f64 (p0, x0, z1, z0), ++ svst1_scatter_index (p0, x0, z1, z0)) ++ ++/* ++** st1_scatter_ext_f64_u64index: ++** st1d z0\.d, p0, \[x0, z1\.d, uxtw 3\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_ext_f64_u64index, svfloat64_t, float64_t, svuint64_t, ++ svst1_scatter_u64index_f64 (p0, x0, svextw_u64_x (p0, z1), z0), ++ svst1_scatter_index (p0, x0, svextw_x (p0, z1), z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_scatter_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_scatter_s32.c +new file mode 100644 +index 000000000..d244e701a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_scatter_s32.c +@@ -0,0 +1,227 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st1_scatter_s32: ++** st1w z0\.s, p0, \[z1\.s\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_s32, svint32_t, svuint32_t, ++ svst1_scatter_u32base_s32 (p0, z1, z0), ++ svst1_scatter (p0, z1, z0)) ++ ++/* ++** st1_scatter_x0_s32_offset: ++** st1w z0\.s, p0, \[x0, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_x0_s32_offset, svint32_t, svuint32_t, ++ svst1_scatter_u32base_offset_s32 (p0, z1, x0, z0), ++ svst1_scatter_offset (p0, z1, x0, z0)) ++ ++/* ++** st1_scatter_m4_s32_offset: ++** mov (x[0-9]+), #?-4 ++** st1w z0\.s, p0, \[\1, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_m4_s32_offset, svint32_t, svuint32_t, ++ svst1_scatter_u32base_offset_s32 (p0, z1, -4, z0), ++ svst1_scatter_offset (p0, z1, -4, z0)) ++ ++/* ++** st1_scatter_0_s32_offset: ++** st1w z0\.s, p0, \[z1\.s\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_0_s32_offset, svint32_t, svuint32_t, ++ svst1_scatter_u32base_offset_s32 (p0, z1, 0, z0), ++ svst1_scatter_offset (p0, z1, 0, z0)) ++ ++/* ++** st1_scatter_5_s32_offset: ++** mov (x[0-9]+), #?5 ++** st1w z0\.s, p0, \[\1, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_5_s32_offset, svint32_t, svuint32_t, ++ svst1_scatter_u32base_offset_s32 (p0, z1, 5, z0), ++ svst1_scatter_offset (p0, z1, 5, z0)) ++ ++/* ++** st1_scatter_6_s32_offset: ++** mov (x[0-9]+), #?6 ++** st1w z0\.s, p0, \[\1, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_6_s32_offset, svint32_t, svuint32_t, ++ svst1_scatter_u32base_offset_s32 (p0, z1, 6, z0), ++ svst1_scatter_offset (p0, z1, 6, z0)) ++ ++/* ++** st1_scatter_7_s32_offset: ++** mov (x[0-9]+), #?7 ++** st1w z0\.s, p0, \[\1, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_7_s32_offset, svint32_t, svuint32_t, ++ svst1_scatter_u32base_offset_s32 (p0, z1, 7, z0), ++ svst1_scatter_offset (p0, z1, 7, z0)) ++ ++/* ++** st1_scatter_8_s32_offset: ++** st1w z0\.s, p0, \[z1\.s, #8\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_8_s32_offset, svint32_t, svuint32_t, ++ svst1_scatter_u32base_offset_s32 (p0, z1, 8, z0), ++ svst1_scatter_offset (p0, z1, 8, z0)) ++ ++/* ++** st1_scatter_124_s32_offset: ++** st1w z0\.s, p0, \[z1\.s, #124\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_124_s32_offset, svint32_t, svuint32_t, ++ svst1_scatter_u32base_offset_s32 (p0, z1, 124, z0), ++ svst1_scatter_offset (p0, z1, 124, z0)) ++ ++/* ++** st1_scatter_128_s32_offset: ++** mov (x[0-9]+), #?128 ++** st1w z0\.s, p0, \[\1, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_128_s32_offset, svint32_t, svuint32_t, ++ svst1_scatter_u32base_offset_s32 (p0, z1, 128, z0), ++ svst1_scatter_offset (p0, z1, 128, z0)) ++ ++/* ++** st1_scatter_x0_s32_index: ++** lsl (x[0-9]+), x0, #?2 ++** st1w z0\.s, p0, \[\1, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_x0_s32_index, svint32_t, svuint32_t, ++ svst1_scatter_u32base_index_s32 (p0, z1, x0, z0), ++ svst1_scatter_index (p0, z1, x0, z0)) ++ ++/* ++** st1_scatter_m1_s32_index: ++** mov (x[0-9]+), #?-4 ++** st1w z0\.s, p0, \[\1, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_m1_s32_index, svint32_t, svuint32_t, ++ svst1_scatter_u32base_index_s32 (p0, z1, -1, z0), ++ svst1_scatter_index (p0, z1, -1, z0)) ++ ++/* ++** st1_scatter_0_s32_index: ++** st1w z0\.s, p0, \[z1\.s\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_0_s32_index, svint32_t, svuint32_t, ++ svst1_scatter_u32base_index_s32 (p0, z1, 0, z0), ++ 
svst1_scatter_index (p0, z1, 0, z0)) ++ ++/* ++** st1_scatter_5_s32_index: ++** st1w z0\.s, p0, \[z1\.s, #20\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_5_s32_index, svint32_t, svuint32_t, ++ svst1_scatter_u32base_index_s32 (p0, z1, 5, z0), ++ svst1_scatter_index (p0, z1, 5, z0)) ++ ++/* ++** st1_scatter_31_s32_index: ++** st1w z0\.s, p0, \[z1\.s, #124\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_31_s32_index, svint32_t, svuint32_t, ++ svst1_scatter_u32base_index_s32 (p0, z1, 31, z0), ++ svst1_scatter_index (p0, z1, 31, z0)) ++ ++/* ++** st1_scatter_32_s32_index: ++** mov (x[0-9]+), #?128 ++** st1w z0\.s, p0, \[\1, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_32_s32_index, svint32_t, svuint32_t, ++ svst1_scatter_u32base_index_s32 (p0, z1, 32, z0), ++ svst1_scatter_index (p0, z1, 32, z0)) ++ ++/* ++** st1_scatter_x0_s32_s32offset: ++** st1w z0\.s, p0, \[x0, z1\.s, sxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_x0_s32_s32offset, svint32_t, int32_t, svint32_t, ++ svst1_scatter_s32offset_s32 (p0, x0, z1, z0), ++ svst1_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1_scatter_s32_s32offset: ++** st1w z0\.s, p0, \[x0, z1\.s, sxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_s32_s32offset, svint32_t, int32_t, svint32_t, ++ svst1_scatter_s32offset_s32 (p0, x0, z1, z0), ++ svst1_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1_scatter_x0_s32_u32offset: ++** st1w z0\.s, p0, \[x0, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_x0_s32_u32offset, svint32_t, int32_t, svuint32_t, ++ svst1_scatter_u32offset_s32 (p0, x0, z1, z0), ++ svst1_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1_scatter_s32_u32offset: ++** st1w z0\.s, p0, \[x0, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_s32_u32offset, svint32_t, int32_t, svuint32_t, ++ svst1_scatter_u32offset_s32 (p0, x0, z1, z0), ++ svst1_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1_scatter_x0_s32_s32index: ++** st1w z0\.s, p0, \[x0, z1\.s, sxtw 2\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_x0_s32_s32index, svint32_t, int32_t, svint32_t, ++ svst1_scatter_s32index_s32 (p0, x0, z1, z0), ++ svst1_scatter_index (p0, x0, z1, z0)) ++ ++/* ++** st1_scatter_s32_s32index: ++** st1w z0\.s, p0, \[x0, z1\.s, sxtw 2\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_s32_s32index, svint32_t, int32_t, svint32_t, ++ svst1_scatter_s32index_s32 (p0, x0, z1, z0), ++ svst1_scatter_index (p0, x0, z1, z0)) ++ ++/* ++** st1_scatter_x0_s32_u32index: ++** st1w z0\.s, p0, \[x0, z1\.s, uxtw 2\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_x0_s32_u32index, svint32_t, int32_t, svuint32_t, ++ svst1_scatter_u32index_s32 (p0, x0, z1, z0), ++ svst1_scatter_index (p0, x0, z1, z0)) ++ ++/* ++** st1_scatter_s32_u32index: ++** st1w z0\.s, p0, \[x0, z1\.s, uxtw 2\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_s32_u32index, svint32_t, int32_t, svuint32_t, ++ svst1_scatter_u32index_s32 (p0, x0, z1, z0), ++ svst1_scatter_index (p0, x0, z1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_scatter_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_scatter_s64.c +new file mode 100644 +index 000000000..5c4ebf440 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_scatter_s64.c +@@ -0,0 +1,303 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st1_scatter_s64: ++** st1d z0\.d, p0, \[z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_s64, svint64_t, svuint64_t, ++ svst1_scatter_u64base_s64 (p0, z1, z0), ++ svst1_scatter (p0, z1, z0)) ++ ++/* ++** st1_scatter_x0_s64_offset: ++** st1d z0\.d, p0, \[x0, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_x0_s64_offset, svint64_t, svuint64_t, ++ svst1_scatter_u64base_offset_s64 (p0, z1, x0, z0), ++ svst1_scatter_offset (p0, z1, x0, z0)) ++ ++/* ++** st1_scatter_m8_s64_offset: ++** mov (x[0-9]+), #?-8 ++** st1d z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_m8_s64_offset, svint64_t, svuint64_t, ++ svst1_scatter_u64base_offset_s64 (p0, z1, -8, z0), ++ svst1_scatter_offset (p0, z1, -8, z0)) ++ ++/* ++** st1_scatter_0_s64_offset: ++** st1d z0\.d, p0, \[z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_0_s64_offset, svint64_t, svuint64_t, ++ svst1_scatter_u64base_offset_s64 (p0, z1, 0, z0), ++ svst1_scatter_offset (p0, z1, 0, z0)) ++ ++/* ++** st1_scatter_9_s64_offset: ++** mov (x[0-9]+), #?9 ++** st1d z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_9_s64_offset, svint64_t, svuint64_t, ++ svst1_scatter_u64base_offset_s64 (p0, z1, 9, z0), ++ svst1_scatter_offset (p0, z1, 9, z0)) ++ ++/* ++** st1_scatter_10_s64_offset: ++** mov (x[0-9]+), #?10 ++** st1d z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_10_s64_offset, svint64_t, svuint64_t, ++ svst1_scatter_u64base_offset_s64 (p0, z1, 10, z0), ++ svst1_scatter_offset (p0, z1, 10, z0)) ++ ++/* ++** st1_scatter_11_s64_offset: ++** mov (x[0-9]+), #?11 ++** st1d z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_11_s64_offset, svint64_t, svuint64_t, ++ svst1_scatter_u64base_offset_s64 (p0, z1, 11, z0), ++ svst1_scatter_offset (p0, z1, 11, z0)) ++ ++/* ++** st1_scatter_12_s64_offset: ++** mov (x[0-9]+), #?12 ++** st1d z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_12_s64_offset, svint64_t, svuint64_t, ++ svst1_scatter_u64base_offset_s64 (p0, z1, 12, z0), ++ svst1_scatter_offset (p0, z1, 12, z0)) ++ ++/* ++** st1_scatter_13_s64_offset: ++** mov (x[0-9]+), #?13 ++** st1d z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_13_s64_offset, svint64_t, svuint64_t, ++ svst1_scatter_u64base_offset_s64 (p0, z1, 13, z0), ++ svst1_scatter_offset (p0, z1, 13, z0)) ++ ++/* ++** st1_scatter_14_s64_offset: ++** mov (x[0-9]+), #?14 ++** st1d z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_14_s64_offset, svint64_t, svuint64_t, ++ svst1_scatter_u64base_offset_s64 (p0, z1, 14, z0), ++ svst1_scatter_offset (p0, z1, 14, z0)) ++ ++/* ++** st1_scatter_15_s64_offset: ++** mov (x[0-9]+), #?15 ++** st1d z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_15_s64_offset, svint64_t, svuint64_t, ++ svst1_scatter_u64base_offset_s64 (p0, z1, 15, z0), ++ svst1_scatter_offset (p0, z1, 15, z0)) ++ ++/* ++** st1_scatter_16_s64_offset: ++** st1d z0\.d, p0, \[z1\.d, #16\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_16_s64_offset, svint64_t, svuint64_t, ++ svst1_scatter_u64base_offset_s64 (p0, z1, 16, z0), ++ svst1_scatter_offset (p0, z1, 16, z0)) ++ ++/* ++** st1_scatter_248_s64_offset: ++** st1d z0\.d, p0, \[z1\.d, #248\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_248_s64_offset, svint64_t, svuint64_t, ++ svst1_scatter_u64base_offset_s64 (p0, z1, 248, z0), ++ 
svst1_scatter_offset (p0, z1, 248, z0)) ++ ++/* ++** st1_scatter_256_s64_offset: ++** mov (x[0-9]+), #?256 ++** st1d z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_256_s64_offset, svint64_t, svuint64_t, ++ svst1_scatter_u64base_offset_s64 (p0, z1, 256, z0), ++ svst1_scatter_offset (p0, z1, 256, z0)) ++ ++/* ++** st1_scatter_x0_s64_index: ++** lsl (x[0-9]+), x0, #?3 ++** st1d z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_x0_s64_index, svint64_t, svuint64_t, ++ svst1_scatter_u64base_index_s64 (p0, z1, x0, z0), ++ svst1_scatter_index (p0, z1, x0, z0)) ++ ++/* ++** st1_scatter_m1_s64_index: ++** mov (x[0-9]+), #?-8 ++** st1d z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_m1_s64_index, svint64_t, svuint64_t, ++ svst1_scatter_u64base_index_s64 (p0, z1, -1, z0), ++ svst1_scatter_index (p0, z1, -1, z0)) ++ ++/* ++** st1_scatter_0_s64_index: ++** st1d z0\.d, p0, \[z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_0_s64_index, svint64_t, svuint64_t, ++ svst1_scatter_u64base_index_s64 (p0, z1, 0, z0), ++ svst1_scatter_index (p0, z1, 0, z0)) ++ ++/* ++** st1_scatter_5_s64_index: ++** st1d z0\.d, p0, \[z1\.d, #40\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_5_s64_index, svint64_t, svuint64_t, ++ svst1_scatter_u64base_index_s64 (p0, z1, 5, z0), ++ svst1_scatter_index (p0, z1, 5, z0)) ++ ++/* ++** st1_scatter_31_s64_index: ++** st1d z0\.d, p0, \[z1\.d, #248\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_31_s64_index, svint64_t, svuint64_t, ++ svst1_scatter_u64base_index_s64 (p0, z1, 31, z0), ++ svst1_scatter_index (p0, z1, 31, z0)) ++ ++/* ++** st1_scatter_32_s64_index: ++** mov (x[0-9]+), #?256 ++** st1d z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_32_s64_index, svint64_t, svuint64_t, ++ svst1_scatter_u64base_index_s64 (p0, z1, 32, z0), ++ svst1_scatter_index (p0, z1, 32, z0)) ++ ++/* ++** st1_scatter_x0_s64_s64offset: ++** st1d z0\.d, p0, \[x0, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_x0_s64_s64offset, svint64_t, int64_t, svint64_t, ++ svst1_scatter_s64offset_s64 (p0, x0, z1, z0), ++ svst1_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1_scatter_s64_s64offset: ++** st1d z0\.d, p0, \[x0, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_s64_s64offset, svint64_t, int64_t, svint64_t, ++ svst1_scatter_s64offset_s64 (p0, x0, z1, z0), ++ svst1_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1_scatter_ext_s64_s64offset: ++** st1d z0\.d, p0, \[x0, z1\.d, sxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_ext_s64_s64offset, svint64_t, int64_t, svint64_t, ++ svst1_scatter_s64offset_s64 (p0, x0, svextw_s64_x (p0, z1), z0), ++ svst1_scatter_offset (p0, x0, svextw_x (p0, z1), z0)) ++ ++/* ++** st1_scatter_x0_s64_u64offset: ++** st1d z0\.d, p0, \[x0, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_x0_s64_u64offset, svint64_t, int64_t, svuint64_t, ++ svst1_scatter_u64offset_s64 (p0, x0, z1, z0), ++ svst1_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1_scatter_s64_u64offset: ++** st1d z0\.d, p0, \[x0, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_s64_u64offset, svint64_t, int64_t, svuint64_t, ++ svst1_scatter_u64offset_s64 (p0, x0, z1, z0), ++ svst1_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1_scatter_ext_s64_u64offset: ++** st1d z0\.d, p0, \[x0, z1\.d, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_ext_s64_u64offset, svint64_t, int64_t, svuint64_t, ++ svst1_scatter_u64offset_s64 (p0, x0, svextw_u64_x 
(p0, z1), z0), ++ svst1_scatter_offset (p0, x0, svextw_x (p0, z1), z0)) ++ ++/* ++** st1_scatter_x0_s64_s64index: ++** st1d z0\.d, p0, \[x0, z1\.d, lsl 3\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_x0_s64_s64index, svint64_t, int64_t, svint64_t, ++ svst1_scatter_s64index_s64 (p0, x0, z1, z0), ++ svst1_scatter_index (p0, x0, z1, z0)) ++ ++/* ++** st1_scatter_s64_s64index: ++** st1d z0\.d, p0, \[x0, z1\.d, lsl 3\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_s64_s64index, svint64_t, int64_t, svint64_t, ++ svst1_scatter_s64index_s64 (p0, x0, z1, z0), ++ svst1_scatter_index (p0, x0, z1, z0)) ++ ++/* ++** st1_scatter_ext_s64_s64index: ++** st1d z0\.d, p0, \[x0, z1\.d, sxtw 3\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_ext_s64_s64index, svint64_t, int64_t, svint64_t, ++ svst1_scatter_s64index_s64 (p0, x0, svextw_s64_x (p0, z1), z0), ++ svst1_scatter_index (p0, x0, svextw_x (p0, z1), z0)) ++ ++/* ++** st1_scatter_x0_s64_u64index: ++** st1d z0\.d, p0, \[x0, z1\.d, lsl 3\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_x0_s64_u64index, svint64_t, int64_t, svuint64_t, ++ svst1_scatter_u64index_s64 (p0, x0, z1, z0), ++ svst1_scatter_index (p0, x0, z1, z0)) ++ ++/* ++** st1_scatter_s64_u64index: ++** st1d z0\.d, p0, \[x0, z1\.d, lsl 3\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_s64_u64index, svint64_t, int64_t, svuint64_t, ++ svst1_scatter_u64index_s64 (p0, x0, z1, z0), ++ svst1_scatter_index (p0, x0, z1, z0)) ++ ++/* ++** st1_scatter_ext_s64_u64index: ++** st1d z0\.d, p0, \[x0, z1\.d, uxtw 3\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_ext_s64_u64index, svint64_t, int64_t, svuint64_t, ++ svst1_scatter_u64index_s64 (p0, x0, svextw_u64_x (p0, z1), z0), ++ svst1_scatter_index (p0, x0, svextw_x (p0, z1), z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_scatter_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_scatter_u32.c +new file mode 100644 +index 000000000..fe3f7259f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_scatter_u32.c +@@ -0,0 +1,227 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st1_scatter_u32: ++** st1w z0\.s, p0, \[z1\.s\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_u32, svuint32_t, svuint32_t, ++ svst1_scatter_u32base_u32 (p0, z1, z0), ++ svst1_scatter (p0, z1, z0)) ++ ++/* ++** st1_scatter_x0_u32_offset: ++** st1w z0\.s, p0, \[x0, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_x0_u32_offset, svuint32_t, svuint32_t, ++ svst1_scatter_u32base_offset_u32 (p0, z1, x0, z0), ++ svst1_scatter_offset (p0, z1, x0, z0)) ++ ++/* ++** st1_scatter_m4_u32_offset: ++** mov (x[0-9]+), #?-4 ++** st1w z0\.s, p0, \[\1, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_m4_u32_offset, svuint32_t, svuint32_t, ++ svst1_scatter_u32base_offset_u32 (p0, z1, -4, z0), ++ svst1_scatter_offset (p0, z1, -4, z0)) ++ ++/* ++** st1_scatter_0_u32_offset: ++** st1w z0\.s, p0, \[z1\.s\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_0_u32_offset, svuint32_t, svuint32_t, ++ svst1_scatter_u32base_offset_u32 (p0, z1, 0, z0), ++ svst1_scatter_offset (p0, z1, 0, z0)) ++ ++/* ++** st1_scatter_5_u32_offset: ++** mov (x[0-9]+), #?5 ++** st1w z0\.s, p0, \[\1, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_5_u32_offset, svuint32_t, svuint32_t, ++ svst1_scatter_u32base_offset_u32 (p0, z1, 5, z0), ++ svst1_scatter_offset (p0, z1, 5, z0)) ++ ++/* ++** st1_scatter_6_u32_offset: ++** mov (x[0-9]+), #?6 ++** st1w z0\.s, p0, \[\1, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_6_u32_offset, svuint32_t, svuint32_t, ++ svst1_scatter_u32base_offset_u32 (p0, z1, 6, z0), ++ svst1_scatter_offset (p0, z1, 6, z0)) ++ ++/* ++** st1_scatter_7_u32_offset: ++** mov (x[0-9]+), #?7 ++** st1w z0\.s, p0, \[\1, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_7_u32_offset, svuint32_t, svuint32_t, ++ svst1_scatter_u32base_offset_u32 (p0, z1, 7, z0), ++ svst1_scatter_offset (p0, z1, 7, z0)) ++ ++/* ++** st1_scatter_8_u32_offset: ++** st1w z0\.s, p0, \[z1\.s, #8\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_8_u32_offset, svuint32_t, svuint32_t, ++ svst1_scatter_u32base_offset_u32 (p0, z1, 8, z0), ++ svst1_scatter_offset (p0, z1, 8, z0)) ++ ++/* ++** st1_scatter_124_u32_offset: ++** st1w z0\.s, p0, \[z1\.s, #124\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_124_u32_offset, svuint32_t, svuint32_t, ++ svst1_scatter_u32base_offset_u32 (p0, z1, 124, z0), ++ svst1_scatter_offset (p0, z1, 124, z0)) ++ ++/* ++** st1_scatter_128_u32_offset: ++** mov (x[0-9]+), #?128 ++** st1w z0\.s, p0, \[\1, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_128_u32_offset, svuint32_t, svuint32_t, ++ svst1_scatter_u32base_offset_u32 (p0, z1, 128, z0), ++ svst1_scatter_offset (p0, z1, 128, z0)) ++ ++/* ++** st1_scatter_x0_u32_index: ++** lsl (x[0-9]+), x0, #?2 ++** st1w z0\.s, p0, \[\1, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_x0_u32_index, svuint32_t, svuint32_t, ++ svst1_scatter_u32base_index_u32 (p0, z1, x0, z0), ++ svst1_scatter_index (p0, z1, x0, z0)) ++ ++/* ++** st1_scatter_m1_u32_index: ++** mov (x[0-9]+), #?-4 ++** st1w z0\.s, p0, \[\1, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_m1_u32_index, svuint32_t, svuint32_t, ++ svst1_scatter_u32base_index_u32 (p0, z1, -1, z0), ++ svst1_scatter_index (p0, z1, -1, z0)) ++ ++/* ++** st1_scatter_0_u32_index: ++** st1w z0\.s, p0, \[z1\.s\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_0_u32_index, svuint32_t, svuint32_t, ++ svst1_scatter_u32base_index_u32 (p0, z1, 0, z0), ++ 
svst1_scatter_index (p0, z1, 0, z0)) ++ ++/* ++** st1_scatter_5_u32_index: ++** st1w z0\.s, p0, \[z1\.s, #20\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_5_u32_index, svuint32_t, svuint32_t, ++ svst1_scatter_u32base_index_u32 (p0, z1, 5, z0), ++ svst1_scatter_index (p0, z1, 5, z0)) ++ ++/* ++** st1_scatter_31_u32_index: ++** st1w z0\.s, p0, \[z1\.s, #124\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_31_u32_index, svuint32_t, svuint32_t, ++ svst1_scatter_u32base_index_u32 (p0, z1, 31, z0), ++ svst1_scatter_index (p0, z1, 31, z0)) ++ ++/* ++** st1_scatter_32_u32_index: ++** mov (x[0-9]+), #?128 ++** st1w z0\.s, p0, \[\1, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_32_u32_index, svuint32_t, svuint32_t, ++ svst1_scatter_u32base_index_u32 (p0, z1, 32, z0), ++ svst1_scatter_index (p0, z1, 32, z0)) ++ ++/* ++** st1_scatter_x0_u32_s32offset: ++** st1w z0\.s, p0, \[x0, z1\.s, sxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_x0_u32_s32offset, svuint32_t, uint32_t, svint32_t, ++ svst1_scatter_s32offset_u32 (p0, x0, z1, z0), ++ svst1_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1_scatter_u32_s32offset: ++** st1w z0\.s, p0, \[x0, z1\.s, sxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_u32_s32offset, svuint32_t, uint32_t, svint32_t, ++ svst1_scatter_s32offset_u32 (p0, x0, z1, z0), ++ svst1_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1_scatter_x0_u32_u32offset: ++** st1w z0\.s, p0, \[x0, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_x0_u32_u32offset, svuint32_t, uint32_t, svuint32_t, ++ svst1_scatter_u32offset_u32 (p0, x0, z1, z0), ++ svst1_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1_scatter_u32_u32offset: ++** st1w z0\.s, p0, \[x0, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_u32_u32offset, svuint32_t, uint32_t, svuint32_t, ++ svst1_scatter_u32offset_u32 (p0, x0, z1, z0), ++ svst1_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1_scatter_x0_u32_s32index: ++** st1w z0\.s, p0, \[x0, z1\.s, sxtw 2\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_x0_u32_s32index, svuint32_t, uint32_t, svint32_t, ++ svst1_scatter_s32index_u32 (p0, x0, z1, z0), ++ svst1_scatter_index (p0, x0, z1, z0)) ++ ++/* ++** st1_scatter_u32_s32index: ++** st1w z0\.s, p0, \[x0, z1\.s, sxtw 2\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_u32_s32index, svuint32_t, uint32_t, svint32_t, ++ svst1_scatter_s32index_u32 (p0, x0, z1, z0), ++ svst1_scatter_index (p0, x0, z1, z0)) ++ ++/* ++** st1_scatter_x0_u32_u32index: ++** st1w z0\.s, p0, \[x0, z1\.s, uxtw 2\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_x0_u32_u32index, svuint32_t, uint32_t, svuint32_t, ++ svst1_scatter_u32index_u32 (p0, x0, z1, z0), ++ svst1_scatter_index (p0, x0, z1, z0)) ++ ++/* ++** st1_scatter_u32_u32index: ++** st1w z0\.s, p0, \[x0, z1\.s, uxtw 2\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_u32_u32index, svuint32_t, uint32_t, svuint32_t, ++ svst1_scatter_u32index_u32 (p0, x0, z1, z0), ++ svst1_scatter_index (p0, x0, z1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_scatter_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_scatter_u64.c +new file mode 100644 +index 000000000..232123566 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_scatter_u64.c +@@ -0,0 +1,303 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st1_scatter_u64: ++** st1d z0\.d, p0, \[z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_u64, svuint64_t, svuint64_t, ++ svst1_scatter_u64base_u64 (p0, z1, z0), ++ svst1_scatter (p0, z1, z0)) ++ ++/* ++** st1_scatter_x0_u64_offset: ++** st1d z0\.d, p0, \[x0, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_x0_u64_offset, svuint64_t, svuint64_t, ++ svst1_scatter_u64base_offset_u64 (p0, z1, x0, z0), ++ svst1_scatter_offset (p0, z1, x0, z0)) ++ ++/* ++** st1_scatter_m8_u64_offset: ++** mov (x[0-9]+), #?-8 ++** st1d z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_m8_u64_offset, svuint64_t, svuint64_t, ++ svst1_scatter_u64base_offset_u64 (p0, z1, -8, z0), ++ svst1_scatter_offset (p0, z1, -8, z0)) ++ ++/* ++** st1_scatter_0_u64_offset: ++** st1d z0\.d, p0, \[z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_0_u64_offset, svuint64_t, svuint64_t, ++ svst1_scatter_u64base_offset_u64 (p0, z1, 0, z0), ++ svst1_scatter_offset (p0, z1, 0, z0)) ++ ++/* ++** st1_scatter_9_u64_offset: ++** mov (x[0-9]+), #?9 ++** st1d z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_9_u64_offset, svuint64_t, svuint64_t, ++ svst1_scatter_u64base_offset_u64 (p0, z1, 9, z0), ++ svst1_scatter_offset (p0, z1, 9, z0)) ++ ++/* ++** st1_scatter_10_u64_offset: ++** mov (x[0-9]+), #?10 ++** st1d z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_10_u64_offset, svuint64_t, svuint64_t, ++ svst1_scatter_u64base_offset_u64 (p0, z1, 10, z0), ++ svst1_scatter_offset (p0, z1, 10, z0)) ++ ++/* ++** st1_scatter_11_u64_offset: ++** mov (x[0-9]+), #?11 ++** st1d z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_11_u64_offset, svuint64_t, svuint64_t, ++ svst1_scatter_u64base_offset_u64 (p0, z1, 11, z0), ++ svst1_scatter_offset (p0, z1, 11, z0)) ++ ++/* ++** st1_scatter_12_u64_offset: ++** mov (x[0-9]+), #?12 ++** st1d z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_12_u64_offset, svuint64_t, svuint64_t, ++ svst1_scatter_u64base_offset_u64 (p0, z1, 12, z0), ++ svst1_scatter_offset (p0, z1, 12, z0)) ++ ++/* ++** st1_scatter_13_u64_offset: ++** mov (x[0-9]+), #?13 ++** st1d z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_13_u64_offset, svuint64_t, svuint64_t, ++ svst1_scatter_u64base_offset_u64 (p0, z1, 13, z0), ++ svst1_scatter_offset (p0, z1, 13, z0)) ++ ++/* ++** st1_scatter_14_u64_offset: ++** mov (x[0-9]+), #?14 ++** st1d z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_14_u64_offset, svuint64_t, svuint64_t, ++ svst1_scatter_u64base_offset_u64 (p0, z1, 14, z0), ++ svst1_scatter_offset (p0, z1, 14, z0)) ++ ++/* ++** st1_scatter_15_u64_offset: ++** mov (x[0-9]+), #?15 ++** st1d z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_15_u64_offset, svuint64_t, svuint64_t, ++ svst1_scatter_u64base_offset_u64 (p0, z1, 15, z0), ++ svst1_scatter_offset (p0, z1, 15, z0)) ++ ++/* ++** st1_scatter_16_u64_offset: ++** st1d z0\.d, p0, \[z1\.d, #16\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_16_u64_offset, svuint64_t, svuint64_t, ++ svst1_scatter_u64base_offset_u64 (p0, z1, 16, z0), ++ svst1_scatter_offset (p0, z1, 16, z0)) ++ ++/* ++** st1_scatter_248_u64_offset: ++** st1d z0\.d, p0, \[z1\.d, #248\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_248_u64_offset, svuint64_t, svuint64_t, ++ svst1_scatter_u64base_offset_u64 (p0, z1, 248, z0), ++ 
svst1_scatter_offset (p0, z1, 248, z0)) ++ ++/* ++** st1_scatter_256_u64_offset: ++** mov (x[0-9]+), #?256 ++** st1d z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_256_u64_offset, svuint64_t, svuint64_t, ++ svst1_scatter_u64base_offset_u64 (p0, z1, 256, z0), ++ svst1_scatter_offset (p0, z1, 256, z0)) ++ ++/* ++** st1_scatter_x0_u64_index: ++** lsl (x[0-9]+), x0, #?3 ++** st1d z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_x0_u64_index, svuint64_t, svuint64_t, ++ svst1_scatter_u64base_index_u64 (p0, z1, x0, z0), ++ svst1_scatter_index (p0, z1, x0, z0)) ++ ++/* ++** st1_scatter_m1_u64_index: ++** mov (x[0-9]+), #?-8 ++** st1d z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_m1_u64_index, svuint64_t, svuint64_t, ++ svst1_scatter_u64base_index_u64 (p0, z1, -1, z0), ++ svst1_scatter_index (p0, z1, -1, z0)) ++ ++/* ++** st1_scatter_0_u64_index: ++** st1d z0\.d, p0, \[z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_0_u64_index, svuint64_t, svuint64_t, ++ svst1_scatter_u64base_index_u64 (p0, z1, 0, z0), ++ svst1_scatter_index (p0, z1, 0, z0)) ++ ++/* ++** st1_scatter_5_u64_index: ++** st1d z0\.d, p0, \[z1\.d, #40\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_5_u64_index, svuint64_t, svuint64_t, ++ svst1_scatter_u64base_index_u64 (p0, z1, 5, z0), ++ svst1_scatter_index (p0, z1, 5, z0)) ++ ++/* ++** st1_scatter_31_u64_index: ++** st1d z0\.d, p0, \[z1\.d, #248\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_31_u64_index, svuint64_t, svuint64_t, ++ svst1_scatter_u64base_index_u64 (p0, z1, 31, z0), ++ svst1_scatter_index (p0, z1, 31, z0)) ++ ++/* ++** st1_scatter_32_u64_index: ++** mov (x[0-9]+), #?256 ++** st1d z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1_scatter_32_u64_index, svuint64_t, svuint64_t, ++ svst1_scatter_u64base_index_u64 (p0, z1, 32, z0), ++ svst1_scatter_index (p0, z1, 32, z0)) ++ ++/* ++** st1_scatter_x0_u64_s64offset: ++** st1d z0\.d, p0, \[x0, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_x0_u64_s64offset, svuint64_t, uint64_t, svint64_t, ++ svst1_scatter_s64offset_u64 (p0, x0, z1, z0), ++ svst1_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1_scatter_u64_s64offset: ++** st1d z0\.d, p0, \[x0, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_u64_s64offset, svuint64_t, uint64_t, svint64_t, ++ svst1_scatter_s64offset_u64 (p0, x0, z1, z0), ++ svst1_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1_scatter_ext_u64_s64offset: ++** st1d z0\.d, p0, \[x0, z1\.d, sxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_ext_u64_s64offset, svuint64_t, uint64_t, svint64_t, ++ svst1_scatter_s64offset_u64 (p0, x0, svextw_s64_x (p0, z1), z0), ++ svst1_scatter_offset (p0, x0, svextw_x (p0, z1), z0)) ++ ++/* ++** st1_scatter_x0_u64_u64offset: ++** st1d z0\.d, p0, \[x0, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_x0_u64_u64offset, svuint64_t, uint64_t, svuint64_t, ++ svst1_scatter_u64offset_u64 (p0, x0, z1, z0), ++ svst1_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1_scatter_u64_u64offset: ++** st1d z0\.d, p0, \[x0, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_u64_u64offset, svuint64_t, uint64_t, svuint64_t, ++ svst1_scatter_u64offset_u64 (p0, x0, z1, z0), ++ svst1_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1_scatter_ext_u64_u64offset: ++** st1d z0\.d, p0, \[x0, z1\.d, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_ext_u64_u64offset, svuint64_t, uint64_t, svuint64_t, ++ svst1_scatter_u64offset_u64 (p0, 
x0, svextw_u64_x (p0, z1), z0), ++ svst1_scatter_offset (p0, x0, svextw_x (p0, z1), z0)) ++ ++/* ++** st1_scatter_x0_u64_s64index: ++** st1d z0\.d, p0, \[x0, z1\.d, lsl 3\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_x0_u64_s64index, svuint64_t, uint64_t, svint64_t, ++ svst1_scatter_s64index_u64 (p0, x0, z1, z0), ++ svst1_scatter_index (p0, x0, z1, z0)) ++ ++/* ++** st1_scatter_u64_s64index: ++** st1d z0\.d, p0, \[x0, z1\.d, lsl 3\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_u64_s64index, svuint64_t, uint64_t, svint64_t, ++ svst1_scatter_s64index_u64 (p0, x0, z1, z0), ++ svst1_scatter_index (p0, x0, z1, z0)) ++ ++/* ++** st1_scatter_ext_u64_s64index: ++** st1d z0\.d, p0, \[x0, z1\.d, sxtw 3\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_ext_u64_s64index, svuint64_t, uint64_t, svint64_t, ++ svst1_scatter_s64index_u64 (p0, x0, svextw_s64_x (p0, z1), z0), ++ svst1_scatter_index (p0, x0, svextw_x (p0, z1), z0)) ++ ++/* ++** st1_scatter_x0_u64_u64index: ++** st1d z0\.d, p0, \[x0, z1\.d, lsl 3\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_x0_u64_u64index, svuint64_t, uint64_t, svuint64_t, ++ svst1_scatter_u64index_u64 (p0, x0, z1, z0), ++ svst1_scatter_index (p0, x0, z1, z0)) ++ ++/* ++** st1_scatter_u64_u64index: ++** st1d z0\.d, p0, \[x0, z1\.d, lsl 3\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_u64_u64index, svuint64_t, uint64_t, svuint64_t, ++ svst1_scatter_u64index_u64 (p0, x0, z1, z0), ++ svst1_scatter_index (p0, x0, z1, z0)) ++ ++/* ++** st1_scatter_ext_u64_u64index: ++** st1d z0\.d, p0, \[x0, z1\.d, uxtw 3\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1_scatter_ext_u64_u64index, svuint64_t, uint64_t, svuint64_t, ++ svst1_scatter_u64index_u64 (p0, x0, svextw_u64_x (p0, z1), z0), ++ svst1_scatter_index (p0, x0, svextw_x (p0, z1), z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_u16.c +new file mode 100644 +index 000000000..e9dc05219 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_u16.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st1_u16_base: ++** st1h z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_u16_base, svuint16_t, uint16_t, ++ svst1_u16 (p0, x0, z0), ++ svst1 (p0, x0, z0)) ++ ++/* ++** st1_u16_index: ++** st1h z0\.h, p0, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_STORE (st1_u16_index, svuint16_t, uint16_t, ++ svst1_u16 (p0, x0 + x1, z0), ++ svst1 (p0, x0 + x1, z0)) ++ ++/* ++** st1_u16_1: ++** st1h z0\.h, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_u16_1, svuint16_t, uint16_t, ++ svst1_u16 (p0, x0 + svcnth (), z0), ++ svst1 (p0, x0 + svcnth (), z0)) ++ ++/* ++** st1_u16_7: ++** st1h z0\.h, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_u16_7, svuint16_t, uint16_t, ++ svst1_u16 (p0, x0 + svcnth () * 7, z0), ++ svst1 (p0, x0 + svcnth () * 7, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st1_u16_8: ++** incb x0, all, mul #8 ++** st1h z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_u16_8, svuint16_t, uint16_t, ++ svst1_u16 (p0, x0 + svcnth () * 8, z0), ++ svst1 (p0, x0 + svcnth () * 8, z0)) ++ ++/* ++** st1_u16_m1: ++** st1h z0\.h, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_u16_m1, svuint16_t, uint16_t, ++ svst1_u16 (p0, x0 - svcnth (), z0), ++ svst1 (p0, x0 - svcnth (), z0)) ++ ++/* ++** st1_u16_m8: ++** st1h z0\.h, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_u16_m8, svuint16_t, uint16_t, ++ svst1_u16 (p0, x0 - svcnth () * 8, z0), ++ svst1 (p0, x0 - svcnth () * 8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1_u16_m9: ++** decb x0, all, mul #9 ++** st1h z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_u16_m9, svuint16_t, uint16_t, ++ svst1_u16 (p0, x0 - svcnth () * 9, z0), ++ svst1 (p0, x0 - svcnth () * 9, z0)) ++ ++/* ++** st1_vnum_u16_0: ++** st1h z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_vnum_u16_0, svuint16_t, uint16_t, ++ svst1_vnum_u16 (p0, x0, 0, z0), ++ svst1_vnum (p0, x0, 0, z0)) ++ ++/* ++** st1_vnum_u16_1: ++** st1h z0\.h, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_vnum_u16_1, svuint16_t, uint16_t, ++ svst1_vnum_u16 (p0, x0, 1, z0), ++ svst1_vnum (p0, x0, 1, z0)) ++ ++/* ++** st1_vnum_u16_7: ++** st1h z0\.h, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_vnum_u16_7, svuint16_t, uint16_t, ++ svst1_vnum_u16 (p0, x0, 7, z0), ++ svst1_vnum (p0, x0, 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1_vnum_u16_8: ++** incb x0, all, mul #8 ++** st1h z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_vnum_u16_8, svuint16_t, uint16_t, ++ svst1_vnum_u16 (p0, x0, 8, z0), ++ svst1_vnum (p0, x0, 8, z0)) ++ ++/* ++** st1_vnum_u16_m1: ++** st1h z0\.h, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_vnum_u16_m1, svuint16_t, uint16_t, ++ svst1_vnum_u16 (p0, x0, -1, z0), ++ svst1_vnum (p0, x0, -1, z0)) ++ ++/* ++** st1_vnum_u16_m8: ++** st1h z0\.h, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_vnum_u16_m8, svuint16_t, uint16_t, ++ svst1_vnum_u16 (p0, x0, -8, z0), ++ svst1_vnum (p0, x0, -8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1_vnum_u16_m9: ++** decb x0, all, mul #9 ++** st1h z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_vnum_u16_m9, svuint16_t, uint16_t, ++ svst1_vnum_u16 (p0, x0, -9, z0), ++ svst1_vnum (p0, x0, -9, z0)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** st1_vnum_u16_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** st1h z0\.h, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (st1_vnum_u16_x1, svuint16_t, uint16_t, ++ svst1_vnum_u16 (p0, x0, x1, z0), ++ svst1_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_u32.c +new file mode 100644 +index 000000000..8610ae4c8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_u32.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st1_u32_base: ++** st1w z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_u32_base, svuint32_t, uint32_t, ++ svst1_u32 (p0, x0, z0), ++ svst1 (p0, x0, z0)) ++ ++/* ++** st1_u32_index: ++** st1w z0\.s, p0, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_STORE (st1_u32_index, svuint32_t, uint32_t, ++ svst1_u32 (p0, x0 + x1, z0), ++ svst1 (p0, x0 + x1, z0)) ++ ++/* ++** st1_u32_1: ++** st1w z0\.s, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_u32_1, svuint32_t, uint32_t, ++ svst1_u32 (p0, x0 + svcntw (), z0), ++ svst1 (p0, x0 + svcntw (), z0)) ++ ++/* ++** st1_u32_7: ++** st1w z0\.s, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_u32_7, svuint32_t, uint32_t, ++ svst1_u32 (p0, x0 + svcntw () * 7, z0), ++ svst1 (p0, x0 + svcntw () * 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1_u32_8: ++** incb x0, all, mul #8 ++** st1w z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_u32_8, svuint32_t, uint32_t, ++ svst1_u32 (p0, x0 + svcntw () * 8, z0), ++ svst1 (p0, x0 + svcntw () * 8, z0)) ++ ++/* ++** st1_u32_m1: ++** st1w z0\.s, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_u32_m1, svuint32_t, uint32_t, ++ svst1_u32 (p0, x0 - svcntw (), z0), ++ svst1 (p0, x0 - svcntw (), z0)) ++ ++/* ++** st1_u32_m8: ++** st1w z0\.s, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_u32_m8, svuint32_t, uint32_t, ++ svst1_u32 (p0, x0 - svcntw () * 8, z0), ++ svst1 (p0, x0 - svcntw () * 8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1_u32_m9: ++** decb x0, all, mul #9 ++** st1w z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_u32_m9, svuint32_t, uint32_t, ++ svst1_u32 (p0, x0 - svcntw () * 9, z0), ++ svst1 (p0, x0 - svcntw () * 9, z0)) ++ ++/* ++** st1_vnum_u32_0: ++** st1w z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_vnum_u32_0, svuint32_t, uint32_t, ++ svst1_vnum_u32 (p0, x0, 0, z0), ++ svst1_vnum (p0, x0, 0, z0)) ++ ++/* ++** st1_vnum_u32_1: ++** st1w z0\.s, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_vnum_u32_1, svuint32_t, uint32_t, ++ svst1_vnum_u32 (p0, x0, 1, z0), ++ svst1_vnum (p0, x0, 1, z0)) ++ ++/* ++** st1_vnum_u32_7: ++** st1w z0\.s, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_vnum_u32_7, svuint32_t, uint32_t, ++ svst1_vnum_u32 (p0, x0, 7, z0), ++ svst1_vnum (p0, x0, 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1_vnum_u32_8: ++** incb x0, all, mul #8 ++** st1w z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_vnum_u32_8, svuint32_t, uint32_t, ++ svst1_vnum_u32 (p0, x0, 8, z0), ++ svst1_vnum (p0, x0, 8, z0)) ++ ++/* ++** st1_vnum_u32_m1: ++** st1w z0\.s, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_vnum_u32_m1, svuint32_t, uint32_t, ++ svst1_vnum_u32 (p0, x0, -1, z0), ++ svst1_vnum (p0, x0, -1, z0)) ++ ++/* ++** st1_vnum_u32_m8: ++** st1w z0\.s, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_vnum_u32_m8, svuint32_t, uint32_t, ++ svst1_vnum_u32 (p0, x0, -8, z0), ++ svst1_vnum (p0, x0, -8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1_vnum_u32_m9: ++** decb x0, all, mul #9 ++** st1w z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_vnum_u32_m9, svuint32_t, uint32_t, ++ svst1_vnum_u32 (p0, x0, -9, z0), ++ svst1_vnum (p0, x0, -9, z0)) ++ ++/* Using MUL to calculate an index would also be OK. 
*/ ++/* ++** st1_vnum_u32_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** st1w z0\.s, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (st1_vnum_u32_x1, svuint32_t, uint32_t, ++ svst1_vnum_u32 (p0, x0, x1, z0), ++ svst1_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_u64.c +new file mode 100644 +index 000000000..5d4fae932 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_u64.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st1_u64_base: ++** st1d z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_u64_base, svuint64_t, uint64_t, ++ svst1_u64 (p0, x0, z0), ++ svst1 (p0, x0, z0)) ++ ++/* ++** st1_u64_index: ++** st1d z0\.d, p0, \[x0, x1, lsl 3\] ++** ret ++*/ ++TEST_STORE (st1_u64_index, svuint64_t, uint64_t, ++ svst1_u64 (p0, x0 + x1, z0), ++ svst1 (p0, x0 + x1, z0)) ++ ++/* ++** st1_u64_1: ++** st1d z0\.d, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_u64_1, svuint64_t, uint64_t, ++ svst1_u64 (p0, x0 + svcntd (), z0), ++ svst1 (p0, x0 + svcntd (), z0)) ++ ++/* ++** st1_u64_7: ++** st1d z0\.d, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_u64_7, svuint64_t, uint64_t, ++ svst1_u64 (p0, x0 + svcntd () * 7, z0), ++ svst1 (p0, x0 + svcntd () * 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1_u64_8: ++** incb x0, all, mul #8 ++** st1d z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_u64_8, svuint64_t, uint64_t, ++ svst1_u64 (p0, x0 + svcntd () * 8, z0), ++ svst1 (p0, x0 + svcntd () * 8, z0)) ++ ++/* ++** st1_u64_m1: ++** st1d z0\.d, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_u64_m1, svuint64_t, uint64_t, ++ svst1_u64 (p0, x0 - svcntd (), z0), ++ svst1 (p0, x0 - svcntd (), z0)) ++ ++/* ++** st1_u64_m8: ++** st1d z0\.d, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_u64_m8, svuint64_t, uint64_t, ++ svst1_u64 (p0, x0 - svcntd () * 8, z0), ++ svst1 (p0, x0 - svcntd () * 8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1_u64_m9: ++** decb x0, all, mul #9 ++** st1d z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_u64_m9, svuint64_t, uint64_t, ++ svst1_u64 (p0, x0 - svcntd () * 9, z0), ++ svst1 (p0, x0 - svcntd () * 9, z0)) ++ ++/* ++** st1_vnum_u64_0: ++** st1d z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_vnum_u64_0, svuint64_t, uint64_t, ++ svst1_vnum_u64 (p0, x0, 0, z0), ++ svst1_vnum (p0, x0, 0, z0)) ++ ++/* ++** st1_vnum_u64_1: ++** st1d z0\.d, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_vnum_u64_1, svuint64_t, uint64_t, ++ svst1_vnum_u64 (p0, x0, 1, z0), ++ svst1_vnum (p0, x0, 1, z0)) ++ ++/* ++** st1_vnum_u64_7: ++** st1d z0\.d, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_vnum_u64_7, svuint64_t, uint64_t, ++ svst1_vnum_u64 (p0, x0, 7, z0), ++ svst1_vnum (p0, x0, 7, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st1_vnum_u64_8: ++** incb x0, all, mul #8 ++** st1d z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_vnum_u64_8, svuint64_t, uint64_t, ++ svst1_vnum_u64 (p0, x0, 8, z0), ++ svst1_vnum (p0, x0, 8, z0)) ++ ++/* ++** st1_vnum_u64_m1: ++** st1d z0\.d, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_vnum_u64_m1, svuint64_t, uint64_t, ++ svst1_vnum_u64 (p0, x0, -1, z0), ++ svst1_vnum (p0, x0, -1, z0)) ++ ++/* ++** st1_vnum_u64_m8: ++** st1d z0\.d, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_vnum_u64_m8, svuint64_t, uint64_t, ++ svst1_vnum_u64 (p0, x0, -8, z0), ++ svst1_vnum (p0, x0, -8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1_vnum_u64_m9: ++** decb x0, all, mul #9 ++** st1d z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_vnum_u64_m9, svuint64_t, uint64_t, ++ svst1_vnum_u64 (p0, x0, -9, z0), ++ svst1_vnum (p0, x0, -9, z0)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** st1_vnum_u64_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** st1d z0\.d, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (st1_vnum_u64_x1, svuint64_t, uint64_t, ++ svst1_vnum_u64 (p0, x0, x1, z0), ++ svst1_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_u8.c +new file mode 100644 +index 000000000..52c79d0e0 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_u8.c +@@ -0,0 +1,162 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st1_u8_base: ++** st1b z0\.b, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_u8_base, svuint8_t, uint8_t, ++ svst1_u8 (p0, x0, z0), ++ svst1 (p0, x0, z0)) ++ ++/* ++** st1_u8_index: ++** st1b z0\.b, p0, \[x0, x1\] ++** ret ++*/ ++TEST_STORE (st1_u8_index, svuint8_t, uint8_t, ++ svst1_u8 (p0, x0 + x1, z0), ++ svst1 (p0, x0 + x1, z0)) ++ ++/* ++** st1_u8_1: ++** st1b z0\.b, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_u8_1, svuint8_t, uint8_t, ++ svst1_u8 (p0, x0 + svcntb (), z0), ++ svst1 (p0, x0 + svcntb (), z0)) ++ ++/* ++** st1_u8_7: ++** st1b z0\.b, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_u8_7, svuint8_t, uint8_t, ++ svst1_u8 (p0, x0 + svcntb () * 7, z0), ++ svst1 (p0, x0 + svcntb () * 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1_u8_8: ++** incb x0, all, mul #8 ++** st1b z0\.b, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_u8_8, svuint8_t, uint8_t, ++ svst1_u8 (p0, x0 + svcntb () * 8, z0), ++ svst1 (p0, x0 + svcntb () * 8, z0)) ++ ++/* ++** st1_u8_m1: ++** st1b z0\.b, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_u8_m1, svuint8_t, uint8_t, ++ svst1_u8 (p0, x0 - svcntb (), z0), ++ svst1 (p0, x0 - svcntb (), z0)) ++ ++/* ++** st1_u8_m8: ++** st1b z0\.b, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_u8_m8, svuint8_t, uint8_t, ++ svst1_u8 (p0, x0 - svcntb () * 8, z0), ++ svst1 (p0, x0 - svcntb () * 8, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st1_u8_m9: ++** decb x0, all, mul #9 ++** st1b z0\.b, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_u8_m9, svuint8_t, uint8_t, ++ svst1_u8 (p0, x0 - svcntb () * 9, z0), ++ svst1 (p0, x0 - svcntb () * 9, z0)) ++ ++/* ++** st1_vnum_u8_0: ++** st1b z0\.b, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_vnum_u8_0, svuint8_t, uint8_t, ++ svst1_vnum_u8 (p0, x0, 0, z0), ++ svst1_vnum (p0, x0, 0, z0)) ++ ++/* ++** st1_vnum_u8_1: ++** st1b z0\.b, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_vnum_u8_1, svuint8_t, uint8_t, ++ svst1_vnum_u8 (p0, x0, 1, z0), ++ svst1_vnum (p0, x0, 1, z0)) ++ ++/* ++** st1_vnum_u8_7: ++** st1b z0\.b, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_vnum_u8_7, svuint8_t, uint8_t, ++ svst1_vnum_u8 (p0, x0, 7, z0), ++ svst1_vnum (p0, x0, 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1_vnum_u8_8: ++** incb x0, all, mul #8 ++** st1b z0\.b, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_vnum_u8_8, svuint8_t, uint8_t, ++ svst1_vnum_u8 (p0, x0, 8, z0), ++ svst1_vnum (p0, x0, 8, z0)) ++ ++/* ++** st1_vnum_u8_m1: ++** st1b z0\.b, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_vnum_u8_m1, svuint8_t, uint8_t, ++ svst1_vnum_u8 (p0, x0, -1, z0), ++ svst1_vnum (p0, x0, -1, z0)) ++ ++/* ++** st1_vnum_u8_m8: ++** st1b z0\.b, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (st1_vnum_u8_m8, svuint8_t, uint8_t, ++ svst1_vnum_u8 (p0, x0, -8, z0), ++ svst1_vnum (p0, x0, -8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1_vnum_u8_m9: ++** decb x0, all, mul #9 ++** st1b z0\.b, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1_vnum_u8_m9, svuint8_t, uint8_t, ++ svst1_vnum_u8 (p0, x0, -9, z0), ++ svst1_vnum (p0, x0, -9, z0)) ++ ++/* ++** st1_vnum_u8_x1: ++** cntb (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** st1b z0\.b, p0, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** st1b z0\.b, p0, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_STORE (st1_vnum_u8_x1, svuint8_t, uint8_t, ++ svst1_vnum_u8 (p0, x0, x1, z0), ++ svst1_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_s16.c +new file mode 100644 +index 000000000..770fb61e6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_s16.c +@@ -0,0 +1,162 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st1b_s16_base: ++** st1b z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1b_s16_base, svint16_t, int8_t, ++ svst1b_s16 (p0, x0, z0), ++ svst1b (p0, x0, z0)) ++ ++/* ++** st1b_s16_index: ++** st1b z0\.h, p0, \[x0, x1\] ++** ret ++*/ ++TEST_STORE (st1b_s16_index, svint16_t, int8_t, ++ svst1b_s16 (p0, x0 + x1, z0), ++ svst1b (p0, x0 + x1, z0)) ++ ++/* ++** st1b_s16_1: ++** st1b z0\.h, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1b_s16_1, svint16_t, int8_t, ++ svst1b_s16 (p0, x0 + svcnth (), z0), ++ svst1b (p0, x0 + svcnth (), z0)) ++ ++/* ++** st1b_s16_7: ++** st1b z0\.h, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (st1b_s16_7, svint16_t, int8_t, ++ svst1b_s16 (p0, x0 + svcnth () * 7, z0), ++ svst1b (p0, x0 + svcnth () * 7, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st1b_s16_8: ++** incb x0, all, mul #4 ++** st1b z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1b_s16_8, svint16_t, int8_t, ++ svst1b_s16 (p0, x0 + svcnth () * 8, z0), ++ svst1b (p0, x0 + svcnth () * 8, z0)) ++ ++/* ++** st1b_s16_m1: ++** st1b z0\.h, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1b_s16_m1, svint16_t, int8_t, ++ svst1b_s16 (p0, x0 - svcnth (), z0), ++ svst1b (p0, x0 - svcnth (), z0)) ++ ++/* ++** st1b_s16_m8: ++** st1b z0\.h, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (st1b_s16_m8, svint16_t, int8_t, ++ svst1b_s16 (p0, x0 - svcnth () * 8, z0), ++ svst1b (p0, x0 - svcnth () * 8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1b_s16_m9: ++** dech x0, all, mul #9 ++** st1b z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1b_s16_m9, svint16_t, int8_t, ++ svst1b_s16 (p0, x0 - svcnth () * 9, z0), ++ svst1b (p0, x0 - svcnth () * 9, z0)) ++ ++/* ++** st1b_vnum_s16_0: ++** st1b z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1b_vnum_s16_0, svint16_t, int8_t, ++ svst1b_vnum_s16 (p0, x0, 0, z0), ++ svst1b_vnum (p0, x0, 0, z0)) ++ ++/* ++** st1b_vnum_s16_1: ++** st1b z0\.h, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1b_vnum_s16_1, svint16_t, int8_t, ++ svst1b_vnum_s16 (p0, x0, 1, z0), ++ svst1b_vnum (p0, x0, 1, z0)) ++ ++/* ++** st1b_vnum_s16_7: ++** st1b z0\.h, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (st1b_vnum_s16_7, svint16_t, int8_t, ++ svst1b_vnum_s16 (p0, x0, 7, z0), ++ svst1b_vnum (p0, x0, 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1b_vnum_s16_8: ++** incb x0, all, mul #4 ++** st1b z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1b_vnum_s16_8, svint16_t, int8_t, ++ svst1b_vnum_s16 (p0, x0, 8, z0), ++ svst1b_vnum (p0, x0, 8, z0)) ++ ++/* ++** st1b_vnum_s16_m1: ++** st1b z0\.h, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1b_vnum_s16_m1, svint16_t, int8_t, ++ svst1b_vnum_s16 (p0, x0, -1, z0), ++ svst1b_vnum (p0, x0, -1, z0)) ++ ++/* ++** st1b_vnum_s16_m8: ++** st1b z0\.h, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (st1b_vnum_s16_m8, svint16_t, int8_t, ++ svst1b_vnum_s16 (p0, x0, -8, z0), ++ svst1b_vnum (p0, x0, -8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1b_vnum_s16_m9: ++** dech x0, all, mul #9 ++** st1b z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1b_vnum_s16_m9, svint16_t, int8_t, ++ svst1b_vnum_s16 (p0, x0, -9, z0), ++ svst1b_vnum (p0, x0, -9, z0)) ++ ++/* ++** st1b_vnum_s16_x1: ++** cnth (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** st1b z0\.h, p0, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** st1b z0\.h, p0, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_STORE (st1b_vnum_s16_x1, svint16_t, int8_t, ++ svst1b_vnum_s16 (p0, x0, x1, z0), ++ svst1b_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_s32.c +new file mode 100644 +index 000000000..85333aea9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_s32.c +@@ -0,0 +1,162 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st1b_s32_base: ++** st1b z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1b_s32_base, svint32_t, int8_t, ++ svst1b_s32 (p0, x0, z0), ++ svst1b (p0, x0, z0)) ++ ++/* ++** st1b_s32_index: ++** st1b z0\.s, p0, \[x0, x1\] ++** ret ++*/ ++TEST_STORE (st1b_s32_index, svint32_t, int8_t, ++ svst1b_s32 (p0, x0 + x1, z0), ++ svst1b (p0, x0 + x1, z0)) ++ ++/* ++** st1b_s32_1: ++** st1b z0\.s, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1b_s32_1, svint32_t, int8_t, ++ svst1b_s32 (p0, x0 + svcntw (), z0), ++ svst1b (p0, x0 + svcntw (), z0)) ++ ++/* ++** st1b_s32_7: ++** st1b z0\.s, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (st1b_s32_7, svint32_t, int8_t, ++ svst1b_s32 (p0, x0 + svcntw () * 7, z0), ++ svst1b (p0, x0 + svcntw () * 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1b_s32_8: ++** incb x0, all, mul #2 ++** st1b z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1b_s32_8, svint32_t, int8_t, ++ svst1b_s32 (p0, x0 + svcntw () * 8, z0), ++ svst1b (p0, x0 + svcntw () * 8, z0)) ++ ++/* ++** st1b_s32_m1: ++** st1b z0\.s, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1b_s32_m1, svint32_t, int8_t, ++ svst1b_s32 (p0, x0 - svcntw (), z0), ++ svst1b (p0, x0 - svcntw (), z0)) ++ ++/* ++** st1b_s32_m8: ++** st1b z0\.s, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (st1b_s32_m8, svint32_t, int8_t, ++ svst1b_s32 (p0, x0 - svcntw () * 8, z0), ++ svst1b (p0, x0 - svcntw () * 8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1b_s32_m9: ++** decw x0, all, mul #9 ++** st1b z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1b_s32_m9, svint32_t, int8_t, ++ svst1b_s32 (p0, x0 - svcntw () * 9, z0), ++ svst1b (p0, x0 - svcntw () * 9, z0)) ++ ++/* ++** st1b_vnum_s32_0: ++** st1b z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1b_vnum_s32_0, svint32_t, int8_t, ++ svst1b_vnum_s32 (p0, x0, 0, z0), ++ svst1b_vnum (p0, x0, 0, z0)) ++ ++/* ++** st1b_vnum_s32_1: ++** st1b z0\.s, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1b_vnum_s32_1, svint32_t, int8_t, ++ svst1b_vnum_s32 (p0, x0, 1, z0), ++ svst1b_vnum (p0, x0, 1, z0)) ++ ++/* ++** st1b_vnum_s32_7: ++** st1b z0\.s, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (st1b_vnum_s32_7, svint32_t, int8_t, ++ svst1b_vnum_s32 (p0, x0, 7, z0), ++ svst1b_vnum (p0, x0, 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1b_vnum_s32_8: ++** incb x0, all, mul #2 ++** st1b z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1b_vnum_s32_8, svint32_t, int8_t, ++ svst1b_vnum_s32 (p0, x0, 8, z0), ++ svst1b_vnum (p0, x0, 8, z0)) ++ ++/* ++** st1b_vnum_s32_m1: ++** st1b z0\.s, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1b_vnum_s32_m1, svint32_t, int8_t, ++ svst1b_vnum_s32 (p0, x0, -1, z0), ++ svst1b_vnum (p0, x0, -1, z0)) ++ ++/* ++** st1b_vnum_s32_m8: ++** st1b z0\.s, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (st1b_vnum_s32_m8, svint32_t, int8_t, ++ svst1b_vnum_s32 (p0, x0, -8, z0), ++ svst1b_vnum (p0, x0, -8, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st1b_vnum_s32_m9: ++** decw x0, all, mul #9 ++** st1b z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1b_vnum_s32_m9, svint32_t, int8_t, ++ svst1b_vnum_s32 (p0, x0, -9, z0), ++ svst1b_vnum (p0, x0, -9, z0)) ++ ++/* ++** st1b_vnum_s32_x1: ++** cntw (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** st1b z0\.s, p0, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** st1b z0\.s, p0, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_STORE (st1b_vnum_s32_x1, svint32_t, int8_t, ++ svst1b_vnum_s32 (p0, x0, x1, z0), ++ svst1b_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_s64.c +new file mode 100644 +index 000000000..321f168d9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_s64.c +@@ -0,0 +1,162 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st1b_s64_base: ++** st1b z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1b_s64_base, svint64_t, int8_t, ++ svst1b_s64 (p0, x0, z0), ++ svst1b (p0, x0, z0)) ++ ++/* ++** st1b_s64_index: ++** st1b z0\.d, p0, \[x0, x1\] ++** ret ++*/ ++TEST_STORE (st1b_s64_index, svint64_t, int8_t, ++ svst1b_s64 (p0, x0 + x1, z0), ++ svst1b (p0, x0 + x1, z0)) ++ ++/* ++** st1b_s64_1: ++** st1b z0\.d, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1b_s64_1, svint64_t, int8_t, ++ svst1b_s64 (p0, x0 + svcntd (), z0), ++ svst1b (p0, x0 + svcntd (), z0)) ++ ++/* ++** st1b_s64_7: ++** st1b z0\.d, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (st1b_s64_7, svint64_t, int8_t, ++ svst1b_s64 (p0, x0 + svcntd () * 7, z0), ++ svst1b (p0, x0 + svcntd () * 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1b_s64_8: ++** incb x0 ++** st1b z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1b_s64_8, svint64_t, int8_t, ++ svst1b_s64 (p0, x0 + svcntd () * 8, z0), ++ svst1b (p0, x0 + svcntd () * 8, z0)) ++ ++/* ++** st1b_s64_m1: ++** st1b z0\.d, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1b_s64_m1, svint64_t, int8_t, ++ svst1b_s64 (p0, x0 - svcntd (), z0), ++ svst1b (p0, x0 - svcntd (), z0)) ++ ++/* ++** st1b_s64_m8: ++** st1b z0\.d, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (st1b_s64_m8, svint64_t, int8_t, ++ svst1b_s64 (p0, x0 - svcntd () * 8, z0), ++ svst1b (p0, x0 - svcntd () * 8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1b_s64_m9: ++** decd x0, all, mul #9 ++** st1b z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1b_s64_m9, svint64_t, int8_t, ++ svst1b_s64 (p0, x0 - svcntd () * 9, z0), ++ svst1b (p0, x0 - svcntd () * 9, z0)) ++ ++/* ++** st1b_vnum_s64_0: ++** st1b z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1b_vnum_s64_0, svint64_t, int8_t, ++ svst1b_vnum_s64 (p0, x0, 0, z0), ++ svst1b_vnum (p0, x0, 0, z0)) ++ ++/* ++** st1b_vnum_s64_1: ++** st1b z0\.d, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1b_vnum_s64_1, svint64_t, int8_t, ++ svst1b_vnum_s64 (p0, x0, 1, z0), ++ svst1b_vnum (p0, x0, 1, z0)) ++ ++/* ++** st1b_vnum_s64_7: ++** st1b z0\.d, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (st1b_vnum_s64_7, svint64_t, int8_t, ++ svst1b_vnum_s64 (p0, x0, 7, z0), ++ svst1b_vnum (p0, x0, 7, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st1b_vnum_s64_8: ++** incb x0 ++** st1b z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1b_vnum_s64_8, svint64_t, int8_t, ++ svst1b_vnum_s64 (p0, x0, 8, z0), ++ svst1b_vnum (p0, x0, 8, z0)) ++ ++/* ++** st1b_vnum_s64_m1: ++** st1b z0\.d, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1b_vnum_s64_m1, svint64_t, int8_t, ++ svst1b_vnum_s64 (p0, x0, -1, z0), ++ svst1b_vnum (p0, x0, -1, z0)) ++ ++/* ++** st1b_vnum_s64_m8: ++** st1b z0\.d, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (st1b_vnum_s64_m8, svint64_t, int8_t, ++ svst1b_vnum_s64 (p0, x0, -8, z0), ++ svst1b_vnum (p0, x0, -8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1b_vnum_s64_m9: ++** decd x0, all, mul #9 ++** st1b z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1b_vnum_s64_m9, svint64_t, int8_t, ++ svst1b_vnum_s64 (p0, x0, -9, z0), ++ svst1b_vnum (p0, x0, -9, z0)) ++ ++/* ++** st1b_vnum_s64_x1: ++** cntd (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** st1b z0\.d, p0, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** st1b z0\.d, p0, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_STORE (st1b_vnum_s64_x1, svint64_t, int8_t, ++ svst1b_vnum_s64 (p0, x0, x1, z0), ++ svst1b_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_scatter_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_scatter_s32.c +new file mode 100644 +index 000000000..d59033356 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_scatter_s32.c +@@ -0,0 +1,104 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st1b_scatter_s32: ++** st1b z0\.s, p0, \[z1\.s\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1b_scatter_s32, svint32_t, svuint32_t, ++ svst1b_scatter_u32base_s32 (p0, z1, z0), ++ svst1b_scatter (p0, z1, z0)) ++ ++/* ++** st1b_scatter_x0_s32_offset: ++** st1b z0\.s, p0, \[x0, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1b_scatter_x0_s32_offset, svint32_t, svuint32_t, ++ svst1b_scatter_u32base_offset_s32 (p0, z1, x0, z0), ++ svst1b_scatter_offset (p0, z1, x0, z0)) ++ ++/* ++** st1b_scatter_m1_s32_offset: ++** mov (x[0-9]+), #?-1 ++** st1b z0\.s, p0, \[\1, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1b_scatter_m1_s32_offset, svint32_t, svuint32_t, ++ svst1b_scatter_u32base_offset_s32 (p0, z1, -1, z0), ++ svst1b_scatter_offset (p0, z1, -1, z0)) ++ ++/* ++** st1b_scatter_0_s32_offset: ++** st1b z0\.s, p0, \[z1\.s\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1b_scatter_0_s32_offset, svint32_t, svuint32_t, ++ svst1b_scatter_u32base_offset_s32 (p0, z1, 0, z0), ++ svst1b_scatter_offset (p0, z1, 0, z0)) ++ ++/* ++** st1b_scatter_5_s32_offset: ++** st1b z0\.s, p0, \[z1\.s, #5\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1b_scatter_5_s32_offset, svint32_t, svuint32_t, ++ svst1b_scatter_u32base_offset_s32 (p0, z1, 5, z0), ++ svst1b_scatter_offset (p0, z1, 5, z0)) ++ ++/* ++** st1b_scatter_31_s32_offset: ++** st1b z0\.s, p0, \[z1\.s, #31\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1b_scatter_31_s32_offset, svint32_t, svuint32_t, ++ svst1b_scatter_u32base_offset_s32 (p0, z1, 31, z0), ++ svst1b_scatter_offset (p0, z1, 31, z0)) ++ ++/* ++** st1b_scatter_32_s32_offset: ++** mov (x[0-9]+), #?32 ++** st1b z0\.s, p0, \[\1, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1b_scatter_32_s32_offset, svint32_t, svuint32_t, ++ svst1b_scatter_u32base_offset_s32 (p0, z1, 32, z0), ++ svst1b_scatter_offset (p0, z1, 32, z0)) ++ ++/* ++** 
st1b_scatter_x0_s32_s32offset: ++** st1b z0\.s, p0, \[x0, z1\.s, sxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1b_scatter_x0_s32_s32offset, svint32_t, int8_t, svint32_t, ++ svst1b_scatter_s32offset_s32 (p0, x0, z1, z0), ++ svst1b_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1b_scatter_s32_s32offset: ++** st1b z0\.s, p0, \[x0, z1\.s, sxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1b_scatter_s32_s32offset, svint32_t, int8_t, svint32_t, ++ svst1b_scatter_s32offset_s32 (p0, x0, z1, z0), ++ svst1b_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1b_scatter_x0_s32_u32offset: ++** st1b z0\.s, p0, \[x0, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1b_scatter_x0_s32_u32offset, svint32_t, int8_t, svuint32_t, ++ svst1b_scatter_u32offset_s32 (p0, x0, z1, z0), ++ svst1b_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1b_scatter_s32_u32offset: ++** st1b z0\.s, p0, \[x0, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1b_scatter_s32_u32offset, svint32_t, int8_t, svuint32_t, ++ svst1b_scatter_u32offset_s32 (p0, x0, z1, z0), ++ svst1b_scatter_offset (p0, x0, z1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_scatter_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_scatter_s64.c +new file mode 100644 +index 000000000..c7a35f1b4 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_scatter_s64.c +@@ -0,0 +1,122 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st1b_scatter_s64: ++** st1b z0\.d, p0, \[z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1b_scatter_s64, svint64_t, svuint64_t, ++ svst1b_scatter_u64base_s64 (p0, z1, z0), ++ svst1b_scatter (p0, z1, z0)) ++ ++/* ++** st1b_scatter_x0_s64_offset: ++** st1b z0\.d, p0, \[x0, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1b_scatter_x0_s64_offset, svint64_t, svuint64_t, ++ svst1b_scatter_u64base_offset_s64 (p0, z1, x0, z0), ++ svst1b_scatter_offset (p0, z1, x0, z0)) ++ ++/* ++** st1b_scatter_m1_s64_offset: ++** mov (x[0-9]+), #?-1 ++** st1b z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1b_scatter_m1_s64_offset, svint64_t, svuint64_t, ++ svst1b_scatter_u64base_offset_s64 (p0, z1, -1, z0), ++ svst1b_scatter_offset (p0, z1, -1, z0)) ++ ++/* ++** st1b_scatter_0_s64_offset: ++** st1b z0\.d, p0, \[z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1b_scatter_0_s64_offset, svint64_t, svuint64_t, ++ svst1b_scatter_u64base_offset_s64 (p0, z1, 0, z0), ++ svst1b_scatter_offset (p0, z1, 0, z0)) ++ ++/* ++** st1b_scatter_5_s64_offset: ++** st1b z0\.d, p0, \[z1\.d, #5\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1b_scatter_5_s64_offset, svint64_t, svuint64_t, ++ svst1b_scatter_u64base_offset_s64 (p0, z1, 5, z0), ++ svst1b_scatter_offset (p0, z1, 5, z0)) ++ ++/* ++** st1b_scatter_31_s64_offset: ++** st1b z0\.d, p0, \[z1\.d, #31\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1b_scatter_31_s64_offset, svint64_t, svuint64_t, ++ svst1b_scatter_u64base_offset_s64 (p0, z1, 31, z0), ++ svst1b_scatter_offset (p0, z1, 31, z0)) ++ ++/* ++** st1b_scatter_32_s64_offset: ++** mov (x[0-9]+), #?32 ++** st1b z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1b_scatter_32_s64_offset, svint64_t, svuint64_t, ++ svst1b_scatter_u64base_offset_s64 (p0, z1, 32, z0), ++ svst1b_scatter_offset (p0, z1, 32, z0)) ++ ++/* ++** st1b_scatter_x0_s64_s64offset: ++** st1b z0\.d, p0, \[x0, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1b_scatter_x0_s64_s64offset, svint64_t, int8_t, svint64_t, ++ 
svst1b_scatter_s64offset_s64 (p0, x0, z1, z0), ++ svst1b_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1b_scatter_s64_s64offset: ++** st1b z0\.d, p0, \[x0, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1b_scatter_s64_s64offset, svint64_t, int8_t, svint64_t, ++ svst1b_scatter_s64offset_s64 (p0, x0, z1, z0), ++ svst1b_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1b_scatter_ext_s64_s64offset: ++** st1b z0\.d, p0, \[x0, z1\.d, sxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1b_scatter_ext_s64_s64offset, svint64_t, int8_t, svint64_t, ++ svst1b_scatter_s64offset_s64 (p0, x0, svextw_s64_x (p0, z1), z0), ++ svst1b_scatter_offset (p0, x0, svextw_x (p0, z1), z0)) ++ ++/* ++** st1b_scatter_x0_s64_u64offset: ++** st1b z0\.d, p0, \[x0, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1b_scatter_x0_s64_u64offset, svint64_t, int8_t, svuint64_t, ++ svst1b_scatter_u64offset_s64 (p0, x0, z1, z0), ++ svst1b_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1b_scatter_s64_u64offset: ++** st1b z0\.d, p0, \[x0, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1b_scatter_s64_u64offset, svint64_t, int8_t, svuint64_t, ++ svst1b_scatter_u64offset_s64 (p0, x0, z1, z0), ++ svst1b_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1b_scatter_ext_s64_u64offset: ++** st1b z0\.d, p0, \[x0, z1\.d, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1b_scatter_ext_s64_u64offset, svint64_t, int8_t, svuint64_t, ++ svst1b_scatter_u64offset_s64 (p0, x0, svextw_u64_x (p0, z1), z0), ++ svst1b_scatter_offset (p0, x0, svextw_x (p0, z1), z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_scatter_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_scatter_u32.c +new file mode 100644 +index 000000000..e098cb9b7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_scatter_u32.c +@@ -0,0 +1,104 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st1b_scatter_u32: ++** st1b z0\.s, p0, \[z1\.s\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1b_scatter_u32, svuint32_t, svuint32_t, ++ svst1b_scatter_u32base_u32 (p0, z1, z0), ++ svst1b_scatter (p0, z1, z0)) ++ ++/* ++** st1b_scatter_x0_u32_offset: ++** st1b z0\.s, p0, \[x0, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1b_scatter_x0_u32_offset, svuint32_t, svuint32_t, ++ svst1b_scatter_u32base_offset_u32 (p0, z1, x0, z0), ++ svst1b_scatter_offset (p0, z1, x0, z0)) ++ ++/* ++** st1b_scatter_m1_u32_offset: ++** mov (x[0-9]+), #?-1 ++** st1b z0\.s, p0, \[\1, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1b_scatter_m1_u32_offset, svuint32_t, svuint32_t, ++ svst1b_scatter_u32base_offset_u32 (p0, z1, -1, z0), ++ svst1b_scatter_offset (p0, z1, -1, z0)) ++ ++/* ++** st1b_scatter_0_u32_offset: ++** st1b z0\.s, p0, \[z1\.s\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1b_scatter_0_u32_offset, svuint32_t, svuint32_t, ++ svst1b_scatter_u32base_offset_u32 (p0, z1, 0, z0), ++ svst1b_scatter_offset (p0, z1, 0, z0)) ++ ++/* ++** st1b_scatter_5_u32_offset: ++** st1b z0\.s, p0, \[z1\.s, #5\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1b_scatter_5_u32_offset, svuint32_t, svuint32_t, ++ svst1b_scatter_u32base_offset_u32 (p0, z1, 5, z0), ++ svst1b_scatter_offset (p0, z1, 5, z0)) ++ ++/* ++** st1b_scatter_31_u32_offset: ++** st1b z0\.s, p0, \[z1\.s, #31\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1b_scatter_31_u32_offset, svuint32_t, svuint32_t, ++ svst1b_scatter_u32base_offset_u32 (p0, z1, 31, z0), ++ svst1b_scatter_offset (p0, z1, 31, z0)) ++ ++/* ++** st1b_scatter_32_u32_offset: ++** mov (x[0-9]+), #?32 ++** st1b z0\.s, p0, \[\1, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1b_scatter_32_u32_offset, svuint32_t, svuint32_t, ++ svst1b_scatter_u32base_offset_u32 (p0, z1, 32, z0), ++ svst1b_scatter_offset (p0, z1, 32, z0)) ++ ++/* ++** st1b_scatter_x0_u32_s32offset: ++** st1b z0\.s, p0, \[x0, z1\.s, sxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1b_scatter_x0_u32_s32offset, svuint32_t, uint8_t, svint32_t, ++ svst1b_scatter_s32offset_u32 (p0, x0, z1, z0), ++ svst1b_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1b_scatter_u32_s32offset: ++** st1b z0\.s, p0, \[x0, z1\.s, sxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1b_scatter_u32_s32offset, svuint32_t, uint8_t, svint32_t, ++ svst1b_scatter_s32offset_u32 (p0, x0, z1, z0), ++ svst1b_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1b_scatter_x0_u32_u32offset: ++** st1b z0\.s, p0, \[x0, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1b_scatter_x0_u32_u32offset, svuint32_t, uint8_t, svuint32_t, ++ svst1b_scatter_u32offset_u32 (p0, x0, z1, z0), ++ svst1b_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1b_scatter_u32_u32offset: ++** st1b z0\.s, p0, \[x0, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1b_scatter_u32_u32offset, svuint32_t, uint8_t, svuint32_t, ++ svst1b_scatter_u32offset_u32 (p0, x0, z1, z0), ++ svst1b_scatter_offset (p0, x0, z1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_scatter_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_scatter_u64.c +new file mode 100644 +index 000000000..058d1313f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_scatter_u64.c +@@ -0,0 +1,122 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st1b_scatter_u64: ++** st1b z0\.d, p0, \[z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1b_scatter_u64, svuint64_t, svuint64_t, ++ svst1b_scatter_u64base_u64 (p0, z1, z0), ++ svst1b_scatter (p0, z1, z0)) ++ ++/* ++** st1b_scatter_x0_u64_offset: ++** st1b z0\.d, p0, \[x0, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1b_scatter_x0_u64_offset, svuint64_t, svuint64_t, ++ svst1b_scatter_u64base_offset_u64 (p0, z1, x0, z0), ++ svst1b_scatter_offset (p0, z1, x0, z0)) ++ ++/* ++** st1b_scatter_m1_u64_offset: ++** mov (x[0-9]+), #?-1 ++** st1b z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1b_scatter_m1_u64_offset, svuint64_t, svuint64_t, ++ svst1b_scatter_u64base_offset_u64 (p0, z1, -1, z0), ++ svst1b_scatter_offset (p0, z1, -1, z0)) ++ ++/* ++** st1b_scatter_0_u64_offset: ++** st1b z0\.d, p0, \[z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1b_scatter_0_u64_offset, svuint64_t, svuint64_t, ++ svst1b_scatter_u64base_offset_u64 (p0, z1, 0, z0), ++ svst1b_scatter_offset (p0, z1, 0, z0)) ++ ++/* ++** st1b_scatter_5_u64_offset: ++** st1b z0\.d, p0, \[z1\.d, #5\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1b_scatter_5_u64_offset, svuint64_t, svuint64_t, ++ svst1b_scatter_u64base_offset_u64 (p0, z1, 5, z0), ++ svst1b_scatter_offset (p0, z1, 5, z0)) ++ ++/* ++** st1b_scatter_31_u64_offset: ++** st1b z0\.d, p0, \[z1\.d, #31\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1b_scatter_31_u64_offset, svuint64_t, svuint64_t, ++ svst1b_scatter_u64base_offset_u64 (p0, z1, 31, z0), ++ svst1b_scatter_offset (p0, z1, 31, z0)) ++ ++/* ++** st1b_scatter_32_u64_offset: ++** mov (x[0-9]+), #?32 ++** st1b z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1b_scatter_32_u64_offset, svuint64_t, svuint64_t, ++ svst1b_scatter_u64base_offset_u64 (p0, z1, 32, z0), ++ svst1b_scatter_offset (p0, z1, 32, z0)) ++ ++/* ++** st1b_scatter_x0_u64_s64offset: ++** st1b z0\.d, p0, \[x0, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1b_scatter_x0_u64_s64offset, svuint64_t, uint8_t, svint64_t, ++ svst1b_scatter_s64offset_u64 (p0, x0, z1, z0), ++ svst1b_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1b_scatter_u64_s64offset: ++** st1b z0\.d, p0, \[x0, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1b_scatter_u64_s64offset, svuint64_t, uint8_t, svint64_t, ++ svst1b_scatter_s64offset_u64 (p0, x0, z1, z0), ++ svst1b_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1b_scatter_ext_u64_s64offset: ++** st1b z0\.d, p0, \[x0, z1\.d, sxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1b_scatter_ext_u64_s64offset, svuint64_t, uint8_t, svint64_t, ++ svst1b_scatter_s64offset_u64 (p0, x0, svextw_s64_x (p0, z1), z0), ++ svst1b_scatter_offset (p0, x0, svextw_x (p0, z1), z0)) ++ ++/* ++** st1b_scatter_x0_u64_u64offset: ++** st1b z0\.d, p0, \[x0, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1b_scatter_x0_u64_u64offset, svuint64_t, uint8_t, svuint64_t, ++ svst1b_scatter_u64offset_u64 (p0, x0, z1, z0), ++ svst1b_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1b_scatter_u64_u64offset: ++** st1b z0\.d, p0, \[x0, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1b_scatter_u64_u64offset, svuint64_t, uint8_t, svuint64_t, ++ svst1b_scatter_u64offset_u64 (p0, x0, z1, z0), ++ svst1b_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1b_scatter_ext_u64_u64offset: ++** st1b z0\.d, p0, \[x0, z1\.d, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1b_scatter_ext_u64_u64offset, svuint64_t, uint8_t, svuint64_t, ++ svst1b_scatter_u64offset_u64 (p0, x0, svextw_u64_x (p0, z1), 
z0), ++ svst1b_scatter_offset (p0, x0, svextw_x (p0, z1), z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_u16.c +new file mode 100644 +index 000000000..025a2212a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_u16.c +@@ -0,0 +1,162 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st1b_u16_base: ++** st1b z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1b_u16_base, svuint16_t, uint8_t, ++ svst1b_u16 (p0, x0, z0), ++ svst1b (p0, x0, z0)) ++ ++/* ++** st1b_u16_index: ++** st1b z0\.h, p0, \[x0, x1\] ++** ret ++*/ ++TEST_STORE (st1b_u16_index, svuint16_t, uint8_t, ++ svst1b_u16 (p0, x0 + x1, z0), ++ svst1b (p0, x0 + x1, z0)) ++ ++/* ++** st1b_u16_1: ++** st1b z0\.h, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1b_u16_1, svuint16_t, uint8_t, ++ svst1b_u16 (p0, x0 + svcnth (), z0), ++ svst1b (p0, x0 + svcnth (), z0)) ++ ++/* ++** st1b_u16_7: ++** st1b z0\.h, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (st1b_u16_7, svuint16_t, uint8_t, ++ svst1b_u16 (p0, x0 + svcnth () * 7, z0), ++ svst1b (p0, x0 + svcnth () * 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1b_u16_8: ++** incb x0, all, mul #4 ++** st1b z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1b_u16_8, svuint16_t, uint8_t, ++ svst1b_u16 (p0, x0 + svcnth () * 8, z0), ++ svst1b (p0, x0 + svcnth () * 8, z0)) ++ ++/* ++** st1b_u16_m1: ++** st1b z0\.h, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1b_u16_m1, svuint16_t, uint8_t, ++ svst1b_u16 (p0, x0 - svcnth (), z0), ++ svst1b (p0, x0 - svcnth (), z0)) ++ ++/* ++** st1b_u16_m8: ++** st1b z0\.h, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (st1b_u16_m8, svuint16_t, uint8_t, ++ svst1b_u16 (p0, x0 - svcnth () * 8, z0), ++ svst1b (p0, x0 - svcnth () * 8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1b_u16_m9: ++** dech x0, all, mul #9 ++** st1b z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1b_u16_m9, svuint16_t, uint8_t, ++ svst1b_u16 (p0, x0 - svcnth () * 9, z0), ++ svst1b (p0, x0 - svcnth () * 9, z0)) ++ ++/* ++** st1b_vnum_u16_0: ++** st1b z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1b_vnum_u16_0, svuint16_t, uint8_t, ++ svst1b_vnum_u16 (p0, x0, 0, z0), ++ svst1b_vnum (p0, x0, 0, z0)) ++ ++/* ++** st1b_vnum_u16_1: ++** st1b z0\.h, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1b_vnum_u16_1, svuint16_t, uint8_t, ++ svst1b_vnum_u16 (p0, x0, 1, z0), ++ svst1b_vnum (p0, x0, 1, z0)) ++ ++/* ++** st1b_vnum_u16_7: ++** st1b z0\.h, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (st1b_vnum_u16_7, svuint16_t, uint8_t, ++ svst1b_vnum_u16 (p0, x0, 7, z0), ++ svst1b_vnum (p0, x0, 7, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st1b_vnum_u16_8: ++** incb x0, all, mul #4 ++** st1b z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1b_vnum_u16_8, svuint16_t, uint8_t, ++ svst1b_vnum_u16 (p0, x0, 8, z0), ++ svst1b_vnum (p0, x0, 8, z0)) ++ ++/* ++** st1b_vnum_u16_m1: ++** st1b z0\.h, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1b_vnum_u16_m1, svuint16_t, uint8_t, ++ svst1b_vnum_u16 (p0, x0, -1, z0), ++ svst1b_vnum (p0, x0, -1, z0)) ++ ++/* ++** st1b_vnum_u16_m8: ++** st1b z0\.h, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (st1b_vnum_u16_m8, svuint16_t, uint8_t, ++ svst1b_vnum_u16 (p0, x0, -8, z0), ++ svst1b_vnum (p0, x0, -8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1b_vnum_u16_m9: ++** dech x0, all, mul #9 ++** st1b z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1b_vnum_u16_m9, svuint16_t, uint8_t, ++ svst1b_vnum_u16 (p0, x0, -9, z0), ++ svst1b_vnum (p0, x0, -9, z0)) ++ ++/* ++** st1b_vnum_u16_x1: ++** cnth (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** st1b z0\.h, p0, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** st1b z0\.h, p0, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_STORE (st1b_vnum_u16_x1, svuint16_t, uint8_t, ++ svst1b_vnum_u16 (p0, x0, x1, z0), ++ svst1b_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_u32.c +new file mode 100644 +index 000000000..5833cb44b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_u32.c +@@ -0,0 +1,162 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st1b_u32_base: ++** st1b z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1b_u32_base, svuint32_t, uint8_t, ++ svst1b_u32 (p0, x0, z0), ++ svst1b (p0, x0, z0)) ++ ++/* ++** st1b_u32_index: ++** st1b z0\.s, p0, \[x0, x1\] ++** ret ++*/ ++TEST_STORE (st1b_u32_index, svuint32_t, uint8_t, ++ svst1b_u32 (p0, x0 + x1, z0), ++ svst1b (p0, x0 + x1, z0)) ++ ++/* ++** st1b_u32_1: ++** st1b z0\.s, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1b_u32_1, svuint32_t, uint8_t, ++ svst1b_u32 (p0, x0 + svcntw (), z0), ++ svst1b (p0, x0 + svcntw (), z0)) ++ ++/* ++** st1b_u32_7: ++** st1b z0\.s, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (st1b_u32_7, svuint32_t, uint8_t, ++ svst1b_u32 (p0, x0 + svcntw () * 7, z0), ++ svst1b (p0, x0 + svcntw () * 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1b_u32_8: ++** incb x0, all, mul #2 ++** st1b z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1b_u32_8, svuint32_t, uint8_t, ++ svst1b_u32 (p0, x0 + svcntw () * 8, z0), ++ svst1b (p0, x0 + svcntw () * 8, z0)) ++ ++/* ++** st1b_u32_m1: ++** st1b z0\.s, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1b_u32_m1, svuint32_t, uint8_t, ++ svst1b_u32 (p0, x0 - svcntw (), z0), ++ svst1b (p0, x0 - svcntw (), z0)) ++ ++/* ++** st1b_u32_m8: ++** st1b z0\.s, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (st1b_u32_m8, svuint32_t, uint8_t, ++ svst1b_u32 (p0, x0 - svcntw () * 8, z0), ++ svst1b (p0, x0 - svcntw () * 8, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st1b_u32_m9: ++** decw x0, all, mul #9 ++** st1b z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1b_u32_m9, svuint32_t, uint8_t, ++ svst1b_u32 (p0, x0 - svcntw () * 9, z0), ++ svst1b (p0, x0 - svcntw () * 9, z0)) ++ ++/* ++** st1b_vnum_u32_0: ++** st1b z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1b_vnum_u32_0, svuint32_t, uint8_t, ++ svst1b_vnum_u32 (p0, x0, 0, z0), ++ svst1b_vnum (p0, x0, 0, z0)) ++ ++/* ++** st1b_vnum_u32_1: ++** st1b z0\.s, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1b_vnum_u32_1, svuint32_t, uint8_t, ++ svst1b_vnum_u32 (p0, x0, 1, z0), ++ svst1b_vnum (p0, x0, 1, z0)) ++ ++/* ++** st1b_vnum_u32_7: ++** st1b z0\.s, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (st1b_vnum_u32_7, svuint32_t, uint8_t, ++ svst1b_vnum_u32 (p0, x0, 7, z0), ++ svst1b_vnum (p0, x0, 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1b_vnum_u32_8: ++** incb x0, all, mul #2 ++** st1b z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1b_vnum_u32_8, svuint32_t, uint8_t, ++ svst1b_vnum_u32 (p0, x0, 8, z0), ++ svst1b_vnum (p0, x0, 8, z0)) ++ ++/* ++** st1b_vnum_u32_m1: ++** st1b z0\.s, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1b_vnum_u32_m1, svuint32_t, uint8_t, ++ svst1b_vnum_u32 (p0, x0, -1, z0), ++ svst1b_vnum (p0, x0, -1, z0)) ++ ++/* ++** st1b_vnum_u32_m8: ++** st1b z0\.s, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (st1b_vnum_u32_m8, svuint32_t, uint8_t, ++ svst1b_vnum_u32 (p0, x0, -8, z0), ++ svst1b_vnum (p0, x0, -8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1b_vnum_u32_m9: ++** decw x0, all, mul #9 ++** st1b z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1b_vnum_u32_m9, svuint32_t, uint8_t, ++ svst1b_vnum_u32 (p0, x0, -9, z0), ++ svst1b_vnum (p0, x0, -9, z0)) ++ ++/* ++** st1b_vnum_u32_x1: ++** cntw (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** st1b z0\.s, p0, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** st1b z0\.s, p0, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_STORE (st1b_vnum_u32_x1, svuint32_t, uint8_t, ++ svst1b_vnum_u32 (p0, x0, x1, z0), ++ svst1b_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_u64.c +new file mode 100644 +index 000000000..e96f4c486 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_u64.c +@@ -0,0 +1,162 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st1b_u64_base: ++** st1b z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1b_u64_base, svuint64_t, uint8_t, ++ svst1b_u64 (p0, x0, z0), ++ svst1b (p0, x0, z0)) ++ ++/* ++** st1b_u64_index: ++** st1b z0\.d, p0, \[x0, x1\] ++** ret ++*/ ++TEST_STORE (st1b_u64_index, svuint64_t, uint8_t, ++ svst1b_u64 (p0, x0 + x1, z0), ++ svst1b (p0, x0 + x1, z0)) ++ ++/* ++** st1b_u64_1: ++** st1b z0\.d, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1b_u64_1, svuint64_t, uint8_t, ++ svst1b_u64 (p0, x0 + svcntd (), z0), ++ svst1b (p0, x0 + svcntd (), z0)) ++ ++/* ++** st1b_u64_7: ++** st1b z0\.d, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (st1b_u64_7, svuint64_t, uint8_t, ++ svst1b_u64 (p0, x0 + svcntd () * 7, z0), ++ svst1b (p0, x0 + svcntd () * 7, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st1b_u64_8: ++** incb x0 ++** st1b z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1b_u64_8, svuint64_t, uint8_t, ++ svst1b_u64 (p0, x0 + svcntd () * 8, z0), ++ svst1b (p0, x0 + svcntd () * 8, z0)) ++ ++/* ++** st1b_u64_m1: ++** st1b z0\.d, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1b_u64_m1, svuint64_t, uint8_t, ++ svst1b_u64 (p0, x0 - svcntd (), z0), ++ svst1b (p0, x0 - svcntd (), z0)) ++ ++/* ++** st1b_u64_m8: ++** st1b z0\.d, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (st1b_u64_m8, svuint64_t, uint8_t, ++ svst1b_u64 (p0, x0 - svcntd () * 8, z0), ++ svst1b (p0, x0 - svcntd () * 8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1b_u64_m9: ++** decd x0, all, mul #9 ++** st1b z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1b_u64_m9, svuint64_t, uint8_t, ++ svst1b_u64 (p0, x0 - svcntd () * 9, z0), ++ svst1b (p0, x0 - svcntd () * 9, z0)) ++ ++/* ++** st1b_vnum_u64_0: ++** st1b z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1b_vnum_u64_0, svuint64_t, uint8_t, ++ svst1b_vnum_u64 (p0, x0, 0, z0), ++ svst1b_vnum (p0, x0, 0, z0)) ++ ++/* ++** st1b_vnum_u64_1: ++** st1b z0\.d, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1b_vnum_u64_1, svuint64_t, uint8_t, ++ svst1b_vnum_u64 (p0, x0, 1, z0), ++ svst1b_vnum (p0, x0, 1, z0)) ++ ++/* ++** st1b_vnum_u64_7: ++** st1b z0\.d, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (st1b_vnum_u64_7, svuint64_t, uint8_t, ++ svst1b_vnum_u64 (p0, x0, 7, z0), ++ svst1b_vnum (p0, x0, 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1b_vnum_u64_8: ++** incb x0 ++** st1b z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1b_vnum_u64_8, svuint64_t, uint8_t, ++ svst1b_vnum_u64 (p0, x0, 8, z0), ++ svst1b_vnum (p0, x0, 8, z0)) ++ ++/* ++** st1b_vnum_u64_m1: ++** st1b z0\.d, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1b_vnum_u64_m1, svuint64_t, uint8_t, ++ svst1b_vnum_u64 (p0, x0, -1, z0), ++ svst1b_vnum (p0, x0, -1, z0)) ++ ++/* ++** st1b_vnum_u64_m8: ++** st1b z0\.d, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (st1b_vnum_u64_m8, svuint64_t, uint8_t, ++ svst1b_vnum_u64 (p0, x0, -8, z0), ++ svst1b_vnum (p0, x0, -8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1b_vnum_u64_m9: ++** decd x0, all, mul #9 ++** st1b z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1b_vnum_u64_m9, svuint64_t, uint8_t, ++ svst1b_vnum_u64 (p0, x0, -9, z0), ++ svst1b_vnum (p0, x0, -9, z0)) ++ ++/* ++** st1b_vnum_u64_x1: ++** cntd (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** st1b z0\.d, p0, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** st1b z0\.d, p0, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_STORE (st1b_vnum_u64_x1, svuint64_t, uint8_t, ++ svst1b_vnum_u64 (p0, x0, x1, z0), ++ svst1b_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1h_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1h_s32.c +new file mode 100644 +index 000000000..3466e3293 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1h_s32.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st1h_s32_base: ++** st1h z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1h_s32_base, svint32_t, int16_t, ++ svst1h_s32 (p0, x0, z0), ++ svst1h (p0, x0, z0)) ++ ++/* ++** st1h_s32_index: ++** st1h z0\.s, p0, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_STORE (st1h_s32_index, svint32_t, int16_t, ++ svst1h_s32 (p0, x0 + x1, z0), ++ svst1h (p0, x0 + x1, z0)) ++ ++/* ++** st1h_s32_1: ++** st1h z0\.s, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1h_s32_1, svint32_t, int16_t, ++ svst1h_s32 (p0, x0 + svcntw (), z0), ++ svst1h (p0, x0 + svcntw (), z0)) ++ ++/* ++** st1h_s32_7: ++** st1h z0\.s, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (st1h_s32_7, svint32_t, int16_t, ++ svst1h_s32 (p0, x0 + svcntw () * 7, z0), ++ svst1h (p0, x0 + svcntw () * 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1h_s32_8: ++** incb x0, all, mul #4 ++** st1h z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1h_s32_8, svint32_t, int16_t, ++ svst1h_s32 (p0, x0 + svcntw () * 8, z0), ++ svst1h (p0, x0 + svcntw () * 8, z0)) ++ ++/* ++** st1h_s32_m1: ++** st1h z0\.s, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1h_s32_m1, svint32_t, int16_t, ++ svst1h_s32 (p0, x0 - svcntw (), z0), ++ svst1h (p0, x0 - svcntw (), z0)) ++ ++/* ++** st1h_s32_m8: ++** st1h z0\.s, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (st1h_s32_m8, svint32_t, int16_t, ++ svst1h_s32 (p0, x0 - svcntw () * 8, z0), ++ svst1h (p0, x0 - svcntw () * 8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1h_s32_m9: ++** dech x0, all, mul #9 ++** st1h z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1h_s32_m9, svint32_t, int16_t, ++ svst1h_s32 (p0, x0 - svcntw () * 9, z0), ++ svst1h (p0, x0 - svcntw () * 9, z0)) ++ ++/* ++** st1h_vnum_s32_0: ++** st1h z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1h_vnum_s32_0, svint32_t, int16_t, ++ svst1h_vnum_s32 (p0, x0, 0, z0), ++ svst1h_vnum (p0, x0, 0, z0)) ++ ++/* ++** st1h_vnum_s32_1: ++** st1h z0\.s, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1h_vnum_s32_1, svint32_t, int16_t, ++ svst1h_vnum_s32 (p0, x0, 1, z0), ++ svst1h_vnum (p0, x0, 1, z0)) ++ ++/* ++** st1h_vnum_s32_7: ++** st1h z0\.s, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (st1h_vnum_s32_7, svint32_t, int16_t, ++ svst1h_vnum_s32 (p0, x0, 7, z0), ++ svst1h_vnum (p0, x0, 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1h_vnum_s32_8: ++** incb x0, all, mul #4 ++** st1h z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1h_vnum_s32_8, svint32_t, int16_t, ++ svst1h_vnum_s32 (p0, x0, 8, z0), ++ svst1h_vnum (p0, x0, 8, z0)) ++ ++/* ++** st1h_vnum_s32_m1: ++** st1h z0\.s, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1h_vnum_s32_m1, svint32_t, int16_t, ++ svst1h_vnum_s32 (p0, x0, -1, z0), ++ svst1h_vnum (p0, x0, -1, z0)) ++ ++/* ++** st1h_vnum_s32_m8: ++** st1h z0\.s, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (st1h_vnum_s32_m8, svint32_t, int16_t, ++ svst1h_vnum_s32 (p0, x0, -8, z0), ++ svst1h_vnum (p0, x0, -8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1h_vnum_s32_m9: ++** dech x0, all, mul #9 ++** st1h z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1h_vnum_s32_m9, svint32_t, int16_t, ++ svst1h_vnum_s32 (p0, x0, -9, z0), ++ svst1h_vnum (p0, x0, -9, z0)) ++ ++/* Using MUL to calculate an index would also be OK. 
*/ ++/* ++** st1h_vnum_s32_x1: ++** cnth (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** st1h z0\.s, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (st1h_vnum_s32_x1, svint32_t, int16_t, ++ svst1h_vnum_s32 (p0, x0, x1, z0), ++ svst1h_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1h_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1h_s64.c +new file mode 100644 +index 000000000..c5df3b0c3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1h_s64.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st1h_s64_base: ++** st1h z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1h_s64_base, svint64_t, int16_t, ++ svst1h_s64 (p0, x0, z0), ++ svst1h (p0, x0, z0)) ++ ++/* ++** st1h_s64_index: ++** st1h z0\.d, p0, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_STORE (st1h_s64_index, svint64_t, int16_t, ++ svst1h_s64 (p0, x0 + x1, z0), ++ svst1h (p0, x0 + x1, z0)) ++ ++/* ++** st1h_s64_1: ++** st1h z0\.d, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1h_s64_1, svint64_t, int16_t, ++ svst1h_s64 (p0, x0 + svcntd (), z0), ++ svst1h (p0, x0 + svcntd (), z0)) ++ ++/* ++** st1h_s64_7: ++** st1h z0\.d, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (st1h_s64_7, svint64_t, int16_t, ++ svst1h_s64 (p0, x0 + svcntd () * 7, z0), ++ svst1h (p0, x0 + svcntd () * 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1h_s64_8: ++** incb x0, all, mul #2 ++** st1h z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1h_s64_8, svint64_t, int16_t, ++ svst1h_s64 (p0, x0 + svcntd () * 8, z0), ++ svst1h (p0, x0 + svcntd () * 8, z0)) ++ ++/* ++** st1h_s64_m1: ++** st1h z0\.d, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1h_s64_m1, svint64_t, int16_t, ++ svst1h_s64 (p0, x0 - svcntd (), z0), ++ svst1h (p0, x0 - svcntd (), z0)) ++ ++/* ++** st1h_s64_m8: ++** st1h z0\.d, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (st1h_s64_m8, svint64_t, int16_t, ++ svst1h_s64 (p0, x0 - svcntd () * 8, z0), ++ svst1h (p0, x0 - svcntd () * 8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1h_s64_m9: ++** decw x0, all, mul #9 ++** st1h z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1h_s64_m9, svint64_t, int16_t, ++ svst1h_s64 (p0, x0 - svcntd () * 9, z0), ++ svst1h (p0, x0 - svcntd () * 9, z0)) ++ ++/* ++** st1h_vnum_s64_0: ++** st1h z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1h_vnum_s64_0, svint64_t, int16_t, ++ svst1h_vnum_s64 (p0, x0, 0, z0), ++ svst1h_vnum (p0, x0, 0, z0)) ++ ++/* ++** st1h_vnum_s64_1: ++** st1h z0\.d, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1h_vnum_s64_1, svint64_t, int16_t, ++ svst1h_vnum_s64 (p0, x0, 1, z0), ++ svst1h_vnum (p0, x0, 1, z0)) ++ ++/* ++** st1h_vnum_s64_7: ++** st1h z0\.d, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (st1h_vnum_s64_7, svint64_t, int16_t, ++ svst1h_vnum_s64 (p0, x0, 7, z0), ++ svst1h_vnum (p0, x0, 7, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st1h_vnum_s64_8: ++** incb x0, all, mul #2 ++** st1h z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1h_vnum_s64_8, svint64_t, int16_t, ++ svst1h_vnum_s64 (p0, x0, 8, z0), ++ svst1h_vnum (p0, x0, 8, z0)) ++ ++/* ++** st1h_vnum_s64_m1: ++** st1h z0\.d, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1h_vnum_s64_m1, svint64_t, int16_t, ++ svst1h_vnum_s64 (p0, x0, -1, z0), ++ svst1h_vnum (p0, x0, -1, z0)) ++ ++/* ++** st1h_vnum_s64_m8: ++** st1h z0\.d, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (st1h_vnum_s64_m8, svint64_t, int16_t, ++ svst1h_vnum_s64 (p0, x0, -8, z0), ++ svst1h_vnum (p0, x0, -8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1h_vnum_s64_m9: ++** decw x0, all, mul #9 ++** st1h z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1h_vnum_s64_m9, svint64_t, int16_t, ++ svst1h_vnum_s64 (p0, x0, -9, z0), ++ svst1h_vnum (p0, x0, -9, z0)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** st1h_vnum_s64_x1: ++** cntw (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** st1h z0\.d, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (st1h_vnum_s64_x1, svint64_t, int16_t, ++ svst1h_vnum_s64 (p0, x0, x1, z0), ++ svst1h_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1h_scatter_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1h_scatter_s32.c +new file mode 100644 +index 000000000..2a23d41f3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1h_scatter_s32.c +@@ -0,0 +1,207 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st1h_scatter_s32: ++** st1h z0\.s, p0, \[z1\.s\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_s32, svint32_t, svuint32_t, ++ svst1h_scatter_u32base_s32 (p0, z1, z0), ++ svst1h_scatter (p0, z1, z0)) ++ ++/* ++** st1h_scatter_x0_s32_offset: ++** st1h z0\.s, p0, \[x0, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_x0_s32_offset, svint32_t, svuint32_t, ++ svst1h_scatter_u32base_offset_s32 (p0, z1, x0, z0), ++ svst1h_scatter_offset (p0, z1, x0, z0)) ++ ++/* ++** st1h_scatter_m2_s32_offset: ++** mov (x[0-9]+), #?-2 ++** st1h z0\.s, p0, \[\1, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_m2_s32_offset, svint32_t, svuint32_t, ++ svst1h_scatter_u32base_offset_s32 (p0, z1, -2, z0), ++ svst1h_scatter_offset (p0, z1, -2, z0)) ++ ++/* ++** st1h_scatter_0_s32_offset: ++** st1h z0\.s, p0, \[z1\.s\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_0_s32_offset, svint32_t, svuint32_t, ++ svst1h_scatter_u32base_offset_s32 (p0, z1, 0, z0), ++ svst1h_scatter_offset (p0, z1, 0, z0)) ++ ++/* ++** st1h_scatter_5_s32_offset: ++** mov (x[0-9]+), #?5 ++** st1h z0\.s, p0, \[\1, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_5_s32_offset, svint32_t, svuint32_t, ++ svst1h_scatter_u32base_offset_s32 (p0, z1, 5, z0), ++ svst1h_scatter_offset (p0, z1, 5, z0)) ++ ++/* ++** st1h_scatter_6_s32_offset: ++** st1h z0\.s, p0, \[z1\.s, #6\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_6_s32_offset, svint32_t, svuint32_t, ++ svst1h_scatter_u32base_offset_s32 (p0, z1, 6, z0), ++ svst1h_scatter_offset (p0, z1, 6, z0)) ++ ++/* ++** st1h_scatter_62_s32_offset: ++** st1h z0\.s, p0, \[z1\.s, #62\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_62_s32_offset, svint32_t, svuint32_t, ++ svst1h_scatter_u32base_offset_s32 (p0, z1, 62, z0), ++ svst1h_scatter_offset (p0, z1, 62, z0)) ++ ++/* ++** st1h_scatter_64_s32_offset: 
++** mov (x[0-9]+), #?64 ++** st1h z0\.s, p0, \[\1, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_64_s32_offset, svint32_t, svuint32_t, ++ svst1h_scatter_u32base_offset_s32 (p0, z1, 64, z0), ++ svst1h_scatter_offset (p0, z1, 64, z0)) ++ ++/* ++** st1h_scatter_x0_s32_index: ++** lsl (x[0-9]+), x0, #?1 ++** st1h z0\.s, p0, \[\1, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_x0_s32_index, svint32_t, svuint32_t, ++ svst1h_scatter_u32base_index_s32 (p0, z1, x0, z0), ++ svst1h_scatter_index (p0, z1, x0, z0)) ++ ++/* ++** st1h_scatter_m1_s32_index: ++** mov (x[0-9]+), #?-2 ++** st1h z0\.s, p0, \[\1, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_m1_s32_index, svint32_t, svuint32_t, ++ svst1h_scatter_u32base_index_s32 (p0, z1, -1, z0), ++ svst1h_scatter_index (p0, z1, -1, z0)) ++ ++/* ++** st1h_scatter_0_s32_index: ++** st1h z0\.s, p0, \[z1\.s\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_0_s32_index, svint32_t, svuint32_t, ++ svst1h_scatter_u32base_index_s32 (p0, z1, 0, z0), ++ svst1h_scatter_index (p0, z1, 0, z0)) ++ ++/* ++** st1h_scatter_5_s32_index: ++** st1h z0\.s, p0, \[z1\.s, #10\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_5_s32_index, svint32_t, svuint32_t, ++ svst1h_scatter_u32base_index_s32 (p0, z1, 5, z0), ++ svst1h_scatter_index (p0, z1, 5, z0)) ++ ++/* ++** st1h_scatter_31_s32_index: ++** st1h z0\.s, p0, \[z1\.s, #62\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_31_s32_index, svint32_t, svuint32_t, ++ svst1h_scatter_u32base_index_s32 (p0, z1, 31, z0), ++ svst1h_scatter_index (p0, z1, 31, z0)) ++ ++/* ++** st1h_scatter_32_s32_index: ++** mov (x[0-9]+), #?64 ++** st1h z0\.s, p0, \[\1, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_32_s32_index, svint32_t, svuint32_t, ++ svst1h_scatter_u32base_index_s32 (p0, z1, 32, z0), ++ svst1h_scatter_index (p0, z1, 32, z0)) ++ ++/* ++** st1h_scatter_x0_s32_s32offset: ++** st1h z0\.s, p0, \[x0, z1\.s, sxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1h_scatter_x0_s32_s32offset, svint32_t, int16_t, svint32_t, ++ svst1h_scatter_s32offset_s32 (p0, x0, z1, z0), ++ svst1h_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1h_scatter_s32_s32offset: ++** st1h z0\.s, p0, \[x0, z1\.s, sxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1h_scatter_s32_s32offset, svint32_t, int16_t, svint32_t, ++ svst1h_scatter_s32offset_s32 (p0, x0, z1, z0), ++ svst1h_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1h_scatter_x0_s32_u32offset: ++** st1h z0\.s, p0, \[x0, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1h_scatter_x0_s32_u32offset, svint32_t, int16_t, svuint32_t, ++ svst1h_scatter_u32offset_s32 (p0, x0, z1, z0), ++ svst1h_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1h_scatter_s32_u32offset: ++** st1h z0\.s, p0, \[x0, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1h_scatter_s32_u32offset, svint32_t, int16_t, svuint32_t, ++ svst1h_scatter_u32offset_s32 (p0, x0, z1, z0), ++ svst1h_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1h_scatter_x0_s32_s32index: ++** st1h z0\.s, p0, \[x0, z1\.s, sxtw 1\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1h_scatter_x0_s32_s32index, svint32_t, int16_t, svint32_t, ++ svst1h_scatter_s32index_s32 (p0, x0, z1, z0), ++ svst1h_scatter_index (p0, x0, z1, z0)) ++ ++/* ++** st1h_scatter_s32_s32index: ++** st1h z0\.s, p0, \[x0, z1\.s, sxtw 1\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1h_scatter_s32_s32index, svint32_t, int16_t, svint32_t, ++ svst1h_scatter_s32index_s32 (p0, x0, z1, z0), ++ svst1h_scatter_index (p0, x0, z1, z0)) 
++ ++/* ++** st1h_scatter_x0_s32_u32index: ++** st1h z0\.s, p0, \[x0, z1\.s, uxtw 1\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1h_scatter_x0_s32_u32index, svint32_t, int16_t, svuint32_t, ++ svst1h_scatter_u32index_s32 (p0, x0, z1, z0), ++ svst1h_scatter_index (p0, x0, z1, z0)) ++ ++/* ++** st1h_scatter_s32_u32index: ++** st1h z0\.s, p0, \[x0, z1\.s, uxtw 1\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1h_scatter_s32_u32index, svint32_t, int16_t, svuint32_t, ++ svst1h_scatter_u32index_s32 (p0, x0, z1, z0), ++ svst1h_scatter_index (p0, x0, z1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1h_scatter_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1h_scatter_s64.c +new file mode 100644 +index 000000000..6a1adb056 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1h_scatter_s64.c +@@ -0,0 +1,243 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st1h_scatter_s64: ++** st1h z0\.d, p0, \[z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_s64, svint64_t, svuint64_t, ++ svst1h_scatter_u64base_s64 (p0, z1, z0), ++ svst1h_scatter (p0, z1, z0)) ++ ++/* ++** st1h_scatter_x0_s64_offset: ++** st1h z0\.d, p0, \[x0, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_x0_s64_offset, svint64_t, svuint64_t, ++ svst1h_scatter_u64base_offset_s64 (p0, z1, x0, z0), ++ svst1h_scatter_offset (p0, z1, x0, z0)) ++ ++/* ++** st1h_scatter_m2_s64_offset: ++** mov (x[0-9]+), #?-2 ++** st1h z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_m2_s64_offset, svint64_t, svuint64_t, ++ svst1h_scatter_u64base_offset_s64 (p0, z1, -2, z0), ++ svst1h_scatter_offset (p0, z1, -2, z0)) ++ ++/* ++** st1h_scatter_0_s64_offset: ++** st1h z0\.d, p0, \[z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_0_s64_offset, svint64_t, svuint64_t, ++ svst1h_scatter_u64base_offset_s64 (p0, z1, 0, z0), ++ svst1h_scatter_offset (p0, z1, 0, z0)) ++ ++/* ++** st1h_scatter_5_s64_offset: ++** mov (x[0-9]+), #?5 ++** st1h z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_5_s64_offset, svint64_t, svuint64_t, ++ svst1h_scatter_u64base_offset_s64 (p0, z1, 5, z0), ++ svst1h_scatter_offset (p0, z1, 5, z0)) ++ ++/* ++** st1h_scatter_6_s64_offset: ++** st1h z0\.d, p0, \[z1\.d, #6\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_6_s64_offset, svint64_t, svuint64_t, ++ svst1h_scatter_u64base_offset_s64 (p0, z1, 6, z0), ++ svst1h_scatter_offset (p0, z1, 6, z0)) ++ ++/* ++** st1h_scatter_62_s64_offset: ++** st1h z0\.d, p0, \[z1\.d, #62\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_62_s64_offset, svint64_t, svuint64_t, ++ svst1h_scatter_u64base_offset_s64 (p0, z1, 62, z0), ++ svst1h_scatter_offset (p0, z1, 62, z0)) ++ ++/* ++** st1h_scatter_64_s64_offset: ++** mov (x[0-9]+), #?64 ++** st1h z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_64_s64_offset, svint64_t, svuint64_t, ++ svst1h_scatter_u64base_offset_s64 (p0, z1, 64, z0), ++ svst1h_scatter_offset (p0, z1, 64, z0)) ++ ++/* ++** st1h_scatter_x0_s64_index: ++** lsl (x[0-9]+), x0, #?1 ++** st1h z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_x0_s64_index, svint64_t, svuint64_t, ++ svst1h_scatter_u64base_index_s64 (p0, z1, x0, z0), ++ svst1h_scatter_index (p0, z1, x0, z0)) ++ ++/* ++** st1h_scatter_m1_s64_index: ++** mov (x[0-9]+), #?-2 ++** st1h z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_m1_s64_index, 
svint64_t, svuint64_t, ++ svst1h_scatter_u64base_index_s64 (p0, z1, -1, z0), ++ svst1h_scatter_index (p0, z1, -1, z0)) ++ ++/* ++** st1h_scatter_0_s64_index: ++** st1h z0\.d, p0, \[z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_0_s64_index, svint64_t, svuint64_t, ++ svst1h_scatter_u64base_index_s64 (p0, z1, 0, z0), ++ svst1h_scatter_index (p0, z1, 0, z0)) ++ ++/* ++** st1h_scatter_5_s64_index: ++** st1h z0\.d, p0, \[z1\.d, #10\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_5_s64_index, svint64_t, svuint64_t, ++ svst1h_scatter_u64base_index_s64 (p0, z1, 5, z0), ++ svst1h_scatter_index (p0, z1, 5, z0)) ++ ++/* ++** st1h_scatter_31_s64_index: ++** st1h z0\.d, p0, \[z1\.d, #62\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_31_s64_index, svint64_t, svuint64_t, ++ svst1h_scatter_u64base_index_s64 (p0, z1, 31, z0), ++ svst1h_scatter_index (p0, z1, 31, z0)) ++ ++/* ++** st1h_scatter_32_s64_index: ++** mov (x[0-9]+), #?64 ++** st1h z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_32_s64_index, svint64_t, svuint64_t, ++ svst1h_scatter_u64base_index_s64 (p0, z1, 32, z0), ++ svst1h_scatter_index (p0, z1, 32, z0)) ++ ++/* ++** st1h_scatter_x0_s64_s64offset: ++** st1h z0\.d, p0, \[x0, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1h_scatter_x0_s64_s64offset, svint64_t, int16_t, svint64_t, ++ svst1h_scatter_s64offset_s64 (p0, x0, z1, z0), ++ svst1h_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1h_scatter_s64_s64offset: ++** st1h z0\.d, p0, \[x0, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1h_scatter_s64_s64offset, svint64_t, int16_t, svint64_t, ++ svst1h_scatter_s64offset_s64 (p0, x0, z1, z0), ++ svst1h_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1h_scatter_ext_s64_s64offset: ++** st1h z0\.d, p0, \[x0, z1\.d, sxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1h_scatter_ext_s64_s64offset, svint64_t, int16_t, svint64_t, ++ svst1h_scatter_s64offset_s64 (p0, x0, svextw_s64_x (p0, z1), z0), ++ svst1h_scatter_offset (p0, x0, svextw_x (p0, z1), z0)) ++ ++/* ++** st1h_scatter_x0_s64_u64offset: ++** st1h z0\.d, p0, \[x0, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1h_scatter_x0_s64_u64offset, svint64_t, int16_t, svuint64_t, ++ svst1h_scatter_u64offset_s64 (p0, x0, z1, z0), ++ svst1h_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1h_scatter_s64_u64offset: ++** st1h z0\.d, p0, \[x0, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1h_scatter_s64_u64offset, svint64_t, int16_t, svuint64_t, ++ svst1h_scatter_u64offset_s64 (p0, x0, z1, z0), ++ svst1h_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1h_scatter_ext_s64_u64offset: ++** st1h z0\.d, p0, \[x0, z1\.d, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1h_scatter_ext_s64_u64offset, svint64_t, int16_t, svuint64_t, ++ svst1h_scatter_u64offset_s64 (p0, x0, svextw_u64_x (p0, z1), z0), ++ svst1h_scatter_offset (p0, x0, svextw_x (p0, z1), z0)) ++ ++/* ++** st1h_scatter_x0_s64_s64index: ++** st1h z0\.d, p0, \[x0, z1\.d, lsl 1\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1h_scatter_x0_s64_s64index, svint64_t, int16_t, svint64_t, ++ svst1h_scatter_s64index_s64 (p0, x0, z1, z0), ++ svst1h_scatter_index (p0, x0, z1, z0)) ++ ++/* ++** st1h_scatter_s64_s64index: ++** st1h z0\.d, p0, \[x0, z1\.d, lsl 1\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1h_scatter_s64_s64index, svint64_t, int16_t, svint64_t, ++ svst1h_scatter_s64index_s64 (p0, x0, z1, z0), ++ svst1h_scatter_index (p0, x0, z1, z0)) ++ ++/* ++** st1h_scatter_ext_s64_s64index: ++** st1h z0\.d, p0, \[x0, z1\.d, sxtw 1\] ++** ret ++*/ 
++TEST_STORE_SCATTER_SZ (st1h_scatter_ext_s64_s64index, svint64_t, int16_t, svint64_t, ++ svst1h_scatter_s64index_s64 (p0, x0, svextw_s64_x (p0, z1), z0), ++ svst1h_scatter_index (p0, x0, svextw_x (p0, z1), z0)) ++ ++/* ++** st1h_scatter_x0_s64_u64index: ++** st1h z0\.d, p0, \[x0, z1\.d, lsl 1\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1h_scatter_x0_s64_u64index, svint64_t, int16_t, svuint64_t, ++ svst1h_scatter_u64index_s64 (p0, x0, z1, z0), ++ svst1h_scatter_index (p0, x0, z1, z0)) ++ ++/* ++** st1h_scatter_s64_u64index: ++** st1h z0\.d, p0, \[x0, z1\.d, lsl 1\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1h_scatter_s64_u64index, svint64_t, int16_t, svuint64_t, ++ svst1h_scatter_u64index_s64 (p0, x0, z1, z0), ++ svst1h_scatter_index (p0, x0, z1, z0)) ++ ++/* ++** st1h_scatter_ext_s64_u64index: ++** st1h z0\.d, p0, \[x0, z1\.d, uxtw 1\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1h_scatter_ext_s64_u64index, svint64_t, int16_t, svuint64_t, ++ svst1h_scatter_u64index_s64 (p0, x0, svextw_u64_x (p0, z1), z0), ++ svst1h_scatter_index (p0, x0, svextw_x (p0, z1), z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1h_scatter_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1h_scatter_u32.c +new file mode 100644 +index 000000000..12197315d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1h_scatter_u32.c +@@ -0,0 +1,207 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st1h_scatter_u32: ++** st1h z0\.s, p0, \[z1\.s\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_u32, svuint32_t, svuint32_t, ++ svst1h_scatter_u32base_u32 (p0, z1, z0), ++ svst1h_scatter (p0, z1, z0)) ++ ++/* ++** st1h_scatter_x0_u32_offset: ++** st1h z0\.s, p0, \[x0, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_x0_u32_offset, svuint32_t, svuint32_t, ++ svst1h_scatter_u32base_offset_u32 (p0, z1, x0, z0), ++ svst1h_scatter_offset (p0, z1, x0, z0)) ++ ++/* ++** st1h_scatter_m2_u32_offset: ++** mov (x[0-9]+), #?-2 ++** st1h z0\.s, p0, \[\1, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_m2_u32_offset, svuint32_t, svuint32_t, ++ svst1h_scatter_u32base_offset_u32 (p0, z1, -2, z0), ++ svst1h_scatter_offset (p0, z1, -2, z0)) ++ ++/* ++** st1h_scatter_0_u32_offset: ++** st1h z0\.s, p0, \[z1\.s\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_0_u32_offset, svuint32_t, svuint32_t, ++ svst1h_scatter_u32base_offset_u32 (p0, z1, 0, z0), ++ svst1h_scatter_offset (p0, z1, 0, z0)) ++ ++/* ++** st1h_scatter_5_u32_offset: ++** mov (x[0-9]+), #?5 ++** st1h z0\.s, p0, \[\1, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_5_u32_offset, svuint32_t, svuint32_t, ++ svst1h_scatter_u32base_offset_u32 (p0, z1, 5, z0), ++ svst1h_scatter_offset (p0, z1, 5, z0)) ++ ++/* ++** st1h_scatter_6_u32_offset: ++** st1h z0\.s, p0, \[z1\.s, #6\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_6_u32_offset, svuint32_t, svuint32_t, ++ svst1h_scatter_u32base_offset_u32 (p0, z1, 6, z0), ++ svst1h_scatter_offset (p0, z1, 6, z0)) ++ ++/* ++** st1h_scatter_62_u32_offset: ++** st1h z0\.s, p0, \[z1\.s, #62\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_62_u32_offset, svuint32_t, svuint32_t, ++ svst1h_scatter_u32base_offset_u32 (p0, z1, 62, z0), ++ svst1h_scatter_offset (p0, z1, 62, z0)) ++ ++/* ++** st1h_scatter_64_u32_offset: ++** mov (x[0-9]+), #?64 ++** st1h z0\.s, p0, \[\1, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_64_u32_offset, svuint32_t, 
svuint32_t, ++ svst1h_scatter_u32base_offset_u32 (p0, z1, 64, z0), ++ svst1h_scatter_offset (p0, z1, 64, z0)) ++ ++/* ++** st1h_scatter_x0_u32_index: ++** lsl (x[0-9]+), x0, #?1 ++** st1h z0\.s, p0, \[\1, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_x0_u32_index, svuint32_t, svuint32_t, ++ svst1h_scatter_u32base_index_u32 (p0, z1, x0, z0), ++ svst1h_scatter_index (p0, z1, x0, z0)) ++ ++/* ++** st1h_scatter_m1_u32_index: ++** mov (x[0-9]+), #?-2 ++** st1h z0\.s, p0, \[\1, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_m1_u32_index, svuint32_t, svuint32_t, ++ svst1h_scatter_u32base_index_u32 (p0, z1, -1, z0), ++ svst1h_scatter_index (p0, z1, -1, z0)) ++ ++/* ++** st1h_scatter_0_u32_index: ++** st1h z0\.s, p0, \[z1\.s\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_0_u32_index, svuint32_t, svuint32_t, ++ svst1h_scatter_u32base_index_u32 (p0, z1, 0, z0), ++ svst1h_scatter_index (p0, z1, 0, z0)) ++ ++/* ++** st1h_scatter_5_u32_index: ++** st1h z0\.s, p0, \[z1\.s, #10\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_5_u32_index, svuint32_t, svuint32_t, ++ svst1h_scatter_u32base_index_u32 (p0, z1, 5, z0), ++ svst1h_scatter_index (p0, z1, 5, z0)) ++ ++/* ++** st1h_scatter_31_u32_index: ++** st1h z0\.s, p0, \[z1\.s, #62\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_31_u32_index, svuint32_t, svuint32_t, ++ svst1h_scatter_u32base_index_u32 (p0, z1, 31, z0), ++ svst1h_scatter_index (p0, z1, 31, z0)) ++ ++/* ++** st1h_scatter_32_u32_index: ++** mov (x[0-9]+), #?64 ++** st1h z0\.s, p0, \[\1, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_32_u32_index, svuint32_t, svuint32_t, ++ svst1h_scatter_u32base_index_u32 (p0, z1, 32, z0), ++ svst1h_scatter_index (p0, z1, 32, z0)) ++ ++/* ++** st1h_scatter_x0_u32_s32offset: ++** st1h z0\.s, p0, \[x0, z1\.s, sxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1h_scatter_x0_u32_s32offset, svuint32_t, uint16_t, svint32_t, ++ svst1h_scatter_s32offset_u32 (p0, x0, z1, z0), ++ svst1h_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1h_scatter_u32_s32offset: ++** st1h z0\.s, p0, \[x0, z1\.s, sxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1h_scatter_u32_s32offset, svuint32_t, uint16_t, svint32_t, ++ svst1h_scatter_s32offset_u32 (p0, x0, z1, z0), ++ svst1h_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1h_scatter_x0_u32_u32offset: ++** st1h z0\.s, p0, \[x0, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1h_scatter_x0_u32_u32offset, svuint32_t, uint16_t, svuint32_t, ++ svst1h_scatter_u32offset_u32 (p0, x0, z1, z0), ++ svst1h_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1h_scatter_u32_u32offset: ++** st1h z0\.s, p0, \[x0, z1\.s, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1h_scatter_u32_u32offset, svuint32_t, uint16_t, svuint32_t, ++ svst1h_scatter_u32offset_u32 (p0, x0, z1, z0), ++ svst1h_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1h_scatter_x0_u32_s32index: ++** st1h z0\.s, p0, \[x0, z1\.s, sxtw 1\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1h_scatter_x0_u32_s32index, svuint32_t, uint16_t, svint32_t, ++ svst1h_scatter_s32index_u32 (p0, x0, z1, z0), ++ svst1h_scatter_index (p0, x0, z1, z0)) ++ ++/* ++** st1h_scatter_u32_s32index: ++** st1h z0\.s, p0, \[x0, z1\.s, sxtw 1\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1h_scatter_u32_s32index, svuint32_t, uint16_t, svint32_t, ++ svst1h_scatter_s32index_u32 (p0, x0, z1, z0), ++ svst1h_scatter_index (p0, x0, z1, z0)) ++ ++/* ++** st1h_scatter_x0_u32_u32index: ++** st1h z0\.s, p0, \[x0, z1\.s, uxtw 1\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ 
(st1h_scatter_x0_u32_u32index, svuint32_t, uint16_t, svuint32_t, ++ svst1h_scatter_u32index_u32 (p0, x0, z1, z0), ++ svst1h_scatter_index (p0, x0, z1, z0)) ++ ++/* ++** st1h_scatter_u32_u32index: ++** st1h z0\.s, p0, \[x0, z1\.s, uxtw 1\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1h_scatter_u32_u32index, svuint32_t, uint16_t, svuint32_t, ++ svst1h_scatter_u32index_u32 (p0, x0, z1, z0), ++ svst1h_scatter_index (p0, x0, z1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1h_scatter_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1h_scatter_u64.c +new file mode 100644 +index 000000000..7021ea68f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1h_scatter_u64.c +@@ -0,0 +1,243 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st1h_scatter_u64: ++** st1h z0\.d, p0, \[z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_u64, svuint64_t, svuint64_t, ++ svst1h_scatter_u64base_u64 (p0, z1, z0), ++ svst1h_scatter (p0, z1, z0)) ++ ++/* ++** st1h_scatter_x0_u64_offset: ++** st1h z0\.d, p0, \[x0, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_x0_u64_offset, svuint64_t, svuint64_t, ++ svst1h_scatter_u64base_offset_u64 (p0, z1, x0, z0), ++ svst1h_scatter_offset (p0, z1, x0, z0)) ++ ++/* ++** st1h_scatter_m2_u64_offset: ++** mov (x[0-9]+), #?-2 ++** st1h z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_m2_u64_offset, svuint64_t, svuint64_t, ++ svst1h_scatter_u64base_offset_u64 (p0, z1, -2, z0), ++ svst1h_scatter_offset (p0, z1, -2, z0)) ++ ++/* ++** st1h_scatter_0_u64_offset: ++** st1h z0\.d, p0, \[z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_0_u64_offset, svuint64_t, svuint64_t, ++ svst1h_scatter_u64base_offset_u64 (p0, z1, 0, z0), ++ svst1h_scatter_offset (p0, z1, 0, z0)) ++ ++/* ++** st1h_scatter_5_u64_offset: ++** mov (x[0-9]+), #?5 ++** st1h z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_5_u64_offset, svuint64_t, svuint64_t, ++ svst1h_scatter_u64base_offset_u64 (p0, z1, 5, z0), ++ svst1h_scatter_offset (p0, z1, 5, z0)) ++ ++/* ++** st1h_scatter_6_u64_offset: ++** st1h z0\.d, p0, \[z1\.d, #6\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_6_u64_offset, svuint64_t, svuint64_t, ++ svst1h_scatter_u64base_offset_u64 (p0, z1, 6, z0), ++ svst1h_scatter_offset (p0, z1, 6, z0)) ++ ++/* ++** st1h_scatter_62_u64_offset: ++** st1h z0\.d, p0, \[z1\.d, #62\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_62_u64_offset, svuint64_t, svuint64_t, ++ svst1h_scatter_u64base_offset_u64 (p0, z1, 62, z0), ++ svst1h_scatter_offset (p0, z1, 62, z0)) ++ ++/* ++** st1h_scatter_64_u64_offset: ++** mov (x[0-9]+), #?64 ++** st1h z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_64_u64_offset, svuint64_t, svuint64_t, ++ svst1h_scatter_u64base_offset_u64 (p0, z1, 64, z0), ++ svst1h_scatter_offset (p0, z1, 64, z0)) ++ ++/* ++** st1h_scatter_x0_u64_index: ++** lsl (x[0-9]+), x0, #?1 ++** st1h z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_x0_u64_index, svuint64_t, svuint64_t, ++ svst1h_scatter_u64base_index_u64 (p0, z1, x0, z0), ++ svst1h_scatter_index (p0, z1, x0, z0)) ++ ++/* ++** st1h_scatter_m1_u64_index: ++** mov (x[0-9]+), #?-2 ++** st1h z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_m1_u64_index, svuint64_t, svuint64_t, ++ svst1h_scatter_u64base_index_u64 (p0, z1, -1, z0), ++ svst1h_scatter_index (p0, z1, 
-1, z0)) ++ ++/* ++** st1h_scatter_0_u64_index: ++** st1h z0\.d, p0, \[z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_0_u64_index, svuint64_t, svuint64_t, ++ svst1h_scatter_u64base_index_u64 (p0, z1, 0, z0), ++ svst1h_scatter_index (p0, z1, 0, z0)) ++ ++/* ++** st1h_scatter_5_u64_index: ++** st1h z0\.d, p0, \[z1\.d, #10\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_5_u64_index, svuint64_t, svuint64_t, ++ svst1h_scatter_u64base_index_u64 (p0, z1, 5, z0), ++ svst1h_scatter_index (p0, z1, 5, z0)) ++ ++/* ++** st1h_scatter_31_u64_index: ++** st1h z0\.d, p0, \[z1\.d, #62\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_31_u64_index, svuint64_t, svuint64_t, ++ svst1h_scatter_u64base_index_u64 (p0, z1, 31, z0), ++ svst1h_scatter_index (p0, z1, 31, z0)) ++ ++/* ++** st1h_scatter_32_u64_index: ++** mov (x[0-9]+), #?64 ++** st1h z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1h_scatter_32_u64_index, svuint64_t, svuint64_t, ++ svst1h_scatter_u64base_index_u64 (p0, z1, 32, z0), ++ svst1h_scatter_index (p0, z1, 32, z0)) ++ ++/* ++** st1h_scatter_x0_u64_s64offset: ++** st1h z0\.d, p0, \[x0, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1h_scatter_x0_u64_s64offset, svuint64_t, uint16_t, svint64_t, ++ svst1h_scatter_s64offset_u64 (p0, x0, z1, z0), ++ svst1h_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1h_scatter_u64_s64offset: ++** st1h z0\.d, p0, \[x0, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1h_scatter_u64_s64offset, svuint64_t, uint16_t, svint64_t, ++ svst1h_scatter_s64offset_u64 (p0, x0, z1, z0), ++ svst1h_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1h_scatter_ext_u64_s64offset: ++** st1h z0\.d, p0, \[x0, z1\.d, sxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1h_scatter_ext_u64_s64offset, svuint64_t, uint16_t, svint64_t, ++ svst1h_scatter_s64offset_u64 (p0, x0, svextw_s64_x (p0, z1), z0), ++ svst1h_scatter_offset (p0, x0, svextw_x (p0, z1), z0)) ++ ++/* ++** st1h_scatter_x0_u64_u64offset: ++** st1h z0\.d, p0, \[x0, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1h_scatter_x0_u64_u64offset, svuint64_t, uint16_t, svuint64_t, ++ svst1h_scatter_u64offset_u64 (p0, x0, z1, z0), ++ svst1h_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1h_scatter_u64_u64offset: ++** st1h z0\.d, p0, \[x0, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1h_scatter_u64_u64offset, svuint64_t, uint16_t, svuint64_t, ++ svst1h_scatter_u64offset_u64 (p0, x0, z1, z0), ++ svst1h_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1h_scatter_ext_u64_u64offset: ++** st1h z0\.d, p0, \[x0, z1\.d, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1h_scatter_ext_u64_u64offset, svuint64_t, uint16_t, svuint64_t, ++ svst1h_scatter_u64offset_u64 (p0, x0, svextw_u64_x (p0, z1), z0), ++ svst1h_scatter_offset (p0, x0, svextw_x (p0, z1), z0)) ++ ++/* ++** st1h_scatter_x0_u64_s64index: ++** st1h z0\.d, p0, \[x0, z1\.d, lsl 1\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1h_scatter_x0_u64_s64index, svuint64_t, uint16_t, svint64_t, ++ svst1h_scatter_s64index_u64 (p0, x0, z1, z0), ++ svst1h_scatter_index (p0, x0, z1, z0)) ++ ++/* ++** st1h_scatter_u64_s64index: ++** st1h z0\.d, p0, \[x0, z1\.d, lsl 1\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1h_scatter_u64_s64index, svuint64_t, uint16_t, svint64_t, ++ svst1h_scatter_s64index_u64 (p0, x0, z1, z0), ++ svst1h_scatter_index (p0, x0, z1, z0)) ++ ++/* ++** st1h_scatter_ext_u64_s64index: ++** st1h z0\.d, p0, \[x0, z1\.d, sxtw 1\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1h_scatter_ext_u64_s64index, svuint64_t, uint16_t, svint64_t, ++ 
svst1h_scatter_s64index_u64 (p0, x0, svextw_s64_x (p0, z1), z0), ++ svst1h_scatter_index (p0, x0, svextw_x (p0, z1), z0)) ++ ++/* ++** st1h_scatter_x0_u64_u64index: ++** st1h z0\.d, p0, \[x0, z1\.d, lsl 1\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1h_scatter_x0_u64_u64index, svuint64_t, uint16_t, svuint64_t, ++ svst1h_scatter_u64index_u64 (p0, x0, z1, z0), ++ svst1h_scatter_index (p0, x0, z1, z0)) ++ ++/* ++** st1h_scatter_u64_u64index: ++** st1h z0\.d, p0, \[x0, z1\.d, lsl 1\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1h_scatter_u64_u64index, svuint64_t, uint16_t, svuint64_t, ++ svst1h_scatter_u64index_u64 (p0, x0, z1, z0), ++ svst1h_scatter_index (p0, x0, z1, z0)) ++ ++/* ++** st1h_scatter_ext_u64_u64index: ++** st1h z0\.d, p0, \[x0, z1\.d, uxtw 1\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1h_scatter_ext_u64_u64index, svuint64_t, uint16_t, svuint64_t, ++ svst1h_scatter_u64index_u64 (p0, x0, svextw_u64_x (p0, z1), z0), ++ svst1h_scatter_index (p0, x0, svextw_x (p0, z1), z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1h_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1h_u32.c +new file mode 100644 +index 000000000..49111043b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1h_u32.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st1h_u32_base: ++** st1h z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1h_u32_base, svuint32_t, uint16_t, ++ svst1h_u32 (p0, x0, z0), ++ svst1h (p0, x0, z0)) ++ ++/* ++** st1h_u32_index: ++** st1h z0\.s, p0, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_STORE (st1h_u32_index, svuint32_t, uint16_t, ++ svst1h_u32 (p0, x0 + x1, z0), ++ svst1h (p0, x0 + x1, z0)) ++ ++/* ++** st1h_u32_1: ++** st1h z0\.s, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1h_u32_1, svuint32_t, uint16_t, ++ svst1h_u32 (p0, x0 + svcntw (), z0), ++ svst1h (p0, x0 + svcntw (), z0)) ++ ++/* ++** st1h_u32_7: ++** st1h z0\.s, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (st1h_u32_7, svuint32_t, uint16_t, ++ svst1h_u32 (p0, x0 + svcntw () * 7, z0), ++ svst1h (p0, x0 + svcntw () * 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1h_u32_8: ++** incb x0, all, mul #4 ++** st1h z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1h_u32_8, svuint32_t, uint16_t, ++ svst1h_u32 (p0, x0 + svcntw () * 8, z0), ++ svst1h (p0, x0 + svcntw () * 8, z0)) ++ ++/* ++** st1h_u32_m1: ++** st1h z0\.s, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1h_u32_m1, svuint32_t, uint16_t, ++ svst1h_u32 (p0, x0 - svcntw (), z0), ++ svst1h (p0, x0 - svcntw (), z0)) ++ ++/* ++** st1h_u32_m8: ++** st1h z0\.s, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (st1h_u32_m8, svuint32_t, uint16_t, ++ svst1h_u32 (p0, x0 - svcntw () * 8, z0), ++ svst1h (p0, x0 - svcntw () * 8, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st1h_u32_m9: ++** dech x0, all, mul #9 ++** st1h z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1h_u32_m9, svuint32_t, uint16_t, ++ svst1h_u32 (p0, x0 - svcntw () * 9, z0), ++ svst1h (p0, x0 - svcntw () * 9, z0)) ++ ++/* ++** st1h_vnum_u32_0: ++** st1h z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1h_vnum_u32_0, svuint32_t, uint16_t, ++ svst1h_vnum_u32 (p0, x0, 0, z0), ++ svst1h_vnum (p0, x0, 0, z0)) ++ ++/* ++** st1h_vnum_u32_1: ++** st1h z0\.s, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1h_vnum_u32_1, svuint32_t, uint16_t, ++ svst1h_vnum_u32 (p0, x0, 1, z0), ++ svst1h_vnum (p0, x0, 1, z0)) ++ ++/* ++** st1h_vnum_u32_7: ++** st1h z0\.s, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (st1h_vnum_u32_7, svuint32_t, uint16_t, ++ svst1h_vnum_u32 (p0, x0, 7, z0), ++ svst1h_vnum (p0, x0, 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1h_vnum_u32_8: ++** incb x0, all, mul #4 ++** st1h z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1h_vnum_u32_8, svuint32_t, uint16_t, ++ svst1h_vnum_u32 (p0, x0, 8, z0), ++ svst1h_vnum (p0, x0, 8, z0)) ++ ++/* ++** st1h_vnum_u32_m1: ++** st1h z0\.s, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1h_vnum_u32_m1, svuint32_t, uint16_t, ++ svst1h_vnum_u32 (p0, x0, -1, z0), ++ svst1h_vnum (p0, x0, -1, z0)) ++ ++/* ++** st1h_vnum_u32_m8: ++** st1h z0\.s, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (st1h_vnum_u32_m8, svuint32_t, uint16_t, ++ svst1h_vnum_u32 (p0, x0, -8, z0), ++ svst1h_vnum (p0, x0, -8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1h_vnum_u32_m9: ++** dech x0, all, mul #9 ++** st1h z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1h_vnum_u32_m9, svuint32_t, uint16_t, ++ svst1h_vnum_u32 (p0, x0, -9, z0), ++ svst1h_vnum (p0, x0, -9, z0)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** st1h_vnum_u32_x1: ++** cnth (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** st1h z0\.s, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (st1h_vnum_u32_x1, svuint32_t, uint16_t, ++ svst1h_vnum_u32 (p0, x0, x1, z0), ++ svst1h_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1h_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1h_u64.c +new file mode 100644 +index 000000000..448cadb49 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1h_u64.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st1h_u64_base: ++** st1h z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1h_u64_base, svuint64_t, uint16_t, ++ svst1h_u64 (p0, x0, z0), ++ svst1h (p0, x0, z0)) ++ ++/* ++** st1h_u64_index: ++** st1h z0\.d, p0, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_STORE (st1h_u64_index, svuint64_t, uint16_t, ++ svst1h_u64 (p0, x0 + x1, z0), ++ svst1h (p0, x0 + x1, z0)) ++ ++/* ++** st1h_u64_1: ++** st1h z0\.d, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1h_u64_1, svuint64_t, uint16_t, ++ svst1h_u64 (p0, x0 + svcntd (), z0), ++ svst1h (p0, x0 + svcntd (), z0)) ++ ++/* ++** st1h_u64_7: ++** st1h z0\.d, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (st1h_u64_7, svuint64_t, uint16_t, ++ svst1h_u64 (p0, x0 + svcntd () * 7, z0), ++ svst1h (p0, x0 + svcntd () * 7, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st1h_u64_8: ++** incb x0, all, mul #2 ++** st1h z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1h_u64_8, svuint64_t, uint16_t, ++ svst1h_u64 (p0, x0 + svcntd () * 8, z0), ++ svst1h (p0, x0 + svcntd () * 8, z0)) ++ ++/* ++** st1h_u64_m1: ++** st1h z0\.d, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1h_u64_m1, svuint64_t, uint16_t, ++ svst1h_u64 (p0, x0 - svcntd (), z0), ++ svst1h (p0, x0 - svcntd (), z0)) ++ ++/* ++** st1h_u64_m8: ++** st1h z0\.d, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (st1h_u64_m8, svuint64_t, uint16_t, ++ svst1h_u64 (p0, x0 - svcntd () * 8, z0), ++ svst1h (p0, x0 - svcntd () * 8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1h_u64_m9: ++** decw x0, all, mul #9 ++** st1h z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1h_u64_m9, svuint64_t, uint16_t, ++ svst1h_u64 (p0, x0 - svcntd () * 9, z0), ++ svst1h (p0, x0 - svcntd () * 9, z0)) ++ ++/* ++** st1h_vnum_u64_0: ++** st1h z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1h_vnum_u64_0, svuint64_t, uint16_t, ++ svst1h_vnum_u64 (p0, x0, 0, z0), ++ svst1h_vnum (p0, x0, 0, z0)) ++ ++/* ++** st1h_vnum_u64_1: ++** st1h z0\.d, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1h_vnum_u64_1, svuint64_t, uint16_t, ++ svst1h_vnum_u64 (p0, x0, 1, z0), ++ svst1h_vnum (p0, x0, 1, z0)) ++ ++/* ++** st1h_vnum_u64_7: ++** st1h z0\.d, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (st1h_vnum_u64_7, svuint64_t, uint16_t, ++ svst1h_vnum_u64 (p0, x0, 7, z0), ++ svst1h_vnum (p0, x0, 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1h_vnum_u64_8: ++** incb x0, all, mul #2 ++** st1h z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1h_vnum_u64_8, svuint64_t, uint16_t, ++ svst1h_vnum_u64 (p0, x0, 8, z0), ++ svst1h_vnum (p0, x0, 8, z0)) ++ ++/* ++** st1h_vnum_u64_m1: ++** st1h z0\.d, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1h_vnum_u64_m1, svuint64_t, uint16_t, ++ svst1h_vnum_u64 (p0, x0, -1, z0), ++ svst1h_vnum (p0, x0, -1, z0)) ++ ++/* ++** st1h_vnum_u64_m8: ++** st1h z0\.d, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (st1h_vnum_u64_m8, svuint64_t, uint16_t, ++ svst1h_vnum_u64 (p0, x0, -8, z0), ++ svst1h_vnum (p0, x0, -8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1h_vnum_u64_m9: ++** decw x0, all, mul #9 ++** st1h z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1h_vnum_u64_m9, svuint64_t, uint16_t, ++ svst1h_vnum_u64 (p0, x0, -9, z0), ++ svst1h_vnum (p0, x0, -9, z0)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** st1h_vnum_u64_x1: ++** cntw (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** st1h z0\.d, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (st1h_vnum_u64_x1, svuint64_t, uint16_t, ++ svst1h_vnum_u64 (p0, x0, x1, z0), ++ svst1h_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1w_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1w_s64.c +new file mode 100644 +index 000000000..0893ce926 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1w_s64.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st1w_s64_base: ++** st1w z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1w_s64_base, svint64_t, int32_t, ++ svst1w_s64 (p0, x0, z0), ++ svst1w (p0, x0, z0)) ++ ++/* ++** st1w_s64_index: ++** st1w z0\.d, p0, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_STORE (st1w_s64_index, svint64_t, int32_t, ++ svst1w_s64 (p0, x0 + x1, z0), ++ svst1w (p0, x0 + x1, z0)) ++ ++/* ++** st1w_s64_1: ++** st1w z0\.d, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1w_s64_1, svint64_t, int32_t, ++ svst1w_s64 (p0, x0 + svcntd (), z0), ++ svst1w (p0, x0 + svcntd (), z0)) ++ ++/* ++** st1w_s64_7: ++** st1w z0\.d, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (st1w_s64_7, svint64_t, int32_t, ++ svst1w_s64 (p0, x0 + svcntd () * 7, z0), ++ svst1w (p0, x0 + svcntd () * 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1w_s64_8: ++** incb x0, all, mul #4 ++** st1w z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1w_s64_8, svint64_t, int32_t, ++ svst1w_s64 (p0, x0 + svcntd () * 8, z0), ++ svst1w (p0, x0 + svcntd () * 8, z0)) ++ ++/* ++** st1w_s64_m1: ++** st1w z0\.d, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1w_s64_m1, svint64_t, int32_t, ++ svst1w_s64 (p0, x0 - svcntd (), z0), ++ svst1w (p0, x0 - svcntd (), z0)) ++ ++/* ++** st1w_s64_m8: ++** st1w z0\.d, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (st1w_s64_m8, svint64_t, int32_t, ++ svst1w_s64 (p0, x0 - svcntd () * 8, z0), ++ svst1w (p0, x0 - svcntd () * 8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1w_s64_m9: ++** dech x0, all, mul #9 ++** st1w z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1w_s64_m9, svint64_t, int32_t, ++ svst1w_s64 (p0, x0 - svcntd () * 9, z0), ++ svst1w (p0, x0 - svcntd () * 9, z0)) ++ ++/* ++** st1w_vnum_s64_0: ++** st1w z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1w_vnum_s64_0, svint64_t, int32_t, ++ svst1w_vnum_s64 (p0, x0, 0, z0), ++ svst1w_vnum (p0, x0, 0, z0)) ++ ++/* ++** st1w_vnum_s64_1: ++** st1w z0\.d, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1w_vnum_s64_1, svint64_t, int32_t, ++ svst1w_vnum_s64 (p0, x0, 1, z0), ++ svst1w_vnum (p0, x0, 1, z0)) ++ ++/* ++** st1w_vnum_s64_7: ++** st1w z0\.d, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (st1w_vnum_s64_7, svint64_t, int32_t, ++ svst1w_vnum_s64 (p0, x0, 7, z0), ++ svst1w_vnum (p0, x0, 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1w_vnum_s64_8: ++** incb x0, all, mul #4 ++** st1w z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1w_vnum_s64_8, svint64_t, int32_t, ++ svst1w_vnum_s64 (p0, x0, 8, z0), ++ svst1w_vnum (p0, x0, 8, z0)) ++ ++/* ++** st1w_vnum_s64_m1: ++** st1w z0\.d, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1w_vnum_s64_m1, svint64_t, int32_t, ++ svst1w_vnum_s64 (p0, x0, -1, z0), ++ svst1w_vnum (p0, x0, -1, z0)) ++ ++/* ++** st1w_vnum_s64_m8: ++** st1w z0\.d, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (st1w_vnum_s64_m8, svint64_t, int32_t, ++ svst1w_vnum_s64 (p0, x0, -8, z0), ++ svst1w_vnum (p0, x0, -8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1w_vnum_s64_m9: ++** dech x0, all, mul #9 ++** st1w z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1w_vnum_s64_m9, svint64_t, int32_t, ++ svst1w_vnum_s64 (p0, x0, -9, z0), ++ svst1w_vnum (p0, x0, -9, z0)) ++ ++/* Using MUL to calculate an index would also be OK. 
*/ ++/* ++** st1w_vnum_s64_x1: ++** cnth (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** st1w z0\.d, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (st1w_vnum_s64_x1, svint64_t, int32_t, ++ svst1w_vnum_s64 (p0, x0, x1, z0), ++ svst1w_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1w_scatter_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1w_scatter_s64.c +new file mode 100644 +index 000000000..2363f592b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1w_scatter_s64.c +@@ -0,0 +1,263 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st1w_scatter_s64: ++** st1w z0\.d, p0, \[z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1w_scatter_s64, svint64_t, svuint64_t, ++ svst1w_scatter_u64base_s64 (p0, z1, z0), ++ svst1w_scatter (p0, z1, z0)) ++ ++/* ++** st1w_scatter_x0_s64_offset: ++** st1w z0\.d, p0, \[x0, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1w_scatter_x0_s64_offset, svint64_t, svuint64_t, ++ svst1w_scatter_u64base_offset_s64 (p0, z1, x0, z0), ++ svst1w_scatter_offset (p0, z1, x0, z0)) ++ ++/* ++** st1w_scatter_m4_s64_offset: ++** mov (x[0-9]+), #?-4 ++** st1w z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1w_scatter_m4_s64_offset, svint64_t, svuint64_t, ++ svst1w_scatter_u64base_offset_s64 (p0, z1, -4, z0), ++ svst1w_scatter_offset (p0, z1, -4, z0)) ++ ++/* ++** st1w_scatter_0_s64_offset: ++** st1w z0\.d, p0, \[z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1w_scatter_0_s64_offset, svint64_t, svuint64_t, ++ svst1w_scatter_u64base_offset_s64 (p0, z1, 0, z0), ++ svst1w_scatter_offset (p0, z1, 0, z0)) ++ ++/* ++** st1w_scatter_5_s64_offset: ++** mov (x[0-9]+), #?5 ++** st1w z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1w_scatter_5_s64_offset, svint64_t, svuint64_t, ++ svst1w_scatter_u64base_offset_s64 (p0, z1, 5, z0), ++ svst1w_scatter_offset (p0, z1, 5, z0)) ++ ++/* ++** st1w_scatter_6_s64_offset: ++** mov (x[0-9]+), #?6 ++** st1w z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1w_scatter_6_s64_offset, svint64_t, svuint64_t, ++ svst1w_scatter_u64base_offset_s64 (p0, z1, 6, z0), ++ svst1w_scatter_offset (p0, z1, 6, z0)) ++ ++/* ++** st1w_scatter_7_s64_offset: ++** mov (x[0-9]+), #?7 ++** st1w z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1w_scatter_7_s64_offset, svint64_t, svuint64_t, ++ svst1w_scatter_u64base_offset_s64 (p0, z1, 7, z0), ++ svst1w_scatter_offset (p0, z1, 7, z0)) ++ ++/* ++** st1w_scatter_8_s64_offset: ++** st1w z0\.d, p0, \[z1\.d, #8\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1w_scatter_8_s64_offset, svint64_t, svuint64_t, ++ svst1w_scatter_u64base_offset_s64 (p0, z1, 8, z0), ++ svst1w_scatter_offset (p0, z1, 8, z0)) ++ ++/* ++** st1w_scatter_124_s64_offset: ++** st1w z0\.d, p0, \[z1\.d, #124\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1w_scatter_124_s64_offset, svint64_t, svuint64_t, ++ svst1w_scatter_u64base_offset_s64 (p0, z1, 124, z0), ++ svst1w_scatter_offset (p0, z1, 124, z0)) ++ ++/* ++** st1w_scatter_128_s64_offset: ++** mov (x[0-9]+), #?128 ++** st1w z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1w_scatter_128_s64_offset, svint64_t, svuint64_t, ++ svst1w_scatter_u64base_offset_s64 (p0, z1, 128, z0), ++ svst1w_scatter_offset (p0, z1, 128, z0)) ++ ++/* ++** st1w_scatter_x0_s64_index: ++** lsl (x[0-9]+), x0, #?2 ++** st1w z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1w_scatter_x0_s64_index, 
svint64_t, svuint64_t, ++ svst1w_scatter_u64base_index_s64 (p0, z1, x0, z0), ++ svst1w_scatter_index (p0, z1, x0, z0)) ++ ++/* ++** st1w_scatter_m1_s64_index: ++** mov (x[0-9]+), #?-4 ++** st1w z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1w_scatter_m1_s64_index, svint64_t, svuint64_t, ++ svst1w_scatter_u64base_index_s64 (p0, z1, -1, z0), ++ svst1w_scatter_index (p0, z1, -1, z0)) ++ ++/* ++** st1w_scatter_0_s64_index: ++** st1w z0\.d, p0, \[z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1w_scatter_0_s64_index, svint64_t, svuint64_t, ++ svst1w_scatter_u64base_index_s64 (p0, z1, 0, z0), ++ svst1w_scatter_index (p0, z1, 0, z0)) ++ ++/* ++** st1w_scatter_5_s64_index: ++** st1w z0\.d, p0, \[z1\.d, #20\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1w_scatter_5_s64_index, svint64_t, svuint64_t, ++ svst1w_scatter_u64base_index_s64 (p0, z1, 5, z0), ++ svst1w_scatter_index (p0, z1, 5, z0)) ++ ++/* ++** st1w_scatter_31_s64_index: ++** st1w z0\.d, p0, \[z1\.d, #124\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1w_scatter_31_s64_index, svint64_t, svuint64_t, ++ svst1w_scatter_u64base_index_s64 (p0, z1, 31, z0), ++ svst1w_scatter_index (p0, z1, 31, z0)) ++ ++/* ++** st1w_scatter_32_s64_index: ++** mov (x[0-9]+), #?128 ++** st1w z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1w_scatter_32_s64_index, svint64_t, svuint64_t, ++ svst1w_scatter_u64base_index_s64 (p0, z1, 32, z0), ++ svst1w_scatter_index (p0, z1, 32, z0)) ++ ++/* ++** st1w_scatter_x0_s64_s64offset: ++** st1w z0\.d, p0, \[x0, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1w_scatter_x0_s64_s64offset, svint64_t, int32_t, svint64_t, ++ svst1w_scatter_s64offset_s64 (p0, x0, z1, z0), ++ svst1w_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1w_scatter_s64_s64offset: ++** st1w z0\.d, p0, \[x0, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1w_scatter_s64_s64offset, svint64_t, int32_t, svint64_t, ++ svst1w_scatter_s64offset_s64 (p0, x0, z1, z0), ++ svst1w_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1w_scatter_ext_s64_s64offset: ++** st1w z0\.d, p0, \[x0, z1\.d, sxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1w_scatter_ext_s64_s64offset, svint64_t, int32_t, svint64_t, ++ svst1w_scatter_s64offset_s64 (p0, x0, svextw_s64_x (p0, z1), z0), ++ svst1w_scatter_offset (p0, x0, svextw_x (p0, z1), z0)) ++ ++/* ++** st1w_scatter_x0_s64_u64offset: ++** st1w z0\.d, p0, \[x0, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1w_scatter_x0_s64_u64offset, svint64_t, int32_t, svuint64_t, ++ svst1w_scatter_u64offset_s64 (p0, x0, z1, z0), ++ svst1w_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1w_scatter_s64_u64offset: ++** st1w z0\.d, p0, \[x0, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1w_scatter_s64_u64offset, svint64_t, int32_t, svuint64_t, ++ svst1w_scatter_u64offset_s64 (p0, x0, z1, z0), ++ svst1w_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1w_scatter_ext_s64_u64offset: ++** st1w z0\.d, p0, \[x0, z1\.d, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1w_scatter_ext_s64_u64offset, svint64_t, int32_t, svuint64_t, ++ svst1w_scatter_u64offset_s64 (p0, x0, svextw_u64_x (p0, z1), z0), ++ svst1w_scatter_offset (p0, x0, svextw_x (p0, z1), z0)) ++ ++/* ++** st1w_scatter_x0_s64_s64index: ++** st1w z0\.d, p0, \[x0, z1\.d, lsl 2\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1w_scatter_x0_s64_s64index, svint64_t, int32_t, svint64_t, ++ svst1w_scatter_s64index_s64 (p0, x0, z1, z0), ++ svst1w_scatter_index (p0, x0, z1, z0)) ++ ++/* ++** st1w_scatter_s64_s64index: ++** st1w z0\.d, p0, \[x0, z1\.d, lsl 2\] ++** ret ++*/ 
++TEST_STORE_SCATTER_SZ (st1w_scatter_s64_s64index, svint64_t, int32_t, svint64_t, ++ svst1w_scatter_s64index_s64 (p0, x0, z1, z0), ++ svst1w_scatter_index (p0, x0, z1, z0)) ++ ++/* ++** st1w_scatter_ext_s64_s64index: ++** st1w z0\.d, p0, \[x0, z1\.d, sxtw 2\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1w_scatter_ext_s64_s64index, svint64_t, int32_t, svint64_t, ++ svst1w_scatter_s64index_s64 (p0, x0, svextw_s64_x (p0, z1), z0), ++ svst1w_scatter_index (p0, x0, svextw_x (p0, z1), z0)) ++ ++/* ++** st1w_scatter_x0_s64_u64index: ++** st1w z0\.d, p0, \[x0, z1\.d, lsl 2\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1w_scatter_x0_s64_u64index, svint64_t, int32_t, svuint64_t, ++ svst1w_scatter_u64index_s64 (p0, x0, z1, z0), ++ svst1w_scatter_index (p0, x0, z1, z0)) ++ ++/* ++** st1w_scatter_s64_u64index: ++** st1w z0\.d, p0, \[x0, z1\.d, lsl 2\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1w_scatter_s64_u64index, svint64_t, int32_t, svuint64_t, ++ svst1w_scatter_u64index_s64 (p0, x0, z1, z0), ++ svst1w_scatter_index (p0, x0, z1, z0)) ++ ++/* ++** st1w_scatter_ext_s64_u64index: ++** st1w z0\.d, p0, \[x0, z1\.d, uxtw 2\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1w_scatter_ext_s64_u64index, svint64_t, int32_t, svuint64_t, ++ svst1w_scatter_u64index_s64 (p0, x0, svextw_u64_x (p0, z1), z0), ++ svst1w_scatter_index (p0, x0, svextw_x (p0, z1), z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1w_scatter_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1w_scatter_u64.c +new file mode 100644 +index 000000000..767c009b4 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1w_scatter_u64.c +@@ -0,0 +1,263 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st1w_scatter_u64: ++** st1w z0\.d, p0, \[z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1w_scatter_u64, svuint64_t, svuint64_t, ++ svst1w_scatter_u64base_u64 (p0, z1, z0), ++ svst1w_scatter (p0, z1, z0)) ++ ++/* ++** st1w_scatter_x0_u64_offset: ++** st1w z0\.d, p0, \[x0, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1w_scatter_x0_u64_offset, svuint64_t, svuint64_t, ++ svst1w_scatter_u64base_offset_u64 (p0, z1, x0, z0), ++ svst1w_scatter_offset (p0, z1, x0, z0)) ++ ++/* ++** st1w_scatter_m4_u64_offset: ++** mov (x[0-9]+), #?-4 ++** st1w z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1w_scatter_m4_u64_offset, svuint64_t, svuint64_t, ++ svst1w_scatter_u64base_offset_u64 (p0, z1, -4, z0), ++ svst1w_scatter_offset (p0, z1, -4, z0)) ++ ++/* ++** st1w_scatter_0_u64_offset: ++** st1w z0\.d, p0, \[z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1w_scatter_0_u64_offset, svuint64_t, svuint64_t, ++ svst1w_scatter_u64base_offset_u64 (p0, z1, 0, z0), ++ svst1w_scatter_offset (p0, z1, 0, z0)) ++ ++/* ++** st1w_scatter_5_u64_offset: ++** mov (x[0-9]+), #?5 ++** st1w z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1w_scatter_5_u64_offset, svuint64_t, svuint64_t, ++ svst1w_scatter_u64base_offset_u64 (p0, z1, 5, z0), ++ svst1w_scatter_offset (p0, z1, 5, z0)) ++ ++/* ++** st1w_scatter_6_u64_offset: ++** mov (x[0-9]+), #?6 ++** st1w z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1w_scatter_6_u64_offset, svuint64_t, svuint64_t, ++ svst1w_scatter_u64base_offset_u64 (p0, z1, 6, z0), ++ svst1w_scatter_offset (p0, z1, 6, z0)) ++ ++/* ++** st1w_scatter_7_u64_offset: ++** mov (x[0-9]+), #?7 ++** st1w z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1w_scatter_7_u64_offset, svuint64_t, 
svuint64_t, ++ svst1w_scatter_u64base_offset_u64 (p0, z1, 7, z0), ++ svst1w_scatter_offset (p0, z1, 7, z0)) ++ ++/* ++** st1w_scatter_8_u64_offset: ++** st1w z0\.d, p0, \[z1\.d, #8\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1w_scatter_8_u64_offset, svuint64_t, svuint64_t, ++ svst1w_scatter_u64base_offset_u64 (p0, z1, 8, z0), ++ svst1w_scatter_offset (p0, z1, 8, z0)) ++ ++/* ++** st1w_scatter_124_u64_offset: ++** st1w z0\.d, p0, \[z1\.d, #124\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1w_scatter_124_u64_offset, svuint64_t, svuint64_t, ++ svst1w_scatter_u64base_offset_u64 (p0, z1, 124, z0), ++ svst1w_scatter_offset (p0, z1, 124, z0)) ++ ++/* ++** st1w_scatter_128_u64_offset: ++** mov (x[0-9]+), #?128 ++** st1w z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1w_scatter_128_u64_offset, svuint64_t, svuint64_t, ++ svst1w_scatter_u64base_offset_u64 (p0, z1, 128, z0), ++ svst1w_scatter_offset (p0, z1, 128, z0)) ++ ++/* ++** st1w_scatter_x0_u64_index: ++** lsl (x[0-9]+), x0, #?2 ++** st1w z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1w_scatter_x0_u64_index, svuint64_t, svuint64_t, ++ svst1w_scatter_u64base_index_u64 (p0, z1, x0, z0), ++ svst1w_scatter_index (p0, z1, x0, z0)) ++ ++/* ++** st1w_scatter_m1_u64_index: ++** mov (x[0-9]+), #?-4 ++** st1w z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1w_scatter_m1_u64_index, svuint64_t, svuint64_t, ++ svst1w_scatter_u64base_index_u64 (p0, z1, -1, z0), ++ svst1w_scatter_index (p0, z1, -1, z0)) ++ ++/* ++** st1w_scatter_0_u64_index: ++** st1w z0\.d, p0, \[z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1w_scatter_0_u64_index, svuint64_t, svuint64_t, ++ svst1w_scatter_u64base_index_u64 (p0, z1, 0, z0), ++ svst1w_scatter_index (p0, z1, 0, z0)) ++ ++/* ++** st1w_scatter_5_u64_index: ++** st1w z0\.d, p0, \[z1\.d, #20\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1w_scatter_5_u64_index, svuint64_t, svuint64_t, ++ svst1w_scatter_u64base_index_u64 (p0, z1, 5, z0), ++ svst1w_scatter_index (p0, z1, 5, z0)) ++ ++/* ++** st1w_scatter_31_u64_index: ++** st1w z0\.d, p0, \[z1\.d, #124\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1w_scatter_31_u64_index, svuint64_t, svuint64_t, ++ svst1w_scatter_u64base_index_u64 (p0, z1, 31, z0), ++ svst1w_scatter_index (p0, z1, 31, z0)) ++ ++/* ++** st1w_scatter_32_u64_index: ++** mov (x[0-9]+), #?128 ++** st1w z0\.d, p0, \[\1, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_ZS (st1w_scatter_32_u64_index, svuint64_t, svuint64_t, ++ svst1w_scatter_u64base_index_u64 (p0, z1, 32, z0), ++ svst1w_scatter_index (p0, z1, 32, z0)) ++ ++/* ++** st1w_scatter_x0_u64_s64offset: ++** st1w z0\.d, p0, \[x0, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1w_scatter_x0_u64_s64offset, svuint64_t, uint32_t, svint64_t, ++ svst1w_scatter_s64offset_u64 (p0, x0, z1, z0), ++ svst1w_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1w_scatter_u64_s64offset: ++** st1w z0\.d, p0, \[x0, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1w_scatter_u64_s64offset, svuint64_t, uint32_t, svint64_t, ++ svst1w_scatter_s64offset_u64 (p0, x0, z1, z0), ++ svst1w_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1w_scatter_ext_u64_s64offset: ++** st1w z0\.d, p0, \[x0, z1\.d, sxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1w_scatter_ext_u64_s64offset, svuint64_t, uint32_t, svint64_t, ++ svst1w_scatter_s64offset_u64 (p0, x0, svextw_s64_x (p0, z1), z0), ++ svst1w_scatter_offset (p0, x0, svextw_x (p0, z1), z0)) ++ ++/* ++** st1w_scatter_x0_u64_u64offset: ++** st1w z0\.d, p0, \[x0, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ 
(st1w_scatter_x0_u64_u64offset, svuint64_t, uint32_t, svuint64_t, ++ svst1w_scatter_u64offset_u64 (p0, x0, z1, z0), ++ svst1w_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1w_scatter_u64_u64offset: ++** st1w z0\.d, p0, \[x0, z1\.d\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1w_scatter_u64_u64offset, svuint64_t, uint32_t, svuint64_t, ++ svst1w_scatter_u64offset_u64 (p0, x0, z1, z0), ++ svst1w_scatter_offset (p0, x0, z1, z0)) ++ ++/* ++** st1w_scatter_ext_u64_u64offset: ++** st1w z0\.d, p0, \[x0, z1\.d, uxtw\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1w_scatter_ext_u64_u64offset, svuint64_t, uint32_t, svuint64_t, ++ svst1w_scatter_u64offset_u64 (p0, x0, svextw_u64_x (p0, z1), z0), ++ svst1w_scatter_offset (p0, x0, svextw_x (p0, z1), z0)) ++ ++/* ++** st1w_scatter_x0_u64_s64index: ++** st1w z0\.d, p0, \[x0, z1\.d, lsl 2\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1w_scatter_x0_u64_s64index, svuint64_t, uint32_t, svint64_t, ++ svst1w_scatter_s64index_u64 (p0, x0, z1, z0), ++ svst1w_scatter_index (p0, x0, z1, z0)) ++ ++/* ++** st1w_scatter_u64_s64index: ++** st1w z0\.d, p0, \[x0, z1\.d, lsl 2\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1w_scatter_u64_s64index, svuint64_t, uint32_t, svint64_t, ++ svst1w_scatter_s64index_u64 (p0, x0, z1, z0), ++ svst1w_scatter_index (p0, x0, z1, z0)) ++ ++/* ++** st1w_scatter_ext_u64_s64index: ++** st1w z0\.d, p0, \[x0, z1\.d, sxtw 2\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1w_scatter_ext_u64_s64index, svuint64_t, uint32_t, svint64_t, ++ svst1w_scatter_s64index_u64 (p0, x0, svextw_s64_x (p0, z1), z0), ++ svst1w_scatter_index (p0, x0, svextw_x (p0, z1), z0)) ++ ++/* ++** st1w_scatter_x0_u64_u64index: ++** st1w z0\.d, p0, \[x0, z1\.d, lsl 2\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1w_scatter_x0_u64_u64index, svuint64_t, uint32_t, svuint64_t, ++ svst1w_scatter_u64index_u64 (p0, x0, z1, z0), ++ svst1w_scatter_index (p0, x0, z1, z0)) ++ ++/* ++** st1w_scatter_u64_u64index: ++** st1w z0\.d, p0, \[x0, z1\.d, lsl 2\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1w_scatter_u64_u64index, svuint64_t, uint32_t, svuint64_t, ++ svst1w_scatter_u64index_u64 (p0, x0, z1, z0), ++ svst1w_scatter_index (p0, x0, z1, z0)) ++ ++/* ++** st1w_scatter_ext_u64_u64index: ++** st1w z0\.d, p0, \[x0, z1\.d, uxtw 2\] ++** ret ++*/ ++TEST_STORE_SCATTER_SZ (st1w_scatter_ext_u64_u64index, svuint64_t, uint32_t, svuint64_t, ++ svst1w_scatter_u64index_u64 (p0, x0, svextw_u64_x (p0, z1), z0), ++ svst1w_scatter_index (p0, x0, svextw_x (p0, z1), z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1w_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1w_u64.c +new file mode 100644 +index 000000000..882abebbb +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1w_u64.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st1w_u64_base: ++** st1w z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1w_u64_base, svuint64_t, uint32_t, ++ svst1w_u64 (p0, x0, z0), ++ svst1w (p0, x0, z0)) ++ ++/* ++** st1w_u64_index: ++** st1w z0\.d, p0, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_STORE (st1w_u64_index, svuint64_t, uint32_t, ++ svst1w_u64 (p0, x0 + x1, z0), ++ svst1w (p0, x0 + x1, z0)) ++ ++/* ++** st1w_u64_1: ++** st1w z0\.d, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1w_u64_1, svuint64_t, uint32_t, ++ svst1w_u64 (p0, x0 + svcntd (), z0), ++ svst1w (p0, x0 + svcntd (), z0)) ++ ++/* ++** st1w_u64_7: ++** st1w z0\.d, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (st1w_u64_7, svuint64_t, uint32_t, ++ svst1w_u64 (p0, x0 + svcntd () * 7, z0), ++ svst1w (p0, x0 + svcntd () * 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1w_u64_8: ++** incb x0, all, mul #4 ++** st1w z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1w_u64_8, svuint64_t, uint32_t, ++ svst1w_u64 (p0, x0 + svcntd () * 8, z0), ++ svst1w (p0, x0 + svcntd () * 8, z0)) ++ ++/* ++** st1w_u64_m1: ++** st1w z0\.d, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1w_u64_m1, svuint64_t, uint32_t, ++ svst1w_u64 (p0, x0 - svcntd (), z0), ++ svst1w (p0, x0 - svcntd (), z0)) ++ ++/* ++** st1w_u64_m8: ++** st1w z0\.d, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (st1w_u64_m8, svuint64_t, uint32_t, ++ svst1w_u64 (p0, x0 - svcntd () * 8, z0), ++ svst1w (p0, x0 - svcntd () * 8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1w_u64_m9: ++** dech x0, all, mul #9 ++** st1w z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1w_u64_m9, svuint64_t, uint32_t, ++ svst1w_u64 (p0, x0 - svcntd () * 9, z0), ++ svst1w (p0, x0 - svcntd () * 9, z0)) ++ ++/* ++** st1w_vnum_u64_0: ++** st1w z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1w_vnum_u64_0, svuint64_t, uint32_t, ++ svst1w_vnum_u64 (p0, x0, 0, z0), ++ svst1w_vnum (p0, x0, 0, z0)) ++ ++/* ++** st1w_vnum_u64_1: ++** st1w z0\.d, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1w_vnum_u64_1, svuint64_t, uint32_t, ++ svst1w_vnum_u64 (p0, x0, 1, z0), ++ svst1w_vnum (p0, x0, 1, z0)) ++ ++/* ++** st1w_vnum_u64_7: ++** st1w z0\.d, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (st1w_vnum_u64_7, svuint64_t, uint32_t, ++ svst1w_vnum_u64 (p0, x0, 7, z0), ++ svst1w_vnum (p0, x0, 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1w_vnum_u64_8: ++** incb x0, all, mul #4 ++** st1w z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1w_vnum_u64_8, svuint64_t, uint32_t, ++ svst1w_vnum_u64 (p0, x0, 8, z0), ++ svst1w_vnum (p0, x0, 8, z0)) ++ ++/* ++** st1w_vnum_u64_m1: ++** st1w z0\.d, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (st1w_vnum_u64_m1, svuint64_t, uint32_t, ++ svst1w_vnum_u64 (p0, x0, -1, z0), ++ svst1w_vnum (p0, x0, -1, z0)) ++ ++/* ++** st1w_vnum_u64_m8: ++** st1w z0\.d, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (st1w_vnum_u64_m8, svuint64_t, uint32_t, ++ svst1w_vnum_u64 (p0, x0, -8, z0), ++ svst1w_vnum (p0, x0, -8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st1w_vnum_u64_m9: ++** dech x0, all, mul #9 ++** st1w z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st1w_vnum_u64_m9, svuint64_t, uint32_t, ++ svst1w_vnum_u64 (p0, x0, -9, z0), ++ svst1w_vnum (p0, x0, -9, z0)) ++ ++/* Using MUL to calculate an index would also be OK. 
*/ ++/* ++** st1w_vnum_u64_x1: ++** cnth (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** st1w z0\.d, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (st1w_vnum_u64_x1, svuint64_t, uint32_t, ++ svst1w_vnum_u64 (p0, x0, x1, z0), ++ svst1w_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_bf16.c +new file mode 100644 +index 000000000..a4a57af08 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_bf16.c +@@ -0,0 +1,200 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st2_bf16_base: ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_bf16_base, svbfloat16x2_t, bfloat16_t, ++ svst2_bf16 (p0, x0, z0), ++ svst2 (p0, x0, z0)) ++ ++/* ++** st2_bf16_index: ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_STORE (st2_bf16_index, svbfloat16x2_t, bfloat16_t, ++ svst2_bf16 (p0, x0 + x1, z0), ++ svst2 (p0, x0 + x1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_bf16_1: ++** incb x0 ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_bf16_1, svbfloat16x2_t, bfloat16_t, ++ svst2_bf16 (p0, x0 + svcnth (), z0), ++ svst2 (p0, x0 + svcnth (), z0)) ++ ++/* ++** st2_bf16_2: ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #2, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_bf16_2, svbfloat16x2_t, bfloat16_t, ++ svst2_bf16 (p0, x0 + svcnth () * 2, z0), ++ svst2 (p0, x0 + svcnth () * 2, z0)) ++ ++/* ++** st2_bf16_14: ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #14, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_bf16_14, svbfloat16x2_t, bfloat16_t, ++ svst2_bf16 (p0, x0 + svcnth () * 14, z0), ++ svst2 (p0, x0 + svcnth () * 14, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_bf16_16: ++** incb x0, all, mul #16 ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_bf16_16, svbfloat16x2_t, bfloat16_t, ++ svst2_bf16 (p0, x0 + svcnth () * 16, z0), ++ svst2 (p0, x0 + svcnth () * 16, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_bf16_m1: ++** decb x0 ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_bf16_m1, svbfloat16x2_t, bfloat16_t, ++ svst2_bf16 (p0, x0 - svcnth (), z0), ++ svst2 (p0, x0 - svcnth (), z0)) ++ ++/* ++** st2_bf16_m2: ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #-2, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_bf16_m2, svbfloat16x2_t, bfloat16_t, ++ svst2_bf16 (p0, x0 - svcnth () * 2, z0), ++ svst2 (p0, x0 - svcnth () * 2, z0)) ++ ++/* ++** st2_bf16_m16: ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #-16, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_bf16_m16, svbfloat16x2_t, bfloat16_t, ++ svst2_bf16 (p0, x0 - svcnth () * 16, z0), ++ svst2 (p0, x0 - svcnth () * 16, z0)) ++ ++/* ++** st2_bf16_m18: ++** addvl (x[0-9]+), x0, #-18 ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st2_bf16_m18, svbfloat16x2_t, bfloat16_t, ++ svst2_bf16 (p0, x0 - svcnth () * 18, z0), ++ svst2 (p0, x0 - svcnth () * 18, z0)) ++ ++/* ++** st2_vnum_bf16_0: ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_vnum_bf16_0, svbfloat16x2_t, bfloat16_t, ++ svst2_vnum_bf16 (p0, x0, 0, z0), ++ svst2_vnum (p0, x0, 0, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st2_vnum_bf16_1: ++** incb x0 ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_vnum_bf16_1, svbfloat16x2_t, bfloat16_t, ++ svst2_vnum_bf16 (p0, x0, 1, z0), ++ svst2_vnum (p0, x0, 1, z0)) ++ ++/* ++** st2_vnum_bf16_2: ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #2, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_vnum_bf16_2, svbfloat16x2_t, bfloat16_t, ++ svst2_vnum_bf16 (p0, x0, 2, z0), ++ svst2_vnum (p0, x0, 2, z0)) ++ ++/* ++** st2_vnum_bf16_14: ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #14, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_vnum_bf16_14, svbfloat16x2_t, bfloat16_t, ++ svst2_vnum_bf16 (p0, x0, 14, z0), ++ svst2_vnum (p0, x0, 14, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_vnum_bf16_16: ++** incb x0, all, mul #16 ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_vnum_bf16_16, svbfloat16x2_t, bfloat16_t, ++ svst2_vnum_bf16 (p0, x0, 16, z0), ++ svst2_vnum (p0, x0, 16, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_vnum_bf16_m1: ++** decb x0 ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_vnum_bf16_m1, svbfloat16x2_t, bfloat16_t, ++ svst2_vnum_bf16 (p0, x0, -1, z0), ++ svst2_vnum (p0, x0, -1, z0)) ++ ++/* ++** st2_vnum_bf16_m2: ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #-2, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_vnum_bf16_m2, svbfloat16x2_t, bfloat16_t, ++ svst2_vnum_bf16 (p0, x0, -2, z0), ++ svst2_vnum (p0, x0, -2, z0)) ++ ++/* ++** st2_vnum_bf16_m16: ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #-16, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_vnum_bf16_m16, svbfloat16x2_t, bfloat16_t, ++ svst2_vnum_bf16 (p0, x0, -16, z0), ++ svst2_vnum (p0, x0, -16, z0)) ++ ++/* ++** st2_vnum_bf16_m18: ++** addvl (x[0-9]+), x0, #-18 ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st2_vnum_bf16_m18, svbfloat16x2_t, bfloat16_t, ++ svst2_vnum_bf16 (p0, x0, -18, z0), ++ svst2_vnum (p0, x0, -18, z0)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** st2_vnum_bf16_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (st2_vnum_bf16_x1, svbfloat16x2_t, bfloat16_t, ++ svst2_vnum_bf16 (p0, x0, x1, z0), ++ svst2_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_f16.c +new file mode 100644 +index 000000000..014203be6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_f16.c +@@ -0,0 +1,200 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st2_f16_base: ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_f16_base, svfloat16x2_t, float16_t, ++ svst2_f16 (p0, x0, z0), ++ svst2 (p0, x0, z0)) ++ ++/* ++** st2_f16_index: ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_STORE (st2_f16_index, svfloat16x2_t, float16_t, ++ svst2_f16 (p0, x0 + x1, z0), ++ svst2 (p0, x0 + x1, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st2_f16_1: ++** incb x0 ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_f16_1, svfloat16x2_t, float16_t, ++ svst2_f16 (p0, x0 + svcnth (), z0), ++ svst2 (p0, x0 + svcnth (), z0)) ++ ++/* ++** st2_f16_2: ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #2, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_f16_2, svfloat16x2_t, float16_t, ++ svst2_f16 (p0, x0 + svcnth () * 2, z0), ++ svst2 (p0, x0 + svcnth () * 2, z0)) ++ ++/* ++** st2_f16_14: ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #14, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_f16_14, svfloat16x2_t, float16_t, ++ svst2_f16 (p0, x0 + svcnth () * 14, z0), ++ svst2 (p0, x0 + svcnth () * 14, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_f16_16: ++** incb x0, all, mul #16 ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_f16_16, svfloat16x2_t, float16_t, ++ svst2_f16 (p0, x0 + svcnth () * 16, z0), ++ svst2 (p0, x0 + svcnth () * 16, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_f16_m1: ++** decb x0 ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_f16_m1, svfloat16x2_t, float16_t, ++ svst2_f16 (p0, x0 - svcnth (), z0), ++ svst2 (p0, x0 - svcnth (), z0)) ++ ++/* ++** st2_f16_m2: ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #-2, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_f16_m2, svfloat16x2_t, float16_t, ++ svst2_f16 (p0, x0 - svcnth () * 2, z0), ++ svst2 (p0, x0 - svcnth () * 2, z0)) ++ ++/* ++** st2_f16_m16: ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #-16, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_f16_m16, svfloat16x2_t, float16_t, ++ svst2_f16 (p0, x0 - svcnth () * 16, z0), ++ svst2 (p0, x0 - svcnth () * 16, z0)) ++ ++/* ++** st2_f16_m18: ++** addvl (x[0-9]+), x0, #-18 ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st2_f16_m18, svfloat16x2_t, float16_t, ++ svst2_f16 (p0, x0 - svcnth () * 18, z0), ++ svst2 (p0, x0 - svcnth () * 18, z0)) ++ ++/* ++** st2_vnum_f16_0: ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_vnum_f16_0, svfloat16x2_t, float16_t, ++ svst2_vnum_f16 (p0, x0, 0, z0), ++ svst2_vnum (p0, x0, 0, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_vnum_f16_1: ++** incb x0 ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_vnum_f16_1, svfloat16x2_t, float16_t, ++ svst2_vnum_f16 (p0, x0, 1, z0), ++ svst2_vnum (p0, x0, 1, z0)) ++ ++/* ++** st2_vnum_f16_2: ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #2, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_vnum_f16_2, svfloat16x2_t, float16_t, ++ svst2_vnum_f16 (p0, x0, 2, z0), ++ svst2_vnum (p0, x0, 2, z0)) ++ ++/* ++** st2_vnum_f16_14: ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #14, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_vnum_f16_14, svfloat16x2_t, float16_t, ++ svst2_vnum_f16 (p0, x0, 14, z0), ++ svst2_vnum (p0, x0, 14, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_vnum_f16_16: ++** incb x0, all, mul #16 ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_vnum_f16_16, svfloat16x2_t, float16_t, ++ svst2_vnum_f16 (p0, x0, 16, z0), ++ svst2_vnum (p0, x0, 16, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st2_vnum_f16_m1: ++** decb x0 ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_vnum_f16_m1, svfloat16x2_t, float16_t, ++ svst2_vnum_f16 (p0, x0, -1, z0), ++ svst2_vnum (p0, x0, -1, z0)) ++ ++/* ++** st2_vnum_f16_m2: ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #-2, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_vnum_f16_m2, svfloat16x2_t, float16_t, ++ svst2_vnum_f16 (p0, x0, -2, z0), ++ svst2_vnum (p0, x0, -2, z0)) ++ ++/* ++** st2_vnum_f16_m16: ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #-16, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_vnum_f16_m16, svfloat16x2_t, float16_t, ++ svst2_vnum_f16 (p0, x0, -16, z0), ++ svst2_vnum (p0, x0, -16, z0)) ++ ++/* ++** st2_vnum_f16_m18: ++** addvl (x[0-9]+), x0, #-18 ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st2_vnum_f16_m18, svfloat16x2_t, float16_t, ++ svst2_vnum_f16 (p0, x0, -18, z0), ++ svst2_vnum (p0, x0, -18, z0)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** st2_vnum_f16_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (st2_vnum_f16_x1, svfloat16x2_t, float16_t, ++ svst2_vnum_f16 (p0, x0, x1, z0), ++ svst2_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_f32.c +new file mode 100644 +index 000000000..ba271882e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_f32.c +@@ -0,0 +1,200 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st2_f32_base: ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_f32_base, svfloat32x2_t, float32_t, ++ svst2_f32 (p0, x0, z0), ++ svst2 (p0, x0, z0)) ++ ++/* ++** st2_f32_index: ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_STORE (st2_f32_index, svfloat32x2_t, float32_t, ++ svst2_f32 (p0, x0 + x1, z0), ++ svst2 (p0, x0 + x1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_f32_1: ++** incb x0 ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_f32_1, svfloat32x2_t, float32_t, ++ svst2_f32 (p0, x0 + svcntw (), z0), ++ svst2 (p0, x0 + svcntw (), z0)) ++ ++/* ++** st2_f32_2: ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, #2, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_f32_2, svfloat32x2_t, float32_t, ++ svst2_f32 (p0, x0 + svcntw () * 2, z0), ++ svst2 (p0, x0 + svcntw () * 2, z0)) ++ ++/* ++** st2_f32_14: ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, #14, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_f32_14, svfloat32x2_t, float32_t, ++ svst2_f32 (p0, x0 + svcntw () * 14, z0), ++ svst2 (p0, x0 + svcntw () * 14, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_f32_16: ++** incb x0, all, mul #16 ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_f32_16, svfloat32x2_t, float32_t, ++ svst2_f32 (p0, x0 + svcntw () * 16, z0), ++ svst2 (p0, x0 + svcntw () * 16, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st2_f32_m1: ++** decb x0 ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_f32_m1, svfloat32x2_t, float32_t, ++ svst2_f32 (p0, x0 - svcntw (), z0), ++ svst2 (p0, x0 - svcntw (), z0)) ++ ++/* ++** st2_f32_m2: ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, #-2, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_f32_m2, svfloat32x2_t, float32_t, ++ svst2_f32 (p0, x0 - svcntw () * 2, z0), ++ svst2 (p0, x0 - svcntw () * 2, z0)) ++ ++/* ++** st2_f32_m16: ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, #-16, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_f32_m16, svfloat32x2_t, float32_t, ++ svst2_f32 (p0, x0 - svcntw () * 16, z0), ++ svst2 (p0, x0 - svcntw () * 16, z0)) ++ ++/* ++** st2_f32_m18: ++** addvl (x[0-9]+), x0, #-18 ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st2_f32_m18, svfloat32x2_t, float32_t, ++ svst2_f32 (p0, x0 - svcntw () * 18, z0), ++ svst2 (p0, x0 - svcntw () * 18, z0)) ++ ++/* ++** st2_vnum_f32_0: ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_vnum_f32_0, svfloat32x2_t, float32_t, ++ svst2_vnum_f32 (p0, x0, 0, z0), ++ svst2_vnum (p0, x0, 0, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_vnum_f32_1: ++** incb x0 ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_vnum_f32_1, svfloat32x2_t, float32_t, ++ svst2_vnum_f32 (p0, x0, 1, z0), ++ svst2_vnum (p0, x0, 1, z0)) ++ ++/* ++** st2_vnum_f32_2: ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, #2, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_vnum_f32_2, svfloat32x2_t, float32_t, ++ svst2_vnum_f32 (p0, x0, 2, z0), ++ svst2_vnum (p0, x0, 2, z0)) ++ ++/* ++** st2_vnum_f32_14: ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, #14, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_vnum_f32_14, svfloat32x2_t, float32_t, ++ svst2_vnum_f32 (p0, x0, 14, z0), ++ svst2_vnum (p0, x0, 14, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_vnum_f32_16: ++** incb x0, all, mul #16 ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_vnum_f32_16, svfloat32x2_t, float32_t, ++ svst2_vnum_f32 (p0, x0, 16, z0), ++ svst2_vnum (p0, x0, 16, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_vnum_f32_m1: ++** decb x0 ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_vnum_f32_m1, svfloat32x2_t, float32_t, ++ svst2_vnum_f32 (p0, x0, -1, z0), ++ svst2_vnum (p0, x0, -1, z0)) ++ ++/* ++** st2_vnum_f32_m2: ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, #-2, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_vnum_f32_m2, svfloat32x2_t, float32_t, ++ svst2_vnum_f32 (p0, x0, -2, z0), ++ svst2_vnum (p0, x0, -2, z0)) ++ ++/* ++** st2_vnum_f32_m16: ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, #-16, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_vnum_f32_m16, svfloat32x2_t, float32_t, ++ svst2_vnum_f32 (p0, x0, -16, z0), ++ svst2_vnum (p0, x0, -16, z0)) ++ ++/* ++** st2_vnum_f32_m18: ++** addvl (x[0-9]+), x0, #-18 ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st2_vnum_f32_m18, svfloat32x2_t, float32_t, ++ svst2_vnum_f32 (p0, x0, -18, z0), ++ svst2_vnum (p0, x0, -18, z0)) ++ ++/* Using MUL to calculate an index would also be OK. 
*/ ++/* ++** st2_vnum_f32_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (st2_vnum_f32_x1, svfloat32x2_t, float32_t, ++ svst2_vnum_f32 (p0, x0, x1, z0), ++ svst2_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_f64.c +new file mode 100644 +index 000000000..c499ba0fe +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_f64.c +@@ -0,0 +1,200 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st2_f64_base: ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_f64_base, svfloat64x2_t, float64_t, ++ svst2_f64 (p0, x0, z0), ++ svst2 (p0, x0, z0)) ++ ++/* ++** st2_f64_index: ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, x1, lsl 3\] ++** ret ++*/ ++TEST_STORE (st2_f64_index, svfloat64x2_t, float64_t, ++ svst2_f64 (p0, x0 + x1, z0), ++ svst2 (p0, x0 + x1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_f64_1: ++** incb x0 ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_f64_1, svfloat64x2_t, float64_t, ++ svst2_f64 (p0, x0 + svcntd (), z0), ++ svst2 (p0, x0 + svcntd (), z0)) ++ ++/* ++** st2_f64_2: ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, #2, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_f64_2, svfloat64x2_t, float64_t, ++ svst2_f64 (p0, x0 + svcntd () * 2, z0), ++ svst2 (p0, x0 + svcntd () * 2, z0)) ++ ++/* ++** st2_f64_14: ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, #14, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_f64_14, svfloat64x2_t, float64_t, ++ svst2_f64 (p0, x0 + svcntd () * 14, z0), ++ svst2 (p0, x0 + svcntd () * 14, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_f64_16: ++** incb x0, all, mul #16 ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_f64_16, svfloat64x2_t, float64_t, ++ svst2_f64 (p0, x0 + svcntd () * 16, z0), ++ svst2 (p0, x0 + svcntd () * 16, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_f64_m1: ++** decb x0 ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_f64_m1, svfloat64x2_t, float64_t, ++ svst2_f64 (p0, x0 - svcntd (), z0), ++ svst2 (p0, x0 - svcntd (), z0)) ++ ++/* ++** st2_f64_m2: ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, #-2, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_f64_m2, svfloat64x2_t, float64_t, ++ svst2_f64 (p0, x0 - svcntd () * 2, z0), ++ svst2 (p0, x0 - svcntd () * 2, z0)) ++ ++/* ++** st2_f64_m16: ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, #-16, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_f64_m16, svfloat64x2_t, float64_t, ++ svst2_f64 (p0, x0 - svcntd () * 16, z0), ++ svst2 (p0, x0 - svcntd () * 16, z0)) ++ ++/* ++** st2_f64_m18: ++** addvl (x[0-9]+), x0, #-18 ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st2_f64_m18, svfloat64x2_t, float64_t, ++ svst2_f64 (p0, x0 - svcntd () * 18, z0), ++ svst2 (p0, x0 - svcntd () * 18, z0)) ++ ++/* ++** st2_vnum_f64_0: ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_vnum_f64_0, svfloat64x2_t, float64_t, ++ svst2_vnum_f64 (p0, x0, 0, z0), ++ svst2_vnum (p0, x0, 0, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st2_vnum_f64_1: ++** incb x0 ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_vnum_f64_1, svfloat64x2_t, float64_t, ++ svst2_vnum_f64 (p0, x0, 1, z0), ++ svst2_vnum (p0, x0, 1, z0)) ++ ++/* ++** st2_vnum_f64_2: ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, #2, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_vnum_f64_2, svfloat64x2_t, float64_t, ++ svst2_vnum_f64 (p0, x0, 2, z0), ++ svst2_vnum (p0, x0, 2, z0)) ++ ++/* ++** st2_vnum_f64_14: ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, #14, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_vnum_f64_14, svfloat64x2_t, float64_t, ++ svst2_vnum_f64 (p0, x0, 14, z0), ++ svst2_vnum (p0, x0, 14, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_vnum_f64_16: ++** incb x0, all, mul #16 ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_vnum_f64_16, svfloat64x2_t, float64_t, ++ svst2_vnum_f64 (p0, x0, 16, z0), ++ svst2_vnum (p0, x0, 16, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_vnum_f64_m1: ++** decb x0 ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_vnum_f64_m1, svfloat64x2_t, float64_t, ++ svst2_vnum_f64 (p0, x0, -1, z0), ++ svst2_vnum (p0, x0, -1, z0)) ++ ++/* ++** st2_vnum_f64_m2: ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, #-2, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_vnum_f64_m2, svfloat64x2_t, float64_t, ++ svst2_vnum_f64 (p0, x0, -2, z0), ++ svst2_vnum (p0, x0, -2, z0)) ++ ++/* ++** st2_vnum_f64_m16: ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, #-16, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_vnum_f64_m16, svfloat64x2_t, float64_t, ++ svst2_vnum_f64 (p0, x0, -16, z0), ++ svst2_vnum (p0, x0, -16, z0)) ++ ++/* ++** st2_vnum_f64_m18: ++** addvl (x[0-9]+), x0, #-18 ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st2_vnum_f64_m18, svfloat64x2_t, float64_t, ++ svst2_vnum_f64 (p0, x0, -18, z0), ++ svst2_vnum (p0, x0, -18, z0)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** st2_vnum_f64_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (st2_vnum_f64_x1, svfloat64x2_t, float64_t, ++ svst2_vnum_f64 (p0, x0, x1, z0), ++ svst2_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_s16.c +new file mode 100644 +index 000000000..860b45eac +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_s16.c +@@ -0,0 +1,200 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st2_s16_base: ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_s16_base, svint16x2_t, int16_t, ++ svst2_s16 (p0, x0, z0), ++ svst2 (p0, x0, z0)) ++ ++/* ++** st2_s16_index: ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_STORE (st2_s16_index, svint16x2_t, int16_t, ++ svst2_s16 (p0, x0 + x1, z0), ++ svst2 (p0, x0 + x1, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st2_s16_1: ++** incb x0 ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_s16_1, svint16x2_t, int16_t, ++ svst2_s16 (p0, x0 + svcnth (), z0), ++ svst2 (p0, x0 + svcnth (), z0)) ++ ++/* ++** st2_s16_2: ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #2, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_s16_2, svint16x2_t, int16_t, ++ svst2_s16 (p0, x0 + svcnth () * 2, z0), ++ svst2 (p0, x0 + svcnth () * 2, z0)) ++ ++/* ++** st2_s16_14: ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #14, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_s16_14, svint16x2_t, int16_t, ++ svst2_s16 (p0, x0 + svcnth () * 14, z0), ++ svst2 (p0, x0 + svcnth () * 14, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_s16_16: ++** incb x0, all, mul #16 ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_s16_16, svint16x2_t, int16_t, ++ svst2_s16 (p0, x0 + svcnth () * 16, z0), ++ svst2 (p0, x0 + svcnth () * 16, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_s16_m1: ++** decb x0 ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_s16_m1, svint16x2_t, int16_t, ++ svst2_s16 (p0, x0 - svcnth (), z0), ++ svst2 (p0, x0 - svcnth (), z0)) ++ ++/* ++** st2_s16_m2: ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #-2, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_s16_m2, svint16x2_t, int16_t, ++ svst2_s16 (p0, x0 - svcnth () * 2, z0), ++ svst2 (p0, x0 - svcnth () * 2, z0)) ++ ++/* ++** st2_s16_m16: ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #-16, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_s16_m16, svint16x2_t, int16_t, ++ svst2_s16 (p0, x0 - svcnth () * 16, z0), ++ svst2 (p0, x0 - svcnth () * 16, z0)) ++ ++/* ++** st2_s16_m18: ++** addvl (x[0-9]+), x0, #-18 ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st2_s16_m18, svint16x2_t, int16_t, ++ svst2_s16 (p0, x0 - svcnth () * 18, z0), ++ svst2 (p0, x0 - svcnth () * 18, z0)) ++ ++/* ++** st2_vnum_s16_0: ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_vnum_s16_0, svint16x2_t, int16_t, ++ svst2_vnum_s16 (p0, x0, 0, z0), ++ svst2_vnum (p0, x0, 0, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_vnum_s16_1: ++** incb x0 ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_vnum_s16_1, svint16x2_t, int16_t, ++ svst2_vnum_s16 (p0, x0, 1, z0), ++ svst2_vnum (p0, x0, 1, z0)) ++ ++/* ++** st2_vnum_s16_2: ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #2, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_vnum_s16_2, svint16x2_t, int16_t, ++ svst2_vnum_s16 (p0, x0, 2, z0), ++ svst2_vnum (p0, x0, 2, z0)) ++ ++/* ++** st2_vnum_s16_14: ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #14, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_vnum_s16_14, svint16x2_t, int16_t, ++ svst2_vnum_s16 (p0, x0, 14, z0), ++ svst2_vnum (p0, x0, 14, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_vnum_s16_16: ++** incb x0, all, mul #16 ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_vnum_s16_16, svint16x2_t, int16_t, ++ svst2_vnum_s16 (p0, x0, 16, z0), ++ svst2_vnum (p0, x0, 16, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st2_vnum_s16_m1: ++** decb x0 ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_vnum_s16_m1, svint16x2_t, int16_t, ++ svst2_vnum_s16 (p0, x0, -1, z0), ++ svst2_vnum (p0, x0, -1, z0)) ++ ++/* ++** st2_vnum_s16_m2: ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #-2, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_vnum_s16_m2, svint16x2_t, int16_t, ++ svst2_vnum_s16 (p0, x0, -2, z0), ++ svst2_vnum (p0, x0, -2, z0)) ++ ++/* ++** st2_vnum_s16_m16: ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #-16, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_vnum_s16_m16, svint16x2_t, int16_t, ++ svst2_vnum_s16 (p0, x0, -16, z0), ++ svst2_vnum (p0, x0, -16, z0)) ++ ++/* ++** st2_vnum_s16_m18: ++** addvl (x[0-9]+), x0, #-18 ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st2_vnum_s16_m18, svint16x2_t, int16_t, ++ svst2_vnum_s16 (p0, x0, -18, z0), ++ svst2_vnum (p0, x0, -18, z0)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** st2_vnum_s16_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (st2_vnum_s16_x1, svint16x2_t, int16_t, ++ svst2_vnum_s16 (p0, x0, x1, z0), ++ svst2_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_s32.c +new file mode 100644 +index 000000000..16b674992 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_s32.c +@@ -0,0 +1,200 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st2_s32_base: ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_s32_base, svint32x2_t, int32_t, ++ svst2_s32 (p0, x0, z0), ++ svst2 (p0, x0, z0)) ++ ++/* ++** st2_s32_index: ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_STORE (st2_s32_index, svint32x2_t, int32_t, ++ svst2_s32 (p0, x0 + x1, z0), ++ svst2 (p0, x0 + x1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_s32_1: ++** incb x0 ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_s32_1, svint32x2_t, int32_t, ++ svst2_s32 (p0, x0 + svcntw (), z0), ++ svst2 (p0, x0 + svcntw (), z0)) ++ ++/* ++** st2_s32_2: ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, #2, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_s32_2, svint32x2_t, int32_t, ++ svst2_s32 (p0, x0 + svcntw () * 2, z0), ++ svst2 (p0, x0 + svcntw () * 2, z0)) ++ ++/* ++** st2_s32_14: ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, #14, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_s32_14, svint32x2_t, int32_t, ++ svst2_s32 (p0, x0 + svcntw () * 14, z0), ++ svst2 (p0, x0 + svcntw () * 14, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_s32_16: ++** incb x0, all, mul #16 ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_s32_16, svint32x2_t, int32_t, ++ svst2_s32 (p0, x0 + svcntw () * 16, z0), ++ svst2 (p0, x0 + svcntw () * 16, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st2_s32_m1: ++** decb x0 ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_s32_m1, svint32x2_t, int32_t, ++ svst2_s32 (p0, x0 - svcntw (), z0), ++ svst2 (p0, x0 - svcntw (), z0)) ++ ++/* ++** st2_s32_m2: ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, #-2, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_s32_m2, svint32x2_t, int32_t, ++ svst2_s32 (p0, x0 - svcntw () * 2, z0), ++ svst2 (p0, x0 - svcntw () * 2, z0)) ++ ++/* ++** st2_s32_m16: ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, #-16, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_s32_m16, svint32x2_t, int32_t, ++ svst2_s32 (p0, x0 - svcntw () * 16, z0), ++ svst2 (p0, x0 - svcntw () * 16, z0)) ++ ++/* ++** st2_s32_m18: ++** addvl (x[0-9]+), x0, #-18 ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st2_s32_m18, svint32x2_t, int32_t, ++ svst2_s32 (p0, x0 - svcntw () * 18, z0), ++ svst2 (p0, x0 - svcntw () * 18, z0)) ++ ++/* ++** st2_vnum_s32_0: ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_vnum_s32_0, svint32x2_t, int32_t, ++ svst2_vnum_s32 (p0, x0, 0, z0), ++ svst2_vnum (p0, x0, 0, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_vnum_s32_1: ++** incb x0 ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_vnum_s32_1, svint32x2_t, int32_t, ++ svst2_vnum_s32 (p0, x0, 1, z0), ++ svst2_vnum (p0, x0, 1, z0)) ++ ++/* ++** st2_vnum_s32_2: ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, #2, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_vnum_s32_2, svint32x2_t, int32_t, ++ svst2_vnum_s32 (p0, x0, 2, z0), ++ svst2_vnum (p0, x0, 2, z0)) ++ ++/* ++** st2_vnum_s32_14: ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, #14, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_vnum_s32_14, svint32x2_t, int32_t, ++ svst2_vnum_s32 (p0, x0, 14, z0), ++ svst2_vnum (p0, x0, 14, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_vnum_s32_16: ++** incb x0, all, mul #16 ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_vnum_s32_16, svint32x2_t, int32_t, ++ svst2_vnum_s32 (p0, x0, 16, z0), ++ svst2_vnum (p0, x0, 16, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_vnum_s32_m1: ++** decb x0 ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_vnum_s32_m1, svint32x2_t, int32_t, ++ svst2_vnum_s32 (p0, x0, -1, z0), ++ svst2_vnum (p0, x0, -1, z0)) ++ ++/* ++** st2_vnum_s32_m2: ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, #-2, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_vnum_s32_m2, svint32x2_t, int32_t, ++ svst2_vnum_s32 (p0, x0, -2, z0), ++ svst2_vnum (p0, x0, -2, z0)) ++ ++/* ++** st2_vnum_s32_m16: ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, #-16, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_vnum_s32_m16, svint32x2_t, int32_t, ++ svst2_vnum_s32 (p0, x0, -16, z0), ++ svst2_vnum (p0, x0, -16, z0)) ++ ++/* ++** st2_vnum_s32_m18: ++** addvl (x[0-9]+), x0, #-18 ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st2_vnum_s32_m18, svint32x2_t, int32_t, ++ svst2_vnum_s32 (p0, x0, -18, z0), ++ svst2_vnum (p0, x0, -18, z0)) ++ ++/* Using MUL to calculate an index would also be OK. 
*/ ++/* ++** st2_vnum_s32_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (st2_vnum_s32_x1, svint32x2_t, int32_t, ++ svst2_vnum_s32 (p0, x0, x1, z0), ++ svst2_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_s64.c +new file mode 100644 +index 000000000..1421333cb +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_s64.c +@@ -0,0 +1,200 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st2_s64_base: ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_s64_base, svint64x2_t, int64_t, ++ svst2_s64 (p0, x0, z0), ++ svst2 (p0, x0, z0)) ++ ++/* ++** st2_s64_index: ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, x1, lsl 3\] ++** ret ++*/ ++TEST_STORE (st2_s64_index, svint64x2_t, int64_t, ++ svst2_s64 (p0, x0 + x1, z0), ++ svst2 (p0, x0 + x1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_s64_1: ++** incb x0 ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_s64_1, svint64x2_t, int64_t, ++ svst2_s64 (p0, x0 + svcntd (), z0), ++ svst2 (p0, x0 + svcntd (), z0)) ++ ++/* ++** st2_s64_2: ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, #2, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_s64_2, svint64x2_t, int64_t, ++ svst2_s64 (p0, x0 + svcntd () * 2, z0), ++ svst2 (p0, x0 + svcntd () * 2, z0)) ++ ++/* ++** st2_s64_14: ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, #14, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_s64_14, svint64x2_t, int64_t, ++ svst2_s64 (p0, x0 + svcntd () * 14, z0), ++ svst2 (p0, x0 + svcntd () * 14, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_s64_16: ++** incb x0, all, mul #16 ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_s64_16, svint64x2_t, int64_t, ++ svst2_s64 (p0, x0 + svcntd () * 16, z0), ++ svst2 (p0, x0 + svcntd () * 16, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_s64_m1: ++** decb x0 ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_s64_m1, svint64x2_t, int64_t, ++ svst2_s64 (p0, x0 - svcntd (), z0), ++ svst2 (p0, x0 - svcntd (), z0)) ++ ++/* ++** st2_s64_m2: ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, #-2, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_s64_m2, svint64x2_t, int64_t, ++ svst2_s64 (p0, x0 - svcntd () * 2, z0), ++ svst2 (p0, x0 - svcntd () * 2, z0)) ++ ++/* ++** st2_s64_m16: ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, #-16, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_s64_m16, svint64x2_t, int64_t, ++ svst2_s64 (p0, x0 - svcntd () * 16, z0), ++ svst2 (p0, x0 - svcntd () * 16, z0)) ++ ++/* ++** st2_s64_m18: ++** addvl (x[0-9]+), x0, #-18 ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st2_s64_m18, svint64x2_t, int64_t, ++ svst2_s64 (p0, x0 - svcntd () * 18, z0), ++ svst2 (p0, x0 - svcntd () * 18, z0)) ++ ++/* ++** st2_vnum_s64_0: ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_vnum_s64_0, svint64x2_t, int64_t, ++ svst2_vnum_s64 (p0, x0, 0, z0), ++ svst2_vnum (p0, x0, 0, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st2_vnum_s64_1: ++** incb x0 ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_vnum_s64_1, svint64x2_t, int64_t, ++ svst2_vnum_s64 (p0, x0, 1, z0), ++ svst2_vnum (p0, x0, 1, z0)) ++ ++/* ++** st2_vnum_s64_2: ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, #2, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_vnum_s64_2, svint64x2_t, int64_t, ++ svst2_vnum_s64 (p0, x0, 2, z0), ++ svst2_vnum (p0, x0, 2, z0)) ++ ++/* ++** st2_vnum_s64_14: ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, #14, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_vnum_s64_14, svint64x2_t, int64_t, ++ svst2_vnum_s64 (p0, x0, 14, z0), ++ svst2_vnum (p0, x0, 14, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_vnum_s64_16: ++** incb x0, all, mul #16 ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_vnum_s64_16, svint64x2_t, int64_t, ++ svst2_vnum_s64 (p0, x0, 16, z0), ++ svst2_vnum (p0, x0, 16, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_vnum_s64_m1: ++** decb x0 ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_vnum_s64_m1, svint64x2_t, int64_t, ++ svst2_vnum_s64 (p0, x0, -1, z0), ++ svst2_vnum (p0, x0, -1, z0)) ++ ++/* ++** st2_vnum_s64_m2: ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, #-2, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_vnum_s64_m2, svint64x2_t, int64_t, ++ svst2_vnum_s64 (p0, x0, -2, z0), ++ svst2_vnum (p0, x0, -2, z0)) ++ ++/* ++** st2_vnum_s64_m16: ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, #-16, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_vnum_s64_m16, svint64x2_t, int64_t, ++ svst2_vnum_s64 (p0, x0, -16, z0), ++ svst2_vnum (p0, x0, -16, z0)) ++ ++/* ++** st2_vnum_s64_m18: ++** addvl (x[0-9]+), x0, #-18 ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st2_vnum_s64_m18, svint64x2_t, int64_t, ++ svst2_vnum_s64 (p0, x0, -18, z0), ++ svst2_vnum (p0, x0, -18, z0)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** st2_vnum_s64_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (st2_vnum_s64_x1, svint64x2_t, int64_t, ++ svst2_vnum_s64 (p0, x0, x1, z0), ++ svst2_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_s8.c +new file mode 100644 +index 000000000..f0b7df3c5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_s8.c +@@ -0,0 +1,204 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st2_s8_base: ++** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_s8_base, svint8x2_t, int8_t, ++ svst2_s8 (p0, x0, z0), ++ svst2 (p0, x0, z0)) ++ ++/* ++** st2_s8_index: ++** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0, x1\] ++** ret ++*/ ++TEST_STORE (st2_s8_index, svint8x2_t, int8_t, ++ svst2_s8 (p0, x0 + x1, z0), ++ svst2 (p0, x0 + x1, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st2_s8_1: ++** incb x0 ++** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_s8_1, svint8x2_t, int8_t, ++ svst2_s8 (p0, x0 + svcntb (), z0), ++ svst2 (p0, x0 + svcntb (), z0)) ++ ++/* ++** st2_s8_2: ++** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0, #2, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_s8_2, svint8x2_t, int8_t, ++ svst2_s8 (p0, x0 + svcntb () * 2, z0), ++ svst2 (p0, x0 + svcntb () * 2, z0)) ++ ++/* ++** st2_s8_14: ++** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0, #14, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_s8_14, svint8x2_t, int8_t, ++ svst2_s8 (p0, x0 + svcntb () * 14, z0), ++ svst2 (p0, x0 + svcntb () * 14, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_s8_16: ++** incb x0, all, mul #16 ++** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_s8_16, svint8x2_t, int8_t, ++ svst2_s8 (p0, x0 + svcntb () * 16, z0), ++ svst2 (p0, x0 + svcntb () * 16, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_s8_m1: ++** decb x0 ++** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_s8_m1, svint8x2_t, int8_t, ++ svst2_s8 (p0, x0 - svcntb (), z0), ++ svst2 (p0, x0 - svcntb (), z0)) ++ ++/* ++** st2_s8_m2: ++** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0, #-2, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_s8_m2, svint8x2_t, int8_t, ++ svst2_s8 (p0, x0 - svcntb () * 2, z0), ++ svst2 (p0, x0 - svcntb () * 2, z0)) ++ ++/* ++** st2_s8_m16: ++** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0, #-16, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_s8_m16, svint8x2_t, int8_t, ++ svst2_s8 (p0, x0 - svcntb () * 16, z0), ++ svst2 (p0, x0 - svcntb () * 16, z0)) ++ ++/* ++** st2_s8_m18: ++** addvl (x[0-9]+), x0, #-18 ++** st2b {z0\.b(?: - |, )z1\.b}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st2_s8_m18, svint8x2_t, int8_t, ++ svst2_s8 (p0, x0 - svcntb () * 18, z0), ++ svst2 (p0, x0 - svcntb () * 18, z0)) ++ ++/* ++** st2_vnum_s8_0: ++** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_vnum_s8_0, svint8x2_t, int8_t, ++ svst2_vnum_s8 (p0, x0, 0, z0), ++ svst2_vnum (p0, x0, 0, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_vnum_s8_1: ++** incb x0 ++** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_vnum_s8_1, svint8x2_t, int8_t, ++ svst2_vnum_s8 (p0, x0, 1, z0), ++ svst2_vnum (p0, x0, 1, z0)) ++ ++/* ++** st2_vnum_s8_2: ++** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0, #2, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_vnum_s8_2, svint8x2_t, int8_t, ++ svst2_vnum_s8 (p0, x0, 2, z0), ++ svst2_vnum (p0, x0, 2, z0)) ++ ++/* ++** st2_vnum_s8_14: ++** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0, #14, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_vnum_s8_14, svint8x2_t, int8_t, ++ svst2_vnum_s8 (p0, x0, 14, z0), ++ svst2_vnum (p0, x0, 14, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_vnum_s8_16: ++** incb x0, all, mul #16 ++** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_vnum_s8_16, svint8x2_t, int8_t, ++ svst2_vnum_s8 (p0, x0, 16, z0), ++ svst2_vnum (p0, x0, 16, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st2_vnum_s8_m1: ++** decb x0 ++** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_vnum_s8_m1, svint8x2_t, int8_t, ++ svst2_vnum_s8 (p0, x0, -1, z0), ++ svst2_vnum (p0, x0, -1, z0)) ++ ++/* ++** st2_vnum_s8_m2: ++** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0, #-2, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_vnum_s8_m2, svint8x2_t, int8_t, ++ svst2_vnum_s8 (p0, x0, -2, z0), ++ svst2_vnum (p0, x0, -2, z0)) ++ ++/* ++** st2_vnum_s8_m16: ++** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0, #-16, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_vnum_s8_m16, svint8x2_t, int8_t, ++ svst2_vnum_s8 (p0, x0, -16, z0), ++ svst2_vnum (p0, x0, -16, z0)) ++ ++/* ++** st2_vnum_s8_m18: ++** addvl (x[0-9]+), x0, #-18 ++** st2b {z0\.b(?: - |, )z1\.b}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st2_vnum_s8_m18, svint8x2_t, int8_t, ++ svst2_vnum_s8 (p0, x0, -18, z0), ++ svst2_vnum (p0, x0, -18, z0)) ++ ++/* ++** st2_vnum_s8_x1: ++** cntb (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** st2b {z0\.b(?: - |, )z1\.b}, p0, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_STORE (st2_vnum_s8_x1, svint8x2_t, int8_t, ++ svst2_vnum_s8 (p0, x0, x1, z0), ++ svst2_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_u16.c +new file mode 100644 +index 000000000..edd32d81e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_u16.c +@@ -0,0 +1,200 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st2_u16_base: ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_u16_base, svuint16x2_t, uint16_t, ++ svst2_u16 (p0, x0, z0), ++ svst2 (p0, x0, z0)) ++ ++/* ++** st2_u16_index: ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_STORE (st2_u16_index, svuint16x2_t, uint16_t, ++ svst2_u16 (p0, x0 + x1, z0), ++ svst2 (p0, x0 + x1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_u16_1: ++** incb x0 ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_u16_1, svuint16x2_t, uint16_t, ++ svst2_u16 (p0, x0 + svcnth (), z0), ++ svst2 (p0, x0 + svcnth (), z0)) ++ ++/* ++** st2_u16_2: ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #2, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_u16_2, svuint16x2_t, uint16_t, ++ svst2_u16 (p0, x0 + svcnth () * 2, z0), ++ svst2 (p0, x0 + svcnth () * 2, z0)) ++ ++/* ++** st2_u16_14: ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #14, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_u16_14, svuint16x2_t, uint16_t, ++ svst2_u16 (p0, x0 + svcnth () * 14, z0), ++ svst2 (p0, x0 + svcnth () * 14, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_u16_16: ++** incb x0, all, mul #16 ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_u16_16, svuint16x2_t, uint16_t, ++ svst2_u16 (p0, x0 + svcnth () * 16, z0), ++ svst2 (p0, x0 + svcnth () * 16, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st2_u16_m1: ++** decb x0 ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_u16_m1, svuint16x2_t, uint16_t, ++ svst2_u16 (p0, x0 - svcnth (), z0), ++ svst2 (p0, x0 - svcnth (), z0)) ++ ++/* ++** st2_u16_m2: ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #-2, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_u16_m2, svuint16x2_t, uint16_t, ++ svst2_u16 (p0, x0 - svcnth () * 2, z0), ++ svst2 (p0, x0 - svcnth () * 2, z0)) ++ ++/* ++** st2_u16_m16: ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #-16, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_u16_m16, svuint16x2_t, uint16_t, ++ svst2_u16 (p0, x0 - svcnth () * 16, z0), ++ svst2 (p0, x0 - svcnth () * 16, z0)) ++ ++/* ++** st2_u16_m18: ++** addvl (x[0-9]+), x0, #-18 ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st2_u16_m18, svuint16x2_t, uint16_t, ++ svst2_u16 (p0, x0 - svcnth () * 18, z0), ++ svst2 (p0, x0 - svcnth () * 18, z0)) ++ ++/* ++** st2_vnum_u16_0: ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_vnum_u16_0, svuint16x2_t, uint16_t, ++ svst2_vnum_u16 (p0, x0, 0, z0), ++ svst2_vnum (p0, x0, 0, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_vnum_u16_1: ++** incb x0 ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_vnum_u16_1, svuint16x2_t, uint16_t, ++ svst2_vnum_u16 (p0, x0, 1, z0), ++ svst2_vnum (p0, x0, 1, z0)) ++ ++/* ++** st2_vnum_u16_2: ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #2, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_vnum_u16_2, svuint16x2_t, uint16_t, ++ svst2_vnum_u16 (p0, x0, 2, z0), ++ svst2_vnum (p0, x0, 2, z0)) ++ ++/* ++** st2_vnum_u16_14: ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #14, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_vnum_u16_14, svuint16x2_t, uint16_t, ++ svst2_vnum_u16 (p0, x0, 14, z0), ++ svst2_vnum (p0, x0, 14, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_vnum_u16_16: ++** incb x0, all, mul #16 ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_vnum_u16_16, svuint16x2_t, uint16_t, ++ svst2_vnum_u16 (p0, x0, 16, z0), ++ svst2_vnum (p0, x0, 16, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_vnum_u16_m1: ++** decb x0 ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_vnum_u16_m1, svuint16x2_t, uint16_t, ++ svst2_vnum_u16 (p0, x0, -1, z0), ++ svst2_vnum (p0, x0, -1, z0)) ++ ++/* ++** st2_vnum_u16_m2: ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #-2, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_vnum_u16_m2, svuint16x2_t, uint16_t, ++ svst2_vnum_u16 (p0, x0, -2, z0), ++ svst2_vnum (p0, x0, -2, z0)) ++ ++/* ++** st2_vnum_u16_m16: ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #-16, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_vnum_u16_m16, svuint16x2_t, uint16_t, ++ svst2_vnum_u16 (p0, x0, -16, z0), ++ svst2_vnum (p0, x0, -16, z0)) ++ ++/* ++** st2_vnum_u16_m18: ++** addvl (x[0-9]+), x0, #-18 ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st2_vnum_u16_m18, svuint16x2_t, uint16_t, ++ svst2_vnum_u16 (p0, x0, -18, z0), ++ svst2_vnum (p0, x0, -18, z0)) ++ ++/* Using MUL to calculate an index would also be OK. 
*/ ++/* ++** st2_vnum_u16_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** st2h {z0\.h(?: - |, )z1\.h}, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (st2_vnum_u16_x1, svuint16x2_t, uint16_t, ++ svst2_vnum_u16 (p0, x0, x1, z0), ++ svst2_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_u32.c +new file mode 100644 +index 000000000..46f1b5ca7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_u32.c +@@ -0,0 +1,200 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st2_u32_base: ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_u32_base, svuint32x2_t, uint32_t, ++ svst2_u32 (p0, x0, z0), ++ svst2 (p0, x0, z0)) ++ ++/* ++** st2_u32_index: ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_STORE (st2_u32_index, svuint32x2_t, uint32_t, ++ svst2_u32 (p0, x0 + x1, z0), ++ svst2 (p0, x0 + x1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_u32_1: ++** incb x0 ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_u32_1, svuint32x2_t, uint32_t, ++ svst2_u32 (p0, x0 + svcntw (), z0), ++ svst2 (p0, x0 + svcntw (), z0)) ++ ++/* ++** st2_u32_2: ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, #2, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_u32_2, svuint32x2_t, uint32_t, ++ svst2_u32 (p0, x0 + svcntw () * 2, z0), ++ svst2 (p0, x0 + svcntw () * 2, z0)) ++ ++/* ++** st2_u32_14: ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, #14, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_u32_14, svuint32x2_t, uint32_t, ++ svst2_u32 (p0, x0 + svcntw () * 14, z0), ++ svst2 (p0, x0 + svcntw () * 14, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_u32_16: ++** incb x0, all, mul #16 ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_u32_16, svuint32x2_t, uint32_t, ++ svst2_u32 (p0, x0 + svcntw () * 16, z0), ++ svst2 (p0, x0 + svcntw () * 16, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_u32_m1: ++** decb x0 ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_u32_m1, svuint32x2_t, uint32_t, ++ svst2_u32 (p0, x0 - svcntw (), z0), ++ svst2 (p0, x0 - svcntw (), z0)) ++ ++/* ++** st2_u32_m2: ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, #-2, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_u32_m2, svuint32x2_t, uint32_t, ++ svst2_u32 (p0, x0 - svcntw () * 2, z0), ++ svst2 (p0, x0 - svcntw () * 2, z0)) ++ ++/* ++** st2_u32_m16: ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, #-16, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_u32_m16, svuint32x2_t, uint32_t, ++ svst2_u32 (p0, x0 - svcntw () * 16, z0), ++ svst2 (p0, x0 - svcntw () * 16, z0)) ++ ++/* ++** st2_u32_m18: ++** addvl (x[0-9]+), x0, #-18 ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st2_u32_m18, svuint32x2_t, uint32_t, ++ svst2_u32 (p0, x0 - svcntw () * 18, z0), ++ svst2 (p0, x0 - svcntw () * 18, z0)) ++ ++/* ++** st2_vnum_u32_0: ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_vnum_u32_0, svuint32x2_t, uint32_t, ++ svst2_vnum_u32 (p0, x0, 0, z0), ++ svst2_vnum (p0, x0, 0, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st2_vnum_u32_1: ++** incb x0 ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_vnum_u32_1, svuint32x2_t, uint32_t, ++ svst2_vnum_u32 (p0, x0, 1, z0), ++ svst2_vnum (p0, x0, 1, z0)) ++ ++/* ++** st2_vnum_u32_2: ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, #2, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_vnum_u32_2, svuint32x2_t, uint32_t, ++ svst2_vnum_u32 (p0, x0, 2, z0), ++ svst2_vnum (p0, x0, 2, z0)) ++ ++/* ++** st2_vnum_u32_14: ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, #14, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_vnum_u32_14, svuint32x2_t, uint32_t, ++ svst2_vnum_u32 (p0, x0, 14, z0), ++ svst2_vnum (p0, x0, 14, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_vnum_u32_16: ++** incb x0, all, mul #16 ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_vnum_u32_16, svuint32x2_t, uint32_t, ++ svst2_vnum_u32 (p0, x0, 16, z0), ++ svst2_vnum (p0, x0, 16, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_vnum_u32_m1: ++** decb x0 ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_vnum_u32_m1, svuint32x2_t, uint32_t, ++ svst2_vnum_u32 (p0, x0, -1, z0), ++ svst2_vnum (p0, x0, -1, z0)) ++ ++/* ++** st2_vnum_u32_m2: ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, #-2, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_vnum_u32_m2, svuint32x2_t, uint32_t, ++ svst2_vnum_u32 (p0, x0, -2, z0), ++ svst2_vnum (p0, x0, -2, z0)) ++ ++/* ++** st2_vnum_u32_m16: ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, #-16, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_vnum_u32_m16, svuint32x2_t, uint32_t, ++ svst2_vnum_u32 (p0, x0, -16, z0), ++ svst2_vnum (p0, x0, -16, z0)) ++ ++/* ++** st2_vnum_u32_m18: ++** addvl (x[0-9]+), x0, #-18 ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st2_vnum_u32_m18, svuint32x2_t, uint32_t, ++ svst2_vnum_u32 (p0, x0, -18, z0), ++ svst2_vnum (p0, x0, -18, z0)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** st2_vnum_u32_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** st2w {z0\.s(?: - |, )z1\.s}, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (st2_vnum_u32_x1, svuint32x2_t, uint32_t, ++ svst2_vnum_u32 (p0, x0, x1, z0), ++ svst2_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_u64.c +new file mode 100644 +index 000000000..0d9202b72 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_u64.c +@@ -0,0 +1,200 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st2_u64_base: ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_u64_base, svuint64x2_t, uint64_t, ++ svst2_u64 (p0, x0, z0), ++ svst2 (p0, x0, z0)) ++ ++/* ++** st2_u64_index: ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, x1, lsl 3\] ++** ret ++*/ ++TEST_STORE (st2_u64_index, svuint64x2_t, uint64_t, ++ svst2_u64 (p0, x0 + x1, z0), ++ svst2 (p0, x0 + x1, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st2_u64_1: ++** incb x0 ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_u64_1, svuint64x2_t, uint64_t, ++ svst2_u64 (p0, x0 + svcntd (), z0), ++ svst2 (p0, x0 + svcntd (), z0)) ++ ++/* ++** st2_u64_2: ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, #2, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_u64_2, svuint64x2_t, uint64_t, ++ svst2_u64 (p0, x0 + svcntd () * 2, z0), ++ svst2 (p0, x0 + svcntd () * 2, z0)) ++ ++/* ++** st2_u64_14: ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, #14, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_u64_14, svuint64x2_t, uint64_t, ++ svst2_u64 (p0, x0 + svcntd () * 14, z0), ++ svst2 (p0, x0 + svcntd () * 14, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_u64_16: ++** incb x0, all, mul #16 ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_u64_16, svuint64x2_t, uint64_t, ++ svst2_u64 (p0, x0 + svcntd () * 16, z0), ++ svst2 (p0, x0 + svcntd () * 16, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_u64_m1: ++** decb x0 ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_u64_m1, svuint64x2_t, uint64_t, ++ svst2_u64 (p0, x0 - svcntd (), z0), ++ svst2 (p0, x0 - svcntd (), z0)) ++ ++/* ++** st2_u64_m2: ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, #-2, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_u64_m2, svuint64x2_t, uint64_t, ++ svst2_u64 (p0, x0 - svcntd () * 2, z0), ++ svst2 (p0, x0 - svcntd () * 2, z0)) ++ ++/* ++** st2_u64_m16: ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, #-16, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_u64_m16, svuint64x2_t, uint64_t, ++ svst2_u64 (p0, x0 - svcntd () * 16, z0), ++ svst2 (p0, x0 - svcntd () * 16, z0)) ++ ++/* ++** st2_u64_m18: ++** addvl (x[0-9]+), x0, #-18 ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st2_u64_m18, svuint64x2_t, uint64_t, ++ svst2_u64 (p0, x0 - svcntd () * 18, z0), ++ svst2 (p0, x0 - svcntd () * 18, z0)) ++ ++/* ++** st2_vnum_u64_0: ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_vnum_u64_0, svuint64x2_t, uint64_t, ++ svst2_vnum_u64 (p0, x0, 0, z0), ++ svst2_vnum (p0, x0, 0, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_vnum_u64_1: ++** incb x0 ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_vnum_u64_1, svuint64x2_t, uint64_t, ++ svst2_vnum_u64 (p0, x0, 1, z0), ++ svst2_vnum (p0, x0, 1, z0)) ++ ++/* ++** st2_vnum_u64_2: ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, #2, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_vnum_u64_2, svuint64x2_t, uint64_t, ++ svst2_vnum_u64 (p0, x0, 2, z0), ++ svst2_vnum (p0, x0, 2, z0)) ++ ++/* ++** st2_vnum_u64_14: ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, #14, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_vnum_u64_14, svuint64x2_t, uint64_t, ++ svst2_vnum_u64 (p0, x0, 14, z0), ++ svst2_vnum (p0, x0, 14, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_vnum_u64_16: ++** incb x0, all, mul #16 ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_vnum_u64_16, svuint64x2_t, uint64_t, ++ svst2_vnum_u64 (p0, x0, 16, z0), ++ svst2_vnum (p0, x0, 16, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st2_vnum_u64_m1: ++** decb x0 ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_vnum_u64_m1, svuint64x2_t, uint64_t, ++ svst2_vnum_u64 (p0, x0, -1, z0), ++ svst2_vnum (p0, x0, -1, z0)) ++ ++/* ++** st2_vnum_u64_m2: ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, #-2, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_vnum_u64_m2, svuint64x2_t, uint64_t, ++ svst2_vnum_u64 (p0, x0, -2, z0), ++ svst2_vnum (p0, x0, -2, z0)) ++ ++/* ++** st2_vnum_u64_m16: ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, #-16, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_vnum_u64_m16, svuint64x2_t, uint64_t, ++ svst2_vnum_u64 (p0, x0, -16, z0), ++ svst2_vnum (p0, x0, -16, z0)) ++ ++/* ++** st2_vnum_u64_m18: ++** addvl (x[0-9]+), x0, #-18 ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st2_vnum_u64_m18, svuint64x2_t, uint64_t, ++ svst2_vnum_u64 (p0, x0, -18, z0), ++ svst2_vnum (p0, x0, -18, z0)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** st2_vnum_u64_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** st2d {z0\.d(?: - |, )z1\.d}, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (st2_vnum_u64_x1, svuint64x2_t, uint64_t, ++ svst2_vnum_u64 (p0, x0, x1, z0), ++ svst2_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_u8.c +new file mode 100644 +index 000000000..e7ea977a2 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_u8.c +@@ -0,0 +1,204 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st2_u8_base: ++** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_u8_base, svuint8x2_t, uint8_t, ++ svst2_u8 (p0, x0, z0), ++ svst2 (p0, x0, z0)) ++ ++/* ++** st2_u8_index: ++** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0, x1\] ++** ret ++*/ ++TEST_STORE (st2_u8_index, svuint8x2_t, uint8_t, ++ svst2_u8 (p0, x0 + x1, z0), ++ svst2 (p0, x0 + x1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_u8_1: ++** incb x0 ++** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_u8_1, svuint8x2_t, uint8_t, ++ svst2_u8 (p0, x0 + svcntb (), z0), ++ svst2 (p0, x0 + svcntb (), z0)) ++ ++/* ++** st2_u8_2: ++** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0, #2, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_u8_2, svuint8x2_t, uint8_t, ++ svst2_u8 (p0, x0 + svcntb () * 2, z0), ++ svst2 (p0, x0 + svcntb () * 2, z0)) ++ ++/* ++** st2_u8_14: ++** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0, #14, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_u8_14, svuint8x2_t, uint8_t, ++ svst2_u8 (p0, x0 + svcntb () * 14, z0), ++ svst2 (p0, x0 + svcntb () * 14, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_u8_16: ++** incb x0, all, mul #16 ++** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_u8_16, svuint8x2_t, uint8_t, ++ svst2_u8 (p0, x0 + svcntb () * 16, z0), ++ svst2 (p0, x0 + svcntb () * 16, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st2_u8_m1: ++** decb x0 ++** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_u8_m1, svuint8x2_t, uint8_t, ++ svst2_u8 (p0, x0 - svcntb (), z0), ++ svst2 (p0, x0 - svcntb (), z0)) ++ ++/* ++** st2_u8_m2: ++** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0, #-2, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_u8_m2, svuint8x2_t, uint8_t, ++ svst2_u8 (p0, x0 - svcntb () * 2, z0), ++ svst2 (p0, x0 - svcntb () * 2, z0)) ++ ++/* ++** st2_u8_m16: ++** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0, #-16, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_u8_m16, svuint8x2_t, uint8_t, ++ svst2_u8 (p0, x0 - svcntb () * 16, z0), ++ svst2 (p0, x0 - svcntb () * 16, z0)) ++ ++/* ++** st2_u8_m18: ++** addvl (x[0-9]+), x0, #-18 ++** st2b {z0\.b(?: - |, )z1\.b}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st2_u8_m18, svuint8x2_t, uint8_t, ++ svst2_u8 (p0, x0 - svcntb () * 18, z0), ++ svst2 (p0, x0 - svcntb () * 18, z0)) ++ ++/* ++** st2_vnum_u8_0: ++** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_vnum_u8_0, svuint8x2_t, uint8_t, ++ svst2_vnum_u8 (p0, x0, 0, z0), ++ svst2_vnum (p0, x0, 0, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_vnum_u8_1: ++** incb x0 ++** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_vnum_u8_1, svuint8x2_t, uint8_t, ++ svst2_vnum_u8 (p0, x0, 1, z0), ++ svst2_vnum (p0, x0, 1, z0)) ++ ++/* ++** st2_vnum_u8_2: ++** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0, #2, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_vnum_u8_2, svuint8x2_t, uint8_t, ++ svst2_vnum_u8 (p0, x0, 2, z0), ++ svst2_vnum (p0, x0, 2, z0)) ++ ++/* ++** st2_vnum_u8_14: ++** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0, #14, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_vnum_u8_14, svuint8x2_t, uint8_t, ++ svst2_vnum_u8 (p0, x0, 14, z0), ++ svst2_vnum (p0, x0, 14, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st2_vnum_u8_16: ++** incb x0, all, mul #16 ++** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_vnum_u8_16, svuint8x2_t, uint8_t, ++ svst2_vnum_u8 (p0, x0, 16, z0), ++ svst2_vnum (p0, x0, 16, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st2_vnum_u8_m1: ++** decb x0 ++** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st2_vnum_u8_m1, svuint8x2_t, uint8_t, ++ svst2_vnum_u8 (p0, x0, -1, z0), ++ svst2_vnum (p0, x0, -1, z0)) ++ ++/* ++** st2_vnum_u8_m2: ++** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0, #-2, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_vnum_u8_m2, svuint8x2_t, uint8_t, ++ svst2_vnum_u8 (p0, x0, -2, z0), ++ svst2_vnum (p0, x0, -2, z0)) ++ ++/* ++** st2_vnum_u8_m16: ++** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0, #-16, mul vl\] ++** ret ++*/ ++TEST_STORE (st2_vnum_u8_m16, svuint8x2_t, uint8_t, ++ svst2_vnum_u8 (p0, x0, -16, z0), ++ svst2_vnum (p0, x0, -16, z0)) ++ ++/* ++** st2_vnum_u8_m18: ++** addvl (x[0-9]+), x0, #-18 ++** st2b {z0\.b(?: - |, )z1\.b}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st2_vnum_u8_m18, svuint8x2_t, uint8_t, ++ svst2_vnum_u8 (p0, x0, -18, z0), ++ svst2_vnum (p0, x0, -18, z0)) ++ ++/* ++** st2_vnum_u8_x1: ++** cntb (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** st2b {z0\.b(?: - |, )z1\.b}, p0, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_STORE (st2_vnum_u8_x1, svuint8x2_t, uint8_t, ++ svst2_vnum_u8 (p0, x0, x1, z0), ++ svst2_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_bf16.c +new file mode 100644 +index 000000000..2f921687c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_bf16.c +@@ -0,0 +1,242 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st3_bf16_base: ++** st3h {z0\.h - z2\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_bf16_base, svbfloat16x3_t, bfloat16_t, ++ svst3_bf16 (p0, x0, z0), ++ svst3 (p0, x0, z0)) ++ ++/* ++** st3_bf16_index: ++** st3h {z0\.h - z2\.h}, p0, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_STORE (st3_bf16_index, svbfloat16x3_t, bfloat16_t, ++ svst3_bf16 (p0, x0 + x1, z0), ++ svst3 (p0, x0 + x1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_bf16_1: ++** incb x0 ++** st3h {z0\.h - z2\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_bf16_1, svbfloat16x3_t, bfloat16_t, ++ svst3_bf16 (p0, x0 + svcnth (), z0), ++ svst3 (p0, x0 + svcnth (), z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_bf16_2: ++** incb x0, all, mul #2 ++** st3h {z0\.h - z2\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_bf16_2, svbfloat16x3_t, bfloat16_t, ++ svst3_bf16 (p0, x0 + svcnth () * 2, z0), ++ svst3 (p0, x0 + svcnth () * 2, z0)) ++ ++/* ++** st3_bf16_3: ++** st3h {z0\.h - z2\.h}, p0, \[x0, #3, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_bf16_3, svbfloat16x3_t, bfloat16_t, ++ svst3_bf16 (p0, x0 + svcnth () * 3, z0), ++ svst3 (p0, x0 + svcnth () * 3, z0)) ++ ++/* ++** st3_bf16_21: ++** st3h {z0\.h - z2\.h}, p0, \[x0, #21, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_bf16_21, svbfloat16x3_t, bfloat16_t, ++ svst3_bf16 (p0, x0 + svcnth () * 21, z0), ++ svst3 (p0, x0 + svcnth () * 21, z0)) ++ ++/* ++** st3_bf16_24: ++** addvl (x[0-9]+), x0, #24 ++** st3h {z0\.h - z2\.h}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st3_bf16_24, svbfloat16x3_t, bfloat16_t, ++ svst3_bf16 (p0, x0 + svcnth () * 24, z0), ++ svst3 (p0, x0 + svcnth () * 24, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st3_bf16_m1: ++** decb x0 ++** st3h {z0\.h - z2\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_bf16_m1, svbfloat16x3_t, bfloat16_t, ++ svst3_bf16 (p0, x0 - svcnth (), z0), ++ svst3 (p0, x0 - svcnth (), z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_bf16_m2: ++** decb x0, all, mul #2 ++** st3h {z0\.h - z2\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_bf16_m2, svbfloat16x3_t, bfloat16_t, ++ svst3_bf16 (p0, x0 - svcnth () * 2, z0), ++ svst3 (p0, x0 - svcnth () * 2, z0)) ++ ++/* ++** st3_bf16_m3: ++** st3h {z0\.h - z2\.h}, p0, \[x0, #-3, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_bf16_m3, svbfloat16x3_t, bfloat16_t, ++ svst3_bf16 (p0, x0 - svcnth () * 3, z0), ++ svst3 (p0, x0 - svcnth () * 3, z0)) ++ ++/* ++** st3_bf16_m24: ++** st3h {z0\.h - z2\.h}, p0, \[x0, #-24, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_bf16_m24, svbfloat16x3_t, bfloat16_t, ++ svst3_bf16 (p0, x0 - svcnth () * 24, z0), ++ svst3 (p0, x0 - svcnth () * 24, z0)) ++ ++/* ++** st3_bf16_m27: ++** addvl (x[0-9]+), x0, #-27 ++** st3h {z0\.h - z2\.h}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st3_bf16_m27, svbfloat16x3_t, bfloat16_t, ++ svst3_bf16 (p0, x0 - svcnth () * 27, z0), ++ svst3 (p0, x0 - svcnth () * 27, z0)) ++ ++/* ++** st3_vnum_bf16_0: ++** st3h {z0\.h - z2\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_bf16_0, svbfloat16x3_t, bfloat16_t, ++ svst3_vnum_bf16 (p0, x0, 0, z0), ++ svst3_vnum (p0, x0, 0, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_vnum_bf16_1: ++** incb x0 ++** st3h {z0\.h - z2\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_bf16_1, svbfloat16x3_t, bfloat16_t, ++ svst3_vnum_bf16 (p0, x0, 1, z0), ++ svst3_vnum (p0, x0, 1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_vnum_bf16_2: ++** incb x0, all, mul #2 ++** st3h {z0\.h - z2\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_bf16_2, svbfloat16x3_t, bfloat16_t, ++ svst3_vnum_bf16 (p0, x0, 2, z0), ++ svst3_vnum (p0, x0, 2, z0)) ++ ++/* ++** st3_vnum_bf16_3: ++** st3h {z0\.h - z2\.h}, p0, \[x0, #3, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_vnum_bf16_3, svbfloat16x3_t, bfloat16_t, ++ svst3_vnum_bf16 (p0, x0, 3, z0), ++ svst3_vnum (p0, x0, 3, z0)) ++ ++/* ++** st3_vnum_bf16_21: ++** st3h {z0\.h - z2\.h}, p0, \[x0, #21, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_vnum_bf16_21, svbfloat16x3_t, bfloat16_t, ++ svst3_vnum_bf16 (p0, x0, 21, z0), ++ svst3_vnum (p0, x0, 21, z0)) ++ ++/* ++** st3_vnum_bf16_24: ++** addvl (x[0-9]+), x0, #24 ++** st3h {z0\.h - z2\.h}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st3_vnum_bf16_24, svbfloat16x3_t, bfloat16_t, ++ svst3_vnum_bf16 (p0, x0, 24, z0), ++ svst3_vnum (p0, x0, 24, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_vnum_bf16_m1: ++** decb x0 ++** st3h {z0\.h - z2\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_bf16_m1, svbfloat16x3_t, bfloat16_t, ++ svst3_vnum_bf16 (p0, x0, -1, z0), ++ svst3_vnum (p0, x0, -1, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st3_vnum_bf16_m2: ++** decb x0, all, mul #2 ++** st3h {z0\.h - z2\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_bf16_m2, svbfloat16x3_t, bfloat16_t, ++ svst3_vnum_bf16 (p0, x0, -2, z0), ++ svst3_vnum (p0, x0, -2, z0)) ++ ++/* ++** st3_vnum_bf16_m3: ++** st3h {z0\.h - z2\.h}, p0, \[x0, #-3, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_vnum_bf16_m3, svbfloat16x3_t, bfloat16_t, ++ svst3_vnum_bf16 (p0, x0, -3, z0), ++ svst3_vnum (p0, x0, -3, z0)) ++ ++/* ++** st3_vnum_bf16_m24: ++** st3h {z0\.h - z2\.h}, p0, \[x0, #-24, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_vnum_bf16_m24, svbfloat16x3_t, bfloat16_t, ++ svst3_vnum_bf16 (p0, x0, -24, z0), ++ svst3_vnum (p0, x0, -24, z0)) ++ ++/* ++** st3_vnum_bf16_m27: ++** addvl (x[0-9]+), x0, #-27 ++** st3h {z0\.h - z2\.h}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st3_vnum_bf16_m27, svbfloat16x3_t, bfloat16_t, ++ svst3_vnum_bf16 (p0, x0, -27, z0), ++ svst3_vnum (p0, x0, -27, z0)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** st3_vnum_bf16_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** st3h {z0\.h - z2\.h}, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (st3_vnum_bf16_x1, svbfloat16x3_t, bfloat16_t, ++ svst3_vnum_bf16 (p0, x0, x1, z0), ++ svst3_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_f16.c +new file mode 100644 +index 000000000..388eb3708 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_f16.c +@@ -0,0 +1,242 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st3_f16_base: ++** st3h {z0\.h - z2\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_f16_base, svfloat16x3_t, float16_t, ++ svst3_f16 (p0, x0, z0), ++ svst3 (p0, x0, z0)) ++ ++/* ++** st3_f16_index: ++** st3h {z0\.h - z2\.h}, p0, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_STORE (st3_f16_index, svfloat16x3_t, float16_t, ++ svst3_f16 (p0, x0 + x1, z0), ++ svst3 (p0, x0 + x1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_f16_1: ++** incb x0 ++** st3h {z0\.h - z2\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_f16_1, svfloat16x3_t, float16_t, ++ svst3_f16 (p0, x0 + svcnth (), z0), ++ svst3 (p0, x0 + svcnth (), z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_f16_2: ++** incb x0, all, mul #2 ++** st3h {z0\.h - z2\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_f16_2, svfloat16x3_t, float16_t, ++ svst3_f16 (p0, x0 + svcnth () * 2, z0), ++ svst3 (p0, x0 + svcnth () * 2, z0)) ++ ++/* ++** st3_f16_3: ++** st3h {z0\.h - z2\.h}, p0, \[x0, #3, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_f16_3, svfloat16x3_t, float16_t, ++ svst3_f16 (p0, x0 + svcnth () * 3, z0), ++ svst3 (p0, x0 + svcnth () * 3, z0)) ++ ++/* ++** st3_f16_21: ++** st3h {z0\.h - z2\.h}, p0, \[x0, #21, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_f16_21, svfloat16x3_t, float16_t, ++ svst3_f16 (p0, x0 + svcnth () * 21, z0), ++ svst3 (p0, x0 + svcnth () * 21, z0)) ++ ++/* ++** st3_f16_24: ++** addvl (x[0-9]+), x0, #24 ++** st3h {z0\.h - z2\.h}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st3_f16_24, svfloat16x3_t, float16_t, ++ svst3_f16 (p0, x0 + svcnth () * 24, z0), ++ svst3 (p0, x0 + svcnth () * 24, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st3_f16_m1: ++** decb x0 ++** st3h {z0\.h - z2\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_f16_m1, svfloat16x3_t, float16_t, ++ svst3_f16 (p0, x0 - svcnth (), z0), ++ svst3 (p0, x0 - svcnth (), z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_f16_m2: ++** decb x0, all, mul #2 ++** st3h {z0\.h - z2\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_f16_m2, svfloat16x3_t, float16_t, ++ svst3_f16 (p0, x0 - svcnth () * 2, z0), ++ svst3 (p0, x0 - svcnth () * 2, z0)) ++ ++/* ++** st3_f16_m3: ++** st3h {z0\.h - z2\.h}, p0, \[x0, #-3, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_f16_m3, svfloat16x3_t, float16_t, ++ svst3_f16 (p0, x0 - svcnth () * 3, z0), ++ svst3 (p0, x0 - svcnth () * 3, z0)) ++ ++/* ++** st3_f16_m24: ++** st3h {z0\.h - z2\.h}, p0, \[x0, #-24, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_f16_m24, svfloat16x3_t, float16_t, ++ svst3_f16 (p0, x0 - svcnth () * 24, z0), ++ svst3 (p0, x0 - svcnth () * 24, z0)) ++ ++/* ++** st3_f16_m27: ++** addvl (x[0-9]+), x0, #-27 ++** st3h {z0\.h - z2\.h}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st3_f16_m27, svfloat16x3_t, float16_t, ++ svst3_f16 (p0, x0 - svcnth () * 27, z0), ++ svst3 (p0, x0 - svcnth () * 27, z0)) ++ ++/* ++** st3_vnum_f16_0: ++** st3h {z0\.h - z2\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_f16_0, svfloat16x3_t, float16_t, ++ svst3_vnum_f16 (p0, x0, 0, z0), ++ svst3_vnum (p0, x0, 0, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_vnum_f16_1: ++** incb x0 ++** st3h {z0\.h - z2\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_f16_1, svfloat16x3_t, float16_t, ++ svst3_vnum_f16 (p0, x0, 1, z0), ++ svst3_vnum (p0, x0, 1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_vnum_f16_2: ++** incb x0, all, mul #2 ++** st3h {z0\.h - z2\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_f16_2, svfloat16x3_t, float16_t, ++ svst3_vnum_f16 (p0, x0, 2, z0), ++ svst3_vnum (p0, x0, 2, z0)) ++ ++/* ++** st3_vnum_f16_3: ++** st3h {z0\.h - z2\.h}, p0, \[x0, #3, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_vnum_f16_3, svfloat16x3_t, float16_t, ++ svst3_vnum_f16 (p0, x0, 3, z0), ++ svst3_vnum (p0, x0, 3, z0)) ++ ++/* ++** st3_vnum_f16_21: ++** st3h {z0\.h - z2\.h}, p0, \[x0, #21, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_vnum_f16_21, svfloat16x3_t, float16_t, ++ svst3_vnum_f16 (p0, x0, 21, z0), ++ svst3_vnum (p0, x0, 21, z0)) ++ ++/* ++** st3_vnum_f16_24: ++** addvl (x[0-9]+), x0, #24 ++** st3h {z0\.h - z2\.h}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st3_vnum_f16_24, svfloat16x3_t, float16_t, ++ svst3_vnum_f16 (p0, x0, 24, z0), ++ svst3_vnum (p0, x0, 24, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_vnum_f16_m1: ++** decb x0 ++** st3h {z0\.h - z2\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_f16_m1, svfloat16x3_t, float16_t, ++ svst3_vnum_f16 (p0, x0, -1, z0), ++ svst3_vnum (p0, x0, -1, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st3_vnum_f16_m2: ++** decb x0, all, mul #2 ++** st3h {z0\.h - z2\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_f16_m2, svfloat16x3_t, float16_t, ++ svst3_vnum_f16 (p0, x0, -2, z0), ++ svst3_vnum (p0, x0, -2, z0)) ++ ++/* ++** st3_vnum_f16_m3: ++** st3h {z0\.h - z2\.h}, p0, \[x0, #-3, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_vnum_f16_m3, svfloat16x3_t, float16_t, ++ svst3_vnum_f16 (p0, x0, -3, z0), ++ svst3_vnum (p0, x0, -3, z0)) ++ ++/* ++** st3_vnum_f16_m24: ++** st3h {z0\.h - z2\.h}, p0, \[x0, #-24, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_vnum_f16_m24, svfloat16x3_t, float16_t, ++ svst3_vnum_f16 (p0, x0, -24, z0), ++ svst3_vnum (p0, x0, -24, z0)) ++ ++/* ++** st3_vnum_f16_m27: ++** addvl (x[0-9]+), x0, #-27 ++** st3h {z0\.h - z2\.h}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st3_vnum_f16_m27, svfloat16x3_t, float16_t, ++ svst3_vnum_f16 (p0, x0, -27, z0), ++ svst3_vnum (p0, x0, -27, z0)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** st3_vnum_f16_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** st3h {z0\.h - z2\.h}, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (st3_vnum_f16_x1, svfloat16x3_t, float16_t, ++ svst3_vnum_f16 (p0, x0, x1, z0), ++ svst3_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_f32.c +new file mode 100644 +index 000000000..a5e3bdb45 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_f32.c +@@ -0,0 +1,242 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st3_f32_base: ++** st3w {z0\.s - z2\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_f32_base, svfloat32x3_t, float32_t, ++ svst3_f32 (p0, x0, z0), ++ svst3 (p0, x0, z0)) ++ ++/* ++** st3_f32_index: ++** st3w {z0\.s - z2\.s}, p0, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_STORE (st3_f32_index, svfloat32x3_t, float32_t, ++ svst3_f32 (p0, x0 + x1, z0), ++ svst3 (p0, x0 + x1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_f32_1: ++** incb x0 ++** st3w {z0\.s - z2\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_f32_1, svfloat32x3_t, float32_t, ++ svst3_f32 (p0, x0 + svcntw (), z0), ++ svst3 (p0, x0 + svcntw (), z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_f32_2: ++** incb x0, all, mul #2 ++** st3w {z0\.s - z2\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_f32_2, svfloat32x3_t, float32_t, ++ svst3_f32 (p0, x0 + svcntw () * 2, z0), ++ svst3 (p0, x0 + svcntw () * 2, z0)) ++ ++/* ++** st3_f32_3: ++** st3w {z0\.s - z2\.s}, p0, \[x0, #3, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_f32_3, svfloat32x3_t, float32_t, ++ svst3_f32 (p0, x0 + svcntw () * 3, z0), ++ svst3 (p0, x0 + svcntw () * 3, z0)) ++ ++/* ++** st3_f32_21: ++** st3w {z0\.s - z2\.s}, p0, \[x0, #21, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_f32_21, svfloat32x3_t, float32_t, ++ svst3_f32 (p0, x0 + svcntw () * 21, z0), ++ svst3 (p0, x0 + svcntw () * 21, z0)) ++ ++/* ++** st3_f32_24: ++** addvl (x[0-9]+), x0, #24 ++** st3w {z0\.s - z2\.s}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st3_f32_24, svfloat32x3_t, float32_t, ++ svst3_f32 (p0, x0 + svcntw () * 24, z0), ++ svst3 (p0, x0 + svcntw () * 24, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st3_f32_m1: ++** decb x0 ++** st3w {z0\.s - z2\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_f32_m1, svfloat32x3_t, float32_t, ++ svst3_f32 (p0, x0 - svcntw (), z0), ++ svst3 (p0, x0 - svcntw (), z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_f32_m2: ++** decb x0, all, mul #2 ++** st3w {z0\.s - z2\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_f32_m2, svfloat32x3_t, float32_t, ++ svst3_f32 (p0, x0 - svcntw () * 2, z0), ++ svst3 (p0, x0 - svcntw () * 2, z0)) ++ ++/* ++** st3_f32_m3: ++** st3w {z0\.s - z2\.s}, p0, \[x0, #-3, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_f32_m3, svfloat32x3_t, float32_t, ++ svst3_f32 (p0, x0 - svcntw () * 3, z0), ++ svst3 (p0, x0 - svcntw () * 3, z0)) ++ ++/* ++** st3_f32_m24: ++** st3w {z0\.s - z2\.s}, p0, \[x0, #-24, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_f32_m24, svfloat32x3_t, float32_t, ++ svst3_f32 (p0, x0 - svcntw () * 24, z0), ++ svst3 (p0, x0 - svcntw () * 24, z0)) ++ ++/* ++** st3_f32_m27: ++** addvl (x[0-9]+), x0, #-27 ++** st3w {z0\.s - z2\.s}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st3_f32_m27, svfloat32x3_t, float32_t, ++ svst3_f32 (p0, x0 - svcntw () * 27, z0), ++ svst3 (p0, x0 - svcntw () * 27, z0)) ++ ++/* ++** st3_vnum_f32_0: ++** st3w {z0\.s - z2\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_f32_0, svfloat32x3_t, float32_t, ++ svst3_vnum_f32 (p0, x0, 0, z0), ++ svst3_vnum (p0, x0, 0, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_vnum_f32_1: ++** incb x0 ++** st3w {z0\.s - z2\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_f32_1, svfloat32x3_t, float32_t, ++ svst3_vnum_f32 (p0, x0, 1, z0), ++ svst3_vnum (p0, x0, 1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_vnum_f32_2: ++** incb x0, all, mul #2 ++** st3w {z0\.s - z2\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_f32_2, svfloat32x3_t, float32_t, ++ svst3_vnum_f32 (p0, x0, 2, z0), ++ svst3_vnum (p0, x0, 2, z0)) ++ ++/* ++** st3_vnum_f32_3: ++** st3w {z0\.s - z2\.s}, p0, \[x0, #3, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_vnum_f32_3, svfloat32x3_t, float32_t, ++ svst3_vnum_f32 (p0, x0, 3, z0), ++ svst3_vnum (p0, x0, 3, z0)) ++ ++/* ++** st3_vnum_f32_21: ++** st3w {z0\.s - z2\.s}, p0, \[x0, #21, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_vnum_f32_21, svfloat32x3_t, float32_t, ++ svst3_vnum_f32 (p0, x0, 21, z0), ++ svst3_vnum (p0, x0, 21, z0)) ++ ++/* ++** st3_vnum_f32_24: ++** addvl (x[0-9]+), x0, #24 ++** st3w {z0\.s - z2\.s}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st3_vnum_f32_24, svfloat32x3_t, float32_t, ++ svst3_vnum_f32 (p0, x0, 24, z0), ++ svst3_vnum (p0, x0, 24, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_vnum_f32_m1: ++** decb x0 ++** st3w {z0\.s - z2\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_f32_m1, svfloat32x3_t, float32_t, ++ svst3_vnum_f32 (p0, x0, -1, z0), ++ svst3_vnum (p0, x0, -1, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st3_vnum_f32_m2: ++** decb x0, all, mul #2 ++** st3w {z0\.s - z2\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_f32_m2, svfloat32x3_t, float32_t, ++ svst3_vnum_f32 (p0, x0, -2, z0), ++ svst3_vnum (p0, x0, -2, z0)) ++ ++/* ++** st3_vnum_f32_m3: ++** st3w {z0\.s - z2\.s}, p0, \[x0, #-3, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_vnum_f32_m3, svfloat32x3_t, float32_t, ++ svst3_vnum_f32 (p0, x0, -3, z0), ++ svst3_vnum (p0, x0, -3, z0)) ++ ++/* ++** st3_vnum_f32_m24: ++** st3w {z0\.s - z2\.s}, p0, \[x0, #-24, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_vnum_f32_m24, svfloat32x3_t, float32_t, ++ svst3_vnum_f32 (p0, x0, -24, z0), ++ svst3_vnum (p0, x0, -24, z0)) ++ ++/* ++** st3_vnum_f32_m27: ++** addvl (x[0-9]+), x0, #-27 ++** st3w {z0\.s - z2\.s}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st3_vnum_f32_m27, svfloat32x3_t, float32_t, ++ svst3_vnum_f32 (p0, x0, -27, z0), ++ svst3_vnum (p0, x0, -27, z0)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** st3_vnum_f32_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** st3w {z0\.s - z2\.s}, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (st3_vnum_f32_x1, svfloat32x3_t, float32_t, ++ svst3_vnum_f32 (p0, x0, x1, z0), ++ svst3_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_f64.c +new file mode 100644 +index 000000000..30407da8a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_f64.c +@@ -0,0 +1,242 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st3_f64_base: ++** st3d {z0\.d - z2\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_f64_base, svfloat64x3_t, float64_t, ++ svst3_f64 (p0, x0, z0), ++ svst3 (p0, x0, z0)) ++ ++/* ++** st3_f64_index: ++** st3d {z0\.d - z2\.d}, p0, \[x0, x1, lsl 3\] ++** ret ++*/ ++TEST_STORE (st3_f64_index, svfloat64x3_t, float64_t, ++ svst3_f64 (p0, x0 + x1, z0), ++ svst3 (p0, x0 + x1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_f64_1: ++** incb x0 ++** st3d {z0\.d - z2\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_f64_1, svfloat64x3_t, float64_t, ++ svst3_f64 (p0, x0 + svcntd (), z0), ++ svst3 (p0, x0 + svcntd (), z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_f64_2: ++** incb x0, all, mul #2 ++** st3d {z0\.d - z2\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_f64_2, svfloat64x3_t, float64_t, ++ svst3_f64 (p0, x0 + svcntd () * 2, z0), ++ svst3 (p0, x0 + svcntd () * 2, z0)) ++ ++/* ++** st3_f64_3: ++** st3d {z0\.d - z2\.d}, p0, \[x0, #3, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_f64_3, svfloat64x3_t, float64_t, ++ svst3_f64 (p0, x0 + svcntd () * 3, z0), ++ svst3 (p0, x0 + svcntd () * 3, z0)) ++ ++/* ++** st3_f64_21: ++** st3d {z0\.d - z2\.d}, p0, \[x0, #21, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_f64_21, svfloat64x3_t, float64_t, ++ svst3_f64 (p0, x0 + svcntd () * 21, z0), ++ svst3 (p0, x0 + svcntd () * 21, z0)) ++ ++/* ++** st3_f64_24: ++** addvl (x[0-9]+), x0, #24 ++** st3d {z0\.d - z2\.d}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st3_f64_24, svfloat64x3_t, float64_t, ++ svst3_f64 (p0, x0 + svcntd () * 24, z0), ++ svst3 (p0, x0 + svcntd () * 24, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st3_f64_m1: ++** decb x0 ++** st3d {z0\.d - z2\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_f64_m1, svfloat64x3_t, float64_t, ++ svst3_f64 (p0, x0 - svcntd (), z0), ++ svst3 (p0, x0 - svcntd (), z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_f64_m2: ++** decb x0, all, mul #2 ++** st3d {z0\.d - z2\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_f64_m2, svfloat64x3_t, float64_t, ++ svst3_f64 (p0, x0 - svcntd () * 2, z0), ++ svst3 (p0, x0 - svcntd () * 2, z0)) ++ ++/* ++** st3_f64_m3: ++** st3d {z0\.d - z2\.d}, p0, \[x0, #-3, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_f64_m3, svfloat64x3_t, float64_t, ++ svst3_f64 (p0, x0 - svcntd () * 3, z0), ++ svst3 (p0, x0 - svcntd () * 3, z0)) ++ ++/* ++** st3_f64_m24: ++** st3d {z0\.d - z2\.d}, p0, \[x0, #-24, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_f64_m24, svfloat64x3_t, float64_t, ++ svst3_f64 (p0, x0 - svcntd () * 24, z0), ++ svst3 (p0, x0 - svcntd () * 24, z0)) ++ ++/* ++** st3_f64_m27: ++** addvl (x[0-9]+), x0, #-27 ++** st3d {z0\.d - z2\.d}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st3_f64_m27, svfloat64x3_t, float64_t, ++ svst3_f64 (p0, x0 - svcntd () * 27, z0), ++ svst3 (p0, x0 - svcntd () * 27, z0)) ++ ++/* ++** st3_vnum_f64_0: ++** st3d {z0\.d - z2\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_f64_0, svfloat64x3_t, float64_t, ++ svst3_vnum_f64 (p0, x0, 0, z0), ++ svst3_vnum (p0, x0, 0, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_vnum_f64_1: ++** incb x0 ++** st3d {z0\.d - z2\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_f64_1, svfloat64x3_t, float64_t, ++ svst3_vnum_f64 (p0, x0, 1, z0), ++ svst3_vnum (p0, x0, 1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_vnum_f64_2: ++** incb x0, all, mul #2 ++** st3d {z0\.d - z2\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_f64_2, svfloat64x3_t, float64_t, ++ svst3_vnum_f64 (p0, x0, 2, z0), ++ svst3_vnum (p0, x0, 2, z0)) ++ ++/* ++** st3_vnum_f64_3: ++** st3d {z0\.d - z2\.d}, p0, \[x0, #3, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_vnum_f64_3, svfloat64x3_t, float64_t, ++ svst3_vnum_f64 (p0, x0, 3, z0), ++ svst3_vnum (p0, x0, 3, z0)) ++ ++/* ++** st3_vnum_f64_21: ++** st3d {z0\.d - z2\.d}, p0, \[x0, #21, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_vnum_f64_21, svfloat64x3_t, float64_t, ++ svst3_vnum_f64 (p0, x0, 21, z0), ++ svst3_vnum (p0, x0, 21, z0)) ++ ++/* ++** st3_vnum_f64_24: ++** addvl (x[0-9]+), x0, #24 ++** st3d {z0\.d - z2\.d}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st3_vnum_f64_24, svfloat64x3_t, float64_t, ++ svst3_vnum_f64 (p0, x0, 24, z0), ++ svst3_vnum (p0, x0, 24, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_vnum_f64_m1: ++** decb x0 ++** st3d {z0\.d - z2\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_f64_m1, svfloat64x3_t, float64_t, ++ svst3_vnum_f64 (p0, x0, -1, z0), ++ svst3_vnum (p0, x0, -1, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st3_vnum_f64_m2: ++** decb x0, all, mul #2 ++** st3d {z0\.d - z2\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_f64_m2, svfloat64x3_t, float64_t, ++ svst3_vnum_f64 (p0, x0, -2, z0), ++ svst3_vnum (p0, x0, -2, z0)) ++ ++/* ++** st3_vnum_f64_m3: ++** st3d {z0\.d - z2\.d}, p0, \[x0, #-3, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_vnum_f64_m3, svfloat64x3_t, float64_t, ++ svst3_vnum_f64 (p0, x0, -3, z0), ++ svst3_vnum (p0, x0, -3, z0)) ++ ++/* ++** st3_vnum_f64_m24: ++** st3d {z0\.d - z2\.d}, p0, \[x0, #-24, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_vnum_f64_m24, svfloat64x3_t, float64_t, ++ svst3_vnum_f64 (p0, x0, -24, z0), ++ svst3_vnum (p0, x0, -24, z0)) ++ ++/* ++** st3_vnum_f64_m27: ++** addvl (x[0-9]+), x0, #-27 ++** st3d {z0\.d - z2\.d}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st3_vnum_f64_m27, svfloat64x3_t, float64_t, ++ svst3_vnum_f64 (p0, x0, -27, z0), ++ svst3_vnum (p0, x0, -27, z0)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** st3_vnum_f64_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** st3d {z0\.d - z2\.d}, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (st3_vnum_f64_x1, svfloat64x3_t, float64_t, ++ svst3_vnum_f64 (p0, x0, x1, z0), ++ svst3_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_s16.c +new file mode 100644 +index 000000000..a4a1109c5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_s16.c +@@ -0,0 +1,242 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st3_s16_base: ++** st3h {z0\.h - z2\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_s16_base, svint16x3_t, int16_t, ++ svst3_s16 (p0, x0, z0), ++ svst3 (p0, x0, z0)) ++ ++/* ++** st3_s16_index: ++** st3h {z0\.h - z2\.h}, p0, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_STORE (st3_s16_index, svint16x3_t, int16_t, ++ svst3_s16 (p0, x0 + x1, z0), ++ svst3 (p0, x0 + x1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_s16_1: ++** incb x0 ++** st3h {z0\.h - z2\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_s16_1, svint16x3_t, int16_t, ++ svst3_s16 (p0, x0 + svcnth (), z0), ++ svst3 (p0, x0 + svcnth (), z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_s16_2: ++** incb x0, all, mul #2 ++** st3h {z0\.h - z2\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_s16_2, svint16x3_t, int16_t, ++ svst3_s16 (p0, x0 + svcnth () * 2, z0), ++ svst3 (p0, x0 + svcnth () * 2, z0)) ++ ++/* ++** st3_s16_3: ++** st3h {z0\.h - z2\.h}, p0, \[x0, #3, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_s16_3, svint16x3_t, int16_t, ++ svst3_s16 (p0, x0 + svcnth () * 3, z0), ++ svst3 (p0, x0 + svcnth () * 3, z0)) ++ ++/* ++** st3_s16_21: ++** st3h {z0\.h - z2\.h}, p0, \[x0, #21, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_s16_21, svint16x3_t, int16_t, ++ svst3_s16 (p0, x0 + svcnth () * 21, z0), ++ svst3 (p0, x0 + svcnth () * 21, z0)) ++ ++/* ++** st3_s16_24: ++** addvl (x[0-9]+), x0, #24 ++** st3h {z0\.h - z2\.h}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st3_s16_24, svint16x3_t, int16_t, ++ svst3_s16 (p0, x0 + svcnth () * 24, z0), ++ svst3 (p0, x0 + svcnth () * 24, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st3_s16_m1: ++** decb x0 ++** st3h {z0\.h - z2\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_s16_m1, svint16x3_t, int16_t, ++ svst3_s16 (p0, x0 - svcnth (), z0), ++ svst3 (p0, x0 - svcnth (), z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_s16_m2: ++** decb x0, all, mul #2 ++** st3h {z0\.h - z2\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_s16_m2, svint16x3_t, int16_t, ++ svst3_s16 (p0, x0 - svcnth () * 2, z0), ++ svst3 (p0, x0 - svcnth () * 2, z0)) ++ ++/* ++** st3_s16_m3: ++** st3h {z0\.h - z2\.h}, p0, \[x0, #-3, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_s16_m3, svint16x3_t, int16_t, ++ svst3_s16 (p0, x0 - svcnth () * 3, z0), ++ svst3 (p0, x0 - svcnth () * 3, z0)) ++ ++/* ++** st3_s16_m24: ++** st3h {z0\.h - z2\.h}, p0, \[x0, #-24, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_s16_m24, svint16x3_t, int16_t, ++ svst3_s16 (p0, x0 - svcnth () * 24, z0), ++ svst3 (p0, x0 - svcnth () * 24, z0)) ++ ++/* ++** st3_s16_m27: ++** addvl (x[0-9]+), x0, #-27 ++** st3h {z0\.h - z2\.h}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st3_s16_m27, svint16x3_t, int16_t, ++ svst3_s16 (p0, x0 - svcnth () * 27, z0), ++ svst3 (p0, x0 - svcnth () * 27, z0)) ++ ++/* ++** st3_vnum_s16_0: ++** st3h {z0\.h - z2\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_s16_0, svint16x3_t, int16_t, ++ svst3_vnum_s16 (p0, x0, 0, z0), ++ svst3_vnum (p0, x0, 0, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_vnum_s16_1: ++** incb x0 ++** st3h {z0\.h - z2\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_s16_1, svint16x3_t, int16_t, ++ svst3_vnum_s16 (p0, x0, 1, z0), ++ svst3_vnum (p0, x0, 1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_vnum_s16_2: ++** incb x0, all, mul #2 ++** st3h {z0\.h - z2\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_s16_2, svint16x3_t, int16_t, ++ svst3_vnum_s16 (p0, x0, 2, z0), ++ svst3_vnum (p0, x0, 2, z0)) ++ ++/* ++** st3_vnum_s16_3: ++** st3h {z0\.h - z2\.h}, p0, \[x0, #3, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_vnum_s16_3, svint16x3_t, int16_t, ++ svst3_vnum_s16 (p0, x0, 3, z0), ++ svst3_vnum (p0, x0, 3, z0)) ++ ++/* ++** st3_vnum_s16_21: ++** st3h {z0\.h - z2\.h}, p0, \[x0, #21, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_vnum_s16_21, svint16x3_t, int16_t, ++ svst3_vnum_s16 (p0, x0, 21, z0), ++ svst3_vnum (p0, x0, 21, z0)) ++ ++/* ++** st3_vnum_s16_24: ++** addvl (x[0-9]+), x0, #24 ++** st3h {z0\.h - z2\.h}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st3_vnum_s16_24, svint16x3_t, int16_t, ++ svst3_vnum_s16 (p0, x0, 24, z0), ++ svst3_vnum (p0, x0, 24, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_vnum_s16_m1: ++** decb x0 ++** st3h {z0\.h - z2\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_s16_m1, svint16x3_t, int16_t, ++ svst3_vnum_s16 (p0, x0, -1, z0), ++ svst3_vnum (p0, x0, -1, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st3_vnum_s16_m2: ++** decb x0, all, mul #2 ++** st3h {z0\.h - z2\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_s16_m2, svint16x3_t, int16_t, ++ svst3_vnum_s16 (p0, x0, -2, z0), ++ svst3_vnum (p0, x0, -2, z0)) ++ ++/* ++** st3_vnum_s16_m3: ++** st3h {z0\.h - z2\.h}, p0, \[x0, #-3, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_vnum_s16_m3, svint16x3_t, int16_t, ++ svst3_vnum_s16 (p0, x0, -3, z0), ++ svst3_vnum (p0, x0, -3, z0)) ++ ++/* ++** st3_vnum_s16_m24: ++** st3h {z0\.h - z2\.h}, p0, \[x0, #-24, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_vnum_s16_m24, svint16x3_t, int16_t, ++ svst3_vnum_s16 (p0, x0, -24, z0), ++ svst3_vnum (p0, x0, -24, z0)) ++ ++/* ++** st3_vnum_s16_m27: ++** addvl (x[0-9]+), x0, #-27 ++** st3h {z0\.h - z2\.h}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st3_vnum_s16_m27, svint16x3_t, int16_t, ++ svst3_vnum_s16 (p0, x0, -27, z0), ++ svst3_vnum (p0, x0, -27, z0)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** st3_vnum_s16_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** st3h {z0\.h - z2\.h}, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (st3_vnum_s16_x1, svint16x3_t, int16_t, ++ svst3_vnum_s16 (p0, x0, x1, z0), ++ svst3_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_s32.c +new file mode 100644 +index 000000000..2442d9b28 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_s32.c +@@ -0,0 +1,242 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st3_s32_base: ++** st3w {z0\.s - z2\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_s32_base, svint32x3_t, int32_t, ++ svst3_s32 (p0, x0, z0), ++ svst3 (p0, x0, z0)) ++ ++/* ++** st3_s32_index: ++** st3w {z0\.s - z2\.s}, p0, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_STORE (st3_s32_index, svint32x3_t, int32_t, ++ svst3_s32 (p0, x0 + x1, z0), ++ svst3 (p0, x0 + x1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_s32_1: ++** incb x0 ++** st3w {z0\.s - z2\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_s32_1, svint32x3_t, int32_t, ++ svst3_s32 (p0, x0 + svcntw (), z0), ++ svst3 (p0, x0 + svcntw (), z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_s32_2: ++** incb x0, all, mul #2 ++** st3w {z0\.s - z2\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_s32_2, svint32x3_t, int32_t, ++ svst3_s32 (p0, x0 + svcntw () * 2, z0), ++ svst3 (p0, x0 + svcntw () * 2, z0)) ++ ++/* ++** st3_s32_3: ++** st3w {z0\.s - z2\.s}, p0, \[x0, #3, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_s32_3, svint32x3_t, int32_t, ++ svst3_s32 (p0, x0 + svcntw () * 3, z0), ++ svst3 (p0, x0 + svcntw () * 3, z0)) ++ ++/* ++** st3_s32_21: ++** st3w {z0\.s - z2\.s}, p0, \[x0, #21, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_s32_21, svint32x3_t, int32_t, ++ svst3_s32 (p0, x0 + svcntw () * 21, z0), ++ svst3 (p0, x0 + svcntw () * 21, z0)) ++ ++/* ++** st3_s32_24: ++** addvl (x[0-9]+), x0, #24 ++** st3w {z0\.s - z2\.s}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st3_s32_24, svint32x3_t, int32_t, ++ svst3_s32 (p0, x0 + svcntw () * 24, z0), ++ svst3 (p0, x0 + svcntw () * 24, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st3_s32_m1: ++** decb x0 ++** st3w {z0\.s - z2\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_s32_m1, svint32x3_t, int32_t, ++ svst3_s32 (p0, x0 - svcntw (), z0), ++ svst3 (p0, x0 - svcntw (), z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_s32_m2: ++** decb x0, all, mul #2 ++** st3w {z0\.s - z2\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_s32_m2, svint32x3_t, int32_t, ++ svst3_s32 (p0, x0 - svcntw () * 2, z0), ++ svst3 (p0, x0 - svcntw () * 2, z0)) ++ ++/* ++** st3_s32_m3: ++** st3w {z0\.s - z2\.s}, p0, \[x0, #-3, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_s32_m3, svint32x3_t, int32_t, ++ svst3_s32 (p0, x0 - svcntw () * 3, z0), ++ svst3 (p0, x0 - svcntw () * 3, z0)) ++ ++/* ++** st3_s32_m24: ++** st3w {z0\.s - z2\.s}, p0, \[x0, #-24, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_s32_m24, svint32x3_t, int32_t, ++ svst3_s32 (p0, x0 - svcntw () * 24, z0), ++ svst3 (p0, x0 - svcntw () * 24, z0)) ++ ++/* ++** st3_s32_m27: ++** addvl (x[0-9]+), x0, #-27 ++** st3w {z0\.s - z2\.s}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st3_s32_m27, svint32x3_t, int32_t, ++ svst3_s32 (p0, x0 - svcntw () * 27, z0), ++ svst3 (p0, x0 - svcntw () * 27, z0)) ++ ++/* ++** st3_vnum_s32_0: ++** st3w {z0\.s - z2\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_s32_0, svint32x3_t, int32_t, ++ svst3_vnum_s32 (p0, x0, 0, z0), ++ svst3_vnum (p0, x0, 0, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_vnum_s32_1: ++** incb x0 ++** st3w {z0\.s - z2\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_s32_1, svint32x3_t, int32_t, ++ svst3_vnum_s32 (p0, x0, 1, z0), ++ svst3_vnum (p0, x0, 1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_vnum_s32_2: ++** incb x0, all, mul #2 ++** st3w {z0\.s - z2\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_s32_2, svint32x3_t, int32_t, ++ svst3_vnum_s32 (p0, x0, 2, z0), ++ svst3_vnum (p0, x0, 2, z0)) ++ ++/* ++** st3_vnum_s32_3: ++** st3w {z0\.s - z2\.s}, p0, \[x0, #3, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_vnum_s32_3, svint32x3_t, int32_t, ++ svst3_vnum_s32 (p0, x0, 3, z0), ++ svst3_vnum (p0, x0, 3, z0)) ++ ++/* ++** st3_vnum_s32_21: ++** st3w {z0\.s - z2\.s}, p0, \[x0, #21, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_vnum_s32_21, svint32x3_t, int32_t, ++ svst3_vnum_s32 (p0, x0, 21, z0), ++ svst3_vnum (p0, x0, 21, z0)) ++ ++/* ++** st3_vnum_s32_24: ++** addvl (x[0-9]+), x0, #24 ++** st3w {z0\.s - z2\.s}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st3_vnum_s32_24, svint32x3_t, int32_t, ++ svst3_vnum_s32 (p0, x0, 24, z0), ++ svst3_vnum (p0, x0, 24, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_vnum_s32_m1: ++** decb x0 ++** st3w {z0\.s - z2\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_s32_m1, svint32x3_t, int32_t, ++ svst3_vnum_s32 (p0, x0, -1, z0), ++ svst3_vnum (p0, x0, -1, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st3_vnum_s32_m2: ++** decb x0, all, mul #2 ++** st3w {z0\.s - z2\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_s32_m2, svint32x3_t, int32_t, ++ svst3_vnum_s32 (p0, x0, -2, z0), ++ svst3_vnum (p0, x0, -2, z0)) ++ ++/* ++** st3_vnum_s32_m3: ++** st3w {z0\.s - z2\.s}, p0, \[x0, #-3, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_vnum_s32_m3, svint32x3_t, int32_t, ++ svst3_vnum_s32 (p0, x0, -3, z0), ++ svst3_vnum (p0, x0, -3, z0)) ++ ++/* ++** st3_vnum_s32_m24: ++** st3w {z0\.s - z2\.s}, p0, \[x0, #-24, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_vnum_s32_m24, svint32x3_t, int32_t, ++ svst3_vnum_s32 (p0, x0, -24, z0), ++ svst3_vnum (p0, x0, -24, z0)) ++ ++/* ++** st3_vnum_s32_m27: ++** addvl (x[0-9]+), x0, #-27 ++** st3w {z0\.s - z2\.s}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st3_vnum_s32_m27, svint32x3_t, int32_t, ++ svst3_vnum_s32 (p0, x0, -27, z0), ++ svst3_vnum (p0, x0, -27, z0)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** st3_vnum_s32_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** st3w {z0\.s - z2\.s}, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (st3_vnum_s32_x1, svint32x3_t, int32_t, ++ svst3_vnum_s32 (p0, x0, x1, z0), ++ svst3_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_s64.c +new file mode 100644 +index 000000000..eca6a7cea +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_s64.c +@@ -0,0 +1,242 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st3_s64_base: ++** st3d {z0\.d - z2\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_s64_base, svint64x3_t, int64_t, ++ svst3_s64 (p0, x0, z0), ++ svst3 (p0, x0, z0)) ++ ++/* ++** st3_s64_index: ++** st3d {z0\.d - z2\.d}, p0, \[x0, x1, lsl 3\] ++** ret ++*/ ++TEST_STORE (st3_s64_index, svint64x3_t, int64_t, ++ svst3_s64 (p0, x0 + x1, z0), ++ svst3 (p0, x0 + x1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_s64_1: ++** incb x0 ++** st3d {z0\.d - z2\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_s64_1, svint64x3_t, int64_t, ++ svst3_s64 (p0, x0 + svcntd (), z0), ++ svst3 (p0, x0 + svcntd (), z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_s64_2: ++** incb x0, all, mul #2 ++** st3d {z0\.d - z2\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_s64_2, svint64x3_t, int64_t, ++ svst3_s64 (p0, x0 + svcntd () * 2, z0), ++ svst3 (p0, x0 + svcntd () * 2, z0)) ++ ++/* ++** st3_s64_3: ++** st3d {z0\.d - z2\.d}, p0, \[x0, #3, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_s64_3, svint64x3_t, int64_t, ++ svst3_s64 (p0, x0 + svcntd () * 3, z0), ++ svst3 (p0, x0 + svcntd () * 3, z0)) ++ ++/* ++** st3_s64_21: ++** st3d {z0\.d - z2\.d}, p0, \[x0, #21, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_s64_21, svint64x3_t, int64_t, ++ svst3_s64 (p0, x0 + svcntd () * 21, z0), ++ svst3 (p0, x0 + svcntd () * 21, z0)) ++ ++/* ++** st3_s64_24: ++** addvl (x[0-9]+), x0, #24 ++** st3d {z0\.d - z2\.d}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st3_s64_24, svint64x3_t, int64_t, ++ svst3_s64 (p0, x0 + svcntd () * 24, z0), ++ svst3 (p0, x0 + svcntd () * 24, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st3_s64_m1: ++** decb x0 ++** st3d {z0\.d - z2\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_s64_m1, svint64x3_t, int64_t, ++ svst3_s64 (p0, x0 - svcntd (), z0), ++ svst3 (p0, x0 - svcntd (), z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_s64_m2: ++** decb x0, all, mul #2 ++** st3d {z0\.d - z2\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_s64_m2, svint64x3_t, int64_t, ++ svst3_s64 (p0, x0 - svcntd () * 2, z0), ++ svst3 (p0, x0 - svcntd () * 2, z0)) ++ ++/* ++** st3_s64_m3: ++** st3d {z0\.d - z2\.d}, p0, \[x0, #-3, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_s64_m3, svint64x3_t, int64_t, ++ svst3_s64 (p0, x0 - svcntd () * 3, z0), ++ svst3 (p0, x0 - svcntd () * 3, z0)) ++ ++/* ++** st3_s64_m24: ++** st3d {z0\.d - z2\.d}, p0, \[x0, #-24, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_s64_m24, svint64x3_t, int64_t, ++ svst3_s64 (p0, x0 - svcntd () * 24, z0), ++ svst3 (p0, x0 - svcntd () * 24, z0)) ++ ++/* ++** st3_s64_m27: ++** addvl (x[0-9]+), x0, #-27 ++** st3d {z0\.d - z2\.d}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st3_s64_m27, svint64x3_t, int64_t, ++ svst3_s64 (p0, x0 - svcntd () * 27, z0), ++ svst3 (p0, x0 - svcntd () * 27, z0)) ++ ++/* ++** st3_vnum_s64_0: ++** st3d {z0\.d - z2\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_s64_0, svint64x3_t, int64_t, ++ svst3_vnum_s64 (p0, x0, 0, z0), ++ svst3_vnum (p0, x0, 0, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_vnum_s64_1: ++** incb x0 ++** st3d {z0\.d - z2\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_s64_1, svint64x3_t, int64_t, ++ svst3_vnum_s64 (p0, x0, 1, z0), ++ svst3_vnum (p0, x0, 1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_vnum_s64_2: ++** incb x0, all, mul #2 ++** st3d {z0\.d - z2\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_s64_2, svint64x3_t, int64_t, ++ svst3_vnum_s64 (p0, x0, 2, z0), ++ svst3_vnum (p0, x0, 2, z0)) ++ ++/* ++** st3_vnum_s64_3: ++** st3d {z0\.d - z2\.d}, p0, \[x0, #3, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_vnum_s64_3, svint64x3_t, int64_t, ++ svst3_vnum_s64 (p0, x0, 3, z0), ++ svst3_vnum (p0, x0, 3, z0)) ++ ++/* ++** st3_vnum_s64_21: ++** st3d {z0\.d - z2\.d}, p0, \[x0, #21, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_vnum_s64_21, svint64x3_t, int64_t, ++ svst3_vnum_s64 (p0, x0, 21, z0), ++ svst3_vnum (p0, x0, 21, z0)) ++ ++/* ++** st3_vnum_s64_24: ++** addvl (x[0-9]+), x0, #24 ++** st3d {z0\.d - z2\.d}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st3_vnum_s64_24, svint64x3_t, int64_t, ++ svst3_vnum_s64 (p0, x0, 24, z0), ++ svst3_vnum (p0, x0, 24, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_vnum_s64_m1: ++** decb x0 ++** st3d {z0\.d - z2\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_s64_m1, svint64x3_t, int64_t, ++ svst3_vnum_s64 (p0, x0, -1, z0), ++ svst3_vnum (p0, x0, -1, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st3_vnum_s64_m2: ++** decb x0, all, mul #2 ++** st3d {z0\.d - z2\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_s64_m2, svint64x3_t, int64_t, ++ svst3_vnum_s64 (p0, x0, -2, z0), ++ svst3_vnum (p0, x0, -2, z0)) ++ ++/* ++** st3_vnum_s64_m3: ++** st3d {z0\.d - z2\.d}, p0, \[x0, #-3, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_vnum_s64_m3, svint64x3_t, int64_t, ++ svst3_vnum_s64 (p0, x0, -3, z0), ++ svst3_vnum (p0, x0, -3, z0)) ++ ++/* ++** st3_vnum_s64_m24: ++** st3d {z0\.d - z2\.d}, p0, \[x0, #-24, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_vnum_s64_m24, svint64x3_t, int64_t, ++ svst3_vnum_s64 (p0, x0, -24, z0), ++ svst3_vnum (p0, x0, -24, z0)) ++ ++/* ++** st3_vnum_s64_m27: ++** addvl (x[0-9]+), x0, #-27 ++** st3d {z0\.d - z2\.d}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st3_vnum_s64_m27, svint64x3_t, int64_t, ++ svst3_vnum_s64 (p0, x0, -27, z0), ++ svst3_vnum (p0, x0, -27, z0)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** st3_vnum_s64_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** st3d {z0\.d - z2\.d}, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (st3_vnum_s64_x1, svint64x3_t, int64_t, ++ svst3_vnum_s64 (p0, x0, x1, z0), ++ svst3_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_s8.c +new file mode 100644 +index 000000000..a54ff4b74 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_s8.c +@@ -0,0 +1,246 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st3_s8_base: ++** st3b {z0\.b - z2\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_s8_base, svint8x3_t, int8_t, ++ svst3_s8 (p0, x0, z0), ++ svst3 (p0, x0, z0)) ++ ++/* ++** st3_s8_index: ++** st3b {z0\.b - z2\.b}, p0, \[x0, x1\] ++** ret ++*/ ++TEST_STORE (st3_s8_index, svint8x3_t, int8_t, ++ svst3_s8 (p0, x0 + x1, z0), ++ svst3 (p0, x0 + x1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_s8_1: ++** incb x0 ++** st3b {z0\.b - z2\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_s8_1, svint8x3_t, int8_t, ++ svst3_s8 (p0, x0 + svcntb (), z0), ++ svst3 (p0, x0 + svcntb (), z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_s8_2: ++** incb x0, all, mul #2 ++** st3b {z0\.b - z2\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_s8_2, svint8x3_t, int8_t, ++ svst3_s8 (p0, x0 + svcntb () * 2, z0), ++ svst3 (p0, x0 + svcntb () * 2, z0)) ++ ++/* ++** st3_s8_3: ++** st3b {z0\.b - z2\.b}, p0, \[x0, #3, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_s8_3, svint8x3_t, int8_t, ++ svst3_s8 (p0, x0 + svcntb () * 3, z0), ++ svst3 (p0, x0 + svcntb () * 3, z0)) ++ ++/* ++** st3_s8_21: ++** st3b {z0\.b - z2\.b}, p0, \[x0, #21, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_s8_21, svint8x3_t, int8_t, ++ svst3_s8 (p0, x0 + svcntb () * 21, z0), ++ svst3 (p0, x0 + svcntb () * 21, z0)) ++ ++/* ++** st3_s8_24: ++** addvl (x[0-9]+), x0, #24 ++** st3b {z0\.b - z2\.b}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st3_s8_24, svint8x3_t, int8_t, ++ svst3_s8 (p0, x0 + svcntb () * 24, z0), ++ svst3 (p0, x0 + svcntb () * 24, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_s8_m1: ++** decb x0 ++** st3b {z0\.b - z2\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_s8_m1, svint8x3_t, int8_t, ++ svst3_s8 (p0, x0 - svcntb (), z0), ++ svst3 (p0, x0 - svcntb (), z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st3_s8_m2: ++** decb x0, all, mul #2 ++** st3b {z0\.b - z2\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_s8_m2, svint8x3_t, int8_t, ++ svst3_s8 (p0, x0 - svcntb () * 2, z0), ++ svst3 (p0, x0 - svcntb () * 2, z0)) ++ ++/* ++** st3_s8_m3: ++** st3b {z0\.b - z2\.b}, p0, \[x0, #-3, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_s8_m3, svint8x3_t, int8_t, ++ svst3_s8 (p0, x0 - svcntb () * 3, z0), ++ svst3 (p0, x0 - svcntb () * 3, z0)) ++ ++/* ++** st3_s8_m24: ++** st3b {z0\.b - z2\.b}, p0, \[x0, #-24, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_s8_m24, svint8x3_t, int8_t, ++ svst3_s8 (p0, x0 - svcntb () * 24, z0), ++ svst3 (p0, x0 - svcntb () * 24, z0)) ++ ++/* ++** st3_s8_m27: ++** addvl (x[0-9]+), x0, #-27 ++** st3b {z0\.b - z2\.b}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st3_s8_m27, svint8x3_t, int8_t, ++ svst3_s8 (p0, x0 - svcntb () * 27, z0), ++ svst3 (p0, x0 - svcntb () * 27, z0)) ++ ++/* ++** st3_vnum_s8_0: ++** st3b {z0\.b - z2\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_s8_0, svint8x3_t, int8_t, ++ svst3_vnum_s8 (p0, x0, 0, z0), ++ svst3_vnum (p0, x0, 0, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_vnum_s8_1: ++** incb x0 ++** st3b {z0\.b - z2\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_s8_1, svint8x3_t, int8_t, ++ svst3_vnum_s8 (p0, x0, 1, z0), ++ svst3_vnum (p0, x0, 1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_vnum_s8_2: ++** incb x0, all, mul #2 ++** st3b {z0\.b - z2\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_s8_2, svint8x3_t, int8_t, ++ svst3_vnum_s8 (p0, x0, 2, z0), ++ svst3_vnum (p0, x0, 2, z0)) ++ ++/* ++** st3_vnum_s8_3: ++** st3b {z0\.b - z2\.b}, p0, \[x0, #3, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_vnum_s8_3, svint8x3_t, int8_t, ++ svst3_vnum_s8 (p0, x0, 3, z0), ++ svst3_vnum (p0, x0, 3, z0)) ++ ++/* ++** st3_vnum_s8_21: ++** st3b {z0\.b - z2\.b}, p0, \[x0, #21, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_vnum_s8_21, svint8x3_t, int8_t, ++ svst3_vnum_s8 (p0, x0, 21, z0), ++ svst3_vnum (p0, x0, 21, z0)) ++ ++/* ++** st3_vnum_s8_24: ++** addvl (x[0-9]+), x0, #24 ++** st3b {z0\.b - z2\.b}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st3_vnum_s8_24, svint8x3_t, int8_t, ++ svst3_vnum_s8 (p0, x0, 24, z0), ++ svst3_vnum (p0, x0, 24, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_vnum_s8_m1: ++** decb x0 ++** st3b {z0\.b - z2\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_s8_m1, svint8x3_t, int8_t, ++ svst3_vnum_s8 (p0, x0, -1, z0), ++ svst3_vnum (p0, x0, -1, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st3_vnum_s8_m2: ++** decb x0, all, mul #2 ++** st3b {z0\.b - z2\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_s8_m2, svint8x3_t, int8_t, ++ svst3_vnum_s8 (p0, x0, -2, z0), ++ svst3_vnum (p0, x0, -2, z0)) ++ ++/* ++** st3_vnum_s8_m3: ++** st3b {z0\.b - z2\.b}, p0, \[x0, #-3, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_vnum_s8_m3, svint8x3_t, int8_t, ++ svst3_vnum_s8 (p0, x0, -3, z0), ++ svst3_vnum (p0, x0, -3, z0)) ++ ++/* ++** st3_vnum_s8_m24: ++** st3b {z0\.b - z2\.b}, p0, \[x0, #-24, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_vnum_s8_m24, svint8x3_t, int8_t, ++ svst3_vnum_s8 (p0, x0, -24, z0), ++ svst3_vnum (p0, x0, -24, z0)) ++ ++/* ++** st3_vnum_s8_m27: ++** addvl (x[0-9]+), x0, #-27 ++** st3b {z0\.b - z2\.b}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st3_vnum_s8_m27, svint8x3_t, int8_t, ++ svst3_vnum_s8 (p0, x0, -27, z0), ++ svst3_vnum (p0, x0, -27, z0)) ++ ++/* ++** st3_vnum_s8_x1: ++** cntb (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** st3b {z0\.b - z2\.b}, p0, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** st3b {z0\.b - z2\.b}, p0, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_STORE (st3_vnum_s8_x1, svint8x3_t, int8_t, ++ svst3_vnum_s8 (p0, x0, x1, z0), ++ svst3_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_u16.c +new file mode 100644 +index 000000000..d4e8efca3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_u16.c +@@ -0,0 +1,242 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st3_u16_base: ++** st3h {z0\.h - z2\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_u16_base, svuint16x3_t, uint16_t, ++ svst3_u16 (p0, x0, z0), ++ svst3 (p0, x0, z0)) ++ ++/* ++** st3_u16_index: ++** st3h {z0\.h - z2\.h}, p0, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_STORE (st3_u16_index, svuint16x3_t, uint16_t, ++ svst3_u16 (p0, x0 + x1, z0), ++ svst3 (p0, x0 + x1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_u16_1: ++** incb x0 ++** st3h {z0\.h - z2\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_u16_1, svuint16x3_t, uint16_t, ++ svst3_u16 (p0, x0 + svcnth (), z0), ++ svst3 (p0, x0 + svcnth (), z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_u16_2: ++** incb x0, all, mul #2 ++** st3h {z0\.h - z2\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_u16_2, svuint16x3_t, uint16_t, ++ svst3_u16 (p0, x0 + svcnth () * 2, z0), ++ svst3 (p0, x0 + svcnth () * 2, z0)) ++ ++/* ++** st3_u16_3: ++** st3h {z0\.h - z2\.h}, p0, \[x0, #3, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_u16_3, svuint16x3_t, uint16_t, ++ svst3_u16 (p0, x0 + svcnth () * 3, z0), ++ svst3 (p0, x0 + svcnth () * 3, z0)) ++ ++/* ++** st3_u16_21: ++** st3h {z0\.h - z2\.h}, p0, \[x0, #21, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_u16_21, svuint16x3_t, uint16_t, ++ svst3_u16 (p0, x0 + svcnth () * 21, z0), ++ svst3 (p0, x0 + svcnth () * 21, z0)) ++ ++/* ++** st3_u16_24: ++** addvl (x[0-9]+), x0, #24 ++** st3h {z0\.h - z2\.h}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st3_u16_24, svuint16x3_t, uint16_t, ++ svst3_u16 (p0, x0 + svcnth () * 24, z0), ++ svst3 (p0, x0 + svcnth () * 24, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st3_u16_m1: ++** decb x0 ++** st3h {z0\.h - z2\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_u16_m1, svuint16x3_t, uint16_t, ++ svst3_u16 (p0, x0 - svcnth (), z0), ++ svst3 (p0, x0 - svcnth (), z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_u16_m2: ++** decb x0, all, mul #2 ++** st3h {z0\.h - z2\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_u16_m2, svuint16x3_t, uint16_t, ++ svst3_u16 (p0, x0 - svcnth () * 2, z0), ++ svst3 (p0, x0 - svcnth () * 2, z0)) ++ ++/* ++** st3_u16_m3: ++** st3h {z0\.h - z2\.h}, p0, \[x0, #-3, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_u16_m3, svuint16x3_t, uint16_t, ++ svst3_u16 (p0, x0 - svcnth () * 3, z0), ++ svst3 (p0, x0 - svcnth () * 3, z0)) ++ ++/* ++** st3_u16_m24: ++** st3h {z0\.h - z2\.h}, p0, \[x0, #-24, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_u16_m24, svuint16x3_t, uint16_t, ++ svst3_u16 (p0, x0 - svcnth () * 24, z0), ++ svst3 (p0, x0 - svcnth () * 24, z0)) ++ ++/* ++** st3_u16_m27: ++** addvl (x[0-9]+), x0, #-27 ++** st3h {z0\.h - z2\.h}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st3_u16_m27, svuint16x3_t, uint16_t, ++ svst3_u16 (p0, x0 - svcnth () * 27, z0), ++ svst3 (p0, x0 - svcnth () * 27, z0)) ++ ++/* ++** st3_vnum_u16_0: ++** st3h {z0\.h - z2\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_u16_0, svuint16x3_t, uint16_t, ++ svst3_vnum_u16 (p0, x0, 0, z0), ++ svst3_vnum (p0, x0, 0, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_vnum_u16_1: ++** incb x0 ++** st3h {z0\.h - z2\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_u16_1, svuint16x3_t, uint16_t, ++ svst3_vnum_u16 (p0, x0, 1, z0), ++ svst3_vnum (p0, x0, 1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_vnum_u16_2: ++** incb x0, all, mul #2 ++** st3h {z0\.h - z2\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_u16_2, svuint16x3_t, uint16_t, ++ svst3_vnum_u16 (p0, x0, 2, z0), ++ svst3_vnum (p0, x0, 2, z0)) ++ ++/* ++** st3_vnum_u16_3: ++** st3h {z0\.h - z2\.h}, p0, \[x0, #3, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_vnum_u16_3, svuint16x3_t, uint16_t, ++ svst3_vnum_u16 (p0, x0, 3, z0), ++ svst3_vnum (p0, x0, 3, z0)) ++ ++/* ++** st3_vnum_u16_21: ++** st3h {z0\.h - z2\.h}, p0, \[x0, #21, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_vnum_u16_21, svuint16x3_t, uint16_t, ++ svst3_vnum_u16 (p0, x0, 21, z0), ++ svst3_vnum (p0, x0, 21, z0)) ++ ++/* ++** st3_vnum_u16_24: ++** addvl (x[0-9]+), x0, #24 ++** st3h {z0\.h - z2\.h}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st3_vnum_u16_24, svuint16x3_t, uint16_t, ++ svst3_vnum_u16 (p0, x0, 24, z0), ++ svst3_vnum (p0, x0, 24, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_vnum_u16_m1: ++** decb x0 ++** st3h {z0\.h - z2\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_u16_m1, svuint16x3_t, uint16_t, ++ svst3_vnum_u16 (p0, x0, -1, z0), ++ svst3_vnum (p0, x0, -1, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st3_vnum_u16_m2: ++** decb x0, all, mul #2 ++** st3h {z0\.h - z2\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_u16_m2, svuint16x3_t, uint16_t, ++ svst3_vnum_u16 (p0, x0, -2, z0), ++ svst3_vnum (p0, x0, -2, z0)) ++ ++/* ++** st3_vnum_u16_m3: ++** st3h {z0\.h - z2\.h}, p0, \[x0, #-3, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_vnum_u16_m3, svuint16x3_t, uint16_t, ++ svst3_vnum_u16 (p0, x0, -3, z0), ++ svst3_vnum (p0, x0, -3, z0)) ++ ++/* ++** st3_vnum_u16_m24: ++** st3h {z0\.h - z2\.h}, p0, \[x0, #-24, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_vnum_u16_m24, svuint16x3_t, uint16_t, ++ svst3_vnum_u16 (p0, x0, -24, z0), ++ svst3_vnum (p0, x0, -24, z0)) ++ ++/* ++** st3_vnum_u16_m27: ++** addvl (x[0-9]+), x0, #-27 ++** st3h {z0\.h - z2\.h}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st3_vnum_u16_m27, svuint16x3_t, uint16_t, ++ svst3_vnum_u16 (p0, x0, -27, z0), ++ svst3_vnum (p0, x0, -27, z0)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** st3_vnum_u16_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** st3h {z0\.h - z2\.h}, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (st3_vnum_u16_x1, svuint16x3_t, uint16_t, ++ svst3_vnum_u16 (p0, x0, x1, z0), ++ svst3_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_u32.c +new file mode 100644 +index 000000000..8be3aa957 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_u32.c +@@ -0,0 +1,242 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st3_u32_base: ++** st3w {z0\.s - z2\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_u32_base, svuint32x3_t, uint32_t, ++ svst3_u32 (p0, x0, z0), ++ svst3 (p0, x0, z0)) ++ ++/* ++** st3_u32_index: ++** st3w {z0\.s - z2\.s}, p0, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_STORE (st3_u32_index, svuint32x3_t, uint32_t, ++ svst3_u32 (p0, x0 + x1, z0), ++ svst3 (p0, x0 + x1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_u32_1: ++** incb x0 ++** st3w {z0\.s - z2\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_u32_1, svuint32x3_t, uint32_t, ++ svst3_u32 (p0, x0 + svcntw (), z0), ++ svst3 (p0, x0 + svcntw (), z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_u32_2: ++** incb x0, all, mul #2 ++** st3w {z0\.s - z2\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_u32_2, svuint32x3_t, uint32_t, ++ svst3_u32 (p0, x0 + svcntw () * 2, z0), ++ svst3 (p0, x0 + svcntw () * 2, z0)) ++ ++/* ++** st3_u32_3: ++** st3w {z0\.s - z2\.s}, p0, \[x0, #3, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_u32_3, svuint32x3_t, uint32_t, ++ svst3_u32 (p0, x0 + svcntw () * 3, z0), ++ svst3 (p0, x0 + svcntw () * 3, z0)) ++ ++/* ++** st3_u32_21: ++** st3w {z0\.s - z2\.s}, p0, \[x0, #21, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_u32_21, svuint32x3_t, uint32_t, ++ svst3_u32 (p0, x0 + svcntw () * 21, z0), ++ svst3 (p0, x0 + svcntw () * 21, z0)) ++ ++/* ++** st3_u32_24: ++** addvl (x[0-9]+), x0, #24 ++** st3w {z0\.s - z2\.s}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st3_u32_24, svuint32x3_t, uint32_t, ++ svst3_u32 (p0, x0 + svcntw () * 24, z0), ++ svst3 (p0, x0 + svcntw () * 24, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st3_u32_m1: ++** decb x0 ++** st3w {z0\.s - z2\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_u32_m1, svuint32x3_t, uint32_t, ++ svst3_u32 (p0, x0 - svcntw (), z0), ++ svst3 (p0, x0 - svcntw (), z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_u32_m2: ++** decb x0, all, mul #2 ++** st3w {z0\.s - z2\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_u32_m2, svuint32x3_t, uint32_t, ++ svst3_u32 (p0, x0 - svcntw () * 2, z0), ++ svst3 (p0, x0 - svcntw () * 2, z0)) ++ ++/* ++** st3_u32_m3: ++** st3w {z0\.s - z2\.s}, p0, \[x0, #-3, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_u32_m3, svuint32x3_t, uint32_t, ++ svst3_u32 (p0, x0 - svcntw () * 3, z0), ++ svst3 (p0, x0 - svcntw () * 3, z0)) ++ ++/* ++** st3_u32_m24: ++** st3w {z0\.s - z2\.s}, p0, \[x0, #-24, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_u32_m24, svuint32x3_t, uint32_t, ++ svst3_u32 (p0, x0 - svcntw () * 24, z0), ++ svst3 (p0, x0 - svcntw () * 24, z0)) ++ ++/* ++** st3_u32_m27: ++** addvl (x[0-9]+), x0, #-27 ++** st3w {z0\.s - z2\.s}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st3_u32_m27, svuint32x3_t, uint32_t, ++ svst3_u32 (p0, x0 - svcntw () * 27, z0), ++ svst3 (p0, x0 - svcntw () * 27, z0)) ++ ++/* ++** st3_vnum_u32_0: ++** st3w {z0\.s - z2\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_u32_0, svuint32x3_t, uint32_t, ++ svst3_vnum_u32 (p0, x0, 0, z0), ++ svst3_vnum (p0, x0, 0, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_vnum_u32_1: ++** incb x0 ++** st3w {z0\.s - z2\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_u32_1, svuint32x3_t, uint32_t, ++ svst3_vnum_u32 (p0, x0, 1, z0), ++ svst3_vnum (p0, x0, 1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_vnum_u32_2: ++** incb x0, all, mul #2 ++** st3w {z0\.s - z2\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_u32_2, svuint32x3_t, uint32_t, ++ svst3_vnum_u32 (p0, x0, 2, z0), ++ svst3_vnum (p0, x0, 2, z0)) ++ ++/* ++** st3_vnum_u32_3: ++** st3w {z0\.s - z2\.s}, p0, \[x0, #3, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_vnum_u32_3, svuint32x3_t, uint32_t, ++ svst3_vnum_u32 (p0, x0, 3, z0), ++ svst3_vnum (p0, x0, 3, z0)) ++ ++/* ++** st3_vnum_u32_21: ++** st3w {z0\.s - z2\.s}, p0, \[x0, #21, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_vnum_u32_21, svuint32x3_t, uint32_t, ++ svst3_vnum_u32 (p0, x0, 21, z0), ++ svst3_vnum (p0, x0, 21, z0)) ++ ++/* ++** st3_vnum_u32_24: ++** addvl (x[0-9]+), x0, #24 ++** st3w {z0\.s - z2\.s}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st3_vnum_u32_24, svuint32x3_t, uint32_t, ++ svst3_vnum_u32 (p0, x0, 24, z0), ++ svst3_vnum (p0, x0, 24, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_vnum_u32_m1: ++** decb x0 ++** st3w {z0\.s - z2\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_u32_m1, svuint32x3_t, uint32_t, ++ svst3_vnum_u32 (p0, x0, -1, z0), ++ svst3_vnum (p0, x0, -1, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st3_vnum_u32_m2: ++** decb x0, all, mul #2 ++** st3w {z0\.s - z2\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_u32_m2, svuint32x3_t, uint32_t, ++ svst3_vnum_u32 (p0, x0, -2, z0), ++ svst3_vnum (p0, x0, -2, z0)) ++ ++/* ++** st3_vnum_u32_m3: ++** st3w {z0\.s - z2\.s}, p0, \[x0, #-3, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_vnum_u32_m3, svuint32x3_t, uint32_t, ++ svst3_vnum_u32 (p0, x0, -3, z0), ++ svst3_vnum (p0, x0, -3, z0)) ++ ++/* ++** st3_vnum_u32_m24: ++** st3w {z0\.s - z2\.s}, p0, \[x0, #-24, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_vnum_u32_m24, svuint32x3_t, uint32_t, ++ svst3_vnum_u32 (p0, x0, -24, z0), ++ svst3_vnum (p0, x0, -24, z0)) ++ ++/* ++** st3_vnum_u32_m27: ++** addvl (x[0-9]+), x0, #-27 ++** st3w {z0\.s - z2\.s}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st3_vnum_u32_m27, svuint32x3_t, uint32_t, ++ svst3_vnum_u32 (p0, x0, -27, z0), ++ svst3_vnum (p0, x0, -27, z0)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** st3_vnum_u32_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** st3w {z0\.s - z2\.s}, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (st3_vnum_u32_x1, svuint32x3_t, uint32_t, ++ svst3_vnum_u32 (p0, x0, x1, z0), ++ svst3_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_u64.c +new file mode 100644 +index 000000000..31cb304ea +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_u64.c +@@ -0,0 +1,242 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st3_u64_base: ++** st3d {z0\.d - z2\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_u64_base, svuint64x3_t, uint64_t, ++ svst3_u64 (p0, x0, z0), ++ svst3 (p0, x0, z0)) ++ ++/* ++** st3_u64_index: ++** st3d {z0\.d - z2\.d}, p0, \[x0, x1, lsl 3\] ++** ret ++*/ ++TEST_STORE (st3_u64_index, svuint64x3_t, uint64_t, ++ svst3_u64 (p0, x0 + x1, z0), ++ svst3 (p0, x0 + x1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_u64_1: ++** incb x0 ++** st3d {z0\.d - z2\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_u64_1, svuint64x3_t, uint64_t, ++ svst3_u64 (p0, x0 + svcntd (), z0), ++ svst3 (p0, x0 + svcntd (), z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_u64_2: ++** incb x0, all, mul #2 ++** st3d {z0\.d - z2\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_u64_2, svuint64x3_t, uint64_t, ++ svst3_u64 (p0, x0 + svcntd () * 2, z0), ++ svst3 (p0, x0 + svcntd () * 2, z0)) ++ ++/* ++** st3_u64_3: ++** st3d {z0\.d - z2\.d}, p0, \[x0, #3, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_u64_3, svuint64x3_t, uint64_t, ++ svst3_u64 (p0, x0 + svcntd () * 3, z0), ++ svst3 (p0, x0 + svcntd () * 3, z0)) ++ ++/* ++** st3_u64_21: ++** st3d {z0\.d - z2\.d}, p0, \[x0, #21, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_u64_21, svuint64x3_t, uint64_t, ++ svst3_u64 (p0, x0 + svcntd () * 21, z0), ++ svst3 (p0, x0 + svcntd () * 21, z0)) ++ ++/* ++** st3_u64_24: ++** addvl (x[0-9]+), x0, #24 ++** st3d {z0\.d - z2\.d}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st3_u64_24, svuint64x3_t, uint64_t, ++ svst3_u64 (p0, x0 + svcntd () * 24, z0), ++ svst3 (p0, x0 + svcntd () * 24, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st3_u64_m1: ++** decb x0 ++** st3d {z0\.d - z2\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_u64_m1, svuint64x3_t, uint64_t, ++ svst3_u64 (p0, x0 - svcntd (), z0), ++ svst3 (p0, x0 - svcntd (), z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_u64_m2: ++** decb x0, all, mul #2 ++** st3d {z0\.d - z2\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_u64_m2, svuint64x3_t, uint64_t, ++ svst3_u64 (p0, x0 - svcntd () * 2, z0), ++ svst3 (p0, x0 - svcntd () * 2, z0)) ++ ++/* ++** st3_u64_m3: ++** st3d {z0\.d - z2\.d}, p0, \[x0, #-3, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_u64_m3, svuint64x3_t, uint64_t, ++ svst3_u64 (p0, x0 - svcntd () * 3, z0), ++ svst3 (p0, x0 - svcntd () * 3, z0)) ++ ++/* ++** st3_u64_m24: ++** st3d {z0\.d - z2\.d}, p0, \[x0, #-24, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_u64_m24, svuint64x3_t, uint64_t, ++ svst3_u64 (p0, x0 - svcntd () * 24, z0), ++ svst3 (p0, x0 - svcntd () * 24, z0)) ++ ++/* ++** st3_u64_m27: ++** addvl (x[0-9]+), x0, #-27 ++** st3d {z0\.d - z2\.d}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st3_u64_m27, svuint64x3_t, uint64_t, ++ svst3_u64 (p0, x0 - svcntd () * 27, z0), ++ svst3 (p0, x0 - svcntd () * 27, z0)) ++ ++/* ++** st3_vnum_u64_0: ++** st3d {z0\.d - z2\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_u64_0, svuint64x3_t, uint64_t, ++ svst3_vnum_u64 (p0, x0, 0, z0), ++ svst3_vnum (p0, x0, 0, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_vnum_u64_1: ++** incb x0 ++** st3d {z0\.d - z2\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_u64_1, svuint64x3_t, uint64_t, ++ svst3_vnum_u64 (p0, x0, 1, z0), ++ svst3_vnum (p0, x0, 1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_vnum_u64_2: ++** incb x0, all, mul #2 ++** st3d {z0\.d - z2\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_u64_2, svuint64x3_t, uint64_t, ++ svst3_vnum_u64 (p0, x0, 2, z0), ++ svst3_vnum (p0, x0, 2, z0)) ++ ++/* ++** st3_vnum_u64_3: ++** st3d {z0\.d - z2\.d}, p0, \[x0, #3, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_vnum_u64_3, svuint64x3_t, uint64_t, ++ svst3_vnum_u64 (p0, x0, 3, z0), ++ svst3_vnum (p0, x0, 3, z0)) ++ ++/* ++** st3_vnum_u64_21: ++** st3d {z0\.d - z2\.d}, p0, \[x0, #21, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_vnum_u64_21, svuint64x3_t, uint64_t, ++ svst3_vnum_u64 (p0, x0, 21, z0), ++ svst3_vnum (p0, x0, 21, z0)) ++ ++/* ++** st3_vnum_u64_24: ++** addvl (x[0-9]+), x0, #24 ++** st3d {z0\.d - z2\.d}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st3_vnum_u64_24, svuint64x3_t, uint64_t, ++ svst3_vnum_u64 (p0, x0, 24, z0), ++ svst3_vnum (p0, x0, 24, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_vnum_u64_m1: ++** decb x0 ++** st3d {z0\.d - z2\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_u64_m1, svuint64x3_t, uint64_t, ++ svst3_vnum_u64 (p0, x0, -1, z0), ++ svst3_vnum (p0, x0, -1, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st3_vnum_u64_m2: ++** decb x0, all, mul #2 ++** st3d {z0\.d - z2\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_u64_m2, svuint64x3_t, uint64_t, ++ svst3_vnum_u64 (p0, x0, -2, z0), ++ svst3_vnum (p0, x0, -2, z0)) ++ ++/* ++** st3_vnum_u64_m3: ++** st3d {z0\.d - z2\.d}, p0, \[x0, #-3, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_vnum_u64_m3, svuint64x3_t, uint64_t, ++ svst3_vnum_u64 (p0, x0, -3, z0), ++ svst3_vnum (p0, x0, -3, z0)) ++ ++/* ++** st3_vnum_u64_m24: ++** st3d {z0\.d - z2\.d}, p0, \[x0, #-24, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_vnum_u64_m24, svuint64x3_t, uint64_t, ++ svst3_vnum_u64 (p0, x0, -24, z0), ++ svst3_vnum (p0, x0, -24, z0)) ++ ++/* ++** st3_vnum_u64_m27: ++** addvl (x[0-9]+), x0, #-27 ++** st3d {z0\.d - z2\.d}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st3_vnum_u64_m27, svuint64x3_t, uint64_t, ++ svst3_vnum_u64 (p0, x0, -27, z0), ++ svst3_vnum (p0, x0, -27, z0)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** st3_vnum_u64_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** st3d {z0\.d - z2\.d}, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (st3_vnum_u64_x1, svuint64x3_t, uint64_t, ++ svst3_vnum_u64 (p0, x0, x1, z0), ++ svst3_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_u8.c +new file mode 100644 +index 000000000..e2d5a19ad +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_u8.c +@@ -0,0 +1,246 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st3_u8_base: ++** st3b {z0\.b - z2\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_u8_base, svuint8x3_t, uint8_t, ++ svst3_u8 (p0, x0, z0), ++ svst3 (p0, x0, z0)) ++ ++/* ++** st3_u8_index: ++** st3b {z0\.b - z2\.b}, p0, \[x0, x1\] ++** ret ++*/ ++TEST_STORE (st3_u8_index, svuint8x3_t, uint8_t, ++ svst3_u8 (p0, x0 + x1, z0), ++ svst3 (p0, x0 + x1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_u8_1: ++** incb x0 ++** st3b {z0\.b - z2\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_u8_1, svuint8x3_t, uint8_t, ++ svst3_u8 (p0, x0 + svcntb (), z0), ++ svst3 (p0, x0 + svcntb (), z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_u8_2: ++** incb x0, all, mul #2 ++** st3b {z0\.b - z2\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_u8_2, svuint8x3_t, uint8_t, ++ svst3_u8 (p0, x0 + svcntb () * 2, z0), ++ svst3 (p0, x0 + svcntb () * 2, z0)) ++ ++/* ++** st3_u8_3: ++** st3b {z0\.b - z2\.b}, p0, \[x0, #3, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_u8_3, svuint8x3_t, uint8_t, ++ svst3_u8 (p0, x0 + svcntb () * 3, z0), ++ svst3 (p0, x0 + svcntb () * 3, z0)) ++ ++/* ++** st3_u8_21: ++** st3b {z0\.b - z2\.b}, p0, \[x0, #21, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_u8_21, svuint8x3_t, uint8_t, ++ svst3_u8 (p0, x0 + svcntb () * 21, z0), ++ svst3 (p0, x0 + svcntb () * 21, z0)) ++ ++/* ++** st3_u8_24: ++** addvl (x[0-9]+), x0, #24 ++** st3b {z0\.b - z2\.b}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st3_u8_24, svuint8x3_t, uint8_t, ++ svst3_u8 (p0, x0 + svcntb () * 24, z0), ++ svst3 (p0, x0 + svcntb () * 24, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st3_u8_m1: ++** decb x0 ++** st3b {z0\.b - z2\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_u8_m1, svuint8x3_t, uint8_t, ++ svst3_u8 (p0, x0 - svcntb (), z0), ++ svst3 (p0, x0 - svcntb (), z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_u8_m2: ++** decb x0, all, mul #2 ++** st3b {z0\.b - z2\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_u8_m2, svuint8x3_t, uint8_t, ++ svst3_u8 (p0, x0 - svcntb () * 2, z0), ++ svst3 (p0, x0 - svcntb () * 2, z0)) ++ ++/* ++** st3_u8_m3: ++** st3b {z0\.b - z2\.b}, p0, \[x0, #-3, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_u8_m3, svuint8x3_t, uint8_t, ++ svst3_u8 (p0, x0 - svcntb () * 3, z0), ++ svst3 (p0, x0 - svcntb () * 3, z0)) ++ ++/* ++** st3_u8_m24: ++** st3b {z0\.b - z2\.b}, p0, \[x0, #-24, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_u8_m24, svuint8x3_t, uint8_t, ++ svst3_u8 (p0, x0 - svcntb () * 24, z0), ++ svst3 (p0, x0 - svcntb () * 24, z0)) ++ ++/* ++** st3_u8_m27: ++** addvl (x[0-9]+), x0, #-27 ++** st3b {z0\.b - z2\.b}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st3_u8_m27, svuint8x3_t, uint8_t, ++ svst3_u8 (p0, x0 - svcntb () * 27, z0), ++ svst3 (p0, x0 - svcntb () * 27, z0)) ++ ++/* ++** st3_vnum_u8_0: ++** st3b {z0\.b - z2\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_u8_0, svuint8x3_t, uint8_t, ++ svst3_vnum_u8 (p0, x0, 0, z0), ++ svst3_vnum (p0, x0, 0, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_vnum_u8_1: ++** incb x0 ++** st3b {z0\.b - z2\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_u8_1, svuint8x3_t, uint8_t, ++ svst3_vnum_u8 (p0, x0, 1, z0), ++ svst3_vnum (p0, x0, 1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_vnum_u8_2: ++** incb x0, all, mul #2 ++** st3b {z0\.b - z2\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_u8_2, svuint8x3_t, uint8_t, ++ svst3_vnum_u8 (p0, x0, 2, z0), ++ svst3_vnum (p0, x0, 2, z0)) ++ ++/* ++** st3_vnum_u8_3: ++** st3b {z0\.b - z2\.b}, p0, \[x0, #3, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_vnum_u8_3, svuint8x3_t, uint8_t, ++ svst3_vnum_u8 (p0, x0, 3, z0), ++ svst3_vnum (p0, x0, 3, z0)) ++ ++/* ++** st3_vnum_u8_21: ++** st3b {z0\.b - z2\.b}, p0, \[x0, #21, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_vnum_u8_21, svuint8x3_t, uint8_t, ++ svst3_vnum_u8 (p0, x0, 21, z0), ++ svst3_vnum (p0, x0, 21, z0)) ++ ++/* ++** st3_vnum_u8_24: ++** addvl (x[0-9]+), x0, #24 ++** st3b {z0\.b - z2\.b}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st3_vnum_u8_24, svuint8x3_t, uint8_t, ++ svst3_vnum_u8 (p0, x0, 24, z0), ++ svst3_vnum (p0, x0, 24, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st3_vnum_u8_m1: ++** decb x0 ++** st3b {z0\.b - z2\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_u8_m1, svuint8x3_t, uint8_t, ++ svst3_vnum_u8 (p0, x0, -1, z0), ++ svst3_vnum (p0, x0, -1, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st3_vnum_u8_m2: ++** decb x0, all, mul #2 ++** st3b {z0\.b - z2\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st3_vnum_u8_m2, svuint8x3_t, uint8_t, ++ svst3_vnum_u8 (p0, x0, -2, z0), ++ svst3_vnum (p0, x0, -2, z0)) ++ ++/* ++** st3_vnum_u8_m3: ++** st3b {z0\.b - z2\.b}, p0, \[x0, #-3, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_vnum_u8_m3, svuint8x3_t, uint8_t, ++ svst3_vnum_u8 (p0, x0, -3, z0), ++ svst3_vnum (p0, x0, -3, z0)) ++ ++/* ++** st3_vnum_u8_m24: ++** st3b {z0\.b - z2\.b}, p0, \[x0, #-24, mul vl\] ++** ret ++*/ ++TEST_STORE (st3_vnum_u8_m24, svuint8x3_t, uint8_t, ++ svst3_vnum_u8 (p0, x0, -24, z0), ++ svst3_vnum (p0, x0, -24, z0)) ++ ++/* ++** st3_vnum_u8_m27: ++** addvl (x[0-9]+), x0, #-27 ++** st3b {z0\.b - z2\.b}, p0, \[\1\] ++** ret ++*/ ++TEST_STORE (st3_vnum_u8_m27, svuint8x3_t, uint8_t, ++ svst3_vnum_u8 (p0, x0, -27, z0), ++ svst3_vnum (p0, x0, -27, z0)) ++ ++/* ++** st3_vnum_u8_x1: ++** cntb (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** st3b {z0\.b - z2\.b}, p0, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** st3b {z0\.b - z2\.b}, p0, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_STORE (st3_vnum_u8_x1, svuint8x3_t, uint8_t, ++ svst3_vnum_u8 (p0, x0, x1, z0), ++ svst3_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_bf16.c +new file mode 100644 +index 000000000..b8d9f4afa +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_bf16.c +@@ -0,0 +1,286 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st4_bf16_base: ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_bf16_base, svbfloat16x4_t, bfloat16_t, ++ svst4_bf16 (p0, x0, z0), ++ svst4 (p0, x0, z0)) ++ ++/* ++** st4_bf16_index: ++** st4h {z0\.h - z3\.h}, p0, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_STORE (st4_bf16_index, svbfloat16x4_t, bfloat16_t, ++ svst4_bf16 (p0, x0 + x1, z0), ++ svst4 (p0, x0 + x1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_bf16_1: ++** incb x0 ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_bf16_1, svbfloat16x4_t, bfloat16_t, ++ svst4_bf16 (p0, x0 + svcnth (), z0), ++ svst4 (p0, x0 + svcnth (), z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_bf16_2: ++** incb x0, all, mul #2 ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_bf16_2, svbfloat16x4_t, bfloat16_t, ++ svst4_bf16 (p0, x0 + svcnth () * 2, z0), ++ svst4 (p0, x0 + svcnth () * 2, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st4_bf16_3: ++** incb x0, all, mul #3 ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_bf16_3, svbfloat16x4_t, bfloat16_t, ++ svst4_bf16 (p0, x0 + svcnth () * 3, z0), ++ svst4 (p0, x0 + svcnth () * 3, z0)) ++ ++/* ++** st4_bf16_4: ++** st4h {z0\.h - z3\.h}, p0, \[x0, #4, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_bf16_4, svbfloat16x4_t, bfloat16_t, ++ svst4_bf16 (p0, x0 + svcnth () * 4, z0), ++ svst4 (p0, x0 + svcnth () * 4, z0)) ++ ++/* ++** st4_bf16_28: ++** st4h {z0\.h - z3\.h}, p0, \[x0, #28, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_bf16_28, svbfloat16x4_t, bfloat16_t, ++ svst4_bf16 (p0, x0 + svcnth () * 28, z0), ++ svst4 (p0, x0 + svcnth () * 28, z0)) ++ ++/* ++** st4_bf16_32: ++** [^{]* ++** st4h {z0\.h - z3\.h}, p0, \[x[0-9]+\] ++** ret ++*/ ++TEST_STORE (st4_bf16_32, svbfloat16x4_t, bfloat16_t, ++ svst4_bf16 (p0, x0 + svcnth () * 32, z0), ++ svst4 (p0, x0 + svcnth () * 32, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_bf16_m1: ++** decb x0 ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_bf16_m1, svbfloat16x4_t, bfloat16_t, ++ svst4_bf16 (p0, x0 - svcnth (), z0), ++ svst4 (p0, x0 - svcnth (), z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_bf16_m2: ++** decb x0, all, mul #2 ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_bf16_m2, svbfloat16x4_t, bfloat16_t, ++ svst4_bf16 (p0, x0 - svcnth () * 2, z0), ++ svst4 (p0, x0 - svcnth () * 2, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_bf16_m3: ++** decb x0, all, mul #3 ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_bf16_m3, svbfloat16x4_t, bfloat16_t, ++ svst4_bf16 (p0, x0 - svcnth () * 3, z0), ++ svst4 (p0, x0 - svcnth () * 3, z0)) ++ ++/* ++** st4_bf16_m4: ++** st4h {z0\.h - z3\.h}, p0, \[x0, #-4, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_bf16_m4, svbfloat16x4_t, bfloat16_t, ++ svst4_bf16 (p0, x0 - svcnth () * 4, z0), ++ svst4 (p0, x0 - svcnth () * 4, z0)) ++ ++/* ++** st4_bf16_m32: ++** st4h {z0\.h - z3\.h}, p0, \[x0, #-32, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_bf16_m32, svbfloat16x4_t, bfloat16_t, ++ svst4_bf16 (p0, x0 - svcnth () * 32, z0), ++ svst4 (p0, x0 - svcnth () * 32, z0)) ++ ++/* ++** st4_bf16_m36: ++** [^{]* ++** st4h {z0\.h - z3\.h}, p0, \[x[0-9]+\] ++** ret ++*/ ++TEST_STORE (st4_bf16_m36, svbfloat16x4_t, bfloat16_t, ++ svst4_bf16 (p0, x0 - svcnth () * 36, z0), ++ svst4 (p0, x0 - svcnth () * 36, z0)) ++ ++/* ++** st4_vnum_bf16_0: ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_bf16_0, svbfloat16x4_t, bfloat16_t, ++ svst4_vnum_bf16 (p0, x0, 0, z0), ++ svst4_vnum (p0, x0, 0, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_bf16_1: ++** incb x0 ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_bf16_1, svbfloat16x4_t, bfloat16_t, ++ svst4_vnum_bf16 (p0, x0, 1, z0), ++ svst4_vnum (p0, x0, 1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_bf16_2: ++** incb x0, all, mul #2 ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_bf16_2, svbfloat16x4_t, bfloat16_t, ++ svst4_vnum_bf16 (p0, x0, 2, z0), ++ svst4_vnum (p0, x0, 2, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st4_vnum_bf16_3: ++** incb x0, all, mul #3 ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_bf16_3, svbfloat16x4_t, bfloat16_t, ++ svst4_vnum_bf16 (p0, x0, 3, z0), ++ svst4_vnum (p0, x0, 3, z0)) ++ ++/* ++** st4_vnum_bf16_4: ++** st4h {z0\.h - z3\.h}, p0, \[x0, #4, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_vnum_bf16_4, svbfloat16x4_t, bfloat16_t, ++ svst4_vnum_bf16 (p0, x0, 4, z0), ++ svst4_vnum (p0, x0, 4, z0)) ++ ++/* ++** st4_vnum_bf16_28: ++** st4h {z0\.h - z3\.h}, p0, \[x0, #28, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_vnum_bf16_28, svbfloat16x4_t, bfloat16_t, ++ svst4_vnum_bf16 (p0, x0, 28, z0), ++ svst4_vnum (p0, x0, 28, z0)) ++ ++/* ++** st4_vnum_bf16_32: ++** [^{]* ++** st4h {z0\.h - z3\.h}, p0, \[x[0-9]+\] ++** ret ++*/ ++TEST_STORE (st4_vnum_bf16_32, svbfloat16x4_t, bfloat16_t, ++ svst4_vnum_bf16 (p0, x0, 32, z0), ++ svst4_vnum (p0, x0, 32, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_bf16_m1: ++** decb x0 ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_bf16_m1, svbfloat16x4_t, bfloat16_t, ++ svst4_vnum_bf16 (p0, x0, -1, z0), ++ svst4_vnum (p0, x0, -1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_bf16_m2: ++** decb x0, all, mul #2 ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_bf16_m2, svbfloat16x4_t, bfloat16_t, ++ svst4_vnum_bf16 (p0, x0, -2, z0), ++ svst4_vnum (p0, x0, -2, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_bf16_m3: ++** decb x0, all, mul #3 ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_bf16_m3, svbfloat16x4_t, bfloat16_t, ++ svst4_vnum_bf16 (p0, x0, -3, z0), ++ svst4_vnum (p0, x0, -3, z0)) ++ ++/* ++** st4_vnum_bf16_m4: ++** st4h {z0\.h - z3\.h}, p0, \[x0, #-4, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_vnum_bf16_m4, svbfloat16x4_t, bfloat16_t, ++ svst4_vnum_bf16 (p0, x0, -4, z0), ++ svst4_vnum (p0, x0, -4, z0)) ++ ++/* ++** st4_vnum_bf16_m32: ++** st4h {z0\.h - z3\.h}, p0, \[x0, #-32, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_vnum_bf16_m32, svbfloat16x4_t, bfloat16_t, ++ svst4_vnum_bf16 (p0, x0, -32, z0), ++ svst4_vnum (p0, x0, -32, z0)) ++ ++/* ++** st4_vnum_bf16_m36: ++** [^{]* ++** st4h {z0\.h - z3\.h}, p0, \[x[0-9]+\] ++** ret ++*/ ++TEST_STORE (st4_vnum_bf16_m36, svbfloat16x4_t, bfloat16_t, ++ svst4_vnum_bf16 (p0, x0, -36, z0), ++ svst4_vnum (p0, x0, -36, z0)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** st4_vnum_bf16_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** st4h {z0\.h - z3\.h}, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (st4_vnum_bf16_x1, svbfloat16x4_t, bfloat16_t, ++ svst4_vnum_bf16 (p0, x0, x1, z0), ++ svst4_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_f16.c +new file mode 100644 +index 000000000..296bdb4a6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_f16.c +@@ -0,0 +1,286 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st4_f16_base: ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_f16_base, svfloat16x4_t, float16_t, ++ svst4_f16 (p0, x0, z0), ++ svst4 (p0, x0, z0)) ++ ++/* ++** st4_f16_index: ++** st4h {z0\.h - z3\.h}, p0, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_STORE (st4_f16_index, svfloat16x4_t, float16_t, ++ svst4_f16 (p0, x0 + x1, z0), ++ svst4 (p0, x0 + x1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_f16_1: ++** incb x0 ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_f16_1, svfloat16x4_t, float16_t, ++ svst4_f16 (p0, x0 + svcnth (), z0), ++ svst4 (p0, x0 + svcnth (), z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_f16_2: ++** incb x0, all, mul #2 ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_f16_2, svfloat16x4_t, float16_t, ++ svst4_f16 (p0, x0 + svcnth () * 2, z0), ++ svst4 (p0, x0 + svcnth () * 2, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_f16_3: ++** incb x0, all, mul #3 ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_f16_3, svfloat16x4_t, float16_t, ++ svst4_f16 (p0, x0 + svcnth () * 3, z0), ++ svst4 (p0, x0 + svcnth () * 3, z0)) ++ ++/* ++** st4_f16_4: ++** st4h {z0\.h - z3\.h}, p0, \[x0, #4, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_f16_4, svfloat16x4_t, float16_t, ++ svst4_f16 (p0, x0 + svcnth () * 4, z0), ++ svst4 (p0, x0 + svcnth () * 4, z0)) ++ ++/* ++** st4_f16_28: ++** st4h {z0\.h - z3\.h}, p0, \[x0, #28, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_f16_28, svfloat16x4_t, float16_t, ++ svst4_f16 (p0, x0 + svcnth () * 28, z0), ++ svst4 (p0, x0 + svcnth () * 28, z0)) ++ ++/* ++** st4_f16_32: ++** [^{]* ++** st4h {z0\.h - z3\.h}, p0, \[x[0-9]+\] ++** ret ++*/ ++TEST_STORE (st4_f16_32, svfloat16x4_t, float16_t, ++ svst4_f16 (p0, x0 + svcnth () * 32, z0), ++ svst4 (p0, x0 + svcnth () * 32, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_f16_m1: ++** decb x0 ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_f16_m1, svfloat16x4_t, float16_t, ++ svst4_f16 (p0, x0 - svcnth (), z0), ++ svst4 (p0, x0 - svcnth (), z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_f16_m2: ++** decb x0, all, mul #2 ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_f16_m2, svfloat16x4_t, float16_t, ++ svst4_f16 (p0, x0 - svcnth () * 2, z0), ++ svst4 (p0, x0 - svcnth () * 2, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st4_f16_m3: ++** decb x0, all, mul #3 ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_f16_m3, svfloat16x4_t, float16_t, ++ svst4_f16 (p0, x0 - svcnth () * 3, z0), ++ svst4 (p0, x0 - svcnth () * 3, z0)) ++ ++/* ++** st4_f16_m4: ++** st4h {z0\.h - z3\.h}, p0, \[x0, #-4, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_f16_m4, svfloat16x4_t, float16_t, ++ svst4_f16 (p0, x0 - svcnth () * 4, z0), ++ svst4 (p0, x0 - svcnth () * 4, z0)) ++ ++/* ++** st4_f16_m32: ++** st4h {z0\.h - z3\.h}, p0, \[x0, #-32, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_f16_m32, svfloat16x4_t, float16_t, ++ svst4_f16 (p0, x0 - svcnth () * 32, z0), ++ svst4 (p0, x0 - svcnth () * 32, z0)) ++ ++/* ++** st4_f16_m36: ++** [^{]* ++** st4h {z0\.h - z3\.h}, p0, \[x[0-9]+\] ++** ret ++*/ ++TEST_STORE (st4_f16_m36, svfloat16x4_t, float16_t, ++ svst4_f16 (p0, x0 - svcnth () * 36, z0), ++ svst4 (p0, x0 - svcnth () * 36, z0)) ++ ++/* ++** st4_vnum_f16_0: ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_f16_0, svfloat16x4_t, float16_t, ++ svst4_vnum_f16 (p0, x0, 0, z0), ++ svst4_vnum (p0, x0, 0, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_f16_1: ++** incb x0 ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_f16_1, svfloat16x4_t, float16_t, ++ svst4_vnum_f16 (p0, x0, 1, z0), ++ svst4_vnum (p0, x0, 1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_f16_2: ++** incb x0, all, mul #2 ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_f16_2, svfloat16x4_t, float16_t, ++ svst4_vnum_f16 (p0, x0, 2, z0), ++ svst4_vnum (p0, x0, 2, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_f16_3: ++** incb x0, all, mul #3 ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_f16_3, svfloat16x4_t, float16_t, ++ svst4_vnum_f16 (p0, x0, 3, z0), ++ svst4_vnum (p0, x0, 3, z0)) ++ ++/* ++** st4_vnum_f16_4: ++** st4h {z0\.h - z3\.h}, p0, \[x0, #4, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_vnum_f16_4, svfloat16x4_t, float16_t, ++ svst4_vnum_f16 (p0, x0, 4, z0), ++ svst4_vnum (p0, x0, 4, z0)) ++ ++/* ++** st4_vnum_f16_28: ++** st4h {z0\.h - z3\.h}, p0, \[x0, #28, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_vnum_f16_28, svfloat16x4_t, float16_t, ++ svst4_vnum_f16 (p0, x0, 28, z0), ++ svst4_vnum (p0, x0, 28, z0)) ++ ++/* ++** st4_vnum_f16_32: ++** [^{]* ++** st4h {z0\.h - z3\.h}, p0, \[x[0-9]+\] ++** ret ++*/ ++TEST_STORE (st4_vnum_f16_32, svfloat16x4_t, float16_t, ++ svst4_vnum_f16 (p0, x0, 32, z0), ++ svst4_vnum (p0, x0, 32, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_f16_m1: ++** decb x0 ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_f16_m1, svfloat16x4_t, float16_t, ++ svst4_vnum_f16 (p0, x0, -1, z0), ++ svst4_vnum (p0, x0, -1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_f16_m2: ++** decb x0, all, mul #2 ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_f16_m2, svfloat16x4_t, float16_t, ++ svst4_vnum_f16 (p0, x0, -2, z0), ++ svst4_vnum (p0, x0, -2, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st4_vnum_f16_m3: ++** decb x0, all, mul #3 ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_f16_m3, svfloat16x4_t, float16_t, ++ svst4_vnum_f16 (p0, x0, -3, z0), ++ svst4_vnum (p0, x0, -3, z0)) ++ ++/* ++** st4_vnum_f16_m4: ++** st4h {z0\.h - z3\.h}, p0, \[x0, #-4, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_vnum_f16_m4, svfloat16x4_t, float16_t, ++ svst4_vnum_f16 (p0, x0, -4, z0), ++ svst4_vnum (p0, x0, -4, z0)) ++ ++/* ++** st4_vnum_f16_m32: ++** st4h {z0\.h - z3\.h}, p0, \[x0, #-32, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_vnum_f16_m32, svfloat16x4_t, float16_t, ++ svst4_vnum_f16 (p0, x0, -32, z0), ++ svst4_vnum (p0, x0, -32, z0)) ++ ++/* ++** st4_vnum_f16_m36: ++** [^{]* ++** st4h {z0\.h - z3\.h}, p0, \[x[0-9]+\] ++** ret ++*/ ++TEST_STORE (st4_vnum_f16_m36, svfloat16x4_t, float16_t, ++ svst4_vnum_f16 (p0, x0, -36, z0), ++ svst4_vnum (p0, x0, -36, z0)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** st4_vnum_f16_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** st4h {z0\.h - z3\.h}, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (st4_vnum_f16_x1, svfloat16x4_t, float16_t, ++ svst4_vnum_f16 (p0, x0, x1, z0), ++ svst4_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_f32.c +new file mode 100644 +index 000000000..313ed7bc0 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_f32.c +@@ -0,0 +1,286 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st4_f32_base: ++** st4w {z0\.s - z3\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_f32_base, svfloat32x4_t, float32_t, ++ svst4_f32 (p0, x0, z0), ++ svst4 (p0, x0, z0)) ++ ++/* ++** st4_f32_index: ++** st4w {z0\.s - z3\.s}, p0, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_STORE (st4_f32_index, svfloat32x4_t, float32_t, ++ svst4_f32 (p0, x0 + x1, z0), ++ svst4 (p0, x0 + x1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_f32_1: ++** incb x0 ++** st4w {z0\.s - z3\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_f32_1, svfloat32x4_t, float32_t, ++ svst4_f32 (p0, x0 + svcntw (), z0), ++ svst4 (p0, x0 + svcntw (), z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_f32_2: ++** incb x0, all, mul #2 ++** st4w {z0\.s - z3\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_f32_2, svfloat32x4_t, float32_t, ++ svst4_f32 (p0, x0 + svcntw () * 2, z0), ++ svst4 (p0, x0 + svcntw () * 2, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st4_f32_3: ++** incb x0, all, mul #3 ++** st4w {z0\.s - z3\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_f32_3, svfloat32x4_t, float32_t, ++ svst4_f32 (p0, x0 + svcntw () * 3, z0), ++ svst4 (p0, x0 + svcntw () * 3, z0)) ++ ++/* ++** st4_f32_4: ++** st4w {z0\.s - z3\.s}, p0, \[x0, #4, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_f32_4, svfloat32x4_t, float32_t, ++ svst4_f32 (p0, x0 + svcntw () * 4, z0), ++ svst4 (p0, x0 + svcntw () * 4, z0)) ++ ++/* ++** st4_f32_28: ++** st4w {z0\.s - z3\.s}, p0, \[x0, #28, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_f32_28, svfloat32x4_t, float32_t, ++ svst4_f32 (p0, x0 + svcntw () * 28, z0), ++ svst4 (p0, x0 + svcntw () * 28, z0)) ++ ++/* ++** st4_f32_32: ++** [^{]* ++** st4w {z0\.s - z3\.s}, p0, \[x[0-9]+\] ++** ret ++*/ ++TEST_STORE (st4_f32_32, svfloat32x4_t, float32_t, ++ svst4_f32 (p0, x0 + svcntw () * 32, z0), ++ svst4 (p0, x0 + svcntw () * 32, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_f32_m1: ++** decb x0 ++** st4w {z0\.s - z3\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_f32_m1, svfloat32x4_t, float32_t, ++ svst4_f32 (p0, x0 - svcntw (), z0), ++ svst4 (p0, x0 - svcntw (), z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_f32_m2: ++** decb x0, all, mul #2 ++** st4w {z0\.s - z3\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_f32_m2, svfloat32x4_t, float32_t, ++ svst4_f32 (p0, x0 - svcntw () * 2, z0), ++ svst4 (p0, x0 - svcntw () * 2, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_f32_m3: ++** decb x0, all, mul #3 ++** st4w {z0\.s - z3\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_f32_m3, svfloat32x4_t, float32_t, ++ svst4_f32 (p0, x0 - svcntw () * 3, z0), ++ svst4 (p0, x0 - svcntw () * 3, z0)) ++ ++/* ++** st4_f32_m4: ++** st4w {z0\.s - z3\.s}, p0, \[x0, #-4, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_f32_m4, svfloat32x4_t, float32_t, ++ svst4_f32 (p0, x0 - svcntw () * 4, z0), ++ svst4 (p0, x0 - svcntw () * 4, z0)) ++ ++/* ++** st4_f32_m32: ++** st4w {z0\.s - z3\.s}, p0, \[x0, #-32, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_f32_m32, svfloat32x4_t, float32_t, ++ svst4_f32 (p0, x0 - svcntw () * 32, z0), ++ svst4 (p0, x0 - svcntw () * 32, z0)) ++ ++/* ++** st4_f32_m36: ++** [^{]* ++** st4w {z0\.s - z3\.s}, p0, \[x[0-9]+\] ++** ret ++*/ ++TEST_STORE (st4_f32_m36, svfloat32x4_t, float32_t, ++ svst4_f32 (p0, x0 - svcntw () * 36, z0), ++ svst4 (p0, x0 - svcntw () * 36, z0)) ++ ++/* ++** st4_vnum_f32_0: ++** st4w {z0\.s - z3\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_f32_0, svfloat32x4_t, float32_t, ++ svst4_vnum_f32 (p0, x0, 0, z0), ++ svst4_vnum (p0, x0, 0, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_f32_1: ++** incb x0 ++** st4w {z0\.s - z3\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_f32_1, svfloat32x4_t, float32_t, ++ svst4_vnum_f32 (p0, x0, 1, z0), ++ svst4_vnum (p0, x0, 1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_f32_2: ++** incb x0, all, mul #2 ++** st4w {z0\.s - z3\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_f32_2, svfloat32x4_t, float32_t, ++ svst4_vnum_f32 (p0, x0, 2, z0), ++ svst4_vnum (p0, x0, 2, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st4_vnum_f32_3: ++** incb x0, all, mul #3 ++** st4w {z0\.s - z3\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_f32_3, svfloat32x4_t, float32_t, ++ svst4_vnum_f32 (p0, x0, 3, z0), ++ svst4_vnum (p0, x0, 3, z0)) ++ ++/* ++** st4_vnum_f32_4: ++** st4w {z0\.s - z3\.s}, p0, \[x0, #4, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_vnum_f32_4, svfloat32x4_t, float32_t, ++ svst4_vnum_f32 (p0, x0, 4, z0), ++ svst4_vnum (p0, x0, 4, z0)) ++ ++/* ++** st4_vnum_f32_28: ++** st4w {z0\.s - z3\.s}, p0, \[x0, #28, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_vnum_f32_28, svfloat32x4_t, float32_t, ++ svst4_vnum_f32 (p0, x0, 28, z0), ++ svst4_vnum (p0, x0, 28, z0)) ++ ++/* ++** st4_vnum_f32_32: ++** [^{]* ++** st4w {z0\.s - z3\.s}, p0, \[x[0-9]+\] ++** ret ++*/ ++TEST_STORE (st4_vnum_f32_32, svfloat32x4_t, float32_t, ++ svst4_vnum_f32 (p0, x0, 32, z0), ++ svst4_vnum (p0, x0, 32, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_f32_m1: ++** decb x0 ++** st4w {z0\.s - z3\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_f32_m1, svfloat32x4_t, float32_t, ++ svst4_vnum_f32 (p0, x0, -1, z0), ++ svst4_vnum (p0, x0, -1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_f32_m2: ++** decb x0, all, mul #2 ++** st4w {z0\.s - z3\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_f32_m2, svfloat32x4_t, float32_t, ++ svst4_vnum_f32 (p0, x0, -2, z0), ++ svst4_vnum (p0, x0, -2, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_f32_m3: ++** decb x0, all, mul #3 ++** st4w {z0\.s - z3\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_f32_m3, svfloat32x4_t, float32_t, ++ svst4_vnum_f32 (p0, x0, -3, z0), ++ svst4_vnum (p0, x0, -3, z0)) ++ ++/* ++** st4_vnum_f32_m4: ++** st4w {z0\.s - z3\.s}, p0, \[x0, #-4, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_vnum_f32_m4, svfloat32x4_t, float32_t, ++ svst4_vnum_f32 (p0, x0, -4, z0), ++ svst4_vnum (p0, x0, -4, z0)) ++ ++/* ++** st4_vnum_f32_m32: ++** st4w {z0\.s - z3\.s}, p0, \[x0, #-32, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_vnum_f32_m32, svfloat32x4_t, float32_t, ++ svst4_vnum_f32 (p0, x0, -32, z0), ++ svst4_vnum (p0, x0, -32, z0)) ++ ++/* ++** st4_vnum_f32_m36: ++** [^{]* ++** st4w {z0\.s - z3\.s}, p0, \[x[0-9]+\] ++** ret ++*/ ++TEST_STORE (st4_vnum_f32_m36, svfloat32x4_t, float32_t, ++ svst4_vnum_f32 (p0, x0, -36, z0), ++ svst4_vnum (p0, x0, -36, z0)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** st4_vnum_f32_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** st4w {z0\.s - z3\.s}, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (st4_vnum_f32_x1, svfloat32x4_t, float32_t, ++ svst4_vnum_f32 (p0, x0, x1, z0), ++ svst4_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_f64.c +new file mode 100644 +index 000000000..6c65ef016 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_f64.c +@@ -0,0 +1,286 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st4_f64_base: ++** st4d {z0\.d - z3\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_f64_base, svfloat64x4_t, float64_t, ++ svst4_f64 (p0, x0, z0), ++ svst4 (p0, x0, z0)) ++ ++/* ++** st4_f64_index: ++** st4d {z0\.d - z3\.d}, p0, \[x0, x1, lsl 3\] ++** ret ++*/ ++TEST_STORE (st4_f64_index, svfloat64x4_t, float64_t, ++ svst4_f64 (p0, x0 + x1, z0), ++ svst4 (p0, x0 + x1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_f64_1: ++** incb x0 ++** st4d {z0\.d - z3\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_f64_1, svfloat64x4_t, float64_t, ++ svst4_f64 (p0, x0 + svcntd (), z0), ++ svst4 (p0, x0 + svcntd (), z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_f64_2: ++** incb x0, all, mul #2 ++** st4d {z0\.d - z3\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_f64_2, svfloat64x4_t, float64_t, ++ svst4_f64 (p0, x0 + svcntd () * 2, z0), ++ svst4 (p0, x0 + svcntd () * 2, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_f64_3: ++** incb x0, all, mul #3 ++** st4d {z0\.d - z3\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_f64_3, svfloat64x4_t, float64_t, ++ svst4_f64 (p0, x0 + svcntd () * 3, z0), ++ svst4 (p0, x0 + svcntd () * 3, z0)) ++ ++/* ++** st4_f64_4: ++** st4d {z0\.d - z3\.d}, p0, \[x0, #4, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_f64_4, svfloat64x4_t, float64_t, ++ svst4_f64 (p0, x0 + svcntd () * 4, z0), ++ svst4 (p0, x0 + svcntd () * 4, z0)) ++ ++/* ++** st4_f64_28: ++** st4d {z0\.d - z3\.d}, p0, \[x0, #28, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_f64_28, svfloat64x4_t, float64_t, ++ svst4_f64 (p0, x0 + svcntd () * 28, z0), ++ svst4 (p0, x0 + svcntd () * 28, z0)) ++ ++/* ++** st4_f64_32: ++** [^{]* ++** st4d {z0\.d - z3\.d}, p0, \[x[0-9]+\] ++** ret ++*/ ++TEST_STORE (st4_f64_32, svfloat64x4_t, float64_t, ++ svst4_f64 (p0, x0 + svcntd () * 32, z0), ++ svst4 (p0, x0 + svcntd () * 32, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_f64_m1: ++** decb x0 ++** st4d {z0\.d - z3\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_f64_m1, svfloat64x4_t, float64_t, ++ svst4_f64 (p0, x0 - svcntd (), z0), ++ svst4 (p0, x0 - svcntd (), z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_f64_m2: ++** decb x0, all, mul #2 ++** st4d {z0\.d - z3\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_f64_m2, svfloat64x4_t, float64_t, ++ svst4_f64 (p0, x0 - svcntd () * 2, z0), ++ svst4 (p0, x0 - svcntd () * 2, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st4_f64_m3: ++** decb x0, all, mul #3 ++** st4d {z0\.d - z3\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_f64_m3, svfloat64x4_t, float64_t, ++ svst4_f64 (p0, x0 - svcntd () * 3, z0), ++ svst4 (p0, x0 - svcntd () * 3, z0)) ++ ++/* ++** st4_f64_m4: ++** st4d {z0\.d - z3\.d}, p0, \[x0, #-4, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_f64_m4, svfloat64x4_t, float64_t, ++ svst4_f64 (p0, x0 - svcntd () * 4, z0), ++ svst4 (p0, x0 - svcntd () * 4, z0)) ++ ++/* ++** st4_f64_m32: ++** st4d {z0\.d - z3\.d}, p0, \[x0, #-32, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_f64_m32, svfloat64x4_t, float64_t, ++ svst4_f64 (p0, x0 - svcntd () * 32, z0), ++ svst4 (p0, x0 - svcntd () * 32, z0)) ++ ++/* ++** st4_f64_m36: ++** [^{]* ++** st4d {z0\.d - z3\.d}, p0, \[x[0-9]+\] ++** ret ++*/ ++TEST_STORE (st4_f64_m36, svfloat64x4_t, float64_t, ++ svst4_f64 (p0, x0 - svcntd () * 36, z0), ++ svst4 (p0, x0 - svcntd () * 36, z0)) ++ ++/* ++** st4_vnum_f64_0: ++** st4d {z0\.d - z3\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_f64_0, svfloat64x4_t, float64_t, ++ svst4_vnum_f64 (p0, x0, 0, z0), ++ svst4_vnum (p0, x0, 0, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_f64_1: ++** incb x0 ++** st4d {z0\.d - z3\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_f64_1, svfloat64x4_t, float64_t, ++ svst4_vnum_f64 (p0, x0, 1, z0), ++ svst4_vnum (p0, x0, 1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_f64_2: ++** incb x0, all, mul #2 ++** st4d {z0\.d - z3\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_f64_2, svfloat64x4_t, float64_t, ++ svst4_vnum_f64 (p0, x0, 2, z0), ++ svst4_vnum (p0, x0, 2, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_f64_3: ++** incb x0, all, mul #3 ++** st4d {z0\.d - z3\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_f64_3, svfloat64x4_t, float64_t, ++ svst4_vnum_f64 (p0, x0, 3, z0), ++ svst4_vnum (p0, x0, 3, z0)) ++ ++/* ++** st4_vnum_f64_4: ++** st4d {z0\.d - z3\.d}, p0, \[x0, #4, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_vnum_f64_4, svfloat64x4_t, float64_t, ++ svst4_vnum_f64 (p0, x0, 4, z0), ++ svst4_vnum (p0, x0, 4, z0)) ++ ++/* ++** st4_vnum_f64_28: ++** st4d {z0\.d - z3\.d}, p0, \[x0, #28, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_vnum_f64_28, svfloat64x4_t, float64_t, ++ svst4_vnum_f64 (p0, x0, 28, z0), ++ svst4_vnum (p0, x0, 28, z0)) ++ ++/* ++** st4_vnum_f64_32: ++** [^{]* ++** st4d {z0\.d - z3\.d}, p0, \[x[0-9]+\] ++** ret ++*/ ++TEST_STORE (st4_vnum_f64_32, svfloat64x4_t, float64_t, ++ svst4_vnum_f64 (p0, x0, 32, z0), ++ svst4_vnum (p0, x0, 32, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_f64_m1: ++** decb x0 ++** st4d {z0\.d - z3\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_f64_m1, svfloat64x4_t, float64_t, ++ svst4_vnum_f64 (p0, x0, -1, z0), ++ svst4_vnum (p0, x0, -1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_f64_m2: ++** decb x0, all, mul #2 ++** st4d {z0\.d - z3\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_f64_m2, svfloat64x4_t, float64_t, ++ svst4_vnum_f64 (p0, x0, -2, z0), ++ svst4_vnum (p0, x0, -2, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st4_vnum_f64_m3: ++** decb x0, all, mul #3 ++** st4d {z0\.d - z3\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_f64_m3, svfloat64x4_t, float64_t, ++ svst4_vnum_f64 (p0, x0, -3, z0), ++ svst4_vnum (p0, x0, -3, z0)) ++ ++/* ++** st4_vnum_f64_m4: ++** st4d {z0\.d - z3\.d}, p0, \[x0, #-4, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_vnum_f64_m4, svfloat64x4_t, float64_t, ++ svst4_vnum_f64 (p0, x0, -4, z0), ++ svst4_vnum (p0, x0, -4, z0)) ++ ++/* ++** st4_vnum_f64_m32: ++** st4d {z0\.d - z3\.d}, p0, \[x0, #-32, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_vnum_f64_m32, svfloat64x4_t, float64_t, ++ svst4_vnum_f64 (p0, x0, -32, z0), ++ svst4_vnum (p0, x0, -32, z0)) ++ ++/* ++** st4_vnum_f64_m36: ++** [^{]* ++** st4d {z0\.d - z3\.d}, p0, \[x[0-9]+\] ++** ret ++*/ ++TEST_STORE (st4_vnum_f64_m36, svfloat64x4_t, float64_t, ++ svst4_vnum_f64 (p0, x0, -36, z0), ++ svst4_vnum (p0, x0, -36, z0)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** st4_vnum_f64_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** st4d {z0\.d - z3\.d}, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (st4_vnum_f64_x1, svfloat64x4_t, float64_t, ++ svst4_vnum_f64 (p0, x0, x1, z0), ++ svst4_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_s16.c +new file mode 100644 +index 000000000..35ac5f803 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_s16.c +@@ -0,0 +1,286 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st4_s16_base: ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_s16_base, svint16x4_t, int16_t, ++ svst4_s16 (p0, x0, z0), ++ svst4 (p0, x0, z0)) ++ ++/* ++** st4_s16_index: ++** st4h {z0\.h - z3\.h}, p0, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_STORE (st4_s16_index, svint16x4_t, int16_t, ++ svst4_s16 (p0, x0 + x1, z0), ++ svst4 (p0, x0 + x1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_s16_1: ++** incb x0 ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_s16_1, svint16x4_t, int16_t, ++ svst4_s16 (p0, x0 + svcnth (), z0), ++ svst4 (p0, x0 + svcnth (), z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_s16_2: ++** incb x0, all, mul #2 ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_s16_2, svint16x4_t, int16_t, ++ svst4_s16 (p0, x0 + svcnth () * 2, z0), ++ svst4 (p0, x0 + svcnth () * 2, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st4_s16_3: ++** incb x0, all, mul #3 ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_s16_3, svint16x4_t, int16_t, ++ svst4_s16 (p0, x0 + svcnth () * 3, z0), ++ svst4 (p0, x0 + svcnth () * 3, z0)) ++ ++/* ++** st4_s16_4: ++** st4h {z0\.h - z3\.h}, p0, \[x0, #4, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_s16_4, svint16x4_t, int16_t, ++ svst4_s16 (p0, x0 + svcnth () * 4, z0), ++ svst4 (p0, x0 + svcnth () * 4, z0)) ++ ++/* ++** st4_s16_28: ++** st4h {z0\.h - z3\.h}, p0, \[x0, #28, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_s16_28, svint16x4_t, int16_t, ++ svst4_s16 (p0, x0 + svcnth () * 28, z0), ++ svst4 (p0, x0 + svcnth () * 28, z0)) ++ ++/* ++** st4_s16_32: ++** [^{]* ++** st4h {z0\.h - z3\.h}, p0, \[x[0-9]+\] ++** ret ++*/ ++TEST_STORE (st4_s16_32, svint16x4_t, int16_t, ++ svst4_s16 (p0, x0 + svcnth () * 32, z0), ++ svst4 (p0, x0 + svcnth () * 32, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_s16_m1: ++** decb x0 ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_s16_m1, svint16x4_t, int16_t, ++ svst4_s16 (p0, x0 - svcnth (), z0), ++ svst4 (p0, x0 - svcnth (), z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_s16_m2: ++** decb x0, all, mul #2 ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_s16_m2, svint16x4_t, int16_t, ++ svst4_s16 (p0, x0 - svcnth () * 2, z0), ++ svst4 (p0, x0 - svcnth () * 2, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_s16_m3: ++** decb x0, all, mul #3 ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_s16_m3, svint16x4_t, int16_t, ++ svst4_s16 (p0, x0 - svcnth () * 3, z0), ++ svst4 (p0, x0 - svcnth () * 3, z0)) ++ ++/* ++** st4_s16_m4: ++** st4h {z0\.h - z3\.h}, p0, \[x0, #-4, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_s16_m4, svint16x4_t, int16_t, ++ svst4_s16 (p0, x0 - svcnth () * 4, z0), ++ svst4 (p0, x0 - svcnth () * 4, z0)) ++ ++/* ++** st4_s16_m32: ++** st4h {z0\.h - z3\.h}, p0, \[x0, #-32, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_s16_m32, svint16x4_t, int16_t, ++ svst4_s16 (p0, x0 - svcnth () * 32, z0), ++ svst4 (p0, x0 - svcnth () * 32, z0)) ++ ++/* ++** st4_s16_m36: ++** [^{]* ++** st4h {z0\.h - z3\.h}, p0, \[x[0-9]+\] ++** ret ++*/ ++TEST_STORE (st4_s16_m36, svint16x4_t, int16_t, ++ svst4_s16 (p0, x0 - svcnth () * 36, z0), ++ svst4 (p0, x0 - svcnth () * 36, z0)) ++ ++/* ++** st4_vnum_s16_0: ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s16_0, svint16x4_t, int16_t, ++ svst4_vnum_s16 (p0, x0, 0, z0), ++ svst4_vnum (p0, x0, 0, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_s16_1: ++** incb x0 ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s16_1, svint16x4_t, int16_t, ++ svst4_vnum_s16 (p0, x0, 1, z0), ++ svst4_vnum (p0, x0, 1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_s16_2: ++** incb x0, all, mul #2 ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s16_2, svint16x4_t, int16_t, ++ svst4_vnum_s16 (p0, x0, 2, z0), ++ svst4_vnum (p0, x0, 2, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st4_vnum_s16_3: ++** incb x0, all, mul #3 ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s16_3, svint16x4_t, int16_t, ++ svst4_vnum_s16 (p0, x0, 3, z0), ++ svst4_vnum (p0, x0, 3, z0)) ++ ++/* ++** st4_vnum_s16_4: ++** st4h {z0\.h - z3\.h}, p0, \[x0, #4, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s16_4, svint16x4_t, int16_t, ++ svst4_vnum_s16 (p0, x0, 4, z0), ++ svst4_vnum (p0, x0, 4, z0)) ++ ++/* ++** st4_vnum_s16_28: ++** st4h {z0\.h - z3\.h}, p0, \[x0, #28, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s16_28, svint16x4_t, int16_t, ++ svst4_vnum_s16 (p0, x0, 28, z0), ++ svst4_vnum (p0, x0, 28, z0)) ++ ++/* ++** st4_vnum_s16_32: ++** [^{]* ++** st4h {z0\.h - z3\.h}, p0, \[x[0-9]+\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s16_32, svint16x4_t, int16_t, ++ svst4_vnum_s16 (p0, x0, 32, z0), ++ svst4_vnum (p0, x0, 32, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_s16_m1: ++** decb x0 ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s16_m1, svint16x4_t, int16_t, ++ svst4_vnum_s16 (p0, x0, -1, z0), ++ svst4_vnum (p0, x0, -1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_s16_m2: ++** decb x0, all, mul #2 ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s16_m2, svint16x4_t, int16_t, ++ svst4_vnum_s16 (p0, x0, -2, z0), ++ svst4_vnum (p0, x0, -2, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_s16_m3: ++** decb x0, all, mul #3 ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s16_m3, svint16x4_t, int16_t, ++ svst4_vnum_s16 (p0, x0, -3, z0), ++ svst4_vnum (p0, x0, -3, z0)) ++ ++/* ++** st4_vnum_s16_m4: ++** st4h {z0\.h - z3\.h}, p0, \[x0, #-4, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s16_m4, svint16x4_t, int16_t, ++ svst4_vnum_s16 (p0, x0, -4, z0), ++ svst4_vnum (p0, x0, -4, z0)) ++ ++/* ++** st4_vnum_s16_m32: ++** st4h {z0\.h - z3\.h}, p0, \[x0, #-32, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s16_m32, svint16x4_t, int16_t, ++ svst4_vnum_s16 (p0, x0, -32, z0), ++ svst4_vnum (p0, x0, -32, z0)) ++ ++/* ++** st4_vnum_s16_m36: ++** [^{]* ++** st4h {z0\.h - z3\.h}, p0, \[x[0-9]+\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s16_m36, svint16x4_t, int16_t, ++ svst4_vnum_s16 (p0, x0, -36, z0), ++ svst4_vnum (p0, x0, -36, z0)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** st4_vnum_s16_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** st4h {z0\.h - z3\.h}, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s16_x1, svint16x4_t, int16_t, ++ svst4_vnum_s16 (p0, x0, x1, z0), ++ svst4_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_s32.c +new file mode 100644 +index 000000000..b8302f10d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_s32.c +@@ -0,0 +1,286 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st4_s32_base: ++** st4w {z0\.s - z3\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_s32_base, svint32x4_t, int32_t, ++ svst4_s32 (p0, x0, z0), ++ svst4 (p0, x0, z0)) ++ ++/* ++** st4_s32_index: ++** st4w {z0\.s - z3\.s}, p0, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_STORE (st4_s32_index, svint32x4_t, int32_t, ++ svst4_s32 (p0, x0 + x1, z0), ++ svst4 (p0, x0 + x1, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st4_s32_1: ++** incb x0 ++** st4w {z0\.s - z3\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_s32_1, svint32x4_t, int32_t, ++ svst4_s32 (p0, x0 + svcntw (), z0), ++ svst4 (p0, x0 + svcntw (), z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_s32_2: ++** incb x0, all, mul #2 ++** st4w {z0\.s - z3\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_s32_2, svint32x4_t, int32_t, ++ svst4_s32 (p0, x0 + svcntw () * 2, z0), ++ svst4 (p0, x0 + svcntw () * 2, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_s32_3: ++** incb x0, all, mul #3 ++** st4w {z0\.s - z3\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_s32_3, svint32x4_t, int32_t, ++ svst4_s32 (p0, x0 + svcntw () * 3, z0), ++ svst4 (p0, x0 + svcntw () * 3, z0)) ++ ++/* ++** st4_s32_4: ++** st4w {z0\.s - z3\.s}, p0, \[x0, #4, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_s32_4, svint32x4_t, int32_t, ++ svst4_s32 (p0, x0 + svcntw () * 4, z0), ++ svst4 (p0, x0 + svcntw () * 4, z0)) ++ ++/* ++** st4_s32_28: ++** st4w {z0\.s - z3\.s}, p0, \[x0, #28, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_s32_28, svint32x4_t, int32_t, ++ svst4_s32 (p0, x0 + svcntw () * 28, z0), ++ svst4 (p0, x0 + svcntw () * 28, z0)) ++ ++/* ++** st4_s32_32: ++** [^{]* ++** st4w {z0\.s - z3\.s}, p0, \[x[0-9]+\] ++** ret ++*/ ++TEST_STORE (st4_s32_32, svint32x4_t, int32_t, ++ svst4_s32 (p0, x0 + svcntw () * 32, z0), ++ svst4 (p0, x0 + svcntw () * 32, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_s32_m1: ++** decb x0 ++** st4w {z0\.s - z3\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_s32_m1, svint32x4_t, int32_t, ++ svst4_s32 (p0, x0 - svcntw (), z0), ++ svst4 (p0, x0 - svcntw (), z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_s32_m2: ++** decb x0, all, mul #2 ++** st4w {z0\.s - z3\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_s32_m2, svint32x4_t, int32_t, ++ svst4_s32 (p0, x0 - svcntw () * 2, z0), ++ svst4 (p0, x0 - svcntw () * 2, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_s32_m3: ++** decb x0, all, mul #3 ++** st4w {z0\.s - z3\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_s32_m3, svint32x4_t, int32_t, ++ svst4_s32 (p0, x0 - svcntw () * 3, z0), ++ svst4 (p0, x0 - svcntw () * 3, z0)) ++ ++/* ++** st4_s32_m4: ++** st4w {z0\.s - z3\.s}, p0, \[x0, #-4, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_s32_m4, svint32x4_t, int32_t, ++ svst4_s32 (p0, x0 - svcntw () * 4, z0), ++ svst4 (p0, x0 - svcntw () * 4, z0)) ++ ++/* ++** st4_s32_m32: ++** st4w {z0\.s - z3\.s}, p0, \[x0, #-32, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_s32_m32, svint32x4_t, int32_t, ++ svst4_s32 (p0, x0 - svcntw () * 32, z0), ++ svst4 (p0, x0 - svcntw () * 32, z0)) ++ ++/* ++** st4_s32_m36: ++** [^{]* ++** st4w {z0\.s - z3\.s}, p0, \[x[0-9]+\] ++** ret ++*/ ++TEST_STORE (st4_s32_m36, svint32x4_t, int32_t, ++ svst4_s32 (p0, x0 - svcntw () * 36, z0), ++ svst4 (p0, x0 - svcntw () * 36, z0)) ++ ++/* ++** st4_vnum_s32_0: ++** st4w {z0\.s - z3\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s32_0, svint32x4_t, int32_t, ++ svst4_vnum_s32 (p0, x0, 0, z0), ++ svst4_vnum (p0, x0, 0, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_s32_1: ++** incb x0 ++** st4w {z0\.s - z3\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s32_1, svint32x4_t, int32_t, ++ svst4_vnum_s32 (p0, x0, 1, z0), ++ svst4_vnum (p0, x0, 1, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st4_vnum_s32_2: ++** incb x0, all, mul #2 ++** st4w {z0\.s - z3\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s32_2, svint32x4_t, int32_t, ++ svst4_vnum_s32 (p0, x0, 2, z0), ++ svst4_vnum (p0, x0, 2, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_s32_3: ++** incb x0, all, mul #3 ++** st4w {z0\.s - z3\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s32_3, svint32x4_t, int32_t, ++ svst4_vnum_s32 (p0, x0, 3, z0), ++ svst4_vnum (p0, x0, 3, z0)) ++ ++/* ++** st4_vnum_s32_4: ++** st4w {z0\.s - z3\.s}, p0, \[x0, #4, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s32_4, svint32x4_t, int32_t, ++ svst4_vnum_s32 (p0, x0, 4, z0), ++ svst4_vnum (p0, x0, 4, z0)) ++ ++/* ++** st4_vnum_s32_28: ++** st4w {z0\.s - z3\.s}, p0, \[x0, #28, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s32_28, svint32x4_t, int32_t, ++ svst4_vnum_s32 (p0, x0, 28, z0), ++ svst4_vnum (p0, x0, 28, z0)) ++ ++/* ++** st4_vnum_s32_32: ++** [^{]* ++** st4w {z0\.s - z3\.s}, p0, \[x[0-9]+\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s32_32, svint32x4_t, int32_t, ++ svst4_vnum_s32 (p0, x0, 32, z0), ++ svst4_vnum (p0, x0, 32, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_s32_m1: ++** decb x0 ++** st4w {z0\.s - z3\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s32_m1, svint32x4_t, int32_t, ++ svst4_vnum_s32 (p0, x0, -1, z0), ++ svst4_vnum (p0, x0, -1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_s32_m2: ++** decb x0, all, mul #2 ++** st4w {z0\.s - z3\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s32_m2, svint32x4_t, int32_t, ++ svst4_vnum_s32 (p0, x0, -2, z0), ++ svst4_vnum (p0, x0, -2, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_s32_m3: ++** decb x0, all, mul #3 ++** st4w {z0\.s - z3\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s32_m3, svint32x4_t, int32_t, ++ svst4_vnum_s32 (p0, x0, -3, z0), ++ svst4_vnum (p0, x0, -3, z0)) ++ ++/* ++** st4_vnum_s32_m4: ++** st4w {z0\.s - z3\.s}, p0, \[x0, #-4, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s32_m4, svint32x4_t, int32_t, ++ svst4_vnum_s32 (p0, x0, -4, z0), ++ svst4_vnum (p0, x0, -4, z0)) ++ ++/* ++** st4_vnum_s32_m32: ++** st4w {z0\.s - z3\.s}, p0, \[x0, #-32, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s32_m32, svint32x4_t, int32_t, ++ svst4_vnum_s32 (p0, x0, -32, z0), ++ svst4_vnum (p0, x0, -32, z0)) ++ ++/* ++** st4_vnum_s32_m36: ++** [^{]* ++** st4w {z0\.s - z3\.s}, p0, \[x[0-9]+\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s32_m36, svint32x4_t, int32_t, ++ svst4_vnum_s32 (p0, x0, -36, z0), ++ svst4_vnum (p0, x0, -36, z0)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** st4_vnum_s32_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** st4w {z0\.s - z3\.s}, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s32_x1, svint32x4_t, int32_t, ++ svst4_vnum_s32 (p0, x0, x1, z0), ++ svst4_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_s64.c +new file mode 100644 +index 000000000..bf9cdf5e0 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_s64.c +@@ -0,0 +1,286 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st4_s64_base: ++** st4d {z0\.d - z3\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_s64_base, svint64x4_t, int64_t, ++ svst4_s64 (p0, x0, z0), ++ svst4 (p0, x0, z0)) ++ ++/* ++** st4_s64_index: ++** st4d {z0\.d - z3\.d}, p0, \[x0, x1, lsl 3\] ++** ret ++*/ ++TEST_STORE (st4_s64_index, svint64x4_t, int64_t, ++ svst4_s64 (p0, x0 + x1, z0), ++ svst4 (p0, x0 + x1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_s64_1: ++** incb x0 ++** st4d {z0\.d - z3\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_s64_1, svint64x4_t, int64_t, ++ svst4_s64 (p0, x0 + svcntd (), z0), ++ svst4 (p0, x0 + svcntd (), z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_s64_2: ++** incb x0, all, mul #2 ++** st4d {z0\.d - z3\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_s64_2, svint64x4_t, int64_t, ++ svst4_s64 (p0, x0 + svcntd () * 2, z0), ++ svst4 (p0, x0 + svcntd () * 2, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_s64_3: ++** incb x0, all, mul #3 ++** st4d {z0\.d - z3\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_s64_3, svint64x4_t, int64_t, ++ svst4_s64 (p0, x0 + svcntd () * 3, z0), ++ svst4 (p0, x0 + svcntd () * 3, z0)) ++ ++/* ++** st4_s64_4: ++** st4d {z0\.d - z3\.d}, p0, \[x0, #4, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_s64_4, svint64x4_t, int64_t, ++ svst4_s64 (p0, x0 + svcntd () * 4, z0), ++ svst4 (p0, x0 + svcntd () * 4, z0)) ++ ++/* ++** st4_s64_28: ++** st4d {z0\.d - z3\.d}, p0, \[x0, #28, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_s64_28, svint64x4_t, int64_t, ++ svst4_s64 (p0, x0 + svcntd () * 28, z0), ++ svst4 (p0, x0 + svcntd () * 28, z0)) ++ ++/* ++** st4_s64_32: ++** [^{]* ++** st4d {z0\.d - z3\.d}, p0, \[x[0-9]+\] ++** ret ++*/ ++TEST_STORE (st4_s64_32, svint64x4_t, int64_t, ++ svst4_s64 (p0, x0 + svcntd () * 32, z0), ++ svst4 (p0, x0 + svcntd () * 32, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_s64_m1: ++** decb x0 ++** st4d {z0\.d - z3\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_s64_m1, svint64x4_t, int64_t, ++ svst4_s64 (p0, x0 - svcntd (), z0), ++ svst4 (p0, x0 - svcntd (), z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_s64_m2: ++** decb x0, all, mul #2 ++** st4d {z0\.d - z3\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_s64_m2, svint64x4_t, int64_t, ++ svst4_s64 (p0, x0 - svcntd () * 2, z0), ++ svst4 (p0, x0 - svcntd () * 2, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st4_s64_m3: ++** decb x0, all, mul #3 ++** st4d {z0\.d - z3\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_s64_m3, svint64x4_t, int64_t, ++ svst4_s64 (p0, x0 - svcntd () * 3, z0), ++ svst4 (p0, x0 - svcntd () * 3, z0)) ++ ++/* ++** st4_s64_m4: ++** st4d {z0\.d - z3\.d}, p0, \[x0, #-4, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_s64_m4, svint64x4_t, int64_t, ++ svst4_s64 (p0, x0 - svcntd () * 4, z0), ++ svst4 (p0, x0 - svcntd () * 4, z0)) ++ ++/* ++** st4_s64_m32: ++** st4d {z0\.d - z3\.d}, p0, \[x0, #-32, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_s64_m32, svint64x4_t, int64_t, ++ svst4_s64 (p0, x0 - svcntd () * 32, z0), ++ svst4 (p0, x0 - svcntd () * 32, z0)) ++ ++/* ++** st4_s64_m36: ++** [^{]* ++** st4d {z0\.d - z3\.d}, p0, \[x[0-9]+\] ++** ret ++*/ ++TEST_STORE (st4_s64_m36, svint64x4_t, int64_t, ++ svst4_s64 (p0, x0 - svcntd () * 36, z0), ++ svst4 (p0, x0 - svcntd () * 36, z0)) ++ ++/* ++** st4_vnum_s64_0: ++** st4d {z0\.d - z3\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s64_0, svint64x4_t, int64_t, ++ svst4_vnum_s64 (p0, x0, 0, z0), ++ svst4_vnum (p0, x0, 0, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_s64_1: ++** incb x0 ++** st4d {z0\.d - z3\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s64_1, svint64x4_t, int64_t, ++ svst4_vnum_s64 (p0, x0, 1, z0), ++ svst4_vnum (p0, x0, 1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_s64_2: ++** incb x0, all, mul #2 ++** st4d {z0\.d - z3\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s64_2, svint64x4_t, int64_t, ++ svst4_vnum_s64 (p0, x0, 2, z0), ++ svst4_vnum (p0, x0, 2, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_s64_3: ++** incb x0, all, mul #3 ++** st4d {z0\.d - z3\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s64_3, svint64x4_t, int64_t, ++ svst4_vnum_s64 (p0, x0, 3, z0), ++ svst4_vnum (p0, x0, 3, z0)) ++ ++/* ++** st4_vnum_s64_4: ++** st4d {z0\.d - z3\.d}, p0, \[x0, #4, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s64_4, svint64x4_t, int64_t, ++ svst4_vnum_s64 (p0, x0, 4, z0), ++ svst4_vnum (p0, x0, 4, z0)) ++ ++/* ++** st4_vnum_s64_28: ++** st4d {z0\.d - z3\.d}, p0, \[x0, #28, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s64_28, svint64x4_t, int64_t, ++ svst4_vnum_s64 (p0, x0, 28, z0), ++ svst4_vnum (p0, x0, 28, z0)) ++ ++/* ++** st4_vnum_s64_32: ++** [^{]* ++** st4d {z0\.d - z3\.d}, p0, \[x[0-9]+\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s64_32, svint64x4_t, int64_t, ++ svst4_vnum_s64 (p0, x0, 32, z0), ++ svst4_vnum (p0, x0, 32, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_s64_m1: ++** decb x0 ++** st4d {z0\.d - z3\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s64_m1, svint64x4_t, int64_t, ++ svst4_vnum_s64 (p0, x0, -1, z0), ++ svst4_vnum (p0, x0, -1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_s64_m2: ++** decb x0, all, mul #2 ++** st4d {z0\.d - z3\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s64_m2, svint64x4_t, int64_t, ++ svst4_vnum_s64 (p0, x0, -2, z0), ++ svst4_vnum (p0, x0, -2, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st4_vnum_s64_m3: ++** decb x0, all, mul #3 ++** st4d {z0\.d - z3\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s64_m3, svint64x4_t, int64_t, ++ svst4_vnum_s64 (p0, x0, -3, z0), ++ svst4_vnum (p0, x0, -3, z0)) ++ ++/* ++** st4_vnum_s64_m4: ++** st4d {z0\.d - z3\.d}, p0, \[x0, #-4, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s64_m4, svint64x4_t, int64_t, ++ svst4_vnum_s64 (p0, x0, -4, z0), ++ svst4_vnum (p0, x0, -4, z0)) ++ ++/* ++** st4_vnum_s64_m32: ++** st4d {z0\.d - z3\.d}, p0, \[x0, #-32, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s64_m32, svint64x4_t, int64_t, ++ svst4_vnum_s64 (p0, x0, -32, z0), ++ svst4_vnum (p0, x0, -32, z0)) ++ ++/* ++** st4_vnum_s64_m36: ++** [^{]* ++** st4d {z0\.d - z3\.d}, p0, \[x[0-9]+\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s64_m36, svint64x4_t, int64_t, ++ svst4_vnum_s64 (p0, x0, -36, z0), ++ svst4_vnum (p0, x0, -36, z0)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** st4_vnum_s64_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** st4d {z0\.d - z3\.d}, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s64_x1, svint64x4_t, int64_t, ++ svst4_vnum_s64 (p0, x0, x1, z0), ++ svst4_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_s8.c +new file mode 100644 +index 000000000..1eb0bf131 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_s8.c +@@ -0,0 +1,290 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st4_s8_base: ++** st4b {z0\.b - z3\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_s8_base, svint8x4_t, int8_t, ++ svst4_s8 (p0, x0, z0), ++ svst4 (p0, x0, z0)) ++ ++/* ++** st4_s8_index: ++** st4b {z0\.b - z3\.b}, p0, \[x0, x1\] ++** ret ++*/ ++TEST_STORE (st4_s8_index, svint8x4_t, int8_t, ++ svst4_s8 (p0, x0 + x1, z0), ++ svst4 (p0, x0 + x1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_s8_1: ++** incb x0 ++** st4b {z0\.b - z3\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_s8_1, svint8x4_t, int8_t, ++ svst4_s8 (p0, x0 + svcntb (), z0), ++ svst4 (p0, x0 + svcntb (), z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_s8_2: ++** incb x0, all, mul #2 ++** st4b {z0\.b - z3\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_s8_2, svint8x4_t, int8_t, ++ svst4_s8 (p0, x0 + svcntb () * 2, z0), ++ svst4 (p0, x0 + svcntb () * 2, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_s8_3: ++** incb x0, all, mul #3 ++** st4b {z0\.b - z3\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_s8_3, svint8x4_t, int8_t, ++ svst4_s8 (p0, x0 + svcntb () * 3, z0), ++ svst4 (p0, x0 + svcntb () * 3, z0)) ++ ++/* ++** st4_s8_4: ++** st4b {z0\.b - z3\.b}, p0, \[x0, #4, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_s8_4, svint8x4_t, int8_t, ++ svst4_s8 (p0, x0 + svcntb () * 4, z0), ++ svst4 (p0, x0 + svcntb () * 4, z0)) ++ ++/* ++** st4_s8_28: ++** st4b {z0\.b - z3\.b}, p0, \[x0, #28, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_s8_28, svint8x4_t, int8_t, ++ svst4_s8 (p0, x0 + svcntb () * 28, z0), ++ svst4 (p0, x0 + svcntb () * 28, z0)) ++ ++/* ++** st4_s8_32: ++** [^{]* ++** st4b {z0\.b - z3\.b}, p0, \[x[0-9]+\] ++** ret ++*/ ++TEST_STORE (st4_s8_32, svint8x4_t, int8_t, ++ svst4_s8 (p0, x0 + svcntb () * 32, z0), ++ svst4 (p0, x0 + svcntb () * 32, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st4_s8_m1: ++** decb x0 ++** st4b {z0\.b - z3\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_s8_m1, svint8x4_t, int8_t, ++ svst4_s8 (p0, x0 - svcntb (), z0), ++ svst4 (p0, x0 - svcntb (), z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_s8_m2: ++** decb x0, all, mul #2 ++** st4b {z0\.b - z3\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_s8_m2, svint8x4_t, int8_t, ++ svst4_s8 (p0, x0 - svcntb () * 2, z0), ++ svst4 (p0, x0 - svcntb () * 2, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_s8_m3: ++** decb x0, all, mul #3 ++** st4b {z0\.b - z3\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_s8_m3, svint8x4_t, int8_t, ++ svst4_s8 (p0, x0 - svcntb () * 3, z0), ++ svst4 (p0, x0 - svcntb () * 3, z0)) ++ ++/* ++** st4_s8_m4: ++** st4b {z0\.b - z3\.b}, p0, \[x0, #-4, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_s8_m4, svint8x4_t, int8_t, ++ svst4_s8 (p0, x0 - svcntb () * 4, z0), ++ svst4 (p0, x0 - svcntb () * 4, z0)) ++ ++/* ++** st4_s8_m32: ++** st4b {z0\.b - z3\.b}, p0, \[x0, #-32, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_s8_m32, svint8x4_t, int8_t, ++ svst4_s8 (p0, x0 - svcntb () * 32, z0), ++ svst4 (p0, x0 - svcntb () * 32, z0)) ++ ++/* ++** st4_s8_m36: ++** [^{]* ++** st4b {z0\.b - z3\.b}, p0, \[x[0-9]+\] ++** ret ++*/ ++TEST_STORE (st4_s8_m36, svint8x4_t, int8_t, ++ svst4_s8 (p0, x0 - svcntb () * 36, z0), ++ svst4 (p0, x0 - svcntb () * 36, z0)) ++ ++/* ++** st4_vnum_s8_0: ++** st4b {z0\.b - z3\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s8_0, svint8x4_t, int8_t, ++ svst4_vnum_s8 (p0, x0, 0, z0), ++ svst4_vnum (p0, x0, 0, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_s8_1: ++** incb x0 ++** st4b {z0\.b - z3\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s8_1, svint8x4_t, int8_t, ++ svst4_vnum_s8 (p0, x0, 1, z0), ++ svst4_vnum (p0, x0, 1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_s8_2: ++** incb x0, all, mul #2 ++** st4b {z0\.b - z3\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s8_2, svint8x4_t, int8_t, ++ svst4_vnum_s8 (p0, x0, 2, z0), ++ svst4_vnum (p0, x0, 2, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_s8_3: ++** incb x0, all, mul #3 ++** st4b {z0\.b - z3\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s8_3, svint8x4_t, int8_t, ++ svst4_vnum_s8 (p0, x0, 3, z0), ++ svst4_vnum (p0, x0, 3, z0)) ++ ++/* ++** st4_vnum_s8_4: ++** st4b {z0\.b - z3\.b}, p0, \[x0, #4, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s8_4, svint8x4_t, int8_t, ++ svst4_vnum_s8 (p0, x0, 4, z0), ++ svst4_vnum (p0, x0, 4, z0)) ++ ++/* ++** st4_vnum_s8_28: ++** st4b {z0\.b - z3\.b}, p0, \[x0, #28, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s8_28, svint8x4_t, int8_t, ++ svst4_vnum_s8 (p0, x0, 28, z0), ++ svst4_vnum (p0, x0, 28, z0)) ++ ++/* ++** st4_vnum_s8_32: ++** [^{]* ++** st4b {z0\.b - z3\.b}, p0, \[x[0-9]+\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s8_32, svint8x4_t, int8_t, ++ svst4_vnum_s8 (p0, x0, 32, z0), ++ svst4_vnum (p0, x0, 32, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_s8_m1: ++** decb x0 ++** st4b {z0\.b - z3\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s8_m1, svint8x4_t, int8_t, ++ svst4_vnum_s8 (p0, x0, -1, z0), ++ svst4_vnum (p0, x0, -1, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st4_vnum_s8_m2: ++** decb x0, all, mul #2 ++** st4b {z0\.b - z3\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s8_m2, svint8x4_t, int8_t, ++ svst4_vnum_s8 (p0, x0, -2, z0), ++ svst4_vnum (p0, x0, -2, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_s8_m3: ++** decb x0, all, mul #3 ++** st4b {z0\.b - z3\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s8_m3, svint8x4_t, int8_t, ++ svst4_vnum_s8 (p0, x0, -3, z0), ++ svst4_vnum (p0, x0, -3, z0)) ++ ++/* ++** st4_vnum_s8_m4: ++** st4b {z0\.b - z3\.b}, p0, \[x0, #-4, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s8_m4, svint8x4_t, int8_t, ++ svst4_vnum_s8 (p0, x0, -4, z0), ++ svst4_vnum (p0, x0, -4, z0)) ++ ++/* ++** st4_vnum_s8_m32: ++** st4b {z0\.b - z3\.b}, p0, \[x0, #-32, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s8_m32, svint8x4_t, int8_t, ++ svst4_vnum_s8 (p0, x0, -32, z0), ++ svst4_vnum (p0, x0, -32, z0)) ++ ++/* ++** st4_vnum_s8_m36: ++** [^{]* ++** st4b {z0\.b - z3\.b}, p0, \[x[0-9]+\] ++** ret ++*/ ++TEST_STORE (st4_vnum_s8_m36, svint8x4_t, int8_t, ++ svst4_vnum_s8 (p0, x0, -36, z0), ++ svst4_vnum (p0, x0, -36, z0)) ++ ++/* ++** st4_vnum_s8_x1: ++** cntb (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** st4b {z0\.b - z3\.b}, p0, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** st4b {z0\.b - z3\.b}, p0, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_STORE (st4_vnum_s8_x1, svint8x4_t, int8_t, ++ svst4_vnum_s8 (p0, x0, x1, z0), ++ svst4_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_u16.c +new file mode 100644 +index 000000000..5272c7f61 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_u16.c +@@ -0,0 +1,286 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st4_u16_base: ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_u16_base, svuint16x4_t, uint16_t, ++ svst4_u16 (p0, x0, z0), ++ svst4 (p0, x0, z0)) ++ ++/* ++** st4_u16_index: ++** st4h {z0\.h - z3\.h}, p0, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_STORE (st4_u16_index, svuint16x4_t, uint16_t, ++ svst4_u16 (p0, x0 + x1, z0), ++ svst4 (p0, x0 + x1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_u16_1: ++** incb x0 ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_u16_1, svuint16x4_t, uint16_t, ++ svst4_u16 (p0, x0 + svcnth (), z0), ++ svst4 (p0, x0 + svcnth (), z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_u16_2: ++** incb x0, all, mul #2 ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_u16_2, svuint16x4_t, uint16_t, ++ svst4_u16 (p0, x0 + svcnth () * 2, z0), ++ svst4 (p0, x0 + svcnth () * 2, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st4_u16_3: ++** incb x0, all, mul #3 ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_u16_3, svuint16x4_t, uint16_t, ++ svst4_u16 (p0, x0 + svcnth () * 3, z0), ++ svst4 (p0, x0 + svcnth () * 3, z0)) ++ ++/* ++** st4_u16_4: ++** st4h {z0\.h - z3\.h}, p0, \[x0, #4, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_u16_4, svuint16x4_t, uint16_t, ++ svst4_u16 (p0, x0 + svcnth () * 4, z0), ++ svst4 (p0, x0 + svcnth () * 4, z0)) ++ ++/* ++** st4_u16_28: ++** st4h {z0\.h - z3\.h}, p0, \[x0, #28, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_u16_28, svuint16x4_t, uint16_t, ++ svst4_u16 (p0, x0 + svcnth () * 28, z0), ++ svst4 (p0, x0 + svcnth () * 28, z0)) ++ ++/* ++** st4_u16_32: ++** [^{]* ++** st4h {z0\.h - z3\.h}, p0, \[x[0-9]+\] ++** ret ++*/ ++TEST_STORE (st4_u16_32, svuint16x4_t, uint16_t, ++ svst4_u16 (p0, x0 + svcnth () * 32, z0), ++ svst4 (p0, x0 + svcnth () * 32, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_u16_m1: ++** decb x0 ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_u16_m1, svuint16x4_t, uint16_t, ++ svst4_u16 (p0, x0 - svcnth (), z0), ++ svst4 (p0, x0 - svcnth (), z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_u16_m2: ++** decb x0, all, mul #2 ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_u16_m2, svuint16x4_t, uint16_t, ++ svst4_u16 (p0, x0 - svcnth () * 2, z0), ++ svst4 (p0, x0 - svcnth () * 2, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_u16_m3: ++** decb x0, all, mul #3 ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_u16_m3, svuint16x4_t, uint16_t, ++ svst4_u16 (p0, x0 - svcnth () * 3, z0), ++ svst4 (p0, x0 - svcnth () * 3, z0)) ++ ++/* ++** st4_u16_m4: ++** st4h {z0\.h - z3\.h}, p0, \[x0, #-4, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_u16_m4, svuint16x4_t, uint16_t, ++ svst4_u16 (p0, x0 - svcnth () * 4, z0), ++ svst4 (p0, x0 - svcnth () * 4, z0)) ++ ++/* ++** st4_u16_m32: ++** st4h {z0\.h - z3\.h}, p0, \[x0, #-32, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_u16_m32, svuint16x4_t, uint16_t, ++ svst4_u16 (p0, x0 - svcnth () * 32, z0), ++ svst4 (p0, x0 - svcnth () * 32, z0)) ++ ++/* ++** st4_u16_m36: ++** [^{]* ++** st4h {z0\.h - z3\.h}, p0, \[x[0-9]+\] ++** ret ++*/ ++TEST_STORE (st4_u16_m36, svuint16x4_t, uint16_t, ++ svst4_u16 (p0, x0 - svcnth () * 36, z0), ++ svst4 (p0, x0 - svcnth () * 36, z0)) ++ ++/* ++** st4_vnum_u16_0: ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u16_0, svuint16x4_t, uint16_t, ++ svst4_vnum_u16 (p0, x0, 0, z0), ++ svst4_vnum (p0, x0, 0, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_u16_1: ++** incb x0 ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u16_1, svuint16x4_t, uint16_t, ++ svst4_vnum_u16 (p0, x0, 1, z0), ++ svst4_vnum (p0, x0, 1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_u16_2: ++** incb x0, all, mul #2 ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u16_2, svuint16x4_t, uint16_t, ++ svst4_vnum_u16 (p0, x0, 2, z0), ++ svst4_vnum (p0, x0, 2, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st4_vnum_u16_3: ++** incb x0, all, mul #3 ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u16_3, svuint16x4_t, uint16_t, ++ svst4_vnum_u16 (p0, x0, 3, z0), ++ svst4_vnum (p0, x0, 3, z0)) ++ ++/* ++** st4_vnum_u16_4: ++** st4h {z0\.h - z3\.h}, p0, \[x0, #4, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u16_4, svuint16x4_t, uint16_t, ++ svst4_vnum_u16 (p0, x0, 4, z0), ++ svst4_vnum (p0, x0, 4, z0)) ++ ++/* ++** st4_vnum_u16_28: ++** st4h {z0\.h - z3\.h}, p0, \[x0, #28, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u16_28, svuint16x4_t, uint16_t, ++ svst4_vnum_u16 (p0, x0, 28, z0), ++ svst4_vnum (p0, x0, 28, z0)) ++ ++/* ++** st4_vnum_u16_32: ++** [^{]* ++** st4h {z0\.h - z3\.h}, p0, \[x[0-9]+\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u16_32, svuint16x4_t, uint16_t, ++ svst4_vnum_u16 (p0, x0, 32, z0), ++ svst4_vnum (p0, x0, 32, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_u16_m1: ++** decb x0 ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u16_m1, svuint16x4_t, uint16_t, ++ svst4_vnum_u16 (p0, x0, -1, z0), ++ svst4_vnum (p0, x0, -1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_u16_m2: ++** decb x0, all, mul #2 ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u16_m2, svuint16x4_t, uint16_t, ++ svst4_vnum_u16 (p0, x0, -2, z0), ++ svst4_vnum (p0, x0, -2, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_u16_m3: ++** decb x0, all, mul #3 ++** st4h {z0\.h - z3\.h}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u16_m3, svuint16x4_t, uint16_t, ++ svst4_vnum_u16 (p0, x0, -3, z0), ++ svst4_vnum (p0, x0, -3, z0)) ++ ++/* ++** st4_vnum_u16_m4: ++** st4h {z0\.h - z3\.h}, p0, \[x0, #-4, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u16_m4, svuint16x4_t, uint16_t, ++ svst4_vnum_u16 (p0, x0, -4, z0), ++ svst4_vnum (p0, x0, -4, z0)) ++ ++/* ++** st4_vnum_u16_m32: ++** st4h {z0\.h - z3\.h}, p0, \[x0, #-32, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u16_m32, svuint16x4_t, uint16_t, ++ svst4_vnum_u16 (p0, x0, -32, z0), ++ svst4_vnum (p0, x0, -32, z0)) ++ ++/* ++** st4_vnum_u16_m36: ++** [^{]* ++** st4h {z0\.h - z3\.h}, p0, \[x[0-9]+\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u16_m36, svuint16x4_t, uint16_t, ++ svst4_vnum_u16 (p0, x0, -36, z0), ++ svst4_vnum (p0, x0, -36, z0)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** st4_vnum_u16_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** st4h {z0\.h - z3\.h}, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u16_x1, svuint16x4_t, uint16_t, ++ svst4_vnum_u16 (p0, x0, x1, z0), ++ svst4_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_u32.c +new file mode 100644 +index 000000000..8b9b322e5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_u32.c +@@ -0,0 +1,286 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st4_u32_base: ++** st4w {z0\.s - z3\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_u32_base, svuint32x4_t, uint32_t, ++ svst4_u32 (p0, x0, z0), ++ svst4 (p0, x0, z0)) ++ ++/* ++** st4_u32_index: ++** st4w {z0\.s - z3\.s}, p0, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_STORE (st4_u32_index, svuint32x4_t, uint32_t, ++ svst4_u32 (p0, x0 + x1, z0), ++ svst4 (p0, x0 + x1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_u32_1: ++** incb x0 ++** st4w {z0\.s - z3\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_u32_1, svuint32x4_t, uint32_t, ++ svst4_u32 (p0, x0 + svcntw (), z0), ++ svst4 (p0, x0 + svcntw (), z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_u32_2: ++** incb x0, all, mul #2 ++** st4w {z0\.s - z3\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_u32_2, svuint32x4_t, uint32_t, ++ svst4_u32 (p0, x0 + svcntw () * 2, z0), ++ svst4 (p0, x0 + svcntw () * 2, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_u32_3: ++** incb x0, all, mul #3 ++** st4w {z0\.s - z3\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_u32_3, svuint32x4_t, uint32_t, ++ svst4_u32 (p0, x0 + svcntw () * 3, z0), ++ svst4 (p0, x0 + svcntw () * 3, z0)) ++ ++/* ++** st4_u32_4: ++** st4w {z0\.s - z3\.s}, p0, \[x0, #4, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_u32_4, svuint32x4_t, uint32_t, ++ svst4_u32 (p0, x0 + svcntw () * 4, z0), ++ svst4 (p0, x0 + svcntw () * 4, z0)) ++ ++/* ++** st4_u32_28: ++** st4w {z0\.s - z3\.s}, p0, \[x0, #28, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_u32_28, svuint32x4_t, uint32_t, ++ svst4_u32 (p0, x0 + svcntw () * 28, z0), ++ svst4 (p0, x0 + svcntw () * 28, z0)) ++ ++/* ++** st4_u32_32: ++** [^{]* ++** st4w {z0\.s - z3\.s}, p0, \[x[0-9]+\] ++** ret ++*/ ++TEST_STORE (st4_u32_32, svuint32x4_t, uint32_t, ++ svst4_u32 (p0, x0 + svcntw () * 32, z0), ++ svst4 (p0, x0 + svcntw () * 32, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_u32_m1: ++** decb x0 ++** st4w {z0\.s - z3\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_u32_m1, svuint32x4_t, uint32_t, ++ svst4_u32 (p0, x0 - svcntw (), z0), ++ svst4 (p0, x0 - svcntw (), z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_u32_m2: ++** decb x0, all, mul #2 ++** st4w {z0\.s - z3\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_u32_m2, svuint32x4_t, uint32_t, ++ svst4_u32 (p0, x0 - svcntw () * 2, z0), ++ svst4 (p0, x0 - svcntw () * 2, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st4_u32_m3: ++** decb x0, all, mul #3 ++** st4w {z0\.s - z3\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_u32_m3, svuint32x4_t, uint32_t, ++ svst4_u32 (p0, x0 - svcntw () * 3, z0), ++ svst4 (p0, x0 - svcntw () * 3, z0)) ++ ++/* ++** st4_u32_m4: ++** st4w {z0\.s - z3\.s}, p0, \[x0, #-4, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_u32_m4, svuint32x4_t, uint32_t, ++ svst4_u32 (p0, x0 - svcntw () * 4, z0), ++ svst4 (p0, x0 - svcntw () * 4, z0)) ++ ++/* ++** st4_u32_m32: ++** st4w {z0\.s - z3\.s}, p0, \[x0, #-32, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_u32_m32, svuint32x4_t, uint32_t, ++ svst4_u32 (p0, x0 - svcntw () * 32, z0), ++ svst4 (p0, x0 - svcntw () * 32, z0)) ++ ++/* ++** st4_u32_m36: ++** [^{]* ++** st4w {z0\.s - z3\.s}, p0, \[x[0-9]+\] ++** ret ++*/ ++TEST_STORE (st4_u32_m36, svuint32x4_t, uint32_t, ++ svst4_u32 (p0, x0 - svcntw () * 36, z0), ++ svst4 (p0, x0 - svcntw () * 36, z0)) ++ ++/* ++** st4_vnum_u32_0: ++** st4w {z0\.s - z3\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u32_0, svuint32x4_t, uint32_t, ++ svst4_vnum_u32 (p0, x0, 0, z0), ++ svst4_vnum (p0, x0, 0, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_u32_1: ++** incb x0 ++** st4w {z0\.s - z3\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u32_1, svuint32x4_t, uint32_t, ++ svst4_vnum_u32 (p0, x0, 1, z0), ++ svst4_vnum (p0, x0, 1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_u32_2: ++** incb x0, all, mul #2 ++** st4w {z0\.s - z3\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u32_2, svuint32x4_t, uint32_t, ++ svst4_vnum_u32 (p0, x0, 2, z0), ++ svst4_vnum (p0, x0, 2, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_u32_3: ++** incb x0, all, mul #3 ++** st4w {z0\.s - z3\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u32_3, svuint32x4_t, uint32_t, ++ svst4_vnum_u32 (p0, x0, 3, z0), ++ svst4_vnum (p0, x0, 3, z0)) ++ ++/* ++** st4_vnum_u32_4: ++** st4w {z0\.s - z3\.s}, p0, \[x0, #4, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u32_4, svuint32x4_t, uint32_t, ++ svst4_vnum_u32 (p0, x0, 4, z0), ++ svst4_vnum (p0, x0, 4, z0)) ++ ++/* ++** st4_vnum_u32_28: ++** st4w {z0\.s - z3\.s}, p0, \[x0, #28, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u32_28, svuint32x4_t, uint32_t, ++ svst4_vnum_u32 (p0, x0, 28, z0), ++ svst4_vnum (p0, x0, 28, z0)) ++ ++/* ++** st4_vnum_u32_32: ++** [^{]* ++** st4w {z0\.s - z3\.s}, p0, \[x[0-9]+\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u32_32, svuint32x4_t, uint32_t, ++ svst4_vnum_u32 (p0, x0, 32, z0), ++ svst4_vnum (p0, x0, 32, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_u32_m1: ++** decb x0 ++** st4w {z0\.s - z3\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u32_m1, svuint32x4_t, uint32_t, ++ svst4_vnum_u32 (p0, x0, -1, z0), ++ svst4_vnum (p0, x0, -1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_u32_m2: ++** decb x0, all, mul #2 ++** st4w {z0\.s - z3\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u32_m2, svuint32x4_t, uint32_t, ++ svst4_vnum_u32 (p0, x0, -2, z0), ++ svst4_vnum (p0, x0, -2, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st4_vnum_u32_m3: ++** decb x0, all, mul #3 ++** st4w {z0\.s - z3\.s}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u32_m3, svuint32x4_t, uint32_t, ++ svst4_vnum_u32 (p0, x0, -3, z0), ++ svst4_vnum (p0, x0, -3, z0)) ++ ++/* ++** st4_vnum_u32_m4: ++** st4w {z0\.s - z3\.s}, p0, \[x0, #-4, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u32_m4, svuint32x4_t, uint32_t, ++ svst4_vnum_u32 (p0, x0, -4, z0), ++ svst4_vnum (p0, x0, -4, z0)) ++ ++/* ++** st4_vnum_u32_m32: ++** st4w {z0\.s - z3\.s}, p0, \[x0, #-32, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u32_m32, svuint32x4_t, uint32_t, ++ svst4_vnum_u32 (p0, x0, -32, z0), ++ svst4_vnum (p0, x0, -32, z0)) ++ ++/* ++** st4_vnum_u32_m36: ++** [^{]* ++** st4w {z0\.s - z3\.s}, p0, \[x[0-9]+\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u32_m36, svuint32x4_t, uint32_t, ++ svst4_vnum_u32 (p0, x0, -36, z0), ++ svst4_vnum (p0, x0, -36, z0)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** st4_vnum_u32_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** st4w {z0\.s - z3\.s}, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u32_x1, svuint32x4_t, uint32_t, ++ svst4_vnum_u32 (p0, x0, x1, z0), ++ svst4_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_u64.c +new file mode 100644 +index 000000000..53b78f5ba +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_u64.c +@@ -0,0 +1,286 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st4_u64_base: ++** st4d {z0\.d - z3\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_u64_base, svuint64x4_t, uint64_t, ++ svst4_u64 (p0, x0, z0), ++ svst4 (p0, x0, z0)) ++ ++/* ++** st4_u64_index: ++** st4d {z0\.d - z3\.d}, p0, \[x0, x1, lsl 3\] ++** ret ++*/ ++TEST_STORE (st4_u64_index, svuint64x4_t, uint64_t, ++ svst4_u64 (p0, x0 + x1, z0), ++ svst4 (p0, x0 + x1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_u64_1: ++** incb x0 ++** st4d {z0\.d - z3\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_u64_1, svuint64x4_t, uint64_t, ++ svst4_u64 (p0, x0 + svcntd (), z0), ++ svst4 (p0, x0 + svcntd (), z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_u64_2: ++** incb x0, all, mul #2 ++** st4d {z0\.d - z3\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_u64_2, svuint64x4_t, uint64_t, ++ svst4_u64 (p0, x0 + svcntd () * 2, z0), ++ svst4 (p0, x0 + svcntd () * 2, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st4_u64_3: ++** incb x0, all, mul #3 ++** st4d {z0\.d - z3\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_u64_3, svuint64x4_t, uint64_t, ++ svst4_u64 (p0, x0 + svcntd () * 3, z0), ++ svst4 (p0, x0 + svcntd () * 3, z0)) ++ ++/* ++** st4_u64_4: ++** st4d {z0\.d - z3\.d}, p0, \[x0, #4, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_u64_4, svuint64x4_t, uint64_t, ++ svst4_u64 (p0, x0 + svcntd () * 4, z0), ++ svst4 (p0, x0 + svcntd () * 4, z0)) ++ ++/* ++** st4_u64_28: ++** st4d {z0\.d - z3\.d}, p0, \[x0, #28, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_u64_28, svuint64x4_t, uint64_t, ++ svst4_u64 (p0, x0 + svcntd () * 28, z0), ++ svst4 (p0, x0 + svcntd () * 28, z0)) ++ ++/* ++** st4_u64_32: ++** [^{]* ++** st4d {z0\.d - z3\.d}, p0, \[x[0-9]+\] ++** ret ++*/ ++TEST_STORE (st4_u64_32, svuint64x4_t, uint64_t, ++ svst4_u64 (p0, x0 + svcntd () * 32, z0), ++ svst4 (p0, x0 + svcntd () * 32, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_u64_m1: ++** decb x0 ++** st4d {z0\.d - z3\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_u64_m1, svuint64x4_t, uint64_t, ++ svst4_u64 (p0, x0 - svcntd (), z0), ++ svst4 (p0, x0 - svcntd (), z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_u64_m2: ++** decb x0, all, mul #2 ++** st4d {z0\.d - z3\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_u64_m2, svuint64x4_t, uint64_t, ++ svst4_u64 (p0, x0 - svcntd () * 2, z0), ++ svst4 (p0, x0 - svcntd () * 2, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_u64_m3: ++** decb x0, all, mul #3 ++** st4d {z0\.d - z3\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_u64_m3, svuint64x4_t, uint64_t, ++ svst4_u64 (p0, x0 - svcntd () * 3, z0), ++ svst4 (p0, x0 - svcntd () * 3, z0)) ++ ++/* ++** st4_u64_m4: ++** st4d {z0\.d - z3\.d}, p0, \[x0, #-4, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_u64_m4, svuint64x4_t, uint64_t, ++ svst4_u64 (p0, x0 - svcntd () * 4, z0), ++ svst4 (p0, x0 - svcntd () * 4, z0)) ++ ++/* ++** st4_u64_m32: ++** st4d {z0\.d - z3\.d}, p0, \[x0, #-32, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_u64_m32, svuint64x4_t, uint64_t, ++ svst4_u64 (p0, x0 - svcntd () * 32, z0), ++ svst4 (p0, x0 - svcntd () * 32, z0)) ++ ++/* ++** st4_u64_m36: ++** [^{]* ++** st4d {z0\.d - z3\.d}, p0, \[x[0-9]+\] ++** ret ++*/ ++TEST_STORE (st4_u64_m36, svuint64x4_t, uint64_t, ++ svst4_u64 (p0, x0 - svcntd () * 36, z0), ++ svst4 (p0, x0 - svcntd () * 36, z0)) ++ ++/* ++** st4_vnum_u64_0: ++** st4d {z0\.d - z3\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u64_0, svuint64x4_t, uint64_t, ++ svst4_vnum_u64 (p0, x0, 0, z0), ++ svst4_vnum (p0, x0, 0, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_u64_1: ++** incb x0 ++** st4d {z0\.d - z3\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u64_1, svuint64x4_t, uint64_t, ++ svst4_vnum_u64 (p0, x0, 1, z0), ++ svst4_vnum (p0, x0, 1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_u64_2: ++** incb x0, all, mul #2 ++** st4d {z0\.d - z3\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u64_2, svuint64x4_t, uint64_t, ++ svst4_vnum_u64 (p0, x0, 2, z0), ++ svst4_vnum (p0, x0, 2, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st4_vnum_u64_3: ++** incb x0, all, mul #3 ++** st4d {z0\.d - z3\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u64_3, svuint64x4_t, uint64_t, ++ svst4_vnum_u64 (p0, x0, 3, z0), ++ svst4_vnum (p0, x0, 3, z0)) ++ ++/* ++** st4_vnum_u64_4: ++** st4d {z0\.d - z3\.d}, p0, \[x0, #4, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u64_4, svuint64x4_t, uint64_t, ++ svst4_vnum_u64 (p0, x0, 4, z0), ++ svst4_vnum (p0, x0, 4, z0)) ++ ++/* ++** st4_vnum_u64_28: ++** st4d {z0\.d - z3\.d}, p0, \[x0, #28, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u64_28, svuint64x4_t, uint64_t, ++ svst4_vnum_u64 (p0, x0, 28, z0), ++ svst4_vnum (p0, x0, 28, z0)) ++ ++/* ++** st4_vnum_u64_32: ++** [^{]* ++** st4d {z0\.d - z3\.d}, p0, \[x[0-9]+\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u64_32, svuint64x4_t, uint64_t, ++ svst4_vnum_u64 (p0, x0, 32, z0), ++ svst4_vnum (p0, x0, 32, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_u64_m1: ++** decb x0 ++** st4d {z0\.d - z3\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u64_m1, svuint64x4_t, uint64_t, ++ svst4_vnum_u64 (p0, x0, -1, z0), ++ svst4_vnum (p0, x0, -1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_u64_m2: ++** decb x0, all, mul #2 ++** st4d {z0\.d - z3\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u64_m2, svuint64x4_t, uint64_t, ++ svst4_vnum_u64 (p0, x0, -2, z0), ++ svst4_vnum (p0, x0, -2, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_u64_m3: ++** decb x0, all, mul #3 ++** st4d {z0\.d - z3\.d}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u64_m3, svuint64x4_t, uint64_t, ++ svst4_vnum_u64 (p0, x0, -3, z0), ++ svst4_vnum (p0, x0, -3, z0)) ++ ++/* ++** st4_vnum_u64_m4: ++** st4d {z0\.d - z3\.d}, p0, \[x0, #-4, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u64_m4, svuint64x4_t, uint64_t, ++ svst4_vnum_u64 (p0, x0, -4, z0), ++ svst4_vnum (p0, x0, -4, z0)) ++ ++/* ++** st4_vnum_u64_m32: ++** st4d {z0\.d - z3\.d}, p0, \[x0, #-32, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u64_m32, svuint64x4_t, uint64_t, ++ svst4_vnum_u64 (p0, x0, -32, z0), ++ svst4_vnum (p0, x0, -32, z0)) ++ ++/* ++** st4_vnum_u64_m36: ++** [^{]* ++** st4d {z0\.d - z3\.d}, p0, \[x[0-9]+\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u64_m36, svuint64x4_t, uint64_t, ++ svst4_vnum_u64 (p0, x0, -36, z0), ++ svst4_vnum (p0, x0, -36, z0)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** st4_vnum_u64_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** st4d {z0\.d - z3\.d}, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u64_x1, svuint64x4_t, uint64_t, ++ svst4_vnum_u64 (p0, x0, x1, z0), ++ svst4_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_u8.c +new file mode 100644 +index 000000000..e7c2e7d76 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_u8.c +@@ -0,0 +1,290 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** st4_u8_base: ++** st4b {z0\.b - z3\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_u8_base, svuint8x4_t, uint8_t, ++ svst4_u8 (p0, x0, z0), ++ svst4 (p0, x0, z0)) ++ ++/* ++** st4_u8_index: ++** st4b {z0\.b - z3\.b}, p0, \[x0, x1\] ++** ret ++*/ ++TEST_STORE (st4_u8_index, svuint8x4_t, uint8_t, ++ svst4_u8 (p0, x0 + x1, z0), ++ svst4 (p0, x0 + x1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_u8_1: ++** incb x0 ++** st4b {z0\.b - z3\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_u8_1, svuint8x4_t, uint8_t, ++ svst4_u8 (p0, x0 + svcntb (), z0), ++ svst4 (p0, x0 + svcntb (), z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_u8_2: ++** incb x0, all, mul #2 ++** st4b {z0\.b - z3\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_u8_2, svuint8x4_t, uint8_t, ++ svst4_u8 (p0, x0 + svcntb () * 2, z0), ++ svst4 (p0, x0 + svcntb () * 2, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_u8_3: ++** incb x0, all, mul #3 ++** st4b {z0\.b - z3\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_u8_3, svuint8x4_t, uint8_t, ++ svst4_u8 (p0, x0 + svcntb () * 3, z0), ++ svst4 (p0, x0 + svcntb () * 3, z0)) ++ ++/* ++** st4_u8_4: ++** st4b {z0\.b - z3\.b}, p0, \[x0, #4, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_u8_4, svuint8x4_t, uint8_t, ++ svst4_u8 (p0, x0 + svcntb () * 4, z0), ++ svst4 (p0, x0 + svcntb () * 4, z0)) ++ ++/* ++** st4_u8_28: ++** st4b {z0\.b - z3\.b}, p0, \[x0, #28, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_u8_28, svuint8x4_t, uint8_t, ++ svst4_u8 (p0, x0 + svcntb () * 28, z0), ++ svst4 (p0, x0 + svcntb () * 28, z0)) ++ ++/* ++** st4_u8_32: ++** [^{]* ++** st4b {z0\.b - z3\.b}, p0, \[x[0-9]+\] ++** ret ++*/ ++TEST_STORE (st4_u8_32, svuint8x4_t, uint8_t, ++ svst4_u8 (p0, x0 + svcntb () * 32, z0), ++ svst4 (p0, x0 + svcntb () * 32, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_u8_m1: ++** decb x0 ++** st4b {z0\.b - z3\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_u8_m1, svuint8x4_t, uint8_t, ++ svst4_u8 (p0, x0 - svcntb (), z0), ++ svst4 (p0, x0 - svcntb (), z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_u8_m2: ++** decb x0, all, mul #2 ++** st4b {z0\.b - z3\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_u8_m2, svuint8x4_t, uint8_t, ++ svst4_u8 (p0, x0 - svcntb () * 2, z0), ++ svst4 (p0, x0 - svcntb () * 2, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st4_u8_m3: ++** decb x0, all, mul #3 ++** st4b {z0\.b - z3\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_u8_m3, svuint8x4_t, uint8_t, ++ svst4_u8 (p0, x0 - svcntb () * 3, z0), ++ svst4 (p0, x0 - svcntb () * 3, z0)) ++ ++/* ++** st4_u8_m4: ++** st4b {z0\.b - z3\.b}, p0, \[x0, #-4, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_u8_m4, svuint8x4_t, uint8_t, ++ svst4_u8 (p0, x0 - svcntb () * 4, z0), ++ svst4 (p0, x0 - svcntb () * 4, z0)) ++ ++/* ++** st4_u8_m32: ++** st4b {z0\.b - z3\.b}, p0, \[x0, #-32, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_u8_m32, svuint8x4_t, uint8_t, ++ svst4_u8 (p0, x0 - svcntb () * 32, z0), ++ svst4 (p0, x0 - svcntb () * 32, z0)) ++ ++/* ++** st4_u8_m36: ++** [^{]* ++** st4b {z0\.b - z3\.b}, p0, \[x[0-9]+\] ++** ret ++*/ ++TEST_STORE (st4_u8_m36, svuint8x4_t, uint8_t, ++ svst4_u8 (p0, x0 - svcntb () * 36, z0), ++ svst4 (p0, x0 - svcntb () * 36, z0)) ++ ++/* ++** st4_vnum_u8_0: ++** st4b {z0\.b - z3\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u8_0, svuint8x4_t, uint8_t, ++ svst4_vnum_u8 (p0, x0, 0, z0), ++ svst4_vnum (p0, x0, 0, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_u8_1: ++** incb x0 ++** st4b {z0\.b - z3\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u8_1, svuint8x4_t, uint8_t, ++ svst4_vnum_u8 (p0, x0, 1, z0), ++ svst4_vnum (p0, x0, 1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_u8_2: ++** incb x0, all, mul #2 ++** st4b {z0\.b - z3\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u8_2, svuint8x4_t, uint8_t, ++ svst4_vnum_u8 (p0, x0, 2, z0), ++ svst4_vnum (p0, x0, 2, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_u8_3: ++** incb x0, all, mul #3 ++** st4b {z0\.b - z3\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u8_3, svuint8x4_t, uint8_t, ++ svst4_vnum_u8 (p0, x0, 3, z0), ++ svst4_vnum (p0, x0, 3, z0)) ++ ++/* ++** st4_vnum_u8_4: ++** st4b {z0\.b - z3\.b}, p0, \[x0, #4, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u8_4, svuint8x4_t, uint8_t, ++ svst4_vnum_u8 (p0, x0, 4, z0), ++ svst4_vnum (p0, x0, 4, z0)) ++ ++/* ++** st4_vnum_u8_28: ++** st4b {z0\.b - z3\.b}, p0, \[x0, #28, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u8_28, svuint8x4_t, uint8_t, ++ svst4_vnum_u8 (p0, x0, 28, z0), ++ svst4_vnum (p0, x0, 28, z0)) ++ ++/* ++** st4_vnum_u8_32: ++** [^{]* ++** st4b {z0\.b - z3\.b}, p0, \[x[0-9]+\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u8_32, svuint8x4_t, uint8_t, ++ svst4_vnum_u8 (p0, x0, 32, z0), ++ svst4_vnum (p0, x0, 32, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_u8_m1: ++** decb x0 ++** st4b {z0\.b - z3\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u8_m1, svuint8x4_t, uint8_t, ++ svst4_vnum_u8 (p0, x0, -1, z0), ++ svst4_vnum (p0, x0, -1, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** st4_vnum_u8_m2: ++** decb x0, all, mul #2 ++** st4b {z0\.b - z3\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u8_m2, svuint8x4_t, uint8_t, ++ svst4_vnum_u8 (p0, x0, -2, z0), ++ svst4_vnum (p0, x0, -2, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** st4_vnum_u8_m3: ++** decb x0, all, mul #3 ++** st4b {z0\.b - z3\.b}, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u8_m3, svuint8x4_t, uint8_t, ++ svst4_vnum_u8 (p0, x0, -3, z0), ++ svst4_vnum (p0, x0, -3, z0)) ++ ++/* ++** st4_vnum_u8_m4: ++** st4b {z0\.b - z3\.b}, p0, \[x0, #-4, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u8_m4, svuint8x4_t, uint8_t, ++ svst4_vnum_u8 (p0, x0, -4, z0), ++ svst4_vnum (p0, x0, -4, z0)) ++ ++/* ++** st4_vnum_u8_m32: ++** st4b {z0\.b - z3\.b}, p0, \[x0, #-32, mul vl\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u8_m32, svuint8x4_t, uint8_t, ++ svst4_vnum_u8 (p0, x0, -32, z0), ++ svst4_vnum (p0, x0, -32, z0)) ++ ++/* ++** st4_vnum_u8_m36: ++** [^{]* ++** st4b {z0\.b - z3\.b}, p0, \[x[0-9]+\] ++** ret ++*/ ++TEST_STORE (st4_vnum_u8_m36, svuint8x4_t, uint8_t, ++ svst4_vnum_u8 (p0, x0, -36, z0), ++ svst4_vnum (p0, x0, -36, z0)) ++ ++/* ++** st4_vnum_u8_x1: ++** cntb (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** st4b {z0\.b - z3\.b}, p0, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** st4b {z0\.b - z3\.b}, p0, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_STORE (st4_vnum_u8_x1, svuint8x4_t, uint8_t, ++ svst4_vnum_u8 (p0, x0, x1, z0), ++ svst4_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_bf16.c +new file mode 100644 +index 000000000..3c4d21f27 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_bf16.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** stnt1_bf16_base: ++** stnt1h z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_bf16_base, svbfloat16_t, bfloat16_t, ++ svstnt1_bf16 (p0, x0, z0), ++ svstnt1 (p0, x0, z0)) ++ ++/* ++** stnt1_bf16_index: ++** stnt1h z0\.h, p0, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_STORE (stnt1_bf16_index, svbfloat16_t, bfloat16_t, ++ svstnt1_bf16 (p0, x0 + x1, z0), ++ svstnt1 (p0, x0 + x1, z0)) ++ ++/* ++** stnt1_bf16_1: ++** stnt1h z0\.h, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_bf16_1, svbfloat16_t, bfloat16_t, ++ svstnt1_bf16 (p0, x0 + svcnth (), z0), ++ svstnt1 (p0, x0 + svcnth (), z0)) ++ ++/* ++** stnt1_bf16_7: ++** stnt1h z0\.h, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_bf16_7, svbfloat16_t, bfloat16_t, ++ svstnt1_bf16 (p0, x0 + svcnth () * 7, z0), ++ svstnt1 (p0, x0 + svcnth () * 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** stnt1_bf16_8: ++** incb x0, all, mul #8 ++** stnt1h z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_bf16_8, svbfloat16_t, bfloat16_t, ++ svstnt1_bf16 (p0, x0 + svcnth () * 8, z0), ++ svstnt1 (p0, x0 + svcnth () * 8, z0)) ++ ++/* ++** stnt1_bf16_m1: ++** stnt1h z0\.h, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_bf16_m1, svbfloat16_t, bfloat16_t, ++ svstnt1_bf16 (p0, x0 - svcnth (), z0), ++ svstnt1 (p0, x0 - svcnth (), z0)) ++ ++/* ++** stnt1_bf16_m8: ++** stnt1h z0\.h, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_bf16_m8, svbfloat16_t, bfloat16_t, ++ svstnt1_bf16 (p0, x0 - svcnth () * 8, z0), ++ svstnt1 (p0, x0 - svcnth () * 8, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** stnt1_bf16_m9: ++** decb x0, all, mul #9 ++** stnt1h z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_bf16_m9, svbfloat16_t, bfloat16_t, ++ svstnt1_bf16 (p0, x0 - svcnth () * 9, z0), ++ svstnt1 (p0, x0 - svcnth () * 9, z0)) ++ ++/* ++** stnt1_vnum_bf16_0: ++** stnt1h z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_bf16_0, svbfloat16_t, bfloat16_t, ++ svstnt1_vnum_bf16 (p0, x0, 0, z0), ++ svstnt1_vnum (p0, x0, 0, z0)) ++ ++/* ++** stnt1_vnum_bf16_1: ++** stnt1h z0\.h, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_bf16_1, svbfloat16_t, bfloat16_t, ++ svstnt1_vnum_bf16 (p0, x0, 1, z0), ++ svstnt1_vnum (p0, x0, 1, z0)) ++ ++/* ++** stnt1_vnum_bf16_7: ++** stnt1h z0\.h, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_bf16_7, svbfloat16_t, bfloat16_t, ++ svstnt1_vnum_bf16 (p0, x0, 7, z0), ++ svstnt1_vnum (p0, x0, 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** stnt1_vnum_bf16_8: ++** incb x0, all, mul #8 ++** stnt1h z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_bf16_8, svbfloat16_t, bfloat16_t, ++ svstnt1_vnum_bf16 (p0, x0, 8, z0), ++ svstnt1_vnum (p0, x0, 8, z0)) ++ ++/* ++** stnt1_vnum_bf16_m1: ++** stnt1h z0\.h, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_bf16_m1, svbfloat16_t, bfloat16_t, ++ svstnt1_vnum_bf16 (p0, x0, -1, z0), ++ svstnt1_vnum (p0, x0, -1, z0)) ++ ++/* ++** stnt1_vnum_bf16_m8: ++** stnt1h z0\.h, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_bf16_m8, svbfloat16_t, bfloat16_t, ++ svstnt1_vnum_bf16 (p0, x0, -8, z0), ++ svstnt1_vnum (p0, x0, -8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** stnt1_vnum_bf16_m9: ++** decb x0, all, mul #9 ++** stnt1h z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_bf16_m9, svbfloat16_t, bfloat16_t, ++ svstnt1_vnum_bf16 (p0, x0, -9, z0), ++ svstnt1_vnum (p0, x0, -9, z0)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** stnt1_vnum_bf16_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** stnt1h z0\.h, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_bf16_x1, svbfloat16_t, bfloat16_t, ++ svstnt1_vnum_bf16 (p0, x0, x1, z0), ++ svstnt1_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_f16.c +new file mode 100644 +index 000000000..a3d89caf1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_f16.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** stnt1_f16_base: ++** stnt1h z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_f16_base, svfloat16_t, float16_t, ++ svstnt1_f16 (p0, x0, z0), ++ svstnt1 (p0, x0, z0)) ++ ++/* ++** stnt1_f16_index: ++** stnt1h z0\.h, p0, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_STORE (stnt1_f16_index, svfloat16_t, float16_t, ++ svstnt1_f16 (p0, x0 + x1, z0), ++ svstnt1 (p0, x0 + x1, z0)) ++ ++/* ++** stnt1_f16_1: ++** stnt1h z0\.h, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_f16_1, svfloat16_t, float16_t, ++ svstnt1_f16 (p0, x0 + svcnth (), z0), ++ svstnt1 (p0, x0 + svcnth (), z0)) ++ ++/* ++** stnt1_f16_7: ++** stnt1h z0\.h, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_f16_7, svfloat16_t, float16_t, ++ svstnt1_f16 (p0, x0 + svcnth () * 7, z0), ++ svstnt1 (p0, x0 + svcnth () * 7, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** stnt1_f16_8: ++** incb x0, all, mul #8 ++** stnt1h z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_f16_8, svfloat16_t, float16_t, ++ svstnt1_f16 (p0, x0 + svcnth () * 8, z0), ++ svstnt1 (p0, x0 + svcnth () * 8, z0)) ++ ++/* ++** stnt1_f16_m1: ++** stnt1h z0\.h, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_f16_m1, svfloat16_t, float16_t, ++ svstnt1_f16 (p0, x0 - svcnth (), z0), ++ svstnt1 (p0, x0 - svcnth (), z0)) ++ ++/* ++** stnt1_f16_m8: ++** stnt1h z0\.h, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_f16_m8, svfloat16_t, float16_t, ++ svstnt1_f16 (p0, x0 - svcnth () * 8, z0), ++ svstnt1 (p0, x0 - svcnth () * 8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** stnt1_f16_m9: ++** decb x0, all, mul #9 ++** stnt1h z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_f16_m9, svfloat16_t, float16_t, ++ svstnt1_f16 (p0, x0 - svcnth () * 9, z0), ++ svstnt1 (p0, x0 - svcnth () * 9, z0)) ++ ++/* ++** stnt1_vnum_f16_0: ++** stnt1h z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_f16_0, svfloat16_t, float16_t, ++ svstnt1_vnum_f16 (p0, x0, 0, z0), ++ svstnt1_vnum (p0, x0, 0, z0)) ++ ++/* ++** stnt1_vnum_f16_1: ++** stnt1h z0\.h, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_f16_1, svfloat16_t, float16_t, ++ svstnt1_vnum_f16 (p0, x0, 1, z0), ++ svstnt1_vnum (p0, x0, 1, z0)) ++ ++/* ++** stnt1_vnum_f16_7: ++** stnt1h z0\.h, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_f16_7, svfloat16_t, float16_t, ++ svstnt1_vnum_f16 (p0, x0, 7, z0), ++ svstnt1_vnum (p0, x0, 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** stnt1_vnum_f16_8: ++** incb x0, all, mul #8 ++** stnt1h z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_f16_8, svfloat16_t, float16_t, ++ svstnt1_vnum_f16 (p0, x0, 8, z0), ++ svstnt1_vnum (p0, x0, 8, z0)) ++ ++/* ++** stnt1_vnum_f16_m1: ++** stnt1h z0\.h, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_f16_m1, svfloat16_t, float16_t, ++ svstnt1_vnum_f16 (p0, x0, -1, z0), ++ svstnt1_vnum (p0, x0, -1, z0)) ++ ++/* ++** stnt1_vnum_f16_m8: ++** stnt1h z0\.h, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_f16_m8, svfloat16_t, float16_t, ++ svstnt1_vnum_f16 (p0, x0, -8, z0), ++ svstnt1_vnum (p0, x0, -8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** stnt1_vnum_f16_m9: ++** decb x0, all, mul #9 ++** stnt1h z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_f16_m9, svfloat16_t, float16_t, ++ svstnt1_vnum_f16 (p0, x0, -9, z0), ++ svstnt1_vnum (p0, x0, -9, z0)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** stnt1_vnum_f16_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** stnt1h z0\.h, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_f16_x1, svfloat16_t, float16_t, ++ svstnt1_vnum_f16 (p0, x0, x1, z0), ++ svstnt1_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_f32.c +new file mode 100644 +index 000000000..24e890512 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_f32.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** stnt1_f32_base: ++** stnt1w z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_f32_base, svfloat32_t, float32_t, ++ svstnt1_f32 (p0, x0, z0), ++ svstnt1 (p0, x0, z0)) ++ ++/* ++** stnt1_f32_index: ++** stnt1w z0\.s, p0, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_STORE (stnt1_f32_index, svfloat32_t, float32_t, ++ svstnt1_f32 (p0, x0 + x1, z0), ++ svstnt1 (p0, x0 + x1, z0)) ++ ++/* ++** stnt1_f32_1: ++** stnt1w z0\.s, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_f32_1, svfloat32_t, float32_t, ++ svstnt1_f32 (p0, x0 + svcntw (), z0), ++ svstnt1 (p0, x0 + svcntw (), z0)) ++ ++/* ++** stnt1_f32_7: ++** stnt1w z0\.s, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_f32_7, svfloat32_t, float32_t, ++ svstnt1_f32 (p0, x0 + svcntw () * 7, z0), ++ svstnt1 (p0, x0 + svcntw () * 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** stnt1_f32_8: ++** incb x0, all, mul #8 ++** stnt1w z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_f32_8, svfloat32_t, float32_t, ++ svstnt1_f32 (p0, x0 + svcntw () * 8, z0), ++ svstnt1 (p0, x0 + svcntw () * 8, z0)) ++ ++/* ++** stnt1_f32_m1: ++** stnt1w z0\.s, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_f32_m1, svfloat32_t, float32_t, ++ svstnt1_f32 (p0, x0 - svcntw (), z0), ++ svstnt1 (p0, x0 - svcntw (), z0)) ++ ++/* ++** stnt1_f32_m8: ++** stnt1w z0\.s, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_f32_m8, svfloat32_t, float32_t, ++ svstnt1_f32 (p0, x0 - svcntw () * 8, z0), ++ svstnt1 (p0, x0 - svcntw () * 8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** stnt1_f32_m9: ++** decb x0, all, mul #9 ++** stnt1w z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_f32_m9, svfloat32_t, float32_t, ++ svstnt1_f32 (p0, x0 - svcntw () * 9, z0), ++ svstnt1 (p0, x0 - svcntw () * 9, z0)) ++ ++/* ++** stnt1_vnum_f32_0: ++** stnt1w z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_f32_0, svfloat32_t, float32_t, ++ svstnt1_vnum_f32 (p0, x0, 0, z0), ++ svstnt1_vnum (p0, x0, 0, z0)) ++ ++/* ++** stnt1_vnum_f32_1: ++** stnt1w z0\.s, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_f32_1, svfloat32_t, float32_t, ++ svstnt1_vnum_f32 (p0, x0, 1, z0), ++ svstnt1_vnum (p0, x0, 1, z0)) ++ ++/* ++** stnt1_vnum_f32_7: ++** stnt1w z0\.s, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_f32_7, svfloat32_t, float32_t, ++ svstnt1_vnum_f32 (p0, x0, 7, z0), ++ svstnt1_vnum (p0, x0, 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** stnt1_vnum_f32_8: ++** incb x0, all, mul #8 ++** stnt1w z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_f32_8, svfloat32_t, float32_t, ++ svstnt1_vnum_f32 (p0, x0, 8, z0), ++ svstnt1_vnum (p0, x0, 8, z0)) ++ ++/* ++** stnt1_vnum_f32_m1: ++** stnt1w z0\.s, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_f32_m1, svfloat32_t, float32_t, ++ svstnt1_vnum_f32 (p0, x0, -1, z0), ++ svstnt1_vnum (p0, x0, -1, z0)) ++ ++/* ++** stnt1_vnum_f32_m8: ++** stnt1w z0\.s, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_f32_m8, svfloat32_t, float32_t, ++ svstnt1_vnum_f32 (p0, x0, -8, z0), ++ svstnt1_vnum (p0, x0, -8, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** stnt1_vnum_f32_m9: ++** decb x0, all, mul #9 ++** stnt1w z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_f32_m9, svfloat32_t, float32_t, ++ svstnt1_vnum_f32 (p0, x0, -9, z0), ++ svstnt1_vnum (p0, x0, -9, z0)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** stnt1_vnum_f32_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** stnt1w z0\.s, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_f32_x1, svfloat32_t, float32_t, ++ svstnt1_vnum_f32 (p0, x0, x1, z0), ++ svstnt1_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_f64.c +new file mode 100644 +index 000000000..9555a1faf +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_f64.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** stnt1_f64_base: ++** stnt1d z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_f64_base, svfloat64_t, float64_t, ++ svstnt1_f64 (p0, x0, z0), ++ svstnt1 (p0, x0, z0)) ++ ++/* ++** stnt1_f64_index: ++** stnt1d z0\.d, p0, \[x0, x1, lsl 3\] ++** ret ++*/ ++TEST_STORE (stnt1_f64_index, svfloat64_t, float64_t, ++ svstnt1_f64 (p0, x0 + x1, z0), ++ svstnt1 (p0, x0 + x1, z0)) ++ ++/* ++** stnt1_f64_1: ++** stnt1d z0\.d, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_f64_1, svfloat64_t, float64_t, ++ svstnt1_f64 (p0, x0 + svcntd (), z0), ++ svstnt1 (p0, x0 + svcntd (), z0)) ++ ++/* ++** stnt1_f64_7: ++** stnt1d z0\.d, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_f64_7, svfloat64_t, float64_t, ++ svstnt1_f64 (p0, x0 + svcntd () * 7, z0), ++ svstnt1 (p0, x0 + svcntd () * 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** stnt1_f64_8: ++** incb x0, all, mul #8 ++** stnt1d z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_f64_8, svfloat64_t, float64_t, ++ svstnt1_f64 (p0, x0 + svcntd () * 8, z0), ++ svstnt1 (p0, x0 + svcntd () * 8, z0)) ++ ++/* ++** stnt1_f64_m1: ++** stnt1d z0\.d, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_f64_m1, svfloat64_t, float64_t, ++ svstnt1_f64 (p0, x0 - svcntd (), z0), ++ svstnt1 (p0, x0 - svcntd (), z0)) ++ ++/* ++** stnt1_f64_m8: ++** stnt1d z0\.d, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_f64_m8, svfloat64_t, float64_t, ++ svstnt1_f64 (p0, x0 - svcntd () * 8, z0), ++ svstnt1 (p0, x0 - svcntd () * 8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** stnt1_f64_m9: ++** decb x0, all, mul #9 ++** stnt1d z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_f64_m9, svfloat64_t, float64_t, ++ svstnt1_f64 (p0, x0 - svcntd () * 9, z0), ++ svstnt1 (p0, x0 - svcntd () * 9, z0)) ++ ++/* ++** stnt1_vnum_f64_0: ++** stnt1d z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_f64_0, svfloat64_t, float64_t, ++ svstnt1_vnum_f64 (p0, x0, 0, z0), ++ svstnt1_vnum (p0, x0, 0, z0)) ++ ++/* ++** stnt1_vnum_f64_1: ++** stnt1d z0\.d, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_f64_1, svfloat64_t, float64_t, ++ svstnt1_vnum_f64 (p0, x0, 1, z0), ++ svstnt1_vnum (p0, x0, 1, z0)) ++ ++/* ++** stnt1_vnum_f64_7: ++** stnt1d z0\.d, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_f64_7, svfloat64_t, float64_t, ++ svstnt1_vnum_f64 (p0, x0, 7, z0), ++ svstnt1_vnum (p0, x0, 7, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** stnt1_vnum_f64_8: ++** incb x0, all, mul #8 ++** stnt1d z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_f64_8, svfloat64_t, float64_t, ++ svstnt1_vnum_f64 (p0, x0, 8, z0), ++ svstnt1_vnum (p0, x0, 8, z0)) ++ ++/* ++** stnt1_vnum_f64_m1: ++** stnt1d z0\.d, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_f64_m1, svfloat64_t, float64_t, ++ svstnt1_vnum_f64 (p0, x0, -1, z0), ++ svstnt1_vnum (p0, x0, -1, z0)) ++ ++/* ++** stnt1_vnum_f64_m8: ++** stnt1d z0\.d, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_f64_m8, svfloat64_t, float64_t, ++ svstnt1_vnum_f64 (p0, x0, -8, z0), ++ svstnt1_vnum (p0, x0, -8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** stnt1_vnum_f64_m9: ++** decb x0, all, mul #9 ++** stnt1d z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_f64_m9, svfloat64_t, float64_t, ++ svstnt1_vnum_f64 (p0, x0, -9, z0), ++ svstnt1_vnum (p0, x0, -9, z0)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** stnt1_vnum_f64_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** stnt1d z0\.d, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_f64_x1, svfloat64_t, float64_t, ++ svstnt1_vnum_f64 (p0, x0, x1, z0), ++ svstnt1_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_s16.c +new file mode 100644 +index 000000000..62e31450d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_s16.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** stnt1_s16_base: ++** stnt1h z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_s16_base, svint16_t, int16_t, ++ svstnt1_s16 (p0, x0, z0), ++ svstnt1 (p0, x0, z0)) ++ ++/* ++** stnt1_s16_index: ++** stnt1h z0\.h, p0, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_STORE (stnt1_s16_index, svint16_t, int16_t, ++ svstnt1_s16 (p0, x0 + x1, z0), ++ svstnt1 (p0, x0 + x1, z0)) ++ ++/* ++** stnt1_s16_1: ++** stnt1h z0\.h, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_s16_1, svint16_t, int16_t, ++ svstnt1_s16 (p0, x0 + svcnth (), z0), ++ svstnt1 (p0, x0 + svcnth (), z0)) ++ ++/* ++** stnt1_s16_7: ++** stnt1h z0\.h, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_s16_7, svint16_t, int16_t, ++ svstnt1_s16 (p0, x0 + svcnth () * 7, z0), ++ svstnt1 (p0, x0 + svcnth () * 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** stnt1_s16_8: ++** incb x0, all, mul #8 ++** stnt1h z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_s16_8, svint16_t, int16_t, ++ svstnt1_s16 (p0, x0 + svcnth () * 8, z0), ++ svstnt1 (p0, x0 + svcnth () * 8, z0)) ++ ++/* ++** stnt1_s16_m1: ++** stnt1h z0\.h, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_s16_m1, svint16_t, int16_t, ++ svstnt1_s16 (p0, x0 - svcnth (), z0), ++ svstnt1 (p0, x0 - svcnth (), z0)) ++ ++/* ++** stnt1_s16_m8: ++** stnt1h z0\.h, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_s16_m8, svint16_t, int16_t, ++ svstnt1_s16 (p0, x0 - svcnth () * 8, z0), ++ svstnt1 (p0, x0 - svcnth () * 8, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** stnt1_s16_m9: ++** decb x0, all, mul #9 ++** stnt1h z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_s16_m9, svint16_t, int16_t, ++ svstnt1_s16 (p0, x0 - svcnth () * 9, z0), ++ svstnt1 (p0, x0 - svcnth () * 9, z0)) ++ ++/* ++** stnt1_vnum_s16_0: ++** stnt1h z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_s16_0, svint16_t, int16_t, ++ svstnt1_vnum_s16 (p0, x0, 0, z0), ++ svstnt1_vnum (p0, x0, 0, z0)) ++ ++/* ++** stnt1_vnum_s16_1: ++** stnt1h z0\.h, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_s16_1, svint16_t, int16_t, ++ svstnt1_vnum_s16 (p0, x0, 1, z0), ++ svstnt1_vnum (p0, x0, 1, z0)) ++ ++/* ++** stnt1_vnum_s16_7: ++** stnt1h z0\.h, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_s16_7, svint16_t, int16_t, ++ svstnt1_vnum_s16 (p0, x0, 7, z0), ++ svstnt1_vnum (p0, x0, 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** stnt1_vnum_s16_8: ++** incb x0, all, mul #8 ++** stnt1h z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_s16_8, svint16_t, int16_t, ++ svstnt1_vnum_s16 (p0, x0, 8, z0), ++ svstnt1_vnum (p0, x0, 8, z0)) ++ ++/* ++** stnt1_vnum_s16_m1: ++** stnt1h z0\.h, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_s16_m1, svint16_t, int16_t, ++ svstnt1_vnum_s16 (p0, x0, -1, z0), ++ svstnt1_vnum (p0, x0, -1, z0)) ++ ++/* ++** stnt1_vnum_s16_m8: ++** stnt1h z0\.h, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_s16_m8, svint16_t, int16_t, ++ svstnt1_vnum_s16 (p0, x0, -8, z0), ++ svstnt1_vnum (p0, x0, -8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** stnt1_vnum_s16_m9: ++** decb x0, all, mul #9 ++** stnt1h z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_s16_m9, svint16_t, int16_t, ++ svstnt1_vnum_s16 (p0, x0, -9, z0), ++ svstnt1_vnum (p0, x0, -9, z0)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** stnt1_vnum_s16_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** stnt1h z0\.h, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_s16_x1, svint16_t, int16_t, ++ svstnt1_vnum_s16 (p0, x0, x1, z0), ++ svstnt1_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_s32.c +new file mode 100644 +index 000000000..ff1f27c05 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_s32.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** stnt1_s32_base: ++** stnt1w z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_s32_base, svint32_t, int32_t, ++ svstnt1_s32 (p0, x0, z0), ++ svstnt1 (p0, x0, z0)) ++ ++/* ++** stnt1_s32_index: ++** stnt1w z0\.s, p0, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_STORE (stnt1_s32_index, svint32_t, int32_t, ++ svstnt1_s32 (p0, x0 + x1, z0), ++ svstnt1 (p0, x0 + x1, z0)) ++ ++/* ++** stnt1_s32_1: ++** stnt1w z0\.s, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_s32_1, svint32_t, int32_t, ++ svstnt1_s32 (p0, x0 + svcntw (), z0), ++ svstnt1 (p0, x0 + svcntw (), z0)) ++ ++/* ++** stnt1_s32_7: ++** stnt1w z0\.s, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_s32_7, svint32_t, int32_t, ++ svstnt1_s32 (p0, x0 + svcntw () * 7, z0), ++ svstnt1 (p0, x0 + svcntw () * 7, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** stnt1_s32_8: ++** incb x0, all, mul #8 ++** stnt1w z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_s32_8, svint32_t, int32_t, ++ svstnt1_s32 (p0, x0 + svcntw () * 8, z0), ++ svstnt1 (p0, x0 + svcntw () * 8, z0)) ++ ++/* ++** stnt1_s32_m1: ++** stnt1w z0\.s, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_s32_m1, svint32_t, int32_t, ++ svstnt1_s32 (p0, x0 - svcntw (), z0), ++ svstnt1 (p0, x0 - svcntw (), z0)) ++ ++/* ++** stnt1_s32_m8: ++** stnt1w z0\.s, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_s32_m8, svint32_t, int32_t, ++ svstnt1_s32 (p0, x0 - svcntw () * 8, z0), ++ svstnt1 (p0, x0 - svcntw () * 8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** stnt1_s32_m9: ++** decb x0, all, mul #9 ++** stnt1w z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_s32_m9, svint32_t, int32_t, ++ svstnt1_s32 (p0, x0 - svcntw () * 9, z0), ++ svstnt1 (p0, x0 - svcntw () * 9, z0)) ++ ++/* ++** stnt1_vnum_s32_0: ++** stnt1w z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_s32_0, svint32_t, int32_t, ++ svstnt1_vnum_s32 (p0, x0, 0, z0), ++ svstnt1_vnum (p0, x0, 0, z0)) ++ ++/* ++** stnt1_vnum_s32_1: ++** stnt1w z0\.s, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_s32_1, svint32_t, int32_t, ++ svstnt1_vnum_s32 (p0, x0, 1, z0), ++ svstnt1_vnum (p0, x0, 1, z0)) ++ ++/* ++** stnt1_vnum_s32_7: ++** stnt1w z0\.s, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_s32_7, svint32_t, int32_t, ++ svstnt1_vnum_s32 (p0, x0, 7, z0), ++ svstnt1_vnum (p0, x0, 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** stnt1_vnum_s32_8: ++** incb x0, all, mul #8 ++** stnt1w z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_s32_8, svint32_t, int32_t, ++ svstnt1_vnum_s32 (p0, x0, 8, z0), ++ svstnt1_vnum (p0, x0, 8, z0)) ++ ++/* ++** stnt1_vnum_s32_m1: ++** stnt1w z0\.s, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_s32_m1, svint32_t, int32_t, ++ svstnt1_vnum_s32 (p0, x0, -1, z0), ++ svstnt1_vnum (p0, x0, -1, z0)) ++ ++/* ++** stnt1_vnum_s32_m8: ++** stnt1w z0\.s, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_s32_m8, svint32_t, int32_t, ++ svstnt1_vnum_s32 (p0, x0, -8, z0), ++ svstnt1_vnum (p0, x0, -8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** stnt1_vnum_s32_m9: ++** decb x0, all, mul #9 ++** stnt1w z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_s32_m9, svint32_t, int32_t, ++ svstnt1_vnum_s32 (p0, x0, -9, z0), ++ svstnt1_vnum (p0, x0, -9, z0)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** stnt1_vnum_s32_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** stnt1w z0\.s, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_s32_x1, svint32_t, int32_t, ++ svstnt1_vnum_s32 (p0, x0, x1, z0), ++ svstnt1_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_s64.c +new file mode 100644 +index 000000000..7d548f8f2 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_s64.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** stnt1_s64_base: ++** stnt1d z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_s64_base, svint64_t, int64_t, ++ svstnt1_s64 (p0, x0, z0), ++ svstnt1 (p0, x0, z0)) ++ ++/* ++** stnt1_s64_index: ++** stnt1d z0\.d, p0, \[x0, x1, lsl 3\] ++** ret ++*/ ++TEST_STORE (stnt1_s64_index, svint64_t, int64_t, ++ svstnt1_s64 (p0, x0 + x1, z0), ++ svstnt1 (p0, x0 + x1, z0)) ++ ++/* ++** stnt1_s64_1: ++** stnt1d z0\.d, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_s64_1, svint64_t, int64_t, ++ svstnt1_s64 (p0, x0 + svcntd (), z0), ++ svstnt1 (p0, x0 + svcntd (), z0)) ++ ++/* ++** stnt1_s64_7: ++** stnt1d z0\.d, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_s64_7, svint64_t, int64_t, ++ svstnt1_s64 (p0, x0 + svcntd () * 7, z0), ++ svstnt1 (p0, x0 + svcntd () * 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** stnt1_s64_8: ++** incb x0, all, mul #8 ++** stnt1d z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_s64_8, svint64_t, int64_t, ++ svstnt1_s64 (p0, x0 + svcntd () * 8, z0), ++ svstnt1 (p0, x0 + svcntd () * 8, z0)) ++ ++/* ++** stnt1_s64_m1: ++** stnt1d z0\.d, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_s64_m1, svint64_t, int64_t, ++ svstnt1_s64 (p0, x0 - svcntd (), z0), ++ svstnt1 (p0, x0 - svcntd (), z0)) ++ ++/* ++** stnt1_s64_m8: ++** stnt1d z0\.d, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_s64_m8, svint64_t, int64_t, ++ svstnt1_s64 (p0, x0 - svcntd () * 8, z0), ++ svstnt1 (p0, x0 - svcntd () * 8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** stnt1_s64_m9: ++** decb x0, all, mul #9 ++** stnt1d z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_s64_m9, svint64_t, int64_t, ++ svstnt1_s64 (p0, x0 - svcntd () * 9, z0), ++ svstnt1 (p0, x0 - svcntd () * 9, z0)) ++ ++/* ++** stnt1_vnum_s64_0: ++** stnt1d z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_s64_0, svint64_t, int64_t, ++ svstnt1_vnum_s64 (p0, x0, 0, z0), ++ svstnt1_vnum (p0, x0, 0, z0)) ++ ++/* ++** stnt1_vnum_s64_1: ++** stnt1d z0\.d, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_s64_1, svint64_t, int64_t, ++ svstnt1_vnum_s64 (p0, x0, 1, z0), ++ svstnt1_vnum (p0, x0, 1, z0)) ++ ++/* ++** stnt1_vnum_s64_7: ++** stnt1d z0\.d, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_s64_7, svint64_t, int64_t, ++ svstnt1_vnum_s64 (p0, x0, 7, z0), ++ svstnt1_vnum (p0, x0, 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** stnt1_vnum_s64_8: ++** incb x0, all, mul #8 ++** stnt1d z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_s64_8, svint64_t, int64_t, ++ svstnt1_vnum_s64 (p0, x0, 8, z0), ++ svstnt1_vnum (p0, x0, 8, z0)) ++ ++/* ++** stnt1_vnum_s64_m1: ++** stnt1d z0\.d, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_s64_m1, svint64_t, int64_t, ++ svstnt1_vnum_s64 (p0, x0, -1, z0), ++ svstnt1_vnum (p0, x0, -1, z0)) ++ ++/* ++** stnt1_vnum_s64_m8: ++** stnt1d z0\.d, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_s64_m8, svint64_t, int64_t, ++ svstnt1_vnum_s64 (p0, x0, -8, z0), ++ svstnt1_vnum (p0, x0, -8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** stnt1_vnum_s64_m9: ++** decb x0, all, mul #9 ++** stnt1d z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_s64_m9, svint64_t, int64_t, ++ svstnt1_vnum_s64 (p0, x0, -9, z0), ++ svstnt1_vnum (p0, x0, -9, z0)) ++ ++/* Using MUL to calculate an index would also be OK. 
*/ ++/* ++** stnt1_vnum_s64_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** stnt1d z0\.d, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_s64_x1, svint64_t, int64_t, ++ svstnt1_vnum_s64 (p0, x0, x1, z0), ++ svstnt1_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_s8.c +new file mode 100644 +index 000000000..87c88035d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_s8.c +@@ -0,0 +1,162 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** stnt1_s8_base: ++** stnt1b z0\.b, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_s8_base, svint8_t, int8_t, ++ svstnt1_s8 (p0, x0, z0), ++ svstnt1 (p0, x0, z0)) ++ ++/* ++** stnt1_s8_index: ++** stnt1b z0\.b, p0, \[x0, x1\] ++** ret ++*/ ++TEST_STORE (stnt1_s8_index, svint8_t, int8_t, ++ svstnt1_s8 (p0, x0 + x1, z0), ++ svstnt1 (p0, x0 + x1, z0)) ++ ++/* ++** stnt1_s8_1: ++** stnt1b z0\.b, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_s8_1, svint8_t, int8_t, ++ svstnt1_s8 (p0, x0 + svcntb (), z0), ++ svstnt1 (p0, x0 + svcntb (), z0)) ++ ++/* ++** stnt1_s8_7: ++** stnt1b z0\.b, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_s8_7, svint8_t, int8_t, ++ svstnt1_s8 (p0, x0 + svcntb () * 7, z0), ++ svstnt1 (p0, x0 + svcntb () * 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** stnt1_s8_8: ++** incb x0, all, mul #8 ++** stnt1b z0\.b, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_s8_8, svint8_t, int8_t, ++ svstnt1_s8 (p0, x0 + svcntb () * 8, z0), ++ svstnt1 (p0, x0 + svcntb () * 8, z0)) ++ ++/* ++** stnt1_s8_m1: ++** stnt1b z0\.b, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_s8_m1, svint8_t, int8_t, ++ svstnt1_s8 (p0, x0 - svcntb (), z0), ++ svstnt1 (p0, x0 - svcntb (), z0)) ++ ++/* ++** stnt1_s8_m8: ++** stnt1b z0\.b, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_s8_m8, svint8_t, int8_t, ++ svstnt1_s8 (p0, x0 - svcntb () * 8, z0), ++ svstnt1 (p0, x0 - svcntb () * 8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** stnt1_s8_m9: ++** decb x0, all, mul #9 ++** stnt1b z0\.b, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_s8_m9, svint8_t, int8_t, ++ svstnt1_s8 (p0, x0 - svcntb () * 9, z0), ++ svstnt1 (p0, x0 - svcntb () * 9, z0)) ++ ++/* ++** stnt1_vnum_s8_0: ++** stnt1b z0\.b, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_s8_0, svint8_t, int8_t, ++ svstnt1_vnum_s8 (p0, x0, 0, z0), ++ svstnt1_vnum (p0, x0, 0, z0)) ++ ++/* ++** stnt1_vnum_s8_1: ++** stnt1b z0\.b, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_s8_1, svint8_t, int8_t, ++ svstnt1_vnum_s8 (p0, x0, 1, z0), ++ svstnt1_vnum (p0, x0, 1, z0)) ++ ++/* ++** stnt1_vnum_s8_7: ++** stnt1b z0\.b, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_s8_7, svint8_t, int8_t, ++ svstnt1_vnum_s8 (p0, x0, 7, z0), ++ svstnt1_vnum (p0, x0, 7, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** stnt1_vnum_s8_8: ++** incb x0, all, mul #8 ++** stnt1b z0\.b, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_s8_8, svint8_t, int8_t, ++ svstnt1_vnum_s8 (p0, x0, 8, z0), ++ svstnt1_vnum (p0, x0, 8, z0)) ++ ++/* ++** stnt1_vnum_s8_m1: ++** stnt1b z0\.b, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_s8_m1, svint8_t, int8_t, ++ svstnt1_vnum_s8 (p0, x0, -1, z0), ++ svstnt1_vnum (p0, x0, -1, z0)) ++ ++/* ++** stnt1_vnum_s8_m8: ++** stnt1b z0\.b, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_s8_m8, svint8_t, int8_t, ++ svstnt1_vnum_s8 (p0, x0, -8, z0), ++ svstnt1_vnum (p0, x0, -8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** stnt1_vnum_s8_m9: ++** decb x0, all, mul #9 ++** stnt1b z0\.b, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_s8_m9, svint8_t, int8_t, ++ svstnt1_vnum_s8 (p0, x0, -9, z0), ++ svstnt1_vnum (p0, x0, -9, z0)) ++ ++/* ++** stnt1_vnum_s8_x1: ++** cntb (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** stnt1b z0\.b, p0, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** stnt1b z0\.b, p0, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_STORE (stnt1_vnum_s8_x1, svint8_t, int8_t, ++ svstnt1_vnum_s8 (p0, x0, x1, z0), ++ svstnt1_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_u16.c +new file mode 100644 +index 000000000..7d32df362 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_u16.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** stnt1_u16_base: ++** stnt1h z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_u16_base, svuint16_t, uint16_t, ++ svstnt1_u16 (p0, x0, z0), ++ svstnt1 (p0, x0, z0)) ++ ++/* ++** stnt1_u16_index: ++** stnt1h z0\.h, p0, \[x0, x1, lsl 1\] ++** ret ++*/ ++TEST_STORE (stnt1_u16_index, svuint16_t, uint16_t, ++ svstnt1_u16 (p0, x0 + x1, z0), ++ svstnt1 (p0, x0 + x1, z0)) ++ ++/* ++** stnt1_u16_1: ++** stnt1h z0\.h, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_u16_1, svuint16_t, uint16_t, ++ svstnt1_u16 (p0, x0 + svcnth (), z0), ++ svstnt1 (p0, x0 + svcnth (), z0)) ++ ++/* ++** stnt1_u16_7: ++** stnt1h z0\.h, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_u16_7, svuint16_t, uint16_t, ++ svstnt1_u16 (p0, x0 + svcnth () * 7, z0), ++ svstnt1 (p0, x0 + svcnth () * 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** stnt1_u16_8: ++** incb x0, all, mul #8 ++** stnt1h z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_u16_8, svuint16_t, uint16_t, ++ svstnt1_u16 (p0, x0 + svcnth () * 8, z0), ++ svstnt1 (p0, x0 + svcnth () * 8, z0)) ++ ++/* ++** stnt1_u16_m1: ++** stnt1h z0\.h, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_u16_m1, svuint16_t, uint16_t, ++ svstnt1_u16 (p0, x0 - svcnth (), z0), ++ svstnt1 (p0, x0 - svcnth (), z0)) ++ ++/* ++** stnt1_u16_m8: ++** stnt1h z0\.h, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_u16_m8, svuint16_t, uint16_t, ++ svstnt1_u16 (p0, x0 - svcnth () * 8, z0), ++ svstnt1 (p0, x0 - svcnth () * 8, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** stnt1_u16_m9: ++** decb x0, all, mul #9 ++** stnt1h z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_u16_m9, svuint16_t, uint16_t, ++ svstnt1_u16 (p0, x0 - svcnth () * 9, z0), ++ svstnt1 (p0, x0 - svcnth () * 9, z0)) ++ ++/* ++** stnt1_vnum_u16_0: ++** stnt1h z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_u16_0, svuint16_t, uint16_t, ++ svstnt1_vnum_u16 (p0, x0, 0, z0), ++ svstnt1_vnum (p0, x0, 0, z0)) ++ ++/* ++** stnt1_vnum_u16_1: ++** stnt1h z0\.h, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_u16_1, svuint16_t, uint16_t, ++ svstnt1_vnum_u16 (p0, x0, 1, z0), ++ svstnt1_vnum (p0, x0, 1, z0)) ++ ++/* ++** stnt1_vnum_u16_7: ++** stnt1h z0\.h, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_u16_7, svuint16_t, uint16_t, ++ svstnt1_vnum_u16 (p0, x0, 7, z0), ++ svstnt1_vnum (p0, x0, 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** stnt1_vnum_u16_8: ++** incb x0, all, mul #8 ++** stnt1h z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_u16_8, svuint16_t, uint16_t, ++ svstnt1_vnum_u16 (p0, x0, 8, z0), ++ svstnt1_vnum (p0, x0, 8, z0)) ++ ++/* ++** stnt1_vnum_u16_m1: ++** stnt1h z0\.h, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_u16_m1, svuint16_t, uint16_t, ++ svstnt1_vnum_u16 (p0, x0, -1, z0), ++ svstnt1_vnum (p0, x0, -1, z0)) ++ ++/* ++** stnt1_vnum_u16_m8: ++** stnt1h z0\.h, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_u16_m8, svuint16_t, uint16_t, ++ svstnt1_vnum_u16 (p0, x0, -8, z0), ++ svstnt1_vnum (p0, x0, -8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** stnt1_vnum_u16_m9: ++** decb x0, all, mul #9 ++** stnt1h z0\.h, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_u16_m9, svuint16_t, uint16_t, ++ svstnt1_vnum_u16 (p0, x0, -9, z0), ++ svstnt1_vnum (p0, x0, -9, z0)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** stnt1_vnum_u16_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** stnt1h z0\.h, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_u16_x1, svuint16_t, uint16_t, ++ svstnt1_vnum_u16 (p0, x0, x1, z0), ++ svstnt1_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_u32.c +new file mode 100644 +index 000000000..cd4ccaba9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_u32.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** stnt1_u32_base: ++** stnt1w z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_u32_base, svuint32_t, uint32_t, ++ svstnt1_u32 (p0, x0, z0), ++ svstnt1 (p0, x0, z0)) ++ ++/* ++** stnt1_u32_index: ++** stnt1w z0\.s, p0, \[x0, x1, lsl 2\] ++** ret ++*/ ++TEST_STORE (stnt1_u32_index, svuint32_t, uint32_t, ++ svstnt1_u32 (p0, x0 + x1, z0), ++ svstnt1 (p0, x0 + x1, z0)) ++ ++/* ++** stnt1_u32_1: ++** stnt1w z0\.s, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_u32_1, svuint32_t, uint32_t, ++ svstnt1_u32 (p0, x0 + svcntw (), z0), ++ svstnt1 (p0, x0 + svcntw (), z0)) ++ ++/* ++** stnt1_u32_7: ++** stnt1w z0\.s, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_u32_7, svuint32_t, uint32_t, ++ svstnt1_u32 (p0, x0 + svcntw () * 7, z0), ++ svstnt1 (p0, x0 + svcntw () * 7, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** stnt1_u32_8: ++** incb x0, all, mul #8 ++** stnt1w z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_u32_8, svuint32_t, uint32_t, ++ svstnt1_u32 (p0, x0 + svcntw () * 8, z0), ++ svstnt1 (p0, x0 + svcntw () * 8, z0)) ++ ++/* ++** stnt1_u32_m1: ++** stnt1w z0\.s, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_u32_m1, svuint32_t, uint32_t, ++ svstnt1_u32 (p0, x0 - svcntw (), z0), ++ svstnt1 (p0, x0 - svcntw (), z0)) ++ ++/* ++** stnt1_u32_m8: ++** stnt1w z0\.s, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_u32_m8, svuint32_t, uint32_t, ++ svstnt1_u32 (p0, x0 - svcntw () * 8, z0), ++ svstnt1 (p0, x0 - svcntw () * 8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** stnt1_u32_m9: ++** decb x0, all, mul #9 ++** stnt1w z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_u32_m9, svuint32_t, uint32_t, ++ svstnt1_u32 (p0, x0 - svcntw () * 9, z0), ++ svstnt1 (p0, x0 - svcntw () * 9, z0)) ++ ++/* ++** stnt1_vnum_u32_0: ++** stnt1w z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_u32_0, svuint32_t, uint32_t, ++ svstnt1_vnum_u32 (p0, x0, 0, z0), ++ svstnt1_vnum (p0, x0, 0, z0)) ++ ++/* ++** stnt1_vnum_u32_1: ++** stnt1w z0\.s, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_u32_1, svuint32_t, uint32_t, ++ svstnt1_vnum_u32 (p0, x0, 1, z0), ++ svstnt1_vnum (p0, x0, 1, z0)) ++ ++/* ++** stnt1_vnum_u32_7: ++** stnt1w z0\.s, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_u32_7, svuint32_t, uint32_t, ++ svstnt1_vnum_u32 (p0, x0, 7, z0), ++ svstnt1_vnum (p0, x0, 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** stnt1_vnum_u32_8: ++** incb x0, all, mul #8 ++** stnt1w z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_u32_8, svuint32_t, uint32_t, ++ svstnt1_vnum_u32 (p0, x0, 8, z0), ++ svstnt1_vnum (p0, x0, 8, z0)) ++ ++/* ++** stnt1_vnum_u32_m1: ++** stnt1w z0\.s, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_u32_m1, svuint32_t, uint32_t, ++ svstnt1_vnum_u32 (p0, x0, -1, z0), ++ svstnt1_vnum (p0, x0, -1, z0)) ++ ++/* ++** stnt1_vnum_u32_m8: ++** stnt1w z0\.s, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_u32_m8, svuint32_t, uint32_t, ++ svstnt1_vnum_u32 (p0, x0, -8, z0), ++ svstnt1_vnum (p0, x0, -8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** stnt1_vnum_u32_m9: ++** decb x0, all, mul #9 ++** stnt1w z0\.s, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_u32_m9, svuint32_t, uint32_t, ++ svstnt1_vnum_u32 (p0, x0, -9, z0), ++ svstnt1_vnum (p0, x0, -9, z0)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** stnt1_vnum_u32_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** stnt1w z0\.s, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_u32_x1, svuint32_t, uint32_t, ++ svstnt1_vnum_u32 (p0, x0, x1, z0), ++ svstnt1_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_u64.c +new file mode 100644 +index 000000000..c8145f65c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_u64.c +@@ -0,0 +1,158 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** stnt1_u64_base: ++** stnt1d z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_u64_base, svuint64_t, uint64_t, ++ svstnt1_u64 (p0, x0, z0), ++ svstnt1 (p0, x0, z0)) ++ ++/* ++** stnt1_u64_index: ++** stnt1d z0\.d, p0, \[x0, x1, lsl 3\] ++** ret ++*/ ++TEST_STORE (stnt1_u64_index, svuint64_t, uint64_t, ++ svstnt1_u64 (p0, x0 + x1, z0), ++ svstnt1 (p0, x0 + x1, z0)) ++ ++/* ++** stnt1_u64_1: ++** stnt1d z0\.d, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_u64_1, svuint64_t, uint64_t, ++ svstnt1_u64 (p0, x0 + svcntd (), z0), ++ svstnt1 (p0, x0 + svcntd (), z0)) ++ ++/* ++** stnt1_u64_7: ++** stnt1d z0\.d, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_u64_7, svuint64_t, uint64_t, ++ svstnt1_u64 (p0, x0 + svcntd () * 7, z0), ++ svstnt1 (p0, x0 + svcntd () * 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** stnt1_u64_8: ++** incb x0, all, mul #8 ++** stnt1d z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_u64_8, svuint64_t, uint64_t, ++ svstnt1_u64 (p0, x0 + svcntd () * 8, z0), ++ svstnt1 (p0, x0 + svcntd () * 8, z0)) ++ ++/* ++** stnt1_u64_m1: ++** stnt1d z0\.d, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_u64_m1, svuint64_t, uint64_t, ++ svstnt1_u64 (p0, x0 - svcntd (), z0), ++ svstnt1 (p0, x0 - svcntd (), z0)) ++ ++/* ++** stnt1_u64_m8: ++** stnt1d z0\.d, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_u64_m8, svuint64_t, uint64_t, ++ svstnt1_u64 (p0, x0 - svcntd () * 8, z0), ++ svstnt1 (p0, x0 - svcntd () * 8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** stnt1_u64_m9: ++** decb x0, all, mul #9 ++** stnt1d z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_u64_m9, svuint64_t, uint64_t, ++ svstnt1_u64 (p0, x0 - svcntd () * 9, z0), ++ svstnt1 (p0, x0 - svcntd () * 9, z0)) ++ ++/* ++** stnt1_vnum_u64_0: ++** stnt1d z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_u64_0, svuint64_t, uint64_t, ++ svstnt1_vnum_u64 (p0, x0, 0, z0), ++ svstnt1_vnum (p0, x0, 0, z0)) ++ ++/* ++** stnt1_vnum_u64_1: ++** stnt1d z0\.d, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_u64_1, svuint64_t, uint64_t, ++ svstnt1_vnum_u64 (p0, x0, 1, z0), ++ svstnt1_vnum (p0, x0, 1, z0)) ++ ++/* ++** stnt1_vnum_u64_7: ++** stnt1d z0\.d, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_u64_7, svuint64_t, uint64_t, ++ svstnt1_vnum_u64 (p0, x0, 7, z0), ++ svstnt1_vnum (p0, x0, 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** stnt1_vnum_u64_8: ++** incb x0, all, mul #8 ++** stnt1d z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_u64_8, svuint64_t, uint64_t, ++ svstnt1_vnum_u64 (p0, x0, 8, z0), ++ svstnt1_vnum (p0, x0, 8, z0)) ++ ++/* ++** stnt1_vnum_u64_m1: ++** stnt1d z0\.d, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_u64_m1, svuint64_t, uint64_t, ++ svstnt1_vnum_u64 (p0, x0, -1, z0), ++ svstnt1_vnum (p0, x0, -1, z0)) ++ ++/* ++** stnt1_vnum_u64_m8: ++** stnt1d z0\.d, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_u64_m8, svuint64_t, uint64_t, ++ svstnt1_vnum_u64 (p0, x0, -8, z0), ++ svstnt1_vnum (p0, x0, -8, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** stnt1_vnum_u64_m9: ++** decb x0, all, mul #9 ++** stnt1d z0\.d, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_u64_m9, svuint64_t, uint64_t, ++ svstnt1_vnum_u64 (p0, x0, -9, z0), ++ svstnt1_vnum (p0, x0, -9, z0)) ++ ++/* Using MUL to calculate an index would also be OK. */ ++/* ++** stnt1_vnum_u64_x1: ++** cntb (x[0-9]+) ++** madd (x[0-9]+), (x1, \1|\1, x1), x0 ++** stnt1d z0\.d, p0, \[\2\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_u64_x1, svuint64_t, uint64_t, ++ svstnt1_vnum_u64 (p0, x0, x1, z0), ++ svstnt1_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_u8.c +new file mode 100644 +index 000000000..11c68f555 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_u8.c +@@ -0,0 +1,162 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** stnt1_u8_base: ++** stnt1b z0\.b, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_u8_base, svuint8_t, uint8_t, ++ svstnt1_u8 (p0, x0, z0), ++ svstnt1 (p0, x0, z0)) ++ ++/* ++** stnt1_u8_index: ++** stnt1b z0\.b, p0, \[x0, x1\] ++** ret ++*/ ++TEST_STORE (stnt1_u8_index, svuint8_t, uint8_t, ++ svstnt1_u8 (p0, x0 + x1, z0), ++ svstnt1 (p0, x0 + x1, z0)) ++ ++/* ++** stnt1_u8_1: ++** stnt1b z0\.b, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_u8_1, svuint8_t, uint8_t, ++ svstnt1_u8 (p0, x0 + svcntb (), z0), ++ svstnt1 (p0, x0 + svcntb (), z0)) ++ ++/* ++** stnt1_u8_7: ++** stnt1b z0\.b, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_u8_7, svuint8_t, uint8_t, ++ svstnt1_u8 (p0, x0 + svcntb () * 7, z0), ++ svstnt1 (p0, x0 + svcntb () * 7, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** stnt1_u8_8: ++** incb x0, all, mul #8 ++** stnt1b z0\.b, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_u8_8, svuint8_t, uint8_t, ++ svstnt1_u8 (p0, x0 + svcntb () * 8, z0), ++ svstnt1 (p0, x0 + svcntb () * 8, z0)) ++ ++/* ++** stnt1_u8_m1: ++** stnt1b z0\.b, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_u8_m1, svuint8_t, uint8_t, ++ svstnt1_u8 (p0, x0 - svcntb (), z0), ++ svstnt1 (p0, x0 - svcntb (), z0)) ++ ++/* ++** stnt1_u8_m8: ++** stnt1b z0\.b, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_u8_m8, svuint8_t, uint8_t, ++ svstnt1_u8 (p0, x0 - svcntb () * 8, z0), ++ svstnt1 (p0, x0 - svcntb () * 8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** stnt1_u8_m9: ++** decb x0, all, mul #9 ++** stnt1b z0\.b, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_u8_m9, svuint8_t, uint8_t, ++ svstnt1_u8 (p0, x0 - svcntb () * 9, z0), ++ svstnt1 (p0, x0 - svcntb () * 9, z0)) ++ ++/* ++** stnt1_vnum_u8_0: ++** stnt1b z0\.b, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_u8_0, svuint8_t, uint8_t, ++ svstnt1_vnum_u8 (p0, x0, 0, z0), ++ svstnt1_vnum (p0, x0, 0, z0)) ++ ++/* ++** stnt1_vnum_u8_1: ++** stnt1b z0\.b, p0, \[x0, #1, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_u8_1, svuint8_t, uint8_t, ++ svstnt1_vnum_u8 (p0, x0, 1, z0), ++ svstnt1_vnum (p0, x0, 1, z0)) ++ ++/* ++** stnt1_vnum_u8_7: ++** stnt1b z0\.b, p0, \[x0, #7, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_u8_7, svuint8_t, uint8_t, ++ svstnt1_vnum_u8 (p0, x0, 7, z0), ++ svstnt1_vnum (p0, x0, 7, z0)) ++ ++/* Moving the constant into a register would also be OK. 
*/ ++/* ++** stnt1_vnum_u8_8: ++** incb x0, all, mul #8 ++** stnt1b z0\.b, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_u8_8, svuint8_t, uint8_t, ++ svstnt1_vnum_u8 (p0, x0, 8, z0), ++ svstnt1_vnum (p0, x0, 8, z0)) ++ ++/* ++** stnt1_vnum_u8_m1: ++** stnt1b z0\.b, p0, \[x0, #-1, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_u8_m1, svuint8_t, uint8_t, ++ svstnt1_vnum_u8 (p0, x0, -1, z0), ++ svstnt1_vnum (p0, x0, -1, z0)) ++ ++/* ++** stnt1_vnum_u8_m8: ++** stnt1b z0\.b, p0, \[x0, #-8, mul vl\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_u8_m8, svuint8_t, uint8_t, ++ svstnt1_vnum_u8 (p0, x0, -8, z0), ++ svstnt1_vnum (p0, x0, -8, z0)) ++ ++/* Moving the constant into a register would also be OK. */ ++/* ++** stnt1_vnum_u8_m9: ++** decb x0, all, mul #9 ++** stnt1b z0\.b, p0, \[x0\] ++** ret ++*/ ++TEST_STORE (stnt1_vnum_u8_m9, svuint8_t, uint8_t, ++ svstnt1_vnum_u8 (p0, x0, -9, z0), ++ svstnt1_vnum (p0, x0, -9, z0)) ++ ++/* ++** stnt1_vnum_u8_x1: ++** cntb (x[0-9]+) ++** ( ++** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 ++** stnt1b z0\.b, p0, \[\2\] ++** | ++** mul (x[0-9]+), (?:x1, \1|\1, x1) ++** stnt1b z0\.b, p0, \[x0, \3\] ++** ) ++** ret ++*/ ++TEST_STORE (stnt1_vnum_u8_x1, svuint8_t, uint8_t, ++ svstnt1_vnum_u8 (p0, x0, x1, z0), ++ svstnt1_vnum (p0, x0, x1, z0)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_f16.c +new file mode 100644 +index 000000000..bf4a0ab1e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_f16.c +@@ -0,0 +1,577 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** sub_f16_m_tied1: ++** fsub z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f16_m_tied1, svfloat16_t, ++ z0 = svsub_f16_m (p0, z0, z1), ++ z0 = svsub_m (p0, z0, z1)) ++ ++/* ++** sub_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fsub z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f16_m_tied2, svfloat16_t, ++ z0 = svsub_f16_m (p0, z1, z0), ++ z0 = svsub_m (p0, z1, z0)) ++ ++/* ++** sub_f16_m_untied: ++** movprfx z0, z1 ++** fsub z0\.h, p0/m, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f16_m_untied, svfloat16_t, ++ z0 = svsub_f16_m (p0, z1, z2), ++ z0 = svsub_m (p0, z1, z2)) ++ ++/* ++** sub_h4_f16_m_tied1: ++** mov (z[0-9]+\.h), h4 ++** fsub z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (sub_h4_f16_m_tied1, svfloat16_t, __fp16, ++ z0 = svsub_n_f16_m (p0, z0, d4), ++ z0 = svsub_m (p0, z0, d4)) ++ ++/* ++** sub_h4_f16_m_untied: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0, z1 ++** fsub z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (sub_h4_f16_m_untied, svfloat16_t, __fp16, ++ z0 = svsub_n_f16_m (p0, z1, d4), ++ z0 = svsub_m (p0, z1, d4)) ++ ++/* ++** sub_1_f16_m_tied1: ++** fsub z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_f16_m_tied1, svfloat16_t, ++ z0 = svsub_n_f16_m (p0, z0, 1), ++ z0 = svsub_m (p0, z0, 1)) ++ ++/* ++** sub_1_f16_m_untied: ++** movprfx z0, z1 ++** fsub z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_f16_m_untied, svfloat16_t, ++ z0 = svsub_n_f16_m (p0, z1, 1), ++ z0 = svsub_m (p0, z1, 1)) ++ ++/* ++** sub_0p5_f16_m_tied1: ++** fsub z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_0p5_f16_m_tied1, svfloat16_t, ++ z0 = svsub_n_f16_m (p0, z0, 0.5), ++ z0 = svsub_m (p0, z0, 0.5)) ++ ++/* ++** sub_0p5_f16_m_untied: ++** movprfx z0, z1 ++** fsub z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_0p5_f16_m_untied, 
svfloat16_t, ++ z0 = svsub_n_f16_m (p0, z1, 0.5), ++ z0 = svsub_m (p0, z1, 0.5)) ++ ++/* ++** sub_m1_f16_m_tied1: ++** fadd z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m1_f16_m_tied1, svfloat16_t, ++ z0 = svsub_n_f16_m (p0, z0, -1), ++ z0 = svsub_m (p0, z0, -1)) ++ ++/* ++** sub_m1_f16_m_untied: ++** movprfx z0, z1 ++** fadd z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m1_f16_m_untied, svfloat16_t, ++ z0 = svsub_n_f16_m (p0, z1, -1), ++ z0 = svsub_m (p0, z1, -1)) ++ ++/* ++** sub_m0p5_f16_m_tied1: ++** fadd z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m0p5_f16_m_tied1, svfloat16_t, ++ z0 = svsub_n_f16_m (p0, z0, -0.5), ++ z0 = svsub_m (p0, z0, -0.5)) ++ ++/* ++** sub_m0p5_f16_m_untied: ++** movprfx z0, z1 ++** fadd z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m0p5_f16_m_untied, svfloat16_t, ++ z0 = svsub_n_f16_m (p0, z1, -0.5), ++ z0 = svsub_m (p0, z1, -0.5)) ++ ++/* ++** sub_m2_f16_m: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** fadd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m2_f16_m, svfloat16_t, ++ z0 = svsub_n_f16_m (p0, z0, -2), ++ z0 = svsub_m (p0, z0, -2)) ++ ++/* ++** sub_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fsub z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f16_z_tied1, svfloat16_t, ++ z0 = svsub_f16_z (p0, z0, z1), ++ z0 = svsub_z (p0, z0, z1)) ++ ++/* ++** sub_f16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** fsubr z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f16_z_tied2, svfloat16_t, ++ z0 = svsub_f16_z (p0, z1, z0), ++ z0 = svsub_z (p0, z1, z0)) ++ ++/* ++** sub_f16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fsub z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** fsubr z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f16_z_untied, svfloat16_t, ++ z0 = svsub_f16_z (p0, z1, z2), ++ z0 = svsub_z (p0, z1, z2)) ++ ++/* ++** sub_h4_f16_z_tied1: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0\.h, p0/z, z0\.h ++** fsub z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (sub_h4_f16_z_tied1, svfloat16_t, __fp16, ++ z0 = svsub_n_f16_z (p0, z0, d4), ++ z0 = svsub_z (p0, z0, d4)) ++ ++/* ++** sub_h4_f16_z_untied: ++** mov (z[0-9]+\.h), h4 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fsub z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** fsubr z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (sub_h4_f16_z_untied, svfloat16_t, __fp16, ++ z0 = svsub_n_f16_z (p0, z1, d4), ++ z0 = svsub_z (p0, z1, d4)) ++ ++/* ++** sub_1_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fsub z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_f16_z_tied1, svfloat16_t, ++ z0 = svsub_n_f16_z (p0, z0, 1), ++ z0 = svsub_z (p0, z0, 1)) ++ ++/* ++** sub_1_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** fsub z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_f16_z_untied, svfloat16_t, ++ z0 = svsub_n_f16_z (p0, z1, 1), ++ z0 = svsub_z (p0, z1, 1)) ++ ++/* ++** sub_0p5_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fsub z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_0p5_f16_z_tied1, svfloat16_t, ++ z0 = svsub_n_f16_z (p0, z0, 0.5), ++ z0 = svsub_z (p0, z0, 0.5)) ++ ++/* ++** sub_0p5_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** fsub z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_0p5_f16_z_untied, svfloat16_t, ++ z0 = svsub_n_f16_z (p0, z1, 0.5), ++ z0 = svsub_z (p0, z1, 0.5)) ++ ++/* ++** sub_m1_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fadd 
z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m1_f16_z_tied1, svfloat16_t, ++ z0 = svsub_n_f16_z (p0, z0, -1), ++ z0 = svsub_z (p0, z0, -1)) ++ ++/* ++** sub_m1_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** fadd z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m1_f16_z_untied, svfloat16_t, ++ z0 = svsub_n_f16_z (p0, z1, -1), ++ z0 = svsub_z (p0, z1, -1)) ++ ++/* ++** sub_m0p5_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fadd z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m0p5_f16_z_tied1, svfloat16_t, ++ z0 = svsub_n_f16_z (p0, z0, -0.5), ++ z0 = svsub_z (p0, z0, -0.5)) ++ ++/* ++** sub_m0p5_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** fadd z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m0p5_f16_z_untied, svfloat16_t, ++ z0 = svsub_n_f16_z (p0, z1, -0.5), ++ z0 = svsub_z (p0, z1, -0.5)) ++ ++/* ++** sub_m2_f16_z: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** movprfx z0\.h, p0/z, z0\.h ++** fadd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m2_f16_z, svfloat16_t, ++ z0 = svsub_n_f16_z (p0, z0, -2), ++ z0 = svsub_z (p0, z0, -2)) ++ ++/* ++** sub_f16_x_tied1: ++** fsub z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f16_x_tied1, svfloat16_t, ++ z0 = svsub_f16_x (p0, z0, z1), ++ z0 = svsub_x (p0, z0, z1)) ++ ++/* ++** sub_f16_x_tied2: ++** fsubr z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f16_x_tied2, svfloat16_t, ++ z0 = svsub_f16_x (p0, z1, z0), ++ z0 = svsub_x (p0, z1, z0)) ++ ++/* ++** sub_f16_x_untied: ++** ( ++** movprfx z0, z1 ++** fsub z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0, z2 ++** fsubr z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f16_x_untied, svfloat16_t, ++ z0 = svsub_f16_x (p0, z1, z2), ++ z0 = svsub_x (p0, z1, z2)) ++ ++/* ++** sub_h4_f16_x_tied1: ++** mov (z[0-9]+\.h), h4 ++** fsub z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (sub_h4_f16_x_tied1, svfloat16_t, __fp16, ++ z0 = svsub_n_f16_x (p0, z0, d4), ++ z0 = svsub_x (p0, z0, d4)) ++ ++/* ++** sub_h4_f16_x_untied: { xfail *-*-* } ++** mov z0\.h, h4 ++** fsubr z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZD (sub_h4_f16_x_untied, svfloat16_t, __fp16, ++ z0 = svsub_n_f16_x (p0, z1, d4), ++ z0 = svsub_x (p0, z1, d4)) ++ ++/* ++** sub_1_f16_x_tied1: ++** fsub z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_f16_x_tied1, svfloat16_t, ++ z0 = svsub_n_f16_x (p0, z0, 1), ++ z0 = svsub_x (p0, z0, 1)) ++ ++/* ++** sub_1_f16_x_untied: ++** movprfx z0, z1 ++** fsub z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_f16_x_untied, svfloat16_t, ++ z0 = svsub_n_f16_x (p0, z1, 1), ++ z0 = svsub_x (p0, z1, 1)) ++ ++/* ++** sub_0p5_f16_x_tied1: ++** fsub z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_0p5_f16_x_tied1, svfloat16_t, ++ z0 = svsub_n_f16_x (p0, z0, 0.5), ++ z0 = svsub_x (p0, z0, 0.5)) ++ ++/* ++** sub_0p5_f16_x_untied: ++** movprfx z0, z1 ++** fsub z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_0p5_f16_x_untied, svfloat16_t, ++ z0 = svsub_n_f16_x (p0, z1, 0.5), ++ z0 = svsub_x (p0, z1, 0.5)) ++ ++/* ++** sub_m1_f16_x_tied1: ++** fadd z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m1_f16_x_tied1, svfloat16_t, ++ z0 = svsub_n_f16_x (p0, z0, -1), ++ z0 = svsub_x (p0, z0, -1)) ++ ++/* ++** sub_m1_f16_x_untied: ++** movprfx z0, z1 ++** fadd z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m1_f16_x_untied, svfloat16_t, ++ z0 = svsub_n_f16_x (p0, z1, -1), ++ z0 = svsub_x (p0, z1, -1)) ++ 
++/* ++** sub_m0p5_f16_x_tied1: ++** fadd z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m0p5_f16_x_tied1, svfloat16_t, ++ z0 = svsub_n_f16_x (p0, z0, -0.5), ++ z0 = svsub_x (p0, z0, -0.5)) ++ ++/* ++** sub_m0p5_f16_x_untied: ++** movprfx z0, z1 ++** fadd z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m0p5_f16_x_untied, svfloat16_t, ++ z0 = svsub_n_f16_x (p0, z1, -0.5), ++ z0 = svsub_x (p0, z1, -0.5)) ++ ++/* ++** sub_2_f16_x_tied1: ++** fmov (z[0-9]+\.h), #-2\.0(?:e\+0)? ++** fadd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_2_f16_x_tied1, svfloat16_t, ++ z0 = svsub_n_f16_x (p0, z0, 2), ++ z0 = svsub_x (p0, z0, 2)) ++ ++/* ++** sub_2_f16_x_untied: ++** fmov z0\.h, #-2\.0(?:e\+0)? ++** fadd z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sub_2_f16_x_untied, svfloat16_t, ++ z0 = svsub_n_f16_x (p0, z1, 2), ++ z0 = svsub_x (p0, z1, 2)) ++ ++/* ++** ptrue_sub_f16_x_tied1: ++** fsub z0\.h, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_f16_x_tied1, svfloat16_t, ++ z0 = svsub_f16_x (svptrue_b16 (), z0, z1), ++ z0 = svsub_x (svptrue_b16 (), z0, z1)) ++ ++/* ++** ptrue_sub_f16_x_tied2: ++** fsub z0\.h, z1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_f16_x_tied2, svfloat16_t, ++ z0 = svsub_f16_x (svptrue_b16 (), z1, z0), ++ z0 = svsub_x (svptrue_b16 (), z1, z0)) ++ ++/* ++** ptrue_sub_f16_x_untied: ++** fsub z0\.h, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_f16_x_untied, svfloat16_t, ++ z0 = svsub_f16_x (svptrue_b16 (), z1, z2), ++ z0 = svsub_x (svptrue_b16 (), z1, z2)) ++ ++/* ++** ptrue_sub_1_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_1_f16_x_tied1, svfloat16_t, ++ z0 = svsub_n_f16_x (svptrue_b16 (), z0, 1), ++ z0 = svsub_x (svptrue_b16 (), z0, 1)) ++ ++/* ++** ptrue_sub_1_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_1_f16_x_untied, svfloat16_t, ++ z0 = svsub_n_f16_x (svptrue_b16 (), z1, 1), ++ z0 = svsub_x (svptrue_b16 (), z1, 1)) ++ ++/* ++** ptrue_sub_0p5_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_0p5_f16_x_tied1, svfloat16_t, ++ z0 = svsub_n_f16_x (svptrue_b16 (), z0, 0.5), ++ z0 = svsub_x (svptrue_b16 (), z0, 0.5)) ++ ++/* ++** ptrue_sub_0p5_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_0p5_f16_x_untied, svfloat16_t, ++ z0 = svsub_n_f16_x (svptrue_b16 (), z1, 0.5), ++ z0 = svsub_x (svptrue_b16 (), z1, 0.5)) ++ ++/* ++** ptrue_sub_m1_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_m1_f16_x_tied1, svfloat16_t, ++ z0 = svsub_n_f16_x (svptrue_b16 (), z0, -1), ++ z0 = svsub_x (svptrue_b16 (), z0, -1)) ++ ++/* ++** ptrue_sub_m1_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_m1_f16_x_untied, svfloat16_t, ++ z0 = svsub_n_f16_x (svptrue_b16 (), z1, -1), ++ z0 = svsub_x (svptrue_b16 (), z1, -1)) ++ ++/* ++** ptrue_sub_m0p5_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_m0p5_f16_x_tied1, svfloat16_t, ++ z0 = svsub_n_f16_x (svptrue_b16 (), z0, -0.5), ++ z0 = svsub_x (svptrue_b16 (), z0, -0.5)) ++ ++/* ++** ptrue_sub_m0p5_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... 
++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_m0p5_f16_x_untied, svfloat16_t, ++ z0 = svsub_n_f16_x (svptrue_b16 (), z1, -0.5), ++ z0 = svsub_x (svptrue_b16 (), z1, -0.5)) ++ ++/* ++** ptrue_sub_2_f16_x_tied1: ++** fmov (z[0-9]+\.h), #-2\.0(?:e\+0)? ++** fadd z0\.h, (z0\.h, \1|\1, z0\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_2_f16_x_tied1, svfloat16_t, ++ z0 = svsub_n_f16_x (svptrue_b16 (), z0, 2), ++ z0 = svsub_x (svptrue_b16 (), z0, 2)) ++ ++/* ++** ptrue_sub_2_f16_x_untied: ++** fmov (z[0-9]+\.h), #-2\.0(?:e\+0)? ++** fadd z0\.h, (z1\.h, \1|\1, z1\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_2_f16_x_untied, svfloat16_t, ++ z0 = svsub_n_f16_x (svptrue_b16 (), z1, 2), ++ z0 = svsub_x (svptrue_b16 (), z1, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_f16_notrap.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_f16_notrap.c +new file mode 100644 +index 000000000..e45098944 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_f16_notrap.c +@@ -0,0 +1,572 @@ ++/* { dg-additional-options "-fno-trapping-math" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** sub_f16_m_tied1: ++** fsub z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f16_m_tied1, svfloat16_t, ++ z0 = svsub_f16_m (p0, z0, z1), ++ z0 = svsub_m (p0, z0, z1)) ++ ++/* ++** sub_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fsub z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f16_m_tied2, svfloat16_t, ++ z0 = svsub_f16_m (p0, z1, z0), ++ z0 = svsub_m (p0, z1, z0)) ++ ++/* ++** sub_f16_m_untied: ++** movprfx z0, z1 ++** fsub z0\.h, p0/m, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f16_m_untied, svfloat16_t, ++ z0 = svsub_f16_m (p0, z1, z2), ++ z0 = svsub_m (p0, z1, z2)) ++ ++/* ++** sub_h4_f16_m_tied1: ++** mov (z[0-9]+\.h), h4 ++** fsub z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (sub_h4_f16_m_tied1, svfloat16_t, __fp16, ++ z0 = svsub_n_f16_m (p0, z0, d4), ++ z0 = svsub_m (p0, z0, d4)) ++ ++/* ++** sub_h4_f16_m_untied: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0, z1 ++** fsub z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (sub_h4_f16_m_untied, svfloat16_t, __fp16, ++ z0 = svsub_n_f16_m (p0, z1, d4), ++ z0 = svsub_m (p0, z1, d4)) ++ ++/* ++** sub_1_f16_m_tied1: ++** fsub z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_f16_m_tied1, svfloat16_t, ++ z0 = svsub_n_f16_m (p0, z0, 1), ++ z0 = svsub_m (p0, z0, 1)) ++ ++/* ++** sub_1_f16_m_untied: ++** movprfx z0, z1 ++** fsub z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_f16_m_untied, svfloat16_t, ++ z0 = svsub_n_f16_m (p0, z1, 1), ++ z0 = svsub_m (p0, z1, 1)) ++ ++/* ++** sub_0p5_f16_m_tied1: ++** fsub z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_0p5_f16_m_tied1, svfloat16_t, ++ z0 = svsub_n_f16_m (p0, z0, 0.5), ++ z0 = svsub_m (p0, z0, 0.5)) ++ ++/* ++** sub_0p5_f16_m_untied: ++** movprfx z0, z1 ++** fsub z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_0p5_f16_m_untied, svfloat16_t, ++ z0 = svsub_n_f16_m (p0, z1, 0.5), ++ z0 = svsub_m (p0, z1, 0.5)) ++ ++/* ++** sub_m1_f16_m_tied1: ++** fadd z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m1_f16_m_tied1, svfloat16_t, ++ z0 = svsub_n_f16_m (p0, z0, -1), ++ z0 = svsub_m (p0, z0, -1)) ++ ++/* ++** sub_m1_f16_m_untied: ++** movprfx z0, z1 ++** fadd z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m1_f16_m_untied, svfloat16_t, ++ z0 = svsub_n_f16_m (p0, z1, -1), ++ z0 = 
svsub_m (p0, z1, -1)) ++ ++/* ++** sub_m0p5_f16_m_tied1: ++** fadd z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m0p5_f16_m_tied1, svfloat16_t, ++ z0 = svsub_n_f16_m (p0, z0, -0.5), ++ z0 = svsub_m (p0, z0, -0.5)) ++ ++/* ++** sub_m0p5_f16_m_untied: ++** movprfx z0, z1 ++** fadd z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m0p5_f16_m_untied, svfloat16_t, ++ z0 = svsub_n_f16_m (p0, z1, -0.5), ++ z0 = svsub_m (p0, z1, -0.5)) ++ ++/* ++** sub_m2_f16_m: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** fadd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m2_f16_m, svfloat16_t, ++ z0 = svsub_n_f16_m (p0, z0, -2), ++ z0 = svsub_m (p0, z0, -2)) ++ ++/* ++** sub_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fsub z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f16_z_tied1, svfloat16_t, ++ z0 = svsub_f16_z (p0, z0, z1), ++ z0 = svsub_z (p0, z0, z1)) ++ ++/* ++** sub_f16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** fsubr z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f16_z_tied2, svfloat16_t, ++ z0 = svsub_f16_z (p0, z1, z0), ++ z0 = svsub_z (p0, z1, z0)) ++ ++/* ++** sub_f16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fsub z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** fsubr z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f16_z_untied, svfloat16_t, ++ z0 = svsub_f16_z (p0, z1, z2), ++ z0 = svsub_z (p0, z1, z2)) ++ ++/* ++** sub_h4_f16_z_tied1: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0\.h, p0/z, z0\.h ++** fsub z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (sub_h4_f16_z_tied1, svfloat16_t, __fp16, ++ z0 = svsub_n_f16_z (p0, z0, d4), ++ z0 = svsub_z (p0, z0, d4)) ++ ++/* ++** sub_h4_f16_z_untied: ++** mov (z[0-9]+\.h), h4 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fsub z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** fsubr z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (sub_h4_f16_z_untied, svfloat16_t, __fp16, ++ z0 = svsub_n_f16_z (p0, z1, d4), ++ z0 = svsub_z (p0, z1, d4)) ++ ++/* ++** sub_1_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fsub z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_f16_z_tied1, svfloat16_t, ++ z0 = svsub_n_f16_z (p0, z0, 1), ++ z0 = svsub_z (p0, z0, 1)) ++ ++/* ++** sub_1_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** fsub z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_f16_z_untied, svfloat16_t, ++ z0 = svsub_n_f16_z (p0, z1, 1), ++ z0 = svsub_z (p0, z1, 1)) ++ ++/* ++** sub_0p5_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fsub z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_0p5_f16_z_tied1, svfloat16_t, ++ z0 = svsub_n_f16_z (p0, z0, 0.5), ++ z0 = svsub_z (p0, z0, 0.5)) ++ ++/* ++** sub_0p5_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** fsub z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_0p5_f16_z_untied, svfloat16_t, ++ z0 = svsub_n_f16_z (p0, z1, 0.5), ++ z0 = svsub_z (p0, z1, 0.5)) ++ ++/* ++** sub_m1_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fadd z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m1_f16_z_tied1, svfloat16_t, ++ z0 = svsub_n_f16_z (p0, z0, -1), ++ z0 = svsub_z (p0, z0, -1)) ++ ++/* ++** sub_m1_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** fadd z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m1_f16_z_untied, svfloat16_t, ++ z0 = svsub_n_f16_z (p0, z1, -1), ++ z0 = svsub_z (p0, z1, -1)) ++ ++/* ++** sub_m0p5_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fadd z0\.h, p0/m, 
z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m0p5_f16_z_tied1, svfloat16_t, ++ z0 = svsub_n_f16_z (p0, z0, -0.5), ++ z0 = svsub_z (p0, z0, -0.5)) ++ ++/* ++** sub_m0p5_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** fadd z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m0p5_f16_z_untied, svfloat16_t, ++ z0 = svsub_n_f16_z (p0, z1, -0.5), ++ z0 = svsub_z (p0, z1, -0.5)) ++ ++/* ++** sub_m2_f16_z: ++** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? ++** movprfx z0\.h, p0/z, z0\.h ++** fadd z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m2_f16_z, svfloat16_t, ++ z0 = svsub_n_f16_z (p0, z0, -2), ++ z0 = svsub_z (p0, z0, -2)) ++ ++/* ++** sub_f16_x_tied1: ++** fsub z0\.h, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f16_x_tied1, svfloat16_t, ++ z0 = svsub_f16_x (p0, z0, z1), ++ z0 = svsub_x (p0, z0, z1)) ++ ++/* ++** sub_f16_x_tied2: ++** fsub z0\.h, z1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f16_x_tied2, svfloat16_t, ++ z0 = svsub_f16_x (p0, z1, z0), ++ z0 = svsub_x (p0, z1, z0)) ++ ++/* ++** sub_f16_x_untied: ++** fsub z0\.h, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f16_x_untied, svfloat16_t, ++ z0 = svsub_f16_x (p0, z1, z2), ++ z0 = svsub_x (p0, z1, z2)) ++ ++/* ++** sub_h4_f16_x_tied1: ++** mov (z[0-9]+\.h), h4 ++** fsub z0\.h, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (sub_h4_f16_x_tied1, svfloat16_t, __fp16, ++ z0 = svsub_n_f16_x (p0, z0, d4), ++ z0 = svsub_x (p0, z0, d4)) ++ ++/* ++** sub_h4_f16_x_untied: ++** mov (z[0-9]+\.h), h4 ++** fsub z0\.h, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (sub_h4_f16_x_untied, svfloat16_t, __fp16, ++ z0 = svsub_n_f16_x (p0, z1, d4), ++ z0 = svsub_x (p0, z1, d4)) ++ ++/* ++** sub_1_f16_x_tied1: ++** fsub z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_f16_x_tied1, svfloat16_t, ++ z0 = svsub_n_f16_x (p0, z0, 1), ++ z0 = svsub_x (p0, z0, 1)) ++ ++/* ++** sub_1_f16_x_untied: ++** movprfx z0, z1 ++** fsub z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_f16_x_untied, svfloat16_t, ++ z0 = svsub_n_f16_x (p0, z1, 1), ++ z0 = svsub_x (p0, z1, 1)) ++ ++/* ++** sub_0p5_f16_x_tied1: ++** fsub z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_0p5_f16_x_tied1, svfloat16_t, ++ z0 = svsub_n_f16_x (p0, z0, 0.5), ++ z0 = svsub_x (p0, z0, 0.5)) ++ ++/* ++** sub_0p5_f16_x_untied: ++** movprfx z0, z1 ++** fsub z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_0p5_f16_x_untied, svfloat16_t, ++ z0 = svsub_n_f16_x (p0, z1, 0.5), ++ z0 = svsub_x (p0, z1, 0.5)) ++ ++/* ++** sub_m1_f16_x_tied1: ++** fadd z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m1_f16_x_tied1, svfloat16_t, ++ z0 = svsub_n_f16_x (p0, z0, -1), ++ z0 = svsub_x (p0, z0, -1)) ++ ++/* ++** sub_m1_f16_x_untied: ++** movprfx z0, z1 ++** fadd z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m1_f16_x_untied, svfloat16_t, ++ z0 = svsub_n_f16_x (p0, z1, -1), ++ z0 = svsub_x (p0, z1, -1)) ++ ++/* ++** sub_m0p5_f16_x_tied1: ++** fadd z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m0p5_f16_x_tied1, svfloat16_t, ++ z0 = svsub_n_f16_x (p0, z0, -0.5), ++ z0 = svsub_x (p0, z0, -0.5)) ++ ++/* ++** sub_m0p5_f16_x_untied: ++** movprfx z0, z1 ++** fadd z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m0p5_f16_x_untied, svfloat16_t, ++ z0 = svsub_n_f16_x (p0, z1, -0.5), ++ z0 = svsub_x (p0, z1, -0.5)) ++ ++/* ++** sub_2_f16_x_tied1: ++** fmov (z[0-9]+\.h), #-2\.0(?:e\+0)? 
++** fadd z0\.h, (z0\.h, \1|\1, z0\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_2_f16_x_tied1, svfloat16_t, ++ z0 = svsub_n_f16_x (p0, z0, 2), ++ z0 = svsub_x (p0, z0, 2)) ++ ++/* ++** sub_2_f16_x_untied: ++** fmov (z[0-9]+\.h), #-2\.0(?:e\+0)? ++** fadd z0\.h, (z1\.h, \1|\1, z1\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_2_f16_x_untied, svfloat16_t, ++ z0 = svsub_n_f16_x (p0, z1, 2), ++ z0 = svsub_x (p0, z1, 2)) ++ ++/* ++** ptrue_sub_f16_x_tied1: ++** fsub z0\.h, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_f16_x_tied1, svfloat16_t, ++ z0 = svsub_f16_x (svptrue_b16 (), z0, z1), ++ z0 = svsub_x (svptrue_b16 (), z0, z1)) ++ ++/* ++** ptrue_sub_f16_x_tied2: ++** fsub z0\.h, z1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_f16_x_tied2, svfloat16_t, ++ z0 = svsub_f16_x (svptrue_b16 (), z1, z0), ++ z0 = svsub_x (svptrue_b16 (), z1, z0)) ++ ++/* ++** ptrue_sub_f16_x_untied: ++** fsub z0\.h, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_f16_x_untied, svfloat16_t, ++ z0 = svsub_f16_x (svptrue_b16 (), z1, z2), ++ z0 = svsub_x (svptrue_b16 (), z1, z2)) ++ ++/* ++** ptrue_sub_1_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_1_f16_x_tied1, svfloat16_t, ++ z0 = svsub_n_f16_x (svptrue_b16 (), z0, 1), ++ z0 = svsub_x (svptrue_b16 (), z0, 1)) ++ ++/* ++** ptrue_sub_1_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_1_f16_x_untied, svfloat16_t, ++ z0 = svsub_n_f16_x (svptrue_b16 (), z1, 1), ++ z0 = svsub_x (svptrue_b16 (), z1, 1)) ++ ++/* ++** ptrue_sub_0p5_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_0p5_f16_x_tied1, svfloat16_t, ++ z0 = svsub_n_f16_x (svptrue_b16 (), z0, 0.5), ++ z0 = svsub_x (svptrue_b16 (), z0, 0.5)) ++ ++/* ++** ptrue_sub_0p5_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_0p5_f16_x_untied, svfloat16_t, ++ z0 = svsub_n_f16_x (svptrue_b16 (), z1, 0.5), ++ z0 = svsub_x (svptrue_b16 (), z1, 0.5)) ++ ++/* ++** ptrue_sub_m1_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_m1_f16_x_tied1, svfloat16_t, ++ z0 = svsub_n_f16_x (svptrue_b16 (), z0, -1), ++ z0 = svsub_x (svptrue_b16 (), z0, -1)) ++ ++/* ++** ptrue_sub_m1_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_m1_f16_x_untied, svfloat16_t, ++ z0 = svsub_n_f16_x (svptrue_b16 (), z1, -1), ++ z0 = svsub_x (svptrue_b16 (), z1, -1)) ++ ++/* ++** ptrue_sub_m0p5_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_m0p5_f16_x_tied1, svfloat16_t, ++ z0 = svsub_n_f16_x (svptrue_b16 (), z0, -0.5), ++ z0 = svsub_x (svptrue_b16 (), z0, -0.5)) ++ ++/* ++** ptrue_sub_m0p5_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_m0p5_f16_x_untied, svfloat16_t, ++ z0 = svsub_n_f16_x (svptrue_b16 (), z1, -0.5), ++ z0 = svsub_x (svptrue_b16 (), z1, -0.5)) ++ ++/* ++** ptrue_sub_2_f16_x_tied1: ++** fmov (z[0-9]+\.h), #-2\.0(?:e\+0)? ++** fadd z0\.h, (z0\.h, \1|\1, z0\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_2_f16_x_tied1, svfloat16_t, ++ z0 = svsub_n_f16_x (svptrue_b16 (), z0, 2), ++ z0 = svsub_x (svptrue_b16 (), z0, 2)) ++ ++/* ++** ptrue_sub_2_f16_x_untied: ++** fmov (z[0-9]+\.h), #-2\.0(?:e\+0)? 
++** fadd z0\.h, (z1\.h, \1|\1, z1\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_2_f16_x_untied, svfloat16_t, ++ z0 = svsub_n_f16_x (svptrue_b16 (), z1, 2), ++ z0 = svsub_x (svptrue_b16 (), z1, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_f32.c +new file mode 100644 +index 000000000..05be52bad +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_f32.c +@@ -0,0 +1,577 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** sub_f32_m_tied1: ++** fsub z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f32_m_tied1, svfloat32_t, ++ z0 = svsub_f32_m (p0, z0, z1), ++ z0 = svsub_m (p0, z0, z1)) ++ ++/* ++** sub_f32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fsub z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f32_m_tied2, svfloat32_t, ++ z0 = svsub_f32_m (p0, z1, z0), ++ z0 = svsub_m (p0, z1, z0)) ++ ++/* ++** sub_f32_m_untied: ++** movprfx z0, z1 ++** fsub z0\.s, p0/m, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f32_m_untied, svfloat32_t, ++ z0 = svsub_f32_m (p0, z1, z2), ++ z0 = svsub_m (p0, z1, z2)) ++ ++/* ++** sub_s4_f32_m_tied1: ++** mov (z[0-9]+\.s), s4 ++** fsub z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (sub_s4_f32_m_tied1, svfloat32_t, float, ++ z0 = svsub_n_f32_m (p0, z0, d4), ++ z0 = svsub_m (p0, z0, d4)) ++ ++/* ++** sub_s4_f32_m_untied: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0, z1 ++** fsub z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (sub_s4_f32_m_untied, svfloat32_t, float, ++ z0 = svsub_n_f32_m (p0, z1, d4), ++ z0 = svsub_m (p0, z1, d4)) ++ ++/* ++** sub_1_f32_m_tied1: ++** fsub z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_f32_m_tied1, svfloat32_t, ++ z0 = svsub_n_f32_m (p0, z0, 1), ++ z0 = svsub_m (p0, z0, 1)) ++ ++/* ++** sub_1_f32_m_untied: ++** movprfx z0, z1 ++** fsub z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_f32_m_untied, svfloat32_t, ++ z0 = svsub_n_f32_m (p0, z1, 1), ++ z0 = svsub_m (p0, z1, 1)) ++ ++/* ++** sub_0p5_f32_m_tied1: ++** fsub z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_0p5_f32_m_tied1, svfloat32_t, ++ z0 = svsub_n_f32_m (p0, z0, 0.5), ++ z0 = svsub_m (p0, z0, 0.5)) ++ ++/* ++** sub_0p5_f32_m_untied: ++** movprfx z0, z1 ++** fsub z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_0p5_f32_m_untied, svfloat32_t, ++ z0 = svsub_n_f32_m (p0, z1, 0.5), ++ z0 = svsub_m (p0, z1, 0.5)) ++ ++/* ++** sub_m1_f32_m_tied1: ++** fadd z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m1_f32_m_tied1, svfloat32_t, ++ z0 = svsub_n_f32_m (p0, z0, -1), ++ z0 = svsub_m (p0, z0, -1)) ++ ++/* ++** sub_m1_f32_m_untied: ++** movprfx z0, z1 ++** fadd z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m1_f32_m_untied, svfloat32_t, ++ z0 = svsub_n_f32_m (p0, z1, -1), ++ z0 = svsub_m (p0, z1, -1)) ++ ++/* ++** sub_m0p5_f32_m_tied1: ++** fadd z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m0p5_f32_m_tied1, svfloat32_t, ++ z0 = svsub_n_f32_m (p0, z0, -0.5), ++ z0 = svsub_m (p0, z0, -0.5)) ++ ++/* ++** sub_m0p5_f32_m_untied: ++** movprfx z0, z1 ++** fadd z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m0p5_f32_m_untied, svfloat32_t, ++ z0 = svsub_n_f32_m (p0, z1, -0.5), ++ z0 = svsub_m (p0, z1, -0.5)) ++ ++/* ++** sub_m2_f32_m: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
++** fadd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m2_f32_m, svfloat32_t, ++ z0 = svsub_n_f32_m (p0, z0, -2), ++ z0 = svsub_m (p0, z0, -2)) ++ ++/* ++** sub_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fsub z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f32_z_tied1, svfloat32_t, ++ z0 = svsub_f32_z (p0, z0, z1), ++ z0 = svsub_z (p0, z0, z1)) ++ ++/* ++** sub_f32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** fsubr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f32_z_tied2, svfloat32_t, ++ z0 = svsub_f32_z (p0, z1, z0), ++ z0 = svsub_z (p0, z1, z0)) ++ ++/* ++** sub_f32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fsub z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** fsubr z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f32_z_untied, svfloat32_t, ++ z0 = svsub_f32_z (p0, z1, z2), ++ z0 = svsub_z (p0, z1, z2)) ++ ++/* ++** sub_s4_f32_z_tied1: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0\.s, p0/z, z0\.s ++** fsub z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (sub_s4_f32_z_tied1, svfloat32_t, float, ++ z0 = svsub_n_f32_z (p0, z0, d4), ++ z0 = svsub_z (p0, z0, d4)) ++ ++/* ++** sub_s4_f32_z_untied: ++** mov (z[0-9]+\.s), s4 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fsub z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** fsubr z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (sub_s4_f32_z_untied, svfloat32_t, float, ++ z0 = svsub_n_f32_z (p0, z1, d4), ++ z0 = svsub_z (p0, z1, d4)) ++ ++/* ++** sub_1_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fsub z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_f32_z_tied1, svfloat32_t, ++ z0 = svsub_n_f32_z (p0, z0, 1), ++ z0 = svsub_z (p0, z0, 1)) ++ ++/* ++** sub_1_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** fsub z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_f32_z_untied, svfloat32_t, ++ z0 = svsub_n_f32_z (p0, z1, 1), ++ z0 = svsub_z (p0, z1, 1)) ++ ++/* ++** sub_0p5_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fsub z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_0p5_f32_z_tied1, svfloat32_t, ++ z0 = svsub_n_f32_z (p0, z0, 0.5), ++ z0 = svsub_z (p0, z0, 0.5)) ++ ++/* ++** sub_0p5_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** fsub z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_0p5_f32_z_untied, svfloat32_t, ++ z0 = svsub_n_f32_z (p0, z1, 0.5), ++ z0 = svsub_z (p0, z1, 0.5)) ++ ++/* ++** sub_m1_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fadd z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m1_f32_z_tied1, svfloat32_t, ++ z0 = svsub_n_f32_z (p0, z0, -1), ++ z0 = svsub_z (p0, z0, -1)) ++ ++/* ++** sub_m1_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** fadd z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m1_f32_z_untied, svfloat32_t, ++ z0 = svsub_n_f32_z (p0, z1, -1), ++ z0 = svsub_z (p0, z1, -1)) ++ ++/* ++** sub_m0p5_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fadd z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m0p5_f32_z_tied1, svfloat32_t, ++ z0 = svsub_n_f32_z (p0, z0, -0.5), ++ z0 = svsub_z (p0, z0, -0.5)) ++ ++/* ++** sub_m0p5_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** fadd z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m0p5_f32_z_untied, svfloat32_t, ++ z0 = svsub_n_f32_z (p0, z1, -0.5), ++ z0 = svsub_z (p0, z1, -0.5)) ++ ++/* ++** sub_m2_f32_z: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
++** movprfx z0\.s, p0/z, z0\.s ++** fadd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m2_f32_z, svfloat32_t, ++ z0 = svsub_n_f32_z (p0, z0, -2), ++ z0 = svsub_z (p0, z0, -2)) ++ ++/* ++** sub_f32_x_tied1: ++** fsub z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f32_x_tied1, svfloat32_t, ++ z0 = svsub_f32_x (p0, z0, z1), ++ z0 = svsub_x (p0, z0, z1)) ++ ++/* ++** sub_f32_x_tied2: ++** fsubr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f32_x_tied2, svfloat32_t, ++ z0 = svsub_f32_x (p0, z1, z0), ++ z0 = svsub_x (p0, z1, z0)) ++ ++/* ++** sub_f32_x_untied: ++** ( ++** movprfx z0, z1 ++** fsub z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0, z2 ++** fsubr z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f32_x_untied, svfloat32_t, ++ z0 = svsub_f32_x (p0, z1, z2), ++ z0 = svsub_x (p0, z1, z2)) ++ ++/* ++** sub_s4_f32_x_tied1: ++** mov (z[0-9]+\.s), s4 ++** fsub z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (sub_s4_f32_x_tied1, svfloat32_t, float, ++ z0 = svsub_n_f32_x (p0, z0, d4), ++ z0 = svsub_x (p0, z0, d4)) ++ ++/* ++** sub_s4_f32_x_untied: { xfail *-*-* } ++** mov z0\.s, s4 ++** fsubr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZD (sub_s4_f32_x_untied, svfloat32_t, float, ++ z0 = svsub_n_f32_x (p0, z1, d4), ++ z0 = svsub_x (p0, z1, d4)) ++ ++/* ++** sub_1_f32_x_tied1: ++** fsub z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_f32_x_tied1, svfloat32_t, ++ z0 = svsub_n_f32_x (p0, z0, 1), ++ z0 = svsub_x (p0, z0, 1)) ++ ++/* ++** sub_1_f32_x_untied: ++** movprfx z0, z1 ++** fsub z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_f32_x_untied, svfloat32_t, ++ z0 = svsub_n_f32_x (p0, z1, 1), ++ z0 = svsub_x (p0, z1, 1)) ++ ++/* ++** sub_0p5_f32_x_tied1: ++** fsub z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_0p5_f32_x_tied1, svfloat32_t, ++ z0 = svsub_n_f32_x (p0, z0, 0.5), ++ z0 = svsub_x (p0, z0, 0.5)) ++ ++/* ++** sub_0p5_f32_x_untied: ++** movprfx z0, z1 ++** fsub z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_0p5_f32_x_untied, svfloat32_t, ++ z0 = svsub_n_f32_x (p0, z1, 0.5), ++ z0 = svsub_x (p0, z1, 0.5)) ++ ++/* ++** sub_m1_f32_x_tied1: ++** fadd z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m1_f32_x_tied1, svfloat32_t, ++ z0 = svsub_n_f32_x (p0, z0, -1), ++ z0 = svsub_x (p0, z0, -1)) ++ ++/* ++** sub_m1_f32_x_untied: ++** movprfx z0, z1 ++** fadd z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m1_f32_x_untied, svfloat32_t, ++ z0 = svsub_n_f32_x (p0, z1, -1), ++ z0 = svsub_x (p0, z1, -1)) ++ ++/* ++** sub_m0p5_f32_x_tied1: ++** fadd z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m0p5_f32_x_tied1, svfloat32_t, ++ z0 = svsub_n_f32_x (p0, z0, -0.5), ++ z0 = svsub_x (p0, z0, -0.5)) ++ ++/* ++** sub_m0p5_f32_x_untied: ++** movprfx z0, z1 ++** fadd z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m0p5_f32_x_untied, svfloat32_t, ++ z0 = svsub_n_f32_x (p0, z1, -0.5), ++ z0 = svsub_x (p0, z1, -0.5)) ++ ++/* ++** sub_2_f32_x_tied1: ++** fmov (z[0-9]+\.s), #-2\.0(?:e\+0)? ++** fadd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_2_f32_x_tied1, svfloat32_t, ++ z0 = svsub_n_f32_x (p0, z0, 2), ++ z0 = svsub_x (p0, z0, 2)) ++ ++/* ++** sub_2_f32_x_untied: ++** fmov z0\.s, #-2\.0(?:e\+0)? 
++** fadd z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sub_2_f32_x_untied, svfloat32_t, ++ z0 = svsub_n_f32_x (p0, z1, 2), ++ z0 = svsub_x (p0, z1, 2)) ++ ++/* ++** ptrue_sub_f32_x_tied1: ++** fsub z0\.s, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_f32_x_tied1, svfloat32_t, ++ z0 = svsub_f32_x (svptrue_b32 (), z0, z1), ++ z0 = svsub_x (svptrue_b32 (), z0, z1)) ++ ++/* ++** ptrue_sub_f32_x_tied2: ++** fsub z0\.s, z1\.s, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_f32_x_tied2, svfloat32_t, ++ z0 = svsub_f32_x (svptrue_b32 (), z1, z0), ++ z0 = svsub_x (svptrue_b32 (), z1, z0)) ++ ++/* ++** ptrue_sub_f32_x_untied: ++** fsub z0\.s, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_f32_x_untied, svfloat32_t, ++ z0 = svsub_f32_x (svptrue_b32 (), z1, z2), ++ z0 = svsub_x (svptrue_b32 (), z1, z2)) ++ ++/* ++** ptrue_sub_1_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_1_f32_x_tied1, svfloat32_t, ++ z0 = svsub_n_f32_x (svptrue_b32 (), z0, 1), ++ z0 = svsub_x (svptrue_b32 (), z0, 1)) ++ ++/* ++** ptrue_sub_1_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_1_f32_x_untied, svfloat32_t, ++ z0 = svsub_n_f32_x (svptrue_b32 (), z1, 1), ++ z0 = svsub_x (svptrue_b32 (), z1, 1)) ++ ++/* ++** ptrue_sub_0p5_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_0p5_f32_x_tied1, svfloat32_t, ++ z0 = svsub_n_f32_x (svptrue_b32 (), z0, 0.5), ++ z0 = svsub_x (svptrue_b32 (), z0, 0.5)) ++ ++/* ++** ptrue_sub_0p5_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_0p5_f32_x_untied, svfloat32_t, ++ z0 = svsub_n_f32_x (svptrue_b32 (), z1, 0.5), ++ z0 = svsub_x (svptrue_b32 (), z1, 0.5)) ++ ++/* ++** ptrue_sub_m1_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_m1_f32_x_tied1, svfloat32_t, ++ z0 = svsub_n_f32_x (svptrue_b32 (), z0, -1), ++ z0 = svsub_x (svptrue_b32 (), z0, -1)) ++ ++/* ++** ptrue_sub_m1_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_m1_f32_x_untied, svfloat32_t, ++ z0 = svsub_n_f32_x (svptrue_b32 (), z1, -1), ++ z0 = svsub_x (svptrue_b32 (), z1, -1)) ++ ++/* ++** ptrue_sub_m0p5_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_m0p5_f32_x_tied1, svfloat32_t, ++ z0 = svsub_n_f32_x (svptrue_b32 (), z0, -0.5), ++ z0 = svsub_x (svptrue_b32 (), z0, -0.5)) ++ ++/* ++** ptrue_sub_m0p5_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_m0p5_f32_x_untied, svfloat32_t, ++ z0 = svsub_n_f32_x (svptrue_b32 (), z1, -0.5), ++ z0 = svsub_x (svptrue_b32 (), z1, -0.5)) ++ ++/* ++** ptrue_sub_2_f32_x_tied1: ++** fmov (z[0-9]+\.s), #-2\.0(?:e\+0)? ++** fadd z0\.s, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_2_f32_x_tied1, svfloat32_t, ++ z0 = svsub_n_f32_x (svptrue_b32 (), z0, 2), ++ z0 = svsub_x (svptrue_b32 (), z0, 2)) ++ ++/* ++** ptrue_sub_2_f32_x_untied: ++** fmov (z[0-9]+\.s), #-2\.0(?:e\+0)? 
++** fadd z0\.s, (z1\.s, \1|\1, z1\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_2_f32_x_untied, svfloat32_t, ++ z0 = svsub_n_f32_x (svptrue_b32 (), z1, 2), ++ z0 = svsub_x (svptrue_b32 (), z1, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_f32_notrap.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_f32_notrap.c +new file mode 100644 +index 000000000..eb79a253a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_f32_notrap.c +@@ -0,0 +1,572 @@ ++/* { dg-additional-options "-fno-trapping-math" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** sub_f32_m_tied1: ++** fsub z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f32_m_tied1, svfloat32_t, ++ z0 = svsub_f32_m (p0, z0, z1), ++ z0 = svsub_m (p0, z0, z1)) ++ ++/* ++** sub_f32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fsub z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f32_m_tied2, svfloat32_t, ++ z0 = svsub_f32_m (p0, z1, z0), ++ z0 = svsub_m (p0, z1, z0)) ++ ++/* ++** sub_f32_m_untied: ++** movprfx z0, z1 ++** fsub z0\.s, p0/m, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f32_m_untied, svfloat32_t, ++ z0 = svsub_f32_m (p0, z1, z2), ++ z0 = svsub_m (p0, z1, z2)) ++ ++/* ++** sub_s4_f32_m_tied1: ++** mov (z[0-9]+\.s), s4 ++** fsub z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (sub_s4_f32_m_tied1, svfloat32_t, float, ++ z0 = svsub_n_f32_m (p0, z0, d4), ++ z0 = svsub_m (p0, z0, d4)) ++ ++/* ++** sub_s4_f32_m_untied: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0, z1 ++** fsub z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (sub_s4_f32_m_untied, svfloat32_t, float, ++ z0 = svsub_n_f32_m (p0, z1, d4), ++ z0 = svsub_m (p0, z1, d4)) ++ ++/* ++** sub_1_f32_m_tied1: ++** fsub z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_f32_m_tied1, svfloat32_t, ++ z0 = svsub_n_f32_m (p0, z0, 1), ++ z0 = svsub_m (p0, z0, 1)) ++ ++/* ++** sub_1_f32_m_untied: ++** movprfx z0, z1 ++** fsub z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_f32_m_untied, svfloat32_t, ++ z0 = svsub_n_f32_m (p0, z1, 1), ++ z0 = svsub_m (p0, z1, 1)) ++ ++/* ++** sub_0p5_f32_m_tied1: ++** fsub z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_0p5_f32_m_tied1, svfloat32_t, ++ z0 = svsub_n_f32_m (p0, z0, 0.5), ++ z0 = svsub_m (p0, z0, 0.5)) ++ ++/* ++** sub_0p5_f32_m_untied: ++** movprfx z0, z1 ++** fsub z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_0p5_f32_m_untied, svfloat32_t, ++ z0 = svsub_n_f32_m (p0, z1, 0.5), ++ z0 = svsub_m (p0, z1, 0.5)) ++ ++/* ++** sub_m1_f32_m_tied1: ++** fadd z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m1_f32_m_tied1, svfloat32_t, ++ z0 = svsub_n_f32_m (p0, z0, -1), ++ z0 = svsub_m (p0, z0, -1)) ++ ++/* ++** sub_m1_f32_m_untied: ++** movprfx z0, z1 ++** fadd z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m1_f32_m_untied, svfloat32_t, ++ z0 = svsub_n_f32_m (p0, z1, -1), ++ z0 = svsub_m (p0, z1, -1)) ++ ++/* ++** sub_m0p5_f32_m_tied1: ++** fadd z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m0p5_f32_m_tied1, svfloat32_t, ++ z0 = svsub_n_f32_m (p0, z0, -0.5), ++ z0 = svsub_m (p0, z0, -0.5)) ++ ++/* ++** sub_m0p5_f32_m_untied: ++** movprfx z0, z1 ++** fadd z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m0p5_f32_m_untied, svfloat32_t, ++ z0 = svsub_n_f32_m (p0, z1, -0.5), ++ z0 = svsub_m (p0, z1, -0.5)) ++ ++/* ++** sub_m2_f32_m: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
++** fadd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m2_f32_m, svfloat32_t, ++ z0 = svsub_n_f32_m (p0, z0, -2), ++ z0 = svsub_m (p0, z0, -2)) ++ ++/* ++** sub_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fsub z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f32_z_tied1, svfloat32_t, ++ z0 = svsub_f32_z (p0, z0, z1), ++ z0 = svsub_z (p0, z0, z1)) ++ ++/* ++** sub_f32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** fsubr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f32_z_tied2, svfloat32_t, ++ z0 = svsub_f32_z (p0, z1, z0), ++ z0 = svsub_z (p0, z1, z0)) ++ ++/* ++** sub_f32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fsub z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** fsubr z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f32_z_untied, svfloat32_t, ++ z0 = svsub_f32_z (p0, z1, z2), ++ z0 = svsub_z (p0, z1, z2)) ++ ++/* ++** sub_s4_f32_z_tied1: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0\.s, p0/z, z0\.s ++** fsub z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (sub_s4_f32_z_tied1, svfloat32_t, float, ++ z0 = svsub_n_f32_z (p0, z0, d4), ++ z0 = svsub_z (p0, z0, d4)) ++ ++/* ++** sub_s4_f32_z_untied: ++** mov (z[0-9]+\.s), s4 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fsub z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** fsubr z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (sub_s4_f32_z_untied, svfloat32_t, float, ++ z0 = svsub_n_f32_z (p0, z1, d4), ++ z0 = svsub_z (p0, z1, d4)) ++ ++/* ++** sub_1_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fsub z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_f32_z_tied1, svfloat32_t, ++ z0 = svsub_n_f32_z (p0, z0, 1), ++ z0 = svsub_z (p0, z0, 1)) ++ ++/* ++** sub_1_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** fsub z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_f32_z_untied, svfloat32_t, ++ z0 = svsub_n_f32_z (p0, z1, 1), ++ z0 = svsub_z (p0, z1, 1)) ++ ++/* ++** sub_0p5_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fsub z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_0p5_f32_z_tied1, svfloat32_t, ++ z0 = svsub_n_f32_z (p0, z0, 0.5), ++ z0 = svsub_z (p0, z0, 0.5)) ++ ++/* ++** sub_0p5_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** fsub z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_0p5_f32_z_untied, svfloat32_t, ++ z0 = svsub_n_f32_z (p0, z1, 0.5), ++ z0 = svsub_z (p0, z1, 0.5)) ++ ++/* ++** sub_m1_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fadd z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m1_f32_z_tied1, svfloat32_t, ++ z0 = svsub_n_f32_z (p0, z0, -1), ++ z0 = svsub_z (p0, z0, -1)) ++ ++/* ++** sub_m1_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** fadd z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m1_f32_z_untied, svfloat32_t, ++ z0 = svsub_n_f32_z (p0, z1, -1), ++ z0 = svsub_z (p0, z1, -1)) ++ ++/* ++** sub_m0p5_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fadd z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m0p5_f32_z_tied1, svfloat32_t, ++ z0 = svsub_n_f32_z (p0, z0, -0.5), ++ z0 = svsub_z (p0, z0, -0.5)) ++ ++/* ++** sub_m0p5_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** fadd z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m0p5_f32_z_untied, svfloat32_t, ++ z0 = svsub_n_f32_z (p0, z1, -0.5), ++ z0 = svsub_z (p0, z1, -0.5)) ++ ++/* ++** sub_m2_f32_z: ++** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
++** movprfx z0\.s, p0/z, z0\.s ++** fadd z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m2_f32_z, svfloat32_t, ++ z0 = svsub_n_f32_z (p0, z0, -2), ++ z0 = svsub_z (p0, z0, -2)) ++ ++/* ++** sub_f32_x_tied1: ++** fsub z0\.s, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f32_x_tied1, svfloat32_t, ++ z0 = svsub_f32_x (p0, z0, z1), ++ z0 = svsub_x (p0, z0, z1)) ++ ++/* ++** sub_f32_x_tied2: ++** fsub z0\.s, z1\.s, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f32_x_tied2, svfloat32_t, ++ z0 = svsub_f32_x (p0, z1, z0), ++ z0 = svsub_x (p0, z1, z0)) ++ ++/* ++** sub_f32_x_untied: ++** fsub z0\.s, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f32_x_untied, svfloat32_t, ++ z0 = svsub_f32_x (p0, z1, z2), ++ z0 = svsub_x (p0, z1, z2)) ++ ++/* ++** sub_s4_f32_x_tied1: ++** mov (z[0-9]+\.s), s4 ++** fsub z0\.s, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (sub_s4_f32_x_tied1, svfloat32_t, float, ++ z0 = svsub_n_f32_x (p0, z0, d4), ++ z0 = svsub_x (p0, z0, d4)) ++ ++/* ++** sub_s4_f32_x_untied: ++** mov (z[0-9]+\.s), s4 ++** fsub z0\.s, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (sub_s4_f32_x_untied, svfloat32_t, float, ++ z0 = svsub_n_f32_x (p0, z1, d4), ++ z0 = svsub_x (p0, z1, d4)) ++ ++/* ++** sub_1_f32_x_tied1: ++** fsub z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_f32_x_tied1, svfloat32_t, ++ z0 = svsub_n_f32_x (p0, z0, 1), ++ z0 = svsub_x (p0, z0, 1)) ++ ++/* ++** sub_1_f32_x_untied: ++** movprfx z0, z1 ++** fsub z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_f32_x_untied, svfloat32_t, ++ z0 = svsub_n_f32_x (p0, z1, 1), ++ z0 = svsub_x (p0, z1, 1)) ++ ++/* ++** sub_0p5_f32_x_tied1: ++** fsub z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_0p5_f32_x_tied1, svfloat32_t, ++ z0 = svsub_n_f32_x (p0, z0, 0.5), ++ z0 = svsub_x (p0, z0, 0.5)) ++ ++/* ++** sub_0p5_f32_x_untied: ++** movprfx z0, z1 ++** fsub z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_0p5_f32_x_untied, svfloat32_t, ++ z0 = svsub_n_f32_x (p0, z1, 0.5), ++ z0 = svsub_x (p0, z1, 0.5)) ++ ++/* ++** sub_m1_f32_x_tied1: ++** fadd z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m1_f32_x_tied1, svfloat32_t, ++ z0 = svsub_n_f32_x (p0, z0, -1), ++ z0 = svsub_x (p0, z0, -1)) ++ ++/* ++** sub_m1_f32_x_untied: ++** movprfx z0, z1 ++** fadd z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m1_f32_x_untied, svfloat32_t, ++ z0 = svsub_n_f32_x (p0, z1, -1), ++ z0 = svsub_x (p0, z1, -1)) ++ ++/* ++** sub_m0p5_f32_x_tied1: ++** fadd z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m0p5_f32_x_tied1, svfloat32_t, ++ z0 = svsub_n_f32_x (p0, z0, -0.5), ++ z0 = svsub_x (p0, z0, -0.5)) ++ ++/* ++** sub_m0p5_f32_x_untied: ++** movprfx z0, z1 ++** fadd z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m0p5_f32_x_untied, svfloat32_t, ++ z0 = svsub_n_f32_x (p0, z1, -0.5), ++ z0 = svsub_x (p0, z1, -0.5)) ++ ++/* ++** sub_2_f32_x_tied1: ++** fmov (z[0-9]+\.s), #-2\.0(?:e\+0)? ++** fadd z0\.s, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_2_f32_x_tied1, svfloat32_t, ++ z0 = svsub_n_f32_x (p0, z0, 2), ++ z0 = svsub_x (p0, z0, 2)) ++ ++/* ++** sub_2_f32_x_untied: ++** fmov (z[0-9]+\.s), #-2\.0(?:e\+0)? 
++** fadd z0\.s, (z1\.s, \1|\1, z1\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_2_f32_x_untied, svfloat32_t, ++ z0 = svsub_n_f32_x (p0, z1, 2), ++ z0 = svsub_x (p0, z1, 2)) ++ ++/* ++** ptrue_sub_f32_x_tied1: ++** fsub z0\.s, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_f32_x_tied1, svfloat32_t, ++ z0 = svsub_f32_x (svptrue_b32 (), z0, z1), ++ z0 = svsub_x (svptrue_b32 (), z0, z1)) ++ ++/* ++** ptrue_sub_f32_x_tied2: ++** fsub z0\.s, z1\.s, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_f32_x_tied2, svfloat32_t, ++ z0 = svsub_f32_x (svptrue_b32 (), z1, z0), ++ z0 = svsub_x (svptrue_b32 (), z1, z0)) ++ ++/* ++** ptrue_sub_f32_x_untied: ++** fsub z0\.s, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_f32_x_untied, svfloat32_t, ++ z0 = svsub_f32_x (svptrue_b32 (), z1, z2), ++ z0 = svsub_x (svptrue_b32 (), z1, z2)) ++ ++/* ++** ptrue_sub_1_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_1_f32_x_tied1, svfloat32_t, ++ z0 = svsub_n_f32_x (svptrue_b32 (), z0, 1), ++ z0 = svsub_x (svptrue_b32 (), z0, 1)) ++ ++/* ++** ptrue_sub_1_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_1_f32_x_untied, svfloat32_t, ++ z0 = svsub_n_f32_x (svptrue_b32 (), z1, 1), ++ z0 = svsub_x (svptrue_b32 (), z1, 1)) ++ ++/* ++** ptrue_sub_0p5_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_0p5_f32_x_tied1, svfloat32_t, ++ z0 = svsub_n_f32_x (svptrue_b32 (), z0, 0.5), ++ z0 = svsub_x (svptrue_b32 (), z0, 0.5)) ++ ++/* ++** ptrue_sub_0p5_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_0p5_f32_x_untied, svfloat32_t, ++ z0 = svsub_n_f32_x (svptrue_b32 (), z1, 0.5), ++ z0 = svsub_x (svptrue_b32 (), z1, 0.5)) ++ ++/* ++** ptrue_sub_m1_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_m1_f32_x_tied1, svfloat32_t, ++ z0 = svsub_n_f32_x (svptrue_b32 (), z0, -1), ++ z0 = svsub_x (svptrue_b32 (), z0, -1)) ++ ++/* ++** ptrue_sub_m1_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_m1_f32_x_untied, svfloat32_t, ++ z0 = svsub_n_f32_x (svptrue_b32 (), z1, -1), ++ z0 = svsub_x (svptrue_b32 (), z1, -1)) ++ ++/* ++** ptrue_sub_m0p5_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_m0p5_f32_x_tied1, svfloat32_t, ++ z0 = svsub_n_f32_x (svptrue_b32 (), z0, -0.5), ++ z0 = svsub_x (svptrue_b32 (), z0, -0.5)) ++ ++/* ++** ptrue_sub_m0p5_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_m0p5_f32_x_untied, svfloat32_t, ++ z0 = svsub_n_f32_x (svptrue_b32 (), z1, -0.5), ++ z0 = svsub_x (svptrue_b32 (), z1, -0.5)) ++ ++/* ++** ptrue_sub_2_f32_x_tied1: ++** fmov (z[0-9]+\.s), #-2\.0(?:e\+0)? ++** fadd z0\.s, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_2_f32_x_tied1, svfloat32_t, ++ z0 = svsub_n_f32_x (svptrue_b32 (), z0, 2), ++ z0 = svsub_x (svptrue_b32 (), z0, 2)) ++ ++/* ++** ptrue_sub_2_f32_x_untied: ++** fmov (z[0-9]+\.s), #-2\.0(?:e\+0)? 
++** fadd z0\.s, (z1\.s, \1|\1, z1\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_2_f32_x_untied, svfloat32_t, ++ z0 = svsub_n_f32_x (svptrue_b32 (), z1, 2), ++ z0 = svsub_x (svptrue_b32 (), z1, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_f64.c +new file mode 100644 +index 000000000..2179382c3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_f64.c +@@ -0,0 +1,577 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** sub_f64_m_tied1: ++** fsub z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f64_m_tied1, svfloat64_t, ++ z0 = svsub_f64_m (p0, z0, z1), ++ z0 = svsub_m (p0, z0, z1)) ++ ++/* ++** sub_f64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fsub z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f64_m_tied2, svfloat64_t, ++ z0 = svsub_f64_m (p0, z1, z0), ++ z0 = svsub_m (p0, z1, z0)) ++ ++/* ++** sub_f64_m_untied: ++** movprfx z0, z1 ++** fsub z0\.d, p0/m, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f64_m_untied, svfloat64_t, ++ z0 = svsub_f64_m (p0, z1, z2), ++ z0 = svsub_m (p0, z1, z2)) ++ ++/* ++** sub_d4_f64_m_tied1: ++** mov (z[0-9]+\.d), d4 ++** fsub z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (sub_d4_f64_m_tied1, svfloat64_t, double, ++ z0 = svsub_n_f64_m (p0, z0, d4), ++ z0 = svsub_m (p0, z0, d4)) ++ ++/* ++** sub_d4_f64_m_untied: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0, z1 ++** fsub z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (sub_d4_f64_m_untied, svfloat64_t, double, ++ z0 = svsub_n_f64_m (p0, z1, d4), ++ z0 = svsub_m (p0, z1, d4)) ++ ++/* ++** sub_1_f64_m_tied1: ++** fsub z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_f64_m_tied1, svfloat64_t, ++ z0 = svsub_n_f64_m (p0, z0, 1), ++ z0 = svsub_m (p0, z0, 1)) ++ ++/* ++** sub_1_f64_m_untied: ++** movprfx z0, z1 ++** fsub z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_f64_m_untied, svfloat64_t, ++ z0 = svsub_n_f64_m (p0, z1, 1), ++ z0 = svsub_m (p0, z1, 1)) ++ ++/* ++** sub_0p5_f64_m_tied1: ++** fsub z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_0p5_f64_m_tied1, svfloat64_t, ++ z0 = svsub_n_f64_m (p0, z0, 0.5), ++ z0 = svsub_m (p0, z0, 0.5)) ++ ++/* ++** sub_0p5_f64_m_untied: ++** movprfx z0, z1 ++** fsub z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_0p5_f64_m_untied, svfloat64_t, ++ z0 = svsub_n_f64_m (p0, z1, 0.5), ++ z0 = svsub_m (p0, z1, 0.5)) ++ ++/* ++** sub_m1_f64_m_tied1: ++** fadd z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m1_f64_m_tied1, svfloat64_t, ++ z0 = svsub_n_f64_m (p0, z0, -1), ++ z0 = svsub_m (p0, z0, -1)) ++ ++/* ++** sub_m1_f64_m_untied: ++** movprfx z0, z1 ++** fadd z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m1_f64_m_untied, svfloat64_t, ++ z0 = svsub_n_f64_m (p0, z1, -1), ++ z0 = svsub_m (p0, z1, -1)) ++ ++/* ++** sub_m0p5_f64_m_tied1: ++** fadd z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m0p5_f64_m_tied1, svfloat64_t, ++ z0 = svsub_n_f64_m (p0, z0, -0.5), ++ z0 = svsub_m (p0, z0, -0.5)) ++ ++/* ++** sub_m0p5_f64_m_untied: ++** movprfx z0, z1 ++** fadd z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m0p5_f64_m_untied, svfloat64_t, ++ z0 = svsub_n_f64_m (p0, z1, -0.5), ++ z0 = svsub_m (p0, z1, -0.5)) ++ ++/* ++** sub_m2_f64_m: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
++** fadd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m2_f64_m, svfloat64_t, ++ z0 = svsub_n_f64_m (p0, z0, -2), ++ z0 = svsub_m (p0, z0, -2)) ++ ++/* ++** sub_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fsub z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f64_z_tied1, svfloat64_t, ++ z0 = svsub_f64_z (p0, z0, z1), ++ z0 = svsub_z (p0, z0, z1)) ++ ++/* ++** sub_f64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** fsubr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f64_z_tied2, svfloat64_t, ++ z0 = svsub_f64_z (p0, z1, z0), ++ z0 = svsub_z (p0, z1, z0)) ++ ++/* ++** sub_f64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fsub z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** fsubr z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f64_z_untied, svfloat64_t, ++ z0 = svsub_f64_z (p0, z1, z2), ++ z0 = svsub_z (p0, z1, z2)) ++ ++/* ++** sub_d4_f64_z_tied1: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0\.d, p0/z, z0\.d ++** fsub z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (sub_d4_f64_z_tied1, svfloat64_t, double, ++ z0 = svsub_n_f64_z (p0, z0, d4), ++ z0 = svsub_z (p0, z0, d4)) ++ ++/* ++** sub_d4_f64_z_untied: ++** mov (z[0-9]+\.d), d4 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fsub z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** fsubr z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (sub_d4_f64_z_untied, svfloat64_t, double, ++ z0 = svsub_n_f64_z (p0, z1, d4), ++ z0 = svsub_z (p0, z1, d4)) ++ ++/* ++** sub_1_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fsub z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_f64_z_tied1, svfloat64_t, ++ z0 = svsub_n_f64_z (p0, z0, 1), ++ z0 = svsub_z (p0, z0, 1)) ++ ++/* ++** sub_1_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** fsub z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_f64_z_untied, svfloat64_t, ++ z0 = svsub_n_f64_z (p0, z1, 1), ++ z0 = svsub_z (p0, z1, 1)) ++ ++/* ++** sub_0p5_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fsub z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_0p5_f64_z_tied1, svfloat64_t, ++ z0 = svsub_n_f64_z (p0, z0, 0.5), ++ z0 = svsub_z (p0, z0, 0.5)) ++ ++/* ++** sub_0p5_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** fsub z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_0p5_f64_z_untied, svfloat64_t, ++ z0 = svsub_n_f64_z (p0, z1, 0.5), ++ z0 = svsub_z (p0, z1, 0.5)) ++ ++/* ++** sub_m1_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fadd z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m1_f64_z_tied1, svfloat64_t, ++ z0 = svsub_n_f64_z (p0, z0, -1), ++ z0 = svsub_z (p0, z0, -1)) ++ ++/* ++** sub_m1_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** fadd z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m1_f64_z_untied, svfloat64_t, ++ z0 = svsub_n_f64_z (p0, z1, -1), ++ z0 = svsub_z (p0, z1, -1)) ++ ++/* ++** sub_m0p5_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fadd z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m0p5_f64_z_tied1, svfloat64_t, ++ z0 = svsub_n_f64_z (p0, z0, -0.5), ++ z0 = svsub_z (p0, z0, -0.5)) ++ ++/* ++** sub_m0p5_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** fadd z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m0p5_f64_z_untied, svfloat64_t, ++ z0 = svsub_n_f64_z (p0, z1, -0.5), ++ z0 = svsub_z (p0, z1, -0.5)) ++ ++/* ++** sub_m2_f64_z: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
++** movprfx z0\.d, p0/z, z0\.d ++** fadd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m2_f64_z, svfloat64_t, ++ z0 = svsub_n_f64_z (p0, z0, -2), ++ z0 = svsub_z (p0, z0, -2)) ++ ++/* ++** sub_f64_x_tied1: ++** fsub z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f64_x_tied1, svfloat64_t, ++ z0 = svsub_f64_x (p0, z0, z1), ++ z0 = svsub_x (p0, z0, z1)) ++ ++/* ++** sub_f64_x_tied2: ++** fsubr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f64_x_tied2, svfloat64_t, ++ z0 = svsub_f64_x (p0, z1, z0), ++ z0 = svsub_x (p0, z1, z0)) ++ ++/* ++** sub_f64_x_untied: ++** ( ++** movprfx z0, z1 ++** fsub z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0, z2 ++** fsubr z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f64_x_untied, svfloat64_t, ++ z0 = svsub_f64_x (p0, z1, z2), ++ z0 = svsub_x (p0, z1, z2)) ++ ++/* ++** sub_d4_f64_x_tied1: ++** mov (z[0-9]+\.d), d4 ++** fsub z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (sub_d4_f64_x_tied1, svfloat64_t, double, ++ z0 = svsub_n_f64_x (p0, z0, d4), ++ z0 = svsub_x (p0, z0, d4)) ++ ++/* ++** sub_d4_f64_x_untied: { xfail *-*-* } ++** mov z0\.d, d4 ++** fsubr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZD (sub_d4_f64_x_untied, svfloat64_t, double, ++ z0 = svsub_n_f64_x (p0, z1, d4), ++ z0 = svsub_x (p0, z1, d4)) ++ ++/* ++** sub_1_f64_x_tied1: ++** fsub z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_f64_x_tied1, svfloat64_t, ++ z0 = svsub_n_f64_x (p0, z0, 1), ++ z0 = svsub_x (p0, z0, 1)) ++ ++/* ++** sub_1_f64_x_untied: ++** movprfx z0, z1 ++** fsub z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_f64_x_untied, svfloat64_t, ++ z0 = svsub_n_f64_x (p0, z1, 1), ++ z0 = svsub_x (p0, z1, 1)) ++ ++/* ++** sub_0p5_f64_x_tied1: ++** fsub z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_0p5_f64_x_tied1, svfloat64_t, ++ z0 = svsub_n_f64_x (p0, z0, 0.5), ++ z0 = svsub_x (p0, z0, 0.5)) ++ ++/* ++** sub_0p5_f64_x_untied: ++** movprfx z0, z1 ++** fsub z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_0p5_f64_x_untied, svfloat64_t, ++ z0 = svsub_n_f64_x (p0, z1, 0.5), ++ z0 = svsub_x (p0, z1, 0.5)) ++ ++/* ++** sub_m1_f64_x_tied1: ++** fadd z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m1_f64_x_tied1, svfloat64_t, ++ z0 = svsub_n_f64_x (p0, z0, -1), ++ z0 = svsub_x (p0, z0, -1)) ++ ++/* ++** sub_m1_f64_x_untied: ++** movprfx z0, z1 ++** fadd z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m1_f64_x_untied, svfloat64_t, ++ z0 = svsub_n_f64_x (p0, z1, -1), ++ z0 = svsub_x (p0, z1, -1)) ++ ++/* ++** sub_m0p5_f64_x_tied1: ++** fadd z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m0p5_f64_x_tied1, svfloat64_t, ++ z0 = svsub_n_f64_x (p0, z0, -0.5), ++ z0 = svsub_x (p0, z0, -0.5)) ++ ++/* ++** sub_m0p5_f64_x_untied: ++** movprfx z0, z1 ++** fadd z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m0p5_f64_x_untied, svfloat64_t, ++ z0 = svsub_n_f64_x (p0, z1, -0.5), ++ z0 = svsub_x (p0, z1, -0.5)) ++ ++/* ++** sub_2_f64_x_tied1: ++** fmov (z[0-9]+\.d), #-2\.0(?:e\+0)? ++** fadd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_2_f64_x_tied1, svfloat64_t, ++ z0 = svsub_n_f64_x (p0, z0, 2), ++ z0 = svsub_x (p0, z0, 2)) ++ ++/* ++** sub_2_f64_x_untied: ++** fmov z0\.d, #-2\.0(?:e\+0)? 
++** fadd z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (sub_2_f64_x_untied, svfloat64_t, ++ z0 = svsub_n_f64_x (p0, z1, 2), ++ z0 = svsub_x (p0, z1, 2)) ++ ++/* ++** ptrue_sub_f64_x_tied1: ++** fsub z0\.d, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_f64_x_tied1, svfloat64_t, ++ z0 = svsub_f64_x (svptrue_b64 (), z0, z1), ++ z0 = svsub_x (svptrue_b64 (), z0, z1)) ++ ++/* ++** ptrue_sub_f64_x_tied2: ++** fsub z0\.d, z1\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_f64_x_tied2, svfloat64_t, ++ z0 = svsub_f64_x (svptrue_b64 (), z1, z0), ++ z0 = svsub_x (svptrue_b64 (), z1, z0)) ++ ++/* ++** ptrue_sub_f64_x_untied: ++** fsub z0\.d, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_f64_x_untied, svfloat64_t, ++ z0 = svsub_f64_x (svptrue_b64 (), z1, z2), ++ z0 = svsub_x (svptrue_b64 (), z1, z2)) ++ ++/* ++** ptrue_sub_1_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_1_f64_x_tied1, svfloat64_t, ++ z0 = svsub_n_f64_x (svptrue_b64 (), z0, 1), ++ z0 = svsub_x (svptrue_b64 (), z0, 1)) ++ ++/* ++** ptrue_sub_1_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_1_f64_x_untied, svfloat64_t, ++ z0 = svsub_n_f64_x (svptrue_b64 (), z1, 1), ++ z0 = svsub_x (svptrue_b64 (), z1, 1)) ++ ++/* ++** ptrue_sub_0p5_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_0p5_f64_x_tied1, svfloat64_t, ++ z0 = svsub_n_f64_x (svptrue_b64 (), z0, 0.5), ++ z0 = svsub_x (svptrue_b64 (), z0, 0.5)) ++ ++/* ++** ptrue_sub_0p5_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_0p5_f64_x_untied, svfloat64_t, ++ z0 = svsub_n_f64_x (svptrue_b64 (), z1, 0.5), ++ z0 = svsub_x (svptrue_b64 (), z1, 0.5)) ++ ++/* ++** ptrue_sub_m1_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_m1_f64_x_tied1, svfloat64_t, ++ z0 = svsub_n_f64_x (svptrue_b64 (), z0, -1), ++ z0 = svsub_x (svptrue_b64 (), z0, -1)) ++ ++/* ++** ptrue_sub_m1_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_m1_f64_x_untied, svfloat64_t, ++ z0 = svsub_n_f64_x (svptrue_b64 (), z1, -1), ++ z0 = svsub_x (svptrue_b64 (), z1, -1)) ++ ++/* ++** ptrue_sub_m0p5_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_m0p5_f64_x_tied1, svfloat64_t, ++ z0 = svsub_n_f64_x (svptrue_b64 (), z0, -0.5), ++ z0 = svsub_x (svptrue_b64 (), z0, -0.5)) ++ ++/* ++** ptrue_sub_m0p5_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_m0p5_f64_x_untied, svfloat64_t, ++ z0 = svsub_n_f64_x (svptrue_b64 (), z1, -0.5), ++ z0 = svsub_x (svptrue_b64 (), z1, -0.5)) ++ ++/* ++** ptrue_sub_2_f64_x_tied1: ++** fmov (z[0-9]+\.d), #-2\.0(?:e\+0)? ++** fadd z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_2_f64_x_tied1, svfloat64_t, ++ z0 = svsub_n_f64_x (svptrue_b64 (), z0, 2), ++ z0 = svsub_x (svptrue_b64 (), z0, 2)) ++ ++/* ++** ptrue_sub_2_f64_x_untied: ++** fmov (z[0-9]+\.d), #-2\.0(?:e\+0)? 
++** fadd z0\.d, (z1\.d, \1|\1, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_2_f64_x_untied, svfloat64_t, ++ z0 = svsub_n_f64_x (svptrue_b64 (), z1, 2), ++ z0 = svsub_x (svptrue_b64 (), z1, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_f64_notrap.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_f64_notrap.c +new file mode 100644 +index 000000000..bd89f44b4 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_f64_notrap.c +@@ -0,0 +1,572 @@ ++/* { dg-additional-options "-fno-trapping-math" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** sub_f64_m_tied1: ++** fsub z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f64_m_tied1, svfloat64_t, ++ z0 = svsub_f64_m (p0, z0, z1), ++ z0 = svsub_m (p0, z0, z1)) ++ ++/* ++** sub_f64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fsub z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f64_m_tied2, svfloat64_t, ++ z0 = svsub_f64_m (p0, z1, z0), ++ z0 = svsub_m (p0, z1, z0)) ++ ++/* ++** sub_f64_m_untied: ++** movprfx z0, z1 ++** fsub z0\.d, p0/m, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f64_m_untied, svfloat64_t, ++ z0 = svsub_f64_m (p0, z1, z2), ++ z0 = svsub_m (p0, z1, z2)) ++ ++/* ++** sub_d4_f64_m_tied1: ++** mov (z[0-9]+\.d), d4 ++** fsub z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (sub_d4_f64_m_tied1, svfloat64_t, double, ++ z0 = svsub_n_f64_m (p0, z0, d4), ++ z0 = svsub_m (p0, z0, d4)) ++ ++/* ++** sub_d4_f64_m_untied: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0, z1 ++** fsub z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (sub_d4_f64_m_untied, svfloat64_t, double, ++ z0 = svsub_n_f64_m (p0, z1, d4), ++ z0 = svsub_m (p0, z1, d4)) ++ ++/* ++** sub_1_f64_m_tied1: ++** fsub z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_f64_m_tied1, svfloat64_t, ++ z0 = svsub_n_f64_m (p0, z0, 1), ++ z0 = svsub_m (p0, z0, 1)) ++ ++/* ++** sub_1_f64_m_untied: ++** movprfx z0, z1 ++** fsub z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_f64_m_untied, svfloat64_t, ++ z0 = svsub_n_f64_m (p0, z1, 1), ++ z0 = svsub_m (p0, z1, 1)) ++ ++/* ++** sub_0p5_f64_m_tied1: ++** fsub z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_0p5_f64_m_tied1, svfloat64_t, ++ z0 = svsub_n_f64_m (p0, z0, 0.5), ++ z0 = svsub_m (p0, z0, 0.5)) ++ ++/* ++** sub_0p5_f64_m_untied: ++** movprfx z0, z1 ++** fsub z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_0p5_f64_m_untied, svfloat64_t, ++ z0 = svsub_n_f64_m (p0, z1, 0.5), ++ z0 = svsub_m (p0, z1, 0.5)) ++ ++/* ++** sub_m1_f64_m_tied1: ++** fadd z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m1_f64_m_tied1, svfloat64_t, ++ z0 = svsub_n_f64_m (p0, z0, -1), ++ z0 = svsub_m (p0, z0, -1)) ++ ++/* ++** sub_m1_f64_m_untied: ++** movprfx z0, z1 ++** fadd z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m1_f64_m_untied, svfloat64_t, ++ z0 = svsub_n_f64_m (p0, z1, -1), ++ z0 = svsub_m (p0, z1, -1)) ++ ++/* ++** sub_m0p5_f64_m_tied1: ++** fadd z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m0p5_f64_m_tied1, svfloat64_t, ++ z0 = svsub_n_f64_m (p0, z0, -0.5), ++ z0 = svsub_m (p0, z0, -0.5)) ++ ++/* ++** sub_m0p5_f64_m_untied: ++** movprfx z0, z1 ++** fadd z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m0p5_f64_m_untied, svfloat64_t, ++ z0 = svsub_n_f64_m (p0, z1, -0.5), ++ z0 = svsub_m (p0, z1, -0.5)) ++ ++/* ++** sub_m2_f64_m: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
++** fadd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m2_f64_m, svfloat64_t, ++ z0 = svsub_n_f64_m (p0, z0, -2), ++ z0 = svsub_m (p0, z0, -2)) ++ ++/* ++** sub_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fsub z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f64_z_tied1, svfloat64_t, ++ z0 = svsub_f64_z (p0, z0, z1), ++ z0 = svsub_z (p0, z0, z1)) ++ ++/* ++** sub_f64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** fsubr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f64_z_tied2, svfloat64_t, ++ z0 = svsub_f64_z (p0, z1, z0), ++ z0 = svsub_z (p0, z1, z0)) ++ ++/* ++** sub_f64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fsub z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** fsubr z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f64_z_untied, svfloat64_t, ++ z0 = svsub_f64_z (p0, z1, z2), ++ z0 = svsub_z (p0, z1, z2)) ++ ++/* ++** sub_d4_f64_z_tied1: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0\.d, p0/z, z0\.d ++** fsub z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (sub_d4_f64_z_tied1, svfloat64_t, double, ++ z0 = svsub_n_f64_z (p0, z0, d4), ++ z0 = svsub_z (p0, z0, d4)) ++ ++/* ++** sub_d4_f64_z_untied: ++** mov (z[0-9]+\.d), d4 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fsub z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** fsubr z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (sub_d4_f64_z_untied, svfloat64_t, double, ++ z0 = svsub_n_f64_z (p0, z1, d4), ++ z0 = svsub_z (p0, z1, d4)) ++ ++/* ++** sub_1_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fsub z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_f64_z_tied1, svfloat64_t, ++ z0 = svsub_n_f64_z (p0, z0, 1), ++ z0 = svsub_z (p0, z0, 1)) ++ ++/* ++** sub_1_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** fsub z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_f64_z_untied, svfloat64_t, ++ z0 = svsub_n_f64_z (p0, z1, 1), ++ z0 = svsub_z (p0, z1, 1)) ++ ++/* ++** sub_0p5_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fsub z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_0p5_f64_z_tied1, svfloat64_t, ++ z0 = svsub_n_f64_z (p0, z0, 0.5), ++ z0 = svsub_z (p0, z0, 0.5)) ++ ++/* ++** sub_0p5_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** fsub z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_0p5_f64_z_untied, svfloat64_t, ++ z0 = svsub_n_f64_z (p0, z1, 0.5), ++ z0 = svsub_z (p0, z1, 0.5)) ++ ++/* ++** sub_m1_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fadd z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m1_f64_z_tied1, svfloat64_t, ++ z0 = svsub_n_f64_z (p0, z0, -1), ++ z0 = svsub_z (p0, z0, -1)) ++ ++/* ++** sub_m1_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** fadd z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m1_f64_z_untied, svfloat64_t, ++ z0 = svsub_n_f64_z (p0, z1, -1), ++ z0 = svsub_z (p0, z1, -1)) ++ ++/* ++** sub_m0p5_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fadd z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m0p5_f64_z_tied1, svfloat64_t, ++ z0 = svsub_n_f64_z (p0, z0, -0.5), ++ z0 = svsub_z (p0, z0, -0.5)) ++ ++/* ++** sub_m0p5_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** fadd z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m0p5_f64_z_untied, svfloat64_t, ++ z0 = svsub_n_f64_z (p0, z1, -0.5), ++ z0 = svsub_z (p0, z1, -0.5)) ++ ++/* ++** sub_m2_f64_z: ++** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
++** movprfx z0\.d, p0/z, z0\.d ++** fadd z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m2_f64_z, svfloat64_t, ++ z0 = svsub_n_f64_z (p0, z0, -2), ++ z0 = svsub_z (p0, z0, -2)) ++ ++/* ++** sub_f64_x_tied1: ++** fsub z0\.d, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f64_x_tied1, svfloat64_t, ++ z0 = svsub_f64_x (p0, z0, z1), ++ z0 = svsub_x (p0, z0, z1)) ++ ++/* ++** sub_f64_x_tied2: ++** fsub z0\.d, z1\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f64_x_tied2, svfloat64_t, ++ z0 = svsub_f64_x (p0, z1, z0), ++ z0 = svsub_x (p0, z1, z0)) ++ ++/* ++** sub_f64_x_untied: ++** fsub z0\.d, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (sub_f64_x_untied, svfloat64_t, ++ z0 = svsub_f64_x (p0, z1, z2), ++ z0 = svsub_x (p0, z1, z2)) ++ ++/* ++** sub_d4_f64_x_tied1: ++** mov (z[0-9]+\.d), d4 ++** fsub z0\.d, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (sub_d4_f64_x_tied1, svfloat64_t, double, ++ z0 = svsub_n_f64_x (p0, z0, d4), ++ z0 = svsub_x (p0, z0, d4)) ++ ++/* ++** sub_d4_f64_x_untied: ++** mov (z[0-9]+\.d), d4 ++** fsub z0\.d, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (sub_d4_f64_x_untied, svfloat64_t, double, ++ z0 = svsub_n_f64_x (p0, z1, d4), ++ z0 = svsub_x (p0, z1, d4)) ++ ++/* ++** sub_1_f64_x_tied1: ++** fsub z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_f64_x_tied1, svfloat64_t, ++ z0 = svsub_n_f64_x (p0, z0, 1), ++ z0 = svsub_x (p0, z0, 1)) ++ ++/* ++** sub_1_f64_x_untied: ++** movprfx z0, z1 ++** fsub z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_f64_x_untied, svfloat64_t, ++ z0 = svsub_n_f64_x (p0, z1, 1), ++ z0 = svsub_x (p0, z1, 1)) ++ ++/* ++** sub_0p5_f64_x_tied1: ++** fsub z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_0p5_f64_x_tied1, svfloat64_t, ++ z0 = svsub_n_f64_x (p0, z0, 0.5), ++ z0 = svsub_x (p0, z0, 0.5)) ++ ++/* ++** sub_0p5_f64_x_untied: ++** movprfx z0, z1 ++** fsub z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_0p5_f64_x_untied, svfloat64_t, ++ z0 = svsub_n_f64_x (p0, z1, 0.5), ++ z0 = svsub_x (p0, z1, 0.5)) ++ ++/* ++** sub_m1_f64_x_tied1: ++** fadd z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m1_f64_x_tied1, svfloat64_t, ++ z0 = svsub_n_f64_x (p0, z0, -1), ++ z0 = svsub_x (p0, z0, -1)) ++ ++/* ++** sub_m1_f64_x_untied: ++** movprfx z0, z1 ++** fadd z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m1_f64_x_untied, svfloat64_t, ++ z0 = svsub_n_f64_x (p0, z1, -1), ++ z0 = svsub_x (p0, z1, -1)) ++ ++/* ++** sub_m0p5_f64_x_tied1: ++** fadd z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m0p5_f64_x_tied1, svfloat64_t, ++ z0 = svsub_n_f64_x (p0, z0, -0.5), ++ z0 = svsub_x (p0, z0, -0.5)) ++ ++/* ++** sub_m0p5_f64_x_untied: ++** movprfx z0, z1 ++** fadd z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m0p5_f64_x_untied, svfloat64_t, ++ z0 = svsub_n_f64_x (p0, z1, -0.5), ++ z0 = svsub_x (p0, z1, -0.5)) ++ ++/* ++** sub_2_f64_x_tied1: ++** fmov (z[0-9]+\.d), #-2\.0(?:e\+0)? ++** fadd z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_2_f64_x_tied1, svfloat64_t, ++ z0 = svsub_n_f64_x (p0, z0, 2), ++ z0 = svsub_x (p0, z0, 2)) ++ ++/* ++** sub_2_f64_x_untied: ++** fmov (z[0-9]+\.d), #-2\.0(?:e\+0)? 
++** fadd z0\.d, (z1\.d, \1|\1, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_2_f64_x_untied, svfloat64_t, ++ z0 = svsub_n_f64_x (p0, z1, 2), ++ z0 = svsub_x (p0, z1, 2)) ++ ++/* ++** ptrue_sub_f64_x_tied1: ++** fsub z0\.d, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_f64_x_tied1, svfloat64_t, ++ z0 = svsub_f64_x (svptrue_b64 (), z0, z1), ++ z0 = svsub_x (svptrue_b64 (), z0, z1)) ++ ++/* ++** ptrue_sub_f64_x_tied2: ++** fsub z0\.d, z1\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_f64_x_tied2, svfloat64_t, ++ z0 = svsub_f64_x (svptrue_b64 (), z1, z0), ++ z0 = svsub_x (svptrue_b64 (), z1, z0)) ++ ++/* ++** ptrue_sub_f64_x_untied: ++** fsub z0\.d, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_f64_x_untied, svfloat64_t, ++ z0 = svsub_f64_x (svptrue_b64 (), z1, z2), ++ z0 = svsub_x (svptrue_b64 (), z1, z2)) ++ ++/* ++** ptrue_sub_1_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_1_f64_x_tied1, svfloat64_t, ++ z0 = svsub_n_f64_x (svptrue_b64 (), z0, 1), ++ z0 = svsub_x (svptrue_b64 (), z0, 1)) ++ ++/* ++** ptrue_sub_1_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_1_f64_x_untied, svfloat64_t, ++ z0 = svsub_n_f64_x (svptrue_b64 (), z1, 1), ++ z0 = svsub_x (svptrue_b64 (), z1, 1)) ++ ++/* ++** ptrue_sub_0p5_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_0p5_f64_x_tied1, svfloat64_t, ++ z0 = svsub_n_f64_x (svptrue_b64 (), z0, 0.5), ++ z0 = svsub_x (svptrue_b64 (), z0, 0.5)) ++ ++/* ++** ptrue_sub_0p5_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_0p5_f64_x_untied, svfloat64_t, ++ z0 = svsub_n_f64_x (svptrue_b64 (), z1, 0.5), ++ z0 = svsub_x (svptrue_b64 (), z1, 0.5)) ++ ++/* ++** ptrue_sub_m1_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_m1_f64_x_tied1, svfloat64_t, ++ z0 = svsub_n_f64_x (svptrue_b64 (), z0, -1), ++ z0 = svsub_x (svptrue_b64 (), z0, -1)) ++ ++/* ++** ptrue_sub_m1_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_m1_f64_x_untied, svfloat64_t, ++ z0 = svsub_n_f64_x (svptrue_b64 (), z1, -1), ++ z0 = svsub_x (svptrue_b64 (), z1, -1)) ++ ++/* ++** ptrue_sub_m0p5_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_m0p5_f64_x_tied1, svfloat64_t, ++ z0 = svsub_n_f64_x (svptrue_b64 (), z0, -0.5), ++ z0 = svsub_x (svptrue_b64 (), z0, -0.5)) ++ ++/* ++** ptrue_sub_m0p5_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_m0p5_f64_x_untied, svfloat64_t, ++ z0 = svsub_n_f64_x (svptrue_b64 (), z1, -0.5), ++ z0 = svsub_x (svptrue_b64 (), z1, -0.5)) ++ ++/* ++** ptrue_sub_2_f64_x_tied1: ++** fmov (z[0-9]+\.d), #-2\.0(?:e\+0)? ++** fadd z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_2_f64_x_tied1, svfloat64_t, ++ z0 = svsub_n_f64_x (svptrue_b64 (), z0, 2), ++ z0 = svsub_x (svptrue_b64 (), z0, 2)) ++ ++/* ++** ptrue_sub_2_f64_x_untied: ++** fmov (z[0-9]+\.d), #-2\.0(?:e\+0)? 
++** fadd z0\.d, (z1\.d, \1|\1, z1\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_sub_2_f64_x_untied, svfloat64_t, ++ z0 = svsub_n_f64_x (svptrue_b64 (), z1, 2), ++ z0 = svsub_x (svptrue_b64 (), z1, 2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_s16.c +new file mode 100644 +index 000000000..aea8ea2b4 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_s16.c +@@ -0,0 +1,377 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** sub_s16_m_tied1: ++** sub z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sub_s16_m_tied1, svint16_t, ++ z0 = svsub_s16_m (p0, z0, z1), ++ z0 = svsub_m (p0, z0, z1)) ++ ++/* ++** sub_s16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** sub z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sub_s16_m_tied2, svint16_t, ++ z0 = svsub_s16_m (p0, z1, z0), ++ z0 = svsub_m (p0, z1, z0)) ++ ++/* ++** sub_s16_m_untied: ++** movprfx z0, z1 ++** sub z0\.h, p0/m, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sub_s16_m_untied, svint16_t, ++ z0 = svsub_s16_m (p0, z1, z2), ++ z0 = svsub_m (p0, z1, z2)) ++ ++/* ++** sub_w0_s16_m_tied1: ++** mov (z[0-9]+\.h), w0 ++** sub z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (sub_w0_s16_m_tied1, svint16_t, int16_t, ++ z0 = svsub_n_s16_m (p0, z0, x0), ++ z0 = svsub_m (p0, z0, x0)) ++ ++/* ++** sub_w0_s16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), w0 ++** movprfx z0, z1 ++** sub z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (sub_w0_s16_m_untied, svint16_t, int16_t, ++ z0 = svsub_n_s16_m (p0, z1, x0), ++ z0 = svsub_m (p0, z1, x0)) ++ ++/* ++** sub_1_s16_m_tied1: ++** mov (z[0-9]+)\.b, #-1 ++** add z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_s16_m_tied1, svint16_t, ++ z0 = svsub_n_s16_m (p0, z0, 1), ++ z0 = svsub_m (p0, z0, 1)) ++ ++/* ++** sub_1_s16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+)\.b, #-1 ++** movprfx z0, z1 ++** add z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_s16_m_untied, svint16_t, ++ z0 = svsub_n_s16_m (p0, z1, 1), ++ z0 = svsub_m (p0, z1, 1)) ++ ++/* ++** sub_m2_s16_m: ++** mov (z[0-9]+\.h), #2 ++** add z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m2_s16_m, svint16_t, ++ z0 = svsub_n_s16_m (p0, z0, -2), ++ z0 = svsub_m (p0, z0, -2)) ++ ++/* ++** sub_s16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** sub z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sub_s16_z_tied1, svint16_t, ++ z0 = svsub_s16_z (p0, z0, z1), ++ z0 = svsub_z (p0, z0, z1)) ++ ++/* ++** sub_s16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** subr z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sub_s16_z_tied2, svint16_t, ++ z0 = svsub_s16_z (p0, z1, z0), ++ z0 = svsub_z (p0, z1, z0)) ++ ++/* ++** sub_s16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** sub z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** subr z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_s16_z_untied, svint16_t, ++ z0 = svsub_s16_z (p0, z1, z2), ++ z0 = svsub_z (p0, z1, z2)) ++ ++/* ++** sub_w0_s16_z_tied1: ++** mov (z[0-9]+\.h), w0 ++** movprfx z0\.h, p0/z, z0\.h ++** sub z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (sub_w0_s16_z_tied1, svint16_t, int16_t, ++ z0 = svsub_n_s16_z (p0, z0, x0), ++ z0 = svsub_z (p0, z0, x0)) ++ ++/* ++** sub_w0_s16_z_untied: ++** mov (z[0-9]+\.h), w0 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** sub z0\.h, p0/m, z0\.h, \1 ++** | 
++** movprfx z0\.h, p0/z, \1 ++** subr z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (sub_w0_s16_z_untied, svint16_t, int16_t, ++ z0 = svsub_n_s16_z (p0, z1, x0), ++ z0 = svsub_z (p0, z1, x0)) ++ ++/* ++** sub_1_s16_z_tied1: ++** mov (z[0-9]+)\.b, #-1 ++** movprfx z0\.h, p0/z, z0\.h ++** add z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_s16_z_tied1, svint16_t, ++ z0 = svsub_n_s16_z (p0, z0, 1), ++ z0 = svsub_z (p0, z0, 1)) ++ ++/* ++** sub_1_s16_z_untied: ++** mov (z[0-9]+)\.b, #-1 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** add z0\.h, p0/m, z0\.h, \1\.h ++** | ++** movprfx z0\.h, p0/z, \1\.h ++** add z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_s16_z_untied, svint16_t, ++ z0 = svsub_n_s16_z (p0, z1, 1), ++ z0 = svsub_z (p0, z1, 1)) ++ ++/* ++** sub_s16_x_tied1: ++** sub z0\.h, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sub_s16_x_tied1, svint16_t, ++ z0 = svsub_s16_x (p0, z0, z1), ++ z0 = svsub_x (p0, z0, z1)) ++ ++/* ++** sub_s16_x_tied2: ++** sub z0\.h, z1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sub_s16_x_tied2, svint16_t, ++ z0 = svsub_s16_x (p0, z1, z0), ++ z0 = svsub_x (p0, z1, z0)) ++ ++/* ++** sub_s16_x_untied: ++** sub z0\.h, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sub_s16_x_untied, svint16_t, ++ z0 = svsub_s16_x (p0, z1, z2), ++ z0 = svsub_x (p0, z1, z2)) ++ ++/* ++** sub_w0_s16_x_tied1: ++** mov (z[0-9]+\.h), w0 ++** sub z0\.h, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (sub_w0_s16_x_tied1, svint16_t, int16_t, ++ z0 = svsub_n_s16_x (p0, z0, x0), ++ z0 = svsub_x (p0, z0, x0)) ++ ++/* ++** sub_w0_s16_x_untied: ++** mov (z[0-9]+\.h), w0 ++** sub z0\.h, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (sub_w0_s16_x_untied, svint16_t, int16_t, ++ z0 = svsub_n_s16_x (p0, z1, x0), ++ z0 = svsub_x (p0, z1, x0)) ++ ++/* ++** sub_1_s16_x_tied1: ++** sub z0\.h, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_s16_x_tied1, svint16_t, ++ z0 = svsub_n_s16_x (p0, z0, 1), ++ z0 = svsub_x (p0, z0, 1)) ++ ++/* ++** sub_1_s16_x_untied: ++** movprfx z0, z1 ++** sub z0\.h, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_s16_x_untied, svint16_t, ++ z0 = svsub_n_s16_x (p0, z1, 1), ++ z0 = svsub_x (p0, z1, 1)) ++ ++/* ++** sub_127_s16_x: ++** sub z0\.h, z0\.h, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_127_s16_x, svint16_t, ++ z0 = svsub_n_s16_x (p0, z0, 127), ++ z0 = svsub_x (p0, z0, 127)) ++ ++/* ++** sub_128_s16_x: ++** sub z0\.h, z0\.h, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_128_s16_x, svint16_t, ++ z0 = svsub_n_s16_x (p0, z0, 128), ++ z0 = svsub_x (p0, z0, 128)) ++ ++/* ++** sub_255_s16_x: ++** sub z0\.h, z0\.h, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_255_s16_x, svint16_t, ++ z0 = svsub_n_s16_x (p0, z0, 255), ++ z0 = svsub_x (p0, z0, 255)) ++ ++/* ++** sub_256_s16_x: ++** add z0\.h, z0\.h, #65280 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_256_s16_x, svint16_t, ++ z0 = svsub_n_s16_x (p0, z0, 256), ++ z0 = svsub_x (p0, z0, 256)) ++ ++/* ++** sub_257_s16_x: ++** mov (z[0-9]+\.h), #-257 ++** add z0\.h, (z0\.h, \1|\1, z0\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_257_s16_x, svint16_t, ++ z0 = svsub_n_s16_x (p0, z0, 257), ++ z0 = svsub_x (p0, z0, 257)) ++ ++/* ++** sub_512_s16_x: ++** add z0\.h, z0\.h, #65024 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_512_s16_x, svint16_t, ++ z0 = svsub_n_s16_x (p0, z0, 512), ++ z0 = svsub_x (p0, z0, 512)) ++ ++/* ++** sub_65280_s16_x: ++** add z0\.h, z0\.h, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_65280_s16_x, svint16_t, ++ z0 = svsub_n_s16_x (p0, z0, 0xff00), ++ z0 = svsub_x (p0, z0, 0xff00)) ++ ++/* ++** sub_m1_s16_x: 
++** add z0\.h, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m1_s16_x, svint16_t, ++ z0 = svsub_n_s16_x (p0, z0, -1), ++ z0 = svsub_x (p0, z0, -1)) ++ ++/* ++** sub_m127_s16_x: ++** add z0\.h, z0\.h, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m127_s16_x, svint16_t, ++ z0 = svsub_n_s16_x (p0, z0, -127), ++ z0 = svsub_x (p0, z0, -127)) ++ ++/* ++** sub_m128_s16_x: ++** add z0\.h, z0\.h, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m128_s16_x, svint16_t, ++ z0 = svsub_n_s16_x (p0, z0, -128), ++ z0 = svsub_x (p0, z0, -128)) ++ ++/* ++** sub_m255_s16_x: ++** add z0\.h, z0\.h, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m255_s16_x, svint16_t, ++ z0 = svsub_n_s16_x (p0, z0, -255), ++ z0 = svsub_x (p0, z0, -255)) ++ ++/* ++** sub_m256_s16_x: ++** add z0\.h, z0\.h, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m256_s16_x, svint16_t, ++ z0 = svsub_n_s16_x (p0, z0, -256), ++ z0 = svsub_x (p0, z0, -256)) ++ ++/* ++** sub_m257_s16_x: ++** mov (z[0-9]+)\.b, #1 ++** add z0\.h, (z0\.h, \1\.h|\1\.h, z0\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m257_s16_x, svint16_t, ++ z0 = svsub_n_s16_x (p0, z0, -257), ++ z0 = svsub_x (p0, z0, -257)) ++ ++/* ++** sub_m512_s16_x: ++** add z0\.h, z0\.h, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m512_s16_x, svint16_t, ++ z0 = svsub_n_s16_x (p0, z0, -512), ++ z0 = svsub_x (p0, z0, -512)) ++ ++/* ++** sub_m32768_s16_x: ++** add z0\.h, z0\.h, #32768 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m32768_s16_x, svint16_t, ++ z0 = svsub_n_s16_x (p0, z0, -0x8000), ++ z0 = svsub_x (p0, z0, -0x8000)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_s32.c +new file mode 100644 +index 000000000..db6f3df90 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_s32.c +@@ -0,0 +1,426 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** sub_s32_m_tied1: ++** sub z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sub_s32_m_tied1, svint32_t, ++ z0 = svsub_s32_m (p0, z0, z1), ++ z0 = svsub_m (p0, z0, z1)) ++ ++/* ++** sub_s32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** sub z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sub_s32_m_tied2, svint32_t, ++ z0 = svsub_s32_m (p0, z1, z0), ++ z0 = svsub_m (p0, z1, z0)) ++ ++/* ++** sub_s32_m_untied: ++** movprfx z0, z1 ++** sub z0\.s, p0/m, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sub_s32_m_untied, svint32_t, ++ z0 = svsub_s32_m (p0, z1, z2), ++ z0 = svsub_m (p0, z1, z2)) ++ ++/* ++** sub_w0_s32_m_tied1: ++** mov (z[0-9]+\.s), w0 ++** sub z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (sub_w0_s32_m_tied1, svint32_t, int32_t, ++ z0 = svsub_n_s32_m (p0, z0, x0), ++ z0 = svsub_m (p0, z0, x0)) ++ ++/* ++** sub_w0_s32_m_untied: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0, z1 ++** sub z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (sub_w0_s32_m_untied, svint32_t, int32_t, ++ z0 = svsub_n_s32_m (p0, z1, x0), ++ z0 = svsub_m (p0, z1, x0)) ++ ++/* ++** sub_1_s32_m_tied1: ++** mov (z[0-9]+)\.b, #-1 ++** add z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_s32_m_tied1, svint32_t, ++ z0 = svsub_n_s32_m (p0, z0, 1), ++ z0 = svsub_m (p0, z0, 1)) ++ ++/* ++** sub_1_s32_m_untied: { xfail *-*-* } ++** mov (z[0-9]+)\.b, #-1 ++** movprfx z0, z1 ++** add z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_s32_m_untied, svint32_t, ++ z0 = svsub_n_s32_m (p0, z1, 1), ++ z0 = svsub_m (p0, z1, 1)) ++ ++/* ++** sub_m2_s32_m: ++** mov (z[0-9]+\.s), #2 ++** add z0\.s, p0/m, 
z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m2_s32_m, svint32_t, ++ z0 = svsub_n_s32_m (p0, z0, -2), ++ z0 = svsub_m (p0, z0, -2)) ++ ++/* ++** sub_s32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** sub z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sub_s32_z_tied1, svint32_t, ++ z0 = svsub_s32_z (p0, z0, z1), ++ z0 = svsub_z (p0, z0, z1)) ++ ++/* ++** sub_s32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** subr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sub_s32_z_tied2, svint32_t, ++ z0 = svsub_s32_z (p0, z1, z0), ++ z0 = svsub_z (p0, z1, z0)) ++ ++/* ++** sub_s32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** sub z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** subr z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_s32_z_untied, svint32_t, ++ z0 = svsub_s32_z (p0, z1, z2), ++ z0 = svsub_z (p0, z1, z2)) ++ ++/* ++** sub_w0_s32_z_tied1: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0\.s, p0/z, z0\.s ++** sub z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (sub_w0_s32_z_tied1, svint32_t, int32_t, ++ z0 = svsub_n_s32_z (p0, z0, x0), ++ z0 = svsub_z (p0, z0, x0)) ++ ++/* ++** sub_w0_s32_z_untied: ++** mov (z[0-9]+\.s), w0 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** sub z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** subr z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (sub_w0_s32_z_untied, svint32_t, int32_t, ++ z0 = svsub_n_s32_z (p0, z1, x0), ++ z0 = svsub_z (p0, z1, x0)) ++ ++/* ++** sub_1_s32_z_tied1: ++** mov (z[0-9]+)\.b, #-1 ++** movprfx z0\.s, p0/z, z0\.s ++** add z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_s32_z_tied1, svint32_t, ++ z0 = svsub_n_s32_z (p0, z0, 1), ++ z0 = svsub_z (p0, z0, 1)) ++ ++/* ++** sub_1_s32_z_untied: ++** mov (z[0-9]+)\.b, #-1 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** add z0\.s, p0/m, z0\.s, \1\.s ++** | ++** movprfx z0\.s, p0/z, \1\.s ++** add z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_s32_z_untied, svint32_t, ++ z0 = svsub_n_s32_z (p0, z1, 1), ++ z0 = svsub_z (p0, z1, 1)) ++ ++/* ++** sub_s32_x_tied1: ++** sub z0\.s, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sub_s32_x_tied1, svint32_t, ++ z0 = svsub_s32_x (p0, z0, z1), ++ z0 = svsub_x (p0, z0, z1)) ++ ++/* ++** sub_s32_x_tied2: ++** sub z0\.s, z1\.s, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sub_s32_x_tied2, svint32_t, ++ z0 = svsub_s32_x (p0, z1, z0), ++ z0 = svsub_x (p0, z1, z0)) ++ ++/* ++** sub_s32_x_untied: ++** sub z0\.s, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sub_s32_x_untied, svint32_t, ++ z0 = svsub_s32_x (p0, z1, z2), ++ z0 = svsub_x (p0, z1, z2)) ++ ++/* ++** sub_w0_s32_x_tied1: ++** mov (z[0-9]+\.s), w0 ++** sub z0\.s, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (sub_w0_s32_x_tied1, svint32_t, int32_t, ++ z0 = svsub_n_s32_x (p0, z0, x0), ++ z0 = svsub_x (p0, z0, x0)) ++ ++/* ++** sub_w0_s32_x_untied: ++** mov (z[0-9]+\.s), w0 ++** sub z0\.s, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (sub_w0_s32_x_untied, svint32_t, int32_t, ++ z0 = svsub_n_s32_x (p0, z1, x0), ++ z0 = svsub_x (p0, z1, x0)) ++ ++/* ++** sub_1_s32_x_tied1: ++** sub z0\.s, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_s32_x_tied1, svint32_t, ++ z0 = svsub_n_s32_x (p0, z0, 1), ++ z0 = svsub_x (p0, z0, 1)) ++ ++/* ++** sub_1_s32_x_untied: ++** movprfx z0, z1 ++** sub z0\.s, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_s32_x_untied, svint32_t, ++ z0 = svsub_n_s32_x (p0, z1, 1), ++ z0 = svsub_x (p0, z1, 1)) ++ ++/* ++** sub_127_s32_x: ++** sub z0\.s, z0\.s, #127 ++** ret ++*/ 
++TEST_UNIFORM_Z (sub_127_s32_x, svint32_t, ++ z0 = svsub_n_s32_x (p0, z0, 127), ++ z0 = svsub_x (p0, z0, 127)) ++ ++/* ++** sub_128_s32_x: ++** sub z0\.s, z0\.s, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_128_s32_x, svint32_t, ++ z0 = svsub_n_s32_x (p0, z0, 128), ++ z0 = svsub_x (p0, z0, 128)) ++ ++/* ++** sub_255_s32_x: ++** sub z0\.s, z0\.s, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_255_s32_x, svint32_t, ++ z0 = svsub_n_s32_x (p0, z0, 255), ++ z0 = svsub_x (p0, z0, 255)) ++ ++/* ++** sub_256_s32_x: ++** sub z0\.s, z0\.s, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_256_s32_x, svint32_t, ++ z0 = svsub_n_s32_x (p0, z0, 256), ++ z0 = svsub_x (p0, z0, 256)) ++ ++/* ++** sub_511_s32_x: ++** mov (z[0-9]+\.s), #-511 ++** add z0\.s, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_511_s32_x, svint32_t, ++ z0 = svsub_n_s32_x (p0, z0, 511), ++ z0 = svsub_x (p0, z0, 511)) ++ ++/* ++** sub_512_s32_x: ++** sub z0\.s, z0\.s, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_512_s32_x, svint32_t, ++ z0 = svsub_n_s32_x (p0, z0, 512), ++ z0 = svsub_x (p0, z0, 512)) ++ ++/* ++** sub_65280_s32_x: ++** sub z0\.s, z0\.s, #65280 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_65280_s32_x, svint32_t, ++ z0 = svsub_n_s32_x (p0, z0, 0xff00), ++ z0 = svsub_x (p0, z0, 0xff00)) ++ ++/* ++** sub_65535_s32_x: ++** mov (z[0-9]+\.s), #-65535 ++** add z0\.s, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_65535_s32_x, svint32_t, ++ z0 = svsub_n_s32_x (p0, z0, 65535), ++ z0 = svsub_x (p0, z0, 65535)) ++ ++/* ++** sub_65536_s32_x: ++** mov (z[0-9]+\.s), #-65536 ++** add z0\.s, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_65536_s32_x, svint32_t, ++ z0 = svsub_n_s32_x (p0, z0, 65536), ++ z0 = svsub_x (p0, z0, 65536)) ++ ++/* ++** sub_m1_s32_x: ++** add z0\.s, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m1_s32_x, svint32_t, ++ z0 = svsub_n_s32_x (p0, z0, -1), ++ z0 = svsub_x (p0, z0, -1)) ++ ++/* ++** sub_m127_s32_x: ++** add z0\.s, z0\.s, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m127_s32_x, svint32_t, ++ z0 = svsub_n_s32_x (p0, z0, -127), ++ z0 = svsub_x (p0, z0, -127)) ++ ++/* ++** sub_m128_s32_x: ++** add z0\.s, z0\.s, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m128_s32_x, svint32_t, ++ z0 = svsub_n_s32_x (p0, z0, -128), ++ z0 = svsub_x (p0, z0, -128)) ++ ++/* ++** sub_m255_s32_x: ++** add z0\.s, z0\.s, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m255_s32_x, svint32_t, ++ z0 = svsub_n_s32_x (p0, z0, -255), ++ z0 = svsub_x (p0, z0, -255)) ++ ++/* ++** sub_m256_s32_x: ++** add z0\.s, z0\.s, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m256_s32_x, svint32_t, ++ z0 = svsub_n_s32_x (p0, z0, -256), ++ z0 = svsub_x (p0, z0, -256)) ++ ++/* ++** sub_m511_s32_x: ++** mov (z[0-9]+\.s), #511 ++** add z0\.s, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m511_s32_x, svint32_t, ++ z0 = svsub_n_s32_x (p0, z0, -511), ++ z0 = svsub_x (p0, z0, -511)) ++ ++/* ++** sub_m512_s32_x: ++** add z0\.s, z0\.s, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m512_s32_x, svint32_t, ++ z0 = svsub_n_s32_x (p0, z0, -512), ++ z0 = svsub_x (p0, z0, -512)) ++ ++/* ++** sub_m32768_s32_x: ++** add z0\.s, z0\.s, #32768 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m32768_s32_x, svint32_t, ++ z0 = svsub_n_s32_x (p0, z0, -0x8000), ++ z0 = svsub_x (p0, z0, -0x8000)) ++ ++/* ++** sub_m65280_s32_x: ++** add z0\.s, z0\.s, #65280 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m65280_s32_x, svint32_t, ++ z0 = svsub_n_s32_x (p0, z0, -0xff00), ++ z0 = svsub_x (p0, z0, -0xff00)) ++ ++/* ++** sub_m65535_s32_x: ++** mov (z[0-9]+\.s), #65535 ++** add z0\.s, (z0\.s, \1|\1, z0\.s) ++** ret 
++*/ ++TEST_UNIFORM_Z (sub_m65535_s32_x, svint32_t, ++ z0 = svsub_n_s32_x (p0, z0, -65535), ++ z0 = svsub_x (p0, z0, -65535)) ++ ++/* ++** sub_m65536_s32_x: ++** mov (z[0-9]+\.s), #65536 ++** add z0\.s, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m65536_s32_x, svint32_t, ++ z0 = svsub_n_s32_x (p0, z0, -65536), ++ z0 = svsub_x (p0, z0, -65536)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_s64.c +new file mode 100644 +index 000000000..b9184c3a8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_s64.c +@@ -0,0 +1,426 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** sub_s64_m_tied1: ++** sub z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (sub_s64_m_tied1, svint64_t, ++ z0 = svsub_s64_m (p0, z0, z1), ++ z0 = svsub_m (p0, z0, z1)) ++ ++/* ++** sub_s64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** sub z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_s64_m_tied2, svint64_t, ++ z0 = svsub_s64_m (p0, z1, z0), ++ z0 = svsub_m (p0, z1, z0)) ++ ++/* ++** sub_s64_m_untied: ++** movprfx z0, z1 ++** sub z0\.d, p0/m, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (sub_s64_m_untied, svint64_t, ++ z0 = svsub_s64_m (p0, z1, z2), ++ z0 = svsub_m (p0, z1, z2)) ++ ++/* ++** sub_x0_s64_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** sub z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (sub_x0_s64_m_tied1, svint64_t, int64_t, ++ z0 = svsub_n_s64_m (p0, z0, x0), ++ z0 = svsub_m (p0, z0, x0)) ++ ++/* ++** sub_x0_s64_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** sub z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (sub_x0_s64_m_untied, svint64_t, int64_t, ++ z0 = svsub_n_s64_m (p0, z1, x0), ++ z0 = svsub_m (p0, z1, x0)) ++ ++/* ++** sub_1_s64_m_tied1: ++** mov (z[0-9]+)\.b, #-1 ++** add z0\.d, p0/m, z0\.d, \1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_s64_m_tied1, svint64_t, ++ z0 = svsub_n_s64_m (p0, z0, 1), ++ z0 = svsub_m (p0, z0, 1)) ++ ++/* ++** sub_1_s64_m_untied: { xfail *-*-* } ++** mov (z[0-9]+)\.b, #-1 ++** movprfx z0, z1 ++** add z0\.d, p0/m, z0\.d, \1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_s64_m_untied, svint64_t, ++ z0 = svsub_n_s64_m (p0, z1, 1), ++ z0 = svsub_m (p0, z1, 1)) ++ ++/* ++** sub_m2_s64_m: ++** mov (z[0-9]+\.d), #2 ++** add z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m2_s64_m, svint64_t, ++ z0 = svsub_n_s64_m (p0, z0, -2), ++ z0 = svsub_m (p0, z0, -2)) ++ ++/* ++** sub_s64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** sub z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (sub_s64_z_tied1, svint64_t, ++ z0 = svsub_s64_z (p0, z0, z1), ++ z0 = svsub_z (p0, z0, z1)) ++ ++/* ++** sub_s64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** subr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (sub_s64_z_tied2, svint64_t, ++ z0 = svsub_s64_z (p0, z1, z0), ++ z0 = svsub_z (p0, z1, z0)) ++ ++/* ++** sub_s64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** sub z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** subr z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_s64_z_untied, svint64_t, ++ z0 = svsub_s64_z (p0, z1, z2), ++ z0 = svsub_z (p0, z1, z2)) ++ ++/* ++** sub_x0_s64_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z0\.d ++** sub z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (sub_x0_s64_z_tied1, svint64_t, int64_t, ++ z0 = svsub_n_s64_z (p0, z0, x0), ++ z0 = svsub_z (p0, z0, 
x0)) ++ ++/* ++** sub_x0_s64_z_untied: ++** mov (z[0-9]+\.d), x0 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** sub z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** subr z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (sub_x0_s64_z_untied, svint64_t, int64_t, ++ z0 = svsub_n_s64_z (p0, z1, x0), ++ z0 = svsub_z (p0, z1, x0)) ++ ++/* ++** sub_1_s64_z_tied1: ++** mov (z[0-9]+)\.b, #-1 ++** movprfx z0\.d, p0/z, z0\.d ++** add z0\.d, p0/m, z0\.d, \1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_s64_z_tied1, svint64_t, ++ z0 = svsub_n_s64_z (p0, z0, 1), ++ z0 = svsub_z (p0, z0, 1)) ++ ++/* ++** sub_1_s64_z_untied: ++** mov (z[0-9]+)\.b, #-1 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** add z0\.d, p0/m, z0\.d, \1\.d ++** | ++** movprfx z0\.d, p0/z, \1\.d ++** add z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_s64_z_untied, svint64_t, ++ z0 = svsub_n_s64_z (p0, z1, 1), ++ z0 = svsub_z (p0, z1, 1)) ++ ++/* ++** sub_s64_x_tied1: ++** sub z0\.d, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (sub_s64_x_tied1, svint64_t, ++ z0 = svsub_s64_x (p0, z0, z1), ++ z0 = svsub_x (p0, z0, z1)) ++ ++/* ++** sub_s64_x_tied2: ++** sub z0\.d, z1\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (sub_s64_x_tied2, svint64_t, ++ z0 = svsub_s64_x (p0, z1, z0), ++ z0 = svsub_x (p0, z1, z0)) ++ ++/* ++** sub_s64_x_untied: ++** sub z0\.d, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (sub_s64_x_untied, svint64_t, ++ z0 = svsub_s64_x (p0, z1, z2), ++ z0 = svsub_x (p0, z1, z2)) ++ ++/* ++** sub_x0_s64_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** sub z0\.d, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (sub_x0_s64_x_tied1, svint64_t, int64_t, ++ z0 = svsub_n_s64_x (p0, z0, x0), ++ z0 = svsub_x (p0, z0, x0)) ++ ++/* ++** sub_x0_s64_x_untied: ++** mov (z[0-9]+\.d), x0 ++** sub z0\.d, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (sub_x0_s64_x_untied, svint64_t, int64_t, ++ z0 = svsub_n_s64_x (p0, z1, x0), ++ z0 = svsub_x (p0, z1, x0)) ++ ++/* ++** sub_1_s64_x_tied1: ++** sub z0\.d, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_s64_x_tied1, svint64_t, ++ z0 = svsub_n_s64_x (p0, z0, 1), ++ z0 = svsub_x (p0, z0, 1)) ++ ++/* ++** sub_1_s64_x_untied: ++** movprfx z0, z1 ++** sub z0\.d, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_s64_x_untied, svint64_t, ++ z0 = svsub_n_s64_x (p0, z1, 1), ++ z0 = svsub_x (p0, z1, 1)) ++ ++/* ++** sub_127_s64_x: ++** sub z0\.d, z0\.d, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_127_s64_x, svint64_t, ++ z0 = svsub_n_s64_x (p0, z0, 127), ++ z0 = svsub_x (p0, z0, 127)) ++ ++/* ++** sub_128_s64_x: ++** sub z0\.d, z0\.d, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_128_s64_x, svint64_t, ++ z0 = svsub_n_s64_x (p0, z0, 128), ++ z0 = svsub_x (p0, z0, 128)) ++ ++/* ++** sub_255_s64_x: ++** sub z0\.d, z0\.d, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_255_s64_x, svint64_t, ++ z0 = svsub_n_s64_x (p0, z0, 255), ++ z0 = svsub_x (p0, z0, 255)) ++ ++/* ++** sub_256_s64_x: ++** sub z0\.d, z0\.d, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_256_s64_x, svint64_t, ++ z0 = svsub_n_s64_x (p0, z0, 256), ++ z0 = svsub_x (p0, z0, 256)) ++ ++/* ++** sub_511_s64_x: ++** mov (z[0-9]+\.d), #-511 ++** add z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_511_s64_x, svint64_t, ++ z0 = svsub_n_s64_x (p0, z0, 511), ++ z0 = svsub_x (p0, z0, 511)) ++ ++/* ++** sub_512_s64_x: ++** sub z0\.d, z0\.d, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_512_s64_x, svint64_t, ++ z0 = svsub_n_s64_x (p0, z0, 512), ++ z0 = svsub_x (p0, z0, 512)) ++ ++/* ++** sub_65280_s64_x: ++** sub z0\.d, z0\.d, #65280 ++** ret ++*/ 
++TEST_UNIFORM_Z (sub_65280_s64_x, svint64_t, ++ z0 = svsub_n_s64_x (p0, z0, 0xff00), ++ z0 = svsub_x (p0, z0, 0xff00)) ++ ++/* ++** sub_65535_s64_x: ++** mov (z[0-9]+\.d), #-65535 ++** add z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_65535_s64_x, svint64_t, ++ z0 = svsub_n_s64_x (p0, z0, 65535), ++ z0 = svsub_x (p0, z0, 65535)) ++ ++/* ++** sub_65536_s64_x: ++** mov (z[0-9]+\.d), #-65536 ++** add z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_65536_s64_x, svint64_t, ++ z0 = svsub_n_s64_x (p0, z0, 65536), ++ z0 = svsub_x (p0, z0, 65536)) ++ ++/* ++** sub_m1_s64_x: ++** add z0\.d, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m1_s64_x, svint64_t, ++ z0 = svsub_n_s64_x (p0, z0, -1), ++ z0 = svsub_x (p0, z0, -1)) ++ ++/* ++** sub_m127_s64_x: ++** add z0\.d, z0\.d, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m127_s64_x, svint64_t, ++ z0 = svsub_n_s64_x (p0, z0, -127), ++ z0 = svsub_x (p0, z0, -127)) ++ ++/* ++** sub_m128_s64_x: ++** add z0\.d, z0\.d, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m128_s64_x, svint64_t, ++ z0 = svsub_n_s64_x (p0, z0, -128), ++ z0 = svsub_x (p0, z0, -128)) ++ ++/* ++** sub_m255_s64_x: ++** add z0\.d, z0\.d, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m255_s64_x, svint64_t, ++ z0 = svsub_n_s64_x (p0, z0, -255), ++ z0 = svsub_x (p0, z0, -255)) ++ ++/* ++** sub_m256_s64_x: ++** add z0\.d, z0\.d, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m256_s64_x, svint64_t, ++ z0 = svsub_n_s64_x (p0, z0, -256), ++ z0 = svsub_x (p0, z0, -256)) ++ ++/* ++** sub_m511_s64_x: ++** mov (z[0-9]+\.d), #511 ++** add z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m511_s64_x, svint64_t, ++ z0 = svsub_n_s64_x (p0, z0, -511), ++ z0 = svsub_x (p0, z0, -511)) ++ ++/* ++** sub_m512_s64_x: ++** add z0\.d, z0\.d, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m512_s64_x, svint64_t, ++ z0 = svsub_n_s64_x (p0, z0, -512), ++ z0 = svsub_x (p0, z0, -512)) ++ ++/* ++** sub_m32768_s64_x: ++** add z0\.d, z0\.d, #32768 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m32768_s64_x, svint64_t, ++ z0 = svsub_n_s64_x (p0, z0, -0x8000), ++ z0 = svsub_x (p0, z0, -0x8000)) ++ ++/* ++** sub_m65280_s64_x: ++** add z0\.d, z0\.d, #65280 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m65280_s64_x, svint64_t, ++ z0 = svsub_n_s64_x (p0, z0, -0xff00), ++ z0 = svsub_x (p0, z0, -0xff00)) ++ ++/* ++** sub_m65535_s64_x: ++** mov (z[0-9]+\.d), #65535 ++** add z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m65535_s64_x, svint64_t, ++ z0 = svsub_n_s64_x (p0, z0, -65535), ++ z0 = svsub_x (p0, z0, -65535)) ++ ++/* ++** sub_m65536_s64_x: ++** mov (z[0-9]+\.d), #65536 ++** add z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m65536_s64_x, svint64_t, ++ z0 = svsub_n_s64_x (p0, z0, -65536), ++ z0 = svsub_x (p0, z0, -65536)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_s8.c +new file mode 100644 +index 000000000..0d7ba99aa +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_s8.c +@@ -0,0 +1,294 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** sub_s8_m_tied1: ++** sub z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (sub_s8_m_tied1, svint8_t, ++ z0 = svsub_s8_m (p0, z0, z1), ++ z0 = svsub_m (p0, z0, z1)) ++ ++/* ++** sub_s8_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** sub z0\.b, p0/m, z0\.b, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (sub_s8_m_tied2, svint8_t, ++ z0 = svsub_s8_m (p0, z1, z0), ++ z0 = svsub_m 
(p0, z1, z0)) ++ ++/* ++** sub_s8_m_untied: ++** movprfx z0, z1 ++** sub z0\.b, p0/m, z0\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (sub_s8_m_untied, svint8_t, ++ z0 = svsub_s8_m (p0, z1, z2), ++ z0 = svsub_m (p0, z1, z2)) ++ ++/* ++** sub_w0_s8_m_tied1: ++** mov (z[0-9]+\.b), w0 ++** sub z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (sub_w0_s8_m_tied1, svint8_t, int8_t, ++ z0 = svsub_n_s8_m (p0, z0, x0), ++ z0 = svsub_m (p0, z0, x0)) ++ ++/* ++** sub_w0_s8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), w0 ++** movprfx z0, z1 ++** sub z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (sub_w0_s8_m_untied, svint8_t, int8_t, ++ z0 = svsub_n_s8_m (p0, z1, x0), ++ z0 = svsub_m (p0, z1, x0)) ++ ++/* ++** sub_1_s8_m_tied1: ++** mov (z[0-9]+\.b), #-1 ++** add z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_s8_m_tied1, svint8_t, ++ z0 = svsub_n_s8_m (p0, z0, 1), ++ z0 = svsub_m (p0, z0, 1)) ++ ++/* ++** sub_1_s8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), #-1 ++** movprfx z0, z1 ++** add z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_s8_m_untied, svint8_t, ++ z0 = svsub_n_s8_m (p0, z1, 1), ++ z0 = svsub_m (p0, z1, 1)) ++ ++/* ++** sub_m1_s8_m: ++** mov (z[0-9]+\.b), #1 ++** add z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m1_s8_m, svint8_t, ++ z0 = svsub_n_s8_m (p0, z0, -1), ++ z0 = svsub_m (p0, z0, -1)) ++ ++/* ++** sub_s8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** sub z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (sub_s8_z_tied1, svint8_t, ++ z0 = svsub_s8_z (p0, z0, z1), ++ z0 = svsub_z (p0, z0, z1)) ++ ++/* ++** sub_s8_z_tied2: ++** movprfx z0\.b, p0/z, z0\.b ++** subr z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (sub_s8_z_tied2, svint8_t, ++ z0 = svsub_s8_z (p0, z1, z0), ++ z0 = svsub_z (p0, z1, z0)) ++ ++/* ++** sub_s8_z_untied: ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** sub z0\.b, p0/m, z0\.b, z2\.b ++** | ++** movprfx z0\.b, p0/z, z2\.b ++** subr z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_s8_z_untied, svint8_t, ++ z0 = svsub_s8_z (p0, z1, z2), ++ z0 = svsub_z (p0, z1, z2)) ++ ++/* ++** sub_w0_s8_z_tied1: ++** mov (z[0-9]+\.b), w0 ++** movprfx z0\.b, p0/z, z0\.b ++** sub z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (sub_w0_s8_z_tied1, svint8_t, int8_t, ++ z0 = svsub_n_s8_z (p0, z0, x0), ++ z0 = svsub_z (p0, z0, x0)) ++ ++/* ++** sub_w0_s8_z_untied: ++** mov (z[0-9]+\.b), w0 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** sub z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** subr z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (sub_w0_s8_z_untied, svint8_t, int8_t, ++ z0 = svsub_n_s8_z (p0, z1, x0), ++ z0 = svsub_z (p0, z1, x0)) ++ ++/* ++** sub_1_s8_z_tied1: ++** mov (z[0-9]+\.b), #-1 ++** movprfx z0\.b, p0/z, z0\.b ++** add z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_s8_z_tied1, svint8_t, ++ z0 = svsub_n_s8_z (p0, z0, 1), ++ z0 = svsub_z (p0, z0, 1)) ++ ++/* ++** sub_1_s8_z_untied: ++** mov (z[0-9]+\.b), #-1 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** add z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** add z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_s8_z_untied, svint8_t, ++ z0 = svsub_n_s8_z (p0, z1, 1), ++ z0 = svsub_z (p0, z1, 1)) ++ ++/* ++** sub_s8_x_tied1: ++** sub z0\.b, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (sub_s8_x_tied1, svint8_t, ++ z0 = svsub_s8_x (p0, z0, z1), ++ z0 = svsub_x (p0, z0, z1)) ++ ++/* ++** sub_s8_x_tied2: ++** sub z0\.b, z1\.b, z0\.b ++** ret ++*/ 
++TEST_UNIFORM_Z (sub_s8_x_tied2, svint8_t, ++ z0 = svsub_s8_x (p0, z1, z0), ++ z0 = svsub_x (p0, z1, z0)) ++ ++/* ++** sub_s8_x_untied: ++** sub z0\.b, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (sub_s8_x_untied, svint8_t, ++ z0 = svsub_s8_x (p0, z1, z2), ++ z0 = svsub_x (p0, z1, z2)) ++ ++/* ++** sub_w0_s8_x_tied1: ++** mov (z[0-9]+\.b), w0 ++** sub z0\.b, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (sub_w0_s8_x_tied1, svint8_t, int8_t, ++ z0 = svsub_n_s8_x (p0, z0, x0), ++ z0 = svsub_x (p0, z0, x0)) ++ ++/* ++** sub_w0_s8_x_untied: ++** mov (z[0-9]+\.b), w0 ++** sub z0\.b, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (sub_w0_s8_x_untied, svint8_t, int8_t, ++ z0 = svsub_n_s8_x (p0, z1, x0), ++ z0 = svsub_x (p0, z1, x0)) ++ ++/* ++** sub_1_s8_x_tied1: ++** add z0\.b, z0\.b, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_s8_x_tied1, svint8_t, ++ z0 = svsub_n_s8_x (p0, z0, 1), ++ z0 = svsub_x (p0, z0, 1)) ++ ++/* ++** sub_1_s8_x_untied: ++** movprfx z0, z1 ++** add z0\.b, z0\.b, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_s8_x_untied, svint8_t, ++ z0 = svsub_n_s8_x (p0, z1, 1), ++ z0 = svsub_x (p0, z1, 1)) ++ ++/* ++** sub_127_s8_x: ++** add z0\.b, z0\.b, #129 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_127_s8_x, svint8_t, ++ z0 = svsub_n_s8_x (p0, z0, 127), ++ z0 = svsub_x (p0, z0, 127)) ++ ++/* ++** sub_128_s8_x: ++** add z0\.b, z0\.b, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_128_s8_x, svint8_t, ++ z0 = svsub_n_s8_x (p0, z0, 128), ++ z0 = svsub_x (p0, z0, 128)) ++ ++/* ++** sub_255_s8_x: ++** add z0\.b, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_255_s8_x, svint8_t, ++ z0 = svsub_n_s8_x (p0, z0, 255), ++ z0 = svsub_x (p0, z0, 255)) ++ ++/* ++** sub_m1_s8_x: ++** add z0\.b, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m1_s8_x, svint8_t, ++ z0 = svsub_n_s8_x (p0, z0, -1), ++ z0 = svsub_x (p0, z0, -1)) ++ ++/* ++** sub_m127_s8_x: ++** add z0\.b, z0\.b, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m127_s8_x, svint8_t, ++ z0 = svsub_n_s8_x (p0, z0, -127), ++ z0 = svsub_x (p0, z0, -127)) ++ ++/* ++** sub_m128_s8_x: ++** add z0\.b, z0\.b, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m128_s8_x, svint8_t, ++ z0 = svsub_n_s8_x (p0, z0, -128), ++ z0 = svsub_x (p0, z0, -128)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_u16.c +new file mode 100644 +index 000000000..89620e159 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_u16.c +@@ -0,0 +1,377 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** sub_u16_m_tied1: ++** sub z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sub_u16_m_tied1, svuint16_t, ++ z0 = svsub_u16_m (p0, z0, z1), ++ z0 = svsub_m (p0, z0, z1)) ++ ++/* ++** sub_u16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** sub z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sub_u16_m_tied2, svuint16_t, ++ z0 = svsub_u16_m (p0, z1, z0), ++ z0 = svsub_m (p0, z1, z0)) ++ ++/* ++** sub_u16_m_untied: ++** movprfx z0, z1 ++** sub z0\.h, p0/m, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sub_u16_m_untied, svuint16_t, ++ z0 = svsub_u16_m (p0, z1, z2), ++ z0 = svsub_m (p0, z1, z2)) ++ ++/* ++** sub_w0_u16_m_tied1: ++** mov (z[0-9]+\.h), w0 ++** sub z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (sub_w0_u16_m_tied1, svuint16_t, uint16_t, ++ z0 = svsub_n_u16_m (p0, z0, x0), ++ z0 = svsub_m (p0, z0, x0)) ++ ++/* ++** sub_w0_u16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), w0 ++** movprfx z0, z1 ++** sub z0\.h, p0/m, 
z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (sub_w0_u16_m_untied, svuint16_t, uint16_t, ++ z0 = svsub_n_u16_m (p0, z1, x0), ++ z0 = svsub_m (p0, z1, x0)) ++ ++/* ++** sub_1_u16_m_tied1: ++** mov (z[0-9]+)\.b, #-1 ++** add z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_u16_m_tied1, svuint16_t, ++ z0 = svsub_n_u16_m (p0, z0, 1), ++ z0 = svsub_m (p0, z0, 1)) ++ ++/* ++** sub_1_u16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+)\.b, #-1 ++** movprfx z0, z1 ++** add z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_u16_m_untied, svuint16_t, ++ z0 = svsub_n_u16_m (p0, z1, 1), ++ z0 = svsub_m (p0, z1, 1)) ++ ++/* ++** sub_m2_u16_m: ++** mov (z[0-9]+\.h), #2 ++** add z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m2_u16_m, svuint16_t, ++ z0 = svsub_n_u16_m (p0, z0, -2), ++ z0 = svsub_m (p0, z0, -2)) ++ ++/* ++** sub_u16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** sub z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sub_u16_z_tied1, svuint16_t, ++ z0 = svsub_u16_z (p0, z0, z1), ++ z0 = svsub_z (p0, z0, z1)) ++ ++/* ++** sub_u16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** subr z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sub_u16_z_tied2, svuint16_t, ++ z0 = svsub_u16_z (p0, z1, z0), ++ z0 = svsub_z (p0, z1, z0)) ++ ++/* ++** sub_u16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** sub z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** subr z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_u16_z_untied, svuint16_t, ++ z0 = svsub_u16_z (p0, z1, z2), ++ z0 = svsub_z (p0, z1, z2)) ++ ++/* ++** sub_w0_u16_z_tied1: ++** mov (z[0-9]+\.h), w0 ++** movprfx z0\.h, p0/z, z0\.h ++** sub z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (sub_w0_u16_z_tied1, svuint16_t, uint16_t, ++ z0 = svsub_n_u16_z (p0, z0, x0), ++ z0 = svsub_z (p0, z0, x0)) ++ ++/* ++** sub_w0_u16_z_untied: ++** mov (z[0-9]+\.h), w0 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** sub z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** subr z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (sub_w0_u16_z_untied, svuint16_t, uint16_t, ++ z0 = svsub_n_u16_z (p0, z1, x0), ++ z0 = svsub_z (p0, z1, x0)) ++ ++/* ++** sub_1_u16_z_tied1: ++** mov (z[0-9]+)\.b, #-1 ++** movprfx z0\.h, p0/z, z0\.h ++** add z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_u16_z_tied1, svuint16_t, ++ z0 = svsub_n_u16_z (p0, z0, 1), ++ z0 = svsub_z (p0, z0, 1)) ++ ++/* ++** sub_1_u16_z_untied: ++** mov (z[0-9]+)\.b, #-1 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** add z0\.h, p0/m, z0\.h, \1\.h ++** | ++** movprfx z0\.h, p0/z, \1\.h ++** add z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_u16_z_untied, svuint16_t, ++ z0 = svsub_n_u16_z (p0, z1, 1), ++ z0 = svsub_z (p0, z1, 1)) ++ ++/* ++** sub_u16_x_tied1: ++** sub z0\.h, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sub_u16_x_tied1, svuint16_t, ++ z0 = svsub_u16_x (p0, z0, z1), ++ z0 = svsub_x (p0, z0, z1)) ++ ++/* ++** sub_u16_x_tied2: ++** sub z0\.h, z1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sub_u16_x_tied2, svuint16_t, ++ z0 = svsub_u16_x (p0, z1, z0), ++ z0 = svsub_x (p0, z1, z0)) ++ ++/* ++** sub_u16_x_untied: ++** sub z0\.h, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (sub_u16_x_untied, svuint16_t, ++ z0 = svsub_u16_x (p0, z1, z2), ++ z0 = svsub_x (p0, z1, z2)) ++ ++/* ++** sub_w0_u16_x_tied1: ++** mov (z[0-9]+\.h), w0 ++** sub z0\.h, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (sub_w0_u16_x_tied1, svuint16_t, uint16_t, ++ z0 = svsub_n_u16_x (p0, 
z0, x0), ++ z0 = svsub_x (p0, z0, x0)) ++ ++/* ++** sub_w0_u16_x_untied: ++** mov (z[0-9]+\.h), w0 ++** sub z0\.h, z1\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (sub_w0_u16_x_untied, svuint16_t, uint16_t, ++ z0 = svsub_n_u16_x (p0, z1, x0), ++ z0 = svsub_x (p0, z1, x0)) ++ ++/* ++** sub_1_u16_x_tied1: ++** sub z0\.h, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_u16_x_tied1, svuint16_t, ++ z0 = svsub_n_u16_x (p0, z0, 1), ++ z0 = svsub_x (p0, z0, 1)) ++ ++/* ++** sub_1_u16_x_untied: ++** movprfx z0, z1 ++** sub z0\.h, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_u16_x_untied, svuint16_t, ++ z0 = svsub_n_u16_x (p0, z1, 1), ++ z0 = svsub_x (p0, z1, 1)) ++ ++/* ++** sub_127_u16_x: ++** sub z0\.h, z0\.h, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_127_u16_x, svuint16_t, ++ z0 = svsub_n_u16_x (p0, z0, 127), ++ z0 = svsub_x (p0, z0, 127)) ++ ++/* ++** sub_128_u16_x: ++** sub z0\.h, z0\.h, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_128_u16_x, svuint16_t, ++ z0 = svsub_n_u16_x (p0, z0, 128), ++ z0 = svsub_x (p0, z0, 128)) ++ ++/* ++** sub_255_u16_x: ++** sub z0\.h, z0\.h, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_255_u16_x, svuint16_t, ++ z0 = svsub_n_u16_x (p0, z0, 255), ++ z0 = svsub_x (p0, z0, 255)) ++ ++/* ++** sub_256_u16_x: ++** add z0\.h, z0\.h, #65280 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_256_u16_x, svuint16_t, ++ z0 = svsub_n_u16_x (p0, z0, 256), ++ z0 = svsub_x (p0, z0, 256)) ++ ++/* ++** sub_257_u16_x: ++** mov (z[0-9]+\.h), #-257 ++** add z0\.h, (z0\.h, \1|\1, z0\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_257_u16_x, svuint16_t, ++ z0 = svsub_n_u16_x (p0, z0, 257), ++ z0 = svsub_x (p0, z0, 257)) ++ ++/* ++** sub_512_u16_x: ++** add z0\.h, z0\.h, #65024 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_512_u16_x, svuint16_t, ++ z0 = svsub_n_u16_x (p0, z0, 512), ++ z0 = svsub_x (p0, z0, 512)) ++ ++/* ++** sub_65280_u16_x: ++** add z0\.h, z0\.h, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_65280_u16_x, svuint16_t, ++ z0 = svsub_n_u16_x (p0, z0, 0xff00), ++ z0 = svsub_x (p0, z0, 0xff00)) ++ ++/* ++** sub_m1_u16_x: ++** add z0\.h, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m1_u16_x, svuint16_t, ++ z0 = svsub_n_u16_x (p0, z0, -1), ++ z0 = svsub_x (p0, z0, -1)) ++ ++/* ++** sub_m127_u16_x: ++** add z0\.h, z0\.h, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m127_u16_x, svuint16_t, ++ z0 = svsub_n_u16_x (p0, z0, -127), ++ z0 = svsub_x (p0, z0, -127)) ++ ++/* ++** sub_m128_u16_x: ++** add z0\.h, z0\.h, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m128_u16_x, svuint16_t, ++ z0 = svsub_n_u16_x (p0, z0, -128), ++ z0 = svsub_x (p0, z0, -128)) ++ ++/* ++** sub_m255_u16_x: ++** add z0\.h, z0\.h, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m255_u16_x, svuint16_t, ++ z0 = svsub_n_u16_x (p0, z0, -255), ++ z0 = svsub_x (p0, z0, -255)) ++ ++/* ++** sub_m256_u16_x: ++** add z0\.h, z0\.h, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m256_u16_x, svuint16_t, ++ z0 = svsub_n_u16_x (p0, z0, -256), ++ z0 = svsub_x (p0, z0, -256)) ++ ++/* ++** sub_m257_u16_x: ++** mov (z[0-9]+)\.b, #1 ++** add z0\.h, (z0\.h, \1\.h|\1\.h, z0\.h) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m257_u16_x, svuint16_t, ++ z0 = svsub_n_u16_x (p0, z0, -257), ++ z0 = svsub_x (p0, z0, -257)) ++ ++/* ++** sub_m512_u16_x: ++** add z0\.h, z0\.h, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m512_u16_x, svuint16_t, ++ z0 = svsub_n_u16_x (p0, z0, -512), ++ z0 = svsub_x (p0, z0, -512)) ++ ++/* ++** sub_m32768_u16_x: ++** add z0\.h, z0\.h, #32768 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m32768_u16_x, svuint16_t, ++ z0 = svsub_n_u16_x (p0, z0, -0x8000), ++ z0 = svsub_x (p0, z0, -0x8000)) +diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_u32.c +new file mode 100644 +index 000000000..c4b405d4d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_u32.c +@@ -0,0 +1,426 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** sub_u32_m_tied1: ++** sub z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sub_u32_m_tied1, svuint32_t, ++ z0 = svsub_u32_m (p0, z0, z1), ++ z0 = svsub_m (p0, z0, z1)) ++ ++/* ++** sub_u32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** sub z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sub_u32_m_tied2, svuint32_t, ++ z0 = svsub_u32_m (p0, z1, z0), ++ z0 = svsub_m (p0, z1, z0)) ++ ++/* ++** sub_u32_m_untied: ++** movprfx z0, z1 ++** sub z0\.s, p0/m, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sub_u32_m_untied, svuint32_t, ++ z0 = svsub_u32_m (p0, z1, z2), ++ z0 = svsub_m (p0, z1, z2)) ++ ++/* ++** sub_w0_u32_m_tied1: ++** mov (z[0-9]+\.s), w0 ++** sub z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (sub_w0_u32_m_tied1, svuint32_t, uint32_t, ++ z0 = svsub_n_u32_m (p0, z0, x0), ++ z0 = svsub_m (p0, z0, x0)) ++ ++/* ++** sub_w0_u32_m_untied: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0, z1 ++** sub z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (sub_w0_u32_m_untied, svuint32_t, uint32_t, ++ z0 = svsub_n_u32_m (p0, z1, x0), ++ z0 = svsub_m (p0, z1, x0)) ++ ++/* ++** sub_1_u32_m_tied1: ++** mov (z[0-9]+)\.b, #-1 ++** add z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_u32_m_tied1, svuint32_t, ++ z0 = svsub_n_u32_m (p0, z0, 1), ++ z0 = svsub_m (p0, z0, 1)) ++ ++/* ++** sub_1_u32_m_untied: { xfail *-*-* } ++** mov (z[0-9]+)\.b, #-1 ++** movprfx z0, z1 ++** add z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_u32_m_untied, svuint32_t, ++ z0 = svsub_n_u32_m (p0, z1, 1), ++ z0 = svsub_m (p0, z1, 1)) ++ ++/* ++** sub_m2_u32_m: ++** mov (z[0-9]+\.s), #2 ++** add z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m2_u32_m, svuint32_t, ++ z0 = svsub_n_u32_m (p0, z0, -2), ++ z0 = svsub_m (p0, z0, -2)) ++ ++/* ++** sub_u32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** sub z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sub_u32_z_tied1, svuint32_t, ++ z0 = svsub_u32_z (p0, z0, z1), ++ z0 = svsub_z (p0, z0, z1)) ++ ++/* ++** sub_u32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** subr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sub_u32_z_tied2, svuint32_t, ++ z0 = svsub_u32_z (p0, z1, z0), ++ z0 = svsub_z (p0, z1, z0)) ++ ++/* ++** sub_u32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** sub z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** subr z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_u32_z_untied, svuint32_t, ++ z0 = svsub_u32_z (p0, z1, z2), ++ z0 = svsub_z (p0, z1, z2)) ++ ++/* ++** sub_w0_u32_z_tied1: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0\.s, p0/z, z0\.s ++** sub z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (sub_w0_u32_z_tied1, svuint32_t, uint32_t, ++ z0 = svsub_n_u32_z (p0, z0, x0), ++ z0 = svsub_z (p0, z0, x0)) ++ ++/* ++** sub_w0_u32_z_untied: ++** mov (z[0-9]+\.s), w0 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** sub z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** subr z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (sub_w0_u32_z_untied, svuint32_t, uint32_t, ++ z0 = svsub_n_u32_z (p0, z1, x0), ++ z0 = svsub_z (p0, z1, x0)) 
++ ++/* ++** sub_1_u32_z_tied1: ++** mov (z[0-9]+)\.b, #-1 ++** movprfx z0\.s, p0/z, z0\.s ++** add z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_u32_z_tied1, svuint32_t, ++ z0 = svsub_n_u32_z (p0, z0, 1), ++ z0 = svsub_z (p0, z0, 1)) ++ ++/* ++** sub_1_u32_z_untied: ++** mov (z[0-9]+)\.b, #-1 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** add z0\.s, p0/m, z0\.s, \1\.s ++** | ++** movprfx z0\.s, p0/z, \1\.s ++** add z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_u32_z_untied, svuint32_t, ++ z0 = svsub_n_u32_z (p0, z1, 1), ++ z0 = svsub_z (p0, z1, 1)) ++ ++/* ++** sub_u32_x_tied1: ++** sub z0\.s, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sub_u32_x_tied1, svuint32_t, ++ z0 = svsub_u32_x (p0, z0, z1), ++ z0 = svsub_x (p0, z0, z1)) ++ ++/* ++** sub_u32_x_tied2: ++** sub z0\.s, z1\.s, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sub_u32_x_tied2, svuint32_t, ++ z0 = svsub_u32_x (p0, z1, z0), ++ z0 = svsub_x (p0, z1, z0)) ++ ++/* ++** sub_u32_x_untied: ++** sub z0\.s, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (sub_u32_x_untied, svuint32_t, ++ z0 = svsub_u32_x (p0, z1, z2), ++ z0 = svsub_x (p0, z1, z2)) ++ ++/* ++** sub_w0_u32_x_tied1: ++** mov (z[0-9]+\.s), w0 ++** sub z0\.s, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (sub_w0_u32_x_tied1, svuint32_t, uint32_t, ++ z0 = svsub_n_u32_x (p0, z0, x0), ++ z0 = svsub_x (p0, z0, x0)) ++ ++/* ++** sub_w0_u32_x_untied: ++** mov (z[0-9]+\.s), w0 ++** sub z0\.s, z1\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (sub_w0_u32_x_untied, svuint32_t, uint32_t, ++ z0 = svsub_n_u32_x (p0, z1, x0), ++ z0 = svsub_x (p0, z1, x0)) ++ ++/* ++** sub_1_u32_x_tied1: ++** sub z0\.s, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_u32_x_tied1, svuint32_t, ++ z0 = svsub_n_u32_x (p0, z0, 1), ++ z0 = svsub_x (p0, z0, 1)) ++ ++/* ++** sub_1_u32_x_untied: ++** movprfx z0, z1 ++** sub z0\.s, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_u32_x_untied, svuint32_t, ++ z0 = svsub_n_u32_x (p0, z1, 1), ++ z0 = svsub_x (p0, z1, 1)) ++ ++/* ++** sub_127_u32_x: ++** sub z0\.s, z0\.s, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_127_u32_x, svuint32_t, ++ z0 = svsub_n_u32_x (p0, z0, 127), ++ z0 = svsub_x (p0, z0, 127)) ++ ++/* ++** sub_128_u32_x: ++** sub z0\.s, z0\.s, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_128_u32_x, svuint32_t, ++ z0 = svsub_n_u32_x (p0, z0, 128), ++ z0 = svsub_x (p0, z0, 128)) ++ ++/* ++** sub_255_u32_x: ++** sub z0\.s, z0\.s, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_255_u32_x, svuint32_t, ++ z0 = svsub_n_u32_x (p0, z0, 255), ++ z0 = svsub_x (p0, z0, 255)) ++ ++/* ++** sub_256_u32_x: ++** sub z0\.s, z0\.s, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_256_u32_x, svuint32_t, ++ z0 = svsub_n_u32_x (p0, z0, 256), ++ z0 = svsub_x (p0, z0, 256)) ++ ++/* ++** sub_511_u32_x: ++** mov (z[0-9]+\.s), #-511 ++** add z0\.s, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_511_u32_x, svuint32_t, ++ z0 = svsub_n_u32_x (p0, z0, 511), ++ z0 = svsub_x (p0, z0, 511)) ++ ++/* ++** sub_512_u32_x: ++** sub z0\.s, z0\.s, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_512_u32_x, svuint32_t, ++ z0 = svsub_n_u32_x (p0, z0, 512), ++ z0 = svsub_x (p0, z0, 512)) ++ ++/* ++** sub_65280_u32_x: ++** sub z0\.s, z0\.s, #65280 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_65280_u32_x, svuint32_t, ++ z0 = svsub_n_u32_x (p0, z0, 0xff00), ++ z0 = svsub_x (p0, z0, 0xff00)) ++ ++/* ++** sub_65535_u32_x: ++** mov (z[0-9]+\.s), #-65535 ++** add z0\.s, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_65535_u32_x, svuint32_t, ++ z0 = svsub_n_u32_x (p0, z0, 65535), ++ z0 = svsub_x (p0, z0, 
65535)) ++ ++/* ++** sub_65536_u32_x: ++** mov (z[0-9]+\.s), #-65536 ++** add z0\.s, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_65536_u32_x, svuint32_t, ++ z0 = svsub_n_u32_x (p0, z0, 65536), ++ z0 = svsub_x (p0, z0, 65536)) ++ ++/* ++** sub_m1_u32_x: ++** add z0\.s, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m1_u32_x, svuint32_t, ++ z0 = svsub_n_u32_x (p0, z0, -1), ++ z0 = svsub_x (p0, z0, -1)) ++ ++/* ++** sub_m127_u32_x: ++** add z0\.s, z0\.s, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m127_u32_x, svuint32_t, ++ z0 = svsub_n_u32_x (p0, z0, -127), ++ z0 = svsub_x (p0, z0, -127)) ++ ++/* ++** sub_m128_u32_x: ++** add z0\.s, z0\.s, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m128_u32_x, svuint32_t, ++ z0 = svsub_n_u32_x (p0, z0, -128), ++ z0 = svsub_x (p0, z0, -128)) ++ ++/* ++** sub_m255_u32_x: ++** add z0\.s, z0\.s, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m255_u32_x, svuint32_t, ++ z0 = svsub_n_u32_x (p0, z0, -255), ++ z0 = svsub_x (p0, z0, -255)) ++ ++/* ++** sub_m256_u32_x: ++** add z0\.s, z0\.s, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m256_u32_x, svuint32_t, ++ z0 = svsub_n_u32_x (p0, z0, -256), ++ z0 = svsub_x (p0, z0, -256)) ++ ++/* ++** sub_m511_u32_x: ++** mov (z[0-9]+\.s), #511 ++** add z0\.s, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m511_u32_x, svuint32_t, ++ z0 = svsub_n_u32_x (p0, z0, -511), ++ z0 = svsub_x (p0, z0, -511)) ++ ++/* ++** sub_m512_u32_x: ++** add z0\.s, z0\.s, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m512_u32_x, svuint32_t, ++ z0 = svsub_n_u32_x (p0, z0, -512), ++ z0 = svsub_x (p0, z0, -512)) ++ ++/* ++** sub_m32768_u32_x: ++** add z0\.s, z0\.s, #32768 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m32768_u32_x, svuint32_t, ++ z0 = svsub_n_u32_x (p0, z0, -0x8000), ++ z0 = svsub_x (p0, z0, -0x8000)) ++ ++/* ++** sub_m65280_u32_x: ++** add z0\.s, z0\.s, #65280 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m65280_u32_x, svuint32_t, ++ z0 = svsub_n_u32_x (p0, z0, -0xff00), ++ z0 = svsub_x (p0, z0, -0xff00)) ++ ++/* ++** sub_m65535_u32_x: ++** mov (z[0-9]+\.s), #65535 ++** add z0\.s, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m65535_u32_x, svuint32_t, ++ z0 = svsub_n_u32_x (p0, z0, -65535), ++ z0 = svsub_x (p0, z0, -65535)) ++ ++/* ++** sub_m65536_u32_x: ++** mov (z[0-9]+\.s), #65536 ++** add z0\.s, (z0\.s, \1|\1, z0\.s) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m65536_u32_x, svuint32_t, ++ z0 = svsub_n_u32_x (p0, z0, -65536), ++ z0 = svsub_x (p0, z0, -65536)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_u64.c +new file mode 100644 +index 000000000..fb7f7173a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_u64.c +@@ -0,0 +1,426 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** sub_u64_m_tied1: ++** sub z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (sub_u64_m_tied1, svuint64_t, ++ z0 = svsub_u64_m (p0, z0, z1), ++ z0 = svsub_m (p0, z0, z1)) ++ ++/* ++** sub_u64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** sub z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_u64_m_tied2, svuint64_t, ++ z0 = svsub_u64_m (p0, z1, z0), ++ z0 = svsub_m (p0, z1, z0)) ++ ++/* ++** sub_u64_m_untied: ++** movprfx z0, z1 ++** sub z0\.d, p0/m, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (sub_u64_m_untied, svuint64_t, ++ z0 = svsub_u64_m (p0, z1, z2), ++ z0 = svsub_m (p0, z1, z2)) ++ ++/* ++** sub_x0_u64_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** sub z0\.d, p0/m, z0\.d, \1 ++** 
ret ++*/ ++TEST_UNIFORM_ZX (sub_x0_u64_m_tied1, svuint64_t, uint64_t, ++ z0 = svsub_n_u64_m (p0, z0, x0), ++ z0 = svsub_m (p0, z0, x0)) ++ ++/* ++** sub_x0_u64_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** sub z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (sub_x0_u64_m_untied, svuint64_t, uint64_t, ++ z0 = svsub_n_u64_m (p0, z1, x0), ++ z0 = svsub_m (p0, z1, x0)) ++ ++/* ++** sub_1_u64_m_tied1: ++** mov (z[0-9]+)\.b, #-1 ++** add z0\.d, p0/m, z0\.d, \1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_u64_m_tied1, svuint64_t, ++ z0 = svsub_n_u64_m (p0, z0, 1), ++ z0 = svsub_m (p0, z0, 1)) ++ ++/* ++** sub_1_u64_m_untied: { xfail *-*-* } ++** mov (z[0-9]+)\.b, #-1 ++** movprfx z0, z1 ++** add z0\.d, p0/m, z0\.d, \1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_u64_m_untied, svuint64_t, ++ z0 = svsub_n_u64_m (p0, z1, 1), ++ z0 = svsub_m (p0, z1, 1)) ++ ++/* ++** sub_m2_u64_m: ++** mov (z[0-9]+\.d), #2 ++** add z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m2_u64_m, svuint64_t, ++ z0 = svsub_n_u64_m (p0, z0, -2), ++ z0 = svsub_m (p0, z0, -2)) ++ ++/* ++** sub_u64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** sub z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (sub_u64_z_tied1, svuint64_t, ++ z0 = svsub_u64_z (p0, z0, z1), ++ z0 = svsub_z (p0, z0, z1)) ++ ++/* ++** sub_u64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** subr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (sub_u64_z_tied2, svuint64_t, ++ z0 = svsub_u64_z (p0, z1, z0), ++ z0 = svsub_z (p0, z1, z0)) ++ ++/* ++** sub_u64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** sub z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** subr z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_u64_z_untied, svuint64_t, ++ z0 = svsub_u64_z (p0, z1, z2), ++ z0 = svsub_z (p0, z1, z2)) ++ ++/* ++** sub_x0_u64_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z0\.d ++** sub z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (sub_x0_u64_z_tied1, svuint64_t, uint64_t, ++ z0 = svsub_n_u64_z (p0, z0, x0), ++ z0 = svsub_z (p0, z0, x0)) ++ ++/* ++** sub_x0_u64_z_untied: ++** mov (z[0-9]+\.d), x0 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** sub z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** subr z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (sub_x0_u64_z_untied, svuint64_t, uint64_t, ++ z0 = svsub_n_u64_z (p0, z1, x0), ++ z0 = svsub_z (p0, z1, x0)) ++ ++/* ++** sub_1_u64_z_tied1: ++** mov (z[0-9]+)\.b, #-1 ++** movprfx z0\.d, p0/z, z0\.d ++** add z0\.d, p0/m, z0\.d, \1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_u64_z_tied1, svuint64_t, ++ z0 = svsub_n_u64_z (p0, z0, 1), ++ z0 = svsub_z (p0, z0, 1)) ++ ++/* ++** sub_1_u64_z_untied: ++** mov (z[0-9]+)\.b, #-1 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** add z0\.d, p0/m, z0\.d, \1\.d ++** | ++** movprfx z0\.d, p0/z, \1\.d ++** add z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_u64_z_untied, svuint64_t, ++ z0 = svsub_n_u64_z (p0, z1, 1), ++ z0 = svsub_z (p0, z1, 1)) ++ ++/* ++** sub_u64_x_tied1: ++** sub z0\.d, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (sub_u64_x_tied1, svuint64_t, ++ z0 = svsub_u64_x (p0, z0, z1), ++ z0 = svsub_x (p0, z0, z1)) ++ ++/* ++** sub_u64_x_tied2: ++** sub z0\.d, z1\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (sub_u64_x_tied2, svuint64_t, ++ z0 = svsub_u64_x (p0, z1, z0), ++ z0 = svsub_x (p0, z1, z0)) ++ ++/* ++** sub_u64_x_untied: ++** sub z0\.d, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (sub_u64_x_untied, svuint64_t, ++ z0 = 
svsub_u64_x (p0, z1, z2), ++ z0 = svsub_x (p0, z1, z2)) ++ ++/* ++** sub_x0_u64_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** sub z0\.d, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (sub_x0_u64_x_tied1, svuint64_t, uint64_t, ++ z0 = svsub_n_u64_x (p0, z0, x0), ++ z0 = svsub_x (p0, z0, x0)) ++ ++/* ++** sub_x0_u64_x_untied: ++** mov (z[0-9]+\.d), x0 ++** sub z0\.d, z1\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (sub_x0_u64_x_untied, svuint64_t, uint64_t, ++ z0 = svsub_n_u64_x (p0, z1, x0), ++ z0 = svsub_x (p0, z1, x0)) ++ ++/* ++** sub_1_u64_x_tied1: ++** sub z0\.d, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_u64_x_tied1, svuint64_t, ++ z0 = svsub_n_u64_x (p0, z0, 1), ++ z0 = svsub_x (p0, z0, 1)) ++ ++/* ++** sub_1_u64_x_untied: ++** movprfx z0, z1 ++** sub z0\.d, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_u64_x_untied, svuint64_t, ++ z0 = svsub_n_u64_x (p0, z1, 1), ++ z0 = svsub_x (p0, z1, 1)) ++ ++/* ++** sub_127_u64_x: ++** sub z0\.d, z0\.d, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_127_u64_x, svuint64_t, ++ z0 = svsub_n_u64_x (p0, z0, 127), ++ z0 = svsub_x (p0, z0, 127)) ++ ++/* ++** sub_128_u64_x: ++** sub z0\.d, z0\.d, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_128_u64_x, svuint64_t, ++ z0 = svsub_n_u64_x (p0, z0, 128), ++ z0 = svsub_x (p0, z0, 128)) ++ ++/* ++** sub_255_u64_x: ++** sub z0\.d, z0\.d, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_255_u64_x, svuint64_t, ++ z0 = svsub_n_u64_x (p0, z0, 255), ++ z0 = svsub_x (p0, z0, 255)) ++ ++/* ++** sub_256_u64_x: ++** sub z0\.d, z0\.d, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_256_u64_x, svuint64_t, ++ z0 = svsub_n_u64_x (p0, z0, 256), ++ z0 = svsub_x (p0, z0, 256)) ++ ++/* ++** sub_511_u64_x: ++** mov (z[0-9]+\.d), #-511 ++** add z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_511_u64_x, svuint64_t, ++ z0 = svsub_n_u64_x (p0, z0, 511), ++ z0 = svsub_x (p0, z0, 511)) ++ ++/* ++** sub_512_u64_x: ++** sub z0\.d, z0\.d, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_512_u64_x, svuint64_t, ++ z0 = svsub_n_u64_x (p0, z0, 512), ++ z0 = svsub_x (p0, z0, 512)) ++ ++/* ++** sub_65280_u64_x: ++** sub z0\.d, z0\.d, #65280 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_65280_u64_x, svuint64_t, ++ z0 = svsub_n_u64_x (p0, z0, 0xff00), ++ z0 = svsub_x (p0, z0, 0xff00)) ++ ++/* ++** sub_65535_u64_x: ++** mov (z[0-9]+\.d), #-65535 ++** add z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_65535_u64_x, svuint64_t, ++ z0 = svsub_n_u64_x (p0, z0, 65535), ++ z0 = svsub_x (p0, z0, 65535)) ++ ++/* ++** sub_65536_u64_x: ++** mov (z[0-9]+\.d), #-65536 ++** add z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_65536_u64_x, svuint64_t, ++ z0 = svsub_n_u64_x (p0, z0, 65536), ++ z0 = svsub_x (p0, z0, 65536)) ++ ++/* ++** sub_m1_u64_x: ++** add z0\.d, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m1_u64_x, svuint64_t, ++ z0 = svsub_n_u64_x (p0, z0, -1), ++ z0 = svsub_x (p0, z0, -1)) ++ ++/* ++** sub_m127_u64_x: ++** add z0\.d, z0\.d, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m127_u64_x, svuint64_t, ++ z0 = svsub_n_u64_x (p0, z0, -127), ++ z0 = svsub_x (p0, z0, -127)) ++ ++/* ++** sub_m128_u64_x: ++** add z0\.d, z0\.d, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m128_u64_x, svuint64_t, ++ z0 = svsub_n_u64_x (p0, z0, -128), ++ z0 = svsub_x (p0, z0, -128)) ++ ++/* ++** sub_m255_u64_x: ++** add z0\.d, z0\.d, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m255_u64_x, svuint64_t, ++ z0 = svsub_n_u64_x (p0, z0, -255), ++ z0 = svsub_x (p0, z0, -255)) ++ ++/* ++** sub_m256_u64_x: ++** add z0\.d, z0\.d, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m256_u64_x, svuint64_t, 
++ z0 = svsub_n_u64_x (p0, z0, -256), ++ z0 = svsub_x (p0, z0, -256)) ++ ++/* ++** sub_m511_u64_x: ++** mov (z[0-9]+\.d), #511 ++** add z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m511_u64_x, svuint64_t, ++ z0 = svsub_n_u64_x (p0, z0, -511), ++ z0 = svsub_x (p0, z0, -511)) ++ ++/* ++** sub_m512_u64_x: ++** add z0\.d, z0\.d, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m512_u64_x, svuint64_t, ++ z0 = svsub_n_u64_x (p0, z0, -512), ++ z0 = svsub_x (p0, z0, -512)) ++ ++/* ++** sub_m32768_u64_x: ++** add z0\.d, z0\.d, #32768 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m32768_u64_x, svuint64_t, ++ z0 = svsub_n_u64_x (p0, z0, -0x8000), ++ z0 = svsub_x (p0, z0, -0x8000)) ++ ++/* ++** sub_m65280_u64_x: ++** add z0\.d, z0\.d, #65280 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m65280_u64_x, svuint64_t, ++ z0 = svsub_n_u64_x (p0, z0, -0xff00), ++ z0 = svsub_x (p0, z0, -0xff00)) ++ ++/* ++** sub_m65535_u64_x: ++** mov (z[0-9]+\.d), #65535 ++** add z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m65535_u64_x, svuint64_t, ++ z0 = svsub_n_u64_x (p0, z0, -65535), ++ z0 = svsub_x (p0, z0, -65535)) ++ ++/* ++** sub_m65536_u64_x: ++** mov (z[0-9]+\.d), #65536 ++** add z0\.d, (z0\.d, \1|\1, z0\.d) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m65536_u64_x, svuint64_t, ++ z0 = svsub_n_u64_x (p0, z0, -65536), ++ z0 = svsub_x (p0, z0, -65536)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_u8.c +new file mode 100644 +index 000000000..455204191 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_u8.c +@@ -0,0 +1,294 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** sub_u8_m_tied1: ++** sub z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (sub_u8_m_tied1, svuint8_t, ++ z0 = svsub_u8_m (p0, z0, z1), ++ z0 = svsub_m (p0, z0, z1)) ++ ++/* ++** sub_u8_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** sub z0\.b, p0/m, z0\.b, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (sub_u8_m_tied2, svuint8_t, ++ z0 = svsub_u8_m (p0, z1, z0), ++ z0 = svsub_m (p0, z1, z0)) ++ ++/* ++** sub_u8_m_untied: ++** movprfx z0, z1 ++** sub z0\.b, p0/m, z0\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (sub_u8_m_untied, svuint8_t, ++ z0 = svsub_u8_m (p0, z1, z2), ++ z0 = svsub_m (p0, z1, z2)) ++ ++/* ++** sub_w0_u8_m_tied1: ++** mov (z[0-9]+\.b), w0 ++** sub z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (sub_w0_u8_m_tied1, svuint8_t, uint8_t, ++ z0 = svsub_n_u8_m (p0, z0, x0), ++ z0 = svsub_m (p0, z0, x0)) ++ ++/* ++** sub_w0_u8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), w0 ++** movprfx z0, z1 ++** sub z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (sub_w0_u8_m_untied, svuint8_t, uint8_t, ++ z0 = svsub_n_u8_m (p0, z1, x0), ++ z0 = svsub_m (p0, z1, x0)) ++ ++/* ++** sub_1_u8_m_tied1: ++** mov (z[0-9]+\.b), #-1 ++** add z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_u8_m_tied1, svuint8_t, ++ z0 = svsub_n_u8_m (p0, z0, 1), ++ z0 = svsub_m (p0, z0, 1)) ++ ++/* ++** sub_1_u8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), #-1 ++** movprfx z0, z1 ++** add z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_u8_m_untied, svuint8_t, ++ z0 = svsub_n_u8_m (p0, z1, 1), ++ z0 = svsub_m (p0, z1, 1)) ++ ++/* ++** sub_m1_u8_m: ++** mov (z[0-9]+\.b), #1 ++** add z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m1_u8_m, svuint8_t, ++ z0 = svsub_n_u8_m (p0, z0, -1), ++ z0 = svsub_m (p0, z0, -1)) ++ ++/* ++** sub_u8_z_tied1: ++** 
movprfx z0\.b, p0/z, z0\.b ++** sub z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (sub_u8_z_tied1, svuint8_t, ++ z0 = svsub_u8_z (p0, z0, z1), ++ z0 = svsub_z (p0, z0, z1)) ++ ++/* ++** sub_u8_z_tied2: ++** movprfx z0\.b, p0/z, z0\.b ++** subr z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (sub_u8_z_tied2, svuint8_t, ++ z0 = svsub_u8_z (p0, z1, z0), ++ z0 = svsub_z (p0, z1, z0)) ++ ++/* ++** sub_u8_z_untied: ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** sub z0\.b, p0/m, z0\.b, z2\.b ++** | ++** movprfx z0\.b, p0/z, z2\.b ++** subr z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_u8_z_untied, svuint8_t, ++ z0 = svsub_u8_z (p0, z1, z2), ++ z0 = svsub_z (p0, z1, z2)) ++ ++/* ++** sub_w0_u8_z_tied1: ++** mov (z[0-9]+\.b), w0 ++** movprfx z0\.b, p0/z, z0\.b ++** sub z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (sub_w0_u8_z_tied1, svuint8_t, uint8_t, ++ z0 = svsub_n_u8_z (p0, z0, x0), ++ z0 = svsub_z (p0, z0, x0)) ++ ++/* ++** sub_w0_u8_z_untied: ++** mov (z[0-9]+\.b), w0 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** sub z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** subr z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (sub_w0_u8_z_untied, svuint8_t, uint8_t, ++ z0 = svsub_n_u8_z (p0, z1, x0), ++ z0 = svsub_z (p0, z1, x0)) ++ ++/* ++** sub_1_u8_z_tied1: ++** mov (z[0-9]+\.b), #-1 ++** movprfx z0\.b, p0/z, z0\.b ++** add z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_u8_z_tied1, svuint8_t, ++ z0 = svsub_n_u8_z (p0, z0, 1), ++ z0 = svsub_z (p0, z0, 1)) ++ ++/* ++** sub_1_u8_z_untied: ++** mov (z[0-9]+\.b), #-1 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** add z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** add z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_u8_z_untied, svuint8_t, ++ z0 = svsub_n_u8_z (p0, z1, 1), ++ z0 = svsub_z (p0, z1, 1)) ++ ++/* ++** sub_u8_x_tied1: ++** sub z0\.b, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (sub_u8_x_tied1, svuint8_t, ++ z0 = svsub_u8_x (p0, z0, z1), ++ z0 = svsub_x (p0, z0, z1)) ++ ++/* ++** sub_u8_x_tied2: ++** sub z0\.b, z1\.b, z0\.b ++** ret ++*/ ++TEST_UNIFORM_Z (sub_u8_x_tied2, svuint8_t, ++ z0 = svsub_u8_x (p0, z1, z0), ++ z0 = svsub_x (p0, z1, z0)) ++ ++/* ++** sub_u8_x_untied: ++** sub z0\.b, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (sub_u8_x_untied, svuint8_t, ++ z0 = svsub_u8_x (p0, z1, z2), ++ z0 = svsub_x (p0, z1, z2)) ++ ++/* ++** sub_w0_u8_x_tied1: ++** mov (z[0-9]+\.b), w0 ++** sub z0\.b, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (sub_w0_u8_x_tied1, svuint8_t, uint8_t, ++ z0 = svsub_n_u8_x (p0, z0, x0), ++ z0 = svsub_x (p0, z0, x0)) ++ ++/* ++** sub_w0_u8_x_untied: ++** mov (z[0-9]+\.b), w0 ++** sub z0\.b, z1\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (sub_w0_u8_x_untied, svuint8_t, uint8_t, ++ z0 = svsub_n_u8_x (p0, z1, x0), ++ z0 = svsub_x (p0, z1, x0)) ++ ++/* ++** sub_1_u8_x_tied1: ++** add z0\.b, z0\.b, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_u8_x_tied1, svuint8_t, ++ z0 = svsub_n_u8_x (p0, z0, 1), ++ z0 = svsub_x (p0, z0, 1)) ++ ++/* ++** sub_1_u8_x_untied: ++** movprfx z0, z1 ++** add z0\.b, z0\.b, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_1_u8_x_untied, svuint8_t, ++ z0 = svsub_n_u8_x (p0, z1, 1), ++ z0 = svsub_x (p0, z1, 1)) ++ ++/* ++** sub_127_u8_x: ++** add z0\.b, z0\.b, #129 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_127_u8_x, svuint8_t, ++ z0 = svsub_n_u8_x (p0, z0, 127), ++ z0 = svsub_x (p0, z0, 127)) ++ ++/* ++** sub_128_u8_x: ++** add z0\.b, z0\.b, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_128_u8_x, 
svuint8_t, ++ z0 = svsub_n_u8_x (p0, z0, 128), ++ z0 = svsub_x (p0, z0, 128)) ++ ++/* ++** sub_255_u8_x: ++** add z0\.b, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_255_u8_x, svuint8_t, ++ z0 = svsub_n_u8_x (p0, z0, 255), ++ z0 = svsub_x (p0, z0, 255)) ++ ++/* ++** sub_m1_u8_x: ++** add z0\.b, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m1_u8_x, svuint8_t, ++ z0 = svsub_n_u8_x (p0, z0, -1), ++ z0 = svsub_x (p0, z0, -1)) ++ ++/* ++** sub_m127_u8_x: ++** add z0\.b, z0\.b, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m127_u8_x, svuint8_t, ++ z0 = svsub_n_u8_x (p0, z0, -127), ++ z0 = svsub_x (p0, z0, -127)) ++ ++/* ++** sub_m128_u8_x: ++** add z0\.b, z0\.b, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (sub_m128_u8_x, svuint8_t, ++ z0 = svsub_n_u8_x (p0, z0, -128), ++ z0 = svsub_x (p0, z0, -128)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_f16.c +new file mode 100644 +index 000000000..e14357db2 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_f16.c +@@ -0,0 +1,444 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** subr_f16_m_tied1: ++** fsubr z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f16_m_tied1, svfloat16_t, ++ z0 = svsubr_f16_m (p0, z0, z1), ++ z0 = svsubr_m (p0, z0, z1)) ++ ++/* ++** subr_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fsubr z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f16_m_tied2, svfloat16_t, ++ z0 = svsubr_f16_m (p0, z1, z0), ++ z0 = svsubr_m (p0, z1, z0)) ++ ++/* ++** subr_f16_m_untied: ++** movprfx z0, z1 ++** fsubr z0\.h, p0/m, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f16_m_untied, svfloat16_t, ++ z0 = svsubr_f16_m (p0, z1, z2), ++ z0 = svsubr_m (p0, z1, z2)) ++ ++/* ++** subr_h4_f16_m_tied1: ++** mov (z[0-9]+\.h), h4 ++** fsubr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (subr_h4_f16_m_tied1, svfloat16_t, __fp16, ++ z0 = svsubr_n_f16_m (p0, z0, d4), ++ z0 = svsubr_m (p0, z0, d4)) ++ ++/* ++** subr_h4_f16_m_untied: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0, z1 ++** fsubr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (subr_h4_f16_m_untied, svfloat16_t, __fp16, ++ z0 = svsubr_n_f16_m (p0, z1, d4), ++ z0 = svsubr_m (p0, z1, d4)) ++ ++/* ++** subr_1_f16_m_tied1: ++** fsubr z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_f16_m_tied1, svfloat16_t, ++ z0 = svsubr_n_f16_m (p0, z0, 1), ++ z0 = svsubr_m (p0, z0, 1)) ++ ++/* ++** subr_1_f16_m_untied: ++** movprfx z0, z1 ++** fsubr z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_f16_m_untied, svfloat16_t, ++ z0 = svsubr_n_f16_m (p0, z1, 1), ++ z0 = svsubr_m (p0, z1, 1)) ++ ++/* ++** subr_0p5_f16_m_tied1: ++** fsubr z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_0p5_f16_m_tied1, svfloat16_t, ++ z0 = svsubr_n_f16_m (p0, z0, 0.5), ++ z0 = svsubr_m (p0, z0, 0.5)) ++ ++/* ++** subr_0p5_f16_m_untied: ++** movprfx z0, z1 ++** fsubr z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_0p5_f16_m_untied, svfloat16_t, ++ z0 = svsubr_n_f16_m (p0, z1, 0.5), ++ z0 = svsubr_m (p0, z1, 0.5)) ++ ++/* ++** subr_m1_f16_m_tied1: ++** fmov (z[0-9]+\.h), #-1\.0(?:e\+0)? ++** fsubr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_f16_m_tied1, svfloat16_t, ++ z0 = svsubr_n_f16_m (p0, z0, -1), ++ z0 = svsubr_m (p0, z0, -1)) ++ ++/* ++** subr_m1_f16_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.h), #-1\.0(?:e\+0)? 
++** movprfx z0, z1 ++** fsubr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_f16_m_untied, svfloat16_t, ++ z0 = svsubr_n_f16_m (p0, z1, -1), ++ z0 = svsubr_m (p0, z1, -1)) ++ ++/* ++** subr_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fsubr z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f16_z_tied1, svfloat16_t, ++ z0 = svsubr_f16_z (p0, z0, z1), ++ z0 = svsubr_z (p0, z0, z1)) ++ ++/* ++** subr_f16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** fsub z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f16_z_tied2, svfloat16_t, ++ z0 = svsubr_f16_z (p0, z1, z0), ++ z0 = svsubr_z (p0, z1, z0)) ++ ++/* ++** subr_f16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fsubr z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** fsub z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f16_z_untied, svfloat16_t, ++ z0 = svsubr_f16_z (p0, z1, z2), ++ z0 = svsubr_z (p0, z1, z2)) ++ ++/* ++** subr_h4_f16_z_tied1: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0\.h, p0/z, z0\.h ++** fsubr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (subr_h4_f16_z_tied1, svfloat16_t, __fp16, ++ z0 = svsubr_n_f16_z (p0, z0, d4), ++ z0 = svsubr_z (p0, z0, d4)) ++ ++/* ++** subr_h4_f16_z_untied: ++** mov (z[0-9]+\.h), h4 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fsubr z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** fsub z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (subr_h4_f16_z_untied, svfloat16_t, __fp16, ++ z0 = svsubr_n_f16_z (p0, z1, d4), ++ z0 = svsubr_z (p0, z1, d4)) ++ ++/* ++** subr_1_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fsubr z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_f16_z_tied1, svfloat16_t, ++ z0 = svsubr_n_f16_z (p0, z0, 1), ++ z0 = svsubr_z (p0, z0, 1)) ++ ++/* ++** subr_1_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** fsubr z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_f16_z_untied, svfloat16_t, ++ z0 = svsubr_n_f16_z (p0, z1, 1), ++ z0 = svsubr_z (p0, z1, 1)) ++ ++/* ++** subr_0p5_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fsubr z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_0p5_f16_z_tied1, svfloat16_t, ++ z0 = svsubr_n_f16_z (p0, z0, 0.5), ++ z0 = svsubr_z (p0, z0, 0.5)) ++ ++/* ++** subr_0p5_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** fsubr z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_0p5_f16_z_untied, svfloat16_t, ++ z0 = svsubr_n_f16_z (p0, z1, 0.5), ++ z0 = svsubr_z (p0, z1, 0.5)) ++ ++/* ++** subr_m1_f16_z_tied1: ++** fmov (z[0-9]+\.h), #-1\.0(?:e\+0)? ++** movprfx z0\.h, p0/z, z0\.h ++** fsubr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_f16_z_tied1, svfloat16_t, ++ z0 = svsubr_n_f16_z (p0, z0, -1), ++ z0 = svsubr_z (p0, z0, -1)) ++ ++/* ++** subr_m1_f16_z_untied: ++** fmov (z[0-9]+\.h), #-1\.0(?:e\+0)? 
++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fsubr z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** fsub z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_f16_z_untied, svfloat16_t, ++ z0 = svsubr_n_f16_z (p0, z1, -1), ++ z0 = svsubr_z (p0, z1, -1)) ++ ++/* ++** subr_f16_x_tied1: ++** fsubr z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f16_x_tied1, svfloat16_t, ++ z0 = svsubr_f16_x (p0, z0, z1), ++ z0 = svsubr_x (p0, z0, z1)) ++ ++/* ++** subr_f16_x_tied2: ++** fsub z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f16_x_tied2, svfloat16_t, ++ z0 = svsubr_f16_x (p0, z1, z0), ++ z0 = svsubr_x (p0, z1, z0)) ++ ++/* ++** subr_f16_x_untied: ++** ( ++** movprfx z0, z1 ++** fsubr z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0, z2 ++** fsub z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f16_x_untied, svfloat16_t, ++ z0 = svsubr_f16_x (p0, z1, z2), ++ z0 = svsubr_x (p0, z1, z2)) ++ ++/* ++** subr_h4_f16_x_tied1: ++** mov (z[0-9]+\.h), h4 ++** fsubr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (subr_h4_f16_x_tied1, svfloat16_t, __fp16, ++ z0 = svsubr_n_f16_x (p0, z0, d4), ++ z0 = svsubr_x (p0, z0, d4)) ++ ++/* ++** subr_h4_f16_x_untied: { xfail *-*-* } ++** mov z0\.h, h4 ++** fsub z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZD (subr_h4_f16_x_untied, svfloat16_t, __fp16, ++ z0 = svsubr_n_f16_x (p0, z1, d4), ++ z0 = svsubr_x (p0, z1, d4)) ++ ++/* ++** subr_1_f16_x_tied1: ++** fsubr z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_f16_x_tied1, svfloat16_t, ++ z0 = svsubr_n_f16_x (p0, z0, 1), ++ z0 = svsubr_x (p0, z0, 1)) ++ ++/* ++** subr_1_f16_x_untied: ++** movprfx z0, z1 ++** fsubr z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_f16_x_untied, svfloat16_t, ++ z0 = svsubr_n_f16_x (p0, z1, 1), ++ z0 = svsubr_x (p0, z1, 1)) ++ ++/* ++** subr_0p5_f16_x_tied1: ++** fsubr z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_0p5_f16_x_tied1, svfloat16_t, ++ z0 = svsubr_n_f16_x (p0, z0, 0.5), ++ z0 = svsubr_x (p0, z0, 0.5)) ++ ++/* ++** subr_0p5_f16_x_untied: ++** movprfx z0, z1 ++** fsubr z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_0p5_f16_x_untied, svfloat16_t, ++ z0 = svsubr_n_f16_x (p0, z1, 0.5), ++ z0 = svsubr_x (p0, z1, 0.5)) ++ ++/* ++** subr_m1_f16_x_tied1: ++** fmov (z[0-9]+\.h), #-1\.0(?:e\+0)? ++** fsubr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_f16_x_tied1, svfloat16_t, ++ z0 = svsubr_n_f16_x (p0, z0, -1), ++ z0 = svsubr_x (p0, z0, -1)) ++ ++/* ++** subr_m1_f16_x_untied: ++** fmov z0\.h, #-1\.0(?:e\+0)? ++** fsub z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_f16_x_untied, svfloat16_t, ++ z0 = svsubr_n_f16_x (p0, z1, -1), ++ z0 = svsubr_x (p0, z1, -1)) ++ ++/* ++** ptrue_subr_f16_x_tied1: ++** fsub z0\.h, z1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_f16_x_tied1, svfloat16_t, ++ z0 = svsubr_f16_x (svptrue_b16 (), z0, z1), ++ z0 = svsubr_x (svptrue_b16 (), z0, z1)) ++ ++/* ++** ptrue_subr_f16_x_tied2: ++** fsub z0\.h, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_f16_x_tied2, svfloat16_t, ++ z0 = svsubr_f16_x (svptrue_b16 (), z1, z0), ++ z0 = svsubr_x (svptrue_b16 (), z1, z0)) ++ ++/* ++** ptrue_subr_f16_x_untied: ++** fsub z0\.h, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_f16_x_untied, svfloat16_t, ++ z0 = svsubr_f16_x (svptrue_b16 (), z1, z2), ++ z0 = svsubr_x (svptrue_b16 (), z1, z2)) ++ ++/* ++** ptrue_subr_1_f16_x_tied1: ++** ... 
++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_1_f16_x_tied1, svfloat16_t, ++ z0 = svsubr_n_f16_x (svptrue_b16 (), z0, 1), ++ z0 = svsubr_x (svptrue_b16 (), z0, 1)) ++ ++/* ++** ptrue_subr_1_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_1_f16_x_untied, svfloat16_t, ++ z0 = svsubr_n_f16_x (svptrue_b16 (), z1, 1), ++ z0 = svsubr_x (svptrue_b16 (), z1, 1)) ++ ++/* ++** ptrue_subr_0p5_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_0p5_f16_x_tied1, svfloat16_t, ++ z0 = svsubr_n_f16_x (svptrue_b16 (), z0, 0.5), ++ z0 = svsubr_x (svptrue_b16 (), z0, 0.5)) ++ ++/* ++** ptrue_subr_0p5_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_0p5_f16_x_untied, svfloat16_t, ++ z0 = svsubr_n_f16_x (svptrue_b16 (), z1, 0.5), ++ z0 = svsubr_x (svptrue_b16 (), z1, 0.5)) ++ ++/* ++** ptrue_subr_m1_f16_x_tied1: ++** fmov (z[0-9]+\.h), #-1\.0(?:e\+0)? ++** fsub z0\.h, \1, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_m1_f16_x_tied1, svfloat16_t, ++ z0 = svsubr_n_f16_x (svptrue_b16 (), z0, -1), ++ z0 = svsubr_x (svptrue_b16 (), z0, -1)) ++ ++/* ++** ptrue_subr_m1_f16_x_untied: ++** fmov (z[0-9]+\.h), #-1\.0(?:e\+0)? ++** fsub z0\.h, \1, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_m1_f16_x_untied, svfloat16_t, ++ z0 = svsubr_n_f16_x (svptrue_b16 (), z1, -1), ++ z0 = svsubr_x (svptrue_b16 (), z1, -1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_f16_notrap.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_f16_notrap.c +new file mode 100644 +index 000000000..a31ebd2ef +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_f16_notrap.c +@@ -0,0 +1,439 @@ ++/* { dg-additional-options "-fno-trapping-math" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** subr_f16_m_tied1: ++** fsubr z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f16_m_tied1, svfloat16_t, ++ z0 = svsubr_f16_m (p0, z0, z1), ++ z0 = svsubr_m (p0, z0, z1)) ++ ++/* ++** subr_f16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fsubr z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f16_m_tied2, svfloat16_t, ++ z0 = svsubr_f16_m (p0, z1, z0), ++ z0 = svsubr_m (p0, z1, z0)) ++ ++/* ++** subr_f16_m_untied: ++** movprfx z0, z1 ++** fsubr z0\.h, p0/m, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f16_m_untied, svfloat16_t, ++ z0 = svsubr_f16_m (p0, z1, z2), ++ z0 = svsubr_m (p0, z1, z2)) ++ ++/* ++** subr_h4_f16_m_tied1: ++** mov (z[0-9]+\.h), h4 ++** fsubr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (subr_h4_f16_m_tied1, svfloat16_t, __fp16, ++ z0 = svsubr_n_f16_m (p0, z0, d4), ++ z0 = svsubr_m (p0, z0, d4)) ++ ++/* ++** subr_h4_f16_m_untied: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0, z1 ++** fsubr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (subr_h4_f16_m_untied, svfloat16_t, __fp16, ++ z0 = svsubr_n_f16_m (p0, z1, d4), ++ z0 = svsubr_m (p0, z1, d4)) ++ ++/* ++** subr_1_f16_m_tied1: ++** fsubr z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_f16_m_tied1, svfloat16_t, ++ z0 = svsubr_n_f16_m (p0, z0, 1), ++ z0 = svsubr_m (p0, z0, 1)) ++ ++/* ++** subr_1_f16_m_untied: ++** movprfx z0, z1 ++** fsubr z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_f16_m_untied, svfloat16_t, ++ z0 = svsubr_n_f16_m (p0, z1, 1), ++ z0 = svsubr_m (p0, z1, 1)) ++ ++/* ++** 
subr_0p5_f16_m_tied1: ++** fsubr z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_0p5_f16_m_tied1, svfloat16_t, ++ z0 = svsubr_n_f16_m (p0, z0, 0.5), ++ z0 = svsubr_m (p0, z0, 0.5)) ++ ++/* ++** subr_0p5_f16_m_untied: ++** movprfx z0, z1 ++** fsubr z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_0p5_f16_m_untied, svfloat16_t, ++ z0 = svsubr_n_f16_m (p0, z1, 0.5), ++ z0 = svsubr_m (p0, z1, 0.5)) ++ ++/* ++** subr_m1_f16_m_tied1: ++** fmov (z[0-9]+\.h), #-1\.0(?:e\+0)? ++** fsubr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_f16_m_tied1, svfloat16_t, ++ z0 = svsubr_n_f16_m (p0, z0, -1), ++ z0 = svsubr_m (p0, z0, -1)) ++ ++/* ++** subr_m1_f16_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.h), #-1\.0(?:e\+0)? ++** movprfx z0, z1 ++** fsubr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_f16_m_untied, svfloat16_t, ++ z0 = svsubr_n_f16_m (p0, z1, -1), ++ z0 = svsubr_m (p0, z1, -1)) ++ ++/* ++** subr_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fsubr z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f16_z_tied1, svfloat16_t, ++ z0 = svsubr_f16_z (p0, z0, z1), ++ z0 = svsubr_z (p0, z0, z1)) ++ ++/* ++** subr_f16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** fsub z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f16_z_tied2, svfloat16_t, ++ z0 = svsubr_f16_z (p0, z1, z0), ++ z0 = svsubr_z (p0, z1, z0)) ++ ++/* ++** subr_f16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fsubr z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** fsub z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f16_z_untied, svfloat16_t, ++ z0 = svsubr_f16_z (p0, z1, z2), ++ z0 = svsubr_z (p0, z1, z2)) ++ ++/* ++** subr_h4_f16_z_tied1: ++** mov (z[0-9]+\.h), h4 ++** movprfx z0\.h, p0/z, z0\.h ++** fsubr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (subr_h4_f16_z_tied1, svfloat16_t, __fp16, ++ z0 = svsubr_n_f16_z (p0, z0, d4), ++ z0 = svsubr_z (p0, z0, d4)) ++ ++/* ++** subr_h4_f16_z_untied: ++** mov (z[0-9]+\.h), h4 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fsubr z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** fsub z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (subr_h4_f16_z_untied, svfloat16_t, __fp16, ++ z0 = svsubr_n_f16_z (p0, z1, d4), ++ z0 = svsubr_z (p0, z1, d4)) ++ ++/* ++** subr_1_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fsubr z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_f16_z_tied1, svfloat16_t, ++ z0 = svsubr_n_f16_z (p0, z0, 1), ++ z0 = svsubr_z (p0, z0, 1)) ++ ++/* ++** subr_1_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** fsubr z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_f16_z_untied, svfloat16_t, ++ z0 = svsubr_n_f16_z (p0, z1, 1), ++ z0 = svsubr_z (p0, z1, 1)) ++ ++/* ++** subr_0p5_f16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** fsubr z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_0p5_f16_z_tied1, svfloat16_t, ++ z0 = svsubr_n_f16_z (p0, z0, 0.5), ++ z0 = svsubr_z (p0, z0, 0.5)) ++ ++/* ++** subr_0p5_f16_z_untied: ++** movprfx z0\.h, p0/z, z1\.h ++** fsubr z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_0p5_f16_z_untied, svfloat16_t, ++ z0 = svsubr_n_f16_z (p0, z1, 0.5), ++ z0 = svsubr_z (p0, z1, 0.5)) ++ ++/* ++** subr_m1_f16_z_tied1: ++** fmov (z[0-9]+\.h), #-1\.0(?:e\+0)? 
++** movprfx z0\.h, p0/z, z0\.h ++** fsubr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_f16_z_tied1, svfloat16_t, ++ z0 = svsubr_n_f16_z (p0, z0, -1), ++ z0 = svsubr_z (p0, z0, -1)) ++ ++/* ++** subr_m1_f16_z_untied: ++** fmov (z[0-9]+\.h), #-1\.0(?:e\+0)? ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** fsubr z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** fsub z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_f16_z_untied, svfloat16_t, ++ z0 = svsubr_n_f16_z (p0, z1, -1), ++ z0 = svsubr_z (p0, z1, -1)) ++ ++/* ++** subr_f16_x_tied1: ++** fsub z0\.h, z1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f16_x_tied1, svfloat16_t, ++ z0 = svsubr_f16_x (p0, z0, z1), ++ z0 = svsubr_x (p0, z0, z1)) ++ ++/* ++** subr_f16_x_tied2: ++** fsub z0\.h, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f16_x_tied2, svfloat16_t, ++ z0 = svsubr_f16_x (p0, z1, z0), ++ z0 = svsubr_x (p0, z1, z0)) ++ ++/* ++** subr_f16_x_untied: ++** fsub z0\.h, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f16_x_untied, svfloat16_t, ++ z0 = svsubr_f16_x (p0, z1, z2), ++ z0 = svsubr_x (p0, z1, z2)) ++ ++/* ++** subr_h4_f16_x_tied1: ++** mov (z[0-9]+\.h), h4 ++** fsub z0\.h, \1, z0\.h ++** ret ++*/ ++TEST_UNIFORM_ZD (subr_h4_f16_x_tied1, svfloat16_t, __fp16, ++ z0 = svsubr_n_f16_x (p0, z0, d4), ++ z0 = svsubr_x (p0, z0, d4)) ++ ++/* ++** subr_h4_f16_x_untied: ++** mov (z[0-9]+\.h), h4 ++** fsub z0\.h, \1, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZD (subr_h4_f16_x_untied, svfloat16_t, __fp16, ++ z0 = svsubr_n_f16_x (p0, z1, d4), ++ z0 = svsubr_x (p0, z1, d4)) ++ ++/* ++** subr_1_f16_x_tied1: ++** fsubr z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_f16_x_tied1, svfloat16_t, ++ z0 = svsubr_n_f16_x (p0, z0, 1), ++ z0 = svsubr_x (p0, z0, 1)) ++ ++/* ++** subr_1_f16_x_untied: ++** movprfx z0, z1 ++** fsubr z0\.h, p0/m, z0\.h, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_f16_x_untied, svfloat16_t, ++ z0 = svsubr_n_f16_x (p0, z1, 1), ++ z0 = svsubr_x (p0, z1, 1)) ++ ++/* ++** subr_0p5_f16_x_tied1: ++** fsubr z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_0p5_f16_x_tied1, svfloat16_t, ++ z0 = svsubr_n_f16_x (p0, z0, 0.5), ++ z0 = svsubr_x (p0, z0, 0.5)) ++ ++/* ++** subr_0p5_f16_x_untied: ++** movprfx z0, z1 ++** fsubr z0\.h, p0/m, z0\.h, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_0p5_f16_x_untied, svfloat16_t, ++ z0 = svsubr_n_f16_x (p0, z1, 0.5), ++ z0 = svsubr_x (p0, z1, 0.5)) ++ ++/* ++** subr_m1_f16_x_tied1: ++** fmov (z[0-9]+\.h), #-1\.0(?:e\+0)? ++** fsub z0\.h, \1, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_f16_x_tied1, svfloat16_t, ++ z0 = svsubr_n_f16_x (p0, z0, -1), ++ z0 = svsubr_x (p0, z0, -1)) ++ ++/* ++** subr_m1_f16_x_untied: ++** fmov (z[0-9]+\.h), #-1\.0(?:e\+0)? 
++** fsub z0\.h, \1, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_f16_x_untied, svfloat16_t, ++ z0 = svsubr_n_f16_x (p0, z1, -1), ++ z0 = svsubr_x (p0, z1, -1)) ++ ++/* ++** ptrue_subr_f16_x_tied1: ++** fsub z0\.h, z1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_f16_x_tied1, svfloat16_t, ++ z0 = svsubr_f16_x (svptrue_b16 (), z0, z1), ++ z0 = svsubr_x (svptrue_b16 (), z0, z1)) ++ ++/* ++** ptrue_subr_f16_x_tied2: ++** fsub z0\.h, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_f16_x_tied2, svfloat16_t, ++ z0 = svsubr_f16_x (svptrue_b16 (), z1, z0), ++ z0 = svsubr_x (svptrue_b16 (), z1, z0)) ++ ++/* ++** ptrue_subr_f16_x_untied: ++** fsub z0\.h, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_f16_x_untied, svfloat16_t, ++ z0 = svsubr_f16_x (svptrue_b16 (), z1, z2), ++ z0 = svsubr_x (svptrue_b16 (), z1, z2)) ++ ++/* ++** ptrue_subr_1_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_1_f16_x_tied1, svfloat16_t, ++ z0 = svsubr_n_f16_x (svptrue_b16 (), z0, 1), ++ z0 = svsubr_x (svptrue_b16 (), z0, 1)) ++ ++/* ++** ptrue_subr_1_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_1_f16_x_untied, svfloat16_t, ++ z0 = svsubr_n_f16_x (svptrue_b16 (), z1, 1), ++ z0 = svsubr_x (svptrue_b16 (), z1, 1)) ++ ++/* ++** ptrue_subr_0p5_f16_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_0p5_f16_x_tied1, svfloat16_t, ++ z0 = svsubr_n_f16_x (svptrue_b16 (), z0, 0.5), ++ z0 = svsubr_x (svptrue_b16 (), z0, 0.5)) ++ ++/* ++** ptrue_subr_0p5_f16_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_0p5_f16_x_untied, svfloat16_t, ++ z0 = svsubr_n_f16_x (svptrue_b16 (), z1, 0.5), ++ z0 = svsubr_x (svptrue_b16 (), z1, 0.5)) ++ ++/* ++** ptrue_subr_m1_f16_x_tied1: ++** fmov (z[0-9]+\.h), #-1\.0(?:e\+0)? ++** fsub z0\.h, \1, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_m1_f16_x_tied1, svfloat16_t, ++ z0 = svsubr_n_f16_x (svptrue_b16 (), z0, -1), ++ z0 = svsubr_x (svptrue_b16 (), z0, -1)) ++ ++/* ++** ptrue_subr_m1_f16_x_untied: ++** fmov (z[0-9]+\.h), #-1\.0(?:e\+0)? 
++** fsub z0\.h, \1, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_m1_f16_x_untied, svfloat16_t, ++ z0 = svsubr_n_f16_x (svptrue_b16 (), z1, -1), ++ z0 = svsubr_x (svptrue_b16 (), z1, -1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_f32.c +new file mode 100644 +index 000000000..98dc7ad2b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_f32.c +@@ -0,0 +1,444 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** subr_f32_m_tied1: ++** fsubr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f32_m_tied1, svfloat32_t, ++ z0 = svsubr_f32_m (p0, z0, z1), ++ z0 = svsubr_m (p0, z0, z1)) ++ ++/* ++** subr_f32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fsubr z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f32_m_tied2, svfloat32_t, ++ z0 = svsubr_f32_m (p0, z1, z0), ++ z0 = svsubr_m (p0, z1, z0)) ++ ++/* ++** subr_f32_m_untied: ++** movprfx z0, z1 ++** fsubr z0\.s, p0/m, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f32_m_untied, svfloat32_t, ++ z0 = svsubr_f32_m (p0, z1, z2), ++ z0 = svsubr_m (p0, z1, z2)) ++ ++/* ++** subr_s4_f32_m_tied1: ++** mov (z[0-9]+\.s), s4 ++** fsubr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (subr_s4_f32_m_tied1, svfloat32_t, float, ++ z0 = svsubr_n_f32_m (p0, z0, d4), ++ z0 = svsubr_m (p0, z0, d4)) ++ ++/* ++** subr_s4_f32_m_untied: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0, z1 ++** fsubr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (subr_s4_f32_m_untied, svfloat32_t, float, ++ z0 = svsubr_n_f32_m (p0, z1, d4), ++ z0 = svsubr_m (p0, z1, d4)) ++ ++/* ++** subr_1_f32_m_tied1: ++** fsubr z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_f32_m_tied1, svfloat32_t, ++ z0 = svsubr_n_f32_m (p0, z0, 1), ++ z0 = svsubr_m (p0, z0, 1)) ++ ++/* ++** subr_1_f32_m_untied: ++** movprfx z0, z1 ++** fsubr z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_f32_m_untied, svfloat32_t, ++ z0 = svsubr_n_f32_m (p0, z1, 1), ++ z0 = svsubr_m (p0, z1, 1)) ++ ++/* ++** subr_0p5_f32_m_tied1: ++** fsubr z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_0p5_f32_m_tied1, svfloat32_t, ++ z0 = svsubr_n_f32_m (p0, z0, 0.5), ++ z0 = svsubr_m (p0, z0, 0.5)) ++ ++/* ++** subr_0p5_f32_m_untied: ++** movprfx z0, z1 ++** fsubr z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_0p5_f32_m_untied, svfloat32_t, ++ z0 = svsubr_n_f32_m (p0, z1, 0.5), ++ z0 = svsubr_m (p0, z1, 0.5)) ++ ++/* ++** subr_m1_f32_m_tied1: ++** fmov (z[0-9]+\.s), #-1\.0(?:e\+0)? ++** fsubr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_f32_m_tied1, svfloat32_t, ++ z0 = svsubr_n_f32_m (p0, z0, -1), ++ z0 = svsubr_m (p0, z0, -1)) ++ ++/* ++** subr_m1_f32_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.s), #-1\.0(?:e\+0)? 
++** movprfx z0, z1 ++** fsubr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_f32_m_untied, svfloat32_t, ++ z0 = svsubr_n_f32_m (p0, z1, -1), ++ z0 = svsubr_m (p0, z1, -1)) ++ ++/* ++** subr_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fsubr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f32_z_tied1, svfloat32_t, ++ z0 = svsubr_f32_z (p0, z0, z1), ++ z0 = svsubr_z (p0, z0, z1)) ++ ++/* ++** subr_f32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** fsub z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f32_z_tied2, svfloat32_t, ++ z0 = svsubr_f32_z (p0, z1, z0), ++ z0 = svsubr_z (p0, z1, z0)) ++ ++/* ++** subr_f32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fsubr z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** fsub z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f32_z_untied, svfloat32_t, ++ z0 = svsubr_f32_z (p0, z1, z2), ++ z0 = svsubr_z (p0, z1, z2)) ++ ++/* ++** subr_s4_f32_z_tied1: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0\.s, p0/z, z0\.s ++** fsubr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (subr_s4_f32_z_tied1, svfloat32_t, float, ++ z0 = svsubr_n_f32_z (p0, z0, d4), ++ z0 = svsubr_z (p0, z0, d4)) ++ ++/* ++** subr_s4_f32_z_untied: ++** mov (z[0-9]+\.s), s4 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fsubr z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** fsub z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (subr_s4_f32_z_untied, svfloat32_t, float, ++ z0 = svsubr_n_f32_z (p0, z1, d4), ++ z0 = svsubr_z (p0, z1, d4)) ++ ++/* ++** subr_1_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fsubr z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_f32_z_tied1, svfloat32_t, ++ z0 = svsubr_n_f32_z (p0, z0, 1), ++ z0 = svsubr_z (p0, z0, 1)) ++ ++/* ++** subr_1_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** fsubr z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_f32_z_untied, svfloat32_t, ++ z0 = svsubr_n_f32_z (p0, z1, 1), ++ z0 = svsubr_z (p0, z1, 1)) ++ ++/* ++** subr_0p5_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fsubr z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_0p5_f32_z_tied1, svfloat32_t, ++ z0 = svsubr_n_f32_z (p0, z0, 0.5), ++ z0 = svsubr_z (p0, z0, 0.5)) ++ ++/* ++** subr_0p5_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** fsubr z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_0p5_f32_z_untied, svfloat32_t, ++ z0 = svsubr_n_f32_z (p0, z1, 0.5), ++ z0 = svsubr_z (p0, z1, 0.5)) ++ ++/* ++** subr_m1_f32_z_tied1: ++** fmov (z[0-9]+\.s), #-1\.0(?:e\+0)? ++** movprfx z0\.s, p0/z, z0\.s ++** fsubr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_f32_z_tied1, svfloat32_t, ++ z0 = svsubr_n_f32_z (p0, z0, -1), ++ z0 = svsubr_z (p0, z0, -1)) ++ ++/* ++** subr_m1_f32_z_untied: ++** fmov (z[0-9]+\.s), #-1\.0(?:e\+0)? 
++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fsubr z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** fsub z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_f32_z_untied, svfloat32_t, ++ z0 = svsubr_n_f32_z (p0, z1, -1), ++ z0 = svsubr_z (p0, z1, -1)) ++ ++/* ++** subr_f32_x_tied1: ++** fsubr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f32_x_tied1, svfloat32_t, ++ z0 = svsubr_f32_x (p0, z0, z1), ++ z0 = svsubr_x (p0, z0, z1)) ++ ++/* ++** subr_f32_x_tied2: ++** fsub z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f32_x_tied2, svfloat32_t, ++ z0 = svsubr_f32_x (p0, z1, z0), ++ z0 = svsubr_x (p0, z1, z0)) ++ ++/* ++** subr_f32_x_untied: ++** ( ++** movprfx z0, z1 ++** fsubr z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0, z2 ++** fsub z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f32_x_untied, svfloat32_t, ++ z0 = svsubr_f32_x (p0, z1, z2), ++ z0 = svsubr_x (p0, z1, z2)) ++ ++/* ++** subr_s4_f32_x_tied1: ++** mov (z[0-9]+\.s), s4 ++** fsubr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (subr_s4_f32_x_tied1, svfloat32_t, float, ++ z0 = svsubr_n_f32_x (p0, z0, d4), ++ z0 = svsubr_x (p0, z0, d4)) ++ ++/* ++** subr_s4_f32_x_untied: { xfail *-*-* } ++** mov z0\.s, s4 ++** fsub z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZD (subr_s4_f32_x_untied, svfloat32_t, float, ++ z0 = svsubr_n_f32_x (p0, z1, d4), ++ z0 = svsubr_x (p0, z1, d4)) ++ ++/* ++** subr_1_f32_x_tied1: ++** fsubr z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_f32_x_tied1, svfloat32_t, ++ z0 = svsubr_n_f32_x (p0, z0, 1), ++ z0 = svsubr_x (p0, z0, 1)) ++ ++/* ++** subr_1_f32_x_untied: ++** movprfx z0, z1 ++** fsubr z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_f32_x_untied, svfloat32_t, ++ z0 = svsubr_n_f32_x (p0, z1, 1), ++ z0 = svsubr_x (p0, z1, 1)) ++ ++/* ++** subr_0p5_f32_x_tied1: ++** fsubr z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_0p5_f32_x_tied1, svfloat32_t, ++ z0 = svsubr_n_f32_x (p0, z0, 0.5), ++ z0 = svsubr_x (p0, z0, 0.5)) ++ ++/* ++** subr_0p5_f32_x_untied: ++** movprfx z0, z1 ++** fsubr z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_0p5_f32_x_untied, svfloat32_t, ++ z0 = svsubr_n_f32_x (p0, z1, 0.5), ++ z0 = svsubr_x (p0, z1, 0.5)) ++ ++/* ++** subr_m1_f32_x_tied1: ++** fmov (z[0-9]+\.s), #-1\.0(?:e\+0)? ++** fsubr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_f32_x_tied1, svfloat32_t, ++ z0 = svsubr_n_f32_x (p0, z0, -1), ++ z0 = svsubr_x (p0, z0, -1)) ++ ++/* ++** subr_m1_f32_x_untied: ++** fmov z0\.s, #-1\.0(?:e\+0)? ++** fsub z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_f32_x_untied, svfloat32_t, ++ z0 = svsubr_n_f32_x (p0, z1, -1), ++ z0 = svsubr_x (p0, z1, -1)) ++ ++/* ++** ptrue_subr_f32_x_tied1: ++** fsub z0\.s, z1\.s, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_f32_x_tied1, svfloat32_t, ++ z0 = svsubr_f32_x (svptrue_b32 (), z0, z1), ++ z0 = svsubr_x (svptrue_b32 (), z0, z1)) ++ ++/* ++** ptrue_subr_f32_x_tied2: ++** fsub z0\.s, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_f32_x_tied2, svfloat32_t, ++ z0 = svsubr_f32_x (svptrue_b32 (), z1, z0), ++ z0 = svsubr_x (svptrue_b32 (), z1, z0)) ++ ++/* ++** ptrue_subr_f32_x_untied: ++** fsub z0\.s, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_f32_x_untied, svfloat32_t, ++ z0 = svsubr_f32_x (svptrue_b32 (), z1, z2), ++ z0 = svsubr_x (svptrue_b32 (), z1, z2)) ++ ++/* ++** ptrue_subr_1_f32_x_tied1: ++** ... 
++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_1_f32_x_tied1, svfloat32_t, ++ z0 = svsubr_n_f32_x (svptrue_b32 (), z0, 1), ++ z0 = svsubr_x (svptrue_b32 (), z0, 1)) ++ ++/* ++** ptrue_subr_1_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_1_f32_x_untied, svfloat32_t, ++ z0 = svsubr_n_f32_x (svptrue_b32 (), z1, 1), ++ z0 = svsubr_x (svptrue_b32 (), z1, 1)) ++ ++/* ++** ptrue_subr_0p5_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_0p5_f32_x_tied1, svfloat32_t, ++ z0 = svsubr_n_f32_x (svptrue_b32 (), z0, 0.5), ++ z0 = svsubr_x (svptrue_b32 (), z0, 0.5)) ++ ++/* ++** ptrue_subr_0p5_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_0p5_f32_x_untied, svfloat32_t, ++ z0 = svsubr_n_f32_x (svptrue_b32 (), z1, 0.5), ++ z0 = svsubr_x (svptrue_b32 (), z1, 0.5)) ++ ++/* ++** ptrue_subr_m1_f32_x_tied1: ++** fmov (z[0-9]+\.s), #-1\.0(?:e\+0)? ++** fsub z0\.s, \1, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_m1_f32_x_tied1, svfloat32_t, ++ z0 = svsubr_n_f32_x (svptrue_b32 (), z0, -1), ++ z0 = svsubr_x (svptrue_b32 (), z0, -1)) ++ ++/* ++** ptrue_subr_m1_f32_x_untied: ++** fmov (z[0-9]+\.s), #-1\.0(?:e\+0)? ++** fsub z0\.s, \1, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_m1_f32_x_untied, svfloat32_t, ++ z0 = svsubr_n_f32_x (svptrue_b32 (), z1, -1), ++ z0 = svsubr_x (svptrue_b32 (), z1, -1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_f32_notrap.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_f32_notrap.c +new file mode 100644 +index 000000000..75ae0dc61 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_f32_notrap.c +@@ -0,0 +1,439 @@ ++/* { dg-additional-options "-fno-trapping-math" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** subr_f32_m_tied1: ++** fsubr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f32_m_tied1, svfloat32_t, ++ z0 = svsubr_f32_m (p0, z0, z1), ++ z0 = svsubr_m (p0, z0, z1)) ++ ++/* ++** subr_f32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** fsubr z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f32_m_tied2, svfloat32_t, ++ z0 = svsubr_f32_m (p0, z1, z0), ++ z0 = svsubr_m (p0, z1, z0)) ++ ++/* ++** subr_f32_m_untied: ++** movprfx z0, z1 ++** fsubr z0\.s, p0/m, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f32_m_untied, svfloat32_t, ++ z0 = svsubr_f32_m (p0, z1, z2), ++ z0 = svsubr_m (p0, z1, z2)) ++ ++/* ++** subr_s4_f32_m_tied1: ++** mov (z[0-9]+\.s), s4 ++** fsubr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (subr_s4_f32_m_tied1, svfloat32_t, float, ++ z0 = svsubr_n_f32_m (p0, z0, d4), ++ z0 = svsubr_m (p0, z0, d4)) ++ ++/* ++** subr_s4_f32_m_untied: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0, z1 ++** fsubr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (subr_s4_f32_m_untied, svfloat32_t, float, ++ z0 = svsubr_n_f32_m (p0, z1, d4), ++ z0 = svsubr_m (p0, z1, d4)) ++ ++/* ++** subr_1_f32_m_tied1: ++** fsubr z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_f32_m_tied1, svfloat32_t, ++ z0 = svsubr_n_f32_m (p0, z0, 1), ++ z0 = svsubr_m (p0, z0, 1)) ++ ++/* ++** subr_1_f32_m_untied: ++** movprfx z0, z1 ++** fsubr z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_f32_m_untied, svfloat32_t, ++ z0 = svsubr_n_f32_m (p0, z1, 1), ++ z0 = svsubr_m (p0, z1, 1)) ++ ++/* ++** 
subr_0p5_f32_m_tied1: ++** fsubr z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_0p5_f32_m_tied1, svfloat32_t, ++ z0 = svsubr_n_f32_m (p0, z0, 0.5), ++ z0 = svsubr_m (p0, z0, 0.5)) ++ ++/* ++** subr_0p5_f32_m_untied: ++** movprfx z0, z1 ++** fsubr z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_0p5_f32_m_untied, svfloat32_t, ++ z0 = svsubr_n_f32_m (p0, z1, 0.5), ++ z0 = svsubr_m (p0, z1, 0.5)) ++ ++/* ++** subr_m1_f32_m_tied1: ++** fmov (z[0-9]+\.s), #-1\.0(?:e\+0)? ++** fsubr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_f32_m_tied1, svfloat32_t, ++ z0 = svsubr_n_f32_m (p0, z0, -1), ++ z0 = svsubr_m (p0, z0, -1)) ++ ++/* ++** subr_m1_f32_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.s), #-1\.0(?:e\+0)? ++** movprfx z0, z1 ++** fsubr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_f32_m_untied, svfloat32_t, ++ z0 = svsubr_n_f32_m (p0, z1, -1), ++ z0 = svsubr_m (p0, z1, -1)) ++ ++/* ++** subr_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fsubr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f32_z_tied1, svfloat32_t, ++ z0 = svsubr_f32_z (p0, z0, z1), ++ z0 = svsubr_z (p0, z0, z1)) ++ ++/* ++** subr_f32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** fsub z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f32_z_tied2, svfloat32_t, ++ z0 = svsubr_f32_z (p0, z1, z0), ++ z0 = svsubr_z (p0, z1, z0)) ++ ++/* ++** subr_f32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fsubr z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** fsub z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f32_z_untied, svfloat32_t, ++ z0 = svsubr_f32_z (p0, z1, z2), ++ z0 = svsubr_z (p0, z1, z2)) ++ ++/* ++** subr_s4_f32_z_tied1: ++** mov (z[0-9]+\.s), s4 ++** movprfx z0\.s, p0/z, z0\.s ++** fsubr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (subr_s4_f32_z_tied1, svfloat32_t, float, ++ z0 = svsubr_n_f32_z (p0, z0, d4), ++ z0 = svsubr_z (p0, z0, d4)) ++ ++/* ++** subr_s4_f32_z_untied: ++** mov (z[0-9]+\.s), s4 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fsubr z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** fsub z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (subr_s4_f32_z_untied, svfloat32_t, float, ++ z0 = svsubr_n_f32_z (p0, z1, d4), ++ z0 = svsubr_z (p0, z1, d4)) ++ ++/* ++** subr_1_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fsubr z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_f32_z_tied1, svfloat32_t, ++ z0 = svsubr_n_f32_z (p0, z0, 1), ++ z0 = svsubr_z (p0, z0, 1)) ++ ++/* ++** subr_1_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** fsubr z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_f32_z_untied, svfloat32_t, ++ z0 = svsubr_n_f32_z (p0, z1, 1), ++ z0 = svsubr_z (p0, z1, 1)) ++ ++/* ++** subr_0p5_f32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** fsubr z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_0p5_f32_z_tied1, svfloat32_t, ++ z0 = svsubr_n_f32_z (p0, z0, 0.5), ++ z0 = svsubr_z (p0, z0, 0.5)) ++ ++/* ++** subr_0p5_f32_z_untied: ++** movprfx z0\.s, p0/z, z1\.s ++** fsubr z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_0p5_f32_z_untied, svfloat32_t, ++ z0 = svsubr_n_f32_z (p0, z1, 0.5), ++ z0 = svsubr_z (p0, z1, 0.5)) ++ ++/* ++** subr_m1_f32_z_tied1: ++** fmov (z[0-9]+\.s), #-1\.0(?:e\+0)? 
++** movprfx z0\.s, p0/z, z0\.s ++** fsubr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_f32_z_tied1, svfloat32_t, ++ z0 = svsubr_n_f32_z (p0, z0, -1), ++ z0 = svsubr_z (p0, z0, -1)) ++ ++/* ++** subr_m1_f32_z_untied: ++** fmov (z[0-9]+\.s), #-1\.0(?:e\+0)? ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** fsubr z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** fsub z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_f32_z_untied, svfloat32_t, ++ z0 = svsubr_n_f32_z (p0, z1, -1), ++ z0 = svsubr_z (p0, z1, -1)) ++ ++/* ++** subr_f32_x_tied1: ++** fsub z0\.s, z1\.s, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f32_x_tied1, svfloat32_t, ++ z0 = svsubr_f32_x (p0, z0, z1), ++ z0 = svsubr_x (p0, z0, z1)) ++ ++/* ++** subr_f32_x_tied2: ++** fsub z0\.s, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f32_x_tied2, svfloat32_t, ++ z0 = svsubr_f32_x (p0, z1, z0), ++ z0 = svsubr_x (p0, z1, z0)) ++ ++/* ++** subr_f32_x_untied: ++** fsub z0\.s, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f32_x_untied, svfloat32_t, ++ z0 = svsubr_f32_x (p0, z1, z2), ++ z0 = svsubr_x (p0, z1, z2)) ++ ++/* ++** subr_s4_f32_x_tied1: ++** mov (z[0-9]+\.s), s4 ++** fsub z0\.s, \1, z0\.s ++** ret ++*/ ++TEST_UNIFORM_ZD (subr_s4_f32_x_tied1, svfloat32_t, float, ++ z0 = svsubr_n_f32_x (p0, z0, d4), ++ z0 = svsubr_x (p0, z0, d4)) ++ ++/* ++** subr_s4_f32_x_untied: ++** mov (z[0-9]+\.s), s4 ++** fsub z0\.s, \1, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZD (subr_s4_f32_x_untied, svfloat32_t, float, ++ z0 = svsubr_n_f32_x (p0, z1, d4), ++ z0 = svsubr_x (p0, z1, d4)) ++ ++/* ++** subr_1_f32_x_tied1: ++** fsubr z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_f32_x_tied1, svfloat32_t, ++ z0 = svsubr_n_f32_x (p0, z0, 1), ++ z0 = svsubr_x (p0, z0, 1)) ++ ++/* ++** subr_1_f32_x_untied: ++** movprfx z0, z1 ++** fsubr z0\.s, p0/m, z0\.s, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_f32_x_untied, svfloat32_t, ++ z0 = svsubr_n_f32_x (p0, z1, 1), ++ z0 = svsubr_x (p0, z1, 1)) ++ ++/* ++** subr_0p5_f32_x_tied1: ++** fsubr z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_0p5_f32_x_tied1, svfloat32_t, ++ z0 = svsubr_n_f32_x (p0, z0, 0.5), ++ z0 = svsubr_x (p0, z0, 0.5)) ++ ++/* ++** subr_0p5_f32_x_untied: ++** movprfx z0, z1 ++** fsubr z0\.s, p0/m, z0\.s, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_0p5_f32_x_untied, svfloat32_t, ++ z0 = svsubr_n_f32_x (p0, z1, 0.5), ++ z0 = svsubr_x (p0, z1, 0.5)) ++ ++/* ++** subr_m1_f32_x_tied1: ++** fmov (z[0-9]+\.s), #-1\.0(?:e\+0)? ++** fsub z0\.s, \1, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_f32_x_tied1, svfloat32_t, ++ z0 = svsubr_n_f32_x (p0, z0, -1), ++ z0 = svsubr_x (p0, z0, -1)) ++ ++/* ++** subr_m1_f32_x_untied: ++** fmov (z[0-9]+\.s), #-1\.0(?:e\+0)? 
++** fsub z0\.s, \1, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_f32_x_untied, svfloat32_t, ++ z0 = svsubr_n_f32_x (p0, z1, -1), ++ z0 = svsubr_x (p0, z1, -1)) ++ ++/* ++** ptrue_subr_f32_x_tied1: ++** fsub z0\.s, z1\.s, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_f32_x_tied1, svfloat32_t, ++ z0 = svsubr_f32_x (svptrue_b32 (), z0, z1), ++ z0 = svsubr_x (svptrue_b32 (), z0, z1)) ++ ++/* ++** ptrue_subr_f32_x_tied2: ++** fsub z0\.s, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_f32_x_tied2, svfloat32_t, ++ z0 = svsubr_f32_x (svptrue_b32 (), z1, z0), ++ z0 = svsubr_x (svptrue_b32 (), z1, z0)) ++ ++/* ++** ptrue_subr_f32_x_untied: ++** fsub z0\.s, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_f32_x_untied, svfloat32_t, ++ z0 = svsubr_f32_x (svptrue_b32 (), z1, z2), ++ z0 = svsubr_x (svptrue_b32 (), z1, z2)) ++ ++/* ++** ptrue_subr_1_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_1_f32_x_tied1, svfloat32_t, ++ z0 = svsubr_n_f32_x (svptrue_b32 (), z0, 1), ++ z0 = svsubr_x (svptrue_b32 (), z0, 1)) ++ ++/* ++** ptrue_subr_1_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_1_f32_x_untied, svfloat32_t, ++ z0 = svsubr_n_f32_x (svptrue_b32 (), z1, 1), ++ z0 = svsubr_x (svptrue_b32 (), z1, 1)) ++ ++/* ++** ptrue_subr_0p5_f32_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_0p5_f32_x_tied1, svfloat32_t, ++ z0 = svsubr_n_f32_x (svptrue_b32 (), z0, 0.5), ++ z0 = svsubr_x (svptrue_b32 (), z0, 0.5)) ++ ++/* ++** ptrue_subr_0p5_f32_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_0p5_f32_x_untied, svfloat32_t, ++ z0 = svsubr_n_f32_x (svptrue_b32 (), z1, 0.5), ++ z0 = svsubr_x (svptrue_b32 (), z1, 0.5)) ++ ++/* ++** ptrue_subr_m1_f32_x_tied1: ++** fmov (z[0-9]+\.s), #-1\.0(?:e\+0)? ++** fsub z0\.s, \1, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_m1_f32_x_tied1, svfloat32_t, ++ z0 = svsubr_n_f32_x (svptrue_b32 (), z0, -1), ++ z0 = svsubr_x (svptrue_b32 (), z0, -1)) ++ ++/* ++** ptrue_subr_m1_f32_x_untied: ++** fmov (z[0-9]+\.s), #-1\.0(?:e\+0)? 
++** fsub z0\.s, \1, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_m1_f32_x_untied, svfloat32_t, ++ z0 = svsubr_n_f32_x (svptrue_b32 (), z1, -1), ++ z0 = svsubr_x (svptrue_b32 (), z1, -1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_f64.c +new file mode 100644 +index 000000000..81f1112d7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_f64.c +@@ -0,0 +1,444 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** subr_f64_m_tied1: ++** fsubr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f64_m_tied1, svfloat64_t, ++ z0 = svsubr_f64_m (p0, z0, z1), ++ z0 = svsubr_m (p0, z0, z1)) ++ ++/* ++** subr_f64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fsubr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f64_m_tied2, svfloat64_t, ++ z0 = svsubr_f64_m (p0, z1, z0), ++ z0 = svsubr_m (p0, z1, z0)) ++ ++/* ++** subr_f64_m_untied: ++** movprfx z0, z1 ++** fsubr z0\.d, p0/m, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f64_m_untied, svfloat64_t, ++ z0 = svsubr_f64_m (p0, z1, z2), ++ z0 = svsubr_m (p0, z1, z2)) ++ ++/* ++** subr_d4_f64_m_tied1: ++** mov (z[0-9]+\.d), d4 ++** fsubr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (subr_d4_f64_m_tied1, svfloat64_t, double, ++ z0 = svsubr_n_f64_m (p0, z0, d4), ++ z0 = svsubr_m (p0, z0, d4)) ++ ++/* ++** subr_d4_f64_m_untied: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0, z1 ++** fsubr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (subr_d4_f64_m_untied, svfloat64_t, double, ++ z0 = svsubr_n_f64_m (p0, z1, d4), ++ z0 = svsubr_m (p0, z1, d4)) ++ ++/* ++** subr_1_f64_m_tied1: ++** fsubr z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_f64_m_tied1, svfloat64_t, ++ z0 = svsubr_n_f64_m (p0, z0, 1), ++ z0 = svsubr_m (p0, z0, 1)) ++ ++/* ++** subr_1_f64_m_untied: ++** movprfx z0, z1 ++** fsubr z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_f64_m_untied, svfloat64_t, ++ z0 = svsubr_n_f64_m (p0, z1, 1), ++ z0 = svsubr_m (p0, z1, 1)) ++ ++/* ++** subr_0p5_f64_m_tied1: ++** fsubr z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_0p5_f64_m_tied1, svfloat64_t, ++ z0 = svsubr_n_f64_m (p0, z0, 0.5), ++ z0 = svsubr_m (p0, z0, 0.5)) ++ ++/* ++** subr_0p5_f64_m_untied: ++** movprfx z0, z1 ++** fsubr z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_0p5_f64_m_untied, svfloat64_t, ++ z0 = svsubr_n_f64_m (p0, z1, 0.5), ++ z0 = svsubr_m (p0, z1, 0.5)) ++ ++/* ++** subr_m1_f64_m_tied1: ++** fmov (z[0-9]+\.d), #-1\.0(?:e\+0)? ++** fsubr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_f64_m_tied1, svfloat64_t, ++ z0 = svsubr_n_f64_m (p0, z0, -1), ++ z0 = svsubr_m (p0, z0, -1)) ++ ++/* ++** subr_m1_f64_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.d), #-1\.0(?:e\+0)? 
++** movprfx z0, z1 ++** fsubr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_f64_m_untied, svfloat64_t, ++ z0 = svsubr_n_f64_m (p0, z1, -1), ++ z0 = svsubr_m (p0, z1, -1)) ++ ++/* ++** subr_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fsubr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f64_z_tied1, svfloat64_t, ++ z0 = svsubr_f64_z (p0, z0, z1), ++ z0 = svsubr_z (p0, z0, z1)) ++ ++/* ++** subr_f64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** fsub z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f64_z_tied2, svfloat64_t, ++ z0 = svsubr_f64_z (p0, z1, z0), ++ z0 = svsubr_z (p0, z1, z0)) ++ ++/* ++** subr_f64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fsubr z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** fsub z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f64_z_untied, svfloat64_t, ++ z0 = svsubr_f64_z (p0, z1, z2), ++ z0 = svsubr_z (p0, z1, z2)) ++ ++/* ++** subr_d4_f64_z_tied1: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0\.d, p0/z, z0\.d ++** fsubr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (subr_d4_f64_z_tied1, svfloat64_t, double, ++ z0 = svsubr_n_f64_z (p0, z0, d4), ++ z0 = svsubr_z (p0, z0, d4)) ++ ++/* ++** subr_d4_f64_z_untied: ++** mov (z[0-9]+\.d), d4 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fsubr z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** fsub z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (subr_d4_f64_z_untied, svfloat64_t, double, ++ z0 = svsubr_n_f64_z (p0, z1, d4), ++ z0 = svsubr_z (p0, z1, d4)) ++ ++/* ++** subr_1_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fsubr z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_f64_z_tied1, svfloat64_t, ++ z0 = svsubr_n_f64_z (p0, z0, 1), ++ z0 = svsubr_z (p0, z0, 1)) ++ ++/* ++** subr_1_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** fsubr z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_f64_z_untied, svfloat64_t, ++ z0 = svsubr_n_f64_z (p0, z1, 1), ++ z0 = svsubr_z (p0, z1, 1)) ++ ++/* ++** subr_0p5_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fsubr z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_0p5_f64_z_tied1, svfloat64_t, ++ z0 = svsubr_n_f64_z (p0, z0, 0.5), ++ z0 = svsubr_z (p0, z0, 0.5)) ++ ++/* ++** subr_0p5_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** fsubr z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_0p5_f64_z_untied, svfloat64_t, ++ z0 = svsubr_n_f64_z (p0, z1, 0.5), ++ z0 = svsubr_z (p0, z1, 0.5)) ++ ++/* ++** subr_m1_f64_z_tied1: ++** fmov (z[0-9]+\.d), #-1\.0(?:e\+0)? ++** movprfx z0\.d, p0/z, z0\.d ++** fsubr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_f64_z_tied1, svfloat64_t, ++ z0 = svsubr_n_f64_z (p0, z0, -1), ++ z0 = svsubr_z (p0, z0, -1)) ++ ++/* ++** subr_m1_f64_z_untied: ++** fmov (z[0-9]+\.d), #-1\.0(?:e\+0)? 
++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fsubr z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** fsub z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_f64_z_untied, svfloat64_t, ++ z0 = svsubr_n_f64_z (p0, z1, -1), ++ z0 = svsubr_z (p0, z1, -1)) ++ ++/* ++** subr_f64_x_tied1: ++** fsubr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f64_x_tied1, svfloat64_t, ++ z0 = svsubr_f64_x (p0, z0, z1), ++ z0 = svsubr_x (p0, z0, z1)) ++ ++/* ++** subr_f64_x_tied2: ++** fsub z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f64_x_tied2, svfloat64_t, ++ z0 = svsubr_f64_x (p0, z1, z0), ++ z0 = svsubr_x (p0, z1, z0)) ++ ++/* ++** subr_f64_x_untied: ++** ( ++** movprfx z0, z1 ++** fsubr z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0, z2 ++** fsub z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f64_x_untied, svfloat64_t, ++ z0 = svsubr_f64_x (p0, z1, z2), ++ z0 = svsubr_x (p0, z1, z2)) ++ ++/* ++** subr_d4_f64_x_tied1: ++** mov (z[0-9]+\.d), d4 ++** fsubr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (subr_d4_f64_x_tied1, svfloat64_t, double, ++ z0 = svsubr_n_f64_x (p0, z0, d4), ++ z0 = svsubr_x (p0, z0, d4)) ++ ++/* ++** subr_d4_f64_x_untied: { xfail *-*-* } ++** mov z0\.d, d4 ++** fsub z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZD (subr_d4_f64_x_untied, svfloat64_t, double, ++ z0 = svsubr_n_f64_x (p0, z1, d4), ++ z0 = svsubr_x (p0, z1, d4)) ++ ++/* ++** subr_1_f64_x_tied1: ++** fsubr z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_f64_x_tied1, svfloat64_t, ++ z0 = svsubr_n_f64_x (p0, z0, 1), ++ z0 = svsubr_x (p0, z0, 1)) ++ ++/* ++** subr_1_f64_x_untied: ++** movprfx z0, z1 ++** fsubr z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_f64_x_untied, svfloat64_t, ++ z0 = svsubr_n_f64_x (p0, z1, 1), ++ z0 = svsubr_x (p0, z1, 1)) ++ ++/* ++** subr_0p5_f64_x_tied1: ++** fsubr z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_0p5_f64_x_tied1, svfloat64_t, ++ z0 = svsubr_n_f64_x (p0, z0, 0.5), ++ z0 = svsubr_x (p0, z0, 0.5)) ++ ++/* ++** subr_0p5_f64_x_untied: ++** movprfx z0, z1 ++** fsubr z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_0p5_f64_x_untied, svfloat64_t, ++ z0 = svsubr_n_f64_x (p0, z1, 0.5), ++ z0 = svsubr_x (p0, z1, 0.5)) ++ ++/* ++** subr_m1_f64_x_tied1: ++** fmov (z[0-9]+\.d), #-1\.0(?:e\+0)? ++** fsubr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_f64_x_tied1, svfloat64_t, ++ z0 = svsubr_n_f64_x (p0, z0, -1), ++ z0 = svsubr_x (p0, z0, -1)) ++ ++/* ++** subr_m1_f64_x_untied: ++** fmov z0\.d, #-1\.0(?:e\+0)? ++** fsub z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_f64_x_untied, svfloat64_t, ++ z0 = svsubr_n_f64_x (p0, z1, -1), ++ z0 = svsubr_x (p0, z1, -1)) ++ ++/* ++** ptrue_subr_f64_x_tied1: ++** fsub z0\.d, z1\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_f64_x_tied1, svfloat64_t, ++ z0 = svsubr_f64_x (svptrue_b64 (), z0, z1), ++ z0 = svsubr_x (svptrue_b64 (), z0, z1)) ++ ++/* ++** ptrue_subr_f64_x_tied2: ++** fsub z0\.d, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_f64_x_tied2, svfloat64_t, ++ z0 = svsubr_f64_x (svptrue_b64 (), z1, z0), ++ z0 = svsubr_x (svptrue_b64 (), z1, z0)) ++ ++/* ++** ptrue_subr_f64_x_untied: ++** fsub z0\.d, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_f64_x_untied, svfloat64_t, ++ z0 = svsubr_f64_x (svptrue_b64 (), z1, z2), ++ z0 = svsubr_x (svptrue_b64 (), z1, z2)) ++ ++/* ++** ptrue_subr_1_f64_x_tied1: ++** ... 
++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_1_f64_x_tied1, svfloat64_t, ++ z0 = svsubr_n_f64_x (svptrue_b64 (), z0, 1), ++ z0 = svsubr_x (svptrue_b64 (), z0, 1)) ++ ++/* ++** ptrue_subr_1_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_1_f64_x_untied, svfloat64_t, ++ z0 = svsubr_n_f64_x (svptrue_b64 (), z1, 1), ++ z0 = svsubr_x (svptrue_b64 (), z1, 1)) ++ ++/* ++** ptrue_subr_0p5_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_0p5_f64_x_tied1, svfloat64_t, ++ z0 = svsubr_n_f64_x (svptrue_b64 (), z0, 0.5), ++ z0 = svsubr_x (svptrue_b64 (), z0, 0.5)) ++ ++/* ++** ptrue_subr_0p5_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_0p5_f64_x_untied, svfloat64_t, ++ z0 = svsubr_n_f64_x (svptrue_b64 (), z1, 0.5), ++ z0 = svsubr_x (svptrue_b64 (), z1, 0.5)) ++ ++/* ++** ptrue_subr_m1_f64_x_tied1: ++** fmov (z[0-9]+\.d), #-1\.0(?:e\+0)? ++** fsub z0\.d, \1, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_m1_f64_x_tied1, svfloat64_t, ++ z0 = svsubr_n_f64_x (svptrue_b64 (), z0, -1), ++ z0 = svsubr_x (svptrue_b64 (), z0, -1)) ++ ++/* ++** ptrue_subr_m1_f64_x_untied: ++** fmov (z[0-9]+\.d), #-1\.0(?:e\+0)? ++** fsub z0\.d, \1, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_m1_f64_x_untied, svfloat64_t, ++ z0 = svsubr_n_f64_x (svptrue_b64 (), z1, -1), ++ z0 = svsubr_x (svptrue_b64 (), z1, -1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_f64_notrap.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_f64_notrap.c +new file mode 100644 +index 000000000..98598dd77 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_f64_notrap.c +@@ -0,0 +1,439 @@ ++/* { dg-additional-options "-fno-trapping-math" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** subr_f64_m_tied1: ++** fsubr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f64_m_tied1, svfloat64_t, ++ z0 = svsubr_f64_m (p0, z0, z1), ++ z0 = svsubr_m (p0, z0, z1)) ++ ++/* ++** subr_f64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** fsubr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f64_m_tied2, svfloat64_t, ++ z0 = svsubr_f64_m (p0, z1, z0), ++ z0 = svsubr_m (p0, z1, z0)) ++ ++/* ++** subr_f64_m_untied: ++** movprfx z0, z1 ++** fsubr z0\.d, p0/m, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f64_m_untied, svfloat64_t, ++ z0 = svsubr_f64_m (p0, z1, z2), ++ z0 = svsubr_m (p0, z1, z2)) ++ ++/* ++** subr_d4_f64_m_tied1: ++** mov (z[0-9]+\.d), d4 ++** fsubr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (subr_d4_f64_m_tied1, svfloat64_t, double, ++ z0 = svsubr_n_f64_m (p0, z0, d4), ++ z0 = svsubr_m (p0, z0, d4)) ++ ++/* ++** subr_d4_f64_m_untied: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0, z1 ++** fsubr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (subr_d4_f64_m_untied, svfloat64_t, double, ++ z0 = svsubr_n_f64_m (p0, z1, d4), ++ z0 = svsubr_m (p0, z1, d4)) ++ ++/* ++** subr_1_f64_m_tied1: ++** fsubr z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_f64_m_tied1, svfloat64_t, ++ z0 = svsubr_n_f64_m (p0, z0, 1), ++ z0 = svsubr_m (p0, z0, 1)) ++ ++/* ++** subr_1_f64_m_untied: ++** movprfx z0, z1 ++** fsubr z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_f64_m_untied, svfloat64_t, ++ z0 = svsubr_n_f64_m (p0, z1, 1), ++ z0 = svsubr_m (p0, z1, 1)) ++ ++/* ++** 
subr_0p5_f64_m_tied1: ++** fsubr z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_0p5_f64_m_tied1, svfloat64_t, ++ z0 = svsubr_n_f64_m (p0, z0, 0.5), ++ z0 = svsubr_m (p0, z0, 0.5)) ++ ++/* ++** subr_0p5_f64_m_untied: ++** movprfx z0, z1 ++** fsubr z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_0p5_f64_m_untied, svfloat64_t, ++ z0 = svsubr_n_f64_m (p0, z1, 0.5), ++ z0 = svsubr_m (p0, z1, 0.5)) ++ ++/* ++** subr_m1_f64_m_tied1: ++** fmov (z[0-9]+\.d), #-1\.0(?:e\+0)? ++** fsubr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_f64_m_tied1, svfloat64_t, ++ z0 = svsubr_n_f64_m (p0, z0, -1), ++ z0 = svsubr_m (p0, z0, -1)) ++ ++/* ++** subr_m1_f64_m_untied: { xfail *-*-* } ++** fmov (z[0-9]+\.d), #-1\.0(?:e\+0)? ++** movprfx z0, z1 ++** fsubr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_f64_m_untied, svfloat64_t, ++ z0 = svsubr_n_f64_m (p0, z1, -1), ++ z0 = svsubr_m (p0, z1, -1)) ++ ++/* ++** subr_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fsubr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f64_z_tied1, svfloat64_t, ++ z0 = svsubr_f64_z (p0, z0, z1), ++ z0 = svsubr_z (p0, z0, z1)) ++ ++/* ++** subr_f64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** fsub z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f64_z_tied2, svfloat64_t, ++ z0 = svsubr_f64_z (p0, z1, z0), ++ z0 = svsubr_z (p0, z1, z0)) ++ ++/* ++** subr_f64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fsubr z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** fsub z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f64_z_untied, svfloat64_t, ++ z0 = svsubr_f64_z (p0, z1, z2), ++ z0 = svsubr_z (p0, z1, z2)) ++ ++/* ++** subr_d4_f64_z_tied1: ++** mov (z[0-9]+\.d), d4 ++** movprfx z0\.d, p0/z, z0\.d ++** fsubr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZD (subr_d4_f64_z_tied1, svfloat64_t, double, ++ z0 = svsubr_n_f64_z (p0, z0, d4), ++ z0 = svsubr_z (p0, z0, d4)) ++ ++/* ++** subr_d4_f64_z_untied: ++** mov (z[0-9]+\.d), d4 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fsubr z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** fsub z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZD (subr_d4_f64_z_untied, svfloat64_t, double, ++ z0 = svsubr_n_f64_z (p0, z1, d4), ++ z0 = svsubr_z (p0, z1, d4)) ++ ++/* ++** subr_1_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fsubr z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_f64_z_tied1, svfloat64_t, ++ z0 = svsubr_n_f64_z (p0, z0, 1), ++ z0 = svsubr_z (p0, z0, 1)) ++ ++/* ++** subr_1_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** fsubr z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_f64_z_untied, svfloat64_t, ++ z0 = svsubr_n_f64_z (p0, z1, 1), ++ z0 = svsubr_z (p0, z1, 1)) ++ ++/* ++** subr_0p5_f64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** fsubr z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_0p5_f64_z_tied1, svfloat64_t, ++ z0 = svsubr_n_f64_z (p0, z0, 0.5), ++ z0 = svsubr_z (p0, z0, 0.5)) ++ ++/* ++** subr_0p5_f64_z_untied: ++** movprfx z0\.d, p0/z, z1\.d ++** fsubr z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_0p5_f64_z_untied, svfloat64_t, ++ z0 = svsubr_n_f64_z (p0, z1, 0.5), ++ z0 = svsubr_z (p0, z1, 0.5)) ++ ++/* ++** subr_m1_f64_z_tied1: ++** fmov (z[0-9]+\.d), #-1\.0(?:e\+0)? 
++** movprfx z0\.d, p0/z, z0\.d ++** fsubr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_f64_z_tied1, svfloat64_t, ++ z0 = svsubr_n_f64_z (p0, z0, -1), ++ z0 = svsubr_z (p0, z0, -1)) ++ ++/* ++** subr_m1_f64_z_untied: ++** fmov (z[0-9]+\.d), #-1\.0(?:e\+0)? ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** fsubr z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** fsub z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_f64_z_untied, svfloat64_t, ++ z0 = svsubr_n_f64_z (p0, z1, -1), ++ z0 = svsubr_z (p0, z1, -1)) ++ ++/* ++** subr_f64_x_tied1: ++** fsub z0\.d, z1\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f64_x_tied1, svfloat64_t, ++ z0 = svsubr_f64_x (p0, z0, z1), ++ z0 = svsubr_x (p0, z0, z1)) ++ ++/* ++** subr_f64_x_tied2: ++** fsub z0\.d, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f64_x_tied2, svfloat64_t, ++ z0 = svsubr_f64_x (p0, z1, z0), ++ z0 = svsubr_x (p0, z1, z0)) ++ ++/* ++** subr_f64_x_untied: ++** fsub z0\.d, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (subr_f64_x_untied, svfloat64_t, ++ z0 = svsubr_f64_x (p0, z1, z2), ++ z0 = svsubr_x (p0, z1, z2)) ++ ++/* ++** subr_d4_f64_x_tied1: ++** mov (z[0-9]+\.d), d4 ++** fsub z0\.d, \1, z0\.d ++** ret ++*/ ++TEST_UNIFORM_ZD (subr_d4_f64_x_tied1, svfloat64_t, double, ++ z0 = svsubr_n_f64_x (p0, z0, d4), ++ z0 = svsubr_x (p0, z0, d4)) ++ ++/* ++** subr_d4_f64_x_untied: ++** mov (z[0-9]+\.d), d4 ++** fsub z0\.d, \1, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZD (subr_d4_f64_x_untied, svfloat64_t, double, ++ z0 = svsubr_n_f64_x (p0, z1, d4), ++ z0 = svsubr_x (p0, z1, d4)) ++ ++/* ++** subr_1_f64_x_tied1: ++** fsubr z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_f64_x_tied1, svfloat64_t, ++ z0 = svsubr_n_f64_x (p0, z0, 1), ++ z0 = svsubr_x (p0, z0, 1)) ++ ++/* ++** subr_1_f64_x_untied: ++** movprfx z0, z1 ++** fsubr z0\.d, p0/m, z0\.d, #1\.0 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_f64_x_untied, svfloat64_t, ++ z0 = svsubr_n_f64_x (p0, z1, 1), ++ z0 = svsubr_x (p0, z1, 1)) ++ ++/* ++** subr_0p5_f64_x_tied1: ++** fsubr z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_0p5_f64_x_tied1, svfloat64_t, ++ z0 = svsubr_n_f64_x (p0, z0, 0.5), ++ z0 = svsubr_x (p0, z0, 0.5)) ++ ++/* ++** subr_0p5_f64_x_untied: ++** movprfx z0, z1 ++** fsubr z0\.d, p0/m, z0\.d, #0\.5 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_0p5_f64_x_untied, svfloat64_t, ++ z0 = svsubr_n_f64_x (p0, z1, 0.5), ++ z0 = svsubr_x (p0, z1, 0.5)) ++ ++/* ++** subr_m1_f64_x_tied1: ++** fmov (z[0-9]+\.d), #-1\.0(?:e\+0)? ++** fsub z0\.d, \1, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_f64_x_tied1, svfloat64_t, ++ z0 = svsubr_n_f64_x (p0, z0, -1), ++ z0 = svsubr_x (p0, z0, -1)) ++ ++/* ++** subr_m1_f64_x_untied: ++** fmov (z[0-9]+\.d), #-1\.0(?:e\+0)? 
++** fsub z0\.d, \1, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_f64_x_untied, svfloat64_t, ++ z0 = svsubr_n_f64_x (p0, z1, -1), ++ z0 = svsubr_x (p0, z1, -1)) ++ ++/* ++** ptrue_subr_f64_x_tied1: ++** fsub z0\.d, z1\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_f64_x_tied1, svfloat64_t, ++ z0 = svsubr_f64_x (svptrue_b64 (), z0, z1), ++ z0 = svsubr_x (svptrue_b64 (), z0, z1)) ++ ++/* ++** ptrue_subr_f64_x_tied2: ++** fsub z0\.d, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_f64_x_tied2, svfloat64_t, ++ z0 = svsubr_f64_x (svptrue_b64 (), z1, z0), ++ z0 = svsubr_x (svptrue_b64 (), z1, z0)) ++ ++/* ++** ptrue_subr_f64_x_untied: ++** fsub z0\.d, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_f64_x_untied, svfloat64_t, ++ z0 = svsubr_f64_x (svptrue_b64 (), z1, z2), ++ z0 = svsubr_x (svptrue_b64 (), z1, z2)) ++ ++/* ++** ptrue_subr_1_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_1_f64_x_tied1, svfloat64_t, ++ z0 = svsubr_n_f64_x (svptrue_b64 (), z0, 1), ++ z0 = svsubr_x (svptrue_b64 (), z0, 1)) ++ ++/* ++** ptrue_subr_1_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_1_f64_x_untied, svfloat64_t, ++ z0 = svsubr_n_f64_x (svptrue_b64 (), z1, 1), ++ z0 = svsubr_x (svptrue_b64 (), z1, 1)) ++ ++/* ++** ptrue_subr_0p5_f64_x_tied1: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_0p5_f64_x_tied1, svfloat64_t, ++ z0 = svsubr_n_f64_x (svptrue_b64 (), z0, 0.5), ++ z0 = svsubr_x (svptrue_b64 (), z0, 0.5)) ++ ++/* ++** ptrue_subr_0p5_f64_x_untied: ++** ... ++** ptrue p[0-9]+\.b[^\n]* ++** ... ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_0p5_f64_x_untied, svfloat64_t, ++ z0 = svsubr_n_f64_x (svptrue_b64 (), z1, 0.5), ++ z0 = svsubr_x (svptrue_b64 (), z1, 0.5)) ++ ++/* ++** ptrue_subr_m1_f64_x_tied1: ++** fmov (z[0-9]+\.d), #-1\.0(?:e\+0)? ++** fsub z0\.d, \1, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_m1_f64_x_tied1, svfloat64_t, ++ z0 = svsubr_n_f64_x (svptrue_b64 (), z0, -1), ++ z0 = svsubr_x (svptrue_b64 (), z0, -1)) ++ ++/* ++** ptrue_subr_m1_f64_x_untied: ++** fmov (z[0-9]+\.d), #-1\.0(?:e\+0)? 
++** fsub z0\.d, \1, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (ptrue_subr_m1_f64_x_untied, svfloat64_t, ++ z0 = svsubr_n_f64_x (svptrue_b64 (), z1, -1), ++ z0 = svsubr_x (svptrue_b64 (), z1, -1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_s16.c +new file mode 100644 +index 000000000..d3dad62da +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_s16.c +@@ -0,0 +1,324 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** subr_s16_m_tied1: ++** subr z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (subr_s16_m_tied1, svint16_t, ++ z0 = svsubr_s16_m (p0, z0, z1), ++ z0 = svsubr_m (p0, z0, z1)) ++ ++/* ++** subr_s16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** subr z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (subr_s16_m_tied2, svint16_t, ++ z0 = svsubr_s16_m (p0, z1, z0), ++ z0 = svsubr_m (p0, z1, z0)) ++ ++/* ++** subr_s16_m_untied: ++** movprfx z0, z1 ++** subr z0\.h, p0/m, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (subr_s16_m_untied, svint16_t, ++ z0 = svsubr_s16_m (p0, z1, z2), ++ z0 = svsubr_m (p0, z1, z2)) ++ ++/* ++** subr_w0_s16_m_tied1: ++** mov (z[0-9]+\.h), w0 ++** subr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (subr_w0_s16_m_tied1, svint16_t, int16_t, ++ z0 = svsubr_n_s16_m (p0, z0, x0), ++ z0 = svsubr_m (p0, z0, x0)) ++ ++/* ++** subr_w0_s16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), w0 ++** movprfx z0, z1 ++** subr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (subr_w0_s16_m_untied, svint16_t, int16_t, ++ z0 = svsubr_n_s16_m (p0, z1, x0), ++ z0 = svsubr_m (p0, z1, x0)) ++ ++/* ++** subr_1_s16_m_tied1: ++** mov (z[0-9]+\.h), #1 ++** subr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_s16_m_tied1, svint16_t, ++ z0 = svsubr_n_s16_m (p0, z0, 1), ++ z0 = svsubr_m (p0, z0, 1)) ++ ++/* ++** subr_1_s16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), #1 ++** movprfx z0, z1 ++** subr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_s16_m_untied, svint16_t, ++ z0 = svsubr_n_s16_m (p0, z1, 1), ++ z0 = svsubr_m (p0, z1, 1)) ++ ++/* ++** subr_m2_s16_m: ++** mov (z[0-9]+\.h), #-2 ++** subr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m2_s16_m, svint16_t, ++ z0 = svsubr_n_s16_m (p0, z0, -2), ++ z0 = svsubr_m (p0, z0, -2)) ++ ++/* ++** subr_s16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** subr z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (subr_s16_z_tied1, svint16_t, ++ z0 = svsubr_s16_z (p0, z0, z1), ++ z0 = svsubr_z (p0, z0, z1)) ++ ++/* ++** subr_s16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** sub z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (subr_s16_z_tied2, svint16_t, ++ z0 = svsubr_s16_z (p0, z1, z0), ++ z0 = svsubr_z (p0, z1, z0)) ++ ++/* ++** subr_s16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** subr z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** sub z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (subr_s16_z_untied, svint16_t, ++ z0 = svsubr_s16_z (p0, z1, z2), ++ z0 = svsubr_z (p0, z1, z2)) ++ ++/* ++** subr_w0_s16_z_tied1: ++** mov (z[0-9]+\.h), w0 ++** movprfx z0\.h, p0/z, z0\.h ++** subr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (subr_w0_s16_z_tied1, svint16_t, int16_t, ++ z0 = svsubr_n_s16_z (p0, z0, x0), ++ z0 = svsubr_z (p0, z0, x0)) ++ ++/* ++** subr_w0_s16_z_untied: ++** mov (z[0-9]+\.h), w0 ++** ( ++** movprfx z0\.h, p0/z, 
z1\.h ++** subr z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** sub z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (subr_w0_s16_z_untied, svint16_t, int16_t, ++ z0 = svsubr_n_s16_z (p0, z1, x0), ++ z0 = svsubr_z (p0, z1, x0)) ++ ++/* ++** subr_1_s16_z_tied1: ++** mov (z[0-9]+\.h), #1 ++** movprfx z0\.h, p0/z, z0\.h ++** subr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_s16_z_tied1, svint16_t, ++ z0 = svsubr_n_s16_z (p0, z0, 1), ++ z0 = svsubr_z (p0, z0, 1)) ++ ++/* ++** subr_1_s16_z_untied: ++** mov (z[0-9]+\.h), #1 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** subr z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** sub z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_s16_z_untied, svint16_t, ++ z0 = svsubr_n_s16_z (p0, z1, 1), ++ z0 = svsubr_z (p0, z1, 1)) ++ ++/* ++** subr_s16_x_tied1: ++** sub z0\.h, z1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (subr_s16_x_tied1, svint16_t, ++ z0 = svsubr_s16_x (p0, z0, z1), ++ z0 = svsubr_x (p0, z0, z1)) ++ ++/* ++** subr_s16_x_tied2: ++** sub z0\.h, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (subr_s16_x_tied2, svint16_t, ++ z0 = svsubr_s16_x (p0, z1, z0), ++ z0 = svsubr_x (p0, z1, z0)) ++ ++/* ++** subr_s16_x_untied: ++** sub z0\.h, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (subr_s16_x_untied, svint16_t, ++ z0 = svsubr_s16_x (p0, z1, z2), ++ z0 = svsubr_x (p0, z1, z2)) ++ ++/* ++** subr_w0_s16_x_tied1: ++** mov (z[0-9]+\.h), w0 ++** sub z0\.h, \1, z0\.h ++** ret ++*/ ++TEST_UNIFORM_ZX (subr_w0_s16_x_tied1, svint16_t, int16_t, ++ z0 = svsubr_n_s16_x (p0, z0, x0), ++ z0 = svsubr_x (p0, z0, x0)) ++ ++/* ++** subr_w0_s16_x_untied: ++** mov (z[0-9]+\.h), w0 ++** sub z0\.h, \1, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZX (subr_w0_s16_x_untied, svint16_t, int16_t, ++ z0 = svsubr_n_s16_x (p0, z1, x0), ++ z0 = svsubr_x (p0, z1, x0)) ++ ++/* ++** subr_1_s16_x_tied1: ++** subr z0\.h, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_s16_x_tied1, svint16_t, ++ z0 = svsubr_n_s16_x (p0, z0, 1), ++ z0 = svsubr_x (p0, z0, 1)) ++ ++/* ++** subr_1_s16_x_untied: ++** movprfx z0, z1 ++** subr z0\.h, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_s16_x_untied, svint16_t, ++ z0 = svsubr_n_s16_x (p0, z1, 1), ++ z0 = svsubr_x (p0, z1, 1)) ++ ++/* ++** subr_127_s16_x: ++** subr z0\.h, z0\.h, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_127_s16_x, svint16_t, ++ z0 = svsubr_n_s16_x (p0, z0, 127), ++ z0 = svsubr_x (p0, z0, 127)) ++ ++/* ++** subr_128_s16_x: ++** subr z0\.h, z0\.h, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_128_s16_x, svint16_t, ++ z0 = svsubr_n_s16_x (p0, z0, 128), ++ z0 = svsubr_x (p0, z0, 128)) ++ ++/* ++** subr_255_s16_x: ++** subr z0\.h, z0\.h, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_255_s16_x, svint16_t, ++ z0 = svsubr_n_s16_x (p0, z0, 255), ++ z0 = svsubr_x (p0, z0, 255)) ++ ++/* ++** subr_256_s16_x: ++** subr z0\.h, z0\.h, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_256_s16_x, svint16_t, ++ z0 = svsubr_n_s16_x (p0, z0, 256), ++ z0 = svsubr_x (p0, z0, 256)) ++ ++/* ++** subr_257_s16_x: ++** mov (z[0-9]+)\.b, #1 ++** sub z0\.h, \1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (subr_257_s16_x, svint16_t, ++ z0 = svsubr_n_s16_x (p0, z0, 257), ++ z0 = svsubr_x (p0, z0, 257)) ++ ++/* ++** subr_512_s16_x: ++** subr z0\.h, z0\.h, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_512_s16_x, svint16_t, ++ z0 = svsubr_n_s16_x (p0, z0, 512), ++ z0 = svsubr_x (p0, z0, 512)) ++ ++/* ++** subr_65280_s16_x: ++** subr z0\.h, z0\.h, #65280 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_65280_s16_x, svint16_t, ++ z0 = 
svsubr_n_s16_x (p0, z0, 0xff00), ++ z0 = svsubr_x (p0, z0, 0xff00)) ++ ++/* ++** subr_m1_s16_x_tied1: ++** mov (z[0-9]+)\.b, #-1 ++** sub z0\.h, \1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_s16_x_tied1, svint16_t, ++ z0 = svsubr_n_s16_x (p0, z0, -1), ++ z0 = svsubr_x (p0, z0, -1)) ++ ++/* ++** subr_m1_s16_x_untied: ++** mov (z[0-9]+)\.b, #-1 ++** sub z0\.h, \1\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_s16_x_untied, svint16_t, ++ z0 = svsubr_n_s16_x (p0, z1, -1), ++ z0 = svsubr_x (p0, z1, -1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_s32.c +new file mode 100644 +index 000000000..ce62e2f21 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_s32.c +@@ -0,0 +1,344 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** subr_s32_m_tied1: ++** subr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (subr_s32_m_tied1, svint32_t, ++ z0 = svsubr_s32_m (p0, z0, z1), ++ z0 = svsubr_m (p0, z0, z1)) ++ ++/* ++** subr_s32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** subr z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (subr_s32_m_tied2, svint32_t, ++ z0 = svsubr_s32_m (p0, z1, z0), ++ z0 = svsubr_m (p0, z1, z0)) ++ ++/* ++** subr_s32_m_untied: ++** movprfx z0, z1 ++** subr z0\.s, p0/m, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (subr_s32_m_untied, svint32_t, ++ z0 = svsubr_s32_m (p0, z1, z2), ++ z0 = svsubr_m (p0, z1, z2)) ++ ++/* ++** subr_w0_s32_m_tied1: ++** mov (z[0-9]+\.s), w0 ++** subr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (subr_w0_s32_m_tied1, svint32_t, int32_t, ++ z0 = svsubr_n_s32_m (p0, z0, x0), ++ z0 = svsubr_m (p0, z0, x0)) ++ ++/* ++** subr_w0_s32_m_untied: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0, z1 ++** subr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (subr_w0_s32_m_untied, svint32_t, int32_t, ++ z0 = svsubr_n_s32_m (p0, z1, x0), ++ z0 = svsubr_m (p0, z1, x0)) ++ ++/* ++** subr_1_s32_m_tied1: ++** mov (z[0-9]+\.s), #1 ++** subr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_s32_m_tied1, svint32_t, ++ z0 = svsubr_n_s32_m (p0, z0, 1), ++ z0 = svsubr_m (p0, z0, 1)) ++ ++/* ++** subr_1_s32_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.s), #1 ++** movprfx z0, z1 ++** subr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_s32_m_untied, svint32_t, ++ z0 = svsubr_n_s32_m (p0, z1, 1), ++ z0 = svsubr_m (p0, z1, 1)) ++ ++/* ++** subr_m2_s32_m: ++** mov (z[0-9]+\.s), #-2 ++** subr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m2_s32_m, svint32_t, ++ z0 = svsubr_n_s32_m (p0, z0, -2), ++ z0 = svsubr_m (p0, z0, -2)) ++ ++/* ++** subr_s32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** subr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (subr_s32_z_tied1, svint32_t, ++ z0 = svsubr_s32_z (p0, z0, z1), ++ z0 = svsubr_z (p0, z0, z1)) ++ ++/* ++** subr_s32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** sub z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (subr_s32_z_tied2, svint32_t, ++ z0 = svsubr_s32_z (p0, z1, z0), ++ z0 = svsubr_z (p0, z1, z0)) ++ ++/* ++** subr_s32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** subr z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** sub z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (subr_s32_z_untied, svint32_t, ++ z0 = svsubr_s32_z (p0, z1, z2), ++ z0 = svsubr_z (p0, z1, z2)) ++ ++/* ++** subr_w0_s32_z_tied1: ++** mov 
(z[0-9]+\.s), w0 ++** movprfx z0\.s, p0/z, z0\.s ++** subr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (subr_w0_s32_z_tied1, svint32_t, int32_t, ++ z0 = svsubr_n_s32_z (p0, z0, x0), ++ z0 = svsubr_z (p0, z0, x0)) ++ ++/* ++** subr_w0_s32_z_untied: ++** mov (z[0-9]+\.s), w0 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** subr z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** sub z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (subr_w0_s32_z_untied, svint32_t, int32_t, ++ z0 = svsubr_n_s32_z (p0, z1, x0), ++ z0 = svsubr_z (p0, z1, x0)) ++ ++/* ++** subr_1_s32_z_tied1: ++** mov (z[0-9]+\.s), #1 ++** movprfx z0\.s, p0/z, z0\.s ++** subr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_s32_z_tied1, svint32_t, ++ z0 = svsubr_n_s32_z (p0, z0, 1), ++ z0 = svsubr_z (p0, z0, 1)) ++ ++/* ++** subr_1_s32_z_untied: ++** mov (z[0-9]+\.s), #1 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** subr z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** sub z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_s32_z_untied, svint32_t, ++ z0 = svsubr_n_s32_z (p0, z1, 1), ++ z0 = svsubr_z (p0, z1, 1)) ++ ++/* ++** subr_s32_x_tied1: ++** sub z0\.s, z1\.s, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (subr_s32_x_tied1, svint32_t, ++ z0 = svsubr_s32_x (p0, z0, z1), ++ z0 = svsubr_x (p0, z0, z1)) ++ ++/* ++** subr_s32_x_tied2: ++** sub z0\.s, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (subr_s32_x_tied2, svint32_t, ++ z0 = svsubr_s32_x (p0, z1, z0), ++ z0 = svsubr_x (p0, z1, z0)) ++ ++/* ++** subr_s32_x_untied: ++** sub z0\.s, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (subr_s32_x_untied, svint32_t, ++ z0 = svsubr_s32_x (p0, z1, z2), ++ z0 = svsubr_x (p0, z1, z2)) ++ ++/* ++** subr_w0_s32_x_tied1: ++** mov (z[0-9]+\.s), w0 ++** sub z0\.s, \1, z0\.s ++** ret ++*/ ++TEST_UNIFORM_ZX (subr_w0_s32_x_tied1, svint32_t, int32_t, ++ z0 = svsubr_n_s32_x (p0, z0, x0), ++ z0 = svsubr_x (p0, z0, x0)) ++ ++/* ++** subr_w0_s32_x_untied: ++** mov (z[0-9]+\.s), w0 ++** sub z0\.s, \1, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZX (subr_w0_s32_x_untied, svint32_t, int32_t, ++ z0 = svsubr_n_s32_x (p0, z1, x0), ++ z0 = svsubr_x (p0, z1, x0)) ++ ++/* ++** subr_1_s32_x_tied1: ++** subr z0\.s, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_s32_x_tied1, svint32_t, ++ z0 = svsubr_n_s32_x (p0, z0, 1), ++ z0 = svsubr_x (p0, z0, 1)) ++ ++/* ++** subr_1_s32_x_untied: ++** movprfx z0, z1 ++** subr z0\.s, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_s32_x_untied, svint32_t, ++ z0 = svsubr_n_s32_x (p0, z1, 1), ++ z0 = svsubr_x (p0, z1, 1)) ++ ++/* ++** subr_127_s32_x: ++** subr z0\.s, z0\.s, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_127_s32_x, svint32_t, ++ z0 = svsubr_n_s32_x (p0, z0, 127), ++ z0 = svsubr_x (p0, z0, 127)) ++ ++/* ++** subr_128_s32_x: ++** subr z0\.s, z0\.s, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_128_s32_x, svint32_t, ++ z0 = svsubr_n_s32_x (p0, z0, 128), ++ z0 = svsubr_x (p0, z0, 128)) ++ ++/* ++** subr_255_s32_x: ++** subr z0\.s, z0\.s, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_255_s32_x, svint32_t, ++ z0 = svsubr_n_s32_x (p0, z0, 255), ++ z0 = svsubr_x (p0, z0, 255)) ++ ++/* ++** subr_256_s32_x: ++** subr z0\.s, z0\.s, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_256_s32_x, svint32_t, ++ z0 = svsubr_n_s32_x (p0, z0, 256), ++ z0 = svsubr_x (p0, z0, 256)) ++ ++/* ++** subr_511_s32_x: ++** mov (z[0-9]+\.s), #511 ++** sub z0\.s, \1, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (subr_511_s32_x, svint32_t, ++ z0 = svsubr_n_s32_x (p0, z0, 511), ++ z0 = svsubr_x (p0, z0, 511)) ++ 
++/* ++** subr_512_s32_x: ++** subr z0\.s, z0\.s, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_512_s32_x, svint32_t, ++ z0 = svsubr_n_s32_x (p0, z0, 512), ++ z0 = svsubr_x (p0, z0, 512)) ++ ++/* ++** subr_65280_s32_x: ++** subr z0\.s, z0\.s, #65280 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_65280_s32_x, svint32_t, ++ z0 = svsubr_n_s32_x (p0, z0, 0xff00), ++ z0 = svsubr_x (p0, z0, 0xff00)) ++ ++/* ++** subr_65535_s32_x: ++** mov (z[0-9]+\.s), #65535 ++** sub z0\.s, \1, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (subr_65535_s32_x, svint32_t, ++ z0 = svsubr_n_s32_x (p0, z0, 65535), ++ z0 = svsubr_x (p0, z0, 65535)) ++ ++/* ++** subr_65536_s32_x: ++** mov (z[0-9]+\.s), #65536 ++** sub z0\.s, \1, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (subr_65536_s32_x, svint32_t, ++ z0 = svsubr_n_s32_x (p0, z0, 65536), ++ z0 = svsubr_x (p0, z0, 65536)) ++ ++/* ++** subr_m1_s32_x_tied1: ++** mov (z[0-9]+)\.b, #-1 ++** sub z0\.s, \1\.s, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_s32_x_tied1, svint32_t, ++ z0 = svsubr_n_s32_x (p0, z0, -1), ++ z0 = svsubr_x (p0, z0, -1)) ++ ++/* ++** subr_m1_s32_x_untied: ++** mov (z[0-9]+)\.b, #-1 ++** sub z0\.s, \1\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_s32_x_untied, svint32_t, ++ z0 = svsubr_n_s32_x (p0, z1, -1), ++ z0 = svsubr_x (p0, z1, -1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_s64.c +new file mode 100644 +index 000000000..ada9e977c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_s64.c +@@ -0,0 +1,344 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** subr_s64_m_tied1: ++** subr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (subr_s64_m_tied1, svint64_t, ++ z0 = svsubr_s64_m (p0, z0, z1), ++ z0 = svsubr_m (p0, z0, z1)) ++ ++/* ++** subr_s64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** subr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_s64_m_tied2, svint64_t, ++ z0 = svsubr_s64_m (p0, z1, z0), ++ z0 = svsubr_m (p0, z1, z0)) ++ ++/* ++** subr_s64_m_untied: ++** movprfx z0, z1 ++** subr z0\.d, p0/m, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (subr_s64_m_untied, svint64_t, ++ z0 = svsubr_s64_m (p0, z1, z2), ++ z0 = svsubr_m (p0, z1, z2)) ++ ++/* ++** subr_x0_s64_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** subr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (subr_x0_s64_m_tied1, svint64_t, int64_t, ++ z0 = svsubr_n_s64_m (p0, z0, x0), ++ z0 = svsubr_m (p0, z0, x0)) ++ ++/* ++** subr_x0_s64_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** subr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (subr_x0_s64_m_untied, svint64_t, int64_t, ++ z0 = svsubr_n_s64_m (p0, z1, x0), ++ z0 = svsubr_m (p0, z1, x0)) ++ ++/* ++** subr_1_s64_m_tied1: ++** mov (z[0-9]+\.d), #1 ++** subr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_s64_m_tied1, svint64_t, ++ z0 = svsubr_n_s64_m (p0, z0, 1), ++ z0 = svsubr_m (p0, z0, 1)) ++ ++/* ++** subr_1_s64_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #1 ++** movprfx z0, z1 ++** subr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_s64_m_untied, svint64_t, ++ z0 = svsubr_n_s64_m (p0, z1, 1), ++ z0 = svsubr_m (p0, z1, 1)) ++ ++/* ++** subr_m2_s64_m: ++** mov (z[0-9]+\.d), #-2 ++** subr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m2_s64_m, svint64_t, ++ z0 = svsubr_n_s64_m (p0, z0, -2), ++ z0 = svsubr_m (p0, z0, -2)) ++ ++/* ++** subr_s64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d 
++** subr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (subr_s64_z_tied1, svint64_t, ++ z0 = svsubr_s64_z (p0, z0, z1), ++ z0 = svsubr_z (p0, z0, z1)) ++ ++/* ++** subr_s64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** sub z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (subr_s64_z_tied2, svint64_t, ++ z0 = svsubr_s64_z (p0, z1, z0), ++ z0 = svsubr_z (p0, z1, z0)) ++ ++/* ++** subr_s64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** subr z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** sub z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (subr_s64_z_untied, svint64_t, ++ z0 = svsubr_s64_z (p0, z1, z2), ++ z0 = svsubr_z (p0, z1, z2)) ++ ++/* ++** subr_x0_s64_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z0\.d ++** subr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (subr_x0_s64_z_tied1, svint64_t, int64_t, ++ z0 = svsubr_n_s64_z (p0, z0, x0), ++ z0 = svsubr_z (p0, z0, x0)) ++ ++/* ++** subr_x0_s64_z_untied: ++** mov (z[0-9]+\.d), x0 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** subr z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** sub z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (subr_x0_s64_z_untied, svint64_t, int64_t, ++ z0 = svsubr_n_s64_z (p0, z1, x0), ++ z0 = svsubr_z (p0, z1, x0)) ++ ++/* ++** subr_1_s64_z_tied1: ++** mov (z[0-9]+\.d), #1 ++** movprfx z0\.d, p0/z, z0\.d ++** subr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_s64_z_tied1, svint64_t, ++ z0 = svsubr_n_s64_z (p0, z0, 1), ++ z0 = svsubr_z (p0, z0, 1)) ++ ++/* ++** subr_1_s64_z_untied: ++** mov (z[0-9]+\.d), #1 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** subr z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** sub z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_s64_z_untied, svint64_t, ++ z0 = svsubr_n_s64_z (p0, z1, 1), ++ z0 = svsubr_z (p0, z1, 1)) ++ ++/* ++** subr_s64_x_tied1: ++** sub z0\.d, z1\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (subr_s64_x_tied1, svint64_t, ++ z0 = svsubr_s64_x (p0, z0, z1), ++ z0 = svsubr_x (p0, z0, z1)) ++ ++/* ++** subr_s64_x_tied2: ++** sub z0\.d, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (subr_s64_x_tied2, svint64_t, ++ z0 = svsubr_s64_x (p0, z1, z0), ++ z0 = svsubr_x (p0, z1, z0)) ++ ++/* ++** subr_s64_x_untied: ++** sub z0\.d, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (subr_s64_x_untied, svint64_t, ++ z0 = svsubr_s64_x (p0, z1, z2), ++ z0 = svsubr_x (p0, z1, z2)) ++ ++/* ++** subr_x0_s64_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** sub z0\.d, \1, z0\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (subr_x0_s64_x_tied1, svint64_t, int64_t, ++ z0 = svsubr_n_s64_x (p0, z0, x0), ++ z0 = svsubr_x (p0, z0, x0)) ++ ++/* ++** subr_x0_s64_x_untied: ++** mov (z[0-9]+\.d), x0 ++** sub z0\.d, \1, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (subr_x0_s64_x_untied, svint64_t, int64_t, ++ z0 = svsubr_n_s64_x (p0, z1, x0), ++ z0 = svsubr_x (p0, z1, x0)) ++ ++/* ++** subr_1_s64_x_tied1: ++** subr z0\.d, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_s64_x_tied1, svint64_t, ++ z0 = svsubr_n_s64_x (p0, z0, 1), ++ z0 = svsubr_x (p0, z0, 1)) ++ ++/* ++** subr_1_s64_x_untied: ++** movprfx z0, z1 ++** subr z0\.d, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_s64_x_untied, svint64_t, ++ z0 = svsubr_n_s64_x (p0, z1, 1), ++ z0 = svsubr_x (p0, z1, 1)) ++ ++/* ++** subr_127_s64_x: ++** subr z0\.d, z0\.d, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_127_s64_x, svint64_t, ++ z0 = svsubr_n_s64_x (p0, z0, 127), ++ z0 = svsubr_x (p0, z0, 127)) ++ ++/* ++** subr_128_s64_x: 
++** subr z0\.d, z0\.d, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_128_s64_x, svint64_t, ++ z0 = svsubr_n_s64_x (p0, z0, 128), ++ z0 = svsubr_x (p0, z0, 128)) ++ ++/* ++** subr_255_s64_x: ++** subr z0\.d, z0\.d, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_255_s64_x, svint64_t, ++ z0 = svsubr_n_s64_x (p0, z0, 255), ++ z0 = svsubr_x (p0, z0, 255)) ++ ++/* ++** subr_256_s64_x: ++** subr z0\.d, z0\.d, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_256_s64_x, svint64_t, ++ z0 = svsubr_n_s64_x (p0, z0, 256), ++ z0 = svsubr_x (p0, z0, 256)) ++ ++/* ++** subr_511_s64_x: ++** mov (z[0-9]+\.d), #511 ++** sub z0\.d, \1, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (subr_511_s64_x, svint64_t, ++ z0 = svsubr_n_s64_x (p0, z0, 511), ++ z0 = svsubr_x (p0, z0, 511)) ++ ++/* ++** subr_512_s64_x: ++** subr z0\.d, z0\.d, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_512_s64_x, svint64_t, ++ z0 = svsubr_n_s64_x (p0, z0, 512), ++ z0 = svsubr_x (p0, z0, 512)) ++ ++/* ++** subr_65280_s64_x: ++** subr z0\.d, z0\.d, #65280 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_65280_s64_x, svint64_t, ++ z0 = svsubr_n_s64_x (p0, z0, 0xff00), ++ z0 = svsubr_x (p0, z0, 0xff00)) ++ ++/* ++** subr_65535_s64_x: ++** mov (z[0-9]+\.d), #65535 ++** sub z0\.d, \1, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (subr_65535_s64_x, svint64_t, ++ z0 = svsubr_n_s64_x (p0, z0, 65535), ++ z0 = svsubr_x (p0, z0, 65535)) ++ ++/* ++** subr_65536_s64_x: ++** mov (z[0-9]+\.d), #65536 ++** sub z0\.d, \1, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (subr_65536_s64_x, svint64_t, ++ z0 = svsubr_n_s64_x (p0, z0, 65536), ++ z0 = svsubr_x (p0, z0, 65536)) ++ ++/* ++** subr_m1_s64_x_tied1: ++** mov (z[0-9]+)\.b, #-1 ++** sub z0\.d, \1\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_s64_x_tied1, svint64_t, ++ z0 = svsubr_n_s64_x (p0, z0, -1), ++ z0 = svsubr_x (p0, z0, -1)) ++ ++/* ++** subr_m1_s64_x_untied: ++** mov (z[0-9]+)\.b, #-1 ++** sub z0\.d, \1\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_s64_x_untied, svint64_t, ++ z0 = svsubr_n_s64_x (p0, z1, -1), ++ z0 = svsubr_x (p0, z1, -1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_s8.c +new file mode 100644 +index 000000000..90d2a6de9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_s8.c +@@ -0,0 +1,294 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** subr_s8_m_tied1: ++** subr z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (subr_s8_m_tied1, svint8_t, ++ z0 = svsubr_s8_m (p0, z0, z1), ++ z0 = svsubr_m (p0, z0, z1)) ++ ++/* ++** subr_s8_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** subr z0\.b, p0/m, z0\.b, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (subr_s8_m_tied2, svint8_t, ++ z0 = svsubr_s8_m (p0, z1, z0), ++ z0 = svsubr_m (p0, z1, z0)) ++ ++/* ++** subr_s8_m_untied: ++** movprfx z0, z1 ++** subr z0\.b, p0/m, z0\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (subr_s8_m_untied, svint8_t, ++ z0 = svsubr_s8_m (p0, z1, z2), ++ z0 = svsubr_m (p0, z1, z2)) ++ ++/* ++** subr_w0_s8_m_tied1: ++** mov (z[0-9]+\.b), w0 ++** subr z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (subr_w0_s8_m_tied1, svint8_t, int8_t, ++ z0 = svsubr_n_s8_m (p0, z0, x0), ++ z0 = svsubr_m (p0, z0, x0)) ++ ++/* ++** subr_w0_s8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), w0 ++** movprfx z0, z1 ++** subr z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (subr_w0_s8_m_untied, svint8_t, int8_t, ++ z0 = svsubr_n_s8_m (p0, z1, x0), ++ z0 = svsubr_m (p0, z1, x0)) ++ ++/* ++** 
subr_1_s8_m_tied1: ++** mov (z[0-9]+\.b), #1 ++** subr z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_s8_m_tied1, svint8_t, ++ z0 = svsubr_n_s8_m (p0, z0, 1), ++ z0 = svsubr_m (p0, z0, 1)) ++ ++/* ++** subr_1_s8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), #1 ++** movprfx z0, z1 ++** subr z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_s8_m_untied, svint8_t, ++ z0 = svsubr_n_s8_m (p0, z1, 1), ++ z0 = svsubr_m (p0, z1, 1)) ++ ++/* ++** subr_m1_s8_m: ++** mov (z[0-9]+\.b), #-1 ++** subr z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_s8_m, svint8_t, ++ z0 = svsubr_n_s8_m (p0, z0, -1), ++ z0 = svsubr_m (p0, z0, -1)) ++ ++/* ++** subr_s8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** subr z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (subr_s8_z_tied1, svint8_t, ++ z0 = svsubr_s8_z (p0, z0, z1), ++ z0 = svsubr_z (p0, z0, z1)) ++ ++/* ++** subr_s8_z_tied2: ++** movprfx z0\.b, p0/z, z0\.b ++** sub z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (subr_s8_z_tied2, svint8_t, ++ z0 = svsubr_s8_z (p0, z1, z0), ++ z0 = svsubr_z (p0, z1, z0)) ++ ++/* ++** subr_s8_z_untied: ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** subr z0\.b, p0/m, z0\.b, z2\.b ++** | ++** movprfx z0\.b, p0/z, z2\.b ++** sub z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (subr_s8_z_untied, svint8_t, ++ z0 = svsubr_s8_z (p0, z1, z2), ++ z0 = svsubr_z (p0, z1, z2)) ++ ++/* ++** subr_w0_s8_z_tied1: ++** mov (z[0-9]+\.b), w0 ++** movprfx z0\.b, p0/z, z0\.b ++** subr z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (subr_w0_s8_z_tied1, svint8_t, int8_t, ++ z0 = svsubr_n_s8_z (p0, z0, x0), ++ z0 = svsubr_z (p0, z0, x0)) ++ ++/* ++** subr_w0_s8_z_untied: ++** mov (z[0-9]+\.b), w0 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** subr z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** sub z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (subr_w0_s8_z_untied, svint8_t, int8_t, ++ z0 = svsubr_n_s8_z (p0, z1, x0), ++ z0 = svsubr_z (p0, z1, x0)) ++ ++/* ++** subr_1_s8_z_tied1: ++** mov (z[0-9]+\.b), #1 ++** movprfx z0\.b, p0/z, z0\.b ++** subr z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_s8_z_tied1, svint8_t, ++ z0 = svsubr_n_s8_z (p0, z0, 1), ++ z0 = svsubr_z (p0, z0, 1)) ++ ++/* ++** subr_1_s8_z_untied: ++** mov (z[0-9]+\.b), #1 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** subr z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** sub z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_s8_z_untied, svint8_t, ++ z0 = svsubr_n_s8_z (p0, z1, 1), ++ z0 = svsubr_z (p0, z1, 1)) ++ ++/* ++** subr_s8_x_tied1: ++** sub z0\.b, z1\.b, z0\.b ++** ret ++*/ ++TEST_UNIFORM_Z (subr_s8_x_tied1, svint8_t, ++ z0 = svsubr_s8_x (p0, z0, z1), ++ z0 = svsubr_x (p0, z0, z1)) ++ ++/* ++** subr_s8_x_tied2: ++** sub z0\.b, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (subr_s8_x_tied2, svint8_t, ++ z0 = svsubr_s8_x (p0, z1, z0), ++ z0 = svsubr_x (p0, z1, z0)) ++ ++/* ++** subr_s8_x_untied: ++** sub z0\.b, z2\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (subr_s8_x_untied, svint8_t, ++ z0 = svsubr_s8_x (p0, z1, z2), ++ z0 = svsubr_x (p0, z1, z2)) ++ ++/* ++** subr_w0_s8_x_tied1: ++** mov (z[0-9]+\.b), w0 ++** sub z0\.b, \1, z0\.b ++** ret ++*/ ++TEST_UNIFORM_ZX (subr_w0_s8_x_tied1, svint8_t, int8_t, ++ z0 = svsubr_n_s8_x (p0, z0, x0), ++ z0 = svsubr_x (p0, z0, x0)) ++ ++/* ++** subr_w0_s8_x_untied: ++** mov (z[0-9]+\.b), w0 ++** sub z0\.b, \1, z1\.b ++** ret ++*/ ++TEST_UNIFORM_ZX (subr_w0_s8_x_untied, svint8_t, int8_t, ++ 
z0 = svsubr_n_s8_x (p0, z1, x0), ++ z0 = svsubr_x (p0, z1, x0)) ++ ++/* ++** subr_1_s8_x_tied1: ++** subr z0\.b, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_s8_x_tied1, svint8_t, ++ z0 = svsubr_n_s8_x (p0, z0, 1), ++ z0 = svsubr_x (p0, z0, 1)) ++ ++/* ++** subr_1_s8_x_untied: ++** movprfx z0, z1 ++** subr z0\.b, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_s8_x_untied, svint8_t, ++ z0 = svsubr_n_s8_x (p0, z1, 1), ++ z0 = svsubr_x (p0, z1, 1)) ++ ++/* ++** subr_127_s8_x: ++** subr z0\.b, z0\.b, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_127_s8_x, svint8_t, ++ z0 = svsubr_n_s8_x (p0, z0, 127), ++ z0 = svsubr_x (p0, z0, 127)) ++ ++/* ++** subr_128_s8_x: ++** subr z0\.b, z0\.b, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_128_s8_x, svint8_t, ++ z0 = svsubr_n_s8_x (p0, z0, 128), ++ z0 = svsubr_x (p0, z0, 128)) ++ ++/* ++** subr_255_s8_x: ++** subr z0\.b, z0\.b, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_255_s8_x, svint8_t, ++ z0 = svsubr_n_s8_x (p0, z0, 255), ++ z0 = svsubr_x (p0, z0, 255)) ++ ++/* ++** subr_m1_s8_x: ++** subr z0\.b, z0\.b, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_s8_x, svint8_t, ++ z0 = svsubr_n_s8_x (p0, z0, -1), ++ z0 = svsubr_x (p0, z0, -1)) ++ ++/* ++** subr_m127_s8_x: ++** subr z0\.b, z0\.b, #129 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m127_s8_x, svint8_t, ++ z0 = svsubr_n_s8_x (p0, z0, -127), ++ z0 = svsubr_x (p0, z0, -127)) ++ ++/* ++** subr_m128_s8_x: ++** subr z0\.b, z0\.b, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m128_s8_x, svint8_t, ++ z0 = svsubr_n_s8_x (p0, z0, -128), ++ z0 = svsubr_x (p0, z0, -128)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_u16.c +new file mode 100644 +index 000000000..379a80fb1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_u16.c +@@ -0,0 +1,324 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** subr_u16_m_tied1: ++** subr z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (subr_u16_m_tied1, svuint16_t, ++ z0 = svsubr_u16_m (p0, z0, z1), ++ z0 = svsubr_m (p0, z0, z1)) ++ ++/* ++** subr_u16_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** subr z0\.h, p0/m, z0\.h, \1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (subr_u16_m_tied2, svuint16_t, ++ z0 = svsubr_u16_m (p0, z1, z0), ++ z0 = svsubr_m (p0, z1, z0)) ++ ++/* ++** subr_u16_m_untied: ++** movprfx z0, z1 ++** subr z0\.h, p0/m, z0\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (subr_u16_m_untied, svuint16_t, ++ z0 = svsubr_u16_m (p0, z1, z2), ++ z0 = svsubr_m (p0, z1, z2)) ++ ++/* ++** subr_w0_u16_m_tied1: ++** mov (z[0-9]+\.h), w0 ++** subr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (subr_w0_u16_m_tied1, svuint16_t, uint16_t, ++ z0 = svsubr_n_u16_m (p0, z0, x0), ++ z0 = svsubr_m (p0, z0, x0)) ++ ++/* ++** subr_w0_u16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), w0 ++** movprfx z0, z1 ++** subr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (subr_w0_u16_m_untied, svuint16_t, uint16_t, ++ z0 = svsubr_n_u16_m (p0, z1, x0), ++ z0 = svsubr_m (p0, z1, x0)) ++ ++/* ++** subr_1_u16_m_tied1: ++** mov (z[0-9]+\.h), #1 ++** subr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_u16_m_tied1, svuint16_t, ++ z0 = svsubr_n_u16_m (p0, z0, 1), ++ z0 = svsubr_m (p0, z0, 1)) ++ ++/* ++** subr_1_u16_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.h), #1 ++** movprfx z0, z1 ++** subr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_u16_m_untied, svuint16_t, ++ z0 = svsubr_n_u16_m (p0, z1, 
1), ++ z0 = svsubr_m (p0, z1, 1)) ++ ++/* ++** subr_m2_u16_m: ++** mov (z[0-9]+\.h), #-2 ++** subr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m2_u16_m, svuint16_t, ++ z0 = svsubr_n_u16_m (p0, z0, -2), ++ z0 = svsubr_m (p0, z0, -2)) ++ ++/* ++** subr_u16_z_tied1: ++** movprfx z0\.h, p0/z, z0\.h ++** subr z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (subr_u16_z_tied1, svuint16_t, ++ z0 = svsubr_u16_z (p0, z0, z1), ++ z0 = svsubr_z (p0, z0, z1)) ++ ++/* ++** subr_u16_z_tied2: ++** movprfx z0\.h, p0/z, z0\.h ++** sub z0\.h, p0/m, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (subr_u16_z_tied2, svuint16_t, ++ z0 = svsubr_u16_z (p0, z1, z0), ++ z0 = svsubr_z (p0, z1, z0)) ++ ++/* ++** subr_u16_z_untied: ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** subr z0\.h, p0/m, z0\.h, z2\.h ++** | ++** movprfx z0\.h, p0/z, z2\.h ++** sub z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (subr_u16_z_untied, svuint16_t, ++ z0 = svsubr_u16_z (p0, z1, z2), ++ z0 = svsubr_z (p0, z1, z2)) ++ ++/* ++** subr_w0_u16_z_tied1: ++** mov (z[0-9]+\.h), w0 ++** movprfx z0\.h, p0/z, z0\.h ++** subr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (subr_w0_u16_z_tied1, svuint16_t, uint16_t, ++ z0 = svsubr_n_u16_z (p0, z0, x0), ++ z0 = svsubr_z (p0, z0, x0)) ++ ++/* ++** subr_w0_u16_z_untied: ++** mov (z[0-9]+\.h), w0 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** subr z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** sub z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (subr_w0_u16_z_untied, svuint16_t, uint16_t, ++ z0 = svsubr_n_u16_z (p0, z1, x0), ++ z0 = svsubr_z (p0, z1, x0)) ++ ++/* ++** subr_1_u16_z_tied1: ++** mov (z[0-9]+\.h), #1 ++** movprfx z0\.h, p0/z, z0\.h ++** subr z0\.h, p0/m, z0\.h, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_u16_z_tied1, svuint16_t, ++ z0 = svsubr_n_u16_z (p0, z0, 1), ++ z0 = svsubr_z (p0, z0, 1)) ++ ++/* ++** subr_1_u16_z_untied: ++** mov (z[0-9]+\.h), #1 ++** ( ++** movprfx z0\.h, p0/z, z1\.h ++** subr z0\.h, p0/m, z0\.h, \1 ++** | ++** movprfx z0\.h, p0/z, \1 ++** sub z0\.h, p0/m, z0\.h, z1\.h ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_u16_z_untied, svuint16_t, ++ z0 = svsubr_n_u16_z (p0, z1, 1), ++ z0 = svsubr_z (p0, z1, 1)) ++ ++/* ++** subr_u16_x_tied1: ++** sub z0\.h, z1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (subr_u16_x_tied1, svuint16_t, ++ z0 = svsubr_u16_x (p0, z0, z1), ++ z0 = svsubr_x (p0, z0, z1)) ++ ++/* ++** subr_u16_x_tied2: ++** sub z0\.h, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (subr_u16_x_tied2, svuint16_t, ++ z0 = svsubr_u16_x (p0, z1, z0), ++ z0 = svsubr_x (p0, z1, z0)) ++ ++/* ++** subr_u16_x_untied: ++** sub z0\.h, z2\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (subr_u16_x_untied, svuint16_t, ++ z0 = svsubr_u16_x (p0, z1, z2), ++ z0 = svsubr_x (p0, z1, z2)) ++ ++/* ++** subr_w0_u16_x_tied1: ++** mov (z[0-9]+\.h), w0 ++** sub z0\.h, \1, z0\.h ++** ret ++*/ ++TEST_UNIFORM_ZX (subr_w0_u16_x_tied1, svuint16_t, uint16_t, ++ z0 = svsubr_n_u16_x (p0, z0, x0), ++ z0 = svsubr_x (p0, z0, x0)) ++ ++/* ++** subr_w0_u16_x_untied: ++** mov (z[0-9]+\.h), w0 ++** sub z0\.h, \1, z1\.h ++** ret ++*/ ++TEST_UNIFORM_ZX (subr_w0_u16_x_untied, svuint16_t, uint16_t, ++ z0 = svsubr_n_u16_x (p0, z1, x0), ++ z0 = svsubr_x (p0, z1, x0)) ++ ++/* ++** subr_1_u16_x_tied1: ++** subr z0\.h, z0\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_u16_x_tied1, svuint16_t, ++ z0 = svsubr_n_u16_x (p0, z0, 1), ++ z0 = svsubr_x (p0, z0, 1)) ++ ++/* ++** subr_1_u16_x_untied: ++** movprfx z0, z1 ++** subr z0\.h, z0\.h, #1 ++** ret ++*/ 
++TEST_UNIFORM_Z (subr_1_u16_x_untied, svuint16_t, ++ z0 = svsubr_n_u16_x (p0, z1, 1), ++ z0 = svsubr_x (p0, z1, 1)) ++ ++/* ++** subr_127_u16_x: ++** subr z0\.h, z0\.h, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_127_u16_x, svuint16_t, ++ z0 = svsubr_n_u16_x (p0, z0, 127), ++ z0 = svsubr_x (p0, z0, 127)) ++ ++/* ++** subr_128_u16_x: ++** subr z0\.h, z0\.h, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_128_u16_x, svuint16_t, ++ z0 = svsubr_n_u16_x (p0, z0, 128), ++ z0 = svsubr_x (p0, z0, 128)) ++ ++/* ++** subr_255_u16_x: ++** subr z0\.h, z0\.h, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_255_u16_x, svuint16_t, ++ z0 = svsubr_n_u16_x (p0, z0, 255), ++ z0 = svsubr_x (p0, z0, 255)) ++ ++/* ++** subr_256_u16_x: ++** subr z0\.h, z0\.h, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_256_u16_x, svuint16_t, ++ z0 = svsubr_n_u16_x (p0, z0, 256), ++ z0 = svsubr_x (p0, z0, 256)) ++ ++/* ++** subr_257_u16_x: ++** mov (z[0-9]+)\.b, #1 ++** sub z0\.h, \1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (subr_257_u16_x, svuint16_t, ++ z0 = svsubr_n_u16_x (p0, z0, 257), ++ z0 = svsubr_x (p0, z0, 257)) ++ ++/* ++** subr_512_u16_x: ++** subr z0\.h, z0\.h, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_512_u16_x, svuint16_t, ++ z0 = svsubr_n_u16_x (p0, z0, 512), ++ z0 = svsubr_x (p0, z0, 512)) ++ ++/* ++** subr_65280_u16_x: ++** subr z0\.h, z0\.h, #65280 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_65280_u16_x, svuint16_t, ++ z0 = svsubr_n_u16_x (p0, z0, 0xff00), ++ z0 = svsubr_x (p0, z0, 0xff00)) ++ ++/* ++** subr_m1_u16_x_tied1: ++** mov (z[0-9]+)\.b, #-1 ++** sub z0\.h, \1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_u16_x_tied1, svuint16_t, ++ z0 = svsubr_n_u16_x (p0, z0, -1), ++ z0 = svsubr_x (p0, z0, -1)) ++ ++/* ++** subr_m1_u16_x_untied: ++** mov (z[0-9]+)\.b, #-1 ++** sub z0\.h, \1\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_u16_x_untied, svuint16_t, ++ z0 = svsubr_n_u16_x (p0, z1, -1), ++ z0 = svsubr_x (p0, z1, -1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_u32.c +new file mode 100644 +index 000000000..215f8b449 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_u32.c +@@ -0,0 +1,344 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** subr_u32_m_tied1: ++** subr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (subr_u32_m_tied1, svuint32_t, ++ z0 = svsubr_u32_m (p0, z0, z1), ++ z0 = svsubr_m (p0, z0, z1)) ++ ++/* ++** subr_u32_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** subr z0\.s, p0/m, z0\.s, \1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (subr_u32_m_tied2, svuint32_t, ++ z0 = svsubr_u32_m (p0, z1, z0), ++ z0 = svsubr_m (p0, z1, z0)) ++ ++/* ++** subr_u32_m_untied: ++** movprfx z0, z1 ++** subr z0\.s, p0/m, z0\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (subr_u32_m_untied, svuint32_t, ++ z0 = svsubr_u32_m (p0, z1, z2), ++ z0 = svsubr_m (p0, z1, z2)) ++ ++/* ++** subr_w0_u32_m_tied1: ++** mov (z[0-9]+\.s), w0 ++** subr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (subr_w0_u32_m_tied1, svuint32_t, uint32_t, ++ z0 = svsubr_n_u32_m (p0, z0, x0), ++ z0 = svsubr_m (p0, z0, x0)) ++ ++/* ++** subr_w0_u32_m_untied: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0, z1 ++** subr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (subr_w0_u32_m_untied, svuint32_t, uint32_t, ++ z0 = svsubr_n_u32_m (p0, z1, x0), ++ z0 = svsubr_m (p0, z1, x0)) ++ ++/* ++** subr_1_u32_m_tied1: ++** mov (z[0-9]+\.s), #1 ++** subr z0\.s, p0/m, z0\.s, \1 ++** ret 
++*/ ++TEST_UNIFORM_Z (subr_1_u32_m_tied1, svuint32_t, ++ z0 = svsubr_n_u32_m (p0, z0, 1), ++ z0 = svsubr_m (p0, z0, 1)) ++ ++/* ++** subr_1_u32_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.s), #1 ++** movprfx z0, z1 ++** subr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_u32_m_untied, svuint32_t, ++ z0 = svsubr_n_u32_m (p0, z1, 1), ++ z0 = svsubr_m (p0, z1, 1)) ++ ++/* ++** subr_m2_u32_m: ++** mov (z[0-9]+\.s), #-2 ++** subr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m2_u32_m, svuint32_t, ++ z0 = svsubr_n_u32_m (p0, z0, -2), ++ z0 = svsubr_m (p0, z0, -2)) ++ ++/* ++** subr_u32_z_tied1: ++** movprfx z0\.s, p0/z, z0\.s ++** subr z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (subr_u32_z_tied1, svuint32_t, ++ z0 = svsubr_u32_z (p0, z0, z1), ++ z0 = svsubr_z (p0, z0, z1)) ++ ++/* ++** subr_u32_z_tied2: ++** movprfx z0\.s, p0/z, z0\.s ++** sub z0\.s, p0/m, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (subr_u32_z_tied2, svuint32_t, ++ z0 = svsubr_u32_z (p0, z1, z0), ++ z0 = svsubr_z (p0, z1, z0)) ++ ++/* ++** subr_u32_z_untied: ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** subr z0\.s, p0/m, z0\.s, z2\.s ++** | ++** movprfx z0\.s, p0/z, z2\.s ++** sub z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (subr_u32_z_untied, svuint32_t, ++ z0 = svsubr_u32_z (p0, z1, z2), ++ z0 = svsubr_z (p0, z1, z2)) ++ ++/* ++** subr_w0_u32_z_tied1: ++** mov (z[0-9]+\.s), w0 ++** movprfx z0\.s, p0/z, z0\.s ++** subr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (subr_w0_u32_z_tied1, svuint32_t, uint32_t, ++ z0 = svsubr_n_u32_z (p0, z0, x0), ++ z0 = svsubr_z (p0, z0, x0)) ++ ++/* ++** subr_w0_u32_z_untied: ++** mov (z[0-9]+\.s), w0 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** subr z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** sub z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (subr_w0_u32_z_untied, svuint32_t, uint32_t, ++ z0 = svsubr_n_u32_z (p0, z1, x0), ++ z0 = svsubr_z (p0, z1, x0)) ++ ++/* ++** subr_1_u32_z_tied1: ++** mov (z[0-9]+\.s), #1 ++** movprfx z0\.s, p0/z, z0\.s ++** subr z0\.s, p0/m, z0\.s, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_u32_z_tied1, svuint32_t, ++ z0 = svsubr_n_u32_z (p0, z0, 1), ++ z0 = svsubr_z (p0, z0, 1)) ++ ++/* ++** subr_1_u32_z_untied: ++** mov (z[0-9]+\.s), #1 ++** ( ++** movprfx z0\.s, p0/z, z1\.s ++** subr z0\.s, p0/m, z0\.s, \1 ++** | ++** movprfx z0\.s, p0/z, \1 ++** sub z0\.s, p0/m, z0\.s, z1\.s ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_u32_z_untied, svuint32_t, ++ z0 = svsubr_n_u32_z (p0, z1, 1), ++ z0 = svsubr_z (p0, z1, 1)) ++ ++/* ++** subr_u32_x_tied1: ++** sub z0\.s, z1\.s, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (subr_u32_x_tied1, svuint32_t, ++ z0 = svsubr_u32_x (p0, z0, z1), ++ z0 = svsubr_x (p0, z0, z1)) ++ ++/* ++** subr_u32_x_tied2: ++** sub z0\.s, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (subr_u32_x_tied2, svuint32_t, ++ z0 = svsubr_u32_x (p0, z1, z0), ++ z0 = svsubr_x (p0, z1, z0)) ++ ++/* ++** subr_u32_x_untied: ++** sub z0\.s, z2\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (subr_u32_x_untied, svuint32_t, ++ z0 = svsubr_u32_x (p0, z1, z2), ++ z0 = svsubr_x (p0, z1, z2)) ++ ++/* ++** subr_w0_u32_x_tied1: ++** mov (z[0-9]+\.s), w0 ++** sub z0\.s, \1, z0\.s ++** ret ++*/ ++TEST_UNIFORM_ZX (subr_w0_u32_x_tied1, svuint32_t, uint32_t, ++ z0 = svsubr_n_u32_x (p0, z0, x0), ++ z0 = svsubr_x (p0, z0, x0)) ++ ++/* ++** subr_w0_u32_x_untied: ++** mov (z[0-9]+\.s), w0 ++** sub z0\.s, \1, z1\.s ++** ret ++*/ ++TEST_UNIFORM_ZX (subr_w0_u32_x_untied, svuint32_t, uint32_t, ++ z0 = 
svsubr_n_u32_x (p0, z1, x0), ++ z0 = svsubr_x (p0, z1, x0)) ++ ++/* ++** subr_1_u32_x_tied1: ++** subr z0\.s, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_u32_x_tied1, svuint32_t, ++ z0 = svsubr_n_u32_x (p0, z0, 1), ++ z0 = svsubr_x (p0, z0, 1)) ++ ++/* ++** subr_1_u32_x_untied: ++** movprfx z0, z1 ++** subr z0\.s, z0\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_u32_x_untied, svuint32_t, ++ z0 = svsubr_n_u32_x (p0, z1, 1), ++ z0 = svsubr_x (p0, z1, 1)) ++ ++/* ++** subr_127_u32_x: ++** subr z0\.s, z0\.s, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_127_u32_x, svuint32_t, ++ z0 = svsubr_n_u32_x (p0, z0, 127), ++ z0 = svsubr_x (p0, z0, 127)) ++ ++/* ++** subr_128_u32_x: ++** subr z0\.s, z0\.s, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_128_u32_x, svuint32_t, ++ z0 = svsubr_n_u32_x (p0, z0, 128), ++ z0 = svsubr_x (p0, z0, 128)) ++ ++/* ++** subr_255_u32_x: ++** subr z0\.s, z0\.s, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_255_u32_x, svuint32_t, ++ z0 = svsubr_n_u32_x (p0, z0, 255), ++ z0 = svsubr_x (p0, z0, 255)) ++ ++/* ++** subr_256_u32_x: ++** subr z0\.s, z0\.s, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_256_u32_x, svuint32_t, ++ z0 = svsubr_n_u32_x (p0, z0, 256), ++ z0 = svsubr_x (p0, z0, 256)) ++ ++/* ++** subr_511_u32_x: ++** mov (z[0-9]+\.s), #511 ++** sub z0\.s, \1, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (subr_511_u32_x, svuint32_t, ++ z0 = svsubr_n_u32_x (p0, z0, 511), ++ z0 = svsubr_x (p0, z0, 511)) ++ ++/* ++** subr_512_u32_x: ++** subr z0\.s, z0\.s, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_512_u32_x, svuint32_t, ++ z0 = svsubr_n_u32_x (p0, z0, 512), ++ z0 = svsubr_x (p0, z0, 512)) ++ ++/* ++** subr_65280_u32_x: ++** subr z0\.s, z0\.s, #65280 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_65280_u32_x, svuint32_t, ++ z0 = svsubr_n_u32_x (p0, z0, 0xff00), ++ z0 = svsubr_x (p0, z0, 0xff00)) ++ ++/* ++** subr_65535_u32_x: ++** mov (z[0-9]+\.s), #65535 ++** sub z0\.s, \1, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (subr_65535_u32_x, svuint32_t, ++ z0 = svsubr_n_u32_x (p0, z0, 65535), ++ z0 = svsubr_x (p0, z0, 65535)) ++ ++/* ++** subr_65536_u32_x: ++** mov (z[0-9]+\.s), #65536 ++** sub z0\.s, \1, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (subr_65536_u32_x, svuint32_t, ++ z0 = svsubr_n_u32_x (p0, z0, 65536), ++ z0 = svsubr_x (p0, z0, 65536)) ++ ++/* ++** subr_m1_u32_x_tied1: ++** mov (z[0-9]+)\.b, #-1 ++** sub z0\.s, \1\.s, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_u32_x_tied1, svuint32_t, ++ z0 = svsubr_n_u32_x (p0, z0, -1), ++ z0 = svsubr_x (p0, z0, -1)) ++ ++/* ++** subr_m1_u32_x_untied: ++** mov (z[0-9]+)\.b, #-1 ++** sub z0\.s, \1\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_u32_x_untied, svuint32_t, ++ z0 = svsubr_n_u32_x (p0, z1, -1), ++ z0 = svsubr_x (p0, z1, -1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_u64.c +new file mode 100644 +index 000000000..78d94515b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_u64.c +@@ -0,0 +1,344 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** subr_u64_m_tied1: ++** subr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (subr_u64_m_tied1, svuint64_t, ++ z0 = svsubr_u64_m (p0, z0, z1), ++ z0 = svsubr_m (p0, z0, z1)) ++ ++/* ++** subr_u64_m_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** subr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_u64_m_tied2, svuint64_t, ++ z0 = svsubr_u64_m (p0, z1, z0), ++ z0 = svsubr_m (p0, z1, z0)) ++ ++/* ++** 
subr_u64_m_untied: ++** movprfx z0, z1 ++** subr z0\.d, p0/m, z0\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (subr_u64_m_untied, svuint64_t, ++ z0 = svsubr_u64_m (p0, z1, z2), ++ z0 = svsubr_m (p0, z1, z2)) ++ ++/* ++** subr_x0_u64_m_tied1: ++** mov (z[0-9]+\.d), x0 ++** subr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (subr_x0_u64_m_tied1, svuint64_t, uint64_t, ++ z0 = svsubr_n_u64_m (p0, z0, x0), ++ z0 = svsubr_m (p0, z0, x0)) ++ ++/* ++** subr_x0_u64_m_untied: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0, z1 ++** subr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (subr_x0_u64_m_untied, svuint64_t, uint64_t, ++ z0 = svsubr_n_u64_m (p0, z1, x0), ++ z0 = svsubr_m (p0, z1, x0)) ++ ++/* ++** subr_1_u64_m_tied1: ++** mov (z[0-9]+\.d), #1 ++** subr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_u64_m_tied1, svuint64_t, ++ z0 = svsubr_n_u64_m (p0, z0, 1), ++ z0 = svsubr_m (p0, z0, 1)) ++ ++/* ++** subr_1_u64_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.d), #1 ++** movprfx z0, z1 ++** subr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_u64_m_untied, svuint64_t, ++ z0 = svsubr_n_u64_m (p0, z1, 1), ++ z0 = svsubr_m (p0, z1, 1)) ++ ++/* ++** subr_m2_u64_m: ++** mov (z[0-9]+\.d), #-2 ++** subr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m2_u64_m, svuint64_t, ++ z0 = svsubr_n_u64_m (p0, z0, -2), ++ z0 = svsubr_m (p0, z0, -2)) ++ ++/* ++** subr_u64_z_tied1: ++** movprfx z0\.d, p0/z, z0\.d ++** subr z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (subr_u64_z_tied1, svuint64_t, ++ z0 = svsubr_u64_z (p0, z0, z1), ++ z0 = svsubr_z (p0, z0, z1)) ++ ++/* ++** subr_u64_z_tied2: ++** movprfx z0\.d, p0/z, z0\.d ++** sub z0\.d, p0/m, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (subr_u64_z_tied2, svuint64_t, ++ z0 = svsubr_u64_z (p0, z1, z0), ++ z0 = svsubr_z (p0, z1, z0)) ++ ++/* ++** subr_u64_z_untied: ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** subr z0\.d, p0/m, z0\.d, z2\.d ++** | ++** movprfx z0\.d, p0/z, z2\.d ++** sub z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (subr_u64_z_untied, svuint64_t, ++ z0 = svsubr_u64_z (p0, z1, z2), ++ z0 = svsubr_z (p0, z1, z2)) ++ ++/* ++** subr_x0_u64_z_tied1: ++** mov (z[0-9]+\.d), x0 ++** movprfx z0\.d, p0/z, z0\.d ++** subr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (subr_x0_u64_z_tied1, svuint64_t, uint64_t, ++ z0 = svsubr_n_u64_z (p0, z0, x0), ++ z0 = svsubr_z (p0, z0, x0)) ++ ++/* ++** subr_x0_u64_z_untied: ++** mov (z[0-9]+\.d), x0 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** subr z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** sub z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (subr_x0_u64_z_untied, svuint64_t, uint64_t, ++ z0 = svsubr_n_u64_z (p0, z1, x0), ++ z0 = svsubr_z (p0, z1, x0)) ++ ++/* ++** subr_1_u64_z_tied1: ++** mov (z[0-9]+\.d), #1 ++** movprfx z0\.d, p0/z, z0\.d ++** subr z0\.d, p0/m, z0\.d, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_u64_z_tied1, svuint64_t, ++ z0 = svsubr_n_u64_z (p0, z0, 1), ++ z0 = svsubr_z (p0, z0, 1)) ++ ++/* ++** subr_1_u64_z_untied: ++** mov (z[0-9]+\.d), #1 ++** ( ++** movprfx z0\.d, p0/z, z1\.d ++** subr z0\.d, p0/m, z0\.d, \1 ++** | ++** movprfx z0\.d, p0/z, \1 ++** sub z0\.d, p0/m, z0\.d, z1\.d ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_u64_z_untied, svuint64_t, ++ z0 = svsubr_n_u64_z (p0, z1, 1), ++ z0 = svsubr_z (p0, z1, 1)) ++ ++/* ++** subr_u64_x_tied1: ++** sub z0\.d, z1\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (subr_u64_x_tied1, svuint64_t, ++ z0 = svsubr_u64_x (p0, z0, z1), ++ z0 = 
svsubr_x (p0, z0, z1)) ++ ++/* ++** subr_u64_x_tied2: ++** sub z0\.d, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (subr_u64_x_tied2, svuint64_t, ++ z0 = svsubr_u64_x (p0, z1, z0), ++ z0 = svsubr_x (p0, z1, z0)) ++ ++/* ++** subr_u64_x_untied: ++** sub z0\.d, z2\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (subr_u64_x_untied, svuint64_t, ++ z0 = svsubr_u64_x (p0, z1, z2), ++ z0 = svsubr_x (p0, z1, z2)) ++ ++/* ++** subr_x0_u64_x_tied1: ++** mov (z[0-9]+\.d), x0 ++** sub z0\.d, \1, z0\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (subr_x0_u64_x_tied1, svuint64_t, uint64_t, ++ z0 = svsubr_n_u64_x (p0, z0, x0), ++ z0 = svsubr_x (p0, z0, x0)) ++ ++/* ++** subr_x0_u64_x_untied: ++** mov (z[0-9]+\.d), x0 ++** sub z0\.d, \1, z1\.d ++** ret ++*/ ++TEST_UNIFORM_ZX (subr_x0_u64_x_untied, svuint64_t, uint64_t, ++ z0 = svsubr_n_u64_x (p0, z1, x0), ++ z0 = svsubr_x (p0, z1, x0)) ++ ++/* ++** subr_1_u64_x_tied1: ++** subr z0\.d, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_u64_x_tied1, svuint64_t, ++ z0 = svsubr_n_u64_x (p0, z0, 1), ++ z0 = svsubr_x (p0, z0, 1)) ++ ++/* ++** subr_1_u64_x_untied: ++** movprfx z0, z1 ++** subr z0\.d, z0\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_u64_x_untied, svuint64_t, ++ z0 = svsubr_n_u64_x (p0, z1, 1), ++ z0 = svsubr_x (p0, z1, 1)) ++ ++/* ++** subr_127_u64_x: ++** subr z0\.d, z0\.d, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_127_u64_x, svuint64_t, ++ z0 = svsubr_n_u64_x (p0, z0, 127), ++ z0 = svsubr_x (p0, z0, 127)) ++ ++/* ++** subr_128_u64_x: ++** subr z0\.d, z0\.d, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_128_u64_x, svuint64_t, ++ z0 = svsubr_n_u64_x (p0, z0, 128), ++ z0 = svsubr_x (p0, z0, 128)) ++ ++/* ++** subr_255_u64_x: ++** subr z0\.d, z0\.d, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_255_u64_x, svuint64_t, ++ z0 = svsubr_n_u64_x (p0, z0, 255), ++ z0 = svsubr_x (p0, z0, 255)) ++ ++/* ++** subr_256_u64_x: ++** subr z0\.d, z0\.d, #256 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_256_u64_x, svuint64_t, ++ z0 = svsubr_n_u64_x (p0, z0, 256), ++ z0 = svsubr_x (p0, z0, 256)) ++ ++/* ++** subr_511_u64_x: ++** mov (z[0-9]+\.d), #511 ++** sub z0\.d, \1, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (subr_511_u64_x, svuint64_t, ++ z0 = svsubr_n_u64_x (p0, z0, 511), ++ z0 = svsubr_x (p0, z0, 511)) ++ ++/* ++** subr_512_u64_x: ++** subr z0\.d, z0\.d, #512 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_512_u64_x, svuint64_t, ++ z0 = svsubr_n_u64_x (p0, z0, 512), ++ z0 = svsubr_x (p0, z0, 512)) ++ ++/* ++** subr_65280_u64_x: ++** subr z0\.d, z0\.d, #65280 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_65280_u64_x, svuint64_t, ++ z0 = svsubr_n_u64_x (p0, z0, 0xff00), ++ z0 = svsubr_x (p0, z0, 0xff00)) ++ ++/* ++** subr_65535_u64_x: ++** mov (z[0-9]+\.d), #65535 ++** sub z0\.d, \1, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (subr_65535_u64_x, svuint64_t, ++ z0 = svsubr_n_u64_x (p0, z0, 65535), ++ z0 = svsubr_x (p0, z0, 65535)) ++ ++/* ++** subr_65536_u64_x: ++** mov (z[0-9]+\.d), #65536 ++** sub z0\.d, \1, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (subr_65536_u64_x, svuint64_t, ++ z0 = svsubr_n_u64_x (p0, z0, 65536), ++ z0 = svsubr_x (p0, z0, 65536)) ++ ++/* ++** subr_m1_u64_x_tied1: ++** mov (z[0-9]+)\.b, #-1 ++** sub z0\.d, \1\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_u64_x_tied1, svuint64_t, ++ z0 = svsubr_n_u64_x (p0, z0, -1), ++ z0 = svsubr_x (p0, z0, -1)) ++ ++/* ++** subr_m1_u64_x_untied: ++** mov (z[0-9]+)\.b, #-1 ++** sub z0\.d, \1\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_u64_x_untied, svuint64_t, ++ z0 = svsubr_n_u64_x (p0, z1, -1), ++ z0 = svsubr_x (p0, z1, -1)) +diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_u8.c +new file mode 100644 +index 000000000..fe5f96da8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_u8.c +@@ -0,0 +1,294 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** subr_u8_m_tied1: ++** subr z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (subr_u8_m_tied1, svuint8_t, ++ z0 = svsubr_u8_m (p0, z0, z1), ++ z0 = svsubr_m (p0, z0, z1)) ++ ++/* ++** subr_u8_m_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** subr z0\.b, p0/m, z0\.b, \1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (subr_u8_m_tied2, svuint8_t, ++ z0 = svsubr_u8_m (p0, z1, z0), ++ z0 = svsubr_m (p0, z1, z0)) ++ ++/* ++** subr_u8_m_untied: ++** movprfx z0, z1 ++** subr z0\.b, p0/m, z0\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (subr_u8_m_untied, svuint8_t, ++ z0 = svsubr_u8_m (p0, z1, z2), ++ z0 = svsubr_m (p0, z1, z2)) ++ ++/* ++** subr_w0_u8_m_tied1: ++** mov (z[0-9]+\.b), w0 ++** subr z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (subr_w0_u8_m_tied1, svuint8_t, uint8_t, ++ z0 = svsubr_n_u8_m (p0, z0, x0), ++ z0 = svsubr_m (p0, z0, x0)) ++ ++/* ++** subr_w0_u8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), w0 ++** movprfx z0, z1 ++** subr z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (subr_w0_u8_m_untied, svuint8_t, uint8_t, ++ z0 = svsubr_n_u8_m (p0, z1, x0), ++ z0 = svsubr_m (p0, z1, x0)) ++ ++/* ++** subr_1_u8_m_tied1: ++** mov (z[0-9]+\.b), #1 ++** subr z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_u8_m_tied1, svuint8_t, ++ z0 = svsubr_n_u8_m (p0, z0, 1), ++ z0 = svsubr_m (p0, z0, 1)) ++ ++/* ++** subr_1_u8_m_untied: { xfail *-*-* } ++** mov (z[0-9]+\.b), #1 ++** movprfx z0, z1 ++** subr z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_u8_m_untied, svuint8_t, ++ z0 = svsubr_n_u8_m (p0, z1, 1), ++ z0 = svsubr_m (p0, z1, 1)) ++ ++/* ++** subr_m1_u8_m: ++** mov (z[0-9]+\.b), #-1 ++** subr z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_u8_m, svuint8_t, ++ z0 = svsubr_n_u8_m (p0, z0, -1), ++ z0 = svsubr_m (p0, z0, -1)) ++ ++/* ++** subr_u8_z_tied1: ++** movprfx z0\.b, p0/z, z0\.b ++** subr z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (subr_u8_z_tied1, svuint8_t, ++ z0 = svsubr_u8_z (p0, z0, z1), ++ z0 = svsubr_z (p0, z0, z1)) ++ ++/* ++** subr_u8_z_tied2: ++** movprfx z0\.b, p0/z, z0\.b ++** sub z0\.b, p0/m, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (subr_u8_z_tied2, svuint8_t, ++ z0 = svsubr_u8_z (p0, z1, z0), ++ z0 = svsubr_z (p0, z1, z0)) ++ ++/* ++** subr_u8_z_untied: ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** subr z0\.b, p0/m, z0\.b, z2\.b ++** | ++** movprfx z0\.b, p0/z, z2\.b ++** sub z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (subr_u8_z_untied, svuint8_t, ++ z0 = svsubr_u8_z (p0, z1, z2), ++ z0 = svsubr_z (p0, z1, z2)) ++ ++/* ++** subr_w0_u8_z_tied1: ++** mov (z[0-9]+\.b), w0 ++** movprfx z0\.b, p0/z, z0\.b ++** subr z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_ZX (subr_w0_u8_z_tied1, svuint8_t, uint8_t, ++ z0 = svsubr_n_u8_z (p0, z0, x0), ++ z0 = svsubr_z (p0, z0, x0)) ++ ++/* ++** subr_w0_u8_z_untied: ++** mov (z[0-9]+\.b), w0 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** subr z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** sub z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_ZX (subr_w0_u8_z_untied, svuint8_t, uint8_t, ++ z0 = svsubr_n_u8_z (p0, z1, x0), ++ z0 = svsubr_z 
(p0, z1, x0)) ++ ++/* ++** subr_1_u8_z_tied1: ++** mov (z[0-9]+\.b), #1 ++** movprfx z0\.b, p0/z, z0\.b ++** subr z0\.b, p0/m, z0\.b, \1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_u8_z_tied1, svuint8_t, ++ z0 = svsubr_n_u8_z (p0, z0, 1), ++ z0 = svsubr_z (p0, z0, 1)) ++ ++/* ++** subr_1_u8_z_untied: ++** mov (z[0-9]+\.b), #1 ++** ( ++** movprfx z0\.b, p0/z, z1\.b ++** subr z0\.b, p0/m, z0\.b, \1 ++** | ++** movprfx z0\.b, p0/z, \1 ++** sub z0\.b, p0/m, z0\.b, z1\.b ++** ) ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_u8_z_untied, svuint8_t, ++ z0 = svsubr_n_u8_z (p0, z1, 1), ++ z0 = svsubr_z (p0, z1, 1)) ++ ++/* ++** subr_u8_x_tied1: ++** sub z0\.b, z1\.b, z0\.b ++** ret ++*/ ++TEST_UNIFORM_Z (subr_u8_x_tied1, svuint8_t, ++ z0 = svsubr_u8_x (p0, z0, z1), ++ z0 = svsubr_x (p0, z0, z1)) ++ ++/* ++** subr_u8_x_tied2: ++** sub z0\.b, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (subr_u8_x_tied2, svuint8_t, ++ z0 = svsubr_u8_x (p0, z1, z0), ++ z0 = svsubr_x (p0, z1, z0)) ++ ++/* ++** subr_u8_x_untied: ++** sub z0\.b, z2\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (subr_u8_x_untied, svuint8_t, ++ z0 = svsubr_u8_x (p0, z1, z2), ++ z0 = svsubr_x (p0, z1, z2)) ++ ++/* ++** subr_w0_u8_x_tied1: ++** mov (z[0-9]+\.b), w0 ++** sub z0\.b, \1, z0\.b ++** ret ++*/ ++TEST_UNIFORM_ZX (subr_w0_u8_x_tied1, svuint8_t, uint8_t, ++ z0 = svsubr_n_u8_x (p0, z0, x0), ++ z0 = svsubr_x (p0, z0, x0)) ++ ++/* ++** subr_w0_u8_x_untied: ++** mov (z[0-9]+\.b), w0 ++** sub z0\.b, \1, z1\.b ++** ret ++*/ ++TEST_UNIFORM_ZX (subr_w0_u8_x_untied, svuint8_t, uint8_t, ++ z0 = svsubr_n_u8_x (p0, z1, x0), ++ z0 = svsubr_x (p0, z1, x0)) ++ ++/* ++** subr_1_u8_x_tied1: ++** subr z0\.b, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_u8_x_tied1, svuint8_t, ++ z0 = svsubr_n_u8_x (p0, z0, 1), ++ z0 = svsubr_x (p0, z0, 1)) ++ ++/* ++** subr_1_u8_x_untied: ++** movprfx z0, z1 ++** subr z0\.b, z0\.b, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_1_u8_x_untied, svuint8_t, ++ z0 = svsubr_n_u8_x (p0, z1, 1), ++ z0 = svsubr_x (p0, z1, 1)) ++ ++/* ++** subr_127_u8_x: ++** subr z0\.b, z0\.b, #127 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_127_u8_x, svuint8_t, ++ z0 = svsubr_n_u8_x (p0, z0, 127), ++ z0 = svsubr_x (p0, z0, 127)) ++ ++/* ++** subr_128_u8_x: ++** subr z0\.b, z0\.b, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_128_u8_x, svuint8_t, ++ z0 = svsubr_n_u8_x (p0, z0, 128), ++ z0 = svsubr_x (p0, z0, 128)) ++ ++/* ++** subr_255_u8_x: ++** subr z0\.b, z0\.b, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_255_u8_x, svuint8_t, ++ z0 = svsubr_n_u8_x (p0, z0, 255), ++ z0 = svsubr_x (p0, z0, 255)) ++ ++/* ++** subr_m1_u8_x: ++** subr z0\.b, z0\.b, #255 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m1_u8_x, svuint8_t, ++ z0 = svsubr_n_u8_x (p0, z0, -1), ++ z0 = svsubr_x (p0, z0, -1)) ++ ++/* ++** subr_m127_u8_x: ++** subr z0\.b, z0\.b, #129 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m127_u8_x, svuint8_t, ++ z0 = svsubr_n_u8_x (p0, z0, -127), ++ z0 = svsubr_x (p0, z0, -127)) ++ ++/* ++** subr_m128_u8_x: ++** subr z0\.b, z0\.b, #128 ++** ret ++*/ ++TEST_UNIFORM_Z (subr_m128_u8_x, svuint8_t, ++ z0 = svsubr_n_u8_x (p0, z0, -128), ++ z0 = svsubr_x (p0, z0, -128)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sudot_lane_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sudot_lane_s32.c +new file mode 100644 +index 000000000..c6d74a4af +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sudot_lane_s32.c +@@ -0,0 +1,97 @@ ++/* { dg-require-effective-target aarch64_asm_i8mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+sve+i8mm" } */ ++/* { dg-final { 
check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** sudot_lane_0_s32_tied1: ++** sudot z0\.s, z2\.b, z4\.b\[0\] ++** ret ++*/ ++TEST_TRIPLE_Z (sudot_lane_0_s32_tied1, svint32_t, svint8_t, svuint8_t, ++ z0 = svsudot_lane_s32 (z0, z2, z4, 0), ++ z0 = svsudot_lane (z0, z2, z4, 0)) ++ ++/* ++** sudot_lane_0_s32_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z2 ++** sudot z0\.s, \1\.b, z4\.b\[0\] ++** ret ++*/ ++TEST_TRIPLE_Z_REV2 (sudot_lane_0_s32_tied2, svint32_t, svint8_t, svuint8_t, ++ z0_res = svsudot_lane_s32 (z2, z0, z4, 0), ++ z0_res = svsudot_lane (z2, z0, z4, 0)) ++ ++/* ++** sudot_lane_0_s32_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** sudot z0\.s, z2\.b, \1\.b\[0\] ++** ret ++*/ ++TEST_TRIPLE_Z_REV (sudot_lane_0_s32_tied3, svint32_t, svint8_t, svuint8_t, ++ z0_res = svsudot_lane_s32 (z4, z2, z0, 0), ++ z0_res = svsudot_lane (z4, z2, z0, 0)) ++ ++/* ++** sudot_lane_0_s32_untied: ++** movprfx z0, z1 ++** sudot z0\.s, z2\.b, z4\.b\[0\] ++** ret ++*/ ++TEST_TRIPLE_Z (sudot_lane_0_s32_untied, svint32_t, svint8_t, svuint8_t, ++ z0 = svsudot_lane_s32 (z1, z2, z4, 0), ++ z0 = svsudot_lane (z1, z2, z4, 0)) ++ ++/* ++** sudot_lane_1_s32: ++** sudot z0\.s, z2\.b, z5\.b\[1\] ++** ret ++*/ ++TEST_TRIPLE_Z (sudot_lane_1_s32, svint32_t, svint8_t, svuint8_t, ++ z0 = svsudot_lane_s32 (z0, z2, z5, 1), ++ z0 = svsudot_lane (z0, z2, z5, 1)) ++ ++/* ++** sudot_lane_2_s32: ++** sudot z0\.s, z2\.b, z5\.b\[2\] ++** ret ++*/ ++TEST_TRIPLE_Z (sudot_lane_2_s32, svint32_t, svint8_t, svuint8_t, ++ z0 = svsudot_lane_s32 (z0, z2, z5, 2), ++ z0 = svsudot_lane (z0, z2, z5, 2)) ++ ++/* ++** sudot_lane_3_s32: ++** sudot z0\.s, z2\.b, z5\.b\[3\] ++** ret ++*/ ++TEST_TRIPLE_Z (sudot_lane_3_s32, svint32_t, svint8_t, svuint8_t, ++ z0 = svsudot_lane_s32 (z0, z2, z5, 3), ++ z0 = svsudot_lane (z0, z2, z5, 3)) ++ ++/* ++** sudot_lane_z8_s32: ++** str d8, \[sp, -16\]! 
++** mov (z[0-7])\.d, z8\.d ++** sudot z0\.s, z1\.b, \1\.b\[1\] ++** ldr d8, \[sp\], 16 ++** ret ++*/ ++TEST_TRIPLE_LANE_REG (sudot_lane_z8_s32, svint32_t, svint8_t, svuint8_t, ++ z8, ++ z0 = svsudot_lane_s32 (z0, z1, z8, 1), ++ z0 = svsudot_lane (z0, z1, z8, 1)) ++ ++/* ++** sudot_lane_z16_s32: ++** mov (z[0-7])\.d, z16\.d ++** sudot z0\.s, z1\.b, \1\.b\[1\] ++** ret ++*/ ++TEST_TRIPLE_LANE_REG (sudot_lane_z16_s32, svint32_t, svint8_t, svuint8_t, ++ z16, ++ z0 = svsudot_lane_s32 (z0, z1, z16, 1), ++ z0 = svsudot_lane (z0, z1, z16, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sudot_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sudot_s32.c +new file mode 100644 +index 000000000..4b452619e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sudot_s32.c +@@ -0,0 +1,45 @@ ++/* { dg-require-effective-target aarch64_asm_i8mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+sve+i8mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** sudot_s32_tied1: ++** usdot z0\.s, z2\.b, z4\.b ++** ret ++*/ ++TEST_TRIPLE_Z (sudot_s32_tied1, svint32_t, svint8_t, svuint8_t, ++ z0 = svsudot_s32 (z0, z2, z4), ++ z0 = svsudot (z0, z2, z4)) ++ ++/* ++** sudot_s32_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** usdot z0\.s, z2\.b, \1\.b ++** ret ++*/ ++TEST_TRIPLE_Z_REV (sudot_s32_tied2, svint32_t, svint8_t, svuint8_t, ++ z0_res = svsudot_s32 (z4, z2, z0), ++ z0_res = svsudot (z4, z2, z0)) ++ ++/* ++** sudot_w0_s32_tied: ++** mov (z[0-9]+\.b), w0 ++** usdot z0\.s, z2\.b, \1 ++** ret ++*/ ++TEST_TRIPLE_ZX (sudot_w0_s32_tied, svint32_t, svint8_t, uint8_t, ++ z0 = svsudot_n_s32 (z0, z2, x0), ++ z0 = svsudot (z0, z2, x0)) ++ ++/* ++** sudot_9_s32_tied: ++** mov (z[0-9]+\.b), #9 ++** usdot z0\.s, z2\.b, \1 ++** ret ++*/ ++TEST_TRIPLE_Z (sudot_9_s32_tied, svint32_t, svint8_t, uint8_t, ++ z0 = svsudot_n_s32 (z0, z2, 9), ++ z0 = svsudot (z0, z2, 9)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_bf16.c +new file mode 100644 +index 000000000..8c077d118 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_bf16.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** tbl_bf16_tied1: ++** tbl z0\.h, z0\.h, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (tbl_bf16_tied1, svbfloat16_t, svuint16_t, ++ z0 = svtbl_bf16 (z0, z4), ++ z0 = svtbl (z0, z4)) ++ ++/* ++** tbl_bf16_tied2: ++** tbl z0\.h, z4\.h, z0\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (tbl_bf16_tied2, svbfloat16_t, svuint16_t, ++ z0_res = svtbl_bf16 (z4, z0), ++ z0_res = svtbl (z4, z0)) ++ ++/* ++** tbl_bf16_untied: ++** tbl z0\.h, z1\.h, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (tbl_bf16_untied, svbfloat16_t, svuint16_t, ++ z0 = svtbl_bf16 (z1, z4), ++ z0 = svtbl (z1, z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_f16.c +new file mode 100644 +index 000000000..94b610412 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_f16.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** tbl_f16_tied1: ++** tbl z0\.h, z0\.h, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (tbl_f16_tied1, svfloat16_t, svuint16_t, ++ z0 = svtbl_f16 (z0, z4), ++ z0 = svtbl (z0, z4)) ++ ++/* ++** tbl_f16_tied2: ++** tbl z0\.h, z4\.h, z0\.h ++** ret ++*/ 
++TEST_DUAL_Z_REV (tbl_f16_tied2, svfloat16_t, svuint16_t, ++ z0_res = svtbl_f16 (z4, z0), ++ z0_res = svtbl (z4, z0)) ++ ++/* ++** tbl_f16_untied: ++** tbl z0\.h, z1\.h, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (tbl_f16_untied, svfloat16_t, svuint16_t, ++ z0 = svtbl_f16 (z1, z4), ++ z0 = svtbl (z1, z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_f32.c +new file mode 100644 +index 000000000..741d3bdcf +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_f32.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** tbl_f32_tied1: ++** tbl z0\.s, z0\.s, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (tbl_f32_tied1, svfloat32_t, svuint32_t, ++ z0 = svtbl_f32 (z0, z4), ++ z0 = svtbl (z0, z4)) ++ ++/* ++** tbl_f32_tied2: ++** tbl z0\.s, z4\.s, z0\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (tbl_f32_tied2, svfloat32_t, svuint32_t, ++ z0_res = svtbl_f32 (z4, z0), ++ z0_res = svtbl (z4, z0)) ++ ++/* ++** tbl_f32_untied: ++** tbl z0\.s, z1\.s, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (tbl_f32_untied, svfloat32_t, svuint32_t, ++ z0 = svtbl_f32 (z1, z4), ++ z0 = svtbl (z1, z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_f64.c +new file mode 100644 +index 000000000..3c24e9a59 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_f64.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** tbl_f64_tied1: ++** tbl z0\.d, z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (tbl_f64_tied1, svfloat64_t, svuint64_t, ++ z0 = svtbl_f64 (z0, z4), ++ z0 = svtbl (z0, z4)) ++ ++/* ++** tbl_f64_tied2: ++** tbl z0\.d, z4\.d, z0\.d ++** ret ++*/ ++TEST_DUAL_Z_REV (tbl_f64_tied2, svfloat64_t, svuint64_t, ++ z0_res = svtbl_f64 (z4, z0), ++ z0_res = svtbl (z4, z0)) ++ ++/* ++** tbl_f64_untied: ++** tbl z0\.d, z1\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (tbl_f64_untied, svfloat64_t, svuint64_t, ++ z0 = svtbl_f64 (z1, z4), ++ z0 = svtbl (z1, z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_s16.c +new file mode 100644 +index 000000000..2ec9c389a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_s16.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** tbl_s16_tied1: ++** tbl z0\.h, z0\.h, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (tbl_s16_tied1, svint16_t, svuint16_t, ++ z0 = svtbl_s16 (z0, z4), ++ z0 = svtbl (z0, z4)) ++ ++/* ++** tbl_s16_tied2: ++** tbl z0\.h, z4\.h, z0\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (tbl_s16_tied2, svint16_t, svuint16_t, ++ z0_res = svtbl_s16 (z4, z0), ++ z0_res = svtbl (z4, z0)) ++ ++/* ++** tbl_s16_untied: ++** tbl z0\.h, z1\.h, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (tbl_s16_untied, svint16_t, svuint16_t, ++ z0 = svtbl_s16 (z1, z4), ++ z0 = svtbl (z1, z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_s32.c +new file mode 100644 +index 000000000..98b2d8d8b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_s32.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** tbl_s32_tied1: ++** tbl z0\.s, z0\.s, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (tbl_s32_tied1, svint32_t, 
svuint32_t, ++ z0 = svtbl_s32 (z0, z4), ++ z0 = svtbl (z0, z4)) ++ ++/* ++** tbl_s32_tied2: ++** tbl z0\.s, z4\.s, z0\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (tbl_s32_tied2, svint32_t, svuint32_t, ++ z0_res = svtbl_s32 (z4, z0), ++ z0_res = svtbl (z4, z0)) ++ ++/* ++** tbl_s32_untied: ++** tbl z0\.s, z1\.s, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (tbl_s32_untied, svint32_t, svuint32_t, ++ z0 = svtbl_s32 (z1, z4), ++ z0 = svtbl (z1, z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_s64.c +new file mode 100644 +index 000000000..0138a80d2 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_s64.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** tbl_s64_tied1: ++** tbl z0\.d, z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (tbl_s64_tied1, svint64_t, svuint64_t, ++ z0 = svtbl_s64 (z0, z4), ++ z0 = svtbl (z0, z4)) ++ ++/* ++** tbl_s64_tied2: ++** tbl z0\.d, z4\.d, z0\.d ++** ret ++*/ ++TEST_DUAL_Z_REV (tbl_s64_tied2, svint64_t, svuint64_t, ++ z0_res = svtbl_s64 (z4, z0), ++ z0_res = svtbl (z4, z0)) ++ ++/* ++** tbl_s64_untied: ++** tbl z0\.d, z1\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (tbl_s64_untied, svint64_t, svuint64_t, ++ z0 = svtbl_s64 (z1, z4), ++ z0 = svtbl (z1, z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_s8.c +new file mode 100644 +index 000000000..7818d1b6d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_s8.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** tbl_s8_tied1: ++** tbl z0\.b, z0\.b, z4\.b ++** ret ++*/ ++TEST_DUAL_Z (tbl_s8_tied1, svint8_t, svuint8_t, ++ z0 = svtbl_s8 (z0, z4), ++ z0 = svtbl (z0, z4)) ++ ++/* ++** tbl_s8_tied2: ++** tbl z0\.b, z4\.b, z0\.b ++** ret ++*/ ++TEST_DUAL_Z_REV (tbl_s8_tied2, svint8_t, svuint8_t, ++ z0_res = svtbl_s8 (z4, z0), ++ z0_res = svtbl (z4, z0)) ++ ++/* ++** tbl_s8_untied: ++** tbl z0\.b, z1\.b, z4\.b ++** ret ++*/ ++TEST_DUAL_Z (tbl_s8_untied, svint8_t, svuint8_t, ++ z0 = svtbl_s8 (z1, z4), ++ z0 = svtbl (z1, z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_u16.c +new file mode 100644 +index 000000000..f15da9211 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_u16.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** tbl_u16_tied1: ++** tbl z0\.h, z0\.h, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (tbl_u16_tied1, svuint16_t, svuint16_t, ++ z0 = svtbl_u16 (z0, z4), ++ z0 = svtbl (z0, z4)) ++ ++/* ++** tbl_u16_tied2: ++** tbl z0\.h, z4\.h, z0\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (tbl_u16_tied2, svuint16_t, svuint16_t, ++ z0_res = svtbl_u16 (z4, z0), ++ z0_res = svtbl (z4, z0)) ++ ++/* ++** tbl_u16_untied: ++** tbl z0\.h, z1\.h, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (tbl_u16_untied, svuint16_t, svuint16_t, ++ z0 = svtbl_u16 (z1, z4), ++ z0 = svtbl (z1, z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_u32.c +new file mode 100644 +index 000000000..494300436 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_u32.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** 
tbl_u32_tied1: ++** tbl z0\.s, z0\.s, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (tbl_u32_tied1, svuint32_t, svuint32_t, ++ z0 = svtbl_u32 (z0, z4), ++ z0 = svtbl (z0, z4)) ++ ++/* ++** tbl_u32_tied2: ++** tbl z0\.s, z4\.s, z0\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (tbl_u32_tied2, svuint32_t, svuint32_t, ++ z0_res = svtbl_u32 (z4, z0), ++ z0_res = svtbl (z4, z0)) ++ ++/* ++** tbl_u32_untied: ++** tbl z0\.s, z1\.s, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (tbl_u32_untied, svuint32_t, svuint32_t, ++ z0 = svtbl_u32 (z1, z4), ++ z0 = svtbl (z1, z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_u64.c +new file mode 100644 +index 000000000..158990e12 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_u64.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** tbl_u64_tied1: ++** tbl z0\.d, z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (tbl_u64_tied1, svuint64_t, svuint64_t, ++ z0 = svtbl_u64 (z0, z4), ++ z0 = svtbl (z0, z4)) ++ ++/* ++** tbl_u64_tied2: ++** tbl z0\.d, z4\.d, z0\.d ++** ret ++*/ ++TEST_DUAL_Z_REV (tbl_u64_tied2, svuint64_t, svuint64_t, ++ z0_res = svtbl_u64 (z4, z0), ++ z0_res = svtbl (z4, z0)) ++ ++/* ++** tbl_u64_untied: ++** tbl z0\.d, z1\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (tbl_u64_untied, svuint64_t, svuint64_t, ++ z0 = svtbl_u64 (z1, z4), ++ z0 = svtbl (z1, z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_u8.c +new file mode 100644 +index 000000000..a46309a95 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_u8.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** tbl_u8_tied1: ++** tbl z0\.b, z0\.b, z4\.b ++** ret ++*/ ++TEST_DUAL_Z (tbl_u8_tied1, svuint8_t, svuint8_t, ++ z0 = svtbl_u8 (z0, z4), ++ z0 = svtbl (z0, z4)) ++ ++/* ++** tbl_u8_tied2: ++** tbl z0\.b, z4\.b, z0\.b ++** ret ++*/ ++TEST_DUAL_Z_REV (tbl_u8_tied2, svuint8_t, svuint8_t, ++ z0_res = svtbl_u8 (z4, z0), ++ z0_res = svtbl (z4, z0)) ++ ++/* ++** tbl_u8_untied: ++** tbl z0\.b, z1\.b, z4\.b ++** ret ++*/ ++TEST_DUAL_Z (tbl_u8_untied, svuint8_t, svuint8_t, ++ z0 = svtbl_u8 (z1, z4), ++ z0 = svtbl (z1, z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/test_sve_acle.h b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/test_sve_acle.h +new file mode 100644 +index 000000000..d1f8fdb13 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/test_sve_acle.h +@@ -0,0 +1,424 @@ ++#ifndef TEST_SVE_ACLE_H ++#define TEST_SVE_ACLE_H 1 ++ ++#include <arm_sve.h> ++ ++#if defined (TEST_OVERLOADS) ++#define INVOKE(CODE1, CODE2) CODE2 ++#elif defined (TEST_FULL) ++#define INVOKE(CODE1, CODE2) CODE1 ++#else ++#error "Please define -DTEST_OVERLOADS or -DTEST_FULL" ++#endif ++ ++#ifdef __cplusplus ++#define PROTO(NAME, RET, ARGS) extern "C" RET NAME ARGS; RET NAME ARGS ++#else ++#define PROTO(NAME, RET, ARGS) RET NAME ARGS ++#endif ++ ++#define TEST_UNIFORM_Z(NAME, TYPE, CODE1, CODE2) \ ++ PROTO (NAME, TYPE, (TYPE z0, TYPE z1, TYPE z2, TYPE z3, \ ++ svbool_t p0, svbool_t p1)) \ ++ { \ ++ INVOKE (CODE1, CODE2); \ ++ return z0; \ ++ } ++ ++#define TEST_UNIFORM_P(NAME, CODE1, CODE2) \ ++ PROTO (NAME, svbool_t, (svbool_t p0, svbool_t p1, \ ++ svbool_t p2, svbool_t p3)) \ ++ { \ ++ INVOKE (CODE1, CODE2); \ ++ return p0; \ ++ } ++ ++#define TEST_UNIFORM_P_SINGLE(NAME, CODE) \ ++ PROTO (NAME, 
svbool_t, (svbool_t p0, svbool_t p1, \ ++ svbool_t p2, svbool_t p3)) \ ++ { \ ++ CODE; \ ++ return p0; \ ++ } ++ ++#define TEST_UNIFORM_S(NAME, TYPE, CODE1, CODE2) \ ++ PROTO (NAME, TYPE, (TYPE x0, TYPE x1, TYPE x2, TYPE x3, \ ++ svbool_t p0, svbool_t p1)) \ ++ { \ ++ INVOKE (CODE1, CODE2); \ ++ return x0; \ ++ } ++ ++#define TEST_DUAL_Z(NAME, TYPE1, TYPE2, CODE1, CODE2) \ ++ PROTO (NAME, TYPE1, (TYPE1 z0, TYPE1 z1, TYPE1 z2, TYPE1 z3, \ ++ TYPE2 z4, TYPE2 z5, TYPE2 z6, TYPE2 z7, \ ++ svbool_t p0, svbool_t p1)) \ ++ { \ ++ INVOKE (CODE1, CODE2); \ ++ return z0; \ ++ } ++ ++#define TEST_DUAL_Z_REV(NAME, TYPE1, TYPE2, CODE1, CODE2) \ ++ PROTO (NAME, TYPE1, (TYPE2 z0, TYPE2 z1, TYPE2 z2, TYPE2 z3, \ ++ TYPE1 z4, TYPE1 z5, TYPE1 z6, TYPE1 z7, \ ++ svbool_t p0, svbool_t p1)) \ ++ { \ ++ TYPE1 z0_res; \ ++ INVOKE (CODE1, CODE2); \ ++ return z0_res; \ ++ } ++ ++#define TEST_TRIPLE_Z(NAME, TYPE1, TYPE2, TYPE3, CODE1, CODE2) \ ++ PROTO (NAME, TYPE1, (TYPE1 z0, TYPE1 z1, TYPE2 z2, TYPE2 z3, \ ++ TYPE3 z4, TYPE3 z5, \ ++ svbool_t p0, svbool_t p1)) \ ++ { \ ++ INVOKE (CODE1, CODE2); \ ++ return z0; \ ++ } ++ ++#define TEST_TRIPLE_Z_REV2(NAME, TYPE1, TYPE2, TYPE3, CODE1, CODE2)\ ++ PROTO (NAME, TYPE1, (TYPE2 z0, TYPE2 z1, TYPE1 z2, TYPE1 z3, \ ++ TYPE3 z4, TYPE3 z5, \ ++ svbool_t p0, svbool_t p1)) \ ++ { \ ++ TYPE1 z0_res; \ ++ INVOKE (CODE1, CODE2); \ ++ return z0_res; \ ++ } ++ ++#define TEST_TRIPLE_Z_REV(NAME, TYPE1, TYPE2, TYPE3, CODE1, CODE2)\ ++ PROTO (NAME, TYPE1, (TYPE3 z0, TYPE3 z1, TYPE2 z2, TYPE2 z3, \ ++ TYPE1 z4, TYPE1 z5, \ ++ svbool_t p0, svbool_t p1)) \ ++ { \ ++ TYPE1 z0_res; \ ++ INVOKE (CODE1, CODE2); \ ++ return z0_res; \ ++ } ++ ++#define TEST_DUAL_LANE_REG(NAME, ZTYPE1, ZTYPE2, REG, CODE1, CODE2) \ ++ PROTO (NAME, void, (void)) \ ++ { \ ++ register ZTYPE1 z0 __asm ("z0"); \ ++ register ZTYPE2 z1 __asm ("z1"); \ ++ register ZTYPE2 REG __asm (#REG); \ ++ __asm volatile ("" : "=w" (z0), "=w" (z1), "=w" (REG)); \ ++ INVOKE (CODE1, CODE2); \ ++ __asm volatile ("" :: "w" (z0)); \ ++ } ++ ++#define TEST_TYPE_CHANGE_Z(NAME, TYPE1, TYPE2, CODE1, CODE2) \ ++ PROTO (NAME, TYPE1, (TYPE2 z0, TYPE2 z1, TYPE2 z2, TYPE2 z3, \ ++ svbool_t p0, svbool_t p1)) \ ++ { \ ++ TYPE1 z0_res; \ ++ INVOKE (CODE1, CODE2); \ ++ return z0_res; \ ++ } ++ ++#define TEST_TRIPLE_LANE_REG(NAME, ZTYPE1, ZTYPE2, ZTYPE3, REG, CODE1, CODE2) \ ++ PROTO (NAME, void, (void)) \ ++ { \ ++ register ZTYPE1 z0 __asm ("z0"); \ ++ register ZTYPE2 z1 __asm ("z1"); \ ++ register ZTYPE3 REG __asm (#REG); \ ++ __asm volatile ("" : "=w" (z0), "=w" (z1), "=w" (REG)); \ ++ INVOKE (CODE1, CODE2); \ ++ __asm volatile ("" :: "w" (z0)); \ ++ } ++ ++#define TEST_TRIPLE_ZX(NAME, TYPE1, TYPE2, TYPE3, CODE1, CODE2) \ ++ PROTO (NAME, TYPE1, (TYPE1 z0, TYPE1 z1, TYPE2 z2, TYPE2 z3, \ ++ TYPE3 x0, TYPE3 x1, \ ++ svbool_t p0, svbool_t p1)) \ ++ { \ ++ INVOKE (CODE1, CODE2); \ ++ return z0; \ ++ } ++ ++#define TEST_UNIFORM_ZX(NAME, ZTYPE, STYPE, CODE1, CODE2) \ ++ PROTO (NAME, ZTYPE, (ZTYPE z0, ZTYPE z1, ZTYPE z2, ZTYPE z3, \ ++ svbool_t p0, STYPE x0)) \ ++ { \ ++ INVOKE (CODE1, CODE2); \ ++ return z0; \ ++ } ++ ++#define TEST_UNIFORM_ZD(NAME, ZTYPE, STYPE, CODE1, CODE2) \ ++ PROTO (NAME, ZTYPE, (ZTYPE z0, ZTYPE z1, ZTYPE z2, ZTYPE z3, \ ++ svbool_t p0, STYPE d4)) \ ++ { \ ++ INVOKE (CODE1, CODE2); \ ++ return z0; \ ++ } ++ ++#define TEST_UNIFORM_PS(NAME, CODE1, CODE2) \ ++ PROTO (NAME, svbool_t, (svbool_t p0, svbool_t p1, \ ++ svbool_t p2, svbool_t p3, bool x0)) \ ++ { \ ++ INVOKE (CODE1, CODE2); \ ++ return p0; \ ++ } ++ ++#define 
TEST_DUAL_ZD(NAME, ZTYPE1, ZTYPE2, STYPE, CODE1, CODE2) \ ++ PROTO (NAME, ZTYPE1, (ZTYPE1 z0, ZTYPE1 z1, ZTYPE1 z2, \ ++ ZTYPE1 z3, ZTYPE2 z4, ZTYPE2 z5, \ ++ ZTYPE2 z6, STYPE d7, svbool_t p0, \ ++ svbool_t p1)) \ ++ { \ ++ INVOKE (CODE1, CODE2); \ ++ return z0; \ ++ } ++ ++#define TEST_DUAL_ZX(NAME, ZTYPE1, ZTYPE2, STYPE, CODE1, CODE2) \ ++ PROTO (NAME, ZTYPE1, (ZTYPE1 z0, ZTYPE1 z1, ZTYPE1 z2, \ ++ ZTYPE1 z3, ZTYPE2 z4, ZTYPE2 z5, \ ++ ZTYPE2 z6, ZTYPE2 z7, svbool_t p0, \ ++ svbool_t p1, STYPE x0)) \ ++ { \ ++ INVOKE (CODE1, CODE2); \ ++ return z0; \ ++ } ++ ++#define TEST_TYPE_CHANGE_ZX(NAME, ZTYPE1, ZTYPE2, STYPE, CODE1, CODE2) \ ++ PROTO (NAME, ZTYPE1, (ZTYPE2 z0, ZTYPE2 z1, ZTYPE2 z2, \ ++ ZTYPE2 z3, svbool_t p0, svbool_t p1, \ ++ STYPE x0)) \ ++ { \ ++ ZTYPE1 z0_res; \ ++ INVOKE (CODE1, CODE2); \ ++ return z0_res; \ ++ } ++ ++#define TEST_LOAD(NAME, ZTYPE, STYPE, CODE1, CODE2) \ ++ PROTO (NAME, ZTYPE, (svbool_t p0, const STYPE *x0, \ ++ intptr_t x1)) \ ++ { \ ++ ZTYPE z0; \ ++ INVOKE (CODE1, CODE2); \ ++ return z0; \ ++ } ++ ++#define TEST_LOAD_GATHER_SZ(NAME, RES_TYPE, STYPE, ZTYPE, CODE1, CODE2) \ ++ PROTO (NAME, RES_TYPE, (ZTYPE z0, ZTYPE z1, svbool_t p0, \ ++ const STYPE *x0)) \ ++ { \ ++ RES_TYPE z0_res; \ ++ INVOKE (CODE1, CODE2); \ ++ return z0_res; \ ++ } ++ ++#define TEST_LOAD_GATHER_ZS(NAME, RES_TYPE, ZTYPE, CODE1, CODE2) \ ++ PROTO (NAME, RES_TYPE, (ZTYPE z0, ZTYPE z1, svbool_t p0, \ ++ int64_t x0)) \ ++ { \ ++ RES_TYPE z0_res; \ ++ INVOKE (CODE1, CODE2); \ ++ return z0_res; \ ++ } ++ ++#define TEST_PREFETCH(NAME, STYPE, CODE1, CODE2) \ ++ PROTO (NAME, void, (svbool_t p0, const STYPE *x0, \ ++ intptr_t x1)) \ ++ { \ ++ INVOKE (CODE1, CODE2); \ ++ } ++ ++#define TEST_PREFETCH_GATHER_SZ(NAME, ZTYPE, CODE1, CODE2) \ ++ PROTO (NAME, void, (ZTYPE z0, ZTYPE z1, svbool_t p0, \ ++ const void *x0)) \ ++ { \ ++ INVOKE (CODE1, CODE2); \ ++ } ++ ++#define TEST_PREFETCH_GATHER_ZS(NAME, ZTYPE, CODE1, CODE2) \ ++ PROTO (NAME, void, (ZTYPE z0, ZTYPE z1, svbool_t p0, \ ++ int64_t x0)) \ ++ { \ ++ INVOKE (CODE1, CODE2); \ ++ } ++ ++#define TEST_STORE(NAME, ZTYPE, STYPE, CODE1, CODE2) \ ++ PROTO (NAME, void, (ZTYPE z0, svbool_t p0, STYPE *x0, \ ++ intptr_t x1)) \ ++ { \ ++ INVOKE (CODE1, CODE2); \ ++ } ++ ++#define TEST_STORE_SCATTER_SZ(NAME, DATA_TYPE, STYPE, ZTYPE, CODE1, CODE2) \ ++ PROTO (NAME, void, (DATA_TYPE z0, ZTYPE z1, svbool_t p0, \ ++ STYPE *x0)) \ ++ { \ ++ INVOKE (CODE1, CODE2); \ ++ } ++ ++#define TEST_STORE_SCATTER_ZS(NAME, DATA_TYPE, ZTYPE, CODE1, CODE2) \ ++ PROTO (NAME, void, (DATA_TYPE z0, ZTYPE z1, svbool_t p0, \ ++ int64_t x0)) \ ++ { \ ++ INVOKE (CODE1, CODE2); \ ++ } ++ ++#define TEST_P(NAME, CODE1, CODE2) \ ++ PROTO (NAME, svbool_t, (void)) \ ++ { \ ++ svbool_t p0; \ ++ INVOKE (CODE1, CODE2); \ ++ return p0; \ ++ } ++ ++#define TEST_PTEST(NAME, TYPE, CODE) \ ++ PROTO (NAME, TYPE, (svbool_t p0, svbool_t p1, svbool_t p2, \ ++ svbool_t p3, TYPE x0, TYPE x1)) \ ++ { \ ++ INVOKE (CODE, CODE); \ ++ return x0; \ ++ } ++ ++#define TEST_COMPARE_S(NAME, TYPE, CODE1, CODE2) \ ++ PROTO (NAME, svbool_t, (TYPE x0, TYPE x1)) \ ++ { \ ++ svbool_t p0; \ ++ INVOKE (CODE1, CODE2); \ ++ return p0; \ ++ } ++ ++#define TEST_COMPARE_Z(NAME, TYPE, CODE1, CODE2) \ ++ PROTO (NAME, svbool_t, (TYPE z0, TYPE z1, \ ++ svbool_t p0, svbool_t p1)) \ ++ { \ ++ INVOKE (CODE1, CODE2); \ ++ return p0; \ ++ } ++ ++#define TEST_COMPARE_ZX(NAME, ZTYPE, STYPE, CODE1, CODE2) \ ++ PROTO (NAME, svbool_t, (ZTYPE z0, ZTYPE z1, svbool_t p0, \ ++ svbool_t p1, STYPE x0)) \ ++ { \ ++ INVOKE (CODE1, CODE2); \ ++ 
return p0; \ ++ } ++ ++#define TEST_COMPARE_ZD(NAME, ZTYPE, STYPE, CODE1, CODE2) \ ++ PROTO (NAME, svbool_t, (ZTYPE z0, ZTYPE z1, ZTYPE z2, \ ++ ZTYPE z3, svbool_t p0, svbool_t p1, \ ++ STYPE d4)) \ ++ { \ ++ INVOKE (CODE1, CODE2); \ ++ return p0; \ ++ } ++ ++#define TEST_COMPARE_DUAL_Z(NAME, TYPE1, TYPE2, CODE1, CODE2) \ ++ PROTO (NAME, svbool_t, (TYPE1 z0, TYPE2 z1, \ ++ svbool_t p0, svbool_t p1)) \ ++ { \ ++ INVOKE (CODE1, CODE2); \ ++ return p0; \ ++ } ++ ++#define TEST_REDUCTION_X(NAME, STYPE, ZTYPE, CODE1, CODE2) \ ++ PROTO (NAME, STYPE, (ZTYPE z0, ZTYPE z1, svbool_t p0)) \ ++ { \ ++ STYPE x0; \ ++ INVOKE (CODE1, CODE2); \ ++ return x0; \ ++ } ++ ++#define TEST_REDUCTION_D(NAME, STYPE, ZTYPE, CODE1, CODE2) \ ++ PROTO (NAME, STYPE, (ZTYPE z0, ZTYPE z1, svbool_t p0)) \ ++ { \ ++ STYPE d0; \ ++ INVOKE (CODE1, CODE2); \ ++ return d0; \ ++ } ++ ++#define TEST_FOLD_LEFT_D(NAME, STYPE, ZTYPE, CODE1, CODE2) \ ++ PROTO (NAME, STYPE, (STYPE d0, STYPE d1, ZTYPE z2, \ ++ svbool_t p0)) \ ++ { \ ++ INVOKE (CODE1, CODE2); \ ++ return d0; \ ++ } ++ ++#define TEST_FOLD_LEFT_X(NAME, STYPE, ZTYPE, CODE1, CODE2) \ ++ PROTO (NAME, STYPE, (STYPE x0, STYPE x1, ZTYPE z0, \ ++ svbool_t p0)) \ ++ { \ ++ INVOKE (CODE1, CODE2); \ ++ return x0; \ ++ } ++ ++#define TEST_S(NAME, ZTYPE, STYPE, CODE) \ ++ PROTO (NAME, ZTYPE, (STYPE x0, STYPE x1)) \ ++ { \ ++ ZTYPE z0; \ ++ CODE; \ ++ return z0; \ ++ } ++ ++#define TEST_ADR(NAME, TYPE1, TYPE2, CODE1, CODE2) \ ++ PROTO (NAME, TYPE1, (TYPE1 z0, TYPE2 z1)) \ ++ { \ ++ INVOKE (CODE1, CODE2); \ ++ return z0; \ ++ } ++ ++#define TEST_UNDEF(NAME, TYPE, CODE) \ ++ PROTO (NAME, TYPE, (void)) \ ++ { \ ++ TYPE z0; \ ++ CODE; \ ++ return z0; \ ++ } ++ ++#define TEST_CREATE(NAME, TTYPE, ZTYPE, CODE1, CODE2) \ ++ PROTO (NAME, TTYPE, (ZTYPE unused0, ZTYPE unused1, \ ++ ZTYPE unused2, ZTYPE unused3, \ ++ ZTYPE z4, ZTYPE z5, ZTYPE z6, ZTYPE z7)) \ ++ { \ ++ TTYPE z0; \ ++ INVOKE (CODE1, CODE2); \ ++ return z0; \ ++ } ++ ++#define TEST_GET(NAME, TTYPE, ZTYPE, CODE1, CODE2) \ ++ PROTO (NAME, void, (ZTYPE unused0, ZTYPE unused1, \ ++ ZTYPE unused2, ZTYPE unused3, TTYPE z4)) \ ++ { \ ++ register ZTYPE z0 __asm ("z0"); \ ++ register ZTYPE z4_res __asm ("z4"); \ ++ register ZTYPE z5_res __asm ("z5"); \ ++ register ZTYPE z6_res __asm ("z6"); \ ++ register ZTYPE z7_res __asm ("z7"); \ ++ INVOKE (CODE1, CODE2); \ ++ __asm volatile ("" :: "w" (z0), "w" (z4_res), "w" (z5_res), \ ++ "w" (z6_res), "w" (z7_res)); \ ++ } ++ ++#define TEST_SET(NAME, TTYPE, ZTYPE, CODE1, CODE2) \ ++ PROTO (NAME, void, (ZTYPE z0, ZTYPE z1, ZTYPE z2, ZTYPE z3, \ ++ TTYPE z4)) \ ++ { \ ++ register TTYPE z24 __asm ("z24"); \ ++ INVOKE (CODE1, CODE2); \ ++ __asm volatile ("" :: "w" (z4), "w" (z24)); \ ++ } ++ ++#define TEST_TBL2(NAME, TTYPE, ZTYPE, UTYPE, CODE1, CODE2) \ ++ PROTO (NAME, ZTYPE, (TTYPE z0, TTYPE z2, UTYPE z4)) \ ++ { \ ++ register ZTYPE z0_res __asm ("z0"); \ ++ INVOKE (CODE1, CODE2); \ ++ return z0_res; \ ++ } ++ ++#define TEST_TBL2_REV(NAME, TTYPE, ZTYPE, UTYPE, CODE1, CODE2) \ ++ PROTO (NAME, ZTYPE, (UTYPE z0, TTYPE z1, TTYPE z3)) \ ++ { \ ++ register ZTYPE z0_res __asm ("z0"); \ ++ INVOKE (CODE1, CODE2); \ ++ return z0_res; \ ++ } ++ ++#endif +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tmad_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tmad_f16.c +new file mode 100644 +index 000000000..3a00716e3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tmad_f16.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include 
"test_sve_acle.h" ++ ++/* ++** tmad_0_f16_tied1: ++** ftmad z0\.h, z0\.h, z1\.h, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (tmad_0_f16_tied1, svfloat16_t, ++ z0 = svtmad_f16 (z0, z1, 0), ++ z0 = svtmad (z0, z1, 0)) ++ ++/* ++** tmad_0_f16_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** ftmad z0\.h, z0\.h, \1\.h, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (tmad_0_f16_tied2, svfloat16_t, ++ z0 = svtmad_f16 (z1, z0, 0), ++ z0 = svtmad (z1, z0, 0)) ++ ++/* ++** tmad_0_f16_untied: ++** movprfx z0, z1 ++** ftmad z0\.h, z0\.h, z2\.h, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (tmad_0_f16_untied, svfloat16_t, ++ z0 = svtmad_f16 (z1, z2, 0), ++ z0 = svtmad (z1, z2, 0)) ++ ++/* ++** tmad_1_f16: ++** ftmad z0\.h, z0\.h, z1\.h, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (tmad_1_f16, svfloat16_t, ++ z0 = svtmad_f16 (z0, z1, 1), ++ z0 = svtmad (z0, z1, 1)) ++ ++/* ++** tmad_2_f16: ++** ftmad z0\.h, z0\.h, z1\.h, #2 ++** ret ++*/ ++TEST_UNIFORM_Z (tmad_2_f16, svfloat16_t, ++ z0 = svtmad_f16 (z0, z1, 2), ++ z0 = svtmad (z0, z1, 2)) ++ ++/* ++** tmad_3_f16: ++** ftmad z0\.h, z0\.h, z1\.h, #3 ++** ret ++*/ ++TEST_UNIFORM_Z (tmad_3_f16, svfloat16_t, ++ z0 = svtmad_f16 (z0, z1, 3), ++ z0 = svtmad (z0, z1, 3)) ++ ++/* ++** tmad_4_f16: ++** ftmad z0\.h, z0\.h, z1\.h, #4 ++** ret ++*/ ++TEST_UNIFORM_Z (tmad_4_f16, svfloat16_t, ++ z0 = svtmad_f16 (z0, z1, 4), ++ z0 = svtmad (z0, z1, 4)) ++ ++/* ++** tmad_5_f16: ++** ftmad z0\.h, z0\.h, z1\.h, #5 ++** ret ++*/ ++TEST_UNIFORM_Z (tmad_5_f16, svfloat16_t, ++ z0 = svtmad_f16 (z0, z1, 5), ++ z0 = svtmad (z0, z1, 5)) ++ ++/* ++** tmad_6_f16: ++** ftmad z0\.h, z0\.h, z1\.h, #6 ++** ret ++*/ ++TEST_UNIFORM_Z (tmad_6_f16, svfloat16_t, ++ z0 = svtmad_f16 (z0, z1, 6), ++ z0 = svtmad (z0, z1, 6)) ++ ++/* ++** tmad_7_f16: ++** ftmad z0\.h, z0\.h, z1\.h, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (tmad_7_f16, svfloat16_t, ++ z0 = svtmad_f16 (z0, z1, 7), ++ z0 = svtmad (z0, z1, 7)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tmad_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tmad_f32.c +new file mode 100644 +index 000000000..b73d420fb +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tmad_f32.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** tmad_0_f32_tied1: ++** ftmad z0\.s, z0\.s, z1\.s, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (tmad_0_f32_tied1, svfloat32_t, ++ z0 = svtmad_f32 (z0, z1, 0), ++ z0 = svtmad (z0, z1, 0)) ++ ++/* ++** tmad_0_f32_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z1 ++** ftmad z0\.s, z0\.s, \1\.s, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (tmad_0_f32_tied2, svfloat32_t, ++ z0 = svtmad_f32 (z1, z0, 0), ++ z0 = svtmad (z1, z0, 0)) ++ ++/* ++** tmad_0_f32_untied: ++** movprfx z0, z1 ++** ftmad z0\.s, z0\.s, z2\.s, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (tmad_0_f32_untied, svfloat32_t, ++ z0 = svtmad_f32 (z1, z2, 0), ++ z0 = svtmad (z1, z2, 0)) ++ ++/* ++** tmad_1_f32: ++** ftmad z0\.s, z0\.s, z1\.s, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (tmad_1_f32, svfloat32_t, ++ z0 = svtmad_f32 (z0, z1, 1), ++ z0 = svtmad (z0, z1, 1)) ++ ++/* ++** tmad_2_f32: ++** ftmad z0\.s, z0\.s, z1\.s, #2 ++** ret ++*/ ++TEST_UNIFORM_Z (tmad_2_f32, svfloat32_t, ++ z0 = svtmad_f32 (z0, z1, 2), ++ z0 = svtmad (z0, z1, 2)) ++ ++/* ++** tmad_3_f32: ++** ftmad z0\.s, z0\.s, z1\.s, #3 ++** ret ++*/ ++TEST_UNIFORM_Z (tmad_3_f32, svfloat32_t, ++ z0 = svtmad_f32 (z0, z1, 3), ++ z0 = svtmad (z0, z1, 3)) ++ ++/* ++** tmad_4_f32: ++** ftmad z0\.s, z0\.s, z1\.s, #4 ++** ret ++*/ ++TEST_UNIFORM_Z (tmad_4_f32, 
svfloat32_t, ++ z0 = svtmad_f32 (z0, z1, 4), ++ z0 = svtmad (z0, z1, 4)) ++ ++/* ++** tmad_5_f32: ++** ftmad z0\.s, z0\.s, z1\.s, #5 ++** ret ++*/ ++TEST_UNIFORM_Z (tmad_5_f32, svfloat32_t, ++ z0 = svtmad_f32 (z0, z1, 5), ++ z0 = svtmad (z0, z1, 5)) ++ ++/* ++** tmad_6_f32: ++** ftmad z0\.s, z0\.s, z1\.s, #6 ++** ret ++*/ ++TEST_UNIFORM_Z (tmad_6_f32, svfloat32_t, ++ z0 = svtmad_f32 (z0, z1, 6), ++ z0 = svtmad (z0, z1, 6)) ++ ++/* ++** tmad_7_f32: ++** ftmad z0\.s, z0\.s, z1\.s, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (tmad_7_f32, svfloat32_t, ++ z0 = svtmad_f32 (z0, z1, 7), ++ z0 = svtmad (z0, z1, 7)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tmad_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tmad_f64.c +new file mode 100644 +index 000000000..fc31928a6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tmad_f64.c +@@ -0,0 +1,96 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** tmad_0_f64_tied1: ++** ftmad z0\.d, z0\.d, z1\.d, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (tmad_0_f64_tied1, svfloat64_t, ++ z0 = svtmad_f64 (z0, z1, 0), ++ z0 = svtmad (z0, z1, 0)) ++ ++/* ++** tmad_0_f64_tied2: ++** mov (z[0-9]+\.d), z0\.d ++** movprfx z0, z1 ++** ftmad z0\.d, z0\.d, \1, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (tmad_0_f64_tied2, svfloat64_t, ++ z0 = svtmad_f64 (z1, z0, 0), ++ z0 = svtmad (z1, z0, 0)) ++ ++/* ++** tmad_0_f64_untied: ++** movprfx z0, z1 ++** ftmad z0\.d, z0\.d, z2\.d, #0 ++** ret ++*/ ++TEST_UNIFORM_Z (tmad_0_f64_untied, svfloat64_t, ++ z0 = svtmad_f64 (z1, z2, 0), ++ z0 = svtmad (z1, z2, 0)) ++ ++/* ++** tmad_1_f64: ++** ftmad z0\.d, z0\.d, z1\.d, #1 ++** ret ++*/ ++TEST_UNIFORM_Z (tmad_1_f64, svfloat64_t, ++ z0 = svtmad_f64 (z0, z1, 1), ++ z0 = svtmad (z0, z1, 1)) ++ ++/* ++** tmad_2_f64: ++** ftmad z0\.d, z0\.d, z1\.d, #2 ++** ret ++*/ ++TEST_UNIFORM_Z (tmad_2_f64, svfloat64_t, ++ z0 = svtmad_f64 (z0, z1, 2), ++ z0 = svtmad (z0, z1, 2)) ++ ++/* ++** tmad_3_f64: ++** ftmad z0\.d, z0\.d, z1\.d, #3 ++** ret ++*/ ++TEST_UNIFORM_Z (tmad_3_f64, svfloat64_t, ++ z0 = svtmad_f64 (z0, z1, 3), ++ z0 = svtmad (z0, z1, 3)) ++ ++/* ++** tmad_4_f64: ++** ftmad z0\.d, z0\.d, z1\.d, #4 ++** ret ++*/ ++TEST_UNIFORM_Z (tmad_4_f64, svfloat64_t, ++ z0 = svtmad_f64 (z0, z1, 4), ++ z0 = svtmad (z0, z1, 4)) ++ ++/* ++** tmad_5_f64: ++** ftmad z0\.d, z0\.d, z1\.d, #5 ++** ret ++*/ ++TEST_UNIFORM_Z (tmad_5_f64, svfloat64_t, ++ z0 = svtmad_f64 (z0, z1, 5), ++ z0 = svtmad (z0, z1, 5)) ++ ++/* ++** tmad_6_f64: ++** ftmad z0\.d, z0\.d, z1\.d, #6 ++** ret ++*/ ++TEST_UNIFORM_Z (tmad_6_f64, svfloat64_t, ++ z0 = svtmad_f64 (z0, z1, 6), ++ z0 = svtmad (z0, z1, 6)) ++ ++/* ++** tmad_7_f64: ++** ftmad z0\.d, z0\.d, z1\.d, #7 ++** ret ++*/ ++TEST_UNIFORM_Z (tmad_7_f64, svfloat64_t, ++ z0 = svtmad_f64 (z0, z1, 7), ++ z0 = svtmad (z0, z1, 7)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_b16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_b16.c +new file mode 100644 +index 000000000..902f8c397 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_b16.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn1_b16_tied1: ++** trn1 p0\.h, p0\.h, p1\.h ++** ret ++*/ ++TEST_UNIFORM_P (trn1_b16_tied1, ++ p0 = svtrn1_b16 (p0, p1), ++ p0 = svtrn1_b16 (p0, p1)) ++ ++/* ++** trn1_b16_tied2: ++** trn1 p0\.h, p1\.h, p0\.h ++** ret ++*/ ++TEST_UNIFORM_P (trn1_b16_tied2, ++ p0 = svtrn1_b16 (p1, p0), ++ p0 = svtrn1_b16 
(p1, p0)) ++ ++/* ++** trn1_b16_untied: ++** trn1 p0\.h, p1\.h, p2\.h ++** ret ++*/ ++TEST_UNIFORM_P (trn1_b16_untied, ++ p0 = svtrn1_b16 (p1, p2), ++ p0 = svtrn1_b16 (p1, p2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_b32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_b32.c +new file mode 100644 +index 000000000..8c9ed5152 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_b32.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn1_b32_tied1: ++** trn1 p0\.s, p0\.s, p1\.s ++** ret ++*/ ++TEST_UNIFORM_P (trn1_b32_tied1, ++ p0 = svtrn1_b32 (p0, p1), ++ p0 = svtrn1_b32 (p0, p1)) ++ ++/* ++** trn1_b32_tied2: ++** trn1 p0\.s, p1\.s, p0\.s ++** ret ++*/ ++TEST_UNIFORM_P (trn1_b32_tied2, ++ p0 = svtrn1_b32 (p1, p0), ++ p0 = svtrn1_b32 (p1, p0)) ++ ++/* ++** trn1_b32_untied: ++** trn1 p0\.s, p1\.s, p2\.s ++** ret ++*/ ++TEST_UNIFORM_P (trn1_b32_untied, ++ p0 = svtrn1_b32 (p1, p2), ++ p0 = svtrn1_b32 (p1, p2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_b64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_b64.c +new file mode 100644 +index 000000000..55b00571d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_b64.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn1_b64_tied1: ++** trn1 p0\.d, p0\.d, p1\.d ++** ret ++*/ ++TEST_UNIFORM_P (trn1_b64_tied1, ++ p0 = svtrn1_b64 (p0, p1), ++ p0 = svtrn1_b64 (p0, p1)) ++ ++/* ++** trn1_b64_tied2: ++** trn1 p0\.d, p1\.d, p0\.d ++** ret ++*/ ++TEST_UNIFORM_P (trn1_b64_tied2, ++ p0 = svtrn1_b64 (p1, p0), ++ p0 = svtrn1_b64 (p1, p0)) ++ ++/* ++** trn1_b64_untied: ++** trn1 p0\.d, p1\.d, p2\.d ++** ret ++*/ ++TEST_UNIFORM_P (trn1_b64_untied, ++ p0 = svtrn1_b64 (p1, p2), ++ p0 = svtrn1_b64 (p1, p2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_b8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_b8.c +new file mode 100644 +index 000000000..4b5e80fbe +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_b8.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn1_b8_tied1: ++** trn1 p0\.b, p0\.b, p1\.b ++** ret ++*/ ++TEST_UNIFORM_P (trn1_b8_tied1, ++ p0 = svtrn1_b8 (p0, p1), ++ p0 = svtrn1_b8 (p0, p1)) ++ ++/* ++** trn1_b8_tied2: ++** trn1 p0\.b, p1\.b, p0\.b ++** ret ++*/ ++TEST_UNIFORM_P (trn1_b8_tied2, ++ p0 = svtrn1_b8 (p1, p0), ++ p0 = svtrn1_b8 (p1, p0)) ++ ++/* ++** trn1_b8_untied: ++** trn1 p0\.b, p1\.b, p2\.b ++** ret ++*/ ++TEST_UNIFORM_P (trn1_b8_untied, ++ p0 = svtrn1_b8 (p1, p2), ++ p0 = svtrn1_b8 (p1, p2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_bf16.c +new file mode 100644 +index 000000000..b04c7da4f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_bf16.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn1_bf16_tied1: ++** trn1 z0\.h, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (trn1_bf16_tied1, svbfloat16_t, ++ z0 = svtrn1_bf16 (z0, z1), ++ z0 = svtrn1 (z0, z1)) ++ ++/* ++** trn1_bf16_tied2: ++** trn1 z0\.h, z1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (trn1_bf16_tied2, svbfloat16_t, ++ z0 = svtrn1_bf16 (z1, z0), ++ z0 = svtrn1 (z1, z0)) ++ ++/* ++** trn1_bf16_untied: 
++** trn1 z0\.h, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (trn1_bf16_untied, svbfloat16_t, ++ z0 = svtrn1_bf16 (z1, z2), ++ z0 = svtrn1 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_f16.c +new file mode 100644 +index 000000000..373eb9dd9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_f16.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn1_f16_tied1: ++** trn1 z0\.h, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (trn1_f16_tied1, svfloat16_t, ++ z0 = svtrn1_f16 (z0, z1), ++ z0 = svtrn1 (z0, z1)) ++ ++/* ++** trn1_f16_tied2: ++** trn1 z0\.h, z1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (trn1_f16_tied2, svfloat16_t, ++ z0 = svtrn1_f16 (z1, z0), ++ z0 = svtrn1 (z1, z0)) ++ ++/* ++** trn1_f16_untied: ++** trn1 z0\.h, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (trn1_f16_untied, svfloat16_t, ++ z0 = svtrn1_f16 (z1, z2), ++ z0 = svtrn1 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_f32.c +new file mode 100644 +index 000000000..ccd84d94e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_f32.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn1_f32_tied1: ++** trn1 z0\.s, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (trn1_f32_tied1, svfloat32_t, ++ z0 = svtrn1_f32 (z0, z1), ++ z0 = svtrn1 (z0, z1)) ++ ++/* ++** trn1_f32_tied2: ++** trn1 z0\.s, z1\.s, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (trn1_f32_tied2, svfloat32_t, ++ z0 = svtrn1_f32 (z1, z0), ++ z0 = svtrn1 (z1, z0)) ++ ++/* ++** trn1_f32_untied: ++** trn1 z0\.s, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (trn1_f32_untied, svfloat32_t, ++ z0 = svtrn1_f32 (z1, z2), ++ z0 = svtrn1 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_f64.c +new file mode 100644 +index 000000000..d3cc51948 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_f64.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn1_f64_tied1: ++** trn1 z0\.d, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (trn1_f64_tied1, svfloat64_t, ++ z0 = svtrn1_f64 (z0, z1), ++ z0 = svtrn1 (z0, z1)) ++ ++/* ++** trn1_f64_tied2: ++** trn1 z0\.d, z1\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (trn1_f64_tied2, svfloat64_t, ++ z0 = svtrn1_f64 (z1, z0), ++ z0 = svtrn1 (z1, z0)) ++ ++/* ++** trn1_f64_untied: ++** trn1 z0\.d, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (trn1_f64_untied, svfloat64_t, ++ z0 = svtrn1_f64 (z1, z2), ++ z0 = svtrn1 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_s16.c +new file mode 100644 +index 000000000..466bb8c02 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_s16.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn1_s16_tied1: ++** trn1 z0\.h, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (trn1_s16_tied1, svint16_t, ++ z0 = svtrn1_s16 (z0, z1), ++ z0 = svtrn1 (z0, z1)) ++ ++/* ++** trn1_s16_tied2: ++** trn1 z0\.h, z1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (trn1_s16_tied2, svint16_t, ++ z0 = svtrn1_s16 (z1, z0), ++ 
z0 = svtrn1 (z1, z0)) ++ ++/* ++** trn1_s16_untied: ++** trn1 z0\.h, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (trn1_s16_untied, svint16_t, ++ z0 = svtrn1_s16 (z1, z2), ++ z0 = svtrn1 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_s32.c +new file mode 100644 +index 000000000..24655e622 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_s32.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn1_s32_tied1: ++** trn1 z0\.s, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (trn1_s32_tied1, svint32_t, ++ z0 = svtrn1_s32 (z0, z1), ++ z0 = svtrn1 (z0, z1)) ++ ++/* ++** trn1_s32_tied2: ++** trn1 z0\.s, z1\.s, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (trn1_s32_tied2, svint32_t, ++ z0 = svtrn1_s32 (z1, z0), ++ z0 = svtrn1 (z1, z0)) ++ ++/* ++** trn1_s32_untied: ++** trn1 z0\.s, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (trn1_s32_untied, svint32_t, ++ z0 = svtrn1_s32 (z1, z2), ++ z0 = svtrn1 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_s64.c +new file mode 100644 +index 000000000..553fb610b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_s64.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn1_s64_tied1: ++** trn1 z0\.d, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (trn1_s64_tied1, svint64_t, ++ z0 = svtrn1_s64 (z0, z1), ++ z0 = svtrn1 (z0, z1)) ++ ++/* ++** trn1_s64_tied2: ++** trn1 z0\.d, z1\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (trn1_s64_tied2, svint64_t, ++ z0 = svtrn1_s64 (z1, z0), ++ z0 = svtrn1 (z1, z0)) ++ ++/* ++** trn1_s64_untied: ++** trn1 z0\.d, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (trn1_s64_untied, svint64_t, ++ z0 = svtrn1_s64 (z1, z2), ++ z0 = svtrn1 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_s8.c +new file mode 100644 +index 000000000..1fa150792 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_s8.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn1_s8_tied1: ++** trn1 z0\.b, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (trn1_s8_tied1, svint8_t, ++ z0 = svtrn1_s8 (z0, z1), ++ z0 = svtrn1 (z0, z1)) ++ ++/* ++** trn1_s8_tied2: ++** trn1 z0\.b, z1\.b, z0\.b ++** ret ++*/ ++TEST_UNIFORM_Z (trn1_s8_tied2, svint8_t, ++ z0 = svtrn1_s8 (z1, z0), ++ z0 = svtrn1 (z1, z0)) ++ ++/* ++** trn1_s8_untied: ++** trn1 z0\.b, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (trn1_s8_untied, svint8_t, ++ z0 = svtrn1_s8 (z1, z2), ++ z0 = svtrn1 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_u16.c +new file mode 100644 +index 000000000..a3ce936f3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_u16.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn1_u16_tied1: ++** trn1 z0\.h, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (trn1_u16_tied1, svuint16_t, ++ z0 = svtrn1_u16 (z0, z1), ++ z0 = svtrn1 (z0, z1)) ++ ++/* ++** trn1_u16_tied2: ++** trn1 z0\.h, z1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (trn1_u16_tied2, svuint16_t, ++ z0 = 
svtrn1_u16 (z1, z0), ++ z0 = svtrn1 (z1, z0)) ++ ++/* ++** trn1_u16_untied: ++** trn1 z0\.h, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (trn1_u16_untied, svuint16_t, ++ z0 = svtrn1_u16 (z1, z2), ++ z0 = svtrn1 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_u32.c +new file mode 100644 +index 000000000..b14d7a67a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_u32.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn1_u32_tied1: ++** trn1 z0\.s, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (trn1_u32_tied1, svuint32_t, ++ z0 = svtrn1_u32 (z0, z1), ++ z0 = svtrn1 (z0, z1)) ++ ++/* ++** trn1_u32_tied2: ++** trn1 z0\.s, z1\.s, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (trn1_u32_tied2, svuint32_t, ++ z0 = svtrn1_u32 (z1, z0), ++ z0 = svtrn1 (z1, z0)) ++ ++/* ++** trn1_u32_untied: ++** trn1 z0\.s, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (trn1_u32_untied, svuint32_t, ++ z0 = svtrn1_u32 (z1, z2), ++ z0 = svtrn1 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_u64.c +new file mode 100644 +index 000000000..2ccda1d72 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_u64.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn1_u64_tied1: ++** trn1 z0\.d, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (trn1_u64_tied1, svuint64_t, ++ z0 = svtrn1_u64 (z0, z1), ++ z0 = svtrn1 (z0, z1)) ++ ++/* ++** trn1_u64_tied2: ++** trn1 z0\.d, z1\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (trn1_u64_tied2, svuint64_t, ++ z0 = svtrn1_u64 (z1, z0), ++ z0 = svtrn1 (z1, z0)) ++ ++/* ++** trn1_u64_untied: ++** trn1 z0\.d, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (trn1_u64_untied, svuint64_t, ++ z0 = svtrn1_u64 (z1, z2), ++ z0 = svtrn1 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_u8.c +new file mode 100644 +index 000000000..84f8d31e8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_u8.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn1_u8_tied1: ++** trn1 z0\.b, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (trn1_u8_tied1, svuint8_t, ++ z0 = svtrn1_u8 (z0, z1), ++ z0 = svtrn1 (z0, z1)) ++ ++/* ++** trn1_u8_tied2: ++** trn1 z0\.b, z1\.b, z0\.b ++** ret ++*/ ++TEST_UNIFORM_Z (trn1_u8_tied2, svuint8_t, ++ z0 = svtrn1_u8 (z1, z0), ++ z0 = svtrn1 (z1, z0)) ++ ++/* ++** trn1_u8_untied: ++** trn1 z0\.b, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (trn1_u8_untied, svuint8_t, ++ z0 = svtrn1_u8 (z1, z2), ++ z0 = svtrn1 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_bf16.c +new file mode 100644 +index 000000000..f1810da9e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_bf16.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn1q_bf16_tied1: ++** trn1 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn1q_bf16_tied1, svbfloat16_t, ++ z0 = svtrn1q_bf16 
(z0, z1), ++ z0 = svtrn1q (z0, z1)) ++ ++/* ++** trn1q_bf16_tied2: ++** trn1 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn1q_bf16_tied2, svbfloat16_t, ++ z0 = svtrn1q_bf16 (z1, z0), ++ z0 = svtrn1q (z1, z0)) ++ ++/* ++** trn1q_bf16_untied: ++** trn1 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn1q_bf16_untied, svbfloat16_t, ++ z0 = svtrn1q_bf16 (z1, z2), ++ z0 = svtrn1q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_f16.c +new file mode 100644 +index 000000000..6420d0f0a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_f16.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn1q_f16_tied1: ++** trn1 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn1q_f16_tied1, svfloat16_t, ++ z0 = svtrn1q_f16 (z0, z1), ++ z0 = svtrn1q (z0, z1)) ++ ++/* ++** trn1q_f16_tied2: ++** trn1 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn1q_f16_tied2, svfloat16_t, ++ z0 = svtrn1q_f16 (z1, z0), ++ z0 = svtrn1q (z1, z0)) ++ ++/* ++** trn1q_f16_untied: ++** trn1 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn1q_f16_untied, svfloat16_t, ++ z0 = svtrn1q_f16 (z1, z2), ++ z0 = svtrn1q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_f32.c +new file mode 100644 +index 000000000..6fb2eecf5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_f32.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn1q_f32_tied1: ++** trn1 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn1q_f32_tied1, svfloat32_t, ++ z0 = svtrn1q_f32 (z0, z1), ++ z0 = svtrn1q (z0, z1)) ++ ++/* ++** trn1q_f32_tied2: ++** trn1 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn1q_f32_tied2, svfloat32_t, ++ z0 = svtrn1q_f32 (z1, z0), ++ z0 = svtrn1q (z1, z0)) ++ ++/* ++** trn1q_f32_untied: ++** trn1 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn1q_f32_untied, svfloat32_t, ++ z0 = svtrn1q_f32 (z1, z2), ++ z0 = svtrn1q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_f64.c +new file mode 100644 +index 000000000..e786a8d04 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_f64.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn1q_f64_tied1: ++** trn1 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn1q_f64_tied1, svfloat64_t, ++ z0 = svtrn1q_f64 (z0, z1), ++ z0 = svtrn1q (z0, z1)) ++ ++/* ++** trn1q_f64_tied2: ++** trn1 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn1q_f64_tied2, svfloat64_t, ++ z0 = svtrn1q_f64 (z1, z0), ++ z0 = svtrn1q (z1, z0)) ++ ++/* ++** trn1q_f64_untied: ++** trn1 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn1q_f64_untied, svfloat64_t, ++ z0 = svtrn1q_f64 (z1, z2), ++ z0 = svtrn1q (z1, z2)) +diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_s16.c +new file mode 100644 +index 000000000..548360719 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_s16.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn1q_s16_tied1: ++** trn1 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn1q_s16_tied1, svint16_t, ++ z0 = svtrn1q_s16 (z0, z1), ++ z0 = svtrn1q (z0, z1)) ++ ++/* ++** trn1q_s16_tied2: ++** trn1 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn1q_s16_tied2, svint16_t, ++ z0 = svtrn1q_s16 (z1, z0), ++ z0 = svtrn1q (z1, z0)) ++ ++/* ++** trn1q_s16_untied: ++** trn1 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn1q_s16_untied, svint16_t, ++ z0 = svtrn1q_s16 (z1, z2), ++ z0 = svtrn1q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_s32.c +new file mode 100644 +index 000000000..ccb8319f7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_s32.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn1q_s32_tied1: ++** trn1 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn1q_s32_tied1, svint32_t, ++ z0 = svtrn1q_s32 (z0, z1), ++ z0 = svtrn1q (z0, z1)) ++ ++/* ++** trn1q_s32_tied2: ++** trn1 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn1q_s32_tied2, svint32_t, ++ z0 = svtrn1q_s32 (z1, z0), ++ z0 = svtrn1q (z1, z0)) ++ ++/* ++** trn1q_s32_untied: ++** trn1 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn1q_s32_untied, svint32_t, ++ z0 = svtrn1q_s32 (z1, z2), ++ z0 = svtrn1q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_s64.c +new file mode 100644 +index 000000000..fe8125a8a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_s64.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn1q_s64_tied1: ++** trn1 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn1q_s64_tied1, svint64_t, ++ z0 = svtrn1q_s64 (z0, z1), ++ z0 = svtrn1q (z0, z1)) ++ ++/* ++** trn1q_s64_tied2: ++** trn1 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn1q_s64_tied2, svint64_t, ++ z0 = svtrn1q_s64 (z1, z0), ++ z0 = svtrn1q (z1, z0)) ++ ++/* ++** trn1q_s64_untied: ++** trn1 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn1q_s64_untied, svint64_t, ++ z0 = svtrn1q_s64 (z1, z2), ++ z0 = svtrn1q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_s8.c +new file mode 100644 +index 000000000..48040c1ad +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_s8.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ 
++#include "test_sve_acle.h" ++ ++/* ++** trn1q_s8_tied1: ++** trn1 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn1q_s8_tied1, svint8_t, ++ z0 = svtrn1q_s8 (z0, z1), ++ z0 = svtrn1q (z0, z1)) ++ ++/* ++** trn1q_s8_tied2: ++** trn1 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn1q_s8_tied2, svint8_t, ++ z0 = svtrn1q_s8 (z1, z0), ++ z0 = svtrn1q (z1, z0)) ++ ++/* ++** trn1q_s8_untied: ++** trn1 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn1q_s8_untied, svint8_t, ++ z0 = svtrn1q_s8 (z1, z2), ++ z0 = svtrn1q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_u16.c +new file mode 100644 +index 000000000..3657f919e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_u16.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn1q_u16_tied1: ++** trn1 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn1q_u16_tied1, svuint16_t, ++ z0 = svtrn1q_u16 (z0, z1), ++ z0 = svtrn1q (z0, z1)) ++ ++/* ++** trn1q_u16_tied2: ++** trn1 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn1q_u16_tied2, svuint16_t, ++ z0 = svtrn1q_u16 (z1, z0), ++ z0 = svtrn1q (z1, z0)) ++ ++/* ++** trn1q_u16_untied: ++** trn1 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn1q_u16_untied, svuint16_t, ++ z0 = svtrn1q_u16 (z1, z2), ++ z0 = svtrn1q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_u32.c +new file mode 100644 +index 000000000..cc5ea2878 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_u32.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn1q_u32_tied1: ++** trn1 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn1q_u32_tied1, svuint32_t, ++ z0 = svtrn1q_u32 (z0, z1), ++ z0 = svtrn1q (z0, z1)) ++ ++/* ++** trn1q_u32_tied2: ++** trn1 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn1q_u32_tied2, svuint32_t, ++ z0 = svtrn1q_u32 (z1, z0), ++ z0 = svtrn1q (z1, z0)) ++ ++/* ++** trn1q_u32_untied: ++** trn1 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn1q_u32_untied, svuint32_t, ++ z0 = svtrn1q_u32 (z1, z2), ++ z0 = svtrn1q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_u64.c +new file mode 100644 +index 000000000..4435b53d0 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_u64.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn1q_u64_tied1: ++** trn1 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn1q_u64_tied1, svuint64_t, ++ z0 = svtrn1q_u64 (z0, z1), ++ z0 = svtrn1q (z0, z1)) ++ ++/* ++** trn1q_u64_tied2: ++** trn1 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn1q_u64_tied2, svuint64_t, ++ z0 = svtrn1q_u64 (z1, z0), ++ z0 = svtrn1q (z1, z0)) ++ ++/* ++** trn1q_u64_untied: ++** trn1 z0\.q, z1\.q, z2\.q ++** ret ++*/ 
++TEST_UNIFORM_Z (trn1q_u64_untied, svuint64_t, ++ z0 = svtrn1q_u64 (z1, z2), ++ z0 = svtrn1q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_u8.c +new file mode 100644 +index 000000000..4ebfedbea +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_u8.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn1q_u8_tied1: ++** trn1 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn1q_u8_tied1, svuint8_t, ++ z0 = svtrn1q_u8 (z0, z1), ++ z0 = svtrn1q (z0, z1)) ++ ++/* ++** trn1q_u8_tied2: ++** trn1 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn1q_u8_tied2, svuint8_t, ++ z0 = svtrn1q_u8 (z1, z0), ++ z0 = svtrn1q (z1, z0)) ++ ++/* ++** trn1q_u8_untied: ++** trn1 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn1q_u8_untied, svuint8_t, ++ z0 = svtrn1q_u8 (z1, z2), ++ z0 = svtrn1q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_b16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_b16.c +new file mode 100644 +index 000000000..54b593afe +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_b16.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn2_b16_tied1: ++** trn2 p0\.h, p0\.h, p1\.h ++** ret ++*/ ++TEST_UNIFORM_P (trn2_b16_tied1, ++ p0 = svtrn2_b16 (p0, p1), ++ p0 = svtrn2_b16 (p0, p1)) ++ ++/* ++** trn2_b16_tied2: ++** trn2 p0\.h, p1\.h, p0\.h ++** ret ++*/ ++TEST_UNIFORM_P (trn2_b16_tied2, ++ p0 = svtrn2_b16 (p1, p0), ++ p0 = svtrn2_b16 (p1, p0)) ++ ++/* ++** trn2_b16_untied: ++** trn2 p0\.h, p1\.h, p2\.h ++** ret ++*/ ++TEST_UNIFORM_P (trn2_b16_untied, ++ p0 = svtrn2_b16 (p1, p2), ++ p0 = svtrn2_b16 (p1, p2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_b32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_b32.c +new file mode 100644 +index 000000000..ead3d85cf +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_b32.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn2_b32_tied1: ++** trn2 p0\.s, p0\.s, p1\.s ++** ret ++*/ ++TEST_UNIFORM_P (trn2_b32_tied1, ++ p0 = svtrn2_b32 (p0, p1), ++ p0 = svtrn2_b32 (p0, p1)) ++ ++/* ++** trn2_b32_tied2: ++** trn2 p0\.s, p1\.s, p0\.s ++** ret ++*/ ++TEST_UNIFORM_P (trn2_b32_tied2, ++ p0 = svtrn2_b32 (p1, p0), ++ p0 = svtrn2_b32 (p1, p0)) ++ ++/* ++** trn2_b32_untied: ++** trn2 p0\.s, p1\.s, p2\.s ++** ret ++*/ ++TEST_UNIFORM_P (trn2_b32_untied, ++ p0 = svtrn2_b32 (p1, p2), ++ p0 = svtrn2_b32 (p1, p2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_b64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_b64.c +new file mode 100644 +index 000000000..ccca03557 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_b64.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn2_b64_tied1: ++** trn2 p0\.d, p0\.d, p1\.d ++** ret ++*/ ++TEST_UNIFORM_P (trn2_b64_tied1, ++ p0 = svtrn2_b64 (p0, p1), ++ p0 = svtrn2_b64 (p0, p1)) ++ ++/* ++** trn2_b64_tied2: ++** trn2 p0\.d, p1\.d, p0\.d ++** ret ++*/ ++TEST_UNIFORM_P (trn2_b64_tied2, ++ p0 = svtrn2_b64 (p1, p0), ++ 
p0 = svtrn2_b64 (p1, p0)) ++ ++/* ++** trn2_b64_untied: ++** trn2 p0\.d, p1\.d, p2\.d ++** ret ++*/ ++TEST_UNIFORM_P (trn2_b64_untied, ++ p0 = svtrn2_b64 (p1, p2), ++ p0 = svtrn2_b64 (p1, p2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_b8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_b8.c +new file mode 100644 +index 000000000..7b0803e79 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_b8.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn2_b8_tied1: ++** trn2 p0\.b, p0\.b, p1\.b ++** ret ++*/ ++TEST_UNIFORM_P (trn2_b8_tied1, ++ p0 = svtrn2_b8 (p0, p1), ++ p0 = svtrn2_b8 (p0, p1)) ++ ++/* ++** trn2_b8_tied2: ++** trn2 p0\.b, p1\.b, p0\.b ++** ret ++*/ ++TEST_UNIFORM_P (trn2_b8_tied2, ++ p0 = svtrn2_b8 (p1, p0), ++ p0 = svtrn2_b8 (p1, p0)) ++ ++/* ++** trn2_b8_untied: ++** trn2 p0\.b, p1\.b, p2\.b ++** ret ++*/ ++TEST_UNIFORM_P (trn2_b8_untied, ++ p0 = svtrn2_b8 (p1, p2), ++ p0 = svtrn2_b8 (p1, p2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_bf16.c +new file mode 100644 +index 000000000..12028b0f6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_bf16.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn2_bf16_tied1: ++** trn2 z0\.h, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (trn2_bf16_tied1, svbfloat16_t, ++ z0 = svtrn2_bf16 (z0, z1), ++ z0 = svtrn2 (z0, z1)) ++ ++/* ++** trn2_bf16_tied2: ++** trn2 z0\.h, z1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (trn2_bf16_tied2, svbfloat16_t, ++ z0 = svtrn2_bf16 (z1, z0), ++ z0 = svtrn2 (z1, z0)) ++ ++/* ++** trn2_bf16_untied: ++** trn2 z0\.h, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (trn2_bf16_untied, svbfloat16_t, ++ z0 = svtrn2_bf16 (z1, z2), ++ z0 = svtrn2 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_f16.c +new file mode 100644 +index 000000000..112567725 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_f16.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn2_f16_tied1: ++** trn2 z0\.h, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (trn2_f16_tied1, svfloat16_t, ++ z0 = svtrn2_f16 (z0, z1), ++ z0 = svtrn2 (z0, z1)) ++ ++/* ++** trn2_f16_tied2: ++** trn2 z0\.h, z1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (trn2_f16_tied2, svfloat16_t, ++ z0 = svtrn2_f16 (z1, z0), ++ z0 = svtrn2 (z1, z0)) ++ ++/* ++** trn2_f16_untied: ++** trn2 z0\.h, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (trn2_f16_untied, svfloat16_t, ++ z0 = svtrn2_f16 (z1, z2), ++ z0 = svtrn2 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_f32.c +new file mode 100644 +index 000000000..daee566cc +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_f32.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn2_f32_tied1: ++** trn2 z0\.s, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (trn2_f32_tied1, svfloat32_t, ++ z0 = svtrn2_f32 (z0, z1), ++ z0 = svtrn2 (z0, z1)) ++ ++/* ++** trn2_f32_tied2: ++** trn2 z0\.s, z1\.s, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (trn2_f32_tied2, svfloat32_t, ++ z0 = 
svtrn2_f32 (z1, z0), ++ z0 = svtrn2 (z1, z0)) ++ ++/* ++** trn2_f32_untied: ++** trn2 z0\.s, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (trn2_f32_untied, svfloat32_t, ++ z0 = svtrn2_f32 (z1, z2), ++ z0 = svtrn2 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_f64.c +new file mode 100644 +index 000000000..338fee49f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_f64.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn2_f64_tied1: ++** trn2 z0\.d, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (trn2_f64_tied1, svfloat64_t, ++ z0 = svtrn2_f64 (z0, z1), ++ z0 = svtrn2 (z0, z1)) ++ ++/* ++** trn2_f64_tied2: ++** trn2 z0\.d, z1\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (trn2_f64_tied2, svfloat64_t, ++ z0 = svtrn2_f64 (z1, z0), ++ z0 = svtrn2 (z1, z0)) ++ ++/* ++** trn2_f64_untied: ++** trn2 z0\.d, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (trn2_f64_untied, svfloat64_t, ++ z0 = svtrn2_f64 (z1, z2), ++ z0 = svtrn2 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_s16.c +new file mode 100644 +index 000000000..93f63de5e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_s16.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn2_s16_tied1: ++** trn2 z0\.h, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (trn2_s16_tied1, svint16_t, ++ z0 = svtrn2_s16 (z0, z1), ++ z0 = svtrn2 (z0, z1)) ++ ++/* ++** trn2_s16_tied2: ++** trn2 z0\.h, z1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (trn2_s16_tied2, svint16_t, ++ z0 = svtrn2_s16 (z1, z0), ++ z0 = svtrn2 (z1, z0)) ++ ++/* ++** trn2_s16_untied: ++** trn2 z0\.h, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (trn2_s16_untied, svint16_t, ++ z0 = svtrn2_s16 (z1, z2), ++ z0 = svtrn2 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_s32.c +new file mode 100644 +index 000000000..82edd72f7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_s32.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn2_s32_tied1: ++** trn2 z0\.s, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (trn2_s32_tied1, svint32_t, ++ z0 = svtrn2_s32 (z0, z1), ++ z0 = svtrn2 (z0, z1)) ++ ++/* ++** trn2_s32_tied2: ++** trn2 z0\.s, z1\.s, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (trn2_s32_tied2, svint32_t, ++ z0 = svtrn2_s32 (z1, z0), ++ z0 = svtrn2 (z1, z0)) ++ ++/* ++** trn2_s32_untied: ++** trn2 z0\.s, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (trn2_s32_untied, svint32_t, ++ z0 = svtrn2_s32 (z1, z2), ++ z0 = svtrn2 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_s64.c +new file mode 100644 +index 000000000..5f43441d5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_s64.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn2_s64_tied1: ++** trn2 z0\.d, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (trn2_s64_tied1, svint64_t, ++ z0 = svtrn2_s64 (z0, z1), ++ z0 = svtrn2 (z0, z1)) ++ ++/* ++** trn2_s64_tied2: ++** trn2 z0\.d, z1\.d, z0\.d ++** ret ++*/ 
++TEST_UNIFORM_Z (trn2_s64_tied2, svint64_t, ++ z0 = svtrn2_s64 (z1, z0), ++ z0 = svtrn2 (z1, z0)) ++ ++/* ++** trn2_s64_untied: ++** trn2 z0\.d, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (trn2_s64_untied, svint64_t, ++ z0 = svtrn2_s64 (z1, z2), ++ z0 = svtrn2 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_s8.c +new file mode 100644 +index 000000000..716538119 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_s8.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn2_s8_tied1: ++** trn2 z0\.b, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (trn2_s8_tied1, svint8_t, ++ z0 = svtrn2_s8 (z0, z1), ++ z0 = svtrn2 (z0, z1)) ++ ++/* ++** trn2_s8_tied2: ++** trn2 z0\.b, z1\.b, z0\.b ++** ret ++*/ ++TEST_UNIFORM_Z (trn2_s8_tied2, svint8_t, ++ z0 = svtrn2_s8 (z1, z0), ++ z0 = svtrn2 (z1, z0)) ++ ++/* ++** trn2_s8_untied: ++** trn2 z0\.b, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (trn2_s8_untied, svint8_t, ++ z0 = svtrn2_s8 (z1, z2), ++ z0 = svtrn2 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_u16.c +new file mode 100644 +index 000000000..e68d233b8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_u16.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn2_u16_tied1: ++** trn2 z0\.h, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (trn2_u16_tied1, svuint16_t, ++ z0 = svtrn2_u16 (z0, z1), ++ z0 = svtrn2 (z0, z1)) ++ ++/* ++** trn2_u16_tied2: ++** trn2 z0\.h, z1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (trn2_u16_tied2, svuint16_t, ++ z0 = svtrn2_u16 (z1, z0), ++ z0 = svtrn2 (z1, z0)) ++ ++/* ++** trn2_u16_untied: ++** trn2 z0\.h, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (trn2_u16_untied, svuint16_t, ++ z0 = svtrn2_u16 (z1, z2), ++ z0 = svtrn2 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_u32.c +new file mode 100644 +index 000000000..e48aad179 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_u32.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn2_u32_tied1: ++** trn2 z0\.s, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (trn2_u32_tied1, svuint32_t, ++ z0 = svtrn2_u32 (z0, z1), ++ z0 = svtrn2 (z0, z1)) ++ ++/* ++** trn2_u32_tied2: ++** trn2 z0\.s, z1\.s, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (trn2_u32_tied2, svuint32_t, ++ z0 = svtrn2_u32 (z1, z0), ++ z0 = svtrn2 (z1, z0)) ++ ++/* ++** trn2_u32_untied: ++** trn2 z0\.s, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (trn2_u32_untied, svuint32_t, ++ z0 = svtrn2_u32 (z1, z2), ++ z0 = svtrn2 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_u64.c +new file mode 100644 +index 000000000..aa452275b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_u64.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn2_u64_tied1: ++** trn2 z0\.d, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (trn2_u64_tied1, svuint64_t, ++ z0 = svtrn2_u64 (z0, z1), ++ z0 = svtrn2 (z0, z1)) ++ ++/* ++** trn2_u64_tied2: ++** trn2 z0\.d, 
z1\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (trn2_u64_tied2, svuint64_t, ++ z0 = svtrn2_u64 (z1, z0), ++ z0 = svtrn2 (z1, z0)) ++ ++/* ++** trn2_u64_untied: ++** trn2 z0\.d, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (trn2_u64_untied, svuint64_t, ++ z0 = svtrn2_u64 (z1, z2), ++ z0 = svtrn2 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_u8.c +new file mode 100644 +index 000000000..cb26b2338 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_u8.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn2_u8_tied1: ++** trn2 z0\.b, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (trn2_u8_tied1, svuint8_t, ++ z0 = svtrn2_u8 (z0, z1), ++ z0 = svtrn2 (z0, z1)) ++ ++/* ++** trn2_u8_tied2: ++** trn2 z0\.b, z1\.b, z0\.b ++** ret ++*/ ++TEST_UNIFORM_Z (trn2_u8_tied2, svuint8_t, ++ z0 = svtrn2_u8 (z1, z0), ++ z0 = svtrn2 (z1, z0)) ++ ++/* ++** trn2_u8_untied: ++** trn2 z0\.b, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (trn2_u8_untied, svuint8_t, ++ z0 = svtrn2_u8 (z1, z2), ++ z0 = svtrn2 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_bf16.c +new file mode 100644 +index 000000000..5623b54f0 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_bf16.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn2q_bf16_tied1: ++** trn2 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn2q_bf16_tied1, svbfloat16_t, ++ z0 = svtrn2q_bf16 (z0, z1), ++ z0 = svtrn2q (z0, z1)) ++ ++/* ++** trn2q_bf16_tied2: ++** trn2 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn2q_bf16_tied2, svbfloat16_t, ++ z0 = svtrn2q_bf16 (z1, z0), ++ z0 = svtrn2q (z1, z0)) ++ ++/* ++** trn2q_bf16_untied: ++** trn2 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn2q_bf16_untied, svbfloat16_t, ++ z0 = svtrn2q_bf16 (z1, z2), ++ z0 = svtrn2q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_f16.c +new file mode 100644 +index 000000000..db2190929 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_f16.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn2q_f16_tied1: ++** trn2 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn2q_f16_tied1, svfloat16_t, ++ z0 = svtrn2q_f16 (z0, z1), ++ z0 = svtrn2q (z0, z1)) ++ ++/* ++** trn2q_f16_tied2: ++** trn2 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn2q_f16_tied2, svfloat16_t, ++ z0 = svtrn2q_f16 (z1, z0), ++ z0 = svtrn2q (z1, z0)) ++ ++/* ++** trn2q_f16_untied: ++** trn2 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn2q_f16_untied, svfloat16_t, ++ z0 = svtrn2q_f16 (z1, z2), ++ z0 = svtrn2q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_f32.c +new file mode 100644 +index 000000000..1367a1e06 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_f32.c +@@ -0,0 +1,32 @@ 
++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn2q_f32_tied1: ++** trn2 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn2q_f32_tied1, svfloat32_t, ++ z0 = svtrn2q_f32 (z0, z1), ++ z0 = svtrn2q (z0, z1)) ++ ++/* ++** trn2q_f32_tied2: ++** trn2 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn2q_f32_tied2, svfloat32_t, ++ z0 = svtrn2q_f32 (z1, z0), ++ z0 = svtrn2q (z1, z0)) ++ ++/* ++** trn2q_f32_untied: ++** trn2 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn2q_f32_untied, svfloat32_t, ++ z0 = svtrn2q_f32 (z1, z2), ++ z0 = svtrn2q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_f64.c +new file mode 100644 +index 000000000..54325e705 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_f64.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn2q_f64_tied1: ++** trn2 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn2q_f64_tied1, svfloat64_t, ++ z0 = svtrn2q_f64 (z0, z1), ++ z0 = svtrn2q (z0, z1)) ++ ++/* ++** trn2q_f64_tied2: ++** trn2 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn2q_f64_tied2, svfloat64_t, ++ z0 = svtrn2q_f64 (z1, z0), ++ z0 = svtrn2q (z1, z0)) ++ ++/* ++** trn2q_f64_untied: ++** trn2 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn2q_f64_untied, svfloat64_t, ++ z0 = svtrn2q_f64 (z1, z2), ++ z0 = svtrn2q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_s16.c +new file mode 100644 +index 000000000..a0b641278 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_s16.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn2q_s16_tied1: ++** trn2 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn2q_s16_tied1, svint16_t, ++ z0 = svtrn2q_s16 (z0, z1), ++ z0 = svtrn2q (z0, z1)) ++ ++/* ++** trn2q_s16_tied2: ++** trn2 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn2q_s16_tied2, svint16_t, ++ z0 = svtrn2q_s16 (z1, z0), ++ z0 = svtrn2q (z1, z0)) ++ ++/* ++** trn2q_s16_untied: ++** trn2 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn2q_s16_untied, svint16_t, ++ z0 = svtrn2q_s16 (z1, z2), ++ z0 = svtrn2q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_s32.c +new file mode 100644 +index 000000000..7c128c6ef +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_s32.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn2q_s32_tied1: ++** trn2 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn2q_s32_tied1, svint32_t, ++ z0 = svtrn2q_s32 (z0, z1), ++ z0 = svtrn2q (z0, z1)) ++ ++/* ++** trn2q_s32_tied2: ++** trn2 z0\.q, 
z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn2q_s32_tied2, svint32_t, ++ z0 = svtrn2q_s32 (z1, z0), ++ z0 = svtrn2q (z1, z0)) ++ ++/* ++** trn2q_s32_untied: ++** trn2 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn2q_s32_untied, svint32_t, ++ z0 = svtrn2q_s32 (z1, z2), ++ z0 = svtrn2q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_s64.c +new file mode 100644 +index 000000000..f22222525 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_s64.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn2q_s64_tied1: ++** trn2 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn2q_s64_tied1, svint64_t, ++ z0 = svtrn2q_s64 (z0, z1), ++ z0 = svtrn2q (z0, z1)) ++ ++/* ++** trn2q_s64_tied2: ++** trn2 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn2q_s64_tied2, svint64_t, ++ z0 = svtrn2q_s64 (z1, z0), ++ z0 = svtrn2q (z1, z0)) ++ ++/* ++** trn2q_s64_untied: ++** trn2 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn2q_s64_untied, svint64_t, ++ z0 = svtrn2q_s64 (z1, z2), ++ z0 = svtrn2q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_s8.c +new file mode 100644 +index 000000000..bd5243f35 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_s8.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn2q_s8_tied1: ++** trn2 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn2q_s8_tied1, svint8_t, ++ z0 = svtrn2q_s8 (z0, z1), ++ z0 = svtrn2q (z0, z1)) ++ ++/* ++** trn2q_s8_tied2: ++** trn2 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn2q_s8_tied2, svint8_t, ++ z0 = svtrn2q_s8 (z1, z0), ++ z0 = svtrn2q (z1, z0)) ++ ++/* ++** trn2q_s8_untied: ++** trn2 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn2q_s8_untied, svint8_t, ++ z0 = svtrn2q_s8 (z1, z2), ++ z0 = svtrn2q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_u16.c +new file mode 100644 +index 000000000..8da8563b2 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_u16.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn2q_u16_tied1: ++** trn2 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn2q_u16_tied1, svuint16_t, ++ z0 = svtrn2q_u16 (z0, z1), ++ z0 = svtrn2q (z0, z1)) ++ ++/* ++** trn2q_u16_tied2: ++** trn2 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn2q_u16_tied2, svuint16_t, ++ z0 = svtrn2q_u16 (z1, z0), ++ z0 = svtrn2q (z1, z0)) ++ ++/* ++** trn2q_u16_untied: ++** trn2 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn2q_u16_untied, svuint16_t, ++ z0 = svtrn2q_u16 (z1, z2), ++ z0 = svtrn2q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_u32.c +new file mode 100644 +index 
000000000..6c0af02da +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_u32.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn2q_u32_tied1: ++** trn2 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn2q_u32_tied1, svuint32_t, ++ z0 = svtrn2q_u32 (z0, z1), ++ z0 = svtrn2q (z0, z1)) ++ ++/* ++** trn2q_u32_tied2: ++** trn2 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn2q_u32_tied2, svuint32_t, ++ z0 = svtrn2q_u32 (z1, z0), ++ z0 = svtrn2q (z1, z0)) ++ ++/* ++** trn2q_u32_untied: ++** trn2 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn2q_u32_untied, svuint32_t, ++ z0 = svtrn2q_u32 (z1, z2), ++ z0 = svtrn2q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_u64.c +new file mode 100644 +index 000000000..857595cbb +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_u64.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn2q_u64_tied1: ++** trn2 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn2q_u64_tied1, svuint64_t, ++ z0 = svtrn2q_u64 (z0, z1), ++ z0 = svtrn2q (z0, z1)) ++ ++/* ++** trn2q_u64_tied2: ++** trn2 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn2q_u64_tied2, svuint64_t, ++ z0 = svtrn2q_u64 (z1, z0), ++ z0 = svtrn2q (z1, z0)) ++ ++/* ++** trn2q_u64_untied: ++** trn2 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn2q_u64_untied, svuint64_t, ++ z0 = svtrn2q_u64 (z1, z2), ++ z0 = svtrn2q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_u8.c +new file mode 100644 +index 000000000..1fb85b249 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_u8.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** trn2q_u8_tied1: ++** trn2 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn2q_u8_tied1, svuint8_t, ++ z0 = svtrn2q_u8 (z0, z1), ++ z0 = svtrn2q (z0, z1)) ++ ++/* ++** trn2q_u8_tied2: ++** trn2 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn2q_u8_tied2, svuint8_t, ++ z0 = svtrn2q_u8 (z1, z0), ++ z0 = svtrn2q (z1, z0)) ++ ++/* ++** trn2q_u8_untied: ++** trn2 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (trn2q_u8_untied, svuint8_t, ++ z0 = svtrn2q_u8 (z1, z2), ++ z0 = svtrn2q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tsmul_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tsmul_f16.c +new file mode 100644 +index 000000000..94bc696eb +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tsmul_f16.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** tsmul_f16_tied1: ++** ftsmul z0\.h, z0\.h, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (tsmul_f16_tied1, svfloat16_t, svuint16_t, ++ z0 = svtsmul_f16 (z0, z4), ++ z0 = svtsmul (z0, z4)) ++ ++/* ++** tsmul_f16_tied2: ++** ftsmul z0\.h, z4\.h, 
z0\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (tsmul_f16_tied2, svfloat16_t, svuint16_t, ++ z0_res = svtsmul_f16 (z4, z0), ++ z0_res = svtsmul (z4, z0)) ++ ++/* ++** tsmul_f16_untied: ++** ftsmul z0\.h, z1\.h, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (tsmul_f16_untied, svfloat16_t, svuint16_t, ++ z0 = svtsmul_f16 (z1, z4), ++ z0 = svtsmul (z1, z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tsmul_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tsmul_f32.c +new file mode 100644 +index 000000000..d0ec91882 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tsmul_f32.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** tsmul_f32_tied1: ++** ftsmul z0\.s, z0\.s, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (tsmul_f32_tied1, svfloat32_t, svuint32_t, ++ z0 = svtsmul_f32 (z0, z4), ++ z0 = svtsmul (z0, z4)) ++ ++/* ++** tsmul_f32_tied2: ++** ftsmul z0\.s, z4\.s, z0\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (tsmul_f32_tied2, svfloat32_t, svuint32_t, ++ z0_res = svtsmul_f32 (z4, z0), ++ z0_res = svtsmul (z4, z0)) ++ ++/* ++** tsmul_f32_untied: ++** ftsmul z0\.s, z1\.s, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (tsmul_f32_untied, svfloat32_t, svuint32_t, ++ z0 = svtsmul_f32 (z1, z4), ++ z0 = svtsmul (z1, z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tsmul_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tsmul_f64.c +new file mode 100644 +index 000000000..23e0da3f7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tsmul_f64.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** tsmul_f64_tied1: ++** ftsmul z0\.d, z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (tsmul_f64_tied1, svfloat64_t, svuint64_t, ++ z0 = svtsmul_f64 (z0, z4), ++ z0 = svtsmul (z0, z4)) ++ ++/* ++** tsmul_f64_tied2: ++** ftsmul z0\.d, z4\.d, z0\.d ++** ret ++*/ ++TEST_DUAL_Z_REV (tsmul_f64_tied2, svfloat64_t, svuint64_t, ++ z0_res = svtsmul_f64 (z4, z0), ++ z0_res = svtsmul (z4, z0)) ++ ++/* ++** tsmul_f64_untied: ++** ftsmul z0\.d, z1\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (tsmul_f64_untied, svfloat64_t, svuint64_t, ++ z0 = svtsmul_f64 (z1, z4), ++ z0 = svtsmul (z1, z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tssel_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tssel_f16.c +new file mode 100644 +index 000000000..e7c3ea03b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tssel_f16.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** tssel_f16_tied1: ++** ftssel z0\.h, z0\.h, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (tssel_f16_tied1, svfloat16_t, svuint16_t, ++ z0 = svtssel_f16 (z0, z4), ++ z0 = svtssel (z0, z4)) ++ ++/* ++** tssel_f16_tied2: ++** ftssel z0\.h, z4\.h, z0\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (tssel_f16_tied2, svfloat16_t, svuint16_t, ++ z0_res = svtssel_f16 (z4, z0), ++ z0_res = svtssel (z4, z0)) ++ ++/* ++** tssel_f16_untied: ++** ftssel z0\.h, z1\.h, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (tssel_f16_untied, svfloat16_t, svuint16_t, ++ z0 = svtssel_f16 (z1, z4), ++ z0 = svtssel (z1, z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tssel_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tssel_f32.c +new file mode 100644 +index 000000000..022573a19 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tssel_f32.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" 
"-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** tssel_f32_tied1: ++** ftssel z0\.s, z0\.s, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (tssel_f32_tied1, svfloat32_t, svuint32_t, ++ z0 = svtssel_f32 (z0, z4), ++ z0 = svtssel (z0, z4)) ++ ++/* ++** tssel_f32_tied2: ++** ftssel z0\.s, z4\.s, z0\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (tssel_f32_tied2, svfloat32_t, svuint32_t, ++ z0_res = svtssel_f32 (z4, z0), ++ z0_res = svtssel (z4, z0)) ++ ++/* ++** tssel_f32_untied: ++** ftssel z0\.s, z1\.s, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (tssel_f32_untied, svfloat32_t, svuint32_t, ++ z0 = svtssel_f32 (z1, z4), ++ z0 = svtssel (z1, z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tssel_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tssel_f64.c +new file mode 100644 +index 000000000..ffcdf4224 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tssel_f64.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** tssel_f64_tied1: ++** ftssel z0\.d, z0\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (tssel_f64_tied1, svfloat64_t, svuint64_t, ++ z0 = svtssel_f64 (z0, z4), ++ z0 = svtssel (z0, z4)) ++ ++/* ++** tssel_f64_tied2: ++** ftssel z0\.d, z4\.d, z0\.d ++** ret ++*/ ++TEST_DUAL_Z_REV (tssel_f64_tied2, svfloat64_t, svuint64_t, ++ z0_res = svtssel_f64 (z4, z0), ++ z0_res = svtssel (z4, z0)) ++ ++/* ++** tssel_f64_untied: ++** ftssel z0\.d, z1\.d, z4\.d ++** ret ++*/ ++TEST_DUAL_Z (tssel_f64_untied, svfloat64_t, svuint64_t, ++ z0 = svtssel_f64 (z1, z4), ++ z0 = svtssel (z1, z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/undef2_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/undef2_1.c +new file mode 100644 +index 000000000..fe6c4c7c7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/undef2_1.c +@@ -0,0 +1,87 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** int8: ++** ret ++*/ ++TEST_UNDEF (int8, svint8x2_t, ++ z0 = svundef2_s8 ()) ++ ++/* ++** uint8: ++** ret ++*/ ++TEST_UNDEF (uint8, svuint8x2_t, ++ z0 = svundef2_u8 ()) ++ ++/* ++** int16: ++** ret ++*/ ++TEST_UNDEF (int16, svint16x2_t, ++ z0 = svundef2_s16 ()) ++ ++/* ++** uint16: ++** ret ++*/ ++TEST_UNDEF (uint16, svuint16x2_t, ++ z0 = svundef2_u16 ()) ++ ++/* ++** float16: ++** ret ++*/ ++TEST_UNDEF (float16, svfloat16x2_t, ++ z0 = svundef2_f16 ()) ++ ++/* ++** bfloat16: ++** ret ++*/ ++TEST_UNDEF (bfloat16, svbfloat16x2_t, ++ z0 = svundef2_bf16 ()) ++ ++/* ++** int32: ++** ret ++*/ ++TEST_UNDEF (int32, svint32x2_t, ++ z0 = svundef2_s32 ()) ++ ++/* ++** uint32: ++** ret ++*/ ++TEST_UNDEF (uint32, svuint32x2_t, ++ z0 = svundef2_u32 ()) ++ ++/* ++** float32: ++** ret ++*/ ++TEST_UNDEF (float32, svfloat32x2_t, ++ z0 = svundef2_f32 ()) ++ ++/* ++** int64: ++** ret ++*/ ++TEST_UNDEF (int64, svint64x2_t, ++ z0 = svundef2_s64 ()) ++ ++/* ++** uint64: ++** ret ++*/ ++TEST_UNDEF (uint64, svuint64x2_t, ++ z0 = svundef2_u64 ()) ++ ++/* ++** float64: ++** ret ++*/ ++TEST_UNDEF (float64, svfloat64x2_t, ++ z0 = svundef2_f64 ()) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/undef3_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/undef3_1.c +new file mode 100644 +index 000000000..5c18c6317 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/undef3_1.c +@@ -0,0 +1,87 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** int8: ++** ret ++*/ ++TEST_UNDEF (int8, 
svint8x3_t, ++ z0 = svundef3_s8 ()) ++ ++/* ++** uint8: ++** ret ++*/ ++TEST_UNDEF (uint8, svuint8x3_t, ++ z0 = svundef3_u8 ()) ++ ++/* ++** int16: ++** ret ++*/ ++TEST_UNDEF (int16, svint16x3_t, ++ z0 = svundef3_s16 ()) ++ ++/* ++** uint16: ++** ret ++*/ ++TEST_UNDEF (uint16, svuint16x3_t, ++ z0 = svundef3_u16 ()) ++ ++/* ++** float16: ++** ret ++*/ ++TEST_UNDEF (float16, svfloat16x3_t, ++ z0 = svundef3_f16 ()) ++ ++/* ++** bfloat16: ++** ret ++*/ ++TEST_UNDEF (bfloat16, svbfloat16x3_t, ++ z0 = svundef3_bf16 ()) ++ ++/* ++** int32: ++** ret ++*/ ++TEST_UNDEF (int32, svint32x3_t, ++ z0 = svundef3_s32 ()) ++ ++/* ++** uint32: ++** ret ++*/ ++TEST_UNDEF (uint32, svuint32x3_t, ++ z0 = svundef3_u32 ()) ++ ++/* ++** float32: ++** ret ++*/ ++TEST_UNDEF (float32, svfloat32x3_t, ++ z0 = svundef3_f32 ()) ++ ++/* ++** int64: ++** ret ++*/ ++TEST_UNDEF (int64, svint64x3_t, ++ z0 = svundef3_s64 ()) ++ ++/* ++** uint64: ++** ret ++*/ ++TEST_UNDEF (uint64, svuint64x3_t, ++ z0 = svundef3_u64 ()) ++ ++/* ++** float64: ++** ret ++*/ ++TEST_UNDEF (float64, svfloat64x3_t, ++ z0 = svundef3_f64 ()) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/undef4_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/undef4_1.c +new file mode 100644 +index 000000000..4d6b86b04 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/undef4_1.c +@@ -0,0 +1,87 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** int8: ++** ret ++*/ ++TEST_UNDEF (int8, svint8x4_t, ++ z0 = svundef4_s8 ()) ++ ++/* ++** uint8: ++** ret ++*/ ++TEST_UNDEF (uint8, svuint8x4_t, ++ z0 = svundef4_u8 ()) ++ ++/* ++** int16: ++** ret ++*/ ++TEST_UNDEF (int16, svint16x4_t, ++ z0 = svundef4_s16 ()) ++ ++/* ++** uint16: ++** ret ++*/ ++TEST_UNDEF (uint16, svuint16x4_t, ++ z0 = svundef4_u16 ()) ++ ++/* ++** float16: ++** ret ++*/ ++TEST_UNDEF (float16, svfloat16x4_t, ++ z0 = svundef4_f16 ()) ++ ++/* ++** bfloat16: ++** ret ++*/ ++TEST_UNDEF (bfloat16, svbfloat16x4_t, ++ z0 = svundef4_bf16 ()) ++ ++/* ++** int32: ++** ret ++*/ ++TEST_UNDEF (int32, svint32x4_t, ++ z0 = svundef4_s32 ()) ++ ++/* ++** uint32: ++** ret ++*/ ++TEST_UNDEF (uint32, svuint32x4_t, ++ z0 = svundef4_u32 ()) ++ ++/* ++** float32: ++** ret ++*/ ++TEST_UNDEF (float32, svfloat32x4_t, ++ z0 = svundef4_f32 ()) ++ ++/* ++** int64: ++** ret ++*/ ++TEST_UNDEF (int64, svint64x4_t, ++ z0 = svundef4_s64 ()) ++ ++/* ++** uint64: ++** ret ++*/ ++TEST_UNDEF (uint64, svuint64x4_t, ++ z0 = svundef4_u64 ()) ++ ++/* ++** float64: ++** ret ++*/ ++TEST_UNDEF (float64, svfloat64x4_t, ++ z0 = svundef4_f64 ()) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/undef_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/undef_1.c +new file mode 100644 +index 000000000..62873b6e1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/undef_1.c +@@ -0,0 +1,87 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** int8: ++** ret ++*/ ++TEST_UNDEF (int8, svint8_t, ++ z0 = svundef_s8 ()) ++ ++/* ++** uint8: ++** ret ++*/ ++TEST_UNDEF (uint8, svuint8_t, ++ z0 = svundef_u8 ()) ++ ++/* ++** int16: ++** ret ++*/ ++TEST_UNDEF (int16, svint16_t, ++ z0 = svundef_s16 ()) ++ ++/* ++** uint16: ++** ret ++*/ ++TEST_UNDEF (uint16, svuint16_t, ++ z0 = svundef_u16 ()) ++ ++/* ++** float16: ++** ret ++*/ ++TEST_UNDEF (float16, svfloat16_t, ++ z0 = svundef_f16 ()) ++ ++/* ++** bfloat16: ++** ret ++*/ ++TEST_UNDEF (bfloat16, svbfloat16_t, ++ z0 = svundef_bf16 ()) ++ 
++/* ++** int32: ++** ret ++*/ ++TEST_UNDEF (int32, svint32_t, ++ z0 = svundef_s32 ()) ++ ++/* ++** uint32: ++** ret ++*/ ++TEST_UNDEF (uint32, svuint32_t, ++ z0 = svundef_u32 ()) ++ ++/* ++** float32: ++** ret ++*/ ++TEST_UNDEF (float32, svfloat32_t, ++ z0 = svundef_f32 ()) ++ ++/* ++** int64: ++** ret ++*/ ++TEST_UNDEF (int64, svint64_t, ++ z0 = svundef_s64 ()) ++ ++/* ++** uint64: ++** ret ++*/ ++TEST_UNDEF (uint64, svuint64_t, ++ z0 = svundef_u64 ()) ++ ++/* ++** float64: ++** ret ++*/ ++TEST_UNDEF (float64, svfloat64_t, ++ z0 = svundef_f64 ()) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpkhi_b.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpkhi_b.c +new file mode 100644 +index 000000000..ff1a84aac +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpkhi_b.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** unpkhi_b_tied1: ++** punpkhi p0\.h, p0\.b ++** ret ++*/ ++TEST_UNIFORM_P (unpkhi_b_tied1, ++ p0 = svunpkhi_b (p0), ++ p0 = svunpkhi (p0)) ++ ++/* ++** unpkhi_b_untied: ++** punpkhi p0\.h, p1\.b ++** ret ++*/ ++TEST_UNIFORM_P (unpkhi_b_untied, ++ p0 = svunpkhi_b (p1), ++ p0 = svunpkhi (p1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpkhi_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpkhi_s16.c +new file mode 100644 +index 000000000..3f79ac65f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpkhi_s16.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** unpkhi_s16_tied1: ++** sunpkhi z0\.h, z0\.b ++** ret ++*/ ++TEST_DUAL_Z_REV (unpkhi_s16_tied1, svint16_t, svint8_t, ++ z0_res = svunpkhi_s16 (z0), ++ z0_res = svunpkhi (z0)) ++ ++/* ++** unpkhi_s16_untied: ++** sunpkhi z0\.h, z4\.b ++** ret ++*/ ++TEST_DUAL_Z (unpkhi_s16_untied, svint16_t, svint8_t, ++ z0 = svunpkhi_s16 (z4), ++ z0 = svunpkhi (z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpkhi_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpkhi_s32.c +new file mode 100644 +index 000000000..619fb0882 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpkhi_s32.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** unpkhi_s32_tied1: ++** sunpkhi z0\.s, z0\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (unpkhi_s32_tied1, svint32_t, svint16_t, ++ z0_res = svunpkhi_s32 (z0), ++ z0_res = svunpkhi (z0)) ++ ++/* ++** unpkhi_s32_untied: ++** sunpkhi z0\.s, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (unpkhi_s32_untied, svint32_t, svint16_t, ++ z0 = svunpkhi_s32 (z4), ++ z0 = svunpkhi (z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpkhi_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpkhi_s64.c +new file mode 100644 +index 000000000..5d6da1768 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpkhi_s64.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** unpkhi_s64_tied1: ++** sunpkhi z0\.d, z0\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (unpkhi_s64_tied1, svint64_t, svint32_t, ++ z0_res = svunpkhi_s64 (z0), ++ z0_res = svunpkhi (z0)) ++ ++/* ++** unpkhi_s64_untied: ++** sunpkhi z0\.d, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (unpkhi_s64_untied, svint64_t, svint32_t, ++ z0 = svunpkhi_s64 (z4), ++ z0 = svunpkhi (z4)) +diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpkhi_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpkhi_u16.c +new file mode 100644 +index 000000000..68f47a282 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpkhi_u16.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** unpkhi_u16_tied1: ++** uunpkhi z0\.h, z0\.b ++** ret ++*/ ++TEST_DUAL_Z_REV (unpkhi_u16_tied1, svuint16_t, svuint8_t, ++ z0_res = svunpkhi_u16 (z0), ++ z0_res = svunpkhi (z0)) ++ ++/* ++** unpkhi_u16_untied: ++** uunpkhi z0\.h, z4\.b ++** ret ++*/ ++TEST_DUAL_Z (unpkhi_u16_untied, svuint16_t, svuint8_t, ++ z0 = svunpkhi_u16 (z4), ++ z0 = svunpkhi (z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpkhi_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpkhi_u32.c +new file mode 100644 +index 000000000..3c4b161e4 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpkhi_u32.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** unpkhi_u32_tied1: ++** uunpkhi z0\.s, z0\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (unpkhi_u32_tied1, svuint32_t, svuint16_t, ++ z0_res = svunpkhi_u32 (z0), ++ z0_res = svunpkhi (z0)) ++ ++/* ++** unpkhi_u32_untied: ++** uunpkhi z0\.s, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (unpkhi_u32_untied, svuint32_t, svuint16_t, ++ z0 = svunpkhi_u32 (z4), ++ z0 = svunpkhi (z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpkhi_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpkhi_u64.c +new file mode 100644 +index 000000000..94cfbd493 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpkhi_u64.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** unpkhi_u64_tied1: ++** uunpkhi z0\.d, z0\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (unpkhi_u64_tied1, svuint64_t, svuint32_t, ++ z0_res = svunpkhi_u64 (z0), ++ z0_res = svunpkhi (z0)) ++ ++/* ++** unpkhi_u64_untied: ++** uunpkhi z0\.d, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (unpkhi_u64_untied, svuint64_t, svuint32_t, ++ z0 = svunpkhi_u64 (z4), ++ z0 = svunpkhi (z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpklo_b.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpklo_b.c +new file mode 100644 +index 000000000..476ec8bc3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpklo_b.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** unpklo_b_tied1: ++** punpklo p0\.h, p0\.b ++** ret ++*/ ++TEST_UNIFORM_P (unpklo_b_tied1, ++ p0 = svunpklo_b (p0), ++ p0 = svunpklo (p0)) ++ ++/* ++** unpklo_b_untied: ++** punpklo p0\.h, p1\.b ++** ret ++*/ ++TEST_UNIFORM_P (unpklo_b_untied, ++ p0 = svunpklo_b (p1), ++ p0 = svunpklo (p1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpklo_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpklo_s16.c +new file mode 100644 +index 000000000..a0e83ff1b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpklo_s16.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** unpklo_s16_tied1: ++** sunpklo z0\.h, z0\.b ++** ret ++*/ ++TEST_DUAL_Z_REV (unpklo_s16_tied1, svint16_t, svint8_t, ++ z0_res = svunpklo_s16 (z0), ++ z0_res = svunpklo (z0)) ++ ++/* ++** unpklo_s16_untied: ++** sunpklo z0\.h, 
z4\.b ++** ret ++*/ ++TEST_DUAL_Z (unpklo_s16_untied, svint16_t, svint8_t, ++ z0 = svunpklo_s16 (z4), ++ z0 = svunpklo (z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpklo_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpklo_s32.c +new file mode 100644 +index 000000000..49a14fb7b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpklo_s32.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** unpklo_s32_tied1: ++** sunpklo z0\.s, z0\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (unpklo_s32_tied1, svint32_t, svint16_t, ++ z0_res = svunpklo_s32 (z0), ++ z0_res = svunpklo (z0)) ++ ++/* ++** unpklo_s32_untied: ++** sunpklo z0\.s, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (unpklo_s32_untied, svint32_t, svint16_t, ++ z0 = svunpklo_s32 (z4), ++ z0 = svunpklo (z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpklo_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpklo_s64.c +new file mode 100644 +index 000000000..c430047e1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpklo_s64.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** unpklo_s64_tied1: ++** sunpklo z0\.d, z0\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (unpklo_s64_tied1, svint64_t, svint32_t, ++ z0_res = svunpklo_s64 (z0), ++ z0_res = svunpklo (z0)) ++ ++/* ++** unpklo_s64_untied: ++** sunpklo z0\.d, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (unpklo_s64_untied, svint64_t, svint32_t, ++ z0 = svunpklo_s64 (z4), ++ z0 = svunpklo (z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpklo_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpklo_u16.c +new file mode 100644 +index 000000000..6feee4427 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpklo_u16.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** unpklo_u16_tied1: ++** uunpklo z0\.h, z0\.b ++** ret ++*/ ++TEST_DUAL_Z_REV (unpklo_u16_tied1, svuint16_t, svuint8_t, ++ z0_res = svunpklo_u16 (z0), ++ z0_res = svunpklo (z0)) ++ ++/* ++** unpklo_u16_untied: ++** uunpklo z0\.h, z4\.b ++** ret ++*/ ++TEST_DUAL_Z (unpklo_u16_untied, svuint16_t, svuint8_t, ++ z0 = svunpklo_u16 (z4), ++ z0 = svunpklo (z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpklo_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpklo_u32.c +new file mode 100644 +index 000000000..c4d4efc86 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpklo_u32.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** unpklo_u32_tied1: ++** uunpklo z0\.s, z0\.h ++** ret ++*/ ++TEST_DUAL_Z_REV (unpklo_u32_tied1, svuint32_t, svuint16_t, ++ z0_res = svunpklo_u32 (z0), ++ z0_res = svunpklo (z0)) ++ ++/* ++** unpklo_u32_untied: ++** uunpklo z0\.s, z4\.h ++** ret ++*/ ++TEST_DUAL_Z (unpklo_u32_untied, svuint32_t, svuint16_t, ++ z0 = svunpklo_u32 (z4), ++ z0 = svunpklo (z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpklo_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpklo_u64.c +new file mode 100644 +index 000000000..2845e37a5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpklo_u64.c +@@ -0,0 +1,21 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** unpklo_u64_tied1: ++** 
uunpklo z0\.d, z0\.s ++** ret ++*/ ++TEST_DUAL_Z_REV (unpklo_u64_tied1, svuint64_t, svuint32_t, ++ z0_res = svunpklo_u64 (z0), ++ z0_res = svunpklo (z0)) ++ ++/* ++** unpklo_u64_untied: ++** uunpklo z0\.d, z4\.s ++** ret ++*/ ++TEST_DUAL_Z (unpklo_u64_untied, svuint64_t, svuint32_t, ++ z0 = svunpklo_u64 (z4), ++ z0 = svunpklo (z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/usdot_lane_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/usdot_lane_s32.c +new file mode 100644 +index 000000000..8fd255687 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/usdot_lane_s32.c +@@ -0,0 +1,97 @@ ++/* { dg-require-effective-target aarch64_asm_i8mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+sve+i8mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** usdot_lane_0_s32_tied1: ++** usdot z0\.s, z2\.b, z4\.b\[0\] ++** ret ++*/ ++TEST_TRIPLE_Z (usdot_lane_0_s32_tied1, svint32_t, svuint8_t, svint8_t, ++ z0 = svusdot_lane_s32 (z0, z2, z4, 0), ++ z0 = svusdot_lane (z0, z2, z4, 0)) ++ ++/* ++** usdot_lane_0_s32_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z2 ++** usdot z0\.s, \1\.b, z4\.b\[0\] ++** ret ++*/ ++TEST_TRIPLE_Z_REV2 (usdot_lane_0_s32_tied2, svint32_t, svuint8_t, svint8_t, ++ z0_res = svusdot_lane_s32 (z2, z0, z4, 0), ++ z0_res = svusdot_lane (z2, z0, z4, 0)) ++ ++/* ++** usdot_lane_0_s32_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** usdot z0\.s, z2\.b, \1\.b\[0\] ++** ret ++*/ ++TEST_TRIPLE_Z_REV (usdot_lane_0_s32_tied3, svint32_t, svuint8_t, svint8_t, ++ z0_res = svusdot_lane_s32 (z4, z2, z0, 0), ++ z0_res = svusdot_lane (z4, z2, z0, 0)) ++ ++/* ++** usdot_lane_0_s32_untied: ++** movprfx z0, z1 ++** usdot z0\.s, z2\.b, z4\.b\[0\] ++** ret ++*/ ++TEST_TRIPLE_Z (usdot_lane_0_s32_untied, svint32_t, svuint8_t, svint8_t, ++ z0 = svusdot_lane_s32 (z1, z2, z4, 0), ++ z0 = svusdot_lane (z1, z2, z4, 0)) ++ ++/* ++** usdot_lane_1_s32: ++** usdot z0\.s, z2\.b, z5\.b\[1\] ++** ret ++*/ ++TEST_TRIPLE_Z (usdot_lane_1_s32, svint32_t, svuint8_t, svint8_t, ++ z0 = svusdot_lane_s32 (z0, z2, z5, 1), ++ z0 = svusdot_lane (z0, z2, z5, 1)) ++ ++/* ++** usdot_lane_2_s32: ++** usdot z0\.s, z2\.b, z5\.b\[2\] ++** ret ++*/ ++TEST_TRIPLE_Z (usdot_lane_2_s32, svint32_t, svuint8_t, svint8_t, ++ z0 = svusdot_lane_s32 (z0, z2, z5, 2), ++ z0 = svusdot_lane (z0, z2, z5, 2)) ++ ++/* ++** usdot_lane_3_s32: ++** usdot z0\.s, z2\.b, z5\.b\[3\] ++** ret ++*/ ++TEST_TRIPLE_Z (usdot_lane_3_s32, svint32_t, svuint8_t, svint8_t, ++ z0 = svusdot_lane_s32 (z0, z2, z5, 3), ++ z0 = svusdot_lane (z0, z2, z5, 3)) ++ ++/* ++** usdot_lane_z8_s32: ++** str d8, \[sp, -16\]! 
++** mov (z[0-7])\.d, z8\.d ++** usdot z0\.s, z1\.b, \1\.b\[1\] ++** ldr d8, \[sp\], 16 ++** ret ++*/ ++TEST_TRIPLE_LANE_REG (usdot_lane_z8_s32, svint32_t, svuint8_t, svint8_t, ++ z8, ++ z0 = svusdot_lane_s32 (z0, z1, z8, 1), ++ z0 = svusdot_lane (z0, z1, z8, 1)) ++ ++/* ++** usdot_lane_z16_s32: ++** mov (z[0-7])\.d, z16\.d ++** usdot z0\.s, z1\.b, \1\.b\[1\] ++** ret ++*/ ++TEST_TRIPLE_LANE_REG (usdot_lane_z16_s32, svint32_t, svuint8_t, svint8_t, ++ z16, ++ z0 = svusdot_lane_s32 (z0, z1, z16, 1), ++ z0 = svusdot_lane (z0, z1, z16, 1)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/usdot_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/usdot_s32.c +new file mode 100644 +index 000000000..ccac5cae5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/usdot_s32.c +@@ -0,0 +1,46 @@ ++/* { dg-require-effective-target aarch64_asm_i8mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+sve+i8mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** usdot_s32_tied1: ++** usdot z0\.s, z2\.b, z4\.b ++** ret ++*/ ++TEST_TRIPLE_Z (usdot_s32_tied1, svint32_t, svuint8_t, svint8_t, ++ z0 = svusdot_s32 (z0, z2, z4), ++ z0 = svusdot (z0, z2, z4)) ++ ++/* ++** usdot_s32_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** usdot z0\.s, z2\.b, \1\.b ++** ret ++*/ ++TEST_TRIPLE_Z_REV (usdot_s32_tied2, svint32_t, svuint8_t, svint8_t, ++ z0_res = svusdot_s32 (z4, z2, z0), ++ z0_res = svusdot (z4, z2, z0)) ++ ++/* ++** usdot_w0_s32_tied: ++** mov (z[0-9]+\.b), w0 ++** usdot z0\.s, z2\.b, \1 ++** ret ++*/ ++TEST_TRIPLE_ZX (usdot_w0_s32_tied, svint32_t, svuint8_t, int8_t, ++ z0 = svusdot_n_s32 (z0, z2, x0), ++ z0 = svusdot (z0, z2, x0)) ++ ++/* ++** usdot_9_s32_tied: ++** mov (z[0-9]+\.b), #9 ++** usdot z0\.s, z2\.b, \1 ++** ret ++*/ ++TEST_TRIPLE_Z (usdot_9_s32_tied, svint32_t, svuint8_t, int8_t, ++ z0 = svusdot_n_s32 (z0, z2, 9), ++ z0 = svusdot (z0, z2, 9)) ++ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/usmmla_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/usmmla_s32.c +new file mode 100644 +index 000000000..9440f3fd9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/usmmla_s32.c +@@ -0,0 +1,46 @@ ++/* { dg-require-effective-target aarch64_asm_i8mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+sve+i8mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** usmmla_s32_tied1: ++** usmmla z0\.s, z2\.b, z4\.b ++** ret ++*/ ++TEST_TRIPLE_Z (usmmla_s32_tied1, svint32_t, svuint8_t, svint8_t, ++ z0 = svusmmla_s32 (z0, z2, z4), ++ z0 = svusmmla (z0, z2, z4)) ++ ++/* ++** usmmla_s32_tied2: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z2 ++** usmmla z0\.s, \1\.b, z4\.b ++** ret ++*/ ++TEST_TRIPLE_Z_REV2 (usmmla_s32_tied2, svint32_t, svuint8_t, svint8_t, ++ z0_res = svusmmla_s32 (z2, z0, z4), ++ z0_res = svusmmla (z2, z0, z4)) ++ ++/* ++** usmmla_s32_tied3: ++** mov (z[0-9]+)\.d, z0\.d ++** movprfx z0, z4 ++** usmmla z0\.s, z2\.b, \1\.b ++** ret ++*/ ++TEST_TRIPLE_Z_REV (usmmla_s32_tied3, svint32_t, svuint8_t, svint8_t, ++ z0_res = svusmmla_s32 (z4, z2, z0), ++ z0_res = svusmmla (z4, z2, z0)) ++ ++/* ++** usmmla_s32_untied: ++** movprfx z0, z1 ++** usmmla z0\.s, z2\.b, z4\.b ++** ret ++*/ ++TEST_TRIPLE_Z (usmmla_s32_untied, svint32_t, svuint8_t, svint8_t, ++ z0 = svusmmla_s32 (z1, z2, z4), ++ z0 = svusmmla (z1, z2, z4)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_b16.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_b16.c +new file mode 100644 +index 000000000..245e401aa +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_b16.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp1_b16_tied1: ++** uzp1 p0\.h, p0\.h, p1\.h ++** ret ++*/ ++TEST_UNIFORM_P (uzp1_b16_tied1, ++ p0 = svuzp1_b16 (p0, p1), ++ p0 = svuzp1_b16 (p0, p1)) ++ ++/* ++** uzp1_b16_tied2: ++** uzp1 p0\.h, p1\.h, p0\.h ++** ret ++*/ ++TEST_UNIFORM_P (uzp1_b16_tied2, ++ p0 = svuzp1_b16 (p1, p0), ++ p0 = svuzp1_b16 (p1, p0)) ++ ++/* ++** uzp1_b16_untied: ++** uzp1 p0\.h, p1\.h, p2\.h ++** ret ++*/ ++TEST_UNIFORM_P (uzp1_b16_untied, ++ p0 = svuzp1_b16 (p1, p2), ++ p0 = svuzp1_b16 (p1, p2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_b32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_b32.c +new file mode 100644 +index 000000000..c88034492 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_b32.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp1_b32_tied1: ++** uzp1 p0\.s, p0\.s, p1\.s ++** ret ++*/ ++TEST_UNIFORM_P (uzp1_b32_tied1, ++ p0 = svuzp1_b32 (p0, p1), ++ p0 = svuzp1_b32 (p0, p1)) ++ ++/* ++** uzp1_b32_tied2: ++** uzp1 p0\.s, p1\.s, p0\.s ++** ret ++*/ ++TEST_UNIFORM_P (uzp1_b32_tied2, ++ p0 = svuzp1_b32 (p1, p0), ++ p0 = svuzp1_b32 (p1, p0)) ++ ++/* ++** uzp1_b32_untied: ++** uzp1 p0\.s, p1\.s, p2\.s ++** ret ++*/ ++TEST_UNIFORM_P (uzp1_b32_untied, ++ p0 = svuzp1_b32 (p1, p2), ++ p0 = svuzp1_b32 (p1, p2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_b64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_b64.c +new file mode 100644 +index 000000000..71ac5c150 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_b64.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp1_b64_tied1: ++** uzp1 p0\.d, p0\.d, p1\.d ++** ret ++*/ ++TEST_UNIFORM_P (uzp1_b64_tied1, ++ p0 = svuzp1_b64 (p0, p1), ++ p0 = svuzp1_b64 (p0, p1)) ++ ++/* ++** uzp1_b64_tied2: ++** uzp1 p0\.d, p1\.d, p0\.d ++** ret ++*/ ++TEST_UNIFORM_P (uzp1_b64_tied2, ++ p0 = svuzp1_b64 (p1, p0), ++ p0 = svuzp1_b64 (p1, p0)) ++ ++/* ++** uzp1_b64_untied: ++** uzp1 p0\.d, p1\.d, p2\.d ++** ret ++*/ ++TEST_UNIFORM_P (uzp1_b64_untied, ++ p0 = svuzp1_b64 (p1, p2), ++ p0 = svuzp1_b64 (p1, p2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_b8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_b8.c +new file mode 100644 +index 000000000..250054bb6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_b8.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp1_b8_tied1: ++** uzp1 p0\.b, p0\.b, p1\.b ++** ret ++*/ ++TEST_UNIFORM_P (uzp1_b8_tied1, ++ p0 = svuzp1_b8 (p0, p1), ++ p0 = svuzp1_b8 (p0, p1)) ++ ++/* ++** uzp1_b8_tied2: ++** uzp1 p0\.b, p1\.b, p0\.b ++** ret ++*/ ++TEST_UNIFORM_P (uzp1_b8_tied2, ++ p0 = svuzp1_b8 (p1, p0), ++ p0 = svuzp1_b8 (p1, p0)) ++ ++/* ++** uzp1_b8_untied: ++** uzp1 p0\.b, p1\.b, p2\.b ++** ret ++*/ ++TEST_UNIFORM_P (uzp1_b8_untied, ++ p0 = svuzp1_b8 (p1, p2), ++ p0 = svuzp1_b8 (p1, p2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_bf16.c +new 
file mode 100644 +index 000000000..19d43ed11 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_bf16.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp1_bf16_tied1: ++** uzp1 z0\.h, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1_bf16_tied1, svbfloat16_t, ++ z0 = svuzp1_bf16 (z0, z1), ++ z0 = svuzp1 (z0, z1)) ++ ++/* ++** uzp1_bf16_tied2: ++** uzp1 z0\.h, z1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1_bf16_tied2, svbfloat16_t, ++ z0 = svuzp1_bf16 (z1, z0), ++ z0 = svuzp1 (z1, z0)) ++ ++/* ++** uzp1_bf16_untied: ++** uzp1 z0\.h, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1_bf16_untied, svbfloat16_t, ++ z0 = svuzp1_bf16 (z1, z2), ++ z0 = svuzp1 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_f16.c +new file mode 100644 +index 000000000..313673e9d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_f16.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp1_f16_tied1: ++** uzp1 z0\.h, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1_f16_tied1, svfloat16_t, ++ z0 = svuzp1_f16 (z0, z1), ++ z0 = svuzp1 (z0, z1)) ++ ++/* ++** uzp1_f16_tied2: ++** uzp1 z0\.h, z1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1_f16_tied2, svfloat16_t, ++ z0 = svuzp1_f16 (z1, z0), ++ z0 = svuzp1 (z1, z0)) ++ ++/* ++** uzp1_f16_untied: ++** uzp1 z0\.h, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1_f16_untied, svfloat16_t, ++ z0 = svuzp1_f16 (z1, z2), ++ z0 = svuzp1 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_f32.c +new file mode 100644 +index 000000000..5bbac2c60 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_f32.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp1_f32_tied1: ++** uzp1 z0\.s, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1_f32_tied1, svfloat32_t, ++ z0 = svuzp1_f32 (z0, z1), ++ z0 = svuzp1 (z0, z1)) ++ ++/* ++** uzp1_f32_tied2: ++** uzp1 z0\.s, z1\.s, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1_f32_tied2, svfloat32_t, ++ z0 = svuzp1_f32 (z1, z0), ++ z0 = svuzp1 (z1, z0)) ++ ++/* ++** uzp1_f32_untied: ++** uzp1 z0\.s, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1_f32_untied, svfloat32_t, ++ z0 = svuzp1_f32 (z1, z2), ++ z0 = svuzp1 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_f64.c +new file mode 100644 +index 000000000..ef97b1765 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_f64.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp1_f64_tied1: ++** uzp1 z0\.d, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1_f64_tied1, svfloat64_t, ++ z0 = svuzp1_f64 (z0, z1), ++ z0 = svuzp1 (z0, z1)) ++ ++/* ++** uzp1_f64_tied2: ++** uzp1 z0\.d, z1\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1_f64_tied2, svfloat64_t, ++ z0 = svuzp1_f64 (z1, z0), ++ z0 = svuzp1 (z1, z0)) ++ ++/* ++** uzp1_f64_untied: ++** uzp1 z0\.d, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1_f64_untied, svfloat64_t, ++ z0 = svuzp1_f64 (z1, z2), ++ z0 = svuzp1 (z1, z2)) +diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_s16.c +new file mode 100644 +index 000000000..b77832b07 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_s16.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp1_s16_tied1: ++** uzp1 z0\.h, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1_s16_tied1, svint16_t, ++ z0 = svuzp1_s16 (z0, z1), ++ z0 = svuzp1 (z0, z1)) ++ ++/* ++** uzp1_s16_tied2: ++** uzp1 z0\.h, z1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1_s16_tied2, svint16_t, ++ z0 = svuzp1_s16 (z1, z0), ++ z0 = svuzp1 (z1, z0)) ++ ++/* ++** uzp1_s16_untied: ++** uzp1 z0\.h, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1_s16_untied, svint16_t, ++ z0 = svuzp1_s16 (z1, z2), ++ z0 = svuzp1 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_s32.c +new file mode 100644 +index 000000000..64291afbe +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_s32.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp1_s32_tied1: ++** uzp1 z0\.s, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1_s32_tied1, svint32_t, ++ z0 = svuzp1_s32 (z0, z1), ++ z0 = svuzp1 (z0, z1)) ++ ++/* ++** uzp1_s32_tied2: ++** uzp1 z0\.s, z1\.s, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1_s32_tied2, svint32_t, ++ z0 = svuzp1_s32 (z1, z0), ++ z0 = svuzp1 (z1, z0)) ++ ++/* ++** uzp1_s32_untied: ++** uzp1 z0\.s, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1_s32_untied, svint32_t, ++ z0 = svuzp1_s32 (z1, z2), ++ z0 = svuzp1 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_s64.c +new file mode 100644 +index 000000000..e8f7799f6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_s64.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp1_s64_tied1: ++** uzp1 z0\.d, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1_s64_tied1, svint64_t, ++ z0 = svuzp1_s64 (z0, z1), ++ z0 = svuzp1 (z0, z1)) ++ ++/* ++** uzp1_s64_tied2: ++** uzp1 z0\.d, z1\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1_s64_tied2, svint64_t, ++ z0 = svuzp1_s64 (z1, z0), ++ z0 = svuzp1 (z1, z0)) ++ ++/* ++** uzp1_s64_untied: ++** uzp1 z0\.d, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1_s64_untied, svint64_t, ++ z0 = svuzp1_s64 (z1, z2), ++ z0 = svuzp1 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_s8.c +new file mode 100644 +index 000000000..98464b790 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_s8.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp1_s8_tied1: ++** uzp1 z0\.b, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1_s8_tied1, svint8_t, ++ z0 = svuzp1_s8 (z0, z1), ++ z0 = svuzp1 (z0, z1)) ++ ++/* ++** uzp1_s8_tied2: ++** uzp1 z0\.b, z1\.b, z0\.b ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1_s8_tied2, svint8_t, ++ z0 = svuzp1_s8 (z1, z0), ++ z0 = svuzp1 (z1, z0)) ++ ++/* ++** uzp1_s8_untied: ++** uzp1 z0\.b, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1_s8_untied, svint8_t, ++ z0 = svuzp1_s8 (z1, z2), ++ z0 = svuzp1 (z1, 
z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_u16.c +new file mode 100644 +index 000000000..da95171fe +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_u16.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp1_u16_tied1: ++** uzp1 z0\.h, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1_u16_tied1, svuint16_t, ++ z0 = svuzp1_u16 (z0, z1), ++ z0 = svuzp1 (z0, z1)) ++ ++/* ++** uzp1_u16_tied2: ++** uzp1 z0\.h, z1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1_u16_tied2, svuint16_t, ++ z0 = svuzp1_u16 (z1, z0), ++ z0 = svuzp1 (z1, z0)) ++ ++/* ++** uzp1_u16_untied: ++** uzp1 z0\.h, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1_u16_untied, svuint16_t, ++ z0 = svuzp1_u16 (z1, z2), ++ z0 = svuzp1 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_u32.c +new file mode 100644 +index 000000000..a57cdcc06 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_u32.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp1_u32_tied1: ++** uzp1 z0\.s, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1_u32_tied1, svuint32_t, ++ z0 = svuzp1_u32 (z0, z1), ++ z0 = svuzp1 (z0, z1)) ++ ++/* ++** uzp1_u32_tied2: ++** uzp1 z0\.s, z1\.s, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1_u32_tied2, svuint32_t, ++ z0 = svuzp1_u32 (z1, z0), ++ z0 = svuzp1 (z1, z0)) ++ ++/* ++** uzp1_u32_untied: ++** uzp1 z0\.s, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1_u32_untied, svuint32_t, ++ z0 = svuzp1_u32 (z1, z2), ++ z0 = svuzp1 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_u64.c +new file mode 100644 +index 000000000..24d820359 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_u64.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp1_u64_tied1: ++** uzp1 z0\.d, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1_u64_tied1, svuint64_t, ++ z0 = svuzp1_u64 (z0, z1), ++ z0 = svuzp1 (z0, z1)) ++ ++/* ++** uzp1_u64_tied2: ++** uzp1 z0\.d, z1\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1_u64_tied2, svuint64_t, ++ z0 = svuzp1_u64 (z1, z0), ++ z0 = svuzp1 (z1, z0)) ++ ++/* ++** uzp1_u64_untied: ++** uzp1 z0\.d, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1_u64_untied, svuint64_t, ++ z0 = svuzp1_u64 (z1, z2), ++ z0 = svuzp1 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_u8.c +new file mode 100644 +index 000000000..359d4c5f8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_u8.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp1_u8_tied1: ++** uzp1 z0\.b, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1_u8_tied1, svuint8_t, ++ z0 = svuzp1_u8 (z0, z1), ++ z0 = svuzp1 (z0, z1)) ++ ++/* ++** uzp1_u8_tied2: ++** uzp1 z0\.b, z1\.b, z0\.b ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1_u8_tied2, svuint8_t, ++ z0 = svuzp1_u8 (z1, z0), ++ z0 = svuzp1 (z1, z0)) ++ ++/* ++** uzp1_u8_untied: ++** uzp1 z0\.b, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1_u8_untied, svuint8_t, ++ z0 = svuzp1_u8 
(z1, z2), ++ z0 = svuzp1 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_bf16.c +new file mode 100644 +index 000000000..30a199241 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_bf16.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp1q_bf16_tied1: ++** uzp1 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1q_bf16_tied1, svbfloat16_t, ++ z0 = svuzp1q_bf16 (z0, z1), ++ z0 = svuzp1q (z0, z1)) ++ ++/* ++** uzp1q_bf16_tied2: ++** uzp1 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1q_bf16_tied2, svbfloat16_t, ++ z0 = svuzp1q_bf16 (z1, z0), ++ z0 = svuzp1q (z1, z0)) ++ ++/* ++** uzp1q_bf16_untied: ++** uzp1 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1q_bf16_untied, svbfloat16_t, ++ z0 = svuzp1q_bf16 (z1, z2), ++ z0 = svuzp1q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_f16.c +new file mode 100644 +index 000000000..c11e5bdc4 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_f16.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp1q_f16_tied1: ++** uzp1 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1q_f16_tied1, svfloat16_t, ++ z0 = svuzp1q_f16 (z0, z1), ++ z0 = svuzp1q (z0, z1)) ++ ++/* ++** uzp1q_f16_tied2: ++** uzp1 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1q_f16_tied2, svfloat16_t, ++ z0 = svuzp1q_f16 (z1, z0), ++ z0 = svuzp1q (z1, z0)) ++ ++/* ++** uzp1q_f16_untied: ++** uzp1 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1q_f16_untied, svfloat16_t, ++ z0 = svuzp1q_f16 (z1, z2), ++ z0 = svuzp1q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_f32.c +new file mode 100644 +index 000000000..d0ac94543 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_f32.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp1q_f32_tied1: ++** uzp1 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1q_f32_tied1, svfloat32_t, ++ z0 = svuzp1q_f32 (z0, z1), ++ z0 = svuzp1q (z0, z1)) ++ ++/* ++** uzp1q_f32_tied2: ++** uzp1 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1q_f32_tied2, svfloat32_t, ++ z0 = svuzp1q_f32 (z1, z0), ++ z0 = svuzp1q (z1, z0)) ++ ++/* ++** uzp1q_f32_untied: ++** uzp1 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1q_f32_untied, svfloat32_t, ++ z0 = svuzp1q_f32 (z1, z2), ++ z0 = svuzp1q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_f64.c +new file mode 100644 +index 000000000..ac2e5c5cf +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_f64.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options 
"-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp1q_f64_tied1: ++** uzp1 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1q_f64_tied1, svfloat64_t, ++ z0 = svuzp1q_f64 (z0, z1), ++ z0 = svuzp1q (z0, z1)) ++ ++/* ++** uzp1q_f64_tied2: ++** uzp1 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1q_f64_tied2, svfloat64_t, ++ z0 = svuzp1q_f64 (z1, z0), ++ z0 = svuzp1q (z1, z0)) ++ ++/* ++** uzp1q_f64_untied: ++** uzp1 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1q_f64_untied, svfloat64_t, ++ z0 = svuzp1q_f64 (z1, z2), ++ z0 = svuzp1q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_s16.c +new file mode 100644 +index 000000000..aa200b24e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_s16.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp1q_s16_tied1: ++** uzp1 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1q_s16_tied1, svint16_t, ++ z0 = svuzp1q_s16 (z0, z1), ++ z0 = svuzp1q (z0, z1)) ++ ++/* ++** uzp1q_s16_tied2: ++** uzp1 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1q_s16_tied2, svint16_t, ++ z0 = svuzp1q_s16 (z1, z0), ++ z0 = svuzp1q (z1, z0)) ++ ++/* ++** uzp1q_s16_untied: ++** uzp1 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1q_s16_untied, svint16_t, ++ z0 = svuzp1q_s16 (z1, z2), ++ z0 = svuzp1q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_s32.c +new file mode 100644 +index 000000000..eb849df74 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_s32.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp1q_s32_tied1: ++** uzp1 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1q_s32_tied1, svint32_t, ++ z0 = svuzp1q_s32 (z0, z1), ++ z0 = svuzp1q (z0, z1)) ++ ++/* ++** uzp1q_s32_tied2: ++** uzp1 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1q_s32_tied2, svint32_t, ++ z0 = svuzp1q_s32 (z1, z0), ++ z0 = svuzp1q (z1, z0)) ++ ++/* ++** uzp1q_s32_untied: ++** uzp1 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1q_s32_untied, svint32_t, ++ z0 = svuzp1q_s32 (z1, z2), ++ z0 = svuzp1q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_s64.c +new file mode 100644 +index 000000000..e1049761c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_s64.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp1q_s64_tied1: ++** uzp1 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1q_s64_tied1, svint64_t, ++ z0 = svuzp1q_s64 (z0, z1), ++ z0 = svuzp1q (z0, z1)) ++ ++/* ++** uzp1q_s64_tied2: ++** uzp1 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1q_s64_tied2, svint64_t, ++ z0 = svuzp1q_s64 (z1, 
z0), ++ z0 = svuzp1q (z1, z0)) ++ ++/* ++** uzp1q_s64_untied: ++** uzp1 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1q_s64_untied, svint64_t, ++ z0 = svuzp1q_s64 (z1, z2), ++ z0 = svuzp1q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_s8.c +new file mode 100644 +index 000000000..8aa592199 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_s8.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp1q_s8_tied1: ++** uzp1 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1q_s8_tied1, svint8_t, ++ z0 = svuzp1q_s8 (z0, z1), ++ z0 = svuzp1q (z0, z1)) ++ ++/* ++** uzp1q_s8_tied2: ++** uzp1 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1q_s8_tied2, svint8_t, ++ z0 = svuzp1q_s8 (z1, z0), ++ z0 = svuzp1q (z1, z0)) ++ ++/* ++** uzp1q_s8_untied: ++** uzp1 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1q_s8_untied, svint8_t, ++ z0 = svuzp1q_s8 (z1, z2), ++ z0 = svuzp1q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_u16.c +new file mode 100644 +index 000000000..00ffaab06 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_u16.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp1q_u16_tied1: ++** uzp1 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1q_u16_tied1, svuint16_t, ++ z0 = svuzp1q_u16 (z0, z1), ++ z0 = svuzp1q (z0, z1)) ++ ++/* ++** uzp1q_u16_tied2: ++** uzp1 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1q_u16_tied2, svuint16_t, ++ z0 = svuzp1q_u16 (z1, z0), ++ z0 = svuzp1q (z1, z0)) ++ ++/* ++** uzp1q_u16_untied: ++** uzp1 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1q_u16_untied, svuint16_t, ++ z0 = svuzp1q_u16 (z1, z2), ++ z0 = svuzp1q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_u32.c +new file mode 100644 +index 000000000..cd2e4db26 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_u32.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp1q_u32_tied1: ++** uzp1 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1q_u32_tied1, svuint32_t, ++ z0 = svuzp1q_u32 (z0, z1), ++ z0 = svuzp1q (z0, z1)) ++ ++/* ++** uzp1q_u32_tied2: ++** uzp1 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1q_u32_tied2, svuint32_t, ++ z0 = svuzp1q_u32 (z1, z0), ++ z0 = svuzp1q (z1, z0)) ++ ++/* ++** uzp1q_u32_untied: ++** uzp1 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1q_u32_untied, svuint32_t, ++ z0 = svuzp1q_u32 (z1, z2), ++ z0 = svuzp1q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_u64.c +new file mode 100644 +index 000000000..7d8823329 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_u64.c 
+@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp1q_u64_tied1: ++** uzp1 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1q_u64_tied1, svuint64_t, ++ z0 = svuzp1q_u64 (z0, z1), ++ z0 = svuzp1q (z0, z1)) ++ ++/* ++** uzp1q_u64_tied2: ++** uzp1 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1q_u64_tied2, svuint64_t, ++ z0 = svuzp1q_u64 (z1, z0), ++ z0 = svuzp1q (z1, z0)) ++ ++/* ++** uzp1q_u64_untied: ++** uzp1 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1q_u64_untied, svuint64_t, ++ z0 = svuzp1q_u64 (z1, z2), ++ z0 = svuzp1q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_u8.c +new file mode 100644 +index 000000000..701a1d575 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_u8.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp1q_u8_tied1: ++** uzp1 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1q_u8_tied1, svuint8_t, ++ z0 = svuzp1q_u8 (z0, z1), ++ z0 = svuzp1q (z0, z1)) ++ ++/* ++** uzp1q_u8_tied2: ++** uzp1 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1q_u8_tied2, svuint8_t, ++ z0 = svuzp1q_u8 (z1, z0), ++ z0 = svuzp1q (z1, z0)) ++ ++/* ++** uzp1q_u8_untied: ++** uzp1 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp1q_u8_untied, svuint8_t, ++ z0 = svuzp1q_u8 (z1, z2), ++ z0 = svuzp1q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_b16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_b16.c +new file mode 100644 +index 000000000..c3a91e7fc +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_b16.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp2_b16_tied1: ++** uzp2 p0\.h, p0\.h, p1\.h ++** ret ++*/ ++TEST_UNIFORM_P (uzp2_b16_tied1, ++ p0 = svuzp2_b16 (p0, p1), ++ p0 = svuzp2_b16 (p0, p1)) ++ ++/* ++** uzp2_b16_tied2: ++** uzp2 p0\.h, p1\.h, p0\.h ++** ret ++*/ ++TEST_UNIFORM_P (uzp2_b16_tied2, ++ p0 = svuzp2_b16 (p1, p0), ++ p0 = svuzp2_b16 (p1, p0)) ++ ++/* ++** uzp2_b16_untied: ++** uzp2 p0\.h, p1\.h, p2\.h ++** ret ++*/ ++TEST_UNIFORM_P (uzp2_b16_untied, ++ p0 = svuzp2_b16 (p1, p2), ++ p0 = svuzp2_b16 (p1, p2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_b32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_b32.c +new file mode 100644 +index 000000000..e3294a6f3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_b32.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp2_b32_tied1: ++** uzp2 p0\.s, p0\.s, p1\.s ++** ret ++*/ ++TEST_UNIFORM_P (uzp2_b32_tied1, ++ p0 = svuzp2_b32 (p0, p1), ++ p0 = svuzp2_b32 (p0, p1)) ++ ++/* ++** uzp2_b32_tied2: ++** uzp2 p0\.s, p1\.s, p0\.s ++** ret ++*/ ++TEST_UNIFORM_P (uzp2_b32_tied2, ++ p0 = svuzp2_b32 (p1, p0), ++ p0 = svuzp2_b32 (p1, p0)) ++ ++/* ++** uzp2_b32_untied: ++** uzp2 p0\.s, p1\.s, p2\.s ++** ret ++*/ ++TEST_UNIFORM_P (uzp2_b32_untied, ++ p0 = svuzp2_b32 (p1, p2), ++ p0 = svuzp2_b32 (p1, p2)) +diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_b64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_b64.c +new file mode 100644 +index 000000000..3ae72e10c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_b64.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp2_b64_tied1: ++** uzp2 p0\.d, p0\.d, p1\.d ++** ret ++*/ ++TEST_UNIFORM_P (uzp2_b64_tied1, ++ p0 = svuzp2_b64 (p0, p1), ++ p0 = svuzp2_b64 (p0, p1)) ++ ++/* ++** uzp2_b64_tied2: ++** uzp2 p0\.d, p1\.d, p0\.d ++** ret ++*/ ++TEST_UNIFORM_P (uzp2_b64_tied2, ++ p0 = svuzp2_b64 (p1, p0), ++ p0 = svuzp2_b64 (p1, p0)) ++ ++/* ++** uzp2_b64_untied: ++** uzp2 p0\.d, p1\.d, p2\.d ++** ret ++*/ ++TEST_UNIFORM_P (uzp2_b64_untied, ++ p0 = svuzp2_b64 (p1, p2), ++ p0 = svuzp2_b64 (p1, p2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_b8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_b8.c +new file mode 100644 +index 000000000..726a9a079 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_b8.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp2_b8_tied1: ++** uzp2 p0\.b, p0\.b, p1\.b ++** ret ++*/ ++TEST_UNIFORM_P (uzp2_b8_tied1, ++ p0 = svuzp2_b8 (p0, p1), ++ p0 = svuzp2_b8 (p0, p1)) ++ ++/* ++** uzp2_b8_tied2: ++** uzp2 p0\.b, p1\.b, p0\.b ++** ret ++*/ ++TEST_UNIFORM_P (uzp2_b8_tied2, ++ p0 = svuzp2_b8 (p1, p0), ++ p0 = svuzp2_b8 (p1, p0)) ++ ++/* ++** uzp2_b8_untied: ++** uzp2 p0\.b, p1\.b, p2\.b ++** ret ++*/ ++TEST_UNIFORM_P (uzp2_b8_untied, ++ p0 = svuzp2_b8 (p1, p2), ++ p0 = svuzp2_b8 (p1, p2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_bf16.c +new file mode 100644 +index 000000000..b5566bfdf +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_bf16.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp2_bf16_tied1: ++** uzp2 z0\.h, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2_bf16_tied1, svbfloat16_t, ++ z0 = svuzp2_bf16 (z0, z1), ++ z0 = svuzp2 (z0, z1)) ++ ++/* ++** uzp2_bf16_tied2: ++** uzp2 z0\.h, z1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2_bf16_tied2, svbfloat16_t, ++ z0 = svuzp2_bf16 (z1, z0), ++ z0 = svuzp2 (z1, z0)) ++ ++/* ++** uzp2_bf16_untied: ++** uzp2 z0\.h, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2_bf16_untied, svbfloat16_t, ++ z0 = svuzp2_bf16 (z1, z2), ++ z0 = svuzp2 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_f16.c +new file mode 100644 +index 000000000..d4847ef37 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_f16.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp2_f16_tied1: ++** uzp2 z0\.h, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2_f16_tied1, svfloat16_t, ++ z0 = svuzp2_f16 (z0, z1), ++ z0 = svuzp2 (z0, z1)) ++ ++/* ++** uzp2_f16_tied2: ++** uzp2 z0\.h, z1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2_f16_tied2, svfloat16_t, ++ z0 = svuzp2_f16 (z1, z0), ++ z0 = svuzp2 (z1, z0)) ++ ++/* ++** uzp2_f16_untied: ++** uzp2 z0\.h, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2_f16_untied, svfloat16_t, ++ z0 = svuzp2_f16 (z1, z2), ++ z0 = svuzp2 (z1, z2)) +diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_f32.c +new file mode 100644 +index 000000000..c1699fc9c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_f32.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp2_f32_tied1: ++** uzp2 z0\.s, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2_f32_tied1, svfloat32_t, ++ z0 = svuzp2_f32 (z0, z1), ++ z0 = svuzp2 (z0, z1)) ++ ++/* ++** uzp2_f32_tied2: ++** uzp2 z0\.s, z1\.s, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2_f32_tied2, svfloat32_t, ++ z0 = svuzp2_f32 (z1, z0), ++ z0 = svuzp2 (z1, z0)) ++ ++/* ++** uzp2_f32_untied: ++** uzp2 z0\.s, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2_f32_untied, svfloat32_t, ++ z0 = svuzp2_f32 (z1, z2), ++ z0 = svuzp2 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_f64.c +new file mode 100644 +index 000000000..afbf5c11a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_f64.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp2_f64_tied1: ++** uzp2 z0\.d, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2_f64_tied1, svfloat64_t, ++ z0 = svuzp2_f64 (z0, z1), ++ z0 = svuzp2 (z0, z1)) ++ ++/* ++** uzp2_f64_tied2: ++** uzp2 z0\.d, z1\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2_f64_tied2, svfloat64_t, ++ z0 = svuzp2_f64 (z1, z0), ++ z0 = svuzp2 (z1, z0)) ++ ++/* ++** uzp2_f64_untied: ++** uzp2 z0\.d, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2_f64_untied, svfloat64_t, ++ z0 = svuzp2_f64 (z1, z2), ++ z0 = svuzp2 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_s16.c +new file mode 100644 +index 000000000..e88df8734 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_s16.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp2_s16_tied1: ++** uzp2 z0\.h, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2_s16_tied1, svint16_t, ++ z0 = svuzp2_s16 (z0, z1), ++ z0 = svuzp2 (z0, z1)) ++ ++/* ++** uzp2_s16_tied2: ++** uzp2 z0\.h, z1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2_s16_tied2, svint16_t, ++ z0 = svuzp2_s16 (z1, z0), ++ z0 = svuzp2 (z1, z0)) ++ ++/* ++** uzp2_s16_untied: ++** uzp2 z0\.h, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2_s16_untied, svint16_t, ++ z0 = svuzp2_s16 (z1, z2), ++ z0 = svuzp2 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_s32.c +new file mode 100644 +index 000000000..2e9a73d1f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_s32.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp2_s32_tied1: ++** uzp2 z0\.s, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2_s32_tied1, svint32_t, ++ z0 = svuzp2_s32 (z0, z1), ++ z0 = svuzp2 (z0, z1)) ++ ++/* ++** uzp2_s32_tied2: ++** uzp2 z0\.s, z1\.s, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2_s32_tied2, svint32_t, ++ z0 = svuzp2_s32 (z1, z0), ++ z0 = svuzp2 (z1, z0)) ++ ++/* ++** uzp2_s32_untied: ++** uzp2 z0\.s, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2_s32_untied, svint32_t, ++ z0 = svuzp2_s32 (z1, 
z2), ++ z0 = svuzp2 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_s64.c +new file mode 100644 +index 000000000..ffec78ccc +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_s64.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp2_s64_tied1: ++** uzp2 z0\.d, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2_s64_tied1, svint64_t, ++ z0 = svuzp2_s64 (z0, z1), ++ z0 = svuzp2 (z0, z1)) ++ ++/* ++** uzp2_s64_tied2: ++** uzp2 z0\.d, z1\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2_s64_tied2, svint64_t, ++ z0 = svuzp2_s64 (z1, z0), ++ z0 = svuzp2 (z1, z0)) ++ ++/* ++** uzp2_s64_untied: ++** uzp2 z0\.d, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2_s64_untied, svint64_t, ++ z0 = svuzp2_s64 (z1, z2), ++ z0 = svuzp2 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_s8.c +new file mode 100644 +index 000000000..72037a088 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_s8.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp2_s8_tied1: ++** uzp2 z0\.b, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2_s8_tied1, svint8_t, ++ z0 = svuzp2_s8 (z0, z1), ++ z0 = svuzp2 (z0, z1)) ++ ++/* ++** uzp2_s8_tied2: ++** uzp2 z0\.b, z1\.b, z0\.b ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2_s8_tied2, svint8_t, ++ z0 = svuzp2_s8 (z1, z0), ++ z0 = svuzp2 (z1, z0)) ++ ++/* ++** uzp2_s8_untied: ++** uzp2 z0\.b, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2_s8_untied, svint8_t, ++ z0 = svuzp2_s8 (z1, z2), ++ z0 = svuzp2 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_u16.c +new file mode 100644 +index 000000000..d84f8c9ed +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_u16.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp2_u16_tied1: ++** uzp2 z0\.h, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2_u16_tied1, svuint16_t, ++ z0 = svuzp2_u16 (z0, z1), ++ z0 = svuzp2 (z0, z1)) ++ ++/* ++** uzp2_u16_tied2: ++** uzp2 z0\.h, z1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2_u16_tied2, svuint16_t, ++ z0 = svuzp2_u16 (z1, z0), ++ z0 = svuzp2 (z1, z0)) ++ ++/* ++** uzp2_u16_untied: ++** uzp2 z0\.h, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2_u16_untied, svuint16_t, ++ z0 = svuzp2_u16 (z1, z2), ++ z0 = svuzp2 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_u32.c +new file mode 100644 +index 000000000..0285ff91f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_u32.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp2_u32_tied1: ++** uzp2 z0\.s, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2_u32_tied1, svuint32_t, ++ z0 = svuzp2_u32 (z0, z1), ++ z0 = svuzp2 (z0, z1)) ++ ++/* ++** uzp2_u32_tied2: ++** uzp2 z0\.s, z1\.s, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2_u32_tied2, svuint32_t, ++ z0 = svuzp2_u32 (z1, z0), ++ z0 = svuzp2 (z1, z0)) ++ ++/* ++** uzp2_u32_untied: ++** uzp2 z0\.s, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2_u32_untied, svuint32_t, ++ 
z0 = svuzp2_u32 (z1, z2), ++ z0 = svuzp2 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_u64.c +new file mode 100644 +index 000000000..1b51baf90 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_u64.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp2_u64_tied1: ++** uzp2 z0\.d, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2_u64_tied1, svuint64_t, ++ z0 = svuzp2_u64 (z0, z1), ++ z0 = svuzp2 (z0, z1)) ++ ++/* ++** uzp2_u64_tied2: ++** uzp2 z0\.d, z1\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2_u64_tied2, svuint64_t, ++ z0 = svuzp2_u64 (z1, z0), ++ z0 = svuzp2 (z1, z0)) ++ ++/* ++** uzp2_u64_untied: ++** uzp2 z0\.d, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2_u64_untied, svuint64_t, ++ z0 = svuzp2_u64 (z1, z2), ++ z0 = svuzp2 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_u8.c +new file mode 100644 +index 000000000..662e0b818 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_u8.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp2_u8_tied1: ++** uzp2 z0\.b, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2_u8_tied1, svuint8_t, ++ z0 = svuzp2_u8 (z0, z1), ++ z0 = svuzp2 (z0, z1)) ++ ++/* ++** uzp2_u8_tied2: ++** uzp2 z0\.b, z1\.b, z0\.b ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2_u8_tied2, svuint8_t, ++ z0 = svuzp2_u8 (z1, z0), ++ z0 = svuzp2 (z1, z0)) ++ ++/* ++** uzp2_u8_untied: ++** uzp2 z0\.b, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2_u8_untied, svuint8_t, ++ z0 = svuzp2_u8 (z1, z2), ++ z0 = svuzp2 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_bf16.c +new file mode 100644 +index 000000000..bbac53a7a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_bf16.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp2q_bf16_tied1: ++** uzp2 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2q_bf16_tied1, svbfloat16_t, ++ z0 = svuzp2q_bf16 (z0, z1), ++ z0 = svuzp2q (z0, z1)) ++ ++/* ++** uzp2q_bf16_tied2: ++** uzp2 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2q_bf16_tied2, svbfloat16_t, ++ z0 = svuzp2q_bf16 (z1, z0), ++ z0 = svuzp2q (z1, z0)) ++ ++/* ++** uzp2q_bf16_untied: ++** uzp2 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2q_bf16_untied, svbfloat16_t, ++ z0 = svuzp2q_bf16 (z1, z2), ++ z0 = svuzp2q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_f16.c +new file mode 100644 +index 000000000..e19d118fb +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_f16.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp2q_f16_tied1: ++** uzp2 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2q_f16_tied1, svfloat16_t, ++ z0 = svuzp2q_f16 (z0, z1), ++ z0 = svuzp2q 
(z0, z1)) ++ ++/* ++** uzp2q_f16_tied2: ++** uzp2 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2q_f16_tied2, svfloat16_t, ++ z0 = svuzp2q_f16 (z1, z0), ++ z0 = svuzp2q (z1, z0)) ++ ++/* ++** uzp2q_f16_untied: ++** uzp2 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2q_f16_untied, svfloat16_t, ++ z0 = svuzp2q_f16 (z1, z2), ++ z0 = svuzp2q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_f32.c +new file mode 100644 +index 000000000..af7112b15 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_f32.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp2q_f32_tied1: ++** uzp2 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2q_f32_tied1, svfloat32_t, ++ z0 = svuzp2q_f32 (z0, z1), ++ z0 = svuzp2q (z0, z1)) ++ ++/* ++** uzp2q_f32_tied2: ++** uzp2 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2q_f32_tied2, svfloat32_t, ++ z0 = svuzp2q_f32 (z1, z0), ++ z0 = svuzp2q (z1, z0)) ++ ++/* ++** uzp2q_f32_untied: ++** uzp2 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2q_f32_untied, svfloat32_t, ++ z0 = svuzp2q_f32 (z1, z2), ++ z0 = svuzp2q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_f64.c +new file mode 100644 +index 000000000..4109b843c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_f64.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp2q_f64_tied1: ++** uzp2 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2q_f64_tied1, svfloat64_t, ++ z0 = svuzp2q_f64 (z0, z1), ++ z0 = svuzp2q (z0, z1)) ++ ++/* ++** uzp2q_f64_tied2: ++** uzp2 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2q_f64_tied2, svfloat64_t, ++ z0 = svuzp2q_f64 (z1, z0), ++ z0 = svuzp2q (z1, z0)) ++ ++/* ++** uzp2q_f64_untied: ++** uzp2 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2q_f64_untied, svfloat64_t, ++ z0 = svuzp2q_f64 (z1, z2), ++ z0 = svuzp2q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_s16.c +new file mode 100644 +index 000000000..0c6ab25cf +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_s16.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp2q_s16_tied1: ++** uzp2 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2q_s16_tied1, svint16_t, ++ z0 = svuzp2q_s16 (z0, z1), ++ z0 = svuzp2q (z0, z1)) ++ ++/* ++** uzp2q_s16_tied2: ++** uzp2 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2q_s16_tied2, svint16_t, ++ z0 = svuzp2q_s16 (z1, z0), ++ z0 = svuzp2q (z1, z0)) ++ ++/* ++** uzp2q_s16_untied: ++** uzp2 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2q_s16_untied, svint16_t, ++ z0 = svuzp2q_s16 (z1, z2), ++ z0 = svuzp2q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_s32.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_s32.c +new file mode 100644 +index 000000000..9b914e704 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_s32.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp2q_s32_tied1: ++** uzp2 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2q_s32_tied1, svint32_t, ++ z0 = svuzp2q_s32 (z0, z1), ++ z0 = svuzp2q (z0, z1)) ++ ++/* ++** uzp2q_s32_tied2: ++** uzp2 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2q_s32_tied2, svint32_t, ++ z0 = svuzp2q_s32 (z1, z0), ++ z0 = svuzp2q (z1, z0)) ++ ++/* ++** uzp2q_s32_untied: ++** uzp2 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2q_s32_untied, svint32_t, ++ z0 = svuzp2q_s32 (z1, z2), ++ z0 = svuzp2q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_s64.c +new file mode 100644 +index 000000000..697e37d78 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_s64.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp2q_s64_tied1: ++** uzp2 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2q_s64_tied1, svint64_t, ++ z0 = svuzp2q_s64 (z0, z1), ++ z0 = svuzp2q (z0, z1)) ++ ++/* ++** uzp2q_s64_tied2: ++** uzp2 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2q_s64_tied2, svint64_t, ++ z0 = svuzp2q_s64 (z1, z0), ++ z0 = svuzp2q (z1, z0)) ++ ++/* ++** uzp2q_s64_untied: ++** uzp2 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2q_s64_untied, svint64_t, ++ z0 = svuzp2q_s64 (z1, z2), ++ z0 = svuzp2q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_s8.c +new file mode 100644 +index 000000000..576262c5d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_s8.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp2q_s8_tied1: ++** uzp2 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2q_s8_tied1, svint8_t, ++ z0 = svuzp2q_s8 (z0, z1), ++ z0 = svuzp2q (z0, z1)) ++ ++/* ++** uzp2q_s8_tied2: ++** uzp2 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2q_s8_tied2, svint8_t, ++ z0 = svuzp2q_s8 (z1, z0), ++ z0 = svuzp2q (z1, z0)) ++ ++/* ++** uzp2q_s8_untied: ++** uzp2 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2q_s8_untied, svint8_t, ++ z0 = svuzp2q_s8 (z1, z2), ++ z0 = svuzp2q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_u16.c +new file mode 100644 +index 000000000..f2debc28f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_u16.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp2q_u16_tied1: ++** uzp2 z0\.q, 
z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2q_u16_tied1, svuint16_t, ++ z0 = svuzp2q_u16 (z0, z1), ++ z0 = svuzp2q (z0, z1)) ++ ++/* ++** uzp2q_u16_tied2: ++** uzp2 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2q_u16_tied2, svuint16_t, ++ z0 = svuzp2q_u16 (z1, z0), ++ z0 = svuzp2q (z1, z0)) ++ ++/* ++** uzp2q_u16_untied: ++** uzp2 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2q_u16_untied, svuint16_t, ++ z0 = svuzp2q_u16 (z1, z2), ++ z0 = svuzp2q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_u32.c +new file mode 100644 +index 000000000..ad6a4bcc0 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_u32.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp2q_u32_tied1: ++** uzp2 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2q_u32_tied1, svuint32_t, ++ z0 = svuzp2q_u32 (z0, z1), ++ z0 = svuzp2q (z0, z1)) ++ ++/* ++** uzp2q_u32_tied2: ++** uzp2 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2q_u32_tied2, svuint32_t, ++ z0 = svuzp2q_u32 (z1, z0), ++ z0 = svuzp2q (z1, z0)) ++ ++/* ++** uzp2q_u32_untied: ++** uzp2 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2q_u32_untied, svuint32_t, ++ z0 = svuzp2q_u32 (z1, z2), ++ z0 = svuzp2q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_u64.c +new file mode 100644 +index 000000000..a846aa295 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_u64.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp2q_u64_tied1: ++** uzp2 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2q_u64_tied1, svuint64_t, ++ z0 = svuzp2q_u64 (z0, z1), ++ z0 = svuzp2q (z0, z1)) ++ ++/* ++** uzp2q_u64_tied2: ++** uzp2 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2q_u64_tied2, svuint64_t, ++ z0 = svuzp2q_u64 (z1, z0), ++ z0 = svuzp2q (z1, z0)) ++ ++/* ++** uzp2q_u64_untied: ++** uzp2 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2q_u64_untied, svuint64_t, ++ z0 = svuzp2q_u64 (z1, z2), ++ z0 = svuzp2q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_u8.c +new file mode 100644 +index 000000000..163c22659 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_u8.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** uzp2q_u8_tied1: ++** uzp2 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2q_u8_tied1, svuint8_t, ++ z0 = svuzp2q_u8 (z0, z1), ++ z0 = svuzp2q (z0, z1)) ++ ++/* ++** uzp2q_u8_tied2: ++** uzp2 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2q_u8_tied2, svuint8_t, ++ z0 = svuzp2q_u8 (z1, z0), ++ z0 = svuzp2q (z1, z0)) ++ ++/* ++** uzp2q_u8_untied: ++** uzp2 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (uzp2q_u8_untied, svuint8_t, ++ z0 = svuzp2q_u8 (z1, z2), ++ z0 = 
svuzp2q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/whilele_b16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/whilele_b16.c +new file mode 100644 +index 000000000..c285a7a73 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/whilele_b16.c +@@ -0,0 +1,173 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** whilele_rr_b16_s32: ++** whilele p0\.h, w0, w1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_rr_b16_s32, int32_t, ++ p0 = svwhilele_b16_s32 (x0, x1), ++ p0 = svwhilele_b16 (x0, x1)) ++ ++/* ++** whilele_0r_b16_s32: ++** whilele p0\.h, wzr, w1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_0r_b16_s32, int32_t, ++ p0 = svwhilele_b16_s32 (0, x1), ++ p0 = svwhilele_b16 (0, x1)) ++ ++/* ++** whilele_5r_b16_s32: ++** mov (w[0-9]+), #?5 ++** whilele p0\.h, \1, w1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_5r_b16_s32, int32_t, ++ p0 = svwhilele_b16_s32 (5, x1), ++ p0 = svwhilele_b16 (5, x1)) ++ ++/* ++** whilele_r0_b16_s32: ++** whilele p0\.h, w0, wzr ++** ret ++*/ ++TEST_COMPARE_S (whilele_r0_b16_s32, int32_t, ++ p0 = svwhilele_b16_s32 (x0, 0), ++ p0 = svwhilele_b16 (x0, 0)) ++ ++/* ++** whilele_r5_b16_s32: ++** mov (w[0-9]+), #?5 ++** whilele p0\.h, w0, \1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_r5_b16_s32, int32_t, ++ p0 = svwhilele_b16_s32 (x0, 5), ++ p0 = svwhilele_b16 (x0, 5)) ++ ++/* ++** whilele_rr_b16_s64: ++** whilele p0\.h, x0, x1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_rr_b16_s64, int64_t, ++ p0 = svwhilele_b16_s64 (x0, x1), ++ p0 = svwhilele_b16 (x0, x1)) ++ ++/* ++** whilele_0r_b16_s64: ++** whilele p0\.h, xzr, x1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_0r_b16_s64, int64_t, ++ p0 = svwhilele_b16_s64 (0, x1), ++ p0 = svwhilele_b16 ((int64_t) 0, x1)) ++ ++/* ++** whilele_5r_b16_s64: ++** mov (x[0-9]+), #?5 ++** whilele p0\.h, \1, x1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_5r_b16_s64, int64_t, ++ p0 = svwhilele_b16_s64 (5, x1), ++ p0 = svwhilele_b16 ((int64_t) 5, x1)) ++ ++/* ++** whilele_r0_b16_s64: ++** whilele p0\.h, x0, xzr ++** ret ++*/ ++TEST_COMPARE_S (whilele_r0_b16_s64, int64_t, ++ p0 = svwhilele_b16_s64 (x0, 0), ++ p0 = svwhilele_b16 (x0, (int64_t) 0)) ++ ++/* ++** whilele_r5_b16_s64: ++** mov (x[0-9]+), #?5 ++** whilele p0\.h, x0, \1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_r5_b16_s64, int64_t, ++ p0 = svwhilele_b16_s64 (x0, 5), ++ p0 = svwhilele_b16 (x0, (int64_t) 5)) ++ ++/* ++** whilele_rr_b16_u32: ++** whilels p0\.h, w0, w1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_rr_b16_u32, uint32_t, ++ p0 = svwhilele_b16_u32 (x0, x1), ++ p0 = svwhilele_b16 (x0, x1)) ++ ++/* ++** whilele_0r_b16_u32: ++** whilels p0\.h, wzr, w1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_0r_b16_u32, uint32_t, ++ p0 = svwhilele_b16_u32 (0, x1), ++ p0 = svwhilele_b16 ((uint32_t) 0, x1)) ++ ++/* ++** whilele_5r_b16_u32: ++** mov (w[0-9]+), #?5 ++** whilels p0\.h, \1, w1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_5r_b16_u32, uint32_t, ++ p0 = svwhilele_b16_u32 (5, x1), ++ p0 = svwhilele_b16 ((uint32_t) 5, x1)) ++ ++/* ++** whilele_r5_b16_u32: ++** mov (w[0-9]+), #?5 ++** whilels p0\.h, w0, \1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_r5_b16_u32, uint32_t, ++ p0 = svwhilele_b16_u32 (x0, 5), ++ p0 = svwhilele_b16 (x0, (uint32_t) 5)) ++ ++/* ++** whilele_rr_b16_u64: ++** whilels p0\.h, x0, x1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_rr_b16_u64, uint64_t, ++ p0 = svwhilele_b16_u64 (x0, x1), ++ p0 = svwhilele_b16 (x0, x1)) ++ ++/* ++** whilele_0r_b16_u64: ++** whilels p0\.h, xzr, x1 ++** ret ++*/ ++TEST_COMPARE_S 
(whilele_0r_b16_u64, uint64_t, ++ p0 = svwhilele_b16_u64 (0, x1), ++ p0 = svwhilele_b16 ((uint64_t) 0, x1)) ++ ++/* ++** whilele_5r_b16_u64: ++** mov (x[0-9]+), #?5 ++** whilels p0\.h, \1, x1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_5r_b16_u64, uint64_t, ++ p0 = svwhilele_b16_u64 (5, x1), ++ p0 = svwhilele_b16 ((uint64_t) 5, x1)) ++ ++/* ++** whilele_r5_b16_u64: ++** mov (x[0-9]+), #?5 ++** whilels p0\.h, x0, \1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_r5_b16_u64, uint64_t, ++ p0 = svwhilele_b16_u64 (x0, 5), ++ p0 = svwhilele_b16 (x0, (uint64_t) 5)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/whilele_b32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/whilele_b32.c +new file mode 100644 +index 000000000..d369ccfa3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/whilele_b32.c +@@ -0,0 +1,173 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** whilele_rr_b32_s32: ++** whilele p0\.s, w0, w1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_rr_b32_s32, int32_t, ++ p0 = svwhilele_b32_s32 (x0, x1), ++ p0 = svwhilele_b32 (x0, x1)) ++ ++/* ++** whilele_0r_b32_s32: ++** whilele p0\.s, wzr, w1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_0r_b32_s32, int32_t, ++ p0 = svwhilele_b32_s32 (0, x1), ++ p0 = svwhilele_b32 (0, x1)) ++ ++/* ++** whilele_5r_b32_s32: ++** mov (w[0-9]+), #?5 ++** whilele p0\.s, \1, w1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_5r_b32_s32, int32_t, ++ p0 = svwhilele_b32_s32 (5, x1), ++ p0 = svwhilele_b32 (5, x1)) ++ ++/* ++** whilele_r0_b32_s32: ++** whilele p0\.s, w0, wzr ++** ret ++*/ ++TEST_COMPARE_S (whilele_r0_b32_s32, int32_t, ++ p0 = svwhilele_b32_s32 (x0, 0), ++ p0 = svwhilele_b32 (x0, 0)) ++ ++/* ++** whilele_r5_b32_s32: ++** mov (w[0-9]+), #?5 ++** whilele p0\.s, w0, \1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_r5_b32_s32, int32_t, ++ p0 = svwhilele_b32_s32 (x0, 5), ++ p0 = svwhilele_b32 (x0, 5)) ++ ++/* ++** whilele_rr_b32_s64: ++** whilele p0\.s, x0, x1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_rr_b32_s64, int64_t, ++ p0 = svwhilele_b32_s64 (x0, x1), ++ p0 = svwhilele_b32 (x0, x1)) ++ ++/* ++** whilele_0r_b32_s64: ++** whilele p0\.s, xzr, x1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_0r_b32_s64, int64_t, ++ p0 = svwhilele_b32_s64 (0, x1), ++ p0 = svwhilele_b32 ((int64_t) 0, x1)) ++ ++/* ++** whilele_5r_b32_s64: ++** mov (x[0-9]+), #?5 ++** whilele p0\.s, \1, x1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_5r_b32_s64, int64_t, ++ p0 = svwhilele_b32_s64 (5, x1), ++ p0 = svwhilele_b32 ((int64_t) 5, x1)) ++ ++/* ++** whilele_r0_b32_s64: ++** whilele p0\.s, x0, xzr ++** ret ++*/ ++TEST_COMPARE_S (whilele_r0_b32_s64, int64_t, ++ p0 = svwhilele_b32_s64 (x0, 0), ++ p0 = svwhilele_b32 (x0, (int64_t) 0)) ++ ++/* ++** whilele_r5_b32_s64: ++** mov (x[0-9]+), #?5 ++** whilele p0\.s, x0, \1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_r5_b32_s64, int64_t, ++ p0 = svwhilele_b32_s64 (x0, 5), ++ p0 = svwhilele_b32 (x0, (int64_t) 5)) ++ ++/* ++** whilele_rr_b32_u32: ++** whilels p0\.s, w0, w1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_rr_b32_u32, uint32_t, ++ p0 = svwhilele_b32_u32 (x0, x1), ++ p0 = svwhilele_b32 (x0, x1)) ++ ++/* ++** whilele_0r_b32_u32: ++** whilels p0\.s, wzr, w1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_0r_b32_u32, uint32_t, ++ p0 = svwhilele_b32_u32 (0, x1), ++ p0 = svwhilele_b32 ((uint32_t) 0, x1)) ++ ++/* ++** whilele_5r_b32_u32: ++** mov (w[0-9]+), #?5 ++** whilels p0\.s, \1, w1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_5r_b32_u32, uint32_t, ++ p0 = svwhilele_b32_u32 (5, x1), ++ p0 = svwhilele_b32 
((uint32_t) 5, x1)) ++ ++/* ++** whilele_r5_b32_u32: ++** mov (w[0-9]+), #?5 ++** whilels p0\.s, w0, \1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_r5_b32_u32, uint32_t, ++ p0 = svwhilele_b32_u32 (x0, 5), ++ p0 = svwhilele_b32 (x0, (uint32_t) 5)) ++ ++/* ++** whilele_rr_b32_u64: ++** whilels p0\.s, x0, x1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_rr_b32_u64, uint64_t, ++ p0 = svwhilele_b32_u64 (x0, x1), ++ p0 = svwhilele_b32 (x0, x1)) ++ ++/* ++** whilele_0r_b32_u64: ++** whilels p0\.s, xzr, x1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_0r_b32_u64, uint64_t, ++ p0 = svwhilele_b32_u64 (0, x1), ++ p0 = svwhilele_b32 ((uint64_t) 0, x1)) ++ ++/* ++** whilele_5r_b32_u64: ++** mov (x[0-9]+), #?5 ++** whilels p0\.s, \1, x1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_5r_b32_u64, uint64_t, ++ p0 = svwhilele_b32_u64 (5, x1), ++ p0 = svwhilele_b32 ((uint64_t) 5, x1)) ++ ++/* ++** whilele_r5_b32_u64: ++** mov (x[0-9]+), #?5 ++** whilels p0\.s, x0, \1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_r5_b32_u64, uint64_t, ++ p0 = svwhilele_b32_u64 (x0, 5), ++ p0 = svwhilele_b32 (x0, (uint64_t) 5)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/whilele_b64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/whilele_b64.c +new file mode 100644 +index 000000000..394f51f44 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/whilele_b64.c +@@ -0,0 +1,173 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** whilele_rr_b64_s32: ++** whilele p0\.d, w0, w1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_rr_b64_s32, int32_t, ++ p0 = svwhilele_b64_s32 (x0, x1), ++ p0 = svwhilele_b64 (x0, x1)) ++ ++/* ++** whilele_0r_b64_s32: ++** whilele p0\.d, wzr, w1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_0r_b64_s32, int32_t, ++ p0 = svwhilele_b64_s32 (0, x1), ++ p0 = svwhilele_b64 (0, x1)) ++ ++/* ++** whilele_5r_b64_s32: ++** mov (w[0-9]+), #?5 ++** whilele p0\.d, \1, w1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_5r_b64_s32, int32_t, ++ p0 = svwhilele_b64_s32 (5, x1), ++ p0 = svwhilele_b64 (5, x1)) ++ ++/* ++** whilele_r0_b64_s32: ++** whilele p0\.d, w0, wzr ++** ret ++*/ ++TEST_COMPARE_S (whilele_r0_b64_s32, int32_t, ++ p0 = svwhilele_b64_s32 (x0, 0), ++ p0 = svwhilele_b64 (x0, 0)) ++ ++/* ++** whilele_r5_b64_s32: ++** mov (w[0-9]+), #?5 ++** whilele p0\.d, w0, \1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_r5_b64_s32, int32_t, ++ p0 = svwhilele_b64_s32 (x0, 5), ++ p0 = svwhilele_b64 (x0, 5)) ++ ++/* ++** whilele_rr_b64_s64: ++** whilele p0\.d, x0, x1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_rr_b64_s64, int64_t, ++ p0 = svwhilele_b64_s64 (x0, x1), ++ p0 = svwhilele_b64 (x0, x1)) ++ ++/* ++** whilele_0r_b64_s64: ++** whilele p0\.d, xzr, x1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_0r_b64_s64, int64_t, ++ p0 = svwhilele_b64_s64 (0, x1), ++ p0 = svwhilele_b64 ((int64_t) 0, x1)) ++ ++/* ++** whilele_5r_b64_s64: ++** mov (x[0-9]+), #?5 ++** whilele p0\.d, \1, x1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_5r_b64_s64, int64_t, ++ p0 = svwhilele_b64_s64 (5, x1), ++ p0 = svwhilele_b64 ((int64_t) 5, x1)) ++ ++/* ++** whilele_r0_b64_s64: ++** whilele p0\.d, x0, xzr ++** ret ++*/ ++TEST_COMPARE_S (whilele_r0_b64_s64, int64_t, ++ p0 = svwhilele_b64_s64 (x0, 0), ++ p0 = svwhilele_b64 (x0, (int64_t) 0)) ++ ++/* ++** whilele_r5_b64_s64: ++** mov (x[0-9]+), #?5 ++** whilele p0\.d, x0, \1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_r5_b64_s64, int64_t, ++ p0 = svwhilele_b64_s64 (x0, 5), ++ p0 = svwhilele_b64 (x0, (int64_t) 5)) ++ ++/* ++** whilele_rr_b64_u32: ++** whilels p0\.d, w0, w1 ++** ret ++*/ 
++TEST_COMPARE_S (whilele_rr_b64_u32, uint32_t, ++ p0 = svwhilele_b64_u32 (x0, x1), ++ p0 = svwhilele_b64 (x0, x1)) ++ ++/* ++** whilele_0r_b64_u32: ++** whilels p0\.d, wzr, w1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_0r_b64_u32, uint32_t, ++ p0 = svwhilele_b64_u32 (0, x1), ++ p0 = svwhilele_b64 ((uint32_t) 0, x1)) ++ ++/* ++** whilele_5r_b64_u32: ++** mov (w[0-9]+), #?5 ++** whilels p0\.d, \1, w1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_5r_b64_u32, uint32_t, ++ p0 = svwhilele_b64_u32 (5, x1), ++ p0 = svwhilele_b64 ((uint32_t) 5, x1)) ++ ++/* ++** whilele_r5_b64_u32: ++** mov (w[0-9]+), #?5 ++** whilels p0\.d, w0, \1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_r5_b64_u32, uint32_t, ++ p0 = svwhilele_b64_u32 (x0, 5), ++ p0 = svwhilele_b64 (x0, (uint32_t) 5)) ++ ++/* ++** whilele_rr_b64_u64: ++** whilels p0\.d, x0, x1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_rr_b64_u64, uint64_t, ++ p0 = svwhilele_b64_u64 (x0, x1), ++ p0 = svwhilele_b64 (x0, x1)) ++ ++/* ++** whilele_0r_b64_u64: ++** whilels p0\.d, xzr, x1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_0r_b64_u64, uint64_t, ++ p0 = svwhilele_b64_u64 (0, x1), ++ p0 = svwhilele_b64 ((uint64_t) 0, x1)) ++ ++/* ++** whilele_5r_b64_u64: ++** mov (x[0-9]+), #?5 ++** whilels p0\.d, \1, x1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_5r_b64_u64, uint64_t, ++ p0 = svwhilele_b64_u64 (5, x1), ++ p0 = svwhilele_b64 ((uint64_t) 5, x1)) ++ ++/* ++** whilele_r5_b64_u64: ++** mov (x[0-9]+), #?5 ++** whilels p0\.d, x0, \1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_r5_b64_u64, uint64_t, ++ p0 = svwhilele_b64_u64 (x0, 5), ++ p0 = svwhilele_b64 (x0, (uint64_t) 5)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/whilele_b8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/whilele_b8.c +new file mode 100644 +index 000000000..2ec101473 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/whilele_b8.c +@@ -0,0 +1,173 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** whilele_rr_b8_s32: ++** whilele p0\.b, w0, w1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_rr_b8_s32, int32_t, ++ p0 = svwhilele_b8_s32 (x0, x1), ++ p0 = svwhilele_b8 (x0, x1)) ++ ++/* ++** whilele_0r_b8_s32: ++** whilele p0\.b, wzr, w1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_0r_b8_s32, int32_t, ++ p0 = svwhilele_b8_s32 (0, x1), ++ p0 = svwhilele_b8 (0, x1)) ++ ++/* ++** whilele_5r_b8_s32: ++** mov (w[0-9]+), #?5 ++** whilele p0\.b, \1, w1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_5r_b8_s32, int32_t, ++ p0 = svwhilele_b8_s32 (5, x1), ++ p0 = svwhilele_b8 (5, x1)) ++ ++/* ++** whilele_r0_b8_s32: ++** whilele p0\.b, w0, wzr ++** ret ++*/ ++TEST_COMPARE_S (whilele_r0_b8_s32, int32_t, ++ p0 = svwhilele_b8_s32 (x0, 0), ++ p0 = svwhilele_b8 (x0, 0)) ++ ++/* ++** whilele_r5_b8_s32: ++** mov (w[0-9]+), #?5 ++** whilele p0\.b, w0, \1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_r5_b8_s32, int32_t, ++ p0 = svwhilele_b8_s32 (x0, 5), ++ p0 = svwhilele_b8 (x0, 5)) ++ ++/* ++** whilele_rr_b8_s64: ++** whilele p0\.b, x0, x1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_rr_b8_s64, int64_t, ++ p0 = svwhilele_b8_s64 (x0, x1), ++ p0 = svwhilele_b8 (x0, x1)) ++ ++/* ++** whilele_0r_b8_s64: ++** whilele p0\.b, xzr, x1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_0r_b8_s64, int64_t, ++ p0 = svwhilele_b8_s64 (0, x1), ++ p0 = svwhilele_b8 ((int64_t) 0, x1)) ++ ++/* ++** whilele_5r_b8_s64: ++** mov (x[0-9]+), #?5 ++** whilele p0\.b, \1, x1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_5r_b8_s64, int64_t, ++ p0 = svwhilele_b8_s64 (5, x1), ++ p0 = svwhilele_b8 ((int64_t) 5, x1)) ++ ++/* 
++** whilele_r0_b8_s64: ++** whilele p0\.b, x0, xzr ++** ret ++*/ ++TEST_COMPARE_S (whilele_r0_b8_s64, int64_t, ++ p0 = svwhilele_b8_s64 (x0, 0), ++ p0 = svwhilele_b8 (x0, (int64_t) 0)) ++ ++/* ++** whilele_r5_b8_s64: ++** mov (x[0-9]+), #?5 ++** whilele p0\.b, x0, \1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_r5_b8_s64, int64_t, ++ p0 = svwhilele_b8_s64 (x0, 5), ++ p0 = svwhilele_b8 (x0, (int64_t) 5)) ++ ++/* ++** whilele_rr_b8_u32: ++** whilels p0\.b, w0, w1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_rr_b8_u32, uint32_t, ++ p0 = svwhilele_b8_u32 (x0, x1), ++ p0 = svwhilele_b8 (x0, x1)) ++ ++/* ++** whilele_0r_b8_u32: ++** whilels p0\.b, wzr, w1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_0r_b8_u32, uint32_t, ++ p0 = svwhilele_b8_u32 (0, x1), ++ p0 = svwhilele_b8 ((uint32_t) 0, x1)) ++ ++/* ++** whilele_5r_b8_u32: ++** mov (w[0-9]+), #?5 ++** whilels p0\.b, \1, w1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_5r_b8_u32, uint32_t, ++ p0 = svwhilele_b8_u32 (5, x1), ++ p0 = svwhilele_b8 ((uint32_t) 5, x1)) ++ ++/* ++** whilele_r5_b8_u32: ++** mov (w[0-9]+), #?5 ++** whilels p0\.b, w0, \1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_r5_b8_u32, uint32_t, ++ p0 = svwhilele_b8_u32 (x0, 5), ++ p0 = svwhilele_b8 (x0, (uint32_t) 5)) ++ ++/* ++** whilele_rr_b8_u64: ++** whilels p0\.b, x0, x1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_rr_b8_u64, uint64_t, ++ p0 = svwhilele_b8_u64 (x0, x1), ++ p0 = svwhilele_b8 (x0, x1)) ++ ++/* ++** whilele_0r_b8_u64: ++** whilels p0\.b, xzr, x1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_0r_b8_u64, uint64_t, ++ p0 = svwhilele_b8_u64 (0, x1), ++ p0 = svwhilele_b8 ((uint64_t) 0, x1)) ++ ++/* ++** whilele_5r_b8_u64: ++** mov (x[0-9]+), #?5 ++** whilels p0\.b, \1, x1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_5r_b8_u64, uint64_t, ++ p0 = svwhilele_b8_u64 (5, x1), ++ p0 = svwhilele_b8 ((uint64_t) 5, x1)) ++ ++/* ++** whilele_r5_b8_u64: ++** mov (x[0-9]+), #?5 ++** whilels p0\.b, x0, \1 ++** ret ++*/ ++TEST_COMPARE_S (whilele_r5_b8_u64, uint64_t, ++ p0 = svwhilele_b8_u64 (x0, 5), ++ p0 = svwhilele_b8 (x0, (uint64_t) 5)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/whilelt_b16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/whilelt_b16.c +new file mode 100644 +index 000000000..14a60432b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/whilelt_b16.c +@@ -0,0 +1,173 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** whilelt_rr_b16_s32: ++** whilelt p0\.h, w0, w1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_rr_b16_s32, int32_t, ++ p0 = svwhilelt_b16_s32 (x0, x1), ++ p0 = svwhilelt_b16 (x0, x1)) ++ ++/* ++** whilelt_0r_b16_s32: ++** whilelt p0\.h, wzr, w1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_0r_b16_s32, int32_t, ++ p0 = svwhilelt_b16_s32 (0, x1), ++ p0 = svwhilelt_b16 (0, x1)) ++ ++/* ++** whilelt_5r_b16_s32: ++** mov (w[0-9]+), #?5 ++** whilelt p0\.h, \1, w1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_5r_b16_s32, int32_t, ++ p0 = svwhilelt_b16_s32 (5, x1), ++ p0 = svwhilelt_b16 (5, x1)) ++ ++/* ++** whilelt_r0_b16_s32: ++** whilelt p0\.h, w0, wzr ++** ret ++*/ ++TEST_COMPARE_S (whilelt_r0_b16_s32, int32_t, ++ p0 = svwhilelt_b16_s32 (x0, 0), ++ p0 = svwhilelt_b16 (x0, 0)) ++ ++/* ++** whilelt_r5_b16_s32: ++** mov (w[0-9]+), #?5 ++** whilelt p0\.h, w0, \1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_r5_b16_s32, int32_t, ++ p0 = svwhilelt_b16_s32 (x0, 5), ++ p0 = svwhilelt_b16 (x0, 5)) ++ ++/* ++** whilelt_rr_b16_s64: ++** whilelt p0\.h, x0, x1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_rr_b16_s64, int64_t, ++ p0 = 
svwhilelt_b16_s64 (x0, x1), ++ p0 = svwhilelt_b16 (x0, x1)) ++ ++/* ++** whilelt_0r_b16_s64: ++** whilelt p0\.h, xzr, x1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_0r_b16_s64, int64_t, ++ p0 = svwhilelt_b16_s64 (0, x1), ++ p0 = svwhilelt_b16 ((int64_t) 0, x1)) ++ ++/* ++** whilelt_5r_b16_s64: ++** mov (x[0-9]+), #?5 ++** whilelt p0\.h, \1, x1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_5r_b16_s64, int64_t, ++ p0 = svwhilelt_b16_s64 (5, x1), ++ p0 = svwhilelt_b16 ((int64_t) 5, x1)) ++ ++/* ++** whilelt_r0_b16_s64: ++** whilelt p0\.h, x0, xzr ++** ret ++*/ ++TEST_COMPARE_S (whilelt_r0_b16_s64, int64_t, ++ p0 = svwhilelt_b16_s64 (x0, 0), ++ p0 = svwhilelt_b16 (x0, (int64_t) 0)) ++ ++/* ++** whilelt_r5_b16_s64: ++** mov (x[0-9]+), #?5 ++** whilelt p0\.h, x0, \1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_r5_b16_s64, int64_t, ++ p0 = svwhilelt_b16_s64 (x0, 5), ++ p0 = svwhilelt_b16 (x0, (int64_t) 5)) ++ ++/* ++** whilelt_rr_b16_u32: ++** whilelo p0\.h, w0, w1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_rr_b16_u32, uint32_t, ++ p0 = svwhilelt_b16_u32 (x0, x1), ++ p0 = svwhilelt_b16 (x0, x1)) ++ ++/* ++** whilelt_0r_b16_u32: ++** whilelo p0\.h, wzr, w1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_0r_b16_u32, uint32_t, ++ p0 = svwhilelt_b16_u32 (0, x1), ++ p0 = svwhilelt_b16 ((uint32_t) 0, x1)) ++ ++/* ++** whilelt_5r_b16_u32: ++** mov (w[0-9]+), #?5 ++** whilelo p0\.h, \1, w1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_5r_b16_u32, uint32_t, ++ p0 = svwhilelt_b16_u32 (5, x1), ++ p0 = svwhilelt_b16 ((uint32_t) 5, x1)) ++ ++/* ++** whilelt_r5_b16_u32: ++** mov (w[0-9]+), #?5 ++** whilelo p0\.h, w0, \1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_r5_b16_u32, uint32_t, ++ p0 = svwhilelt_b16_u32 (x0, 5), ++ p0 = svwhilelt_b16 (x0, (uint32_t) 5)) ++ ++/* ++** whilelt_rr_b16_u64: ++** whilelo p0\.h, x0, x1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_rr_b16_u64, uint64_t, ++ p0 = svwhilelt_b16_u64 (x0, x1), ++ p0 = svwhilelt_b16 (x0, x1)) ++ ++/* ++** whilelt_0r_b16_u64: ++** whilelo p0\.h, xzr, x1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_0r_b16_u64, uint64_t, ++ p0 = svwhilelt_b16_u64 (0, x1), ++ p0 = svwhilelt_b16 ((uint64_t) 0, x1)) ++ ++/* ++** whilelt_5r_b16_u64: ++** mov (x[0-9]+), #?5 ++** whilelo p0\.h, \1, x1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_5r_b16_u64, uint64_t, ++ p0 = svwhilelt_b16_u64 (5, x1), ++ p0 = svwhilelt_b16 ((uint64_t) 5, x1)) ++ ++/* ++** whilelt_r5_b16_u64: ++** mov (x[0-9]+), #?5 ++** whilelo p0\.h, x0, \1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_r5_b16_u64, uint64_t, ++ p0 = svwhilelt_b16_u64 (x0, 5), ++ p0 = svwhilelt_b16 (x0, (uint64_t) 5)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/whilelt_b32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/whilelt_b32.c +new file mode 100644 +index 000000000..0e50bb07a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/whilelt_b32.c +@@ -0,0 +1,173 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** whilelt_rr_b32_s32: ++** whilelt p0\.s, w0, w1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_rr_b32_s32, int32_t, ++ p0 = svwhilelt_b32_s32 (x0, x1), ++ p0 = svwhilelt_b32 (x0, x1)) ++ ++/* ++** whilelt_0r_b32_s32: ++** whilelt p0\.s, wzr, w1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_0r_b32_s32, int32_t, ++ p0 = svwhilelt_b32_s32 (0, x1), ++ p0 = svwhilelt_b32 (0, x1)) ++ ++/* ++** whilelt_5r_b32_s32: ++** mov (w[0-9]+), #?5 ++** whilelt p0\.s, \1, w1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_5r_b32_s32, int32_t, ++ p0 = svwhilelt_b32_s32 (5, x1), ++ p0 = svwhilelt_b32 (5, x1)) ++ ++/* 
++** whilelt_r0_b32_s32: ++** whilelt p0\.s, w0, wzr ++** ret ++*/ ++TEST_COMPARE_S (whilelt_r0_b32_s32, int32_t, ++ p0 = svwhilelt_b32_s32 (x0, 0), ++ p0 = svwhilelt_b32 (x0, 0)) ++ ++/* ++** whilelt_r5_b32_s32: ++** mov (w[0-9]+), #?5 ++** whilelt p0\.s, w0, \1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_r5_b32_s32, int32_t, ++ p0 = svwhilelt_b32_s32 (x0, 5), ++ p0 = svwhilelt_b32 (x0, 5)) ++ ++/* ++** whilelt_rr_b32_s64: ++** whilelt p0\.s, x0, x1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_rr_b32_s64, int64_t, ++ p0 = svwhilelt_b32_s64 (x0, x1), ++ p0 = svwhilelt_b32 (x0, x1)) ++ ++/* ++** whilelt_0r_b32_s64: ++** whilelt p0\.s, xzr, x1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_0r_b32_s64, int64_t, ++ p0 = svwhilelt_b32_s64 (0, x1), ++ p0 = svwhilelt_b32 ((int64_t) 0, x1)) ++ ++/* ++** whilelt_5r_b32_s64: ++** mov (x[0-9]+), #?5 ++** whilelt p0\.s, \1, x1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_5r_b32_s64, int64_t, ++ p0 = svwhilelt_b32_s64 (5, x1), ++ p0 = svwhilelt_b32 ((int64_t) 5, x1)) ++ ++/* ++** whilelt_r0_b32_s64: ++** whilelt p0\.s, x0, xzr ++** ret ++*/ ++TEST_COMPARE_S (whilelt_r0_b32_s64, int64_t, ++ p0 = svwhilelt_b32_s64 (x0, 0), ++ p0 = svwhilelt_b32 (x0, (int64_t) 0)) ++ ++/* ++** whilelt_r5_b32_s64: ++** mov (x[0-9]+), #?5 ++** whilelt p0\.s, x0, \1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_r5_b32_s64, int64_t, ++ p0 = svwhilelt_b32_s64 (x0, 5), ++ p0 = svwhilelt_b32 (x0, (int64_t) 5)) ++ ++/* ++** whilelt_rr_b32_u32: ++** whilelo p0\.s, w0, w1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_rr_b32_u32, uint32_t, ++ p0 = svwhilelt_b32_u32 (x0, x1), ++ p0 = svwhilelt_b32 (x0, x1)) ++ ++/* ++** whilelt_0r_b32_u32: ++** whilelo p0\.s, wzr, w1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_0r_b32_u32, uint32_t, ++ p0 = svwhilelt_b32_u32 (0, x1), ++ p0 = svwhilelt_b32 ((uint32_t) 0, x1)) ++ ++/* ++** whilelt_5r_b32_u32: ++** mov (w[0-9]+), #?5 ++** whilelo p0\.s, \1, w1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_5r_b32_u32, uint32_t, ++ p0 = svwhilelt_b32_u32 (5, x1), ++ p0 = svwhilelt_b32 ((uint32_t) 5, x1)) ++ ++/* ++** whilelt_r5_b32_u32: ++** mov (w[0-9]+), #?5 ++** whilelo p0\.s, w0, \1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_r5_b32_u32, uint32_t, ++ p0 = svwhilelt_b32_u32 (x0, 5), ++ p0 = svwhilelt_b32 (x0, (uint32_t) 5)) ++ ++/* ++** whilelt_rr_b32_u64: ++** whilelo p0\.s, x0, x1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_rr_b32_u64, uint64_t, ++ p0 = svwhilelt_b32_u64 (x0, x1), ++ p0 = svwhilelt_b32 (x0, x1)) ++ ++/* ++** whilelt_0r_b32_u64: ++** whilelo p0\.s, xzr, x1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_0r_b32_u64, uint64_t, ++ p0 = svwhilelt_b32_u64 (0, x1), ++ p0 = svwhilelt_b32 ((uint64_t) 0, x1)) ++ ++/* ++** whilelt_5r_b32_u64: ++** mov (x[0-9]+), #?5 ++** whilelo p0\.s, \1, x1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_5r_b32_u64, uint64_t, ++ p0 = svwhilelt_b32_u64 (5, x1), ++ p0 = svwhilelt_b32 ((uint64_t) 5, x1)) ++ ++/* ++** whilelt_r5_b32_u64: ++** mov (x[0-9]+), #?5 ++** whilelo p0\.s, x0, \1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_r5_b32_u64, uint64_t, ++ p0 = svwhilelt_b32_u64 (x0, 5), ++ p0 = svwhilelt_b32 (x0, (uint64_t) 5)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/whilelt_b64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/whilelt_b64.c +new file mode 100644 +index 000000000..539c93347 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/whilelt_b64.c +@@ -0,0 +1,173 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** whilelt_rr_b64_s32: ++** whilelt p0\.d, w0, w1 ++** ret ++*/ 
++TEST_COMPARE_S (whilelt_rr_b64_s32, int32_t, ++ p0 = svwhilelt_b64_s32 (x0, x1), ++ p0 = svwhilelt_b64 (x0, x1)) ++ ++/* ++** whilelt_0r_b64_s32: ++** whilelt p0\.d, wzr, w1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_0r_b64_s32, int32_t, ++ p0 = svwhilelt_b64_s32 (0, x1), ++ p0 = svwhilelt_b64 (0, x1)) ++ ++/* ++** whilelt_5r_b64_s32: ++** mov (w[0-9]+), #?5 ++** whilelt p0\.d, \1, w1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_5r_b64_s32, int32_t, ++ p0 = svwhilelt_b64_s32 (5, x1), ++ p0 = svwhilelt_b64 (5, x1)) ++ ++/* ++** whilelt_r0_b64_s32: ++** whilelt p0\.d, w0, wzr ++** ret ++*/ ++TEST_COMPARE_S (whilelt_r0_b64_s32, int32_t, ++ p0 = svwhilelt_b64_s32 (x0, 0), ++ p0 = svwhilelt_b64 (x0, 0)) ++ ++/* ++** whilelt_r5_b64_s32: ++** mov (w[0-9]+), #?5 ++** whilelt p0\.d, w0, \1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_r5_b64_s32, int32_t, ++ p0 = svwhilelt_b64_s32 (x0, 5), ++ p0 = svwhilelt_b64 (x0, 5)) ++ ++/* ++** whilelt_rr_b64_s64: ++** whilelt p0\.d, x0, x1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_rr_b64_s64, int64_t, ++ p0 = svwhilelt_b64_s64 (x0, x1), ++ p0 = svwhilelt_b64 (x0, x1)) ++ ++/* ++** whilelt_0r_b64_s64: ++** whilelt p0\.d, xzr, x1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_0r_b64_s64, int64_t, ++ p0 = svwhilelt_b64_s64 (0, x1), ++ p0 = svwhilelt_b64 ((int64_t) 0, x1)) ++ ++/* ++** whilelt_5r_b64_s64: ++** mov (x[0-9]+), #?5 ++** whilelt p0\.d, \1, x1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_5r_b64_s64, int64_t, ++ p0 = svwhilelt_b64_s64 (5, x1), ++ p0 = svwhilelt_b64 ((int64_t) 5, x1)) ++ ++/* ++** whilelt_r0_b64_s64: ++** whilelt p0\.d, x0, xzr ++** ret ++*/ ++TEST_COMPARE_S (whilelt_r0_b64_s64, int64_t, ++ p0 = svwhilelt_b64_s64 (x0, 0), ++ p0 = svwhilelt_b64 (x0, (int64_t) 0)) ++ ++/* ++** whilelt_r5_b64_s64: ++** mov (x[0-9]+), #?5 ++** whilelt p0\.d, x0, \1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_r5_b64_s64, int64_t, ++ p0 = svwhilelt_b64_s64 (x0, 5), ++ p0 = svwhilelt_b64 (x0, (int64_t) 5)) ++ ++/* ++** whilelt_rr_b64_u32: ++** whilelo p0\.d, w0, w1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_rr_b64_u32, uint32_t, ++ p0 = svwhilelt_b64_u32 (x0, x1), ++ p0 = svwhilelt_b64 (x0, x1)) ++ ++/* ++** whilelt_0r_b64_u32: ++** whilelo p0\.d, wzr, w1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_0r_b64_u32, uint32_t, ++ p0 = svwhilelt_b64_u32 (0, x1), ++ p0 = svwhilelt_b64 ((uint32_t) 0, x1)) ++ ++/* ++** whilelt_5r_b64_u32: ++** mov (w[0-9]+), #?5 ++** whilelo p0\.d, \1, w1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_5r_b64_u32, uint32_t, ++ p0 = svwhilelt_b64_u32 (5, x1), ++ p0 = svwhilelt_b64 ((uint32_t) 5, x1)) ++ ++/* ++** whilelt_r5_b64_u32: ++** mov (w[0-9]+), #?5 ++** whilelo p0\.d, w0, \1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_r5_b64_u32, uint32_t, ++ p0 = svwhilelt_b64_u32 (x0, 5), ++ p0 = svwhilelt_b64 (x0, (uint32_t) 5)) ++ ++/* ++** whilelt_rr_b64_u64: ++** whilelo p0\.d, x0, x1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_rr_b64_u64, uint64_t, ++ p0 = svwhilelt_b64_u64 (x0, x1), ++ p0 = svwhilelt_b64 (x0, x1)) ++ ++/* ++** whilelt_0r_b64_u64: ++** whilelo p0\.d, xzr, x1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_0r_b64_u64, uint64_t, ++ p0 = svwhilelt_b64_u64 (0, x1), ++ p0 = svwhilelt_b64 ((uint64_t) 0, x1)) ++ ++/* ++** whilelt_5r_b64_u64: ++** mov (x[0-9]+), #?5 ++** whilelo p0\.d, \1, x1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_5r_b64_u64, uint64_t, ++ p0 = svwhilelt_b64_u64 (5, x1), ++ p0 = svwhilelt_b64 ((uint64_t) 5, x1)) ++ ++/* ++** whilelt_r5_b64_u64: ++** mov (x[0-9]+), #?5 ++** whilelo p0\.d, x0, \1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_r5_b64_u64, uint64_t, ++ p0 = 
svwhilelt_b64_u64 (x0, 5), ++ p0 = svwhilelt_b64 (x0, (uint64_t) 5)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/whilelt_b8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/whilelt_b8.c +new file mode 100644 +index 000000000..5b6a5c44d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/whilelt_b8.c +@@ -0,0 +1,173 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** whilelt_rr_b8_s32: ++** whilelt p0\.b, w0, w1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_rr_b8_s32, int32_t, ++ p0 = svwhilelt_b8_s32 (x0, x1), ++ p0 = svwhilelt_b8 (x0, x1)) ++ ++/* ++** whilelt_0r_b8_s32: ++** whilelt p0\.b, wzr, w1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_0r_b8_s32, int32_t, ++ p0 = svwhilelt_b8_s32 (0, x1), ++ p0 = svwhilelt_b8 (0, x1)) ++ ++/* ++** whilelt_5r_b8_s32: ++** mov (w[0-9]+), #?5 ++** whilelt p0\.b, \1, w1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_5r_b8_s32, int32_t, ++ p0 = svwhilelt_b8_s32 (5, x1), ++ p0 = svwhilelt_b8 (5, x1)) ++ ++/* ++** whilelt_r0_b8_s32: ++** whilelt p0\.b, w0, wzr ++** ret ++*/ ++TEST_COMPARE_S (whilelt_r0_b8_s32, int32_t, ++ p0 = svwhilelt_b8_s32 (x0, 0), ++ p0 = svwhilelt_b8 (x0, 0)) ++ ++/* ++** whilelt_r5_b8_s32: ++** mov (w[0-9]+), #?5 ++** whilelt p0\.b, w0, \1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_r5_b8_s32, int32_t, ++ p0 = svwhilelt_b8_s32 (x0, 5), ++ p0 = svwhilelt_b8 (x0, 5)) ++ ++/* ++** whilelt_rr_b8_s64: ++** whilelt p0\.b, x0, x1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_rr_b8_s64, int64_t, ++ p0 = svwhilelt_b8_s64 (x0, x1), ++ p0 = svwhilelt_b8 (x0, x1)) ++ ++/* ++** whilelt_0r_b8_s64: ++** whilelt p0\.b, xzr, x1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_0r_b8_s64, int64_t, ++ p0 = svwhilelt_b8_s64 (0, x1), ++ p0 = svwhilelt_b8 ((int64_t) 0, x1)) ++ ++/* ++** whilelt_5r_b8_s64: ++** mov (x[0-9]+), #?5 ++** whilelt p0\.b, \1, x1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_5r_b8_s64, int64_t, ++ p0 = svwhilelt_b8_s64 (5, x1), ++ p0 = svwhilelt_b8 ((int64_t) 5, x1)) ++ ++/* ++** whilelt_r0_b8_s64: ++** whilelt p0\.b, x0, xzr ++** ret ++*/ ++TEST_COMPARE_S (whilelt_r0_b8_s64, int64_t, ++ p0 = svwhilelt_b8_s64 (x0, 0), ++ p0 = svwhilelt_b8 (x0, (int64_t) 0)) ++ ++/* ++** whilelt_r5_b8_s64: ++** mov (x[0-9]+), #?5 ++** whilelt p0\.b, x0, \1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_r5_b8_s64, int64_t, ++ p0 = svwhilelt_b8_s64 (x0, 5), ++ p0 = svwhilelt_b8 (x0, (int64_t) 5)) ++ ++/* ++** whilelt_rr_b8_u32: ++** whilelo p0\.b, w0, w1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_rr_b8_u32, uint32_t, ++ p0 = svwhilelt_b8_u32 (x0, x1), ++ p0 = svwhilelt_b8 (x0, x1)) ++ ++/* ++** whilelt_0r_b8_u32: ++** whilelo p0\.b, wzr, w1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_0r_b8_u32, uint32_t, ++ p0 = svwhilelt_b8_u32 (0, x1), ++ p0 = svwhilelt_b8 ((uint32_t) 0, x1)) ++ ++/* ++** whilelt_5r_b8_u32: ++** mov (w[0-9]+), #?5 ++** whilelo p0\.b, \1, w1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_5r_b8_u32, uint32_t, ++ p0 = svwhilelt_b8_u32 (5, x1), ++ p0 = svwhilelt_b8 ((uint32_t) 5, x1)) ++ ++/* ++** whilelt_r5_b8_u32: ++** mov (w[0-9]+), #?5 ++** whilelo p0\.b, w0, \1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_r5_b8_u32, uint32_t, ++ p0 = svwhilelt_b8_u32 (x0, 5), ++ p0 = svwhilelt_b8 (x0, (uint32_t) 5)) ++ ++/* ++** whilelt_rr_b8_u64: ++** whilelo p0\.b, x0, x1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_rr_b8_u64, uint64_t, ++ p0 = svwhilelt_b8_u64 (x0, x1), ++ p0 = svwhilelt_b8 (x0, x1)) ++ ++/* ++** whilelt_0r_b8_u64: ++** whilelo p0\.b, xzr, x1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_0r_b8_u64, uint64_t, 
++ p0 = svwhilelt_b8_u64 (0, x1), ++ p0 = svwhilelt_b8 ((uint64_t) 0, x1)) ++ ++/* ++** whilelt_5r_b8_u64: ++** mov (x[0-9]+), #?5 ++** whilelo p0\.b, \1, x1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_5r_b8_u64, uint64_t, ++ p0 = svwhilelt_b8_u64 (5, x1), ++ p0 = svwhilelt_b8 ((uint64_t) 5, x1)) ++ ++/* ++** whilelt_r5_b8_u64: ++** mov (x[0-9]+), #?5 ++** whilelo p0\.b, x0, \1 ++** ret ++*/ ++TEST_COMPARE_S (whilelt_r5_b8_u64, uint64_t, ++ p0 = svwhilelt_b8_u64 (x0, 5), ++ p0 = svwhilelt_b8 (x0, (uint64_t) 5)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_b16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_b16.c +new file mode 100644 +index 000000000..269260eb4 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_b16.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip1_b16_tied1: ++** zip1 p0\.h, p0\.h, p1\.h ++** ret ++*/ ++TEST_UNIFORM_P (zip1_b16_tied1, ++ p0 = svzip1_b16 (p0, p1), ++ p0 = svzip1_b16 (p0, p1)) ++ ++/* ++** zip1_b16_tied2: ++** zip1 p0\.h, p1\.h, p0\.h ++** ret ++*/ ++TEST_UNIFORM_P (zip1_b16_tied2, ++ p0 = svzip1_b16 (p1, p0), ++ p0 = svzip1_b16 (p1, p0)) ++ ++/* ++** zip1_b16_untied: ++** zip1 p0\.h, p1\.h, p2\.h ++** ret ++*/ ++TEST_UNIFORM_P (zip1_b16_untied, ++ p0 = svzip1_b16 (p1, p2), ++ p0 = svzip1_b16 (p1, p2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_b32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_b32.c +new file mode 100644 +index 000000000..027609a7d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_b32.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip1_b32_tied1: ++** zip1 p0\.s, p0\.s, p1\.s ++** ret ++*/ ++TEST_UNIFORM_P (zip1_b32_tied1, ++ p0 = svzip1_b32 (p0, p1), ++ p0 = svzip1_b32 (p0, p1)) ++ ++/* ++** zip1_b32_tied2: ++** zip1 p0\.s, p1\.s, p0\.s ++** ret ++*/ ++TEST_UNIFORM_P (zip1_b32_tied2, ++ p0 = svzip1_b32 (p1, p0), ++ p0 = svzip1_b32 (p1, p0)) ++ ++/* ++** zip1_b32_untied: ++** zip1 p0\.s, p1\.s, p2\.s ++** ret ++*/ ++TEST_UNIFORM_P (zip1_b32_untied, ++ p0 = svzip1_b32 (p1, p2), ++ p0 = svzip1_b32 (p1, p2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_b64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_b64.c +new file mode 100644 +index 000000000..8add16d8e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_b64.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip1_b64_tied1: ++** zip1 p0\.d, p0\.d, p1\.d ++** ret ++*/ ++TEST_UNIFORM_P (zip1_b64_tied1, ++ p0 = svzip1_b64 (p0, p1), ++ p0 = svzip1_b64 (p0, p1)) ++ ++/* ++** zip1_b64_tied2: ++** zip1 p0\.d, p1\.d, p0\.d ++** ret ++*/ ++TEST_UNIFORM_P (zip1_b64_tied2, ++ p0 = svzip1_b64 (p1, p0), ++ p0 = svzip1_b64 (p1, p0)) ++ ++/* ++** zip1_b64_untied: ++** zip1 p0\.d, p1\.d, p2\.d ++** ret ++*/ ++TEST_UNIFORM_P (zip1_b64_untied, ++ p0 = svzip1_b64 (p1, p2), ++ p0 = svzip1_b64 (p1, p2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_b8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_b8.c +new file mode 100644 +index 000000000..8648298ac +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_b8.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip1_b8_tied1: ++** zip1 
p0\.b, p0\.b, p1\.b ++** ret ++*/ ++TEST_UNIFORM_P (zip1_b8_tied1, ++ p0 = svzip1_b8 (p0, p1), ++ p0 = svzip1_b8 (p0, p1)) ++ ++/* ++** zip1_b8_tied2: ++** zip1 p0\.b, p1\.b, p0\.b ++** ret ++*/ ++TEST_UNIFORM_P (zip1_b8_tied2, ++ p0 = svzip1_b8 (p1, p0), ++ p0 = svzip1_b8 (p1, p0)) ++ ++/* ++** zip1_b8_untied: ++** zip1 p0\.b, p1\.b, p2\.b ++** ret ++*/ ++TEST_UNIFORM_P (zip1_b8_untied, ++ p0 = svzip1_b8 (p1, p2), ++ p0 = svzip1_b8 (p1, p2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_bf16.c +new file mode 100644 +index 000000000..6017cde41 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_bf16.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip1_bf16_tied1: ++** zip1 z0\.h, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (zip1_bf16_tied1, svbfloat16_t, ++ z0 = svzip1_bf16 (z0, z1), ++ z0 = svzip1 (z0, z1)) ++ ++/* ++** zip1_bf16_tied2: ++** zip1 z0\.h, z1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (zip1_bf16_tied2, svbfloat16_t, ++ z0 = svzip1_bf16 (z1, z0), ++ z0 = svzip1 (z1, z0)) ++ ++/* ++** zip1_bf16_untied: ++** zip1 z0\.h, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (zip1_bf16_untied, svbfloat16_t, ++ z0 = svzip1_bf16 (z1, z2), ++ z0 = svzip1 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_f16.c +new file mode 100644 +index 000000000..1c6ce4e7d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_f16.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip1_f16_tied1: ++** zip1 z0\.h, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (zip1_f16_tied1, svfloat16_t, ++ z0 = svzip1_f16 (z0, z1), ++ z0 = svzip1 (z0, z1)) ++ ++/* ++** zip1_f16_tied2: ++** zip1 z0\.h, z1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (zip1_f16_tied2, svfloat16_t, ++ z0 = svzip1_f16 (z1, z0), ++ z0 = svzip1 (z1, z0)) ++ ++/* ++** zip1_f16_untied: ++** zip1 z0\.h, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (zip1_f16_untied, svfloat16_t, ++ z0 = svzip1_f16 (z1, z2), ++ z0 = svzip1 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_f32.c +new file mode 100644 +index 000000000..288ceff3f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_f32.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip1_f32_tied1: ++** zip1 z0\.s, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (zip1_f32_tied1, svfloat32_t, ++ z0 = svzip1_f32 (z0, z1), ++ z0 = svzip1 (z0, z1)) ++ ++/* ++** zip1_f32_tied2: ++** zip1 z0\.s, z1\.s, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (zip1_f32_tied2, svfloat32_t, ++ z0 = svzip1_f32 (z1, z0), ++ z0 = svzip1 (z1, z0)) ++ ++/* ++** zip1_f32_untied: ++** zip1 z0\.s, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (zip1_f32_untied, svfloat32_t, ++ z0 = svzip1_f32 (z1, z2), ++ z0 = svzip1 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_f64.c +new file mode 100644 +index 000000000..5abbea1cd +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_f64.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* 
++** zip1_f64_tied1: ++** zip1 z0\.d, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (zip1_f64_tied1, svfloat64_t, ++ z0 = svzip1_f64 (z0, z1), ++ z0 = svzip1 (z0, z1)) ++ ++/* ++** zip1_f64_tied2: ++** zip1 z0\.d, z1\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (zip1_f64_tied2, svfloat64_t, ++ z0 = svzip1_f64 (z1, z0), ++ z0 = svzip1 (z1, z0)) ++ ++/* ++** zip1_f64_untied: ++** zip1 z0\.d, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (zip1_f64_untied, svfloat64_t, ++ z0 = svzip1_f64 (z1, z2), ++ z0 = svzip1 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_s16.c +new file mode 100644 +index 000000000..8ecd20142 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_s16.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip1_s16_tied1: ++** zip1 z0\.h, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (zip1_s16_tied1, svint16_t, ++ z0 = svzip1_s16 (z0, z1), ++ z0 = svzip1 (z0, z1)) ++ ++/* ++** zip1_s16_tied2: ++** zip1 z0\.h, z1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (zip1_s16_tied2, svint16_t, ++ z0 = svzip1_s16 (z1, z0), ++ z0 = svzip1 (z1, z0)) ++ ++/* ++** zip1_s16_untied: ++** zip1 z0\.h, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (zip1_s16_untied, svint16_t, ++ z0 = svzip1_s16 (z1, z2), ++ z0 = svzip1 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_s32.c +new file mode 100644 +index 000000000..c523885ea +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_s32.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip1_s32_tied1: ++** zip1 z0\.s, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (zip1_s32_tied1, svint32_t, ++ z0 = svzip1_s32 (z0, z1), ++ z0 = svzip1 (z0, z1)) ++ ++/* ++** zip1_s32_tied2: ++** zip1 z0\.s, z1\.s, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (zip1_s32_tied2, svint32_t, ++ z0 = svzip1_s32 (z1, z0), ++ z0 = svzip1 (z1, z0)) ++ ++/* ++** zip1_s32_untied: ++** zip1 z0\.s, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (zip1_s32_untied, svint32_t, ++ z0 = svzip1_s32 (z1, z2), ++ z0 = svzip1 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_s64.c +new file mode 100644 +index 000000000..d1dca7ee9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_s64.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip1_s64_tied1: ++** zip1 z0\.d, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (zip1_s64_tied1, svint64_t, ++ z0 = svzip1_s64 (z0, z1), ++ z0 = svzip1 (z0, z1)) ++ ++/* ++** zip1_s64_tied2: ++** zip1 z0\.d, z1\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (zip1_s64_tied2, svint64_t, ++ z0 = svzip1_s64 (z1, z0), ++ z0 = svzip1 (z1, z0)) ++ ++/* ++** zip1_s64_untied: ++** zip1 z0\.d, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (zip1_s64_untied, svint64_t, ++ z0 = svzip1_s64 (z1, z2), ++ z0 = svzip1 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_s8.c +new file mode 100644 +index 000000000..1600ab586 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_s8.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ 
++#include "test_sve_acle.h" ++ ++/* ++** zip1_s8_tied1: ++** zip1 z0\.b, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (zip1_s8_tied1, svint8_t, ++ z0 = svzip1_s8 (z0, z1), ++ z0 = svzip1 (z0, z1)) ++ ++/* ++** zip1_s8_tied2: ++** zip1 z0\.b, z1\.b, z0\.b ++** ret ++*/ ++TEST_UNIFORM_Z (zip1_s8_tied2, svint8_t, ++ z0 = svzip1_s8 (z1, z0), ++ z0 = svzip1 (z1, z0)) ++ ++/* ++** zip1_s8_untied: ++** zip1 z0\.b, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (zip1_s8_untied, svint8_t, ++ z0 = svzip1_s8 (z1, z2), ++ z0 = svzip1 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_u16.c +new file mode 100644 +index 000000000..3773ed22f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_u16.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip1_u16_tied1: ++** zip1 z0\.h, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (zip1_u16_tied1, svuint16_t, ++ z0 = svzip1_u16 (z0, z1), ++ z0 = svzip1 (z0, z1)) ++ ++/* ++** zip1_u16_tied2: ++** zip1 z0\.h, z1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (zip1_u16_tied2, svuint16_t, ++ z0 = svzip1_u16 (z1, z0), ++ z0 = svzip1 (z1, z0)) ++ ++/* ++** zip1_u16_untied: ++** zip1 z0\.h, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (zip1_u16_untied, svuint16_t, ++ z0 = svzip1_u16 (z1, z2), ++ z0 = svzip1 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_u32.c +new file mode 100644 +index 000000000..e67c121e5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_u32.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip1_u32_tied1: ++** zip1 z0\.s, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (zip1_u32_tied1, svuint32_t, ++ z0 = svzip1_u32 (z0, z1), ++ z0 = svzip1 (z0, z1)) ++ ++/* ++** zip1_u32_tied2: ++** zip1 z0\.s, z1\.s, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (zip1_u32_tied2, svuint32_t, ++ z0 = svzip1_u32 (z1, z0), ++ z0 = svzip1 (z1, z0)) ++ ++/* ++** zip1_u32_untied: ++** zip1 z0\.s, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (zip1_u32_untied, svuint32_t, ++ z0 = svzip1_u32 (z1, z2), ++ z0 = svzip1 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_u64.c +new file mode 100644 +index 000000000..bb6380a6a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_u64.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip1_u64_tied1: ++** zip1 z0\.d, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (zip1_u64_tied1, svuint64_t, ++ z0 = svzip1_u64 (z0, z1), ++ z0 = svzip1 (z0, z1)) ++ ++/* ++** zip1_u64_tied2: ++** zip1 z0\.d, z1\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (zip1_u64_tied2, svuint64_t, ++ z0 = svzip1_u64 (z1, z0), ++ z0 = svzip1 (z1, z0)) ++ ++/* ++** zip1_u64_untied: ++** zip1 z0\.d, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (zip1_u64_untied, svuint64_t, ++ z0 = svzip1_u64 (z1, z2), ++ z0 = svzip1 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_u8.c +new file mode 100644 +index 000000000..01d89d4fe +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_u8.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" 
"-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip1_u8_tied1: ++** zip1 z0\.b, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (zip1_u8_tied1, svuint8_t, ++ z0 = svzip1_u8 (z0, z1), ++ z0 = svzip1 (z0, z1)) ++ ++/* ++** zip1_u8_tied2: ++** zip1 z0\.b, z1\.b, z0\.b ++** ret ++*/ ++TEST_UNIFORM_Z (zip1_u8_tied2, svuint8_t, ++ z0 = svzip1_u8 (z1, z0), ++ z0 = svzip1 (z1, z0)) ++ ++/* ++** zip1_u8_untied: ++** zip1 z0\.b, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (zip1_u8_untied, svuint8_t, ++ z0 = svzip1_u8 (z1, z2), ++ z0 = svzip1 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_bf16.c +new file mode 100644 +index 000000000..aabf7c0e1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_bf16.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip1q_bf16_tied1: ++** zip1 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip1q_bf16_tied1, svbfloat16_t, ++ z0 = svzip1q_bf16 (z0, z1), ++ z0 = svzip1q (z0, z1)) ++ ++/* ++** zip1q_bf16_tied2: ++** zip1 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip1q_bf16_tied2, svbfloat16_t, ++ z0 = svzip1q_bf16 (z1, z0), ++ z0 = svzip1q (z1, z0)) ++ ++/* ++** zip1q_bf16_untied: ++** zip1 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip1q_bf16_untied, svbfloat16_t, ++ z0 = svzip1q_bf16 (z1, z2), ++ z0 = svzip1q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_f16.c +new file mode 100644 +index 000000000..1170cc5e7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_f16.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip1q_f16_tied1: ++** zip1 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip1q_f16_tied1, svfloat16_t, ++ z0 = svzip1q_f16 (z0, z1), ++ z0 = svzip1q (z0, z1)) ++ ++/* ++** zip1q_f16_tied2: ++** zip1 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip1q_f16_tied2, svfloat16_t, ++ z0 = svzip1q_f16 (z1, z0), ++ z0 = svzip1q (z1, z0)) ++ ++/* ++** zip1q_f16_untied: ++** zip1 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip1q_f16_untied, svfloat16_t, ++ z0 = svzip1q_f16 (z1, z2), ++ z0 = svzip1q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_f32.c +new file mode 100644 +index 000000000..09666da1b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_f32.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip1q_f32_tied1: ++** zip1 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip1q_f32_tied1, svfloat32_t, ++ z0 = svzip1q_f32 (z0, z1), ++ z0 = svzip1q (z0, z1)) ++ ++/* ++** zip1q_f32_tied2: ++** zip1 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip1q_f32_tied2, svfloat32_t, ++ z0 = svzip1q_f32 (z1, z0), ++ z0 = svzip1q (z1, z0)) ++ ++/* ++** zip1q_f32_untied: ++** zip1 
z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip1q_f32_untied, svfloat32_t, ++ z0 = svzip1q_f32 (z1, z2), ++ z0 = svzip1q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_f64.c +new file mode 100644 +index 000000000..d77fb1c90 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_f64.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip1q_f64_tied1: ++** zip1 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip1q_f64_tied1, svfloat64_t, ++ z0 = svzip1q_f64 (z0, z1), ++ z0 = svzip1q (z0, z1)) ++ ++/* ++** zip1q_f64_tied2: ++** zip1 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip1q_f64_tied2, svfloat64_t, ++ z0 = svzip1q_f64 (z1, z0), ++ z0 = svzip1q (z1, z0)) ++ ++/* ++** zip1q_f64_untied: ++** zip1 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip1q_f64_untied, svfloat64_t, ++ z0 = svzip1q_f64 (z1, z2), ++ z0 = svzip1q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_s16.c +new file mode 100644 +index 000000000..92a6b5514 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_s16.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip1q_s16_tied1: ++** zip1 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip1q_s16_tied1, svint16_t, ++ z0 = svzip1q_s16 (z0, z1), ++ z0 = svzip1q (z0, z1)) ++ ++/* ++** zip1q_s16_tied2: ++** zip1 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip1q_s16_tied2, svint16_t, ++ z0 = svzip1q_s16 (z1, z0), ++ z0 = svzip1q (z1, z0)) ++ ++/* ++** zip1q_s16_untied: ++** zip1 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip1q_s16_untied, svint16_t, ++ z0 = svzip1q_s16 (z1, z2), ++ z0 = svzip1q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_s32.c +new file mode 100644 +index 000000000..a918d2d4c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_s32.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip1q_s32_tied1: ++** zip1 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip1q_s32_tied1, svint32_t, ++ z0 = svzip1q_s32 (z0, z1), ++ z0 = svzip1q (z0, z1)) ++ ++/* ++** zip1q_s32_tied2: ++** zip1 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip1q_s32_tied2, svint32_t, ++ z0 = svzip1q_s32 (z1, z0), ++ z0 = svzip1q (z1, z0)) ++ ++/* ++** zip1q_s32_untied: ++** zip1 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip1q_s32_untied, svint32_t, ++ z0 = svzip1q_s32 (z1, z2), ++ z0 = svzip1q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_s64.c +new file mode 100644 +index 000000000..be3524fd5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_s64.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target 
aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip1q_s64_tied1: ++** zip1 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip1q_s64_tied1, svint64_t, ++ z0 = svzip1q_s64 (z0, z1), ++ z0 = svzip1q (z0, z1)) ++ ++/* ++** zip1q_s64_tied2: ++** zip1 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip1q_s64_tied2, svint64_t, ++ z0 = svzip1q_s64 (z1, z0), ++ z0 = svzip1q (z1, z0)) ++ ++/* ++** zip1q_s64_untied: ++** zip1 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip1q_s64_untied, svint64_t, ++ z0 = svzip1q_s64 (z1, z2), ++ z0 = svzip1q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_s8.c +new file mode 100644 +index 000000000..24ea2399c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_s8.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip1q_s8_tied1: ++** zip1 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip1q_s8_tied1, svint8_t, ++ z0 = svzip1q_s8 (z0, z1), ++ z0 = svzip1q (z0, z1)) ++ ++/* ++** zip1q_s8_tied2: ++** zip1 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip1q_s8_tied2, svint8_t, ++ z0 = svzip1q_s8 (z1, z0), ++ z0 = svzip1q (z1, z0)) ++ ++/* ++** zip1q_s8_untied: ++** zip1 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip1q_s8_untied, svint8_t, ++ z0 = svzip1q_s8 (z1, z2), ++ z0 = svzip1q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_u16.c +new file mode 100644 +index 000000000..65caf9706 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_u16.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip1q_u16_tied1: ++** zip1 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip1q_u16_tied1, svuint16_t, ++ z0 = svzip1q_u16 (z0, z1), ++ z0 = svzip1q (z0, z1)) ++ ++/* ++** zip1q_u16_tied2: ++** zip1 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip1q_u16_tied2, svuint16_t, ++ z0 = svzip1q_u16 (z1, z0), ++ z0 = svzip1q (z1, z0)) ++ ++/* ++** zip1q_u16_untied: ++** zip1 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip1q_u16_untied, svuint16_t, ++ z0 = svzip1q_u16 (z1, z2), ++ z0 = svzip1q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_u32.c +new file mode 100644 +index 000000000..abd76b74f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_u32.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip1q_u32_tied1: ++** zip1 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip1q_u32_tied1, svuint32_t, ++ z0 = svzip1q_u32 (z0, z1), ++ z0 = svzip1q (z0, z1)) ++ ++/* ++** zip1q_u32_tied2: ++** zip1 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip1q_u32_tied2, 
svuint32_t, ++ z0 = svzip1q_u32 (z1, z0), ++ z0 = svzip1q (z1, z0)) ++ ++/* ++** zip1q_u32_untied: ++** zip1 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip1q_u32_untied, svuint32_t, ++ z0 = svzip1q_u32 (z1, z2), ++ z0 = svzip1q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_u64.c +new file mode 100644 +index 000000000..0e91929b7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_u64.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip1q_u64_tied1: ++** zip1 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip1q_u64_tied1, svuint64_t, ++ z0 = svzip1q_u64 (z0, z1), ++ z0 = svzip1q (z0, z1)) ++ ++/* ++** zip1q_u64_tied2: ++** zip1 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip1q_u64_tied2, svuint64_t, ++ z0 = svzip1q_u64 (z1, z0), ++ z0 = svzip1q (z1, z0)) ++ ++/* ++** zip1q_u64_untied: ++** zip1 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip1q_u64_untied, svuint64_t, ++ z0 = svzip1q_u64 (z1, z2), ++ z0 = svzip1q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_u8.c +new file mode 100644 +index 000000000..07d484b0b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_u8.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip1q_u8_tied1: ++** zip1 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip1q_u8_tied1, svuint8_t, ++ z0 = svzip1q_u8 (z0, z1), ++ z0 = svzip1q (z0, z1)) ++ ++/* ++** zip1q_u8_tied2: ++** zip1 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip1q_u8_tied2, svuint8_t, ++ z0 = svzip1q_u8 (z1, z0), ++ z0 = svzip1q (z1, z0)) ++ ++/* ++** zip1q_u8_untied: ++** zip1 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip1q_u8_untied, svuint8_t, ++ z0 = svzip1q_u8 (z1, z2), ++ z0 = svzip1q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_b16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_b16.c +new file mode 100644 +index 000000000..5624c9815 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_b16.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip2_b16_tied1: ++** zip2 p0\.h, p0\.h, p1\.h ++** ret ++*/ ++TEST_UNIFORM_P (zip2_b16_tied1, ++ p0 = svzip2_b16 (p0, p1), ++ p0 = svzip2_b16 (p0, p1)) ++ ++/* ++** zip2_b16_tied2: ++** zip2 p0\.h, p1\.h, p0\.h ++** ret ++*/ ++TEST_UNIFORM_P (zip2_b16_tied2, ++ p0 = svzip2_b16 (p1, p0), ++ p0 = svzip2_b16 (p1, p0)) ++ ++/* ++** zip2_b16_untied: ++** zip2 p0\.h, p1\.h, p2\.h ++** ret ++*/ ++TEST_UNIFORM_P (zip2_b16_untied, ++ p0 = svzip2_b16 (p1, p2), ++ p0 = svzip2_b16 (p1, p2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_b32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_b32.c +new file mode 100644 +index 000000000..b73d5b490 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_b32.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" 
++ ++/* ++** zip2_b32_tied1: ++** zip2 p0\.s, p0\.s, p1\.s ++** ret ++*/ ++TEST_UNIFORM_P (zip2_b32_tied1, ++ p0 = svzip2_b32 (p0, p1), ++ p0 = svzip2_b32 (p0, p1)) ++ ++/* ++** zip2_b32_tied2: ++** zip2 p0\.s, p1\.s, p0\.s ++** ret ++*/ ++TEST_UNIFORM_P (zip2_b32_tied2, ++ p0 = svzip2_b32 (p1, p0), ++ p0 = svzip2_b32 (p1, p0)) ++ ++/* ++** zip2_b32_untied: ++** zip2 p0\.s, p1\.s, p2\.s ++** ret ++*/ ++TEST_UNIFORM_P (zip2_b32_untied, ++ p0 = svzip2_b32 (p1, p2), ++ p0 = svzip2_b32 (p1, p2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_b64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_b64.c +new file mode 100644 +index 000000000..9ebf050b8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_b64.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip2_b64_tied1: ++** zip2 p0\.d, p0\.d, p1\.d ++** ret ++*/ ++TEST_UNIFORM_P (zip2_b64_tied1, ++ p0 = svzip2_b64 (p0, p1), ++ p0 = svzip2_b64 (p0, p1)) ++ ++/* ++** zip2_b64_tied2: ++** zip2 p0\.d, p1\.d, p0\.d ++** ret ++*/ ++TEST_UNIFORM_P (zip2_b64_tied2, ++ p0 = svzip2_b64 (p1, p0), ++ p0 = svzip2_b64 (p1, p0)) ++ ++/* ++** zip2_b64_untied: ++** zip2 p0\.d, p1\.d, p2\.d ++** ret ++*/ ++TEST_UNIFORM_P (zip2_b64_untied, ++ p0 = svzip2_b64 (p1, p2), ++ p0 = svzip2_b64 (p1, p2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_b8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_b8.c +new file mode 100644 +index 000000000..223a22f99 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_b8.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip2_b8_tied1: ++** zip2 p0\.b, p0\.b, p1\.b ++** ret ++*/ ++TEST_UNIFORM_P (zip2_b8_tied1, ++ p0 = svzip2_b8 (p0, p1), ++ p0 = svzip2_b8 (p0, p1)) ++ ++/* ++** zip2_b8_tied2: ++** zip2 p0\.b, p1\.b, p0\.b ++** ret ++*/ ++TEST_UNIFORM_P (zip2_b8_tied2, ++ p0 = svzip2_b8 (p1, p0), ++ p0 = svzip2_b8 (p1, p0)) ++ ++/* ++** zip2_b8_untied: ++** zip2 p0\.b, p1\.b, p2\.b ++** ret ++*/ ++TEST_UNIFORM_P (zip2_b8_untied, ++ p0 = svzip2_b8 (p1, p2), ++ p0 = svzip2_b8 (p1, p2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_bf16.c +new file mode 100644 +index 000000000..a9e0cfc93 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_bf16.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip2_bf16_tied1: ++** zip2 z0\.h, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (zip2_bf16_tied1, svbfloat16_t, ++ z0 = svzip2_bf16 (z0, z1), ++ z0 = svzip2 (z0, z1)) ++ ++/* ++** zip2_bf16_tied2: ++** zip2 z0\.h, z1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (zip2_bf16_tied2, svbfloat16_t, ++ z0 = svzip2_bf16 (z1, z0), ++ z0 = svzip2 (z1, z0)) ++ ++/* ++** zip2_bf16_untied: ++** zip2 z0\.h, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (zip2_bf16_untied, svbfloat16_t, ++ z0 = svzip2_bf16 (z1, z2), ++ z0 = svzip2 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_f16.c +new file mode 100644 +index 000000000..73d4272bc +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_f16.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** 
zip2_f16_tied1: ++** zip2 z0\.h, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (zip2_f16_tied1, svfloat16_t, ++ z0 = svzip2_f16 (z0, z1), ++ z0 = svzip2 (z0, z1)) ++ ++/* ++** zip2_f16_tied2: ++** zip2 z0\.h, z1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (zip2_f16_tied2, svfloat16_t, ++ z0 = svzip2_f16 (z1, z0), ++ z0 = svzip2 (z1, z0)) ++ ++/* ++** zip2_f16_untied: ++** zip2 z0\.h, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (zip2_f16_untied, svfloat16_t, ++ z0 = svzip2_f16 (z1, z2), ++ z0 = svzip2 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_f32.c +new file mode 100644 +index 000000000..2ad8ff81d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_f32.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip2_f32_tied1: ++** zip2 z0\.s, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (zip2_f32_tied1, svfloat32_t, ++ z0 = svzip2_f32 (z0, z1), ++ z0 = svzip2 (z0, z1)) ++ ++/* ++** zip2_f32_tied2: ++** zip2 z0\.s, z1\.s, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (zip2_f32_tied2, svfloat32_t, ++ z0 = svzip2_f32 (z1, z0), ++ z0 = svzip2 (z1, z0)) ++ ++/* ++** zip2_f32_untied: ++** zip2 z0\.s, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (zip2_f32_untied, svfloat32_t, ++ z0 = svzip2_f32 (z1, z2), ++ z0 = svzip2 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_f64.c +new file mode 100644 +index 000000000..de5c2646f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_f64.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip2_f64_tied1: ++** zip2 z0\.d, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (zip2_f64_tied1, svfloat64_t, ++ z0 = svzip2_f64 (z0, z1), ++ z0 = svzip2 (z0, z1)) ++ ++/* ++** zip2_f64_tied2: ++** zip2 z0\.d, z1\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (zip2_f64_tied2, svfloat64_t, ++ z0 = svzip2_f64 (z1, z0), ++ z0 = svzip2 (z1, z0)) ++ ++/* ++** zip2_f64_untied: ++** zip2 z0\.d, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (zip2_f64_untied, svfloat64_t, ++ z0 = svzip2_f64 (z1, z2), ++ z0 = svzip2 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_s16.c +new file mode 100644 +index 000000000..fc366c991 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_s16.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip2_s16_tied1: ++** zip2 z0\.h, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (zip2_s16_tied1, svint16_t, ++ z0 = svzip2_s16 (z0, z1), ++ z0 = svzip2 (z0, z1)) ++ ++/* ++** zip2_s16_tied2: ++** zip2 z0\.h, z1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (zip2_s16_tied2, svint16_t, ++ z0 = svzip2_s16 (z1, z0), ++ z0 = svzip2 (z1, z0)) ++ ++/* ++** zip2_s16_untied: ++** zip2 z0\.h, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (zip2_s16_untied, svint16_t, ++ z0 = svzip2_s16 (z1, z2), ++ z0 = svzip2 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_s32.c +new file mode 100644 +index 000000000..e56934d26 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_s32.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } 
*/ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip2_s32_tied1: ++** zip2 z0\.s, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (zip2_s32_tied1, svint32_t, ++ z0 = svzip2_s32 (z0, z1), ++ z0 = svzip2 (z0, z1)) ++ ++/* ++** zip2_s32_tied2: ++** zip2 z0\.s, z1\.s, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (zip2_s32_tied2, svint32_t, ++ z0 = svzip2_s32 (z1, z0), ++ z0 = svzip2 (z1, z0)) ++ ++/* ++** zip2_s32_untied: ++** zip2 z0\.s, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (zip2_s32_untied, svint32_t, ++ z0 = svzip2_s32 (z1, z2), ++ z0 = svzip2 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_s64.c +new file mode 100644 +index 000000000..cefc73b72 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_s64.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip2_s64_tied1: ++** zip2 z0\.d, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (zip2_s64_tied1, svint64_t, ++ z0 = svzip2_s64 (z0, z1), ++ z0 = svzip2 (z0, z1)) ++ ++/* ++** zip2_s64_tied2: ++** zip2 z0\.d, z1\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (zip2_s64_tied2, svint64_t, ++ z0 = svzip2_s64 (z1, z0), ++ z0 = svzip2 (z1, z0)) ++ ++/* ++** zip2_s64_untied: ++** zip2 z0\.d, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (zip2_s64_untied, svint64_t, ++ z0 = svzip2_s64 (z1, z2), ++ z0 = svzip2 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_s8.c +new file mode 100644 +index 000000000..452bbce26 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_s8.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip2_s8_tied1: ++** zip2 z0\.b, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (zip2_s8_tied1, svint8_t, ++ z0 = svzip2_s8 (z0, z1), ++ z0 = svzip2 (z0, z1)) ++ ++/* ++** zip2_s8_tied2: ++** zip2 z0\.b, z1\.b, z0\.b ++** ret ++*/ ++TEST_UNIFORM_Z (zip2_s8_tied2, svint8_t, ++ z0 = svzip2_s8 (z1, z0), ++ z0 = svzip2 (z1, z0)) ++ ++/* ++** zip2_s8_untied: ++** zip2 z0\.b, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (zip2_s8_untied, svint8_t, ++ z0 = svzip2_s8 (z1, z2), ++ z0 = svzip2 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_u16.c +new file mode 100644 +index 000000000..9a20b4ed1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_u16.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip2_u16_tied1: ++** zip2 z0\.h, z0\.h, z1\.h ++** ret ++*/ ++TEST_UNIFORM_Z (zip2_u16_tied1, svuint16_t, ++ z0 = svzip2_u16 (z0, z1), ++ z0 = svzip2 (z0, z1)) ++ ++/* ++** zip2_u16_tied2: ++** zip2 z0\.h, z1\.h, z0\.h ++** ret ++*/ ++TEST_UNIFORM_Z (zip2_u16_tied2, svuint16_t, ++ z0 = svzip2_u16 (z1, z0), ++ z0 = svzip2 (z1, z0)) ++ ++/* ++** zip2_u16_untied: ++** zip2 z0\.h, z1\.h, z2\.h ++** ret ++*/ ++TEST_UNIFORM_Z (zip2_u16_untied, svuint16_t, ++ z0 = svzip2_u16 (z1, z2), ++ z0 = svzip2 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_u32.c +new file mode 100644 +index 000000000..70626c66e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_u32.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" 
"-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip2_u32_tied1: ++** zip2 z0\.s, z0\.s, z1\.s ++** ret ++*/ ++TEST_UNIFORM_Z (zip2_u32_tied1, svuint32_t, ++ z0 = svzip2_u32 (z0, z1), ++ z0 = svzip2 (z0, z1)) ++ ++/* ++** zip2_u32_tied2: ++** zip2 z0\.s, z1\.s, z0\.s ++** ret ++*/ ++TEST_UNIFORM_Z (zip2_u32_tied2, svuint32_t, ++ z0 = svzip2_u32 (z1, z0), ++ z0 = svzip2 (z1, z0)) ++ ++/* ++** zip2_u32_untied: ++** zip2 z0\.s, z1\.s, z2\.s ++** ret ++*/ ++TEST_UNIFORM_Z (zip2_u32_untied, svuint32_t, ++ z0 = svzip2_u32 (z1, z2), ++ z0 = svzip2 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_u64.c +new file mode 100644 +index 000000000..43a43ff7c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_u64.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip2_u64_tied1: ++** zip2 z0\.d, z0\.d, z1\.d ++** ret ++*/ ++TEST_UNIFORM_Z (zip2_u64_tied1, svuint64_t, ++ z0 = svzip2_u64 (z0, z1), ++ z0 = svzip2 (z0, z1)) ++ ++/* ++** zip2_u64_tied2: ++** zip2 z0\.d, z1\.d, z0\.d ++** ret ++*/ ++TEST_UNIFORM_Z (zip2_u64_tied2, svuint64_t, ++ z0 = svzip2_u64 (z1, z0), ++ z0 = svzip2 (z1, z0)) ++ ++/* ++** zip2_u64_untied: ++** zip2 z0\.d, z1\.d, z2\.d ++** ret ++*/ ++TEST_UNIFORM_Z (zip2_u64_untied, svuint64_t, ++ z0 = svzip2_u64 (z1, z2), ++ z0 = svzip2 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_u8.c +new file mode 100644 +index 000000000..015f1844b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_u8.c +@@ -0,0 +1,30 @@ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip2_u8_tied1: ++** zip2 z0\.b, z0\.b, z1\.b ++** ret ++*/ ++TEST_UNIFORM_Z (zip2_u8_tied1, svuint8_t, ++ z0 = svzip2_u8 (z0, z1), ++ z0 = svzip2 (z0, z1)) ++ ++/* ++** zip2_u8_tied2: ++** zip2 z0\.b, z1\.b, z0\.b ++** ret ++*/ ++TEST_UNIFORM_Z (zip2_u8_tied2, svuint8_t, ++ z0 = svzip2_u8 (z1, z0), ++ z0 = svzip2 (z1, z0)) ++ ++/* ++** zip2_u8_untied: ++** zip2 z0\.b, z1\.b, z2\.b ++** ret ++*/ ++TEST_UNIFORM_Z (zip2_u8_untied, svuint8_t, ++ z0 = svzip2_u8 (z1, z2), ++ z0 = svzip2 (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_bf16.c +new file mode 100644 +index 000000000..6d79136cf +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_bf16.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip2q_bf16_tied1: ++** zip2 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip2q_bf16_tied1, svbfloat16_t, ++ z0 = svzip2q_bf16 (z0, z1), ++ z0 = svzip2q (z0, z1)) ++ ++/* ++** zip2q_bf16_tied2: ++** zip2 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip2q_bf16_tied2, svbfloat16_t, ++ z0 = svzip2q_bf16 (z1, z0), ++ z0 = svzip2q (z1, z0)) ++ ++/* ++** zip2q_bf16_untied: ++** zip2 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip2q_bf16_untied, svbfloat16_t, ++ z0 = svzip2q_bf16 (z1, z2), ++ z0 = svzip2q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_f16.c +new file 
mode 100644 +index 000000000..984240e19 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_f16.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip2q_f16_tied1: ++** zip2 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip2q_f16_tied1, svfloat16_t, ++ z0 = svzip2q_f16 (z0, z1), ++ z0 = svzip2q (z0, z1)) ++ ++/* ++** zip2q_f16_tied2: ++** zip2 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip2q_f16_tied2, svfloat16_t, ++ z0 = svzip2q_f16 (z1, z0), ++ z0 = svzip2q (z1, z0)) ++ ++/* ++** zip2q_f16_untied: ++** zip2 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip2q_f16_untied, svfloat16_t, ++ z0 = svzip2q_f16 (z1, z2), ++ z0 = svzip2q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_f32.c +new file mode 100644 +index 000000000..0f8ccd804 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_f32.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip2q_f32_tied1: ++** zip2 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip2q_f32_tied1, svfloat32_t, ++ z0 = svzip2q_f32 (z0, z1), ++ z0 = svzip2q (z0, z1)) ++ ++/* ++** zip2q_f32_tied2: ++** zip2 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip2q_f32_tied2, svfloat32_t, ++ z0 = svzip2q_f32 (z1, z0), ++ z0 = svzip2q (z1, z0)) ++ ++/* ++** zip2q_f32_untied: ++** zip2 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip2q_f32_untied, svfloat32_t, ++ z0 = svzip2q_f32 (z1, z2), ++ z0 = svzip2q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_f64.c +new file mode 100644 +index 000000000..b5411cff7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_f64.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip2q_f64_tied1: ++** zip2 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip2q_f64_tied1, svfloat64_t, ++ z0 = svzip2q_f64 (z0, z1), ++ z0 = svzip2q (z0, z1)) ++ ++/* ++** zip2q_f64_tied2: ++** zip2 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip2q_f64_tied2, svfloat64_t, ++ z0 = svzip2q_f64 (z1, z0), ++ z0 = svzip2q (z1, z0)) ++ ++/* ++** zip2q_f64_untied: ++** zip2 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip2q_f64_untied, svfloat64_t, ++ z0 = svzip2q_f64 (z1, z2), ++ z0 = svzip2q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_s16.c +new file mode 100644 +index 000000000..66751fc7f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_s16.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip2q_s16_tied1: ++** zip2 z0\.q, z0\.q, z1\.q ++** ret ++*/ 
++TEST_UNIFORM_Z (zip2q_s16_tied1, svint16_t, ++ z0 = svzip2q_s16 (z0, z1), ++ z0 = svzip2q (z0, z1)) ++ ++/* ++** zip2q_s16_tied2: ++** zip2 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip2q_s16_tied2, svint16_t, ++ z0 = svzip2q_s16 (z1, z0), ++ z0 = svzip2q (z1, z0)) ++ ++/* ++** zip2q_s16_untied: ++** zip2 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip2q_s16_untied, svint16_t, ++ z0 = svzip2q_s16 (z1, z2), ++ z0 = svzip2q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_s32.c +new file mode 100644 +index 000000000..830de3311 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_s32.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip2q_s32_tied1: ++** zip2 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip2q_s32_tied1, svint32_t, ++ z0 = svzip2q_s32 (z0, z1), ++ z0 = svzip2q (z0, z1)) ++ ++/* ++** zip2q_s32_tied2: ++** zip2 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip2q_s32_tied2, svint32_t, ++ z0 = svzip2q_s32 (z1, z0), ++ z0 = svzip2q (z1, z0)) ++ ++/* ++** zip2q_s32_untied: ++** zip2 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip2q_s32_untied, svint32_t, ++ z0 = svzip2q_s32 (z1, z2), ++ z0 = svzip2q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_s64.c +new file mode 100644 +index 000000000..917be4f40 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_s64.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip2q_s64_tied1: ++** zip2 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip2q_s64_tied1, svint64_t, ++ z0 = svzip2q_s64 (z0, z1), ++ z0 = svzip2q (z0, z1)) ++ ++/* ++** zip2q_s64_tied2: ++** zip2 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip2q_s64_tied2, svint64_t, ++ z0 = svzip2q_s64 (z1, z0), ++ z0 = svzip2q (z1, z0)) ++ ++/* ++** zip2q_s64_untied: ++** zip2 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip2q_s64_untied, svint64_t, ++ z0 = svzip2q_s64 (z1, z2), ++ z0 = svzip2q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_s8.c +new file mode 100644 +index 000000000..dff6e2d7b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_s8.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip2q_s8_tied1: ++** zip2 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip2q_s8_tied1, svint8_t, ++ z0 = svzip2q_s8 (z0, z1), ++ z0 = svzip2q (z0, z1)) ++ ++/* ++** zip2q_s8_tied2: ++** zip2 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip2q_s8_tied2, svint8_t, ++ z0 = svzip2q_s8 (z1, z0), ++ z0 = svzip2q (z1, z0)) ++ ++/* ++** zip2q_s8_untied: ++** zip2 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip2q_s8_untied, svint8_t, ++ z0 = svzip2q_s8 (z1, z2), ++ z0 = svzip2q (z1, z2)) +diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_u16.c +new file mode 100644 +index 000000000..9e194425c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_u16.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip2q_u16_tied1: ++** zip2 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip2q_u16_tied1, svuint16_t, ++ z0 = svzip2q_u16 (z0, z1), ++ z0 = svzip2q (z0, z1)) ++ ++/* ++** zip2q_u16_tied2: ++** zip2 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip2q_u16_tied2, svuint16_t, ++ z0 = svzip2q_u16 (z1, z0), ++ z0 = svzip2q (z1, z0)) ++ ++/* ++** zip2q_u16_untied: ++** zip2 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip2q_u16_untied, svuint16_t, ++ z0 = svzip2q_u16 (z1, z2), ++ z0 = svzip2q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_u32.c +new file mode 100644 +index 000000000..89de27f6b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_u32.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip2q_u32_tied1: ++** zip2 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip2q_u32_tied1, svuint32_t, ++ z0 = svzip2q_u32 (z0, z1), ++ z0 = svzip2q (z0, z1)) ++ ++/* ++** zip2q_u32_tied2: ++** zip2 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip2q_u32_tied2, svuint32_t, ++ z0 = svzip2q_u32 (z1, z0), ++ z0 = svzip2q (z1, z0)) ++ ++/* ++** zip2q_u32_untied: ++** zip2 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip2q_u32_untied, svuint32_t, ++ z0 = svzip2q_u32 (z1, z2), ++ z0 = svzip2q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_u64.c +new file mode 100644 +index 000000000..f2c9852ac +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_u64.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip2q_u64_tied1: ++** zip2 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip2q_u64_tied1, svuint64_t, ++ z0 = svzip2q_u64 (z0, z1), ++ z0 = svzip2q (z0, z1)) ++ ++/* ++** zip2q_u64_tied2: ++** zip2 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip2q_u64_tied2, svuint64_t, ++ z0 = svzip2q_u64 (z1, z0), ++ z0 = svzip2q (z1, z0)) ++ ++/* ++** zip2q_u64_untied: ++** zip2 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip2q_u64_untied, svuint64_t, ++ z0 = svzip2q_u64 (z1, z2), ++ z0 = svzip2q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_u8.c +new file mode 100644 +index 000000000..a12905586 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_u8.c +@@ -0,0 +1,32 @@ ++/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ ++/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ ++/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } 
*/ ++ ++#include "test_sve_acle.h" ++ ++/* ++** zip2q_u8_tied1: ++** zip2 z0\.q, z0\.q, z1\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip2q_u8_tied1, svuint8_t, ++ z0 = svzip2q_u8 (z0, z1), ++ z0 = svzip2q (z0, z1)) ++ ++/* ++** zip2q_u8_tied2: ++** zip2 z0\.q, z1\.q, z0\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip2q_u8_tied2, svuint8_t, ++ z0 = svzip2q_u8 (z1, z0), ++ z0 = svzip2q (z1, z0)) ++ ++/* ++** zip2q_u8_untied: ++** zip2 z0\.q, z1\.q, z2\.q ++** ret ++*/ ++TEST_UNIFORM_Z (zip2q_u8_untied, svuint8_t, ++ z0 = svzip2q_u8 (z1, z2), ++ z0 = svzip2q (z1, z2)) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/adr_index_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/adr_index_1.c +new file mode 100644 +index 000000000..714265ed1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/adr_index_1.c +@@ -0,0 +1,38 @@ ++/* { dg-do compile } */ ++/* { dg-options "-std=c99" } */ ++ ++#include ++ ++void ++f1 (svbool_t pg, uint32_t *u32_ptr, svuint8_t u8, svuint16_t u16, ++ svint32_t s32, svuint32_t u32, svfloat32_t f32, ++ svint64_t s64, svuint64_t u64, svfloat64_t f64) ++{ ++ svadrh_index (u32); /* { dg-error {too few arguments to function 'svadrh_index'} } */ ++ svadrh_index (u32, u32, u32); /* { dg-error {too many arguments to function 'svadrh_index'} } */ ++ svadrh_index (u32_ptr, s32); /* { dg-error {passing '[^']*\*'[^\n]* to argument 1 of 'svadrh_index', which expects an SVE vector type} } */ ++ svadrh_index (0, s32); /* { dg-error {passing 'int' to argument 1 of 'svadrh_index', which expects an SVE vector type} } */ ++ svadrh_index (u16, u16); /* { dg-error {passing 'svuint16_t' to argument 1 of 'svadrh_index', which expects 'svuint32_t' or 'svuint64_t'} } */ ++ svadrh_index (s32, s32); /* { dg-error {passing 'svint32_t' to argument 1 of 'svadrh_index', which expects 'svuint32_t' or 'svuint64_t'} } */ ++ svadrh_index (f32, s32); /* { dg-error {passing 'svfloat32_t' to argument 1 of 'svadrh_index', which expects 'svuint32_t' or 'svuint64_t'} } */ ++ svadrh_index (pg, s32); /* { dg-error {passing 'svbool_t' to argument 1 of 'svadrh_index', which expects 'svuint32_t' or 'svuint64_t'} } */ ++ ++ svadrh_index (u32, 0); /* { dg-error {passing 'int' to argument 2 of 'svadrh_index', which expects an SVE vector type} } */ ++ svadrh_index (u32, u8); /* { dg-error {passing 'svuint8_t' to argument 2 of 'svadrh_index', which expects a vector of 32-bit or 64-bit integers} } */ ++ svadrh_index (u32, u16); /* { dg-error {passing 'svuint16_t' to argument 2 of 'svadrh_index', which expects a vector of 32-bit or 64-bit integers} } */ ++ svadrh_index (u32, pg); /* { dg-error {passing 'svbool_t' to argument 2 of 'svadrh_index', which expects a vector of integers} } */ ++ ++ svadrh_index (u32, s32); ++ svadrh_index (u32, u32); ++ svadrh_index (u32, f32); /* { dg-error {passing 'svfloat32_t' to argument 2 of 'svadrh_index', which expects a vector of integers} } */ ++ svadrh_index (u32, s64); /* { dg-error {cannot combine a base of type 'svuint32_t' with an index of type 'svint64_t'} } */ ++ svadrh_index (u32, u64); /* { dg-error {cannot combine a base of type 'svuint32_t' with an index of type 'svuint64_t'} } */ ++ svadrh_index (u32, f64); /* { dg-error {passing 'svfloat64_t' to argument 2 of 'svadrh_index', which expects a vector of integers} } */ ++ ++ svadrh_index (u64, s32); /* { dg-error {cannot combine a base of type 'svuint64_t' with an index of type 'svint32_t'} } */ ++ svadrh_index (u64, u32); /* { dg-error {cannot combine a base of type 'svuint64_t' with an index of 
type 'svuint32_t'} } */ ++ svadrh_index (u64, f32); /* { dg-error {passing 'svfloat32_t' to argument 2 of 'svadrh_index', which expects a vector of integers} } */ ++ svadrh_index (u64, s64); ++ svadrh_index (u64, u64); ++ svadrh_index (u64, f64); /* { dg-error {passing 'svfloat64_t' to argument 2 of 'svadrh_index', which expects a vector of integers} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/adr_offset_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/adr_offset_1.c +new file mode 100644 +index 000000000..528d7ac51 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/adr_offset_1.c +@@ -0,0 +1,38 @@ ++/* { dg-do compile } */ ++/* { dg-options "-std=c99" } */ ++ ++#include ++ ++void ++f1 (svbool_t pg, uint32_t *u32_ptr, svuint8_t u8, svuint16_t u16, ++ svint32_t s32, svuint32_t u32, svfloat32_t f32, ++ svint64_t s64, svuint64_t u64, svfloat64_t f64) ++{ ++ svadrb_offset (u32); /* { dg-error {too few arguments to function 'svadrb_offset'} } */ ++ svadrb_offset (u32, u32, u32); /* { dg-error {too many arguments to function 'svadrb_offset'} } */ ++ svadrb_offset (u32_ptr, s32); /* { dg-error {passing '[^']*\*'[^\n]* to argument 1 of 'svadrb_offset', which expects an SVE vector type} } */ ++ svadrb_offset (0, s32); /* { dg-error {passing 'int' to argument 1 of 'svadrb_offset', which expects an SVE vector type} } */ ++ svadrb_offset (u16, u16); /* { dg-error {passing 'svuint16_t' to argument 1 of 'svadrb_offset', which expects 'svuint32_t' or 'svuint64_t'} } */ ++ svadrb_offset (s32, s32); /* { dg-error {passing 'svint32_t' to argument 1 of 'svadrb_offset', which expects 'svuint32_t' or 'svuint64_t'} } */ ++ svadrb_offset (f32, s32); /* { dg-error {passing 'svfloat32_t' to argument 1 of 'svadrb_offset', which expects 'svuint32_t' or 'svuint64_t'} } */ ++ svadrb_offset (pg, s32); /* { dg-error {passing 'svbool_t' to argument 1 of 'svadrb_offset', which expects 'svuint32_t' or 'svuint64_t'} } */ ++ ++ svadrb_offset (u32, 0); /* { dg-error {passing 'int' to argument 2 of 'svadrb_offset', which expects an SVE vector type} } */ ++ svadrb_offset (u32, u8); /* { dg-error {passing 'svuint8_t' to argument 2 of 'svadrb_offset', which expects a vector of 32-bit or 64-bit integers} } */ ++ svadrb_offset (u32, u16); /* { dg-error {passing 'svuint16_t' to argument 2 of 'svadrb_offset', which expects a vector of 32-bit or 64-bit integers} } */ ++ svadrb_offset (u32, pg); /* { dg-error {passing 'svbool_t' to argument 2 of 'svadrb_offset', which expects a vector of integers} } */ ++ ++ svadrb_offset (u32, s32); ++ svadrb_offset (u32, u32); ++ svadrb_offset (u32, f32); /* { dg-error {passing 'svfloat32_t' to argument 2 of 'svadrb_offset', which expects a vector of integers} } */ ++ svadrb_offset (u32, s64); /* { dg-error {cannot combine a base of type 'svuint32_t' with an offset of type 'svint64_t'} } */ ++ svadrb_offset (u32, u64); /* { dg-error {cannot combine a base of type 'svuint32_t' with an offset of type 'svuint64_t'} } */ ++ svadrb_offset (u32, f64); /* { dg-error {passing 'svfloat64_t' to argument 2 of 'svadrb_offset', which expects a vector of integers} } */ ++ ++ svadrb_offset (u64, s32); /* { dg-error {cannot combine a base of type 'svuint64_t' with an offset of type 'svint32_t'} } */ ++ svadrb_offset (u64, u32); /* { dg-error {cannot combine a base of type 'svuint64_t' with an offset of type 'svuint32_t'} } */ ++ svadrb_offset (u64, f32); /* { dg-error {passing 'svfloat32_t' to argument 2 of 'svadrb_offset', which expects a vector of 
integers} } */ ++ svadrb_offset (u64, s64); ++ svadrb_offset (u64, u64); ++ svadrb_offset (u64, f64); /* { dg-error {passing 'svfloat64_t' to argument 2 of 'svadrb_offset', which expects a vector of integers} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_1.c +new file mode 100644 +index 000000000..8ce89fa10 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_1.c +@@ -0,0 +1,14 @@ ++/* { dg-do compile } */ ++ ++#include ++ ++svuint8_t ++f1 (svbool_t pg, svuint8_t u8, svint16_t s16) ++{ ++ svzip1 (pg); /* { dg-error {too few arguments to function 'svzip1'} } */ ++ svzip1 (pg, u8, u8); /* { dg-error {too many arguments to function 'svzip1'} } */ ++ svzip1 (pg, u8); /* { dg-error {passing 'svuint8_t' to argument 2 of 'svzip1', but previous arguments had type 'svbool_t'} } */ ++ svzip1 (u8, pg); /* { dg-error {passing 'svbool_t' to argument 2 of 'svzip1', but previous arguments had type 'svuint8_t'} } */ ++ svzip1 (u8, s16); /* { dg-error {passing 'svint16_t' to argument 2 of 'svzip1', but previous arguments had type 'svuint8_t'} } */ ++ svzip1 (u8, 0); /* { dg-error {passing 'int' to argument 2 of 'svzip1', which expects an SVE vector type} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_int_opt_n.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_int_opt_n.c +new file mode 100644 +index 000000000..965e9a13c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_int_opt_n.c +@@ -0,0 +1,28 @@ ++/* { dg-do compile } */ ++ ++#include ++ ++void ++f1 (svbool_t pg, svfloat16_t f16, svint16_t s16, svuint16_t u16, ++ svfloat32_t f32, svint32_t s32, svuint32_t u32) ++{ ++ svscale_x (pg, f16); /* { dg-error {too few arguments to function 'svscale_x'} } */ ++ svscale_x (pg, f16, s16, s16); /* { dg-error {too many arguments to function 'svscale_x'} } */ ++ svscale_x (s32, f16, s32); /* { dg-error {passing 'svint32_t' to argument 1 of 'svscale_x', which expects 'svbool_t'} } */ ++ svscale_x (1, f16, s32); /* { dg-error {passing 'int' to argument 1 of 'svscale_x', which expects 'svbool_t'} } */ ++ svscale_x (pg, pg, s16); /* { dg-error {'svscale_x' has no form that takes 'svbool_t' arguments} } */ ++ svscale_x (pg, 1, s16); /* { dg-error {passing 'int' to argument 2 of 'svscale_x', which expects an SVE vector type} } */ ++ svscale_x (pg, f16, s16); ++ svscale_x (pg, f16, u16); /* { dg-error {passing 'svuint16_t' to argument 3 of 'svscale_x', which expects a vector of signed integers} } */ ++ svscale_x (pg, f16, f16); /* { dg-error {passing 'svfloat16_t' to argument 3 of 'svscale_x', which expects a vector of signed integers} } */ ++ svscale_x (pg, f16, s32); /* { dg-error {arguments 2 and 3 of 'svscale_x' must have the same element size, but the values passed here have type 'svfloat16_t' and 'svint32_t' respectively} } */ ++ svscale_x (pg, f16, u32); /* { dg-error {passing 'svuint32_t' to argument 3 of 'svscale_x', which expects a vector of signed integers} } */ ++ svscale_x (pg, f16, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svscale_x', which expects a vector of signed integers} } */ ++ svscale_x (pg, f16, pg); /* { dg-error {passing 'svbool_t' to argument 3 of 'svscale_x', which expects a vector of signed integers} } */ ++ svscale_x (pg, f16, 0); ++ svscale_x (pg, s16, s16); /* { dg-error {'svscale_x' has no form that takes 'svint16_t' arguments} } */ ++ svscale_x (pg, s16, u16); /* { 
dg-error {passing 'svuint16_t' to argument 3 of 'svscale_x', which expects a vector of signed integers} } */ ++ svscale_x (pg, s16, s32); /* { dg-error {'svscale_x' has no form that takes 'svint16_t' arguments} } */ ++ svscale_x (pg, s16, u32); /* { dg-error {passing 'svuint32_t' to argument 3 of 'svscale_x', which expects a vector of signed integers} } */ ++ svscale_x (pg, u16, s16); /* { dg-error {'svscale_x' has no form that takes 'svuint16_t' arguments} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_lane_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_lane_1.c +new file mode 100644 +index 000000000..f1879ca6e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_lane_1.c +@@ -0,0 +1,33 @@ ++/* { dg-do compile } */ ++ ++#include ++ ++void ++f1 (svbool_t pg, svfloat16_t f16, svfloat32_t f32, svfloat64_t f64, ++ svint32_t s32, int i) ++{ ++ svmul_lane (f32, f32); /* { dg-error {too few arguments to function 'svmul_lane'} } */ ++ svmul_lane (f32, f32, 0, 0); /* { dg-error {too many arguments to function 'svmul_lane'} } */ ++ svmul_lane (pg, pg, 0); /* { dg-error {'svmul_lane' has no form that takes 'svbool_t' arguments} } */ ++ svmul_lane (s32, s32, 0); /* { dg-error {'svmul_lane' has no form that takes 'svint32_t' arguments} } */ ++ svmul_lane (1, f32, 0); /* { dg-error {passing 'int' to argument 1 of 'svmul_lane', which expects an SVE vector type} } */ ++ svmul_lane (f32, 1, 0); /* { dg-error {passing 'int' to argument 2 of 'svmul_lane', which expects an SVE vector type} } */ ++ svmul_lane (f32, f64, 0); /* { dg-error {passing 'svfloat64_t' to argument 2 of 'svmul_lane', but previous arguments had type 'svfloat32_t'} } */ ++ svmul_lane (f32, f32, s32); /* { dg-error {argument 3 of 'svmul_lane' must be an integer constant expression} } */ ++ svmul_lane (f32, f32, i); /* { dg-error {argument 3 of 'svmul_lane' must be an integer constant expression} } */ ++ ++ svmul_lane (f16, f16, 0); ++ svmul_lane (f16, f16, 7); ++ svmul_lane (f16, f16, 8); /* { dg-error {passing 8 to argument 3 of 'svmul_lane', which expects a value in the range \[0, 7\]} } */ ++ svmul_lane (f16, f16, -1); /* { dg-error {passing -1 to argument 3 of 'svmul_lane', which expects a value in the range \[0, 7\]} } */ ++ ++ svmul_lane (f32, f32, 0); ++ svmul_lane (f32, f32, 3); ++ svmul_lane (f32, f32, 4); /* { dg-error {passing 4 to argument 3 of 'svmul_lane', which expects a value in the range \[0, 3\]} } */ ++ svmul_lane (f32, f32, -1); /* { dg-error {passing -1 to argument 3 of 'svmul_lane', which expects a value in the range \[0, 3\]} } */ ++ ++ svmul_lane (f64, f64, 0); ++ svmul_lane (f64, f64, 1); ++ svmul_lane (f64, f64, 2); /* { dg-error {passing 2 to argument 3 of 'svmul_lane', which expects a value in the range \[0, 1\]} } */ ++ svmul_lane (f64, f64, -1); /* { dg-error {passing -1 to argument 3 of 'svmul_lane', which expects a value in the range \[0, 1\]} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_n_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_n_1.c +new file mode 100644 +index 000000000..0c69e66a1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_n_1.c +@@ -0,0 +1,19 @@ ++/* { dg-do compile } */ ++ ++#include ++ ++void ++f1 (svbool_t pg, svuint8_t u8, svfloat16_t f16, int i, float f) ++{ ++ svinsr (u8); /* { dg-error {too few arguments to function 'svinsr'} } */ ++ svinsr (u8, 0, 0); /* { dg-error {too many arguments to function 'svinsr'} } */ ++ 
svinsr (0, 0); /* { dg-error {passing 'int' to argument 1 of 'svinsr', which expects an SVE vector type} } */ ++ svinsr (u8, 0); ++ svinsr (u8, -1); ++ svinsr (u8, i); ++ svinsr (u8, f); ++ svinsr (u8, u8); /* { dg-error {passing 'svuint8_t' to argument 2 of 'svinsr', which expects a scalar element} } */ ++ svinsr (pg, 0); /* { dg-error {'svinsr' has no form that takes 'svbool_t' arguments} } */ ++ svinsr (f16, f); ++ svinsr (f16, f16); /* { dg-error {passing 'svfloat16_t' to argument 2 of 'svinsr', which expects a scalar element} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_opt_n_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_opt_n_1.c +new file mode 100644 +index 000000000..29615e5be +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_opt_n_1.c +@@ -0,0 +1,12 @@ ++/* { dg-do compile } */ ++ ++#include <arm_sve.h> ++ ++svuint8_t ++f1 (svbool_t pg, svuint8_t u8, svint8_t s8) ++{ ++ svadd_u8_x (pg, u8, s8); /* { dg-error {incompatible type for argument 3 of 'svadd_u8_x'} } */ ++ svadd_u8_x (pg, u8); /* { dg-error {too few arguments to function 'svadd_u8_x'} } */ ++ svadd_u8_x (pg, u8, u8, u8); /* { dg-error {too many arguments to function 'svadd_u8_x'} } */ ++ return svadd_s8_x (pg, s8, s8); /* { dg-error {incompatible types when returning type 'svint8_t' but 'svuint8_t' was expected} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_opt_n_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_opt_n_2.c +new file mode 100644 +index 000000000..9fa83ca99 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_opt_n_2.c +@@ -0,0 +1,26 @@ ++/* { dg-do compile } */ ++ ++#include <arm_sve.h> ++ ++void ++f1 (svbool_t pg, svint8_t s8, svuint8_t u8, ++ svint16_t s16, svuint16_t u16, svfloat16_t f16) ++{ ++ svadd_x (pg, u8); /* { dg-error {too few arguments to function 'svadd_x'} } */ ++ svadd_x (pg, u8, u8, u8); /* { dg-error {too many arguments to function 'svadd_x'} } */ ++ svadd_x (u8, u8, u8); /* { dg-error {passing 'svuint8_t' to argument 1 of 'svadd_x', which expects 'svbool_t'} } */ ++ svadd_x (pg, pg, pg); /* { dg-error {'svadd_x' has no form that takes 'svbool_t' arguments} } */ ++ svadd_x (pg, 1, u8); /* { dg-error {passing 'int' to argument 2 of 'svadd_x', which expects an SVE vector type} } */ ++ svadd_x (pg, u8, s8); /* { dg-error {passing 'svint8_t' to argument 3 of 'svadd_x', but previous arguments had type 'svuint8_t'} } */ ++ svadd_x (pg, u8, u8); ++ svadd_x (pg, u8, s16); /* { dg-error {passing 'svint16_t' to argument 3 of 'svadd_x', but previous arguments had type 'svuint8_t'} } */ ++ svadd_x (pg, u8, u16); /* { dg-error {passing 'svuint16_t' to argument 3 of 'svadd_x', but previous arguments had type 'svuint8_t'} } */ ++ svadd_x (pg, u8, f16); /* { dg-error {passing 'svfloat16_t' to argument 3 of 'svadd_x', but previous arguments had type 'svuint8_t'} } */ ++ svadd_x (pg, u8, pg); /* { dg-error {passing 'svbool_t' to argument 3 of 'svadd_x', but previous arguments had type 'svuint8_t'} } */ ++ svadd_x (pg, u8, 0); ++ ++ svadd_x (pg, f16, s16); /* { dg-error {passing 'svint16_t' to argument 3 of 'svadd_x', but previous arguments had type 'svfloat16_t'} } */ ++ svadd_x (pg, f16, u16); /* { dg-error {passing 'svuint16_t' to argument 3 of 'svadd_x', but previous arguments had type 'svfloat16_t'} } */ ++ svadd_x (pg, f16, f16); ++ svadd_x (pg, f16, 1); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_opt_n_3.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_opt_n_3.c +new file mode 100644 +index 000000000..4d0b253e3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_opt_n_3.c +@@ -0,0 +1,29 @@ ++/* { dg-do compile } */ ++ ++#include ++ ++void ++f1 (svbool_t pg, svint8_t s8, svuint8_t u8, ++ svint16_t s16, svuint16_t u16, svfloat16_t f16) ++{ ++ svand_z (pg, u8); /* { dg-error {too few arguments to function 'svand_z'} } */ ++ svand_z (pg, u8, u8, u8); /* { dg-error {too many arguments to function 'svand_z'} } */ ++ svand_z (u8, u8, u8); /* { dg-error {passing 'svuint8_t' to argument 1 of 'svand_z', which expects 'svbool_t'} } */ ++ svand_z (pg, pg, pg); ++ svand_z (pg, 1, u8); /* { dg-error {passing 'int' to argument 2 of 'svand_z', which expects an SVE vector type} } */ ++ svand_z (pg, u8, s8); /* { dg-error {passing 'svint8_t' to argument 3 of 'svand_z', but previous arguments had type 'svuint8_t'} } */ ++ svand_z (pg, u8, u8); ++ svand_z (pg, u8, s16); /* { dg-error {passing 'svint16_t' to argument 3 of 'svand_z', but previous arguments had type 'svuint8_t'} } */ ++ svand_z (pg, u8, u16); /* { dg-error {passing 'svuint16_t' to argument 3 of 'svand_z', but previous arguments had type 'svuint8_t'} } */ ++ svand_z (pg, u8, f16); /* { dg-error {passing 'svfloat16_t' to argument 3 of 'svand_z', but previous arguments had type 'svuint8_t'} } */ ++ svand_z (pg, u8, pg); /* { dg-error {passing 'svbool_t' to argument 3 of 'svand_z', but previous arguments had type 'svuint8_t'} } */ ++ svand_z (pg, u8, 0); ++ ++ svand_z (pg, pg, u8); /* { dg-error {passing 'svuint8_t' to argument 3 of 'svand_z', but previous arguments had type 'svbool_t'} } */ ++ svand_z (pg, pg, 0); /* { dg-error {passing 'int' to argument 3 of 'svand_z', but its 'svbool_t' form does not accept scalars} } */ ++ ++ svand_z (pg, f16, s16); /* { dg-error {passing 'svint16_t' to argument 3 of 'svand_z', but previous arguments had type 'svfloat16_t'} } */ ++ svand_z (pg, f16, u16); /* { dg-error {passing 'svuint16_t' to argument 3 of 'svand_z', but previous arguments had type 'svfloat16_t'} } */ ++ svand_z (pg, f16, f16); /* { dg-error {'svand_z' has no form that takes 'svfloat16_t' arguments} } */ ++ svand_z (pg, f16, 1); /* { dg-error {'svand_z' has no form that takes 'svfloat16_t' arguments} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_rotate_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_rotate_1.c +new file mode 100644 +index 000000000..8ffe91bce +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_rotate_1.c +@@ -0,0 +1,24 @@ ++/* { dg-do compile } */ ++ ++#include ++ ++void ++f1 (svbool_t pg, svfloat32_t f32, svfloat64_t f64, svint32_t s32, int i) ++{ ++ svcadd_x (pg, f32, f32); /* { dg-error {too few arguments to function 'svcadd_x'} } */ ++ svcadd_x (pg, f32, f32, 90, 90); /* { dg-error {too many arguments to function 'svcadd_x'} } */ ++ svcadd_x (f32, f32, f32, 90); /* { dg-error {passing 'svfloat32_t' to argument 1 of 'svcadd_x', which expects 'svbool_t'} } */ ++ svcadd_x (pg, pg, pg, 90); /* { dg-error {'svcadd_x' has no form that takes 'svbool_t' arguments} } */ ++ svcadd_x (pg, s32, s32, 90); /* { dg-error {'svcadd_x' has no form that takes 'svint32_t' arguments} } */ ++ svcadd_x (pg, 1, f32, 90); /* { dg-error {passing 'int' to argument 2 of 'svcadd_x', which expects an SVE vector type} } */ ++ svcadd_x (pg, f32, 1, 90); /* { dg-error {passing 'int' to argument 3 of 'svcadd_x', which expects an SVE 
vector type} } */ ++ svcadd_x (pg, f32, f64, 90); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svcadd_x', but previous arguments had type 'svfloat32_t'} } */ ++ svcadd_x (pg, f32, f32, s32); /* { dg-error {argument 4 of 'svcadd_x' must be an integer constant expression} } */ ++ svcadd_x (pg, f32, f32, i); /* { dg-error {argument 4 of 'svcadd_x' must be an integer constant expression} } */ ++ svcadd_x (pg, f32, f32, -90); /* { dg-error {passing -90 to argument 4 of 'svcadd_x', which expects either 90 or 270} } */ ++ svcadd_x (pg, f32, f32, 0); /* { dg-error {passing 0 to argument 4 of 'svcadd_x', which expects either 90 or 270} } */ ++ svcadd_x (pg, f32, f32, 1); /* { dg-error {passing 1 to argument 4 of 'svcadd_x', which expects either 90 or 270} } */ ++ svcadd_x (pg, f32, f32, 90); ++ svcadd_x (pg, f32, f32, 180); /* { dg-error {passing 180 to argument 4 of 'svcadd_x', which expects either 90 or 270} } */ ++ svcadd_x (pg, f32, f32, 270); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_uint64_n_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_uint64_n_1.c +new file mode 100644 +index 000000000..c8ca5f746 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_uint64_n_1.c +@@ -0,0 +1,17 @@ ++/* { dg-do compile } */ ++ ++#include ++ ++void ++f1 (svbool_t pg, svuint8_t u8, int i, float f) ++{ ++ svdupq_lane (u8); /* { dg-error {too few arguments to function 'svdupq_lane'} } */ ++ svdupq_lane (u8, 0, 0); /* { dg-error {too many arguments to function 'svdupq_lane'} } */ ++ svdupq_lane (0, 0); /* { dg-error {passing 'int' to argument 1 of 'svdupq_lane', which expects an SVE vector type} } */ ++ svdupq_lane (u8, 0); ++ svdupq_lane (u8, -1); ++ svdupq_lane (u8, i); ++ svdupq_lane (u8, f); ++ svdupq_lane (u8, u8); /* { dg-error {passing 'svuint8_t' to argument 2 of 'svdupq_lane', which expects 'uint64_t'} } */ ++ svdupq_lane (pg, 0); /* { dg-error {'svdupq_lane' has no form that takes 'svbool_t' arguments} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_uint64_opt_n_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_uint64_opt_n_1.c +new file mode 100644 +index 000000000..27726a80f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_uint64_opt_n_1.c +@@ -0,0 +1,12 @@ ++/* { dg-do compile } */ ++ ++#include ++ ++svuint8_t ++f1 (svbool_t pg, svuint8_t u8, svint8_t s8, svuint64_t u64) ++{ ++ svlsl_wide_u8_x (pg, u8, u8); /* { dg-error {incompatible type for argument 3 of 'svlsl_wide_u8_x'} } */ ++ svlsl_wide_u8_x (pg, u8); /* { dg-error {too few arguments to function 'svlsl_wide_u8_x'} } */ ++ svlsl_wide_u8_x (pg, u8, u64, u8); /* { dg-error {too many arguments to function 'svlsl_wide_u8_x'} } */ ++ return svlsl_wide_s8_x (pg, s8, u64); /* { dg-error {incompatible types when returning type 'svint8_t' but 'svuint8_t' was expected} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_uint64_opt_n_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_uint64_opt_n_2.c +new file mode 100644 +index 000000000..be217394f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_uint64_opt_n_2.c +@@ -0,0 +1,14 @@ ++/* { dg-do compile } */ ++ ++#include ++ ++void ++f1 (svbool_t pg, svuint8_t u8, svuint64_t u64) ++{ ++ svlsl_wide_x (pg, u8); /* { dg-error {too few arguments to function 'svlsl_wide_x'} } */ ++ svlsl_wide_x (pg, u8, u8, u8); /* { dg-error {too many arguments to function 
'svlsl_wide_x'} } */ ++ svlsl_wide_x (u8, u8, u64); /* { dg-error {passing 'svuint8_t' to argument 1 of 'svlsl_wide_x', which expects 'svbool_t'} } */ ++ svlsl_wide_x (pg, 1, u64); /* { dg-error {passing 'int' to argument 2 of 'svlsl_wide_x', which expects an SVE vector type} } */ ++ svlsl_wide_x (pg, u8, u8); /* { dg-error {passing 'svuint8_t' to argument 3 of 'svlsl_wide_x', which expects 'svuint64_t'} } */ ++ svlsl_wide_x (pg, u64, u64); /* { dg-error {'svlsl_wide_x' has no form that takes 'svuint64_t' arguments} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_uint_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_uint_1.c +new file mode 100644 +index 000000000..8f86c50b6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_uint_1.c +@@ -0,0 +1,44 @@ ++/* { dg-do compile } */ ++ ++#include ++ ++void ++f1 (svbool_t pg, svuint8_t u8, svint8_t s8, svuint16_t u16, svint16_t s16, ++ svfloat16_t f16) ++{ ++ svtbl (u8); /* { dg-error {too few arguments to function 'svtbl'} } */ ++ svtbl (u8, u8, u8); /* { dg-error {too many arguments to function 'svtbl'} } */ ++ svtbl (pg, pg); /* { dg-error {passing 'svbool_t' to argument 2 of 'svtbl', which expects a vector of unsigned integers} } */ ++ svtbl (pg, u8); /* { dg-error {'svtbl' has no form that takes 'svbool_t' arguments} } */ ++ ++ svtbl (u8, 0); /* { dg-error {passing 'int' to argument 2 of 'svtbl', which expects an SVE vector type} } */ ++ svtbl (u8, u8); ++ svtbl (u8, s8); /* { dg-error {passing 'svint8_t' to argument 2 of 'svtbl', which expects a vector of unsigned integers} } */ ++ svtbl (u8, u16); /* { dg-error {arguments 1 and 2 of 'svtbl' must have the same element size, but the values passed here have type 'svuint8_t' and 'svuint16_t' respectively} } */ ++ svtbl (u8, s16); /* { dg-error {passing 'svint16_t' to argument 2 of 'svtbl', which expects a vector of unsigned integers} } */ ++ svtbl (u8, pg); /* { dg-error {passing 'svbool_t' to argument 2 of 'svtbl', which expects a vector of unsigned integers} } */ ++ ++ svtbl (s8, u8); ++ svtbl (s8, s8); /* { dg-error {passing 'svint8_t' to argument 2 of 'svtbl', which expects a vector of unsigned integers} } */ ++ svtbl (s8, u16); /* { dg-error {arguments 1 and 2 of 'svtbl' must have the same element size, but the values passed here have type 'svint8_t' and 'svuint16_t' respectively} } */ ++ svtbl (s8, s16); /* { dg-error {passing 'svint16_t' to argument 2 of 'svtbl', which expects a vector of unsigned integers} } */ ++ svtbl (s8, pg); /* { dg-error {passing 'svbool_t' to argument 2 of 'svtbl', which expects a vector of unsigned integers} } */ ++ ++ svtbl (u16, u8); /* { dg-error {arguments 1 and 2 of 'svtbl' must have the same element size, but the values passed here have type 'svuint16_t' and 'svuint8_t' respectively} } */ ++ svtbl (u16, s8); /* { dg-error {passing 'svint8_t' to argument 2 of 'svtbl', which expects a vector of unsigned integers} } */ ++ svtbl (u16, u16); ++ svtbl (u16, s16); /* { dg-error {passing 'svint16_t' to argument 2 of 'svtbl', which expects a vector of unsigned integers} } */ ++ svtbl (u16, f16); /* { dg-error {passing 'svfloat16_t' to argument 2 of 'svtbl', which expects a vector of unsigned integers} } */ ++ ++ svtbl (s16, u8); /* { dg-error {arguments 1 and 2 of 'svtbl' must have the same element size, but the values passed here have type 'svint16_t' and 'svuint8_t' respectively} } */ ++ svtbl (s16, s8); /* { dg-error {passing 'svint8_t' to argument 2 of 'svtbl', which expects a 
vector of unsigned integers} } */ ++ svtbl (s16, u16); ++ svtbl (s16, s16); /* { dg-error {passing 'svint16_t' to argument 2 of 'svtbl', which expects a vector of unsigned integers} } */ ++ svtbl (s16, f16); /* { dg-error {passing 'svfloat16_t' to argument 2 of 'svtbl', which expects a vector of unsigned integers} } */ ++ ++ svtbl (f16, u8); /* { dg-error {arguments 1 and 2 of 'svtbl' must have the same element size, but the values passed here have type 'svfloat16_t' and 'svuint8_t' respectively} } */ ++ svtbl (f16, s8); /* { dg-error {passing 'svint8_t' to argument 2 of 'svtbl', which expects a vector of unsigned integers} } */ ++ svtbl (f16, u16); ++ svtbl (f16, s16); /* { dg-error {passing 'svint16_t' to argument 2 of 'svtbl', which expects a vector of unsigned integers} } */ ++ svtbl (f16, f16); /* { dg-error {passing 'svfloat16_t' to argument 2 of 'svtbl', which expects a vector of unsigned integers} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_uint_n_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_uint_n_1.c +new file mode 100644 +index 000000000..36a902e69 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_uint_n_1.c +@@ -0,0 +1,17 @@ ++/* { dg-do compile } */ ++ ++#include ++ ++void ++f1 (svbool_t pg, svuint8_t u8, int i, float f) ++{ ++ svdup_lane (u8); /* { dg-error {too few arguments to function 'svdup_lane'} } */ ++ svdup_lane (u8, 0, 0); /* { dg-error {too many arguments to function 'svdup_lane'} } */ ++ svdup_lane (0, 0); /* { dg-error {passing 'int' to argument 1 of 'svdup_lane', which expects an SVE vector type} } */ ++ svdup_lane (u8, 0); ++ svdup_lane (u8, -1); ++ svdup_lane (u8, i); ++ svdup_lane (u8, f); ++ svdup_lane (u8, u8); /* { dg-error {passing 'svuint8_t' to argument 2 of 'svdup_lane', which expects a scalar integer} } */ ++ svdup_lane (pg, 0); /* { dg-error {'svdup_lane' has no form that takes 'svbool_t' arguments} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_uint_opt_n_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_uint_opt_n_1.c +new file mode 100644 +index 000000000..b162ab405 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_uint_opt_n_1.c +@@ -0,0 +1,27 @@ ++/* { dg-do compile } */ ++ ++#include ++ ++void ++f1 (svbool_t pg, svfloat16_t f16, svint16_t s16, svuint16_t u16, ++ svfloat32_t f32, svint32_t s32, svuint32_t u32) ++{ ++ svlsl_x (pg, s16); /* { dg-error {too few arguments to function 'svlsl_x'} } */ ++ svlsl_x (pg, s16, u16, u16); /* { dg-error {too many arguments to function 'svlsl_x'} } */ ++ svlsl_x (s32, s32, u32); /* { dg-error {passing 'svint32_t' to argument 1 of 'svlsl_x', which expects 'svbool_t'} } */ ++ svlsl_x (1, s32, u32); /* { dg-error {passing 'int' to argument 1 of 'svlsl_x', which expects 'svbool_t'} } */ ++ svlsl_x (pg, pg, u16); /* { dg-error {'svlsl_x' has no form that takes 'svbool_t' arguments} } */ ++ svlsl_x (pg, 1, s16); /* { dg-error {passing 'int' to argument 2 of 'svlsl_x', which expects an SVE vector type} } */ ++ svlsl_x (pg, s16, s16); /* { dg-error {passing 'svint16_t' to argument 3 of 'svlsl_x', which expects a vector of unsigned integers} } */ ++ svlsl_x (pg, s16, u16); ++ svlsl_x (pg, s16, f16); /* { dg-error {passing 'svfloat16_t' to argument 3 of 'svlsl_x', which expects a vector of unsigned integers} } */ ++ svlsl_x (pg, s16, s32); /* { dg-error {passing 'svint32_t' to argument 3 of 'svlsl_x', which expects a vector of unsigned integers} } */ ++ 
svlsl_x (pg, s16, u32); /* { dg-error {arguments 2 and 3 of 'svlsl_x' must have the same element size, but the values passed here have type 'svint16_t' and 'svuint32_t' respectively} } */ ++ svlsl_x (pg, s16, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svlsl_x', which expects a vector of unsigned integers} } */ ++ svlsl_x (pg, s16, pg); /* { dg-error {passing 'svbool_t' to argument 3 of 'svlsl_x', which expects a vector of unsigned integers} } */ ++ svlsl_x (pg, s16, 0); ++ svlsl_x (pg, f16, s16); /* { dg-error {passing 'svint16_t' to argument 3 of 'svlsl_x', which expects a vector of unsigned integers} } */ ++ svlsl_x (pg, f16, u16); /* { dg-error {'svlsl_x' has no form that takes 'svfloat16_t' arguments} } */ ++ svlsl_x (pg, f16, s32); /* { dg-error {passing 'svint32_t' to argument 3 of 'svlsl_x', which expects a vector of unsigned integers} } */ ++ svlsl_x (pg, f16, u32); /* { dg-error {'svlsl_x' has no form that takes 'svfloat16_t' arguments} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/clast_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/clast_1.c +new file mode 100644 +index 000000000..cb9ac946c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/clast_1.c +@@ -0,0 +1,15 @@ ++#include <arm_sve.h> ++ ++void ++test (svbool_t pg, svint32_t s32, svint64_t s64, int i) ++{ ++ svclasta (pg, 1); /* { dg-error {too few arguments to function 'svclasta'} } */ ++ svclasta (pg, 1, s32, 1); /* { dg-error {too many arguments to function 'svclasta'} } */ ++ svclasta (1, 1, s32); /* { dg-error {passing 'int' to argument 1 of 'svclasta', which expects 'svbool_t'} } */ ++ svclasta (pg, 1, 1); /* { dg-error {passing 'int' to argument 3 of 'svclasta', which expects an SVE vector type} } */ ++ svclasta (pg, 1, pg); /* { dg-error {'svclasta' has no form that takes 'svbool_t' arguments} } */ ++ svclasta (pg, i, s32); ++ svclasta (pg, s32, 1); /* { dg-error {passing 'int' to argument 3 of 'svclasta', which expects an SVE vector type} } */ ++ svclasta (pg, s32, s64); /* { dg-error {passing 'svint64_t' to argument 3 of 'svclasta', but previous arguments had type 'svint32_t'} } */ ++ svclasta (pg, pg, pg); /* { dg-error {'svclasta' has no form that takes 'svbool_t' arguments} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/compare_opt_n_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/compare_opt_n_1.c +new file mode 100644 +index 000000000..71c8e86d5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/compare_opt_n_1.c +@@ -0,0 +1,26 @@ ++/* { dg-do compile } */ ++ ++#include <arm_sve.h> ++ ++void ++f1 (svbool_t pg, svint8_t s8, svuint8_t u8, ++ svint16_t s16, svuint16_t u16, svfloat16_t f16) ++{ ++ svcmpeq (pg, u8); /* { dg-error {too few arguments to function 'svcmpeq'} } */ ++ svcmpeq (pg, u8, u8, u8); /* { dg-error {too many arguments to function 'svcmpeq'} } */ ++ svcmpeq (u8, u8, u8); /* { dg-error {passing 'svuint8_t' to argument 1 of 'svcmpeq', which expects 'svbool_t'} } */ ++ svcmpeq (pg, pg, pg); /* { dg-error {'svcmpeq' has no form that takes 'svbool_t' arguments} } */ ++ svcmpeq (pg, 1, u8); /* { dg-error {passing 'int' to argument 2 of 'svcmpeq', which expects an SVE vector type} } */ ++ svcmpeq (pg, u8, s8); /* { dg-error {passing 'svint8_t' to argument 3 of 'svcmpeq', but previous arguments had type 'svuint8_t'} } */ ++ svcmpeq (pg, u8, u8); ++ svcmpeq (pg, u8, s16); /* { dg-error {passing 'svint16_t' to argument 3 of 'svcmpeq', but previous arguments had type 'svuint8_t'} } */ ++ 
svcmpeq (pg, u8, u16); /* { dg-error {passing 'svuint16_t' to argument 3 of 'svcmpeq', but previous arguments had type 'svuint8_t'} } */ ++ svcmpeq (pg, u8, f16); /* { dg-error {passing 'svfloat16_t' to argument 3 of 'svcmpeq', but previous arguments had type 'svuint8_t'} } */ ++ svcmpeq (pg, u8, pg); /* { dg-error {passing 'svbool_t' to argument 3 of 'svcmpeq', but previous arguments had type 'svuint8_t'} } */ ++ svcmpeq (pg, u8, 0); ++ ++ svcmpeq (pg, f16, s16); /* { dg-error {passing 'svint16_t' to argument 3 of 'svcmpeq', but previous arguments had type 'svfloat16_t'} } */ ++ svcmpeq (pg, f16, u16); /* { dg-error {passing 'svuint16_t' to argument 3 of 'svcmpeq', but previous arguments had type 'svfloat16_t'} } */ ++ svcmpeq (pg, f16, f16); ++ svcmpeq (pg, f16, 1); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/compare_scalar_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/compare_scalar_1.c +new file mode 100644 +index 000000000..d5a60f841 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/compare_scalar_1.c +@@ -0,0 +1,85 @@ ++/* { dg-do compile } */ ++ ++#include <arm_sve.h> ++#include <stdbool.h> ++ ++enum signed_enum { SA = -1, SB }; ++enum unsigned_enum { UA, UB }; ++ ++void ++test (int8_t s8, int16_t s16, int32_t s32, int64_t s64, ++ uint8_t u8, uint16_t u16, uint32_t u32, uint64_t u64, ++ bool b, enum signed_enum se, enum unsigned_enum ue, ++ int *ptr, float f32, svbool_t pg, svint32_t vec) ++{ ++ svwhilele_b8 (s32); /* { dg-error {too few arguments to function 'svwhilele_b8'} } */ ++ svwhilele_b8 (s32, s32, s32); /* { dg-error {too many arguments to function 'svwhilele_b8'} } */ ++ ++ svwhilele_b8 (b, b); ++ svwhilele_b8 (se, se); ++ svwhilele_b8 (ue, ue); ++ svwhilele_b8 (s8, s8); ++ svwhilele_b8 (u8, u8); ++ svwhilele_b8 (s16, s16); ++ svwhilele_b8 (u16, u16); ++ svwhilele_b8 (ptr, ptr); /* { dg-error {passing 'int \*' to argument 1 of 'svwhilele_b8', which expects a 32-bit or 64-bit integer type} } */ ++ svwhilele_b8 (f32, f32); /* { dg-error {passing 'float' to argument 1 of 'svwhilele_b8', which expects a 32-bit or 64-bit integer type} } */ ++ svwhilele_b8 (pg, pg); /* { dg-error {passing 'svbool_t' to argument 1 of 'svwhilele_b8', which expects a 32-bit or 64-bit integer type} } */ ++ svwhilele_b8 (vec, vec); /* { dg-error {passing 'svint32_t' to argument 1 of 'svwhilele_b8', which expects a 32-bit or 64-bit integer type} } */ ++ ++ svwhilele_b8 (s32, b); ++ svwhilele_b8 (s32, se); ++ svwhilele_b8 (s32, ue); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'int32_t' but argument 2 has type 'uint32_t'} } */ ++ svwhilele_b8 (s32, s8); ++ svwhilele_b8 (s32, u8); ++ svwhilele_b8 (s32, s16); ++ svwhilele_b8 (s32, u16); ++ ++ svwhilele_b8 (u32, b); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'uint32_t' but argument 2 has type 'int32_t'} } */ ++ svwhilele_b8 (u32, se); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'uint32_t' but argument 2 has type 'int32_t'} } */ ++ svwhilele_b8 (u32, ue); ++ svwhilele_b8 (u32, s8); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'uint32_t' but argument 2 has type 'int32_t'} } */ ++ svwhilele_b8 (u32, u8); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'uint32_t' but argument 2 has type 'int32_t'} } */ ++ svwhilele_b8 (u32, s16); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'uint32_t' but argument 2 has type 'int32_t'} } */ ++ svwhilele_b8 (u32, u16); /* { 
dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'uint32_t' but argument 2 has type 'int32_t'} } */ ++ ++ svwhilele_b8 (s32, s32); ++ svwhilele_b8 (s32, u32); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'int32_t' but argument 2 has type 'uint32_t'} } */ ++ svwhilele_b8 (s32, s64); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'int32_t' but argument 2 has type 'int64_t'} } */ ++ svwhilele_b8 (s32, u64); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'int32_t' but argument 2 has type 'uint64_t'} } */ ++ ++ svwhilele_b8 (u32, s32); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'uint32_t' but argument 2 has type 'int32_t'} } */ ++ svwhilele_b8 (u32, u32); ++ svwhilele_b8 (u32, s64); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'uint32_t' but argument 2 has type 'int64_t'} } */ ++ svwhilele_b8 (u32, u64); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'uint32_t' but argument 2 has type 'uint64_t'} } */ ++ ++ svwhilele_b8 (s64, s32); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'int64_t' but argument 2 has type 'int32_t'} } */ ++ svwhilele_b8 (s64, u32); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'int64_t' but argument 2 has type 'uint32_t'} } */ ++ svwhilele_b8 (s64, s64); ++ svwhilele_b8 (s64, u64); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'int64_t' but argument 2 has type 'uint64_t'} } */ ++ ++ svwhilele_b8 (u64, s32); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'uint64_t' but argument 2 has type 'int32_t'} } */ ++ svwhilele_b8 (u64, u32); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'uint64_t' but argument 2 has type 'uint32_t'} } */ ++ svwhilele_b8 (u64, s64); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'uint64_t' but argument 2 has type 'int64_t'} } */ ++ svwhilele_b8 (u64, u64); ++ ++ svwhilele_b8 (0, s32); ++ svwhilele_b8 (0, u32); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'int32_t' but argument 2 has type 'uint32_t'} } */ ++ svwhilele_b8 (0, s64); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'int32_t' but argument 2 has type 'int64_t'} } */ ++ svwhilele_b8 (0, u64); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'int32_t' but argument 2 has type 'uint64_t'} } */ ++ ++ svwhilele_b8 (s32, 0); ++ svwhilele_b8 (u32, 0); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'uint32_t' but argument 2 has type 'int32_t'} } */ ++ svwhilele_b8 (s64, 0); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'int64_t' but argument 2 has type 'int32_t'} } */ ++ svwhilele_b8 (u64, 0); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'uint64_t' but argument 2 has type 'int32_t'} } */ ++ ++ svwhilele_b8 (0U, s32); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'uint32_t' but argument 2 has type 'int32_t'} } */ ++ svwhilele_b8 (0U, u32); ++ svwhilele_b8 (0U, s64); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'uint32_t' but argument 2 has type 'int64_t'} } */ ++ svwhilele_b8 (0U, u64); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'uint32_t' but argument 2 has type 'uint64_t'} } */ ++ ++ svwhilele_b8 (s32, 0U); /* { dg-error {call to 'svwhilele_b8' 
is ambiguous; argument 1 has type 'int32_t' but argument 2 has type 'uint32_t'} } */ ++ svwhilele_b8 (u32, 0U); ++ svwhilele_b8 (s64, 0U); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'int64_t' but argument 2 has type 'uint32_t'} } */ ++ svwhilele_b8 (u64, 0U); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'uint64_t' but argument 2 has type 'uint32_t'} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/compare_wide_opt_n_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/compare_wide_opt_n_1.c +new file mode 100644 +index 000000000..fc5e45663 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/compare_wide_opt_n_1.c +@@ -0,0 +1,26 @@ ++/* { dg-do compile } */ ++ ++#include ++ ++svuint8_t ++f1 (svbool_t pg, svuint8_t u8, svint8_t s8, svint64_t s64, svuint64_t u64, ++ svfloat32_t f32, svfloat64_t f64, unsigned int x) ++{ ++ svcmpeq_wide (pg, s8); /* { dg-error {too few arguments to function 'svcmpeq_wide'} } */ ++ svcmpeq_wide (pg, s8, s64, s8); /* { dg-error {too many arguments to function 'svcmpeq_wide'} } */ ++ svcmpeq_wide (s8, s8, s64); /* { dg-error {passing 'svint8_t' to argument 1 of 'svcmpeq_wide', which expects 'svbool_t'} } */ ++ svcmpeq_wide (pg, 0, s64); /* { dg-error {passing 'int' to argument 2 of 'svcmpeq_wide', which expects an SVE vector type} } */ ++ svcmpeq_wide (pg, s8, 0); ++ svcmpeq_wide (pg, s8, x); ++ svcmpeq_wide (pg, s8, s8); /* { dg-error {passing 'svint8_t' to argument 3 of 'svcmpeq_wide', which expects a vector of 64-bit elements} } */ ++ svcmpeq_wide (pg, s8, u8); /* { dg-error {passing 'svuint8_t' to argument 3 of 'svcmpeq_wide', which expects a vector of 64-bit elements} } */ ++ svcmpeq_wide (pg, s8, s64); ++ svcmpeq_wide (pg, s8, u64); /* { dg-error {arguments 2 and 3 of 'svcmpeq_wide' must have the same signedness, but the values passed here have type 'svint8_t' and 'svuint64_t' respectively} } */ ++ svcmpeq_wide (pg, u8, s8); /* { dg-error {passing 'svint8_t' to argument 3 of 'svcmpeq_wide', which expects a vector of 64-bit elements} } */ ++ svcmpeq_wide (pg, u8, u64); /* { dg-error {'svcmpeq_wide' has no form that takes 'svuint8_t' arguments} } */ ++ svcmpeq_wide (pg, s64, s64); /* { dg-error {'svcmpeq_wide' has no form that takes 'svint64_t' arguments} } */ ++ svcmpeq_wide (pg, f32, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svcmpeq_wide', which expects a vector of 64-bit elements} } */ ++ svcmpeq_wide (pg, f32, f64); /* { dg-error {'svcmpeq_wide' has no form that takes 'svfloat32_t' arguments} } */ ++ svcmpeq_wide (pg, f64, f64); /* { dg-error {'svcmpeq_wide' has no form that takes 'svfloat64_t' arguments} } */ ++ svcmpeq_wide (pg, pg, pg); /* { dg-error {passing 'svbool_t' to argument 3 of 'svcmpeq_wide', which expects a vector of 64-bit elements} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/count_pat_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/count_pat_1.c +new file mode 100644 +index 000000000..8dd76a553 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/count_pat_1.c +@@ -0,0 +1,42 @@ ++#include ++ ++void ++test (enum svpattern pat, int i) ++{ ++ svcntb_pat (pat); /* { dg-error {argument 1 of 'svcntb_pat' must be an integer constant expression} } */ ++ svcntb_pat (i); /* { dg-error {argument 1 of 'svcntb_pat' must be an integer constant expression} } */ ++ svcntb_pat ((enum svpattern) -1); /* { dg-error {passing 4294967295 to argument 1 of 'svcntb_pat', which 
expects a valid 'enum svpattern' value} } */ ++ svcntb_pat ((enum svpattern) 0); ++ svcntb_pat ((enum svpattern) 1); ++ svcntb_pat ((enum svpattern) 2); ++ svcntb_pat ((enum svpattern) 3); ++ svcntb_pat ((enum svpattern) 4); ++ svcntb_pat ((enum svpattern) 5); ++ svcntb_pat ((enum svpattern) 6); ++ svcntb_pat ((enum svpattern) 7); ++ svcntb_pat ((enum svpattern) 8); ++ svcntb_pat ((enum svpattern) 9); ++ svcntb_pat ((enum svpattern) 10); ++ svcntb_pat ((enum svpattern) 11); ++ svcntb_pat ((enum svpattern) 12); ++ svcntb_pat ((enum svpattern) 13); ++ svcntb_pat ((enum svpattern) 14); /* { dg-error {passing 14 to argument 1 of 'svcntb_pat', which expects a valid 'enum svpattern' value} } */ ++ svcntb_pat ((enum svpattern) 15); /* { dg-error {passing 15 to argument 1 of 'svcntb_pat', which expects a valid 'enum svpattern' value} } */ ++ svcntb_pat ((enum svpattern) 16); /* { dg-error {passing 16 to argument 1 of 'svcntb_pat', which expects a valid 'enum svpattern' value} } */ ++ svcntb_pat ((enum svpattern) 17); /* { dg-error {passing 17 to argument 1 of 'svcntb_pat', which expects a valid 'enum svpattern' value} } */ ++ svcntb_pat ((enum svpattern) 18); /* { dg-error {passing 18 to argument 1 of 'svcntb_pat', which expects a valid 'enum svpattern' value} } */ ++ svcntb_pat ((enum svpattern) 19); /* { dg-error {passing 19 to argument 1 of 'svcntb_pat', which expects a valid 'enum svpattern' value} } */ ++ svcntb_pat ((enum svpattern) 20); /* { dg-error {passing 20 to argument 1 of 'svcntb_pat', which expects a valid 'enum svpattern' value} } */ ++ svcntb_pat ((enum svpattern) 21); /* { dg-error {passing 21 to argument 1 of 'svcntb_pat', which expects a valid 'enum svpattern' value} } */ ++ svcntb_pat ((enum svpattern) 22); /* { dg-error {passing 22 to argument 1 of 'svcntb_pat', which expects a valid 'enum svpattern' value} } */ ++ svcntb_pat ((enum svpattern) 23); /* { dg-error {passing 23 to argument 1 of 'svcntb_pat', which expects a valid 'enum svpattern' value} } */ ++ svcntb_pat ((enum svpattern) 24); /* { dg-error {passing 24 to argument 1 of 'svcntb_pat', which expects a valid 'enum svpattern' value} } */ ++ svcntb_pat ((enum svpattern) 25); /* { dg-error {passing 25 to argument 1 of 'svcntb_pat', which expects a valid 'enum svpattern' value} } */ ++ svcntb_pat ((enum svpattern) 26); /* { dg-error {passing 26 to argument 1 of 'svcntb_pat', which expects a valid 'enum svpattern' value} } */ ++ svcntb_pat ((enum svpattern) 27); /* { dg-error {passing 27 to argument 1 of 'svcntb_pat', which expects a valid 'enum svpattern' value} } */ ++ svcntb_pat ((enum svpattern) 28); /* { dg-error {passing 28 to argument 1 of 'svcntb_pat', which expects a valid 'enum svpattern' value} } */ ++ svcntb_pat ((enum svpattern) 29); ++ svcntb_pat ((enum svpattern) 30); ++ svcntb_pat ((enum svpattern) 31); ++ svcntb_pat ((enum svpattern) 32); /* { dg-error {passing 32 to argument 1 of 'svcntb_pat', which expects a valid 'enum svpattern' value} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/count_vector_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/count_vector_1.c +new file mode 100644 +index 000000000..daf9e0d5b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/count_vector_1.c +@@ -0,0 +1,13 @@ ++/* { dg-do compile } */ ++ ++#include ++ ++void ++f1 (svbool_t pg, svuint32_t u32, svuint32x2_t u32x2) ++{ ++ svlen (); /* { dg-error {too few arguments to function 'svlen'} } */ ++ svlen (u32, u32); /* { dg-error {too many arguments to function 
'svlen'} } */ ++ svlen (0); /* { dg-error {passing 'int' to argument 1 of 'svlen', which expects an SVE vector type} } */ ++ svlen (pg); /* { dg-error {'svlen' has no form that takes 'svbool_t' arguments} } */ ++ svlen (u32x2); /* { dg-error {passing 'svuint32x2_t' to argument 1 of 'svlen', which expects a single SVE vector rather than a tuple} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/create_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/create_1.c +new file mode 100644 +index 000000000..31321a046 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/create_1.c +@@ -0,0 +1,21 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-std=c99 -Wall -Wextra" } */ ++ ++#include ++ ++void ++f1 (svuint8x2_t *ptr, svbool_t pg, svuint8_t u8, svfloat64_t f64, ++ svuint8x2_t u8x2, int x) ++{ ++ *ptr = svcreate2 (u8); /* { dg-error {too few arguments to function 'svcreate2'} } */ ++ *ptr = svcreate2 (u8, u8, u8); /* { dg-error {too many arguments to function 'svcreate2'} } */ ++ *ptr = svcreate2 (u8x2, u8x2); /* { dg-error {passing 'svuint8x2_t' to argument 1 of 'svcreate2', which expects a single SVE vector rather than a tuple} } */ ++ *ptr = svcreate2 (u8, f64); /* { dg-error {passing 'svfloat64_t' to argument 2 of 'svcreate2', but previous arguments had type 'svuint8_t'} } */ ++ *ptr = svcreate2 (u8, pg); /* { dg-error {passing 'svbool_t' to argument 2 of 'svcreate2', but previous arguments had type 'svuint8_t'} } */ ++ *ptr = svcreate2 (u8, x); /* { dg-error {passing 'int' to argument 2 of 'svcreate2', which expects an SVE vector type} } */ ++ *ptr = svcreate2 (x, u8); /* { dg-error {passing 'int' to argument 1 of 'svcreate2', which expects an SVE vector type} } */ ++ *ptr = svcreate2 (pg, u8); /* { dg-error {passing 'svuint8_t' to argument 2 of 'svcreate2', but previous arguments had type 'svbool_t'} } */ ++ *ptr = svcreate2 (pg, pg); /* { dg-error {'svcreate2' has no form that takes 'svbool_t' arguments} } */ ++ *ptr = svcreate2 (u8, u8); ++ *ptr = svcreate2 (f64, f64); /* { dg-error {incompatible types when assigning to type 'svuint8x2_t' from type 'svfloat64x2_t'} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/create_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/create_2.c +new file mode 100644 +index 000000000..28ad16c2d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/create_2.c +@@ -0,0 +1,23 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-std=c99 -Wall -Wextra" } */ ++ ++#include ++ ++void ++f1 (svuint8x2_t *ptr, svbool_t pg, svuint8_t u8, svfloat64_t f64, ++ svuint8x2_t u8x2, int x) ++{ ++ *ptr = svcreate2_u8 (u8); /* { dg-error {too few arguments to function 'svcreate2_u8'} } */ ++ *ptr = svcreate2_u8 (u8, u8, u8); /* { dg-error {too many arguments to function 'svcreate2_u8'} } */ ++ *ptr = svcreate2_u8 (u8x2, u8x2); /* { dg-error {incompatible type for argument 1 of 'svcreate2_u8'} } */ ++ /* { dg-error {incompatible type for argument 2 of 'svcreate2_u8'} "" { target *-*-* } .-1 } */ ++ *ptr = svcreate2_u8 (u8, f64); /* { dg-error {incompatible type for argument 2 of 'svcreate2_u8'} } */ ++ *ptr = svcreate2_u8 (u8, pg); /* { dg-error {incompatible type for argument 2 of 'svcreate2_u8'} } */ ++ *ptr = svcreate2_u8 (u8, x); /* { dg-error {incompatible type for argument 2 of 'svcreate2_u8'} } */ ++ *ptr = svcreate2_u8 (x, u8); /* { dg-error {incompatible type for argument 1 of 'svcreate2_u8'} } */ ++ *ptr = svcreate2_u8 (pg, u8); /* { dg-error 
{incompatible type for argument 1 of 'svcreate2_u8'} } */ ++ *ptr = svcreate2_u8 (pg, pg); /* { dg-error {incompatible type for argument 1 of 'svcreate2_u8'} } */ ++ /* { dg-error {incompatible type for argument 2 of 'svcreate2_u8'} "" { target *-*-* } .-1 } */ ++ *ptr = svcreate2_u8 (u8, u8); ++ *ptr = svcreate2_f64 (f64, f64); /* { dg-error {incompatible types when assigning to type 'svuint8x2_t' from type 'svfloat64x2_t'} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/create_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/create_3.c +new file mode 100644 +index 000000000..a88e56b31 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/create_3.c +@@ -0,0 +1,22 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-std=c99 -Wall -Wextra" } */ ++ ++#include ++ ++void ++f1 (svfloat16x3_t *ptr, svbool_t pg, svfloat16_t f16, svfloat64_t f64, ++ svfloat16x3_t f16x3, int x) ++{ ++ *ptr = svcreate3 (f16); /* { dg-error {too few arguments to function 'svcreate3'} } */ ++ *ptr = svcreate3 (f16, f16); /* { dg-error {too few arguments to function 'svcreate3'} } */ ++ *ptr = svcreate3 (f16, f16, f16, f16); /* { dg-error {too many arguments to function 'svcreate3'} } */ ++ *ptr = svcreate3 (f16x3, f16x3, f16x3); /* { dg-error {passing 'svfloat16x3_t' to argument 1 of 'svcreate3', which expects a single SVE vector rather than a tuple} } */ ++ *ptr = svcreate3 (f16, f16, f64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svcreate3', but previous arguments had type 'svfloat16_t'} } */ ++ *ptr = svcreate3 (f16, pg, f16); /* { dg-error {passing 'svbool_t' to argument 2 of 'svcreate3', but previous arguments had type 'svfloat16_t'} } */ ++ *ptr = svcreate3 (f16, x, f16); /* { dg-error {passing 'int' to argument 2 of 'svcreate3', which expects an SVE vector type} } */ ++ *ptr = svcreate3 (x, f16, f16); /* { dg-error {passing 'int' to argument 1 of 'svcreate3', which expects an SVE vector type} } */ ++ *ptr = svcreate3 (pg, f16, f16); /* { dg-error {passing 'svfloat16_t' to argument 2 of 'svcreate3', but previous arguments had type 'svbool_t'} } */ ++ *ptr = svcreate3 (pg, pg, pg); /* { dg-error {'svcreate3' has no form that takes 'svbool_t' arguments} } */ ++ *ptr = svcreate3 (f16, f16, f16); ++ *ptr = svcreate3 (f64, f64, f64); /* { dg-error {incompatible types when assigning to type 'svfloat16x3_t' from type 'svfloat64x3_t'} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/create_4.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/create_4.c +new file mode 100644 +index 000000000..c111e9f29 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/create_4.c +@@ -0,0 +1,26 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-std=c99 -Wall -Wextra" } */ ++ ++#include ++ ++void ++f1 (svfloat16x3_t *ptr, svbool_t pg, svfloat16_t f16, svfloat64_t f64, ++ svfloat16x3_t f16x3, int x) ++{ ++ *ptr = svcreate3_f16 (f16); /* { dg-error {too few arguments to function 'svcreate3_f16'} } */ ++ *ptr = svcreate3_f16 (f16, f16); /* { dg-error {too few arguments to function 'svcreate3_f16'} } */ ++ *ptr = svcreate3_f16 (f16, f16, f16, f16); /* { dg-error {too many arguments to function 'svcreate3_f16'} } */ ++ *ptr = svcreate3_f16 (f16x3, f16x3, f16x3); /* { dg-error {incompatible type for argument 1 of 'svcreate3_f16'} } */ ++ /* { dg-error {incompatible type for argument 2 of 'svcreate3_f16'} "" { target *-*-* } .-1 } */ ++ /* { dg-error {incompatible type for argument 3 of 'svcreate3_f16'} "" 
++ *ptr = svcreate3_f16 (f16, f16, f64); /* { dg-error {incompatible type for argument 3 of 'svcreate3_f16'} } */
++ *ptr = svcreate3_f16 (f16, pg, f16); /* { dg-error {incompatible type for argument 2 of 'svcreate3_f16'} } */
++ *ptr = svcreate3_f16 (f16, x, f16); /* { dg-error {incompatible type for argument 2 of 'svcreate3_f16'} } */
++ *ptr = svcreate3_f16 (x, f16, f16); /* { dg-error {incompatible type for argument 1 of 'svcreate3_f16'} } */
++ *ptr = svcreate3_f16 (pg, f16, f16); /* { dg-error {incompatible type for argument 1 of 'svcreate3_f16'} } */
++ *ptr = svcreate3_f16 (pg, pg, pg); /* { dg-error {incompatible type for argument 1 of 'svcreate3_f16'} } */
++ /* { dg-error {incompatible type for argument 2 of 'svcreate3_f16'} "" { target *-*-* } .-1 } */
++ /* { dg-error {incompatible type for argument 3 of 'svcreate3_f16'} "" { target *-*-* } .-2 } */
++ *ptr = svcreate3_f16 (f16, f16, f16);
++ *ptr = svcreate3_f64 (f64, f64, f64); /* { dg-error {incompatible types when assigning to type 'svfloat16x3_t' from type 'svfloat64x3_t'} } */
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/create_5.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/create_5.c
+new file mode 100644
+index 000000000..fed124506
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/create_5.c
+@@ -0,0 +1,23 @@
++/* { dg-do compile } */
++/* { dg-additional-options "-std=c99 -Wall -Wextra" } */
++
++#include <arm_sve.h>
++
++void
++f1 (svint32x4_t *ptr, svbool_t pg, svint32_t s32, svfloat64_t f64,
++ svint32x4_t s32x4, int x)
++{
++ *ptr = svcreate4 (s32); /* { dg-error {too few arguments to function 'svcreate4'} } */
++ *ptr = svcreate4 (s32, s32); /* { dg-error {too few arguments to function 'svcreate4'} } */
++ *ptr = svcreate4 (s32, s32, s32); /* { dg-error {too few arguments to function 'svcreate4'} } */
++ *ptr = svcreate4 (s32, s32, s32, s32, s32); /* { dg-error {too many arguments to function 'svcreate4'} } */
++ *ptr = svcreate4 (s32x4, s32x4, s32x4, s32x4); /* { dg-error {passing 'svint32x4_t' to argument 1 of 'svcreate4', which expects a single SVE vector rather than a tuple} } */
++ *ptr = svcreate4 (s32, s32, s32, f64); /* { dg-error {passing 'svfloat64_t' to argument 4 of 'svcreate4', but previous arguments had type 'svint32_t'} } */
++ *ptr = svcreate4 (s32, s32, pg, s32); /* { dg-error {passing 'svbool_t' to argument 3 of 'svcreate4', but previous arguments had type 'svint32_t'} } */
++ *ptr = svcreate4 (s32, x, s32, s32); /* { dg-error {passing 'int' to argument 2 of 'svcreate4', which expects an SVE vector type} } */
++ *ptr = svcreate4 (x, s32, s32, s32); /* { dg-error {passing 'int' to argument 1 of 'svcreate4', which expects an SVE vector type} } */
++ *ptr = svcreate4 (pg, s32, s32, s32); /* { dg-error {passing 'svint32_t' to argument 2 of 'svcreate4', but previous arguments had type 'svbool_t'} } */
++ *ptr = svcreate4 (pg, pg, pg, pg); /* { dg-error {'svcreate4' has no form that takes 'svbool_t' arguments} } */
++ *ptr = svcreate4 (s32, s32, s32, s32);
++ *ptr = svcreate4 (f64, f64, f64, f64); /* { dg-error {incompatible types when assigning to type 'svint32x4_t' from type 'svfloat64x4_t'} } */
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/create_6.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/create_6.c
+new file mode 100644
+index 000000000..b9e298acf
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/create_6.c
+@@ -0,0 +1,29 @@
++/* { dg-do compile } */
++/* { dg-additional-options "-std=c99 -Wall -Wextra" } */
++
++#include <arm_sve.h>
++
++void
++f1 (svint32x4_t *ptr, svbool_t pg, svint32_t s32, svfloat64_t f64,
++ svint32x4_t s32x4, int x)
++{
++ *ptr = svcreate4_s32 (s32); /* { dg-error {too few arguments to function 'svcreate4_s32'} } */
++ *ptr = svcreate4_s32 (s32, s32); /* { dg-error {too few arguments to function 'svcreate4_s32'} } */
++ *ptr = svcreate4_s32 (s32, s32, s32); /* { dg-error {too few arguments to function 'svcreate4_s32'} } */
++ *ptr = svcreate4_s32 (s32, s32, s32, s32, s32); /* { dg-error {too many arguments to function 'svcreate4_s32'} } */
++ *ptr = svcreate4_s32 (s32x4, s32x4, s32x4, s32x4); /* { dg-error {incompatible type for argument 1 of 'svcreate4_s32'} } */
++ /* { dg-error {incompatible type for argument 2 of 'svcreate4_s32'} "" { target *-*-* } .-1 } */
++ /* { dg-error {incompatible type for argument 3 of 'svcreate4_s32'} "" { target *-*-* } .-2 } */
++ /* { dg-error {incompatible type for argument 4 of 'svcreate4_s32'} "" { target *-*-* } .-3 } */
++ *ptr = svcreate4_s32 (s32, s32, s32, f64); /* { dg-error {incompatible type for argument 4 of 'svcreate4_s32'} } */
++ *ptr = svcreate4_s32 (s32, s32, pg, s32); /* { dg-error {incompatible type for argument 3 of 'svcreate4_s32'} } */
++ *ptr = svcreate4_s32 (s32, x, s32, s32); /* { dg-error {incompatible type for argument 2 of 'svcreate4_s32'} } */
++ *ptr = svcreate4_s32 (x, s32, s32, s32); /* { dg-error {incompatible type for argument 1 of 'svcreate4_s32'} } */
++ *ptr = svcreate4_s32 (pg, s32, s32, s32); /* { dg-error {incompatible type for argument 1 of 'svcreate4_s32'} } */
++ *ptr = svcreate4_s32 (pg, pg, pg, pg); /* { dg-error {incompatible type for argument 1 of 'svcreate4_s32'} } */
++ /* { dg-error {incompatible type for argument 2 of 'svcreate4_s32'} "" { target *-*-* } .-1 } */
++ /* { dg-error {incompatible type for argument 3 of 'svcreate4_s32'} "" { target *-*-* } .-2 } */
++ /* { dg-error {incompatible type for argument 4 of 'svcreate4_s32'} "" { target *-*-* } .-3 } */
++ *ptr = svcreate4_s32 (s32, s32, s32, s32);
++ *ptr = svcreate4_f64 (f64, f64, f64, f64); /* { dg-error {incompatible types when assigning to type 'svint32x4_t' from type 'svfloat64x4_t'} } */
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ext_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ext_1.c
+new file mode 100644
+index 000000000..bdce3926d
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ext_1.c
+@@ -0,0 +1,67 @@
++/* { dg-do compile } */
++
++#include <arm_sve.h>
++
++void
++f1 (svbool_t pg, svint8_t s8, svuint8_t u8, svint16_t s16, svuint16_t u16,
++ svfloat16_t f16, svint32_t s32, svuint32_t u32, svfloat32_t f32,
++ svint64_t s64, svuint64_t u64, svfloat64_t f64, int i)
++{
++ svext (pg, pg, 0); /* { dg-error {'svext' has no form that takes 'svbool_t' arguments} } */
++ svext (s8, s8, i); /* { dg-error {argument 3 of 'svext' must be an integer constant expression} } */
++
++ svext (s8, s8, -1); /* { dg-error {passing -1 to argument 3 of 'svext', which expects a value in the range \[0, 255\]} } */
++ svext (s8, s8, 0);
++ svext (s8, s8, 255);
++ svext (s8, s8, 256); /* { dg-error {passing 256 to argument 3 of 'svext', which expects a value in the range \[0, 255\]} } */
++
++ svext (u8, u8, -1); /* { dg-error {passing -1 to argument 3 of 'svext', which expects a value in the range \[0, 255\]} } */
++ svext (u8, u8, 0);
++ svext (u8, u8, 255);
++ svext (u8, u8, 256); /* { dg-error {passing 256 to argument 3 of 'svext', which expects a value in the range \[0, 255\]} } */
++
++ svext (s16, s16, -1); /* { dg-error {passing -1 to argument 3 of 'svext', which expects a value in the range \[0, 127\]} } */
++ svext (s16, s16, 0);
++ svext (s16, s16, 127);
++ svext (s16, s16, 128); /* { dg-error {passing 128 to argument 3 of 'svext', which expects a value in the range \[0, 127\]} } */
++
++ svext (u16, u16, -1); /* { dg-error {passing -1 to argument 3 of 'svext', which expects a value in the range \[0, 127\]} } */
++ svext (u16, u16, 0);
++ svext (u16, u16, 127);
++ svext (u16, u16, 128); /* { dg-error {passing 128 to argument 3 of 'svext', which expects a value in the range \[0, 127\]} } */
++
++ svext (f16, f16, -1); /* { dg-error {passing -1 to argument 3 of 'svext', which expects a value in the range \[0, 127\]} } */
++ svext (f16, f16, 0);
++ svext (f16, f16, 127);
++ svext (f16, f16, 128); /* { dg-error {passing 128 to argument 3 of 'svext', which expects a value in the range \[0, 127\]} } */
++
++ svext (s32, s32, -1); /* { dg-error {passing -1 to argument 3 of 'svext', which expects a value in the range \[0, 63\]} } */
++ svext (s32, s32, 0);
++ svext (s32, s32, 63);
++ svext (s32, s32, 64); /* { dg-error {passing 64 to argument 3 of 'svext', which expects a value in the range \[0, 63\]} } */
++
++ svext (u32, u32, -1); /* { dg-error {passing -1 to argument 3 of 'svext', which expects a value in the range \[0, 63\]} } */
++ svext (u32, u32, 0);
++ svext (u32, u32, 63);
++ svext (u32, u32, 64); /* { dg-error {passing 64 to argument 3 of 'svext', which expects a value in the range \[0, 63\]} } */
++
++ svext (f32, f32, -1); /* { dg-error {passing -1 to argument 3 of 'svext', which expects a value in the range \[0, 63\]} } */
++ svext (f32, f32, 0);
++ svext (f32, f32, 63);
++ svext (f32, f32, 64); /* { dg-error {passing 64 to argument 3 of 'svext', which expects a value in the range \[0, 63\]} } */
++
++ svext (s64, s64, -1); /* { dg-error {passing -1 to argument 3 of 'svext', which expects a value in the range \[0, 31\]} } */
++ svext (s64, s64, 0);
++ svext (s64, s64, 31);
++ svext (s64, s64, 32); /* { dg-error {passing 32 to argument 3 of 'svext', which expects a value in the range \[0, 31\]} } */
++
++ svext (u64, u64, -1); /* { dg-error {passing -1 to argument 3 of 'svext', which expects a value in the range \[0, 31\]} } */
++ svext (u64, u64, 0);
++ svext (u64, u64, 31);
++ svext (u64, u64, 32); /* { dg-error {passing 32 to argument 3 of 'svext', which expects a value in the range \[0, 31\]} } */
++
++ svext (f64, f64, -1); /* { dg-error {passing -1 to argument 3 of 'svext', which expects a value in the range \[0, 31\]} } */
++ svext (f64, f64, 0);
++ svext (f64, f64, 31);
++ svext (f64, f64, 32); /* { dg-error {passing 32 to argument 3 of 'svext', which expects a value in the range \[0, 31\]} } */
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/fold_left_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/fold_left_1.c
+new file mode 100644
+index 000000000..1d292786d
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/fold_left_1.c
+@@ -0,0 +1,21 @@
++/* { dg-do compile } */
++
++#include <arm_sve.h>
++
++svuint8_t
++f1 (svbool_t pg, int i, float f, double d, void *ptr, svfloat32_t f32,
++ svint32_t i32)
++{
++ svadda (pg, f); /* { dg-error {too few arguments to function 'svadda'} } */
++ svadda (pg, f, f32, f32); /* { dg-error {too many arguments to function 'svadda'} } */
++ svadda (f32, f, f32); /* { dg-error {passing 'svfloat32_t' to argument 1 of 'svadda', which expects 'svbool_t'} } */
++ svadda (pg, i, f32);
++ svadda (pg, f, f32);
++ svadda (pg, d, f32);
++ svadda (pg, ptr, f32); /* { dg-error {incompatible type for argument 2 of 'svadda_f32'} } */
++ svadda (pg, pg, f32); /* { dg-error {passing 'svbool_t' to argument 2 of 'svadda', which expects a scalar element} } */
++ svadda (pg, f32, f32); /* { dg-error {passing 'svfloat32_t' to argument 2 of 'svadda', which expects a scalar element} } */
++ svadda (pg, f, f); /* { dg-error {passing 'float' to argument 3 of 'svadda', which expects an SVE vector type} } */
++ svadda (pg, i, i32); /* { dg-error {'svadda' has no form that takes 'svint32_t' arguments} } */
++ svadda (pg, i, i); /* { dg-error {passing 'int' to argument 3 of 'svadda', which expects an SVE vector type} } */
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/func_redef_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/func_redef_1.c
+new file mode 100644
+index 000000000..e1b99fa36
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/func_redef_1.c
+@@ -0,0 +1,5 @@
++/* { dg-do compile } */
++
++int svadd_n_u8_x; /* { dg-message "note: previous declaration of 'svadd_n_u8_x' was here" } */
++
++#pragma GCC aarch64 "arm_sve.h" /* { dg-error {'svadd_n_u8_x' redeclared} } */
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/func_redef_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/func_redef_2.c
+new file mode 100644
+index 000000000..7f653f117
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/func_redef_2.c
+@@ -0,0 +1,5 @@
++/* { dg-do compile } */
++
++int svadd_n_u8_x = 1; /* { dg-message "note: previous definition of 'svadd_n_u8_x' was here" } */
++
++#pragma GCC aarch64 "arm_sve.h" /* { dg-error {'svadd_n_u8_x' redeclared} } */
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/func_redef_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/func_redef_3.c
+new file mode 100644
+index 000000000..d9ff15a6c
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/func_redef_3.c
+@@ -0,0 +1,5 @@
++/* { dg-do compile } */
++
++extern __SVInt8_t svadd_u8_x (__SVBool_t, __SVInt8_t, __SVInt8_t); /* { dg-message "note: previous declaration of 'svadd_u8_x' was here" } */
++
++#pragma GCC aarch64 "arm_sve.h" /* { dg-error {conflicting types for 'svadd_u8_x'} } */
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/func_redef_4.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/func_redef_4.c
+new file mode 100644
+index 000000000..9591e3d01
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/func_redef_4.c
+@@ -0,0 +1,9 @@
++/* { dg-do compile } */
++
++/* Although somewhat suspect, this isn't actively wrong, and doesn't need
++ to be diagnosed. Any attempt to call the function before including
++ arm_sve.h will lead to a link failure. (Same for taking its address,
++ etc.) */
++extern __SVUint8_t svadd_u8_x (__SVBool_t, __SVUint8_t, __SVUint8_t);
++
++#pragma GCC aarch64 "arm_sve.h"
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/func_redef_5.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/func_redef_5.c
+new file mode 100644
+index 000000000..85923611d
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/func_redef_5.c
+@@ -0,0 +1,21 @@
++/* { dg-do compile } */
++
++/* There's no requirement to diagnose this. In particular, arm_sve.h
++ is allowed to use macros to implement the functions, and defining
++ a macro that matches an existing symbol would not be diagnosed.
++
++ At the moment this works like other built-ins in the sense that the
++ explicit definition "wins". This isn't supported behavior though. */
++__SVUint8_t
++svadd_u8_x (__SVBool_t pg, __SVUint8_t x, __SVUint8_t y)
++{
++ return x;
++}
++
++#pragma GCC aarch64 "arm_sve.h"
++
++svuint8_t
++f (svbool_t pg, svuint8_t x, svuint8_t y)
++{
++ return svadd_u8_x (pg, x, y);
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/func_redef_6.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/func_redef_6.c
+new file mode 100644
+index 000000000..1f04e4644
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/func_redef_6.c
+@@ -0,0 +1,5 @@
++/* { dg-do compile } */
++
++typedef int svadd_u8_x; /* { dg-message "note: previous declaration of 'svadd_u8_x' was here" } */
++
++#pragma GCC aarch64 "arm_sve.h" /* { dg-error {'svadd_u8_x' redeclared} } */
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/get_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/get_1.c
+new file mode 100644
+index 000000000..a3ac08fa8
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/get_1.c
+@@ -0,0 +1,31 @@
++/* { dg-do compile } */
++/* { dg-additional-options "-std=c99 -Wall -Wextra" } */
++
++#include <arm_sve.h>
++
++svfloat64_t
++f1 (svbool_t pg, svuint8_t u8, svuint8x2_t u8x2, svuint8x3_t u8x3, int x)
++{
++ const int one = 1;
++ svfloat64_t f64;
++
++ u8 = svget2 (u8x2); /* { dg-error {too few arguments to function 'svget2'} } */
++ u8 = svget2 (u8x2, 1, 2); /* { dg-error {too many arguments to function 'svget2'} } */
++ u8 = svget2 (u8, 0); /* { dg-error {passing single vector 'svuint8_t' to argument 1 of 'svget2', which expects a tuple of 2 vectors} } */
++ u8 = svget2 (u8x3, 0); /* { dg-error {passing 'svuint8x3_t' to argument 1 of 'svget2', which expects a tuple of 2 vectors} } */
++ u8 = svget2 (pg, 0); /* { dg-error {passing 'svbool_t' to argument 1 of 'svget2', which expects a tuple of 2 vectors} } */
++ u8 = svget2 (u8x2, x); /* { dg-error {argument 2 of 'svget2' must be an integer constant expression} } */
++ u8 = svget2 (u8x2, 0);
++ f64 = svget2 (u8x2, 0); /* { dg-error {incompatible types when assigning to type 'svfloat64_t' from type 'svuint8_t'} } */
++ u8 = svget2 (u8x2, 1);
++ u8 = svget2 (u8x2, 2); /* { dg-error {passing 2 to argument 2 of 'svget2', which expects a value in the range \[0, 1\]} } */
++ u8 = svget2 (u8x2, 3); /* { dg-error {passing 3 to argument 2 of 'svget2', which expects a value in the range \[0, 1\]} } */
++ u8 = svget2 (u8x2, 4); /* { dg-error {passing 4 to argument 2 of 'svget2', which expects a value in the range \[0, 1\]} } */
++ u8 = svget2 (u8x2, 5); /* { dg-error {passing 5 to argument 2 of 'svget2', which expects a value in the range \[0, 1\]} } */
++ u8 = svget2 (u8x2, ~0U); /* { dg-error {passing [^ ]* to argument 2 of 'svget2', which expects a value in the range \[0, 1\]} } */
++ u8 = svget2 (u8x2, one); /* { dg-error {argument 2 of 'svget2' must be an integer constant expression} } */
++ u8 = svget2 (u8x2, 3 - 2);
++ u8 = svget2 (u8x2, 1.0);
++
++ return f64;
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/get_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/get_2.c
+new file mode 100644
+index 000000000..4eee2439e
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/get_2.c
+@@ -0,0 +1,33 @@
++/* { dg-do compile } */
++/* { dg-additional-options "-std=c99 -Wall -Wextra" } */
++
++#include <arm_sve.h>
++
++svfloat64_t
++f1 (svbool_t pg, svuint8_t u8, svuint8x2_t u8x2, svint8x2_t s8x2,
++ svuint8x3_t u8x3, int x)
++{
++ const int one = 1;
++ svfloat64_t f64;
++
++ u8 = svget2_u8 (u8x2); /* { dg-error {too few arguments to function 'svget2_u8'} } */
++ u8 = svget2_u8 (u8x2, 1, 2); /* { dg-error {too many arguments to function 'svget2_u8'} } */
++ u8 = svget2_u8 (u8, 0); /* { dg-error {incompatible type for argument 1 of 'svget2_u8'} } */
++ u8 = svget2_u8 (s8x2, 0); /* { dg-error {incompatible type for argument 1 of 'svget2_u8'} } */
++ u8 = svget2_u8 (u8x3, 0); /* { dg-error {incompatible type for argument 1 of 'svget2_u8'} } */
++ u8 = svget2_u8 (pg, 0); /* { dg-error {incompatible type for argument 1 of 'svget2_u8'} } */
++ u8 = svget2_u8 (u8x2, x); /* { dg-error {argument 2 of 'svget2_u8' must be an integer constant expression} } */
++ u8 = svget2_u8 (u8x2, 0);
++ f64 = svget2_u8 (u8x2, 0); /* { dg-error {incompatible types when assigning to type 'svfloat64_t' from type 'svuint8_t'} } */
++ u8 = svget2_u8 (u8x2, 1);
++ u8 = svget2_u8 (u8x2, 2); /* { dg-error {passing 2 to argument 2 of 'svget2_u8', which expects a value in the range \[0, 1\]} } */
++ u8 = svget2_u8 (u8x2, 3); /* { dg-error {passing 3 to argument 2 of 'svget2_u8', which expects a value in the range \[0, 1\]} } */
++ u8 = svget2_u8 (u8x2, 4); /* { dg-error {passing 4 to argument 2 of 'svget2_u8', which expects a value in the range \[0, 1\]} } */
++ u8 = svget2_u8 (u8x2, 5); /* { dg-error {passing 5 to argument 2 of 'svget2_u8', which expects a value in the range \[0, 1\]} } */
++ u8 = svget2_u8 (u8x2, ~0U); /* { dg-error {passing [^ ]* to argument 2 of 'svget2_u8', which expects a value in the range \[0, 1\]} } */
++ u8 = svget2_u8 (u8x2, one); /* { dg-error {argument 2 of 'svget2_u8' must be an integer constant expression} } */
++ u8 = svget2_u8 (u8x2, 3 - 2);
++ u8 = svget2_u8 (u8x2, 1.0);
++
++ return f64;
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/get_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/get_3.c
+new file mode 100644
+index 000000000..0e7b2e227
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/get_3.c
+@@ -0,0 +1,32 @@
++/* { dg-do compile } */
++/* { dg-additional-options "-std=c99 -Wall -Wextra" } */
++
++#include <arm_sve.h>
++
++svfloat64_t
++f1 (svbool_t pg, svfloat16_t f16, svfloat16x3_t f16x3, svfloat16x4_t f16x4,
++ int x)
++{
++ const int one = 1;
++ svfloat64_t f64;
++
++ f16 = svget3 (f16x3); /* { dg-error {too few arguments to function 'svget3'} } */
++ f16 = svget3 (f16x3, 1, 2); /* { dg-error {too many arguments to function 'svget3'} } */
++ f16 = svget3 (f16, 0); /* { dg-error {passing single vector 'svfloat16_t' to argument 1 of 'svget3', which expects a tuple of 3 vectors} } */
++ f16 = svget3 (f16x4, 0); /* { dg-error {passing 'svfloat16x4_t' to argument 1 of 'svget3', which expects a tuple of 3 vectors} } */
++ f16 = svget3 (pg, 0); /* { dg-error {passing 'svbool_t' to argument 1 of 'svget3', which expects a tuple of 3 vectors} } */
++ f16 = svget3 (f16x3, x); /* { dg-error {argument 2 of 'svget3' must be an integer constant expression} } */
++ f16 = svget3 (f16x3, 0);
++ f64 = svget3 (f16x3, 0); /* { dg-error {incompatible types when assigning to type 'svfloat64_t' from type 'svfloat16_t'} } */
++ f16 = svget3 (f16x3, 1);
++ f16 = svget3 (f16x3, 2);
++ f16 = svget3 (f16x3, 3); /* { dg-error {passing 3 to argument 2 of 'svget3', which expects a value in the range \[0, 2\]} } */
++ f16 = svget3 (f16x3, 4); /* { dg-error {passing 4 to argument 2 of 'svget3', which expects a value in the range \[0, 2\]} } */
++ f16 = svget3 (f16x3, 5); /* { dg-error {passing 5 to argument 2 of 'svget3', which expects a value in the range \[0, 2\]} } */
++ f16 = svget3 (f16x3, ~0U); /* { dg-error {passing [^ ]* to argument 2 of 'svget3', which expects a value in the range \[0, 2\]} } */
++ f16 = svget3 (f16x3, one); /* { dg-error {argument 2 of 'svget3' must be an integer constant expression} } */
++ f16 = svget3 (f16x3, 3 - 2);
++ f16 = svget3 (f16x3, 1.0);
++
++ return f64;
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/get_4.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/get_4.c
+new file mode 100644
+index 000000000..72b4f82a6
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/get_4.c
+@@ -0,0 +1,33 @@
++/* { dg-do compile } */
++/* { dg-additional-options "-std=c99 -Wall -Wextra" } */
++
++#include <arm_sve.h>
++
++svfloat64_t
++f1 (svbool_t pg, svfloat16_t f16, svfloat16x3_t f16x3, svfloat32x3_t f32x3,
++ svfloat16x4_t f16x4, int x)
++{
++ const int one = 1;
++ svfloat64_t f64;
++
++ f16 = svget3_f16 (f16x3); /* { dg-error {too few arguments to function 'svget3_f16'} } */
++ f16 = svget3_f16 (f16x3, 1, 2); /* { dg-error {too many arguments to function 'svget3_f16'} } */
++ f16 = svget3_f16 (f16, 0); /* { dg-error {incompatible type for argument 1 of 'svget3_f16'} } */
++ f16 = svget3_f16 (f32x3, 0); /* { dg-error {incompatible type for argument 1 of 'svget3_f16'} } */
++ f16 = svget3_f16 (f16x4, 0); /* { dg-error {incompatible type for argument 1 of 'svget3_f16'} } */
++ f16 = svget3_f16 (pg, 0); /* { dg-error {incompatible type for argument 1 of 'svget3_f16'} } */
++ f16 = svget3_f16 (f16x3, x); /* { dg-error {argument 2 of 'svget3_f16' must be an integer constant expression} } */
++ f16 = svget3_f16 (f16x3, 0);
++ f64 = svget3_f16 (f16x3, 0); /* { dg-error {incompatible types when assigning to type 'svfloat64_t' from type 'svfloat16_t'} } */
++ f16 = svget3_f16 (f16x3, 1);
++ f16 = svget3_f16 (f16x3, 2);
++ f16 = svget3_f16 (f16x3, 3); /* { dg-error {passing 3 to argument 2 of 'svget3_f16', which expects a value in the range \[0, 2\]} } */
++ f16 = svget3_f16 (f16x3, 4); /* { dg-error {passing 4 to argument 2 of 'svget3_f16', which expects a value in the range \[0, 2\]} } */
++ f16 = svget3_f16 (f16x3, 5); /* { dg-error {passing 5 to argument 2 of 'svget3_f16', which expects a value in the range \[0, 2\]} } */
++ f16 = svget3_f16 (f16x3, ~0U); /* { dg-error {passing [^ ]* to argument 2 of 'svget3_f16', which expects a value in the range \[0, 2\]} } */
++ f16 = svget3_f16 (f16x3, one); /* { dg-error {argument 2 of 'svget3_f16' must be an integer constant expression} } */
++ f16 = svget3_f16 (f16x3, 3 - 2);
++ f16 = svget3_f16 (f16x3, 1.0);
++
++ return f64;
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/get_5.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/get_5.c
+new file mode 100644
+index 000000000..b0b69b95e
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/get_5.c
+@@ -0,0 +1,31 @@
++/* { dg-do compile } */
++/* { dg-additional-options "-std=c99 -Wall -Wextra" } */
++
++#include <arm_sve.h>
++
++svfloat64_t
++f1 (svbool_t pg, svint32_t s32, svint32x4_t s32x4, svint32x2_t s32x2, int x)
++{
++ const int one = 1;
++ svfloat64_t f64;
++
++ s32 = svget4 (s32x4); /* { dg-error {too few arguments to function 'svget4'} } */
++ s32 = svget4 (s32x4, 1, 2); /* { dg-error {too many arguments to function 'svget4'} } */
++ s32 = svget4 (s32, 0); /* { dg-error {passing single vector 'svint32_t' to argument 1 of 'svget4', which expects a tuple of 4 vectors} } */
++ s32 = svget4 (s32x2, 0); /* { dg-error {passing 'svint32x2_t' to argument 1 of 'svget4', which expects a tuple of 4 vectors} } */
++ s32 = svget4 (pg, 0); /* { dg-error {passing 'svbool_t' to argument 1 of 'svget4', which expects a tuple of 4 vectors} } */
++ s32 = svget4 (s32x4, x); /* { dg-error {argument 2 of 'svget4' must be an integer constant expression} } */
++ s32 = svget4 (s32x4, 0);
++ f64 = svget4 (s32x4, 0); /* { dg-error {incompatible types when assigning to type 'svfloat64_t' from type 'svint32_t'} } */
++ s32 = svget4 (s32x4, 1);
++ s32 = svget4 (s32x4, 2);
++ s32 = svget4 (s32x4, 3);
++ s32 = svget4 (s32x4, 4); /* { dg-error {passing 4 to argument 2 of 'svget4', which expects a value in the range \[0, 3\]} } */
++ s32 = svget4 (s32x4, 5); /* { dg-error {passing 5 to argument 2 of 'svget4', which expects a value in the range \[0, 3\]} } */
++ s32 = svget4 (s32x4, ~0U); /* { dg-error {passing [^ ]* to argument 2 of 'svget4', which expects a value in the range \[0, 3\]} } */
++ s32 = svget4 (s32x4, one); /* { dg-error {argument 2 of 'svget4' must be an integer constant expression} } */
++ s32 = svget4 (s32x4, 3 - 2);
++ s32 = svget4 (s32x4, 1.0);
++
++ return f64;
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/get_6.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/get_6.c
+new file mode 100644
+index 000000000..3801c0c4e
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/get_6.c
+@@ -0,0 +1,33 @@
++/* { dg-do compile } */
++/* { dg-additional-options "-std=c99 -Wall -Wextra" } */
++
++#include <arm_sve.h>
++
++svfloat64_t
++f1 (svbool_t pg, svint32_t s32, svint32x4_t s32x4, svfloat32x4_t f32x4,
++ svint32x2_t s32x2, int x)
++{
++ const int one = 1;
++ svfloat64_t f64;
++
++ s32 = svget4_s32 (s32x4); /* { dg-error {too few arguments to function 'svget4_s32'} } */
++ s32 = svget4_s32 (s32x4, 1, 2); /* { dg-error {too many arguments to function 'svget4_s32'} } */
++ s32 = svget4_s32 (s32, 0); /* { dg-error {incompatible type for argument 1 of 'svget4_s32'} } */
++ s32 = svget4_s32 (f32x4, 0); /* { dg-error {incompatible type for argument 1 of 'svget4_s32'} } */
++ s32 = svget4_s32 (s32x2, 0); /* { dg-error {incompatible type for argument 1 of 'svget4_s32'} } */
++ s32 = svget4_s32 (pg, 0); /* { dg-error {incompatible type for argument 1 of 'svget4_s32'} } */
++ s32 = svget4_s32 (s32x4, x); /* { dg-error {argument 2 of 'svget4_s32' must be an integer constant expression} } */
++ s32 = svget4_s32 (s32x4, 0);
++ f64 = svget4_s32 (s32x4, 0); /* { dg-error {incompatible types when assigning to type 'svfloat64_t' from type 'svint32_t'} } */
++ s32 = svget4_s32 (s32x4, 1);
++ s32 = svget4_s32 (s32x4, 2);
++ s32 = svget4_s32 (s32x4, 3);
++ s32 = svget4_s32 (s32x4, 4); /* { dg-error {passing 4 to argument 2 of 'svget4_s32', which expects a value in the range \[0, 3\]} } */
++ s32 = svget4_s32 (s32x4, 5); /* { dg-error {passing 5 to argument 2 of 'svget4_s32', which expects a value in the range \[0, 3\]} } */
++ s32 = svget4_s32 (s32x4, ~0U); /* { dg-error {passing [^ ]* to argument 2 of 'svget4_s32', which expects a value in the range \[0, 3\]} } */
++ s32 = svget4_s32 (s32x4, one); /* { dg-error {argument 2 of 'svget4_s32' must be an integer constant expression} } */
++ s32 = svget4_s32 (s32x4, 3 - 2);
++ s32 = svget4_s32 (s32x4, 1.0);
++
++ return f64;
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_1.c
+new file mode 100644
+index 000000000..dcd291da6
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_1.c
+@@ -0,0 +1,37 @@
++#include <arm_sve.h>
++
++void
++test (svbool_t pg, svint8_t s8, svuint8_t u8,
++ svint16_t s16, svuint16_t u16, svint32_t s32, svuint32_t u32,
++ svint64_t s64, svuint64_t u64, int16_t sh, uint16_t uh,
++ int32_t sw, uint32_t uw, int64_t sd, uint64_t ud,
++ float f, int i)
++{
++ svqincb (sw); /* { dg-error {too few arguments to function 'svqincb'} } */
++ svqincb (sw, 1, 1); /* { dg-error {too many arguments to function 'svqincb'} } */
++
++ svqincb (pg, 1); /* { dg-error {'svqincb' has no form that takes 'svbool_t' arguments} } */
++ svqincb (s8, 1); /* { dg-error {'svqincb' has no form that takes 'svint8_t' arguments} } */
++ svqincb (u8, 1); /* { dg-error {'svqincb' has no form that takes 'svuint8_t' arguments} } */
++ svqincb (s16, 1); /* { dg-error {'svqincb' has no form that takes 'svint16_t' arguments} } */
++ svqincb (u16, 1); /* { dg-error {'svqincb' has no form that takes 'svuint16_t' arguments} } */
++ svqincb (s32, 1); /* { dg-error {'svqincb' has no form that takes 'svint32_t' arguments} } */
++ svqincb (u32, 1); /* { dg-error {'svqincb' has no form that takes 'svuint32_t' arguments} } */
++ svqincb (s64, 1); /* { dg-error {'svqincb' has no form that takes 'svint64_t' arguments} } */
++ svqincb (u64, 1); /* { dg-error {'svqincb' has no form that takes 'svuint64_t' arguments} } */
++ svqincb (sh, 1);
++ svqincb (sw, 1);
++ svqincb (sd, 1);
++ svqincb (uh, 1);
++ svqincb (uw, 1);
++ svqincb (ud, 1);
++ svqincb (f, 1); /* { dg-error {passing 'float' to argument 1 of 'svqincb', which expects a 32-bit or 64-bit integer type} } */
++ svqincb (ud, i); /* { dg-error {argument 2 of 'svqincb' must be an integer constant expression} } */
++
++ svqincb (sw, -1); /* { dg-error {passing -1 to argument 2 of 'svqincb', which expects a value in the range \[1, 16\]} } */
++ svqincb (sw, 0); /* { dg-error {passing 0 to argument 2 of 'svqincb', which expects a value in the range \[1, 16\]} } */
++ svqincb (sw, 1);
++ svqincb (sw, 2);
++ svqincb (sw, 16);
++ svqincb (sw, 17); /* { dg-error {passing 17 to argument 2 of 'svqincb', which expects a value in the range \[1, 16\]} } */
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_2.c
+new file mode 100644
+index 000000000..e5acad187
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_2.c
+@@ -0,0 +1,13 @@
++#include <arm_sve.h>
++
++void
++test (int32_t sw, int i)
++{
++ svqincb_n_s32 (sw, -1); /* { dg-error {passing -1 to argument 2 of 'svqincb_n_s32', which expects a value in the range \[1, 16\]} } */
++ svqincb_n_s32 (sw, 0); /* { dg-error {passing 0 to argument 2 of 'svqincb_n_s32', which expects a value in the range \[1, 16\]} } */
++ svqincb_n_s32 (sw, 1);
++ svqincb_n_s32 (sw, 2);
++ svqincb_n_s32 (sw, 16);
++ svqincb_n_s32 (sw, 17); /* { dg-error {passing 17 to argument 2 of 'svqincb_n_s32', which expects a value in the range \[1, 16\]} } */
++ svqincb_n_s32 (sw, i); /* { dg-error {argument 2 of 'svqincb_n_s32' must be an integer constant expression} } */
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_3.c
+new file mode 100644
+index 000000000..351e7757f
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_3.c
+@@ -0,0 +1,26 @@
++#include <arm_sve.h>
++
++void
++test (svbool_t pg, svint8_t s8, svuint8_t u8,
++ svint16_t s16, svuint16_t u16, svint32_t s32, svuint32_t u32,
++ svint64_t s64, svuint64_t u64, int16_t sh, uint16_t uh,
++ int32_t sw, uint32_t uw, int64_t sd, uint64_t ud,
++ float f)
++{
++ svqinch (pg, 1); /* { dg-error {'svqinch' has no form that takes 'svbool_t' arguments} } */
++ svqinch (s8, 1); /* { dg-error {'svqinch' has no form that takes 'svint8_t' arguments} } */
++ svqinch (u8, 1); /* { dg-error {'svqinch' has no form that takes 'svuint8_t' arguments} } */
++ svqinch (s16, 1);
++ svqinch (u16, 1);
++ svqinch (s32, 1); /* { dg-error {'svqinch' has no form that takes 'svint32_t' arguments} } */
++ svqinch (u32, 1); /* { dg-error {'svqinch' has no form that takes 'svuint32_t' arguments} } */
++ svqinch (s64, 1); /* { dg-error {'svqinch' has no form that takes 'svint64_t' arguments} } */
++ svqinch (u64, 1); /* { dg-error {'svqinch' has no form that takes 'svuint64_t' arguments} } */
++ svqinch (sh, 1);
++ svqinch (sw, 1);
++ svqinch (sd, 1);
++ svqinch (uh, 1);
++ svqinch (uw, 1);
++ svqinch (ud, 1);
++ svqinch (f, 1); /* { dg-error {passing 'float' to argument 1 of 'svqinch', which expects a 32-bit or 64-bit integer type} } */
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_4.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_4.c
+new file mode 100644
+index 000000000..e071c0229
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_4.c
+@@ -0,0 +1,26 @@
++#include <arm_sve.h>
++
++void
++test (svbool_t pg, svint8_t s8, svuint8_t u8,
++ svint16_t s16, svuint16_t u16, svint32_t s32, svuint32_t u32,
++ svint64_t s64, svuint64_t u64, int16_t sh, uint16_t uh,
++ int32_t sw, uint32_t uw, int64_t sd, uint64_t ud,
++ float f)
++{
++ svqincw (pg, 1); /* { dg-error {'svqincw' has no form that takes 'svbool_t' arguments} } */
++ svqincw (s8, 1); /* { dg-error {'svqincw' has no form that takes 'svint8_t' arguments} } */
++ svqincw (u8, 1); /* { dg-error {'svqincw' has no form that takes 'svuint8_t' arguments} } */
++ svqincw (s16, 1); /* { dg-error {'svqincw' has no form that takes 'svint16_t' arguments} } */
++ svqincw (u16, 1); /* { dg-error {'svqincw' has no form that takes 'svuint16_t' arguments} } */
++ svqincw (s32, 1);
++ svqincw (u32, 1);
++ svqincw (s64, 1); /* { dg-error {'svqincw' has no form that takes 'svint64_t' arguments} } */
++ svqincw (u64, 1); /* { dg-error {'svqincw' has no form that takes 'svuint64_t' arguments} } */
++ svqincw (sh, 1);
++ svqincw (sw, 1);
++ svqincw (sd, 1);
++ svqincw (uh, 1);
++ svqincw (uw, 1);
++ svqincw (ud, 1);
++ svqincw (f, 1); /* { dg-error {passing 'float' to argument 1 of 'svqincw', which expects a 32-bit or 64-bit integer type} } */
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_5.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_5.c
+new file mode 100644
+index 000000000..be9c76928
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_5.c
+@@ -0,0 +1,26 @@
++#include <arm_sve.h>
++
++void
++test (svbool_t pg, svint8_t s8, svuint8_t u8,
++ svint16_t s16, svuint16_t u16, svint32_t s32, svuint32_t u32,
++ svint64_t s64, svuint64_t u64, int16_t sh, uint16_t uh,
++ int32_t sw, uint32_t uw, int64_t sd, uint64_t ud,
++ float f)
++{
++ svqincd (pg, 1); /* { dg-error {'svqincd' has no form that takes 'svbool_t' arguments} } */
++ svqincd (s8, 1); /* { dg-error {'svqincd' has no form that takes 'svint8_t' arguments} } */
++ svqincd (u8, 1); /* { dg-error {'svqincd' has no form that takes 'svuint8_t' arguments} } */
++ svqincd (s16, 1); /* { dg-error {'svqincd' has no form that takes 'svint16_t' arguments} } */
++ svqincd (u16, 1); /* { dg-error {'svqincd' has no form that takes 'svuint16_t' arguments} } */
++ svqincd (s32, 1); /* { dg-error {'svqincd' has no form that takes 'svint32_t' arguments} } */
++ svqincd (u32, 1); /* { dg-error {'svqincd' has no form that takes 'svuint32_t' arguments} } */
++ svqincd (s64, 1);
++ svqincd (u64, 1);
++ svqincd (sh, 1);
++ svqincd (sw, 1);
++ svqincd (sd, 1);
++ svqincd (uh, 1);
++ svqincd (uw, 1);
++ svqincd (ud, 1);
++ svqincd (f, 1); /* { dg-error {passing 'float' to argument 1 of 'svqincd', which expects a 32-bit or 64-bit integer type} } */
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_pat_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_pat_1.c
+new file mode 100644
+index 000000000..f2e5841d4
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_pat_1.c
+@@ -0,0 +1,47 @@
++#include <arm_sve.h>
++
++void
++test (enum svpattern pat, svbool_t pg, svint8_t s8, svuint8_t u8,
++ svint16_t s16, svuint16_t u16, svint32_t s32, svuint32_t u32,
++ svint64_t s64, svuint64_t u64, int16_t sh, uint16_t uh,
++ int32_t sw, uint32_t uw, int64_t sd, uint64_t ud,
++ float f, int i)
++{
++ svqincb_pat (sw, pat); /* { dg-error {too few arguments to function 'svqincb_pat'} } */
++ svqincb_pat (sw, pat, 1, 1); /* { dg-error {too many arguments to function 'svqincb_pat'} } */
++
++ svqincb_pat (pg, SV_ALL, 1); /* { dg-error {'svqincb_pat' has no form that takes 'svbool_t' arguments} } */
++ svqincb_pat (s8, SV_ALL, 1); /* { dg-error {'svqincb_pat' has no form that takes 'svint8_t' arguments} } */
++ svqincb_pat (u8, SV_ALL, 1); /* { dg-error {'svqincb_pat' has no form that takes 'svuint8_t' arguments} } */
++ svqincb_pat (s16, SV_ALL, 1); /* { dg-error {'svqincb_pat' has no form that takes 'svint16_t' arguments} } */
++ svqincb_pat (u16, SV_ALL, 1); /* { dg-error {'svqincb_pat' has no form that takes 'svuint16_t' arguments} } */
++ svqincb_pat (s32, SV_ALL, 1); /* { dg-error {'svqincb_pat' has no form that takes 'svint32_t' arguments} } */
++ svqincb_pat (u32, SV_ALL, 1); /* { dg-error {'svqincb_pat' has no form that takes 'svuint32_t' arguments} } */
++ svqincb_pat (s64, SV_ALL, 1); /* { dg-error {'svqincb_pat' has no form that takes 'svint64_t' arguments} } */
++ svqincb_pat (u64, SV_ALL, 1); /* { dg-error {'svqincb_pat' has no form that takes 'svuint64_t' arguments} } */
++ svqincb_pat (sh, SV_ALL, 1);
++ svqincb_pat (sw, SV_ALL, 1);
++ svqincb_pat (sd, SV_ALL, 1);
++ svqincb_pat (uh, SV_ALL, 1);
++ svqincb_pat (uw, SV_ALL, 1);
++ svqincb_pat (ud, SV_ALL, 1);
++ svqincb_pat (f, SV_ALL, 1); /* { dg-error {passing 'float' to argument 1 of 'svqincb_pat', which expects a 32-bit or 64-bit integer type} } */
++
++ svqincb_pat (sw, pat, 1); /* { dg-error {argument 2 of 'svqincb_pat' must be an integer constant expression} } */
++ svqincb_pat (sw, i, 1); /* { dg-error {argument 2 of 'svqincb_pat' must be an integer constant expression} } */
++ svqincb_pat (sw, (enum svpattern) -1, 1); /* { dg-error {passing 4294967295 to argument 2 of 'svqincb_pat', which expects a valid 'enum svpattern' value} } */
++ svqincb_pat (sw, (enum svpattern) 0, 1);
++ svqincb_pat (sw, (enum svpattern) 13, 1);
++ svqincb_pat (sw, (enum svpattern) 14, 1); /* { dg-error {passing 14 to argument 2 of 'svqincb_pat', which expects a valid 'enum svpattern' value} } */
++ svqincb_pat (sw, (enum svpattern) 28, 1); /* { dg-error {passing 28 to argument 2 of 'svqincb_pat', which expects a valid 'enum svpattern' value} } */
++ svqincb_pat (sw, (enum svpattern) 29, 1);
++ svqincb_pat (sw, (enum svpattern) 31, 1);
++ svqincb_pat (sw, (enum svpattern) 32, 1); /* { dg-error {passing 32 to argument 2 of 'svqincb_pat', which expects a valid 'enum svpattern' value} } */
++
++ svqincb_pat (sw, SV_POW2, -1); /* { dg-error {passing -1 to argument 3 of 'svqincb_pat', which expects a value in the range \[1, 16\]} } */
++ svqincb_pat (sw, SV_POW2, 0); /* { dg-error {passing 0 to argument 3 of 'svqincb_pat', which expects a value in the range \[1, 16\]} } */
++ svqincb_pat (sw, SV_POW2, 1);
++ svqincb_pat (sw, SV_POW2, 2);
++ svqincb_pat (sw, SV_POW2, 16);
++ svqincb_pat (sw, SV_POW2, 17); /* { dg-error {passing 17 to argument 3 of 'svqincb_pat', which expects a value in the range \[1, 16\]} } */
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_pat_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_pat_2.c
+new file mode 100644
+index 000000000..c1c1ab9d9
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_pat_2.c
+@@ -0,0 +1,23 @@
++#include <arm_sve.h>
++
++void
++test (int32_t sw, enum svpattern pat, int i)
++{
++ svqincb_pat_n_s32 (sw, pat, 1); /* { dg-error {argument 2 of 'svqincb_pat_n_s32' must be an integer constant expression} } */
++ svqincb_pat_n_s32 (sw, i, 1); /* { dg-error {argument 2 of 'svqincb_pat_n_s32' must be an integer constant expression} } */
++ svqincb_pat_n_s32 (sw, (enum svpattern) -1, 1); /* { dg-error {passing 4294967295 to argument 2 of 'svqincb_pat_n_s32', which expects a valid 'enum svpattern' value} } */
++ svqincb_pat_n_s32 (sw, (enum svpattern) 0, 1);
++ svqincb_pat_n_s32 (sw, (enum svpattern) 13, 1);
++ svqincb_pat_n_s32 (sw, (enum svpattern) 14, 1); /* { dg-error {passing 14 to argument 2 of 'svqincb_pat_n_s32', which expects a valid 'enum svpattern' value} } */
++ svqincb_pat_n_s32 (sw, (enum svpattern) 28, 1); /* { dg-error {passing 28 to argument 2 of 'svqincb_pat_n_s32', which expects a valid 'enum svpattern' value} } */
++ svqincb_pat_n_s32 (sw, (enum svpattern) 29, 1);
++ svqincb_pat_n_s32 (sw, (enum svpattern) 31, 1);
++ svqincb_pat_n_s32 (sw, (enum svpattern) 32, 1); /* { dg-error {passing 32 to argument 2 of 'svqincb_pat_n_s32', which expects a valid 'enum svpattern' value} } */
++
++ svqincb_pat_n_s32 (sw, SV_POW2, -1); /* { dg-error {passing -1 to argument 3 of 'svqincb_pat_n_s32', which expects a value in the range \[1, 16\]} } */
++ svqincb_pat_n_s32 (sw, SV_POW2, 0); /* { dg-error {passing 0 to argument 3 of 'svqincb_pat_n_s32', which expects a value in the range \[1, 16\]} } */
++ svqincb_pat_n_s32 (sw, SV_POW2, 1);
++ svqincb_pat_n_s32 (sw, SV_POW2, 2);
++ svqincb_pat_n_s32 (sw, SV_POW2, 16);
++ svqincb_pat_n_s32 (sw, SV_POW2, 17); /* { dg-error {passing 17 to argument 3 of 'svqincb_pat_n_s32', which expects a value in the range \[1, 16\]} } */
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_pat_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_pat_3.c
+new file mode 100644
+index 000000000..4126b2461
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_pat_3.c
+@@ -0,0 +1,26 @@
++#include <arm_sve.h>
++
++void
++test (svbool_t pg, svint8_t s8, svuint8_t u8,
++ svint16_t s16, svuint16_t u16, svint32_t s32, svuint32_t u32,
++ svint64_t s64, svuint64_t u64, int16_t sh, uint16_t uh,
++ int32_t sw, uint32_t uw, int64_t sd, uint64_t ud,
++ float f)
++{
++ svqinch_pat (pg, SV_ALL, 1); /* { dg-error {'svqinch_pat' has no form that takes 'svbool_t' arguments} } */
++ svqinch_pat (s8, SV_ALL, 1); /* { dg-error {'svqinch_pat' has no form that takes 'svint8_t' arguments} } */
++ svqinch_pat (u8, SV_ALL, 1); /* { dg-error {'svqinch_pat' has no form that takes 'svuint8_t' arguments} } */
++ svqinch_pat (s16, SV_ALL, 1);
++ svqinch_pat (u16, SV_ALL, 1);
++ svqinch_pat (s32, SV_ALL, 1); /* { dg-error {'svqinch_pat' has no form that takes 'svint32_t' arguments} } */
++ svqinch_pat (u32, SV_ALL, 1); /* { dg-error {'svqinch_pat' has no form that takes 'svuint32_t' arguments} } */
++ svqinch_pat (s64, SV_ALL, 1); /* { dg-error {'svqinch_pat' has no form that takes 'svint64_t' arguments} } */
++ svqinch_pat (u64, SV_ALL, 1); /* { dg-error {'svqinch_pat' has no form that takes 'svuint64_t' arguments} } */
++ svqinch_pat (sh, SV_ALL, 1);
++ svqinch_pat (sw, SV_ALL, 1);
++ svqinch_pat (sd, SV_ALL, 1);
++ svqinch_pat (uh, SV_ALL, 1);
++ svqinch_pat (uw, SV_ALL, 1);
++ svqinch_pat (ud, SV_ALL, 1);
++ svqinch_pat (f, SV_ALL, 1); /* { dg-error {passing 'float' to argument 1 of 'svqinch_pat', which expects a 32-bit or 64-bit integer type} } */
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_pat_4.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_pat_4.c
+new file mode 100644
+index 000000000..9aabbd714
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_pat_4.c
+@@ -0,0 +1,26 @@
++#include <arm_sve.h>
++
++void
++test (svbool_t pg, svint8_t s8, svuint8_t u8,
++ svint16_t s16, svuint16_t u16, svint32_t s32, svuint32_t u32,
++ svint64_t s64, svuint64_t u64, int16_t sh, uint16_t uh,
++ int32_t sw, uint32_t uw, int64_t sd, uint64_t ud,
++ float f)
++{
++ svqincw_pat (pg, SV_ALL, 1); /* { dg-error {'svqincw_pat' has no form that takes 'svbool_t' arguments} } */
++ svqincw_pat (s8, SV_ALL, 1); /* { dg-error {'svqincw_pat' has no form that takes 'svint8_t' arguments} } */
++ svqincw_pat (u8, SV_ALL, 1); /* { dg-error {'svqincw_pat' has no form that takes 'svuint8_t' arguments} } */
++ svqincw_pat (s16, SV_ALL, 1); /* { dg-error {'svqincw_pat' has no form that takes 'svint16_t' arguments} } */
++ svqincw_pat (u16, SV_ALL, 1); /* { dg-error {'svqincw_pat' has no form that takes 'svuint16_t' arguments} } */
++ svqincw_pat (s32, SV_ALL, 1);
++ svqincw_pat (u32, SV_ALL, 1);
++ svqincw_pat (s64, SV_ALL, 1); /* { dg-error {'svqincw_pat' has no form that takes 'svint64_t' arguments} } */
++ svqincw_pat (u64, SV_ALL, 1); /* { dg-error {'svqincw_pat' has no form that takes 'svuint64_t' arguments} } */
++ svqincw_pat (sh, SV_ALL, 1);
++ svqincw_pat (sw, SV_ALL, 1);
++ svqincw_pat (sd, SV_ALL, 1);
++ svqincw_pat (uh, SV_ALL, 1);
++ svqincw_pat (uw, SV_ALL, 1);
++ svqincw_pat (ud, SV_ALL, 1);
++ svqincw_pat (f, SV_ALL, 1); /* { dg-error {passing 'float' to argument 1 of 'svqincw_pat', which expects a 32-bit or 64-bit integer type} } */
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_pat_5.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_pat_5.c
+new file mode 100644
+index 000000000..5df88c649
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_pat_5.c
+@@ -0,0 +1,26 @@
++#include <arm_sve.h>
++
++void
++test (svbool_t pg, svint8_t s8, svuint8_t u8,
++ svint16_t s16, svuint16_t u16, svint32_t s32, svuint32_t u32,
++ svint64_t s64, svuint64_t u64, int16_t sh, uint16_t uh,
++ int32_t sw, uint32_t uw, int64_t sd, uint64_t ud,
++ float f)
++{
++ svqincd_pat (pg, SV_ALL, 1); /* { dg-error {'svqincd_pat' has no form that takes 'svbool_t' arguments} } */
++ svqincd_pat (s8, SV_ALL, 1); /* { dg-error {'svqincd_pat' has no form that takes 'svint8_t' arguments} } */
++ svqincd_pat (u8, SV_ALL, 1); /* { dg-error {'svqincd_pat' has no form that takes 'svuint8_t' arguments} } */
++ svqincd_pat (s16, SV_ALL, 1); /* { dg-error {'svqincd_pat' has no form that takes 'svint16_t' arguments} } */
++ svqincd_pat (u16, SV_ALL, 1); /* { dg-error {'svqincd_pat' has no form that takes 'svuint16_t' arguments} } */
++ svqincd_pat (s32, SV_ALL, 1); /* { dg-error {'svqincd_pat' has no form that takes 'svint32_t' arguments} } */
++ svqincd_pat (u32, SV_ALL, 1); /* { dg-error {'svqincd_pat' has no form that takes 'svuint32_t' arguments} } */
++ svqincd_pat (s64, SV_ALL, 1);
++ svqincd_pat (u64, SV_ALL, 1);
++ svqincd_pat (sh, SV_ALL, 1);
++ svqincd_pat (sw, SV_ALL, 1);
++ svqincd_pat (sd, SV_ALL, 1);
++ svqincd_pat (uh, SV_ALL, 1);
++ svqincd_pat (uw, SV_ALL, 1);
++ svqincd_pat (ud, SV_ALL, 1);
++ svqincd_pat (f, SV_ALL, 1); /* { dg-error {passing 'float' to argument 1 of 'svqincd_pat', which expects a 32-bit or 64-bit integer type} } */
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_pred_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_pred_1.c
+new file mode 100644
+index 000000000..a61afcd2d
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_pred_1.c
+@@ -0,0 +1,22 @@
++#include <arm_sve.h>
++
++void
++test (svbool_t pg, svint8_t s8, svuint8_t u8,
++ svint16_t s16, svuint16_t u16, svint32_t s32, svuint32_t u32,
++ svint64_t s64, svuint64_t u64, int i)
++{
++ svqincp (s32); /* { dg-error {too few arguments to function 'svqincp'} } */
++ svqincp (s32, pg, pg); /* { dg-error {too many arguments to function 'svqincp'} } */
++ svqincp (i, pg); /* { dg-error {passing 'int' to argument 1 of 'svqincp', which expects an SVE vector type} } */
++ svqincp (pg, pg); /* { dg-error {'svqincp' has no form that takes 'svbool_t' arguments} } */
++ svqincp (s8, pg); /* { dg-error {'svqincp' has no form that takes 'svint8_t' arguments} } */
++ svqincp (u8, pg); /* { dg-error {'svqincp' has no form that takes 'svuint8_t' arguments} } */
++ svqincp (s16, pg);
++ svqincp (u16, pg);
++ svqincp (s32, pg);
++ svqincp (u32, pg);
++ svqincp (s64, pg);
++ svqincp (u64, pg);
++ svqincp (u64, 0); /* { dg-error {passing 'int' to argument 2 of 'svqincp', which expects 'svbool_t'} } */
++ svqincp (u64, u64); /* { dg-error {passing 'svuint64_t' to argument 2 of 'svqincp', which expects 'svbool_t'} } */
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_pred_scalar_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_pred_scalar_1.c
+new file mode 100644
+index 000000000..94ebe7e7a
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_pred_scalar_1.c
+@@ -0,0 +1,19 @@
++#include <arm_sve.h>
++
++void
++test (svbool_t pg, svint32_t s32, svuint64_t u64, int16_t sh, uint16_t uh,
++ int32_t sw, uint32_t uw, int64_t sd, uint64_t ud)
++{
++ svqincp_b8 (s32); /* { dg-error {too few arguments to function 'svqincp_b8'} } */
++ svqincp_b8 (s32, pg, pg); /* { dg-error {too many arguments to function 'svqincp_b8'} } */
++ svqincp_b8 (pg, pg); /* { dg-error {passing 'svbool_t' to argument 1 of 'svqincp_b8', which expects a 32-bit or 64-bit integer type} } */
++ svqincp_b8 (s32, pg); /* { dg-error {passing 'svint32_t' to argument 1 of 'svqincp_b8', which expects a 32-bit or 64-bit integer type} } */
++ svqincp_b8 (sh, pg);
++ svqincp_b8 (uh, pg);
++ svqincp_b8 (sw, pg);
++ svqincp_b8 (uw, pg);
++ svqincp_b8 (sd, pg);
++ svqincp_b8 (ud, pg);
++ svqincp_b8 (ud, 0); /* { dg-error {passing 'int' to argument 2 of 'svqincp_b8', which expects 'svbool_t'} } */
++ svqincp_b8 (ud, u64); /* { dg-error {passing 'svuint64_t' to argument 2 of 'svqincp_b8', which expects 'svbool_t'} } */
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ld1sh_gather_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ld1sh_gather_1.c
+new file mode 100644
+index 000000000..91f37f6a5
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ld1sh_gather_1.c
+@@ -0,0 +1,35 @@
++/* { dg-do compile } */
++/* { dg-options "-std=c99 -Wpointer-sign" } */
++
++#include <arm_sve.h>
++
++struct s { int i; };
++
++void
++f1 (svbool_t pg, short *s16_ptr, unsigned short *u16_ptr,
++ svint8_t s8, svint16_t s16,
++ svint32_t s32, svuint32_t u32, svfloat32_t f32,
++ svint64_t s64, svuint64_t u64, svfloat64_t f64, struct s s)
++{
++ svld1sh_gather_index (pg, s16_ptr, s32); /* { dg-warning {implicit declaration of function 'svld1sh_gather_index'; did you mean 'svld1_gather_index'} } */
++ svld1sh_gather_index_u32 (pg, s16_ptr); /* { dg-error {too few arguments to function 'svld1sh_gather_index_u32'} } */
++ svld1sh_gather_index_u32 (pg, s16_ptr, s32, 0); /* { dg-error {too many arguments to function 'svld1sh_gather_index_u32'} } */
++ svld1sh_gather_index_u32 (pg, u16_ptr, s32); /* { dg-warning {pointer targets in passing argument 2 of 'svld1sh_gather_s32index_u32' differ in signedness} } */
++ svld1sh_gather_index_u32 (pg, s16_ptr, pg); /* { dg-error {passing 'svbool_t' to argument 3 of 'svld1sh_gather_index_u32', which expects a vector of 32-bit integers} } */
++ svld1sh_gather_index_u32 (pg, s16_ptr, s8); /* { dg-error {passing 'svint8_t' to argument 3 of 'svld1sh_gather_index_u32', which expects a vector of 32-bit integers} } */
++ svld1sh_gather_index_u32 (pg, s16_ptr, s16); /* { dg-error {passing 'svint16_t' to argument 3 of 'svld1sh_gather_index_u32', which expects a vector of 32-bit integers} } */
++ svld1sh_gather_index_u32 (pg, s16_ptr, s32);
++ svld1sh_gather_index_u32 (pg, s16_ptr, u32);
++ svld1sh_gather_index_u32 (pg, s16_ptr, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svld1sh_gather_index_u32', which expects a vector of 32-bit integers} } */
++ svld1sh_gather_index_u32 (pg, s16_ptr, s64); /* { dg-error {passing 'svint64_t' to argument 3 of 'svld1sh_gather_index_u32', which expects a vector of 32-bit integers} } */
++ svld1sh_gather_index_u32 (pg, s16_ptr, u64); /* { dg-error {passing 'svuint64_t' to argument 3 of 'svld1sh_gather_index_u32', which expects a vector of 32-bit integers} } */
++ svld1sh_gather_index_u32 (pg, s16_ptr, f64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svld1sh_gather_index_u32', which expects a vector of 32-bit integers} } */
++
++ svld1sh_gather_index_u32 (pg, 0, s32);
++ svld1sh_gather_index_u32 (pg, s, s32); /* { dg-error {'struct s' to argument 2 of 'svld1sh_gather_index_u32', which expects a vector or pointer base address} } */
++
++ svld1sh_gather_index_u32 (pg, pg, 0); /* { dg-error {passing 'svbool_t' to argument 2 of 'svld1sh_gather_index_u32', which expects 'svuint32_t'} } */
++ svld1sh_gather_index_u32 (pg, s32, 0); /* { dg-error {passing 'svint32_t' to argument 2 of 'svld1sh_gather_index_u32', which expects 'svuint32_t'} } */
++ svld1sh_gather_index_u32 (pg, u32, 0);
++ svld1sh_gather_index_u32 (pg, u64, 0); /* { dg-error {passing 'svuint64_t' to argument 2 of 'svld1sh_gather_index_u32', which expects 'svuint32_t'} } */
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_1.c
+new file mode 100644
+index 000000000..34f989bf8
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_1.c
+@@ -0,0 +1,23 @@
++/* { dg-do compile } */
++/* { dg-options "-std=c99" } */
++
++#include <arm_sve.h>
++
++struct s { signed char x; };
++
++svuint8_t
++f1 (svbool_t pg, signed char *s8_ptr, void *void_ptr, struct s *s_ptr,
++ float *f32_ptr, _Complex float *cf32_ptr, int **ptr_ptr)
++{
++ svld1 (pg); /* { dg-error {too few arguments to function 'svld1'} } */
++ svld1 (pg, s8_ptr, 0); /* { dg-error {too many arguments to function 'svld1'} } */
++ svld1 (0, s8_ptr); /* { dg-error {passing 'int' to argument 1 of 'svld1', which expects 'svbool_t'} } */
++ svld1 (pg, 0); /* { dg-error {passing 'int' to argument 2 of 'svld1', which expects a pointer type} } */
++ svld1 (pg, (int *) 0);
++ svld1 (pg, void_ptr); /* { dg-error {passing 'void \*' to argument 2 of 'svld1', but 'void' is not a valid SVE element type} } */
++ svld1 (pg, s_ptr); /* { dg-error {passing 'struct s \*' to argument 2 of 'svld1', but 'struct s' is not a valid SVE element type} } */
++ svld1 (pg, f32_ptr);
++ svld1 (pg, cf32_ptr); /* { dg-error {passing '_Complex float \*' to argument 2 of 'svld1', but 'complex float' is not a valid SVE element type} } */
++ svld1 (pg, ptr_ptr); /* { dg-error {passing 'int \*\*' to argument 2 of 'svld1', but 'int \*' is not a valid SVE element type} } */
++ return svld1 (pg, s8_ptr); /* { dg-error {incompatible types when returning type 'svint8_t' but 'svuint8_t' was expected} } */
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_2.c
+new file mode 100644
+index 000000000..beb07f138
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_2.c
+@@ -0,0 +1,22 @@
++/* { dg-do compile } */
++/* { dg-options "-std=c99" } */
++
++#include <arm_sve.h>
++
++struct s { signed char x; };
++
++svuint8_t
++f1 (svbool_t pg, signed char *s8_ptr, void *void_ptr, struct s *s_ptr,
++ float *f32_ptr, _Complex float *cf32_ptr)
++{
++ svld1_s8 (pg); /* { dg-error {too few arguments to function 'svld1_s8'} } */
++ svld1_s8 (pg, s8_ptr, 0); /* { dg-error {too many arguments to function 'svld1_s8'} } */
++ svld1_s8 (0, 0); /* { dg-error {incompatible type for argument 1 of 'svld1_s8'} } */
++ svld1_s8 (pg, 0);
++ svld1_s32 (pg, (int *) 0);
++ svld1_s8 (pg, void_ptr);
++ svld1_s8 (pg, s_ptr); /* { dg-warning {passing argument 2 of 'svld1_s8' from incompatible pointer type} } */
++ svld1_f32 (pg, f32_ptr);
++ svld1_f32 (pg, cf32_ptr); /* { dg-warning {passing argument 2 of 'svld1_f32' from incompatible pointer type} } */
++ return svld1_s8 (pg, s8_ptr); /* { dg-error {incompatible types when returning type 'svint8_t' but 'svuint8_t' was expected} } */
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_3.c
+new file mode 100644
+index 000000000..770203f64
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_3.c
+@@ -0,0 +1,18 @@
++/* { dg-do compile } */
++/* { dg-options "-std=c99" } */
++
++#include <arm_sve.h>
++
++struct s { signed char x; };
++
++svuint8_t
++f1 (svbool_t pg, signed char *s8_ptr, svint8_t s8)
++{
++ svld1_vnum (pg); /* { dg-error {too few arguments to function 'svld1_vnum'} } */
++ svld1_vnum (pg, s8_ptr); /* { dg-error {too few arguments to function 'svld1_vnum'} } */
++ svld1_vnum (pg, s8_ptr, 0, 0); /* { dg-error {too many arguments to function 'svld1_vnum'} } */
++ svld1_vnum (0, s8_ptr, 0); /* { dg-error {passing 'int' to argument 1 of 'svld1_vnum', which expects 'svbool_t'} } */
++ svld1_vnum (pg, 0, 0); /* { dg-error {passing 'int' to argument 2 of 'svld1_vnum', which expects a pointer type} } */
++ svld1_vnum (pg, s8_ptr, s8_ptr); /* { dg-warning "passing argument 3 of 'svld1_vnum_s8' makes integer from pointer without a cast" } */
++ svld1_vnum (pg, s8_ptr, s8); /* { dg-error {passing 'svint8_t' to argument 3 of 'svld1_vnum', which expects 'int64_t'} } */
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_ext_gather_index_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_ext_gather_index_1.c
+new file mode 100644
+index 000000000..91f37f6a5
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_ext_gather_index_1.c
+@@ -0,0 +1,35 @@
++/* { dg-do compile } */
++/* { dg-options "-std=c99 -Wpointer-sign" } */
++
++#include <arm_sve.h>
++
++struct s { int i; };
++
++void
++f1 (svbool_t pg, short *s16_ptr, unsigned short *u16_ptr,
++ svint8_t s8, svint16_t s16,
++ svint32_t s32, svuint32_t u32, svfloat32_t f32,
++ svint64_t s64, svuint64_t u64, svfloat64_t f64, struct s s)
++{
++ svld1sh_gather_index (pg, s16_ptr, s32); /* { dg-warning {implicit declaration of function 'svld1sh_gather_index'; did you mean 'svld1_gather_index'} } */
++ svld1sh_gather_index_u32 (pg, s16_ptr); /* { dg-error {too few arguments to function 'svld1sh_gather_index_u32'} } */
++ svld1sh_gather_index_u32 (pg, s16_ptr, s32, 0); /* { dg-error {too many arguments to function 'svld1sh_gather_index_u32'} } */
++ svld1sh_gather_index_u32 (pg, u16_ptr, s32); /* { dg-warning {pointer targets in passing argument 2 of 'svld1sh_gather_s32index_u32' differ in signedness} } */
++ svld1sh_gather_index_u32 (pg, s16_ptr, pg); /* { dg-error {passing 'svbool_t' to argument 3 of 'svld1sh_gather_index_u32', which expects a vector of 32-bit integers} } */
++ svld1sh_gather_index_u32 (pg, s16_ptr, s8); /* { dg-error {passing 'svint8_t' to argument 3 of 'svld1sh_gather_index_u32', which expects a vector of 32-bit integers} } */
++ svld1sh_gather_index_u32 (pg, s16_ptr, s16); /* { dg-error {passing 'svint16_t' to argument 3 of 'svld1sh_gather_index_u32', which expects a vector of 32-bit integers} } */
++ svld1sh_gather_index_u32 (pg, s16_ptr, s32);
++ svld1sh_gather_index_u32 (pg, s16_ptr, u32);
++ svld1sh_gather_index_u32 (pg, s16_ptr, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svld1sh_gather_index_u32', which expects a vector of 32-bit integers} } */
++ svld1sh_gather_index_u32 (pg, s16_ptr, s64); /* { dg-error {passing 'svint64_t' to argument 3 of 'svld1sh_gather_index_u32', which expects a vector of 32-bit integers} } */
++ svld1sh_gather_index_u32 (pg, s16_ptr, u64); /* { dg-error {passing 'svuint64_t' to argument 3 of 'svld1sh_gather_index_u32', which expects a vector of 32-bit integers} } */
++ svld1sh_gather_index_u32 (pg, s16_ptr, f64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svld1sh_gather_index_u32', which expects a vector of 32-bit integers} } */
++
++ svld1sh_gather_index_u32 (pg, 0, s32);
++ svld1sh_gather_index_u32 (pg, s, s32); /* { dg-error {'struct s' to argument 2 of 'svld1sh_gather_index_u32', which expects a vector or pointer base address} } */
svld1sh_gather_index_u32 (pg, s, s32); /* { dg-error {'struct s' to argument 2 of 'svld1sh_gather_index_u32', which expects a vector or pointer base address} } */ ++ ++ svld1sh_gather_index_u32 (pg, pg, 0); /* { dg-error {passing 'svbool_t' to argument 2 of 'svld1sh_gather_index_u32', which expects 'svuint32_t'} } */ ++ svld1sh_gather_index_u32 (pg, s32, 0); /* { dg-error {passing 'svint32_t' to argument 2 of 'svld1sh_gather_index_u32', which expects 'svuint32_t'} } */ ++ svld1sh_gather_index_u32 (pg, u32, 0); ++ svld1sh_gather_index_u32 (pg, u64, 0); /* { dg-error {passing 'svuint64_t' to argument 2 of 'svld1sh_gather_index_u32', which expects 'svuint32_t'} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_ext_gather_offset_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_ext_gather_offset_1.c +new file mode 100644 +index 000000000..dae4d0ce1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_ext_gather_offset_1.c +@@ -0,0 +1,35 @@ ++/* { dg-do compile } */ ++/* { dg-options "-std=c99" } */ ++ ++#include ++ ++struct s { int i; }; ++ ++void ++f1 (svbool_t pg, signed char *s8_ptr, short *s16_ptr, ++ svint8_t s8, svint16_t s16, ++ svint32_t s32, svuint32_t u32, svfloat32_t f32, ++ svint64_t s64, svuint64_t u64, svfloat64_t f64, struct s s) ++{ ++ svld1sb_gather_offset (pg, s8_ptr, s32); /* { dg-warning {implicit declaration of function 'svld1sb_gather_offset'; did you mean 'svld1_gather_offset'} } */ ++ svld1sb_gather_offset_s32 (pg, s8_ptr); /* { dg-error {too few arguments to function 'svld1sb_gather_offset_s32'} } */ ++ svld1sb_gather_offset_s32 (pg, s8_ptr, s32, 0); /* { dg-error {too many arguments to function 'svld1sb_gather_offset_s32'} } */ ++ svld1sb_gather_offset_s32 (pg, s16_ptr, s32); /* { dg-warning {passing argument 2 of 'svld1sb_gather_s32offset_s32' from incompatible pointer type} } */ ++ svld1sb_gather_offset_s32 (pg, s8_ptr, pg); /* { dg-error {passing 'svbool_t' to argument 3 of 'svld1sb_gather_offset_s32', which expects a vector of 32-bit integers} } */ ++ svld1sb_gather_offset_s32 (pg, s8_ptr, s8); /* { dg-error {passing 'svint8_t' to argument 3 of 'svld1sb_gather_offset_s32', which expects a vector of 32-bit integers} } */ ++ svld1sb_gather_offset_s32 (pg, s8_ptr, s16); /* { dg-error {passing 'svint16_t' to argument 3 of 'svld1sb_gather_offset_s32', which expects a vector of 32-bit integers} } */ ++ svld1sb_gather_offset_s32 (pg, s8_ptr, s32); ++ svld1sb_gather_offset_s32 (pg, s8_ptr, u32); ++ svld1sb_gather_offset_s32 (pg, s8_ptr, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svld1sb_gather_offset_s32', which expects a vector of 32-bit integers} } */ ++ svld1sb_gather_offset_s32 (pg, s8_ptr, s64); /* { dg-error {passing 'svint64_t' to argument 3 of 'svld1sb_gather_offset_s32', which expects a vector of 32-bit integers} } */ ++ svld1sb_gather_offset_s32 (pg, s8_ptr, u64); /* { dg-error {passing 'svuint64_t' to argument 3 of 'svld1sb_gather_offset_s32', which expects a vector of 32-bit integers} } */ ++ svld1sb_gather_offset_s32 (pg, s8_ptr, f64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svld1sb_gather_offset_s32', which expects a vector of 32-bit integers} } */ ++ ++ svld1sb_gather_offset_s32 (pg, 0, s32); ++ svld1sb_gather_offset_s32 (pg, s, s32); /* { dg-error {'struct s' to argument 2 of 'svld1sb_gather_offset_s32', which expects a vector or pointer base address} } */ ++ ++ svld1sb_gather_offset_s32 (pg, pg, 0); /* { dg-error {passing 'svbool_t' to argument 2 of 
'svld1sb_gather_offset_s32', which expects 'svuint32_t'} } */ ++ svld1sb_gather_offset_s32 (pg, s32, 0); /* { dg-error {passing 'svint32_t' to argument 2 of 'svld1sb_gather_offset_s32', which expects 'svuint32_t'} } */ ++ svld1sb_gather_offset_s32 (pg, u32, 0); ++ svld1sb_gather_offset_s32 (pg, u64, 0); /* { dg-error {passing 'svuint64_t' to argument 2 of 'svld1sb_gather_offset_s32', which expects 'svuint32_t'} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_ext_gather_offset_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_ext_gather_offset_2.c +new file mode 100644 +index 000000000..1bc66977c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_ext_gather_offset_2.c +@@ -0,0 +1,35 @@ ++/* { dg-do compile } */ ++/* { dg-options "-std=c99" } */ ++ ++#include ++ ++struct s { int i; }; ++ ++void ++f1 (svbool_t pg, signed char *s8_ptr, short *s16_ptr, ++ svint8_t s8, svint16_t s16, ++ svint32_t s32, svuint32_t u32, svfloat32_t f32, ++ svint64_t s64, svuint64_t u64, svfloat64_t f64, struct s s) ++{ ++ svld1sb_gather_offset (pg, s8_ptr, s32); /* { dg-warning {implicit declaration of function 'svld1sb_gather_offset'; did you mean 'svld1_gather_offset'} } */ ++ svld1sb_gather_offset_u32 (pg, s8_ptr); /* { dg-error {too few arguments to function 'svld1sb_gather_offset_u32'} } */ ++ svld1sb_gather_offset_u32 (pg, s8_ptr, s32, 0); /* { dg-error {too many arguments to function 'svld1sb_gather_offset_u32'} } */ ++ svld1sb_gather_offset_u32 (pg, s16_ptr, s32); /* { dg-warning {passing argument 2 of 'svld1sb_gather_s32offset_u32' from incompatible pointer type} } */ ++ svld1sb_gather_offset_u32 (pg, s8_ptr, pg); /* { dg-error {passing 'svbool_t' to argument 3 of 'svld1sb_gather_offset_u32', which expects a vector of 32-bit integers} } */ ++ svld1sb_gather_offset_u32 (pg, s8_ptr, s8); /* { dg-error {passing 'svint8_t' to argument 3 of 'svld1sb_gather_offset_u32', which expects a vector of 32-bit integers} } */ ++ svld1sb_gather_offset_u32 (pg, s8_ptr, s16); /* { dg-error {passing 'svint16_t' to argument 3 of 'svld1sb_gather_offset_u32', which expects a vector of 32-bit integers} } */ ++ svld1sb_gather_offset_u32 (pg, s8_ptr, s32); ++ svld1sb_gather_offset_u32 (pg, s8_ptr, u32); ++ svld1sb_gather_offset_u32 (pg, s8_ptr, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svld1sb_gather_offset_u32', which expects a vector of 32-bit integers} } */ ++ svld1sb_gather_offset_u32 (pg, s8_ptr, s64); /* { dg-error {passing 'svint64_t' to argument 3 of 'svld1sb_gather_offset_u32', which expects a vector of 32-bit integers} } */ ++ svld1sb_gather_offset_u32 (pg, s8_ptr, u64); /* { dg-error {passing 'svuint64_t' to argument 3 of 'svld1sb_gather_offset_u32', which expects a vector of 32-bit integers} } */ ++ svld1sb_gather_offset_u32 (pg, s8_ptr, f64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svld1sb_gather_offset_u32', which expects a vector of 32-bit integers} } */ ++ ++ svld1sb_gather_offset_u32 (pg, 0, s32); ++ svld1sb_gather_offset_u32 (pg, s, s32); /* { dg-error {'struct s' to argument 2 of 'svld1sb_gather_offset_u32', which expects a vector or pointer base address} } */ ++ ++ svld1sb_gather_offset_u32 (pg, pg, 0); /* { dg-error {passing 'svbool_t' to argument 2 of 'svld1sb_gather_offset_u32', which expects 'svuint32_t'} } */ ++ svld1sb_gather_offset_u32 (pg, s32, 0); /* { dg-error {passing 'svint32_t' to argument 2 of 'svld1sb_gather_offset_u32', which expects 'svuint32_t'} } */ ++ svld1sb_gather_offset_u32 (pg, u32, 
0); ++ svld1sb_gather_offset_u32 (pg, u64, 0); /* { dg-error {passing 'svuint64_t' to argument 2 of 'svld1sb_gather_offset_u32', which expects 'svuint32_t'} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_ext_gather_offset_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_ext_gather_offset_3.c +new file mode 100644 +index 000000000..6522889db +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_ext_gather_offset_3.c +@@ -0,0 +1,35 @@ ++/* { dg-do compile } */ ++/* { dg-options "-std=c99" } */ ++ ++#include ++ ++struct s { int i; }; ++ ++void ++f1 (svbool_t pg, signed char *s8_ptr, short *s16_ptr, ++ svint8_t s8, svint16_t s16, ++ svint32_t s32, svuint32_t u32, svfloat32_t f32, ++ svint64_t s64, svuint64_t u64, svfloat64_t f64, struct s s) ++{ ++ svld1sb_gather_offset (pg, s8_ptr, s64); /* { dg-warning {implicit declaration of function 'svld1sb_gather_offset'; did you mean 'svld1_gather_offset'} } */ ++ svld1sb_gather_offset_s64 (pg, s8_ptr); /* { dg-error {too few arguments to function 'svld1sb_gather_offset_s64'} } */ ++ svld1sb_gather_offset_s64 (pg, s8_ptr, s64, 0); /* { dg-error {too many arguments to function 'svld1sb_gather_offset_s64'} } */ ++ svld1sb_gather_offset_s64 (pg, s16_ptr, s64); /* { dg-warning {passing argument 2 of 'svld1sb_gather_s64offset_s64' from incompatible pointer type} } */ ++ svld1sb_gather_offset_s64 (pg, s8_ptr, pg); /* { dg-error {passing 'svbool_t' to argument 3 of 'svld1sb_gather_offset_s64', which expects a vector of 64-bit integers} } */ ++ svld1sb_gather_offset_s64 (pg, s8_ptr, s8); /* { dg-error {passing 'svint8_t' to argument 3 of 'svld1sb_gather_offset_s64', which expects a vector of 64-bit integers} } */ ++ svld1sb_gather_offset_s64 (pg, s8_ptr, s16); /* { dg-error {passing 'svint16_t' to argument 3 of 'svld1sb_gather_offset_s64', which expects a vector of 64-bit integers} } */ ++ svld1sb_gather_offset_s64 (pg, s8_ptr, s32); /* { dg-error {passing 'svint32_t' to argument 3 of 'svld1sb_gather_offset_s64', which expects a vector of 64-bit integers} } */ ++ svld1sb_gather_offset_s64 (pg, s8_ptr, u32); /* { dg-error {passing 'svuint32_t' to argument 3 of 'svld1sb_gather_offset_s64', which expects a vector of 64-bit integers} } */ ++ svld1sb_gather_offset_s64 (pg, s8_ptr, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svld1sb_gather_offset_s64', which expects a vector of 64-bit integers} } */ ++ svld1sb_gather_offset_s64 (pg, s8_ptr, s64); ++ svld1sb_gather_offset_s64 (pg, s8_ptr, u64); ++ svld1sb_gather_offset_s64 (pg, s8_ptr, f64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svld1sb_gather_offset_s64', which expects a vector of 64-bit integers} } */ ++ ++ svld1sb_gather_offset_s64 (pg, 0, s64); ++ svld1sb_gather_offset_s64 (pg, s, s64); /* { dg-error {'struct s' to argument 2 of 'svld1sb_gather_offset_s64', which expects a vector or pointer base address} } */ ++ ++ svld1sb_gather_offset_s64 (pg, pg, 0); /* { dg-error {passing 'svbool_t' to argument 2 of 'svld1sb_gather_offset_s64', which expects 'svuint64_t'} } */ ++ svld1sb_gather_offset_s64 (pg, s32, 0); /* { dg-error {passing 'svint32_t' to argument 2 of 'svld1sb_gather_offset_s64', which expects 'svuint64_t'} } */ ++ svld1sb_gather_offset_s64 (pg, u32, 0); /* { dg-error {passing 'svuint32_t' to argument 2 of 'svld1sb_gather_offset_s64', which expects 'svuint64_t'} } */ ++ svld1sb_gather_offset_s64 (pg, u64, 0); ++} +diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_ext_gather_offset_4.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_ext_gather_offset_4.c +new file mode 100644 +index 000000000..025621989 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_ext_gather_offset_4.c +@@ -0,0 +1,35 @@ ++/* { dg-do compile } */ ++/* { dg-options "-std=c99" } */ ++ ++#include ++ ++struct s { int i; }; ++ ++void ++f1 (svbool_t pg, signed char *s8_ptr, short *s16_ptr, ++ svint8_t s8, svint16_t s16, ++ svint32_t s32, svuint32_t u32, svfloat32_t f32, ++ svint64_t s64, svuint64_t u64, svfloat64_t f64, struct s s) ++{ ++ svld1sb_gather_offset (pg, s8_ptr, s64); /* { dg-warning {implicit declaration of function 'svld1sb_gather_offset'; did you mean 'svld1_gather_offset'} } */ ++ svld1sb_gather_offset_u64 (pg, s8_ptr); /* { dg-error {too few arguments to function 'svld1sb_gather_offset_u64'} } */ ++ svld1sb_gather_offset_u64 (pg, s8_ptr, s64, 0); /* { dg-error {too many arguments to function 'svld1sb_gather_offset_u64'} } */ ++ svld1sb_gather_offset_u64 (pg, s16_ptr, s64); /* { dg-warning {passing argument 2 of 'svld1sb_gather_s64offset_u64' from incompatible pointer type} } */ ++ svld1sb_gather_offset_u64 (pg, s8_ptr, pg); /* { dg-error {passing 'svbool_t' to argument 3 of 'svld1sb_gather_offset_u64', which expects a vector of 64-bit integers} } */ ++ svld1sb_gather_offset_u64 (pg, s8_ptr, s8); /* { dg-error {passing 'svint8_t' to argument 3 of 'svld1sb_gather_offset_u64', which expects a vector of 64-bit integers} } */ ++ svld1sb_gather_offset_u64 (pg, s8_ptr, s16); /* { dg-error {passing 'svint16_t' to argument 3 of 'svld1sb_gather_offset_u64', which expects a vector of 64-bit integers} } */ ++ svld1sb_gather_offset_u64 (pg, s8_ptr, s32); /* { dg-error {passing 'svint32_t' to argument 3 of 'svld1sb_gather_offset_u64', which expects a vector of 64-bit integers} } */ ++ svld1sb_gather_offset_u64 (pg, s8_ptr, u32); /* { dg-error {passing 'svuint32_t' to argument 3 of 'svld1sb_gather_offset_u64', which expects a vector of 64-bit integers} } */ ++ svld1sb_gather_offset_u64 (pg, s8_ptr, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svld1sb_gather_offset_u64', which expects a vector of 64-bit integers} } */ ++ svld1sb_gather_offset_u64 (pg, s8_ptr, s64); ++ svld1sb_gather_offset_u64 (pg, s8_ptr, u64); ++ svld1sb_gather_offset_u64 (pg, s8_ptr, f64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svld1sb_gather_offset_u64', which expects a vector of 64-bit integers} } */ ++ ++ svld1sb_gather_offset_u64 (pg, 0, s64); ++ svld1sb_gather_offset_u64 (pg, s, s64); /* { dg-error {'struct s' to argument 2 of 'svld1sb_gather_offset_u64', which expects a vector or pointer base address} } */ ++ ++ svld1sb_gather_offset_u64 (pg, pg, 0); /* { dg-error {passing 'svbool_t' to argument 2 of 'svld1sb_gather_offset_u64', which expects 'svuint64_t'} } */ ++ svld1sb_gather_offset_u64 (pg, s32, 0); /* { dg-error {passing 'svint32_t' to argument 2 of 'svld1sb_gather_offset_u64', which expects 'svuint64_t'} } */ ++ svld1sb_gather_offset_u64 (pg, u32, 0); /* { dg-error {passing 'svuint32_t' to argument 2 of 'svld1sb_gather_offset_u64', which expects 'svuint64_t'} } */ ++ svld1sb_gather_offset_u64 (pg, u64, 0); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_ext_gather_offset_5.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_ext_gather_offset_5.c +new file mode 100644 +index 000000000..8d57aa020 +--- /dev/null ++++ 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_ext_gather_offset_5.c +@@ -0,0 +1,35 @@ ++/* { dg-do compile } */ ++/* { dg-options "-std=c99" } */ ++ ++#include ++ ++struct s { int i; }; ++ ++void ++f1 (svbool_t pg, unsigned char *s8_ptr, unsigned short *s16_ptr, ++ svint8_t s8, svint16_t s16, ++ svint32_t s32, svuint32_t u32, svfloat32_t f32, ++ svint64_t s64, svuint64_t u64, svfloat64_t f64, struct s s) ++{ ++ svld1ub_gather_offset (pg, s8_ptr, s32); /* { dg-warning {implicit declaration of function 'svld1ub_gather_offset'; did you mean 'svld1_gather_offset'} } */ ++ svld1ub_gather_offset_s32 (pg, s8_ptr); /* { dg-error {too few arguments to function 'svld1ub_gather_offset_s32'} } */ ++ svld1ub_gather_offset_s32 (pg, s8_ptr, s32, 0); /* { dg-error {too many arguments to function 'svld1ub_gather_offset_s32'} } */ ++ svld1ub_gather_offset_s32 (pg, s16_ptr, s32); /* { dg-warning {passing argument 2 of 'svld1ub_gather_s32offset_s32' from incompatible pointer type} } */ ++ svld1ub_gather_offset_s32 (pg, s8_ptr, pg); /* { dg-error {passing 'svbool_t' to argument 3 of 'svld1ub_gather_offset_s32', which expects a vector of 32-bit integers} } */ ++ svld1ub_gather_offset_s32 (pg, s8_ptr, s8); /* { dg-error {passing 'svint8_t' to argument 3 of 'svld1ub_gather_offset_s32', which expects a vector of 32-bit integers} } */ ++ svld1ub_gather_offset_s32 (pg, s8_ptr, s16); /* { dg-error {passing 'svint16_t' to argument 3 of 'svld1ub_gather_offset_s32', which expects a vector of 32-bit integers} } */ ++ svld1ub_gather_offset_s32 (pg, s8_ptr, s32); ++ svld1ub_gather_offset_s32 (pg, s8_ptr, u32); ++ svld1ub_gather_offset_s32 (pg, s8_ptr, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svld1ub_gather_offset_s32', which expects a vector of 32-bit integers} } */ ++ svld1ub_gather_offset_s32 (pg, s8_ptr, s64); /* { dg-error {passing 'svint64_t' to argument 3 of 'svld1ub_gather_offset_s32', which expects a vector of 32-bit integers} } */ ++ svld1ub_gather_offset_s32 (pg, s8_ptr, u64); /* { dg-error {passing 'svuint64_t' to argument 3 of 'svld1ub_gather_offset_s32', which expects a vector of 32-bit integers} } */ ++ svld1ub_gather_offset_s32 (pg, s8_ptr, f64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svld1ub_gather_offset_s32', which expects a vector of 32-bit integers} } */ ++ ++ svld1ub_gather_offset_s32 (pg, 0, s32); ++ svld1ub_gather_offset_s32 (pg, s, s32); /* { dg-error {'struct s' to argument 2 of 'svld1ub_gather_offset_s32', which expects a vector or pointer base address} } */ ++ ++ svld1ub_gather_offset_s32 (pg, pg, 0); /* { dg-error {passing 'svbool_t' to argument 2 of 'svld1ub_gather_offset_s32', which expects 'svuint32_t'} } */ ++ svld1ub_gather_offset_s32 (pg, s32, 0); /* { dg-error {passing 'svint32_t' to argument 2 of 'svld1ub_gather_offset_s32', which expects 'svuint32_t'} } */ ++ svld1ub_gather_offset_s32 (pg, u32, 0); ++ svld1ub_gather_offset_s32 (pg, u64, 0); /* { dg-error {passing 'svuint64_t' to argument 2 of 'svld1ub_gather_offset_s32', which expects 'svuint32_t'} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_gather_sv_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_gather_sv_1.c +new file mode 100644 +index 000000000..21566a9d9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_gather_sv_1.c +@@ -0,0 +1,80 @@ ++/* { dg-do compile } */ ++/* { dg-options "-std=c99" } */ ++ ++#include ++ ++struct s { signed char x; }; ++ ++svuint32_t ++f1 (svbool_t pg, signed char *s8_ptr, short 
*s16_ptr, ++ int32_t *s32_ptr, uint32_t *u32_ptr, float *f32_ptr, ++ int64_t *s64_ptr, uint64_t *u64_ptr, double *f64_ptr, ++ void *void_ptr, struct s *s_ptr, _Complex float *cf32_ptr, int **ptr_ptr, ++ svint8_t s8, svint16_t s16, ++ svint32_t s32, svuint32_t u32, svfloat32_t f32, ++ svint64_t s64, svuint64_t u64, svfloat64_t f64) ++{ ++ svld1_gather_offset (pg, s32_ptr); /* { dg-error {too few arguments to function 'svld1_gather_offset'} } */ ++ svld1_gather_offset (pg, s32_ptr, s32, 0); /* { dg-error {too many arguments to function 'svld1_gather_offset'} } */ ++ svld1_gather_offset (0, s32_ptr, s32); /* { dg-error {passing 'int' to argument 1 of 'svld1_gather_offset', which expects 'svbool_t'} } */ ++ svld1_gather_offset (pg, 0, s32); /* { dg-error {passing 'int' to argument 2 of 'svld1_gather_offset', which expects a pointer type} } */ ++ svld1_gather_offset (pg, (int *) 0, s32); ++ svld1_gather_offset (pg, void_ptr, s32); /* { dg-error {passing 'void \*' to argument 2 of 'svld1_gather_offset', but 'void' is not a valid SVE element type} } */ ++ svld1_gather_offset (pg, s_ptr, s32); /* { dg-error {passing 'struct s \*' to argument 2 of 'svld1_gather_offset', but 'struct s' is not a valid SVE element type} } */ ++ svld1_gather_offset (pg, f32_ptr, s32); ++ svld1_gather_offset (pg, cf32_ptr, s32); /* { dg-error {passing '_Complex float \*' to argument 2 of 'svld1_gather_offset', but 'complex float' is not a valid SVE element type} } */ ++ svld1_gather_offset (pg, ptr_ptr, u64); /* { dg-error {passing 'int \*\*' to argument 2 of 'svld1_gather_offset', but 'int \*' is not a valid SVE element type} } */ ++ svld1_gather_offset (pg, u32, 0); /* { dg-error {passing 'svuint32_t' to argument 2 of 'svld1_gather_offset', which expects a pointer type} } */ ++ /* { dg-message {an explicit type suffix is needed when using a vector of base addresses} "" { target *-*-* } .-1 } */ ++ svld1_gather_offset (pg, u64, 0); /* { dg-error {passing 'svuint64_t' to argument 2 of 'svld1_gather_offset', which expects a pointer type} } */ ++ /* { dg-message {an explicit type suffix is needed when using a vector of base addresses} "" { target *-*-* } .-1 } */ ++ ++ svld1_gather_offset (pg, s8_ptr, s8); /* { dg-error {passing 'signed char \*' to argument 2 of 'svld1_gather_offset', which expects a pointer to 32-bit or 64-bit elements} } */ ++ svld1_gather_offset (pg, s8_ptr, s32); /* { dg-error {passing 'signed char \*' to argument 2 of 'svld1_gather_offset', which expects a pointer to 32-bit or 64-bit elements} } */ ++ svld1_gather_offset (pg, s16_ptr, s16); /* { dg-error {passing 'short( int)? \*' to argument 2 of 'svld1_gather_offset', which expects a pointer to 32-bit or 64-bit elements} } */ ++ svld1_gather_offset (pg, s16_ptr, s32); /* { dg-error {passing 'short( int)? 
\*' to argument 2 of 'svld1_gather_offset', which expects a pointer to 32-bit or 64-bit elements} } */ ++ ++ svld1_gather_offset (pg, s32_ptr, s32); ++ svld1_gather_offset (pg, s32_ptr, u32); ++ svld1_gather_offset (pg, s32_ptr, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svld1_gather_offset', which when loading 'svint32_t' expects a vector of 32-bit integers} } */ ++ svld1_gather_offset (pg, s32_ptr, s64); /* { dg-error {passing 'svint64_t' to argument 3 of 'svld1_gather_offset', which when loading 'svint32_t' expects a vector of 32-bit integers} } */ ++ svld1_gather_offset (pg, s32_ptr, u64); /* { dg-error {passing 'svuint64_t' to argument 3 of 'svld1_gather_offset', which when loading 'svint32_t' expects a vector of 32-bit integers} } */ ++ svld1_gather_offset (pg, s32_ptr, f64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svld1_gather_offset', which when loading 'svint32_t' expects a vector of 32-bit integers} } */ ++ ++ svld1_gather_offset (pg, u32_ptr, s32); ++ svld1_gather_offset (pg, u32_ptr, u32); ++ svld1_gather_offset (pg, u32_ptr, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svld1_gather_offset', which when loading 'svuint32_t' expects a vector of 32-bit integers} } */ ++ svld1_gather_offset (pg, u32_ptr, s64); /* { dg-error {passing 'svint64_t' to argument 3 of 'svld1_gather_offset', which when loading 'svuint32_t' expects a vector of 32-bit integers} } */ ++ svld1_gather_offset (pg, u32_ptr, u64); /* { dg-error {passing 'svuint64_t' to argument 3 of 'svld1_gather_offset', which when loading 'svuint32_t' expects a vector of 32-bit integers} } */ ++ svld1_gather_offset (pg, u32_ptr, f64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svld1_gather_offset', which when loading 'svuint32_t' expects a vector of 32-bit integers} } */ ++ ++ svld1_gather_offset (pg, f32_ptr, s32); ++ svld1_gather_offset (pg, f32_ptr, u32); ++ svld1_gather_offset (pg, f32_ptr, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svld1_gather_offset', which when loading 'svfloat32_t' expects a vector of 32-bit integers} } */ ++ svld1_gather_offset (pg, f32_ptr, s64); /* { dg-error {passing 'svint64_t' to argument 3 of 'svld1_gather_offset', which when loading 'svfloat32_t' expects a vector of 32-bit integers} } */ ++ svld1_gather_offset (pg, f32_ptr, u64); /* { dg-error {passing 'svuint64_t' to argument 3 of 'svld1_gather_offset', which when loading 'svfloat32_t' expects a vector of 32-bit integers} } */ ++ svld1_gather_offset (pg, f32_ptr, f64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svld1_gather_offset', which when loading 'svfloat32_t' expects a vector of 32-bit integers} } */ ++ ++ svld1_gather_offset (pg, s64_ptr, s32); /* { dg-error {passing 'svint32_t' to argument 3 of 'svld1_gather_offset', which when loading 'svint64_t' expects a vector of 64-bit integers} } */ ++ svld1_gather_offset (pg, s64_ptr, u32); /* { dg-error {passing 'svuint32_t' to argument 3 of 'svld1_gather_offset', which when loading 'svint64_t' expects a vector of 64-bit integers} } */ ++ svld1_gather_offset (pg, s64_ptr, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svld1_gather_offset', which when loading 'svint64_t' expects a vector of 64-bit integers} } */ ++ svld1_gather_offset (pg, s64_ptr, s64); ++ svld1_gather_offset (pg, s64_ptr, u64); ++ svld1_gather_offset (pg, s64_ptr, f64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svld1_gather_offset', which when loading 'svint64_t' expects a vector of 64-bit integers} } */ ++ ++ 
svld1_gather_offset (pg, u64_ptr, s32); /* { dg-error {passing 'svint32_t' to argument 3 of 'svld1_gather_offset', which when loading 'svuint64_t' expects a vector of 64-bit integers} } */ ++ svld1_gather_offset (pg, u64_ptr, u32); /* { dg-error {passing 'svuint32_t' to argument 3 of 'svld1_gather_offset', which when loading 'svuint64_t' expects a vector of 64-bit integers} } */ ++ svld1_gather_offset (pg, u64_ptr, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svld1_gather_offset', which when loading 'svuint64_t' expects a vector of 64-bit integers} } */ ++ svld1_gather_offset (pg, u64_ptr, s64); ++ svld1_gather_offset (pg, u64_ptr, u64); ++ svld1_gather_offset (pg, u64_ptr, f64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svld1_gather_offset', which when loading 'svuint64_t' expects a vector of 64-bit integers} } */ ++ ++ svld1_gather_offset (pg, f64_ptr, s32); /* { dg-error {passing 'svint32_t' to argument 3 of 'svld1_gather_offset', which when loading 'svfloat64_t' expects a vector of 64-bit integers} } */ ++ svld1_gather_offset (pg, f64_ptr, u32); /* { dg-error {passing 'svuint32_t' to argument 3 of 'svld1_gather_offset', which when loading 'svfloat64_t' expects a vector of 64-bit integers} } */ ++ svld1_gather_offset (pg, f64_ptr, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svld1_gather_offset', which when loading 'svfloat64_t' expects a vector of 64-bit integers} } */ ++ svld1_gather_offset (pg, f64_ptr, s64); ++ svld1_gather_offset (pg, f64_ptr, u64); ++ svld1_gather_offset (pg, f64_ptr, f64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svld1_gather_offset', which when loading 'svfloat64_t' expects a vector of 64-bit integers} } */ ++ ++ return svld1_gather_offset (pg, s32_ptr, s32); /* { dg-error {incompatible types when returning type 'svint32_t' but 'svuint32_t' was expected} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_gather_sv_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_gather_sv_2.c +new file mode 100644 +index 000000000..4c15fc40c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_gather_sv_2.c +@@ -0,0 +1,80 @@ ++/* { dg-do compile } */ ++/* { dg-options "-std=c99" } */ ++ ++#include ++ ++struct s { signed char x; }; ++ ++svuint32_t ++f1 (svbool_t pg, signed char *s8_ptr, short *s16_ptr, ++ int32_t *s32_ptr, uint32_t *u32_ptr, float *f32_ptr, ++ int64_t *s64_ptr, uint64_t *u64_ptr, double *f64_ptr, ++ void *void_ptr, struct s *s_ptr, _Complex float *cf32_ptr, int **ptr_ptr, ++ svint8_t s8, svint16_t s16, ++ svint32_t s32, svuint32_t u32, svfloat32_t f32, ++ svint64_t s64, svuint64_t u64, svfloat64_t f64) ++{ ++ svld1_gather_index (pg, s32_ptr); /* { dg-error {too few arguments to function 'svld1_gather_index'} } */ ++ svld1_gather_index (pg, s32_ptr, s32, 0); /* { dg-error {too many arguments to function 'svld1_gather_index'} } */ ++ svld1_gather_index (0, s32_ptr, s32); /* { dg-error {passing 'int' to argument 1 of 'svld1_gather_index', which expects 'svbool_t'} } */ ++ svld1_gather_index (pg, 0, s32); /* { dg-error {passing 'int' to argument 2 of 'svld1_gather_index', which expects a pointer type} } */ ++ svld1_gather_index (pg, (int *) 0, s32); ++ svld1_gather_index (pg, void_ptr, s32); /* { dg-error {passing 'void \*' to argument 2 of 'svld1_gather_index', but 'void' is not a valid SVE element type} } */ ++ svld1_gather_index (pg, s_ptr, s32); /* { dg-error {passing 'struct s \*' to argument 2 of 'svld1_gather_index', but 'struct s' is 
not a valid SVE element type} } */ ++ svld1_gather_index (pg, f32_ptr, s32); ++ svld1_gather_index (pg, cf32_ptr, s32); /* { dg-error {passing '_Complex float \*' to argument 2 of 'svld1_gather_index', but 'complex float' is not a valid SVE element type} } */ ++ svld1_gather_index (pg, ptr_ptr, u64); /* { dg-error {passing 'int \*\*' to argument 2 of 'svld1_gather_index', but 'int \*' is not a valid SVE element type} } */ ++ svld1_gather_index (pg, u32, 0); /* { dg-error {passing 'svuint32_t' to argument 2 of 'svld1_gather_index', which expects a pointer type} } */ ++ /* { dg-message {an explicit type suffix is needed when using a vector of base addresses} "" { target *-*-* } .-1 } */ ++ svld1_gather_index (pg, u64, 0); /* { dg-error {passing 'svuint64_t' to argument 2 of 'svld1_gather_index', which expects a pointer type} } */ ++ /* { dg-message {an explicit type suffix is needed when using a vector of base addresses} "" { target *-*-* } .-1 } */ ++ ++ svld1_gather_index (pg, s8_ptr, s8); /* { dg-error {passing 'signed char \*' to argument 2 of 'svld1_gather_index', which expects a pointer to 32-bit or 64-bit elements} } */ ++ svld1_gather_index (pg, s8_ptr, s32); /* { dg-error {passing 'signed char \*' to argument 2 of 'svld1_gather_index', which expects a pointer to 32-bit or 64-bit elements} } */ ++ svld1_gather_index (pg, s16_ptr, s16); /* { dg-error {passing 'short( int)? \*' to argument 2 of 'svld1_gather_index', which expects a pointer to 32-bit or 64-bit elements} } */ ++ svld1_gather_index (pg, s16_ptr, s32); /* { dg-error {passing 'short( int)? \*' to argument 2 of 'svld1_gather_index', which expects a pointer to 32-bit or 64-bit elements} } */ ++ ++ svld1_gather_index (pg, s32_ptr, s32); ++ svld1_gather_index (pg, s32_ptr, u32); ++ svld1_gather_index (pg, s32_ptr, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svld1_gather_index', which when loading 'svint32_t' expects a vector of 32-bit integers} } */ ++ svld1_gather_index (pg, s32_ptr, s64); /* { dg-error {passing 'svint64_t' to argument 3 of 'svld1_gather_index', which when loading 'svint32_t' expects a vector of 32-bit integers} } */ ++ svld1_gather_index (pg, s32_ptr, u64); /* { dg-error {passing 'svuint64_t' to argument 3 of 'svld1_gather_index', which when loading 'svint32_t' expects a vector of 32-bit integers} } */ ++ svld1_gather_index (pg, s32_ptr, f64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svld1_gather_index', which when loading 'svint32_t' expects a vector of 32-bit integers} } */ ++ ++ svld1_gather_index (pg, u32_ptr, s32); ++ svld1_gather_index (pg, u32_ptr, u32); ++ svld1_gather_index (pg, u32_ptr, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svld1_gather_index', which when loading 'svuint32_t' expects a vector of 32-bit integers} } */ ++ svld1_gather_index (pg, u32_ptr, s64); /* { dg-error {passing 'svint64_t' to argument 3 of 'svld1_gather_index', which when loading 'svuint32_t' expects a vector of 32-bit integers} } */ ++ svld1_gather_index (pg, u32_ptr, u64); /* { dg-error {passing 'svuint64_t' to argument 3 of 'svld1_gather_index', which when loading 'svuint32_t' expects a vector of 32-bit integers} } */ ++ svld1_gather_index (pg, u32_ptr, f64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svld1_gather_index', which when loading 'svuint32_t' expects a vector of 32-bit integers} } */ ++ ++ svld1_gather_index (pg, f32_ptr, s32); ++ svld1_gather_index (pg, f32_ptr, u32); ++ svld1_gather_index (pg, f32_ptr, f32); /* { dg-error {passing 'svfloat32_t' to 
argument 3 of 'svld1_gather_index', which when loading 'svfloat32_t' expects a vector of 32-bit integers} } */ ++ svld1_gather_index (pg, f32_ptr, s64); /* { dg-error {passing 'svint64_t' to argument 3 of 'svld1_gather_index', which when loading 'svfloat32_t' expects a vector of 32-bit integers} } */ ++ svld1_gather_index (pg, f32_ptr, u64); /* { dg-error {passing 'svuint64_t' to argument 3 of 'svld1_gather_index', which when loading 'svfloat32_t' expects a vector of 32-bit integers} } */ ++ svld1_gather_index (pg, f32_ptr, f64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svld1_gather_index', which when loading 'svfloat32_t' expects a vector of 32-bit integers} } */ ++ ++ svld1_gather_index (pg, s64_ptr, s32); /* { dg-error {passing 'svint32_t' to argument 3 of 'svld1_gather_index', which when loading 'svint64_t' expects a vector of 64-bit integers} } */ ++ svld1_gather_index (pg, s64_ptr, u32); /* { dg-error {passing 'svuint32_t' to argument 3 of 'svld1_gather_index', which when loading 'svint64_t' expects a vector of 64-bit integers} } */ ++ svld1_gather_index (pg, s64_ptr, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svld1_gather_index', which when loading 'svint64_t' expects a vector of 64-bit integers} } */ ++ svld1_gather_index (pg, s64_ptr, s64); ++ svld1_gather_index (pg, s64_ptr, u64); ++ svld1_gather_index (pg, s64_ptr, f64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svld1_gather_index', which when loading 'svint64_t' expects a vector of 64-bit integers} } */ ++ ++ svld1_gather_index (pg, u64_ptr, s32); /* { dg-error {passing 'svint32_t' to argument 3 of 'svld1_gather_index', which when loading 'svuint64_t' expects a vector of 64-bit integers} } */ ++ svld1_gather_index (pg, u64_ptr, u32); /* { dg-error {passing 'svuint32_t' to argument 3 of 'svld1_gather_index', which when loading 'svuint64_t' expects a vector of 64-bit integers} } */ ++ svld1_gather_index (pg, u64_ptr, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svld1_gather_index', which when loading 'svuint64_t' expects a vector of 64-bit integers} } */ ++ svld1_gather_index (pg, u64_ptr, s64); ++ svld1_gather_index (pg, u64_ptr, u64); ++ svld1_gather_index (pg, u64_ptr, f64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svld1_gather_index', which when loading 'svuint64_t' expects a vector of 64-bit integers} } */ ++ ++ svld1_gather_index (pg, f64_ptr, s32); /* { dg-error {passing 'svint32_t' to argument 3 of 'svld1_gather_index', which when loading 'svfloat64_t' expects a vector of 64-bit integers} } */ ++ svld1_gather_index (pg, f64_ptr, u32); /* { dg-error {passing 'svuint32_t' to argument 3 of 'svld1_gather_index', which when loading 'svfloat64_t' expects a vector of 64-bit integers} } */ ++ svld1_gather_index (pg, f64_ptr, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svld1_gather_index', which when loading 'svfloat64_t' expects a vector of 64-bit integers} } */ ++ svld1_gather_index (pg, f64_ptr, s64); ++ svld1_gather_index (pg, f64_ptr, u64); ++ svld1_gather_index (pg, f64_ptr, f64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svld1_gather_index', which when loading 'svfloat64_t' expects a vector of 64-bit integers} } */ ++ ++ return svld1_gather_index (pg, s32_ptr, s32); /* { dg-error {incompatible types when returning type 'svint32_t' but 'svuint32_t' was expected} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_replicate_1.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_replicate_1.c +new file mode 100644 +index 000000000..d4ff76ea8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_replicate_1.c +@@ -0,0 +1,23 @@ ++/* { dg-do compile } */ ++/* { dg-options "-std=c99" } */ ++ ++#include ++ ++struct s { signed char x; }; ++ ++svuint8_t ++f1 (svbool_t pg, signed char *s8_ptr, void *void_ptr, struct s *s_ptr, ++ float *f32_ptr, _Complex float *cf32_ptr, int **ptr_ptr) ++{ ++ svld1rq (pg); /* { dg-error {too few arguments to function 'svld1rq'} } */ ++ svld1rq (pg, s8_ptr, 0); /* { dg-error {too many arguments to function 'svld1rq'} } */ ++ svld1rq (0, s8_ptr); /* { dg-error {passing 'int' to argument 1 of 'svld1rq', which expects 'svbool_t'} } */ ++ svld1rq (pg, 0); /* { dg-error {passing 'int' to argument 2 of 'svld1rq', which expects a pointer type} } */ ++ svld1rq (pg, (int *) 0); ++ svld1rq (pg, void_ptr); /* { dg-error {passing 'void \*' to argument 2 of 'svld1rq', but 'void' is not a valid SVE element type} } */ ++ svld1rq (pg, s_ptr); /* { dg-error {passing 'struct s \*' to argument 2 of 'svld1rq', but 'struct s' is not a valid SVE element type} } */ ++ svld1rq (pg, f32_ptr); ++ svld1rq (pg, cf32_ptr); /* { dg-error {passing '_Complex float \*' to argument 2 of 'svld1rq', but 'complex float' is not a valid SVE element type} } */ ++ svld1rq (pg, ptr_ptr); /* { dg-error {passing 'int \*\*' to argument 2 of 'svld1rq', but 'int \*' is not a valid SVE element type} } */ ++ return svld1rq (pg, s8_ptr); /* { dg-error {incompatible types when returning type 'svint8_t' but 'svuint8_t' was expected} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/mmla_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/mmla_1.c +new file mode 100644 +index 000000000..5b0b00e96 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/mmla_1.c +@@ -0,0 +1,58 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-march=armv8.2-a+sve+i8mm+f32mm+f64mm" } */ ++ ++#include ++ ++svuint32_t ++f1 (svint32_t s32, svuint8_t u8, svint8_t s8, svuint32_t u32) ++{ ++ svmmla_s32 (s32); /* { dg-error {too few arguments to function 'svmmla_s32'} } */ ++ svmmla_s32 (s32, s8, s8, u32); /* { dg-error {too many arguments to function 'svmmla_s32'} } */ ++ svmmla_s32 (s32, u32, s8); /* { dg-error {incompatible type for argument 2 of 'svmmla_s32'} } */ ++ svmmla_s32 (s32, u8, s8); /* { dg-error {incompatible type for argument 2 of 'svmmla_s32'} } */ ++ svmmla_s32 (s32, s8, u8); /* { dg-error {incompatible type for argument 3 of 'svmmla_s32'} } */ ++ svmmla_s32 (s32, s8, s32); /* { dg-error {incompatible type for argument 3 of 'svmmla_s32'} } */ ++ svmmla_s32 (s32, s8, 0); /* { dg-error {incompatible type for argument 3 of 'svmmla_s32'} } */ ++ svmmla_s32 (s32, s8, s8); ++ return svmmla_s32 (s32, s8, s8); /* { dg-error {incompatible types when returning type 'svint32_t' but 'svuint32_t' was expected} } */ ++} ++ ++void ++f2 (svbool_t pg, svint8_t s8, svuint8_t u8, svuint32_t u32, svint32_t s32, ++ svfloat16_t f16, svfloat32_t f32, svfloat64_t f64) ++{ ++ svmmla (s32, s8); /* { dg-error {too few arguments to function 'svmmla'} } */ ++ svmmla (s32, s8, s8, s8); /* { dg-error {too many arguments to function 'svmmla'} } */ ++ svmmla (0, s8, s8); /* { dg-error {passing 'int' to argument 1 of 'svmmla', which expects an SVE vector type} } */ ++ svmmla (pg, s8, s8); /* { dg-error {'svmmla' has no form that takes 'svbool_t' arguments} } */ ++ svmmla (u8, s8, s8); /* { 
dg-error {'svmmla' has no form that takes 'svuint8_t' arguments} } */ ++ ++ svmmla (s32, 0, s8); /* { dg-error {passing 'int' to argument 2 of 'svmmla', which expects an SVE vector type} } */ ++ svmmla (s32, u8, s8); /* { dg-error {arguments 1 and 2 of 'svmmla' must have the same signedness, but the values passed here have type 'svint32_t' and 'svuint8_t' respectively} } */ ++ svmmla (s32, s8, u8); /* { dg-error {arguments 1 and 3 of 'svmmla' must have the same signedness, but the values passed here have type 'svint32_t' and 'svuint8_t' respectively} } */ ++ svmmla (s32, s8, 0); /* { dg-error {passing 'int' to argument 3 of 'svmmla', which expects an SVE vector type} } */ ++ svmmla (s32, s8, s8); ++ svmmla (s32, s32, s32); /* { dg-error {passing 'svint32_t' instead of the expected 'svint8_t' to argument 2 of 'svmmla', after passing 'svint32_t' to argument 1} } */ ++ svmmla (s32, u32, u32); /* { dg-error {passing 'svuint32_t' instead of the expected 'svint8_t' to argument 2 of 'svmmla', after passing 'svint32_t' to argument 1} } */ ++ ++ svmmla (u32, 0, u8); /* { dg-error {passing 'int' to argument 2 of 'svmmla', which expects an SVE vector type} } */ ++ svmmla (u32, s8, u8); /* { dg-error {arguments 1 and 2 of 'svmmla' must have the same signedness, but the values passed here have type 'svuint32_t' and 'svint8_t' respectively} } */ ++ svmmla (u32, u8, s8); /* { dg-error {arguments 1 and 3 of 'svmmla' must have the same signedness, but the values passed here have type 'svuint32_t' and 'svint8_t' respectively} } */ ++ svmmla (u32, u8, 0); /* { dg-error {passing 'int' to argument 3 of 'svmmla', which expects an SVE vector type} } */ ++ svmmla (u32, u8, u8); ++ svmmla (u32, s32, s32); /* { dg-error {passing 'svint32_t' instead of the expected 'svuint8_t' to argument 2 of 'svmmla', after passing 'svuint32_t' to argument 1} } */ ++ svmmla (u32, u32, u32); /* { dg-error {passing 'svuint32_t' instead of the expected 'svuint8_t' to argument 2 of 'svmmla', after passing 'svuint32_t' to argument 1} } */ ++ ++ svmmla (f16, s8, s8); /* { dg-error {'svmmla' has no form that takes 'svfloat16_t' arguments} } */ ++ svmmla (f32, s8, s8); /* { dg-error {passing 'svint8_t' to argument 2 of 'svmmla', but previous arguments had type 'svfloat32_t'} } */ ++ svmmla (f32, s32, s32); /* { dg-error {passing 'svint32_t' to argument 2 of 'svmmla', but previous arguments had type 'svfloat32_t'} } */ ++ svmmla (f32, f16, f16); /* { dg-error {passing 'svfloat16_t' to argument 2 of 'svmmla', but previous arguments had type 'svfloat32_t'} } */ ++ svmmla (f64, f16, f16); /* { dg-error {passing 'svfloat16_t' to argument 2 of 'svmmla', but previous arguments had type 'svfloat64_t'} } */ ++ svmmla (f32, f32, f16); /* { dg-error {passing 'svfloat16_t' to argument 3 of 'svmmla', but previous arguments had type 'svfloat32_t'} } */ ++ svmmla (f64, f32, f16); /* { dg-error {passing 'svfloat32_t' to argument 2 of 'svmmla', but previous arguments had type 'svfloat64_t'} } */ ++ svmmla (f64, f64, f16); /* { dg-error {passing 'svfloat16_t' to argument 3 of 'svmmla', but previous arguments had type 'svfloat64_t'} } */ ++ ++ svmmla (f16, f16, f16); /* { dg-error {'svmmla' has no form that takes 'svfloat16_t' arguments} } */ ++ svmmla (f32, f32, f32); ++ svmmla (f64, f64, f64); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/mmla_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/mmla_2.c +new file mode 100644 +index 000000000..b54725736 +--- /dev/null ++++ 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/mmla_2.c
+@@ -0,0 +1,10 @@
++/* { dg-do compile } */
++/* { dg-additional-options "-march=armv8.2-a+sve" } */
++
++#include <arm_sve.h>
++
++void
++f1 (svint32_t s32, svint8_t s8)
++{
++ svmmla_s32 (s32, s8, s8); /* { dg-error {ACLE function 'svmmla_s32' requires ISA extension 'i8mm'} } */
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/mmla_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/mmla_3.c
+new file mode 100644
+index 000000000..d1c8297cc
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/mmla_3.c
+@@ -0,0 +1,10 @@
++/* { dg-do compile } */
++/* { dg-additional-options "-march=armv8.2-a+sve" } */
++
++#include <arm_sve.h>
++
++void
++f1 (svint32_t s32, svint8_t s8)
++{
++ svmmla (s32, s8, s8); /* { dg-error {ACLE function 'svmmla_s32' requires ISA extension 'i8mm'} } */
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/mmla_4.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/mmla_4.c
+new file mode 100644
+index 000000000..e6c3f5f94
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/mmla_4.c
+@@ -0,0 +1,10 @@
++/* { dg-do compile } */
++/* { dg-additional-options "-march=armv8.2-a+sve" } */
++
++#include <arm_sve.h>
++
++void
++f1 (svfloat32_t f32)
++{
++ svmmla_f32 (f32, f32, f32); /* { dg-error {ACLE function 'svmmla_f32' requires ISA extension 'f32mm'} } */
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/mmla_5.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/mmla_5.c
+new file mode 100644
+index 000000000..8f6f42366
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/mmla_5.c
+@@ -0,0 +1,10 @@
++/* { dg-do compile } */
++/* { dg-additional-options "-march=armv8.2-a+sve" } */
++
++#include <arm_sve.h>
++
++void
++f1 (svfloat32_t f32)
++{
++ svmmla (f32, f32, f32); /* { dg-error {ACLE function 'svmmla_f32' requires ISA extension 'f32mm'} } */
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/mmla_6.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/mmla_6.c
+new file mode 100644
+index 000000000..7ebeb4981
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/mmla_6.c
+@@ -0,0 +1,10 @@
++/* { dg-do compile } */
++/* { dg-additional-options "-march=armv8.2-a+sve" } */
++
++#include <arm_sve.h>
++
++void
++f1 (svfloat64_t f64)
++{
++ svmmla_f64 (f64, f64, f64); /* { dg-error {ACLE function 'svmmla_f64' requires ISA extension 'f64mm'} } */
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/mmla_7.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/mmla_7.c
+new file mode 100644
+index 000000000..e64ec1ea6
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/mmla_7.c
+@@ -0,0 +1,10 @@
++/* { dg-do compile } */
++/* { dg-additional-options "-march=armv8.2-a+sve" } */
++
++#include <arm_sve.h>
++
++void
++f1 (svfloat64_t f64)
++{
++ svmmla (f64, f64, f64); /* { dg-error {ACLE function 'svmmla_f64' requires ISA extension 'f64mm'} } */
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/pattern_pred_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/pattern_pred_1.c
+new file mode 100644
+index 000000000..99b61bdf1
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/pattern_pred_1.c
+@@ -0,0 +1,14 @@
++#include <arm_sve.h>
++
++void
++test ()
++{
++ svptrue_pat_b16 ((enum svpattern) -1); /* { dg-error {passing 4294967295 to argument 1 of 'svptrue_pat_b16', which expects a valid 'enum svpattern' value} } */
++ svptrue_pat_b16 ((enum
svpattern) 0); ++ svptrue_pat_b16 ((enum svpattern) 13); ++ svptrue_pat_b16 ((enum svpattern) 14); /* { dg-error {passing 14 to argument 1 of 'svptrue_pat_b16', which expects a valid 'enum svpattern' value} } */ ++ svptrue_pat_b16 ((enum svpattern) 28); /* { dg-error {passing 28 to argument 1 of 'svptrue_pat_b16', which expects a valid 'enum svpattern' value} } */ ++ svptrue_pat_b16 ((enum svpattern) 29); ++ svptrue_pat_b16 ((enum svpattern) 31); ++ svptrue_pat_b16 ((enum svpattern) 32); /* { dg-error {passing 32 to argument 1 of 'svptrue_pat_b16', which expects a valid 'enum svpattern' value} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_1.c +new file mode 100644 +index 000000000..316f77fc7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_1.c +@@ -0,0 +1,17 @@ ++/* { dg-do compile } */ ++/* { dg-options "-std=c99" } */ ++ ++#include ++ ++void ++f1 (svbool_t pg, int32_t *s32_ptr, enum svprfop op) ++{ ++ svprfb (pg, s32_ptr, op); /* { dg-error {argument 3 of 'svprfb' must be an integer constant expression} } */ ++ svprfb (pg, s32_ptr, (enum svprfop) -1); /* { dg-error {passing 4294967295 to argument 3 of 'svprfb', which expects a valid 'enum svprfop' value} } */ ++ svprfb (pg, s32_ptr, (enum svprfop) 0); ++ svprfb (pg, s32_ptr, (enum svprfop) 5); ++ svprfb (pg, s32_ptr, (enum svprfop) 6); /* { dg-error {passing 6 to argument 3 of 'svprfb', which expects a valid 'enum svprfop' value} } */ ++ svprfb (pg, s32_ptr, (enum svprfop) 7); /* { dg-error {passing 7 to argument 3 of 'svprfb', which expects a valid 'enum svprfop' value} } */ ++ svprfb (pg, s32_ptr, (enum svprfop) 8); ++ svprfb (pg, s32_ptr, (enum svprfop) 14); /* { dg-error {passing 14 to argument 3 of 'svprfb', which expects a valid 'enum svprfop' value} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_index_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_index_1.c +new file mode 100644 +index 000000000..c33c95440 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_index_1.c +@@ -0,0 +1,53 @@ ++/* { dg-do compile } */ ++/* { dg-options "-std=c99" } */ ++ ++#include ++ ++struct s { int i; }; ++ ++void ++f1 (svbool_t pg, int32_t *s32_ptr, void *void_ptr, void **ptr_ptr, ++ svint8_t s8, svuint8_t u8, svint16_t s16, svuint16_t u16, svfloat16_t f16, ++ svint32_t s32, svuint32_t u32, svfloat32_t f32, ++ svint64_t s64, svuint64_t u64, svfloat64_t f64, enum svprfop op, ++ struct s s) ++{ ++ svprfh_gather_index (pg, s32_ptr, s32); /* { dg-error {too few arguments to function 'svprfh_gather_index'} } */ ++ svprfh_gather_index (pg, s32_ptr, s32, SV_PLDL1KEEP, 0); /* { dg-error {too many arguments to function 'svprfh_gather_index'} } */ ++ svprfh_gather_index (0, s32_ptr, s32, SV_PLDL1KEEP); /* { dg-error {passing 'int' to argument 1 of 'svprfh_gather_index', which expects 'svbool_t'} } */ ++ svprfh_gather_index (pg, 0, s32, SV_PLDL1KEEP); ++ svprfh_gather_index (pg, (int *) 0, s32, SV_PLDL1KEEP); ++ svprfh_gather_index (pg, void_ptr, s32, SV_PLDL1KEEP); ++ svprfh_gather_index (pg, ptr_ptr, s32, SV_PLDL1KEEP); ++ svprfh_gather_index (pg, s, s32, SV_PLDL1KEEP); /* { dg-error {passing 'struct s' to argument 2 of 'svprfh_gather_index', which expects a vector or pointer base address} } */ ++ ++ svprfh_gather_index (pg, s32_ptr, s8, SV_PLDL1KEEP); /* { dg-error {passing 'svint8_t' to argument 3 of 
'svprfh_gather_index', which expects a vector of 32-bit or 64-bit integers} } */ ++ svprfh_gather_index (pg, s32_ptr, u8, SV_PLDL1KEEP); /* { dg-error {passing 'svuint8_t' to argument 3 of 'svprfh_gather_index', which expects a vector of 32-bit or 64-bit integers} } */ ++ svprfh_gather_index (pg, s32_ptr, s16, SV_PLDL1KEEP); /* { dg-error {passing 'svint16_t' to argument 3 of 'svprfh_gather_index', which expects a vector of 32-bit or 64-bit integers} } */ ++ svprfh_gather_index (pg, s32_ptr, u16, SV_PLDL1KEEP); /* { dg-error {passing 'svuint16_t' to argument 3 of 'svprfh_gather_index', which expects a vector of 32-bit or 64-bit integers} } */ ++ svprfh_gather_index (pg, s32_ptr, f16, SV_PLDL1KEEP); /* { dg-error {passing 'svfloat16_t' to argument 3 of 'svprfh_gather_index', which expects a vector of integers} } */ ++ svprfh_gather_index (pg, s32_ptr, s32, SV_PLDL1KEEP); ++ svprfh_gather_index (pg, s32_ptr, u32, SV_PLDL1KEEP); ++ svprfh_gather_index (pg, s32_ptr, f32, SV_PLDL1KEEP); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svprfh_gather_index', which expects a vector of integers} } */ ++ svprfh_gather_index (pg, s32_ptr, s64, SV_PLDL1KEEP); ++ svprfh_gather_index (pg, s32_ptr, u64, SV_PLDL1KEEP); ++ svprfh_gather_index (pg, s32_ptr, f64, SV_PLDL1KEEP); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svprfh_gather_index', which expects a vector of integers} } */ ++ ++ svprfh_gather_index (pg, u8, 0, SV_PLDL1KEEP); /* { dg-error {passing 'svuint8_t' to argument 2 of 'svprfh_gather_index', which expects 'svuint32_t' or 'svuint64_t'} } */ ++ svprfh_gather_index (pg, u16, 0, SV_PLDL1KEEP); /* { dg-error {passing 'svuint16_t' to argument 2 of 'svprfh_gather_index', which expects 'svuint32_t' or 'svuint64_t'} } */ ++ svprfh_gather_index (pg, s32, 0, SV_PLDL1KEEP); /* { dg-error {passing 'svint32_t' to argument 2 of 'svprfh_gather_index', which expects 'svuint32_t' or 'svuint64_t'} } */ ++ svprfh_gather_index (pg, u32, 0, SV_PLDL1KEEP); ++ svprfh_gather_index (pg, f32, 0, SV_PLDL1KEEP); /* { dg-error {passing 'svfloat32_t' to argument 2 of 'svprfh_gather_index', which expects 'svuint32_t' or 'svuint64_t'} } */ ++ svprfh_gather_index (pg, s64, 0, SV_PLDL1KEEP); /* { dg-error {passing 'svint64_t' to argument 2 of 'svprfh_gather_index', which expects 'svuint32_t' or 'svuint64_t'} } */ ++ svprfh_gather_index (pg, u64, 0, SV_PLDL1KEEP); ++ svprfh_gather_index (pg, f64, 0, SV_PLDL1KEEP); /* { dg-error {passing 'svfloat64_t' to argument 2 of 'svprfh_gather_index', which expects 'svuint32_t' or 'svuint64_t'} } */ ++ ++ svprfh_gather_index (pg, s32_ptr, s32, op); /* { dg-error {argument 4 of 'svprfh_gather_index' must be an integer constant expression} } */ ++ svprfh_gather_index (pg, s32_ptr, s32, (enum svprfop) -1); /* { dg-error {passing 4294967295 to argument 4 of 'svprfh_gather_index', which expects a valid 'enum svprfop' value} } */ ++ svprfh_gather_index (pg, s32_ptr, s32, (enum svprfop) 0); ++ svprfh_gather_index (pg, s32_ptr, s32, (enum svprfop) 5); ++ svprfh_gather_index (pg, s32_ptr, s32, (enum svprfop) 6); /* { dg-error {passing 6 to argument 4 of 'svprfh_gather_index', which expects a valid 'enum svprfop' value} } */ ++ svprfh_gather_index (pg, s32_ptr, s32, (enum svprfop) 7); /* { dg-error {passing 7 to argument 4 of 'svprfh_gather_index', which expects a valid 'enum svprfop' value} } */ ++ svprfh_gather_index (pg, s32_ptr, s32, (enum svprfop) 8); ++ svprfh_gather_index (pg, s32_ptr, s32, (enum svprfop) 14); /* { dg-error {passing 14 to argument 4 of 'svprfh_gather_index', 
which expects a valid 'enum svprfop' value} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_index_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_index_2.c +new file mode 100644 +index 000000000..3d7797305 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_index_2.c +@@ -0,0 +1,17 @@ ++/* { dg-do compile } */ ++/* { dg-options "-std=c99" } */ ++ ++#include ++ ++void ++f1 (svbool_t pg, int32_t *s32_ptr, svint32_t s32, enum svprfop op) ++{ ++ svprfh_gather_s32index (pg, s32_ptr, s32, op); /* { dg-error {argument 4 of 'svprfh_gather_s32index' must be an integer constant expression} } */ ++ svprfh_gather_s32index (pg, s32_ptr, s32, (enum svprfop) -1); /* { dg-error {passing 4294967295 to argument 4 of 'svprfh_gather_s32index', which expects a valid 'enum svprfop' value} } */ ++ svprfh_gather_s32index (pg, s32_ptr, s32, (enum svprfop) 0); ++ svprfh_gather_s32index (pg, s32_ptr, s32, (enum svprfop) 5); ++ svprfh_gather_s32index (pg, s32_ptr, s32, (enum svprfop) 6); /* { dg-error {passing 6 to argument 4 of 'svprfh_gather_s32index', which expects a valid 'enum svprfop' value} } */ ++ svprfh_gather_s32index (pg, s32_ptr, s32, (enum svprfop) 7); /* { dg-error {passing 7 to argument 4 of 'svprfh_gather_s32index', which expects a valid 'enum svprfop' value} } */ ++ svprfh_gather_s32index (pg, s32_ptr, s32, (enum svprfop) 8); ++ svprfh_gather_s32index (pg, s32_ptr, s32, (enum svprfop) 14); /* { dg-error {passing 14 to argument 4 of 'svprfh_gather_s32index', which expects a valid 'enum svprfop' value} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_1.c +new file mode 100644 +index 000000000..cc61901cb +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_1.c +@@ -0,0 +1,53 @@ ++/* { dg-do compile } */ ++/* { dg-options "-std=c99" } */ ++ ++#include ++ ++struct s { int i; }; ++ ++void ++f1 (svbool_t pg, int32_t *s32_ptr, void *void_ptr, void **ptr_ptr, ++ svint8_t s8, svuint8_t u8, svint16_t s16, svuint16_t u16, svfloat16_t f16, ++ svint32_t s32, svuint32_t u32, svfloat32_t f32, ++ svint64_t s64, svuint64_t u64, svfloat64_t f64, enum svprfop op, ++ struct s s) ++{ ++ svprfb_gather_offset (pg, s32_ptr, s32); /* { dg-error {too few arguments to function 'svprfb_gather_offset'} } */ ++ svprfb_gather_offset (pg, s32_ptr, s32, SV_PLDL1KEEP, 0); /* { dg-error {too many arguments to function 'svprfb_gather_offset'} } */ ++ svprfb_gather_offset (0, s32_ptr, s32, SV_PLDL1KEEP); /* { dg-error {passing 'int' to argument 1 of 'svprfb_gather_offset', which expects 'svbool_t'} } */ ++ svprfb_gather_offset (pg, 0, s32, SV_PLDL1KEEP); ++ svprfb_gather_offset (pg, (int *) 0, s32, SV_PLDL1KEEP); ++ svprfb_gather_offset (pg, void_ptr, s32, SV_PLDL1KEEP); ++ svprfb_gather_offset (pg, ptr_ptr, s32, SV_PLDL1KEEP); ++ svprfb_gather_offset (pg, s, s32, SV_PLDL1KEEP); /* { dg-error {passing 'struct s' to argument 2 of 'svprfb_gather_offset', which expects a vector or pointer base address} } */ ++ ++ svprfb_gather_offset (pg, s32_ptr, s8, SV_PLDL1KEEP); /* { dg-error {passing 'svint8_t' to argument 3 of 'svprfb_gather_offset', which expects a vector of 32-bit or 64-bit integers} } */ ++ svprfb_gather_offset (pg, s32_ptr, u8, SV_PLDL1KEEP); /* { dg-error {passing 'svuint8_t' to argument 3 of 'svprfb_gather_offset', which expects a vector of 32-bit or 
64-bit integers} } */ ++ svprfb_gather_offset (pg, s32_ptr, s16, SV_PLDL1KEEP); /* { dg-error {passing 'svint16_t' to argument 3 of 'svprfb_gather_offset', which expects a vector of 32-bit or 64-bit integers} } */ ++ svprfb_gather_offset (pg, s32_ptr, u16, SV_PLDL1KEEP); /* { dg-error {passing 'svuint16_t' to argument 3 of 'svprfb_gather_offset', which expects a vector of 32-bit or 64-bit integers} } */ ++ svprfb_gather_offset (pg, s32_ptr, f16, SV_PLDL1KEEP); /* { dg-error {passing 'svfloat16_t' to argument 3 of 'svprfb_gather_offset', which expects a vector of integers} } */ ++ svprfb_gather_offset (pg, s32_ptr, s32, SV_PLDL1KEEP); ++ svprfb_gather_offset (pg, s32_ptr, u32, SV_PLDL1KEEP); ++ svprfb_gather_offset (pg, s32_ptr, f32, SV_PLDL1KEEP); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svprfb_gather_offset', which expects a vector of integers} } */ ++ svprfb_gather_offset (pg, s32_ptr, s64, SV_PLDL1KEEP); ++ svprfb_gather_offset (pg, s32_ptr, u64, SV_PLDL1KEEP); ++ svprfb_gather_offset (pg, s32_ptr, f64, SV_PLDL1KEEP); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svprfb_gather_offset', which expects a vector of integers} } */ ++ ++ svprfb_gather_offset (pg, u8, 0, SV_PLDL1KEEP); /* { dg-error {passing 'svuint8_t' to argument 2 of 'svprfb_gather_offset', which expects 'svuint32_t' or 'svuint64_t'} } */ ++ svprfb_gather_offset (pg, u16, 0, SV_PLDL1KEEP); /* { dg-error {passing 'svuint16_t' to argument 2 of 'svprfb_gather_offset', which expects 'svuint32_t' or 'svuint64_t'} } */ ++ svprfb_gather_offset (pg, s32, 0, SV_PLDL1KEEP); /* { dg-error {passing 'svint32_t' to argument 2 of 'svprfb_gather_offset', which expects 'svuint32_t' or 'svuint64_t'} } */ ++ svprfb_gather_offset (pg, u32, 0, SV_PLDL1KEEP); ++ svprfb_gather_offset (pg, f32, 0, SV_PLDL1KEEP); /* { dg-error {passing 'svfloat32_t' to argument 2 of 'svprfb_gather_offset', which expects 'svuint32_t' or 'svuint64_t'} } */ ++ svprfb_gather_offset (pg, s64, 0, SV_PLDL1KEEP); /* { dg-error {passing 'svint64_t' to argument 2 of 'svprfb_gather_offset', which expects 'svuint32_t' or 'svuint64_t'} } */ ++ svprfb_gather_offset (pg, u64, 0, SV_PLDL1KEEP); ++ svprfb_gather_offset (pg, f64, 0, SV_PLDL1KEEP); /* { dg-error {passing 'svfloat64_t' to argument 2 of 'svprfb_gather_offset', which expects 'svuint32_t' or 'svuint64_t'} } */ ++ ++ svprfb_gather_offset (pg, s32_ptr, s32, op); /* { dg-error {argument 4 of 'svprfb_gather_offset' must be an integer constant expression} } */ ++ svprfb_gather_offset (pg, s32_ptr, s32, (enum svprfop) -1); /* { dg-error {passing 4294967295 to argument 4 of 'svprfb_gather_offset', which expects a valid 'enum svprfop' value} } */ ++ svprfb_gather_offset (pg, s32_ptr, s32, (enum svprfop) 0); ++ svprfb_gather_offset (pg, s32_ptr, s32, (enum svprfop) 5); ++ svprfb_gather_offset (pg, s32_ptr, s32, (enum svprfop) 6); /* { dg-error {passing 6 to argument 4 of 'svprfb_gather_offset', which expects a valid 'enum svprfop' value} } */ ++ svprfb_gather_offset (pg, s32_ptr, s32, (enum svprfop) 7); /* { dg-error {passing 7 to argument 4 of 'svprfb_gather_offset', which expects a valid 'enum svprfop' value} } */ ++ svprfb_gather_offset (pg, s32_ptr, s32, (enum svprfop) 8); ++ svprfb_gather_offset (pg, s32_ptr, s32, (enum svprfop) 14); /* { dg-error {passing 14 to argument 4 of 'svprfb_gather_offset', which expects a valid 'enum svprfop' value} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_2.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_2.c +new file mode 100644 +index 000000000..b74721fad +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_2.c +@@ -0,0 +1,37 @@ ++/* { dg-do compile } */ ++/* { dg-options "-std=c99" } */ ++ ++#include ++ ++void ++f1 (svbool_t pg, svint8_t s8, svuint8_t u8, ++ svint16_t s16, svuint16_t u16, svfloat16_t f16, ++ svint32_t s32, svuint32_t u32, svfloat32_t f32, ++ svint64_t s64, svuint64_t u64, svfloat64_t f64, enum svprfop op) ++{ ++ svprfb_gather (pg, u32); /* { dg-error {too few arguments to function 'svprfb_gather'} } */ ++ svprfb_gather (pg, u32, SV_PLDL1KEEP, 0); /* { dg-error {too many arguments to function 'svprfb_gather'} } */ ++ svprfb_gather (0, u32, SV_PLDL1KEEP); /* { dg-error {passing 'int' to argument 1 of 'svprfb_gather', which expects 'svbool_t'} } */ ++ svprfb_gather (pg, 0, SV_PLDL1KEEP); /* { dg-error {passing 'int' to argument 2 of 'svprfb_gather', which expects an SVE vector type} } */ ++ ++ svprfb_gather (pg, s8, SV_PLDL1KEEP); /* { dg-error {passing 'svint8_t' to argument 2 of 'svprfb_gather', which expects 'svuint32_t' or 'svuint64_t'} } */ ++ svprfb_gather (pg, u8, SV_PLDL1KEEP); /* { dg-error {passing 'svuint8_t' to argument 2 of 'svprfb_gather', which expects 'svuint32_t' or 'svuint64_t'} } */ ++ svprfb_gather (pg, s16, SV_PLDL1KEEP); /* { dg-error {passing 'svint16_t' to argument 2 of 'svprfb_gather', which expects 'svuint32_t' or 'svuint64_t'} } */ ++ svprfb_gather (pg, u16, SV_PLDL1KEEP); /* { dg-error {passing 'svuint16_t' to argument 2 of 'svprfb_gather', which expects 'svuint32_t' or 'svuint64_t'} } */ ++ svprfb_gather (pg, f16, SV_PLDL1KEEP); /* { dg-error {passing 'svfloat16_t' to argument 2 of 'svprfb_gather', which expects 'svuint32_t' or 'svuint64_t'} } */ ++ svprfb_gather (pg, s32, SV_PLDL1KEEP); /* { dg-error {passing 'svint32_t' to argument 2 of 'svprfb_gather', which expects 'svuint32_t' or 'svuint64_t'} } */ ++ svprfb_gather (pg, u32, SV_PLDL1KEEP); ++ svprfb_gather (pg, f32, SV_PLDL1KEEP); /* { dg-error {passing 'svfloat32_t' to argument 2 of 'svprfb_gather', which expects 'svuint32_t' or 'svuint64_t'} } */ ++ svprfb_gather (pg, s64, SV_PLDL1KEEP); /* { dg-error {passing 'svint64_t' to argument 2 of 'svprfb_gather', which expects 'svuint32_t' or 'svuint64_t'} } */ ++ svprfb_gather (pg, u64, SV_PLDL1KEEP); ++ svprfb_gather (pg, f64, SV_PLDL1KEEP); /* { dg-error {passing 'svfloat64_t' to argument 2 of 'svprfb_gather', which expects 'svuint32_t' or 'svuint64_t'} } */ ++ ++ svprfb_gather (pg, u32, op); /* { dg-error {argument 3 of 'svprfb_gather' must be an integer constant expression} } */ ++ svprfb_gather (pg, u32, (enum svprfop) -1); /* { dg-error {passing 4294967295 to argument 3 of 'svprfb_gather', which expects a valid 'enum svprfop' value} } */ ++ svprfb_gather (pg, u32, (enum svprfop) 0); ++ svprfb_gather (pg, u32, (enum svprfop) 5); ++ svprfb_gather (pg, u32, (enum svprfop) 6); /* { dg-error {passing 6 to argument 3 of 'svprfb_gather', which expects a valid 'enum svprfop' value} } */ ++ svprfb_gather (pg, u32, (enum svprfop) 7); /* { dg-error {passing 7 to argument 3 of 'svprfb_gather', which expects a valid 'enum svprfop' value} } */ ++ svprfb_gather (pg, u32, (enum svprfop) 8); ++ svprfb_gather (pg, u32, (enum svprfop) 14); /* { dg-error {passing 14 to argument 3 of 'svprfb_gather', which expects a valid 'enum svprfop' value} } */ ++} +diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_3.c
+new file mode 100644
+index 000000000..24b4aa190
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_3.c
+@@ -0,0 +1,17 @@
++/* { dg-do compile } */
++/* { dg-options "-std=c99" } */
++
++#include <arm_sve.h>
++
++void
++f1 (svbool_t pg, int32_t *s32_ptr, svint32_t s32, enum svprfop op)
++{
++  svprfb_gather_s32offset (pg, s32_ptr, s32, op); /* { dg-error {argument 4 of 'svprfb_gather_s32offset' must be an integer constant expression} } */
++  svprfb_gather_s32offset (pg, s32_ptr, s32, (enum svprfop) -1); /* { dg-error {passing 4294967295 to argument 4 of 'svprfb_gather_s32offset', which expects a valid 'enum svprfop' value} } */
++  svprfb_gather_s32offset (pg, s32_ptr, s32, (enum svprfop) 0);
++  svprfb_gather_s32offset (pg, s32_ptr, s32, (enum svprfop) 5);
++  svprfb_gather_s32offset (pg, s32_ptr, s32, (enum svprfop) 6); /* { dg-error {passing 6 to argument 4 of 'svprfb_gather_s32offset', which expects a valid 'enum svprfop' value} } */
++  svprfb_gather_s32offset (pg, s32_ptr, s32, (enum svprfop) 7); /* { dg-error {passing 7 to argument 4 of 'svprfb_gather_s32offset', which expects a valid 'enum svprfop' value} } */
++  svprfb_gather_s32offset (pg, s32_ptr, s32, (enum svprfop) 8);
++  svprfb_gather_s32offset (pg, s32_ptr, s32, (enum svprfop) 14); /* { dg-error {passing 14 to argument 4 of 'svprfb_gather_s32offset', which expects a valid 'enum svprfop' value} } */
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_4.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_4.c
+new file mode 100644
+index 000000000..63ccdc5a4
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_4.c
+@@ -0,0 +1,17 @@
++/* { dg-do compile } */
++/* { dg-options "-std=c99" } */
++
++#include <arm_sve.h>
++
++void
++f1 (svbool_t pg, svuint32_t u32, enum svprfop op)
++{
++  svprfb_gather_u32base (pg, u32, op); /* { dg-error {argument 3 of 'svprfb_gather_u32base' must be an integer constant expression} } */
++  svprfb_gather_u32base (pg, u32, (enum svprfop) -1); /* { dg-error {passing 4294967295 to argument 3 of 'svprfb_gather_u32base', which expects a valid 'enum svprfop' value} } */
++  svprfb_gather_u32base (pg, u32, (enum svprfop) 0);
++  svprfb_gather_u32base (pg, u32, (enum svprfop) 5);
++  svprfb_gather_u32base (pg, u32, (enum svprfop) 6); /* { dg-error {passing 6 to argument 3 of 'svprfb_gather_u32base', which expects a valid 'enum svprfop' value} } */
++  svprfb_gather_u32base (pg, u32, (enum svprfop) 7); /* { dg-error {passing 7 to argument 3 of 'svprfb_gather_u32base', which expects a valid 'enum svprfop' value} } */
++  svprfb_gather_u32base (pg, u32, (enum svprfop) 8);
++  svprfb_gather_u32base (pg, u32, (enum svprfop) 14); /* { dg-error {passing 14 to argument 3 of 'svprfb_gather_u32base', which expects a valid 'enum svprfop' value} } */
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/reduction_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/reduction_1.c
+new file mode 100644
+index 000000000..ab0ef304a
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/reduction_1.c
+@@ -0,0 +1,19 @@
++/* { dg-do compile } */
++
++#include <arm_sve.h>
++
++void
++f1 (svbool_t pg, svint32_t s32, svuint32_t u32, svfloat32_t f32,
++    svuint32x2_t u32x2)
++{
++  svorv (pg); /* { dg-error {too few 
arguments to function 'svorv'} } */ ++ svorv (pg, u32, u32); /* { dg-error {too many arguments to function 'svorv'} } */ ++ svorv (0, u32); /* { dg-error {passing 'int' to argument 1 of 'svorv', which expects 'svbool_t'} } */ ++ svorv (u32, u32); /* { dg-error {passing 'svuint32_t' to argument 1 of 'svorv', which expects 'svbool_t'} } */ ++ svorv (pg, 0); /* { dg-error {passing 'int' to argument 2 of 'svorv', which expects an SVE vector type} } */ ++ svorv (pg, pg); /* { dg-error {'svorv' has no form that takes 'svbool_t' arguments} } */ ++ svorv (pg, s32); ++ svorv (pg, u32); ++ svorv (pg, f32); /* { dg-error {'svorv' has no form that takes 'svfloat32_t' arguments} } */ ++ svorv (pg, u32x2); /* { dg-error {passing 'svuint32x2_t' to argument 2 of 'svorv', which expects a single SVE vector rather than a tuple} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/reduction_wide_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/reduction_wide_1.c +new file mode 100644 +index 000000000..f99a2887b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/reduction_wide_1.c +@@ -0,0 +1,19 @@ ++/* { dg-do compile } */ ++ ++#include ++ ++void ++f1 (svbool_t pg, svint32_t s32, svuint32_t u32, svfloat32_t f32, ++ svuint32x2_t u32x2) ++{ ++ svaddv (pg); /* { dg-error {too few arguments to function 'svaddv'} } */ ++ svaddv (pg, u32, u32); /* { dg-error {too many arguments to function 'svaddv'} } */ ++ svaddv (0, u32); /* { dg-error {passing 'int' to argument 1 of 'svaddv', which expects 'svbool_t'} } */ ++ svaddv (u32, u32); /* { dg-error {passing 'svuint32_t' to argument 1 of 'svaddv', which expects 'svbool_t'} } */ ++ svaddv (pg, 0); /* { dg-error {passing 'int' to argument 2 of 'svaddv', which expects an SVE vector type} } */ ++ svaddv (pg, pg); /* { dg-error {'svaddv' has no form that takes 'svbool_t' arguments} } */ ++ svaddv (pg, s32); ++ svaddv (pg, u32); ++ svaddv (pg, f32); ++ svaddv (pg, u32x2); /* { dg-error {passing 'svuint32x2_t' to argument 2 of 'svaddv', which expects a single SVE vector rather than a tuple} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/set_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/set_1.c +new file mode 100644 +index 000000000..f07c76102 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/set_1.c +@@ -0,0 +1,35 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-std=c99 -Wall -Wextra" } */ ++ ++#include ++ ++svfloat64_t ++f1 (svbool_t pg, svuint8_t u8, svuint8x2_t u8x2, svuint8x3_t u8x3, int x) ++{ ++ const int one = 1; ++ svfloat64_t f64; ++ ++ u8x2 = svset2 (u8x2); /* { dg-error {too few arguments to function 'svset2'} } */ ++ u8x2 = svset2 (u8x2, 1); /* { dg-error {too few arguments to function 'svset2'} } */ ++ u8x2 = svset2 (u8x2, 1, u8, 3); /* { dg-error {too many arguments to function 'svset2'} } */ ++ u8x2 = svset2 (u8, 0, u8); /* { dg-error {passing single vector 'svuint8_t' to argument 1 of 'svset2', which expects a tuple of 2 vectors} } */ ++ u8x2 = svset2 (u8x3, 0, u8); /* { dg-error {passing 'svuint8x3_t' to argument 1 of 'svset2', which expects a tuple of 2 vectors} } */ ++ u8x2 = svset2 (pg, 0, u8); /* { dg-error {passing 'svbool_t' to argument 1 of 'svset2', which expects a tuple of 2 vectors} } */ ++ u8x2 = svset2 (u8x2, 0, u8x2); /* { dg-error {passing 'svuint8x2_t' to argument 3 of 'svset2', which expects a single SVE vector rather than a tuple} } */ ++ u8x2 = svset2 (u8x2, 0, f64); /* { dg-error {passing 'svfloat64_t' instead of 
the expected 'svuint8_t' to argument 3 of 'svset2', after passing 'svuint8x2_t' to argument 1} } */ ++ u8x2 = svset2 (u8x2, 0, pg); /* { dg-error {passing 'svbool_t' instead of the expected 'svuint8_t' to argument 3 of 'svset2', after passing 'svuint8x2_t' to argument 1} } */ ++ u8x2 = svset2 (u8x2, x, u8); /* { dg-error {argument 2 of 'svset2' must be an integer constant expression} } */ ++ u8x2 = svset2 (u8x2, 0, u8); ++ f64 = svset2 (u8x2, 0, u8); /* { dg-error {incompatible types when assigning to type 'svfloat64_t' from type 'svuint8x2_t'} } */ ++ u8x2 = svset2 (u8x2, 1, u8); ++ u8x2 = svset2 (u8x2, 2, u8); /* { dg-error {passing 2 to argument 2 of 'svset2', which expects a value in the range \[0, 1\]} } */ ++ u8x2 = svset2 (u8x2, 3, u8); /* { dg-error {passing 3 to argument 2 of 'svset2', which expects a value in the range \[0, 1\]} } */ ++ u8x2 = svset2 (u8x2, 4, u8); /* { dg-error {passing 4 to argument 2 of 'svset2', which expects a value in the range \[0, 1\]} } */ ++ u8x2 = svset2 (u8x2, 5, u8); /* { dg-error {passing 5 to argument 2 of 'svset2', which expects a value in the range \[0, 1\]} } */ ++ u8x2 = svset2 (u8x2, ~0U, u8); /* { dg-error {passing [^ ]* to argument 2 of 'svset2', which expects a value in the range \[0, 1\]} } */ ++ u8x2 = svset2 (u8x2, one, u8); /* { dg-error {argument 2 of 'svset2' must be an integer constant expression} } */ ++ u8x2 = svset2 (u8x2, 3 - 2, u8); ++ u8x2 = svset2 (u8x2, 1.0, u8); ++ ++ return f64; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/set_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/set_2.c +new file mode 100644 +index 000000000..ae277eafd +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/set_2.c +@@ -0,0 +1,37 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-std=c99 -Wall -Wextra" } */ ++ ++#include ++ ++svfloat64_t ++f1 (svbool_t pg, svuint8_t u8, svuint8x2_t u8x2, svint8x2_t s8x2, ++ svuint8x3_t u8x3, int x) ++{ ++ const int one = 1; ++ svfloat64_t f64; ++ ++ u8x2 = svset2_u8 (u8x2); /* { dg-error {too few arguments to function 'svset2_u8'} } */ ++ u8x2 = svset2_u8 (u8x2, 1); /* { dg-error {too few arguments to function 'svset2_u8'} } */ ++ u8x2 = svset2_u8 (u8x2, 1, u8, 3); /* { dg-error {too many arguments to function 'svset2_u8'} } */ ++ u8x2 = svset2_u8 (u8, 0, u8); /* { dg-error {incompatible type for argument 1 of 'svset2_u8'} } */ ++ u8x2 = svset2_u8 (s8x2, 0, u8); /* { dg-error {incompatible type for argument 1 of 'svset2_u8'} } */ ++ u8x2 = svset2_u8 (u8x3, 0, u8); /* { dg-error {incompatible type for argument 1 of 'svset2_u8'} } */ ++ u8x2 = svset2_u8 (pg, 0, u8); /* { dg-error {incompatible type for argument 1 of 'svset2_u8'} } */ ++ u8x2 = svset2_u8 (u8x2, 0, u8x2); /* { dg-error {incompatible type for argument 3 of 'svset2_u8'} } */ ++ u8x2 = svset2_u8 (u8x2, 0, f64); /* { dg-error {incompatible type for argument 3 of 'svset2_u8'} } */ ++ u8x2 = svset2_u8 (u8x2, 0, pg); /* { dg-error {incompatible type for argument 3 of 'svset2_u8'} } */ ++ u8x2 = svset2_u8 (u8x2, x, u8); /* { dg-error {argument 2 of 'svset2_u8' must be an integer constant expression} } */ ++ u8x2 = svset2_u8 (u8x2, 0, u8); ++ f64 = svset2_u8 (u8x2, 0, u8); /* { dg-error {incompatible types when assigning to type 'svfloat64_t' from type 'svuint8x2_t'} } */ ++ u8x2 = svset2_u8 (u8x2, 1, u8); ++ u8x2 = svset2_u8 (u8x2, 2, u8); /* { dg-error {passing 2 to argument 2 of 'svset2_u8', which expects a value in the range \[0, 1\]} } */ ++ u8x2 = svset2_u8 (u8x2, 3, u8); /* { dg-error 
{passing 3 to argument 2 of 'svset2_u8', which expects a value in the range \[0, 1\]} } */ ++ u8x2 = svset2_u8 (u8x2, 4, u8); /* { dg-error {passing 4 to argument 2 of 'svset2_u8', which expects a value in the range \[0, 1\]} } */ ++ u8x2 = svset2_u8 (u8x2, 5, u8); /* { dg-error {passing 5 to argument 2 of 'svset2_u8', which expects a value in the range \[0, 1\]} } */ ++ u8x2 = svset2_u8 (u8x2, ~0U, u8); /* { dg-error {passing [^ ]* to argument 2 of 'svset2_u8', which expects a value in the range \[0, 1\]} } */ ++ u8x2 = svset2_u8 (u8x2, one, u8); /* { dg-error {argument 2 of 'svset2_u8' must be an integer constant expression} } */ ++ u8x2 = svset2_u8 (u8x2, 3 - 2, u8); ++ u8x2 = svset2_u8 (u8x2, 1.0, u8); ++ ++ return f64; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/set_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/set_3.c +new file mode 100644 +index 000000000..543a1bea8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/set_3.c +@@ -0,0 +1,36 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-std=c99 -Wall -Wextra" } */ ++ ++#include ++ ++svfloat64_t ++f1 (svbool_t pg, svfloat16_t f16, svfloat16x3_t f16x3, svfloat16x4_t f16x4, ++ int x) ++{ ++ const int one = 1; ++ svfloat64_t f64; ++ ++ f16x3 = svset3 (f16x3); /* { dg-error {too few arguments to function 'svset3'} } */ ++ f16x3 = svset3 (f16x3, 1); /* { dg-error {too few arguments to function 'svset3'} } */ ++ f16x3 = svset3 (f16x3, 1, f16, 3); /* { dg-error {too many arguments to function 'svset3'} } */ ++ f16x3 = svset3 (f16, 0, f16); /* { dg-error {passing single vector 'svfloat16_t' to argument 1 of 'svset3', which expects a tuple of 3 vectors} } */ ++ f16x3 = svset3 (f16x4, 0, f16); /* { dg-error {passing 'svfloat16x4_t' to argument 1 of 'svset3', which expects a tuple of 3 vectors} } */ ++ f16x3 = svset3 (pg, 0, f16); /* { dg-error {passing 'svbool_t' to argument 1 of 'svset3', which expects a tuple of 3 vectors} } */ ++ f16x3 = svset3 (f16x3, 0, f16x3); /* { dg-error {passing 'svfloat16x3_t' to argument 3 of 'svset3', which expects a single SVE vector rather than a tuple} } */ ++ f16x3 = svset3 (f16x3, 0, f64); /* { dg-error {passing 'svfloat64_t' instead of the expected 'svfloat16_t' to argument 3 of 'svset3', after passing 'svfloat16x3_t' to argument 1} } */ ++ f16x3 = svset3 (f16x3, 0, pg); /* { dg-error {passing 'svbool_t' instead of the expected 'svfloat16_t' to argument 3 of 'svset3', after passing 'svfloat16x3_t' to argument 1} } */ ++ f16x3 = svset3 (f16x3, x, f16); /* { dg-error {argument 2 of 'svset3' must be an integer constant expression} } */ ++ f16x3 = svset3 (f16x3, 0, f16); ++ f64 = svset3 (f16x3, 0, f16); /* { dg-error {incompatible types when assigning to type 'svfloat64_t' from type 'svfloat16x3_t'} } */ ++ f16x3 = svset3 (f16x3, 1, f16); ++ f16x3 = svset3 (f16x3, 2, f16); ++ f16x3 = svset3 (f16x3, 3, f16); /* { dg-error {passing 3 to argument 2 of 'svset3', which expects a value in the range \[0, 2\]} } */ ++ f16x3 = svset3 (f16x3, 4, f16); /* { dg-error {passing 4 to argument 2 of 'svset3', which expects a value in the range \[0, 2\]} } */ ++ f16x3 = svset3 (f16x3, 5, f16); /* { dg-error {passing 5 to argument 2 of 'svset3', which expects a value in the range \[0, 2\]} } */ ++ f16x3 = svset3 (f16x3, ~0U, f16); /* { dg-error {passing [^ ]* to argument 2 of 'svset3', which expects a value in the range \[0, 2\]} } */ ++ f16x3 = svset3 (f16x3, one, f16); /* { dg-error {argument 2 of 'svset3' must be an integer constant expression} } */ ++ 
f16x3 = svset3 (f16x3, 3 - 2, f16); ++ f16x3 = svset3 (f16x3, 1.0, f16); ++ ++ return f64; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/set_4.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/set_4.c +new file mode 100644 +index 000000000..198b03407 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/set_4.c +@@ -0,0 +1,37 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-std=c99 -Wall -Wextra" } */ ++ ++#include ++ ++svfloat64_t ++f1 (svbool_t pg, svfloat16_t f16, svfloat16x3_t f16x3, svuint16x3_t u16x3, ++ svfloat16x4_t f16x4, int x) ++{ ++ const int one = 1; ++ svfloat64_t f64; ++ ++ f16x3 = svset3_f16 (f16x3); /* { dg-error {too few arguments to function 'svset3_f16'} } */ ++ f16x3 = svset3_f16 (f16x3, 1); /* { dg-error {too few arguments to function 'svset3_f16'} } */ ++ f16x3 = svset3_f16 (f16x3, 1, f16, 3); /* { dg-error {too many arguments to function 'svset3_f16'} } */ ++ f16x3 = svset3_f16 (f16, 0, f16); /* { dg-error {incompatible type for argument 1 of 'svset3_f16'} } */ ++ f16x3 = svset3_f16 (u16x3, 0, f16); /* { dg-error {incompatible type for argument 1 of 'svset3_f16'} } */ ++ f16x3 = svset3_f16 (f16x4, 0, f16); /* { dg-error {incompatible type for argument 1 of 'svset3_f16'} } */ ++ f16x3 = svset3_f16 (pg, 0, f16); /* { dg-error {incompatible type for argument 1 of 'svset3_f16'} } */ ++ f16x3 = svset3_f16 (f16x3, 0, f16x3); /* { dg-error {incompatible type for argument 3 of 'svset3_f16'} } */ ++ f16x3 = svset3_f16 (f16x3, 0, f64); /* { dg-error {incompatible type for argument 3 of 'svset3_f16'} } */ ++ f16x3 = svset3_f16 (f16x3, 0, pg); /* { dg-error {incompatible type for argument 3 of 'svset3_f16'} } */ ++ f16x3 = svset3_f16 (f16x3, x, f16); /* { dg-error {argument 2 of 'svset3_f16' must be an integer constant expression} } */ ++ f16x3 = svset3_f16 (f16x3, 0, f16); ++ f64 = svset3_f16 (f16x3, 0, f16); /* { dg-error {incompatible types when assigning to type 'svfloat64_t' from type 'svfloat16x3_t'} } */ ++ f16x3 = svset3_f16 (f16x3, 1, f16); ++ f16x3 = svset3_f16 (f16x3, 2, f16); ++ f16x3 = svset3_f16 (f16x3, 3, f16); /* { dg-error {passing 3 to argument 2 of 'svset3_f16', which expects a value in the range \[0, 2\]} } */ ++ f16x3 = svset3_f16 (f16x3, 4, f16); /* { dg-error {passing 4 to argument 2 of 'svset3_f16', which expects a value in the range \[0, 2\]} } */ ++ f16x3 = svset3_f16 (f16x3, 5, f16); /* { dg-error {passing 5 to argument 2 of 'svset3_f16', which expects a value in the range \[0, 2\]} } */ ++ f16x3 = svset3_f16 (f16x3, ~0U, f16); /* { dg-error {passing [^ ]* to argument 2 of 'svset3_f16', which expects a value in the range \[0, 2\]} } */ ++ f16x3 = svset3_f16 (f16x3, one, f16); /* { dg-error {argument 2 of 'svset3_f16' must be an integer constant expression} } */ ++ f16x3 = svset3_f16 (f16x3, 3 - 2, f16); ++ f16x3 = svset3_f16 (f16x3, 1.0, f16); ++ ++ return f64; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/set_5.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/set_5.c +new file mode 100644 +index 000000000..be911a731 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/set_5.c +@@ -0,0 +1,35 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-std=c99 -Wall -Wextra" } */ ++ ++#include ++ ++svfloat64_t ++f1 (svbool_t pg, svint32_t s32, svint32x4_t s32x4, svint32x2_t s32x2, int x) ++{ ++ const int one = 1; ++ svfloat64_t f64; ++ ++ s32x4 = svset4 (s32x4); /* { dg-error {too few arguments to function 'svset4'} } */ ++ s32x4 = svset4 
(s32x4, 1); /* { dg-error {too few arguments to function 'svset4'} } */ ++ s32x4 = svset4 (s32x4, 1, s32, 3); /* { dg-error {too many arguments to function 'svset4'} } */ ++ s32x4 = svset4 (s32, 0, s32); /* { dg-error {passing single vector 'svint32_t' to argument 1 of 'svset4', which expects a tuple of 4 vectors} } */ ++ s32x4 = svset4 (s32x2, 0, s32); /* { dg-error {passing 'svint32x2_t' to argument 1 of 'svset4', which expects a tuple of 4 vectors} } */ ++ s32x4 = svset4 (pg, 0, s32); /* { dg-error {passing 'svbool_t' to argument 1 of 'svset4', which expects a tuple of 4 vectors} } */ ++ s32x4 = svset4 (s32x4, 0, s32x4); /* { dg-error {passing 'svint32x4_t' to argument 3 of 'svset4', which expects a single SVE vector rather than a tuple} } */ ++ s32x4 = svset4 (s32x4, 0, f64); /* { dg-error {passing 'svfloat64_t' instead of the expected 'svint32_t' to argument 3 of 'svset4', after passing 'svint32x4_t' to argument 1} } */ ++ s32x4 = svset4 (s32x4, 0, pg); /* { dg-error {passing 'svbool_t' instead of the expected 'svint32_t' to argument 3 of 'svset4', after passing 'svint32x4_t' to argument 1} } */ ++ s32x4 = svset4 (s32x4, x, s32); /* { dg-error {argument 2 of 'svset4' must be an integer constant expression} } */ ++ s32x4 = svset4 (s32x4, 0, s32); ++ f64 = svset4 (s32x4, 0, s32); /* { dg-error {incompatible types when assigning to type 'svfloat64_t' from type 'svint32x4_t'} } */ ++ s32x4 = svset4 (s32x4, 1, s32); ++ s32x4 = svset4 (s32x4, 2, s32); ++ s32x4 = svset4 (s32x4, 3, s32); ++ s32x4 = svset4 (s32x4, 4, s32); /* { dg-error {passing 4 to argument 2 of 'svset4', which expects a value in the range \[0, 3\]} } */ ++ s32x4 = svset4 (s32x4, 5, s32); /* { dg-error {passing 5 to argument 2 of 'svset4', which expects a value in the range \[0, 3\]} } */ ++ s32x4 = svset4 (s32x4, ~0U, s32); /* { dg-error {passing [^ ]* to argument 2 of 'svset4', which expects a value in the range \[0, 3\]} } */ ++ s32x4 = svset4 (s32x4, one, s32); /* { dg-error {argument 2 of 'svset4' must be an integer constant expression} } */ ++ s32x4 = svset4 (s32x4, 3 - 2, s32); ++ s32x4 = svset4 (s32x4, 1.0, s32); ++ ++ return f64; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/set_6.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/set_6.c +new file mode 100644 +index 000000000..cec435413 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/set_6.c +@@ -0,0 +1,37 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-std=c99 -Wall -Wextra" } */ ++ ++#include ++ ++svfloat64_t ++f1 (svbool_t pg, svint32_t s32, svint32x4_t s32x4, svfloat32x4_t f32x4, ++ svint32x2_t s32x2, int x) ++{ ++ const int one = 1; ++ svfloat64_t f64; ++ ++ s32x4 = svset4_s32 (s32x4); /* { dg-error {too few arguments to function 'svset4_s32'} } */ ++ s32x4 = svset4_s32 (s32x4, 1); /* { dg-error {too few arguments to function 'svset4_s32'} } */ ++ s32x4 = svset4_s32 (s32x4, 1, s32, 3); /* { dg-error {too many arguments to function 'svset4_s32'} } */ ++ s32x4 = svset4_s32 (s32, 0, s32); /* { dg-error {incompatible type for argument 1 of 'svset4_s32'} } */ ++ s32x4 = svset4_s32 (f32x4, 0, s32); /* { dg-error {incompatible type for argument 1 of 'svset4_s32'} } */ ++ s32x4 = svset4_s32 (s32x2, 0, s32); /* { dg-error {incompatible type for argument 1 of 'svset4_s32'} } */ ++ s32x4 = svset4_s32 (pg, 0, s32); /* { dg-error {incompatible type for argument 1 of 'svset4_s32'} } */ ++ s32x4 = svset4_s32 (s32x4, 0, s32x4); /* { dg-error {incompatible type for argument 3 of 'svset4_s32'} } */ ++ s32x4 = 
svset4_s32 (s32x4, 0, f64); /* { dg-error {incompatible type for argument 3 of 'svset4_s32'} } */ ++ s32x4 = svset4_s32 (s32x4, 0, pg); /* { dg-error {incompatible type for argument 3 of 'svset4_s32'} } */ ++ s32x4 = svset4_s32 (s32x4, x, s32); /* { dg-error {argument 2 of 'svset4_s32' must be an integer constant expression} } */ ++ s32x4 = svset4_s32 (s32x4, 0, s32); ++ f64 = svset4_s32 (s32x4, 0, s32); /* { dg-error {incompatible types when assigning to type 'svfloat64_t' from type 'svint32x4_t'} } */ ++ s32x4 = svset4_s32 (s32x4, 1, s32); ++ s32x4 = svset4_s32 (s32x4, 2, s32); ++ s32x4 = svset4_s32 (s32x4, 3, s32); ++ s32x4 = svset4_s32 (s32x4, 4, s32); /* { dg-error {passing 4 to argument 2 of 'svset4_s32', which expects a value in the range \[0, 3\]} } */ ++ s32x4 = svset4_s32 (s32x4, 5, s32); /* { dg-error {passing 5 to argument 2 of 'svset4_s32', which expects a value in the range \[0, 3\]} } */ ++ s32x4 = svset4_s32 (s32x4, ~0U, s32); /* { dg-error {passing [^ ]* to argument 2 of 'svset4_s32', which expects a value in the range \[0, 3\]} } */ ++ s32x4 = svset4_s32 (s32x4, one, s32); /* { dg-error {argument 2 of 'svset4_s32' must be an integer constant expression} } */ ++ s32x4 = svset4_s32 (s32x4, 3 - 2, s32); ++ s32x4 = svset4_s32 (s32x4, 1.0, s32); ++ ++ return f64; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/shift_right_imm_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/shift_right_imm_1.c +new file mode 100644 +index 000000000..4dd9a9c76 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/shift_right_imm_1.c +@@ -0,0 +1,34 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-std=c99 -Wall -Wextra" } */ ++ ++#include ++ ++void ++f1 (svbool_t pg, svuint8_t u8, svint8_t s8, svint16_t s16, ++ svint32_t s32, svint64_t s64, int x) ++{ ++ const int one = 1; ++ u8 = svasrd_x (pg, u8, 1); /* { dg-error {'svasrd_x' has no form that takes 'svuint8_t' arguments} } */ ++ s8 = svasrd_x (pg, s8, x); /* { dg-error {argument 3 of 'svasrd_x' must be an integer constant expression} } */ ++ s8 = svasrd_x (pg, s8, one); /* { dg-error {argument 3 of 'svasrd_x' must be an integer constant expression} } */ ++ s8 = svasrd_x (pg, s8, 0.4); /* { dg-error {passing 0 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 8\]} } */ ++ s8 = svasrd_x (pg, s8, 1.0); ++ s8 = svasrd_x (pg, s8, 0); /* { dg-error {passing 0 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 8\]} } */ ++ s8 = svasrd_x (pg, s8, 1); ++ s8 = svasrd_x (pg, s8, 1 + 1); ++ s8 = svasrd_x (pg, s8, 8); ++ s8 = svasrd_x (pg, s8, 9); /* { dg-error {passing 9 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 8\]} } */ ++ s8 = svasrd_x (pg, s8, (1ULL << 62) + 1); /* { dg-error {passing [^ ]* to argument 3 of 'svasrd_x', which expects a value in the range \[1, 8\]} } */ ++ s16 = svasrd_x (pg, s16, 0); /* { dg-error {passing 0 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 16\]} } */ ++ s16 = svasrd_x (pg, s16, 1); ++ s16 = svasrd_x (pg, s16, 16); ++ s16 = svasrd_x (pg, s16, 17); /* { dg-error {passing 17 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 16\]} } */ ++ s32 = svasrd_x (pg, s32, 0); /* { dg-error {passing 0 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 32\]} } */ ++ s32 = svasrd_x (pg, s32, 1); ++ s32 = svasrd_x (pg, s32, 32); ++ s32 = svasrd_x (pg, s32, 33); /* { dg-error {passing 33 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 
32\]} } */ ++ s64 = svasrd_x (pg, s64, 0); /* { dg-error {passing 0 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 64\]} } */ ++ s64 = svasrd_x (pg, s64, 1); ++ s64 = svasrd_x (pg, s64, 64); ++ s64 = svasrd_x (pg, s64, 65); /* { dg-error {passing 65 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 64\]} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/shift_right_imm_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/shift_right_imm_2.c +new file mode 100644 +index 000000000..4970689e9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/shift_right_imm_2.c +@@ -0,0 +1,33 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-std=c99 -Wall -Wextra" } */ ++ ++#include ++ ++void ++f1 (svbool_t pg, svint8_t s8, svint16_t s16, svint32_t s32, svint64_t s64, ++ int x) ++{ ++ const int one = 1; ++ s8 = svasrd_n_s8_x (pg, s8, x); /* { dg-error {argument 3 of 'svasrd_n_s8_x' must be an integer constant expression} } */ ++ s8 = svasrd_n_s8_x (pg, s8, one); /* { dg-error {argument 3 of 'svasrd_n_s8_x' must be an integer constant expression} } */ ++ s8 = svasrd_n_s8_x (pg, s8, 0.4); /* { dg-error {passing 0 to argument 3 of 'svasrd_n_s8_x', which expects a value in the range \[1, 8\]} } */ ++ s8 = svasrd_n_s8_x (pg, s8, 1.0); ++ s8 = svasrd_n_s8_x (pg, s8, 0); /* { dg-error {passing 0 to argument 3 of 'svasrd_n_s8_x', which expects a value in the range \[1, 8\]} } */ ++ s8 = svasrd_n_s8_x (pg, s8, 1); ++ s8 = svasrd_n_s8_x (pg, s8, 1 + 1); ++ s8 = svasrd_n_s8_x (pg, s8, 8); ++ s8 = svasrd_n_s8_x (pg, s8, 9); /* { dg-error {passing 9 to argument 3 of 'svasrd_n_s8_x', which expects a value in the range \[1, 8\]} } */ ++ s8 = svasrd_n_s8_x (pg, s8, (1ULL << 62) + 1); /* { dg-error {passing [^ ]* to argument 3 of 'svasrd_n_s8_x', which expects a value in the range \[1, 8\]} } */ ++ s16 = svasrd_n_s16_x (pg, s16, 0); /* { dg-error {passing 0 to argument 3 of 'svasrd_n_s16_x', which expects a value in the range \[1, 16\]} } */ ++ s16 = svasrd_n_s16_x (pg, s16, 1); ++ s16 = svasrd_n_s16_x (pg, s16, 16); ++ s16 = svasrd_n_s16_x (pg, s16, 17); /* { dg-error {passing 17 to argument 3 of 'svasrd_n_s16_x', which expects a value in the range \[1, 16\]} } */ ++ s32 = svasrd_n_s32_x (pg, s32, 0); /* { dg-error {passing 0 to argument 3 of 'svasrd_n_s32_x', which expects a value in the range \[1, 32\]} } */ ++ s32 = svasrd_n_s32_x (pg, s32, 1); ++ s32 = svasrd_n_s32_x (pg, s32, 32); ++ s32 = svasrd_n_s32_x (pg, s32, 33); /* { dg-error {passing 33 to argument 3 of 'svasrd_n_s32_x', which expects a value in the range \[1, 32\]} } */ ++ s64 = svasrd_n_s64_x (pg, s64, 0); /* { dg-error {passing 0 to argument 3 of 'svasrd_n_s64_x', which expects a value in the range \[1, 64\]} } */ ++ s64 = svasrd_n_s64_x (pg, s64, 1); ++ s64 = svasrd_n_s64_x (pg, s64, 64); ++ s64 = svasrd_n_s64_x (pg, s64, 65); /* { dg-error {passing 65 to argument 3 of 'svasrd_n_s64_x', which expects a value in the range \[1, 64\]} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/store_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/store_1.c +new file mode 100644 +index 000000000..267db83f7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/store_1.c +@@ -0,0 +1,26 @@ ++/* { dg-do compile } */ ++/* { dg-options "-std=c99" } */ ++ ++#include ++ ++struct s { signed char x; }; ++ ++svuint8_t ++f1 (svbool_t pg, signed char *s8_ptr, void *void_ptr, struct s *s_ptr, ++ float *f32_ptr, _Complex float 
*cf32_ptr, svint8_t s8, svfloat32_t f32, ++ struct s s) ++{ ++ svst1 (pg, s8_ptr); /* { dg-error {too few arguments to function 'svst1'} } */ ++ svst1 (pg, s8_ptr, s8, 0); /* { dg-error {too many arguments to function 'svst1'} } */ ++ svst1 (0, s8_ptr, s8); /* { dg-error {passing 'int' to argument 1 of 'svst1', which expects 'svbool_t'} } */ ++ svst1 (pg, void_ptr, 0); /* { dg-error {passing 'int' to argument 3 of 'svst1', which expects an SVE vector type} } */ ++ svst1 (pg, void_ptr, pg); /* { dg-error {'svst1' has no form that takes 'svbool_t' arguments} } */ ++ svst1 (pg, 0, s8); ++ svst1 (pg, (int *) 0, s8); /* { dg-warning "passing argument 2 of 'svst1_s8' from incompatible pointer type" } */ ++ svst1 (pg, void_ptr, s8); ++ svst1 (pg, s_ptr, s8); /* { dg-warning "passing argument 2 of 'svst1_s8' from incompatible pointer type" } */ ++ svst1 (pg, f32_ptr, s8); /* { dg-warning "passing argument 2 of 'svst1_s8' from incompatible pointer type" } */ ++ svst1 (pg, f32_ptr, f32); ++ svst1 (pg, cf32_ptr, f32); /* { dg-warning "passing argument 2 of 'svst1_f32' from incompatible pointer type" } */ ++ svst1 (pg, s, s8); /* { dg-error {passing 'struct s' to argument 2 of 'svst1', which expects a scalar pointer} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/store_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/store_2.c +new file mode 100644 +index 000000000..4e4fb3c6d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/store_2.c +@@ -0,0 +1,27 @@ ++/* { dg-do compile } */ ++/* { dg-options "-std=c99" } */ ++ ++#include ++ ++struct s { signed char x; }; ++ ++svuint8_t ++f1 (svbool_t pg, signed char *s8_ptr, void *void_ptr, struct s *s_ptr, ++ float *f32_ptr, _Complex float *cf32_ptr, svint8_t s8, svfloat32_t f32) ++{ ++ svst1_vnum (pg, s8_ptr, 0); /* { dg-error {too few arguments to function 'svst1_vnum'} } */ ++ svst1_vnum (pg, s8_ptr, 0, s8, 0); /* { dg-error {too many arguments to function 'svst1_vnum'} } */ ++ svst1_vnum (0, s8_ptr, 0, s8); /* { dg-error {passing 'int' to argument 1 of 'svst1_vnum', which expects 'svbool_t'} } */ ++ svst1_vnum (pg, s8_ptr, pg, s8); /* { dg-error {passing 'svbool_t' to argument 3 of 'svst1_vnum', which expects 'int64_t'} } */ ++ svst1_vnum (pg, s8_ptr, s8, s8); /* { dg-error {passing 'svint8_t' to argument 3 of 'svst1_vnum', which expects 'int64_t'} } */ ++ svst1_vnum (pg, s8_ptr, void_ptr, s8); /* { dg-warning "passing argument 3 of 'svst1_vnum_s8' makes integer from pointer without a cast" } */ ++ svst1_vnum (pg, void_ptr, 0, 0); /* { dg-error {passing 'int' to argument 4 of 'svst1_vnum', which expects an SVE vector type} } */ ++ svst1_vnum (pg, void_ptr, 0, pg); /* { dg-error {'svst1_vnum' has no form that takes 'svbool_t' arguments} } */ ++ svst1_vnum (pg, 0, 0, s8); ++ svst1_vnum (pg, (int *) 0, 0, s8); /* { dg-warning "passing argument 2 of 'svst1_vnum_s8' from incompatible pointer type" } */ ++ svst1_vnum (pg, void_ptr, 0, s8); ++ svst1_vnum (pg, s_ptr, 0, s8); /* { dg-warning "passing argument 2 of 'svst1_vnum_s8' from incompatible pointer type" } */ ++ svst1_vnum (pg, f32_ptr, 0, s8); /* { dg-warning "passing argument 2 of 'svst1_vnum_s8' from incompatible pointer type" } */ ++ svst1_vnum (pg, f32_ptr, 0, f32); ++ svst1_vnum (pg, cf32_ptr, 0, f32); /* { dg-warning "passing argument 2 of 'svst1_vnum_f32' from incompatible pointer type" } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/store_scatter_index_1.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/store_scatter_index_1.c +new file mode 100644 +index 000000000..3209149b6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/store_scatter_index_1.c +@@ -0,0 +1,101 @@ ++/* { dg-do compile } */ ++/* { dg-options "-std=c99" } */ ++ ++#include ++ ++struct s { signed char x; }; ++ ++svuint32_t ++f1 (svbool_t pg, signed char *s8_ptr, short *s16_ptr, ++ int32_t *s32_ptr, uint32_t *u32_ptr, float *f32_ptr, ++ int64_t *s64_ptr, uint64_t *u64_ptr, double *f64_ptr, ++ void *void_ptr, struct s *s_ptr, _Complex float *cf32_ptr, ++ svint8_t s8, svuint8_t u8, svint16_t s16, svuint16_t u16, svfloat16_t f16, ++ svint32_t s32, svuint32_t u32, svfloat32_t f32, ++ svint64_t s64, svuint64_t u64, svfloat64_t f64, struct s s) ++{ ++ svst1_scatter_index (pg, s32_ptr, s32); /* { dg-error {too few arguments to function 'svst1_scatter_index'} } */ ++ svst1_scatter_index (pg, s32_ptr, s32, s32, 0); /* { dg-error {too many arguments to function 'svst1_scatter_index'} } */ ++ svst1_scatter_index (0, s32_ptr, s32, s32); /* { dg-error {passing 'int' to argument 1 of 'svst1_scatter_index', which expects 'svbool_t'} } */ ++ svst1_scatter_index (pg, 0, s32, s32); ++ svst1_scatter_index (pg, (int *) 0, s32, s32); ++ svst1_scatter_index (pg, void_ptr, s32, s32); ++ svst1_scatter_index (pg, s_ptr, s32, s32); /* { dg-warning "passing argument 2 of 'svst1_scatter_s32index_s32' from incompatible pointer type" } */ ++ svst1_scatter_index (pg, f32_ptr, s32, s32); /* { dg-warning "passing argument 2 of 'svst1_scatter_s32index_s32' from incompatible pointer type" } */ ++ svst1_scatter_index (pg, f32_ptr, s32, f32); ++ svst1_scatter_index (pg, cf32_ptr, s32, f32); /* { dg-warning "passing argument 2 of 'svst1_scatter_s32index_f32' from incompatible pointer type" } */ ++ svst1_scatter_index (pg, s, s32, s32); /* { dg-error {passing 'struct s' to argument 2 of 'svst1_scatter_index', which expects a vector or pointer base address} } */ ++ ++ svst1_scatter_index (pg, u32, void_ptr, s32); /* { dg-warning "passing argument 3 of 'svst1_scatter_u32base_index_s32' makes integer from pointer without a cast" } */ ++ svst1_scatter_index (pg, u32, pg, s32); /* { dg-error {passing 'svbool_t' to argument 3 of 'svst1_scatter_index', which expects 'int64_t'} } */ ++ svst1_scatter_index (pg, u32, s32, s32); /* { dg-error {passing 'svint32_t' to argument 3 of 'svst1_scatter_index', which expects 'int64_t'} } */ ++ ++ svst1_scatter_index (pg, void_ptr, u32, pg); /* { dg-error {passing 'svbool_t' to argument 4 of 'svst1_scatter_index', which expects a vector of 32-bit or 64-bit elements} } */ ++ ++ svst1_scatter_index (pg, s8_ptr, u32, s8); /* { dg-error {passing 'svint8_t' to argument 4 of 'svst1_scatter_index', which expects a vector of 32-bit or 64-bit elements} } */ ++ svst1_scatter_index (pg, s8_ptr, u32, u8); /* { dg-error {passing 'svuint8_t' to argument 4 of 'svst1_scatter_index', which expects a vector of 32-bit or 64-bit elements} } */ ++ ++ svst1_scatter_index (pg, s16_ptr, u32, s16); /* { dg-error {passing 'svint16_t' to argument 4 of 'svst1_scatter_index', which expects a vector of 32-bit or 64-bit elements} } */ ++ svst1_scatter_index (pg, s16_ptr, u32, u16); /* { dg-error {passing 'svuint16_t' to argument 4 of 'svst1_scatter_index', which expects a vector of 32-bit or 64-bit elements} } */ ++ svst1_scatter_index (pg, s16_ptr, u32, f16); /* { dg-error {passing 'svfloat16_t' to argument 4 of 'svst1_scatter_index', which expects a vector of 32-bit or 64-bit 
elements} } */ ++ ++ svst1_scatter_index (pg, u32, 0, s32); ++ svst1_scatter_index (pg, s32, 0, s32); /* { dg-error {passing 'svint32_t' to argument 2 of 'svst1_scatter_index', which expects 'svuint32_t'} } */ ++ ++ svst1_scatter_index (pg, u32, 0, u32); ++ svst1_scatter_index (pg, s32, 0, u32); /* { dg-error {passing 'svint32_t' to argument 2 of 'svst1_scatter_index', which expects 'svuint32_t'} } */ ++ ++ svst1_scatter_index (pg, u32, 0, f32); ++ svst1_scatter_index (pg, s32, 0, f32); /* { dg-error {passing 'svint32_t' to argument 2 of 'svst1_scatter_index', which expects 'svuint32_t'} } */ ++ ++ svst1_scatter_index (pg, u64, 0, s64); ++ svst1_scatter_index (pg, s64, 0, s64); /* { dg-error {passing 'svint64_t' to argument 2 of 'svst1_scatter_index', which expects 'svuint64_t'} } */ ++ ++ svst1_scatter_index (pg, u64, 0, u64); ++ svst1_scatter_index (pg, s64, 0, u64); /* { dg-error {passing 'svint64_t' to argument 2 of 'svst1_scatter_index', which expects 'svuint64_t'} } */ ++ ++ svst1_scatter_index (pg, u64, 0, f64); ++ svst1_scatter_index (pg, s64, 0, f64); /* { dg-error {passing 'svint64_t' to argument 2 of 'svst1_scatter_index', which expects 'svuint64_t'} } */ ++ ++ svst1_scatter_index (pg, s32_ptr, s32, s32); ++ svst1_scatter_index (pg, s32_ptr, u32, s32); ++ svst1_scatter_index (pg, s32_ptr, f32, s32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svst1_scatter_index', which when storing 'svint32_t' expects a vector of 32-bit integers} } */ ++ svst1_scatter_index (pg, s32_ptr, s64, s32); /* { dg-error {passing 'svint64_t' to argument 3 of 'svst1_scatter_index', which when storing 'svint32_t' expects a vector of 32-bit integers} } */ ++ svst1_scatter_index (pg, s32_ptr, u64, s32); /* { dg-error {passing 'svuint64_t' to argument 3 of 'svst1_scatter_index', which when storing 'svint32_t' expects a vector of 32-bit integers} } */ ++ svst1_scatter_index (pg, s32_ptr, f64, s32); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svst1_scatter_index', which when storing 'svint32_t' expects a vector of 32-bit integers} } */ ++ ++ svst1_scatter_index (pg, u32_ptr, s32, u32); ++ svst1_scatter_index (pg, u32_ptr, u32, u32); ++ svst1_scatter_index (pg, u32_ptr, f32, u32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svst1_scatter_index', which when storing 'svuint32_t' expects a vector of 32-bit integers} } */ ++ svst1_scatter_index (pg, u32_ptr, s64, u32); /* { dg-error {passing 'svint64_t' to argument 3 of 'svst1_scatter_index', which when storing 'svuint32_t' expects a vector of 32-bit integers} } */ ++ svst1_scatter_index (pg, u32_ptr, u64, u32); /* { dg-error {passing 'svuint64_t' to argument 3 of 'svst1_scatter_index', which when storing 'svuint32_t' expects a vector of 32-bit integers} } */ ++ svst1_scatter_index (pg, u32_ptr, f64, u32); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svst1_scatter_index', which when storing 'svuint32_t' expects a vector of 32-bit integers} } */ ++ ++ svst1_scatter_index (pg, f32_ptr, s32, f32); ++ svst1_scatter_index (pg, f32_ptr, u32, f32); ++ svst1_scatter_index (pg, f32_ptr, f32, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svst1_scatter_index', which when storing 'svfloat32_t' expects a vector of 32-bit integers} } */ ++ svst1_scatter_index (pg, f32_ptr, s64, f32); /* { dg-error {passing 'svint64_t' to argument 3 of 'svst1_scatter_index', which when storing 'svfloat32_t' expects a vector of 32-bit integers} } */ ++ svst1_scatter_index (pg, f32_ptr, u64, f32); /* { dg-error {passing 'svuint64_t' to 
argument 3 of 'svst1_scatter_index', which when storing 'svfloat32_t' expects a vector of 32-bit integers} } */ ++ svst1_scatter_index (pg, f32_ptr, f64, f32); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svst1_scatter_index', which when storing 'svfloat32_t' expects a vector of 32-bit integers} } */ ++ ++ svst1_scatter_index (pg, s64_ptr, s32, s64); /* { dg-error {passing 'svint32_t' to argument 3 of 'svst1_scatter_index', which when storing 'svint64_t' expects a vector of 64-bit integers} } */ ++ svst1_scatter_index (pg, s64_ptr, u32, s64); /* { dg-error {passing 'svuint32_t' to argument 3 of 'svst1_scatter_index', which when storing 'svint64_t' expects a vector of 64-bit integers} } */ ++ svst1_scatter_index (pg, s64_ptr, f32, s64); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svst1_scatter_index', which when storing 'svint64_t' expects a vector of 64-bit integers} } */ ++ svst1_scatter_index (pg, s64_ptr, s64, s64); ++ svst1_scatter_index (pg, s64_ptr, u64, s64); ++ svst1_scatter_index (pg, s64_ptr, f64, s64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svst1_scatter_index', which when storing 'svint64_t' expects a vector of 64-bit integers} } */ ++ ++ svst1_scatter_index (pg, u64_ptr, s32, u64); /* { dg-error {passing 'svint32_t' to argument 3 of 'svst1_scatter_index', which when storing 'svuint64_t' expects a vector of 64-bit integers} } */ ++ svst1_scatter_index (pg, u64_ptr, u32, u64); /* { dg-error {passing 'svuint32_t' to argument 3 of 'svst1_scatter_index', which when storing 'svuint64_t' expects a vector of 64-bit integers} } */ ++ svst1_scatter_index (pg, u64_ptr, f32, u64); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svst1_scatter_index', which when storing 'svuint64_t' expects a vector of 64-bit integers} } */ ++ svst1_scatter_index (pg, u64_ptr, s64, u64); ++ svst1_scatter_index (pg, u64_ptr, u64, u64); ++ svst1_scatter_index (pg, u64_ptr, f64, u64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svst1_scatter_index', which when storing 'svuint64_t' expects a vector of 64-bit integers} } */ ++ ++ svst1_scatter_index (pg, f64_ptr, s32, f64); /* { dg-error {passing 'svint32_t' to argument 3 of 'svst1_scatter_index', which when storing 'svfloat64_t' expects a vector of 64-bit integers} } */ ++ svst1_scatter_index (pg, f64_ptr, u32, f64); /* { dg-error {passing 'svuint32_t' to argument 3 of 'svst1_scatter_index', which when storing 'svfloat64_t' expects a vector of 64-bit integers} } */ ++ svst1_scatter_index (pg, f64_ptr, f32, f64); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svst1_scatter_index', which when storing 'svfloat64_t' expects a vector of 64-bit integers} } */ ++ svst1_scatter_index (pg, f64_ptr, s64, f64); ++ svst1_scatter_index (pg, f64_ptr, u64, f64); ++ svst1_scatter_index (pg, f64_ptr, f64, f64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svst1_scatter_index', which when storing 'svfloat64_t' expects a vector of 64-bit integers} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/store_scatter_offset_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/store_scatter_offset_1.c +new file mode 100644 +index 000000000..10abf758c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/store_scatter_offset_1.c +@@ -0,0 +1,45 @@ ++/* { dg-do compile } */ ++/* { dg-options "-std=c99" } */ ++ ++#include ++ ++struct s { signed char x; }; ++ ++svuint32_t ++f1 (svbool_t pg, svint8_t s8, svuint8_t u8, svint16_t s16, svuint16_t u16, ++ svfloat16_t f16, 
svint32_t s32, svuint32_t u32, svfloat32_t f32, ++ svint64_t s64, svuint64_t u64, svfloat64_t f64) ++{ ++ svst1_scatter (pg, u32); /* { dg-error {too few arguments to function 'svst1_scatter'} } */ ++ svst1_scatter (pg, u32, u32, 0); /* { dg-error {too many arguments to function 'svst1_scatter'} } */ ++ svst1_scatter (0, u32, u32); /* { dg-error {passing 'int' to argument 1 of 'svst1_scatter', which expects 'svbool_t'} } */ ++ svst1_scatter (pg, 0, u32); /* { dg-error {passing 'int' to argument 2 of 'svst1_scatter', which expects an SVE vector type} } */ ++ svst1_scatter (pg, u32, 0); /* { dg-error {passing 'int' to argument 3 of 'svst1_scatter', which expects an SVE vector type} } */ ++ ++ svst1_scatter (pg, u32, pg); /* { dg-error {passing 'svbool_t' to argument 3 of 'svst1_scatter', which expects a vector of 32-bit or 64-bit elements} } */ ++ ++ svst1_scatter (pg, u32, s8); /* { dg-error {passing 'svint8_t' to argument 3 of 'svst1_scatter', which expects a vector of 32-bit or 64-bit elements} } */ ++ svst1_scatter (pg, u32, u8); /* { dg-error {passing 'svuint8_t' to argument 3 of 'svst1_scatter', which expects a vector of 32-bit or 64-bit elements} } */ ++ ++ svst1_scatter (pg, u32, s16); /* { dg-error {passing 'svint16_t' to argument 3 of 'svst1_scatter', which expects a vector of 32-bit or 64-bit elements} } */ ++ svst1_scatter (pg, u32, u16); /* { dg-error {passing 'svuint16_t' to argument 3 of 'svst1_scatter', which expects a vector of 32-bit or 64-bit elements} } */ ++ svst1_scatter (pg, u32, f16); /* { dg-error {passing 'svfloat16_t' to argument 3 of 'svst1_scatter', which expects a vector of 32-bit or 64-bit elements} } */ ++ ++ svst1_scatter (pg, u32, s32); ++ svst1_scatter (pg, s32, s32); /* { dg-error {passing 'svint32_t' to argument 2 of 'svst1_scatter', which expects 'svuint32_t'} } */ ++ ++ svst1_scatter (pg, u32, u32); ++ svst1_scatter (pg, s32, u32); /* { dg-error {passing 'svint32_t' to argument 2 of 'svst1_scatter', which expects 'svuint32_t'} } */ ++ ++ svst1_scatter (pg, u32, f32); ++ svst1_scatter (pg, s32, f32); /* { dg-error {passing 'svint32_t' to argument 2 of 'svst1_scatter', which expects 'svuint32_t'} } */ ++ ++ svst1_scatter (pg, u64, s64); ++ svst1_scatter (pg, s64, s64); /* { dg-error {passing 'svint64_t' to argument 2 of 'svst1_scatter', which expects 'svuint64_t'} } */ ++ ++ svst1_scatter (pg, u64, u64); ++ svst1_scatter (pg, s64, u64); /* { dg-error {passing 'svint64_t' to argument 2 of 'svst1_scatter', which expects 'svuint64_t'} } */ ++ ++ svst1_scatter (pg, u64, f64); ++ svst1_scatter (pg, s64, f64); /* { dg-error {passing 'svint64_t' to argument 2 of 'svst1_scatter', which expects 'svuint64_t'} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/store_scatter_offset_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/store_scatter_offset_2.c +new file mode 100644 +index 000000000..8ee8129fa +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/store_scatter_offset_2.c +@@ -0,0 +1,101 @@ ++/* { dg-do compile } */ ++/* { dg-options "-std=c99" } */ ++ ++#include ++ ++struct s { signed char x; }; ++ ++svuint32_t ++f1 (svbool_t pg, signed char *s8_ptr, short *s16_ptr, ++ int32_t *s32_ptr, uint32_t *u32_ptr, float *f32_ptr, ++ int64_t *s64_ptr, uint64_t *u64_ptr, double *f64_ptr, ++ void *void_ptr, struct s *s_ptr, _Complex float *cf32_ptr, ++ svint8_t s8, svuint8_t u8, svint16_t s16, svuint16_t u16, svfloat16_t f16, ++ svint32_t s32, svuint32_t u32, svfloat32_t f32, ++ svint64_t s64, svuint64_t u64, 
svfloat64_t f64, struct s s) ++{ ++ svst1_scatter_offset (pg, s32_ptr, s32); /* { dg-error {too few arguments to function 'svst1_scatter_offset'} } */ ++ svst1_scatter_offset (pg, s32_ptr, s32, s32, 0); /* { dg-error {too many arguments to function 'svst1_scatter_offset'} } */ ++ svst1_scatter_offset (0, s32_ptr, s32, s32); /* { dg-error {passing 'int' to argument 1 of 'svst1_scatter_offset', which expects 'svbool_t'} } */ ++ svst1_scatter_offset (pg, 0, s32, s32); ++ svst1_scatter_offset (pg, (int *) 0, s32, s32); ++ svst1_scatter_offset (pg, void_ptr, s32, s32); ++ svst1_scatter_offset (pg, s_ptr, s32, s32); /* { dg-warning "passing argument 2 of 'svst1_scatter_s32offset_s32' from incompatible pointer type" } */ ++ svst1_scatter_offset (pg, f32_ptr, s32, s32); /* { dg-warning "passing argument 2 of 'svst1_scatter_s32offset_s32' from incompatible pointer type" } */ ++ svst1_scatter_offset (pg, f32_ptr, s32, f32); ++ svst1_scatter_offset (pg, cf32_ptr, s32, f32); /* { dg-warning "passing argument 2 of 'svst1_scatter_s32offset_f32' from incompatible pointer type" } */ ++ svst1_scatter_offset (pg, s, s32, s32); /* { dg-error {passing 'struct s' to argument 2 of 'svst1_scatter_offset', which expects a vector or pointer base address} } */ ++ ++ svst1_scatter_offset (pg, u32, void_ptr, s32); /* { dg-warning "passing argument 3 of 'svst1_scatter_u32base_offset_s32' makes integer from pointer without a cast" } */ ++ svst1_scatter_offset (pg, u32, pg, s32); /* { dg-error {passing 'svbool_t' to argument 3 of 'svst1_scatter_offset', which expects 'int64_t'} } */ ++ svst1_scatter_offset (pg, u32, s32, s32); /* { dg-error {passing 'svint32_t' to argument 3 of 'svst1_scatter_offset', which expects 'int64_t'} } */ ++ ++ svst1_scatter_offset (pg, void_ptr, u32, pg); /* { dg-error {passing 'svbool_t' to argument 4 of 'svst1_scatter_offset', which expects a vector of 32-bit or 64-bit elements} } */ ++ ++ svst1_scatter_offset (pg, s8_ptr, u32, s8); /* { dg-error {passing 'svint8_t' to argument 4 of 'svst1_scatter_offset', which expects a vector of 32-bit or 64-bit elements} } */ ++ svst1_scatter_offset (pg, s8_ptr, u32, u8); /* { dg-error {passing 'svuint8_t' to argument 4 of 'svst1_scatter_offset', which expects a vector of 32-bit or 64-bit elements} } */ ++ ++ svst1_scatter_offset (pg, s16_ptr, u32, s16); /* { dg-error {passing 'svint16_t' to argument 4 of 'svst1_scatter_offset', which expects a vector of 32-bit or 64-bit elements} } */ ++ svst1_scatter_offset (pg, s16_ptr, u32, u16); /* { dg-error {passing 'svuint16_t' to argument 4 of 'svst1_scatter_offset', which expects a vector of 32-bit or 64-bit elements} } */ ++ svst1_scatter_offset (pg, s16_ptr, u32, f16); /* { dg-error {passing 'svfloat16_t' to argument 4 of 'svst1_scatter_offset', which expects a vector of 32-bit or 64-bit elements} } */ ++ ++ svst1_scatter_offset (pg, u32, 0, s32); ++ svst1_scatter_offset (pg, s32, 0, s32); /* { dg-error {passing 'svint32_t' to argument 2 of 'svst1_scatter_offset', which expects 'svuint32_t'} } */ ++ ++ svst1_scatter_offset (pg, u32, 0, u32); ++ svst1_scatter_offset (pg, s32, 0, u32); /* { dg-error {passing 'svint32_t' to argument 2 of 'svst1_scatter_offset', which expects 'svuint32_t'} } */ ++ ++ svst1_scatter_offset (pg, u32, 0, f32); ++ svst1_scatter_offset (pg, s32, 0, f32); /* { dg-error {passing 'svint32_t' to argument 2 of 'svst1_scatter_offset', which expects 'svuint32_t'} } */ ++ ++ svst1_scatter_offset (pg, u64, 0, s64); ++ svst1_scatter_offset (pg, s64, 0, s64); /* { dg-error {passing 'svint64_t' to 
argument 2 of 'svst1_scatter_offset', which expects 'svuint64_t'} } */ ++ ++ svst1_scatter_offset (pg, u64, 0, u64); ++ svst1_scatter_offset (pg, s64, 0, u64); /* { dg-error {passing 'svint64_t' to argument 2 of 'svst1_scatter_offset', which expects 'svuint64_t'} } */ ++ ++ svst1_scatter_offset (pg, u64, 0, f64); ++ svst1_scatter_offset (pg, s64, 0, f64); /* { dg-error {passing 'svint64_t' to argument 2 of 'svst1_scatter_offset', which expects 'svuint64_t'} } */ ++ ++ svst1_scatter_offset (pg, s32_ptr, s32, s32); ++ svst1_scatter_offset (pg, s32_ptr, u32, s32); ++ svst1_scatter_offset (pg, s32_ptr, f32, s32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svst1_scatter_offset', which when storing 'svint32_t' expects a vector of 32-bit integers} } */ ++ svst1_scatter_offset (pg, s32_ptr, s64, s32); /* { dg-error {passing 'svint64_t' to argument 3 of 'svst1_scatter_offset', which when storing 'svint32_t' expects a vector of 32-bit integers} } */ ++ svst1_scatter_offset (pg, s32_ptr, u64, s32); /* { dg-error {passing 'svuint64_t' to argument 3 of 'svst1_scatter_offset', which when storing 'svint32_t' expects a vector of 32-bit integers} } */ ++ svst1_scatter_offset (pg, s32_ptr, f64, s32); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svst1_scatter_offset', which when storing 'svint32_t' expects a vector of 32-bit integers} } */ ++ ++ svst1_scatter_offset (pg, u32_ptr, s32, u32); ++ svst1_scatter_offset (pg, u32_ptr, u32, u32); ++ svst1_scatter_offset (pg, u32_ptr, f32, u32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svst1_scatter_offset', which when storing 'svuint32_t' expects a vector of 32-bit integers} } */ ++ svst1_scatter_offset (pg, u32_ptr, s64, u32); /* { dg-error {passing 'svint64_t' to argument 3 of 'svst1_scatter_offset', which when storing 'svuint32_t' expects a vector of 32-bit integers} } */ ++ svst1_scatter_offset (pg, u32_ptr, u64, u32); /* { dg-error {passing 'svuint64_t' to argument 3 of 'svst1_scatter_offset', which when storing 'svuint32_t' expects a vector of 32-bit integers} } */ ++ svst1_scatter_offset (pg, u32_ptr, f64, u32); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svst1_scatter_offset', which when storing 'svuint32_t' expects a vector of 32-bit integers} } */ ++ ++ svst1_scatter_offset (pg, f32_ptr, s32, f32); ++ svst1_scatter_offset (pg, f32_ptr, u32, f32); ++ svst1_scatter_offset (pg, f32_ptr, f32, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svst1_scatter_offset', which when storing 'svfloat32_t' expects a vector of 32-bit integers} } */ ++ svst1_scatter_offset (pg, f32_ptr, s64, f32); /* { dg-error {passing 'svint64_t' to argument 3 of 'svst1_scatter_offset', which when storing 'svfloat32_t' expects a vector of 32-bit integers} } */ ++ svst1_scatter_offset (pg, f32_ptr, u64, f32); /* { dg-error {passing 'svuint64_t' to argument 3 of 'svst1_scatter_offset', which when storing 'svfloat32_t' expects a vector of 32-bit integers} } */ ++ svst1_scatter_offset (pg, f32_ptr, f64, f32); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svst1_scatter_offset', which when storing 'svfloat32_t' expects a vector of 32-bit integers} } */ ++ ++ svst1_scatter_offset (pg, s64_ptr, s32, s64); /* { dg-error {passing 'svint32_t' to argument 3 of 'svst1_scatter_offset', which when storing 'svint64_t' expects a vector of 64-bit integers} } */ ++ svst1_scatter_offset (pg, s64_ptr, u32, s64); /* { dg-error {passing 'svuint32_t' to argument 3 of 'svst1_scatter_offset', which when storing 'svint64_t' expects a vector of 
64-bit integers} } */ ++ svst1_scatter_offset (pg, s64_ptr, f32, s64); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svst1_scatter_offset', which when storing 'svint64_t' expects a vector of 64-bit integers} } */ ++ svst1_scatter_offset (pg, s64_ptr, s64, s64); ++ svst1_scatter_offset (pg, s64_ptr, u64, s64); ++ svst1_scatter_offset (pg, s64_ptr, f64, s64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svst1_scatter_offset', which when storing 'svint64_t' expects a vector of 64-bit integers} } */ ++ ++ svst1_scatter_offset (pg, u64_ptr, s32, u64); /* { dg-error {passing 'svint32_t' to argument 3 of 'svst1_scatter_offset', which when storing 'svuint64_t' expects a vector of 64-bit integers} } */ ++ svst1_scatter_offset (pg, u64_ptr, u32, u64); /* { dg-error {passing 'svuint32_t' to argument 3 of 'svst1_scatter_offset', which when storing 'svuint64_t' expects a vector of 64-bit integers} } */ ++ svst1_scatter_offset (pg, u64_ptr, f32, u64); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svst1_scatter_offset', which when storing 'svuint64_t' expects a vector of 64-bit integers} } */ ++ svst1_scatter_offset (pg, u64_ptr, s64, u64); ++ svst1_scatter_offset (pg, u64_ptr, u64, u64); ++ svst1_scatter_offset (pg, u64_ptr, f64, u64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svst1_scatter_offset', which when storing 'svuint64_t' expects a vector of 64-bit integers} } */ ++ ++ svst1_scatter_offset (pg, f64_ptr, s32, f64); /* { dg-error {passing 'svint32_t' to argument 3 of 'svst1_scatter_offset', which when storing 'svfloat64_t' expects a vector of 64-bit integers} } */ ++ svst1_scatter_offset (pg, f64_ptr, u32, f64); /* { dg-error {passing 'svuint32_t' to argument 3 of 'svst1_scatter_offset', which when storing 'svfloat64_t' expects a vector of 64-bit integers} } */ ++ svst1_scatter_offset (pg, f64_ptr, f32, f64); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svst1_scatter_offset', which when storing 'svfloat64_t' expects a vector of 64-bit integers} } */ ++ svst1_scatter_offset (pg, f64_ptr, s64, f64); ++ svst1_scatter_offset (pg, f64_ptr, u64, f64); ++ svst1_scatter_offset (pg, f64_ptr, f64, f64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svst1_scatter_offset', which when storing 'svfloat64_t' expects a vector of 64-bit integers} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_bfloat16_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_bfloat16_1.c +new file mode 100644 +index 000000000..a9233324c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_bfloat16_1.c +@@ -0,0 +1,24 @@ ++/* { dg-do compile } */ ++ ++#include ++ ++#pragma GCC target ("arch=armv8.2-a+sve+bf16") ++ ++void ++f1 (svbool_t pg, svuint8_t u8, svuint16_t u16, svint32_t s32, ++ svbfloat16_t bf16, svfloat32_t f32, svfloat64_t f64, bfloat16_t bf) ++{ ++ svbfmmla (f32, bf16); /* { dg-error {too few arguments to function 'svbfmmla'} } */ ++ svbfmmla (f32, bf16, bf16, 0); /* { dg-error {too many arguments to function 'svbfmmla'} } */ ++ svbfmmla (0, bf16, bf16); /* { dg-error {passing 'int' to argument 1 of 'svbfmmla', which expects an SVE vector type} } */ ++ svbfmmla (pg, bf16, bf16); /* { dg-error {'svbfmmla' has no form that takes 'svbool_t' arguments} } */ ++ svbfmmla (u8, bf16, bf16); /* { dg-error {'svbfmmla' has no form that takes 'svuint8_t' arguments} } */ ++ svbfmmla (u16, bf16, bf16); /* { dg-error {'svbfmmla' has no form that takes 'svuint16_t' arguments} } */ ++ svbfmmla (f64, bf16, 
bf16); /* { dg-error {'svbfmmla' has no form that takes 'svfloat64_t' arguments} } */ ++ svbfmmla (f32, bf16, bf16); ++ svbfmmla (f32, 0, bf16); /* { dg-error {passing 'int' to argument 2 of 'svbfmmla', which expects 'svbfloat16_t'} } */ ++ svbfmmla (f32, f32, bf16); /* { dg-error {passing 'svfloat32_t' to argument 2 of 'svbfmmla', which expects 'svbfloat16_t'} } */ ++ svbfmmla (f32, bf16, 0); /* { dg-error {passing 'int' to argument 3 of 'svbfmmla', which expects 'svbfloat16_t'} } */ ++ svbfmmla (f32, bf16, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svbfmmla', which expects 'svbfloat16_t'} } */ ++ svbfmmla (f32, bf16, bf); /* { dg-error {passing 'bfloat16_t'[^\n]* to argument 3 of 'svbfmmla', which expects 'svbfloat16_t'} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_bfloat16_lane_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_bfloat16_lane_1.c +new file mode 100644 +index 000000000..23f027f2d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_bfloat16_lane_1.c +@@ -0,0 +1,30 @@ ++/* { dg-do compile } */ ++ ++#include ++ ++#pragma GCC target ("arch=armv8.2-a+sve+bf16") ++ ++void ++f1 (svbool_t pg, svuint8_t u8, svuint16_t u16, svint32_t s32, ++ svbfloat16_t bf16, svfloat32_t f32, svfloat64_t f64, int i) ++{ ++ svbfmlalb_lane (f32, bf16, bf16); /* { dg-error {too few arguments to function 'svbfmlalb_lane'} } */ ++ svbfmlalb_lane (f32, bf16, bf16, 0, 0); /* { dg-error {too many arguments to function 'svbfmlalb_lane'} } */ ++ svbfmlalb_lane (0, bf16, bf16, 0); /* { dg-error {passing 'int' to argument 1 of 'svbfmlalb_lane', which expects an SVE vector type} } */ ++ svbfmlalb_lane (pg, bf16, bf16, 0); /* { dg-error {'svbfmlalb_lane' has no form that takes 'svbool_t' arguments} } */ ++ svbfmlalb_lane (u8, bf16, bf16, 0); /* { dg-error {'svbfmlalb_lane' has no form that takes 'svuint8_t' arguments} } */ ++ svbfmlalb_lane (u16, bf16, bf16, 0); /* { dg-error {'svbfmlalb_lane' has no form that takes 'svuint16_t' arguments} } */ ++ svbfmlalb_lane (f64, bf16, bf16, 0); /* { dg-error {'svbfmlalb_lane' has no form that takes 'svfloat64_t' arguments} } */ ++ svbfmlalb_lane (f32, bf16, bf16, 0); ++ svbfmlalb_lane (f32, 0, bf16, 0); /* { dg-error {passing 'int' to argument 2 of 'svbfmlalb_lane', which expects 'svbfloat16_t'} } */ ++ svbfmlalb_lane (f32, f32, bf16, 0); /* { dg-error {passing 'svfloat32_t' to argument 2 of 'svbfmlalb_lane', which expects 'svbfloat16_t'} } */ ++ svbfmlalb_lane (f32, bf16, 0, 0); /* { dg-error {passing 'int' to argument 3 of 'svbfmlalb_lane', which expects 'svbfloat16_t'} } */ ++ svbfmlalb_lane (f32, bf16, f32, 0); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svbfmlalb_lane', which expects 'svbfloat16_t'} } */ ++ svbfmlalb_lane (f32, bf16, bf16, s32); /* { dg-error {argument 4 of 'svbfmlalb_lane' must be an integer constant expression} } */ ++ svbfmlalb_lane (f32, bf16, bf16, i); /* { dg-error {argument 4 of 'svbfmlalb_lane' must be an integer constant expression} } */ ++ ++ svbfmlalb_lane (f32, bf16, bf16, 0); ++ svbfmlalb_lane (f32, bf16, bf16, 7); ++ svbfmlalb_lane (f32, bf16, bf16, 8); /* { dg-error {passing 8 to argument 4 of 'svbfmlalb_lane', which expects a value in the range \[0, 7\]} } */ ++ svbfmlalb_lane (f32, bf16, bf16, -1); /* { dg-error {passing -1 to argument 4 of 'svbfmlalb_lane', which expects a value in the range \[0, 7\]} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_bfloat16_lanex2_1.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_bfloat16_lanex2_1.c +new file mode 100644 +index 000000000..4755ca79a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_bfloat16_lanex2_1.c +@@ -0,0 +1,30 @@ ++/* { dg-do compile } */ ++ ++#include ++ ++#pragma GCC target ("arch=armv8.2-a+sve+bf16") ++ ++void ++f1 (svbool_t pg, svuint8_t u8, svuint16_t u16, svint32_t s32, ++ svbfloat16_t bf16, svfloat32_t f32, svfloat64_t f64, int i) ++{ ++ svbfdot_lane (f32, bf16, bf16); /* { dg-error {too few arguments to function 'svbfdot_lane'} } */ ++ svbfdot_lane (f32, bf16, bf16, 0, 0); /* { dg-error {too many arguments to function 'svbfdot_lane'} } */ ++ svbfdot_lane (0, bf16, bf16, 0); /* { dg-error {passing 'int' to argument 1 of 'svbfdot_lane', which expects an SVE vector type} } */ ++ svbfdot_lane (pg, bf16, bf16, 0); /* { dg-error {'svbfdot_lane' has no form that takes 'svbool_t' arguments} } */ ++ svbfdot_lane (u8, bf16, bf16, 0); /* { dg-error {'svbfdot_lane' has no form that takes 'svuint8_t' arguments} } */ ++ svbfdot_lane (u16, bf16, bf16, 0); /* { dg-error {'svbfdot_lane' has no form that takes 'svuint16_t' arguments} } */ ++ svbfdot_lane (f64, bf16, bf16, 0); /* { dg-error {'svbfdot_lane' has no form that takes 'svfloat64_t' arguments} } */ ++ svbfdot_lane (f32, bf16, bf16, 0); ++ svbfdot_lane (f32, 0, bf16, 0); /* { dg-error {passing 'int' to argument 2 of 'svbfdot_lane', which expects 'svbfloat16_t'} } */ ++ svbfdot_lane (f32, f32, bf16, 0); /* { dg-error {passing 'svfloat32_t' to argument 2 of 'svbfdot_lane', which expects 'svbfloat16_t'} } */ ++ svbfdot_lane (f32, bf16, 0, 0); /* { dg-error {passing 'int' to argument 3 of 'svbfdot_lane', which expects 'svbfloat16_t'} } */ ++ svbfdot_lane (f32, bf16, f32, 0); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svbfdot_lane', which expects 'svbfloat16_t'} } */ ++ svbfdot_lane (f32, bf16, bf16, s32); /* { dg-error {argument 4 of 'svbfdot_lane' must be an integer constant expression} } */ ++ svbfdot_lane (f32, bf16, bf16, i); /* { dg-error {argument 4 of 'svbfdot_lane' must be an integer constant expression} } */ ++ ++ svbfdot_lane (f32, bf16, bf16, 0); ++ svbfdot_lane (f32, bf16, bf16, 3); ++ svbfdot_lane (f32, bf16, bf16, 4); /* { dg-error {passing 4 to argument 4 of 'svbfdot_lane', which expects a value in the range \[0, 3\]} } */ ++ svbfdot_lane (f32, bf16, bf16, -1); /* { dg-error {passing -1 to argument 4 of 'svbfdot_lane', which expects a value in the range \[0, 3\]} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_bfloat16_opt_n_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_bfloat16_opt_n_1.c +new file mode 100644 +index 000000000..2d09a8eeb +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_bfloat16_opt_n_1.c +@@ -0,0 +1,24 @@ ++/* { dg-do compile } */ ++ ++#include ++ ++#pragma GCC target ("arch=armv8.2-a+sve+bf16") ++ ++void ++f1 (svbool_t pg, svuint8_t u8, svuint16_t u16, svint32_t s32, ++ svbfloat16_t bf16, svfloat32_t f32, svfloat64_t f64, bfloat16_t bf) ++{ ++ svbfdot (f32, bf16); /* { dg-error {too few arguments to function 'svbfdot'} } */ ++ svbfdot (f32, bf16, bf16, 0); /* { dg-error {too many arguments to function 'svbfdot'} } */ ++ svbfdot (0, bf16, bf16); /* { dg-error {passing 'int' to argument 1 of 'svbfdot', which expects an SVE vector type} } */ ++ svbfdot (pg, bf16, bf16); /* { dg-error {'svbfdot' has no form that takes 'svbool_t' arguments} } */ ++ svbfdot (u8, bf16, bf16); /* { 
dg-error {'svbfdot' has no form that takes 'svuint8_t' arguments} } */ ++ svbfdot (u16, bf16, bf16); /* { dg-error {'svbfdot' has no form that takes 'svuint16_t' arguments} } */ ++ svbfdot (f64, bf16, bf16); /* { dg-error {'svbfdot' has no form that takes 'svfloat64_t' arguments} } */ ++ svbfdot (f32, bf16, bf16); ++ svbfdot (f32, 0, bf16); /* { dg-error {passing 'int' to argument 2 of 'svbfdot', which expects 'svbfloat16_t'} } */ ++ svbfdot (f32, f32, bf16); /* { dg-error {passing 'svfloat32_t' to argument 2 of 'svbfdot', which expects 'svbfloat16_t'} } */ ++ svbfdot (f32, bf16, 0); /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ ++ svbfdot (f32, bf16, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svbfdot', which expects 'svbfloat16_t'} } */ ++ svbfdot (f32, bf16, bf); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_intq_uintq_lane_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_intq_uintq_lane_1.c +new file mode 100644 +index 000000000..600be05a8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_intq_uintq_lane_1.c +@@ -0,0 +1,32 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-march=armv8.6-a+sve+i8mm" } */ ++ ++#include ++ ++void ++f1 (svbool_t pg, svint8_t s8, svuint8_t u8, svint16_t s16, svuint16_t u16, ++ svint32_t s32, svuint32_t u32, svint64_t s64, svuint64_t u64, ++ svfloat32_t f32, int i) ++{ ++ svsudot_lane (s32, s8, u8); /* { dg-error {too few arguments to function 'svsudot_lane'} } */ ++ svsudot_lane (s32, s8, u8, 0, 0); /* { dg-error {too many arguments to function 'svsudot_lane'} } */ ++ svsudot_lane (0, s8, u8, 0); /* { dg-error {passing 'int' to argument 1 of 'svsudot_lane', which expects an SVE vector type} } */ ++ svsudot_lane (pg, s8, u8, 0); /* { dg-error {'svsudot_lane' has no form that takes 'svbool_t' arguments} } */ ++ svsudot_lane (u8, s8, u8, 0); /* { dg-error {'svsudot_lane' has no form that takes 'svuint8_t' arguments} } */ ++ svsudot_lane (f32, s8, u8, 0); /* { dg-error {'svsudot_lane' has no form that takes 'svfloat32_t' arguments} } */ ++ svsudot_lane (u32, s8, u8, 0); /* { dg-error {'svsudot_lane' has no form that takes 'svuint32_t' arguments} } */ ++ svsudot_lane (s32, s8, u8, 0); ++ svsudot_lane (s32, 0, u8, 0); /* { dg-error {passing 'int' to argument 2 of 'svsudot_lane', which expects an SVE vector type} } */ ++ svsudot_lane (s32, s8, 0, 0); /* { dg-error {passing 'int' to argument 3 of 'svsudot_lane', which expects an SVE vector type} } */ ++ ++ svsudot_lane (s32, s8, u8, 0); ++ svsudot_lane (s32, u8, u8, 0); /* { dg-error {passing 'svuint8_t' to argument 2 of 'svsudot_lane', which expects a vector of signed integers} } */ ++ svsudot_lane (s32, s8, s8, 0); /* { dg-error {passing 'svint8_t' to argument 3 of 'svsudot_lane', which expects a vector of unsigned integers} } */ ++ svsudot_lane (s32, s32, s32, 0); /* { dg-error {passing 'svint32_t' instead of the expected 'svint8_t' to argument 2 of 'svsudot_lane', after passing 'svint32_t' to argument 1} } */ ++ ++ svsudot_lane (s32, s8, u8, i); /* { dg-error {argument 4 of 'svsudot_lane' must be an integer constant expression} } */ ++ svsudot_lane (s32, s8, u8, 0); ++ svsudot_lane (s32, s8, u8, 3); ++ svsudot_lane (s32, s8, u8, 4); /* { dg-error {passing 4 to argument 4 of 'svsudot_lane', which expects a value in the range \[0, 3\]} } */ ++ svsudot_lane (s32, s8, u8, -1); /* { dg-error {passing -1 to argument 4 of 'svsudot_lane', which expects a value in the range \[0, 3\]} } */ ++} +diff 
--git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_intq_uintq_opt_n_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_intq_uintq_opt_n_1.c +new file mode 100644 +index 000000000..f95ac582f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_intq_uintq_opt_n_1.c +@@ -0,0 +1,37 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-march=armv8.6-a+sve+i8mm" } */ ++ ++#include ++ ++svuint32_t ++f1 (svint32_t s32, svuint8_t u8, svint8_t s8, svuint32_t u32) ++{ ++ svsudot_s32 (s32); /* { dg-error {too few arguments to function 'svsudot_s32'} } */ ++ svsudot_s32 (s32, s8, u8, u32); /* { dg-error {too many arguments to function 'svsudot_s32'} } */ ++ svsudot_s32 (s32, s32, u8); /* { dg-error {incompatible type for argument 2 of 'svsudot_s32'} } */ ++ svsudot_s32 (s32, u8, u8); /* { dg-error {incompatible type for argument 2 of 'svsudot_s32'} } */ ++ svsudot_s32 (s32, s8, u32); /* { dg-error {incompatible type for argument 3 of 'svsudot_s32'} } */ ++ svsudot_s32 (s32, s8, s8); /* { dg-error {incompatible type for argument 3 of 'svsudot_s32'} } */ ++ svsudot_s32 (s32, s8, 0); /* { dg-error {incompatible type for argument 3 of 'svsudot_s32'} } */ ++ svsudot_s32 (s32, s8, u8); ++ return svsudot_s32 (s32, s8, u8); /* { dg-error {incompatible types when returning type 'svint32_t' but 'svuint32_t' was expected} } */ ++} ++ ++void ++f2 (svbool_t pg, svint8_t s8, svuint8_t u8, svuint32_t u32, ++ svint32_t s32, svfloat32_t f32) ++{ ++ svsudot (s32, s8); /* { dg-error {too few arguments to function 'svsudot'} } */ ++ svsudot (s32, s8, u8, u8); /* { dg-error {too many arguments to function 'svsudot'} } */ ++ svsudot (0, s8, u8); /* { dg-error {passing 'int' to argument 1 of 'svsudot', which expects an SVE vector type} } */ ++ svsudot (pg, s8, u8); /* { dg-error {'svsudot' has no form that takes 'svbool_t' arguments} } */ ++ svsudot (u8, s8, u8); /* { dg-error {'svsudot' has no form that takes 'svuint8_t' arguments} } */ ++ svsudot (f32, s8, u8); /* { dg-error {'svsudot' has no form that takes 'svfloat32_t' arguments} } */ ++ svsudot (s32, s8, u8); ++ svsudot (s32, 0, u8); /* { dg-error {passing 'int' to argument 2 of 'svsudot', which expects an SVE vector type} } */ ++ svsudot (s32, u8, u8); /* { dg-error {passing 'svuint8_t' to argument 2 of 'svsudot', which expects a vector of signed integers} } */ ++ svsudot (s32, s8, s8); /* { dg-error {passing 'svint8_t' to argument 3 of 'svsudot', which expects a vector of unsigned integers} } */ ++ svsudot (s32, s8, 0); ++ svsudot (s32, s8, u8); ++ svsudot (s32, u32, u32); /* { dg-error {passing 'svuint32_t' to argument 2 of 'svsudot', which expects a vector of signed integers} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_lane_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_lane_1.c +new file mode 100644 +index 000000000..bbd1f91be +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_lane_1.c +@@ -0,0 +1,35 @@ ++/* { dg-do compile } */ ++ ++#include ++ ++void ++f1 (svbool_t pg, svfloat16_t f16, svfloat32_t f32, svfloat64_t f64, ++ svint32_t s32, int i) ++{ ++ svmla_lane (f32, f32, f32); /* { dg-error {too few arguments to function 'svmla_lane'} } */ ++ svmla_lane (f32, f32, f32, 0, 0); /* { dg-error {too many arguments to function 'svmla_lane'} } */ ++ svmla_lane (pg, pg, pg, 0); /* { dg-error {'svmla_lane' has no form that takes 'svbool_t' arguments} } */ ++ svmla_lane (s32, s32, s32, 0); /* { dg-error {'svmla_lane' 
has no form that takes 'svint32_t' arguments} } */ ++ svmla_lane (1, f32, f32, 0); /* { dg-error {passing 'int' to argument 1 of 'svmla_lane', which expects an SVE vector type} } */ ++ svmla_lane (f32, 1, f32, 0); /* { dg-error {passing 'int' to argument 2 of 'svmla_lane', which expects an SVE vector type} } */ ++ svmla_lane (f32, f32, 1, 0); /* { dg-error {passing 'int' to argument 3 of 'svmla_lane', which expects an SVE vector type} } */ ++ svmla_lane (f32, f64, f32, 0); /* { dg-error {passing 'svfloat64_t' to argument 2 of 'svmla_lane', but previous arguments had type 'svfloat32_t'} } */ ++ svmla_lane (f32, f32, f64, 0); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svmla_lane', but previous arguments had type 'svfloat32_t'} } */ ++ svmla_lane (f32, f32, f32, s32); /* { dg-error {argument 4 of 'svmla_lane' must be an integer constant expression} } */ ++ svmla_lane (f32, f32, f32, i); /* { dg-error {argument 4 of 'svmla_lane' must be an integer constant expression} } */ ++ ++ svmla_lane (f16, f16, f16, 0); ++ svmla_lane (f16, f16, f16, 7); ++ svmla_lane (f16, f16, f16, 8); /* { dg-error {passing 8 to argument 4 of 'svmla_lane', which expects a value in the range \[0, 7\]} } */ ++ svmla_lane (f16, f16, f16, -1); /* { dg-error {passing -1 to argument 4 of 'svmla_lane', which expects a value in the range \[0, 7\]} } */ ++ ++ svmla_lane (f32, f32, f32, 0); ++ svmla_lane (f32, f32, f32, 3); ++ svmla_lane (f32, f32, f32, 4); /* { dg-error {passing 4 to argument 4 of 'svmla_lane', which expects a value in the range \[0, 3\]} } */ ++ svmla_lane (f32, f32, f32, -1); /* { dg-error {passing -1 to argument 4 of 'svmla_lane', which expects a value in the range \[0, 3\]} } */ ++ ++ svmla_lane (f64, f64, f64, 0); ++ svmla_lane (f64, f64, f64, 1); ++ svmla_lane (f64, f64, f64, 2); /* { dg-error {passing 2 to argument 4 of 'svmla_lane', which expects a value in the range \[0, 1\]} } */ ++ svmla_lane (f64, f64, f64, -1); /* { dg-error {passing -1 to argument 4 of 'svmla_lane', which expects a value in the range \[0, 1\]} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_lane_rotate_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_lane_rotate_1.c +new file mode 100644 +index 000000000..bccc6c7e2 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_lane_rotate_1.c +@@ -0,0 +1,38 @@ ++/* { dg-do compile } */ ++ ++#include ++ ++void ++f1 (svbool_t pg, svfloat16_t f16, svfloat32_t f32, svfloat64_t f64, ++ svint32_t s32, int i) ++{ ++ svcmla_lane (f32, f32, f32, 0); /* { dg-error {too few arguments to function 'svcmla_lane'} } */ ++ svcmla_lane (f32, f32, f32, 0, 90, 90); /* { dg-error {too many arguments to function 'svcmla_lane'} } */ ++ svcmla_lane (pg, pg, pg, 0, 90); /* { dg-error {'svcmla_lane' has no form that takes 'svbool_t' arguments} } */ ++ svcmla_lane (s32, s32, s32, 0, 90); /* { dg-error {'svcmla_lane' has no form that takes 'svint32_t' arguments} } */ ++ svcmla_lane (f64, f64, f64, 0, 90); /* { dg-error {'svcmla_lane' has no form that takes 'svfloat64_t' arguments} } */ ++ svcmla_lane (1, f32, f32, 0, 90); /* { dg-error {passing 'int' to argument 1 of 'svcmla_lane', which expects an SVE vector type} } */ ++ svcmla_lane (f32, 1, f32, 0, 90); /* { dg-error {passing 'int' to argument 2 of 'svcmla_lane', which expects an SVE vector type} } */ ++ svcmla_lane (f32, f32, 1, 0, 90); /* { dg-error {passing 'int' to argument 3 of 'svcmla_lane', which expects an SVE vector type} } */ ++ svcmla_lane (f32, f64, f32, 0, 
90); /* { dg-error {passing 'svfloat64_t' to argument 2 of 'svcmla_lane', but previous arguments had type 'svfloat32_t'} } */ ++ svcmla_lane (f32, f32, f64, 0, 90); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svcmla_lane', but previous arguments had type 'svfloat32_t'} } */ ++ svcmla_lane (f32, f32, f32, s32, 0); /* { dg-error {argument 4 of 'svcmla_lane' must be an integer constant expression} } */ ++ svcmla_lane (f32, f32, f32, i, 0); /* { dg-error {argument 4 of 'svcmla_lane' must be an integer constant expression} } */ ++ ++ svcmla_lane (f16, f16, f16, 0, 0); ++ svcmla_lane (f16, f16, f16, 3, 0); ++ svcmla_lane (f16, f16, f16, 4, 0); /* { dg-error {passing 4 to argument 4 of 'svcmla_lane', which expects a value in the range \[0, 3\]} } */ ++ svcmla_lane (f16, f16, f16, -1, 0); /* { dg-error {passing -1 to argument 4 of 'svcmla_lane', which expects a value in the range \[0, 3\]} } */ ++ ++ svcmla_lane (f32, f32, f32, 0, 0); ++ svcmla_lane (f32, f32, f32, 1, 0); ++ svcmla_lane (f32, f32, f32, 2, 0); /* { dg-error {passing 2 to argument 4 of 'svcmla_lane', which expects a value in the range \[0, 1\]} } */ ++ svcmla_lane (f32, f32, f32, -1, 0); /* { dg-error {passing -1 to argument 4 of 'svcmla_lane', which expects a value in the range \[0, 1\]} } */ ++ ++ svcmla_lane (f32, f32, f32, 0, -90); /* { dg-error {passing -90 to argument 5 of 'svcmla_lane', which expects 0, 90, 180 or 270} } */ ++ svcmla_lane (f32, f32, f32, 0, 0); ++ svcmla_lane (f32, f32, f32, 0, 1); /* { dg-error {passing 1 to argument 5 of 'svcmla_lane', which expects 0, 90, 180 or 270} } */ ++ svcmla_lane (f32, f32, f32, 0, 90); ++ svcmla_lane (f32, f32, f32, 0, 180); ++ svcmla_lane (f32, f32, f32, 0, 270); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_opt_n_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_opt_n_1.c +new file mode 100644 +index 000000000..c4a80e9da +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_opt_n_1.c +@@ -0,0 +1,34 @@ ++/* { dg-do compile } */ ++ ++#include ++ ++void ++f1 (svbool_t pg, svint8_t s8, svuint8_t u8, ++ svint16_t s16, svuint16_t u16, svfloat16_t f16) ++{ ++ svmla_x (pg, u8, u8); /* { dg-error {too few arguments to function 'svmla_x'} } */ ++ svmla_x (pg, u8, u8, u8, u8); /* { dg-error {too many arguments to function 'svmla_x'} } */ ++ svmla_x (u8, u8, u8, u8); /* { dg-error {passing 'svuint8_t' to argument 1 of 'svmla_x', which expects 'svbool_t'} } */ ++ svmla_x (pg, pg, pg, pg); /* { dg-error {'svmla_x' has no form that takes 'svbool_t' arguments} } */ ++ svmla_x (pg, 1, u8, u8); /* { dg-error {passing 'int' to argument 2 of 'svmla_x', which expects an SVE vector type} } */ ++ svmla_x (pg, u8, s8, u8); /* { dg-error {passing 'svint8_t' to argument 3 of 'svmla_x', but previous arguments had type 'svuint8_t'} } */ ++ svmla_x (pg, u8, u8, u8); ++ svmla_x (pg, u8, s16, u8); /* { dg-error {passing 'svint16_t' to argument 3 of 'svmla_x', but previous arguments had type 'svuint8_t'} } */ ++ svmla_x (pg, u8, u16, u8); /* { dg-error {passing 'svuint16_t' to argument 3 of 'svmla_x', but previous arguments had type 'svuint8_t'} } */ ++ svmla_x (pg, u8, f16, u8); /* { dg-error {passing 'svfloat16_t' to argument 3 of 'svmla_x', but previous arguments had type 'svuint8_t'} } */ ++ svmla_x (pg, u8, pg, u8); /* { dg-error {passing 'svbool_t' to argument 3 of 'svmla_x', but previous arguments had type 'svuint8_t'} } */ ++ svmla_x (pg, u8, 0, u8); /* { dg-error {passing 'int' to argument 3 of 'svmla_x', which 
expects an SVE vector type} } */ ++ svmla_x (pg, u8, u8, s8); /* { dg-error {passing 'svint8_t' to argument 4 of 'svmla_x', but previous arguments had type 'svuint8_t'} } */ ++ svmla_x (pg, u8, u8, s16); /* { dg-error {passing 'svint16_t' to argument 4 of 'svmla_x', but previous arguments had type 'svuint8_t'} } */ ++ svmla_x (pg, u8, u8, u16); /* { dg-error {passing 'svuint16_t' to argument 4 of 'svmla_x', but previous arguments had type 'svuint8_t'} } */ ++ svmla_x (pg, u8, u8, f16); /* { dg-error {passing 'svfloat16_t' to argument 4 of 'svmla_x', but previous arguments had type 'svuint8_t'} } */ ++ svmla_x (pg, u8, u8, pg); /* { dg-error {passing 'svbool_t' to argument 4 of 'svmla_x', but previous arguments had type 'svuint8_t'} } */ ++ svmla_x (pg, u8, u8, 0); ++ ++ svmla_x (pg, f16, s16, f16); /* { dg-error {passing 'svint16_t' to argument 3 of 'svmla_x', but previous arguments had type 'svfloat16_t'} } */ ++ svmla_x (pg, f16, u16, f16); /* { dg-error {passing 'svuint16_t' to argument 3 of 'svmla_x', but previous arguments had type 'svfloat16_t'} } */ ++ svmla_x (pg, f16, f16, s16); /* { dg-error {passing 'svint16_t' to argument 4 of 'svmla_x', but previous arguments had type 'svfloat16_t'} } */ ++ svmla_x (pg, f16, f16, u16); /* { dg-error {passing 'svuint16_t' to argument 4 of 'svmla_x', but previous arguments had type 'svfloat16_t'} } */ ++ svmla_x (pg, f16, f16, f16); ++ svmla_x (pg, f16, f16, 1); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_qq_lane_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_qq_lane_1.c +new file mode 100644 +index 000000000..e81552b64 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_qq_lane_1.c +@@ -0,0 +1,63 @@ ++/* { dg-do compile } */ ++ ++#include ++ ++void ++f1 (svbool_t pg, svint8_t s8, svuint8_t u8, svint16_t s16, svuint16_t u16, ++ svint32_t s32, svuint32_t u32, svint64_t s64, svuint64_t u64, ++ svfloat32_t f32, int i) ++{ ++ svdot_lane (u32, u8, u8); /* { dg-error {too few arguments to function 'svdot_lane'} } */ ++ svdot_lane (u32, u8, u8, 0, 0); /* { dg-error {too many arguments to function 'svdot_lane'} } */ ++ svdot_lane (0, u8, u8, 0); /* { dg-error {passing 'int' to argument 1 of 'svdot_lane', which expects an SVE vector type} } */ ++ svdot_lane (pg, u8, u8, 0); /* { dg-error {'svdot_lane' has no form that takes 'svbool_t' arguments} } */ ++ svdot_lane (u8, u8, u8, 0); /* { dg-error {'svdot_lane' has no form that takes 'svuint8_t' arguments} } */ ++ svdot_lane (f32, u8, u8, 0); /* { dg-error {'svdot_lane' has no form that takes 'svfloat32_t' arguments} } */ ++ svdot_lane (u32, u8, u8, 0); ++ svdot_lane (u32, 0, u8, 0); /* { dg-error {passing 'int' to argument 2 of 'svdot_lane', which expects an SVE vector type} } */ ++ svdot_lane (u32, u8, 0, 0); /* { dg-error {passing 'int' to argument 3 of 'svdot_lane', which expects an SVE vector type} } */ ++ ++ svdot_lane (s32, s8, s8, 0); ++ svdot_lane (s32, u8, s8, 0); /* { dg-error {arguments 1 and 2 of 'svdot_lane' must have the same signedness, but the values passed here have type 'svint32_t' and 'svuint8_t' respectively} } */ ++ svdot_lane (s32, s8, u8, 0); /* { dg-error {arguments 1 and 3 of 'svdot_lane' must have the same signedness, but the values passed here have type 'svint32_t' and 'svuint8_t' respectively} } */ ++ svdot_lane (s32, s32, s32, 0); /* { dg-error {passing 'svint32_t' instead of the expected 'svint8_t' to argument 2 of 'svdot_lane', after passing 'svint32_t' to argument 1} } */ ++ ++ svdot_lane 
(u32, u8, u8, 0); ++ svdot_lane (u32, s8, u8, 0); /* { dg-error {arguments 1 and 2 of 'svdot_lane' must have the same signedness, but the values passed here have type 'svuint32_t' and 'svint8_t' respectively} } */ ++ svdot_lane (u32, u8, s8, 0); /* { dg-error {arguments 1 and 3 of 'svdot_lane' must have the same signedness, but the values passed here have type 'svuint32_t' and 'svint8_t' respectively} } */ ++ svdot_lane (u32, u32, u32, 0); /* { dg-error {passing 'svuint32_t' instead of the expected 'svuint8_t' to argument 2 of 'svdot_lane', after passing 'svuint32_t' to argument 1} } */ ++ ++ svdot_lane (s64, s16, s16, 0); ++ svdot_lane (s64, u16, s16, 0); /* { dg-error {arguments 1 and 2 of 'svdot_lane' must have the same signedness, but the values passed here have type 'svint64_t' and 'svuint16_t' respectively} } */ ++ svdot_lane (s64, s16, u16, 0); /* { dg-error {arguments 1 and 3 of 'svdot_lane' must have the same signedness, but the values passed here have type 'svint64_t' and 'svuint16_t' respectively} } */ ++ svdot_lane (s64, s64, s64, 0); /* { dg-error {passing 'svint64_t' instead of the expected 'svint16_t' to argument 2 of 'svdot_lane', after passing 'svint64_t' to argument 1} } */ ++ ++ svdot_lane (u64, u16, u16, 0); ++ svdot_lane (u64, s16, u16, 0); /* { dg-error {arguments 1 and 2 of 'svdot_lane' must have the same signedness, but the values passed here have type 'svuint64_t' and 'svint16_t' respectively} } */ ++ svdot_lane (u64, u16, s16, 0); /* { dg-error {arguments 1 and 3 of 'svdot_lane' must have the same signedness, but the values passed here have type 'svuint64_t' and 'svint16_t' respectively} } */ ++ svdot_lane (u64, u64, u64, 0); /* { dg-error {passing 'svuint64_t' instead of the expected 'svuint16_t' to argument 2 of 'svdot_lane', after passing 'svuint64_t' to argument 1} } */ ++ ++ svdot_lane (s32, s8, s8, i); /* { dg-error {argument 4 of 'svdot_lane' must be an integer constant expression} } */ ++ svdot_lane (s32, s8, s8, 0); ++ svdot_lane (s32, s8, s8, 3); ++ svdot_lane (s32, s8, s8, 4); /* { dg-error {passing 4 to argument 4 of 'svdot_lane', which expects a value in the range \[0, 3\]} } */ ++ svdot_lane (s32, s8, s8, -1); /* { dg-error {passing -1 to argument 4 of 'svdot_lane', which expects a value in the range \[0, 3\]} } */ ++ ++ svdot_lane (u32, u8, u8, i); /* { dg-error {argument 4 of 'svdot_lane' must be an integer constant expression} } */ ++ svdot_lane (u32, u8, u8, 0); ++ svdot_lane (u32, u8, u8, 3); ++ svdot_lane (u32, u8, u8, 4); /* { dg-error {passing 4 to argument 4 of 'svdot_lane', which expects a value in the range \[0, 3\]} } */ ++ svdot_lane (u32, u8, u8, -1); /* { dg-error {passing -1 to argument 4 of 'svdot_lane', which expects a value in the range \[0, 3\]} } */ ++ ++ svdot_lane (s64, s16, s16, i); /* { dg-error {argument 4 of 'svdot_lane' must be an integer constant expression} } */ ++ svdot_lane (s64, s16, s16, 0); ++ svdot_lane (s64, s16, s16, 1); ++ svdot_lane (s64, s16, s16, 2); /* { dg-error {passing 2 to argument 4 of 'svdot_lane', which expects a value in the range \[0, 1\]} } */ ++ svdot_lane (s64, s16, s16, -1); /* { dg-error {passing -1 to argument 4 of 'svdot_lane', which expects a value in the range \[0, 1\]} } */ ++ ++ svdot_lane (u64, u16, u16, i); /* { dg-error {argument 4 of 'svdot_lane' must be an integer constant expression} } */ ++ svdot_lane (u64, u16, u16, 0); ++ svdot_lane (u64, u16, u16, 1); ++ svdot_lane (u64, u16, u16, 2); /* { dg-error {passing 2 to argument 4 of 'svdot_lane', which expects a value in the range \[0, 
1\]} } */ ++ svdot_lane (u64, u16, u16, -1); /* { dg-error {passing -1 to argument 4 of 'svdot_lane', which expects a value in the range \[0, 1\]} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_qq_opt_n_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_qq_opt_n_1.c +new file mode 100644 +index 000000000..b41e6fcce +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_qq_opt_n_1.c +@@ -0,0 +1,15 @@ ++/* { dg-do compile } */ ++ ++#include ++ ++svint32_t ++f1 (svuint32_t u32, svuint8_t u8, svint8_t s8) ++{ ++ svdot_u32 (u32); /* { dg-error {too few arguments to function 'svdot_u32'} } */ ++ svdot_u32 (u32, u8, u8, u32); /* { dg-error {too many arguments to function 'svdot_u32'} } */ ++ svdot_u32 (u32, u32, u8); /* { dg-error {incompatible type for argument 2 of 'svdot_u32'} } */ ++ svdot_u32 (u32, s8, u8); /* { dg-error {incompatible type for argument 2 of 'svdot_u32'} } */ ++ svdot_u32 (u32, u8, u32); /* { dg-error {incompatible type for argument 3 of 'svdot_u32'} } */ ++ svdot_u32 (u32, u8, s8); /* { dg-error {incompatible type for argument 3 of 'svdot_u32'} } */ ++ return svdot_u32 (u32, u8, u8); /* { dg-error {incompatible types when returning type 'svuint32_t' but 'svint32_t' was expected} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_qq_opt_n_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_qq_opt_n_2.c +new file mode 100644 +index 000000000..fee4096fe +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_qq_opt_n_2.c +@@ -0,0 +1,21 @@ ++/* { dg-do compile } */ ++ ++#include ++ ++void ++f1 (svbool_t pg, svint8_t s8, svuint8_t u8, svuint32_t u32, ++ svfloat32_t f32) ++{ ++ svdot (u32, u8); /* { dg-error {too few arguments to function 'svdot'} } */ ++ svdot (u32, u8, u8, u8); /* { dg-error {too many arguments to function 'svdot'} } */ ++ svdot (0, u8, u8); /* { dg-error {passing 'int' to argument 1 of 'svdot', which expects an SVE vector type} } */ ++ svdot (pg, u8, u8); /* { dg-error {'svdot' has no form that takes 'svbool_t' arguments} } */ ++ svdot (u8, u8, u8); /* { dg-error {'svdot' has no form that takes 'svuint8_t' arguments} } */ ++ svdot (f32, u8, u8); /* { dg-error {'svdot' has no form that takes 'svfloat32_t' arguments} } */ ++ svdot (u32, u8, u8); ++ svdot (u32, 0, u8); /* { dg-error {passing 'int' to argument 2 of 'svdot', which expects an SVE vector type} } */ ++ svdot (u32, s8, u8); /* { dg-error {arguments 1 and 2 of 'svdot' must have the same signedness, but the values passed here have type 'svuint32_t' and 'svint8_t' respectively} } */ ++ svdot (u32, u8, 0); ++ svdot (u32, u8, s8); /* { dg-error {arguments 1 and 3 of 'svdot' must have the same signedness, but the values passed here have type 'svuint32_t' and 'svint8_t' respectively} } */ ++ svdot (u32, u32, u32); /* { dg-error {passing 'svuint32_t' instead of the expected 'svuint8_t' to argument 2 of 'svdot', after passing 'svuint32_t' to argument 1} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_rotate_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_rotate_1.c +new file mode 100644 +index 000000000..f340e3d1e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_rotate_1.c +@@ -0,0 +1,26 @@ ++/* { dg-do compile } */ ++ ++#include ++ ++void ++f1 (svbool_t pg, svfloat32_t f32, svfloat64_t f64, svint32_t s32, int i) ++{ ++ svcmla_x (pg, f32, f32, f32); /* { dg-error {too few 
arguments to function 'svcmla_x'} } */ ++ svcmla_x (pg, f32, f32, f32, 90, 90); /* { dg-error {too many arguments to function 'svcmla_x'} } */ ++ svcmla_x (f32, f32, f32, f32, 90); /* { dg-error {passing 'svfloat32_t' to argument 1 of 'svcmla_x', which expects 'svbool_t'} } */ ++ svcmla_x (pg, pg, pg, pg, 90); /* { dg-error {'svcmla_x' has no form that takes 'svbool_t' arguments} } */ ++ svcmla_x (pg, s32, s32, s32, 90); /* { dg-error {'svcmla_x' has no form that takes 'svint32_t' arguments} } */ ++ svcmla_x (pg, 1, f32, f32, 90); /* { dg-error {passing 'int' to argument 2 of 'svcmla_x', which expects an SVE vector type} } */ ++ svcmla_x (pg, f32, 1, f32, 90); /* { dg-error {passing 'int' to argument 3 of 'svcmla_x', which expects an SVE vector type} } */ ++ svcmla_x (pg, f32, f32, 1, 90); /* { dg-error {passing 'int' to argument 4 of 'svcmla_x', which expects an SVE vector type} } */ ++ svcmla_x (pg, f32, f64, f32, 90); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svcmla_x', but previous arguments had type 'svfloat32_t'} } */ ++ svcmla_x (pg, f32, f32, f64, 90); /* { dg-error {passing 'svfloat64_t' to argument 4 of 'svcmla_x', but previous arguments had type 'svfloat32_t'} } */ ++ svcmla_x (pg, f32, f32, f32, s32); /* { dg-error {argument 5 of 'svcmla_x' must be an integer constant expression} } */ ++ svcmla_x (pg, f32, f32, f32, i); /* { dg-error {argument 5 of 'svcmla_x' must be an integer constant expression} } */ ++ svcmla_x (pg, f32, f32, f32, -90); /* { dg-error {passing -90 to argument 5 of 'svcmla_x', which expects 0, 90, 180 or 270} } */ ++ svcmla_x (pg, f32, f32, f32, 0); ++ svcmla_x (pg, f32, f32, f32, 1); /* { dg-error {passing 1 to argument 5 of 'svcmla_x', which expects 0, 90, 180 or 270} } */ ++ svcmla_x (pg, f32, f32, f32, 90); ++ svcmla_x (pg, f32, f32, f32, 180); ++ svcmla_x (pg, f32, f32, f32, 270); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_uintq_intq_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_uintq_intq_1.c +new file mode 100644 +index 000000000..f52fb39bf +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_uintq_intq_1.c +@@ -0,0 +1,37 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-march=armv8.6-a+sve+i8mm" } */ ++ ++#include ++ ++svuint32_t ++f1 (svint32_t s32, svuint8_t u8, svint8_t s8, svuint32_t u32) ++{ ++ svusmmla_s32 (s32); /* { dg-error {too few arguments to function 'svusmmla_s32'} } */ ++ svusmmla_s32 (s32, u8, s8, u32); /* { dg-error {too many arguments to function 'svusmmla_s32'} } */ ++ svusmmla_s32 (s32, u32, s8); /* { dg-error {incompatible type for argument 2 of 'svusmmla_s32'} } */ ++ svusmmla_s32 (s32, s8, s8); /* { dg-error {incompatible type for argument 2 of 'svusmmla_s32'} } */ ++ svusmmla_s32 (s32, u8, u8); /* { dg-error {incompatible type for argument 3 of 'svusmmla_s32'} } */ ++ svusmmla_s32 (s32, u8, s32); /* { dg-error {incompatible type for argument 3 of 'svusmmla_s32'} } */ ++ svusmmla_s32 (s32, u8, 0); /* { dg-error {incompatible type for argument 3 of 'svusmmla_s32'} } */ ++ svusmmla_s32 (s32, u8, s8); ++ return svusmmla_s32 (s32, u8, s8); /* { dg-error {incompatible types when returning type 'svint32_t' but 'svuint32_t' was expected} } */ ++} ++ ++void ++f2 (svbool_t pg, svint8_t s8, svuint8_t u8, svuint32_t u32, ++ svint32_t s32, svfloat32_t f32) ++{ ++ svusmmla (s32, u8); /* { dg-error {too few arguments to function 'svusmmla'} } */ ++ svusmmla (s32, u8, s8, u8); /* { dg-error {too many arguments to function 'svusmmla'} } */ 
++ svusmmla (0, u8, s8); /* { dg-error {passing 'int' to argument 1 of 'svusmmla', which expects an SVE vector type} } */ ++ svusmmla (pg, u8, s8); /* { dg-error {'svusmmla' has no form that takes 'svbool_t' arguments} } */ ++ svusmmla (u8, u8, s8); /* { dg-error {'svusmmla' has no form that takes 'svuint8_t' arguments} } */ ++ svusmmla (f32, u8, s8); /* { dg-error {'svusmmla' has no form that takes 'svfloat32_t' arguments} } */ ++ svusmmla (s32, u8, s8); ++ svusmmla (s32, 0, s8); /* { dg-error {passing 'int' to argument 2 of 'svusmmla', which expects an SVE vector type} } */ ++ svusmmla (s32, u8, u8); /* { dg-error {passing 'svuint8_t' to argument 3 of 'svusmmla', which expects a vector of signed integers} } */ ++ svusmmla (s32, s8, s8); /* { dg-error {passing 'svint8_t' to argument 2 of 'svusmmla', which expects a vector of unsigned integers} } */ ++ svusmmla (s32, u8, 0); /* { dg-error {passing 'int' to argument 3 of 'svusmmla', which expects an SVE vector type} } */ ++ svusmmla (s32, u8, s8); ++ svusmmla (s32, u32, u32); /* { dg-error {passing 'svuint32_t' instead of the expected 'svuint8_t' to argument 2 of 'svusmmla', after passing 'svint32_t' to argument 1} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_uintq_intq_lane_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_uintq_intq_lane_1.c +new file mode 100644 +index 000000000..b40cfe9e8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_uintq_intq_lane_1.c +@@ -0,0 +1,32 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-march=armv8.6-a+sve+i8mm" } */ ++ ++#include ++ ++void ++f1 (svbool_t pg, svint8_t s8, svuint8_t u8, svint16_t s16, svuint16_t u16, ++ svint32_t s32, svuint32_t u32, svint64_t s64, svuint64_t u64, ++ svfloat32_t f32, int i) ++{ ++ svusdot_lane (s32, u8, s8); /* { dg-error {too few arguments to function 'svusdot_lane'} } */ ++ svusdot_lane (s32, u8, s8, 0, 0); /* { dg-error {too many arguments to function 'svusdot_lane'} } */ ++ svusdot_lane (0, u8, s8, 0); /* { dg-error {passing 'int' to argument 1 of 'svusdot_lane', which expects an SVE vector type} } */ ++ svusdot_lane (pg, u8, s8, 0); /* { dg-error {'svusdot_lane' has no form that takes 'svbool_t' arguments} } */ ++ svusdot_lane (u8, u8, s8, 0); /* { dg-error {'svusdot_lane' has no form that takes 'svuint8_t' arguments} } */ ++ svusdot_lane (f32, u8, s8, 0); /* { dg-error {'svusdot_lane' has no form that takes 'svfloat32_t' arguments} } */ ++ svusdot_lane (u32, u8, s8, 0); /* { dg-error {'svusdot_lane' has no form that takes 'svuint32_t' arguments} } */ ++ svusdot_lane (s32, u8, s8, 0); ++ svusdot_lane (s32, 0, s8, 0); /* { dg-error {passing 'int' to argument 2 of 'svusdot_lane', which expects an SVE vector type} } */ ++ svusdot_lane (s32, u8, 0, 0); /* { dg-error {passing 'int' to argument 3 of 'svusdot_lane', which expects an SVE vector type} } */ ++ ++ svusdot_lane (s32, u8, s8, 0); ++ svusdot_lane (s32, s8, s8, 0); /* { dg-error {passing 'svint8_t' to argument 2 of 'svusdot_lane', which expects a vector of unsigned integers} } */ ++ svusdot_lane (s32, u8, u8, 0); /* { dg-error {passing 'svuint8_t' to argument 3 of 'svusdot_lane', which expects a vector of signed integers} } */ ++ svusdot_lane (s32, s32, s32, 0); /* { dg-error {passing 'svint32_t' to argument 2 of 'svusdot_lane', which expects a vector of unsigned integers} } */ ++ ++ svusdot_lane (s32, u8, s8, i); /* { dg-error {argument 4 of 'svusdot_lane' must be an integer constant expression} } */ ++ svusdot_lane 
(s32, u8, s8, 0); ++ svusdot_lane (s32, u8, s8, 3); ++ svusdot_lane (s32, u8, s8, 4); /* { dg-error {passing 4 to argument 4 of 'svusdot_lane', which expects a value in the range \[0, 3\]} } */ ++ svusdot_lane (s32, u8, s8, -1); /* { dg-error {passing -1 to argument 4 of 'svusdot_lane', which expects a value in the range \[0, 3\]} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_uintq_intq_opt_n_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_uintq_intq_opt_n_1.c +new file mode 100644 +index 000000000..896b80390 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_uintq_intq_opt_n_1.c +@@ -0,0 +1,37 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-march=armv8.6-a+sve+i8mm" } */ ++ ++#include ++ ++svuint32_t ++f1 (svint32_t s32, svuint8_t u8, svint8_t s8, svuint32_t u32) ++{ ++ svusdot_s32 (s32); /* { dg-error {too few arguments to function 'svusdot_s32'} } */ ++ svusdot_s32 (s32, u8, s8, u32); /* { dg-error {too many arguments to function 'svusdot_s32'} } */ ++ svusdot_s32 (s32, u32, s8); /* { dg-error {incompatible type for argument 2 of 'svusdot_s32'} } */ ++ svusdot_s32 (s32, s8, s8); /* { dg-error {incompatible type for argument 2 of 'svusdot_s32'} } */ ++ svusdot_s32 (s32, u8, u8); /* { dg-error {incompatible type for argument 3 of 'svusdot_s32'} } */ ++ svusdot_s32 (s32, u8, s32); /* { dg-error {incompatible type for argument 3 of 'svusdot_s32'} } */ ++ svusdot_s32 (s32, u8, 0); /* { dg-error {incompatible type for argument 3 of 'svusdot_s32'} } */ ++ svusdot_s32 (s32, u8, s8); ++ return svusdot_s32 (s32, u8, s8); /* { dg-error {incompatible types when returning type 'svint32_t' but 'svuint32_t' was expected} } */ ++} ++ ++void ++f2 (svbool_t pg, svint8_t s8, svuint8_t u8, svuint32_t u32, ++ svint32_t s32, svfloat32_t f32) ++{ ++ svusdot (s32, u8); /* { dg-error {too few arguments to function 'svusdot'} } */ ++ svusdot (s32, u8, s8, u8); /* { dg-error {too many arguments to function 'svusdot'} } */ ++ svusdot (0, u8, s8); /* { dg-error {passing 'int' to argument 1 of 'svusdot', which expects an SVE vector type} } */ ++ svusdot (pg, u8, s8); /* { dg-error {'svusdot' has no form that takes 'svbool_t' arguments} } */ ++ svusdot (u8, u8, s8); /* { dg-error {'svusdot' has no form that takes 'svuint8_t' arguments} } */ ++ svusdot (f32, u8, s8); /* { dg-error {'svusdot' has no form that takes 'svfloat32_t' arguments} } */ ++ svusdot (s32, u8, s8); ++ svusdot (s32, 0, s8); /* { dg-error {passing 'int' to argument 2 of 'svusdot', which expects an SVE vector type} } */ ++ svusdot (s32, u8, u8); /* { dg-error {passing 'svuint8_t' to argument 3 of 'svusdot', which expects a vector of signed integers} } */ ++ svusdot (s32, s8, s8); /* { dg-error {passing 'svint8_t' to argument 2 of 'svusdot', which expects a vector of unsigned integers} } */ ++ svusdot (s32, u8, 0); ++ svusdot (s32, u8, s8); ++ svusdot (s32, u32, u32); /* { dg-error {passing 'svuint32_t' instead of the expected 'svuint8_t' to argument 2 of 'svusdot', after passing 'svint32_t' to argument 1} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/tmad_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/tmad_1.c +new file mode 100644 +index 000000000..8b98fc24d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/tmad_1.c +@@ -0,0 +1,22 @@ ++/* { dg-do compile } */ ++ ++#include ++ ++void ++f1 (svbool_t pg, svfloat32_t f32, svfloat64_t f64, svint32_t s32, int i) ++{ ++ svtmad (f32, f32); /* { 
dg-error {too few arguments to function 'svtmad'} } */ ++ svtmad (f32, f32, 0, 0); /* { dg-error {too many arguments to function 'svtmad'} } */ ++ svtmad (pg, pg, 0); /* { dg-error {'svtmad' has no form that takes 'svbool_t' arguments} } */ ++ svtmad (s32, s32, 0); /* { dg-error {'svtmad' has no form that takes 'svint32_t' arguments} } */ ++ svtmad (1, f32, 0); /* { dg-error {passing 'int' to argument 1 of 'svtmad', which expects an SVE vector type} } */ ++ svtmad (f32, 1, 0); /* { dg-error {passing 'int' to argument 2 of 'svtmad', which expects an SVE vector type} } */ ++ svtmad (f32, f64, 0); /* { dg-error {passing 'svfloat64_t' to argument 2 of 'svtmad', but previous arguments had type 'svfloat32_t'} } */ ++ svtmad (f32, f32, s32); /* { dg-error {argument 3 of 'svtmad' must be an integer constant expression} } */ ++ svtmad (f32, f32, i); /* { dg-error {argument 3 of 'svtmad' must be an integer constant expression} } */ ++ svtmad (f32, f32, -1); /* { dg-error {passing -1 to argument 3 of 'svtmad', which expects a value in the range \[0, 7\]} } */ ++ svtmad (f32, f32, 0); ++ svtmad (f32, f32, 1); ++ svtmad (f32, f32, 7); ++ svtmad (f32, f32, 8); /* { dg-error {passing 8 to argument 3 of 'svtmad', which expects a value in the range \[0, 7\]} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_1.c +new file mode 100644 +index 000000000..70b2d9dd1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_1.c +@@ -0,0 +1,5 @@ ++/* { dg-do compile } */ ++ ++int svbool_t; /* { dg-message "note: previous declaration of 'svbool_t' was here" } */ ++ ++#pragma GCC aarch64 "arm_sve.h" /* { dg-error {'svbool_t' redeclared} } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_10.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_10.c +new file mode 100644 +index 000000000..8278c1cad +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_10.c +@@ -0,0 +1,5 @@ ++/* { dg-do compile } */ ++ ++typedef struct svint8x2_t svint8x2_t; /* { dg-message "note: previous declaration of 'svint8x2_t' was here" } */ ++ ++#pragma GCC aarch64 "arm_sve.h" /* { dg-error {conflicting types for 'svint8x2_t'} } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_11.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_11.c +new file mode 100644 +index 000000000..2147df72c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_11.c +@@ -0,0 +1,12 @@ ++/* { dg-do compile } */ ++ ++/* This isn't explicitly allowed or disallowed, but mustn't ICE. */ ++struct svint8x2_t; ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++void ++f (svint8x2_t *a, struct svint8x2_t *b) ++{ ++ *a = *b; /* { dg-error {dereferencing pointer to incomplete type} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_12.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_12.c +new file mode 100644 +index 000000000..1a6ccbd05 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_12.c +@@ -0,0 +1,12 @@ ++/* { dg-do compile } */ ++ ++/* This isn't explicitly allowed or disallowed, but mustn't ICE. 
*/ ++struct svint8x2_t { int x; }; ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++void ++f (svint8x2_t *a, struct svint8x2_t *b) ++{ ++ *a = *b; /* { dg-error {incompatible types} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_13.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_13.c +new file mode 100644 +index 000000000..62bab1f84 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_13.c +@@ -0,0 +1,5 @@ ++/* { dg-do compile } */ ++ ++#pragma GCC aarch64 "arm_sve.h" /* { dg-message "note: previous declaration of 'svint8x2_t' was here" } */ ++ ++int svint8x2_t; /* { dg-error {'svint8x2_t' redeclared} } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_14.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_14.c +new file mode 100644 +index 000000000..0f00db1fb +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_14.c +@@ -0,0 +1,5 @@ ++/* { dg-do compile } */ ++ ++enum svpattern { FOO }; /* { dg-message "note: originally defined here" } */ ++ ++#pragma GCC aarch64 "arm_sve.h" /* { dg-error {redeclaration of 'enum svpattern'} } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_15.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_15.c +new file mode 100644 +index 000000000..ea9721749 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_15.c +@@ -0,0 +1,8 @@ ++/* { dg-do compile } */ ++ ++#pragma GCC aarch64 "arm_sve.h" /* { dg-message "note: originally defined here" } */ ++ ++enum svpattern { FOO }; /* { dg-error {redeclaration of 'enum svpattern'} } */ ++enum foo { SV_ALL }; /* { dg-error {redeclaration of enumerator 'SV_ALL'} } */ ++typedef int SV_POW2; /* { dg-error {'SV_POW2' redeclared as different kind of symbol} } */ ++int SV_VL3; /* { dg-error {'SV_VL3' redeclared as different kind of symbol} } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_16.c +new file mode 100644 +index 000000000..a59dabc6c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_16.c +@@ -0,0 +1,5 @@ ++/* { dg-do compile } */ ++ ++struct svpattern { int x; }; ++ ++#pragma GCC aarch64 "arm_sve.h" /* { dg-error {'svpattern' defined as wrong kind of tag} } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_17.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_17.c +new file mode 100644 +index 000000000..027fdb2b9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_17.c +@@ -0,0 +1,5 @@ ++/* { dg-do compile } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++struct svpattern { int x; }; /* { dg-error {'svpattern' defined as wrong kind of tag} } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_18.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_18.c +new file mode 100644 +index 000000000..b6706150b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_18.c +@@ -0,0 +1,5 @@ ++/* { dg-do compile } */ ++ ++int svpattern; /* OK in C. 
*/ ++ ++#pragma GCC aarch64 "arm_sve.h" +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_19.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_19.c +new file mode 100644 +index 000000000..c6379f762 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_19.c +@@ -0,0 +1,5 @@ ++/* { dg-do compile } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++int svpattern; /* OK in C. */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_2.c +new file mode 100644 +index 000000000..ffd86ae7b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_2.c +@@ -0,0 +1,5 @@ ++/* { dg-do compile } */ ++ ++int svint8_t; /* { dg-message "note: previous declaration of 'svint8_t' was here" } */ ++ ++#pragma GCC aarch64 "arm_sve.h" /* { dg-error {'svint8_t' redeclared} } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_20.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_20.c +new file mode 100644 +index 000000000..3d770a956 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_20.c +@@ -0,0 +1,9 @@ ++/* { dg-do compile } */ ++ ++enum foo { SV_VL4 }; ++typedef int SV_POW2; ++int SV_ALL; ++ ++#pragma GCC aarch64 "arm_sve.h" /* { dg-error {redeclaration of enumerator 'SV_VL4'} } */ ++/* { dg-error {'SV_POW2' redeclared as different kind of symbol} "" { target *-*-* } .-1 } */ ++/* { dg-error {'SV_ALL' redeclared as different kind of symbol} "" { target *-*-* } .-2 } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_3.c +new file mode 100644 +index 000000000..f42dd9680 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_3.c +@@ -0,0 +1,5 @@ ++/* { dg-do compile } */ ++ ++int svuint16_t; /* { dg-message "note: previous declaration of 'svuint16_t' was here" } */ ++ ++#pragma GCC aarch64 "arm_sve.h" /* { dg-error {'svuint16_t' redeclared} } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_4.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_4.c +new file mode 100644 +index 000000000..91c95a1f5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_4.c +@@ -0,0 +1,5 @@ ++/* { dg-do compile } */ ++ ++int svfloat32_t; /* { dg-message "note: previous declaration of 'svfloat32_t' was here" } */ ++ ++#pragma GCC aarch64 "arm_sve.h" /* { dg-error {'svfloat32_t' redeclared} } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_5.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_5.c +new file mode 100644 +index 000000000..3cb6b8a1c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_5.c +@@ -0,0 +1,5 @@ ++/* { dg-do compile } */ ++ ++typedef int svbool_t; /* { dg-message "note: previous declaration of 'svbool_t' was here" } */ ++ ++#pragma GCC aarch64 "arm_sve.h" /* { dg-error {conflicting types for 'svbool_t'} } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_6.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_6.c +new file mode 100644 +index 000000000..c051897b6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_6.c +@@ -0,0 +1,6 @@ ++/* { dg-do compile } */ ++ ++typedef __SVBool_t svbool_t; /* { 
dg-message "note: previous declaration of 'svbool_t' was here" } */ ++ ++#pragma GCC aarch64 "arm_sve.h" /* { dg-error {redefinition of typedef 'svbool_t'} } */ ++ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_7.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_7.c +new file mode 100644 +index 000000000..fd4063154 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_7.c +@@ -0,0 +1,7 @@ ++/* { dg-do compile } */ ++/* { dg-options "-std=gnu90" } */ ++ ++typedef __SVBool_t svbool_t; ++ ++/* Without -pedantic-errors this should compile. */ ++#pragma GCC aarch64 "arm_sve.h" +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_8.c +new file mode 100644 +index 000000000..41614a304 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_8.c +@@ -0,0 +1,5 @@ ++/* { dg-do compile } */ ++ ++int svint8x2_t; /* { dg-message "note: previous declaration of 'svint8x2_t' was here" } */ ++ ++#pragma GCC aarch64 "arm_sve.h" /* { dg-error {'svint8x2_t' redeclared} } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_9.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_9.c +new file mode 100644 +index 000000000..83b6855df +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_9.c +@@ -0,0 +1,5 @@ ++/* { dg-do compile } */ ++ ++typedef int svint8x2_t; /* { dg-message "note: previous declaration of 'svint8x2_t' was here" } */ ++ ++#pragma GCC aarch64 "arm_sve.h" /* { dg-error {conflicting types for 'svint8x2_t'} } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_1.c +new file mode 100644 +index 000000000..eef85a01d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_1.c +@@ -0,0 +1,21 @@ ++/* { dg-do compile } */ ++ ++#include ++ ++void ++f1 (svbool_t pg, svint32_t s32, svuint32_t u32, svfloat32_t f32) ++{ ++ svabs_m (s32, pg); /* { dg-error {too few arguments to function 'svabs_m'} } */ ++ svabs_m (s32, pg, s32, s32); /* { dg-error {too many arguments to function 'svabs_m'} } */ ++ svabs_m (0, pg, s32); /* { dg-error {passing 'int' to argument 1 of 'svabs_m', which expects an SVE vector type} } */ ++ svabs_m (s32, s32, s32); /* { dg-error {passing 'svint32_t' to argument 2 of 'svabs_m', which expects 'svbool_t'} } */ ++ svabs_m (s32, 0, s32); /* { dg-error {passing 'int' to argument 2 of 'svabs_m', which expects 'svbool_t'} } */ ++ svabs_m (s32, pg, s32); ++ svabs_m (u32, pg, u32); /* { dg-error {'svabs_m' has no form that takes 'svuint32_t' arguments} } */ ++ svabs_m (f32, pg, f32); ++ svabs_m (s32, pg, u32); /* { dg-error {passing 'svuint32_t' to argument 3 of 'svabs_m', but previous arguments had type 'svint32_t'} } */ ++ svabs_m (s32, pg, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svabs_m', but previous arguments had type 'svint32_t'} } */ ++ svabs_m (s32, pg, pg); /* { dg-error {passing 'svbool_t' to argument 3 of 'svabs_m', but previous arguments had type 'svint32_t'} } */ ++ svabs_m (pg, pg, s32); /* { dg-error {passing 'svint32_t' to argument 3 of 'svabs_m', but previous arguments had type 'svbool_t'} } */ ++ svabs_m (pg, pg, pg); /* { dg-error {'svabs_m' has no form that takes 'svbool_t' arguments} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_2.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_2.c +new file mode 100644 +index 000000000..e94673a66 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_2.c +@@ -0,0 +1,15 @@ ++/* { dg-do compile } */ ++ ++#include ++ ++void ++f1 (svbool_t pg, svint8_t s8, svuint8_t u8) ++{ ++ svabs_x (pg); /* { dg-error {too few arguments to function 'svabs_x'} } */ ++ svabs_x (pg, s8, s8); /* { dg-error {too many arguments to function 'svabs_x'} } */ ++ svabs_x (s8, s8); /* { dg-error {passing 'svint8_t' to argument 1 of 'svabs_x', which expects 'svbool_t'} } */ ++ svabs_x (pg, pg); /* { dg-error {'svabs_x' has no form that takes 'svbool_t' arguments} } */ ++ svabs_x (pg, 1); /* { dg-error {passing 'int' to argument 2 of 'svabs_x', which expects an SVE vector type} } */ ++ svabs_x (pg, s8); ++ svabs_x (pg, u8); /* { dg-error {'svabs_x' has no form that takes 'svuint8_t' arguments} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_convert_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_convert_1.c +new file mode 100644 +index 000000000..caa4e623d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_convert_1.c +@@ -0,0 +1,73 @@ ++#include ++ ++void ++test (svbool_t pg, svint8_t s8, svuint8_t u8, ++ svint16_t s16, svuint16_t u16, svint32_t s32, svuint32_t u32, ++ svint64_t s64, svuint64_t u64, svfloat16_t f16, svfloat32_t f32, ++ svfloat64_t f64) ++{ ++ svcvt_f64_x (pg); /* { dg-error {too few arguments to function 'svcvt_f64_x'} } */ ++ svcvt_f64_x (pg, s32, 0); /* { dg-error {too many arguments to function 'svcvt_f64_x'} } */ ++ svcvt_f64_x (s32, s32); /* { dg-error {passing 'svint32_t' to argument 1 of 'svcvt_f64_x', which expects 'svbool_t'} } */ ++ svcvt_f64_x (pg, 0); /* { dg-error {passing 'int' to argument 2 of 'svcvt_f64_x', which expects an SVE vector type} } */ ++ ++ svcvt_f64_x (pg, s8); /* { dg-error {'svcvt_f64_x' has no form that takes 'svint8_t' arguments} } */ ++ svcvt_f64_x (pg, s16); /* { dg-error {'svcvt_f64_x' has no form that takes 'svint16_t' arguments} } */ ++ svcvt_f64_x (pg, s32); ++ svcvt_f64_x (pg, s64); ++ svcvt_f64_x (pg, u8); /* { dg-error {'svcvt_f64_x' has no form that takes 'svuint8_t' arguments} } */ ++ svcvt_f64_x (pg, u16); /* { dg-error {'svcvt_f64_x' has no form that takes 'svuint16_t' arguments} } */ ++ svcvt_f64_x (pg, u32); ++ svcvt_f64_x (pg, u64); ++ svcvt_f64_x (pg, f16); ++ svcvt_f64_x (pg, f32); ++ svcvt_f64_x (pg, f64); /* { dg-error {'svcvt_f64_x' has no form that takes 'svfloat64_t' arguments} } */ ++ ++ svcvt_f32_x (pg, s8); /* { dg-error {'svcvt_f32_x' has no form that takes 'svint8_t' arguments} } */ ++ svcvt_f32_x (pg, s16); /* { dg-error {'svcvt_f32_x' has no form that takes 'svint16_t' arguments} } */ ++ svcvt_f32_x (pg, s32); ++ svcvt_f32_x (pg, s64); ++ svcvt_f32_x (pg, u8); /* { dg-error {'svcvt_f32_x' has no form that takes 'svuint8_t' arguments} } */ ++ svcvt_f32_x (pg, u16); /* { dg-error {'svcvt_f32_x' has no form that takes 'svuint16_t' arguments} } */ ++ svcvt_f32_x (pg, u32); ++ svcvt_f32_x (pg, u64); ++ svcvt_f32_x (pg, f16); ++ svcvt_f32_x (pg, f32); /* { dg-error {'svcvt_f32_x' has no form that takes 'svfloat32_t' arguments} } */ ++ svcvt_f32_x (pg, f64); ++ ++ svcvt_f16_x (pg, s8); /* { dg-error {'svcvt_f16_x' has no form that takes 'svint8_t' arguments} } */ ++ svcvt_f16_x (pg, s16); ++ svcvt_f16_x (pg, s32); ++ svcvt_f16_x (pg, s64); ++ svcvt_f16_x (pg, u8); /* { dg-error {'svcvt_f16_x' has no form that takes 'svuint8_t' 
arguments} } */ ++ svcvt_f16_x (pg, u16); ++ svcvt_f16_x (pg, u32); ++ svcvt_f16_x (pg, u64); ++ svcvt_f16_x (pg, f16); /* { dg-error {'svcvt_f16_x' has no form that takes 'svfloat16_t' arguments} } */ ++ svcvt_f16_x (pg, f32); ++ svcvt_f16_x (pg, f64); ++ ++ svcvt_s64_x (pg, f16); ++ svcvt_s64_x (pg, f32); ++ svcvt_s64_x (pg, f64); ++ ++ svcvt_s32_x (pg, f16); ++ svcvt_s32_x (pg, f32); ++ svcvt_s32_x (pg, f64); ++ ++ svcvt_s16_x (pg, f16); ++ svcvt_s16_x (pg, f32); /* { dg-error {'svcvt_s16_x' has no form that takes 'svfloat32_t' arguments} } */ ++ svcvt_s16_x (pg, f64); /* { dg-error {'svcvt_s16_x' has no form that takes 'svfloat64_t' arguments} } */ ++ ++ svcvt_u64_x (pg, f16); ++ svcvt_u64_x (pg, f32); ++ svcvt_u64_x (pg, f64); ++ ++ svcvt_u32_x (pg, f16); ++ svcvt_u32_x (pg, f32); ++ svcvt_u32_x (pg, f64); ++ ++ svcvt_u16_x (pg, f16); ++ svcvt_u16_x (pg, f32); /* { dg-error {'svcvt_u16_x' has no form that takes 'svfloat32_t' arguments} } */ ++ svcvt_u16_x (pg, f64); /* { dg-error {'svcvt_u16_x' has no form that takes 'svfloat64_t' arguments} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_convert_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_convert_2.c +new file mode 100644 +index 000000000..ddbd93b69 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_convert_2.c +@@ -0,0 +1,76 @@ ++#include ++ ++void ++test (svbool_t pg, svint8_t s8, svuint8_t u8, ++ svint16_t s16, svuint16_t u16, svint32_t s32, svuint32_t u32, ++ svint64_t s64, svuint64_t u64, svfloat16_t f16, svfloat32_t f32, ++ svfloat64_t f64) ++{ ++ svcvt_f64_m (f64, pg); /* { dg-error {too few arguments to function 'svcvt_f64_m'} } */ ++ svcvt_f64_m (f64, pg, s32, 0); /* { dg-error {too many arguments to function 'svcvt_f64_m'} } */ ++ svcvt_f64_m (f32, pg, s32); /* { dg-error {passing 'svfloat32_t' to argument 1 of 'svcvt_f64_m', which expects 'svfloat64_t'} } */ ++ svcvt_f64_m (0, pg, s32); /* { dg-error {passing 'int' to argument 1 of 'svcvt_f64_m', which expects 'svfloat64_t'} } */ ++ svcvt_f64_m (pg, pg, s32); /* { dg-error {passing 'svbool_t' to argument 1 of 'svcvt_f64_m', which expects 'svfloat64_t'} } */ ++ svcvt_f64_m (f64, s32, s32); /* { dg-error {passing 'svint32_t' to argument 2 of 'svcvt_f64_m', which expects 'svbool_t'} } */ ++ svcvt_f64_m (f64, pg, 0); /* { dg-error {passing 'int' to argument 3 of 'svcvt_f64_m', which expects an SVE vector type} } */ ++ ++ svcvt_f64_m (f64, pg, s8); /* { dg-error {'svcvt_f64_m' has no form that takes 'svint8_t' arguments} } */ ++ svcvt_f64_m (f64, pg, s16); /* { dg-error {'svcvt_f64_m' has no form that takes 'svint16_t' arguments} } */ ++ svcvt_f64_m (f64, pg, s32); ++ svcvt_f64_m (f64, pg, s64); ++ svcvt_f64_m (f64, pg, u8); /* { dg-error {'svcvt_f64_m' has no form that takes 'svuint8_t' arguments} } */ ++ svcvt_f64_m (f64, pg, u16); /* { dg-error {'svcvt_f64_m' has no form that takes 'svuint16_t' arguments} } */ ++ svcvt_f64_m (f64, pg, u32); ++ svcvt_f64_m (f64, pg, u64); ++ svcvt_f64_m (f64, pg, f16); ++ svcvt_f64_m (f64, pg, f32); ++ svcvt_f64_m (f64, pg, f64); /* { dg-error {'svcvt_f64_m' has no form that takes 'svfloat64_t' arguments} } */ ++ ++ svcvt_f32_m (f32, pg, s8); /* { dg-error {'svcvt_f32_m' has no form that takes 'svint8_t' arguments} } */ ++ svcvt_f32_m (f32, pg, s16); /* { dg-error {'svcvt_f32_m' has no form that takes 'svint16_t' arguments} } */ ++ svcvt_f32_m (f32, pg, s32); ++ svcvt_f32_m (f32, pg, s64); ++ svcvt_f32_m (f32, pg, u8); /* { dg-error {'svcvt_f32_m' has no form 
that takes 'svuint8_t' arguments} } */ ++ svcvt_f32_m (f32, pg, u16); /* { dg-error {'svcvt_f32_m' has no form that takes 'svuint16_t' arguments} } */ ++ svcvt_f32_m (f32, pg, u32); ++ svcvt_f32_m (f32, pg, u64); ++ svcvt_f32_m (f32, pg, f16); ++ svcvt_f32_m (f32, pg, f32); /* { dg-error {'svcvt_f32_m' has no form that takes 'svfloat32_t' arguments} } */ ++ svcvt_f32_m (f32, pg, f64); ++ ++ svcvt_f16_m (f16, pg, s8); /* { dg-error {'svcvt_f16_m' has no form that takes 'svint8_t' arguments} } */ ++ svcvt_f16_m (f16, pg, s16); ++ svcvt_f16_m (f16, pg, s32); ++ svcvt_f16_m (f16, pg, s64); ++ svcvt_f16_m (f16, pg, u8); /* { dg-error {'svcvt_f16_m' has no form that takes 'svuint8_t' arguments} } */ ++ svcvt_f16_m (f16, pg, u16); ++ svcvt_f16_m (f16, pg, u32); ++ svcvt_f16_m (f16, pg, u64); ++ svcvt_f16_m (f16, pg, f16); /* { dg-error {'svcvt_f16_m' has no form that takes 'svfloat16_t' arguments} } */ ++ svcvt_f16_m (f16, pg, f32); ++ svcvt_f16_m (f16, pg, f64); ++ ++ svcvt_s64_m (s64, pg, f16); ++ svcvt_s64_m (s64, pg, f32); ++ svcvt_s64_m (s64, pg, f64); ++ ++ svcvt_s32_m (s32, pg, f16); ++ svcvt_s32_m (s32, pg, f32); ++ svcvt_s32_m (s32, pg, f64); ++ ++ svcvt_s16_m (s16, pg, f16); ++ svcvt_s16_m (s16, pg, f32); /* { dg-error {'svcvt_s16_m' has no form that takes 'svfloat32_t' arguments} } */ ++ svcvt_s16_m (s16, pg, f64); /* { dg-error {'svcvt_s16_m' has no form that takes 'svfloat64_t' arguments} } */ ++ ++ svcvt_u64_m (u64, pg, f16); ++ svcvt_u64_m (u64, pg, f32); ++ svcvt_u64_m (u64, pg, f64); ++ ++ svcvt_u32_m (u32, pg, f16); ++ svcvt_u32_m (u32, pg, f32); ++ svcvt_u32_m (u32, pg, f64); ++ ++ svcvt_u16_m (u16, pg, f16); ++ svcvt_u16_m (u16, pg, f32); /* { dg-error {'svcvt_u16_m' has no form that takes 'svfloat32_t' arguments} } */ ++ svcvt_u16_m (u16, pg, f64); /* { dg-error {'svcvt_u16_m' has no form that takes 'svfloat64_t' arguments} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_to_uint_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_to_uint_1.c +new file mode 100644 +index 000000000..888b52513 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_to_uint_1.c +@@ -0,0 +1,24 @@ ++/* { dg-do compile } */ ++ ++#include ++ ++void ++f1 (svbool_t pg, svint32_t s32, svuint32_t u32, svfloat32_t f32, ++ svint64_t s64, svuint64_t u64) ++{ ++ svclz_m (u32, pg); /* { dg-error {too few arguments to function 'svclz_m'} } */ ++ svclz_m (u32, pg, s32, s32); /* { dg-error {too many arguments to function 'svclz_m'} } */ ++ svclz_m (0, pg, f32); /* { dg-error {passing 'int' to argument 1 of 'svclz_m', which expects an SVE vector type} } */ ++ svclz_m (u32, u32, f32); /* { dg-error {passing 'svuint32_t' to argument 2 of 'svclz_m', which expects 'svbool_t'} } */ ++ svclz_m (u32, 0, f32); /* { dg-error {passing 'int' to argument 2 of 'svclz_m', which expects 'svbool_t'} } */ ++ svclz_m (u32, pg, s32); ++ svclz_m (u32, pg, u32); ++ svclz_m (u32, pg, f32); /* { dg-error {'svclz_m' has no form that takes 'svfloat32_t' arguments} } */ ++ svclz_m (u32, pg, pg); /* { dg-error {'svclz_m' has no form that takes 'svbool_t' arguments} } */ ++ ++ svclz_m (pg, pg, s32); /* { dg-error {passing 'svbool_t' to argument 1 of 'svclz_m', which expects a vector of unsigned integers} } */ ++ svclz_m (s32, pg, s32); /* { dg-error {passing 'svint32_t' to argument 1 of 'svclz_m', which expects a vector of unsigned integers} } */ ++ svclz_m (f32, pg, s32); /* { dg-error {passing 'svfloat32_t' to argument 1 of 'svclz_m', which expects a vector of unsigned 
integers} } */ ++ svclz_m (s64, pg, s32); /* { dg-error {passing 'svint64_t' to argument 1 of 'svclz_m', which expects a vector of unsigned integers} } */ ++ svclz_m (u64, pg, s32); /* { dg-error {arguments 1 and 3 of 'svclz_m' must have the same element size, but the values passed here have type 'svuint64_t' and 'svint32_t' respectively} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_to_uint_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_to_uint_2.c +new file mode 100644 +index 000000000..233e847e9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_to_uint_2.c +@@ -0,0 +1,25 @@ ++/* { dg-do compile } */ ++/* { dg-options "-flax-vector-conversions" } */ ++ ++#include ++ ++void ++f1 (svbool_t pg, svint32_t s32, svuint32_t u32, svfloat32_t f32, ++ svint64_t s64, svuint64_t u64) ++{ ++ svclz_m (u32, pg); /* { dg-error {too few arguments to function 'svclz_m'} } */ ++ svclz_m (u32, pg, s32, s32); /* { dg-error {too many arguments to function 'svclz_m'} } */ ++ svclz_m (0, pg, f32); /* { dg-error {passing 'int' to argument 1 of 'svclz_m', which expects an SVE vector type} } */ ++ svclz_m (u32, u32, f32); /* { dg-error {passing 'svuint32_t' to argument 2 of 'svclz_m', which expects 'svbool_t'} } */ ++ svclz_m (u32, 0, f32); /* { dg-error {passing 'int' to argument 2 of 'svclz_m', which expects 'svbool_t'} } */ ++ svclz_m (u32, pg, s32); ++ svclz_m (u32, pg, u32); ++ svclz_m (u32, pg, f32); /* { dg-error {'svclz_m' has no form that takes 'svfloat32_t' arguments} } */ ++ svclz_m (u32, pg, pg); /* { dg-error {'svclz_m' has no form that takes 'svbool_t' arguments} } */ ++ ++ svclz_m (pg, pg, s32); /* { dg-error {passing 'svbool_t' to argument 1 of 'svclz_m', which expects a vector of unsigned integers} } */ ++ svclz_m (s32, pg, s32); /* { dg-error {passing 'svint32_t' to argument 1 of 'svclz_m', which expects a vector of unsigned integers} } */ ++ svclz_m (f32, pg, s32); /* { dg-error {passing 'svfloat32_t' to argument 1 of 'svclz_m', which expects a vector of unsigned integers} } */ ++ svclz_m (s64, pg, s32); /* { dg-error {passing 'svint64_t' to argument 1 of 'svclz_m', which expects a vector of unsigned integers} } */ ++ svclz_m (u64, pg, s32); /* { dg-error {arguments 1 and 3 of 'svclz_m' must have the same element size, but the values passed here have type 'svuint64_t' and 'svint32_t' respectively} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_to_uint_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_to_uint_3.c +new file mode 100644 +index 000000000..da57b07ea +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_to_uint_3.c +@@ -0,0 +1,14 @@ ++/* { dg-do compile } */ ++ ++#include ++ ++void ++f1 (svbool_t pg, svuint8_t u8) ++{ ++ svcnt_x (pg); /* { dg-error {too few arguments to function 'svcnt_x'} } */ ++ svcnt_x (pg, u8, u8); /* { dg-error {too many arguments to function 'svcnt_x'} } */ ++ svcnt_x (u8, u8); /* { dg-error {passing 'svuint8_t' to argument 1 of 'svcnt_x', which expects 'svbool_t'} } */ ++ svcnt_x (pg, pg); /* { dg-error {'svcnt_x' has no form that takes 'svbool_t' arguments} } */ ++ svcnt_x (pg, 1); /* { dg-error {passing 'int' to argument 2 of 'svcnt_x', which expects an SVE vector type} } */ ++ svcnt_x (pg, u8); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_uint_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_uint_1.c +new file mode 100644 +index 000000000..9c8acdf2d +--- 
/dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_uint_1.c +@@ -0,0 +1,19 @@ ++/* { dg-do compile } */ ++ ++#include ++ ++void ++f1 (svbool_t pg, svint8_t s8, svuint8_t u8, ++ svint16_t s16, svuint16_t u16, svfloat16_t f16) ++{ ++ svexpa (); /* { dg-error {too few arguments to function 'svexpa'} } */ ++ svexpa (u16, u16); /* { dg-error {too many arguments to function 'svexpa'} } */ ++ svexpa (1); /* { dg-error {passing 'int' to argument 1 of 'svexpa', which expects an SVE vector type} } */ ++ svexpa (pg); /* { dg-error {passing 'svbool_t' to argument 1 of 'svexpa', which expects a vector of unsigned integers} } */ ++ svexpa (s8); /* { dg-error {passing 'svint8_t' to argument 1 of 'svexpa', which expects a vector of unsigned integers} } */ ++ svexpa (s16); /* { dg-error {passing 'svint16_t' to argument 1 of 'svexpa', which expects a vector of unsigned integers} } */ ++ svexpa (f16); /* { dg-error {passing 'svfloat16_t' to argument 1 of 'svexpa', which expects a vector of unsigned integers} } */ ++ ++ svexpa (u8); /* { dg-error {'svexpa' has no form that takes 'svuint8_t' arguments} } */ ++ svexpa (u16); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_widen_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_widen_1.c +new file mode 100644 +index 000000000..95a97a72e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_widen_1.c +@@ -0,0 +1,25 @@ ++#include ++ ++void ++test (svbool_t pg, svint8_t s8, svuint8_t u8, ++ svint16_t s16, svuint16_t u16, svfloat16_t f16, ++ svint32_t s32, svuint32_t u32, svfloat32_t f32, ++ svint64_t s64, svuint64_t u64, svfloat64_t f64, float f, int i) ++{ ++ svunpklo (); /* { dg-error {too few arguments to function 'svunpklo'} } */ ++ svunpklo (pg, s8); /* { dg-error {too many arguments to function 'svunpklo'} } */ ++ svunpklo (i); /* { dg-error {passing 'int' to argument 1 of 'svunpklo', which expects an SVE vector type} } */ ++ svunpklo (f); /* { dg-error {passing 'float' to argument 1 of 'svunpklo', which expects an SVE vector type} } */ ++ svunpklo (pg); ++ svunpklo (s8); ++ svunpklo (s16); ++ svunpklo (s32); ++ svunpklo (s64); /* { dg-error {'svunpklo' has no form that takes 'svint64_t' arguments} } */ ++ svunpklo (u8); ++ svunpklo (u16); ++ svunpklo (u32); ++ svunpklo (u64); /* { dg-error {'svunpklo' has no form that takes 'svuint64_t' arguments} } */ ++ svunpklo (f16); /* { dg-error {'svunpklo' has no form that takes 'svfloat16_t' arguments} } */ ++ svunpklo (f32); /* { dg-error {'svunpklo' has no form that takes 'svfloat32_t' arguments} } */ ++ svunpklo (f64); /* { dg-error {'svunpklo' has no form that takes 'svfloat64_t' arguments} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/undeclared_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/undeclared_1.c +new file mode 100644 +index 000000000..37524c2ed +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/undeclared_1.c +@@ -0,0 +1,17 @@ ++#include ++ ++void ++f (svint8_t s8, svuint16_t u16, svfloat32_t f32, ++ svint16x2_t s16x2, svuint32x3_t u32x3, svfloat64x4_t f64x4, ++ svbool_t pg) ++{ ++ s8 = no_ret_s8 (); /* { dg-error {incompatible types when assigning to type 'svint8_t' from type 'int'} } */ ++ u16 = no_ret_u16 (); /* { dg-error {incompatible types when assigning to type 'svuint16_t' from type 'int'} } */ ++ f32 = no_ret_f32 (); /* { dg-error {incompatible types when assigning to type 'svfloat32_t' from type 'int'} } */ ++ s16x2 = no_ret_s16x2 (); /* { 
dg-error {incompatible types when assigning to type 'svint16x2_t' from type 'int'} } */ ++ u32x3 = no_ret_u32x3 (); /* { dg-error {incompatible types when assigning to type 'svuint32x3_t' from type 'int'} } */ ++ f64x4 = no_ret_f64x4 (); /* { dg-error {incompatible types when assigning to type 'svfloat64x4_t' from type 'int'} } */ ++ pg = no_ret_pg (); /* { dg-error {incompatible types when assigning to type 'svbool_t' from type 'int'} } */ ++ ++ no_pass_args (pg, u16, f32, s16x2, u32x3, f64x4, pg); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/undeclared_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/undeclared_2.c +new file mode 100644 +index 000000000..7e869bda8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/undeclared_2.c +@@ -0,0 +1,15 @@ ++#include ++ ++void ++f (svint8_t s8, svuint16_t u16, svfloat32_t f32, ++ svint16x2_t s16x2, svuint32x3_t u32x3, svfloat64x4_t f64x4, ++ svbool_t pg) ++{ ++ s8 = svlsr_x (pg, s8, 1); /* { dg-error {'svlsr_x' has no form that takes 'svint8_t' arguments} } */ ++ u16 = svneg_x (pg, u16); /* { dg-error {'svneg_x' has no form that takes 'svuint16_t' arguments} } */ ++ f32 = svclz_x (pg, f32); /* { dg-error {'svclz_x' has no form that takes 'svfloat32_t' arguments} } */ ++ s16x2 = svcreate2 (s8); /* { dg-error {too few arguments to function 'svcreate2'} } */ ++ u32x3 = svcreate3 (u16, u16, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svcreate3', but previous arguments had type 'svuint16_t'} } */ ++ f64x4 = svcreate4 (f32, f32, f32, f32, f32); /* { dg-error {too many arguments to function 'svcreate4'} } */ ++ pg = svadd_x (pg, pg, pg); /* { dg-error {'svadd_x' has no form that takes 'svbool_t' arguments} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/add_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/add_1.c +new file mode 100644 +index 000000000..f5c6285f8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/add_1.c +@@ -0,0 +1,13 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -fdump-tree-optimized" } */ ++ ++#include ++ ++void ++foo (svint8_t *res1, svint8_t *res2, svbool_t pg, svint8_t a, svint8_t b) ++{ ++ *res1 = svadd_m (pg, a, b); ++ *res2 = svadd_m (pg, a, b); ++} ++ ++/* { dg-final { scan-tree-dump-times {svadd_s8_m|svadd_m} 1 "optimized" } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/and_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/and_1.c +new file mode 100644 +index 000000000..59348cece +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/and_1.c +@@ -0,0 +1,22 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2" } */ ++ ++#include ++ ++void ++test1 (svbool_t pg, svbool_t x, svbool_t y, int *any, svbool_t *ptr) ++{ ++ svbool_t res = svand_z (pg, x, y); ++ *any = svptest_any (pg, res); ++ *ptr = res; ++} ++ ++int ++test2 (svbool_t pg, svbool_t x, svbool_t y, int *any) ++{ ++ svbool_t res = svand_z (pg, x, y); ++ return svptest_any (pg, res); ++} ++ ++/* { dg-final { scan-assembler-times {\tands\t} 2 } } */ ++/* { dg-final { scan-assembler-not {\tand\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/bic_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/bic_1.c +new file mode 100644 +index 000000000..e1c484995 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/bic_1.c +@@ -0,0 +1,22 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2" } */ ++ ++#include ++ ++void ++test1 (svbool_t pg, svbool_t x, svbool_t y, 
int *any, svbool_t *ptr) ++{ ++ svbool_t res = svbic_z (pg, x, y); ++ *any = svptest_any (pg, res); ++ *ptr = res; ++} ++ ++int ++test2 (svbool_t pg, svbool_t x, svbool_t y, int *any) ++{ ++ svbool_t res = svbic_z (pg, x, y); ++ return svptest_any (pg, res); ++} ++ ++/* { dg-final { scan-assembler-times {\tbics\t} 2 } } */ ++/* { dg-final { scan-assembler-not {\tbic\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/brka_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/brka_1.c +new file mode 100644 +index 000000000..24aa8f317 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/brka_1.c +@@ -0,0 +1,22 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2" } */ ++ ++#include ++ ++void ++test1 (svbool_t pg, svbool_t x, svbool_t y, int *any, svbool_t *ptr) ++{ ++ svbool_t res = svbrka_m (x, pg, y); ++ *any = svptest_any (pg, res); ++ *ptr = res; ++} ++ ++int ++test2 (svbool_t pg, svbool_t x, svbool_t y, int *any) ++{ ++ svbool_t res = svbrka_m (x, pg, y); ++ return svptest_any (pg, res); ++} ++ ++/* { dg-final { scan-assembler-times {\tbrkas\tp[0-9]+\.b, p[0-9]+/m,} 2 } } */ ++/* { dg-final { scan-assembler-not {\tbrka\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/brka_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/brka_2.c +new file mode 100644 +index 000000000..8aa338867 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/brka_2.c +@@ -0,0 +1,22 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2" } */ ++ ++#include ++ ++void ++test1 (svbool_t pg, svbool_t x, int *any, svbool_t *ptr) ++{ ++ svbool_t res = svbrka_z (pg, x); ++ *any = svptest_any (pg, res); ++ *ptr = res; ++} ++ ++int ++test2 (svbool_t pg, svbool_t x, int *any) ++{ ++ svbool_t res = svbrka_z (pg, x); ++ return svptest_any (pg, res); ++} ++ ++/* { dg-final { scan-assembler-times {\tbrkas\tp[0-9]+\.b, p[0-9]+/z,} 2 } } */ ++/* { dg-final { scan-assembler-not {\tbrka\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/brkb_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/brkb_1.c +new file mode 100644 +index 000000000..07e3622ed +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/brkb_1.c +@@ -0,0 +1,22 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2" } */ ++ ++#include ++ ++void ++test1 (svbool_t pg, svbool_t x, svbool_t y, int *any, svbool_t *ptr) ++{ ++ svbool_t res = svbrkb_m (x, pg, y); ++ *any = svptest_any (pg, res); ++ *ptr = res; ++} ++ ++int ++test2 (svbool_t pg, svbool_t x, svbool_t y, int *any) ++{ ++ svbool_t res = svbrkb_m (x, pg, y); ++ return svptest_any (pg, res); ++} ++ ++/* { dg-final { scan-assembler-times {\tbrkbs\tp[0-9]+\.b, p[0-9]+/m,} 2 } } */ ++/* { dg-final { scan-assembler-not {\tbrkb\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/brkb_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/brkb_2.c +new file mode 100644 +index 000000000..ee677cedd +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/brkb_2.c +@@ -0,0 +1,22 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2" } */ ++ ++#include ++ ++void ++test1 (svbool_t pg, svbool_t x, int *any, svbool_t *ptr) ++{ ++ svbool_t res = svbrkb_z (pg, x); ++ *any = svptest_any (pg, res); ++ *ptr = res; ++} ++ ++int ++test2 (svbool_t pg, svbool_t x, int *any) ++{ ++ svbool_t res = svbrkb_z (pg, x); ++ return svptest_any (pg, res); ++} ++ ++/* { dg-final { scan-assembler-times {\tbrkbs\tp[0-9]+\.b, p[0-9]+/z,} 2 } } */ ++/* { dg-final { scan-assembler-not 
{\tbrkb\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/brkn_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/brkn_1.c +new file mode 100644 +index 000000000..7fd9318c1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/brkn_1.c +@@ -0,0 +1,22 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2" } */ ++ ++#include ++ ++void ++test1 (svbool_t pg, svbool_t x, svbool_t y, int *any, svbool_t *ptr) ++{ ++ svbool_t res = svbrkn_z (pg, x, y); ++ *any = svptest_any (pg, res); ++ *ptr = res; ++} ++ ++int ++test2 (svbool_t pg, svbool_t x, svbool_t y, int *any) ++{ ++ svbool_t res = svbrkn_z (pg, x, y); ++ return svptest_any (pg, res); ++} ++ ++/* { dg-final { scan-assembler-times {\tbrkns\t} 2 } } */ ++/* { dg-final { scan-assembler-not {\tbrkn\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/brkpa_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/brkpa_1.c +new file mode 100644 +index 000000000..18cca370c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/brkpa_1.c +@@ -0,0 +1,22 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2" } */ ++ ++#include ++ ++void ++test1 (svbool_t pg, svbool_t x, svbool_t y, int *any, svbool_t *ptr) ++{ ++ svbool_t res = svbrkpa_z (pg, x, y); ++ *any = svptest_any (pg, res); ++ *ptr = res; ++} ++ ++int ++test2 (svbool_t pg, svbool_t x, svbool_t y, int *any) ++{ ++ svbool_t res = svbrkpa_z (pg, x, y); ++ return svptest_any (pg, res); ++} ++ ++/* { dg-final { scan-assembler-times {\tbrkpas\t} 2 } } */ ++/* { dg-final { scan-assembler-not {\tbrkpa\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/brkpb_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/brkpb_1.c +new file mode 100644 +index 000000000..73eb7094d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/brkpb_1.c +@@ -0,0 +1,22 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2" } */ ++ ++#include ++ ++void ++test1 (svbool_t pg, svbool_t x, svbool_t y, int *any, svbool_t *ptr) ++{ ++ svbool_t res = svbrkpb_z (pg, x, y); ++ *any = svptest_any (pg, res); ++ *ptr = res; ++} ++ ++int ++test2 (svbool_t pg, svbool_t x, svbool_t y, int *any) ++{ ++ svbool_t res = svbrkpb_z (pg, x, y); ++ return svptest_any (pg, res); ++} ++ ++/* { dg-final { scan-assembler-times {\tbrkpbs\t} 2 } } */ ++/* { dg-final { scan-assembler-not {\tbrkpb\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cmpeq_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cmpeq_1.c +new file mode 100644 +index 000000000..dd8f6c494 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cmpeq_1.c +@@ -0,0 +1,22 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2" } */ ++ ++#include ++ ++void ++test1 (svbool_t pg, svint8_t x, svint64_t y, int *any, svbool_t *ptr) ++{ ++ svbool_t res = svcmpeq_wide (pg, x, y); ++ *any = svptest_any (pg, res); ++ *ptr = res; ++} ++ ++int ++test2 (svbool_t pg, svint8_t x, svint64_t y, int *any) ++{ ++ svbool_t res = svcmpeq_wide (pg, x, y); ++ return svptest_any (pg, res); ++} ++ ++/* { dg-final { scan-assembler-times {\tcmpeq\t} 2 } } */ ++/* { dg-final { scan-assembler-not {\tptest\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cmpeq_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cmpeq_2.c +new file mode 100644 +index 000000000..028d37516 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cmpeq_2.c +@@ -0,0 +1,38 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2" } */ ++ 
++#include ++ ++void ++test1 (svbool_t pg, svint8_t x, svint8_t y, int *any, svbool_t *ptr) ++{ ++ svbool_t res = svcmpeq (pg, x, y); ++ *any = svptest_any (pg, res); ++ *ptr = res; ++} ++ ++int ++test2 (svbool_t pg, svint8_t x, svint8_t y, int *any) ++{ ++ svbool_t res = svcmpeq (pg, x, y); ++ return svptest_any (pg, res); ++} ++ ++void ++test3 (svbool_t pg, svint8_t x, int *any, svbool_t *ptr) ++{ ++ svbool_t res = svcmpeq (pg, x, 10); ++ *any = svptest_any (pg, res); ++ *ptr = res; ++} ++ ++int ++test4 (svbool_t pg, svint8_t x, int *any) ++{ ++ svbool_t res = svcmpeq (pg, x, 10); ++ return svptest_any (pg, res); ++} ++ ++/* { dg-final { scan-assembler-times {\tcmpeq\t} 4 } } */ ++/* { dg-final { scan-assembler-times {\tcmpeq\t[^\n]*, #10} 2 } } */ ++/* { dg-final { scan-assembler-not {\tptest\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cmpeq_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cmpeq_3.c +new file mode 100644 +index 000000000..115b26c8e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cmpeq_3.c +@@ -0,0 +1,38 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2" } */ ++ ++#include ++ ++void ++test1 (svbool_t pg, svfloat32_t x, svfloat32_t y, int *any, svbool_t *ptr) ++{ ++ svbool_t res = svcmpeq (pg, x, y); ++ *any = svptest_any (pg, res); ++ *ptr = res; ++} ++ ++int ++test2 (svbool_t pg, svfloat32_t x, svfloat32_t y, int *any) ++{ ++ svbool_t res = svcmpeq (pg, x, y); ++ return svptest_any (pg, res); ++} ++ ++void ++test3 (svbool_t pg, svfloat32_t x, int *any, svbool_t *ptr) ++{ ++ svbool_t res = svcmpeq (pg, x, 0.0); ++ *any = svptest_any (pg, res); ++ *ptr = res; ++} ++ ++int ++test4 (svbool_t pg, svfloat32_t x, int *any) ++{ ++ svbool_t res = svcmpeq (pg, x, 0.0); ++ return svptest_any (pg, res); ++} ++ ++/* { dg-final { scan-assembler-times {\tfcmeq\t} 4 } } */ ++/* { dg-final { scan-assembler-times {\tfcmeq\t[^\n]*, #0\.0} 2 } } */ ++/* { dg-final { scan-assembler-times {\tptest\t} 4 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cntb_pat_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cntb_pat_1.c +new file mode 100644 +index 000000000..d57a75c20 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cntb_pat_1.c +@@ -0,0 +1,132 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-O -msve-vector-bits=256" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/* ++** cntb_pow2: ++** mov x0, #?32 ++** ret ++*/ ++uint64_t cntb_pow2 () { return svcntb_pat (SV_POW2); } ++ ++/* ++** cntb_vl1: ++** mov x0, #?1 ++** ret ++*/ ++uint64_t cntb_vl1 () { return svcntb_pat (SV_VL1); } ++ ++/* ++** cntb_vl2: ++** mov x0, #?2 ++** ret ++*/ ++uint64_t cntb_vl2 () { return svcntb_pat (SV_VL2); } ++ ++/* ++** cntb_vl3: ++** mov x0, #?3 ++** ret ++*/ ++uint64_t cntb_vl3 () { return svcntb_pat (SV_VL3); } ++ ++/* ++** cntb_vl4: ++** mov x0, #?4 ++** ret ++*/ ++uint64_t cntb_vl4 () { return svcntb_pat (SV_VL4); } ++ ++/* ++** cntb_vl5: ++** mov x0, #?5 ++** ret ++*/ ++uint64_t cntb_vl5 () { return svcntb_pat (SV_VL5); } ++ ++/* ++** cntb_vl6: ++** mov x0, #?6 ++** ret ++*/ ++uint64_t cntb_vl6 () { return svcntb_pat (SV_VL6); } ++ ++/* ++** cntb_vl7: ++** mov x0, #?7 ++** ret ++*/ ++uint64_t cntb_vl7 () { return svcntb_pat (SV_VL7); } ++ ++/* ++** cntb_vl8: ++** mov x0, #?8 ++** ret ++*/ ++uint64_t cntb_vl8 () { return svcntb_pat (SV_VL8); } ++ ++/* ++** cntb_vl16: ++** mov x0, #?16 ++** ret ++*/ 
++uint64_t cntb_vl16 () { return svcntb_pat (SV_VL16); } ++ ++/* ++** cntb_vl32: ++** mov x0, #?32 ++** ret ++*/ ++uint64_t cntb_vl32 () { return svcntb_pat (SV_VL32); } ++ ++/* ++** cntb_vl64: ++** mov x0, #?0 ++** ret ++*/ ++uint64_t cntb_vl64 () { return svcntb_pat (SV_VL64); } ++ ++/* ++** cntb_vl128: ++** mov x0, #?0 ++** ret ++*/ ++uint64_t cntb_vl128 () { return svcntb_pat (SV_VL128); } ++ ++/* ++** cntb_vl256: ++** mov x0, #?0 ++** ret ++*/ ++uint64_t cntb_vl256 () { return svcntb_pat (SV_VL256); } ++ ++/* ++** cntb_mul3: ++** mov x0, #?30 ++** ret ++*/ ++uint64_t cntb_mul3 () { return svcntb_pat (SV_MUL3); } ++ ++/* ++** cntb_mul4: ++** mov x0, #?32 ++** ret ++*/ ++uint64_t cntb_mul4 () { return svcntb_pat (SV_MUL4); } ++ ++/* ++** cntb_all: ++** mov x0, #?32 ++** ret ++*/ ++uint64_t cntb_all () { return svcntb_pat (SV_ALL); } ++ ++#ifdef __cplusplus ++} ++#endif +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cntd_pat_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cntd_pat_1.c +new file mode 100644 +index 000000000..d93a32054 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cntd_pat_1.c +@@ -0,0 +1,132 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-O -msve-vector-bits=256" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/* ++** cntd_pow2: ++** mov x0, #?4 ++** ret ++*/ ++uint64_t cntd_pow2 () { return svcntd_pat (SV_POW2); } ++ ++/* ++** cntd_vl1: ++** mov x0, #?1 ++** ret ++*/ ++uint64_t cntd_vl1 () { return svcntd_pat (SV_VL1); } ++ ++/* ++** cntd_vl2: ++** mov x0, #?2 ++** ret ++*/ ++uint64_t cntd_vl2 () { return svcntd_pat (SV_VL2); } ++ ++/* ++** cntd_vl3: ++** mov x0, #?3 ++** ret ++*/ ++uint64_t cntd_vl3 () { return svcntd_pat (SV_VL3); } ++ ++/* ++** cntd_vl4: ++** mov x0, #?4 ++** ret ++*/ ++uint64_t cntd_vl4 () { return svcntd_pat (SV_VL4); } ++ ++/* ++** cntd_vl5: ++** mov x0, #?0 ++** ret ++*/ ++uint64_t cntd_vl5 () { return svcntd_pat (SV_VL5); } ++ ++/* ++** cntd_vl6: ++** mov x0, #?0 ++** ret ++*/ ++uint64_t cntd_vl6 () { return svcntd_pat (SV_VL6); } ++ ++/* ++** cntd_vl7: ++** mov x0, #?0 ++** ret ++*/ ++uint64_t cntd_vl7 () { return svcntd_pat (SV_VL7); } ++ ++/* ++** cntd_vl8: ++** mov x0, #?0 ++** ret ++*/ ++uint64_t cntd_vl8 () { return svcntd_pat (SV_VL8); } ++ ++/* ++** cntd_vl16: ++** mov x0, #?0 ++** ret ++*/ ++uint64_t cntd_vl16 () { return svcntd_pat (SV_VL16); } ++ ++/* ++** cntd_vl32: ++** mov x0, #?0 ++** ret ++*/ ++uint64_t cntd_vl32 () { return svcntd_pat (SV_VL32); } ++ ++/* ++** cntd_vl64: ++** mov x0, #?0 ++** ret ++*/ ++uint64_t cntd_vl64 () { return svcntd_pat (SV_VL64); } ++ ++/* ++** cntd_vl128: ++** mov x0, #?0 ++** ret ++*/ ++uint64_t cntd_vl128 () { return svcntd_pat (SV_VL128); } ++ ++/* ++** cntd_vl256: ++** mov x0, #?0 ++** ret ++*/ ++uint64_t cntd_vl256 () { return svcntd_pat (SV_VL256); } ++ ++/* ++** cntd_mul3: ++** mov x0, #?3 ++** ret ++*/ ++uint64_t cntd_mul3 () { return svcntd_pat (SV_MUL3); } ++ ++/* ++** cntd_mul4: ++** mov x0, #?4 ++** ret ++*/ ++uint64_t cntd_mul4 () { return svcntd_pat (SV_MUL4); } ++ ++/* ++** cntd_all: ++** mov x0, #?4 ++** ret ++*/ ++uint64_t cntd_all () { return svcntd_pat (SV_ALL); } ++ ++#ifdef __cplusplus ++} ++#endif +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cnth_pat_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cnth_pat_1.c +new file mode 100644 +index 000000000..bd988f53d +--- /dev/null ++++ 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cnth_pat_1.c +@@ -0,0 +1,132 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-O -msve-vector-bits=256" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/* ++** cnth_pow2: ++** mov x0, #?16 ++** ret ++*/ ++uint64_t cnth_pow2 () { return svcnth_pat (SV_POW2); } ++ ++/* ++** cnth_vl1: ++** mov x0, #?1 ++** ret ++*/ ++uint64_t cnth_vl1 () { return svcnth_pat (SV_VL1); } ++ ++/* ++** cnth_vl2: ++** mov x0, #?2 ++** ret ++*/ ++uint64_t cnth_vl2 () { return svcnth_pat (SV_VL2); } ++ ++/* ++** cnth_vl3: ++** mov x0, #?3 ++** ret ++*/ ++uint64_t cnth_vl3 () { return svcnth_pat (SV_VL3); } ++ ++/* ++** cnth_vl4: ++** mov x0, #?4 ++** ret ++*/ ++uint64_t cnth_vl4 () { return svcnth_pat (SV_VL4); } ++ ++/* ++** cnth_vl5: ++** mov x0, #?5 ++** ret ++*/ ++uint64_t cnth_vl5 () { return svcnth_pat (SV_VL5); } ++ ++/* ++** cnth_vl6: ++** mov x0, #?6 ++** ret ++*/ ++uint64_t cnth_vl6 () { return svcnth_pat (SV_VL6); } ++ ++/* ++** cnth_vl7: ++** mov x0, #?7 ++** ret ++*/ ++uint64_t cnth_vl7 () { return svcnth_pat (SV_VL7); } ++ ++/* ++** cnth_vl8: ++** mov x0, #?8 ++** ret ++*/ ++uint64_t cnth_vl8 () { return svcnth_pat (SV_VL8); } ++ ++/* ++** cnth_vl16: ++** mov x0, #?16 ++** ret ++*/ ++uint64_t cnth_vl16 () { return svcnth_pat (SV_VL16); } ++ ++/* ++** cnth_vl32: ++** mov x0, #?0 ++** ret ++*/ ++uint64_t cnth_vl32 () { return svcnth_pat (SV_VL32); } ++ ++/* ++** cnth_vl64: ++** mov x0, #?0 ++** ret ++*/ ++uint64_t cnth_vl64 () { return svcnth_pat (SV_VL64); } ++ ++/* ++** cnth_vl128: ++** mov x0, #?0 ++** ret ++*/ ++uint64_t cnth_vl128 () { return svcnth_pat (SV_VL128); } ++ ++/* ++** cnth_vl256: ++** mov x0, #?0 ++** ret ++*/ ++uint64_t cnth_vl256 () { return svcnth_pat (SV_VL256); } ++ ++/* ++** cnth_mul3: ++** mov x0, #?15 ++** ret ++*/ ++uint64_t cnth_mul3 () { return svcnth_pat (SV_MUL3); } ++ ++/* ++** cnth_mul4: ++** mov x0, #?16 ++** ret ++*/ ++uint64_t cnth_mul4 () { return svcnth_pat (SV_MUL4); } ++ ++/* ++** cnth_all: ++** mov x0, #?16 ++** ret ++*/ ++uint64_t cnth_all () { return svcnth_pat (SV_ALL); } ++ ++#ifdef __cplusplus ++} ++#endif +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cntw_pat_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cntw_pat_1.c +new file mode 100644 +index 000000000..53c8435b1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cntw_pat_1.c +@@ -0,0 +1,132 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-O -msve-vector-bits=256" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/* ++** cntw_pow2: ++** mov x0, #?8 ++** ret ++*/ ++uint64_t cntw_pow2 () { return svcntw_pat (SV_POW2); } ++ ++/* ++** cntw_vl1: ++** mov x0, #?1 ++** ret ++*/ ++uint64_t cntw_vl1 () { return svcntw_pat (SV_VL1); } ++ ++/* ++** cntw_vl2: ++** mov x0, #?2 ++** ret ++*/ ++uint64_t cntw_vl2 () { return svcntw_pat (SV_VL2); } ++ ++/* ++** cntw_vl3: ++** mov x0, #?3 ++** ret ++*/ ++uint64_t cntw_vl3 () { return svcntw_pat (SV_VL3); } ++ ++/* ++** cntw_vl4: ++** mov x0, #?4 ++** ret ++*/ ++uint64_t cntw_vl4 () { return svcntw_pat (SV_VL4); } ++ ++/* ++** cntw_vl5: ++** mov x0, #?5 ++** ret ++*/ ++uint64_t cntw_vl5 () { return svcntw_pat (SV_VL5); } ++ ++/* ++** cntw_vl6: ++** mov x0, #?6 ++** ret ++*/ ++uint64_t cntw_vl6 () { return svcntw_pat (SV_VL6); } ++ ++/* ++** cntw_vl7: ++** mov x0, #?7 ++** ret ++*/ ++uint64_t cntw_vl7 
() { return svcntw_pat (SV_VL7); } ++ ++/* ++** cntw_vl8: ++** mov x0, #?8 ++** ret ++*/ ++uint64_t cntw_vl8 () { return svcntw_pat (SV_VL8); } ++ ++/* ++** cntw_vl16: ++** mov x0, #?0 ++** ret ++*/ ++uint64_t cntw_vl16 () { return svcntw_pat (SV_VL16); } ++ ++/* ++** cntw_vl32: ++** mov x0, #?0 ++** ret ++*/ ++uint64_t cntw_vl32 () { return svcntw_pat (SV_VL32); } ++ ++/* ++** cntw_vl64: ++** mov x0, #?0 ++** ret ++*/ ++uint64_t cntw_vl64 () { return svcntw_pat (SV_VL64); } ++ ++/* ++** cntw_vl128: ++** mov x0, #?0 ++** ret ++*/ ++uint64_t cntw_vl128 () { return svcntw_pat (SV_VL128); } ++ ++/* ++** cntw_vl256: ++** mov x0, #?0 ++** ret ++*/ ++uint64_t cntw_vl256 () { return svcntw_pat (SV_VL256); } ++ ++/* ++** cntw_mul3: ++** mov x0, #?6 ++** ret ++*/ ++uint64_t cntw_mul3 () { return svcntw_pat (SV_MUL3); } ++ ++/* ++** cntw_mul4: ++** mov x0, #?8 ++** ret ++*/ ++uint64_t cntw_mul4 () { return svcntw_pat (SV_MUL4); } ++ ++/* ++** cntw_all: ++** mov x0, #?8 ++** ret ++*/ ++uint64_t cntw_all () { return svcntw_pat (SV_ALL); } ++ ++#ifdef __cplusplus ++} ++#endif +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/debug_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/debug_1.c +new file mode 100644 +index 000000000..0442efef3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/debug_1.c +@@ -0,0 +1,16 @@ ++/* { dg-options "-g" } */ ++ ++#include ++ ++svbool_t f_b (svbool_t x) { return x; } ++svint8_t f_s8 (svint8_t x) { return x; } ++svuint8_t f_u8 (svuint8_t x) { return x; } ++svint16_t f_s16 (svint16_t x) { return x; } ++svuint16_t f_u16 (svuint16_t x) { return x; } ++svfloat16_t f_f16 (svfloat16_t x) { return x; } ++svint32_t f_s32 (svint32_t x) { return x; } ++svuint32_t f_u32 (svuint32_t x) { return x; } ++svfloat32_t f_f32 (svfloat32_t x) { return x; } ++svint64_t f_s64 (svint64_t x) { return x; } ++svuint64_t f_u64 (svuint64_t x) { return x; } ++svfloat64_t f_f64 (svfloat64_t x) { return x; } +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/debug_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/debug_2.c +new file mode 100644 +index 000000000..63a26d2e9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/debug_2.c +@@ -0,0 +1,16 @@ ++/* { dg-options "-g" } */ ++ ++#include ++ ++svbool_t f_b (svbool_t x) { return svptrue_b32 (); } ++svint8_t f_s8 (svint8_t x) { return svdup_s8 (0); } ++svuint8_t f_u8 (svuint8_t x) { return svdup_u8 (1); } ++svint16_t f_s16 (svint16_t x) { return svdup_s16 (2); } ++svuint16_t f_u16 (svuint16_t x) { return svdup_u16 (3); } ++svfloat16_t f_f16 (svfloat16_t x) { return svdup_f16 (4); } ++svint32_t f_s32 (svint32_t x) { return svdup_s32 (5); } ++svuint32_t f_u32 (svuint32_t x) { return svdup_u32 (6); } ++svfloat32_t f_f32 (svfloat32_t x) { return svdup_f32 (7); } ++svint64_t f_s64 (svint64_t x) { return svdup_s64 (8); } ++svuint64_t f_u64 (svuint64_t x) { return svdup_u64 (9); } ++svfloat64_t f_f64 (svfloat64_t x) { return svdup_f64 (10); } +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/debug_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/debug_3.c +new file mode 100644 +index 000000000..ac151e465 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/debug_3.c +@@ -0,0 +1,39 @@ ++/* { dg-options "-g" } */ ++ ++#include ++ ++svint8x2_t f2_s8 (svint8x2_t x) { return x; } ++svuint8x2_t f2_u8 (svuint8x2_t x) { return x; } ++svint16x2_t f2_s16 (svint16x2_t x) { return x; } ++svuint16x2_t f2_u16 (svuint16x2_t x) { return x; } 
++svfloat16x2_t f2_f16 (svfloat16x2_t x) { return x; } ++svint32x2_t f2_s32 (svint32x2_t x) { return x; } ++svuint32x2_t f2_u32 (svuint32x2_t x) { return x; } ++svfloat32x2_t f2_f32 (svfloat32x2_t x) { return x; } ++svint64x2_t f2_s64 (svint64x2_t x) { return x; } ++svuint64x2_t f2_u64 (svuint64x2_t x) { return x; } ++svfloat64x2_t f2_f64 (svfloat64x2_t x) { return x; } ++ ++svint8x3_t f3_s8 (svint8x3_t x) { return x; } ++svuint8x3_t f3_u8 (svuint8x3_t x) { return x; } ++svint16x3_t f3_s16 (svint16x3_t x) { return x; } ++svuint16x3_t f3_u16 (svuint16x3_t x) { return x; } ++svfloat16x3_t f3_f16 (svfloat16x3_t x) { return x; } ++svint32x3_t f3_s32 (svint32x3_t x) { return x; } ++svuint32x3_t f3_u32 (svuint32x3_t x) { return x; } ++svfloat32x3_t f3_f32 (svfloat32x3_t x) { return x; } ++svint64x3_t f3_s64 (svint64x3_t x) { return x; } ++svuint64x3_t f3_u64 (svuint64x3_t x) { return x; } ++svfloat64x3_t f3_f64 (svfloat64x3_t x) { return x; } ++ ++svint8x4_t f4_s8 (svint8x4_t x) { return x; } ++svuint8x4_t f4_u8 (svuint8x4_t x) { return x; } ++svint16x4_t f4_s16 (svint16x4_t x) { return x; } ++svuint16x4_t f4_u16 (svuint16x4_t x) { return x; } ++svfloat16x4_t f4_f16 (svfloat16x4_t x) { return x; } ++svint32x4_t f4_s32 (svint32x4_t x) { return x; } ++svuint32x4_t f4_u32 (svuint32x4_t x) { return x; } ++svfloat32x4_t f4_f42 (svfloat32x4_t x) { return x; } ++svint64x4_t f4_s64 (svint64x4_t x) { return x; } ++svuint64x4_t f4_u64 (svuint64x4_t x) { return x; } ++svfloat64x4_t f4_f64 (svfloat64x4_t x) { return x; } +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/double_pragma_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/double_pragma_1.c +new file mode 100644 +index 000000000..9b3c3697c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/double_pragma_1.c +@@ -0,0 +1,7 @@ ++/* { dg-do compile } */ ++/* { dg-options "" } */ ++ ++/* It doesn't really matter if this produces errors about redefinitions, ++ but it mustn't trigger an ICE. 
*/ ++#pragma GCC aarch64 "arm_sve.h" ++#pragma GCC aarch64 "arm_sve.h" /* { dg-error "duplicate definition of 'arm_sve.h'" } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c +new file mode 100644 +index 000000000..d71507baa +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c +@@ -0,0 +1,15 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -mlittle-endian" } */ ++ ++#include ++ ++svint32_t ++dupq (int x) ++{ ++ return svdupq_s32 (x, 1, 2, 3); ++} ++ ++/* { dg-final { scan-assembler {\tldr\tq[0-9]+,} } } */ ++/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[0\], w0\n} } } */ ++/* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} } } */ ++/* { dg-final { scan-assembler {\t\.word\t1\n\t\.word\t2\n\t\.word\t3\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_10.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_10.c +new file mode 100644 +index 000000000..f8f797c97 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_10.c +@@ -0,0 +1,66 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2" } */ ++ ++#include ++ ++svbool_t __attribute__ ((noipa)) ++make_b8 (int8_t x0, int8_t x1, int8_t x2, int8_t x3, ++ int8_t x4, int8_t x5, int8_t x6, int8_t x7, ++ int8_t x8, int8_t x9, int8_t xa, int8_t xb, ++ int8_t xc, int8_t xd, int8_t xe, int8_t xf) ++{ ++ return svdupq_b8 (x0, x1, x2, x3, x4, x5, x6, x7, ++ x8, x9, xa, xb, xc, xd, xe, xf); ++} ++ ++svbool_t __attribute__ ((noipa)) ++make_b16 (int16_t x0, int16_t x1, int16_t x2, int16_t x3, ++ int16_t x4, int16_t x5, int16_t x6, int16_t x7) ++{ ++ return svdupq_b16 (x0, x1, x2, x3, x4, x5, x6, x7); ++} ++ ++svbool_t __attribute__ ((noipa)) ++make_b32 (int32_t x0, int32_t x1, int32_t x2, int32_t x3) ++{ ++ return svdupq_b32 (x0, x1, x2, x3); ++} ++ ++svbool_t __attribute__ ((noipa)) ++make_b64 (int64_t x0, int64_t x1) ++{ ++ return svdupq_b64 (x0, x1); ++} ++ ++int8_t a[16] = { 1, 0, 0, -3, 0, 9, 11, 0, 0, 1, 0, -4, 9, 9, 0, 0 }; ++ ++int ++main () ++{ ++ svbool_t pg = svptrue_pat_b8 (SV_VL16); ++ svbool_t b8 = make_b8 (a[0], a[1], a[2], a[3], ++ a[4], a[5], a[6], a[7], ++ a[8], a[9], a[10], a[11], ++ a[12], a[13], a[14], a[15]); ++ if (svptest_any (svptrue_b8 (), ++ sveor_z (pg, b8, svcmpne (pg, svld1 (pg, a), 0)))) ++ __builtin_abort (); ++ ++ svbool_t b16 = make_b16 (a[0], a[1], a[2], a[3], ++ a[4], a[5], a[6], a[7]); ++ if (svptest_any (svptrue_b16 (), ++ sveor_z (pg, b16, svcmpne (pg, svld1sb_u16 (pg, a), 0)))) ++ __builtin_abort (); ++ ++ svbool_t b32 = make_b32 (a[0], a[1], a[2], a[3]); ++ if (svptest_any (svptrue_b32 (), ++ sveor_z (pg, b32, svcmpne (pg, svld1sb_u32 (pg, a), 0)))) ++ __builtin_abort (); ++ ++ svbool_t b64 = make_b64 (a[0], a[1]); ++ if (svptest_any (svptrue_b64 (), ++ sveor_z (pg, b64, svcmpne (pg, svld1sb_u64 (pg, a), 0)))) ++ __builtin_abort (); ++ ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c +new file mode 100644 +index 000000000..d494943a2 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c +@@ -0,0 +1,16 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -mbig-endian" } */ ++ ++/* To avoid needing big-endian header files. 
*/ ++#pragma GCC aarch64 "arm_sve.h" ++ ++svint32_t ++dupq (int x) ++{ ++ return svdupq_s32 (x, 1, 2, 3); ++} ++ ++/* { dg-final { scan-assembler {\tldr\tq[0-9]+,} } } */ ++/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[0\], w0\n} } } */ ++/* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} } } */ ++/* { dg-final { scan-assembler {\t\.word\t3\n\t\.word\t2\n\t\.word\t1\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_3.c +new file mode 100644 +index 000000000..4bc8259df +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_3.c +@@ -0,0 +1,16 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -mlittle-endian" } */ ++ ++/* To avoid needing big-endian header files. */ ++#pragma GCC aarch64 "arm_sve.h" ++ ++svint32_t ++dupq (int x) ++{ ++ return svdupq_s32 (0, 1, x, 3); ++} ++ ++/* { dg-final { scan-assembler {\tldr\tq[0-9]+,} } } */ ++/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[2\], w0\n} } } */ ++/* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} } } */ ++/* { dg-final { scan-assembler {\t\.word\t0\n\t\.word\t1\n\t\.word\t[^\n]*\n\t\.word\t3\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_4.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_4.c +new file mode 100644 +index 000000000..6f9f9f2f2 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_4.c +@@ -0,0 +1,16 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -mbig-endian" } */ ++ ++/* To avoid needing big-endian header files. */ ++#pragma GCC aarch64 "arm_sve.h" ++ ++svint32_t ++dupq (int x) ++{ ++ return svdupq_s32 (0, 1, x, 3); ++} ++ ++/* { dg-final { scan-assembler {\tldr\tq[0-9]+,} } } */ ++/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[2\], w0\n} } } */ ++/* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} } } */ ++/* { dg-final { scan-assembler {\t\.word\t3\n\t\.word\t[^\n]*\n\t\.word\t1\n\t\.word\t0\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_5.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_5.c +new file mode 100644 +index 000000000..53426c9af +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_5.c +@@ -0,0 +1,17 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -mlittle-endian" } */ ++ ++#include ++ ++svint32_t ++dupq (int x1, int x2, int x3, int x4) ++{ ++ return svdupq_s32 (x1, x2, x3, x4); ++} ++ ++/* { dg-final { scan-assembler-not {\tldr\t} } } */ ++/* { dg-final { scan-assembler {, [wx]0\n} } } */ ++/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[1\], w1\n} } } */ ++/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[2\], w2\n} } } */ ++/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[3\], w3\n} } } */ ++/* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_6.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_6.c +new file mode 100644 +index 000000000..dfce5e7a1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_6.c +@@ -0,0 +1,18 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -mbig-endian" } */ ++ ++/* To avoid needing big-endian header files. 
*/ ++#pragma GCC aarch64 "arm_sve.h" ++ ++svint32_t ++dupq (int x1, int x2, int x3, int x4) ++{ ++ return svdupq_s32 (x1, x2, x3, x4); ++} ++ ++/* { dg-final { scan-assembler-not {\tldr\t} } } */ ++/* { dg-final { scan-assembler {, [wx]0\n} } } */ ++/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[1\], w1\n} } } */ ++/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[2\], w2\n} } } */ ++/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[3\], w3\n} } } */ ++/* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_7.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_7.c +new file mode 100644 +index 000000000..08decb5f9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_7.c +@@ -0,0 +1,66 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2" } */ ++ ++#include ++ ++svint8_t __attribute__ ((noipa)) ++make_s8 (int8_t x0, int8_t x1, int8_t x2, int8_t x3, ++ int8_t x4, int8_t x5, int8_t x6, int8_t x7, ++ int8_t x8, int8_t x9, int8_t xa, int8_t xb, ++ int8_t xc, int8_t xd, int8_t xe, int8_t xf) ++{ ++ return svdupq_s8 (x0, x1, x2, x3, x4, x5, x6, x7, ++ x8, x9, xa, xb, xc, xd, xe, xf); ++} ++ ++svint16_t __attribute__ ((noipa)) ++make_s16 (int16_t x0, int16_t x1, int16_t x2, int16_t x3, ++ int16_t x4, int16_t x5, int16_t x6, int16_t x7) ++{ ++ return svdupq_s16 (x0, x1, x2, x3, x4, x5, x6, x7); ++} ++ ++svint32_t __attribute__ ((noipa)) ++make_s32 (int32_t x0, int32_t x1, int32_t x2, int32_t x3) ++{ ++ return svdupq_s32 (x0, x1, x2, x3); ++} ++ ++svint64_t __attribute__ ((noipa)) ++make_s64 (int64_t x0, int64_t x1) ++{ ++ return svdupq_s64 (x0, x1); ++} ++ ++int8_t a[16] = { 1, -44, 91, -24, 101, -55, 77, 83, ++ -30, 69, 121, -128, -1, 13, 127, 26 }; ++int16_t b[8] = { -716, -10288, 30604, -19258, -9418, -10435, -16001, 7300 }; ++int32_t c[4] = { 1268374995, -1023602831, -891830021, -1793452959 }; ++int64_t d[2] = { 0x123456789abcdefLL, -0x123456789abcdefLL }; ++ ++int ++main () ++{ ++ svbool_t pg = svptrue_pat_b8 (SV_VL16); ++ svint8_t s8 = make_s8 (a[0], a[1], a[2], a[3], ++ a[4], a[5], a[6], a[7], ++ a[8], a[9], a[10], a[11], ++ a[12], a[13], a[14], a[15]); ++ if (svptest_any (svptrue_b8 (), svcmpne (pg, s8, svld1 (pg, a)))) ++ __builtin_abort (); ++ ++ svint16_t s16 = make_s16 (b[0], b[1], b[2], b[3], ++ b[4], b[5], b[6], b[7]); ++ if (svptest_any (svptrue_b8 (), svcmpne (pg, s16, svld1 (pg, b)))) ++ __builtin_abort (); ++ ++ svint32_t s32 = make_s32 (c[0], c[1], c[2], c[3]); ++ if (svptest_any (svptrue_b8 (), svcmpne (pg, s32, svld1 (pg, c)))) ++ __builtin_abort (); ++ ++ svint64_t s64 = make_s64 (d[0], d[1]); ++ if (svptest_any (svptrue_b8 (), svcmpne (pg, s64, svld1 (pg, d)))) ++ __builtin_abort (); ++ ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_8.c +new file mode 100644 +index 000000000..c20fb7324 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_8.c +@@ -0,0 +1,66 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2" } */ ++ ++#include ++ ++svuint8_t __attribute__ ((noipa)) ++make_u8 (uint8_t x0, uint8_t x1, uint8_t x2, uint8_t x3, ++ uint8_t x4, uint8_t x5, uint8_t x6, uint8_t x7, ++ uint8_t x8, uint8_t x9, uint8_t xa, uint8_t xb, ++ uint8_t xc, uint8_t xd, uint8_t xe, uint8_t xf) ++{ ++ return svdupq_u8 (x0, x1, x2, x3, x4, x5, x6, x7, ++ x8, x9, xa, xb, xc, xd, xe, xf); ++} ++ ++svuint16_t 
__attribute__ ((noipa)) ++make_u16 (uint16_t x0, uint16_t x1, uint16_t x2, uint16_t x3, ++ uint16_t x4, uint16_t x5, uint16_t x6, uint16_t x7) ++{ ++ return svdupq_u16 (x0, x1, x2, x3, x4, x5, x6, x7); ++} ++ ++svuint32_t __attribute__ ((noipa)) ++make_u32 (uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3) ++{ ++ return svdupq_u32 (x0, x1, x2, x3); ++} ++ ++svuint64_t __attribute__ ((noipa)) ++make_u64 (uint64_t x0, uint64_t x1) ++{ ++ return svdupq_u64 (x0, x1); ++} ++ ++uint8_t a[16] = { 1, 212, 91, 232, 101, 201, 77, 83, ++ 226, 69, 121, 128, 255, 13, 127, 26 }; ++uint16_t b[8] = { 64820, 55248, 30604, 46278, 56118, 55101, 49535, 7300 }; ++uint32_t c[4] = { 1268374995, 3271364465, 3403137275, 2501514337 }; ++uint64_t d[2] = { 0x123456789abcdefULL, 0xfedcba9876543210ULL }; ++ ++int ++main () ++{ ++ svbool_t pg = svptrue_pat_b8 (SV_VL16); ++ svuint8_t u8 = make_u8 (a[0], a[1], a[2], a[3], ++ a[4], a[5], a[6], a[7], ++ a[8], a[9], a[10], a[11], ++ a[12], a[13], a[14], a[15]); ++ if (svptest_any (svptrue_b8 (), svcmpne (pg, u8, svld1 (pg, a)))) ++ __builtin_abort (); ++ ++ svuint16_t u16 = make_u16 (b[0], b[1], b[2], b[3], ++ b[4], b[5], b[6], b[7]); ++ if (svptest_any (svptrue_b8 (), svcmpne (pg, u16, svld1 (pg, b)))) ++ __builtin_abort (); ++ ++ svuint32_t u32 = make_u32 (c[0], c[1], c[2], c[3]); ++ if (svptest_any (svptrue_b8 (), svcmpne (pg, u32, svld1 (pg, c)))) ++ __builtin_abort (); ++ ++ svuint64_t u64 = make_u64 (d[0], d[1]); ++ if (svptest_any (svptrue_b8 (), svcmpne (pg, u64, svld1 (pg, d)))) ++ __builtin_abort (); ++ ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_9.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_9.c +new file mode 100644 +index 000000000..b29aa9474 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_9.c +@@ -0,0 +1,47 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2" } */ ++ ++#include ++ ++svfloat16_t __attribute__ ((noipa)) ++make_f16 (float16_t x0, float16_t x1, float16_t x2, float16_t x3, ++ float16_t x4, float16_t x5, float16_t x6, float16_t x7) ++{ ++ return svdupq_f16 (x0, x1, x2, x3, x4, x5, x6, x7); ++} ++ ++svfloat32_t __attribute__ ((noipa)) ++make_f32 (float32_t x0, float32_t x1, float32_t x2, float32_t x3) ++{ ++ return svdupq_f32 (x0, x1, x2, x3); ++} ++ ++svfloat64_t __attribute__ ((noipa)) ++make_f64 (float64_t x0, float64_t x1) ++{ ++ return svdupq_f64 (x0, x1); ++} ++ ++float16_t a[8] = { 1.0, -4.25, 9.75, 6.5, -2.125, 5.5, -3.75, 7.625 }; ++float32_t b[4] = { 1.0, -90.25, -11.75, 141.5 }; ++float64_t c[2] = { 9221.5, -4491.25 }; ++ ++int ++main () ++{ ++ svbool_t pg = svptrue_pat_b8 (SV_VL16); ++ svfloat16_t f16 = make_f16 (a[0], a[1], a[2], a[3], ++ a[4], a[5], a[6], a[7]); ++ if (svptest_any (svptrue_b8 (), svcmpne (pg, f16, svld1 (pg, a)))) ++ __builtin_abort (); ++ ++ svfloat32_t f32 = make_f32 (b[0], b[1], b[2], b[3]); ++ if (svptest_any (svptrue_b8 (), svcmpne (pg, f32, svld1 (pg, b)))) ++ __builtin_abort (); ++ ++ svfloat64_t f64 = make_f64 (c[0], c[1]); ++ if (svptest_any (svptrue_b8 (), svcmpne (pg, f64, svld1 (pg, c)))) ++ __builtin_abort (); ++ ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_lane_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_lane_1.c +new file mode 100644 +index 000000000..32ccb08d6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_lane_1.c +@@ -0,0 +1,87 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2" } */ ++ 
++#include ++ ++#ifndef TYPE ++#define TYPE svint8_t ++#define DUPQ svdupq_lane_s8 ++#define INDEX svindex_s8 ++#define COUNT 16 ++#endif ++ ++#define BASE 42 ++ ++TYPE __attribute__ ((noipa)) ++dupq_var (TYPE x, uint64_t y) ++{ ++ return DUPQ (x, y); ++} ++ ++TYPE __attribute__ ((noipa)) ++dupq_0 (TYPE x) ++{ ++ return DUPQ (x, 0); ++} ++ ++TYPE __attribute__ ((noipa)) ++dupq_1 (TYPE x) ++{ ++ return DUPQ (x, 1); ++} ++ ++TYPE __attribute__ ((noipa)) ++dupq_2 (TYPE x) ++{ ++ return DUPQ (x, 2); ++} ++ ++TYPE __attribute__ ((noipa)) ++dupq_3 (TYPE x) ++{ ++ return DUPQ (x, 3); ++} ++ ++TYPE __attribute__ ((noipa)) ++dupq_4 (TYPE x) ++{ ++ return DUPQ (x, 4); ++} ++ ++void __attribute__ ((noipa)) ++check (TYPE x, uint64_t y) ++{ ++ svbool_t pg = svptrue_b8 (); ++ if (y * 2 >= svcntd ()) ++ { ++ if (svptest_any (pg, svcmpne (pg, x, 0))) ++ __builtin_abort (); ++ } ++ else ++ { ++ TYPE repeat = svand_x (pg, INDEX (0, 1), COUNT - 1); ++ TYPE expected = svadd_x (pg, repeat, BASE + y * COUNT); ++ if (svptest_any (pg, svcmpne (pg, x, expected))) ++ __builtin_abort (); ++ } ++} ++ ++int ++main () ++{ ++ TYPE x = INDEX (BASE, 1); ++ ++ check (dupq_0 (x), 0); ++ check (dupq_1 (x), 1); ++ check (dupq_2 (x), 2); ++ check (dupq_3 (x), 3); ++ check (dupq_4 (x), 4); ++ ++ for (int i = 0; i < 63; ++i) ++ { ++ check (dupq_var (x, i), i); ++ check (dupq_var (x, (uint64_t) 1 << i), (uint64_t) 1 << i); ++ } ++ ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_lane_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_lane_2.c +new file mode 100644 +index 000000000..40de1c7dc +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_lane_2.c +@@ -0,0 +1,9 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2" } */ ++ ++#define TYPE svuint8_t ++#define DUPQ svdupq_lane_u8 ++#define INDEX svindex_u8 ++#define COUNT 16 ++ ++#include "dupq_lane_1.c" +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_lane_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_lane_3.c +new file mode 100644 +index 000000000..4ebe89545 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_lane_3.c +@@ -0,0 +1,9 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2" } */ ++ ++#define TYPE svint16_t ++#define DUPQ svdupq_lane_s16 ++#define INDEX svindex_s16 ++#define COUNT 8 ++ ++#include "dupq_lane_1.c" +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_lane_4.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_lane_4.c +new file mode 100644 +index 000000000..1be20c8e1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_lane_4.c +@@ -0,0 +1,9 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2" } */ ++ ++#define TYPE svuint16_t ++#define DUPQ svdupq_lane_u16 ++#define INDEX svindex_u16 ++#define COUNT 8 ++ ++#include "dupq_lane_1.c" +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_lane_5.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_lane_5.c +new file mode 100644 +index 000000000..67554d06a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_lane_5.c +@@ -0,0 +1,9 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2" } */ ++ ++#define TYPE svint32_t ++#define DUPQ svdupq_lane_s32 ++#define INDEX svindex_s32 ++#define COUNT 4 ++ ++#include "dupq_lane_1.c" +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_lane_6.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_lane_6.c
+new file mode 100644
+index 000000000..1914d2368
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_lane_6.c
+@@ -0,0 +1,9 @@
++/* { dg-do run { target aarch64_sve_hw } } */
++/* { dg-options "-O2" } */
++
++#define TYPE svuint32_t
++#define DUPQ svdupq_lane_u32
++#define INDEX svindex_u32
++#define COUNT 4
++
++#include "dupq_lane_1.c"
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_lane_7.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_lane_7.c
+new file mode 100644
+index 000000000..d7a8e52f8
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_lane_7.c
+@@ -0,0 +1,9 @@
++/* { dg-do run { target aarch64_sve_hw } } */
++/* { dg-options "-O2" } */
++
++#define TYPE svint64_t
++#define DUPQ svdupq_lane_s64
++#define INDEX svindex_s64
++#define COUNT 2
++
++#include "dupq_lane_1.c"
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_lane_8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_lane_8.c
+new file mode 100644
+index 000000000..68655fefa
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_lane_8.c
+@@ -0,0 +1,9 @@
++/* { dg-do run { target aarch64_sve_hw } } */
++/* { dg-options "-O2" } */
++
++#define TYPE svuint64_t
++#define DUPQ svdupq_lane_u64
++#define INDEX svindex_u64
++#define COUNT 2
++
++#include "dupq_lane_1.c"
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/eor_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/eor_1.c
+new file mode 100644
+index 000000000..357b0bfb8
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/eor_1.c
+@@ -0,0 +1,22 @@
++/* { dg-do compile } */
++/* { dg-options "-O2" } */
++
++#include <arm_sve.h>
++
++void
++test1 (svbool_t pg, svbool_t x, svbool_t y, int *any, svbool_t *ptr)
++{
++  svbool_t res = sveor_z (pg, x, y);
++  *any = svptest_any (pg, res);
++  *ptr = res;
++}
++
++int
++test2 (svbool_t pg, svbool_t x, svbool_t y, int *any)
++{
++  svbool_t res = sveor_z (pg, x, y);
++  return svptest_any (pg, res);
++}
++
++/* { dg-final { scan-assembler-times {\teors\t} 2 } } */
++/* { dg-final { scan-assembler-not {\teor\t} } } */
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ld1_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ld1_1.c
+new file mode 100644
+index 000000000..c68a9ed99
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ld1_1.c
+@@ -0,0 +1,25 @@
++/* { dg-do compile } */
++/* { dg-options "-O" } */
++/* { dg-final { check-function-bodies "**" "" } } */
++
++#include <arm_sve.h>
++
++#ifdef __cplusplus
++extern "C" {
++#endif
++
++/*
++** nop1:
++**	ret
++*/
++void nop1 (int8_t *s) { svld1 (svptrue_b8 (), s); }
++
++/*
++** nop2:
++**	ret
++*/
++void nop2 (svbool_t pg, int16_t *s) { svld1 (pg, s); }
++
++#ifdef __cplusplus
++}
++#endif
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_1.c
+new file mode 100644
+index 000000000..79f8bee1f
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_1.c
+@@ -0,0 +1,18 @@
++/* { dg-do compile } */
++/* { dg-options "-O2" } */
++
++#include <arm_sve.h>
++
++/* Make sure that SETFFR comes first, however high the priority of the
++   LDFF1 is.
*/ ++svint8_t ++foo (svbool_t pg, int8_t *ptr) ++{ ++ svsetffr (); ++ svint8_t x = svldff1 (pg, ptr); ++ x = svadd_x (pg, x, x); ++ x = svmul_x (pg, x, x); ++ return x; ++} ++ ++/* { dg-final { scan-assembler {\tsetffr\n.*\tldff1b\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_2.c +new file mode 100644 +index 000000000..7c3c8d8b5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_2.c +@@ -0,0 +1,20 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2" } */ ++ ++#include ++ ++/* Make sure that RDFFR comes after the LDFF1 and that the RDFFRs can ++ be CSEd. */ ++svint8_t ++foo (svbool_t pg, int8_t *__restrict ptr, ++ svbool_t *__restrict *__restrict preds) ++{ ++ svsetffr (); ++ svint8_t x = svldff1 (pg, ptr); ++ *preds[0] = svrdffr (); ++ *preds[1] = svrdffr (); ++ return x; ++} ++ ++/* { dg-final { scan-assembler {\tsetffr\n.*\tldff1b\t.*\trdffr\t} } } */ ++/* { dg-final { scan-assembler-times {\trdffr\t} 1 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_3.c +new file mode 100644 +index 000000000..41ad0bcea +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_3.c +@@ -0,0 +1,21 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2" } */ ++ ++#include ++ ++/* Make sure that LDFF1s can be reordered. The load of x should come due ++ to its longer dependence chain. */ ++svint8_t ++foo (int8_t *ptr1, int8_t *ptr2) ++{ ++ svsetffr (); ++ svbool_t pg = svptrue_b8 (); ++ svint8_t y = svldff1 (pg, ptr2); ++ svint8_t x = svldff1 (pg, ptr1); ++ x = svadd_x (pg, x, x); ++ x = svmul_x (pg, x, x); ++ x = svadd_x (pg, x, y); ++ return x; ++} ++ ++/* { dg-final { scan-assembler {\tldff1b\tz[0-9]+\.b, p[0-7]/z, \[x0\]\n.*\tldff1b\tz[0-9]+\.b, p[0-7]/z, \[x1\]\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_4.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_4.c +new file mode 100644 +index 000000000..c27302139 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_4.c +@@ -0,0 +1,17 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2" } */ ++ ++#include ++ ++/* Make sure that we can use RDFFRS to test for a fault. */ ++svint8_t ++foo (svbool_t pg, int8_t *ptr, int *fault) ++{ ++ svsetffr (); ++ svint8_t x = svldff1 (pg, ptr); ++ *fault = svptest_any (pg, svrdffr_z (pg)); ++ return x; ++} ++ ++/* { dg-final { scan-assembler {\tsetffr\n.*\tldff1b\t.*\trdffrs\t} } } */ ++/* { dg-final { scan-assembler-not {\trdffr\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_5.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_5.c +new file mode 100644 +index 000000000..76e7ab8ba +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_5.c +@@ -0,0 +1,20 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2" } */ ++ ++#include ++ ++/* Make sure that we can use RDFFRS to read the FFR while testing for a ++ fault. 
*/
++svint8_t
++foo (svbool_t pg, int8_t *ptr, svbool_t *pred, int *fault)
++{
++  svsetffr ();
++  svint8_t x = svldff1 (pg, ptr);
++  svbool_t ffr = svrdffr_z (pg);
++  *fault = svptest_any (pg, ffr);
++  *pred = ffr;
++  return x;
++}
++
++/* { dg-final { scan-assembler {\tsetffr\n.*\tldff1b\t.*\trdffrs\t} } } */
++/* { dg-final { scan-assembler-not {\trdffr\t} } } */
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_6.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_6.c
+new file mode 100644
+index 000000000..7110e5f1a
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_6.c
+@@ -0,0 +1,17 @@
++/* { dg-do compile } */
++/* { dg-options "-O2" } */
++
++#include <arm_sve.h>
++
++/* Make sure that we can use RDFFRS to test for a fault.  */
++svint8_t
++foo (svbool_t pg, int8_t *ptr, int *fault)
++{
++  svsetffr ();
++  svint8_t x = svldff1 (pg, ptr);
++  *fault = svptest_any (svptrue_b8 (), svrdffr ());
++  return x;
++}
++
++/* { dg-final { scan-assembler {\tsetffr\n.*\tldff1b\t.*\trdffrs\t} } } */
++/* { dg-final { scan-assembler-not {\trdffr\t} } } */
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_7.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_7.c
+new file mode 100644
+index 000000000..355fe91f1
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_7.c
+@@ -0,0 +1,20 @@
++/* { dg-do compile } */
++/* { dg-options "-O2" } */
++
++#include <arm_sve.h>
++
++/* Make sure that we can use RDFFRS to read the FFR while testing for a
++   fault.  */
++svint8_t
++foo (svbool_t pg, int8_t *ptr, svbool_t *pred, int *fault)
++{
++  svsetffr ();
++  svint8_t x = svldff1 (pg, ptr);
++  svbool_t ffr = svrdffr ();
++  *fault = svptest_any (svptrue_b8 (), ffr);
++  *pred = ffr;
++  return x;
++}
++
++/* { dg-final { scan-assembler {\tsetffr\n.*\tldff1b\t.*\trdffrs\t} } } */
++/* { dg-final { scan-assembler-not {\trdffr\t} } } */
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/nand_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/nand_1.c
+new file mode 100644
+index 000000000..0bc54c049
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/nand_1.c
+@@ -0,0 +1,22 @@
++/* { dg-do compile } */
++/* { dg-options "-O2" } */
++
++#include <arm_sve.h>
++
++void
++test1 (svbool_t pg, svbool_t x, svbool_t y, int *any, svbool_t *ptr)
++{
++  svbool_t res = svnand_z (pg, x, y);
++  *any = svptest_any (pg, res);
++  *ptr = res;
++}
++
++int
++test2 (svbool_t pg, svbool_t x, svbool_t y, int *any)
++{
++  svbool_t res = svnand_z (pg, x, y);
++  return svptest_any (pg, res);
++}
++
++/* { dg-final { scan-assembler-times {\tnands\t} 2 } } */
++/* { dg-final { scan-assembler-not {\tnand\t} } } */
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/nor_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/nor_1.c
+new file mode 100644
+index 000000000..7973294d1
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/nor_1.c
+@@ -0,0 +1,22 @@
++/* { dg-do compile } */
++/* { dg-options "-O2" } */
++
++#include <arm_sve.h>
++
++void
++test1 (svbool_t pg, svbool_t x, svbool_t y, int *any, svbool_t *ptr)
++{
++  svbool_t res = svnor_z (pg, x, y);
++  *any = svptest_any (pg, res);
++  *ptr = res;
++}
++
++int
++test2 (svbool_t pg, svbool_t x, svbool_t y, int *any)
++{
++  svbool_t res = svnor_z (pg, x, y);
++  return svptest_any (pg, res);
++}
++
++/* { dg-final { scan-assembler-times {\tnors\t} 2 } } */
++/* { dg-final { scan-assembler-not {\tnor\t} } } */
+diff --git
a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/nosve_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/nosve_1.c +new file mode 100644 +index 000000000..09dfacd22 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/nosve_1.c +@@ -0,0 +1,17 @@ ++/* { dg-options "-march=armv8-a" } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++void ++f (svbool_t *x, svint8_t *y) ++{ ++ *x = svptrue_b8 (); /* { dg-error {ACLE function '(svbool_t svptrue_b8\(\)|svptrue_b8)' requires ISA extension 'sve'} } */ ++ /* { dg-message {note: you can enable 'sve' using the command-line option '-march', or by using the 'target' attribute or pragma} "" { target *-*-* } .-1 } */ ++ *x = svptrue_b8 (); ++ *x = svptrue_b8 (); ++ *x = svptrue_b8 (); ++ *x = svptrue_b8 (); ++ *x = svptrue_b8 (); ++ *x = svptrue_b8 (); ++ *y = svadd_m (*x, *y, 1); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/nosve_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/nosve_2.c +new file mode 100644 +index 000000000..594be1cf4 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/nosve_2.c +@@ -0,0 +1,14 @@ ++/* { dg-options "-march=armv8-a" } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++#pragma GCC target "+sve" ++ ++void ++f (svbool_t *x, svint8_t *y) ++{ ++ *x = svptrue_b8 (); ++ *y = svadd_m (*x, *y, 1); ++} ++ ++/* { dg-final { scan-assembler {\tadd\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/nosve_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/nosve_3.c +new file mode 100644 +index 000000000..85f4eb3c0 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/nosve_3.c +@@ -0,0 +1,12 @@ ++/* { dg-options "-march=armv8-a" } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++void __attribute__ ((target("+sve"))) ++f (svbool_t *x, svint8_t *y) ++{ ++ *x = svptrue_b8 (); ++ *y = svadd_m (*x, *y, 1); ++} ++ ++/* { dg-final { scan-assembler {\tadd\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/orn_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/orn_1.c +new file mode 100644 +index 000000000..c3ed1eb61 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/orn_1.c +@@ -0,0 +1,22 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2" } */ ++ ++#include ++ ++void ++test1 (svbool_t pg, svbool_t x, svbool_t y, int *any, svbool_t *ptr) ++{ ++ svbool_t res = svorn_z (pg, x, y); ++ *any = svptest_any (pg, res); ++ *ptr = res; ++} ++ ++int ++test2 (svbool_t pg, svbool_t x, svbool_t y, int *any) ++{ ++ svbool_t res = svorn_z (pg, x, y); ++ return svptest_any (pg, res); ++} ++ ++/* { dg-final { scan-assembler-times {\torns\t} 2 } } */ ++/* { dg-final { scan-assembler-not {\torn\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/orr_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/orr_1.c +new file mode 100644 +index 000000000..4456fa630 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/orr_1.c +@@ -0,0 +1,22 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2" } */ ++ ++#include ++ ++void ++test1 (svbool_t pg, svbool_t x, svbool_t y, int *any, svbool_t *ptr) ++{ ++ svbool_t res = svorr_z (pg, x, y); ++ *any = svptest_any (pg, res); ++ *ptr = res; ++} ++ ++int ++test2 (svbool_t pg, svbool_t x, svbool_t y, int *any) ++{ ++ svbool_t res = svorr_z (pg, x, y); ++ return svptest_any (pg, res); ++} ++ ++/* { dg-final { scan-assembler-times {\torrs\t} 2 } 
} */ ++/* { dg-final { scan-assembler-not {\torr\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pfirst_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pfirst_1.c +new file mode 100644 +index 000000000..de1ff691a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pfirst_1.c +@@ -0,0 +1,22 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2" } */ ++ ++#include ++ ++void ++test1 (svbool_t pg, int *last, svbool_t *ptr) ++{ ++ svbool_t res = svpfirst (pg, svpfalse ()); ++ *last = svptest_last (pg, res); ++ *ptr = res; ++} ++ ++int ++test2 (svbool_t pg) ++{ ++ svbool_t res = svpfirst (pg, svpfalse ()); ++ return svptest_last (pg, res); ++} ++ ++/* { dg-final { scan-assembler-times {\tpfirst\t} 2 } } */ ++/* { dg-final { scan-assembler-not {\tptest\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pnext_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pnext_1.c +new file mode 100644 +index 000000000..bf59cb963 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pnext_1.c +@@ -0,0 +1,22 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2" } */ ++ ++#include ++ ++void ++test1 (svbool_t pg, svbool_t prev, int *last, svbool_t *ptr) ++{ ++ svbool_t res = svpnext_b8 (pg, prev); ++ *last = svptest_last (pg, res); ++ *ptr = res; ++} ++ ++int ++test2 (svbool_t pg, svbool_t prev) ++{ ++ svbool_t res = svpnext_b8 (pg, prev); ++ return svptest_last (pg, res); ++} ++ ++/* { dg-final { scan-assembler-times {\tpnext\t} 2 } } */ ++/* { dg-final { scan-assembler-not {\tptest\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pnext_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pnext_2.c +new file mode 100644 +index 000000000..9926a2bee +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pnext_2.c +@@ -0,0 +1,52 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2" } */ ++ ++#include ++ ++void ++test1 (svbool_t pg, svbool_t prev, int *last, svbool_t *ptr) ++{ ++ svbool_t res = svpnext_b16 (pg, prev); ++ *last = svptest_last (pg, res); ++ *ptr = res; ++} ++ ++int ++test2 (svbool_t pg, svbool_t prev) ++{ ++ svbool_t res = svpnext_b16 (pg, prev); ++ return svptest_last (pg, res); ++} ++ ++void ++test3 (svbool_t pg, svbool_t prev, int *last, svbool_t *ptr) ++{ ++ svbool_t res = svpnext_b32 (pg, prev); ++ *last = svptest_last (pg, res); ++ *ptr = res; ++} ++ ++int ++test4 (svbool_t pg, svbool_t prev) ++{ ++ svbool_t res = svpnext_b32 (pg, prev); ++ return svptest_last (pg, res); ++} ++ ++void ++test5 (svbool_t pg, svbool_t prev, int *last, svbool_t *ptr) ++{ ++ svbool_t res = svpnext_b64 (pg, prev); ++ *last = svptest_last (pg, res); ++ *ptr = res; ++} ++ ++int ++test6 (svbool_t pg, svbool_t prev) ++{ ++ svbool_t res = svpnext_b64 (pg, prev); ++ return svptest_last (pg, res); ++} ++ ++/* { dg-final { scan-assembler-times {\tpnext\t} 6 } } */ ++/* { dg-final { scan-assembler-times {\tptest\t} 6 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ptrue_pat_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ptrue_pat_1.c +new file mode 100644 +index 000000000..69bbb1ed0 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ptrue_pat_1.c +@@ -0,0 +1,23 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2" } */ ++ ++#include ++ ++void ++test1 (int *last, svbool_t *ptr) ++{ ++ svbool_t res = svptrue_pat_b8 (SV_VL32); ++ *last = svptest_last (svptrue_b8 (), res); ++ *ptr = res; ++} ++ ++int ++test2 () ++{ ++ svbool_t res = 
svptrue_pat_b8 (SV_VL32); ++ return svptest_last (svptrue_b8 (), res); ++} ++ ++/* { dg-final { scan-assembler-times {\tptrues\tp[0-9]+\.b, vl32\n} 2 } } */ ++/* { dg-final { scan-assembler-not {\tptrue\t} { xfail *-*-* } } } */ ++/* { dg-final { scan-assembler-not {\tptest\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ptrue_pat_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ptrue_pat_2.c +new file mode 100644 +index 000000000..ede83405e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ptrue_pat_2.c +@@ -0,0 +1,23 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2" } */ ++ ++#include ++ ++void ++test1 (int *last, svbool_t *ptr) ++{ ++ svbool_t res = svptrue_pat_b16 (SV_VL16); ++ *last = svptest_last (svptrue_b16 (), res); ++ *ptr = res; ++} ++ ++int ++test2 () ++{ ++ svbool_t res = svptrue_pat_b16 (SV_VL16); ++ return svptest_last (svptrue_b16 (), res); ++} ++ ++/* { dg-final { scan-assembler-times {\tptrues\tp[0-9]+\.h, vl16\n} 2 } } */ ++/* { dg-final { scan-assembler-not {\tptrue\t} { xfail *-*-* } } } */ ++/* { dg-final { scan-assembler-not {\tptest\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ptrue_pat_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ptrue_pat_3.c +new file mode 100644 +index 000000000..d2eb3fc30 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ptrue_pat_3.c +@@ -0,0 +1,23 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2" } */ ++ ++#include ++ ++void ++test1 (int *last, svbool_t *ptr) ++{ ++ svbool_t res = svptrue_pat_b32 (SV_VL16); ++ *last = svptest_last (svptrue_b32 (), res); ++ *ptr = res; ++} ++ ++int ++test2 () ++{ ++ svbool_t res = svptrue_pat_b32 (SV_VL16); ++ return svptest_last (svptrue_b32 (), res); ++} ++ ++/* { dg-final { scan-assembler-times {\tptrues\tp[0-9]+\.s, vl16\n} 2 } } */ ++/* { dg-final { scan-assembler-not {\tptrue\t} { xfail *-*-* } } } */ ++/* { dg-final { scan-assembler-not {\tptest\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ptrue_pat_4.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ptrue_pat_4.c +new file mode 100644 +index 000000000..59a21da9e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ptrue_pat_4.c +@@ -0,0 +1,23 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2" } */ ++ ++#include ++ ++void ++test1 (int *any, svbool_t *ptr) ++{ ++ svbool_t res = svptrue_pat_b64 (SV_VL7); ++ *any = svptest_any (svptrue_b64 (), res); ++ *ptr = res; ++} ++ ++int ++test2 () ++{ ++ svbool_t res = svptrue_pat_b64 (SV_VL7); ++ return svptest_any (svptrue_b64 (), res); ++} ++ ++/* { dg-final { scan-assembler-times {\tptrues\tp[0-9]+\.d, vl7\n} 2 } } */ ++/* { dg-final { scan-assembler-not {\tptrue\t} { xfail *-*-* } } } */ ++/* { dg-final { scan-assembler-not {\tptest\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ptrue_pat_5.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ptrue_pat_5.c +new file mode 100644 +index 000000000..c8f6d8aca +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ptrue_pat_5.c +@@ -0,0 +1,188 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2" } */ ++ ++#include ++ ++void ++b8_b16_1 (int *any, svbool_t *ptr) ++{ ++ svbool_t res = svptrue_pat_b8 (SV_VL64); ++ *any = svptest_any (svptrue_b16 (), res); ++ *ptr = res; ++} ++ ++int ++b8_b16_2 () ++{ ++ svbool_t res = svptrue_pat_b8 (SV_VL64); ++ return svptest_any (svptrue_b16 (), res); ++} ++ ++void ++b8_b32_1 (int *any, svbool_t *ptr) ++{ ++ 
svbool_t res = svptrue_pat_b8 (SV_VL32); ++ *any = svptest_any (svptrue_b32 (), res); ++ *ptr = res; ++} ++ ++int ++b8_b32_2 () ++{ ++ svbool_t res = svptrue_pat_b8 (SV_VL32); ++ return svptest_any (svptrue_b32 (), res); ++} ++ ++void ++b8_b64_1 (int *any, svbool_t *ptr) ++{ ++ svbool_t res = svptrue_pat_b8 (SV_VL128); ++ *any = svptest_any (svptrue_b64 (), res); ++ *ptr = res; ++} ++ ++int ++b8_b64_2 () ++{ ++ svbool_t res = svptrue_pat_b8 (SV_VL128); ++ return svptest_any (svptrue_b64 (), res); ++} ++ ++void ++b16_b8_1 (int *any, svbool_t *ptr) ++{ ++ svbool_t res = svptrue_pat_b16 (SV_VL32); ++ *any = svptest_any (svptrue_b8 (), res); ++ *ptr = res; ++} ++ ++int ++b16_b8_2 () ++{ ++ svbool_t res = svptrue_pat_b16 (SV_VL32); ++ return svptest_any (svptrue_b8 (), res); ++} ++ ++void ++b16_b32_1 (int *any, svbool_t *ptr) ++{ ++ svbool_t res = svptrue_pat_b16 (SV_VL16); ++ *any = svptest_any (svptrue_b32 (), res); ++ *ptr = res; ++} ++ ++int ++b16_b32_2 () ++{ ++ svbool_t res = svptrue_pat_b16 (SV_VL16); ++ return svptest_any (svptrue_b32 (), res); ++} ++ ++void ++b16_b64_1 (int *any, svbool_t *ptr) ++{ ++ svbool_t res = svptrue_pat_b16 (SV_VL64); ++ *any = svptest_any (svptrue_b64 (), res); ++ *ptr = res; ++} ++ ++int ++b16_b64_2 () ++{ ++ svbool_t res = svptrue_pat_b16 (SV_VL64); ++ return svptest_any (svptrue_b64 (), res); ++} ++ ++void ++b32_b8_1 (int *any, svbool_t *ptr) ++{ ++ svbool_t res = svptrue_pat_b32 (SV_VL16); ++ *any = svptest_any (svptrue_b8 (), res); ++ *ptr = res; ++} ++ ++int ++b32_b8_2 () ++{ ++ svbool_t res = svptrue_pat_b32 (SV_VL16); ++ return svptest_any (svptrue_b8 (), res); ++} ++ ++void ++b32_b16_1 (int *any, svbool_t *ptr) ++{ ++ svbool_t res = svptrue_pat_b32 (SV_VL6); ++ *any = svptest_any (svptrue_b16 (), res); ++ *ptr = res; ++} ++ ++int ++b32_b16_2 () ++{ ++ svbool_t res = svptrue_pat_b32 (SV_VL6); ++ return svptest_any (svptrue_b16 (), res); ++} ++ ++void ++b32_b64_1 (int *any, svbool_t *ptr) ++{ ++ svbool_t res = svptrue_pat_b32 (SV_VL32); ++ *any = svptest_any (svptrue_b64 (), res); ++ *ptr = res; ++} ++ ++int ++b32_b64_2 () ++{ ++ svbool_t res = svptrue_pat_b32 (SV_VL32); ++ return svptest_any (svptrue_b64 (), res); ++} ++ ++void ++b64_b8_1 (int *any, svbool_t *ptr) ++{ ++ svbool_t res = svptrue_pat_b64 (SV_VL7); ++ *any = svptest_any (svptrue_b8 (), res); ++ *ptr = res; ++} ++ ++int ++b64_b8_2 () ++{ ++ svbool_t res = svptrue_pat_b64 (SV_VL7); ++ return svptest_any (svptrue_b8 (), res); ++} ++ ++void ++b64_b16_1 (int *any, svbool_t *ptr) ++{ ++ svbool_t res = svptrue_pat_b64 (SV_VL16); ++ *any = svptest_any (svptrue_b16 (), res); ++ *ptr = res; ++} ++ ++int ++b64_b16_2 () ++{ ++ svbool_t res = svptrue_pat_b64 (SV_VL16); ++ return svptest_any (svptrue_b16 (), res); ++} ++ ++void ++b64_b32_1 (int *any, svbool_t *ptr) ++{ ++ svbool_t res = svptrue_pat_b64 (SV_VL32); ++ *any = svptest_any (svptrue_b32 (), res); ++ *ptr = res; ++} ++ ++int ++b64_b32_2 () ++{ ++ svbool_t res = svptrue_pat_b64 (SV_VL32); ++ return svptest_any (svptrue_b32 (), res); ++} ++ ++/* { dg-final { scan-assembler-not {\tptrues\n} } } */ ++/* { dg-final { scan-assembler-times {\tptrue\t} 48 } } */ ++/* { dg-final { scan-assembler-times {\tptest\t} 24 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/qincb_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/qincb_1.c +new file mode 100644 +index 000000000..ba512f406 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/qincb_1.c +@@ -0,0 +1,43 @@ ++/* { dg-do compile } */ ++/* { 
dg-additional-options "-O" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/* ++** qincb_s32_s: ++** sqincb x0, w0, all, mul #15 ++** ret ++*/ ++uint64_t qincb_s32_s (int32_t x) { return svqincb (x, 15); } ++ ++/* ++** qincb_s32_z: ++** sqincb x([0-9]+), w0, all, mul #15 ++** uxtw x0, w\1 ++** ret ++*/ ++uint64_t qincb_s32_z (int32_t x) { return (uint32_t) svqincb (x, 15); } ++ ++/* ++** qincb_u32_s: ++** uqincb (w[0-9]+), all, mul #15 ++** sxtw x0, \1 ++** ret ++*/ ++uint64_t qincb_u32_s (uint32_t x) { return (int32_t) svqincb (x, 15); } ++ ++/* ++** qincb_u32_z: ++** uqincb w0, all, mul #15 ++** ret ++*/ ++uint64_t qincb_u32_z (uint32_t x) { return svqincb (x, 15); } ++ ++#ifdef __cplusplus ++} ++#endif +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/struct_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/struct_1.c +new file mode 100644 +index 000000000..50892c85a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/struct_1.c +@@ -0,0 +1,16 @@ ++#include ++ ++void ++f (svint8x2_t *a, svint8x2_t *b) ++{ ++ svint8_t *ptr; ++ svint8x2_t x = *a; ++ *a = *b; ++ a = &x; ++ (void) (a == b); ++ (void) (a != b); ++ (void) (a < b); ++ (void) (a > b); ++ (void) (a <= b); ++ (void) (a >= b); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/temporaries_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/temporaries_1.c +new file mode 100644 +index 000000000..2543e1e62 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/temporaries_1.c +@@ -0,0 +1,70 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O" } */ ++/* { dg-final { check-function-bodies "**" "" "" { target { ! ilp32 } } } } */ ++ ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/* ++** test_s8: ++** ptrue (p[0-7])\.b, all ++** ld1b (z[0-9]+\.b), \1/z, \[x0\] ++** add \2, \2, #1 ++** st1b \2, \1, \[x1\] ++** ret ++*/ ++void ++test_s8 (int8_t *x, int8_t *y) ++{ ++ int8_t tmp1[32], tmp2[32]; ++ ++ svbool_t pg = svptrue_b8 (); ++ svst1 (pg, tmp1, svld1 (pg, x)); ++ svst1 (pg, tmp2, svadd_x (pg, svld1 (pg, tmp1), 1)); ++ svst1 (pg, y, svld1 (pg, tmp2)); ++} ++ ++/* ++** test_s32_b8: ++** ptrue (p[0-7])\.b, all ++** ld1w (z[0-9]+\.s), \1/z, \[x0\] ++** add \2, \2, #1 ++** st1w \2, \1, \[x1\] ++** ret ++*/ ++void ++test_s32_b8 (int32_t *x, int32_t *y) ++{ ++ int32_t tmp1[8], tmp2[8]; ++ ++ svbool_t pg = svptrue_b8 (); ++ svst1 (pg, tmp1, svld1 (pg, x)); ++ svst1 (pg, tmp2, svadd_x (pg, svld1 (pg, tmp1), 1)); ++ svst1 (pg, y, svld1 (pg, tmp2)); ++} ++ ++/* ++** test_s32_b32: ++** ptrue (p[0-7])\.b, all ++** ld1w (z[0-9]+\.s), \1/z, \[x0\] ++** add \2, \2, #1 ++** st1w \2, \1, \[x1\] ++** ret ++*/ ++void ++test_s32_b32 (int32_t *x, int32_t *y) ++{ ++ int32_t tmp1[8], tmp2[8]; ++ ++ svbool_t pg = svptrue_b32 (); ++ svst1 (pg, tmp1, svld1 (pg, x)); ++ svst1 (pg, tmp2, svadd_x (pg, svld1 (pg, tmp1), 1)); ++ svst1 (pg, y, svld1 (pg, tmp2)); ++} ++ ++#ifdef __cplusplus ++} ++#endif +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_1.c +new file mode 100644 +index 000000000..1d5523e31 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_1.c +@@ -0,0 +1,23 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2" } */ ++ ++#include ++ ++void ++test1 (int32_t x, int32_t y, int *any, svbool_t *ptr) ++{ ++ svbool_t res = svwhilele_b8 (x, y); ++ *any = svptest_last (svptrue_b8 
(), res); ++ *ptr = res; ++} ++ ++int ++test2 (int32_t x, int32_t y) ++{ ++ svbool_t res = svwhilele_b8 (x, y); ++ return svptest_last (svptrue_b8 (), res); ++} ++ ++/* { dg-final { scan-assembler-times {\twhilele\t} 2 } } */ ++/* { dg-final { scan-assembler-not {\tptrue\t} } } */ ++/* { dg-final { scan-assembler-not {\tptest\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_10.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_10.c +new file mode 100644 +index 000000000..ca339c41c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_10.c +@@ -0,0 +1,28 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2" } */ ++ ++#include ++ ++/* { dg-final { scan-assembler-not {\twhilele\t} } } */ ++/* { dg-final { scan-assembler-not {\twhilelt\t} } } */ ++/* { dg-final { scan-assembler-not {\tptrue\t} } } */ ++ ++void ++test1 (svbool_t *ptr) ++{ ++ *ptr = svwhilele_b32_u32 (-1, 0); ++} ++ ++void ++test2 (svbool_t *ptr) ++{ ++ *ptr = svwhilele_b16_u64 (0x80000000, 0); ++} ++ ++void ++test3 (svbool_t *ptr) ++{ ++ *ptr = svwhilele_b8_u64 (0x8000000000000001ULL, 0x7ffffffffffffffeULL); ++} ++ ++/* { dg-final { scan-assembler-times {\tpfalse\tp[0-7]\.b\n} 3 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_2.c +new file mode 100644 +index 000000000..020846007 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_2.c +@@ -0,0 +1,23 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2" } */ ++ ++#include ++ ++void ++test1 (int32_t x, int32_t y, int *any, svbool_t *ptr) ++{ ++ svbool_t res = svwhilele_b16 (x, y); ++ *any = svptest_last (svptrue_b16 (), res); ++ *ptr = res; ++} ++ ++int ++test2 (int32_t x, int32_t y) ++{ ++ svbool_t res = svwhilele_b16 (x, y); ++ return svptest_last (svptrue_b16 (), res); ++} ++ ++/* { dg-final { scan-assembler-times {\twhilele\t} 2 } } */ ++/* { dg-final { scan-assembler-not {\tptrue\t} } } */ ++/* { dg-final { scan-assembler-not {\tptest\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_3.c +new file mode 100644 +index 000000000..4a1045cf6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_3.c +@@ -0,0 +1,23 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2" } */ ++ ++#include ++ ++void ++test1 (int32_t x, int32_t y, int *any, svbool_t *ptr) ++{ ++ svbool_t res = svwhilele_b32 (x, y); ++ *any = svptest_last (svptrue_b32 (), res); ++ *ptr = res; ++} ++ ++int ++test2 (int32_t x, int32_t y) ++{ ++ svbool_t res = svwhilele_b32 (x, y); ++ return svptest_last (svptrue_b32 (), res); ++} ++ ++/* { dg-final { scan-assembler-times {\twhilele\t} 2 } } */ ++/* { dg-final { scan-assembler-not {\tptrue\t} } } */ ++/* { dg-final { scan-assembler-not {\tptest\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_4.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_4.c +new file mode 100644 +index 000000000..f6fb0d099 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_4.c +@@ -0,0 +1,23 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2" } */ ++ ++#include ++ ++void ++test1 (int32_t x, int32_t y, int *any, svbool_t *ptr) ++{ ++ svbool_t res = svwhilele_b64 (x, y); ++ *any = svptest_last (svptrue_b64 (), res); ++ *ptr = res; ++} ++ ++int ++test2 (int32_t x, int32_t y) ++{ ++ svbool_t res = svwhilele_b64 (x, y); ++ 
return svptest_last (svptrue_b64 (), res); ++} ++ ++/* { dg-final { scan-assembler-times {\twhilele\t} 2 } } */ ++/* { dg-final { scan-assembler-not {\tptrue\t} } } */ ++/* { dg-final { scan-assembler-not {\tptest\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_5.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_5.c +new file mode 100644 +index 000000000..ada958b29 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_5.c +@@ -0,0 +1,47 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2" } */ ++ ++#include ++ ++/* { dg-final { scan-assembler-not {\twhilele\t} } } */ ++/* { dg-final { scan-assembler-not {\twhilelt\t} } } */ ++ ++void ++test1 (svbool_t *ptr) ++{ ++ *ptr = svwhilele_b32_s32 (-8, -8); ++} ++ ++/* { dg-final { scan-assembler {\tptrue\tp[0-7]\.[bhsd], vl1\n} } } */ ++ ++void ++test2 (svbool_t *ptr) ++{ ++ *ptr = svwhilele_b16_s64 (-1, 1); ++} ++ ++/* { dg-final { scan-assembler {\tptrue\tp[0-7]\.h, vl3\n} } } */ ++ ++void ++test3 (svbool_t *ptr) ++{ ++ *ptr = svwhilele_b16_s32 (0x7ffffffb, 0x7fffffff); ++} ++ ++/* { dg-final { scan-assembler {\tptrue\tp[0-7]\.h, vl5\n} } } */ ++ ++void ++test4 (svbool_t *ptr) ++{ ++ *ptr = svwhilele_b8_s64 (svcntb (), svcntb () + 6); ++} ++ ++/* { dg-final { scan-assembler {\tptrue\tp[0-7]\.b, vl7\n} } } */ ++ ++void ++test5 (svbool_t *ptr) ++{ ++ *ptr = svwhilele_b64_s64 (0, 1); ++} ++ ++/* { dg-final { scan-assembler {\tptrue\tp[0-7]\.d, vl2\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_6.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_6.c +new file mode 100644 +index 000000000..00d92ba8a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_6.c +@@ -0,0 +1,40 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2" } */ ++ ++#include ++ ++/* { dg-final { scan-assembler-not {\twhilele\t} } } */ ++/* { dg-final { scan-assembler-not {\twhilelt\t} } } */ ++/* { dg-final { scan-assembler-not {\tptrue\t} } } */ ++ ++void ++test1 (svbool_t *ptr) ++{ ++ *ptr = svwhilele_b32_s32 (-8, -9); ++} ++ ++void ++test2 (svbool_t *ptr) ++{ ++ *ptr = svwhilele_b16_s64 (50, -1); ++} ++ ++void ++test3 (svbool_t *ptr) ++{ ++ *ptr = svwhilele_b16_s32 (0x7ffffffb, 0x80000000); ++} ++ ++void ++test4 (svbool_t *ptr) ++{ ++ *ptr = svwhilele_b8_s64 (svcntb (), 15); ++} ++ ++void ++test5 (svbool_t *ptr) ++{ ++ *ptr = svwhilele_b8_s64 (svcntb (), svcntw ()); ++} ++ ++/* { dg-final { scan-assembler-times {\tpfalse\tp[0-7]\.b\n} 5 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_7.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_7.c +new file mode 100644 +index 000000000..92488f597 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_7.c +@@ -0,0 +1,31 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2" } */ ++ ++#include ++ ++/* { dg-final { scan-assembler-not {\twhilel[et]\t} } } */ ++/* { dg-final { scan-assembler-not {\tpfalse\t} } } */ ++ ++void ++test1 (svbool_t *ptr) ++{ ++ *ptr = svwhilele_b8_s32 (-svcnth (), svcnth () - 1); ++} ++ ++/* { dg-final { scan-assembler {\tptrue\tp[0-7]\.b, all\n} } } */ ++ ++void ++test2 (svbool_t *ptr) ++{ ++ *ptr = svwhilele_b16_s64 (1, svcntw () * 2); ++} ++ ++/* { dg-final { scan-assembler {\tptrue\tp[0-7]\.h, all\n} } } */ ++ ++void ++test3 (svbool_t *ptr) ++{ ++ *ptr = svwhilele_b32_s32 (svcntd (), svcntw () + svcntd () - 1); ++} ++ ++/* { dg-final { scan-assembler {\tptrue\tp[0-7]\.s, all\n} } } */ +diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_9.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_9.c +new file mode 100644 +index 000000000..e7f81a86f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_9.c +@@ -0,0 +1,31 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2" } */ ++ ++#include ++ ++/* { dg-final { scan-assembler-not {\twhilele\t} } } */ ++/* { dg-final { scan-assembler-not {\twhilelt\t} } } */ ++ ++void ++test1 (svbool_t *ptr) ++{ ++ *ptr = svwhilele_b32_u32 (1, 3); ++} ++ ++/* { dg-final { scan-assembler {\tptrue\tp[0-7]\.s, vl3\n} } } */ ++ ++void ++test2 (svbool_t *ptr) ++{ ++ *ptr = svwhilele_b16_u64 (svcntd (), svcntd () + 5); ++} ++ ++/* { dg-final { scan-assembler {\tptrue\tp[0-7]\.h, vl6\n} } } */ ++ ++void ++test3 (svbool_t *ptr) ++{ ++ *ptr = svwhilele_b8_u32 (0x7ffffffb, 0x80000002); ++} ++ ++/* { dg-final { scan-assembler {\tptrue\tp[0-7]\.b, vl8\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilelt_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilelt_1.c +new file mode 100644 +index 000000000..5c8f97e2f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilelt_1.c +@@ -0,0 +1,47 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2" } */ ++ ++#include ++ ++/* { dg-final { scan-assembler-not {\twhilele\t} } } */ ++/* { dg-final { scan-assembler-not {\twhilelt\t} } } */ ++ ++void ++test1 (svbool_t *ptr) ++{ ++ *ptr = svwhilelt_b32_s32 (-8, -7); ++} ++ ++/* { dg-final { scan-assembler {\tptrue\tp[0-7]\.[bhsd], vl1\n} } } */ ++ ++void ++test2 (svbool_t *ptr) ++{ ++ *ptr = svwhilelt_b16_s64 (-1, 2); ++} ++ ++/* { dg-final { scan-assembler {\tptrue\tp[0-7]\.h, vl3\n} } } */ ++ ++void ++test3 (svbool_t *ptr) ++{ ++ *ptr = svwhilelt_b16_s32 (0x7ffffffa, 0x7fffffff); ++} ++ ++/* { dg-final { scan-assembler {\tptrue\tp[0-7]\.h, vl5\n} } } */ ++ ++void ++test4 (svbool_t *ptr) ++{ ++ *ptr = svwhilelt_b8_s64 (svcntb (), svcntb () + 7); ++} ++ ++/* { dg-final { scan-assembler {\tptrue\tp[0-7]\.b, vl7\n} } } */ ++ ++void ++test5 (svbool_t *ptr) ++{ ++ *ptr = svwhilelt_b64_s64 (0, 2); ++} ++ ++/* { dg-final { scan-assembler {\tptrue\tp[0-7]\.d, vl2\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilelt_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilelt_2.c +new file mode 100644 +index 000000000..2be3a5b0c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilelt_2.c +@@ -0,0 +1,40 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2" } */ ++ ++#include ++ ++/* { dg-final { scan-assembler-not {\twhilele\t} } } */ ++/* { dg-final { scan-assembler-not {\twhilelt\t} } } */ ++/* { dg-final { scan-assembler-not {\tptrue\t} } } */ ++ ++void ++test1 (svbool_t *ptr) ++{ ++ *ptr = svwhilelt_b32_s32 (0, 0); ++} ++ ++void ++test2 (svbool_t *ptr) ++{ ++ *ptr = svwhilelt_b16_s64 (50, -1); ++} ++ ++void ++test3 (svbool_t *ptr) ++{ ++ *ptr = svwhilelt_b16_s32 (0x7ffffffb, 0x80000000); ++} ++ ++void ++test4 (svbool_t *ptr) ++{ ++ *ptr = svwhilelt_b8_s64 (svcntb (), svcntb ()); ++} ++ ++void ++test5 (svbool_t *ptr) ++{ ++ *ptr = svwhilelt_b8_s64 (svcntb (), svcntw ()); ++} ++ ++/* { dg-final { scan-assembler-times {\tpfalse\tp[0-7]\.b\n} 5 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilelt_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilelt_3.c +new file mode 100644 +index 000000000..650b2652f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilelt_3.c +@@ 
-0,0 +1,31 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2" } */ ++ ++#include ++ ++/* { dg-final { scan-assembler-not {\twhilel[et]\t} } } */ ++/* { dg-final { scan-assembler-not {\tpfalse\t} } } */ ++ ++void ++test1 (svbool_t *ptr) ++{ ++ *ptr = svwhilelt_b8_s32 (-svcnth (), svcnth ()); ++} ++ ++/* { dg-final { scan-assembler {\tptrue\tp[0-7]\.b, all\n} } } */ ++ ++void ++test2 (svbool_t *ptr) ++{ ++ *ptr = svwhilelt_b16_s64 (0, svcntw () * 2); ++} ++ ++/* { dg-final { scan-assembler {\tptrue\tp[0-7]\.h, all\n} } } */ ++ ++void ++test3 (svbool_t *ptr) ++{ ++ *ptr = svwhilelt_b32_s32 (svcntd (), svcntw () + svcntd ()); ++} ++ ++/* { dg-final { scan-assembler {\tptrue\tp[0-7]\.s, all\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/adr_1.c b/gcc/testsuite/gcc.target/aarch64/sve/adr_1.c +new file mode 100644 +index 000000000..223351c2f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/adr_1.c +@@ -0,0 +1,46 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#ifndef FACTOR ++#define FACTOR 2 ++#endif ++ ++#define LOOP(TYPE) \ ++ __attribute__ ((noipa)) \ ++ void \ ++ test_##TYPE (TYPE *restrict dst, TYPE *restrict src, \ ++ int count) \ ++ { \ ++ for (int i = 0; i < count; ++i) \ ++ dst[i] += src[i] * FACTOR; \ ++ } ++ ++#define TEST_ALL(T) \ ++ T (int8_t) \ ++ T (int16_t) \ ++ T (int32_t) \ ++ T (int64_t) \ ++ T (uint8_t) \ ++ T (uint16_t) \ ++ T (uint32_t) \ ++ T (uint64_t) ++ ++TEST_ALL (LOOP) ++ ++/* { dg-final { scan-assembler-times {\tadd\tz[0-9]\.b,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]\.b,} 2 } } */ ++/* { dg-final { scan-assembler-not {\tadr\tz[0-9]\.b,} } } */ ++ ++/* { dg-final { scan-assembler-times {\tadd\tz[0-9]\.h,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]\.h,} 2 } } */ ++/* { dg-final { scan-assembler-not {\tadr\tz[0-9]\.h,} } } */ ++ ++/* { dg-final { scan-assembler-not {\tadd\tz[0-9]\.s,} } } */ ++/* { dg-final { scan-assembler-not {\tlsl\tz[0-9]\.s,} } } */ ++/* { dg-final { scan-assembler-times {\tadr\tz[0-9]\.s, \[z[0-9]\.s, z[0-9]\.s, lsl 1\]} 2 } } */ ++ ++/* { dg-final { scan-assembler-not {\tadd\tz[0-9]\.d,} } } */ ++/* { dg-final { scan-assembler-not {\tlsl\tz[0-9]\.d,} } } */ ++/* { dg-final { scan-assembler-times {\tadr\tz[0-9]\.d, \[z[0-9]\.d, z[0-9]\.d, lsl 1\]} 2 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/adr_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/adr_1_run.c +new file mode 100644 +index 000000000..383a90c24 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/adr_1_run.c +@@ -0,0 +1,31 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "adr_1.c" ++ ++#define N 131 ++ ++#define TEST_LOOP(TYPE) \ ++ { \ ++ TYPE a[N], b[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a[i] = (TYPE) i * i + i % 5; \ ++ b[i] = (TYPE) i * 3 + i % 7; \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##TYPE (a, b, N); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ TYPE expected = ((TYPE) (i * i + i % 5) \ ++ + ((TYPE) i * 3 + i % 7) * FACTOR); \ ++ if (a[i] != expected) \ ++ __builtin_abort (); \ ++ } \ ++ } ++ ++int __attribute__ ((optimize (1))) ++main (void) ++{ ++ TEST_ALL (TEST_LOOP) ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/adr_2.c b/gcc/testsuite/gcc.target/aarch64/sve/adr_2.c +new file mode 100644 +index 000000000..dc20ddbad +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/adr_2.c +@@ -0,0 +1,21 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 
-ftree-vectorize" } */ ++ ++#define FACTOR 4 ++#include "adr_1.c" ++ ++/* { dg-final { scan-assembler-times {\tadd\tz[0-9]\.b,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]\.b,} 2 } } */ ++/* { dg-final { scan-assembler-not {\tadr\tz[0-9]\.b,} } } */ ++ ++/* { dg-final { scan-assembler-times {\tadd\tz[0-9]\.h,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]\.h,} 2 } } */ ++/* { dg-final { scan-assembler-not {\tadr\tz[0-9]\.h,} } } */ ++ ++/* { dg-final { scan-assembler-not {\tadd\tz[0-9]\.s,} } } */ ++/* { dg-final { scan-assembler-not {\tlsl\tz[0-9]\.s,} } } */ ++/* { dg-final { scan-assembler-times {\tadr\tz[0-9]\.s, \[z[0-9]\.s, z[0-9]\.s, lsl 2\]} 2 } } */ ++ ++/* { dg-final { scan-assembler-not {\tadd\tz[0-9]\.d,} } } */ ++/* { dg-final { scan-assembler-not {\tlsl\tz[0-9]\.d,} } } */ ++/* { dg-final { scan-assembler-times {\tadr\tz[0-9]\.d, \[z[0-9]\.d, z[0-9]\.d, lsl 2\]} 2 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/adr_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/adr_2_run.c +new file mode 100644 +index 000000000..e823d3d0a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/adr_2_run.c +@@ -0,0 +1,5 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#define FACTOR 4 ++#include "adr_1_run.c" +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/adr_3.c b/gcc/testsuite/gcc.target/aarch64/sve/adr_3.c +new file mode 100644 +index 000000000..b0cb180dd +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/adr_3.c +@@ -0,0 +1,21 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#define FACTOR 8 ++#include "adr_1.c" ++ ++/* { dg-final { scan-assembler-times {\tadd\tz[0-9]\.b,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]\.b,} 2 } } */ ++/* { dg-final { scan-assembler-not {\tadr\tz[0-9]\.b,} } } */ ++ ++/* { dg-final { scan-assembler-times {\tadd\tz[0-9]\.h,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]\.h,} 2 } } */ ++/* { dg-final { scan-assembler-not {\tadr\tz[0-9]\.h,} } } */ ++ ++/* { dg-final { scan-assembler-not {\tadd\tz[0-9]\.s,} } } */ ++/* { dg-final { scan-assembler-not {\tlsl\tz[0-9]\.s,} } } */ ++/* { dg-final { scan-assembler-times {\tadr\tz[0-9]\.s, \[z[0-9]\.s, z[0-9]\.s, lsl 3\]} 2 } } */ ++ ++/* { dg-final { scan-assembler-not {\tadd\tz[0-9]\.d,} } } */ ++/* { dg-final { scan-assembler-not {\tlsl\tz[0-9]\.d,} } } */ ++/* { dg-final { scan-assembler-times {\tadr\tz[0-9]\.d, \[z[0-9]\.d, z[0-9]\.d, lsl 3\]} 2 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/adr_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/adr_3_run.c +new file mode 100644 +index 000000000..721dd68ef +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/adr_3_run.c +@@ -0,0 +1,5 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#define FACTOR 8 ++#include "adr_1_run.c" +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/adr_4.c b/gcc/testsuite/gcc.target/aarch64/sve/adr_4.c +new file mode 100644 +index 000000000..7c039ba13 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/adr_4.c +@@ -0,0 +1,9 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#define FACTOR 16 ++#include "adr_1.c" ++ ++/* { dg-final { scan-assembler-times {\tadd\tz[0-9]\.[bhsd],} 8 } } */ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]\.[bhsd],} 8 } } */ ++/* { dg-final { scan-assembler-not {\tadr\tz[0-9]\.[bhsd],} } } */ +diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/adr_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve/adr_4_run.c +new file mode 100644 +index 000000000..3fb9099e1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/adr_4_run.c +@@ -0,0 +1,5 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#define FACTOR 16 ++#include "adr_1_run.c" +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/adr_5.c b/gcc/testsuite/gcc.target/aarch64/sve/adr_5.c +new file mode 100644 +index 000000000..ce3991cb2 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/adr_5.c +@@ -0,0 +1,27 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define LOOP(FACTOR) \ ++ __attribute__ ((noipa)) \ ++ void \ ++ test_##FACTOR (uint64_t *restrict dst, \ ++ uint64_t *restrict src, int count) \ ++ { \ ++ for (int i = 0; i < count; ++i) \ ++ dst[i] += (src[i] & 0xffffffff) * FACTOR; \ ++ } ++ ++#define TEST_ALL(T) T (1) T (2) T (4) T (8) ++ ++TEST_ALL (LOOP) ++ ++/* { dg-final { scan-assembler-not {\tadd\tz[0-9]\.d,} } } */ ++/* { dg-final { scan-assembler-not {\tlsl\tz[0-9]\.d,} } } */ ++/* { dg-final { scan-assembler-not {\tand\tz[0-9]\.d,} } } */ ++/* { dg-final { scan-assembler-not {\tuxtw\tz[0-9]\.d,} } } */ ++/* { dg-final { scan-assembler-times {\tadr\tz[0-9]\.d, \[z[0-9]\.d, z[0-9]\.d, uxtw\]} 1 } } */ ++/* { dg-final { scan-assembler-times {\tadr\tz[0-9]\.d, \[z[0-9]\.d, z[0-9]\.d, uxtw 1\]} 1 } } */ ++/* { dg-final { scan-assembler-times {\tadr\tz[0-9]\.d, \[z[0-9]\.d, z[0-9]\.d, uxtw 2\]} 1 } } */ ++/* { dg-final { scan-assembler-times {\tadr\tz[0-9]\.d, \[z[0-9]\.d, z[0-9]\.d, uxtw 3\]} 1 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/adr_5_run.c b/gcc/testsuite/gcc.target/aarch64/sve/adr_5_run.c +new file mode 100644 +index 000000000..025c38d23 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/adr_5_run.c +@@ -0,0 +1,32 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "adr_5.c" ++ ++#define N 131 ++ ++#define TEST_LOOP(FACTOR) \ ++ { \ ++ uint64_t a[N], b[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a[i] = (uint64_t) i * i + i % 5; \ ++ b[i] = (uint64_t) (i * 3) << ((i & 7) * 8); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##FACTOR (a, b, N); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ uint64_t expected = ((uint64_t) (i * i + i % 5) \ ++ + (((uint64_t) (i * 3) << ((i & 7) * 8)) \ ++ & 0xffffffff) * FACTOR); \ ++ if (a[i] != expected) \ ++ __builtin_abort (); \ ++ } \ ++ } ++ ++int __attribute__ ((optimize (1))) ++main (void) ++{ ++ TEST_ALL (TEST_LOOP) ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/asrdiv_1.c b/gcc/testsuite/gcc.target/aarch64/sve/asrdiv_1.c +new file mode 100644 +index 000000000..615d8b885 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/asrdiv_1.c +@@ -0,0 +1,51 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-details --save-temps" } */ ++ ++#include ++ ++#define SIGNED(S) int##S##_t ++ ++#define DIV(x,y) ((x)/(y)) ++#define MOD(x,y) ((x)%(y)) ++ ++#define TEMPLATE(OP,SIZE) \ ++void __attribute__ ((noinline, noclone)) \ ++f_##OP##_##SIZE (SIGNED(SIZE) *restrict a, SIGNED(SIZE) *restrict b, \ ++ __INTPTR_TYPE__ n) \ ++{ \ ++ for (__INTPTR_TYPE__ i = 0; i < n; ++i) \ ++ a[i] = OP (b[i], ((SIGNED(SIZE))1 << ((SIZE)/2+1))); \ ++} ++#define DIVMOD(SIZE) \ ++TEMPLATE (DIV,SIZE); \ ++TEMPLATE (MOD,SIZE); ++ ++DIVMOD (8); ++DIVMOD (16); ++DIVMOD (32); ++DIVMOD 
(64); ++ ++/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 8 "vect" } } */ ++ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+, z[0-9]+\n} 4 } } */ ++ ++/* { dg-final { scan-assembler-times {\tasrd\tz[0-9]+\.b, p[0-9]+/m, z[0-9]+\.b, #5\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #5\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tasrd\tz[0-9]+\.h, p[0-9]+/m, z[0-9]+\.h, #9\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #9\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tasrd\tz[0-9]+\.s, p[0-9]+/m, z[0-9]+\.s, #17\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #17\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tasrd\tz[0-9]+\.d, p[0-9]+/m, z[0-9]+\.d, #33\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #33\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-not {\tasr\t%} } } */ ++/* { dg-final { scan-assembler-not {\tlsr\t%} } } */ ++/* { dg-final { scan-assembler-not {\tcmplt\t%} } } */ ++/* { dg-final { scan-assembler-not {\tand\t%} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/clastb_8.c b/gcc/testsuite/gcc.target/aarch64/sve/clastb_8.c +new file mode 100644 +index 000000000..d86a428a7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/clastb_8.c +@@ -0,0 +1,25 @@ ++/* { dg-do assemble { target aarch64_asm_sve_ok } } */ ++/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=256 --save-temps" } */ ++ ++#include ++ ++#define TEST_TYPE(TYPE) \ ++ void \ ++ test_##TYPE (TYPE *ptr, TYPE *a, TYPE *b, TYPE min_v) \ ++ { \ ++ TYPE last = *ptr; \ ++ for (int i = 0; i < 1024; i++) \ ++ if (a[i] < min_v) \ ++ last = b[i]; \ ++ *ptr = last; \ ++ } ++ ++TEST_TYPE (uint8_t); ++TEST_TYPE (uint16_t); ++TEST_TYPE (uint32_t); ++TEST_TYPE (uint64_t); ++ ++/* { dg-final { scan-assembler {\tclastb\t(b[0-9]+), p[0-7], \1, z[0-9]+\.b\n} } } */ ++/* { dg-final { scan-assembler {\tclastb\t(h[0-9]+), p[0-7], \1, z[0-9]+\.h\n} } } */ ++/* { dg-final { scan-assembler {\tclastb\t(s[0-9]+), p[0-7], \1, z[0-9]+\.s\n} } } */ ++/* { dg-final { scan-assembler {\tclastb\t(d[0-9]+), p[0-7], \1, z[0-9]+\.d\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/clrsb_1.c b/gcc/testsuite/gcc.target/aarch64/sve/clrsb_1.c +new file mode 100644 +index 000000000..bdc9856fa +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/clrsb_1.c +@@ -0,0 +1,22 @@ ++/* { dg-do assemble { target aarch64_asm_sve_ok } } */ ++/* { dg-options "-O2 -ftree-vectorize --save-temps" } */ ++ ++#include ++ ++void __attribute__ ((noinline, noclone)) ++clrsb_32 (unsigned int *restrict dst, uint32_t *restrict src, int size) ++{ ++ for (int i = 0; i < size; ++i) ++ dst[i] = __builtin_clrsb (src[i]); ++} ++ ++void __attribute__ ((noinline, noclone)) ++clrsb_64 (unsigned int *restrict dst, uint64_t *restrict src, int size) ++{ ++ for (int i = 0; i < size; ++i) ++ dst[i] = __builtin_clrsbll (src[i]); ++} ++ ++/* { dg-final { scan-assembler-times {\tcls\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tcls\tz[0-9]+\.d, 
p[0-7]/m, z[0-9]+\.d\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tuzp1\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/clrsb_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/clrsb_1_run.c +new file mode 100644 +index 000000000..287630d7f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/clrsb_1_run.c +@@ -0,0 +1,50 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "clrsb_1.c" ++ ++extern void abort (void) __attribute__ ((noreturn)); ++ ++unsigned int data[] = { ++ 0xffffff80, 24, ++ 0xffffffff, 31, ++ 0x00000000, 31, ++ 0x80000000, 0, ++ 0x7fffffff, 0, ++ 0x000003ff, 21, ++ 0x1fffffff, 2, ++ 0x0000ffff, 15, ++ 0xffff0000, 15 ++}; ++ ++int __attribute__ ((optimize (1))) ++main (void) ++{ ++ unsigned int count = sizeof (data) / sizeof (data[0]) / 2; ++ ++ uint32_t in32[count]; ++ unsigned int out32[count]; ++ for (unsigned int i = 0; i < count; ++i) ++ { ++ in32[i] = data[i * 2]; ++ asm volatile ("" ::: "memory"); ++ } ++ clrsb_32 (out32, in32, count); ++ for (unsigned int i = 0; i < count; ++i) ++ if (out32[i] != data[i * 2 + 1]) ++ abort (); ++ ++ uint64_t in64[count]; ++ unsigned int out64[count]; ++ for (unsigned int i = 0; i < count; ++i) ++ { ++ in64[i] = (uint64_t) data[i * 2] << 32; ++ asm volatile ("" ::: "memory"); ++ } ++ clrsb_64 (out64, in64, count); ++ for (unsigned int i = 0; i < count; ++i) ++ if (out64[i] != (data[i * 2] ? data[i * 2 + 1] : 63)) ++ abort (); ++ ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/clz_1.c b/gcc/testsuite/gcc.target/aarch64/sve/clz_1.c +new file mode 100644 +index 000000000..0c7a4e6d7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/clz_1.c +@@ -0,0 +1,22 @@ ++/* { dg-do assemble { target aarch64_asm_sve_ok } } */ ++/* { dg-options "-O2 -ftree-vectorize --save-temps" } */ ++ ++#include ++ ++void __attribute__ ((noinline, noclone)) ++clz_32 (unsigned int *restrict dst, uint32_t *restrict src, int size) ++{ ++ for (int i = 0; i < size; ++i) ++ dst[i] = __builtin_clz (src[i]); ++} ++ ++void __attribute__ ((noinline, noclone)) ++clz_64 (unsigned int *restrict dst, uint64_t *restrict src, int size) ++{ ++ for (int i = 0; i < size; ++i) ++ dst[i] = __builtin_clzll (src[i]); ++} ++ ++/* { dg-final { scan-assembler-times {\tclz\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tclz\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tuzp1\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/clz_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/clz_1_run.c +new file mode 100644 +index 000000000..12d9cf276 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/clz_1_run.c +@@ -0,0 +1,50 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "clz_1.c" ++ ++extern void abort (void) __attribute__ ((noreturn)); ++ ++unsigned int data[] = { ++ 0xffffff80, 0, ++ 0xffffffff, 0, ++ 0x00000000, 32, ++ 0x80000000, 0, ++ 0x7fffffff, 1, ++ 0x000003ff, 22, ++ 0x1fffffff, 3, ++ 0x0000ffff, 16, ++ 0xffff0000, 0 ++}; ++ ++int __attribute__ ((optimize (1))) ++main (void) ++{ ++ unsigned int count = sizeof (data) / sizeof (data[0]) / 2; ++ ++ uint32_t in32[count]; ++ unsigned int out32[count]; ++ for (unsigned int i = 0; i < count; ++i) ++ { ++ in32[i] = data[i * 2]; ++ asm volatile ("" ::: "memory"); ++ } ++ clz_32 (out32, in32, count); ++ for 
(unsigned int i = 0; i < count; ++i) ++ if (out32[i] != data[i * 2 + 1]) ++ abort (); ++ ++ uint64_t in64[count]; ++ unsigned int out64[count]; ++ for (unsigned int i = 0; i < count; ++i) ++ { ++ in64[i] = (uint64_t) data[i * 2] << 10; ++ asm volatile ("" ::: "memory"); ++ } ++ clz_64 (out64, in64, count); ++ for (unsigned int i = 0; i < count; ++i) ++ if (out64[i] != (data[i * 2] ? data[i * 2 + 1] + 22 : 64)) ++ abort (); ++ ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cnot_1.c b/gcc/testsuite/gcc.target/aarch64/sve/cnot_1.c +new file mode 100644 +index 000000000..5fa33461c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cnot_1.c +@@ -0,0 +1,30 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define DEF_LOOP(TYPE) \ ++ void __attribute__ ((noipa)) \ ++ test_##TYPE (TYPE *restrict r, TYPE *restrict a, int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ r[i] = !a[i]; \ ++ } ++ ++#define TEST_ALL(T) \ ++ T (int8_t) \ ++ T (int16_t) \ ++ T (int32_t) \ ++ T (int64_t) \ ++ T (uint8_t) \ ++ T (uint16_t) \ ++ T (uint32_t) \ ++ T (uint64_t) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ ++/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d\n} 2 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_1.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_1.c +new file mode 100644 +index 000000000..c02e8ae8e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_1.c +@@ -0,0 +1,42 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define abd(A, B) (((A) < (B) ? (B) : (A)) - ((A) < (B) ? (A) : (B))) ++ ++#define DEF_LOOP(TYPE) \ ++ void __attribute__ ((noinline, noclone)) \ ++ test_##TYPE (TYPE *__restrict r, TYPE *__restrict a, \ ++ TYPE *__restrict b, TYPE *__restrict c, \ ++ int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ r[i] = a[i] < 20 ? 
abd (b[i], c[i]) : b[i]; \ ++ } ++ ++#define TEST_ALL(T) \ ++ T (int8_t) \ ++ T (uint8_t) \ ++ T (int16_t) \ ++ T (uint16_t) \ ++ T (int32_t) \ ++ T (uint32_t) \ ++ T (int64_t) \ ++ T (uint64_t) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ ++/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_1_run.c +new file mode 100644 +index 000000000..a45beefc2 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_1_run.c +@@ -0,0 +1,33 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_abd_1.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(TYPE) \ ++ { \ ++ TYPE r[N], a[N], b[N], c[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a[i] = (i & 1 ? i : 3 * i); \ ++ b[i] = (i >> 4) << (i & 15); \ ++ c[i] = ((i + 2) % 3) * (i + 1); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##TYPE (r, a, b, c, N); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ TYPE expected = a[i] < 20 ? abd (b[i], c[i]) : b[i]; \ ++ if (r[i] != expected) \ ++ __builtin_abort (); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ } ++ ++int ++main (void) ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_2.c +new file mode 100644 +index 000000000..97901b6f8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_2.c +@@ -0,0 +1,42 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define abd(A, B) (((A) < (B) ? (B) : (A)) - ((A) < (B) ? (A) : (B))) ++ ++#define DEF_LOOP(TYPE) \ ++ void __attribute__ ((noinline, noclone)) \ ++ test_##TYPE (TYPE *__restrict r, TYPE *__restrict a, \ ++ TYPE *__restrict b, TYPE *__restrict c, \ ++ int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ r[i] = a[i] < 20 ? 
abd (b[i], c[i]) : c[i]; \ ++ } ++ ++#define TEST_ALL(T) \ ++ T (int8_t) \ ++ T (uint8_t) \ ++ T (int16_t) \ ++ T (uint16_t) \ ++ T (int32_t) \ ++ T (uint32_t) \ ++ T (int64_t) \ ++ T (uint64_t) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ ++/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_2_run.c +new file mode 100644 +index 000000000..474bc0f9a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_2_run.c +@@ -0,0 +1,33 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_abd_2.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(TYPE) \ ++ { \ ++ TYPE r[N], a[N], b[N], c[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a[i] = (i & 1 ? i : 3 * i); \ ++ b[i] = (i >> 4) << (i & 15); \ ++ c[i] = ((i + 2) % 3) * (i + 1); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##TYPE (r, a, b, c, N); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ TYPE expected = a[i] < 20 ? abd (b[i], c[i]) : c[i]; \ ++ if (r[i] != expected) \ ++ __builtin_abort (); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ } ++ ++int ++main (void) ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_3.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_3.c +new file mode 100644 +index 000000000..dc8bc3cee +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_3.c +@@ -0,0 +1,46 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define abd(A, B) (((A) < (B) ? (B) : (A)) - ((A) < (B) ? (A) : (B))) ++ ++#define DEF_LOOP(TYPE) \ ++ void __attribute__ ((noinline, noclone)) \ ++ test_##TYPE (TYPE *__restrict r, TYPE *__restrict a, \ ++ TYPE *__restrict b, TYPE *__restrict c, \ ++ int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ r[i] = a[i] < 20 ? 
abd (b[i], c[i]) : a[i]; \ ++ } ++ ++#define TEST_ALL(T) \ ++ T (int8_t) \ ++ T (uint8_t) \ ++ T (int16_t) \ ++ T (uint16_t) \ ++ T (int32_t) \ ++ T (uint32_t) \ ++ T (int64_t) \ ++ T (uint64_t) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d\n} 2 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_3_run.c +new file mode 100644 +index 000000000..9f1ac2df8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_3_run.c +@@ -0,0 +1,33 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_abd_3.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(TYPE) \ ++ { \ ++ TYPE r[N], a[N], b[N], c[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a[i] = (i & 1 ? i : 3 * i); \ ++ b[i] = (i >> 4) << (i & 15); \ ++ c[i] = ((i + 2) % 3) * (i + 1); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##TYPE (r, a, b, c, N); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ TYPE expected = a[i] < 20 ? abd (b[i], c[i]) : a[i]; \ ++ if (r[i] != expected) \ ++ __builtin_abort (); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ } ++ ++int ++main (void) ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_4.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_4.c +new file mode 100644 +index 000000000..5c65e59ed +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_4.c +@@ -0,0 +1,42 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define abd(A, B) (((A) < (B) ? (B) : (A)) - ((A) < (B) ? (A) : (B))) ++ ++#define DEF_LOOP(TYPE) \ ++ void __attribute__ ((noinline, noclone)) \ ++ test_##TYPE (TYPE *__restrict r, TYPE *__restrict a, \ ++ TYPE *__restrict b, TYPE *__restrict c, \ ++ int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ r[i] = a[i] < 20 ? 
abd (b[i], c[i]) : 79; \ ++ } ++ ++#define TEST_ALL(T) \ ++ T (int8_t) \ ++ T (uint8_t) \ ++ T (int16_t) \ ++ T (uint16_t) \ ++ T (int32_t) \ ++ T (uint32_t) \ ++ T (int64_t) \ ++ T (uint64_t) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ ++/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ ++/* { dg-final { scan-assembler-times {\tsel\t} 8 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_4_run.c +new file mode 100644 +index 000000000..47fd9e09f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_4_run.c +@@ -0,0 +1,33 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_abd_4.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(TYPE) \ ++ { \ ++ TYPE r[N], a[N], b[N], c[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a[i] = (i & 1 ? i : 3 * i); \ ++ b[i] = (i >> 4) << (i & 15); \ ++ c[i] = ((i + 2) % 3) * (i + 1); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##TYPE (r, a, b, c, N); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ TYPE expected = a[i] < 20 ? abd (b[i], c[i]) : 79; \ ++ if (r[i] != expected) \ ++ __builtin_abort (); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ } ++ ++int ++main (void) ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_5.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_5.c +new file mode 100644 +index 000000000..f2c013158 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_5.c +@@ -0,0 +1,46 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define abd(A, B) (((A) < (B) ? (B) : (A)) - ((A) < (B) ? (A) : (B))) ++ ++#define DEF_LOOP(TYPE) \ ++ void __attribute__ ((noinline, noclone)) \ ++ test_##TYPE (TYPE *__restrict r, TYPE *__restrict a, \ ++ TYPE *__restrict b, TYPE *__restrict c, \ ++ int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ r[i] = a[i] < 20 ? 
abd (b[i], c[i]) : 0; \ ++ } ++ ++#define TEST_ALL(T) \ ++ T (int8_t) \ ++ T (uint8_t) \ ++ T (int16_t) \ ++ T (uint16_t) \ ++ T (int32_t) \ ++ T (uint32_t) \ ++ T (int64_t) \ ++ T (uint64_t) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.b, p[0-7]/z, z[0-9]+\.b\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/z, z[0-9]+\.h\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/z, z[0-9]+\.s\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/z, z[0-9]+\.d\n} 2 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_5_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_5_run.c +new file mode 100644 +index 000000000..7cd44be38 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_5_run.c +@@ -0,0 +1,33 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_abd_5.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(TYPE) \ ++ { \ ++ TYPE r[N], a[N], b[N], c[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a[i] = (i & 1 ? i : 3 * i); \ ++ b[i] = (i >> 4) << (i & 15); \ ++ c[i] = ((i + 2) % 3) * (i + 1); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##TYPE (r, a, b, c, N); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ TYPE expected = a[i] < 20 ? abd (b[i], c[i]) : 0; \ ++ if (r[i] != expected) \ ++ __builtin_abort (); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ } ++ ++int ++main (void) ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_1.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_1.c +new file mode 100644 +index 000000000..bd8776637 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_1.c +@@ -0,0 +1,35 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define DEF_LOOP(TYPE) \ ++ void __attribute__ ((noipa)) \ ++ test_##TYPE (TYPE *__restrict r, TYPE *__restrict a, \ ++ TYPE *__restrict b, int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ r[i] = a[i] == 0 ? 
!b[i] : b[i]; \ ++ } ++ ++#define TEST_ALL(T) \ ++ T (int8_t) \ ++ T (uint8_t) \ ++ T (int16_t) \ ++ T (uint16_t) \ ++ T (int32_t) \ ++ T (uint32_t) \ ++ T (int64_t) \ ++ T (uint64_t) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz} } } */ ++/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ ++/* Currently we canonicalize the ?: so that !b[i] is the "false" value. */ ++/* { dg-final { scan-assembler-not {\tsel\t} { xfail *-*-* } } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_1_run.c +new file mode 100644 +index 000000000..802bcbb2e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_1_run.c +@@ -0,0 +1,32 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_cnot_1.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(TYPE) \ ++ { \ ++ TYPE r[N], a[N], b[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a[i] = (i % 3) < (i % 5); \ ++ b[i] = i % 7 < 3; \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##TYPE (r, a, b, N); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ TYPE expected = a[i] == 0 ? !b[i] : b[i]; \ ++ if (r[i] != expected) \ ++ __builtin_abort (); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ } ++ ++int ++main (void) ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_2.c +new file mode 100644 +index 000000000..3df2431be +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_2.c +@@ -0,0 +1,35 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define DEF_LOOP(TYPE) \ ++ void __attribute__ ((noipa)) \ ++ test_##TYPE (TYPE *__restrict r, TYPE *__restrict a, \ ++ TYPE *__restrict b, int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ r[i] = a[i] == 0 ? !b[i] : a[i]; \ ++ } ++ ++#define TEST_ALL(T) \ ++ T (int8_t) \ ++ T (uint8_t) \ ++ T (int16_t) \ ++ T (uint16_t) \ ++ T (int32_t) \ ++ T (uint32_t) \ ++ T (int64_t) \ ++ T (uint64_t) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz} } } */ ++/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ ++/* Currently we canonicalize the ?: so that !b[i] is the "false" value. 
*/ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_2_run.c +new file mode 100644 +index 000000000..6db8bf14e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_2_run.c +@@ -0,0 +1,32 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_cnot_2.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(TYPE) \ ++ { \ ++ TYPE r[N], a[N], b[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a[i] = (i % 3) < (i % 5); \ ++ b[i] = i % 7 < 3; \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##TYPE (r, a, b, N); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ TYPE expected = a[i] == 0 ? !b[i] : a[i]; \ ++ if (r[i] != expected) \ ++ __builtin_abort (); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ } ++ ++int ++main (void) ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_3.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_3.c +new file mode 100644 +index 000000000..806e51788 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_3.c +@@ -0,0 +1,35 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define DEF_LOOP(TYPE) \ ++ void __attribute__ ((noipa)) \ ++ test_##TYPE (TYPE *__restrict r, TYPE *__restrict a, \ ++ TYPE *__restrict b, int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ r[i] = a[i] == 0 ? !b[i] : 127; \ ++ } ++ ++#define TEST_ALL(T) \ ++ T (int8_t) \ ++ T (uint8_t) \ ++ T (int16_t) \ ++ T (uint16_t) \ ++ T (int32_t) \ ++ T (uint32_t) \ ++ T (int64_t) \ ++ T (uint64_t) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ ++ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+, z[0-9]+\n} 8 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz[^\n]*z} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_3_run.c +new file mode 100644 +index 000000000..6e025e489 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_3_run.c +@@ -0,0 +1,32 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_cnot_3.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(TYPE) \ ++ { \ ++ TYPE r[N], a[N], b[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a[i] = (i % 3) < (i % 5); \ ++ b[i] = i % 7 < 3; \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##TYPE (r, a, b, N); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ TYPE expected = a[i] == 0 ? 
!b[i] : 127; \ ++ if (r[i] != expected) \ ++ __builtin_abort (); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ } ++ ++int ++main (void) ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_1.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_1.c +new file mode 100644 +index 000000000..86064ebfc +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_1.c +@@ -0,0 +1,39 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize -fno-trapping-math" } */ ++ ++#include ++ ++#define DEF_LOOP(FLOAT_TYPE, INT_TYPE) \ ++ void __attribute__ ((noipa)) \ ++ test_##INT_TYPE (FLOAT_TYPE *__restrict r, \ ++ INT_TYPE *__restrict a, \ ++ FLOAT_TYPE *__restrict b, \ ++ INT_TYPE *__restrict pred, int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ { \ ++ FLOAT_TYPE bi = b[i]; \ ++ r[i] = pred[i] ? (FLOAT_TYPE) a[i] : bi; \ ++ } \ ++ } ++ ++#define TEST_ALL(T) \ ++ T (_Float16, int16_t) \ ++ T (_Float16, uint16_t) \ ++ T (float, int32_t) \ ++ T (float, uint32_t) \ ++ T (double, int64_t) \ ++ T (double, uint64_t) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tucvtf\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tucvtf\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tucvtf\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz} } } */ ++/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_1_run.c +new file mode 100644 +index 000000000..1f712b485 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_1_run.c +@@ -0,0 +1,29 @@ ++/* { dg-do run { target { aarch64_sve_hw } } } */ ++/* { dg-options "-O2 -ftree-vectorize -ftrapping-math" } */ ++ ++#include "cond_convert_1.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(FLOAT_TYPE, INT_TYPE) \ ++ { \ ++ FLOAT_TYPE r[N], b[N]; \ ++ INT_TYPE a[N], pred[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a[i] = (i & 1 ? i : 3 * i) * (i % 3 == 0 ? 1 : -1); \ ++ b[i] = (i % 9) * (i % 7 + 1); \ ++ pred[i] = (i % 7 < 4); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##INT_TYPE (r, a, b, pred, N); \ ++ for (int i = 0; i < N; ++i) \ ++ if (r[i] != (pred[i] ? (FLOAT_TYPE) a[i] : b[i])) \ ++ __builtin_abort (); \ ++ } ++ ++int main () ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_2.c +new file mode 100644 +index 000000000..0e60b4381 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_2.c +@@ -0,0 +1,36 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize -fno-trapping-math" } */ ++ ++#include ++ ++#define DEF_LOOP(FLOAT_TYPE, INT_TYPE) \ ++ void __attribute__ ((noipa)) \ ++ test_##INT_TYPE (FLOAT_TYPE *__restrict r, \ ++ INT_TYPE *__restrict a, \ ++ INT_TYPE *__restrict pred, int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ r[i] = pred[i] ? 
(FLOAT_TYPE) a[i] : 1.0; \ ++ } ++ ++#define TEST_ALL(T) \ ++ T (_Float16, int16_t) \ ++ T (_Float16, uint16_t) \ ++ T (float, int32_t) \ ++ T (float, uint32_t) \ ++ T (double, int64_t) \ ++ T (double, uint64_t) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tucvtf\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tucvtf\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tucvtf\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+, z[0-9]+\n} 6 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz[^\n]*z} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_2_run.c +new file mode 100644 +index 000000000..9a4834921 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_2_run.c +@@ -0,0 +1,28 @@ ++/* { dg-do run { target { aarch64_sve_hw } } } */ ++/* { dg-options "-O2 -ftree-vectorize -ftrapping-math" } */ ++ ++#include "cond_convert_2.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(FLOAT_TYPE, INT_TYPE) \ ++ { \ ++ FLOAT_TYPE r[N]; \ ++ INT_TYPE a[N], pred[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a[i] = (i & 1 ? i : 3 * i) * (i % 3 == 0 ? 1 : -1); \ ++ pred[i] = (i % 7 < 4); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##INT_TYPE (r, a, pred, N); \ ++ for (int i = 0; i < N; ++i) \ ++ if (r[i] != (pred[i] ? (FLOAT_TYPE) a[i] : 1.0)) \ ++ __builtin_abort (); \ ++ } ++ ++int main () ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_3.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_3.c +new file mode 100644 +index 000000000..a294effd4 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_3.c +@@ -0,0 +1,40 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize -fno-trapping-math" } */ ++ ++#include ++ ++#define DEF_LOOP(FLOAT_TYPE, INT_TYPE) \ ++ void __attribute__ ((noipa)) \ ++ test_##INT_TYPE (FLOAT_TYPE *__restrict r, \ ++ INT_TYPE *__restrict a, \ ++ INT_TYPE *__restrict pred, int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ r[i] = pred[i] ? (FLOAT_TYPE) a[i] : 0.0; \ ++ } ++ ++#define TEST_ALL(T) \ ++ T (_Float16, int16_t) \ ++ T (_Float16, uint16_t) \ ++ T (float, int32_t) \ ++ T (float, uint32_t) \ ++ T (double, int64_t) \ ++ T (double, uint64_t) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tucvtf\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tucvtf\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tucvtf\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* Really we should be able to use MOVPRFX /z here, but at the moment ++ we're relying on combine to merge a SEL and an arithmetic operation, ++ and the SEL doesn't allow the "false" value to be zero when the "true" ++ value is a register. 
*/ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+, z[0-9]+\n} 6 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz[^\n]*z} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_3_run.c +new file mode 100644 +index 000000000..90021097c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_3_run.c +@@ -0,0 +1,28 @@ ++/* { dg-do run { target { aarch64_sve_hw } } } */ ++/* { dg-options "-O2 -ftree-vectorize -ftrapping-math" } */ ++ ++#include "cond_convert_3.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(FLOAT_TYPE, INT_TYPE) \ ++ { \ ++ FLOAT_TYPE r[N]; \ ++ INT_TYPE a[N], pred[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a[i] = (i & 1 ? i : 3 * i) * (i % 3 == 0 ? 1 : -1); \ ++ pred[i] = (i % 7 < 4); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##INT_TYPE (r, a, pred, N); \ ++ for (int i = 0; i < N; ++i) \ ++ if (r[i] != (pred[i] ? (FLOAT_TYPE) a[i] : 0.0)) \ ++ __builtin_abort (); \ ++ } ++ ++int main () ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_4.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_4.c +new file mode 100644 +index 000000000..e3a947b26 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_4.c +@@ -0,0 +1,39 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize -fno-trapping-math" } */ ++ ++#include ++ ++#define DEF_LOOP(FLOAT_TYPE, INT_TYPE) \ ++ void __attribute__ ((noipa)) \ ++ test_##INT_TYPE (INT_TYPE *__restrict r, \ ++ FLOAT_TYPE *__restrict a, \ ++ INT_TYPE *__restrict b, \ ++ INT_TYPE *__restrict pred, int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ { \ ++ INT_TYPE bi = b[i]; \ ++ r[i] = pred[i] ? (INT_TYPE) a[i] : bi; \ ++ } \ ++ } ++ ++#define TEST_ALL(T) \ ++ T (_Float16, int16_t) \ ++ T (_Float16, uint16_t) \ ++ T (float, int32_t) \ ++ T (float, uint32_t) \ ++ T (double, int64_t) \ ++ T (double, uint64_t) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tfcvtzs\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfcvtzu\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfcvtzs\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfcvtzu\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfcvtzs\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfcvtzu\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz} } } */ ++/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_4_run.c +new file mode 100644 +index 000000000..eaadcb7d4 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_4_run.c +@@ -0,0 +1,29 @@ ++/* { dg-do run { target { aarch64_sve_hw } } } */ ++/* { dg-options "-O2 -ftree-vectorize -ftrapping-math" } */ ++ ++#include "cond_convert_4.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(FLOAT_TYPE, INT_TYPE) \ ++ { \ ++ INT_TYPE r[N], b[N], pred[N]; \ ++ FLOAT_TYPE a[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a[i] = (i & 1 ? i : 3 * i) * (i % 3 == 0 ? 
1 : -1); \ ++ b[i] = (i % 9) * (i % 7 + 1); \ ++ pred[i] = (i % 7 < 4); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##INT_TYPE (r, a, b, pred, N); \ ++ for (int i = 0; i < N; ++i) \ ++ if (r[i] != (pred[i] ? (INT_TYPE) a[i] : b[i])) \ ++ __builtin_abort (); \ ++ } ++ ++int main () ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_5.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_5.c +new file mode 100644 +index 000000000..5f3da83e6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_5.c +@@ -0,0 +1,36 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize -fno-trapping-math" } */ ++ ++#include ++ ++#define DEF_LOOP(FLOAT_TYPE, INT_TYPE) \ ++ void __attribute__ ((noipa)) \ ++ test_##INT_TYPE (INT_TYPE *__restrict r, \ ++ FLOAT_TYPE *__restrict a, \ ++ INT_TYPE *__restrict pred, int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ r[i] = pred[i] ? (INT_TYPE) a[i] : 72; \ ++ } ++ ++#define TEST_ALL(T) \ ++ T (_Float16, int16_t) \ ++ T (_Float16, uint16_t) \ ++ T (float, int32_t) \ ++ T (float, uint32_t) \ ++ T (double, int64_t) \ ++ T (double, uint64_t) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tfcvtzs\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfcvtzu\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfcvtzs\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfcvtzu\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfcvtzs\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfcvtzu\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+, z[0-9]+\n} 6 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz[^\n]*z} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_5_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_5_run.c +new file mode 100644 +index 000000000..a1f2d4977 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_5_run.c +@@ -0,0 +1,28 @@ ++/* { dg-do run { target { aarch64_sve_hw } } } */ ++/* { dg-options "-O2 -ftree-vectorize -ftrapping-math" } */ ++ ++#include "cond_convert_5.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(FLOAT_TYPE, INT_TYPE) \ ++ { \ ++ INT_TYPE r[N], pred[N]; \ ++ FLOAT_TYPE a[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a[i] = (i & 1 ? i : 3 * i) * (i % 3 == 0 ? 1 : -1); \ ++ pred[i] = (i % 7 < 4); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##INT_TYPE (r, a, pred, N); \ ++ for (int i = 0; i < N; ++i) \ ++ if (r[i] != (pred[i] ? (INT_TYPE) a[i] : 72)) \ ++ __builtin_abort (); \ ++ } ++ ++int main () ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_6.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_6.c +new file mode 100644 +index 000000000..6541a2ea4 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_6.c +@@ -0,0 +1,40 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize -fno-trapping-math" } */ ++ ++#include ++ ++#define DEF_LOOP(FLOAT_TYPE, INT_TYPE) \ ++ void __attribute__ ((noipa)) \ ++ test_##INT_TYPE (INT_TYPE *__restrict r, \ ++ FLOAT_TYPE *__restrict a, \ ++ INT_TYPE *__restrict pred, int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ r[i] = pred[i] ? 
(INT_TYPE) a[i] : 0; \ ++ } ++ ++#define TEST_ALL(T) \ ++ T (_Float16, int16_t) \ ++ T (_Float16, uint16_t) \ ++ T (float, int32_t) \ ++ T (float, uint32_t) \ ++ T (double, int64_t) \ ++ T (double, uint64_t) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tfcvtzs\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfcvtzu\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfcvtzs\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfcvtzu\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfcvtzs\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfcvtzu\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* Really we should be able to use MOVPRFX /z here, but at the moment ++ we're relying on combine to merge a SEL and an arithmetic operation, ++ and the SEL doesn't allow the "false" value to be zero when the "true" ++ value is a register. */ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+, z[0-9]+\n} 6 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz[^\n]*z} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_6_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_6_run.c +new file mode 100644 +index 000000000..49a64b4fc +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_6_run.c +@@ -0,0 +1,28 @@ ++/* { dg-do run { target { aarch64_sve_hw } } } */ ++/* { dg-options "-O2 -ftree-vectorize -ftrapping-math" } */ ++ ++#include "cond_convert_6.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(FLOAT_TYPE, INT_TYPE) \ ++ { \ ++ INT_TYPE r[N], pred[N]; \ ++ FLOAT_TYPE a[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a[i] = (i & 1 ? i : 3 * i) * (i % 3 == 0 ? 1 : -1); \ ++ pred[i] = (i % 7 < 4); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##INT_TYPE (r, a, pred, N); \ ++ for (int i = 0; i < N; ++i) \ ++ if (r[i] != (pred[i] ? (INT_TYPE) a[i] : 0)) \ ++ __builtin_abort (); \ ++ } ++ ++int main () ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_1.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_1.c +new file mode 100644 +index 000000000..c1f54e391 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_1.c +@@ -0,0 +1,29 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize -fno-trapping-math" } */ ++ ++#include ++ ++#define DEF_LOOP(TYPE, ABS) \ ++ void __attribute__ ((noinline, noclone)) \ ++ test_##TYPE (TYPE *__restrict r, TYPE *__restrict a, \ ++ TYPE *__restrict b, TYPE *__restrict c, \ ++ int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ r[i] = a[i] < 20 ? 
ABS (b[i] - c[i]) : b[i]; \ ++ } ++ ++#define TEST_ALL(T) \ ++ T (_Float16, __builtin_fabsf16) \ ++ T (float, __builtin_fabsf) \ ++ T (double, __builtin_fabs) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tfabd\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfabd\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfabd\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ ++/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_1_run.c +new file mode 100644 +index 000000000..a4d6972b9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_1_run.c +@@ -0,0 +1,33 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize -fno-trapping-math" } */ ++ ++#include "cond_fabd_1.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(TYPE, ABS) \ ++ { \ ++ TYPE r[N], a[N], b[N], c[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a[i] = (i & 1 ? i : 3 * i); \ ++ b[i] = (i >> 4) << (i & 15); \ ++ c[i] = ((i + 2) % 3) * (i + 1); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##TYPE (r, a, b, c, N); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ TYPE expected = a[i] < 20 ? ABS (b[i] - c[i]) : b[i]; \ ++ if (r[i] != expected) \ ++ __builtin_abort (); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ } ++ ++int ++main (void) ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_2.c +new file mode 100644 +index 000000000..dd6eecc17 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_2.c +@@ -0,0 +1,29 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize -fno-trapping-math" } */ ++ ++#include ++ ++#define DEF_LOOP(TYPE, ABS) \ ++ void __attribute__ ((noinline, noclone)) \ ++ test_##TYPE (TYPE *__restrict r, TYPE *__restrict a, \ ++ TYPE *__restrict b, TYPE *__restrict c, \ ++ int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ r[i] = a[i] < 20 ? ABS (b[i] - c[i]) : c[i]; \ ++ } ++ ++#define TEST_ALL(T) \ ++ T (_Float16, __builtin_fabsf16) \ ++ T (float, __builtin_fabsf) \ ++ T (double, __builtin_fabs) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tfabd\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfabd\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfabd\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ ++/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_2_run.c +new file mode 100644 +index 000000000..28dc7d011 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_2_run.c +@@ -0,0 +1,33 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize -fno-trapping-math" } */ ++ ++#include "cond_fabd_2.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(TYPE, ABS) \ ++ { \ ++ TYPE r[N], a[N], b[N], c[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a[i] = (i & 1 ? 
i : 3 * i); \ ++ b[i] = (i >> 4) << (i & 15); \ ++ c[i] = ((i + 2) % 3) * (i + 1); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##TYPE (r, a, b, c, N); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ TYPE expected = a[i] < 20 ? ABS (b[i] - c[i]) : c[i]; \ ++ if (r[i] != expected) \ ++ __builtin_abort (); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ } ++ ++int ++main (void) ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_3.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_3.c +new file mode 100644 +index 000000000..26fd7b265 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_3.c +@@ -0,0 +1,32 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize -fno-trapping-math" } */ ++ ++#include ++ ++#define DEF_LOOP(TYPE, ABS) \ ++ void __attribute__ ((noinline, noclone)) \ ++ test_##TYPE (TYPE *__restrict r, TYPE *__restrict a, \ ++ TYPE *__restrict b, TYPE *__restrict c, \ ++ int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ r[i] = a[i] < 20 ? ABS (b[i] - c[i]) : a[i]; \ ++ } ++ ++#define TEST_ALL(T) \ ++ T (_Float16, __builtin_fabsf16) \ ++ T (float, __builtin_fabsf) \ ++ T (double, __builtin_fabs) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tfabd\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfabd\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfabd\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_3_run.c +new file mode 100644 +index 000000000..be21b7f99 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_3_run.c +@@ -0,0 +1,33 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize -fno-trapping-math" } */ ++ ++#include "cond_fabd_3.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(TYPE, ABS) \ ++ { \ ++ TYPE r[N], a[N], b[N], c[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a[i] = (i & 1 ? i : 3 * i); \ ++ b[i] = (i >> 4) << (i & 15); \ ++ c[i] = ((i + 2) % 3) * (i + 1); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##TYPE (r, a, b, c, N); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ TYPE expected = a[i] < 20 ? ABS (b[i] - c[i]) : a[i]; \ ++ if (r[i] != expected) \ ++ __builtin_abort (); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ } ++ ++int ++main (void) ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_4.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_4.c +new file mode 100644 +index 000000000..78f1fd914 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_4.c +@@ -0,0 +1,29 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize -fno-trapping-math" } */ ++ ++#include ++ ++#define DEF_LOOP(TYPE, ABS) \ ++ void __attribute__ ((noinline, noclone)) \ ++ test_##TYPE (TYPE *__restrict r, TYPE *__restrict a, \ ++ TYPE *__restrict b, TYPE *__restrict c, \ ++ int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ r[i] = a[i] < 20 ? 
ABS (b[i] - c[i]) : 8.0; \ ++ } ++ ++#define TEST_ALL(T) \ ++ T (_Float16, __builtin_fabsf16) \ ++ T (float, __builtin_fabsf) \ ++ T (double, __builtin_fabs) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tfabd\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfabd\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfabd\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ ++/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ ++/* { dg-final { scan-assembler-times {\tsel\t} 3 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_4_run.c +new file mode 100644 +index 000000000..86bdab415 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_4_run.c +@@ -0,0 +1,33 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize -fno-trapping-math" } */ ++ ++#include "cond_fabd_4.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(TYPE, ABS) \ ++ { \ ++ TYPE r[N], a[N], b[N], c[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a[i] = (i & 1 ? i : 3 * i); \ ++ b[i] = (i >> 4) << (i & 15); \ ++ c[i] = ((i + 2) % 3) * (i + 1); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##TYPE (r, a, b, c, N); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ TYPE expected = a[i] < 20 ? ABS (b[i] - c[i]) : 8; \ ++ if (r[i] != expected) \ ++ __builtin_abort (); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ } ++ ++int ++main (void) ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_5.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_5.c +new file mode 100644 +index 000000000..e66477b3b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_5.c +@@ -0,0 +1,35 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize -fno-trapping-math" } */ ++ ++#include ++ ++#define DEF_LOOP(TYPE, ABS) \ ++ void __attribute__ ((noinline, noclone)) \ ++ test_##TYPE (TYPE *__restrict r, TYPE *__restrict a, \ ++ TYPE *__restrict b, TYPE *__restrict c, \ ++ int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ r[i] = a[i] < 20 ? ABS (b[i] - c[i]) : 0.0; \ ++ } ++ ++#define TEST_ALL(T) \ ++ T (_Float16, __builtin_fabsf16) \ ++ T (float, __builtin_fabsf) \ ++ T (double, __builtin_fabs) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tfabd\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfabd\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfabd\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* Really we should be able to use MOVPRFX /Z here, but at the moment ++ we're relying on combine to merge a SEL and an arithmetic operation, ++ and the SEL doesn't allow zero operands. 
*/ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/z, z[0-9]+\.h\n} 1 { xfail *-*-* } } } */ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/z, z[0-9]+\.s\n} 1 { xfail *-*-* } } } */ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/z, z[0-9]+\.d\n} 1 { xfail *-*-* } } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} { xfail *-*-* } } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_5_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_5_run.c +new file mode 100644 +index 000000000..9fb5fbb81 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_5_run.c +@@ -0,0 +1,33 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize -fno-trapping-math" } */ ++ ++#include "cond_fabd_5.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(TYPE, ABS) \ ++ { \ ++ TYPE r[N], a[N], b[N], c[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a[i] = (i & 1 ? i : 3 * i); \ ++ b[i] = (i >> 4) << (i & 15); \ ++ c[i] = ((i + 2) % 3) * (i + 1); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##TYPE (r, a, b, c, N); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ TYPE expected = a[i] < 20 ? ABS (b[i] - c[i]) : 0; \ ++ if (r[i] != expected) \ ++ __builtin_abort (); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ } ++ ++int ++main (void) ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fadd_1.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fadd_1.c +new file mode 100644 +index 000000000..d103e1f38 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fadd_1.c +@@ -0,0 +1,62 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define DEF_LOOP(TYPE, PRED_TYPE, NAME, CONST) \ ++ void __attribute__ ((noipa)) \ ++ test_##TYPE##_##NAME (TYPE *__restrict x, \ ++ TYPE *__restrict y, \ ++ PRED_TYPE *__restrict pred, \ ++ int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ x[i] = pred[i] != 1 ? 
y[i] + (TYPE) CONST : y[i]; \ ++ } ++ ++#define TEST_TYPE(T, TYPE, PRED_TYPE) \ ++ T (TYPE, PRED_TYPE, half, 0.5) \ ++ T (TYPE, PRED_TYPE, one, 1.0) \ ++ T (TYPE, PRED_TYPE, two, 2.0) \ ++ T (TYPE, PRED_TYPE, minus_half, -0.5) \ ++ T (TYPE, PRED_TYPE, minus_one, -1.0) \ ++ T (TYPE, PRED_TYPE, minus_two, -2.0) ++ ++#define TEST_ALL(T) \ ++ TEST_TYPE (T, _Float16, int16_t) \ ++ TEST_TYPE (T, float, int32_t) \ ++ TEST_TYPE (T, double, int64_t) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0\.5\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.5\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.5\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1\.0\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0\.5\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.5\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.5\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1\.0\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.h, #2\.0} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #2\.0} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.h, #-2\.0} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #-2\.0} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #-2\.0} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz} } } */ ++/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fadd_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fadd_1_run.c +new file mode 100644 +index 000000000..956ae1435 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fadd_1_run.c +@@ -0,0 +1,32 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_fadd_1.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(TYPE, PRED_TYPE, NAME, CONST) \ ++ { \ ++ TYPE x[N], y[N]; \ ++ PRED_TYPE pred[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ y[i] = i * i; \ ++ pred[i] = i % 3; \ ++ } \ ++ test_##TYPE##_##NAME (x, y, pred, N); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ TYPE expected = i % 3 != 1 ? 
y[i] + (TYPE) CONST : y[i]; \ ++ if (x[i] != expected) \ ++ __builtin_abort (); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ } ++ ++int ++main (void) ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fadd_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fadd_2.c +new file mode 100644 +index 000000000..b7d02f4ad +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fadd_2.c +@@ -0,0 +1,56 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define DEF_LOOP(TYPE, NAME, CONST) \ ++ void __attribute__ ((noipa)) \ ++ test_##TYPE##_##NAME (TYPE *__restrict x, \ ++ TYPE *__restrict y, \ ++ TYPE *__restrict z, \ ++ int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ x[i] = y[i] < 8 ? z[i] + (TYPE) CONST : y[i]; \ ++ } ++ ++#define TEST_TYPE(T, TYPE) \ ++ T (TYPE, half, 0.5) \ ++ T (TYPE, one, 1.0) \ ++ T (TYPE, two, 2.0) \ ++ T (TYPE, minus_half, -0.5) \ ++ T (TYPE, minus_one, -1.0) \ ++ T (TYPE, minus_two, -2.0) ++ ++#define TEST_ALL(T) \ ++ TEST_TYPE (T, float) \ ++ TEST_TYPE (T, double) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.5\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.5\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1\.0\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.5\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.5\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1\.0\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #2\.0} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #-2\.0} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #-2\.0} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ ++ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 6 } } */ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d\n} 6 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fadd_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fadd_2_run.c +new file mode 100644 +index 000000000..debf395cc +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fadd_2_run.c +@@ -0,0 +1,31 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_fadd_2.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(TYPE, NAME, CONST) \ ++ { \ ++ TYPE x[N], y[N], z[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ y[i] = i % 13; \ ++ z[i] = i * i; \ ++ } \ ++ test_##TYPE##_##NAME (x, y, z, N); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ TYPE expected = y[i] < 8 ? 
z[i] + (TYPE) CONST : y[i]; \ ++ if (x[i] != expected) \ ++ __builtin_abort (); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ } ++ ++int ++main (void) ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fadd_3.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fadd_3.c +new file mode 100644 +index 000000000..aec0e5aca +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fadd_3.c +@@ -0,0 +1,65 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define DEF_LOOP(TYPE, PRED_TYPE, NAME, CONST) \ ++ void __attribute__ ((noipa)) \ ++ test_##TYPE##_##NAME (TYPE *__restrict x, \ ++ TYPE *__restrict y, \ ++ PRED_TYPE *__restrict pred, \ ++ int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ x[i] = pred[i] != 1 ? y[i] + (TYPE) CONST : 4; \ ++ } ++ ++#define TEST_TYPE(T, TYPE, PRED_TYPE) \ ++ T (TYPE, PRED_TYPE, half, 0.5) \ ++ T (TYPE, PRED_TYPE, one, 1.0) \ ++ T (TYPE, PRED_TYPE, two, 2.0) \ ++ T (TYPE, PRED_TYPE, minus_half, -0.5) \ ++ T (TYPE, PRED_TYPE, minus_one, -1.0) \ ++ T (TYPE, PRED_TYPE, minus_two, -2.0) ++ ++#define TEST_ALL(T) \ ++ TEST_TYPE (T, _Float16, int16_t) \ ++ TEST_TYPE (T, float, int32_t) \ ++ TEST_TYPE (T, double, int64_t) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0\.5\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.5\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.5\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1\.0\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0\.5\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.5\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.5\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1\.0\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.h, #2\.0} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #2\.0} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.h, #-2\.0} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #-2\.0} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #-2\.0} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ ++ ++/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.h, p[0-7], z[0-9]+\.h, z[0-9]+\.h\n} 6 } } */ ++/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.h, p[0-7], z[0-9]+\.h, z[0-9]+\.h\n} 6 } } */ ++/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.h, p[0-7], 
z[0-9]+\.h, z[0-9]+\.h\n} 6 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ ++/* { dg-final { scan-assembler-not {\tmov\tz} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fadd_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fadd_3_run.c +new file mode 100644 +index 000000000..d5268c5ca +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fadd_3_run.c +@@ -0,0 +1,32 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_fadd_3.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(TYPE, PRED_TYPE, NAME, CONST) \ ++ { \ ++ TYPE x[N], y[N]; \ ++ PRED_TYPE pred[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ y[i] = i * i; \ ++ pred[i] = i % 3; \ ++ } \ ++ test_##TYPE##_##NAME (x, y, pred, N); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ TYPE expected = i % 3 != 1 ? y[i] + (TYPE) CONST : 4; \ ++ if (x[i] != expected) \ ++ __builtin_abort (); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ } ++ ++int ++main (void) ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fadd_4.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fadd_4.c +new file mode 100644 +index 000000000..bb276c140 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fadd_4.c +@@ -0,0 +1,64 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define DEF_LOOP(TYPE, PRED_TYPE, NAME, CONST) \ ++ void __attribute__ ((noipa)) \ ++ test_##TYPE##_##NAME (TYPE *__restrict x, \ ++ TYPE *__restrict y, \ ++ PRED_TYPE *__restrict pred, \ ++ int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ x[i] = pred[i] != 1 ? y[i] + (TYPE) CONST : 0; \ ++ } ++ ++#define TEST_TYPE(T, TYPE, PRED_TYPE) \ ++ T (TYPE, PRED_TYPE, half, 0.5) \ ++ T (TYPE, PRED_TYPE, one, 1.0) \ ++ T (TYPE, PRED_TYPE, two, 2.0) \ ++ T (TYPE, PRED_TYPE, minus_half, -0.5) \ ++ T (TYPE, PRED_TYPE, minus_one, -1.0) \ ++ T (TYPE, PRED_TYPE, minus_two, -2.0) ++ ++#define TEST_ALL(T) \ ++ TEST_TYPE (T, _Float16, int16_t) \ ++ TEST_TYPE (T, float, int32_t) \ ++ TEST_TYPE (T, double, int64_t) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0\.5\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.5\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.5\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1\.0\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0\.5\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.5\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.5\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1\.0\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.h, #2\.0} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0} 1 } } */ ++/* { dg-final { 
scan-assembler-times {\tfmov\tz[0-9]+\.d, #2\.0} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.h, #-2\.0} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #-2\.0} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #-2\.0} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ ++ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/z, z[0-9]+\.s\n} 6 } } */ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/z, z[0-9]+\.d\n} 6 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fadd_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fadd_4_run.c +new file mode 100644 +index 000000000..4ea8be661 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fadd_4_run.c +@@ -0,0 +1,32 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_fadd_4.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(TYPE, PRED_TYPE, NAME, CONST) \ ++ { \ ++ TYPE x[N], y[N]; \ ++ PRED_TYPE pred[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ y[i] = i * i; \ ++ pred[i] = i % 3; \ ++ } \ ++ test_##TYPE##_##NAME (x, y, pred, N); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ TYPE expected = i % 3 != 1 ? y[i] + (TYPE) CONST : 0; \ ++ if (x[i] != expected) \ ++ __builtin_abort (); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ } ++ ++int ++main (void) ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fmaxnm_1.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmaxnm_1.c +new file mode 100644 +index 000000000..d0db0900e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmaxnm_1.c +@@ -0,0 +1,55 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ ++ ++#include ++ ++#ifndef FN ++#define FN(X) __builtin_fmax##X ++#endif ++ ++#define DEF_LOOP(FN, TYPE, PRED_TYPE, NAME, CONST) \ ++ void __attribute__ ((noipa)) \ ++ test_##TYPE##_##NAME (TYPE *__restrict x, \ ++ TYPE *__restrict y, \ ++ PRED_TYPE *__restrict pred, \ ++ int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ x[i] = pred[i] != 1 ? 
FN (y[i], CONST) : y[i]; \ ++ } ++ ++#define TEST_TYPE(T, FN, TYPE, PRED_TYPE) \ ++ T (FN, TYPE, PRED_TYPE, zero, 0) \ ++ T (FN, TYPE, PRED_TYPE, one, 1) \ ++ T (FN, TYPE, PRED_TYPE, two, 2) ++ ++#define TEST_ALL(T) \ ++ TEST_TYPE (T, FN (f16), _Float16, int16_t) \ ++ TEST_TYPE (T, FN (f32), float, int32_t) \ ++ TEST_TYPE (T, FN (f64), double, int64_t) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.0\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1\.0\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1\.0\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.h, #2\.0} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #2\.0} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz} } } */ ++/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fmaxnm_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmaxnm_1_run.c +new file mode 100644 +index 000000000..00a3c41f2 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmaxnm_1_run.c +@@ -0,0 +1,32 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ ++ ++#include "cond_fmaxnm_1.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(FN, TYPE, PRED_TYPE, NAME, CONST) \ ++ { \ ++ TYPE x[N], y[N]; \ ++ PRED_TYPE pred[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ y[i] = i * i; \ ++ pred[i] = i % 3; \ ++ } \ ++ test_##TYPE##_##NAME (x, y, pred, N); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ TYPE expected = i % 3 != 1 ? 
FN (y[i], CONST) : y[i]; \ ++ if (x[i] != expected) \ ++ __builtin_abort (); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ } ++ ++int ++main (void) ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fmaxnm_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmaxnm_2.c +new file mode 100644 +index 000000000..0b535d15f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmaxnm_2.c +@@ -0,0 +1,48 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ ++ ++#include ++ ++#ifndef FN ++#define FN(X) __builtin_fmax##X ++#endif ++ ++#define DEF_LOOP(FN, TYPE, NAME, CONST) \ ++ void __attribute__ ((noipa)) \ ++ test_##TYPE##_##NAME (TYPE *__restrict x, \ ++ TYPE *__restrict y, \ ++ TYPE *__restrict z, \ ++ int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ x[i] = y[i] < 8 ? FN (z[i], CONST) : y[i]; \ ++ } ++ ++#define TEST_TYPE(T, FN, TYPE) \ ++ T (FN, TYPE, zero, 0) \ ++ T (FN, TYPE, one, 1) \ ++ T (FN, TYPE, two, 2) ++ ++#define TEST_ALL(T) \ ++ TEST_TYPE (T, FN (f32), float) \ ++ TEST_TYPE (T, FN (f64), double) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.0\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1\.0\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #2\.0} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 3 } } */ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d\n} 3 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fmaxnm_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmaxnm_2_run.c +new file mode 100644 +index 000000000..9eb4d80fc +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmaxnm_2_run.c +@@ -0,0 +1,31 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ ++ ++#include "cond_fmaxnm_2.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(FN, TYPE, NAME, CONST) \ ++ { \ ++ TYPE x[N], y[N], z[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ y[i] = i % 13; \ ++ z[i] = i * i; \ ++ } \ ++ test_##TYPE##_##NAME (x, y, z, N); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ TYPE expected = y[i] < 8 ? 
FN (z[i], CONST) : y[i]; \ ++ if (x[i] != expected) \ ++ __builtin_abort (); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ } ++ ++int ++main (void) ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fmaxnm_3.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmaxnm_3.c +new file mode 100644 +index 000000000..741f8f6d0 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmaxnm_3.c +@@ -0,0 +1,54 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ ++ ++#include ++ ++#ifndef FN ++#define FN(X) __builtin_fmax##X ++#endif ++ ++#define DEF_LOOP(FN, TYPE, PRED_TYPE, NAME, CONST) \ ++ void __attribute__ ((noipa)) \ ++ test_##TYPE##_##NAME (TYPE *__restrict x, \ ++ TYPE *__restrict y, \ ++ PRED_TYPE *__restrict pred, \ ++ int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ x[i] = pred[i] != 1 ? FN (y[i], CONST) : 4; \ ++ } ++ ++#define TEST_TYPE(T, FN, TYPE, PRED_TYPE) \ ++ T (FN, TYPE, PRED_TYPE, zero, 0) \ ++ T (FN, TYPE, PRED_TYPE, one, 1) \ ++ T (FN, TYPE, PRED_TYPE, two, 2) ++ ++#define TEST_ALL(T) \ ++ TEST_TYPE (T, FN (f16), _Float16, int16_t) \ ++ TEST_TYPE (T, FN (f32), float, int32_t) \ ++ TEST_TYPE (T, FN (f64), double, int64_t) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.0\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1\.0\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.h, #2\.0} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #2\.0} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.h, p[0-7], z[0-9]+\.h, z[0-9]+\.h\n} 3 } } */ ++/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.h, p[0-7], z[0-9]+\.h, z[0-9]+\.h\n} 3 } } */ ++/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.h, p[0-7], z[0-9]+\.h, z[0-9]+\.h\n} 3 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ ++/* { dg-final { scan-assembler-not {\tmov\tz} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fmaxnm_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmaxnm_3_run.c +new file mode 100644 +index 000000000..4aac75f0e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmaxnm_3_run.c +@@ -0,0 +1,32 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ ++ ++#include "cond_fmaxnm_3.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(FN, TYPE, PRED_TYPE, NAME, CONST) \ ++ { \ ++ TYPE x[N], y[N]; \ ++ PRED_TYPE pred[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ y[i] = i * i; \ ++ pred[i] = i % 3; \ ++ } \ ++ test_##TYPE##_##NAME (x, y, pred, N); \ ++ for (int 
i = 0; i < N; ++i) \ ++ { \ ++ TYPE expected = i % 3 != 1 ? FN (y[i], CONST) : 4; \ ++ if (x[i] != expected) \ ++ __builtin_abort (); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ } ++ ++int ++main (void) ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fmaxnm_4.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmaxnm_4.c +new file mode 100644 +index 000000000..83a53c7d4 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmaxnm_4.c +@@ -0,0 +1,53 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ ++ ++#include ++ ++#ifndef FN ++#define FN(X) __builtin_fmax##X ++#endif ++ ++#define DEF_LOOP(FN, TYPE, PRED_TYPE, NAME, CONST) \ ++ void __attribute__ ((noipa)) \ ++ test_##TYPE##_##NAME (TYPE *__restrict x, \ ++ TYPE *__restrict y, \ ++ PRED_TYPE *__restrict pred, \ ++ int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ x[i] = pred[i] != 1 ? FN (y[i], CONST) : 0; \ ++ } ++ ++#define TEST_TYPE(T, FN, TYPE, PRED_TYPE) \ ++ T (FN, TYPE, PRED_TYPE, zero, 0) \ ++ T (FN, TYPE, PRED_TYPE, one, 1) \ ++ T (FN, TYPE, PRED_TYPE, two, 2) ++ ++#define TEST_ALL(T) \ ++ TEST_TYPE (T, FN (f16), _Float16, int16_t) \ ++ TEST_TYPE (T, FN (f32), float, int32_t) \ ++ TEST_TYPE (T, FN (f64), double, int64_t) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.0\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1\.0\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.h, #2\.0} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #2\.0} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/z, z[0-9]+\.s\n} 3 } } */ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/z, z[0-9]+\.d\n} 3 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fmaxnm_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmaxnm_4_run.c +new file mode 100644 +index 000000000..e1d904338 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmaxnm_4_run.c +@@ -0,0 +1,32 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ ++ ++#include "cond_fmaxnm_4.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(FN, TYPE, PRED_TYPE, NAME, CONST) \ ++ { \ ++ TYPE x[N], y[N]; \ ++ PRED_TYPE pred[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ y[i] = i * i; \ ++ pred[i] = i % 3; \ ++ } \ ++ test_##TYPE##_##NAME (x, y, pred, N); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ TYPE expected = i % 3 != 1 ? 
FN (y[i], CONST) : 0; \ ++ if (x[i] != expected) \ ++ __builtin_abort (); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ } ++ ++int ++main (void) ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fminnm_1.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fminnm_1.c +new file mode 100644 +index 000000000..d667b2088 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fminnm_1.c +@@ -0,0 +1,29 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ ++ ++#define FN(X) __builtin_fmin##X ++#include "cond_fmaxnm_1.c" ++ ++/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.0\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1\.0\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1\.0\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.h, #2\.0} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #2\.0} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz} } } */ ++/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fminnm_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fminnm_1_run.c +new file mode 100644 +index 000000000..5df2ff84b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fminnm_1_run.c +@@ -0,0 +1,5 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ ++ ++#define FN(X) __builtin_fmin##X ++#include "cond_fmaxnm_1_run.c" +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fminnm_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fminnm_2.c +new file mode 100644 +index 000000000..d66a84b01 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fminnm_2.c +@@ -0,0 +1,23 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ ++ ++#define FN(X) __builtin_fmin##X ++#include "cond_fmaxnm_2.c" ++ ++/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.0\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1\.0\n} 1 } } */ ++ ++/* { 
dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #2\.0} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 3 } } */ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d\n} 3 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fminnm_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fminnm_2_run.c +new file mode 100644 +index 000000000..79a98bb77 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fminnm_2_run.c +@@ -0,0 +1,5 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ ++ ++#define FN(X) __builtin_fmin##X ++#include "cond_fmaxnm_2_run.c" +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fminnm_3.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fminnm_3.c +new file mode 100644 +index 000000000..d39dd1825 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fminnm_3.c +@@ -0,0 +1,28 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ ++ ++#define FN(X) __builtin_fmin##X ++#include "cond_fmaxnm_3.c" ++ ++/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.0\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1\.0\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.h, #2\.0} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #2\.0} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.h, p[0-7], z[0-9]+\.h, z[0-9]+\.h\n} 3 } } */ ++/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.h, p[0-7], z[0-9]+\.h, z[0-9]+\.h\n} 3 } } */ ++/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.h, p[0-7], z[0-9]+\.h, z[0-9]+\.h\n} 3 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ ++/* { dg-final { scan-assembler-not {\tmov\tz} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fminnm_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fminnm_3_run.c +new file mode 100644 +index 000000000..ca1a047da +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fminnm_3_run.c +@@ -0,0 +1,5 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ ++ ++#define FN(X) __builtin_fmin##X 
++#include "cond_fmaxnm_3_run.c" +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fminnm_4.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fminnm_4.c +new file mode 100644 +index 000000000..fff6fdd37 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fminnm_4.c +@@ -0,0 +1,27 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ ++ ++#define FN(X) __builtin_fmin##X ++#include "cond_fmaxnm_4.c" ++ ++/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.0\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1\.0\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.h, #2\.0} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #2\.0} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/z, z[0-9]+\.s\n} 3 } } */ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/z, z[0-9]+\.d\n} 3 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fminnm_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fminnm_4_run.c +new file mode 100644 +index 000000000..b945d0470 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fminnm_4_run.c +@@ -0,0 +1,5 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ ++ ++#define FN(X) __builtin_fmin##X ++#include "cond_fmaxnm_4_run.c" +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fmul_1.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmul_1.c +new file mode 100644 +index 000000000..ce417ed85 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmul_1.c +@@ -0,0 +1,47 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define DEF_LOOP(TYPE, PRED_TYPE, NAME, CONST) \ ++ void __attribute__ ((noipa)) \ ++ test_##TYPE##_##NAME (TYPE *__restrict x, \ ++ TYPE *__restrict y, \ ++ PRED_TYPE *__restrict pred, \ ++ int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ x[i] = pred[i] != 1 ? 
y[i] * (TYPE) CONST : y[i]; \ ++ } ++ ++#define TEST_TYPE(T, TYPE, PRED_TYPE) \ ++ T (TYPE, PRED_TYPE, half, 0.5) \ ++ T (TYPE, PRED_TYPE, two, 2.0) \ ++ T (TYPE, PRED_TYPE, four, 4.0) ++ ++#define TEST_ALL(T) \ ++ TEST_TYPE (T, _Float16, int16_t) \ ++ TEST_TYPE (T, float, int32_t) \ ++ TEST_TYPE (T, double, int64_t) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0\.5\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.5\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.5\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #2\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #2\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #2\.0\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.h, #4\.0} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #4\.0} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #4\.0} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz} } } */ ++/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fmul_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmul_1_run.c +new file mode 100644 +index 000000000..9ca5b5080 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmul_1_run.c +@@ -0,0 +1,32 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_fmul_1.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(TYPE, PRED_TYPE, NAME, CONST) \ ++ { \ ++ TYPE x[N], y[N]; \ ++ PRED_TYPE pred[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ y[i] = i * i; \ ++ pred[i] = i % 3; \ ++ } \ ++ test_##TYPE##_##NAME (x, y, pred, N); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ TYPE expected = i % 3 != 1 ? y[i] * (TYPE) CONST : y[i]; \ ++ if (x[i] != expected) \ ++ __builtin_abort (); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ } ++ ++int ++main (void) ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fmul_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmul_2.c +new file mode 100644 +index 000000000..cbf9d13a5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmul_2.c +@@ -0,0 +1,44 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define DEF_LOOP(TYPE, NAME, CONST) \ ++ void __attribute__ ((noipa)) \ ++ test_##TYPE##_##NAME (TYPE *__restrict x, \ ++ TYPE *__restrict y, \ ++ TYPE *__restrict z, \ ++ int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ x[i] = y[i] < 8 ? 
z[i] * (TYPE) CONST : y[i]; \ ++ } ++ ++#define TEST_TYPE(T, TYPE) \ ++ T (TYPE, half, 0.5) \ ++ T (TYPE, two, 2.0) \ ++ T (TYPE, four, 4.0) ++ ++#define TEST_ALL(T) \ ++ TEST_TYPE (T, float) \ ++ TEST_TYPE (T, double) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.5\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.5\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #2\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #2\.0\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #4\.0} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #4\.0} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 3 } } */ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d\n} 3 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fmul_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmul_2_run.c +new file mode 100644 +index 000000000..44b283ba3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmul_2_run.c +@@ -0,0 +1,31 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_fmul_2.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(TYPE, NAME, CONST) \ ++ { \ ++ TYPE x[N], y[N], z[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ y[i] = i % 13; \ ++ z[i] = i * i; \ ++ } \ ++ test_##TYPE##_##NAME (x, y, z, N); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ TYPE expected = y[i] < 8 ? z[i] * (TYPE) CONST : y[i]; \ ++ if (x[i] != expected) \ ++ __builtin_abort (); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ } ++ ++int ++main (void) ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fmul_3.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmul_3.c +new file mode 100644 +index 000000000..4da147e15 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmul_3.c +@@ -0,0 +1,50 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define DEF_LOOP(TYPE, PRED_TYPE, NAME, CONST) \ ++ void __attribute__ ((noipa)) \ ++ test_##TYPE##_##NAME (TYPE *__restrict x, \ ++ TYPE *__restrict y, \ ++ PRED_TYPE *__restrict pred, \ ++ int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ x[i] = pred[i] != 1 ? 
y[i] * (TYPE) CONST : 8; \ ++ } ++ ++#define TEST_TYPE(T, TYPE, PRED_TYPE) \ ++ T (TYPE, PRED_TYPE, half, 0.5) \ ++ T (TYPE, PRED_TYPE, two, 2.0) \ ++ T (TYPE, PRED_TYPE, four, 4.0) ++ ++#define TEST_ALL(T) \ ++ TEST_TYPE (T, _Float16, int16_t) \ ++ TEST_TYPE (T, float, int32_t) \ ++ TEST_TYPE (T, double, int64_t) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0\.5\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.5\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.5\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #2\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #2\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #2\.0\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.h, #4\.0} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #4\.0} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #4\.0} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.h, p[0-7], z[0-9]+\.h, z[0-9]+\.h\n} 3 } } */ ++/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.h, p[0-7], z[0-9]+\.h, z[0-9]+\.h\n} 3 } } */ ++/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.h, p[0-7], z[0-9]+\.h, z[0-9]+\.h\n} 3 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ ++/* { dg-final { scan-assembler-not {\tmov\tz} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fmul_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmul_3_run.c +new file mode 100644 +index 000000000..9b81d43c9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmul_3_run.c +@@ -0,0 +1,32 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_fmul_3.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(TYPE, PRED_TYPE, NAME, CONST) \ ++ { \ ++ TYPE x[N], y[N]; \ ++ PRED_TYPE pred[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ y[i] = i * i; \ ++ pred[i] = i % 3; \ ++ } \ ++ test_##TYPE##_##NAME (x, y, pred, N); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ TYPE expected = i % 3 != 1 ? y[i] * (TYPE) CONST : 8; \ ++ if (x[i] != expected) \ ++ __builtin_abort (); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ } ++ ++int ++main (void) ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fmul_4.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmul_4.c +new file mode 100644 +index 000000000..c4fdb2b2b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmul_4.c +@@ -0,0 +1,49 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define DEF_LOOP(TYPE, PRED_TYPE, NAME, CONST) \ ++ void __attribute__ ((noipa)) \ ++ test_##TYPE##_##NAME (TYPE *__restrict x, \ ++ TYPE *__restrict y, \ ++ PRED_TYPE *__restrict pred, \ ++ int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ x[i] = pred[i] != 1 ? 
y[i] * (TYPE) CONST : 0; \ ++ } ++ ++#define TEST_TYPE(T, TYPE, PRED_TYPE) \ ++ T (TYPE, PRED_TYPE, half, 0.5) \ ++ T (TYPE, PRED_TYPE, two, 2.0) \ ++ T (TYPE, PRED_TYPE, four, 4.0) ++ ++#define TEST_ALL(T) \ ++ TEST_TYPE (T, _Float16, int16_t) \ ++ TEST_TYPE (T, float, int32_t) \ ++ TEST_TYPE (T, double, int64_t) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0\.5\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.5\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.5\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #2\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #2\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #2\.0\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.h, #4\.0} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #4\.0} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #4\.0} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/z, z[0-9]+\.s\n} 3 } } */ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/z, z[0-9]+\.d\n} 3 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fmul_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmul_4_run.c +new file mode 100644 +index 000000000..b93e031e5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmul_4_run.c +@@ -0,0 +1,32 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_fmul_4.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(TYPE, PRED_TYPE, NAME, CONST) \ ++ { \ ++ TYPE x[N], y[N]; \ ++ PRED_TYPE pred[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ y[i] = i * i; \ ++ pred[i] = i % 3; \ ++ } \ ++ test_##TYPE##_##NAME (x, y, pred, N); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ TYPE expected = i % 3 != 1 ? y[i] * (TYPE) CONST : 0; \ ++ if (x[i] != expected) \ ++ __builtin_abort (); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ } ++ ++int ++main (void) ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fsubr_1.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fsubr_1.c +new file mode 100644 +index 000000000..8e7172af4 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fsubr_1.c +@@ -0,0 +1,47 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define DEF_LOOP(TYPE, PRED_TYPE, NAME, CONST) \ ++ void __attribute__ ((noipa)) \ ++ test_##TYPE##_##NAME (TYPE *__restrict x, \ ++ TYPE *__restrict y, \ ++ PRED_TYPE *__restrict pred, \ ++ int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ x[i] = pred[i] != 1 ? 
(TYPE) CONST - y[i] : y[i]; \ ++ } ++ ++#define TEST_TYPE(T, TYPE, PRED_TYPE) \ ++ T (TYPE, PRED_TYPE, half, 0.5) \ ++ T (TYPE, PRED_TYPE, one, 1.0) \ ++ T (TYPE, PRED_TYPE, two, 2.0) ++ ++#define TEST_ALL(T) \ ++ TEST_TYPE (T, _Float16, int16_t) \ ++ TEST_TYPE (T, float, int32_t) \ ++ TEST_TYPE (T, double, int64_t) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0\.5\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.5\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.5\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1\.0\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.h, #2\.0} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #2\.0} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz} } } */ ++/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fsubr_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fsubr_1_run.c +new file mode 100644 +index 000000000..61ffac429 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fsubr_1_run.c +@@ -0,0 +1,32 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_fsubr_1.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(TYPE, PRED_TYPE, NAME, CONST) \ ++ { \ ++ TYPE x[N], y[N]; \ ++ PRED_TYPE pred[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ y[i] = i * i; \ ++ pred[i] = i % 3; \ ++ } \ ++ test_##TYPE##_##NAME (x, y, pred, N); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ TYPE expected = i % 3 != 1 ? (TYPE) CONST - y[i] : y[i]; \ ++ if (x[i] != expected) \ ++ __builtin_abort (); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ } ++ ++int ++main (void) ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fsubr_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fsubr_2.c +new file mode 100644 +index 000000000..6d2efde94 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fsubr_2.c +@@ -0,0 +1,44 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define DEF_LOOP(TYPE, NAME, CONST) \ ++ void __attribute__ ((noipa)) \ ++ test_##TYPE##_##NAME (TYPE *__restrict x, \ ++ TYPE *__restrict y, \ ++ TYPE *__restrict z, \ ++ int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ x[i] = y[i] < 8 ? 
(TYPE) CONST - z[i] : y[i]; \ ++ } ++ ++#define TEST_TYPE(T, TYPE) \ ++ T (TYPE, half, 0.5) \ ++ T (TYPE, one, 1.0) \ ++ T (TYPE, two, 2.0) ++ ++#define TEST_ALL(T) \ ++ TEST_TYPE (T, float) \ ++ TEST_TYPE (T, double) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.5\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.5\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1\.0\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #2\.0} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 3 } } */ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d\n} 3 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fsubr_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fsubr_2_run.c +new file mode 100644 +index 000000000..1b25392b0 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fsubr_2_run.c +@@ -0,0 +1,31 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_fsubr_2.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(TYPE, NAME, CONST) \ ++ { \ ++ TYPE x[N], y[N], z[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ y[i] = i % 13; \ ++ z[i] = i * i; \ ++ } \ ++ test_##TYPE##_##NAME (x, y, z, N); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ TYPE expected = y[i] < 8 ? (TYPE) CONST - z[i] : y[i]; \ ++ if (x[i] != expected) \ ++ __builtin_abort (); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ } ++ ++int ++main (void) ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fsubr_3.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fsubr_3.c +new file mode 100644 +index 000000000..328af5741 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fsubr_3.c +@@ -0,0 +1,50 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define DEF_LOOP(TYPE, PRED_TYPE, NAME, CONST) \ ++ void __attribute__ ((noipa)) \ ++ test_##TYPE##_##NAME (TYPE *__restrict x, \ ++ TYPE *__restrict y, \ ++ PRED_TYPE *__restrict pred, \ ++ int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ x[i] = pred[i] != 1 ? 
(TYPE) CONST - y[i] : 4; \ ++ } ++ ++#define TEST_TYPE(T, TYPE, PRED_TYPE) \ ++ T (TYPE, PRED_TYPE, half, 0.5) \ ++ T (TYPE, PRED_TYPE, one, 1.0) \ ++ T (TYPE, PRED_TYPE, two, 2.0) ++ ++#define TEST_ALL(T) \ ++ TEST_TYPE (T, _Float16, int16_t) \ ++ TEST_TYPE (T, float, int32_t) \ ++ TEST_TYPE (T, double, int64_t) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0\.5\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.5\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.5\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1\.0\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.h, #2\.0} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #2\.0} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.h, p[0-7], z[0-9]+\.h, z[0-9]+\.h\n} 3 } } */ ++/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.h, p[0-7], z[0-9]+\.h, z[0-9]+\.h\n} 3 } } */ ++/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.h, p[0-7], z[0-9]+\.h, z[0-9]+\.h\n} 3 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ ++/* { dg-final { scan-assembler-not {\tmov\tz} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fsubr_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fsubr_3_run.c +new file mode 100644 +index 000000000..8978287df +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fsubr_3_run.c +@@ -0,0 +1,32 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_fsubr_3.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(TYPE, PRED_TYPE, NAME, CONST) \ ++ { \ ++ TYPE x[N], y[N]; \ ++ PRED_TYPE pred[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ y[i] = i * i; \ ++ pred[i] = i % 3; \ ++ } \ ++ test_##TYPE##_##NAME (x, y, pred, N); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ TYPE expected = i % 3 != 1 ? (TYPE) CONST - y[i] : 4; \ ++ if (x[i] != expected) \ ++ __builtin_abort (); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ } ++ ++int ++main (void) ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fsubr_4.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fsubr_4.c +new file mode 100644 +index 000000000..1d420b104 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fsubr_4.c +@@ -0,0 +1,49 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define DEF_LOOP(TYPE, PRED_TYPE, NAME, CONST) \ ++ void __attribute__ ((noipa)) \ ++ test_##TYPE##_##NAME (TYPE *__restrict x, \ ++ TYPE *__restrict y, \ ++ PRED_TYPE *__restrict pred, \ ++ int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ x[i] = pred[i] != 1 ? 
(TYPE) CONST - y[i] : 0; \ ++ } ++ ++#define TEST_TYPE(T, TYPE, PRED_TYPE) \ ++ T (TYPE, PRED_TYPE, half, 0.5) \ ++ T (TYPE, PRED_TYPE, one, 1.0) \ ++ T (TYPE, PRED_TYPE, two, 2.0) ++ ++#define TEST_ALL(T) \ ++ TEST_TYPE (T, _Float16, int16_t) \ ++ TEST_TYPE (T, float, int32_t) \ ++ TEST_TYPE (T, double, int64_t) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0\.5\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.5\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.5\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1\.0\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.h, #2\.0} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #2\.0} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/z, z[0-9]+\.s\n} 3 } } */ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/z, z[0-9]+\.d\n} 3 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fsubr_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fsubr_4_run.c +new file mode 100644 +index 000000000..2cb3409af +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fsubr_4_run.c +@@ -0,0 +1,32 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_fsubr_4.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(TYPE, PRED_TYPE, NAME, CONST) \ ++ { \ ++ TYPE x[N], y[N]; \ ++ PRED_TYPE pred[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ y[i] = i * i; \ ++ pred[i] = i % 3; \ ++ } \ ++ test_##TYPE##_##NAME (x, y, pred, N); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ TYPE expected = i % 3 != 1 ? (TYPE) CONST - y[i] : 0; \ ++ if (x[i] != expected) \ ++ __builtin_abort (); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ } ++ ++int ++main (void) ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_1.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_1.c +new file mode 100644 +index 000000000..a1e80b8a9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_1.c +@@ -0,0 +1,62 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define bit_and(A, B) ((A) & (B)) ++#define bit_or(A, B) ((A) | (B)) ++#define bit_xor(A, B) ((A) ^ (B)) ++#define bit_bic(A, B) ((A) & ~(B)) ++ ++#define DEF_LOOP(TYPE, OP) \ ++ void __attribute__ ((noinline, noclone)) \ ++ test_##TYPE##_##OP (TYPE *__restrict r, \ ++ TYPE *__restrict a, \ ++ TYPE *__restrict b, \ ++ TYPE *__restrict c, int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ r[i] = a[i] < 20 ? 
OP (b[i], c[i]) : b[i]; \ ++ } ++ ++#define TEST_TYPE(T, TYPE) \ ++ T (TYPE, bit_and) \ ++ T (TYPE, bit_or) \ ++ T (TYPE, bit_xor) \ ++ T (TYPE, bit_bic) ++ ++#define TEST_ALL(T) \ ++ TEST_TYPE (T, int8_t) \ ++ TEST_TYPE (T, uint8_t) \ ++ TEST_TYPE (T, int16_t) \ ++ TEST_TYPE (T, uint16_t) \ ++ TEST_TYPE (T, int32_t) \ ++ TEST_TYPE (T, uint32_t) \ ++ TEST_TYPE (T, int64_t) \ ++ TEST_TYPE (T, uint64_t) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ ++ ++/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ ++ ++/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ ++ ++/* { dg-final { scan-assembler-times {\tbic\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tbic\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tbic\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tbic\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ ++/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_1_run.c +new file mode 100644 +index 000000000..cb12e5609 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_1_run.c +@@ -0,0 +1,33 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_logical_1.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(TYPE, OP) \ ++ { \ ++ TYPE r[N], a[N], b[N], c[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a[i] = (i & 1 ? i : 3 * i); \ ++ b[i] = (i >> 4) << (i & 15); \ ++ c[i] = ((i + 2) % 3) * (i + 1); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##TYPE##_##OP (r, a, b, c, N); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ TYPE expected = a[i] < 20 ? 
OP (b[i], c[i]) : b[i]; \ ++ if (r[i] != expected) \ ++ __builtin_abort (); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ } ++ ++int ++main (void) ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_2.c +new file mode 100644 +index 000000000..c476fe2ff +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_2.c +@@ -0,0 +1,63 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define bit_and(A, B) ((A) & (B)) ++#define bit_or(A, B) ((A) | (B)) ++#define bit_xor(A, B) ((A) ^ (B)) ++#define bit_bic(A, B) ((A) & ~(B)) ++ ++#define DEF_LOOP(TYPE, OP) \ ++ void __attribute__ ((noinline, noclone)) \ ++ test_##TYPE##_##OP (TYPE *__restrict r, \ ++ TYPE *__restrict a, \ ++ TYPE *__restrict b, \ ++ TYPE *__restrict c, int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ r[i] = a[i] < 20 ? OP (b[i], c[i]) : c[i]; \ ++ } ++ ++#define TEST_TYPE(T, TYPE) \ ++ T (TYPE, bit_and) \ ++ T (TYPE, bit_or) \ ++ T (TYPE, bit_xor) \ ++ T (TYPE, bit_bic) ++ ++#define TEST_ALL(T) \ ++ TEST_TYPE (T, int8_t) \ ++ TEST_TYPE (T, uint8_t) \ ++ TEST_TYPE (T, int16_t) \ ++ TEST_TYPE (T, uint16_t) \ ++ TEST_TYPE (T, int32_t) \ ++ TEST_TYPE (T, uint32_t) \ ++ TEST_TYPE (T, int64_t) \ ++ TEST_TYPE (T, uint64_t) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ ++ ++/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ ++ ++/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ ++ ++/* { dg-final { scan-assembler-times {\tbic\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tbic\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tbic\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tbic\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ ++/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ ++/* There's no BICR or equivalent, so the BIC functions need a select. */ ++/* { dg-final { scan-assembler-times {\tsel\t} 8 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_2_run.c +new file mode 100644 +index 000000000..9b9918cc8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_2_run.c +@@ -0,0 +1,33 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_logical_2.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(TYPE, OP) \ ++ { \ ++ TYPE r[N], a[N], b[N], c[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a[i] = (i & 1 ? 
i : 3 * i); \ ++ b[i] = (i >> 4) << (i & 15); \ ++ c[i] = ((i + 2) % 3) * (i + 1); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##TYPE##_##OP (r, a, b, c, N); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ TYPE expected = a[i] < 20 ? OP (b[i], c[i]) : c[i]; \ ++ if (r[i] != expected) \ ++ __builtin_abort (); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ } ++ ++int ++main (void) ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_3.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_3.c +new file mode 100644 +index 000000000..7ad2c4ea3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_3.c +@@ -0,0 +1,66 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define bit_and(A, B) ((A) & (B)) ++#define bit_or(A, B) ((A) | (B)) ++#define bit_xor(A, B) ((A) ^ (B)) ++#define bit_bic(A, B) ((A) & ~(B)) ++ ++#define DEF_LOOP(TYPE, OP) \ ++ void __attribute__ ((noinline, noclone)) \ ++ test_##TYPE##_##OP (TYPE *__restrict r, \ ++ TYPE *__restrict a, \ ++ TYPE *__restrict b, \ ++ TYPE *__restrict c, int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ r[i] = a[i] < 20 ? OP (b[i], c[i]) : a[i]; \ ++ } ++ ++#define TEST_TYPE(T, TYPE) \ ++ T (TYPE, bit_and) \ ++ T (TYPE, bit_or) \ ++ T (TYPE, bit_xor) \ ++ T (TYPE, bit_bic) ++ ++#define TEST_ALL(T) \ ++ TEST_TYPE (T, int8_t) \ ++ TEST_TYPE (T, uint8_t) \ ++ TEST_TYPE (T, int16_t) \ ++ TEST_TYPE (T, uint16_t) \ ++ TEST_TYPE (T, int32_t) \ ++ TEST_TYPE (T, uint32_t) \ ++ TEST_TYPE (T, int64_t) \ ++ TEST_TYPE (T, uint64_t) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ ++ ++/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ ++ ++/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ ++ ++/* { dg-final { scan-assembler-times {\tbic\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tbic\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tbic\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tbic\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ ++ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b\n} 8 } } */ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h\n} 8 } } */ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 8 } } */ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d\n} 8 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_3_run.c +new 
file mode 100644 +index 000000000..05dc78ab3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_3_run.c +@@ -0,0 +1,33 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_logical_3.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(TYPE, OP) \ ++ { \ ++ TYPE r[N], a[N], b[N], c[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a[i] = (i & 1 ? i : 3 * i); \ ++ b[i] = (i >> 4) << (i & 15); \ ++ c[i] = ((i + 2) % 3) * (i + 1); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##TYPE##_##OP (r, a, b, c, N); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ TYPE expected = a[i] < 20 ? OP (b[i], c[i]) : a[i]; \ ++ if (r[i] != expected) \ ++ __builtin_abort (); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ } ++ ++int ++main (void) ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_4.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_4.c +new file mode 100644 +index 000000000..00217bffa +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_4.c +@@ -0,0 +1,62 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define bit_and(A, B) ((A) & (B)) ++#define bit_or(A, B) ((A) | (B)) ++#define bit_xor(A, B) ((A) ^ (B)) ++#define bit_bic(A, B) ((A) & ~(B)) ++ ++#define DEF_LOOP(TYPE, OP) \ ++ void __attribute__ ((noinline, noclone)) \ ++ test_##TYPE##_##OP (TYPE *__restrict r, \ ++ TYPE *__restrict a, \ ++ TYPE *__restrict b, \ ++ TYPE *__restrict c, int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ r[i] = a[i] < 20 ? OP (b[i], c[i]) : 42; \ ++ } ++ ++#define TEST_TYPE(T, TYPE) \ ++ T (TYPE, bit_and) \ ++ T (TYPE, bit_or) \ ++ T (TYPE, bit_xor) \ ++ T (TYPE, bit_bic) ++ ++#define TEST_ALL(T) \ ++ TEST_TYPE (T, int8_t) \ ++ TEST_TYPE (T, uint8_t) \ ++ TEST_TYPE (T, int16_t) \ ++ TEST_TYPE (T, uint16_t) \ ++ TEST_TYPE (T, int32_t) \ ++ TEST_TYPE (T, uint32_t) \ ++ TEST_TYPE (T, int64_t) \ ++ TEST_TYPE (T, uint64_t) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ ++ ++/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ ++ ++/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ ++ ++/* { dg-final { scan-assembler-times {\tbic\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tbic\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tbic\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tbic\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ ++/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ ++/* { dg-final { scan-assembler-times {\tsel\t} 32 } } */ +diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_4_run.c +new file mode 100644 +index 000000000..46fb11594 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_4_run.c +@@ -0,0 +1,33 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_logical_4.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(TYPE, OP) \ ++ { \ ++ TYPE r[N], a[N], b[N], c[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a[i] = (i & 1 ? i : 3 * i); \ ++ b[i] = (i >> 4) << (i & 15); \ ++ c[i] = ((i + 2) % 3) * (i + 1); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##TYPE##_##OP (r, a, b, c, N); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ TYPE expected = a[i] < 20 ? OP (b[i], c[i]) : 42; \ ++ if (r[i] != expected) \ ++ __builtin_abort (); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ } ++ ++int ++main (void) ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_5.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_5.c +new file mode 100644 +index 000000000..36b541f21 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_5.c +@@ -0,0 +1,66 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define bit_and(A, B) ((A) & (B)) ++#define bit_or(A, B) ((A) | (B)) ++#define bit_xor(A, B) ((A) ^ (B)) ++#define bit_bic(A, B) ((A) & ~(B)) ++ ++#define DEF_LOOP(TYPE, OP) \ ++ void __attribute__ ((noinline, noclone)) \ ++ test_##TYPE##_##OP (TYPE *__restrict r, \ ++ TYPE *__restrict a, \ ++ TYPE *__restrict b, \ ++ TYPE *__restrict c, int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ r[i] = a[i] < 20 ? OP (b[i], c[i]) : 0; \ ++ } ++ ++#define TEST_TYPE(T, TYPE) \ ++ T (TYPE, bit_and) \ ++ T (TYPE, bit_or) \ ++ T (TYPE, bit_xor) \ ++ T (TYPE, bit_bic) ++ ++#define TEST_ALL(T) \ ++ TEST_TYPE (T, int8_t) \ ++ TEST_TYPE (T, uint8_t) \ ++ TEST_TYPE (T, int16_t) \ ++ TEST_TYPE (T, uint16_t) \ ++ TEST_TYPE (T, int32_t) \ ++ TEST_TYPE (T, uint32_t) \ ++ TEST_TYPE (T, int64_t) \ ++ TEST_TYPE (T, uint64_t) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ ++ ++/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ ++ ++/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ ++ ++/* { dg-final { scan-assembler-times {\tbic\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tbic\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tbic\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tbic\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ ++ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.b, p[0-7]/z, z[0-9]+\.b\n} 8 } } */ 
++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/z, z[0-9]+\.h\n} 8 } } */ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/z, z[0-9]+\.s\n} 8 } } */ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/z, z[0-9]+\.d\n} 8 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_5_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_5_run.c +new file mode 100644 +index 000000000..e0da5fe58 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_5_run.c +@@ -0,0 +1,33 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_logical_5.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(TYPE, OP) \ ++ { \ ++ TYPE r[N], a[N], b[N], c[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a[i] = (i & 1 ? i : 3 * i); \ ++ b[i] = (i >> 4) << (i & 15); \ ++ c[i] = ((i + 2) % 3) * (i + 1); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##TYPE##_##OP (r, a, b, c, N); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ TYPE expected = a[i] < 20 ? OP (b[i], c[i]) : 0; \ ++ if (r[i] != expected) \ ++ __builtin_abort (); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ } ++ ++int ++main (void) ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_1.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_1.c +new file mode 100644 +index 000000000..cb01d50f3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_1.c +@@ -0,0 +1,52 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define DEF_LOOP(TYPE, NAME, OP) \ ++ void __attribute__ ((noipa)) \ ++ test_##TYPE##_##NAME (TYPE *__restrict r, \ ++ TYPE *__restrict a, \ ++ TYPE *__restrict b, TYPE c, \ ++ TYPE *__restrict pred, int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ r[i] = pred[i] != 1 ? 
a[i] OP b[i] * c : b[i]; \ ++ } ++ ++#define TEST_TYPE(T, TYPE) \ ++ T (TYPE, add, +) \ ++ T (TYPE, sub, -) ++ ++#define TEST_ALL(T) \ ++ TEST_TYPE (T, uint8_t) \ ++ TEST_TYPE (T, uint16_t) \ ++ TEST_TYPE (T, uint32_t) \ ++ TEST_TYPE (T, uint64_t) \ ++ TEST_TYPE (T, _Float16) \ ++ TEST_TYPE (T, float) \ ++ TEST_TYPE (T, double) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ ++/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_1_run.c +new file mode 100644 +index 000000000..bcfc62280 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_1_run.c +@@ -0,0 +1,35 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_mla_1.c" ++ ++#define FACTOR 17 ++#define N 99 ++ ++#define TEST_LOOP(TYPE, NAME, OP) \ ++ { \ ++ TYPE r[N], a[N], b[N], pred[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a[i] = (i & 1 ? i : 3 * i); \ ++ b[i] = (i >> 4) << (i & 15); \ ++ pred[i] = i % 3 < i % 5; \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ TYPE expected \ ++ = pred[i] != 1 ? a[i] OP b[i] * (TYPE) FACTOR : b[i]; \ ++ if (r[i] != expected) \ ++ __builtin_abort (); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ } ++ ++int ++main (void) ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_2.c +new file mode 100644 +index 000000000..b6ea1a3e2 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_2.c +@@ -0,0 +1,53 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define DEF_LOOP(TYPE, NAME, OP) \ ++ void __attribute__ ((noipa)) \ ++ test_##TYPE##_##NAME (TYPE *__restrict r, \ ++ TYPE *__restrict a, \ ++ TYPE *__restrict b, TYPE c, \ ++ TYPE *__restrict pred, int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ r[i] = pred[i] != 1 ? 
a[i] OP b[i] * c : c; \ ++ } ++ ++#define TEST_TYPE(T, TYPE) \ ++ T (TYPE, add, +) \ ++ T (TYPE, sub, -) ++ ++#define TEST_ALL(T) \ ++ TEST_TYPE (T, uint8_t) \ ++ TEST_TYPE (T, uint16_t) \ ++ TEST_TYPE (T, uint32_t) \ ++ TEST_TYPE (T, uint64_t) \ ++ TEST_TYPE (T, _Float16) \ ++ TEST_TYPE (T, float) \ ++ TEST_TYPE (T, double) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+, z[0-9]+\n} 14 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_2_run.c +new file mode 100644 +index 000000000..79998b84e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_2_run.c +@@ -0,0 +1,36 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_mla_2.c" ++ ++#define FACTOR 17 ++#define N 99 ++ ++#define TEST_LOOP(TYPE, NAME, OP) \ ++ { \ ++ TYPE r[N], a[N], b[N], pred[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a[i] = (i & 1 ? i : 3 * i); \ ++ b[i] = (i >> 4) << (i & 15); \ ++ pred[i] = i % 3 < i % 5; \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ TYPE expected = (pred[i] != 1 \ ++ ? a[i] OP b[i] * (TYPE) FACTOR \ ++ : (TYPE) FACTOR); \ ++ if (r[i] != expected) \ ++ __builtin_abort (); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ } ++ ++int ++main (void) ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_3.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_3.c +new file mode 100644 +index 000000000..085fccf53 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_3.c +@@ -0,0 +1,52 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define DEF_LOOP(TYPE, NAME, OP) \ ++ void __attribute__ ((noipa)) \ ++ test_##TYPE##_##NAME (TYPE *__restrict r, \ ++ TYPE *__restrict a, \ ++ TYPE *__restrict b, TYPE c, \ ++ TYPE *__restrict pred, int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ r[i] = pred[i] != 1 ? 
a[i] OP b[i] * c : a[i]; \ ++ } ++ ++#define TEST_TYPE(T, TYPE) \ ++ T (TYPE, add, +) \ ++ T (TYPE, sub, -) ++ ++#define TEST_ALL(T) \ ++ TEST_TYPE (T, uint8_t) \ ++ TEST_TYPE (T, uint16_t) \ ++ TEST_TYPE (T, uint32_t) \ ++ TEST_TYPE (T, uint64_t) \ ++ TEST_TYPE (T, _Float16) \ ++ TEST_TYPE (T, float) \ ++ TEST_TYPE (T, double) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ ++/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_3_run.c +new file mode 100644 +index 000000000..cbd1185b2 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_3_run.c +@@ -0,0 +1,35 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_mla_3.c" ++ ++#define FACTOR 17 ++#define N 99 ++ ++#define TEST_LOOP(TYPE, NAME, OP) \ ++ { \ ++ TYPE r[N], a[N], b[N], pred[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a[i] = (i & 1 ? i : 3 * i); \ ++ b[i] = (i >> 4) << (i & 15); \ ++ pred[i] = i % 3 < i % 5; \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ TYPE expected \ ++ = pred[i] != 1 ? a[i] OP b[i] * (TYPE) FACTOR : a[i]; \ ++ if (r[i] != expected) \ ++ __builtin_abort (); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ } ++ ++int ++main (void) ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_4.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_4.c +new file mode 100644 +index 000000000..ed9f73e9c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_4.c +@@ -0,0 +1,56 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define DEF_LOOP(TYPE, NAME, OP) \ ++ void __attribute__ ((noipa)) \ ++ test_##TYPE##_##NAME (TYPE *__restrict r, \ ++ TYPE *__restrict a, \ ++ TYPE *__restrict b, TYPE c, \ ++ TYPE *__restrict pred, int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ r[i] = pred[i] == 1 ? 
a[i] OP b[i] * c : pred[i]; \ ++ } ++ ++#define TEST_TYPE(T, TYPE) \ ++ T (TYPE, add, +) \ ++ T (TYPE, sub, -) ++ ++#define TEST_ALL(T) \ ++ TEST_TYPE (T, uint8_t) \ ++ TEST_TYPE (T, uint16_t) \ ++ TEST_TYPE (T, uint32_t) \ ++ TEST_TYPE (T, uint64_t) \ ++ TEST_TYPE (T, _Float16) \ ++ TEST_TYPE (T, float) \ ++ TEST_TYPE (T, double) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m,} 4 } } */ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m,} 4 } } */ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/m,} 4 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_4_run.c +new file mode 100644 +index 000000000..5e078594a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_4_run.c +@@ -0,0 +1,36 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_mla_4.c" ++ ++#define FACTOR 17 ++#define N 99 ++ ++#define TEST_LOOP(TYPE, NAME, OP) \ ++ { \ ++ TYPE r[N], a[N], b[N], pred[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a[i] = (i & 1 ? i : 3 * i); \ ++ b[i] = (i >> 4) << (i & 15); \ ++ pred[i] = i % 3; \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ TYPE expected = (pred[i] == 1 \ ++ ? 
a[i] OP b[i] * (TYPE) FACTOR \ ++ : pred[i]); \ ++ if (r[i] != expected) \ ++ __builtin_abort (); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ } ++ ++int ++main (void) ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_5.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_5.c +new file mode 100644 +index 000000000..97e233579 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_5.c +@@ -0,0 +1,56 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define DEF_LOOP(TYPE, NAME, OP) \ ++ void __attribute__ ((noipa)) \ ++ test_##TYPE##_##NAME (TYPE *__restrict r, \ ++ TYPE *__restrict a, \ ++ TYPE *__restrict b, TYPE c, \ ++ TYPE *__restrict pred, int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ r[i] = pred[i] ? a[i] OP b[i] * c : 0; \ ++ } ++ ++#define TEST_TYPE(T, TYPE) \ ++ T (TYPE, add, +) \ ++ T (TYPE, sub, -) ++ ++#define TEST_ALL(T) \ ++ TEST_TYPE (T, uint8_t) \ ++ TEST_TYPE (T, uint16_t) \ ++ TEST_TYPE (T, uint32_t) \ ++ TEST_TYPE (T, uint64_t) \ ++ TEST_TYPE (T, _Float16) \ ++ TEST_TYPE (T, float) \ ++ TEST_TYPE (T, double) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\t(?:mla|mad)\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\t(?:mla|mad)\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\t(?:mla|mad)\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\t(?:mla|mad)\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\t(?:mls|msb)\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\t(?:mls|msb)\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\t(?:mls|msb)\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\t(?:mls|msb)\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\t(?:fmla|fmad)\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\t(?:fmla|fmad)\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\t(?:fmla|fmad)\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\t(?:fmls|fmsb)\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\t(?:fmls|fmsb)\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\t(?:fmls|fmsb)\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.b, p[0-7]/z,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/z,} 4 } } */ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/z,} 4 } } */ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/z,} 4 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_5_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_5_run.c +new file mode 100644 +index 000000000..9de46e30f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_5_run.c +@@ -0,0 +1,35 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_mla_5.c" ++ ++#define FACTOR 17 ++#define N 99 ++ ++#define TEST_LOOP(TYPE, NAME, OP) \ ++ { \ ++ TYPE r[N], a[N], b[N], pred[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a[i] = (i & 1 ? 
i : 3 * i); \ ++ b[i] = (i >> 4) << (i & 15); \ ++ pred[i] = i % 3 < i % 5; \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ TYPE expected \ ++ = pred[i] ? a[i] OP b[i] * (TYPE) FACTOR : 0; \ ++ if (r[i] != expected) \ ++ __builtin_abort (); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ } ++ ++int ++main (void) ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_6.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_6.c +new file mode 100644 +index 000000000..832bdb3d8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_6.c +@@ -0,0 +1,53 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define DEF_LOOP(TYPE, NAME, OP) \ ++ void __attribute__ ((noipa)) \ ++ test_##TYPE##_##NAME (TYPE *__restrict r, \ ++ TYPE *__restrict a, \ ++ TYPE *__restrict b, TYPE c, \ ++ TYPE *__restrict pred, int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ r[i] = pred[i] ? a[i] OP b[i] * c : 5; \ ++ } ++ ++#define TEST_TYPE(T, TYPE) \ ++ T (TYPE, add, +) \ ++ T (TYPE, sub, -) ++ ++#define TEST_ALL(T) \ ++ TEST_TYPE (T, uint8_t) \ ++ TEST_TYPE (T, uint16_t) \ ++ TEST_TYPE (T, uint32_t) \ ++ TEST_TYPE (T, uint64_t) \ ++ TEST_TYPE (T, _Float16) \ ++ TEST_TYPE (T, float) \ ++ TEST_TYPE (T, double) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tsel\t} 14 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ ++/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_6_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_6_run.c +new file mode 100644 +index 000000000..59f57a2db +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_6_run.c +@@ -0,0 +1,35 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_mla_6.c" ++ ++#define FACTOR 17 ++#define N 99 ++ ++#define TEST_LOOP(TYPE, NAME, OP) \ ++ { \ ++ TYPE r[N], a[N], b[N], pred[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a[i] = (i & 1 ? i : 3 * i); \ ++ b[i] = (i >> 4) << (i & 15); \ ++ pred[i] = i % 3 < i % 5; \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ TYPE expected \ ++ = pred[i] ? 
a[i] OP b[i] * (TYPE) FACTOR : 5; \ ++ if (r[i] != expected) \ ++ __builtin_abort (); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ } ++ ++int ++main (void) ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_7.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_7.c +new file mode 100644 +index 000000000..5561f4219 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_7.c +@@ -0,0 +1,62 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define DEF_LOOP(TYPE, NAME, OP, CONST) \ ++ void __attribute__ ((noipa)) \ ++ test_##TYPE##_##NAME##_##CONST (TYPE *__restrict r, \ ++ TYPE *__restrict a, \ ++ TYPE *__restrict b, \ ++ TYPE *__restrict pred, int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ r[i] = pred[i] != 1 ? a[i] OP b[i] * CONST : a[i]; \ ++ } ++ ++#define TEST_COUNT(T, TYPE, CONST) \ ++ T (TYPE, add, +, CONST) \ ++ T (TYPE, sub, -, CONST) ++ ++#define TEST_TYPE(T, TYPE, CONST) \ ++ TEST_COUNT (T, TYPE, 2) \ ++ TEST_COUNT (T, TYPE, 4) \ ++ TEST_COUNT (T, TYPE, CONST) ++ ++#define TEST_ALL(T) \ ++ TEST_TYPE (T, uint8_t, 0x80) \ ++ TEST_TYPE (T, uint16_t, 0x8000) \ ++ TEST_TYPE (T, uint32_t, 0x80000000) \ ++ TEST_TYPE (T, uint64_t, 0x8000000000000000ULL) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #1\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #2\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #7\n} 2 } } */ ++ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #1\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #2\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #15\n} 2 } } */ ++ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #1\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #2\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #31\n} 2 } } */ ++ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #1\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #2\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #63\n} 2 } } */ ++ ++/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 4 } } */ ++/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */ ++/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 4 } } */ ++/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 4 } } */ ++ ++/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ ++/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_7_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_7_run.c +new file mode 100644 +index 000000000..b094f40a2 +--- 
/dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_7_run.c +@@ -0,0 +1,34 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_mla_7.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(TYPE, NAME, OP, CONST) \ ++ { \ ++ TYPE r[N], a[N], b[N], pred[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a[i] = (i & 1 ? i : 3 * i); \ ++ b[i] = (i >> 4) << (i & 15); \ ++ pred[i] = i % 3 < i % 5; \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##TYPE##_##NAME##_##CONST (r, a, b, pred, N); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ TYPE expected \ ++ = pred[i] != 1 ? a[i] OP b[i] * CONST : a[i]; \ ++ if (r[i] != expected) \ ++ __builtin_abort (); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ } ++ ++int ++main (void) ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_8.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_8.c +new file mode 100644 +index 000000000..d5549272e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_8.c +@@ -0,0 +1,62 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define DEF_LOOP(TYPE, NAME, OP, CONST) \ ++ void __attribute__ ((noipa)) \ ++ test_##TYPE##_##NAME##_##CONST (TYPE *__restrict r, \ ++ TYPE *__restrict a, \ ++ TYPE *__restrict b, \ ++ TYPE *__restrict pred, int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ r[i] = pred[i] != 1 ? a[i] OP b[i] * -CONST : a[i]; \ ++ } ++ ++#define TEST_COUNT(T, TYPE, CONST) \ ++ T (TYPE, add, +, CONST) \ ++ T (TYPE, sub, -, CONST) ++ ++#define TEST_TYPE(T, TYPE, CONST) \ ++ TEST_COUNT (T, TYPE, 2) \ ++ TEST_COUNT (T, TYPE, 4) \ ++ TEST_COUNT (T, TYPE, CONST) ++ ++#define TEST_ALL(T) \ ++ TEST_TYPE (T, uint8_t, 0x80) \ ++ TEST_TYPE (T, uint16_t, 0x8000) \ ++ TEST_TYPE (T, uint32_t, 0x80000000) \ ++ TEST_TYPE (T, uint64_t, 0x8000000000000000ULL) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #1\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #2\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #7\n} 2 } } */ ++ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #1\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #2\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #15\n} 2 } } */ ++ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #1\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #2\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #31\n} 2 } } */ ++ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #1\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #2\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #63\n} 2 } } */ ++ ++/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 4 } } */ ++/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */ ++/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 4 } } */ ++/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 4 } } */ ++ ++/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 2 } } */ ++/* { dg-final { 
scan-assembler-times {\tsub\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ ++/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_8_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_8_run.c +new file mode 100644 +index 000000000..7fb58aa70 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_8_run.c +@@ -0,0 +1,34 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_mla_8.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(TYPE, NAME, OP, CONST) \ ++ { \ ++ TYPE r[N], a[N], b[N], pred[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a[i] = (i & 1 ? i : 3 * i); \ ++ b[i] = (i >> 4) << (i & 15); \ ++ pred[i] = i % 3 < i % 5; \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##TYPE##_##NAME##_##CONST (r, a, b, pred, N); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ TYPE expected \ ++ = pred[i] != 1 ? a[i] OP b[i] * -CONST : a[i]; \ ++ if (r[i] != expected) \ ++ __builtin_abort (); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ } ++ ++int ++main (void) ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_1.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_1.c +new file mode 100644 +index 000000000..f2c51b291 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_1.c +@@ -0,0 +1,48 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define DEF_LOOP(TYPE, NAME, OP) \ ++ void __attribute__ ((noipa)) \ ++ test_##TYPE##_##NAME (TYPE *__restrict r, TYPE *__restrict a, \ ++ TYPE *__restrict b, int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ r[i] = a[i] > 20 ? 
b[i] OP 3 : b[i]; \ ++ } ++ ++#define TEST_TYPE(T, TYPE) \ ++ T (TYPE, shl, <<) \ ++ T (TYPE, shr, >>) ++ ++#define TEST_ALL(T) \ ++ TEST_TYPE (T, int8_t) \ ++ TEST_TYPE (T, uint8_t) \ ++ TEST_TYPE (T, int16_t) \ ++ TEST_TYPE (T, uint16_t) \ ++ TEST_TYPE (T, int32_t) \ ++ TEST_TYPE (T, uint32_t) \ ++ TEST_TYPE (T, int64_t) \ ++ TEST_TYPE (T, uint64_t) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ ++ ++/* { dg-final { scan-assembler-times {\tasr\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tasr\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tasr\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tasr\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ ++/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_1_run.c +new file mode 100644 +index 000000000..acc403ec8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_1_run.c +@@ -0,0 +1,27 @@ ++/* { dg-do run { target { aarch64_sve_hw } } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_shift_1.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(TYPE, NAME, OP) \ ++ { \ ++ TYPE r[N], a[N], b[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a[i] = (i & 1 ? i : 3 * i); \ ++ b[i] = (i >> 4) << (i & 15); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##TYPE##_##NAME (r, a, b, N); \ ++ for (int i = 0; i < N; ++i) \ ++ if (r[i] != (TYPE) (a[i] > 20 ? b[i] OP 3 : b[i])) \ ++ __builtin_abort (); \ ++ } ++ ++int main () ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_2.c +new file mode 100644 +index 000000000..c9082c9c8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_2.c +@@ -0,0 +1,52 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define DEF_LOOP(TYPE, NAME, OP) \ ++ void __attribute__ ((noipa)) \ ++ test_##TYPE##_##NAME (TYPE *__restrict r, TYPE *__restrict a, \ ++ TYPE *__restrict b, int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ r[i] = a[i] > 20 ? 
b[i] OP 3 : a[i]; \ ++ } ++ ++#define TEST_TYPE(T, TYPE) \ ++ T (TYPE, shl, <<) \ ++ T (TYPE, shr, >>) ++ ++#define TEST_ALL(T) \ ++ TEST_TYPE (T, int8_t) \ ++ TEST_TYPE (T, uint8_t) \ ++ TEST_TYPE (T, int16_t) \ ++ TEST_TYPE (T, uint16_t) \ ++ TEST_TYPE (T, int32_t) \ ++ TEST_TYPE (T, uint32_t) \ ++ TEST_TYPE (T, int64_t) \ ++ TEST_TYPE (T, uint64_t) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ ++ ++/* { dg-final { scan-assembler-times {\tasr\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tasr\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tasr\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tasr\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b\n} 4 } } */ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h\n} 4 } } */ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 4 } } */ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d\n} 4 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_2_run.c +new file mode 100644 +index 000000000..4917d3af6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_2_run.c +@@ -0,0 +1,27 @@ ++/* { dg-do run { target { aarch64_sve_hw } } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_shift_2.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(TYPE, NAME, OP) \ ++ { \ ++ TYPE r[N], a[N], b[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a[i] = (i & 1 ? i : 3 * i); \ ++ b[i] = (i >> 4) << (i & 15); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##TYPE##_##NAME (r, a, b, N); \ ++ for (int i = 0; i < N; ++i) \ ++ if (r[i] != (TYPE) (a[i] > 20 ? b[i] OP 3 : a[i])) \ ++ __builtin_abort (); \ ++ } ++ ++int main () ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_3.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_3.c +new file mode 100644 +index 000000000..55e0de8aa +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_3.c +@@ -0,0 +1,48 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define DEF_LOOP(TYPE, NAME, OP) \ ++ void __attribute__ ((noipa)) \ ++ test_##TYPE##_##NAME (TYPE *__restrict r, TYPE *__restrict a, \ ++ TYPE *__restrict b, int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ r[i] = a[i] > 20 ? 
b[i] OP 3 : 72; \ ++ } ++ ++#define TEST_TYPE(T, TYPE) \ ++ T (TYPE, shl, <<) \ ++ T (TYPE, shr, >>) ++ ++#define TEST_ALL(T) \ ++ TEST_TYPE (T, int8_t) \ ++ TEST_TYPE (T, uint8_t) \ ++ TEST_TYPE (T, int16_t) \ ++ TEST_TYPE (T, uint16_t) \ ++ TEST_TYPE (T, int32_t) \ ++ TEST_TYPE (T, uint32_t) \ ++ TEST_TYPE (T, int64_t) \ ++ TEST_TYPE (T, uint64_t) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ ++ ++/* { dg-final { scan-assembler-times {\tasr\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tasr\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tasr\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tasr\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ ++/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ ++/* { dg-final { scan-assembler-times {\tsel\t} 16 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_3_run.c +new file mode 100644 +index 000000000..194c75b8d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_3_run.c +@@ -0,0 +1,27 @@ ++/* { dg-do run { target { aarch64_sve_hw } } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_shift_3.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(TYPE, NAME, OP) \ ++ { \ ++ TYPE r[N], a[N], b[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a[i] = (i & 1 ? i : 3 * i); \ ++ b[i] = (i >> 4) << (i & 15); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##TYPE##_##NAME (r, a, b, N); \ ++ for (int i = 0; i < N; ++i) \ ++ if (r[i] != (TYPE) (a[i] > 20 ? b[i] OP 3 : 72)) \ ++ __builtin_abort (); \ ++ } ++ ++int main () ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_4.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_4.c +new file mode 100644 +index 000000000..32dd68199 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_4.c +@@ -0,0 +1,52 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define DEF_LOOP(TYPE, NAME, OP) \ ++ void __attribute__ ((noipa)) \ ++ test_##TYPE##_##NAME (TYPE *__restrict r, TYPE *__restrict a, \ ++ TYPE *__restrict b, int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ r[i] = a[i] > 20 ? 
b[i] OP 3 : 0; \ ++ } ++ ++#define TEST_TYPE(T, TYPE) \ ++ T (TYPE, shl, <<) \ ++ T (TYPE, shr, >>) ++ ++#define TEST_ALL(T) \ ++ TEST_TYPE (T, int8_t) \ ++ TEST_TYPE (T, uint8_t) \ ++ TEST_TYPE (T, int16_t) \ ++ TEST_TYPE (T, uint16_t) \ ++ TEST_TYPE (T, int32_t) \ ++ TEST_TYPE (T, uint32_t) \ ++ TEST_TYPE (T, int64_t) \ ++ TEST_TYPE (T, uint64_t) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ ++ ++/* { dg-final { scan-assembler-times {\tasr\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tasr\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tasr\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tasr\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.b, p[0-7]/z, z[0-9]+\.b\n} 4 } } */ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/z, z[0-9]+\.h\n} 4 } } */ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/z, z[0-9]+\.s\n} 4 } } */ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/z, z[0-9]+\.d\n} 4 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_4_run.c +new file mode 100644 +index 000000000..ee263000d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_4_run.c +@@ -0,0 +1,27 @@ ++/* { dg-do run { target { aarch64_sve_hw } } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_shift_4.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(TYPE, NAME, OP) \ ++ { \ ++ TYPE r[N], a[N], b[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a[i] = (i & 1 ? i : 3 * i); \ ++ b[i] = (i >> 4) << (i & 15); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##TYPE##_##NAME (r, a, b, N); \ ++ for (int i = 0; i < N; ++i) \ ++ if (r[i] != (TYPE) (a[i] > 20 ? b[i] OP 3 : 0)) \ ++ __builtin_abort (); \ ++ } ++ ++int main () ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_5.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_5.c +new file mode 100644 +index 000000000..1d4491531 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_5.c +@@ -0,0 +1,38 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define DEF_LOOP(TYPE, NAME, OP) \ ++ void __attribute__ ((noipa)) \ ++ test_##TYPE##_##NAME (TYPE *__restrict r, TYPE *__restrict a, \ ++ TYPE *__restrict b, TYPE *__restrict c, int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ r[i] = a[i] > 20 ? 
b[i] OP c[i] : b[i]; \ ++ } ++ ++#define TEST_TYPE(T, TYPE) \ ++ T (TYPE, shl, <<) \ ++ T (TYPE, shr, >>) ++ ++#define TEST_ALL(T) \ ++ TEST_TYPE (T, int32_t) \ ++ TEST_TYPE (T, uint32_t) \ ++ TEST_TYPE (T, int64_t) \ ++ TEST_TYPE (T, uint64_t) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ ++ ++/* { dg-final { scan-assembler-times {\tasr\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tasr\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ ++/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_5_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_5_run.c +new file mode 100644 +index 000000000..35bf1b871 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_5_run.c +@@ -0,0 +1,28 @@ ++/* { dg-do run { target { aarch64_sve_hw } } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_shift_5.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(TYPE, NAME, OP) \ ++ { \ ++ TYPE r[N], a[N], b[N], c[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a[i] = (i & 1 ? i : 3 * i); \ ++ b[i] = (i >> 4) << (i & 15); \ ++ c[i] = ~i & 7; \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##TYPE##_##NAME (r, a, b, c, N); \ ++ for (int i = 0; i < N; ++i) \ ++ if (r[i] != (TYPE) (a[i] > 20 ? b[i] OP c[i] : b[i])) \ ++ __builtin_abort (); \ ++ } ++ ++int main () ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_6.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_6.c +new file mode 100644 +index 000000000..35cb67677 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_6.c +@@ -0,0 +1,33 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define DEF_LOOP(TYPE, NAME, OP) \ ++ void __attribute__ ((noipa)) \ ++ test_##TYPE##_##NAME (TYPE *__restrict r, TYPE *__restrict a, \ ++ TYPE *__restrict b, TYPE *__restrict c, int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ r[i] = a[i] > 20 ? 
b[i] OP c[i] : c[i]; \ ++ } ++ ++#define TEST_TYPE(T, TYPE) \ ++ T (TYPE, shl, <<) \ ++ T (TYPE, shr, >>) ++ ++#define TEST_ALL(T) \ ++ TEST_TYPE (T, int32_t) \ ++ TEST_TYPE (T, uint32_t) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tlslr\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ ++ ++/* { dg-final { scan-assembler-times {\tasrr\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tlsrr\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ ++/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_6_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_6_run.c +new file mode 100644 +index 000000000..e601c6156 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_6_run.c +@@ -0,0 +1,28 @@ ++/* { dg-do run { target { aarch64_sve_hw } } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_shift_6.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(TYPE, NAME, OP) \ ++ { \ ++ TYPE r[N], a[N], b[N], c[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a[i] = (i & 1 ? i : 3 * i); \ ++ b[i] = (i >> 4) << (i & 15); \ ++ c[i] = ~i & 7; \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##TYPE##_##NAME (r, a, b, c, N); \ ++ for (int i = 0; i < N; ++i) \ ++ if (r[i] != (TYPE) (a[i] > 20 ? b[i] OP c[i] : c[i])) \ ++ __builtin_abort (); \ ++ } ++ ++int main () ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_7.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_7.c +new file mode 100644 +index 000000000..80154b25e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_7.c +@@ -0,0 +1,40 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define DEF_LOOP(TYPE, NAME, OP) \ ++ void __attribute__ ((noipa)) \ ++ test_##TYPE##_##NAME (TYPE *__restrict r, TYPE *__restrict a, \ ++ TYPE *__restrict b, TYPE *__restrict c, int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ r[i] = a[i] > 20 ? 
b[i] OP c[i] : a[i]; \ ++ } ++ ++#define TEST_TYPE(T, TYPE) \ ++ T (TYPE, shl, <<) \ ++ T (TYPE, shr, >>) ++ ++#define TEST_ALL(T) \ ++ TEST_TYPE (T, int32_t) \ ++ TEST_TYPE (T, uint32_t) \ ++ TEST_TYPE (T, int64_t) \ ++ TEST_TYPE (T, uint64_t) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ ++ ++/* { dg-final { scan-assembler-times {\tasr\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tasr\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 4 } } */ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d\n} 4 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_7_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_7_run.c +new file mode 100644 +index 000000000..d23b0093d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_7_run.c +@@ -0,0 +1,28 @@ ++/* { dg-do run { target { aarch64_sve_hw } } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_shift_7.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(TYPE, NAME, OP) \ ++ { \ ++ TYPE r[N], a[N], b[N], c[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a[i] = (i & 1 ? i : 3 * i); \ ++ b[i] = (i >> 4) << (i & 15); \ ++ c[i] = ~i & 7; \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##TYPE##_##NAME (r, a, b, c, N); \ ++ for (int i = 0; i < N; ++i) \ ++ if (r[i] != (TYPE) (a[i] > 20 ? b[i] OP c[i] : a[i])) \ ++ __builtin_abort (); \ ++ } ++ ++int main () ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_8.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_8.c +new file mode 100644 +index 000000000..b478c0c4f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_8.c +@@ -0,0 +1,38 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define DEF_LOOP(TYPE, NAME, OP) \ ++ void __attribute__ ((noipa)) \ ++ test_##TYPE##_##NAME (TYPE *__restrict r, TYPE *__restrict a, \ ++ TYPE *__restrict b, TYPE *__restrict c, int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ r[i] = a[i] > 20 ? 
b[i] OP c[i] : 91; \ ++ } ++ ++#define TEST_TYPE(T, TYPE) \ ++ T (TYPE, shl, <<) \ ++ T (TYPE, shr, >>) ++ ++#define TEST_ALL(T) \ ++ TEST_TYPE (T, int32_t) \ ++ TEST_TYPE (T, uint32_t) \ ++ TEST_TYPE (T, int64_t) \ ++ TEST_TYPE (T, uint64_t) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ ++ ++/* { dg-final { scan-assembler-times {\tasr\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tasr\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ ++/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ ++/* { dg-final { scan-assembler-times {\tsel\t} 8 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_8_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_8_run.c +new file mode 100644 +index 000000000..72e5a7b59 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_8_run.c +@@ -0,0 +1,28 @@ ++/* { dg-do run { target { aarch64_sve_hw } } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_shift_8.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(TYPE, NAME, OP) \ ++ { \ ++ TYPE r[N], a[N], b[N], c[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a[i] = (i & 1 ? i : 3 * i); \ ++ b[i] = (i >> 4) << (i & 15); \ ++ c[i] = ~i & 7; \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##TYPE##_##NAME (r, a, b, c, N); \ ++ for (int i = 0; i < N; ++i) \ ++ if (r[i] != (TYPE) (a[i] > 20 ? b[i] OP c[i] : 91)) \ ++ __builtin_abort (); \ ++ } ++ ++int main () ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_9.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_9.c +new file mode 100644 +index 000000000..184e93ab8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_9.c +@@ -0,0 +1,40 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define DEF_LOOP(TYPE, NAME, OP) \ ++ void __attribute__ ((noipa)) \ ++ test_##TYPE##_##NAME (TYPE *__restrict r, TYPE *__restrict a, \ ++ TYPE *__restrict b, TYPE *__restrict c, int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ r[i] = a[i] > 20 ? 
b[i] OP c[i] : 0; \ ++ } ++ ++#define TEST_TYPE(T, TYPE) \ ++ T (TYPE, shl, <<) \ ++ T (TYPE, shr, >>) ++ ++#define TEST_ALL(T) \ ++ TEST_TYPE (T, int32_t) \ ++ TEST_TYPE (T, uint32_t) \ ++ TEST_TYPE (T, int64_t) \ ++ TEST_TYPE (T, uint64_t) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tlslr?\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ ++/* { dg-final { scan-assembler-times {\tlslr?\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ ++ ++/* { dg-final { scan-assembler-times {\tasrr?\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tasrr?\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tlsrr?\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tlsrr?\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/z, z[0-9]+\.s\n} 4 } } */ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/z, z[0-9]+\.d\n} 4 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_9_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_9_run.c +new file mode 100644 +index 000000000..6e41ac4da +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_9_run.c +@@ -0,0 +1,28 @@ ++/* { dg-do run { target { aarch64_sve_hw } } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_shift_9.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(TYPE, NAME, OP) \ ++ { \ ++ TYPE r[N], a[N], b[N], c[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a[i] = (i & 1 ? i : 3 * i); \ ++ b[i] = (i >> 4) << (i & 15); \ ++ c[i] = ~i & 7; \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##TYPE##_##NAME (r, a, b, c, N); \ ++ for (int i = 0; i < N; ++i) \ ++ if (r[i] != (TYPE) (a[i] > 20 ? b[i] OP c[i] : 0)) \ ++ __builtin_abort (); \ ++ } ++ ++int main () ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_1.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_1.c +new file mode 100644 +index 000000000..2b5f9c345 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_1.c +@@ -0,0 +1,59 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define abs(A) ((A) < 0 ? -(A) : (A)) ++#define neg(A) (-(A)) ++ ++#define DEF_LOOP(TYPE, OP) \ ++ void __attribute__ ((noipa)) \ ++ test_##TYPE##_##OP (TYPE *__restrict r, TYPE *__restrict a, \ ++ TYPE *__restrict pred, int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ r[i] = pred[i] ? 
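The cond_shift_* tests above are generated entirely by the DEF_LOOP/TEST_TYPE/TEST_ALL macros, so no loop body ever appears in expanded form. As a minimal sketch, one instantiation from cond_shift_9.c, DEF_LOOP applied to int32_t with the shl/<< pair, hand-expands to roughly the following (the test_<TYPE>_<NAME> name comes from the macro; the expansion is shown for illustration only):

    #include <stdint.h>

    void __attribute__ ((noipa))
    test_int32_t_shl (int32_t *__restrict r, int32_t *__restrict a,
                      int32_t *__restrict b, int32_t *__restrict c, int n)
    {
      /* Conditional shift with a zero "else" value.  */
      for (int i = 0; i < n; ++i)
        r[i] = a[i] > 20 ? b[i] << c[i] : 0;
    }

Per the scan-assembler directives in that file, the expectation is a predicated LSL (or its reversed LSLR form) together with a zeroing MOVPRFX, with no separate SEL and no vector-to-vector MOV.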
OP (a[i]) : a[i]; \ ++ } ++ ++#define TEST_INT_TYPE(T, TYPE) \ ++ T (TYPE, abs) \ ++ T (TYPE, neg) ++ ++#define TEST_FLOAT_TYPE(T, TYPE, SUFFIX) \ ++ T (TYPE, __builtin_fabs##SUFFIX) \ ++ T (TYPE, neg) ++ ++#define TEST_ALL(T) \ ++ TEST_INT_TYPE (T, int8_t) \ ++ TEST_INT_TYPE (T, int16_t) \ ++ TEST_INT_TYPE (T, int32_t) \ ++ TEST_INT_TYPE (T, int64_t) \ ++ TEST_FLOAT_TYPE (T, _Float16, f16) \ ++ TEST_FLOAT_TYPE (T, float, f) \ ++ TEST_FLOAT_TYPE (T, double, ) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tabs\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tabs\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tabs\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tabs\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tneg\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tneg\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tneg\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tneg\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfabs\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfabs\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfabs\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfneg\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfneg\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfneg\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz} } } */ ++/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ ++/* XFAILed because the ?: gets canonicalized so that the operation is in ++ the false arm. */ ++/* { dg-final { scan-assembler-not {\tsel\t} { xfail *-*-* } } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_1_run.c +new file mode 100644 +index 000000000..a6c1a49dd +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_1_run.c +@@ -0,0 +1,27 @@ ++/* { dg-do run { target { aarch64_sve_hw } } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_unary_1.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(TYPE, OP) \ ++ { \ ++ TYPE r[N], a[N], pred[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a[i] = (i & 1 ? i : 3 * i) * (i % 3 == 0 ? 1 : -1); \ ++ pred[i] = (i % 7 < 4); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##TYPE##_##OP (r, a, pred, N); \ ++ for (int i = 0; i < N; ++i) \ ++ if (r[i] != (pred[i] ? OP (a[i]) : a[i])) \ ++ __builtin_abort (); \ ++ } ++ ++int main () ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_2.c +new file mode 100644 +index 000000000..97d1b8f5d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_2.c +@@ -0,0 +1,61 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define abs(A) ((A) < 0 ? -(A) : (A)) ++#define neg(A) (-(A)) ++ ++#define DEF_LOOP(TYPE, OP) \ ++ void __attribute__ ((noipa)) \ ++ test_##TYPE##_##OP (TYPE *__restrict r, TYPE *__restrict a, \ ++ TYPE *__restrict b, \ ++ TYPE *__restrict pred, int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ { \ ++ TYPE bi = b[i]; \ ++ r[i] = pred[i] ? 
OP (a[i]) : bi; \ ++ } \ ++ } ++ ++#define TEST_INT_TYPE(T, TYPE) \ ++ T (TYPE, abs) \ ++ T (TYPE, neg) ++ ++#define TEST_FLOAT_TYPE(T, TYPE, SUFFIX) \ ++ T (TYPE, __builtin_fabs##SUFFIX) \ ++ T (TYPE, neg) ++ ++#define TEST_ALL(T) \ ++ TEST_INT_TYPE (T, int8_t) \ ++ TEST_INT_TYPE (T, int16_t) \ ++ TEST_INT_TYPE (T, int32_t) \ ++ TEST_INT_TYPE (T, int64_t) \ ++ TEST_FLOAT_TYPE (T, _Float16, f16) \ ++ TEST_FLOAT_TYPE (T, float, f) \ ++ TEST_FLOAT_TYPE (T, double, ) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tabs\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tabs\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tabs\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tabs\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tneg\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tneg\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tneg\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tneg\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfabs\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfabs\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfabs\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfneg\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfneg\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfneg\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz} } } */ ++/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_2_run.c +new file mode 100644 +index 000000000..1a385c323 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_2_run.c +@@ -0,0 +1,28 @@ ++/* { dg-do run { target { aarch64_sve_hw } } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_unary_2.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(TYPE, OP) \ ++ { \ ++ TYPE r[N], a[N], b[N], pred[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a[i] = (i & 1 ? i : 3 * i) * (i % 3 == 0 ? 1 : -1); \ ++ b[i] = (i % 9) * (i % 7 + 1); \ ++ pred[i] = (i % 7 < 4); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##TYPE##_##OP (r, a, b, pred, N); \ ++ for (int i = 0; i < N; ++i) \ ++ if (r[i] != (pred[i] ? OP (a[i]) : b[i])) \ ++ __builtin_abort (); \ ++ } ++ ++int main () ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_3.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_3.c +new file mode 100644 +index 000000000..dde0fdd92 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_3.c +@@ -0,0 +1,58 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define abs(A) ((A) < 0 ? -(A) : (A)) ++#define neg(A) (-(A)) ++ ++#define DEF_LOOP(TYPE, OP) \ ++ void __attribute__ ((noipa)) \ ++ test_##TYPE##_##OP (TYPE *__restrict r, TYPE *__restrict a, \ ++ TYPE *__restrict pred, int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ r[i] = pred[i] ? 
OP (a[i]) : 5; \ ++ } ++ ++#define TEST_INT_TYPE(T, TYPE) \ ++ T (TYPE, abs) \ ++ T (TYPE, neg) ++ ++#define TEST_FLOAT_TYPE(T, TYPE, SUFFIX) \ ++ T (TYPE, __builtin_fabs##SUFFIX) \ ++ T (TYPE, neg) ++ ++#define TEST_ALL(T) \ ++ TEST_INT_TYPE (T, int8_t) \ ++ TEST_INT_TYPE (T, int16_t) \ ++ TEST_INT_TYPE (T, int32_t) \ ++ TEST_INT_TYPE (T, int64_t) \ ++ TEST_FLOAT_TYPE (T, _Float16, f16) \ ++ TEST_FLOAT_TYPE (T, float, f) \ ++ TEST_FLOAT_TYPE (T, double, ) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tabs\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tabs\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tabs\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tabs\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tneg\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tneg\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tneg\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tneg\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfabs\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfabs\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfabs\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfneg\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfneg\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfneg\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+, z[0-9]+\n} 14 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz[^\n]*z} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_3_run.c +new file mode 100644 +index 000000000..3c72b239a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_3_run.c +@@ -0,0 +1,27 @@ ++/* { dg-do run { target { aarch64_sve_hw } } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_unary_3.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(TYPE, OP) \ ++ { \ ++ TYPE r[N], a[N], pred[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a[i] = (i & 1 ? i : 3 * i) * (i % 3 == 0 ? 1 : -1); \ ++ pred[i] = (i % 7 < 4); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##TYPE##_##OP (r, a, pred, N); \ ++ for (int i = 0; i < N; ++i) \ ++ if (r[i] != (pred[i] ? OP (a[i]) : 5)) \ ++ __builtin_abort (); \ ++ } ++ ++int main () ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_4.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_4.c +new file mode 100644 +index 000000000..4604365fb +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_4.c +@@ -0,0 +1,62 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define abs(A) ((A) < 0 ? -(A) : (A)) ++#define neg(A) (-(A)) ++ ++#define DEF_LOOP(TYPE, OP) \ ++ void __attribute__ ((noipa)) \ ++ test_##TYPE##_##OP (TYPE *__restrict r, TYPE *__restrict a, \ ++ TYPE *__restrict pred, int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ r[i] = pred[i] ? 
OP (a[i]) : 0; \ ++ } ++ ++#define TEST_INT_TYPE(T, TYPE) \ ++ T (TYPE, abs) \ ++ T (TYPE, neg) ++ ++#define TEST_FLOAT_TYPE(T, TYPE, SUFFIX) \ ++ T (TYPE, __builtin_fabs##SUFFIX) \ ++ T (TYPE, neg) ++ ++#define TEST_ALL(T) \ ++ TEST_INT_TYPE (T, int8_t) \ ++ TEST_INT_TYPE (T, int16_t) \ ++ TEST_INT_TYPE (T, int32_t) \ ++ TEST_INT_TYPE (T, int64_t) \ ++ TEST_FLOAT_TYPE (T, _Float16, f16) \ ++ TEST_FLOAT_TYPE (T, float, f) \ ++ TEST_FLOAT_TYPE (T, double, ) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tabs\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tabs\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tabs\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tabs\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tneg\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tneg\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tneg\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tneg\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfabs\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfabs\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfabs\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfneg\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfneg\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfneg\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ ++ ++/* Really we should be able to use MOVPRFX /z here, but at the moment ++ we're relying on combine to merge a SEL and an arithmetic operation, ++ and the SEL doesn't allow the "false" value to be zero when the "true" ++ value is a register. */ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+, z[0-9]+\n} 14 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz[^\n]*z} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_4_run.c +new file mode 100644 +index 000000000..48d254150 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_4_run.c +@@ -0,0 +1,27 @@ ++/* { dg-do run { target { aarch64_sve_hw } } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_unary_4.c" ++ ++#define N 99 ++ ++#define TEST_LOOP(TYPE, OP) \ ++ { \ ++ TYPE r[N], a[N], pred[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a[i] = (i & 1 ? i : 3 * i) * (i % 3 == 0 ? 1 : -1); \ ++ pred[i] = (i % 7 < 4); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##TYPE##_##OP (r, a, pred, N); \ ++ for (int i = 0; i < N; ++i) \ ++ if (r[i] != (pred[i] ? 
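The cond_unary_* tests follow the same macro scheme with unary abs/neg (and fabs/fneg for the floating-point types). A minimal hand-expansion of one case from cond_unary_4.c, DEF_LOOP applied to int32_t with neg, where neg(A) is defined as (-(A)), is roughly:

    #include <stdint.h>

    void __attribute__ ((noipa))
    test_int32_t_neg (int32_t *__restrict r, int32_t *__restrict a,
                      int32_t *__restrict pred, int n)
    {
      /* Conditional negation with a zero "else" value.  */
      for (int i = 0; i < n; ++i)
        r[i] = pred[i] ? -(a[i]) : 0;
    }

As the comment in cond_unary_4.c notes, this "false value is zero" case currently goes through an unpredicated MOVPRFX rather than MOVPRFX /z, because combine is relied on to merge a SEL with the arithmetic operation. The expected MOVPRFX count of 14 matches the fourteen DEF_LOOP instantiations (four integer types and three floating-point types, each with two operations).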
OP (a[i]) : 0)) \ ++ __builtin_abort (); \ ++ } ++ ++int main () ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_1.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_1.c +new file mode 100644 +index 000000000..05641199e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_1.c +@@ -0,0 +1,40 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define NUM_ELEMS(TYPE) (320 / sizeof (TYPE)) ++ ++#define DEF_LOOP(TYPE, CONST) \ ++ void __attribute__ ((noipa)) \ ++ test_##CONST##_##TYPE (TYPE *restrict r, TYPE *restrict a, \ ++ TYPE *restrict b) \ ++ { \ ++ for (int i = 0; i < NUM_ELEMS (TYPE); ++i) \ ++ r[i] = a[i] > 20 ? b[i] & CONST : b[i]; \ ++ } ++ ++#define TEST_ALL(T) \ ++ T (uint16_t, 0xff) \ ++ \ ++ T (uint32_t, 0xff) \ ++ T (uint32_t, 0xffff) \ ++ \ ++ T (uint64_t, 0xff) \ ++ T (uint64_t, 0xffff) \ ++ T (uint64_t, 0xffffffff) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler {\tld1h\t(z[0-9]+\.h), p[0-7]/z, \[x2,[^L]*\tuxtb\t\1, p[0-7]/m, \1\n} } } */ ++ ++/* { dg-final { scan-assembler {\tld1w\t(z[0-9]+\.s), p[0-7]/z, \[x2,[^L]*\tuxtb\t\1, p[0-7]/m, \1\n} } } */ ++/* { dg-final { scan-assembler {\tld1w\t(z[0-9]+\.s), p[0-7]/z, \[x2,[^L]*\tuxth\t\1, p[0-7]/m, \1\n} } } */ ++ ++/* { dg-final { scan-assembler {\tld1d\t(z[0-9]+\.d), p[0-7]/z, \[x2,[^L]*\tuxtb\t\1, p[0-7]/m, \1\n} } } */ ++/* { dg-final { scan-assembler {\tld1d\t(z[0-9]+\.d), p[0-7]/z, \[x2,[^L]*\tuxth\t\1, p[0-7]/m, \1\n} } } */ ++/* { dg-final { scan-assembler {\tld1d\t(z[0-9]+\.d), p[0-7]/z, \[x2,[^L]*\tuxtw\t\1, p[0-7]/m, \1\n} } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz} } } */ ++/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_1_run.c +new file mode 100644 +index 000000000..685f39478 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_1_run.c +@@ -0,0 +1,27 @@ ++/* { dg-do run { target { aarch64_sve_hw } } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_uxt_1.c" ++ ++#define TEST_LOOP(TYPE, CONST) \ ++ { \ ++ TYPE r[NUM_ELEMS (TYPE)]; \ ++ TYPE a[NUM_ELEMS (TYPE)]; \ ++ TYPE b[NUM_ELEMS (TYPE)]; \ ++ for (int i = 0; i < NUM_ELEMS (TYPE); ++i) \ ++ { \ ++ a[i] = (i & 1 ? i : 3 * i); \ ++ b[i] = (i >> 4) << (i & 15); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##CONST##_##TYPE (r, a, b); \ ++ for (int i = 0; i < NUM_ELEMS (TYPE); ++i) \ ++ if (r[i] != (a[i] > 20 ? b[i] & CONST : b[i])) \ ++ __builtin_abort (); \ ++ } ++ ++int main () ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_2.c +new file mode 100644 +index 000000000..c900498a0 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_2.c +@@ -0,0 +1,40 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define NUM_ELEMS(TYPE) (320 / sizeof (TYPE)) ++ ++#define DEF_LOOP(TYPE, CONST) \ ++ void __attribute__ ((noipa)) \ ++ test_##CONST##_##TYPE (TYPE *restrict r, TYPE *restrict a, \ ++ TYPE *restrict b) \ ++ { \ ++ for (int i = 0; i < NUM_ELEMS (TYPE); ++i) \ ++ r[i] = a[i] > 20 ? 
b[i] & CONST : a[i]; \ ++ } ++ ++#define TEST_ALL(T) \ ++ T (uint16_t, 0xff) \ ++ \ ++ T (uint32_t, 0xff) \ ++ T (uint32_t, 0xffff) \ ++ \ ++ T (uint64_t, 0xff) \ ++ T (uint64_t, 0xffff) \ ++ T (uint64_t, 0xffffffff) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler {\tld1h\t(z[0-9]+\.h), p[0-7]/z, \[x1,[^L]*\tld1h\t(z[0-9]+\.h), p[0-7]/z, \[x2,[^L]*\tuxtb\t\1, p[0-7]/m, \2\n} } } */ ++ ++/* { dg-final { scan-assembler {\tld1w\t(z[0-9]+\.s), p[0-7]/z, \[x1,[^L]*\tld1w\t(z[0-9]+\.s), p[0-7]/z, \[x2,[^L]*\tuxtb\t\1, p[0-7]/m, \2\n} } } */ ++/* { dg-final { scan-assembler {\tld1w\t(z[0-9]+\.s), p[0-7]/z, \[x1,[^L]*\tld1w\t(z[0-9]+\.s), p[0-7]/z, \[x2,[^L]*\tuxth\t\1, p[0-7]/m, \2\n} } } */ ++ ++/* { dg-final { scan-assembler {\tld1d\t(z[0-9]+\.d), p[0-7]/z, \[x1,[^L]*\tld1d\t(z[0-9]+\.d), p[0-7]/z, \[x2,[^L]*\tuxtb\t\1, p[0-7]/m, \2\n} } } */ ++/* { dg-final { scan-assembler {\tld1d\t(z[0-9]+\.d), p[0-7]/z, \[x1,[^L]*\tld1d\t(z[0-9]+\.d), p[0-7]/z, \[x2,[^L]*\tuxth\t\1, p[0-7]/m, \2\n} } } */ ++/* { dg-final { scan-assembler {\tld1d\t(z[0-9]+\.d), p[0-7]/z, \[x1,[^L]*\tld1d\t(z[0-9]+\.d), p[0-7]/z, \[x2,[^L]*\tuxtw\t\1, p[0-7]/m, \2\n} } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz} } } */ ++/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_2_run.c +new file mode 100644 +index 000000000..75679cdf9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_2_run.c +@@ -0,0 +1,27 @@ ++/* { dg-do run { target { aarch64_sve_hw } } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_uxt_2.c" ++ ++#define TEST_LOOP(TYPE, CONST) \ ++ { \ ++ TYPE r[NUM_ELEMS (TYPE)]; \ ++ TYPE a[NUM_ELEMS (TYPE)]; \ ++ TYPE b[NUM_ELEMS (TYPE)]; \ ++ for (int i = 0; i < NUM_ELEMS (TYPE); ++i) \ ++ { \ ++ a[i] = (i & 1 ? i : 3 * i); \ ++ b[i] = (i >> 4) << (i & 15); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##CONST##_##TYPE (r, a, b); \ ++ for (int i = 0; i < NUM_ELEMS (TYPE); ++i) \ ++ if (r[i] != (a[i] > 20 ? b[i] & CONST : a[i])) \ ++ __builtin_abort (); \ ++ } ++ ++int main () ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_3.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_3.c +new file mode 100644 +index 000000000..cf1fd0029 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_3.c +@@ -0,0 +1,39 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define NUM_ELEMS(TYPE) (320 / sizeof (TYPE)) ++ ++#define DEF_LOOP(TYPE, CONST) \ ++ void __attribute__ ((noipa)) \ ++ test_##CONST##_##TYPE (TYPE *restrict r, TYPE *restrict a, \ ++ TYPE *restrict b) \ ++ { \ ++ for (int i = 0; i < NUM_ELEMS (TYPE); ++i) \ ++ r[i] = a[i] > 20 ? 
b[i] & CONST : 127; \ ++ } ++ ++#define TEST_ALL(T) \ ++ T (uint16_t, 0xff) \ ++ \ ++ T (uint32_t, 0xff) \ ++ T (uint32_t, 0xffff) \ ++ \ ++ T (uint64_t, 0xff) \ ++ T (uint64_t, 0xffff) \ ++ T (uint64_t, 0xffffffff) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler {\tmovprfx\t(z[0-9]+), z[0-9]+\n\tuxtb\t\1\.h, p[0-7]/m, z[0-9]+\.h\n} } } */ ++ ++/* { dg-final { scan-assembler {\tmovprfx\t(z[0-9]+), z[0-9]+\n\tuxtb\t\1\.s, p[0-7]/m, z[0-9]+\.s\n} } } */ ++/* { dg-final { scan-assembler {\tmovprfx\t(z[0-9]+), z[0-9]+\n\tuxth\t\1\.s, p[0-7]/m, z[0-9]+\.s\n} } } */ ++ ++/* { dg-final { scan-assembler {\tmovprfx\t(z[0-9]+), z[0-9]+\n\tuxtb\t\1\.d, p[0-7]/m, z[0-9]+\.d\n} } } */ ++/* { dg-final { scan-assembler {\tmovprfx\t(z[0-9]+), z[0-9]+\n\tuxth\t\1\.d, p[0-7]/m, z[0-9]+\.d\n} } } */ ++/* { dg-final { scan-assembler {\tmovprfx\t(z[0-9]+), z[0-9]+\n\tuxtw\t\1\.d, p[0-7]/m, z[0-9]+\.d\n} } } */ ++ ++/* { dg-final { scan-assembler-not {\tmov\tz[^\n]*z} } } */ ++/* { dg-final { scan-assembler-not {\tsel\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_3_run.c +new file mode 100644 +index 000000000..3d33d3a39 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_3_run.c +@@ -0,0 +1,27 @@ ++/* { dg-do run { target { aarch64_sve_hw } } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_uxt_3.c" ++ ++#define TEST_LOOP(TYPE, CONST) \ ++ { \ ++ TYPE r[NUM_ELEMS (TYPE)]; \ ++ TYPE a[NUM_ELEMS (TYPE)]; \ ++ TYPE b[NUM_ELEMS (TYPE)]; \ ++ for (int i = 0; i < NUM_ELEMS (TYPE); ++i) \ ++ { \ ++ a[i] = (i & 1 ? i : 3 * i); \ ++ b[i] = (i >> 4) << (i & 15); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##CONST##_##TYPE (r, a, b); \ ++ for (int i = 0; i < NUM_ELEMS (TYPE); ++i) \ ++ if (r[i] != (a[i] > 20 ? b[i] & CONST : 127)) \ ++ __builtin_abort (); \ ++ } ++ ++int main () ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_4.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_4.c +new file mode 100644 +index 000000000..25c664780 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_4.c +@@ -0,0 +1,36 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define NUM_ELEMS(TYPE) (320 / sizeof (TYPE)) ++ ++#define DEF_LOOP(TYPE, CONST) \ ++ void __attribute__ ((noipa)) \ ++ test_##CONST##_##TYPE (TYPE *restrict r, TYPE *restrict a, \ ++ TYPE *restrict b) \ ++ { \ ++ for (int i = 0; i < NUM_ELEMS (TYPE); ++i) \ ++ r[i] = a[i] > 20 ? 
b[i] & CONST : 0; \ ++ } ++ ++#define TEST_ALL(T) \ ++ T (uint16_t, 0xff) \ ++ \ ++ T (uint32_t, 0xff) \ ++ T (uint32_t, 0xffff) \ ++ \ ++ T (uint64_t, 0xff) \ ++ T (uint64_t, 0xffff) \ ++ T (uint64_t, 0xffffffff) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler {\tmovprfx\t(z[0-9]+\.h), (p[0-7])/z, z[0-9]+\.h\n\tuxtb\t\1, \2/m, z[0-9]+\.h\n} } } */ ++ ++/* { dg-final { scan-assembler {\tmovprfx\t(z[0-9]+\.s), (p[0-7])/z, z[0-9]+\.s\n\tuxtb\t\1, \2/m, z[0-9]+\.s\n} } } */ ++/* { dg-final { scan-assembler {\tmovprfx\t(z[0-9]+\.s), (p[0-7])/z, z[0-9]+\.s\n\tuxth\t\1, \2/m, z[0-9]+\.s\n} } } */ ++ ++/* { dg-final { scan-assembler {\tmovprfx\t(z[0-9]+\.d), (p[0-7])/z, z[0-9]+\.d\n\tuxtb\t\1, \2/m, z[0-9]+\.d\n} } } */ ++/* { dg-final { scan-assembler {\tmovprfx\t(z[0-9]+\.d), (p[0-7])/z, z[0-9]+\.d\n\tuxth\t\1, \2/m, z[0-9]+\.d\n} } } */ ++/* { dg-final { scan-assembler {\tmovprfx\t(z[0-9]+\.d), (p[0-7])/z, z[0-9]+\.d\n\tuxtw\t\1, \2/m, z[0-9]+\.d\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_4_run.c +new file mode 100644 +index 000000000..f3c4374ba +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_4_run.c +@@ -0,0 +1,27 @@ ++/* { dg-do run { target { aarch64_sve_hw } } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "cond_uxt_4.c" ++ ++#define TEST_LOOP(TYPE, CONST) \ ++ { \ ++ TYPE r[NUM_ELEMS (TYPE)]; \ ++ TYPE a[NUM_ELEMS (TYPE)]; \ ++ TYPE b[NUM_ELEMS (TYPE)]; \ ++ for (int i = 0; i < NUM_ELEMS (TYPE); ++i) \ ++ { \ ++ a[i] = (i & 1 ? i : 3 * i); \ ++ b[i] = (i >> 4) << (i & 15); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##CONST##_##TYPE (r, a, b); \ ++ for (int i = 0; i < NUM_ELEMS (TYPE); ++i) \ ++ if (r[i] != (a[i] > 20 ? 
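The cond_uxt_* tests check that masking with an all-ones low-part constant (0xff, 0xffff, 0xffffffff) on a wider unsigned element is recognised as a predicated zero-extension (UXTB/UXTH/UXTW). A minimal hand-expansion of one case from cond_uxt_4.c, DEF_LOOP applied to uint64_t with CONST 0xff, is roughly (names follow the test_<CONST>_<TYPE> pattern; shown for illustration only):

    #include <stdint.h>

    #define NUM_ELEMS(TYPE) (320 / sizeof (TYPE))

    void __attribute__ ((noipa))
    test_0xff_uint64_t (uint64_t *restrict r, uint64_t *restrict a,
                        uint64_t *restrict b)
    {
      /* Masking with 0xff is equivalent to zero-extending from byte.  */
      for (int i = 0; i < NUM_ELEMS (uint64_t); ++i)
        r[i] = a[i] > 20 ? b[i] & 0xff : 0;
    }

For this "else 0" variant the directives expect the zero-extension to be paired with a MOVPRFX /z on the same governing predicate, rather than a SEL.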
b[i] & CONST : 0)) \ ++ __builtin_abort (); \ ++ } ++ ++int main () ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/const_1.c b/gcc/testsuite/gcc.target/aarch64/sve/const_1.c +new file mode 100644 +index 000000000..ae25dcb73 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/const_1.c +@@ -0,0 +1,13 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O3" } */ ++ ++#include ++ ++void ++set (uint64_t *dst, int count) ++{ ++ for (int i = 0; i < count; ++i) ++ dst[i] = 0xffff00ff00ffff00ULL; ++} ++ ++/* { dg-final { scan-assembler {\tmovi\tv([0-9]+)\.2d, 0xffff00ff00ffff00\n.*\tdup\tz[0-9]+\.q, z\1\.q\[0\]\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/const_2.c b/gcc/testsuite/gcc.target/aarch64/sve/const_2.c +new file mode 100644 +index 000000000..7b2b5c2a1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/const_2.c +@@ -0,0 +1,20 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O3" } */ ++ ++#include ++ ++#define TEST(TYPE, CONST) \ ++ void \ ++ set_##TYPE (TYPE *dst, int count) \ ++ { \ ++ for (int i = 0; i < count; ++i) \ ++ dst[i] = CONST; \ ++ } ++ ++TEST (uint16_t, 129) ++TEST (uint32_t, 129) ++TEST (uint64_t, 129) ++ ++/* { dg-final { scan-assembler {\tmovi\tv([0-9]+)\.8h, 0x81\n[^:]*\tdup\tz[0-9]+\.q, z\1\.q\[0\]\n} } } */ ++/* { dg-final { scan-assembler {\tmovi\tv([0-9]+)\.4s, 0x81\n[^:]*\tdup\tz[0-9]+\.q, z\1\.q\[0\]\n} } } */ ++/* { dg-final { scan-assembler {\tmov\t(x[0-9]+), 129\n[^:]*\tmov\tz[0-9]+\.d, \1\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/const_3.c b/gcc/testsuite/gcc.target/aarch64/sve/const_3.c +new file mode 100644 +index 000000000..c18ceaedc +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/const_3.c +@@ -0,0 +1,20 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O3" } */ ++ ++#include ++ ++#define TEST(TYPE, CONST) \ ++ void \ ++ set_##TYPE (TYPE *dst, int count) \ ++ { \ ++ for (int i = 0; i < count; ++i) \ ++ dst[i] = CONST; \ ++ } ++ ++TEST (uint16_t, 0x1234) ++TEST (uint32_t, 0x1234) ++TEST (uint64_t, 0x1234) ++ ++/* { dg-final { scan-assembler {\tmov\t(w[0-9]+), 4660\n[^:]*\tmov\tz[0-9]+\.h, \1\n} } } */ ++/* { dg-final { scan-assembler {\tmov\t(w[0-9]+), 4660\n[^:]*\tmov\tz[0-9]+\.s, \1\n} } } */ ++/* { dg-final { scan-assembler {\tmov\t(x[0-9]+), 4660\n[^:]*\tmov\tz[0-9]+\.d, \1\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/ext_2.c b/gcc/testsuite/gcc.target/aarch64/sve/ext_2.c +index 0fe7e4c28..5593b070c 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/ext_2.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/ext_2.c +@@ -14,5 +14,4 @@ foo (void) + asm volatile ("" :: "w" (x)); + } + +-/* { dg-final { scan-assembler {\tmov\tz0\.d, z1\.d\n} } } */ +-/* { dg-final { scan-assembler {\text\tz0\.b, z0\.b, z[01]\.b, #4\n} } } */ ++/* { dg-final { scan-assembler {\tmovprfx\tz0, z1\n\text\tz0\.b, z0\.b, z1\.b, #4\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/ext_3.c b/gcc/testsuite/gcc.target/aarch64/sve/ext_3.c +new file mode 100644 +index 000000000..83c04c856 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/ext_3.c +@@ -0,0 +1,17 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -msve-vector-bits=1024" } */ ++ ++typedef int vnx4si __attribute__((vector_size (128))); ++ ++void ++foo (void) ++{ ++ register int x asm ("z0"); ++ register vnx4si y asm ("z1"); ++ ++ asm volatile ("" : "=w" (y)); ++ x = y[21]; ++ asm volatile ("" :: "w" (x)); ++} ++ ++/* { dg-final { scan-assembler {\tmovprfx\tz0, z1\n\text\tz0\.b, z0\.b, z1\.b, 
#84\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fabd_1.c b/gcc/testsuite/gcc.target/aarch64/sve/fabd_1.c +new file mode 100644 +index 000000000..13ad83be2 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/fabd_1.c +@@ -0,0 +1,35 @@ ++/* { dg-do assemble { target aarch64_asm_sve_ok } } */ ++/* { dg-options "-O3 --save-temps" } */ ++ ++#define N 16 ++ ++typedef float *__restrict__ vnx4sf; ++typedef double *__restrict__ vnx2df; ++typedef _Float16 *__restrict__ vnx8hf_a; ++typedef __fp16 *__restrict__ vnx8hf_b; ++ ++extern float fabsf (float); ++extern double fabs (double); ++ ++#define FABD(type, abs, n) \ ++ void fabd_##type (type res, type a, type b) \ ++ { \ ++ int i; \ ++ for (i = 0; i < n; i++) \ ++ res[i] = abs (a[i] - b[i]); \ ++ } ++ ++#define TEST_SVE_F_MODES(FUNC) \ ++ FUNC (vnx2df, fabs, N) \ ++ FUNC (vnx4sf, fabsf, N) \ ++ FUNC (vnx8hf_a, fabsf, N) \ ++ FUNC (vnx8hf_b, fabsf, N) \ ++ ++TEST_SVE_F_MODES (FABD) ++ ++/* { dg-final { scan-assembler "fabd" } } */ ++/* { dg-final { scan-assembler-not "fsub" } } */ ++/* { dg-final { scan-assembler-not "fabs" } } */ ++/* { dg-final { scan-assembler-times {\tfabd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfabd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfabd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fadda_1.c b/gcc/testsuite/gcc.target/aarch64/sve/fadda_1.c +new file mode 100644 +index 000000000..158cd6c84 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/fadda_1.c +@@ -0,0 +1,20 @@ ++/* { dg-do assemble { target aarch64_asm_sve_ok } } */ ++/* { dg-options "-O2 -ftree-vectorize --save-temps" } */ ++ ++#define DO_OPS(TYPE) \ ++TYPE fold_##TYPE (TYPE *src, int count) \ ++{ \ ++ TYPE res = 0; \ ++ for (int i = 0; i < count; ++i) \ ++ res += src[i]; \ ++ return res; \ ++} ++ ++DO_OPS (_Float16) ++DO_OPS (float) ++DO_OPS (double) ++ ++/* { dg-final { scan-assembler-times {\tfadda\th[0-9]+, p[0-7], h[0-9]+, z[0-9]+\.h\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfadda\ts[0-9]+, p[0-7], s[0-9]+, z[0-9]+\.s\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfadda\td[0-9]+, p[0-7], d[0-9]+, z[0-9]+\.d\n} 1 } } */ ++/* { dg-final { scan-assembler-not "sel" } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fmaxnm_1.c b/gcc/testsuite/gcc.target/aarch64/sve/fmaxnm_1.c +new file mode 100644 +index 000000000..2f0d64bd4 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/fmaxnm_1.c +@@ -0,0 +1,45 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#ifndef FN ++#define FN(X) __builtin_fmax##X ++#endif ++ ++#define DEF_LOOP(FN, TYPE, NAME, CONST) \ ++ void __attribute__ ((noipa)) \ ++ test_##TYPE##_##NAME (TYPE *__restrict x, \ ++ TYPE *__restrict y, int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ x[i] = FN (y[i], CONST); \ ++ } ++ ++#define TEST_TYPE(T, FN, TYPE) \ ++ T (FN, TYPE, zero, 0) \ ++ T (FN, TYPE, one, 1) \ ++ T (FN, TYPE, two, 2) ++ ++#define TEST_ALL(T) \ ++ TEST_TYPE (T, FN (f16), _Float16) \ ++ TEST_TYPE (T, FN (f32), float) \ ++ TEST_TYPE (T, FN (f64), double) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.d, p[0-7]/m, 
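fadda_1.c above vectorizes a plain floating-point accumulation; DO_OPS(float) expands to roughly the following (fold_float is the name produced by the fold_<TYPE> pattern; shown for illustration only):

    float fold_float (float *src, int count)
    {
      /* A straightforward in-order sum; no -ffast-math is in effect.  */
      float res = 0;
      for (int i = 0; i < count; ++i)
        res += src[i];
      return res;
    }

Because the options do not include -ffast-math, the additions must be performed in source order, so the directives look for FADDA, SVE's strictly-ordered floating-point add reduction, once per element size rather than for an unordered tree reduction.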
z[0-9]+\.d, #0\.0\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1\.0\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.h, #2\.0} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #2\.0} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fminnm_1.c b/gcc/testsuite/gcc.target/aarch64/sve/fminnm_1.c +new file mode 100644 +index 000000000..547772e29 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/fminnm_1.c +@@ -0,0 +1,21 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#define FN(X) __builtin_fmin##X ++#include "fmaxnm_1.c" ++ ++/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.0\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1\.0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1\.0\n} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.h, #2\.0} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #2\.0} 1 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_1.c b/gcc/testsuite/gcc.target/aarch64/sve/init_1.c +new file mode 100644 +index 000000000..8e6004337 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/init_1.c +@@ -0,0 +1,22 @@ ++/* { dg-do assemble { target aarch64_asm_sve_ok } } */ ++/* { dg-options "-O -msve-vector-bits=256 --save-temps" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++/* Case 1.1: Trailing constants with stepped sequence. */ ++ ++#include ++ ++typedef int32_t vnx4si __attribute__((vector_size (32))); ++ ++/* ++** foo: ++** index (z[0-9]+\.s), #1, #1 ++** insr \1, w1 ++** insr \1, w0 ++** ... 
++*/ ++__attribute__((noipa)) ++vnx4si foo(int a, int b) ++{ ++ return (vnx4si) { a, b, 1, 2, 3, 4, 5, 6 }; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_10.c b/gcc/testsuite/gcc.target/aarch64/sve/init_10.c +new file mode 100644 +index 000000000..bee039415 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/init_10.c +@@ -0,0 +1,24 @@ ++/* { dg-do assemble { target aarch64_asm_sve_ok } } */ ++/* { dg-options "-O -msve-vector-bits=256 --save-temps" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++/* Case 5.4: Interleaved repeating elements and non-repeating elements. */ ++ ++#include ++ ++typedef int32_t vnx4si __attribute__((vector_size (32))); ++ ++/* ++** foo: ++** mov (z[0-9]+\.s), w3 ++** mov (z[0-9]+\.s), w2 ++** insr \2, w1 ++** insr \2, w0 ++** zip1 \2, \2, \1 ++** ... ++*/ ++__attribute__((noipa)) ++vnx4si foo(int a, int b, int c, int f) ++{ ++ return (vnx4si) { a, f, b, f, c, f, c, f }; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_10_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_10_run.c +new file mode 100644 +index 000000000..9a6d8650e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/init_10_run.c +@@ -0,0 +1,21 @@ ++/* { dg-do run { target aarch64_sve256_hw } } */ ++/* { dg-options "-O2 -msve-vector-bits=256" } */ ++ ++#include "init_10.c" ++ ++int main() ++{ ++ int a = 10; ++ int b = 11; ++ int c = 12; ++ int f = 13; ++ ++ vnx4si v = foo (a, b, c, f); ++ int expected[] = { a, f, b, f, c, f, c, f }; ++ ++ for (int i = 0; i < 8; i++) ++ if (v[i] != expected[i]) ++ __builtin_abort (); ++ ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_11.c b/gcc/testsuite/gcc.target/aarch64/sve/init_11.c +new file mode 100644 +index 000000000..8a9496f34 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/init_11.c +@@ -0,0 +1,23 @@ ++/* { dg-do assemble { target aarch64_asm_sve_ok } } */ ++/* { dg-options "-O -msve-vector-bits=256 --save-temps" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++/* Case 5.5: Interleaved repeating elements and trailing same elements. */ ++ ++#include ++ ++typedef int32_t vnx4si __attribute__((vector_size (32))); ++ ++/* ++** foo: ++** mov (z[0-9]+\.s), w1 ++** insr \1, w0 ++** mov (z[0-9]+\.s), w2 ++** zip1 \1, \1, \2 ++** ... 
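In the expected function bodies of the init_* tests, vector elements are written with lane 0 first. For init_1.c above, the INDEX/INSR sequence builds the initializer back to front: INSR shifts the existing lanes up by one and writes the scalar into lane 0. Assuming the register allocator picks z0 (the tests only capture the register as \1), the sequence is roughly:

    index  z0.s, #1, #1    ->  z0 = { 1, 2, 3, 4, 5, 6, 7, 8 }
    insr   z0.s, w1        ->  z0 = { b, 1, 2, 3, 4, 5, 6, 7 }
    insr   z0.s, w0        ->  z0 = { a, b, 1, 2, 3, 4, 5, 6 }

which is exactly the { a, b, 1, 2, 3, 4, 5, 6 } constructor returned by foo.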
++*/ ++__attribute__((noipa)) ++vnx4si foo(int a, int b, int f) ++{ ++ return (vnx4si) { a, f, b, f, b, f, b, f }; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_11_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_11_run.c +new file mode 100644 +index 000000000..437155581 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/init_11_run.c +@@ -0,0 +1,20 @@ ++/* { dg-do run { target aarch64_sve256_hw } } */ ++/* { dg-options "-O2 -msve-vector-bits=256" } */ ++ ++#include "init_11.c" ++ ++int main() ++{ ++ int a = 10; ++ int b = 11; ++ int f = 12; ++ ++ vnx4si v = foo (a, b, f); ++ int expected[] = { a, f, b, f, b, f, b, f }; ++ ++ for (int i = 0; i < 8; i++) ++ if (v[i] != expected[i]) ++ __builtin_abort (); ++ ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_12.c b/gcc/testsuite/gcc.target/aarch64/sve/init_12.c +new file mode 100644 +index 000000000..bc698ddd3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/init_12.c +@@ -0,0 +1,26 @@ ++/* { dg-do assemble { target aarch64_asm_sve_ok } } */ ++/* { dg-options "-O -msve-vector-bits=256 --save-temps" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++/* Case 5.5: Interleaved repeating elements and trailing same elements. */ ++ ++#include ++ ++typedef int32_t vnx4si __attribute__((vector_size (32))); ++ ++/* ++** foo: ++** fmov (s[0-9]+), w1 ++** mov (z[0-9]+\.s), w2 ++** mov (z[0-9]+\.s), w0 ++** insr \3, \1 ++** insr \3, \1 ++** insr \3, \1 ++** zip1 \3, \3, \2 ++** ... ++*/ ++__attribute__((noipa)) ++vnx4si foo(int a, int b, int f) ++{ ++ return (vnx4si) { b, f, b, f, b, f, a, f }; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_12_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_12_run.c +new file mode 100644 +index 000000000..5ce7edb1e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/init_12_run.c +@@ -0,0 +1,20 @@ ++/* { dg-do run { target aarch64_sve256_hw } } */ ++/* { dg-options "-O2 -msve-vector-bits=256" } */ ++ ++#include "init_12.c" ++ ++int main() ++{ ++ int a = 10; ++ int b = 11; ++ int f = 12; ++ ++ vnx4si v = foo (a, b, f); ++ int expected[] = { b, f, b, f, b, f, a, f }; ++ ++ for (int i = 0; i < 8; i++) ++ if (v[i] != expected[i]) ++ __builtin_abort (); ++ ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_13.c b/gcc/testsuite/gcc.target/aarch64/sve/init_13.c +new file mode 100644 +index 000000000..eea417063 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/init_13.c +@@ -0,0 +1,17 @@ ++/* { dg-do assemble { target aarch64_asm_sve_ok } } */ ++/* { dg-options "-O -msve-vector-bits=256 --save-temps" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++typedef float vnx4sf __attribute__((vector_size (32))); ++ ++/* ++** foo: ++** mov (z[0-9]+\.s), s0 ++** insr \1, wzr ++** ... 
++*/ ++vnx4sf ++foo (float a) ++{ ++ return (vnx4sf) { 0.0f, a, a, a, a, a, a, a }; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_1_run.c +new file mode 100644 +index 000000000..824a5cbea +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/init_1_run.c +@@ -0,0 +1,19 @@ ++/* { dg-do run { target aarch64_sve256_hw } } */ ++/* { dg-options "-O2 -msve-vector-bits=256" } */ ++ ++#include "init_1.c" ++ ++int main() ++{ ++ int a = 10; ++ int b = 11; ++ ++ vnx4si v = foo (a, b); ++ int expected[] = { a, b, 1, 2, 3, 4, 5, 6 }; ++ ++ for (int i = 0; i < 8; i++) ++ if (v[i] != expected[i]) ++ __builtin_abort (); ++ ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_2.c b/gcc/testsuite/gcc.target/aarch64/sve/init_2.c +new file mode 100644 +index 000000000..0a8aa8dec +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/init_2.c +@@ -0,0 +1,23 @@ ++/* { dg-do assemble { target aarch64_asm_sve_ok } } */ ++/* { dg-options "-O -msve-vector-bits=256 --save-temps" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++/* Case 1.2: Trailing constants with repeating sequence. */ ++ ++#include ++ ++typedef int32_t vnx4si __attribute__((vector_size (32))); ++ ++/* ++** foo: ++** ... ++** ld1rd (z[0-9]+)\.d, p[0-9]+/z, \[x[0-9]+\] ++** insr \1\.s, w1 ++** insr \1\.s, w0 ++** ... ++*/ ++__attribute__((noipa)) ++vnx4si foo(int a, int b) ++{ ++ return (vnx4si) { a, b, 2, 3, 2, 3, 2, 3 }; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_2_run.c +new file mode 100644 +index 000000000..86c191c77 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/init_2_run.c +@@ -0,0 +1,19 @@ ++/* { dg-do run { target aarch64_sve256_hw } } */ ++/* { dg-options "-O2 -msve-vector-bits=256" } */ ++ ++#include "init_2.c" ++ ++int main() ++{ ++ int a = 10; ++ int b = 11; ++ ++ vnx4si v = foo (a, b); ++ int expected[] = { a, b, 2, 3, 2, 3, 2, 3 }; ++ ++ for (int i = 0; i < 8; i++) ++ if (v[i] != expected[i]) ++ __builtin_abort (); ++ ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_3.c b/gcc/testsuite/gcc.target/aarch64/sve/init_3.c +new file mode 100644 +index 000000000..4a418b633 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/init_3.c +@@ -0,0 +1,24 @@ ++/* { dg-do assemble { target aarch64_asm_sve_ok } } */ ++/* { dg-options "-O -msve-vector-bits=256 --save-temps" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++/* Case 2.1: Leading constants with stepped sequence. */ ++ ++#include ++ ++typedef int32_t vnx4si __attribute__((vector_size (32))); ++ ++/* ++** foo: ++** index (z[0-9]+\.s), #6, #-1 ++** insr \1, w0 ++** insr \1, w1 ++** rev \1, \1 ++** ... 
++*/ ++__attribute__((noipa)) ++vnx4si foo(int a, int b) ++{ ++ return (vnx4si) { 1, 2, 3, 4, 5, 6, a, b }; ++} ++ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_3_run.c +new file mode 100644 +index 000000000..ce4de6950 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/init_3_run.c +@@ -0,0 +1,19 @@ ++/* { dg-do run { target aarch64_sve256_hw } } */ ++/* { dg-options "-O2 -msve-vector-bits=256" } */ ++ ++#include "init_3.c" ++ ++int main() ++{ ++ int a = 10; ++ int b = 11; ++ ++ vnx4si v = foo (a, b); ++ int expected[] = { 1, 2, 3, 4, 5, 6, a, b }; ++ ++ for (int i = 0; i < 8; i++) ++ if (v[i] != expected[i]) ++ __builtin_abort (); ++ ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_4.c b/gcc/testsuite/gcc.target/aarch64/sve/init_4.c +new file mode 100644 +index 000000000..0fa99c151 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/init_4.c +@@ -0,0 +1,24 @@ ++/* { dg-do assemble { target aarch64_asm_sve_ok } } */ ++/* { dg-options "-O -msve-vector-bits=256 --save-temps" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++/* Case 2.2: Leading constants with stepped sequence. */ ++ ++#include ++ ++typedef int32_t vnx4si __attribute__((vector_size (32))); ++ ++/* ++** foo: ++** ... ++** ld1rd (z[0-9]+)\.d, p[0-9]+/z, \[x[0-9]+\] ++** insr \1\.s, w1 ++** insr \1\.s, w0 ++** rev \1\.s, \1\.s ++** ... ++*/ ++__attribute__((noipa)) ++vnx4si foo(int a, int b) ++{ ++ return (vnx4si) { 3, 2, 3, 2, 3, 2, b, a }; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_4_run.c +new file mode 100644 +index 000000000..defee421f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/init_4_run.c +@@ -0,0 +1,19 @@ ++/* { dg-do run { target aarch64_sve256_hw } } */ ++/* { dg-options "-O2 -msve-vector-bits=256" } */ ++ ++#include "init_4.c" ++ ++int main() ++{ ++ int a = 10; ++ int b = 11; ++ ++ vnx4si v = foo (a, b); ++ int expected[] = { 3, 2, 3, 2, 3, 2, b, a }; ++ ++ for (int i = 0; i < 8; i++) ++ if (v[i] != expected[i]) ++ __builtin_abort (); ++ ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_5.c b/gcc/testsuite/gcc.target/aarch64/sve/init_5.c +new file mode 100644 +index 000000000..794e265c3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/init_5.c +@@ -0,0 +1,22 @@ ++/* { dg-do assemble { target aarch64_asm_sve_ok } } */ ++/* { dg-options "-O -msve-vector-bits=256 --save-temps" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++/* Case 3: Trailing same element. */ ++ ++#include ++ ++typedef int32_t vnx4si __attribute__((vector_size (32))); ++ ++/* ++** foo: ++** mov (z[0-9]+\.s), w2 ++** insr \1, w1 ++** insr \1, w0 ++** ... 
++*/ ++__attribute__((noipa)) ++vnx4si foo(int a, int b, int c) ++{ ++ return (vnx4si) { a, b, c, c, c, c, c, c }; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_5_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_5_run.c +new file mode 100644 +index 000000000..ba91d6fec +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/init_5_run.c +@@ -0,0 +1,20 @@ ++/* { dg-do run { target aarch64_sve256_hw } } */ ++/* { dg-options "-O2 -msve-vector-bits=256" } */ ++ ++#include "init_5.c" ++ ++int main() ++{ ++ int a = 10; ++ int b = 11; ++ int c = 12; ++ ++ vnx4si v = foo (a, b, c); ++ int expected[] = { a, b, c, c, c, c, c, c }; ++ ++ for (int i = 0; i < 8; i++) ++ if (v[i] != expected[i]) ++ __builtin_abort (); ++ ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_6.c b/gcc/testsuite/gcc.target/aarch64/sve/init_6.c +new file mode 100644 +index 000000000..8443fc000 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/init_6.c +@@ -0,0 +1,23 @@ ++/* { dg-do assemble { target aarch64_asm_sve_ok } } */ ++/* { dg-options "-O -msve-vector-bits=256 --save-temps" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++/* Case 3: Trailing same element. */ ++ ++#include ++ ++typedef int32_t vnx4si __attribute__((vector_size (32))); ++ ++/* ++** foo: ++** mov (z[0-9]+\.s), w2 ++** insr \1, w1 ++** insr \1, w0 ++** rev \1, \1 ++** ... ++*/ ++__attribute__((noipa)) ++vnx4si foo(int a, int b, int c) ++{ ++ return (vnx4si) { c, c, c, c, c, c, b, a }; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_6_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_6_run.c +new file mode 100644 +index 000000000..802b28f98 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/init_6_run.c +@@ -0,0 +1,20 @@ ++/* { dg-do run { target aarch64_sve256_hw } } */ ++/* { dg-options "-O2 -msve-vector-bits=256" } */ ++ ++#include "init_6.c" ++ ++int main() ++{ ++ int a = 10; ++ int b = 11; ++ int c = 12; ++ ++ vnx4si v = foo (a, b, c); ++ int expected[] = { c, c, c, c, c, c, b, a }; ++ ++ for (int i = 0; i < 8; i++) ++ if (v[i] != expected[i]) ++ __builtin_abort (); ++ ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_7.c b/gcc/testsuite/gcc.target/aarch64/sve/init_7.c +new file mode 100644 +index 000000000..63dbbbe61 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/init_7.c +@@ -0,0 +1,27 @@ ++/* { dg-do assemble { target aarch64_asm_sve_ok } } */ ++/* { dg-options "-O -msve-vector-bits=256 --save-temps" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++/* Case 5.1: All elements. */ ++ ++#include ++ ++typedef int32_t vnx4si __attribute__((vector_size (32))); ++ ++/* ++** foo: ++** mov (z[0-9]+\.s), w7 ++** insr \1, w6 ++** insr \1, w5 ++** insr \1, w4 ++** insr \1, w3 ++** insr \1, w2 ++** insr \1, w1 ++** insr \1, w0 ++** ... 
++*/
++__attribute__((noipa))
++vnx4si foo(int a, int b, int c, int d, int e, int f, int g, int h)
++{
++  return (vnx4si) { a, b, c, d, e, f, g, h };
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_7_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_7_run.c
+new file mode 100644
+index 000000000..61fe28508
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/init_7_run.c
+@@ -0,0 +1,25 @@
++/* { dg-do run { target aarch64_sve256_hw } } */
++/* { dg-options "-O2 -msve-vector-bits=256" } */
++
++#include "init_7.c"
++
++int main()
++{
++  int a = 10;
++  int b = 11;
++  int c = 12;
++  int d = 13;
++  int e = 14;
++  int f = 15;
++  int g = 16;
++  int h = 17;
++
++  vnx4si v = foo (a, b, c, d, e, f, g, h);
++  int expected[] = { a, b, c, d, e, f, g, h };
++
++  for (int i = 0; i < 8; i++)
++    if (v[i] != expected[i])
++      __builtin_abort ();
++
++  return 0;
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_8.c b/gcc/testsuite/gcc.target/aarch64/sve/init_8.c
+new file mode 100644
+index 000000000..9c2456785
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/init_8.c
+@@ -0,0 +1,26 @@
++/* { dg-do assemble { target aarch64_asm_sve_ok } } */
++/* { dg-options "-O -msve-vector-bits=256 --save-temps" } */
++/* { dg-final { check-function-bodies "**" "" } } */
++
++/* Case 5.2: Interleaved elements and constants.  */
++
++#include <stdint.h>
++
++typedef int32_t vnx4si __attribute__((vector_size (32)));
++
++/*
++** foo:
++** ...
++** ld1w (z[0-9]+\.s), p[0-9]+/z, \[x[0-9]+\]
++** mov (z[0-9]+\.s), w3
++** insr \2, w2
++** insr \2, w1
++** insr \2, w0
++** zip1 \2, \2, \1
++** ...
++*/
++__attribute__((noipa))
++vnx4si foo(int a, int b, int c, int d)
++{
++  return (vnx4si) { a, 1, b, 2, c, 3, d, 4 };
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_8_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_8_run.c
+new file mode 100644
+index 000000000..24a0a6e06
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/init_8_run.c
+@@ -0,0 +1,21 @@
++/* { dg-do run { target aarch64_sve256_hw } } */
++/* { dg-options "-O2 -msve-vector-bits=256" } */
++
++#include "init_8.c"
++
++int main()
++{
++  int a = 10;
++  int b = 11;
++  int c = 12;
++  int d = 13;
++
++  vnx4si v = foo (a, b, c, d);
++  int expected[] = { a, 1, b, 2, c, 3, d, 4 };
++
++  for (int i = 0; i < 8; i++)
++    if (v[i] != expected[i])
++      __builtin_abort ();
++
++  return 0;
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_9.c b/gcc/testsuite/gcc.target/aarch64/sve/init_9.c
+new file mode 100644
+index 000000000..d22ab71e6
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/init_9.c
+@@ -0,0 +1,22 @@
++/* { dg-do assemble { target aarch64_asm_sve_ok } } */
++/* { dg-options "-O -msve-vector-bits=256 --save-temps" } */
++/* { dg-final { check-function-bodies "**" "" } } */
++
++/* Case 5.3: Repeated elements.  */
++
++#include <stdint.h>
++
++typedef int32_t vnx4si __attribute__((vector_size (32)));
++
++/*
++** foo:
++** mov (z[0-9]+\.s), w0
++** mov (z[0-9]+\.s), w1
++** zip1 \1, \1, \2
++** ...
++*/ ++__attribute__((noipa)) ++vnx4si foo(int a, int b) ++{ ++ return (vnx4si) { a, b, a, b, a, b, a, b }; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_9_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_9_run.c +new file mode 100644 +index 000000000..636ae3b8b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/init_9_run.c +@@ -0,0 +1,19 @@ ++/* { dg-do run { target aarch64_sve256_hw } } */ ++/* { dg-options "-O2 -msve-vector-bits=256" } */ ++ ++#include "init_9.c" ++ ++int main() ++{ ++ int a = 10; ++ int b = 11; ++ ++ vnx4si v = foo (a, b); ++ int expected[] = { a, b, a, b, a, b, a, b }; ++ ++ for (int i = 0; i < 8; i++) ++ if (v[i] != expected[i]) ++ __builtin_abort (); ++ ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/ld1r_2.c b/gcc/testsuite/gcc.target/aarch64/sve/ld1r_2.c +index 2e6b59ab4..e0e0f4ee6 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/ld1r_2.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/ld1r_2.c +@@ -28,22 +28,6 @@ + T (int64_t) + + #define FOR_EACH_LOAD_BROADCAST_IMM(T) \ +- T (int16_t, 129, imm_129) \ +- T (int32_t, 129, imm_129) \ +- T (int64_t, 129, imm_129) \ +- \ +- T (int16_t, -130, imm_m130) \ +- T (int32_t, -130, imm_m130) \ +- T (int64_t, -130, imm_m130) \ +- \ +- T (int16_t, 0x1234, imm_0x1234) \ +- T (int32_t, 0x1234, imm_0x1234) \ +- T (int64_t, 0x1234, imm_0x1234) \ +- \ +- T (int16_t, 0xFEDC, imm_0xFEDC) \ +- T (int32_t, 0xFEDC, imm_0xFEDC) \ +- T (int64_t, 0xFEDC, imm_0xFEDC) \ +- \ + T (int32_t, 0x12345678, imm_0x12345678) \ + T (int64_t, 0x12345678, imm_0x12345678) \ + \ +@@ -56,6 +40,6 @@ FOR_EACH_LOAD_BROADCAST (DEF_LOAD_BROADCAST) + FOR_EACH_LOAD_BROADCAST_IMM (DEF_LOAD_BROADCAST_IMM) + + /* { dg-final { scan-assembler-times {\tld1rb\tz[0-9]+\.b, p[0-7]/z, } 1 } } */ +-/* { dg-final { scan-assembler-times {\tld1rh\tz[0-9]+\.h, p[0-7]/z, } 5 } } */ +-/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, p[0-7]/z, } 7 } } */ +-/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, p[0-7]/z, } 8 } } */ ++/* { dg-final { scan-assembler-times {\tld1rh\tz[0-9]+\.h, p[0-7]/z, } 1 } } */ ++/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, p[0-7]/z, } 3 } } */ ++/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, p[0-7]/z, } 4 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/loop_add_4.c b/gcc/testsuite/gcc.target/aarch64/sve/loop_add_4.c +index 7f02497e8..9ead9c21b 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/loop_add_4.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/loop_add_4.c +@@ -68,7 +68,8 @@ TEST_ALL (LOOP) + /* { dg-final { scan-assembler-times {\tindex\tz[0-9]+\.s, w[0-9]+, w[0-9]+\n} 3 } } */ + /* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]+/z, \[x[0-9]+, x[0-9]+, lsl 2\]} 8 } } */ + /* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7]+, \[x[0-9]+, x[0-9]+, lsl 2\]} 8 } } */ +-/* { dg-final { scan-assembler-times {\tincw\tx[0-9]+\n} 8 } } */ ++/* 2 for the calculations of -17 and 17. 
*/ ++/* { dg-final { scan-assembler-times {\tincw\tx[0-9]+\n} 10 } } */ + + /* { dg-final { scan-assembler-times {\tdecw\tz[0-9]+\.s, all, mul #16\n} 1 } } */ + /* { dg-final { scan-assembler-times {\tdecw\tz[0-9]+\.s, all, mul #15\n} 1 } } */ +@@ -85,7 +86,8 @@ TEST_ALL (LOOP) + /* { dg-final { scan-assembler-times {\tindex\tz[0-9]+\.d, x[0-9]+, x[0-9]+\n} 3 } } */ + /* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]+/z, \[x[0-9]+, x[0-9]+, lsl 3\]} 8 } } */ + /* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7]+, \[x[0-9]+, x[0-9]+, lsl 3\]} 8 } } */ +-/* { dg-final { scan-assembler-times {\tincd\tx[0-9]+\n} 8 } } */ ++/* 2 for the calculations of -17 and 17. */ ++/* { dg-final { scan-assembler-times {\tincd\tx[0-9]+\n} 10 } } */ + + /* { dg-final { scan-assembler-times {\tdecd\tz[0-9]+\.d, all, mul #16\n} 1 } } */ + /* { dg-final { scan-assembler-times {\tdecd\tz[0-9]+\.d, all, mul #15\n} 1 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/aarch64-sve-pcs.exp b/gcc/testsuite/gcc.target/aarch64/sve/pcs/aarch64-sve-pcs.exp +new file mode 100644 +index 000000000..745887593 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/aarch64-sve-pcs.exp +@@ -0,0 +1,52 @@ ++# Specific regression driver for AArch64 SVE. ++# Copyright (C) 2009-2019 Free Software Foundation, Inc. ++# Contributed by ARM Ltd. ++# ++# This file is part of GCC. ++# ++# GCC is free software; you can redistribute it and/or modify it ++# under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 3, or (at your option) ++# any later version. ++# ++# GCC is distributed in the hope that it will be useful, but ++# WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++# General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with GCC; see the file COPYING3. If not see ++# . */ ++ ++# GCC testsuite that uses the `dg.exp' driver. ++ ++# Exit immediately if this isn't an AArch64 target. ++if {![istarget aarch64*-*-*] } then { ++ return ++} ++ ++# Load support procs. ++load_lib gcc-dg.exp ++ ++# If a testcase doesn't have special options, use these. ++global DEFAULT_CFLAGS ++if ![info exists DEFAULT_CFLAGS] then { ++ set DEFAULT_CFLAGS " -ansi -pedantic-errors" ++} ++ ++# Initialize `dg'. ++dg-init ++ ++# Force SVE if we're not testing it already. ++if { [check_effective_target_aarch64_sve] } { ++ set sve_flags "" ++} else { ++ set sve_flags "-march=armv8.2-a+sve" ++} ++ ++# Main loop. ++dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*.c]] \ ++ $sve_flags $DEFAULT_CFLAGS ++ ++# All done. 
++dg-finish +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_1.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_1.c +new file mode 100644 +index 000000000..12ae76789 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_1.c +@@ -0,0 +1,112 @@ ++/* { dg-do compile } */ ++ ++#include ++ ++svbool_t ret_b (void) { return svptrue_b8 (); } ++ ++svint8_t ret_s8 (void) { return svdup_s8 (0); } ++svint16_t ret_s16 (void) { return svdup_s16 (0); } ++svint32_t ret_s32 (void) { return svdup_s32 (0); } ++svint64_t ret_s64 (void) { return svdup_s64 (0); } ++svuint8_t ret_u8 (void) { return svdup_u8 (0); } ++svuint16_t ret_u16 (void) { return svdup_u16 (0); } ++svuint32_t ret_u32 (void) { return svdup_u32 (0); } ++svuint64_t ret_u64 (void) { return svdup_u64 (0); } ++svbfloat16_t ret_bf16 (void) { return svundef_bf16 (); } ++svfloat16_t ret_f16 (void) { return svdup_f16 (0); } ++svfloat32_t ret_f32 (void) { return svdup_f32 (0); } ++svfloat64_t ret_f64 (void) { return svdup_f64 (0); } ++ ++svint8x2_t ret_s8x2 (void) { return svundef2_s8 (); } ++svint16x2_t ret_s16x2 (void) { return svundef2_s16 (); } ++svint32x2_t ret_s32x2 (void) { return svundef2_s32 (); } ++svint64x2_t ret_s64x2 (void) { return svundef2_s64 (); } ++svuint8x2_t ret_u8x2 (void) { return svundef2_u8 (); } ++svuint16x2_t ret_u16x2 (void) { return svundef2_u16 (); } ++svuint32x2_t ret_u32x2 (void) { return svundef2_u32 (); } ++svuint64x2_t ret_u64x2 (void) { return svundef2_u64 (); } ++svbfloat16x2_t ret_bf16x2 (void) { return svundef2_bf16 (); } ++svfloat16x2_t ret_f16x2 (void) { return svundef2_f16 (); } ++svfloat32x2_t ret_f32x2 (void) { return svundef2_f32 (); } ++svfloat64x2_t ret_f64x2 (void) { return svundef2_f64 (); } ++ ++svint8x3_t ret_s8x3 (void) { return svundef3_s8 (); } ++svint16x3_t ret_s16x3 (void) { return svundef3_s16 (); } ++svint32x3_t ret_s32x3 (void) { return svundef3_s32 (); } ++svint64x3_t ret_s64x3 (void) { return svundef3_s64 (); } ++svuint8x3_t ret_u8x3 (void) { return svundef3_u8 (); } ++svuint16x3_t ret_u16x3 (void) { return svundef3_u16 (); } ++svuint32x3_t ret_u32x3 (void) { return svundef3_u32 (); } ++svuint64x3_t ret_u64x3 (void) { return svundef3_u64 (); } ++svbfloat16x3_t ret_bf16x3 (void) { return svundef3_bf16 (); } ++svfloat16x3_t ret_f16x3 (void) { return svundef3_f16 (); } ++svfloat32x3_t ret_f32x3 (void) { return svundef3_f32 (); } ++svfloat64x3_t ret_f64x3 (void) { return svundef3_f64 (); } ++ ++svint8x4_t ret_s8x4 (void) { return svundef4_s8 (); } ++svint16x4_t ret_s16x4 (void) { return svundef4_s16 (); } ++svint32x4_t ret_s32x4 (void) { return svundef4_s32 (); } ++svint64x4_t ret_s64x4 (void) { return svundef4_s64 (); } ++svuint8x4_t ret_u8x4 (void) { return svundef4_u8 (); } ++svuint16x4_t ret_u16x4 (void) { return svundef4_u16 (); } ++svuint32x4_t ret_u32x4 (void) { return svundef4_u32 (); } ++svuint64x4_t ret_u64x4 (void) { return svundef4_u64 (); } ++svbfloat16x4_t ret_bf16x4 (void) { return svundef4_bf16 (); } ++svfloat16x4_t ret_f16x4 (void) { return svundef4_f16 (); } ++svfloat32x4_t ret_f32x4 (void) { return svundef4_f32 (); } ++svfloat64x4_t ret_f64x4 (void) { return svundef4_f64 (); } ++ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_b\n} } } */ ++ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_s8\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_s16\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_s32\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_s64\n} } } */ ++/* { dg-final { 
scan-assembler {\t\.variant_pcs\tret_u8\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u16\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u32\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u64\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_bf16\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_f16\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_f32\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_f64\n} } } */ ++ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_s8x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_s16x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_s32x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_s64x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u8x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u16x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u32x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u64x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_bf16x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_f16x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_f32x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_f64x2\n} } } */ ++ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_s8x3\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_s16x3\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_s16x3\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_s32x3\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_s64x3\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u8x3\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u16x3\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u32x3\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u64x3\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_bf16x3\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_f16x3\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_f32x3\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_f64x3\n} } } */ ++ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_s8x4\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_s16x4\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_s32x4\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_s64x4\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u8x4\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u16x4\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u32x4\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u64x4\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_bf16x4\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_f16x4\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_f32x4\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tret_f64x4\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_2.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_2.c +new file mode 100644 +index 000000000..9f0741e3c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_2.c +@@ -0,0 +1,111 @@ ++/* { dg-do compile } */ ++ ++#include ++ ++void fn_b (svbool_t x) {} ++ ++void fn_s8 (svint8_t x) {} ++void fn_s16 
(svint16_t x) {} ++void fn_s32 (svint32_t x) {} ++void fn_s64 (svint64_t x) {} ++void fn_u8 (svuint8_t x) {} ++void fn_u16 (svuint16_t x) {} ++void fn_u32 (svuint32_t x) {} ++void fn_u64 (svuint64_t x) {} ++void fn_bf16 (svbfloat16_t x) {} ++void fn_f16 (svfloat16_t x) {} ++void fn_f32 (svfloat32_t x) {} ++void fn_f64 (svfloat64_t x) {} ++ ++void fn_s8x2 (svint8x2_t x) {} ++void fn_s16x2 (svint16x2_t x) {} ++void fn_s32x2 (svint32x2_t x) {} ++void fn_s64x2 (svint64x2_t x) {} ++void fn_u8x2 (svuint8x2_t x) {} ++void fn_u16x2 (svuint16x2_t x) {} ++void fn_u32x2 (svuint32x2_t x) {} ++void fn_u64x2 (svuint64x2_t x) {} ++void fn_bf16x2 (svbfloat16x2_t x) {} ++void fn_f16x2 (svfloat16x2_t x) {} ++void fn_f32x2 (svfloat32x2_t x) {} ++void fn_f64x2 (svfloat64x2_t x) {} ++ ++void fn_s8x3 (svint8x3_t x) {} ++void fn_s16x3 (svint16x3_t x) {} ++void fn_s32x3 (svint32x3_t x) {} ++void fn_s64x3 (svint64x3_t x) {} ++void fn_u8x3 (svuint8x3_t x) {} ++void fn_u16x3 (svuint16x3_t x) {} ++void fn_u32x3 (svuint32x3_t x) {} ++void fn_u64x3 (svuint64x3_t x) {} ++void fn_bf16x3 (svbfloat16x3_t x) {} ++void fn_f16x3 (svfloat16x3_t x) {} ++void fn_f32x3 (svfloat32x3_t x) {} ++void fn_f64x3 (svfloat64x3_t x) {} ++ ++void fn_s8x4 (svint8x4_t x) {} ++void fn_s16x4 (svint16x4_t x) {} ++void fn_s32x4 (svint32x4_t x) {} ++void fn_s64x4 (svint64x4_t x) {} ++void fn_u8x4 (svuint8x4_t x) {} ++void fn_u16x4 (svuint16x4_t x) {} ++void fn_u32x4 (svuint32x4_t x) {} ++void fn_u64x4 (svuint64x4_t x) {} ++void fn_bf16x4 (svbfloat16x4_t x) {} ++void fn_f16x4 (svfloat16x4_t x) {} ++void fn_f32x4 (svfloat32x4_t x) {} ++void fn_f64x4 (svfloat64x4_t x) {} ++ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_b\n} } } */ ++ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s8\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s16\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s32\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s64\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u8\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64\n} } } */ ++ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s8x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s16x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s32x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s64x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u8x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64x2\n} } } */ ++ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s8x3\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s16x3\n} } } */ ++/* { dg-final { scan-assembler 
{\t\.variant_pcs\tfn_s32x3\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s64x3\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u8x3\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16x3\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32x3\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64x3\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16x3\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16x3\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32x3\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64x3\n} } } */ ++ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s8x4\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s16x4\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s32x4\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s64x4\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u8x4\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16x4\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32x4\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64x4\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16x4\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16x4\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32x4\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64x4\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_3.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_3.c +new file mode 100644 +index 000000000..42e7860ff +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_3.c +@@ -0,0 +1,107 @@ ++/* { dg-do compile } */ ++ ++#include ++ ++void fn_s8 (float d0, float d1, float d2, float d3, svint8_t x) {} ++void fn_s16 (float d0, float d1, float d2, float d3, svint16_t x) {} ++void fn_s32 (float d0, float d1, float d2, float d3, svint32_t x) {} ++void fn_s64 (float d0, float d1, float d2, float d3, svint64_t x) {} ++void fn_u8 (float d0, float d1, float d2, float d3, svuint8_t x) {} ++void fn_u16 (float d0, float d1, float d2, float d3, svuint16_t x) {} ++void fn_u32 (float d0, float d1, float d2, float d3, svuint32_t x) {} ++void fn_u64 (float d0, float d1, float d2, float d3, svuint64_t x) {} ++void fn_bf16 (float d0, float d1, float d2, float d3, svbfloat16_t x) {} ++void fn_f16 (float d0, float d1, float d2, float d3, svfloat16_t x) {} ++void fn_f32 (float d0, float d1, float d2, float d3, svfloat32_t x) {} ++void fn_f64 (float d0, float d1, float d2, float d3, svfloat64_t x) {} ++ ++void fn_s8x2 (float d0, float d1, float d2, float d3, svint8x2_t x) {} ++void fn_s16x2 (float d0, float d1, float d2, float d3, svint16x2_t x) {} ++void fn_s32x2 (float d0, float d1, float d2, float d3, svint32x2_t x) {} ++void fn_s64x2 (float d0, float d1, float d2, float d3, svint64x2_t x) {} ++void fn_u8x2 (float d0, float d1, float d2, float d3, svuint8x2_t x) {} ++void fn_u16x2 (float d0, float d1, float d2, float d3, svuint16x2_t x) {} ++void fn_u32x2 (float d0, float d1, float d2, float d3, svuint32x2_t x) {} ++void fn_u64x2 (float d0, float d1, float d2, float d3, svuint64x2_t x) {} ++void fn_bf16x2 (float d0, float d1, float d2, float d3, svbfloat16x2_t x) {} ++void fn_f16x2 (float d0, float d1, float d2, float d3, svfloat16x2_t x) {} ++void fn_f32x2 (float d0, float d1, float d2, float d3, svfloat32x2_t x) {} ++void fn_f64x2 (float d0, float 
d1, float d2, float d3, svfloat64x2_t x) {} ++ ++void fn_s8x3 (float d0, float d1, float d2, float d3, svint8x3_t x) {} ++void fn_s16x3 (float d0, float d1, float d2, float d3, svint16x3_t x) {} ++void fn_s32x3 (float d0, float d1, float d2, float d3, svint32x3_t x) {} ++void fn_s64x3 (float d0, float d1, float d2, float d3, svint64x3_t x) {} ++void fn_u8x3 (float d0, float d1, float d2, float d3, svuint8x3_t x) {} ++void fn_u16x3 (float d0, float d1, float d2, float d3, svuint16x3_t x) {} ++void fn_u32x3 (float d0, float d1, float d2, float d3, svuint32x3_t x) {} ++void fn_u64x3 (float d0, float d1, float d2, float d3, svuint64x3_t x) {} ++void fn_bf16x3 (float d0, float d1, float d2, float d3, svbfloat16x3_t x) {} ++void fn_f16x3 (float d0, float d1, float d2, float d3, svfloat16x3_t x) {} ++void fn_f32x3 (float d0, float d1, float d2, float d3, svfloat32x3_t x) {} ++void fn_f64x3 (float d0, float d1, float d2, float d3, svfloat64x3_t x) {} ++ ++void fn_s8x4 (float d0, float d1, float d2, float d3, svint8x4_t x) {} ++void fn_s16x4 (float d0, float d1, float d2, float d3, svint16x4_t x) {} ++void fn_s32x4 (float d0, float d1, float d2, float d3, svint32x4_t x) {} ++void fn_s64x4 (float d0, float d1, float d2, float d3, svint64x4_t x) {} ++void fn_u8x4 (float d0, float d1, float d2, float d3, svuint8x4_t x) {} ++void fn_u16x4 (float d0, float d1, float d2, float d3, svuint16x4_t x) {} ++void fn_u32x4 (float d0, float d1, float d2, float d3, svuint32x4_t x) {} ++void fn_u64x4 (float d0, float d1, float d2, float d3, svuint64x4_t x) {} ++void fn_bf16x4 (float d0, float d1, float d2, float d3, svbfloat16x4_t x) {} ++void fn_f16x4 (float d0, float d1, float d2, float d3, svfloat16x4_t x) {} ++void fn_f32x4 (float d0, float d1, float d2, float d3, svfloat32x4_t x) {} ++void fn_f64x4 (float d0, float d1, float d2, float d3, svfloat64x4_t x) {} ++ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s8\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s16\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s32\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s64\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u8\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64\n} } } */ ++ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s8x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s16x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s32x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s64x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u8x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64x2\n} } } */ ++ ++/* { dg-final { 
scan-assembler {\t\.variant_pcs\tfn_s8x3\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s16x3\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s32x3\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s64x3\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u8x3\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16x3\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32x3\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64x3\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16x3\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16x3\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32x3\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64x3\n} } } */ ++ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s8x4\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s16x4\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s32x4\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s64x4\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u8x4\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16x4\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32x4\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64x4\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16x4\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16x4\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32x4\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64x4\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_4.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_4.c +new file mode 100644 +index 000000000..7e4438ed4 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_4.c +@@ -0,0 +1,155 @@ ++/* { dg-do compile } */ ++ ++#include ++ ++void fn_s8 (float d0, float d1, float d2, float d3, ++ float d4, svint8_t x) {} ++void fn_s16 (float d0, float d1, float d2, float d3, ++ float d4, svint16_t x) {} ++void fn_s32 (float d0, float d1, float d2, float d3, ++ float d4, svint32_t x) {} ++void fn_s64 (float d0, float d1, float d2, float d3, ++ float d4, svint64_t x) {} ++void fn_u8 (float d0, float d1, float d2, float d3, ++ float d4, svuint8_t x) {} ++void fn_u16 (float d0, float d1, float d2, float d3, ++ float d4, svuint16_t x) {} ++void fn_u32 (float d0, float d1, float d2, float d3, ++ float d4, svuint32_t x) {} ++void fn_u64 (float d0, float d1, float d2, float d3, ++ float d4, svuint64_t x) {} ++void fn_bf16 (float d0, float d1, float d2, float d3, ++ float d4, svbfloat16_t x) {} ++void fn_f16 (float d0, float d1, float d2, float d3, ++ float d4, svfloat16_t x) {} ++void fn_f32 (float d0, float d1, float d2, float d3, ++ float d4, svfloat32_t x) {} ++void fn_f64 (float d0, float d1, float d2, float d3, ++ float d4, svfloat64_t x) {} ++ ++void fn_s8x2 (float d0, float d1, float d2, float d3, ++ float d4, svint8x2_t x) {} ++void fn_s16x2 (float d0, float d1, float d2, float d3, ++ float d4, svint16x2_t x) {} ++void fn_s32x2 (float d0, float d1, float d2, float d3, ++ float d4, svint32x2_t x) {} ++void fn_s64x2 (float d0, float d1, float d2, float d3, ++ float d4, svint64x2_t x) {} ++void fn_u8x2 (float d0, float d1, float d2, float d3, ++ float d4, svuint8x2_t x) {} ++void fn_u16x2 (float d0, float d1, float d2, float d3, ++ float d4, svuint16x2_t x) {} ++void fn_u32x2 (float d0, float 
d1, float d2, float d3, ++ float d4, svuint32x2_t x) {} ++void fn_u64x2 (float d0, float d1, float d2, float d3, ++ float d4, svuint64x2_t x) {} ++void fn_bf16x2 (float d0, float d1, float d2, float d3, ++ float d4, svbfloat16x2_t x) {} ++void fn_f16x2 (float d0, float d1, float d2, float d3, ++ float d4, svfloat16x2_t x) {} ++void fn_f32x2 (float d0, float d1, float d2, float d3, ++ float d4, svfloat32x2_t x) {} ++void fn_f64x2 (float d0, float d1, float d2, float d3, ++ float d4, svfloat64x2_t x) {} ++ ++void fn_s8x3 (float d0, float d1, float d2, float d3, ++ float d4, svint8x3_t x) {} ++void fn_s16x3 (float d0, float d1, float d2, float d3, ++ float d4, svint16x3_t x) {} ++void fn_s32x3 (float d0, float d1, float d2, float d3, ++ float d4, svint32x3_t x) {} ++void fn_s64x3 (float d0, float d1, float d2, float d3, ++ float d4, svint64x3_t x) {} ++void fn_u8x3 (float d0, float d1, float d2, float d3, ++ float d4, svuint8x3_t x) {} ++void fn_u16x3 (float d0, float d1, float d2, float d3, ++ float d4, svuint16x3_t x) {} ++void fn_u32x3 (float d0, float d1, float d2, float d3, ++ float d4, svuint32x3_t x) {} ++void fn_u64x3 (float d0, float d1, float d2, float d3, ++ float d4, svuint64x3_t x) {} ++void fn_bf16x3 (float d0, float d1, float d2, float d3, ++ float d4, svbfloat16x3_t x) {} ++void fn_f16x3 (float d0, float d1, float d2, float d3, ++ float d4, svfloat16x3_t x) {} ++void fn_f32x3 (float d0, float d1, float d2, float d3, ++ float d4, svfloat32x3_t x) {} ++void fn_f64x3 (float d0, float d1, float d2, float d3, ++ float d4, svfloat64x3_t x) {} ++ ++void fn_s8x4 (float d0, float d1, float d2, float d3, ++ float d4, svint8x4_t x) {} ++void fn_s16x4 (float d0, float d1, float d2, float d3, ++ float d4, svint16x4_t x) {} ++void fn_s32x4 (float d0, float d1, float d2, float d3, ++ float d4, svint32x4_t x) {} ++void fn_s64x4 (float d0, float d1, float d2, float d3, ++ float d4, svint64x4_t x) {} ++void fn_u8x4 (float d0, float d1, float d2, float d3, ++ float d4, svuint8x4_t x) {} ++void fn_u16x4 (float d0, float d1, float d2, float d3, ++ float d4, svuint16x4_t x) {} ++void fn_u32x4 (float d0, float d1, float d2, float d3, ++ float d4, svuint32x4_t x) {} ++void fn_u64x4 (float d0, float d1, float d2, float d3, ++ float d4, svuint64x4_t x) {} ++void fn_bf16x4 (float d0, float d1, float d2, float d3, ++ float d4, svbfloat16x4_t x) {} ++void fn_f16x4 (float d0, float d1, float d2, float d3, ++ float d4, svfloat16x4_t x) {} ++void fn_f32x4 (float d0, float d1, float d2, float d3, ++ float d4, svfloat32x4_t x) {} ++void fn_f64x4 (float d0, float d1, float d2, float d3, ++ float d4, svfloat64x4_t x) {} ++ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s8\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s16\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s32\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s64\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u8\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64\n} } } */ ++ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s8x2\n} } } */ ++/* { dg-final { 
scan-assembler {\t\.variant_pcs\tfn_s16x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s32x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s64x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u8x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64x2\n} } } */ ++ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s8x3\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s16x3\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s32x3\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s64x3\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u8x3\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16x3\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32x3\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64x3\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16x3\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16x3\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32x3\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64x3\n} } } */ ++ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_s8x4\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_s16x4\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_s32x4\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_s64x4\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u8x4\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u16x4\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u32x4\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u64x4\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_bf16x4\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f16x4\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f32x4\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f64x4\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_5.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_5.c +new file mode 100644 +index 000000000..6dadc0492 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_5.c +@@ -0,0 +1,155 @@ ++/* { dg-do compile } */ ++ ++#include ++ ++void fn_s8 (float d0, float d1, float d2, float d3, ++ float d4, float d5, svint8_t x) {} ++void fn_s16 (float d0, float d1, float d2, float d3, ++ float d4, float d5, svint16_t x) {} ++void fn_s32 (float d0, float d1, float d2, float d3, ++ float d4, float d5, svint32_t x) {} ++void fn_s64 (float d0, float d1, float d2, float d3, ++ float d4, float d5, svint64_t x) {} ++void fn_u8 (float d0, float d1, float d2, float d3, ++ float d4, float d5, svuint8_t x) {} ++void fn_u16 (float d0, float d1, float d2, float d3, ++ float d4, float d5, svuint16_t x) {} ++void fn_u32 (float d0, float d1, float d2, float d3, ++ float d4, float d5, svuint32_t x) {} ++void fn_u64 (float d0, float d1, float d2, float d3, ++ float d4, float d5, svuint64_t x) {} ++void 
fn_bf16 (float d0, float d1, float d2, float d3, ++ float d4, float d5, svbfloat16_t x) {} ++void fn_f16 (float d0, float d1, float d2, float d3, ++ float d4, float d5, svfloat16_t x) {} ++void fn_f32 (float d0, float d1, float d2, float d3, ++ float d4, float d5, svfloat32_t x) {} ++void fn_f64 (float d0, float d1, float d2, float d3, ++ float d4, float d5, svfloat64_t x) {} ++ ++void fn_s8x2 (float d0, float d1, float d2, float d3, ++ float d4, float d5, svint8x2_t x) {} ++void fn_s16x2 (float d0, float d1, float d2, float d3, ++ float d4, float d5, svint16x2_t x) {} ++void fn_s32x2 (float d0, float d1, float d2, float d3, ++ float d4, float d5, svint32x2_t x) {} ++void fn_s64x2 (float d0, float d1, float d2, float d3, ++ float d4, float d5, svint64x2_t x) {} ++void fn_u8x2 (float d0, float d1, float d2, float d3, ++ float d4, float d5, svuint8x2_t x) {} ++void fn_u16x2 (float d0, float d1, float d2, float d3, ++ float d4, float d5, svuint16x2_t x) {} ++void fn_u32x2 (float d0, float d1, float d2, float d3, ++ float d4, float d5, svuint32x2_t x) {} ++void fn_u64x2 (float d0, float d1, float d2, float d3, ++ float d4, float d5, svuint64x2_t x) {} ++void fn_bf16x2 (float d0, float d1, float d2, float d3, ++ float d4, float d5, svbfloat16x2_t x) {} ++void fn_f16x2 (float d0, float d1, float d2, float d3, ++ float d4, float d5, svfloat16x2_t x) {} ++void fn_f32x2 (float d0, float d1, float d2, float d3, ++ float d4, float d5, svfloat32x2_t x) {} ++void fn_f64x2 (float d0, float d1, float d2, float d3, ++ float d4, float d5, svfloat64x2_t x) {} ++ ++void fn_s8x3 (float d0, float d1, float d2, float d3, ++ float d4, float d5, svint8x3_t x) {} ++void fn_s16x3 (float d0, float d1, float d2, float d3, ++ float d4, float d5, svint16x3_t x) {} ++void fn_s32x3 (float d0, float d1, float d2, float d3, ++ float d4, float d5, svint32x3_t x) {} ++void fn_s64x3 (float d0, float d1, float d2, float d3, ++ float d4, float d5, svint64x3_t x) {} ++void fn_u8x3 (float d0, float d1, float d2, float d3, ++ float d4, float d5, svuint8x3_t x) {} ++void fn_u16x3 (float d0, float d1, float d2, float d3, ++ float d4, float d5, svuint16x3_t x) {} ++void fn_u32x3 (float d0, float d1, float d2, float d3, ++ float d4, float d5, svuint32x3_t x) {} ++void fn_u64x3 (float d0, float d1, float d2, float d3, ++ float d4, float d5, svuint64x3_t x) {} ++void fn_bf16x3 (float d0, float d1, float d2, float d3, ++ float d4, float d5, svbfloat16x3_t x) {} ++void fn_f16x3 (float d0, float d1, float d2, float d3, ++ float d4, float d5, svfloat16x3_t x) {} ++void fn_f32x3 (float d0, float d1, float d2, float d3, ++ float d4, float d5, svfloat32x3_t x) {} ++void fn_f64x3 (float d0, float d1, float d2, float d3, ++ float d4, float d5, svfloat64x3_t x) {} ++ ++void fn_s8x4 (float d0, float d1, float d2, float d3, ++ float d4, float d5, svint8x4_t x) {} ++void fn_s16x4 (float d0, float d1, float d2, float d3, ++ float d4, float d5, svint16x4_t x) {} ++void fn_s32x4 (float d0, float d1, float d2, float d3, ++ float d4, float d5, svint32x4_t x) {} ++void fn_s64x4 (float d0, float d1, float d2, float d3, ++ float d4, float d5, svint64x4_t x) {} ++void fn_u8x4 (float d0, float d1, float d2, float d3, ++ float d4, float d5, svuint8x4_t x) {} ++void fn_u16x4 (float d0, float d1, float d2, float d3, ++ float d4, float d5, svuint16x4_t x) {} ++void fn_u32x4 (float d0, float d1, float d2, float d3, ++ float d4, float d5, svuint32x4_t x) {} ++void fn_u64x4 (float d0, float d1, float d2, float d3, ++ float d4, float d5, svuint64x4_t x) {} ++void 
fn_bf16x4 (float d0, float d1, float d2, float d3, ++ float d4, float d5, svbfloat16x4_t x) {} ++void fn_f16x4 (float d0, float d1, float d2, float d3, ++ float d4, float d5, svfloat16x4_t x) {} ++void fn_f32x4 (float d0, float d1, float d2, float d3, ++ float d4, float d5, svfloat32x4_t x) {} ++void fn_f64x4 (float d0, float d1, float d2, float d3, ++ float d4, float d5, svfloat64x4_t x) {} ++ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s8\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s16\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s32\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s64\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u8\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64\n} } } */ ++ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s8x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s16x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s32x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s64x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u8x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32x2\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64x2\n} } } */ ++ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_s8x3\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_s16x3\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_s32x3\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_s64x3\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u8x3\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u16x3\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u32x3\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u64x3\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_bf16x3\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f16x3\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f32x3\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f64x3\n} } } */ ++ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_s8x4\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_s16x4\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_s32x4\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_s64x4\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u8x4\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u16x4\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u32x4\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u64x4\n} } } */ ++/* { dg-final 
{ scan-assembler-not {\t\.variant_pcs\tfn_bf16x4\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f16x4\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f32x4\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f64x4\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_6.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_6.c +new file mode 100644 +index 000000000..0ff73e259 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_6.c +@@ -0,0 +1,155 @@ ++/* { dg-do compile } */ ++ ++#include ++ ++void fn_s8 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, svint8_t x) {} ++void fn_s16 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, svint16_t x) {} ++void fn_s32 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, svint32_t x) {} ++void fn_s64 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, svint64_t x) {} ++void fn_u8 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, svuint8_t x) {} ++void fn_u16 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, svuint16_t x) {} ++void fn_u32 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, svuint32_t x) {} ++void fn_u64 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, svuint64_t x) {} ++void fn_bf16 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, svbfloat16_t x) {} ++void fn_f16 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, svfloat16_t x) {} ++void fn_f32 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, svfloat32_t x) {} ++void fn_f64 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, svfloat64_t x) {} ++ ++void fn_s8x2 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, svint8x2_t x) {} ++void fn_s16x2 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, svint16x2_t x) {} ++void fn_s32x2 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, svint32x2_t x) {} ++void fn_s64x2 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, svint64x2_t x) {} ++void fn_u8x2 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, svuint8x2_t x) {} ++void fn_u16x2 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, svuint16x2_t x) {} ++void fn_u32x2 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, svuint32x2_t x) {} ++void fn_u64x2 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, svuint64x2_t x) {} ++void fn_bf16x2 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, svbfloat16x2_t x) {} ++void fn_f16x2 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, svfloat16x2_t x) {} ++void fn_f32x2 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, svfloat32x2_t x) {} ++void fn_f64x2 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, svfloat64x2_t x) {} ++ ++void fn_s8x3 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, svint8x3_t x) {} ++void fn_s16x3 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, svint16x3_t x) {} ++void fn_s32x3 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, svint32x3_t x) {} ++void fn_s64x3 (float d0, float d1, float d2, float d3, ++ float d4, 
float d5, float d6, svint64x3_t x) {} ++void fn_u8x3 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, svuint8x3_t x) {} ++void fn_u16x3 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, svuint16x3_t x) {} ++void fn_u32x3 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, svuint32x3_t x) {} ++void fn_u64x3 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, svuint64x3_t x) {} ++void fn_bf16x3 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, svbfloat16x3_t x) {} ++void fn_f16x3 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, svfloat16x3_t x) {} ++void fn_f32x3 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, svfloat32x3_t x) {} ++void fn_f64x3 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, svfloat64x3_t x) {} ++ ++void fn_s8x4 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, svint8x4_t x) {} ++void fn_s16x4 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, svint16x4_t x) {} ++void fn_s32x4 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, svint32x4_t x) {} ++void fn_s64x4 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, svint64x4_t x) {} ++void fn_u8x4 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, svuint8x4_t x) {} ++void fn_u16x4 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, svuint16x4_t x) {} ++void fn_u32x4 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, svuint32x4_t x) {} ++void fn_u64x4 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, svuint64x4_t x) {} ++void fn_bf16x4 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, svbfloat16x4_t x) {} ++void fn_f16x4 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, svfloat16x4_t x) {} ++void fn_f32x4 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, svfloat32x4_t x) {} ++void fn_f64x4 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, svfloat64x4_t x) {} ++ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s8\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s16\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s32\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s64\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u8\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32\n} } } */ ++/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64\n} } } */ ++ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_s8x2\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_s16x2\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_s32x2\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_s64x2\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u8x2\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u16x2\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u32x2\n} } 
} */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u64x2\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_bf16x2\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f16x2\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f32x2\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f64x2\n} } } */ ++ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_s8x3\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_s16x3\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_s32x3\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_s64x3\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u8x3\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u16x3\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u32x3\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u64x3\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_bf16x3\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f16x3\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f32x3\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f64x3\n} } } */ ++ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_s8x4\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_s16x4\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_s32x4\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_s64x4\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u8x4\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u16x4\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u32x4\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u64x4\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_bf16x4\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f16x4\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f32x4\n} } } */ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f64x4\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_7.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_7.c +new file mode 100644 +index 000000000..4f3ff8107 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_7.c +@@ -0,0 +1,105 @@ ++/* { dg-do compile } */ ++ ++#include ++ ++void fn_s8 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, float d7, svint8_t x) {} ++void fn_s16 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, float d7, svint16_t x) {} ++void fn_s32 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, float d7, svint32_t x) {} ++void fn_s64 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, float d7, svint64_t x) {} ++void fn_u8 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, float d7, svuint8_t x) {} ++void fn_u16 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, float d7, svuint16_t x) {} ++void fn_u32 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, float d7, svuint32_t x) {} ++void fn_u64 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, float d7, svuint64_t x) {} ++void fn_bf16 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, float d7, svbfloat16_t x) {} ++void fn_f16 (float d0, float d1, float d2, float d3, 
++ float d4, float d5, float d6, float d7, svfloat16_t x) {} ++void fn_f32 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, float d7, svfloat32_t x) {} ++void fn_f64 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, float d7, svfloat64_t x) {} ++ ++void fn_s8x2 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, float d7, svint8x2_t x) {} ++void fn_s16x2 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, float d7, svint16x2_t x) {} ++void fn_s32x2 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, float d7, svint32x2_t x) {} ++void fn_s64x2 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, float d7, svint64x2_t x) {} ++void fn_u8x2 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, float d7, svuint8x2_t x) {} ++void fn_u16x2 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, float d7, svuint16x2_t x) {} ++void fn_u32x2 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, float d7, svuint32x2_t x) {} ++void fn_u64x2 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, float d7, svuint64x2_t x) {} ++void fn_bf16x2 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, float d7, svbfloat16x2_t x) {} ++void fn_f16x2 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, float d7, svfloat16x2_t x) {} ++void fn_f32x2 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, float d7, svfloat32x2_t x) {} ++void fn_f64x2 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, float d7, svfloat64x2_t x) {} ++ ++void fn_s8x3 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, float d7, svint8x3_t x) {} ++void fn_s16x3 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, float d7, svint16x3_t x) {} ++void fn_s32x3 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, float d7, svint32x3_t x) {} ++void fn_s64x3 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, float d7, svint64x3_t x) {} ++void fn_u8x3 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, float d7, svuint8x3_t x) {} ++void fn_u16x3 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, float d7, svuint16x3_t x) {} ++void fn_u32x3 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, float d7, svuint32x3_t x) {} ++void fn_u64x3 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, float d7, svuint64x3_t x) {} ++void fn_bf16x3 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, float d7, svbfloat16x3_t x) {} ++void fn_f16x3 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, float d7, svfloat16x3_t x) {} ++void fn_f32x3 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, float d7, svfloat32x3_t x) {} ++void fn_f64x3 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, float d7, svfloat64x3_t x) {} ++ ++void fn_s8x4 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, float d7, svint8x4_t x) {} ++void fn_s16x4 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, float d7, svint16x4_t x) {} ++void fn_s32x4 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, float d7, svint32x4_t x) {} ++void fn_s64x4 (float d0, float d1, float d2, 
float d3, ++ float d4, float d5, float d6, float d7, svint64x4_t x) {} ++void fn_u8x4 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, float d7, svuint8x4_t x) {} ++void fn_u16x4 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, float d7, svuint16x4_t x) {} ++void fn_u32x4 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, float d7, svuint32x4_t x) {} ++void fn_u64x4 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, float d7, svuint64x4_t x) {} ++void fn_bf16x4 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, float d7, svbfloat16x4_t x) {} ++void fn_f16x4 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, float d7, svfloat16x4_t x) {} ++void fn_f32x4 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, float d7, svfloat32x4_t x) {} ++void fn_f64x4 (float d0, float d1, float d2, float d3, ++ float d4, float d5, float d6, float d7, svfloat64x4_t x) {} ++ ++/* { dg-final { scan-assembler-not {\t\.variant_pcs\t\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_1.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_1.c +new file mode 100644 +index 000000000..fd9932e2e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_1.c +@@ -0,0 +1,49 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -g" } */ ++/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ ++ ++#include ++ ++/* ++** callee_pred: ++** ldr (p[0-9]+), \[x0\] ++** ldr (p[0-9]+), \[x1\] ++** brkpa (p[0-7])\.b, p0/z, p1\.b, p2\.b ++** brkpb (p[0-7])\.b, \3/z, p3\.b, \1\.b ++** brka p0\.b, \4/z, \2\.b ++** ret ++*/ ++__SVBool_t __attribute__((noipa)) ++callee_pred (__SVBool_t p0, __SVBool_t p1, __SVBool_t p2, __SVBool_t p3, ++ __SVBool_t mem0, __SVBool_t mem1) ++{ ++ p0 = svbrkpa_z (p0, p1, p2); ++ p0 = svbrkpb_z (p0, p3, mem0); ++ return svbrka_z (p0, mem1); ++} ++ ++/* ++** caller_pred: ++** ... ++** ptrue (p[0-9]+)\.b, vl5 ++** str \1, \[x0\] ++** ... ++** ptrue (p[0-9]+)\.h, vl6 ++** str \2, \[x1\] ++** ptrue p3\.d, vl4 ++** ptrue p2\.s, vl3 ++** ptrue p1\.h, vl2 ++** ptrue p0\.b, vl1 ++** bl callee_pred ++** ... 
++*/ ++__SVBool_t __attribute__((noipa)) ++caller_pred (void) ++{ ++ return callee_pred (svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3), ++ svptrue_pat_b64 (SV_VL4), ++ svptrue_pat_b8 (SV_VL5), ++ svptrue_pat_b16 (SV_VL6)); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_10.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_10.c +new file mode 100644 +index 000000000..1bbcb770d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_10.c +@@ -0,0 +1,33 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#include ++ ++/* ++** callee: ++** fadd s0, (s0, s6|s6, s0) ++** ret ++*/ ++float __attribute__((noipa)) ++callee (float s0, double d1, svfloat32x4_t z2, svfloat64x4_t stack1, ++ float s6, double d7) ++{ ++ return s0 + s6; ++} ++ ++float __attribute__((noipa)) ++caller (float32_t *x0, float64_t *x1) ++{ ++ return callee (0.0f, 1.0, ++ svld4 (svptrue_b8 (), x0), ++ svld4 (svptrue_b8 (), x1), ++ 6.0f, 7.0); ++} ++ ++/* { dg-final { scan-assembler {\tld4w\t{z2\.s - z5\.s}, p[0-7]/z, \[x0\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4d\t{z[0-9]+\.d - z[0-9]+\.d}, p[0-7]/z, \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tmovi\tv0\.[24]s, #0\n} } } */ ++/* { dg-final { scan-assembler {\tfmov\td1, #?1\.0} } } */ ++/* { dg-final { scan-assembler {\tfmov\ts6, #?6\.0} } } */ ++/* { dg-final { scan-assembler {\tfmov\td7, #?7\.0} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_11_nosc.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_11_nosc.c +new file mode 100644 +index 000000000..0f62e0b08 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_11_nosc.c +@@ -0,0 +1,61 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O0 -g" } */ ++ ++#include ++ ++void __attribute__((noipa)) ++callee (svbool_t p, svint8_t s8, svuint16x4_t u16, svfloat32x3_t f32, ++ svint64x2_t s64) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ ++ if (svptest_any (pg, sveor_z (pg, p, svptrue_pat_b8 (SV_VL7)))) ++ __builtin_abort (); ++ ++ if (svptest_any (pg, svcmpne (pg, s8, svindex_s8 (1, 2)))) ++ __builtin_abort (); ++ ++ if (svptest_any (pg, svcmpne (pg, svget4 (u16, 0), svindex_u16 (2, 3)))) ++ __builtin_abort (); ++ ++ if (svptest_any (pg, svcmpne (pg, svget4 (u16, 1), svindex_u16 (3, 4)))) ++ __builtin_abort (); ++ ++ if (svptest_any (pg, svcmpne (pg, svget4 (u16, 2), svindex_u16 (4, 5)))) ++ __builtin_abort (); ++ ++ if (svptest_any (pg, svcmpne (pg, svget4 (u16, 3), svindex_u16 (5, 6)))) ++ __builtin_abort (); ++ ++ if (svptest_any (pg, svcmpne (pg, svget3 (f32, 0), svdup_f32 (1.0)))) ++ __builtin_abort (); ++ ++ if (svptest_any (pg, svcmpne (pg, svget3 (f32, 1), svdup_f32 (2.0)))) ++ __builtin_abort (); ++ ++ if (svptest_any (pg, svcmpne (pg, svget3 (f32, 2), svdup_f32 (3.0)))) ++ __builtin_abort (); ++ ++ if (svptest_any (pg, svcmpne (pg, svget2 (s64, 0), svindex_s64 (6, 7)))) ++ __builtin_abort (); ++ ++ if (svptest_any (pg, svcmpne (pg, svget2 (s64, 1), svindex_s64 (7, 8)))) ++ __builtin_abort (); ++} ++ ++int __attribute__((noipa)) ++main (void) ++{ ++ callee (svptrue_pat_b8 (SV_VL7), ++ svindex_s8 (1, 2), ++ svcreate4 (svindex_u16 (2, 3), ++ svindex_u16 (3, 4), ++ svindex_u16 (4, 5), ++ svindex_u16 (5, 6)), ++ svcreate3 (svdup_f32 (1.0), ++ svdup_f32 (2.0), ++ svdup_f32 (3.0)), ++ svcreate2 (svindex_s64 (6, 7), ++ svindex_s64 (7, 8))); ++} +diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_11_sc.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_11_sc.c +new file mode 100644 +index 000000000..8a98d58ce +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_11_sc.c +@@ -0,0 +1,61 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O0 -fstack-clash-protection -g" } */ ++ ++#include ++ ++void __attribute__((noipa)) ++callee (svbool_t p, svint8_t s8, svuint16x4_t u16, svfloat32x3_t f32, ++ svint64x2_t s64) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ ++ if (svptest_any (pg, sveor_z (pg, p, svptrue_pat_b8 (SV_VL7)))) ++ __builtin_abort (); ++ ++ if (svptest_any (pg, svcmpne (pg, s8, svindex_s8 (1, 2)))) ++ __builtin_abort (); ++ ++ if (svptest_any (pg, svcmpne (pg, svget4 (u16, 0), svindex_u16 (2, 3)))) ++ __builtin_abort (); ++ ++ if (svptest_any (pg, svcmpne (pg, svget4 (u16, 1), svindex_u16 (3, 4)))) ++ __builtin_abort (); ++ ++ if (svptest_any (pg, svcmpne (pg, svget4 (u16, 2), svindex_u16 (4, 5)))) ++ __builtin_abort (); ++ ++ if (svptest_any (pg, svcmpne (pg, svget4 (u16, 3), svindex_u16 (5, 6)))) ++ __builtin_abort (); ++ ++ if (svptest_any (pg, svcmpne (pg, svget3 (f32, 0), svdup_f32 (1.0)))) ++ __builtin_abort (); ++ ++ if (svptest_any (pg, svcmpne (pg, svget3 (f32, 1), svdup_f32 (2.0)))) ++ __builtin_abort (); ++ ++ if (svptest_any (pg, svcmpne (pg, svget3 (f32, 2), svdup_f32 (3.0)))) ++ __builtin_abort (); ++ ++ if (svptest_any (pg, svcmpne (pg, svget2 (s64, 0), svindex_s64 (6, 7)))) ++ __builtin_abort (); ++ ++ if (svptest_any (pg, svcmpne (pg, svget2 (s64, 1), svindex_s64 (7, 8)))) ++ __builtin_abort (); ++} ++ ++int __attribute__((noipa)) ++main (void) ++{ ++ callee (svptrue_pat_b8 (SV_VL7), ++ svindex_s8 (1, 2), ++ svcreate4 (svindex_u16 (2, 3), ++ svindex_u16 (3, 4), ++ svindex_u16 (4, 5), ++ svindex_u16 (5, 6)), ++ svcreate3 (svdup_f32 (1.0), ++ svdup_f32 (2.0), ++ svdup_f32 (3.0)), ++ svcreate2 (svindex_s64 (6, 7), ++ svindex_s64 (7, 8))); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_2.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_2.c +new file mode 100644 +index 000000000..43a50887d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_2.c +@@ -0,0 +1,70 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ ++ ++#include ++ ++/* ++** callee_int: ++** ptrue p3\.b, all ++** ld1b (z(?:2[4-9]|3[0-1]).b), p3/z, \[x4\] ++** st1b \1, p2, \[x0\] ++** st1b z4\.b, p1, \[x0\] ++** st1h z5\.h, p1, \[x1\] ++** st1w z6\.s, p1, \[x2\] ++** st1d z7\.d, p1, \[x3\] ++** st1b z0\.b, p0, \[x0\] ++** st1h z1\.h, p0, \[x1\] ++** st1w z2\.s, p0, \[x2\] ++** st1d z3\.d, p0, \[x3\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee_int (int8_t *x0, int16_t *x1, int32_t *x2, int64_t *x3, ++ svint8_t z0, svint16_t z1, svint32_t z2, svint64_t z3, ++ svint8_t z4, svint16_t z5, svint32_t z6, svint64_t z7, ++ svint8_t z8, ++ svbool_t p0, svbool_t p1, svbool_t p2) ++{ ++ svst1 (p2, x0, z8); ++ svst1 (p1, x0, z4); ++ svst1 (p1, x1, z5); ++ svst1 (p1, x2, z6); ++ svst1 (p1, x3, z7); ++ svst1 (p0, x0, z0); ++ svst1 (p0, x1, z1); ++ svst1 (p0, x2, z2); ++ svst1 (p0, x3, z3); ++} ++ ++void __attribute__((noipa)) ++caller_int (int8_t *x0, int16_t *x1, int32_t *x2, int64_t *x3) ++{ ++ callee_int (x0, x1, x2, x3, ++ svdup_s8 (0), ++ svdup_s16 (1), ++ svdup_s32 (2), ++ svdup_s64 (3), ++ svdup_s8 (4), ++ svdup_s16 (5), ++ svdup_s32 (6), ++ svdup_s64 (7), ++ svdup_s8 (8), ++ 
svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3)); ++} ++ ++/* { dg-final { scan-assembler {\tmov\tz0\.b, #0\n} } } */ ++/* { dg-final { scan-assembler {\tmov\tz1\.h, #1\n} } } */ ++/* { dg-final { scan-assembler {\tmov\tz2\.s, #2\n} } } */ ++/* { dg-final { scan-assembler {\tmov\tz3\.d, #3\n} } } */ ++/* { dg-final { scan-assembler {\tmov\tz4\.b, #4\n} } } */ ++/* { dg-final { scan-assembler {\tmov\tz5\.h, #5\n} } } */ ++/* { dg-final { scan-assembler {\tmov\tz6\.s, #6\n} } } */ ++/* { dg-final { scan-assembler {\tmov\tz7\.d, #7\n} } } */ ++/* { dg-final { scan-assembler {\tmov\tx4, sp\n} } } */ ++/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.b), #8\n.*\tst1b\t\1, p[0-7], \[x4\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_3.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_3.c +new file mode 100644 +index 000000000..49fdfc984 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_3.c +@@ -0,0 +1,70 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ ++ ++#include ++ ++/* ++** callee_uint: ++** ptrue p3\.b, all ++** ld1b (z(?:2[4-9]|3[0-1]).b), p3/z, \[x4\] ++** st1b \1, p2, \[x0\] ++** st1b z4\.b, p1, \[x0\] ++** st1h z5\.h, p1, \[x1\] ++** st1w z6\.s, p1, \[x2\] ++** st1d z7\.d, p1, \[x3\] ++** st1b z0\.b, p0, \[x0\] ++** st1h z1\.h, p0, \[x1\] ++** st1w z2\.s, p0, \[x2\] ++** st1d z3\.d, p0, \[x3\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee_uint (uint8_t *x0, uint16_t *x1, uint32_t *x2, uint64_t *x3, ++ svuint8_t z0, svuint16_t z1, svuint32_t z2, svuint64_t z3, ++ svuint8_t z4, svuint16_t z5, svuint32_t z6, svuint64_t z7, ++ svuint8_t z8, ++ svbool_t p0, svbool_t p1, svbool_t p2) ++{ ++ svst1 (p2, x0, z8); ++ svst1 (p1, x0, z4); ++ svst1 (p1, x1, z5); ++ svst1 (p1, x2, z6); ++ svst1 (p1, x3, z7); ++ svst1 (p0, x0, z0); ++ svst1 (p0, x1, z1); ++ svst1 (p0, x2, z2); ++ svst1 (p0, x3, z3); ++} ++ ++void __attribute__((noipa)) ++caller_uint (uint8_t *x0, uint16_t *x1, uint32_t *x2, uint64_t *x3) ++{ ++ callee_uint (x0, x1, x2, x3, ++ svdup_u8 (0), ++ svdup_u16 (1), ++ svdup_u32 (2), ++ svdup_u64 (3), ++ svdup_u8 (4), ++ svdup_u16 (5), ++ svdup_u32 (6), ++ svdup_u64 (7), ++ svdup_u8 (8), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3)); ++} ++ ++/* { dg-final { scan-assembler {\tmov\tz0\.b, #0\n} } } */ ++/* { dg-final { scan-assembler {\tmov\tz1\.h, #1\n} } } */ ++/* { dg-final { scan-assembler {\tmov\tz2\.s, #2\n} } } */ ++/* { dg-final { scan-assembler {\tmov\tz3\.d, #3\n} } } */ ++/* { dg-final { scan-assembler {\tmov\tz4\.b, #4\n} } } */ ++/* { dg-final { scan-assembler {\tmov\tz5\.h, #5\n} } } */ ++/* { dg-final { scan-assembler {\tmov\tz6\.s, #6\n} } } */ ++/* { dg-final { scan-assembler {\tmov\tz7\.d, #7\n} } } */ ++/* { dg-final { scan-assembler {\tmov\tx4, sp\n} } } */ ++/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.b), #8\n.*\tst1b\t\1, p[0-7], \[x4\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_4.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_4.c 
+new file mode 100644 +index 000000000..4f15fdd50 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_4.c +@@ -0,0 +1,70 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ ++ ++#include ++ ++/* ++** callee_float: ++** ptrue p3\.b, all ++** ld1h (z(?:2[4-9]|3[0-1]).h), p3/z, \[x4\] ++** st1h \1, p2, \[x0\] ++** st1h z4\.h, p1, \[x0\] ++** st1h z5\.h, p1, \[x1\] ++** st1w z6\.s, p1, \[x2\] ++** st1d z7\.d, p1, \[x3\] ++** st1h z0\.h, p0, \[x0\] ++** st1h z1\.h, p0, \[x1\] ++** st1w z2\.s, p0, \[x2\] ++** st1d z3\.d, p0, \[x3\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee_float (float16_t *x0, float16_t *x1, float32_t *x2, float64_t *x3, ++ svfloat16_t z0, svfloat16_t z1, svfloat32_t z2, svfloat64_t z3, ++ svfloat16_t z4, svfloat16_t z5, svfloat32_t z6, svfloat64_t z7, ++ svfloat16_t z8, ++ svbool_t p0, svbool_t p1, svbool_t p2) ++{ ++ svst1 (p2, x0, z8); ++ svst1 (p1, x0, z4); ++ svst1 (p1, x1, z5); ++ svst1 (p1, x2, z6); ++ svst1 (p1, x3, z7); ++ svst1 (p0, x0, z0); ++ svst1 (p0, x1, z1); ++ svst1 (p0, x2, z2); ++ svst1 (p0, x3, z3); ++} ++ ++void __attribute__((noipa)) ++caller_float (float16_t *x0, float16_t *x1, float32_t *x2, float64_t *x3) ++{ ++ callee_float (x0, x1, x2, x3, ++ svdup_f16 (0), ++ svdup_f16 (1), ++ svdup_f32 (2), ++ svdup_f64 (3), ++ svdup_f16 (4), ++ svdup_f16 (5), ++ svdup_f32 (6), ++ svdup_f64 (7), ++ svdup_f16 (8), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3)); ++} ++ ++/* { dg-final { scan-assembler {\tmov\tz0\.[bhsd], #0\n} } } */ ++/* { dg-final { scan-assembler {\tfmov\tz1\.h, #1\.0} } } */ ++/* { dg-final { scan-assembler {\tfmov\tz2\.s, #2\.0} } } */ ++/* { dg-final { scan-assembler {\tfmov\tz3\.d, #3\.0} } } */ ++/* { dg-final { scan-assembler {\tfmov\tz4\.h, #4\.0} } } */ ++/* { dg-final { scan-assembler {\tfmov\tz5\.h, #5\.0} } } */ ++/* { dg-final { scan-assembler {\tfmov\tz6\.s, #6\.0} } } */ ++/* { dg-final { scan-assembler {\tfmov\tz7\.d, #7\.0} } } */ ++/* { dg-final { scan-assembler {\tmov\tx4, sp\n} } } */ ++/* { dg-final { scan-assembler {\tfmov\t(z[0-9]+\.h), #8\.0.*\tst1h\t\1, p[0-7], \[x4\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_bf16.c +new file mode 100644 +index 000000000..e9b63a45d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_bf16.c +@@ -0,0 +1,63 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** callee: ++** addvl sp, sp, #-1 ++** str p4, \[sp\] ++** ptrue p4\.b, all ++** ( ++** ld1h (z[0-9]+\.h), p4/z, \[x1, #1, mul vl\] ++** ld1h (z[0-9]+\.h), p4/z, \[x1\] ++** st2h {\2 - \1}, p0, \[x0\] ++** | ++** ld1h (z[0-9]+\.h), p4/z, \[x1\] ++** ld1h (z[0-9]+\.h), p4/z, \[x1, #1, mul vl\] ++** st2h {\3 - \4}, p0, \[x0\] ++** ) ++** st4h {z0\.h - z3\.h}, p1, \[x0\] ++** st3h {z4\.h - z6\.h}, p2, \[x0\] ++** st1h z7\.h, p3, \[x0\] ++** ldr p4, \[sp\] ++** addvl sp, sp, #1 ++** ret ++*/ ++void __attribute__((noipa)) ++callee (void *x0, svbfloat16x4_t z0, svbfloat16x3_t z4, svbfloat16x2_t stack, ++ 
svbfloat16_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) ++{ ++ svst2 (p0, x0, stack); ++ svst4 (p1, x0, z0); ++ svst3 (p2, x0, z4); ++ svst1_bf16 (p3, x0, z7); ++} ++ ++void __attribute__((noipa)) ++caller (void *x0) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ callee (x0, ++ svld4_vnum_bf16 (pg, x0, -8), ++ svld3_vnum_bf16 (pg, x0, -3), ++ svld2_vnum_bf16 (pg, x0, 0), ++ svld1_vnum_bf16 (pg, x0, 2), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3), ++ svptrue_pat_b64 (SV_VL4)); ++} ++ ++/* { dg-final { scan-assembler {\tld4h\t{z0\.h - z3\.h}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld3h\t{z4\.h - z6\.h}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld1h\tz7\.h, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */ ++/* { dg-final { scan-assembler {\tld2h\t{(z[0-9]+\.h) - z[0-9]+\.h}.*\tst1h\t\1, p[0-7], \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2h\t{z[0-9]+\.h - (z[0-9]+\.h)}.*\tst1h\t\1, p[0-7], \[x1, #1, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_f16.c +new file mode 100644 +index 000000000..4152f9125 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_f16.c +@@ -0,0 +1,63 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** callee: ++** addvl sp, sp, #-1 ++** str p4, \[sp\] ++** ptrue p4\.b, all ++** ( ++** ld1h (z[0-9]+\.h), p4/z, \[x1, #1, mul vl\] ++** ld1h (z[0-9]+\.h), p4/z, \[x1\] ++** st2h {\2 - \1}, p0, \[x0\] ++** | ++** ld1h (z[0-9]+\.h), p4/z, \[x1\] ++** ld1h (z[0-9]+\.h), p4/z, \[x1, #1, mul vl\] ++** st2h {\3 - \4}, p0, \[x0\] ++** ) ++** st4h {z0\.h - z3\.h}, p1, \[x0\] ++** st3h {z4\.h - z6\.h}, p2, \[x0\] ++** st1h z7\.h, p3, \[x0\] ++** ldr p4, \[sp\] ++** addvl sp, sp, #1 ++** ret ++*/ ++void __attribute__((noipa)) ++callee (void *x0, svfloat16x4_t z0, svfloat16x3_t z4, svfloat16x2_t stack, ++ svfloat16_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) ++{ ++ svst2 (p0, x0, stack); ++ svst4 (p1, x0, z0); ++ svst3 (p2, x0, z4); ++ svst1_f16 (p3, x0, z7); ++} ++ ++void __attribute__((noipa)) ++caller (void *x0) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ callee (x0, ++ svld4_vnum_f16 (pg, x0, -8), ++ svld3_vnum_f16 (pg, x0, -3), ++ svld2_vnum_f16 (pg, x0, 0), ++ svld1_vnum_f16 (pg, x0, 2), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3), ++ svptrue_pat_b64 (SV_VL4)); ++} ++ ++/* { dg-final { scan-assembler {\tld4h\t{z0\.h - z3\.h}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld3h\t{z4\.h - z6\.h}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld1h\tz7\.h, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */ ++/* { dg-final { scan-assembler {\tld2h\t{(z[0-9]+\.h) - z[0-9]+\.h}.*\tst1h\t\1, p[0-7], \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2h\t{z[0-9]+\.h - (z[0-9]+\.h)}.*\tst1h\t\1, p[0-7], \[x1, #1, 
mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_f32.c +new file mode 100644 +index 000000000..0f78fac79 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_f32.c +@@ -0,0 +1,63 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** callee: ++** addvl sp, sp, #-1 ++** str p4, \[sp\] ++** ptrue p4\.b, all ++** ( ++** ld1w (z[0-9]+\.s), p4/z, \[x1, #1, mul vl\] ++** ld1w (z[0-9]+\.s), p4/z, \[x1\] ++** st2w {\2 - \1}, p0, \[x0\] ++** | ++** ld1w (z[0-9]+\.s), p4/z, \[x1\] ++** ld1w (z[0-9]+\.s), p4/z, \[x1, #1, mul vl\] ++** st2w {\3 - \4}, p0, \[x0\] ++** ) ++** st4w {z0\.s - z3\.s}, p1, \[x0\] ++** st3w {z4\.s - z6\.s}, p2, \[x0\] ++** st1w z7\.s, p3, \[x0\] ++** ldr p4, \[sp\] ++** addvl sp, sp, #1 ++** ret ++*/ ++void __attribute__((noipa)) ++callee (void *x0, svfloat32x4_t z0, svfloat32x3_t z4, svfloat32x2_t stack, ++ svfloat32_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) ++{ ++ svst2 (p0, x0, stack); ++ svst4 (p1, x0, z0); ++ svst3 (p2, x0, z4); ++ svst1_f32 (p3, x0, z7); ++} ++ ++void __attribute__((noipa)) ++caller (void *x0) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ callee (x0, ++ svld4_vnum_f32 (pg, x0, -8), ++ svld3_vnum_f32 (pg, x0, -3), ++ svld2_vnum_f32 (pg, x0, 0), ++ svld1_vnum_f32 (pg, x0, 2), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3), ++ svptrue_pat_b64 (SV_VL4)); ++} ++ ++/* { dg-final { scan-assembler {\tld4w\t{z0\.s - z3\.s}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld3w\t{z4\.s - z6\.s}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld1w\tz7\.s, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */ ++/* { dg-final { scan-assembler {\tld2w\t{(z[0-9]+\.s) - z[0-9]+\.s}.*\tst1w\t\1, p[0-7], \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2w\t{z[0-9]+\.s - (z[0-9]+\.s)}.*\tst1w\t\1, p[0-7], \[x1, #1, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_f64.c +new file mode 100644 +index 000000000..fe832d0d0 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_f64.c +@@ -0,0 +1,63 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** callee: ++** addvl sp, sp, #-1 ++** str p4, \[sp\] ++** ptrue p4\.b, all ++** ( ++** ld1d (z[0-9]+\.d), p4/z, \[x1, #1, mul vl\] ++** ld1d (z[0-9]+\.d), p4/z, \[x1\] ++** st2d {\2 - \1}, p0, \[x0\] ++** | ++** ld1d (z[0-9]+\.d), p4/z, \[x1\] ++** ld1d (z[0-9]+\.d), p4/z, \[x1, #1, mul vl\] ++** st2d {\3 - \4}, p0, \[x0\] 
++** ) ++** st4d {z0\.d - z3\.d}, p1, \[x0\] ++** st3d {z4\.d - z6\.d}, p2, \[x0\] ++** st1d z7\.d, p3, \[x0\] ++** ldr p4, \[sp\] ++** addvl sp, sp, #1 ++** ret ++*/ ++void __attribute__((noipa)) ++callee (void *x0, svfloat64x4_t z0, svfloat64x3_t z4, svfloat64x2_t stack, ++ svfloat64_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) ++{ ++ svst2 (p0, x0, stack); ++ svst4 (p1, x0, z0); ++ svst3 (p2, x0, z4); ++ svst1_f64 (p3, x0, z7); ++} ++ ++void __attribute__((noipa)) ++caller (void *x0) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ callee (x0, ++ svld4_vnum_f64 (pg, x0, -8), ++ svld3_vnum_f64 (pg, x0, -3), ++ svld2_vnum_f64 (pg, x0, 0), ++ svld1_vnum_f64 (pg, x0, 2), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3), ++ svptrue_pat_b64 (SV_VL4)); ++} ++ ++/* { dg-final { scan-assembler {\tld4d\t{z0\.d - z3\.d}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld3d\t{z4\.d - z6\.d}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld1d\tz7\.d, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */ ++/* { dg-final { scan-assembler {\tld2d\t{(z[0-9]+\.d) - z[0-9]+\.d}.*\tst1d\t\1, p[0-7], \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2d\t{z[0-9]+\.d - (z[0-9]+\.d)}.*\tst1d\t\1, p[0-7], \[x1, #1, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_s16.c +new file mode 100644 +index 000000000..3f708e0f0 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_s16.c +@@ -0,0 +1,63 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** callee: ++** addvl sp, sp, #-1 ++** str p4, \[sp\] ++** ptrue p4\.b, all ++** ( ++** ld1h (z[0-9]+\.h), p4/z, \[x1, #1, mul vl\] ++** ld1h (z[0-9]+\.h), p4/z, \[x1\] ++** st2h {\2 - \1}, p0, \[x0\] ++** | ++** ld1h (z[0-9]+\.h), p4/z, \[x1\] ++** ld1h (z[0-9]+\.h), p4/z, \[x1, #1, mul vl\] ++** st2h {\3 - \4}, p0, \[x0\] ++** ) ++** st4h {z0\.h - z3\.h}, p1, \[x0\] ++** st3h {z4\.h - z6\.h}, p2, \[x0\] ++** st1h z7\.h, p3, \[x0\] ++** ldr p4, \[sp\] ++** addvl sp, sp, #1 ++** ret ++*/ ++void __attribute__((noipa)) ++callee (void *x0, svint16x4_t z0, svint16x3_t z4, svint16x2_t stack, ++ svint16_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) ++{ ++ svst2 (p0, x0, stack); ++ svst4 (p1, x0, z0); ++ svst3 (p2, x0, z4); ++ svst1_s16 (p3, x0, z7); ++} ++ ++void __attribute__((noipa)) ++caller (void *x0) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ callee (x0, ++ svld4_vnum_s16 (pg, x0, -8), ++ svld3_vnum_s16 (pg, x0, -3), ++ svld2_vnum_s16 (pg, x0, 0), ++ svld1_vnum_s16 (pg, x0, 2), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3), ++ svptrue_pat_b64 (SV_VL4)); ++} ++ ++/* { dg-final { scan-assembler {\tld4h\t{z0\.h - z3\.h}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld3h\t{z4\.h - z6\.h}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld1h\tz7\.h, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */ ++/* 
{ dg-final { scan-assembler {\tmov\tx1, sp\n} } } */ ++/* { dg-final { scan-assembler {\tld2h\t{(z[0-9]+\.h) - z[0-9]+\.h}.*\tst1h\t\1, p[0-7], \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2h\t{z[0-9]+\.h - (z[0-9]+\.h)}.*\tst1h\t\1, p[0-7], \[x1, #1, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_s32.c +new file mode 100644 +index 000000000..8c57190ea +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_s32.c +@@ -0,0 +1,63 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** callee: ++** addvl sp, sp, #-1 ++** str p4, \[sp\] ++** ptrue p4\.b, all ++** ( ++** ld1w (z[0-9]+\.s), p4/z, \[x1, #1, mul vl\] ++** ld1w (z[0-9]+\.s), p4/z, \[x1\] ++** st2w {\2 - \1}, p0, \[x0\] ++** | ++** ld1w (z[0-9]+\.s), p4/z, \[x1\] ++** ld1w (z[0-9]+\.s), p4/z, \[x1, #1, mul vl\] ++** st2w {\3 - \4}, p0, \[x0\] ++** ) ++** st4w {z0\.s - z3\.s}, p1, \[x0\] ++** st3w {z4\.s - z6\.s}, p2, \[x0\] ++** st1w z7\.s, p3, \[x0\] ++** ldr p4, \[sp\] ++** addvl sp, sp, #1 ++** ret ++*/ ++void __attribute__((noipa)) ++callee (void *x0, svint32x4_t z0, svint32x3_t z4, svint32x2_t stack, ++ svint32_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) ++{ ++ svst2 (p0, x0, stack); ++ svst4 (p1, x0, z0); ++ svst3 (p2, x0, z4); ++ svst1_s32 (p3, x0, z7); ++} ++ ++void __attribute__((noipa)) ++caller (void *x0) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ callee (x0, ++ svld4_vnum_s32 (pg, x0, -8), ++ svld3_vnum_s32 (pg, x0, -3), ++ svld2_vnum_s32 (pg, x0, 0), ++ svld1_vnum_s32 (pg, x0, 2), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3), ++ svptrue_pat_b64 (SV_VL4)); ++} ++ ++/* { dg-final { scan-assembler {\tld4w\t{z0\.s - z3\.s}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld3w\t{z4\.s - z6\.s}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld1w\tz7\.s, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */ ++/* { dg-final { scan-assembler {\tld2w\t{(z[0-9]+\.s) - z[0-9]+\.s}.*\tst1w\t\1, p[0-7], \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2w\t{z[0-9]+\.s - (z[0-9]+\.s)}.*\tst1w\t\1, p[0-7], \[x1, #1, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_s64.c +new file mode 100644 +index 000000000..e60d049fb +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_s64.c +@@ -0,0 +1,63 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** callee: ++** addvl sp, sp, #-1 ++** str p4, \[sp\] ++** ptrue p4\.b, 
all ++** ( ++** ld1d (z[0-9]+\.d), p4/z, \[x1, #1, mul vl\] ++** ld1d (z[0-9]+\.d), p4/z, \[x1\] ++** st2d {\2 - \1}, p0, \[x0\] ++** | ++** ld1d (z[0-9]+\.d), p4/z, \[x1\] ++** ld1d (z[0-9]+\.d), p4/z, \[x1, #1, mul vl\] ++** st2d {\3 - \4}, p0, \[x0\] ++** ) ++** st4d {z0\.d - z3\.d}, p1, \[x0\] ++** st3d {z4\.d - z6\.d}, p2, \[x0\] ++** st1d z7\.d, p3, \[x0\] ++** ldr p4, \[sp\] ++** addvl sp, sp, #1 ++** ret ++*/ ++void __attribute__((noipa)) ++callee (void *x0, svint64x4_t z0, svint64x3_t z4, svint64x2_t stack, ++ svint64_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) ++{ ++ svst2 (p0, x0, stack); ++ svst4 (p1, x0, z0); ++ svst3 (p2, x0, z4); ++ svst1_s64 (p3, x0, z7); ++} ++ ++void __attribute__((noipa)) ++caller (void *x0) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ callee (x0, ++ svld4_vnum_s64 (pg, x0, -8), ++ svld3_vnum_s64 (pg, x0, -3), ++ svld2_vnum_s64 (pg, x0, 0), ++ svld1_vnum_s64 (pg, x0, 2), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3), ++ svptrue_pat_b64 (SV_VL4)); ++} ++ ++/* { dg-final { scan-assembler {\tld4d\t{z0\.d - z3\.d}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld3d\t{z4\.d - z6\.d}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld1d\tz7\.d, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */ ++/* { dg-final { scan-assembler {\tld2d\t{(z[0-9]+\.d) - z[0-9]+\.d}.*\tst1d\t\1, p[0-7], \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2d\t{z[0-9]+\.d - (z[0-9]+\.d)}.*\tst1d\t\1, p[0-7], \[x1, #1, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_s8.c +new file mode 100644 +index 000000000..bc0058372 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_s8.c +@@ -0,0 +1,63 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** callee: ++** addvl sp, sp, #-1 ++** str p4, \[sp\] ++** ptrue p4\.b, all ++** ( ++** ld1b (z[0-9]+\.b), p4/z, \[x1, #1, mul vl\] ++** ld1b (z[0-9]+\.b), p4/z, \[x1\] ++** st2b {\2 - \1}, p0, \[x0\] ++** | ++** ld1b (z[0-9]+\.b), p4/z, \[x1\] ++** ld1b (z[0-9]+\.b), p4/z, \[x1, #1, mul vl\] ++** st2b {\3 - \4}, p0, \[x0\] ++** ) ++** st4b {z0\.b - z3\.b}, p1, \[x0\] ++** st3b {z4\.b - z6\.b}, p2, \[x0\] ++** st1b z7\.b, p3, \[x0\] ++** ldr p4, \[sp\] ++** addvl sp, sp, #1 ++** ret ++*/ ++void __attribute__((noipa)) ++callee (void *x0, svint8x4_t z0, svint8x3_t z4, svint8x2_t stack, ++ svint8_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) ++{ ++ svst2 (p0, x0, stack); ++ svst4 (p1, x0, z0); ++ svst3 (p2, x0, z4); ++ svst1_s8 (p3, x0, z7); ++} ++ ++void __attribute__((noipa)) ++caller (void *x0) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ callee (x0, ++ svld4_vnum_s8 (pg, x0, -8), ++ svld3_vnum_s8 (pg, x0, -3), ++ svld2_vnum_s8 (pg, x0, 0), ++ svld1_vnum_s8 (pg, x0, 2), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3), ++ svptrue_pat_b64 (SV_VL4)); ++} ++ ++/* { dg-final { scan-assembler {\tld4b\t{z0\.b - 
z3\.b}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld3b\t{z4\.b - z6\.b}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld1b\tz7\.b, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */ ++/* { dg-final { scan-assembler {\tld2b\t{(z[0-9]+\.b) - z[0-9]+\.b}.*\tst1b\t\1, p[0-7], \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2b\t{z[0-9]+\.b - (z[0-9]+\.b)}.*\tst1b\t\1, p[0-7], \[x1, #1, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_u16.c +new file mode 100644 +index 000000000..8aa651a41 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_u16.c +@@ -0,0 +1,63 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** callee: ++** addvl sp, sp, #-1 ++** str p4, \[sp\] ++** ptrue p4\.b, all ++** ( ++** ld1h (z[0-9]+\.h), p4/z, \[x1, #1, mul vl\] ++** ld1h (z[0-9]+\.h), p4/z, \[x1\] ++** st2h {\2 - \1}, p0, \[x0\] ++** | ++** ld1h (z[0-9]+\.h), p4/z, \[x1\] ++** ld1h (z[0-9]+\.h), p4/z, \[x1, #1, mul vl\] ++** st2h {\3 - \4}, p0, \[x0\] ++** ) ++** st4h {z0\.h - z3\.h}, p1, \[x0\] ++** st3h {z4\.h - z6\.h}, p2, \[x0\] ++** st1h z7\.h, p3, \[x0\] ++** ldr p4, \[sp\] ++** addvl sp, sp, #1 ++** ret ++*/ ++void __attribute__((noipa)) ++callee (void *x0, svuint16x4_t z0, svuint16x3_t z4, svuint16x2_t stack, ++ svuint16_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) ++{ ++ svst2 (p0, x0, stack); ++ svst4 (p1, x0, z0); ++ svst3 (p2, x0, z4); ++ svst1_u16 (p3, x0, z7); ++} ++ ++void __attribute__((noipa)) ++caller (void *x0) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ callee (x0, ++ svld4_vnum_u16 (pg, x0, -8), ++ svld3_vnum_u16 (pg, x0, -3), ++ svld2_vnum_u16 (pg, x0, 0), ++ svld1_vnum_u16 (pg, x0, 2), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3), ++ svptrue_pat_b64 (SV_VL4)); ++} ++ ++/* { dg-final { scan-assembler {\tld4h\t{z0\.h - z3\.h}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld3h\t{z4\.h - z6\.h}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld1h\tz7\.h, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */ ++/* { dg-final { scan-assembler {\tld2h\t{(z[0-9]+\.h) - z[0-9]+\.h}.*\tst1h\t\1, p[0-7], \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2h\t{z[0-9]+\.h - (z[0-9]+\.h)}.*\tst1h\t\1, p[0-7], \[x1, #1, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_u32.c +new file mode 100644 +index 000000000..9ea3066ed +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_u32.c +@@ -0,0 +1,63 @@ ++/* { dg-do compile { target lp64 } } */ 
++/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** callee: ++** addvl sp, sp, #-1 ++** str p4, \[sp\] ++** ptrue p4\.b, all ++** ( ++** ld1w (z[0-9]+\.s), p4/z, \[x1, #1, mul vl\] ++** ld1w (z[0-9]+\.s), p4/z, \[x1\] ++** st2w {\2 - \1}, p0, \[x0\] ++** | ++** ld1w (z[0-9]+\.s), p4/z, \[x1\] ++** ld1w (z[0-9]+\.s), p4/z, \[x1, #1, mul vl\] ++** st2w {\3 - \4}, p0, \[x0\] ++** ) ++** st4w {z0\.s - z3\.s}, p1, \[x0\] ++** st3w {z4\.s - z6\.s}, p2, \[x0\] ++** st1w z7\.s, p3, \[x0\] ++** ldr p4, \[sp\] ++** addvl sp, sp, #1 ++** ret ++*/ ++void __attribute__((noipa)) ++callee (void *x0, svuint32x4_t z0, svuint32x3_t z4, svuint32x2_t stack, ++ svuint32_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) ++{ ++ svst2 (p0, x0, stack); ++ svst4 (p1, x0, z0); ++ svst3 (p2, x0, z4); ++ svst1_u32 (p3, x0, z7); ++} ++ ++void __attribute__((noipa)) ++caller (void *x0) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ callee (x0, ++ svld4_vnum_u32 (pg, x0, -8), ++ svld3_vnum_u32 (pg, x0, -3), ++ svld2_vnum_u32 (pg, x0, 0), ++ svld1_vnum_u32 (pg, x0, 2), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3), ++ svptrue_pat_b64 (SV_VL4)); ++} ++ ++/* { dg-final { scan-assembler {\tld4w\t{z0\.s - z3\.s}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld3w\t{z4\.s - z6\.s}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld1w\tz7\.s, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */ ++/* { dg-final { scan-assembler {\tld2w\t{(z[0-9]+\.s) - z[0-9]+\.s}.*\tst1w\t\1, p[0-7], \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2w\t{z[0-9]+\.s - (z[0-9]+\.s)}.*\tst1w\t\1, p[0-7], \[x1, #1, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_u64.c +new file mode 100644 +index 000000000..b64f3b6d5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_u64.c +@@ -0,0 +1,63 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** callee: ++** addvl sp, sp, #-1 ++** str p4, \[sp\] ++** ptrue p4\.b, all ++** ( ++** ld1d (z[0-9]+\.d), p4/z, \[x1, #1, mul vl\] ++** ld1d (z[0-9]+\.d), p4/z, \[x1\] ++** st2d {\2 - \1}, p0, \[x0\] ++** | ++** ld1d (z[0-9]+\.d), p4/z, \[x1\] ++** ld1d (z[0-9]+\.d), p4/z, \[x1, #1, mul vl\] ++** st2d {\3 - \4}, p0, \[x0\] ++** ) ++** st4d {z0\.d - z3\.d}, p1, \[x0\] ++** st3d {z4\.d - z6\.d}, p2, \[x0\] ++** st1d z7\.d, p3, \[x0\] ++** ldr p4, \[sp\] ++** addvl sp, sp, #1 ++** ret ++*/ ++void __attribute__((noipa)) ++callee (void *x0, svuint64x4_t z0, svuint64x3_t z4, svuint64x2_t stack, ++ svuint64_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) ++{ ++ svst2 (p0, x0, stack); ++ svst4 (p1, x0, z0); ++ svst3 (p2, x0, z4); ++ svst1_u64 (p3, x0, z7); ++} ++ ++void __attribute__((noipa)) ++caller (void *x0) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ callee (x0, ++ svld4_vnum_u64 (pg, x0, -8), ++ 
svld3_vnum_u64 (pg, x0, -3), ++ svld2_vnum_u64 (pg, x0, 0), ++ svld1_vnum_u64 (pg, x0, 2), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3), ++ svptrue_pat_b64 (SV_VL4)); ++} ++ ++/* { dg-final { scan-assembler {\tld4d\t{z0\.d - z3\.d}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld3d\t{z4\.d - z6\.d}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld1d\tz7\.d, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */ ++/* { dg-final { scan-assembler {\tld2d\t{(z[0-9]+\.d) - z[0-9]+\.d}.*\tst1d\t\1, p[0-7], \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2d\t{z[0-9]+\.d - (z[0-9]+\.d)}.*\tst1d\t\1, p[0-7], \[x1, #1, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_u8.c +new file mode 100644 +index 000000000..5575673ae +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_u8.c +@@ -0,0 +1,63 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** callee: ++** addvl sp, sp, #-1 ++** str p4, \[sp\] ++** ptrue p4\.b, all ++** ( ++** ld1b (z[0-9]+\.b), p4/z, \[x1, #1, mul vl\] ++** ld1b (z[0-9]+\.b), p4/z, \[x1\] ++** st2b {\2 - \1}, p0, \[x0\] ++** | ++** ld1b (z[0-9]+\.b), p4/z, \[x1\] ++** ld1b (z[0-9]+\.b), p4/z, \[x1, #1, mul vl\] ++** st2b {\3 - \4}, p0, \[x0\] ++** ) ++** st4b {z0\.b - z3\.b}, p1, \[x0\] ++** st3b {z4\.b - z6\.b}, p2, \[x0\] ++** st1b z7\.b, p3, \[x0\] ++** ldr p4, \[sp\] ++** addvl sp, sp, #1 ++** ret ++*/ ++void __attribute__((noipa)) ++callee (void *x0, svuint8x4_t z0, svuint8x3_t z4, svuint8x2_t stack, ++ svuint8_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) ++{ ++ svst2 (p0, x0, stack); ++ svst4 (p1, x0, z0); ++ svst3 (p2, x0, z4); ++ svst1_u8 (p3, x0, z7); ++} ++ ++void __attribute__((noipa)) ++caller (void *x0) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ callee (x0, ++ svld4_vnum_u8 (pg, x0, -8), ++ svld3_vnum_u8 (pg, x0, -3), ++ svld2_vnum_u8 (pg, x0, 0), ++ svld1_vnum_u8 (pg, x0, 2), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3), ++ svptrue_pat_b64 (SV_VL4)); ++} ++ ++/* { dg-final { scan-assembler {\tld4b\t{z0\.b - z3\.b}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld3b\t{z4\.b - z6\.b}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld1b\tz7\.b, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */ ++/* { dg-final { scan-assembler {\tld2b\t{(z[0-9]+\.b) - z[0-9]+\.b}.*\tst1b\t\1, p[0-7], \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2b\t{z[0-9]+\.b - (z[0-9]+\.b)}.*\tst1b\t\1, p[0-7], \[x1, #1, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */ +diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_bf16.c +new file mode 100644 +index 000000000..94d84df4a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_bf16.c +@@ -0,0 +1,58 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** callee: ++** ( ++** ldr (z[0-9]+), \[x1, #1, mul vl\] ++** ldr (z[0-9]+), \[x1\] ++** st2h {\2\.h - \1\.h}, p0, \[x0\] ++** | ++** ldr (z[0-9]+), \[x1\] ++** ldr (z[0-9]+), \[x1, #1, mul vl\] ++** st2h {\3\.h - \4\.h}, p0, \[x0\] ++** ) ++** st4h {z0\.h - z3\.h}, p1, \[x0\] ++** st3h {z4\.h - z6\.h}, p2, \[x0\] ++** st1h z7\.h, p3, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee (void *x0, svbfloat16x4_t z0, svbfloat16x3_t z4, svbfloat16x2_t stack, ++ svbfloat16_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) ++{ ++ svst2 (p0, x0, stack); ++ svst4 (p1, x0, z0); ++ svst3 (p2, x0, z4); ++ svst1_bf16 (p3, x0, z7); ++} ++ ++void __attribute__((noipa)) ++caller (void *x0) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ callee (x0, ++ svld4_vnum_bf16 (pg, x0, -8), ++ svld3_vnum_bf16 (pg, x0, -3), ++ svld2_vnum_bf16 (pg, x0, 0), ++ svld1_vnum_bf16 (pg, x0, 2), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3), ++ svptrue_pat_b64 (SV_VL4)); ++} ++ ++/* { dg-final { scan-assembler {\tld4h\t{z0\.h - z3\.h}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld3h\t{z4\.h - z6\.h}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld1h\tz7\.h, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */ ++/* { dg-final { scan-assembler {\tld2h\t{(z[0-9]+)\.h - z[0-9]+\.h}.*\tstr\t\1, \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2h\t{z[0-9]+\.h - (z[0-9]+)\.h}.*\tstr\t\1, \[x1, #1, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_f16.c +new file mode 100644 +index 000000000..6271365c7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_f16.c +@@ -0,0 +1,58 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** callee: ++** ( ++** ldr (z[0-9]+), \[x1, #1, mul vl\] ++** ldr (z[0-9]+), \[x1\] ++** st2h {\2\.h - \1\.h}, p0, \[x0\] ++** | ++** ldr (z[0-9]+), \[x1\] ++** ldr (z[0-9]+), \[x1, #1, mul vl\] ++** st2h {\3\.h - \4\.h}, p0, \[x0\] ++** ) ++** st4h {z0\.h - z3\.h}, p1, \[x0\] ++** st3h {z4\.h - z6\.h}, p2, \[x0\] ++** st1h z7\.h, p3, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee (void *x0, svfloat16x4_t z0, svfloat16x3_t z4, svfloat16x2_t stack, ++ svfloat16_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) ++{ ++ svst2 (p0, x0, stack); ++ svst4 (p1, x0, z0); ++ svst3 (p2, x0, z4); ++ svst1_f16 (p3, x0, z7); ++} ++ ++void __attribute__((noipa)) ++caller (void *x0) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ callee 
(x0, ++ svld4_vnum_f16 (pg, x0, -8), ++ svld3_vnum_f16 (pg, x0, -3), ++ svld2_vnum_f16 (pg, x0, 0), ++ svld1_vnum_f16 (pg, x0, 2), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3), ++ svptrue_pat_b64 (SV_VL4)); ++} ++ ++/* { dg-final { scan-assembler {\tld4h\t{z0\.h - z3\.h}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld3h\t{z4\.h - z6\.h}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld1h\tz7\.h, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */ ++/* { dg-final { scan-assembler {\tld2h\t{(z[0-9]+)\.h - z[0-9]+\.h}.*\tstr\t\1, \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2h\t{z[0-9]+\.h - (z[0-9]+)\.h}.*\tstr\t\1, \[x1, #1, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_f32.c +new file mode 100644 +index 000000000..ef89de216 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_f32.c +@@ -0,0 +1,58 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** callee: ++** ( ++** ldr (z[0-9]+), \[x1, #1, mul vl\] ++** ldr (z[0-9]+), \[x1\] ++** st2w {\2\.s - \1\.s}, p0, \[x0\] ++** | ++** ldr (z[0-9]+), \[x1\] ++** ldr (z[0-9]+), \[x1, #1, mul vl\] ++** st2w {\3\.s - \4\.s}, p0, \[x0\] ++** ) ++** st4w {z0\.s - z3\.s}, p1, \[x0\] ++** st3w {z4\.s - z6\.s}, p2, \[x0\] ++** st1w z7\.s, p3, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee (void *x0, svfloat32x4_t z0, svfloat32x3_t z4, svfloat32x2_t stack, ++ svfloat32_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) ++{ ++ svst2 (p0, x0, stack); ++ svst4 (p1, x0, z0); ++ svst3 (p2, x0, z4); ++ svst1_f32 (p3, x0, z7); ++} ++ ++void __attribute__((noipa)) ++caller (void *x0) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ callee (x0, ++ svld4_vnum_f32 (pg, x0, -8), ++ svld3_vnum_f32 (pg, x0, -3), ++ svld2_vnum_f32 (pg, x0, 0), ++ svld1_vnum_f32 (pg, x0, 2), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3), ++ svptrue_pat_b64 (SV_VL4)); ++} ++ ++/* { dg-final { scan-assembler {\tld4w\t{z0\.s - z3\.s}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld3w\t{z4\.s - z6\.s}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld1w\tz7\.s, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */ ++/* { dg-final { scan-assembler {\tld2w\t{(z[0-9]+)\.s - z[0-9]+\.s}.*\tstr\t\1, \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2w\t{z[0-9]+\.s - (z[0-9]+)\.s}.*\tstr\t\1, \[x1, #1, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_f64.c +new file mode 100644 +index 000000000..4eddf2d1f 
+--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_f64.c +@@ -0,0 +1,58 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** callee: ++** ( ++** ldr (z[0-9]+), \[x1, #1, mul vl\] ++** ldr (z[0-9]+), \[x1\] ++** st2d {\2\.d - \1\.d}, p0, \[x0\] ++** | ++** ldr (z[0-9]+), \[x1\] ++** ldr (z[0-9]+), \[x1, #1, mul vl\] ++** st2d {\3\.d - \4\.d}, p0, \[x0\] ++** ) ++** st4d {z0\.d - z3\.d}, p1, \[x0\] ++** st3d {z4\.d - z6\.d}, p2, \[x0\] ++** st1d z7\.d, p3, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee (void *x0, svfloat64x4_t z0, svfloat64x3_t z4, svfloat64x2_t stack, ++ svfloat64_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) ++{ ++ svst2 (p0, x0, stack); ++ svst4 (p1, x0, z0); ++ svst3 (p2, x0, z4); ++ svst1_f64 (p3, x0, z7); ++} ++ ++void __attribute__((noipa)) ++caller (void *x0) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ callee (x0, ++ svld4_vnum_f64 (pg, x0, -8), ++ svld3_vnum_f64 (pg, x0, -3), ++ svld2_vnum_f64 (pg, x0, 0), ++ svld1_vnum_f64 (pg, x0, 2), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3), ++ svptrue_pat_b64 (SV_VL4)); ++} ++ ++/* { dg-final { scan-assembler {\tld4d\t{z0\.d - z3\.d}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld3d\t{z4\.d - z6\.d}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld1d\tz7\.d, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */ ++/* { dg-final { scan-assembler {\tld2d\t{(z[0-9]+)\.d - z[0-9]+\.d}.*\tstr\t\1, \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2d\t{z[0-9]+\.d - (z[0-9]+)\.d}.*\tstr\t\1, \[x1, #1, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_s16.c +new file mode 100644 +index 000000000..a4b6af071 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_s16.c +@@ -0,0 +1,58 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** callee: ++** ( ++** ldr (z[0-9]+), \[x1, #1, mul vl\] ++** ldr (z[0-9]+), \[x1\] ++** st2h {\2\.h - \1\.h}, p0, \[x0\] ++** | ++** ldr (z[0-9]+), \[x1\] ++** ldr (z[0-9]+), \[x1, #1, mul vl\] ++** st2h {\3\.h - \4\.h}, p0, \[x0\] ++** ) ++** st4h {z0\.h - z3\.h}, p1, \[x0\] ++** st3h {z4\.h - z6\.h}, p2, \[x0\] ++** st1h z7\.h, p3, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee (void *x0, svint16x4_t z0, svint16x3_t z4, svint16x2_t stack, ++ svint16_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) ++{ ++ svst2 (p0, x0, stack); ++ svst4 (p1, x0, z0); ++ svst3 (p2, x0, z4); ++ svst1_s16 (p3, x0, z7); ++} ++ ++void __attribute__((noipa)) ++caller (void *x0) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ callee (x0, ++ svld4_vnum_s16 (pg, x0, -8), ++ svld3_vnum_s16 (pg, x0, -3), ++ svld2_vnum_s16 (pg, x0, 0), ++ svld1_vnum_s16 (pg, x0, 2), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), 
++ svptrue_pat_b32 (SV_VL3), ++ svptrue_pat_b64 (SV_VL4)); ++} ++ ++/* { dg-final { scan-assembler {\tld4h\t{z0\.h - z3\.h}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld3h\t{z4\.h - z6\.h}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld1h\tz7\.h, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */ ++/* { dg-final { scan-assembler {\tld2h\t{(z[0-9]+)\.h - z[0-9]+\.h}.*\tstr\t\1, \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2h\t{z[0-9]+\.h - (z[0-9]+)\.h}.*\tstr\t\1, \[x1, #1, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_s32.c +new file mode 100644 +index 000000000..60b58d6fc +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_s32.c +@@ -0,0 +1,58 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** callee: ++** ( ++** ldr (z[0-9]+), \[x1, #1, mul vl\] ++** ldr (z[0-9]+), \[x1\] ++** st2w {\2\.s - \1\.s}, p0, \[x0\] ++** | ++** ldr (z[0-9]+), \[x1\] ++** ldr (z[0-9]+), \[x1, #1, mul vl\] ++** st2w {\3\.s - \4\.s}, p0, \[x0\] ++** ) ++** st4w {z0\.s - z3\.s}, p1, \[x0\] ++** st3w {z4\.s - z6\.s}, p2, \[x0\] ++** st1w z7\.s, p3, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee (void *x0, svint32x4_t z0, svint32x3_t z4, svint32x2_t stack, ++ svint32_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) ++{ ++ svst2 (p0, x0, stack); ++ svst4 (p1, x0, z0); ++ svst3 (p2, x0, z4); ++ svst1_s32 (p3, x0, z7); ++} ++ ++void __attribute__((noipa)) ++caller (void *x0) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ callee (x0, ++ svld4_vnum_s32 (pg, x0, -8), ++ svld3_vnum_s32 (pg, x0, -3), ++ svld2_vnum_s32 (pg, x0, 0), ++ svld1_vnum_s32 (pg, x0, 2), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3), ++ svptrue_pat_b64 (SV_VL4)); ++} ++ ++/* { dg-final { scan-assembler {\tld4w\t{z0\.s - z3\.s}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld3w\t{z4\.s - z6\.s}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld1w\tz7\.s, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */ ++/* { dg-final { scan-assembler {\tld2w\t{(z[0-9]+)\.s - z[0-9]+\.s}.*\tstr\t\1, \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2w\t{z[0-9]+\.s - (z[0-9]+)\.s}.*\tstr\t\1, \[x1, #1, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_s64.c +new file mode 100644 +index 000000000..b6126aa4c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_s64.c +@@ -0,0 +1,58 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -mlittle-endian 
-fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** callee: ++** ( ++** ldr (z[0-9]+), \[x1, #1, mul vl\] ++** ldr (z[0-9]+), \[x1\] ++** st2d {\2\.d - \1\.d}, p0, \[x0\] ++** | ++** ldr (z[0-9]+), \[x1\] ++** ldr (z[0-9]+), \[x1, #1, mul vl\] ++** st2d {\3\.d - \4\.d}, p0, \[x0\] ++** ) ++** st4d {z0\.d - z3\.d}, p1, \[x0\] ++** st3d {z4\.d - z6\.d}, p2, \[x0\] ++** st1d z7\.d, p3, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee (void *x0, svint64x4_t z0, svint64x3_t z4, svint64x2_t stack, ++ svint64_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) ++{ ++ svst2 (p0, x0, stack); ++ svst4 (p1, x0, z0); ++ svst3 (p2, x0, z4); ++ svst1_s64 (p3, x0, z7); ++} ++ ++void __attribute__((noipa)) ++caller (void *x0) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ callee (x0, ++ svld4_vnum_s64 (pg, x0, -8), ++ svld3_vnum_s64 (pg, x0, -3), ++ svld2_vnum_s64 (pg, x0, 0), ++ svld1_vnum_s64 (pg, x0, 2), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3), ++ svptrue_pat_b64 (SV_VL4)); ++} ++ ++/* { dg-final { scan-assembler {\tld4d\t{z0\.d - z3\.d}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld3d\t{z4\.d - z6\.d}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld1d\tz7\.d, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */ ++/* { dg-final { scan-assembler {\tld2d\t{(z[0-9]+)\.d - z[0-9]+\.d}.*\tstr\t\1, \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2d\t{z[0-9]+\.d - (z[0-9]+)\.d}.*\tstr\t\1, \[x1, #1, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_s8.c +new file mode 100644 +index 000000000..5c16c3c8f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_s8.c +@@ -0,0 +1,58 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** callee: ++** ( ++** ldr (z[0-9]+), \[x1, #1, mul vl\] ++** ldr (z[0-9]+), \[x1\] ++** st2b {\2\.b - \1\.b}, p0, \[x0\] ++** | ++** ldr (z[0-9]+), \[x1\] ++** ldr (z[0-9]+), \[x1, #1, mul vl\] ++** st2b {\3\.b - \4\.b}, p0, \[x0\] ++** ) ++** st4b {z0\.b - z3\.b}, p1, \[x0\] ++** st3b {z4\.b - z6\.b}, p2, \[x0\] ++** st1b z7\.b, p3, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee (void *x0, svint8x4_t z0, svint8x3_t z4, svint8x2_t stack, ++ svint8_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) ++{ ++ svst2 (p0, x0, stack); ++ svst4 (p1, x0, z0); ++ svst3 (p2, x0, z4); ++ svst1_s8 (p3, x0, z7); ++} ++ ++void __attribute__((noipa)) ++caller (void *x0) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ callee (x0, ++ svld4_vnum_s8 (pg, x0, -8), ++ svld3_vnum_s8 (pg, x0, -3), ++ svld2_vnum_s8 (pg, x0, 0), ++ svld1_vnum_s8 (pg, x0, 2), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3), ++ svptrue_pat_b64 (SV_VL4)); ++} ++ ++/* { dg-final { scan-assembler {\tld4b\t{z0\.b - z3\.b}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler 
{\tld3b\t{z4\.b - z6\.b}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld1b\tz7\.b, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */ ++/* { dg-final { scan-assembler {\tld2b\t{(z[0-9]+)\.b - z[0-9]+\.b}.*\tstr\t\1, \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2b\t{z[0-9]+\.b - (z[0-9]+)\.b}.*\tstr\t\1, \[x1, #1, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_u16.c +new file mode 100644 +index 000000000..2b9a90025 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_u16.c +@@ -0,0 +1,58 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** callee: ++** ( ++** ldr (z[0-9]+), \[x1, #1, mul vl\] ++** ldr (z[0-9]+), \[x1\] ++** st2h {\2\.h - \1\.h}, p0, \[x0\] ++** | ++** ldr (z[0-9]+), \[x1\] ++** ldr (z[0-9]+), \[x1, #1, mul vl\] ++** st2h {\3\.h - \4\.h}, p0, \[x0\] ++** ) ++** st4h {z0\.h - z3\.h}, p1, \[x0\] ++** st3h {z4\.h - z6\.h}, p2, \[x0\] ++** st1h z7\.h, p3, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee (void *x0, svuint16x4_t z0, svuint16x3_t z4, svuint16x2_t stack, ++ svuint16_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) ++{ ++ svst2 (p0, x0, stack); ++ svst4 (p1, x0, z0); ++ svst3 (p2, x0, z4); ++ svst1_u16 (p3, x0, z7); ++} ++ ++void __attribute__((noipa)) ++caller (void *x0) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ callee (x0, ++ svld4_vnum_u16 (pg, x0, -8), ++ svld3_vnum_u16 (pg, x0, -3), ++ svld2_vnum_u16 (pg, x0, 0), ++ svld1_vnum_u16 (pg, x0, 2), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3), ++ svptrue_pat_b64 (SV_VL4)); ++} ++ ++/* { dg-final { scan-assembler {\tld4h\t{z0\.h - z3\.h}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld3h\t{z4\.h - z6\.h}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld1h\tz7\.h, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */ ++/* { dg-final { scan-assembler {\tld2h\t{(z[0-9]+)\.h - z[0-9]+\.h}.*\tstr\t\1, \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2h\t{z[0-9]+\.h - (z[0-9]+)\.h}.*\tstr\t\1, \[x1, #1, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_u32.c +new file mode 100644 +index 000000000..2902f59b4 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_u32.c +@@ -0,0 +1,58 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** callee: ++** ( ++** ldr (z[0-9]+), \[x1, #1, mul vl\] ++** ldr 
(z[0-9]+), \[x1\] ++** st2w {\2\.s - \1\.s}, p0, \[x0\] ++** | ++** ldr (z[0-9]+), \[x1\] ++** ldr (z[0-9]+), \[x1, #1, mul vl\] ++** st2w {\3\.s - \4\.s}, p0, \[x0\] ++** ) ++** st4w {z0\.s - z3\.s}, p1, \[x0\] ++** st3w {z4\.s - z6\.s}, p2, \[x0\] ++** st1w z7\.s, p3, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee (void *x0, svuint32x4_t z0, svuint32x3_t z4, svuint32x2_t stack, ++ svuint32_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) ++{ ++ svst2 (p0, x0, stack); ++ svst4 (p1, x0, z0); ++ svst3 (p2, x0, z4); ++ svst1_u32 (p3, x0, z7); ++} ++ ++void __attribute__((noipa)) ++caller (void *x0) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ callee (x0, ++ svld4_vnum_u32 (pg, x0, -8), ++ svld3_vnum_u32 (pg, x0, -3), ++ svld2_vnum_u32 (pg, x0, 0), ++ svld1_vnum_u32 (pg, x0, 2), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3), ++ svptrue_pat_b64 (SV_VL4)); ++} ++ ++/* { dg-final { scan-assembler {\tld4w\t{z0\.s - z3\.s}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld3w\t{z4\.s - z6\.s}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld1w\tz7\.s, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */ ++/* { dg-final { scan-assembler {\tld2w\t{(z[0-9]+)\.s - z[0-9]+\.s}.*\tstr\t\1, \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2w\t{z[0-9]+\.s - (z[0-9]+)\.s}.*\tstr\t\1, \[x1, #1, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_u64.c +new file mode 100644 +index 000000000..85b3cfdad +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_u64.c +@@ -0,0 +1,58 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** callee: ++** ( ++** ldr (z[0-9]+), \[x1, #1, mul vl\] ++** ldr (z[0-9]+), \[x1\] ++** st2d {\2\.d - \1\.d}, p0, \[x0\] ++** | ++** ldr (z[0-9]+), \[x1\] ++** ldr (z[0-9]+), \[x1, #1, mul vl\] ++** st2d {\3\.d - \4\.d}, p0, \[x0\] ++** ) ++** st4d {z0\.d - z3\.d}, p1, \[x0\] ++** st3d {z4\.d - z6\.d}, p2, \[x0\] ++** st1d z7\.d, p3, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee (void *x0, svuint64x4_t z0, svuint64x3_t z4, svuint64x2_t stack, ++ svuint64_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) ++{ ++ svst2 (p0, x0, stack); ++ svst4 (p1, x0, z0); ++ svst3 (p2, x0, z4); ++ svst1_u64 (p3, x0, z7); ++} ++ ++void __attribute__((noipa)) ++caller (void *x0) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ callee (x0, ++ svld4_vnum_u64 (pg, x0, -8), ++ svld3_vnum_u64 (pg, x0, -3), ++ svld2_vnum_u64 (pg, x0, 0), ++ svld1_vnum_u64 (pg, x0, 2), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3), ++ svptrue_pat_b64 (SV_VL4)); ++} ++ ++/* { dg-final { scan-assembler {\tld4d\t{z0\.d - z3\.d}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld3d\t{z4\.d - z6\.d}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld1d\tz7\.d, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */ ++/* { dg-final { 
scan-assembler {\tmov\tx1, sp\n} } } */ ++/* { dg-final { scan-assembler {\tld2d\t{(z[0-9]+)\.d - z[0-9]+\.d}.*\tstr\t\1, \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2d\t{z[0-9]+\.d - (z[0-9]+)\.d}.*\tstr\t\1, \[x1, #1, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_u8.c +new file mode 100644 +index 000000000..f56acb693 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_u8.c +@@ -0,0 +1,58 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** callee: ++** ( ++** ldr (z[0-9]+), \[x1, #1, mul vl\] ++** ldr (z[0-9]+), \[x1\] ++** st2b {\2\.b - \1\.b}, p0, \[x0\] ++** | ++** ldr (z[0-9]+), \[x1\] ++** ldr (z[0-9]+), \[x1, #1, mul vl\] ++** st2b {\3\.b - \4\.b}, p0, \[x0\] ++** ) ++** st4b {z0\.b - z3\.b}, p1, \[x0\] ++** st3b {z4\.b - z6\.b}, p2, \[x0\] ++** st1b z7\.b, p3, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee (void *x0, svuint8x4_t z0, svuint8x3_t z4, svuint8x2_t stack, ++ svuint8_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) ++{ ++ svst2 (p0, x0, stack); ++ svst4 (p1, x0, z0); ++ svst3 (p2, x0, z4); ++ svst1_u8 (p3, x0, z7); ++} ++ ++void __attribute__((noipa)) ++caller (void *x0) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ callee (x0, ++ svld4_vnum_u8 (pg, x0, -8), ++ svld3_vnum_u8 (pg, x0, -3), ++ svld2_vnum_u8 (pg, x0, 0), ++ svld1_vnum_u8 (pg, x0, 2), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3), ++ svptrue_pat_b64 (SV_VL4)); ++} ++ ++/* { dg-final { scan-assembler {\tld4b\t{z0\.b - z3\.b}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld3b\t{z4\.b - z6\.b}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld1b\tz7\.b, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */ ++/* { dg-final { scan-assembler {\tld2b\t{(z[0-9]+)\.b - z[0-9]+\.b}.*\tstr\t\1, \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2b\t{z[0-9]+\.b - (z[0-9]+)\.b}.*\tstr\t\1, \[x1, #1, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_bf16.c +new file mode 100644 +index 000000000..84d2c406c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_bf16.c +@@ -0,0 +1,71 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** callee1: ++** ptrue p3\.b, all ++** ... ++** ld1h (z[0-9]+\.h), p3/z, \[x1, #3, mul vl\] ++** ... 
++** st4h {z[0-9]+\.h - \1}, p0, \[x0\] ++** st2h {z3\.h - z4\.h}, p1, \[x0\] ++** st3h {z5\.h - z7\.h}, p2, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee1 (void *x0, svbfloat16x3_t z0, svbfloat16x2_t z3, svbfloat16x3_t z5, ++ svbfloat16x4_t stack1, svbfloat16_t stack2, svbool_t p0, ++ svbool_t p1, svbool_t p2) ++{ ++ svst4_bf16 (p0, x0, stack1); ++ svst2_bf16 (p1, x0, z3); ++ svst3_bf16 (p2, x0, z5); ++} ++ ++/* ++** callee2: ++** ptrue p3\.b, all ++** ld1h (z[0-9]+\.h), p3/z, \[x2\] ++** st1h \1, p0, \[x0\] ++** st2h {z3\.h - z4\.h}, p1, \[x0\] ++** st3h {z0\.h - z2\.h}, p2, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee2 (void *x0, svbfloat16x3_t z0, svbfloat16x2_t z3, svbfloat16x3_t z5, ++ svbfloat16x4_t stack1, svbfloat16_t stack2, svbool_t p0, ++ svbool_t p1, svbool_t p2) ++{ ++ svst1_bf16 (p0, x0, stack2); ++ svst2_bf16 (p1, x0, z3); ++ svst3_bf16 (p2, x0, z0); ++} ++ ++void __attribute__((noipa)) ++caller (void *x0) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ callee1 (x0, ++ svld3_vnum_bf16 (pg, x0, -9), ++ svld2_vnum_bf16 (pg, x0, -2), ++ svld3_vnum_bf16 (pg, x0, 0), ++ svld4_vnum_bf16 (pg, x0, 8), ++ svld1_vnum_bf16 (pg, x0, 5), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3)); ++} ++ ++/* { dg-final { scan-assembler {\tld3h\t{z0\.h - z2\.h}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2h\t{z3\.h - z4\.h}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld3h\t{z5\.h - z7\.h}, p[0-7]/z, \[x0\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4h\t{(z[0-9]+\.h) - z[0-9]+\.h}.*\tst1h\t\1, p[0-7], \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4h\t{z[0-9]+\.h - (z[0-9]+\.h)}.*\tst1h\t\1, p[0-7], \[x1, #3, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld1h\t(z[0-9]+\.h), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1h\t\1, p[0-7], \[x2\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_f16.c +new file mode 100644 +index 000000000..dd4ccc3b2 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_f16.c +@@ -0,0 +1,71 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** callee1: ++** ptrue p3\.b, all ++** ... ++** ld1h (z[0-9]+\.h), p3/z, \[x1, #3, mul vl\] ++** ... 
++** st4h {z[0-9]+\.h - \1}, p0, \[x0\] ++** st2h {z3\.h - z4\.h}, p1, \[x0\] ++** st3h {z5\.h - z7\.h}, p2, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee1 (void *x0, svfloat16x3_t z0, svfloat16x2_t z3, svfloat16x3_t z5, ++ svfloat16x4_t stack1, svfloat16_t stack2, svbool_t p0, ++ svbool_t p1, svbool_t p2) ++{ ++ svst4_f16 (p0, x0, stack1); ++ svst2_f16 (p1, x0, z3); ++ svst3_f16 (p2, x0, z5); ++} ++ ++/* ++** callee2: ++** ptrue p3\.b, all ++** ld1h (z[0-9]+\.h), p3/z, \[x2\] ++** st1h \1, p0, \[x0\] ++** st2h {z3\.h - z4\.h}, p1, \[x0\] ++** st3h {z0\.h - z2\.h}, p2, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee2 (void *x0, svfloat16x3_t z0, svfloat16x2_t z3, svfloat16x3_t z5, ++ svfloat16x4_t stack1, svfloat16_t stack2, svbool_t p0, ++ svbool_t p1, svbool_t p2) ++{ ++ svst1_f16 (p0, x0, stack2); ++ svst2_f16 (p1, x0, z3); ++ svst3_f16 (p2, x0, z0); ++} ++ ++void __attribute__((noipa)) ++caller (void *x0) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ callee1 (x0, ++ svld3_vnum_f16 (pg, x0, -9), ++ svld2_vnum_f16 (pg, x0, -2), ++ svld3_vnum_f16 (pg, x0, 0), ++ svld4_vnum_f16 (pg, x0, 8), ++ svld1_vnum_f16 (pg, x0, 5), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3)); ++} ++ ++/* { dg-final { scan-assembler {\tld3h\t{z0\.h - z2\.h}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2h\t{z3\.h - z4\.h}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld3h\t{z5\.h - z7\.h}, p[0-7]/z, \[x0\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4h\t{(z[0-9]+\.h) - z[0-9]+\.h}.*\tst1h\t\1, p[0-7], \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4h\t{z[0-9]+\.h - (z[0-9]+\.h)}.*\tst1h\t\1, p[0-7], \[x1, #3, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld1h\t(z[0-9]+\.h), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1h\t\1, p[0-7], \[x2\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_f32.c +new file mode 100644 +index 000000000..26ea2a308 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_f32.c +@@ -0,0 +1,71 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** callee1: ++** ptrue p3\.b, all ++** ... ++** ld1w (z[0-9]+\.s), p3/z, \[x1, #3, mul vl\] ++** ... 
++** st4w {z[0-9]+\.s - \1}, p0, \[x0\] ++** st2w {z3\.s - z4\.s}, p1, \[x0\] ++** st3w {z5\.s - z7\.s}, p2, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee1 (void *x0, svfloat32x3_t z0, svfloat32x2_t z3, svfloat32x3_t z5, ++ svfloat32x4_t stack1, svfloat32_t stack2, svbool_t p0, ++ svbool_t p1, svbool_t p2) ++{ ++ svst4_f32 (p0, x0, stack1); ++ svst2_f32 (p1, x0, z3); ++ svst3_f32 (p2, x0, z5); ++} ++ ++/* ++** callee2: ++** ptrue p3\.b, all ++** ld1w (z[0-9]+\.s), p3/z, \[x2\] ++** st1w \1, p0, \[x0\] ++** st2w {z3\.s - z4\.s}, p1, \[x0\] ++** st3w {z0\.s - z2\.s}, p2, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee2 (void *x0, svfloat32x3_t z0, svfloat32x2_t z3, svfloat32x3_t z5, ++ svfloat32x4_t stack1, svfloat32_t stack2, svbool_t p0, ++ svbool_t p1, svbool_t p2) ++{ ++ svst1_f32 (p0, x0, stack2); ++ svst2_f32 (p1, x0, z3); ++ svst3_f32 (p2, x0, z0); ++} ++ ++void __attribute__((noipa)) ++caller (void *x0) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ callee1 (x0, ++ svld3_vnum_f32 (pg, x0, -9), ++ svld2_vnum_f32 (pg, x0, -2), ++ svld3_vnum_f32 (pg, x0, 0), ++ svld4_vnum_f32 (pg, x0, 8), ++ svld1_vnum_f32 (pg, x0, 5), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3)); ++} ++ ++/* { dg-final { scan-assembler {\tld3w\t{z0\.s - z2\.s}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2w\t{z3\.s - z4\.s}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld3w\t{z5\.s - z7\.s}, p[0-7]/z, \[x0\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4w\t{(z[0-9]+\.s) - z[0-9]+\.s}.*\tst1w\t\1, p[0-7], \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4w\t{z[0-9]+\.s - (z[0-9]+\.s)}.*\tst1w\t\1, p[0-7], \[x1, #3, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld1w\t(z[0-9]+\.s), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1w\t\1, p[0-7], \[x2\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_f64.c +new file mode 100644 +index 000000000..62aded51c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_f64.c +@@ -0,0 +1,71 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** callee1: ++** ptrue p3\.b, all ++** ... ++** ld1d (z[0-9]+\.d), p3/z, \[x1, #3, mul vl\] ++** ... 
++** st4d {z[0-9]+\.d - \1}, p0, \[x0\] ++** st2d {z3\.d - z4\.d}, p1, \[x0\] ++** st3d {z5\.d - z7\.d}, p2, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee1 (void *x0, svfloat64x3_t z0, svfloat64x2_t z3, svfloat64x3_t z5, ++ svfloat64x4_t stack1, svfloat64_t stack2, svbool_t p0, ++ svbool_t p1, svbool_t p2) ++{ ++ svst4_f64 (p0, x0, stack1); ++ svst2_f64 (p1, x0, z3); ++ svst3_f64 (p2, x0, z5); ++} ++ ++/* ++** callee2: ++** ptrue p3\.b, all ++** ld1d (z[0-9]+\.d), p3/z, \[x2\] ++** st1d \1, p0, \[x0\] ++** st2d {z3\.d - z4\.d}, p1, \[x0\] ++** st3d {z0\.d - z2\.d}, p2, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee2 (void *x0, svfloat64x3_t z0, svfloat64x2_t z3, svfloat64x3_t z5, ++ svfloat64x4_t stack1, svfloat64_t stack2, svbool_t p0, ++ svbool_t p1, svbool_t p2) ++{ ++ svst1_f64 (p0, x0, stack2); ++ svst2_f64 (p1, x0, z3); ++ svst3_f64 (p2, x0, z0); ++} ++ ++void __attribute__((noipa)) ++caller (void *x0) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ callee1 (x0, ++ svld3_vnum_f64 (pg, x0, -9), ++ svld2_vnum_f64 (pg, x0, -2), ++ svld3_vnum_f64 (pg, x0, 0), ++ svld4_vnum_f64 (pg, x0, 8), ++ svld1_vnum_f64 (pg, x0, 5), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3)); ++} ++ ++/* { dg-final { scan-assembler {\tld3d\t{z0\.d - z2\.d}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2d\t{z3\.d - z4\.d}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld3d\t{z5\.d - z7\.d}, p[0-7]/z, \[x0\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4d\t{(z[0-9]+\.d) - z[0-9]+\.d}.*\tst1d\t\1, p[0-7], \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4d\t{z[0-9]+\.d - (z[0-9]+\.d)}.*\tst1d\t\1, p[0-7], \[x1, #3, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld1d\t(z[0-9]+\.d), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1d\t\1, p[0-7], \[x2\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_s16.c +new file mode 100644 +index 000000000..204ef9a92 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_s16.c +@@ -0,0 +1,71 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** callee1: ++** ptrue p3\.b, all ++** ... ++** ld1h (z[0-9]+\.h), p3/z, \[x1, #3, mul vl\] ++** ... 
++** st4h {z[0-9]+\.h - \1}, p0, \[x0\] ++** st2h {z3\.h - z4\.h}, p1, \[x0\] ++** st3h {z5\.h - z7\.h}, p2, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee1 (void *x0, svint16x3_t z0, svint16x2_t z3, svint16x3_t z5, ++ svint16x4_t stack1, svint16_t stack2, svbool_t p0, ++ svbool_t p1, svbool_t p2) ++{ ++ svst4_s16 (p0, x0, stack1); ++ svst2_s16 (p1, x0, z3); ++ svst3_s16 (p2, x0, z5); ++} ++ ++/* ++** callee2: ++** ptrue p3\.b, all ++** ld1h (z[0-9]+\.h), p3/z, \[x2\] ++** st1h \1, p0, \[x0\] ++** st2h {z3\.h - z4\.h}, p1, \[x0\] ++** st3h {z0\.h - z2\.h}, p2, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee2 (void *x0, svint16x3_t z0, svint16x2_t z3, svint16x3_t z5, ++ svint16x4_t stack1, svint16_t stack2, svbool_t p0, ++ svbool_t p1, svbool_t p2) ++{ ++ svst1_s16 (p0, x0, stack2); ++ svst2_s16 (p1, x0, z3); ++ svst3_s16 (p2, x0, z0); ++} ++ ++void __attribute__((noipa)) ++caller (void *x0) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ callee1 (x0, ++ svld3_vnum_s16 (pg, x0, -9), ++ svld2_vnum_s16 (pg, x0, -2), ++ svld3_vnum_s16 (pg, x0, 0), ++ svld4_vnum_s16 (pg, x0, 8), ++ svld1_vnum_s16 (pg, x0, 5), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3)); ++} ++ ++/* { dg-final { scan-assembler {\tld3h\t{z0\.h - z2\.h}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2h\t{z3\.h - z4\.h}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld3h\t{z5\.h - z7\.h}, p[0-7]/z, \[x0\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4h\t{(z[0-9]+\.h) - z[0-9]+\.h}.*\tst1h\t\1, p[0-7], \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4h\t{z[0-9]+\.h - (z[0-9]+\.h)}.*\tst1h\t\1, p[0-7], \[x1, #3, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld1h\t(z[0-9]+\.h), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1h\t\1, p[0-7], \[x2\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_s32.c +new file mode 100644 +index 000000000..9ae4567a4 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_s32.c +@@ -0,0 +1,71 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** callee1: ++** ptrue p3\.b, all ++** ... ++** ld1w (z[0-9]+\.s), p3/z, \[x1, #3, mul vl\] ++** ... 
++** st4w {z[0-9]+\.s - \1\}, p0, \[x0\] ++** st2w {z3\.s - z4\.s}, p1, \[x0\] ++** st3w {z5\.s - z7\.s}, p2, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee1 (void *x0, svint32x3_t z0, svint32x2_t z3, svint32x3_t z5, ++ svint32x4_t stack1, svint32_t stack2, svbool_t p0, ++ svbool_t p1, svbool_t p2) ++{ ++ svst4_s32 (p0, x0, stack1); ++ svst2_s32 (p1, x0, z3); ++ svst3_s32 (p2, x0, z5); ++} ++ ++/* ++** callee2: ++** ptrue p3\.b, all ++** ld1w (z[0-9]+\.s), p3/z, \[x2\] ++** st1w \1, p0, \[x0\] ++** st2w {z3\.s - z4\.s}, p1, \[x0\] ++** st3w {z0\.s - z2\.s}, p2, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee2 (void *x0, svint32x3_t z0, svint32x2_t z3, svint32x3_t z5, ++ svint32x4_t stack1, svint32_t stack2, svbool_t p0, ++ svbool_t p1, svbool_t p2) ++{ ++ svst1_s32 (p0, x0, stack2); ++ svst2_s32 (p1, x0, z3); ++ svst3_s32 (p2, x0, z0); ++} ++ ++void __attribute__((noipa)) ++caller (void *x0) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ callee1 (x0, ++ svld3_vnum_s32 (pg, x0, -9), ++ svld2_vnum_s32 (pg, x0, -2), ++ svld3_vnum_s32 (pg, x0, 0), ++ svld4_vnum_s32 (pg, x0, 8), ++ svld1_vnum_s32 (pg, x0, 5), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3)); ++} ++ ++/* { dg-final { scan-assembler {\tld3w\t{z0\.s - z2\.s}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2w\t{z3\.s - z4\.s}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld3w\t{z5\.s - z7\.s}, p[0-7]/z, \[x0\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4w\t{(z[0-9]+\.s) - z[0-9]+\.s}.*\tst1w\t\1, p[0-7], \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4w\t{z[0-9]+\.s - (z[0-9]+\.s)}.*\tst1w\t\1, p[0-7], \[x1, #3, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld1w\t(z[0-9]+\.s), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1w\t\1, p[0-7], \[x2\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_s64.c +new file mode 100644 +index 000000000..0b8a2e213 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_s64.c +@@ -0,0 +1,71 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** callee1: ++** ptrue p3\.b, all ++** ... ++** ld1d (z[0-9]+\.d), p3/z, \[x1, #3, mul vl\] ++** ... 
++** st4d {z[0-9]+\.d - \1}, p0, \[x0\] ++** st2d {z3\.d - z4\.d}, p1, \[x0\] ++** st3d {z5\.d - z7\.d}, p2, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee1 (void *x0, svint64x3_t z0, svint64x2_t z3, svint64x3_t z5, ++ svint64x4_t stack1, svint64_t stack2, svbool_t p0, ++ svbool_t p1, svbool_t p2) ++{ ++ svst4_s64 (p0, x0, stack1); ++ svst2_s64 (p1, x0, z3); ++ svst3_s64 (p2, x0, z5); ++} ++ ++/* ++** callee2: ++** ptrue p3\.b, all ++** ld1d (z[0-9]+\.d), p3/z, \[x2\] ++** st1d \1, p0, \[x0\] ++** st2d {z3\.d - z4\.d}, p1, \[x0\] ++** st3d {z0\.d - z2\.d}, p2, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee2 (void *x0, svint64x3_t z0, svint64x2_t z3, svint64x3_t z5, ++ svint64x4_t stack1, svint64_t stack2, svbool_t p0, ++ svbool_t p1, svbool_t p2) ++{ ++ svst1_s64 (p0, x0, stack2); ++ svst2_s64 (p1, x0, z3); ++ svst3_s64 (p2, x0, z0); ++} ++ ++void __attribute__((noipa)) ++caller (void *x0) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ callee1 (x0, ++ svld3_vnum_s64 (pg, x0, -9), ++ svld2_vnum_s64 (pg, x0, -2), ++ svld3_vnum_s64 (pg, x0, 0), ++ svld4_vnum_s64 (pg, x0, 8), ++ svld1_vnum_s64 (pg, x0, 5), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3)); ++} ++ ++/* { dg-final { scan-assembler {\tld3d\t{z0\.d - z2\.d}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2d\t{z3\.d - z4\.d}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld3d\t{z5\.d - z7\.d}, p[0-7]/z, \[x0\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4d\t{(z[0-9]+\.d) - z[0-9]+\.d}.*\tst1d\t\1, p[0-7], \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4d\t{z[0-9]+\.d - (z[0-9]+\.d)}.*\tst1d\t\1, p[0-7], \[x1, #3, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld1d\t(z[0-9]+\.d), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1d\t\1, p[0-7], \[x2\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_s8.c +new file mode 100644 +index 000000000..0afbe71aa +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_s8.c +@@ -0,0 +1,71 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** callee1: ++** ptrue p3\.b, all ++** ... ++** ld1b (z[0-9]+\.b), p3/z, \[x1, #3, mul vl\] ++** ... 
++** st4b {z[0-9]+\.b - \1}, p0, \[x0\] ++** st2b {z3\.b - z4\.b}, p1, \[x0\] ++** st3b {z5\.b - z7\.b}, p2, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee1 (void *x0, svint8x3_t z0, svint8x2_t z3, svint8x3_t z5, ++ svint8x4_t stack1, svint8_t stack2, svbool_t p0, ++ svbool_t p1, svbool_t p2) ++{ ++ svst4_s8 (p0, x0, stack1); ++ svst2_s8 (p1, x0, z3); ++ svst3_s8 (p2, x0, z5); ++} ++ ++/* ++** callee2: ++** ptrue p3\.b, all ++** ld1b (z[0-9]+\.b), p3/z, \[x2\] ++** st1b \1, p0, \[x0\] ++** st2b {z3\.b - z4\.b}, p1, \[x0\] ++** st3b {z0\.b - z2\.b}, p2, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee2 (void *x0, svint8x3_t z0, svint8x2_t z3, svint8x3_t z5, ++ svint8x4_t stack1, svint8_t stack2, svbool_t p0, ++ svbool_t p1, svbool_t p2) ++{ ++ svst1_s8 (p0, x0, stack2); ++ svst2_s8 (p1, x0, z3); ++ svst3_s8 (p2, x0, z0); ++} ++ ++void __attribute__((noipa)) ++caller (void *x0) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ callee1 (x0, ++ svld3_vnum_s8 (pg, x0, -9), ++ svld2_vnum_s8 (pg, x0, -2), ++ svld3_vnum_s8 (pg, x0, 0), ++ svld4_vnum_s8 (pg, x0, 8), ++ svld1_vnum_s8 (pg, x0, 5), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3)); ++} ++ ++/* { dg-final { scan-assembler {\tld3b\t{z0\.b - z2\.b}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2b\t{z3\.b - z4\.b}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld3b\t{z5\.b - z7\.b}, p[0-7]/z, \[x0\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4b\t{(z[0-9]+\.b) - z[0-9]+\.b}.*\tst1b\t\1, p[0-7], \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4b\t{z[0-9]+\.b - (z[0-9]+\.b)}.*\tst1b\t\1, p[0-7], \[x1, #3, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld1b\t(z[0-9]+\.b), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1b\t\1, p[0-7], \[x2\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_u16.c +new file mode 100644 +index 000000000..f010f5ebb +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_u16.c +@@ -0,0 +1,71 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** callee1: ++** ptrue p3\.b, all ++** ... ++** ld1h (z[0-9]+\.h), p3/z, \[x1, #3, mul vl\] ++** ... 
++** st4h {z[0-9]+\.h - \1}, p0, \[x0\] ++** st2h {z3\.h - z4\.h}, p1, \[x0\] ++** st3h {z5\.h - z7\.h}, p2, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee1 (void *x0, svuint16x3_t z0, svuint16x2_t z3, svuint16x3_t z5, ++ svuint16x4_t stack1, svuint16_t stack2, svbool_t p0, ++ svbool_t p1, svbool_t p2) ++{ ++ svst4_u16 (p0, x0, stack1); ++ svst2_u16 (p1, x0, z3); ++ svst3_u16 (p2, x0, z5); ++} ++ ++/* ++** callee2: ++** ptrue p3\.b, all ++** ld1h (z[0-9]+\.h), p3/z, \[x2\] ++** st1h \1, p0, \[x0\] ++** st2h {z3\.h - z4\.h}, p1, \[x0\] ++** st3h {z0\.h - z2\.h}, p2, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee2 (void *x0, svuint16x3_t z0, svuint16x2_t z3, svuint16x3_t z5, ++ svuint16x4_t stack1, svuint16_t stack2, svbool_t p0, ++ svbool_t p1, svbool_t p2) ++{ ++ svst1_u16 (p0, x0, stack2); ++ svst2_u16 (p1, x0, z3); ++ svst3_u16 (p2, x0, z0); ++} ++ ++void __attribute__((noipa)) ++caller (void *x0) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ callee1 (x0, ++ svld3_vnum_u16 (pg, x0, -9), ++ svld2_vnum_u16 (pg, x0, -2), ++ svld3_vnum_u16 (pg, x0, 0), ++ svld4_vnum_u16 (pg, x0, 8), ++ svld1_vnum_u16 (pg, x0, 5), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3)); ++} ++ ++/* { dg-final { scan-assembler {\tld3h\t{z0\.h - z2\.h}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2h\t{z3\.h - z4\.h}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld3h\t{z5\.h - z7\.h}, p[0-7]/z, \[x0\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4h\t{(z[0-9]+\.h) - z[0-9]+\.h}.*\tst1h\t\1, p[0-7], \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4h\t{z[0-9]+\.h - (z[0-9]+\.h)}.*\tst1h\t\1, p[0-7], \[x1, #3, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld1h\t(z[0-9]+\.h), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1h\t\1, p[0-7], \[x2\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_u32.c +new file mode 100644 +index 000000000..60d903a31 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_u32.c +@@ -0,0 +1,71 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** callee1: ++** ptrue p3\.b, all ++** ... ++** ld1w (z[0-9]+\.s), p3/z, \[x1, #3, mul vl\] ++** ... 
++** st4w {z[0-9]+\.s - \1}, p0, \[x0\] ++** st2w {z3\.s - z4\.s}, p1, \[x0\] ++** st3w {z5\.s - z7\.s}, p2, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee1 (void *x0, svuint32x3_t z0, svuint32x2_t z3, svuint32x3_t z5, ++ svuint32x4_t stack1, svuint32_t stack2, svbool_t p0, ++ svbool_t p1, svbool_t p2) ++{ ++ svst4_u32 (p0, x0, stack1); ++ svst2_u32 (p1, x0, z3); ++ svst3_u32 (p2, x0, z5); ++} ++ ++/* ++** callee2: ++** ptrue p3\.b, all ++** ld1w (z[0-9]+\.s), p3/z, \[x2\] ++** st1w \1, p0, \[x0\] ++** st2w {z3\.s - z4\.s}, p1, \[x0\] ++** st3w {z0\.s - z2\.s}, p2, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee2 (void *x0, svuint32x3_t z0, svuint32x2_t z3, svuint32x3_t z5, ++ svuint32x4_t stack1, svuint32_t stack2, svbool_t p0, ++ svbool_t p1, svbool_t p2) ++{ ++ svst1_u32 (p0, x0, stack2); ++ svst2_u32 (p1, x0, z3); ++ svst3_u32 (p2, x0, z0); ++} ++ ++void __attribute__((noipa)) ++caller (void *x0) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ callee1 (x0, ++ svld3_vnum_u32 (pg, x0, -9), ++ svld2_vnum_u32 (pg, x0, -2), ++ svld3_vnum_u32 (pg, x0, 0), ++ svld4_vnum_u32 (pg, x0, 8), ++ svld1_vnum_u32 (pg, x0, 5), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3)); ++} ++ ++/* { dg-final { scan-assembler {\tld3w\t{z0\.s - z2\.s}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2w\t{z3\.s - z4\.s}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld3w\t{z5\.s - z7\.s}, p[0-7]/z, \[x0\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4w\t{(z[0-9]+\.s) - z[0-9]+\.s}.*\tst1w\t\1, p[0-7], \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4w\t{z[0-9]+\.s - (z[0-9]+\.s)}.*\tst1w\t\1, p[0-7], \[x1, #3, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld1w\t(z[0-9]+\.s), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1w\t\1, p[0-7], \[x2\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_u64.c +new file mode 100644 +index 000000000..948f426f9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_u64.c +@@ -0,0 +1,71 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** callee1: ++** ptrue p3\.b, all ++** ... ++** ld1d (z[0-9]+\.d), p3/z, \[x1, #3, mul vl\] ++** ... 
++** st4d {z[0-9]+\.d - \1}, p0, \[x0\] ++** st2d {z3\.d - z4\.d}, p1, \[x0\] ++** st3d {z5\.d - z7\.d}, p2, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee1 (void *x0, svuint64x3_t z0, svuint64x2_t z3, svuint64x3_t z5, ++ svuint64x4_t stack1, svuint64_t stack2, svbool_t p0, ++ svbool_t p1, svbool_t p2) ++{ ++ svst4_u64 (p0, x0, stack1); ++ svst2_u64 (p1, x0, z3); ++ svst3_u64 (p2, x0, z5); ++} ++ ++/* ++** callee2: ++** ptrue p3\.b, all ++** ld1d (z[0-9]+\.d), p3/z, \[x2\] ++** st1d \1, p0, \[x0\] ++** st2d {z3\.d - z4\.d}, p1, \[x0\] ++** st3d {z0\.d - z2\.d}, p2, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee2 (void *x0, svuint64x3_t z0, svuint64x2_t z3, svuint64x3_t z5, ++ svuint64x4_t stack1, svuint64_t stack2, svbool_t p0, ++ svbool_t p1, svbool_t p2) ++{ ++ svst1_u64 (p0, x0, stack2); ++ svst2_u64 (p1, x0, z3); ++ svst3_u64 (p2, x0, z0); ++} ++ ++void __attribute__((noipa)) ++caller (void *x0) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ callee1 (x0, ++ svld3_vnum_u64 (pg, x0, -9), ++ svld2_vnum_u64 (pg, x0, -2), ++ svld3_vnum_u64 (pg, x0, 0), ++ svld4_vnum_u64 (pg, x0, 8), ++ svld1_vnum_u64 (pg, x0, 5), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3)); ++} ++ ++/* { dg-final { scan-assembler {\tld3d\t{z0\.d - z2\.d}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2d\t{z3\.d - z4\.d}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld3d\t{z5\.d - z7\.d}, p[0-7]/z, \[x0\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4d\t{(z[0-9]+\.d) - z[0-9]+\.d}.*\tst1d\t\1, p[0-7], \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4d\t{z[0-9]+\.d - (z[0-9]+\.d)}.*\tst1d\t\1, p[0-7], \[x1, #3, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld1d\t(z[0-9]+\.d), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1d\t\1, p[0-7], \[x2\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_u8.c +new file mode 100644 +index 000000000..8049ec078 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_u8.c +@@ -0,0 +1,71 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** callee1: ++** ptrue p3\.b, all ++** ... ++** ld1b (z[0-9]+\.b), p3/z, \[x1, #3, mul vl\] ++** ... 
++** st4b {z[0-9]+\.b - \1}, p0, \[x0\] ++** st2b {z3\.b - z4\.b}, p1, \[x0\] ++** st3b {z5\.b - z7\.b}, p2, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee1 (void *x0, svuint8x3_t z0, svuint8x2_t z3, svuint8x3_t z5, ++ svuint8x4_t stack1, svuint8_t stack2, svbool_t p0, ++ svbool_t p1, svbool_t p2) ++{ ++ svst4_u8 (p0, x0, stack1); ++ svst2_u8 (p1, x0, z3); ++ svst3_u8 (p2, x0, z5); ++} ++ ++/* ++** callee2: ++** ptrue p3\.b, all ++** ld1b (z[0-9]+\.b), p3/z, \[x2\] ++** st1b \1, p0, \[x0\] ++** st2b {z3\.b - z4\.b}, p1, \[x0\] ++** st3b {z0\.b - z2\.b}, p2, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee2 (void *x0, svuint8x3_t z0, svuint8x2_t z3, svuint8x3_t z5, ++ svuint8x4_t stack1, svuint8_t stack2, svbool_t p0, ++ svbool_t p1, svbool_t p2) ++{ ++ svst1_u8 (p0, x0, stack2); ++ svst2_u8 (p1, x0, z3); ++ svst3_u8 (p2, x0, z0); ++} ++ ++void __attribute__((noipa)) ++caller (void *x0) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ callee1 (x0, ++ svld3_vnum_u8 (pg, x0, -9), ++ svld2_vnum_u8 (pg, x0, -2), ++ svld3_vnum_u8 (pg, x0, 0), ++ svld4_vnum_u8 (pg, x0, 8), ++ svld1_vnum_u8 (pg, x0, 5), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3)); ++} ++ ++/* { dg-final { scan-assembler {\tld3b\t{z0\.b - z2\.b}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2b\t{z3\.b - z4\.b}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld3b\t{z5\.b - z7\.b}, p[0-7]/z, \[x0\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4b\t{(z[0-9]+\.b) - z[0-9]+\.b}.*\tst1b\t\1, p[0-7], \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4b\t{z[0-9]+\.b - (z[0-9]+\.b)}.*\tst1b\t\1, p[0-7], \[x1, #3, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld1b\t(z[0-9]+\.b), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1b\t\1, p[0-7], \[x2\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_bf16.c +new file mode 100644 +index 000000000..3dc9e42ed +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_bf16.c +@@ -0,0 +1,70 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** callee1: ++** ... ++** ldr (z[0-9]+), \[x1, #3, mul vl\] ++** ... 
++** st4h {z[0-9]+\.h - \1\.h}, p0, \[x0\] ++** st2h {z3\.h - z4\.h}, p1, \[x0\] ++** st3h {z5\.h - z7\.h}, p2, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee1 (void *x0, svbfloat16x3_t z0, svbfloat16x2_t z3, svbfloat16x3_t z5, ++ svbfloat16x4_t stack1, svbfloat16_t stack2, svbool_t p0, ++ svbool_t p1, svbool_t p2) ++{ ++ svst4_bf16 (p0, x0, stack1); ++ svst2_bf16 (p1, x0, z3); ++ svst3_bf16 (p2, x0, z5); ++} ++ ++/* ++** callee2: ++** ptrue p3\.b, all ++** ld1h (z[0-9]+\.h), p3/z, \[x2\] ++** st1h \1, p0, \[x0\] ++** st2h {z3\.h - z4\.h}, p1, \[x0\] ++** st3h {z0\.h - z2\.h}, p2, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee2 (void *x0, svbfloat16x3_t z0, svbfloat16x2_t z3, svbfloat16x3_t z5, ++ svbfloat16x4_t stack1, svbfloat16_t stack2, svbool_t p0, ++ svbool_t p1, svbool_t p2) ++{ ++ svst1_bf16 (p0, x0, stack2); ++ svst2_bf16 (p1, x0, z3); ++ svst3_bf16 (p2, x0, z0); ++} ++ ++void __attribute__((noipa)) ++caller (void *x0) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ callee1 (x0, ++ svld3_vnum_bf16 (pg, x0, -9), ++ svld2_vnum_bf16 (pg, x0, -2), ++ svld3_vnum_bf16 (pg, x0, 0), ++ svld4_vnum_bf16 (pg, x0, 8), ++ svld1_vnum_bf16 (pg, x0, 5), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3)); ++} ++ ++/* { dg-final { scan-assembler {\tld3h\t{z0\.h - z2\.h}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2h\t{z3\.h - z4\.h}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld3h\t{z5\.h - z7\.h}, p[0-7]/z, \[x0\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4h\t{(z[0-9]+)\.h - z[0-9]+\.h}.*\tstr\t\1, \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4h\t{z[0-9]+\.h - (z[0-9]+)\.h}.*\tstr\t\1, \[x1, #3, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld1h\t(z[0-9]+\.h), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1h\t\1, p[0-7], \[x2\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_f16.c +new file mode 100644 +index 000000000..80a2e3aae +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_f16.c +@@ -0,0 +1,70 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** callee1: ++** ... ++** ldr (z[0-9]+), \[x1, #3, mul vl\] ++** ... 
++** st4h {z[0-9]+\.h - \1\.h}, p0, \[x0\] ++** st2h {z3\.h - z4\.h}, p1, \[x0\] ++** st3h {z5\.h - z7\.h}, p2, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee1 (void *x0, svfloat16x3_t z0, svfloat16x2_t z3, svfloat16x3_t z5, ++ svfloat16x4_t stack1, svfloat16_t stack2, svbool_t p0, ++ svbool_t p1, svbool_t p2) ++{ ++ svst4_f16 (p0, x0, stack1); ++ svst2_f16 (p1, x0, z3); ++ svst3_f16 (p2, x0, z5); ++} ++ ++/* ++** callee2: ++** ptrue p3\.b, all ++** ld1h (z[0-9]+\.h), p3/z, \[x2\] ++** st1h \1, p0, \[x0\] ++** st2h {z3\.h - z4\.h}, p1, \[x0\] ++** st3h {z0\.h - z2\.h}, p2, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee2 (void *x0, svfloat16x3_t z0, svfloat16x2_t z3, svfloat16x3_t z5, ++ svfloat16x4_t stack1, svfloat16_t stack2, svbool_t p0, ++ svbool_t p1, svbool_t p2) ++{ ++ svst1_f16 (p0, x0, stack2); ++ svst2_f16 (p1, x0, z3); ++ svst3_f16 (p2, x0, z0); ++} ++ ++void __attribute__((noipa)) ++caller (void *x0) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ callee1 (x0, ++ svld3_vnum_f16 (pg, x0, -9), ++ svld2_vnum_f16 (pg, x0, -2), ++ svld3_vnum_f16 (pg, x0, 0), ++ svld4_vnum_f16 (pg, x0, 8), ++ svld1_vnum_f16 (pg, x0, 5), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3)); ++} ++ ++/* { dg-final { scan-assembler {\tld3h\t{z0\.h - z2\.h}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2h\t{z3\.h - z4\.h}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld3h\t{z5\.h - z7\.h}, p[0-7]/z, \[x0\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4h\t{(z[0-9]+)\.h - z[0-9]+\.h}.*\tstr\t\1, \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4h\t{z[0-9]+\.h - (z[0-9]+)\.h}.*\tstr\t\1, \[x1, #3, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld1h\t(z[0-9]+\.h), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1h\t\1, p[0-7], \[x2\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_f32.c +new file mode 100644 +index 000000000..40ff42128 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_f32.c +@@ -0,0 +1,70 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** callee1: ++** ... ++** ldr (z[0-9]+), \[x1, #3, mul vl\] ++** ... 
++** st4w {z[0-9]+\.s - \1\.s}, p0, \[x0\] ++** st2w {z3\.s - z4\.s}, p1, \[x0\] ++** st3w {z5\.s - z7\.s}, p2, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee1 (void *x0, svfloat32x3_t z0, svfloat32x2_t z3, svfloat32x3_t z5, ++ svfloat32x4_t stack1, svfloat32_t stack2, svbool_t p0, ++ svbool_t p1, svbool_t p2) ++{ ++ svst4_f32 (p0, x0, stack1); ++ svst2_f32 (p1, x0, z3); ++ svst3_f32 (p2, x0, z5); ++} ++ ++/* ++** callee2: ++** ptrue p3\.b, all ++** ld1w (z[0-9]+\.s), p3/z, \[x2\] ++** st1w \1, p0, \[x0\] ++** st2w {z3\.s - z4\.s}, p1, \[x0\] ++** st3w {z0\.s - z2\.s}, p2, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee2 (void *x0, svfloat32x3_t z0, svfloat32x2_t z3, svfloat32x3_t z5, ++ svfloat32x4_t stack1, svfloat32_t stack2, svbool_t p0, ++ svbool_t p1, svbool_t p2) ++{ ++ svst1_f32 (p0, x0, stack2); ++ svst2_f32 (p1, x0, z3); ++ svst3_f32 (p2, x0, z0); ++} ++ ++void __attribute__((noipa)) ++caller (void *x0) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ callee1 (x0, ++ svld3_vnum_f32 (pg, x0, -9), ++ svld2_vnum_f32 (pg, x0, -2), ++ svld3_vnum_f32 (pg, x0, 0), ++ svld4_vnum_f32 (pg, x0, 8), ++ svld1_vnum_f32 (pg, x0, 5), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3)); ++} ++ ++/* { dg-final { scan-assembler {\tld3w\t{z0\.s - z2\.s}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2w\t{z3\.s - z4\.s}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld3w\t{z5\.s - z7\.s}, p[0-7]/z, \[x0\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4w\t{(z[0-9]+)\.s - z[0-9]+\.s}.*\tstr\t\1, \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4w\t{z[0-9]+\.s - (z[0-9]+)\.s}.*\tstr\t\1, \[x1, #3, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld1w\t(z[0-9]+\.s), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1w\t\1, p[0-7], \[x2\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_f64.c +new file mode 100644 +index 000000000..ee219ccdc +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_f64.c +@@ -0,0 +1,70 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** callee1: ++** ... ++** ldr (z[0-9]+), \[x1, #3, mul vl\] ++** ... 
++** st4d {z[0-9]+\.d - \1\.d}, p0, \[x0\] ++** st2d {z3\.d - z4\.d}, p1, \[x0\] ++** st3d {z5\.d - z7\.d}, p2, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee1 (void *x0, svfloat64x3_t z0, svfloat64x2_t z3, svfloat64x3_t z5, ++ svfloat64x4_t stack1, svfloat64_t stack2, svbool_t p0, ++ svbool_t p1, svbool_t p2) ++{ ++ svst4_f64 (p0, x0, stack1); ++ svst2_f64 (p1, x0, z3); ++ svst3_f64 (p2, x0, z5); ++} ++ ++/* ++** callee2: ++** ptrue p3\.b, all ++** ld1d (z[0-9]+\.d), p3/z, \[x2\] ++** st1d \1, p0, \[x0\] ++** st2d {z3\.d - z4\.d}, p1, \[x0\] ++** st3d {z0\.d - z2\.d}, p2, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee2 (void *x0, svfloat64x3_t z0, svfloat64x2_t z3, svfloat64x3_t z5, ++ svfloat64x4_t stack1, svfloat64_t stack2, svbool_t p0, ++ svbool_t p1, svbool_t p2) ++{ ++ svst1_f64 (p0, x0, stack2); ++ svst2_f64 (p1, x0, z3); ++ svst3_f64 (p2, x0, z0); ++} ++ ++void __attribute__((noipa)) ++caller (void *x0) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ callee1 (x0, ++ svld3_vnum_f64 (pg, x0, -9), ++ svld2_vnum_f64 (pg, x0, -2), ++ svld3_vnum_f64 (pg, x0, 0), ++ svld4_vnum_f64 (pg, x0, 8), ++ svld1_vnum_f64 (pg, x0, 5), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3)); ++} ++ ++/* { dg-final { scan-assembler {\tld3d\t{z0\.d - z2\.d}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2d\t{z3\.d - z4\.d}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld3d\t{z5\.d - z7\.d}, p[0-7]/z, \[x0\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4d\t{(z[0-9]+)\.d - z[0-9]+\.d}.*\tstr\t\1, \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4d\t{z[0-9]+\.d - (z[0-9]+)\.d}.*\tstr\t\1, \[x1, #3, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld1d\t(z[0-9]+\.d), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1d\t\1, p[0-7], \[x2\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_s16.c +new file mode 100644 +index 000000000..ade75cb34 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_s16.c +@@ -0,0 +1,70 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** callee1: ++** ... ++** ldr (z[0-9]+), \[x1, #3, mul vl\] ++** ... 
++** st4h {z[0-9]+\.h - \1\.h}, p0, \[x0\] ++** st2h {z3\.h - z4\.h}, p1, \[x0\] ++** st3h {z5\.h - z7\.h}, p2, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee1 (void *x0, svint16x3_t z0, svint16x2_t z3, svint16x3_t z5, ++ svint16x4_t stack1, svint16_t stack2, svbool_t p0, ++ svbool_t p1, svbool_t p2) ++{ ++ svst4_s16 (p0, x0, stack1); ++ svst2_s16 (p1, x0, z3); ++ svst3_s16 (p2, x0, z5); ++} ++ ++/* ++** callee2: ++** ptrue p3\.b, all ++** ld1h (z[0-9]+\.h), p3/z, \[x2\] ++** st1h \1, p0, \[x0\] ++** st2h {z3\.h - z4\.h}, p1, \[x0\] ++** st3h {z0\.h - z2\.h}, p2, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee2 (void *x0, svint16x3_t z0, svint16x2_t z3, svint16x3_t z5, ++ svint16x4_t stack1, svint16_t stack2, svbool_t p0, ++ svbool_t p1, svbool_t p2) ++{ ++ svst1_s16 (p0, x0, stack2); ++ svst2_s16 (p1, x0, z3); ++ svst3_s16 (p2, x0, z0); ++} ++ ++void __attribute__((noipa)) ++caller (void *x0) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ callee1 (x0, ++ svld3_vnum_s16 (pg, x0, -9), ++ svld2_vnum_s16 (pg, x0, -2), ++ svld3_vnum_s16 (pg, x0, 0), ++ svld4_vnum_s16 (pg, x0, 8), ++ svld1_vnum_s16 (pg, x0, 5), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3)); ++} ++ ++/* { dg-final { scan-assembler {\tld3h\t{z0\.h - z2\.h}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2h\t{z3\.h - z4\.h}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld3h\t{z5\.h - z7\.h}, p[0-7]/z, \[x0\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4h\t{(z[0-9]+)\.h - z[0-9]+\.h}.*\tstr\t\1, \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4h\t{z[0-9]+\.h - (z[0-9]+)\.h}.*\tstr\t\1, \[x1, #3, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld1h\t(z[0-9]+\.h), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1h\t\1, p[0-7], \[x2\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_s32.c +new file mode 100644 +index 000000000..a6c06e235 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_s32.c +@@ -0,0 +1,70 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** callee1: ++** ... ++** ldr (z[0-9]+), \[x1, #3, mul vl\] ++** ... 
++** st4w {z[0-9]+\.s - \1\.s}, p0, \[x0\] ++** st2w {z3\.s - z4\.s}, p1, \[x0\] ++** st3w {z5\.s - z7\.s}, p2, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee1 (void *x0, svint32x3_t z0, svint32x2_t z3, svint32x3_t z5, ++ svint32x4_t stack1, svint32_t stack2, svbool_t p0, ++ svbool_t p1, svbool_t p2) ++{ ++ svst4_s32 (p0, x0, stack1); ++ svst2_s32 (p1, x0, z3); ++ svst3_s32 (p2, x0, z5); ++} ++ ++/* ++** callee2: ++** ptrue p3\.b, all ++** ld1w (z[0-9]+\.s), p3/z, \[x2\] ++** st1w \1, p0, \[x0\] ++** st2w {z3\.s - z4\.s}, p1, \[x0\] ++** st3w {z0\.s - z2\.s}, p2, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee2 (void *x0, svint32x3_t z0, svint32x2_t z3, svint32x3_t z5, ++ svint32x4_t stack1, svint32_t stack2, svbool_t p0, ++ svbool_t p1, svbool_t p2) ++{ ++ svst1_s32 (p0, x0, stack2); ++ svst2_s32 (p1, x0, z3); ++ svst3_s32 (p2, x0, z0); ++} ++ ++void __attribute__((noipa)) ++caller (void *x0) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ callee1 (x0, ++ svld3_vnum_s32 (pg, x0, -9), ++ svld2_vnum_s32 (pg, x0, -2), ++ svld3_vnum_s32 (pg, x0, 0), ++ svld4_vnum_s32 (pg, x0, 8), ++ svld1_vnum_s32 (pg, x0, 5), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3)); ++} ++ ++/* { dg-final { scan-assembler {\tld3w\t{z0\.s - z2\.s}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2w\t{z3\.s - z4\.s}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld3w\t{z5\.s - z7\.s}, p[0-7]/z, \[x0\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4w\t{(z[0-9]+)\.s - z[0-9]+\.s}.*\tstr\t\1, \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4w\t{z[0-9]+\.s - (z[0-9]+)\.s}.*\tstr\t\1, \[x1, #3, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld1w\t(z[0-9]+\.s), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1w\t\1, p[0-7], \[x2\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_s64.c +new file mode 100644 +index 000000000..219c71d82 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_s64.c +@@ -0,0 +1,70 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** callee1: ++** ... ++** ldr (z[0-9]+), \[x1, #3, mul vl\] ++** ... 
++** st4d {z[0-9]+\.d - \1\.d}, p0, \[x0\] ++** st2d {z3\.d - z4\.d}, p1, \[x0\] ++** st3d {z5\.d - z7\.d}, p2, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee1 (void *x0, svint64x3_t z0, svint64x2_t z3, svint64x3_t z5, ++ svint64x4_t stack1, svint64_t stack2, svbool_t p0, ++ svbool_t p1, svbool_t p2) ++{ ++ svst4_s64 (p0, x0, stack1); ++ svst2_s64 (p1, x0, z3); ++ svst3_s64 (p2, x0, z5); ++} ++ ++/* ++** callee2: ++** ptrue p3\.b, all ++** ld1d (z[0-9]+\.d), p3/z, \[x2\] ++** st1d \1, p0, \[x0\] ++** st2d {z3\.d - z4\.d}, p1, \[x0\] ++** st3d {z0\.d - z2\.d}, p2, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee2 (void *x0, svint64x3_t z0, svint64x2_t z3, svint64x3_t z5, ++ svint64x4_t stack1, svint64_t stack2, svbool_t p0, ++ svbool_t p1, svbool_t p2) ++{ ++ svst1_s64 (p0, x0, stack2); ++ svst2_s64 (p1, x0, z3); ++ svst3_s64 (p2, x0, z0); ++} ++ ++void __attribute__((noipa)) ++caller (void *x0) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ callee1 (x0, ++ svld3_vnum_s64 (pg, x0, -9), ++ svld2_vnum_s64 (pg, x0, -2), ++ svld3_vnum_s64 (pg, x0, 0), ++ svld4_vnum_s64 (pg, x0, 8), ++ svld1_vnum_s64 (pg, x0, 5), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3)); ++} ++ ++/* { dg-final { scan-assembler {\tld3d\t{z0\.d - z2\.d}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2d\t{z3\.d - z4\.d}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld3d\t{z5\.d - z7\.d}, p[0-7]/z, \[x0\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4d\t{(z[0-9]+)\.d - z[0-9]+\.d}.*\tstr\t\1, \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4d\t{z[0-9]+\.d - (z[0-9]+)\.d}.*\tstr\t\1, \[x1, #3, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld1d\t(z[0-9]+\.d), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1d\t\1, p[0-7], \[x2\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_s8.c +new file mode 100644 +index 000000000..c48d391ca +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_s8.c +@@ -0,0 +1,70 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** callee1: ++** ... ++** ldr (z[0-9]+), \[x1, #3, mul vl\] ++** ... 
++** st4b {z[0-9]+\.b - \1\.b}, p0, \[x0\] ++** st2b {z3\.b - z4\.b}, p1, \[x0\] ++** st3b {z5\.b - z7\.b}, p2, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee1 (void *x0, svint8x3_t z0, svint8x2_t z3, svint8x3_t z5, ++ svint8x4_t stack1, svint8_t stack2, svbool_t p0, ++ svbool_t p1, svbool_t p2) ++{ ++ svst4_s8 (p0, x0, stack1); ++ svst2_s8 (p1, x0, z3); ++ svst3_s8 (p2, x0, z5); ++} ++ ++/* ++** callee2: ++** ptrue p3\.b, all ++** ld1b (z[0-9]+\.b), p3/z, \[x2\] ++** st1b \1, p0, \[x0\] ++** st2b {z3\.b - z4\.b}, p1, \[x0\] ++** st3b {z0\.b - z2\.b}, p2, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee2 (void *x0, svint8x3_t z0, svint8x2_t z3, svint8x3_t z5, ++ svint8x4_t stack1, svint8_t stack2, svbool_t p0, ++ svbool_t p1, svbool_t p2) ++{ ++ svst1_s8 (p0, x0, stack2); ++ svst2_s8 (p1, x0, z3); ++ svst3_s8 (p2, x0, z0); ++} ++ ++void __attribute__((noipa)) ++caller (void *x0) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ callee1 (x0, ++ svld3_vnum_s8 (pg, x0, -9), ++ svld2_vnum_s8 (pg, x0, -2), ++ svld3_vnum_s8 (pg, x0, 0), ++ svld4_vnum_s8 (pg, x0, 8), ++ svld1_vnum_s8 (pg, x0, 5), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3)); ++} ++ ++/* { dg-final { scan-assembler {\tld3b\t{z0\.b - z2\.b}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2b\t{z3\.b - z4\.b}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld3b\t{z5\.b - z7\.b}, p[0-7]/z, \[x0\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4b\t{(z[0-9]+)\.b - z[0-9]+\.b}.*\tstr\t\1, \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4b\t{z[0-9]+\.b - (z[0-9]+)\.b}.*\tstr\t\1, \[x1, #3, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld1b\t(z[0-9]+\.b), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1b\t\1, p[0-7], \[x2\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_u16.c +new file mode 100644 +index 000000000..6c635fd94 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_u16.c +@@ -0,0 +1,70 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** callee1: ++** ... ++** ldr (z[0-9]+), \[x1, #3, mul vl\] ++** ... 
++** st4h {z[0-9]+\.h - \1\.h}, p0, \[x0\] ++** st2h {z3\.h - z4\.h}, p1, \[x0\] ++** st3h {z5\.h - z7\.h}, p2, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee1 (void *x0, svuint16x3_t z0, svuint16x2_t z3, svuint16x3_t z5, ++ svuint16x4_t stack1, svuint16_t stack2, svbool_t p0, ++ svbool_t p1, svbool_t p2) ++{ ++ svst4_u16 (p0, x0, stack1); ++ svst2_u16 (p1, x0, z3); ++ svst3_u16 (p2, x0, z5); ++} ++ ++/* ++** callee2: ++** ptrue p3\.b, all ++** ld1h (z[0-9]+\.h), p3/z, \[x2\] ++** st1h \1, p0, \[x0\] ++** st2h {z3\.h - z4\.h}, p1, \[x0\] ++** st3h {z0\.h - z2\.h}, p2, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee2 (void *x0, svuint16x3_t z0, svuint16x2_t z3, svuint16x3_t z5, ++ svuint16x4_t stack1, svuint16_t stack2, svbool_t p0, ++ svbool_t p1, svbool_t p2) ++{ ++ svst1_u16 (p0, x0, stack2); ++ svst2_u16 (p1, x0, z3); ++ svst3_u16 (p2, x0, z0); ++} ++ ++void __attribute__((noipa)) ++caller (void *x0) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ callee1 (x0, ++ svld3_vnum_u16 (pg, x0, -9), ++ svld2_vnum_u16 (pg, x0, -2), ++ svld3_vnum_u16 (pg, x0, 0), ++ svld4_vnum_u16 (pg, x0, 8), ++ svld1_vnum_u16 (pg, x0, 5), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3)); ++} ++ ++/* { dg-final { scan-assembler {\tld3h\t{z0\.h - z2\.h}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2h\t{z3\.h - z4\.h}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld3h\t{z5\.h - z7\.h}, p[0-7]/z, \[x0\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4h\t{(z[0-9]+)\.h - z[0-9]+\.h}.*\tstr\t\1, \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4h\t{z[0-9]+\.h - (z[0-9]+)\.h}.*\tstr\t\1, \[x1, #3, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld1h\t(z[0-9]+\.h), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1h\t\1, p[0-7], \[x2\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_u32.c +new file mode 100644 +index 000000000..c31d45426 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_u32.c +@@ -0,0 +1,70 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** callee1: ++** ... ++** ldr (z[0-9]+), \[x1, #3, mul vl\] ++** ... 
++** st4w {z[0-9]+\.s - \1\.s}, p0, \[x0\] ++** st2w {z3\.s - z4\.s}, p1, \[x0\] ++** st3w {z5\.s - z7\.s}, p2, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee1 (void *x0, svuint32x3_t z0, svuint32x2_t z3, svuint32x3_t z5, ++ svuint32x4_t stack1, svuint32_t stack2, svbool_t p0, ++ svbool_t p1, svbool_t p2) ++{ ++ svst4_u32 (p0, x0, stack1); ++ svst2_u32 (p1, x0, z3); ++ svst3_u32 (p2, x0, z5); ++} ++ ++/* ++** callee2: ++** ptrue p3\.b, all ++** ld1w (z[0-9]+\.s), p3/z, \[x2\] ++** st1w \1, p0, \[x0\] ++** st2w {z3\.s - z4\.s}, p1, \[x0\] ++** st3w {z0\.s - z2\.s}, p2, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee2 (void *x0, svuint32x3_t z0, svuint32x2_t z3, svuint32x3_t z5, ++ svuint32x4_t stack1, svuint32_t stack2, svbool_t p0, ++ svbool_t p1, svbool_t p2) ++{ ++ svst1_u32 (p0, x0, stack2); ++ svst2_u32 (p1, x0, z3); ++ svst3_u32 (p2, x0, z0); ++} ++ ++void __attribute__((noipa)) ++caller (void *x0) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ callee1 (x0, ++ svld3_vnum_u32 (pg, x0, -9), ++ svld2_vnum_u32 (pg, x0, -2), ++ svld3_vnum_u32 (pg, x0, 0), ++ svld4_vnum_u32 (pg, x0, 8), ++ svld1_vnum_u32 (pg, x0, 5), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3)); ++} ++ ++/* { dg-final { scan-assembler {\tld3w\t{z0\.s - z2\.s}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2w\t{z3\.s - z4\.s}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld3w\t{z5\.s - z7\.s}, p[0-7]/z, \[x0\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4w\t{(z[0-9]+)\.s - z[0-9]+\.s}.*\tstr\t\1, \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4w\t{z[0-9]+\.s - (z[0-9]+)\.s}.*\tstr\t\1, \[x1, #3, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld1w\t(z[0-9]+\.s), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1w\t\1, p[0-7], \[x2\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_u64.c +new file mode 100644 +index 000000000..969b258b7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_u64.c +@@ -0,0 +1,70 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** callee1: ++** ... ++** ldr (z[0-9]+), \[x1, #3, mul vl\] ++** ... 
++** st4d {z[0-9]+\.d - \1\.d}, p0, \[x0\] ++** st2d {z3\.d - z4\.d}, p1, \[x0\] ++** st3d {z5\.d - z7\.d}, p2, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee1 (void *x0, svuint64x3_t z0, svuint64x2_t z3, svuint64x3_t z5, ++ svuint64x4_t stack1, svuint64_t stack2, svbool_t p0, ++ svbool_t p1, svbool_t p2) ++{ ++ svst4_u64 (p0, x0, stack1); ++ svst2_u64 (p1, x0, z3); ++ svst3_u64 (p2, x0, z5); ++} ++ ++/* ++** callee2: ++** ptrue p3\.b, all ++** ld1d (z[0-9]+\.d), p3/z, \[x2\] ++** st1d \1, p0, \[x0\] ++** st2d {z3\.d - z4\.d}, p1, \[x0\] ++** st3d {z0\.d - z2\.d}, p2, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee2 (void *x0, svuint64x3_t z0, svuint64x2_t z3, svuint64x3_t z5, ++ svuint64x4_t stack1, svuint64_t stack2, svbool_t p0, ++ svbool_t p1, svbool_t p2) ++{ ++ svst1_u64 (p0, x0, stack2); ++ svst2_u64 (p1, x0, z3); ++ svst3_u64 (p2, x0, z0); ++} ++ ++void __attribute__((noipa)) ++caller (void *x0) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ callee1 (x0, ++ svld3_vnum_u64 (pg, x0, -9), ++ svld2_vnum_u64 (pg, x0, -2), ++ svld3_vnum_u64 (pg, x0, 0), ++ svld4_vnum_u64 (pg, x0, 8), ++ svld1_vnum_u64 (pg, x0, 5), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3)); ++} ++ ++/* { dg-final { scan-assembler {\tld3d\t{z0\.d - z2\.d}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2d\t{z3\.d - z4\.d}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld3d\t{z5\.d - z7\.d}, p[0-7]/z, \[x0\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4d\t{(z[0-9]+)\.d - z[0-9]+\.d}.*\tstr\t\1, \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4d\t{z[0-9]+\.d - (z[0-9]+)\.d}.*\tstr\t\1, \[x1, #3, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld1d\t(z[0-9]+\.d), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1d\t\1, p[0-7], \[x2\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_u8.c +new file mode 100644 +index 000000000..d18604784 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_u8.c +@@ -0,0 +1,70 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** callee1: ++** ... ++** ldr (z[0-9]+), \[x1, #3, mul vl\] ++** ... 
++** st4b {z[0-9]+\.b - \1\.b}, p0, \[x0\] ++** st2b {z3\.b - z4\.b}, p1, \[x0\] ++** st3b {z5\.b - z7\.b}, p2, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee1 (void *x0, svuint8x3_t z0, svuint8x2_t z3, svuint8x3_t z5, ++ svuint8x4_t stack1, svuint8_t stack2, svbool_t p0, ++ svbool_t p1, svbool_t p2) ++{ ++ svst4_u8 (p0, x0, stack1); ++ svst2_u8 (p1, x0, z3); ++ svst3_u8 (p2, x0, z5); ++} ++ ++/* ++** callee2: ++** ptrue p3\.b, all ++** ld1b (z[0-9]+\.b), p3/z, \[x2\] ++** st1b \1, p0, \[x0\] ++** st2b {z3\.b - z4\.b}, p1, \[x0\] ++** st3b {z0\.b - z2\.b}, p2, \[x0\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee2 (void *x0, svuint8x3_t z0, svuint8x2_t z3, svuint8x3_t z5, ++ svuint8x4_t stack1, svuint8_t stack2, svbool_t p0, ++ svbool_t p1, svbool_t p2) ++{ ++ svst1_u8 (p0, x0, stack2); ++ svst2_u8 (p1, x0, z3); ++ svst3_u8 (p2, x0, z0); ++} ++ ++void __attribute__((noipa)) ++caller (void *x0) ++{ ++ svbool_t pg; ++ pg = svptrue_b8 (); ++ callee1 (x0, ++ svld3_vnum_u8 (pg, x0, -9), ++ svld2_vnum_u8 (pg, x0, -2), ++ svld3_vnum_u8 (pg, x0, 0), ++ svld4_vnum_u8 (pg, x0, 8), ++ svld1_vnum_u8 (pg, x0, 5), ++ svptrue_pat_b8 (SV_VL1), ++ svptrue_pat_b16 (SV_VL2), ++ svptrue_pat_b32 (SV_VL3)); ++} ++ ++/* { dg-final { scan-assembler {\tld3b\t{z0\.b - z2\.b}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2b\t{z3\.b - z4\.b}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld3b\t{z5\.b - z7\.b}, p[0-7]/z, \[x0\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4b\t{(z[0-9]+)\.b - z[0-9]+\.b}.*\tstr\t\1, \[x1\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4b\t{z[0-9]+\.b - (z[0-9]+)\.b}.*\tstr\t\1, \[x1, #3, mul vl\]\n} } } */ ++/* { dg-final { scan-assembler {\tld1b\t(z[0-9]+\.b), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1b\t\1, p[0-7], \[x2\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_7.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_7.c +new file mode 100644 +index 000000000..15c022486 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_7.c +@@ -0,0 +1,30 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#include ++ ++/* ++** callee: ++** ... ++** ldr (x[0-9]+), \[sp\] ++** ... 
++** ld1b (z[0-9]+\.b), p[1-3]/z, \[\1\] ++** st1b \2, p0, \[x0, x7\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee (int8_t *x0, int x1, int x2, int x3, ++ int x4, int x5, svbool_t p0, int x6, int64_t x7, ++ svint32x4_t z0, svint32x4_t z4, svint8_t stack) ++{ ++ svst1 (p0, x0 + x7, stack); ++} ++ ++void __attribute__((noipa)) ++caller (int8_t *x0, svbool_t p0, svint32x4_t z0, svint32x4_t z4) ++{ ++ callee (x0, 1, 2, 3, 4, 5, p0, 6, 7, z0, z4, svdup_s8 (42)); ++} ++ ++/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.b), #42\n.*\tst1b\t\1, p[0-7], \[(x[0-9]+)\]\n.*\tstr\t\2, \[sp\]\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_8.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_8.c +new file mode 100644 +index 000000000..93ace26f5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_8.c +@@ -0,0 +1,28 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#include ++ ++/* ++** callee: ++** ptrue (p[1-3])\.b, all ++** ld1b (z[0-9]+\.b), \1/z, \[x4\] ++** st1b \2, p0, \[x0, x7\] ++** ret ++*/ ++void __attribute__((noipa)) ++callee (int8_t *x0, int x1, int x2, int x3, ++ svint32x4_t z0, svint32x4_t z4, svint8_t stack, ++ int x5, svbool_t p0, int x6, int64_t x7) ++{ ++ svst1 (p0, x0 + x7, stack); ++} ++ ++void __attribute__((noipa)) ++caller (int8_t *x0, svbool_t p0, svint32x4_t z0, svint32x4_t z4) ++{ ++ callee (x0, 1, 2, 3, z0, z4, svdup_s8 (42), 5, p0, 6, 7); ++} ++ ++/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.b), #42\n.*\tst1b\t\1, p[0-7], \[x4\]\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_9.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_9.c +new file mode 100644 +index 000000000..ad9affadf +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_9.c +@@ -0,0 +1,49 @@ ++/* { dg-do compile { target lp64 } } */ ++/* { dg-options "-O -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#include ++ ++/* ++** callee: ++** ldr (x[0-9]+), \[sp, 8\] ++** ldr p0, \[\1\] ++** ret ++*/ ++svbool_t __attribute__((noipa)) ++callee (svint64x4_t z0, svint16x4_t z4, ++ svint64_t stack1, svint32_t stack2, ++ svint16_t stack3, svint8_t stack4, ++ svuint64_t stack5, svuint32_t stack6, ++ svuint16_t stack7, svuint8_t stack8, ++ svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3, ++ svbool_t stack9, svbool_t stack10) ++{ ++ return stack10; ++} ++ ++uint64_t __attribute__((noipa)) ++caller (int64_t *x0, int16_t *x1, svbool_t p0) ++{ ++ svbool_t res; ++ res = callee (svld4 (p0, x0), ++ svld4 (p0, x1), ++ svdup_s64 (1), ++ svdup_s32 (2), ++ svdup_s16 (3), ++ svdup_s8 (4), ++ svdup_u64 (5), ++ svdup_u32 (6), ++ svdup_u16 (7), ++ svdup_u8 (8), ++ svptrue_pat_b8 (SV_VL5), ++ svptrue_pat_b16 (SV_VL6), ++ svptrue_pat_b32 (SV_VL7), ++ svptrue_pat_b64 (SV_VL8), ++ svptrue_pat_b8 (SV_MUL3), ++ svptrue_pat_b16 (SV_MUL3)); ++ return svcntp_b8 (res, res); ++} ++ ++/* { dg-final { scan-assembler {\tptrue\t(p[0-9]+)\.b, mul3\n\tstr\t\1, \[(x[0-9]+)\]\n.*\tstr\t\2, \[sp\]\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\t(p[0-9]+)\.h, mul3\n\tstr\t\1, \[(x[0-9]+)\]\n.*\tstr\t\2, \[sp, 8\]\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/gnu_vectors_1.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/gnu_vectors_1.c +new file mode 100644 +index 000000000..e5fceb14b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/gnu_vectors_1.c +@@ -0,0 +1,107 @@ ++/* { dg-options "-O -msve-vector-bits=256 -fomit-frame-pointer" } */ 
++ ++#include ++ ++typedef bfloat16_t bfloat16x16_t __attribute__((vector_size (32))); ++typedef float16_t float16x16_t __attribute__((vector_size (32))); ++typedef float32_t float32x8_t __attribute__((vector_size (32))); ++typedef float64_t float64x4_t __attribute__((vector_size (32))); ++typedef int8_t int8x32_t __attribute__((vector_size (32))); ++typedef int16_t int16x16_t __attribute__((vector_size (32))); ++typedef int32_t int32x8_t __attribute__((vector_size (32))); ++typedef int64_t int64x4_t __attribute__((vector_size (32))); ++typedef uint8_t uint8x32_t __attribute__((vector_size (32))); ++typedef uint16_t uint16x16_t __attribute__((vector_size (32))); ++typedef uint32_t uint32x8_t __attribute__((vector_size (32))); ++typedef uint64_t uint64x4_t __attribute__((vector_size (32))); ++ ++void bfloat16_callee (bfloat16x16_t); ++void float16_callee (float16x16_t); ++void float32_callee (float32x8_t); ++void float64_callee (float64x4_t); ++void int8_callee (int8x32_t); ++void int16_callee (int16x16_t); ++void int32_callee (int32x8_t); ++void int64_callee (int64x4_t); ++void uint8_callee (uint8x32_t); ++void uint16_callee (uint16x16_t); ++void uint32_callee (uint32x8_t); ++void uint64_callee (uint64x4_t); ++ ++void ++bfloat16_caller (bfloat16_t val) ++{ ++ bfloat16_callee (svdup_bf16 (val)); ++} ++ ++void ++float16_caller (void) ++{ ++ float16_callee (svdup_f16 (1.0)); ++} ++ ++void ++float32_caller (void) ++{ ++ float32_callee (svdup_f32 (2.0)); ++} ++ ++void ++float64_caller (void) ++{ ++ float64_callee (svdup_f64 (3.0)); ++} ++ ++void ++int8_caller (void) ++{ ++ int8_callee (svindex_s8 (0, 1)); ++} ++ ++void ++int16_caller (void) ++{ ++ int16_callee (svindex_s16 (0, 2)); ++} ++ ++void ++int32_caller (void) ++{ ++ int32_callee (svindex_s32 (0, 3)); ++} ++ ++void ++int64_caller (void) ++{ ++ int64_callee (svindex_s64 (0, 4)); ++} ++ ++void ++uint8_caller (void) ++{ ++ uint8_callee (svindex_u8 (1, 1)); ++} ++ ++void ++uint16_caller (void) ++{ ++ uint16_callee (svindex_u16 (1, 2)); ++} ++ ++void ++uint32_caller (void) ++{ ++ uint32_callee (svindex_u32 (1, 3)); ++} ++ ++void ++uint64_caller (void) ++{ ++ uint64_callee (svindex_u64 (1, 4)); ++} ++ ++/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.b, p[0-7], \[x0\]} 2 } } */ ++/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h, p[0-7], \[x0\]} 4 } } */ ++/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x0\]} 3 } } */ ++/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x0\]} 3 } } */ ++/* { dg-final { scan-assembler-times {\tadd\tx0, sp, #?16\n} 12 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/gnu_vectors_2.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/gnu_vectors_2.c +new file mode 100644 +index 000000000..875567f01 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/gnu_vectors_2.c +@@ -0,0 +1,107 @@ ++/* { dg-options "-O -msve-vector-bits=256 -fomit-frame-pointer" } */ ++ ++#include ++ ++typedef bfloat16_t bfloat16x16_t __attribute__((vector_size (32))); ++typedef float16_t float16x16_t __attribute__((vector_size (32))); ++typedef float32_t float32x8_t __attribute__((vector_size (32))); ++typedef float64_t float64x4_t __attribute__((vector_size (32))); ++typedef int8_t int8x32_t __attribute__((vector_size (32))); ++typedef int16_t int16x16_t __attribute__((vector_size (32))); ++typedef int32_t int32x8_t __attribute__((vector_size (32))); ++typedef int64_t int64x4_t __attribute__((vector_size (32))); ++typedef uint8_t uint8x32_t __attribute__((vector_size (32))); 
++typedef uint16_t uint16x16_t __attribute__((vector_size (32))); ++typedef uint32_t uint32x8_t __attribute__((vector_size (32))); ++typedef uint64_t uint64x4_t __attribute__((vector_size (32))); ++ ++void bfloat16_callee (svbfloat16_t); ++void float16_callee (svfloat16_t); ++void float32_callee (svfloat32_t); ++void float64_callee (svfloat64_t); ++void int8_callee (svint8_t); ++void int16_callee (svint16_t); ++void int32_callee (svint32_t); ++void int64_callee (svint64_t); ++void uint8_callee (svuint8_t); ++void uint16_callee (svuint16_t); ++void uint32_callee (svuint32_t); ++void uint64_callee (svuint64_t); ++ ++void ++bfloat16_caller (bfloat16x16_t arg) ++{ ++ bfloat16_callee (arg); ++} ++ ++void ++float16_caller (float16x16_t arg) ++{ ++ float16_callee (arg); ++} ++ ++void ++float32_caller (float32x8_t arg) ++{ ++ float32_callee (arg); ++} ++ ++void ++float64_caller (float64x4_t arg) ++{ ++ float64_callee (arg); ++} ++ ++void ++int8_caller (int8x32_t arg) ++{ ++ int8_callee (arg); ++} ++ ++void ++int16_caller (int16x16_t arg) ++{ ++ int16_callee (arg); ++} ++ ++void ++int32_caller (int32x8_t arg) ++{ ++ int32_callee (arg); ++} ++ ++void ++int64_caller (int64x4_t arg) ++{ ++ int64_callee (arg); ++} ++ ++void ++uint8_caller (uint8x32_t arg) ++{ ++ uint8_callee (arg); ++} ++ ++void ++uint16_caller (uint16x16_t arg) ++{ ++ uint16_callee (arg); ++} ++ ++void ++uint32_caller (uint32x8_t arg) ++{ ++ uint32_callee (arg); ++} ++ ++void ++uint64_caller (uint64x4_t arg) ++{ ++ uint64_callee (arg); ++} ++ ++/* { dg-final { scan-assembler-times {\tld1b\tz0\.b, p[0-7]/z, \[x0\]} 2 } } */ ++/* { dg-final { scan-assembler-times {\tld1h\tz0\.h, p[0-7]/z, \[x0\]} 4 } } */ ++/* { dg-final { scan-assembler-times {\tld1w\tz0\.s, p[0-7]/z, \[x0\]} 3 } } */ ++/* { dg-final { scan-assembler-times {\tld1d\tz0\.d, p[0-7]/z, \[x0\]} 3 } } */ ++/* { dg-final { scan-assembler-not {\tst1[bhwd]\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/nosve_1.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/nosve_1.c +new file mode 100644 +index 000000000..26802c87f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/nosve_1.c +@@ -0,0 +1,14 @@ ++/* { dg-do compile } */ ++/* { dg-prune-output "compilation terminated" } */ ++ ++#include ++ ++#pragma GCC target "+nosve" ++ ++svbool_t return_bool (); ++ ++void ++f (void) ++{ ++ return_bool (); /* { dg-error {'return_bool' requires the SVE ISA extension} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/nosve_2.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/nosve_2.c +new file mode 100644 +index 000000000..663165f89 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/nosve_2.c +@@ -0,0 +1,14 @@ ++/* { dg-do compile } */ ++/* { dg-prune-output "compilation terminated" } */ ++ ++#include ++ ++#pragma GCC target "+nosve" ++ ++svbool_t return_bool (); ++ ++void ++f (svbool_t *ptr) ++{ ++ *ptr = return_bool (); /* { dg-error {'return_bool' requires the SVE ISA extension} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/nosve_3.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/nosve_3.c +new file mode 100644 +index 000000000..6d5823cfd +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/nosve_3.c +@@ -0,0 +1,14 @@ ++/* { dg-do compile } */ ++/* { dg-prune-output "compilation terminated" } */ ++ ++#include ++ ++#pragma GCC target "+nosve" ++ ++svbool_t (*return_bool) (); ++ ++void ++f (svbool_t *ptr) ++{ ++ *ptr = return_bool (); /* { dg-error {calls to functions of type 'svbool_t\(\)' require the SVE ISA extension} } 
*/
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/nosve_4.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/nosve_4.c
+new file mode 100644
+index 000000000..81e31cf4f
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/nosve_4.c
+@@ -0,0 +1,14 @@
++/* { dg-do compile } */
++/* { dg-prune-output "compilation terminated" } */
++
++#include <arm_sve.h>
++
++#pragma GCC target "+nosve"
++
++void take_svuint8 (svuint8_t);
++
++void
++f (svuint8_t *ptr)
++{
++ take_svuint8 (*ptr); /* { dg-error {'take_svuint8' requires the SVE ISA extension} } */
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/nosve_5.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/nosve_5.c
+new file mode 100644
+index 000000000..300ed00a0
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/nosve_5.c
+@@ -0,0 +1,15 @@
++/* { dg-do compile } */
++/* { dg-prune-output "compilation terminated" } */
++
++#include <arm_sve.h>
++
++#pragma GCC target "+nosve"
++
++void take_svuint8_eventually (float, float, float, float,
++ float, float, float, float, svuint8_t);
++
++void
++f (svuint8_t *ptr)
++{
++ take_svuint8_eventually (0, 0, 0, 0, 0, 0, 0, 0, *ptr); /* { dg-error {arguments of type '(svuint8_t|__SVUint8_t)' require the SVE ISA extension} } */
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/nosve_6.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/nosve_6.c
+new file mode 100644
+index 000000000..4bddf76f8
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/nosve_6.c
+@@ -0,0 +1,14 @@
++/* { dg-do compile } */
++/* { dg-prune-output "compilation terminated" } */
++
++#include <arm_sve.h>
++
++#pragma GCC target "+nosve"
++
++void unprototyped ();
++
++void
++f (svuint8_t *ptr)
++{
++ unprototyped (*ptr); /* { dg-error {arguments of type '(svuint8_t|__SVUint8_t)' require the SVE ISA extension} } */
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/nosve_7.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/nosve_7.c
+new file mode 100644
+index 000000000..ef742711d
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/nosve_7.c
+@@ -0,0 +1,8 @@
++/* { dg-do compile } */
++/* { dg-prune-output "compilation terminated" } */
++
++#include <arm_sve.h>
++
++#pragma GCC target "+nosve"
++
++void f (svuint8_t x) {} /* { dg-error {'f' requires the SVE ISA extension} } */
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/nosve_8.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/nosve_8.c
+new file mode 100644
+index 000000000..45b549f12
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/nosve_8.c
+@@ -0,0 +1,11 @@
++/* { dg-do compile } */
++/* { dg-prune-output "compilation terminated" } */
++
++#include <arm_sve.h>
++
++#pragma GCC target "+nosve"
++
++void
++f (float a, float b, float c, float d, float e, float f, float g, float h, svuint8_t x) /* { dg-error {arguments of type '(svuint8_t|__SVUint8_t)' require the SVE ISA extension} } */
++{
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_1.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_1.c
+new file mode 100644
+index 000000000..f6328c901
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_1.c
+@@ -0,0 +1,32 @@
++/* { dg-do compile } */
++/* { dg-options "-O -g" } */
++/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */
++
++/*
++** callee_pred:
++** ldr p0, \[x0\]
++** ret
++*/
++__SVBool_t __attribute__((noipa))
++callee_pred (__SVBool_t *ptr)
++{
++ return *ptr;
++}
++
++#include <arm_sve.h>
++
++/*
++** caller_pred:
++** ...
++** bl callee_pred
++** cntp x0, p0, p0.b
++** ldp x29, x30, \[sp\], 16
++** ret
++*/
++uint64_t __attribute__((noipa))
++caller_pred (__SVBool_t *ptr1)
++{
++ __SVBool_t p;
++ p = callee_pred (ptr1);
++ return svcntp_b8 (p, p);
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_1_1024.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_1_1024.c
+new file mode 100644
+index 000000000..450a3f029
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_1_1024.c
+@@ -0,0 +1,31 @@
++/* { dg-do compile } */
++/* { dg-options "-O -msve-vector-bits=1024 -g" } */
++/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */
++
++/*
++** callee_pred:
++** ldr p0, \[x0\]
++** ret
++*/
++__SVBool_t __attribute__((noipa))
++callee_pred (__SVBool_t *ptr)
++{
++ return *ptr;
++}
++
++#include <arm_sve.h>
++
++/*
++** caller_pred:
++** ...
++** bl callee_pred
++** cntp x0, p0, p0.b
++** ldp x29, x30, \[sp\], 16
++** ret
++*/
++uint64_t __attribute__((noipa))
++caller_pred (__SVBool_t *ptr1)
++{
++ __SVBool_t p = callee_pred (ptr1);
++ return svcntp_b8 (p, p);
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_1_2048.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_1_2048.c
+new file mode 100644
+index 000000000..c9ea26899
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_1_2048.c
+@@ -0,0 +1,31 @@
++/* { dg-do compile } */
++/* { dg-options "-O -msve-vector-bits=2048 -g" } */
++/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */
++
++/*
++** callee_pred:
++** ldr p0, \[x0\]
++** ret
++*/
++__SVBool_t __attribute__((noipa))
++callee_pred (__SVBool_t *ptr)
++{
++ return *ptr;
++}
++
++#include <arm_sve.h>
++
++/*
++** caller_pred:
++** ...
++** bl callee_pred
++** cntp x0, p0, p0.b
++** ldp x29, x30, \[sp\], 16
++** ret
++*/
++uint64_t __attribute__((noipa))
++caller_pred (__SVBool_t *ptr1)
++{
++ __SVBool_t p = callee_pred (ptr1);
++ return svcntp_b8 (p, p);
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_1_256.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_1_256.c
+new file mode 100644
+index 000000000..62bc695d1
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_1_256.c
+@@ -0,0 +1,31 @@
++/* { dg-do compile } */
++/* { dg-options "-O -msve-vector-bits=256 -g" } */
++/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */
++
++/*
++** callee_pred:
++** ldr p0, \[x0\]
++** ret
++*/
++__SVBool_t __attribute__((noipa))
++callee_pred (__SVBool_t *ptr)
++{
++ return *ptr;
++}
++
++#include <arm_sve.h>
++
++/*
++** caller_pred:
++** ...
++** bl callee_pred
++** cntp x0, p0, p0.b
++** ldp x29, x30, \[sp\], 16
++** ret
++*/
++uint64_t __attribute__((noipa))
++caller_pred (__SVBool_t *ptr1)
++{
++ __SVBool_t p = callee_pred (ptr1);
++ return svcntp_b8 (p, p);
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_1_512.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_1_512.c
+new file mode 100644
+index 000000000..f687689ce
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_1_512.c
+@@ -0,0 +1,31 @@
++/* { dg-do compile } */
++/* { dg-options "-O -msve-vector-bits=512 -g" } */
++/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */
++
++/*
++** callee_pred:
++** ldr p0, \[x0\]
++** ret
++*/
++__SVBool_t __attribute__((noipa))
++callee_pred (__SVBool_t *ptr)
++{
++ return *ptr;
++}
++
++#include <arm_sve.h>
++
++/*
++** caller_pred:
++** ...
++** bl callee_pred ++** cntp x0, p0, p0.b ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++uint64_t __attribute__((noipa)) ++caller_pred (__SVBool_t *ptr1) ++{ ++ __SVBool_t p = callee_pred (ptr1); ++ return svcntp_b8 (p, p); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_2.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_2.c +new file mode 100644 +index 000000000..efaa81394 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_2.c +@@ -0,0 +1,32 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -g" } */ ++/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ ++ ++#include ++ ++/* ++** callee_pred: ++** ldr p0, \[x0\] ++** ret ++*/ ++svbool_t __attribute__((noipa)) ++callee_pred (svbool_t *ptr) ++{ ++ return *ptr; ++} ++ ++/* ++** caller_pred: ++** ... ++** bl callee_pred ++** cntp x0, p0, p0.b ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++uint64_t __attribute__((noipa)) ++caller_pred (svbool_t *ptr1) ++{ ++ svbool_t p; ++ p = callee_pred (ptr1); ++ return svcntp_b8 (p, p); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_3.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_3.c +new file mode 100644 +index 000000000..71046447d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_3.c +@@ -0,0 +1,34 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -g" } */ ++/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ ++ ++#include ++ ++typedef svbool_t my_pred; ++ ++/* ++** callee_pred: ++** ldr p0, \[x0\] ++** ret ++*/ ++my_pred __attribute__((noipa)) ++callee_pred (my_pred *ptr) ++{ ++ return *ptr; ++} ++ ++/* ++** caller_pred: ++** ... ++** bl callee_pred ++** cntp x0, p0, p0.b ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++uint64_t __attribute__((noipa)) ++caller_pred (my_pred *ptr1) ++{ ++ my_pred p; ++ p = callee_pred (ptr1); ++ return svcntp_b8 (p, p); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_4.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_4.c +new file mode 100644 +index 000000000..00eb2cbda +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_4.c +@@ -0,0 +1,264 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -g" } */ ++/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ ++ ++#define CALLEE(SUFFIX, TYPE) \ ++ TYPE __attribute__((noipa)) \ ++ callee_##SUFFIX (TYPE *ptr) \ ++ { \ ++ return *ptr; \ ++ } ++ ++/* ++** callee_s8: ++** ptrue (p[0-7])\.b, all ++** ld1b z0\.b, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (s8, __SVInt8_t) ++ ++/* ++** callee_u8: ++** ptrue (p[0-7])\.b, all ++** ld1b z0\.b, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (u8, __SVUint8_t) ++ ++/* ++** callee_s16: ++** ptrue (p[0-7])\.b, all ++** ld1h z0\.h, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (s16, __SVInt16_t) ++ ++/* ++** callee_u16: ++** ptrue (p[0-7])\.b, all ++** ld1h z0\.h, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (u16, __SVUint16_t) ++ ++/* ++** callee_f16: ++** ptrue (p[0-7])\.b, all ++** ld1h z0\.h, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (f16, __SVFloat16_t) ++ ++/* ++** callee_bf16: ++** ptrue (p[0-7])\.b, all ++** ld1h z0\.h, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (bf16, __SVBfloat16_t) ++ ++/* ++** callee_s32: ++** ptrue (p[0-7])\.b, all ++** ld1w z0\.s, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (s32, __SVInt32_t) ++ ++/* ++** callee_u32: ++** ptrue (p[0-7])\.b, all ++** ld1w z0\.s, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (u32, __SVUint32_t) ++ ++/* ++** callee_f32: ++** ptrue (p[0-7])\.b, all ++** ld1w z0\.s, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (f32, __SVFloat32_t) ++ ++/* 
++** callee_s64: ++** ptrue (p[0-7])\.b, all ++** ld1d z0\.d, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (s64, __SVInt64_t) ++ ++/* ++** callee_u64: ++** ptrue (p[0-7])\.b, all ++** ld1d z0\.d, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (u64, __SVUint64_t) ++ ++/* ++** callee_f64: ++** ptrue (p[0-7])\.b, all ++** ld1d z0\.d, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (f64, __SVFloat64_t) ++ ++#include ++ ++#define CALLER(SUFFIX, TYPE) \ ++ typeof (svaddv (svptrue_b8 (), *(TYPE *) 0)) \ ++ __attribute__((noipa)) \ ++ caller_##SUFFIX (TYPE *ptr1) \ ++ { \ ++ return svaddv (svptrue_b8 (), callee_##SUFFIX (ptr1)); \ ++ } ++ ++#define CALLER_BF16(SUFFIX, TYPE) \ ++ typeof (svlasta (svptrue_b8 (), *(TYPE *) 0)) \ ++ __attribute__((noipa)) \ ++ caller_##SUFFIX (TYPE *ptr1) \ ++ { \ ++ return svlasta (svptrue_b8 (), callee_##SUFFIX (ptr1)); \ ++ } ++ ++/* ++** caller_s8: ++** ... ++** bl callee_s8 ++** ptrue (p[0-7])\.b, all ++** saddv (d[0-9]+), \1, z0\.b ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (s8, __SVInt8_t) ++ ++/* ++** caller_u8: ++** ... ++** bl callee_u8 ++** ptrue (p[0-7])\.b, all ++** uaddv (d[0-9]+), \1, z0\.b ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (u8, __SVUint8_t) ++ ++/* ++** caller_s16: ++** ... ++** bl callee_s16 ++** ptrue (p[0-7])\.b, all ++** saddv (d[0-9]+), \1, z0\.h ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (s16, __SVInt16_t) ++ ++/* ++** caller_u16: ++** ... ++** bl callee_u16 ++** ptrue (p[0-7])\.b, all ++** uaddv (d[0-9]+), \1, z0\.h ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (u16, __SVUint16_t) ++ ++/* ++** caller_f16: ++** ... ++** bl callee_f16 ++** ptrue (p[0-7])\.b, all ++** faddv h0, \1, z0\.h ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (f16, __SVFloat16_t) ++ ++/* ++** caller_bf16: ++** ... ++** bl callee_bf16 ++** ptrue (p[0-7])\.b, all ++** lasta h0, \1, z0\.h ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER_BF16 (bf16, __SVBfloat16_t) ++ ++/* ++** caller_s32: ++** ... ++** bl callee_s32 ++** ptrue (p[0-7])\.b, all ++** saddv (d[0-9]+), \1, z0\.s ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (s32, __SVInt32_t) ++ ++/* ++** caller_u32: ++** ... ++** bl callee_u32 ++** ptrue (p[0-7])\.b, all ++** uaddv (d[0-9]+), \1, z0\.s ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (u32, __SVUint32_t) ++ ++/* ++** caller_f32: ++** ... ++** bl callee_f32 ++** ptrue (p[0-7])\.b, all ++** faddv s0, \1, z0\.s ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (f32, __SVFloat32_t) ++ ++/* ++** caller_s64: ++** ... ++** bl callee_s64 ++** ptrue (p[0-7])\.b, all ++** uaddv (d[0-9]+), \1, z0\.d ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (s64, __SVInt64_t) ++ ++/* ++** caller_u64: ++** ... ++** bl callee_u64 ++** ptrue (p[0-7])\.b, all ++** uaddv (d[0-9]+), \1, z0\.d ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (u64, __SVUint64_t) ++ ++/* ++** caller_f64: ++** ... 
++** bl callee_f64 ++** ptrue (p[0-7])\.b, all ++** faddv d0, \1, z0\.d ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (f64, __SVFloat64_t) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_4_1024.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_4_1024.c +new file mode 100644 +index 000000000..43519634c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_4_1024.c +@@ -0,0 +1,264 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -msve-vector-bits=1024 -g" } */ ++/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ ++ ++#define CALLEE(SUFFIX, TYPE) \ ++ TYPE __attribute__((noipa)) \ ++ callee_##SUFFIX (TYPE *ptr) \ ++ { \ ++ return *ptr; \ ++ } ++ ++/* ++** callee_s8: ++** ptrue (p[0-7])\.b, vl128 ++** ld1b z0\.b, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (s8, __SVInt8_t) ++ ++/* ++** callee_u8: ++** ptrue (p[0-7])\.b, vl128 ++** ld1b z0\.b, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (u8, __SVUint8_t) ++ ++/* ++** callee_s16: ++** ptrue (p[0-7])\.b, vl128 ++** ld1h z0\.h, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (s16, __SVInt16_t) ++ ++/* ++** callee_u16: ++** ptrue (p[0-7])\.b, vl128 ++** ld1h z0\.h, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (u16, __SVUint16_t) ++ ++/* ++** callee_f16: ++** ptrue (p[0-7])\.b, vl128 ++** ld1h z0\.h, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (f16, __SVFloat16_t) ++ ++/* ++** callee_bf16: ++** ptrue (p[0-7])\.b, vl128 ++** ld1h z0\.h, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (bf16, __SVBfloat16_t) ++ ++/* ++** callee_s32: ++** ptrue (p[0-7])\.b, vl128 ++** ld1w z0\.s, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (s32, __SVInt32_t) ++ ++/* ++** callee_u32: ++** ptrue (p[0-7])\.b, vl128 ++** ld1w z0\.s, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (u32, __SVUint32_t) ++ ++/* ++** callee_f32: ++** ptrue (p[0-7])\.b, vl128 ++** ld1w z0\.s, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (f32, __SVFloat32_t) ++ ++/* ++** callee_s64: ++** ptrue (p[0-7])\.b, vl128 ++** ld1d z0\.d, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (s64, __SVInt64_t) ++ ++/* ++** callee_u64: ++** ptrue (p[0-7])\.b, vl128 ++** ld1d z0\.d, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (u64, __SVUint64_t) ++ ++/* ++** callee_f64: ++** ptrue (p[0-7])\.b, vl128 ++** ld1d z0\.d, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (f64, __SVFloat64_t) ++ ++#include ++ ++#define CALLER(SUFFIX, TYPE) \ ++ typeof (svaddv (svptrue_b8 (), *(TYPE *) 0)) \ ++ __attribute__((noipa)) \ ++ caller_##SUFFIX (TYPE *ptr1) \ ++ { \ ++ return svaddv (svptrue_b8 (), callee_##SUFFIX (ptr1)); \ ++ } ++ ++#define CALLER_BF16(SUFFIX, TYPE) \ ++ typeof (svlasta (svptrue_b8 (), *(TYPE *) 0)) \ ++ __attribute__((noipa)) \ ++ caller_##SUFFIX (TYPE *ptr1) \ ++ { \ ++ return svlasta (svptrue_b8 (), callee_##SUFFIX (ptr1)); \ ++ } ++ ++/* ++** caller_s8: ++** ... ++** bl callee_s8 ++** ptrue (p[0-7])\.b, vl128 ++** saddv (d[0-9]+), \1, z0\.b ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (s8, __SVInt8_t) ++ ++/* ++** caller_u8: ++** ... ++** bl callee_u8 ++** ptrue (p[0-7])\.b, vl128 ++** uaddv (d[0-9]+), \1, z0\.b ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (u8, __SVUint8_t) ++ ++/* ++** caller_s16: ++** ... ++** bl callee_s16 ++** ptrue (p[0-7])\.b, vl128 ++** saddv (d[0-9]+), \1, z0\.h ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (s16, __SVInt16_t) ++ ++/* ++** caller_u16: ++** ... ++** bl callee_u16 ++** ptrue (p[0-7])\.b, vl128 ++** uaddv (d[0-9]+), \1, z0\.h ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (u16, __SVUint16_t) ++ ++/* ++** caller_f16: ++** ... 
++** bl callee_f16 ++** ptrue (p[0-7])\.b, vl128 ++** faddv h0, \1, z0\.h ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (f16, __SVFloat16_t) ++ ++/* ++** caller_bf16: ++** ... ++** bl callee_bf16 ++** ptrue (p[0-7])\.b, vl128 ++** lasta h0, \1, z0\.h ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER_BF16 (bf16, __SVBfloat16_t) ++ ++/* ++** caller_s32: ++** ... ++** bl callee_s32 ++** ptrue (p[0-7])\.b, vl128 ++** saddv (d[0-9]+), \1, z0\.s ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (s32, __SVInt32_t) ++ ++/* ++** caller_u32: ++** ... ++** bl callee_u32 ++** ptrue (p[0-7])\.b, vl128 ++** uaddv (d[0-9]+), \1, z0\.s ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (u32, __SVUint32_t) ++ ++/* ++** caller_f32: ++** ... ++** bl callee_f32 ++** ptrue (p[0-7])\.b, vl128 ++** faddv s0, \1, z0\.s ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (f32, __SVFloat32_t) ++ ++/* ++** caller_s64: ++** ... ++** bl callee_s64 ++** ptrue (p[0-7])\.b, vl128 ++** uaddv (d[0-9]+), \1, z0\.d ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (s64, __SVInt64_t) ++ ++/* ++** caller_u64: ++** ... ++** bl callee_u64 ++** ptrue (p[0-7])\.b, vl128 ++** uaddv (d[0-9]+), \1, z0\.d ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (u64, __SVUint64_t) ++ ++/* ++** caller_f64: ++** ... ++** bl callee_f64 ++** ptrue (p[0-7])\.b, vl128 ++** faddv d0, \1, z0\.d ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (f64, __SVFloat64_t) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_4_2048.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_4_2048.c +new file mode 100644 +index 000000000..8256645f5 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_4_2048.c +@@ -0,0 +1,264 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -msve-vector-bits=2048 -g" } */ ++/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ ++ ++#define CALLEE(SUFFIX, TYPE) \ ++ TYPE __attribute__((noipa)) \ ++ callee_##SUFFIX (TYPE *ptr) \ ++ { \ ++ return *ptr; \ ++ } ++ ++/* ++** callee_s8: ++** ptrue (p[0-7])\.b, vl256 ++** ld1b z0\.b, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (s8, __SVInt8_t) ++ ++/* ++** callee_u8: ++** ptrue (p[0-7])\.b, vl256 ++** ld1b z0\.b, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (u8, __SVUint8_t) ++ ++/* ++** callee_s16: ++** ptrue (p[0-7])\.b, vl256 ++** ld1h z0\.h, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (s16, __SVInt16_t) ++ ++/* ++** callee_u16: ++** ptrue (p[0-7])\.b, vl256 ++** ld1h z0\.h, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (u16, __SVUint16_t) ++ ++/* ++** callee_f16: ++** ptrue (p[0-7])\.b, vl256 ++** ld1h z0\.h, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (f16, __SVFloat16_t) ++ ++/* ++** callee_bf16: ++** ptrue (p[0-7])\.b, vl256 ++** ld1h z0\.h, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (bf16, __SVBfloat16_t) ++ ++/* ++** callee_s32: ++** ptrue (p[0-7])\.b, vl256 ++** ld1w z0\.s, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (s32, __SVInt32_t) ++ ++/* ++** callee_u32: ++** ptrue (p[0-7])\.b, vl256 ++** ld1w z0\.s, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (u32, __SVUint32_t) ++ ++/* ++** callee_f32: ++** ptrue (p[0-7])\.b, vl256 ++** ld1w z0\.s, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (f32, __SVFloat32_t) ++ ++/* ++** callee_s64: ++** ptrue (p[0-7])\.b, vl256 ++** ld1d z0\.d, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (s64, __SVInt64_t) ++ ++/* ++** callee_u64: ++** ptrue (p[0-7])\.b, vl256 ++** ld1d z0\.d, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (u64, __SVUint64_t) ++ ++/* ++** callee_f64: ++** ptrue (p[0-7])\.b, vl256 ++** ld1d 
z0\.d, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (f64, __SVFloat64_t) ++ ++#include ++ ++#define CALLER(SUFFIX, TYPE) \ ++ typeof (svaddv (svptrue_b8 (), *(TYPE *) 0)) \ ++ __attribute__((noipa)) \ ++ caller_##SUFFIX (TYPE *ptr1) \ ++ { \ ++ return svaddv (svptrue_b8 (), callee_##SUFFIX (ptr1)); \ ++ } ++ ++#define CALLER_BF16(SUFFIX, TYPE) \ ++ typeof (svlasta (svptrue_b8 (), *(TYPE *) 0)) \ ++ __attribute__((noipa)) \ ++ caller_##SUFFIX (TYPE *ptr1) \ ++ { \ ++ return svlasta (svptrue_b8 (), callee_##SUFFIX (ptr1)); \ ++ } ++ ++/* ++** caller_s8: ++** ... ++** bl callee_s8 ++** ptrue (p[0-7])\.b, vl256 ++** saddv (d[0-9]+), \1, z0\.b ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (s8, __SVInt8_t) ++ ++/* ++** caller_u8: ++** ... ++** bl callee_u8 ++** ptrue (p[0-7])\.b, vl256 ++** uaddv (d[0-9]+), \1, z0\.b ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (u8, __SVUint8_t) ++ ++/* ++** caller_s16: ++** ... ++** bl callee_s16 ++** ptrue (p[0-7])\.b, vl256 ++** saddv (d[0-9]+), \1, z0\.h ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (s16, __SVInt16_t) ++ ++/* ++** caller_u16: ++** ... ++** bl callee_u16 ++** ptrue (p[0-7])\.b, vl256 ++** uaddv (d[0-9]+), \1, z0\.h ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (u16, __SVUint16_t) ++ ++/* ++** caller_f16: ++** ... ++** bl callee_f16 ++** ptrue (p[0-7])\.b, vl256 ++** faddv h0, \1, z0\.h ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (f16, __SVFloat16_t) ++ ++/* ++** caller_bf16: ++** ... ++** bl callee_bf16 ++** ptrue (p[0-7])\.b, vl256 ++** lasta h0, \1, z0\.h ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER_BF16 (bf16, __SVBfloat16_t) ++ ++/* ++** caller_s32: ++** ... ++** bl callee_s32 ++** ptrue (p[0-7])\.b, vl256 ++** saddv (d[0-9]+), \1, z0\.s ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (s32, __SVInt32_t) ++ ++/* ++** caller_u32: ++** ... ++** bl callee_u32 ++** ptrue (p[0-7])\.b, vl256 ++** uaddv (d[0-9]+), \1, z0\.s ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (u32, __SVUint32_t) ++ ++/* ++** caller_f32: ++** ... ++** bl callee_f32 ++** ptrue (p[0-7])\.b, vl256 ++** faddv s0, \1, z0\.s ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (f32, __SVFloat32_t) ++ ++/* ++** caller_s64: ++** ... ++** bl callee_s64 ++** ptrue (p[0-7])\.b, vl256 ++** uaddv (d[0-9]+), \1, z0\.d ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (s64, __SVInt64_t) ++ ++/* ++** caller_u64: ++** ... ++** bl callee_u64 ++** ptrue (p[0-7])\.b, vl256 ++** uaddv (d[0-9]+), \1, z0\.d ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (u64, __SVUint64_t) ++ ++/* ++** caller_f64: ++** ... 
++** bl callee_f64 ++** ptrue (p[0-7])\.b, vl256 ++** faddv d0, \1, z0\.d ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (f64, __SVFloat64_t) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_4_256.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_4_256.c +new file mode 100644 +index 000000000..1e0f6bb96 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_4_256.c +@@ -0,0 +1,264 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -msve-vector-bits=256 -g" } */ ++/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ ++ ++#define CALLEE(SUFFIX, TYPE) \ ++ TYPE __attribute__((noipa)) \ ++ callee_##SUFFIX (TYPE *ptr) \ ++ { \ ++ return *ptr; \ ++ } ++ ++/* ++** callee_s8: ++** ptrue (p[0-7])\.b, vl32 ++** ld1b z0\.b, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (s8, __SVInt8_t) ++ ++/* ++** callee_u8: ++** ptrue (p[0-7])\.b, vl32 ++** ld1b z0\.b, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (u8, __SVUint8_t) ++ ++/* ++** callee_s16: ++** ptrue (p[0-7])\.b, vl32 ++** ld1h z0\.h, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (s16, __SVInt16_t) ++ ++/* ++** callee_u16: ++** ptrue (p[0-7])\.b, vl32 ++** ld1h z0\.h, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (u16, __SVUint16_t) ++ ++/* ++** callee_f16: ++** ptrue (p[0-7])\.b, vl32 ++** ld1h z0\.h, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (f16, __SVFloat16_t) ++ ++/* ++** callee_bf16: ++** ptrue (p[0-7])\.b, vl32 ++** ld1h z0\.h, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (bf16, __SVBfloat16_t) ++ ++/* ++** callee_s32: ++** ptrue (p[0-7])\.b, vl32 ++** ld1w z0\.s, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (s32, __SVInt32_t) ++ ++/* ++** callee_u32: ++** ptrue (p[0-7])\.b, vl32 ++** ld1w z0\.s, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (u32, __SVUint32_t) ++ ++/* ++** callee_f32: ++** ptrue (p[0-7])\.b, vl32 ++** ld1w z0\.s, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (f32, __SVFloat32_t) ++ ++/* ++** callee_s64: ++** ptrue (p[0-7])\.b, vl32 ++** ld1d z0\.d, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (s64, __SVInt64_t) ++ ++/* ++** callee_u64: ++** ptrue (p[0-7])\.b, vl32 ++** ld1d z0\.d, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (u64, __SVUint64_t) ++ ++/* ++** callee_f64: ++** ptrue (p[0-7])\.b, vl32 ++** ld1d z0\.d, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (f64, __SVFloat64_t) ++ ++#include ++ ++#define CALLER(SUFFIX, TYPE) \ ++ typeof (svaddv (svptrue_b8 (), *(TYPE *) 0)) \ ++ __attribute__((noipa)) \ ++ caller_##SUFFIX (TYPE *ptr1) \ ++ { \ ++ return svaddv (svptrue_b8 (), callee_##SUFFIX (ptr1)); \ ++ } ++ ++#define CALLER_BF16(SUFFIX, TYPE) \ ++ typeof (svlasta (svptrue_b8 (), *(TYPE *) 0)) \ ++ __attribute__((noipa)) \ ++ caller_##SUFFIX (TYPE *ptr1) \ ++ { \ ++ return svlasta (svptrue_b8 (), callee_##SUFFIX (ptr1)); \ ++ } ++ ++/* ++** caller_s8: ++** ... ++** bl callee_s8 ++** ptrue (p[0-7])\.b, vl32 ++** saddv (d[0-9]+), \1, z0\.b ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (s8, __SVInt8_t) ++ ++/* ++** caller_u8: ++** ... ++** bl callee_u8 ++** ptrue (p[0-7])\.b, vl32 ++** uaddv (d[0-9]+), \1, z0\.b ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (u8, __SVUint8_t) ++ ++/* ++** caller_s16: ++** ... ++** bl callee_s16 ++** ptrue (p[0-7])\.b, vl32 ++** saddv (d[0-9]+), \1, z0\.h ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (s16, __SVInt16_t) ++ ++/* ++** caller_u16: ++** ... ++** bl callee_u16 ++** ptrue (p[0-7])\.b, vl32 ++** uaddv (d[0-9]+), \1, z0\.h ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (u16, __SVUint16_t) ++ ++/* ++** caller_f16: ++** ... 
++** bl callee_f16 ++** ptrue (p[0-7])\.b, vl32 ++** faddv h0, \1, z0\.h ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (f16, __SVFloat16_t) ++ ++/* ++** caller_bf16: ++** ... ++** bl callee_bf16 ++** ptrue (p[0-7])\.b, vl32 ++** lasta h0, \1, z0\.h ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER_BF16 (bf16, __SVBfloat16_t) ++ ++/* ++** caller_s32: ++** ... ++** bl callee_s32 ++** ptrue (p[0-7])\.b, vl32 ++** saddv (d[0-9]+), \1, z0\.s ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (s32, __SVInt32_t) ++ ++/* ++** caller_u32: ++** ... ++** bl callee_u32 ++** ptrue (p[0-7])\.b, vl32 ++** uaddv (d[0-9]+), \1, z0\.s ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (u32, __SVUint32_t) ++ ++/* ++** caller_f32: ++** ... ++** bl callee_f32 ++** ptrue (p[0-7])\.b, vl32 ++** faddv s0, \1, z0\.s ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (f32, __SVFloat32_t) ++ ++/* ++** caller_s64: ++** ... ++** bl callee_s64 ++** ptrue (p[0-7])\.b, vl32 ++** uaddv (d[0-9]+), \1, z0\.d ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (s64, __SVInt64_t) ++ ++/* ++** caller_u64: ++** ... ++** bl callee_u64 ++** ptrue (p[0-7])\.b, vl32 ++** uaddv (d[0-9]+), \1, z0\.d ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (u64, __SVUint64_t) ++ ++/* ++** caller_f64: ++** ... ++** bl callee_f64 ++** ptrue (p[0-7])\.b, vl32 ++** faddv d0, \1, z0\.d ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (f64, __SVFloat64_t) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_4_512.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_4_512.c +new file mode 100644 +index 000000000..5b58ed734 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_4_512.c +@@ -0,0 +1,264 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -msve-vector-bits=512 -g" } */ ++/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ ++ ++#define CALLEE(SUFFIX, TYPE) \ ++ TYPE __attribute__((noipa)) \ ++ callee_##SUFFIX (TYPE *ptr) \ ++ { \ ++ return *ptr; \ ++ } ++ ++/* ++** callee_s8: ++** ptrue (p[0-7])\.b, vl64 ++** ld1b z0\.b, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (s8, __SVInt8_t) ++ ++/* ++** callee_u8: ++** ptrue (p[0-7])\.b, vl64 ++** ld1b z0\.b, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (u8, __SVUint8_t) ++ ++/* ++** callee_s16: ++** ptrue (p[0-7])\.b, vl64 ++** ld1h z0\.h, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (s16, __SVInt16_t) ++ ++/* ++** callee_u16: ++** ptrue (p[0-7])\.b, vl64 ++** ld1h z0\.h, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (u16, __SVUint16_t) ++ ++/* ++** callee_f16: ++** ptrue (p[0-7])\.b, vl64 ++** ld1h z0\.h, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (f16, __SVFloat16_t) ++ ++/* ++** callee_bf16: ++** ptrue (p[0-7])\.b, vl64 ++** ld1h z0\.h, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (bf16, __SVBfloat16_t) ++ ++/* ++** callee_s32: ++** ptrue (p[0-7])\.b, vl64 ++** ld1w z0\.s, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (s32, __SVInt32_t) ++ ++/* ++** callee_u32: ++** ptrue (p[0-7])\.b, vl64 ++** ld1w z0\.s, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (u32, __SVUint32_t) ++ ++/* ++** callee_f32: ++** ptrue (p[0-7])\.b, vl64 ++** ld1w z0\.s, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (f32, __SVFloat32_t) ++ ++/* ++** callee_s64: ++** ptrue (p[0-7])\.b, vl64 ++** ld1d z0\.d, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (s64, __SVInt64_t) ++ ++/* ++** callee_u64: ++** ptrue (p[0-7])\.b, vl64 ++** ld1d z0\.d, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (u64, __SVUint64_t) ++ ++/* ++** callee_f64: ++** ptrue (p[0-7])\.b, vl64 ++** ld1d z0\.d, \1/z, \[x0\] ++** 
ret ++*/ ++CALLEE (f64, __SVFloat64_t) ++ ++#include ++ ++#define CALLER(SUFFIX, TYPE) \ ++ typeof (svaddv (svptrue_b8 (), *(TYPE *) 0)) \ ++ __attribute__((noipa)) \ ++ caller_##SUFFIX (TYPE *ptr1) \ ++ { \ ++ return svaddv (svptrue_b8 (), callee_##SUFFIX (ptr1)); \ ++ } ++ ++#define CALLER_BF16(SUFFIX, TYPE) \ ++ typeof (svlasta (svptrue_b8 (), *(TYPE *) 0)) \ ++ __attribute__((noipa)) \ ++ caller_##SUFFIX (TYPE *ptr1) \ ++ { \ ++ return svlasta (svptrue_b8 (), callee_##SUFFIX (ptr1)); \ ++ } ++ ++/* ++** caller_s8: ++** ... ++** bl callee_s8 ++** ptrue (p[0-7])\.b, vl64 ++** saddv (d[0-9]+), \1, z0\.b ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (s8, __SVInt8_t) ++ ++/* ++** caller_u8: ++** ... ++** bl callee_u8 ++** ptrue (p[0-7])\.b, vl64 ++** uaddv (d[0-9]+), \1, z0\.b ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (u8, __SVUint8_t) ++ ++/* ++** caller_s16: ++** ... ++** bl callee_s16 ++** ptrue (p[0-7])\.b, vl64 ++** saddv (d[0-9]+), \1, z0\.h ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (s16, __SVInt16_t) ++ ++/* ++** caller_u16: ++** ... ++** bl callee_u16 ++** ptrue (p[0-7])\.b, vl64 ++** uaddv (d[0-9]+), \1, z0\.h ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (u16, __SVUint16_t) ++ ++/* ++** caller_f16: ++** ... ++** bl callee_f16 ++** ptrue (p[0-7])\.b, vl64 ++** faddv h0, \1, z0\.h ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (f16, __SVFloat16_t) ++ ++/* ++** caller_bf16: ++** ... ++** bl callee_bf16 ++** ptrue (p[0-7])\.b, vl64 ++** lasta h0, \1, z0\.h ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER_BF16 (bf16, __SVBfloat16_t) ++ ++/* ++** caller_s32: ++** ... ++** bl callee_s32 ++** ptrue (p[0-7])\.b, vl64 ++** saddv (d[0-9]+), \1, z0\.s ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (s32, __SVInt32_t) ++ ++/* ++** caller_u32: ++** ... ++** bl callee_u32 ++** ptrue (p[0-7])\.b, vl64 ++** uaddv (d[0-9]+), \1, z0\.s ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (u32, __SVUint32_t) ++ ++/* ++** caller_f32: ++** ... ++** bl callee_f32 ++** ptrue (p[0-7])\.b, vl64 ++** faddv s0, \1, z0\.s ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (f32, __SVFloat32_t) ++ ++/* ++** caller_s64: ++** ... ++** bl callee_s64 ++** ptrue (p[0-7])\.b, vl64 ++** uaddv (d[0-9]+), \1, z0\.d ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (s64, __SVInt64_t) ++ ++/* ++** caller_u64: ++** ... ++** bl callee_u64 ++** ptrue (p[0-7])\.b, vl64 ++** uaddv (d[0-9]+), \1, z0\.d ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (u64, __SVUint64_t) ++ ++/* ++** caller_f64: ++** ... 
++** bl callee_f64 ++** ptrue (p[0-7])\.b, vl64 ++** faddv d0, \1, z0\.d ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (f64, __SVFloat64_t) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_5.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_5.c +new file mode 100644 +index 000000000..55c78e16f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_5.c +@@ -0,0 +1,264 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -g" } */ ++/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ ++ ++#include ++ ++#define CALLEE(SUFFIX, TYPE) \ ++ TYPE __attribute__((noipa)) \ ++ callee_##SUFFIX (TYPE *ptr) \ ++ { \ ++ return *ptr; \ ++ } ++ ++/* ++** callee_s8: ++** ptrue (p[0-7])\.b, all ++** ld1b z0\.b, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (s8, svint8_t) ++ ++/* ++** callee_u8: ++** ptrue (p[0-7])\.b, all ++** ld1b z0\.b, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (u8, svuint8_t) ++ ++/* ++** callee_s16: ++** ptrue (p[0-7])\.b, all ++** ld1h z0\.h, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (s16, svint16_t) ++ ++/* ++** callee_u16: ++** ptrue (p[0-7])\.b, all ++** ld1h z0\.h, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (u16, svuint16_t) ++ ++/* ++** callee_f16: ++** ptrue (p[0-7])\.b, all ++** ld1h z0\.h, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (f16, svfloat16_t) ++ ++/* ++** callee_bf16: ++** ptrue (p[0-7])\.b, all ++** ld1h z0\.h, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (bf16, svbfloat16_t) ++ ++/* ++** callee_s32: ++** ptrue (p[0-7])\.b, all ++** ld1w z0\.s, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (s32, svint32_t) ++ ++/* ++** callee_u32: ++** ptrue (p[0-7])\.b, all ++** ld1w z0\.s, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (u32, svuint32_t) ++ ++/* ++** callee_f32: ++** ptrue (p[0-7])\.b, all ++** ld1w z0\.s, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (f32, svfloat32_t) ++ ++/* ++** callee_s64: ++** ptrue (p[0-7])\.b, all ++** ld1d z0\.d, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (s64, svint64_t) ++ ++/* ++** callee_u64: ++** ptrue (p[0-7])\.b, all ++** ld1d z0\.d, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (u64, svuint64_t) ++ ++/* ++** callee_f64: ++** ptrue (p[0-7])\.b, all ++** ld1d z0\.d, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (f64, svfloat64_t) ++ ++#define CALLER(SUFFIX, TYPE) \ ++ typeof (svaddv (svptrue_b8 (), *(TYPE *) 0)) \ ++ __attribute__((noipa)) \ ++ caller_##SUFFIX (TYPE *ptr1) \ ++ { \ ++ return svaddv (svptrue_b8 (), callee_##SUFFIX (ptr1)); \ ++ } ++ ++#define CALLER_BF16(SUFFIX, TYPE) \ ++ typeof (svlasta (svptrue_b8 (), *(TYPE *) 0)) \ ++ __attribute__((noipa)) \ ++ caller_##SUFFIX (TYPE *ptr1) \ ++ { \ ++ return svlasta (svptrue_b8 (), callee_##SUFFIX (ptr1)); \ ++ } ++ ++/* ++** caller_s8: ++** ... ++** bl callee_s8 ++** ptrue (p[0-7])\.b, all ++** saddv (d[0-9]+), \1, z0\.b ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (s8, svint8_t) ++ ++/* ++** caller_u8: ++** ... ++** bl callee_u8 ++** ptrue (p[0-7])\.b, all ++** uaddv (d[0-9]+), \1, z0\.b ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (u8, svuint8_t) ++ ++/* ++** caller_s16: ++** ... ++** bl callee_s16 ++** ptrue (p[0-7])\.b, all ++** saddv (d[0-9]+), \1, z0\.h ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (s16, svint16_t) ++ ++/* ++** caller_u16: ++** ... ++** bl callee_u16 ++** ptrue (p[0-7])\.b, all ++** uaddv (d[0-9]+), \1, z0\.h ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (u16, svuint16_t) ++ ++/* ++** caller_f16: ++** ... 
++** bl callee_f16 ++** ptrue (p[0-7])\.b, all ++** faddv h0, \1, z0\.h ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (f16, svfloat16_t) ++ ++/* ++** caller_bf16: ++** ... ++** bl callee_bf16 ++** ptrue (p[0-7])\.b, all ++** lasta h0, \1, z0\.h ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER_BF16 (bf16, svbfloat16_t) ++ ++/* ++** caller_s32: ++** ... ++** bl callee_s32 ++** ptrue (p[0-7])\.b, all ++** saddv (d[0-9]+), \1, z0\.s ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (s32, svint32_t) ++ ++/* ++** caller_u32: ++** ... ++** bl callee_u32 ++** ptrue (p[0-7])\.b, all ++** uaddv (d[0-9]+), \1, z0\.s ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (u32, svuint32_t) ++ ++/* ++** caller_f32: ++** ... ++** bl callee_f32 ++** ptrue (p[0-7])\.b, all ++** faddv s0, \1, z0\.s ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (f32, svfloat32_t) ++ ++/* ++** caller_s64: ++** ... ++** bl callee_s64 ++** ptrue (p[0-7])\.b, all ++** uaddv (d[0-9]+), \1, z0\.d ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (s64, svint64_t) ++ ++/* ++** caller_u64: ++** ... ++** bl callee_u64 ++** ptrue (p[0-7])\.b, all ++** uaddv (d[0-9]+), \1, z0\.d ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (u64, svuint64_t) ++ ++/* ++** caller_f64: ++** ... ++** bl callee_f64 ++** ptrue (p[0-7])\.b, all ++** faddv d0, \1, z0\.d ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (f64, svfloat64_t) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_5_1024.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_5_1024.c +new file mode 100644 +index 000000000..52e9916d8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_5_1024.c +@@ -0,0 +1,264 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -msve-vector-bits=1024 -g" } */ ++/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ ++ ++#include ++ ++#define CALLEE(SUFFIX, TYPE) \ ++ TYPE __attribute__((noipa)) \ ++ callee_##SUFFIX (TYPE *ptr) \ ++ { \ ++ return *ptr; \ ++ } ++ ++/* ++** callee_s8: ++** ptrue (p[0-7])\.b, vl128 ++** ld1b z0\.b, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (s8, svint8_t) ++ ++/* ++** callee_u8: ++** ptrue (p[0-7])\.b, vl128 ++** ld1b z0\.b, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (u8, svuint8_t) ++ ++/* ++** callee_s16: ++** ptrue (p[0-7])\.b, vl128 ++** ld1h z0\.h, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (s16, svint16_t) ++ ++/* ++** callee_u16: ++** ptrue (p[0-7])\.b, vl128 ++** ld1h z0\.h, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (u16, svuint16_t) ++ ++/* ++** callee_f16: ++** ptrue (p[0-7])\.b, vl128 ++** ld1h z0\.h, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (f16, svfloat16_t) ++ ++/* ++** callee_bf16: ++** ptrue (p[0-7])\.b, vl128 ++** ld1h z0\.h, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (bf16, svbfloat16_t) ++ ++/* ++** callee_s32: ++** ptrue (p[0-7])\.b, vl128 ++** ld1w z0\.s, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (s32, svint32_t) ++ ++/* ++** callee_u32: ++** ptrue (p[0-7])\.b, vl128 ++** ld1w z0\.s, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (u32, svuint32_t) ++ ++/* ++** callee_f32: ++** ptrue (p[0-7])\.b, vl128 ++** ld1w z0\.s, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (f32, svfloat32_t) ++ ++/* ++** callee_s64: ++** ptrue (p[0-7])\.b, vl128 ++** ld1d z0\.d, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (s64, svint64_t) ++ ++/* ++** callee_u64: ++** ptrue (p[0-7])\.b, vl128 ++** ld1d z0\.d, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (u64, svuint64_t) ++ ++/* ++** callee_f64: ++** ptrue (p[0-7])\.b, vl128 ++** ld1d z0\.d, \1/z, \[x0\] ++** ret ++*/ ++CALLEE 
(f64, svfloat64_t) ++ ++#define CALLER(SUFFIX, TYPE) \ ++ typeof (svaddv (svptrue_b8 (), *(TYPE *) 0)) \ ++ __attribute__((noipa)) \ ++ caller_##SUFFIX (TYPE *ptr1) \ ++ { \ ++ return svaddv (svptrue_b8 (), callee_##SUFFIX (ptr1)); \ ++ } ++ ++#define CALLER_BF16(SUFFIX, TYPE) \ ++ typeof (svlasta (svptrue_b8 (), *(TYPE *) 0)) \ ++ __attribute__((noipa)) \ ++ caller_##SUFFIX (TYPE *ptr1) \ ++ { \ ++ return svlasta (svptrue_b8 (), callee_##SUFFIX (ptr1)); \ ++ } ++ ++/* ++** caller_s8: ++** ... ++** bl callee_s8 ++** ptrue (p[0-7])\.b, vl128 ++** saddv (d[0-9]+), \1, z0\.b ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (s8, svint8_t) ++ ++/* ++** caller_u8: ++** ... ++** bl callee_u8 ++** ptrue (p[0-7])\.b, vl128 ++** uaddv (d[0-9]+), \1, z0\.b ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (u8, svuint8_t) ++ ++/* ++** caller_s16: ++** ... ++** bl callee_s16 ++** ptrue (p[0-7])\.b, vl128 ++** saddv (d[0-9]+), \1, z0\.h ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (s16, svint16_t) ++ ++/* ++** caller_u16: ++** ... ++** bl callee_u16 ++** ptrue (p[0-7])\.b, vl128 ++** uaddv (d[0-9]+), \1, z0\.h ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (u16, svuint16_t) ++ ++/* ++** caller_f16: ++** ... ++** bl callee_f16 ++** ptrue (p[0-7])\.b, vl128 ++** faddv h0, \1, z0\.h ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (f16, svfloat16_t) ++ ++/* ++** caller_bf16: ++** ... ++** bl callee_bf16 ++** ptrue (p[0-7])\.b, vl128 ++** lasta h0, \1, z0\.h ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER_BF16 (bf16, svbfloat16_t) ++ ++/* ++** caller_s32: ++** ... ++** bl callee_s32 ++** ptrue (p[0-7])\.b, vl128 ++** saddv (d[0-9]+), \1, z0\.s ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (s32, svint32_t) ++ ++/* ++** caller_u32: ++** ... ++** bl callee_u32 ++** ptrue (p[0-7])\.b, vl128 ++** uaddv (d[0-9]+), \1, z0\.s ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (u32, svuint32_t) ++ ++/* ++** caller_f32: ++** ... ++** bl callee_f32 ++** ptrue (p[0-7])\.b, vl128 ++** faddv s0, \1, z0\.s ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (f32, svfloat32_t) ++ ++/* ++** caller_s64: ++** ... ++** bl callee_s64 ++** ptrue (p[0-7])\.b, vl128 ++** uaddv (d[0-9]+), \1, z0\.d ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (s64, svint64_t) ++ ++/* ++** caller_u64: ++** ... ++** bl callee_u64 ++** ptrue (p[0-7])\.b, vl128 ++** uaddv (d[0-9]+), \1, z0\.d ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (u64, svuint64_t) ++ ++/* ++** caller_f64: ++** ... 
++** bl callee_f64 ++** ptrue (p[0-7])\.b, vl128 ++** faddv d0, \1, z0\.d ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (f64, svfloat64_t) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_5_2048.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_5_2048.c +new file mode 100644 +index 000000000..6f37d9d6c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_5_2048.c +@@ -0,0 +1,264 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -msve-vector-bits=2048 -g" } */ ++/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ ++ ++#include ++ ++#define CALLEE(SUFFIX, TYPE) \ ++ TYPE __attribute__((noipa)) \ ++ callee_##SUFFIX (TYPE *ptr) \ ++ { \ ++ return *ptr; \ ++ } ++ ++/* ++** callee_s8: ++** ptrue (p[0-7])\.b, vl256 ++** ld1b z0\.b, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (s8, svint8_t) ++ ++/* ++** callee_u8: ++** ptrue (p[0-7])\.b, vl256 ++** ld1b z0\.b, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (u8, svuint8_t) ++ ++/* ++** callee_s16: ++** ptrue (p[0-7])\.b, vl256 ++** ld1h z0\.h, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (s16, svint16_t) ++ ++/* ++** callee_u16: ++** ptrue (p[0-7])\.b, vl256 ++** ld1h z0\.h, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (u16, svuint16_t) ++ ++/* ++** callee_f16: ++** ptrue (p[0-7])\.b, vl256 ++** ld1h z0\.h, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (f16, svfloat16_t) ++ ++/* ++** callee_bf16: ++** ptrue (p[0-7])\.b, vl256 ++** ld1h z0\.h, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (bf16, svbfloat16_t) ++ ++/* ++** callee_s32: ++** ptrue (p[0-7])\.b, vl256 ++** ld1w z0\.s, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (s32, svint32_t) ++ ++/* ++** callee_u32: ++** ptrue (p[0-7])\.b, vl256 ++** ld1w z0\.s, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (u32, svuint32_t) ++ ++/* ++** callee_f32: ++** ptrue (p[0-7])\.b, vl256 ++** ld1w z0\.s, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (f32, svfloat32_t) ++ ++/* ++** callee_s64: ++** ptrue (p[0-7])\.b, vl256 ++** ld1d z0\.d, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (s64, svint64_t) ++ ++/* ++** callee_u64: ++** ptrue (p[0-7])\.b, vl256 ++** ld1d z0\.d, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (u64, svuint64_t) ++ ++/* ++** callee_f64: ++** ptrue (p[0-7])\.b, vl256 ++** ld1d z0\.d, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (f64, svfloat64_t) ++ ++#define CALLER(SUFFIX, TYPE) \ ++ typeof (svaddv (svptrue_b8 (), *(TYPE *) 0)) \ ++ __attribute__((noipa)) \ ++ caller_##SUFFIX (TYPE *ptr1) \ ++ { \ ++ return svaddv (svptrue_b8 (), callee_##SUFFIX (ptr1)); \ ++ } ++ ++#define CALLER_BF16(SUFFIX, TYPE) \ ++ typeof (svlasta (svptrue_b8 (), *(TYPE *) 0)) \ ++ __attribute__((noipa)) \ ++ caller_##SUFFIX (TYPE *ptr1) \ ++ { \ ++ return svlasta (svptrue_b8 (), callee_##SUFFIX (ptr1)); \ ++ } ++ ++/* ++** caller_s8: ++** ... ++** bl callee_s8 ++** ptrue (p[0-7])\.b, vl256 ++** saddv (d[0-9]+), \1, z0\.b ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (s8, svint8_t) ++ ++/* ++** caller_u8: ++** ... ++** bl callee_u8 ++** ptrue (p[0-7])\.b, vl256 ++** uaddv (d[0-9]+), \1, z0\.b ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (u8, svuint8_t) ++ ++/* ++** caller_s16: ++** ... ++** bl callee_s16 ++** ptrue (p[0-7])\.b, vl256 ++** saddv (d[0-9]+), \1, z0\.h ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (s16, svint16_t) ++ ++/* ++** caller_u16: ++** ... ++** bl callee_u16 ++** ptrue (p[0-7])\.b, vl256 ++** uaddv (d[0-9]+), \1, z0\.h ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (u16, svuint16_t) ++ ++/* ++** caller_f16: ++** ... 
++** bl callee_f16 ++** ptrue (p[0-7])\.b, vl256 ++** faddv h0, \1, z0\.h ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (f16, svfloat16_t) ++ ++/* ++** caller_bf16: ++** ... ++** bl callee_bf16 ++** ptrue (p[0-7])\.b, vl256 ++** lasta h0, \1, z0\.h ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER_BF16 (bf16, svbfloat16_t) ++ ++/* ++** caller_s32: ++** ... ++** bl callee_s32 ++** ptrue (p[0-7])\.b, vl256 ++** saddv (d[0-9]+), \1, z0\.s ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (s32, svint32_t) ++ ++/* ++** caller_u32: ++** ... ++** bl callee_u32 ++** ptrue (p[0-7])\.b, vl256 ++** uaddv (d[0-9]+), \1, z0\.s ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (u32, svuint32_t) ++ ++/* ++** caller_f32: ++** ... ++** bl callee_f32 ++** ptrue (p[0-7])\.b, vl256 ++** faddv s0, \1, z0\.s ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (f32, svfloat32_t) ++ ++/* ++** caller_s64: ++** ... ++** bl callee_s64 ++** ptrue (p[0-7])\.b, vl256 ++** uaddv (d[0-9]+), \1, z0\.d ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (s64, svint64_t) ++ ++/* ++** caller_u64: ++** ... ++** bl callee_u64 ++** ptrue (p[0-7])\.b, vl256 ++** uaddv (d[0-9]+), \1, z0\.d ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (u64, svuint64_t) ++ ++/* ++** caller_f64: ++** ... ++** bl callee_f64 ++** ptrue (p[0-7])\.b, vl256 ++** faddv d0, \1, z0\.d ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (f64, svfloat64_t) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_5_256.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_5_256.c +new file mode 100644 +index 000000000..7ba094e16 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_5_256.c +@@ -0,0 +1,264 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -msve-vector-bits=256 -g" } */ ++/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ ++ ++#include ++ ++#define CALLEE(SUFFIX, TYPE) \ ++ TYPE __attribute__((noipa)) \ ++ callee_##SUFFIX (TYPE *ptr) \ ++ { \ ++ return *ptr; \ ++ } ++ ++/* ++** callee_s8: ++** ptrue (p[0-7])\.b, vl32 ++** ld1b z0\.b, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (s8, svint8_t) ++ ++/* ++** callee_u8: ++** ptrue (p[0-7])\.b, vl32 ++** ld1b z0\.b, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (u8, svuint8_t) ++ ++/* ++** callee_s16: ++** ptrue (p[0-7])\.b, vl32 ++** ld1h z0\.h, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (s16, svint16_t) ++ ++/* ++** callee_u16: ++** ptrue (p[0-7])\.b, vl32 ++** ld1h z0\.h, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (u16, svuint16_t) ++ ++/* ++** callee_f16: ++** ptrue (p[0-7])\.b, vl32 ++** ld1h z0\.h, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (f16, svfloat16_t) ++ ++/* ++** callee_bf16: ++** ptrue (p[0-7])\.b, vl32 ++** ld1h z0\.h, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (bf16, svbfloat16_t) ++ ++/* ++** callee_s32: ++** ptrue (p[0-7])\.b, vl32 ++** ld1w z0\.s, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (s32, svint32_t) ++ ++/* ++** callee_u32: ++** ptrue (p[0-7])\.b, vl32 ++** ld1w z0\.s, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (u32, svuint32_t) ++ ++/* ++** callee_f32: ++** ptrue (p[0-7])\.b, vl32 ++** ld1w z0\.s, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (f32, svfloat32_t) ++ ++/* ++** callee_s64: ++** ptrue (p[0-7])\.b, vl32 ++** ld1d z0\.d, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (s64, svint64_t) ++ ++/* ++** callee_u64: ++** ptrue (p[0-7])\.b, vl32 ++** ld1d z0\.d, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (u64, svuint64_t) ++ ++/* ++** callee_f64: ++** ptrue (p[0-7])\.b, vl32 ++** ld1d z0\.d, \1/z, \[x0\] ++** ret ++*/ ++CALLEE 
(f64, svfloat64_t) ++ ++#define CALLER(SUFFIX, TYPE) \ ++ typeof (svaddv (svptrue_b8 (), *(TYPE *) 0)) \ ++ __attribute__((noipa)) \ ++ caller_##SUFFIX (TYPE *ptr1) \ ++ { \ ++ return svaddv (svptrue_b8 (), callee_##SUFFIX (ptr1)); \ ++ } ++ ++#define CALLER_BF16(SUFFIX, TYPE) \ ++ typeof (svlasta (svptrue_b8 (), *(TYPE *) 0)) \ ++ __attribute__((noipa)) \ ++ caller_##SUFFIX (TYPE *ptr1) \ ++ { \ ++ return svlasta (svptrue_b8 (), callee_##SUFFIX (ptr1)); \ ++ } ++ ++/* ++** caller_s8: ++** ... ++** bl callee_s8 ++** ptrue (p[0-7])\.b, vl32 ++** saddv (d[0-9]+), \1, z0\.b ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (s8, svint8_t) ++ ++/* ++** caller_u8: ++** ... ++** bl callee_u8 ++** ptrue (p[0-7])\.b, vl32 ++** uaddv (d[0-9]+), \1, z0\.b ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (u8, svuint8_t) ++ ++/* ++** caller_s16: ++** ... ++** bl callee_s16 ++** ptrue (p[0-7])\.b, vl32 ++** saddv (d[0-9]+), \1, z0\.h ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (s16, svint16_t) ++ ++/* ++** caller_u16: ++** ... ++** bl callee_u16 ++** ptrue (p[0-7])\.b, vl32 ++** uaddv (d[0-9]+), \1, z0\.h ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (u16, svuint16_t) ++ ++/* ++** caller_f16: ++** ... ++** bl callee_f16 ++** ptrue (p[0-7])\.b, vl32 ++** faddv h0, \1, z0\.h ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (f16, svfloat16_t) ++ ++/* ++** caller_bf16: ++** ... ++** bl callee_bf16 ++** ptrue (p[0-7])\.b, vl32 ++** lasta h0, \1, z0\.h ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER_BF16 (bf16, svbfloat16_t) ++ ++/* ++** caller_s32: ++** ... ++** bl callee_s32 ++** ptrue (p[0-7])\.b, vl32 ++** saddv (d[0-9]+), \1, z0\.s ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (s32, svint32_t) ++ ++/* ++** caller_u32: ++** ... ++** bl callee_u32 ++** ptrue (p[0-7])\.b, vl32 ++** uaddv (d[0-9]+), \1, z0\.s ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (u32, svuint32_t) ++ ++/* ++** caller_f32: ++** ... ++** bl callee_f32 ++** ptrue (p[0-7])\.b, vl32 ++** faddv s0, \1, z0\.s ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (f32, svfloat32_t) ++ ++/* ++** caller_s64: ++** ... ++** bl callee_s64 ++** ptrue (p[0-7])\.b, vl32 ++** uaddv (d[0-9]+), \1, z0\.d ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (s64, svint64_t) ++ ++/* ++** caller_u64: ++** ... ++** bl callee_u64 ++** ptrue (p[0-7])\.b, vl32 ++** uaddv (d[0-9]+), \1, z0\.d ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (u64, svuint64_t) ++ ++/* ++** caller_f64: ++** ... 
++** bl callee_f64 ++** ptrue (p[0-7])\.b, vl32 ++** faddv d0, \1, z0\.d ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (f64, svfloat64_t) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_5_512.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_5_512.c +new file mode 100644 +index 000000000..36b14d420 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_5_512.c +@@ -0,0 +1,264 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -msve-vector-bits=512 -g" } */ ++/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ ++ ++#include ++ ++#define CALLEE(SUFFIX, TYPE) \ ++ TYPE __attribute__((noipa)) \ ++ callee_##SUFFIX (TYPE *ptr) \ ++ { \ ++ return *ptr; \ ++ } ++ ++/* ++** callee_s8: ++** ptrue (p[0-7])\.b, vl64 ++** ld1b z0\.b, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (s8, svint8_t) ++ ++/* ++** callee_u8: ++** ptrue (p[0-7])\.b, vl64 ++** ld1b z0\.b, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (u8, svuint8_t) ++ ++/* ++** callee_s16: ++** ptrue (p[0-7])\.b, vl64 ++** ld1h z0\.h, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (s16, svint16_t) ++ ++/* ++** callee_u16: ++** ptrue (p[0-7])\.b, vl64 ++** ld1h z0\.h, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (u16, svuint16_t) ++ ++/* ++** callee_f16: ++** ptrue (p[0-7])\.b, vl64 ++** ld1h z0\.h, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (f16, svfloat16_t) ++ ++/* ++** callee_bf16: ++** ptrue (p[0-7])\.b, vl64 ++** ld1h z0\.h, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (bf16, svbfloat16_t) ++ ++/* ++** callee_s32: ++** ptrue (p[0-7])\.b, vl64 ++** ld1w z0\.s, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (s32, svint32_t) ++ ++/* ++** callee_u32: ++** ptrue (p[0-7])\.b, vl64 ++** ld1w z0\.s, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (u32, svuint32_t) ++ ++/* ++** callee_f32: ++** ptrue (p[0-7])\.b, vl64 ++** ld1w z0\.s, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (f32, svfloat32_t) ++ ++/* ++** callee_s64: ++** ptrue (p[0-7])\.b, vl64 ++** ld1d z0\.d, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (s64, svint64_t) ++ ++/* ++** callee_u64: ++** ptrue (p[0-7])\.b, vl64 ++** ld1d z0\.d, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (u64, svuint64_t) ++ ++/* ++** callee_f64: ++** ptrue (p[0-7])\.b, vl64 ++** ld1d z0\.d, \1/z, \[x0\] ++** ret ++*/ ++CALLEE (f64, svfloat64_t) ++ ++#define CALLER(SUFFIX, TYPE) \ ++ typeof (svaddv (svptrue_b8 (), *(TYPE *) 0)) \ ++ __attribute__((noipa)) \ ++ caller_##SUFFIX (TYPE *ptr1) \ ++ { \ ++ return svaddv (svptrue_b8 (), callee_##SUFFIX (ptr1)); \ ++ } ++ ++#define CALLER_BF16(SUFFIX, TYPE) \ ++ typeof (svlasta (svptrue_b8 (), *(TYPE *) 0)) \ ++ __attribute__((noipa)) \ ++ caller_##SUFFIX (TYPE *ptr1) \ ++ { \ ++ return svlasta (svptrue_b8 (), callee_##SUFFIX (ptr1)); \ ++ } ++ ++/* ++** caller_s8: ++** ... ++** bl callee_s8 ++** ptrue (p[0-7])\.b, vl64 ++** saddv (d[0-9]+), \1, z0\.b ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (s8, svint8_t) ++ ++/* ++** caller_u8: ++** ... ++** bl callee_u8 ++** ptrue (p[0-7])\.b, vl64 ++** uaddv (d[0-9]+), \1, z0\.b ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (u8, svuint8_t) ++ ++/* ++** caller_s16: ++** ... ++** bl callee_s16 ++** ptrue (p[0-7])\.b, vl64 ++** saddv (d[0-9]+), \1, z0\.h ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (s16, svint16_t) ++ ++/* ++** caller_u16: ++** ... ++** bl callee_u16 ++** ptrue (p[0-7])\.b, vl64 ++** uaddv (d[0-9]+), \1, z0\.h ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (u16, svuint16_t) ++ ++/* ++** caller_f16: ++** ... 
++** bl callee_f16 ++** ptrue (p[0-7])\.b, vl64 ++** faddv h0, \1, z0\.h ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (f16, svfloat16_t) ++ ++/* ++** caller_bf16: ++** ... ++** bl callee_bf16 ++** ptrue (p[0-7])\.b, vl64 ++** lasta h0, \1, z0\.h ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER_BF16 (bf16, svbfloat16_t) ++ ++/* ++** caller_s32: ++** ... ++** bl callee_s32 ++** ptrue (p[0-7])\.b, vl64 ++** saddv (d[0-9]+), \1, z0\.s ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (s32, svint32_t) ++ ++/* ++** caller_u32: ++** ... ++** bl callee_u32 ++** ptrue (p[0-7])\.b, vl64 ++** uaddv (d[0-9]+), \1, z0\.s ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (u32, svuint32_t) ++ ++/* ++** caller_f32: ++** ... ++** bl callee_f32 ++** ptrue (p[0-7])\.b, vl64 ++** faddv s0, \1, z0\.s ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (f32, svfloat32_t) ++ ++/* ++** caller_s64: ++** ... ++** bl callee_s64 ++** ptrue (p[0-7])\.b, vl64 ++** uaddv (d[0-9]+), \1, z0\.d ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (s64, svint64_t) ++ ++/* ++** caller_u64: ++** ... ++** bl callee_u64 ++** ptrue (p[0-7])\.b, vl64 ++** uaddv (d[0-9]+), \1, z0\.d ++** fmov x0, \2 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (u64, svuint64_t) ++ ++/* ++** caller_f64: ++** ... ++** bl callee_f64 ++** ptrue (p[0-7])\.b, vl64 ++** faddv d0, \1, z0\.d ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++CALLER (f64, svfloat64_t) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_6.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_6.c +new file mode 100644 +index 000000000..72468eab1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_6.c +@@ -0,0 +1,272 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -g" } */ ++/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ ++ ++#include ++ ++typedef int8_t svint8_t __attribute__ ((vector_size (32))); ++typedef uint8_t svuint8_t __attribute__ ((vector_size (32))); ++ ++typedef int16_t svint16_t __attribute__ ((vector_size (32))); ++typedef uint16_t svuint16_t __attribute__ ((vector_size (32))); ++typedef __fp16 svfloat16_t __attribute__ ((vector_size (32))); ++typedef __bf16 svbfloat16_t __attribute__ ((vector_size (32))); ++ ++typedef int32_t svint32_t __attribute__ ((vector_size (32))); ++typedef uint32_t svuint32_t __attribute__ ((vector_size (32))); ++typedef float svfloat32_t __attribute__ ((vector_size (32))); ++ ++typedef int64_t svint64_t __attribute__ ((vector_size (32))); ++typedef uint64_t svuint64_t __attribute__ ((vector_size (32))); ++typedef double svfloat64_t __attribute__ ((vector_size (32))); ++ ++#define CALLEE(SUFFIX, TYPE) \ ++ TYPE __attribute__((noipa)) \ ++ callee_##SUFFIX (TYPE *ptr) \ ++ { \ ++ return *ptr; \ ++ } ++ ++/* ++** callee_s8: ++** ( ++** ld1 ({v.*}), \[x0\] ++** st1 \1, \[x8\] ++** | ++** ldp (q[0-9]+, q[0-9]+), \[x0\] ++** stp \2, \[x8\] ++** ) ++** ret ++*/ ++CALLEE (s8, svint8_t) ++ ++/* ++** callee_u8: ++** ( ++** ld1 ({v.*}), \[x0\] ++** st1 \1, \[x8\] ++** | ++** ldp (q[0-9]+, q[0-9]+), \[x0\] ++** stp \2, \[x8\] ++** ) ++** ret ++*/ ++CALLEE (u8, svuint8_t) ++ ++/* ++** callee_s16: ++** ( ++** ld1 ({v.*}), \[x0\] ++** st1 \1, \[x8\] ++** | ++** ldp (q[0-9]+, q[0-9]+), \[x0\] ++** stp \2, \[x8\] ++** ) ++** ret ++*/ ++CALLEE (s16, svint16_t) ++ ++/* ++** callee_u16: ++** ( ++** ld1 ({v.*}), \[x0\] ++** st1 \1, \[x8\] ++** | ++** ldp (q[0-9]+, q[0-9]+), \[x0\] ++** stp \2, \[x8\] ++** ) ++** ret ++*/ ++CALLEE (u16, 
svuint16_t) ++ ++/* Currently we scalarize this. */ ++CALLEE (f16, svfloat16_t) ++ ++/* Currently we scalarize this. */ ++CALLEE (bf16, svbfloat16_t) ++ ++/* ++** callee_s32: ++** ( ++** ld1 ({v.*}), \[x0\] ++** st1 \1, \[x8\] ++** | ++** ldp (q[0-9]+, q[0-9]+), \[x0\] ++** stp \2, \[x8\] ++** ) ++** ret ++*/ ++CALLEE (s32, svint32_t) ++ ++/* ++** callee_u32: ++** ( ++** ld1 ({v.*}), \[x0\] ++** st1 \1, \[x8\] ++** | ++** ldp (q[0-9]+, q[0-9]+), \[x0\] ++** stp \2, \[x8\] ++** ) ++** ret ++*/ ++CALLEE (u32, svuint32_t) ++ ++/* Currently we scalarize this. */ ++CALLEE (f32, svfloat32_t) ++ ++/* ++** callee_s64: ++** ( ++** ld1 ({v.*}), \[x0\] ++** st1 \1, \[x8\] ++** | ++** ldp (q[0-9]+, q[0-9]+), \[x0\] ++** stp \2, \[x8\] ++** ) ++** ret ++*/ ++CALLEE (s64, svint64_t) ++ ++/* ++** callee_u64: ++** ( ++** ld1 ({v.*}), \[x0\] ++** st1 \1, \[x8\] ++** | ++** ldp (q[0-9]+, q[0-9]+), \[x0\] ++** stp \2, \[x8\] ++** ) ++** ret ++*/ ++CALLEE (u64, svuint64_t) ++ ++/* Currently we scalarize this. */ ++CALLEE (f64, svfloat64_t) ++ ++#define CALLER(SUFFIX, TYPE) \ ++ typeof ((*(TYPE *) 0)[0]) \ ++ __attribute__((noipa)) \ ++ caller_##SUFFIX (TYPE *ptr1) \ ++ { \ ++ return callee_##SUFFIX (ptr1)[0]; \ ++ } ++ ++/* ++** caller_s8: ++** ... ++** bl callee_s8 ++** ldrb w0, \[sp, 16\] ++** ldp x29, x30, \[sp\], 48 ++** ret ++*/ ++CALLER (s8, svint8_t) ++ ++/* ++** caller_u8: ++** ... ++** bl callee_u8 ++** ldrb w0, \[sp, 16\] ++** ldp x29, x30, \[sp\], 48 ++** ret ++*/ ++CALLER (u8, svuint8_t) ++ ++/* ++** caller_s16: ++** ... ++** bl callee_s16 ++** ldrh w0, \[sp, 16\] ++** ldp x29, x30, \[sp\], 48 ++** ret ++*/ ++CALLER (s16, svint16_t) ++ ++/* ++** caller_u16: ++** ... ++** bl callee_u16 ++** ldrh w0, \[sp, 16\] ++** ldp x29, x30, \[sp\], 48 ++** ret ++*/ ++CALLER (u16, svuint16_t) ++ ++/* ++** caller_f16: ++** ... ++** bl callee_f16 ++** ldr h0, \[sp, 16\] ++** ldp x29, x30, \[sp\], 48 ++** ret ++*/ ++CALLER (f16, svfloat16_t) ++ ++/* ++** caller_bf16: ++** ... ++** bl callee_bf16 ++** ldr h0, \[sp, 16\] ++** ldp x29, x30, \[sp\], 48 ++** ret ++*/ ++CALLER (bf16, svbfloat16_t) ++ ++/* ++** caller_s32: ++** ... ++** bl callee_s32 ++** ldr w0, \[sp, 16\] ++** ldp x29, x30, \[sp\], 48 ++** ret ++*/ ++CALLER (s32, svint32_t) ++ ++/* ++** caller_u32: ++** ... ++** bl callee_u32 ++** ldr w0, \[sp, 16\] ++** ldp x29, x30, \[sp\], 48 ++** ret ++*/ ++CALLER (u32, svuint32_t) ++ ++/* ++** caller_f32: ++** ... ++** bl callee_f32 ++** ldr s0, \[sp, 16\] ++** ldp x29, x30, \[sp\], 48 ++** ret ++*/ ++CALLER (f32, svfloat32_t) ++ ++/* ++** caller_s64: ++** ... ++** bl callee_s64 ++** ldr x0, \[sp, 16\] ++** ldp x29, x30, \[sp\], 48 ++** ret ++*/ ++CALLER (s64, svint64_t) ++ ++/* ++** caller_u64: ++** ... ++** bl callee_u64 ++** ldr x0, \[sp, 16\] ++** ldp x29, x30, \[sp\], 48 ++** ret ++*/ ++CALLER (u64, svuint64_t) ++ ++/* ++** caller_f64: ++** ... 
++** bl callee_f64 ++** ldr d0, \[sp, 16\] ++** ldp x29, x30, \[sp\], 48 ++** ret ++*/ ++CALLER (f64, svfloat64_t) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_6_1024.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_6_1024.c +new file mode 100644 +index 000000000..b6f267e76 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_6_1024.c +@@ -0,0 +1,287 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -msve-vector-bits=1024 -g" } */ ++/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ ++ ++#include ++ ++typedef int8_t svint8_t __attribute__ ((vector_size (128))); ++typedef uint8_t svuint8_t __attribute__ ((vector_size (128))); ++ ++typedef int16_t svint16_t __attribute__ ((vector_size (128))); ++typedef uint16_t svuint16_t __attribute__ ((vector_size (128))); ++typedef __fp16 svfloat16_t __attribute__ ((vector_size (128))); ++typedef __bf16 svbfloat16_t __attribute__ ((vector_size (128))); ++ ++typedef int32_t svint32_t __attribute__ ((vector_size (128))); ++typedef uint32_t svuint32_t __attribute__ ((vector_size (128))); ++typedef float svfloat32_t __attribute__ ((vector_size (128))); ++ ++typedef int64_t svint64_t __attribute__ ((vector_size (128))); ++typedef uint64_t svuint64_t __attribute__ ((vector_size (128))); ++typedef double svfloat64_t __attribute__ ((vector_size (128))); ++ ++#define CALLEE(SUFFIX, TYPE) \ ++ TYPE __attribute__((noipa)) \ ++ callee_##SUFFIX (TYPE *ptr) \ ++ { \ ++ return *ptr; \ ++ } ++ ++/* ++** callee_s8: ++** ptrue (p[0-7])\.b, vl128 ++** ld1b z0\.b, \1/z, \[x0\] ++** st1b z0\.b, \1, \[x8\] ++** ret ++*/ ++CALLEE (s8, svint8_t) ++ ++/* ++** callee_u8: ++** ptrue (p[0-7])\.b, vl128 ++** ld1b z0\.b, \1/z, \[x0\] ++** st1b z0\.b, \1, \[x8\] ++** ret ++*/ ++CALLEE (u8, svuint8_t) ++ ++/* ++** callee_s16: ++** ptrue (p[0-7])\.b, vl128 ++** ld1h z0\.h, \1/z, \[x0\] ++** st1h z0\.h, \1, \[x8\] ++** ret ++*/ ++CALLEE (s16, svint16_t) ++ ++/* ++** callee_u16: ++** ptrue (p[0-7])\.b, vl128 ++** ld1h z0\.h, \1/z, \[x0\] ++** st1h z0\.h, \1, \[x8\] ++** ret ++*/ ++CALLEE (u16, svuint16_t) ++ ++/* ++** callee_f16: ++** ptrue (p[0-7])\.b, vl128 ++** ld1h z0\.h, \1/z, \[x0\] ++** st1h z0\.h, \1, \[x8\] ++** ret ++*/ ++CALLEE (f16, svfloat16_t) ++ ++/* ++** callee_bf16: ++** ptrue (p[0-7])\.b, vl128 ++** ld1h z0\.h, \1/z, \[x0\] ++** st1h z0\.h, \1, \[x8\] ++** ret ++*/ ++CALLEE (bf16, svbfloat16_t) ++ ++/* ++** callee_s32: ++** ptrue (p[0-7])\.b, vl128 ++** ld1w z0\.s, \1/z, \[x0\] ++** st1w z0\.s, \1, \[x8\] ++** ret ++*/ ++CALLEE (s32, svint32_t) ++ ++/* ++** callee_u32: ++** ptrue (p[0-7])\.b, vl128 ++** ld1w z0\.s, \1/z, \[x0\] ++** st1w z0\.s, \1, \[x8\] ++** ret ++*/ ++CALLEE (u32, svuint32_t) ++ ++/* ++** callee_f32: ++** ptrue (p[0-7])\.b, vl128 ++** ld1w z0\.s, \1/z, \[x0\] ++** st1w z0\.s, \1, \[x8\] ++** ret ++*/ ++CALLEE (f32, svfloat32_t) ++ ++/* ++** callee_s64: ++** ptrue (p[0-7])\.b, vl128 ++** ld1d z0\.d, \1/z, \[x0\] ++** st1d z0\.d, \1, \[x8\] ++** ret ++*/ ++CALLEE (s64, svint64_t) ++ ++/* ++** callee_u64: ++** ptrue (p[0-7])\.b, vl128 ++** ld1d z0\.d, \1/z, \[x0\] ++** st1d z0\.d, \1, \[x8\] ++** ret ++*/ ++CALLEE (u64, svuint64_t) ++ ++/* ++** callee_f64: ++** ptrue (p[0-7])\.b, vl128 ++** ld1d z0\.d, \1/z, \[x0\] ++** st1d z0\.d, \1, \[x8\] ++** ret ++*/ ++CALLEE (f64, svfloat64_t) ++ ++#define CALLER(SUFFIX, TYPE) \ ++ void __attribute__((noipa)) \ ++ caller_##SUFFIX (TYPE *ptr1, TYPE *ptr2) \ ++ { \ ++ *ptr2 = callee_##SUFFIX (ptr1); \ ++ } ++ ++/* ++** caller_s8: ++** ... 
++** bl callee_s8 ++** ... ++** ld1b (z[0-9]+\.b), (p[0-7])/z, \[[^]]*\] ++** st1b \1, \2, \[[^]]*\] ++** ... ++** ret ++*/ ++CALLER (s8, svint8_t) ++ ++/* ++** caller_u8: ++** ... ++** bl callee_u8 ++** ... ++** ld1b (z[0-9]+\.b), (p[0-7])/z, \[[^]]*\] ++** st1b \1, \2, \[[^]]*\] ++** ... ++** ret ++*/ ++CALLER (u8, svuint8_t) ++ ++/* ++** caller_s16: ++** ... ++** bl callee_s16 ++** ... ++** ld1h (z[0-9]+\.h), (p[0-7])/z, \[[^]]*\] ++** st1h \1, \2, \[[^]]*\] ++** ... ++** ret ++*/ ++CALLER (s16, svint16_t) ++ ++/* ++** caller_u16: ++** ... ++** bl callee_u16 ++** ... ++** ld1h (z[0-9]+\.h), (p[0-7])/z, \[[^]]*\] ++** st1h \1, \2, \[[^]]*\] ++** ... ++** ret ++*/ ++CALLER (u16, svuint16_t) ++ ++/* ++** caller_f16: ++** ... ++** bl callee_f16 ++** ... ++** ld1h (z[0-9]+\.h), (p[0-7])/z, \[[^]]*\] ++** st1h \1, \2, \[[^]]*\] ++** ... ++** ret ++*/ ++CALLER (f16, svfloat16_t) ++ ++/* ++** caller_bf16: ++** ... ++** bl callee_bf16 ++** ... ++** ld1h (z[0-9]+\.h), (p[0-7])/z, \[[^]]*\] ++** st1h \1, \2, \[[^]]*\] ++** ... ++** ret ++*/ ++CALLER (bf16, svbfloat16_t) ++ ++/* ++** caller_s32: ++** ... ++** bl callee_s32 ++** ... ++** ld1w (z[0-9]+\.s), (p[0-7])/z, \[[^]]*\] ++** st1w \1, \2, \[[^]]*\] ++** ... ++** ret ++*/ ++CALLER (s32, svint32_t) ++ ++/* ++** caller_u32: ++** ... ++** bl callee_u32 ++** ... ++** ld1w (z[0-9]+\.s), (p[0-7])/z, \[[^]]*\] ++** st1w \1, \2, \[[^]]*\] ++** ... ++** ret ++*/ ++CALLER (u32, svuint32_t) ++ ++/* ++** caller_f32: ++** ... ++** bl callee_f32 ++** ... ++** ld1w (z[0-9]+\.s), (p[0-7])/z, \[[^]]*\] ++** st1w \1, \2, \[[^]]*\] ++** ... ++** ret ++*/ ++CALLER (f32, svfloat32_t) ++ ++/* ++** caller_s64: ++** ... ++** bl callee_s64 ++** ... ++** ld1d (z[0-9]+\.d), (p[0-7])/z, \[[^]]*\] ++** st1d \1, \2, \[[^]]*\] ++** ... ++** ret ++*/ ++CALLER (s64, svint64_t) ++ ++/* ++** caller_u64: ++** ... ++** bl callee_u64 ++** ... ++** ld1d (z[0-9]+\.d), (p[0-7])/z, \[[^]]*\] ++** st1d \1, \2, \[[^]]*\] ++** ... ++** ret ++*/ ++CALLER (u64, svuint64_t) ++ ++/* ++** caller_f64: ++** ... ++** bl callee_f64 ++** ... ++** ld1d (z[0-9]+\.d), (p[0-7])/z, \[[^]]*\] ++** st1d \1, \2, \[[^]]*\] ++** ... 
++** ret ++*/ ++CALLER (f64, svfloat64_t) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_6_2048.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_6_2048.c +new file mode 100644 +index 000000000..46b7d683e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_6_2048.c +@@ -0,0 +1,287 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -msve-vector-bits=2048 -g" } */ ++/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ ++ ++#include ++ ++typedef int8_t svint8_t __attribute__ ((vector_size (256))); ++typedef uint8_t svuint8_t __attribute__ ((vector_size (256))); ++ ++typedef int16_t svint16_t __attribute__ ((vector_size (256))); ++typedef uint16_t svuint16_t __attribute__ ((vector_size (256))); ++typedef __fp16 svfloat16_t __attribute__ ((vector_size (256))); ++typedef __bf16 svbfloat16_t __attribute__ ((vector_size (256))); ++ ++typedef int32_t svint32_t __attribute__ ((vector_size (256))); ++typedef uint32_t svuint32_t __attribute__ ((vector_size (256))); ++typedef float svfloat32_t __attribute__ ((vector_size (256))); ++ ++typedef int64_t svint64_t __attribute__ ((vector_size (256))); ++typedef uint64_t svuint64_t __attribute__ ((vector_size (256))); ++typedef double svfloat64_t __attribute__ ((vector_size (256))); ++ ++#define CALLEE(SUFFIX, TYPE) \ ++ TYPE __attribute__((noipa)) \ ++ callee_##SUFFIX (TYPE *ptr) \ ++ { \ ++ return *ptr; \ ++ } ++ ++/* ++** callee_s8: ++** ptrue (p[0-7])\.b, vl256 ++** ld1b z0\.b, \1/z, \[x0\] ++** st1b z0\.b, \1, \[x8\] ++** ret ++*/ ++CALLEE (s8, svint8_t) ++ ++/* ++** callee_u8: ++** ptrue (p[0-7])\.b, vl256 ++** ld1b z0\.b, \1/z, \[x0\] ++** st1b z0\.b, \1, \[x8\] ++** ret ++*/ ++CALLEE (u8, svuint8_t) ++ ++/* ++** callee_s16: ++** ptrue (p[0-7])\.b, vl256 ++** ld1h z0\.h, \1/z, \[x0\] ++** st1h z0\.h, \1, \[x8\] ++** ret ++*/ ++CALLEE (s16, svint16_t) ++ ++/* ++** callee_u16: ++** ptrue (p[0-7])\.b, vl256 ++** ld1h z0\.h, \1/z, \[x0\] ++** st1h z0\.h, \1, \[x8\] ++** ret ++*/ ++CALLEE (u16, svuint16_t) ++ ++/* ++** callee_f16: ++** ptrue (p[0-7])\.b, vl256 ++** ld1h z0\.h, \1/z, \[x0\] ++** st1h z0\.h, \1, \[x8\] ++** ret ++*/ ++CALLEE (f16, svfloat16_t) ++ ++/* ++** callee_bf16: ++** ptrue (p[0-7])\.b, vl256 ++** ld1h z0\.h, \1/z, \[x0\] ++** st1h z0\.h, \1, \[x8\] ++** ret ++*/ ++CALLEE (bf16, svbfloat16_t) ++ ++/* ++** callee_s32: ++** ptrue (p[0-7])\.b, vl256 ++** ld1w z0\.s, \1/z, \[x0\] ++** st1w z0\.s, \1, \[x8\] ++** ret ++*/ ++CALLEE (s32, svint32_t) ++ ++/* ++** callee_u32: ++** ptrue (p[0-7])\.b, vl256 ++** ld1w z0\.s, \1/z, \[x0\] ++** st1w z0\.s, \1, \[x8\] ++** ret ++*/ ++CALLEE (u32, svuint32_t) ++ ++/* ++** callee_f32: ++** ptrue (p[0-7])\.b, vl256 ++** ld1w z0\.s, \1/z, \[x0\] ++** st1w z0\.s, \1, \[x8\] ++** ret ++*/ ++CALLEE (f32, svfloat32_t) ++ ++/* ++** callee_s64: ++** ptrue (p[0-7])\.b, vl256 ++** ld1d z0\.d, \1/z, \[x0\] ++** st1d z0\.d, \1, \[x8\] ++** ret ++*/ ++CALLEE (s64, svint64_t) ++ ++/* ++** callee_u64: ++** ptrue (p[0-7])\.b, vl256 ++** ld1d z0\.d, \1/z, \[x0\] ++** st1d z0\.d, \1, \[x8\] ++** ret ++*/ ++CALLEE (u64, svuint64_t) ++ ++/* ++** callee_f64: ++** ptrue (p[0-7])\.b, vl256 ++** ld1d z0\.d, \1/z, \[x0\] ++** st1d z0\.d, \1, \[x8\] ++** ret ++*/ ++CALLEE (f64, svfloat64_t) ++ ++#define CALLER(SUFFIX, TYPE) \ ++ void __attribute__((noipa)) \ ++ caller_##SUFFIX (TYPE *ptr1, TYPE *ptr2) \ ++ { \ ++ *ptr2 = callee_##SUFFIX (ptr1); \ ++ } ++ ++/* ++** caller_s8: ++** ... ++** bl callee_s8 ++** ... 
++** ld1b (z[0-9]+\.b), (p[0-7])/z, \[[^]]*\] ++** st1b \1, \2, \[[^]]*\] ++** ... ++** ret ++*/ ++CALLER (s8, svint8_t) ++ ++/* ++** caller_u8: ++** ... ++** bl callee_u8 ++** ... ++** ld1b (z[0-9]+\.b), (p[0-7])/z, \[[^]]*\] ++** st1b \1, \2, \[[^]]*\] ++** ... ++** ret ++*/ ++CALLER (u8, svuint8_t) ++ ++/* ++** caller_s16: ++** ... ++** bl callee_s16 ++** ... ++** ld1h (z[0-9]+\.h), (p[0-7])/z, \[[^]]*\] ++** st1h \1, \2, \[[^]]*\] ++** ... ++** ret ++*/ ++CALLER (s16, svint16_t) ++ ++/* ++** caller_u16: ++** ... ++** bl callee_u16 ++** ... ++** ld1h (z[0-9]+\.h), (p[0-7])/z, \[[^]]*\] ++** st1h \1, \2, \[[^]]*\] ++** ... ++** ret ++*/ ++CALLER (u16, svuint16_t) ++ ++/* ++** caller_f16: ++** ... ++** bl callee_f16 ++** ... ++** ld1h (z[0-9]+\.h), (p[0-7])/z, \[[^]]*\] ++** st1h \1, \2, \[[^]]*\] ++** ... ++** ret ++*/ ++CALLER (f16, svfloat16_t) ++ ++/* ++** caller_bf16: ++** ... ++** bl callee_bf16 ++** ... ++** ld1h (z[0-9]+\.h), (p[0-7])/z, \[[^]]*\] ++** st1h \1, \2, \[[^]]*\] ++** ... ++** ret ++*/ ++CALLER (bf16, svbfloat16_t) ++ ++/* ++** caller_s32: ++** ... ++** bl callee_s32 ++** ... ++** ld1w (z[0-9]+\.s), (p[0-7])/z, \[[^]]*\] ++** st1w \1, \2, \[[^]]*\] ++** ... ++** ret ++*/ ++CALLER (s32, svint32_t) ++ ++/* ++** caller_u32: ++** ... ++** bl callee_u32 ++** ... ++** ld1w (z[0-9]+\.s), (p[0-7])/z, \[[^]]*\] ++** st1w \1, \2, \[[^]]*\] ++** ... ++** ret ++*/ ++CALLER (u32, svuint32_t) ++ ++/* ++** caller_f32: ++** ... ++** bl callee_f32 ++** ... ++** ld1w (z[0-9]+\.s), (p[0-7])/z, \[[^]]*\] ++** st1w \1, \2, \[[^]]*\] ++** ... ++** ret ++*/ ++CALLER (f32, svfloat32_t) ++ ++/* ++** caller_s64: ++** ... ++** bl callee_s64 ++** ... ++** ld1d (z[0-9]+\.d), (p[0-7])/z, \[[^]]*\] ++** st1d \1, \2, \[[^]]*\] ++** ... ++** ret ++*/ ++CALLER (s64, svint64_t) ++ ++/* ++** caller_u64: ++** ... ++** bl callee_u64 ++** ... ++** ld1d (z[0-9]+\.d), (p[0-7])/z, \[[^]]*\] ++** st1d \1, \2, \[[^]]*\] ++** ... ++** ret ++*/ ++CALLER (u64, svuint64_t) ++ ++/* ++** caller_f64: ++** ... ++** bl callee_f64 ++** ... ++** ld1d (z[0-9]+\.d), (p[0-7])/z, \[[^]]*\] ++** st1d \1, \2, \[[^]]*\] ++** ... 
++** ret ++*/ ++CALLER (f64, svfloat64_t) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_6_256.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_6_256.c +new file mode 100644 +index 000000000..04872493c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_6_256.c +@@ -0,0 +1,287 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -msve-vector-bits=256 -g" } */ ++/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ ++ ++#include ++ ++typedef int8_t svint8_t __attribute__ ((vector_size (32))); ++typedef uint8_t svuint8_t __attribute__ ((vector_size (32))); ++ ++typedef int16_t svint16_t __attribute__ ((vector_size (32))); ++typedef uint16_t svuint16_t __attribute__ ((vector_size (32))); ++typedef __fp16 svfloat16_t __attribute__ ((vector_size (32))); ++typedef __bf16 svbfloat16_t __attribute__ ((vector_size (32))); ++ ++typedef int32_t svint32_t __attribute__ ((vector_size (32))); ++typedef uint32_t svuint32_t __attribute__ ((vector_size (32))); ++typedef float svfloat32_t __attribute__ ((vector_size (32))); ++ ++typedef int64_t svint64_t __attribute__ ((vector_size (32))); ++typedef uint64_t svuint64_t __attribute__ ((vector_size (32))); ++typedef double svfloat64_t __attribute__ ((vector_size (32))); ++ ++#define CALLEE(SUFFIX, TYPE) \ ++ TYPE __attribute__((noipa)) \ ++ callee_##SUFFIX (TYPE *ptr) \ ++ { \ ++ return *ptr; \ ++ } ++ ++/* ++** callee_s8: ++** ptrue (p[0-7])\.b, vl32 ++** ld1b z0\.b, \1/z, \[x0\] ++** st1b z0\.b, \1, \[x8\] ++** ret ++*/ ++CALLEE (s8, svint8_t) ++ ++/* ++** callee_u8: ++** ptrue (p[0-7])\.b, vl32 ++** ld1b z0\.b, \1/z, \[x0\] ++** st1b z0\.b, \1, \[x8\] ++** ret ++*/ ++CALLEE (u8, svuint8_t) ++ ++/* ++** callee_s16: ++** ptrue (p[0-7])\.b, vl32 ++** ld1h z0\.h, \1/z, \[x0\] ++** st1h z0\.h, \1, \[x8\] ++** ret ++*/ ++CALLEE (s16, svint16_t) ++ ++/* ++** callee_u16: ++** ptrue (p[0-7])\.b, vl32 ++** ld1h z0\.h, \1/z, \[x0\] ++** st1h z0\.h, \1, \[x8\] ++** ret ++*/ ++CALLEE (u16, svuint16_t) ++ ++/* ++** callee_f16: ++** ptrue (p[0-7])\.b, vl32 ++** ld1h z0\.h, \1/z, \[x0\] ++** st1h z0\.h, \1, \[x8\] ++** ret ++*/ ++CALLEE (f16, svfloat16_t) ++ ++/* ++** callee_bf16: ++** ptrue (p[0-7])\.b, vl32 ++** ld1h z0\.h, \1/z, \[x0\] ++** st1h z0\.h, \1, \[x8\] ++** ret ++*/ ++CALLEE (bf16, svbfloat16_t) ++ ++/* ++** callee_s32: ++** ptrue (p[0-7])\.b, vl32 ++** ld1w z0\.s, \1/z, \[x0\] ++** st1w z0\.s, \1, \[x8\] ++** ret ++*/ ++CALLEE (s32, svint32_t) ++ ++/* ++** callee_u32: ++** ptrue (p[0-7])\.b, vl32 ++** ld1w z0\.s, \1/z, \[x0\] ++** st1w z0\.s, \1, \[x8\] ++** ret ++*/ ++CALLEE (u32, svuint32_t) ++ ++/* ++** callee_f32: ++** ptrue (p[0-7])\.b, vl32 ++** ld1w z0\.s, \1/z, \[x0\] ++** st1w z0\.s, \1, \[x8\] ++** ret ++*/ ++CALLEE (f32, svfloat32_t) ++ ++/* ++** callee_s64: ++** ptrue (p[0-7])\.b, vl32 ++** ld1d z0\.d, \1/z, \[x0\] ++** st1d z0\.d, \1, \[x8\] ++** ret ++*/ ++CALLEE (s64, svint64_t) ++ ++/* ++** callee_u64: ++** ptrue (p[0-7])\.b, vl32 ++** ld1d z0\.d, \1/z, \[x0\] ++** st1d z0\.d, \1, \[x8\] ++** ret ++*/ ++CALLEE (u64, svuint64_t) ++ ++/* ++** callee_f64: ++** ptrue (p[0-7])\.b, vl32 ++** ld1d z0\.d, \1/z, \[x0\] ++** st1d z0\.d, \1, \[x8\] ++** ret ++*/ ++CALLEE (f64, svfloat64_t) ++ ++#define CALLER(SUFFIX, TYPE) \ ++ void __attribute__((noipa)) \ ++ caller_##SUFFIX (TYPE *ptr1, TYPE *ptr2) \ ++ { \ ++ *ptr2 = callee_##SUFFIX (ptr1); \ ++ } ++ ++/* ++** caller_s8: ++** ... ++** bl callee_s8 ++** ... ++** ld1b (z[0-9]+\.b), (p[0-7])/z, \[[^]]*\] ++** st1b \1, \2, \[[^]]*\] ++** ... 
++** ret ++*/ ++CALLER (s8, svint8_t) ++ ++/* ++** caller_u8: ++** ... ++** bl callee_u8 ++** ... ++** ld1b (z[0-9]+\.b), (p[0-7])/z, \[[^]]*\] ++** st1b \1, \2, \[[^]]*\] ++** ... ++** ret ++*/ ++CALLER (u8, svuint8_t) ++ ++/* ++** caller_s16: ++** ... ++** bl callee_s16 ++** ... ++** ld1h (z[0-9]+\.h), (p[0-7])/z, \[[^]]*\] ++** st1h \1, \2, \[[^]]*\] ++** ... ++** ret ++*/ ++CALLER (s16, svint16_t) ++ ++/* ++** caller_u16: ++** ... ++** bl callee_u16 ++** ... ++** ld1h (z[0-9]+\.h), (p[0-7])/z, \[[^]]*\] ++** st1h \1, \2, \[[^]]*\] ++** ... ++** ret ++*/ ++CALLER (u16, svuint16_t) ++ ++/* ++** caller_f16: ++** ... ++** bl callee_f16 ++** ... ++** ld1h (z[0-9]+\.h), (p[0-7])/z, \[[^]]*\] ++** st1h \1, \2, \[[^]]*\] ++** ... ++** ret ++*/ ++CALLER (f16, svfloat16_t) ++ ++/* ++** caller_bf16: ++** ... ++** bl callee_bf16 ++** ... ++** ld1h (z[0-9]+\.h), (p[0-7])/z, \[[^]]*\] ++** st1h \1, \2, \[[^]]*\] ++** ... ++** ret ++*/ ++CALLER (bf16, svbfloat16_t) ++ ++/* ++** caller_s32: ++** ... ++** bl callee_s32 ++** ... ++** ld1w (z[0-9]+\.s), (p[0-7])/z, \[[^]]*\] ++** st1w \1, \2, \[[^]]*\] ++** ... ++** ret ++*/ ++CALLER (s32, svint32_t) ++ ++/* ++** caller_u32: ++** ... ++** bl callee_u32 ++** ... ++** ld1w (z[0-9]+\.s), (p[0-7])/z, \[[^]]*\] ++** st1w \1, \2, \[[^]]*\] ++** ... ++** ret ++*/ ++CALLER (u32, svuint32_t) ++ ++/* ++** caller_f32: ++** ... ++** bl callee_f32 ++** ... ++** ld1w (z[0-9]+\.s), (p[0-7])/z, \[[^]]*\] ++** st1w \1, \2, \[[^]]*\] ++** ... ++** ret ++*/ ++CALLER (f32, svfloat32_t) ++ ++/* ++** caller_s64: ++** ... ++** bl callee_s64 ++** ... ++** ld1d (z[0-9]+\.d), (p[0-7])/z, \[[^]]*\] ++** st1d \1, \2, \[[^]]*\] ++** ... ++** ret ++*/ ++CALLER (s64, svint64_t) ++ ++/* ++** caller_u64: ++** ... ++** bl callee_u64 ++** ... ++** ld1d (z[0-9]+\.d), (p[0-7])/z, \[[^]]*\] ++** st1d \1, \2, \[[^]]*\] ++** ... ++** ret ++*/ ++CALLER (u64, svuint64_t) ++ ++/* ++** caller_f64: ++** ... ++** bl callee_f64 ++** ... ++** ld1d (z[0-9]+\.d), (p[0-7])/z, \[[^]]*\] ++** st1d \1, \2, \[[^]]*\] ++** ... 
++** ret ++*/ ++CALLER (f64, svfloat64_t) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_6_512.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_6_512.c +new file mode 100644 +index 000000000..9817d856a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_6_512.c +@@ -0,0 +1,287 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -msve-vector-bits=512 -g" } */ ++/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ ++ ++#include ++ ++typedef int8_t svint8_t __attribute__ ((vector_size (64))); ++typedef uint8_t svuint8_t __attribute__ ((vector_size (64))); ++ ++typedef int16_t svint16_t __attribute__ ((vector_size (64))); ++typedef uint16_t svuint16_t __attribute__ ((vector_size (64))); ++typedef __fp16 svfloat16_t __attribute__ ((vector_size (64))); ++typedef __bf16 svbfloat16_t __attribute__ ((vector_size (64))); ++ ++typedef int32_t svint32_t __attribute__ ((vector_size (64))); ++typedef uint32_t svuint32_t __attribute__ ((vector_size (64))); ++typedef float svfloat32_t __attribute__ ((vector_size (64))); ++ ++typedef int64_t svint64_t __attribute__ ((vector_size (64))); ++typedef uint64_t svuint64_t __attribute__ ((vector_size (64))); ++typedef double svfloat64_t __attribute__ ((vector_size (64))); ++ ++#define CALLEE(SUFFIX, TYPE) \ ++ TYPE __attribute__((noipa)) \ ++ callee_##SUFFIX (TYPE *ptr) \ ++ { \ ++ return *ptr; \ ++ } ++ ++/* ++** callee_s8: ++** ptrue (p[0-7])\.b, vl64 ++** ld1b z0\.b, \1/z, \[x0\] ++** st1b z0\.b, \1, \[x8\] ++** ret ++*/ ++CALLEE (s8, svint8_t) ++ ++/* ++** callee_u8: ++** ptrue (p[0-7])\.b, vl64 ++** ld1b z0\.b, \1/z, \[x0\] ++** st1b z0\.b, \1, \[x8\] ++** ret ++*/ ++CALLEE (u8, svuint8_t) ++ ++/* ++** callee_s16: ++** ptrue (p[0-7])\.b, vl64 ++** ld1h z0\.h, \1/z, \[x0\] ++** st1h z0\.h, \1, \[x8\] ++** ret ++*/ ++CALLEE (s16, svint16_t) ++ ++/* ++** callee_u16: ++** ptrue (p[0-7])\.b, vl64 ++** ld1h z0\.h, \1/z, \[x0\] ++** st1h z0\.h, \1, \[x8\] ++** ret ++*/ ++CALLEE (u16, svuint16_t) ++ ++/* ++** callee_f16: ++** ptrue (p[0-7])\.b, vl64 ++** ld1h z0\.h, \1/z, \[x0\] ++** st1h z0\.h, \1, \[x8\] ++** ret ++*/ ++CALLEE (f16, svfloat16_t) ++ ++/* ++** callee_bf16: ++** ptrue (p[0-7])\.b, vl64 ++** ld1h z0\.h, \1/z, \[x0\] ++** st1h z0\.h, \1, \[x8\] ++** ret ++*/ ++CALLEE (bf16, svbfloat16_t) ++ ++/* ++** callee_s32: ++** ptrue (p[0-7])\.b, vl64 ++** ld1w z0\.s, \1/z, \[x0\] ++** st1w z0\.s, \1, \[x8\] ++** ret ++*/ ++CALLEE (s32, svint32_t) ++ ++/* ++** callee_u32: ++** ptrue (p[0-7])\.b, vl64 ++** ld1w z0\.s, \1/z, \[x0\] ++** st1w z0\.s, \1, \[x8\] ++** ret ++*/ ++CALLEE (u32, svuint32_t) ++ ++/* ++** callee_f32: ++** ptrue (p[0-7])\.b, vl64 ++** ld1w z0\.s, \1/z, \[x0\] ++** st1w z0\.s, \1, \[x8\] ++** ret ++*/ ++CALLEE (f32, svfloat32_t) ++ ++/* ++** callee_s64: ++** ptrue (p[0-7])\.b, vl64 ++** ld1d z0\.d, \1/z, \[x0\] ++** st1d z0\.d, \1, \[x8\] ++** ret ++*/ ++CALLEE (s64, svint64_t) ++ ++/* ++** callee_u64: ++** ptrue (p[0-7])\.b, vl64 ++** ld1d z0\.d, \1/z, \[x0\] ++** st1d z0\.d, \1, \[x8\] ++** ret ++*/ ++CALLEE (u64, svuint64_t) ++ ++/* ++** callee_f64: ++** ptrue (p[0-7])\.b, vl64 ++** ld1d z0\.d, \1/z, \[x0\] ++** st1d z0\.d, \1, \[x8\] ++** ret ++*/ ++CALLEE (f64, svfloat64_t) ++ ++#define CALLER(SUFFIX, TYPE) \ ++ void __attribute__((noipa)) \ ++ caller_##SUFFIX (TYPE *ptr1, TYPE *ptr2) \ ++ { \ ++ *ptr2 = callee_##SUFFIX (ptr1); \ ++ } ++ ++/* ++** caller_s8: ++** ... ++** bl callee_s8 ++** ... ++** ld1b (z[0-9]+\.b), (p[0-7])/z, \[[^]]*\] ++** st1b \1, \2, \[[^]]*\] ++** ... 
++** ret ++*/ ++CALLER (s8, svint8_t) ++ ++/* ++** caller_u8: ++** ... ++** bl callee_u8 ++** ... ++** ld1b (z[0-9]+\.b), (p[0-7])/z, \[[^]]*\] ++** st1b \1, \2, \[[^]]*\] ++** ... ++** ret ++*/ ++CALLER (u8, svuint8_t) ++ ++/* ++** caller_s16: ++** ... ++** bl callee_s16 ++** ... ++** ld1h (z[0-9]+\.h), (p[0-7])/z, \[[^]]*\] ++** st1h \1, \2, \[[^]]*\] ++** ... ++** ret ++*/ ++CALLER (s16, svint16_t) ++ ++/* ++** caller_u16: ++** ... ++** bl callee_u16 ++** ... ++** ld1h (z[0-9]+\.h), (p[0-7])/z, \[[^]]*\] ++** st1h \1, \2, \[[^]]*\] ++** ... ++** ret ++*/ ++CALLER (u16, svuint16_t) ++ ++/* ++** caller_f16: ++** ... ++** bl callee_f16 ++** ... ++** ld1h (z[0-9]+\.h), (p[0-7])/z, \[[^]]*\] ++** st1h \1, \2, \[[^]]*\] ++** ... ++** ret ++*/ ++CALLER (f16, svfloat16_t) ++ ++/* ++** caller_bf16: ++** ... ++** bl callee_bf16 ++** ... ++** ld1h (z[0-9]+\.h), (p[0-7])/z, \[[^]]*\] ++** st1h \1, \2, \[[^]]*\] ++** ... ++** ret ++*/ ++CALLER (bf16, svbfloat16_t) ++ ++/* ++** caller_s32: ++** ... ++** bl callee_s32 ++** ... ++** ld1w (z[0-9]+\.s), (p[0-7])/z, \[[^]]*\] ++** st1w \1, \2, \[[^]]*\] ++** ... ++** ret ++*/ ++CALLER (s32, svint32_t) ++ ++/* ++** caller_u32: ++** ... ++** bl callee_u32 ++** ... ++** ld1w (z[0-9]+\.s), (p[0-7])/z, \[[^]]*\] ++** st1w \1, \2, \[[^]]*\] ++** ... ++** ret ++*/ ++CALLER (u32, svuint32_t) ++ ++/* ++** caller_f32: ++** ... ++** bl callee_f32 ++** ... ++** ld1w (z[0-9]+\.s), (p[0-7])/z, \[[^]]*\] ++** st1w \1, \2, \[[^]]*\] ++** ... ++** ret ++*/ ++CALLER (f32, svfloat32_t) ++ ++/* ++** caller_s64: ++** ... ++** bl callee_s64 ++** ... ++** ld1d (z[0-9]+\.d), (p[0-7])/z, \[[^]]*\] ++** st1d \1, \2, \[[^]]*\] ++** ... ++** ret ++*/ ++CALLER (s64, svint64_t) ++ ++/* ++** caller_u64: ++** ... ++** bl callee_u64 ++** ... ++** ld1d (z[0-9]+\.d), (p[0-7])/z, \[[^]]*\] ++** st1d \1, \2, \[[^]]*\] ++** ... ++** ret ++*/ ++CALLER (u64, svuint64_t) ++ ++/* ++** caller_f64: ++** ... ++** bl callee_f64 ++** ... ++** ld1d (z[0-9]+\.d), (p[0-7])/z, \[[^]]*\] ++** st1d \1, \2, \[[^]]*\] ++** ... ++** ret ++*/ ++CALLER (f64, svfloat64_t) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_7.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_7.c +new file mode 100644 +index 000000000..55456a3b4 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_7.c +@@ -0,0 +1,341 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#include ++ ++/* ++** callee_s8: ++** mov z0\.b, #1 ++** mov z1\.b, #2 ++** ret ++*/ ++svint8x2_t __attribute__((noipa)) ++callee_s8 (void) ++{ ++ return svcreate2 (svdup_s8 (1), svdup_s8 (2)); ++} ++ ++/* ++** caller_s8: ++** ... ++** bl callee_s8 ++** trn1 z0\.b, z0\.b, z1\.b ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++svint8_t __attribute__((noipa)) ++caller_s8 (void) ++{ ++ svint8x2_t res; ++ res = callee_s8 (); ++ return svtrn1 (svget2 (res, 0), svget2 (res, 1)); ++} ++ ++/* ++** callee_u8: ++** mov z0\.b, #3 ++** mov z1\.b, #4 ++** ret ++*/ ++svuint8x2_t __attribute__((noipa)) ++callee_u8 (void) ++{ ++ return svcreate2 (svdup_u8 (3), svdup_u8 (4)); ++} ++ ++/* ++** caller_u8: ++** ... 
++** bl callee_u8 ++** trn2 z0\.b, z1\.b, z0\.b ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++svuint8_t __attribute__((noipa)) ++caller_u8 (void) ++{ ++ svuint8x2_t res; ++ res = callee_u8 (); ++ return svtrn2 (svget2 (res, 1), svget2 (res, 0)); ++} ++ ++/* ++** callee_s16: ++** mov z0\.h, #1 ++** mov z1\.h, #2 ++** ret ++*/ ++svint16x2_t __attribute__((noipa)) ++callee_s16 (void) ++{ ++ return svcreate2 (svdup_s16 (1), svdup_s16 (2)); ++} ++ ++/* ++** caller_s16: ++** ... ++** bl callee_s16 ++** trn1 z0\.h, z0\.h, z1\.h ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++svint16_t __attribute__((noipa)) ++caller_s16 (void) ++{ ++ svint16x2_t res; ++ res = callee_s16 (); ++ return svtrn1 (svget2 (res, 0), svget2 (res, 1)); ++} ++ ++/* ++** callee_u16: ++** mov z0\.h, #3 ++** mov z1\.h, #4 ++** ret ++*/ ++svuint16x2_t __attribute__((noipa)) ++callee_u16 (void) ++{ ++ return svcreate2 (svdup_u16 (3), svdup_u16 (4)); ++} ++ ++/* ++** caller_u16: ++** ... ++** bl callee_u16 ++** trn2 z0\.h, z1\.h, z0\.h ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++svuint16_t __attribute__((noipa)) ++caller_u16 (void) ++{ ++ svuint16x2_t res; ++ res = callee_u16 (); ++ return svtrn2 (svget2 (res, 1), svget2 (res, 0)); ++} ++ ++/* ++** callee_f16: ++** fmov z0\.h, #5\.0(?:e\+0)? ++** fmov z1\.h, #6\.0(?:e\+0)? ++** ret ++*/ ++svfloat16x2_t __attribute__((noipa)) ++callee_f16 (void) ++{ ++ return svcreate2 (svdup_f16 (5), svdup_f16 (6)); ++} ++ ++/* ++** caller_f16: ++** ... ++** bl callee_f16 ++** zip1 z0\.h, z1\.h, z0\.h ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++svfloat16_t __attribute__((noipa)) ++caller_f16 (void) ++{ ++ svfloat16x2_t res; ++ res = callee_f16 (); ++ return svzip1 (svget2 (res, 1), svget2 (res, 0)); ++} ++ ++/* ++** callee_bf16: ++** mov z0\.h, h2 ++** mov z1\.h, h3 ++** ret ++*/ ++svbfloat16x2_t __attribute__((noipa)) ++callee_bf16 (bfloat16_t h0, bfloat16_t h1, bfloat16_t h2, bfloat16_t h3) ++{ ++ return svcreate2 (svdup_bf16 (h2), svdup_bf16 (h3)); ++} ++ ++/* ++** caller_bf16: ++** ... ++** bl callee_bf16 ++** zip2 z0\.h, z1\.h, z0\.h ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++svbfloat16_t __attribute__((noipa)) ++caller_bf16 (bfloat16_t h0, bfloat16_t h1, bfloat16_t h2, bfloat16_t h3) ++{ ++ svbfloat16x2_t res; ++ res = callee_bf16 (h0, h1, h2, h3); ++ return svzip2 (svget2 (res, 1), svget2 (res, 0)); ++} ++ ++/* ++** callee_s32: ++** mov z0\.s, #1 ++** mov z1\.s, #2 ++** ret ++*/ ++svint32x2_t __attribute__((noipa)) ++callee_s32 (void) ++{ ++ return svcreate2 (svdup_s32 (1), svdup_s32 (2)); ++} ++ ++/* ++** caller_s32: ++** ... ++** bl callee_s32 ++** trn1 z0\.s, z0\.s, z1\.s ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++svint32_t __attribute__((noipa)) ++caller_s32 (void) ++{ ++ svint32x2_t res; ++ res = callee_s32 (); ++ return svtrn1 (svget2 (res, 0), svget2 (res, 1)); ++} ++ ++/* ++** callee_u32: ++** mov z0\.s, #3 ++** mov z1\.s, #4 ++** ret ++*/ ++svuint32x2_t __attribute__((noipa)) ++callee_u32 (void) ++{ ++ return svcreate2 (svdup_u32 (3), svdup_u32 (4)); ++} ++ ++/* ++** caller_u32: ++** ... ++** bl callee_u32 ++** trn2 z0\.s, z1\.s, z0\.s ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++svuint32_t __attribute__((noipa)) ++caller_u32 (void) ++{ ++ svuint32x2_t res; ++ res = callee_u32 (); ++ return svtrn2 (svget2 (res, 1), svget2 (res, 0)); ++} ++ ++/* ++** callee_f32: ++** fmov z0\.s, #5\.0(?:e\+0)? ++** fmov z1\.s, #6\.0(?:e\+0)? 
++** ret
++*/
++svfloat32x2_t __attribute__((noipa))
++callee_f32 (void)
++{
++ return svcreate2 (svdup_f32 (5), svdup_f32 (6));
++}
++
++/*
++** caller_f32:
++** ...
++** bl callee_f32
++** zip1 z0\.s, z1\.s, z0\.s
++** ldp x29, x30, \[sp\], 16
++** ret
++*/
++svfloat32_t __attribute__((noipa))
++caller_f32 (void)
++{
++ svfloat32x2_t res;
++ res = callee_f32 ();
++ return svzip1 (svget2 (res, 1), svget2 (res, 0));
++}
++
++/*
++** callee_s64:
++** mov z0\.d, #1
++** mov z1\.d, #2
++** ret
++*/
++svint64x2_t __attribute__((noipa))
++callee_s64 (void)
++{
++ return svcreate2 (svdup_s64 (1), svdup_s64 (2));
++}
++
++/*
++** caller_s64:
++** ...
++** bl callee_s64
++** trn1 z0\.d, z0\.d, z1\.d
++** ldp x29, x30, \[sp\], 16
++** ret
++*/
++svint64_t __attribute__((noipa))
++caller_s64 (void)
++{
++ svint64x2_t res;
++ res = callee_s64 ();
++ return svtrn1 (svget2 (res, 0), svget2 (res, 1));
++}
++
++/*
++** callee_u64:
++** mov z0\.d, #3
++** mov z1\.d, #4
++** ret
++*/
++svuint64x2_t __attribute__((noipa))
++callee_u64 (void)
++{
++ return svcreate2 (svdup_u64 (3), svdup_u64 (4));
++}
++
++/*
++** caller_u64:
++** ...
++** bl callee_u64
++** trn2 z0\.d, z1\.d, z0\.d
++** ldp x29, x30, \[sp\], 16
++** ret
++*/
++svuint64_t __attribute__((noipa))
++caller_u64 (void)
++{
++ svuint64x2_t res;
++ res = callee_u64 ();
++ return svtrn2 (svget2 (res, 1), svget2 (res, 0));
++}
++
++/*
++** callee_f64:
++** fmov z0\.d, #5\.0(?:e\+0)?
++** fmov z1\.d, #6\.0(?:e\+0)?
++** ret
++*/
++svfloat64x2_t __attribute__((noipa))
++callee_f64 (void)
++{
++ return svcreate2 (svdup_f64 (5), svdup_f64 (6));
++}
++
++/*
++** caller_f64:
++** ...
++** bl callee_f64
++** zip1 z0\.d, z1\.d, z0\.d
++** ldp x29, x30, \[sp\], 16
++** ret
++*/
++svfloat64_t __attribute__((noipa))
++caller_f64 (void)
++{
++ svfloat64x2_t res;
++ res = callee_f64 ();
++ return svzip1 (svget2 (res, 1), svget2 (res, 0));
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_8.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_8.c
+new file mode 100644
+index 000000000..9581811e7
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_8.c
+@@ -0,0 +1,375 @@
++/* { dg-do compile } */
++/* { dg-options "-O -frename-registers -g" } */
++/* { dg-final { check-function-bodies "**" "" } } */
++
++#include <arm_sve.h>
++
++/*
++** callee_s8:
++** mov z0\.b, #1
++** mov z1\.b, #2
++** mov z2\.b, #3
++** ret
++*/
++svint8x3_t __attribute__((noipa))
++callee_s8 (void)
++{
++ return svcreate3 (svdup_s8 (1), svdup_s8 (2), svdup_s8 (3));
++}
++
++/*
++** caller_s8:
++** ...
++** bl callee_s8
++** ptrue (p[0-7])\.b, all
++** mad z0\.b, \1/m, z1\.b, z2\.b
++** ldp x29, x30, \[sp\], 16
++** ret
++*/
++svint8_t __attribute__((noipa))
++caller_s8 (void)
++{
++ svint8x3_t res;
++ res = callee_s8 ();
++ return svmad_x (svptrue_b8 (),
++ svget3 (res, 0), svget3 (res, 1), svget3 (res, 2));
++}
++
++/*
++** callee_u8:
++** mov z0\.b, #4
++** mov z1\.b, #5
++** mov z2\.b, #6
++** ret
++*/
++svuint8x3_t __attribute__((noipa))
++callee_u8 (void)
++{
++ return svcreate3 (svdup_u8 (4), svdup_u8 (5), svdup_u8 (6));
++}
++
++/*
++** caller_u8:
++** ...
++** bl callee_u8 ++** ptrue (p[0-7])\.b, all ++** msb z0\.b, \1/m, z1\.b, z2\.b ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++svuint8_t __attribute__((noipa)) ++caller_u8 (void) ++{ ++ svuint8x3_t res; ++ res = callee_u8 (); ++ return svmsb_x (svptrue_b8 (), ++ svget3 (res, 0), svget3 (res, 1), svget3 (res, 2)); ++} ++ ++/* ++** callee_s16: ++** mov z0\.h, #1 ++** mov z1\.h, #2 ++** mov z2\.h, #3 ++** ret ++*/ ++svint16x3_t __attribute__((noipa)) ++callee_s16 (void) ++{ ++ return svcreate3 (svdup_s16 (1), svdup_s16 (2), svdup_s16 (3)); ++} ++ ++/* ++** caller_s16: ++** ... ++** bl callee_s16 ++** ptrue (p[0-7])\.b, all ++** mls z0\.h, \1/m, z1\.h, z2\.h ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++svint16_t __attribute__((noipa)) ++caller_s16 (void) ++{ ++ svint16x3_t res; ++ res = callee_s16 (); ++ return svmls_x (svptrue_b16 (), ++ svget3 (res, 0), svget3 (res, 1), svget3 (res, 2)); ++} ++ ++/* ++** callee_u16: ++** mov z0\.h, #4 ++** mov z1\.h, #5 ++** mov z2\.h, #6 ++** ret ++*/ ++svuint16x3_t __attribute__((noipa)) ++callee_u16 (void) ++{ ++ return svcreate3 (svdup_u16 (4), svdup_u16 (5), svdup_u16 (6)); ++} ++ ++/* ++** caller_u16: ++** ... ++** bl callee_u16 ++** ptrue (p[0-7])\.b, all ++** mla z0\.h, \1/m, z1\.h, z2\.h ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++svuint16_t __attribute__((noipa)) ++caller_u16 (void) ++{ ++ svuint16x3_t res; ++ res = callee_u16 (); ++ return svmla_x (svptrue_b16 (), ++ svget3 (res, 0), svget3 (res, 1), svget3 (res, 2)); ++} ++ ++/* ++** callee_f16: ++** fmov z0\.h, #1\.0(?:e\+0)? ++** fmov z1\.h, #2\.0(?:e\+0)? ++** fmov z2\.h, #3\.0(?:e\+0)? ++** ret ++*/ ++svfloat16x3_t __attribute__((noipa)) ++callee_f16 (void) ++{ ++ return svcreate3 (svdup_f16 (1), svdup_f16 (2), svdup_f16 (3)); ++} ++ ++/* ++** caller_f16: ++** ... ++** bl callee_f16 ++** ptrue (p[0-7])\.b, all ++** fmla z0\.h, \1/m, z1\.h, z2\.h ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++svfloat16_t __attribute__((noipa)) ++caller_f16 (void) ++{ ++ svfloat16x3_t res; ++ res = callee_f16 (); ++ return svmla_x (svptrue_b16 (), ++ svget3 (res, 0), svget3 (res, 1), svget3 (res, 2)); ++} ++ ++/* ++** callee_bf16: ++** mov z0\.h, h0 ++** mov z1\.h, h1 ++** mov z2\.h, h2 ++** ret ++*/ ++svbfloat16x3_t __attribute__((noipa)) ++callee_bf16 (bfloat16_t h0, bfloat16_t h1, bfloat16_t h2) ++{ ++ return svcreate3 (svdup_bf16 (h0), svdup_bf16 (h1), svdup_bf16 (h2)); ++} ++ ++/* ++** caller_bf16: ++** ... ++** bl callee_bf16 ++** trn2 z0\.h, z0\.h, z2\.h ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++svbfloat16_t __attribute__((noipa)) ++caller_bf16 (bfloat16_t h0, bfloat16_t h1, bfloat16_t h2) ++{ ++ svbfloat16x3_t res; ++ res = callee_bf16 (h0, h1, h2); ++ return svtrn2 (svget3 (res, 0), svget3 (res, 2)); ++} ++ ++/* ++** callee_s32: ++** mov z0\.s, #1 ++** mov z1\.s, #2 ++** mov z2\.s, #3 ++** ret ++*/ ++svint32x3_t __attribute__((noipa)) ++callee_s32 (void) ++{ ++ return svcreate3 (svdup_s32 (1), svdup_s32 (2), svdup_s32 (3)); ++} ++ ++/* ++** caller_s32: ++** ... 
++** bl callee_s32 ++** ptrue (p[0-7])\.b, all ++** mad z0\.s, \1/m, z1\.s, z2\.s ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++svint32_t __attribute__((noipa)) ++caller_s32 (void) ++{ ++ svint32x3_t res; ++ res = callee_s32 (); ++ return svmad_x (svptrue_b32 (), ++ svget3 (res, 0), svget3 (res, 1), svget3 (res, 2)); ++} ++ ++/* ++** callee_u32: ++** mov z0\.s, #4 ++** mov z1\.s, #5 ++** mov z2\.s, #6 ++** ret ++*/ ++svuint32x3_t __attribute__((noipa)) ++callee_u32 (void) ++{ ++ return svcreate3 (svdup_u32 (4), svdup_u32 (5), svdup_u32 (6)); ++} ++ ++/* ++** caller_u32: ++** ... ++** bl callee_u32 ++** ptrue (p[0-7])\.b, all ++** msb z0\.s, \1/m, z1\.s, z2\.s ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++svuint32_t __attribute__((noipa)) ++caller_u32 (void) ++{ ++ svuint32x3_t res; ++ res = callee_u32 (); ++ return svmsb_x (svptrue_b32 (), ++ svget3 (res, 0), svget3 (res, 1), svget3 (res, 2)); ++} ++ ++/* ++** callee_f32: ++** fmov z0\.s, #1\.0(?:e\+0)? ++** fmov z1\.s, #2\.0(?:e\+0)? ++** fmov z2\.s, #3\.0(?:e\+0)? ++** ret ++*/ ++svfloat32x3_t __attribute__((noipa)) ++callee_f32 (void) ++{ ++ return svcreate3 (svdup_f32 (1), svdup_f32 (2), svdup_f32 (3)); ++} ++ ++/* ++** caller_f32: ++** ... ++** bl callee_f32 ++** ptrue (p[0-7])\.b, all ++** fmla z0\.s, \1/m, z1\.s, z2\.s ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++svfloat32_t __attribute__((noipa)) ++caller_f32 (void) ++{ ++ svfloat32x3_t res; ++ res = callee_f32 (); ++ return svmla_x (svptrue_b32 (), ++ svget3 (res, 0), svget3 (res, 1), svget3 (res, 2)); ++} ++ ++/* ++** callee_s64: ++** mov z0\.d, #1 ++** mov z1\.d, #2 ++** mov z2\.d, #3 ++** ret ++*/ ++svint64x3_t __attribute__((noipa)) ++callee_s64 (void) ++{ ++ return svcreate3 (svdup_s64 (1), svdup_s64 (2), svdup_s64 (3)); ++} ++ ++/* ++** caller_s64: ++** ... ++** bl callee_s64 ++** ptrue (p[0-7])\.b, all ++** mls z0\.d, \1/m, z1\.d, z2\.d ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++svint64_t __attribute__((noipa)) ++caller_s64 (void) ++{ ++ svint64x3_t res; ++ res = callee_s64 (); ++ return svmls_x (svptrue_b64 (), ++ svget3 (res, 0), svget3 (res, 1), svget3 (res, 2)); ++} ++ ++/* ++** callee_u64: ++** mov z0\.d, #4 ++** mov z1\.d, #5 ++** mov z2\.d, #6 ++** ret ++*/ ++svuint64x3_t __attribute__((noipa)) ++callee_u64 (void) ++{ ++ return svcreate3 (svdup_u64 (4), svdup_u64 (5), svdup_u64 (6)); ++} ++ ++/* ++** caller_u64: ++** ... ++** bl callee_u64 ++** ptrue (p[0-7])\.b, all ++** mla z0\.d, \1/m, z1\.d, z2\.d ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++svuint64_t __attribute__((noipa)) ++caller_u64 (void) ++{ ++ svuint64x3_t res; ++ res = callee_u64 (); ++ return svmla_x (svptrue_b64 (), ++ svget3 (res, 0), svget3 (res, 1), svget3 (res, 2)); ++} ++ ++/* ++** callee_f64: ++** fmov z0\.d, #1\.0(?:e\+0)? ++** fmov z1\.d, #2\.0(?:e\+0)? ++** fmov z2\.d, #3\.0(?:e\+0)? ++** ret ++*/ ++svfloat64x3_t __attribute__((noipa)) ++callee_f64 (void) ++{ ++ return svcreate3 (svdup_f64 (1), svdup_f64 (2), svdup_f64 (3)); ++} ++ ++/* ++** caller_f64: ++** ... 
++** bl callee_f64
++** ptrue (p[0-7])\.b, all
++** fmla z0\.d, \1/m, z1\.d, z2\.d
++** ldp x29, x30, \[sp\], 16
++** ret
++*/
++svfloat64_t __attribute__((noipa))
++caller_f64 (void)
++{
++ svfloat64x3_t res;
++ res = callee_f64 ();
++ return svmla_x (svptrue_b64 (),
++ svget3 (res, 0), svget3 (res, 1), svget3 (res, 2));
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_9.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_9.c
+new file mode 100644
+index 000000000..ad32e1fe5
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_9.c
+@@ -0,0 +1,438 @@
++/* { dg-do compile } */
++/* { dg-options "-O -frename-registers -g" } */
++/* { dg-final { check-function-bodies "**" "" } } */
++
++#include <arm_sve.h>
++
++/*
++** callee_s8:
++** mov z0\.b, #1
++** mov z1\.b, #2
++** mov z2\.b, #3
++** mov z3\.b, #4
++** ret
++*/
++svint8x4_t __attribute__((noipa))
++callee_s8 (void)
++{
++ return svcreate4 (svdup_s8 (1), svdup_s8 (2), svdup_s8 (3), svdup_s8 (4));
++}
++
++/*
++** caller_s8:
++** ...
++** bl callee_s8
++** add (z[2-7]\.b), z2\.b, z3\.b
++** ptrue (p[0-7])\.b, all
++** mla z0\.b, \2/m, (z1\.b, \1|\1, z1\.b)
++** ldp x29, x30, \[sp\], 16
++** ret
++*/
++svint8_t __attribute__((noipa))
++caller_s8 (void)
++{
++ svint8x4_t res;
++ res = callee_s8 ();
++ return svmla_x (svptrue_b8 (), svget4 (res, 0), svget4 (res, 1),
++ svadd_x (svptrue_b8 (),
++ svget4 (res, 2),
++ svget4 (res, 3)));
++}
++
++/*
++** callee_u8:
++** mov z0\.b, #4
++** mov z1\.b, #5
++** mov z2\.b, #6
++** mov z3\.b, #7
++** ret
++*/
++svuint8x4_t __attribute__((noipa))
++callee_u8 (void)
++{
++ return svcreate4 (svdup_u8 (4), svdup_u8 (5), svdup_u8 (6), svdup_u8 (7));
++}
++
++/*
++** caller_u8:
++** ...
++** bl callee_u8
++** sub (z[2-7]\.b), z2\.b, z3\.b
++** ptrue (p[0-7])\.b, all
++** mla z0\.b, \2/m, (z1\.b, \1|\1, z1\.b)
++** ldp x29, x30, \[sp\], 16
++** ret
++*/
++svuint8_t __attribute__((noipa))
++caller_u8 (void)
++{
++ svuint8x4_t res;
++ res = callee_u8 ();
++ return svmla_x (svptrue_b8 (), svget4 (res, 0), svget4 (res, 1),
++ svsub_x (svptrue_b8 (),
++ svget4 (res, 2),
++ svget4 (res, 3)));
++}
++
++/*
++** callee_s16:
++** mov z0\.h, #1
++** mov z1\.h, #2
++** mov z2\.h, #3
++** mov z3\.h, #4
++** ret
++*/
++svint16x4_t __attribute__((noipa))
++callee_s16 (void)
++{
++ return svcreate4 (svdup_s16 (1), svdup_s16 (2),
++ svdup_s16 (3), svdup_s16 (4));
++}
++
++/*
++** caller_s16:
++** ...
++** bl callee_s16
++** add (z[2-7]\.h), z2\.h, z3\.h
++** ptrue (p[0-7])\.b, all
++** mad z0\.h, \2/m, (z1\.h, \1|\1, z1\.h)
++** ldp x29, x30, \[sp\], 16
++** ret
++*/
++svint16_t __attribute__((noipa))
++caller_s16 (void)
++{
++ svint16x4_t res;
++ res = callee_s16 ();
++ return svmad_x (svptrue_b16 (), svget4 (res, 0), svget4 (res, 1),
++ svadd_x (svptrue_b16 (),
++ svget4 (res, 2),
++ svget4 (res, 3)));
++}
++
++/*
++** callee_u16:
++** mov z0\.h, #4
++** mov z1\.h, #5
++** mov z2\.h, #6
++** mov z3\.h, #7
++** ret
++*/
++svuint16x4_t __attribute__((noipa))
++callee_u16 (void)
++{
++ return svcreate4 (svdup_u16 (4), svdup_u16 (5),
++ svdup_u16 (6), svdup_u16 (7));
++}
++
++/*
++** caller_u16:
++** ...
++** bl callee_u16 ++** sub (z[2-7]\.h), z2\.h, z3\.h ++** ptrue (p[0-7])\.b, all ++** mad z0\.h, \2/m, (z1\.h, \1|\1, z1\.h) ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++svuint16_t __attribute__((noipa)) ++caller_u16 (void) ++{ ++ svuint16x4_t res; ++ res = callee_u16 (); ++ return svmad_x (svptrue_b16 (), svget4 (res, 0), svget4 (res, 1), ++ svsub_x (svptrue_b16 (), ++ svget4 (res, 2), ++ svget4 (res, 3))); ++} ++ ++/* ++** callee_f16: ++** fmov z0\.h, #1\.0(?:e\+0)? ++** fmov z1\.h, #2\.0(?:e\+0)? ++** fmov z2\.h, #3\.0(?:e\+0)? ++** fmov z3\.h, #4\.0(?:e\+0)? ++** ret ++*/ ++svfloat16x4_t __attribute__((noipa)) ++callee_f16 (void) ++{ ++ return svcreate4 (svdup_f16 (1), svdup_f16 (2), ++ svdup_f16 (3), svdup_f16 (4)); ++} ++ ++/* ++** caller_f16: ++** ... ++** bl callee_f16 ++** fadd (z[0-9]+\.h), z0\.h, z1\.h ++** fmul (z[0-9]+\.h), \1, z2\.h ++** fadd z0\.h, \2, z3\.h ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++svfloat16_t __attribute__((noipa)) ++caller_f16 (void) ++{ ++ svfloat16x4_t res; ++ res = callee_f16 (); ++ return svadd_x (svptrue_b16 (), ++ svmul_x (svptrue_b16 (), ++ svadd_x (svptrue_b16 (), svget4 (res, 0), ++ svget4 (res, 1)), ++ svget4 (res, 2)), ++ svget4 (res, 3)); ++} ++ ++/* ++** callee_bf16: ++** mov z0\.h, h4 ++** mov z1\.h, h5 ++** mov z2\.h, h6 ++** mov z3\.h, h7 ++** ret ++*/ ++svbfloat16x4_t __attribute__((noipa)) ++callee_bf16 (bfloat16_t h0, bfloat16_t h1, bfloat16_t h2, bfloat16_t h3, ++ bfloat16_t h4, bfloat16_t h5, bfloat16_t h6, bfloat16_t h7) ++{ ++ return svcreate4 (svdup_bf16 (h4), svdup_bf16 (h5), ++ svdup_bf16 (h6), svdup_bf16 (h7)); ++} ++ ++/* ++** caller_bf16: ++** ... ++** bl callee_bf16 ++** trn2 z0\.h, z0\.h, z3\.h ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++svbfloat16_t __attribute__((noipa)) ++caller_bf16 (bfloat16_t h0, bfloat16_t h1, bfloat16_t h2, bfloat16_t h3, ++ bfloat16_t h4, bfloat16_t h5, bfloat16_t h6, bfloat16_t h7) ++{ ++ svbfloat16x4_t res; ++ res = callee_bf16 (h0, h1, h2, h3, h4, h5, h6, h7); ++ return svtrn2 (svget4 (res, 0), svget4 (res, 3)); ++} ++ ++/* ++** callee_s32: ++** mov z0\.s, #1 ++** mov z1\.s, #2 ++** mov z2\.s, #3 ++** mov z3\.s, #4 ++** ret ++*/ ++svint32x4_t __attribute__((noipa)) ++callee_s32 (void) ++{ ++ return svcreate4 (svdup_s32 (1), svdup_s32 (2), ++ svdup_s32 (3), svdup_s32 (4)); ++} ++ ++/* ++** caller_s32: ++** ... ++** bl callee_s32 ++** add (z[2-7]\.s), z2\.s, z3\.s ++** ptrue (p[0-7])\.b, all ++** msb z0\.s, \2/m, (z1\.s, \1|\1, z1\.s) ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++svint32_t __attribute__((noipa)) ++caller_s32 (void) ++{ ++ svint32x4_t res; ++ res = callee_s32 (); ++ return svmsb_x (svptrue_b32 (), svget4 (res, 0), svget4 (res, 1), ++ svadd_x (svptrue_b32 (), ++ svget4 (res, 2), ++ svget4 (res, 3))); ++} ++ ++/* ++** callee_u32: ++** mov z0\.s, #4 ++** mov z1\.s, #5 ++** mov z2\.s, #6 ++** mov z3\.s, #7 ++** ret ++*/ ++svuint32x4_t __attribute__((noipa)) ++callee_u32 (void) ++{ ++ return svcreate4 (svdup_u32 (4), svdup_u32 (5), ++ svdup_u32 (6), svdup_u32 (7)); ++} ++ ++/* ++** caller_u32: ++** ... ++** bl callee_u32 ++** sub (z[2-7]\.s), z2\.s, z3\.s ++** ptrue (p[0-7])\.b, all ++** msb z0\.s, \2/m, (z1\.s, \1|\1, z1\.s) ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++svuint32_t __attribute__((noipa)) ++caller_u32 (void) ++{ ++ svuint32x4_t res; ++ res = callee_u32 (); ++ return svmsb_x (svptrue_b32 (), svget4 (res, 0), svget4 (res, 1), ++ svsub_x (svptrue_b32 (), ++ svget4 (res, 2), ++ svget4 (res, 3))); ++} ++ ++/* ++** callee_f32: ++** fmov z0\.s, #1\.0(?:e\+0)? 
++** fmov z1\.s, #2\.0(?:e\+0)? ++** fmov z2\.s, #3\.0(?:e\+0)? ++** fmov z3\.s, #4\.0(?:e\+0)? ++** ret ++*/ ++svfloat32x4_t __attribute__((noipa)) ++callee_f32 (void) ++{ ++ return svcreate4 (svdup_f32 (1), svdup_f32 (2), ++ svdup_f32 (3), svdup_f32 (4)); ++} ++ ++/* ++** caller_f32: ++** ... ++** bl callee_f32 ++** fadd (z[0-9]+\.s), z0\.s, z1\.s ++** fmul (z[0-9]+\.s), \1, z2\.s ++** fadd z0\.s, \2, z3\.s ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++svfloat32_t __attribute__((noipa)) ++caller_f32 (void) ++{ ++ svfloat32x4_t res; ++ res = callee_f32 (); ++ return svadd_x (svptrue_b32 (), ++ svmul_x (svptrue_b32 (), ++ svadd_x (svptrue_b32 (), svget4 (res, 0), ++ svget4 (res, 1)), ++ svget4 (res, 2)), ++ svget4 (res, 3)); ++} ++ ++/* ++** callee_s64: ++** mov z0\.d, #1 ++** mov z1\.d, #2 ++** mov z2\.d, #3 ++** mov z3\.d, #4 ++** ret ++*/ ++svint64x4_t __attribute__((noipa)) ++callee_s64 (void) ++{ ++ return svcreate4 (svdup_s64 (1), svdup_s64 (2), ++ svdup_s64 (3), svdup_s64 (4)); ++} ++ ++/* ++** caller_s64: ++** ... ++** bl callee_s64 ++** add (z[2-7]\.d), z2\.d, z3\.d ++** ptrue (p[0-7])\.b, all ++** mls z0\.d, \2/m, (z1\.d, \1|\1, z1\.d) ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++svint64_t __attribute__((noipa)) ++caller_s64 (void) ++{ ++ svint64x4_t res; ++ res = callee_s64 (); ++ return svmls_x (svptrue_b64 (), svget4 (res, 0), svget4 (res, 1), ++ svadd_x (svptrue_b64 (), ++ svget4 (res, 2), ++ svget4 (res, 3))); ++} ++ ++/* ++** callee_u64: ++** mov z0\.d, #4 ++** mov z1\.d, #5 ++** mov z2\.d, #6 ++** mov z3\.d, #7 ++** ret ++*/ ++svuint64x4_t __attribute__((noipa)) ++callee_u64 (void) ++{ ++ return svcreate4 (svdup_u64 (4), svdup_u64 (5), ++ svdup_u64 (6), svdup_u64 (7)); ++} ++ ++/* ++** caller_u64: ++** ... ++** bl callee_u64 ++** sub (z[2-7]\.d), z2\.d, z3\.d ++** ptrue (p[0-7])\.b, all ++** mls z0\.d, \2/m, (z1\.d, \1|\1, z1\.d) ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++svuint64_t __attribute__((noipa)) ++caller_u64 (void) ++{ ++ svuint64x4_t res; ++ res = callee_u64 (); ++ return svmls_x (svptrue_b64 (), svget4 (res, 0), svget4 (res, 1), ++ svsub_x (svptrue_b64 (), ++ svget4 (res, 2), ++ svget4 (res, 3))); ++} ++ ++/* ++** callee_f64: ++** fmov z0\.d, #1\.0(?:e\+0)? ++** fmov z1\.d, #2\.0(?:e\+0)? ++** fmov z2\.d, #3\.0(?:e\+0)? ++** fmov z3\.d, #4\.0(?:e\+0)? ++** ret ++*/ ++svfloat64x4_t __attribute__((noipa)) ++callee_f64 (void) ++{ ++ return svcreate4 (svdup_f64 (1), svdup_f64 (2), ++ svdup_f64 (3), svdup_f64 (4)); ++} ++ ++/* ++** caller_f64: ++** ... 
++** bl callee_f64 ++** fadd (z[0-9]+\.d), z0\.d, z1\.d ++** fmul (z[0-9]+\.d), \1, z2\.d ++** fadd z0\.d, \2, z3\.d ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++svfloat64_t __attribute__((noipa)) ++caller_f64 (void) ++{ ++ svfloat64x4_t res; ++ res = callee_f64 (); ++ return svadd_x (svptrue_b64 (), ++ svmul_x (svptrue_b64 (), ++ svadd_x (svptrue_b64 (), svget4 (res, 0), ++ svget4 (res, 1)), ++ svget4 (res, 2)), ++ svget4 (res, 3)); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_1_be_nowrap.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_1_be_nowrap.c +new file mode 100644 +index 000000000..4eee04226 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_1_be_nowrap.c +@@ -0,0 +1,196 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -mbig-endian -fno-shrink-wrap -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** test_1: ++** addvl sp, sp, #-17 ++** str p4, \[sp\] ++** str p5, \[sp, #1, mul vl\] ++** str p6, \[sp, #2, mul vl\] ++** str p7, \[sp, #3, mul vl\] ++** str p8, \[sp, #4, mul vl\] ++** str p9, \[sp, #5, mul vl\] ++** str p10, \[sp, #6, mul vl\] ++** str p11, \[sp, #7, mul vl\] ++** ptrue p1\.b, all ++** st1d z8\.d, p1, \[sp, #1, mul vl\] ++** st1d z9\.d, p1, \[sp, #2, mul vl\] ++** st1d z10\.d, p1, \[sp, #3, mul vl\] ++** st1d z11\.d, p1, \[sp, #4, mul vl\] ++** st1d z12\.d, p1, \[sp, #5, mul vl\] ++** st1d z13\.d, p1, \[sp, #6, mul vl\] ++** st1d z14\.d, p1, \[sp, #7, mul vl\] ++** addvl x11, sp, #16 ++** st1d z15\.d, p1, \[x11, #-8, mul vl\] ++** str z16, \[sp, #9, mul vl\] ++** str z17, \[sp, #10, mul vl\] ++** str z18, \[sp, #11, mul vl\] ++** str z19, \[sp, #12, mul vl\] ++** str z20, \[sp, #13, mul vl\] ++** str z21, \[sp, #14, mul vl\] ++** str z22, \[sp, #15, mul vl\] ++** str z23, \[sp, #16, mul vl\] ++** ptrue p0\.b, all ++** ptrue p1\.b, all ++** ld1d z8\.d, p1/z, \[sp, #1, mul vl\] ++** ld1d z9\.d, p1/z, \[sp, #2, mul vl\] ++** ld1d z10\.d, p1/z, \[sp, #3, mul vl\] ++** ld1d z11\.d, p1/z, \[sp, #4, mul vl\] ++** ld1d z12\.d, p1/z, \[sp, #5, mul vl\] ++** ld1d z13\.d, p1/z, \[sp, #6, mul vl\] ++** ld1d z14\.d, p1/z, \[sp, #7, mul vl\] ++** addvl x11, sp, #16 ++** ld1d z15\.d, p1/z, \[x11, #-8, mul vl\] ++** ldr z16, \[sp, #9, mul vl\] ++** ldr z17, \[sp, #10, mul vl\] ++** ldr z18, \[sp, #11, mul vl\] ++** ldr z19, \[sp, #12, mul vl\] ++** ldr z20, \[sp, #13, mul vl\] ++** ldr z21, \[sp, #14, mul vl\] ++** ldr z22, \[sp, #15, mul vl\] ++** ldr z23, \[sp, #16, mul vl\] ++** ldr p4, \[sp\] ++** ldr p5, \[sp, #1, mul vl\] ++** ldr p6, \[sp, #2, mul vl\] ++** ldr p7, \[sp, #3, mul vl\] ++** ldr p8, \[sp, #4, mul vl\] ++** ldr p9, \[sp, #5, mul vl\] ++** ldr p10, \[sp, #6, mul vl\] ++** ldr p11, \[sp, #7, mul vl\] ++** addvl sp, sp, #17 ++** ret ++*/ ++svbool_t ++test_1 (void) ++{ ++ asm volatile ("" ::: ++ "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", ++ "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", ++ "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", ++ "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", ++ "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", ++ "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_2: ++** ptrue p0\.b, all ++** ret ++*/ ++svbool_t ++test_2 (void) ++{ ++ asm volatile ("" ::: ++ "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", ++ "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", ++ "p0", "p1", "p2", "p3", "p12", "p13", "p14", "p15"); ++ return 
svptrue_b8 (); ++} ++ ++/* ++** test_3: ++** addvl sp, sp, #-6 ++** str p5, \[sp\] ++** str p6, \[sp, #1, mul vl\] ++** str p11, \[sp, #2, mul vl\] ++** ptrue p1\.b, all ++** st1d z8\.d, p1, \[sp, #1, mul vl\] ++** st1d z13\.d, p1, \[sp, #2, mul vl\] ++** str z19, \[sp, #3, mul vl\] ++** str z20, \[sp, #4, mul vl\] ++** str z22, \[sp, #5, mul vl\] ++** ptrue p0\.b, all ++** ptrue p1\.b, all ++** ld1d z8\.d, p1/z, \[sp, #1, mul vl\] ++** ld1d z13\.d, p1/z, \[sp, #2, mul vl\] ++** ldr z19, \[sp, #3, mul vl\] ++** ldr z20, \[sp, #4, mul vl\] ++** ldr z22, \[sp, #5, mul vl\] ++** ldr p5, \[sp\] ++** ldr p6, \[sp, #1, mul vl\] ++** ldr p11, \[sp, #2, mul vl\] ++** addvl sp, sp, #6 ++** ret ++*/ ++svbool_t ++test_3 (void) ++{ ++ asm volatile ("" ::: ++ "z8", "z13", "z19", "z20", "z22", ++ "p5", "p6", "p11"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_4: ++** addvl sp, sp, #-1 ++** str p4, \[sp\] ++** ptrue p0\.b, all ++** ldr p4, \[sp\] ++** addvl sp, sp, #1 ++** ret ++*/ ++svbool_t ++test_4 (void) ++{ ++ asm volatile ("" ::: "p4"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_5: ++** addvl sp, sp, #-1 ++** ptrue p1\.b, all ++** st1d z15\.d, p1, \[sp\] ++** ptrue p0\.b, all ++** ptrue p1\.b, all ++** ld1d z15\.d, p1/z, \[sp\] ++** addvl sp, sp, #1 ++** ret ++*/ ++svbool_t ++test_5 (void) ++{ ++ asm volatile ("" ::: "z15"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_6: ++** addvl sp, sp, #-2 ++** str p4, \[sp\] ++** ptrue p4\.b, all ++** st1d z15\.d, p4, \[sp, #1, mul vl\] ++** mov z0\.b, #1 ++** ptrue p4\.b, all ++** ld1d z15\.d, p4/z, \[sp, #1, mul vl\] ++** ldr p4, \[sp\] ++** addvl sp, sp, #2 ++** ret ++*/ ++svint8_t ++test_6 (svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) ++{ ++ asm volatile ("" :: "Upa" (p0), "Upa" (p1), "Upa" (p2), "Upa" (p3) : "z15"); ++ return svdup_s8 (1); ++} ++ ++/* ++** test_7: ++** addvl sp, sp, #-1 ++** str z16, \[sp\] ++** ptrue p0\.b, all ++** ldr z16, \[sp\] ++** addvl sp, sp, #1 ++** ret ++*/ ++svbool_t ++test_7 (void) ++{ ++ asm volatile ("" ::: "z16"); ++ return svptrue_b8 (); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_1_be_wrap.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_1_be_wrap.c +new file mode 100644 +index 000000000..e88a3dd1d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_1_be_wrap.c +@@ -0,0 +1,196 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -mbig-endian -fshrink-wrap -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** test_1: ++** addvl sp, sp, #-17 ++** str p4, \[sp\] ++** str p5, \[sp, #1, mul vl\] ++** str p6, \[sp, #2, mul vl\] ++** str p7, \[sp, #3, mul vl\] ++** str p8, \[sp, #4, mul vl\] ++** str p9, \[sp, #5, mul vl\] ++** str p10, \[sp, #6, mul vl\] ++** str p11, \[sp, #7, mul vl\] ++** ptrue p1\.b, all ++** st1d z8\.d, p1, \[sp, #1, mul vl\] ++** st1d z9\.d, p1, \[sp, #2, mul vl\] ++** st1d z10\.d, p1, \[sp, #3, mul vl\] ++** st1d z11\.d, p1, \[sp, #4, mul vl\] ++** st1d z12\.d, p1, \[sp, #5, mul vl\] ++** st1d z13\.d, p1, \[sp, #6, mul vl\] ++** st1d z14\.d, p1, \[sp, #7, mul vl\] ++** addvl x11, sp, #16 ++** st1d z15\.d, p1, \[x11, #-8, mul vl\] ++** str z16, \[sp, #9, mul vl\] ++** str z17, \[sp, #10, mul vl\] ++** str z18, \[sp, #11, mul vl\] ++** str z19, \[sp, #12, mul vl\] ++** str z20, \[sp, #13, mul vl\] ++** str z21, \[sp, #14, mul vl\] ++** str z22, \[sp, #15, mul vl\] ++** str z23, \[sp, #16, mul vl\] ++** ptrue p0\.b, all ++** ptrue p1\.b, all ++** ld1d z8\.d, p1/z, 
\[sp, #1, mul vl\] ++** ld1d z9\.d, p1/z, \[sp, #2, mul vl\] ++** ld1d z10\.d, p1/z, \[sp, #3, mul vl\] ++** ld1d z11\.d, p1/z, \[sp, #4, mul vl\] ++** ld1d z12\.d, p1/z, \[sp, #5, mul vl\] ++** ld1d z13\.d, p1/z, \[sp, #6, mul vl\] ++** ld1d z14\.d, p1/z, \[sp, #7, mul vl\] ++** addvl x11, sp, #16 ++** ld1d z15\.d, p1/z, \[x11, #-8, mul vl\] ++** ldr z16, \[sp, #9, mul vl\] ++** ldr z17, \[sp, #10, mul vl\] ++** ldr z18, \[sp, #11, mul vl\] ++** ldr z19, \[sp, #12, mul vl\] ++** ldr z20, \[sp, #13, mul vl\] ++** ldr z21, \[sp, #14, mul vl\] ++** ldr z22, \[sp, #15, mul vl\] ++** ldr z23, \[sp, #16, mul vl\] ++** ldr p4, \[sp\] ++** ldr p5, \[sp, #1, mul vl\] ++** ldr p6, \[sp, #2, mul vl\] ++** ldr p7, \[sp, #3, mul vl\] ++** ldr p8, \[sp, #4, mul vl\] ++** ldr p9, \[sp, #5, mul vl\] ++** ldr p10, \[sp, #6, mul vl\] ++** ldr p11, \[sp, #7, mul vl\] ++** addvl sp, sp, #17 ++** ret ++*/ ++svbool_t ++test_1 (void) ++{ ++ asm volatile ("" ::: ++ "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", ++ "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", ++ "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", ++ "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", ++ "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", ++ "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_2: ++** ptrue p0\.b, all ++** ret ++*/ ++svbool_t ++test_2 (void) ++{ ++ asm volatile ("" ::: ++ "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", ++ "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", ++ "p0", "p1", "p2", "p3", "p12", "p13", "p14", "p15"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_3: ++** addvl sp, sp, #-6 ++** str p5, \[sp\] ++** str p6, \[sp, #1, mul vl\] ++** str p11, \[sp, #2, mul vl\] ++** ptrue p1\.b, all ++** st1d z8\.d, p1, \[sp, #1, mul vl\] ++** st1d z13\.d, p1, \[sp, #2, mul vl\] ++** str z19, \[sp, #3, mul vl\] ++** str z20, \[sp, #4, mul vl\] ++** str z22, \[sp, #5, mul vl\] ++** ptrue p0\.b, all ++** ptrue p1\.b, all ++** ld1d z8\.d, p1/z, \[sp, #1, mul vl\] ++** ld1d z13\.d, p1/z, \[sp, #2, mul vl\] ++** ldr z19, \[sp, #3, mul vl\] ++** ldr z20, \[sp, #4, mul vl\] ++** ldr z22, \[sp, #5, mul vl\] ++** ldr p5, \[sp\] ++** ldr p6, \[sp, #1, mul vl\] ++** ldr p11, \[sp, #2, mul vl\] ++** addvl sp, sp, #6 ++** ret ++*/ ++svbool_t ++test_3 (void) ++{ ++ asm volatile ("" ::: ++ "z8", "z13", "z19", "z20", "z22", ++ "p5", "p6", "p11"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_4: ++** addvl sp, sp, #-1 ++** str p4, \[sp\] ++** ptrue p0\.b, all ++** ldr p4, \[sp\] ++** addvl sp, sp, #1 ++** ret ++*/ ++svbool_t ++test_4 (void) ++{ ++ asm volatile ("" ::: "p4"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_5: ++** addvl sp, sp, #-1 ++** ptrue p1\.b, all ++** st1d z15\.d, p1, \[sp\] ++** ptrue p0\.b, all ++** ptrue p1\.b, all ++** ld1d z15\.d, p1/z, \[sp\] ++** addvl sp, sp, #1 ++** ret ++*/ ++svbool_t ++test_5 (void) ++{ ++ asm volatile ("" ::: "z15"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_6: ++** addvl sp, sp, #-2 ++** str p4, \[sp\] ++** ptrue p4\.b, all ++** st1d z15\.d, p4, \[sp, #1, mul vl\] ++** mov z0\.b, #1 ++** ptrue p4\.b, all ++** ld1d z15\.d, p4/z, \[sp, #1, mul vl\] ++** ldr p4, \[sp\] ++** addvl sp, sp, #2 ++** ret ++*/ ++svint8_t ++test_6 (svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) ++{ ++ asm volatile ("" :: "Upa" (p0), "Upa" (p1), "Upa" (p2), "Upa" (p3) : "z15"); ++ return svdup_s8 (1); ++} ++ ++/* ++** test_7: ++** addvl sp, sp, #-1 ++** str z16, \[sp\] ++** ptrue p0\.b, all ++** ldr z16, \[sp\] ++** addvl sp, 
sp, #1 ++** ret ++*/ ++svbool_t ++test_7 (void) ++{ ++ asm volatile ("" ::: "z16"); ++ return svptrue_b8 (); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_1_le_nowrap.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_1_le_nowrap.c +new file mode 100644 +index 000000000..d14cd79b1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_1_le_nowrap.c +@@ -0,0 +1,184 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -mlittle-endian -fno-shrink-wrap -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** test_1: ++** addvl sp, sp, #-17 ++** str p4, \[sp\] ++** str p5, \[sp, #1, mul vl\] ++** str p6, \[sp, #2, mul vl\] ++** str p7, \[sp, #3, mul vl\] ++** str p8, \[sp, #4, mul vl\] ++** str p9, \[sp, #5, mul vl\] ++** str p10, \[sp, #6, mul vl\] ++** str p11, \[sp, #7, mul vl\] ++** str z8, \[sp, #1, mul vl\] ++** str z9, \[sp, #2, mul vl\] ++** str z10, \[sp, #3, mul vl\] ++** str z11, \[sp, #4, mul vl\] ++** str z12, \[sp, #5, mul vl\] ++** str z13, \[sp, #6, mul vl\] ++** str z14, \[sp, #7, mul vl\] ++** str z15, \[sp, #8, mul vl\] ++** str z16, \[sp, #9, mul vl\] ++** str z17, \[sp, #10, mul vl\] ++** str z18, \[sp, #11, mul vl\] ++** str z19, \[sp, #12, mul vl\] ++** str z20, \[sp, #13, mul vl\] ++** str z21, \[sp, #14, mul vl\] ++** str z22, \[sp, #15, mul vl\] ++** str z23, \[sp, #16, mul vl\] ++** ptrue p0\.b, all ++** ldr z8, \[sp, #1, mul vl\] ++** ldr z9, \[sp, #2, mul vl\] ++** ldr z10, \[sp, #3, mul vl\] ++** ldr z11, \[sp, #4, mul vl\] ++** ldr z12, \[sp, #5, mul vl\] ++** ldr z13, \[sp, #6, mul vl\] ++** ldr z14, \[sp, #7, mul vl\] ++** ldr z15, \[sp, #8, mul vl\] ++** ldr z16, \[sp, #9, mul vl\] ++** ldr z17, \[sp, #10, mul vl\] ++** ldr z18, \[sp, #11, mul vl\] ++** ldr z19, \[sp, #12, mul vl\] ++** ldr z20, \[sp, #13, mul vl\] ++** ldr z21, \[sp, #14, mul vl\] ++** ldr z22, \[sp, #15, mul vl\] ++** ldr z23, \[sp, #16, mul vl\] ++** ldr p4, \[sp\] ++** ldr p5, \[sp, #1, mul vl\] ++** ldr p6, \[sp, #2, mul vl\] ++** ldr p7, \[sp, #3, mul vl\] ++** ldr p8, \[sp, #4, mul vl\] ++** ldr p9, \[sp, #5, mul vl\] ++** ldr p10, \[sp, #6, mul vl\] ++** ldr p11, \[sp, #7, mul vl\] ++** addvl sp, sp, #17 ++** ret ++*/ ++svbool_t ++test_1 (void) ++{ ++ asm volatile ("" ::: ++ "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", ++ "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", ++ "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", ++ "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", ++ "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", ++ "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_2: ++** ptrue p0\.b, all ++** ret ++*/ ++svbool_t ++test_2 (void) ++{ ++ asm volatile ("" ::: ++ "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", ++ "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", ++ "p0", "p1", "p2", "p3", "p12", "p13", "p14", "p15"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_3: ++** addvl sp, sp, #-6 ++** str p5, \[sp\] ++** str p6, \[sp, #1, mul vl\] ++** str p11, \[sp, #2, mul vl\] ++** str z8, \[sp, #1, mul vl\] ++** str z13, \[sp, #2, mul vl\] ++** str z19, \[sp, #3, mul vl\] ++** str z20, \[sp, #4, mul vl\] ++** str z22, \[sp, #5, mul vl\] ++** ptrue p0\.b, all ++** ldr z8, \[sp, #1, mul vl\] ++** ldr z13, \[sp, #2, mul vl\] ++** ldr z19, \[sp, #3, mul vl\] ++** ldr z20, \[sp, #4, mul vl\] ++** ldr z22, \[sp, #5, mul vl\] ++** ldr p5, \[sp\] ++** ldr p6, \[sp, #1, mul vl\] ++** ldr p11, \[sp, 
#2, mul vl\] ++** addvl sp, sp, #6 ++** ret ++*/ ++svbool_t ++test_3 (void) ++{ ++ asm volatile ("" ::: ++ "z8", "z13", "z19", "z20", "z22", ++ "p5", "p6", "p11"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_4: ++** addvl sp, sp, #-1 ++** str p4, \[sp\] ++** ptrue p0\.b, all ++** ldr p4, \[sp\] ++** addvl sp, sp, #1 ++** ret ++*/ ++svbool_t ++test_4 (void) ++{ ++ asm volatile ("" ::: "p4"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_5: ++** addvl sp, sp, #-1 ++** str z15, \[sp\] ++** ptrue p0\.b, all ++** ldr z15, \[sp\] ++** addvl sp, sp, #1 ++** ret ++*/ ++svbool_t ++test_5 (void) ++{ ++ asm volatile ("" ::: "z15"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_6: ++** addvl sp, sp, #-1 ++** str z15, \[sp\] ++** mov z0\.b, #1 ++** ldr z15, \[sp\] ++** addvl sp, sp, #1 ++** ret ++*/ ++svint8_t ++test_6 (svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) ++{ ++ asm volatile ("" :: "Upa" (p0), "Upa" (p1), "Upa" (p2), "Upa" (p3) : "z15"); ++ return svdup_s8 (1); ++} ++ ++/* ++** test_7: ++** addvl sp, sp, #-1 ++** str z16, \[sp\] ++** ptrue p0\.b, all ++** ldr z16, \[sp\] ++** addvl sp, sp, #1 ++** ret ++*/ ++svbool_t ++test_7 (void) ++{ ++ asm volatile ("" ::: "z16"); ++ return svptrue_b8 (); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_1_le_wrap.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_1_le_wrap.c +new file mode 100644 +index 000000000..d81dd8e6b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_1_le_wrap.c +@@ -0,0 +1,184 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -mlittle-endian -fshrink-wrap -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** test_1: ++** addvl sp, sp, #-17 ++** str p4, \[sp\] ++** str p5, \[sp, #1, mul vl\] ++** str p6, \[sp, #2, mul vl\] ++** str p7, \[sp, #3, mul vl\] ++** str p8, \[sp, #4, mul vl\] ++** str p9, \[sp, #5, mul vl\] ++** str p10, \[sp, #6, mul vl\] ++** str p11, \[sp, #7, mul vl\] ++** str z8, \[sp, #1, mul vl\] ++** str z9, \[sp, #2, mul vl\] ++** str z10, \[sp, #3, mul vl\] ++** str z11, \[sp, #4, mul vl\] ++** str z12, \[sp, #5, mul vl\] ++** str z13, \[sp, #6, mul vl\] ++** str z14, \[sp, #7, mul vl\] ++** str z15, \[sp, #8, mul vl\] ++** str z16, \[sp, #9, mul vl\] ++** str z17, \[sp, #10, mul vl\] ++** str z18, \[sp, #11, mul vl\] ++** str z19, \[sp, #12, mul vl\] ++** str z20, \[sp, #13, mul vl\] ++** str z21, \[sp, #14, mul vl\] ++** str z22, \[sp, #15, mul vl\] ++** str z23, \[sp, #16, mul vl\] ++** ptrue p0\.b, all ++** ldr z8, \[sp, #1, mul vl\] ++** ldr z9, \[sp, #2, mul vl\] ++** ldr z10, \[sp, #3, mul vl\] ++** ldr z11, \[sp, #4, mul vl\] ++** ldr z12, \[sp, #5, mul vl\] ++** ldr z13, \[sp, #6, mul vl\] ++** ldr z14, \[sp, #7, mul vl\] ++** ldr z15, \[sp, #8, mul vl\] ++** ldr z16, \[sp, #9, mul vl\] ++** ldr z17, \[sp, #10, mul vl\] ++** ldr z18, \[sp, #11, mul vl\] ++** ldr z19, \[sp, #12, mul vl\] ++** ldr z20, \[sp, #13, mul vl\] ++** ldr z21, \[sp, #14, mul vl\] ++** ldr z22, \[sp, #15, mul vl\] ++** ldr z23, \[sp, #16, mul vl\] ++** ldr p4, \[sp\] ++** ldr p5, \[sp, #1, mul vl\] ++** ldr p6, \[sp, #2, mul vl\] ++** ldr p7, \[sp, #3, mul vl\] ++** ldr p8, \[sp, #4, mul vl\] ++** ldr p9, \[sp, #5, mul vl\] ++** ldr p10, \[sp, #6, mul vl\] ++** ldr p11, \[sp, #7, mul vl\] ++** addvl sp, sp, #17 ++** ret ++*/ ++svbool_t ++test_1 (void) ++{ ++ asm volatile ("" ::: ++ "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", ++ "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", ++ "z16", 
"z17", "z18", "z19", "z20", "z21", "z22", "z23", ++ "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", ++ "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", ++ "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_2: ++** ptrue p0\.b, all ++** ret ++*/ ++svbool_t ++test_2 (void) ++{ ++ asm volatile ("" ::: ++ "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", ++ "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", ++ "p0", "p1", "p2", "p3", "p12", "p13", "p14", "p15"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_3: ++** addvl sp, sp, #-6 ++** str p5, \[sp\] ++** str p6, \[sp, #1, mul vl\] ++** str p11, \[sp, #2, mul vl\] ++** str z8, \[sp, #1, mul vl\] ++** str z13, \[sp, #2, mul vl\] ++** str z19, \[sp, #3, mul vl\] ++** str z20, \[sp, #4, mul vl\] ++** str z22, \[sp, #5, mul vl\] ++** ptrue p0\.b, all ++** ldr z8, \[sp, #1, mul vl\] ++** ldr z13, \[sp, #2, mul vl\] ++** ldr z19, \[sp, #3, mul vl\] ++** ldr z20, \[sp, #4, mul vl\] ++** ldr z22, \[sp, #5, mul vl\] ++** ldr p5, \[sp\] ++** ldr p6, \[sp, #1, mul vl\] ++** ldr p11, \[sp, #2, mul vl\] ++** addvl sp, sp, #6 ++** ret ++*/ ++svbool_t ++test_3 (void) ++{ ++ asm volatile ("" ::: ++ "z8", "z13", "z19", "z20", "z22", ++ "p5", "p6", "p11"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_4: ++** addvl sp, sp, #-1 ++** str p4, \[sp\] ++** ptrue p0\.b, all ++** ldr p4, \[sp\] ++** addvl sp, sp, #1 ++** ret ++*/ ++svbool_t ++test_4 (void) ++{ ++ asm volatile ("" ::: "p4"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_5: ++** addvl sp, sp, #-1 ++** str z15, \[sp\] ++** ptrue p0\.b, all ++** ldr z15, \[sp\] ++** addvl sp, sp, #1 ++** ret ++*/ ++svbool_t ++test_5 (void) ++{ ++ asm volatile ("" ::: "z15"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_6: ++** addvl sp, sp, #-1 ++** str z15, \[sp\] ++** mov z0\.b, #1 ++** ldr z15, \[sp\] ++** addvl sp, sp, #1 ++** ret ++*/ ++svint8_t ++test_6 (svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) ++{ ++ asm volatile ("" :: "Upa" (p0), "Upa" (p1), "Upa" (p2), "Upa" (p3) : "z15"); ++ return svdup_s8 (1); ++} ++ ++/* ++** test_7: ++** addvl sp, sp, #-1 ++** str z16, \[sp\] ++** ptrue p0\.b, all ++** ldr z16, \[sp\] ++** addvl sp, sp, #1 ++** ret ++*/ ++svbool_t ++test_7 (void) ++{ ++ asm volatile ("" ::: "z16"); ++ return svptrue_b8 (); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_2_be_nowrap.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_2_be_nowrap.c +new file mode 100644 +index 000000000..05aa18b3c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_2_be_nowrap.c +@@ -0,0 +1,271 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -mbig-endian -fno-shrink-wrap -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ ++ ++void standard_callee (void); ++__attribute__((aarch64_vector_pcs)) void vpcs_callee (void); ++ ++/* ++** calls_standard: ++** stp x29, x30, \[sp, -16\]! 
++** mov x29, sp ++** addvl sp, sp, #-17 ++** str p4, \[sp\] ++** str p5, \[sp, #1, mul vl\] ++** str p6, \[sp, #2, mul vl\] ++** str p7, \[sp, #3, mul vl\] ++** str p8, \[sp, #4, mul vl\] ++** str p9, \[sp, #5, mul vl\] ++** str p10, \[sp, #6, mul vl\] ++** str p11, \[sp, #7, mul vl\] ++** ptrue p0\.b, all ++** st1d z8\.d, p0, \[sp, #1, mul vl\] ++** st1d z9\.d, p0, \[sp, #2, mul vl\] ++** st1d z10\.d, p0, \[sp, #3, mul vl\] ++** st1d z11\.d, p0, \[sp, #4, mul vl\] ++** st1d z12\.d, p0, \[sp, #5, mul vl\] ++** st1d z13\.d, p0, \[sp, #6, mul vl\] ++** st1d z14\.d, p0, \[sp, #7, mul vl\] ++** addvl x11, sp, #16 ++** st1d z15\.d, p0, \[x11, #-8, mul vl\] ++** str z16, \[sp, #9, mul vl\] ++** str z17, \[sp, #10, mul vl\] ++** str z18, \[sp, #11, mul vl\] ++** str z19, \[sp, #12, mul vl\] ++** str z20, \[sp, #13, mul vl\] ++** str z21, \[sp, #14, mul vl\] ++** str z22, \[sp, #15, mul vl\] ++** str z23, \[sp, #16, mul vl\] ++** bl standard_callee ++** ptrue p0\.b, all ++** ld1d z8\.d, p0/z, \[sp, #1, mul vl\] ++** ld1d z9\.d, p0/z, \[sp, #2, mul vl\] ++** ld1d z10\.d, p0/z, \[sp, #3, mul vl\] ++** ld1d z11\.d, p0/z, \[sp, #4, mul vl\] ++** ld1d z12\.d, p0/z, \[sp, #5, mul vl\] ++** ld1d z13\.d, p0/z, \[sp, #6, mul vl\] ++** ld1d z14\.d, p0/z, \[sp, #7, mul vl\] ++** addvl x11, sp, #16 ++** ld1d z15\.d, p0/z, \[x11, #-8, mul vl\] ++** ldr z16, \[sp, #9, mul vl\] ++** ldr z17, \[sp, #10, mul vl\] ++** ldr z18, \[sp, #11, mul vl\] ++** ldr z19, \[sp, #12, mul vl\] ++** ldr z20, \[sp, #13, mul vl\] ++** ldr z21, \[sp, #14, mul vl\] ++** ldr z22, \[sp, #15, mul vl\] ++** ldr z23, \[sp, #16, mul vl\] ++** ldr p4, \[sp\] ++** ldr p5, \[sp, #1, mul vl\] ++** ldr p6, \[sp, #2, mul vl\] ++** ldr p7, \[sp, #3, mul vl\] ++** ldr p8, \[sp, #4, mul vl\] ++** ldr p9, \[sp, #5, mul vl\] ++** ldr p10, \[sp, #6, mul vl\] ++** ldr p11, \[sp, #7, mul vl\] ++** addvl sp, sp, #17 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++void calls_standard (__SVInt8_t x) { standard_callee (); } ++ ++/* ++** calls_vpcs: ++** stp x29, x30, \[sp, -16\]! 
++** mov x29, sp ++** addvl sp, sp, #-17 ++** str p4, \[sp\] ++** str p5, \[sp, #1, mul vl\] ++** str p6, \[sp, #2, mul vl\] ++** str p7, \[sp, #3, mul vl\] ++** str p8, \[sp, #4, mul vl\] ++** str p9, \[sp, #5, mul vl\] ++** str p10, \[sp, #6, mul vl\] ++** str p11, \[sp, #7, mul vl\] ++** ptrue p0\.b, all ++** st1d z8\.d, p0, \[sp, #1, mul vl\] ++** st1d z9\.d, p0, \[sp, #2, mul vl\] ++** st1d z10\.d, p0, \[sp, #3, mul vl\] ++** st1d z11\.d, p0, \[sp, #4, mul vl\] ++** st1d z12\.d, p0, \[sp, #5, mul vl\] ++** st1d z13\.d, p0, \[sp, #6, mul vl\] ++** st1d z14\.d, p0, \[sp, #7, mul vl\] ++** addvl x11, sp, #16 ++** st1d z15\.d, p0, \[x11, #-8, mul vl\] ++** str z16, \[sp, #9, mul vl\] ++** str z17, \[sp, #10, mul vl\] ++** str z18, \[sp, #11, mul vl\] ++** str z19, \[sp, #12, mul vl\] ++** str z20, \[sp, #13, mul vl\] ++** str z21, \[sp, #14, mul vl\] ++** str z22, \[sp, #15, mul vl\] ++** str z23, \[sp, #16, mul vl\] ++** bl vpcs_callee ++** ptrue p0\.b, all ++** ld1d z8\.d, p0/z, \[sp, #1, mul vl\] ++** ld1d z9\.d, p0/z, \[sp, #2, mul vl\] ++** ld1d z10\.d, p0/z, \[sp, #3, mul vl\] ++** ld1d z11\.d, p0/z, \[sp, #4, mul vl\] ++** ld1d z12\.d, p0/z, \[sp, #5, mul vl\] ++** ld1d z13\.d, p0/z, \[sp, #6, mul vl\] ++** ld1d z14\.d, p0/z, \[sp, #7, mul vl\] ++** addvl x11, sp, #16 ++** ld1d z15\.d, p0/z, \[x11, #-8, mul vl\] ++** ldr z16, \[sp, #9, mul vl\] ++** ldr z17, \[sp, #10, mul vl\] ++** ldr z18, \[sp, #11, mul vl\] ++** ldr z19, \[sp, #12, mul vl\] ++** ldr z20, \[sp, #13, mul vl\] ++** ldr z21, \[sp, #14, mul vl\] ++** ldr z22, \[sp, #15, mul vl\] ++** ldr z23, \[sp, #16, mul vl\] ++** ldr p4, \[sp\] ++** ldr p5, \[sp, #1, mul vl\] ++** ldr p6, \[sp, #2, mul vl\] ++** ldr p7, \[sp, #3, mul vl\] ++** ldr p8, \[sp, #4, mul vl\] ++** ldr p9, \[sp, #5, mul vl\] ++** ldr p10, \[sp, #6, mul vl\] ++** ldr p11, \[sp, #7, mul vl\] ++** addvl sp, sp, #17 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++void calls_vpcs (__SVInt8_t x) { vpcs_callee (); } ++ ++/* ++** calls_standard_ptr: ++** stp x29, x30, \[sp, -16\]! 
++** mov x29, sp ++** addvl sp, sp, #-17 ++** str p4, \[sp\] ++** str p5, \[sp, #1, mul vl\] ++** str p6, \[sp, #2, mul vl\] ++** str p7, \[sp, #3, mul vl\] ++** str p8, \[sp, #4, mul vl\] ++** str p9, \[sp, #5, mul vl\] ++** str p10, \[sp, #6, mul vl\] ++** str p11, \[sp, #7, mul vl\] ++** ptrue p0\.b, all ++** st1d z8\.d, p0, \[sp, #1, mul vl\] ++** st1d z9\.d, p0, \[sp, #2, mul vl\] ++** st1d z10\.d, p0, \[sp, #3, mul vl\] ++** st1d z11\.d, p0, \[sp, #4, mul vl\] ++** st1d z12\.d, p0, \[sp, #5, mul vl\] ++** st1d z13\.d, p0, \[sp, #6, mul vl\] ++** st1d z14\.d, p0, \[sp, #7, mul vl\] ++** addvl x11, sp, #16 ++** st1d z15\.d, p0, \[x11, #-8, mul vl\] ++** str z16, \[sp, #9, mul vl\] ++** str z17, \[sp, #10, mul vl\] ++** str z18, \[sp, #11, mul vl\] ++** str z19, \[sp, #12, mul vl\] ++** str z20, \[sp, #13, mul vl\] ++** str z21, \[sp, #14, mul vl\] ++** str z22, \[sp, #15, mul vl\] ++** str z23, \[sp, #16, mul vl\] ++** blr x0 ++** ptrue p0\.b, all ++** ld1d z8\.d, p0/z, \[sp, #1, mul vl\] ++** ld1d z9\.d, p0/z, \[sp, #2, mul vl\] ++** ld1d z10\.d, p0/z, \[sp, #3, mul vl\] ++** ld1d z11\.d, p0/z, \[sp, #4, mul vl\] ++** ld1d z12\.d, p0/z, \[sp, #5, mul vl\] ++** ld1d z13\.d, p0/z, \[sp, #6, mul vl\] ++** ld1d z14\.d, p0/z, \[sp, #7, mul vl\] ++** addvl x11, sp, #16 ++** ld1d z15\.d, p0/z, \[x11, #-8, mul vl\] ++** ldr z16, \[sp, #9, mul vl\] ++** ldr z17, \[sp, #10, mul vl\] ++** ldr z18, \[sp, #11, mul vl\] ++** ldr z19, \[sp, #12, mul vl\] ++** ldr z20, \[sp, #13, mul vl\] ++** ldr z21, \[sp, #14, mul vl\] ++** ldr z22, \[sp, #15, mul vl\] ++** ldr z23, \[sp, #16, mul vl\] ++** ldr p4, \[sp\] ++** ldr p5, \[sp, #1, mul vl\] ++** ldr p6, \[sp, #2, mul vl\] ++** ldr p7, \[sp, #3, mul vl\] ++** ldr p8, \[sp, #4, mul vl\] ++** ldr p9, \[sp, #5, mul vl\] ++** ldr p10, \[sp, #6, mul vl\] ++** ldr p11, \[sp, #7, mul vl\] ++** addvl sp, sp, #17 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++void ++calls_standard_ptr (__SVInt8_t x, void (*fn) (void)) ++{ ++ fn (); ++} ++ ++/* ++** calls_vpcs_ptr: ++** stp x29, x30, \[sp, -16\]! 
++** mov x29, sp ++** addvl sp, sp, #-17 ++** str p4, \[sp\] ++** str p5, \[sp, #1, mul vl\] ++** str p6, \[sp, #2, mul vl\] ++** str p7, \[sp, #3, mul vl\] ++** str p8, \[sp, #4, mul vl\] ++** str p9, \[sp, #5, mul vl\] ++** str p10, \[sp, #6, mul vl\] ++** str p11, \[sp, #7, mul vl\] ++** ptrue p0\.b, all ++** st1d z8\.d, p0, \[sp, #1, mul vl\] ++** st1d z9\.d, p0, \[sp, #2, mul vl\] ++** st1d z10\.d, p0, \[sp, #3, mul vl\] ++** st1d z11\.d, p0, \[sp, #4, mul vl\] ++** st1d z12\.d, p0, \[sp, #5, mul vl\] ++** st1d z13\.d, p0, \[sp, #6, mul vl\] ++** st1d z14\.d, p0, \[sp, #7, mul vl\] ++** addvl x11, sp, #16 ++** st1d z15\.d, p0, \[x11, #-8, mul vl\] ++** str z16, \[sp, #9, mul vl\] ++** str z17, \[sp, #10, mul vl\] ++** str z18, \[sp, #11, mul vl\] ++** str z19, \[sp, #12, mul vl\] ++** str z20, \[sp, #13, mul vl\] ++** str z21, \[sp, #14, mul vl\] ++** str z22, \[sp, #15, mul vl\] ++** str z23, \[sp, #16, mul vl\] ++** blr x0 ++** ptrue p0\.b, all ++** ld1d z8\.d, p0/z, \[sp, #1, mul vl\] ++** ld1d z9\.d, p0/z, \[sp, #2, mul vl\] ++** ld1d z10\.d, p0/z, \[sp, #3, mul vl\] ++** ld1d z11\.d, p0/z, \[sp, #4, mul vl\] ++** ld1d z12\.d, p0/z, \[sp, #5, mul vl\] ++** ld1d z13\.d, p0/z, \[sp, #6, mul vl\] ++** ld1d z14\.d, p0/z, \[sp, #7, mul vl\] ++** addvl x11, sp, #16 ++** ld1d z15\.d, p0/z, \[x11, #-8, mul vl\] ++** ldr z16, \[sp, #9, mul vl\] ++** ldr z17, \[sp, #10, mul vl\] ++** ldr z18, \[sp, #11, mul vl\] ++** ldr z19, \[sp, #12, mul vl\] ++** ldr z20, \[sp, #13, mul vl\] ++** ldr z21, \[sp, #14, mul vl\] ++** ldr z22, \[sp, #15, mul vl\] ++** ldr z23, \[sp, #16, mul vl\] ++** ldr p4, \[sp\] ++** ldr p5, \[sp, #1, mul vl\] ++** ldr p6, \[sp, #2, mul vl\] ++** ldr p7, \[sp, #3, mul vl\] ++** ldr p8, \[sp, #4, mul vl\] ++** ldr p9, \[sp, #5, mul vl\] ++** ldr p10, \[sp, #6, mul vl\] ++** ldr p11, \[sp, #7, mul vl\] ++** addvl sp, sp, #17 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++void ++calls_vpcs_ptr (__SVInt8_t x, ++ void (*__attribute__((aarch64_vector_pcs)) fn) (void)) ++{ ++ fn (); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_2_be_wrap.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_2_be_wrap.c +new file mode 100644 +index 000000000..85b7794d7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_2_be_wrap.c +@@ -0,0 +1,271 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -mbig-endian -fshrink-wrap -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ ++ ++void standard_callee (void); ++__attribute__((aarch64_vector_pcs)) void vpcs_callee (void); ++ ++/* ++** calls_standard: ++** stp x29, x30, \[sp, -16\]! 
++** mov x29, sp ++** addvl sp, sp, #-17 ++** str p4, \[sp\] ++** str p5, \[sp, #1, mul vl\] ++** str p6, \[sp, #2, mul vl\] ++** str p7, \[sp, #3, mul vl\] ++** str p8, \[sp, #4, mul vl\] ++** str p9, \[sp, #5, mul vl\] ++** str p10, \[sp, #6, mul vl\] ++** str p11, \[sp, #7, mul vl\] ++** ptrue p0\.b, all ++** st1d z8\.d, p0, \[sp, #1, mul vl\] ++** st1d z9\.d, p0, \[sp, #2, mul vl\] ++** st1d z10\.d, p0, \[sp, #3, mul vl\] ++** st1d z11\.d, p0, \[sp, #4, mul vl\] ++** st1d z12\.d, p0, \[sp, #5, mul vl\] ++** st1d z13\.d, p0, \[sp, #6, mul vl\] ++** st1d z14\.d, p0, \[sp, #7, mul vl\] ++** addvl x11, sp, #16 ++** st1d z15\.d, p0, \[x11, #-8, mul vl\] ++** str z16, \[sp, #9, mul vl\] ++** str z17, \[sp, #10, mul vl\] ++** str z18, \[sp, #11, mul vl\] ++** str z19, \[sp, #12, mul vl\] ++** str z20, \[sp, #13, mul vl\] ++** str z21, \[sp, #14, mul vl\] ++** str z22, \[sp, #15, mul vl\] ++** str z23, \[sp, #16, mul vl\] ++** bl standard_callee ++** ptrue p0\.b, all ++** ld1d z8\.d, p0/z, \[sp, #1, mul vl\] ++** ld1d z9\.d, p0/z, \[sp, #2, mul vl\] ++** ld1d z10\.d, p0/z, \[sp, #3, mul vl\] ++** ld1d z11\.d, p0/z, \[sp, #4, mul vl\] ++** ld1d z12\.d, p0/z, \[sp, #5, mul vl\] ++** ld1d z13\.d, p0/z, \[sp, #6, mul vl\] ++** ld1d z14\.d, p0/z, \[sp, #7, mul vl\] ++** addvl x11, sp, #16 ++** ld1d z15\.d, p0/z, \[x11, #-8, mul vl\] ++** ldr z16, \[sp, #9, mul vl\] ++** ldr z17, \[sp, #10, mul vl\] ++** ldr z18, \[sp, #11, mul vl\] ++** ldr z19, \[sp, #12, mul vl\] ++** ldr z20, \[sp, #13, mul vl\] ++** ldr z21, \[sp, #14, mul vl\] ++** ldr z22, \[sp, #15, mul vl\] ++** ldr z23, \[sp, #16, mul vl\] ++** ldr p4, \[sp\] ++** ldr p5, \[sp, #1, mul vl\] ++** ldr p6, \[sp, #2, mul vl\] ++** ldr p7, \[sp, #3, mul vl\] ++** ldr p8, \[sp, #4, mul vl\] ++** ldr p9, \[sp, #5, mul vl\] ++** ldr p10, \[sp, #6, mul vl\] ++** ldr p11, \[sp, #7, mul vl\] ++** addvl sp, sp, #17 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++void calls_standard (__SVInt8_t x) { standard_callee (); } ++ ++/* ++** calls_vpcs: ++** stp x29, x30, \[sp, -16\]! 
++** mov x29, sp ++** addvl sp, sp, #-17 ++** str p4, \[sp\] ++** str p5, \[sp, #1, mul vl\] ++** str p6, \[sp, #2, mul vl\] ++** str p7, \[sp, #3, mul vl\] ++** str p8, \[sp, #4, mul vl\] ++** str p9, \[sp, #5, mul vl\] ++** str p10, \[sp, #6, mul vl\] ++** str p11, \[sp, #7, mul vl\] ++** ptrue p0\.b, all ++** st1d z8\.d, p0, \[sp, #1, mul vl\] ++** st1d z9\.d, p0, \[sp, #2, mul vl\] ++** st1d z10\.d, p0, \[sp, #3, mul vl\] ++** st1d z11\.d, p0, \[sp, #4, mul vl\] ++** st1d z12\.d, p0, \[sp, #5, mul vl\] ++** st1d z13\.d, p0, \[sp, #6, mul vl\] ++** st1d z14\.d, p0, \[sp, #7, mul vl\] ++** addvl x11, sp, #16 ++** st1d z15\.d, p0, \[x11, #-8, mul vl\] ++** str z16, \[sp, #9, mul vl\] ++** str z17, \[sp, #10, mul vl\] ++** str z18, \[sp, #11, mul vl\] ++** str z19, \[sp, #12, mul vl\] ++** str z20, \[sp, #13, mul vl\] ++** str z21, \[sp, #14, mul vl\] ++** str z22, \[sp, #15, mul vl\] ++** str z23, \[sp, #16, mul vl\] ++** bl vpcs_callee ++** ptrue p0\.b, all ++** ld1d z8\.d, p0/z, \[sp, #1, mul vl\] ++** ld1d z9\.d, p0/z, \[sp, #2, mul vl\] ++** ld1d z10\.d, p0/z, \[sp, #3, mul vl\] ++** ld1d z11\.d, p0/z, \[sp, #4, mul vl\] ++** ld1d z12\.d, p0/z, \[sp, #5, mul vl\] ++** ld1d z13\.d, p0/z, \[sp, #6, mul vl\] ++** ld1d z14\.d, p0/z, \[sp, #7, mul vl\] ++** addvl x11, sp, #16 ++** ld1d z15\.d, p0/z, \[x11, #-8, mul vl\] ++** ldr z16, \[sp, #9, mul vl\] ++** ldr z17, \[sp, #10, mul vl\] ++** ldr z18, \[sp, #11, mul vl\] ++** ldr z19, \[sp, #12, mul vl\] ++** ldr z20, \[sp, #13, mul vl\] ++** ldr z21, \[sp, #14, mul vl\] ++** ldr z22, \[sp, #15, mul vl\] ++** ldr z23, \[sp, #16, mul vl\] ++** ldr p4, \[sp\] ++** ldr p5, \[sp, #1, mul vl\] ++** ldr p6, \[sp, #2, mul vl\] ++** ldr p7, \[sp, #3, mul vl\] ++** ldr p8, \[sp, #4, mul vl\] ++** ldr p9, \[sp, #5, mul vl\] ++** ldr p10, \[sp, #6, mul vl\] ++** ldr p11, \[sp, #7, mul vl\] ++** addvl sp, sp, #17 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++void calls_vpcs (__SVInt8_t x) { vpcs_callee (); } ++ ++/* ++** calls_standard_ptr: ++** stp x29, x30, \[sp, -16\]! 
++** mov x29, sp ++** addvl sp, sp, #-17 ++** str p4, \[sp\] ++** str p5, \[sp, #1, mul vl\] ++** str p6, \[sp, #2, mul vl\] ++** str p7, \[sp, #3, mul vl\] ++** str p8, \[sp, #4, mul vl\] ++** str p9, \[sp, #5, mul vl\] ++** str p10, \[sp, #6, mul vl\] ++** str p11, \[sp, #7, mul vl\] ++** ptrue p0\.b, all ++** st1d z8\.d, p0, \[sp, #1, mul vl\] ++** st1d z9\.d, p0, \[sp, #2, mul vl\] ++** st1d z10\.d, p0, \[sp, #3, mul vl\] ++** st1d z11\.d, p0, \[sp, #4, mul vl\] ++** st1d z12\.d, p0, \[sp, #5, mul vl\] ++** st1d z13\.d, p0, \[sp, #6, mul vl\] ++** st1d z14\.d, p0, \[sp, #7, mul vl\] ++** addvl x11, sp, #16 ++** st1d z15\.d, p0, \[x11, #-8, mul vl\] ++** str z16, \[sp, #9, mul vl\] ++** str z17, \[sp, #10, mul vl\] ++** str z18, \[sp, #11, mul vl\] ++** str z19, \[sp, #12, mul vl\] ++** str z20, \[sp, #13, mul vl\] ++** str z21, \[sp, #14, mul vl\] ++** str z22, \[sp, #15, mul vl\] ++** str z23, \[sp, #16, mul vl\] ++** blr x0 ++** ptrue p0\.b, all ++** ld1d z8\.d, p0/z, \[sp, #1, mul vl\] ++** ld1d z9\.d, p0/z, \[sp, #2, mul vl\] ++** ld1d z10\.d, p0/z, \[sp, #3, mul vl\] ++** ld1d z11\.d, p0/z, \[sp, #4, mul vl\] ++** ld1d z12\.d, p0/z, \[sp, #5, mul vl\] ++** ld1d z13\.d, p0/z, \[sp, #6, mul vl\] ++** ld1d z14\.d, p0/z, \[sp, #7, mul vl\] ++** addvl x11, sp, #16 ++** ld1d z15\.d, p0/z, \[x11, #-8, mul vl\] ++** ldr z16, \[sp, #9, mul vl\] ++** ldr z17, \[sp, #10, mul vl\] ++** ldr z18, \[sp, #11, mul vl\] ++** ldr z19, \[sp, #12, mul vl\] ++** ldr z20, \[sp, #13, mul vl\] ++** ldr z21, \[sp, #14, mul vl\] ++** ldr z22, \[sp, #15, mul vl\] ++** ldr z23, \[sp, #16, mul vl\] ++** ldr p4, \[sp\] ++** ldr p5, \[sp, #1, mul vl\] ++** ldr p6, \[sp, #2, mul vl\] ++** ldr p7, \[sp, #3, mul vl\] ++** ldr p8, \[sp, #4, mul vl\] ++** ldr p9, \[sp, #5, mul vl\] ++** ldr p10, \[sp, #6, mul vl\] ++** ldr p11, \[sp, #7, mul vl\] ++** addvl sp, sp, #17 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++void ++calls_standard_ptr (__SVInt8_t x, void (*fn) (void)) ++{ ++ fn (); ++} ++ ++/* ++** calls_vpcs_ptr: ++** stp x29, x30, \[sp, -16\]! 
++** mov x29, sp ++** addvl sp, sp, #-17 ++** str p4, \[sp\] ++** str p5, \[sp, #1, mul vl\] ++** str p6, \[sp, #2, mul vl\] ++** str p7, \[sp, #3, mul vl\] ++** str p8, \[sp, #4, mul vl\] ++** str p9, \[sp, #5, mul vl\] ++** str p10, \[sp, #6, mul vl\] ++** str p11, \[sp, #7, mul vl\] ++** ptrue p0\.b, all ++** st1d z8\.d, p0, \[sp, #1, mul vl\] ++** st1d z9\.d, p0, \[sp, #2, mul vl\] ++** st1d z10\.d, p0, \[sp, #3, mul vl\] ++** st1d z11\.d, p0, \[sp, #4, mul vl\] ++** st1d z12\.d, p0, \[sp, #5, mul vl\] ++** st1d z13\.d, p0, \[sp, #6, mul vl\] ++** st1d z14\.d, p0, \[sp, #7, mul vl\] ++** addvl x11, sp, #16 ++** st1d z15\.d, p0, \[x11, #-8, mul vl\] ++** str z16, \[sp, #9, mul vl\] ++** str z17, \[sp, #10, mul vl\] ++** str z18, \[sp, #11, mul vl\] ++** str z19, \[sp, #12, mul vl\] ++** str z20, \[sp, #13, mul vl\] ++** str z21, \[sp, #14, mul vl\] ++** str z22, \[sp, #15, mul vl\] ++** str z23, \[sp, #16, mul vl\] ++** blr x0 ++** ptrue p0\.b, all ++** ld1d z8\.d, p0/z, \[sp, #1, mul vl\] ++** ld1d z9\.d, p0/z, \[sp, #2, mul vl\] ++** ld1d z10\.d, p0/z, \[sp, #3, mul vl\] ++** ld1d z11\.d, p0/z, \[sp, #4, mul vl\] ++** ld1d z12\.d, p0/z, \[sp, #5, mul vl\] ++** ld1d z13\.d, p0/z, \[sp, #6, mul vl\] ++** ld1d z14\.d, p0/z, \[sp, #7, mul vl\] ++** addvl x11, sp, #16 ++** ld1d z15\.d, p0/z, \[x11, #-8, mul vl\] ++** ldr z16, \[sp, #9, mul vl\] ++** ldr z17, \[sp, #10, mul vl\] ++** ldr z18, \[sp, #11, mul vl\] ++** ldr z19, \[sp, #12, mul vl\] ++** ldr z20, \[sp, #13, mul vl\] ++** ldr z21, \[sp, #14, mul vl\] ++** ldr z22, \[sp, #15, mul vl\] ++** ldr z23, \[sp, #16, mul vl\] ++** ldr p4, \[sp\] ++** ldr p5, \[sp, #1, mul vl\] ++** ldr p6, \[sp, #2, mul vl\] ++** ldr p7, \[sp, #3, mul vl\] ++** ldr p8, \[sp, #4, mul vl\] ++** ldr p9, \[sp, #5, mul vl\] ++** ldr p10, \[sp, #6, mul vl\] ++** ldr p11, \[sp, #7, mul vl\] ++** addvl sp, sp, #17 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++void ++calls_vpcs_ptr (__SVInt8_t x, ++ void (*__attribute__((aarch64_vector_pcs)) fn) (void)) ++{ ++ fn (); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_2_le_nowrap.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_2_le_nowrap.c +new file mode 100644 +index 000000000..0fcd357a0 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_2_le_nowrap.c +@@ -0,0 +1,255 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -mlittle-endian -fno-shrink-wrap -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ ++ ++void standard_callee (void); ++__attribute__((aarch64_vector_pcs)) void vpcs_callee (void); ++ ++/* ++** calls_standard: ++** stp x29, x30, \[sp, -16\]! 
++** mov x29, sp ++** addvl sp, sp, #-17 ++** str p4, \[sp\] ++** str p5, \[sp, #1, mul vl\] ++** str p6, \[sp, #2, mul vl\] ++** str p7, \[sp, #3, mul vl\] ++** str p8, \[sp, #4, mul vl\] ++** str p9, \[sp, #5, mul vl\] ++** str p10, \[sp, #6, mul vl\] ++** str p11, \[sp, #7, mul vl\] ++** str z8, \[sp, #1, mul vl\] ++** str z9, \[sp, #2, mul vl\] ++** str z10, \[sp, #3, mul vl\] ++** str z11, \[sp, #4, mul vl\] ++** str z12, \[sp, #5, mul vl\] ++** str z13, \[sp, #6, mul vl\] ++** str z14, \[sp, #7, mul vl\] ++** str z15, \[sp, #8, mul vl\] ++** str z16, \[sp, #9, mul vl\] ++** str z17, \[sp, #10, mul vl\] ++** str z18, \[sp, #11, mul vl\] ++** str z19, \[sp, #12, mul vl\] ++** str z20, \[sp, #13, mul vl\] ++** str z21, \[sp, #14, mul vl\] ++** str z22, \[sp, #15, mul vl\] ++** str z23, \[sp, #16, mul vl\] ++** bl standard_callee ++** ldr z8, \[sp, #1, mul vl\] ++** ldr z9, \[sp, #2, mul vl\] ++** ldr z10, \[sp, #3, mul vl\] ++** ldr z11, \[sp, #4, mul vl\] ++** ldr z12, \[sp, #5, mul vl\] ++** ldr z13, \[sp, #6, mul vl\] ++** ldr z14, \[sp, #7, mul vl\] ++** ldr z15, \[sp, #8, mul vl\] ++** ldr z16, \[sp, #9, mul vl\] ++** ldr z17, \[sp, #10, mul vl\] ++** ldr z18, \[sp, #11, mul vl\] ++** ldr z19, \[sp, #12, mul vl\] ++** ldr z20, \[sp, #13, mul vl\] ++** ldr z21, \[sp, #14, mul vl\] ++** ldr z22, \[sp, #15, mul vl\] ++** ldr z23, \[sp, #16, mul vl\] ++** ldr p4, \[sp\] ++** ldr p5, \[sp, #1, mul vl\] ++** ldr p6, \[sp, #2, mul vl\] ++** ldr p7, \[sp, #3, mul vl\] ++** ldr p8, \[sp, #4, mul vl\] ++** ldr p9, \[sp, #5, mul vl\] ++** ldr p10, \[sp, #6, mul vl\] ++** ldr p11, \[sp, #7, mul vl\] ++** addvl sp, sp, #17 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++void calls_standard (__SVInt8_t x) { standard_callee (); } ++ ++/* ++** calls_vpcs: ++** stp x29, x30, \[sp, -16\]! 
++** mov x29, sp ++** addvl sp, sp, #-17 ++** str p4, \[sp\] ++** str p5, \[sp, #1, mul vl\] ++** str p6, \[sp, #2, mul vl\] ++** str p7, \[sp, #3, mul vl\] ++** str p8, \[sp, #4, mul vl\] ++** str p9, \[sp, #5, mul vl\] ++** str p10, \[sp, #6, mul vl\] ++** str p11, \[sp, #7, mul vl\] ++** str z8, \[sp, #1, mul vl\] ++** str z9, \[sp, #2, mul vl\] ++** str z10, \[sp, #3, mul vl\] ++** str z11, \[sp, #4, mul vl\] ++** str z12, \[sp, #5, mul vl\] ++** str z13, \[sp, #6, mul vl\] ++** str z14, \[sp, #7, mul vl\] ++** str z15, \[sp, #8, mul vl\] ++** str z16, \[sp, #9, mul vl\] ++** str z17, \[sp, #10, mul vl\] ++** str z18, \[sp, #11, mul vl\] ++** str z19, \[sp, #12, mul vl\] ++** str z20, \[sp, #13, mul vl\] ++** str z21, \[sp, #14, mul vl\] ++** str z22, \[sp, #15, mul vl\] ++** str z23, \[sp, #16, mul vl\] ++** bl vpcs_callee ++** ldr z8, \[sp, #1, mul vl\] ++** ldr z9, \[sp, #2, mul vl\] ++** ldr z10, \[sp, #3, mul vl\] ++** ldr z11, \[sp, #4, mul vl\] ++** ldr z12, \[sp, #5, mul vl\] ++** ldr z13, \[sp, #6, mul vl\] ++** ldr z14, \[sp, #7, mul vl\] ++** ldr z15, \[sp, #8, mul vl\] ++** ldr z16, \[sp, #9, mul vl\] ++** ldr z17, \[sp, #10, mul vl\] ++** ldr z18, \[sp, #11, mul vl\] ++** ldr z19, \[sp, #12, mul vl\] ++** ldr z20, \[sp, #13, mul vl\] ++** ldr z21, \[sp, #14, mul vl\] ++** ldr z22, \[sp, #15, mul vl\] ++** ldr z23, \[sp, #16, mul vl\] ++** ldr p4, \[sp\] ++** ldr p5, \[sp, #1, mul vl\] ++** ldr p6, \[sp, #2, mul vl\] ++** ldr p7, \[sp, #3, mul vl\] ++** ldr p8, \[sp, #4, mul vl\] ++** ldr p9, \[sp, #5, mul vl\] ++** ldr p10, \[sp, #6, mul vl\] ++** ldr p11, \[sp, #7, mul vl\] ++** addvl sp, sp, #17 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++void calls_vpcs (__SVInt8_t x) { vpcs_callee (); } ++ ++/* ++** calls_standard_ptr: ++** stp x29, x30, \[sp, -16\]! 
++** mov x29, sp ++** addvl sp, sp, #-17 ++** str p4, \[sp\] ++** str p5, \[sp, #1, mul vl\] ++** str p6, \[sp, #2, mul vl\] ++** str p7, \[sp, #3, mul vl\] ++** str p8, \[sp, #4, mul vl\] ++** str p9, \[sp, #5, mul vl\] ++** str p10, \[sp, #6, mul vl\] ++** str p11, \[sp, #7, mul vl\] ++** str z8, \[sp, #1, mul vl\] ++** str z9, \[sp, #2, mul vl\] ++** str z10, \[sp, #3, mul vl\] ++** str z11, \[sp, #4, mul vl\] ++** str z12, \[sp, #5, mul vl\] ++** str z13, \[sp, #6, mul vl\] ++** str z14, \[sp, #7, mul vl\] ++** str z15, \[sp, #8, mul vl\] ++** str z16, \[sp, #9, mul vl\] ++** str z17, \[sp, #10, mul vl\] ++** str z18, \[sp, #11, mul vl\] ++** str z19, \[sp, #12, mul vl\] ++** str z20, \[sp, #13, mul vl\] ++** str z21, \[sp, #14, mul vl\] ++** str z22, \[sp, #15, mul vl\] ++** str z23, \[sp, #16, mul vl\] ++** blr x0 ++** ldr z8, \[sp, #1, mul vl\] ++** ldr z9, \[sp, #2, mul vl\] ++** ldr z10, \[sp, #3, mul vl\] ++** ldr z11, \[sp, #4, mul vl\] ++** ldr z12, \[sp, #5, mul vl\] ++** ldr z13, \[sp, #6, mul vl\] ++** ldr z14, \[sp, #7, mul vl\] ++** ldr z15, \[sp, #8, mul vl\] ++** ldr z16, \[sp, #9, mul vl\] ++** ldr z17, \[sp, #10, mul vl\] ++** ldr z18, \[sp, #11, mul vl\] ++** ldr z19, \[sp, #12, mul vl\] ++** ldr z20, \[sp, #13, mul vl\] ++** ldr z21, \[sp, #14, mul vl\] ++** ldr z22, \[sp, #15, mul vl\] ++** ldr z23, \[sp, #16, mul vl\] ++** ldr p4, \[sp\] ++** ldr p5, \[sp, #1, mul vl\] ++** ldr p6, \[sp, #2, mul vl\] ++** ldr p7, \[sp, #3, mul vl\] ++** ldr p8, \[sp, #4, mul vl\] ++** ldr p9, \[sp, #5, mul vl\] ++** ldr p10, \[sp, #6, mul vl\] ++** ldr p11, \[sp, #7, mul vl\] ++** addvl sp, sp, #17 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++void ++calls_standard_ptr (__SVInt8_t x, void (*fn) (void)) ++{ ++ fn (); ++} ++ ++/* ++** calls_vpcs_ptr: ++** stp x29, x30, \[sp, -16\]! 
++** mov x29, sp ++** addvl sp, sp, #-17 ++** str p4, \[sp\] ++** str p5, \[sp, #1, mul vl\] ++** str p6, \[sp, #2, mul vl\] ++** str p7, \[sp, #3, mul vl\] ++** str p8, \[sp, #4, mul vl\] ++** str p9, \[sp, #5, mul vl\] ++** str p10, \[sp, #6, mul vl\] ++** str p11, \[sp, #7, mul vl\] ++** str z8, \[sp, #1, mul vl\] ++** str z9, \[sp, #2, mul vl\] ++** str z10, \[sp, #3, mul vl\] ++** str z11, \[sp, #4, mul vl\] ++** str z12, \[sp, #5, mul vl\] ++** str z13, \[sp, #6, mul vl\] ++** str z14, \[sp, #7, mul vl\] ++** str z15, \[sp, #8, mul vl\] ++** str z16, \[sp, #9, mul vl\] ++** str z17, \[sp, #10, mul vl\] ++** str z18, \[sp, #11, mul vl\] ++** str z19, \[sp, #12, mul vl\] ++** str z20, \[sp, #13, mul vl\] ++** str z21, \[sp, #14, mul vl\] ++** str z22, \[sp, #15, mul vl\] ++** str z23, \[sp, #16, mul vl\] ++** blr x0 ++** ldr z8, \[sp, #1, mul vl\] ++** ldr z9, \[sp, #2, mul vl\] ++** ldr z10, \[sp, #3, mul vl\] ++** ldr z11, \[sp, #4, mul vl\] ++** ldr z12, \[sp, #5, mul vl\] ++** ldr z13, \[sp, #6, mul vl\] ++** ldr z14, \[sp, #7, mul vl\] ++** ldr z15, \[sp, #8, mul vl\] ++** ldr z16, \[sp, #9, mul vl\] ++** ldr z17, \[sp, #10, mul vl\] ++** ldr z18, \[sp, #11, mul vl\] ++** ldr z19, \[sp, #12, mul vl\] ++** ldr z20, \[sp, #13, mul vl\] ++** ldr z21, \[sp, #14, mul vl\] ++** ldr z22, \[sp, #15, mul vl\] ++** ldr z23, \[sp, #16, mul vl\] ++** ldr p4, \[sp\] ++** ldr p5, \[sp, #1, mul vl\] ++** ldr p6, \[sp, #2, mul vl\] ++** ldr p7, \[sp, #3, mul vl\] ++** ldr p8, \[sp, #4, mul vl\] ++** ldr p9, \[sp, #5, mul vl\] ++** ldr p10, \[sp, #6, mul vl\] ++** ldr p11, \[sp, #7, mul vl\] ++** addvl sp, sp, #17 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++void ++calls_vpcs_ptr (__SVInt8_t x, ++ void (*__attribute__((aarch64_vector_pcs)) fn) (void)) ++{ ++ fn (); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_2_le_wrap.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_2_le_wrap.c +new file mode 100644 +index 000000000..e81194c74 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_2_le_wrap.c +@@ -0,0 +1,255 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -mlittle-endian -fshrink-wrap -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ ++ ++void standard_callee (void); ++__attribute__((aarch64_vector_pcs)) void vpcs_callee (void); ++ ++/* ++** calls_standard: ++** stp x29, x30, \[sp, -16\]! 
++** mov x29, sp ++** addvl sp, sp, #-17 ++** str p4, \[sp\] ++** str p5, \[sp, #1, mul vl\] ++** str p6, \[sp, #2, mul vl\] ++** str p7, \[sp, #3, mul vl\] ++** str p8, \[sp, #4, mul vl\] ++** str p9, \[sp, #5, mul vl\] ++** str p10, \[sp, #6, mul vl\] ++** str p11, \[sp, #7, mul vl\] ++** str z8, \[sp, #1, mul vl\] ++** str z9, \[sp, #2, mul vl\] ++** str z10, \[sp, #3, mul vl\] ++** str z11, \[sp, #4, mul vl\] ++** str z12, \[sp, #5, mul vl\] ++** str z13, \[sp, #6, mul vl\] ++** str z14, \[sp, #7, mul vl\] ++** str z15, \[sp, #8, mul vl\] ++** str z16, \[sp, #9, mul vl\] ++** str z17, \[sp, #10, mul vl\] ++** str z18, \[sp, #11, mul vl\] ++** str z19, \[sp, #12, mul vl\] ++** str z20, \[sp, #13, mul vl\] ++** str z21, \[sp, #14, mul vl\] ++** str z22, \[sp, #15, mul vl\] ++** str z23, \[sp, #16, mul vl\] ++** bl standard_callee ++** ldr z8, \[sp, #1, mul vl\] ++** ldr z9, \[sp, #2, mul vl\] ++** ldr z10, \[sp, #3, mul vl\] ++** ldr z11, \[sp, #4, mul vl\] ++** ldr z12, \[sp, #5, mul vl\] ++** ldr z13, \[sp, #6, mul vl\] ++** ldr z14, \[sp, #7, mul vl\] ++** ldr z15, \[sp, #8, mul vl\] ++** ldr z16, \[sp, #9, mul vl\] ++** ldr z17, \[sp, #10, mul vl\] ++** ldr z18, \[sp, #11, mul vl\] ++** ldr z19, \[sp, #12, mul vl\] ++** ldr z20, \[sp, #13, mul vl\] ++** ldr z21, \[sp, #14, mul vl\] ++** ldr z22, \[sp, #15, mul vl\] ++** ldr z23, \[sp, #16, mul vl\] ++** ldr p4, \[sp\] ++** ldr p5, \[sp, #1, mul vl\] ++** ldr p6, \[sp, #2, mul vl\] ++** ldr p7, \[sp, #3, mul vl\] ++** ldr p8, \[sp, #4, mul vl\] ++** ldr p9, \[sp, #5, mul vl\] ++** ldr p10, \[sp, #6, mul vl\] ++** ldr p11, \[sp, #7, mul vl\] ++** addvl sp, sp, #17 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++void calls_standard (__SVInt8_t x) { standard_callee (); } ++ ++/* ++** calls_vpcs: ++** stp x29, x30, \[sp, -16\]! 
++** mov x29, sp ++** addvl sp, sp, #-17 ++** str p4, \[sp\] ++** str p5, \[sp, #1, mul vl\] ++** str p6, \[sp, #2, mul vl\] ++** str p7, \[sp, #3, mul vl\] ++** str p8, \[sp, #4, mul vl\] ++** str p9, \[sp, #5, mul vl\] ++** str p10, \[sp, #6, mul vl\] ++** str p11, \[sp, #7, mul vl\] ++** str z8, \[sp, #1, mul vl\] ++** str z9, \[sp, #2, mul vl\] ++** str z10, \[sp, #3, mul vl\] ++** str z11, \[sp, #4, mul vl\] ++** str z12, \[sp, #5, mul vl\] ++** str z13, \[sp, #6, mul vl\] ++** str z14, \[sp, #7, mul vl\] ++** str z15, \[sp, #8, mul vl\] ++** str z16, \[sp, #9, mul vl\] ++** str z17, \[sp, #10, mul vl\] ++** str z18, \[sp, #11, mul vl\] ++** str z19, \[sp, #12, mul vl\] ++** str z20, \[sp, #13, mul vl\] ++** str z21, \[sp, #14, mul vl\] ++** str z22, \[sp, #15, mul vl\] ++** str z23, \[sp, #16, mul vl\] ++** bl vpcs_callee ++** ldr z8, \[sp, #1, mul vl\] ++** ldr z9, \[sp, #2, mul vl\] ++** ldr z10, \[sp, #3, mul vl\] ++** ldr z11, \[sp, #4, mul vl\] ++** ldr z12, \[sp, #5, mul vl\] ++** ldr z13, \[sp, #6, mul vl\] ++** ldr z14, \[sp, #7, mul vl\] ++** ldr z15, \[sp, #8, mul vl\] ++** ldr z16, \[sp, #9, mul vl\] ++** ldr z17, \[sp, #10, mul vl\] ++** ldr z18, \[sp, #11, mul vl\] ++** ldr z19, \[sp, #12, mul vl\] ++** ldr z20, \[sp, #13, mul vl\] ++** ldr z21, \[sp, #14, mul vl\] ++** ldr z22, \[sp, #15, mul vl\] ++** ldr z23, \[sp, #16, mul vl\] ++** ldr p4, \[sp\] ++** ldr p5, \[sp, #1, mul vl\] ++** ldr p6, \[sp, #2, mul vl\] ++** ldr p7, \[sp, #3, mul vl\] ++** ldr p8, \[sp, #4, mul vl\] ++** ldr p9, \[sp, #5, mul vl\] ++** ldr p10, \[sp, #6, mul vl\] ++** ldr p11, \[sp, #7, mul vl\] ++** addvl sp, sp, #17 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++void calls_vpcs (__SVInt8_t x) { vpcs_callee (); } ++ ++/* ++** calls_standard_ptr: ++** stp x29, x30, \[sp, -16\]! 
++** mov x29, sp ++** addvl sp, sp, #-17 ++** str p4, \[sp\] ++** str p5, \[sp, #1, mul vl\] ++** str p6, \[sp, #2, mul vl\] ++** str p7, \[sp, #3, mul vl\] ++** str p8, \[sp, #4, mul vl\] ++** str p9, \[sp, #5, mul vl\] ++** str p10, \[sp, #6, mul vl\] ++** str p11, \[sp, #7, mul vl\] ++** str z8, \[sp, #1, mul vl\] ++** str z9, \[sp, #2, mul vl\] ++** str z10, \[sp, #3, mul vl\] ++** str z11, \[sp, #4, mul vl\] ++** str z12, \[sp, #5, mul vl\] ++** str z13, \[sp, #6, mul vl\] ++** str z14, \[sp, #7, mul vl\] ++** str z15, \[sp, #8, mul vl\] ++** str z16, \[sp, #9, mul vl\] ++** str z17, \[sp, #10, mul vl\] ++** str z18, \[sp, #11, mul vl\] ++** str z19, \[sp, #12, mul vl\] ++** str z20, \[sp, #13, mul vl\] ++** str z21, \[sp, #14, mul vl\] ++** str z22, \[sp, #15, mul vl\] ++** str z23, \[sp, #16, mul vl\] ++** blr x0 ++** ldr z8, \[sp, #1, mul vl\] ++** ldr z9, \[sp, #2, mul vl\] ++** ldr z10, \[sp, #3, mul vl\] ++** ldr z11, \[sp, #4, mul vl\] ++** ldr z12, \[sp, #5, mul vl\] ++** ldr z13, \[sp, #6, mul vl\] ++** ldr z14, \[sp, #7, mul vl\] ++** ldr z15, \[sp, #8, mul vl\] ++** ldr z16, \[sp, #9, mul vl\] ++** ldr z17, \[sp, #10, mul vl\] ++** ldr z18, \[sp, #11, mul vl\] ++** ldr z19, \[sp, #12, mul vl\] ++** ldr z20, \[sp, #13, mul vl\] ++** ldr z21, \[sp, #14, mul vl\] ++** ldr z22, \[sp, #15, mul vl\] ++** ldr z23, \[sp, #16, mul vl\] ++** ldr p4, \[sp\] ++** ldr p5, \[sp, #1, mul vl\] ++** ldr p6, \[sp, #2, mul vl\] ++** ldr p7, \[sp, #3, mul vl\] ++** ldr p8, \[sp, #4, mul vl\] ++** ldr p9, \[sp, #5, mul vl\] ++** ldr p10, \[sp, #6, mul vl\] ++** ldr p11, \[sp, #7, mul vl\] ++** addvl sp, sp, #17 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++void ++calls_standard_ptr (__SVInt8_t x, void (*fn) (void)) ++{ ++ fn (); ++} ++ ++/* ++** calls_vpcs_ptr: ++** stp x29, x30, \[sp, -16\]! 
++** mov x29, sp
++** addvl sp, sp, #-17
++** str p4, \[sp\]
++** str p5, \[sp, #1, mul vl\]
++** str p6, \[sp, #2, mul vl\]
++** str p7, \[sp, #3, mul vl\]
++** str p8, \[sp, #4, mul vl\]
++** str p9, \[sp, #5, mul vl\]
++** str p10, \[sp, #6, mul vl\]
++** str p11, \[sp, #7, mul vl\]
++** str z8, \[sp, #1, mul vl\]
++** str z9, \[sp, #2, mul vl\]
++** str z10, \[sp, #3, mul vl\]
++** str z11, \[sp, #4, mul vl\]
++** str z12, \[sp, #5, mul vl\]
++** str z13, \[sp, #6, mul vl\]
++** str z14, \[sp, #7, mul vl\]
++** str z15, \[sp, #8, mul vl\]
++** str z16, \[sp, #9, mul vl\]
++** str z17, \[sp, #10, mul vl\]
++** str z18, \[sp, #11, mul vl\]
++** str z19, \[sp, #12, mul vl\]
++** str z20, \[sp, #13, mul vl\]
++** str z21, \[sp, #14, mul vl\]
++** str z22, \[sp, #15, mul vl\]
++** str z23, \[sp, #16, mul vl\]
++** blr x0
++** ldr z8, \[sp, #1, mul vl\]
++** ldr z9, \[sp, #2, mul vl\]
++** ldr z10, \[sp, #3, mul vl\]
++** ldr z11, \[sp, #4, mul vl\]
++** ldr z12, \[sp, #5, mul vl\]
++** ldr z13, \[sp, #6, mul vl\]
++** ldr z14, \[sp, #7, mul vl\]
++** ldr z15, \[sp, #8, mul vl\]
++** ldr z16, \[sp, #9, mul vl\]
++** ldr z17, \[sp, #10, mul vl\]
++** ldr z18, \[sp, #11, mul vl\]
++** ldr z19, \[sp, #12, mul vl\]
++** ldr z20, \[sp, #13, mul vl\]
++** ldr z21, \[sp, #14, mul vl\]
++** ldr z22, \[sp, #15, mul vl\]
++** ldr z23, \[sp, #16, mul vl\]
++** ldr p4, \[sp\]
++** ldr p5, \[sp, #1, mul vl\]
++** ldr p6, \[sp, #2, mul vl\]
++** ldr p7, \[sp, #3, mul vl\]
++** ldr p8, \[sp, #4, mul vl\]
++** ldr p9, \[sp, #5, mul vl\]
++** ldr p10, \[sp, #6, mul vl\]
++** ldr p11, \[sp, #7, mul vl\]
++** addvl sp, sp, #17
++** ldp x29, x30, \[sp\], 16
++** ret
++*/
++void
++calls_vpcs_ptr (__SVInt8_t x,
++ void (*__attribute__((aarch64_vector_pcs)) fn) (void))
++{
++ fn ();
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_3.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_3.c
+new file mode 100644
+index 000000000..1fe86b0ea
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_3.c
+@@ -0,0 +1,92 @@
++/* { dg-do compile } */
++/* { dg-options "-O -g" } */
++/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */
++
++#include <arm_sve.h>
++
++int sve_callee (svint8_t);
++
++/*
++** standard_caller:
++** stp x29, x30, \[sp, -16\]!
++** mov x29, sp
++** mov z0\.b, #1
++** bl sve_callee
++** add w0, w0, #?1
++** ldp x29, x30, \[sp\], 16
++** ret
++*/
++int standard_caller (void) { return sve_callee (svdup_s8 (1)) + 1; }
++
++/*
++** vpcs_caller:
++** stp x29, x30, \[sp, -16\]!
++** mov x29, sp
++** mov z0\.b, #1
++** bl sve_callee
++** add w0, w0, #?1
++** ldp x29, x30, \[sp\], 16
++** ret
++*/
++__attribute__((aarch64_vector_pcs))
++int vpcs_caller (void) { return sve_callee (svdup_s8 (1)) + 1; }
++
++/*
++** sve_caller:
++** stp x29, x30, \[sp, -16\]!
++** mov x29, sp
++** mov z0\.b, #1
++** bl sve_callee
++** add w0, w0, #?1
++** ldp x29, x30, \[sp\], 16
++** ret
++*/
++int sve_caller (svbool_t p0) { return sve_callee (svdup_s8 (1)) + 1; }
++
++/*
++** standard_caller_ptr:
++** stp x29, x30, \[sp, -16\]!
++** mov x29, sp
++** mov z0\.h, #1
++** blr x0
++** add w0, w0, #?1
++** ldp x29, x30, \[sp\], 16
++** ret
++*/
++int
++standard_caller_ptr (int (*fn) (__SVInt16_t))
++{
++ return fn (svdup_s16 (1)) + 1;
++}
++
++/*
++** vpcs_caller_ptr:
++** stp x29, x30, \[sp, -16\]!
++** mov x29, sp ++** mov z0\.h, #1 ++** blr x0 ++** add w0, w0, #?1 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++int __attribute__((aarch64_vector_pcs)) ++vpcs_caller_ptr (int (*fn) (__SVInt16_t)) ++{ ++ return fn (svdup_s16 (1)) + 1; ++} ++ ++/* ++** sve_caller_ptr: ++** stp x29, x30, \[sp, -16\]! ++** mov x29, sp ++** mov z0\.h, #1 ++** blr x0 ++** add w0, w0, #?1 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++int ++sve_caller_ptr (svbool_t pg, int (*fn) (svint16_t)) ++{ ++ return fn (svdup_s16 (1)) + 1; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_4_be.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_4_be.c +new file mode 100644 +index 000000000..c42699dc7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_4_be.c +@@ -0,0 +1,84 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ ++ ++void standard_callee (__SVInt8_t *); ++ ++/* ++** calls_standard: ++** addvl sp, sp, #-1 ++** ( ++** stp x29, x30, \[sp, -16\]! ++** | ++** sub sp, sp, #?16 ++** stp x29, x30, \[sp\] ++** ) ++** mov x29, sp ++** addvl sp, sp, #-17 ++** str p4, \[sp\] ++** str p5, \[sp, #1, mul vl\] ++** str p6, \[sp, #2, mul vl\] ++** str p7, \[sp, #3, mul vl\] ++** str p8, \[sp, #4, mul vl\] ++** str p9, \[sp, #5, mul vl\] ++** str p10, \[sp, #6, mul vl\] ++** str p11, \[sp, #7, mul vl\] ++** ptrue p0\.b, all ++** st1d z8\.d, p0, \[sp, #1, mul vl\] ++** st1d z9\.d, p0, \[sp, #2, mul vl\] ++** st1d z10\.d, p0, \[sp, #3, mul vl\] ++** st1d z11\.d, p0, \[sp, #4, mul vl\] ++** st1d z12\.d, p0, \[sp, #5, mul vl\] ++** st1d z13\.d, p0, \[sp, #6, mul vl\] ++** st1d z14\.d, p0, \[sp, #7, mul vl\] ++** addvl x11, sp, #16 ++** st1d z15\.d, p0, \[x11, #-8, mul vl\] ++** str z16, \[sp, #9, mul vl\] ++** str z17, \[sp, #10, mul vl\] ++** str z18, \[sp, #11, mul vl\] ++** str z19, \[sp, #12, mul vl\] ++** str z20, \[sp, #13, mul vl\] ++** str z21, \[sp, #14, mul vl\] ++** str z22, \[sp, #15, mul vl\] ++** str z23, \[sp, #16, mul vl\] ++** addvl x0, sp, #17 ++** add x0, x0, #?16 ++** bl standard_callee ++** ptrue p0\.b, all ++** ld1d z8\.d, p0/z, \[sp, #1, mul vl\] ++** ld1d z9\.d, p0/z, \[sp, #2, mul vl\] ++** ld1d z10\.d, p0/z, \[sp, #3, mul vl\] ++** ld1d z11\.d, p0/z, \[sp, #4, mul vl\] ++** ld1d z12\.d, p0/z, \[sp, #5, mul vl\] ++** ld1d z13\.d, p0/z, \[sp, #6, mul vl\] ++** ld1d z14\.d, p0/z, \[sp, #7, mul vl\] ++** addvl x11, sp, #16 ++** ld1d z15\.d, p0/z, \[x11, #-8, mul vl\] ++** ldr z16, \[sp, #9, mul vl\] ++** ldr z17, \[sp, #10, mul vl\] ++** ldr z18, \[sp, #11, mul vl\] ++** ldr z19, \[sp, #12, mul vl\] ++** ldr z20, \[sp, #13, mul vl\] ++** ldr z21, \[sp, #14, mul vl\] ++** ldr z22, \[sp, #15, mul vl\] ++** ldr z23, \[sp, #16, mul vl\] ++** ldr p4, \[sp\] ++** ldr p5, \[sp, #1, mul vl\] ++** ldr p6, \[sp, #2, mul vl\] ++** ldr p7, \[sp, #3, mul vl\] ++** ldr p8, \[sp, #4, mul vl\] ++** ldr p9, \[sp, #5, mul vl\] ++** ldr p10, \[sp, #6, mul vl\] ++** ldr p11, \[sp, #7, mul vl\] ++** addvl sp, sp, #17 ++** ( ++** ldp x29, x30, \[sp\], 16 ++** addvl sp, sp, #1 ++** | ++** ldp x29, x30, \[sp\] ++** addvl sp, sp, #1 ++** add sp, sp, #?16 ++** ) ++** ret ++*/ ++void calls_standard (__SVInt8_t x) { __SVInt8_t tmp; standard_callee (&tmp); } +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_4_le.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_4_le.c +new file mode 100644 +index 000000000..49fe96800 +--- /dev/null ++++ 
b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_4_le.c +@@ -0,0 +1,80 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ ++ ++void standard_callee (__SVInt8_t *); ++ ++/* ++** calls_standard: ++** addvl sp, sp, #-1 ++** ( ++** stp x29, x30, \[sp, -16\]! ++** | ++** sub sp, sp, #?16 ++** stp x29, x30, \[sp\] ++** ) ++** mov x29, sp ++** addvl sp, sp, #-17 ++** str p4, \[sp\] ++** str p5, \[sp, #1, mul vl\] ++** str p6, \[sp, #2, mul vl\] ++** str p7, \[sp, #3, mul vl\] ++** str p8, \[sp, #4, mul vl\] ++** str p9, \[sp, #5, mul vl\] ++** str p10, \[sp, #6, mul vl\] ++** str p11, \[sp, #7, mul vl\] ++** str z8, \[sp, #1, mul vl\] ++** str z9, \[sp, #2, mul vl\] ++** str z10, \[sp, #3, mul vl\] ++** str z11, \[sp, #4, mul vl\] ++** str z12, \[sp, #5, mul vl\] ++** str z13, \[sp, #6, mul vl\] ++** str z14, \[sp, #7, mul vl\] ++** str z15, \[sp, #8, mul vl\] ++** str z16, \[sp, #9, mul vl\] ++** str z17, \[sp, #10, mul vl\] ++** str z18, \[sp, #11, mul vl\] ++** str z19, \[sp, #12, mul vl\] ++** str z20, \[sp, #13, mul vl\] ++** str z21, \[sp, #14, mul vl\] ++** str z22, \[sp, #15, mul vl\] ++** str z23, \[sp, #16, mul vl\] ++** addvl x0, sp, #17 ++** add x0, x0, #?16 ++** bl standard_callee ++** ldr z8, \[sp, #1, mul vl\] ++** ldr z9, \[sp, #2, mul vl\] ++** ldr z10, \[sp, #3, mul vl\] ++** ldr z11, \[sp, #4, mul vl\] ++** ldr z12, \[sp, #5, mul vl\] ++** ldr z13, \[sp, #6, mul vl\] ++** ldr z14, \[sp, #7, mul vl\] ++** ldr z15, \[sp, #8, mul vl\] ++** ldr z16, \[sp, #9, mul vl\] ++** ldr z17, \[sp, #10, mul vl\] ++** ldr z18, \[sp, #11, mul vl\] ++** ldr z19, \[sp, #12, mul vl\] ++** ldr z20, \[sp, #13, mul vl\] ++** ldr z21, \[sp, #14, mul vl\] ++** ldr z22, \[sp, #15, mul vl\] ++** ldr z23, \[sp, #16, mul vl\] ++** ldr p4, \[sp\] ++** ldr p5, \[sp, #1, mul vl\] ++** ldr p6, \[sp, #2, mul vl\] ++** ldr p7, \[sp, #3, mul vl\] ++** ldr p8, \[sp, #4, mul vl\] ++** ldr p9, \[sp, #5, mul vl\] ++** ldr p10, \[sp, #6, mul vl\] ++** ldr p11, \[sp, #7, mul vl\] ++** addvl sp, sp, #17 ++** ( ++** ldp x29, x30, \[sp\], 16 ++** addvl sp, sp, #1 ++** | ++** ldp x29, x30, \[sp\] ++** addvl sp, sp, #1 ++** add sp, sp, #?16 ++** ) ++** ret ++*/ ++void calls_standard (__SVInt8_t x) { __SVInt8_t tmp; standard_callee (&tmp); } +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_5_be.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_5_be.c +new file mode 100644 +index 000000000..dc3282eee +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_5_be.c +@@ -0,0 +1,78 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++void standard_callee (void); ++ ++/* ++** calls_standard: ++** stp x29, x30, \[sp, -16\]! 
++** mov x29, sp ++** addvl sp, sp, #-17 ++** ptrue p0\.b, all ++** st1d z8\.d, p0, \[sp, #1, mul vl\] ++** st1d z9\.d, p0, \[sp, #2, mul vl\] ++** st1d z10\.d, p0, \[sp, #3, mul vl\] ++** st1d z11\.d, p0, \[sp, #4, mul vl\] ++** st1d z12\.d, p0, \[sp, #5, mul vl\] ++** st1d z13\.d, p0, \[sp, #6, mul vl\] ++** st1d z14\.d, p0, \[sp, #7, mul vl\] ++** addvl x11, sp, #16 ++** st1d z15\.d, p0, \[x11, #-8, mul vl\] ++** cbnz w0, \.L[0-9]+ ++** ptrue p0\.b, all ++** ld1d z8\.d, p0/z, \[sp, #1, mul vl\] ++** ld1d z9\.d, p0/z, \[sp, #2, mul vl\] ++** ld1d z10\.d, p0/z, \[sp, #3, mul vl\] ++** ld1d z11\.d, p0/z, \[sp, #4, mul vl\] ++** ld1d z12\.d, p0/z, \[sp, #5, mul vl\] ++** ld1d z13\.d, p0/z, \[sp, #6, mul vl\] ++** ld1d z14\.d, p0/z, \[sp, #7, mul vl\] ++** addvl x11, sp, #16 ++** ld1d z15\.d, p0/z, \[x11, #-8, mul vl\] ++** addvl sp, sp, #17 ++** ldp x29, x30, \[sp\], 16 ++** ret ++** ... ++** str z16, \[sp, #9, mul vl\] ++** str z17, \[sp, #10, mul vl\] ++** str z18, \[sp, #11, mul vl\] ++** str z19, \[sp, #12, mul vl\] ++** str z20, \[sp, #13, mul vl\] ++** str z21, \[sp, #14, mul vl\] ++** str z22, \[sp, #15, mul vl\] ++** str z23, \[sp, #16, mul vl\] ++** str p4, \[sp\] ++** str p5, \[sp, #1, mul vl\] ++** str p6, \[sp, #2, mul vl\] ++** str p7, \[sp, #3, mul vl\] ++** str p8, \[sp, #4, mul vl\] ++** str p9, \[sp, #5, mul vl\] ++** str p10, \[sp, #6, mul vl\] ++** str p11, \[sp, #7, mul vl\] ++** bl standard_callee ++** ldr z16, \[sp, #9, mul vl\] ++** ldr z17, \[sp, #10, mul vl\] ++** ldr z18, \[sp, #11, mul vl\] ++** ldr z19, \[sp, #12, mul vl\] ++** ldr z20, \[sp, #13, mul vl\] ++** ldr z21, \[sp, #14, mul vl\] ++** ldr z22, \[sp, #15, mul vl\] ++** ldr z23, \[sp, #16, mul vl\] ++** ldr p4, \[sp\] ++** ldr p5, \[sp, #1, mul vl\] ++** ldr p6, \[sp, #2, mul vl\] ++** ldr p7, \[sp, #3, mul vl\] ++** ldr p8, \[sp, #4, mul vl\] ++** ldr p9, \[sp, #5, mul vl\] ++** ldr p10, \[sp, #6, mul vl\] ++** ldr p11, \[sp, #7, mul vl\] ++** b \.L[0-9]+ ++*/ ++void ++calls_standard (__SVInt8_t x, int y) ++{ ++ asm volatile ("" ::: "z8"); ++ if (__builtin_expect (y, 0)) ++ standard_callee (); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_5_le.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_5_le.c +new file mode 100644 +index 000000000..0d29ff2fd +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_5_le.c +@@ -0,0 +1,74 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++void standard_callee (void); ++ ++/* ++** calls_standard: ++** stp x29, x30, \[sp, -16\]! ++** mov x29, sp ++** addvl sp, sp, #-17 ++** str z8, \[sp, #1, mul vl\] ++** cbnz w0, \.L[0-9]+ ++** ldr z8, \[sp, #1, mul vl\] ++** addvl sp, sp, #17 ++** ldp x29, x30, \[sp\], 16 ++** ret ++** ... 
++** str z9, \[sp, #2, mul vl\] ++** str z10, \[sp, #3, mul vl\] ++** str z11, \[sp, #4, mul vl\] ++** str z12, \[sp, #5, mul vl\] ++** str z13, \[sp, #6, mul vl\] ++** str z14, \[sp, #7, mul vl\] ++** str z15, \[sp, #8, mul vl\] ++** str z16, \[sp, #9, mul vl\] ++** str z17, \[sp, #10, mul vl\] ++** str z18, \[sp, #11, mul vl\] ++** str z19, \[sp, #12, mul vl\] ++** str z20, \[sp, #13, mul vl\] ++** str z21, \[sp, #14, mul vl\] ++** str z22, \[sp, #15, mul vl\] ++** str z23, \[sp, #16, mul vl\] ++** str p4, \[sp\] ++** str p5, \[sp, #1, mul vl\] ++** str p6, \[sp, #2, mul vl\] ++** str p7, \[sp, #3, mul vl\] ++** str p8, \[sp, #4, mul vl\] ++** str p9, \[sp, #5, mul vl\] ++** str p10, \[sp, #6, mul vl\] ++** str p11, \[sp, #7, mul vl\] ++** bl standard_callee ++** ldr z9, \[sp, #2, mul vl\] ++** ldr z10, \[sp, #3, mul vl\] ++** ldr z11, \[sp, #4, mul vl\] ++** ldr z12, \[sp, #5, mul vl\] ++** ldr z13, \[sp, #6, mul vl\] ++** ldr z14, \[sp, #7, mul vl\] ++** ldr z15, \[sp, #8, mul vl\] ++** ldr z16, \[sp, #9, mul vl\] ++** ldr z17, \[sp, #10, mul vl\] ++** ldr z18, \[sp, #11, mul vl\] ++** ldr z19, \[sp, #12, mul vl\] ++** ldr z20, \[sp, #13, mul vl\] ++** ldr z21, \[sp, #14, mul vl\] ++** ldr z22, \[sp, #15, mul vl\] ++** ldr z23, \[sp, #16, mul vl\] ++** ldr p4, \[sp\] ++** ldr p5, \[sp, #1, mul vl\] ++** ldr p6, \[sp, #2, mul vl\] ++** ldr p7, \[sp, #3, mul vl\] ++** ldr p8, \[sp, #4, mul vl\] ++** ldr p9, \[sp, #5, mul vl\] ++** ldr p10, \[sp, #6, mul vl\] ++** ldr p11, \[sp, #7, mul vl\] ++** b \.L[0-9]+ ++*/ ++void ++calls_standard (__SVInt8_t x, int y) ++{ ++ asm volatile ("" ::: "z8"); ++ if (__builtin_expect (y, 0)) ++ standard_callee (); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1.c +new file mode 100644 +index 000000000..485d01875 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1.c +@@ -0,0 +1,204 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -mlittle-endian -fshrink-wrap -fstack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** test_1: ++** cntb x12 ++** mov x13, #?17 ++** mul x12, x12, x13 ++** mov x11, sp ++** ... 
++** sub sp, sp, x12 ++** str p4, \[sp\] ++** str p5, \[sp, #1, mul vl\] ++** str p6, \[sp, #2, mul vl\] ++** str p7, \[sp, #3, mul vl\] ++** str p8, \[sp, #4, mul vl\] ++** str p9, \[sp, #5, mul vl\] ++** str p10, \[sp, #6, mul vl\] ++** str p11, \[sp, #7, mul vl\] ++** str z8, \[sp, #1, mul vl\] ++** str z9, \[sp, #2, mul vl\] ++** str z10, \[sp, #3, mul vl\] ++** str z11, \[sp, #4, mul vl\] ++** str z12, \[sp, #5, mul vl\] ++** str z13, \[sp, #6, mul vl\] ++** str z14, \[sp, #7, mul vl\] ++** str z15, \[sp, #8, mul vl\] ++** str z16, \[sp, #9, mul vl\] ++** str z17, \[sp, #10, mul vl\] ++** str z18, \[sp, #11, mul vl\] ++** str z19, \[sp, #12, mul vl\] ++** str z20, \[sp, #13, mul vl\] ++** str z21, \[sp, #14, mul vl\] ++** str z22, \[sp, #15, mul vl\] ++** str z23, \[sp, #16, mul vl\] ++** ptrue p0\.b, all ++** ldr z8, \[sp, #1, mul vl\] ++** ldr z9, \[sp, #2, mul vl\] ++** ldr z10, \[sp, #3, mul vl\] ++** ldr z11, \[sp, #4, mul vl\] ++** ldr z12, \[sp, #5, mul vl\] ++** ldr z13, \[sp, #6, mul vl\] ++** ldr z14, \[sp, #7, mul vl\] ++** ldr z15, \[sp, #8, mul vl\] ++** ldr z16, \[sp, #9, mul vl\] ++** ldr z17, \[sp, #10, mul vl\] ++** ldr z18, \[sp, #11, mul vl\] ++** ldr z19, \[sp, #12, mul vl\] ++** ldr z20, \[sp, #13, mul vl\] ++** ldr z21, \[sp, #14, mul vl\] ++** ldr z22, \[sp, #15, mul vl\] ++** ldr z23, \[sp, #16, mul vl\] ++** ldr p4, \[sp\] ++** ldr p5, \[sp, #1, mul vl\] ++** ldr p6, \[sp, #2, mul vl\] ++** ldr p7, \[sp, #3, mul vl\] ++** ldr p8, \[sp, #4, mul vl\] ++** ldr p9, \[sp, #5, mul vl\] ++** ldr p10, \[sp, #6, mul vl\] ++** ldr p11, \[sp, #7, mul vl\] ++** addvl sp, sp, #17 ++** ret ++*/ ++svbool_t ++test_1 (void) ++{ ++ asm volatile ("" ::: ++ "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", ++ "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", ++ "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", ++ "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", ++ "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", ++ "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_2: ++** ptrue p0\.b, all ++** ret ++*/ ++svbool_t ++test_2 (void) ++{ ++ asm volatile ("" ::: ++ "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", ++ "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", ++ "p0", "p1", "p2", "p3", "p12", "p13", "p14", "p15"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_3: ++** cntb x12, all, mul #6 ++** mov x11, sp ++** ... ++** sub sp, sp, x12 ++** str p5, \[sp\] ++** str p6, \[sp, #1, mul vl\] ++** str p11, \[sp, #2, mul vl\] ++** str z8, \[sp, #1, mul vl\] ++** str z13, \[sp, #2, mul vl\] ++** str z19, \[sp, #3, mul vl\] ++** str z20, \[sp, #4, mul vl\] ++** str z22, \[sp, #5, mul vl\] ++** ptrue p0\.b, all ++** ldr z8, \[sp, #1, mul vl\] ++** ldr z13, \[sp, #2, mul vl\] ++** ldr z19, \[sp, #3, mul vl\] ++** ldr z20, \[sp, #4, mul vl\] ++** ldr z22, \[sp, #5, mul vl\] ++** ldr p5, \[sp\] ++** ldr p6, \[sp, #1, mul vl\] ++** ldr p11, \[sp, #2, mul vl\] ++** addvl sp, sp, #6 ++** ret ++*/ ++svbool_t ++test_3 (void) ++{ ++ asm volatile ("" ::: ++ "z8", "z13", "z19", "z20", "z22", ++ "p5", "p6", "p11"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_4: ++** cntb x12 ++** mov x11, sp ++** ... ++** sub sp, sp, x12 ++** str p4, \[sp\] ++** ptrue p0\.b, all ++** ldr p4, \[sp\] ++** addvl sp, sp, #1 ++** ret ++*/ ++svbool_t ++test_4 (void) ++{ ++ asm volatile ("" ::: "p4"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_5: ++** cntb x12 ++** mov x11, sp ++** ... 
++** sub sp, sp, x12 ++** str z15, \[sp\] ++** ptrue p0\.b, all ++** ldr z15, \[sp\] ++** addvl sp, sp, #1 ++** ret ++*/ ++svbool_t ++test_5 (void) ++{ ++ asm volatile ("" ::: "z15"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_6: ++** cntb x12 ++** mov x11, sp ++** ... ++** sub sp, sp, x12 ++** str z15, \[sp\] ++** mov z0\.b, #1 ++** ldr z15, \[sp\] ++** addvl sp, sp, #1 ++** ret ++*/ ++svint8_t ++test_6 (svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) ++{ ++ asm volatile ("" :: "Upa" (p0), "Upa" (p1), "Upa" (p2), "Upa" (p3) : "z15"); ++ return svdup_s8 (1); ++} ++ ++/* ++** test_7: ++** cntb x12 ++** mov x11, sp ++** ... ++** sub sp, sp, x12 ++** str z16, \[sp\] ++** ptrue p0\.b, all ++** ldr z16, \[sp\] ++** addvl sp, sp, #1 ++** ret ++*/ ++svbool_t ++test_7 (void) ++{ ++ asm volatile ("" ::: "z16"); ++ return svptrue_b8 (); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1_1024.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1_1024.c +new file mode 100644 +index 000000000..087e8db9e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1_1024.c +@@ -0,0 +1,184 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -mlittle-endian -fshrink-wrap -fstack-clash-protection -msve-vector-bits=1024 -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** test_1: ++** sub sp, sp, #2176 ++** str p4, \[sp\] ++** str p5, \[sp, #1, mul vl\] ++** str p6, \[sp, #2, mul vl\] ++** str p7, \[sp, #3, mul vl\] ++** str p8, \[sp, #4, mul vl\] ++** str p9, \[sp, #5, mul vl\] ++** str p10, \[sp, #6, mul vl\] ++** str p11, \[sp, #7, mul vl\] ++** str z8, \[sp, #1, mul vl\] ++** str z9, \[sp, #2, mul vl\] ++** str z10, \[sp, #3, mul vl\] ++** str z11, \[sp, #4, mul vl\] ++** str z12, \[sp, #5, mul vl\] ++** str z13, \[sp, #6, mul vl\] ++** str z14, \[sp, #7, mul vl\] ++** str z15, \[sp, #8, mul vl\] ++** str z16, \[sp, #9, mul vl\] ++** str z17, \[sp, #10, mul vl\] ++** str z18, \[sp, #11, mul vl\] ++** str z19, \[sp, #12, mul vl\] ++** str z20, \[sp, #13, mul vl\] ++** str z21, \[sp, #14, mul vl\] ++** str z22, \[sp, #15, mul vl\] ++** str z23, \[sp, #16, mul vl\] ++** ptrue p0\.b, vl128 ++** ldr z8, \[sp, #1, mul vl\] ++** ldr z9, \[sp, #2, mul vl\] ++** ldr z10, \[sp, #3, mul vl\] ++** ldr z11, \[sp, #4, mul vl\] ++** ldr z12, \[sp, #5, mul vl\] ++** ldr z13, \[sp, #6, mul vl\] ++** ldr z14, \[sp, #7, mul vl\] ++** ldr z15, \[sp, #8, mul vl\] ++** ldr z16, \[sp, #9, mul vl\] ++** ldr z17, \[sp, #10, mul vl\] ++** ldr z18, \[sp, #11, mul vl\] ++** ldr z19, \[sp, #12, mul vl\] ++** ldr z20, \[sp, #13, mul vl\] ++** ldr z21, \[sp, #14, mul vl\] ++** ldr z22, \[sp, #15, mul vl\] ++** ldr z23, \[sp, #16, mul vl\] ++** ldr p4, \[sp\] ++** ldr p5, \[sp, #1, mul vl\] ++** ldr p6, \[sp, #2, mul vl\] ++** ldr p7, \[sp, #3, mul vl\] ++** ldr p8, \[sp, #4, mul vl\] ++** ldr p9, \[sp, #5, mul vl\] ++** ldr p10, \[sp, #6, mul vl\] ++** ldr p11, \[sp, #7, mul vl\] ++** add sp, sp, #?2176 ++** ret ++*/ ++svbool_t ++test_1 (void) ++{ ++ asm volatile ("" ::: ++ "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", ++ "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", ++ "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", ++ "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", ++ "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", ++ "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_2: ++** ptrue p0\.b, vl128 ++** ret ++*/ ++svbool_t ++test_2 (void) ++{ ++ asm 
volatile ("" ::: ++ "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", ++ "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", ++ "p0", "p1", "p2", "p3", "p12", "p13", "p14", "p15"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_3: ++** sub sp, sp, #768 ++** str p5, \[sp\] ++** str p6, \[sp, #1, mul vl\] ++** str p11, \[sp, #2, mul vl\] ++** str z8, \[sp, #1, mul vl\] ++** str z13, \[sp, #2, mul vl\] ++** str z19, \[sp, #3, mul vl\] ++** str z20, \[sp, #4, mul vl\] ++** str z22, \[sp, #5, mul vl\] ++** ptrue p0\.b, vl128 ++** ldr z8, \[sp, #1, mul vl\] ++** ldr z13, \[sp, #2, mul vl\] ++** ldr z19, \[sp, #3, mul vl\] ++** ldr z20, \[sp, #4, mul vl\] ++** ldr z22, \[sp, #5, mul vl\] ++** ldr p5, \[sp\] ++** ldr p6, \[sp, #1, mul vl\] ++** ldr p11, \[sp, #2, mul vl\] ++** add sp, sp, #?768 ++** ret ++*/ ++svbool_t ++test_3 (void) ++{ ++ asm volatile ("" ::: ++ "z8", "z13", "z19", "z20", "z22", ++ "p5", "p6", "p11"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_4: ++** sub sp, sp, #128 ++** str p4, \[sp\] ++** ptrue p0\.b, vl128 ++** ldr p4, \[sp\] ++** add sp, sp, #?128 ++** ret ++*/ ++svbool_t ++test_4 (void) ++{ ++ asm volatile ("" ::: "p4"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_5: ++** sub sp, sp, #128 ++** str z15, \[sp\] ++** ptrue p0\.b, vl128 ++** ldr z15, \[sp\] ++** add sp, sp, #?128 ++** ret ++*/ ++svbool_t ++test_5 (void) ++{ ++ asm volatile ("" ::: "z15"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_6: ++** sub sp, sp, #128 ++** str z15, \[sp\] ++** mov z0\.b, #1 ++** ldr z15, \[sp\] ++** add sp, sp, #?128 ++** ret ++*/ ++svint8_t ++test_6 (svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) ++{ ++ asm volatile ("" :: "Upa" (p0), "Upa" (p1), "Upa" (p2), "Upa" (p3) : "z15"); ++ return svdup_s8 (1); ++} ++ ++/* ++** test_7: ++** sub sp, sp, #128 ++** str z16, \[sp\] ++** ptrue p0\.b, vl128 ++** ldr z16, \[sp\] ++** add sp, sp, #?128 ++** ret ++*/ ++svbool_t ++test_7 (void) ++{ ++ asm volatile ("" ::: "z16"); ++ return svptrue_b8 (); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1_2048.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1_2048.c +new file mode 100644 +index 000000000..e8dc5d5e4 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1_2048.c +@@ -0,0 +1,185 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -mlittle-endian -fshrink-wrap -fstack-clash-protection -msve-vector-bits=2048 -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** test_1: ++** mov x12, #?4352 ++** sub sp, sp, x12 ++** str p4, \[sp\] ++** str p5, \[sp, #1, mul vl\] ++** str p6, \[sp, #2, mul vl\] ++** str p7, \[sp, #3, mul vl\] ++** str p8, \[sp, #4, mul vl\] ++** str p9, \[sp, #5, mul vl\] ++** str p10, \[sp, #6, mul vl\] ++** str p11, \[sp, #7, mul vl\] ++** str z8, \[sp, #1, mul vl\] ++** str z9, \[sp, #2, mul vl\] ++** str z10, \[sp, #3, mul vl\] ++** str z11, \[sp, #4, mul vl\] ++** str z12, \[sp, #5, mul vl\] ++** str z13, \[sp, #6, mul vl\] ++** str z14, \[sp, #7, mul vl\] ++** str z15, \[sp, #8, mul vl\] ++** str z16, \[sp, #9, mul vl\] ++** str z17, \[sp, #10, mul vl\] ++** str z18, \[sp, #11, mul vl\] ++** str z19, \[sp, #12, mul vl\] ++** str z20, \[sp, #13, mul vl\] ++** str z21, \[sp, #14, mul vl\] ++** str z22, \[sp, #15, mul vl\] ++** str z23, \[sp, #16, mul vl\] ++** ptrue p0\.b, vl256 ++** ldr z8, \[sp, #1, mul vl\] ++** ldr z9, \[sp, #2, mul vl\] ++** ldr z10, \[sp, #3, mul vl\] ++** ldr z11, \[sp, #4, mul vl\] ++** ldr z12, \[sp, #5, mul vl\] ++** ldr 
z13, \[sp, #6, mul vl\] ++** ldr z14, \[sp, #7, mul vl\] ++** ldr z15, \[sp, #8, mul vl\] ++** ldr z16, \[sp, #9, mul vl\] ++** ldr z17, \[sp, #10, mul vl\] ++** ldr z18, \[sp, #11, mul vl\] ++** ldr z19, \[sp, #12, mul vl\] ++** ldr z20, \[sp, #13, mul vl\] ++** ldr z21, \[sp, #14, mul vl\] ++** ldr z22, \[sp, #15, mul vl\] ++** ldr z23, \[sp, #16, mul vl\] ++** ldr p4, \[sp\] ++** ldr p5, \[sp, #1, mul vl\] ++** ldr p6, \[sp, #2, mul vl\] ++** ldr p7, \[sp, #3, mul vl\] ++** ldr p8, \[sp, #4, mul vl\] ++** ldr p9, \[sp, #5, mul vl\] ++** ldr p10, \[sp, #6, mul vl\] ++** ldr p11, \[sp, #7, mul vl\] ++** add sp, sp, x12 ++** ret ++*/ ++svbool_t ++test_1 (void) ++{ ++ asm volatile ("" ::: ++ "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", ++ "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", ++ "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", ++ "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", ++ "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", ++ "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_2: ++** ptrue p0\.b, vl256 ++** ret ++*/ ++svbool_t ++test_2 (void) ++{ ++ asm volatile ("" ::: ++ "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", ++ "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", ++ "p0", "p1", "p2", "p3", "p12", "p13", "p14", "p15"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_3: ++** sub sp, sp, #1536 ++** str p5, \[sp\] ++** str p6, \[sp, #1, mul vl\] ++** str p11, \[sp, #2, mul vl\] ++** str z8, \[sp, #1, mul vl\] ++** str z13, \[sp, #2, mul vl\] ++** str z19, \[sp, #3, mul vl\] ++** str z20, \[sp, #4, mul vl\] ++** str z22, \[sp, #5, mul vl\] ++** ptrue p0\.b, vl256 ++** ldr z8, \[sp, #1, mul vl\] ++** ldr z13, \[sp, #2, mul vl\] ++** ldr z19, \[sp, #3, mul vl\] ++** ldr z20, \[sp, #4, mul vl\] ++** ldr z22, \[sp, #5, mul vl\] ++** ldr p5, \[sp\] ++** ldr p6, \[sp, #1, mul vl\] ++** ldr p11, \[sp, #2, mul vl\] ++** add sp, sp, #?1536 ++** ret ++*/ ++svbool_t ++test_3 (void) ++{ ++ asm volatile ("" ::: ++ "z8", "z13", "z19", "z20", "z22", ++ "p5", "p6", "p11"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_4: ++** sub sp, sp, #256 ++** str p4, \[sp\] ++** ptrue p0\.b, vl256 ++** ldr p4, \[sp\] ++** add sp, sp, #?256 ++** ret ++*/ ++svbool_t ++test_4 (void) ++{ ++ asm volatile ("" ::: "p4"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_5: ++** sub sp, sp, #256 ++** str z15, \[sp\] ++** ptrue p0\.b, vl256 ++** ldr z15, \[sp\] ++** add sp, sp, #?256 ++** ret ++*/ ++svbool_t ++test_5 (void) ++{ ++ asm volatile ("" ::: "z15"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_6: ++** sub sp, sp, #256 ++** str z15, \[sp\] ++** mov z0\.b, #1 ++** ldr z15, \[sp\] ++** add sp, sp, #?256 ++** ret ++*/ ++svint8_t ++test_6 (svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) ++{ ++ asm volatile ("" :: "Upa" (p0), "Upa" (p1), "Upa" (p2), "Upa" (p3) : "z15"); ++ return svdup_s8 (1); ++} ++ ++/* ++** test_7: ++** sub sp, sp, #256 ++** str z16, \[sp\] ++** ptrue p0\.b, vl256 ++** ldr z16, \[sp\] ++** add sp, sp, #?256 ++** ret ++*/ ++svbool_t ++test_7 (void) ++{ ++ asm volatile ("" ::: "z16"); ++ return svptrue_b8 (); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1_256.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1_256.c +new file mode 100644 +index 000000000..73c49e4d4 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1_256.c +@@ -0,0 +1,184 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -mlittle-endian -fshrink-wrap -fstack-clash-protection 
-msve-vector-bits=256 -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** test_1: ++** sub sp, sp, #544 ++** str p4, \[sp\] ++** str p5, \[sp, #1, mul vl\] ++** str p6, \[sp, #2, mul vl\] ++** str p7, \[sp, #3, mul vl\] ++** str p8, \[sp, #4, mul vl\] ++** str p9, \[sp, #5, mul vl\] ++** str p10, \[sp, #6, mul vl\] ++** str p11, \[sp, #7, mul vl\] ++** str z8, \[sp, #1, mul vl\] ++** str z9, \[sp, #2, mul vl\] ++** str z10, \[sp, #3, mul vl\] ++** str z11, \[sp, #4, mul vl\] ++** str z12, \[sp, #5, mul vl\] ++** str z13, \[sp, #6, mul vl\] ++** str z14, \[sp, #7, mul vl\] ++** str z15, \[sp, #8, mul vl\] ++** str z16, \[sp, #9, mul vl\] ++** str z17, \[sp, #10, mul vl\] ++** str z18, \[sp, #11, mul vl\] ++** str z19, \[sp, #12, mul vl\] ++** str z20, \[sp, #13, mul vl\] ++** str z21, \[sp, #14, mul vl\] ++** str z22, \[sp, #15, mul vl\] ++** str z23, \[sp, #16, mul vl\] ++** ptrue p0\.b, vl32 ++** ldr z8, \[sp, #1, mul vl\] ++** ldr z9, \[sp, #2, mul vl\] ++** ldr z10, \[sp, #3, mul vl\] ++** ldr z11, \[sp, #4, mul vl\] ++** ldr z12, \[sp, #5, mul vl\] ++** ldr z13, \[sp, #6, mul vl\] ++** ldr z14, \[sp, #7, mul vl\] ++** ldr z15, \[sp, #8, mul vl\] ++** ldr z16, \[sp, #9, mul vl\] ++** ldr z17, \[sp, #10, mul vl\] ++** ldr z18, \[sp, #11, mul vl\] ++** ldr z19, \[sp, #12, mul vl\] ++** ldr z20, \[sp, #13, mul vl\] ++** ldr z21, \[sp, #14, mul vl\] ++** ldr z22, \[sp, #15, mul vl\] ++** ldr z23, \[sp, #16, mul vl\] ++** ldr p4, \[sp\] ++** ldr p5, \[sp, #1, mul vl\] ++** ldr p6, \[sp, #2, mul vl\] ++** ldr p7, \[sp, #3, mul vl\] ++** ldr p8, \[sp, #4, mul vl\] ++** ldr p9, \[sp, #5, mul vl\] ++** ldr p10, \[sp, #6, mul vl\] ++** ldr p11, \[sp, #7, mul vl\] ++** add sp, sp, #?544 ++** ret ++*/ ++svbool_t ++test_1 (void) ++{ ++ asm volatile ("" ::: ++ "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", ++ "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", ++ "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", ++ "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", ++ "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", ++ "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_2: ++** ptrue p0\.b, vl32 ++** ret ++*/ ++svbool_t ++test_2 (void) ++{ ++ asm volatile ("" ::: ++ "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", ++ "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", ++ "p0", "p1", "p2", "p3", "p12", "p13", "p14", "p15"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_3: ++** sub sp, sp, #192 ++** str p5, \[sp\] ++** str p6, \[sp, #1, mul vl\] ++** str p11, \[sp, #2, mul vl\] ++** str z8, \[sp, #1, mul vl\] ++** str z13, \[sp, #2, mul vl\] ++** str z19, \[sp, #3, mul vl\] ++** str z20, \[sp, #4, mul vl\] ++** str z22, \[sp, #5, mul vl\] ++** ptrue p0\.b, vl32 ++** ldr z8, \[sp, #1, mul vl\] ++** ldr z13, \[sp, #2, mul vl\] ++** ldr z19, \[sp, #3, mul vl\] ++** ldr z20, \[sp, #4, mul vl\] ++** ldr z22, \[sp, #5, mul vl\] ++** ldr p5, \[sp\] ++** ldr p6, \[sp, #1, mul vl\] ++** ldr p11, \[sp, #2, mul vl\] ++** add sp, sp, #?192 ++** ret ++*/ ++svbool_t ++test_3 (void) ++{ ++ asm volatile ("" ::: ++ "z8", "z13", "z19", "z20", "z22", ++ "p5", "p6", "p11"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_4: ++** sub sp, sp, #32 ++** str p4, \[sp\] ++** ptrue p0\.b, vl32 ++** ldr p4, \[sp\] ++** add sp, sp, #?32 ++** ret ++*/ ++svbool_t ++test_4 (void) ++{ ++ asm volatile ("" ::: "p4"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_5: ++** sub sp, sp, #32 ++** str z15, \[sp\] 
++** ptrue p0\.b, vl32 ++** ldr z15, \[sp\] ++** add sp, sp, #?32 ++** ret ++*/ ++svbool_t ++test_5 (void) ++{ ++ asm volatile ("" ::: "z15"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_6: ++** sub sp, sp, #32 ++** str z15, \[sp\] ++** mov z0\.b, #1 ++** ldr z15, \[sp\] ++** add sp, sp, #?32 ++** ret ++*/ ++svint8_t ++test_6 (svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) ++{ ++ asm volatile ("" :: "Upa" (p0), "Upa" (p1), "Upa" (p2), "Upa" (p3) : "z15"); ++ return svdup_s8 (1); ++} ++ ++/* ++** test_7: ++** sub sp, sp, #32 ++** str z16, \[sp\] ++** ptrue p0\.b, vl32 ++** ldr z16, \[sp\] ++** add sp, sp, #?32 ++** ret ++*/ ++svbool_t ++test_7 (void) ++{ ++ asm volatile ("" ::: "z16"); ++ return svptrue_b8 (); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1_512.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1_512.c +new file mode 100644 +index 000000000..d4b524147 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1_512.c +@@ -0,0 +1,184 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -mlittle-endian -fshrink-wrap -fstack-clash-protection -msve-vector-bits=512 -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** test_1: ++** sub sp, sp, #1088 ++** str p4, \[sp\] ++** str p5, \[sp, #1, mul vl\] ++** str p6, \[sp, #2, mul vl\] ++** str p7, \[sp, #3, mul vl\] ++** str p8, \[sp, #4, mul vl\] ++** str p9, \[sp, #5, mul vl\] ++** str p10, \[sp, #6, mul vl\] ++** str p11, \[sp, #7, mul vl\] ++** str z8, \[sp, #1, mul vl\] ++** str z9, \[sp, #2, mul vl\] ++** str z10, \[sp, #3, mul vl\] ++** str z11, \[sp, #4, mul vl\] ++** str z12, \[sp, #5, mul vl\] ++** str z13, \[sp, #6, mul vl\] ++** str z14, \[sp, #7, mul vl\] ++** str z15, \[sp, #8, mul vl\] ++** str z16, \[sp, #9, mul vl\] ++** str z17, \[sp, #10, mul vl\] ++** str z18, \[sp, #11, mul vl\] ++** str z19, \[sp, #12, mul vl\] ++** str z20, \[sp, #13, mul vl\] ++** str z21, \[sp, #14, mul vl\] ++** str z22, \[sp, #15, mul vl\] ++** str z23, \[sp, #16, mul vl\] ++** ptrue p0\.b, vl64 ++** ldr z8, \[sp, #1, mul vl\] ++** ldr z9, \[sp, #2, mul vl\] ++** ldr z10, \[sp, #3, mul vl\] ++** ldr z11, \[sp, #4, mul vl\] ++** ldr z12, \[sp, #5, mul vl\] ++** ldr z13, \[sp, #6, mul vl\] ++** ldr z14, \[sp, #7, mul vl\] ++** ldr z15, \[sp, #8, mul vl\] ++** ldr z16, \[sp, #9, mul vl\] ++** ldr z17, \[sp, #10, mul vl\] ++** ldr z18, \[sp, #11, mul vl\] ++** ldr z19, \[sp, #12, mul vl\] ++** ldr z20, \[sp, #13, mul vl\] ++** ldr z21, \[sp, #14, mul vl\] ++** ldr z22, \[sp, #15, mul vl\] ++** ldr z23, \[sp, #16, mul vl\] ++** ldr p4, \[sp\] ++** ldr p5, \[sp, #1, mul vl\] ++** ldr p6, \[sp, #2, mul vl\] ++** ldr p7, \[sp, #3, mul vl\] ++** ldr p8, \[sp, #4, mul vl\] ++** ldr p9, \[sp, #5, mul vl\] ++** ldr p10, \[sp, #6, mul vl\] ++** ldr p11, \[sp, #7, mul vl\] ++** add sp, sp, #?1088 ++** ret ++*/ ++svbool_t ++test_1 (void) ++{ ++ asm volatile ("" ::: ++ "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", ++ "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", ++ "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", ++ "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", ++ "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", ++ "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_2: ++** ptrue p0\.b, vl64 ++** ret ++*/ ++svbool_t ++test_2 (void) ++{ ++ asm volatile ("" ::: ++ "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", ++ "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", ++ 
"p0", "p1", "p2", "p3", "p12", "p13", "p14", "p15"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_3: ++** sub sp, sp, #384 ++** str p5, \[sp\] ++** str p6, \[sp, #1, mul vl\] ++** str p11, \[sp, #2, mul vl\] ++** str z8, \[sp, #1, mul vl\] ++** str z13, \[sp, #2, mul vl\] ++** str z19, \[sp, #3, mul vl\] ++** str z20, \[sp, #4, mul vl\] ++** str z22, \[sp, #5, mul vl\] ++** ptrue p0\.b, vl64 ++** ldr z8, \[sp, #1, mul vl\] ++** ldr z13, \[sp, #2, mul vl\] ++** ldr z19, \[sp, #3, mul vl\] ++** ldr z20, \[sp, #4, mul vl\] ++** ldr z22, \[sp, #5, mul vl\] ++** ldr p5, \[sp\] ++** ldr p6, \[sp, #1, mul vl\] ++** ldr p11, \[sp, #2, mul vl\] ++** add sp, sp, #?384 ++** ret ++*/ ++svbool_t ++test_3 (void) ++{ ++ asm volatile ("" ::: ++ "z8", "z13", "z19", "z20", "z22", ++ "p5", "p6", "p11"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_4: ++** sub sp, sp, #64 ++** str p4, \[sp\] ++** ptrue p0\.b, vl64 ++** ldr p4, \[sp\] ++** add sp, sp, #?64 ++** ret ++*/ ++svbool_t ++test_4 (void) ++{ ++ asm volatile ("" ::: "p4"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_5: ++** sub sp, sp, #64 ++** str z15, \[sp\] ++** ptrue p0\.b, vl64 ++** ldr z15, \[sp\] ++** add sp, sp, #?64 ++** ret ++*/ ++svbool_t ++test_5 (void) ++{ ++ asm volatile ("" ::: "z15"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_6: ++** sub sp, sp, #64 ++** str z15, \[sp\] ++** mov z0\.b, #1 ++** ldr z15, \[sp\] ++** add sp, sp, #?64 ++** ret ++*/ ++svint8_t ++test_6 (svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) ++{ ++ asm volatile ("" :: "Upa" (p0), "Upa" (p1), "Upa" (p2), "Upa" (p3) : "z15"); ++ return svdup_s8 (1); ++} ++ ++/* ++** test_7: ++** sub sp, sp, #64 ++** str z16, \[sp\] ++** ptrue p0\.b, vl64 ++** ldr z16, \[sp\] ++** add sp, sp, #?64 ++** ret ++*/ ++svbool_t ++test_7 (void) ++{ ++ asm volatile ("" ::: "z16"); ++ return svptrue_b8 (); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2.c +new file mode 100644 +index 000000000..4622a1eed +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2.c +@@ -0,0 +1,336 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -fshrink-wrap -fstack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++svbool_t take_stack_args (volatile void *, void *, int, int, int, ++ int, int, int, int); ++ ++/* ++** test_1: ++** cntb x12 ++** add x12, x12, #?16 ++** mov x11, sp ++** ... ++** sub sp, sp, x12 ++** str p4, \[sp\] ++** ... ++** ptrue p0\.b, all ++** ldr p4, \[sp\] ++** addvl sp, sp, #1 ++** add sp, sp, #?16 ++** ret ++*/ ++svbool_t ++test_1 (void) ++{ ++ volatile int x = 1; ++ asm volatile ("" ::: "p4"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_2: ++** stp x24, x25, \[sp, -48\]! ++** str x26, \[sp, 16\] ++** cntb x13 ++** mov x11, sp ++** ... ++** sub sp, sp, x13 ++** str p4, \[sp\] ++** ... ++** ptrue p0\.b, all ++** ldr p4, \[sp\] ++** addvl sp, sp, #1 ++** ldr x26, \[sp, 16\] ++** ldp x24, x25, \[sp\], 48 ++** ret ++*/ ++svbool_t ++test_2 (void) ++{ ++ volatile int x = 1; ++ asm volatile ("" ::: "p4", "x24", "x25", "x26"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_3: ++** cntb x12 ++** mov x13, #?4128 ++** add x12, x12, x13 ++** mov x11, sp ++** ... ++** sub sp, sp, x12 ++** addvl x11, sp, #1 ++** stp x24, x25, \[x11\] ++** str x26, \[x11, 16\] ++** str p4, \[sp\] ++** ... 
++** ptrue p0\.b, all ++** ldr p4, \[sp\] ++** addvl sp, sp, #1 ++** ldp x24, x25, \[sp\] ++** ldr x26, \[sp, 16\] ++** mov x12, #?4128 ++** add sp, sp, x12 ++** ret ++*/ ++svbool_t ++test_3 (void) ++{ ++ volatile int x[1024]; ++ asm volatile ("" :: "r" (x) : "p4", "x24", "x25", "x26"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_4: ++** cntb x12, all, mul #2 ++** mov x11, sp ++** ... ++** sub sp, sp, x12 ++** str p4, \[sp\] ++** ... ++** ptrue p0\.h, all ++** ldr p4, \[sp\] ++** addvl sp, sp, #2 ++** ret ++*/ ++svbool_t ++test_4 (void) ++{ ++ volatile svint32_t b; ++ b = svdup_s32 (1); ++ asm volatile ("" ::: "p4"); ++ return svptrue_b16 (); ++} ++ ++/* ++** test_5: ++** cntb x12, all, mul #2 ++** add x12, x12, #?32 ++** mov x11, sp ++** ... ++** sub sp, sp, x12 ++** addvl x11, sp, #1 ++** stp x24, x25, \[x11\] ++** str x26, \[x11, 16\] ++** str p4, \[sp\] ++** ... ++** ptrue p0\.h, all ++** ldr p4, \[sp\] ++** addvl sp, sp, #1 ++** ldp x24, x25, \[sp\] ++** ldr x26, \[sp, 16\] ++** addvl sp, sp, #1 ++** add sp, sp, #?32 ++** ret ++*/ ++svbool_t ++test_5 (void) ++{ ++ volatile svint32_t b; ++ b = svdup_s32 (1); ++ asm volatile ("" ::: "p4", "x24", "x25", "x26"); ++ return svptrue_b16 (); ++} ++ ++/* ++** test_6: ++** stp x29, x30, \[sp, -16\]! ++** mov x29, sp ++** cntb x13 ++** mov x11, sp ++** ... ++** sub sp, sp, x13 ++** str p4, \[sp\] ++** sub sp, sp, #?16 ++** ... ++** ptrue p0\.b, all ++** add sp, sp, #?16 ++** ldr p4, \[sp\] ++** addvl sp, sp, #1 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++svbool_t ++test_6 (void) ++{ ++ take_stack_args (0, 0, 1, 2, 3, 4, 5, 6, 7); ++ asm volatile ("" ::: "p4"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_7: ++** cntb x12 ++** mov x13, #?4112 ++** add x12, x12, x13 ++** mov x11, sp ++** ... ++** sub sp, sp, x12 ++** addvl x11, sp, #1 ++** stp x29, x30, \[x11\] ++** addvl x29, sp, #1 ++** str p4, \[sp\] ++** sub sp, sp, #?16 ++** ... ++** ptrue p0\.b, all ++** add sp, sp, #?16 ++** ldr p4, \[sp\] ++** addvl sp, sp, #1 ++** ldp x29, x30, \[sp\] ++** mov x12, #?4112 ++** add sp, sp, x12 ++** ret ++*/ ++svbool_t ++test_7 (void) ++{ ++ volatile int x[1024]; ++ take_stack_args (x, 0, 1, 2, 3, 4, 5, 6, 7); ++ asm volatile ("" ::: "p4"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_8: ++** cntb x12 ++** mov x13, #?4144 ++** add x12, x12, x13 ++** mov x11, sp ++** ... ++** sub sp, sp, x12 ++** addvl x11, sp, #1 ++** stp x29, x30, \[x11\] ++** addvl x29, sp, #1 ++** stp x24, x25, \[x29, 16\] ++** str x26, \[x29, 32\] ++** str p4, \[sp\] ++** sub sp, sp, #?16 ++** ... ++** ptrue p0\.b, all ++** add sp, sp, #?16 ++** ldr p4, \[sp\] ++** addvl sp, sp, #1 ++** ldp x24, x25, \[sp, 16\] ++** ldr x26, \[sp, 32\] ++** ldp x29, x30, \[sp\] ++** mov x12, #?4144 ++** add sp, sp, x12 ++** ret ++*/ ++svbool_t ++test_8 (void) ++{ ++ volatile int x[1024]; ++ take_stack_args (x, 0, 1, 2, 3, 4, 5, 6, 7); ++ asm volatile ("" ::: "p4", "x24", "x25", "x26"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_9: ++** cntb x12 ++** mov x13, #?4112 ++** add x12, x12, x13 ++** mov x11, sp ++** ... ++** sub sp, sp, x12 ++** addvl x11, sp, #1 ++** stp x29, x30, \[x11\] ++** addvl x29, sp, #1 ++** str p4, \[sp\] ++** sub sp, sp, #?16 ++** ... 
++** ptrue p0\.b, all ++** addvl sp, x29, #-1 ++** ldr p4, \[sp\] ++** addvl sp, sp, #1 ++** ldp x29, x30, \[sp\] ++** mov x12, #?4112 ++** add sp, sp, x12 ++** ret ++*/ ++svbool_t ++test_9 (int n) ++{ ++ volatile int x[1024]; ++ take_stack_args (x, __builtin_alloca (n), 1, 2, 3, 4, 5, 6, 7); ++ asm volatile ("" ::: "p4"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_10: ++** cntb x12 ++** mov x13, #?4144 ++** add x12, x12, x13 ++** mov x11, sp ++** ... ++** sub sp, sp, x12 ++** addvl x11, sp, #1 ++** stp x29, x30, \[x11\] ++** addvl x29, sp, #1 ++** stp x24, x25, \[x29, 16\] ++** str x26, \[x29, 32\] ++** str p4, \[sp\] ++** sub sp, sp, #?16 ++** ... ++** ptrue p0\.b, all ++** addvl sp, x29, #-1 ++** ldr p4, \[sp\] ++** addvl sp, sp, #1 ++** ldp x24, x25, \[sp, 16\] ++** ldr x26, \[sp, 32\] ++** ldp x29, x30, \[sp\] ++** mov x12, #?4144 ++** add sp, sp, x12 ++** ret ++*/ ++svbool_t ++test_10 (int n) ++{ ++ volatile int x[1024]; ++ take_stack_args (x, __builtin_alloca (n), 1, 2, 3, 4, 5, 6, 7); ++ asm volatile ("" ::: "p4", "x24", "x25", "x26"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_11: ++** cntb x12 ++** add x12, x12, #?3008 ++** add x12, x12, #?126976 ++** mov x11, sp ++** ... ++** sub sp, sp, x12 ++** addvl x11, sp, #1 ++** stp x29, x30, \[x11\] ++** addvl x29, sp, #1 ++** stp x24, x25, \[x29, 16\] ++** str x26, \[x29, 32\] ++** str p4, \[sp\] ++** sub sp, sp, #?16 ++** ... ++** ptrue p0\.b, all ++** addvl sp, x29, #-1 ++** ldr p4, \[sp\] ++** addvl sp, sp, #1 ++** ldp x24, x25, \[sp, 16\] ++** ldr x26, \[sp, 32\] ++** ldp x29, x30, \[sp\] ++** add sp, sp, #?3008 ++** add sp, sp, #?126976 ++** ret ++*/ ++svbool_t ++test_11 (int n) ++{ ++ volatile int x[0x7ee4]; ++ take_stack_args (x, __builtin_alloca (n), 1, 2, 3, 4, 5, 6, 7); ++ asm volatile ("" ::: "p4", "x24", "x25", "x26"); ++ return svptrue_b8 (); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2_1024.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2_1024.c +new file mode 100644 +index 000000000..d5a9d4444 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2_1024.c +@@ -0,0 +1,285 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -fshrink-wrap -fstack-clash-protection -msve-vector-bits=1024 -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++svbool_t take_stack_args (volatile void *, void *, int, int, int, ++ int, int, int, int); ++ ++/* ++** test_1: ++** sub sp, sp, #144 ++** str p4, \[sp\] ++** ... ++** ptrue p0\.b, vl128 ++** ldr p4, \[sp\] ++** add sp, sp, #?144 ++** ret ++*/ ++svbool_t ++test_1 (void) ++{ ++ volatile int x = 1; ++ asm volatile ("" ::: "p4"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_2: ++** sub sp, sp, #176 ++** stp x24, x25, \[sp, 128\] ++** str x26, \[sp, 144\] ++** str p4, \[sp\] ++** ... ++** ptrue p0\.b, vl128 ++** ldr p4, \[sp\] ++** ldp x24, x25, \[sp, 128\] ++** ldr x26, \[sp, 144\] ++** add sp, sp, #?176 ++** ret ++*/ ++svbool_t ++test_2 (void) ++{ ++ volatile int x = 1; ++ asm volatile ("" ::: "p4", "x24", "x25", "x26"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_3: ++** mov x12, #?4256 ++** sub sp, sp, x12 ++** stp x24, x25, \[sp, 128\] ++** str x26, \[sp, 144\] ++** str p4, \[sp\] ++** ... 
++** ptrue p0\.b, vl128 ++** ldr p4, \[sp\] ++** ldp x24, x25, \[sp, 128\] ++** ldr x26, \[sp, 144\] ++** add sp, sp, x12 ++** ret ++*/ ++svbool_t ++test_3 (void) ++{ ++ volatile int x[1024]; ++ asm volatile ("" :: "r" (x) : "p4", "x24", "x25", "x26"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_4: ++** sub sp, sp, #256 ++** str p4, \[sp\] ++** ... ++** ptrue p0\.h, vl64 ++** ldr p4, \[sp\] ++** add sp, sp, #?256 ++** ret ++*/ ++svbool_t ++test_4 (void) ++{ ++ volatile svint32_t b; ++ b = svdup_s32 (1); ++ asm volatile ("" ::: "p4"); ++ return svptrue_b16 (); ++} ++ ++/* ++** test_5: ++** sub sp, sp, #288 ++** stp x24, x25, \[sp, 128\] ++** str x26, \[sp, 144\] ++** str p4, \[sp\] ++** ... ++** ptrue p0\.h, vl64 ++** ldr p4, \[sp\] ++** ldp x24, x25, \[sp, 128\] ++** ldr x26, \[sp, 144\] ++** add sp, sp, #?288 ++** ret ++*/ ++svbool_t ++test_5 (void) ++{ ++ volatile svint32_t b; ++ b = svdup_s32 (1); ++ asm volatile ("" ::: "p4", "x24", "x25", "x26"); ++ return svptrue_b16 (); ++} ++ ++/* ++** test_6: ++** stp x29, x30, \[sp, -16\]! ++** mov x29, sp ++** sub sp, sp, #128 ++** str p4, \[sp\] ++** ... ++** ptrue p0\.b, vl128 ++** add sp, sp, #?16 ++** ldr p4, \[sp\] ++** add sp, sp, #?128 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++svbool_t ++test_6 (void) ++{ ++ take_stack_args (0, 0, 1, 2, 3, 4, 5, 6, 7); ++ asm volatile ("" ::: "p4"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_7: ++** mov x12, #?4240 ++** sub sp, sp, x12 ++** stp x29, x30, \[sp, 128\] ++** add x29, sp, #?128 ++** str p4, \[sp\] ++** sub sp, sp, #16 ++** ... ++** ptrue p0\.b, vl128 ++** add sp, sp, #?16 ++** ldr p4, \[sp\] ++** add sp, sp, #?128 ++** ldp x29, x30, \[sp\] ++** mov x12, #?4112 ++** add sp, sp, x12 ++** ret ++*/ ++svbool_t ++test_7 (void) ++{ ++ volatile int x[1024]; ++ take_stack_args (x, 0, 1, 2, 3, 4, 5, 6, 7); ++ asm volatile ("" ::: "p4"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_8: ++** mov x12, #?4272 ++** sub sp, sp, x12 ++** stp x29, x30, \[sp, 128\] ++** add x29, sp, #?128 ++** stp x24, x25, \[sp, 144\] ++** str x26, \[sp, 160\] ++** str p4, \[sp\] ++** sub sp, sp, #16 ++** ... ++** ptrue p0\.b, vl128 ++** add sp, sp, #?16 ++** ldr p4, \[sp\] ++** add sp, sp, #?128 ++** ldp x24, x25, \[sp, 16\] ++** ldr x26, \[sp, 32\] ++** ldp x29, x30, \[sp\] ++** mov x12, #?4144 ++** add sp, sp, x12 ++** ret ++*/ ++svbool_t ++test_8 (void) ++{ ++ volatile int x[1024]; ++ take_stack_args (x, 0, 1, 2, 3, 4, 5, 6, 7); ++ asm volatile ("" ::: "p4", "x24", "x25", "x26"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_9: ++** mov x12, #?4240 ++** sub sp, sp, x12 ++** stp x29, x30, \[sp, 128\] ++** add x29, sp, #?128 ++** str p4, \[sp\] ++** sub sp, sp, #16 ++** ... ++** ptrue p0\.b, vl128 ++** sub sp, x29, #128 ++** ldr p4, \[sp\] ++** add sp, sp, #?128 ++** ldp x29, x30, \[sp\] ++** mov x12, #?4112 ++** add sp, sp, x12 ++** ret ++*/ ++svbool_t ++test_9 (int n) ++{ ++ volatile int x[1024]; ++ take_stack_args (x, __builtin_alloca (n), 1, 2, 3, 4, 5, 6, 7); ++ asm volatile ("" ::: "p4"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_10: ++** mov x12, #?4272 ++** sub sp, sp, x12 ++** stp x29, x30, \[sp, 128\] ++** add x29, sp, #?128 ++** stp x24, x25, \[sp, 144\] ++** str x26, \[sp, 160\] ++** str p4, \[sp\] ++** sub sp, sp, #16 ++** ... 
++** ptrue p0\.b, vl128 ++** sub sp, x29, #128 ++** ldr p4, \[sp\] ++** add sp, sp, #?128 ++** ldp x24, x25, \[sp, 16\] ++** ldr x26, \[sp, 32\] ++** ldp x29, x30, \[sp\] ++** mov x12, #?4144 ++** add sp, sp, x12 ++** ret ++*/ ++svbool_t ++test_10 (int n) ++{ ++ volatile int x[1024]; ++ take_stack_args (x, __builtin_alloca (n), 1, 2, 3, 4, 5, 6, 7); ++ asm volatile ("" ::: "p4", "x24", "x25", "x26"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_11: ++** sub sp, sp, #65536 ++** str xzr, \[sp, 1024\] ++** mov x12, #?64576 ++** sub sp, sp, x12 ++** str xzr, \[sp, 1024\] ++** stp x29, x30, \[sp, 128\] ++** add x29, sp, #?128 ++** stp x24, x25, \[sp, 144\] ++** str x26, \[sp, 160\] ++** str p4, \[sp\] ++** sub sp, sp, #16 ++** ... ++** ptrue p0\.b, vl128 ++** sub sp, x29, #128 ++** ldr p4, \[sp\] ++** add sp, sp, #?128 ++** ldp x24, x25, \[sp, 16\] ++** ldr x26, \[sp, 32\] ++** ldp x29, x30, \[sp\] ++** add sp, sp, #?3008 ++** add sp, sp, #?126976 ++** ret ++*/ ++svbool_t ++test_11 (int n) ++{ ++ volatile int x[0x7ee4]; ++ take_stack_args (x, __builtin_alloca (n), 1, 2, 3, 4, 5, 6, 7); ++ asm volatile ("" ::: "p4", "x24", "x25", "x26"); ++ return svptrue_b8 (); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2_2048.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2_2048.c +new file mode 100644 +index 000000000..c185e2e36 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2_2048.c +@@ -0,0 +1,285 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -fshrink-wrap -fstack-clash-protection -msve-vector-bits=2048 -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++svbool_t take_stack_args (volatile void *, void *, int, int, int, ++ int, int, int, int); ++ ++/* ++** test_1: ++** sub sp, sp, #272 ++** str p4, \[sp\] ++** ... ++** ptrue p0\.b, vl256 ++** ldr p4, \[sp\] ++** add sp, sp, #?272 ++** ret ++*/ ++svbool_t ++test_1 (void) ++{ ++ volatile int x = 1; ++ asm volatile ("" ::: "p4"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_2: ++** sub sp, sp, #304 ++** stp x24, x25, \[sp, 256\] ++** str x26, \[sp, 272\] ++** str p4, \[sp\] ++** ... ++** ptrue p0\.b, vl256 ++** ldr p4, \[sp\] ++** ldp x24, x25, \[sp, 256\] ++** ldr x26, \[sp, 272\] ++** add sp, sp, #?304 ++** ret ++*/ ++svbool_t ++test_2 (void) ++{ ++ volatile int x = 1; ++ asm volatile ("" ::: "p4", "x24", "x25", "x26"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_3: ++** mov x12, #?4384 ++** sub sp, sp, x12 ++** stp x24, x25, \[sp, 256\] ++** str x26, \[sp, 272\] ++** str p4, \[sp\] ++** ... ++** ptrue p0\.b, vl256 ++** ldr p4, \[sp\] ++** ldp x24, x25, \[sp, 256\] ++** ldr x26, \[sp, 272\] ++** add sp, sp, x12 ++** ret ++*/ ++svbool_t ++test_3 (void) ++{ ++ volatile int x[1024]; ++ asm volatile ("" :: "r" (x) : "p4", "x24", "x25", "x26"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_4: ++** sub sp, sp, #512 ++** str p4, \[sp\] ++** ... ++** ptrue p0\.h, vl128 ++** ldr p4, \[sp\] ++** add sp, sp, #?512 ++** ret ++*/ ++svbool_t ++test_4 (void) ++{ ++ volatile svint32_t b; ++ b = svdup_s32 (1); ++ asm volatile ("" ::: "p4"); ++ return svptrue_b16 (); ++} ++ ++/* ++** test_5: ++** sub sp, sp, #544 ++** stp x24, x25, \[sp, 256\] ++** str x26, \[sp, 272\] ++** str p4, \[sp\] ++** ... 
++** ptrue p0\.h, vl128 ++** ldr p4, \[sp\] ++** ldp x24, x25, \[sp, 256\] ++** ldr x26, \[sp, 272\] ++** add sp, sp, #?544 ++** ret ++*/ ++svbool_t ++test_5 (void) ++{ ++ volatile svint32_t b; ++ b = svdup_s32 (1); ++ asm volatile ("" ::: "p4", "x24", "x25", "x26"); ++ return svptrue_b16 (); ++} ++ ++/* ++** test_6: ++** stp x29, x30, \[sp, -16\]! ++** mov x29, sp ++** sub sp, sp, #256 ++** str p4, \[sp\] ++** ... ++** ptrue p0\.b, vl256 ++** add sp, sp, #?16 ++** ldr p4, \[sp\] ++** add sp, sp, #?256 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++svbool_t ++test_6 (void) ++{ ++ take_stack_args (0, 0, 1, 2, 3, 4, 5, 6, 7); ++ asm volatile ("" ::: "p4"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_7: ++** mov x12, #?4368 ++** sub sp, sp, x12 ++** stp x29, x30, \[sp, 256\] ++** add x29, sp, #?256 ++** str p4, \[sp\] ++** sub sp, sp, #16 ++** ... ++** ptrue p0\.b, vl256 ++** add sp, sp, #?16 ++** ldr p4, \[sp\] ++** add sp, sp, #?256 ++** ldp x29, x30, \[sp\] ++** mov x12, #?4112 ++** add sp, sp, x12 ++** ret ++*/ ++svbool_t ++test_7 (void) ++{ ++ volatile int x[1024]; ++ take_stack_args (x, 0, 1, 2, 3, 4, 5, 6, 7); ++ asm volatile ("" ::: "p4"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_8: ++** mov x12, #?4400 ++** sub sp, sp, x12 ++** stp x29, x30, \[sp, 256\] ++** add x29, sp, #?256 ++** stp x24, x25, \[sp, 272\] ++** str x26, \[sp, 288\] ++** str p4, \[sp\] ++** sub sp, sp, #16 ++** ... ++** ptrue p0\.b, vl256 ++** add sp, sp, #?16 ++** ldr p4, \[sp\] ++** add sp, sp, #?256 ++** ldp x24, x25, \[sp, 16\] ++** ldr x26, \[sp, 32\] ++** ldp x29, x30, \[sp\] ++** mov x12, #?4144 ++** add sp, sp, x12 ++** ret ++*/ ++svbool_t ++test_8 (void) ++{ ++ volatile int x[1024]; ++ take_stack_args (x, 0, 1, 2, 3, 4, 5, 6, 7); ++ asm volatile ("" ::: "p4", "x24", "x25", "x26"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_9: ++** mov x12, #?4368 ++** sub sp, sp, x12 ++** stp x29, x30, \[sp, 256\] ++** add x29, sp, #?256 ++** str p4, \[sp\] ++** sub sp, sp, #16 ++** ... ++** ptrue p0\.b, vl256 ++** sub sp, x29, #256 ++** ldr p4, \[sp\] ++** add sp, sp, #?256 ++** ldp x29, x30, \[sp\] ++** mov x12, #?4112 ++** add sp, sp, x12 ++** ret ++*/ ++svbool_t ++test_9 (int n) ++{ ++ volatile int x[1024]; ++ take_stack_args (x, __builtin_alloca (n), 1, 2, 3, 4, 5, 6, 7); ++ asm volatile ("" ::: "p4"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_10: ++** mov x12, #?4400 ++** sub sp, sp, x12 ++** stp x29, x30, \[sp, 256\] ++** add x29, sp, #?256 ++** stp x24, x25, \[sp, 272\] ++** str x26, \[sp, 288\] ++** str p4, \[sp\] ++** sub sp, sp, #16 ++** ... ++** ptrue p0\.b, vl256 ++** sub sp, x29, #256 ++** ldr p4, \[sp\] ++** add sp, sp, #?256 ++** ldp x24, x25, \[sp, 16\] ++** ldr x26, \[sp, 32\] ++** ldp x29, x30, \[sp\] ++** mov x12, #?4144 ++** add sp, sp, x12 ++** ret ++*/ ++svbool_t ++test_10 (int n) ++{ ++ volatile int x[1024]; ++ take_stack_args (x, __builtin_alloca (n), 1, 2, 3, 4, 5, 6, 7); ++ asm volatile ("" ::: "p4", "x24", "x25", "x26"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_11: ++** sub sp, sp, #65536 ++** str xzr, \[sp, 1024\] ++** mov x12, #?64704 ++** sub sp, sp, x12 ++** str xzr, \[sp, 1024\] ++** stp x29, x30, \[sp, 256\] ++** add x29, sp, #?256 ++** stp x24, x25, \[sp, 272\] ++** str x26, \[sp, 288\] ++** str p4, \[sp\] ++** sub sp, sp, #16 ++** ... 
++** ptrue p0\.b, vl256 ++** sub sp, x29, #256 ++** ldr p4, \[sp\] ++** add sp, sp, #?256 ++** ldp x24, x25, \[sp, 16\] ++** ldr x26, \[sp, 32\] ++** ldp x29, x30, \[sp\] ++** add sp, sp, #?3008 ++** add sp, sp, #?126976 ++** ret ++*/ ++svbool_t ++test_11 (int n) ++{ ++ volatile int x[0x7ee4]; ++ take_stack_args (x, __builtin_alloca (n), 1, 2, 3, 4, 5, 6, 7); ++ asm volatile ("" ::: "p4", "x24", "x25", "x26"); ++ return svptrue_b8 (); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2_256.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2_256.c +new file mode 100644 +index 000000000..f8318b354 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2_256.c +@@ -0,0 +1,284 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -fshrink-wrap -fstack-clash-protection -msve-vector-bits=256 -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++svbool_t take_stack_args (volatile void *, void *, int, int, int, ++ int, int, int, int); ++ ++/* ++** test_1: ++** sub sp, sp, #48 ++** str p4, \[sp\] ++** ... ++** ptrue p0\.b, vl32 ++** ldr p4, \[sp\] ++** add sp, sp, #?48 ++** ret ++*/ ++svbool_t ++test_1 (void) ++{ ++ volatile int x = 1; ++ asm volatile ("" ::: "p4"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_2: ++** sub sp, sp, #80 ++** stp x24, x25, \[sp, 32\] ++** str x26, \[sp, 48\] ++** str p4, \[sp\] ++** ... ++** ptrue p0\.b, vl32 ++** ldr p4, \[sp\] ++** ldp x24, x25, \[sp, 32\] ++** ldr x26, \[sp, 48\] ++** add sp, sp, #?80 ++** ret ++*/ ++svbool_t ++test_2 (void) ++{ ++ volatile int x = 1; ++ asm volatile ("" ::: "p4", "x24", "x25", "x26"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_3: ++** mov x12, #?4160 ++** sub sp, sp, x12 ++** stp x24, x25, \[sp, 32\] ++** str x26, \[sp, 48\] ++** str p4, \[sp\] ++** ... ++** ptrue p0\.b, vl32 ++** ldr p4, \[sp\] ++** ldp x24, x25, \[sp, 32\] ++** ldr x26, \[sp, 48\] ++** add sp, sp, x12 ++** ret ++*/ ++svbool_t ++test_3 (void) ++{ ++ volatile int x[1024]; ++ asm volatile ("" :: "r" (x) : "p4", "x24", "x25", "x26"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_4: ++** sub sp, sp, #64 ++** str p4, \[sp\] ++** ... ++** ptrue p0\.h, vl16 ++** ldr p4, \[sp\] ++** add sp, sp, #?64 ++** ret ++*/ ++svbool_t ++test_4 (void) ++{ ++ volatile svint32_t b; ++ b = svdup_s32 (1); ++ asm volatile ("" ::: "p4"); ++ return svptrue_b16 (); ++} ++ ++/* ++** test_5: ++** sub sp, sp, #96 ++** stp x24, x25, \[sp, 32\] ++** str x26, \[sp, 48\] ++** str p4, \[sp\] ++** ... ++** ptrue p0\.h, vl16 ++** ldr p4, \[sp\] ++** ldp x24, x25, \[sp, 32\] ++** ldr x26, \[sp, 48\] ++** add sp, sp, #?96 ++** ret ++*/ ++svbool_t ++test_5 (void) ++{ ++ volatile svint32_t b; ++ b = svdup_s32 (1); ++ asm volatile ("" ::: "p4", "x24", "x25", "x26"); ++ return svptrue_b16 (); ++} ++ ++/* ++** test_6: ++** stp x29, x30, \[sp, -16\]! ++** mov x29, sp ++** sub sp, sp, #32 ++** str p4, \[sp\] ++** ... ++** ptrue p0\.b, vl32 ++** add sp, sp, #?16 ++** ldr p4, \[sp\] ++** add sp, sp, #?32 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++svbool_t ++test_6 (void) ++{ ++ take_stack_args (0, 0, 1, 2, 3, 4, 5, 6, 7); ++ asm volatile ("" ::: "p4"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_7: ++** mov x12, #?4144 ++** sub sp, sp, x12 ++** stp x29, x30, \[sp, 32\] ++** add x29, sp, #?32 ++** str p4, \[sp\] ++** sub sp, sp, #16 ++** ... 
++** ptrue p0\.b, vl32 ++** add sp, sp, #?16 ++** ldr p4, \[sp\] ++** add sp, sp, #?32 ++** ldp x29, x30, \[sp\] ++** mov x12, #?4112 ++** add sp, sp, x12 ++** ret ++*/ ++svbool_t ++test_7 (void) ++{ ++ volatile int x[1024]; ++ take_stack_args (x, 0, 1, 2, 3, 4, 5, 6, 7); ++ asm volatile ("" ::: "p4"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_8: ++** mov x12, #?4176 ++** sub sp, sp, x12 ++** stp x29, x30, \[sp, 32\] ++** add x29, sp, #?32 ++** stp x24, x25, \[sp, 48\] ++** str x26, \[sp, 64\] ++** str p4, \[sp\] ++** sub sp, sp, #16 ++** ... ++** ptrue p0\.b, vl32 ++** add sp, sp, #?16 ++** ldr p4, \[sp\] ++** add sp, sp, #?32 ++** ldp x24, x25, \[sp, 16\] ++** ldr x26, \[sp, 32\] ++** ldp x29, x30, \[sp\] ++** mov x12, #?4144 ++** add sp, sp, x12 ++** ret ++*/ ++svbool_t ++test_8 (void) ++{ ++ volatile int x[1024]; ++ take_stack_args (x, 0, 1, 2, 3, 4, 5, 6, 7); ++ asm volatile ("" ::: "p4", "x24", "x25", "x26"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_9: ++** mov x12, #?4144 ++** sub sp, sp, x12 ++** stp x29, x30, \[sp, 32\] ++** add x29, sp, #?32 ++** str p4, \[sp\] ++** sub sp, sp, #16 ++** ... ++** ptrue p0\.b, vl32 ++** sub sp, x29, #32 ++** ldr p4, \[sp\] ++** add sp, sp, #?32 ++** ldp x29, x30, \[sp\] ++** mov x12, #?4112 ++** add sp, sp, x12 ++** ret ++*/ ++svbool_t ++test_9 (int n) ++{ ++ volatile int x[1024]; ++ take_stack_args (x, __builtin_alloca (n), 1, 2, 3, 4, 5, 6, 7); ++ asm volatile ("" ::: "p4"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_10: ++** mov x12, #?4176 ++** sub sp, sp, x12 ++** stp x29, x30, \[sp, 32\] ++** add x29, sp, #?32 ++** stp x24, x25, \[sp, 48\] ++** str x26, \[sp, 64\] ++** str p4, \[sp\] ++** sub sp, sp, #16 ++** ... ++** ptrue p0\.b, vl32 ++** sub sp, x29, #32 ++** ldr p4, \[sp\] ++** add sp, sp, #?32 ++** ldp x24, x25, \[sp, 16\] ++** ldr x26, \[sp, 32\] ++** ldp x29, x30, \[sp\] ++** mov x12, #?4144 ++** add sp, sp, x12 ++** ret ++*/ ++svbool_t ++test_10 (int n) ++{ ++ volatile int x[1024]; ++ take_stack_args (x, __builtin_alloca (n), 1, 2, 3, 4, 5, 6, 7); ++ asm volatile ("" ::: "p4", "x24", "x25", "x26"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_11: ++** sub sp, sp, #65536 ++** str xzr, \[sp, 1024\] ++** mov x12, #?64480 ++** sub sp, sp, x12 ++** stp x29, x30, \[sp, 32\] ++** add x29, sp, #?32 ++** stp x24, x25, \[sp, 48\] ++** str x26, \[sp, 64\] ++** str p4, \[sp\] ++** sub sp, sp, #16 ++** ... ++** ptrue p0\.b, vl32 ++** sub sp, x29, #32 ++** ldr p4, \[sp\] ++** add sp, sp, #?32 ++** ldp x24, x25, \[sp, 16\] ++** ldr x26, \[sp, 32\] ++** ldp x29, x30, \[sp\] ++** add sp, sp, #?3008 ++** add sp, sp, #?126976 ++** ret ++*/ ++svbool_t ++test_11 (int n) ++{ ++ volatile int x[0x7ee4]; ++ take_stack_args (x, __builtin_alloca (n), 1, 2, 3, 4, 5, 6, 7); ++ asm volatile ("" ::: "p4", "x24", "x25", "x26"); ++ return svptrue_b8 (); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2_512.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2_512.c +new file mode 100644 +index 000000000..45a23ad49 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2_512.c +@@ -0,0 +1,285 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -fshrink-wrap -fstack-clash-protection -msve-vector-bits=512 -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++svbool_t take_stack_args (volatile void *, void *, int, int, int, ++ int, int, int, int); ++ ++/* ++** test_1: ++** sub sp, sp, #80 ++** str p4, \[sp\] ++** ... 
++** ptrue p0\.b, vl64 ++** ldr p4, \[sp\] ++** add sp, sp, #?80 ++** ret ++*/ ++svbool_t ++test_1 (void) ++{ ++ volatile int x = 1; ++ asm volatile ("" ::: "p4"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_2: ++** sub sp, sp, #112 ++** stp x24, x25, \[sp, 64\] ++** str x26, \[sp, 80\] ++** str p4, \[sp\] ++** ... ++** ptrue p0\.b, vl64 ++** ldr p4, \[sp\] ++** ldp x24, x25, \[sp, 64\] ++** ldr x26, \[sp, 80\] ++** add sp, sp, #?112 ++** ret ++*/ ++svbool_t ++test_2 (void) ++{ ++ volatile int x = 1; ++ asm volatile ("" ::: "p4", "x24", "x25", "x26"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_3: ++** mov x12, #?4192 ++** sub sp, sp, x12 ++** stp x24, x25, \[sp, 64\] ++** str x26, \[sp, 80\] ++** str p4, \[sp\] ++** ... ++** ptrue p0\.b, vl64 ++** ldr p4, \[sp\] ++** ldp x24, x25, \[sp, 64\] ++** ldr x26, \[sp, 80\] ++** add sp, sp, x12 ++** ret ++*/ ++svbool_t ++test_3 (void) ++{ ++ volatile int x[1024]; ++ asm volatile ("" :: "r" (x) : "p4", "x24", "x25", "x26"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_4: ++** sub sp, sp, #128 ++** str p4, \[sp\] ++** ... ++** ptrue p0\.h, vl32 ++** ldr p4, \[sp\] ++** add sp, sp, #?128 ++** ret ++*/ ++svbool_t ++test_4 (void) ++{ ++ volatile svint32_t b; ++ b = svdup_s32 (1); ++ asm volatile ("" ::: "p4"); ++ return svptrue_b16 (); ++} ++ ++/* ++** test_5: ++** sub sp, sp, #160 ++** stp x24, x25, \[sp, 64\] ++** str x26, \[sp, 80\] ++** str p4, \[sp\] ++** ... ++** ptrue p0\.h, vl32 ++** ldr p4, \[sp\] ++** ldp x24, x25, \[sp, 64\] ++** ldr x26, \[sp, 80\] ++** add sp, sp, #?160 ++** ret ++*/ ++svbool_t ++test_5 (void) ++{ ++ volatile svint32_t b; ++ b = svdup_s32 (1); ++ asm volatile ("" ::: "p4", "x24", "x25", "x26"); ++ return svptrue_b16 (); ++} ++ ++/* ++** test_6: ++** stp x29, x30, \[sp, -16\]! ++** mov x29, sp ++** sub sp, sp, #64 ++** str p4, \[sp\] ++** ... ++** ptrue p0\.b, vl64 ++** add sp, sp, #?16 ++** ldr p4, \[sp\] ++** add sp, sp, #?64 ++** ldp x29, x30, \[sp\], 16 ++** ret ++*/ ++svbool_t ++test_6 (void) ++{ ++ take_stack_args (0, 0, 1, 2, 3, 4, 5, 6, 7); ++ asm volatile ("" ::: "p4"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_7: ++** mov x12, #?4176 ++** sub sp, sp, x12 ++** stp x29, x30, \[sp, 64\] ++** add x29, sp, #?64 ++** str p4, \[sp\] ++** sub sp, sp, #16 ++** ... ++** ptrue p0\.b, vl64 ++** add sp, sp, #?16 ++** ldr p4, \[sp\] ++** add sp, sp, #?64 ++** ldp x29, x30, \[sp\] ++** mov x12, #?4112 ++** add sp, sp, x12 ++** ret ++*/ ++svbool_t ++test_7 (void) ++{ ++ volatile int x[1024]; ++ take_stack_args (x, 0, 1, 2, 3, 4, 5, 6, 7); ++ asm volatile ("" ::: "p4"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_8: ++** mov x12, #?4208 ++** sub sp, sp, x12 ++** stp x29, x30, \[sp, 64\] ++** add x29, sp, #?64 ++** stp x24, x25, \[sp, 80\] ++** str x26, \[sp, 96\] ++** str p4, \[sp\] ++** sub sp, sp, #16 ++** ... ++** ptrue p0\.b, vl64 ++** add sp, sp, #?16 ++** ldr p4, \[sp\] ++** add sp, sp, #?64 ++** ldp x24, x25, \[sp, 16\] ++** ldr x26, \[sp, 32\] ++** ldp x29, x30, \[sp\] ++** mov x12, #?4144 ++** add sp, sp, x12 ++** ret ++*/ ++svbool_t ++test_8 (void) ++{ ++ volatile int x[1024]; ++ take_stack_args (x, 0, 1, 2, 3, 4, 5, 6, 7); ++ asm volatile ("" ::: "p4", "x24", "x25", "x26"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_9: ++** mov x12, #?4176 ++** sub sp, sp, x12 ++** stp x29, x30, \[sp, 64\] ++** add x29, sp, #?64 ++** str p4, \[sp\] ++** sub sp, sp, #16 ++** ... 
++** ptrue p0\.b, vl64 ++** sub sp, x29, #64 ++** ldr p4, \[sp\] ++** add sp, sp, #?64 ++** ldp x29, x30, \[sp\] ++** mov x12, #?4112 ++** add sp, sp, x12 ++** ret ++*/ ++svbool_t ++test_9 (int n) ++{ ++ volatile int x[1024]; ++ take_stack_args (x, __builtin_alloca (n), 1, 2, 3, 4, 5, 6, 7); ++ asm volatile ("" ::: "p4"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_10: ++** mov x12, #?4208 ++** sub sp, sp, x12 ++** stp x29, x30, \[sp, 64\] ++** add x29, sp, #?64 ++** stp x24, x25, \[sp, 80\] ++** str x26, \[sp, 96\] ++** str p4, \[sp\] ++** sub sp, sp, #16 ++** ... ++** ptrue p0\.b, vl64 ++** sub sp, x29, #64 ++** ldr p4, \[sp\] ++** add sp, sp, #?64 ++** ldp x24, x25, \[sp, 16\] ++** ldr x26, \[sp, 32\] ++** ldp x29, x30, \[sp\] ++** mov x12, #?4144 ++** add sp, sp, x12 ++** ret ++*/ ++svbool_t ++test_10 (int n) ++{ ++ volatile int x[1024]; ++ take_stack_args (x, __builtin_alloca (n), 1, 2, 3, 4, 5, 6, 7); ++ asm volatile ("" ::: "p4", "x24", "x25", "x26"); ++ return svptrue_b8 (); ++} ++ ++/* ++** test_11: ++** sub sp, sp, #65536 ++** str xzr, \[sp, 1024\] ++** mov x12, #?64512 ++** sub sp, sp, x12 ++** str xzr, \[sp, 1024\] ++** stp x29, x30, \[sp, 64\] ++** add x29, sp, #?64 ++** stp x24, x25, \[sp, 80\] ++** str x26, \[sp, 96\] ++** str p4, \[sp\] ++** sub sp, sp, #16 ++** ... ++** ptrue p0\.b, vl64 ++** sub sp, x29, #64 ++** ldr p4, \[sp\] ++** add sp, sp, #?64 ++** ldp x24, x25, \[sp, 16\] ++** ldr x26, \[sp, 32\] ++** ldp x29, x30, \[sp\] ++** add sp, sp, #?3008 ++** add sp, sp, #?126976 ++** ret ++*/ ++svbool_t ++test_11 (int n) ++{ ++ volatile int x[0x7ee4]; ++ take_stack_args (x, __builtin_alloca (n), 1, 2, 3, 4, 5, 6, 7); ++ asm volatile ("" ::: "p4", "x24", "x25", "x26"); ++ return svptrue_b8 (); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c +new file mode 100644 +index 000000000..3e01ec36c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c +@@ -0,0 +1,63 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O -fshrink-wrap -fstack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++#pragma GCC aarch64 "arm_sve.h" ++ ++/* ++** test_1: ++** str x24, \[sp, -32\]! ++** cntb x13 ++** mov x11, sp ++** ... ++** sub sp, sp, x13 ++** str p4, \[sp\] ++** cbz w0, [^\n]* ++** ... ++** ptrue p0\.b, all ++** ldr p4, \[sp\] ++** addvl sp, sp, #1 ++** ldr x24, \[sp\], 32 ++** ret ++*/ ++svbool_t ++test_1 (int n) ++{ ++ asm volatile ("" ::: "x24"); ++ if (n) ++ { ++ volatile int x = 1; ++ asm volatile ("" ::: "p4"); ++ } ++ return svptrue_b8 (); ++} ++ ++/* ++** test_2: ++** str x24, \[sp, -32\]! ++** cntb x13 ++** mov x11, sp ++** ... ++** sub sp, sp, x13 ++** str p4, \[sp\] ++** cbz w0, [^\n]* ++** str p5, \[sp, #1, mul vl\] ++** str p6, \[sp, #2, mul vl\] ++** ... 
++** ptrue p0\.b, all ++** ldr p4, \[sp\] ++** addvl sp, sp, #1 ++** ldr x24, \[sp\], 32 ++** ret ++*/ ++svbool_t ++test_2 (int n) ++{ ++ asm volatile ("" ::: "x24"); ++ if (n) ++ { ++ volatile int x = 1; ++ asm volatile ("" ::: "p4", "p5", "p6"); ++ } ++ return svptrue_b8 (); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/unprototyped_1.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/unprototyped_1.c +new file mode 100644 +index 000000000..5c7ed5167 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/unprototyped_1.c +@@ -0,0 +1,11 @@ ++/* { dg-do compile } */ ++ ++#include ++ ++void unprototyped (); ++ ++void ++f (svuint8_t *ptr) ++{ ++ unprototyped (*ptr); /* { dg-error {SVE type '(svuint8_t|__SVUint8_t)' cannot be passed to an unprototyped function} } */ ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_1.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_1.c +new file mode 100644 +index 000000000..6987245a6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_1.c +@@ -0,0 +1,170 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ ++ ++#include ++#include ++ ++/* ++** callee_0: ++** ... ++** ldr (p[0-7]), \[x1\] ++** ... ++** cntp x0, \1, \1\.b ++** ... ++** ret ++*/ ++uint64_t __attribute__((noipa)) ++callee_0 (int64_t *ptr, ...) ++{ ++ va_list va; ++ svbool_t pg; ++ ++ va_start (va, ptr); ++ pg = va_arg (va, svbool_t); ++ va_end (va); ++ return svcntp_b8 (pg, pg); ++} ++ ++/* ++** caller_0: ++** ... ++** ptrue (p[0-7])\.d, vl7 ++** ... ++** str \1, \[x1\] ++** ... ++** ret ++*/ ++uint64_t __attribute__((noipa)) ++caller_0 (int64_t *ptr) ++{ ++ return callee_0 (ptr, svptrue_pat_b64 (SV_VL7)); ++} ++ ++/* ++** callee_1: ++** ... ++** ldr (p[0-7]), \[x2\] ++** ... ++** cntp x0, \1, \1\.b ++** ... ++** ret ++*/ ++uint64_t __attribute__((noipa)) ++callee_1 (int64_t *ptr, ...) ++{ ++ va_list va; ++ svbool_t pg; ++ ++ va_start (va, ptr); ++ va_arg (va, int); ++ pg = va_arg (va, svbool_t); ++ va_end (va); ++ return svcntp_b8 (pg, pg); ++} ++ ++/* ++** caller_1: ++** ... ++** ptrue (p[0-7])\.d, vl7 ++** ... ++** str \1, \[x2\] ++** ... ++** ret ++*/ ++uint64_t __attribute__((noipa)) ++caller_1 (int64_t *ptr) ++{ ++ return callee_1 (ptr, 1, svptrue_pat_b64 (SV_VL7)); ++} ++ ++/* ++** callee_7: ++** ... ++** ldr (p[0-7]), \[x7\] ++** ... ++** cntp x0, \1, \1\.b ++** ... ++** ret ++*/ ++uint64_t __attribute__((noipa)) ++callee_7 (int64_t *ptr, ...) ++{ ++ va_list va; ++ svbool_t pg; ++ ++ va_start (va, ptr); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ pg = va_arg (va, svbool_t); ++ va_end (va); ++ return svcntp_b8 (pg, pg); ++} ++ ++/* ++** caller_7: ++** ... ++** ptrue (p[0-7])\.d, vl7 ++** ... ++** str \1, \[x7\] ++** ... ++** ret ++*/ ++uint64_t __attribute__((noipa)) ++caller_7 (int64_t *ptr) ++{ ++ return callee_7 (ptr, 1, 2, 3, 4, 5, 6, svptrue_pat_b64 (SV_VL7)); ++} ++ ++/* FIXME: We should be able to get rid of the va_list object. */ ++/* ++** callee_8: ++** sub sp, sp, #([0-9]+) ++** ... ++** ldr (x[0-9]+), \[sp, \1\] ++** ... ++** ldr (p[0-7]), \[\2\] ++** ... ++** cntp x0, \3, \3\.b ++** ... ++** ret ++*/ ++uint64_t __attribute__((noipa)) ++callee_8 (int64_t *ptr, ...) 
++{ ++ va_list va; ++ svbool_t pg; ++ ++ va_start (va, ptr); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ pg = va_arg (va, svbool_t); ++ va_end (va); ++ return svcntp_b8 (pg, pg); ++} ++ ++/* ++** caller_8: ++** ... ++** ptrue (p[0-7])\.d, vl7 ++** ... ++** str \1, \[(x[0-9]+)\] ++** ... ++** str \2, \[sp\] ++** ... ++** ret ++*/ ++uint64_t __attribute__((noipa)) ++caller_8 (int64_t *ptr) ++{ ++ return callee_8 (ptr, 1, 2, 3, 4, 5, 6, 7, svptrue_pat_b64 (SV_VL7)); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_f16.c +new file mode 100644 +index 000000000..79098851c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_f16.c +@@ -0,0 +1,170 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ ++ ++#include ++#include ++ ++/* ++** callee_0: ++** ... ++** ld1h (z[0-9]+\.h), (p[0-7])/z, \[x1\] ++** ... ++** st1h \1, \2, \[x0\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++callee_0 (int16_t *ptr, ...) ++{ ++ va_list va; ++ svint16_t vec; ++ ++ va_start (va, ptr); ++ vec = va_arg (va, svint16_t); ++ va_end (va); ++ svst1 (svptrue_b8 (), ptr, vec); ++} ++ ++/* ++** caller_0: ++** ... ++** fmov (z[0-9]+\.h), #9\.0[^\n]* ++** ... ++** st1h \1, p[0-7], \[x1\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++caller_0 (int16_t *ptr) ++{ ++ callee_0 (ptr, svdup_f16 (9)); ++} ++ ++/* ++** callee_1: ++** ... ++** ld1h (z[0-9]+\.h), (p[0-7])/z, \[x2\] ++** ... ++** st1h \1, p[0-7], \[x0\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++callee_1 (int16_t *ptr, ...) ++{ ++ va_list va; ++ svint16_t vec; ++ ++ va_start (va, ptr); ++ va_arg (va, int); ++ vec = va_arg (va, svint16_t); ++ va_end (va); ++ svst1 (svptrue_b8 (), ptr, vec); ++} ++ ++/* ++** caller_1: ++** ... ++** fmov (z[0-9]+\.h), #9\.0[^\n]* ++** ... ++** st1h \1, p[0-7], \[x2\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++caller_1 (int16_t *ptr) ++{ ++ callee_1 (ptr, 1, svdup_f16 (9)); ++} ++ ++/* ++** callee_7: ++** ... ++** ld1h (z[0-9]+\.h), (p[0-7])/z, \[x7\] ++** ... ++** st1h \1, p[0-7], \[x0\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++callee_7 (int16_t *ptr, ...) ++{ ++ va_list va; ++ svint16_t vec; ++ ++ va_start (va, ptr); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ vec = va_arg (va, svint16_t); ++ va_end (va); ++ svst1 (svptrue_b8 (), ptr, vec); ++} ++ ++/* ++** caller_7: ++** ... ++** fmov (z[0-9]+\.h), #9\.0[^\n]* ++** ... ++** st1h \1, p[0-7], \[x7\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++caller_7 (int16_t *ptr) ++{ ++ callee_7 (ptr, 1, 2, 3, 4, 5, 6, svdup_f16 (9)); ++} ++ ++/* FIXME: We should be able to get rid of the va_list object. */ ++/* ++** callee_8: ++** sub sp, sp, #([0-9]+) ++** ... ++** ldr (x[0-9]+), \[sp, \1\] ++** ... ++** ld1h (z[0-9]+\.h), (p[0-7])/z, \[\2\] ++** ... ++** st1h \3, \4, \[x0\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++callee_8 (int16_t *ptr, ...) 
++{ ++ va_list va; ++ svint16_t vec; ++ ++ va_start (va, ptr); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ vec = va_arg (va, svint16_t); ++ va_end (va); ++ svst1 (svptrue_b8 (), ptr, vec); ++} ++ ++/* ++** caller_8: ++** ... ++** fmov (z[0-9]+\.h), #9\.0[^\n]* ++** ... ++** st1h \1, p[0-7], \[(x[0-9]+)\] ++** ... ++** str \2, \[sp\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++caller_8 (int16_t *ptr) ++{ ++ callee_8 (ptr, 1, 2, 3, 4, 5, 6, 7, svdup_f16 (9)); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_f32.c +new file mode 100644 +index 000000000..325b0b2aa +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_f32.c +@@ -0,0 +1,170 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ ++ ++#include ++#include ++ ++/* ++** callee_0: ++** ... ++** ld1w (z[0-9]+\.s), (p[0-7])/z, \[x1\] ++** ... ++** st1w \1, \2, \[x0\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++callee_0 (int32_t *ptr, ...) ++{ ++ va_list va; ++ svint32_t vec; ++ ++ va_start (va, ptr); ++ vec = va_arg (va, svint32_t); ++ va_end (va); ++ svst1 (svptrue_b8 (), ptr, vec); ++} ++ ++/* ++** caller_0: ++** ... ++** fmov (z[0-9]+\.s), #9\.0[^\n]* ++** ... ++** st1w \1, p[0-7], \[x1\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++caller_0 (int32_t *ptr) ++{ ++ callee_0 (ptr, svdup_f32 (9)); ++} ++ ++/* ++** callee_1: ++** ... ++** ld1w (z[0-9]+\.s), (p[0-7])/z, \[x2\] ++** ... ++** st1w \1, p[0-7], \[x0\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++callee_1 (int32_t *ptr, ...) ++{ ++ va_list va; ++ svint32_t vec; ++ ++ va_start (va, ptr); ++ va_arg (va, int); ++ vec = va_arg (va, svint32_t); ++ va_end (va); ++ svst1 (svptrue_b8 (), ptr, vec); ++} ++ ++/* ++** caller_1: ++** ... ++** fmov (z[0-9]+\.s), #9\.0[^\n]* ++** ... ++** st1w \1, p[0-7], \[x2\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++caller_1 (int32_t *ptr) ++{ ++ callee_1 (ptr, 1, svdup_f32 (9)); ++} ++ ++/* ++** callee_7: ++** ... ++** ld1w (z[0-9]+\.s), (p[0-7])/z, \[x7\] ++** ... ++** st1w \1, p[0-7], \[x0\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++callee_7 (int32_t *ptr, ...) ++{ ++ va_list va; ++ svint32_t vec; ++ ++ va_start (va, ptr); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ vec = va_arg (va, svint32_t); ++ va_end (va); ++ svst1 (svptrue_b8 (), ptr, vec); ++} ++ ++/* ++** caller_7: ++** ... ++** fmov (z[0-9]+\.s), #9\.0[^\n]* ++** ... ++** st1w \1, p[0-7], \[x7\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++caller_7 (int32_t *ptr) ++{ ++ callee_7 (ptr, 1, 2, 3, 4, 5, 6, svdup_f32 (9)); ++} ++ ++/* FIXME: We should be able to get rid of the va_list object. */ ++/* ++** callee_8: ++** sub sp, sp, #([0-9]+) ++** ... ++** ldr (x[0-9]+), \[sp, \1\] ++** ... ++** ld1w (z[0-9]+\.s), (p[0-7])/z, \[\2\] ++** ... ++** st1w \3, \4, \[x0\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++callee_8 (int32_t *ptr, ...) 
++{ ++ va_list va; ++ svint32_t vec; ++ ++ va_start (va, ptr); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ vec = va_arg (va, svint32_t); ++ va_end (va); ++ svst1 (svptrue_b8 (), ptr, vec); ++} ++ ++/* ++** caller_8: ++** ... ++** fmov (z[0-9]+\.s), #9\.0[^\n]* ++** ... ++** st1w \1, p[0-7], \[(x[0-9]+)\] ++** ... ++** str \2, \[sp\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++caller_8 (int32_t *ptr) ++{ ++ callee_8 (ptr, 1, 2, 3, 4, 5, 6, 7, svdup_f32 (9)); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_f64.c +new file mode 100644 +index 000000000..07a6c707e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_f64.c +@@ -0,0 +1,170 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ ++ ++#include ++#include ++ ++/* ++** callee_0: ++** ... ++** ld1d (z[0-9]+\.d), (p[0-7])/z, \[x1\] ++** ... ++** st1d \1, \2, \[x0\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++callee_0 (int64_t *ptr, ...) ++{ ++ va_list va; ++ svint64_t vec; ++ ++ va_start (va, ptr); ++ vec = va_arg (va, svint64_t); ++ va_end (va); ++ svst1 (svptrue_b8 (), ptr, vec); ++} ++ ++/* ++** caller_0: ++** ... ++** fmov (z[0-9]+\.d), #9\.0[^\n]* ++** ... ++** st1d \1, p[0-7], \[x1\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++caller_0 (int64_t *ptr) ++{ ++ callee_0 (ptr, svdup_f64 (9)); ++} ++ ++/* ++** callee_1: ++** ... ++** ld1d (z[0-9]+\.d), (p[0-7])/z, \[x2\] ++** ... ++** st1d \1, p[0-7], \[x0\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++callee_1 (int64_t *ptr, ...) ++{ ++ va_list va; ++ svint64_t vec; ++ ++ va_start (va, ptr); ++ va_arg (va, int); ++ vec = va_arg (va, svint64_t); ++ va_end (va); ++ svst1 (svptrue_b8 (), ptr, vec); ++} ++ ++/* ++** caller_1: ++** ... ++** fmov (z[0-9]+\.d), #9\.0[^\n]* ++** ... ++** st1d \1, p[0-7], \[x2\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++caller_1 (int64_t *ptr) ++{ ++ callee_1 (ptr, 1, svdup_f64 (9)); ++} ++ ++/* ++** callee_7: ++** ... ++** ld1d (z[0-9]+\.d), (p[0-7])/z, \[x7\] ++** ... ++** st1d \1, p[0-7], \[x0\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++callee_7 (int64_t *ptr, ...) ++{ ++ va_list va; ++ svint64_t vec; ++ ++ va_start (va, ptr); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ vec = va_arg (va, svint64_t); ++ va_end (va); ++ svst1 (svptrue_b8 (), ptr, vec); ++} ++ ++/* ++** caller_7: ++** ... ++** fmov (z[0-9]+\.d), #9\.0[^\n]* ++** ... ++** st1d \1, p[0-7], \[x7\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++caller_7 (int64_t *ptr) ++{ ++ callee_7 (ptr, 1, 2, 3, 4, 5, 6, svdup_f64 (9)); ++} ++ ++/* FIXME: We should be able to get rid of the va_list object. */ ++/* ++** callee_8: ++** sub sp, sp, #([0-9]+) ++** ... ++** ldr (x[0-9]+), \[sp, \1\] ++** ... ++** ld1d (z[0-9]+\.d), (p[0-7])/z, \[\2\] ++** ... ++** st1d \3, \4, \[x0\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++callee_8 (int64_t *ptr, ...) 
++{ ++ va_list va; ++ svint64_t vec; ++ ++ va_start (va, ptr); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ vec = va_arg (va, svint64_t); ++ va_end (va); ++ svst1 (svptrue_b8 (), ptr, vec); ++} ++ ++/* ++** caller_8: ++** ... ++** fmov (z[0-9]+\.d), #9\.0[^\n]* ++** ... ++** st1d \1, p[0-7], \[(x[0-9]+)\] ++** ... ++** str \2, \[sp\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++caller_8 (int64_t *ptr) ++{ ++ callee_8 (ptr, 1, 2, 3, 4, 5, 6, 7, svdup_f64 (9)); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_s16.c +new file mode 100644 +index 000000000..173063833 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_s16.c +@@ -0,0 +1,170 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ ++ ++#include ++#include ++ ++/* ++** callee_0: ++** ... ++** ld1h (z[0-9]+\.h), (p[0-7])/z, \[x1\] ++** ... ++** st1h \1, \2, \[x0\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++callee_0 (int16_t *ptr, ...) ++{ ++ va_list va; ++ svint16_t vec; ++ ++ va_start (va, ptr); ++ vec = va_arg (va, svint16_t); ++ va_end (va); ++ svst1 (svptrue_b8 (), ptr, vec); ++} ++ ++/* ++** caller_0: ++** ... ++** mov (z[0-9]+\.h), #42 ++** ... ++** st1h \1, p[0-7], \[x1\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++caller_0 (int16_t *ptr) ++{ ++ callee_0 (ptr, svdup_s16 (42)); ++} ++ ++/* ++** callee_1: ++** ... ++** ld1h (z[0-9]+\.h), (p[0-7])/z, \[x2\] ++** ... ++** st1h \1, p[0-7], \[x0\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++callee_1 (int16_t *ptr, ...) ++{ ++ va_list va; ++ svint16_t vec; ++ ++ va_start (va, ptr); ++ va_arg (va, int); ++ vec = va_arg (va, svint16_t); ++ va_end (va); ++ svst1 (svptrue_b8 (), ptr, vec); ++} ++ ++/* ++** caller_1: ++** ... ++** mov (z[0-9]+\.h), #42 ++** ... ++** st1h \1, p[0-7], \[x2\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++caller_1 (int16_t *ptr) ++{ ++ callee_1 (ptr, 1, svdup_s16 (42)); ++} ++ ++/* ++** callee_7: ++** ... ++** ld1h (z[0-9]+\.h), (p[0-7])/z, \[x7\] ++** ... ++** st1h \1, p[0-7], \[x0\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++callee_7 (int16_t *ptr, ...) ++{ ++ va_list va; ++ svint16_t vec; ++ ++ va_start (va, ptr); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ vec = va_arg (va, svint16_t); ++ va_end (va); ++ svst1 (svptrue_b8 (), ptr, vec); ++} ++ ++/* ++** caller_7: ++** ... ++** mov (z[0-9]+\.h), #42 ++** ... ++** st1h \1, p[0-7], \[x7\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++caller_7 (int16_t *ptr) ++{ ++ callee_7 (ptr, 1, 2, 3, 4, 5, 6, svdup_s16 (42)); ++} ++ ++/* FIXME: We should be able to get rid of the va_list object. */ ++/* ++** callee_8: ++** sub sp, sp, #([0-9]+) ++** ... ++** ldr (x[0-9]+), \[sp, \1\] ++** ... ++** ld1h (z[0-9]+\.h), (p[0-7])/z, \[\2\] ++** ... ++** st1h \3, \4, \[x0\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++callee_8 (int16_t *ptr, ...) 
++{ ++ va_list va; ++ svint16_t vec; ++ ++ va_start (va, ptr); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ vec = va_arg (va, svint16_t); ++ va_end (va); ++ svst1 (svptrue_b8 (), ptr, vec); ++} ++ ++/* ++** caller_8: ++** ... ++** mov (z[0-9]+\.h), #42 ++** ... ++** st1h \1, p[0-7], \[(x[0-9]+)\] ++** ... ++** str \2, \[sp\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++caller_8 (int16_t *ptr) ++{ ++ callee_8 (ptr, 1, 2, 3, 4, 5, 6, 7, svdup_s16 (42)); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_s32.c +new file mode 100644 +index 000000000..d93db8fc8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_s32.c +@@ -0,0 +1,170 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ ++ ++#include ++#include ++ ++/* ++** callee_0: ++** ... ++** ld1w (z[0-9]+\.s), (p[0-7])/z, \[x1\] ++** ... ++** st1w \1, \2, \[x0\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++callee_0 (int32_t *ptr, ...) ++{ ++ va_list va; ++ svint32_t vec; ++ ++ va_start (va, ptr); ++ vec = va_arg (va, svint32_t); ++ va_end (va); ++ svst1 (svptrue_b8 (), ptr, vec); ++} ++ ++/* ++** caller_0: ++** ... ++** mov (z[0-9]+\.s), #42 ++** ... ++** st1w \1, p[0-7], \[x1\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++caller_0 (int32_t *ptr) ++{ ++ callee_0 (ptr, svdup_s32 (42)); ++} ++ ++/* ++** callee_1: ++** ... ++** ld1w (z[0-9]+\.s), (p[0-7])/z, \[x2\] ++** ... ++** st1w \1, p[0-7], \[x0\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++callee_1 (int32_t *ptr, ...) ++{ ++ va_list va; ++ svint32_t vec; ++ ++ va_start (va, ptr); ++ va_arg (va, int); ++ vec = va_arg (va, svint32_t); ++ va_end (va); ++ svst1 (svptrue_b8 (), ptr, vec); ++} ++ ++/* ++** caller_1: ++** ... ++** mov (z[0-9]+\.s), #42 ++** ... ++** st1w \1, p[0-7], \[x2\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++caller_1 (int32_t *ptr) ++{ ++ callee_1 (ptr, 1, svdup_s32 (42)); ++} ++ ++/* ++** callee_7: ++** ... ++** ld1w (z[0-9]+\.s), (p[0-7])/z, \[x7\] ++** ... ++** st1w \1, p[0-7], \[x0\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++callee_7 (int32_t *ptr, ...) ++{ ++ va_list va; ++ svint32_t vec; ++ ++ va_start (va, ptr); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ vec = va_arg (va, svint32_t); ++ va_end (va); ++ svst1 (svptrue_b8 (), ptr, vec); ++} ++ ++/* ++** caller_7: ++** ... ++** mov (z[0-9]+\.s), #42 ++** ... ++** st1w \1, p[0-7], \[x7\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++caller_7 (int32_t *ptr) ++{ ++ callee_7 (ptr, 1, 2, 3, 4, 5, 6, svdup_s32 (42)); ++} ++ ++/* FIXME: We should be able to get rid of the va_list object. */ ++/* ++** callee_8: ++** sub sp, sp, #([0-9]+) ++** ... ++** ldr (x[0-9]+), \[sp, \1\] ++** ... ++** ld1w (z[0-9]+\.s), (p[0-7])/z, \[\2\] ++** ... ++** st1w \3, \4, \[x0\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++callee_8 (int32_t *ptr, ...) 
++{ ++ va_list va; ++ svint32_t vec; ++ ++ va_start (va, ptr); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ vec = va_arg (va, svint32_t); ++ va_end (va); ++ svst1 (svptrue_b8 (), ptr, vec); ++} ++ ++/* ++** caller_8: ++** ... ++** mov (z[0-9]+\.s), #42 ++** ... ++** st1w \1, p[0-7], \[(x[0-9]+)\] ++** ... ++** str \2, \[sp\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++caller_8 (int32_t *ptr) ++{ ++ callee_8 (ptr, 1, 2, 3, 4, 5, 6, 7, svdup_s32 (42)); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_s64.c +new file mode 100644 +index 000000000..b8c77455d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_s64.c +@@ -0,0 +1,170 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ ++ ++#include ++#include ++ ++/* ++** callee_0: ++** ... ++** ld1d (z[0-9]+\.d), (p[0-7])/z, \[x1\] ++** ... ++** st1d \1, \2, \[x0\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++callee_0 (int64_t *ptr, ...) ++{ ++ va_list va; ++ svint64_t vec; ++ ++ va_start (va, ptr); ++ vec = va_arg (va, svint64_t); ++ va_end (va); ++ svst1 (svptrue_b8 (), ptr, vec); ++} ++ ++/* ++** caller_0: ++** ... ++** mov (z[0-9]+\.d), #42 ++** ... ++** st1d \1, p[0-7], \[x1\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++caller_0 (int64_t *ptr) ++{ ++ callee_0 (ptr, svdup_s64 (42)); ++} ++ ++/* ++** callee_1: ++** ... ++** ld1d (z[0-9]+\.d), (p[0-7])/z, \[x2\] ++** ... ++** st1d \1, p[0-7], \[x0\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++callee_1 (int64_t *ptr, ...) ++{ ++ va_list va; ++ svint64_t vec; ++ ++ va_start (va, ptr); ++ va_arg (va, int); ++ vec = va_arg (va, svint64_t); ++ va_end (va); ++ svst1 (svptrue_b8 (), ptr, vec); ++} ++ ++/* ++** caller_1: ++** ... ++** mov (z[0-9]+\.d), #42 ++** ... ++** st1d \1, p[0-7], \[x2\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++caller_1 (int64_t *ptr) ++{ ++ callee_1 (ptr, 1, svdup_s64 (42)); ++} ++ ++/* ++** callee_7: ++** ... ++** ld1d (z[0-9]+\.d), (p[0-7])/z, \[x7\] ++** ... ++** st1d \1, p[0-7], \[x0\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++callee_7 (int64_t *ptr, ...) ++{ ++ va_list va; ++ svint64_t vec; ++ ++ va_start (va, ptr); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ vec = va_arg (va, svint64_t); ++ va_end (va); ++ svst1 (svptrue_b8 (), ptr, vec); ++} ++ ++/* ++** caller_7: ++** ... ++** mov (z[0-9]+\.d), #42 ++** ... ++** st1d \1, p[0-7], \[x7\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++caller_7 (int64_t *ptr) ++{ ++ callee_7 (ptr, 1, 2, 3, 4, 5, 6, svdup_s64 (42)); ++} ++ ++/* FIXME: We should be able to get rid of the va_list object. */ ++/* ++** callee_8: ++** sub sp, sp, #([0-9]+) ++** ... ++** ldr (x[0-9]+), \[sp, \1\] ++** ... ++** ld1d (z[0-9]+\.d), (p[0-7])/z, \[\2\] ++** ... ++** st1d \3, \4, \[x0\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++callee_8 (int64_t *ptr, ...) 
++{ ++ va_list va; ++ svint64_t vec; ++ ++ va_start (va, ptr); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ vec = va_arg (va, svint64_t); ++ va_end (va); ++ svst1 (svptrue_b8 (), ptr, vec); ++} ++ ++/* ++** caller_8: ++** ... ++** mov (z[0-9]+\.d), #42 ++** ... ++** st1d \1, p[0-7], \[(x[0-9]+)\] ++** ... ++** str \2, \[sp\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++caller_8 (int64_t *ptr) ++{ ++ callee_8 (ptr, 1, 2, 3, 4, 5, 6, 7, svdup_s64 (42)); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_s8.c +new file mode 100644 +index 000000000..de7cbe37d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_s8.c +@@ -0,0 +1,170 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ ++ ++#include ++#include ++ ++/* ++** callee_0: ++** ... ++** ld1b (z[0-9]+\.b), (p[0-7])/z, \[x1\] ++** ... ++** st1b \1, \2, \[x0\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++callee_0 (int8_t *ptr, ...) ++{ ++ va_list va; ++ svint8_t vec; ++ ++ va_start (va, ptr); ++ vec = va_arg (va, svint8_t); ++ va_end (va); ++ svst1 (svptrue_b8 (), ptr, vec); ++} ++ ++/* ++** caller_0: ++** ... ++** mov (z[0-9]+\.b), #42 ++** ... ++** st1b \1, p[0-7], \[x1\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++caller_0 (int8_t *ptr) ++{ ++ callee_0 (ptr, svdup_s8 (42)); ++} ++ ++/* ++** callee_1: ++** ... ++** ld1b (z[0-9]+\.b), (p[0-7])/z, \[x2\] ++** ... ++** st1b \1, p[0-7], \[x0\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++callee_1 (int8_t *ptr, ...) ++{ ++ va_list va; ++ svint8_t vec; ++ ++ va_start (va, ptr); ++ va_arg (va, int); ++ vec = va_arg (va, svint8_t); ++ va_end (va); ++ svst1 (svptrue_b8 (), ptr, vec); ++} ++ ++/* ++** caller_1: ++** ... ++** mov (z[0-9]+\.b), #42 ++** ... ++** st1b \1, p[0-7], \[x2\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++caller_1 (int8_t *ptr) ++{ ++ callee_1 (ptr, 1, svdup_s8 (42)); ++} ++ ++/* ++** callee_7: ++** ... ++** ld1b (z[0-9]+\.b), (p[0-7])/z, \[x7\] ++** ... ++** st1b \1, p[0-7], \[x0\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++callee_7 (int8_t *ptr, ...) ++{ ++ va_list va; ++ svint8_t vec; ++ ++ va_start (va, ptr); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ vec = va_arg (va, svint8_t); ++ va_end (va); ++ svst1 (svptrue_b8 (), ptr, vec); ++} ++ ++/* ++** caller_7: ++** ... ++** mov (z[0-9]+\.b), #42 ++** ... ++** st1b \1, p[0-7], \[x7\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++caller_7 (int8_t *ptr) ++{ ++ callee_7 (ptr, 1, 2, 3, 4, 5, 6, svdup_s8 (42)); ++} ++ ++/* FIXME: We should be able to get rid of the va_list object. */ ++/* ++** callee_8: ++** sub sp, sp, #([0-9]+) ++** ... ++** ldr (x[0-9]+), \[sp, \1\] ++** ... ++** ld1b (z[0-9]+\.b), (p[0-7])/z, \[\2\] ++** ... ++** st1b \3, \4, \[x0\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++callee_8 (int8_t *ptr, ...) ++{ ++ va_list va; ++ svint8_t vec; ++ ++ va_start (va, ptr); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ vec = va_arg (va, svint8_t); ++ va_end (va); ++ svst1 (svptrue_b8 (), ptr, vec); ++} ++ ++/* ++** caller_8: ++** ... 
++** mov (z[0-9]+\.b), #42 ++** ... ++** st1b \1, p[0-7], \[(x[0-9]+)\] ++** ... ++** str \2, \[sp\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++caller_8 (int8_t *ptr) ++{ ++ callee_8 (ptr, 1, 2, 3, 4, 5, 6, 7, svdup_s8 (42)); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_u16.c +new file mode 100644 +index 000000000..59c9ca7db +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_u16.c +@@ -0,0 +1,170 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ ++ ++#include ++#include ++ ++/* ++** callee_0: ++** ... ++** ld1h (z[0-9]+\.h), (p[0-7])/z, \[x1\] ++** ... ++** st1h \1, \2, \[x0\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++callee_0 (int16_t *ptr, ...) ++{ ++ va_list va; ++ svint16_t vec; ++ ++ va_start (va, ptr); ++ vec = va_arg (va, svint16_t); ++ va_end (va); ++ svst1 (svptrue_b8 (), ptr, vec); ++} ++ ++/* ++** caller_0: ++** ... ++** mov (z[0-9]+\.h), #42 ++** ... ++** st1h \1, p[0-7], \[x1\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++caller_0 (int16_t *ptr) ++{ ++ callee_0 (ptr, svdup_u16 (42)); ++} ++ ++/* ++** callee_1: ++** ... ++** ld1h (z[0-9]+\.h), (p[0-7])/z, \[x2\] ++** ... ++** st1h \1, p[0-7], \[x0\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++callee_1 (int16_t *ptr, ...) ++{ ++ va_list va; ++ svint16_t vec; ++ ++ va_start (va, ptr); ++ va_arg (va, int); ++ vec = va_arg (va, svint16_t); ++ va_end (va); ++ svst1 (svptrue_b8 (), ptr, vec); ++} ++ ++/* ++** caller_1: ++** ... ++** mov (z[0-9]+\.h), #42 ++** ... ++** st1h \1, p[0-7], \[x2\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++caller_1 (int16_t *ptr) ++{ ++ callee_1 (ptr, 1, svdup_u16 (42)); ++} ++ ++/* ++** callee_7: ++** ... ++** ld1h (z[0-9]+\.h), (p[0-7])/z, \[x7\] ++** ... ++** st1h \1, p[0-7], \[x0\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++callee_7 (int16_t *ptr, ...) ++{ ++ va_list va; ++ svint16_t vec; ++ ++ va_start (va, ptr); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ vec = va_arg (va, svint16_t); ++ va_end (va); ++ svst1 (svptrue_b8 (), ptr, vec); ++} ++ ++/* ++** caller_7: ++** ... ++** mov (z[0-9]+\.h), #42 ++** ... ++** st1h \1, p[0-7], \[x7\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++caller_7 (int16_t *ptr) ++{ ++ callee_7 (ptr, 1, 2, 3, 4, 5, 6, svdup_u16 (42)); ++} ++ ++/* FIXME: We should be able to get rid of the va_list object. */ ++/* ++** callee_8: ++** sub sp, sp, #([0-9]+) ++** ... ++** ldr (x[0-9]+), \[sp, \1\] ++** ... ++** ld1h (z[0-9]+\.h), (p[0-7])/z, \[\2\] ++** ... ++** st1h \3, \4, \[x0\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++callee_8 (int16_t *ptr, ...) ++{ ++ va_list va; ++ svint16_t vec; ++ ++ va_start (va, ptr); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ vec = va_arg (va, svint16_t); ++ va_end (va); ++ svst1 (svptrue_b8 (), ptr, vec); ++} ++ ++/* ++** caller_8: ++** ... ++** mov (z[0-9]+\.h), #42 ++** ... ++** st1h \1, p[0-7], \[(x[0-9]+)\] ++** ... ++** str \2, \[sp\] ++** ... 
++** ret ++*/ ++void __attribute__((noipa)) ++caller_8 (int16_t *ptr) ++{ ++ callee_8 (ptr, 1, 2, 3, 4, 5, 6, 7, svdup_u16 (42)); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_u32.c +new file mode 100644 +index 000000000..3050ad5f6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_u32.c +@@ -0,0 +1,170 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ ++ ++#include ++#include ++ ++/* ++** callee_0: ++** ... ++** ld1w (z[0-9]+\.s), (p[0-7])/z, \[x1\] ++** ... ++** st1w \1, \2, \[x0\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++callee_0 (int32_t *ptr, ...) ++{ ++ va_list va; ++ svint32_t vec; ++ ++ va_start (va, ptr); ++ vec = va_arg (va, svint32_t); ++ va_end (va); ++ svst1 (svptrue_b8 (), ptr, vec); ++} ++ ++/* ++** caller_0: ++** ... ++** mov (z[0-9]+\.s), #42 ++** ... ++** st1w \1, p[0-7], \[x1\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++caller_0 (int32_t *ptr) ++{ ++ callee_0 (ptr, svdup_u32 (42)); ++} ++ ++/* ++** callee_1: ++** ... ++** ld1w (z[0-9]+\.s), (p[0-7])/z, \[x2\] ++** ... ++** st1w \1, p[0-7], \[x0\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++callee_1 (int32_t *ptr, ...) ++{ ++ va_list va; ++ svint32_t vec; ++ ++ va_start (va, ptr); ++ va_arg (va, int); ++ vec = va_arg (va, svint32_t); ++ va_end (va); ++ svst1 (svptrue_b8 (), ptr, vec); ++} ++ ++/* ++** caller_1: ++** ... ++** mov (z[0-9]+\.s), #42 ++** ... ++** st1w \1, p[0-7], \[x2\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++caller_1 (int32_t *ptr) ++{ ++ callee_1 (ptr, 1, svdup_u32 (42)); ++} ++ ++/* ++** callee_7: ++** ... ++** ld1w (z[0-9]+\.s), (p[0-7])/z, \[x7\] ++** ... ++** st1w \1, p[0-7], \[x0\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++callee_7 (int32_t *ptr, ...) ++{ ++ va_list va; ++ svint32_t vec; ++ ++ va_start (va, ptr); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ vec = va_arg (va, svint32_t); ++ va_end (va); ++ svst1 (svptrue_b8 (), ptr, vec); ++} ++ ++/* ++** caller_7: ++** ... ++** mov (z[0-9]+\.s), #42 ++** ... ++** st1w \1, p[0-7], \[x7\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++caller_7 (int32_t *ptr) ++{ ++ callee_7 (ptr, 1, 2, 3, 4, 5, 6, svdup_u32 (42)); ++} ++ ++/* FIXME: We should be able to get rid of the va_list object. */ ++/* ++** callee_8: ++** sub sp, sp, #([0-9]+) ++** ... ++** ldr (x[0-9]+), \[sp, \1\] ++** ... ++** ld1w (z[0-9]+\.s), (p[0-7])/z, \[\2\] ++** ... ++** st1w \3, \4, \[x0\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++callee_8 (int32_t *ptr, ...) ++{ ++ va_list va; ++ svint32_t vec; ++ ++ va_start (va, ptr); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ vec = va_arg (va, svint32_t); ++ va_end (va); ++ svst1 (svptrue_b8 (), ptr, vec); ++} ++ ++/* ++** caller_8: ++** ... ++** mov (z[0-9]+\.s), #42 ++** ... ++** st1w \1, p[0-7], \[(x[0-9]+)\] ++** ... ++** str \2, \[sp\] ++** ... 
++** ret ++*/ ++void __attribute__((noipa)) ++caller_8 (int32_t *ptr) ++{ ++ callee_8 (ptr, 1, 2, 3, 4, 5, 6, 7, svdup_u32 (42)); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_u64.c +new file mode 100644 +index 000000000..94322a34c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_u64.c +@@ -0,0 +1,170 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ ++ ++#include ++#include ++ ++/* ++** callee_0: ++** ... ++** ld1d (z[0-9]+\.d), (p[0-7])/z, \[x1\] ++** ... ++** st1d \1, \2, \[x0\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++callee_0 (int64_t *ptr, ...) ++{ ++ va_list va; ++ svint64_t vec; ++ ++ va_start (va, ptr); ++ vec = va_arg (va, svint64_t); ++ va_end (va); ++ svst1 (svptrue_b8 (), ptr, vec); ++} ++ ++/* ++** caller_0: ++** ... ++** mov (z[0-9]+\.d), #42 ++** ... ++** st1d \1, p[0-7], \[x1\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++caller_0 (int64_t *ptr) ++{ ++ callee_0 (ptr, svdup_u64 (42)); ++} ++ ++/* ++** callee_1: ++** ... ++** ld1d (z[0-9]+\.d), (p[0-7])/z, \[x2\] ++** ... ++** st1d \1, p[0-7], \[x0\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++callee_1 (int64_t *ptr, ...) ++{ ++ va_list va; ++ svint64_t vec; ++ ++ va_start (va, ptr); ++ va_arg (va, int); ++ vec = va_arg (va, svint64_t); ++ va_end (va); ++ svst1 (svptrue_b8 (), ptr, vec); ++} ++ ++/* ++** caller_1: ++** ... ++** mov (z[0-9]+\.d), #42 ++** ... ++** st1d \1, p[0-7], \[x2\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++caller_1 (int64_t *ptr) ++{ ++ callee_1 (ptr, 1, svdup_u64 (42)); ++} ++ ++/* ++** callee_7: ++** ... ++** ld1d (z[0-9]+\.d), (p[0-7])/z, \[x7\] ++** ... ++** st1d \1, p[0-7], \[x0\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++callee_7 (int64_t *ptr, ...) ++{ ++ va_list va; ++ svint64_t vec; ++ ++ va_start (va, ptr); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ vec = va_arg (va, svint64_t); ++ va_end (va); ++ svst1 (svptrue_b8 (), ptr, vec); ++} ++ ++/* ++** caller_7: ++** ... ++** mov (z[0-9]+\.d), #42 ++** ... ++** st1d \1, p[0-7], \[x7\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++caller_7 (int64_t *ptr) ++{ ++ callee_7 (ptr, 1, 2, 3, 4, 5, 6, svdup_u64 (42)); ++} ++ ++/* FIXME: We should be able to get rid of the va_list object. */ ++/* ++** callee_8: ++** sub sp, sp, #([0-9]+) ++** ... ++** ldr (x[0-9]+), \[sp, \1\] ++** ... ++** ld1d (z[0-9]+\.d), (p[0-7])/z, \[\2\] ++** ... ++** st1d \3, \4, \[x0\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++callee_8 (int64_t *ptr, ...) ++{ ++ va_list va; ++ svint64_t vec; ++ ++ va_start (va, ptr); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ vec = va_arg (va, svint64_t); ++ va_end (va); ++ svst1 (svptrue_b8 (), ptr, vec); ++} ++ ++/* ++** caller_8: ++** ... ++** mov (z[0-9]+\.d), #42 ++** ... ++** st1d \1, p[0-7], \[(x[0-9]+)\] ++** ... ++** str \2, \[sp\] ++** ... 
++** ret ++*/ ++void __attribute__((noipa)) ++caller_8 (int64_t *ptr) ++{ ++ callee_8 (ptr, 1, 2, 3, 4, 5, 6, 7, svdup_u64 (42)); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_u8.c +new file mode 100644 +index 000000000..cf8ac2171 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_u8.c +@@ -0,0 +1,170 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -fno-stack-clash-protection -g" } */ ++/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ ++ ++#include ++#include ++ ++/* ++** callee_0: ++** ... ++** ld1b (z[0-9]+\.b), (p[0-7])/z, \[x1\] ++** ... ++** st1b \1, \2, \[x0\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++callee_0 (int8_t *ptr, ...) ++{ ++ va_list va; ++ svint8_t vec; ++ ++ va_start (va, ptr); ++ vec = va_arg (va, svint8_t); ++ va_end (va); ++ svst1 (svptrue_b8 (), ptr, vec); ++} ++ ++/* ++** caller_0: ++** ... ++** mov (z[0-9]+\.b), #42 ++** ... ++** st1b \1, p[0-7], \[x1\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++caller_0 (int8_t *ptr) ++{ ++ callee_0 (ptr, svdup_u8 (42)); ++} ++ ++/* ++** callee_1: ++** ... ++** ld1b (z[0-9]+\.b), (p[0-7])/z, \[x2\] ++** ... ++** st1b \1, p[0-7], \[x0\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++callee_1 (int8_t *ptr, ...) ++{ ++ va_list va; ++ svint8_t vec; ++ ++ va_start (va, ptr); ++ va_arg (va, int); ++ vec = va_arg (va, svint8_t); ++ va_end (va); ++ svst1 (svptrue_b8 (), ptr, vec); ++} ++ ++/* ++** caller_1: ++** ... ++** mov (z[0-9]+\.b), #42 ++** ... ++** st1b \1, p[0-7], \[x2\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++caller_1 (int8_t *ptr) ++{ ++ callee_1 (ptr, 1, svdup_u8 (42)); ++} ++ ++/* ++** callee_7: ++** ... ++** ld1b (z[0-9]+\.b), (p[0-7])/z, \[x7\] ++** ... ++** st1b \1, p[0-7], \[x0\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++callee_7 (int8_t *ptr, ...) ++{ ++ va_list va; ++ svint8_t vec; ++ ++ va_start (va, ptr); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ vec = va_arg (va, svint8_t); ++ va_end (va); ++ svst1 (svptrue_b8 (), ptr, vec); ++} ++ ++/* ++** caller_7: ++** ... ++** mov (z[0-9]+\.b), #42 ++** ... ++** st1b \1, p[0-7], \[x7\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++caller_7 (int8_t *ptr) ++{ ++ callee_7 (ptr, 1, 2, 3, 4, 5, 6, svdup_u8 (42)); ++} ++ ++/* FIXME: We should be able to get rid of the va_list object. */ ++/* ++** callee_8: ++** sub sp, sp, #([0-9]+) ++** ... ++** ldr (x[0-9]+), \[sp, \1\] ++** ... ++** ld1b (z[0-9]+\.b), (p[0-7])/z, \[\2\] ++** ... ++** st1b \3, \4, \[x0\] ++** ... ++** ret ++*/ ++void __attribute__((noipa)) ++callee_8 (int8_t *ptr, ...) ++{ ++ va_list va; ++ svint8_t vec; ++ ++ va_start (va, ptr); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ va_arg (va, int); ++ vec = va_arg (va, svint8_t); ++ va_end (va); ++ svst1 (svptrue_b8 (), ptr, vec); ++} ++ ++/* ++** caller_8: ++** ... ++** mov (z[0-9]+\.b), #42 ++** ... ++** st1b \1, p[0-7], \[(x[0-9]+)\] ++** ... ++** str \2, \[sp\] ++** ... 
++** ret ++*/ ++void __attribute__((noipa)) ++caller_8 (int8_t *ptr) ++{ ++ callee_8 (ptr, 1, 2, 3, 4, 5, 6, 7, svdup_u8 (42)); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_3_nosc.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_3_nosc.c +new file mode 100644 +index 000000000..cea69cc88 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_3_nosc.c +@@ -0,0 +1,75 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O0 -g" } */ ++ ++#include ++#include ++ ++void __attribute__((noipa)) ++callee (int foo, ...) ++{ ++ va_list va; ++ svbool_t pg, p; ++ svint8_t s8; ++ svuint16x4_t u16; ++ svfloat32x3_t f32; ++ svint64x2_t s64; ++ ++ va_start (va, foo); ++ p = va_arg (va, svbool_t); ++ s8 = va_arg (va, svint8_t); ++ u16 = va_arg (va, svuint16x4_t); ++ f32 = va_arg (va, svfloat32x3_t); ++ s64 = va_arg (va, svint64x2_t); ++ ++ pg = svptrue_b8 (); ++ ++ if (svptest_any (pg, sveor_z (pg, p, svptrue_pat_b8 (SV_VL7)))) ++ __builtin_abort (); ++ ++ if (svptest_any (pg, svcmpne (pg, s8, svindex_s8 (1, 2)))) ++ __builtin_abort (); ++ ++ if (svptest_any (pg, svcmpne (pg, svget4 (u16, 0), svindex_u16 (2, 3)))) ++ __builtin_abort (); ++ ++ if (svptest_any (pg, svcmpne (pg, svget4 (u16, 1), svindex_u16 (3, 4)))) ++ __builtin_abort (); ++ ++ if (svptest_any (pg, svcmpne (pg, svget4 (u16, 2), svindex_u16 (4, 5)))) ++ __builtin_abort (); ++ ++ if (svptest_any (pg, svcmpne (pg, svget4 (u16, 3), svindex_u16 (5, 6)))) ++ __builtin_abort (); ++ ++ if (svptest_any (pg, svcmpne (pg, svget3 (f32, 0), svdup_f32 (1.0)))) ++ __builtin_abort (); ++ ++ if (svptest_any (pg, svcmpne (pg, svget3 (f32, 1), svdup_f32 (2.0)))) ++ __builtin_abort (); ++ ++ if (svptest_any (pg, svcmpne (pg, svget3 (f32, 2), svdup_f32 (3.0)))) ++ __builtin_abort (); ++ ++ if (svptest_any (pg, svcmpne (pg, svget2 (s64, 0), svindex_s64 (6, 7)))) ++ __builtin_abort (); ++ ++ if (svptest_any (pg, svcmpne (pg, svget2 (s64, 1), svindex_s64 (7, 8)))) ++ __builtin_abort (); ++} ++ ++int __attribute__((noipa)) ++main (void) ++{ ++ callee (100, ++ svptrue_pat_b8 (SV_VL7), ++ svindex_s8 (1, 2), ++ svcreate4 (svindex_u16 (2, 3), ++ svindex_u16 (3, 4), ++ svindex_u16 (4, 5), ++ svindex_u16 (5, 6)), ++ svcreate3 (svdup_f32 (1.0), ++ svdup_f32 (2.0), ++ svdup_f32 (3.0)), ++ svcreate2 (svindex_s64 (6, 7), ++ svindex_s64 (7, 8))); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_3_sc.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_3_sc.c +new file mode 100644 +index 000000000..b939aa5ea +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_3_sc.c +@@ -0,0 +1,75 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O0 -fstack-clash-protection -g" } */ ++ ++#include ++#include ++ ++void __attribute__((noipa)) ++callee (int foo, ...) 
++{ ++ va_list va; ++ svbool_t pg, p; ++ svint8_t s8; ++ svuint16x4_t u16; ++ svfloat32x3_t f32; ++ svint64x2_t s64; ++ ++ va_start (va, foo); ++ p = va_arg (va, svbool_t); ++ s8 = va_arg (va, svint8_t); ++ u16 = va_arg (va, svuint16x4_t); ++ f32 = va_arg (va, svfloat32x3_t); ++ s64 = va_arg (va, svint64x2_t); ++ ++ pg = svptrue_b8 (); ++ ++ if (svptest_any (pg, sveor_z (pg, p, svptrue_pat_b8 (SV_VL7)))) ++ __builtin_abort (); ++ ++ if (svptest_any (pg, svcmpne (pg, s8, svindex_s8 (1, 2)))) ++ __builtin_abort (); ++ ++ if (svptest_any (pg, svcmpne (pg, svget4 (u16, 0), svindex_u16 (2, 3)))) ++ __builtin_abort (); ++ ++ if (svptest_any (pg, svcmpne (pg, svget4 (u16, 1), svindex_u16 (3, 4)))) ++ __builtin_abort (); ++ ++ if (svptest_any (pg, svcmpne (pg, svget4 (u16, 2), svindex_u16 (4, 5)))) ++ __builtin_abort (); ++ ++ if (svptest_any (pg, svcmpne (pg, svget4 (u16, 3), svindex_u16 (5, 6)))) ++ __builtin_abort (); ++ ++ if (svptest_any (pg, svcmpne (pg, svget3 (f32, 0), svdup_f32 (1.0)))) ++ __builtin_abort (); ++ ++ if (svptest_any (pg, svcmpne (pg, svget3 (f32, 1), svdup_f32 (2.0)))) ++ __builtin_abort (); ++ ++ if (svptest_any (pg, svcmpne (pg, svget3 (f32, 2), svdup_f32 (3.0)))) ++ __builtin_abort (); ++ ++ if (svptest_any (pg, svcmpne (pg, svget2 (s64, 0), svindex_s64 (6, 7)))) ++ __builtin_abort (); ++ ++ if (svptest_any (pg, svcmpne (pg, svget2 (s64, 1), svindex_s64 (7, 8)))) ++ __builtin_abort (); ++} ++ ++int __attribute__((noipa)) ++main (void) ++{ ++ callee (100, ++ svptrue_pat_b8 (SV_VL7), ++ svindex_s8 (1, 2), ++ svcreate4 (svindex_u16 (2, 3), ++ svindex_u16 (3, 4), ++ svindex_u16 (4, 5), ++ svindex_u16 (5, 6)), ++ svcreate3 (svdup_f32 (1.0), ++ svdup_f32 (2.0), ++ svdup_f32 (3.0)), ++ svcreate2 (svindex_s64 (6, 7), ++ svindex_s64 (7, 8))); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/vpcs_1.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/vpcs_1.c +new file mode 100644 +index 000000000..d9f4e6c41 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/vpcs_1.c +@@ -0,0 +1,6 @@ ++/* { dg-do compile } */ ++ ++__attribute__ ((aarch64_vector_pcs)) void f1 (__SVBool_t); /* { dg-error {the 'aarch64_vector_pcs' attribute cannot be applied to an SVE function type} } */ ++__attribute__ ((aarch64_vector_pcs)) void f2 (__SVInt8_t s8) {} /* { dg-error {the 'aarch64_vector_pcs' attribute cannot be applied to an SVE function type} } */ ++__attribute__ ((aarch64_vector_pcs)) void (*f3) (__SVInt16_t); /* { dg-error {the 'aarch64_vector_pcs' attribute cannot be applied to an SVE function type} } */ ++typedef __attribute__ ((aarch64_vector_pcs)) void (*f4) (__SVInt32_t); /* { dg-error {the 'aarch64_vector_pcs' attribute cannot be applied to an SVE function type} } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_1.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_1.c +index a064c337b..156d04ae5 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_1.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_1.c +@@ -25,3 +25,4 @@ foo (void) + /* We should use an induction that starts at -5, with only the last + 7 elements of the first iteration being active. 
*/ + /* { dg-final { scan-assembler {\tindex\tz[0-9]+\.s, #-5, #5\n} } } */ ++/* { dg-final { scan-assembler {\tptrue\t(p[0-9]+\.b), vl1\n.*\tnot\tp[0-7]\.b, p[0-7]/z, \1\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_2.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_2.c +index f2113be90..e792cdf2c 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_2.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_2.c +@@ -20,3 +20,4 @@ foo (void) + /* { dg-final { scan-assembler {\t(adrp|adr)\tx[0-9]+, x\n} } } */ + /* We should unroll the loop three times. */ + /* { dg-final { scan-assembler-times "\tst1w\t" 3 } } */ ++/* { dg-final { scan-assembler {\tptrue\t(p[0-9]+)\.s, vl7\n.*\teor\tp[0-7]\.b, (p[0-7])/z, (\1\.b, \2\.b|\2\.b, \1\.b)\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/revb_1.c b/gcc/testsuite/gcc.target/aarch64/sve/revb_1.c +index 1a3d9b4ea..9cf2f27c8 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/revb_1.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/revb_1.c +@@ -1,9 +1,7 @@ + /* { dg-do assemble { target aarch64_asm_sve_ok } } */ +-/* { dg-options "-O -msve-vector-bits=256 --save-temps" } */ ++/* { dg-options "-O -msve-vector-bits=256 --save-temps -mlittle-endian" } */ + +-#include +- +-typedef int8_t vnx16qi __attribute__((vector_size (32))); ++typedef __INT8_TYPE__ vnx16qi __attribute__((vector_size (32))); + + #define MASK_2(X, Y) (X) ^ (Y), (X + 1) ^ (Y) + #define MASK_4(X, Y) MASK_2 (X, Y), MASK_2 (X + 2, Y) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/revb_2.c b/gcc/testsuite/gcc.target/aarch64/sve/revb_2.c +new file mode 100644 +index 000000000..389739cc8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/revb_2.c +@@ -0,0 +1,10 @@ ++/* { dg-do assemble { target aarch64_asm_sve_ok } } */ ++/* { dg-options "-O -msve-vector-bits=256 --save-temps -mbig-endian" } */ ++ ++#include "revb_1.c" ++ ++/* { dg-final { scan-assembler-not {\ttbl\t} } } */ ++ ++/* { dg-final { scan-assembler-times {\trevb\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d} 1 } } */ ++/* { dg-final { scan-assembler-times {\trevb\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s} 1 } } */ ++/* { dg-final { scan-assembler-times {\trevb\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h} 1 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/revh_1.c b/gcc/testsuite/gcc.target/aarch64/sve/revh_1.c +index 76145812b..28a0399b9 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/revh_1.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/revh_1.c +@@ -1,9 +1,7 @@ + /* { dg-do assemble { target aarch64_asm_sve_ok } } */ +-/* { dg-options "-O -msve-vector-bits=256 --save-temps" } */ ++/* { dg-options "-O -msve-vector-bits=256 --save-temps -mlittle-endian" } */ + +-#include +- +-typedef uint16_t vnx8hi __attribute__((vector_size (32))); ++typedef __UINT16_TYPE__ vnx8hi __attribute__((vector_size (32))); + typedef _Float16 vnx8hf __attribute__((vector_size (32))); + + #define MASK_2(X, Y) (X) ^ (Y), (X + 1) ^ (Y) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/revh_2.c b/gcc/testsuite/gcc.target/aarch64/sve/revh_2.c +new file mode 100644 +index 000000000..e821b6402 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/revh_2.c +@@ -0,0 +1,9 @@ ++/* { dg-do assemble { target aarch64_asm_sve_ok } } */ ++/* { dg-options "-O -msve-vector-bits=256 --save-temps -mbig-endian" } */ ++ ++#include "revh_1.c" ++ ++/* { dg-final { scan-assembler-not {\ttbl\t} } } */ ++ ++/* { dg-final { scan-assembler-times {\trevh\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d} 2 } } */ ++/* { dg-final { scan-assembler-times 
{\trevh\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s} 2 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/revw_1.c b/gcc/testsuite/gcc.target/aarch64/sve/revw_1.c +index 8ac68b782..de926753c 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/revw_1.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/revw_1.c +@@ -1,9 +1,7 @@ + /* { dg-do assemble { target aarch64_asm_sve_ok } } */ +-/* { dg-options "-O -msve-vector-bits=256 --save-temps" } */ ++/* { dg-options "-O -msve-vector-bits=256 --save-temps -mlittle-endian" } */ + +-#include +- +-typedef uint32_t vnx4si __attribute__((vector_size (32))); ++typedef __UINT32_TYPE__ vnx4si __attribute__((vector_size (32))); + typedef float vnx4sf __attribute__((vector_size (32))); + + #define MASK_2(X, Y) (X) ^ (Y), (X + 1) ^ (Y) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/revw_2.c b/gcc/testsuite/gcc.target/aarch64/sve/revw_2.c +new file mode 100644 +index 000000000..17243c05c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/revw_2.c +@@ -0,0 +1,8 @@ ++/* { dg-do assemble { target aarch64_asm_sve_ok } } */ ++/* { dg-options "-O -msve-vector-bits=256 --save-temps -mbig-endian" } */ ++ ++#include "revw_1.c" ++ ++/* { dg-final { scan-assembler-not {\ttbl\t} } } */ ++ ++/* { dg-final { scan-assembler-times {\trevw\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d} 2 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/sad_1.c b/gcc/testsuite/gcc.target/aarch64/sve/sad_1.c +new file mode 100644 +index 000000000..e7bf64a57 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/sad_1.c +@@ -0,0 +1,28 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define DEF_SAD(TYPE1, TYPE2) \ ++TYPE1 __attribute__ ((noinline, noclone)) \ ++sum_abs_##TYPE1##_##TYPE2 (TYPE2 *restrict x, TYPE2 *restrict y, int n) \ ++{ \ ++ TYPE1 sum = 0; \ ++ for (int i = 0; i < n; i++) \ ++ { \ ++ sum += __builtin_abs (x[i] - y[i]); \ ++ } \ ++ return sum; \ ++} ++ ++DEF_SAD(int32_t, uint8_t) ++DEF_SAD(int32_t, int8_t) ++DEF_SAD(int64_t, uint16_t) ++DEF_SAD(int64_t, int16_t) ++ ++/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tudot\tz[0-9]+\.s, z[0-9]+\.b, z[0-9]+\.b\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tudot\tz[0-9]+\.d, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/sel_1.c b/gcc/testsuite/gcc.target/aarch64/sve/sel_1.c +new file mode 100644 +index 000000000..e651e5b93 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/sel_1.c +@@ -0,0 +1,27 @@ ++/* { dg-do assemble { target aarch64_asm_sve_ok } } */ ++/* { dg-options "-O2 -msve-vector-bits=256 --save-temps" } */ ++ ++#include ++ ++typedef int8_t vnx16qi __attribute__((vector_size (32))); ++ ++/* Predicate vector: 1 0 1 0 ... 
*/ ++ ++#define MASK_32 { 0, 33, 2, 35, 4, 37, 6, 39, 8, 41, \ ++ 10, 43, 12, 45, 14, 47, 16, 49, 18, 51, \ ++ 20, 53, 22, 55, 24, 57, 26, 59, 28, 61, 30, 63 } ++ ++#define INDEX_32 vnx16qi ++ ++#define PERMUTE(type, nunits) \ ++type permute_##type (type x, type y) \ ++{ \ ++ return __builtin_shuffle (x, y, (INDEX_##nunits) MASK_##nunits); \ ++} ++ ++PERMUTE(vnx16qi, 32) ++ ++/* { dg-final { scan-assembler-not {\ttbl\t} } } */ ++ ++/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.b, p[0-9]+, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.h, vl16\n} 1 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/sel_2.c b/gcc/testsuite/gcc.target/aarch64/sve/sel_2.c +new file mode 100644 +index 000000000..05391474a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/sel_2.c +@@ -0,0 +1,41 @@ ++/* { dg-do assemble { target aarch64_asm_sve_ok } } */ ++/* { dg-options "-O2 -msve-vector-bits=256 --save-temps" } */ ++ ++#include ++ ++typedef int8_t vnx16qi __attribute__((vector_size (32))); ++typedef int16_t vnx8hi __attribute__((vector_size (32))); ++typedef int32_t vnx4si __attribute__((vector_size (32))); ++ ++typedef _Float16 vnx8hf __attribute__((vector_size (32))); ++typedef float vnx4sf __attribute__((vector_size (32))); ++ ++/* Predicate vector: 1 0 0 0 ... */ ++ ++#define MASK_32 { 0, 33, 34, 35, 4, 37, 38, 39, 8, 41, 42, 43, 12, \ ++ 45, 46, 47, 16, 49, 50, 51, 20, 53, 54, 55, 24, \ ++ 57, 58, 59, 28, 61, 62, 63 } ++ ++/* Predicate vector: 1 0 1 0 ... */ ++ ++#define MASK_16 {0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31} ++ ++#define INDEX_32 vnx16qi ++#define INDEX_16 vnx8hi ++ ++#define PERMUTE(type, nunits) \ ++type permute_##type (type x, type y) \ ++{ \ ++ return __builtin_shuffle (x, y, (INDEX_##nunits) MASK_##nunits); \ ++} ++ ++PERMUTE(vnx16qi, 32) ++PERMUTE(vnx8hi, 16) ++PERMUTE(vnx8hf, 16) ++ ++/* { dg-final { scan-assembler-not {\ttbl\t} } } */ ++ ++/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.b, p[0-9]+, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.h, p[0-9]+, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ ++ ++/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.s, vl8\n} 3 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/sel_3.c b/gcc/testsuite/gcc.target/aarch64/sve/sel_3.c +new file mode 100644 +index 000000000..a87492d9d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/sel_3.c +@@ -0,0 +1,50 @@ ++/* { dg-do assemble { target aarch64_asm_sve_ok } } */ ++/* { dg-options "-O2 -msve-vector-bits=256 --save-temps" } */ ++ ++#include ++ ++typedef int8_t vnx16qi __attribute__((vector_size (32))); ++typedef int16_t vnx8hi __attribute__((vector_size (32))); ++typedef int32_t vnx4si __attribute__((vector_size (32))); ++typedef _Float16 vnx8hf __attribute__((vector_size (32))); ++typedef float vnx4sf __attribute__((vector_size (32))); ++ ++/* Predicate vector: 1 0 0 0 0 0 0 0 ... */ ++ ++#define MASK_32 { 0, 33, 34, 35, 36, 37, 38, 39, \ ++ 8, 41, 42, 43, 44, 45, 46, 47, \ ++ 16, 49, 50, 51, 52, 53, 54, 55, \ ++ 24, 57, 58, 59, 60, 61, 62, 63 } ++ ++/* Predicate vector: 1 0 0 0 ... */ ++ ++#define MASK_16 { 0, 17, 18, 19, 4, 21, 22, 23, \ ++ 8, 25, 26, 27, 12, 29, 30, 31 } ++ ++/* Predicate vector: 1 0 ... 
*/ ++ ++#define MASK_8 { 0, 9, 2, 11, 4, 13, 6, 15 } ++ ++#define INDEX_32 vnx16qi ++#define INDEX_16 vnx8hi ++#define INDEX_8 vnx4si ++ ++#define PERMUTE(type, nunits) \ ++type permute_##type (type x, type y) \ ++{ \ ++ return __builtin_shuffle (x, y, (INDEX_##nunits) MASK_##nunits); \ ++} ++ ++PERMUTE(vnx16qi, 32) ++PERMUTE(vnx8hi, 16) ++PERMUTE(vnx4si, 8) ++PERMUTE(vnx8hf, 16) ++PERMUTE(vnx4sf, 8) ++ ++/* { dg-final { scan-assembler-not {\ttbl\t} } } */ ++ ++/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.b, p[0-9]+, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.h, p[0-9]+, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.s, p[0-9]+, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ ++ ++/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.d, vl4\n} 5 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/sel_4.c b/gcc/testsuite/gcc.target/aarch64/sve/sel_4.c +new file mode 100644 +index 000000000..e9bbc5527 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/sel_4.c +@@ -0,0 +1,50 @@ ++/* { dg-do assemble { target aarch64_asm_sve_ok } } */ ++/* { dg-options "-O2 -msve-vector-bits=256 --save-temps" } */ ++ ++#include ++ ++typedef int8_t vnx16qi __attribute__((vector_size (32))); ++typedef int16_t vnx8hi __attribute__((vector_size (32))); ++typedef int32_t vnx4si __attribute__((vector_size (32))); ++typedef int64_t vnx2di __attribute__((vector_size (32))); ++ ++typedef _Float16 vnx8hf __attribute__((vector_size (32))); ++typedef float vnx4sf __attribute__((vector_size (32))); ++typedef double vnx2df __attribute__((vector_size (32))); ++ ++/* Predicate vector: 1 1 0 0 ... */ ++ ++#define MASK_32 { 0, 1, 34, 35, 4, 5, 38, 39, 8, 9, 42, 43, 12, 13, \ ++ 46, 47, 16, 17, 50, 51, 20, 21, 54, 55, 24, 25, \ ++ 58, 59, 28, 29, 62, 63 } ++ ++#define MASK_16 {0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31} ++#define MASK_8 {0, 1, 10, 11, 4, 5, 14, 15} ++#define MASK_4 {0, 1, 6, 7} ++ ++#define INDEX_32 vnx16qi ++#define INDEX_16 vnx8hi ++#define INDEX_8 vnx4si ++#define INDEX_4 vnx2di ++ ++#define PERMUTE(type, nunits) \ ++type permute_##type (type x, type y) \ ++{ \ ++ return __builtin_shuffle (x, y, (INDEX_##nunits) MASK_##nunits); \ ++} ++ ++PERMUTE(vnx16qi, 32) ++PERMUTE(vnx8hi, 16) ++PERMUTE(vnx4si, 8) ++PERMUTE(vnx2di, 4) ++ ++PERMUTE(vnx8hf, 16) ++PERMUTE(vnx4sf, 8) ++PERMUTE(vnx2df, 4) ++ ++/* { dg-final { scan-assembler-not {\ttbl\t} } } */ ++ ++/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.b, p[0-9]+, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.h, p[0-9]+, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.s, p[0-9]+, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.d, p[0-9]+, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/sel_5.c b/gcc/testsuite/gcc.target/aarch64/sve/sel_5.c +new file mode 100644 +index 000000000..935abb54d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/sel_5.c +@@ -0,0 +1,50 @@ ++/* { dg-do assemble { target aarch64_asm_sve_ok } } */ ++/* { dg-options "-O2 -msve-vector-bits=256 --save-temps" } */ ++ ++#include ++ ++typedef int8_t vnx16qi __attribute__((vector_size (32))); ++typedef int16_t vnx8hi __attribute__((vector_size (32))); ++typedef int32_t vnx4si __attribute__((vector_size (32))); ++typedef int64_t vnx2di __attribute__((vector_size (32))); ++ ++typedef _Float16 vnx8hf 
__attribute__((vector_size (32))); ++typedef float vnx4sf __attribute__((vector_size (32))); ++typedef double vnx2df __attribute__((vector_size (32))); ++ ++/* Predicate vector: 1 0 0 1 ... */ ++ ++#define MASK_32 { 0, 33, 34, 3, 4, 37, 38, 7, 8, 41, 42, 11, 12, 45, 46, \ ++ 15, 16, 49, 50, 19, 20, 53, 54, 23, 24, 57, 58, 27, 28, \ ++ 61, 62, 31 } ++ ++#define MASK_16 {0, 17, 18, 3, 4, 21, 22, 7, 8, 25, 26, 11, 12, 29, 30, 15} ++#define MASK_8 {0, 9, 10, 3, 4, 13, 14, 7} ++#define MASK_4 {0, 5, 6, 3} ++ ++#define INDEX_32 vnx16qi ++#define INDEX_16 vnx8hi ++#define INDEX_8 vnx4si ++#define INDEX_4 vnx2di ++ ++#define PERMUTE(type, nunits) \ ++type permute_##type (type x, type y) \ ++{ \ ++ return __builtin_shuffle (x, y, (INDEX_##nunits) MASK_##nunits); \ ++} ++ ++PERMUTE(vnx16qi, 32) ++PERMUTE(vnx8hi, 16) ++PERMUTE(vnx4si, 8) ++PERMUTE(vnx2di, 4) ++ ++PERMUTE(vnx8hf, 16) ++PERMUTE(vnx4sf, 8) ++PERMUTE(vnx2df, 4) ++ ++/* { dg-final { scan-assembler-not {\ttbl\t} } } */ ++ ++/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.b, p[0-9]+, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.h, p[0-9]+, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.s, p[0-9]+, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.d, p[0-9]+, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/sel_6.c b/gcc/testsuite/gcc.target/aarch64/sve/sel_6.c +new file mode 100644 +index 000000000..772938f68 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/sel_6.c +@@ -0,0 +1,42 @@ ++/* { dg-do assemble { target aarch64_asm_sve_ok } } */ ++/* { dg-options "-O2 -msve-vector-bits=256 --save-temps" } */ ++ ++#include ++ ++typedef int32_t vnx4si __attribute__((vector_size (32))); ++typedef int64_t vnx2di __attribute__((vector_size (32))); ++ ++typedef float vnx4sf __attribute__((vector_size (32))); ++typedef double vnx2df __attribute__((vector_size (32))); ++ ++/* Predicate vector: 1 0 0 0 ... 
*/ ++ ++#define MASK_32 { 0, 33, 34, 35, 4, 37, 38, 39, 8, 41, 42, 43, 12, \ ++ 45, 46, 47, 16, 49, 50, 51, 20, 53, 54, 55, 24, \ ++ 57, 58, 59, 28, 61, 62, 63 } ++ ++#define MASK_16 {0, 17, 18, 19, 4, 21, 22, 23, 8, 25, 26, 27, 12, 29, 30, 31} ++#define MASK_8 {0, 9, 10, 11, 4, 13, 14, 15} ++#define MASK_4 {0, 5, 6, 7} ++ ++#define INDEX_8 vnx4si ++#define INDEX_4 vnx2di ++ ++#define PERMUTE(type, nunits) \ ++type permute_##type (type x, type y) \ ++{ \ ++ return __builtin_shuffle (x, y, (INDEX_##nunits) MASK_##nunits); \ ++} ++ ++PERMUTE(vnx4si, 8) ++PERMUTE(vnx2di, 4) ++ ++PERMUTE(vnx4sf, 8) ++PERMUTE(vnx2df, 4) ++ ++/* { dg-final { scan-assembler-not {\ttbl\t} } } */ ++ ++/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.s, p[0-9]+, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.d, p[0-9]+, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ ++ ++/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.d, vl4\n} 2 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/shift_1.c b/gcc/testsuite/gcc.target/aarch64/sve/shift_1.c +index f4c5ebd46..5ee66da15 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/shift_1.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/shift_1.c +@@ -75,9 +75,9 @@ DO_IMMEDIATE_OPS (63, int64_t, 63); + /* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ + /* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ + +-/* { dg-final { scan-assembler-times {\tasr\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ +-/* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ +-/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tasrr?\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tlsrr?\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tlslr?\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ + + /* { dg-final { scan-assembler-times {\tasr\tz[0-9]+\.b, z[0-9]+\.b, #5\n} 1 } } */ + /* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.b, z[0-9]+\.b, #5\n} 1 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/single_1.c b/gcc/testsuite/gcc.target/aarch64/sve/single_1.c +index 11b88aef7..7764a1b0f 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/single_1.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/single_1.c +@@ -40,10 +40,7 @@ TEST_LOOP (double, 3.0) + /* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0e\+0\n} 1 } } */ + /* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #3\.0e\+0\n} 1 } } */ + +-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl32\n} 2 } } */ +-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.h, vl16\n} 3 } } */ +-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.s, vl8\n} 3 } } */ +-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.d, vl4\n} 3 } } */ ++/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl32\n} 11 } } */ + + /* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.b,} 2 } } */ + /* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h,} 3 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/single_2.c b/gcc/testsuite/gcc.target/aarch64/sve/single_2.c +index 1fbf4892c..42fc17b73 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/single_2.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/single_2.c +@@ -16,10 
+16,7 @@ + /* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0e\+0\n} 1 } } */ + /* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #3\.0e\+0\n} 1 } } */ + +-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl64\n} 2 } } */ +-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.h, vl32\n} 3 } } */ +-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.s, vl16\n} 3 } } */ +-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.d, vl8\n} 3 } } */ ++/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl64\n} 11 } } */ + + /* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.b,} 2 } } */ + /* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h,} 3 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/single_3.c b/gcc/testsuite/gcc.target/aarch64/sve/single_3.c +index a3688b692..338ca1e3d 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/single_3.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/single_3.c +@@ -16,10 +16,7 @@ + /* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0e\+0\n} 1 } } */ + /* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #3\.0e\+0\n} 1 } } */ + +-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl128\n} 2 } } */ +-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.h, vl64\n} 3 } } */ +-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.s, vl32\n} 3 } } */ +-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.d, vl16\n} 3 } } */ ++/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl128\n} 11 } } */ + + /* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.b,} 2 } } */ + /* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h,} 3 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/single_4.c b/gcc/testsuite/gcc.target/aarch64/sve/single_4.c +index 08965d39f..37c78a659 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/single_4.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/single_4.c +@@ -16,10 +16,7 @@ + /* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0e\+0\n} 1 } } */ + /* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #3\.0e\+0\n} 1 } } */ + +-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl256\n} 2 } } */ +-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.h, vl128\n} 3 } } */ +-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.s, vl64\n} 3 } } */ +-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.d, vl32\n} 3 } } */ ++/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl256\n} 11 } } */ + + /* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.b,} 2 } } */ + /* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h,} 3 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_2.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_2.c +index 413532c07..d4b9776fe 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/slp_2.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_2.c +@@ -29,12 +29,9 @@ vec_slp_##TYPE (TYPE *restrict a, int n) \ + + TEST_ALL (VEC_PERM) + +-/* { dg-final { scan-assembler-times {\tld1rh\tz[0-9]+\.h, } 2 { target aarch64_little_endian } } } */ +-/* { dg-final { scan-assembler-times {\tld1rqb\tz[0-9]+\.b, } 2 { target aarch64_big_endian } } } */ +-/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, } 3 { target aarch64_little_endian } } } */ +-/* { dg-final { scan-assembler-times {\tld1rqh\tz[0-9]+\.h, } 3 { target aarch64_big_endian } } } */ +-/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 3 { target aarch64_little_endian } } } */ +-/* { dg-final { scan-assembler-times 
{\tld1rqw\tz[0-9]+\.s, } 3 { target aarch64_big_endian } } } */ ++/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, w[0-9]+\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, } 3 } } */ ++/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 3 } } */ + /* { dg-final { scan-assembler-times {\tld1rqd\tz[0-9]+\.d, } 3 } } */ + /* { dg-final { scan-assembler-not {\tzip1\t} } } */ + /* { dg-final { scan-assembler-not {\tzip2\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_3.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_3.c +index 0f9f01a00..82dd43a4d 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/slp_3.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_3.c +@@ -32,18 +32,17 @@ vec_slp_##TYPE (TYPE *restrict a, int n) \ + TEST_ALL (VEC_PERM) + + /* 1 for each 8-bit type. */ +-/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, } 2 { target aarch64_little_endian } } } */ +-/* { dg-final { scan-assembler-times {\tld1rqb\tz[0-9]+\.b, } 2 { target aarch64_big_endian } } } */ +-/* 1 for each 16-bit type and 4 for double. */ +-/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 7 { target aarch64_little_endian } } } */ +-/* { dg-final { scan-assembler-times {\tld1rqh\tz[0-9]+\.h, } 3 { target aarch64_big_endian } } } */ +-/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 4 { target aarch64_big_endian } } } */ ++/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, } 2 } } */ ++/* 1 for each 16-bit type plus 1 for double. */ ++/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 4 } } */ + /* 1 for each 32-bit type. */ + /* { dg-final { scan-assembler-times {\tld1rqw\tz[0-9]+\.s, } 3 } } */ + /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #41\n} 2 } } */ + /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #25\n} 2 } } */ + /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #31\n} 2 } } */ + /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #62\n} 2 } } */ ++/* 3 for double. */ ++/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, x[0-9]+\n} 3 } } */ + /* The 64-bit types need: + + ZIP1 ZIP1 (2 ZIP2s optimized away) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_4.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_4.c +index 8d9d5ab58..49fb828e8 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/slp_4.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_4.c +@@ -35,10 +35,8 @@ vec_slp_##TYPE (TYPE *restrict a, int n) \ + + TEST_ALL (VEC_PERM) + +-/* 1 for each 8-bit type, 4 for each 32-bit type and 8 for double. */ +-/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 22 { target aarch64_little_endian } } } */ +-/* { dg-final { scan-assembler-times {\tld1rqb\tz[0-9]+\.b, } 2 { target aarch64_big_endian } } } */ +-/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 20 { target aarch64_big_endian } } } */ ++/* 1 for each 8-bit type, 4 for each 32-bit type and 4 for double. */ ++/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 18 } } */ + /* 1 for each 16-bit type. */ + /* { dg-final { scan-assembler-times {\tld1rqh\tz[0-9]\.h, } 3 } } */ + /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #99\n} 2 } } */ +@@ -49,6 +47,8 @@ TEST_ALL (VEC_PERM) + /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #37\n} 2 } } */ + /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #24\n} 2 } } */ + /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #81\n} 2 } } */ ++/* 4 for double. 
*/ ++/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, x[0-9]+\n} 4 } } */ + /* The 32-bit types need: + + ZIP1 ZIP1 (2 ZIP2s optimized away) +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/smax_1.c b/gcc/testsuite/gcc.target/aarch64/sve/smax_1.c +new file mode 100644 +index 000000000..050248c81 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/smax_1.c +@@ -0,0 +1,71 @@ ++/* { dg-do assemble { target aarch64_asm_sve_ok } } */ ++/* { dg-options "-O3 --save-temps" } */ ++ ++#include ++ ++#define DO_REGREG_OPS(TYPE) \ ++void varith_##TYPE##_reg (TYPE *dst, TYPE *src, int count) \ ++{ \ ++ for (int i = 0; i < count; ++i) \ ++ dst[i] = dst[i] > src[i] ? dst[i] : src[i]; \ ++} ++ ++#define DO_IMMEDIATE_OPS(VALUE, TYPE, NAME) \ ++void varithimm_##NAME##_##TYPE (TYPE *dst, int count) \ ++{ \ ++ for (int i = 0; i < count; ++i) \ ++ dst[i] = dst[i] > (TYPE) VALUE ? dst[i] : (TYPE) VALUE; \ ++} ++ ++#define DO_ARITH_OPS(TYPE) \ ++ DO_REGREG_OPS (TYPE); \ ++ DO_IMMEDIATE_OPS (0, TYPE, 0); \ ++ DO_IMMEDIATE_OPS (86, TYPE, 86); \ ++ DO_IMMEDIATE_OPS (109, TYPE, 109); \ ++ DO_IMMEDIATE_OPS (141, TYPE, 141); \ ++ DO_IMMEDIATE_OPS (-1, TYPE, minus1); \ ++ DO_IMMEDIATE_OPS (-110, TYPE, minus110); \ ++ DO_IMMEDIATE_OPS (-141, TYPE, minus141); ++ ++DO_ARITH_OPS (int8_t) ++DO_ARITH_OPS (int16_t) ++DO_ARITH_OPS (int32_t) ++DO_ARITH_OPS (int64_t) ++ ++/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.b, z[0-9]+\.b, #0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.b, z[0-9]+\.b, #86\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.b, z[0-9]+\.b, #109\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.b, z[0-9]+\.b, #115\n} 1 } } */ ++/* { dg-final { scan-assembler-not {\tsmax\tz[0-9]+\.b, z[0-9]+\.b, #141\n} } } */ ++/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.b, z[0-9]+\.b, #-1\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.b, z[0-9]+\.b, #-110\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.b, z[0-9]+\.b, #-115\n} 1 } } */ ++/* { dg-final { scan-assembler-not {\tsmax\tz[0-9]+\.b, z[0-9]+\.b, #-141\n} } } */ ++ ++/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 3 } } */ ++/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.h, z[0-9]+\.h, #0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.h, z[0-9]+\.h, #86\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.h, z[0-9]+\.h, #109\n} 1 } } */ ++/* { dg-final { scan-assembler-not {\tsmax\tz[0-9]+\.h, z[0-9]+\.h, #141\n} } } */ ++/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.h, z[0-9]+\.h, #-1\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.h, z[0-9]+\.h, #-110\n} 1 } } */ ++/* { dg-final { scan-assembler-not {\tsmax\tz[0-9]+\.h, z[0-9]+\.h, #-141\n} } } */ ++ ++/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 3 } } */ ++/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.s, z[0-9]+\.s, #0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.s, z[0-9]+\.s, #86\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.s, z[0-9]+\.s, #109\n} 1 } } */ ++/* { dg-final { scan-assembler-not {\tsmax\tz[0-9]+\.s, z[0-9]+\.s, #141\n} } } */ ++/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.s, z[0-9]+\.s, #-1\n} 1 } } */ ++/* { dg-final { 
scan-assembler-times {\tsmax\tz[0-9]+\.s, z[0-9]+\.s, #-110\n} 1 } } */ ++/* { dg-final { scan-assembler-not {\tsmax\tz[0-9]+\.s, z[0-9]+\.s, #-141\n} } } */ ++ ++/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */ ++/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.d, z[0-9]+\.d, #0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.d, z[0-9]+\.d, #86\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.d, z[0-9]+\.d, #109\n} 1 } } */ ++/* { dg-final { scan-assembler-not {\tsmax\tz[0-9]+\.d, z[0-9]+\.d, #141\n} } } */ ++/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.d, z[0-9]+\.d, #-1\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.d, z[0-9]+\.d, #-110\n} 1 } } */ ++/* { dg-final { scan-assembler-not {\tsmax\tz[0-9]+\.d, z[0-9]+\.d, #-141\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/smin_1.c b/gcc/testsuite/gcc.target/aarch64/sve/smin_1.c +new file mode 100644 +index 000000000..d6a9e9467 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/smin_1.c +@@ -0,0 +1,71 @@ ++/* { dg-do assemble { target aarch64_asm_sve_ok } } */ ++/* { dg-options "-O3 --save-temps" } */ ++ ++#include ++ ++#define DO_REGREG_OPS(TYPE) \ ++void varith_##TYPE##_reg (TYPE *dst, TYPE *src, int count) \ ++{ \ ++ for (int i = 0; i < count; ++i) \ ++ dst[i] = dst[i] < src[i] ? dst[i] : src[i]; \ ++} ++ ++#define DO_IMMEDIATE_OPS(VALUE, TYPE, NAME) \ ++void varithimm_##NAME##_##TYPE (TYPE *dst, int count) \ ++{ \ ++ for (int i = 0; i < count; ++i) \ ++ dst[i] = dst[i] < (TYPE) VALUE ? dst[i] : (TYPE) VALUE; \ ++} ++ ++#define DO_ARITH_OPS(TYPE) \ ++ DO_REGREG_OPS (TYPE); \ ++ DO_IMMEDIATE_OPS (0, TYPE, 0); \ ++ DO_IMMEDIATE_OPS (86, TYPE, 86); \ ++ DO_IMMEDIATE_OPS (109, TYPE, 109); \ ++ DO_IMMEDIATE_OPS (141, TYPE, 141); \ ++ DO_IMMEDIATE_OPS (-1, TYPE, minus1); \ ++ DO_IMMEDIATE_OPS (-110, TYPE, minus110); \ ++ DO_IMMEDIATE_OPS (-141, TYPE, minus141); ++ ++DO_ARITH_OPS (int8_t) ++DO_ARITH_OPS (int16_t) ++DO_ARITH_OPS (int32_t) ++DO_ARITH_OPS (int64_t) ++ ++/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.b, z[0-9]+\.b, #0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.b, z[0-9]+\.b, #86\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.b, z[0-9]+\.b, #109\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.b, z[0-9]+\.b, #115\n} 1 } } */ ++/* { dg-final { scan-assembler-not {\tsmin\tz[0-9]+\.b, z[0-9]+\.b, #141\n} } } */ ++/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.b, z[0-9]+\.b, #-1\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.b, z[0-9]+\.b, #-110\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.b, z[0-9]+\.b, #-115\n} 1 } } */ ++/* { dg-final { scan-assembler-not {\tsmin\tz[0-9]+\.b, z[0-9]+\.b, #-141\n} } } */ ++ ++/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 3 } } */ ++/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.h, z[0-9]+\.h, #0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.h, z[0-9]+\.h, #86\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.h, z[0-9]+\.h, #109\n} 1 } } */ ++/* { dg-final { scan-assembler-not {\tsmin\tz[0-9]+\.h, z[0-9]+\.h, #141\n} } } */ ++/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.h, z[0-9]+\.h, #-1\n} 1 } } */ ++/* { 
dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.h, z[0-9]+\.h, #-110\n} 1 } } */ ++/* { dg-final { scan-assembler-not {\tsmin\tz[0-9]+\.h, z[0-9]+\.h, #-141\n} } } */ ++ ++/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 3 } } */ ++/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.s, z[0-9]+\.s, #0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.s, z[0-9]+\.s, #86\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.s, z[0-9]+\.s, #109\n} 1 } } */ ++/* { dg-final { scan-assembler-not {\tsmin\tz[0-9]+\.s, z[0-9]+\.s, #141\n} } } */ ++/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.s, z[0-9]+\.s, #-1\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.s, z[0-9]+\.s, #-110\n} 1 } } */ ++/* { dg-final { scan-assembler-not {\tsmin\tz[0-9]+\.s, z[0-9]+\.s, #-141\n} } } */ ++ ++/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */ ++/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.d, z[0-9]+\.d, #0\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.d, z[0-9]+\.d, #86\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.d, z[0-9]+\.d, #109\n} 1 } } */ ++/* { dg-final { scan-assembler-not {\tsmin\tz[0-9]+\.d, z[0-9]+\.d, #141\n} } } */ ++/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.d, z[0-9]+\.d, #-1\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.d, z[0-9]+\.d, #-110\n} 1 } } */ ++/* { dg-final { scan-assembler-not {\tsmin\tz[0-9]+\.d, z[0-9]+\.d, #-141\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/spill_2.c b/gcc/testsuite/gcc.target/aarch64/sve/spill_2.c +index 28fcc4429..fcd481611 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/spill_2.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/spill_2.c +@@ -9,29 +9,30 @@ void consumer (void *); + void \ + multi_loop_##TYPE (TYPE *x, TYPE val) \ + { \ +- for (int i = 0; i < 7; ++i) \ ++ for (int i = 0; i < 9; ++i) \ + x[i] += val; \ + consumer (x); \ +- for (int i = 0; i < 7; ++i) \ ++ for (int i = 0; i < 9; ++i) \ + x[i] += val; \ + consumer (x); \ +- for (int i = 0; i < 7; ++i) \ ++ for (int i = 0; i < 9; ++i) \ + x[i] += val; \ + consumer (x); \ + } + + /* One iteration is enough. */ + TEST_LOOP (uint8_t); ++/* Two iterations are enough. We specialize the second two loops based ++ on whether the first executes once or twice. */ + TEST_LOOP (uint16_t); +-/* Two iterations are enough. Complete unrolling makes sense +- even at -O2. */ ++/* Three iterations are needed; ought to stay a loop. */ + TEST_LOOP (uint32_t); +-/* Four iterations are needed; ought to stay a loop. */ ++/* Five iterations are needed; ought to stay a loop. 
*/ + TEST_LOOP (uint64_t); + + /* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]\.b} 3 } } */ +-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]\.h} 3 } } */ +-/* { dg-final { scan-assembler {\twhilelo\tp[0-9]\.s} } } */ ++/* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]\.h} 8 } } */ ++/* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]\.s} 6 } } */ + /* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]\.d} 6 } } */ + /* { dg-final { scan-assembler-not {\tldr\tz[0-9]} } } */ + /* { dg-final { scan-assembler-not {\tstr\tz[0-9]} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/spill_4.c b/gcc/testsuite/gcc.target/aarch64/sve/spill_4.c +index 29e1a49dc..81b3f6452 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/spill_4.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/spill_4.c +@@ -24,10 +24,11 @@ TEST_LOOP (uint16_t, 0x1234); + TEST_LOOP (uint32_t, 0x12345); + TEST_LOOP (uint64_t, 0x123456); + +-/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.h,} 3 } } */ +-/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.s,} 3 } } */ +-/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.d,} 3 } } */ +-/* { dg-final { scan-assembler-times {\tld1rh\tz[0-9]+\.h,} 3 } } */ ++/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.b,} 6 } } */ ++/* { dg-final { scan-assembler-not {\tptrue\tp[0-9]+\.h,} } } */ ++/* { dg-final { scan-assembler-not {\tptrue\tp[0-9]+\.s,} } } */ ++/* { dg-final { scan-assembler-not {\tptrue\tp[0-9]+\.d,} } } */ ++/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, w[0-9]+\n} 3 } } */ + /* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s,} 3 } } */ + /* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d,} 3 } } */ + /* { dg-final { scan-assembler-not {\tldr\tz[0-9]} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_1.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_1.c +index 6e3c8898a..918a58138 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_1.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_1.c +@@ -83,9 +83,9 @@ NAME(g4) (TYPE *__restrict a, TYPE *__restrict b, TYPE *__restrict c, + } + } + +-/* { dg-final { scan-assembler {\tld2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+, x[0-9]+\]\n} } } */ + /* { dg-final { scan-assembler {\tld3b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} } } */ +-/* { dg-final { scan-assembler {\tld4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} } } */ +-/* { dg-final { scan-assembler {\tst2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+, x[0-9]+\]\n} } } */ ++/* { dg-final { scan-assembler {\tst2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+, x[0-9]+\]\n} } } */ + /* { dg-final { scan-assembler {\tst3b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} } } */ +-/* { dg-final { scan-assembler {\tst4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} } } */ ++/* { dg-final { scan-assembler {\tst4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+, x[0-9]+\]\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_14.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_14.c +index 45644b67b..a16a79e51 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_14.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_14.c +@@ -43,12 +43,12 @@ + #undef NAME + #undef TYPE + +-/* { dg-final { scan-assembler-times 
{\tld2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tld2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+, x[0-9]+\]\n} 1 } } */ + /* { dg-final { scan-assembler-times {\tld3b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} 1 } } */ +-/* { dg-final { scan-assembler-times {\tld4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} 1 } } */ +-/* { dg-final { scan-assembler-times {\tst2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tld4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+, x[0-9]+\]\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tst2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+, x[0-9]+\]\n} 1 } } */ + /* { dg-final { scan-assembler-times {\tst3b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} 1 } } */ +-/* { dg-final { scan-assembler-times {\tst4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tst4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+, x[0-9]+\]\n} 1 } } */ + + /* { dg-final { scan-assembler-times {\tld2h\t{z[0-9]+.h - z[0-9]+.h}, p[0-7]/z, \[x[0-9]+\]\n} 2 } } */ + /* { dg-final { scan-assembler-times {\tld3h\t{z[0-9]+.h - z[0-9]+.h}, p[0-7]/z, \[x[0-9]+\]\n} 2 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_15.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_15.c +index 814dbb3ae..bc00267c8 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_15.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_15.c +@@ -3,12 +3,12 @@ + + #include "struct_vect_14.c" + +-/* { dg-final { scan-assembler-times {\tld2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tld2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+, x[0-9]+\]\n} 1 } } */ + /* { dg-final { scan-assembler-times {\tld3b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} 1 } } */ +-/* { dg-final { scan-assembler-times {\tld4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} 1 } } */ +-/* { dg-final { scan-assembler-times {\tst2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tld4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+, x[0-9]+\]\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tst2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+, x[0-9]+\]\n} 1 } } */ + /* { dg-final { scan-assembler-times {\tst3b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} 1 } } */ +-/* { dg-final { scan-assembler-times {\tst4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tst4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+, x[0-9]+\]\n} 1 } } */ + + /* { dg-final { scan-assembler-times {\tld2h\t{z[0-9]+.h - z[0-9]+.h}, p[0-7]/z, \[x[0-9]+\]\n} 2 } } */ + /* { dg-final { scan-assembler-times {\tld3h\t{z[0-9]+.h - z[0-9]+.h}, p[0-7]/z, \[x[0-9]+\]\n} 2 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_16.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_16.c +index 6ecf89b54..9e2a549f5 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_16.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_16.c +@@ -3,12 +3,12 @@ + + #include "struct_vect_14.c" + +-/* { dg-final { scan-assembler-times {\tld2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tld2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+, x[0-9]+\]\n} 1 } } */ + /* { dg-final { scan-assembler-times {\tld3b\t{z[0-9]+.b - z[0-9]+.b}, 
p[0-7]/z, \[x[0-9]+\]\n} 1 } } */ +-/* { dg-final { scan-assembler-times {\tld4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} 1 } } */ +-/* { dg-final { scan-assembler-times {\tst2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tld4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+, x[0-9]+\]\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tst2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+, x[0-9]+\]\n} 1 } } */ + /* { dg-final { scan-assembler-times {\tst3b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} 1 } } */ +-/* { dg-final { scan-assembler-times {\tst4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tst4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+, x[0-9]+\]\n} 1 } } */ + + /* { dg-final { scan-assembler-times {\tld2h\t{z[0-9]+.h - z[0-9]+.h}, p[0-7]/z, \[x[0-9]+\]\n} 2 } } */ + /* { dg-final { scan-assembler-times {\tld3h\t{z[0-9]+.h - z[0-9]+.h}, p[0-7]/z, \[x[0-9]+\]\n} 2 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_17.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_17.c +index 571c6d0d3..e791e2e12 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_17.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_17.c +@@ -3,12 +3,12 @@ + + #include "struct_vect_14.c" + +-/* { dg-final { scan-assembler-times {\tld2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tld2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+, x[0-9]+\]\n} 1 } } */ + /* { dg-final { scan-assembler-times {\tld3b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} 1 } } */ +-/* { dg-final { scan-assembler-times {\tld4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} 1 } } */ +-/* { dg-final { scan-assembler-times {\tst2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tld4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+, x[0-9]+\]\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tst2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+, x[0-9]+\]\n} 1 } } */ + /* { dg-final { scan-assembler-times {\tst3b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} 1 } } */ +-/* { dg-final { scan-assembler-times {\tst4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tst4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+, x[0-9]+\]\n} 1 } } */ + + /* { dg-final { scan-assembler-times {\tld2h\t{z[0-9]+.h - z[0-9]+.h}, p[0-7]/z, \[x[0-9]+\]\n} 2 } } */ + /* { dg-final { scan-assembler-times {\tld3h\t{z[0-9]+.h - z[0-9]+.h}, p[0-7]/z, \[x[0-9]+\]\n} 2 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_18.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_18.c +index dc912e63c..3bc53b69d 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_18.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_18.c +@@ -46,4 +46,4 @@ TEST (test) + /* { dg-final { scan-assembler-times {\tstr\td} 1 } } */ + + /* The only branches should be in the vectorized loop. 
*/ +-/* { dg-final { scan-assembler-times {\tb[a-z]+\t} 4 } } */ ++/* { dg-final { scan-assembler-times {\tb[.a-z]+\t} 4 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_19.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_19.c +index 6568dc71c..833bf0669 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_19.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_19.c +@@ -46,4 +46,4 @@ TEST (test) + /* Each function should have three branches: one directly to the exit + (n <= 0), one to the single scalar epilogue iteration (n == 1), + and one branch-back for the vectorized loop. */ +-/* { dg-final { scan-assembler-times {\tb[a-z]+\t} 12 } } */ ++/* { dg-final { scan-assembler-times {\tb[.a-z]+\t} 12 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_20.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_20.c +index 6c3520c2f..858ca74f8 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_20.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_20.c +@@ -46,4 +46,4 @@ TEST (test) + /* { dg-final { scan-assembler-times {\tstr\td} 1 } } */ + + /* The only branches should be in the vectorized loop. */ +-/* { dg-final { scan-assembler-times {\tb[a-z]+\t} 4 } } */ ++/* { dg-final { scan-assembler-times {\tb[.a-z]+\t} 4 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_21.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_21.c +index 4b2a5e463..95691fe9e 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_21.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_21.c +@@ -46,4 +46,4 @@ TEST (test) + /* Each function should have three branches: one directly to the exit + (n <= 0), one to the single scalar epilogue iteration (n == 1), + and one branch-back for the vectorized loop. */ +-/* { dg-final { scan-assembler-times {\tb[a-z]+\t} 12 } } */ ++/* { dg-final { scan-assembler-times {\tb[.a-z]+\t} 12 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_22.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_22.c +index b61536053..8eb072505 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_22.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_22.c +@@ -46,4 +46,4 @@ TEST (test) + /* { dg-final { scan-assembler-times {\tstr\td} 1 } } */ + + /* The only branches should be in the vectorized loop. */ +-/* { dg-final { scan-assembler-times {\tb[a-z]+\t} 4 } } */ ++/* { dg-final { scan-assembler-times {\tb[.a-z]+\t} 4 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_23.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_23.c +index b529e0386..705b2350a 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_23.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_23.c +@@ -46,4 +46,4 @@ TEST (test) + /* Each function should have three branches: one directly to the exit + (n <= 0), one to the single scalar epilogue iteration (n == 1), + and one branch-back for the vectorized loop. 
*/ +-/* { dg-final { scan-assembler-times {\tb[a-z]+\t} 12 } } */ ++/* { dg-final { scan-assembler-times {\tb[.a-z]+\t} 12 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_7.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_7.c +index b74190149..3d3070e77 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_7.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_7.c +@@ -78,9 +78,9 @@ g4 (TYPE *__restrict a, TYPE *__restrict b, TYPE *__restrict c, + } + } + +-/* { dg-final { scan-assembler {\tld2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} } } */ ++/* { dg-final { scan-assembler {\tld2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+, x[0-9]+\]\n} } } */ + /* { dg-final { scan-assembler {\tld3b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} } } */ +-/* { dg-final { scan-assembler {\tld4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} } } */ +-/* { dg-final { scan-assembler {\tst2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} } } */ ++/* { dg-final { scan-assembler {\tld4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+, x[0-9]+\]\n} } } */ ++/* { dg-final { scan-assembler {\tst2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+, x[0-9]+\]\n} } } */ + /* { dg-final { scan-assembler {\tst3b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} } } */ +-/* { dg-final { scan-assembler {\tst4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} } } */ ++/* { dg-final { scan-assembler {\tst4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+, x[0-9]+\]\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/umax_1.c b/gcc/testsuite/gcc.target/aarch64/sve/umax_1.c +new file mode 100644 +index 000000000..fffedb9c3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/umax_1.c +@@ -0,0 +1,65 @@ ++/* { dg-do assemble { target aarch64_asm_sve_ok } } */ ++/* { dg-options "-O3 --save-temps" } */ ++ ++#include <stdint.h> ++ ++#define DO_REGREG_OPS(TYPE) \ ++void varith_##TYPE##_reg (TYPE *dst, TYPE *src, int count) \ ++{ \ ++ for (int i = 0; i < count; ++i) \ ++ dst[i] = dst[i] > src[i] ? dst[i] : src[i]; \ ++} ++ ++#define DO_IMMEDIATE_OPS(VALUE, TYPE) \ ++void varithimm_##VALUE##_##TYPE (TYPE *dst, int count) \ ++{ \ ++ for (int i = 0; i < count; ++i) \ ++ dst[i] = dst[i] > (TYPE) VALUE ?
dst[i] : (TYPE) VALUE; \ ++} ++ ++#define DO_ARITH_OPS(TYPE) \ ++ DO_REGREG_OPS (TYPE); \ ++ DO_IMMEDIATE_OPS (2, TYPE); \ ++ DO_IMMEDIATE_OPS (86, TYPE); \ ++ DO_IMMEDIATE_OPS (109, TYPE); \ ++ DO_IMMEDIATE_OPS (141, TYPE); \ ++ DO_IMMEDIATE_OPS (229, TYPE); \ ++ DO_IMMEDIATE_OPS (255, TYPE); \ ++ DO_IMMEDIATE_OPS (256, TYPE); ++ ++DO_ARITH_OPS (uint8_t) ++DO_ARITH_OPS (uint16_t) ++DO_ARITH_OPS (uint32_t) ++DO_ARITH_OPS (uint64_t) ++ ++/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.b, z[0-9]+\.b, #86\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.b, z[0-9]+\.b, #109\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.b, z[0-9]+\.b, #141\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.b, z[0-9]+\.b, #229\n} 1 } } */ ++/* { dg-final { scan-assembler-not {\tumax\tz[0-9]+\.b, z[0-9]+\.b, #255\n} } } */ ++/* { dg-final { scan-assembler-not {\tumax\tz[0-9]+\.b, z[0-9]+\.b, #256\n} } } */ ++ ++/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, z[0-9]+\.h, #86\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, z[0-9]+\.h, #109\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, z[0-9]+\.h, #141\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, z[0-9]+\.h, #229\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, z[0-9]+\.h, #255\n} 1 } } */ ++/* { dg-final { scan-assembler-not {\tumax\tz[0-9]+\.h, z[0-9]+\.h, #256\n} } } */ ++ ++/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.s, z[0-9]+\.s, #86\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.s, z[0-9]+\.s, #109\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.s, z[0-9]+\.s, #141\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.s, z[0-9]+\.s, #229\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.s, z[0-9]+\.s, #255\n} 1 } } */ ++/* { dg-final { scan-assembler-not {\tumax\tz[0-9]+\.s, z[0-9]+\.s, #256\n} } } */ ++ ++/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.d, z[0-9]+\.d, #86\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.d, z[0-9]+\.d, #109\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.d, z[0-9]+\.d, #141\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.d, z[0-9]+\.d, #229\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.d, z[0-9]+\.d, #255\n} 1 } } */ ++/* { dg-final { scan-assembler-not {\tumax\tz[0-9]+\.d, z[0-9]+\.d, #256\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/umin_1.c b/gcc/testsuite/gcc.target/aarch64/sve/umin_1.c +new file mode 100644 +index 000000000..f7cdba3b7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/umin_1.c +@@ -0,0 +1,65 @@ ++/* { dg-do assemble { target aarch64_asm_sve_ok } } */ ++/* { dg-options "-O3 --save-temps" } */ ++ ++#include <stdint.h> ++ ++#define DO_REGREG_OPS(TYPE) \ ++void varith_##TYPE##_reg (TYPE *dst, TYPE *src, int count) \ ++{ \ ++ for (int i = 0; i < count; ++i) \ ++ dst[i] = dst[i] < src[i] ?
dst[i] : src[i]; \ ++} ++ ++#define DO_IMMEDIATE_OPS(VALUE, TYPE) \ ++void varithimm_##VALUE##_##TYPE (TYPE *dst, int count) \ ++{ \ ++ for (int i = 0; i < count; ++i) \ ++ dst[i] = dst[i] < (TYPE) VALUE ? dst[i] : (TYPE) VALUE; \ ++} ++ ++#define DO_ARITH_OPS(TYPE) \ ++ DO_REGREG_OPS (TYPE); \ ++ DO_IMMEDIATE_OPS (2, TYPE); \ ++ DO_IMMEDIATE_OPS (86, TYPE); \ ++ DO_IMMEDIATE_OPS (109, TYPE); \ ++ DO_IMMEDIATE_OPS (141, TYPE); \ ++ DO_IMMEDIATE_OPS (229, TYPE); \ ++ DO_IMMEDIATE_OPS (255, TYPE); \ ++ DO_IMMEDIATE_OPS (256, TYPE); ++ ++DO_ARITH_OPS (uint8_t) ++DO_ARITH_OPS (uint16_t) ++DO_ARITH_OPS (uint32_t) ++DO_ARITH_OPS (uint64_t) ++ ++/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.b, z[0-9]+\.b, #86\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.b, z[0-9]+\.b, #109\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.b, z[0-9]+\.b, #141\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.b, z[0-9]+\.b, #229\n} 1 } } */ ++/* { dg-final { scan-assembler-not {\tumin\tz[0-9]+\.b, z[0-9]+\.b, #255\n} } } */ ++/* { dg-final { scan-assembler-not {\tumin\tz[0-9]+\.b, z[0-9]+\.b, #256\n} } } */ ++ ++/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, z[0-9]+\.h, #86\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, z[0-9]+\.h, #109\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, z[0-9]+\.h, #141\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, z[0-9]+\.h, #229\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, z[0-9]+\.h, #255\n} 1 } } */ ++/* { dg-final { scan-assembler-not {\tumin\tz[0-9]+\.h, z[0-9]+\.h, #256\n} } } */ ++ ++/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.s, z[0-9]+\.s, #86\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.s, z[0-9]+\.s, #109\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.s, z[0-9]+\.s, #141\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.s, z[0-9]+\.s, #229\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.s, z[0-9]+\.s, #255\n} 1 } } */ ++/* { dg-final { scan-assembler-not {\tumin\tz[0-9]+\.s, z[0-9]+\.s, #256\n} } } */ ++ ++/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.d, z[0-9]+\.d, #86\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.d, z[0-9]+\.d, #109\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.d, z[0-9]+\.d, #141\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.d, z[0-9]+\.d, #229\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.d, z[0-9]+\.d, #255\n} 1 } } */ ++/* { dg-final { scan-assembler-not {\tumin\tz[0-9]+\.d, z[0-9]+\.d, #256\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unroll-1.c b/gcc/testsuite/gcc.target/aarch64/sve/unroll-1.c +index d4353009e..e33777fc3 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/unroll-1.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/unroll-1.c +@@ -10,4 +10,4 @@ fully_peel_me (double *x) + x[i] = x[i] * 2; + } + +-/* { dg-final { scan-assembler-times 
{b..\t\.L.\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tb[.a-z]+\t} 1 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vcond_17.c b/gcc/testsuite/gcc.target/aarch64/sve/vcond_17.c +new file mode 100644 +index 000000000..cabcfa73e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/vcond_17.c +@@ -0,0 +1,94 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include ++ ++#define eq(A, B) ((A) == (B)) ++#define ne(A, B) ((A) != (B)) ++#define olt(A, B) ((A) < (B)) ++#define ole(A, B) ((A) <= (B)) ++#define oge(A, B) ((A) >= (B)) ++#define ogt(A, B) ((A) > (B)) ++#define ordered(A, B) (!__builtin_isunordered (A, B)) ++#define unordered(A, B) (__builtin_isunordered (A, B)) ++#define ueq(A, B) (!__builtin_islessgreater (A, B)) ++#define ult(A, B) (__builtin_isless (A, B)) ++#define ule(A, B) (__builtin_islessequal (A, B)) ++#define uge(A, B) (__builtin_isgreaterequal (A, B)) ++#define ugt(A, B) (__builtin_isgreater (A, B)) ++#define nueq(A, B) (__builtin_islessgreater (A, B)) ++#define nult(A, B) (!__builtin_isless (A, B)) ++#define nule(A, B) (!__builtin_islessequal (A, B)) ++#define nuge(A, B) (!__builtin_isgreaterequal (A, B)) ++#define nugt(A, B) (!__builtin_isgreater (A, B)) ++ ++#define DEF_LOOP(CMP, EXPECT_INVALID) \ ++ void __attribute__ ((noinline, noclone)) \ ++ test_##CMP##_var (__fp16 *restrict dest, __fp16 *restrict src, \ ++ __fp16 fallback, __fp16 *restrict a, \ ++ __fp16 *restrict b, int count) \ ++ { \ ++ for (int i = 0; i < count; ++i) \ ++ dest[i] = CMP (a[i], b[i]) ? src[i] : fallback; \ ++ } \ ++ \ ++ void __attribute__ ((noinline, noclone)) \ ++ test_##CMP##_zero (__fp16 *restrict dest, __fp16 *restrict src, \ ++ __fp16 fallback, __fp16 *restrict a, \ ++ int count) \ ++ { \ ++ for (int i = 0; i < count; ++i) \ ++ dest[i] = CMP (a[i], (__fp16) 0) ? src[i] : fallback; \ ++ } \ ++ \ ++ void __attribute__ ((noinline, noclone)) \ ++ test_##CMP##_sel (__fp16 *restrict dest, __fp16 if_true, \ ++ __fp16 if_false, __fp16 *restrict a, \ ++ __fp16 b, int count) \ ++ { \ ++ for (int i = 0; i < count; ++i) \ ++ dest[i] = CMP (a[i], b) ? 
if_true : if_false; \ ++} ++ ++#define TEST_ALL(T) \ ++ T (eq, 0) \ ++ T (ne, 0) \ ++ T (olt, 1) \ ++ T (ole, 1) \ ++ T (oge, 1) \ ++ T (ogt, 1) \ ++ T (ordered, 0) \ ++ T (unordered, 0) \ ++ T (ueq, 0) \ ++ T (ult, 0) \ ++ T (ule, 0) \ ++ T (uge, 0) \ ++ T (ugt, 0) \ ++ T (nueq, 0) \ ++ T (nult, 0) \ ++ T (nule, 0) \ ++ T (nuge, 0) \ ++ T (nugt, 0) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler {\tfcmeq\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, #0\.0\n} { xfail *-*-* } } } */ ++/* { dg-final { scan-assembler {\tfcmeq\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} } } */ ++ ++/* { dg-final { scan-assembler {\tfcmne\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, #0\.0\n} } } */ ++/* { dg-final { scan-assembler {\tfcmne\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} } } */ ++ ++/* { dg-final { scan-assembler {\tfcmlt\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, #0\.0\n} } } */ ++/* { dg-final { scan-assembler {\tfcmlt\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} } } */ ++ ++/* { dg-final { scan-assembler {\tfcmle\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, #0\.0\n} } } */ ++/* { dg-final { scan-assembler {\tfcmle\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} } } */ ++ ++/* { dg-final { scan-assembler {\tfcmgt\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, #0\.0\n} } } */ ++/* { dg-final { scan-assembler {\tfcmgt\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} } } */ ++ ++/* { dg-final { scan-assembler {\tfcmge\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, #0\.0\n} } } */ ++/* { dg-final { scan-assembler {\tfcmge\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} } } */ ++ ++/* { dg-final { scan-assembler-not {\tfcmuo\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, #0\.0\n} } } */ ++/* { dg-final { scan-assembler {\tfcmuo\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vcond_17_run.c b/gcc/testsuite/gcc.target/aarch64/sve/vcond_17_run.c +new file mode 100644 +index 000000000..4a228c8c2 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/vcond_17_run.c +@@ -0,0 +1,54 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++/* { dg-require-effective-target fenv_exceptions } */ ++ ++#include <fenv.h> ++ ++#include "vcond_17.c" ++ ++#define N 401 ++ ++#define TEST_LOOP(CMP, EXPECT_INVALID) \ ++ { \ ++ __fp16 dest1[N], dest2[N], dest3[N], src[N]; \ ++ __fp16 a[N], b[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ src[i] = i * i; \ ++ if (i % 5 == 0) \ ++ a[i] = 0; \ ++ else if (i % 3) \ ++ a[i] = i * 0.1; \ ++ else \ ++ a[i] = i; \ ++ if (i % 7 == 0) \ ++ b[i] = __builtin_nan (""); \ ++ else if (i % 6) \ ++ b[i] = i * 0.1; \ ++ else \ ++ b[i] = i; \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ feclearexcept (FE_ALL_EXCEPT); \ ++ test_##CMP##_var (dest1, src, 11, a, b, N); \ ++ test_##CMP##_zero (dest2, src, 22, a, N); \ ++ test_##CMP##_sel (dest3, 33, 44, a, 9, N); \ ++ if (!fetestexcept (FE_INVALID) != !(EXPECT_INVALID)) \ ++ __builtin_abort (); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ if (dest1[i] != (CMP (a[i], b[i]) ? src[i] : 11)) \ ++ __builtin_abort (); \ ++ if (dest2[i] != (CMP (a[i], 0) ? src[i] : 22)) \ ++ __builtin_abort (); \ ++ if (dest3[i] != (CMP (a[i], 9) ?
33 : 44)) \ ++ __builtin_abort (); \ ++ } \ ++ } ++ ++int __attribute__ ((optimize (1))) ++main (void) ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vcond_18.c b/gcc/testsuite/gcc.target/aarch64/sve/vcond_18.c +new file mode 100644 +index 000000000..a2590b9ee +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/vcond_18.c +@@ -0,0 +1,44 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#define DEF_LOOP(TYPE, NAME, CONST) \ ++ void \ ++ test_##TYPE##_##NAME (TYPE *restrict x, \ ++ TYPE *restrict pred, int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ x[i] = pred[i] > 0 ? CONST : 0; \ ++ } ++ ++#define TEST_TYPE(T, TYPE) \ ++ T (TYPE, 2, 2.0) \ ++ T (TYPE, 1p25, 1.25) \ ++ T (TYPE, 32p25, 32.25) \ ++ T (TYPE, m4, -4.0) \ ++ T (TYPE, m2p5, -2.5) \ ++ T (TYPE, m64p5, -64.5) ++ ++#define TEST_ALL(T) \ ++ TEST_TYPE (T, _Float16) \ ++ TEST_TYPE (T, float) \ ++ TEST_TYPE (T, double) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, p[0-7]/z, #16384\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, p[0-7]/z, #15616\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, p[0-7]/z, #-15360\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, p[0-7]/z, #-16128\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.h, p[0-7], z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ ++ ++/* { dg-final { scan-assembler {\tmovprfx\t(z[0-9]+\.s), (p[0-7])/z, \1\n\tfmov\t\1, \2/m, #2\.0(?:e[+]0)?\n} } } */ ++/* { dg-final { scan-assembler {\tmovprfx\t(z[0-9]+\.s), (p[0-7])/z, \1\n\tfmov\t\1, \2/m, #1\.25(?:e[+]0)?\n} } } */ ++/* { dg-final { scan-assembler {\tmovprfx\t(z[0-9]+\.s), (p[0-7])/z, \1\n\tfmov\t\1, \2/m, #-4\.0(?:e[+]0)?\n} } } */ ++/* { dg-final { scan-assembler {\tmovprfx\t(z[0-9]+\.s), (p[0-7])/z, \1\n\tfmov\t\1, \2/m, #-2\.5(?:e[+]0)?\n} } } */ ++/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.s, p[0-7], z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ ++ ++/* { dg-final { scan-assembler {\tmovprfx\t(z[0-9]+\.d), (p[0-7])/z, \1\n\tfmov\t\1, \2/m, #2\.0(?:e[+]0)?\n} } } */ ++/* { dg-final { scan-assembler {\tmovprfx\t(z[0-9]+\.d), (p[0-7])/z, \1\n\tfmov\t\1, \2/m, #1\.25(?:e[+]0)?\n} } } */ ++/* { dg-final { scan-assembler {\tmovprfx\t(z[0-9]+\.d), (p[0-7])/z, \1\n\tfmov\t\1, \2/m, #-4\.0(?:e[+]0)?\n} } } */ ++/* { dg-final { scan-assembler {\tmovprfx\t(z[0-9]+\.d), (p[0-7])/z, \1\n\tfmov\t\1, \2/m, #-2\.5(?:e[+]0)?\n} } } */ ++/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.d, p[0-7], z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vcond_18_run.c b/gcc/testsuite/gcc.target/aarch64/sve/vcond_18_run.c +new file mode 100644 +index 000000000..279b0a3ba +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/vcond_18_run.c +@@ -0,0 +1,30 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "vcond_18.c" ++ ++#define N 97 ++ ++#define TEST_LOOP(TYPE, NAME, CONST) \ ++ { \ ++ TYPE x[N], pred[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ pred[i] = i % 5 <= i % 6; \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##TYPE##_##NAME (x, pred, N); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ if (x[i] != (TYPE) (pred[i] > 0 ? 
CONST : 0)) \ ++ __builtin_abort (); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ } ++ ++int __attribute__ ((optimize (1))) ++main (int argc, char **argv) ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vcond_19.c b/gcc/testsuite/gcc.target/aarch64/sve/vcond_19.c +new file mode 100644 +index 000000000..2347b7f28 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/vcond_19.c +@@ -0,0 +1,46 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#define DEF_LOOP(TYPE, NAME, CONST) \ ++ void \ ++ test_##TYPE##_##NAME (TYPE *restrict x, \ ++ TYPE *restrict pred, int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ x[i] = pred[i] > 0 ? CONST : pred[i]; \ ++ } ++ ++#define TEST_TYPE(T, TYPE) \ ++ T (TYPE, 2, 2.0) \ ++ T (TYPE, 1p25, 1.25) \ ++ T (TYPE, 32p25, 32.25) \ ++ T (TYPE, m4, -4.0) \ ++ T (TYPE, m2p5, -2.5) \ ++ T (TYPE, m64p5, -64.5) ++ ++#define TEST_ALL(T) \ ++ TEST_TYPE (T, _Float16) \ ++ TEST_TYPE (T, float) \ ++ TEST_TYPE (T, double) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, p[0-7]/m, #16384\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, p[0-7]/m, #15616\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, p[0-7]/m, #-15360\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, p[0-7]/m, #-16128\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.h, p[0-7], z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ ++ ++/* { dg-final { scan-assembler {\tfmov\t(z[0-9]+\.s), p[0-7]/m, #2\.0(?:e[+]0)?\n} } } */ ++/* { dg-final { scan-assembler {\tfmov\t(z[0-9]+\.s), p[0-7]/m, #1\.25(?:e[+]0)?\n} } } */ ++/* { dg-final { scan-assembler {\tfmov\t(z[0-9]+\.s), p[0-7]/m, #-4\.0(?:e[+]0)?\n} } } */ ++/* { dg-final { scan-assembler {\tfmov\t(z[0-9]+\.s), p[0-7]/m, #-2\.5(?:e[+]0)?\n} } } */ ++/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.s, p[0-7], z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ ++ ++/* { dg-final { scan-assembler {\tfmov\t(z[0-9]+\.d), p[0-7]/m, #2\.0(?:e[+]0)?\n} } } */ ++/* { dg-final { scan-assembler {\tfmov\t(z[0-9]+\.d), p[0-7]/m, #1\.25(?:e[+]0)?\n} } } */ ++/* { dg-final { scan-assembler {\tfmov\t(z[0-9]+\.d), p[0-7]/m, #-4\.0(?:e[+]0)?\n} } } */ ++/* { dg-final { scan-assembler {\tfmov\t(z[0-9]+\.d), p[0-7]/m, #-2\.5(?:e[+]0)?\n} } } */ ++/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.d, p[0-7], z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ ++ ++/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vcond_19_run.c b/gcc/testsuite/gcc.target/aarch64/sve/vcond_19_run.c +new file mode 100644 +index 000000000..d93d8aa45 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/vcond_19_run.c +@@ -0,0 +1,30 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "vcond_19.c" ++ ++#define N 97 ++ ++#define TEST_LOOP(TYPE, NAME, CONST) \ ++ { \ ++ TYPE x[N], pred[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ pred[i] = i % 5 <= i % 6 ? i : 0; \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##TYPE##_##NAME (x, pred, N); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ if (x[i] != (TYPE) (pred[i] > 0 ? 
CONST : pred[i])) \ ++ __builtin_abort (); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ } ++ ++int __attribute__ ((optimize (1))) ++main (int argc, char **argv) ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vcond_20.c b/gcc/testsuite/gcc.target/aarch64/sve/vcond_20.c +new file mode 100644 +index 000000000..bf2af1c62 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/vcond_20.c +@@ -0,0 +1,46 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#define DEF_LOOP(TYPE, NAME, CONST) \ ++ void \ ++ test_##TYPE##_##NAME (TYPE *restrict x, \ ++ TYPE *restrict pred, int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ x[i] = pred[i] > 0 ? CONST : 12.0; \ ++ } ++ ++#define TEST_TYPE(T, TYPE) \ ++ T (TYPE, 2, 2.0) \ ++ T (TYPE, 1p25, 1.25) \ ++ T (TYPE, 32p25, 32.25) \ ++ T (TYPE, m4, -4.0) \ ++ T (TYPE, m2p5, -2.5) \ ++ T (TYPE, m64p5, -64.5) ++ ++#define TEST_ALL(T) \ ++ TEST_TYPE (T, _Float16) \ ++ TEST_TYPE (T, float) \ ++ TEST_TYPE (T, double) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, p[0-7]/m, #16384\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, p[0-7]/m, #15616\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, p[0-7]/m, #-15360\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, p[0-7]/m, #-16128\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.h, p[0-7], z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ ++ ++/* { dg-final { scan-assembler {\tfmov\t(z[0-9]+\.s), p[0-7]/m, #2\.0(?:e[+]0)?\n} } } */ ++/* { dg-final { scan-assembler {\tfmov\t(z[0-9]+\.s), p[0-7]/m, #1\.25(?:e[+]0)?\n} } } */ ++/* { dg-final { scan-assembler {\tfmov\t(z[0-9]+\.s), p[0-7]/m, #-4\.0(?:e[+]0)?\n} } } */ ++/* { dg-final { scan-assembler {\tfmov\t(z[0-9]+\.s), p[0-7]/m, #-2\.5(?:e[+]0)?\n} } } */ ++/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.s, p[0-7], z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ ++ ++/* { dg-final { scan-assembler {\tfmov\t(z[0-9]+\.d), p[0-7]/m, #2\.0(?:e[+]0)?\n} } } */ ++/* { dg-final { scan-assembler {\tfmov\t(z[0-9]+\.d), p[0-7]/m, #1\.25(?:e[+]0)?\n} } } */ ++/* { dg-final { scan-assembler {\tfmov\t(z[0-9]+\.d), p[0-7]/m, #-4\.0(?:e[+]0)?\n} } } */ ++/* { dg-final { scan-assembler {\tfmov\t(z[0-9]+\.d), p[0-7]/m, #-2\.5(?:e[+]0)?\n} } } */ ++/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.d, p[0-7], z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ ++ ++/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+, z[0-9]+\n} 12 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vcond_20_run.c b/gcc/testsuite/gcc.target/aarch64/sve/vcond_20_run.c +new file mode 100644 +index 000000000..33c81deaa +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/vcond_20_run.c +@@ -0,0 +1,30 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "vcond_20.c" ++ ++#define N 97 ++ ++#define TEST_LOOP(TYPE, NAME, CONST) \ ++ { \ ++ TYPE x[N], pred[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ pred[i] = i % 5 <= i % 6; \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##TYPE##_##NAME (x, pred, N); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ if (x[i] != (TYPE) (pred[i] > 0 ? 
CONST : 12.0)) \ ++ __builtin_abort (); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ } ++ ++int __attribute__ ((optimize (1))) ++main (int argc, char **argv) ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vcond_21.c b/gcc/testsuite/gcc.target/aarch64/sve/vcond_21.c +new file mode 100644 +index 000000000..d5df2e199 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/vcond_21.c +@@ -0,0 +1,34 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#define DEF_LOOP(TYPE, ABS, NAME, OP) \ ++ void \ ++ test_##TYPE##_##NAME (TYPE *restrict r, \ ++ TYPE *restrict a, \ ++ TYPE *restrict b, int n) \ ++ { \ ++ for (int i = 0; i < n; ++i) \ ++ r[i] = ABS (a[i]) OP ABS (b[i]) ? 1.0 : 0.0; \ ++ } ++ ++#define TEST_TYPE(T, TYPE, ABS) \ ++ T (TYPE, ABS, lt, <) \ ++ T (TYPE, ABS, le, <=) \ ++ T (TYPE, ABS, ge, >=) \ ++ T (TYPE, ABS, gt, >) ++ ++#define TEST_ALL(T) \ ++ TEST_TYPE (T, _Float16, __builtin_fabsf16) \ ++ TEST_TYPE (T, float, __builtin_fabsf) \ ++ TEST_TYPE (T, double, __builtin_fabs) ++ ++TEST_ALL (DEF_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tfac[lg]t\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tfac[lg]e\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfac[lg]t\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tfac[lg]e\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ ++ ++/* { dg-final { scan-assembler-times {\tfac[lg]t\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ ++/* { dg-final { scan-assembler-times {\tfac[lg]e\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vcond_21_run.c b/gcc/testsuite/gcc.target/aarch64/sve/vcond_21_run.c +new file mode 100644 +index 000000000..15c551324 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/vcond_21_run.c +@@ -0,0 +1,31 @@ ++/* { dg-do run { target aarch64_sve_hw } } */ ++/* { dg-options "-O2 -ftree-vectorize" } */ ++ ++#include "vcond_21.c" ++ ++#define N 97 ++ ++#define TEST_LOOP(TYPE, ABS, NAME, OP) \ ++ { \ ++ TYPE r[N], a[N], b[N]; \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ a[i] = i % 5 * (i & 1 ? -1 : 1); \ ++ b[i] = i % 9 * (i & 2 ? -1 : 1); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ test_##TYPE##_##NAME (r, a, b, N); \ ++ for (int i = 0; i < N; ++i) \ ++ { \ ++ if (r[i] != (ABS (a[i]) OP ABS (b[i]) ? 
1.0 : 0.0)) \ ++ __builtin_abort (); \ ++ asm volatile ("" ::: "memory"); \ ++ } \ ++ } ++ ++int __attribute__ ((optimize (1))) ++main (int argc, char **argv) ++{ ++ TEST_ALL (TEST_LOOP) ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/while_1.c b/gcc/testsuite/gcc.target/aarch64/sve/while_1.c +index a93a04baa..2655c4242 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sve/while_1.c ++++ b/gcc/testsuite/gcc.target/aarch64/sve/while_1.c +@@ -42,3 +42,4 @@ TEST_ALL (ADD_LOOP) + /* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x0, x[0-9]+, lsl 2\]\n} 3 } } */ + /* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x0, x[0-9]+, lsl 3\]\n} 3 } } */ + /* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x0, x[0-9]+, lsl 3\]\n} 3 } } */ ++/* { dg-final { scan-assembler-times {\tb\.any\t} 10 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/while_10.c b/gcc/testsuite/gcc.target/aarch64/sve/while_10.c +new file mode 100644 +index 000000000..eaed326f9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/while_10.c +@@ -0,0 +1,25 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=512" } */ ++ ++#include <stdint.h> ++ ++#define ADD_LOOP(TYPE, COUNT) \ ++ TYPE __attribute__ ((noinline, noclone)) \ ++ vec_while_##TYPE (TYPE *restrict a) \ ++ { \ ++ for (int i = 0; i < COUNT; ++i) \ ++ a[i] += 1; \ ++ } ++ ++#define TEST_ALL(T) \ ++ T (int8_t, 63) \ ++ T (int16_t, 30) \ ++ T (int32_t, 15) \ ++ T (int64_t, 6) ++ ++TEST_ALL (ADD_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, mul3\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.h, mul3\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.s, mul3\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.d, vl6\n} 1 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/while_6.c b/gcc/testsuite/gcc.target/aarch64/sve/while_6.c +new file mode 100644 +index 000000000..b4cc596ef +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/while_6.c +@@ -0,0 +1,25 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=scalable" } */ ++ ++#include <stdint.h> ++ ++#define ADD_LOOP(TYPE) \ ++ TYPE __attribute__ ((noinline, noclone)) \ ++ vec_while_##TYPE (TYPE *restrict a) \ ++ { \ ++ for (int i = 0; i < 7; ++i) \ ++ a[i] += 1; \ ++ } ++ ++#define TEST_ALL(T) \ ++ T (int8_t) \ ++ T (int16_t) \ ++ T (int32_t) \ ++ T (int64_t) ++ ++TEST_ALL (ADD_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl7\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.h, vl7\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s,} 2 } } */ ++/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d,} 2 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/while_7.c b/gcc/testsuite/gcc.target/aarch64/sve/while_7.c +new file mode 100644 +index 000000000..d5ffb66a1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/while_7.c +@@ -0,0 +1,25 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=scalable" } */ ++ ++#include <stdint.h> ++ ++#define ADD_LOOP(TYPE) \ ++ TYPE __attribute__ ((noinline, noclone)) \ ++ vec_while_##TYPE (TYPE *restrict a) \ ++ { \ ++ for (int i = 0; i < 8; ++i) \ ++ a[i] += 1; \ ++ } ++ ++#define TEST_ALL(T) \ ++ T (int8_t) \ ++ T (int16_t) \ ++ T (int32_t) \ ++ T (int64_t) ++ ++TEST_ALL (ADD_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl8\n} 1 } } */ ++/* {
dg-final { scan-assembler-times {\tptrue\tp[0-7]\.h, vl8\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s,} 2 } } */ ++/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d,} 2 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/while_8.c b/gcc/testsuite/gcc.target/aarch64/sve/while_8.c +new file mode 100644 +index 000000000..1c11aa849 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/while_8.c +@@ -0,0 +1,25 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=scalable" } */ ++ ++#include <stdint.h> ++ ++#define ADD_LOOP(TYPE) \ ++ TYPE __attribute__ ((noinline, noclone)) \ ++ vec_while_##TYPE (TYPE *restrict a) \ ++ { \ ++ for (int i = 0; i < 9; ++i) \ ++ a[i] += 1; \ ++ } ++ ++#define TEST_ALL(T) \ ++ T (int8_t) \ ++ T (int16_t) \ ++ T (int32_t) \ ++ T (int64_t) ++ ++TEST_ALL (ADD_LOOP) ++ ++/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b,} 1 } } */ ++/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h,} 2 } } */ ++/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s,} 2 } } */ ++/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d,} 2 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sve/while_9.c b/gcc/testsuite/gcc.target/aarch64/sve/while_9.c +new file mode 100644 +index 000000000..9a8e5fe12 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/while_9.c +@@ -0,0 +1,25 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=scalable" } */ ++ ++#include <stdint.h> ++ ++#define ADD_LOOP(TYPE) \ ++ TYPE __attribute__ ((noinline, noclone)) \ ++ vec_while_##TYPE (TYPE *restrict a) \ ++ { \ ++ for (int i = 0; i < 16; ++i) \ ++ a[i] += 1; \ ++ } ++ ++#define TEST_ALL(T) \ ++ T (int8_t) \ ++ T (int16_t) \ ++ T (int32_t) \ ++ T (int64_t) ++ ++TEST_ALL (ADD_LOOP) ++ ++/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl16\n} 1 } } */ ++/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h,} 2 } } */ ++/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s,} 2 } } */ ++/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d,} 2 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/symbol-range-tiny.c b/gcc/testsuite/gcc.target/aarch64/symbol-range-tiny.c +index d7e46b059..fc6a4f3ec 100644 +--- a/gcc/testsuite/gcc.target/aarch64/symbol-range-tiny.c ++++ b/gcc/testsuite/gcc.target/aarch64/symbol-range-tiny.c +@@ -1,12 +1,12 @@ +-/* { dg-do compile } */ ++/* { dg-do link } */ + /* { dg-options "-O3 -save-temps -mcmodel=tiny" } */ + +-int fixed_regs[0x00200000]; ++char fixed_regs[0x00080000]; + + int +-foo() ++main () + { +- return fixed_regs[0x00080000]; ++ return fixed_regs[0x000ff000]; + } + + /* { dg-final { scan-assembler-not "adr\tx\[0-9\]+, fixed_regs\\\+" } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/symbol-range.c b/gcc/testsuite/gcc.target/aarch64/symbol-range.c +index 6574cf431..d8e82fa1b 100644 +--- a/gcc/testsuite/gcc.target/aarch64/symbol-range.c ++++ b/gcc/testsuite/gcc.target/aarch64/symbol-range.c +@@ -1,12 +1,12 @@ +-/* { dg-do compile } */ ++/* { dg-do link } */ + /* { dg-options "-O3 -save-temps -mcmodel=small" } */ + +-int fixed_regs[0x200000000ULL]; ++char fixed_regs[0x80000000]; + + int +-foo() ++main () + { +- return fixed_regs[0x100000000ULL]; ++ return fixed_regs[0xfffff000]; + } + + /* { dg-final { scan-assembler-not "adrp\tx\[0-9\]+, fixed_regs\\\+" } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sync-comp-swap.c b/gcc/testsuite/gcc.target/aarch64/sync-comp-swap.c +index e571b2f13..f56415f33 100644 +---
a/gcc/testsuite/gcc.target/aarch64/sync-comp-swap.c ++++ b/gcc/testsuite/gcc.target/aarch64/sync-comp-swap.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-march=armv8-a+nolse -O2 -fno-ipa-icf" } */ ++/* { dg-options "-march=armv8-a+nolse -O2 -fno-ipa-icf -mno-outline-atomics" } */ + + #include "sync-comp-swap.x" + +diff --git a/gcc/testsuite/gcc.target/aarch64/sync-op-acquire.c b/gcc/testsuite/gcc.target/aarch64/sync-op-acquire.c +index 357bf1be3..39b3144aa 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sync-op-acquire.c ++++ b/gcc/testsuite/gcc.target/aarch64/sync-op-acquire.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-march=armv8-a+nolse -O2" } */ ++/* { dg-options "-march=armv8-a+nolse -O2 -mno-outline-atomics" } */ + + #include "sync-op-acquire.x" + +diff --git a/gcc/testsuite/gcc.target/aarch64/sync-op-full.c b/gcc/testsuite/gcc.target/aarch64/sync-op-full.c +index c6ba16299..6b8b2043f 100644 +--- a/gcc/testsuite/gcc.target/aarch64/sync-op-full.c ++++ b/gcc/testsuite/gcc.target/aarch64/sync-op-full.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-march=armv8-a+nolse -O2" } */ ++/* { dg-options "-march=armv8-a+nolse -O2 -mno-outline-atomics" } */ + + #include "sync-op-full.x" + +diff --git a/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-10.c b/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-10.c +new file mode 100644 +index 000000000..3d6893ee0 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-10.c +@@ -0,0 +1,14 @@ ++/* { dg-do compile } */ ++ ++int __attribute__((aarch64_vector_pcs)) (*callee) (void); ++ ++int __attribute__ ((aarch64_vector_pcs)) ++caller (int *x) ++{ ++ return callee () + 1; ++} ++ ++/* { dg-final { scan-assembler-not {\tstp\tq} } } */ ++/* { dg-final { scan-assembler-not {\tldp\tq} } } */ ++/* { dg-final { scan-assembler-not {\tstr\tq} } } */ ++/* { dg-final { scan-assembler-not {\tldr\tq} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-11.c b/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-11.c +new file mode 100644 +index 000000000..de99bd701 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-11.c +@@ -0,0 +1,26 @@ ++/* { dg-do compile } */ ++ ++int (*callee) (void); ++ ++int __attribute__ ((aarch64_vector_pcs)) ++caller (int *x) ++{ ++ return callee () + 1; ++} ++ ++/* { dg-final { scan-assembler {\sstp\tq8, q9} } } */ ++/* { dg-final { scan-assembler {\sstp\tq10, q11} } } */ ++/* { dg-final { scan-assembler {\sstp\tq12, q13} } } */ ++/* { dg-final { scan-assembler {\sstp\tq14, q15} } } */ ++/* { dg-final { scan-assembler {\sstp\tq16, q17} } } */ ++/* { dg-final { scan-assembler {\sstp\tq18, q19} } } */ ++/* { dg-final { scan-assembler {\sstp\tq20, q21} } } */ ++/* { dg-final { scan-assembler {\sstp\tq22, q23} } } */ ++/* { dg-final { scan-assembler {\sldp\tq8, q9} } } */ ++/* { dg-final { scan-assembler {\sldp\tq10, q11} } } */ ++/* { dg-final { scan-assembler {\sldp\tq12, q13} } } */ ++/* { dg-final { scan-assembler {\sldp\tq14, q15} } } */ ++/* { dg-final { scan-assembler {\sldp\tq16, q17} } } */ ++/* { dg-final { scan-assembler {\sldp\tq18, q19} } } */ ++/* { dg-final { scan-assembler {\sldp\tq20, q21} } } */ ++/* { dg-final { scan-assembler {\sldp\tq22, q23} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-8.c b/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-8.c +new file mode 100644 +index 000000000..6463f6c50 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-8.c +@@ -0,0 +1,20 @@ ++/* 
{ dg-do compile } */ ++/* { dg-options "-std=gnu99" } */ ++/* { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */ ++ ++#include <arm_neon.h> ++ ++void __attribute__ ((aarch64_vector_pcs)) f (void); ++ ++void ++g (int64x2x4_t *ptr) ++{ ++ register int64x2x4_t copy asm ("v8") = *ptr; ++ int64x2x4_t save; ++ asm volatile ("" : "=w" (save) : "0" (copy)); ++ f (); ++ *ptr = save; ++} ++ ++/* { dg-final { scan-assembler-times {\tld1\t} 1 } } */ ++/* { dg-final { scan-assembler-times {\tst1\t} 1 } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-9.c b/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-9.c +new file mode 100644 +index 000000000..aaa0316d1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-9.c +@@ -0,0 +1,48 @@ ++/* { dg-do compile } */ ++/* { dg-options "-fshrink-wrap -ffat-lto-objects" } */ ++/* { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */ ++/* { dg-final { check-function-bodies "**" "" } } */ ++ ++int callee (void); ++ ++/* ++** caller: ++** ldr (w[0-9]+), \[x0\] ++** cbn?z \1, [^\n]* ++** ... ++** ret ++*/ ++int __attribute__ ((aarch64_vector_pcs)) ++caller (int *x) ++{ ++ if (*x) ++ return callee () + 1; ++ else ++ return 0; ++} ++ ++/* { dg-final { scan-assembler {\sstp\tq8, q9} } } */ ++/* { dg-final { scan-assembler {\sstp\tq10, q11} } } */ ++/* { dg-final { scan-assembler {\sstp\tq12, q13} } } */ ++/* { dg-final { scan-assembler {\sstp\tq14, q15} } } */ ++/* { dg-final { scan-assembler {\sstp\tq16, q17} } } */ ++/* { dg-final { scan-assembler {\sstp\tq18, q19} } } */ ++/* { dg-final { scan-assembler {\sstp\tq20, q21} } } */ ++/* { dg-final { scan-assembler {\sstp\tq22, q23} } } */ ++/* { dg-final { scan-assembler {\sldp\tq8, q9} } } */ ++/* { dg-final { scan-assembler {\sldp\tq10, q11} } } */ ++/* { dg-final { scan-assembler {\sldp\tq12, q13} } } */ ++/* { dg-final { scan-assembler {\sldp\tq14, q15} } } */ ++/* { dg-final { scan-assembler {\sldp\tq16, q17} } } */ ++/* { dg-final { scan-assembler {\sldp\tq18, q19} } } */ ++/* { dg-final { scan-assembler {\sldp\tq20, q21} } } */ ++/* { dg-final { scan-assembler {\sldp\tq22, q23} } } */ ++ ++/* { dg-final { scan-assembler-not {\tstp\tq[0-7],} } } */ ++/* { dg-final { scan-assembler-not {\tldp\tq[0-7],} } } */ ++/* { dg-final { scan-assembler-not {\tstp\tq2[4-9],} } } */ ++/* { dg-final { scan-assembler-not {\tldp\tq2[4-9],} } } */ ++/* { dg-final { scan-assembler-not {\tstp\td} } } */ ++/* { dg-final { scan-assembler-not {\tldp\td} } } */ ++/* { dg-final { scan-assembler-not {\tstr\tq} } } */ ++/* { dg-final { scan-assembler-not {\tldr\tq} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/usadv16qi-dotprod.c b/gcc/testsuite/gcc.target/aarch64/usadv16qi-dotprod.c +new file mode 100644 +index 000000000..ea8de4d69 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/usadv16qi-dotprod.c +@@ -0,0 +1,30 @@ ++/* { dg-do compile } */ ++/* { dg-require-effective-target arm_v8_2a_dotprod_neon_ok } */ ++/* { dg-add-options arm_v8_2a_dotprod_neon } */ ++/* { dg-additional-options "-O3" } */ ++ ++#pragma GCC target "+nosve" ++ ++#define N 1024 ++ ++unsigned char pix1[N], pix2[N]; ++ ++int foo (void) ++{ ++ int i_sum = 0; ++ int i; ++ ++ for (i = 0; i < N; i++) ++ i_sum += __builtin_abs (pix1[i] - pix2[i]); ++ ++ return i_sum; ++} ++ ++/* { dg-final { scan-assembler-not {\tushll\t} } } */ ++/* { dg-final { scan-assembler-not {\tushll2\t} } } */ ++/* { dg-final { scan-assembler-not {\tusubl\t} } } */ ++/* { dg-final { scan-assembler-not {\tusubl2\t} } } */ ++/* { dg-final { scan-assembler-not {\tabs\t} } } */
++ ++/* { dg-final { scan-assembler {\tuabd\t} } } */ ++/* { dg-final { scan-assembler {\tudot\t} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/usadv16qi.c b/gcc/testsuite/gcc.target/aarch64/usadv16qi.c +index 69ceaf425..a66e12096 100644 +--- a/gcc/testsuite/gcc.target/aarch64/usadv16qi.c ++++ b/gcc/testsuite/gcc.target/aarch64/usadv16qi.c +@@ -1,7 +1,7 @@ + /* { dg-do compile } */ + /* { dg-options "-O3" } */ + +-#pragma GCC target "+nosve" ++#pragma GCC target "+nosve+nodotprod" + + #define N 1024 + +diff --git a/gcc/testsuite/gcc.target/aarch64/vect-clz.c b/gcc/testsuite/gcc.target/aarch64/vect-clz.c +index 044fa9e99..cd181c346 100644 +--- a/gcc/testsuite/gcc.target/aarch64/vect-clz.c ++++ b/gcc/testsuite/gcc.target/aarch64/vect-clz.c +@@ -1,6 +1,8 @@ + /* { dg-do run } */ + /* { dg-options "-O3 -save-temps -fno-inline -fno-vect-cost-model" } */ + ++#pragma GCC target "+nosve" ++ + extern void abort (); + + void +diff --git a/gcc/testsuite/gcc.target/i386/asm-1.c b/gcc/testsuite/gcc.target/i386/asm-1.c +index cd60a09bd..5e516d882 100644 +--- a/gcc/testsuite/gcc.target/i386/asm-1.c ++++ b/gcc/testsuite/gcc.target/i386/asm-1.c +@@ -2,7 +2,7 @@ + /* { dg-require-effective-target ia32 } */ + /* { dg-options "" } */ + +-register unsigned int EAX asm ("r14"); /* { dg-error "register name" } */ ++register unsigned int EAX asm ("r14"); /* { dg-error "cannot be accessed" } */ + + void foo () + { +diff --git a/gcc/testsuite/gcc.target/i386/asm-7.c b/gcc/testsuite/gcc.target/i386/asm-7.c +new file mode 100644 +index 000000000..d2d113626 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/asm-7.c +@@ -0,0 +1,8 @@ ++/* { dg-do compile } */ ++/* { dg-require-effective-target ia32 } */ ++/* { dg-options "" } */ ++ ++void foo (void) ++{ ++ asm volatile ("" : : : "%r12"); /* { dg-error "cannot be clobbered" } */ ++} +diff --git a/gcc/testsuite/gcc.target/i386/asm-flag-0.c b/gcc/testsuite/gcc.target/i386/asm-flag-0.c +index b0c05239b..e7bd1a585 100644 +--- a/gcc/testsuite/gcc.target/i386/asm-flag-0.c ++++ b/gcc/testsuite/gcc.target/i386/asm-flag-0.c +@@ -11,5 +11,5 @@ void a(void) + void b(void) + { + char x; +- asm("" : "=@ccbad"(x)); /* { dg-error "unknown asm flag output" } */ ++ asm("" : "=@ccbad"(x)); /* { dg-error "unknown 'asm' flag output" } */ + } +diff --git a/gcc/testsuite/gcc.target/i386/funcspec-4.c b/gcc/testsuite/gcc.target/i386/funcspec-4.c +index 025b97dff..e345acdef 100644 +--- a/gcc/testsuite/gcc.target/i386/funcspec-4.c ++++ b/gcc/testsuite/gcc.target/i386/funcspec-4.c +@@ -5,7 +5,7 @@ + extern void error1 (void) __attribute__((__target__("fma400"))); /* { dg-error "unknown" } */ + + /* Multiple arch switches */ +-extern void error2 (void) __attribute__((__target__("arch=core2,arch=k8"))); /* { dg-error "already specified" } */ ++extern void error2 (void) __attribute__((__target__("arch=core2,arch=k8"))); /* { dg-error "attribute value 'arch=k8' was already specified in 'target' attribute" } */ + + /* Unknown tune target */ + extern void error3 (void) __attribute__((__target__("tune=foobar"))); /* { dg-error "bad value" } */ +diff --git a/gcc/testsuite/gcc.target/i386/inline_error.c b/gcc/testsuite/gcc.target/i386/inline_error.c +index 18e506631..57e60fbad 100644 +--- a/gcc/testsuite/gcc.target/i386/inline_error.c ++++ b/gcc/testsuite/gcc.target/i386/inline_error.c +@@ -2,7 +2,7 @@ + /* { dg-options "-O0 -mno-popcnt" } */ + + inline int __attribute__ ((__gnu_inline__, __always_inline__, target("popcnt"))) +-foo () /* { dg-error "inlining failed in call to always_inline .* 
target specific option mismatch" } */ ++foo () /* { dg-error "inlining failed in call to 'always_inline' .* target specific option mismatch" } */ + { + return 0; + } +diff --git a/gcc/testsuite/gcc.target/i386/interrupt-6.c b/gcc/testsuite/gcc.target/i386/interrupt-6.c +index bcbcc97c6..138b98fe1 100644 +--- a/gcc/testsuite/gcc.target/i386/interrupt-6.c ++++ b/gcc/testsuite/gcc.target/i386/interrupt-6.c +@@ -31,7 +31,7 @@ fn4 (uword_t error_code, void *frame) + error = error_code; + } + +-extern int fn5 (void *) __attribute__ ((interrupt)); /* { dg-error "interrupt service routine can't have non-void return value" } */ ++extern int fn5 (void *) __attribute__ ((interrupt)); /* { dg-error "interrupt service routine must return 'void'" } */ + + int + fn5 (void *frame) +diff --git a/gcc/testsuite/gcc.target/i386/interrupt-7.c b/gcc/testsuite/gcc.target/i386/interrupt-7.c +index 506f61afa..3e2f6a0eb 100644 +--- a/gcc/testsuite/gcc.target/i386/interrupt-7.c ++++ b/gcc/testsuite/gcc.target/i386/interrupt-7.c +@@ -8,5 +8,5 @@ extern void fn (void *) __attribute__((interrupt)); + void + foo (void) + { +- fn (&error); /* { dg-error "interrupt service routine can't be called directly" } */ ++ fn (&error); /* { dg-error "interrupt service routine cannot be called directly" } */ + } +diff --git a/gcc/testsuite/gcc.target/i386/pr30848.c b/gcc/testsuite/gcc.target/i386/pr30848.c +index 2a9285151..9c4e22ac7 100644 +--- a/gcc/testsuite/gcc.target/i386/pr30848.c ++++ b/gcc/testsuite/gcc.target/i386/pr30848.c +@@ -2,5 +2,5 @@ + + void foo(double d) + { +- __asm__ ("" : "=u" (d)); /* { dg-error "output regs" } */ ++ __asm__ ("" : "=u" (d)); /* { dg-error "output registers" } */ + } +diff --git a/gcc/testsuite/gcc.target/i386/pr39082-1.c b/gcc/testsuite/gcc.target/i386/pr39082-1.c +index 2af2264c3..85b5671e9 100644 +--- a/gcc/testsuite/gcc.target/i386/pr39082-1.c ++++ b/gcc/testsuite/gcc.target/i386/pr39082-1.c +@@ -13,7 +13,7 @@ extern int bar1 (union un); + extern union un bar2 (int); + + int +-foo1 (union un u) /* { dg-message "note: the ABI of passing union with long double has changed in GCC 4.4" } */ ++foo1 (union un u) /* { dg-message "note: the ABI of passing union with 'long double' has changed in GCC 4.4" } */ + { + bar1 (u); + return u.i; +diff --git a/gcc/testsuite/gcc.target/i386/pr39678.c b/gcc/testsuite/gcc.target/i386/pr39678.c +index 0548466d6..c94c002f1 100644 +--- a/gcc/testsuite/gcc.target/i386/pr39678.c ++++ b/gcc/testsuite/gcc.target/i386/pr39678.c +@@ -10,7 +10,7 @@ struct X { + + struct X + foo (float *p) +-{ /* { dg-message "note: the ABI of passing structure with complex float member has changed in GCC 4.4" } */ ++{ /* { dg-message "note: the ABI of passing structure with 'complex float' member has changed in GCC 4.4" } */ + struct X x; + x.c = -3; + __real x.val = p[0]; +diff --git a/gcc/testsuite/gcc.target/i386/pr57756.c b/gcc/testsuite/gcc.target/i386/pr57756.c +index 25c565c87..9a78f62c9 100644 +--- a/gcc/testsuite/gcc.target/i386/pr57756.c ++++ b/gcc/testsuite/gcc.target/i386/pr57756.c +@@ -3,7 +3,7 @@ + + /* callee cannot be inlined into caller because it has a higher target ISA. 
*/ + __attribute__((always_inline,target("sse4.2"))) +-__inline int callee () /* { dg-error "inlining failed in call to always_inline" } */ ++__inline int callee () /* { dg-error "inlining failed in call to 'always_inline'" } */ + { + return 0; + } +diff --git a/gcc/testsuite/gcc.target/i386/pr62120.c b/gcc/testsuite/gcc.target/i386/pr62120.c +index bfb8c4703..28d85d377 100644 +--- a/gcc/testsuite/gcc.target/i386/pr62120.c ++++ b/gcc/testsuite/gcc.target/i386/pr62120.c +@@ -3,6 +3,6 @@ + + void foo () + { +- register int zmm_var asm ("ymm9");/* { dg-error "invalid register name" } */ +- register int zmm_var2 asm ("23");/* { dg-error "invalid register name" } */ ++ register int zmm_var asm ("ymm9");/* { dg-error "cannot be accessed" } */ ++ register int zmm_var2 asm ("23");/* { dg-error "cannot be accessed" } */ + } +diff --git a/gcc/testsuite/gcc.target/i386/pr68843-1.c b/gcc/testsuite/gcc.target/i386/pr68843-1.c +index da0676aa6..6198ea9af 100644 +--- a/gcc/testsuite/gcc.target/i386/pr68843-1.c ++++ b/gcc/testsuite/gcc.target/i386/pr68843-1.c +@@ -5,7 +5,7 @@ double + test () + { + double x = 1.0; +- asm ("fld %1" /* { dg-error "explicitly used regs must be grouped at top of stack" } */ ++ asm ("fld %1" /* { dg-error "explicitly used registers must be grouped at top of stack" } */ + : "=&t" (x) + : "u" (x)); + return x; +diff --git a/gcc/testsuite/gcc.target/i386/pr79804.c b/gcc/testsuite/gcc.target/i386/pr79804.c +index 10adb4466..08d1a3ea1 100644 +--- a/gcc/testsuite/gcc.target/i386/pr79804.c ++++ b/gcc/testsuite/gcc.target/i386/pr79804.c +@@ -7,4 +7,4 @@ void foo (void) + register int r19 asm ("19"); + + asm volatile ("# %0" : "=r"(r19)); /* { dg-error "invalid use of register" } */ +-} /* { dg-error "cannot be used in asm here" } */ ++} /* { dg-error "cannot be used in 'asm' here" } */ +diff --git a/gcc/testsuite/gcc.target/i386/pr82673.c b/gcc/testsuite/gcc.target/i386/pr82673.c +index 50eb5a3bc..161ec88e3 100644 +--- a/gcc/testsuite/gcc.target/i386/pr82673.c ++++ b/gcc/testsuite/gcc.target/i386/pr82673.c +@@ -9,4 +9,4 @@ void + bar (void) /* { dg-error "frame pointer required, but reserved" } */ + { + B = &y; +-} /* { dg-error "bp cannot be used in asm here" } */ ++} /* { dg-error "bp cannot be used in 'asm' here" } */ +diff --git a/gcc/testsuite/gcc.target/i386/pr88809-2.c b/gcc/testsuite/gcc.target/i386/pr88809-2.c +new file mode 100644 +index 000000000..b8ef51dab +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr88809-2.c +@@ -0,0 +1,9 @@ ++/* PR target/88809 */ ++/* { dg-options "-Os" } */ ++ ++unsigned int foo (const char *ptr) ++{ ++ return __builtin_strlen (ptr); ++} ++ ++/* { dg-final { scan-assembler "call\[ \t\]strlen" } } */ +diff --git a/gcc/testsuite/gcc.target/i386/pr88809.c b/gcc/testsuite/gcc.target/i386/pr88809.c +new file mode 100644 +index 000000000..20844ddb9 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr88809.c +@@ -0,0 +1,9 @@ ++/* PR target/88809 */ ++/* { dg-options "-O" } */ ++ ++unsigned int foo (const char *ptr) ++{ ++ return __builtin_strlen (ptr); ++} ++ ++/* { dg-final { scan-assembler "call\[ \t\]strlen" } } */ +diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1.c b/gcc/testsuite/gcc.target/i386/pr88828-1.c +new file mode 100644 +index 000000000..a15d1fea3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr88828-1.c +@@ -0,0 +1,49 @@ ++/* { dg-do run { target sse2_runtime } } */ ++/* { dg-options "-O2 -msse2" } */ ++ ++#include "pr88828-1a.c" ++#include "pr88828-1b.c" ++#include "pr88828-1c.c" ++ ++extern void abort (); ++ ++void 
++do_check (__v4sf y, float f[4], float z) ++{ ++ int i; ++ ++ for (i = 0; i < 4; i++) ++ if (i == 0) ++ { ++ if (y[i] != z) ++ abort (); ++ } ++ else ++ { ++ if (y[i] != f[i]) ++ abort (); ++ } ++} ++ ++int ++main (void) ++{ ++ float f[4] = { -11, 2, 55553, -4 }; ++ float z = 134567; ++ __v4sf x = { f[0], f[1], f[2], f[3] }; ++ __v4sf y; ++ int i; ++ ++ for (i = 0; i < 4; i++) ++ if (x[i] != f[i]) ++ abort (); ++ ++ y = foo1 (x, z); ++ do_check (y, f, z); ++ y = foo2 (x, z); ++ do_check (y, f, z); ++ y = foo3 (x, z); ++ do_check (y, f, z); ++ ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1a.c b/gcc/testsuite/gcc.target/i386/pr88828-1a.c +new file mode 100644 +index 000000000..d37b24c66 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr88828-1a.c +@@ -0,0 +1,17 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -msse -mno-sse4" } */ ++/* { dg-final { scan-assembler "movss" } } */ ++/* { dg-final { scan-assembler-not "movaps" } } */ ++/* { dg-final { scan-assembler-not "movlhps" } } */ ++/* { dg-final { scan-assembler-not "unpcklps" } } */ ++/* { dg-final { scan-assembler-not "shufps" } } */ ++ ++typedef float __v4sf __attribute__ ((__vector_size__ (16))); ++ ++__attribute__((noinline, noclone)) ++__v4sf ++foo1 (__v4sf x, float f) ++{ ++ __v4sf y = { f, x[1], x[2], x[3] }; ++ return y; ++} +diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1b.c b/gcc/testsuite/gcc.target/i386/pr88828-1b.c +new file mode 100644 +index 000000000..af4aced65 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr88828-1b.c +@@ -0,0 +1,23 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -msse -mno-sse4" } */ ++/* { dg-final { scan-assembler "movss" } } */ ++/* { dg-final { scan-assembler-not "movaps" } } */ ++/* { dg-final { scan-assembler-not "movlhps" } } */ ++/* { dg-final { scan-assembler-not "unpcklps" } } */ ++/* { dg-final { scan-assembler-not "shufps" } } */ ++ ++typedef float __v4sf __attribute__ ((__vector_size__ (16))); ++ ++static __v4sf ++vector_init (float f0,float f1, float f2,float f3) ++{ ++ __v4sf y = { f0, f1, f2, f3 }; ++ return y; ++} ++ ++__attribute__((noinline, noclone)) ++__v4sf ++foo2 (__v4sf x, float f) ++{ ++ return vector_init (f, x[1], x[2], x[3]) ; ++} +diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1c.c b/gcc/testsuite/gcc.target/i386/pr88828-1c.c +new file mode 100644 +index 000000000..a117f3ec7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr88828-1c.c +@@ -0,0 +1,18 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -msse -mno-sse4" } */ ++/* { dg-final { scan-assembler "movss" } } */ ++/* { dg-final { scan-assembler-not "movaps" } } */ ++/* { dg-final { scan-assembler-not "movlhps" } } */ ++/* { dg-final { scan-assembler-not "unpcklps" } } */ ++/* { dg-final { scan-assembler-not "shufps" } } */ ++ ++typedef float __v4sf __attribute__ ((__vector_size__ (16))); ++ ++__attribute__((noinline, noclone)) ++__v4sf ++foo3 (__v4sf x, float f) ++{ ++ __v4sf y = x; ++ y[0] = f; ++ return y; ++} +diff --git a/gcc/testsuite/gcc.target/i386/pr88828-4a.c b/gcc/testsuite/gcc.target/i386/pr88828-4a.c +new file mode 100644 +index 000000000..64043b985 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr88828-4a.c +@@ -0,0 +1,18 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -msse -mno-sse4" } */ ++/* { dg-final { scan-assembler "movss" } } */ ++/* { dg-final { scan-assembler-times "shufps" 1 } } */ ++/* { dg-final { scan-assembler-not "movaps" } } */ ++/* { dg-final { scan-assembler-not "movlhps" } } */ ++/* { dg-final { scan-assembler-not 
"unpcklps" } } */ ++ ++typedef float __v4sf __attribute__ ((__vector_size__ (16))); ++ ++__attribute__((noinline, noclone)) ++__v4sf ++foo (__v4sf x, float f) ++{ ++ __v4sf y = { x[0], x[2], x[3], x[1] }; ++ y[0] = f; ++ return y; ++} +diff --git a/gcc/testsuite/gcc.target/i386/pr88828-4b.c b/gcc/testsuite/gcc.target/i386/pr88828-4b.c +new file mode 100644 +index 000000000..ad8d2b985 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr88828-4b.c +@@ -0,0 +1,21 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -mavx" } */ ++/* { dg-final { scan-assembler-times "vpermilps" 1 } } */ ++/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! ia32 } } } } */ ++/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */ ++/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */ ++/* { dg-final { scan-assembler-not "vshufps" } } */ ++/* { dg-final { scan-assembler-not "vmovaps" } } */ ++/* { dg-final { scan-assembler-not "vmovlhps" } } */ ++/* { dg-final { scan-assembler-not "vunpcklps" } } */ ++ ++typedef float __v4sf __attribute__ ((__vector_size__ (16))); ++ ++__attribute__((noinline, noclone)) ++__v4sf ++foo (__v4sf x, float f) ++{ ++ __v4sf y = { x[0], x[2], x[3], x[1] }; ++ y[0] = f; ++ return y; ++} +diff --git a/gcc/testsuite/gcc.target/i386/pr88828-5a.c b/gcc/testsuite/gcc.target/i386/pr88828-5a.c +new file mode 100644 +index 000000000..5e908faef +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr88828-5a.c +@@ -0,0 +1,18 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -msse -mno-sse4" } */ ++/* { dg-final { scan-assembler "movss" } } */ ++/* { dg-final { scan-assembler-times "shufps" 2 } } */ ++/* { dg-final { scan-assembler-times "movaps" 1 } } */ ++/* { dg-final { scan-assembler-not "movlhps" } } */ ++/* { dg-final { scan-assembler-not "unpcklps" } } */ ++ ++typedef float __v4sf __attribute__ ((__vector_size__ (16))); ++ ++__attribute__((noinline, noclone)) ++__v4sf ++foo (__v4sf x, float f) ++{ ++ __v4sf y = { x[0], x[2], x[3], x[0] }; ++ y[3] = f; ++ return y; ++} +diff --git a/gcc/testsuite/gcc.target/i386/pr88828-5b.c b/gcc/testsuite/gcc.target/i386/pr88828-5b.c +new file mode 100644 +index 000000000..988a48823 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr88828-5b.c +@@ -0,0 +1,20 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -mavx" } */ ++/* { dg-final { scan-assembler-times "vpermilps" 1 } } */ ++/* { dg-final { scan-assembler-times "vinsertps" 1 } } */ ++/* { dg-final { scan-assembler-not "vshufps" } } */ ++/* { dg-final { scan-assembler-not "vmovss" } } */ ++/* { dg-final { scan-assembler-not "vmovaps" } } */ ++/* { dg-final { scan-assembler-not "vmovlhps" } } */ ++/* { dg-final { scan-assembler-not "vunpcklps" } } */ ++ ++typedef float __v4sf __attribute__ ((__vector_size__ (16))); ++ ++__attribute__((noinline, noclone)) ++__v4sf ++foo (__v4sf x, float f) ++{ ++ __v4sf y = { x[0], x[2], x[3], x[0] }; ++ y[3] = f; ++ return y; ++} +diff --git a/gcc/testsuite/gcc.target/i386/pr88828-7.c b/gcc/testsuite/gcc.target/i386/pr88828-7.c +new file mode 100644 +index 000000000..4302c2664 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr88828-7.c +@@ -0,0 +1,53 @@ ++/* { dg-do run { target sse2_runtime } } */ ++/* { dg-options "-O2 -msse2 -fexcess-precision=standard" } */ ++ ++#include "pr88828-7a.c" ++#include "pr88828-7b.c" ++ ++extern void abort (); ++ ++float ++bar (float x, float y) ++{ ++ return x / y - y * x; ++} ++ ++void ++do_check (__v4sf x, float f1[4], float f2[4]) ++{ ++ int i; ++ ++ for (i = 0; i < 4; i++) ++ if (i == 
0) ++ { ++ if (x[i] != bar (f1[i], f2[i])) ++ abort (); ++ } ++ else ++ { ++ if (x[i] != f1[i]) ++ abort (); ++ } ++} ++ ++int ++main (void) ++{ ++ float f1[4] = { -11, 2, 55553, -4 }; ++ float f2[4] = { 111, 3.3, -55.553, 4.8 }; ++ __v4sf x = { f1[0], f1[1], f1[2], f1[3] }; ++ __v4sf y = { f2[0], f2[1], f2[2], f2[3] }; ++ __v4sf z; ++ int i; ++ ++ for (i = 0; i < 4; i++) ++ if (x[i] != f1[i] || y[i] != f2[i] ) ++ abort (); ++ ++ z = foo1 (x, y); ++ do_check (z, f1, f2); ++ x = foo2 (x, y); ++ do_check (z, f1, f2); ++ ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/i386/pr88828-7a.c b/gcc/testsuite/gcc.target/i386/pr88828-7a.c +new file mode 100644 +index 000000000..f1ae57422 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr88828-7a.c +@@ -0,0 +1,16 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -msse -mno-sse4" } */ ++/* { dg-final { scan-assembler-not "movlhps" } } */ ++/* { dg-final { scan-assembler-not "unpckhps" } } */ ++/* { dg-final { scan-assembler-not "unpcklps" } } */ ++/* { dg-final { scan-assembler-not "shufps" } } */ ++ ++typedef float __v4sf __attribute__ ((__vector_size__ (16))); ++extern float bar (float, float); ++ ++__v4sf ++foo1 (__v4sf x, __v4sf y) ++{ ++ __v4sf z = { bar (x[0], y[0]), x[1], x[2], x[3] }; ++ return z; ++} +diff --git a/gcc/testsuite/gcc.target/i386/pr88828-7b.c b/gcc/testsuite/gcc.target/i386/pr88828-7b.c +new file mode 100644 +index 000000000..c027c5694 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr88828-7b.c +@@ -0,0 +1,22 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -msse -mno-sse4" } */ ++/* { dg-final { scan-assembler-not "movlhps" } } */ ++/* { dg-final { scan-assembler-not "unpckhps" } } */ ++/* { dg-final { scan-assembler-not "unpcklps" } } */ ++/* { dg-final { scan-assembler-not "shufps" } } */ ++ ++typedef float __v4sf __attribute__ ((__vector_size__ (16))); ++extern float bar (float, float); ++ ++static __v4sf ++vector_init (float f0,float f1, float f2,float f3) ++{ ++ __v4sf y = { f0, f1, f2, f3 }; ++ return y; ++} ++ ++__v4sf ++foo2 (__v4sf x, __v4sf y) ++{ ++ return vector_init (bar (x[0], y[0]), x[1], x[2], x[3]) ; ++} +diff --git a/gcc/testsuite/gcc.target/i386/pr88828-8.c b/gcc/testsuite/gcc.target/i386/pr88828-8.c +new file mode 100644 +index 000000000..3b8eabd22 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr88828-8.c +@@ -0,0 +1,46 @@ ++/* { dg-do run { target sse2_runtime } } */ ++/* { dg-options "-O2 -msse2" } */ ++ ++#include "pr88828-8a.c" ++#include "pr88828-8b.c" ++ ++extern void abort (); ++ ++void ++do_check (__v4sf y, float f[4], float z) ++{ ++ int i; ++ ++ for (i = 0; i < 4; i++) ++ if (i == 0) ++ { ++ if (y[i] != z) ++ abort (); ++ } ++ else ++ { ++ if (y[i] != f[i]) ++ abort (); ++ } ++} ++ ++int ++main (void) ++{ ++ float f[4] = { -11, 2, 55553, -4 }; ++ float z = 11.4; ++ __v4sf x = { f[0], f[1], f[2], f[3] }; ++ __v4sf y; ++ int i; ++ ++ for (i = 0; i < 4; i++) ++ if (x[i] != f[i]) ++ abort (); ++ ++ y = foo1 (x); ++ do_check (y, f, z); ++ y = foo2 (x); ++ do_check (y, f, z); ++ ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/i386/pr88828-8a.c b/gcc/testsuite/gcc.target/i386/pr88828-8a.c +new file mode 100644 +index 000000000..5d383dfd0 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr88828-8a.c +@@ -0,0 +1,15 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -msse -mno-sse4" } */ ++/* { dg-final { scan-assembler-not "movlhps" } } */ ++/* { dg-final { scan-assembler-not "unpckhps" } } */ ++/* { dg-final { scan-assembler-not "unpcklps" } } */ ++/* { dg-final { 
scan-assembler-not "shufps" } } */ ++ ++typedef float __v4sf __attribute__ ((__vector_size__ (16))); ++ ++__v4sf ++foo1 (__v4sf x) ++{ ++ __v4sf z = { 11.4, x[1], x[2], x[3] }; ++ return z; ++} +diff --git a/gcc/testsuite/gcc.target/i386/pr88828-8b.c b/gcc/testsuite/gcc.target/i386/pr88828-8b.c +new file mode 100644 +index 000000000..5ffbc9c31 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr88828-8b.c +@@ -0,0 +1,21 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -msse -mno-sse4" } */ ++/* { dg-final { scan-assembler-not "movlhps" } } */ ++/* { dg-final { scan-assembler-not "unpckhps" } } */ ++/* { dg-final { scan-assembler-not "unpcklps" } } */ ++/* { dg-final { scan-assembler-not "shufps" } } */ ++ ++typedef float __v4sf __attribute__ ((__vector_size__ (16))); ++ ++static __v4sf ++vector_init (float f0,float f1, float f2,float f3) ++{ ++ __v4sf y = { f0, f1, f2, f3 }; ++ return y; ++} ++ ++__v4sf ++foo2 (__v4sf x) ++{ ++ return vector_init (11.4, x[1], x[2], x[3]) ; ++} +diff --git a/gcc/testsuite/gcc.target/i386/pr88828-9.c b/gcc/testsuite/gcc.target/i386/pr88828-9.c +new file mode 100644 +index 000000000..c33907b4a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr88828-9.c +@@ -0,0 +1,46 @@ ++/* { dg-do run { target sse2_runtime } } */ ++/* { dg-options "-O2 -msse2" } */ ++ ++#include "pr88828-9a.c" ++#include "pr88828-9b.c" ++ ++extern void abort (); ++ ++void ++do_check (__v4sf y, float f[4], float z) ++{ ++ int i; ++ ++ for (i = 0; i < 4; i++) ++ if (i == 0) ++ { ++ if (y[i] != z) ++ abort (); ++ } ++ else ++ { ++ if (y[i] != f[i]) ++ abort (); ++ } ++} ++ ++int ++main (void) ++{ ++ float f[4] = { -11, 2, 55553, -4 }; ++ float z = 11.4; ++ __m128 x = (__m128) (__v4sf) { f[0], f[1], f[2], f[3] }; ++ __m128 y; ++ int i; ++ ++ for (i = 0; i < 4; i++) ++ if (x[i] != f[i]) ++ abort (); ++ ++ y = foo1 (x); ++ do_check (y, f, z); ++ y = foo2 (x); ++ do_check (y, f, z); ++ ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.target/i386/pr88828-9a.c b/gcc/testsuite/gcc.target/i386/pr88828-9a.c +new file mode 100644 +index 000000000..7f8306577 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr88828-9a.c +@@ -0,0 +1,16 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -msse -mno-sse4" } */ ++/* { dg-final { scan-assembler-not "movlhps" } } */ ++/* { dg-final { scan-assembler-not "unpckhps" } } */ ++/* { dg-final { scan-assembler-not "unpcklps" } } */ ++/* { dg-final { scan-assembler-not "shufps" } } */ ++ ++typedef float __v4sf __attribute__ ((__vector_size__ (16))); ++typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__)); ++ ++__m128 ++foo1 (__m128 x) ++{ ++ __v4sf z = { 11.4, ((__v4sf) x)[1], ((__v4sf) x)[2], ((__v4sf) x) [3] }; ++ return (__m128) z; ++} +diff --git a/gcc/testsuite/gcc.target/i386/pr88828-9b.c b/gcc/testsuite/gcc.target/i386/pr88828-9b.c +new file mode 100644 +index 000000000..6588ad15a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr88828-9b.c +@@ -0,0 +1,23 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -msse -mno-sse4" } */ ++/* { dg-final { scan-assembler-not "movlhps" } } */ ++/* { dg-final { scan-assembler-not "unpckhps" } } */ ++/* { dg-final { scan-assembler-not "unpcklps" } } */ ++/* { dg-final { scan-assembler-not "shufps" } } */ ++ ++typedef float __v4sf __attribute__ ((__vector_size__ (16))); ++typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__)); ++ ++static __m128 ++vector_init (float f0,float f1, float f2,float f3) ++{ ++ __v4sf y = { f0, f1, f2, f3 }; ++ return (__m128) y; ++} ++ ++__m128 
++foo2 (__m128 x) ++{ ++ return vector_init (11.4, ((__v4sf) x)[1], ((__v4sf) x)[2], ++ ((__v4sf) x) [3]); ++} +diff --git a/gcc/testsuite/gcc.target/i386/pr88963-1.c b/gcc/testsuite/gcc.target/i386/pr88963-1.c +new file mode 100644 +index 000000000..e6f15259e +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr88963-1.c +@@ -0,0 +1,13 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -march=x86-64 -mavx2 -fdump-tree-optimized" } */ ++ ++typedef int VInt __attribute__((vector_size(64))); ++ ++void test(VInt*__restrict a, VInt*__restrict b, ++ VInt*__restrict c) ++{ ++ *a = *b + *c; ++} ++ ++/* Vector loads and stores should be split. */ ++/* { dg-final { scan-tree-dump-not "vector\\(16\\)" "optimized" } } */ +diff --git a/gcc/testsuite/gcc.target/i386/pr88963-2.c b/gcc/testsuite/gcc.target/i386/pr88963-2.c +new file mode 100644 +index 000000000..114f1f5c3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr88963-2.c +@@ -0,0 +1,14 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -march=x86-64 -msse2 -fdump-tree-optimized" } */ ++ ++typedef int VInt __attribute__((vector_size(64))); ++ ++void test(VInt*__restrict a, VInt*__restrict b, ++ VInt*__restrict c) ++{ ++ *a = *b + *c; ++} ++ ++/* Vector loads and stores should be split. */ ++/* { dg-final { scan-tree-dump-not "vector\\(16\\)" "optimized" } } */ ++/* { dg-final { scan-tree-dump-not "vector\\(8\\)" "optimized" } } */ +diff --git a/gcc/testsuite/gcc.target/i386/pr89261.c b/gcc/testsuite/gcc.target/i386/pr89261.c +new file mode 100644 +index 000000000..63882c099 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr89261.c +@@ -0,0 +1,9 @@ ++/* PR target/89261 */ ++/* { dg-do compile } */ ++/* { dg-options "-O2" } */ ++ ++typedef double __v2df __attribute__ ((vector_size (16), aligned (1 << 28))); ++ ++__v2df foo = { 1.0, 2.0 }; ++ ++/* { dg-final { scan-assembler "\.align\[ \t]+268435456" } } */ +diff --git a/gcc/testsuite/gcc.target/i386/pr92645-2.c b/gcc/testsuite/gcc.target/i386/pr92645-2.c +new file mode 100644 +index 000000000..d34ed3aa8 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr92645-2.c +@@ -0,0 +1,34 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -msse2 -fdump-tree-cddce1" } */ ++ ++typedef int v4si __attribute__((vector_size(16))); ++typedef int v2si __attribute__((vector_size(8))); ++ ++void low (v2si *dst, v4si *srcp) ++{ ++ v4si src = *srcp; ++ *dst = (v2si) { src[0], src[1] }; ++} ++ ++void high (v2si *dst, v4si *srcp) ++{ ++ v4si src = *srcp; ++ *dst = (v2si) { src[2], src[3] }; ++} ++ ++void even (v2si *dst, v4si *srcp) ++{ ++ v4si src = *srcp; ++ *dst = (v2si) { src[0], src[2] }; ++} ++ ++void odd (v2si *dst, v4si *srcp) ++{ ++ v4si src = *srcp; ++ *dst = (v2si) { src[1], src[3] }; ++} ++ ++/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "cddce1" } } */ ++/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 3 "cddce1" } } */ ++/* Ideally highpart extraction would elide the permutation as well. 
*/ ++/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 2 "cddce1" { xfail *-*-* } } } */ +diff --git a/gcc/testsuite/gcc.target/i386/pr92645-3.c b/gcc/testsuite/gcc.target/i386/pr92645-3.c +new file mode 100644 +index 000000000..9c08c9fb6 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr92645-3.c +@@ -0,0 +1,37 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -mavx2 -fdump-tree-cddce1" } */ ++ ++typedef int v8si __attribute__((vector_size(32))); ++typedef float v4sf __attribute__((vector_size(16))); ++ ++void low (v4sf *dst, v8si *srcp) ++{ ++ v8si src = *srcp; ++ *dst = (v4sf) { src[0], src[1], src[2], src[3] }; ++} ++ ++void high (v4sf *dst, v8si *srcp) ++{ ++ v8si src = *srcp; ++ *dst = (v4sf) { src[4], src[5], src[6], src[7] }; ++} ++ ++void even (v4sf *dst, v8si *srcp) ++{ ++ v8si src = *srcp; ++ *dst = (v4sf) { src[0], src[2], src[4], src[6] }; ++} ++ ++void odd (v4sf *dst, v8si *srcp) ++{ ++ v8si src = *srcp; ++ *dst = (v4sf) { src[1], src[3], src[5], src[7] }; ++} ++ ++/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "cddce1" } } */ ++/* Four conversions, on the smaller vector type, to not convert excess ++ elements. */ ++/* { dg-final { scan-tree-dump-times " = \\\(vector\\\(4\\\) float\\\)" 4 "cddce1" } } */ ++/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 3 "cddce1" } } */ ++/* Ideally highpart extraction would elide the VEC_PERM_EXPR as well. */ ++/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 2 "cddce1" { xfail *-*-* } } } */ +diff --git a/gcc/testsuite/gcc.target/i386/pr92645-4.c b/gcc/testsuite/gcc.target/i386/pr92645-4.c +new file mode 100644 +index 000000000..788a97ed1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr92645-4.c +@@ -0,0 +1,56 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -mavx2 -fdump-tree-optimized -Wno-psabi" } */ ++ ++typedef unsigned int u32v4 __attribute__((vector_size(16))); ++typedef unsigned short u16v16 __attribute__((vector_size(32))); ++typedef unsigned char u8v16 __attribute__((vector_size(16))); ++ ++union vec128 { ++ u8v16 u8; ++ u32v4 u32; ++}; ++ ++#define memcpy __builtin_memcpy ++ ++static u16v16 zxt(u8v16 x) ++{ ++ return (u16v16) { ++ x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], ++ x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15] ++ }; ++} ++ ++static u8v16 narrow(u16v16 x) ++{ ++ return (u8v16) { ++ x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], ++ x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15] ++ }; ++} ++ ++void f(char *dst, char *src, unsigned long n, unsigned c) ++{ ++ unsigned ia = 255 - (c >> 24); ++ ia += ia >> 7; ++ ++ union vec128 c4 = {0}, ia16 = {0}; ++ c4.u32 += c; ++ ia16.u8 += (unsigned char)ia; ++ ++ u16v16 c16 = (zxt(c4.u8) << 8) + 128; ++ ++ for (; n; src += 16, dst += 16, n -= 4) { ++ union vec128 s; ++ memcpy(&s, src, sizeof s); ++ s.u8 = narrow((zxt(s.u8)*zxt(ia16.u8) + c16) >> 8); ++ memcpy(dst, &s, sizeof s); ++ } ++} ++ ++/* { dg-final { scan-tree-dump-times "vec_unpack_lo" 3 "optimized" } } */ ++/* We're missing an opportunity to, after later optimizations, combine ++ a uniform CTOR with a vec_unpack_lo_expr to a CTOR on a converted ++ element. 
*/ ++/* { dg-final { scan-tree-dump-times "vec_unpack_lo" 2 "optimized" { xfail *-*-* } } } */ ++/* { dg-final { scan-tree-dump-times "VEC_PACK_TRUNC" 1 "optimized" } } */ ++/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 2 "optimized" } } */ +diff --git a/gcc/testsuite/gcc.target/i386/pr92803.c b/gcc/testsuite/gcc.target/i386/pr92803.c +new file mode 100644 +index 000000000..fc8d64efb +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr92803.c +@@ -0,0 +1,38 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -Wno-psabi -mavx2 -fdump-tree-forwprop1" } */ ++ ++typedef double v4df __attribute__((vector_size (32))); ++typedef float v8sf __attribute__((vector_size (32))); ++typedef float v4sf __attribute__((vector_size (16))); ++typedef int v4si __attribute__((vector_size (16))); ++typedef double v2df __attribute__((vector_size (16))); ++ ++v2df ++foo (v4df x, double *p, v2df y) ++{ ++ return (v2df) { x[3], *p }; ++} ++ ++v4sf ++bar (v4si x, float *p) ++{ ++ return (v4sf) { x[0], x[1], x[2], *p }; ++} ++ ++v4sf ++baz (v4si x) ++{ ++ return (v4sf) { x[0], x[1], 3.0f, 1.0f }; ++} ++ ++v4sf ++barf (v8sf x) ++{ ++ return (v4sf) { x[4], x[5], 1.0f, 2.0f }; ++} ++ ++/* We expect all CTORs to turn into permutes, the FP converting ones ++ to two each with the one with constants possibly elided in the future ++ by converting 3.0f and 1.0f "back" to integers. */ ++/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 6 "forwprop1" } } */ ++/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 5 "forwprop1" { xfail *-*-* } } } */ +diff --git a/gcc/testsuite/gfortran.dg/graphite/interchange-3.f90 b/gcc/testsuite/gfortran.dg/graphite/interchange-3.f90 +index 8070bbb4a..d827323ac 100644 +--- a/gcc/testsuite/gfortran.dg/graphite/interchange-3.f90 ++++ b/gcc/testsuite/gfortran.dg/graphite/interchange-3.f90 +@@ -23,5 +23,3 @@ Program FOO + 366 format(/, ' PC = ',E12.4,/,' UC = ',E12.4,/,' VC = ',E12.4,/) + + end Program FOO +- +-! { dg-final { scan-tree-dump "tiled" "graphite" } } +diff --git a/gcc/testsuite/gfortran.dg/pr88833.f90 b/gcc/testsuite/gfortran.dg/pr88833.f90 +new file mode 100644 +index 000000000..224e6ce5f +--- /dev/null ++++ b/gcc/testsuite/gfortran.dg/pr88833.f90 +@@ -0,0 +1,9 @@ ++! { dg-do assemble { target aarch64_asm_sve_ok } } ++! { dg-options "-O3 -march=armv8.2-a+sve --save-temps" } ++ ++subroutine foo(x) ++ real :: x(100) ++ x = x + 10 ++end subroutine foo ++ ++! 
{ dg-final { scan-assembler {\twhilelo\tp[0-9]+\.s, wzr, (w[0-9]+).*\twhilelo\tp[0-9]+\.s, w[0-9]+, \1} } } +diff --git a/gcc/testsuite/gnat.dg/opt39.adb b/gcc/testsuite/gnat.dg/opt39.adb +index 3b12cf201..0a5ef67a2 100644 +--- a/gcc/testsuite/gnat.dg/opt39.adb ++++ b/gcc/testsuite/gnat.dg/opt39.adb +@@ -27,4 +27,5 @@ begin + end if; + end; + +--- { dg-final { scan-tree-dump-times "MEM" 1 "optimized" } } ++-- { dg-final { scan-tree-dump-not "MEM" "optimized" } } ++-- { dg-final { scan-tree-dump-not "tmp" "optimized" } } +diff --git a/gcc/testsuite/lib/prune.exp b/gcc/testsuite/lib/prune.exp +index 812c59e6f..a9beef48e 100644 +--- a/gcc/testsuite/lib/prune.exp ++++ b/gcc/testsuite/lib/prune.exp +@@ -21,7 +21,7 @@ load_lib multiline.exp + if ![info exists TEST_ALWAYS_FLAGS] { + set TEST_ALWAYS_FLAGS "" + } +-set TEST_ALWAYS_FLAGS "-fno-diagnostics-show-caret -fno-diagnostics-show-line-numbers -fdiagnostics-color=never $TEST_ALWAYS_FLAGS" ++set TEST_ALWAYS_FLAGS "-fno-diagnostics-show-caret -fno-diagnostics-show-line-numbers -fdiagnostics-color=never -fdiagnostics-urls=never $TEST_ALWAYS_FLAGS" + + proc prune_gcc_output { text } { + global srcdir +diff --git a/gcc/testsuite/lib/scanasm.exp b/gcc/testsuite/lib/scanasm.exp +index 35ccbc86f..4ff39dab3 100644 +--- a/gcc/testsuite/lib/scanasm.exp ++++ b/gcc/testsuite/lib/scanasm.exp +@@ -546,3 +546,179 @@ proc scan-lto-assembler { args } { + verbose "output_file: $output_file" + dg-scan "scan-lto-assembler" 1 $testcase $output_file $args + } ++ ++# Read assembly file FILENAME and store a mapping from function names ++# to function bodies in array RESULT. FILENAME has already been uploaded ++# locally where necessary and is known to exist. ++ ++proc parse_function_bodies { filename result } { ++ upvar $result up_result ++ ++ # Regexp for the start of a function definition (name in \1). ++ set label {^([a-zA-Z_]\S+):$} ++ ++ # Regexp for the end of a function definition. ++ set terminator {^\s*\.size} ++ ++ # Regexp for lines that aren't interesting. ++ set fluff {^\s*(?:\.|//)} ++ ++ set fd [open $filename r] ++ set in_function 0 ++ while { [gets $fd line] >= 0 } { ++ if { [regexp $label $line dummy function_name] } { ++ set in_function 1 ++ set function_body "" ++ } elseif { $in_function } { ++ if { [regexp $terminator $line] } { ++ set up_result($function_name) $function_body ++ set in_function 0 ++ } elseif { ![regexp $fluff $line] } { ++ append function_body $line "\n" ++ } ++ } ++ } ++ close $fd ++} ++ ++# FUNCTIONS is an array that maps function names to function bodies. ++# Return true if it contains a definition of function NAME and if ++# that definition matches BODY_REGEXP. ++ ++proc check_function_body { functions name body_regexp } { ++ upvar $functions up_functions ++ ++ if { ![info exists up_functions($name)] } { ++ return 0 ++ } ++ return [regexp "^$body_regexp\$" $up_functions($name)] ++} ++ ++# Check the implementations of functions against expected output. Used as: ++# ++# { dg-do { check-function-bodies PREFIX TERMINATOR[ OPTION[ SELECTOR]] } } ++# ++# See sourcebuild.texi for details. 
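++#
++# Illustration only (a hypothetical test, not taken from sourcebuild.texi):
++# a C test typically enables the check with a dg-final directive and then
++# writes the expected body of each function in a comment, e.g.
++#
++#   /* { dg-final { check-function-bodies "**" "" "" } } */
++#
++#   /*
++#   ** foo:
++#   **	...
++#   **	ret
++#   */
++#
++# Here "**" is PREFIX, the empty string selects the default "*/" TERMINATOR,
++# "..." stands for any sequence of instructions, and the lines in between
++# become the regexp that the assembly emitted for foo must match.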
++ ++proc check-function-bodies { args } { ++ if { [llength $args] < 2 } { ++ error "too few arguments to check-function-bodies" ++ } ++ if { [llength $args] > 4 } { ++ error "too many arguments to check-function-bodies" ++ } ++ ++ if { [llength $args] >= 3 } { ++ set required_flag [lindex $args 2] ++ ++ upvar 2 dg-extra-tool-flags extra_tool_flags ++ set flags $extra_tool_flags ++ ++ global torture_current_flags ++ if { [info exists torture_current_flags] } { ++ append flags " " $torture_current_flags ++ } ++ if { ![regexp " $required_flag " $flags] } { ++ return ++ } ++ } ++ ++ set xfail_all 0 ++ if { [llength $args] >= 4 } { ++ switch [dg-process-target [lindex $args 3]] { ++ "S" { } ++ "N" { return } ++ "F" { set xfail_all 1 } ++ "P" { } ++ } ++ } ++ ++ set testcase [testname-for-summary] ++ # The name might include a list of options; extract the file name. ++ set filename [lindex $testcase 0] ++ ++ global srcdir ++ set input_filename "$srcdir/$filename" ++ set output_filename "[file rootname [file tail $filename]].s" ++ ++ set prefix [lindex $args 0] ++ set prefix_len [string length $prefix] ++ set terminator [lindex $args 1] ++ if { [string equal $terminator ""] } { ++ set terminator "*/" ++ } ++ set terminator_len [string length $terminator] ++ ++ set have_bodies 0 ++ if { [is_remote host] } { ++ remote_upload host "$filename" ++ } ++ if { [file exists $output_filename] } { ++ parse_function_bodies $output_filename functions ++ set have_bodies 1 ++ } else { ++ verbose -log "$testcase: output file does not exist" ++ } ++ ++ set count 0 ++ set function_regexp "" ++ set label {^(\S+):$} ++ ++ set lineno 1 ++ set fd [open $input_filename r] ++ set in_function 0 ++ while { [gets $fd line] >= 0 } { ++ if { [string equal -length $prefix_len $line $prefix] } { ++ set line [string trim [string range $line $prefix_len end]] ++ if { !$in_function } { ++ if { [regexp "^(.*\\S)\\s+{(.*)}\$" $line dummy \ ++ line selector] } { ++ set selector [dg-process-target $selector] ++ } else { ++ set selector "P" ++ } ++ if { ![regexp $label $line dummy function_name] } { ++ close $fd ++ error "check-function-bodies: line $lineno does not have a function label" ++ } ++ set in_function 1 ++ set function_regexp "" ++ } elseif { [string equal $line "("] } { ++ append function_regexp "(?:" ++ } elseif { [string equal $line "|"] } { ++ append function_regexp "|" ++ } elseif { [string equal $line ")"] } { ++ append function_regexp ")" ++ } elseif { [string equal $line "..."] } { ++ append function_regexp ".*" ++ } else { ++ append function_regexp "\t" $line "\n" ++ } ++ } elseif { [string equal -length $terminator_len $line $terminator] } { ++ if { ![string equal $selector "N"] } { ++ if { $xfail_all || [string equal $selector "F"] } { ++ setup_xfail "*-*-*" ++ } ++ set testname "$testcase check-function-bodies $function_name" ++ if { !$have_bodies } { ++ unresolved $testname ++ } elseif { [check_function_body functions $function_name \ ++ $function_regexp] } { ++ pass $testname ++ } else { ++ fail $testname ++ } ++ } ++ set in_function 0 ++ incr count ++ } ++ incr lineno ++ } ++ close $fd ++ if { $in_function } { ++ error "check-function-bodies: missing \"$terminator\"" ++ } ++ if { $count == 0 } { ++ error "check-function-bodies: no matches found" ++ } ++} +diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp +index ea9a50ccb..2eeb6883a 100644 +--- a/gcc/testsuite/lib/target-supports.exp ++++ b/gcc/testsuite/lib/target-supports.exp +@@ -3336,6 +3336,24 @@ proc 
check_effective_target_aarch64_sve { } { + }] + } + ++# Return 1 if this is an AArch64 target supporting SVE2. ++proc check_effective_target_aarch64_sve2 { } { ++ if { ![istarget aarch64*-*-*] } { ++ return 0 ++ } ++ return [check_no_compiler_messages aarch64_sve2 assembly { ++ #if !defined (__ARM_FEATURE_SVE2) ++ #error FOO ++ #endif ++ }] ++} ++ ++# Return 1 if this is an AArch64 target only supporting SVE (not SVE2). ++proc check_effective_target_aarch64_sve1_only { } { ++ return [expr { [check_effective_target_aarch64_sve] ++ && ![check_effective_target_aarch64_sve2] }] ++} ++ + # Return the size in bits of an SVE vector, or 0 if the size is variable. + proc aarch64_sve_bits { } { + return [check_cached_effective_target aarch64_sve_bits { +@@ -4356,6 +4374,22 @@ proc check_effective_target_aarch64_sve_hw { } { + }] + } + ++# Return true if this is an AArch64 target that can run SVE2 code. ++ ++proc check_effective_target_aarch64_sve2_hw { } { ++ if { ![istarget aarch64*-*-*] } { ++ return 0 ++ } ++ return [check_runtime aarch64_sve2_hw_available { ++ int ++ main (void) ++ { ++ asm volatile ("addp z0.b, p0/m, z0.b, z1.b"); ++ return 0; ++ } ++ }] ++} ++ + # Return true if this is an AArch64 target that can run SVE code and + # if its SVE vectors have exactly BITS bits. + +@@ -4569,6 +4603,49 @@ proc add_options_for_arm_v8_2a_dotprod_neon { flags } { + return "$flags $et_arm_v8_2a_dotprod_neon_flags" + } + ++# Return 1 if the target supports ARMv8.2+i8mm Adv.SIMD Dot Product ++# instructions, 0 otherwise. The test is valid for ARM and for AArch64. ++# Record the command line options needed. ++ ++proc check_effective_target_arm_v8_2a_i8mm_ok_nocache { } { ++ global et_arm_v8_2a_i8mm_flags ++ set et_arm_v8_2a_i8mm_flags "" ++ ++ if { ![istarget arm*-*-*] && ![istarget aarch64*-*-*] } { ++ return 0; ++ } ++ ++ # Iterate through sets of options to find the compiler flags that ++ # need to be added to the -march option. ++ foreach flags {"" "-mfloat-abi=hard -mfpu=neon-fp-armv8" "-mfloat-abi=softfp -mfpu=neon-fp-armv8" } { ++ if { [check_no_compiler_messages_nocache \ ++ arm_v8_2a_i8mm_ok object { ++ #include ++ #if !defined (__ARM_FEATURE_MATMUL_INT8) ++ #error "__ARM_FEATURE_MATMUL_INT8 not defined" ++ #endif ++ } "$flags -march=armv8.2-a+i8mm"] } { ++ set et_arm_v8_2a_i8mm_flags "$flags -march=armv8.2-a+i8mm" ++ return 1 ++ } ++ } ++ ++ return 0; ++} ++ ++proc check_effective_target_arm_v8_2a_i8mm_ok { } { ++ return [check_cached_effective_target arm_v8_2a_i8mm_ok \ ++ check_effective_target_arm_v8_2a_i8mm_ok_nocache] ++} ++ ++proc add_options_for_arm_v8_2a_i8mm { flags } { ++ if { ! [check_effective_target_arm_v8_2a_i8mm_ok] } { ++ return "$flags" ++ } ++ global et_arm_v8_2a_i8mm_flags ++ return "$flags $et_arm_v8_2a_i8mm_flags" ++} ++ + # Return 1 if the target supports FP16 VFMAL and VFMSL + # instructions, 0 otherwise. + # Record the command line options needed. +@@ -4614,6 +4691,45 @@ proc add_options_for_arm_fp16fml_neon { flags } { + return "$flags $et_arm_fp16fml_neon_flags" + } + ++# Return 1 if the target supports BFloat16 SIMD instructions, 0 otherwise. ++# The test is valid for ARM and for AArch64. 
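++#
++# Usage sketch (hypothetical test, for illustration only): a test relying
++# on these helpers would normally be gated and given the detected flags via
++#
++#   /* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
++#   /* { dg-add-options arm_v8_2a_bf16_neon } */
++#
++# so that add_options_for_arm_v8_2a_bf16_neon (defined below) supplies the
++# "-march=armv8.2-a+bf16" option, plus any float-abi/fpu flags, that the
++# check below detected.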
++ ++proc check_effective_target_arm_v8_2a_bf16_neon_ok_nocache { } { ++ global et_arm_v8_2a_bf16_neon_flags ++ set et_arm_v8_2a_bf16_neon_flags "" ++ ++ if { ![istarget arm*-*-*] && ![istarget aarch64*-*-*] } { ++ return 0; ++ } ++ ++ foreach flags {"" "-mfloat-abi=hard -mfpu=neon-fp-armv8" "-mfloat-abi=softfp -mfpu=neon-fp-armv8" } { ++ if { [check_no_compiler_messages_nocache arm_v8_2a_bf16_neon_ok object { ++ #include ++ #if !defined (__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) ++ #error "__ARM_FEATURE_BF16_VECTOR_ARITHMETIC not defined" ++ #endif ++ } "$flags -march=armv8.2-a+bf16"] } { ++ set et_arm_v8_2a_bf16_neon_flags "$flags -march=armv8.2-a+bf16" ++ return 1 ++ } ++ } ++ ++ return 0; ++} ++ ++proc check_effective_target_arm_v8_2a_bf16_neon_ok { } { ++ return [check_cached_effective_target arm_v8_2a_bf16_neon_ok \ ++ check_effective_target_arm_v8_2a_bf16_neon_ok_nocache] ++} ++ ++proc add_options_for_arm_v8_2a_bf16_neon { flags } { ++ if { ! [check_effective_target_arm_v8_2a_bf16_neon_ok] } { ++ return "$flags" ++ } ++ global et_arm_v8_2a_bf16_neon_flags ++ return "$flags $et_arm_v8_2a_bf16_neon_flags" ++} ++ + # Return 1 if the target supports executing ARMv8 NEON instructions, 0 + # otherwise. + +@@ -6093,7 +6209,24 @@ proc check_effective_target_vect_usad_char { } { + + proc check_effective_target_vect_avg_qi {} { + return [expr { [istarget aarch64*-*-*] +- && ![check_effective_target_aarch64_sve] }] ++ && ![check_effective_target_aarch64_sve1_only] }] ++} ++ ++# Return 1 if the target plus current options supports both signed ++# and unsigned multiply-high-with-round-and-scale operations ++# on vectors of half-words. ++ ++proc check_effective_target_vect_mulhrs_hi {} { ++ return [expr { [istarget aarch64*-*-*] ++ && [check_effective_target_aarch64_sve2] }] ++} ++ ++# Return 1 if the target plus current options supports signed division ++# by power-of-2 operations on vectors of 4-byte integers. ++ ++proc check_effective_target_vect_sdiv_pow2_si {} { ++ return [expr { [istarget aarch64*-*-*] ++ && [check_effective_target_aarch64_sve] }] + } + + # Return 1 if the target plus current options supports a vector +@@ -8579,7 +8712,8 @@ proc check_effective_target_aarch64_tiny { } { + # Create functions to check that the AArch64 assembler supports the + # various architecture extensions via the .arch_extension pseudo-op. 
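++#
++# For each extension FOO in the list below this generates an effective
++# target named aarch64_asm_FOO_ok, so the new entries give tests targets
++# such as aarch64_asm_i8mm_ok and aarch64_asm_bf16_ok.  A sketch of the
++# intended use (hypothetical test), mirroring the aarch64_asm_sve_ok use
++# in gfortran.dg/pr88833.f90 above:
++#
++#   /* { dg-do assemble { target aarch64_asm_i8mm_ok } } */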
+ +-foreach { aarch64_ext } { "fp" "simd" "crypto" "crc" "lse" "dotprod" "sve"} { ++foreach { aarch64_ext } { "fp" "simd" "crypto" "crc" "lse" "dotprod" "sve" ++ "i8mm" "f32mm" "f64mm" "bf16" } { + eval [string map [list FUNC $aarch64_ext] { + proc check_effective_target_aarch64_asm_FUNC_ok { } { + if { [istarget aarch64*-*-*] } { +diff --git a/gcc/testsuite/obj-c++.dg/stubify-1.mm b/gcc/testsuite/obj-c++.dg/stubify-1.mm +index e8f21882d..a32e28251 100644 +--- a/gcc/testsuite/obj-c++.dg/stubify-1.mm ++++ b/gcc/testsuite/obj-c++.dg/stubify-1.mm +@@ -4,7 +4,7 @@ + /* { dg-do compile { target *-*-darwin* } } */ + /* { dg-skip-if "" { *-*-* } { "-fgnu-runtime" } { "" } } */ + /* { dg-require-effective-target ilp32 } */ +-/* { dg-options "-mdynamic-no-pic -fno-exceptions -mmacosx-version-min=10.4 -msymbol-stubs" } */ ++/* { dg-options "-Os -mdynamic-no-pic -fno-exceptions -mmacosx-version-min=10.4 -msymbol-stubs" } */ + + typedef struct objc_object { } *id ; + int x = 41 ; +diff --git a/gcc/testsuite/obj-c++.dg/stubify-2.mm b/gcc/testsuite/obj-c++.dg/stubify-2.mm +index 1863f986c..69fea8def 100644 +--- a/gcc/testsuite/obj-c++.dg/stubify-2.mm ++++ b/gcc/testsuite/obj-c++.dg/stubify-2.mm +@@ -4,7 +4,7 @@ + /* { dg-do compile { target *-*-darwin* } } */ + /* { dg-skip-if "" { *-*-* } { "-fgnu-runtime" } { "" } } */ + /* { dg-require-effective-target ilp32 } */ +-/* { dg-options "-mdynamic-no-pic -mmacosx-version-min=10.4 -msymbol-stubs" } */ ++/* { dg-options "-mdynamic-no-pic -fdump-rtl-jump -mmacosx-version-min=10.4 -msymbol-stubs" } */ + + typedef struct objc_object { } *id ; + int x = 41 ; +@@ -30,6 +30,7 @@ extern int bogonic (int, int, int) ; + + /* Any symbol_ref of an un-stubified objc_msgSend is an error; look + for "objc_msgSend" in quotes, without the $stub suffix. */ ++/* { dg-final { scan-rtl-dump-not {symbol_ref.*"objc_msgSend"} "jump" { target powerpc*-*-darwin* } } } */ + + /* { dg-final { scan-assembler-not {(bl|call)[ \t]+_objc_msgSend\n} } } */ + /* { dg-final { scan-assembler {(bl|call)[ \t]+L_objc_msgSend\$stub\n} } } */ +diff --git a/gcc/testsuite/objc.dg/stubify-2.m b/gcc/testsuite/objc.dg/stubify-2.m +index 2930e46fc..904ac44b2 100644 +--- a/gcc/testsuite/objc.dg/stubify-2.m ++++ b/gcc/testsuite/objc.dg/stubify-2.m +@@ -4,7 +4,7 @@ + /* { dg-do compile { target *-*-darwin* } } */ + /* { dg-skip-if "" { *-*-* } { "-fgnu-runtime" } { "" } } */ + /* { dg-require-effective-target ilp32 } */ +-/* { dg-options "-mdynamic-no-pic -mmacosx-version-min=10.4 -msymbol-stubs" } */ ++/* { dg-options "-mdynamic-no-pic -fdump-rtl-jump -mmacosx-version-min=10.4 -msymbol-stubs" } */ + + typedef struct objc_object { } *id ; + int x = 41 ; +@@ -30,6 +30,7 @@ extern int bogonic (int, int, int) ; + + /* Any symbol_ref of an un-stubified objc_msgSend is an error; look + for "objc_msgSend" in quotes, without the $stub suffix. 
*/ ++/* { dg-final { scan-rtl-dump-not {symbol_ref.*"objc_msgSend"} "jump" { target powerpc*-*-darwin* } } } */ + + /* { dg-final { scan-assembler-not {(bl|call)[ \t]+_objc_msgSend\n} } } */ + /* { dg-final { scan-assembler {(bl|call)[ \t]+L_objc_msgSend\$stub\n} } } */ +diff --git a/gcc/trans-mem.c b/gcc/trans-mem.c +index 0581aae2d..8fc9f44d8 100644 +--- a/gcc/trans-mem.c ++++ b/gcc/trans-mem.c +@@ -3237,8 +3237,7 @@ expand_block_edges (struct tm_region *const region, basic_block bb) + || (gimple_call_flags (call_stmt) & ECF_TM_BUILTIN) == 0) + continue; + +- if (DECL_FUNCTION_CODE (gimple_call_fndecl (call_stmt)) +- == BUILT_IN_TM_ABORT) ++ if (gimple_call_builtin_p (call_stmt, BUILT_IN_TM_ABORT)) + { + // If we have a ``_transaction_cancel [[outer]]'', there is only + // one abnormal edge: to the transaction marked OUTER. +diff --git a/gcc/tree-call-cdce.c b/gcc/tree-call-cdce.c +index 2e482b37e..43f1ec6ee 100644 +--- a/gcc/tree-call-cdce.c ++++ b/gcc/tree-call-cdce.c +@@ -1074,9 +1074,7 @@ use_internal_fn (gcall *call) + { + gimple_stmt_iterator gsi = gsi_for_stmt (call); + gcall *new_call = gimple_build_call_internal (IFN_SET_EDOM, 0); +- gimple_set_vuse (new_call, gimple_vuse (call)); +- gimple_set_vdef (new_call, gimple_vdef (call)); +- SSA_NAME_DEF_STMT (gimple_vdef (new_call)) = new_call; ++ gimple_move_vops (new_call, call); + gimple_set_location (new_call, gimple_location (call)); + gsi_replace (&gsi, new_call, false); + call = new_call; +diff --git a/gcc/tree-cfg.c b/gcc/tree-cfg.c +index 621c8ea3d..527deffe4 100644 +--- a/gcc/tree-cfg.c ++++ b/gcc/tree-cfg.c +@@ -9547,7 +9547,8 @@ execute_fixup_cfg (void) + Keep access when store has side effect, i.e. in case when source + is volatile. */ + if (gimple_store_p (stmt) +- && !gimple_has_side_effects (stmt)) ++ && !gimple_has_side_effects (stmt) ++ && !optimize_debug) + { + tree lhs = get_base_address (gimple_get_lhs (stmt)); + +diff --git a/gcc/tree-core.h b/gcc/tree-core.h +index 41d052949..26b6f46ad 100644 +--- a/gcc/tree-core.h ++++ b/gcc/tree-core.h +@@ -1791,6 +1791,17 @@ struct GTY(()) tree_decl_non_common { + tree result; + }; + ++/* Classify a special function declaration type. */ ++ ++enum function_decl_type ++{ ++ NONE, ++ OPERATOR_NEW, ++ LAMBDA_FUNCTION ++ ++ /* 0 values left */ ++}; ++ + /* FUNCTION_DECL inherits from DECL_NON_COMMON because of the use of the + arguments/result/saved_tree fields by front ends. It was either inherit + FUNCTION_DECL from non_common, or inherit non_common from FUNCTION_DECL, +@@ -1815,34 +1826,32 @@ struct GTY(()) tree_function_decl { + /* Index within a virtual table. */ + tree vindex; + +- /* In a FUNCTION_DECL for which DECL_BUILT_IN holds, this is +- DECL_FUNCTION_CODE. Otherwise unused. +- ??? The bitfield needs to be able to hold all target function +- codes as well. */ +- ENUM_BITFIELD(built_in_function) function_code : 12; +- ENUM_BITFIELD(built_in_class) built_in_class : 2; ++ /* In a FUNCTION_DECL this is DECL_UNCHECKED_FUNCTION_CODE. 
*/ ++ unsigned int function_code; + ++ ENUM_BITFIELD(built_in_class) built_in_class : 2; + unsigned static_ctor_flag : 1; + unsigned static_dtor_flag : 1; +- + unsigned uninlinable : 1; + unsigned possibly_inlined : 1; + unsigned novops_flag : 1; + unsigned returns_twice_flag : 1; ++ + unsigned malloc_flag : 1; +- unsigned operator_new_flag : 1; + unsigned declared_inline_flag : 1; + unsigned no_inline_warning_flag : 1; +- + unsigned no_instrument_function_entry_exit : 1; + unsigned no_limit_stack : 1; + unsigned disregard_inline_limits : 1; + unsigned pure_flag : 1; + unsigned looping_const_or_pure_flag : 1; ++ ++ /* Align the bitfield to boundary of a byte. */ ++ ENUM_BITFIELD(function_decl_type) decl_type: 2; + unsigned has_debug_args_flag : 1; + unsigned versioned_function : 1; +- unsigned lambda_function: 1; +- /* No bits left. */ ++ ++ /* 12 bits left for future expansion. */ + }; + + struct GTY(()) tree_translation_unit_decl { +diff --git a/gcc/tree-if-conv.c b/gcc/tree-if-conv.c +index ac81e10a3..38ebe4092 100644 +--- a/gcc/tree-if-conv.c ++++ b/gcc/tree-if-conv.c +@@ -2142,9 +2142,7 @@ predicate_load_or_store (gimple_stmt_iterator *gsi, gassign *stmt, tree mask) + new_stmt + = gimple_build_call_internal (IFN_MASK_STORE, 4, addr, ptr, + mask, rhs); +- gimple_set_vuse (new_stmt, gimple_vuse (stmt)); +- gimple_set_vdef (new_stmt, gimple_vdef (stmt)); +- SSA_NAME_DEF_STMT (gimple_vdef (new_stmt)) = new_stmt; ++ gimple_move_vops (new_stmt, stmt); + } + gimple_call_set_nothrow (new_stmt, true); + return new_stmt; +diff --git a/gcc/tree-inline.c b/gcc/tree-inline.c +index 1110089fa..784ab48c1 100644 +--- a/gcc/tree-inline.c ++++ b/gcc/tree-inline.c +@@ -4585,7 +4585,7 @@ expand_call_inline (basic_block bb, gimple *stmt, copy_body_data *id, + /* PR 20090218-1_0.c. Body can be provided by another module. */ + && (reason != CIF_BODY_NOT_AVAILABLE || !flag_generate_lto)) + { +- error ("inlining failed in call to always_inline %q+F: %s", fn, ++ error ("inlining failed in call to % %q+F: %s", fn, + cgraph_inline_failed_string (reason)); + if (gimple_location (stmt) != UNKNOWN_LOCATION) + inform (gimple_location (stmt), "called from here"); +@@ -4834,7 +4834,7 @@ expand_call_inline (basic_block bb, gimple *stmt, copy_body_data *id, + we may get confused if the compiler sees that the inlined new + function returns a pointer which was just deleted. See bug + 33407. 
*/ +- if (DECL_IS_OPERATOR_NEW (fn)) ++ if (DECL_IS_OPERATOR_NEW_P (fn)) + { + return_slot = NULL; + modify_dest = NULL; +diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h +index 8741a9a49..1321a92c4 100644 +--- a/gcc/tree-pass.h ++++ b/gcc/tree-pass.h +@@ -584,8 +584,6 @@ extern rtl_opt_pass *make_pass_value_profile_transformations (gcc::context + extern rtl_opt_pass *make_pass_postreload_cse (gcc::context *ctxt); + extern rtl_opt_pass *make_pass_gcse2 (gcc::context *ctxt); + extern rtl_opt_pass *make_pass_split_after_reload (gcc::context *ctxt); +-extern rtl_opt_pass *make_pass_branch_target_load_optimize1 (gcc::context +- *ctxt); + extern rtl_opt_pass *make_pass_thread_prologue_and_epilogue (gcc::context + *ctxt); + extern rtl_opt_pass *make_pass_stack_adjustments (gcc::context *ctxt); +@@ -595,8 +593,6 @@ extern rtl_opt_pass *make_pass_if_after_reload (gcc::context *ctxt); + extern rtl_opt_pass *make_pass_regrename (gcc::context *ctxt); + extern rtl_opt_pass *make_pass_cprop_hardreg (gcc::context *ctxt); + extern rtl_opt_pass *make_pass_reorder_blocks (gcc::context *ctxt); +-extern rtl_opt_pass *make_pass_branch_target_load_optimize2 (gcc::context +- *ctxt); + extern rtl_opt_pass *make_pass_leaf_regs (gcc::context *ctxt); + extern rtl_opt_pass *make_pass_split_before_sched2 (gcc::context *ctxt); + extern rtl_opt_pass *make_pass_compare_elim_after_reload (gcc::context *ctxt); +diff --git a/gcc/tree-sra.c b/gcc/tree-sra.c +index 8e4baf013..c36bf96ef 100644 +--- a/gcc/tree-sra.c ++++ b/gcc/tree-sra.c +@@ -106,6 +106,7 @@ along with GCC; see the file COPYING3. If not see + #include "ipa-utils.h" + #include "builtins.h" + ++ + /* Enumeration of all aggregate reductions we can do. */ + enum sra_mode { SRA_MODE_EARLY_IPA, /* early call regularization */ + SRA_MODE_EARLY_INTRA, /* early intraprocedural SRA */ +@@ -220,8 +221,11 @@ struct access + is not propagated in the access tree in any direction. */ + unsigned grp_scalar_write : 1; + +- /* Is this access an artificial one created to scalarize some record +- entirely? */ ++ /* In a root of an access tree, true means that the entire tree should be ++ totally scalarized - that all scalar leafs should be scalarized and ++ non-root grp_total_scalarization accesses should be honored. Otherwise, ++ non-root accesses with grp_total_scalarization should never get scalar ++ replacements. */ + unsigned grp_total_scalarization : 1; + + /* Other passes of the analysis use this bit to make function +@@ -242,6 +246,10 @@ struct access + access tree. */ + unsigned grp_unscalarized_data : 1; + ++ /* Set if all accesses in the group consist of the same chain of ++ COMPONENT_REFs and ARRAY_REFs. */ ++ unsigned grp_same_access_path : 1; ++ + /* Does this access and/or group contain a write access through a + BIT_FIELD_REF? 
*/ + unsigned grp_partial_lhs : 1; +@@ -443,16 +451,18 @@ dump_access (FILE *f, struct access *access, bool grp) + "grp_scalar_write = %d, grp_total_scalarization = %d, " + "grp_hint = %d, grp_covered = %d, " + "grp_unscalarizable_region = %d, grp_unscalarized_data = %d, " +- "grp_partial_lhs = %d, grp_to_be_replaced = %d, " +- "grp_to_be_debug_replaced = %d, grp_maybe_modified = %d, " ++ "grp_same_access_path = %d, grp_partial_lhs = %d, " ++ "grp_to_be_replaced = %d, grp_to_be_debug_replaced = %d, " ++ "grp_maybe_modified = %d, " + "grp_not_necessarilly_dereferenced = %d\n", + access->grp_read, access->grp_write, access->grp_assignment_read, + access->grp_assignment_write, access->grp_scalar_read, + access->grp_scalar_write, access->grp_total_scalarization, + access->grp_hint, access->grp_covered, + access->grp_unscalarizable_region, access->grp_unscalarized_data, +- access->grp_partial_lhs, access->grp_to_be_replaced, +- access->grp_to_be_debug_replaced, access->grp_maybe_modified, ++ access->grp_same_access_path, access->grp_partial_lhs, ++ access->grp_to_be_replaced, access->grp_to_be_debug_replaced, ++ access->grp_maybe_modified, + access->grp_not_necessarilly_dereferenced); + else + fprintf (f, ", write = %d, grp_total_scalarization = %d, " +@@ -540,6 +550,15 @@ find_access_in_subtree (struct access *access, HOST_WIDE_INT offset, + access = child; + } + ++ /* Total scalarization does not replace single field structures with their ++ single field but rather creates an access for them underneath. Look for ++ it. */ ++ if (access) ++ while (access->first_child ++ && access->first_child->offset == offset ++ && access->first_child->size == size) ++ access = access->first_child; ++ + return access; + } + +@@ -971,7 +990,8 @@ create_access (tree expr, gimple *stmt, bool write) + static bool + scalarizable_type_p (tree type, bool const_decl) + { +- gcc_assert (!is_gimple_reg_type (type)); ++ if (is_gimple_reg_type (type)) ++ return true; + if (type_contains_placeholder_p (type)) + return false; + +@@ -986,8 +1006,7 @@ scalarizable_type_p (tree type, bool const_decl) + if (DECL_BIT_FIELD (fld)) + return false; + +- if (!is_gimple_reg_type (ft) +- && !scalarizable_type_p (ft, const_decl)) ++ if (!scalarizable_type_p (ft, const_decl)) + return false; + } + +@@ -1017,8 +1036,7 @@ scalarizable_type_p (tree type, bool const_decl) + return false; + + tree elem = TREE_TYPE (type); +- if (!is_gimple_reg_type (elem) +- && !scalarizable_type_p (elem, const_decl)) ++ if (!scalarizable_type_p (elem, const_decl)) + return false; + return true; + } +@@ -1027,114 +1045,6 @@ scalarizable_type_p (tree type, bool const_decl) + } + } + +-static void scalarize_elem (tree, HOST_WIDE_INT, HOST_WIDE_INT, bool, tree, tree); +- +-/* Create total_scalarization accesses for all scalar fields of a member +- of type DECL_TYPE conforming to scalarizable_type_p. BASE +- must be the top-most VAR_DECL representing the variable; within that, +- OFFSET locates the member and REF must be the memory reference expression for +- the member. 
*/ +- +-static void +-completely_scalarize (tree base, tree decl_type, HOST_WIDE_INT offset, tree ref) +-{ +- switch (TREE_CODE (decl_type)) +- { +- case RECORD_TYPE: +- for (tree fld = TYPE_FIELDS (decl_type); fld; fld = DECL_CHAIN (fld)) +- if (TREE_CODE (fld) == FIELD_DECL) +- { +- HOST_WIDE_INT pos = offset + int_bit_position (fld); +- tree ft = TREE_TYPE (fld); +- tree nref = build3 (COMPONENT_REF, ft, ref, fld, NULL_TREE); +- +- scalarize_elem (base, pos, tree_to_uhwi (DECL_SIZE (fld)), +- TYPE_REVERSE_STORAGE_ORDER (decl_type), +- nref, ft); +- } +- break; +- case ARRAY_TYPE: +- { +- tree elemtype = TREE_TYPE (decl_type); +- tree elem_size = TYPE_SIZE (elemtype); +- gcc_assert (elem_size && tree_fits_shwi_p (elem_size)); +- HOST_WIDE_INT el_size = tree_to_shwi (elem_size); +- gcc_assert (el_size > 0); +- +- tree minidx = TYPE_MIN_VALUE (TYPE_DOMAIN (decl_type)); +- gcc_assert (TREE_CODE (minidx) == INTEGER_CST); +- tree maxidx = TYPE_MAX_VALUE (TYPE_DOMAIN (decl_type)); +- /* Skip (some) zero-length arrays; others have MAXIDX == MINIDX - 1. */ +- if (maxidx) +- { +- gcc_assert (TREE_CODE (maxidx) == INTEGER_CST); +- tree domain = TYPE_DOMAIN (decl_type); +- /* MINIDX and MAXIDX are inclusive, and must be interpreted in +- DOMAIN (e.g. signed int, whereas min/max may be size_int). */ +- offset_int idx = wi::to_offset (minidx); +- offset_int max = wi::to_offset (maxidx); +- if (!TYPE_UNSIGNED (domain)) +- { +- idx = wi::sext (idx, TYPE_PRECISION (domain)); +- max = wi::sext (max, TYPE_PRECISION (domain)); +- } +- for (int el_off = offset; idx <= max; ++idx) +- { +- tree nref = build4 (ARRAY_REF, elemtype, +- ref, +- wide_int_to_tree (domain, idx), +- NULL_TREE, NULL_TREE); +- scalarize_elem (base, el_off, el_size, +- TYPE_REVERSE_STORAGE_ORDER (decl_type), +- nref, elemtype); +- el_off += el_size; +- } +- } +- } +- break; +- default: +- gcc_unreachable (); +- } +-} +- +-/* Create total_scalarization accesses for a member of type TYPE, which must +- satisfy either is_gimple_reg_type or scalarizable_type_p. BASE must be the +- top-most VAR_DECL representing the variable; within that, POS and SIZE locate +- the member, REVERSE gives its torage order. and REF must be the reference +- expression for it. */ +- +-static void +-scalarize_elem (tree base, HOST_WIDE_INT pos, HOST_WIDE_INT size, bool reverse, +- tree ref, tree type) +-{ +- if (is_gimple_reg_type (type)) +- { +- struct access *access = create_access_1 (base, pos, size); +- access->expr = ref; +- access->type = type; +- access->grp_total_scalarization = 1; +- access->reverse = reverse; +- /* Accesses for intraprocedural SRA can have their stmt NULL. */ +- } +- else +- completely_scalarize (base, type, pos, ref); +-} +- +-/* Create a total_scalarization access for VAR as a whole. VAR must be of a +- RECORD_TYPE or ARRAY_TYPE conforming to scalarizable_type_p. */ +- +-static void +-create_total_scalarization_access (tree var) +-{ +- HOST_WIDE_INT size = tree_to_uhwi (DECL_SIZE (var)); +- struct access *access; +- +- access = create_access_1 (var, 0, size); +- access->expr = var; +- access->type = TREE_TYPE (var); +- access->grp_total_scalarization = 1; +-} +- + /* Return true if REF has an VIEW_CONVERT_EXPR somewhere in it. */ + + static inline bool +@@ -1795,6 +1705,30 @@ build_ref_for_offset (location_t loc, tree base, poly_int64 offset, + return mem_ref; + } + ++/* Construct and return a memory reference that is equal to a portion of ++ MODEL->expr but is based on BASE. If this cannot be done, return NULL. 
*/ ++ ++static tree ++build_reconstructed_reference (location_t, tree base, struct access *model) ++{ ++ tree expr = model->expr, prev_expr = NULL; ++ while (!types_compatible_p (TREE_TYPE (expr), TREE_TYPE (base))) ++ { ++ if (!handled_component_p (expr)) ++ return NULL; ++ prev_expr = expr; ++ expr = TREE_OPERAND (expr, 0); ++ } ++ ++ if (get_object_alignment (base) < get_object_alignment (expr)) ++ return NULL; ++ ++ TREE_OPERAND (prev_expr, 0) = base; ++ tree ref = unshare_expr (model->expr); ++ TREE_OPERAND (prev_expr, 0) = expr; ++ return ref; ++} ++ + /* Construct a memory reference to a part of an aggregate BASE at the given + OFFSET and of the same type as MODEL. In case this is a reference to a + bit-field, the function will replicate the last component_ref of model's +@@ -1822,9 +1756,19 @@ build_ref_for_model (location_t loc, tree base, HOST_WIDE_INT offset, + NULL_TREE); + } + else +- return +- build_ref_for_offset (loc, base, offset, model->reverse, model->type, +- gsi, insert_after); ++ { ++ tree res; ++ if (model->grp_same_access_path ++ && !TREE_THIS_VOLATILE (base) ++ && offset <= model->offset ++ /* build_reconstructed_reference can still fail if we have already ++ massaged BASE because of another type incompatibility. */ ++ && (res = build_reconstructed_reference (loc, base, model))) ++ return res; ++ else ++ return build_ref_for_offset (loc, base, offset, model->reverse, ++ model->type, gsi, insert_after); ++ } + } + + /* Attempt to build a memory reference that we could but into a gimple +@@ -2076,6 +2020,69 @@ find_var_candidates (void) + return ret; + } + ++/* Return true if EXP is a reference chain of COMPONENT_REFs and AREAY_REFs ++ ending either with a DECL or a MEM_REF with zero offset. */ ++ ++static bool ++path_comparable_for_same_access (tree expr) ++{ ++ while (handled_component_p (expr)) ++ { ++ if (TREE_CODE (expr) == ARRAY_REF) ++ { ++ /* SSA name indices can occur here too when the array is of sie one. ++ But we cannot just re-use array_refs with SSA names elsewhere in ++ the function, so disallow non-constant indices. TODO: Remove this ++ limitation after teaching build_reconstructed_reference to replace ++ the index with the index type lower bound. */ ++ if (TREE_CODE (TREE_OPERAND (expr, 1)) != INTEGER_CST) ++ return false; ++ } ++ expr = TREE_OPERAND (expr, 0); ++ } ++ ++ if (TREE_CODE (expr) == MEM_REF) ++ { ++ if (!zerop (TREE_OPERAND (expr, 1))) ++ return false; ++ } ++ else ++ gcc_assert (DECL_P (expr)); ++ ++ return true; ++} ++ ++/* Assuming that EXP1 consists of only COMPONENT_REFs and ARRAY_REFs, return ++ true if the chain of these handled components are exactly the same as EXP2 ++ and the expression under them is the same DECL or an equivalent MEM_REF. ++ The reference picked by compare_access_positions must go to EXP1. */ ++ ++static bool ++same_access_path_p (tree exp1, tree exp2) ++{ ++ if (TREE_CODE (exp1) != TREE_CODE (exp2)) ++ { ++ /* Special case single-field structures loaded sometimes as the field ++ and sometimes as the structure. If the field is of a scalar type, ++ compare_access_positions will put it into exp1. ++ ++ TODO: The gimple register type condition can be removed if teach ++ compare_access_positions to put inner types first. 
*/ ++ if (is_gimple_reg_type (TREE_TYPE (exp1)) ++ && TREE_CODE (exp1) == COMPONENT_REF ++ && (TYPE_MAIN_VARIANT (TREE_TYPE (TREE_OPERAND (exp1, 0))) ++ == TYPE_MAIN_VARIANT (TREE_TYPE (exp2)))) ++ exp1 = TREE_OPERAND (exp1, 0); ++ else ++ return false; ++ } ++ ++ if (!operand_equal_p (exp1, exp2, OEP_ADDRESS_OF)) ++ return false; ++ ++ return true; ++} ++ + /* Sort all accesses for the given variable, check for partial overlaps and + return NULL if there are any. If there are none, pick a representative for + each combination of offset and size and create a linked list out of them. +@@ -2112,10 +2119,10 @@ sort_and_splice_var_accesses (tree var) + bool grp_assignment_read = access->grp_assignment_read; + bool grp_assignment_write = access->grp_assignment_write; + bool multiple_scalar_reads = false; +- bool total_scalarization = access->grp_total_scalarization; + bool grp_partial_lhs = access->grp_partial_lhs; + bool first_scalar = is_gimple_reg_type (access->type); + bool unscalarizable_region = access->grp_unscalarizable_region; ++ bool grp_same_access_path = true; + bool bf_non_full_precision + = (INTEGRAL_TYPE_P (access->type) + && TYPE_PRECISION (access->type) != access->size +@@ -2134,6 +2141,8 @@ sort_and_splice_var_accesses (tree var) + gcc_assert (access->offset >= low + && access->offset + access->size <= high); + ++ grp_same_access_path = path_comparable_for_same_access (access->expr); ++ + j = i + 1; + while (j < access_count) + { +@@ -2161,7 +2170,6 @@ sort_and_splice_var_accesses (tree var) + grp_assignment_write |= ac2->grp_assignment_write; + grp_partial_lhs |= ac2->grp_partial_lhs; + unscalarizable_region |= ac2->grp_unscalarizable_region; +- total_scalarization |= ac2->grp_total_scalarization; + relink_to_new_repr (access, ac2); + + /* If there are both aggregate-type and scalar-type accesses with +@@ -2184,6 +2192,11 @@ sort_and_splice_var_accesses (tree var) + } + unscalarizable_region = true; + } ++ ++ if (grp_same_access_path ++ && !same_access_path_p (access->expr, ac2->expr)) ++ grp_same_access_path = false; ++ + ac2->group_representative = access; + j++; + } +@@ -2197,11 +2210,10 @@ sort_and_splice_var_accesses (tree var) + access->grp_scalar_write = grp_scalar_write; + access->grp_assignment_read = grp_assignment_read; + access->grp_assignment_write = grp_assignment_write; +- access->grp_hint = total_scalarization +- || (multiple_scalar_reads && !constant_decl_p (var)); +- access->grp_total_scalarization = total_scalarization; ++ access->grp_hint = multiple_scalar_reads && !constant_decl_p (var); + access->grp_partial_lhs = grp_partial_lhs; + access->grp_unscalarizable_region = unscalarizable_region; ++ access->grp_same_access_path = grp_same_access_path; + + *prev_acc_ptr = access; + prev_acc_ptr = &access->next_grp; +@@ -2395,6 +2407,88 @@ build_access_trees (struct access *access) + return true; + } + ++/* Traverse the access forest where ROOT is the first root and verify that ++ various important invariants hold true. 
*/ ++ ++DEBUG_FUNCTION void ++verify_sra_access_forest (struct access *root) ++{ ++ struct access *access = root; ++ tree first_base = root->base; ++ gcc_assert (DECL_P (first_base)); ++ do ++ { ++ gcc_assert (access->base == first_base); ++ if (access->parent) ++ gcc_assert (access->offset >= access->parent->offset ++ && access->size <= access->parent->size); ++ if (access->next_sibling) ++ gcc_assert (access->next_sibling->offset ++ >= access->offset + access->size); ++ ++ poly_int64 poffset, psize, pmax_size; ++ bool reverse; ++ tree base = get_ref_base_and_extent (access->expr, &poffset, &psize, ++ &pmax_size, &reverse); ++ HOST_WIDE_INT offset, size, max_size; ++ if (!poffset.is_constant (&offset) ++ || !psize.is_constant (&size) ++ || !pmax_size.is_constant (&max_size)) ++ gcc_unreachable (); ++ gcc_assert (base == first_base); ++ gcc_assert (offset == access->offset); ++ gcc_assert (access->grp_unscalarizable_region ++ || size == max_size); ++ gcc_assert (max_size == access->size); ++ gcc_assert (reverse == access->reverse); ++ ++ if (access->first_child) ++ { ++ gcc_assert (access->first_child->parent == access); ++ access = access->first_child; ++ } ++ else if (access->next_sibling) ++ { ++ gcc_assert (access->next_sibling->parent == access->parent); ++ access = access->next_sibling; ++ } ++ else ++ { ++ while (access->parent && !access->next_sibling) ++ access = access->parent; ++ if (access->next_sibling) ++ access = access->next_sibling; ++ else ++ { ++ gcc_assert (access == root); ++ root = root->next_grp; ++ access = root; ++ } ++ } ++ } ++ while (access); ++} ++ ++/* Verify access forests of all candidates with accesses by calling ++ verify_access_forest on each on them. */ ++ ++DEBUG_FUNCTION void ++verify_all_sra_access_forests (void) ++{ ++ bitmap_iterator bi; ++ unsigned i; ++ EXECUTE_IF_SET_IN_BITMAP (candidate_bitmap, 0, i, bi) ++ { ++ tree var = candidate (i); ++ struct access *access = get_first_repr_for_decl (var); ++ if (access) ++ { ++ gcc_assert (access->base == var); ++ verify_sra_access_forest (access); ++ } ++ } ++} ++ + /* Return true if expr contains some ARRAY_REFs into a variable bounded + array. */ + +@@ -2412,15 +2506,16 @@ expr_with_var_bounded_array_refs_p (tree expr) + } + + /* Analyze the subtree of accesses rooted in ROOT, scheduling replacements when +- both seeming beneficial and when ALLOW_REPLACEMENTS allows it. Also set all +- sorts of access flags appropriately along the way, notably always set +- grp_read and grp_assign_read according to MARK_READ and grp_write when +- MARK_WRITE is true. ++ both seeming beneficial and when ALLOW_REPLACEMENTS allows it. If TOTALLY ++ is set, we are totally scalarizing the aggregate. Also set all sorts of ++ access flags appropriately along the way, notably always set grp_read and ++ grp_assign_read according to MARK_READ and grp_write when MARK_WRITE is ++ true. 
+ + Creating a replacement for a scalar access is considered beneficial if its +- grp_hint is set (this means we are either attempting total scalarization or +- there is more than one direct read access) or according to the following +- table: ++ grp_hint ot TOTALLY is set (this means either that there is more than one ++ direct read access or that we are attempting total scalarization) or ++ according to the following table: + + Access written to through a scalar type (once or more times) + | +@@ -2451,7 +2546,7 @@ expr_with_var_bounded_array_refs_p (tree expr) + + static bool + analyze_access_subtree (struct access *root, struct access *parent, +- bool allow_replacements) ++ bool allow_replacements, bool totally) + { + struct access *child; + HOST_WIDE_INT limit = root->offset + root->size; +@@ -2469,8 +2564,8 @@ analyze_access_subtree (struct access *root, struct access *parent, + root->grp_write = 1; + if (parent->grp_assignment_write) + root->grp_assignment_write = 1; +- if (parent->grp_total_scalarization) +- root->grp_total_scalarization = 1; ++ if (!parent->grp_same_access_path) ++ root->grp_same_access_path = 0; + } + + if (root->grp_unscalarizable_region) +@@ -2483,10 +2578,10 @@ analyze_access_subtree (struct access *root, struct access *parent, + { + hole |= covered_to < child->offset; + sth_created |= analyze_access_subtree (child, root, +- allow_replacements && !scalar); ++ allow_replacements && !scalar, ++ totally); + + root->grp_unscalarized_data |= child->grp_unscalarized_data; +- root->grp_total_scalarization &= child->grp_total_scalarization; + if (child->grp_covered) + covered_to += child->size; + else +@@ -2494,7 +2589,9 @@ analyze_access_subtree (struct access *root, struct access *parent, + } + + if (allow_replacements && scalar && !root->first_child +- && (root->grp_hint ++ && (totally || !root->grp_total_scalarization) ++ && (totally ++ || root->grp_hint + || ((root->grp_scalar_read || root->grp_assignment_read) + && (root->grp_scalar_write || root->grp_assignment_write)))) + { +@@ -2536,6 +2633,7 @@ analyze_access_subtree (struct access *root, struct access *parent, + { + if (allow_replacements + && scalar && !root->first_child ++ && !root->grp_total_scalarization + && (root->grp_scalar_write || root->grp_assignment_write) + && !bitmap_bit_p (cannot_scalarize_away_bitmap, + DECL_UID (root->base))) +@@ -2556,7 +2654,7 @@ analyze_access_subtree (struct access *root, struct access *parent, + root->grp_total_scalarization = 0; + } + +- if (!hole || root->grp_total_scalarization) ++ if (!hole || totally) + root->grp_covered = 1; + else if (root->grp_write || comes_initialized_p (root->base)) + root->grp_unscalarized_data = 1; /* not covered and written to */ +@@ -2572,7 +2670,8 @@ analyze_access_trees (struct access *access) + + while (access) + { +- if (analyze_access_subtree (access, NULL, true)) ++ if (analyze_access_subtree (access, NULL, true, ++ access->grp_total_scalarization)) + ret = true; + access = access->next_grp; + } +@@ -2638,6 +2737,7 @@ create_artificial_child_access (struct access *parent, struct access *model, + access->offset = new_offset; + access->size = model->size; + access->type = model->type; ++ access->parent = parent; + access->grp_write = set_grp_write; + access->grp_read = false; + access->reverse = model->reverse; +@@ -2721,13 +2821,17 @@ propagate_subaccesses_across_link (struct access *lacc, struct access *racc) + lacc->type = racc->type; + if (build_user_friendly_ref_for_offset (&t, TREE_TYPE (t), + lacc->offset, racc->type)) +- 
lacc->expr = t; ++ { ++ lacc->expr = t; ++ lacc->grp_same_access_path = true; ++ } + else + { + lacc->expr = build_ref_for_model (EXPR_LOCATION (lacc->base), + lacc->base, lacc->offset, + racc, NULL, false); + lacc->grp_no_warning = true; ++ lacc->grp_same_access_path = false; + } + } + return ret; +@@ -2840,6 +2944,369 @@ propagate_all_subaccesses (void) + } + } + ++/* Return true if the forest beginning with ROOT does not contain ++ unscalarizable regions or non-byte aligned accesses. */ ++ ++static bool ++can_totally_scalarize_forest_p (struct access *root) ++{ ++ struct access *access = root; ++ do ++ { ++ if (access->grp_unscalarizable_region ++ || (access->offset % BITS_PER_UNIT) != 0 ++ || (access->size % BITS_PER_UNIT) != 0 ++ || (is_gimple_reg_type (access->type) ++ && access->first_child)) ++ return false; ++ ++ if (access->first_child) ++ access = access->first_child; ++ else if (access->next_sibling) ++ access = access->next_sibling; ++ else ++ { ++ while (access->parent && !access->next_sibling) ++ access = access->parent; ++ if (access->next_sibling) ++ access = access->next_sibling; ++ else ++ { ++ gcc_assert (access == root); ++ root = root->next_grp; ++ access = root; ++ } ++ } ++ } ++ while (access); ++ return true; ++} ++ ++/* Create and return an ACCESS in PARENT spanning from POS with SIZE, TYPE and ++ reference EXPR for total scalarization purposes and mark it as such. Within ++ the children of PARENT, link it in between PTR and NEXT_SIBLING. */ ++ ++static struct access * ++create_total_scalarization_access (struct access *parent, HOST_WIDE_INT pos, ++ HOST_WIDE_INT size, tree type, tree expr, ++ struct access **ptr, ++ struct access *next_sibling) ++{ ++ struct access *access = access_pool.allocate (); ++ memset (access, 0, sizeof (struct access)); ++ access->base = parent->base; ++ access->offset = pos; ++ access->size = size; ++ access->expr = expr; ++ access->type = type; ++ access->parent = parent; ++ access->grp_write = parent->grp_write; ++ access->grp_total_scalarization = 1; ++ access->grp_hint = 1; ++ access->grp_same_access_path = path_comparable_for_same_access (expr); ++ access->reverse = reverse_storage_order_for_component_p (expr); ++ ++ access->next_sibling = next_sibling; ++ *ptr = access; ++ return access; ++} ++ ++/* Create and return an ACCESS in PARENT spanning from POS with SIZE, TYPE and ++ reference EXPR for total scalarization purposes and mark it as such, link it ++ at *PTR and reshape the tree so that those elements at *PTR and their ++ siblings which fall within the part described by POS and SIZE are moved to ++ be children of the new access. If a partial overlap is detected, return ++ NULL. 
*/ ++ ++static struct access * ++create_total_access_and_reshape (struct access *parent, HOST_WIDE_INT pos, ++ HOST_WIDE_INT size, tree type, tree expr, ++ struct access **ptr) ++{ ++ struct access **p = ptr; ++ ++ while (*p && (*p)->offset < pos + size) ++ { ++ if ((*p)->offset + (*p)->size > pos + size) ++ return NULL; ++ p = &(*p)->next_sibling; ++ } ++ ++ struct access *next_child = *ptr; ++ struct access *new_acc ++ = create_total_scalarization_access (parent, pos, size, type, expr, ++ ptr, *p); ++ if (p != ptr) ++ { ++ new_acc->first_child = next_child; ++ *p = NULL; ++ for (struct access *a = next_child; a; a = a->next_sibling) ++ a->parent = new_acc; ++ } ++ return new_acc; ++} ++ ++static bool totally_scalarize_subtree (struct access *root); ++ ++/* Return true if INNER is either the same type as OUTER or if it is the type ++ of a record field in OUTER at offset zero, possibly in nested ++ sub-records. */ ++ ++static bool ++access_and_field_type_match_p (tree outer, tree inner) ++{ ++ if (TYPE_MAIN_VARIANT (outer) == TYPE_MAIN_VARIANT (inner)) ++ return true; ++ if (TREE_CODE (outer) != RECORD_TYPE) ++ return false; ++ tree fld = TYPE_FIELDS (outer); ++ while (fld) ++ { ++ if (TREE_CODE (fld) == FIELD_DECL) ++ { ++ if (!zerop (DECL_FIELD_OFFSET (fld))) ++ return false; ++ if (TYPE_MAIN_VARIANT (TREE_TYPE (fld)) == inner) ++ return true; ++ if (TREE_CODE (TREE_TYPE (fld)) == RECORD_TYPE) ++ fld = TYPE_FIELDS (TREE_TYPE (fld)); ++ else ++ return false; ++ } ++ else ++ fld = DECL_CHAIN (fld); ++ } ++ return false; ++} ++ ++/* Return type of total_should_skip_creating_access indicating whether a total ++ scalarization access for a field/element should be created, whether it ++ already exists or whether the entire total scalarization has to fail. */ ++ ++enum total_sra_field_state {TOTAL_FLD_CREATE, TOTAL_FLD_DONE, TOTAL_FLD_FAILED}; ++ ++/* Do all the necessary steps in total scalarization when the given aggregate ++ type has a TYPE at POS with the given SIZE should be put into PARENT and ++ when we have processed all its siblings with smaller offsets up until and ++ including LAST_SEEN_SIBLING (which can be NULL). ++ ++ If some further siblings are to be skipped, set *LAST_SEEN_SIBLING as ++ appropriate. Return TOTAL_FLD_CREATE id the caller should carry on with ++ creating a new access, TOTAL_FLD_DONE if access or accesses capable of ++ representing the described part of the aggregate for the purposes of total ++ scalarization already exist or TOTAL_FLD_FAILED if there is a problem which ++ prevents total scalarization from happening at all. */ ++ ++static enum total_sra_field_state ++total_should_skip_creating_access (struct access *parent, ++ struct access **last_seen_sibling, ++ tree type, HOST_WIDE_INT pos, ++ HOST_WIDE_INT size) ++{ ++ struct access *next_child; ++ if (!*last_seen_sibling) ++ next_child = parent->first_child; ++ else ++ next_child = (*last_seen_sibling)->next_sibling; ++ ++ /* First, traverse the chain of siblings until it points to an access with ++ offset at least equal to POS. Check all skipped accesses whether they ++ span the POS boundary and if so, return with a failure. */ ++ while (next_child && next_child->offset < pos) ++ { ++ if (next_child->offset + next_child->size > pos) ++ return TOTAL_FLD_FAILED; ++ *last_seen_sibling = next_child; ++ next_child = next_child->next_sibling; ++ } ++ ++ /* Now check whether next_child has exactly the right POS and SIZE and if so, ++ whether it can represent what we need and can be totally scalarized ++ itself. 
*/ ++ if (next_child && next_child->offset == pos ++ && next_child->size == size) ++ { ++ if (!is_gimple_reg_type (next_child->type) ++ && (!access_and_field_type_match_p (type, next_child->type) ++ || !totally_scalarize_subtree (next_child))) ++ return TOTAL_FLD_FAILED; ++ ++ *last_seen_sibling = next_child; ++ return TOTAL_FLD_DONE; ++ } ++ ++ /* If the child we're looking at would partially overlap, we just cannot ++ totally scalarize. */ ++ if (next_child ++ && next_child->offset < pos + size ++ && next_child->offset + next_child->size > pos + size) ++ return TOTAL_FLD_FAILED; ++ ++ if (is_gimple_reg_type (type)) ++ { ++ /* We don't scalarize accesses that are children of other scalar type ++ accesses, so if we go on and create an access for a register type, ++ there should not be any pre-existing children. There are rare cases ++ where the requested type is a vector but we already have register ++ accesses for all its elements which is equally good. Detect that ++ situation or whether we need to bail out. */ ++ ++ HOST_WIDE_INT covered = pos; ++ bool skipping = false; ++ while (next_child ++ && next_child->offset + next_child->size <= pos + size) ++ { ++ if (next_child->offset != covered ++ || !is_gimple_reg_type (next_child->type)) ++ return TOTAL_FLD_FAILED; ++ ++ covered += next_child->size; ++ *last_seen_sibling = next_child; ++ next_child = next_child->next_sibling; ++ skipping = true; ++ } ++ ++ if (skipping) ++ { ++ if (covered != pos + size) ++ return TOTAL_FLD_FAILED; ++ else ++ return TOTAL_FLD_DONE; ++ } ++ } ++ ++ return TOTAL_FLD_CREATE; ++} ++ ++/* Go over sub-tree rooted in ROOT and attempt to create scalar accesses ++ spanning all uncovered areas covered by ROOT, return false if the attempt ++ failed. All created accesses will have grp_unscalarizable_region set (and ++ should be ignored if the function returns false). */ ++ ++static bool ++totally_scalarize_subtree (struct access *root) ++{ ++ gcc_checking_assert (!root->grp_unscalarizable_region); ++ gcc_checking_assert (!is_gimple_reg_type (root->type)); ++ ++ struct access *last_seen_sibling = NULL; ++ ++ switch (TREE_CODE (root->type)) ++ { ++ case RECORD_TYPE: ++ for (tree fld = TYPE_FIELDS (root->type); fld; fld = DECL_CHAIN (fld)) ++ if (TREE_CODE (fld) == FIELD_DECL) ++ { ++ tree ft = TREE_TYPE (fld); ++ HOST_WIDE_INT fsize = tree_to_uhwi (DECL_SIZE (fld)); ++ if (!fsize) ++ continue; ++ ++ HOST_WIDE_INT pos = root->offset + int_bit_position (fld); ++ enum total_sra_field_state ++ state = total_should_skip_creating_access (root, ++ &last_seen_sibling, ++ ft, pos, fsize); ++ switch (state) ++ { ++ case TOTAL_FLD_FAILED: ++ return false; ++ case TOTAL_FLD_DONE: ++ continue; ++ case TOTAL_FLD_CREATE: ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ ++ struct access **p = (last_seen_sibling ++ ? 
&last_seen_sibling->next_sibling ++ : &root->first_child); ++ tree nref = build3 (COMPONENT_REF, ft, root->expr, fld, NULL_TREE); ++ struct access *new_child ++ = create_total_access_and_reshape (root, pos, fsize, ft, nref, p); ++ if (!new_child) ++ return false; ++ ++ if (!is_gimple_reg_type (ft) ++ && !totally_scalarize_subtree (new_child)) ++ return false; ++ last_seen_sibling = new_child; ++ } ++ break; ++ case ARRAY_TYPE: ++ { ++ tree elemtype = TREE_TYPE (root->type); ++ tree elem_size = TYPE_SIZE (elemtype); ++ gcc_assert (elem_size && tree_fits_shwi_p (elem_size)); ++ HOST_WIDE_INT el_size = tree_to_shwi (elem_size); ++ gcc_assert (el_size > 0); ++ ++ tree minidx = TYPE_MIN_VALUE (TYPE_DOMAIN (root->type)); ++ gcc_assert (TREE_CODE (minidx) == INTEGER_CST); ++ tree maxidx = TYPE_MAX_VALUE (TYPE_DOMAIN (root->type)); ++ /* Skip (some) zero-length arrays; others have MAXIDX == MINIDX - 1. */ ++ if (!maxidx) ++ goto out; ++ gcc_assert (TREE_CODE (maxidx) == INTEGER_CST); ++ tree domain = TYPE_DOMAIN (root->type); ++ /* MINIDX and MAXIDX are inclusive, and must be interpreted in ++ DOMAIN (e.g. signed int, whereas min/max may be size_int). */ ++ offset_int idx = wi::to_offset (minidx); ++ offset_int max = wi::to_offset (maxidx); ++ if (!TYPE_UNSIGNED (domain)) ++ { ++ idx = wi::sext (idx, TYPE_PRECISION (domain)); ++ max = wi::sext (max, TYPE_PRECISION (domain)); ++ } ++ for (HOST_WIDE_INT pos = root->offset; ++ idx <= max; ++ pos += el_size, ++idx) ++ { ++ enum total_sra_field_state ++ state = total_should_skip_creating_access (root, ++ &last_seen_sibling, ++ elemtype, pos, ++ el_size); ++ switch (state) ++ { ++ case TOTAL_FLD_FAILED: ++ return false; ++ case TOTAL_FLD_DONE: ++ continue; ++ case TOTAL_FLD_CREATE: ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ ++ struct access **p = (last_seen_sibling ++ ? &last_seen_sibling->next_sibling ++ : &root->first_child); ++ tree nref = build4 (ARRAY_REF, elemtype, root->expr, ++ wide_int_to_tree (domain, idx), ++ NULL_TREE, NULL_TREE); ++ struct access *new_child ++ = create_total_access_and_reshape (root, pos, el_size, elemtype, ++ nref, p); ++ if (!new_child) ++ return false; ++ ++ if (!is_gimple_reg_type (elemtype) ++ && !totally_scalarize_subtree (new_child)) ++ return false; ++ last_seen_sibling = new_child; ++ } ++ } ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ ++ out: ++ return true; ++} ++ + /* Go through all accesses collected throughout the (intraprocedural) analysis + stage, exclude overlapping ones, identify representatives and build trees + out of them, making decisions about scalarization on the way. Return true +@@ -2852,8 +3319,22 @@ analyze_all_variable_accesses (void) + bitmap tmp = BITMAP_ALLOC (NULL); + bitmap_iterator bi; + unsigned i; +- bool optimize_speed_p = !optimize_function_for_size_p (cfun); + ++ bitmap_copy (tmp, candidate_bitmap); ++ EXECUTE_IF_SET_IN_BITMAP (tmp, 0, i, bi) ++ { ++ tree var = candidate (i); ++ struct access *access; ++ ++ access = sort_and_splice_var_accesses (var); ++ if (!access || !build_access_trees (access)) ++ disqualify_candidate (var, ++ "No or inhibitingly overlapping accesses."); ++ } ++ ++ propagate_all_subaccesses (); ++ ++ bool optimize_speed_p = !optimize_function_for_size_p (cfun); + enum compiler_param param = optimize_speed_p + ? 
PARAM_SRA_MAX_SCALARIZATION_SIZE_SPEED + : PARAM_SRA_MAX_SCALARIZATION_SIZE_SIZE; +@@ -2872,46 +3353,59 @@ analyze_all_variable_accesses (void) + && !bitmap_bit_p (cannot_scalarize_away_bitmap, i)) + { + tree var = candidate (i); ++ if (!VAR_P (var)) ++ continue; + +- if (VAR_P (var) && scalarizable_type_p (TREE_TYPE (var), +- constant_decl_p (var))) ++ if (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (var))) > max_scalarization_size) + { +- if (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (var))) +- <= max_scalarization_size) +- { +- create_total_scalarization_access (var); +- completely_scalarize (var, TREE_TYPE (var), 0, var); +- statistics_counter_event (cfun, +- "Totally-scalarized aggregates", 1); +- if (dump_file && (dump_flags & TDF_DETAILS)) +- { +- fprintf (dump_file, "Will attempt to totally scalarize "); +- print_generic_expr (dump_file, var); +- fprintf (dump_file, " (UID: %u): \n", DECL_UID (var)); +- } +- } +- else if (dump_file && (dump_flags & TDF_DETAILS)) ++ if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "Too big to totally scalarize: "); + print_generic_expr (dump_file, var); + fprintf (dump_file, " (UID: %u)\n", DECL_UID (var)); + } ++ continue; + } +- } + +- bitmap_copy (tmp, candidate_bitmap); +- EXECUTE_IF_SET_IN_BITMAP (tmp, 0, i, bi) +- { +- tree var = candidate (i); +- struct access *access; ++ bool all_types_ok = true; ++ for (struct access *access = get_first_repr_for_decl (var); ++ access; ++ access = access->next_grp) ++ if (!can_totally_scalarize_forest_p (access) ++ || !scalarizable_type_p (access->type, constant_decl_p (var))) ++ { ++ all_types_ok = false; ++ break; ++ } ++ if (!all_types_ok) ++ continue; + +- access = sort_and_splice_var_accesses (var); +- if (!access || !build_access_trees (access)) +- disqualify_candidate (var, +- "No or inhibitingly overlapping accesses."); +- } ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "Will attempt to totally scalarize "); ++ print_generic_expr (dump_file, var); ++ fprintf (dump_file, " (UID: %u): \n", DECL_UID (var)); ++ } ++ bool scalarized = true; ++ for (struct access *access = get_first_repr_for_decl (var); ++ access; ++ access = access->next_grp) ++ if (!is_gimple_reg_type (access->type) ++ && !totally_scalarize_subtree (access)) ++ { ++ scalarized = false; ++ break; ++ } + +- propagate_all_subaccesses (); ++ if (scalarized) ++ for (struct access *access = get_first_repr_for_decl (var); ++ access; ++ access = access->next_grp) ++ access->grp_total_scalarization = true; ++ } ++ ++ if (flag_checking) ++ verify_all_sra_access_forests (); + + bitmap_copy (tmp, candidate_bitmap); + EXECUTE_IF_SET_IN_BITMAP (tmp, 0, i, bi) +@@ -3775,25 +4269,39 @@ initialize_constant_pool_replacements (void) + tree var = candidate (i); + if (!constant_decl_p (var)) + continue; +- vec *access_vec = get_base_access_vector (var); +- if (!access_vec) +- continue; +- for (unsigned i = 0; i < access_vec->length (); i++) ++ ++ struct access *access = get_first_repr_for_decl (var); ++ ++ while (access) + { +- struct access *access = (*access_vec)[i]; +- if (!access->replacement_decl) +- continue; +- gassign *stmt +- = gimple_build_assign (get_access_replacement (access), +- unshare_expr (access->expr)); +- if (dump_file && (dump_flags & TDF_DETAILS)) ++ if (access->replacement_decl) + { +- fprintf (dump_file, "Generating constant initializer: "); +- print_gimple_stmt (dump_file, stmt, 0); +- fprintf (dump_file, "\n"); ++ gassign *stmt ++ = gimple_build_assign (get_access_replacement (access), ++ 
unshare_expr (access->expr)); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "Generating constant initializer: "); ++ print_gimple_stmt (dump_file, stmt, 0); ++ fprintf (dump_file, "\n"); ++ } ++ gsi_insert_after (&gsi, stmt, GSI_NEW_STMT); ++ update_stmt (stmt); ++ } ++ ++ if (access->first_child) ++ access = access->first_child; ++ else if (access->next_sibling) ++ access = access->next_sibling; ++ else ++ { ++ while (access->parent && !access->next_sibling) ++ access = access->parent; ++ if (access->next_sibling) ++ access = access->next_sibling; ++ else ++ access = access->next_grp; + } +- gsi_insert_after (&gsi, stmt, GSI_NEW_STMT); +- update_stmt (stmt); + } + } + +diff --git a/gcc/tree-ssa-address.c b/gcc/tree-ssa-address.c +index 2e5d87734..3195a21c7 100644 +--- a/gcc/tree-ssa-address.c ++++ b/gcc/tree-ssa-address.c +@@ -1141,6 +1141,35 @@ maybe_fold_tmr (tree ref) + return new_ref; + } + ++/* Return the preferred index scale factor for accessing memory of mode ++ MEM_MODE in the address space of pointer BASE. Assume that we're ++ optimizing for speed if SPEED is true and for size otherwise. */ ++unsigned int ++preferred_mem_scale_factor (tree base, machine_mode mem_mode, ++ bool speed) ++{ ++ struct mem_address parts = {}; ++ addr_space_t as = TYPE_ADDR_SPACE (TREE_TYPE (base)); ++ unsigned int fact = GET_MODE_UNIT_SIZE (mem_mode); ++ ++ /* Addressing mode "base + index". */ ++ parts.index = integer_one_node; ++ parts.base = integer_one_node; ++ rtx addr = addr_for_mem_ref (&parts, as, false); ++ unsigned cost = address_cost (addr, mem_mode, as, speed); ++ ++ /* Addressing mode "base + index << scale". */ ++ parts.step = wide_int_to_tree (sizetype, fact); ++ addr = addr_for_mem_ref (&parts, as, false); ++ unsigned new_cost = address_cost (addr, mem_mode, as, speed); ++ ++ /* Compare the cost of an address with an unscaled index with ++ a scaled index and return factor if useful. */ ++ if (new_cost < cost) ++ return GET_MODE_UNIT_SIZE (mem_mode); ++ return 1; ++} ++ + /* Dump PARTS to FILE. */ + + extern void dump_mem_address (FILE *, struct mem_address *); +diff --git a/gcc/tree-ssa-address.h b/gcc/tree-ssa-address.h +index 6fa4eae89..9812f36fb 100644 +--- a/gcc/tree-ssa-address.h ++++ b/gcc/tree-ssa-address.h +@@ -39,4 +39,7 @@ tree create_mem_ref (gimple_stmt_iterator *, tree, + extern void copy_ref_info (tree, tree); + tree maybe_fold_tmr (tree); + ++extern unsigned int preferred_mem_scale_factor (tree base, ++ machine_mode mem_mode, ++ bool speed); + #endif /* GCC_TREE_SSA_ADDRESS_H */ +diff --git a/gcc/tree-ssa-ccp.c b/gcc/tree-ssa-ccp.c +index 8db6a34e0..dbe2fda96 100644 +--- a/gcc/tree-ssa-ccp.c ++++ b/gcc/tree-ssa-ccp.c +@@ -614,9 +614,17 @@ get_value_for_expr (tree expr, bool for_bits_p) + val.mask = -1; + } + if (for_bits_p +- && val.lattice_val == CONSTANT +- && TREE_CODE (val.value) == ADDR_EXPR) +- val = get_value_from_alignment (val.value); ++ && val.lattice_val == CONSTANT) ++ { ++ if (TREE_CODE (val.value) == ADDR_EXPR) ++ val = get_value_from_alignment (val.value); ++ else if (TREE_CODE (val.value) != INTEGER_CST) ++ { ++ val.lattice_val = VARYING; ++ val.value = NULL_TREE; ++ val.mask = -1; ++ } ++ } + /* Fall back to a copy value. 
*/ + if (!for_bits_p + && val.lattice_val == VARYING +@@ -2566,7 +2574,7 @@ optimize_stack_restore (gimple_stmt_iterator i) + || ALLOCA_FUNCTION_CODE_P (DECL_FUNCTION_CODE (callee))) + return NULL_TREE; + +- if (DECL_FUNCTION_CODE (callee) == BUILT_IN_STACK_RESTORE) ++ if (fndecl_built_in_p (callee, BUILT_IN_STACK_RESTORE)) + goto second_stack_restore; + } + +@@ -2625,9 +2633,6 @@ optimize_stdarg_builtin (gimple *call) + bool va_list_simple_ptr; + location_t loc = gimple_location (call); + +- if (gimple_code (call) != GIMPLE_CALL) +- return NULL_TREE; +- + callee = gimple_call_fndecl (call); + + cfun_va_list = targetm.fn_abi_va_list (callee); +@@ -2930,12 +2935,10 @@ optimize_atomic_bit_test_and (gimple_stmt_iterator *gsip, + bit, flag); + gimple_call_set_lhs (g, new_lhs); + gimple_set_location (g, gimple_location (call)); +- gimple_set_vuse (g, gimple_vuse (call)); +- gimple_set_vdef (g, gimple_vdef (call)); ++ gimple_move_vops (g, call); + bool throws = stmt_can_throw_internal (cfun, call); + gimple_call_set_nothrow (as_a (g), + gimple_call_nothrow_p (as_a (call))); +- SSA_NAME_DEF_STMT (gimple_vdef (call)) = g; + gimple_stmt_iterator gsi = *gsip; + gsi_insert_after (&gsi, g, GSI_NEW_STMT); + edge e = NULL; +diff --git a/gcc/tree-ssa-dce.c b/gcc/tree-ssa-dce.c +index a38899edd..be9f501c9 100644 +--- a/gcc/tree-ssa-dce.c ++++ b/gcc/tree-ssa-dce.c +@@ -115,6 +115,14 @@ static bool cfg_altered; + static int *bb_postorder; + + ++/* True if we should treat any stmt with a vdef as necessary. */ ++ ++static inline bool ++keep_all_vdefs_p () ++{ ++ return optimize_debug; ++} ++ + /* If STMT is not already marked necessary, mark it, and add it to the + worklist if ADD_TO_WORKLIST is true. */ + +@@ -311,6 +319,12 @@ mark_stmt_if_obviously_necessary (gimple *stmt, bool aggressive) + return; + } + ++ if (gimple_vdef (stmt) && keep_all_vdefs_p ()) ++ { ++ mark_stmt_necessary (stmt, true); ++ return; ++ } ++ + return; + } + +@@ -526,6 +540,9 @@ mark_aliased_reaching_defs_necessary_1 (ao_ref *ref, tree vdef, void *data) + static void + mark_aliased_reaching_defs_necessary (gimple *stmt, tree ref) + { ++ /* Should have been caught before calling this function. */ ++ gcc_checking_assert (!keep_all_vdefs_p ()); ++ + unsigned int chain; + ao_ref refd; + gcc_assert (!chain_ovfl); +@@ -599,6 +616,8 @@ mark_all_reaching_defs_necessary_1 (ao_ref *ref ATTRIBUTE_UNUSED, + static void + mark_all_reaching_defs_necessary (gimple *stmt) + { ++ /* Should have been caught before calling this function. */ ++ gcc_checking_assert (!keep_all_vdefs_p ()); + walk_aliased_vdefs (NULL, gimple_vuse (stmt), + mark_all_reaching_defs_necessary_1, NULL, &visited); + } +@@ -798,6 +817,10 @@ propagate_necessity (bool aggressive) + if (!use) + continue; + ++ /* No need to search for vdefs if we intrinsicly keep them all. */ ++ if (keep_all_vdefs_p ()) ++ continue; ++ + /* If we dropped to simple mode make all immediately + reachable definitions necessary. 
*/ + if (chain_ovfl) +diff --git a/gcc/tree-ssa-forwprop.c b/gcc/tree-ssa-forwprop.c +index e753689a7..0d716062b 100644 +--- a/gcc/tree-ssa-forwprop.c ++++ b/gcc/tree-ssa-forwprop.c +@@ -2011,16 +2011,12 @@ get_bit_field_ref_def (tree val, enum tree_code &conv_code) + return NULL_TREE; + enum tree_code code = gimple_assign_rhs_code (def_stmt); + if (code == FLOAT_EXPR +- || code == FIX_TRUNC_EXPR) ++ || code == FIX_TRUNC_EXPR ++ || CONVERT_EXPR_CODE_P (code)) + { + tree op1 = gimple_assign_rhs1 (def_stmt); + if (conv_code == ERROR_MARK) +- { +- if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (val))), +- GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (op1))))) +- return NULL_TREE; +- conv_code = code; +- } ++ conv_code = code; + else if (conv_code != code) + return NULL_TREE; + if (TREE_CODE (op1) != SSA_NAME) +@@ -2041,109 +2037,213 @@ static bool + simplify_vector_constructor (gimple_stmt_iterator *gsi) + { + gimple *stmt = gsi_stmt (*gsi); +- tree op, op2, orig[2], type, elem_type; ++ tree op, orig[2], type, elem_type; + unsigned elem_size, i; + unsigned HOST_WIDE_INT nelts; ++ unsigned HOST_WIDE_INT refnelts; + enum tree_code conv_code; + constructor_elt *elt; + bool maybe_ident; + +- gcc_checking_assert (gimple_assign_rhs_code (stmt) == CONSTRUCTOR); +- + op = gimple_assign_rhs1 (stmt); + type = TREE_TYPE (op); +- gcc_checking_assert (TREE_CODE (type) == VECTOR_TYPE); ++ gcc_checking_assert (TREE_CODE (op) == CONSTRUCTOR ++ && TREE_CODE (type) == VECTOR_TYPE); + + if (!TYPE_VECTOR_SUBPARTS (type).is_constant (&nelts)) + return false; + elem_type = TREE_TYPE (type); + elem_size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type)); + +- vec_perm_builder sel (nelts, nelts, 1); + orig[0] = NULL; + orig[1] = NULL; + conv_code = ERROR_MARK; + maybe_ident = true; + tree one_constant = NULL_TREE; ++ tree one_nonconstant = NULL_TREE; + auto_vec constants; + constants.safe_grow_cleared (nelts); ++ auto_vec, 64> elts; + FOR_EACH_VEC_SAFE_ELT (CONSTRUCTOR_ELTS (op), i, elt) + { + tree ref, op1; ++ unsigned int elem; + + if (i >= nelts) + return false; + ++ /* Look for elements extracted and possibly converted from ++ another vector. */ + op1 = get_bit_field_ref_def (elt->value, conv_code); +- if (op1) ++ if (op1 ++ && TREE_CODE ((ref = TREE_OPERAND (op1, 0))) == SSA_NAME ++ && VECTOR_TYPE_P (TREE_TYPE (ref)) ++ && useless_type_conversion_p (TREE_TYPE (op1), ++ TREE_TYPE (TREE_TYPE (ref))) ++ && constant_multiple_p (bit_field_offset (op1), ++ bit_field_size (op1), &elem) ++ && TYPE_VECTOR_SUBPARTS (TREE_TYPE (ref)).is_constant (&refnelts)) + { +- ref = TREE_OPERAND (op1, 0); + unsigned int j; + for (j = 0; j < 2; ++j) + { + if (!orig[j]) + { +- if (TREE_CODE (ref) != SSA_NAME) +- return false; +- if (! VECTOR_TYPE_P (TREE_TYPE (ref)) +- || ! useless_type_conversion_p (TREE_TYPE (op1), +- TREE_TYPE (TREE_TYPE (ref)))) +- return false; +- if (j && !useless_type_conversion_p (TREE_TYPE (orig[0]), +- TREE_TYPE (ref))) +- return false; +- orig[j] = ref; +- break; ++ if (j == 0 ++ || useless_type_conversion_p (TREE_TYPE (orig[0]), ++ TREE_TYPE (ref))) ++ break; + } + else if (ref == orig[j]) + break; + } +- if (j == 2) +- return false; +- +- unsigned int elt; +- if (maybe_ne (bit_field_size (op1), elem_size) +- || !constant_multiple_p (bit_field_offset (op1), elem_size, &elt)) +- return false; +- if (j) +- elt += nelts; +- if (elt != i) +- maybe_ident = false; +- sel.quick_push (elt); ++ /* Found a suitable vector element. 
*/ ++ if (j < 2) ++ { ++ orig[j] = ref; ++ if (elem != i || j != 0) ++ maybe_ident = false; ++ elts.safe_push (std::make_pair (j, elem)); ++ continue; ++ } ++ /* Else fallthru. */ + } +- else if (CONSTANT_CLASS_P (elt->value)) ++ /* Handle elements not extracted from a vector. ++ 1. constants by permuting with constant vector ++ 2. a unique non-constant element by permuting with a splat vector */ ++ if (orig[1] ++ && orig[1] != error_mark_node) ++ return false; ++ orig[1] = error_mark_node; ++ if (CONSTANT_CLASS_P (elt->value)) + { +- if (orig[1] +- && orig[1] != error_mark_node) ++ if (one_nonconstant) + return false; +- orig[1] = error_mark_node; + if (!one_constant) + one_constant = elt->value; + constants[i] = elt->value; +- sel.quick_push (i + nelts); +- maybe_ident = false; + } + else +- return false; ++ { ++ if (one_constant) ++ return false; ++ if (!one_nonconstant) ++ one_nonconstant = elt->value; ++ else if (!operand_equal_p (one_nonconstant, elt->value, 0)) ++ return false; ++ } ++ elts.safe_push (std::make_pair (1, i)); ++ maybe_ident = false; + } + if (i < nelts) + return false; + +- if (! VECTOR_TYPE_P (TREE_TYPE (orig[0])) +- || maybe_ne (TYPE_VECTOR_SUBPARTS (type), +- TYPE_VECTOR_SUBPARTS (TREE_TYPE (orig[0])))) ++ if (! orig[0] ++ || ! VECTOR_TYPE_P (TREE_TYPE (orig[0]))) + return false; +- +- tree tem; +- if (conv_code != ERROR_MARK +- && (! supportable_convert_operation (conv_code, type, +- TREE_TYPE (orig[0]), +- &tem, &conv_code) +- || conv_code == CALL_EXPR)) ++ refnelts = TYPE_VECTOR_SUBPARTS (TREE_TYPE (orig[0])).to_constant (); ++ /* We currently do not handle larger destination vectors. */ ++ if (refnelts < nelts) + return false; + + if (maybe_ident) + { ++ tree conv_src_type ++ = (nelts != refnelts ++ ? (conv_code != ERROR_MARK ++ ? build_vector_type (TREE_TYPE (TREE_TYPE (orig[0])), nelts) ++ : type) ++ : TREE_TYPE (orig[0])); ++ if (conv_code != ERROR_MARK ++ && !supportable_convert_operation (conv_code, type, conv_src_type, ++ &conv_code)) ++ { ++ /* Only few targets implement direct conversion patterns so try ++ some simple special cases via VEC_[UN]PACK[_FLOAT]_LO_EXPR. */ ++ optab optab; ++ tree halfvectype, dblvectype; ++ if (CONVERT_EXPR_CODE_P (conv_code) ++ && (2 * TYPE_PRECISION (TREE_TYPE (TREE_TYPE (orig[0]))) ++ == TYPE_PRECISION (TREE_TYPE (type))) ++ && mode_for_vector (as_a ++ (TYPE_MODE (TREE_TYPE (TREE_TYPE (orig[0])))), ++ nelts * 2).exists () ++ && (dblvectype ++ = build_vector_type (TREE_TYPE (TREE_TYPE (orig[0])), ++ nelts * 2)) ++ && (optab = optab_for_tree_code (FLOAT_TYPE_P (TREE_TYPE (type)) ++ ? VEC_UNPACK_FLOAT_LO_EXPR ++ : VEC_UNPACK_LO_EXPR, ++ dblvectype, ++ optab_default)) ++ && (optab_handler (optab, TYPE_MODE (dblvectype)) ++ != CODE_FOR_nothing)) ++ { ++ gimple_seq stmts = NULL; ++ tree dbl; ++ if (refnelts == nelts) ++ { ++ /* ??? Paradoxical subregs don't exist, so insert into ++ the lower half of a wider zero vector. */ ++ dbl = gimple_build (&stmts, BIT_INSERT_EXPR, dblvectype, ++ build_zero_cst (dblvectype), orig[0], ++ bitsize_zero_node); ++ } ++ else if (refnelts == 2 * nelts) ++ dbl = orig[0]; ++ else ++ dbl = gimple_build (&stmts, BIT_FIELD_REF, dblvectype, ++ orig[0], TYPE_SIZE (dblvectype), ++ bitsize_zero_node); ++ gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); ++ gimple_assign_set_rhs_with_ops (gsi, ++ FLOAT_TYPE_P (TREE_TYPE (type)) ++ ? 
VEC_UNPACK_FLOAT_LO_EXPR ++ : VEC_UNPACK_LO_EXPR, ++ dbl); ++ } ++ else if (CONVERT_EXPR_CODE_P (conv_code) ++ && (TYPE_PRECISION (TREE_TYPE (TREE_TYPE (orig[0]))) ++ == 2 * TYPE_PRECISION (TREE_TYPE (type))) ++ && mode_for_vector (as_a ++ (TYPE_MODE ++ (TREE_TYPE (TREE_TYPE (orig[0])))), ++ nelts / 2).exists () ++ && (halfvectype ++ = build_vector_type (TREE_TYPE (TREE_TYPE (orig[0])), ++ nelts / 2)) ++ && (optab = optab_for_tree_code (VEC_PACK_TRUNC_EXPR, ++ halfvectype, ++ optab_default)) ++ && (optab_handler (optab, TYPE_MODE (halfvectype)) ++ != CODE_FOR_nothing)) ++ { ++ gimple_seq stmts = NULL; ++ tree low = gimple_build (&stmts, BIT_FIELD_REF, halfvectype, ++ orig[0], TYPE_SIZE (halfvectype), ++ bitsize_zero_node); ++ tree hig = gimple_build (&stmts, BIT_FIELD_REF, halfvectype, ++ orig[0], TYPE_SIZE (halfvectype), ++ TYPE_SIZE (halfvectype)); ++ gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); ++ gimple_assign_set_rhs_with_ops (gsi, VEC_PACK_TRUNC_EXPR, ++ low, hig); ++ } ++ else ++ return false; ++ update_stmt (gsi_stmt (*gsi)); ++ return true; ++ } ++ if (nelts != refnelts) ++ { ++ gassign *lowpart ++ = gimple_build_assign (make_ssa_name (conv_src_type), ++ build3 (BIT_FIELD_REF, conv_src_type, ++ orig[0], TYPE_SIZE (conv_src_type), ++ bitsize_zero_node)); ++ gsi_insert_before (gsi, lowpart, GSI_SAME_STMT); ++ orig[0] = gimple_assign_lhs (lowpart); ++ } + if (conv_code == ERROR_MARK) + gimple_assign_set_rhs_from_tree (gsi, orig[0]); + else +@@ -2152,54 +2252,119 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi) + } + else + { +- tree mask_type; ++ tree mask_type, perm_type, conv_src_type; ++ perm_type = TREE_TYPE (orig[0]); ++ conv_src_type = (nelts == refnelts ++ ? perm_type ++ : build_vector_type (TREE_TYPE (perm_type), nelts)); ++ if (conv_code != ERROR_MARK ++ && !supportable_convert_operation (conv_code, type, conv_src_type, ++ &conv_code)) ++ return false; + +- vec_perm_indices indices (sel, orig[1] ? 2 : 1, nelts); +- if (!can_vec_perm_const_p (TYPE_MODE (type), indices)) ++ /* Now that we know the number of elements of the source build the ++ permute vector. ++ ??? When the second vector has constant values we can shuffle ++ it and its source indexes to make the permutation supported. ++ For now it mimics a blend. */ ++ vec_perm_builder sel (refnelts, refnelts, 1); ++ bool all_same_p = true; ++ for (i = 0; i < elts.length (); ++i) ++ { ++ sel.quick_push (elts[i].second + elts[i].first * refnelts); ++ all_same_p &= known_eq (sel[i], sel[0]); ++ } ++ /* And fill the tail with "something". It's really don't care, ++ and ideally we'd allow VEC_PERM to have a smaller destination ++ vector. As a heuristic: ++ ++ (a) if what we have so far duplicates a single element, make the ++ tail do the same ++ ++ (b) otherwise preserve a uniform orig[0]. This facilitates ++ later pattern-matching of VEC_PERM_EXPR to a BIT_INSERT_EXPR. */ ++ for (; i < refnelts; ++i) ++ sel.quick_push (all_same_p ++ ? sel[0] ++ : (elts[0].second == 0 && elts[0].first == 0 ++ ? 0 : refnelts) + i); ++ vec_perm_indices indices (sel, orig[1] ? 
2 : 1, refnelts); ++ if (!can_vec_perm_const_p (TYPE_MODE (perm_type), indices)) + return false; + mask_type + = build_vector_type (build_nonstandard_integer_type (elem_size, 1), +- nelts); ++ refnelts); + if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_VECTOR_INT + || maybe_ne (GET_MODE_SIZE (TYPE_MODE (mask_type)), +- GET_MODE_SIZE (TYPE_MODE (type)))) ++ GET_MODE_SIZE (TYPE_MODE (perm_type)))) + return false; +- op2 = vec_perm_indices_to_tree (mask_type, indices); ++ tree op2 = vec_perm_indices_to_tree (mask_type, indices); ++ bool converted_orig1 = false; ++ gimple_seq stmts = NULL; + if (!orig[1]) + orig[1] = orig[0]; +- if (orig[1] == error_mark_node) ++ else if (orig[1] == error_mark_node ++ && one_nonconstant) + { +- tree_vector_builder vec (type, nelts, 1); +- for (unsigned i = 0; i < nelts; ++i) +- if (constants[i]) ++ /* ??? We can see if we can safely convert to the original ++ element type. */ ++ converted_orig1 = conv_code != ERROR_MARK; ++ orig[1] = gimple_build_vector_from_val (&stmts, UNKNOWN_LOCATION, ++ converted_orig1 ++ ? type : perm_type, ++ one_nonconstant); ++ } ++ else if (orig[1] == error_mark_node) ++ { ++ /* ??? See if we can convert the vector to the original type. */ ++ converted_orig1 = conv_code != ERROR_MARK; ++ unsigned n = converted_orig1 ? nelts : refnelts; ++ tree_vector_builder vec (converted_orig1 ++ ? type : perm_type, n, 1); ++ for (unsigned i = 0; i < n; ++i) ++ if (i < nelts && constants[i]) + vec.quick_push (constants[i]); + else + /* ??? Push a don't-care value. */ + vec.quick_push (one_constant); + orig[1] = vec.build (); + } +- if (conv_code == ERROR_MARK) +- gimple_assign_set_rhs_with_ops (gsi, VEC_PERM_EXPR, orig[0], +- orig[1], op2); +- else if (TREE_CODE (orig[1]) == VECTOR_CST) ++ tree blend_op2 = NULL_TREE; ++ if (converted_orig1) + { +- gimple *conv +- = gimple_build_assign (make_ssa_name (type), conv_code, orig[0]); +- orig[0] = gimple_assign_lhs (conv); +- gsi_insert_before (gsi, conv, GSI_SAME_STMT); +- gimple_assign_set_rhs_with_ops (gsi, VEC_PERM_EXPR, +- orig[0], orig[1], op2); +- } +- else +- { +- gimple *perm +- = gimple_build_assign (make_ssa_name (TREE_TYPE (orig[0])), +- VEC_PERM_EXPR, orig[0], orig[1], op2); +- orig[0] = gimple_assign_lhs (perm); +- gsi_insert_before (gsi, perm, GSI_SAME_STMT); +- gimple_assign_set_rhs_with_ops (gsi, conv_code, orig[0], +- NULL_TREE, NULL_TREE); ++ /* Make sure we can do a blend in the target type. */ ++ vec_perm_builder sel (nelts, nelts, 1); ++ for (i = 0; i < elts.length (); ++i) ++ sel.quick_push (elts[i].first ++ ? elts[i].second + nelts : i); ++ vec_perm_indices indices (sel, 2, nelts); ++ if (!can_vec_perm_const_p (TYPE_MODE (type), indices)) ++ return false; ++ mask_type ++ = build_vector_type (build_nonstandard_integer_type (elem_size, 1), ++ nelts); ++ if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_VECTOR_INT ++ || maybe_ne (GET_MODE_SIZE (TYPE_MODE (mask_type)), ++ GET_MODE_SIZE (TYPE_MODE (type)))) ++ return false; ++ blend_op2 = vec_perm_indices_to_tree (mask_type, indices); + } ++ tree orig1_for_perm ++ = converted_orig1 ? build_zero_cst (perm_type) : orig[1]; ++ tree res = gimple_build (&stmts, VEC_PERM_EXPR, perm_type, ++ orig[0], orig1_for_perm, op2); ++ if (nelts != refnelts) ++ res = gimple_build (&stmts, BIT_FIELD_REF, ++ conv_code != ERROR_MARK ? conv_src_type : type, ++ res, TYPE_SIZE (type), bitsize_zero_node); ++ if (conv_code != ERROR_MARK) ++ res = gimple_build (&stmts, conv_code, type, res); ++ /* Blend in the actual constant. 
*/ ++ if (converted_orig1) ++ res = gimple_build (&stmts, VEC_PERM_EXPR, type, ++ res, orig[1], blend_op2); ++ gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); ++ gimple_assign_set_rhs_with_ops (gsi, SSA_NAME, res); + } + update_stmt (gsi_stmt (*gsi)); + return true; +@@ -2449,6 +2614,72 @@ pass_forwprop::execute (function *fun) + else + gsi_next (&gsi); + } ++ else if (TREE_CODE (TREE_TYPE (lhs)) == VECTOR_TYPE ++ && TYPE_MODE (TREE_TYPE (lhs)) == BLKmode ++ && gimple_assign_load_p (stmt) ++ && !gimple_has_volatile_ops (stmt) ++ && (TREE_CODE (gimple_assign_rhs1 (stmt)) ++ != TARGET_MEM_REF) ++ && !stmt_can_throw_internal (cfun, stmt)) ++ { ++ /* Rewrite loads used only in BIT_FIELD_REF extractions to ++ component-wise loads. */ ++ use_operand_p use_p; ++ imm_use_iterator iter; ++ bool rewrite = true; ++ FOR_EACH_IMM_USE_FAST (use_p, iter, lhs) ++ { ++ gimple *use_stmt = USE_STMT (use_p); ++ if (is_gimple_debug (use_stmt)) ++ continue; ++ if (!is_gimple_assign (use_stmt) ++ || gimple_assign_rhs_code (use_stmt) != BIT_FIELD_REF) ++ { ++ rewrite = false; ++ break; ++ } ++ } ++ if (rewrite) ++ { ++ gimple *use_stmt; ++ FOR_EACH_IMM_USE_STMT (use_stmt, iter, lhs) ++ { ++ if (is_gimple_debug (use_stmt)) ++ { ++ if (gimple_debug_bind_p (use_stmt)) ++ { ++ gimple_debug_bind_reset_value (use_stmt); ++ update_stmt (use_stmt); ++ } ++ continue; ++ } ++ ++ tree bfr = gimple_assign_rhs1 (use_stmt); ++ tree new_rhs = fold_build3 (BIT_FIELD_REF, ++ TREE_TYPE (bfr), ++ unshare_expr (rhs), ++ TREE_OPERAND (bfr, 1), ++ TREE_OPERAND (bfr, 2)); ++ gimple *new_stmt ++ = gimple_build_assign (gimple_assign_lhs (use_stmt), ++ new_rhs); ++ ++ location_t loc = gimple_location (use_stmt); ++ gimple_set_location (new_stmt, loc); ++ gimple_stmt_iterator gsi2 = gsi_for_stmt (use_stmt); ++ unlink_stmt_vdef (use_stmt); ++ gsi_remove (&gsi2, true); ++ ++ gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT); ++ } ++ ++ release_defs (stmt); ++ gsi_remove (&gsi, true); ++ } ++ else ++ gsi_next (&gsi); ++ } ++ + else if (code == COMPLEX_EXPR) + { + /* Rewrite stores of a single-use complex build expression +@@ -2489,6 +2720,66 @@ pass_forwprop::execute (function *fun) + else + gsi_next (&gsi); + } ++ else if (code == CONSTRUCTOR ++ && VECTOR_TYPE_P (TREE_TYPE (rhs)) ++ && TYPE_MODE (TREE_TYPE (rhs)) == BLKmode ++ && CONSTRUCTOR_NELTS (rhs) > 0 ++ && (!VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value)) ++ || (TYPE_MODE (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value)) ++ != BLKmode))) ++ { ++ /* Rewrite stores of a single-use vector constructors ++ to component-wise stores if the mode isn't supported. 
*/ ++ use_operand_p use_p; ++ gimple *use_stmt; ++ if (single_imm_use (lhs, &use_p, &use_stmt) ++ && gimple_store_p (use_stmt) ++ && !gimple_has_volatile_ops (use_stmt) ++ && !stmt_can_throw_internal (cfun, use_stmt) ++ && is_gimple_assign (use_stmt) ++ && (TREE_CODE (gimple_assign_lhs (use_stmt)) ++ != TARGET_MEM_REF)) ++ { ++ tree elt_t = TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value); ++ unsigned HOST_WIDE_INT elt_w ++ = tree_to_uhwi (TYPE_SIZE (elt_t)); ++ unsigned HOST_WIDE_INT n ++ = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (rhs))); ++ for (unsigned HOST_WIDE_INT bi = 0; bi < n; bi += elt_w) ++ { ++ unsigned HOST_WIDE_INT ci = bi / elt_w; ++ tree new_rhs; ++ if (ci < CONSTRUCTOR_NELTS (rhs)) ++ new_rhs = CONSTRUCTOR_ELT (rhs, ci)->value; ++ else ++ new_rhs = build_zero_cst (elt_t); ++ tree use_lhs = gimple_assign_lhs (use_stmt); ++ tree new_lhs = build3 (BIT_FIELD_REF, ++ elt_t, ++ unshare_expr (use_lhs), ++ bitsize_int (elt_w), ++ bitsize_int (bi)); ++ gimple *new_stmt = gimple_build_assign (new_lhs, new_rhs); ++ location_t loc = gimple_location (use_stmt); ++ gimple_set_location (new_stmt, loc); ++ gimple_set_vuse (new_stmt, gimple_vuse (use_stmt)); ++ gimple_set_vdef (new_stmt, ++ make_ssa_name (gimple_vop (cfun))); ++ SSA_NAME_DEF_STMT (gimple_vdef (new_stmt)) = new_stmt; ++ gimple_set_vuse (use_stmt, gimple_vdef (new_stmt)); ++ gimple_stmt_iterator gsi2 = gsi_for_stmt (use_stmt); ++ gsi_insert_before (&gsi2, new_stmt, GSI_SAME_STMT); ++ } ++ gimple_stmt_iterator gsi2 = gsi_for_stmt (use_stmt); ++ unlink_stmt_vdef (use_stmt); ++ release_defs (use_stmt); ++ gsi_remove (&gsi2, true); ++ release_defs (stmt); ++ gsi_remove (&gsi, true); ++ } ++ else ++ gsi_next (&gsi); ++ } + else + gsi_next (&gsi); + } +diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c +index fec378490..695646764 100644 +--- a/gcc/tree-ssa-loop-ivopts.c ++++ b/gcc/tree-ssa-loop-ivopts.c +@@ -2461,11 +2461,13 @@ get_mem_type_for_internal_fn (gcall *call, tree *op_p) + switch (gimple_call_internal_fn (call)) + { + case IFN_MASK_LOAD: ++ case IFN_MASK_LOAD_LANES: + if (op_p == gimple_call_arg_ptr (call, 0)) + return TREE_TYPE (gimple_call_lhs (call)); + return NULL_TREE; + + case IFN_MASK_STORE: ++ case IFN_MASK_STORE_LANES: + if (op_p == gimple_call_arg_ptr (call, 0)) + return TREE_TYPE (gimple_call_arg (call, 3)); + return NULL_TREE; +@@ -3510,6 +3512,26 @@ add_iv_candidate_for_use (struct ivopts_data *data, struct iv_use *use) + basetype = sizetype; + record_common_cand (data, build_int_cst (basetype, 0), iv->step, use); + ++ /* Compare the cost of an address with an unscaled index with the cost of ++ an address with a scaled index and add candidate if useful. */ ++ poly_int64 step; ++ if (use != NULL ++ && poly_int_tree_p (iv->step, &step) ++ && address_p (use->type)) ++ { ++ poly_int64 new_step; ++ unsigned int fact = preferred_mem_scale_factor ++ (use->iv->base, ++ TYPE_MODE (use->mem_type), ++ optimize_loop_for_speed_p (data->current_loop)); ++ ++ if (fact != 1 ++ && multiple_p (step, fact, &new_step)) ++ add_candidate (data, size_int (0), ++ wide_int_to_tree (sizetype, new_step), ++ true, NULL); ++ } ++ + /* Record common candidate with constant offset stripped in base. + Like the use itself, we also add candidate directly for it. 
*/ + base = strip_offset (iv->base, &offset); +@@ -4036,6 +4058,94 @@ get_computation_at (struct loop *loop, gimple *at, + return fold_convert (type, aff_combination_to_tree (&aff)); + } + ++/* Like get_computation_at, but try harder, even if the computation ++ is more expensive. Intended for debug stmts. */ ++ ++static tree ++get_debug_computation_at (class loop *loop, gimple *at, ++ struct iv_use *use, struct iv_cand *cand) ++{ ++ if (tree ret = get_computation_at (loop, at, use, cand)) ++ return ret; ++ ++ tree ubase = use->iv->base, ustep = use->iv->step; ++ tree cbase = cand->iv->base, cstep = cand->iv->step; ++ tree var; ++ tree utype = TREE_TYPE (ubase), ctype = TREE_TYPE (cbase); ++ widest_int rat; ++ ++ /* We must have a precision to express the values of use. */ ++ if (TYPE_PRECISION (utype) >= TYPE_PRECISION (ctype)) ++ return NULL_TREE; ++ ++ /* Try to handle the case that get_computation_at doesn't, ++ try to express ++ use = ubase + (var - cbase) / ratio. */ ++ if (!constant_multiple_of (cstep, fold_convert (TREE_TYPE (cstep), ustep), ++ &rat)) ++ return NULL_TREE; ++ ++ bool neg_p = false; ++ if (wi::neg_p (rat)) ++ { ++ if (TYPE_UNSIGNED (ctype)) ++ return NULL_TREE; ++ neg_p = true; ++ rat = wi::neg (rat); ++ } ++ ++ /* If both IVs can wrap around and CAND doesn't have a power of two step, ++ it is unsafe. Consider uint16_t CAND with step 9, when wrapping around, ++ the values will be ... 0xfff0, 0xfff9, 2, 11 ... and when use is say ++ uint8_t with step 3, those values divided by 3 cast to uint8_t will be ++ ... 0x50, 0x53, 0, 3 ... rather than expected 0x50, 0x53, 0x56, 0x59. */ ++ if (!use->iv->no_overflow ++ && !cand->iv->no_overflow ++ && !integer_pow2p (cstep)) ++ return NULL_TREE; ++ ++ int bits = wi::exact_log2 (rat); ++ if (bits == -1) ++ bits = wi::floor_log2 (rat) + 1; ++ if (!cand->iv->no_overflow ++ && TYPE_PRECISION (utype) + bits > TYPE_PRECISION (ctype)) ++ return NULL_TREE; ++ ++ var = var_at_stmt (loop, cand, at); ++ ++ if (POINTER_TYPE_P (ctype)) ++ { ++ ctype = unsigned_type_for (ctype); ++ cbase = fold_convert (ctype, cbase); ++ cstep = fold_convert (ctype, cstep); ++ var = fold_convert (ctype, var); ++ } ++ ++ ubase = unshare_expr (ubase); ++ cbase = unshare_expr (cbase); ++ if (stmt_after_increment (loop, cand, at)) ++ var = fold_build2 (MINUS_EXPR, TREE_TYPE (var), var, ++ unshare_expr (cstep)); ++ ++ var = fold_build2 (MINUS_EXPR, TREE_TYPE (var), var, cbase); ++ var = fold_build2 (EXACT_DIV_EXPR, TREE_TYPE (var), var, ++ wide_int_to_tree (TREE_TYPE (var), rat)); ++ if (POINTER_TYPE_P (utype)) ++ { ++ var = fold_convert (sizetype, var); ++ if (neg_p) ++ var = fold_build1 (NEGATE_EXPR, sizetype, var); ++ var = fold_build2 (POINTER_PLUS_EXPR, utype, ubase, var); ++ } ++ else ++ { ++ var = fold_convert (utype, var); ++ var = fold_build2 (neg_p ? MINUS_EXPR : PLUS_EXPR, utype, ++ ubase, var); ++ } ++ return var; ++} ++ + /* Adjust the cost COST for being in loop setup rather than loop body. + If we're optimizing for space, the loop setup overhead is constant; + if we're optimizing for speed, amortize it over the per-iteration cost. +@@ -7122,6 +7232,8 @@ get_alias_ptr_type_for_ptr_address (iv_use *use) + { + case IFN_MASK_LOAD: + case IFN_MASK_STORE: ++ case IFN_MASK_LOAD_LANES: ++ case IFN_MASK_STORE_LANES: + /* The second argument contains the correct alias type. 
*/ + gcc_assert (use->op_p = gimple_call_arg_ptr (call, 0)); + return TREE_TYPE (gimple_call_arg (call, 1)); +@@ -7339,6 +7451,7 @@ remove_unused_ivs (struct ivopts_data *data, bitmap toremove) + struct iv_use dummy_use; + struct iv_cand *best_cand = NULL, *cand; + unsigned i, best_pref = 0, cand_pref; ++ tree comp = NULL_TREE; + + memset (&dummy_use, 0, sizeof (dummy_use)); + dummy_use.iv = info->iv; +@@ -7359,20 +7472,22 @@ remove_unused_ivs (struct ivopts_data *data, bitmap toremove) + ? 1 : 0; + if (best_cand == NULL || best_pref < cand_pref) + { +- best_cand = cand; +- best_pref = cand_pref; ++ tree this_comp ++ = get_debug_computation_at (data->current_loop, ++ SSA_NAME_DEF_STMT (def), ++ &dummy_use, cand); ++ if (this_comp) ++ { ++ best_cand = cand; ++ best_pref = cand_pref; ++ comp = this_comp; ++ } + } + } + + if (!best_cand) + continue; + +- tree comp = get_computation_at (data->current_loop, +- SSA_NAME_DEF_STMT (def), +- &dummy_use, best_cand); +- if (!comp) +- continue; +- + if (count > 1) + { + tree vexpr = make_node (DEBUG_EXPR_DECL); +diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c +index 3dfda7a4f..8607a59d4 100644 +--- a/gcc/tree-ssa-math-opts.c ++++ b/gcc/tree-ssa-math-opts.c +@@ -1040,14 +1040,9 @@ pass_cse_reciprocals::execute (function *fun) + else + stmt2 = gimple_build_call_internal_vec (ifn, args); + gimple_call_set_lhs (stmt2, arg1); +- if (gimple_vdef (call)) +- { +- gimple_set_vdef (stmt2, gimple_vdef (call)); +- SSA_NAME_DEF_STMT (gimple_vdef (stmt2)) = stmt2; +- } ++ gimple_move_vops (stmt2, call); + gimple_call_set_nothrow (stmt2, + gimple_call_nothrow_p (call)); +- gimple_set_vuse (stmt2, gimple_vuse (call)); + gimple_stmt_iterator gsi2 = gsi_for_stmt (call); + gsi_replace (&gsi2, stmt2, true); + } +@@ -3048,6 +3043,8 @@ last_fma_candidate_feeds_initial_phi (fma_deferring_state *state, + /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2 + with uses in additions and subtractions to form fused multiply-add + operations. Returns true if successful and MUL_STMT should be removed. ++ If MUL_COND is nonnull, the multiplication in MUL_STMT is conditional ++ on MUL_COND, otherwise it is unconditional. 
+ + If STATE indicates that we are deferring FMA transformation, that means + that we do not produce FMAs for basic blocks which look like: +@@ -3064,7 +3061,7 @@ last_fma_candidate_feeds_initial_phi (fma_deferring_state *state, + + static bool + convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2, +- fma_deferring_state *state) ++ fma_deferring_state *state, tree mul_cond = NULL_TREE) + { + tree mul_result = gimple_get_lhs (mul_stmt); + tree type = TREE_TYPE (mul_result); +@@ -3178,6 +3175,9 @@ convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2, + return false; + } + ++ if (mul_cond && cond != mul_cond) ++ return false; ++ + if (cond) + { + if (cond == result || else_value == result) +@@ -3789,38 +3789,48 @@ math_opts_dom_walker::after_dom_children (basic_block bb) + } + else if (is_gimple_call (stmt)) + { +- tree fndecl = gimple_call_fndecl (stmt); +- if (fndecl && gimple_call_builtin_p (stmt, BUILT_IN_NORMAL)) ++ switch (gimple_call_combined_fn (stmt)) + { +- switch (DECL_FUNCTION_CODE (fndecl)) ++ CASE_CFN_POW: ++ if (gimple_call_lhs (stmt) ++ && TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST ++ && real_equal (&TREE_REAL_CST (gimple_call_arg (stmt, 1)), ++ &dconst2) ++ && convert_mult_to_fma (stmt, ++ gimple_call_arg (stmt, 0), ++ gimple_call_arg (stmt, 0), ++ &fma_state)) + { +- case BUILT_IN_POWF: +- case BUILT_IN_POW: +- case BUILT_IN_POWL: +- if (gimple_call_lhs (stmt) +- && TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST +- && real_equal +- (&TREE_REAL_CST (gimple_call_arg (stmt, 1)), +- &dconst2) +- && convert_mult_to_fma (stmt, +- gimple_call_arg (stmt, 0), +- gimple_call_arg (stmt, 0), +- &fma_state)) +- { +- unlink_stmt_vdef (stmt); +- if (gsi_remove (&gsi, true) +- && gimple_purge_dead_eh_edges (bb)) +- *m_cfg_changed_p = true; +- release_defs (stmt); +- continue; +- } +- break; ++ unlink_stmt_vdef (stmt); ++ if (gsi_remove (&gsi, true) ++ && gimple_purge_dead_eh_edges (bb)) ++ *m_cfg_changed_p = true; ++ release_defs (stmt); ++ continue; ++ } ++ break; + +- default:; ++ case CFN_COND_MUL: ++ if (convert_mult_to_fma (stmt, ++ gimple_call_arg (stmt, 1), ++ gimple_call_arg (stmt, 2), ++ &fma_state, ++ gimple_call_arg (stmt, 0))) ++ ++ { ++ gsi_remove (&gsi, true); ++ release_defs (stmt); ++ continue; + } ++ break; ++ ++ case CFN_LAST: ++ cancel_fma_deferring (&fma_state); ++ break; ++ ++ default: ++ break; + } +- else +- cancel_fma_deferring (&fma_state); + } + gsi_next (&gsi); + } +diff --git a/gcc/tree-ssa-propagate.c b/gcc/tree-ssa-propagate.c +index 6b78dc1c0..0862f83e9 100644 +--- a/gcc/tree-ssa-propagate.c ++++ b/gcc/tree-ssa-propagate.c +@@ -625,8 +625,7 @@ finish_update_gimple_call (gimple_stmt_iterator *si_p, gimple *new_stmt, + { + gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt)); + move_ssa_defining_stmt_for_defs (new_stmt, stmt); +- gimple_set_vuse (new_stmt, gimple_vuse (stmt)); +- gimple_set_vdef (new_stmt, gimple_vdef (stmt)); ++ gimple_move_vops (new_stmt, stmt); + gimple_set_location (new_stmt, gimple_location (stmt)); + if (gimple_block (new_stmt) == NULL_TREE) + gimple_set_block (new_stmt, gimple_block (stmt)); +@@ -706,8 +705,7 @@ update_call_from_tree (gimple_stmt_iterator *si_p, tree expr) + STRIP_USELESS_TYPE_CONVERSION (expr); + new_stmt = gimple_build_assign (lhs, expr); + move_ssa_defining_stmt_for_defs (new_stmt, stmt); +- gimple_set_vuse (new_stmt, gimple_vuse (stmt)); +- gimple_set_vdef (new_stmt, gimple_vdef (stmt)); ++ gimple_move_vops (new_stmt, stmt); + } + else if (!TREE_SIDE_EFFECTS (expr)) + { +@@ -732,8 +730,7 @@ 
update_call_from_tree (gimple_stmt_iterator *si_p, tree expr) + else + lhs = create_tmp_var (TREE_TYPE (expr)); + new_stmt = gimple_build_assign (lhs, expr); +- gimple_set_vuse (new_stmt, gimple_vuse (stmt)); +- gimple_set_vdef (new_stmt, gimple_vdef (stmt)); ++ gimple_move_vops (new_stmt, stmt); + move_ssa_defining_stmt_for_defs (new_stmt, stmt); + } + gimple_set_location (new_stmt, gimple_location (stmt)); +diff --git a/gcc/tree-ssa-threadedge.c b/gcc/tree-ssa-threadedge.c +index 91494d761..096584062 100644 +--- a/gcc/tree-ssa-threadedge.c ++++ b/gcc/tree-ssa-threadedge.c +@@ -331,6 +331,7 @@ record_temporary_equivalences_from_stmts_at_dest (edge e, + { + tree fndecl = gimple_call_fndecl (stmt); + if (fndecl ++ && fndecl_built_in_p (fndecl, BUILT_IN_NORMAL) + && (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_OBJECT_SIZE + || DECL_FUNCTION_CODE (fndecl) == BUILT_IN_CONSTANT_P)) + continue; +diff --git a/gcc/tree-streamer-in.c b/gcc/tree-streamer-in.c +index f6d137316..eb3e174fc 100644 +--- a/gcc/tree-streamer-in.c ++++ b/gcc/tree-streamer-in.c +@@ -324,8 +324,7 @@ unpack_ts_decl_with_vis_value_fields (struct bitpack_d *bp, tree expr) + static void + unpack_ts_function_decl_value_fields (struct bitpack_d *bp, tree expr) + { +- DECL_BUILT_IN_CLASS (expr) = bp_unpack_enum (bp, built_in_class, +- BUILT_IN_LAST); ++ built_in_class cl = bp_unpack_enum (bp, built_in_class, BUILT_IN_LAST); + DECL_STATIC_CONSTRUCTOR (expr) = (unsigned) bp_unpack_value (bp, 1); + DECL_STATIC_DESTRUCTOR (expr) = (unsigned) bp_unpack_value (bp, 1); + DECL_UNINLINABLE (expr) = (unsigned) bp_unpack_value (bp, 1); +@@ -333,7 +332,7 @@ unpack_ts_function_decl_value_fields (struct bitpack_d *bp, tree expr) + DECL_IS_NOVOPS (expr) = (unsigned) bp_unpack_value (bp, 1); + DECL_IS_RETURNS_TWICE (expr) = (unsigned) bp_unpack_value (bp, 1); + DECL_IS_MALLOC (expr) = (unsigned) bp_unpack_value (bp, 1); +- DECL_IS_OPERATOR_NEW (expr) = (unsigned) bp_unpack_value (bp, 1); ++ DECL_SET_IS_OPERATOR_NEW (expr, (unsigned) bp_unpack_value (bp, 1)); + DECL_DECLARED_INLINE_P (expr) = (unsigned) bp_unpack_value (bp, 1); + DECL_STATIC_CHAIN (expr) = (unsigned) bp_unpack_value (bp, 1); + DECL_NO_INLINE_WARNING_P (expr) = (unsigned) bp_unpack_value (bp, 1); +@@ -343,22 +342,22 @@ unpack_ts_function_decl_value_fields (struct bitpack_d *bp, tree expr) + DECL_DISREGARD_INLINE_LIMITS (expr) = (unsigned) bp_unpack_value (bp, 1); + DECL_PURE_P (expr) = (unsigned) bp_unpack_value (bp, 1); + DECL_LOOPING_CONST_OR_PURE_P (expr) = (unsigned) bp_unpack_value (bp, 1); +- if (DECL_BUILT_IN_CLASS (expr) != NOT_BUILT_IN) ++ unsigned int fcode = 0; ++ if (cl != NOT_BUILT_IN) + { +- DECL_FUNCTION_CODE (expr) = (enum built_in_function) bp_unpack_value (bp, +- 12); +- if (DECL_BUILT_IN_CLASS (expr) == BUILT_IN_NORMAL +- && DECL_FUNCTION_CODE (expr) >= END_BUILTINS) ++ fcode = bp_unpack_value (bp, 32); ++ if (cl == BUILT_IN_NORMAL && fcode >= END_BUILTINS) + fatal_error (input_location, + "machine independent builtin code out of range"); +- else if (DECL_BUILT_IN_CLASS (expr) == BUILT_IN_MD) ++ else if (cl == BUILT_IN_MD) + { +- tree result = targetm.builtin_decl (DECL_FUNCTION_CODE (expr), true); ++ tree result = targetm.builtin_decl (fcode, true); + if (!result || result == error_mark_node) + fatal_error (input_location, + "target specific builtin not available"); + } + } ++ set_decl_built_in_function (expr, cl, fcode); + } + + +diff --git a/gcc/tree-streamer-out.c b/gcc/tree-streamer-out.c +index 3f619e830..12693f6f4 100644 +--- a/gcc/tree-streamer-out.c ++++ 
b/gcc/tree-streamer-out.c +@@ -295,7 +295,7 @@ pack_ts_function_decl_value_fields (struct bitpack_d *bp, tree expr) + bp_pack_value (bp, DECL_IS_NOVOPS (expr), 1); + bp_pack_value (bp, DECL_IS_RETURNS_TWICE (expr), 1); + bp_pack_value (bp, DECL_IS_MALLOC (expr), 1); +- bp_pack_value (bp, DECL_IS_OPERATOR_NEW (expr), 1); ++ bp_pack_value (bp, DECL_IS_OPERATOR_NEW_P (expr), 1); + bp_pack_value (bp, DECL_DECLARED_INLINE_P (expr), 1); + bp_pack_value (bp, DECL_STATIC_CHAIN (expr), 1); + bp_pack_value (bp, DECL_NO_INLINE_WARNING_P (expr), 1); +@@ -305,7 +305,7 @@ pack_ts_function_decl_value_fields (struct bitpack_d *bp, tree expr) + bp_pack_value (bp, DECL_PURE_P (expr), 1); + bp_pack_value (bp, DECL_LOOPING_CONST_OR_PURE_P (expr), 1); + if (DECL_BUILT_IN_CLASS (expr) != NOT_BUILT_IN) +- bp_pack_value (bp, DECL_FUNCTION_CODE (expr), 12); ++ bp_pack_value (bp, DECL_UNCHECKED_FUNCTION_CODE (expr), 32); + } + + +diff --git a/gcc/tree-vect-generic.c b/gcc/tree-vect-generic.c +index 39bc2a82b..8d97deaf2 100644 +--- a/gcc/tree-vect-generic.c ++++ b/gcc/tree-vect-generic.c +@@ -1671,7 +1671,6 @@ expand_vector_conversion (gimple_stmt_iterator *gsi) + gimple *g; + tree lhs = gimple_call_lhs (stmt); + tree arg = gimple_call_arg (stmt, 0); +- tree decl = NULL_TREE; + tree ret_type = TREE_TYPE (lhs); + tree arg_type = TREE_TYPE (arg); + tree new_rhs, compute_type = TREE_TYPE (arg_type); +@@ -1698,16 +1697,9 @@ expand_vector_conversion (gimple_stmt_iterator *gsi) + + if (modifier == NONE && (code == FIX_TRUNC_EXPR || code == FLOAT_EXPR)) + { +- if (supportable_convert_operation (code, ret_type, arg_type, &decl, +- &code1)) ++ if (supportable_convert_operation (code, ret_type, arg_type, &code1)) + { +- if (code1 == CALL_EXPR) +- { +- g = gimple_build_call (decl, 1, arg); +- gimple_call_set_lhs (g, lhs); +- } +- else +- g = gimple_build_assign (lhs, code1, arg); ++ g = gimple_build_assign (lhs, code1, arg); + gsi_replace (gsi, g, false); + return; + } +@@ -1726,11 +1718,11 @@ expand_vector_conversion (gimple_stmt_iterator *gsi) + tree ret1_type = build_vector_type (TREE_TYPE (ret_type), nelts); + tree arg1_type = build_vector_type (TREE_TYPE (arg_type), nelts); + if (supportable_convert_operation (code, ret1_type, arg1_type, +- &decl, &code1)) ++ &code1)) + { + new_rhs = expand_vector_piecewise (gsi, do_vec_conversion, + ret_type, arg1_type, arg, +- decl, code1); ++ NULL_TREE, code1); + g = gimple_build_assign (lhs, new_rhs); + gsi_replace (gsi, g, false); + return; +diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c +index 85be01748..b76728452 100644 +--- a/gcc/tree-vect-loop.c ++++ b/gcc/tree-vect-loop.c +@@ -5581,6 +5581,30 @@ vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest, + return lhs; + } + ++/* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the ++ type of the vector input. */ ++ ++static internal_fn ++get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in) ++{ ++ internal_fn mask_reduc_fn; ++ ++ switch (reduc_fn) ++ { ++ case IFN_FOLD_LEFT_PLUS: ++ mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS; ++ break; ++ ++ default: ++ return IFN_LAST; ++ } ++ ++ if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in, ++ OPTIMIZE_FOR_SPEED)) ++ return mask_reduc_fn; ++ return IFN_LAST; ++} ++ + /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the + statement that sets the live-out value. REDUC_DEF_STMT is the phi + statement. 
CODE is the operation performed by STMT_INFO and OPS are +@@ -5603,6 +5627,7 @@ vectorize_fold_left_reduction (stmt_vec_info stmt_info, + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + tree vectype_out = STMT_VINFO_VECTYPE (stmt_info); + stmt_vec_info new_stmt_info = NULL; ++ internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in); + + int ncopies; + if (slp_node) +@@ -5673,16 +5698,21 @@ vectorize_fold_left_reduction (stmt_vec_info stmt_info, + def0 = negated; + } + +- if (mask) ++ if (mask && mask_reduc_fn == IFN_LAST) + def0 = merge_with_identity (gsi, mask, vectype_out, def0, + vector_identity); + + /* On the first iteration the input is simply the scalar phi + result, and for subsequent iterations it is the output of + the preceding operation. */ +- if (reduc_fn != IFN_LAST) ++ if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST)) + { +- new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0); ++ if (mask && mask_reduc_fn != IFN_LAST) ++ new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var, ++ def0, mask); ++ else ++ new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, ++ def0); + /* For chained SLP reductions the output of the previous reduction + operation serves as the input of the next. For the final statement + the output cannot be a temporary - we reuse the original +@@ -5782,6 +5812,7 @@ use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn, + switch (code) + { + case DOT_PROD_EXPR: ++ case SAD_EXPR: + return true; + + default: +@@ -5811,6 +5842,17 @@ build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask, + break; + } + ++ case SAD_EXPR: ++ { ++ tree vectype = TREE_TYPE (vop[1]); ++ tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1"); ++ gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR, ++ mask, vop[1], vop[0]); ++ gsi_insert_before (gsi, select, GSI_SAME_STMT); ++ vop[1] = masked_op1; ++ break; ++ } ++ + default: + gcc_unreachable (); + } +diff --git a/gcc/tree-vect-patterns.c b/gcc/tree-vect-patterns.c +index 026148cc4..99df38711 100644 +--- a/gcc/tree-vect-patterns.c ++++ b/gcc/tree-vect-patterns.c +@@ -1302,7 +1302,7 @@ vect_recog_pow_pattern (stmt_vec_info stmt_vinfo, tree *type_out) + { + if (flag_unsafe_math_optimizations + && TREE_CODE (base) == REAL_CST +- && !gimple_call_internal_p (last_stmt)) ++ && gimple_call_builtin_p (last_stmt, BUILT_IN_NORMAL)) + { + combined_fn log_cfn; + built_in_function exp_bfn; +@@ -1728,6 +1728,175 @@ vect_recog_over_widening_pattern (stmt_vec_info last_stmt_info, tree *type_out) + return pattern_stmt; + } + ++/* Recognize the following patterns: ++ ++ ATYPE a; // narrower than TYPE ++ BTYPE b; // narrower than TYPE ++ ++ 1) Multiply high with scaling ++ TYPE res = ((TYPE) a * (TYPE) b) >> c; ++ 2) ... or also with rounding ++ TYPE res = (((TYPE) a * (TYPE) b) >> d + 1) >> 1; ++ ++ where only the bottom half of res is used. */ ++ ++static gimple * ++vect_recog_mulhs_pattern (stmt_vec_info last_stmt_info, tree *type_out) ++{ ++ /* Check for a right shift. */ ++ gassign *last_stmt = dyn_cast (last_stmt_info->stmt); ++ if (!last_stmt ++ || gimple_assign_rhs_code (last_stmt) != RSHIFT_EXPR) ++ return NULL; ++ vec_info *vinfo = last_stmt_info->vinfo; ++ ++ /* Check that the shift result is wider than the users of the ++ result need (i.e. that narrowing would be a natural choice). 
*/ ++ tree lhs_type = TREE_TYPE (gimple_assign_lhs (last_stmt)); ++ unsigned int target_precision ++ = vect_element_precision (last_stmt_info->min_output_precision); ++ if (!INTEGRAL_TYPE_P (lhs_type) ++ || target_precision >= TYPE_PRECISION (lhs_type)) ++ return NULL; ++ ++ /* Look through any change in sign on the outer shift input. */ ++ vect_unpromoted_value unprom_rshift_input; ++ tree rshift_input = vect_look_through_possible_promotion ++ (vinfo, gimple_assign_rhs1 (last_stmt), &unprom_rshift_input); ++ if (!rshift_input ++ || TYPE_PRECISION (TREE_TYPE (rshift_input)) ++ != TYPE_PRECISION (lhs_type)) ++ return NULL; ++ ++ /* Get the definition of the shift input. */ ++ stmt_vec_info rshift_input_stmt_info ++ = vect_get_internal_def (vinfo, rshift_input); ++ if (!rshift_input_stmt_info) ++ return NULL; ++ gassign *rshift_input_stmt ++ = dyn_cast (rshift_input_stmt_info->stmt); ++ if (!rshift_input_stmt) ++ return NULL; ++ ++ stmt_vec_info mulh_stmt_info; ++ tree scale_term; ++ internal_fn ifn; ++ unsigned int expect_offset; ++ ++ /* Check for the presence of the rounding term. */ ++ if (gimple_assign_rhs_code (rshift_input_stmt) == PLUS_EXPR) ++ { ++ /* Check that the outer shift was by 1. */ ++ if (!integer_onep (gimple_assign_rhs2 (last_stmt))) ++ return NULL; ++ ++ /* Check that the second operand of the PLUS_EXPR is 1. */ ++ if (!integer_onep (gimple_assign_rhs2 (rshift_input_stmt))) ++ return NULL; ++ ++ /* Look through any change in sign on the addition input. */ ++ vect_unpromoted_value unprom_plus_input; ++ tree plus_input = vect_look_through_possible_promotion ++ (vinfo, gimple_assign_rhs1 (rshift_input_stmt), &unprom_plus_input); ++ if (!plus_input ++ || TYPE_PRECISION (TREE_TYPE (plus_input)) ++ != TYPE_PRECISION (TREE_TYPE (rshift_input))) ++ return NULL; ++ ++ /* Get the definition of the multiply-high-scale part. */ ++ stmt_vec_info plus_input_stmt_info ++ = vect_get_internal_def (vinfo, plus_input); ++ if (!plus_input_stmt_info) ++ return NULL; ++ gassign *plus_input_stmt ++ = dyn_cast (plus_input_stmt_info->stmt); ++ if (!plus_input_stmt ++ || gimple_assign_rhs_code (plus_input_stmt) != RSHIFT_EXPR) ++ return NULL; ++ ++ /* Look through any change in sign on the scaling input. */ ++ vect_unpromoted_value unprom_scale_input; ++ tree scale_input = vect_look_through_possible_promotion ++ (vinfo, gimple_assign_rhs1 (plus_input_stmt), &unprom_scale_input); ++ if (!scale_input ++ || TYPE_PRECISION (TREE_TYPE (scale_input)) ++ != TYPE_PRECISION (TREE_TYPE (plus_input))) ++ return NULL; ++ ++ /* Get the definition of the multiply-high part. */ ++ mulh_stmt_info = vect_get_internal_def (vinfo, scale_input); ++ if (!mulh_stmt_info) ++ return NULL; ++ ++ /* Get the scaling term. */ ++ scale_term = gimple_assign_rhs2 (plus_input_stmt); ++ ++ expect_offset = target_precision + 2; ++ ifn = IFN_MULHRS; ++ } ++ else ++ { ++ mulh_stmt_info = rshift_input_stmt_info; ++ scale_term = gimple_assign_rhs2 (last_stmt); ++ ++ expect_offset = target_precision + 1; ++ ifn = IFN_MULHS; ++ } ++ ++ /* Check that the scaling factor is correct. */ ++ if (TREE_CODE (scale_term) != INTEGER_CST ++ || wi::to_widest (scale_term) + expect_offset ++ != TYPE_PRECISION (lhs_type)) ++ return NULL; ++ ++ /* Check whether the scaling input term can be seen as two widened ++ inputs multiplied together. 
*/ ++ vect_unpromoted_value unprom_mult[2]; ++ tree new_type; ++ unsigned int nops ++ = vect_widened_op_tree (mulh_stmt_info, MULT_EXPR, WIDEN_MULT_EXPR, ++ false, 2, unprom_mult, &new_type); ++ if (nops != 2) ++ return NULL; ++ ++ vect_pattern_detected ("vect_recog_mulhs_pattern", last_stmt); ++ ++ /* Adjust output precision. */ ++ if (TYPE_PRECISION (new_type) < target_precision) ++ new_type = build_nonstandard_integer_type ++ (target_precision, TYPE_UNSIGNED (new_type)); ++ ++ /* Check for target support. */ ++ tree new_vectype = get_vectype_for_scalar_type (vinfo, new_type); ++ if (!new_vectype ++ || !direct_internal_fn_supported_p ++ (ifn, new_vectype, OPTIMIZE_FOR_SPEED)) ++ return NULL; ++ ++ /* The IR requires a valid vector type for the cast result, even though ++ it's likely to be discarded. */ ++ *type_out = get_vectype_for_scalar_type (vinfo, lhs_type); ++ if (!*type_out) ++ return NULL; ++ ++ /* Generate the IFN_MULHRS call. */ ++ tree new_var = vect_recog_temp_ssa_var (new_type, NULL); ++ tree new_ops[2]; ++ vect_convert_inputs (last_stmt_info, 2, new_ops, new_type, ++ unprom_mult, new_vectype); ++ gcall *mulhrs_stmt ++ = gimple_build_call_internal (ifn, 2, new_ops[0], new_ops[1]); ++ gimple_call_set_lhs (mulhrs_stmt, new_var); ++ gimple_set_location (mulhrs_stmt, gimple_location (last_stmt)); ++ ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "created pattern stmt: %G", mulhrs_stmt); ++ ++ return vect_convert_output (last_stmt_info, lhs_type, ++ mulhrs_stmt, new_vectype); ++} ++ + /* Recognize the patterns: + + ATYPE a; // narrower than TYPE +@@ -2872,6 +3041,37 @@ vect_recog_divmod_pattern (stmt_vec_info stmt_vinfo, tree *type_out) + /* Pattern detected. */ + vect_pattern_detected ("vect_recog_divmod_pattern", last_stmt); + ++ *type_out = vectype; ++ ++ /* Check if the target supports this internal function. */ ++ internal_fn ifn = IFN_DIV_POW2; ++ if (direct_internal_fn_supported_p (ifn, vectype, OPTIMIZE_FOR_SPEED)) ++ { ++ tree shift = build_int_cst (itype, tree_log2 (oprnd1)); ++ ++ tree var_div = vect_recog_temp_ssa_var (itype, NULL); ++ gimple *div_stmt = gimple_build_call_internal (ifn, 2, oprnd0, shift); ++ gimple_call_set_lhs (div_stmt, var_div); ++ ++ if (rhs_code == TRUNC_MOD_EXPR) ++ { ++ append_pattern_def_seq (stmt_vinfo, div_stmt); ++ def_stmt ++ = gimple_build_assign (vect_recog_temp_ssa_var (itype, NULL), ++ LSHIFT_EXPR, var_div, shift); ++ append_pattern_def_seq (stmt_vinfo, def_stmt); ++ pattern_stmt ++ = gimple_build_assign (vect_recog_temp_ssa_var (itype, NULL), ++ MINUS_EXPR, oprnd0, ++ gimple_assign_lhs (def_stmt)); ++ } ++ else ++ pattern_stmt = div_stmt; ++ gimple_set_location (pattern_stmt, gimple_location (last_stmt)); ++ ++ return pattern_stmt; ++ } ++ + cond = build2 (LT_EXPR, boolean_type_node, oprnd0, + build_int_cst (itype, 0)); + if (rhs_code == TRUNC_DIV_EXPR +@@ -2948,7 +3148,6 @@ vect_recog_divmod_pattern (stmt_vec_info stmt_vinfo, tree *type_out) + signmask); + } + +- *type_out = vectype; + return pattern_stmt; + } + +@@ -4875,6 +5074,7 @@ static vect_recog_func vect_vect_recog_func_ptrs[] = { + /* Must come after over_widening, which narrows the shift as much as + possible beforehand. 
*/ + { vect_recog_average_pattern, "average" }, ++ { vect_recog_mulhs_pattern, "mult_high" }, + { vect_recog_cast_forwprop_pattern, "cast_forwprop" }, + { vect_recog_widen_mult_pattern, "widen_mult" }, + { vect_recog_dot_prod_pattern, "dot_prod" }, +diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c +index 82b868926..68a9f7574 100644 +--- a/gcc/tree-vect-stmts.c ++++ b/gcc/tree-vect-stmts.c +@@ -4497,7 +4497,6 @@ vectorizable_simd_clone_call (stmt_vec_info stmt_info, + + static gimple * + vect_gen_widened_results_half (enum tree_code code, +- tree decl, + tree vec_oprnd0, tree vec_oprnd1, int op_type, + tree vec_dest, gimple_stmt_iterator *gsi, + stmt_vec_info stmt_info) +@@ -4506,26 +4505,12 @@ vect_gen_widened_results_half (enum tree_code code, + tree new_temp; + + /* Generate half of the widened result: */ +- if (code == CALL_EXPR) +- { +- /* Target specific support */ +- if (op_type == binary_op) +- new_stmt = gimple_build_call (decl, 2, vec_oprnd0, vec_oprnd1); +- else +- new_stmt = gimple_build_call (decl, 1, vec_oprnd0); +- new_temp = make_ssa_name (vec_dest, new_stmt); +- gimple_call_set_lhs (new_stmt, new_temp); +- } +- else +- { +- /* Generic support */ +- gcc_assert (op_type == TREE_CODE_LENGTH (code)); +- if (op_type != binary_op) +- vec_oprnd1 = NULL; +- new_stmt = gimple_build_assign (vec_dest, code, vec_oprnd0, vec_oprnd1); +- new_temp = make_ssa_name (vec_dest, new_stmt); +- gimple_assign_set_lhs (new_stmt, new_temp); +- } ++ gcc_assert (op_type == TREE_CODE_LENGTH (code)); ++ if (op_type != binary_op) ++ vec_oprnd1 = NULL; ++ new_stmt = gimple_build_assign (vec_dest, code, vec_oprnd0, vec_oprnd1); ++ new_temp = make_ssa_name (vec_dest, new_stmt); ++ gimple_assign_set_lhs (new_stmt, new_temp); + vect_finish_stmt_generation (stmt_info, new_stmt, gsi); + + return new_stmt; +@@ -4651,8 +4636,7 @@ vect_create_vectorized_promotion_stmts (vec *vec_oprnds0, + stmt_vec_info stmt_info, tree vec_dest, + gimple_stmt_iterator *gsi, + enum tree_code code1, +- enum tree_code code2, tree decl1, +- tree decl2, int op_type) ++ enum tree_code code2, int op_type) + { + int i; + tree vop0, vop1, new_tmp1, new_tmp2; +@@ -4668,10 +4652,10 @@ vect_create_vectorized_promotion_stmts (vec *vec_oprnds0, + vop1 = NULL_TREE; + + /* Generate the two halves of promotion operation. 
*/ +- new_stmt1 = vect_gen_widened_results_half (code1, decl1, vop0, vop1, ++ new_stmt1 = vect_gen_widened_results_half (code1, vop0, vop1, + op_type, vec_dest, gsi, + stmt_info); +- new_stmt2 = vect_gen_widened_results_half (code2, decl2, vop0, vop1, ++ new_stmt2 = vect_gen_widened_results_half (code2, vop0, vop1, + op_type, vec_dest, gsi, + stmt_info); + if (is_gimple_call (new_stmt1)) +@@ -4712,7 +4696,6 @@ vectorizable_conversion (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK; + enum tree_code codecvt1 = ERROR_MARK, codecvt2 = ERROR_MARK; +- tree decl1 = NULL_TREE, decl2 = NULL_TREE; + tree new_temp; + enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type}; + int ndts = 2; +@@ -4883,8 +4866,7 @@ vectorizable_conversion (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + && code != FLOAT_EXPR + && !CONVERT_EXPR_CODE_P (code)) + return false; +- if (supportable_convert_operation (code, vectype_out, vectype_in, +- &decl1, &code1)) ++ if (supportable_convert_operation (code, vectype_out, vectype_in, &code1)) + break; + /* FALLTHRU */ + unsupported: +@@ -4924,7 +4906,7 @@ vectorizable_conversion (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + if (GET_MODE_SIZE (rhs_mode) == fltsz) + { + if (!supportable_convert_operation (code, vectype_out, +- cvt_type, &decl1, &codecvt1)) ++ cvt_type, &codecvt1)) + goto unsupported; + } + else if (!supportable_widening_operation (code, stmt_info, +@@ -4975,7 +4957,7 @@ vectorizable_conversion (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + if (cvt_type == NULL_TREE) + goto unsupported; + if (!supportable_convert_operation (code, cvt_type, vectype_in, +- &decl1, &codecvt1)) ++ &codecvt1)) + goto unsupported; + if (supportable_narrowing_operation (NOP_EXPR, vectype_out, cvt_type, + &code1, &multi_step_cvt, +@@ -5084,24 +5066,12 @@ vectorizable_conversion (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + { + stmt_vec_info new_stmt_info; + /* Arguments are ready, create the new vector stmt. 
*/ +- if (code1 == CALL_EXPR) +- { +- gcall *new_stmt = gimple_build_call (decl1, 1, vop0); +- new_temp = make_ssa_name (vec_dest, new_stmt); +- gimple_call_set_lhs (new_stmt, new_temp); +- new_stmt_info +- = vect_finish_stmt_generation (stmt_info, new_stmt, gsi); +- } +- else +- { +- gcc_assert (TREE_CODE_LENGTH (code1) == unary_op); +- gassign *new_stmt +- = gimple_build_assign (vec_dest, code1, vop0); +- new_temp = make_ssa_name (vec_dest, new_stmt); +- gimple_assign_set_lhs (new_stmt, new_temp); +- new_stmt_info +- = vect_finish_stmt_generation (stmt_info, new_stmt, gsi); +- } ++ gcc_assert (TREE_CODE_LENGTH (code1) == unary_op); ++ gassign *new_stmt = gimple_build_assign (vec_dest, code1, vop0); ++ new_temp = make_ssa_name (vec_dest, new_stmt); ++ gimple_assign_set_lhs (new_stmt, new_temp); ++ new_stmt_info ++ = vect_finish_stmt_generation (stmt_info, new_stmt, gsi); + + if (slp_node) + SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info); +@@ -5193,8 +5163,7 @@ vectorizable_conversion (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + vect_create_vectorized_promotion_stmts (&vec_oprnds0, + &vec_oprnds1, stmt_info, + this_dest, gsi, +- c1, c2, decl1, decl2, +- op_type); ++ c1, c2, op_type); + } + + FOR_EACH_VEC_ELT (vec_oprnds0, i, vop0) +@@ -5202,25 +5171,12 @@ vectorizable_conversion (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + stmt_vec_info new_stmt_info; + if (cvt_type) + { +- if (codecvt1 == CALL_EXPR) +- { +- gcall *new_stmt = gimple_build_call (decl1, 1, vop0); +- new_temp = make_ssa_name (vec_dest, new_stmt); +- gimple_call_set_lhs (new_stmt, new_temp); +- new_stmt_info +- = vect_finish_stmt_generation (stmt_info, new_stmt, +- gsi); +- } +- else +- { +- gcc_assert (TREE_CODE_LENGTH (codecvt1) == unary_op); +- new_temp = make_ssa_name (vec_dest); +- gassign *new_stmt +- = gimple_build_assign (new_temp, codecvt1, vop0); +- new_stmt_info +- = vect_finish_stmt_generation (stmt_info, new_stmt, +- gsi); +- } ++ gcc_assert (TREE_CODE_LENGTH (codecvt1) == unary_op); ++ new_temp = make_ssa_name (vec_dest); ++ gassign *new_stmt ++ = gimple_build_assign (new_temp, codecvt1, vop0); ++ new_stmt_info ++ = vect_finish_stmt_generation (stmt_info, new_stmt, gsi); + } + else + new_stmt_info = vinfo->lookup_def (vop0); +@@ -5263,22 +5219,11 @@ vectorizable_conversion (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + if (cvt_type) + FOR_EACH_VEC_ELT (vec_oprnds0, i, vop0) + { +- if (codecvt1 == CALL_EXPR) +- { +- gcall *new_stmt = gimple_build_call (decl1, 1, vop0); +- new_temp = make_ssa_name (vec_dest, new_stmt); +- gimple_call_set_lhs (new_stmt, new_temp); +- vect_finish_stmt_generation (stmt_info, new_stmt, gsi); +- } +- else +- { +- gcc_assert (TREE_CODE_LENGTH (codecvt1) == unary_op); +- new_temp = make_ssa_name (vec_dest); +- gassign *new_stmt +- = gimple_build_assign (new_temp, codecvt1, vop0); +- vect_finish_stmt_generation (stmt_info, new_stmt, gsi); +- } +- ++ gcc_assert (TREE_CODE_LENGTH (codecvt1) == unary_op); ++ new_temp = make_ssa_name (vec_dest); ++ gassign *new_stmt ++ = gimple_build_assign (new_temp, codecvt1, vop0); ++ vect_finish_stmt_generation (stmt_info, new_stmt, gsi); + vec_oprnds0[i] = new_temp; + } + +@@ -8774,8 +8719,7 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, + new_stmt = gimple_build_assign (vec_dest, data_ref); + new_temp = make_ssa_name (vec_dest, new_stmt); + gimple_assign_set_lhs (new_stmt, new_temp); +- gimple_set_vdef (new_stmt, gimple_vdef (stmt_info->stmt)); +- gimple_set_vuse (new_stmt, gimple_vuse 
(stmt_info->stmt)); ++ gimple_move_vops (new_stmt, stmt_info->stmt); + vect_finish_stmt_generation (stmt_info, new_stmt, gsi); + msq = new_temp; + +diff --git a/gcc/tree-vector-builder.c b/gcc/tree-vector-builder.c +index f31dc13b4..d02fb950c 100644 +--- a/gcc/tree-vector-builder.c ++++ b/gcc/tree-vector-builder.c +@@ -24,103 +24,6 @@ along with GCC; see the file COPYING3. If not see + #include "fold-const.h" + #include "tree-vector-builder.h" + +-/* Try to start building a new vector of type TYPE that holds the result of +- a unary operation on VECTOR_CST T. ALLOW_STEPPED_P is true if the +- operation can handle stepped encodings directly, without having to +- expand the full sequence. +- +- Return true if the operation is possible, which it always is when +- ALLOW_STEPPED_P is true. Leave the builder unchanged otherwise. */ +- +-bool +-tree_vector_builder::new_unary_operation (tree type, tree t, +- bool allow_stepped_p) +-{ +- poly_uint64 full_nelts = TYPE_VECTOR_SUBPARTS (type); +- gcc_assert (known_eq (full_nelts, TYPE_VECTOR_SUBPARTS (TREE_TYPE (t)))); +- unsigned int npatterns = VECTOR_CST_NPATTERNS (t); +- unsigned int nelts_per_pattern = VECTOR_CST_NELTS_PER_PATTERN (t); +- if (!allow_stepped_p && nelts_per_pattern > 2) +- { +- if (!full_nelts.is_constant ()) +- return false; +- npatterns = full_nelts.to_constant (); +- nelts_per_pattern = 1; +- } +- new_vector (type, npatterns, nelts_per_pattern); +- return true; +-} +- +-/* Try to start building a new vector of type TYPE that holds the result of +- a binary operation on VECTOR_CSTs T1 and T2. ALLOW_STEPPED_P is true if +- the operation can handle stepped encodings directly, without having to +- expand the full sequence. +- +- Return true if the operation is possible. Leave the builder unchanged +- otherwise. */ +- +-bool +-tree_vector_builder::new_binary_operation (tree type, tree t1, tree t2, +- bool allow_stepped_p) +-{ +- poly_uint64 full_nelts = TYPE_VECTOR_SUBPARTS (type); +- gcc_assert (known_eq (full_nelts, TYPE_VECTOR_SUBPARTS (TREE_TYPE (t1))) +- && known_eq (full_nelts, TYPE_VECTOR_SUBPARTS (TREE_TYPE (t2)))); +- /* Conceptually we split the patterns in T1 and T2 until we have +- an equal number for both. Each split pattern requires the same +- number of elements per pattern as the original. E.g. splitting: +- +- { 1, 2, 3, ... } +- +- into two gives: +- +- { 1, 3, 5, ... } +- { 2, 4, 6, ... } +- +- while splitting: +- +- { 1, 0, ... } +- +- into two gives: +- +- { 1, 0, ... } +- { 0, 0, ... }. */ +- unsigned int npatterns = least_common_multiple (VECTOR_CST_NPATTERNS (t1), +- VECTOR_CST_NPATTERNS (t2)); +- unsigned int nelts_per_pattern = MAX (VECTOR_CST_NELTS_PER_PATTERN (t1), +- VECTOR_CST_NELTS_PER_PATTERN (t2)); +- if (!allow_stepped_p && nelts_per_pattern > 2) +- { +- if (!full_nelts.is_constant ()) +- return false; +- npatterns = full_nelts.to_constant (); +- nelts_per_pattern = 1; +- } +- new_vector (type, npatterns, nelts_per_pattern); +- return true; +-} +- +-/* Return the number of elements that the caller needs to operate on in +- order to handle a binary operation on VECTOR_CSTs T1 and T2. This static +- function is used instead of new_binary_operation if the result of the +- operation is not a VECTOR_CST. */ +- +-unsigned int +-tree_vector_builder::binary_encoded_nelts (tree t1, tree t2) +-{ +- poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (TREE_TYPE (t1)); +- gcc_assert (known_eq (nelts, TYPE_VECTOR_SUBPARTS (TREE_TYPE (t2)))); +- /* See new_binary_operation for details. 
*/ +- unsigned int npatterns = least_common_multiple (VECTOR_CST_NPATTERNS (t1), +- VECTOR_CST_NPATTERNS (t2)); +- unsigned int nelts_per_pattern = MAX (VECTOR_CST_NELTS_PER_PATTERN (t1), +- VECTOR_CST_NELTS_PER_PATTERN (t2)); +- unsigned HOST_WIDE_INT const_nelts; +- if (nelts.is_constant (&const_nelts)) +- return MIN (npatterns * nelts_per_pattern, const_nelts); +- return npatterns * nelts_per_pattern; +-} +- + /* Return a vector element with the value BASE + FACTOR * STEP. */ + + tree +diff --git a/gcc/tree-vector-builder.h b/gcc/tree-vector-builder.h +index 13af74ad8..add79e476 100644 +--- a/gcc/tree-vector-builder.h ++++ b/gcc/tree-vector-builder.h +@@ -24,10 +24,11 @@ along with GCC; see the file COPYING3. If not see + + /* This class is used to build VECTOR_CSTs from a sequence of elements. + See vector_builder for more details. */ +-class tree_vector_builder : public vector_builder ++class tree_vector_builder : public vector_builder + { +- typedef vector_builder parent; +- friend class vector_builder; ++ typedef vector_builder parent; ++ friend class vector_builder; + + public: + tree_vector_builder () : m_type (0) {} +@@ -37,10 +38,6 @@ public: + tree type () const { return m_type; } + + void new_vector (tree, unsigned int, unsigned int); +- bool new_unary_operation (tree, tree, bool); +- bool new_binary_operation (tree, tree, tree, bool); +- +- static unsigned int binary_encoded_nelts (tree, tree); + + private: + bool equal_p (const_tree, const_tree) const; +@@ -51,6 +48,15 @@ private: + bool can_elide_p (const_tree) const; + void note_representative (tree *, tree); + ++ static poly_uint64 shape_nelts (const_tree t) ++ { return TYPE_VECTOR_SUBPARTS (t); } ++ static poly_uint64 nelts_of (const_tree t) ++ { return VECTOR_CST_NELTS (t); } ++ static unsigned int npatterns_of (const_tree t) ++ { return VECTOR_CST_NPATTERNS (t); } ++ static unsigned int nelts_per_pattern_of (const_tree t) ++ { return VECTOR_CST_NELTS_PER_PATTERN (t); } ++ + tree m_type; + }; + +diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c +index c2c6377d3..71ca80937 100644 +--- a/gcc/tree-vectorizer.c ++++ b/gcc/tree-vectorizer.c +@@ -288,10 +288,7 @@ adjust_simduid_builtins (hash_table *htab) + : BUILT_IN_GOMP_ORDERED_END); + gimple *g + = gimple_build_call (builtin_decl_explicit (bcode), 0); +- tree vdef = gimple_vdef (stmt); +- gimple_set_vdef (g, vdef); +- SSA_NAME_DEF_STMT (vdef) = g; +- gimple_set_vuse (g, gimple_vuse (stmt)); ++ gimple_move_vops (g, stmt); + gsi_replace (&i, g, true); + continue; + } +diff --git a/gcc/tree.c b/gcc/tree.c +index c4b8eea67..62607c63a 100644 +--- a/gcc/tree.c ++++ b/gcc/tree.c +@@ -1965,6 +1965,23 @@ build_index_vector (tree vec_type, poly_uint64 base, poly_uint64 step) + return v.build (); + } + ++/* Return a VECTOR_CST of type VEC_TYPE in which the first NUM_A ++ elements are A and the rest are B. */ ++ ++tree ++build_vector_a_then_b (tree vec_type, unsigned int num_a, tree a, tree b) ++{ ++ gcc_assert (known_le (num_a, TYPE_VECTOR_SUBPARTS (vec_type))); ++ unsigned int count = constant_lower_bound (TYPE_VECTOR_SUBPARTS (vec_type)); ++ /* Optimize the constant case. */ ++ if ((count & 1) == 0 && TYPE_VECTOR_SUBPARTS (vec_type).is_constant ()) ++ count /= 2; ++ tree_vector_builder builder (vec_type, count, 2); ++ for (unsigned int i = 0; i < count * 2; ++i) ++ builder.quick_push (i < num_a ? a : b); ++ return builder.build (); ++} ++ + /* Something has messed with the elements of CONSTRUCTOR C after it was built; + calculate TREE_CONSTANT and TREE_SIDE_EFFECTS. 
*/ + +diff --git a/gcc/tree.h b/gcc/tree.h +index 6f73593fa..356a9f544 100644 +--- a/gcc/tree.h ++++ b/gcc/tree.h +@@ -2475,10 +2475,10 @@ extern machine_mode vector_type_mode (const_tree); + (DECL_COMMON_CHECK (NODE)->decl_common.mode = (MODE)) + + /* For FUNCTION_DECL, if it is built-in, this identifies which built-in +- operation it is. Note, however, that this field is overloaded, with +- DECL_BUILT_IN_CLASS as the discriminant, so the latter must always be +- checked before any access to the former. */ +-#define DECL_FUNCTION_CODE(NODE) \ ++ operation it is. This is only intended for low-level accesses; ++ normally DECL_FUNCTION_CODE, DECL_FE_FUNCTION_CODE or DECL_MD_FUNCTION ++ should be used instead. */ ++#define DECL_UNCHECKED_FUNCTION_CODE(NODE) \ + (FUNCTION_DECL_CHECK (NODE)->function_decl.function_code) + + /* Test if FCODE is a function code for an alloca operation. */ +@@ -2955,11 +2955,34 @@ extern void decl_fini_priority_insert (tree, priority_type); + #define DECL_IS_MALLOC(NODE) \ + (FUNCTION_DECL_CHECK (NODE)->function_decl.malloc_flag) + ++/* Macro for direct set and get of function_decl.decl_type. */ ++#define FUNCTION_DECL_DECL_TYPE(NODE) \ ++ (NODE->function_decl.decl_type) ++ ++/* Set decl_type of a DECL. Set it to T when SET is true, or reset ++ it to NONE. */ ++ ++static inline void ++set_function_decl_type (tree decl, function_decl_type t, bool set) ++{ ++ if (set) ++ { ++ gcc_assert (FUNCTION_DECL_DECL_TYPE (decl) == NONE ++ || FUNCTION_DECL_DECL_TYPE (decl) == t); ++ decl->function_decl.decl_type = t; ++ } ++ else if (FUNCTION_DECL_DECL_TYPE (decl) == t) ++ FUNCTION_DECL_DECL_TYPE (decl) = NONE; ++} ++ + /* Nonzero in a FUNCTION_DECL means this function should be treated as + C++ operator new, meaning that it returns a pointer for which we + should not use type based aliasing. */ +-#define DECL_IS_OPERATOR_NEW(NODE) \ +- (FUNCTION_DECL_CHECK (NODE)->function_decl.operator_new_flag) ++#define DECL_IS_OPERATOR_NEW_P(NODE) \ ++ (FUNCTION_DECL_CHECK (NODE)->function_decl.decl_type == OPERATOR_NEW) ++ ++#define DECL_SET_IS_OPERATOR_NEW(NODE, VAL) \ ++ set_function_decl_type (FUNCTION_DECL_CHECK (NODE), OPERATOR_NEW, VAL) + + /* Nonzero in a FUNCTION_DECL means this function may return more + than once. */ +@@ -3066,10 +3089,9 @@ extern vec **decl_debug_args_insert (tree); + #define DECL_STRUCT_FUNCTION(NODE) \ + (FUNCTION_DECL_CHECK (NODE)->function_decl.f) + +- + /* For a builtin function, identify which part of the compiler defined it. */ + #define DECL_BUILT_IN_CLASS(NODE) \ +- (FUNCTION_DECL_CHECK (NODE)->function_decl.built_in_class) ++ ((built_in_class) FUNCTION_DECL_CHECK (NODE)->function_decl.built_in_class) + + /* In FUNCTION_DECL, a chain of ..._DECL nodes. */ + #define DECL_ARGUMENTS(NODE) \ +@@ -3104,8 +3126,11 @@ extern vec **decl_debug_args_insert (tree); + (FUNCTION_DECL_CHECK (NODE)->decl_with_vis.cxx_destructor) + + /* In FUNCTION_DECL, this is set if this function is a lambda function. */ +-#define DECL_LAMBDA_FUNCTION(NODE) \ +- (FUNCTION_DECL_CHECK (NODE)->function_decl.lambda_function) ++#define DECL_LAMBDA_FUNCTION_P(NODE) \ ++ (FUNCTION_DECL_CHECK (NODE)->function_decl.decl_type == LAMBDA_FUNCTION) ++ ++#define DECL_SET_LAMBDA_FUNCTION(NODE, VAL) \ ++ set_function_decl_type (FUNCTION_DECL_CHECK (NODE), LAMBDA_FUNCTION, VAL) + + /* In FUNCTION_DECL that represent an virtual method this is set when + the method is final. 
*/ +@@ -3788,6 +3813,61 @@ valid_vector_subparts_p (poly_uint64 subparts) + return true; + } + ++/* Return the built-in function that DECL represents, given that it is known ++ to be a FUNCTION_DECL with built-in class BUILT_IN_NORMAL. */ ++inline built_in_function ++DECL_FUNCTION_CODE (const_tree decl) ++{ ++ const tree_function_decl &fndecl = FUNCTION_DECL_CHECK (decl)->function_decl; ++ gcc_checking_assert (fndecl.built_in_class == BUILT_IN_NORMAL); ++ return (built_in_function) fndecl.function_code; ++} ++ ++/* Return the target-specific built-in function that DECL represents, ++ given that it is known to be a FUNCTION_DECL with built-in class ++ BUILT_IN_MD. */ ++inline int ++DECL_MD_FUNCTION_CODE (const_tree decl) ++{ ++ const tree_function_decl &fndecl = FUNCTION_DECL_CHECK (decl)->function_decl; ++ gcc_checking_assert (fndecl.built_in_class == BUILT_IN_MD); ++ return fndecl.function_code; ++} ++ ++/* Return the frontend-specific built-in function that DECL represents, ++ given that it is known to be a FUNCTION_DECL with built-in class ++ BUILT_IN_FRONTEND. */ ++inline int ++DECL_FE_FUNCTION_CODE (const_tree decl) ++{ ++ const tree_function_decl &fndecl = FUNCTION_DECL_CHECK (decl)->function_decl; ++ gcc_checking_assert (fndecl.built_in_class == BUILT_IN_FRONTEND); ++ return fndecl.function_code; ++} ++ ++/* Record that FUNCTION_DECL DECL represents built-in function FCODE of ++ class FCLASS. */ ++inline void ++set_decl_built_in_function (tree decl, built_in_class fclass, ++ unsigned int fcode) ++{ ++ tree_function_decl &fndecl = FUNCTION_DECL_CHECK (decl)->function_decl; ++ fndecl.built_in_class = fclass; ++ fndecl.function_code = fcode; ++} ++ ++/* Record that FUNCTION_DECL NEWDECL represents the same built-in function ++ as OLDDECL (or none, if OLDDECL doesn't represent a built-in function). */ ++inline void ++copy_decl_built_in_function (tree newdecl, const_tree olddecl) ++{ ++ tree_function_decl &newfndecl = FUNCTION_DECL_CHECK (newdecl)->function_decl; ++ const tree_function_decl &oldfndecl ++ = FUNCTION_DECL_CHECK (olddecl)->function_decl; ++ newfndecl.built_in_class = oldfndecl.built_in_class; ++ newfndecl.function_code = oldfndecl.function_code; ++} ++ + /* In NON_LVALUE_EXPR and VIEW_CONVERT_EXPR, set when this node is merely a + wrapper added to express a location_t on behalf of the node's child + (e.g. by maybe_wrap_with_location). */ +@@ -4212,6 +4292,7 @@ extern tree build_vector_from_val (tree, tree); + extern tree build_uniform_cst (tree, tree); + extern tree build_vec_series (tree, tree, tree); + extern tree build_index_vector (tree, poly_uint64, poly_uint64); ++extern tree build_vector_a_then_b (tree, unsigned int, tree, tree); + extern void recompute_constructor_flags (tree); + extern void verify_constructor_flags (tree); + extern tree build_constructor (tree, vec *); +@@ -5967,9 +6048,10 @@ fndecl_built_in_p (const_tree node, built_in_class klass) + of class KLASS with name equal to NAME. 
*/ + + inline bool +-fndecl_built_in_p (const_tree node, int name, built_in_class klass) ++fndecl_built_in_p (const_tree node, unsigned int name, built_in_class klass) + { +- return (fndecl_built_in_p (node, klass) && DECL_FUNCTION_CODE (node) == name); ++ return (fndecl_built_in_p (node, klass) ++ && DECL_UNCHECKED_FUNCTION_CODE (node) == name); + } + + /* Return true if a FUNCTION_DECL NODE is a GCC built-in function +diff --git a/gcc/var-tracking.c b/gcc/var-tracking.c +index 96e0c93a6..982ef13d1 100644 +--- a/gcc/var-tracking.c ++++ b/gcc/var-tracking.c +@@ -116,6 +116,7 @@ + #include "rtl-iter.h" + #include "fibonacci_heap.h" + #include "print-rtl.h" ++#include "function-abi.h" + + typedef fibonacci_heap bb_heap_t; + typedef fibonacci_node bb_heap_node_t; +@@ -1238,7 +1239,7 @@ adjust_insn (basic_block bb, rtx_insn *insn) + amd.stack_adjust = -VTI (bb)->out.stack_adjust; + + amd.store = true; +- note_stores (PATTERN (insn), adjust_mem_stores, &amd); ++ note_stores (insn, adjust_mem_stores, &amd); + + amd.store = false; + if (GET_CODE (PATTERN (insn)) == PARALLEL +@@ -4899,12 +4900,11 @@ dataflow_set_clear_at_call (dataflow_set *set, rtx_insn *call_insn) + { + unsigned int r; + hard_reg_set_iterator hrsi; +- HARD_REG_SET invalidated_regs; + +- get_call_reg_set_usage (call_insn, &invalidated_regs, +- regs_invalidated_by_call); ++ HARD_REG_SET callee_clobbers ++ = insn_callee_abi (call_insn).full_reg_clobbers (); + +- EXECUTE_IF_SET_IN_HARD_REG_SET (invalidated_regs, 0, r, hrsi) ++ EXECUTE_IF_SET_IN_HARD_REG_SET (callee_clobbers, 0, r, hrsi) + var_regno_delete (set, r); + + if (MAY_HAVE_DEBUG_BIND_INSNS) +@@ -6292,14 +6292,12 @@ prepare_call_arguments (basic_block bb, rtx_insn *insn) + && targetm.calls.struct_value_rtx (type, 0) == 0) + { + tree struct_addr = build_pointer_type (TREE_TYPE (type)); +- machine_mode mode = TYPE_MODE (struct_addr); ++ function_arg_info arg (struct_addr, /*named=*/true); + rtx reg; + INIT_CUMULATIVE_ARGS (args_so_far_v, type, NULL_RTX, fndecl, + nargs + 1); +- reg = targetm.calls.function_arg (args_so_far, mode, +- struct_addr, true); +- targetm.calls.function_arg_advance (args_so_far, mode, +- struct_addr, true); ++ reg = targetm.calls.function_arg (args_so_far, arg); ++ targetm.calls.function_arg_advance (args_so_far, arg); + if (reg == NULL_RTX) + { + for (; link; link = XEXP (link, 1)) +@@ -6317,11 +6315,9 @@ prepare_call_arguments (basic_block bb, rtx_insn *insn) + nargs); + if (obj_type_ref && TYPE_ARG_TYPES (type) != void_list_node) + { +- machine_mode mode; + t = TYPE_ARG_TYPES (type); +- mode = TYPE_MODE (TREE_VALUE (t)); +- this_arg = targetm.calls.function_arg (args_so_far, mode, +- TREE_VALUE (t), true); ++ function_arg_info arg (TREE_VALUE (t), /*named=*/true); ++ this_arg = targetm.calls.function_arg (args_so_far, arg); + if (this_arg && !REG_P (this_arg)) + this_arg = NULL_RTX; + else if (this_arg == NULL_RTX) +@@ -6429,30 +6425,24 @@ prepare_call_arguments (basic_block bb, rtx_insn *insn) + } + if (t && t != void_list_node) + { +- tree argtype = TREE_VALUE (t); +- machine_mode mode = TYPE_MODE (argtype); + rtx reg; +- if (pass_by_reference (&args_so_far_v, mode, argtype, true)) +- { +- argtype = build_pointer_type (argtype); +- mode = TYPE_MODE (argtype); +- } +- reg = targetm.calls.function_arg (args_so_far, mode, +- argtype, true); +- if (TREE_CODE (argtype) == REFERENCE_TYPE +- && INTEGRAL_TYPE_P (TREE_TYPE (argtype)) ++ function_arg_info arg (TREE_VALUE (t), /*named=*/true); ++ apply_pass_by_reference_rules (&args_so_far_v, arg); ++ reg = 
targetm.calls.function_arg (args_so_far, arg); ++ if (TREE_CODE (arg.type) == REFERENCE_TYPE ++ && INTEGRAL_TYPE_P (TREE_TYPE (arg.type)) + && reg + && REG_P (reg) +- && GET_MODE (reg) == mode +- && (GET_MODE_CLASS (mode) == MODE_INT +- || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT) ++ && GET_MODE (reg) == arg.mode ++ && (GET_MODE_CLASS (arg.mode) == MODE_INT ++ || GET_MODE_CLASS (arg.mode) == MODE_PARTIAL_INT) + && REG_P (x) + && REGNO (x) == REGNO (reg) +- && GET_MODE (x) == mode ++ && GET_MODE (x) == arg.mode + && item) + { + machine_mode indmode +- = TYPE_MODE (TREE_TYPE (argtype)); ++ = TYPE_MODE (TREE_TYPE (arg.type)); + rtx mem = gen_rtx_MEM (indmode, x); + cselib_val *val = cselib_lookup (mem, indmode, 0, VOIDmode); + if (val && cselib_preserved_value_p (val)) +@@ -6492,8 +6482,7 @@ prepare_call_arguments (basic_block bb, rtx_insn *insn) + } + } + } +- targetm.calls.function_arg_advance (args_so_far, mode, +- argtype, true); ++ targetm.calls.function_arg_advance (args_so_far, arg); + t = TREE_CHAIN (t); + } + } +@@ -6642,7 +6631,7 @@ add_with_sets (rtx_insn *insn, struct cselib_set *sets, int n_sets) + insert notes before it without worrying about any + notes that MO_USEs might emit after the insn. */ + cui.store_p = true; +- note_stores (PATTERN (insn), add_stores, &cui); ++ note_stores (insn, add_stores, &cui); + n2 = VTI (bb)->mos.length () - 1; + mos = VTI (bb)->mos.address (); + +diff --git a/gcc/vector-builder.h b/gcc/vector-builder.h +index 9967daa6e..37911ac69 100644 +--- a/gcc/vector-builder.h ++++ b/gcc/vector-builder.h +@@ -45,8 +45,11 @@ along with GCC; see the file COPYING3. If not see + variable-length vectors. finalize () then canonicalizes the encoding + to a simpler form if possible. + +- The derived class Derived provides this functionality for specific Ts. +- Derived needs to provide the following interface: ++ Shape is the type that specifies the number of elements in the vector ++ and (where relevant) the type of each element. ++ ++ The derived class Derived provides the functionality of this class ++ for specific Ts. Derived needs to provide the following interface: + + bool equal_p (T elt1, T elt2) const; + +@@ -82,9 +85,30 @@ along with GCC; see the file COPYING3. If not see + + Record that ELT2 is being elided, given that ELT1_PTR points to + the last encoded element for the containing pattern. This is +- again provided for TREE_OVERFLOW handling. */ ++ again provided for TREE_OVERFLOW handling. ++ ++ static poly_uint64 shape_nelts (Shape shape); ++ ++ Return the number of elements in SHAPE. ++ ++ The class provides additional functionality for the case in which ++ T can describe a vector constant as well as an individual element. ++ This functionality requires: ++ ++ static poly_uint64 nelts_of (T x); ++ ++ Return the number of elements in vector constant X. ++ ++ static unsigned int npatterns_of (T x); + +-template ++ Return the number of patterns used to encode vector constant X. ++ ++ static unsigned int nelts_per_pattern_of (T x); ++ ++ Return the number of elements used to encode each pattern ++ in vector constant X. 
*/ ++ ++template + class vector_builder : public auto_vec + { + public: +@@ -96,12 +120,18 @@ public: + unsigned int encoded_nelts () const; + bool encoded_full_vector_p () const; + T elt (unsigned int) const; ++ unsigned int count_dups (int, int, int) const; + + bool operator == (const Derived &) const; + bool operator != (const Derived &x) const { return !operator == (x); } + ++ bool new_unary_operation (Shape, T, bool); ++ bool new_binary_operation (Shape, T, T, bool); ++ + void finalize (); + ++ static unsigned int binary_encoded_nelts (T, T); ++ + protected: + void new_vector (poly_uint64, unsigned int, unsigned int); + void reshape (unsigned int, unsigned int); +@@ -120,16 +150,16 @@ private: + unsigned int m_nelts_per_pattern; + }; + +-template ++template + inline const Derived * +-vector_builder::derived () const ++vector_builder::derived () const + { + return static_cast (this); + } + +-template ++template + inline +-vector_builder::vector_builder () ++vector_builder::vector_builder () + : m_full_nelts (0), + m_npatterns (0), + m_nelts_per_pattern (0) +@@ -139,18 +169,18 @@ vector_builder::vector_builder () + starts with these explicitly-encoded elements and may contain additional + elided elements. */ + +-template ++template + inline unsigned int +-vector_builder::encoded_nelts () const ++vector_builder::encoded_nelts () const + { + return m_npatterns * m_nelts_per_pattern; + } + + /* Return true if every element of the vector is explicitly encoded. */ + +-template ++template + inline bool +-vector_builder::encoded_full_vector_p () const ++vector_builder::encoded_full_vector_p () const + { + return known_eq (m_npatterns * m_nelts_per_pattern, m_full_nelts); + } +@@ -158,11 +188,11 @@ vector_builder::encoded_full_vector_p () const + /* Start building a vector that has FULL_NELTS elements. Initially + encode it using NPATTERNS patterns with NELTS_PER_PATTERN each. */ + +-template ++template + void +-vector_builder::new_vector (poly_uint64 full_nelts, +- unsigned int npatterns, +- unsigned int nelts_per_pattern) ++vector_builder::new_vector (poly_uint64 full_nelts, ++ unsigned int npatterns, ++ unsigned int nelts_per_pattern) + { + m_full_nelts = full_nelts; + m_npatterns = npatterns; +@@ -174,9 +204,9 @@ vector_builder::new_vector (poly_uint64 full_nelts, + /* Return true if this vector and OTHER have the same elements and + are encoded in the same way. */ + +-template ++template + bool +-vector_builder::operator == (const Derived &other) const ++vector_builder::operator == (const Derived &other) const + { + if (maybe_ne (m_full_nelts, other.m_full_nelts) + || m_npatterns != other.m_npatterns +@@ -194,18 +224,19 @@ vector_builder::operator == (const Derived &other) const + /* Return the value of vector element I, which might or might not be + encoded explicitly. */ + +-template ++template + T +-vector_builder::elt (unsigned int i) const ++vector_builder::elt (unsigned int i) const + { +- /* This only makes sense if the encoding has been fully populated. */ +- gcc_checking_assert (encoded_nelts () <= this->length ()); +- + /* First handle elements that are already present in the underlying + vector, regardless of whether they're part of the encoding or not. */ + if (i < this->length ()) + return (*this)[i]; + ++ /* Extrapolation is only possible if the encoding has been fully ++ populated. */ ++ gcc_checking_assert (encoded_nelts () <= this->length ()); ++ + /* Identify the pattern that contains element I and work out the index of + the last encoded element for that pattern. 
*/ + unsigned int pattern = i % m_npatterns; +@@ -223,13 +254,136 @@ vector_builder::elt (unsigned int i) const + derived ()->step (prev, final)); + } + ++/* Try to start building a new vector of shape SHAPE that holds the result of ++ a unary operation on vector constant VEC. ALLOW_STEPPED_P is true if the ++ operation can handle stepped encodings directly, without having to expand ++ the full sequence. ++ ++ Return true if the operation is possible, which it always is when ++ ALLOW_STEPPED_P is true. Leave the builder unchanged otherwise. */ ++ ++template ++bool ++vector_builder::new_unary_operation (Shape shape, T vec, ++ bool allow_stepped_p) ++{ ++ poly_uint64 full_nelts = Derived::shape_nelts (shape); ++ gcc_assert (known_eq (full_nelts, Derived::nelts_of (vec))); ++ unsigned int npatterns = Derived::npatterns_of (vec); ++ unsigned int nelts_per_pattern = Derived::nelts_per_pattern_of (vec); ++ if (!allow_stepped_p && nelts_per_pattern > 2) ++ { ++ if (!full_nelts.is_constant ()) ++ return false; ++ npatterns = full_nelts.to_constant (); ++ nelts_per_pattern = 1; ++ } ++ derived ()->new_vector (shape, npatterns, nelts_per_pattern); ++ return true; ++} ++ ++/* Try to start building a new vector of shape SHAPE that holds the result of ++ a binary operation on vector constants VEC1 and VEC2. ALLOW_STEPPED_P is ++ true if the operation can handle stepped encodings directly, without ++ having to expand the full sequence. ++ ++ Return true if the operation is possible. Leave the builder unchanged ++ otherwise. */ ++ ++template ++bool ++vector_builder::new_binary_operation (Shape shape, ++ T vec1, T vec2, ++ bool allow_stepped_p) ++{ ++ poly_uint64 full_nelts = Derived::shape_nelts (shape); ++ gcc_assert (known_eq (full_nelts, Derived::nelts_of (vec1)) ++ && known_eq (full_nelts, Derived::nelts_of (vec2))); ++ /* Conceptually we split the patterns in VEC1 and VEC2 until we have ++ an equal number for both. Each split pattern requires the same ++ number of elements per pattern as the original. E.g. splitting: ++ ++ { 1, 2, 3, ... } ++ ++ into two gives: ++ ++ { 1, 3, 5, ... } ++ { 2, 4, 6, ... } ++ ++ while splitting: ++ ++ { 1, 0, ... } ++ ++ into two gives: ++ ++ { 1, 0, ... } ++ { 0, 0, ... }. */ ++ unsigned int npatterns ++ = least_common_multiple (Derived::npatterns_of (vec1), ++ Derived::npatterns_of (vec2)); ++ unsigned int nelts_per_pattern ++ = MAX (Derived::nelts_per_pattern_of (vec1), ++ Derived::nelts_per_pattern_of (vec2)); ++ if (!allow_stepped_p && nelts_per_pattern > 2) ++ { ++ if (!full_nelts.is_constant ()) ++ return false; ++ npatterns = full_nelts.to_constant (); ++ nelts_per_pattern = 1; ++ } ++ derived ()->new_vector (shape, npatterns, nelts_per_pattern); ++ return true; ++} ++ ++/* Return the number of elements that the caller needs to operate on in ++ order to handle a binary operation on vector constants VEC1 and VEC2. ++ This static function is used instead of new_binary_operation if the ++ result of the operation is not a constant vector. */ ++ ++template ++unsigned int ++vector_builder::binary_encoded_nelts (T vec1, T vec2) ++{ ++ poly_uint64 nelts = Derived::nelts_of (vec1); ++ gcc_assert (known_eq (nelts, Derived::nelts_of (vec2))); ++ /* See new_binary_operation for details. 
*/ ++ unsigned int npatterns ++ = least_common_multiple (Derived::npatterns_of (vec1), ++ Derived::npatterns_of (vec2)); ++ unsigned int nelts_per_pattern ++ = MAX (Derived::nelts_per_pattern_of (vec1), ++ Derived::nelts_per_pattern_of (vec2)); ++ unsigned HOST_WIDE_INT const_nelts; ++ if (nelts.is_constant (&const_nelts)) ++ return MIN (npatterns * nelts_per_pattern, const_nelts); ++ return npatterns * nelts_per_pattern; ++} ++ ++/* Return the number of leading duplicate elements in the range ++ [START:END:STEP]. The value is always at least 1. */ ++ ++template ++unsigned int ++vector_builder::count_dups (int start, int end, ++ int step) const ++{ ++ gcc_assert ((end - start) % step == 0); ++ ++ unsigned int ndups = 1; ++ for (int i = start + step; ++ i != end && derived ()->equal_p (elt (i), elt (start)); ++ i += step) ++ ndups++; ++ return ndups; ++} ++ + /* Change the encoding to NPATTERNS patterns of NELTS_PER_PATTERN each, + but without changing the underlying vector. */ + +-template ++template + void +-vector_builder::reshape (unsigned int npatterns, +- unsigned int nelts_per_pattern) ++vector_builder::reshape (unsigned int npatterns, ++ unsigned int nelts_per_pattern) + { + unsigned int old_encoded_nelts = encoded_nelts (); + unsigned int new_encoded_nelts = npatterns * nelts_per_pattern; +@@ -249,11 +403,11 @@ vector_builder::reshape (unsigned int npatterns, + /* Return true if elements [START, END) contain a repeating sequence of + STEP elements. */ + +-template ++template + bool +-vector_builder::repeating_sequence_p (unsigned int start, +- unsigned int end, +- unsigned int step) ++vector_builder::repeating_sequence_p (unsigned int start, ++ unsigned int end, ++ unsigned int step) + { + for (unsigned int i = start; i < end - step; ++i) + if (!derived ()->equal_p ((*this)[i], (*this)[i + step])) +@@ -264,11 +418,11 @@ vector_builder::repeating_sequence_p (unsigned int start, + /* Return true if elements [START, END) contain STEP interleaved linear + series. */ + +-template ++template + bool +-vector_builder::stepped_sequence_p (unsigned int start, +- unsigned int end, +- unsigned int step) ++vector_builder::stepped_sequence_p (unsigned int start, ++ unsigned int end, ++ unsigned int step) + { + if (!derived ()->allow_steps_p ()) + return false; +@@ -297,9 +451,9 @@ vector_builder::stepped_sequence_p (unsigned int start, + /* Try to change the number of encoded patterns to NPATTERNS, returning + true on success. */ + +-template ++template + bool +-vector_builder::try_npatterns (unsigned int npatterns) ++vector_builder::try_npatterns (unsigned int npatterns) + { + if (m_nelts_per_pattern == 1) + { +@@ -350,9 +504,9 @@ vector_builder::try_npatterns (unsigned int npatterns) + + /* Replace the current encoding with the canonical form. */ + +-template ++template + void +-vector_builder::finalize () ++vector_builder::finalize () + { + /* The encoding requires the same number of elements to come from each + pattern. 
*/ +diff --git a/libgcc/config.host b/libgcc/config.host +index 0f15fda36..9500ec2ee 100644 +--- a/libgcc/config.host ++++ b/libgcc/config.host +@@ -356,6 +356,12 @@ aarch64*-*-freebsd*) + tmake_file="${tmake_file} ${cpu_type}/t-softfp t-softfp t-crtfm" + md_unwind_header=aarch64/freebsd-unwind.h + ;; ++aarch64*-*-netbsd*) ++ extra_parts="$extra_parts crtfastmath.o" ++ tmake_file="${tmake_file} ${cpu_type}/t-aarch64" ++ tmake_file="${tmake_file} ${cpu_type}/t-softfp t-softfp t-crtfm" ++ md_unwind_header=aarch64/aarch64-unwind.h ++ ;; + aarch64*-*-fuchsia*) + tmake_file="${tmake_file} ${cpu_type}/t-aarch64" + tmake_file="${tmake_file} ${cpu_type}/t-softfp t-softfp" +diff --git a/libgcc/config/aarch64/aarch64-unwind.h b/libgcc/config/aarch64/aarch64-unwind.h +index 223ac9157..13e6e4a6a 100644 +--- a/libgcc/config/aarch64/aarch64-unwind.h ++++ b/libgcc/config/aarch64/aarch64-unwind.h +@@ -35,6 +35,23 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + #define MD_FROB_UPDATE_CONTEXT(context, fs) \ + aarch64_frob_update_context (context, fs) + ++static inline int ++aarch64_cie_signed_with_b_key (struct _Unwind_Context *context) ++{ ++ const struct dwarf_fde *fde = _Unwind_Find_FDE (context->bases.func, ++ &context->bases); ++ if (fde != NULL) ++ { ++ const struct dwarf_cie *cie = get_cie (fde); ++ if (cie != NULL) ++ { ++ char *aug_str = cie->augmentation; ++ return strchr (aug_str, 'B') == NULL ? 0 : 1; ++ } ++ } ++ return 0; ++} ++ + /* Do AArch64 private extraction on ADDR based on context info CONTEXT and + unwind frame info FS. If ADDR is signed, we do address authentication on it + using CFA of current frame. */ +@@ -43,9 +60,11 @@ static inline void * + aarch64_post_extract_frame_addr (struct _Unwind_Context *context, + _Unwind_FrameState *fs, void *addr) + { +- if (fs->regs.reg[DWARF_REGNUM_AARCH64_RA_STATE].loc.offset & 0x1) ++ if (context->flags & RA_SIGNED_BIT) + { + _Unwind_Word salt = (_Unwind_Word) context->cfa; ++ if (aarch64_cie_signed_with_b_key (context) != 0) ++ return __builtin_aarch64_autib1716 (addr, salt); + return __builtin_aarch64_autia1716 (addr, salt); + } + else +@@ -62,9 +81,14 @@ aarch64_post_frob_eh_handler_addr (struct _Unwind_Context *current, + ATTRIBUTE_UNUSED, + void *handler_addr) + { +- if (current->flags & RA_A_SIGNED_BIT) +- return __builtin_aarch64_pacia1716 (handler_addr, ++ if (current->flags & RA_SIGNED_BIT) ++ { ++ if (aarch64_cie_signed_with_b_key (current)) ++ return __builtin_aarch64_pacib1716 (handler_addr, ++ (_Unwind_Word) current->cfa); ++ return __builtin_aarch64_pacia1716 (handler_addr, + (_Unwind_Word) current->cfa); ++ } + else + return handler_addr; + } +@@ -79,7 +103,7 @@ aarch64_frob_update_context (struct _Unwind_Context *context, + { + if (fs->regs.reg[DWARF_REGNUM_AARCH64_RA_STATE].loc.offset & 0x1) + /* The flag is used for re-authenticating EH handler's address. */ +- context->flags |= RA_A_SIGNED_BIT; ++ context->flags |= RA_SIGNED_BIT; + + return; + } +diff --git a/libgcc/unwind-dw2-fde.c b/libgcc/unwind-dw2-fde.c +index 24b4ecee6..40ebf85a9 100644 +--- a/libgcc/unwind-dw2-fde.c ++++ b/libgcc/unwind-dw2-fde.c +@@ -334,6 +334,9 @@ get_cie_encoding (const struct dwarf_cie *cie) + /* LSDA encoding. */ + else if (*aug == 'L') + p++; ++ /* aarch64 b-key pointer authentication. */ ++ else if (*aug == 'B') ++ p++; + /* Otherwise end of string, or unknown augmentation. 
*/ + else + return DW_EH_PE_absptr; +diff --git a/libgcc/unwind-dw2.c b/libgcc/unwind-dw2.c +index e6130af2f..e76a1cbc4 100644 +--- a/libgcc/unwind-dw2.c ++++ b/libgcc/unwind-dw2.c +@@ -136,8 +136,9 @@ struct _Unwind_Context + #define SIGNAL_FRAME_BIT ((~(_Unwind_Word) 0 >> 1) + 1) + /* Context which has version/args_size/by_value fields. */ + #define EXTENDED_CONTEXT_BIT ((~(_Unwind_Word) 0 >> 2) + 1) +- /* Bit reserved on AArch64, return address has been signed with A key. */ +-#define RA_A_SIGNED_BIT ((~(_Unwind_Word) 0 >> 3) + 1) ++ /* Bit reserved on AArch64, return address has been signed with A or B ++ key. */ ++#define RA_SIGNED_BIT ((~(_Unwind_Word) 0 >> 3) + 1) + _Unwind_Word flags; + /* 0 for now, can be increased when further fields are added to + struct _Unwind_Context. */ +@@ -502,6 +503,11 @@ extract_cie_info (const struct dwarf_cie *cie, struct _Unwind_Context *context, + fs->signal_frame = 1; + aug += 1; + } ++ /* aarch64 B-key pointer authentication. */ ++ else if (aug[0] == 'B') ++ { ++ aug += 1; ++ } + + /* Otherwise we have an unknown augmentation string. + Bail unless we saw a 'z' prefix. */ diff --git a/change-gcc-BASE-VER.patch b/change-gcc-BASE-VER.patch index 79dd167..95e8324 100644 --- a/change-gcc-BASE-VER.patch +++ b/change-gcc-BASE-VER.patch @@ -1,3 +1,9 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + +0001-Bump-BASE-VER-to-9.3.1.patch +9f26e5863a75744bbee1479792ecae084a3ceb20 + diff -Nurp a/gcc/BASE-VER b/gcc/BASE-VER --- a/gcc/BASE-VER 2020-08-19 10:47:14.100000000 +0800 +++ b/gcc/BASE-VER 2020-08-19 10:32:30.380000000 +0800 diff --git a/dont-generate-IF_THEN_ELSE.patch b/dont-generate-IF_THEN_ELSE.patch index 791b57b..16f28a4 100644 --- a/dont-generate-IF_THEN_ELSE.patch +++ b/dont-generate-IF_THEN_ELSE.patch @@ -1,3 +1,9 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + +0001-combine-Don-t-generate-IF_THEN_ELSE.patch +ddbb5da5199fb421dc398911c37fa7f896efc13f + diff --git a/gcc/combine.c b/gcc/combine.c index 4de759a8e6b..ce7aeecb5c2 100644 --- a/gcc/combine.c diff --git a/fix-ICE-IPA-compare-VRP-types.patch b/fix-ICE-IPA-compare-VRP-types.patch new file mode 100644 index 0000000..3f1b316 --- /dev/null +++ b/fix-ICE-IPA-compare-VRP-types.patch @@ -0,0 +1,51 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. 
+ +0001-IPA-compare-VRP-types.patch +a86623902767122c71c7229150a8b8a79cbb3673 + +diff -Nurp a/gcc/ipa-prop.c b/gcc/ipa-prop.c +--- a/gcc/ipa-prop.c 2020-11-28 00:19:34.340000000 +0800 ++++ b/gcc/ipa-prop.c 2020-11-28 00:21:24.680000000 +0800 +@@ -122,7 +122,8 @@ struct ipa_vr_ggc_hash_traits : public g + static bool + equal (const value_range_base *a, const value_range_base *b) + { +- return a->equal_p (*b); ++ return (a->equal_p (*b) ++ && types_compatible_p (a->type (), b->type ())); + } + static void + mark_empty (value_range_base *&p) +diff -Nurp a/gcc/testsuite/gcc.c-torture/execute/pr97404.c b/gcc/testsuite/gcc.c-torture/execute/pr97404.c +--- a/gcc/testsuite/gcc.c-torture/execute/pr97404.c 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/gcc.c-torture/execute/pr97404.c 2020-11-28 00:21:24.680000000 +0800 +@@ -0,0 +1,28 @@ ++/* PR ipa/97404 */ ++/* { dg-additional-options "-fno-inline" } */ ++ ++char a, b; ++long c; ++short d, e; ++long *f = &c; ++int g; ++char h(signed char i) { return 0; } ++static short j(short i, int k) { return i < 0 ? 0 : i >> k; } ++void l(void); ++void m(void) ++{ ++ e = j(d | 9766, 11); ++ *f = e; ++} ++void l(void) ++{ ++ a = 5 | g; ++ b = h(a); ++} ++int main() ++{ ++ m(); ++ if (c != 4) ++ __builtin_abort(); ++ return 0; ++} diff --git a/fix-ICE-in-affine-combination.patch b/fix-ICE-in-affine-combination.patch new file mode 100644 index 0000000..e582681 --- /dev/null +++ b/fix-ICE-in-affine-combination.patch @@ -0,0 +1,396 @@ +This backport contains 2 patchs from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + +0001-tree-affine.c-expr_to_aff_combination-New-function-s.patch +5120e0d8d48f4590a275e60565de6c5a4e772fc1 + +0001-PR-tree-optimization-94574-aarch64-ICE-during-GIMPLE.patch +0447929f11e6a3e1b076841712b90a8b6bc7d33a + +diff -Nurp a/gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c b/gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c +--- a/gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c 2020-12-08 14:54:11.467633230 +0800 +@@ -0,0 +1,8 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O3 -funroll-loops -fdump-tree-lim2-details" } */ ++ ++#define TYPE unsigned int ++ ++#include "pr83403.h" ++ ++/* { dg-final { scan-tree-dump-times "Executing store motion of" 10 "lim2" } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c b/gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c +--- a/gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c 2020-12-08 14:54:11.467633230 +0800 +@@ -0,0 +1,8 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O3 -funroll-loops -fdump-tree-lim2-details" } */ ++ ++#define TYPE int ++ ++#include "pr83403.h" ++ ++/* { dg-final { scan-tree-dump-times "Executing store motion of" 10 "lim2" } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/tree-ssa/pr83403.h b/gcc/testsuite/gcc.dg/tree-ssa/pr83403.h +--- a/gcc/testsuite/gcc.dg/tree-ssa/pr83403.h 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/gcc.dg/tree-ssa/pr83403.h 2020-12-08 14:54:11.467633230 +0800 +@@ -0,0 +1,30 @@ ++__attribute__ ((noinline)) void ++calculate (const double *__restrict__ A, const double *__restrict__ B, ++ double *__restrict__ C) ++{ ++ TYPE m = 0; ++ TYPE n = 0; ++ TYPE k = 0; ++ ++ A = (const double *) __builtin_assume_aligned (A, 16); ++ B = (const double *) __builtin_assume_aligned (B, 16); ++ C = (double *) __builtin_assume_aligned (C, 16); ++ ++ for (n = 0; n < 9; n++) ++ 
{ ++ for (m = 0; m < 10; m++) ++ { ++ C[(n * 10) + m] = 0.0; ++ } ++ ++ for (k = 0; k < 17; k++) ++ { ++#pragma simd ++ for (m = 0; m < 10; m++) ++ { ++ C[(n * 10) + m] += A[(k * 20) + m] * B[(n * 20) + k]; ++ } ++ } ++ } ++} ++ +diff -Nurp a/gcc/tree-affine.c b/gcc/tree-affine.c +--- a/gcc/tree-affine.c 2020-12-09 09:01:13.179633230 +0800 ++++ b/gcc/tree-affine.c 2020-12-08 14:54:11.467633230 +0800 +@@ -259,104 +259,66 @@ aff_combination_convert (aff_tree *comb, + } + } + +-/* Splits EXPR into an affine combination of parts. */ ++/* Tries to handle OP0 CODE OP1 as affine combination of parts. Returns ++ true when that was successful and returns the combination in COMB. */ + +-void +-tree_to_aff_combination (tree expr, tree type, aff_tree *comb) ++static bool ++expr_to_aff_combination (aff_tree *comb, tree_code code, tree type, ++ tree op0, tree op1 = NULL_TREE) + { + aff_tree tmp; +- enum tree_code code; +- tree cst, core, toffset; + poly_int64 bitpos, bitsize, bytepos; +- machine_mode mode; +- int unsignedp, reversep, volatilep; +- +- STRIP_NOPS (expr); + +- code = TREE_CODE (expr); + switch (code) + { + case POINTER_PLUS_EXPR: +- tree_to_aff_combination (TREE_OPERAND (expr, 0), type, comb); +- tree_to_aff_combination (TREE_OPERAND (expr, 1), sizetype, &tmp); ++ tree_to_aff_combination (op0, type, comb); ++ tree_to_aff_combination (op1, sizetype, &tmp); + aff_combination_add (comb, &tmp); +- return; ++ return true; + + case PLUS_EXPR: + case MINUS_EXPR: +- tree_to_aff_combination (TREE_OPERAND (expr, 0), type, comb); +- tree_to_aff_combination (TREE_OPERAND (expr, 1), type, &tmp); ++ tree_to_aff_combination (op0, type, comb); ++ tree_to_aff_combination (op1, type, &tmp); + if (code == MINUS_EXPR) + aff_combination_scale (&tmp, -1); + aff_combination_add (comb, &tmp); +- return; ++ return true; + + case MULT_EXPR: +- cst = TREE_OPERAND (expr, 1); +- if (TREE_CODE (cst) != INTEGER_CST) ++ if (TREE_CODE (op1) != INTEGER_CST) + break; +- tree_to_aff_combination (TREE_OPERAND (expr, 0), type, comb); +- aff_combination_scale (comb, wi::to_widest (cst)); +- return; ++ tree_to_aff_combination (op0, type, comb); ++ aff_combination_scale (comb, wi::to_widest (op1)); ++ return true; + + case NEGATE_EXPR: +- tree_to_aff_combination (TREE_OPERAND (expr, 0), type, comb); ++ tree_to_aff_combination (op0, type, comb); + aff_combination_scale (comb, -1); +- return; ++ return true; + + case BIT_NOT_EXPR: + /* ~x = -x - 1 */ +- tree_to_aff_combination (TREE_OPERAND (expr, 0), type, comb); ++ tree_to_aff_combination (op0, type, comb); + aff_combination_scale (comb, -1); + aff_combination_add_cst (comb, -1); +- return; +- +- case ADDR_EXPR: +- /* Handle &MEM[ptr + CST] which is equivalent to POINTER_PLUS_EXPR. 
*/ +- if (TREE_CODE (TREE_OPERAND (expr, 0)) == MEM_REF) +- { +- expr = TREE_OPERAND (expr, 0); +- tree_to_aff_combination (TREE_OPERAND (expr, 0), type, comb); +- tree_to_aff_combination (TREE_OPERAND (expr, 1), sizetype, &tmp); +- aff_combination_add (comb, &tmp); +- return; +- } +- core = get_inner_reference (TREE_OPERAND (expr, 0), &bitsize, &bitpos, +- &toffset, &mode, &unsignedp, &reversep, +- &volatilep); +- if (!multiple_p (bitpos, BITS_PER_UNIT, &bytepos)) +- break; +- aff_combination_const (comb, type, bytepos); +- if (TREE_CODE (core) == MEM_REF) +- { +- tree mem_offset = TREE_OPERAND (core, 1); +- aff_combination_add_cst (comb, wi::to_poly_widest (mem_offset)); +- core = TREE_OPERAND (core, 0); +- } +- else +- core = build_fold_addr_expr (core); +- +- if (TREE_CODE (core) == ADDR_EXPR) +- aff_combination_add_elt (comb, core, 1); +- else +- { +- tree_to_aff_combination (core, type, &tmp); +- aff_combination_add (comb, &tmp); +- } +- if (toffset) +- { +- tree_to_aff_combination (toffset, type, &tmp); +- aff_combination_add (comb, &tmp); +- } +- return; ++ return true; + + CASE_CONVERT: + { +- tree otype = TREE_TYPE (expr); +- tree inner = TREE_OPERAND (expr, 0); ++ tree otype = type; ++ tree inner = op0; + tree itype = TREE_TYPE (inner); + enum tree_code icode = TREE_CODE (inner); + ++ /* STRIP_NOPS */ ++ if (tree_nop_conversion_p (otype, itype)) ++ { ++ tree_to_aff_combination (op0, type, comb); ++ return true; ++ } ++ + /* In principle this is a valid folding, but it isn't necessarily + an optimization, so do it here and not in fold_unary. */ + if ((icode == PLUS_EXPR || icode == MINUS_EXPR || icode == MULT_EXPR) +@@ -376,38 +338,127 @@ tree_to_aff_combination (tree expr, tree + { + op0 = fold_convert (otype, op0); + op1 = fold_convert (otype, op1); +- expr = fold_build2 (icode, otype, op0, op1); +- tree_to_aff_combination (expr, type, comb); +- return; ++ return expr_to_aff_combination (comb, icode, otype, op0, op1); + } + wide_int minv, maxv; + /* If inner type has wrapping overflow behavior, fold conversion + for below case: +- (T1)(X - CST) -> (T1)X - (T1)CST +- if X - CST doesn't overflow by range information. Also handle +- (T1)(X + CST) as (T1)(X - (-CST)). */ ++ (T1)(X *+- CST) -> (T1)X *+- (T1)CST ++ if X *+- CST doesn't overflow by range information. */ + if (TYPE_UNSIGNED (itype) + && TYPE_OVERFLOW_WRAPS (itype) +- && TREE_CODE (op0) == SSA_NAME + && TREE_CODE (op1) == INTEGER_CST +- && icode != MULT_EXPR +- && get_range_info (op0, &minv, &maxv) == VR_RANGE) ++ && determine_value_range (op0, &minv, &maxv) == VR_RANGE) + { ++ wi::overflow_type overflow = wi::OVF_NONE; ++ signop sign = UNSIGNED; + if (icode == PLUS_EXPR) +- op1 = wide_int_to_tree (itype, -wi::to_wide (op1)); +- if (wi::geu_p (minv, wi::to_wide (op1))) ++ wi::add (maxv, wi::to_wide (op1), sign, &overflow); ++ else if (icode == MULT_EXPR) ++ wi::mul (maxv, wi::to_wide (op1), sign, &overflow); ++ else ++ wi::sub (minv, wi::to_wide (op1), sign, &overflow); ++ ++ if (overflow == wi::OVF_NONE) + { + op0 = fold_convert (otype, op0); + op1 = fold_convert (otype, op1); +- expr = fold_build2 (MINUS_EXPR, otype, op0, op1); +- tree_to_aff_combination (expr, type, comb); +- return; ++ return expr_to_aff_combination (comb, icode, otype, op0, ++ op1); + } + } + } + } + break; + ++ default:; ++ } ++ ++ return false; ++} ++ ++/* Splits EXPR into an affine combination of parts. 
*/ ++ ++void ++tree_to_aff_combination (tree expr, tree type, aff_tree *comb) ++{ ++ aff_tree tmp; ++ enum tree_code code; ++ tree core, toffset; ++ poly_int64 bitpos, bitsize, bytepos; ++ machine_mode mode; ++ int unsignedp, reversep, volatilep; ++ ++ STRIP_NOPS (expr); ++ ++ code = TREE_CODE (expr); ++ switch (code) ++ { ++ case POINTER_PLUS_EXPR: ++ case PLUS_EXPR: ++ case MINUS_EXPR: ++ case MULT_EXPR: ++ if (expr_to_aff_combination (comb, code, type, TREE_OPERAND (expr, 0), ++ TREE_OPERAND (expr, 1))) ++ return; ++ break; ++ ++ case NEGATE_EXPR: ++ case BIT_NOT_EXPR: ++ if (expr_to_aff_combination (comb, code, type, TREE_OPERAND (expr, 0))) ++ return; ++ break; ++ ++ CASE_CONVERT: ++ /* ??? TREE_TYPE (expr) should be equal to type here, but IVOPTS ++ calls this with not showing an outer widening cast. */ ++ if (expr_to_aff_combination (comb, code, ++ TREE_TYPE (expr), TREE_OPERAND (expr, 0))) ++ { ++ aff_combination_convert (comb, type); ++ return; ++ } ++ break; ++ ++ case ADDR_EXPR: ++ /* Handle &MEM[ptr + CST] which is equivalent to POINTER_PLUS_EXPR. */ ++ if (TREE_CODE (TREE_OPERAND (expr, 0)) == MEM_REF) ++ { ++ expr = TREE_OPERAND (expr, 0); ++ tree_to_aff_combination (TREE_OPERAND (expr, 0), type, comb); ++ tree_to_aff_combination (TREE_OPERAND (expr, 1), sizetype, &tmp); ++ aff_combination_add (comb, &tmp); ++ return; ++ } ++ core = get_inner_reference (TREE_OPERAND (expr, 0), &bitsize, &bitpos, ++ &toffset, &mode, &unsignedp, &reversep, ++ &volatilep); ++ if (!multiple_p (bitpos, BITS_PER_UNIT, &bytepos)) ++ break; ++ aff_combination_const (comb, type, bytepos); ++ if (TREE_CODE (core) == MEM_REF) ++ { ++ tree mem_offset = TREE_OPERAND (core, 1); ++ aff_combination_add_cst (comb, wi::to_poly_widest (mem_offset)); ++ core = TREE_OPERAND (core, 0); ++ } ++ else ++ core = build_fold_addr_expr (core); ++ ++ if (TREE_CODE (core) == ADDR_EXPR) ++ aff_combination_add_elt (comb, core, 1); ++ else ++ { ++ tree_to_aff_combination (core, type, &tmp); ++ aff_combination_add (comb, &tmp); ++ } ++ if (toffset) ++ { ++ tree_to_aff_combination (toffset, type, &tmp); ++ aff_combination_add (comb, &tmp); ++ } ++ return; ++ + default: + { + if (poly_int_tree_p (expr)) +@@ -665,7 +716,7 @@ aff_combination_expand (aff_tree *comb A + { + unsigned i; + aff_tree to_add, current, curre; +- tree e, rhs; ++ tree e; + gimple *def; + widest_int scale; + struct name_expansion *exp; +@@ -715,20 +766,38 @@ aff_combination_expand (aff_tree *comb A + case PLUS_EXPR: + case MINUS_EXPR: + case MULT_EXPR: ++ if (!expr_to_aff_combination (¤t, code, TREE_TYPE (name), ++ gimple_assign_rhs1 (def), ++ gimple_assign_rhs2 (def))) ++ continue; ++ break; + case NEGATE_EXPR: + case BIT_NOT_EXPR: ++ if (!expr_to_aff_combination (¤t, code, TREE_TYPE (name), ++ gimple_assign_rhs1 (def))) ++ continue; ++ break; + CASE_CONVERT: +- rhs = gimple_assign_rhs_to_tree (def); ++ if (!expr_to_aff_combination (¤t, code, TREE_TYPE (name), ++ gimple_assign_rhs1 (def))) ++ /* This makes us always expand conversions which we did ++ in the past and makes gcc.dg/tree-ssa/ivopts-lt-2.c ++ PASS, eliminating one induction variable in IVOPTs. ++ ??? But it is really excessive and we should try ++ harder to do without it. 
*/ ++ aff_combination_elt (¤t, TREE_TYPE (name), ++ fold_convert (TREE_TYPE (name), ++ gimple_assign_rhs1 (def))); + break; + case ADDR_EXPR: + case INTEGER_CST: + case POLY_INT_CST: +- rhs = gimple_assign_rhs1 (def); ++ tree_to_aff_combination (gimple_assign_rhs1 (def), ++ TREE_TYPE (name), ¤t); + break; + default: + continue; + } +- tree_to_aff_combination (rhs, TREE_TYPE (name), ¤t); + exp = XNEW (struct name_expansion); + exp->in_progress = 1; + if (!*cache) diff --git a/fix-ICE-in-compute_live_loop_exits.patch b/fix-ICE-in-compute_live_loop_exits.patch index 013ec83..5479487 100644 --- a/fix-ICE-in-compute_live_loop_exits.patch +++ b/fix-ICE-in-compute_live_loop_exits.patch @@ -1,3 +1,9 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + +0001-re-PR-tree-optimization-92085-ICE-tree-check-expecte.patch +3c8e341b996546607fa1f39a0fd9a9d7c2c38214 + diff -Nurp a/gcc/testsuite/gcc.dg/tree-ssa/pr92085-1.c b/gcc/testsuite/gcc.dg/tree-ssa/pr92085-1.c --- a/gcc/testsuite/gcc.dg/tree-ssa/pr92085-1.c 1970-01-01 08:00:00.000000000 +0800 +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr92085-1.c 2020-07-09 11:05:23.136000000 +0800 diff --git a/fix-ICE-in-eliminate_stmt.patch b/fix-ICE-in-eliminate_stmt.patch index 7cc61c5..983e193 100644 --- a/fix-ICE-in-eliminate_stmt.patch +++ b/fix-ICE-in-eliminate_stmt.patch @@ -1,12 +1,12 @@ -This backport contains 2 patchs from gcc main stream tree. -The commit id of these patchs list as following in the order of time. - -0001-Tweak-gcc.dg-vect-bb-slp-4-01-.c-PR92366.patch -3771033244b3ee1b53a8a00d734580b16384fdd3 - -0001-tree-vect-slp.c-vect_analyze_slp_instance-Dump-const.patch -140ee00a961fda084c1b4b3f0e7e489a917858f7 - +This backport contains 2 patchs from gcc main stream tree. +The commit id of these patchs list as following in the order of time. 
+ +0001-Tweak-gcc.dg-vect-bb-slp-4-01-.c-PR92366.patch +3771033244b3ee1b53a8a00d734580b16384fdd3 + +0001-tree-vect-slp.c-vect_analyze_slp_instance-Dump-const.patch +140ee00a961fda084c1b4b3f0e7e489a917858f7 + diff -Nurp a/gcc/testsuite/gcc.dg/vect/bb-slp-40.c b/gcc/testsuite/gcc.dg/vect/bb-slp-40.c --- a/gcc/testsuite/gcc.dg/vect/bb-slp-40.c 2020-09-14 21:24:20.899694710 +0800 +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-40.c 2020-09-15 20:54:05.456027442 +0800 diff --git a/fix-ICE-in-pass-vect.patch b/fix-ICE-in-pass-vect.patch new file mode 100644 index 0000000..38effd1 --- /dev/null +++ b/fix-ICE-in-pass-vect.patch @@ -0,0 +1,37 @@ +diff -uprN a/gcc/testsuite/gcc.target/aarch64/sve/slp_fix_1.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_fix_1.c +--- a/gcc/testsuite/gcc.target/aarch64/sve/slp_fix_1.c 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_fix_1.c 2020-11-17 02:38:45.284000000 +0800 +@@ -0,0 +1,19 @@ ++/* { dg-do compiler} */ ++/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=256 -funsafe-math-optimizations" } */ ++ ++long a, b; ++float c, e; ++float *d; ++void f() { ++ float g, h, i, j; ++ b = 0; ++ for (; b < a; b++) { ++ i = d[0]; ++ g = g + i * e; ++ j = d[1]; ++ h = h - j * e; ++ d = d + 2; ++ } ++ c = g; ++ e = h; ++} +diff -uprN a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c +--- a/gcc/tree-vect-slp.c 2020-11-16 10:59:36.000000000 +0800 ++++ b/gcc/tree-vect-slp.c 2020-11-16 23:30:19.560000000 +0800 +@@ -4140,8 +4140,8 @@ vect_schedule_slp_instance (slp_tree nod + gimple *vstmt; + vstmt = gimple_build_assign (make_ssa_name (vectype), + VEC_PERM_EXPR, +- gimple_assign_lhs (v0[j]->stmt), +- gimple_assign_lhs (v1[j]->stmt), ++ gimple_get_lhs (v0[j]->stmt), ++ gimple_get_lhs (v1[j]->stmt), + tmask); + SLP_TREE_VEC_STMTS (node).quick_push + (vect_finish_stmt_generation (stmt_info, vstmt, &si)); diff --git a/fix-ICE-in-vect_create_epilog_for_reduction.patch b/fix-ICE-in-vect_create_epilog_for_reduction.patch index fef451b..7abf6aa 100644 --- a/fix-ICE-in-vect_create_epilog_for_reduction.patch +++ b/fix-ICE-in-vect_create_epilog_for_reduction.patch @@ -1,3 +1,9 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + +0001-Don-t-assign-a-cost-to-vectorizable_assignment.patch +e4020b28d02a00d478a3a769855ae6a8d9cc6b26 + diff -Nurp a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c --- a/gcc/tree-vect-loop.c 2020-07-09 10:42:35.824000000 +0800 +++ b/gcc/tree-vect-loop.c 2020-07-09 10:43:23.920000000 +0800 diff --git a/fix-ICE-in-vect_stmt_to_vectorize.patch b/fix-ICE-in-vect_stmt_to_vectorize.patch index 67c9818..80229b5 100644 --- a/fix-ICE-in-vect_stmt_to_vectorize.patch +++ b/fix-ICE-in-vect_stmt_to_vectorize.patch @@ -1,3 +1,9 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. 
+ +0001-re-PR-tree-optimization-92252-ICE-Segmentation-fault.patch +97c6bea819ec0a773041308e62a7c05c33f093b0 + diff -Nurp a/gcc/testsuite/gcc.dg/torture/pr92252.c b/gcc/testsuite/gcc.dg/torture/pr92252.c --- a/gcc/testsuite/gcc.dg/torture/pr92252.c 1970-01-01 08:00:00.000000000 +0800 +++ b/gcc/testsuite/gcc.dg/torture/pr92252.c 2020-07-03 10:39:44.808000000 +0800 diff --git a/fix-ICE-in-vect_update_misalignment_for_peel.patch b/fix-ICE-in-vect_update_misalignment_for_peel.patch new file mode 100644 index 0000000..30a9548 --- /dev/null +++ b/fix-ICE-in-vect_update_misalignment_for_peel.patch @@ -0,0 +1,784 @@ +This backport contains 5 patchs from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + +8801ca5c28c3a9e9f36fa39a6a4455b48c8221fa +9ac1403ca2c65ba4f28cf051b5326617fa9298d1 +7e99af4816cfad578094fcf08e2377f3ed76e201 +ef8777c14ce8694f53eab7a88d24513cbf541ba4 +dccbf1e2a6e544f71b4a5795f0c79015db019fc3 + + +diff -Nurp a/gcc/testsuite/gcc.dg/vect/pr92677.c b/gcc/testsuite/gcc.dg/vect/pr92677.c +--- a/gcc/testsuite/gcc.dg/vect/pr92677.c 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/gcc.dg/vect/pr92677.c 2020-10-26 18:31:50.980000000 +0800 +@@ -0,0 +1,26 @@ ++/* { dg-do compile } */ ++/* { dg-additional-options "-O3" } */ ++ ++int a, c; ++int *b; ++long d; ++double *e; ++ ++void fn1() { ++ long f; ++ double g, h; ++ while (c) { ++ if (d) { ++ g = *e; ++ *(b + 4) = g; ++ } ++ if (f) { ++ h = *(e + 2); ++ *(b + 6) = h; ++ } ++ e += a; ++ b += 8; ++ c--; ++ d += 2; ++ } ++} +diff -Nurp a/gcc/testsuite/gcc.dg/vect/slp-46.c b/gcc/testsuite/gcc.dg/vect/slp-46.c +--- a/gcc/testsuite/gcc.dg/vect/slp-46.c 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/gcc.dg/vect/slp-46.c 2020-10-26 18:31:56.512000000 +0800 +@@ -0,0 +1,96 @@ ++/* { dg-require-effective-target vect_double } */ ++ ++#include "tree-vect.h" ++ ++double x[1024], y[1024]; ++ ++void __attribute__((noipa)) foo() ++{ ++ for (int i = 0; i < 512; ++i) ++ { ++ x[2*i] = y[i]; ++ x[2*i+1] = y[i]; ++ } ++} ++ ++void __attribute__((noipa)) bar() ++{ ++ for (int i = 0; i < 512; ++i) ++ { ++ x[2*i] = y[2*i]; ++ x[2*i+1] = y[2*i]; ++ } ++} ++ ++void __attribute__((noipa)) baz() ++{ ++ for (int i = 0; i < 512; ++i) ++ { ++ x[2*i] = y[511-i]; ++ x[2*i+1] = y[511-i]; ++ } ++} ++ ++void __attribute__((noipa)) boo() ++{ ++ for (int i = 0; i < 512; ++i) ++ { ++ x[2*i] = y[2*(511-i)]; ++ x[2*i+1] = y[2*(511-i)]; ++ } ++} ++ ++int ++main () ++{ ++ check_vect (); ++ ++ for (int i = 0; i < 1024; ++i) ++ { ++ x[i] = 0; ++ y[i] = i; ++ __asm__ volatile (""); ++ } ++ ++ foo (); ++ for (int i = 0; i < 1024; ++i) ++ if (x[i] != y[i/2]) ++ abort (); ++ ++ for (int i = 0; i < 1024; ++i) ++ { ++ x[i] = 0; ++ __asm__ volatile (""); ++ } ++ ++ bar (); ++ for (int i = 0; i < 1024; ++i) ++ if (x[i] != y[2*(i/2)]) ++ abort (); ++ ++ for (int i = 0; i < 1024; ++i) ++ { ++ x[i] = 0; ++ __asm__ volatile (""); ++ } ++ ++ baz (); ++ for (int i = 0; i < 1024; ++i) ++ if (x[i] != y[511 - i/2]) ++ abort (); ++ ++ for (int i = 0; i < 1024; ++i) ++ { ++ x[i] = 0; ++ __asm__ volatile (""); ++ } ++ ++ boo (); ++ for (int i = 0; i < 1024; ++i) ++ if (x[i] != y[2*(511 - i/2)]) ++ abort (); ++ ++ return 0; ++} ++ ++/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-5.c b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-5.c +--- a/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-5.c 1970-01-01 08:00:00.000000000 +0800 ++++ 
b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-5.c 2020-10-26 18:31:53.584000000 +0800 +@@ -0,0 +1,36 @@ ++#include "tree-vect.h" ++ ++#define N 512 ++ ++int a[N], b[N]; ++ ++int __attribute__((noipa)) ++foo (int aval, int bval) ++{ ++ int i, res = 0; ++ for (i=0; inum, + chrec_convert (type, evol, at_stmt), + code, rhs1, at_stmt); +- res = follow_ssa_edge +- (loop, SSA_NAME_DEF_STMT (rhs0), halting_phi, &evol, limit); ++ res = follow_ssa_edge_expr ++ (loop, at_stmt, rhs0, halting_phi, &evol, limit); + if (res == t_true) + *evolution_of_loop = evol; + else if (res == t_false) +@@ -979,8 +979,8 @@ follow_ssa_edge_binary (struct loop *loo + (loop->num, + chrec_convert (type, *evolution_of_loop, at_stmt), + code, rhs0, at_stmt); +- res = follow_ssa_edge +- (loop, SSA_NAME_DEF_STMT (rhs1), halting_phi, ++ res = follow_ssa_edge_expr ++ (loop, at_stmt, rhs1, halting_phi, + evolution_of_loop, limit); + if (res == t_true) + ; +@@ -1000,8 +1000,8 @@ follow_ssa_edge_binary (struct loop *loo + (loop->num, chrec_convert (type, *evolution_of_loop, + at_stmt), + code, rhs1, at_stmt); +- res = follow_ssa_edge +- (loop, SSA_NAME_DEF_STMT (rhs0), halting_phi, ++ res = follow_ssa_edge_expr ++ (loop, at_stmt, rhs0, halting_phi, + evolution_of_loop, limit); + if (res == t_true) + ; +@@ -1018,8 +1018,8 @@ follow_ssa_edge_binary (struct loop *loo + (loop->num, chrec_convert (type, *evolution_of_loop, + at_stmt), + code, rhs0, at_stmt); +- res = follow_ssa_edge +- (loop, SSA_NAME_DEF_STMT (rhs1), halting_phi, ++ res = follow_ssa_edge_expr ++ (loop, at_stmt, rhs1, halting_phi, + evolution_of_loop, limit); + if (res == t_true) + ; +@@ -1050,8 +1050,8 @@ follow_ssa_edge_binary (struct loop *loo + *evolution_of_loop = add_to_evolution + (loop->num, chrec_convert (type, *evolution_of_loop, at_stmt), + MINUS_EXPR, rhs1, at_stmt); +- res = follow_ssa_edge (loop, SSA_NAME_DEF_STMT (rhs0), halting_phi, +- evolution_of_loop, limit); ++ res = follow_ssa_edge_expr (loop, at_stmt, rhs0, halting_phi, ++ evolution_of_loop, limit); + if (res == t_true) + ; + else if (res == t_dont_know) +@@ -1071,140 +1071,6 @@ follow_ssa_edge_binary (struct loop *loo + return res; + } + +-/* Follow the ssa edge into the expression EXPR. +- Return true if the strongly connected component has been found. */ +- +-static t_bool +-follow_ssa_edge_expr (struct loop *loop, gimple *at_stmt, tree expr, +- gphi *halting_phi, tree *evolution_of_loop, +- int limit) +-{ +- enum tree_code code = TREE_CODE (expr); +- tree type = TREE_TYPE (expr), rhs0, rhs1; +- t_bool res; +- +- /* The EXPR is one of the following cases: +- - an SSA_NAME, +- - an INTEGER_CST, +- - a PLUS_EXPR, +- - a POINTER_PLUS_EXPR, +- - a MINUS_EXPR, +- - an ASSERT_EXPR, +- - other cases are not yet handled. */ +- +- switch (code) +- { +- CASE_CONVERT: +- /* This assignment is under the form "a_1 = (cast) rhs. */ +- res = follow_ssa_edge_expr (loop, at_stmt, TREE_OPERAND (expr, 0), +- halting_phi, evolution_of_loop, limit); +- *evolution_of_loop = chrec_convert (type, *evolution_of_loop, at_stmt); +- break; +- +- case INTEGER_CST: +- /* This assignment is under the form "a_1 = 7". */ +- res = t_false; +- break; +- +- case SSA_NAME: +- /* This assignment is under the form: "a_1 = b_2". */ +- res = follow_ssa_edge +- (loop, SSA_NAME_DEF_STMT (expr), halting_phi, evolution_of_loop, limit); +- break; +- +- case POINTER_PLUS_EXPR: +- case PLUS_EXPR: +- case MINUS_EXPR: +- /* This case is under the form "rhs0 +- rhs1". 
*/ +- rhs0 = TREE_OPERAND (expr, 0); +- rhs1 = TREE_OPERAND (expr, 1); +- type = TREE_TYPE (rhs0); +- STRIP_USELESS_TYPE_CONVERSION (rhs0); +- STRIP_USELESS_TYPE_CONVERSION (rhs1); +- res = follow_ssa_edge_binary (loop, at_stmt, type, rhs0, code, rhs1, +- halting_phi, evolution_of_loop, limit); +- break; +- +- case ADDR_EXPR: +- /* Handle &MEM[ptr + CST] which is equivalent to POINTER_PLUS_EXPR. */ +- if (TREE_CODE (TREE_OPERAND (expr, 0)) == MEM_REF) +- { +- expr = TREE_OPERAND (expr, 0); +- rhs0 = TREE_OPERAND (expr, 0); +- rhs1 = TREE_OPERAND (expr, 1); +- type = TREE_TYPE (rhs0); +- STRIP_USELESS_TYPE_CONVERSION (rhs0); +- STRIP_USELESS_TYPE_CONVERSION (rhs1); +- res = follow_ssa_edge_binary (loop, at_stmt, type, +- rhs0, POINTER_PLUS_EXPR, rhs1, +- halting_phi, evolution_of_loop, limit); +- } +- else +- res = t_false; +- break; +- +- case ASSERT_EXPR: +- /* This assignment is of the form: "a_1 = ASSERT_EXPR " +- It must be handled as a copy assignment of the form a_1 = a_2. */ +- rhs0 = ASSERT_EXPR_VAR (expr); +- if (TREE_CODE (rhs0) == SSA_NAME) +- res = follow_ssa_edge (loop, SSA_NAME_DEF_STMT (rhs0), +- halting_phi, evolution_of_loop, limit); +- else +- res = t_false; +- break; +- +- default: +- res = t_false; +- break; +- } +- +- return res; +-} +- +-/* Follow the ssa edge into the right hand side of an assignment STMT. +- Return true if the strongly connected component has been found. */ +- +-static t_bool +-follow_ssa_edge_in_rhs (struct loop *loop, gimple *stmt, +- gphi *halting_phi, tree *evolution_of_loop, +- int limit) +-{ +- enum tree_code code = gimple_assign_rhs_code (stmt); +- tree type = gimple_expr_type (stmt), rhs1, rhs2; +- t_bool res; +- +- switch (code) +- { +- CASE_CONVERT: +- /* This assignment is under the form "a_1 = (cast) rhs. */ +- res = follow_ssa_edge_expr (loop, stmt, gimple_assign_rhs1 (stmt), +- halting_phi, evolution_of_loop, limit); +- *evolution_of_loop = chrec_convert (type, *evolution_of_loop, stmt); +- break; +- +- case POINTER_PLUS_EXPR: +- case PLUS_EXPR: +- case MINUS_EXPR: +- rhs1 = gimple_assign_rhs1 (stmt); +- rhs2 = gimple_assign_rhs2 (stmt); +- type = TREE_TYPE (rhs1); +- res = follow_ssa_edge_binary (loop, stmt, type, rhs1, code, rhs2, +- halting_phi, evolution_of_loop, limit); +- break; +- +- default: +- if (get_gimple_rhs_class (code) == GIMPLE_SINGLE_RHS) +- res = follow_ssa_edge_expr (loop, stmt, gimple_assign_rhs1 (stmt), +- halting_phi, evolution_of_loop, limit); +- else +- res = t_false; +- break; +- } +- +- return res; +-} +- + /* Checks whether the I-th argument of a PHI comes from a backedge. */ + + static bool +@@ -1244,8 +1110,8 @@ follow_ssa_edge_in_condition_phi_branch + if (TREE_CODE (branch) == SSA_NAME) + { + *evolution_of_branch = init_cond; +- return follow_ssa_edge (loop, SSA_NAME_DEF_STMT (branch), halting_phi, +- evolution_of_branch, limit); ++ return follow_ssa_edge_expr (loop, condition_phi, branch, halting_phi, ++ evolution_of_branch, limit); + } + + /* This case occurs when one of the condition branches sets +@@ -1352,65 +1218,158 @@ follow_ssa_edge_inner_loop_phi (struct l + evolution_of_loop, limit); + } + +-/* Follow an SSA edge from a loop-phi-node to itself, constructing a +- path that is analyzed on the return walk. */ ++/* Follow the ssa edge into the expression EXPR. ++ Return true if the strongly connected component has been found. 
*/ + + static t_bool +-follow_ssa_edge (struct loop *loop, gimple *def, gphi *halting_phi, +- tree *evolution_of_loop, int limit) ++follow_ssa_edge_expr (struct loop *loop, gimple *at_stmt, tree expr, ++ gphi *halting_phi, tree *evolution_of_loop, ++ int limit) + { +- struct loop *def_loop; ++ enum tree_code code; ++ tree type, rhs0, rhs1 = NULL_TREE; + +- if (gimple_nop_p (def)) +- return t_false; ++ /* The EXPR is one of the following cases: ++ - an SSA_NAME, ++ - an INTEGER_CST, ++ - a PLUS_EXPR, ++ - a POINTER_PLUS_EXPR, ++ - a MINUS_EXPR, ++ - an ASSERT_EXPR, ++ - other cases are not yet handled. */ + +- /* Give up if the path is longer than the MAX that we allow. */ +- if (limit > PARAM_VALUE (PARAM_SCEV_MAX_EXPR_COMPLEXITY)) +- return t_dont_know; +- +- def_loop = loop_containing_stmt (def); +- +- switch (gimple_code (def)) +- { +- case GIMPLE_PHI: +- if (!loop_phi_node_p (def)) +- /* DEF is a condition-phi-node. Follow the branches, and +- record their evolutions. Finally, merge the collected +- information and set the approximation to the main +- variable. */ +- return follow_ssa_edge_in_condition_phi +- (loop, as_a (def), halting_phi, evolution_of_loop, +- limit); +- +- /* When the analyzed phi is the halting_phi, the +- depth-first search is over: we have found a path from +- the halting_phi to itself in the loop. */ +- if (def == halting_phi) +- return t_true; ++ /* For SSA_NAME look at the definition statement, handling ++ PHI nodes and otherwise expand appropriately for the expression ++ handling below. */ ++ if (TREE_CODE (expr) == SSA_NAME) ++ { ++ gimple *def = SSA_NAME_DEF_STMT (expr); + +- /* Otherwise, the evolution of the HALTING_PHI depends +- on the evolution of another loop-phi-node, i.e. the +- evolution function is a higher degree polynomial. */ +- if (def_loop == loop) ++ if (gimple_nop_p (def)) + return t_false; + +- /* Inner loop. */ +- if (flow_loop_nested_p (loop, def_loop)) +- return follow_ssa_edge_inner_loop_phi +- (loop, as_a (def), halting_phi, evolution_of_loop, +- limit + 1); ++ /* Give up if the path is longer than the MAX that we allow. */ ++ if (limit > PARAM_VALUE (PARAM_SCEV_MAX_EXPR_COMPLEXITY)) ++ return t_dont_know; + +- /* Outer loop. */ +- return t_false; ++ if (gphi *phi = dyn_cast (def)) ++ { ++ if (!loop_phi_node_p (phi)) ++ /* DEF is a condition-phi-node. Follow the branches, and ++ record their evolutions. Finally, merge the collected ++ information and set the approximation to the main ++ variable. */ ++ return follow_ssa_edge_in_condition_phi ++ (loop, phi, halting_phi, evolution_of_loop, limit); ++ ++ /* When the analyzed phi is the halting_phi, the ++ depth-first search is over: we have found a path from ++ the halting_phi to itself in the loop. */ ++ if (phi == halting_phi) ++ return t_true; ++ ++ /* Otherwise, the evolution of the HALTING_PHI depends ++ on the evolution of another loop-phi-node, i.e. the ++ evolution function is a higher degree polynomial. */ ++ class loop *def_loop = loop_containing_stmt (def); ++ if (def_loop == loop) ++ return t_false; ++ ++ /* Inner loop. */ ++ if (flow_loop_nested_p (loop, def_loop)) ++ return follow_ssa_edge_inner_loop_phi ++ (loop, phi, halting_phi, evolution_of_loop, ++ limit + 1); + +- case GIMPLE_ASSIGN: +- return follow_ssa_edge_in_rhs (loop, def, halting_phi, +- evolution_of_loop, limit); ++ /* Outer loop. */ ++ return t_false; ++ } + +- default: + /* At this level of abstraction, the program is just a set + of GIMPLE_ASSIGNs and PHI_NODEs. 
In principle there is no +- other node to be handled. */ ++ other def to be handled. */ ++ if (!is_gimple_assign (def)) ++ return t_false; ++ ++ code = gimple_assign_rhs_code (def); ++ switch (get_gimple_rhs_class (code)) ++ { ++ case GIMPLE_BINARY_RHS: ++ rhs0 = gimple_assign_rhs1 (def); ++ rhs1 = gimple_assign_rhs2 (def); ++ break; ++ case GIMPLE_UNARY_RHS: ++ case GIMPLE_SINGLE_RHS: ++ rhs0 = gimple_assign_rhs1 (def); ++ break; ++ default: ++ return t_false; ++ } ++ type = TREE_TYPE (gimple_assign_lhs (def)); ++ at_stmt = def; ++ } ++ else ++ { ++ code = TREE_CODE (expr); ++ type = TREE_TYPE (expr); ++ switch (code) ++ { ++ CASE_CONVERT: ++ rhs0 = TREE_OPERAND (expr, 0); ++ break; ++ case POINTER_PLUS_EXPR: ++ case PLUS_EXPR: ++ case MINUS_EXPR: ++ rhs0 = TREE_OPERAND (expr, 0); ++ rhs1 = TREE_OPERAND (expr, 1); ++ break; ++ default: ++ rhs0 = expr; ++ } ++ } ++ ++ switch (code) ++ { ++ CASE_CONVERT: ++ { ++ /* This assignment is under the form "a_1 = (cast) rhs. */ ++ t_bool res = follow_ssa_edge_expr (loop, at_stmt, rhs0, halting_phi, ++ evolution_of_loop, limit); ++ *evolution_of_loop = chrec_convert (type, *evolution_of_loop, at_stmt); ++ return res; ++ } ++ ++ case INTEGER_CST: ++ /* This assignment is under the form "a_1 = 7". */ ++ return t_false; ++ ++ case ADDR_EXPR: ++ { ++ /* Handle &MEM[ptr + CST] which is equivalent to POINTER_PLUS_EXPR. */ ++ if (TREE_CODE (TREE_OPERAND (rhs0, 0)) != MEM_REF) ++ return t_false; ++ tree mem = TREE_OPERAND (rhs0, 0); ++ rhs0 = TREE_OPERAND (mem, 0); ++ rhs1 = TREE_OPERAND (mem, 1); ++ code = POINTER_PLUS_EXPR; ++ } ++ /* Fallthru. */ ++ case POINTER_PLUS_EXPR: ++ case PLUS_EXPR: ++ case MINUS_EXPR: ++ /* This case is under the form "rhs0 +- rhs1". */ ++ STRIP_USELESS_TYPE_CONVERSION (rhs0); ++ STRIP_USELESS_TYPE_CONVERSION (rhs1); ++ return follow_ssa_edge_binary (loop, at_stmt, type, rhs0, code, rhs1, ++ halting_phi, evolution_of_loop, limit); ++ ++ case ASSERT_EXPR: ++ /* This assignment is of the form: "a_1 = ASSERT_EXPR " ++ It must be handled as a copy assignment of the form a_1 = a_2. */ ++ return follow_ssa_edge_expr (loop, at_stmt, ASSERT_EXPR_VAR (rhs0), ++ halting_phi, evolution_of_loop, limit); ++ ++ default: + return t_false; + } + } +@@ -1504,7 +1463,6 @@ analyze_evolution_in_loop (gphi *loop_ph + for (i = 0; i < n; i++) + { + tree arg = PHI_ARG_DEF (loop_phi_node, i); +- gimple *ssa_chain; + tree ev_fn; + t_bool res; + +@@ -1517,11 +1475,10 @@ analyze_evolution_in_loop (gphi *loop_ph + { + bool val = false; + +- ssa_chain = SSA_NAME_DEF_STMT (arg); +- + /* Pass in the initial condition to the follow edge function. */ + ev_fn = init_cond; +- res = follow_ssa_edge (loop, ssa_chain, loop_phi_node, &ev_fn, 0); ++ res = follow_ssa_edge_expr (loop, loop_phi_node, arg, ++ loop_phi_node, &ev_fn, 0); + + /* If ev_fn has no evolution in the inner loop, and the + init_cond is not equal to ev_fn, then we have an +diff -Nurp a/gcc/tree-ssa-sccvn.c b/gcc/tree-ssa-sccvn.c +--- a/gcc/tree-ssa-sccvn.c 2020-10-26 18:28:58.736000000 +0800 ++++ b/gcc/tree-ssa-sccvn.c 2020-10-26 18:31:45.768000000 +0800 +@@ -2456,7 +2456,8 @@ vn_reference_lookup_3 (ao_ref *ref, tree + (vuse, vr->set, vr->type, vr->operands, val); + } + /* For now handle clearing memory with partial defs. 
*/ +- else if (integer_zerop (gimple_call_arg (def_stmt, 1)) ++ else if (known_eq (ref->size, maxsize) ++ && integer_zerop (gimple_call_arg (def_stmt, 1)) + && tree_to_poly_int64 (len).is_constant (&leni) + && offset.is_constant (&offseti) + && offset2.is_constant (&offset2i) +@@ -2494,7 +2495,8 @@ vn_reference_lookup_3 (ao_ref *ref, tree + return vn_reference_lookup_or_insert_for_pieces + (vuse, vr->set, vr->type, vr->operands, val); + } +- else if (maxsize.is_constant (&maxsizei) ++ else if (known_eq (ref->size, maxsize) ++ && maxsize.is_constant (&maxsizei) + && maxsizei % BITS_PER_UNIT == 0 + && offset.is_constant (&offseti) + && offseti % BITS_PER_UNIT == 0 +diff -Nurp a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c +--- a/gcc/tree-vect-data-refs.c 2020-10-26 18:28:58.792000000 +0800 ++++ b/gcc/tree-vect-data-refs.c 2020-10-26 18:31:56.512000000 +0800 +@@ -1045,7 +1045,7 @@ vect_compute_data_ref_alignment (dr_vec_ + if (tree_int_cst_sgn (drb->step) < 0) + /* PLUS because STEP is negative. */ + misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1) +- * TREE_INT_CST_LOW (drb->step)); ++ * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype)))); + + unsigned int const_misalignment; + if (!known_misalignment (misalignment, vect_align_c, &const_misalignment)) +diff -Nurp a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c +--- a/gcc/tree-vect-loop.c 2020-10-26 18:28:58.728000000 +0800 ++++ b/gcc/tree-vect-loop.c 2020-10-26 18:31:53.584000000 +0800 +@@ -1850,7 +1850,10 @@ vect_dissolve_slp_only_groups (loop_vec_ + DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo; + DR_GROUP_NEXT_ELEMENT (vinfo) = NULL; + DR_GROUP_SIZE (vinfo) = 1; +- DR_GROUP_GAP (vinfo) = group_size - 1; ++ if (STMT_VINFO_STRIDED_P (first_element)) ++ DR_GROUP_GAP (vinfo) = 0; ++ else ++ DR_GROUP_GAP (vinfo) = group_size - 1; + vinfo = next; + } + } +@@ -4516,18 +4519,26 @@ vect_create_epilog_for_reduction (stmt_v + zeroes. */ + if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION) + { ++ auto_vec, 2> ccompares; + stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info); + cond_info = vect_stmt_to_vectorize (cond_info); +- while (gimple_assign_rhs_code (cond_info->stmt) != COND_EXPR) ++ while (cond_info != reduc_info) + { ++ if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR) ++ { ++ gimple *vec_stmt = STMT_VINFO_VEC_STMT (cond_info)->stmt; ++ gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR); ++ ccompares.safe_push ++ (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)), ++ STMT_VINFO_REDUC_IDX (cond_info) == 2)); ++ } + cond_info + = loop_vinfo->lookup_def (gimple_op (cond_info->stmt, + 1 + STMT_VINFO_REDUC_IDX + (cond_info))); + cond_info = vect_stmt_to_vectorize (cond_info); + } +- gimple *vec_stmt = STMT_VINFO_VEC_STMT (cond_info)->stmt; +- gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR); ++ gcc_assert (ccompares.length () != 0); + + tree indx_before_incr, indx_after_incr; + poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype); +@@ -4569,37 +4580,35 @@ vect_create_epilog_for_reduction (stmt_v + add_phi_arg (as_a (new_phi), vec_zero, + loop_preheader_edge (loop), UNKNOWN_LOCATION); + +- /* Now take the condition from the loops original cond_expr +- (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for ++ /* Now take the condition from the loops original cond_exprs ++ and produce a new cond_exprs (INDEX_COND_EXPR) which for + every match uses values from the induction variable + (INDEX_BEFORE_INCR) otherwise uses values from the phi node + (NEW_PHI_TREE). 
+ Finally, we update the phi (NEW_PHI_TREE) to take the value of + the new cond_expr (INDEX_COND_EXPR). */ +- +- /* Duplicate the condition from vec_stmt. */ +- tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt)); +- +- /* Create a conditional, where the condition is taken from vec_stmt +- (CCOMPARE). The then and else values mirror the main VEC_COND_EXPR: +- the reduction phi corresponds to NEW_PHI_TREE and the new values +- correspond to INDEX_BEFORE_INCR. */ +- gcc_assert (STMT_VINFO_REDUC_IDX (cond_info) >= 1); +- tree index_cond_expr; +- if (STMT_VINFO_REDUC_IDX (cond_info) == 2) +- index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type, +- ccompare, indx_before_incr, new_phi_tree); +- else +- index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type, +- ccompare, new_phi_tree, indx_before_incr); +- induction_index = make_ssa_name (cr_index_vector_type); +- gimple *index_condition = gimple_build_assign (induction_index, +- index_cond_expr); +- gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT); +- stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition); ++ gimple_seq stmts = NULL; ++ for (int i = ccompares.length () - 1; i != -1; --i) ++ { ++ tree ccompare = ccompares[i].first; ++ if (ccompares[i].second) ++ new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR, ++ cr_index_vector_type, ++ ccompare, ++ indx_before_incr, new_phi_tree); ++ else ++ new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR, ++ cr_index_vector_type, ++ ccompare, ++ new_phi_tree, indx_before_incr); ++ } ++ gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT); ++ stmt_vec_info index_vec_info ++ = loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (new_phi_tree)); + STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type; + + /* Update the phi with the vec cond. */ ++ induction_index = new_phi_tree; + add_phi_arg (as_a (new_phi), induction_index, + loop_latch_edge (loop), UNKNOWN_LOCATION); + } diff --git a/fix-ICE-in-verify_ssa.patch b/fix-ICE-in-verify_ssa.patch index 056c276..fa48fcb 100644 --- a/fix-ICE-in-verify_ssa.patch +++ b/fix-ICE-in-verify_ssa.patch @@ -1,3 +1,9 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + +0001-re-PR-tree-optimization-92461-ICE-verify_ssa-failed-.patch +830d1b18526dd1f085e8a2e1467a6dde18fc6434 + diff -Nurp a/gcc/testsuite/gcc.dg/torture/pr92461.c b/gcc/testsuite/gcc.dg/torture/pr92461.c --- a/gcc/testsuite/gcc.dg/torture/pr92461.c 1970-01-01 08:00:00.000000000 +0800 +++ b/gcc/testsuite/gcc.dg/torture/pr92461.c 2020-07-28 19:48:09.324000000 +0800 diff --git a/fix-ICE-when-vectorizing-nested-cycles.patch b/fix-ICE-when-vectorizing-nested-cycles.patch index d8a5b69..2aa3f46 100644 --- a/fix-ICE-when-vectorizing-nested-cycles.patch +++ b/fix-ICE-when-vectorizing-nested-cycles.patch @@ -1,3 +1,9 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. 
+ +0001-tree-optimization-96698-fix-ICE-when-vectorizing-nes.patch +2130efe6ac7beba72d289e3dd145daa10aeaed54 + diff -uprN a/gcc/testsuite/gcc.dg/vect/pr96698.c b/gcc/testsuite/gcc.dg/vect/pr96698.c --- a/gcc/testsuite/gcc.dg/vect/pr96698.c 1970-01-01 08:00:00.000000000 +0800 +++ b/gcc/testsuite/gcc.dg/vect/pr96698.c 2020-08-27 17:53:24.396000000 +0800 diff --git a/fix-PR-92351-When-peeling-for-alignment.patch b/fix-PR-92351-When-peeling-for-alignment.patch new file mode 100644 index 0000000..88866e6 --- /dev/null +++ b/fix-PR-92351-When-peeling-for-alignment.patch @@ -0,0 +1,152 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + +0001-vect-PR-92351-When-peeling-for-alignment-make-alignm.patch +4e9d58d16767b1bc686f0c4b3bd2da25dc71e8f3 + +diff --git a/gcc/testsuite/gcc.dg/vect/vect-peel-2-epilogues.c b/gcc/testsuite/gcc.dg/vect/vect-peel-2-epilogues.c +new file mode 100644 +index 00000000000..c06fa442faf +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/vect-peel-2-epilogues.c +@@ -0,0 +1,3 @@ ++/* { dg-require-effective-target vect_int } */ ++ ++#include "vect-peel-2-src.c" +diff --git a/gcc/testsuite/gcc.dg/vect/vect-peel-2-src.c b/gcc/testsuite/gcc.dg/vect/vect-peel-2-src.c +new file mode 100644 +index 00000000000..f6fc134c870 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/vect-peel-2-src.c +@@ -0,0 +1,48 @@ ++#include ++#include "tree-vect.h" ++ ++#define N 128 ++ ++/* unaligned store. */ ++ ++int ib[N+7]; ++ ++__attribute__ ((noinline)) ++int main1 () ++{ ++ int i; ++ int ia[N+1]; ++ ++ /* The store is aligned and the loads are misaligned with the same ++ misalignment. Cost model is disabled. If misaligned stores are supported, ++ we peel according to the loads to align them. */ ++ for (i = 0; i <= N; i++) ++ { ++ ia[i] = ib[i+2] + ib[i+6]; ++ } ++ ++ /* check results: */ ++ for (i = 1; i <= N; i++) ++ { ++ if (ia[i] != ib[i+2] + ib[i+6]) ++ abort (); ++ } ++ ++ return 0; ++} ++ ++int main (void) ++{ ++ int i; ++ ++ check_vect (); ++ ++ for (i = 0; i <= N+6; i++) ++ { ++ asm volatile ("" : "+r" (i)); ++ ib[i] = i; ++ } ++ ++ return main1 (); ++} ++ +diff --git a/gcc/testsuite/gcc.dg/vect/vect-peel-2.c b/gcc/testsuite/gcc.dg/vect/vect-peel-2.c +index b6061c3b855..65e70bd4417 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect-peel-2.c ++++ b/gcc/testsuite/gcc.dg/vect/vect-peel-2.c +@@ -1,52 +1,8 @@ + /* { dg-require-effective-target vect_int } */ ++/* Disabling epilogues until we find a better way to deal with scans. */ ++/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + +-#include +-#include "tree-vect.h" +- +-#define N 128 +- +-/* unaligned store. */ +- +-int ib[N+7]; +- +-__attribute__ ((noinline)) +-int main1 () +-{ +- int i; +- int ia[N+1]; +- +- /* The store is aligned and the loads are misaligned with the same +- misalignment. Cost model is disabled. If misaligned stores are supported, +- we peel according to the loads to align them. 
*/ +- for (i = 0; i <= N; i++) +- { +- ia[i] = ib[i+2] + ib[i+6]; +- } +- +- /* check results: */ +- for (i = 1; i <= N; i++) +- { +- if (ia[i] != ib[i+2] + ib[i+6]) +- abort (); +- } +- +- return 0; +-} +- +-int main (void) +-{ +- int i; +- +- check_vect (); +- +- for (i = 0; i <= N+6; i++) +- { +- asm volatile ("" : "+r" (i)); +- ib[i] = i; +- } +- +- return main1 (); +-} ++#include "vect-peel-2-src.c" + + /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ + /* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 1 "vect" { target { { vect_element_align } && { vect_aligned_arrays } } } } } */ +diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c +index 36639b697f1..88f14e73d65 100644 +--- a/gcc/tree-vect-data-refs.c ++++ b/gcc/tree-vect-data-refs.c +@@ -938,6 +938,18 @@ vect_compute_data_ref_alignment (dr_vec_info *dr_info) + = exact_div (vect_calculate_target_alignment (dr_info), BITS_PER_UNIT); + DR_TARGET_ALIGNMENT (dr_info) = vector_alignment; + ++ /* If the main loop has peeled for alignment we have no way of knowing ++ whether the data accesses in the epilogues are aligned. We can't at ++ compile time answer the question whether we have entered the main loop or ++ not. Fixes PR 92351. */ ++ if (loop_vinfo) ++ { ++ loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo); ++ if (orig_loop_vinfo ++ && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0) ++ return; ++ } ++ + unsigned HOST_WIDE_INT vect_align_c; + if (!vector_alignment.is_constant (&vect_align_c)) + return; diff --git a/fix-addlosymdi-ICE-in-pass-reload.patch b/fix-addlosymdi-ICE-in-pass-reload.patch new file mode 100644 index 0000000..409a3ea --- /dev/null +++ b/fix-addlosymdi-ICE-in-pass-reload.patch @@ -0,0 +1,30 @@ +diff -uprN a/gcc/lra.c b/gcc/lra.c +--- a/gcc/lra.c 2020-12-14 15:26:36.331633230 +0800 ++++ b/gcc/lra.c 2020-12-15 18:56:33.699633230 +0800 +@@ -507,6 +507,26 @@ lra_emit_move (rtx x, rtx y) + data. */ + if (old != max_reg_num ()) + expand_reg_data (old); ++ while (insn != NULL) ++ { ++ if (GET_CODE (PATTERN (insn)) == SET ++ && GET_CODE (SET_SRC (PATTERN (insn))) == LO_SUM ++ && GET_CODE (SET_DEST (PATTERN (insn))) == REG ++ && strcmp (insn_data[recog_memoized (insn)].name, ++ "add_losym_di") == 0) ++ { ++ rtx add_losym_dest = SET_DEST (PATTERN (insn)); ++ for (int i = (int) max_reg_num () - 1; i >= old; i--) ++ { ++ if (regno_reg_rtx[i] == add_losym_dest) ++ { ++ setup_reg_classes (i, GENERAL_REGS, ++ NO_REGS, GENERAL_REGS); ++ } ++ } ++ } ++ insn = PREV_INSN (insn); ++ } + return; + } + lra_emit_add (x, XEXP (y, 0), XEXP (y, 1)); diff --git a/fix-an-ICE-in-vect_recog_mask_conversion_pattern.patch b/fix-an-ICE-in-vect_recog_mask_conversion_pattern.patch new file mode 100644 index 0000000..fc236e9 --- /dev/null +++ b/fix-an-ICE-in-vect_recog_mask_conversion_pattern.patch @@ -0,0 +1,115 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. 
+ +0001-aarch64-Fix-an-ICE-in-vect_recog_mask_conversion_pattern.patch: +91d80cf4bd2827dd9c40fe6a7c719c909d79083d + +diff -Nurp a/gcc/testsuite/gcc.target/aarch64/pr96757.c b/gcc/testsuite/gcc.target/aarch64/pr96757.c +--- a/gcc/testsuite/gcc.target/aarch64/pr96757.c 1969-12-31 19:00:00.000000000 -0500 ++++ b/gcc/testsuite/gcc.target/aarch64/pr96757.c 2020-10-12 08:32:12.192000000 -0400 +@@ -0,0 +1,23 @@ ++/* PR target/96757 */ ++/* { dg-do compile } */ ++/* { dg-options "-O3" } */ ++ ++short ++fun1(short i, short j) ++{ ++ return i * j; ++} ++ ++int ++fun(int a, int b, int c) ++{ ++ int *v, z, k, m; ++ short f, d; ++ for (int i=0; i m; ++ z = f > k; ++ *v += fun1(z,b); ++ } ++} +diff -Nurp a/gcc/tree-vect-patterns.c b/gcc/tree-vect-patterns.c +--- a/gcc/tree-vect-patterns.c 2020-10-12 08:05:18.924000000 -0400 ++++ b/gcc/tree-vect-patterns.c 2020-10-12 08:50:56.996000000 -0400 +@@ -3917,6 +3917,8 @@ vect_recog_mask_conversion_pattern (stmt + tree vectype1, vectype2; + stmt_vec_info pattern_stmt_info; + vec_info *vinfo = stmt_vinfo->vinfo; ++ tree rhs1_op0 = NULL_TREE, rhs1_op1 = NULL_TREE; ++ tree rhs1_op0_type = NULL_TREE, rhs1_op1_type = NULL_TREE; + + /* Check for MASK_LOAD ans MASK_STORE calls requiring mask conversion. */ + if (is_gimple_call (last_stmt) +@@ -4016,9 +4018,37 @@ vect_recog_mask_conversion_pattern (stmt + + it is better for b1 and b2 to use the mask type associated + with int elements rather bool (byte) elements. */ +- rhs1_type = search_type_for_mask (TREE_OPERAND (rhs1, 0), vinfo); +- if (!rhs1_type) +- rhs1_type = TREE_TYPE (TREE_OPERAND (rhs1, 0)); ++ rhs1_op0 = TREE_OPERAND (rhs1, 0); ++ rhs1_op1 = TREE_OPERAND (rhs1, 1); ++ if (!rhs1_op0 || !rhs1_op1) ++ return NULL; ++ rhs1_op0_type = search_type_for_mask (rhs1_op0, vinfo); ++ rhs1_op1_type = search_type_for_mask (rhs1_op1, vinfo); ++ ++ if (!rhs1_op0_type) ++ rhs1_type = TREE_TYPE (rhs1_op0); ++ else if (!rhs1_op1_type) ++ rhs1_type = TREE_TYPE (rhs1_op1); ++ else if (TYPE_PRECISION (rhs1_op0_type) ++ != TYPE_PRECISION (rhs1_op1_type)) ++ { ++ int tmp0 = (int) TYPE_PRECISION (rhs1_op0_type) ++ - (int) TYPE_PRECISION (TREE_TYPE (lhs)); ++ int tmp1 = (int) TYPE_PRECISION (rhs1_op1_type) ++ - (int) TYPE_PRECISION (TREE_TYPE (lhs)); ++ if ((tmp0 > 0 && tmp1 > 0) || (tmp0 < 0 && tmp1 < 0)) ++ { ++ if (abs (tmp0) > abs (tmp1)) ++ rhs1_type = rhs1_op1_type; ++ else ++ rhs1_type = rhs1_op0_type; ++ } ++ else ++ rhs1_type = build_nonstandard_integer_type ++ (TYPE_PRECISION (TREE_TYPE (lhs)), 1); ++ } ++ else ++ rhs1_type = rhs1_op0_type; + } + else + return NULL; +@@ -4036,8 +4066,8 @@ vect_recog_mask_conversion_pattern (stmt + name from the outset. 
*/ + if (known_eq (TYPE_VECTOR_SUBPARTS (vectype1), + TYPE_VECTOR_SUBPARTS (vectype2)) +- && (TREE_CODE (rhs1) == SSA_NAME +- || rhs1_type == TREE_TYPE (TREE_OPERAND (rhs1, 0)))) ++ && !rhs1_op0_type ++ && !rhs1_op1_type) + return NULL; + + /* If rhs1 is invariant and we can promote it leave the COND_EXPR +@@ -4069,7 +4099,16 @@ vect_recog_mask_conversion_pattern (stmt + if (TREE_CODE (rhs1) != SSA_NAME) + { + tmp = vect_recog_temp_ssa_var (TREE_TYPE (rhs1), NULL); +- pattern_stmt = gimple_build_assign (tmp, rhs1); ++ if (rhs1_op0_type ++ && TYPE_PRECISION (rhs1_op0_type) != TYPE_PRECISION (rhs1_type)) ++ rhs1_op0 = build_mask_conversion (rhs1_op0, ++ vectype2, stmt_vinfo); ++ if (rhs1_op1_type ++ && TYPE_PRECISION (rhs1_op1_type) != TYPE_PRECISION (rhs1_type)) ++ rhs1_op1 = build_mask_conversion (rhs1_op1, ++ vectype2, stmt_vinfo); ++ pattern_stmt = gimple_build_assign (tmp, TREE_CODE (rhs1), ++ rhs1_op0, rhs1_op1); + rhs1 = tmp; + append_pattern_def_seq (stmt_vinfo, pattern_stmt, vectype2); + } diff --git a/fix-avx512vl-vcvttpd2dq-2-fail.patch b/fix-avx512vl-vcvttpd2dq-2-fail.patch new file mode 100644 index 0000000..60afadd --- /dev/null +++ b/fix-avx512vl-vcvttpd2dq-2-fail.patch @@ -0,0 +1,301 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + +0001-re-PR-target-91124-gcc.target-i386-avx512vl-vpshldvd.patch +946732df902dbb23dd44abe97fea41e154e6e5f9 + +diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md +index 3ce22395c65..12d6dc0cb7e 100644 +--- a/gcc/config/i386/sse.md ++++ b/gcc/config/i386/sse.md +@@ -5927,16 +5927,16 @@ + (set_attr "btver2_decode" "vector") + (set_attr "mode" "OI")]) + +-(define_insn "sse2_cvtpd2dq" ++(define_insn "sse2_cvtpd2dq" + [(set (match_operand:V4SI 0 "register_operand" "=v") + (vec_concat:V4SI + (unspec:V2SI [(match_operand:V2DF 1 "vector_operand" "vBm")] + UNSPEC_FIX_NOTRUNC) + (const_vector:V2SI [(const_int 0) (const_int 0)])))] +- "TARGET_SSE2 && " ++ "TARGET_SSE2" + { + if (TARGET_AVX) +- return "vcvtpd2dq{x}\t{%1, %0|%0, %1}"; ++ return "vcvtpd2dq{x}\t{%1, %0|%0, %1}"; + else + return "cvtpd2dq\t{%1, %0|%0, %1}"; + } +@@ -5949,6 +5949,38 @@ + (set_attr "athlon_decode" "vector") + (set_attr "bdver1_decode" "double")]) + ++(define_insn "sse2_cvtpd2dq_mask" ++ [(set (match_operand:V4SI 0 "register_operand" "=v") ++ (vec_concat:V4SI ++ (vec_merge:V2SI ++ (unspec:V2SI [(match_operand:V2DF 1 "nonimmediate_operand" "vm")] ++ UNSPEC_FIX_NOTRUNC) ++ (vec_select:V2SI ++ (match_operand:V4SI 2 "nonimm_or_0_operand" "0C") ++ (parallel [(const_int 0) (const_int 1)])) ++ (match_operand:QI 3 "register_operand" "Yk")) ++ (const_vector:V2SI [(const_int 0) (const_int 0)])))] ++ "TARGET_AVX512VL" ++ "vcvtpd2dq{x}\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}" ++ [(set_attr "type" "ssecvt") ++ (set_attr "prefix" "evex") ++ (set_attr "mode" "TI")]) ++ ++(define_insn "*sse2_cvtpd2dq_mask_1" ++ [(set (match_operand:V4SI 0 "register_operand" "=v") ++ (vec_concat:V4SI ++ (vec_merge:V2SI ++ (unspec:V2SI [(match_operand:V2DF 1 "nonimmediate_operand" "vm")] ++ UNSPEC_FIX_NOTRUNC) ++ (const_vector:V2SI [(const_int 0) (const_int 0)]) ++ (match_operand:QI 2 "register_operand" "Yk")) ++ (const_vector:V2SI [(const_int 0) (const_int 0)])))] ++ "TARGET_AVX512VL" ++ "vcvtpd2dq{x}\t{%1, %0%{%2%}%{z%}|%0%{%2%}%{z%}, %1}" ++ [(set_attr "type" "ssecvt") ++ (set_attr "prefix" "evex") ++ (set_attr "mode" "TI")]) ++ + ;; For ufix_notrunc* insn patterns + (define_mode_attr pd2udqsuff + [(V8DF "") (V4DF "{y}")]) +@@ 
-5964,15 +5996,49 @@ + (set_attr "prefix" "evex") + (set_attr "mode" "")]) + +-(define_insn "ufix_notruncv2dfv2si2" ++(define_insn "ufix_notruncv2dfv2si2" + [(set (match_operand:V4SI 0 "register_operand" "=v") + (vec_concat:V4SI + (unspec:V2SI + [(match_operand:V2DF 1 "nonimmediate_operand" "vm")] +- UNSPEC_UNSIGNED_FIX_NOTRUNC) ++ UNSPEC_UNSIGNED_FIX_NOTRUNC) + (const_vector:V2SI [(const_int 0) (const_int 0)])))] + "TARGET_AVX512VL" +- "vcvtpd2udq{x}\t{%1, %0|%0, %1}" ++ "vcvtpd2udq{x}\t{%1, %0|%0, %1}" ++ [(set_attr "type" "ssecvt") ++ (set_attr "prefix" "evex") ++ (set_attr "mode" "TI")]) ++ ++(define_insn "ufix_notruncv2dfv2si2_mask" ++ [(set (match_operand:V4SI 0 "register_operand" "=v") ++ (vec_concat:V4SI ++ (vec_merge:V2SI ++ (unspec:V2SI ++ [(match_operand:V2DF 1 "nonimmediate_operand" "vm")] ++ UNSPEC_UNSIGNED_FIX_NOTRUNC) ++ (vec_select:V2SI ++ (match_operand:V4SI 2 "nonimm_or_0_operand" "0C") ++ (parallel [(const_int 0) (const_int 1)])) ++ (match_operand:QI 3 "register_operand" "Yk")) ++ (const_vector:V2SI [(const_int 0) (const_int 0)])))] ++ "TARGET_AVX512VL" ++ "vcvtpd2udq{x}\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}" ++ [(set_attr "type" "ssecvt") ++ (set_attr "prefix" "evex") ++ (set_attr "mode" "TI")]) ++ ++(define_insn "*ufix_notruncv2dfv2si2_mask_1" ++ [(set (match_operand:V4SI 0 "register_operand" "=v") ++ (vec_concat:V4SI ++ (vec_merge:V2SI ++ (unspec:V2SI ++ [(match_operand:V2DF 1 "nonimmediate_operand" "vm")] ++ UNSPEC_UNSIGNED_FIX_NOTRUNC) ++ (const_vector:V2SI [(const_int 0) (const_int 0)]) ++ (match_operand:QI 2 "register_operand" "Yk")) ++ (const_vector:V2SI [(const_int 0) (const_int 0)])))] ++ "TARGET_AVX512VL" ++ "vcvtpd2udq{x}\t{%1, %0%{%2%}%{z%}|%0%{%2%}%{z%}, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) +@@ -5987,13 +6053,43 @@ + (set_attr "prefix" "evex") + (set_attr "mode" "OI")]) + +-(define_insn "ufix_truncv2dfv2si2" ++(define_insn "ufix_truncv2dfv2si2" + [(set (match_operand:V4SI 0 "register_operand" "=v") + (vec_concat:V4SI + (unsigned_fix:V2SI (match_operand:V2DF 1 "nonimmediate_operand" "vm")) + (const_vector:V2SI [(const_int 0) (const_int 0)])))] + "TARGET_AVX512VL" +- "vcvttpd2udq{x}\t{%1, %0|%0, %1}" ++ "vcvttpd2udq{x}\t{%1, %0|%0, %1}" ++ [(set_attr "type" "ssecvt") ++ (set_attr "prefix" "evex") ++ (set_attr "mode" "TI")]) ++ ++(define_insn "ufix_truncv2dfv2si2_mask" ++ [(set (match_operand:V4SI 0 "register_operand" "=v") ++ (vec_concat:V4SI ++ (vec_merge:V2SI ++ (unsigned_fix:V2SI (match_operand:V2DF 1 "nonimmediate_operand" "vm")) ++ (vec_select:V2SI ++ (match_operand:V4SI 2 "nonimm_or_0_operand" "0C") ++ (parallel [(const_int 0) (const_int 1)])) ++ (match_operand:QI 3 "register_operand" "Yk")) ++ (const_vector:V2SI [(const_int 0) (const_int 0)])))] ++ "TARGET_AVX512VL" ++ "vcvttpd2udq{x}\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}" ++ [(set_attr "type" "ssecvt") ++ (set_attr "prefix" "evex") ++ (set_attr "mode" "TI")]) ++ ++(define_insn "*ufix_truncv2dfv2si2_mask_1" ++ [(set (match_operand:V4SI 0 "register_operand" "=v") ++ (vec_concat:V4SI ++ (vec_merge:V2SI ++ (unsigned_fix:V2SI (match_operand:V2DF 1 "nonimmediate_operand" "vm")) ++ (const_vector:V2SI [(const_int 0) (const_int 0)]) ++ (match_operand:QI 2 "register_operand" "Yk")) ++ (const_vector:V2SI [(const_int 0) (const_int 0)])))] ++ "TARGET_AVX512VL" ++ "vcvttpd2udq{x}\t{%1, %0%{%2%}%{z%}|%0%{%2%}%{z%}, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) +@@ -6138,15 +6234,15 @@ + "TARGET_AVX" + "operands[2] = 
CONST0_RTX (V4SImode);") + +-(define_insn "sse2_cvttpd2dq" ++(define_insn "sse2_cvttpd2dq" + [(set (match_operand:V4SI 0 "register_operand" "=v") + (vec_concat:V4SI + (fix:V2SI (match_operand:V2DF 1 "vector_operand" "vBm")) + (const_vector:V2SI [(const_int 0) (const_int 0)])))] +- "TARGET_SSE2 && " ++ "TARGET_SSE2" + { + if (TARGET_AVX) +- return "vcvttpd2dq{x}\t{%1, %0|%0, %1}"; ++ return "vcvttpd2dq{x}\t{%1, %0|%0, %1}"; + else + return "cvttpd2dq\t{%1, %0|%0, %1}"; + } +@@ -6157,6 +6253,36 @@ + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "TI")]) + ++(define_insn "sse2_cvttpd2dq_mask" ++ [(set (match_operand:V4SI 0 "register_operand" "=v") ++ (vec_concat:V4SI ++ (vec_merge:V2SI ++ (fix:V2SI (match_operand:V2DF 1 "nonimmediate_operand" "vm")) ++ (vec_select:V2SI ++ (match_operand:V4SI 2 "nonimm_or_0_operand" "0C") ++ (parallel [(const_int 0) (const_int 1)])) ++ (match_operand:QI 3 "register_operand" "Yk")) ++ (const_vector:V2SI [(const_int 0) (const_int 0)])))] ++ "TARGET_AVX512VL" ++ "vcvttpd2dq{x}\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}" ++ [(set_attr "type" "ssecvt") ++ (set_attr "prefix" "evex") ++ (set_attr "mode" "TI")]) ++ ++(define_insn "*sse2_cvttpd2dq_mask_1" ++ [(set (match_operand:V4SI 0 "register_operand" "=v") ++ (vec_concat:V4SI ++ (vec_merge:V2SI ++ (fix:V2SI (match_operand:V2DF 1 "nonimmediate_operand" "vm")) ++ (const_vector:V2SI [(const_int 0) (const_int 0)]) ++ (match_operand:QI 2 "register_operand" "Yk")) ++ (const_vector:V2SI [(const_int 0) (const_int 0)])))] ++ "TARGET_AVX512VL" ++ "vcvttpd2dq{x}\t{%1, %0%{%2%}%{z%}|%0%{%2%}%{z%}, %1}" ++ [(set_attr "type" "ssecvt") ++ (set_attr "prefix" "evex") ++ (set_attr "mode" "TI")]) ++ + (define_insn "sse2_cvtsd2ss" + [(set (match_operand:V4SF 0 "register_operand" "=x,x,v") + (vec_merge:V4SF +@@ -6276,26 +6402,28 @@ + + (define_expand "sse2_cvtpd2ps_mask" + [(set (match_operand:V4SF 0 "register_operand") +- (vec_merge:V4SF +- (vec_concat:V4SF ++ (vec_concat:V4SF ++ (vec_merge:V2SF + (float_truncate:V2SF + (match_operand:V2DF 1 "vector_operand")) +- (match_dup 4)) +- (match_operand:V4SF 2 "register_operand") +- (match_operand:QI 3 "register_operand")))] ++ (vec_select:V2SF ++ (match_operand:V4SF 2 "nonimm_or_0_operand") ++ (parallel [(const_int 0) (const_int 1)])) ++ (match_operand:QI 3 "register_operand")) ++ (match_dup 4)))] + "TARGET_SSE2" + "operands[4] = CONST0_RTX (V2SFmode);") + +-(define_insn "*sse2_cvtpd2ps" ++(define_insn "*sse2_cvtpd2ps" + [(set (match_operand:V4SF 0 "register_operand" "=v") + (vec_concat:V4SF + (float_truncate:V2SF + (match_operand:V2DF 1 "vector_operand" "vBm")) +- (match_operand:V2SF 2 "const0_operand")))] +- "TARGET_SSE2 && " ++ (match_operand:V2SF 2 "const0_operand" "C")))] ++ "TARGET_SSE2" + { + if (TARGET_AVX) +- return "vcvtpd2ps{x}\t{%1, %0|%0, %1}"; ++ return "vcvtpd2ps{x}\t{%1, %0|%0, %1}"; + else + return "cvtpd2ps\t{%1, %0|%0, %1}"; + } +@@ -6307,6 +6435,38 @@ + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "V4SF")]) + ++(define_insn "*sse2_cvtpd2ps_mask" ++ [(set (match_operand:V4SF 0 "register_operand" "=v") ++ (vec_concat:V4SF ++ (vec_merge:V2SF ++ (float_truncate:V2SF ++ (match_operand:V2DF 1 "nonimmediate_operand" "vm")) ++ (vec_select:V2SF ++ (match_operand:V4SF 2 "nonimm_or_0_operand" "0C") ++ (parallel [(const_int 0) (const_int 1)])) ++ (match_operand:QI 3 "register_operand" "Yk")) ++ (match_operand:V2SF 4 "const0_operand" "C")))] ++ "TARGET_AVX512VL" ++ "vcvtpd2ps{x}\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}" ++ [(set_attr "type" "ssecvt") ++ (set_attr "prefix" "evex") ++ 
(set_attr "mode" "V4SF")]) ++ ++(define_insn "*sse2_cvtpd2ps_mask_1" ++ [(set (match_operand:V4SF 0 "register_operand" "=v") ++ (vec_concat:V4SF ++ (vec_merge:V2SF ++ (float_truncate:V2SF ++ (match_operand:V2DF 1 "nonimmediate_operand" "vm")) ++ (match_operand:V2SF 3 "const0_operand" "C") ++ (match_operand:QI 2 "register_operand" "Yk")) ++ (match_operand:V2SF 4 "const0_operand" "C")))] ++ "TARGET_AVX512VL" ++ "vcvtpd2ps{x}\t{%1, %0%{%2%}%{z%}|%0%{%2%}%{z%}, %1}" ++ [(set_attr "type" "ssecvt") ++ (set_attr "prefix" "evex") ++ (set_attr "mode" "V4SF")]) ++ + ;; For _cvtps2pd insn pattern + (define_mode_attr sf2dfmode + [(V8DF "V8SF") (V4DF "V4SF")]) diff --git a/fix-cost-of-plus.patch b/fix-cost-of-plus.patch index 7edb1b1..5a0e2f0 100644 --- a/fix-cost-of-plus.patch +++ b/fix-cost-of-plus.patch @@ -1,3 +1,6 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + AArch64-Fix-cost-of-plus-.-const_int-C.patch: commit 835d50c66aa5bde2f354a6e63a2afa7d2f76a05a diff --git a/fix-issue499-add-nop-convert.patch b/fix-issue499-add-nop-convert.patch new file mode 100644 index 0000000..fad9584 --- /dev/null +++ b/fix-issue499-add-nop-convert.patch @@ -0,0 +1,928 @@ +This patch is a combine of following 8 commits + +commit e944354ec05891474b0d204c6c239c04ee7b527b +Author: Robin Dapp +Date: Mon Aug 26 10:18:24 2019 +0000 + + [PATCH 1/2] Allow folding all statements. + +commit df7d46d925c7baca7bf9961aee900876d8aef225 +Author: Robin Dapp +Date: Mon Aug 26 10:24:44 2019 +0000 + + [PATCH 2/2] Add simplify rule for wrapped addition. + +commit 6c14d008122fcee4157be79a60f8d6685869ad19 +Author: Robin Dapp +Date: Tue Aug 27 12:08:58 2019 +0000 + + re PR testsuite/91549 (gcc.dg/wrapped-binop-simplify.c fails starting with r274925) + +commit 129bd066049f065e522990e63bb10ff92b3c018d +Author: Jakub Jelinek +Date: Tue Dec 3 10:20:43 2019 +0100 + + re PR tree-optimization/92734 (Missing match.pd simplification done by fold_binary_loc on generic) + +commit 526b4c716a340ee9464965e63eee2b9954fe21f1 +Author: Jakub Jelinek +Date: Wed Dec 4 10:38:48 2019 +0100 + + re PR tree-optimization/92734 (Missing match.pd simplification done by fold_binary_loc on generic) + +commit 28fabd43d9d249134244eb9d7815917c7ae44b64 +Author: Richard Biener +Date: Fri Dec 6 10:25:08 2019 +0000 + + genmatch.c (enum tree_code): Remove CONVERT{0,1,2} and VIEW_CONVERT{0,1,2}. + +commit e150da383346adc762bc904342f9877f2f071265 +Author: Richard Biener +Date: Fri Dec 6 11:44:27 2019 +0000 + + match.pd (nop_convert): Remove empty match. 
+ +commit 496f4f884716ae061f771a62e44868a32dbd502f +Author: Jakub Jelinek +Date: Mon May 4 11:01:08 2020 +0200 + + match.pd: Decrease number of nop conversions around bitwise ops [PR94718] + +diff -Nurp a/gcc/genmatch.c b/gcc/genmatch.c +--- a/gcc/genmatch.c 2020-03-12 19:07:21.000000000 +0800 ++++ b/gcc/genmatch.c 2020-11-24 14:49:12.792000000 +0800 +@@ -224,12 +224,6 @@ output_line_directive (FILE *f, location + #define DEFTREECODE(SYM, STRING, TYPE, NARGS) SYM, + enum tree_code { + #include "tree.def" +-CONVERT0, +-CONVERT1, +-CONVERT2, +-VIEW_CONVERT0, +-VIEW_CONVERT1, +-VIEW_CONVERT2, + MAX_TREE_CODES + }; + #undef DEFTREECODE +@@ -695,11 +689,12 @@ struct expr : public operand + expr (id_base *operation_, location_t loc, bool is_commutative_ = false) + : operand (OP_EXPR, loc), operation (operation_), + ops (vNULL), expr_type (NULL), is_commutative (is_commutative_), +- is_generic (false), force_single_use (false) {} ++ is_generic (false), force_single_use (false), opt_grp (0) {} + expr (expr *e) + : operand (OP_EXPR, e->location), operation (e->operation), + ops (vNULL), expr_type (e->expr_type), is_commutative (e->is_commutative), +- is_generic (e->is_generic), force_single_use (e->force_single_use) {} ++ is_generic (e->is_generic), force_single_use (e->force_single_use), ++ opt_grp (e->opt_grp) {} + void append_op (operand *op) { ops.safe_push (op); } + /* The operator and its operands. */ + id_base *operation; +@@ -714,6 +709,8 @@ struct expr : public operand + /* Whether pushing any stmt to the sequence should be conditional + on this expression having a single-use. */ + bool force_single_use; ++ /* If non-zero, the group for optional handling. */ ++ unsigned char opt_grp; + virtual void gen_transform (FILE *f, int, const char *, bool, int, + const char *, capture_info *, + dt_operand ** = 0, int = 0); +@@ -1079,18 +1076,17 @@ lower_commutative (simplify *s, vec (o)) + { + if (c->what) + return new capture (c->location, c->where, +- lower_opt_convert (c->what, oper, to_oper, strip), ++ lower_opt (c->what, grp, strip), + c->value_match); + else + return c; +@@ -1100,36 +1096,34 @@ lower_opt_convert (operand *o, enum tree + if (!e) + return o; + +- if (*e->operation == oper) ++ if (e->opt_grp == grp) + { + if (strip) +- return lower_opt_convert (e->ops[0], oper, to_oper, strip); ++ return lower_opt (e->ops[0], grp, strip); + + expr *ne = new expr (e); +- ne->operation = (to_oper == CONVERT_EXPR +- ? get_operator ("CONVERT_EXPR") +- : get_operator ("VIEW_CONVERT_EXPR")); +- ne->append_op (lower_opt_convert (e->ops[0], oper, to_oper, strip)); ++ ne->opt_grp = 0; ++ ne->append_op (lower_opt (e->ops[0], grp, strip)); + return ne; + } + + expr *ne = new expr (e); + for (unsigned i = 0; i < e->ops.length (); ++i) +- ne->append_op (lower_opt_convert (e->ops[i], oper, to_oper, strip)); ++ ne->append_op (lower_opt (e->ops[i], grp, strip)); + + return ne; + } + +-/* Determine whether O or its children uses the conditional conversion +- operator OPER. */ ++/* Determine whether O or its children uses the conditional operation ++ group GRP. 
*/ + + static bool +-has_opt_convert (operand *o, enum tree_code oper) ++has_opt (operand *o, unsigned char grp) + { + if (capture *c = dyn_cast (o)) + { + if (c->what) +- return has_opt_convert (c->what, oper); ++ return has_opt (c->what, grp); + else + return false; + } +@@ -1138,11 +1132,11 @@ has_opt_convert (operand *o, enum tree_c + if (!e) + return false; + +- if (*e->operation == oper) ++ if (e->opt_grp == grp) + return true; + + for (unsigned i = 0; i < e->ops.length (); ++i) +- if (has_opt_convert (e->ops[i], oper)) ++ if (has_opt (e->ops[i], grp)) + return true; + + return false; +@@ -1152,34 +1146,24 @@ has_opt_convert (operand *o, enum tree_c + if required. */ + + static vec +-lower_opt_convert (operand *o) ++lower_opt (operand *o) + { + vec v1 = vNULL, v2; + + v1.safe_push (o); + +- enum tree_code opers[] +- = { CONVERT0, CONVERT_EXPR, +- CONVERT1, CONVERT_EXPR, +- CONVERT2, CONVERT_EXPR, +- VIEW_CONVERT0, VIEW_CONVERT_EXPR, +- VIEW_CONVERT1, VIEW_CONVERT_EXPR, +- VIEW_CONVERT2, VIEW_CONVERT_EXPR }; +- +- /* Conditional converts are lowered to a pattern with the +- conversion and one without. The three different conditional +- convert codes are lowered separately. */ ++ /* Conditional operations are lowered to a pattern with the ++ operation and one without. All different conditional operation ++ groups are lowered separately. */ + +- for (unsigned i = 0; i < sizeof (opers) / sizeof (enum tree_code); i += 2) ++ for (unsigned i = 1; i <= 10; ++i) + { + v2 = vNULL; + for (unsigned j = 0; j < v1.length (); ++j) +- if (has_opt_convert (v1[j], opers[i])) ++ if (has_opt (v1[j], i)) + { +- v2.safe_push (lower_opt_convert (v1[j], +- opers[i], opers[i+1], false)); +- v2.safe_push (lower_opt_convert (v1[j], +- opers[i], opers[i+1], true)); ++ v2.safe_push (lower_opt (v1[j], i, false)); ++ v2.safe_push (lower_opt (v1[j], i, true)); + } + + if (v2 != vNULL) +@@ -1197,9 +1181,9 @@ lower_opt_convert (operand *o) + the resulting multiple patterns to SIMPLIFIERS. */ + + static void +-lower_opt_convert (simplify *s, vec& simplifiers) ++lower_opt (simplify *s, vec& simplifiers) + { +- vec matchers = lower_opt_convert (s->match); ++ vec matchers = lower_opt (s->match); + for (unsigned i = 0; i < matchers.length (); ++i) + { + simplify *ns = new simplify (s->kind, s->id, matchers[i], s->result, +@@ -1543,7 +1527,7 @@ lower (vec& simplifiers, boo + { + auto_vec out_simplifiers; + for (unsigned i = 0; i < simplifiers.length (); ++i) +- lower_opt_convert (simplifiers[i], out_simplifiers); ++ lower_opt (simplifiers[i], out_simplifiers); + + simplifiers.truncate (0); + for (unsigned i = 0; i < out_simplifiers.length (); ++i) +@@ -3927,7 +3911,7 @@ private: + + unsigned get_internal_capture_id (); + +- id_base *parse_operation (); ++ id_base *parse_operation (unsigned char &); + operand *parse_capture (operand *, bool); + operand *parse_expr (); + c_expr *parse_c_expr (cpp_ttype); +@@ -4118,47 +4102,36 @@ parser::record_operlist (location_t loc, + convert2? */ + + id_base * +-parser::parse_operation () ++parser::parse_operation (unsigned char &opt_grp) + { + const cpp_token *id_tok = peek (); ++ char *alt_id = NULL; + const char *id = get_ident (); + const cpp_token *token = peek (); +- if (strcmp (id, "convert0") == 0) +- fatal_at (id_tok, "use 'convert?' here"); +- else if (strcmp (id, "view_convert0") == 0) +- fatal_at (id_tok, "use 'view_convert?' 
here"); ++ opt_grp = 0; + if (token->type == CPP_QUERY + && !(token->flags & PREV_WHITE)) + { +- if (strcmp (id, "convert") == 0) +- id = "convert0"; +- else if (strcmp (id, "convert1") == 0) +- ; +- else if (strcmp (id, "convert2") == 0) +- ; +- else if (strcmp (id, "view_convert") == 0) +- id = "view_convert0"; +- else if (strcmp (id, "view_convert1") == 0) +- ; +- else if (strcmp (id, "view_convert2") == 0) +- ; +- else +- fatal_at (id_tok, "non-convert operator conditionalized"); +- + if (!parsing_match_operand) + fatal_at (id_tok, "conditional convert can only be used in " + "match expression"); ++ if (ISDIGIT (id[strlen (id) - 1])) ++ { ++ opt_grp = id[strlen (id) - 1] - '0' + 1; ++ alt_id = xstrdup (id); ++ alt_id[strlen (id) - 1] = '\0'; ++ if (opt_grp == 1) ++ fatal_at (id_tok, "use '%s?' here", alt_id); ++ } ++ else ++ opt_grp = 1; + eat_token (CPP_QUERY); + } +- else if (strcmp (id, "convert1") == 0 +- || strcmp (id, "convert2") == 0 +- || strcmp (id, "view_convert1") == 0 +- || strcmp (id, "view_convert2") == 0) +- fatal_at (id_tok, "expected '?' after conditional operator"); +- id_base *op = get_operator (id); ++ id_base *op = get_operator (alt_id ? alt_id : id); + if (!op) +- fatal_at (id_tok, "unknown operator %s", id); +- ++ fatal_at (id_tok, "unknown operator %s", alt_id ? alt_id : id); ++ if (alt_id) ++ free (alt_id); + user_id *p = dyn_cast (op); + if (p && p->is_oper_list) + { +@@ -4214,7 +4187,8 @@ struct operand * + parser::parse_expr () + { + const cpp_token *token = peek (); +- expr *e = new expr (parse_operation (), token->src_loc); ++ unsigned char opt_grp; ++ expr *e = new expr (parse_operation (opt_grp), token->src_loc); + token = peek (); + operand *op; + bool is_commutative = false; +@@ -4310,6 +4284,12 @@ parser::parse_expr () + "commutative"); + } + e->expr_type = expr_type; ++ if (opt_grp != 0) ++ { ++ if (e->ops.length () != 1) ++ fatal_at (token, "only unary operations can be conditional"); ++ e->opt_grp = opt_grp; ++ } + return op; + } + else if (!(token->flags & PREV_WHITE)) +@@ -4692,10 +4672,6 @@ parser::parse_for (location_t) + id_base *idb = get_operator (oper, true); + if (idb == NULL) + fatal_at (token, "no such operator '%s'", oper); +- if (*idb == CONVERT0 || *idb == CONVERT1 || *idb == CONVERT2 +- || *idb == VIEW_CONVERT0 || *idb == VIEW_CONVERT1 +- || *idb == VIEW_CONVERT2) +- fatal_at (token, "conditional operators cannot be used inside for"); + + if (arity == -1) + arity = idb->nargs; +@@ -5102,12 +5078,6 @@ main (int argc, char **argv) + add_operator (SYM, # SYM, # TYPE, NARGS); + #define END_OF_BASE_TREE_CODES + #include "tree.def" +-add_operator (CONVERT0, "convert0", "tcc_unary", 1); +-add_operator (CONVERT1, "convert1", "tcc_unary", 1); +-add_operator (CONVERT2, "convert2", "tcc_unary", 1); +-add_operator (VIEW_CONVERT0, "view_convert0", "tcc_unary", 1); +-add_operator (VIEW_CONVERT1, "view_convert1", "tcc_unary", 1); +-add_operator (VIEW_CONVERT2, "view_convert2", "tcc_unary", 1); + #undef END_OF_BASE_TREE_CODES + #undef DEFTREECODE + +diff -Nurp a/gcc/gimple-loop-versioning.cc b/gcc/gimple-loop-versioning.cc +--- a/gcc/gimple-loop-versioning.cc 2020-03-12 19:07:21.000000000 +0800 ++++ b/gcc/gimple-loop-versioning.cc 2020-11-24 14:49:12.792000000 +0800 +@@ -1264,6 +1264,12 @@ loop_versioning::record_address_fragment + continue; + } + } ++ if (CONVERT_EXPR_CODE_P (code)) ++ { ++ tree op1 = gimple_assign_rhs1 (assign); ++ address->terms[i].expr = strip_casts (op1); ++ continue; ++ } + } + i += 1; + } +diff -Nurp a/gcc/match.pd 
b/gcc/match.pd +--- a/gcc/match.pd 2020-11-24 14:54:43.576000000 +0800 ++++ b/gcc/match.pd 2020-11-24 14:49:12.792000000 +0800 +@@ -97,8 +97,8 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) + (define_operator_list COND_TERNARY + IFN_COND_FMA IFN_COND_FMS IFN_COND_FNMA IFN_COND_FNMS) + +-/* As opposed to convert?, this still creates a single pattern, so +- it is not a suitable replacement for convert? in all cases. */ ++/* With nop_convert? combine convert? and view_convert? in one pattern ++ plus conditionalize on tree_nop_conversion_p conversions. */ + (match (nop_convert @0) + (convert @0) + (if (tree_nop_conversion_p (type, TREE_TYPE (@0))))) +@@ -108,9 +108,6 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) + && known_eq (TYPE_VECTOR_SUBPARTS (type), + TYPE_VECTOR_SUBPARTS (TREE_TYPE (@0))) + && tree_nop_conversion_p (TREE_TYPE (type), TREE_TYPE (TREE_TYPE (@0)))))) +-/* This one has to be last, or it shadows the others. */ +-(match (nop_convert @0) +- @0) + + /* Transform likes of (char) ABS_EXPR <(int) x> into (char) ABSU_EXPR + ABSU_EXPR returns unsigned absolute value of the operand and the operand +@@ -1260,7 +1257,7 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) + We combine the above two cases by using a conditional convert. */ + (for bitop (bit_and bit_ior bit_xor) + (simplify +- (bitop (convert @0) (convert? @1)) ++ (bitop (convert@2 @0) (convert?@3 @1)) + (if (((TREE_CODE (@1) == INTEGER_CST + && INTEGRAL_TYPE_P (TREE_TYPE (@0)) + && int_fits_type_p (@1, TREE_TYPE (@0))) +@@ -1279,8 +1276,24 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) + || GET_MODE_CLASS (TYPE_MODE (type)) != MODE_INT + /* Or if the precision of TO is not the same as the precision + of its mode. */ +- || !type_has_mode_precision_p (type))) +- (convert (bitop @0 (convert @1)))))) ++ || !type_has_mode_precision_p (type) ++ /* In GIMPLE, getting rid of 2 conversions for one new results ++ in smaller IL. */ ++ || (GIMPLE ++ && TREE_CODE (@1) != INTEGER_CST ++ && tree_nop_conversion_p (type, TREE_TYPE (@0)) ++ && single_use (@2) ++ && single_use (@3)))) ++ (convert (bitop @0 (convert @1))))) ++ /* In GIMPLE, getting rid of 2 conversions for one new results ++ in smaller IL. */ ++ (simplify ++ (convert (bitop:cs@2 (nop_convert:s @0) @1)) ++ (if (GIMPLE ++ && TREE_CODE (@1) != INTEGER_CST ++ && tree_nop_conversion_p (type, TREE_TYPE (@2)) ++ && types_match (type, @0)) ++ (bitop @0 (convert @1))))) + + (for bitop (bit_and bit_ior) + rbitop (bit_ior bit_and) +@@ -1374,7 +1387,7 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) + + /* Convert - (~A) to A + 1. */ + (simplify +- (negate (nop_convert (bit_not @0))) ++ (negate (nop_convert? (bit_not @0))) + (plus (view_convert @0) { build_each_one_cst (type); })) + + /* Convert ~ (A - 1) or ~ (A + -1) to -A. */ +@@ -1401,7 +1414,7 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) + + /* Otherwise prefer ~(X ^ Y) to ~X ^ Y as more canonical. */ + (simplify +- (bit_xor:c (nop_convert:s (bit_not:s @0)) @1) ++ (bit_xor:c (nop_convert?:s (bit_not:s @0)) @1) + (if (tree_nop_conversion_p (type, TREE_TYPE (@0))) + (bit_not (bit_xor (view_convert @0) @1)))) + +@@ -1614,7 +1627,7 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) + /* For equality, this is also true with wrapping overflow. */ + (for op (eq ne) + (simplify +- (op:c (nop_convert@3 (plus:c@2 @0 (convert1? @1))) (convert2? @1)) ++ (op:c (nop_convert?@3 (plus:c@2 @0 (convert1? @1))) (convert2? 
@1)) + (if (ANY_INTEGRAL_TYPE_P (TREE_TYPE (@0)) + && (TYPE_OVERFLOW_UNDEFINED (TREE_TYPE (@0)) + || TYPE_OVERFLOW_WRAPS (TREE_TYPE (@0))) +@@ -1623,7 +1636,7 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) + && tree_nop_conversion_p (TREE_TYPE (@3), TREE_TYPE (@1))) + (op @0 { build_zero_cst (TREE_TYPE (@0)); }))) + (simplify +- (op:c (nop_convert@3 (pointer_plus@2 (convert1? @0) @1)) (convert2? @0)) ++ (op:c (nop_convert?@3 (pointer_plus@2 (convert1? @0) @1)) (convert2? @0)) + (if (tree_nop_conversion_p (TREE_TYPE (@2), TREE_TYPE (@0)) + && tree_nop_conversion_p (TREE_TYPE (@3), TREE_TYPE (@0)) + && (CONSTANT_CLASS_P (@1) || (single_use (@2) && single_use (@3)))) +@@ -1866,7 +1879,7 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) + || !HONOR_SIGN_DEPENDENT_ROUNDING (type))) + (convert (negate @1)))) + (simplify +- (negate (nop_convert (negate @1))) ++ (negate (nop_convert? (negate @1))) + (if (!TYPE_OVERFLOW_SANITIZED (type) + && !TYPE_OVERFLOW_SANITIZED (TREE_TYPE (@1))) + (view_convert @1))) +@@ -1883,20 +1896,26 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) + /* A - (A +- B) -> -+ B */ + /* A +- (B -+ A) -> +- B */ + (simplify +- (minus (plus:c @0 @1) @0) +- @1) +- (simplify +- (minus (minus @0 @1) @0) +- (negate @1)) ++ (minus (nop_convert1? (plus:c (nop_convert2? @0) @1)) @0) ++ (view_convert @1)) + (simplify +- (plus:c (minus @0 @1) @1) +- @0) ++ (minus (nop_convert1? (minus (nop_convert2? @0) @1)) @0) ++ (if (!ANY_INTEGRAL_TYPE_P (type) ++ || TYPE_OVERFLOW_WRAPS (type)) ++ (negate (view_convert @1)) ++ (view_convert (negate @1)))) ++ (simplify ++ (plus:c (nop_convert1? (minus @0 (nop_convert2? @1))) @1) ++ (view_convert @0)) ++ (simplify ++ (minus @0 (nop_convert1? (plus:c (nop_convert2? @0) @1))) ++ (if (!ANY_INTEGRAL_TYPE_P (type) ++ || TYPE_OVERFLOW_WRAPS (type)) ++ (negate (view_convert @1)) ++ (view_convert (negate @1)))) + (simplify +- (minus @0 (plus:c @0 @1)) +- (negate @1)) +- (simplify +- (minus @0 (minus @0 @1)) +- @1) ++ (minus @0 (nop_convert1? (minus (nop_convert2? @0) @1))) ++ (view_convert @1)) + /* (A +- B) + (C - A) -> C +- B */ + /* (A + B) - (A - C) -> B + C */ + /* More cases are handled with comparisons. */ +@@ -1922,7 +1941,7 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) + (for inner_op (plus minus) + neg_inner_op (minus plus) + (simplify +- (outer_op (nop_convert (inner_op @0 CONSTANT_CLASS_P@1)) ++ (outer_op (nop_convert? (inner_op @0 CONSTANT_CLASS_P@1)) + CONSTANT_CLASS_P@2) + /* If one of the types wraps, use that one. */ + (if (!ANY_INTEGRAL_TYPE_P (type) || TYPE_OVERFLOW_WRAPS (type)) +@@ -1961,17 +1980,70 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) + /* (CST1 - A) +- CST2 -> CST3 - A */ + (for outer_op (plus minus) + (simplify +- (outer_op (minus CONSTANT_CLASS_P@1 @0) CONSTANT_CLASS_P@2) +- (with { tree cst = const_binop (outer_op, type, @1, @2); } +- (if (cst && !TREE_OVERFLOW (cst)) +- (minus { cst; } @0))))) +- +- /* CST1 - (CST2 - A) -> CST3 + A */ +- (simplify +- (minus CONSTANT_CLASS_P@1 (minus CONSTANT_CLASS_P@2 @0)) +- (with { tree cst = const_binop (MINUS_EXPR, type, @1, @2); } +- (if (cst && !TREE_OVERFLOW (cst)) +- (plus { cst; } @0)))) ++ (outer_op (nop_convert? (minus CONSTANT_CLASS_P@1 @0)) CONSTANT_CLASS_P@2) ++ /* If one of the types wraps, use that one. */ ++ (if (!ANY_INTEGRAL_TYPE_P (type) || TYPE_OVERFLOW_WRAPS (type)) ++ /* If all 3 captures are CONSTANT_CLASS_P, punt, as we might recurse ++ forever if something doesn't simplify into a constant. 
*/ ++ (if (!CONSTANT_CLASS_P (@0)) ++ (minus (outer_op (view_convert @1) @2) (view_convert @0))) ++ (if (!ANY_INTEGRAL_TYPE_P (TREE_TYPE (@0)) ++ || TYPE_OVERFLOW_WRAPS (TREE_TYPE (@0))) ++ (view_convert (minus (outer_op @1 (view_convert @2)) @0)) ++ (if (types_match (type, @0)) ++ (with { tree cst = const_binop (outer_op, type, @1, @2); } ++ (if (cst && !TREE_OVERFLOW (cst)) ++ (minus { cst; } @0)))))))) ++ ++ /* CST1 - (CST2 - A) -> CST3 + A ++ Use view_convert because it is safe for vectors and equivalent for ++ scalars. */ ++ (simplify ++ (minus CONSTANT_CLASS_P@1 (nop_convert? (minus CONSTANT_CLASS_P@2 @0))) ++ /* If one of the types wraps, use that one. */ ++ (if (!ANY_INTEGRAL_TYPE_P (type) || TYPE_OVERFLOW_WRAPS (type)) ++ /* If all 3 captures are CONSTANT_CLASS_P, punt, as we might recurse ++ forever if something doesn't simplify into a constant. */ ++ (if (!CONSTANT_CLASS_P (@0)) ++ (plus (view_convert @0) (minus @1 (view_convert @2)))) ++ (if (!ANY_INTEGRAL_TYPE_P (TREE_TYPE (@0)) ++ || TYPE_OVERFLOW_WRAPS (TREE_TYPE (@0))) ++ (view_convert (plus @0 (minus (view_convert @1) @2))) ++ (if (types_match (type, @0)) ++ (with { tree cst = const_binop (MINUS_EXPR, type, @1, @2); } ++ (if (cst && !TREE_OVERFLOW (cst)) ++ (plus { cst; } @0))))))) ++ ++/* ((T)(A)) + CST -> (T)(A + CST) */ ++#if GIMPLE ++ (simplify ++ (plus (convert SSA_NAME@0) INTEGER_CST@1) ++ (if (TREE_CODE (TREE_TYPE (@0)) == INTEGER_TYPE ++ && TREE_CODE (type) == INTEGER_TYPE ++ && TYPE_PRECISION (type) > TYPE_PRECISION (TREE_TYPE (@0)) ++ && int_fits_type_p (@1, TREE_TYPE (@0))) ++ /* Perform binary operation inside the cast if the constant fits ++ and (A + CST)'s range does not overflow. */ ++ (with ++ { ++ wi::overflow_type min_ovf = wi::OVF_OVERFLOW, ++ max_ovf = wi::OVF_OVERFLOW; ++ tree inner_type = TREE_TYPE (@0); ++ ++ wide_int w1 = wide_int::from (wi::to_wide (@1), TYPE_PRECISION (inner_type), ++ TYPE_SIGN (inner_type)); ++ ++ wide_int wmin0, wmax0; ++ if (get_range_info (@0, &wmin0, &wmax0) == VR_RANGE) ++ { ++ wi::add (wmin0, w1, TYPE_SIGN (inner_type), &min_ovf); ++ wi::add (wmax0, w1, TYPE_SIGN (inner_type), &max_ovf); ++ } ++ } ++ (if (min_ovf == wi::OVF_NONE && max_ovf == wi::OVF_NONE) ++ (convert (plus @0 { wide_int_to_tree (TREE_TYPE (@0), w1); } ))) ++ ))) ++#endif + + /* ~A + A -> -1 */ + (simplify +diff -Nurp a/gcc/testsuite/gcc.dg/tree-ssa/copy-headers-5.c b/gcc/testsuite/gcc.dg/tree-ssa/copy-headers-5.c +--- a/gcc/testsuite/gcc.dg/tree-ssa/copy-headers-5.c 2020-03-12 19:07:22.000000000 +0800 ++++ b/gcc/testsuite/gcc.dg/tree-ssa/copy-headers-5.c 2020-11-24 14:49:14.568000000 +0800 +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-O2 -fdump-tree-ch2-details" } */ ++/* { dg-options "-O2 -fno-tree-vrp -fdump-tree-ch2-details" } */ + + int is_sorted(int *a, int n) + { +diff -Nurp a/gcc/testsuite/gcc.dg/tree-ssa/copy-headers-7.c b/gcc/testsuite/gcc.dg/tree-ssa/copy-headers-7.c +--- a/gcc/testsuite/gcc.dg/tree-ssa/copy-headers-7.c 2020-03-12 19:07:22.000000000 +0800 ++++ b/gcc/testsuite/gcc.dg/tree-ssa/copy-headers-7.c 2020-11-24 14:49:14.568000000 +0800 +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-O2 -fdump-tree-ch2-details --param logical-op-non-short-circuit=0" } */ ++/* { dg-options "-O2 -fno-tree-vrp -fdump-tree-ch2-details --param logical-op-non-short-circuit=0" } */ + + int is_sorted(int *a, int n, int m, int k) + { +diff -Nurp a/gcc/testsuite/gcc.dg/tree-ssa/loop-15.c b/gcc/testsuite/gcc.dg/tree-ssa/loop-15.c +--- a/gcc/testsuite/gcc.dg/tree-ssa/loop-15.c 
2020-03-12 19:07:22.000000000 +0800 ++++ b/gcc/testsuite/gcc.dg/tree-ssa/loop-15.c 2020-11-24 14:49:14.568000000 +0800 +@@ -19,7 +19,7 @@ int bla(void) + } + + /* Since the loop is removed, there should be no addition. */ +-/* { dg-final { scan-tree-dump-times " \\+ " 0 "optimized" { xfail *-*-* } } } */ ++/* { dg-final { scan-tree-dump-times " \\+ " 0 "optimized" } } */ + /* { dg-final { scan-tree-dump-times " \\* " 1 "optimized" } } */ + + /* The if from the loop header copying remains in the code. */ +diff -Nurp a/gcc/testsuite/gcc.dg/tree-ssa/pr23744.c b/gcc/testsuite/gcc.dg/tree-ssa/pr23744.c +--- a/gcc/testsuite/gcc.dg/tree-ssa/pr23744.c 2020-03-12 19:07:22.000000000 +0800 ++++ b/gcc/testsuite/gcc.dg/tree-ssa/pr23744.c 2020-11-24 14:49:14.568000000 +0800 +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-O2 -fno-tree-ccp -fdisable-tree-evrp -fdump-tree-vrp1" } */ ++/* { dg-options "-O2 -fno-tree-ccp -fdisable-tree-evrp -fdump-tree-vrp1-details" } */ + + void h (void); + +@@ -17,4 +17,4 @@ int g (int i, int j) + return 1; + } + +-/* { dg-final { scan-tree-dump-times "Folding predicate.*to 1" 1 "vrp1" } } */ ++/* { dg-final { scan-tree-dump-times "gimple_simplified" 1 "vrp1" } } */ +diff -Nurp a/gcc/testsuite/gcc.dg/tree-ssa/pr92734-2.c b/gcc/testsuite/gcc.dg/tree-ssa/pr92734-2.c +--- a/gcc/testsuite/gcc.dg/tree-ssa/pr92734-2.c 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/gcc.dg/tree-ssa/pr92734-2.c 2020-11-24 14:49:14.568000000 +0800 +@@ -0,0 +1,76 @@ ++/* PR tree-optimization/92734 */ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -fdump-tree-optimized" } */ ++/* Verify there are no binary additions or subtractions left. There can ++ be just casts and negations. */ ++/* { dg-final { scan-tree-dump-not " \[+-] " "optimized" } } */ ++ ++int ++f1 (int x, unsigned y) ++{ ++ int a = x + y; ++ return a - x; ++} ++ ++unsigned ++f2 (unsigned x, int y) ++{ ++ unsigned a = (int) x + y; ++ return a - x; ++} ++ ++int ++f3 (int x, unsigned y) ++{ ++ int a = x - y; ++ return a - x; ++} ++ ++unsigned ++f4 (unsigned x, int y) ++{ ++ unsigned a = (int) x - y; ++ return a - x; ++} ++ ++int ++f5 (unsigned x, int y) ++{ ++ int a = x - y; ++ return a + y; ++} ++ ++unsigned ++f6 (int x, unsigned y) ++{ ++ unsigned a = x - (int) y; ++ return a + y; ++} ++ ++int ++f7 (int x, unsigned y) ++{ ++ int a = x + y; ++ return x - a; ++} ++ ++unsigned ++f8 (unsigned x, int y) ++{ ++ unsigned a = (int) x + y; ++ return x - a; ++} ++ ++int ++f9 (int x, unsigned y) ++{ ++ int a = x - y; ++ return x - a; ++} ++ ++unsigned ++f10 (unsigned x, int y) ++{ ++ unsigned a = (int) x - y; ++ return x - a; ++} +diff -Nurp a/gcc/testsuite/gcc.dg/tree-ssa/pr92734.c b/gcc/testsuite/gcc.dg/tree-ssa/pr92734.c +--- a/gcc/testsuite/gcc.dg/tree-ssa/pr92734.c 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/gcc.dg/tree-ssa/pr92734.c 2020-11-24 14:49:14.568000000 +0800 +@@ -0,0 +1,31 @@ ++/* PR tree-optimization/92734 */ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -fdump-tree-forwprop1" } */ ++/* { dg-final { scan-tree-dump-times "return t_\[0-9]*\\\(D\\\);" 4 "forwprop1" } } */ ++ ++int ++f1 (int t) ++{ ++ return 1 - (int) (1U - t); ++} ++ ++int ++f2 (int t) ++{ ++ int a = 7U - t; ++ return 7 - a; ++} ++ ++int ++f3 (int t) ++{ ++ int a = 32U - t; ++ return 32 - a; ++} ++ ++int ++f4 (int t) ++{ ++ int a = 32 - t; ++ return (int) (32 - (unsigned) a); ++} +diff -Nurp a/gcc/testsuite/gcc.dg/tree-ssa/pr94718-3.c b/gcc/testsuite/gcc.dg/tree-ssa/pr94718-3.c +--- a/gcc/testsuite/gcc.dg/tree-ssa/pr94718-3.c 
1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/gcc.dg/tree-ssa/pr94718-3.c 2020-11-24 14:49:14.568000000 +0800 +@@ -0,0 +1,45 @@ ++/* PR tree-optimization/94718 */ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -fno-ipa-icf -fdump-tree-optimized" } */ ++/* { dg-final { scan-tree-dump-times " \\\(int\\\) " 2 "optimized" } } */ ++/* { dg-final { scan-tree-dump-times " \\\(unsigned int\\\) " 2 "optimized" } } */ ++ ++int ++f1 (int x, int y) ++{ ++ return (int) ((unsigned) x | (unsigned) y); ++} ++ ++int ++f2 (int x, int y) ++{ ++ unsigned a = x; ++ unsigned b = y; ++ return a | b; ++} ++ ++int ++f3 (int x, unsigned y) ++{ ++ return (int) ((unsigned) x | y); ++} ++ ++int ++f4 (int x, unsigned y) ++{ ++ unsigned a = x; ++ return a | y; ++} ++ ++unsigned ++f5 (int x, unsigned y) ++{ ++ return (unsigned) (x | (int) y); ++} ++ ++unsigned ++f6 (int x, unsigned y) ++{ ++ int a = y; ++ return x | a; ++} +diff -Nurp a/gcc/testsuite/gcc.dg/wrapped-binop-simplify.c b/gcc/testsuite/gcc.dg/wrapped-binop-simplify.c +--- a/gcc/testsuite/gcc.dg/wrapped-binop-simplify.c 1970-01-01 08:00:00.000000000 +0800 ++++ b/gcc/testsuite/gcc.dg/wrapped-binop-simplify.c 2020-11-24 14:49:14.484000000 +0800 +@@ -0,0 +1,43 @@ ++/* { dg-do compile { target { { i?86-*-* x86_64-*-* s390*-*-* } && lp64 } } } */ ++/* { dg-options "-O2 -fdump-tree-vrp2-details" } */ ++/* { dg-final { scan-tree-dump-times "gimple_simplified to" 4 "vrp2" } } */ ++ ++void v1 (unsigned long *in, unsigned long *out, unsigned int n) ++{ ++ int i; ++ ++ for (i = 0; i < n; i++) ++ { ++ out[i] = in[i]; ++ } ++} ++ ++void v2 (unsigned long *in, unsigned long *out, int n) ++{ ++ int i; ++ ++ for (i = 0; i < n; i++) ++ { ++ out[i] = in[i]; ++ } ++} ++ ++void v3 (unsigned long *in, unsigned long *out, unsigned int n) ++{ ++ unsigned int i; ++ ++ for (i = 0; i < n; i++) ++ { ++ out[i] = in[i]; ++ } ++} ++ ++void v4 (unsigned long *in, unsigned long *out, int n) ++{ ++ unsigned int i; ++ ++ for (i = 0; i < n; i++) ++ { ++ out[i] = in[i]; ++ } ++} +diff -Nurp a/gcc/tree-ssa-propagate.c b/gcc/tree-ssa-propagate.c +--- a/gcc/tree-ssa-propagate.c 2020-11-24 14:54:42.556000000 +0800 ++++ b/gcc/tree-ssa-propagate.c 2020-11-24 14:49:12.792000000 +0800 +@@ -814,7 +814,6 @@ ssa_propagation_engine::ssa_propagate (v + ssa_prop_fini (); + } + +- + /* Return true if STMT is of the form 'mem_ref = RHS', where 'mem_ref' + is a non-volatile pointer dereference, a structure reference or a + reference to a single _DECL. Ignore volatile memory references +@@ -1071,6 +1070,14 @@ substitute_and_fold_dom_walker::before_d + stmt = gsi_stmt (i); + gimple_set_modified (stmt, true); + } ++ /* Also fold if we want to fold all statements. */ ++ else if (substitute_and_fold_engine->fold_all_stmts ++ && fold_stmt (&i, follow_single_use_edges)) ++ { ++ did_replace = true; ++ stmt = gsi_stmt (i); ++ gimple_set_modified (stmt, true); ++ } + + /* Some statements may be simplified using propagator + specific information. 
Do this before propagating +diff -Nurp a/gcc/tree-ssa-propagate.h b/gcc/tree-ssa-propagate.h +--- a/gcc/tree-ssa-propagate.h 2020-03-12 19:07:23.000000000 +0800 ++++ b/gcc/tree-ssa-propagate.h 2020-11-24 14:49:12.792000000 +0800 +@@ -100,6 +100,8 @@ class ssa_propagation_engine + class substitute_and_fold_engine + { + public: ++ substitute_and_fold_engine (bool fold_all_stmts = false) ++ : fold_all_stmts (fold_all_stmts) { } + virtual ~substitute_and_fold_engine (void) { } + virtual bool fold_stmt (gimple_stmt_iterator *) { return false; } + virtual tree get_value (tree) { return NULL_TREE; } +@@ -107,6 +109,10 @@ class substitute_and_fold_engine + bool substitute_and_fold (basic_block = NULL); + bool replace_uses_in (gimple *); + bool replace_phi_args_in (gphi *); ++ ++ /* Users like VRP can set this when they want to perform ++ folding for every propagation. */ ++ bool fold_all_stmts; + }; + + #endif /* _TREE_SSA_PROPAGATE_H */ +diff -Nurp a/gcc/tree-vrp.c b/gcc/tree-vrp.c +--- a/gcc/tree-vrp.c 2020-11-24 14:54:43.564000000 +0800 ++++ b/gcc/tree-vrp.c 2020-11-24 14:49:12.792000000 +0800 +@@ -6384,6 +6384,7 @@ vrp_prop::visit_phi (gphi *phi) + class vrp_folder : public substitute_and_fold_engine + { + public: ++ vrp_folder () : substitute_and_fold_engine (/* Fold all stmts. */ true) { } + tree get_value (tree) FINAL OVERRIDE; + bool fold_stmt (gimple_stmt_iterator *) FINAL OVERRIDE; + bool fold_predicate_in (gimple_stmt_iterator *); diff --git a/fix-issue604-ldist-dependency-fixup.patch b/fix-issue604-ldist-dependency-fixup.patch new file mode 100644 index 0000000..5aaf858 --- /dev/null +++ b/fix-issue604-ldist-dependency-fixup.patch @@ -0,0 +1,108 @@ +commit f6e1a4cd83190746b6544917f7526fa480ca5f18 +Author: Bin Cheng +Date: Wed May 13 11:37:47 2020 +0800 + + Add missing unit dependence vector in data dependence analysis + + Current data dependence analysis misses unit distant vector if DRs in + DDR have the same invariant access functions. This adds the vector as + the constant access function case. + + 2020-05-13 Bin Cheng + PR tree-optimization/94969 + + gcc/ + * tree-data-dependence.c (constant_access_functions): Rename to... + (invariant_access_functions): ...this. Add parameter. Check for + invariant access function, rather than constant. + (build_classic_dist_vector): Call above function. + * tree-loop-distribution.c (pg_add_dependence_edges): Add comment. + + gcc/testsuite/ + * gcc.dg/tree-ssa/pr94969.c: New test. 
+ +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr94969.c b/gcc/testsuite/gcc.dg/tree-ssa/pr94969.c +new file mode 100644 +index 00000000000..056b015f97c +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/tree-ssa/pr94969.c +@@ -0,0 +1,28 @@ ++/* PR tree-optimization/52267 */ ++/* { dg-do run } */ ++/* { dg-options "-O3 -fdump-tree-ldist-details" } */ ++ ++int a = 0, b = 0, c = 0; ++struct S { ++ signed m : 7; ++ signed e : 2; ++}; ++struct S f[2] = {{0, 0}, {0, 0}}; ++struct S g = {0, 0}; ++ ++void __attribute__((noinline)) ++k() ++{ ++ for (; c <= 1; c++) { ++ f[b] = g; ++ f[b].e ^= 1; ++ } ++} ++int main() ++{ ++ k(); ++ if (f[b].e != 1) ++ __builtin_abort (); ++} ++ ++/* { dg-final { scan-tree-dump-not "ldist" "Loop 1 distributed: split to 3 loops"} } */ +diff --git a/gcc/tree-data-ref.c b/gcc/tree-data-ref.c +index 851225e1171..5505ba46778 100644 +--- a/gcc/tree-data-ref.c ++++ b/gcc/tree-data-ref.c +@@ -4821,17 +4821,19 @@ build_classic_dist_vector_1 (struct data_dependence_relation *ddr, + return true; + } + +-/* Return true when the DDR contains only constant access functions. */ ++/* Return true when the DDR contains only invariant access functions wrto. loop ++ number LNUM. */ + + static bool +-constant_access_functions (const struct data_dependence_relation *ddr) ++invariant_access_functions (const struct data_dependence_relation *ddr, ++ int lnum) + { + unsigned i; + subscript *sub; + + FOR_EACH_VEC_ELT (DDR_SUBSCRIPTS (ddr), i, sub) +- if (!evolution_function_is_constant_p (SUB_ACCESS_FN (sub, 0)) +- || !evolution_function_is_constant_p (SUB_ACCESS_FN (sub, 1))) ++ if (!evolution_function_is_invariant_p (SUB_ACCESS_FN (sub, 0), lnum) ++ || !evolution_function_is_invariant_p (SUB_ACCESS_FN (sub, 1), lnum)) + return false; + + return true; +@@ -5030,7 +5032,7 @@ build_classic_dist_vector (struct data_dependence_relation *ddr, + dist_v = lambda_vector_new (DDR_NB_LOOPS (ddr)); + save_dist_v (ddr, dist_v); + +- if (constant_access_functions (ddr)) ++ if (invariant_access_functions (ddr, loop_nest->num)) + add_distance_for_zero_overlaps (ddr); + + if (DDR_NB_LOOPS (ddr) > 1) +diff --git a/gcc/tree-loop-distribution.c b/gcc/tree-loop-distribution.c +index 44423215332..b122c3964a0 100644 +--- a/gcc/tree-loop-distribution.c ++++ b/gcc/tree-loop-distribution.c +@@ -2080,7 +2080,8 @@ loop_distribution::pg_add_dependence_edges (struct graph *rdg, int dir, + this_dir = -this_dir; + + /* Known dependences can still be unordered througout the +- iteration space, see gcc.dg/tree-ssa/ldist-16.c. */ ++ iteration space, see gcc.dg/tree-ssa/ldist-16.c and ++ gcc.dg/tree-ssa/pr94969.c. */ + if (DDR_NUM_DIST_VECTS (ddr) != 1) + this_dir = 2; + /* If the overlap is exact preserve stmt order. */ diff --git a/fix-when-peeling-for-alignment.patch b/fix-when-peeling-for-alignment.patch deleted file mode 100644 index 1d86732..0000000 --- a/fix-when-peeling-for-alignment.patch +++ /dev/null @@ -1,23 +0,0 @@ -diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c -index 36639b697f1..88f14e73d65 100644 ---- a/gcc/tree-vect-data-refs.c -+++ b/gcc/tree-vect-data-refs.c -@@ -938,6 +938,18 @@ vect_compute_data_ref_alignment (dr_vec_info *dr_info) - = exact_div (vect_calculate_target_alignment (dr_info), BITS_PER_UNIT); - DR_TARGET_ALIGNMENT (dr_info) = vector_alignment; - -+ /* If the main loop has peeled for alignment we have no way of knowing -+ whether the data accesses in the epilogues are aligned. We can't at -+ compile time answer the question whether we have entered the main loop or -+ not. 
Fixes PR 92351. */ -+ if (loop_vinfo) -+ { -+ loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo); -+ if (orig_loop_vinfo -+ && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0) -+ return; -+ } -+ - unsigned HOST_WIDE_INT vect_align_c; - if (!vector_alignment.is_constant (&vect_align_c)) - return; diff --git a/gcc.spec b/gcc.spec index 5e4eb3b..07ae2a2 100644 --- a/gcc.spec +++ b/gcc.spec @@ -1,4 +1,4 @@ -%global DATE 20200922 +%global DATE 20201229 %global gcc_version 9.3.1 %global gcc_major 9.3.1 @@ -59,7 +59,7 @@ Summary: Various compilers (C, C++, Objective-C, ...) Name: gcc Version: %{gcc_version} -Release: %{DATE}.12 +Release: %{DATE}.13 License: GPLv3+ and GPLv3+ with exceptions and GPLv2+ with exceptions and LGPLv2+ and BSD URL: https://gcc.gnu.org @@ -114,26 +114,26 @@ Provides: bundled(libiberty) Provides: gcc(major) = %{gcc_major} Patch0: enable-aarch64-libquadmath.patch -Patch1: medium-code-mode.patch -Patch2: generate-csel.patch -Patch3: delete-incorrect-smw.patch -Patch4: remove-array-index-inliner-hint.patch -Patch5: ivopts-1.patch -Patch6: ivopts-2.patch -Patch7: dont-generate-IF_THEN_ELSE.patch -Patch8: fix-cost-of-plus.patch -Patch9: div-opti.patch -Patch10: fix-SYMBOL_TINY_GOT-handling-for-ILP32.patch -Patch11: fix-ICE-during-pass-ccp.patch -Patch12: loop-split.patch -Patch13: loop-finite.patch -Patch14: loop-finite-bugfix.patch -Patch15: fix-regno-out-of-range.patch -Patch16: fix-ICE-in-vectorizable-load.patch -Patch17: address-calculation-optimization-within-loop.patch -Patch18: skip-debug-insns-when-computing-inline-costs.patch -Patch19: ipa-const-prop.patch -Patch20: ipa-const-prop-self-recursion-bugfix.patch +Patch1: generate-csel.patch +Patch2: delete-incorrect-smw.patch +Patch3: remove-array-index-inliner-hint.patch +Patch4: ivopts-1.patch +Patch5: ivopts-2.patch +Patch6: dont-generate-IF_THEN_ELSE.patch +Patch7: fix-cost-of-plus.patch +Patch8: div-opti.patch +Patch9: fix-SYMBOL_TINY_GOT-handling-for-ILP32.patch +Patch10: fix-ICE-during-pass-ccp.patch +Patch11: loop-split.patch +Patch12: loop-finite.patch +Patch13: loop-finite-bugfix.patch +Patch14: fix-regno-out-of-range.patch +Patch15: fix-ICE-in-vectorizable-load.patch +Patch16: address-calculation-optimization-within-loop.patch +Patch17: skip-debug-insns-when-computing-inline-costs.patch +Patch18: ipa-const-prop.patch +Patch19: ipa-const-prop-self-recursion-bugfix.patch +Patch20: ipa-const-prop-null-point-check-bugfix.patch Patch21: change-gcc-BASE-VER.patch Patch22: add-option-fallow-store-data-races.patch Patch23: tighten-range-for-generating-csel.patch @@ -177,16 +177,49 @@ Patch60: fix-load-eliding-in-SM.patch Patch61: fix-SSA-update-for-vectorizer-epilogue.patch Patch62: fix-ICE-when-vectorizing-nested-cycles.patch Patch63: fix-avoid-bogus-uninit-warning-with-store-motion.patch -Patch64: ipa-const-prop-null-point-check-bugfix.patch -Patch65: avoid-cycling-on-vertain-subreg-reloads.patch -Patch66: fix-ICE-in-verify_target_availability.patch -Patch67: fix-ICE-vect_slp_analyze_node_operations.patch -Patch68: fix-ICE-in-extract_constrain_insn.patch -Patch69: fix-ICE-during-GIMPLE-pass-dse.patch -Patch70: ipa-const-prop-buffer-overflow-bugfix.patch -Patch71: fix-ICE-in-eliminate_stmt.patch -Patch72: fix-make-ifcvt-clean-up-dead-comparisons.patch -Patch73: fix-when-peeling-for-alignment.patch +Patch64: avoid-cycling-on-vertain-subreg-reloads.patch +Patch65: fix-ICE-in-verify_target_availability.patch +Patch66: fix-ICE-vect_slp_analyze_node_operations.patch +Patch67: 
fix-ICE-in-extract_constrain_insn.patch +Patch68: fix-ICE-during-GIMPLE-pass-dse.patch +Patch69: ipa-const-prop-buffer-overflow-bugfix.patch +Patch70: fix-ICE-in-eliminate_stmt.patch +Patch71: fix-make-ifcvt-clean-up-dead-comparisons.patch +Patch72: fix-an-ICE-in-vect_recog_mask_conversion_pattern.patch +Patch73: fix-ICE-in-vect_update_misalignment_for_peel.patch +Patch74: redundant-loop-elimination.patch +Patch75: bf16-and-matrix-characteristic.patch +Patch76: medium-code-mode.patch +Patch77: tree-optimization-96920-another-ICE-when-vectorizing.patch +Patch78: reduction-paths-with-unhandled-live-stmt.patch +Patch79: aarch64-Fix-ash-lr-lshr-mode-3-expanders.patch +Patch80: tree-optimization-97812-fix-range-query-in-VRP-asser.patch +Patch81: aarch64-Fix-bf16-and-matrix-g++-gfortran.patch +Patch82: IRA-Handle-fully-tied-destinations.patch +Patch83: fix-ICE-in-pass-vect.patch +Patch84: SLP-VECT-Add-check-to-fix-96837.patch +Patch85: adjust-vector-cost-and-move-EXTRACT_LAST_REDUCTION-costing.patch +Patch86: fix-issue499-add-nop-convert.patch +Patch87: aarch64-fix-sve-acle-error.patch +Patch88: fix-ICE-IPA-compare-VRP-types.patch +Patch89: vectorizable-comparison-Swap-operands-only-once.patch +Patch90: sccvn-Improve-handling-of-load-masked-with-integer.patch +Patch91: speed-up-DDG-analysis-and-fix-bootstrap-compare-debug.patch +Patch92: x86-Fix-bf16-and-matrix.patch +Patch93: Fix-up-push_partial_def-little-endian-bitfield.patch +Patch94: modulo-sched-Carefully-process-loop-counter-initiali.patch +Patch95: fix-ICE-in-affine-combination.patch +Patch96: aarch64-Fix-mismatched-SVE-predicate-modes.patch +Patch97: Fix-EXTRACT_LAST_REDUCTION-segfault.patch +Patch98: fix-PR-92351-When-peeling-for-alignment.patch +Patch99: fix-addlosymdi-ICE-in-pass-reload.patch +Patch100: store-merging-Consider-also-overlapping-stores-earlier.patch +Patch101: AArch64-Fix-constraints-for-CPY-M.patch +Patch102: Fix-zero-masking-for-vcvtps2ph.patch +Patch103: re-PR-target-91124-gcc.target-i386-avx512vl-vpshldvd.patch +Patch104: fix-avx512vl-vcvttpd2dq-2-fail.patch +Patch105: fix-issue604-ldist-dependency-fixup.patch +Patch106: Apply-maximum-nunits-for-BB-SLP.patch %global gcc_target_platform %{_arch}-linux-gnu @@ -703,6 +736,39 @@ not stable, so plugins must be rebuilt any time GCC is updated. 
%patch71 -p1 %patch72 -p1 %patch73 -p1 +%patch74 -p1 +%patch75 -p1 +%patch76 -p1 +%patch77 -p1 +%patch78 -p1 +%patch79 -p1 +%patch80 -p1 +%patch81 -p1 +%patch82 -p1 +%patch83 -p1 +%patch84 -p1 +%patch85 -p1 +%patch86 -p1 +%patch87 -p1 +%patch88 -p1 +%patch89 -p1 +%patch90 -p1 +%patch91 -p1 +%patch92 -p1 +%patch93 -p1 +%patch94 -p1 +%patch95 -p1 +%patch96 -p1 +%patch97 -p1 +%patch98 -p1 +%patch99 -p1 +%patch100 -p1 +%patch101 -p1 +%patch102 -p1 +%patch103 -p1 +%patch104 -p1 +%patch105 -p1 +%patch106 -p1 %build @@ -2631,6 +2697,57 @@ end %doc rpm.doc/changelogs/libcc1/ChangeLog* %changelog +* Tue Dec 29 2020 eastb233 - 9.3.1-20201229.13 +- avoid-cycling-on-vertain-subreg-reloads.patch: Add patch source comment +- change-gcc-BASE-VER.patch: Likewise +- dont-generate-IF_THEN_ELSE.patch: Likewise +- fix-ICE-in-compute_live_loop_exits.patch: Likewise +- fix-ICE-in-eliminate_stmt.patch: Likewise +- fix-ICE-in-vect_create_epilog_for_reduction.patch: Likewise +- fix-ICE-in-vect_stmt_to_vectorize.patch: Likewise +- fix-ICE-in-verify_ssa.patch: Likewise +- fix-ICE-when-vectorizing-nested-cycles.patch: Likewise +- fix-cost-of-plus.patch: Likewise +- ipa-const-prop-self-recursion-bugfix.patch: Likewise +- simplify-removing-subregs.patch: Likewise +- medium-code-mode.patch: Bugfix +- fix-when-peeling-for-alignment.patch: Move to ... +- fix-PR-92351-When-peeling-for-alignment.patch: ... this +- AArch64-Fix-constraints-for-CPY-M.patch: New file +- Apply-maximum-nunits-for-BB-SLP.patch: New file +- Fix-EXTRACT_LAST_REDUCTION-segfault.patch: New file +- Fix-up-push_partial_def-little-endian-bitfield.patch: New file +- Fix-zero-masking-for-vcvtps2ph.patch: New file +- IRA-Handle-fully-tied-destinations.patch: New file +- SLP-VECT-Add-check-to-fix-96837.patch: New file +- aarch64-Fix-ash-lr-lshr-mode-3-expanders.patch: New file +- aarch64-Fix-bf16-and-matrix-g++-gfortran.patch: New file +- aarch64-Fix-mismatched-SVE-predicate-modes.patch: New file +- aarch64-fix-sve-acle-error.patch: New file +- adjust-vector-cost-and-move-EXTRACT_LAST_REDUCTION-costing.patch: New file +- bf16-and-matrix-characteristic.patch: New file +- fix-ICE-IPA-compare-VRP-types.patch: New file +- fix-ICE-in-affine-combination.patch: New file +- fix-ICE-in-pass-vect.patch: New file +- fix-ICE-in-vect_update_misalignment_for_peel.patch: New file +- fix-addlosymdi-ICE-in-pass-reload.patch: New file +- fix-an-ICE-in-vect_recog_mask_conversion_pattern.patch: New file +- fix-avx512vl-vcvttpd2dq-2-fail.patch: New file +- fix-issue499-add-nop-convert.patch: New file +- fix-issue604-ldist-dependency-fixup.patch: New file +- modulo-sched-Carefully-process-loop-counter-initiali.patch: New file +- re-PR-target-91124-gcc.target-i386-avx512vl-vpshldvd.patch: New file +- reduction-paths-with-unhandled-live-stmt.patch: New file +- redundant-loop-elimination.patch: New file +- sccvn-Improve-handling-of-load-masked-with-integer.patch: New file +- speed-up-DDG-analysis-and-fix-bootstrap-compare-debug.patch: New file +- store-merging-Consider-also-overlapping-stores-earlier.patch: New file +- tree-optimization-96920-another-ICE-when-vectorizing.patch: New file +- tree-optimization-97812-fix-range-query-in-VRP-asser.patch: New file +- vectorizable-comparison-Swap-operands-only-once.patch: New file +- x86-Fix-bf16-and-matrix.patch: New file +- gcc.spec: Add uploaded patch + * Tue Sep 22 2020 eastb233 - 9.3.1-20200922.12 - fix-when-peeling-for-alignment.patch: New file diff --git a/ipa-const-prop-self-recursion-bugfix.patch 
b/ipa-const-prop-self-recursion-bugfix.patch index 9e878a3..e407ff9 100644 --- a/ipa-const-prop-self-recursion-bugfix.patch +++ b/ipa-const-prop-self-recursion-bugfix.patch @@ -1,14 +1,11 @@ -This patch is backport from gcc-trunk. It is a combined patch from +This backport contains 2 patchs from gcc main stream tree. +The commit id of these patchs list as following in the order of time. -Find matched aggregate lattice for self-recursive CP (PR ipa/93084) -https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=42d73fa9d575e3c8c21e88bd7f65922e17b052f1 +0001-Find-matched-aggregate-lattice-for-self-recursive-CP.patch +709d7838e753bbb6f16e2ed88a118ed81c367040 -and - -Do not propagate self-dependent value (PR ipa/93763) -https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=47772af10c00f7e1e95cd52557fc893dc602a420 - -adapted the using of parameter to gcc9 style. +0002-Do-not-propagate-self-dependent-value-PR-ipa-93763.patch +47772af10c00f7e1e95cd52557fc893dc602a420 diff -Nurp a/gcc/ipa-cp.c b/gcc/ipa-cp.c --- a/gcc/ipa-cp.c 2020-05-23 16:16:58.032000000 +0800 diff --git a/medium-code-mode.patch b/medium-code-mode.patch index 9133683..cf629d2 100644 --- a/medium-code-mode.patch +++ b/medium-code-mode.patch @@ -194,8 +194,8 @@ diff -Nurp a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c case AARCH64_CMODEL_SMALL: + AARCH64_SMALL_ROUTINE: /* Same reasoning as the tiny code model, but the offset cap here is - 4G. */ - if ((SYMBOL_REF_WEAK (x) + 1MB, allowing +/-3.9GB for the offset to the symbol. */ + @@ -13121,7 +13225,48 @@ aarch64_classify_symbol (rtx x, HOST_WID ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G); return SYMBOL_SMALL_ABSOLUTE; @@ -300,7 +300,7 @@ diff -Nurp a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md + UNSPEC_LOAD_SYMBOL_MEDIUM UNSPEC_LD1_SVE UNSPEC_ST1_SVE - UNSPEC_LD1RQ + UNSPEC_LDNT1_SVE @@ -6548,6 +6553,39 @@ [(set_attr "type" "load_4")] ) diff --git a/modulo-sched-Carefully-process-loop-counter-initiali.patch b/modulo-sched-Carefully-process-loop-counter-initiali.patch new file mode 100644 index 0000000..536d149 --- /dev/null +++ b/modulo-sched-Carefully-process-loop-counter-initiali.patch @@ -0,0 +1,251 @@ +This backport contains 1 patchs from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + +0001-modulo-sched-Carefully-process-loop-counter-initiali.patch +4eb8f93d026eaa1de9b4820337069f3ce3465cd0 + +diff --git a/gcc/modulo-sched.c b/gcc/modulo-sched.c +index 6f699a874e3..4568674aa6c 100644 +--- a/gcc/modulo-sched.c ++++ b/gcc/modulo-sched.c +@@ -210,8 +210,6 @@ static int sms_order_nodes (ddg_ptr, int, int *, int *); + static void set_node_sched_params (ddg_ptr); + static partial_schedule_ptr sms_schedule_by_order (ddg_ptr, int, int, int *); + static void permute_partial_schedule (partial_schedule_ptr, rtx_insn *); +-static void generate_prolog_epilog (partial_schedule_ptr, struct loop *, +- rtx, rtx); + static int calculate_stage_count (partial_schedule_ptr, int); + static void calculate_must_precede_follow (ddg_node_ptr, int, int, + int, int, sbitmap, sbitmap, sbitmap); +@@ -391,30 +389,40 @@ doloop_register_get (rtx_insn *head, rtx_insn *tail) + this constant. Otherwise return 0. */ + static rtx_insn * + const_iteration_count (rtx count_reg, basic_block pre_header, +- int64_t * count) ++ int64_t *count, bool* adjust_inplace) + { + rtx_insn *insn; + rtx_insn *head, *tail; + ++ *adjust_inplace = false; ++ bool read_after = false; ++ + if (! 
pre_header) + return NULL; + + get_ebb_head_tail (pre_header, pre_header, &head, &tail); + + for (insn = tail; insn != PREV_INSN (head); insn = PREV_INSN (insn)) +- if (NONDEBUG_INSN_P (insn) && single_set (insn) && +- rtx_equal_p (count_reg, SET_DEST (single_set (insn)))) ++ if (single_set (insn) && rtx_equal_p (count_reg, ++ SET_DEST (single_set (insn)))) + { + rtx pat = single_set (insn); + + if (CONST_INT_P (SET_SRC (pat))) + { + *count = INTVAL (SET_SRC (pat)); ++ *adjust_inplace = !read_after; + return insn; + } + + return NULL; + } ++ else if (NONDEBUG_INSN_P (insn) && reg_mentioned_p (count_reg, insn)) ++ { ++ read_after = true; ++ if (reg_set_p (count_reg, insn)) ++ break; ++ } + + return NULL; + } +@@ -1126,7 +1134,7 @@ duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage, + /* Generate the instructions (including reg_moves) for prolog & epilog. */ + static void + generate_prolog_epilog (partial_schedule_ptr ps, struct loop *loop, +- rtx count_reg, rtx count_init) ++ rtx count_reg, bool adjust_init) + { + int i; + int last_stage = PS_STAGE_COUNT (ps) - 1; +@@ -1135,12 +1143,12 @@ generate_prolog_epilog (partial_schedule_ptr ps, class loop *loop, + /* Generate the prolog, inserting its insns on the loop-entry edge. */ + start_sequence (); + +- if (!count_init) ++ if (adjust_init) + { + /* Generate instructions at the beginning of the prolog to +- adjust the loop count by STAGE_COUNT. If loop count is constant +- (count_init), this constant is adjusted by STAGE_COUNT in +- generate_prolog_epilog function. */ ++ adjust the loop count by STAGE_COUNT. If loop count is constant ++ and it not used anywhere in prologue, this constant is adjusted by ++ STAGE_COUNT outside of generate_prolog_epilog function. */ + rtx sub_reg = NULL_RTX; + + sub_reg = expand_simple_binop (GET_MODE (count_reg), MINUS, count_reg, +@@ -1528,7 +1536,8 @@ sms_schedule (void) + rtx_insn *count_init; + int mii, rec_mii, stage_count, min_cycle; + int64_t loop_count = 0; +- bool opt_sc_p; ++ bool opt_sc_p, adjust_inplace = false; ++ basic_block pre_header; + + if (! (g = g_arr[loop->num])) + continue; +@@ -1569,19 +1578,13 @@ sms_schedule (void) + } + + +- /* In case of th loop have doloop register it gets special +- handling. */ +- count_init = NULL; +- if ((count_reg = doloop_register_get (head, tail))) +- { +- basic_block pre_header; +- +- pre_header = loop_preheader_edge (loop)->src; +- count_init = const_iteration_count (count_reg, pre_header, +- &loop_count); +- } ++ count_reg = doloop_register_get (head, tail); + gcc_assert (count_reg); + ++ pre_header = loop_preheader_edge (loop)->src; ++ count_init = const_iteration_count (count_reg, pre_header, &loop_count, ++ &adjust_inplace); ++ + if (dump_file && count_init) + { + fprintf (dump_file, "SMS const-doloop "); +@@ -1701,9 +1704,20 @@ sms_schedule (void) + print_partial_schedule (ps, dump_file); + } + +- /* case the BCT count is not known , Do loop-versioning */ +- if (count_reg && ! count_init) ++ if (count_init) ++ { ++ if (adjust_inplace) ++ { ++ /* When possible, set new iteration count of loop kernel in ++ place. Otherwise, generate_prolog_epilog creates an insn ++ to adjust. 
*/ ++ SET_SRC (single_set (count_init)) = GEN_INT (loop_count ++ - stage_count + 1); ++ } ++ } ++ else + { ++ /* case the BCT count is not known , Do loop-versioning */ + rtx comp_rtx = gen_rtx_GT (VOIDmode, count_reg, + gen_int_mode (stage_count, + GET_MODE (count_reg))); +@@ -1713,12 +1727,7 @@ sms_schedule (void) + loop_version (loop, comp_rtx, &condition_bb, + prob, prob.invert (), + prob, prob.invert (), true); +- } +- +- /* Set new iteration count of loop kernel. */ +- if (count_reg && count_init) +- SET_SRC (single_set (count_init)) = GEN_INT (loop_count +- - stage_count + 1); ++ } + + /* Now apply the scheduled kernel to the RTL of the loop. */ + permute_partial_schedule (ps, g->closing_branch->first_note); +@@ -1735,7 +1744,7 @@ sms_schedule (void) + if (dump_file) + print_node_sched_params (dump_file, g->num_nodes, ps); + /* Generate prolog and epilog. */ +- generate_prolog_epilog (ps, loop, count_reg, count_init); ++ generate_prolog_epilog (ps, loop, count_reg, !adjust_inplace); + break; + } + +diff --git a/gcc/testsuite/gcc.c-torture/execute/pr97421-1.c b/gcc/testsuite/gcc.c-torture/execute/pr97421-1.c +new file mode 100644 +index 00000000000..e32fb129f18 +--- /dev/null ++++ b/gcc/testsuite/gcc.c-torture/execute/pr97421-1.c +@@ -0,0 +1,23 @@ ++/* PR rtl-optimization/97421 */ ++/* { dg-additional-options "-fmodulo-sched" } */ ++ ++int a, b, d, e; ++int *volatile c = &a; ++ ++__attribute__((noinline)) ++void f(void) ++{ ++ for (int g = 2; g >= 0; g--) { ++ d = 0; ++ for (b = 0; b <= 2; b++) ++ ; ++ e = *c; ++ } ++} ++ ++int main(void) ++{ ++ f(); ++ if (b != 3) ++ __builtin_abort(); ++} +diff --git a/gcc/testsuite/gcc.c-torture/execute/pr97421-2.c b/gcc/testsuite/gcc.c-torture/execute/pr97421-2.c +new file mode 100644 +index 00000000000..142bcbcee91 +--- /dev/null ++++ b/gcc/testsuite/gcc.c-torture/execute/pr97421-2.c +@@ -0,0 +1,18 @@ ++/* PR rtl-optimization/97421 */ ++/* { dg-additional-options "-fmodulo-sched -fno-dce -fno-strict-aliasing" } */ ++ ++static int a, b, c; ++int *d = &c; ++int **e = &d; ++int ***f = &e; ++int main() ++{ ++ int h; ++ for (a = 2; a; a--) ++ for (h = 0; h <= 2; h++) ++ for (b = 0; b <= 2; b++) ++ ***f = 6; ++ ++ if (b != 3) ++ __builtin_abort(); ++} +diff --git a/gcc/testsuite/gcc.c-torture/execute/pr97421-3.c b/gcc/testsuite/gcc.c-torture/execute/pr97421-3.c +new file mode 100644 +index 00000000000..3f1485a4a3d +--- /dev/null ++++ b/gcc/testsuite/gcc.c-torture/execute/pr97421-3.c +@@ -0,0 +1,22 @@ ++/* PR rtl-optimization/97421 */ ++/* { dg-additional-options "-fmodulo-sched" } */ ++ ++int a, b, c; ++short d; ++void e(void) { ++ unsigned f = 0; ++ for (; f <= 2; f++) { ++ int g[1]; ++ int h = (long)g; ++ c = 0; ++ for (; c < 10; c++) ++ g[0] = a = 0; ++ for (; a <= 2; a++) ++ b = d; ++ } ++} ++int main(void) { ++ e(); ++ if (a != 3) ++ __builtin_abort(); ++} diff --git a/re-PR-target-91124-gcc.target-i386-avx512vl-vpshldvd.patch b/re-PR-target-91124-gcc.target-i386-avx512vl-vpshldvd.patch new file mode 100644 index 0000000..d95d3b2 --- /dev/null +++ b/re-PR-target-91124-gcc.target-i386-avx512vl-vpshldvd.patch @@ -0,0 +1,215 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. 
+ +491b0b4015a70071a05e0faa5c2082c43a51a0d3 +0001-re-PR-target-91124-gcc.target-i386-avx512vl-vpshldvd.patch + +diff -urpN a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def +--- a/gcc/config/i386/i386-builtin.def 2020-03-12 07:07:21.000000000 -0400 ++++ b/gcc/config/i386/i386-builtin.def 2020-12-17 20:46:53.868000000 -0500 +@@ -2516,60 +2516,60 @@ BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPT + BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshld_v2di_mask, "__builtin_ia32_vpshld_v2di_mask", IX86_BUILTIN_VPSHLDV2DI_MASK, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT) + + BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_vpshrdv_v32hi, "__builtin_ia32_vpshrdv_v32hi", IX86_BUILTIN_VPSHRDVV32HI, UNKNOWN, (int) V32HI_FTYPE_V32HI_V32HI_V32HI) +-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_vpshrdv_v32hi_mask, "__builtin_ia32_vpshrdv_v32hi_mask", IX86_BUILTIN_VPSHRDVV32HI_MASK, UNKNOWN, (int) V32HI_FTYPE_V32HI_V32HI_V32HI_INT) +-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_vpshrdv_v32hi_maskz, "__builtin_ia32_vpshrdv_v32hi_maskz", IX86_BUILTIN_VPSHRDVV32HI_MASKZ, UNKNOWN, (int) V32HI_FTYPE_V32HI_V32HI_V32HI_INT) ++BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_vpshrdv_v32hi_mask, "__builtin_ia32_vpshrdv_v32hi_mask", IX86_BUILTIN_VPSHRDVV32HI_MASK, UNKNOWN, (int) V32HI_FTYPE_V32HI_V32HI_V32HI_USI) ++BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_vpshrdv_v32hi_maskz, "__builtin_ia32_vpshrdv_v32hi_maskz", IX86_BUILTIN_VPSHRDVV32HI_MASKZ, UNKNOWN, (int) V32HI_FTYPE_V32HI_V32HI_V32HI_USI) + BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshrdv_v16hi, "__builtin_ia32_vpshrdv_v16hi", IX86_BUILTIN_VPSHRDVV16HI, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_V16HI) +-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshrdv_v16hi_mask, "__builtin_ia32_vpshrdv_v16hi_mask", IX86_BUILTIN_VPSHRDVV16HI_MASK, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_V16HI_INT) +-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshrdv_v16hi_maskz, "__builtin_ia32_vpshrdv_v16hi_maskz", IX86_BUILTIN_VPSHRDVV16HI_MASKZ, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_V16HI_INT) ++BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshrdv_v16hi_mask, "__builtin_ia32_vpshrdv_v16hi_mask", IX86_BUILTIN_VPSHRDVV16HI_MASK, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_V16HI_UHI) ++BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshrdv_v16hi_maskz, "__builtin_ia32_vpshrdv_v16hi_maskz", IX86_BUILTIN_VPSHRDVV16HI_MASKZ, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_V16HI_UHI) + BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshrdv_v8hi, "__builtin_ia32_vpshrdv_v8hi", IX86_BUILTIN_VPSHRDVV8HI, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_V8HI) +-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshrdv_v8hi_mask, "__builtin_ia32_vpshrdv_v8hi_mask", IX86_BUILTIN_VPSHRDVV8HI_MASK, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_V8HI_INT) +-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshrdv_v8hi_maskz, "__builtin_ia32_vpshrdv_v8hi_maskz", IX86_BUILTIN_VPSHRDVV8HI_MASKZ, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_V8HI_INT) ++BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshrdv_v8hi_mask, "__builtin_ia32_vpshrdv_v8hi_mask", IX86_BUILTIN_VPSHRDVV8HI_MASK, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_V8HI_UQI) 
++BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshrdv_v8hi_maskz, "__builtin_ia32_vpshrdv_v8hi_maskz", IX86_BUILTIN_VPSHRDVV8HI_MASKZ, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_V8HI_UQI) + BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_vpshrdv_v16si, "__builtin_ia32_vpshrdv_v16si", IX86_BUILTIN_VPSHRDVV16SI, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI) +-BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_vpshrdv_v16si_mask, "__builtin_ia32_vpshrdv_v16si_mask", IX86_BUILTIN_VPSHRDVV16SI_MASK, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_INT) +-BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_vpshrdv_v16si_maskz, "__builtin_ia32_vpshrdv_v16si_maskz", IX86_BUILTIN_VPSHRDVV16SI_MASKZ, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_INT) ++BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_vpshrdv_v16si_mask, "__builtin_ia32_vpshrdv_v16si_mask", IX86_BUILTIN_VPSHRDVV16SI_MASK, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_UHI) ++BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_vpshrdv_v16si_maskz, "__builtin_ia32_vpshrdv_v16si_maskz", IX86_BUILTIN_VPSHRDVV16SI_MASKZ, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_UHI) + BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshrdv_v8si, "__builtin_ia32_vpshrdv_v8si", IX86_BUILTIN_VPSHRDVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI) +-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshrdv_v8si_mask, "__builtin_ia32_vpshrdv_v8si_mask", IX86_BUILTIN_VPSHRDVV8SI_MASK, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI_INT) +-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshrdv_v8si_maskz, "__builtin_ia32_vpshrdv_v8si_maskz", IX86_BUILTIN_VPSHRDVV8SI_MASKZ, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI_INT) ++BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshrdv_v8si_mask, "__builtin_ia32_vpshrdv_v8si_mask", IX86_BUILTIN_VPSHRDVV8SI_MASK, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI_UQI) ++BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshrdv_v8si_maskz, "__builtin_ia32_vpshrdv_v8si_maskz", IX86_BUILTIN_VPSHRDVV8SI_MASKZ, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI_UQI) + BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshrdv_v4si, "__builtin_ia32_vpshrdv_v4si", IX86_BUILTIN_VPSHRDVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI) +-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshrdv_v4si_mask, "__builtin_ia32_vpshrdv_v4si_mask", IX86_BUILTIN_VPSHRDVV4SI_MASK, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI_INT) +-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshrdv_v4si_maskz, "__builtin_ia32_vpshrdv_v4si_maskz", IX86_BUILTIN_VPSHRDVV4SI_MASKZ, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI_INT) ++BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshrdv_v4si_mask, "__builtin_ia32_vpshrdv_v4si_mask", IX86_BUILTIN_VPSHRDVV4SI_MASK, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI_UQI) ++BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshrdv_v4si_maskz, "__builtin_ia32_vpshrdv_v4si_maskz", IX86_BUILTIN_VPSHRDVV4SI_MASKZ, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI_UQI) + BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_vpshrdv_v8di, "__builtin_ia32_vpshrdv_v8di", IX86_BUILTIN_VPSHRDVV8DI, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI) +-BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_vpshrdv_v8di_mask, "__builtin_ia32_vpshrdv_v8di_mask", IX86_BUILTIN_VPSHRDVV8DI_MASK, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_INT) 
+-BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_vpshrdv_v8di_maskz, "__builtin_ia32_vpshrdv_v8di_maskz", IX86_BUILTIN_VPSHRDVV8DI_MASKZ, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_INT) ++BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_vpshrdv_v8di_mask, "__builtin_ia32_vpshrdv_v8di_mask", IX86_BUILTIN_VPSHRDVV8DI_MASK, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_UQI) ++BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_vpshrdv_v8di_maskz, "__builtin_ia32_vpshrdv_v8di_maskz", IX86_BUILTIN_VPSHRDVV8DI_MASKZ, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_UQI) + BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshrdv_v4di, "__builtin_ia32_vpshrdv_v4di", IX86_BUILTIN_VPSHRDVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_V4DI) +-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshrdv_v4di_mask, "__builtin_ia32_vpshrdv_v4di_mask", IX86_BUILTIN_VPSHRDVV4DI_MASK, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_V4DI_INT) +-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshrdv_v4di_maskz, "__builtin_ia32_vpshrdv_v4di_maskz", IX86_BUILTIN_VPSHRDVV4DI_MASKZ, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_V4DI_INT) ++BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshrdv_v4di_mask, "__builtin_ia32_vpshrdv_v4di_mask", IX86_BUILTIN_VPSHRDVV4DI_MASK, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_V4DI_UQI) ++BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshrdv_v4di_maskz, "__builtin_ia32_vpshrdv_v4di_maskz", IX86_BUILTIN_VPSHRDVV4DI_MASKZ, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_V4DI_UQI) + BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshrdv_v2di, "__builtin_ia32_vpshrdv_v2di", IX86_BUILTIN_VPSHRDVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_V2DI) +-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshrdv_v2di_mask, "__builtin_ia32_vpshrdv_v2di_mask", IX86_BUILTIN_VPSHRDVV2DI_MASK, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_V2DI_INT) +-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshrdv_v2di_maskz, "__builtin_ia32_vpshrdv_v2di_maskz", IX86_BUILTIN_VPSHRDVV2DI_MASKZ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_V2DI_INT) ++BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshrdv_v2di_mask, "__builtin_ia32_vpshrdv_v2di_mask", IX86_BUILTIN_VPSHRDVV2DI_MASK, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_V2DI_UQI) ++BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshrdv_v2di_maskz, "__builtin_ia32_vpshrdv_v2di_maskz", IX86_BUILTIN_VPSHRDVV2DI_MASKZ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_V2DI_UQI) + + BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_vpshldv_v32hi, "__builtin_ia32_vpshldv_v32hi", IX86_BUILTIN_VPSHLDVV32HI, UNKNOWN, (int) V32HI_FTYPE_V32HI_V32HI_V32HI) +-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_vpshldv_v32hi_mask, "__builtin_ia32_vpshldv_v32hi_mask", IX86_BUILTIN_VPSHLDVV32HI_MASK, UNKNOWN, (int) V32HI_FTYPE_V32HI_V32HI_V32HI_INT) +-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_vpshldv_v32hi_maskz, "__builtin_ia32_vpshldv_v32hi_maskz", IX86_BUILTIN_VPSHLDVV32HI_MASKZ, UNKNOWN, (int) V32HI_FTYPE_V32HI_V32HI_V32HI_INT) ++BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_vpshldv_v32hi_mask, "__builtin_ia32_vpshldv_v32hi_mask", IX86_BUILTIN_VPSHLDVV32HI_MASK, UNKNOWN, (int) V32HI_FTYPE_V32HI_V32HI_V32HI_USI) ++BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_vpshldv_v32hi_maskz, 
"__builtin_ia32_vpshldv_v32hi_maskz", IX86_BUILTIN_VPSHLDVV32HI_MASKZ, UNKNOWN, (int) V32HI_FTYPE_V32HI_V32HI_V32HI_USI) + BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshldv_v16hi, "__builtin_ia32_vpshldv_v16hi", IX86_BUILTIN_VPSHLDVV16HI, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_V16HI) +-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshldv_v16hi_mask, "__builtin_ia32_vpshldv_v16hi_mask", IX86_BUILTIN_VPSHLDVV16HI_MASK, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_V16HI_INT) +-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshldv_v16hi_maskz, "__builtin_ia32_vpshldv_v16hi_maskz", IX86_BUILTIN_VPSHLDVV16HI_MASKZ, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_V16HI_INT) ++BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshldv_v16hi_mask, "__builtin_ia32_vpshldv_v16hi_mask", IX86_BUILTIN_VPSHLDVV16HI_MASK, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_V16HI_UHI) ++BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshldv_v16hi_maskz, "__builtin_ia32_vpshldv_v16hi_maskz", IX86_BUILTIN_VPSHLDVV16HI_MASKZ, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_V16HI_UHI) + BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshldv_v8hi, "__builtin_ia32_vpshldv_v8hi", IX86_BUILTIN_VPSHLDVV8HI, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_V8HI) +-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshldv_v8hi_mask, "__builtin_ia32_vpshldv_v8hi_mask", IX86_BUILTIN_VPSHLDVV8HI_MASK, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_V8HI_INT) +-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshldv_v8hi_maskz, "__builtin_ia32_vpshldv_v8hi_maskz", IX86_BUILTIN_VPSHLDVV8HI_MASKZ, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_V8HI_INT) ++BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshldv_v8hi_mask, "__builtin_ia32_vpshldv_v8hi_mask", IX86_BUILTIN_VPSHLDVV8HI_MASK, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_V8HI_UQI) ++BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshldv_v8hi_maskz, "__builtin_ia32_vpshldv_v8hi_maskz", IX86_BUILTIN_VPSHLDVV8HI_MASKZ, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_V8HI_UQI) + BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_vpshldv_v16si, "__builtin_ia32_vpshldv_v16si", IX86_BUILTIN_VPSHLDVV16SI, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI) +-BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_vpshldv_v16si_mask, "__builtin_ia32_vpshldv_v16si_mask", IX86_BUILTIN_VPSHLDVV16SI_MASK, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_INT) +-BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_vpshldv_v16si_maskz, "__builtin_ia32_vpshldv_v16si_maskz", IX86_BUILTIN_VPSHLDVV16SI_MASKZ, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_INT) ++BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_vpshldv_v16si_mask, "__builtin_ia32_vpshldv_v16si_mask", IX86_BUILTIN_VPSHLDVV16SI_MASK, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_UHI) ++BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_vpshldv_v16si_maskz, "__builtin_ia32_vpshldv_v16si_maskz", IX86_BUILTIN_VPSHLDVV16SI_MASKZ, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_UHI) + BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshldv_v8si, "__builtin_ia32_vpshldv_v8si", IX86_BUILTIN_VPSHLDVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI) +-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshldv_v8si_mask, "__builtin_ia32_vpshldv_v8si_mask", IX86_BUILTIN_VPSHLDVV8SI_MASK, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI_INT) 
+-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshldv_v8si_maskz, "__builtin_ia32_vpshldv_v8si_maskz", IX86_BUILTIN_VPSHLDVV8SI_MASKZ, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI_INT) ++BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshldv_v8si_mask, "__builtin_ia32_vpshldv_v8si_mask", IX86_BUILTIN_VPSHLDVV8SI_MASK, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI_UQI) ++BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshldv_v8si_maskz, "__builtin_ia32_vpshldv_v8si_maskz", IX86_BUILTIN_VPSHLDVV8SI_MASKZ, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI_UQI) + BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshldv_v4si, "__builtin_ia32_vpshldv_v4si", IX86_BUILTIN_VPSHLDVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI) +-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshldv_v4si_mask, "__builtin_ia32_vpshldv_v4si_mask", IX86_BUILTIN_VPSHLDVV4SI_MASK, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI_INT) +-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshldv_v4si_maskz, "__builtin_ia32_vpshldv_v4si_maskz", IX86_BUILTIN_VPSHLDVV4SI_MASKZ, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI_INT) ++BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshldv_v4si_mask, "__builtin_ia32_vpshldv_v4si_mask", IX86_BUILTIN_VPSHLDVV4SI_MASK, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI_UQI) ++BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshldv_v4si_maskz, "__builtin_ia32_vpshldv_v4si_maskz", IX86_BUILTIN_VPSHLDVV4SI_MASKZ, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI_UQI) + BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_vpshldv_v8di, "__builtin_ia32_vpshldv_v8di", IX86_BUILTIN_VPSHLDVV8DI, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI) +-BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_vpshldv_v8di_mask, "__builtin_ia32_vpshldv_v8di_mask", IX86_BUILTIN_VPSHLDVV8DI_MASK, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_INT) +-BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_vpshldv_v8di_maskz, "__builtin_ia32_vpshldv_v8di_maskz", IX86_BUILTIN_VPSHLDVV8DI_MASKZ, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_INT) ++BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_vpshldv_v8di_mask, "__builtin_ia32_vpshldv_v8di_mask", IX86_BUILTIN_VPSHLDVV8DI_MASK, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_UQI) ++BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_vpshldv_v8di_maskz, "__builtin_ia32_vpshldv_v8di_maskz", IX86_BUILTIN_VPSHLDVV8DI_MASKZ, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_UQI) + BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshldv_v4di, "__builtin_ia32_vpshldv_v4di", IX86_BUILTIN_VPSHLDVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_V4DI) +-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshldv_v4di_mask, "__builtin_ia32_vpshldv_v4di_mask", IX86_BUILTIN_VPSHLDVV4DI_MASK, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_V4DI_INT) +-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshldv_v4di_maskz, "__builtin_ia32_vpshldv_v4di_maskz", IX86_BUILTIN_VPSHLDVV4DI_MASKZ, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_V4DI_INT) ++BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshldv_v4di_mask, "__builtin_ia32_vpshldv_v4di_mask", IX86_BUILTIN_VPSHLDVV4DI_MASK, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_V4DI_UQI) ++BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshldv_v4di_maskz, "__builtin_ia32_vpshldv_v4di_maskz", IX86_BUILTIN_VPSHLDVV4DI_MASKZ, 
UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_V4DI_UQI) + BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshldv_v2di, "__builtin_ia32_vpshldv_v2di", IX86_BUILTIN_VPSHLDVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_V2DI) +-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshldv_v2di_mask, "__builtin_ia32_vpshldv_v2di_mask", IX86_BUILTIN_VPSHLDVV2DI_MASK, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_V2DI_INT) +-BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshldv_v2di_maskz, "__builtin_ia32_vpshldv_v2di_maskz", IX86_BUILTIN_VPSHLDVV2DI_MASKZ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_V2DI_INT) ++BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshldv_v2di_mask, "__builtin_ia32_vpshldv_v2di_mask", IX86_BUILTIN_VPSHLDVV2DI_MASK, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_V2DI_UQI) ++BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpshldv_v2di_maskz, "__builtin_ia32_vpshldv_v2di_maskz", IX86_BUILTIN_VPSHLDVV2DI_MASKZ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_V2DI_UQI) + + /* GFNI */ + BDESC (OPTION_MASK_ISA_GFNI | OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_vgf2p8affineinvqb_v64qi, "__builtin_ia32_vgf2p8affineinvqb_v64qi", IX86_BUILTIN_VGF2P8AFFINEINVQB512, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI_INT) +@@ -2594,44 +2594,44 @@ BDESC (OPTION_MASK_ISA_GFNI | OPTION_MAS + /* VNNI */ + + BDESC (OPTION_MASK_ISA_AVX512VNNI, 0, CODE_FOR_vpdpbusd_v16si, "__builtin_ia32_vpdpbusd_v16si", IX86_BUILTIN_VPDPBUSDV16SI, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI) +-BDESC (OPTION_MASK_ISA_AVX512VNNI, 0, CODE_FOR_vpdpbusd_v16si_mask, "__builtin_ia32_vpdpbusd_v16si_mask", IX86_BUILTIN_VPDPBUSDV16SI_MASK, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_INT) +-BDESC (OPTION_MASK_ISA_AVX512VNNI, 0, CODE_FOR_vpdpbusd_v16si_maskz, "__builtin_ia32_vpdpbusd_v16si_maskz", IX86_BUILTIN_VPDPBUSDV16SI_MASKZ, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_INT) ++BDESC (OPTION_MASK_ISA_AVX512VNNI, 0, CODE_FOR_vpdpbusd_v16si_mask, "__builtin_ia32_vpdpbusd_v16si_mask", IX86_BUILTIN_VPDPBUSDV16SI_MASK, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_UHI) ++BDESC (OPTION_MASK_ISA_AVX512VNNI, 0, CODE_FOR_vpdpbusd_v16si_maskz, "__builtin_ia32_vpdpbusd_v16si_maskz", IX86_BUILTIN_VPDPBUSDV16SI_MASKZ, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_UHI) + BDESC (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpdpbusd_v8si, "__builtin_ia32_vpdpbusd_v8si", IX86_BUILTIN_VPDPBUSDV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI) +-BDESC (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpdpbusd_v8si_mask, "__builtin_ia32_vpdpbusd_v8si_mask", IX86_BUILTIN_VPDPBUSDV8SI_MASK, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI_INT) +-BDESC (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpdpbusd_v8si_maskz, "__builtin_ia32_vpdpbusd_v8si_maskz", IX86_BUILTIN_VPDPBUSDV8SI_MASKZ, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI_INT) ++BDESC (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpdpbusd_v8si_mask, "__builtin_ia32_vpdpbusd_v8si_mask", IX86_BUILTIN_VPDPBUSDV8SI_MASK, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI_UQI) ++BDESC (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpdpbusd_v8si_maskz, "__builtin_ia32_vpdpbusd_v8si_maskz", IX86_BUILTIN_VPDPBUSDV8SI_MASKZ, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI_UQI) + BDESC (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpdpbusd_v4si, "__builtin_ia32_vpdpbusd_v4si", IX86_BUILTIN_VPDPBUSDV4SI, UNKNOWN, (int) 
V4SI_FTYPE_V4SI_V4SI_V4SI) +-BDESC (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpdpbusd_v4si_mask, "__builtin_ia32_vpdpbusd_v4si_mask", IX86_BUILTIN_VPDPBUSDV4SI_MASK, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI_INT) +-BDESC (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpdpbusd_v4si_maskz, "__builtin_ia32_vpdpbusd_v4si_maskz", IX86_BUILTIN_VPDPBUSDV4SI_MASKZ, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI_INT) ++BDESC (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpdpbusd_v4si_mask, "__builtin_ia32_vpdpbusd_v4si_mask", IX86_BUILTIN_VPDPBUSDV4SI_MASK, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI_UQI) ++BDESC (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpdpbusd_v4si_maskz, "__builtin_ia32_vpdpbusd_v4si_maskz", IX86_BUILTIN_VPDPBUSDV4SI_MASKZ, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI_UQI) + + BDESC (OPTION_MASK_ISA_AVX512VNNI, 0, CODE_FOR_vpdpbusds_v16si, "__builtin_ia32_vpdpbusds_v16si", IX86_BUILTIN_VPDPBUSDSV16SI, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI) +-BDESC (OPTION_MASK_ISA_AVX512VNNI, 0, CODE_FOR_vpdpbusds_v16si_mask, "__builtin_ia32_vpdpbusds_v16si_mask", IX86_BUILTIN_VPDPBUSDSV16SI_MASK, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_INT) +-BDESC (OPTION_MASK_ISA_AVX512VNNI, 0, CODE_FOR_vpdpbusds_v16si_maskz, "__builtin_ia32_vpdpbusds_v16si_maskz", IX86_BUILTIN_VPDPBUSDSV16SI_MASKZ, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_INT) ++BDESC (OPTION_MASK_ISA_AVX512VNNI, 0, CODE_FOR_vpdpbusds_v16si_mask, "__builtin_ia32_vpdpbusds_v16si_mask", IX86_BUILTIN_VPDPBUSDSV16SI_MASK, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_UHI) ++BDESC (OPTION_MASK_ISA_AVX512VNNI, 0, CODE_FOR_vpdpbusds_v16si_maskz, "__builtin_ia32_vpdpbusds_v16si_maskz", IX86_BUILTIN_VPDPBUSDSV16SI_MASKZ, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_UHI) + BDESC (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpdpbusds_v8si, "__builtin_ia32_vpdpbusds_v8si", IX86_BUILTIN_VPDPBUSDSV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI) +-BDESC (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpdpbusds_v8si_mask, "__builtin_ia32_vpdpbusds_v8si_mask", IX86_BUILTIN_VPDPBUSDSV8SI_MASK, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI_INT) +-BDESC (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpdpbusds_v8si_maskz, "__builtin_ia32_vpdpbusds_v8si_maskz", IX86_BUILTIN_VPDPBUSDSV8SI_MASKZ, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI_INT) ++BDESC (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpdpbusds_v8si_mask, "__builtin_ia32_vpdpbusds_v8si_mask", IX86_BUILTIN_VPDPBUSDSV8SI_MASK, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI_UQI) ++BDESC (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpdpbusds_v8si_maskz, "__builtin_ia32_vpdpbusds_v8si_maskz", IX86_BUILTIN_VPDPBUSDSV8SI_MASKZ, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI_UQI) + BDESC (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpdpbusds_v4si, "__builtin_ia32_vpdpbusds_v4si", IX86_BUILTIN_VPDPBUSDSV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI) +-BDESC (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpdpbusds_v4si_mask, "__builtin_ia32_vpdpbusds_v4si_mask", IX86_BUILTIN_VPDPBUSDSV4SI_MASK, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI_INT) +-BDESC (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpdpbusds_v4si_maskz, "__builtin_ia32_vpdpbusds_v4si_maskz", IX86_BUILTIN_VPDPBUSDSV4SI_MASKZ, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI_INT) ++BDESC (OPTION_MASK_ISA_AVX512VNNI 
| OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpdpbusds_v4si_mask, "__builtin_ia32_vpdpbusds_v4si_mask", IX86_BUILTIN_VPDPBUSDSV4SI_MASK, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI_UQI) ++BDESC (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpdpbusds_v4si_maskz, "__builtin_ia32_vpdpbusds_v4si_maskz", IX86_BUILTIN_VPDPBUSDSV4SI_MASKZ, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI_UQI) + + BDESC (OPTION_MASK_ISA_AVX512VNNI, 0, CODE_FOR_vpdpwssd_v16si, "__builtin_ia32_vpdpwssd_v16si", IX86_BUILTIN_VPDPWSSDV16SI, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI) +-BDESC (OPTION_MASK_ISA_AVX512VNNI, 0, CODE_FOR_vpdpwssd_v16si_mask, "__builtin_ia32_vpdpwssd_v16si_mask", IX86_BUILTIN_VPDPWSSDV16SI_MASK, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_INT) +-BDESC (OPTION_MASK_ISA_AVX512VNNI, 0, CODE_FOR_vpdpwssd_v16si_maskz, "__builtin_ia32_vpdpwssd_v16si_maskz", IX86_BUILTIN_VPDPWSSDV16SI_MASKZ, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_INT) ++BDESC (OPTION_MASK_ISA_AVX512VNNI, 0, CODE_FOR_vpdpwssd_v16si_mask, "__builtin_ia32_vpdpwssd_v16si_mask", IX86_BUILTIN_VPDPWSSDV16SI_MASK, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_UHI) ++BDESC (OPTION_MASK_ISA_AVX512VNNI, 0, CODE_FOR_vpdpwssd_v16si_maskz, "__builtin_ia32_vpdpwssd_v16si_maskz", IX86_BUILTIN_VPDPWSSDV16SI_MASKZ, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_UHI) + BDESC (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpdpwssd_v8si, "__builtin_ia32_vpdpwssd_v8si", IX86_BUILTIN_VPDPWSSDV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI) +-BDESC (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpdpwssd_v8si_mask, "__builtin_ia32_vpdpwssd_v8si_mask", IX86_BUILTIN_VPDPWSSDV8SI_MASK, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI_INT) +-BDESC (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpdpwssd_v8si_maskz, "__builtin_ia32_vpdpwssd_v8si_maskz", IX86_BUILTIN_VPDPWSSDV8SI_MASKZ, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI_INT) ++BDESC (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpdpwssd_v8si_mask, "__builtin_ia32_vpdpwssd_v8si_mask", IX86_BUILTIN_VPDPWSSDV8SI_MASK, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI_UQI) ++BDESC (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpdpwssd_v8si_maskz, "__builtin_ia32_vpdpwssd_v8si_maskz", IX86_BUILTIN_VPDPWSSDV8SI_MASKZ, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI_UQI) + BDESC (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpdpwssd_v4si, "__builtin_ia32_vpdpwssd_v4si", IX86_BUILTIN_VPDPWSSDV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI) +-BDESC (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpdpwssd_v4si_mask, "__builtin_ia32_vpdpwssd_v4si_mask", IX86_BUILTIN_VPDPWSSDV4SI_MASK, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI_INT) +-BDESC (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpdpwssd_v4si_maskz, "__builtin_ia32_vpdpwssd_v4si_maskz", IX86_BUILTIN_VPDPWSSDV4SI_MASKZ, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI_INT) ++BDESC (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpdpwssd_v4si_mask, "__builtin_ia32_vpdpwssd_v4si_mask", IX86_BUILTIN_VPDPWSSDV4SI_MASK, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI_UQI) ++BDESC (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpdpwssd_v4si_maskz, "__builtin_ia32_vpdpwssd_v4si_maskz", IX86_BUILTIN_VPDPWSSDV4SI_MASKZ, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI_UQI) + + BDESC (OPTION_MASK_ISA_AVX512VNNI, 0, CODE_FOR_vpdpwssds_v16si, "__builtin_ia32_vpdpwssds_v16si", IX86_BUILTIN_VPDPWSSDSV16SI, 
UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI) +-BDESC (OPTION_MASK_ISA_AVX512VNNI, 0, CODE_FOR_vpdpwssds_v16si_mask, "__builtin_ia32_vpdpwssds_v16si_mask", IX86_BUILTIN_VPDPWSSDSV16SI_MASK, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_INT) +-BDESC (OPTION_MASK_ISA_AVX512VNNI, 0, CODE_FOR_vpdpwssds_v16si_maskz, "__builtin_ia32_vpdpwssds_v16si_maskz", IX86_BUILTIN_VPDPWSSDSV16SI_MASKZ, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_INT) ++BDESC (OPTION_MASK_ISA_AVX512VNNI, 0, CODE_FOR_vpdpwssds_v16si_mask, "__builtin_ia32_vpdpwssds_v16si_mask", IX86_BUILTIN_VPDPWSSDSV16SI_MASK, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_UHI) ++BDESC (OPTION_MASK_ISA_AVX512VNNI, 0, CODE_FOR_vpdpwssds_v16si_maskz, "__builtin_ia32_vpdpwssds_v16si_maskz", IX86_BUILTIN_VPDPWSSDSV16SI_MASKZ, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_UHI) + BDESC (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpdpwssds_v8si, "__builtin_ia32_vpdpwssds_v8si", IX86_BUILTIN_VPDPWSSDSV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI) +-BDESC (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpdpwssds_v8si_mask, "__builtin_ia32_vpdpwssds_v8si_mask", IX86_BUILTIN_VPDPWSSDSV8SI_MASK, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI_INT) +-BDESC (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpdpwssds_v8si_maskz, "__builtin_ia32_vpdpwssds_v8si_maskz", IX86_BUILTIN_VPDPWSSDSV8SI_MASKZ, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI_INT) ++BDESC (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpdpwssds_v8si_mask, "__builtin_ia32_vpdpwssds_v8si_mask", IX86_BUILTIN_VPDPWSSDSV8SI_MASK, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI_UQI) ++BDESC (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpdpwssds_v8si_maskz, "__builtin_ia32_vpdpwssds_v8si_maskz", IX86_BUILTIN_VPDPWSSDSV8SI_MASKZ, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_V8SI_UQI) + BDESC (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpdpwssds_v4si, "__builtin_ia32_vpdpwssds_v4si", IX86_BUILTIN_VPDPWSSDSV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI) +-BDESC (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpdpwssds_v4si_mask, "__builtin_ia32_vpdpwssds_v4si_mask", IX86_BUILTIN_VPDPWSSDSV4SI_MASK, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI_INT) +-BDESC (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpdpwssds_v4si_maskz, "__builtin_ia32_vpdpwssds_v4si_maskz", IX86_BUILTIN_VPDPWSSDSV4SI_MASKZ, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI_INT) ++BDESC (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpdpwssds_v4si_mask, "__builtin_ia32_vpdpwssds_v4si_mask", IX86_BUILTIN_VPDPWSSDSV4SI_MASK, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI_UQI) ++BDESC (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpdpwssds_v4si_maskz, "__builtin_ia32_vpdpwssds_v4si_maskz", IX86_BUILTIN_VPDPWSSDSV4SI_MASKZ, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI_UQI) + + /* VPCLMULQDQ */ + BDESC (OPTION_MASK_ISA_VPCLMULQDQ | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_vpclmulqdq_v2di, "__builtin_ia32_vpclmulqdq_v2di", IX86_BUILTIN_VPCLMULQDQ2, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT) +diff -urpN a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def +--- a/gcc/config/i386/i386-builtin-types.def 2020-03-12 07:07:21.000000000 -0400 ++++ b/gcc/config/i386/i386-builtin-types.def 2020-12-17 20:46:53.868000000 -0500 +@@ -1246,17 +1246,8 @@ DEF_FUNCTION_TYPE (V8HI, V8HI, V8HI, INT + DEF_FUNCTION_TYPE (V4SI, V4SI, V4SI, INT, V4SI, INT) + DEF_FUNCTION_TYPE 
(V2DI, V2DI, V2DI, INT, V2DI, INT) + DEF_FUNCTION_TYPE (V32HI, V32HI, V32HI, V32HI) +-DEF_FUNCTION_TYPE (V32HI, V32HI, V32HI, V32HI, INT) +-DEF_FUNCTION_TYPE (V16HI, V16HI, V16HI, V16HI, INT) +-DEF_FUNCTION_TYPE (V8HI, V8HI, V8HI, V8HI, INT) +-DEF_FUNCTION_TYPE (V8SI, V8SI, V8SI, V8SI, INT) +-DEF_FUNCTION_TYPE (V4SI, V4SI, V4SI, V4SI, INT) + DEF_FUNCTION_TYPE (V8DI, V8DI, V8DI, V8DI) +-DEF_FUNCTION_TYPE (V8DI, V8DI, V8DI, V8DI, INT) +-DEF_FUNCTION_TYPE (V4DI, V4DI, V4DI, V4DI, INT) + DEF_FUNCTION_TYPE (V16SI, V16SI, V16SI, V16SI) +-DEF_FUNCTION_TYPE (V16SI, V16SI, V16SI, V16SI, INT) +-DEF_FUNCTION_TYPE (V2DI, V2DI, V2DI, V2DI, INT) + + # BITALG builtins + DEF_FUNCTION_TYPE (V4DI, V4DI) +diff -urpN a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c +--- a/gcc/config/i386/i386-expand.c 2020-12-17 20:44:55.508000000 -0500 ++++ b/gcc/config/i386/i386-expand.c 2020-12-17 20:46:53.872000000 -0500 +@@ -9437,15 +9437,6 @@ ix86_expand_args_builtin (const struct b + case USI_FTYPE_V32HI_V32HI_INT_USI: + case UHI_FTYPE_V16HI_V16HI_INT_UHI: + case UQI_FTYPE_V8HI_V8HI_INT_UQI: +- case V32HI_FTYPE_V32HI_V32HI_V32HI_INT: +- case V16HI_FTYPE_V16HI_V16HI_V16HI_INT: +- case V8HI_FTYPE_V8HI_V8HI_V8HI_INT: +- case V8SI_FTYPE_V8SI_V8SI_V8SI_INT: +- case V4DI_FTYPE_V4DI_V4DI_V4DI_INT: +- case V8DI_FTYPE_V8DI_V8DI_V8DI_INT: +- case V16SI_FTYPE_V16SI_V16SI_V16SI_INT: +- case V2DI_FTYPE_V2DI_V2DI_V2DI_INT: +- case V4SI_FTYPE_V4SI_V4SI_V4SI_INT: + nargs = 4; + mask_pos = 1; + nargs_constant = 1; diff --git a/reduction-paths-with-unhandled-live-stmt.patch b/reduction-paths-with-unhandled-live-stmt.patch new file mode 100644 index 0000000..22dc08d --- /dev/null +++ b/reduction-paths-with-unhandled-live-stmt.patch @@ -0,0 +1,64 @@ +This backport contains 1 patch from gcc main stream tree. +The commit id of these patchs list as following in the order of time. + +2686de5617bfb572343933be2883e8274c9735b5 +0001-tree-optimization-97760-reduction-paths-with-unhandl.patch + +diff --git a/gcc/testsuite/gcc.dg/vect/pr97760.c b/gcc/testsuite/gcc.dg/vect/pr97760.c +new file mode 100644 +index 00000000000..da5ac937a43 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/pr97760.c +@@ -0,0 +1,26 @@ ++#include "tree-vect.h" ++ ++int b=1; ++static int *g = &b; ++ ++void __attribute__((noipa)) ++h (unsigned int n) ++{ ++ int i = 3; ++ int f = 3; ++ for (; f <= 50; f += 4) { ++ i += 4; ++ *g = i; ++ i += n; ++ } ++} ++ ++int main () ++{ ++ check_vect (); ++ ++ h (9); ++ if (*g != 150 || b != 150) ++ __builtin_abort (); ++ return 0; ++} +diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c +index 977633a3ce3..39b7319e825 100644 +--- a/gcc/tree-vect-loop.c ++++ b/gcc/tree-vect-loop.c +@@ -3326,14 +3326,17 @@ pop: + fail = true; + break; + } +- /* Check there's only a single stmt the op is used on inside +- of the loop. */ ++ /* Check there's only a single stmt the op is used on. For the ++ not value-changing tail and the last stmt allow out-of-loop uses. ++ ??? We could relax this and handle arbitrary live stmts by ++ forcing a scalar epilogue for example. */ + imm_use_iterator imm_iter; + gimple *op_use_stmt; + unsigned cnt = 0; + FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op) + if (!is_gimple_debug (op_use_stmt) +- && flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))) ++ && (*code != ERROR_MARK ++ || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt)))) + { + /* We want to allow x + x but not x < 1 ? x : 2. 
*/ + if (is_gimple_assign (op_use_stmt) + diff --git a/redundant-loop-elimination.patch b/redundant-loop-elimination.patch new file mode 100644 index 0000000..fb33bec --- /dev/null +++ b/redundant-loop-elimination.patch @@ -0,0 +1,486 @@ +diff -Nurp a/gcc/common.opt b/gcc/common.opt +--- a/gcc/common.opt 2020-11-23 03:24:54.760000000 -0500 ++++ b/gcc/common.opt 2020-11-23 03:23:59.716000000 -0500 +@@ -1150,6 +1150,10 @@ fcompare-elim + Common Report Var(flag_compare_elim_after_reload) Optimization + Perform comparison elimination after register allocation has finished. + ++floop-elim ++Common Report Var(flag_loop_elim) Init(0) Optimization ++Perform redundant loop elimination. ++ + fconserve-stack + Common Var(flag_conserve_stack) Optimization + Do not perform optimizations increasing noticeably stack usage. +diff -Nurp a/gcc/tree-ssa-phiopt.c b/gcc/tree-ssa-phiopt.c +--- a/gcc/tree-ssa-phiopt.c 2020-11-23 03:24:54.760000000 -0500 ++++ b/gcc/tree-ssa-phiopt.c 2020-11-23 03:27:42.824000000 -0500 +@@ -71,6 +71,7 @@ static hash_set * get_non_trapping + static void replace_phi_edge_with_variable (basic_block, edge, gimple *, tree); + static void hoist_adjacent_loads (basic_block, basic_block, + basic_block, basic_block); ++static bool do_phiopt_pattern (basic_block, basic_block, basic_block); + static bool gate_hoist_loads (void); + + /* This pass tries to transform conditional stores into unconditional +@@ -259,6 +260,10 @@ tree_ssa_phiopt_worker (bool do_store_el + hoist_adjacent_loads (bb, bb1, bb2, bb3); + continue; + } ++ else if (flag_loop_elim && do_phiopt_pattern (bb, bb1, bb2)) ++ { ++ continue; ++ } + else + continue; + +@@ -2899,6 +2904,449 @@ hoist_adjacent_loads (basic_block bb0, b + } + } + ++static bool check_uses (tree, hash_set *); ++ ++/* Check SSA_NAME is used in ++ if (SSA_NAME == 0) ++ ... ++ or ++ if (SSA_NAME != 0) ++ ... ++*/ ++static bool ++check_uses_cond (tree ssa_name, gimple *stmt, ++ hash_set *hset ATTRIBUTE_UNUSED) ++{ ++ tree_code code = gimple_cond_code (stmt); ++ if (code != EQ_EXPR && code != NE_EXPR) ++ { ++ return false; ++ } ++ ++ tree lhs = gimple_cond_lhs (stmt); ++ tree rhs = gimple_cond_rhs (stmt); ++ if ((lhs == ssa_name && integer_zerop (rhs)) ++ || (rhs == ssa_name && integer_zerop (lhs))) ++ { ++ return true; ++ } ++ ++ return false; ++} ++ ++/* Check SSA_NAME is used in ++ _tmp = SSA_NAME == 0; ++ or ++ _tmp = SSA_NAME != 0; ++ or ++ _tmp = SSA_NAME | _tmp2; ++*/ ++static bool ++check_uses_assign (tree ssa_name, gimple *stmt, hash_set *hset) ++{ ++ tree_code code = gimple_assign_rhs_code (stmt); ++ tree lhs, rhs1, rhs2; ++ ++ switch (code) ++ { ++ case EQ_EXPR: ++ case NE_EXPR: ++ rhs1 = gimple_assign_rhs1 (stmt); ++ rhs2 = gimple_assign_rhs2 (stmt); ++ if ((rhs1 == ssa_name && integer_zerop (rhs2)) ++ || (rhs2 == ssa_name && integer_zerop (rhs1))) ++ { ++ return true; ++ } ++ break; ++ ++ case BIT_IOR_EXPR: ++ lhs = gimple_assign_lhs (stmt); ++ if (hset->contains (lhs)) ++ { ++ return false; ++ } ++ /* We should check the use of _tmp further. 
*/ ++ return check_uses (lhs, hset); ++ ++ default: ++ break; ++ } ++ return false; ++} ++ ++/* Check SSA_NAME is used in ++ # result = PHI ++*/ ++static bool ++check_uses_phi (tree ssa_name, gimple *stmt, hash_set *hset) ++{ ++ for (unsigned i = 0; i < gimple_phi_num_args (stmt); i++) ++ { ++ tree arg = gimple_phi_arg_def (stmt, i); ++ if (!integer_zerop (arg) && arg != ssa_name) ++ { ++ return false; ++ } ++ } ++ ++ tree result = gimple_phi_result (stmt); ++ ++ /* It is used to avoid infinite recursion, ++ ++ if (cond) ++ goto ++ else ++ goto ++ ++ ++ # _tmp2 = PHI <0 (bb 1), _tmp3 (bb 3)> ++ {BODY} ++ if (cond) ++ goto ++ else ++ goto ++ ++ ++ # _tmp3 = PHI <0 (bb 1), _tmp2 (bb 2)> ++ {BODY} ++ if (cond) ++ goto ++ else ++ goto ++ ++ ++ ... ++ */ ++ if (hset->contains (result)) ++ { ++ return false; ++ } ++ ++ return check_uses (result, hset); ++} ++ ++/* Check the use of SSA_NAME, it should only be used in comparison ++ operation and PHI node. HSET is used to record the ssa_names ++ that have been already checked. */ ++static bool ++check_uses (tree ssa_name, hash_set *hset) ++{ ++ imm_use_iterator imm_iter; ++ use_operand_p use_p; ++ ++ if (TREE_CODE (ssa_name) != SSA_NAME) ++ { ++ return false; ++ } ++ ++ if (SSA_NAME_VAR (ssa_name) ++ && is_global_var (SSA_NAME_VAR (ssa_name))) ++ { ++ return false; ++ } ++ ++ hset->add (ssa_name); ++ ++ FOR_EACH_IMM_USE_FAST (use_p, imm_iter, ssa_name) ++ { ++ gimple *stmt = USE_STMT (use_p); ++ ++ /* Ignore debug gimple statements. */ ++ if (is_gimple_debug (stmt)) ++ { ++ continue; ++ } ++ ++ switch (gimple_code (stmt)) ++ { ++ case GIMPLE_COND: ++ if (!check_uses_cond (ssa_name, stmt, hset)) ++ { ++ return false; ++ } ++ break; ++ ++ case GIMPLE_ASSIGN: ++ if (!check_uses_assign (ssa_name, stmt, hset)) ++ { ++ return false; ++ } ++ break; ++ ++ case GIMPLE_PHI: ++ if (!check_uses_phi (ssa_name, stmt, hset)) ++ { ++ return false; ++ } ++ break; ++ ++ default: ++ return false; ++ } ++ } ++ return true; ++} ++ ++static bool ++check_def_gimple (gimple *def1, gimple *def2, tree result) ++{ ++ /* def1 and def2 should be POINTER_PLUS_EXPR. */ ++ if (!is_gimple_assign (def1) || !is_gimple_assign (def2) ++ || gimple_assign_rhs_code (def1) != POINTER_PLUS_EXPR ++ || gimple_assign_rhs_code (def2) != POINTER_PLUS_EXPR) ++ { ++ return false; ++ } ++ ++ tree rhs12 = gimple_assign_rhs2 (def1); ++ ++ tree rhs21 = gimple_assign_rhs1 (def2); ++ tree rhs22 = gimple_assign_rhs2 (def2); ++ ++ if (rhs21 != result) ++ { ++ return false; ++ } ++ ++ /* We should have a positive pointer-plus constant to ensure ++ that the pointer value is continuously increasing. */ ++ if (TREE_CODE (rhs12) != INTEGER_CST || TREE_CODE (rhs22) != INTEGER_CST ++ || compare_tree_int (rhs12, 0) <= 0 || compare_tree_int (rhs22, 0) <= 0) ++ { ++ return false; ++ } ++ ++ return true; ++} ++ ++static bool ++check_loop_body (basic_block bb0, basic_block bb2, tree result) ++{ ++ gimple *g01 = first_stmt (bb0); ++ if (!g01 || !is_gimple_assign (g01) ++ || gimple_assign_rhs_code (g01) != MEM_REF ++ || TREE_OPERAND (gimple_assign_rhs1 (g01), 0) != result) ++ { ++ return false; ++ } ++ ++ gimple *g02 = g01->next; ++ /* GIMPLE_COND would be the last gimple in a basic block, ++ and have no other side effects on RESULT. */ ++ if (!g02 || gimple_code (g02) != GIMPLE_COND) ++ { ++ return false; ++ } ++ ++ if (first_stmt (bb2) != last_stmt (bb2)) ++ { ++ return false; ++ } ++ ++ return true; ++} ++ ++/* Pattern is like ++
++   arg1 = base (rhs11) + cst (rhs12); [def1]
++   goto <bb0>
++
++   <bb2>
++   arg2 = result (rhs21) + cst (rhs22); [def2]
++
++   <bb0>
++   # result = PHI <arg1 (prebb), arg2 (bb2)>
++   _v = *result;  [g01]
++   if (_v == 0)   [g02]
++     goto <bb1>
++   else
++     goto <bb2>
++
++   <bb1>
++   _1 = result - base;     [g1]
++   _2 = _1 /[ex] cst;      [g2]
++   _3 = (unsigned int) _2; [g3]
++   if (_3 == 0)
++   ...
++*/
++static bool
++check_bb_order (basic_block bb0, basic_block &bb1, basic_block &bb2,
++		gphi *phi_stmt, gimple *&output)
++{
++  /* Start check from PHI node in BB0.  */
++  if (gimple_phi_num_args (phi_stmt) != 2
++      || virtual_operand_p (gimple_phi_result (phi_stmt)))
++    {
++      return false;
++    }
++
++  tree result = gimple_phi_result (phi_stmt);
++  tree arg1 = gimple_phi_arg_def (phi_stmt, 0);
++  tree arg2 = gimple_phi_arg_def (phi_stmt, 1);
++
++  if (TREE_CODE (arg1) != SSA_NAME
++      || TREE_CODE (arg2) != SSA_NAME
++      || SSA_NAME_IS_DEFAULT_DEF (arg1)
++      || SSA_NAME_IS_DEFAULT_DEF (arg2))
++    {
++      return false;
++    }
++
++  gimple *def1 = SSA_NAME_DEF_STMT (arg1);
++  gimple *def2 = SSA_NAME_DEF_STMT (arg2);
++
++  /* Swap bb1 and bb2 if pattern is like
++     if (_v != 0)
++       goto <bb2>
++     else
++       goto <bb1>
++  */
++  if (gimple_bb (def2) == bb1 && EDGE_SUCC (bb1, 0)->dest == bb0)
++    {
++      std::swap (bb1, bb2);
++    }
++
++  /* prebb[def1] --> bb0 <-- bb2[def2] */
++  if (!gimple_bb (def1)
++      || EDGE_SUCC (gimple_bb (def1), 0)->dest != bb0
++      || gimple_bb (def2) != bb2 || EDGE_SUCC (bb2, 0)->dest != bb0)
++    {
++      return false;
++    }
++
++  /* Check whether define gimple meets the pattern requirements.  */
++  if (!check_def_gimple (def1, def2, result))
++    {
++      return false;
++    }
++
++  if (!check_loop_body (bb0, bb2, result))
++    {
++      return false;
++    }
++
++  output = def1;
++  return true;
++}
++
++/* Check pattern
++   <bb1>
++   _1 = result - base;     [g1]
++   _2 = _1 /[ex] cst;      [g2]
++   _3 = (unsigned int) _2; [g3]
++   if (_3 == 0)
++   ...
++*/
++static bool
++check_gimple_order (basic_block bb1, tree base, tree cst, tree result,
++		    gimple *&output)
++{
++  gimple *g1 = first_stmt (bb1);
++  if (!g1 || !is_gimple_assign (g1)
++      || gimple_assign_rhs_code (g1) != POINTER_DIFF_EXPR
++      || gimple_assign_rhs1 (g1) != result
++      || gimple_assign_rhs2 (g1) != base)
++    {
++      return false;
++    }
++
++  gimple *g2 = g1->next;
++  if (!g2 || !is_gimple_assign (g2)
++      || gimple_assign_rhs_code (g2) != EXACT_DIV_EXPR
++      || gimple_assign_lhs (g1) != gimple_assign_rhs1 (g2)
++      || TREE_CODE (gimple_assign_rhs2 (g2)) != INTEGER_CST)
++    {
++      return false;
++    }
++
++  /* INTEGER_CST cst in gimple def1.  */
++  HOST_WIDE_INT num1 = TREE_INT_CST_LOW (cst);
++  /* INTEGER_CST cst in gimple g2.  */
++  HOST_WIDE_INT num2 = TREE_INT_CST_LOW (gimple_assign_rhs2 (g2));
++  /* _2 must be at least a positive number.  */
++  if (num2 == 0 || num1 / num2 <= 0)
++    {
++      return false;
++    }
++
++  gimple *g3 = g2->next;
++  if (!g3 || !is_gimple_assign (g3)
++      || gimple_assign_rhs_code (g3) != NOP_EXPR
++      || gimple_assign_lhs (g2) != gimple_assign_rhs1 (g3)
++      || TREE_CODE (gimple_assign_lhs (g3)) != SSA_NAME)
++    {
++      return false;
++    }
++
++  /* _3 should only be used in comparison operation or PHI node.  */
++  hash_set<tree> *hset = new hash_set<tree>;
++  if (!check_uses (gimple_assign_lhs (g3), hset))
++    {
++      delete hset;
++      return false;
++    }
++  delete hset;
++
++  output = g3;
++  return true;
++}
++
++static bool
++do_phiopt_pattern (basic_block bb0, basic_block bb1, basic_block bb2)
++{
++  gphi_iterator gsi;
++
++  for (gsi = gsi_start_phis (bb0); !gsi_end_p (gsi); gsi_next (&gsi))
++    {
++      gphi *phi_stmt = gsi.phi ();
++      gimple *def1 = NULL;
++      tree base, cst, result;
++
++      if (!check_bb_order (bb0, bb1, bb2, phi_stmt, def1))
++	{
++	  continue;
++	}
++
++      base = gimple_assign_rhs1 (def1);
++      cst = gimple_assign_rhs2 (def1);
++      result = gimple_phi_result (phi_stmt);
++
++      gimple *stmt = NULL;
++      if (!check_gimple_order (bb1, base, cst, result, stmt))
++	{
++	  continue;
++	}
++
++      gcc_assert (stmt);
++
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	{
++	  fprintf (dump_file, "PHIOPT pattern optimization (1) - Rewrite:\n");
++	  print_gimple_stmt (dump_file, stmt, 0);
++	  fprintf (dump_file, "to\n");
++	}
++
++      /* Rewrite statement
++	   _3 = (unsigned int) _2;
++	 to
++	   _3 = (unsigned int) 1;
++      */
++      tree type = TREE_TYPE (gimple_assign_rhs1 (stmt));
++      gimple_assign_set_rhs1 (stmt, build_int_cst (type, 1));
++      update_stmt (stmt);
++
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	{
++	  print_gimple_stmt (dump_file, stmt, 0);
++	  fprintf (dump_file, "\n");
++	}
++
++      return true;
++    }
++  return false;
++}
++
+ /* Determine whether we should attempt to hoist adjacent loads out of
+    diamond patterns in pass_phiopt.  Always hoist loads if
+    -fhoist-adjacent-loads is specified and the target machine has
diff --git a/sccvn-Improve-handling-of-load-masked-with-integer.patch b/sccvn-Improve-handling-of-load-masked-with-integer.patch
new file mode 100644
index 0000000..f24d0c3
--- /dev/null
+++ b/sccvn-Improve-handling-of-load-masked-with-integer.patch
@@ -0,0 +1,2397 @@
+This backport contains 14 patches from the GCC mainline tree.
+The commit IDs of these patches are listed below in chronological order.
+
+c2851dc2896bfc0d27b32c90cafc873f67cd6727
+0001-tree-ssa-sccvn.c-struct-vn_walk_cb_data-Add-orig_ref.patch
+
+69b5279e977593d656906288316ee03a8bf79c6a
+0001-gimple-parser.c-c_parser_gimple_postfix_expression-H.patch
+
+8389386c6d55d57afc3ae01f71546ac4468f7926
+0001-gimple-parser.c-c_parser_gimple_postfix_expression-S.patch
+
+d1f2e4c1027b826cf3ba353e86c37589f63f8efe
+0001-tree-ssa-sccvn.c-vn_walk_cb_data-push_partial_def-Re.patch
+
+62e3e66f130fc280eac0bbb6b69e9adca328c03b
+0001-re-PR-tree-optimization-83518-Missing-optimization-u.patch
+
+10f30ac9cda947d117e50f0cbd4cf94ee70a944f
+0001-re-PR-tree-optimization-91756-g-.dg-lto-alias-3-FAIL.patch
+
+1284e2b104a81ad93daab5110cd844981e501086
+0001-re-PR-tree-optimization-90883-Generated-code-is-wors.patch
+
+fb08a53b2eb01cc06d66f479c865aca55c91fd26
+0001-tree-ssa-sccvn.c-vn_walk_cb_data-push_partial_def-Ba.patch
+
+0849cdae714ddf056a4944f31eef53a465f1bcd0
+0001-tree-ssa-sccvn.c-vn_walk_cb_data-push_partial_def-Ha.patch
+
+5f0653a8b75a5ad5a5405a27dd92d3a5759eed4c
+0001-tree-optimization-91123-restore-redundant-store-remo.patch
+
+8aba425f4ebc5e2c054776d3cdddf13f7c1918f8
+0001-sccvn-Handle-bitfields-in-vn_reference_lookup_3-PR93.patch
+
+7f5617b00445dcc861a498a4cecc8aaa59e05b8c
+0001-sccvn-Handle-bitfields-in-push_partial_def-PR93582.patch
+
+5f9cd512c4278621435cce486dd00248ea2e821c
+0001-sccvn-Handle-non-byte-aligned-offset-or-size-for-mem.patch
+
+b07e4e7c7520ca3e798f514dec0711eea2c027be
+0001-sccvn-Improve-handling-of-load-masked-with-integer-c.patch
+
+diff -urpN a/gcc/c/gimple-parser.c b/gcc/c/gimple-parser.c
+--- a/gcc/c/gimple-parser.c	2020-11-26 22:26:34.848000000 -0500
++++ b/gcc/c/gimple-parser.c	2020-11-26 22:06:08.032000000 -0500
+@@ -1320,17 +1320,24 @@ c_parser_gimple_postfix_expression (gimp
+ 		}
+ 	      else
+ 		{
+-		  bool neg_p;
++		  bool neg_p, addr_p;
+ 		  if ((neg_p = c_parser_next_token_is (parser, CPP_MINUS)))
+ 		    c_parser_consume_token (parser);
++		  if ((addr_p = c_parser_next_token_is (parser, CPP_AND)))
++		    c_parser_consume_token (parser);
+ 		  tree val = c_parser_gimple_postfix_expression (parser).value;
+ 		  if (! val
+ 		      || val == error_mark_node
+-		      || ! CONSTANT_CLASS_P (val))
++		      || (!CONSTANT_CLASS_P (val)
++			  && !(addr_p
++			       && (TREE_CODE (val) == STRING_CST
++				   || DECL_P (val)))))
+ 		    {
+ 		      c_parser_error (parser, "invalid _Literal");
+ 		      return expr;
+ 		    }
++		  if (addr_p)
++		    val = build1 (ADDR_EXPR, type, val);
+ 		  if (neg_p)
+ 		    {
+ 		      val = const_unop (NEGATE_EXPR, TREE_TYPE (val), val);
+diff -urpN a/gcc/fold-const.c b/gcc/fold-const.c
+--- a/gcc/fold-const.c	2020-11-26 22:26:32.816000000 -0500
++++ b/gcc/fold-const.c	2020-11-26 22:06:08.036000000 -0500
+@@ -7773,6 +7773,70 @@ native_decode_vector_tree (tree type, ve
+   return builder.build ();
+ }
+ 
++/* Routines for manipulation of native_encode_expr encoded data if the encoded
++   or extracted constant positions and/or sizes aren't byte aligned.  */
++
++/* Shift left the bytes in PTR of SZ elements by AMNT bits, carrying over the
++   bits between adjacent elements.  AMNT should be within
++   [0, BITS_PER_UNIT).
++   Example, AMNT = 2:
++   00011111|11100000 << 2 = 01111111|10000000
++   PTR[1]  | PTR[0]	     PTR[1]  | PTR[0].  */
++
++void
++shift_bytes_in_array_left (unsigned char *ptr, unsigned int sz,
++			   unsigned int amnt)
++{
++  if (amnt == 0)
++    return;
++
++  unsigned char carry_over = 0U;
++  unsigned char carry_mask = (~0U) << (unsigned char) (BITS_PER_UNIT - amnt);
++  unsigned char clear_mask = (~0U) << amnt;
++
++  for (unsigned int i = 0; i < sz; i++)
++    {
++      unsigned prev_carry_over = carry_over;
++      carry_over = (ptr[i] & carry_mask) >> (BITS_PER_UNIT - amnt);
++
++      ptr[i] <<= amnt;
++      if (i != 0)
++	{
++	  ptr[i] &= clear_mask;
++	  ptr[i] |= prev_carry_over;
++	}
++    }
++}
++
++/* Like shift_bytes_in_array_left but for big-endian.
++   Shift right the bytes in PTR of SZ elements by AMNT bits, carrying over the
++   bits between adjacent elements.  AMNT should be within
++   [0, BITS_PER_UNIT).
++   Example, AMNT = 2:
++   00011111|11100000 >> 2 = 00000111|11111000
++   PTR[0]  | PTR[1]	     PTR[0]  | PTR[1].  */
++
++void
++shift_bytes_in_array_right (unsigned char *ptr, unsigned int sz,
++			    unsigned int amnt)
++{
++  if (amnt == 0)
++    return;
++
++  unsigned char carry_over = 0U;
++  unsigned char carry_mask = ~(~0U << amnt);
++
++  for (unsigned int i = 0; i < sz; i++)
++    {
++      unsigned prev_carry_over = carry_over;
++      carry_over = ptr[i] & carry_mask;
++
++      carry_over <<= (unsigned char) BITS_PER_UNIT - amnt;
++      ptr[i] >>= amnt;
++      ptr[i] |= prev_carry_over;
++    }
++}
++
+ /* Try to view-convert VECTOR_CST EXPR to VECTOR_TYPE TYPE by operating
+    directly on the VECTOR_CST encoding, in a way that works for variable-
+    length vectors.  Return the resulting VECTOR_CST on success or null
+diff -urpN a/gcc/fold-const.h b/gcc/fold-const.h
+--- a/gcc/fold-const.h	2020-11-26 22:26:32.816000000 -0500
++++ b/gcc/fold-const.h	2020-11-26 22:06:08.036000000 -0500
+@@ -27,6 +27,10 @@ extern int folding_initializer;
+ /* Convert between trees and native memory representation.  */
+ extern int native_encode_expr (const_tree, unsigned char *, int, int off = -1);
+ extern tree native_interpret_expr (tree, const unsigned char *, int);
++extern void shift_bytes_in_array_left (unsigned char *, unsigned int,
++				       unsigned int);
++extern void shift_bytes_in_array_right (unsigned char *, unsigned int,
++					unsigned int);
+ 
+ /* Fold constants as much as possible in an expression.
+    Returns the simplified expression.
+diff -urpN a/gcc/gimple-ssa-store-merging.c b/gcc/gimple-ssa-store-merging.c
+--- a/gcc/gimple-ssa-store-merging.c	2020-11-26 22:26:32.860000000 -0500
++++ b/gcc/gimple-ssa-store-merging.c	2020-11-26 22:06:08.036000000 -0500
+@@ -1464,66 +1464,6 @@ dump_char_array (FILE *fd, unsigned char
+   fprintf (fd, "\n");
+ }
+ 
+-/* Shift left the bytes in PTR of SZ elements by AMNT bits, carrying over the
+-   bits between adjacent elements.  AMNT should be within
+-   [0, BITS_PER_UNIT).
+-   Example, AMNT = 2:
+-   00011111|11100000 << 2 = 01111111|10000000
+-   PTR[1]  | PTR[0]         PTR[1]  | PTR[0].  */
+-
+-static void
+-shift_bytes_in_array (unsigned char *ptr, unsigned int sz, unsigned int amnt)
+-{
+-  if (amnt == 0)
+-    return;
+-
+-  unsigned char carry_over = 0U;
+-  unsigned char carry_mask = (~0U) << (unsigned char) (BITS_PER_UNIT - amnt);
+-  unsigned char clear_mask = (~0U) << amnt;
+-
+-  for (unsigned int i = 0; i < sz; i++)
+-    {
+-      unsigned prev_carry_over = carry_over;
+-      carry_over = (ptr[i] & carry_mask) >> (BITS_PER_UNIT - amnt);
+-
+-      ptr[i] <<= amnt;
+-      if (i != 0)
+-	{
+-	  ptr[i] &= clear_mask;
+-	  ptr[i] |= prev_carry_over;
+-	}
+-    }
+-}
+-
+-/* Like shift_bytes_in_array but for big-endian.
+-   Shift right the bytes in PTR of SZ elements by AMNT bits, carrying over the
+-   bits between adjacent elements.  AMNT should be within
+-   [0, BITS_PER_UNIT).
+-   Example, AMNT = 2:
+-   00011111|11100000 >> 2 = 00000111|11111000
+-   PTR[0]  | PTR[1]         PTR[0]  | PTR[1].  */
+-
+-static void
+-shift_bytes_in_array_right (unsigned char *ptr, unsigned int sz,
+-			    unsigned int amnt)
+-{
+-  if (amnt == 0)
+-    return;
+-
+-  unsigned char carry_over = 0U;
+-  unsigned char carry_mask = ~(~0U << amnt);
+-
+-  for (unsigned int i = 0; i < sz; i++)
+-    {
+-      unsigned prev_carry_over = carry_over;
+-      carry_over = ptr[i] & carry_mask;
+-
+-      carry_over <<= (unsigned char) BITS_PER_UNIT - amnt;
+-      ptr[i] >>= amnt;
+-      ptr[i] |= prev_carry_over;
+-    }
+-}
+-
+ /* Clear out LEN bits starting from bit START in the byte array
+    PTR.  This clears the bits to the *right* from START.
+    START must be within [0, BITS_PER_UNIT) and counts starting from
+@@ -1749,7 +1689,7 @@ encode_tree_to_bitpos (tree expr, unsign
+   /* Create the shifted version of EXPR.  */
+   if (!BYTES_BIG_ENDIAN)
+     {
+-      shift_bytes_in_array (tmpbuf, byte_size, shift_amnt);
++      shift_bytes_in_array_left (tmpbuf, byte_size, shift_amnt);
+       if (shift_amnt == 0)
+ 	byte_size--;
+     }
+@@ -4667,11 +4607,11 @@ verify_array_eq (unsigned char *x, unsig
+     }
+ }
+ 
+-/* Test shift_bytes_in_array and that it carries bits across between
++/* Test shift_bytes_in_array_left and that it carries bits across between
+    bytes correctly.  */
+ 
+ static void
+-verify_shift_bytes_in_array (void)
++verify_shift_bytes_in_array_left (void)
+ {
+    /* byte 1   | byte 0
+       00011111 | 11100000.  */
+@@ -4680,13 +4620,13 @@ verify_shift_bytes_in_array (void)
+   memcpy (in, orig, sizeof orig);
+ 
+   unsigned char expected[2] = { 0x80, 0x7f };
+-  shift_bytes_in_array (in, sizeof (in), 2);
++  shift_bytes_in_array_left (in, sizeof (in), 2);
+   verify_array_eq (in, expected, sizeof (in));
+ 
+   memcpy (in, orig, sizeof orig);
+   memcpy (expected, orig, sizeof orig);
+   /* Check that shifting by zero doesn't change anything.  */
+-  shift_bytes_in_array (in, sizeof (in), 0);
++  shift_bytes_in_array_left (in, sizeof (in), 0);
+   verify_array_eq (in, expected, sizeof (in));
+ 
+ }
+@@ -4771,7 +4711,7 @@ verify_clear_bit_region_be (void)
+ void
+ store_merging_c_tests (void)
+ {
+-  verify_shift_bytes_in_array ();
++  verify_shift_bytes_in_array_left ();
+   verify_shift_bytes_in_array_right ();
+   verify_clear_bit_region ();
+   verify_clear_bit_region_be ();
+diff -urpN a/gcc/testsuite/gcc.c-torture/execute/pr93582.c b/gcc/testsuite/gcc.c-torture/execute/pr93582.c
+--- a/gcc/testsuite/gcc.c-torture/execute/pr93582.c	1969-12-31 19:00:00.000000000 -0500
++++ b/gcc/testsuite/gcc.c-torture/execute/pr93582.c	2020-11-26 22:25:43.532000000 -0500
+@@ -0,0 +1,22 @@
++/* PR tree-optimization/93582 */
++
++short a;
++int b, c;
++
++__attribute__((noipa)) void
++foo (void)
++{
++  b = c;
++  a &= 7;
++}
++
++int
++main ()
++{
++  c = 27;
++  a = 14;
++  foo ();
++  if (b != 27 || a != 6)
++    __builtin_abort ();
++  return 0;
++}
+diff -urpN a/gcc/testsuite/gcc.dg/gimplefe-42.c b/gcc/testsuite/gcc.dg/gimplefe-42.c
+--- a/gcc/testsuite/gcc.dg/gimplefe-42.c	1969-12-31 19:00:00.000000000 -0500
++++ b/gcc/testsuite/gcc.dg/gimplefe-42.c	2020-11-26 22:06:08.036000000 -0500
+@@ -0,0 +1,18 @@
++/* { dg-do compile } */
++/* { dg-options "-fgimple" } */
++
++typedef char ref_all_char __attribute__((may_alias));
++char a[7];
++__GIMPLE void f()
++{
++  int _1;
++  /* string literals inside __MEM need their address taken.  */
++  __MEM <char[7]> ((ref_all_char *)&a)
++    = __MEM <char[7]> (_Literal (char *) &"654321");
++  /* but plain assignment also works.  */
++  __MEM <char[7]> ((ref_all_char *)&a) = "654321";
++  /* also punning with int.  */
++  _1 = __MEM <int> (_Literal (char *) &"654321");
++  __MEM <int> ((ref_all_char *)&a) = _1;
++  return;
++}
+diff -urpN a/gcc/testsuite/gcc.dg/pr93582.c b/gcc/testsuite/gcc.dg/pr93582.c
+--- a/gcc/testsuite/gcc.dg/pr93582.c	1969-12-31 19:00:00.000000000 -0500
++++ b/gcc/testsuite/gcc.dg/pr93582.c	2020-11-26 22:26:15.784000000 -0500
+@@ -0,0 +1,57 @@
++/* PR tree-optimization/93582 */
++/* { dg-do compile } */
++/* { dg-options "-O2 -Warray-bounds" } */
++
++struct S {
++  unsigned int s1:1;
++  unsigned int s2:1;
++  unsigned int s3:1;
++  unsigned int s4:1;
++  unsigned int s5:4;
++  unsigned char s6;
++  unsigned short s7;
++  unsigned short s8;
++};
++struct T {
++  int t1;
++  int t2;
++};
++
++static inline int
++bar (struct S *x)
++{
++  if (x->s4)
++    return ((struct T *)(x + 1))->t1 + ((struct T *)(x + 1))->t2;	/* { dg-bogus "array subscript 1 is outside array bounds of" } */
++  else
++    return 0;
++}
++
++int
++foo (int x, int y)
++{
++  struct S s;								/* { dg-bogus "while referencing" } */
++  s.s6 = x;
++  s.s7 = y & 0x1FFF;
++  s.s4 = 0;
++  return bar (&s);
++}
++
++static inline int
++qux (struct S *x)
++{
++  int s4 = x->s4;
++  if (s4)
++    return ((struct T *)(x + 1))->t1 + ((struct T *)(x + 1))->t2;
++  else
++    return 0;
++}
++
++int
++baz (int x, int y)
++{
++  struct S s;
++  s.s6 = x;
++  s.s7 = y & 0x1FFF;
++  s.s4 = 0;
++  return qux (&s);
++}
+diff -urpN a/gcc/testsuite/gcc.dg/torture/ssa-fre-5.c b/gcc/testsuite/gcc.dg/torture/ssa-fre-5.c
+--- a/gcc/testsuite/gcc.dg/torture/ssa-fre-5.c	1969-12-31 19:00:00.000000000 -0500
++++ b/gcc/testsuite/gcc.dg/torture/ssa-fre-5.c	2020-11-26 22:06:08.036000000 -0500
+@@ -0,0 +1,27 @@
++/* { dg-do compile } */
++/* { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
++/* { dg-additional-options "-fgimple -fdump-tree-fre1" } */
++
++typedef int v4si __attribute__((vector_size(16)));
++
++int __GIMPLE (ssa,startwith("fre"))
++foo ()
++{
++  int * p;
++  int i;
++  int x[4];
++  long unsigned int _1;
++  long unsigned int _2;
++  int _7;
++
++  __BB(2):
++  i_3 = 0;
++  _1 = (long unsigned int) i_3;
++  _2 = _1 * 4ul;
++  p_4 = _Literal (int *) &x + _2;
++  __MEM <v4si> ((v4si *)p_4) = _Literal (v4si) { 1, 2, 3, 4 };
++  _7 = x[0];
++  return _7;
++}
++
++/* { dg-final { scan-tree-dump "return 1;" "fre1" } } */
+diff -urpN a/gcc/testsuite/gcc.dg/torture/ssa-fre-6.c b/gcc/testsuite/gcc.dg/torture/ssa-fre-6.c
+--- a/gcc/testsuite/gcc.dg/torture/ssa-fre-6.c	1969-12-31 19:00:00.000000000 -0500
++++ b/gcc/testsuite/gcc.dg/torture/ssa-fre-6.c	2020-11-26 22:06:08.036000000 -0500
+@@ -0,0 +1,27 @@
++/* { dg-do compile } */
++/* { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
++/* { dg-additional-options "-fgimple -fdump-tree-fre1" } */
++
++typedef int v4si __attribute__((vector_size(16)));
++
++int __GIMPLE (ssa,startwith("fre"))
++foo ()
++{
++  int * p;
++  int i;
++  int x[4];
++  long unsigned int _1;
++  long unsigned int _2;
++  int _7;
++
++  __BB(2):
++  i_3 = 0;
++  _1 = (long unsigned int) i_3;
++  _2 = _1 * 4ul;
++  p_4 = _Literal (int *) &x + _2;
++  __MEM <v4si> ((v4si *)p_4) = _Literal (v4si) {};
++  _7 = x[0];
++  return _7;
++}
++
++/* { dg-final { scan-tree-dump "return 0;" "fre1" } } */
+diff -urpN a/gcc/testsuite/gcc.dg/torture/ssa-fre-7.c b/gcc/testsuite/gcc.dg/torture/ssa-fre-7.c
+--- a/gcc/testsuite/gcc.dg/torture/ssa-fre-7.c	1969-12-31 19:00:00.000000000 -0500
++++ b/gcc/testsuite/gcc.dg/torture/ssa-fre-7.c	2020-11-26 22:06:08.036000000 -0500
+@@ -0,0 +1,29 @@
++/* { dg-do compile } */
++/* { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
++/* { dg-additional-options "-fgimple -fdump-tree-fre1" } */
++
++typedef int v4si __attribute__((vector_size(16)));
++
++int __GIMPLE (ssa,startwith("fre"))
++foo (int c)
++{
++  int * p;
++  int i;
++  int x[4];
++  long unsigned int _1;
++  long unsigned int _2;
++  int _7;
++  v4si _6;
++
++  __BB(2):
++  i_3 = 0;
++  _1 = (long unsigned int) i_3;
++  _2 = _1 * 4ul;
++  p_4 = _Literal (int *) &x + _2;
++  _6 = _Literal (v4si) { c_5(D), c_5(D), c_5(D), c_5(D) };
++  __MEM <v4si> ((v4si *)p_4) = _6;
++  _7 = x[0];
++  return _7;
++}
++
++/* { dg-final { scan-tree-dump "return c_5\\(D\\);" "fre1" } } */
+diff -urpN a/gcc/testsuite/gcc.dg/tree-ssa/alias-access-path-1.c b/gcc/testsuite/gcc.dg/tree-ssa/alias-access-path-1.c
+--- a/gcc/testsuite/gcc.dg/tree-ssa/alias-access-path-1.c	2020-11-26 22:26:34.324000000 -0500
++++ b/gcc/testsuite/gcc.dg/tree-ssa/alias-access-path-1.c	2020-11-26 22:06:08.036000000 -0500
+@@ -1,5 +1,5 @@
+ /* { dg-do compile } */
+-/* { dg-options "-O2 -fdump-tree-fre3" } */
++/* { dg-options "-O2 -fdump-tree-fre1" } */
+ struct foo
+ {
+   int val;
+@@ -18,4 +18,4 @@ test ()
+   return barptr->val2;
+ }
+ 
+-/* { dg-final { scan-tree-dump-times "return 123" 1 "fre3"} } */
++/* { dg-final { scan-tree-dump-times "return 123" 1 "fre1"} } */
+diff -urpN a/gcc/testsuite/gcc.dg/tree-ssa/pr93582-10.c b/gcc/testsuite/gcc.dg/tree-ssa/pr93582-10.c
+--- a/gcc/testsuite/gcc.dg/tree-ssa/pr93582-10.c	1969-12-31 19:00:00.000000000 -0500
++++ b/gcc/testsuite/gcc.dg/tree-ssa/pr93582-10.c	2020-11-26 22:24:45.812000000 -0500
+@@ -0,0 +1,29 @@
++/* PR tree-optimization/93582 */
++/* { dg-do compile { target int32 } } */
++/* { dg-options "-O2 -fdump-tree-fre1" } */
++/* { dg-final { scan-tree-dump "return 72876566;" "fre1" { target le } } } */
++/* { dg-final { scan-tree-dump "return 559957376;" "fre1" { target be } } } */
++
++union U {
++  struct S { int a : 12, b : 5, c : 10, d : 5; } s;
++  unsigned int i;
++};
++struct A { char a[12]; union U u; };
++void bar (struct A *);
++
++unsigned
++foo (void)
++{
++  struct A a;
++  bar (&a);
++  a.u.s.a = 1590;
++  a.u.s.c = -404;
++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
++#define M 0x67e0a5f
++#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
++#define M 0xa5f067e0
++#else
++#define M 0
++#endif
++  return a.u.i & M;
++}
+diff -urpN a/gcc/testsuite/gcc.dg/tree-ssa/pr93582-1.c b/gcc/testsuite/gcc.dg/tree-ssa/pr93582-1.c
+--- a/gcc/testsuite/gcc.dg/tree-ssa/pr93582-1.c	1969-12-31 19:00:00.000000000 -0500
++++ b/gcc/testsuite/gcc.dg/tree-ssa/pr93582-1.c	2020-11-26 22:18:39.368000000 -0500
+@@ -0,0 +1,18 @@
++/* PR tree-optimization/93582 */
++/* { dg-do compile { target int32 } } */
++/* { dg-options "-O2 -fdump-tree-fre1" } */
++/* { dg-final { scan-tree-dump "return 1;" "fre1" } } */
++
++union U {
++  struct S { int a : 1, b : 4, c : 27; } s;
++  struct T { int d : 2; int e : 2; int f : 28; } t;
++};
++
++int
++foo (void)
++{
++  union U u;
++  u.s.b = 10;
++  return u.t.e;
++}
++
+diff -urpN a/gcc/testsuite/gcc.dg/tree-ssa/pr93582-2.c b/gcc/testsuite/gcc.dg/tree-ssa/pr93582-2.c
+--- a/gcc/testsuite/gcc.dg/tree-ssa/pr93582-2.c	1969-12-31 19:00:00.000000000 -0500
++++ b/gcc/testsuite/gcc.dg/tree-ssa/pr93582-2.c	2020-11-26 22:18:44.832000000 -0500
+@@ -0,0 +1,17 @@
++/* PR tree-optimization/93582 */
++/* { dg-do compile { target int32 } } */
++/* { dg-options "-O2 -fdump-tree-fre1" } */
++/* { dg-final { scan-tree-dump "return 593;" "fre1" } } */
++
++union U {
++  struct S { int a : 1, b : 14, c : 17; } s;
++  struct T { int d : 2; int e : 12; int f : 18; } t;
++};
++
++int
++foo (void)
++{
++  union U u;
++  u.s.b = -7005;
++  return u.t.e;
++}
+diff -urpN a/gcc/testsuite/gcc.dg/tree-ssa/pr93582-3.c b/gcc/testsuite/gcc.dg/tree-ssa/pr93582-3.c
+--- a/gcc/testsuite/gcc.dg/tree-ssa/pr93582-3.c	1969-12-31 19:00:00.000000000 -0500
++++ b/gcc/testsuite/gcc.dg/tree-ssa/pr93582-3.c	2020-11-26 22:21:44.936000000 -0500
+@@ -0,0 +1,19 @@
++/* PR tree-optimization/93582 */
++/* { dg-do compile { target int32 } } */
++/* { dg-options "-O2 -fdump-tree-fre1" } */
++/* { dg-final { scan-tree-dump "return 1;" "fre1" { target be } } } */
++/* { dg-final { scan-tree-dump "return 2;" "fre1" { target le } } } */
++
++union U {
++  struct S { int a : 1, b : 14, c : 17; } s;
++  struct T { int d : 10; int e : 4; int f : 18; } t;
++};
++
++int
++foo (void)
++{
++  union U u;
++  u.s.b = -7005;
++  return u.t.e;
++}
++
+diff -urpN a/gcc/testsuite/gcc.dg/tree-ssa/pr93582-4.c b/gcc/testsuite/gcc.dg/tree-ssa/pr93582-4.c
+--- a/gcc/testsuite/gcc.dg/tree-ssa/pr93582-4.c	1969-12-31 19:00:00.000000000 -0500
++++ b/gcc/testsuite/gcc.dg/tree-ssa/pr93582-4.c	2020-11-26 22:23:33.236000000 -0500
+@@ -0,0 +1,24 @@
++/* PR tree-optimization/93582 */
++/* { dg-do compile { target int32 } } */
++/* { dg-options "-O2 -fdump-tree-fre1" } */
++/* { dg-final { scan-tree-dump "return -1991560811;" "fre1" { target le } } } */
++/* { dg-final { scan-tree-dump "return -733324916;" "fre1" { target be } } } */
++
++union U {
++  struct S { int a : 1, b : 4, c : 27; } s;
++  unsigned int i;
++};
++struct A { char a[24]; union U u; };
++void bar (struct A *);
++
++int
++foo (void)
++{
++  struct A a;
++  bar (&a);
++  a.u.s.a = -1;
++  a.u.s.b = -6;
++  a.u.s.c = -62236276;
++  return a.u.i;
++}
++
+diff -urpN a/gcc/testsuite/gcc.dg/tree-ssa/pr93582-5.c b/gcc/testsuite/gcc.dg/tree-ssa/pr93582-5.c
+--- a/gcc/testsuite/gcc.dg/tree-ssa/pr93582-5.c	1969-12-31 19:00:00.000000000 -0500
++++ b/gcc/testsuite/gcc.dg/tree-ssa/pr93582-5.c	2020-11-26 22:23:38.324000000 -0500
+@@ -0,0 +1,26 @@
++/* PR tree-optimization/93582 */
++/* { dg-do compile { target int32 } } */
++/* { dg-options "-O2 -fdump-tree-fre1" } */
++/* { dg-final { scan-tree-dump "return -1462729318;" "fre1" { target le } } } */
++/* { dg-final { scan-tree-dump "return 1300568597;" "fre1" { target be } } } */
++
++union U {
++  struct S { int a : 1, b : 7, c : 8, d : 11, e : 5; } s;
++  unsigned int i;
++};
++struct A { char a[8]; union U u; };
++void bar (struct A *);
++
++int
++foo (void)
++{
++  struct A a;
++  bar (&a);
++  a.u.s.a = 0;
++  a.u.s.b = -51;
++  a.u.s.c = -123;
++  a.u.s.d = 208;
++  a.u.s.e = -11;
++  return a.u.i;
++}
++
+diff -urpN a/gcc/testsuite/gcc.dg/tree-ssa/pr93582-6.c b/gcc/testsuite/gcc.dg/tree-ssa/pr93582-6.c
+--- a/gcc/testsuite/gcc.dg/tree-ssa/pr93582-6.c	1969-12-31 19:00:00.000000000 -0500
++++ b/gcc/testsuite/gcc.dg/tree-ssa/pr93582-6.c	2020-11-26 22:23:42.348000000 -0500
+@@ -0,0 +1,25 @@
++/* PR tree-optimization/93582 */
++/* { dg-do compile { target int32 } } */
++/* { dg-options "-O2 -fdump-tree-fre1" } */
++/* { dg-final { scan-tree-dump "return 890118;" "fre1" { target le } } } */
++/* { dg-final { scan-tree-dump "return 447899;" "fre1" { target be } } } */
++
++union U {
++  struct S { int a : 16, b : 5, c : 10, d : 1; } s;
++  struct T { int a : 8, b : 21, c : 3; } t;
++};
++struct A { char a[4]; union U u; };
++void bar (struct A *);
++
++int
++foo (void)
++{
++  struct A a;
++  bar (&a);
++  a.u.s.a = 1590;
++  a.u.s.b = -11;
++  a.u.s.c = 620;
++  a.u.s.d = -1;
++  return a.u.t.b;
++}
++
+diff -urpN a/gcc/testsuite/gcc.dg/tree-ssa/pr93582-7.c b/gcc/testsuite/gcc.dg/tree-ssa/pr93582-7.c
+--- a/gcc/testsuite/gcc.dg/tree-ssa/pr93582-7.c	1969-12-31 19:00:00.000000000 -0500
++++ b/gcc/testsuite/gcc.dg/tree-ssa/pr93582-7.c	2020-11-26 22:23:45.756000000 -0500
+@@ -0,0 +1,25 @@
++/* PR tree-optimization/93582 */
++/* { dg-do compile { target int32 } } */
++/* { dg-options "-O2 -fdump-tree-fre1" } */
++/* { dg-final { scan-tree-dump "return -413012;" "fre1" { target le } } } */
++/* { dg-final { scan-tree-dump "return -611112;" "fre1" { target be } } } */
++
++union U {
++  struct S { int a : 12, b : 5, c : 10, d : 5; } s;
++  struct T { int a : 7, b : 21, c : 4; } t;
++};
++struct A { char a[48]; union U u; };
++void bar (struct A *);
++
++int
++foo (void)
++{
++  struct A a;
++  bar (&a);
++  a.u.s.a = 1590;
++  a.u.s.b = -11;
++  a.u.s.c = -404;
++  a.u.s.d = 7;
++  return a.u.t.b;
++}
++
+diff -urpN a/gcc/testsuite/gcc.dg/tree-ssa/pr93582-8.c b/gcc/testsuite/gcc.dg/tree-ssa/pr93582-8.c
+--- a/gcc/testsuite/gcc.dg/tree-ssa/pr93582-8.c	1969-12-31 19:00:00.000000000 -0500
++++ b/gcc/testsuite/gcc.dg/tree-ssa/pr93582-8.c	2020-11-26 22:23:53.088000000 -0500
+@@ -0,0 +1,15 @@
++/* PR tree-optimization/93582 */
++/* { dg-do compile { target int32 } } */
++/* { dg-options "-O2 -fdump-tree-fre1" } */
++/* { dg-final { scan-tree-dump "return 0;" "fre1" { target le } } } */
++/* { dg-final { scan-tree-dump "return -8531;" "fre1" { target be } } } */
++
++short
++foo (void)
++{
++  union U { char c[32]; short s[16]; int i[8]; } u;
++  __builtin_memset (u.c + 1, '\0', 5);
++  u.s[3] = 0xdead;
++  return u.i[1];
++}
++
+diff -urpN a/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-82.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-82.c
+--- a/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-82.c	1969-12-31 19:00:00.000000000 -0500
++++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-82.c	2020-11-26 22:06:08.036000000 -0500
+@@ -0,0 +1,25 @@
++/* { dg-do run } */
++/* { dg-options "-O -fdump-tree-fre1-details" } */
++
++struct S { _Bool x; };
++
++void
++foo (struct S *s)
++{
++  __builtin_memset (s, 1, sizeof (struct S));
++  s->x = 1;
++}
++
++int
++main ()
++{
++  struct S s;
++  foo (&s);
++  char c;
++  __builtin_memcpy (&c, &s.x, 1);
++  if (c != 1)
++    __builtin_abort ();
++  return 0;
++}
++
++/* { dg-final { scan-tree-dump "Deleted redundant store" "fre1" } } */
+diff -urpN a/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-83.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-83.c
+--- a/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-83.c	1969-12-31 19:00:00.000000000 -0500
++++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-83.c	2020-11-26 22:06:08.036000000 -0500
+@@ -0,0 +1,32 @@
++/* { dg-do compile } */
++/* { dg-options "-O -fdump-tree-fre1-details" } */
++
++struct X
++{
++   int a : 1;
++   int b : 1;
++} x;
++
++void foo (int v)
++{
++  x.a = 1;
++  x.b = v;
++  x.a = 1;
++  x.b = v;
++}
++
++struct Y
++{
++   _Bool a;
++   _Bool b;
++} y;
++
++void bar (int v)
++{
++  y.a = 1;
++  y.b = v;
++  y.a = 1;
++  y.b = v;
++}
++
++/* { dg-final { scan-tree-dump-times "Deleted redundant store" 4 "fre1" } } */
+diff -urpN a/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-84.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-84.c
+--- a/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-84.c	1969-12-31 19:00:00.000000000 -0500
++++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-84.c	2020-11-26 22:06:08.036000000 -0500
+@@ -0,0 +1,19 @@
++/* { dg-do compile } */
++/* { dg-options "-O -fdump-tree-fre1" } */
++
++typedef int v4si __attribute__((vector_size(16)));
++
++void foo (v4si *dst, int x)
++{
++  v4si v[2];
++  v[0][0] = 1;
++  v[0][1] = x;
++  v[0][2] = 2;
++  v[0][3] = 3;
++  v[0][1] = 0;
++  *dst = v[0];
++}
++
++/* The shadowed non-constant assign to v[0][1] shouldn't prevent us from
++   value-numbering the load to a constant.  */
++/* { dg-final { scan-tree-dump "\\*dst_\[0-9\]*\\\(D\\) = { 1, 0, 2, 3 };" "fre1" } } */
+diff -urpN a/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-85.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-85.c
+--- a/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-85.c	1969-12-31 19:00:00.000000000 -0500
++++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-85.c	2020-11-26 22:06:08.036000000 -0500
+@@ -0,0 +1,14 @@
++/* { dg-do compile } */
++/* { dg-options "-O -fstrict-aliasing -fdump-tree-fre1-details" } */
++
++struct X { int i; int j; };
++
++struct X x, y;
++void foo ()
++{
++  x.i = 1;
++  y = x;
++  y.i = 1; // redundant
++}
++
++/* { dg-final { scan-tree-dump "Deleted redundant store y.i" "fre1" } } */
+diff -urpN a/gcc/tree-ssa-alias.c b/gcc/tree-ssa-alias.c
+--- a/gcc/tree-ssa-alias.c	2020-11-26 22:26:32.884000000 -0500
++++ b/gcc/tree-ssa-alias.c	2020-11-26 22:06:08.036000000 -0500
+@@ -2628,7 +2628,8 @@ static bool
+ maybe_skip_until (gimple *phi, tree &target, basic_block target_bb,
+ 		  ao_ref *ref, tree vuse, bool tbaa_p, unsigned int &limit,
+ 		  bitmap *visited, bool abort_on_visited,
+-		  void *(*translate)(ao_ref *, tree, void *, bool *),
++		  void *(*translate)(ao_ref *, tree, void *, translate_flags *),
++		  translate_flags disambiguate_only,
+ 		  void *data)
+ {
+   basic_block bb = gimple_bb (phi);
+@@ -2663,7 +2664,7 @@ maybe_skip_until (gimple *phi, tree &tar
+ 	    return !abort_on_visited;
+ 	  vuse = get_continuation_for_phi (def_stmt, ref, tbaa_p, limit,
+ 					   visited, abort_on_visited,
+-					   translate, data);
++					   translate, data, disambiguate_only);
+ 	  if (!vuse)
+ 	    return false;
+ 	  continue;
+@@ -2678,9 +2679,9 @@ maybe_skip_until (gimple *phi, tree &tar
+ 	  --limit;
+ 	  if (stmt_may_clobber_ref_p_1 (def_stmt, ref, tbaa_p))
+ 	    {
+-	      bool disambiguate_only = true;
++	      translate_flags tf = disambiguate_only;
+ 	      if (translate
+-		  && (*translate) (ref, vuse, data, &disambiguate_only) == NULL)
++		  && (*translate) (ref, vuse, data, &tf) == NULL)
+ 		;
+ 	      else
+ 		return false;
+@@ -2711,8 +2712,10 @@ tree
+ get_continuation_for_phi (gimple *phi, ao_ref *ref, bool tbaa_p,
+ 			  unsigned int &limit, bitmap *visited,
+ 			  bool abort_on_visited,
+-			  void *(*translate)(ao_ref *, tree, void *, bool *),
+-			  void *data)
++			  void *(*translate)(ao_ref *, tree, void *,
++					     translate_flags *),
++			  void *data,
++			  translate_flags disambiguate_only)
+ {
+   unsigned nargs = gimple_phi_num_args (phi);
+ 
+@@ -2754,13 +2757,15 @@ get_continuation_for_phi (gimple *phi, a
+       else if (! maybe_skip_until (phi, arg0, dom, ref, arg1, tbaa_p,
+ 				   limit, visited,
+ 				   abort_on_visited,
+-				   /* Do not translate when walking over
++				   translate,
++				   /* Do not valueize when walking over
+ 				      backedges.  */
+ 				   dominated_by_p
+ 				     (CDI_DOMINATORS,
+ 				      gimple_bb (SSA_NAME_DEF_STMT (arg1)),
+ 				      phi_bb)
+-				   ? NULL : translate, data))
++				   ? TR_DISAMBIGUATE
++				   : disambiguate_only, data))
+ 	return NULL_TREE;
+     }
+ 
+@@ -2798,7 +2803,8 @@ get_continuation_for_phi (gimple *phi, a
+ void *
+ walk_non_aliased_vuses (ao_ref *ref, tree vuse, bool tbaa_p,
+ 			void *(*walker)(ao_ref *, tree, void *),
+-			void *(*translate)(ao_ref *, tree, void *, bool *),
++			void *(*translate)(ao_ref *, tree, void *,
++					   translate_flags *),
+ 			tree (*valueize)(tree),
+ 			unsigned &limit, void *data)
+ {
+@@ -2851,7 +2857,7 @@ walk_non_aliased_vuses (ao_ref *ref, tre
+ 	    {
+ 	      if (!translate)
+ 		break;
+-	      bool disambiguate_only = false;
++	      translate_flags disambiguate_only = TR_TRANSLATE;
+ 	      res = (*translate) (ref, vuse, data, &disambiguate_only);
+ 	      /* Failed lookup and translation.  */
+ 	      if (res == (void *)-1)
+@@ -2863,7 +2869,7 @@ walk_non_aliased_vuses (ao_ref *ref, tre
+ 	      else if (res != NULL)
+ 		break;
+ 	      /* Translation succeeded, continue walking.  */
+-	      translated = translated || !disambiguate_only;
++	      translated = translated || disambiguate_only == TR_TRANSLATE;
+ 	    }
+ 	  vuse = gimple_vuse (def_stmt);
+ 	}
+diff -urpN a/gcc/tree-ssa-alias.h b/gcc/tree-ssa-alias.h
+--- a/gcc/tree-ssa-alias.h	2020-11-26 22:26:32.868000000 -0500
++++ b/gcc/tree-ssa-alias.h	2020-11-26 22:06:08.040000000 -0500
+@@ -131,13 +131,18 @@ extern bool call_may_clobber_ref_p (gcal
+ extern bool call_may_clobber_ref_p_1 (gcall *, ao_ref *);
+ extern bool stmt_kills_ref_p (gimple *, tree);
+ extern bool stmt_kills_ref_p (gimple *, ao_ref *);
++enum translate_flags
++  { TR_TRANSLATE, TR_VALUEIZE_AND_DISAMBIGUATE, TR_DISAMBIGUATE };
+ extern tree get_continuation_for_phi (gimple *, ao_ref *, bool,
+ 				      unsigned int &, bitmap *, bool,
+-				      void *(*)(ao_ref *, tree, void *, bool *),
+-				      void *);
++				      void *(*)(ao_ref *, tree, void *,
++						translate_flags *),
++				      void *, translate_flags
++					= TR_VALUEIZE_AND_DISAMBIGUATE);
+ extern void *walk_non_aliased_vuses (ao_ref *, tree, bool,
+ 				     void *(*)(ao_ref *, tree, void *),
+-				     void *(*)(ao_ref *, tree, void *, bool *),
++				     void *(*)(ao_ref *, tree, void *,
++					       translate_flags *),
+ 				     tree (*)(tree), unsigned &, void *);
+ extern int walk_aliased_vdefs (ao_ref *, tree,
+ 			       bool (*)(ao_ref *, tree, void *),
+diff -urpN a/gcc/tree-ssa-sccvn.c b/gcc/tree-ssa-sccvn.c
+--- a/gcc/tree-ssa-sccvn.c	2020-11-26 22:26:32.836000000 -0500
++++ b/gcc/tree-ssa-sccvn.c	2020-11-27 03:17:41.080000000 -0500
+@@ -1684,24 +1684,75 @@ struct pd_data
+ 
+ struct vn_walk_cb_data
+ {
+-  vn_walk_cb_data (vn_reference_t vr_, tree *last_vuse_ptr_,
+-		   vn_lookup_kind vn_walk_kind_, bool tbaa_p_)
+-    : vr (vr_), last_vuse_ptr (last_vuse_ptr_), vn_walk_kind (vn_walk_kind_),
+-      tbaa_p (tbaa_p_), known_ranges (NULL)
+-   {}
++  vn_walk_cb_data (vn_reference_t vr_, tree orig_ref_, tree *last_vuse_ptr_,
++		   vn_lookup_kind vn_walk_kind_, bool tbaa_p_, tree mask_)
++    : vr (vr_), last_vuse_ptr (last_vuse_ptr_), last_vuse (NULL_TREE),
++      mask (mask_), masked_result (NULL_TREE), vn_walk_kind (vn_walk_kind_),
++      tbaa_p (tbaa_p_), saved_operands (vNULL), first_set (-2),
++      known_ranges (NULL)
++  {
++    if (!last_vuse_ptr)
++      last_vuse_ptr = &last_vuse;
++    ao_ref_init (&orig_ref, orig_ref_);
++    if (mask)
++      {
++	wide_int w = wi::to_wide (mask);
++	unsigned int pos = 0, prec = w.get_precision ();
++	pd_data pd;
++	pd.rhs = build_constructor (NULL_TREE, NULL);
++	/* When bitwise and with a constant is done on a memory load,
++	   we don't really need all the bits to be defined or defined
++	   to constants, we don't really care what is in the position
++	   corresponding to 0 bits in the mask.
++	   So, push the ranges of those 0 bits in the mask as artificial
++	   zero stores and let the partial def handling code do the
++	   rest.  */
++	while (pos < prec)
++	  {
++	    int tz = wi::ctz (w);
++	    if (pos + tz > prec)
++	      tz = prec - pos;
++	    if (tz)
++	      {
++		if (BYTES_BIG_ENDIAN)
++		  pd.offset = prec - pos - tz;
++		else
++		  pd.offset = pos;
++		pd.size = tz;
++		void *r = push_partial_def (pd, 0, prec);
++		gcc_assert (r == NULL_TREE);
++	      }
++	    pos += tz;
++	    if (pos == prec)
++	      break;
++	    w = wi::lrshift (w, tz);
++	    tz = wi::ctz (wi::bit_not (w));
++	    if (pos + tz > prec)
++	      tz = prec - pos;
++	    pos += tz;
++	    w = wi::lrshift (w, tz);
++	  }
++      }
++  }
+   ~vn_walk_cb_data ();
+-  void *push_partial_def (const pd_data& pd, tree, HOST_WIDE_INT);
++  void *finish (alias_set_type, tree);
++  void *push_partial_def (const pd_data& pd, alias_set_type, HOST_WIDE_INT);
+ 
+   vn_reference_t vr;
++  ao_ref orig_ref;
+   tree *last_vuse_ptr;
++  tree last_vuse;
++  tree mask;
++  tree masked_result;
+   vn_lookup_kind vn_walk_kind;
+   bool tbaa_p;
++  vec<vn_reference_op_s> saved_operands;
+ 
+   /* The VDEFs of partial defs we come along.  */
+   auto_vec<pd_data, 2> partial_defs;
+   /* The first defs range to avoid splay tree setup in most cases.  */
+   pd_range first_range;
+-  tree first_vuse;
++  alias_set_type first_set;
+   splay_tree known_ranges;
+   obstack ranges_obstack;
+ };
+@@ -1713,6 +1764,23 @@ vn_walk_cb_data::~vn_walk_cb_data ()
+       splay_tree_delete (known_ranges);
+       obstack_free (&ranges_obstack, NULL);
+     }
++  saved_operands.release ();
++}
++
++void *
++vn_walk_cb_data::finish (alias_set_type set, tree val)
++{
++  if (first_set != -2)
++    set = first_set;
++  if (mask)
++    {
++      masked_result = val;
++      return (void *) -1;
++    }
++  vec<vn_reference_op_s> &operands
++    = saved_operands.exists () ? saved_operands : vr->operands;
++  return vn_reference_lookup_or_insert_for_pieces (last_vuse, set,
++		  vr->type, operands, val);
+ }
+ 
+ /* pd_range splay-tree helpers.  */
+@@ -1742,168 +1810,306 @@ pd_tree_dealloc (void *, void *)
+ }
+ 
+ /* Push PD to the vector of partial definitions returning a
+-   value when we are ready to combine things with VUSE and MAXSIZEI,
++   value when we are ready to combine things with VUSE, SET and MAXSIZEI,
+    NULL when we want to continue looking for partial defs or -1
+    on failure.  */
+ 
+ void *
+-vn_walk_cb_data::push_partial_def (const pd_data &pd, tree vuse,
+-				   HOST_WIDE_INT maxsizei)
++vn_walk_cb_data::push_partial_def (const pd_data &pd,
++				   alias_set_type set, HOST_WIDE_INT maxsizei)
+ {
++  const HOST_WIDE_INT bufsize = 64;
++  /* We're using a fixed buffer for encoding so fail early if the object
++     we want to interpret is bigger.  */
++  if (maxsizei > bufsize * BITS_PER_UNIT
++      || CHAR_BIT != 8
++      || BITS_PER_UNIT != 8
++      /* Not prepared to handle PDP endian.  */
++      || BYTES_BIG_ENDIAN != WORDS_BIG_ENDIAN)
++    return (void *)-1;
++
++  bool pd_constant_p = (TREE_CODE (pd.rhs) == CONSTRUCTOR
++			|| CONSTANT_CLASS_P (pd.rhs));
+   if (partial_defs.is_empty ())
+     {
++      if (!pd_constant_p)
++	return (void *)-1;
+       partial_defs.safe_push (pd);
+       first_range.offset = pd.offset;
+       first_range.size = pd.size;
+-      first_vuse = vuse;
++      first_set = set;
+       last_vuse_ptr = NULL;
++      /* Continue looking for partial defs.  */
++      return NULL;
++    }
++
++  if (!known_ranges)
++    {
++      /* ???  Optimize the case where the 2nd partial def completes things.  */
++      gcc_obstack_init (&ranges_obstack);
++      known_ranges = splay_tree_new_with_allocator (pd_range_compare, 0, 0,
++						    pd_tree_alloc,
++						    pd_tree_dealloc, this);
++      splay_tree_insert (known_ranges,
++			 (splay_tree_key)&first_range.offset,
++			 (splay_tree_value)&first_range);
++    }
++
++  pd_range newr = { pd.offset, pd.size };
++  splay_tree_node n;
++  pd_range *r;
++  /* Lookup the predecessor of offset + 1 and see if we need to merge.  */
++  HOST_WIDE_INT loffset = newr.offset + 1;
++  if ((n = splay_tree_predecessor (known_ranges, (splay_tree_key)&loffset))
++      && ((r = (pd_range *)n->value), true)
++      && ranges_known_overlap_p (r->offset, r->size + 1,
++				 newr.offset, newr.size))
++    {
++      /* Ignore partial defs already covered.  */
++      if (known_subrange_p (newr.offset, newr.size, r->offset, r->size))
++	return NULL;
++      r->size = MAX (r->offset + r->size, newr.offset + newr.size) - r->offset;
+     }
+   else
+     {
+-      if (!known_ranges)
+-	{
+-	  /* ???  Optimize the case where the second partial def
+-	     completes things.  */
+-	  gcc_obstack_init (&ranges_obstack);
+-	  known_ranges
+-	      = splay_tree_new_with_allocator (pd_range_compare, 0, 0,
+-					       pd_tree_alloc,
+-					       pd_tree_dealloc, this);
+-	  splay_tree_insert (known_ranges,
+-			     (splay_tree_key)&first_range.offset,
+-			     (splay_tree_value)&first_range);
+-	}
+-      if (known_ranges)
+-	{
+-	  pd_range newr = { pd.offset, pd.size };
+-	  splay_tree_node n;
+-	  pd_range *r;
+-	  /* Lookup the predecessor of offset + 1 and see if
+-	     we need to merge with it.  */
+-	  HOST_WIDE_INT loffset = newr.offset + 1;
+-	  if ((n = splay_tree_predecessor (known_ranges,
+-					   (splay_tree_key)&loffset))
+-	      && ((r = (pd_range *)n->value), true)
+-	      && ranges_known_overlap_p (r->offset, r->size + 1,
+-					 newr.offset, newr.size))
+-	    {
+-	      /* Ignore partial defs already covered.  */
+-	      if (known_subrange_p (newr.offset, newr.size,
+-				    r->offset, r->size))
+-		return NULL;
+-	      r->size = MAX (r->offset + r->size,
+-			     newr.offset + newr.size) - r->offset;
+-	    }
+-	  else
+-	    {
+-	      /* newr.offset wasn't covered yet, insert the
+-		 range.  */
+-	      r = XOBNEW (&ranges_obstack, pd_range);
+-	      *r = newr;
+-	      splay_tree_insert (known_ranges,
+-				 (splay_tree_key)&r->offset,
+-				 (splay_tree_value)r);
+-	    }
+-	  /* Merge r which now contains newr and is a member
+-	     of the splay tree with adjacent overlapping ranges.  */
+-	  pd_range *rafter;
+-	  while ((n = splay_tree_successor (known_ranges,
+-					    (splay_tree_key)&r->offset))
+-		 && ((rafter = (pd_range *)n->value), true)
+-		 && ranges_known_overlap_p (r->offset, r->size + 1,
+-					    rafter->offset, rafter->size))
+-	    {
+-	      r->size = MAX (r->offset + r->size,
+-			     rafter->offset + rafter->size) - r->offset;
+-	      splay_tree_remove (known_ranges,
+-				 (splay_tree_key)&rafter->offset);
+-	    }
+-	  partial_defs.safe_push (pd);
+-
+-	  /* Now we have merged newr into the range tree.
+-	     When we have covered [offseti, sizei] then the
+-	     tree will contain exactly one node which has
+-	     the desired properties and it will be 'r'.  */
+-	  if (known_subrange_p (0, maxsizei / BITS_PER_UNIT,
+-				r->offset, r->size))
+-	    {
+-	      /* Now simply native encode all partial defs
+-		 in reverse order.  */
+-	      unsigned ndefs = partial_defs.length ();
+-	      /* We support up to 512-bit values (for V8DFmode).  */
+-	      unsigned char buffer[64];
+-	      int len;
++      /* newr.offset wasn't covered yet, insert the range.  */
++      r = XOBNEW (&ranges_obstack, pd_range);
++      *r = newr;
++      splay_tree_insert (known_ranges, (splay_tree_key)&r->offset,
++			 (splay_tree_value)r);
++    }
++  /* Merge r which now contains newr and is a member of the splay tree with
++     adjacent overlapping ranges.  */
++  pd_range *rafter;
++  while ((n = splay_tree_successor (known_ranges, (splay_tree_key)&r->offset))
++	 && ((rafter = (pd_range *)n->value), true)
++	 && ranges_known_overlap_p (r->offset, r->size + 1,
++				    rafter->offset, rafter->size))
++    {
++      r->size = MAX (r->offset + r->size,
++		     rafter->offset + rafter->size) - r->offset;
++      splay_tree_remove (known_ranges, (splay_tree_key)&rafter->offset);
++    }
++  /* Non-constants are OK as long as they are shadowed by a constant.  */
++  if (!pd_constant_p)
++    return (void *)-1;
++  partial_defs.safe_push (pd);
++
++  /* Now we have merged newr into the range tree.  When we have covered
++     [offseti, sizei] then the tree will contain exactly one node which has
++     the desired properties and it will be 'r'.  */
++  if (!known_subrange_p (0, maxsizei, r->offset, r->size))
++    /* Continue looking for partial defs.  */
++    return NULL;
+ 
+-	      while (!partial_defs.is_empty ())
++  /* Now simply native encode all partial defs in reverse order.  */
++  unsigned ndefs = partial_defs.length ();
++  /* We support up to 512-bit values (for V8DFmode).  */
++  unsigned char buffer[bufsize + 1];
++  unsigned char this_buffer[bufsize + 1];
++  int len;
++
++  memset (buffer, 0, bufsize + 1);
++  unsigned needed_len = ROUND_UP (maxsizei, BITS_PER_UNIT) / BITS_PER_UNIT;
++  while (!partial_defs.is_empty ())
++    {
++      pd_data pd = partial_defs.pop ();
++      unsigned int amnt;
++      if (TREE_CODE (pd.rhs) == CONSTRUCTOR)
++	{
++	  /* Empty CONSTRUCTOR.  */
++	  if (pd.size >= needed_len * BITS_PER_UNIT)
++	    len = needed_len;
++	  else
++	    len = ROUND_UP (pd.size, BITS_PER_UNIT) / BITS_PER_UNIT;
++	  memset (this_buffer, 0, len);
++	}
++      else
++ 	{
++	  len = native_encode_expr (pd.rhs, this_buffer, bufsize,
++				    MAX (0, -pd.offset) / BITS_PER_UNIT);
++	  if (len <= 0
++	      || len < (ROUND_UP (pd.size, BITS_PER_UNIT) / BITS_PER_UNIT
++			- MAX (0, -pd.offset) / BITS_PER_UNIT))
++ 	    {
++	      if (dump_file && (dump_flags & TDF_DETAILS))
++		fprintf (dump_file, "Failed to encode %u "
++			 "partial definitions\n", ndefs);
++	      return (void *)-1;
++ 	    }
++	}
++
++      unsigned char *p = buffer;
++      HOST_WIDE_INT size = pd.size;
++      if (pd.offset < 0)
++	size -= ROUND_DOWN (-pd.offset, BITS_PER_UNIT);
++      this_buffer[len] = 0;
++      if (BYTES_BIG_ENDIAN)
++	{
++	  /* LSB of this_buffer[len - 1] byte should be at
++	     pd.offset + pd.size - 1 bits in buffer.  */
++	  amnt = ((unsigned HOST_WIDE_INT) pd.offset
++		  + pd.size) % BITS_PER_UNIT;
++	  if (amnt)
++	    shift_bytes_in_array_right (this_buffer, len + 1, amnt);
++	  unsigned char *q = this_buffer;
++	  unsigned int off = 0;
++	  if (pd.offset >= 0)
++	    {
++	      unsigned int msk;
++	      off = pd.offset / BITS_PER_UNIT;
++	      gcc_assert (off < needed_len);
++	      p = buffer + off;
++	      if (size <= amnt)
+ 		{
+-		  pd_data pd = partial_defs.pop ();
+-		  if (TREE_CODE (pd.rhs) == CONSTRUCTOR)
+-		    /* Empty CONSTRUCTOR.  */
+-		    memset (buffer + MAX (0, pd.offset),
+-			    0, MIN ((HOST_WIDE_INT)sizeof (buffer)
+-				     - MAX (0, pd.offset),
+-				    pd.size + MIN (0, pd.offset)));
+-		  else
++		  msk = ((1 << size) - 1) << (BITS_PER_UNIT - amnt);
++		  *p = (*p & ~msk) | (this_buffer[len] & msk);
++		  size = 0;
++		}
++	      else
++		{
++		  if (TREE_CODE (pd.rhs) != CONSTRUCTOR)
++		    q = (this_buffer + len
++			 - (ROUND_UP (size - amnt, BITS_PER_UNIT)
++			    / BITS_PER_UNIT));
++		  if (pd.offset % BITS_PER_UNIT)
+ 		    {
+-		      len = native_encode_expr (pd.rhs,
+-						buffer + MAX (0, pd.offset),
+-						sizeof (buffer)
+-						- MAX (0, pd.offset),
+-						MAX (0, -pd.offset));
+-		      if (len <= 0
+-			  || len < (pd.size - MAX (0, -pd.offset)))
+-			{
+-			  if (dump_file && (dump_flags & TDF_DETAILS))
+-			    fprintf (dump_file, "Failed to encode %u "
+-				     "partial definitions\n", ndefs);
+-			  return (void *)-1;
+-			}
++		      msk = -1U << (BITS_PER_UNIT
++				    - (pd.offset % BITS_PER_UNIT));
++		      *p = (*p & msk) | (*q & ~msk);
++		      p++;
++		      q++;
++		      off++;
++		      size -= BITS_PER_UNIT - (pd.offset % BITS_PER_UNIT);
++		      gcc_assert (size >= 0);
+ 		    }
+ 		}
+-
+-	      tree type = vr->type;
+-	      /* Make sure to interpret in a type that has a range
+-		 covering the whole access size.  */
+-	      if (INTEGRAL_TYPE_P (vr->type)
+-		  && maxsizei != TYPE_PRECISION (vr->type))
+-		type = build_nonstandard_integer_type (maxsizei,
+-						       TYPE_UNSIGNED (type));
+-	      tree val = native_interpret_expr (type, buffer,
+-						maxsizei / BITS_PER_UNIT);
+-	      /* If we chop off bits because the types precision doesn't
+-		 match the memory access size this is ok when optimizing
+-		 reads but not when called from the DSE code during
+-		 elimination.  */
+-	      if (val
+-		  && type != vr->type)
++	    }
++	  else if (TREE_CODE (pd.rhs) != CONSTRUCTOR)
++	    {
++	      q = (this_buffer + len
++		   - (ROUND_UP (size - amnt, BITS_PER_UNIT)
++		      / BITS_PER_UNIT));
++	      if (pd.offset % BITS_PER_UNIT)
+ 		{
+-		  if (! int_fits_type_p (val, vr->type))
+-		    val = NULL_TREE;
+-		  else
+-		    val = fold_convert (vr->type, val);
++		  q++;
++		  size -= BITS_PER_UNIT - ((unsigned HOST_WIDE_INT) pd.offset
++					   % BITS_PER_UNIT);
++		  gcc_assert (size >= 0);
+ 		}
+-
+-	      if (val)
++	    }
++	  if ((unsigned HOST_WIDE_INT) size / BITS_PER_UNIT + off
++	      > needed_len)
++	    size = (needed_len - off) * BITS_PER_UNIT;
++	  memcpy (p, q, size / BITS_PER_UNIT);
++	  if (size % BITS_PER_UNIT)
++	    {
++	      unsigned int msk
++		= -1U << (BITS_PER_UNIT - (size % BITS_PER_UNIT));
++	      p += size / BITS_PER_UNIT;
++	      q += size / BITS_PER_UNIT;
++	      *p = (*q & msk) | (*p & ~msk);
++	    }
++	}
++      else
++	{
++	  size = MIN (size, (HOST_WIDE_INT) needed_len * BITS_PER_UNIT);
++	  if (pd.offset >= 0)
++	    {
++	      /* LSB of this_buffer[0] byte should be at pd.offset bits
++		 in buffer.  */
++	      unsigned int msk;
++	      amnt = pd.offset % BITS_PER_UNIT;
++	      if (amnt)
++		shift_bytes_in_array_left (this_buffer, len + 1, amnt);
++	      unsigned int off = pd.offset / BITS_PER_UNIT;
++	      gcc_assert (off < needed_len);
++	      p = buffer + off;
++	      if (amnt + size < BITS_PER_UNIT)
+ 		{
+-		  if (dump_file && (dump_flags & TDF_DETAILS))
+-		    fprintf (dump_file, "Successfully combined %u "
+-			     "partial definitions\n", ndefs);
+-		  return vn_reference_lookup_or_insert_for_pieces
+-		      (first_vuse,
+-		       vr->set, vr->type, vr->operands, val);
++		  /* Low amnt bits come from *p, then size bits
++		     from this_buffer[0] and the remaining again from
++		     *p.  */
++		  msk = ((1 << size) - 1) << amnt;
++		  *p = (*p & ~msk) | (this_buffer[0] & msk);
++		  size = 0;
+ 		}
+-	      else
++	      else if (amnt)
+ 		{
+-		  if (dump_file && (dump_flags & TDF_DETAILS))
+-		    fprintf (dump_file, "Failed to interpret %u "
+-			     "encoded partial definitions\n", ndefs);
+-		  return (void *)-1;
++		  msk = -1U << amnt;
++		  *p = (*p & ~msk) | (this_buffer[0] & msk);
++		  p++;
++		  size -= (BITS_PER_UNIT - amnt);
+ 		}
+ 	    }
++	  else
++	    {
++	      amnt = (unsigned HOST_WIDE_INT) pd.offset % BITS_PER_UNIT;
++	      if (amnt)
++		shift_bytes_in_array_left (this_buffer, len + 1, amnt);
++	    }
++	  memcpy (p, this_buffer + (amnt != 0), size / BITS_PER_UNIT);
++	  p += size / BITS_PER_UNIT;
++	  if (size % BITS_PER_UNIT)
++	    {
++	      unsigned int msk = -1U << (size % BITS_PER_UNIT);
++	      *p = (this_buffer[(amnt != 0) + size / BITS_PER_UNIT]
++		    & ~msk) | (*p & msk);
++	    }
+ 	}
+     }
+-  /* Continue looking for partial defs.  */
+-  return NULL;
++
++  tree type = vr->type;
++  /* Make sure to interpret in a type that has a range covering the whole
++     access size.  */
++  if (INTEGRAL_TYPE_P (vr->type) && maxsizei != TYPE_PRECISION (vr->type))
++    type = build_nonstandard_integer_type (maxsizei, TYPE_UNSIGNED (type));
++  tree val;
++  if (BYTES_BIG_ENDIAN)
++    {
++      unsigned sz = needed_len;
++      if (maxsizei % BITS_PER_UNIT)
++	shift_bytes_in_array_right (buffer, needed_len,
++				    BITS_PER_UNIT
++				    - (maxsizei % BITS_PER_UNIT));
++      if (INTEGRAL_TYPE_P (type))
++	sz = GET_MODE_SIZE (SCALAR_INT_TYPE_MODE (type));
++      if (sz > needed_len)
++	{
++	  memcpy (this_buffer + (sz - needed_len), buffer, needed_len);
++	  val = native_interpret_expr (type, this_buffer, sz);
++	}
++      else
++	val = native_interpret_expr (type, buffer, needed_len);
++    }
++  else
++    val = native_interpret_expr (type, buffer, bufsize);
++  /* If we chop off bits because the types precision doesn't match the memory
++     access size this is ok when optimizing reads but not when called from
++     the DSE code during elimination.  */
++  if (val && type != vr->type)
++    {
++      if (! int_fits_type_p (val, vr->type))
++	val = NULL_TREE;
++      else
++	val = fold_convert (vr->type, val);
++    }
++  if (val)
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file,
++		 "Successfully combined %u partial definitions\n", ndefs);
++      /* We are using the alias-set of the first store we encounter which
++	 should be appropriate here.  */
++      return finish (first_set, val);
++    }
++  else
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file,
++		 "Failed to interpret %u encoded partial definitions\n", ndefs);
++      return (void *)-1;
++     }
+ }
+ 
+ /* Callback for walk_non_aliased_vuses.  Adjusts the vn_reference_t VR_
+@@ -1923,7 +2129,10 @@ vn_reference_lookup_2 (ao_ref *op ATTRIB
+     return NULL;
+ 
+   if (data->last_vuse_ptr)
+-    *data->last_vuse_ptr = vuse;
++    {
++      *data->last_vuse_ptr = vuse;
++      data->last_vuse = vuse;
++    }
+ 
+   /* Fixup vuse and hash.  */
+   if (vr->vuse)
+@@ -1935,7 +2144,11 @@ vn_reference_lookup_2 (ao_ref *op ATTRIB
+   hash = vr->hashcode;
+   slot = valid_info->references->find_slot_with_hash (vr, hash, NO_INSERT);
+   if (slot)
+-    return *slot;
++    {
++      if ((*slot)->result && data->saved_operands.exists ())
++	return data->finish (vr->set, (*slot)->result);
++      return *slot;
++    }
+ 
+   return NULL;
+ }
+@@ -2221,13 +2434,13 @@ adjust_offsets_for_equal_base_address (t
+ 
+ static void *
+ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
+-		       bool *disambiguate_only)
++		       translate_flags *disambiguate_only)
+ {
+   vn_walk_cb_data *data = (vn_walk_cb_data *)data_;
+   vn_reference_t vr = data->vr;
+   gimple *def_stmt = SSA_NAME_DEF_STMT (vuse);
+   tree base = ao_ref_base (ref);
+-  HOST_WIDE_INT offseti, maxsizei;
++  HOST_WIDE_INT offseti = 0, maxsizei, sizei = 0;
+   static vec<vn_reference_op_s> lhs_ops;
+   ao_ref lhs_ref;
+   bool lhs_ref_ok = false;
+@@ -2242,8 +2455,11 @@ vn_reference_lookup_3 (ao_ref *ref, tree
+       lhs_ops.truncate (0);
+       basic_block saved_rpo_bb = vn_context_bb;
+       vn_context_bb = gimple_bb (def_stmt);
+-      copy_reference_ops_from_ref (lhs, &lhs_ops);
+-      lhs_ops = valueize_refs_1 (lhs_ops, &valueized_anything, true);
++      if (*disambiguate_only <= TR_VALUEIZE_AND_DISAMBIGUATE)
++	{
++	  copy_reference_ops_from_ref (lhs, &lhs_ops);
++	  lhs_ops = valueize_refs_1 (lhs_ops, &valueized_anything, true);
++	}
+       vn_context_bb = saved_rpo_bb;
+       if (valueized_anything)
+ 	{
+@@ -2253,7 +2469,7 @@ vn_reference_lookup_3 (ao_ref *ref, tree
+ 	  if (lhs_ref_ok
+ 	      && !refs_may_alias_p_1 (ref, &lhs_ref, data->tbaa_p))
+ 	    {
+-	      *disambiguate_only = true;
++	      *disambiguate_only = TR_VALUEIZE_AND_DISAMBIGUATE;
+ 	      return NULL;
+ 	    }
+ 	}
+@@ -2263,6 +2479,30 @@ vn_reference_lookup_3 (ao_ref *ref, tree
+ 	  lhs_ref_ok = true;
+ 	}
+ 
++      /* Besides valueizing the LHS we can also use access-path based
++	  disambiguation on the original non-valueized ref.  */
++      if (!ref->ref
++	  && lhs_ref_ok
++	  && data->orig_ref.ref)
++	{
++	  /* We want to use the non-valueized LHS for this, but avoid redundant
++	     work.  */
++	  ao_ref *lref = &lhs_ref;
++	  ao_ref lref_alt;
++	  if (valueized_anything)
++	    {
++	      ao_ref_init (&lref_alt, lhs);
++	      lref = &lref_alt;
++	    }
++	  if (!refs_may_alias_p_1 (&data->orig_ref, lref, data->tbaa_p))
++	    {
++	      *disambiguate_only = (valueized_anything
++				    ? TR_VALUEIZE_AND_DISAMBIGUATE
++				    : TR_DISAMBIGUATE);
++	      return NULL;
++	    }
++	}
++
+       /* If we reach a clobbering statement try to skip it and see if
+          we find a VN result with exactly the same value as the
+ 	 possible clobber.  In this case we can ignore the clobber
+@@ -2299,7 +2539,8 @@ vn_reference_lookup_3 (ao_ref *ref, tree
+ 	    }
+ 	}
+     }
+-  else if (gimple_call_builtin_p (def_stmt, BUILT_IN_NORMAL)
++  else if (*disambiguate_only <= TR_VALUEIZE_AND_DISAMBIGUATE
++	   && gimple_call_builtin_p (def_stmt, BUILT_IN_NORMAL)
+ 	   && gimple_call_num_args (def_stmt) <= 4)
+     {
+       /* For builtin calls valueize its arguments and call the
+@@ -2328,15 +2569,13 @@ vn_reference_lookup_3 (ao_ref *ref, tree
+ 	    gimple_call_set_arg (def_stmt, i, oldargs[i]);
+ 	  if (!res)
+ 	    {
+-	      *disambiguate_only = true;
++	      *disambiguate_only = TR_VALUEIZE_AND_DISAMBIGUATE;
+ 	      return NULL;
+ 	    }
+ 	}
+     }
+ 
+-  /* If we are looking for redundant stores do not create new hashtable
+-     entries from aliasing defs with made up alias-sets.  */
+-  if (*disambiguate_only || !data->tbaa_p)
++  if (*disambiguate_only > TR_TRANSLATE)
+     return (void *)-1;
+ 
+   /* If we cannot constrain the size of the reference we cannot
+@@ -2359,10 +2598,14 @@ vn_reference_lookup_3 (ao_ref *ref, tree
+       && (integer_zerop (gimple_call_arg (def_stmt, 1))
+ 	  || ((TREE_CODE (gimple_call_arg (def_stmt, 1)) == INTEGER_CST
+ 	       || (INTEGRAL_TYPE_P (vr->type) && known_eq (ref->size, 8)))
+-	      && CHAR_BIT == 8 && BITS_PER_UNIT == 8
++	      && CHAR_BIT == 8
++	      && BITS_PER_UNIT == 8
++	      && BYTES_BIG_ENDIAN == WORDS_BIG_ENDIAN
+ 	      && offset.is_constant (&offseti)
+-	      && offseti % BITS_PER_UNIT == 0
+ 	      && multiple_p (ref->size, BITS_PER_UNIT)))
++	      && ref->size.is_constant (&sizei)
++	      && (offseti % BITS_PER_UNIT == 0
++		  || TREE_CODE (gimple_call_arg (def_stmt, 1)) == INTEGER_CST)
+       && poly_int_tree_p (gimple_call_arg (def_stmt, 2))
+       && (TREE_CODE (gimple_call_arg (def_stmt, 0)) == ADDR_EXPR
+ 	  || TREE_CODE (gimple_call_arg (def_stmt, 0)) == SSA_NAME))
+@@ -2423,7 +2666,13 @@ vn_reference_lookup_3 (ao_ref *ref, tree
+       else
+ 	return (void *)-1;
+       tree len = gimple_call_arg (def_stmt, 2);
+-      HOST_WIDE_INT leni, offset2i, offseti;
++      HOST_WIDE_INT leni, offset2i;
++      /* Sometimes the above trickery is smarter than alias analysis.  Take
++	  advantage of that.  */
++      if (!ranges_maybe_overlap_p (offset, maxsize, offset2,
++				   (wi::to_poly_offset (len)
++				    << LOG2_BITS_PER_UNIT)))
++	return NULL;
+       if (data->partial_defs.is_empty ()
+ 	  && known_subrange_p (offset, maxsize, offset2,
+ 			       wi::to_poly_offset (len) << LOG2_BITS_PER_UNIT))
+@@ -2432,7 +2681,8 @@ vn_reference_lookup_3 (ao_ref *ref, tree
+ 	  if (integer_zerop (gimple_call_arg (def_stmt, 1)))
+ 	    val = build_zero_cst (vr->type);
+ 	  else if (INTEGRAL_TYPE_P (vr->type)
+-		   && known_eq (ref->size, 8))
++		   && known_eq (ref->size, 8)
++		   && offseti % BITS_PER_UNIT == 0)
+ 	    {
+ 	      gimple_match_op res_op (gimple_match_cond::UNCOND, NOP_EXPR,
+ 				      vr->type, gimple_call_arg (def_stmt, 1));
+@@ -2444,30 +2694,57 @@ vn_reference_lookup_3 (ao_ref *ref, tree
+ 	    }
+ 	  else
+ 	    {
+-	      unsigned len = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (vr->type));
+-	      unsigned char *buf = XALLOCAVEC (unsigned char, len);
++	      unsigned buflen = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (vr->type))
++						  + 1;
++	      if (INTEGRAL_TYPE_P (vr->type))
++		buflen = GET_MODE_SIZE (SCALAR_INT_TYPE_MODE (vr->type)) + 1;
++	      unsigned char *buf = XALLOCAVEC (unsigned char, buflen);
+ 	      memset (buf, TREE_INT_CST_LOW (gimple_call_arg (def_stmt, 1)),
+-		      len);
+-	      val = native_interpret_expr (vr->type, buf, len);
++		      buflen);
++	      if (BYTES_BIG_ENDIAN)
++		{
++		  unsigned int amnt
++		    = (((unsigned HOST_WIDE_INT) offseti + sizei)
++		       % BITS_PER_UNIT);
++		  if (amnt)
++		    {
++		      shift_bytes_in_array_right (buf, buflen,
++						  BITS_PER_UNIT - amnt);
++		      buf++;
++		      buflen--;
++		    }
++		}
++	      else if (offseti % BITS_PER_UNIT != 0)
++		{
++		  unsigned int amnt
++		    = BITS_PER_UNIT - ((unsigned HOST_WIDE_INT) offseti
++				       % BITS_PER_UNIT);
++		  shift_bytes_in_array_left (buf, buflen, amnt);
++		  buf++;
++		  buflen--;
++		}
++	      val = native_interpret_expr (vr->type, buf, buflen);
+ 	      if (!val)
+ 		return (void *)-1;
+ 	    }
+-	  return vn_reference_lookup_or_insert_for_pieces
+-	           (vuse, vr->set, vr->type, vr->operands, val);
++	  return data->finish (0, val);
+ 	}
+       /* For now handle clearing memory with partial defs.  */
+       else if (known_eq (ref->size, maxsize)
+ 	       && integer_zerop (gimple_call_arg (def_stmt, 1))
+ 	       && tree_to_poly_int64 (len).is_constant (&leni)
++	       && leni <= INTTYPE_MAXIMUM (HOST_WIDE_INT) / BITS_PER_UNIT
+ 	       && offset.is_constant (&offseti)
+ 	       && offset2.is_constant (&offset2i)
+-	       && maxsize.is_constant (&maxsizei))
++	       && maxsize.is_constant (&maxsizei)
++	       && ranges_known_overlap_p (offseti, maxsizei, offset2i,
++		       			  leni << LOG2_BITS_PER_UNIT))
+ 	{
+ 	  pd_data pd;
+ 	  pd.rhs = build_constructor (NULL_TREE, NULL);
+-	  pd.offset = (offset2i - offseti) / BITS_PER_UNIT;
+-	  pd.size = leni;
+-	  return data->push_partial_def (pd, vuse, maxsizei);
++	  pd.offset = offset2i - offseti;
++	  pd.size = leni << LOG2_BITS_PER_UNIT;
++	  return data->push_partial_def (pd, 0, maxsizei);
+ 	}
+     }
+ 
+@@ -2477,12 +2754,22 @@ vn_reference_lookup_3 (ao_ref *ref, tree
+ 	   && gimple_assign_rhs_code (def_stmt) == CONSTRUCTOR
+ 	   && CONSTRUCTOR_NELTS (gimple_assign_rhs1 (def_stmt)) == 0)
+     {
++      tree lhs = gimple_assign_lhs (def_stmt);
+       tree base2;
+       poly_int64 offset2, size2, maxsize2;
+       HOST_WIDE_INT offset2i, size2i;
+       bool reverse;
+-      base2 = get_ref_base_and_extent (gimple_assign_lhs (def_stmt),
+-				       &offset2, &size2, &maxsize2, &reverse);
++      if (lhs_ref_ok)
++	{
++	  base2 = ao_ref_base (&lhs_ref);
++	  offset2 = lhs_ref.offset;
++	  size2 = lhs_ref.size;
++	  maxsize2 = lhs_ref.max_size;
++	  reverse = reverse_storage_order_for_component_p (lhs);
++	}
++      else
++	base2 = get_ref_base_and_extent (lhs,
++					 &offset2, &size2, &maxsize2, &reverse);
+       if (known_size_p (maxsize2)
+ 	  && known_eq (maxsize2, size2)
+ 	  && adjust_offsets_for_equal_base_address (base, &offset,
+@@ -2492,24 +2779,21 @@ vn_reference_lookup_3 (ao_ref *ref, tree
+ 	      && known_subrange_p (offset, maxsize, offset2, size2))
+ 	    {
+ 	      tree val = build_zero_cst (vr->type);
+-	      return vn_reference_lookup_or_insert_for_pieces
+-		  (vuse, vr->set, vr->type, vr->operands, val);
++	      return data->finish (get_alias_set (lhs), val);
+ 	    }
+ 	  else if (known_eq (ref->size, maxsize)
+ 		   && maxsize.is_constant (&maxsizei)
+-		   && maxsizei % BITS_PER_UNIT == 0
+ 		   && offset.is_constant (&offseti)
+-		   && offseti % BITS_PER_UNIT == 0
+ 		   && offset2.is_constant (&offset2i)
+-		   && offset2i % BITS_PER_UNIT == 0
+ 		   && size2.is_constant (&size2i)
+-		   && size2i % BITS_PER_UNIT == 0)
++		   && ranges_known_overlap_p (offseti, maxsizei,
++					      offset2i, size2i))
+ 	    {
+ 	      pd_data pd;
+ 	      pd.rhs = gimple_assign_rhs1 (def_stmt);
+-	      pd.offset = (offset2i - offseti) / BITS_PER_UNIT;
+-	      pd.size = size2i / BITS_PER_UNIT;
+-	      return data->push_partial_def (pd, vuse, maxsizei);
++	      pd.offset = offset2i - offseti;
++	      pd.size = size2i;
++	      return data->push_partial_def (pd, get_alias_set (lhs), maxsizei);
+ 	    }
+ 	}
+     }
+@@ -2520,28 +2804,36 @@ vn_reference_lookup_3 (ao_ref *ref, tree
+ 	   && is_gimple_reg_type (vr->type)
+ 	   && !contains_storage_order_barrier_p (vr->operands)
+ 	   && gimple_assign_single_p (def_stmt)
+-	   && CHAR_BIT == 8 && BITS_PER_UNIT == 8
++	   && CHAR_BIT == 8
++	   && BITS_PER_UNIT == 8
++	   && BYTES_BIG_ENDIAN == WORDS_BIG_ENDIAN
+ 	   /* native_encode and native_decode operate on arrays of bytes
+ 	      and so fundamentally need a compile-time size and offset.  */
+ 	   && maxsize.is_constant (&maxsizei)
+-	   && maxsizei % BITS_PER_UNIT == 0
+ 	   && offset.is_constant (&offseti)
+-	   && offseti % BITS_PER_UNIT == 0
+ 	   && (is_gimple_min_invariant (gimple_assign_rhs1 (def_stmt))
+ 	       || (TREE_CODE (gimple_assign_rhs1 (def_stmt)) == SSA_NAME
+ 		   && is_gimple_min_invariant (SSA_VAL (gimple_assign_rhs1 (def_stmt))))))
+     {
++      tree lhs = gimple_assign_lhs (def_stmt);
+       tree base2;
+       poly_int64 offset2, size2, maxsize2;
+       HOST_WIDE_INT offset2i, size2i;
+       bool reverse;
+-      base2 = get_ref_base_and_extent (gimple_assign_lhs (def_stmt),
+-				       &offset2, &size2, &maxsize2, &reverse);
++      if (lhs_ref_ok)
++	{
++	  base2 = ao_ref_base (&lhs_ref);
++	  offset2 = lhs_ref.offset;
++	  size2 = lhs_ref.size;
++	  maxsize2 = lhs_ref.max_size;
++	  reverse = reverse_storage_order_for_component_p (lhs);
++	}
++      else
++	base2 = get_ref_base_and_extent (lhs,
++					 &offset2, &size2, &maxsize2, &reverse);
+       if (base2
+ 	  && !reverse
+ 	  && known_eq (maxsize2, size2)
+-	  && multiple_p (size2, BITS_PER_UNIT)
+-	  && multiple_p (offset2, BITS_PER_UNIT)
+ 	  && adjust_offsets_for_equal_base_address (base, &offset,
+ 						    base2, &offset2)
+ 	  && offset.is_constant (&offseti)
+@@ -2552,37 +2844,80 @@ vn_reference_lookup_3 (ao_ref *ref, tree
+ 	      && known_subrange_p (offseti, maxsizei, offset2, size2))
+ 	    {
+ 	      /* We support up to 512-bit values (for V8DFmode).  */
+-	      unsigned char buffer[64];
++	      unsigned char buffer[65];
+ 	      int len;
+ 
+ 	      tree rhs = gimple_assign_rhs1 (def_stmt);
+ 	      if (TREE_CODE (rhs) == SSA_NAME)
+ 		rhs = SSA_VAL (rhs);
+-	      unsigned pad = 0;
+-	      if (BYTES_BIG_ENDIAN
+-		  && is_a <scalar_mode> (TYPE_MODE (TREE_TYPE (rhs))))
+-		{
+-		  /* On big-endian the padding is at the 'front' so
+-		     just skip the initial bytes.  */
+-		  fixed_size_mode mode
+-		    = as_a <fixed_size_mode> (TYPE_MODE (TREE_TYPE (rhs)));
+-		  pad = GET_MODE_SIZE (mode) - size2i / BITS_PER_UNIT;
+-		}
+ 	      len = native_encode_expr (rhs,
+-					buffer, sizeof (buffer),
+-					((offseti - offset2i) / BITS_PER_UNIT
+-					 + pad));
++					buffer, sizeof (buffer) - 1,
++					(offseti - offset2i) / BITS_PER_UNIT);
+ 	      if (len > 0 && len * BITS_PER_UNIT >= maxsizei)
+ 		{
+ 		  tree type = vr->type;
++		  unsigned char *buf = buffer;
++		  unsigned int amnt = 0;
+ 		  /* Make sure to interpret in a type that has a range
+ 		     covering the whole access size.  */
+ 		  if (INTEGRAL_TYPE_P (vr->type)
+ 		      && maxsizei != TYPE_PRECISION (vr->type))
+ 		    type = build_nonstandard_integer_type (maxsizei,
+ 							   TYPE_UNSIGNED (type));
+-		  tree val = native_interpret_expr (type, buffer,
+-						    maxsizei / BITS_PER_UNIT);
++		  if (BYTES_BIG_ENDIAN)
++		    {
++		      /* For big-endian native_encode_expr stored the rhs
++			 such that the LSB of it is the LSB of buffer[len - 1].
++			 That bit is stored into memory at position
++			 offset2 + size2 - 1, i.e. in byte
++			 base + (offset2 + size2 - 1) / BITS_PER_UNIT.
++			 E.g. for offset2 1 and size2 14, rhs -1 and memory
++			 previously cleared that is:
++			 0	  1
++			 01111111|11111110
++			 Now, if we want to extract offset 2 and size 12 from
++			 it using native_interpret_expr (which actually works
++			 for integral bitfield types in terms of byte size of
++			 the mode), the native_encode_expr stored the value
++			 into buffer as
++			 XX111111|11111111
++			 and returned len 2 (the X bits are outside of
++			 precision).
++			 Let sz be maxsize / BITS_PER_UNIT if not extracting
++			 a bitfield, and GET_MODE_SIZE otherwise.
++			 We need to align the LSB of the value we want to
++			 extract as the LSB of buf[sz - 1].
++			 The LSB from memory we need to read is at position
++			 offset + maxsize - 1.  */
++		      HOST_WIDE_INT sz = maxsizei / BITS_PER_UNIT;
++		      if (INTEGRAL_TYPE_P (type))
++			sz = GET_MODE_SIZE (SCALAR_INT_TYPE_MODE (type));
++		      amnt = ((unsigned HOST_WIDE_INT) offset2i + size2i
++			      - offseti - maxsizei) % BITS_PER_UNIT;
++		      if (amnt)
++			shift_bytes_in_array_right (buffer, len, amnt);
++		      amnt = ((unsigned HOST_WIDE_INT) offset2i + size2i
++			      - offseti - maxsizei - amnt) / BITS_PER_UNIT;
++		      if ((unsigned HOST_WIDE_INT) sz + amnt > (unsigned) len)
++			len = 0;
++		      else
++			{
++			  buf = buffer + len - sz - amnt;
++			  len -= (buf - buffer);
++			}
++		    }
++		  else
++		    {
++		      amnt = ((unsigned HOST_WIDE_INT) offset2i
++			      - offseti) % BITS_PER_UNIT;
++		      if (amnt)
++			{
++			  buffer[len] = 0;
++			  shift_bytes_in_array_left (buffer, len + 1, amnt);
++			  buf = buffer + 1;
++			}
++		    }
++		  tree val = native_interpret_expr (type, buf, len);
+ 		  /* If we chop off bits because the types precision doesn't
+ 		     match the memory access size this is ok when optimizing
+ 		     reads but not when called from the DSE code during
+@@ -2597,73 +2932,95 @@ vn_reference_lookup_3 (ao_ref *ref, tree
+ 		    }
+ 
+ 		  if (val)
+-		    return vn_reference_lookup_or_insert_for_pieces
+-		      (vuse, vr->set, vr->type, vr->operands, val);
++		    return data->finish (get_alias_set (lhs), val);
+ 		}
+ 	    }
+-	  else if (ranges_known_overlap_p (offseti, maxsizei, offset2i, size2i))
++	  else if (ranges_known_overlap_p (offseti, maxsizei, offset2i,
++					   size2i))
+ 	    {
+ 	      pd_data pd;
+ 	      tree rhs = gimple_assign_rhs1 (def_stmt);
+ 	      if (TREE_CODE (rhs) == SSA_NAME)
+ 		rhs = SSA_VAL (rhs);
+ 	      pd.rhs = rhs;
+-	      pd.offset = (offset2i - offseti) / BITS_PER_UNIT;
+-	      pd.size = size2i / BITS_PER_UNIT;
+-	      return data->push_partial_def (pd, vuse, maxsizei);
++	      pd.offset = offset2i - offseti;
++	      pd.size = size2i;
++	      return data->push_partial_def (pd, get_alias_set (lhs), maxsizei);
+ 	    }
+ 	}
+     }
+ 
+   /* 4) Assignment from an SSA name which definition we may be able
+-     to access pieces from.  */
++     to access pieces from or we can combine to a larger entity.  */
+   else if (known_eq (ref->size, maxsize)
+ 	   && is_gimple_reg_type (vr->type)
+ 	   && !contains_storage_order_barrier_p (vr->operands)
+ 	   && gimple_assign_single_p (def_stmt)
+-	   && TREE_CODE (gimple_assign_rhs1 (def_stmt)) == SSA_NAME
+-	   /* A subset of partial defs from non-constants can be handled
+-	      by for example inserting a CONSTRUCTOR, a COMPLEX_EXPR or
+-	      even a (series of) BIT_INSERT_EXPR hoping for simplifications
+-	      downstream, not so much for actually doing the insertion.  */
+-	   && data->partial_defs.is_empty ())
++	   && TREE_CODE (gimple_assign_rhs1 (def_stmt)) == SSA_NAME)
+     {
++      tree lhs = gimple_assign_lhs (def_stmt);
+       tree base2;
+       poly_int64 offset2, size2, maxsize2;
++      HOST_WIDE_INT offset2i, size2i, offseti;
+       bool reverse;
+-      base2 = get_ref_base_and_extent (gimple_assign_lhs (def_stmt),
+-				       &offset2, &size2, &maxsize2,
+-				       &reverse);
++      if (lhs_ref_ok)
++	{
++	  base2 = ao_ref_base (&lhs_ref);
++	  offset2 = lhs_ref.offset;
++	  size2 = lhs_ref.size;
++	  maxsize2 = lhs_ref.max_size;
++	  reverse = reverse_storage_order_for_component_p (lhs);
++	}
++      else
++	base2 = get_ref_base_and_extent (lhs,
++					 &offset2, &size2, &maxsize2, &reverse);
+       tree def_rhs = gimple_assign_rhs1 (def_stmt);
+       if (!reverse
+ 	  && known_size_p (maxsize2)
+ 	  && known_eq (maxsize2, size2)
+ 	  && adjust_offsets_for_equal_base_address (base, &offset,
+-						    base2, &offset2)
+-	  && known_subrange_p (offset, maxsize, offset2, size2)
+-	  /* ???  We can't handle bitfield precision extracts without
+-	     either using an alternate type for the BIT_FIELD_REF and
+-	     then doing a conversion or possibly adjusting the offset
+-	     according to endianness.  */
+-	  && (! INTEGRAL_TYPE_P (vr->type)
+-	      || known_eq (ref->size, TYPE_PRECISION (vr->type)))
+-	  && multiple_p (ref->size, BITS_PER_UNIT)
+-	  && (! INTEGRAL_TYPE_P (TREE_TYPE (def_rhs))
+-	      || type_has_mode_precision_p (TREE_TYPE (def_rhs))))
+-	{
+-	  gimple_match_op op (gimple_match_cond::UNCOND,
+-			      BIT_FIELD_REF, vr->type,
+-			      vn_valueize (def_rhs),
+-			      bitsize_int (ref->size),
+-			      bitsize_int (offset - offset2));
+-	  tree val = vn_nary_build_or_lookup (&op);
+-	  if (val
+-	      && (TREE_CODE (val) != SSA_NAME
+-		  || ! SSA_NAME_OCCURS_IN_ABNORMAL_PHI (val)))
+-	    {
+-	      vn_reference_t res = vn_reference_lookup_or_insert_for_pieces
+-		  (vuse, vr->set, vr->type, vr->operands, val);
+-	      return res;
++						    base2, &offset2))
++	{
++	  if (data->partial_defs.is_empty ()
++	      && known_subrange_p (offset, maxsize, offset2, size2)
++	      /* ???  We can't handle bitfield precision extracts without
++		 either using an alternate type for the BIT_FIELD_REF and
++		 then doing a conversion or possibly adjusting the offset
++		 according to endianness.  */
++	      && (! INTEGRAL_TYPE_P (vr->type)
++		  || known_eq (ref->size, TYPE_PRECISION (vr->type)))
++	      && multiple_p (ref->size, BITS_PER_UNIT))
++	    {
++	      if (known_eq (ref->size, size2))
++		return vn_reference_lookup_or_insert_for_pieces
++		    (vuse, get_alias_set (lhs), vr->type, vr->operands,
++		     SSA_VAL (def_rhs));
++	      else if (! INTEGRAL_TYPE_P (TREE_TYPE (def_rhs))
++		       || type_has_mode_precision_p (TREE_TYPE (def_rhs)))
++		{
++		  gimple_match_op op (gimple_match_cond::UNCOND,
++				      BIT_FIELD_REF, vr->type,
++				      SSA_VAL (def_rhs),
++				      bitsize_int (ref->size),
++				      bitsize_int (offset - offset2));
++		  tree val = vn_nary_build_or_lookup (&op);
++		  if (val
++		      && (TREE_CODE (val) != SSA_NAME
++			  || ! SSA_NAME_OCCURS_IN_ABNORMAL_PHI (val)))
++		    return data->finish (get_alias_set (lhs), val);
++		}
++	    }
++	  else if (maxsize.is_constant (&maxsizei)
++		   && offset.is_constant (&offseti)
++		   && offset2.is_constant (&offset2i)
++		   && size2.is_constant (&size2i)
++		   && ranges_known_overlap_p (offset, maxsize, offset2, size2))
++	    {
++	      pd_data pd;
++	      pd.rhs = SSA_VAL (def_rhs);
++	      pd.offset = offset2i - offseti;
++	      pd.size = size2i;
++	      return data->push_partial_def (pd, get_alias_set (lhs), maxsizei);
+ 	    }
+ 	}
+     }
+@@ -2678,6 +3035,7 @@ vn_reference_lookup_3 (ao_ref *ref, tree
+ 	   /* Handling this is more complicated, give up for now.  */
+ 	   && data->partial_defs.is_empty ())
+     {
++      tree lhs = gimple_assign_lhs (def_stmt);
+       tree base2;
+       int i, j, k;
+       auto_vec<vn_reference_op_s> rhs;
+@@ -2747,7 +3105,8 @@ vn_reference_lookup_3 (ao_ref *ref, tree
+ 	}
+ 
+       /* Now re-write REF to be based on the rhs of the assignment.  */
+-      copy_reference_ops_from_ref (gimple_assign_rhs1 (def_stmt), &rhs);
++      tree rhs1 = gimple_assign_rhs1 (def_stmt);
++      copy_reference_ops_from_ref (rhs1, &rhs);
+ 
+       /* Apply an extra offset to the inner MEM_REF of the RHS.  */
+       if (maybe_ne (extra_off, 0))
+@@ -2764,6 +3123,11 @@ vn_reference_lookup_3 (ao_ref *ref, tree
+ 							extra_off));
+ 	}
+ 
++      /* Save the operands since we need to use the original ones for
++	 the hash entry we use.  */
++      if (!data->saved_operands.exists ())
++	data->saved_operands = vr->operands.copy ();
++
+       /* We need to pre-pend vr->operands[0..i] to rhs.  */
+       vec<vn_reference_op_s> old = vr->operands;
+       if (i + 1 + rhs.length () > vr->operands.length ())
+@@ -2780,11 +3144,11 @@ vn_reference_lookup_3 (ao_ref *ref, tree
+       /* Try folding the new reference to a constant.  */
+       tree val = fully_constant_vn_reference_p (vr);
+       if (val)
+-	return vn_reference_lookup_or_insert_for_pieces
+-		 (vuse, vr->set, vr->type, vr->operands, val);
++	return data->finish (get_alias_set (lhs), val);
+ 
+       /* Adjust *ref from the new operands.  */
+-      if (!ao_ref_init_from_vn_reference (&r, vr->set, vr->type, vr->operands))
++      if (!ao_ref_init_from_vn_reference (&r, get_alias_set (rhs1),
++					  vr->type, vr->operands))
+ 	return (void *)-1;
+       /* This can happen with bitfields.  */
+       if (maybe_ne (ref->size, r.size))
+@@ -2793,6 +3157,12 @@ vn_reference_lookup_3 (ao_ref *ref, tree
+ 
+       /* Do not update last seen VUSE after translating.  */
+       data->last_vuse_ptr = NULL;
++      /* Invalidate the original access path since it now contains
++	  the wrong base.  */
++      data->orig_ref.ref = NULL_TREE;
++      /* Use the alias-set of this LHS for recording an eventual result.  */
++      if (data->first_set == -2)
++	data->first_set = get_alias_set (lhs);
+ 
+       /* Keep looking for the adjusted *REF / VR pair.  */
+       return NULL;
+@@ -2912,6 +3282,11 @@ vn_reference_lookup_3 (ao_ref *ref, tree
+       if (!known_subrange_p (at, byte_maxsize, lhs_offset, copy_size))
+ 	return (void *)-1;
+ 
++      /* Save the operands since we need to use the original ones for
++	 the hash entry we use.  */
++      if (!data->saved_operands.exists ())
++	data->saved_operands = vr->operands.copy ();
++
+       /* Make room for 2 operands in the new reference.  */
+       if (vr->operands.length () < 2)
+ 	{
+@@ -2940,11 +3315,10 @@ vn_reference_lookup_3 (ao_ref *ref, tree
+       /* Try folding the new reference to a constant.  */
+       tree val = fully_constant_vn_reference_p (vr);
+       if (val)
+-	return vn_reference_lookup_or_insert_for_pieces
+-		 (vuse, vr->set, vr->type, vr->operands, val);
++	return data->finish (0, val);
+ 
+       /* Adjust *ref from the new operands.  */
+-      if (!ao_ref_init_from_vn_reference (&r, vr->set, vr->type, vr->operands))
++      if (!ao_ref_init_from_vn_reference (&r, 0, vr->type, vr->operands))
+ 	return (void *)-1;
+       /* This can happen with bitfields.  */
+       if (maybe_ne (ref->size, r.size))
+@@ -2953,6 +3327,12 @@ vn_reference_lookup_3 (ao_ref *ref, tree
+ 
+       /* Do not update last seen VUSE after translating.  */
+       data->last_vuse_ptr = NULL;
++      /* Invalidate the original access path since it now contains
++	  the wrong base.  */
++      data->orig_ref.ref = NULL_TREE;
++      /* Use the alias-set of this stmt for recording an eventual result.  */
++      if (data->first_set == -2)
++	data->first_set = 0;
+ 
+       /* Keep looking for the adjusted *REF / VR pair.  */
+       return NULL;
+@@ -3013,13 +3393,13 @@ vn_reference_lookup_pieces (tree vuse, a
+     {
+       ao_ref r;
+       unsigned limit = PARAM_VALUE (PARAM_SCCVN_MAX_ALIAS_QUERIES_PER_ACCESS);
+-      vn_walk_cb_data data (&vr1, NULL, kind, true);
++      vn_walk_cb_data data (&vr1, NULL_TREE, NULL, kind, true, NULL_TREE);
+       if (ao_ref_init_from_vn_reference (&r, set, type, vr1.operands))
+-	*vnresult =
+-	  (vn_reference_t)walk_non_aliased_vuses (&r, vr1.vuse, true,
+-						  vn_reference_lookup_2,
+-						  vn_reference_lookup_3,
+-						  vuse_valueize, limit, &data);
++	*vnresult
++	  = ((vn_reference_t)
++	     walk_non_aliased_vuses (&r, vr1.vuse, true, vn_reference_lookup_2,
++				     vn_reference_lookup_3, vuse_valueize,
++				     limit, &data));
+       gcc_checking_assert (vr1.operands == shared_lookup_references);
+     }
+ 
+@@ -3035,15 +3415,19 @@ vn_reference_lookup_pieces (tree vuse, a
+    was NULL..  VNRESULT will be filled in with the vn_reference_t
+    stored in the hashtable if one exists.  When TBAA_P is false assume
+    we are looking up a store and treat it as having alias-set zero.
+-   *LAST_VUSE_PTR will be updated with the VUSE the value lookup succeeded.  */
++   *LAST_VUSE_PTR will be updated with the VUSE the value lookup succeeded.
++   MASK is either NULL_TREE, or can be an INTEGER_CST if the result of the
++   load is bitwise anded with MASK and so we are only interested in a subset
++   of the bits and can ignore if the other bits are uninitialized or
++   not initialized with constants.  */
+ 
+ tree
+ vn_reference_lookup (tree op, tree vuse, vn_lookup_kind kind,
+-		     vn_reference_t *vnresult, bool tbaa_p, tree *last_vuse_ptr)
++		     vn_reference_t *vnresult, bool tbaa_p,
++		     tree *last_vuse_ptr, tree mask)
+ {
+   vec<vn_reference_op_s> operands;
+   struct vn_reference_s vr1;
+-  tree cst;
+   bool valuezied_anything;
+ 
+   if (vnresult)
+@@ -3055,11 +3439,11 @@ vn_reference_lookup (tree op, tree vuse,
+   vr1.type = TREE_TYPE (op);
+   vr1.set = get_alias_set (op);
+   vr1.hashcode = vn_reference_compute_hash (&vr1);
+-  if ((cst = fully_constant_vn_reference_p (&vr1)))
+-    return cst;
++  if (mask == NULL_TREE)
++    if (tree cst = fully_constant_vn_reference_p (&vr1))
++      return cst;
+ 
+-  if (kind != VN_NOWALK
+-      && vr1.vuse)
++  if (kind != VN_NOWALK && vr1.vuse)
+     {
+       vn_reference_t wvnresult;
+       ao_ref r;
+@@ -3070,23 +3454,32 @@ vn_reference_lookup (tree op, tree vuse,
+ 	  || !ao_ref_init_from_vn_reference (&r, vr1.set, vr1.type,
+ 					     vr1.operands))
+ 	ao_ref_init (&r, op);
+-      vn_walk_cb_data data (&vr1, last_vuse_ptr, kind, tbaa_p);
+-      wvnresult =
+-	(vn_reference_t)walk_non_aliased_vuses (&r, vr1.vuse, tbaa_p,
+-						vn_reference_lookup_2,
+-						vn_reference_lookup_3,
+-						vuse_valueize, limit, &data);
++      vn_walk_cb_data data (&vr1, r.ref ? NULL_TREE : op,
++			    last_vuse_ptr, kind, tbaa_p, mask);
++
++      wvnresult
++       = ((vn_reference_t)
++	   walk_non_aliased_vuses (&r, vr1.vuse, tbaa_p, vn_reference_lookup_2,
++				   vn_reference_lookup_3, vuse_valueize, limit,
++				   &data));
+       gcc_checking_assert (vr1.operands == shared_lookup_references);
+       if (wvnresult)
+ 	{
++	  gcc_assert (mask == NULL_TREE);
+ 	  if (vnresult)
+ 	    *vnresult = wvnresult;
+ 	  return wvnresult->result;
+ 	}
++      else if (mask)
++	return data.masked_result;
+ 
+       return NULL_TREE;
+     }
+ 
++  if (last_vuse_ptr)
++    *last_vuse_ptr = vr1.vuse;
++  if (mask)
++    return NULL_TREE;
+   return vn_reference_lookup_1 (&vr1, vnresult);
+ }
+ 
+@@ -4333,7 +4726,39 @@ visit_nary_op (tree lhs, gassign *stmt)
+ 		}
+ 	    }
+ 	}
+-    default:;
++      break;
++    case BIT_AND_EXPR:
++      if (INTEGRAL_TYPE_P (type)
++	  && TREE_CODE (rhs1) == SSA_NAME
++	  && TREE_CODE (gimple_assign_rhs2 (stmt)) == INTEGER_CST
++	  && !SSA_NAME_OCCURS_IN_ABNORMAL_PHI (rhs1)
++	  && default_vn_walk_kind != VN_NOWALK
++	  && CHAR_BIT == 8
++	  && BITS_PER_UNIT == 8
++	  && BYTES_BIG_ENDIAN == WORDS_BIG_ENDIAN
++	  && !integer_all_onesp (gimple_assign_rhs2 (stmt))
++	  && !integer_zerop (gimple_assign_rhs2 (stmt)))
++	{
++	  gassign *ass = dyn_cast <gassign *> (SSA_NAME_DEF_STMT (rhs1));
++	  if (ass
++	      && !gimple_has_volatile_ops (ass)
++	      && vn_get_stmt_kind (ass) == VN_REFERENCE)
++	    {
++	      tree last_vuse = gimple_vuse (ass);
++	      tree op = gimple_assign_rhs1 (ass);
++	      tree result = vn_reference_lookup (op, gimple_vuse (ass),
++						 default_vn_walk_kind,
++						 NULL, true, &last_vuse,
++						 gimple_assign_rhs2 (stmt));
++	      if (result
++		  && useless_type_conversion_p (TREE_TYPE (result),
++						TREE_TYPE (op)))
++		return set_ssa_val_to (lhs, result);
++	    }
++	}
++      break;
++    default:
++      break;
+     }
+ 
+   bool changed = set_ssa_val_to (lhs, lhs);
+@@ -4844,14 +5269,14 @@ visit_stmt (gimple *stmt, bool backedges
+ 	      switch (vn_get_stmt_kind (ass))
+ 		{
+ 		case VN_NARY:
+-		changed = visit_nary_op (lhs, ass);
+-		break;
++		  changed = visit_nary_op (lhs, ass);
++		  break;
+ 		case VN_REFERENCE:
+-		changed = visit_reference_op_load (lhs, rhs1, ass);
+-		break;
++		  changed = visit_reference_op_load (lhs, rhs1, ass);
++		  break;
+ 		default:
+-		changed = defs_to_varying (ass);
+-		break;
++		  changed = defs_to_varying (ass);
++		  break;
+ 		}
+ 	    }
+ 	}
+@@ -5525,8 +5950,48 @@ eliminate_dom_walker::eliminate_stmt (ba
+       tree val;
+       tree rhs = gimple_assign_rhs1 (stmt);
+       vn_reference_t vnresult;
+-      val = vn_reference_lookup (lhs, gimple_vuse (stmt), VN_WALKREWRITE,
+-				 &vnresult, false);
++      /* ???  gcc.dg/torture/pr91445.c shows that we lookup a boolean
++	 typed load of a byte known to be 0x11 as 1 so a store of
++	 a boolean 1 is detected as redundant.  Because of this we
++	 have to make sure to lookup with a ref where its size
++	 matches the precision.  */
++      tree lookup_lhs = lhs;
++      if (INTEGRAL_TYPE_P (TREE_TYPE (lhs))
++	  && (TREE_CODE (lhs) != COMPONENT_REF
++	      || !DECL_BIT_FIELD_TYPE (TREE_OPERAND (lhs, 1)))
++	  && !type_has_mode_precision_p (TREE_TYPE (lhs)))
++	{
++	  if (TREE_CODE (lhs) == COMPONENT_REF
++	      || TREE_CODE (lhs) == MEM_REF)
++	    {
++	      tree ltype = build_nonstandard_integer_type
++				(TREE_INT_CST_LOW (TYPE_SIZE (TREE_TYPE (lhs))),
++				 TYPE_UNSIGNED (TREE_TYPE (lhs)));
++	      if (TREE_CODE (lhs) == COMPONENT_REF)
++		{
++		  tree foff = component_ref_field_offset (lhs);
++		  tree f = TREE_OPERAND (lhs, 1);
++		  if (!poly_int_tree_p (foff))
++		    lookup_lhs = NULL_TREE;
++		  else
++		    lookup_lhs = build3 (BIT_FIELD_REF, ltype,
++					 TREE_OPERAND (lhs, 0),
++					 TYPE_SIZE (TREE_TYPE (lhs)),
++					 bit_from_pos
++					   (foff, DECL_FIELD_BIT_OFFSET (f)));
++		}
++	      else
++		lookup_lhs = build2 (MEM_REF, ltype,
++				     TREE_OPERAND (lhs, 0),
++				     TREE_OPERAND (lhs, 1));
++	    }
++	  else
++	    lookup_lhs = NULL_TREE;
++	}
++      val = NULL_TREE;
++      if (lookup_lhs)
++	val = vn_reference_lookup (lookup_lhs, gimple_vuse (stmt),
++				   VN_WALKREWRITE, &vnresult, false);
+       if (TREE_CODE (rhs) == SSA_NAME)
+ 	rhs = VN_INFO (rhs)->valnum;
+       if (val
+diff -urpN a/gcc/tree-ssa-sccvn.h b/gcc/tree-ssa-sccvn.h
+--- a/gcc/tree-ssa-sccvn.h	2020-11-26 22:26:32.856000000 -0500
++++ b/gcc/tree-ssa-sccvn.h	2020-11-26 22:06:08.040000000 -0500
+@@ -235,7 +235,7 @@ tree vn_reference_lookup_pieces (tree, a
+ 				 vec<vn_reference_op_s> ,
+ 				 vn_reference_t *, vn_lookup_kind);
+ tree vn_reference_lookup (tree, tree, vn_lookup_kind, vn_reference_t *, bool,
+-			  tree * = NULL);
++			  tree * = NULL, tree = NULL_TREE);
+ void vn_reference_lookup_call (gcall *, vn_reference_t *, vn_reference_t);
+ vn_reference_t vn_reference_insert_pieces (tree, alias_set_type, tree,
+ 					   vec<vn_reference_op_s> ,
diff --git a/simplify-removing-subregs.patch b/simplify-removing-subregs.patch
index fd6cbc6..cfd5804 100644
--- a/simplify-removing-subregs.patch
+++ b/simplify-removing-subregs.patch
@@ -1,3 +1,9 @@
+This backport contains 1 patch from the GCC mainstream tree.
+The commit ID of this patch is listed below.
+
+0001-expand-Simplify-removing-subregs-when-expanding-a-co.patch
+9a182ef9ee011935d827ab5c6c9a7cd8e22257d8
+
 diff -Nurp a/gcc/expr.c b/gcc/expr.c
 --- a/gcc/expr.c	2020-08-05 20:33:04.068000000 +0800
 +++ b/gcc/expr.c	2020-08-05 20:33:21.420000000 +0800
diff --git a/speed-up-DDG-analysis-and-fix-bootstrap-compare-debug.patch b/speed-up-DDG-analysis-and-fix-bootstrap-compare-debug.patch
new file mode 100644
index 0000000..da7b905
--- /dev/null
+++ b/speed-up-DDG-analysis-and-fix-bootstrap-compare-debug.patch
@@ -0,0 +1,718 @@
+This backport contains 2 patches from the GCC mainstream tree.
+The commit IDs of these patches are listed below in chronological order.
+
+728c2e5eeaa91cf708f2b1b1f996653a7eebae59
+0001-modulo-sched-speed-up-DDG-analysis-PR90001.patch
+
+06d5d63d9944691bb4286e5f6b2422cc97148336
+0001-modulo-sched-fix-bootstrap-compare-debug-issue.patch
+
+diff -Nurp a/gcc/ddg.c b/gcc/ddg.c
+--- a/gcc/ddg.c	2020-11-28 18:40:12.371633230 +0800
++++ b/gcc/ddg.c	2020-11-28 18:38:33.835633230 +0800
+@@ -32,9 +32,6 @@ along with GCC; see the file COPYING3.
+ 
+ #ifdef INSN_SCHEDULING
+ 
+-/* A flag indicating that a ddg edge belongs to an SCC or not.  */
+-enum edge_flag {NOT_IN_SCC = 0, IN_SCC};
+-
+ /* Forward declarations.  */
+ static void add_backarc_to_ddg (ddg_ptr, ddg_edge_ptr);
+ static void add_backarc_to_scc (ddg_scc_ptr, ddg_edge_ptr);
+@@ -188,9 +185,6 @@ create_ddg_dep_from_intra_loop_link (ddg
+   else if (DEP_TYPE (link) == REG_DEP_OUTPUT)
+     t = OUTPUT_DEP;
+ 
+-  gcc_assert (!DEBUG_INSN_P (dest_node->insn) || t == ANTI_DEP);
+-  gcc_assert (!DEBUG_INSN_P (src_node->insn) || t == ANTI_DEP);
+-
+   /* We currently choose not to create certain anti-deps edges and
+      compensate for that by generating reg-moves based on the life-range
+      analysis.  The anti-deps that will be deleted are the ones which
+@@ -225,9 +219,9 @@ create_ddg_dep_from_intra_loop_link (ddg
+         }
+     }
+ 
+-   latency = dep_cost (link);
+-   e = create_ddg_edge (src_node, dest_node, t, dt, latency, distance);
+-   add_edge_to_ddg (g, e);
++  latency = dep_cost (link);
++  e = create_ddg_edge (src_node, dest_node, t, dt, latency, distance);
++  add_edge_to_ddg (g, e);
+ }
+ 
+ /* The same as the above function, but it doesn't require a link parameter.  */
+@@ -240,9 +234,6 @@ create_ddg_dep_no_link (ddg_ptr g, ddg_n
+   enum reg_note dep_kind;
+   struct _dep _dep, *dep = &_dep;
+ 
+-  gcc_assert (!DEBUG_INSN_P (to->insn) || d_t == ANTI_DEP);
+-  gcc_assert (!DEBUG_INSN_P (from->insn) || d_t == ANTI_DEP);
+-
+   if (d_t == ANTI_DEP)
+     dep_kind = REG_DEP_ANTI;
+   else if (d_t == OUTPUT_DEP)
+@@ -275,16 +266,15 @@ create_ddg_dep_no_link (ddg_ptr g, ddg_n
+ static void
+ add_cross_iteration_register_deps (ddg_ptr g, df_ref last_def)
+ {
+-  int regno = DF_REF_REGNO (last_def);
+   struct df_link *r_use;
+   int has_use_in_bb_p = false;
+-  rtx_insn *def_insn = DF_REF_INSN (last_def);
+-  ddg_node_ptr last_def_node = get_node_of_insn (g, def_insn);
+-  ddg_node_ptr use_node;
++  int regno = DF_REF_REGNO (last_def);
++  ddg_node_ptr last_def_node = get_node_of_insn (g, DF_REF_INSN (last_def));
+   df_ref first_def = df_bb_regno_first_def_find (g->bb, regno);
++  ddg_node_ptr first_def_node = get_node_of_insn (g, DF_REF_INSN (first_def));
++  ddg_node_ptr use_node;
+ 
+-  gcc_assert (last_def_node);
+-  gcc_assert (first_def);
++  gcc_assert (last_def_node && first_def && first_def_node);
+ 
+   if (flag_checking && DF_REF_ID (last_def) != DF_REF_ID (first_def))
+     {
+@@ -303,6 +293,9 @@ add_cross_iteration_register_deps (ddg_p
+ 
+       rtx_insn *use_insn = DF_REF_INSN (r_use->ref);
+ 
++      if (DEBUG_INSN_P (use_insn))
++	continue;
++
+       /* ??? Do not handle uses with DF_REF_IN_NOTE notes.  */
+       use_node = get_node_of_insn (g, use_insn);
+       gcc_assert (use_node);
+@@ -313,35 +306,28 @@ add_cross_iteration_register_deps (ddg_p
+ 	     iteration.  Any such upwards exposed use appears before
+ 	     the last_def def.  */
+ 	  create_ddg_dep_no_link (g, last_def_node, use_node,
+-				  DEBUG_INSN_P (use_insn) ? ANTI_DEP : TRUE_DEP,
+-				  REG_DEP, 1);
++				  TRUE_DEP, REG_DEP, 1);
+ 	}
+-      else if (!DEBUG_INSN_P (use_insn))
++      else
+ 	{
+ 	  /* Add anti deps from last_def's uses in the current iteration
+ 	     to the first def in the next iteration.  We do not add ANTI
+ 	     dep when there is an intra-loop TRUE dep in the opposite
+ 	     direction, but use regmoves to fix such disregarded ANTI
+ 	     deps when broken.	If the first_def reaches the USE then
+-	     there is such a dep.  */
+-	  ddg_node_ptr first_def_node = get_node_of_insn (g,
+-							  DF_REF_INSN (first_def));
+-
+-	  gcc_assert (first_def_node);
+-
+-         /* Always create the edge if the use node is a branch in
+-            order to prevent the creation of reg-moves.  
+-            If the address that is being auto-inc or auto-dec in LAST_DEF
+-            is used in USE_INSN then do not remove the edge to make sure
+-            reg-moves will not be created for that address.  */
+-          if (DF_REF_ID (last_def) != DF_REF_ID (first_def)
+-              || !flag_modulo_sched_allow_regmoves
++	     there is such a dep.
++	     Always create the edge if the use node is a branch in
++	     order to prevent the creation of reg-moves.
++	     If the address that is being auto-inc or auto-dec in LAST_DEF
++	     is used in USE_INSN then do not remove the edge to make sure
++	     reg-moves will not be created for that address.  */
++	  if (DF_REF_ID (last_def) != DF_REF_ID (first_def)
++	      || !flag_modulo_sched_allow_regmoves
+ 	      || JUMP_P (use_node->insn)
+-              || autoinc_var_is_used_p (DF_REF_INSN (last_def), use_insn)
++	      || autoinc_var_is_used_p (DF_REF_INSN (last_def), use_insn)
+ 	      || def_has_ccmode_p (DF_REF_INSN (last_def)))
+-            create_ddg_dep_no_link (g, use_node, first_def_node, ANTI_DEP,
+-                                    REG_DEP, 1);
+-
++	    create_ddg_dep_no_link (g, use_node, first_def_node, ANTI_DEP,
++				    REG_DEP, 1);
+ 	}
+     }
+   /* Create an inter-loop output dependence between LAST_DEF (which is the
+@@ -351,19 +337,11 @@ add_cross_iteration_register_deps (ddg_p
+      defs starting with a true dependence to a use which can be in the
+      next iteration; followed by an anti dependence of that use to the
+      first def (i.e. if there is a use between the two defs.)  */
+-  if (!has_use_in_bb_p)
+-    {
+-      ddg_node_ptr dest_node;
+-
+-      if (DF_REF_ID (last_def) == DF_REF_ID (first_def))
+-	return;
+-
+-      dest_node = get_node_of_insn (g, DF_REF_INSN (first_def));
+-      gcc_assert (dest_node);
+-      create_ddg_dep_no_link (g, last_def_node, dest_node,
+-			      OUTPUT_DEP, REG_DEP, 1);
+-    }
++  if (!has_use_in_bb_p && DF_REF_ID (last_def) != DF_REF_ID (first_def))
++    create_ddg_dep_no_link (g, last_def_node, first_def_node,
++			    OUTPUT_DEP, REG_DEP, 1);
+ }
++
+ /* Build inter-loop dependencies, by looking at DF analysis backwards.  */
+ static void
+ build_inter_loop_deps (ddg_ptr g)
+@@ -420,13 +398,9 @@ add_intra_loop_mem_dep (ddg_ptr g, ddg_n
+   if (mem_write_insn_p (from->insn))
+     {
+       if (mem_read_insn_p (to->insn))
+-	create_ddg_dep_no_link (g, from, to,
+-				DEBUG_INSN_P (to->insn)
+-				? ANTI_DEP : TRUE_DEP, MEM_DEP, 0);
++	create_ddg_dep_no_link (g, from, to, TRUE_DEP, MEM_DEP, 0);
+       else
+-	create_ddg_dep_no_link (g, from, to,
+-				DEBUG_INSN_P (to->insn)
+-				? ANTI_DEP : OUTPUT_DEP, MEM_DEP, 0);
++	create_ddg_dep_no_link (g, from, to, OUTPUT_DEP, MEM_DEP, 0);
+     }
+   else if (!mem_read_insn_p (to->insn))
+     create_ddg_dep_no_link (g, from, to, ANTI_DEP, MEM_DEP, 0);
+@@ -444,13 +418,9 @@ add_inter_loop_mem_dep (ddg_ptr g, ddg_n
+   if (mem_write_insn_p (from->insn))
+     {
+       if (mem_read_insn_p (to->insn))
+-  	create_ddg_dep_no_link (g, from, to,
+-				DEBUG_INSN_P (to->insn)
+-				? ANTI_DEP : TRUE_DEP, MEM_DEP, 1);
++	create_ddg_dep_no_link (g, from, to, TRUE_DEP, MEM_DEP, 1);
+       else if (from->cuid != to->cuid)
+-  	create_ddg_dep_no_link (g, from, to,
+-				DEBUG_INSN_P (to->insn)
+-				? ANTI_DEP : OUTPUT_DEP, MEM_DEP, 1);
++	create_ddg_dep_no_link (g, from, to, OUTPUT_DEP, MEM_DEP, 1);
+     }
+   else
+     {
+@@ -459,13 +429,9 @@ add_inter_loop_mem_dep (ddg_ptr g, ddg_n
+       else if (from->cuid != to->cuid)
+ 	{
+ 	  create_ddg_dep_no_link (g, from, to, ANTI_DEP, MEM_DEP, 1);
+-	  if (DEBUG_INSN_P (from->insn) || DEBUG_INSN_P (to->insn))
+-	    create_ddg_dep_no_link (g, to, from, ANTI_DEP, MEM_DEP, 1);
+-	  else
+-	    create_ddg_dep_no_link (g, to, from, TRUE_DEP, MEM_DEP, 1);
++	  create_ddg_dep_no_link (g, to, from, TRUE_DEP, MEM_DEP, 1);
+ 	}
+     }
+-
+ }
+ 
+ /* Perform intra-block Data Dependency analysis and connect the nodes in
+@@ -494,20 +460,10 @@ build_intra_loop_deps (ddg_ptr g)
+       sd_iterator_def sd_it;
+       dep_t dep;
+ 
+-      if (! INSN_P (dest_node->insn))
+-	continue;
+-
+       FOR_EACH_DEP (dest_node->insn, SD_LIST_BACK, sd_it, dep)
+ 	{
+ 	  rtx_insn *src_insn = DEP_PRO (dep);
+-	  ddg_node_ptr src_node;
+-
+-	  /* Don't add dependencies on debug insns to non-debug insns
+-	     to avoid codegen differences between -g and -g0.  */
+-	  if (DEBUG_INSN_P (src_insn) && !DEBUG_INSN_P (dest_node->insn))
+-	    continue;
+-
+-	  src_node = get_node_of_insn (g, src_insn);
++	  ddg_node_ptr src_node = get_node_of_insn (g, src_insn);
+ 
+ 	  if (!src_node)
+ 	    continue;
+@@ -524,8 +480,7 @@ build_intra_loop_deps (ddg_ptr g)
+ 	  for (j = 0; j <= i; j++)
+ 	    {
+ 	      ddg_node_ptr j_node = &g->nodes[j];
+-	      if (DEBUG_INSN_P (j_node->insn))
+-		continue;
++
+ 	      if (mem_access_insn_p (j_node->insn))
+ 		{
+ 		  /* Don't bother calculating inter-loop dep if an intra-loop dep
+@@ -564,7 +519,7 @@ create_ddg (basic_block bb, int closing_
+ {
+   ddg_ptr g;
+   rtx_insn *insn, *first_note;
+-  int i;
++  int i, j;
+   int num_nodes = 0;
+ 
+   g = (ddg_ptr) xcalloc (1, sizeof (struct ddg));
+@@ -576,23 +531,21 @@ create_ddg (basic_block bb, int closing_
+   for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb));
+        insn = NEXT_INSN (insn))
+     {
+-      if (! INSN_P (insn) || GET_CODE (PATTERN (insn)) == USE)
++      if (!INSN_P (insn) || GET_CODE (PATTERN (insn)) == USE)
+ 	continue;
+ 
+-      if (DEBUG_INSN_P (insn))
+-	g->num_debug++;
+-      else
++      if (NONDEBUG_INSN_P (insn))
+ 	{
+ 	  if (mem_read_insn_p (insn))
+ 	    g->num_loads++;
+ 	  if (mem_write_insn_p (insn))
+ 	    g->num_stores++;
++	  num_nodes++;
+ 	}
+-      num_nodes++;
+     }
+ 
+   /* There is nothing to do for this BB.  */
+-  if ((num_nodes - g->num_debug) <= 1)
++  if (num_nodes <= 1)
+     {
+       free (g);
+       return NULL;
+@@ -607,32 +560,39 @@ create_ddg (basic_block bb, int closing_
+   for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb));
+        insn = NEXT_INSN (insn))
+     {
+-      if (! INSN_P (insn))
+-	{
+-	  if (! first_note && NOTE_P (insn)
+-	      && NOTE_KIND (insn) !=  NOTE_INSN_BASIC_BLOCK)
+-	    first_note = insn;
+-	  continue;
+-	}
++      if (LABEL_P (insn) || NOTE_INSN_BASIC_BLOCK_P (insn))
++	continue;
++
++      if (!first_note && (INSN_P (insn) || NOTE_P (insn)))
++	first_note = insn;
++
++      if (!INSN_P (insn) || GET_CODE (PATTERN (insn)) == USE)
++	continue;
++
+       if (JUMP_P (insn))
+ 	{
+ 	  gcc_assert (!g->closing_branch);
+ 	  g->closing_branch = &g->nodes[i];
+ 	}
+-      else if (GET_CODE (PATTERN (insn)) == USE)
++
++      if (NONDEBUG_INSN_P (insn))
+ 	{
+-	  if (! first_note)
+-	    first_note = insn;
+-	  continue;
+-	}
++	  g->nodes[i].cuid = i;
++	  g->nodes[i].successors = sbitmap_alloc (num_nodes);
++	  bitmap_clear (g->nodes[i].successors);
++	  g->nodes[i].predecessors = sbitmap_alloc (num_nodes);
++	  bitmap_clear (g->nodes[i].predecessors);
++
++	  gcc_checking_assert (first_note);
++	  g->nodes[i].first_note = first_note;
++
++	  g->nodes[i].aux.count = -1;
++	  g->nodes[i].max_dist = XCNEWVEC (int, num_nodes);
++	  for (j = 0; j < num_nodes; j++)
++	    g->nodes[i].max_dist[j] = -1;
+ 
+-      g->nodes[i].cuid = i;
+-      g->nodes[i].successors = sbitmap_alloc (num_nodes);
+-      bitmap_clear (g->nodes[i].successors);
+-      g->nodes[i].predecessors = sbitmap_alloc (num_nodes);
+-      bitmap_clear (g->nodes[i].predecessors);
+-      g->nodes[i].first_note = (first_note ? first_note : insn);
+-      g->nodes[i++].insn = insn;
++	  g->nodes[i++].insn = insn;
++	}
+       first_note = NULL;
+     }
+ 
+@@ -668,6 +628,7 @@ free_ddg (ddg_ptr g)
+ 	}
+       sbitmap_free (g->nodes[i].successors);
+       sbitmap_free (g->nodes[i].predecessors);
++      free (g->nodes[i].max_dist);
+     }
+   if (g->num_backarcs > 0)
+     free (g->backarcs);
+@@ -792,7 +753,7 @@ create_ddg_edge (ddg_node_ptr src, ddg_n
+   e->latency = l;
+   e->distance = d;
+   e->next_in = e->next_out = NULL;
+-  e->aux.info = 0;
++  e->in_scc = false;
+   return e;
+ }
+ 
+@@ -820,7 +781,7 @@ add_edge_to_ddg (ddg_ptr g ATTRIBUTE_UNU
+    for now that cycles in the data dependence graph contain a single backarc.
+    This simplifies the algorithm, and can be generalized later.  */
+ static void
+-set_recurrence_length (ddg_scc_ptr scc, ddg_ptr g)
++set_recurrence_length (ddg_scc_ptr scc)
+ {
+   int j;
+   int result = -1;
+@@ -828,17 +789,14 @@ set_recurrence_length (ddg_scc_ptr scc,
+   for (j = 0; j < scc->num_backarcs; j++)
+     {
+       ddg_edge_ptr backarc = scc->backarcs[j];
+-      int length;
+       int distance = backarc->distance;
+       ddg_node_ptr src = backarc->dest;
+       ddg_node_ptr dest = backarc->src;
++      int length = src->max_dist[dest->cuid];
++
++      if (length < 0)
++        continue;
+ 
+-      length = longest_simple_path (g, src->cuid, dest->cuid, scc->nodes);
+-      if (length < 0 )
+-	{
+-	  /* fprintf (stderr, "Backarc not on simple cycle in SCC.\n"); */
+-	  continue;
+-	}
+       length += backarc->latency;
+       result = MAX (result, (length / distance));
+     }
+@@ -846,9 +804,9 @@ set_recurrence_length (ddg_scc_ptr scc,
+ }
+ 
+ /* Create a new SCC given the set of its nodes.  Compute its recurrence_length
+-   and mark edges that belong to this scc as IN_SCC.  */
++   and mark edges that belong to this scc.  */
+ static ddg_scc_ptr
+-create_scc (ddg_ptr g, sbitmap nodes)
++create_scc (ddg_ptr g, sbitmap nodes, int id)
+ {
+   ddg_scc_ptr scc;
+   unsigned int u = 0;
+@@ -866,16 +824,18 @@ create_scc (ddg_ptr g, sbitmap nodes)
+       ddg_edge_ptr e;
+       ddg_node_ptr n = &g->nodes[u];
+ 
++      gcc_assert (n->aux.count == -1);
++      n->aux.count = id;
++
+       for (e = n->out; e; e = e->next_out)
+ 	if (bitmap_bit_p (nodes, e->dest->cuid))
+ 	  {
+-	    e->aux.count = IN_SCC;
++	    e->in_scc = true;
+ 	    if (e->distance > 0)
+ 	      add_backarc_to_scc (scc, e);
+ 	  }
+     }
+ 
+-  set_recurrence_length (scc, g);
+   return scc;
+ }
+ 
+@@ -1018,7 +978,7 @@ check_sccs (ddg_all_sccs_ptr sccs, int n
+ ddg_all_sccs_ptr
+ create_ddg_all_sccs (ddg_ptr g)
+ {
+-  int i;
++  int i, j, k, scc, way;
+   int num_nodes = g->num_nodes;
+   auto_sbitmap from (num_nodes);
+   auto_sbitmap to (num_nodes);
+@@ -1038,7 +998,7 @@ create_ddg_all_sccs (ddg_ptr g)
+       ddg_node_ptr dest = backarc->dest;
+ 
+       /* If the backarc already belongs to an SCC, continue.  */
+-      if (backarc->aux.count == IN_SCC)
++      if (backarc->in_scc)
+ 	continue;
+ 
+       bitmap_clear (scc_nodes);
+@@ -1049,10 +1009,52 @@ create_ddg_all_sccs (ddg_ptr g)
+ 
+       if (find_nodes_on_paths (scc_nodes, g, from, to))
+ 	{
+-	  scc = create_scc (g, scc_nodes);
++	  scc = create_scc (g, scc_nodes, sccs->num_sccs);
+ 	  add_scc_to_ddg (sccs, scc);
+ 	}
+     }
++
++  /* Init max_dist arrays for Floyd–Warshall-like
++     longest path calculation algorithm.  */
++  for (k = 0; k < num_nodes; k++)
++    {
++      ddg_edge_ptr e;
++      ddg_node_ptr n = &g->nodes[k];
++
++      if (n->aux.count == -1)
++        continue;
++
++      n->max_dist[k] = 0;
++      for (e = n->out; e; e = e->next_out)
++        if (e->distance == 0 && g->nodes[e->dest->cuid].aux.count == n->aux.count)
++          n->max_dist[e->dest->cuid] = e->latency;
++    }
++
++  /* Run main Floyd-Warshall loop.  We use only non-backarc edges
++     inside each scc.  */
++  for (k = 0; k < num_nodes; k++)
++    {
++      scc = g->nodes[k].aux.count;
++      if (scc != -1)
++        {
++          for (i = 0; i < num_nodes; i++)
++            if (g->nodes[i].aux.count == scc)
++              for (j = 0; j < num_nodes; j++)
++                if (g->nodes[j].aux.count == scc
++                    && g->nodes[i].max_dist[k] >= 0
++                    && g->nodes[k].max_dist[j] >= 0)
++                  {
++                    way = g->nodes[i].max_dist[k] + g->nodes[k].max_dist[j];
++                    if (g->nodes[i].max_dist[j] < way)
++                      g->nodes[i].max_dist[j] = way;
++                  }
++        }
++    }
++
++  /* Calculate recurrence_length using max_dist info.  */
++  for (i = 0; i < sccs->num_sccs; i++)
++    set_recurrence_length (sccs->sccs[i]);
++
+   order_sccs (sccs);
+ 
+   if (flag_checking)
+@@ -1155,72 +1157,4 @@ find_nodes_on_paths (sbitmap result, ddg
+   return bitmap_and (result, reachable_from, reach_to);
+ }
+ 
+-
+-/* Updates the counts of U_NODE's successors (that belong to NODES) to be
+-   at-least as large as the count of U_NODE plus the latency between them.
+-   Sets a bit in TMP for each successor whose count was changed (increased).
+-   Returns nonzero if any count was changed.  */
+-static int
+-update_dist_to_successors (ddg_node_ptr u_node, sbitmap nodes, sbitmap tmp)
+-{
+-  ddg_edge_ptr e;
+-  int result = 0;
+-
+-  for (e = u_node->out; e; e = e->next_out)
+-    {
+-      ddg_node_ptr v_node = e->dest;
+-      int v = v_node->cuid;
+-
+-      if (bitmap_bit_p (nodes, v)
+-	  && (e->distance == 0)
+-	  && (v_node->aux.count < u_node->aux.count + e->latency))
+-	{
+-	  v_node->aux.count = u_node->aux.count + e->latency;
+-	  bitmap_set_bit (tmp, v);
+-	  result = 1;
+-	}
+-    }
+-  return result;
+-}
+-
+-
+-/* Find the length of a longest path from SRC to DEST in G,
+-   going only through NODES, and disregarding backarcs.  */
+-int
+-longest_simple_path (struct ddg * g, int src, int dest, sbitmap nodes)
+-{
+-  int i;
+-  unsigned int u = 0;
+-  int change = 1;
+-  int num_nodes = g->num_nodes;
+-  auto_sbitmap workset (num_nodes);
+-  auto_sbitmap tmp (num_nodes);
+-
+-
+-  /* Data will hold the distance of the longest path found so far from
+-     src to each node.  Initialize to -1 = less than minimum.  */
+-  for (i = 0; i < g->num_nodes; i++)
+-    g->nodes[i].aux.count = -1;
+-  g->nodes[src].aux.count = 0;
+-
+-  bitmap_clear (tmp);
+-  bitmap_set_bit (tmp, src);
+-
+-  while (change)
+-    {
+-      sbitmap_iterator sbi;
+-
+-      change = 0;
+-      bitmap_copy (workset, tmp);
+-      bitmap_clear (tmp);
+-      EXECUTE_IF_SET_IN_BITMAP (workset, 0, u, sbi)
+-	{
+-	  ddg_node_ptr u_node = &g->nodes[u];
+-
+-	  change |= update_dist_to_successors (u_node, nodes, tmp);
+-	}
+-    }
+-  return g->nodes[dest].aux.count;
+-}
+-
+ #endif /* INSN_SCHEDULING */
+diff -Nurp a/gcc/ddg.h b/gcc/ddg.h
+--- a/gcc/ddg.h	2020-03-12 19:07:21.000000000 +0800
++++ b/gcc/ddg.h	2020-11-28 18:38:33.835633230 +0800
+@@ -64,6 +64,10 @@ struct ddg_node
+   sbitmap successors;
+   sbitmap predecessors;
+ 
++  /* Temporary array used for Floyd-Warshall algorithm to find
++     scc recurrence length.  */
++  int *max_dist;
++
+   /* For general use by algorithms manipulating the ddg.  */
+   union {
+     int count;
+@@ -95,11 +99,8 @@ struct ddg_edge
+   ddg_edge_ptr next_in;
+   ddg_edge_ptr next_out;
+ 
+-  /* For general use by algorithms manipulating the ddg.  */
+-  union {
+-    int count;
+-    void *info;
+-  } aux;
++  /* Is true when edge is already in scc.  */
++  bool in_scc;
+ };
+ 
+ /* This structure holds the Data Dependence Graph for a basic block.  */
+@@ -115,9 +116,6 @@ struct ddg
+   int num_loads;
+   int num_stores;
+ 
+-  /* Number of debug instructions in the BB.  */
+-  int num_debug;
+-
+   /* This array holds the nodes in the graph; it is indexed by the node
+      cuid, which follows the order of the instructions in the BB.  */
+   ddg_node_ptr nodes;
+@@ -178,7 +176,6 @@ ddg_all_sccs_ptr create_ddg_all_sccs (dd
+ void free_ddg_all_sccs (ddg_all_sccs_ptr);
+ 
+ int find_nodes_on_paths (sbitmap result, ddg_ptr, sbitmap from, sbitmap to);
+-int longest_simple_path (ddg_ptr, int from, int to, sbitmap via);
+ 
+ bool autoinc_var_is_used_p (rtx_insn *, rtx_insn *);
+ 
+diff -Nurp a/gcc/modulo-sched.c b/gcc/modulo-sched.c
+--- a/gcc/modulo-sched.c	2020-03-12 19:07:21.000000000 +0800
++++ b/gcc/modulo-sched.c	2020-11-28 18:38:33.835633230 +0800
+@@ -370,7 +370,7 @@ doloop_register_get (rtx_insn *head, rtx
+                              : prev_nondebug_insn (tail));
+ 
+   for (insn = head; insn != first_insn_not_to_check; insn = NEXT_INSN (insn))
+-    if (!DEBUG_INSN_P (insn) && reg_mentioned_p (reg, insn))
++    if (NONDEBUG_INSN_P (insn) && reg_mentioned_p (reg, insn))
+       {
+         if (dump_file)
+         {
+@@ -429,7 +429,7 @@ res_MII (ddg_ptr g)
+   if (targetm.sched.sms_res_mii)
+     return targetm.sched.sms_res_mii (g);
+ 
+-  return ((g->num_nodes - g->num_debug) / issue_rate);
++  return g->num_nodes / issue_rate;
+ }
+ 
+ 
+@@ -2156,11 +2156,7 @@ sms_schedule_by_order (ddg_ptr g, int mi
+   	  ddg_node_ptr u_node = &ps->g->nodes[u];
+ 	  rtx_insn *insn = u_node->insn;
+ 
+-	  if (!NONDEBUG_INSN_P (insn))
+-	    {
+-	      bitmap_clear_bit (tobe_scheduled, u);
+-	      continue;
+-	    }
++	  gcc_checking_assert (NONDEBUG_INSN_P (insn));
+ 
+ 	  if (bitmap_bit_p (sched_nodes, u))
+ 	    continue;
+@@ -3162,9 +3158,6 @@ ps_has_conflicts (partial_schedule_ptr p
+ 	{
+ 	  rtx_insn *insn = ps_rtl_insn (ps, crr_insn->id);
+ 
+-	  if (!NONDEBUG_INSN_P (insn))
+-	    continue;
+-
+ 	  /* Check if there is room for the current insn.  */
+ 	  if (!can_issue_more || state_dead_lock_p (curr_state))
+ 	    return true;
+diff -Nurp a/gcc/testsuite/gcc.c-torture/execute/pr70127-debug-sms.c b/gcc/testsuite/gcc.c-torture/execute/pr70127-debug-sms.c
+--- a/gcc/testsuite/gcc.c-torture/execute/pr70127-debug-sms.c	1970-01-01 08:00:00.000000000 +0800
++++ b/gcc/testsuite/gcc.c-torture/execute/pr70127-debug-sms.c	2020-11-28 18:38:33.835633230 +0800
+@@ -0,0 +1,23 @@
++/* { dg-additional-options "-fcompare-debug -fmodulo-sched" } */
++
++struct S { int f; signed int g : 2; } a[1], c = {5, 1}, d;
++short b;
++
++__attribute__((noinline, noclone)) void
++foo (int x)
++{
++  if (x != 1)
++    __builtin_abort ();
++}
++
++int
++main ()
++{
++  while (b++ <= 0)
++    {
++      struct S e = {1, 1};
++      d = e = a[0] = c;
++    }
++  foo (a[0].g);
++  return 0;
++}
+diff -Nurp a/gcc/testsuite/gcc.dg/torture/pr87197-debug-sms.c b/gcc/testsuite/gcc.dg/torture/pr87197-debug-sms.c
+--- a/gcc/testsuite/gcc.dg/torture/pr87197-debug-sms.c	1970-01-01 08:00:00.000000000 +0800
++++ b/gcc/testsuite/gcc.dg/torture/pr87197-debug-sms.c	2020-11-28 18:38:33.835633230 +0800
+@@ -0,0 +1,36 @@
++/* { dg-do compile } */
++/* { dg-additional-options "-fcompare-debug -fmodulo-sched --param sms-min-sc=1" } */
++
++int a, c, e, f, g;
++void
++h (int i)
++{
++  a = i;
++}
++void
++j (char *i, long k)
++{
++  while (k--)
++    c = *i++;
++}
++void
++l (unsigned char *i, long k)
++{
++  unsigned char *b = i + k;
++  while (i < b)
++    {
++      h (*i);
++      i++;
++    }
++}
++void
++m ()
++{
++  while (e)
++    {
++      float d = g;
++      l ((char *) &d, sizeof (g));
++      if (f)
++	j ((char *) &d, sizeof (g));
++    }
++}
diff --git a/store-merging-Consider-also-overlapping-stores-earlier.patch b/store-merging-Consider-also-overlapping-stores-earlier.patch
new file mode 100644
index 0000000..15dd0d1
--- /dev/null
+++ b/store-merging-Consider-also-overlapping-stores-earlier.patch
@@ -0,0 +1,359 @@
+This backport contains 1 patch from the GCC mainstream tree.
+The commit ID of this patch is listed below.
+
+0001-store-merging-Consider-also-overlapping-stores-earli.patch
+bd909071ac04e94f4b6f0baab64d0687ec55681d
+
+diff -uprN a/gcc/gimple-ssa-store-merging.c b/gcc/gimple-ssa-store-merging.c
+--- a/gcc/gimple-ssa-store-merging.c	2020-12-16 17:03:16.155633230 +0800
++++ b/gcc/gimple-ssa-store-merging.c	2020-12-16 11:15:58.575633230 +0800
+@@ -2021,7 +2021,8 @@ struct imm_store_chain_info
+       }
+   }
+   bool terminate_and_process_chain ();
+-  bool try_coalesce_bswap (merged_store_group *, unsigned int, unsigned int);
++  bool try_coalesce_bswap (merged_store_group *, unsigned int, unsigned int,
++			   unsigned int);
+   bool coalesce_immediate_stores ();
+   bool output_merged_store (merged_store_group *);
+   bool output_merged_stores ();
+@@ -2342,14 +2343,39 @@ gather_bswap_load_refs (vec<tree> *refs,
+    into the group.  That way it will be its own store group and will
+    not be touched.  If ALL_INTEGER_CST_P and there are overlapping
+    INTEGER_CST stores, those are mergeable using merge_overlapping,
+-   so don't return false for those.  */
++   so don't return false for those.
++
++   Similarly, check stores from FIRST_EARLIER (inclusive) to END_EARLIER
++   (exclusive), whether they don't overlap the bitrange START to END
++   and have order in between FIRST_ORDER and LAST_ORDER.  This is to
++   prevent merging in cases like:
++     MEM  [&b + 8B] = {};
++     MEM[(short *) &b] = 5;
++     _5 = *x_4(D);
++     MEM  [&b + 2B] = _5;
++     MEM[(char *)&b + 16B] = 88;
++     MEM[(int *)&b + 20B] = 1;
++   The = {} store comes in sort_by_bitpos before the = 88 store, and can't
++   be merged with it, because the = _5 store overlaps these and is in between
++   them in sort_by_order ordering.  If it was merged, the merged store would
++   go after the = _5 store and thus change behavior.  */
+ 
+ static bool
+ check_no_overlap (vec<store_immediate_info *> m_store_info, unsigned int i,
+-		  bool all_integer_cst_p, unsigned int last_order,
+-		  unsigned HOST_WIDE_INT end)
++		  bool all_integer_cst_p, unsigned int first_order,
++		  unsigned int last_order, unsigned HOST_WIDE_INT start,
++		  unsigned HOST_WIDE_INT end, unsigned int first_earlier,
++		  unsigned end_earlier)
+ {
+   unsigned int len = m_store_info.length ();
++  for (unsigned int j = first_earlier; j < end_earlier; j++)
++    {
++      store_immediate_info *info = m_store_info[j];
++      if (info->order > first_order
++	  && info->order < last_order
++	  && info->bitpos + info->bitsize > start)
++	return false;
++    }
+   for (++i; i < len; ++i)
+     {
+       store_immediate_info *info = m_store_info[i];
+@@ -2370,7 +2396,8 @@ check_no_overlap (vecbitsize;
+@@ -2509,7 +2536,8 @@ imm_store_chain_info::try_coalesce_bswap
+   if (n.base_addr == NULL_TREE && !is_gimple_val (n.src))
+     return false;
+ 
+-  if (!check_no_overlap (m_store_info, last, false, last_order, end))
++  if (!check_no_overlap (m_store_info, last, false, first_order, last_order,
++			 merged_store->start, end, first_earlier, first))
+     return false;
+ 
+   /* Don't handle memory copy this way if normal non-bswap processing
+@@ -2601,6 +2629,8 @@ imm_store_chain_info::coalesce_immediate
+ 
+   store_immediate_info *info;
+   unsigned int i, ignore = 0;
++  unsigned int first_earlier = 0;
++  unsigned int end_earlier = 0;
+ 
+   /* Order the stores by the bitposition they write to.  */
+   m_store_info.qsort (sort_by_bitpos);
+@@ -2615,6 +2645,12 @@ imm_store_chain_info::coalesce_immediate
+       if (i <= ignore)
+ 	goto done;
+ 
++      while (first_earlier < end_earlier
++	     && (m_store_info[first_earlier]->bitpos
++		 + m_store_info[first_earlier]->bitsize
++		 <= merged_store->start))
++	first_earlier++;
++
+       /* First try to handle group of stores like:
+ 	 p[0] = data >> 24;
+ 	 p[1] = data >> 16;
+@@ -2628,7 +2664,8 @@ imm_store_chain_info::coalesce_immediate
+ 	{
+ 	  unsigned int try_size;
+ 	  for (try_size = 64; try_size >= 16; try_size >>= 1)
+-	    if (try_coalesce_bswap (merged_store, i - 1, try_size))
++	    if (try_coalesce_bswap (merged_store, i - 1, try_size,
++				    first_earlier))
+ 	      break;
+ 
+ 	  if (try_size >= 16)
+@@ -2636,7 +2673,10 @@ imm_store_chain_info::coalesce_immediate
+ 	      ignore = i + merged_store->stores.length () - 1;
+ 	      m_merged_store_groups.safe_push (merged_store);
+ 	      if (ignore < m_store_info.length ())
+-		merged_store = new merged_store_group (m_store_info[ignore]);
++		{
++		  merged_store = new merged_store_group (m_store_info[ignore]);
++		  end_earlier = ignore;
++		}
+ 	      else
+ 		merged_store = NULL;
+ 	      goto done;
+@@ -2662,12 +2702,16 @@ imm_store_chain_info::coalesce_immediate
+ 	  /* Only allow overlapping stores of constants.  */
+ 	  if (info->rhs_code == INTEGER_CST && merged_store->only_constants)
+ 	    {
++	      unsigned int first_order
++		= MIN (merged_store->first_order, info->order);
+ 	      unsigned int last_order
+ 		= MAX (merged_store->last_order, info->order);
+ 	      unsigned HOST_WIDE_INT end
+ 		= MAX (merged_store->start + merged_store->width,
+ 		       info->bitpos + info->bitsize);
+-	      if (check_no_overlap (m_store_info, i, true, last_order, end))
++	      if (check_no_overlap (m_store_info, i, true, first_order,
++				    last_order, merged_store->start, end,
++				    first_earlier, end_earlier))
+ 		{
+ 		  /* check_no_overlap call above made sure there are no
+ 		     overlapping stores with non-INTEGER_CST rhs_code
+@@ -2696,6 +2740,7 @@ imm_store_chain_info::coalesce_immediate
+ 		  do
+ 		    {
+ 		      unsigned int max_order = 0;
++		      unsigned int min_order = first_order;
+ 		      unsigned first_nonmergeable_int_order = ~0U;
+ 		      unsigned HOST_WIDE_INT this_end = end;
+ 		      k = i;
+@@ -2721,6 +2766,7 @@ imm_store_chain_info::coalesce_immediate
+ 				  break;
+ 				}
+ 			      k = j;
++			      min_order = MIN (min_order, info2->order);
+ 			      this_end = MAX (this_end,
+ 					      info2->bitpos + info2->bitsize);
+ 			    }
+@@ -2736,6 +2782,12 @@ imm_store_chain_info::coalesce_immediate
+ 			    first_nonmergeable_order
+ 			      = MIN (first_nonmergeable_order, info2->order);
+ 			}
++		      if (k > i
++			  && !check_no_overlap (m_store_info, len - 1, true,
++						min_order, try_order,
++						merged_store->start, this_end,
++						first_earlier, end_earlier))
++			k = 0;
+ 		      if (k == 0)
+ 			{
+ 			  if (last_order == try_order)
+@@ -2821,9 +2873,12 @@ imm_store_chain_info::coalesce_immediate
+ 	      info->ops_swapped_p = true;
+ 	    }
+ 	  if (check_no_overlap (m_store_info, i, false,
++				MIN (merged_store->first_order, info->order),
+ 				MAX (merged_store->last_order, info->order),
++				merged_store->start,
+ 				MAX (merged_store->start + merged_store->width,
+-				     info->bitpos + info->bitsize)))
++				     info->bitpos + info->bitsize),
++				first_earlier, end_earlier))
+ 	    {
+ 	      /* Turn MEM_REF into BIT_INSERT_EXPR for bit-field stores.  */
+ 	      if (info->rhs_code == MEM_REF && infof->rhs_code != MEM_REF)
+@@ -2868,6 +2923,7 @@ imm_store_chain_info::coalesce_immediate
+ 	delete merged_store;
+ 
+       merged_store = new merged_store_group (info);
++      end_earlier = i;
+       if (dump_file && (dump_flags & TDF_DETAILS))
+ 	fputs ("New store group\n", dump_file);
+ 
+diff -uprN a/gcc/testsuite/gcc.dg/store_merging_31.c b/gcc/testsuite/gcc.dg/store_merging_31.c
+--- a/gcc/testsuite/gcc.dg/store_merging_31.c	1970-01-01 08:00:00.000000000 +0800
++++ b/gcc/testsuite/gcc.dg/store_merging_31.c	2020-12-16 11:15:58.575633230 +0800
+@@ -0,0 +1,27 @@
++/* PR tree-optimization/97053 */
++/* { dg-do run } */
++/* { dg-options "-O2" } */
++
++struct S { short a; char b[9]; int c; char d; int e; };
++
++__attribute__((noipa)) void
++foo (char *x, char *y)
++{
++  if (__builtin_strcmp (x, "ABCDXXXX") != 0
++      || __builtin_strcmp (y, "ABCDXXXX") != 0)
++    __builtin_abort ();
++}
++
++int
++main ()
++{
++  char a[9] = "XXXXXXXX";
++  struct S b = {};
++  __builtin_memcpy (a, "ABCD", 4);
++  b.a = 5;
++  __builtin_memcpy (b.b, a, 8); 
++  b.d = 'X';
++  b.e = 1;
++  foo (a, b.b);
++  return 0;
++}
+diff -uprN a/gcc/testsuite/gcc.dg/store_merging_32.c b/gcc/testsuite/gcc.dg/store_merging_32.c
+--- a/gcc/testsuite/gcc.dg/store_merging_32.c	1970-01-01 08:00:00.000000000 +0800
++++ b/gcc/testsuite/gcc.dg/store_merging_32.c	2020-12-16 11:15:58.575633230 +0800
+@@ -0,0 +1,129 @@
++/* PR tree-optimization/97053 */
++/* { dg-do run } */
++/* { dg-options "-O2 -fno-tree-dse" } */
++
++struct __attribute__((packed, may_alias)) S { long long s; };
++struct __attribute__((packed, may_alias)) T { short t; };
++
++__attribute__((noipa)) void
++test (char *p, char *q, int s)
++{
++  if ((s & 1) == 0)
++    {
++      if (*(short __attribute__((may_alias)) *) &p[sizeof (short)]
++	  != *(short __attribute__((may_alias)) *) &q[sizeof (short)]
++	  || (((struct S __attribute__((may_alias)) *) &p[1])->s
++	      != ((struct S __attribute__((may_alias)) *) &q[1])->s)
++	  || (*(short __attribute__((may_alias)) *) &p[2 * sizeof (short)]
++	      != *(short __attribute__((may_alias)) *) &q[2 * sizeof (short)]))
++	__builtin_abort ();
++    }
++  else
++    {
++      if (*(short __attribute__((may_alias)) *) &p[sizeof (short)]
++	  != *(short __attribute__((may_alias)) *) &q[sizeof (short)]
++	  || (((struct S __attribute__((may_alias)) *) &p[1])->s
++	      != ((struct S __attribute__((may_alias)) *) &q[1])->s)
++	  || (((struct T __attribute__((may_alias)) *) &p[2 * sizeof (short) - 1])->t
++	      != ((struct T __attribute__((may_alias)) *) &q[2 * sizeof (short) - 1])->t)
++	  || p[3 * sizeof (short) - 2] != q[3 * sizeof (short) - 2])
++	__builtin_abort ();
++    }
++}
++
++__attribute__((noipa)) void
++foo (long long *p, char *q, char *r, char *s)
++{
++  char a[64] __attribute__((aligned (__alignof (short))));
++  *(short __attribute__((may_alias)) *) &a[sizeof (short)] = 1;
++  ((struct S __attribute__((may_alias)) *) &a[1])->s = p[0];
++  *(short __attribute__((may_alias)) *) &a[2 * sizeof (short)] = 2;
++  *(short __attribute__((may_alias)) *) &q[sizeof (short)] = 1;
++  ((struct S __attribute__((may_alias)) *) &r[1])->s = p[0];
++  *(short __attribute__((may_alias)) *) &s[2 * sizeof (short)] = 2;
++  test (a, q, 0);
++}
++
++__attribute__((noipa)) void
++bar (long long *p, char *q, char *r, char *s, char *t)
++{
++  char a[64] __attribute__((aligned (__alignof (short))));
++  *(short __attribute__((may_alias)) *) &a[sizeof (short)] = 1;
++  ((struct S __attribute__((may_alias)) *) &a[1])->s = p[0];
++  ((struct T __attribute__((may_alias)) *) &a[2 * sizeof (short) - 1])->t = 2;
++  a[3 * sizeof (short) - 2] = 3;
++  *(short __attribute__((may_alias)) *) &q[sizeof (short)] = 1;
++  ((struct S __attribute__((may_alias)) *) &r[1])->s = p[0];
++  ((struct T __attribute__((may_alias)) *) &s[2 * sizeof (short) - 1])->t = 2;
++  t[3 * sizeof (short) - 2] = 3;
++  test (a, q, 1);
++}
++
++__attribute__((noipa)) void
++baz (long long *p, char *q, char *r, char *s)
++{
++  char a[64] __attribute__((aligned (__alignof (short))));
++  *(short __attribute__((may_alias)) *) &a[2 * sizeof (short)] = 2;
++  ((struct S __attribute__((may_alias)) *) &a[1])->s = p[0];
++  *(short __attribute__((may_alias)) *) &a[sizeof (short)] = 1;
++  *(short __attribute__((may_alias)) *) &q[2 * sizeof (short)] = 2;
++  ((struct S __attribute__((may_alias)) *) &r[1])->s = p[0];
++  *(short __attribute__((may_alias)) *) &s[sizeof (short)] = 1;
++  test (a, q, 2);
++}
++
++__attribute__((noipa)) void
++qux (long long *p, char *q, char *r, char *s, char *t)
++{
++  char a[64] __attribute__((aligned (__alignof (short))));
++  *(short __attribute__((may_alias)) *) &a[2 * sizeof (short) - 1] = 2;
++  ((struct S __attribute__((may_alias)) *) &a[1])->s = p[0];
++  a[3 * sizeof (short) - 2] = 3;
++  *(short __attribute__((may_alias)) *) &a[sizeof (short)] = 1;
++  ((struct T __attribute__((may_alias)) *) &q[2 * sizeof (short) - 1])->t = 2;
++  ((struct S __attribute__((may_alias)) *) &r[1])->s = p[0];
++  s[3 * sizeof (short) - 2] = 3;
++  ((struct T __attribute__((may_alias)) *) &t[sizeof (short)])->t = 1;
++  test (a, q, 3);
++}
++
++__attribute__((noipa)) void
++corge (long long *p, char *q, char *r, char *s, short u[3])
++{
++  char a[64] __attribute__((aligned (__alignof (short))));
++  *(short __attribute__((may_alias)) *) &a[2 * sizeof (short)] = u[2];
++  ((struct S __attribute__((may_alias)) *) &a[1])->s = p[0];
++  *(short __attribute__((may_alias)) *) &a[sizeof (short)] = u[1];
++  *(short __attribute__((may_alias)) *) &q[2 * sizeof (short)] = u[2];
++  ((struct S __attribute__((may_alias)) *) &r[1])->s = p[0];
++  *(short __attribute__((may_alias)) *) &s[sizeof (short)] = u[1];
++  test (a, q, 4);
++}
++
++__attribute__((noipa)) void
++garply (long long *p, char *q, char *r, char *s, short u[3])
++{
++  char a[64] __attribute__((aligned (__alignof (short))));
++  *(short __attribute__((may_alias)) *) &a[sizeof (short)] = u[1];
++  ((struct S __attribute__((may_alias)) *) &a[1])->s = p[0];
++  *(short __attribute__((may_alias)) *) &a[2 * sizeof (short)] = u[2];
++  *(short __attribute__((may_alias)) *) &s[sizeof (short)] = u[1];
++  ((struct S __attribute__((may_alias)) *) &r[1])->s = p[0];
++  *(short __attribute__((may_alias)) *) &q[2 * sizeof (short)] = u[2];
++  test (a, q, 6);
++}
++
++int
++main ()
++{
++  char a[64] __attribute__((aligned (__alignof (short))));
++  long long p = -1LL;
++  short u[] = { 1, 2, 3 };
++  foo (&p, &a[0], &a[0], &a[0]);
++  bar (&p, &a[0], &a[0], &a[0], &a[0]);
++  baz (&p, &a[0], &a[0], &a[0]);
++  qux (&p, &a[0], &a[0], &a[0], &a[0]);
++  corge (&p, &a[0], &a[0], &a[0], u);
++  garply (&p, &a[0], &a[0], &a[0], u);
++  return 0;
++}
diff --git a/tree-optimization-96920-another-ICE-when-vectorizing.patch b/tree-optimization-96920-another-ICE-when-vectorizing.patch
new file mode 100644
index 0000000..ae6122c
--- /dev/null
+++ b/tree-optimization-96920-another-ICE-when-vectorizing.patch
@@ -0,0 +1,316 @@
+This backport contains 1 patch from the GCC mainstream tree.
+The commit ID of this patch is listed below.
+
+46a58c779af3055a4b10b285a1f4be28abe4351c
+0001-tree-optimization-96920-another-ICE-when-vectorizing.patch
+
+diff -uprN a/gcc/testsuite/gcc.dg/vect/pr96920.c b/gcc/testsuite/gcc.dg/vect/pr96920.c
+--- a/gcc/testsuite/gcc.dg/vect/pr96920.c	1970-01-01 08:00:00.000000000 +0800
++++ b/gcc/testsuite/gcc.dg/vect/pr96920.c	2020-10-26 21:46:25.316000000 +0800
+@@ -0,0 +1,20 @@
++/* { dg-do compile } */
++
++int a[1024];
++int b[2048];
++
++void foo (int x, int y)
++{
++  for (int i = 0; i < 1024; ++i)
++    {
++      int tem0 = b[2*i];
++      int tem1 = b[2*i+1];
++      for (int j = 0; j < 32; ++j)
++	{
++	  int tem = tem0;
++	  tem0 = tem1;
++	  tem1 = tem;
++	  a[i] += tem0;
++	}
++    }
++}
+diff -uprN a/gcc/testsuite/gfortran.dg/vect/pr96920.f90 b/gcc/testsuite/gfortran.dg/vect/pr96920.f90
+--- a/gcc/testsuite/gfortran.dg/vect/pr96920.f90	1970-01-01 08:00:00.000000000 +0800
++++ b/gcc/testsuite/gfortran.dg/vect/pr96920.f90	2020-10-26 21:46:25.316000000 +0800
+@@ -0,0 +1,37 @@
++! { dg-do compile }
++      subroutine ice(npoint, nterm, x, g)
++      implicit none
++      integer    norder
++      parameter (norder=10)
++      integer j
++      integer k
++      integer ii
++      integer nterm
++      integer npoint
++      real b(norder)
++      real c(norder)
++      real d(norder)
++      real x(npoint)
++      real g(npoint)
++      real gg
++      real prev
++      real prev2
++
++          j = 1
++    100   continue
++          j = j+1
++          if (nterm == j)  then
++             do ii=1,npoint
++                k = nterm
++                gg= d(k)
++                prev= 0.0
++                do k=k-1,1,-1
++                   prev2= prev
++                   prev= gg
++                   gg = d(k)+(x(ii)-b(k))*prev-c(k+1)*prev2
++                enddo
++                g(ii) = gg
++             enddo
++          endif
++          go to 100
++      end
+diff -uprN a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
+--- a/gcc/tree-vect-loop.c	2020-10-26 21:45:23.056000000 +0800
++++ b/gcc/tree-vect-loop.c	2020-10-26 21:49:02.884000000 +0800
+@@ -8166,6 +8166,47 @@ scale_profile_for_vect_loop (struct loop
+     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
+ }
+ 
++/* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
++   latch edge values originally defined by it.  */
++
++static void
++maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
++				     stmt_vec_info def_stmt_info)
++{
++  tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
++  if (!def || TREE_CODE (def) != SSA_NAME)
++    return;
++  stmt_vec_info phi_info;
++  imm_use_iterator iter;
++  use_operand_p use_p;
++  FOR_EACH_IMM_USE_FAST (use_p, iter, def)
++    if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
++      if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
++	  && (phi_info = loop_vinfo->lookup_stmt (phi))
++	  && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
++	  && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
++	  && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
++	{
++	  loop_p loop = gimple_bb (phi)->loop_father;
++	  edge e = loop_latch_edge (loop);
++	  if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
++	    {
++	      stmt_vec_info phi_defs = STMT_VINFO_VEC_STMT (phi_info);
++	      stmt_vec_info latch_defs = STMT_VINFO_VEC_STMT (def_stmt_info);
++	      while (phi_defs && latch_defs)
++		{
++		  add_phi_arg (as_a <gphi *> (phi_defs->stmt),
++			       gimple_get_lhs (latch_defs->stmt), e,
++			       gimple_phi_arg_location (phi, e->dest_idx));
++		  phi_defs = STMT_VINFO_RELATED_STMT (phi_defs);
++		  latch_defs = STMT_VINFO_RELATED_STMT (latch_defs);
++		}
++	      gcc_assert (!latch_defs);
++	      gcc_assert (!phi_defs);
++	    }
++	}
++}
++
+ /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
+    When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
+    stmt_vec_info.  */
+@@ -8533,7 +8574,7 @@ vect_transform_loop (loop_vec_info loop_
+ 
+       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
+ 	   gsi_next (&si))
+-        {
++	{
+ 	  gphi *phi = si.phi ();
+ 	  if (dump_enabled_p ())
+ 	    dump_printf_loc (MSG_NOTE, vect_location,
+@@ -8568,6 +8609,27 @@ vect_transform_loop (loop_vec_info loop_
+ 	    }
+ 	}
+ 
++      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
++	   gsi_next (&si))
++	{
++	  gphi *phi = si.phi ();
++	  stmt_info = loop_vinfo->lookup_stmt (phi);
++	  if (!stmt_info)
++	    continue;
++
++	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
++	      && !STMT_VINFO_LIVE_P (stmt_info))
++	    continue;
++
++	  if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
++	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
++	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
++	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
++	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
++	      && ! PURE_SLP_STMT (stmt_info))
++	    maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
++	}
++
+       for (gimple_stmt_iterator si = gsi_start_bb (bb);
+ 	   !gsi_end_p (si);)
+ 	{
+@@ -8604,9 +8666,16 @@ vect_transform_loop (loop_vec_info loop_
+ 			= STMT_VINFO_RELATED_STMT (stmt_info);
+ 		      vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
+ 						&seen_store);
++		      maybe_set_vectorized_backedge_value (loop_vinfo,
++				      			   pat_stmt_info);
++		    }
++		  else
++		    {
++		      vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
++				      		&seen_store);
++		      maybe_set_vectorized_backedge_value (loop_vinfo,
++				      			   stmt_info);
+ 		    }
+-		  vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
+-					    &seen_store);
+ 		}
+ 	      gsi_next (&si);
+ 	      if (seen_store)
+@@ -8623,43 +8692,6 @@ vect_transform_loop (loop_vec_info loop_
+ 	    }
+ 	}
+ 
+-      /* Fill in backedge defs of reductions.  */
+-      for (unsigned i = 0; i < loop_vinfo->reduc_latch_defs.length (); ++i)
+-	{
+-	  stmt_vec_info stmt_info = loop_vinfo->reduc_latch_defs[i];
+-	  stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
+-	  stmt_vec_info phi_info
+-	    = STMT_VINFO_VEC_STMT (STMT_VINFO_REDUC_DEF (orig_stmt_info));
+-	  stmt_vec_info vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
+-	  gphi *phi
+-	    = dyn_cast <gphi *> (STMT_VINFO_REDUC_DEF (orig_stmt_info)->stmt);
+-	  edge e = loop_latch_edge (gimple_bb (phi_info->stmt)->loop_father);
+-	  do
+-	    {
+-	      add_phi_arg (as_a <gphi *> (phi_info->stmt),
+-			   gimple_get_lhs (vec_stmt->stmt), e,
+-			   gimple_phi_arg_location (phi, e->dest_idx));
+-	      phi_info = STMT_VINFO_RELATED_STMT (phi_info);
+-	      vec_stmt = STMT_VINFO_RELATED_STMT (vec_stmt);
+-	    }
+-	  while (phi_info);
+-	  gcc_assert (!vec_stmt);
+-	}
+-      for (unsigned i = 0; i < loop_vinfo->reduc_latch_slp_defs.length (); ++i)
+-	{
+-	  slp_tree slp_node = loop_vinfo->reduc_latch_slp_defs[i].first;
+-	  slp_tree phi_node = loop_vinfo->reduc_latch_slp_defs[i].second;
+-	  gphi *phi = as_a <gphi *> (SLP_TREE_SCALAR_STMTS (phi_node)[0]->stmt);
+-	  e = loop_latch_edge (gimple_bb (phi)->loop_father);
+-	  gcc_assert (SLP_TREE_VEC_STMTS (phi_node).length ()
+-		      == SLP_TREE_VEC_STMTS (slp_node).length ());
+-	  for (unsigned j = 0; j < SLP_TREE_VEC_STMTS (phi_node).length (); ++j)
+-	    add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[j]->stmt),
+-			 gimple_get_lhs
+-			     (SLP_TREE_VEC_STMTS (slp_node)[j]->stmt),
+-			 e, gimple_phi_arg_location (phi, e->dest_idx));
+-	}
+-
+       /* Stub out scalar statements that must not survive vectorization.
+ 	 Doing this here helps with grouped statements, or statements that
+ 	 are involved in patterns.  */
+diff -uprN a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
+--- a/gcc/tree-vectorizer.h	2020-10-26 21:45:23.052000000 +0800
++++ b/gcc/tree-vectorizer.h	2020-10-26 21:46:25.316000000 +0800
+@@ -575,11 +575,6 @@ typedef struct _loop_vec_info : public v
+      stmt in the chain.  */
+   auto_vec<stmt_vec_info> reduction_chains;
+ 
+-  /* The vectorized stmts defining the latch values of the reduction
+-     they are involved with.  */
+-  auto_vec<stmt_vec_info> reduc_latch_defs;
+-  auto_vec<std::pair<slp_tree, slp_tree> > reduc_latch_slp_defs;
+-
+   /* Cost vector for a single scalar iteration.  */
+   auto_vec<stmt_info_for_cost> scalar_cost_vec;
+ 
+diff -uprN a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
+--- a/gcc/tree-vect-slp.c	2020-10-26 21:45:23.052000000 +0800
++++ b/gcc/tree-vect-slp.c	2020-10-26 21:46:25.320000000 +0800
+@@ -2189,6 +2189,7 @@ vect_analyze_slp_instance (vec_info *vin
+ 	  SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
+ 	  SLP_INSTANCE_LOADS (new_instance) = vNULL;
+ 	  SLP_INSTANCE_ROOT_STMT (new_instance) = constructor ? stmt_info : NULL;
++	  new_instance->reduc_phis = NULL;
+ 
+ 	  vect_gather_slp_loads (new_instance, node);
+ 	  if (dump_enabled_p ())
+@@ -4282,6 +4283,26 @@ vect_schedule_slp (vec_info *vinfo)
+       stmt_vec_info store_info;
+       unsigned int j;
+ 
++      /* For reductions set the latch values of the vectorized PHIs.  */
++      if (instance->reduc_phis
++	  && STMT_VINFO_REDUC_TYPE (SLP_TREE_SCALAR_STMTS
++		  	(instance->reduc_phis)[0]) != FOLD_LEFT_REDUCTION
++	  && STMT_VINFO_REDUC_TYPE (SLP_TREE_SCALAR_STMTS
++		  	(instance->reduc_phis)[0]) != EXTRACT_LAST_REDUCTION)
++	{
++	  slp_tree slp_node = root;
++	  slp_tree phi_node = instance->reduc_phis;
++	  gphi *phi = as_a <gphi *> (SLP_TREE_SCALAR_STMTS (phi_node)[0]->stmt);
++	  edge e = loop_latch_edge (gimple_bb (phi)->loop_father);
++	  gcc_assert (SLP_TREE_VEC_STMTS (phi_node).length ()
++		      == SLP_TREE_VEC_STMTS (slp_node).length ());
++	  for (unsigned j = 0; j < SLP_TREE_VEC_STMTS (phi_node).length (); ++j)
++	    add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[j]->stmt),
++			 gimple_get_lhs
++			     (SLP_TREE_VEC_STMTS (slp_node)[j]->stmt),
++			 e, gimple_phi_arg_location (phi, e->dest_idx));
++	}
++
+       /* Remove scalar call stmts.  Do not do this for basic-block
+ 	 vectorization as not all uses may be vectorized.
+ 	 ???  Why should this be necessary?  DCE should be able to
+diff -uprN a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
+--- a/gcc/tree-vect-stmts.c	2020-10-26 21:45:23.012000000 +0800
++++ b/gcc/tree-vect-stmts.c	2020-10-26 21:46:25.320000000 +0800
+@@ -10229,37 +10229,6 @@ vect_transform_stmt (stmt_vec_info stmt_
+   if (STMT_VINFO_TYPE (stmt_info) == store_vec_info_type)
+     return is_store;
+ 
+-  /* If this stmt defines a value used on a backedge, record it so
+-     we can update the vectorized PHIs later.  */
+-  stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
+-  stmt_vec_info reduc_info;
+-  if (STMT_VINFO_REDUC_DEF (orig_stmt_info)
+-      && vect_stmt_to_vectorize (orig_stmt_info) == stmt_info
+-      && (reduc_info = info_for_reduction (orig_stmt_info))
+-      && STMT_VINFO_REDUC_TYPE (reduc_info) != FOLD_LEFT_REDUCTION
+-      && STMT_VINFO_REDUC_TYPE (reduc_info) != EXTRACT_LAST_REDUCTION)
+-    {
+-      gphi *phi;
+-      edge e;
+-      if (!slp_node
+-	  && (phi = dyn_cast <gphi *>
+-		      (STMT_VINFO_REDUC_DEF (orig_stmt_info)->stmt))
+-	  && dominated_by_p (CDI_DOMINATORS,
+-			     gimple_bb (orig_stmt_info->stmt), gimple_bb (phi))
+-	  && (e = loop_latch_edge (gimple_bb (phi)->loop_father))
+-	  && (PHI_ARG_DEF_FROM_EDGE (phi, e)
+-	      == gimple_get_lhs (orig_stmt_info->stmt)))
+-	{
+-	  as_a <loop_vec_info> (vinfo)->reduc_latch_defs.safe_push (stmt_info);
+-	}
+-      else if (slp_node
+-	       && slp_node != slp_node_instance->reduc_phis)
+-	{
+-	  as_a <loop_vec_info> (vinfo)->reduc_latch_slp_defs.safe_push
+-	    (std::make_pair (slp_node, slp_node_instance->reduc_phis));
+-	}
+-    }
+-
+   /* Handle stmts whose DEF is used outside the loop-nest that is
+      being vectorized.  */
+   done = can_vectorize_live_stmts (stmt_info, gsi, slp_node,
diff --git a/tree-optimization-97812-fix-range-query-in-VRP-asser.patch b/tree-optimization-97812-fix-range-query-in-VRP-asser.patch
new file mode 100644
index 0000000..09c77ee
--- /dev/null
+++ b/tree-optimization-97812-fix-range-query-in-VRP-asser.patch
@@ -0,0 +1,48 @@
+This backport contains 1 patch from the GCC mainstream tree.
+The commit id of the patch is listed below.
+
+dcfd302a79a5e2ea3bb16fc4fc45a5ee31cc0eab
+0001-tree-optimization-97812-fix-range-query-in-VRP-asser.patch
+
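+As context for the fix below: for a conversion the assert machinery
+checks whether the recorded value range of the conversion source fits
+in the destination type, and before this patch the bounds were tested
+without first being extended according to the sign of the source type.
+A minimal standalone sketch of why that sign matters (illustration
+only, not part of the backport):
+
+#include <stdio.h>
+
+int
+main (void)
+{
+  unsigned short bound = 0x8000;       /* 32768 when read as unsigned */
+  int as_unsigned = (int) bound;       /* value-preserving: 32768 */
+  int as_signed = (int) (short) bound; /* reinterpreted: -32768 with GCC */
+  /* Whether 0x8000 "fits" in a narrower signed type depends on which
+     reading is used, hence the widest_int::from (..., TYPE_SIGN (...))
+     extension in the hunk below.  */
+  printf ("unsigned reading: %d\n", as_unsigned);
+  printf ("signed reading:   %d\n", as_signed);
+  return 0;
+}
+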
+diff --git a/gcc/testsuite/gcc.dg/torture/pr97812.c b/gcc/testsuite/gcc.dg/torture/pr97812.c
+new file mode 100644
+index 00000000000..4d468adf8fa
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/torture/pr97812.c
+@@ -0,0 +1,15 @@
++/* { dg-do run } */
++/* { dg-additional-options "-fdisable-tree-evrp" } */
++
++unsigned char c;
++
++int main() {
++volatile short b = 4066;
++  unsigned short bp = b;
++  unsigned d = bp & 2305;
++  signed char e = d;
++  c = e ? : e;
++  if (!d)
++    __builtin_abort ();
++  return 0;
++}
+diff --git a/gcc/tree-vrp.c b/gcc/tree-vrp.c
+index 54ce017e8b2..d661866630e 100644
+--- a/gcc/tree-vrp.c
++++ b/gcc/tree-vrp.c
+@@ -1740,8 +1740,14 @@ register_edge_assert_for_2 (tree name, edge e,
+ 	      && ((TYPE_PRECISION (TREE_TYPE (name))
+ 		   > TYPE_PRECISION (TREE_TYPE (rhs1)))
+ 		  || (get_range_info (rhs1, &rmin, &rmax) == VR_RANGE
+-		      && wi::fits_to_tree_p (rmin, TREE_TYPE (name))
+-		      && wi::fits_to_tree_p (rmax, TREE_TYPE (name)))))
++		      && wi::fits_to_tree_p
++			   (widest_int::from (rmin,
++					      TYPE_SIGN (TREE_TYPE (rhs1))),
++			    TREE_TYPE (name))
++		      && wi::fits_to_tree_p
++			   (widest_int::from (rmax,
++					      TYPE_SIGN (TREE_TYPE (rhs1))),
++			    TREE_TYPE (name)))))
+ 	    add_assert_info (asserts, rhs1, rhs1,
+ 		 	     comp_code, fold_convert (TREE_TYPE (rhs1), val));
+ 	}
diff --git a/vectorizable-comparison-Swap-operands-only-once.patch b/vectorizable-comparison-Swap-operands-only-once.patch
new file mode 100644
index 0000000..e42ef96
--- /dev/null
+++ b/vectorizable-comparison-Swap-operands-only-once.patch
@@ -0,0 +1,19 @@
+This backport contains 1 patch from the GCC mainstream tree.
+The commit id of the patch is listed below.
+
+a0aeb7fb93da156b64fd08391c79ff35a69af7ba
+0001-tree-vect-stmts.c-vectorizable_comparison-Swap-opera.patch
+
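+The one-line change below moves the operand swap out of the per-copy
+loop: swapping unconditionally on every iteration undoes itself on
+every second vector copy, so only half of the copies see the intended
+operand order.  A standalone sketch of that effect (illustration only,
+not part of the backport):
+
+#include <stdio.h>
+
+int
+main (void)
+{
+  int rhs1 = 1, rhs2 = 2;
+  for (int j = 0; j < 4; j++)
+    {
+      /* Old behaviour: swap on every iteration, so the order toggles.  */
+      int tmp = rhs1;
+      rhs1 = rhs2;
+      rhs2 = tmp;
+      printf ("copy %d: rhs1=%d rhs2=%d\n", j, rhs1, rhs2);
+    }
+  /* Fixed behaviour: swap only when j == 0, so every copy uses the
+     same, already-swapped order.  */
+  return 0;
+}
+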
+diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
+index e921225b5ec..601a6f55fbf 100644
+--- a/gcc/tree-vect-stmts.c
++++ b/gcc/tree-vect-stmts.c
+@@ -10369,7 +10369,7 @@ vectorizable_comparison (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
+ 
+       if (!slp_node)
+ 	{
+-	  if (swap_p)
++	  if (swap_p && j == 0)
+ 	    std::swap (vec_rhs1, vec_rhs2);
+ 	  vec_oprnds0.quick_push (vec_rhs1);
+ 	  vec_oprnds1.quick_push (vec_rhs2);
diff --git a/x86-Fix-bf16-and-matrix.patch b/x86-Fix-bf16-and-matrix.patch
new file mode 100644
index 0000000..8be95f8
--- /dev/null
+++ b/x86-Fix-bf16-and-matrix.patch
@@ -0,0 +1,321 @@
+This backport contains 4 patches from the GCC mainstream tree.
+The commit ids of the patches are listed below in chronological order.
+
+0001-re-PR-target-90424-memcpy-into-vector-builtin-not-op.patch
+1bf2a0b90f2457f6d9301535560eb5e05978261b
+
+0002-testsuite-aarch64-arm-Add-missing-quotes-to-expected.patch
+0ec537f3500924f29505977aa89c2a1d4671c584
+
+0003-x86-Tweak-testcases-for-PR82361.patch
+ad4644f378fe2f731cd987a4aff14b935f530b88
+
+0004-x86-Robustify-vzeroupper-handling-across-calls.patch
+2a2e3a0dfcbe0861915f421d11b828f0c35023f0
+
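+As context for the first backport (PR target/90424): it lets the
+middle end represent a short memcpy into a vector object as an
+element or sub-vector BIT_INSERT_EXPR instead of forcing the vector
+through memory.  A rough illustration of the affected source pattern
+(the real coverage is in the g++.target testcases added below; the
+function name here is made up):
+
+typedef long long v2di __attribute__ ((vector_size (16)));
+
+v2di
+load_low64 (const void *p)
+{
+  v2di r = { 0, 0 };
+  /* Copy only the low 8 bytes; with the backport this can become a
+     single element insert rather than a spill to the stack.  */
+  __builtin_memcpy (&r, p, 8);
+  return r;
+}
+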
+diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
+index 9282a8fb6..ba72da1ec 100644
+--- a/gcc/config/i386/i386.c
++++ b/gcc/config/i386/i386.c
+@@ -95,6 +95,7 @@ along with GCC; see the file COPYING3.  If not see
+ #include "i386-builtins.h"
+ #include "i386-expand.h"
+ #include "i386-features.h"
++#include "function-abi.h"
+ 
+ /* This file should be included last.  */
+ #include "target-def.h"
+@@ -13529,6 +13530,15 @@ ix86_avx_u128_mode_needed (rtx_insn *insn)
+ 	    }
+ 	}
+ 
++      /* If the function is known to preserve some SSE registers,
++	 RA and previous passes can legitimately rely on that for
++	 modes wider than 256 bits.  It's only safe to issue a
++	 vzeroupper if all SSE registers are clobbered.  */
++      const function_abi &abi = insn_callee_abi (insn);
++      if (!hard_reg_set_subset_p (reg_class_contents[ALL_SSE_REGS],
++				  abi.mode_clobbers (V4DImode)))
++	return AVX_U128_ANY;
++
+       return AVX_U128_CLEAN;
+     }
+ 
+diff --git a/gcc/testsuite/g++.target/i386/pr90424-1.C b/gcc/testsuite/g++.target/i386/pr90424-1.C
+new file mode 100644
+index 000000000..9df8c089b
+--- /dev/null
++++ b/gcc/testsuite/g++.target/i386/pr90424-1.C
+@@ -0,0 +1,32 @@
++/* { dg-do compile { target c++11 } } */
++/* { dg-options "-O2 -msse2 -fdump-tree-optimized" } */
++
++template 
++using V [[gnu::vector_size(16)]] = T;
++
++template )>
++V load(const void *p) {
++  using W = V;
++  W r;
++  __builtin_memcpy(&r, p, M);
++  return r;
++}
++
++// movq or movsd
++template V load(const void *);     // bad
++template V load(const void *);   // bad
++template V load(const void *);       // bad
++template V load(const void *);     // good
++// the following is disabled because V2SF isn't a supported mode
++// template V load(const void *);   // bad
++template V load(const void *); // good (movsd?)
++
++// movd or movss
++template V load(const void *);   // bad
++template V load(const void *); // bad
++template V load(const void *);     // good
++template V load(const void *); // good
++
++/* We should end up with one load and one insert for each function.  */
++/* { dg-final { scan-tree-dump-times "BIT_INSERT_EXPR" 9 "optimized" } } */
++/* { dg-final { scan-tree-dump-times "MEM" 9 "optimized" } } */
+diff --git a/gcc/testsuite/g++.target/i386/pr90424-2.C b/gcc/testsuite/g++.target/i386/pr90424-2.C
+new file mode 100644
+index 000000000..3abb65f45
+--- /dev/null
++++ b/gcc/testsuite/g++.target/i386/pr90424-2.C
+@@ -0,0 +1,31 @@
++/* { dg-do compile { target c++11 } } */
++/* { dg-options "-O2 -msse2 -fdump-tree-optimized" } */
++
++template 
++using V [[gnu::vector_size(16)]] = T;
++
++template )>
++V load(const void *p) {
++  V r = {};
++  __builtin_memcpy(&r, p, M);
++  return r;
++}
++
++// movq or movsd
++template V load(const void *);     // bad
++template V load(const void *);   // bad
++template V load(const void *);       // bad
++template V load(const void *);     // good
++// the following is disabled because V2SF isn't a supported mode
++// template V load(const void *);   // bad
++template V load(const void *); // good (movsd?)
++
++// movd or movss
++template V load(const void *);   // bad
++template V load(const void *); // bad
++template V load(const void *);     // good
++template V load(const void *); // good
++
++/* We should end up with one load and one insert for each function.  */
++/* { dg-final { scan-tree-dump-times "BIT_INSERT_EXPR" 9 "optimized" } } */
++/* { dg-final { scan-tree-dump-times "MEM" 9 "optimized" } } */
+diff --git a/gcc/testsuite/gcc.target/aarch64/target_attr_10.c b/gcc/testsuite/gcc.target/aarch64/target_attr_10.c
+index 184990471..d96a8733a 100644
+--- a/gcc/testsuite/gcc.target/aarch64/target_attr_10.c
++++ b/gcc/testsuite/gcc.target/aarch64/target_attr_10.c
+@@ -13,4 +13,4 @@ foo (uint8x16_t a, uint8x16_t b, uint8x16_t c)
+   return vbslq_u8 (a, b, c); /* { dg-message "called from here" } */
+ }
+ 
+-/* { dg-error "inlining failed in call to always_inline" "" { target *-*-* } 0 } */
++/* { dg-error "inlining failed in call to 'always_inline'" "" { target *-*-* } 0 } */
+diff --git a/gcc/testsuite/gcc.target/arm/attr-neon-builtin-fail.c b/gcc/testsuite/gcc.target/arm/attr-neon-builtin-fail.c
+index 05dc579f2..fb6e0b9cd 100644
+--- a/gcc/testsuite/gcc.target/arm/attr-neon-builtin-fail.c
++++ b/gcc/testsuite/gcc.target/arm/attr-neon-builtin-fail.c
+@@ -14,5 +14,5 @@ foo (uint8x16_t *p)
+   *p = vmovq_n_u8 (3); /* { dg-message "called from here" } */
+ }
+ 
+-/* { dg-error "inlining failed in call to always_inline" "" { target *-*-* } 0 } */
++/* { dg-error "inlining failed in call to 'always_inline'" "" { target *-*-* } 0 } */
+ 
+diff --git a/gcc/testsuite/gcc.target/i386/pr82361-1.c b/gcc/testsuite/gcc.target/i386/pr82361-1.c
+index e7c356557..dec1792ae 100644
+--- a/gcc/testsuite/gcc.target/i386/pr82361-1.c
++++ b/gcc/testsuite/gcc.target/i386/pr82361-1.c
+@@ -4,50 +4,50 @@
+ /* We should be able to optimize all %eax to %rax zero extensions, because
+    div and idiv instructions with 32-bit operands zero-extend both results.   */
+ /* { dg-final { scan-assembler-not "movl\t%eax, %eax" } } */
+-/* FIXME: We are still not able to optimize the modulo in f1/f2, only manage
+-   one.  */
++/* FIXME: The compiler does not merge zero-extension to the modulo part
++   of f1 and f2.  */
+ /* { dg-final { scan-assembler-times "movl\t%edx" 2 } } */
+ 
+ void
+ f1 (unsigned int a, unsigned int b)
+ {
+-  unsigned long long c = a / b;
+-  unsigned long long d = a % b;
++  register unsigned long long c asm ("rax") = a / b;
++  register unsigned long long d asm ("rdx") = a % b;
+   asm volatile ("" : : "r" (c), "r" (d));
+ }
+ 
+ void
+ f2 (int a, int b)
+ {
+-  unsigned long long c = (unsigned int) (a / b);
+-  unsigned long long d = (unsigned int) (a % b);
++  register unsigned long long c asm ("rax") = (unsigned int) (a / b);
++  register unsigned long long d asm ("rdx") = (unsigned int) (a % b);
+   asm volatile ("" : : "r" (c), "r" (d));
+ }
+ 
+ void
+ f3 (unsigned int a, unsigned int b)
+ {
+-  unsigned long long c = a / b;
++  register unsigned long long c asm ("rax") = a / b;
+   asm volatile ("" : : "r" (c));
+ }
+ 
+ void
+ f4 (int a, int b)
+ {
+-  unsigned long long c = (unsigned int) (a / b);
++  register unsigned long long c asm ("rax") = (unsigned int) (a / b);
+   asm volatile ("" : : "r" (c));
+ }
+ 
+ void
+ f5 (unsigned int a, unsigned int b)
+ {
+-  unsigned long long d = a % b;
++  register unsigned long long d asm ("rdx") = a % b;
+   asm volatile ("" : : "r" (d));
+ }
+ 
+ void
+ f6 (int a, int b)
+ {
+-  unsigned long long d = (unsigned int) (a % b);
++  register unsigned long long d asm ("rdx") = (unsigned int) (a % b);
+   asm volatile ("" : : "r" (d));
+ }
+diff --git a/gcc/testsuite/gcc.target/i386/pr82361-2.c b/gcc/testsuite/gcc.target/i386/pr82361-2.c
+index c1e484d6e..2d87de182 100644
+--- a/gcc/testsuite/gcc.target/i386/pr82361-2.c
++++ b/gcc/testsuite/gcc.target/i386/pr82361-2.c
+@@ -4,7 +4,8 @@
+ /* We should be able to optimize all %eax to %rax zero extensions, because
+    div and idiv instructions with 32-bit operands zero-extend both results.   */
+ /* { dg-final { scan-assembler-not "movl\t%eax, %eax" } } */
+-/* Ditto %edx to %rdx zero extensions.  */
+-/* { dg-final { scan-assembler-not "movl\t%edx, %edx" } } */
++/* FIXME: The compiler does not merge zero-extension to the modulo part
++   of f1 and f2.  */
++/* { dg-final { scan-assembler-times "movl\t%edx" 4 } } */
+ 
+ #include "pr82361-1.c"
+diff --git a/gcc/tree-cfg.c b/gcc/tree-cfg.c
+index 527deffe4..be47519bc 100644
+--- a/gcc/tree-cfg.c
++++ b/gcc/tree-cfg.c
+@@ -4297,8 +4297,17 @@ verify_gimple_assign_ternary (gassign *stmt)
+ 	}
+       if (! ((INTEGRAL_TYPE_P (rhs1_type)
+ 	      && INTEGRAL_TYPE_P (rhs2_type))
++	     /* Vector element insert.  */
+ 	     || (VECTOR_TYPE_P (rhs1_type)
+-		 && types_compatible_p (TREE_TYPE (rhs1_type), rhs2_type))))
++		 && types_compatible_p (TREE_TYPE (rhs1_type), rhs2_type))
++	     /* Aligned sub-vector insert.  */
++	     || (VECTOR_TYPE_P (rhs1_type)
++		 && VECTOR_TYPE_P (rhs2_type)
++		 && types_compatible_p (TREE_TYPE (rhs1_type),
++					TREE_TYPE (rhs2_type))
++		 && multiple_p (TYPE_VECTOR_SUBPARTS (rhs1_type),
++				TYPE_VECTOR_SUBPARTS (rhs2_type))
++		 && multiple_of_p (bitsizetype, rhs3, TYPE_SIZE (rhs2_type)))))
+ 	{
+ 	  error ("not allowed type combination in BIT_INSERT_EXPR");
+ 	  debug_generic_expr (rhs1_type);
+diff --git a/gcc/tree-ssa.c b/gcc/tree-ssa.c
+index 1dc544b6d..a149f5e79 100644
+--- a/gcc/tree-ssa.c
++++ b/gcc/tree-ssa.c
+@@ -1522,8 +1522,6 @@ non_rewritable_lvalue_p (tree lhs)
+       if (DECL_P (decl)
+ 	  && VECTOR_TYPE_P (TREE_TYPE (decl))
+ 	  && TYPE_MODE (TREE_TYPE (decl)) != BLKmode
+-	  && operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (lhs)),
+-			      TYPE_SIZE_UNIT (TREE_TYPE (TREE_TYPE (decl))), 0)
+ 	  && known_ge (mem_ref_offset (lhs), 0)
+ 	  && known_gt (wi::to_poly_offset (TYPE_SIZE_UNIT (TREE_TYPE (decl))),
+ 		       mem_ref_offset (lhs))
+@@ -1531,7 +1529,24 @@ non_rewritable_lvalue_p (tree lhs)
+ 			    TYPE_SIZE_UNIT (TREE_TYPE (lhs)))
+ 	  && known_ge (wi::to_poly_offset (TYPE_SIZE (TREE_TYPE (decl))),
+ 		       wi::to_poly_offset (TYPE_SIZE (TREE_TYPE (lhs)))))
+-	return false;
++	{
++	  poly_uint64 lhs_bits, nelts;
++	  if (poly_int_tree_p (TYPE_SIZE (TREE_TYPE (lhs)), &lhs_bits)
++	      && multiple_p (lhs_bits,
++			     tree_to_uhwi
++			       (TYPE_SIZE (TREE_TYPE (TREE_TYPE (decl)))),
++			     &nelts))
++	    {
++	      if (known_eq (nelts, 1u))
++		return false;
++	      /* For sub-vector inserts the insert vector mode has to be
++		 supported.  */
++	      tree vtype = build_vector_type (TREE_TYPE (TREE_TYPE (decl)),
++					      nelts);
++	      if (TYPE_MODE (vtype) != BLKmode)
++		return false;
++	    }
++	}
+     }
+ 
+   /* A vector-insert using a BIT_FIELD_REF is rewritable using
+@@ -1869,20 +1884,30 @@ execute_update_addresses_taken (void)
+ 		    && bitmap_bit_p (suitable_for_renaming, DECL_UID (sym))
+ 		    && VECTOR_TYPE_P (TREE_TYPE (sym))
+ 		    && TYPE_MODE (TREE_TYPE (sym)) != BLKmode
+-		    && operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (lhs)),
+-					TYPE_SIZE_UNIT
+-					  (TREE_TYPE (TREE_TYPE (sym))), 0)
+-		    && tree_fits_uhwi_p (TREE_OPERAND (lhs, 1))
+-		    && tree_int_cst_lt (TREE_OPERAND (lhs, 1),
+-					TYPE_SIZE_UNIT (TREE_TYPE (sym)))
+-		    && (tree_to_uhwi (TREE_OPERAND (lhs, 1))
+-			% tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (lhs)))) == 0)
++		    && known_ge (mem_ref_offset (lhs), 0)
++		    && known_gt (wi::to_poly_offset
++				   (TYPE_SIZE_UNIT (TREE_TYPE (sym))),
++				 mem_ref_offset (lhs))
++		    && multiple_of_p (sizetype,
++				      TREE_OPERAND (lhs, 1),
++				      TYPE_SIZE_UNIT (TREE_TYPE (lhs))))
+ 		  {
+ 		    tree val = gimple_assign_rhs1 (stmt);
+ 		    if (! types_compatible_p (TREE_TYPE (val),
+ 					      TREE_TYPE (TREE_TYPE (sym))))
+ 		      {
+-			tree tem = make_ssa_name (TREE_TYPE (TREE_TYPE (sym)));
++			poly_uint64 lhs_bits, nelts;
++			tree temtype = TREE_TYPE (TREE_TYPE (sym));
++			if (poly_int_tree_p (TYPE_SIZE (TREE_TYPE (lhs)),
++					     &lhs_bits)
++			    && multiple_p (lhs_bits,
++					   tree_to_uhwi
++					     (TYPE_SIZE (TREE_TYPE
++							   (TREE_TYPE (sym)))),
++					   &nelts)
++			    && maybe_ne (nelts, 1u))
++			  temtype = build_vector_type (temtype, nelts);
++			tree tem = make_ssa_name (temtype);
+ 			gimple *pun
+ 			  = gimple_build_assign (tem,
+ 						 build1 (VIEW_CONVERT_EXPR,
-- 
Gitee